scalapack-2.0.2/000755 000766 000024 00000000000 11750301630 013705 5ustar00juliestaff000000 000000 scalapack-2.0.2/BLACS/000755 000766 000024 00000000000 11750301555 014537 5ustar00juliestaff000000 000000 scalapack-2.0.2/CMAKE/000755 000766 000024 00000000000 11750301553 014531 5ustar00juliestaff000000 000000 scalapack-2.0.2/CMakeLists.txt000644 000766 000024 00000025016 11750130340 016447 0ustar00juliestaff000000 000000 cmake_minimum_required(VERSION 2.8) project(SCALAPACK C Fortran) # Configure the warning and code coverage suppression file configure_file( "${SCALAPACK_SOURCE_DIR}/CMAKE/CTestCustom.cmake.in" "${SCALAPACK_BINARY_DIR}/CTestCustom.cmake" COPYONLY ) # Add the CMake directory for custon CMake modules set(CMAKE_MODULE_PATH "${SCALAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH}) if (UNIX) if ( "${CMAKE_Fortran_COMPILER}" MATCHES "ifort" ) set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fltconsistency -fp_port" ) endif () endif () # # MPI # #set(MPI_BASE_DIR "/Users/julie/opt/openmpi/" CACHE PATH "MPI Path") #set(MPI_BASE_DIR "/Users/julie/opt/mpich2/" CACHE PATH "MPI Path") set(CMAKE_PREFIX_PATH "${MPI_BASE_DIR};${CMAKE_PREFIX_PATH}") #set(MPI_COMPILER ${MPI_BASE_DIR}/bin/mpicc) find_package(MPI) if (MPI_FOUND) message(STATUS "Found MPI_LIBRARY : ${MPI_FOUND} ") INCLUDE_DIRECTORIES(${MPI_INCLUDE_PATH}) find_program(MPI_C_COMPILER NAMES mpicc HINTS "${MPI_BASE_DIR}" PATH_SUFFIXES bin DOC "MPI C compiler.") MARK_AS_ADVANCED(MPI_C_COMPILER) if ("${MPI_C_COMPILER}" STREQUAL "MPI_C_COMPILER-NOTFOUND") message(ERROR "--> MPI C Compiler NOT FOUND (please set MPI_BASE_DIR accordingly") else() message(STATUS "--> MPI C Compiler : ${MPI_C_COMPILER}") SET(CMAKE_C_COMPILER "${MPI_C_COMPILER}") message(STATUS "--> C Compiler : ${CMAKE_C_COMPILER}") endif() find_program(MPI_Fortran_COMPILER NAMES mpif77 HINTS "${MPI_BASE_DIR}" PATH_SUFFIXES bin DOC "MPI Fortran compiler.") MARK_AS_ADVANCED(MPI_Fortran_COMPILER) if ("${MPI_Fortran_COMPILER}" STREQUAL "MPI_Fortran_COMPILER-NOTFOUND") message(ERROR "--> MPI Fortran Compiler NOT FOUND (please set MPI_BASE_DIR accordingly") else() message(STATUS "--> MPI Fortran Compiler : ${MPI_Fortran_COMPILER}") SET(Fortran_COMPILER "${CMAKE_Fortran_COMPILER}") SET(CMAKE_Fortran_COMPILER "${MPI_Fortran_COMPILER}") message(STATUS "--> Fortran Compiler : ${CMAKE_Fortran_COMPILER}") endif() else() message(STATUS "Found MPI_LIBRARY : ${MPI_FOUND} ") set(MPI_BASE_DIR ${MPI_BASE_DIR} CACHE PATH "MPI Path") UNSET(MPIEXEC CACHE) UNSET(MPIEXEC_POSTFLAGS CACHE) UNSET(MPIEXEC_PREFLAGS CACHE) UNSET(MPIEXEC_MAX_NUMPROCS CACHE) UNSET(MPIEXEC_NUMPROC_FLAG CACHE) UNSET(MPI_COMPILE_FLAGS CACHE) UNSET(MPI_LINK_FLAGS CACHE) UNSET(MPI_INCLUDE_PATH CACHE) message(FATAL_ERROR "--> MPI Library NOT FOUND -- please set MPI_BASE_DIR accordingly --") endif() if (UNIX) if ( "${CMAKE_Fortran_COMPILER}" MATCHES "ifort" ) set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fltconsistency -fp_port" ) endif () endif () macro(SCALAPACK_install_library lib) install(TARGETS ${lib} EXPORT scalapack-targets ARCHIVE DESTINATION lib${LIB_SUFFIX} LIBRARY DESTINATION lib${LIB_SUFFIX} RUNTIME DESTINATION Testing ) endmacro() # -------------------------------------------------- # Testing SET(DART_TESTING_TIMEOUT 600) enable_testing() include(CTest) enable_testing() # -------------------------------------------------- # Organize output files. On Windows this also keeps .dll files next # to the .exe files that need them, making tests easy to run. set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/TESTING) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/lib) # -------------------------------------------------- # Check for any necessary platform specific compiler flags include( CheckBLACSCompilerFlags ) CheckBLACSCompilerFlags() set(prefix ${CMAKE_INSTALL_PREFIX}) set(libdir ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}) set(PKG_CONFIG_DIR ${libdir}/pkgconfig) # -------------------------------------------------- # BLACS Internal variables # # Fortran Mangling, MPI Tests and BLACS settings # include(FortranMangling) COMPILE(install_COMPILED) FORTRAN_MANGLING(CDEFS) #MESSAGE(STATUS "Setting CDEFS = ${CDEFS}") #set(CDEFS ${CDEFS} CACHE STRING "Fortran Mangling" FORCE) MESSAGE(STATUS "=========") # -------------------------------------------------- # Compiler Flags ADD_DEFINITIONS( "-D${CDEFS}") # -------------------------------------------------- # ScaLAPACK needs BLAS and LAPACK option(USE_OPTIMIZED_LAPACK_BLAS "Whether or not to search for optimized LAPACK and BLAS libraries on your machine (if not found, Reference LAPACK and BLAS will be downloaded and installed)" ON) message(STATUS "CHECKING BLAS AND LAPACK LIBRARIES") IF(LAPACK_LIBRARIES) include(CheckFortranFunctionExists) message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.") set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) CHECK_FORTRAN_FUNCTION_EXISTS("dgesv" LAPACK_FOUND) unset( CMAKE_REQUIRED_LIBRARIES ) message(STATUS "--> LAPACK routine dgesv is found: ${LAPACK_FOUND}.") ENDIF() if(LAPACK_FOUND) message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.") else(LAPACK_FOUND) if(USE_OPTIMIZED_LAPACK_BLAS) message(STATUS "--> Searching for optimized LAPACK and BLAS libraries on your machine.") find_package(LAPACK) ENDIF(USE_OPTIMIZED_LAPACK_BLAS) if(NOT LAPACK_FOUND) message(STATUS "--> LAPACK and BLAS were not found. Reference LAPACK and BLAS will be downloaded and installed") include(ExternalProject) ExternalProject_Add( lapack URL http://www.netlib.org/lapack/lapack.tgz CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SCALAPACK_BINARY_DIR} PREFIX ${SCALAPACK_BINARY_DIR}/dependencies ) if (UNIX) SET(LAPACK_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/liblapack.a CACHE STRING "LAPACK library" FORCE) SET(BLAS_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/libblas.a CACHE STRING "BLAS library" FORCE) else (UNIX) # On Windows SET(LAPACK_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/liblapack.lib CACHE STRING "LAPACK library" FORCE) SET(BLAS_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/libblas.lib CACHE STRING "BLAS library" FORCE) endif (UNIX) ENDIF() ENDIF() message(STATUS "BLAS library: ${BLAS_LIBRARIES}") message(STATUS "LAPACK library: ${LAPACK_LIBRARIES}") MESSAGE(STATUS "=========") # -------------------------------------------------- # By default static library OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF ) OPTION(BUILD_STATIC_LIBS "Build static libraries" ON ) # -------------------------------------------------- # Subdirectories that need to be processed macro(append_subdir_files variable dirname) get_directory_property(holder DIRECTORY ${dirname} DEFINITION ${variable}) foreach(depfile ${holder}) list(APPEND ${variable} "${dirname}/${depfile}") endforeach() endmacro() # # BLACS # add_subdirectory(BLACS) append_subdir_files(blacs "BLACS/SRC") # # TOOLS # add_subdirectory(TOOLS) append_subdir_files(tools TOOLS) append_subdir_files(tools-C TOOLS) append_subdir_files(extra_lapack "TOOLS/LAPACK") # # PBLAS # add_subdirectory(PBLAS) append_subdir_files(pblas "PBLAS/SRC") append_subdir_files(pblas-F "PBLAS/SRC") append_subdir_files(pbblas "PBLAS/SRC/PBBLAS") append_subdir_files(ptzblas "PBLAS/SRC/PTZBLAS") append_subdir_files(ptools "PBLAS/SRC/PTOOLS") # # REDIST # add_subdirectory(REDIST) append_subdir_files(redist "REDIST/SRC") # # SRC # add_subdirectory(SRC) append_subdir_files(src "SRC") append_subdir_files(src-C "SRC") if (UNIX) add_library(scalapack ${blacs} ${tools} ${tools-C} ${extra_lapack} ${pblas} ${pblas-F} ${ptzblas} ${ptools} ${pbblas} ${redist} ${src} ${src-C}) target_link_libraries( scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) scalapack_install_library(scalapack) else (UNIX) # Need to separate Fortran and C Code OPTION(BUILD_SHARED_LIBS "Build shared libraries" ON ) add_library(scalapack ${blacs} ${tools-C} ${pblas} ${ptools} ${redist} ${src-C}) target_link_libraries( scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) add_library(scalapack-F ${pblas-F} ${pbblas} ${ptzblas} ${tools} ${src} ${extra_lapack} ) target_link_libraries( scalapack-F ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) scalapack_install_library(scalapack) scalapack_install_library(scalapack-F) endif (UNIX) add_subdirectory(TESTING) # -------------------------------------------------- # CPACK Packaging SET(CPACK_PACKAGE_NAME "ScaLAPACK") SET(CPACK_PACKAGE_VENDOR "University of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd") SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "ScaLAPACK- Linear Algebra Package") set(SCALAPACK_VERSION 2.0.2) set(CPACK_PACKAGE_VERSION_MAJOR 2) set(CPACK_PACKAGE_VERSION_MINOR 0) set(CPACK_PACKAGE_VERSION_PATCH 2) set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") SET(CPACK_PACKAGE_INSTALL_DIRECTORY "SCALAPACK") IF(WIN32 AND NOT UNIX) # There is a bug in NSI that does not handle full unix paths properly. Make # sure there is at least one set of four (4) backlasshes. SET(CPACK_NSIS_HELP_LINK "http:\\\\\\\\http://icl.cs.utk.edu/lapack-forum") SET(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\www.netlib.org/scalapack") SET(CPACK_NSIS_CONTACT "scalapack@eecs.utk.edu") SET(CPACK_NSIS_MODIFY_PATH ON) SET(CPACK_NSIS_DISPLAY_NAME "SCALAPACK-${SCALAPACK_VERSION}") set(CPACK_PACKAGE_RELOCATABLE "true") ELSE(WIN32 AND NOT UNIX) SET(CPACK_GENERATOR "TGZ") SET(CPACK_SOURCE_GENERATOR TGZ) SET(CPACK_SOURCE_PACKAGE_FILE_NAME "scalapack-${SCALAPACK_VERSION}" ) SET(CPACK_SOURCE_IGNORE_FILES ~$ .svn ${CPACK_SOURCE_IGNORE_FILES} ) ENDIF(WIN32 AND NOT UNIX) INCLUDE(CPack) # -------------------------------------------------- export(TARGETS scalapack FILE scalapack-targets.cmake) if( NOT LAPACK_FOUND ) install(FILES ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} DESTINATION lib ) endif( NOT LAPACK_FOUND ) configure_file(${SCALAPACK_SOURCE_DIR}/CMAKE/scalapack-config-version.cmake.in ${SCALAPACK_BINARY_DIR}/scalapack-config-version.cmake @ONLY) configure_file(${SCALAPACK_SOURCE_DIR}/CMAKE/scalapack-config-build.cmake.in ${SCALAPACK_BINARY_DIR}/scalapack-config.cmake @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/scalapack.pc.in ${CMAKE_CURRENT_BINARY_DIR}/scalapack.pc) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/scalapack.pc DESTINATION ${PKG_CONFIG_DIR} ) configure_file(${SCALAPACK_SOURCE_DIR}/CMAKE/scalapack-config-install.cmake.in ${SCALAPACK_BINARY_DIR}/CMakeFiles/scalapack-config.cmake @ONLY) install(FILES ${SCALAPACK_BINARY_DIR}/CMakeFiles/scalapack-config.cmake ${SCALAPACK_BINARY_DIR}/scalapack-config-version.cmake DESTINATION lib/cmake/scalapack-${SCALAPACK_VERSION} ) install(EXPORT scalapack-targets DESTINATION lib/cmake/scalapack-${SCALAPACK_VERSION}) scalapack-2.0.2/CTestConfig.cmake000644 000766 000024 00000001037 11672742633 017077 0ustar00juliestaff000000 000000 ## This file should be placed in the root directory of your project. ## Then modify the CMakeLists.txt file in the root directory of your ## project to incorporate the testing dashboard. ## # The following are required to uses Dart and the Cdash dashboard ## ENABLE_TESTING() ## INCLUDE(CTest) set(CTEST_PROJECT_NAME "ScaLAPACK") set(CTEST_NIGHTLY_START_TIME "00:00:00 EST") set(CTEST_DROP_METHOD "http") set(CTEST_DROP_SITE "icl.cs.utk.edu/cdash") set(CTEST_DROP_LOCATION "/submit.php?project=ScaLAPACK") set(CTEST_DROP_SITE_CDASH TRUE) scalapack-2.0.2/EXAMPLE/000755 000766 000024 00000000000 11750301602 014777 5ustar00juliestaff000000 000000 scalapack-2.0.2/LICENSE000644 000766 000024 00000004332 11657003705 014724 0ustar00juliestaff000000 000000 Copyright (c) 1992-2011 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. Copyright (c) 2000-2011 The University of California Berkeley. All rights reserved. Copyright (c) 2006-2011 The University of Colorado Denver. All rights reserved. $COPYRIGHT$ Additional copyrights may follow $HEADER$ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution. - Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. The copyright holders provide no reassurances that the source code provided does not infringe any patent, copyright, or any other intellectual property rights of third parties. The copyright holders disclaim any liability to any recipient for claims brought against recipient by any third party for infringement of that parties intellectual property rights. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. scalapack-2.0.2/Makefile000644 000766 000024 00000005356 11647605407 015374 0ustar00juliestaff000000 000000 ############################################################################ # # Program: ScaLAPACK # # Module: Makefile # # Purpose: Top-level Makefile # # Creation date: March 20, 1995 # # Modified: February 15, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include SLmake.inc PRECISIONS = single double complex complex16 ############################################################################ # # The library can be set up to include routines for any combination of the # four PRECISIONS. First, modify the ARCH, ARCHFLAGS, RANLIB, F77, CC, # F77FLAGS, CCFLAGS, F77LOADER, CCLOADER, F77LOADFLAGS, CCLOADFLAGS and # CDEFS definitions in SLmake.inc to match your library archiver, compiler # and the options to be used. # # The command # make # without any arguments creates the library of precisions defined by the # environment variable PRECISIONS as well as the corresponding testing # executables, # make lib # creates only the library, # make exe # creates only the testing executables. # make example # creates only the example # # The name of the library is defined in the file called SLmake.inc and # is created at this directory level. # # To remove the object files after the library and testing executables # are created, enter # make clean # ############################################################################ all: lib exe example lib: blacslib toolslib pblaslib redistlib scalapacklib exe: blacsexe pblasexe redistexe scalapackexe clean: cleanlib cleanexe cleanexample blacslib: ( cd BLACS; $(MAKE) lib ) pblaslib: ( cd PBLAS/SRC; $(MAKE) $(PRECISIONS) ) redistlib: ( cd REDIST/SRC; $(MAKE) integer $(PRECISIONS) ) scalapacklib: ( cd SRC; $(MAKE) $(PRECISIONS) ) toolslib: ( cd TOOLS; $(MAKE) $(PRECISIONS) ) blacsexe: ( cd BLACS; $(MAKE) tester ) pblasexe: ( cd PBLAS/TESTING; $(MAKE) $(PRECISIONS) ) ( cd PBLAS/TIMING; $(MAKE) $(PRECISIONS) ) scalapackexe: ( cd TESTING/LIN; $(MAKE) $(PRECISIONS) ) ( cd TESTING/EIG; $(MAKE) $(PRECISIONS) ) redistexe: ( cd REDIST/TESTING; $(MAKE) integer $(PRECISIONS) ) example: ( cd EXAMPLE; $(MAKE) $(PRECISIONS) ) cleanexe: ( cd PBLAS/TESTING; $(MAKE) clean ) ( cd PBLAS/TIMING; $(MAKE) clean ) ( cd TESTING/LIN; $(MAKE) clean ) ( cd TESTING/EIG; $(MAKE) clean ) ( cd REDIST/TESTING; $(MAKE) clean ) ( cd BLACS/TESTING; $(MAKE) clean ) ( cd TESTING; rm -f x* ) cleanlib: ( cd BLACS; $(MAKE) clean ) ( cd PBLAS/SRC; $(MAKE) clean ) ( cd SRC; $(MAKE) clean ) ( cd TOOLS; $(MAKE) clean ) ( cd REDIST/SRC; $(MAKE) clean ) ( rm -f $(SCALAPACKLIB) ) cleanexample: ( cd EXAMPLE; $(MAKE) clean ) scalapack-2.0.2/PBLAS/000755 000766 000024 00000000000 11750301613 014547 5ustar00juliestaff000000 000000 scalapack-2.0.2/README000644 000766 000024 00000011376 11750130340 014573 0ustar00juliestaff000000 000000 ===================== ScaLAPACK README FILE ===================== VERSION 1.0 : February 28, 1995 VERSION 1.1 : March 20, 1995 VERSION 1.2 : May 10, 1996 VERSION 1.3 : June 5, 1996 VERSION 1.4 : November 17, 1996 VERSION 1.5 : May 1, 1997 VERSION 1.6 : November 15, 1997 VERSION 1.7 : August, 2001 VERSION 1.8 : April 2007 VERSION 2.0 : November, 2011 VERSION 2.0.1 : January, 2012 VERSION 2.0.2 : May, 2012 ScaLAPACK, or Scalable LAPACK, is a library of high performance linear algebra routines for distributed memory computers supporting MPI. The complete ScaLAPACK package is freely available on netlib and can be obtained via the World Wide Web or anonymous ftp. http://www.netlib.org/scalapack/ ScaLAPACK, version 2.0, includes routines for the solution of dense, band, and tridiagonal linear systems of equations, condition estimation and iterative refinement, for LU and Cholesky factorization, matrix inversion, full-rank linear least squares problems, orthogonal and generalized orthogonal factorizations, orthogonal transformation routines, reductions to upper Hessenberg, bidiagonal and tridiagonal form, reduction of a symmetric-definite/ Hermitian-definite generalized eigenproblem to standard form, the symmetric/Hermitian, divide-and-conquer symmetric/Hermitian, generalized symmetric/Hermitian and the nonsymmetric eigenproblem, and the singular value decomposition. With the exception of the singular value decomposition, most routines are available in four types: single precision real, double precision real, single precision complex, and double precision complex. New in version 2.0: - ScaLAPACK now only supports MPI. - The BLACS is now part of ScaLAPACK, and is compiled into the ScaLAPACK library. It is no longer necessary to link against BLACS libraries. - Building ScaLAPACK using cmake is now supported. - New MRRR Symmetric Eigenvalue Problem routines are included: pssyevr, pdsyevr, pcheevr and pzheevr. - New Nonsymmetric Eigenvalue Problem QR routines for computing eigenvalues of a Hessenberg matrix are included for real matrices: pshseqr and pdhseqr. Unless otherwise noted, the current scalapack.tgz on netlib contains all available updates. Errata for ScaLAPACK (source code and documentation) can be found at: http://www.netlib.org/scalapack/errata.html ScaLAPACK example programs can be found at: http://www.netlib.org/scalapack/examples/ A basic example is included in the EXAMPLE directory. The ScaLAPACK User's Guide for ScaLAPACK version 1.5 is available from SIAM at: http://www.ec-securehost.com/SIAM/SE04.html To view an HTML version of the Users' Guide for version 1.5, see: http://www.netlib.org/scalapack/slug/ A number of technical reports were written during the development of ScaLAPACK and published as LAPACK Working Notes by the University of Tennessee. These working notes are available at: http://www.netlib.org/lapack/lawns/ All questions/comments should be directed to scalapack@cs.utk.edu. ---------------------------------------------------------------------- The Makefiles in ScaLAPACK and its support libraries assume the basic directory structure below: SCALAPACK/ _____________________________/ | \______________________________________ / | | | | SLmake.inc | | | \ | | | | | | | | | | | | | | BLACS/ EXAMPLE/ PBLAS/ REDIST/ SRC/ TESTING/ TOOLS/ / Input \ / Files & \ /Executables\ NOTE: It is assumed that the BLAS and LAPACK libraries (and MPI) are available on your machine. These libraries are NOT included with this distribution, and may be obtained at the sites below. http://www.netlib.org/blas/ http://www.netlib.org/lapack/ http://www.mcs.anl.gov/mpi/mpich/ http://www.lam-mpi.org/ http://www.open-mpi.org/ All ScaLAPACK routines -- driver, computational, and auxiliary -- can be found in the SRC/ directory. Testing routines and input files can be found in the TESTING/ directory. All machine-specific parameters and the locations of BLAS and LAPACK libraries are specified in the SCALAPACK/SLmake.inc file. This include file is then referenced in all subdirectory Makefiles. Once the include file has been modified, the entire installation process (including the building of testing executables) can be performed by typing ``make'' in the top-level ScaLAPACK directory. scalapack-2.0.2/REDIST/000755 000766 000024 00000000000 11750301601 014675 5ustar00juliestaff000000 000000 scalapack-2.0.2/scalapack.pc.in000644 000766 000024 00000000334 11656312637 016576 0ustar00juliestaff000000 000000 prefix=@prefix@ libdir=@libdir@ Name: scalapack Description: SCALAPACK reference implementation Version: @SCALAPACK_VERSION@ URL: http://www.netlib.org/scalapack/ Libs: -L${libdir} -lscalapack Requires: mpi lapack blas scalapack-2.0.2/scalapack_build.cmake000644 000766 000024 00000017066 11745717064 020042 0ustar00juliestaff000000 000000 cmake_minimum_required(VERSION 2.8) ################################################################### # The values in this section must always be provided ################################################################### if(UNIX) if(NOT compiler) set(compiler gcc) endif(NOT compiler) if(NOT c_compiler) set(c_compiler gcc) endif(NOT c_compiler) if(NOT full_compiler) set(full_compiler g++) endif(NOT full_compiler) endif(UNIX) if(EXISTS "/proc/cpuinfo") set(parallel 1) file(STRINGS "/proc/cpuinfo" CPUINFO) foreach(line ${CPUINFO}) if("${line}" MATCHES processor) math(EXPR parallel "${parallel} + 1") endif() endforeach(line) endif() if(WIN32) set(VSLOCATIONS "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\6.0\\Setup;VsCommonDir]/MSDev98/Bin" "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\7.0\\Setup\\VS;EnvironmentDirectory]" "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\7.1\\Setup\\VS;EnvironmentDirectory]" "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\8.0;InstallDir]" "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\8.0\\Setup;Dbghelp_path]" "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\9.0\\Setup\\VS;EnvironmentDirectory]" ) set(GENERATORS "Visual Studio 6" "Visual Studio 7" "Visual Studio 7 .NET 2003" "Visual Studio 8 2005" "Visual Studio 8 2005" "Visual Studio 9 2008" ) set(vstype 0) foreach(p ${VSLOCATIONS}) get_filename_component(VSPATH ${p} PATH) if(NOT "${VSPATH}" STREQUAL "/" AND EXISTS "${VSPATH}") message(" found VS install = ${VSPATH}") set(genIndex ${vstype}) endif() math(EXPR vstype "${vstype} +1") endforeach() if(NOT DEFINED genIndex) message(FATAL_ERROR "Could not find installed visual stuido") endif() list(GET GENERATORS ${genIndex} GENERATOR) set(CTEST_CMAKE_GENERATOR "${GENERATOR}") message("${CTEST_CMAKE_GENERATOR} - found") set(compiler cl) endif(WIN32) find_program(HOSTNAME NAMES hostname) find_program(UNAME NAMES uname) # Get the build name and hostname exec_program(${HOSTNAME} ARGS OUTPUT_VARIABLE hostname) string(REGEX REPLACE "[/\\\\+<> #]" "-" hostname "${hostname}") message("HOSTNAME: ${hostname}") # default to parallel 1 if(NOT DEFINED parallel) set(parallel 1) endif(NOT DEFINED parallel) # find SVN find_program(SVN svn PATHS $ENV{HOME}/bin /vol/local/bin) if(NOT SVN) message(FATAL_ERROR "SVN not found") endif() set(CTEST_UPDATE_COMMAND ${SVN}) macro(getuname name flag) exec_program("${UNAME}" ARGS "${flag}" OUTPUT_VARIABLE "${name}") string(REGEX REPLACE "[/\\\\+<> #]" "-" "${name}" "${${name}}") string(REGEX REPLACE "^(......|.....|....|...|..|.).*" "\\1" "${name}" "${${name}}") endmacro(getuname) getuname(osname -s) getuname(osver -v) getuname(osrel -r) getuname(cpu -m) if("${osname}" MATCHES Darwin) find_program(SW_VER sw_vers) execute_process(COMMAND "${SW_VER}" -productVersion OUTPUT_VARIABLE osver) string(REPLACE "\n" "" osver "${osver}") set(osname "MacOSX") set(osrel "") if("${cpu}" MATCHES "Power") set(cpu "ppc") endif("${cpu}" MATCHES "Power") endif("${osname}" MATCHES Darwin) if(NOT compiler) message(FATAL_ERROR "compiler must be set") endif(NOT compiler) set(BUILDNAME "${osname}${osver}${osrel}${cpu}-${compiler}") message("BUILDNAME: ${BUILDNAME}") # this is the cvs module name that should be checked out set (CTEST_MODULE_NAME scalapack) set (CTEST_DIR_NAME "${CTEST_MODULE_NAME}SVN") # Settings: message("NOSPACES = ${NOSPACES}") if(NOSPACES) set(CTEST_DASHBOARD_ROOT "$ENV{HOME}/Dashboards/MyTests-${BUILDNAME}") else(NOSPACES) set(CTEST_DASHBOARD_ROOT "$ENV{HOME}/Dashboards/My Tests-${BUILDNAME}") endif(NOSPACES) set(CTEST_SITE "${hostname}") set(CTEST_BUILD_NAME "${BUILDNAME}") set(CTEST_TEST_TIMEOUT "600") # CVS command and the checkout command if(NOT EXISTS "${CTEST_DASHBOARD_ROOT}/${CTEST_DIR_NAME}") set(CTEST_CHECKOUT_COMMAND "\"${CTEST_UPDATE_COMMAND}\" co https://icl.cs.utk.edu/svn/scalapack-dev/scalapack/trunk ${CTEST_DIR_NAME}") endif(NOT EXISTS "${CTEST_DASHBOARD_ROOT}/${CTEST_DIR_NAME}") # Set the generator and build configuration if(NOT DEFINED CTEST_CMAKE_GENERATOR) set(CTEST_CMAKE_GENERATOR "Unix Makefiles") endif(NOT DEFINED CTEST_CMAKE_GENERATOR) set(CTEST_PROJECT_NAME "ScaLAPACK") set(CTEST_BUILD_CONFIGURATION "Release") # Extra special variables set(ENV{DISPLAY} "") if(CTEST_CMAKE_GENERATOR MATCHES Makefiles) set(ENV{CC} "${c_compiler}") set(ENV{FC} "${f_compiler}") set(ENV{CXX} "${full_compiler}") endif(CTEST_CMAKE_GENERATOR MATCHES Makefiles) #---------------------------------------------------------------------------------- # Should not need to edit under this line #---------------------------------------------------------------------------------- # if you do not want to use the default location for a # dashboard then set this variable to the directory # the dashboard should be in make_directory("${CTEST_DASHBOARD_ROOT}") # these are the the name of the source and binary directory on disk. # They will be appended to DASHBOARD_ROOT set(CTEST_SOURCE_DIRECTORY "${CTEST_DASHBOARD_ROOT}/${CTEST_DIR_NAME}") set(CTEST_BINARY_DIRECTORY "${CTEST_SOURCE_DIRECTORY}-${CTEST_BUILD_NAME}") set(CTEST_NOTES_FILES "${CTEST_NOTES_FILES}" "${CMAKE_CURRENT_LIST_FILE}" ) # check for parallel if(parallel GREATER 1) if(NOT CTEST_BUILD_COMMAND) set(CTEST_BUILD_COMMAND "make -j${parallel} -i") endif(NOT CTEST_BUILD_COMMAND) message("Use parallel build") message("CTEST_BUILD_COMMAND: ${CTEST_BUILD_COMMAND}") message("CTEST_CONFIGURE_COMMAND: ${CTEST_CONFIGURE_COMMAND}") endif(parallel GREATER 1) ################################################################### # Values for the cmake build ################################################################### set( CACHE_CONTENTS " SITE:STRING=${hostname} BUILDNAME:STRING=${BUILDNAME} DART_ROOT:PATH= SVNCOMMAND:FILEPATH=${CTEST_UPDATE_COMMAND} DROP_METHOD:STRING=https DART_TESTING_TIMEOUT:STRING=${CTEST_TEST_TIMEOUT} # Specific Fortran Compiler (uncomment and add flags directly after = ) #CMAKE_Fortran_COMPILER:STRING= # Specific Fortran Compiler Flags (uncomment and add flags directly after = ) #CMAKE_Fortran_FLAGS:STRING= # Use Reference BLAS and LAPACK by default USE_OPTIMIZED_LAPACK_BLAS:OPTION=OFF " ) ########################################################################## # wipe the binary dir message("Remove binary directory...") ctest_empty_binary_directory("${CTEST_BINARY_DIRECTORY}") message("CTest Directory: ${CTEST_DASHBOARD_ROOT}") message("Initial checkout: ${CTEST_CVS_CHECKOUT}") message("Initial cmake: ${CTEST_CMAKE_COMMAND}") message("CTest command: ${CTEST_COMMAND}") # this is the initial cache to use for the binary tree, be careful to escape # any quotes inside of this string if you use it file(WRITE "${CTEST_BINARY_DIRECTORY}/CMakeCache.txt" "${CACHE_CONTENTS}") message("Start dashboard...") ctest_start(Nightly) #ctest_start(Experimental) message(" Update") ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}" RETURN_VALUE res) message(" Configure") ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}" RETURN_VALUE res) message("read custom files after configure") ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}") message(" Build") ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}" RETURN_VALUE res) message(" Test") ctest_test(BUILD "${CTEST_BINARY_DIRECTORY}" RETURN_VALUE res) message(" Submit") ctest_submit(RETURN_VALUE res) message(" All done") scalapack-2.0.2/SLmake.inc.example000644 000766 000024 00000002717 11654025546 017231 0ustar00juliestaff000000 000000 ############################################################################ # # Program: ScaLAPACK # # Module: SLmake.inc # # Purpose: Top-level Definitions # # Creation date: February 15, 2000 # # Modified: October 13, 2011 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ # # C preprocessor definitions: set CDEFS to one of the following: # # -DNoChange (fortran subprogram names are lower case without any suffix) # -DUpCase (fortran subprogram names are upper case without any suffix) # -DAdd_ (fortran subprogram names are lower case with "_" appended) CDEFS = -DAdd_ # # The fortran and C compilers, loaders, and their flags # FC = mpif90 CC = mpicc NOOPT = -O0 FCFLAGS = -O3 CCFLAGS = -O3 FCLOADER = $(FC) CCLOADER = $(CC) FCLOADFLAGS = $(FCFLAGS) CCLOADFLAGS = $(CCFLAGS) # # The archiver and the flag(s) to use when building archive (library) # Also the ranlib routine. If your system has no ranlib, set RANLIB = echo # ARCH = ar ARCHFLAGS = cr RANLIB = ranlib # # The name of the ScaLAPACK library to be created # SCALAPACKLIB = libscalapack.a # # BLAS, LAPACK (and possibly other) libraries needed for linking test programs # BLASLIB = -lblas LAPACKLIB = -llapack LIBS = $(LAPACKLIB) $(BLASLIB) scalapack-2.0.2/SRC/000755 000766 000024 00000000000 11750301600 014331 5ustar00juliestaff000000 000000 scalapack-2.0.2/TESTING/000755 000766 000024 00000000000 11750301630 015022 5ustar00juliestaff000000 000000 scalapack-2.0.2/TOOLS/000755 000766 000024 00000000000 11750301604 014606 5ustar00juliestaff000000 000000 scalapack-2.0.2/TOOLS/ccdotc.f000644 000766 000024 00000001561 10363532303 016220 0ustar00juliestaff000000 000000 SUBROUTINE CCDOTC( N, DOTC, X, INCX, Y, INCY ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, INCY, N COMPLEX DOTC * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * .. * * Purpose * ======= * * CCDOTC is a simple FORTRAN wrapper around the BLAS function * CDOTC returning the result in the parameter list instead. * * ===================================================================== * * .. External Functions .. COMPLEX CDOTC EXTERNAL CDOTC * .. * .. Executable Statements .. * DOTC = CDOTC( N, X, INCX, Y, INCY ) * RETURN * * End of CCDOTC * END scalapack-2.0.2/TOOLS/ccdotu.f000644 000766 000024 00000001561 10363532303 016242 0ustar00juliestaff000000 000000 SUBROUTINE CCDOTU( N, DOTU, X, INCX, Y, INCY ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, INCY, N COMPLEX DOTU * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * .. * * Purpose * ======= * * CCDOTU is a simple FORTRAN wrapper around the BLAS function * CDOTU returning the result in the parameter list instead. * * ===================================================================== * * .. External Functions .. COMPLEX CDOTU EXTERNAL CDOTU * .. * .. Executable Statements .. * DOTU = CDOTU( N, X, INCX, Y, INCY ) * RETURN * * End of CCDOTU * END scalapack-2.0.2/TOOLS/chk1mat.f000644 000766 000024 00000014027 10363532303 016312 0ustar00juliestaff000000 000000 SUBROUTINE CHK1MAT( MA, MAPOS0, NA, NAPOS0, IA, JA, DESCA, $ DESCAPOS0, INFO ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER DESCAPOS0, IA, INFO, JA, MA, MAPOS0, NA, NAPOS0 * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * CHK1MAT checks that the values associated with one distributed matrix * make sense from a local viewpoint * * Arguments * ========= * * MA (global input) INTEGER * The number or matrix rows of A being operated on. * * MAPOS0 (global input) INTEGER * Where in the calling routine's parameter list MA appears. * * NA (global input) INTEGER * The number of matrix columns of A being operated on. * * NAPOS0 (global input) INTEGER * Where in the calling routine's parameter list NA appears. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * DESCAPOS0 (global input) INTEGER * Where in the calling routine's parameter list DESCA * appears. Note that we assume IA and JA are respectively 2 * and 1 entries behind DESCA. * * INFO (local input/local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) * .. * .. Local Scalars .. INTEGER DESCAPOS, IAPOS, JAPOS, MAPOS, NAPOS, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MAX * .. * .. Executable Statements .. * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the des- * criptor multiplier * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Figure where in parameter list each parameter was, factoring in * descriptor multiplier * MAPOS = MAPOS0 * DESCMULT NAPOS = NAPOS0 * DESCMULT IAPOS = (DESCAPOS0-2) * DESCMULT JAPOS = (DESCAPOS0-1) * DESCMULT DESCAPOS = DESCAPOS0 * DESCMULT * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * * Check that matrix values make sense from local viewpoint * IF( DESCA( DTYPE_ ) .NE. BLOCK_CYCLIC_2D ) THEN INFO = MIN( INFO, DESCAPOS+DTYPE_ ) ELSE IF( MA.LT.0 ) THEN INFO = MIN( INFO, MAPOS ) ELSE IF( NA.LT.0 ) THEN INFO = MIN( INFO, NAPOS ) ELSE IF( IA.LT.1 ) THEN INFO = MIN( INFO, IAPOS ) ELSE IF( JA.LT.1 ) THEN INFO = MIN( INFO, JAPOS ) ELSE IF( DESCA( MB_ ).LT.1 ) THEN INFO = MIN( INFO, DESCAPOS+MB_ ) ELSE IF( DESCA( NB_ ).LT.1 ) THEN INFO = MIN( INFO, DESCAPOS+NB_ ) ELSE IF( DESCA( RSRC_ ).LT.0 .OR. DESCA( RSRC_ ).GE.NPROW ) THEN INFO = MIN( INFO, DESCAPOS+RSRC_ ) ELSE IF( DESCA( CSRC_ ).LT.0 .OR. DESCA( CSRC_ ).GE.NPCOL ) THEN INFO = MIN( INFO, DESCAPOS+CSRC_ ) ELSE IF( DESCA( LLD_ ).LT.1 ) THEN INFO = MIN( INFO, DESCAPOS+LLD_ ) ELSE IF( DESCA( LLD_ ) .LT. $ NUMROC( DESCA( M_ ), DESCA( MB_ ), MYROW, DESCA(RSRC_), $ NPROW ) ) THEN IF( NUMROC( DESCA( N_ ), DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) .GT. 0 ) $ INFO = MIN( INFO, DESCAPOS+LLD_ ) END IF * IF( MA.EQ.0 .OR. NA.EQ.0 ) THEN * * NULL matrix, relax some checks * IF( DESCA(M_).LT.0 ) $ INFO = MIN( INFO, DESCAPOS+M_ ) IF( DESCA(N_).LT.0 ) $ INFO = MIN( INFO, DESCAPOS+N_ ) * ELSE * * more rigorous checks for non-degenerate matrices * IF( DESCA( M_ ).LT.1 ) THEN INFO = MIN( INFO, DESCAPOS+M_ ) ELSE IF( DESCA( N_ ).LT.1 ) THEN INFO = MIN( INFO, DESCAPOS+N_ ) ELSE IF( IA.GT.DESCA( M_ ) ) THEN INFO = MIN( INFO, IAPOS ) ELSE IF( JA.GT.DESCA( N_ ) ) THEN INFO = MIN( INFO, JAPOS ) ELSE IF( IA+MA-1.GT.DESCA( M_ ) ) $ INFO = MIN( INFO, MAPOS ) IF( JA+NA-1.GT.DESCA( N_ ) ) $ INFO = MIN( INFO, NAPOS ) END IF END IF * END IF * * Prepare output: set info = 0 if no error, and divide by * DESCMULT if error is not in a descriptor entry * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * RETURN * * End CHK1MAT * END scalapack-2.0.2/TOOLS/clatcpy.f000644 000766 000024 00000004434 10363532303 016422 0ustar00juliestaff000000 000000 SUBROUTINE CLATCPY( UPLO, M, N, A, LDA, B, LDB ) * * * .. Scalar Arguments .. CHARACTER UPLO INTEGER LDA, LDB, M, N * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CLATCPY copies all or part of a two-dimensional matrix A to another * matrix B in transpose form. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies the part of the matrix A to be copied to B. * = 'U': Upper triangular part * = 'L': Lower triangular part * Otherwise: All of the matrix A * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input) DOUBLE PRECISION array, dimension (LDA,N) * The m by n matrix A. If UPLO = 'U', only the upper triangle * or trapezoid is accessed; if UPLO = 'L', only the lower * triangle or trapezoid is accessed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * B (output) DOUBLE PRECISION array, dimension (LDB,M) * On exit, B = A^T in the locations specified by UPLO. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN INTRINSIC CONJG * * .. * .. Executable Statements .. * IF( LSAME( UPLO, 'U' ) ) THEN DO 20 J = 1, N DO 10 I = 1, MIN( J, M ) B( J, I ) = CONJG( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN DO 40 J = 1, N DO 30 I = J, M B( J, I ) = CONJG( A( I, J ) ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( J, I ) = CONJG( A( I, J ) ) 50 CONTINUE 60 CONTINUE END IF RETURN * * End of CLATCPY * END scalapack-2.0.2/TOOLS/CMakeLists.txt000644 000766 000024 00000003151 11661300033 017342 0ustar00juliestaff000000 000000 add_subdirectory(LAPACK) set (ATOOLS iceil.f ilacpy.f ilcm.f indxg2p.f indxg2l.f indxl2g.f infog1l.f infog2l.f npreroc.f numroc.f chk1mat.f pchkxmat.f sltimer.f desc_convert.f descinit.f descset.f SL_init.f) set (ITOOLS picol2row.f pirow2col.f pilaprnt.f pitreecomb.f pifillpad.f pichekpad.f pielset.f pielset2.f pielget.f) set (STOOLS dsnrm2.f dsasum.f slatcpy.f ssdot.f smatadd.f psmatadd.f pscol2row.f psrow2col.f pslaprnt.f pstreecomb.f psfillpad.f pschekpad.f pselset.f pselset2.f pselget.f pslaread.f pslawrite.f) set (DTOOLS dddot.f dlatcpy.f dmatadd.f pdmatadd.f pdcol2row.f pdrow2col.f pdlaprnt.f pdtreecomb.f pdfillpad.f pdchekpad.f pdelset.f pdelset2.f pdelget.f pdlaread.f pdlawrite.f) set (CTOOLS dscnrm2.f dscasum.f ccdotu.f ccdotc.f clatcpy.f cmatadd.f pcmatadd.f pccol2row.f pcrow2col.f pclaprnt.f pctreecomb.f pcfillpad.f pcchekpad.f pcelset.f pcelset2.f pcelget.f pclaread.f pclawrite.f) set (ZTOOLS zzdotu.f zzdotc.f zlatcpy.f zmatadd.f pzmatadd.f pzcol2row.f pzrow2col.f pzlaprnt.f pztreecomb.f pzfillpad.f pzchekpad.f pzelset.f pzelset2.f pzelget.f pzlaread.f pzlawrite.f) set(tools ${ATOOLS} ${ITOOLS} ${STOOLS} ${DTOOLS} ${CTOOLS} ${ZTOOLS}) set(tools-C reshape.c SL_gridreshape.c ) scalapack-2.0.2/TOOLS/cmatadd.f000644 000766 000024 00000010346 10363532303 016357 0ustar00juliestaff000000 000000 SUBROUTINE CMATADD( M, N, ALPHA, A, LDA, BETA, C, LDC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER LDA, LDC, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), C( LDC, * ) * .. * * Purpose * ======= * * CMATADD performs the following local matrix-matrix operation * * C := alpha * A + beta * C, * * where alpha and beta are scalars, and A and C are m by n arrays. * * Arguments * ========= * * M (local input) INTEGER * The number of rows of the array A. M >= 0. * * N (local input) INTEGER * The number of columns of the array A. N >= 0. * * ALPHA (local input) COMPLEX * The scalar ALPHA. * * A (local input) COMPLEX * Array, dimension (LDA,*), the array A. * * LDA (local input) INTEGER * The leading dimension of the array A, LDA >= MAX(1, M) * * BETA (local input) COMPLEX * The scalar BETA. * * C (local input/local output) COMPLEX * Array, dimension (LDC,*), the array C. * * LDC (local input) INTEGER * The leading dimension of the array C, LDC >= MAX(1, M) * * ===================================================================== * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * * Quick return if possible. * IF( (M.EQ.0).OR.(N.EQ.0).OR.((ALPHA.EQ.ZERO).AND.(BETA.EQ.ONE)) ) $ RETURN * IF( N.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 10 I = 1, M C( I, 1 ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) 20 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN DO 30 I = 1, M C( I, 1 ) = A( I, 1 ) + C( I, 1 ) 30 CONTINUE ELSE DO 40 I = 1, M C( I, 1 ) = A( I, 1 ) + BETA*C( I, 1 ) 40 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN DO 50 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) + C( I, 1 ) 50 CONTINUE ELSE DO 60 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) + BETA*C( I, 1 ) 60 CONTINUE END IF END IF ELSE IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M C( I, J ) = ZERO 70 CONTINUE 80 CONTINUE ELSE DO 100 J = 1, N DO 90 I = 1, M C( I, J ) = ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN DO 120 J = 1, N DO 110 I = 1, M C( I, J ) = A( I, J ) + C( I, J ) 110 CONTINUE 120 CONTINUE ELSE DO 140 J = 1, N DO 130 I = 1, M C( I, J ) = A( I, J ) + BETA * C( I, J ) 130 CONTINUE 140 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN DO 160 J = 1, N DO 150 I = 1, M C( I, J ) = C( I, J ) + ALPHA * A( I, J ) 150 CONTINUE 160 CONTINUE ELSE DO 180 J = 1, N DO 170 I = 1, M C( I, J ) = ALPHA * A( I, J ) + BETA * C( I, J ) 170 CONTINUE 180 CONTINUE END IF END IF END IF * RETURN * * End of CMATADD * END scalapack-2.0.2/TOOLS/dddot.f000644 000766 000024 00000001547 10363532303 016063 0ustar00juliestaff000000 000000 SUBROUTINE DDDOT( N, DOT, X, INCX, Y, INCY ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, INCY, N DOUBLE PRECISION DOT * .. * .. Array Arguments .. DOUBLE PRECISION X( * ), Y( * ) * .. * * Purpose * ======= * * DDDOT is a simple FORTRAN wrapper around the BLAS function * DDOT returning the result in the parameter list instead. * * ===================================================================== * * .. External Functions .. DOUBLE PRECISION DDOT EXTERNAL DDOT * .. * .. Executable Statements .. * DOT = DDOT( N, X, INCX, Y, INCY ) * RETURN * * End of DDDOT * END scalapack-2.0.2/TOOLS/desc_convert.f000644 000766 000024 00000006013 11654631032 017437 0ustar00juliestaff000000 000000 SUBROUTINE DESC_CONVERT( DESC_IN, DESC_OUT, INFO ) * * * .. Array Arguments .. INTEGER DESC_IN( * ), DESC_OUT( * ), INFO * .. * * Purpose * ======= * * Converts descriptors from one type to another if they are compatible. * * Supports *ONLY* an output descriptor type of 1D_horizontal (type * number 501) or 1D_vertical (number 502). * Supports only one-dimensional 1xP input grids if descriptor_in is 2D. * * Arguments * ========= * * DESC_IN: (input) input descriptor * * DESC_OUT: (output) output descriptor (required to be 1D_horizontal * in this release). * * INFO: (output) return code * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * * .. Local Scalars .. INTEGER DESC_TYPE, DESC_TYPE_IN, ICTXT INTEGER CSRC, RSRC, MB, NB, LLDA INTEGER M, N, NPROW, NPCOL, IDUM1, IDUM2 * * .. External routines .. * EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * INFO = 0 * DESC_TYPE_IN = DESC_IN( 1 ) * * .. Initialize Variables .. * RSRC = 0 NB = 0 N = 0 MB = 0 M = 0 LLDA = 0 CSRC = 0 * IF( DESC_TYPE_IN .EQ. BLOCK_CYCLIC_2D ) THEN ICTXT = DESC_IN( CTXT_ ) RSRC = DESC_IN( RSRC_ ) CSRC = DESC_IN( CSRC_ ) MB = DESC_IN( MB_ ) NB = DESC_IN( NB_ ) LLDA = DESC_IN( LLD_ ) M = DESC_IN( M_ ) N = DESC_IN( N_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, IDUM1, IDUM2 ) ELSEIF ( DESC_TYPE_IN .EQ. 502 ) THEN ICTXT = DESC_IN( 2 ) RSRC = DESC_IN( 5 ) CSRC = 1 MB = DESC_IN( 4 ) NB = 1 LLDA = DESC_IN( 6 ) M = DESC_IN( 3 ) N = 1 NPROW = 0 NPCOL = 1 ELSEIF ( DESC_TYPE_IN .EQ. 501 ) THEN ICTXT = DESC_IN( 2 ) RSRC = 1 CSRC = DESC_IN( 5 ) MB = 1 NB = DESC_IN( 4 ) LLDA = DESC_IN( 6 ) M = 1 N = DESC_IN( 3 ) NPROW = 1 NPCOL = 0 ENDIF * * DESC_TYPE = DESC_OUT( 1 ) * IF( DESC_TYPE .EQ. 501 ) THEN IF( NPROW .NE. 1 )THEN INFO = -1 RETURN ENDIF DESC_OUT( 2 ) = ICTXT DESC_OUT( 5 ) = CSRC DESC_OUT( 4 ) = NB DESC_OUT( 6 ) = LLDA DESC_OUT( 3 ) = N ELSEIF( DESC_TYPE .EQ. 502 ) THEN IF( NPCOL .NE. 1 )THEN INFO = -1 RETURN ENDIF DESC_OUT( 2 ) = ICTXT DESC_OUT( 5 ) = RSRC DESC_OUT( 4 ) = MB DESC_OUT( 6 ) = LLDA DESC_OUT( 3 ) = M ENDIF * RETURN * * End of DESC_CONVERT * END scalapack-2.0.2/TOOLS/descinit.f000644 000766 000024 00000016100 10363532303 016556 0ustar00juliestaff000000 000000 SUBROUTINE DESCINIT( DESC, M, N, MB, NB, IRSRC, ICSRC, ICTXT, $ LLD, INFO ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICSRC, ICTXT, INFO, IRSRC, LLD, M, MB, N, NB * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * DESCINIT initializes the descriptor vector with the 8 input arguments * M, N, MB, NB, IRSRC, ICSRC, ICTXT, LLD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DESC (output) INTEGER array of dimension DLEN_. * The array descriptor of a distributed matrix to be set. * * M (global input) INTEGER * The number of rows in the distributed matrix. M >= 0. * * N (global input) INTEGER * The number of columns in the distributed matrix. N >= 0. * * MB (global input) INTEGER * The blocking factor used to distribute the rows of the * matrix. MB >= 1. * * NB (global input) INTEGER * The blocking factor used to distribute the columns of the * matrix. NB >= 1. * * IRSRC (global input) INTEGER * The process row over which the first row of the matrix is * distributed. 0 <= IRSRC < NPROW. * * ICSRC (global input) INTEGER * The process column over which the first column of the * matrix is distributed. 0 <= ICSRC < NPCOL. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation on the matrix. The context itself is global. * * LLD (local input) INTEGER * The leading dimension of the local array storing the local * blocks of the distributed matrix. LLD >= MAX(1,LOCr(M)). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Note * ==== * * If the routine can recover from an erroneous input argument, it will * return an acceptable descriptor vector. For example, if LLD = 0 on * input, DESC(LLD_) will contain the smallest leading dimension * required to store the specified M-by-N distributed matrix, INFO * will be set -9 in that case. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 IF( M.LT.0 ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( MB.LT.1 ) THEN INFO = -4 ELSE IF( NB.LT.1 ) THEN INFO = -5 ELSE IF( IRSRC.LT.0 .OR. IRSRC.GE.NPROW ) THEN INFO = -6 ELSE IF( ICSRC.LT.0 .OR. ICSRC.GE.NPCOL ) THEN INFO = -7 ELSE IF( NPROW.EQ.-1 ) THEN INFO = -8 ELSE IF( LLD.LT.MAX( 1, NUMROC( M, MB, MYROW, IRSRC, $ NPROW ) ) ) THEN INFO = -9 END IF * IF( INFO.NE.0 ) $ CALL PXERBLA( ICTXT, 'DESCINIT', -INFO ) * DESC( DTYPE_ ) = BLOCK_CYCLIC_2D DESC( M_ ) = MAX( 0, M ) DESC( N_ ) = MAX( 0, N ) DESC( MB_ ) = MAX( 1, MB ) DESC( NB_ ) = MAX( 1, NB ) DESC( RSRC_ ) = MAX( 0, MIN( IRSRC, NPROW-1 ) ) DESC( CSRC_ ) = MAX( 0, MIN( ICSRC, NPCOL-1 ) ) DESC( CTXT_ ) = ICTXT DESC( LLD_ ) = MAX( LLD, MAX( 1, NUMROC( DESC( M_ ), DESC( MB_ ), $ MYROW, DESC( RSRC_ ), NPROW ) ) ) * RETURN * * End DESCINIT * END scalapack-2.0.2/TOOLS/descset.f000644 000766 000024 00000012537 10363532303 016420 0ustar00juliestaff000000 000000 SUBROUTINE DESCSET( DESC, M, N, MB, NB, IRSRC, ICSRC, ICTXT, $ LLD ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICSRC, ICTXT, IRSRC, LLD, M, MB, N, NB * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * DESCSET initializes a descriptor vector with the 8 input arguments * M, N, MB, NB, IRSRC, ICSRC, ICTXT, LLD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DESC (output) INTEGER array of dimension DLEN. * The array descriptor of a distributed matrix to be set. * * M (global input) INTEGER * The number of rows in the distributed matrix. M >= 0. * * N (global input) INTEGER * The number of columns in the distributed matrix. N >= 0. * * MB (global input) INTEGER * The blocking factor used to distribute the rows of the * matrix. MB >= 1. * * NB (global input) INTEGER * The blocking factor used to distribute the columns of the * matrix. NB >= 1. * * IRSRC (global input) INTEGER * The process row over which the first row of the matrix is * distributed. 0 <= IRSRC < NPROW. * * ICSRC (global input) INTEGER * The process column over which the first column of the * matrix is distributed. 0 <= ICSRC < NPCOL. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation on the matrix. The context itself is global. * * LLD (local input) INTEGER * The leading dimension of the local array storing the local * blocks of the distributed matrix. LLD >= MAX(1,LOCr(M)). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * * .. Executable Statements .. * DESC( DTYPE_ ) = BLOCK_CYCLIC_2D DESC( M_ ) = M DESC( N_ ) = N DESC( MB_ ) = MB DESC( NB_ ) = NB DESC( RSRC_ ) = IRSRC DESC( CSRC_ ) = ICSRC DESC( CTXT_ ) = ICTXT DESC( LLD_ ) = LLD * RETURN * * End DESCSET * END scalapack-2.0.2/TOOLS/dlatcpy.f000644 000766 000024 00000004342 10363532303 016421 0ustar00juliestaff000000 000000 SUBROUTINE DLATCPY( UPLO, M, N, A, LDA, B, LDB ) * * * .. Scalar Arguments .. CHARACTER UPLO INTEGER LDA, LDB, M, N * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DLATCPY copies all or part of a two-dimensional matrix A to another * matrix B in transpose form. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies the part of the matrix A to be copied to B. * = 'U': Upper triangular part * = 'L': Lower triangular part * Otherwise: All of the matrix A * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input) DOUBLE PRECISION array, dimension (LDA,N) * The m by n matrix A. If UPLO = 'U', only the upper triangle * or trapezoid is accessed; if UPLO = 'L', only the lower * triangle or trapezoid is accessed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * B (output) DOUBLE PRECISION array, dimension (LDB,M) * On exit, B = A^T in the locations specified by UPLO. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * * .. * .. Executable Statements .. * IF( LSAME( UPLO, 'U' ) ) THEN DO 20 J = 1, N DO 10 I = 1, MIN( J, M ) B( J, I ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN DO 40 J = 1, N DO 30 I = J, M B( J, I ) = A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( J, I ) = A( I, J ) 50 CONTINUE 60 CONTINUE END IF RETURN * * End of DLATCPY * END scalapack-2.0.2/TOOLS/dmatadd.f000644 000766 000024 00000010327 10363532303 016357 0ustar00juliestaff000000 000000 SUBROUTINE DMATADD( M, N, ALPHA, A, LDA, BETA, C, LDC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER LDA, LDC, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), C( LDC, * ) * .. * * Purpose * ======= * * DMATADD performs the following local matrix-matrix operation * * C := alpha * A + beta * C, * * where alpha and beta are scalars, and A and C are m by n arrays. * * Arguments * ========= * * M (local input) INTEGER * The number of rows of the array A. M >= 0. * * N (local input) INTEGER * The number of columns of the array A. N >= 0. * * ALPHA (local input) DOUBLE PRECISION * The scalar ALPHA. * * A (local input) DOUBLE PRECISION * Array, dimension (LDA,*), the array A. * * LDA (local input) INTEGER * The leading dimension of the array A, LDA >= MAX(1, M) * * BETA (local input) DOUBLE PRECISION * The scalar BETA. * * C (local input/local output) DOUBLE PRECISION * Array, dimension (LDC,*), the array C. * * LDC (local input) INTEGER * The leading dimension of the array C, LDC >= MAX(1, M) * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * * Quick return if possible. * IF( (M.EQ.0).OR.(N.EQ.0).OR.((ALPHA.EQ.ZERO).AND.(BETA.EQ.ONE)) ) $ RETURN * IF( N.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 10 I = 1, M C( I, 1 ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) 20 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN DO 30 I = 1, M C( I, 1 ) = A( I, 1 ) + C( I, 1 ) 30 CONTINUE ELSE DO 40 I = 1, M C( I, 1 ) = A( I, 1 ) + BETA*C( I, 1 ) 40 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN DO 50 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) + C( I, 1 ) 50 CONTINUE ELSE DO 60 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) + BETA*C( I, 1 ) 60 CONTINUE END IF END IF ELSE IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M C( I, J ) = ZERO 70 CONTINUE 80 CONTINUE ELSE DO 100 J = 1, N DO 90 I = 1, M C( I, J ) = ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN DO 120 J = 1, N DO 110 I = 1, M C( I, J ) = A( I, J ) + C( I, J ) 110 CONTINUE 120 CONTINUE ELSE DO 140 J = 1, N DO 130 I = 1, M C( I, J ) = A( I, J ) + BETA * C( I, J ) 130 CONTINUE 140 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN DO 160 J = 1, N DO 150 I = 1, M C( I, J ) = C( I, J ) + ALPHA * A( I, J ) 150 CONTINUE 160 CONTINUE ELSE DO 180 J = 1, N DO 170 I = 1, M C( I, J ) = ALPHA * A( I, J ) + BETA * C( I, J ) 170 CONTINUE 180 CONTINUE END IF END IF END IF * RETURN * * End of DMATADD * END scalapack-2.0.2/TOOLS/dsasum.f000644 000766 000024 00000001651 10363532303 016255 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION DSASUM( N, X, INCX ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, N * .. * .. Array Arguments .. REAL X( * ) * .. * * Purpose * ======= * * DSASUM is a simple FORTRAN wrapper around the BLAS function SASUM * returning the result as a double allowing it to be callable by C * programs. * * ===================================================================== * * .. External Functions .. REAL SASUM EXTERNAL SASUM * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Executable Statements .. * DSASUM = DBLE( SASUM( N, X, INCX ) ) * RETURN * * End of DSASUM * END scalapack-2.0.2/TOOLS/dscasum.f000644 000766 000024 00000001661 10363532303 016421 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION DSCASUM( N, X, INCX ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, N * .. * .. Array Arguments .. COMPLEX X( * ) * .. * * Purpose * ======= * * DSCASUM is a simple FORTRAN wrapper around the BLAS function SCASUM * returning the result as a double allowing it to be callable by C * programs. * * ===================================================================== * * .. External Functions .. REAL SCASUM EXTERNAL SCASUM * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Executable Statements .. * DSCASUM = DBLE( SCASUM( N, X, INCX ) ) * RETURN * * End of DSCASUM * END scalapack-2.0.2/TOOLS/dscnrm2.f000644 000766 000024 00000001661 10363532303 016332 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION DSCNRM2( N, X, INCX ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, N * .. * .. Array Arguments .. COMPLEX X( * ) * .. * * Purpose * ======= * * DSCNRM2 is a simple FORTRAN wrapper around the BLAS function SCNRM2 * returning the result as a double allowing it to be callable by C * programs. * * ===================================================================== * * .. External Functions .. REAL SCNRM2 EXTERNAL SCNRM2 * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Executable Statements .. * DSCNRM2 = DBLE( SCNRM2( N, X, INCX ) ) * RETURN * * End of DSCNRM2 * END scalapack-2.0.2/TOOLS/dsnrm2.f000644 000766 000024 00000001651 10363532303 016166 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION DSNRM2( N, X, INCX ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, N * .. * .. Array Arguments .. REAL X( * ) * .. * * Purpose * ======= * * DSNRM2 is a simple FORTRAN wrapper around the BLAS function SNRM2 * returning the result as a double allowing it to be callable by C * programs. * * ===================================================================== * * .. External Functions .. REAL SNRM2 EXTERNAL SNRM2 * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Executable Statements .. * DSNRM2 = DBLE( SNRM2( N, X, INCX ) ) * RETURN * * End of DSNRM2 * END scalapack-2.0.2/TOOLS/iceil.f000644 000766 000024 00000001425 10363532303 016045 0ustar00juliestaff000000 000000 INTEGER FUNCTION ICEIL( INUM, IDENOM ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IDENOM, INUM * .. * * Purpose * ======= * * ICEIL returns the ceiling of the division of two integers. * * Arguments * ========= * * INUM (local input) INTEGER * The numerator, * * IDENOM (local input) INTEGER * and the denominator of the fraction to be evaluated. * * ===================================================================== * * .. Executable Statements .. * ICEIL = (INUM+IDENOM-1) / IDENOM * RETURN * * End of ICEIL * END scalapack-2.0.2/TOOLS/ilacpy.f000644 000766 000024 00000004634 10363532303 016246 0ustar00juliestaff000000 000000 SUBROUTINE ILACPY( UPLO, M, N, A, LDA, B, LDB ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER LDA, LDB, M, N * .. * .. Array Arguments .. INTEGER A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ILACPY copies all or part of a local array A to another array B. * * Arguments * ========= * * UPLO (local input) CHARACTER*1 * Specifies the part of the array A to be copied to B. * = 'U': Upper triangular part * = 'L': Lower triangular part * Otherwise: All of the array A * * M (local input) INTEGER * The number of rows of the array A. M >= 0. * * N (local input) INTEGER * The number of columns of the array A. N >= 0. * * A (local input) INTEGER * Array, dimension (LDA,N), the m by n array A. * If UPLO = 'U', only the upper triangle or trapezoid is * accessed; if UPLO = 'L', only the lower triangle or trapezoid * is accessed. * * LDA (local input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * B (local output) INTEGER * Array, dimension (LDB,N), on exit, B = A in the locations * specified by UPLO. * * LDB (local input) INTEGER * The leading dimension of the array B. LDB >= max(1,M). * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( LSAME( UPLO, 'U' ) ) THEN DO 20 J = 1, N DO 10 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN DO 40 J = 1, N DO 30 I = J, M B( I, J ) = A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( I, J ) = A( I, J ) 50 CONTINUE 60 CONTINUE END IF RETURN * * End of ILACPY * END scalapack-2.0.2/TOOLS/ilcm.f000644 000766 000024 00000002344 10363532303 015705 0ustar00juliestaff000000 000000 INTEGER FUNCTION ILCM( M, N ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER M, N * .. * * Purpose * ======= * * ILCM computes and returns the Least Common Multiple (LCM) of two * positive integers M and N. In fact the routine computes the greatest * common divisor (GCD) and use the fact that M*N = GCD*LCM. * * Arguments * ========= * * M (input) INTEGER * On entry, M >=0. Unchanged on exit. * * N (input) INTEGER * On entry, N >=0. Unchanged on exit. * * ===================================================================== * * .. Local Scalars .. INTEGER IA, IQ, IR * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IA = M ILCM = N ELSE IA = N ILCM = M ENDIF * 10 CONTINUE IQ = IA / ILCM IR = IA - IQ * ILCM IF( IR.EQ.0 ) THEN ILCM = ( M * N ) / ILCM RETURN END IF IA = ILCM ILCM = IR GO TO 10 * * End of ILCM * END scalapack-2.0.2/TOOLS/indxg2l.f000644 000766 000024 00000002740 10363532303 016330 0ustar00juliestaff000000 000000 INTEGER FUNCTION INDXG2L( INDXGLOB, NB, IPROC, ISRCPROC, NPROCS ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INDXGLOB, IPROC, ISRCPROC, NB, NPROCS * .. * * Purpose * ======= * * INDXG2L computes the local index of a distributed matrix entry * pointed to by the global index INDXGLOB. * * Arguments * ========= * * INDXGLOB (global input) INTEGER * The global index of the distributed matrix entry. * * NB (global input) INTEGER * Block size, size of the blocks the distributed matrix is * split into. * * IPROC (local dummy) INTEGER * Dummy argument in this case in order to unify the calling * sequence of the tool-routines. * * ISRCPROC (local dummy) INTEGER * Dummy argument in this case in order to unify the calling * sequence of the tool-routines. * * NPROCS (global input) INTEGER * The total number processes over which the distributed * matrix is distributed. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * INDXG2L = NB*((INDXGLOB-1)/(NB*NPROCS))+MOD(INDXGLOB-1,NB)+1 * RETURN * * End of INDXG2L * END scalapack-2.0.2/TOOLS/indxg2p.f000644 000766 000024 00000002732 10363532303 016335 0ustar00juliestaff000000 000000 INTEGER FUNCTION INDXG2P( INDXGLOB, NB, IPROC, ISRCPROC, NPROCS ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INDXGLOB, IPROC, ISRCPROC, NB, NPROCS * .. * * Purpose * ======= * * INDXG2P computes the process coordinate which posseses the entry of a * distributed matrix specified by a global index INDXGLOB. * * Arguments * ========= * * INDXGLOB (global input) INTEGER * The global index of the element. * * NB (global input) INTEGER * Block size, size of the blocks the distributed matrix is * split into. * * IPROC (local dummy) INTEGER * Dummy argument in this case in order to unify the calling * sequence of the tool-routines. * * ISRCPROC (global input) INTEGER * The coordinate of the process that possesses the first * row/column of the distributed matrix. * * NPROCS (global input) INTEGER * The total number processes over which the matrix is * distributed. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * INDXG2P = MOD( ISRCPROC + (INDXGLOB - 1) / NB, NPROCS ) * RETURN * * End of INDXG2P * END scalapack-2.0.2/TOOLS/indxl2g.f000644 000766 000024 00000003072 10363532303 016327 0ustar00juliestaff000000 000000 INTEGER FUNCTION INDXL2G( INDXLOC, NB, IPROC, ISRCPROC, NPROCS ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INDXLOC, IPROC, ISRCPROC, NB, NPROCS * .. * * Purpose * ======= * * INDXL2G computes the global index of a distributed matrix entry * pointed to by the local index INDXLOC of the process indicated by * IPROC. * * Arguments * ========= * * INDXLOC (global input) INTEGER * The local index of the distributed matrix entry. * * NB (global input) INTEGER * Block size, size of the blocks the distributed matrix is * split into. * * IPROC (local input) INTEGER * The coordinate of the process whose local array row or * column is to be determined. * * ISRCPROC (global input) INTEGER * The coordinate of the process that possesses the first * row/column of the distributed matrix. * * NPROCS (global input) INTEGER * The total number processes over which the distributed * matrix is distributed. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * INDXL2G = NPROCS*NB*((INDXLOC-1)/NB) + MOD(INDXLOC-1,NB) + $ MOD(NPROCS+IPROC-ISRCPROC, NPROCS)*NB + 1 * RETURN * * End of INDXL2G * END scalapack-2.0.2/TOOLS/infog1l.f000644 000766 000024 00000004325 10363532303 016321 0ustar00juliestaff000000 000000 SUBROUTINE INFOG1L( GINDX, NB, NPROCS, MYROC, ISRCPROC, LINDX, $ ROCSRC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER GINDX, ISRCPROC, LINDX, MYROC, NB, NPROCS, $ ROCSRC * .. * * Purpose * ======= * * INFOG1L computes the starting local indexes LINDX corresponding to * the distributed submatrix starting globally at the entry pointed by * GINDX. This routine returns the coordinates of the process in the * grid owning the submatrix entry of global index GINDX: ROCSRC. * INFOG1L is a 1-dimensional version of INFOG2L. * * Arguments * ========= * * GINDX (global input) INTEGER * The global starting index of the submatrix. * * NB (global input) INTEGER * The block size. * * NPROCS (global input) INTEGER * The total number of processes over which the distributed * submatrix is distributed. * * MYROC (local input) INTEGER * The coordinate of the process calling this routine. * * ISRCPROC (global input) INTEGER * The coordinate of the process having the first entry of * the distributed submatrix. * * LINDX (local output) INTEGER * The local starting indexes of the distributed submatrix. * * ROCSRC (global output) INTEGER * The coordinate of the process that possesses the first * row and column of the submatrix. * * ===================================================================== * * .. Local Scalars .. INTEGER GCPY, IBLK * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * GCPY = GINDX-1 IBLK = GCPY / NB ROCSRC = MOD( IBLK + ISRCPROC, NPROCS ) * LINDX = ( IBLK / NPROCS + 1 ) * NB + 1 * IF( MOD(MYROC+NPROCS-ISRCPROC,NPROCS).GE.MOD(IBLK, NPROCS) ) THEN IF( MYROC.EQ.ROCSRC ) $ LINDX = LINDX + MOD( GCPY, NB ) LINDX = LINDX - NB END IF * RETURN * * End of INFOG1L * END scalapack-2.0.2/TOOLS/infog2l.f000644 000766 000024 00000014641 10363532303 016324 0ustar00juliestaff000000 000000 SUBROUTINE INFOG2L( GRINDX, GCINDX, DESC, NPROW, NPCOL, MYROW, $ MYCOL, LRINDX, LCINDX, RSRC, CSRC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CSRC, GCINDX, GRINDX, LRINDX, LCINDX, MYCOL, $ MYROW, NPCOL, NPROW, RSRC * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * INFOG2L computes the starting local indexes LRINDX, LCINDX corres- * ponding to the distributed submatrix starting globally at the entry * pointed by GRINDX, GCINDX. This routine returns the coordinates in * the grid of the process owning the matrix entry of global indexes * GRINDX, GCINDX, namely RSRC and CSRC. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * GRINDX (global input) INTEGER * The global row starting index of the submatrix. * * GCINDX (global input) INTEGER * The global column starting index of the submatrix. * * DESC (input) INTEGER array of dimension DLEN_. * The array descriptor for the underlying distributed matrix. * * NPROW (global input) INTEGER * The total number of process rows over which the distributed * matrix is distributed. * * NPCOL (global input) INTEGER * The total number of process columns over which the * distributed matrix is distributed. * * MYROW (local input) INTEGER * The row coordinate of the process calling this routine. * * MYCOL (local input) INTEGER * The column coordinate of the process calling this routine. * * LRINDX (local output) INTEGER * The local rows starting index of the submatrix. * * LCINDX (local output) INTEGER * The local columns starting index of the submatrix. * * RSRC (global output) INTEGER * The row coordinate of the process that possesses the first * row and column of the submatrix. * * CSRC (global output) INTEGER * The column coordinate of the process that possesses the * first row and column of the submatrix. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CBLK, GCCPY, GRCPY, RBLK * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * GRCPY = GRINDX-1 GCCPY = GCINDX-1 * RBLK = GRCPY / DESC(MB_) CBLK = GCCPY / DESC(NB_) RSRC = MOD( RBLK + DESC(RSRC_), NPROW ) CSRC = MOD( CBLK + DESC(CSRC_), NPCOL ) * LRINDX = ( RBLK / NPROW + 1 ) * DESC(MB_) + 1 LCINDX = ( CBLK / NPCOL + 1 ) * DESC(NB_) + 1 * IF( MOD( MYROW+NPROW-DESC(RSRC_), NPROW ) .GE. $ MOD( RBLK, NPROW ) ) THEN IF( MYROW.EQ.RSRC ) $ LRINDX = LRINDX + MOD( GRCPY, DESC(MB_) ) LRINDX = LRINDX - DESC(MB_) END IF * IF( MOD( MYCOL+NPCOL-DESC(CSRC_), NPCOL ) .GE. $ MOD( CBLK, NPCOL ) ) THEN IF( MYCOL.EQ.CSRC ) $ LCINDX = LCINDX + MOD( GCCPY, DESC(NB_) ) LCINDX = LCINDX - DESC(NB_) END IF * RETURN * * End of INFOG2L * END scalapack-2.0.2/TOOLS/LAPACK/000755 000766 000024 00000000000 11750301603 015540 5ustar00juliestaff000000 000000 scalapack-2.0.2/TOOLS/Makefile000644 000766 000024 00000005425 11654025546 016267 0ustar00juliestaff000000 000000 ############################################################################ # # Program: ScaLAPACK # # Module: Makefile # # Purpose: Tools Makefile # # Creation date: March 20, 1995 # # Modified: February 15, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../SLmake.inc ATOOLS = iceil.o ilacpy.o ilcm.o indxg2p.o indxg2l.o \ indxl2g.o infog1l.o infog2l.o npreroc.o numroc.o \ chk1mat.o pchkxmat.o sltimer.o desc_convert.o \ descinit.o descset.o reshape.o SL_gridreshape.o \ SL_init.o ITOOLS = picol2row.o pirow2col.o pilaprnt.o pitreecomb.o pifillpad.o \ pichekpad.o pielset.o pielset2.o pielget.o STOOLS = dsnrm2.o dsasum.o slatcpy.o ssdot.o smatadd.o \ psmatadd.o pscol2row.o psrow2col.o pslaprnt.o pstreecomb.o \ psfillpad.o pschekpad.o pselset.o pselset2.o pselget.o \ pslaread.o pslawrite.o DTOOLS = dddot.o dlatcpy.o dmatadd.o pdmatadd.o pdcol2row.o \ pdrow2col.o pdlaprnt.o pdtreecomb.o pdfillpad.o pdchekpad.o \ pdelset.o pdelset2.o pdelget.o \ pdlaread.o pdlawrite.o CTOOLS = dscnrm2.o dscasum.o ccdotu.o ccdotc.o clatcpy.o \ cmatadd.o pcmatadd.o pccol2row.o pcrow2col.o pclaprnt.o \ pctreecomb.o pcfillpad.o pcchekpad.o pcelset.o pcelset2.o \ pcelget.o \ pclaread.o pclawrite.o ZTOOLS = zzdotu.o zzdotc.o zlatcpy.o zmatadd.o pzmatadd.o \ pzcol2row.o pzrow2col.o pzlaprnt.o pztreecomb.o pzfillpad.o \ pzchekpad.o pzelset.o pzelset2.o pzelget.o \ pzlaread.o pzlawrite.o all: single double complex complex16 slapackaux: ( cd LAPACK; $(MAKE) single ) dlapackaux: ( cd LAPACK; $(MAKE) double ) clapackaux: ( cd LAPACK; $(MAKE) complex ) zlapackaux: ( cd LAPACK; $(MAKE) complex16 ) integer: $(ATOOLS) $(ITOOLS) $(ARCH) $(ARCHFLAGS) ../$(SCALAPACKLIB) $(ATOOLS) $(ITOOLS) $(RANLIB) ../$(SCALAPACKLIB) single: slapackaux integer $(STOOLS) $(ARCH) $(ARCHFLAGS) ../$(SCALAPACKLIB) $(STOOLS) $(RANLIB) ../$(SCALAPACKLIB) double: dlapackaux integer $(DTOOLS) $(ARCH) $(ARCHFLAGS) ../$(SCALAPACKLIB) $(DTOOLS) $(RANLIB) ../$(SCALAPACKLIB) complex: clapackaux integer $(CTOOLS) $(ARCH) $(ARCHFLAGS) ../$(SCALAPACKLIB) $(CTOOLS) $(RANLIB) ../$(SCALAPACKLIB) complex16: zlapackaux integer $(ZTOOLS) $(ARCH) $(ARCHFLAGS) ../$(SCALAPACKLIB) $(ZTOOLS) $(RANLIB) ../$(SCALAPACKLIB) clean : rm -f *.o ( cd LAPACK; $(MAKE) clean ) .f.o : ; $(FC) -c $(FCFLAGS) $*.f .c.o : ; $(CC) -c $(CDEFS) $(CCFLAGS) $*.c scalapack-2.0.2/TOOLS/npreroc.f000644 000766 000024 00000004711 10363532303 016431 0ustar00juliestaff000000 000000 INTEGER FUNCTION NPREROC( N, NB, IPROC, ISRCPROC, NPROCS ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IPROC, ISRCPROC, N, NB, NPROCS * .. * * Purpose * ======= * * NPREROC computes the Number of PREceeding Rows Or Columns of a * distributed matrix that are possessed by processes closer to * ISRCPROC than IPROC. Therefore, if ISRCPROC=0 and IPROC=4, then * NPREROC returns the number of distributed matrix rows or columns * owned by processes 0, 1, 2, and 3. * * Arguments * ========= * * N (global input) INTEGER * The number of rows or columns in the distributed matrix. * * NB (global input) INTEGER * Block size, size of the blocks the distributed matrix is * split into. * * IPROC (local intput) INTEGER * The coordinate of the process whose preceeding distributed * matrix rows or columns are to be determined. * * ISRCPROC (global input) INTEGER * The coordinate of the process that possesses the first * row or column of the distributed matrix. * * NPROCS (global input) INTEGER * The total number processes over which the matrix is * distributed. * * ===================================================================== * * .. Local Scalars .. INTEGER EXTRABLKS, MYDIST, NBLOCKS * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Figure PROC's distance from source process * MYDIST = MOD( NPROCS+IPROC-ISRCPROC, NPROCS ) * * Figure the total number of whole NB blocks N is split up into * NBLOCKS = N / NB * * Figure the minimum number of rows/cols previous processes could have * NPREROC = (NBLOCKS/NPROCS) * NB * MYDIST * * See if there are any extra blocks * EXTRABLKS = MOD( NBLOCKS, NPROCS ) * * If I have an extra block, all processes in front of me got one too * IF( MYDIST.LE.EXTRABLKS ) THEN NPREROC = NPREROC + NB*MYDIST * * If I have don't have an extra block, add in extra blocks of * preceeding processes and the partial block, if it exists * ELSE NPREROC = NPREROC + EXTRABLKS*NB + MOD( N, NB ) END IF * RETURN * * End of NPREROC * END scalapack-2.0.2/TOOLS/numroc.f000644 000766 000024 00000004160 10363532303 016262 0ustar00juliestaff000000 000000 INTEGER FUNCTION NUMROC( N, NB, IPROC, ISRCPROC, NPROCS ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IPROC, ISRCPROC, N, NB, NPROCS * .. * * Purpose * ======= * * NUMROC computes the NUMber of Rows Or Columns of a distributed * matrix owned by the process indicated by IPROC. * * Arguments * ========= * * N (global input) INTEGER * The number of rows/columns in distributed matrix. * * NB (global input) INTEGER * Block size, size of the blocks the distributed matrix is * split into. * * IPROC (local input) INTEGER * The coordinate of the process whose local array row or * column is to be determined. * * ISRCPROC (global input) INTEGER * The coordinate of the process that possesses the first * row or column of the distributed matrix. * * NPROCS (global input) INTEGER * The total number processes over which the matrix is * distributed. * * ===================================================================== * * .. Local Scalars .. INTEGER EXTRABLKS, MYDIST, NBLOCKS * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Figure PROC's distance from source process * MYDIST = MOD( NPROCS+IPROC-ISRCPROC, NPROCS ) * * Figure the total number of whole NB blocks N is split up into * NBLOCKS = N / NB * * Figure the minimum number of rows/cols a process can have * NUMROC = (NBLOCKS/NPROCS) * NB * * See if there are any extra blocks * EXTRABLKS = MOD( NBLOCKS, NPROCS ) * * If I have an extra block * IF( MYDIST.LT.EXTRABLKS ) THEN NUMROC = NUMROC + NB * * If I have last block, it may be a partial block * ELSE IF( MYDIST.EQ.EXTRABLKS ) THEN NUMROC = NUMROC + MOD( N, NB ) END IF * RETURN * * End of NUMROC * END scalapack-2.0.2/TOOLS/pcchekpad.f000644 000766 000024 00000012527 10363532303 016707 0ustar00juliestaff000000 000000 SUBROUTINE PCCHEKPAD( ICTXT, MESS, M, N, A, LDA, IPRE, IPOST, $ CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N COMPLEX CHKVAL * .. * .. Array Arguments .. CHARACTER MESS*(*) COMPLEX A( * ) * .. * * Purpose * ======= * * PCCHEKPAD checks that the padding around a local array has not * been overwritten since the call to PCFILLPAD. 3 types of errors * are reported: * * 1) Overwrite in pre-guardzone. This indicates a memory overwrite has * occurred in the first IPRE elements which form a buffer before the * beginning of A. Therefore, the error message: * 'Overwrite in pre-guardzone: loc( 5) = 18.00000' * tells you that the 5th element of the IPRE long buffer has been * overwritten with the value 18, where it should still have the value * of CHKVAL. * * 2) Overwrite in post-guardzone. This indicates a memory overwrite has * occurred in the last IPOST elements which form a buffer after the end * of A. Error reports are refered from the end of A. Therefore, * 'Overwrite in post-guardzone: loc( 19) = 24.00000' * tells you that the 19th element after the end of A was overwritten * with the value 24, where it should still have the value of CHKVAL. * * 3) Overwrite in lda-m gap. Tells you elements between M and LDA were * overwritten. So, * 'Overwrite in lda-m gap: A( 12, 3) = 22.00000' * tells you that the element at the 12th row and 3rd column of A was * overwritten with the value of 22, where it should still have the * value of CHKVAL. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * MESS (local input) CHARACTER*(*) * String containing a user-defined message. * * M (local input) INTEGER * The number of rows in the local array A. * * N (input) INTEGER * The number of columns in the local array A. * * A (local input) COMPLEX array of dimension (LDA,N). * A location IPRE elements in front of the array to be checked. * * LDA (local input) INTEGER * The leading Dimension of the local array to be checked. * * IPRE (local input) INTEGER * The size of the guard zone before the start of padded array. * * IPOST (local input) INTEGER * The size of guard zone after the padded array. * * CHKVAL (local input) COMPLEX * The value the local array was padded with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, IAM, IDUMM, INFO, J, K, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC REAL, AIMAG * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW*NPCOL + MYCOL INFO = -1 * * Check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, ' pre', I, $ REAL( A( I ) ), AIMAG( A( I ) ) INFO = IAM END IF 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PCCHEKPAD' END IF * * Check buffer after A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, 'post', $ I-J+1, REAL( A( I ) ), $ AIMAG( A( I ) ) INFO = IAM END IF 20 CONTINUE ELSE WRITE( *, FMT = * ) $ 'WARNING no post-guardzone buffer in PCCHEKPAD' END IF * * Check all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9997 ) MYROW, MYCOL, MESS, $ I-IPRE-LDA*(J-1), J, REAL( A( I ) ), $ AIMAG( A( I ) ) INFO = IAM END IF 30 CONTINUE K = K + LDA 40 CONTINUE END IF * CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUMM, IDUMM, -1, $ 0, 0 ) IF( IAM.EQ.0 .AND. INFO.GE.0 ) THEN WRITE( *, FMT = 9999 ) INFO / NPCOL, MOD( INFO, NPCOL ), MESS END IF * 9999 FORMAT( '{', I5, ',', I5, '}: Memory overwrite in ', A ) 9998 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ A4, '-guardzone: loc(', I3, ') = ', G11.4, '+ i*', $ G11.4 ) 9997 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ 'lda-m gap: loc(', I3, ',', I3, ') = ', G11.4, $ '+ i*', G11.4 ) * RETURN * * End of PCCHEKPAD * END scalapack-2.0.2/TOOLS/pccol2row.f000644 000766 000024 00000021577 11654631032 016707 0ustar00juliestaff000000 000000 SUBROUTINE PCCOL2ROW( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, RSRC, $ CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. COMPLEX VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a column of processes, and distribute those rows over a row of * processes. This routine minimizes communication by sending all * information it has that a given process in the RDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process column consisting of P processes, and I want to send all of * that vector that I own to a new vector distributed over Q processes * within a process row, that after I find the process in RDEST that * owns the row of the vector I'm currently looking at, he will want * every ( (LCM(P,Q) / P ) block of my vector (the block being of size * NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block. * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a column. * * VS (local input) COMPLEX * Array of dimension (LDVS,N), the block of vectors stored on * process column CSRC to be put into memory VD, and stored * on process row RDEST. * * LDVS (local input) INTEGER * The leading dimension of VS, LDVS >= MAX( 1, MP ). * * VD (local output) COMPLEX * Array of dimension (LDVD,N), on output, the contents of VS * stored on process row RDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD, LDVD >= MAX( 1, MQ ). * * RSRC (global input) INTEGER * The process row the distributed block of vectors VS begins * on. * * CSRC (global input) INTEGER * The process column VS is distributed over. * * RDEST (global input) INTEGER * The process row to distribute VD over. * * CDEST (global input) INTEGER * The process column that VD begins on. * * WORK (local workspace) COMPLEX * Array of dimension (LDW), the required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mp = number of rows of VS on my process. * nprow = number of process rows * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mp/NB )/(LCM/nprow) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, ICDEST, II, IRSRC, ISTART, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGESD2D, CGERV2D, CLACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * ICPY = 0 * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there * is no copying required * IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYCOL.EQ.CSRC ) THEN * ISTART = 1 * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my first block * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * * Loop over all possible destination processes * DO 20 K = 1, CBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MP, NB*CBLKSKIP JB = MIN(NB, MP-II+1) CALL CLACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary (NOTE: will send extra if NB > JB) * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL CGESD2D( ICTXT, JJ, 1, WORK, JJ, RDEST, $ ICDEST ) * ELSE * * I am both source and destination, save where to start * copying from for later use. * ICPY = ISTART END IF * ISTART = ISTART + NB ICDEST = MOD(ICDEST+NPROW, NPCOL) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYROW.EQ.RDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) DO 50 K = 1, RBLKSKIP * * If I don't already possess the required data * IF( (MYCOL.NE.CSRC).OR.(MYROW.NE.IRSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MQ - ISTART + NB) / NB JJ = ((NBLOCKS+RBLKSKIP-1) / RBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL CGERV2D( ICTXT, JJ, N, WORK, JJ, IRSRC, CSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL CLACPY( 'G', JB, N, WORK(JJ), JB, $ VD(II,1), LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL CLACPY( 'G', JB, N, VS(JJ,1), LDVS, $ VD(II,1), LDVD ) JJ = JJ + NB*CBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB IRSRC = MOD( IRSRC+NPCOL, NPROW ) 50 CONTINUE END IF * * If NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYCOL.EQ.CSRC ) THEN * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my piece of the vector. * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN CALL CGESD2D( ICTXT, MP, N, VS, LDVS, RDEST, ICDEST ) ELSE CALL CLACPY( 'G', MP, N, VS, LDVS, VD, LDVD ) END IF END IF * IF( MYROW.EQ.RDEST ) THEN * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my piece of the vector. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) * IF( (MYROW.NE.IRSRC).OR.(MYCOL.NE.CSRC) ) $ CALL CGERV2D( ICTXT, MQ, N, VD, LDVD, IRSRC, CSRC ) * END IF * END IF * RETURN * * End of PCCOL2ROW * END scalapack-2.0.2/TOOLS/pcelget.f000644 000766 000024 00000016301 10363532303 016402 0ustar00juliestaff000000 000000 SUBROUTINE PCELGET( SCOPE, TOP, ALPHA, A, IA, JA, DESCA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 SCOPE, TOP INTEGER IA, JA COMPLEX ALPHA * .. * .. Array arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCELGET sets alpha to the distributed matrix entry A( IA, JA ). * The value of alpha is set according to the scope. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SCOPE (global input) CHARACTER*1 * The BLACS scope in which alpha is updated. * If SCOPE = 'R', alpha is updated only in the process row * containing A( IA, JA ), * If SCOPE = 'C', alpha is updated only in the process column * containing A( IA, JA ), * If SCOPE = 'A', alpha is updated in all the processes of the * grid, * otherwise alpha is updated only in the process containing * A( IA, JA ). * * TOP (global input) CHARACTER*1 * The topology to be used if broadcast is needed. * * ALPHA (global output) COMPLEX, the scalar alpha. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, IIA, IOFFA, JJA, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, INFOG2L * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * ALPHA = ZERO * IF( LSAME( SCOPE, 'R' ) ) THEN IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL CGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL CGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'C' ) ) THEN IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL CGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL CGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'A' ) ) THEN IF( ( MYROW.EQ.IAROW ).AND.( MYCOL.EQ.IACOL ) ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL CGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL CGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF ELSE IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ ALPHA = A( IIA+(JJA-1)*DESCA( LLD_ ) ) END IF * RETURN * * End of PCELGET * END scalapack-2.0.2/TOOLS/pcelset.f000644 000766 000024 00000011773 10363532303 016426 0ustar00juliestaff000000 000000 SUBROUTINE PCELSET( A, IA, JA, DESCA, ALPHA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA COMPLEX ALPHA * .. * .. Array arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCELSET sets the distributed matrix entry A( IA, JA ) to ALPHA. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ALPHA (local input) COMPLEX * The scalar alpha. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, JJA, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ A( IIA+(JJA-1)*DESCA( LLD_ ) ) = ALPHA * RETURN * * End of PCELSET * END scalapack-2.0.2/TOOLS/pcelset2.f000644 000766 000024 00000012453 10363532303 016504 0ustar00juliestaff000000 000000 SUBROUTINE PCELSET2( ALPHA, A, IA, JA, DESCA, BETA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA COMPLEX ALPHA, BETA * .. * .. Array arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCELSET2 sets alpha to the distributed matrix entry A(IA,JA) * and A(IA,JA) to beta. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * ALPHA (local output) COMPLEX * The scalar alpha. * * A (local input/local ouput) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,*) containing * the local pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * BETA (local input) COMPLEX * The scalar beta. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, IOFFA, JJA, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) ALPHA = A( IOFFA ) A( IOFFA ) = BETA ELSE ALPHA = ZERO END IF * RETURN * * End of PCELSET2 * END scalapack-2.0.2/TOOLS/pcfillpad.f000644 000766 000024 00000005562 10363532303 016724 0ustar00juliestaff000000 000000 SUBROUTINE PCFILLPAD( ICTXT, M, N, A, LDA, IPRE, IPOST, CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N COMPLEX CHKVAL * .. * .. Array Arguments .. COMPLEX A( * ) * .. * * Purpose * ======= * * PCFILLPAD surrounds a two dimensional local array with a guard- * zone initialized to the value CHKVAL. The user may later call the * routine PCCHEKPAD to discover if the guardzone has been * violated. There are three guardzones. The first is a buffer of size * IPRE that is before the start of the array. The second is the buffer * of size IPOST which is after the end of the array to be padded. * Finally, there is a guard zone inside every column of the array to * be padded, in the elements of A(M+1:LDA, J). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (local input) INTEGER * The number of rows in the local array. * * N (local input) INTEGER * The number of columns in the local array. * * A (local input/local output) COMPLEX, array of * dimension (LDA,N). A location IPRE elements in front of * the matrix to be padded. * * LDA (local input) INTEGER * The leading Dimension of the local array to be padded. * * IPRE (local input) INTEGER * The size of the guard zone to put before the start of * padded array. * * IPOST (local input) INTEGER * The size of the guard zone to put after padded array. * * CHKVAL (local input) COMPLEX * The value to pad matrix with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE A( I ) = CHKVAL 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PCFILLPAD' END IF * * Put check buffer in back of A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 A( I ) = CHKVAL 20 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no post-guardzone in PCFILLPAD' END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 A( I ) = CHKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * RETURN * * End of PCFILLPAD * END scalapack-2.0.2/TOOLS/pchkxmat.f000644 000766 000024 00000037217 10363532303 016607 0ustar00juliestaff000000 000000 SUBROUTINE PCHK1MAT( MA, MAPOS0, NA, NAPOS0, IA, JA, DESCA, $ DESCAPOS0, NEXTRA, EX, EXPOS, INFO ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER DESCAPOS0, IA, INFO, JA, MA, MAPOS0, NA, $ NAPOS0, NEXTRA * .. * .. Array Arguments .. INTEGER DESCA( * ), EX( NEXTRA ), EXPOS( NEXTRA ) * .. * * Purpose * ======= * * PCHK1MAT checks that the values associated with one distributed * matrix are consistant across the entire process grid. * * Notes * ===== * * This routine checks that all values are the same across the grid. * It does no local checking; it is therefore legal to abuse the * definitions of the non-descriptor arguments, i.e., if the routine * you are checking does not possess a MA value, you may pass some * other integer that must be global into this argument instead. * * Arguments * ========= * * MA (global input) INTEGER * The global number of matrix rows of A being operated on. * * MAPOS0 (global input) INTEGER * Where in the calling routine's parameter list MA appears. * * NA (global input) INTEGER * The global number of matrix columns of A being operated on. * * NAPOS0 (global input) INTEGER * Where in the calling routine's parameter list NA appears. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * DESCAPOS0 (global input) INTEGER * Where in the calling routine's parameter list DESCA * appears. Note that we assume IA and JA are respectively 2 * and 1 entries behind DESCA. * * NEXTRA (global input) INTEGER * The number of extra parameters (i.e., besides the ones * above) to check. NEXTRA <= LDW - 11. * * EX (local input) INTEGER array of dimension (NEXTRA) * The values of these extra parameters * * EXPOS (local input) INTEGER array of dimension (NEXTRA) * The parameter list positions of these extra values. * * INFO (local input/global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER BIGNUM, DESCMULT, LDW PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT, $ LDW = 25 ) * .. * .. Local Scalars .. INTEGER DESCPOS, K * .. * .. Local Arrays .. INTEGER IWORK( LDW, 2 ), IWORK2( LDW ) * .. * .. External Subroutines .. EXTERNAL GLOBCHK * .. * .. Executable Statements .. * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Pack values and their positions in the parameter list, factoring * in the descriptor multiplier * IWORK( 1, 1 ) = MA IWORK( 1, 2 ) = MAPOS0 * DESCMULT IWORK( 2, 1 ) = NA IWORK( 2, 2 ) = NAPOS0 * DESCMULT IWORK( 3, 1 ) = IA IWORK( 3, 2 ) = (DESCAPOS0-2) * DESCMULT IWORK( 4, 1 ) = JA IWORK( 4, 2 ) = (DESCAPOS0-1) * DESCMULT DESCPOS = DESCAPOS0 * DESCMULT * IWORK( 5, 1 ) = DESCA( DTYPE_ ) IWORK( 5, 2 ) = DESCPOS + DTYPE_ IWORK( 6, 1 ) = DESCA( M_ ) IWORK( 6, 2 ) = DESCPOS + M_ IWORK( 7, 1 ) = DESCA( N_ ) IWORK( 7, 2 ) = DESCPOS + N_ IWORK( 8, 1 ) = DESCA( MB_ ) IWORK( 8, 2 ) = DESCPOS + MB_ IWORK( 9, 1 ) = DESCA( NB_ ) IWORK( 9, 2 ) = DESCPOS + NB_ IWORK( 10, 1 ) = DESCA( RSRC_ ) IWORK( 10, 2 ) = DESCPOS + RSRC_ IWORK( 11, 1 ) = DESCA( CSRC_ ) IWORK( 11, 2 ) = DESCPOS + CSRC_ * IF( NEXTRA.GT.0 ) THEN DO 10 K = 1, NEXTRA IWORK( 11+K, 1 ) = EX( K ) IWORK( 11+K, 2 ) = EXPOS( K ) 10 CONTINUE END IF K = 11 + NEXTRA * * Get the smallest error detected anywhere (BIGNUM if no error) * CALL GLOBCHK( DESCA( CTXT_ ), K, IWORK, LDW, IWORK2, INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT if * error is not in a descriptor entry * IF( INFO .EQ. BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * RETURN * * End of PCHK1MAT * END * SUBROUTINE PCHK2MAT( MA, MAPOS0, NA, NAPOS0, IA, JA, DESCA, $ DESCAPOS0, MB, MBPOS0, NB, NBPOS0, IB, JB, $ DESCB, DESCBPOS0, NEXTRA, EX, EXPOS, INFO ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER DESCAPOS0, DESCBPOS0, IA, IB, INFO, JA, JB, MA, $ MAPOS0, MB, MBPOS0, NA, NAPOS0, NB, NBPOS0, $ NEXTRA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( 8 ), EX( NEXTRA ), $ EXPOS( NEXTRA ) * .. * * Purpose * ======= * * PCHK2MAT checks that the values associated with two distributed * matrices are consistant across the entire process grid. * * Notes * ===== * * This routine checks that all values are the same across the grid. * It does no local checking; it is therefore legal to abuse the * definitions of the non-descriptor arguments, i.e., if the routine * you are checking does not possess a MA value, you may pass some * other integer that must be global into this argument instead. * * Arguments * ========= * * MA (global input) INTEGER * The global number of matrix rows of A being operated on. * * MAPOS0 (global input) INTEGER * Where in the calling routine's parameter list MA appears. * * NA (global input) INTEGER * The global number of matrix columns of A being operated on. * * NAPOS0 (global input) INTEGER * Where in the calling routine's parameter list NA appears. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * DESCAPOS0 (global input) INTEGER * Where in the calling routine's parameter list DESCA * appears. Note that we assume IA and JA are respectively 2 * and 1 entries behind DESCA. * * MB (global input) INTEGER * The global number of matrix rows of B being operated on. * * MBPOS0 (global input) INTEGER * Where in the calling routine's parameter list MB appears. * * NB (global input) INTEGER * The global number of matrix columns of B being operated on. * * NBPOS0 (global input) INTEGER * Where in the calling routine's parameter list NB appears. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * DESCBPOS0 (global input) INTEGER * Where in the calling routine's parameter list DESCB * appears. Note that we assume IB and JB are respectively 2 * and 1 entries behind DESCB. * * NEXTRA (global input) INTEGER * The number of extra parameters (i.e., besides the ones * above) to check. NEXTRA <= LDW - 22. * * EX (local input) INTEGER array of dimension (NEXTRA) * The values of these extra parameters * * EXPOS (local input) INTEGER array of dimension (NEXTRA) * The parameter list positions of these extra values. * * INFO (local input/global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DESCMULT, BIGNUM, LDW PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT, $ LDW = 35 ) * .. * .. Local Scalars .. INTEGER K, DESCPOS * .. * .. Local Arrays .. INTEGER IWORK( LDW, 2 ), IWORK2( LDW ) * .. * .. External Subroutines .. EXTERNAL GLOBCHK * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Pack values and their positions in the parameter list, factoring * in the descriptor multiplier * IWORK( 1, 1 ) = MA IWORK( 1, 2 ) = MAPOS0 * DESCMULT IWORK( 2, 1 ) = NA IWORK( 2, 2 ) = NAPOS0 * DESCMULT IWORK( 3, 1 ) = IA IWORK( 3, 2 ) = (DESCAPOS0-2) * DESCMULT IWORK( 4, 1 ) = JA IWORK( 4, 2 ) = (DESCAPOS0-1) * DESCMULT DESCPOS = DESCAPOS0 * DESCMULT * IWORK( 5, 1 ) = DESCA( DTYPE_ ) IWORK( 5, 2 ) = DESCPOS + DTYPE_ IWORK( 6, 1 ) = DESCA( M_ ) IWORK( 6, 2 ) = DESCPOS + M_ IWORK( 7, 1 ) = DESCA( N_ ) IWORK( 7, 2 ) = DESCPOS + N_ IWORK( 8, 1 ) = DESCA( MB_ ) IWORK( 8, 2 ) = DESCPOS + MB_ IWORK( 9, 1 ) = DESCA( NB_ ) IWORK( 9, 2 ) = DESCPOS + NB_ IWORK( 10, 1 ) = DESCA( RSRC_ ) IWORK( 10, 2 ) = DESCPOS + RSRC_ IWORK( 11, 1 ) = DESCA( CSRC_ ) IWORK( 11, 2 ) = DESCPOS + CSRC_ * IWORK( 12, 1 ) = MB IWORK( 12, 2 ) = MBPOS0 * DESCMULT IWORK( 13, 1 ) = NB IWORK( 13, 2 ) = NBPOS0 * DESCMULT IWORK( 14, 1 ) = IB IWORK( 14, 2 ) = (DESCBPOS0-2) * DESCMULT IWORK( 15, 1 ) = JB IWORK( 15, 2 ) = (DESCBPOS0-1) * DESCMULT DESCPOS = DESCBPOS0 * DESCMULT * IWORK( 16, 1 ) = DESCB( DTYPE_ ) IWORK( 16, 2 ) = DESCPOS + DTYPE_ IWORK( 17, 1 ) = DESCB( M_ ) IWORK( 17, 2 ) = DESCPOS + M_ IWORK( 18, 1 ) = DESCB( N_ ) IWORK( 18, 2 ) = DESCPOS + N_ IWORK( 19, 1 ) = DESCB( MB_ ) IWORK( 19, 2 ) = DESCPOS + MB_ IWORK( 20, 1 ) = DESCB( NB_ ) IWORK( 20, 2 ) = DESCPOS + NB_ IWORK( 21, 1 ) = DESCB( RSRC_ ) IWORK( 21, 2 ) = DESCPOS + RSRC_ IWORK( 22, 1 ) = DESCB( CSRC_ ) IWORK( 22, 2 ) = DESCPOS + CSRC_ * IF( NEXTRA.GT.0 ) THEN DO 10 K = 1, NEXTRA IWORK( 22+K, 1 ) = EX( K ) IWORK( 22+K, 2 ) = EXPOS( K ) 10 CONTINUE END IF K = 22 + NEXTRA * * Get the smallest error detected anywhere (BIGNUM if no error) * CALL GLOBCHK( DESCA( CTXT_ ), K, IWORK, LDW, IWORK2, INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * RETURN * * End of PCHK2MAT * END * SUBROUTINE GLOBCHK( ICTXT, N, X, LDX, IWORK, INFO ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, INFO, LDX, N * .. * .. Array Arguments .. INTEGER IWORK( N ), X( LDX, 2 ) * .. * * Purpose * ======= * * GLOBCHK checks that values in X(i,1) are the same on all processes * in the process grid indicated by ICTXT. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle indicating the context over which * the values are to be the same. * * N (global input) INTEGER * The number of values to be compared. * * X (local input) INTEGER array, dimension (N,2) * The 1st column contains the values which should be the same * on all processes. The 2nd column indicates where in the * calling routine's parameter list the corresponding value * from column 1 came from. * * LDX (local input) INTEGER * The leading dimension of the array X. LDX >= MAX(1,N). * * IWORK (local workspace) INTEGER array, dimension (N) * Used to receive other processes' values for comparing with X. * * INFO (local input/global output) INTEGER * On entry, the smallest error flag so far generated, or BIGNUM * for no error. On exit: * = BIGNUM : no error * < 0: if INFO = -i*100, the i-th argument had an illegal * value, or was different between processes. * * ===================================================================== * * .. Local Scalars .. INTEGER K, MYROW, MYCOL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMN2D, IGEBR2D, IGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, IWORK, K, MYROW, MYCOL ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'All', ' ', N, 1, X, N ) ELSE CALL IGEBR2D( ICTXT, 'All', ' ', N, 1, IWORK, N, 0, 0 ) DO 10 K = 1, N IF( X( K, 1 ).NE.IWORK( K ) ) $ INFO = MIN( INFO, X( K, 2 ) ) 10 CONTINUE END IF * CALL IGAMN2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, K, K, -1, -1, 0 ) * RETURN * * End GLOBCHK * END scalapack-2.0.2/TOOLS/pclaprnt.f000644 000766 000024 00000030777 10363532303 016617 0ustar00juliestaff000000 000000 SUBROUTINE PCLAPRNT( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, WORK ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCLAPRNT prints to the standard output a distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and * printed by the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX pointer into the local memory to a * local array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IRPRNT (global input) INTEGER * The row index of the printing process. * * ICPRNT (global input) INTEGER * The column index of the printing process. * * CMATNM (global input) CHARACTER*(*) * Identifier of the distributed matrix to be printed. * * NOUT (global input) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. * * WORK (local workspace) COMPLEX * Working array of minimum size equal to MB_A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L, $ CGERV2D, CGESD2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC AIMAG, MIN, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ICURROW = IAROW ICURCOL = IACOL II = IIA JJ = JJA LDA = DESCA( LLD_ ) * * Handle the first block of column separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 DO 60 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, JA+H, $ REAL( A(II+K+(JJ+H-1)*LDA) ), $ AIMAG( A(II+K+(JJ+H-1)*LDA) ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, JA+H, REAL( WORK( K ) ), $ AIMAG( WORK( K ) ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, JA+H, $ REAL( A( II+K+(JJ+H-1)*LDA ) ), $ AIMAG( A( II+K+(JJ+H-1)*LDA ) ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, JA+H, REAL( WORK( K ) ), $ AIMAG( WORK( K ) ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, J+H, $ REAL( A( II+K+(JJ+H-1)*LDA ) ), $ AIMAG( A( II+K+(JJ+H-1)*LDA ) ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, J+H, REAL( WORK( K ) ), $ AIMAG( WORK( K ) ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, J+H, $ REAL( A( II+K+(JJ+H-1)*LDA ) ), $ AIMAG( A( II+K+(JJ+H-1)*LDA ) ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, J+H, REAL( WORK( K ) ), $ AIMAG( WORK( K ) ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * 9999 FORMAT(A,'(',I6,',',I6,')=',E16.8, '+i*(',E16.8, ')') * RETURN * * End of PCLAPRNT * END scalapack-2.0.2/TOOLS/pclaread.f000644 000766 000024 00000011142 10604624224 016532 0ustar00juliestaff000000 000000 SUBROUTINE PCLAREAD( FILNAM, A, DESCA, IRREAD, ICREAD, WORK ) * * -- ScaLAPACK tools routine (version 1.8) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * adapted by Julie Langou, April 2007 (julie@cs.utk.edu) * * .. Scalar Arguments .. INTEGER ICREAD, IRREAD * .. * .. Array Arguments .. CHARACTER*(*) FILNAM INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCLAREAD reads from a file named FILNAM a matrix and distribute * it to the process grid. * * Only the process of coordinates {IRREAD, ICREAD} read the file. * * WORK must be of size >= MB_ = DESCA( MB_ ). * * ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IB, ICTXT, ICURCOL, ICURROW, II, J, JB, $ JJ, K, LDA, M, MYCOL, MYROW, N, NPCOL, NPROW REAL REAL_PART, IMAG_PART * .. * .. Local Arrays .. INTEGER IWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, CGERV2D, CGESD2D, $ IGEBS2D, IGEBR2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN OPEN( NIN, FILE=FILNAM, STATUS='OLD' ) READ( NIN, FMT = * ) ( IWORK( I ), I = 1, 2 ) CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, IWORK, 2 ) ELSE CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, IWORK, 2, IRREAD, $ ICREAD ) END IF M = IWORK( 1 ) N = IWORK( 2 ) * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * IF( M.GT.DESCA( M_ ).OR. N.GT.DESCA( N_ ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( *, FMT = * ) 'PCLAREAD: Matrix too big to fit in' WRITE( *, FMT = * ) 'Abort ...' END IF CALL BLACS_ABORT( ICTXT, 0 ) END IF * II = 1 JJ = 1 ICURROW = DESCA( RSRC_ ) ICURCOL = DESCA( CSRC_ ) LDA = DESCA( LLD_ ) * * Loop over column blocks * DO 50 J = 1, N, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), N-J+1 ) DO 40 H = 0, JB-1 * * Loop over block of rows * DO 30 I = 1, M, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), M-I+1 ) IF( ICURROW.EQ.IRREAD .AND. ICURCOL.EQ.ICREAD ) THEN IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN DO 10 K = 0, IB-1 READ( NIN , FMT = *) REAL_PART, IMAG_PART A( II+K+(JJ+H-1)*LDA ) = CMPLX(REAL_PART, IMAG_PART) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGERV2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRREAD, ICREAD ) ELSE IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN DO 20 K = 1, IB READ( NIN, FMT = * ) REAL_PART, IMAG_PART WORK(K)=CMPLX(REAL_PART,IMAG_PART) 20 CONTINUE CALL CGESD2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) 30 CONTINUE * II = 1 ICURROW = DESCA( RSRC_ ) 40 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) * 50 CONTINUE * IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN CLOSE( NIN ) END IF * RETURN * * End of PCLAREAD * END scalapack-2.0.2/TOOLS/pclawrite.f000644 000766 000024 00000020400 10604624224 016746 0ustar00juliestaff000000 000000 SUBROUTINE PCLAWRITE( FILNAM, M, N, A, IA, JA, DESCA, IRWRIT, $ ICWRIT, WORK ) * * -- ScaLAPACK tools routine (version 1.8) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * adapted by Julie Langou, April 2007 (julie@cs.utk.edu) * * .. Scalar Arguments .. INTEGER IA, ICWRIT, IRWRIT, JA, M, N * .. * .. Array Arguments .. CHARACTER*(*) FILNAM INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCLAWRITE writes to a file named FILNAMa distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent to and * written by the process of coordinates (IRWWRITE, ICWRIT). * * WORK must be of size >= MB_ = DESCA( MB_ ). * * ===================================================================== * * .. Parameters .. INTEGER NOUT PARAMETER ( NOUT = 13 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L, $ CGERV2D, CGESD2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC AIMAG, REAL, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN OPEN( NOUT, FILE=FILNAM, STATUS='UNKNOWN' ) WRITE( NOUT, FMT = * ) M, N END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ICURROW = IAROW ICURCOL = IACOL II = IIA JJ = JJA LDA = DESCA( LLD_ ) * * Handle the first block of column separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 DO 60 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) REAL(WORK( K )), $ AIMAG(WORK( K )) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ REAL (A( II+K+(JJ+H-1)*LDA )), $ AIMAG (A( II+K+(JJ+H-1)*LDA )) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) REAL (WORK( K )), $ AIMAG (WORK( K )) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ REAL (A( II+K+(JJ+H-1)*LDA )), $ AIMAG (A( II+K+(JJ+H-1)*LDA )) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) REAL (WORK( K )), $ AIMAG (WORK( K)) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ REAL (A( II+K+(JJ+H-1)*LDA )), $ AIMAG (A( II+K+(JJ+H-1)*LDA )) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) REAL (WORK( K )), $ AIMAG (WORK( K )) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CLOSE( NOUT ) END IF * 9999 FORMAT( D30.18,D30.18 ) * RETURN * * End of PCLAWRITE * END scalapack-2.0.2/TOOLS/pcmatadd.f000644 000766 000024 00000027040 10363532303 016536 0ustar00juliestaff000000 000000 SUBROUTINE PCMATADD( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, $ JC, DESCC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IC, JA, JC, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ) * .. * * Purpose * ======= * * PCMATADD performs a distributed matrix-matrix addition * * sub( C ) := alpha * sub( A ) + beta * sub( C ), * * where sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1) and sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1). No communications are performed in this * routine, the arrays are supposed to be aligned. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrices sub( A ) and sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrices sub( A ) and * sub( C ). N >= 0. * * ALPHA (global input) COMPLEX * The scalar ALPHA. * * A (local input) COMPLEX pointer into the local memory * to a local array of dimension (LLD_A, LOCc(JA+N-1) ). This * array contains the local pieces of the distributed matrix * sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * BETA (global input) COMPLEX * The scalar BETA. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * This array contains the local pieces of the distributed * matrix sub( C ). On exit, this array contains the local * pieces of the resulting distributed matrix. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICCOL, ICOFF, ICROW, IIA, $ IIC, IOFFA, IOFFC, IROFF, J, JJA, JJC, LDA, $ LDC, MP, MYCOL, MYROW, NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA(CTXT_), NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible. * IF( (M.EQ.0).OR.(N.EQ.0).OR. $ ((ALPHA.EQ.ZERO).AND.(BETA.EQ.ONE)) ) $ RETURN * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) * IROFF = MOD( IA-1, DESCA(MB_) ) ICOFF = MOD( JA-1, DESCA(NB_) ) MP = NUMROC( M+IROFF, DESCA(MB_), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA(NB_), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF LDA = DESCA(LLD_) LDC = DESCC(LLD_) * IF( NQ.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN IOFFC = IIC + (JJC-1)*LDC DO 10 I = IOFFC, IOFFC+MP-1 C( I ) = ZERO 10 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 20 I = IOFFC, IOFFC+MP-1 C( I ) = ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 20 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 30 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 30 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 40 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 40 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 50 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 50 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 60 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 60 CONTINUE END IF END IF ELSE IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN IOFFC = IIC+(JJC-1)*LDC DO 80 J = 1, NQ DO 70 I = IOFFC, IOFFC+MP-1 C( I ) = ZERO 70 CONTINUE IOFFC = IOFFC + LDC 80 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 100 J = 1, NQ DO 90 I = IOFFC, IOFFC+MP-1 C( I ) = ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 90 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 100 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 120 J = 1, NQ DO 110 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 110 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 120 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 140 J = 1, NQ DO 130 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 130 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 140 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 160 J = 1, NQ DO 150 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 150 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 160 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 180 J = 1, NQ DO 170 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 170 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 180 CONTINUE END IF END IF END IF * RETURN * * End of PCMATADD * END scalapack-2.0.2/TOOLS/pcrow2col.f000644 000766 000024 00000021513 11654631032 016675 0ustar00juliestaff000000 000000 SUBROUTINE PCROW2COL( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, $ RSRC, CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. COMPLEX VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a row of processes, and distribute those rows over a column of * processes. This routine minimizes communication by sending all * information it has that a given process in the CDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process row consisting of Q processes, and I want to send all of that * vector that I own to a new vector distributed over P processes within * a process column, that after I find the process in RDEST that owns * the row of the vector I'm currently looking at, he will want every * ( (LCM(P,Q)/Q ) block of my vector (the block being of size NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a row. * * VS (local input) COMPLEX * Array of dimension (LDVS,N), the block of vectors stored on * process row RSRC to be put into memory VD, and stored on * process column CDEST. * * LDVS (local input) INTEGER * The leading dimension of VS. * * VD (local output) COMPLEX * Array of dimension (LDVD,N), on output, the contents of VD * stored on process column CDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD. * * RSRC (global input) INTEGER * The process row VS is distributed over. * * CSRC (global input) INTEGER * The process column the distributed block of vectors VS * begins on. * * RDEST (global input) INTEGER * The process row that VD begins on. * * CDEST (global input) INTEGER * The process column to distribute VD over. * * WORK (local workspace) COMPLEX * Array, dimension (LDW). The required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mq = number of rows of VS on my process. * npcol = number of process columns * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mq/NB )/(LCM/npcol) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, II, ISTART, ICSRC, IRDEST, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGESD2D, CGERV2D, CLACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * ICPY = 0 * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there is no * copying required * IF( NPROW .NE. NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYROW.EQ.RSRC ) THEN * ISTART = 1 * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my first block * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) * * Loop over all possible destination processes * DO 20 K = 1, RBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL CLACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary, NOTE: will send extra if NB > JB * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL CGESD2D( ICTXT, JJ, 1, WORK, JJ, IRDEST, $ CDEST ) * * I am both source and destination, save where to start * copying from for later use * ELSE ICPY = ISTART END IF * ISTART = ISTART + NB IRDEST = MOD( IRDEST+NPCOL, NPROW ) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYCOL.EQ.CDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) * * Loop over all sending processes * DO 50 K = 1, CBLKSKIP * * If I don't already possess the required data * IF( (MYROW.NE.RSRC).OR.(MYCOL.NE.ICSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MP - ISTART + NB) / NB JJ = ((NBLOCKS+CBLKSKIP-1) / CBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL CGERV2D( ICTXT, JJ, N, WORK, JJ, RSRC, ICSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL CLACPY( 'G', JB, N, WORK(JJ), JB, VD(II,1), $ LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL CLACPY( 'G', JB, N, VS(JJ,1), LDVS, VD(II,1), $ LDVD ) JJ = JJ + NB*RBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB ICSRC = MOD( ICSRC+NPROW, NPCOL ) 50 CONTINUE END IF * * if NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYROW.EQ.RSRC ) THEN * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my piece of the vector * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN CALL CGESD2D( ICTXT, MQ, N, VS, LDVS, IRDEST, CDEST ) ELSE CALL CLACPY( 'G', MQ, N, VS, LDVS, VD, LDVD ) END IF END IF IF( MYCOL.EQ.CDEST ) THEN * * Figure my distance from RDEST: the process in RSRC the same * distance from CSRC will have my piece of the vector * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) IF( (MYCOL.NE.ICSRC).OR.(MYROW.NE. RSRC) ) $ CALL CGERV2D( ICTXT, MP, N, VD, LDVD, RSRC, ICSRC ) END IF END IF * RETURN * * End of PCROW2COL * END scalapack-2.0.2/TOOLS/pctreecomb.f000644 000766 000024 00000015770 11654631032 017116 0ustar00juliestaff000000 000000 SUBROUTINE PCTREECOMB( ICTXT, SCOPE, N, MINE, RDEST0, CDEST0, $ SUBPTR ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SCOPE INTEGER CDEST0, ICTXT, N, RDEST0 * .. * .. Array Arguments .. COMPLEX MINE( * ) * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PCTREECOMB does a 1-tree parallel combine operation on scalars, * using the subroutine indicated by SUBPTR to perform the required * computation. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * SCOPE (global input) CHARACTER * The scope of the operation: 'Rowwise', 'Columnwise', or * 'All'. * * N (global input) INTEGER * The number of elements in MINE. N = 1 for the norm-2 * computation and 2 for the sum of square. * * MINE (local input/global output) COMPLEX array of * dimension at least equal to N. The local data to use in the * combine. * * RDEST0 (global input) INTEGER * The process row to receive the answer. If RDEST0 = -1, * every process in the scope gets the answer. * * CDEST0 (global input) INTEGER * The process column to receive the answer. If CDEST0 = -1, * every process in the scope gets the answer. * * SUBPTR (local input) Pointer to the subroutine to call to perform * the required combine. * * ===================================================================== * * .. Local Scalars .. LOGICAL BCAST, RSCOPE, CSCOPE INTEGER CMSSG, DEST, DIST, HISDIST, I, IAM, MYCOL, $ MYROW, MYDIST, MYDIST2, NP, NPCOL, NPROW, $ RMSSG, TCDEST, TRDEST * .. * .. Local Arrays .. COMPLEX HIS( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, $ CGERV2D, CGESD2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * DEST = 0 * * See if everyone wants the answer (need to broadcast the answer) * BCAST = ( ( RDEST0.EQ.-1 ).OR.( CDEST0.EQ.-1 ) ) IF( BCAST ) THEN TRDEST = 0 TCDEST = 0 ELSE TRDEST = RDEST0 TCDEST = CDEST0 END IF * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure scope-dependant variables, or report illegal scope * RSCOPE = LSAME( SCOPE, 'R' ) CSCOPE = LSAME( SCOPE, 'C' ) * IF( RSCOPE ) THEN IF( BCAST ) THEN TRDEST = MYROW ELSE IF( MYROW.NE.TRDEST ) THEN RETURN END IF NP = NPCOL MYDIST = MOD( NPCOL + MYCOL - TCDEST, NPCOL ) ELSE IF( CSCOPE ) THEN IF( BCAST ) THEN TCDEST = MYCOL ELSE IF( MYCOL.NE.TCDEST ) THEN RETURN END IF NP = NPROW MYDIST = MOD( NPROW + MYROW - TRDEST, NPROW ) ELSE IF( LSAME( SCOPE, 'A' ) ) THEN NP = NPROW * NPCOL IAM = MYROW*NPCOL + MYCOL DEST = TRDEST*NPCOL + TCDEST MYDIST = MOD( NP + IAM - DEST, NP ) ELSE RETURN END IF * IF( NP.LT.2 ) $ RETURN * MYDIST2 = MYDIST RMSSG = MYROW CMSSG = MYCOL I = 1 * 10 CONTINUE * IF( MOD( MYDIST, 2 ).NE.0 ) THEN * * If I am process that sends information * DIST = I * ( MYDIST - MOD( MYDIST, 2 ) ) * * Figure coordinates of dest of message * IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) END IF * CALL CGESD2D( ICTXT, N, 1, MINE, N, RMSSG, CMSSG ) * GO TO 20 * ELSE * * If I am a process receiving information, figure coordinates * of source of message * DIST = MYDIST2 + I IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) HISDIST = MOD( NP + CMSSG - TCDEST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) HISDIST = MOD( NP + RMSSG - TRDEST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) HISDIST = MOD( NP + RMSSG*NPCOL+CMSSG - DEST, NP ) END IF * IF( MYDIST2.LT.HISDIST ) THEN * * If I have anyone sending to me * CALL CGERV2D( ICTXT, N, 1, HIS, N, RMSSG, CMSSG ) CALL SUBPTR( MINE, HIS ) * END IF MYDIST = MYDIST / 2 * END IF I = I * 2 * IF( I.LT.NP ) $ GO TO 10 * 20 CONTINUE * IF( BCAST ) THEN IF( MYDIST2.EQ.0 ) THEN CALL CGEBS2D( ICTXT, SCOPE, ' ', N, 1, MINE, N ) ELSE CALL CGEBR2D( ICTXT, SCOPE, ' ', N, 1, MINE, N, $ TRDEST, TCDEST ) END IF END IF * RETURN * * End of PCTREECOMB * END * SUBROUTINE CCOMBAMAX( V1, V2 ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. COMPLEX V1( 2 ), V2( 2 ) * .. * * Purpose * ======= * * CCOMBAMAX finds the element having max. absolute value as well * as its corresponding globl index. * * Arguments * ========= * * V1 (local input/local output) COMPLEX array of * dimension 2. The first maximum absolute value element and * its global index. V1(1) = AMAX, V1(2) = INDX. * * V2 (local input) COMPLEX array of dimension 2. * The second maximum absolute value element and its global * index. V2(1) = AMAX, V2(2) = INDX. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, REAL, AIMAG * .. * .. Statement Functions .. COMPLEX ZDUM REAL CABS1 CABS1(ZDUM) = ABS(REAL(ZDUM)) + ABS(AIMAG(ZDUM)) * .. * .. Executable Statements .. * IF( CABS1( V1( 1 ) ).LT.CABS1( V2( 1 ) ) ) THEN V1( 1 ) = V2( 1 ) V1( 2 ) = V2( 2 ) END IF * RETURN * * End of CCOMBAMAX * END scalapack-2.0.2/TOOLS/pdchekpad.f000644 000766 000024 00000012133 10363532303 016701 0ustar00juliestaff000000 000000 SUBROUTINE PDCHEKPAD( ICTXT, MESS, M, N, A, LDA, IPRE, IPOST, $ CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N DOUBLE PRECISION CHKVAL * .. * .. Array Arguments .. CHARACTER MESS*(*) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDCHEKPAD checks that the padding around a local array has not * been overwritten since the call to PDFILLPAD. 3 types of errors * are reported: * * 1) Overwrite in pre-guardzone. This indicates a memory overwrite has * occurred in the first IPRE elements which form a buffer before the * beginning of A. Therefore, the error message: * 'Overwrite in pre-guardzone: loc( 5) = 18.00000' * tells you that the 5th element of the IPRE long buffer has been * overwritten with the value 18, where it should still have the value * of CHKVAL. * * 2) Overwrite in post-guardzone. This indicates a memory overwrite has * occurred in the last IPOST elements which form a buffer after the end * of A. Error reports are refered from the end of A. Therefore, * 'Overwrite in post-guardzone: loc( 19) = 24.00000' * tells you that the 19th element after the end of A was overwritten * with the value 24, where it should still have the value of CHKVAL. * * 3) Overwrite in lda-m gap. Tells you elements between M and LDA were * overwritten. So, * 'Overwrite in lda-m gap: A( 12, 3) = 22.00000' * tells you that the element at the 12th row and 3rd column of A was * overwritten with the value of 22, where it should still have the * value of CHKVAL. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * MESS (local input) CHARACTER*(*) * String containing a user-defined message. * * M (local input) INTEGER * The number of rows in the local array A. * * N (input) INTEGER * The number of columns in the local array A. * * A (local input) DOUBLE PRECISION array of dimension (LDA,N). * A location IPRE elements in front of the array to be checked. * * LDA (local input) INTEGER * The leading Dimension of the local array to be checked. * * IPRE (local input) INTEGER * The size of the guard zone before the start of padded array. * * IPOST (local input) INTEGER * The size of guard zone after the padded array. * * CHKVAL (local input) DOUBLE PRECISION * The value the local array was padded with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, IAM, IDUMM, INFO, J, K, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW*NPCOL + MYCOL INFO = -1 * * Check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, ' pre', I, $ A( I ) INFO = IAM END IF 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PDCHEKPAD' END IF * * Check buffer after A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, 'post', $ I-J+1, A( I ) INFO = IAM END IF 20 CONTINUE ELSE WRITE( *, FMT = * ) $ 'WARNING no post-guardzone buffer in PDCHEKPAD' END IF * * Check all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9997 ) MYROW, MYCOL, MESS, $ I-IPRE-LDA*(J-1), J, A( I ) INFO = IAM END IF 30 CONTINUE K = K + LDA 40 CONTINUE END IF * CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUMM, IDUMM, -1, $ 0, 0 ) IF( IAM.EQ.0 .AND. INFO.GE.0 ) THEN WRITE( *, FMT = 9999 ) INFO / NPCOL, MOD( INFO, NPCOL ), MESS END IF * 9999 FORMAT( '{', I5, ',', I5, '}: Memory overwrite in ', A ) 9998 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ A4, '-guardzone: loc(', I3, ') = ', G20.7 ) 9997 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ 'lda-m gap: loc(', I3, ',', I3, ') = ', G20.7 ) * RETURN * * End of PDCHEKPAD * END scalapack-2.0.2/TOOLS/pdcol2row.f000644 000766 000024 00000021701 11654631032 016675 0ustar00juliestaff000000 000000 SUBROUTINE PDCOL2ROW( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, RSRC, $ CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. DOUBLE PRECISION VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a column of processes, and distribute those rows over a row of * processes. This routine minimizes communication by sending all * information it has that a given process in the RDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process column consisting of P processes, and I want to send all of * that vector that I own to a new vector distributed over Q processes * within a process row, that after I find the process in RDEST that * owns the row of the vector I'm currently looking at, he will want * every ( (LCM(P,Q) / P ) block of my vector (the block being of size * NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block. * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a column. * * VS (local input) DOUBLE PRECISION * Array of dimension (LDVS,N), the block of vectors stored on * process column CSRC to be put into memory VD, and stored * on process row RDEST. * * LDVS (local input) INTEGER * The leading dimension of VS, LDVS >= MAX( 1, MP ). * * VD (local output) DOUBLE PRECISION * Array of dimension (LDVD,N), on output, the contents of VS * stored on process row RDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD, LDVD >= MAX( 1, MQ ). * * RSRC (global input) INTEGER * The process row the distributed block of vectors VS begins * on. * * CSRC (global input) INTEGER * The process column VS is distributed over. * * RDEST (global input) INTEGER * The process row to distribute VD over. * * CDEST (global input) INTEGER * The process column that VD begins on. * * WORK (local workspace) DOUBLE PRECISION * Array of dimension (LDW), the required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mp = number of rows of VS on my process. * nprow = number of process rows * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mp/NB )/(LCM/nprow) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, ICDEST, II, IRSRC, ISTART, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGESD2D, DGERV2D, DLACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * * .. Initialize Variables .. * ICPY = 0 * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there * is no copying required * IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYCOL.EQ.CSRC ) THEN * ISTART = 1 * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my first block * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * * Loop over all possible destination processes * DO 20 K = 1, CBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MP, NB*CBLKSKIP JB = MIN(NB, MP-II+1) CALL DLACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary (NOTE: will send extra if NB > JB) * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL DGESD2D( ICTXT, JJ, 1, WORK, JJ, RDEST, $ ICDEST ) * ELSE * * I am both source and destination, save where to start * copying from for later use. * ICPY = ISTART END IF * ISTART = ISTART + NB ICDEST = MOD(ICDEST+NPROW, NPCOL) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYROW.EQ.RDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) DO 50 K = 1, RBLKSKIP * * If I don't already possess the required data * IF( (MYCOL.NE.CSRC).OR.(MYROW.NE.IRSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MQ - ISTART + NB) / NB JJ = ((NBLOCKS+RBLKSKIP-1) / RBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL DGERV2D( ICTXT, JJ, N, WORK, JJ, IRSRC, CSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL DLACPY( 'G', JB, N, WORK(JJ), JB, $ VD(II,1), LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL DLACPY( 'G', JB, N, VS(JJ,1), LDVS, $ VD(II,1), LDVD ) JJ = JJ + NB*CBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB IRSRC = MOD( IRSRC+NPCOL, NPROW ) 50 CONTINUE END IF * * If NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYCOL.EQ.CSRC ) THEN * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my piece of the vector. * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN CALL DGESD2D( ICTXT, MP, N, VS, LDVS, RDEST, ICDEST ) ELSE CALL DLACPY( 'G', MP, N, VS, LDVS, VD, LDVD ) END IF END IF * IF( MYROW.EQ.RDEST ) THEN * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my piece of the vector. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) * IF( (MYROW.NE.IRSRC).OR.(MYCOL.NE.CSRC) ) $ CALL DGERV2D( ICTXT, MQ, N, VD, LDVD, IRSRC, CSRC ) * END IF * END IF * RETURN * * End of PDCOL2ROW * END scalapack-2.0.2/TOOLS/pdelget.f000644 000766 000024 00000016307 10363532303 016411 0ustar00juliestaff000000 000000 SUBROUTINE PDELGET( SCOPE, TOP, ALPHA, A, IA, JA, DESCA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 SCOPE, TOP INTEGER IA, JA DOUBLE PRECISION ALPHA * .. * .. Array arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDELGET sets alpha to the distributed matrix entry A( IA, JA ). * The value of alpha is set according to the scope. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SCOPE (global input) CHARACTER*1 * The BLACS scope in which alpha is updated. * If SCOPE = 'R', alpha is updated only in the process row * containing A( IA, JA ), * If SCOPE = 'C', alpha is updated only in the process column * containing A( IA, JA ), * If SCOPE = 'A', alpha is updated in all the processes of the * grid, * otherwise alpha is updated only in the process containing * A( IA, JA ). * * TOP (global input) CHARACTER*1 * The topology to be used if broadcast is needed. * * ALPHA (global output) DOUBLE PRECISION, the scalar alpha. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, IIA, IOFFA, JJA, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, INFOG2L * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * ALPHA = ZERO * IF( LSAME( SCOPE, 'R' ) ) THEN IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL DGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL DGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'C' ) ) THEN IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL DGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL DGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'A' ) ) THEN IF( ( MYROW.EQ.IAROW ).AND.( MYCOL.EQ.IACOL ) ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL DGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL DGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF ELSE IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ ALPHA = A( IIA+(JJA-1)*DESCA( LLD_ ) ) END IF * RETURN * * End of PDELGET * END scalapack-2.0.2/TOOLS/pdelset.f000644 000766 000024 00000012015 10363532303 016415 0ustar00juliestaff000000 000000 SUBROUTINE PDELSET( A, IA, JA, DESCA, ALPHA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA DOUBLE PRECISION ALPHA * .. * .. Array arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDELSET sets the distributed matrix entry A( IA, JA ) to ALPHA. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ALPHA (local input) DOUBLE PRECISION * The scalar alpha. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, JJA, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ A( IIA+(JJA-1)*DESCA( LLD_ ) ) = ALPHA * RETURN * * End of PDELSET * END scalapack-2.0.2/TOOLS/pdelset2.f000644 000766 000024 00000012472 10363532303 016506 0ustar00juliestaff000000 000000 SUBROUTINE PDELSET2( ALPHA, A, IA, JA, DESCA, BETA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA DOUBLE PRECISION ALPHA, BETA * .. * .. Array arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDELSET2 sets alpha to the distributed matrix entry A(IA,JA) * and A(IA,JA) to beta. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * ALPHA (local output) DOUBLE PRECISION * The scalar alpha. * * A (local input/local ouput) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,*) containing * the local pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * BETA (local input) DOUBLE PRECISION * The scalar beta. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, IOFFA, JJA, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) ALPHA = A( IOFFA ) A( IOFFA ) = BETA ELSE ALPHA = ZERO END IF * RETURN * * End of PDELSET2 * END scalapack-2.0.2/TOOLS/pdfillpad.f000644 000766 000024 00000005604 10363532303 016722 0ustar00juliestaff000000 000000 SUBROUTINE PDFILLPAD( ICTXT, M, N, A, LDA, IPRE, IPOST, CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N DOUBLE PRECISION CHKVAL * .. * .. Array Arguments .. DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDFILLPAD surrounds a two dimensional local array with a guard- * zone initialized to the value CHKVAL. The user may later call the * routine PDCHEKPAD to discover if the guardzone has been * violated. There are three guardzones. The first is a buffer of size * IPRE that is before the start of the array. The second is the buffer * of size IPOST which is after the end of the array to be padded. * Finally, there is a guard zone inside every column of the array to * be padded, in the elements of A(M+1:LDA, J). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (local input) INTEGER * The number of rows in the local array. * * N (local input) INTEGER * The number of columns in the local array. * * A (local input/local output) DOUBLE PRECISION, array of * dimension (LDA,N). A location IPRE elements in front of * the matrix to be padded. * * LDA (local input) INTEGER * The leading Dimension of the local array to be padded. * * IPRE (local input) INTEGER * The size of the guard zone to put before the start of * padded array. * * IPOST (local input) INTEGER * The size of the guard zone to put after padded array. * * CHKVAL (local input) DOUBLE PRECISION * The value to pad matrix with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE A( I ) = CHKVAL 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PDFILLPAD' END IF * * Put check buffer in back of A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 A( I ) = CHKVAL 20 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no post-guardzone in PDFILLPAD' END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 A( I ) = CHKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * RETURN * * End of PDFILLPAD * END scalapack-2.0.2/TOOLS/pdlaprnt.f000644 000766 000024 00000027622 10363532303 016613 0ustar00juliestaff000000 000000 SUBROUTINE PDLAPRNT( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, WORK ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDLAPRNT prints to the standard output a distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and * printed by the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory to a * local array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IRPRNT (global input) INTEGER * The row index of the printing process. * * ICPRNT (global input) INTEGER * The column index of the printing process. * * CMATNM (global input) CHARACTER*(*) * Identifier of the distributed matrix to be printed. * * NOUT (global input) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. * * WORK (local workspace) DOUBLE PRECISION * Working array of minimum size equal to MB_A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L, $ DGERV2D, DGESD2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ICURROW = IAROW ICURCOL = IACOL II = IIA JJ = JJA LDA = DESCA( LLD_ ) * * Handle the first block of column separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 DO 60 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, JA+H, WORK( K ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, JA+H, WORK( K ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, J+H, A( II+K+(JJ+H-1)*LDA ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, J+H, WORK( K ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, J+H, A( II+K+(JJ+H-1)*LDA ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, J+H, WORK( K ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * 9999 FORMAT(A,'(',I6,',',I6,')=',D30.18) * RETURN * * End of PDLAPRNT * END scalapack-2.0.2/TOOLS/pdlaread.f000644 000766 000024 00000010645 10604624224 016542 0ustar00juliestaff000000 000000 SUBROUTINE PDLAREAD( FILNAM, A, DESCA, IRREAD, ICREAD, WORK ) * * -- ScaLAPACK tools routine (version 1.8) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * adapted by Julie Langou, April 2007 (julie@cs.utk.edu) * * .. Scalar Arguments .. INTEGER ICREAD, IRREAD * .. * .. Array Arguments .. CHARACTER*(*) FILNAM INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDLAREAD reads from a file named FILNAM a matrix and distribute * it to the process grid. * * Only the process of coordinates {IRREAD, ICREAD} read the file. * * WORK must be of size >= MB_ = DESCA( MB_ ). * * ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IB, ICTXT, ICURCOL, ICURROW, II, J, JB, $ JJ, K, LDA, M, MYCOL, MYROW, N, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, DGERV2D, DGESD2D, $ IGEBS2D, IGEBR2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN OPEN( NIN, FILE=FILNAM, STATUS='OLD' ) READ( NIN, FMT = * ) ( IWORK( I ), I = 1, 2 ) CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, IWORK, 2 ) ELSE CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, IWORK, 2, IRREAD, $ ICREAD ) END IF M = IWORK( 1 ) N = IWORK( 2 ) * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * IF( M.GT.DESCA( M_ ).OR. N.GT.DESCA( N_ ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( *, FMT = * ) 'PDLAREAD: Matrix too big to fit in' WRITE( *, FMT = * ) 'Abort ...' END IF CALL BLACS_ABORT( ICTXT, 0 ) END IF * II = 1 JJ = 1 ICURROW = DESCA( RSRC_ ) ICURCOL = DESCA( CSRC_ ) LDA = DESCA( LLD_ ) * * Loop over column blocks * DO 50 J = 1, N, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), N-J+1 ) DO 40 H = 0, JB-1 * * Loop over block of rows * DO 30 I = 1, M, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), M-I+1 ) IF( ICURROW.EQ.IRREAD .AND. ICURCOL.EQ.ICREAD ) THEN IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN DO 10 K = 0, IB-1 READ( NIN, FMT = * ) A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGERV2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRREAD, ICREAD ) ELSE IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN DO 20 K = 1, IB READ( NIN, FMT = * ) WORK( K ) 20 CONTINUE CALL DGESD2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) 30 CONTINUE * II = 1 ICURROW = DESCA( RSRC_ ) 40 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) * 50 CONTINUE * IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN CLOSE( NIN ) END IF * RETURN * * End of PDLAREAD * END scalapack-2.0.2/TOOLS/pdlawrite.f000644 000766 000024 00000017336 10604624224 016765 0ustar00juliestaff000000 000000 SUBROUTINE PDLAWRITE( FILNAM, M, N, A, IA, JA, DESCA, IRWRIT, $ ICWRIT, WORK ) * * -- ScaLAPACK tools routine (version 1.8) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * adapted by Julie Langou, April 2007 (julie@cs.utk.edu) * * .. Scalar Arguments .. INTEGER IA, ICWRIT, IRWRIT, JA, M, N * .. * .. Array Arguments .. CHARACTER*(*) FILNAM INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDLAWRITE writes to a file named FILNAMa distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent to and * written by the process of coordinates (IRWWRITE, ICWRIT). * * WORK must be of size >= MB_ = DESCA( MB_ ). * * ===================================================================== * * .. Parameters .. INTEGER NOUT PARAMETER ( NOUT = 13 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L, $ DGERV2D, DGESD2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN OPEN( NOUT, FILE=FILNAM, STATUS='UNKNOWN' ) WRITE( NOUT, FMT = * ) M, N END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ICURROW = IAROW ICURCOL = IACOL II = IIA JJ = JJA LDA = DESCA( LLD_ ) * * Handle the first block of column separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 DO 60 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) WORK( K ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) WORK( K ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) WORK( K ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) WORK( K ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CLOSE( NOUT ) END IF * 9999 FORMAT( D30.18 ) * RETURN * * End of PDLAWRITE * END scalapack-2.0.2/TOOLS/pdmatadd.f000644 000766 000024 00000027021 10363532303 016536 0ustar00juliestaff000000 000000 SUBROUTINE PDMATADD( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, $ JC, DESCC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IC, JA, JC, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ) * .. * * Purpose * ======= * * PDMATADD performs a distributed matrix-matrix addition * * sub( C ) := alpha * sub( A ) + beta * sub( C ), * * where sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1) and sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1). No communications are performed in this * routine, the arrays are supposed to be aligned. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrices sub( A ) and sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrices sub( A ) and * sub( C ). N >= 0. * * ALPHA (global input) DOUBLE PRECISION * The scalar ALPHA. * * A (local input) DOUBLE PRECISION pointer into the local memory * to a local array of dimension (LLD_A, LOCc(JA+N-1) ). This * array contains the local pieces of the distributed matrix * sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * BETA (global input) DOUBLE PRECISION * The scalar BETA. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * This array contains the local pieces of the distributed * matrix sub( C ). On exit, this array contains the local * pieces of the resulting distributed matrix. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICCOL, ICOFF, ICROW, IIA, $ IIC, IOFFA, IOFFC, IROFF, J, JJA, JJC, LDA, $ LDC, MP, MYCOL, MYROW, NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA(CTXT_), NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible. * IF( (M.EQ.0).OR.(N.EQ.0).OR. $ ((ALPHA.EQ.ZERO).AND.(BETA.EQ.ONE)) ) $ RETURN * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) * IROFF = MOD( IA-1, DESCA(MB_) ) ICOFF = MOD( JA-1, DESCA(NB_) ) MP = NUMROC( M+IROFF, DESCA(MB_), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA(NB_), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF LDA = DESCA(LLD_) LDC = DESCC(LLD_) * IF( NQ.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN IOFFC = IIC + (JJC-1)*LDC DO 10 I = IOFFC, IOFFC+MP-1 C( I ) = ZERO 10 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 20 I = IOFFC, IOFFC+MP-1 C( I ) = ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 20 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 30 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 30 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 40 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 40 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 50 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 50 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 60 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 60 CONTINUE END IF END IF ELSE IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN IOFFC = IIC+(JJC-1)*LDC DO 80 J = 1, NQ DO 70 I = IOFFC, IOFFC+MP-1 C( I ) = ZERO 70 CONTINUE IOFFC = IOFFC + LDC 80 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 100 J = 1, NQ DO 90 I = IOFFC, IOFFC+MP-1 C( I ) = ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 90 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 100 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 120 J = 1, NQ DO 110 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 110 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 120 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 140 J = 1, NQ DO 130 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 130 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 140 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 160 J = 1, NQ DO 150 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 150 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 160 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 180 J = 1, NQ DO 170 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 170 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 180 CONTINUE END IF END IF END IF * RETURN * * End of PDMATADD * END scalapack-2.0.2/TOOLS/pdrow2col.f000644 000766 000024 00000021625 11654631032 016702 0ustar00juliestaff000000 000000 SUBROUTINE PDROW2COL( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, $ RSRC, CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. DOUBLE PRECISION VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a row of processes, and distribute those rows over a column of * processes. This routine minimizes communication by sending all * information it has that a given process in the CDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process row consisting of Q processes, and I want to send all of that * vector that I own to a new vector distributed over P processes within * a process column, that after I find the process in RDEST that owns * the row of the vector I'm currently looking at, he will want every * ( (LCM(P,Q)/Q ) block of my vector (the block being of size NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a row. * * VS (local input) DOUBLE PRECISION * Array of dimension (LDVS,N), the block of vectors stored on * process row RSRC to be put into memory VD, and stored on * process column CDEST. * * LDVS (local input) INTEGER * The leading dimension of VS. * * VD (local output) DOUBLE PRECISION * Array of dimension (LDVD,N), on output, the contents of VD * stored on process column CDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD. * * RSRC (global input) INTEGER * The process row VS is distributed over. * * CSRC (global input) INTEGER * The process column the distributed block of vectors VS * begins on. * * RDEST (global input) INTEGER * The process row that VD begins on. * * CDEST (global input) INTEGER * The process column to distribute VD over. * * WORK (local workspace) DOUBLE PRECISION * Array, dimension (LDW). The required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mq = number of rows of VS on my process. * npcol = number of process columns * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mq/NB )/(LCM/npcol) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, II, ISTART, ICSRC, IRDEST, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGESD2D, DGERV2D, DLACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * * * .. Initialize Variables .. * ICPY = 0 * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there is no * copying required * IF( NPROW .NE. NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYROW.EQ.RSRC ) THEN * ISTART = 1 * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my first block * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) * * Loop over all possible destination processes * DO 20 K = 1, RBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL DLACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary, NOTE: will send extra if NB > JB * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL DGESD2D( ICTXT, JJ, 1, WORK, JJ, IRDEST, $ CDEST ) * * I am both source and destination, save where to start * copying from for later use * ELSE ICPY = ISTART END IF * ISTART = ISTART + NB IRDEST = MOD( IRDEST+NPCOL, NPROW ) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYCOL.EQ.CDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) * * Loop over all sending processes * DO 50 K = 1, CBLKSKIP * * If I don't already possess the required data * IF( (MYROW.NE.RSRC).OR.(MYCOL.NE.ICSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MP - ISTART + NB) / NB JJ = ((NBLOCKS+CBLKSKIP-1) / CBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL DGERV2D( ICTXT, JJ, N, WORK, JJ, RSRC, ICSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL DLACPY( 'G', JB, N, WORK(JJ), JB, VD(II,1), $ LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL DLACPY( 'G', JB, N, VS(JJ,1), LDVS, VD(II,1), $ LDVD ) JJ = JJ + NB*RBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB ICSRC = MOD( ICSRC+NPROW, NPCOL ) 50 CONTINUE END IF * * if NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYROW.EQ.RSRC ) THEN * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my piece of the vector * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN CALL DGESD2D( ICTXT, MQ, N, VS, LDVS, IRDEST, CDEST ) ELSE CALL DLACPY( 'G', MQ, N, VS, LDVS, VD, LDVD ) END IF END IF IF( MYCOL.EQ.CDEST ) THEN * * Figure my distance from RDEST: the process in RSRC the same * distance from CSRC will have my piece of the vector * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) IF( (MYCOL.NE.ICSRC).OR.(MYROW.NE. RSRC) ) $ CALL DGERV2D( ICTXT, MP, N, VD, LDVD, RSRC, ICSRC ) END IF END IF * RETURN * * End of PDROW2COL * END scalapack-2.0.2/TOOLS/pdtreecomb.f000644 000766 000024 00000022345 11654631032 017113 0ustar00juliestaff000000 000000 SUBROUTINE PDTREECOMB( ICTXT, SCOPE, N, MINE, RDEST0, CDEST0, $ SUBPTR ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SCOPE INTEGER CDEST0, ICTXT, N, RDEST0 * .. * .. Array Arguments .. DOUBLE PRECISION MINE( * ) * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PDTREECOMB does a 1-tree parallel combine operation on scalars, * using the subroutine indicated by SUBPTR to perform the required * computation. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * SCOPE (global input) CHARACTER * The scope of the operation: 'Rowwise', 'Columnwise', or * 'All'. * * N (global input) INTEGER * The number of elements in MINE. N = 1 for the norm-2 * computation and 2 for the sum of square. * * MINE (local input/global output) DOUBLE PRECISION array of * dimension at least equal to N. The local data to use in the * combine. * * RDEST0 (global input) INTEGER * The process row to receive the answer. If RDEST0 = -1, * every process in the scope gets the answer. * * CDEST0 (global input) INTEGER * The process column to receive the answer. If CDEST0 = -1, * every process in the scope gets the answer. * * SUBPTR (local input) Pointer to the subroutine to call to perform * the required combine. * * ===================================================================== * * .. Local Scalars .. LOGICAL BCAST, RSCOPE, CSCOPE INTEGER CMSSG, DEST, DIST, HISDIST, I, IAM, MYCOL, $ MYROW, MYDIST, MYDIST2, NP, NPCOL, NPROW, $ RMSSG, TCDEST, TRDEST * .. * .. Local Arrays .. DOUBLE PRECISION HIS( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * DEST = 0 * * See if everyone wants the answer (need to broadcast the answer) * BCAST = ( ( RDEST0.EQ.-1 ).OR.( CDEST0.EQ.-1 ) ) IF( BCAST ) THEN TRDEST = 0 TCDEST = 0 ELSE TRDEST = RDEST0 TCDEST = CDEST0 END IF * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure scope-dependant variables, or report illegal scope * RSCOPE = LSAME( SCOPE, 'R' ) CSCOPE = LSAME( SCOPE, 'C' ) * IF( RSCOPE ) THEN IF( BCAST ) THEN TRDEST = MYROW ELSE IF( MYROW.NE.TRDEST ) THEN RETURN END IF NP = NPCOL MYDIST = MOD( NPCOL + MYCOL - TCDEST, NPCOL ) ELSE IF( CSCOPE ) THEN IF( BCAST ) THEN TCDEST = MYCOL ELSE IF( MYCOL.NE.TCDEST ) THEN RETURN END IF NP = NPROW MYDIST = MOD( NPROW + MYROW - TRDEST, NPROW ) ELSE IF( LSAME( SCOPE, 'A' ) ) THEN NP = NPROW * NPCOL IAM = MYROW*NPCOL + MYCOL DEST = TRDEST*NPCOL + TCDEST MYDIST = MOD( NP + IAM - DEST, NP ) ELSE RETURN END IF * IF( NP.LT.2 ) $ RETURN * MYDIST2 = MYDIST RMSSG = MYROW CMSSG = MYCOL I = 1 * 10 CONTINUE * IF( MOD( MYDIST, 2 ).NE.0 ) THEN * * If I am process that sends information * DIST = I * ( MYDIST - MOD( MYDIST, 2 ) ) * * Figure coordinates of dest of message * IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) END IF * CALL DGESD2D( ICTXT, N, 1, MINE, N, RMSSG, CMSSG ) * GO TO 20 * ELSE * * If I am a process receiving information, figure coordinates * of source of message * DIST = MYDIST2 + I IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) HISDIST = MOD( NP + CMSSG - TCDEST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) HISDIST = MOD( NP + RMSSG - TRDEST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) HISDIST = MOD( NP + RMSSG*NPCOL+CMSSG - DEST, NP ) END IF * IF( MYDIST2.LT.HISDIST ) THEN * * If I have anyone sending to me * CALL DGERV2D( ICTXT, N, 1, HIS, N, RMSSG, CMSSG ) CALL SUBPTR( MINE, HIS ) * END IF MYDIST = MYDIST / 2 * END IF I = I * 2 * IF( I.LT.NP ) $ GO TO 10 * 20 CONTINUE * IF( BCAST ) THEN IF( MYDIST2.EQ.0 ) THEN CALL DGEBS2D( ICTXT, SCOPE, ' ', N, 1, MINE, N ) ELSE CALL DGEBR2D( ICTXT, SCOPE, ' ', N, 1, MINE, N, $ TRDEST, TCDEST ) END IF END IF * RETURN * * End of PDTREECOMB * END * SUBROUTINE DCOMBAMAX( V1, V2 ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. DOUBLE PRECISION V1( 2 ), V2( 2 ) * .. * * Purpose * ======= * * DCOMBAMAX finds the element having max. absolute value as well * as its corresponding globl index. * * Arguments * ========= * * V1 (local input/local output) DOUBLE PRECISION array of * dimension 2. The first maximum absolute value element and * its global index. V1(1) = AMAX, V1(2) = INDX. * * V2 (local input) DOUBLE PRECISION array of dimension 2. * The second maximum absolute value element and its global * index. V2(1) = AMAX, V2(2) = INDX. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * IF( ABS( V1( 1 ) ).LT.ABS( V2( 1 ) ) ) THEN V1( 1 ) = V2( 1 ) V1( 2 ) = V2( 2 ) END IF * RETURN * * End of DCOMBAMAX * END * SUBROUTINE DCOMBSSQ( V1, V2 ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. DOUBLE PRECISION V1( 2 ), V2( 2 ) * .. * * Purpose * ======= * * DCOMBSSQ does a scaled sum of squares on two scalars. * * Arguments * ========= * * V1 (local input/local output) DOUBLE PRECISION array of * dimension 2. The first scaled sum. V1(1) = SCALE, * V1(2) = SUMSQ. * * V2 (local input) DOUBLE PRECISION array of dimension 2. * The second scaled sum. V2(1) = SCALE, V2(2) = SUMSQ. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Executable Statements .. * IF( V1( 1 ).GE.V2( 1 ) ) THEN IF( V1( 1 ).NE.ZERO ) $ V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 ) ELSE V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 ) V1( 1 ) = V2( 1 ) END IF * RETURN * * End of DCOMBSSQ * END * SUBROUTINE DCOMBNRM2( X, Y ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. * * Purpose * ======= * * DCOMBNRM2 combines local norm 2 results, taking care not to cause * unnecessary overflow. * * Arguments * ========= * * X (local input) DOUBLE PRECISION * Y (local input) DOUBLE PRECISION * X and Y specify the values x and y. X and Y are supposed to * be >= 0. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION W, Z * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT * .. * .. Executable Statements .. * W = MAX( X, Y ) Z = MIN( X, Y ) * IF( Z.EQ.ZERO ) THEN X = W ELSE X = W*SQRT( ONE+( Z / W )**2 ) END IF * RETURN * * End of DCOMBNRM2 * END scalapack-2.0.2/TOOLS/pichekpad.f000644 000766 000024 00000012105 10363532303 016705 0ustar00juliestaff000000 000000 SUBROUTINE PICHEKPAD( ICTXT, MESS, M, N, A, LDA, IPRE, IPOST, $ CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N INTEGER CHKVAL * .. * .. Array Arguments .. CHARACTER MESS*(*) INTEGER A( * ) * .. * * Purpose * ======= * * PICHEKPAD checks that the padding around a local array has not * been overwritten since the call to PIFILLPAD. 3 types of errors * are reported: * * 1) Overwrite in pre-guardzone. This indicates a memory overwrite has * occurred in the first IPRE elements which form a buffer before the * beginning of A. Therefore, the error message: * 'Overwrite in pre-guardzone: loc( 5) = 18.00000' * tells you that the 5th element of the IPRE long buffer has been * overwritten with the value 18, where it should still have the value * of CHKVAL. * * 2) Overwrite in post-guardzone. This indicates a memory overwrite has * occurred in the last IPOST elements which form a buffer after the end * of A. Error reports are refered from the end of A. Therefore, * 'Overwrite in post-guardzone: loc( 19) = 24.00000' * tells you that the 19th element after the end of A was overwritten * with the value 24, where it should still have the value of CHKVAL. * * 3) Overwrite in lda-m gap. Tells you elements between M and LDA were * overwritten. So, * 'Overwrite in lda-m gap: A( 12, 3) = 22.00000' * tells you that the element at the 12th row and 3rd column of A was * overwritten with the value of 22, where it should still have the * value of CHKVAL. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * MESS (local input) CHARACTER*(*) * String containing a user-defined message. * * M (local input) INTEGER * The number of rows in the local array A. * * N (input) INTEGER * The number of columns in the local array A. * * A (local input) @(typec) array of dimension (LDA,N). * A location IPRE elements in front of the array to be checked. * * LDA (local input) INTEGER * The leading Dimension of the local array to be checked. * * IPRE (local input) INTEGER * The size of the guard zone before the start of padded array. * * IPOST (local input) INTEGER * The size of guard zone after the padded array. * * CHKVAL (local input) @(typec) * The value the local array was padded with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, IAM, IDUMM, INFO, J, K, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW*NPCOL + MYCOL INFO = -1 * * Check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, ' pre', I, $ A( I ) INFO = IAM END IF 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PICHEKPAD' END IF * * Check buffer after A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, 'post', $ I-J+1, A( I ) INFO = IAM END IF 20 CONTINUE ELSE WRITE( *, FMT = * ) $ 'WARNING no post-guardzone buffer in PICHEKPAD' END IF * * Check all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9997 ) MYROW, MYCOL, MESS, $ I-IPRE-LDA*(J-1), J, A( I ) INFO = IAM END IF 30 CONTINUE K = K + LDA 40 CONTINUE END IF * CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUMM, IDUMM, -1, $ 0, 0 ) IF( IAM.EQ.0 .AND. INFO.GE.0 ) THEN WRITE( *, FMT = 9999 ) INFO / NPCOL, MOD( INFO, NPCOL ), MESS END IF * 9999 FORMAT( '{', I5, ',', I5, '}: Memory overwrite in ', A ) 9998 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ A4, '-guardzone: loc(', I3, ') = ', I8 ) 9997 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ 'lda-m gap: loc(', I3, ',', I3, ') = ', I8 ) * RETURN * * End of PICHEKPAD * END scalapack-2.0.2/TOOLS/picol2row.f000644 000766 000024 00000021651 11654631032 016706 0ustar00juliestaff000000 000000 SUBROUTINE PICOL2ROW( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, RSRC, $ CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. INTEGER VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a column of processes, and distribute those rows over a row of * processes. This routine minimizes communication by sending all * information it has that a given process in the RDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process column consisting of P processes, and I want to send all of * that vector that I own to a new vector distributed over Q processes * within a process row, that after I find the process in RDEST that * owns the row of the vector I'm currently looking at, he will want * every ( (LCM(P,Q) / P ) block of my vector (the block being of size * NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block. * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a column. * * VS (local input) @(typec) * Array of dimension (LDVS,N), the block of vectors stored on * process column CSRC to be put into memory VD, and stored * on process row RDEST. * * LDVS (local input) INTEGER * The leading dimension of VS, LDVS >= MAX( 1, MP ). * * VD (local output) @(typec) * Array of dimension (LDVD,N), on output, the contents of VS * stored on process row RDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD, LDVD >= MAX( 1, MQ ). * * RSRC (global input) INTEGER * The process row the distributed block of vectors VS begins * on. * * CSRC (global input) INTEGER * The process column VS is distributed over. * * RDEST (global input) INTEGER * The process row to distribute VD over. * * CDEST (global input) INTEGER * The process column that VD begins on. * * WORK (local workspace) @(typec) * Array of dimension (LDW), the required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mp = number of rows of VS on my process. * nprow = number of process rows * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mp/NB )/(LCM/nprow) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, ICDEST, II, IRSRC, ISTART, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGESD2D, IGERV2D, ILACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * * .. Initialize Variables .. * ICPY = 0 * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there * is no copying required * IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYCOL.EQ.CSRC ) THEN * ISTART = 1 * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my first block * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * * Loop over all possible destination processes * DO 20 K = 1, CBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MP, NB*CBLKSKIP JB = MIN(NB, MP-II+1) CALL ILACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary (NOTE: will send extra if NB > JB) * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL IGESD2D( ICTXT, JJ, 1, WORK, JJ, RDEST, $ ICDEST ) * ELSE * * I am both source and destination, save where to start * copying from for later use. * ICPY = ISTART END IF * ISTART = ISTART + NB ICDEST = MOD(ICDEST+NPROW, NPCOL) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYROW.EQ.RDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) DO 50 K = 1, RBLKSKIP * * If I don't already possess the required data * IF( (MYCOL.NE.CSRC).OR.(MYROW.NE.IRSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MQ - ISTART + NB) / NB JJ = ((NBLOCKS+RBLKSKIP-1) / RBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL IGERV2D( ICTXT, JJ, N, WORK, JJ, IRSRC, CSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL ILACPY( 'G', JB, N, WORK(JJ), JB, $ VD(II,1), LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL ILACPY( 'G', JB, N, VS(JJ,1), LDVS, $ VD(II,1), LDVD ) JJ = JJ + NB*CBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB IRSRC = MOD( IRSRC+NPCOL, NPROW ) 50 CONTINUE END IF * * If NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYCOL.EQ.CSRC ) THEN * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my piece of the vector. * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN CALL IGESD2D( ICTXT, MP, N, VS, LDVS, RDEST, ICDEST ) ELSE CALL ILACPY( 'G', MP, N, VS, LDVS, VD, LDVD ) END IF END IF * IF( MYROW.EQ.RDEST ) THEN * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my piece of the vector. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) * IF( (MYROW.NE.IRSRC).OR.(MYCOL.NE.CSRC) ) $ CALL IGERV2D( ICTXT, MQ, N, VD, LDVD, IRSRC, CSRC ) * END IF * END IF * RETURN * * End of PICOL2ROW * END scalapack-2.0.2/TOOLS/pielget.f000644 000766 000024 00000016202 10363532303 016410 0ustar00juliestaff000000 000000 SUBROUTINE PIELGET( SCOPE, TOP, ALPHA, A, IA, JA, DESCA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 SCOPE, TOP INTEGER ALPHA, IA, JA * .. * .. Array arguments .. INTEGER A( * ), DESCA( * ) * .. * * Purpose * ======= * * PIELGET sets alpha to the distributed matrix entry A( IA, JA ). * The value of alpha is set according to the scope. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SCOPE (global input) CHARACTER*1 * The BLACS scope in which alpha is updated. * If SCOPE = 'R', alpha is updated only in the process row * containing A( IA, JA ), * If SCOPE = 'C', alpha is updated only in the process column * containing A( IA, JA ), * If SCOPE = 'A', alpha is updated in all the processes of the * grid, * otherwise alpha is updated only in the process containing * A( IA, JA ). * * TOP (global input) CHARACTER*1 * The topology to be used if broadcast is needed. * * ALPHA (global output) @(typec), the scalar alpha. * * A (local input) @(typec) pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ZERO PARAMETER ( ZERO = 0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, IIA, IOFFA, JJA, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, INFOG2L * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * ALPHA = ZERO * IF( LSAME( SCOPE, 'R' ) ) THEN IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL IGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL IGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'C' ) ) THEN IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL IGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL IGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'A' ) ) THEN IF( ( MYROW.EQ.IAROW ).AND.( MYCOL.EQ.IACOL ) ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL IGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL IGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF ELSE IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ ALPHA = A( IIA+(JJA-1)*DESCA( LLD_ ) ) END IF * RETURN * * End of PIELGET * END scalapack-2.0.2/TOOLS/pielset.f000644 000766 000024 00000011715 10363532303 016430 0ustar00juliestaff000000 000000 SUBROUTINE PIELSET( A, IA, JA, DESCA, ALPHA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ALPHA, IA, JA * .. * .. Array arguments .. INTEGER A( * ), DESCA( * ) * .. * * Purpose * ======= * * PIELSET sets the distributed matrix entry A( IA, JA ) to ALPHA. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (local output) @(typec) pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ALPHA (local input) @(typec) * The scalar alpha. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, JJA, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ A( IIA+(JJA-1)*DESCA( LLD_ ) ) = ALPHA * RETURN * * End of PIELSET * END scalapack-2.0.2/TOOLS/pielset2.f000644 000766 000024 00000012355 10363532303 016513 0ustar00juliestaff000000 000000 SUBROUTINE PIELSET2( ALPHA, A, IA, JA, DESCA, BETA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ALPHA, BETA, IA, JA * .. * .. Array arguments .. INTEGER A( * ), DESCA( * ) * .. * * Purpose * ======= * * PIELSET2 sets alpha to the distributed matrix entry A(IA,JA) * and A(IA,JA) to beta. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * ALPHA (local output) @(typec) * The scalar alpha. * * A (local input/local ouput) @(typec) pointer into the * local memory to an array of dimension (LLD_A,*) containing * the local pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * BETA (local input) @(typec) * The scalar beta. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ZERO PARAMETER ( ZERO = 0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, IOFFA, JJA, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) ALPHA = A( IOFFA ) A( IOFFA ) = BETA ELSE ALPHA = ZERO END IF * RETURN * * End of PIELSET2 * END scalapack-2.0.2/TOOLS/pifillpad.f000644 000766 000024 00000005564 10363532303 016734 0ustar00juliestaff000000 000000 SUBROUTINE PIFILLPAD( ICTXT, M, N, A, LDA, IPRE, IPOST, CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N INTEGER CHKVAL * .. * .. Array Arguments .. INTEGER A( * ) * .. * * Purpose * ======= * * PIFILLPAD surrounds a two dimensional local array with a guard- * zone initialized to the value CHKVAL. The user may later call the * routine PICHEKPAD to discover if the guardzone has been * violated. There are three guardzones. The first is a buffer of size * IPRE that is before the start of the array. The second is the buffer * of size IPOST which is after the end of the array to be padded. * Finally, there is a guard zone inside every column of the array to * be padded, in the elements of A(M+1:LDA, J). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (local input) INTEGER * The number of rows in the local array. * * N (local input) INTEGER * The number of columns in the local array. * * A (local input/local output) @(typec), array of * dimension (LDA,N). A location IPRE elements in front of * the matrix to be padded. * * LDA (local input) INTEGER * The leading Dimension of the local array to be padded. * * IPRE (local input) INTEGER * The size of the guard zone to put before the start of * padded array. * * IPOST (local input) INTEGER * The size of the guard zone to put after padded array. * * CHKVAL (local input) @(typec) * The value to pad matrix with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE A( I ) = CHKVAL 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PIFILLPAD' END IF * * Put check buffer in back of A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 A( I ) = CHKVAL 20 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no post-guardzone in PIFILLPAD' END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 A( I ) = CHKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * RETURN * * End of PIFILLPAD * END scalapack-2.0.2/TOOLS/pilaprnt.f000644 000766 000024 00000027576 10363532303 016630 0ustar00juliestaff000000 000000 SUBROUTINE PILAPRNT( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, WORK ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) INTEGER A( * ), WORK( * ) * .. * * Purpose * ======= * * PILAPRNT prints to the standard output a distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and * printed by the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) @(typec) pointer into the local memory to a * local array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IRPRNT (global input) INTEGER * The row index of the printing process. * * ICPRNT (global input) INTEGER * The column index of the printing process. * * CMATNM (global input) CHARACTER*(*) * Identifier of the distributed matrix to be printed. * * NOUT (global input) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. * * WORK (local workspace) @(typec) * Working array of minimum size equal to MB_A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L, $ IGERV2D, IGESD2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ICURROW = IAROW ICURCOL = IACOL II = IIA JJ = JJA LDA = DESCA( LLD_ ) * * Handle the first block of column separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 DO 60 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL IGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL IGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, JA+H, WORK( K ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL IGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL IGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, JA+H, WORK( K ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, J+H, A( II+K+(JJ+H-1)*LDA ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL IGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL IGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, J+H, WORK( K ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, J+H, A( II+K+(JJ+H-1)*LDA ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL IGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL IGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, J+H, WORK( K ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * 9999 FORMAT(A,'(',I6,',',I6,')=',I8) * RETURN * * End of PILAPRNT * END scalapack-2.0.2/TOOLS/pirow2col.f000644 000766 000024 00000021575 11654631032 016713 0ustar00juliestaff000000 000000 SUBROUTINE PIROW2COL( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, $ RSRC, CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. INTEGER VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a row of processes, and distribute those rows over a column of * processes. This routine minimizes communication by sending all * information it has that a given process in the CDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process row consisting of Q processes, and I want to send all of that * vector that I own to a new vector distributed over P processes within * a process column, that after I find the process in RDEST that owns * the row of the vector I'm currently looking at, he will want every * ( (LCM(P,Q)/Q ) block of my vector (the block being of size NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a row. * * VS (local input) @(typec) * Array of dimension (LDVS,N), the block of vectors stored on * process row RSRC to be put into memory VD, and stored on * process column CDEST. * * LDVS (local input) INTEGER * The leading dimension of VS. * * VD (local output) @(typec) * Array of dimension (LDVD,N), on output, the contents of VD * stored on process column CDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD. * * RSRC (global input) INTEGER * The process row VS is distributed over. * * CSRC (global input) INTEGER * The process column the distributed block of vectors VS * begins on. * * RDEST (global input) INTEGER * The process row that VD begins on. * * CDEST (global input) INTEGER * The process column to distribute VD over. * * WORK (local workspace) @(typec) * Array, dimension (LDW). The required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mq = number of rows of VS on my process. * npcol = number of process columns * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mq/NB )/(LCM/npcol) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, II, ISTART, ICSRC, IRDEST, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGESD2D, IGERV2D, ILACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * * * .. Initialize Variables .. * ICPY = 0 * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there is no * copying required * IF( NPROW .NE. NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYROW.EQ.RSRC ) THEN * ISTART = 1 * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my first block * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) * * Loop over all possible destination processes * DO 20 K = 1, RBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL ILACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary, NOTE: will send extra if NB > JB * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL IGESD2D( ICTXT, JJ, 1, WORK, JJ, IRDEST, $ CDEST ) * * I am both source and destination, save where to start * copying from for later use * ELSE ICPY = ISTART END IF * ISTART = ISTART + NB IRDEST = MOD( IRDEST+NPCOL, NPROW ) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYCOL.EQ.CDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) * * Loop over all sending processes * DO 50 K = 1, CBLKSKIP * * If I don't already possess the required data * IF( (MYROW.NE.RSRC).OR.(MYCOL.NE.ICSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MP - ISTART + NB) / NB JJ = ((NBLOCKS+CBLKSKIP-1) / CBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL IGERV2D( ICTXT, JJ, N, WORK, JJ, RSRC, ICSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL ILACPY( 'G', JB, N, WORK(JJ), JB, VD(II,1), $ LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL ILACPY( 'G', JB, N, VS(JJ,1), LDVS, VD(II,1), $ LDVD ) JJ = JJ + NB*RBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB ICSRC = MOD( ICSRC+NPROW, NPCOL ) 50 CONTINUE END IF * * if NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYROW.EQ.RSRC ) THEN * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my piece of the vector * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN CALL IGESD2D( ICTXT, MQ, N, VS, LDVS, IRDEST, CDEST ) ELSE CALL ILACPY( 'G', MQ, N, VS, LDVS, VD, LDVD ) END IF END IF IF( MYCOL.EQ.CDEST ) THEN * * Figure my distance from RDEST: the process in RSRC the same * distance from CSRC will have my piece of the vector * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) IF( (MYCOL.NE.ICSRC).OR.(MYROW.NE. RSRC) ) $ CALL IGERV2D( ICTXT, MP, N, VD, LDVD, RSRC, ICSRC ) END IF END IF * RETURN * * End of PIROW2COL * END scalapack-2.0.2/TOOLS/pitreecomb.f000644 000766 000024 00000013254 11654631032 017117 0ustar00juliestaff000000 000000 SUBROUTINE PITREECOMB( ICTXT, SCOPE, N, MINE, RDEST0, CDEST0, $ SUBPTR ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SCOPE INTEGER CDEST0, ICTXT, N, RDEST0 * .. * .. Array Arguments .. INTEGER MINE( * ) * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PITREECOMB does a 1-tree parallel combine operation on scalars, * using the subroutine indicated by SUBPTR to perform the required * computation. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * SCOPE (global input) CHARACTER * The scope of the operation: 'Rowwise', 'Columnwise', or * 'All'. * * N (global input) INTEGER * The number of elements in MINE. N = 1 for the norm-2 * computation and 2 for the sum of square. * * MINE (local input/global output) @(typec) array of * dimension at least equal to N. The local data to use in the * combine. * * RDEST0 (global input) INTEGER * The process row to receive the answer. If RDEST0 = -1, * every process in the scope gets the answer. * * CDEST0 (global input) INTEGER * The process column to receive the answer. If CDEST0 = -1, * every process in the scope gets the answer. * * SUBPTR (local input) Pointer to the subroutine to call to perform * the required combine. * * ===================================================================== * * .. Local Scalars .. LOGICAL BCAST, RSCOPE, CSCOPE INTEGER CMSSG, DEST, DIST, HISDIST, I, IAM, MYCOL, $ MYROW, MYDIST, MYDIST2, NP, NPCOL, NPROW, $ RMSSG, TCDEST, TRDEST * .. * .. Local Arrays .. INTEGER HIS( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, $ IGERV2D, IGESD2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * DEST = 0 * * See if everyone wants the answer (need to broadcast the answer) * BCAST = ( ( RDEST0.EQ.-1 ).OR.( CDEST0.EQ.-1 ) ) IF( BCAST ) THEN TRDEST = 0 TCDEST = 0 ELSE TRDEST = RDEST0 TCDEST = CDEST0 END IF * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure scope-dependant variables, or report illegal scope * RSCOPE = LSAME( SCOPE, 'R' ) CSCOPE = LSAME( SCOPE, 'C' ) * IF( RSCOPE ) THEN IF( BCAST ) THEN TRDEST = MYROW ELSE IF( MYROW.NE.TRDEST ) THEN RETURN END IF NP = NPCOL MYDIST = MOD( NPCOL + MYCOL - TCDEST, NPCOL ) ELSE IF( CSCOPE ) THEN IF( BCAST ) THEN TCDEST = MYCOL ELSE IF( MYCOL.NE.TCDEST ) THEN RETURN END IF NP = NPROW MYDIST = MOD( NPROW + MYROW - TRDEST, NPROW ) ELSE IF( LSAME( SCOPE, 'A' ) ) THEN NP = NPROW * NPCOL IAM = MYROW*NPCOL + MYCOL DEST = TRDEST*NPCOL + TCDEST MYDIST = MOD( NP + IAM - DEST, NP ) ELSE RETURN END IF * IF( NP.LT.2 ) $ RETURN * MYDIST2 = MYDIST RMSSG = MYROW CMSSG = MYCOL I = 1 * 10 CONTINUE * IF( MOD( MYDIST, 2 ).NE.0 ) THEN * * If I am process that sends information * DIST = I * ( MYDIST - MOD( MYDIST, 2 ) ) * * Figure coordinates of dest of message * IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) END IF * CALL IGESD2D( ICTXT, N, 1, MINE, N, RMSSG, CMSSG ) * GO TO 20 * ELSE * * If I am a process receiving information, figure coordinates * of source of message * DIST = MYDIST2 + I IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) HISDIST = MOD( NP + CMSSG - TCDEST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) HISDIST = MOD( NP + RMSSG - TRDEST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) HISDIST = MOD( NP + RMSSG*NPCOL+CMSSG - DEST, NP ) END IF * IF( MYDIST2.LT.HISDIST ) THEN * * If I have anyone sending to me * CALL IGERV2D( ICTXT, N, 1, HIS, N, RMSSG, CMSSG ) CALL SUBPTR( MINE, HIS ) * END IF MYDIST = MYDIST / 2 * END IF I = I * 2 * IF( I.LT.NP ) $ GO TO 10 * 20 CONTINUE * IF( BCAST ) THEN IF( MYDIST2.EQ.0 ) THEN CALL IGEBS2D( ICTXT, SCOPE, ' ', N, 1, MINE, N ) ELSE CALL IGEBR2D( ICTXT, SCOPE, ' ', N, 1, MINE, N, $ TRDEST, TCDEST ) END IF END IF * RETURN * * End of PITREECOMB * END scalapack-2.0.2/TOOLS/pschekpad.f000644 000766 000024 00000012103 10363532303 016715 0ustar00juliestaff000000 000000 SUBROUTINE PSCHEKPAD( ICTXT, MESS, M, N, A, LDA, IPRE, IPOST, $ CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N REAL CHKVAL * .. * .. Array Arguments .. CHARACTER MESS*(*) REAL A( * ) * .. * * Purpose * ======= * * PSCHEKPAD checks that the padding around a local array has not * been overwritten since the call to PSFILLPAD. 3 types of errors * are reported: * * 1) Overwrite in pre-guardzone. This indicates a memory overwrite has * occurred in the first IPRE elements which form a buffer before the * beginning of A. Therefore, the error message: * 'Overwrite in pre-guardzone: loc( 5) = 18.00000' * tells you that the 5th element of the IPRE long buffer has been * overwritten with the value 18, where it should still have the value * of CHKVAL. * * 2) Overwrite in post-guardzone. This indicates a memory overwrite has * occurred in the last IPOST elements which form a buffer after the end * of A. Error reports are refered from the end of A. Therefore, * 'Overwrite in post-guardzone: loc( 19) = 24.00000' * tells you that the 19th element after the end of A was overwritten * with the value 24, where it should still have the value of CHKVAL. * * 3) Overwrite in lda-m gap. Tells you elements between M and LDA were * overwritten. So, * 'Overwrite in lda-m gap: A( 12, 3) = 22.00000' * tells you that the element at the 12th row and 3rd column of A was * overwritten with the value of 22, where it should still have the * value of CHKVAL. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * MESS (local input) CHARACTER*(*) * String containing a user-defined message. * * M (local input) INTEGER * The number of rows in the local array A. * * N (input) INTEGER * The number of columns in the local array A. * * A (local input) REAL array of dimension (LDA,N). * A location IPRE elements in front of the array to be checked. * * LDA (local input) INTEGER * The leading Dimension of the local array to be checked. * * IPRE (local input) INTEGER * The size of the guard zone before the start of padded array. * * IPOST (local input) INTEGER * The size of guard zone after the padded array. * * CHKVAL (local input) REAL * The value the local array was padded with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, IAM, IDUMM, INFO, J, K, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW*NPCOL + MYCOL INFO = -1 * * Check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, ' pre', I, $ A( I ) INFO = IAM END IF 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PSCHEKPAD' END IF * * Check buffer after A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, 'post', $ I-J+1, A( I ) INFO = IAM END IF 20 CONTINUE ELSE WRITE( *, FMT = * ) $ 'WARNING no post-guardzone buffer in PSCHEKPAD' END IF * * Check all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9997 ) MYROW, MYCOL, MESS, $ I-IPRE-LDA*(J-1), J, A( I ) INFO = IAM END IF 30 CONTINUE K = K + LDA 40 CONTINUE END IF * CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUMM, IDUMM, -1, $ 0, 0 ) IF( IAM.EQ.0 .AND. INFO.GE.0 ) THEN WRITE( *, FMT = 9999 ) INFO / NPCOL, MOD( INFO, NPCOL ), MESS END IF * 9999 FORMAT( '{', I5, ',', I5, '}: Memory overwrite in ', A ) 9998 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ A4, '-guardzone: loc(', I3, ') = ', G11.4 ) 9997 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ 'lda-m gap: loc(', I3, ',', I3, ') = ', G11.4 ) * RETURN * * End of PSCHEKPAD * END scalapack-2.0.2/TOOLS/pscol2row.f000644 000766 000024 00000021645 11654631032 016723 0ustar00juliestaff000000 000000 SUBROUTINE PSCOL2ROW( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, RSRC, $ CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. REAL VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a column of processes, and distribute those rows over a row of * processes. This routine minimizes communication by sending all * information it has that a given process in the RDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process column consisting of P processes, and I want to send all of * that vector that I own to a new vector distributed over Q processes * within a process row, that after I find the process in RDEST that * owns the row of the vector I'm currently looking at, he will want * every ( (LCM(P,Q) / P ) block of my vector (the block being of size * NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block. * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a column. * * VS (local input) REAL * Array of dimension (LDVS,N), the block of vectors stored on * process column CSRC to be put into memory VD, and stored * on process row RDEST. * * LDVS (local input) INTEGER * The leading dimension of VS, LDVS >= MAX( 1, MP ). * * VD (local output) REAL * Array of dimension (LDVD,N), on output, the contents of VS * stored on process row RDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD, LDVD >= MAX( 1, MQ ). * * RSRC (global input) INTEGER * The process row the distributed block of vectors VS begins * on. * * CSRC (global input) INTEGER * The process column VS is distributed over. * * RDEST (global input) INTEGER * The process row to distribute VD over. * * CDEST (global input) INTEGER * The process column that VD begins on. * * WORK (local workspace) REAL * Array of dimension (LDW), the required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mp = number of rows of VS on my process. * nprow = number of process rows * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mp/NB )/(LCM/nprow) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, ICDEST, II, IRSRC, ISTART, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGESD2D, SGERV2D, SLACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * * * .. Initialize Variables .. * ICPY = 0 * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there * is no copying required * IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYCOL.EQ.CSRC ) THEN * ISTART = 1 * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my first block * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * * Loop over all possible destination processes * DO 20 K = 1, CBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MP, NB*CBLKSKIP JB = MIN(NB, MP-II+1) CALL SLACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary (NOTE: will send extra if NB > JB) * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL SGESD2D( ICTXT, JJ, 1, WORK, JJ, RDEST, $ ICDEST ) * ELSE * * I am both source and destination, save where to start * copying from for later use. * ICPY = ISTART END IF * ISTART = ISTART + NB ICDEST = MOD(ICDEST+NPROW, NPCOL) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYROW.EQ.RDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) DO 50 K = 1, RBLKSKIP * * If I don't already possess the required data * IF( (MYCOL.NE.CSRC).OR.(MYROW.NE.IRSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MQ - ISTART + NB) / NB JJ = ((NBLOCKS+RBLKSKIP-1) / RBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL SGERV2D( ICTXT, JJ, N, WORK, JJ, IRSRC, CSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL SLACPY( 'G', JB, N, WORK(JJ), JB, $ VD(II,1), LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL SLACPY( 'G', JB, N, VS(JJ,1), LDVS, $ VD(II,1), LDVD ) JJ = JJ + NB*CBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB IRSRC = MOD( IRSRC+NPCOL, NPROW ) 50 CONTINUE END IF * * If NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYCOL.EQ.CSRC ) THEN * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my piece of the vector. * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN CALL SGESD2D( ICTXT, MP, N, VS, LDVS, RDEST, ICDEST ) ELSE CALL SLACPY( 'G', MP, N, VS, LDVS, VD, LDVD ) END IF END IF * IF( MYROW.EQ.RDEST ) THEN * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my piece of the vector. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) * IF( (MYROW.NE.IRSRC).OR.(MYCOL.NE.CSRC) ) $ CALL SGERV2D( ICTXT, MQ, N, VD, LDVD, IRSRC, CSRC ) * END IF * END IF * RETURN * * End of PSCOL2ROW * END scalapack-2.0.2/TOOLS/pselget.f000644 000766 000024 00000016257 10363532303 016434 0ustar00juliestaff000000 000000 SUBROUTINE PSELGET( SCOPE, TOP, ALPHA, A, IA, JA, DESCA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 SCOPE, TOP INTEGER IA, JA REAL ALPHA * .. * .. Array arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSELGET sets alpha to the distributed matrix entry A( IA, JA ). * The value of alpha is set according to the scope. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SCOPE (global input) CHARACTER*1 * The BLACS scope in which alpha is updated. * If SCOPE = 'R', alpha is updated only in the process row * containing A( IA, JA ), * If SCOPE = 'C', alpha is updated only in the process column * containing A( IA, JA ), * If SCOPE = 'A', alpha is updated in all the processes of the * grid, * otherwise alpha is updated only in the process containing * A( IA, JA ). * * TOP (global input) CHARACTER*1 * The topology to be used if broadcast is needed. * * ALPHA (global output) REAL, the scalar alpha. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, IIA, IOFFA, JJA, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * ALPHA = ZERO * IF( LSAME( SCOPE, 'R' ) ) THEN IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL SGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL SGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'C' ) ) THEN IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL SGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL SGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'A' ) ) THEN IF( ( MYROW.EQ.IAROW ).AND.( MYCOL.EQ.IACOL ) ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL SGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL SGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF ELSE IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ ALPHA = A( IIA+(JJA-1)*DESCA( LLD_ ) ) END IF * RETURN * * End of PSELGET * END scalapack-2.0.2/TOOLS/pselset.f000644 000766 000024 00000011765 10363532303 016447 0ustar00juliestaff000000 000000 SUBROUTINE PSELSET( A, IA, JA, DESCA, ALPHA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA REAL ALPHA * .. * .. Array arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSELSET sets the distributed matrix entry A( IA, JA ) to ALPHA. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (local output) REAL pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ALPHA (local input) REAL * The scalar alpha. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, JJA, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ A( IIA+(JJA-1)*DESCA( LLD_ ) ) = ALPHA * RETURN * * End of PSELSET * END scalapack-2.0.2/TOOLS/pselset2.f000644 000766 000024 00000012426 10363532303 016524 0ustar00juliestaff000000 000000 SUBROUTINE PSELSET2( ALPHA, A, IA, JA, DESCA, BETA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA REAL ALPHA, BETA * .. * .. Array arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSELSET2 sets alpha to the distributed matrix entry A(IA,JA) * and A(IA,JA) to beta. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * ALPHA (local output) REAL * The scalar alpha. * * A (local input/local ouput) REAL pointer into the * local memory to an array of dimension (LLD_A,*) containing * the local pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * BETA (local input) REAL * The scalar beta. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, IOFFA, JJA, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) ALPHA = A( IOFFA ) A( IOFFA ) = BETA ELSE ALPHA = ZERO END IF * RETURN * * End of PSELSET2 * END scalapack-2.0.2/TOOLS/psfillpad.f000644 000766 000024 00000005554 10363532303 016745 0ustar00juliestaff000000 000000 SUBROUTINE PSFILLPAD( ICTXT, M, N, A, LDA, IPRE, IPOST, CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N REAL CHKVAL * .. * .. Array Arguments .. REAL A( * ) * .. * * Purpose * ======= * * PSFILLPAD surrounds a two dimensional local array with a guard- * zone initialized to the value CHKVAL. The user may later call the * routine PSCHEKPAD to discover if the guardzone has been * violated. There are three guardzones. The first is a buffer of size * IPRE that is before the start of the array. The second is the buffer * of size IPOST which is after the end of the array to be padded. * Finally, there is a guard zone inside every column of the array to * be padded, in the elements of A(M+1:LDA, J). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (local input) INTEGER * The number of rows in the local array. * * N (local input) INTEGER * The number of columns in the local array. * * A (local input/local output) REAL, array of * dimension (LDA,N). A location IPRE elements in front of * the matrix to be padded. * * LDA (local input) INTEGER * The leading Dimension of the local array to be padded. * * IPRE (local input) INTEGER * The size of the guard zone to put before the start of * padded array. * * IPOST (local input) INTEGER * The size of the guard zone to put after padded array. * * CHKVAL (local input) REAL * The value to pad matrix with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE A( I ) = CHKVAL 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PSFILLPAD' END IF * * Put check buffer in back of A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 A( I ) = CHKVAL 20 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no post-guardzone in PSFILLPAD' END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 A( I ) = CHKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * RETURN * * End of PSFILLPAD * END scalapack-2.0.2/TOOLS/pslaprnt.f000644 000766 000024 00000027571 10363532303 016635 0ustar00juliestaff000000 000000 SUBROUTINE PSLAPRNT( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, WORK ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSLAPRNT prints to the standard output a distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and * printed by the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) REAL pointer into the local memory to a * local array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IRPRNT (global input) INTEGER * The row index of the printing process. * * ICPRNT (global input) INTEGER * The column index of the printing process. * * CMATNM (global input) CHARACTER*(*) * Identifier of the distributed matrix to be printed. * * NOUT (global input) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. * * WORK (local workspace) REAL * Working array of minimum size equal to MB_A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L, $ SGERV2D, SGESD2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ICURROW = IAROW ICURCOL = IACOL II = IIA JJ = JJA LDA = DESCA( LLD_ ) * * Handle the first block of column separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 DO 60 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, JA+H, WORK( K ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, JA+H, WORK( K ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, J+H, A( II+K+(JJ+H-1)*LDA ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, J+H, WORK( K ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, J+H, A( II+K+(JJ+H-1)*LDA ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, J+H, WORK( K ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * 9999 FORMAT(A,'(',I6,',',I6,')=',E16.8) * RETURN * * End of PSLAPRNT * END scalapack-2.0.2/TOOLS/pslaread.f000644 000766 000024 00000010645 10604624224 016561 0ustar00juliestaff000000 000000 SUBROUTINE PSLAREAD( FILNAM, A, DESCA, IRREAD, ICREAD, WORK ) * * -- ScaLAPACK tools routine (version 1.8) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * adapted by Julie Langou, April 2007 (julie@cs.utk.edu) * * .. Scalar Arguments .. INTEGER ICREAD, IRREAD * .. * .. Array Arguments .. CHARACTER*(*) FILNAM INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSLAREAD reads from a file named FILNAM a matrix and distribute * it to the process grid. * * Only the process of coordinates {IRREAD, ICREAD} read the file. * * WORK must be of size >= MB_ = DESCA( MB_ ). * * ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IB, ICTXT, ICURCOL, ICURROW, II, J, JB, $ JJ, K, LDA, M, MYCOL, MYROW, N, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SGERV2D, SGESD2D, $ IGEBS2D, IGEBR2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN OPEN( NIN, FILE=FILNAM, STATUS='OLD' ) READ( NIN, FMT = * ) ( IWORK( I ), I = 1, 2 ) CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, IWORK, 2 ) ELSE CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, IWORK, 2, IRREAD, $ ICREAD ) END IF M = IWORK( 1 ) N = IWORK( 2 ) * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * IF( M.GT.DESCA( M_ ).OR. N.GT.DESCA( N_ ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( *, FMT = * ) 'PSLAREAD: Matrix too big to fit in' WRITE( *, FMT = * ) 'Abort ...' END IF CALL BLACS_ABORT( ICTXT, 0 ) END IF * II = 1 JJ = 1 ICURROW = DESCA( RSRC_ ) ICURCOL = DESCA( CSRC_ ) LDA = DESCA( LLD_ ) * * Loop over column blocks * DO 50 J = 1, N, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), N-J+1 ) DO 40 H = 0, JB-1 * * Loop over block of rows * DO 30 I = 1, M, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), M-I+1 ) IF( ICURROW.EQ.IRREAD .AND. ICURCOL.EQ.ICREAD ) THEN IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN DO 10 K = 0, IB-1 READ( NIN, FMT = * ) A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGERV2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRREAD, ICREAD ) ELSE IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN DO 20 K = 1, IB READ( NIN, FMT = * ) WORK( K ) 20 CONTINUE CALL SGESD2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) 30 CONTINUE * II = 1 ICURROW = DESCA( RSRC_ ) 40 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) * 50 CONTINUE * IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN CLOSE( NIN ) END IF * RETURN * * End of PSLAREAD * END scalapack-2.0.2/TOOLS/pslawrite.f000644 000766 000024 00000017330 11645634736 017015 0ustar00juliestaff000000 000000 SUBROUTINE PSLAWRITE( FILNAM, M, N, A, IA, JA, DESCA, IRWRIT, $ ICWRIT, WORK ) * * -- ScaLAPACK tools routine (version 1.8) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * adapted by Julie Langou, April 2007 (julie@cs.utk.edu) * * .. Scalar Arguments .. INTEGER IA, ICWRIT, IRWRIT, JA, M, N * .. * .. Array Arguments .. CHARACTER*(*) FILNAM INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSLAWRITE writes to a file named FILNAMa distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent to and * written by the process of coordinates (IRWWRITE, ICWRIT). * * WORK must be of size >= MB_ = DESCA( MB_ ). * * ===================================================================== * * .. Parameters .. INTEGER NOUT PARAMETER ( NOUT = 13 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L, $ SGERV2D, SGESD2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN OPEN( NOUT, FILE=FILNAM, STATUS='UNKNOWN' ) WRITE( NOUT, FMT = * ) M, N END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ICURROW = IAROW ICURCOL = IACOL II = IIA JJ = JJA LDA = DESCA( LLD_ ) * * Handle the first block of column separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 DO 60 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) WORK( K ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) WORK( K ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) WORK( K ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) WORK( K ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CLOSE( NOUT ) END IF * 9999 FORMAT( E15.8 ) * RETURN * * End of PSLAWRITE * END scalapack-2.0.2/TOOLS/psmatadd.f000644 000766 000024 00000026741 10363532303 016565 0ustar00juliestaff000000 000000 SUBROUTINE PSMATADD( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, $ JC, DESCC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IC, JA, JC, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ) * .. * * Purpose * ======= * * PSMATADD performs a distributed matrix-matrix addition * * sub( C ) := alpha * sub( A ) + beta * sub( C ), * * where sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1) and sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1). No communications are performed in this * routine, the arrays are supposed to be aligned. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrices sub( A ) and sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrices sub( A ) and * sub( C ). N >= 0. * * ALPHA (global input) REAL * The scalar ALPHA. * * A (local input) REAL pointer into the local memory * to a local array of dimension (LLD_A, LOCc(JA+N-1) ). This * array contains the local pieces of the distributed matrix * sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * BETA (global input) REAL * The scalar BETA. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * This array contains the local pieces of the distributed * matrix sub( C ). On exit, this array contains the local * pieces of the resulting distributed matrix. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICCOL, ICOFF, ICROW, IIA, $ IIC, IOFFA, IOFFC, IROFF, J, JJA, JJC, LDA, $ LDC, MP, MYCOL, MYROW, NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA(CTXT_), NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible. * IF( (M.EQ.0).OR.(N.EQ.0).OR. $ ((ALPHA.EQ.ZERO).AND.(BETA.EQ.ONE)) ) $ RETURN * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) * IROFF = MOD( IA-1, DESCA(MB_) ) ICOFF = MOD( JA-1, DESCA(NB_) ) MP = NUMROC( M+IROFF, DESCA(MB_), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA(NB_), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF LDA = DESCA(LLD_) LDC = DESCC(LLD_) * IF( NQ.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN IOFFC = IIC + (JJC-1)*LDC DO 10 I = IOFFC, IOFFC+MP-1 C( I ) = ZERO 10 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 20 I = IOFFC, IOFFC+MP-1 C( I ) = ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 20 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 30 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 30 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 40 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 40 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 50 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 50 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 60 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 60 CONTINUE END IF END IF ELSE IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN IOFFC = IIC+(JJC-1)*LDC DO 80 J = 1, NQ DO 70 I = IOFFC, IOFFC+MP-1 C( I ) = ZERO 70 CONTINUE IOFFC = IOFFC + LDC 80 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 100 J = 1, NQ DO 90 I = IOFFC, IOFFC+MP-1 C( I ) = ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 90 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 100 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 120 J = 1, NQ DO 110 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 110 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 120 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 140 J = 1, NQ DO 130 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 130 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 140 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 160 J = 1, NQ DO 150 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 150 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 160 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 180 J = 1, NQ DO 170 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 170 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 180 CONTINUE END IF END IF END IF * RETURN * * End of PSMATADD * END scalapack-2.0.2/TOOLS/psrow2col.f000644 000766 000024 00000021561 11654631032 016720 0ustar00juliestaff000000 000000 SUBROUTINE PSROW2COL( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, $ RSRC, CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. REAL VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a row of processes, and distribute those rows over a column of * processes. This routine minimizes communication by sending all * information it has that a given process in the CDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process row consisting of Q processes, and I want to send all of that * vector that I own to a new vector distributed over P processes within * a process column, that after I find the process in RDEST that owns * the row of the vector I'm currently looking at, he will want every * ( (LCM(P,Q)/Q ) block of my vector (the block being of size NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a row. * * VS (local input) REAL * Array of dimension (LDVS,N), the block of vectors stored on * process row RSRC to be put into memory VD, and stored on * process column CDEST. * * LDVS (local input) INTEGER * The leading dimension of VS. * * VD (local output) REAL * Array of dimension (LDVD,N), on output, the contents of VD * stored on process column CDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD. * * RSRC (global input) INTEGER * The process row VS is distributed over. * * CSRC (global input) INTEGER * The process column the distributed block of vectors VS * begins on. * * RDEST (global input) INTEGER * The process row that VD begins on. * * CDEST (global input) INTEGER * The process column to distribute VD over. * * WORK (local workspace) REAL * Array, dimension (LDW). The required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mq = number of rows of VS on my process. * npcol = number of process columns * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mq/NB )/(LCM/npcol) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, II, ISTART, ICSRC, IRDEST, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGESD2D, SGERV2D, SLACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * * * .. Initialize Variables .. * ICPY = 0 * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there is no * copying required * IF( NPROW .NE. NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYROW.EQ.RSRC ) THEN * ISTART = 1 * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my first block * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) * * Loop over all possible destination processes * DO 20 K = 1, RBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL SLACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary, NOTE: will send extra if NB > JB * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL SGESD2D( ICTXT, JJ, 1, WORK, JJ, IRDEST, $ CDEST ) * * I am both source and destination, save where to start * copying from for later use * ELSE ICPY = ISTART END IF * ISTART = ISTART + NB IRDEST = MOD( IRDEST+NPCOL, NPROW ) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYCOL.EQ.CDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) * * Loop over all sending processes * DO 50 K = 1, CBLKSKIP * * If I don't already possess the required data * IF( (MYROW.NE.RSRC).OR.(MYCOL.NE.ICSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MP - ISTART + NB) / NB JJ = ((NBLOCKS+CBLKSKIP-1) / CBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL SGERV2D( ICTXT, JJ, N, WORK, JJ, RSRC, ICSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL SLACPY( 'G', JB, N, WORK(JJ), JB, VD(II,1), $ LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL SLACPY( 'G', JB, N, VS(JJ,1), LDVS, VD(II,1), $ LDVD ) JJ = JJ + NB*RBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB ICSRC = MOD( ICSRC+NPROW, NPCOL ) 50 CONTINUE END IF * * if NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYROW.EQ.RSRC ) THEN * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my piece of the vector * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN CALL SGESD2D( ICTXT, MQ, N, VS, LDVS, IRDEST, CDEST ) ELSE CALL SLACPY( 'G', MQ, N, VS, LDVS, VD, LDVD ) END IF END IF IF( MYCOL.EQ.CDEST ) THEN * * Figure my distance from RDEST: the process in RSRC the same * distance from CSRC will have my piece of the vector * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) IF( (MYCOL.NE.ICSRC).OR.(MYROW.NE. RSRC) ) $ CALL SGERV2D( ICTXT, MP, N, VD, LDVD, RSRC, ICSRC ) END IF END IF * RETURN * * End of PSROW2COL * END scalapack-2.0.2/TOOLS/pstreecomb.f000644 000766 000024 00000022217 11654631032 017130 0ustar00juliestaff000000 000000 SUBROUTINE PSTREECOMB( ICTXT, SCOPE, N, MINE, RDEST0, CDEST0, $ SUBPTR ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SCOPE INTEGER CDEST0, ICTXT, N, RDEST0 * .. * .. Array Arguments .. REAL MINE( * ) * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PSTREECOMB does a 1-tree parallel combine operation on scalars, * using the subroutine indicated by SUBPTR to perform the required * computation. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * SCOPE (global input) CHARACTER * The scope of the operation: 'Rowwise', 'Columnwise', or * 'All'. * * N (global input) INTEGER * The number of elements in MINE. N = 1 for the norm-2 * computation and 2 for the sum of square. * * MINE (local input/global output) REAL array of * dimension at least equal to N. The local data to use in the * combine. * * RDEST0 (global input) INTEGER * The process row to receive the answer. If RDEST0 = -1, * every process in the scope gets the answer. * * CDEST0 (global input) INTEGER * The process column to receive the answer. If CDEST0 = -1, * every process in the scope gets the answer. * * SUBPTR (local input) Pointer to the subroutine to call to perform * the required combine. * * ===================================================================== * * .. Local Scalars .. LOGICAL BCAST, RSCOPE, CSCOPE INTEGER CMSSG, DEST, DIST, HISDIST, I, IAM, MYCOL, $ MYROW, MYDIST, MYDIST2, NP, NPCOL, NPROW, $ RMSSG, TCDEST, TRDEST * .. * .. Local Arrays .. REAL HIS( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * DEST = 0 * See if everyone wants the answer (need to broadcast the answer) * BCAST = ( ( RDEST0.EQ.-1 ).OR.( CDEST0.EQ.-1 ) ) IF( BCAST ) THEN TRDEST = 0 TCDEST = 0 ELSE TRDEST = RDEST0 TCDEST = CDEST0 END IF * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure scope-dependant variables, or report illegal scope * RSCOPE = LSAME( SCOPE, 'R' ) CSCOPE = LSAME( SCOPE, 'C' ) * IF( RSCOPE ) THEN IF( BCAST ) THEN TRDEST = MYROW ELSE IF( MYROW.NE.TRDEST ) THEN RETURN END IF NP = NPCOL MYDIST = MOD( NPCOL + MYCOL - TCDEST, NPCOL ) ELSE IF( CSCOPE ) THEN IF( BCAST ) THEN TCDEST = MYCOL ELSE IF( MYCOL.NE.TCDEST ) THEN RETURN END IF NP = NPROW MYDIST = MOD( NPROW + MYROW - TRDEST, NPROW ) ELSE IF( LSAME( SCOPE, 'A' ) ) THEN NP = NPROW * NPCOL IAM = MYROW*NPCOL + MYCOL DEST = TRDEST*NPCOL + TCDEST MYDIST = MOD( NP + IAM - DEST, NP ) ELSE RETURN END IF * IF( NP.LT.2 ) $ RETURN * MYDIST2 = MYDIST RMSSG = MYROW CMSSG = MYCOL I = 1 * 10 CONTINUE * IF( MOD( MYDIST, 2 ).NE.0 ) THEN * * If I am process that sends information * DIST = I * ( MYDIST - MOD( MYDIST, 2 ) ) * * Figure coordinates of dest of message * IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) END IF * CALL SGESD2D( ICTXT, N, 1, MINE, N, RMSSG, CMSSG ) * GO TO 20 * ELSE * * If I am a process receiving information, figure coordinates * of source of message * DIST = MYDIST2 + I IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) HISDIST = MOD( NP + CMSSG - TCDEST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) HISDIST = MOD( NP + RMSSG - TRDEST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) HISDIST = MOD( NP + RMSSG*NPCOL+CMSSG - DEST, NP ) END IF * IF( MYDIST2.LT.HISDIST ) THEN * * If I have anyone sending to me * CALL SGERV2D( ICTXT, N, 1, HIS, N, RMSSG, CMSSG ) CALL SUBPTR( MINE, HIS ) * END IF MYDIST = MYDIST / 2 * END IF I = I * 2 * IF( I.LT.NP ) $ GO TO 10 * 20 CONTINUE * IF( BCAST ) THEN IF( MYDIST2.EQ.0 ) THEN CALL SGEBS2D( ICTXT, SCOPE, ' ', N, 1, MINE, N ) ELSE CALL SGEBR2D( ICTXT, SCOPE, ' ', N, 1, MINE, N, $ TRDEST, TCDEST ) END IF END IF * RETURN * * End of PSTREECOMB * END * SUBROUTINE SCOMBAMAX( V1, V2 ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. REAL V1( 2 ), V2( 2 ) * .. * * Purpose * ======= * * SCOMBAMAX finds the element having max. absolute value as well * as its corresponding globl index. * * Arguments * ========= * * V1 (local input/local output) REAL array of * dimension 2. The first maximum absolute value element and * its global index. V1(1) = AMAX, V1(2) = INDX. * * V2 (local input) REAL array of dimension 2. * The second maximum absolute value element and its global * index. V2(1) = AMAX, V2(2) = INDX. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * IF( ABS( V1( 1 ) ).LT.ABS( V2( 1 ) ) ) THEN V1( 1 ) = V2( 1 ) V1( 2 ) = V2( 2 ) END IF * RETURN * * End of SCOMBAMAX * END * SUBROUTINE SCOMBSSQ( V1, V2 ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. REAL V1( 2 ), V2( 2 ) * .. * * Purpose * ======= * * SCOMBSSQ does a scaled sum of squares on two scalars. * * Arguments * ========= * * V1 (local input/local output) REAL array of * dimension 2. The first scaled sum. V1(1) = SCALE, * V1(2) = SUMSQ. * * V2 (local input) REAL array of dimension 2. * The second scaled sum. V2(1) = SCALE, V2(2) = SUMSQ. * * ===================================================================== * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Executable Statements .. * IF( V1( 1 ).GE.V2( 1 ) ) THEN IF( V1( 1 ).NE.ZERO ) $ V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 ) ELSE V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 ) V1( 1 ) = V2( 1 ) END IF * RETURN * * End of SCOMBSSQ * END * SUBROUTINE SCOMBNRM2( X, Y ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. REAL X, Y * .. * * Purpose * ======= * * SCOMBNRM2 combines local norm 2 results, taking care not to cause * unnecessary overflow. * * Arguments * ========= * * X (local input) REAL * Y (local input) REAL * X and Y specify the values x and y. X and Y are supposed to * be >= 0. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. REAL W, Z * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT * .. * .. Executable Statements .. * W = MAX( X, Y ) Z = MIN( X, Y ) * IF( Z.EQ.ZERO ) THEN X = W ELSE X = W*SQRT( ONE+( Z / W )**2 ) END IF * RETURN * * End of SCOMBNRM2 * END scalapack-2.0.2/TOOLS/pzchekpad.f000644 000766 000024 00000012535 10363532303 016735 0ustar00juliestaff000000 000000 SUBROUTINE PZCHEKPAD( ICTXT, MESS, M, N, A, LDA, IPRE, IPOST, $ CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N COMPLEX*16 CHKVAL * .. * .. Array Arguments .. CHARACTER MESS*(*) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZCHEKPAD checks that the padding around a local array has not * been overwritten since the call to PZFILLPAD. 3 types of errors * are reported: * * 1) Overwrite in pre-guardzone. This indicates a memory overwrite has * occurred in the first IPRE elements which form a buffer before the * beginning of A. Therefore, the error message: * 'Overwrite in pre-guardzone: loc( 5) = 18.00000' * tells you that the 5th element of the IPRE long buffer has been * overwritten with the value 18, where it should still have the value * of CHKVAL. * * 2) Overwrite in post-guardzone. This indicates a memory overwrite has * occurred in the last IPOST elements which form a buffer after the end * of A. Error reports are refered from the end of A. Therefore, * 'Overwrite in post-guardzone: loc( 19) = 24.00000' * tells you that the 19th element after the end of A was overwritten * with the value 24, where it should still have the value of CHKVAL. * * 3) Overwrite in lda-m gap. Tells you elements between M and LDA were * overwritten. So, * 'Overwrite in lda-m gap: A( 12, 3) = 22.00000' * tells you that the element at the 12th row and 3rd column of A was * overwritten with the value of 22, where it should still have the * value of CHKVAL. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * MESS (local input) CHARACTER*(*) * String containing a user-defined message. * * M (local input) INTEGER * The number of rows in the local array A. * * N (input) INTEGER * The number of columns in the local array A. * * A (local input) COMPLEX*16 array of dimension (LDA,N). * A location IPRE elements in front of the array to be checked. * * LDA (local input) INTEGER * The leading Dimension of the local array to be checked. * * IPRE (local input) INTEGER * The size of the guard zone before the start of padded array. * * IPOST (local input) INTEGER * The size of guard zone after the padded array. * * CHKVAL (local input) COMPLEX*16 * The value the local array was padded with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, IAM, IDUMM, INFO, J, K, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DIMAG * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW*NPCOL + MYCOL INFO = -1 * * Check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, ' pre', I, $ DBLE( A( I ) ), DIMAG( A( I ) ) INFO = IAM END IF 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PZCHEKPAD' END IF * * Check buffer after A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, 'post', $ I-J+1, DBLE( A( I ) ), $ DIMAG( A( I ) ) INFO = IAM END IF 20 CONTINUE ELSE WRITE( *, FMT = * ) $ 'WARNING no post-guardzone buffer in PZCHEKPAD' END IF * * Check all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9997 ) MYROW, MYCOL, MESS, $ I-IPRE-LDA*(J-1), J, DBLE( A( I ) ), $ DIMAG( A( I ) ) INFO = IAM END IF 30 CONTINUE K = K + LDA 40 CONTINUE END IF * CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUMM, IDUMM, -1, $ 0, 0 ) IF( IAM.EQ.0 .AND. INFO.GE.0 ) THEN WRITE( *, FMT = 9999 ) INFO / NPCOL, MOD( INFO, NPCOL ), MESS END IF * 9999 FORMAT( '{', I5, ',', I5, '}: Memory overwrite in ', A ) 9998 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ A4, '-guardzone: loc(', I3, ') = ', G20.7, '+ i*', $ G20.7 ) 9997 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ 'lda-m gap: loc(', I3, ',', I3, ') = ', G20.7, $ '+ i*', G20.7 ) * RETURN * * End of PZCHEKPAD * END scalapack-2.0.2/TOOLS/pzcol2row.f000644 000766 000024 00000021610 11654631032 016722 0ustar00juliestaff000000 000000 SUBROUTINE PZCOL2ROW( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, RSRC, $ CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. COMPLEX*16 VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a column of processes, and distribute those rows over a row of * processes. This routine minimizes communication by sending all * information it has that a given process in the RDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process column consisting of P processes, and I want to send all of * that vector that I own to a new vector distributed over Q processes * within a process row, that after I find the process in RDEST that * owns the row of the vector I'm currently looking at, he will want * every ( (LCM(P,Q) / P ) block of my vector (the block being of size * NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block. * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a column. * * VS (local input) COMPLEX*16 * Array of dimension (LDVS,N), the block of vectors stored on * process column CSRC to be put into memory VD, and stored * on process row RDEST. * * LDVS (local input) INTEGER * The leading dimension of VS, LDVS >= MAX( 1, MP ). * * VD (local output) COMPLEX*16 * Array of dimension (LDVD,N), on output, the contents of VS * stored on process row RDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD, LDVD >= MAX( 1, MQ ). * * RSRC (global input) INTEGER * The process row the distributed block of vectors VS begins * on. * * CSRC (global input) INTEGER * The process column VS is distributed over. * * RDEST (global input) INTEGER * The process row to distribute VD over. * * CDEST (global input) INTEGER * The process column that VD begins on. * * WORK (local workspace) COMPLEX*16 * Array of dimension (LDW), the required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mp = number of rows of VS on my process. * nprow = number of process rows * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mp/NB )/(LCM/nprow) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, ICDEST, II, IRSRC, ISTART, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, ZGESD2D, ZGERV2D, ZLACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * ICPY = 0 * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there * is no copying required * IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYCOL.EQ.CSRC ) THEN * ISTART = 1 * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my first block * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * * Loop over all possible destination processes * DO 20 K = 1, CBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MP, NB*CBLKSKIP JB = MIN(NB, MP-II+1) CALL ZLACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary (NOTE: will send extra if NB > JB) * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL ZGESD2D( ICTXT, JJ, 1, WORK, JJ, RDEST, $ ICDEST ) * ELSE * * I am both source and destination, save where to start * copying from for later use. * ICPY = ISTART END IF * ISTART = ISTART + NB ICDEST = MOD(ICDEST+NPROW, NPCOL) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYROW.EQ.RDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) DO 50 K = 1, RBLKSKIP * * If I don't already possess the required data * IF( (MYCOL.NE.CSRC).OR.(MYROW.NE.IRSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MQ - ISTART + NB) / NB JJ = ((NBLOCKS+RBLKSKIP-1) / RBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL ZGERV2D( ICTXT, JJ, N, WORK, JJ, IRSRC, CSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL ZLACPY( 'G', JB, N, WORK(JJ), JB, $ VD(II,1), LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL ZLACPY( 'G', JB, N, VS(JJ,1), LDVS, $ VD(II,1), LDVD ) JJ = JJ + NB*CBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB IRSRC = MOD( IRSRC+NPCOL, NPROW ) 50 CONTINUE END IF * * If NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYCOL.EQ.CSRC ) THEN * * Figure my distance from RSRC: the process in RDEST the same * distance from CDEST will want my piece of the vector. * MYDIST = MOD( NPROW+MYROW-RSRC, NPROW ) MP = NUMROC( M, NB, MYROW, RSRC, NPROW ) ICDEST = MOD( CDEST+MYDIST, NPCOL ) * IF( (MYCOL.NE.ICDEST).OR.(MYROW.NE.RDEST) ) THEN CALL ZGESD2D( ICTXT, MP, N, VS, LDVS, RDEST, ICDEST ) ELSE CALL ZLACPY( 'G', MP, N, VS, LDVS, VD, LDVD ) END IF END IF * IF( MYROW.EQ.RDEST ) THEN * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my piece of the vector. * MYDIST = MOD( NPCOL+MYCOL-CDEST, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CDEST, NPCOL ) IRSRC = MOD( RSRC+MYDIST, NPROW ) * IF( (MYROW.NE.IRSRC).OR.(MYCOL.NE.CSRC) ) $ CALL ZGERV2D( ICTXT, MQ, N, VD, LDVD, IRSRC, CSRC ) * END IF * END IF * RETURN * * End of PZCOL2ROW * END scalapack-2.0.2/TOOLS/pzelget.f000644 000766 000024 00000016307 10363532303 016437 0ustar00juliestaff000000 000000 SUBROUTINE PZELGET( SCOPE, TOP, ALPHA, A, IA, JA, DESCA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 SCOPE, TOP INTEGER IA, JA COMPLEX*16 ALPHA * .. * .. Array arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZELGET sets alpha to the distributed matrix entry A( IA, JA ). * The value of alpha is set according to the scope. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SCOPE (global input) CHARACTER*1 * The BLACS scope in which alpha is updated. * If SCOPE = 'R', alpha is updated only in the process row * containing A( IA, JA ), * If SCOPE = 'C', alpha is updated only in the process column * containing A( IA, JA ), * If SCOPE = 'A', alpha is updated in all the processes of the * grid, * otherwise alpha is updated only in the process containing * A( IA, JA ). * * TOP (global input) CHARACTER*1 * The topology to be used if broadcast is needed. * * ALPHA (global output) COMPLEX*16, the scalar alpha. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, IIA, IOFFA, JJA, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, ZGEBR2D, ZGEBS2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * ALPHA = ZERO * IF( LSAME( SCOPE, 'R' ) ) THEN IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL ZGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL ZGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'C' ) ) THEN IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL ZGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL ZGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF END IF ELSE IF( LSAME( SCOPE, 'A' ) ) THEN IF( ( MYROW.EQ.IAROW ).AND.( MYCOL.EQ.IACOL ) ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) CALL ZGEBS2D( ICTXT, SCOPE, TOP, 1, 1, A( IOFFA ), 1 ) ALPHA = A( IOFFA ) ELSE CALL ZGEBR2D( ICTXT, SCOPE, TOP, 1, 1, ALPHA, 1, $ IAROW, IACOL ) END IF ELSE IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ ALPHA = A( IIA+(JJA-1)*DESCA( LLD_ ) ) END IF * RETURN * * End of PZELGET * END scalapack-2.0.2/TOOLS/pzelset.f000644 000766 000024 00000012001 10363532303 016436 0ustar00juliestaff000000 000000 SUBROUTINE PZELSET( A, IA, JA, DESCA, ALPHA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA COMPLEX*16 ALPHA * .. * .. Array arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZELSET sets the distributed matrix entry A( IA, JA ) to ALPHA. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,*) containing the local * pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ALPHA (local input) COMPLEX*16 * The scalar alpha. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, JJA, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) $ A( IIA+(JJA-1)*DESCA( LLD_ ) ) = ALPHA * RETURN * * End of PZELSET * END scalapack-2.0.2/TOOLS/pzelset2.f000644 000766 000024 00000012464 10363532303 016535 0ustar00juliestaff000000 000000 SUBROUTINE PZELSET2( ALPHA, A, IA, JA, DESCA, BETA ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA COMPLEX*16 ALPHA, BETA * .. * .. Array arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZELSET2 sets alpha to the distributed matrix entry A(IA,JA) * and A(IA,JA) to beta. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * ALPHA (local output) COMPLEX*16 * The scalar alpha. * * A (local input/local ouput) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,*) containing * the local pieces of the distributed matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * BETA (local input) COMPLEX*16 * The scalar beta. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IIA, IOFFA, JJA, MYCOL, MYROW, $ NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN IOFFA = IIA+(JJA-1)*DESCA( LLD_ ) ALPHA = A( IOFFA ) A( IOFFA ) = BETA ELSE ALPHA = ZERO END IF * RETURN * * End of PZELSET2 * END scalapack-2.0.2/TOOLS/pzfillpad.f000644 000766 000024 00000005570 10363532303 016752 0ustar00juliestaff000000 000000 SUBROUTINE PZFILLPAD( ICTXT, M, N, A, LDA, IPRE, IPOST, CHKVAL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N COMPLEX*16 CHKVAL * .. * .. Array Arguments .. COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZFILLPAD surrounds a two dimensional local array with a guard- * zone initialized to the value CHKVAL. The user may later call the * routine PZCHEKPAD to discover if the guardzone has been * violated. There are three guardzones. The first is a buffer of size * IPRE that is before the start of the array. The second is the buffer * of size IPOST which is after the end of the array to be padded. * Finally, there is a guard zone inside every column of the array to * be padded, in the elements of A(M+1:LDA, J). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (local input) INTEGER * The number of rows in the local array. * * N (local input) INTEGER * The number of columns in the local array. * * A (local input/local output) COMPLEX*16, array of * dimension (LDA,N). A location IPRE elements in front of * the matrix to be padded. * * LDA (local input) INTEGER * The leading Dimension of the local array to be padded. * * IPRE (local input) INTEGER * The size of the guard zone to put before the start of * padded array. * * IPOST (local input) INTEGER * The size of the guard zone to put after padded array. * * CHKVAL (local input) COMPLEX*16 * The value to pad matrix with. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE A( I ) = CHKVAL 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PZFILLPAD' END IF * * Put check buffer in back of A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 A( I ) = CHKVAL 20 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no post-guardzone in PZFILLPAD' END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 A( I ) = CHKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * RETURN * * End of PZFILLPAD * END scalapack-2.0.2/TOOLS/pzlaprnt.f000644 000766 000024 00000031013 10363532303 016626 0ustar00juliestaff000000 000000 SUBROUTINE PZLAPRNT( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, WORK ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZLAPRNT prints to the standard output a distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and * printed by the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory to a * local array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IRPRNT (global input) INTEGER * The row index of the printing process. * * ICPRNT (global input) INTEGER * The column index of the printing process. * * CMATNM (global input) CHARACTER*(*) * Identifier of the distributed matrix to be printed. * * NOUT (global input) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. * * WORK (local workspace) COMPLEX*16 * Working array of minimum size equal to MB_A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L, $ ZGERV2D, ZGESD2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DIMAG, MIN * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ICURROW = IAROW ICURCOL = IACOL II = IIA JJ = JJA LDA = DESCA( LLD_ ) * * Handle the first block of column separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 DO 60 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, JA+H, $ DBLE( A( II+K+(JJ+H-1)*LDA ) ), $ DIMAG( A( II+K+(JJ+H-1)*LDA ) ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, JA+H, DBLE( WORK( K ) ), $ DIMAG( WORK( K ) ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, JA+H, $ DBLE( A( II+K+(JJ+H-1)*LDA ) ), $ DIMAG( A( II+K+(JJ+H-1)*LDA ) ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, JA+H, DBLE( WORK( K ) ), $ DIMAG( WORK( K ) ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, J+H, $ DBLE( A( II+K+(JJ+H-1)*LDA ) ), $ DIMAG( A( II+K+(JJ+H-1)*LDA ) ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, J+H, DBLE( WORK( K ) ), $ DIMAG( WORK( K ) ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, J+H, $ DBLE( A( II+K+(JJ+H-1)*LDA ) ), $ DIMAG( A( II+K+(JJ+H-1)*LDA ) ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, J+H, DBLE( WORK( K ) ), $ DIMAG( WORK( K ) ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * 9999 FORMAT(A,'(',I6,',',I6,')=',D30.18, '+i*(',D30.18, ')') * RETURN * * End of PZLAPRNT * END scalapack-2.0.2/TOOLS/pzlaread.f000644 000766 000024 00000011144 10604624224 016563 0ustar00juliestaff000000 000000 SUBROUTINE PZLAREAD( FILNAM, A, DESCA, IRREAD, ICREAD, WORK ) * * -- ScaLAPACK tools routine (version 1.8) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * adapted by Julie Langou, April 2007 (julie@cs.utk.edu) * * .. Scalar Arguments .. INTEGER ICREAD, IRREAD * .. * .. Array Arguments .. CHARACTER*(*) FILNAM INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZLAREAD reads from a file named FILNAM a matrix and distribute * it to the process grid. * * Only the process of coordinates {IRREAD, ICREAD} read the file. * * WORK must be of size >= MB_ = DESCA( MB_ ). * * ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IB, ICTXT, ICURCOL, ICURROW, II, J, JB, $ JJ, K, LDA, M, MYCOL, MYROW, N, NPCOL, NPROW DOUBLE PRECISION REAL_PART, IMAG_PART * .. * .. Local Arrays .. INTEGER IWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, ZGERV2D, ZGESD2D, $ IGEBS2D, IGEBR2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN OPEN( NIN, FILE=FILNAM, STATUS='OLD' ) READ( NIN, FMT = * ) ( IWORK( I ), I = 1, 2 ) CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, IWORK, 2 ) ELSE CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, IWORK, 2, IRREAD, $ ICREAD ) END IF M = IWORK( 1 ) N = IWORK( 2 ) * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * IF( M.GT.DESCA( M_ ).OR. N.GT.DESCA( N_ ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( *, FMT = * ) 'PZLAREAD: Matrix too big to fit in' WRITE( *, FMT = * ) 'Abort ...' END IF CALL BLACS_ABORT( ICTXT, 0 ) END IF * II = 1 JJ = 1 ICURROW = DESCA( RSRC_ ) ICURCOL = DESCA( CSRC_ ) LDA = DESCA( LLD_ ) * * Loop over column blocks * DO 50 J = 1, N, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), N-J+1 ) DO 40 H = 0, JB-1 * * Loop over block of rows * DO 30 I = 1, M, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), M-I+1 ) IF( ICURROW.EQ.IRREAD .AND. ICURCOL.EQ.ICREAD ) THEN IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN DO 10 K = 0, IB-1 READ( NIN , FMT = *) REAL_PART, IMAG_PART A( II+K+(JJ+H-1)*LDA ) = DCMPLX(REAL_PART, IMAG_PART) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGERV2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRREAD, ICREAD ) ELSE IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN DO 20 K = 1, IB READ( NIN, FMT = * ) REAL_PART, IMAG_PART WORK(K)=DCMPLX(REAL_PART,IMAG_PART) 20 CONTINUE CALL ZGESD2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) 30 CONTINUE * II = 1 ICURROW = DESCA( RSRC_ ) 40 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) * 50 CONTINUE * IF( MYROW.EQ.IRREAD .AND. MYCOL.EQ.ICREAD ) THEN CLOSE( NIN ) END IF * RETURN * * End of PZLAREAD * END scalapack-2.0.2/TOOLS/pzlawrite.f000644 000766 000024 00000020376 11645634736 017030 0ustar00juliestaff000000 000000 SUBROUTINE PZLAWRITE( FILNAM, M, N, A, IA, JA, DESCA, IRWRIT, $ ICWRIT, WORK ) * * -- ScaLAPACK tools routine (version 1.8) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * adapted by Julie Langou, April 2007 (julie@cs.utk.edu) * * .. Scalar Arguments .. INTEGER IA, ICWRIT, IRWRIT, JA, M, N * .. * .. Array Arguments .. CHARACTER*(*) FILNAM INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZLAWRITE writes to a file named FILNAMa distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent to and * written by the process of coordinates (IRWWRITE, ICWRIT). * * WORK must be of size >= MB_ = DESCA( MB_ ). * * ===================================================================== * * .. Parameters .. INTEGER NOUT PARAMETER ( NOUT = 13 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L, $ ZGERV2D, ZGESD2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DIMAG, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN OPEN( NOUT, FILE=FILNAM, STATUS='UNKNOWN' ) WRITE( NOUT, FMT = * ) M, N END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ICURROW = IAROW ICURCOL = IACOL II = IIA JJ = JJA LDA = DESCA( LLD_ ) * * Handle the first block of column separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 DO 60 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) DBLE(WORK( K )), $ DIMAG(WORK( K )) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ DBLE (A( II+K+(JJ+H-1)*LDA )), $ DIMAG (A( II+K+(JJ+H-1)*LDA )) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) DBLE (WORK( K )), $ DIMAG (WORK( K )) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IB = IN-IA+1 IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ DBLE (A( II+K+(JJ+H-1)*LDA )), $ DIMAG (A( II+K+(JJ+H-1)*LDA )) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) DBLE (WORK( K )), $ DIMAG (WORK( K)) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRWRIT .AND. ICURCOL.EQ.ICWRIT ) THEN IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ DBLE (A( II+K+(JJ+H-1)*LDA )), $ DIMAG (A( II+K+(JJ+H-1)*LDA )) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRWRIT, ICWRIT ) ELSE IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ), $ ICURROW, ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) DBLE (WORK( K )), $ DIMAG (WORK( K )) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * IF( MYROW.EQ.IRWRIT .AND. MYCOL.EQ.ICWRIT ) THEN CLOSE( NOUT ) END IF * 9999 FORMAT( E15.8,E15.8 ) * RETURN * * End of PZLAWRITE * END scalapack-2.0.2/TOOLS/pzmatadd.f000644 000766 000024 00000027054 10363532303 016572 0ustar00juliestaff000000 000000 SUBROUTINE PZMATADD( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, $ JC, DESCC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IC, JA, JC, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ) * .. * * Purpose * ======= * * PZMATADD performs a distributed matrix-matrix addition * * sub( C ) := alpha * sub( A ) + beta * sub( C ), * * where sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1) and sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1). No communications are performed in this * routine, the arrays are supposed to be aligned. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrices sub( A ) and sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrices sub( A ) and * sub( C ). N >= 0. * * ALPHA (global input) COMPLEX*16 * The scalar ALPHA. * * A (local input) COMPLEX*16 pointer into the local memory * to a local array of dimension (LLD_A, LOCc(JA+N-1) ). This * array contains the local pieces of the distributed matrix * sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * BETA (global input) COMPLEX*16 * The scalar BETA. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * This array contains the local pieces of the distributed * matrix sub( C ). On exit, this array contains the local * pieces of the resulting distributed matrix. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICCOL, ICOFF, ICROW, IIA, $ IIC, IOFFA, IOFFC, IROFF, J, JJA, JJC, LDA, $ LDC, MP, MYCOL, MYROW, NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * * Get grid parameters. * CALL BLACS_GRIDINFO( DESCA(CTXT_), NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible. * IF( (M.EQ.0).OR.(N.EQ.0).OR. $ ((ALPHA.EQ.ZERO).AND.(BETA.EQ.ONE)) ) $ RETURN * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) * IROFF = MOD( IA-1, DESCA(MB_) ) ICOFF = MOD( JA-1, DESCA(NB_) ) MP = NUMROC( M+IROFF, DESCA(MB_), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA(NB_), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF LDA = DESCA(LLD_) LDC = DESCC(LLD_) * IF( NQ.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN IOFFC = IIC + (JJC-1)*LDC DO 10 I = IOFFC, IOFFC+MP-1 C( I ) = ZERO 10 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 20 I = IOFFC, IOFFC+MP-1 C( I ) = ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 20 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 30 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 30 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 40 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 40 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 50 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 50 CONTINUE ELSE IOFFA = IIA + (JJA-1)*LDA IOFFC = IIC + (JJC-1)*LDC DO 60 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 60 CONTINUE END IF END IF ELSE IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN IOFFC = IIC+(JJC-1)*LDC DO 80 J = 1, NQ DO 70 I = IOFFC, IOFFC+MP-1 C( I ) = ZERO 70 CONTINUE IOFFC = IOFFC + LDC 80 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 100 J = 1, NQ DO 90 I = IOFFC, IOFFC+MP-1 C( I ) = ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 90 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 100 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 120 J = 1, NQ DO 110 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 110 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 120 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 140 J = 1, NQ DO 130 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + A( IOFFA ) IOFFA = IOFFA + 1 130 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 140 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 160 J = 1, NQ DO 150 I = IOFFC, IOFFC+MP-1 C( I ) = C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 150 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 160 CONTINUE ELSE IOFFA = IIA+(JJA-1)*LDA IOFFC = IIC+(JJC-1)*LDC DO 180 J = 1, NQ DO 170 I = IOFFC, IOFFC+MP-1 C( I ) = BETA * C( I ) + ALPHA * A( IOFFA ) IOFFA = IOFFA + 1 170 CONTINUE IOFFA = IOFFA + LDA - MP IOFFC = IOFFC + LDC 180 CONTINUE END IF END IF END IF * RETURN * * End of PZMATADD * END scalapack-2.0.2/TOOLS/pzrow2col.f000644 000766 000024 00000021524 11654631032 016726 0ustar00juliestaff000000 000000 SUBROUTINE PZROW2COL( ICTXT, M, N, NB, VS, LDVS, VD, LDVD, $ RSRC, CSRC, RDEST, CDEST, WORK) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB, $ RDEST, RSRC * .. * .. Array Arguments .. COMPLEX*16 VD( LDVD, * ), VS( LDVS, * ), WORK( * ) * .. * * Purpose * ======= * * Take a block of vectors with M total rows which are distributed over * a row of processes, and distribute those rows over a column of * processes. This routine minimizes communication by sending all * information it has that a given process in the CDEST needs at once. * To do this it uses the least common multiple (LCM) concept. This is * simply the realization that if I have part of a vector split over a * process row consisting of Q processes, and I want to send all of that * vector that I own to a new vector distributed over P processes within * a process column, that after I find the process in RDEST that owns * the row of the vector I'm currently looking at, he will want every * ( (LCM(P,Q)/Q ) block of my vector (the block being of size NB x N). * * Arguments * ========= * * Rem: MP, resp. NQ, denotes the number of local rows, resp. local * ==== columns, necessary to store a global vector of dimension M * across P processes, resp. N over Q processes. * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * M (global input) INTEGER * The number of global rows each vector has. * * N (global input) INTEGER * The number of vectors in the vector block * * NB (global input) INTEGER * The blocking factor used to divide the rows of the vector * amongst the processes of a row. * * VS (local input) COMPLEX*16 * Array of dimension (LDVS,N), the block of vectors stored on * process row RSRC to be put into memory VD, and stored on * process column CDEST. * * LDVS (local input) INTEGER * The leading dimension of VS. * * VD (local output) COMPLEX*16 * Array of dimension (LDVD,N), on output, the contents of VD * stored on process column CDEST will be here. * * LDVD (local input) INTEGER * The leading dimension of VD. * * RSRC (global input) INTEGER * The process row VS is distributed over. * * CSRC (global input) INTEGER * The process column the distributed block of vectors VS * begins on. * * RDEST (global input) INTEGER * The process row that VD begins on. * * CDEST (global input) INTEGER * The process column to distribute VD over. * * WORK (local workspace) COMPLEX*16 * Array, dimension (LDW). The required size of work varies: * if( nprow.eq.npcol ) then * LDW = 0; WORK not accessed. * else * lcm = least common multiple of process rows and columns. * Mq = number of rows of VS on my process. * npcol = number of process columns * CEIL = the ceiling of given operation * LDW = NB*N*CEIL( CEIL( Mq/NB )/(LCM/npcol) ) * end if * * ===================================================================== * * .. Local Scalars .. INTEGER CBLKSKIP, ICPY, II, ISTART, ICSRC, IRDEST, JB, $ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW, $ NBLOCKS, NPCOL, NPROW, RBLKSKIP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, ZGESD2D, ZGERV2D, ZLACPY * .. * .. External Functions .. INTEGER ILCM, NUMROC EXTERNAL ILCM, NUMROC * .. * .. Executable Statements .. * ICPY = 0 * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If we are not in special case for NPROW = NPCOL where there is no * copying required * IF( NPROW .NE. NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) RBLKSKIP = LCM / NPCOL CBLKSKIP = LCM / NPROW * * If I have part of VS, the source vector(s) * IF( MYROW.EQ.RSRC ) THEN * ISTART = 1 * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my first block * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) * * Loop over all possible destination processes * DO 20 K = 1, RBLKSKIP JJ = 1 * * If I am not destination process * IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN * * Pack all data I own that destination needs * DO 10 II = ISTART, MQ, NB*RBLKSKIP JB = MIN( NB, MQ-II+1 ) CALL ZLACPY( 'G', JB, N, VS(II,1), LDVS, $ WORK(JJ), JB ) JJ = JJ + NB*N 10 CONTINUE * * Figure how many rows are to be sent and send them if * necessary, NOTE: will send extra if NB > JB * JJ = JJ - 1 IF( JJ.GT.0 ) $ CALL ZGESD2D( ICTXT, JJ, 1, WORK, JJ, IRDEST, $ CDEST ) * * I am both source and destination, save where to start * copying from for later use * ELSE ICPY = ISTART END IF * ISTART = ISTART + NB IRDEST = MOD( IRDEST+NPCOL, NPROW ) 20 CONTINUE END IF * * If I should receive info into VD * IF( MYCOL.EQ.CDEST ) THEN * ISTART = 1 * * Figure my distance from CDEST: the process in CSRC the same * distance from RSRC will have my first block * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) * * Loop over all sending processes * DO 50 K = 1, CBLKSKIP * * If I don't already possess the required data * IF( (MYROW.NE.RSRC).OR.(MYCOL.NE.ICSRC) ) THEN * * Figure how many rows to receive, and receive them * NOTE: may receive to much -- NB instead of JB * NBLOCKS = (MP - ISTART + NB) / NB JJ = ((NBLOCKS+CBLKSKIP-1) / CBLKSKIP)*NB IF( JJ.GT.0 ) $ CALL ZGERV2D( ICTXT, JJ, N, WORK, JJ, RSRC, ICSRC ) * * Copy data to destination vector * JJ = 1 DO 30 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL ZLACPY( 'G', JB, N, WORK(JJ), JB, VD(II,1), $ LDVD ) JJ = JJ + NB*N 30 CONTINUE * * If I am both source and destination * ELSE JJ = ICPY DO 40 II = ISTART, MP, NB*CBLKSKIP JB = MIN( NB, MP-II+1 ) CALL ZLACPY( 'G', JB, N, VS(JJ,1), LDVS, VD(II,1), $ LDVD ) JJ = JJ + NB*RBLKSKIP 40 CONTINUE END IF ISTART = ISTART + NB ICSRC = MOD( ICSRC+NPROW, NPCOL ) 50 CONTINUE END IF * * if NPROW = NPCOL, there is a one-to-one correspondance between * process rows and columns, so no work space or copying required * ELSE * IF( MYROW.EQ.RSRC ) THEN * * Figure my distance from CSRC: the process in CDEST the same * distance from RDEST will want my piece of the vector * MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL ) MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL ) IRDEST = MOD( RDEST+MYDIST, NPROW ) IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN CALL ZGESD2D( ICTXT, MQ, N, VS, LDVS, IRDEST, CDEST ) ELSE CALL ZLACPY( 'G', MQ, N, VS, LDVS, VD, LDVD ) END IF END IF IF( MYCOL.EQ.CDEST ) THEN * * Figure my distance from RDEST: the process in RSRC the same * distance from CSRC will have my piece of the vector * MYDIST = MOD( NPROW+MYROW-RDEST, NPROW ) MP = NUMROC( M, NB, MYROW, RDEST, NPROW ) ICSRC = MOD( CSRC+MYDIST, NPCOL ) IF( (MYCOL.NE.ICSRC).OR.(MYROW.NE. RSRC) ) $ CALL ZGERV2D( ICTXT, MP, N, VD, LDVD, RSRC, ICSRC ) END IF END IF * RETURN * * End of PZROW2COL * END scalapack-2.0.2/TOOLS/pztreecomb.f000644 000766 000024 00000016013 11654631032 017134 0ustar00juliestaff000000 000000 SUBROUTINE PZTREECOMB( ICTXT, SCOPE, N, MINE, RDEST0, CDEST0, $ SUBPTR ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SCOPE INTEGER CDEST0, ICTXT, N, RDEST0 * .. * .. Array Arguments .. COMPLEX*16 MINE( * ) * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PZTREECOMB does a 1-tree parallel combine operation on scalars, * using the subroutine indicated by SUBPTR to perform the required * computation. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * SCOPE (global input) CHARACTER * The scope of the operation: 'Rowwise', 'Columnwise', or * 'All'. * * N (global input) INTEGER * The number of elements in MINE. N = 1 for the norm-2 * computation and 2 for the sum of square. * * MINE (local input/global output) COMPLEX*16 array of * dimension at least equal to N. The local data to use in the * combine. * * RDEST0 (global input) INTEGER * The process row to receive the answer. If RDEST0 = -1, * every process in the scope gets the answer. * * CDEST0 (global input) INTEGER * The process column to receive the answer. If CDEST0 = -1, * every process in the scope gets the answer. * * SUBPTR (local input) Pointer to the subroutine to call to perform * the required combine. * * ===================================================================== * * .. Local Scalars .. LOGICAL BCAST, RSCOPE, CSCOPE INTEGER CMSSG, DEST, DIST, HISDIST, I, IAM, MYCOL, $ MYROW, MYDIST, MYDIST2, NP, NPCOL, NPROW, $ RMSSG, TCDEST, TRDEST * .. * .. Local Arrays .. COMPLEX*16 HIS( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, ZGEBR2D, ZGEBS2D, $ ZGERV2D, ZGESD2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * DEST = 0 * * See if everyone wants the answer (need to broadcast the answer) * BCAST = ( ( RDEST0.EQ.-1 ).OR.( CDEST0.EQ.-1 ) ) IF( BCAST ) THEN TRDEST = 0 TCDEST = 0 ELSE TRDEST = RDEST0 TCDEST = CDEST0 END IF * * Get grid parameters. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure scope-dependant variables, or report illegal scope * RSCOPE = LSAME( SCOPE, 'R' ) CSCOPE = LSAME( SCOPE, 'C' ) * IF( RSCOPE ) THEN IF( BCAST ) THEN TRDEST = MYROW ELSE IF( MYROW.NE.TRDEST ) THEN RETURN END IF NP = NPCOL MYDIST = MOD( NPCOL + MYCOL - TCDEST, NPCOL ) ELSE IF( CSCOPE ) THEN IF( BCAST ) THEN TCDEST = MYCOL ELSE IF( MYCOL.NE.TCDEST ) THEN RETURN END IF NP = NPROW MYDIST = MOD( NPROW + MYROW - TRDEST, NPROW ) ELSE IF( LSAME( SCOPE, 'A' ) ) THEN NP = NPROW * NPCOL IAM = MYROW*NPCOL + MYCOL DEST = TRDEST*NPCOL + TCDEST MYDIST = MOD( NP + IAM - DEST, NP ) ELSE RETURN END IF * IF( NP.LT.2 ) $ RETURN * MYDIST2 = MYDIST RMSSG = MYROW CMSSG = MYCOL I = 1 * 10 CONTINUE * IF( MOD( MYDIST, 2 ).NE.0 ) THEN * * If I am process that sends information * DIST = I * ( MYDIST - MOD( MYDIST, 2 ) ) * * Figure coordinates of dest of message * IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) END IF * CALL ZGESD2D( ICTXT, N, 1, MINE, N, RMSSG, CMSSG ) * GO TO 20 * ELSE * * If I am a process receiving information, figure coordinates * of source of message * DIST = MYDIST2 + I IF( RSCOPE ) THEN CMSSG = MOD( TCDEST + DIST, NP ) HISDIST = MOD( NP + CMSSG - TCDEST, NP ) ELSE IF( CSCOPE ) THEN RMSSG = MOD( TRDEST + DIST, NP ) HISDIST = MOD( NP + RMSSG - TRDEST, NP ) ELSE CMSSG = MOD( DEST + DIST, NP ) RMSSG = CMSSG / NPCOL CMSSG = MOD( CMSSG, NPCOL ) HISDIST = MOD( NP + RMSSG*NPCOL+CMSSG - DEST, NP ) END IF * IF( MYDIST2.LT.HISDIST ) THEN * * If I have anyone sending to me * CALL ZGERV2D( ICTXT, N, 1, HIS, N, RMSSG, CMSSG ) CALL SUBPTR( MINE, HIS ) * END IF MYDIST = MYDIST / 2 * END IF I = I * 2 * IF( I.LT.NP ) $ GO TO 10 * 20 CONTINUE * IF( BCAST ) THEN IF( MYDIST2.EQ.0 ) THEN CALL ZGEBS2D( ICTXT, SCOPE, ' ', N, 1, MINE, N ) ELSE CALL ZGEBR2D( ICTXT, SCOPE, ' ', N, 1, MINE, N, $ TRDEST, TCDEST ) END IF END IF * RETURN * * End of PZTREECOMB * END * SUBROUTINE ZCOMBAMAX( V1, V2 ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. COMPLEX*16 V1( 2 ), V2( 2 ) * .. * * Purpose * ======= * * ZCOMBAMAX finds the element having max. absolute value as well * as its corresponding globl index. * * Arguments * ========= * * V1 (local input/local output) COMPLEX*16 array of * dimension 2. The first maximum absolute value element and * its global index. V1(1) = AMAX, V1(2) = INDX. * * V2 (local input) COMPLEX*16 array of dimension 2. * The second maximum absolute value element and its global * index. V2(1) = AMAX, V2(2) = INDX. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG * .. * .. Statement Functions .. COMPLEX*16 ZDUM DOUBLE PRECISION CABS1 CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * IF( CABS1( V1( 1 ) ).LT.CABS1( V2( 1 ) ) ) THEN V1( 1 ) = V2( 1 ) V1( 2 ) = V2( 2 ) END IF * RETURN * * End of ZCOMBAMAX * END scalapack-2.0.2/TOOLS/reshape.c000644 000766 000024 00000010145 10363532303 016403 0ustar00juliestaff000000 000000 #include void Creshape( context_in, major_in, context_out, major_out, first_proc, nprow_new, npcol_new ) int context_in, *context_out, first_proc, major_in, major_out, nprow_new, npcol_new; /* major in, major out represent whether processors go row major (1) or column major (2) in the input and output grids */ { /** called subprograms **/ void proc_inc(); void Cblacs_gridinfo(); int Cblacs_pnum(); void Cblacs_get(); void Cblacs_gridmap(); /** variables **/ int i, j; int nprow_in, npcol_in, myrow_in, mycol_in; int nprocs_new; int myrow_old, mycol_old, myrow_new, mycol_new; int pnum; int *grid_new; /********** executable statements ************/ nprocs_new = nprow_new * npcol_new; Cblacs_gridinfo( context_in, &nprow_in, &npcol_in, &myrow_in, &mycol_in ); /* Quick return if possible */ if( ( nprow_in == nprow_new ) && ( npcol_in == npcol_new ) && ( first_proc == 0 ) && ( major_in == major_out ) ) { *context_out = context_in; return; } /* allocate space for new process mapping */ grid_new = (int *) malloc( nprocs_new * sizeof( int ) ); /* set place in old grid to start grabbing processors for new grid */ myrow_old = 0; mycol_old = 0; if ( major_in == 1 ) /* row major */ { myrow_old = first_proc / nprow_in; mycol_old = first_proc % nprow_in; } else /* col major */ { myrow_old = first_proc % nprow_in; mycol_old = first_proc / nprow_in; } myrow_new = 0; mycol_new = 0; /* Set up array of process numbers for new grid */ for (i=0; i< nprocs_new; i++ ) { pnum = Cblacs_pnum( context_in, myrow_old, mycol_old ); grid_new[ (mycol_new * nprow_new) + myrow_new ] = pnum; proc_inc( &myrow_old, &mycol_old, nprow_in, npcol_in, major_in ); proc_inc( &myrow_new, &mycol_new, nprow_new, npcol_new, major_out ); } /* get context */ Cblacs_get( context_in, 10, context_out ); /* allocate grid */ Cblacs_gridmap( context_out, grid_new, nprow_new, nprow_new, npcol_new ); /* free malloced space */ free( grid_new ); } /*************************************************************************/ void reshape( context_in, major_in, context_out, major_out, first_proc, nprow_new, npcol_new ) int *context_in, *context_out, *first_proc, *major_in, *major_out, *nprow_new, *npcol_new; { Creshape( *context_in, *major_in, context_out, *major_out, *first_proc, *nprow_new, *npcol_new ); } /*************************************************************************/ void RESHAPE( context_in, major_in, context_out, major_out, first_proc, nprow_new, npcol_new ) int *context_in, *context_out, *first_proc, *major_in, *major_out, *nprow_new, *npcol_new; { Creshape( *context_in, *major_in, context_out, *major_out, *first_proc, *nprow_new, *npcol_new ); } /*************************************************************************/ void reshape_( context_in, major_in, context_out, major_out, first_proc, nprow_new, npcol_new ) int *context_in, *context_out, *first_proc, *major_in, *major_out, *nprow_new, *npcol_new; { Creshape( *context_in, *major_in, context_out, *major_out, *first_proc, *nprow_new, *npcol_new ); } /*************************************************************************/ void proc_inc( myrow, mycol, nprow, npcol, major ) int *myrow, *mycol, nprow, npcol, major; { if( major == 1) /* row major */ { if( *mycol == npcol-1 ) { *mycol = 0; if( *myrow == nprow-1 ) { *myrow = 0; } else { *myrow = *myrow + 1; } } else { *mycol = *mycol + 1; } } else /* col major */ { if( *myrow == nprow-1 ) { *myrow = 0; if( *mycol == npcol-1 ) { *mycol = 0; } else { *mycol = *mycol + 1; } } else { *myrow = *myrow + 1; } } } scalapack-2.0.2/TOOLS/SL_gridreshape.c000644 000766 000024 00000004304 10363532303 017647 0ustar00juliestaff000000 000000 #include #include int SL_Cgridreshape(ctxt, pstart, row_major_in, row_major_out, P, Q) int ctxt, pstart, row_major_in, row_major_out, P, Q; { int Cblacs_pnum(); int nctxt, P0, Q0, Np, i, *g; Cblacs_gridinfo(ctxt, &P0, &Q0, &i, &Np); Np = P * Q; if (Np+pstart > P0*Q0) { fprintf(stderr, "Illegal reshape command in %s\n",__FILE__); Cblacs_abort(ctxt, -22); } g = (int *) malloc(Np * sizeof(int)); if (!g) { fprintf(stderr, "Cannot allocate memory in %s\n",__FILE__); Cblacs_abort(ctxt, -23); } if (row_major_in) /* Read in in row-major order */ { if (row_major_out) for (i=0; i != Np; i++) g[(i%Q)*P+i/Q] = Cblacs_pnum(ctxt, (pstart+i)/Q0, (pstart+i)%Q0); else for (i=0; i != Np; i++) g[i] = Cblacs_pnum(ctxt, (pstart+i)/Q0, (pstart+i)%Q0); } else /* read in in column-major order */ { if (row_major_out) for (i=0; i != Np; i++) g[(i%Q)*P+i/Q] = Cblacs_pnum(ctxt, (pstart+i)%P0, (pstart+i)/P0); else for (i=0; i != Np; i++) g[i] = Cblacs_pnum(ctxt, (pstart+i)%P0, (pstart+i)/P0); } Cblacs_get(ctxt, 10, &nctxt); Cblacs_gridmap(&nctxt, g, P, P, Q); free(g); return(nctxt); } int sl_gridreshape_(ctxt, pstart, row_major_in, row_major_out, P, Q) int *ctxt, *pstart, *row_major_in, *row_major_out, *P, *Q; { return( SL_Cgridreshape(*ctxt, *pstart, *row_major_in, *row_major_out, *P, *Q) ); } int SL_GRIDRESHAPE(ctxt, pstart, row_major_in, row_major_out, P, Q) int *ctxt, *pstart, *row_major_in, *row_major_out, *P, *Q; { return( SL_Cgridreshape(*ctxt, *pstart, *row_major_in, *row_major_out, *P, *Q) ); } int sl_gridreshape__(ctxt, pstart, row_major_in, row_major_out, P, Q) int *ctxt, *pstart, *row_major_in, *row_major_out, *P, *Q; { return( SL_Cgridreshape(*ctxt, *pstart, *row_major_in, *row_major_out, *P, *Q) ); } int sl_gridreshape(ctxt, pstart, row_major_in, row_major_out, P, Q) int *ctxt, *pstart, *row_major_in, *row_major_out, *P, *Q; { return( SL_Cgridreshape(*ctxt, *pstart, *row_major_in, *row_major_out, *P, *Q) ); } scalapack-2.0.2/TOOLS/SL_init.f000644 000766 000024 00000003446 10363532303 016326 0ustar00juliestaff000000 000000 SUBROUTINE SL_INIT( ICTXT, NPROW, NPCOL ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT, NPCOL, NPROW * .. * * Purpose * ======= * * SL_INIT initializes an NPROW x NPCOL process grid using a row-major * ordering of the processes. This routine retrieves a default system * context which will include all available processes. In addition it * spawns the processes if needed. * * Arguments * ========= * * ICTXT (global output) INTEGER * ICTXT specifies the BLACS context handle identifying the * created process grid. The context itself is global. * * NPROW (global input) INTEGER * NPROW specifies the number of process rows in the grid * to be created. * * NPCOL (global input) INTEGER * NPCOL specifies the number of process columns in the grid * to be created. * * ===================================================================== * * .. Local Scalars .. INTEGER IAM, NPROCS * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDINIT, BLACS_PINFO, $ BLACS_SETUP * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * If machine needs additional set up, do it now * IF( NPROCS.LT.1 ) THEN IF( IAM.EQ.0 ) $ NPROCS = NPROW * NPCOL CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * RETURN * * End of SL_INIT * END scalapack-2.0.2/TOOLS/slatcpy.f000644 000766 000024 00000004342 10363532303 016440 0ustar00juliestaff000000 000000 SUBROUTINE SLATCPY( UPLO, M, N, A, LDA, B, LDB ) * * * .. Scalar Arguments .. CHARACTER UPLO INTEGER LDA, LDB, M, N * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SLATCPY copies all or part of a two-dimensional matrix A to another * matrix B in transpose form. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies the part of the matrix A to be copied to B. * = 'U': Upper triangular part * = 'L': Lower triangular part * Otherwise: All of the matrix A * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input) DOUBLE PRECISION array, dimension (LDA,N) * The m by n matrix A. If UPLO = 'U', only the upper triangle * or trapezoid is accessed; if UPLO = 'L', only the lower * triangle or trapezoid is accessed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * B (output) DOUBLE PRECISION array, dimension (LDB,M) * On exit, B = A^T in the locations specified by UPLO. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * * .. * .. Executable Statements .. * IF( LSAME( UPLO, 'U' ) ) THEN DO 20 J = 1, N DO 10 I = 1, MIN( J, M ) B( J, I ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN DO 40 J = 1, N DO 30 I = J, M B( J, I ) = A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( J, I ) = A( I, J ) 50 CONTINUE 60 CONTINUE END IF RETURN * * End of SLATCPY * END scalapack-2.0.2/TOOLS/sltimer.f000644 000766 000024 00000025473 10363532303 016450 0ustar00juliestaff000000 000000 SUBROUTINE SLBOOT() * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * SLBOOT (re)sets all timers to 0, and enables SLtimer. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) DOUBLE PRECISION STARTFLAG, ZERO PARAMETER ( STARTFLAG = -5.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * DISABLED = .FALSE. DO 10 I = 1, NTIMER CPUSEC( I ) = ZERO WALLSEC( I ) = ZERO CPUSTART( I ) = STARTFLAG WALLSTART( I ) = STARTFLAG 10 CONTINUE * RETURN * * End of SLBOOT * END * SUBROUTINE SLTIMER( I ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER I * .. * * Purpose * ======= * * SLtimer provides a "stopwatch" functionality cpu/wall timer * (in seconds). Up to 64 separate timers can be functioning at once. * The first call starts the timer, and the second stops it. This * routine can be disenabled, so that calls to the timer are ignored. * This feature can be used to make sure certain sections of code do * not affect timings, even if they call routines which have SLtimer * calls in them. * * Arguments * ========= * * I (global input) INTEGER * The timer to stop/start. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) DOUBLE PRECISION STARTFLAG PARAMETER ( STARTFLAG = -5.0D+0 ) * .. * .. External Functions .. DOUBLE PRECISION DCPUTIME00, DWALLTIME00 EXTERNAL DCPUTIME00, DWALLTIME00 * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * * If timing disabled, return * IF( DISABLED ) $ RETURN * IF( WALLSTART( I ).EQ.STARTFLAG ) THEN * * If timer has not been started, start it * WALLSTART( I ) = DWALLTIME00() CPUSTART( I ) = DCPUTIME00() * ELSE * * Stop timer and add interval to count * CPUSEC( I ) = CPUSEC( I ) + DCPUTIME00() - CPUSTART( I ) WALLSEC( I ) = WALLSEC( I ) + DWALLTIME00() - WALLSTART( I ) WALLSTART( I ) = STARTFLAG * END IF * RETURN * * End of SLTIMER * END * SUBROUTINE SLENABLE() * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * SLENABLE sets it so calls to SLtimer are not ignored. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * DISABLED = .FALSE. * RETURN * END * SUBROUTINE SLDISABLE() * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * SLDISABLE sets it so calls to SLTIMER are ignored. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * DISABLED = .TRUE. * RETURN * * End of SLDISABLE * END * DOUBLE PRECISION FUNCTION SLINQUIRE( TIMETYPE, I ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 TIMETYPE INTEGER I * .. * * Purpose * ======= * * SLINQUIRE returns wall or cpu time that has accumulated in timer I. * * Arguments * ========= * * TIMETYPE (global input) CHARACTER * Controls what time will be returned: * = 'W': wall clock time is returned, * = 'C': CPU time is returned (default). * * I (global input) INTEGER * The timer to return. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) DOUBLE PRECISION ERRFLAG PARAMETER ( ERRFLAG = -1.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION TIME * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DCPUTIME00, DWALLTIME00 EXTERNAL DCPUTIME00, DWALLTIME00, LSAME * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * IF( LSAME( TIMETYPE, 'W' ) ) THEN * * If walltime not available on this machine, return -1 flag * IF( DWALLTIME00().EQ.ERRFLAG ) THEN TIME = ERRFLAG ELSE TIME = WALLSEC( I ) END IF ELSE IF( DCPUTIME00().EQ.ERRFLAG ) THEN TIME = ERRFLAG ELSE TIME = CPUSEC( I ) END IF END IF * SLINQUIRE = TIME * RETURN * * End of SLINQUIRE * END * SUBROUTINE SLCOMBINE( ICTXT, SCOPE, OP, TIMETYPE, N, IBEG, $ TIMES ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER OP, SCOPE, TIMETYPE INTEGER IBEG, ICTXT, N * .. * .. Array Arguments .. DOUBLE PRECISION TIMES( N ) * .. * * Purpose * ======= * * SLCOMBINE takes the timing information stored on a scope of processes * and combines them into the user's TIMES array. * * Arguments * ========= * * ICTXT (local input) INTEGER * The BLACS context handle. * * SCOPE (global input) CHARACTER * Controls what processes in grid participate in combine. * Options are 'Rowwise', 'Columnwise', or 'All'. * * OP (global input) CHARACTER * Controls what combine should be done: * = '>': get maximal time on any process (default), * = '<': get minimal time on any process, * = '+': get sum of times across processes. * * TIMETYPE (global input) CHARACTER * Controls what time will be returned in TIMES: * = 'W': wall clock time, * = 'C': CPU time (default). * * N (global input) INTEGER * The number of timers to combine. * * IBEG (global input) INTEGER * The first timer to be combined. * * TIMES (global output) DOUBLE PRECISION array, dimension (N) * The requested timing information is returned in this array. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) DOUBLE PRECISION ERRFLAG PARAMETER ( ERRFLAG = -1.0D+0 ) * .. * .. Local Scalars .. LOGICAL TMPDIS INTEGER I * .. * .. External Subroutines .. EXTERNAL DGAMX2D, DGAMN2D, DGSUM2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DCPUTIME00, DWALLTIME00 EXTERNAL DCPUTIME00, DWALLTIME00, LSAME * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * * Disable timer for combine operation * TMPDIS = DISABLED DISABLED = .TRUE. * * Copy timer information into user's times array * IF( LSAME( TIMETYPE, 'W' ) ) THEN * * If walltime not available on this machine, fill in times * with -1 flag, and return * IF( DWALLTIME00().EQ.ERRFLAG ) THEN DO 10 I = 1, N TIMES( I ) = ERRFLAG 10 CONTINUE RETURN ELSE DO 20 I = 1, N TIMES( I ) = WALLSEC( IBEG + I - 1 ) 20 CONTINUE END IF ELSE IF( DCPUTIME00().EQ.ERRFLAG ) THEN DO 30 I = 1, N TIMES( I ) = ERRFLAG 30 CONTINUE RETURN ELSE DO 40 I = 1, N TIMES( I ) = CPUSEC( IBEG + I - 1 ) 40 CONTINUE END IF ENDIF * * Combine all nodes' information, restore disabled, and return * IF( OP.EQ.'>' ) THEN CALL DGAMX2D( ICTXT, SCOPE, ' ', N, 1, TIMES, N, -1, -1, $ -1, -1, 0 ) ELSE IF( OP.EQ.'<' ) THEN CALL DGAMN2D( ICTXT, SCOPE, ' ', N, 1, TIMES, N, -1, -1, $ -1, -1, 0 ) ELSE IF( OP.EQ.'+' ) THEN CALL DGSUM2D( ICTXT, SCOPE, ' ', N, 1, TIMES, N, -1, 0 ) ELSE CALL DGAMX2D( ICTXT, SCOPE, ' ', N, 1, TIMES, N, -1, -1, $ -1, -1, 0 ) END IF * DISABLED = TMPDIS * RETURN * * End of SLCOMBINE * END scalapack-2.0.2/TOOLS/smatadd.f000644 000766 000024 00000010247 10363532303 016377 0ustar00juliestaff000000 000000 SUBROUTINE SMATADD( M, N, ALPHA, A, LDA, BETA, C, LDC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER LDA, LDC, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), C( LDC, * ) * .. * * Purpose * ======= * * SMATADD performs the following local matrix-matrix operation * * C := alpha * A + beta * C, * * where alpha and beta are scalars, and A and C are m by n arrays. * * Arguments * ========= * * M (local input) INTEGER * The number of rows of the array A. M >= 0. * * N (local input) INTEGER * The number of columns of the array A. N >= 0. * * ALPHA (local input) REAL * The scalar ALPHA. * * A (local input) REAL * Array, dimension (LDA,*), the array A. * * LDA (local input) INTEGER * The leading dimension of the array A, LDA >= MAX(1, M) * * BETA (local input) REAL * The scalar BETA. * * C (local input/local output) REAL * Array, dimension (LDC,*), the array C. * * LDC (local input) INTEGER * The leading dimension of the array C, LDC >= MAX(1, M) * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * * Quick return if possible. * IF( (M.EQ.0).OR.(N.EQ.0).OR.((ALPHA.EQ.ZERO).AND.(BETA.EQ.ONE)) ) $ RETURN * IF( N.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 10 I = 1, M C( I, 1 ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) 20 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN DO 30 I = 1, M C( I, 1 ) = A( I, 1 ) + C( I, 1 ) 30 CONTINUE ELSE DO 40 I = 1, M C( I, 1 ) = A( I, 1 ) + BETA*C( I, 1 ) 40 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN DO 50 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) + C( I, 1 ) 50 CONTINUE ELSE DO 60 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) + BETA*C( I, 1 ) 60 CONTINUE END IF END IF ELSE IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M C( I, J ) = ZERO 70 CONTINUE 80 CONTINUE ELSE DO 100 J = 1, N DO 90 I = 1, M C( I, J ) = ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN DO 120 J = 1, N DO 110 I = 1, M C( I, J ) = A( I, J ) + C( I, J ) 110 CONTINUE 120 CONTINUE ELSE DO 140 J = 1, N DO 130 I = 1, M C( I, J ) = A( I, J ) + BETA * C( I, J ) 130 CONTINUE 140 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN DO 160 J = 1, N DO 150 I = 1, M C( I, J ) = C( I, J ) + ALPHA * A( I, J ) 150 CONTINUE 160 CONTINUE ELSE DO 180 J = 1, N DO 170 I = 1, M C( I, J ) = ALPHA * A( I, J ) + BETA * C( I, J ) 170 CONTINUE 180 CONTINUE END IF END IF END IF * RETURN * * End of SMATADD * END scalapack-2.0.2/TOOLS/ssdot.f000644 000766 000024 00000001547 10363532303 016121 0ustar00juliestaff000000 000000 SUBROUTINE SSDOT( N, DOT, X, INCX, Y, INCY ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, INCY, N REAL DOT * .. * .. Array Arguments .. REAL X( * ), Y( * ) * .. * * Purpose * ======= * * SSDOT is a simple FORTRAN wrapper around the BLAS function * SDOT returning the result in the parameter list instead. * * ===================================================================== * * .. External Functions .. REAL SDOT EXTERNAL SDOT * .. * .. Executable Statements .. * DOT = SDOT( N, X, INCX, Y, INCY ) * RETURN * * End of SSDOT * END scalapack-2.0.2/TOOLS/zlatcpy.f000644 000766 000024 00000004440 10363532303 016446 0ustar00juliestaff000000 000000 SUBROUTINE ZLATCPY( UPLO, M, N, A, LDA, B, LDB ) * * * .. Scalar Arguments .. CHARACTER UPLO INTEGER LDA, LDB, M, N * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZLATCPY copies all or part of a two-dimensional matrix A to another * matrix B in transpose form. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies the part of the matrix A to be copied to B. * = 'U': Upper triangular part * = 'L': Lower triangular part * Otherwise: All of the matrix A * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input) DOUBLE PRECISION array, dimension (LDA,N) * The m by n matrix A. If UPLO = 'U', only the upper triangle * or trapezoid is accessed; if UPLO = 'L', only the lower * triangle or trapezoid is accessed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * B (output) DOUBLE PRECISION array, dimension (LDB,M) * On exit, B = A^T in the locations specified by UPLO. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN INTRINSIC DCONJG * * .. * .. Executable Statements .. * IF( LSAME( UPLO, 'U' ) ) THEN DO 20 J = 1, N DO 10 I = 1, MIN( J, M ) B( J, I ) = DCONJG( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN DO 40 J = 1, N DO 30 I = J, M B( J, I ) = DCONJG( A( I, J ) ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( J, I ) = DCONJG( A( I, J ) ) 50 CONTINUE 60 CONTINUE END IF RETURN * * End of ZLATCPY * END scalapack-2.0.2/TOOLS/zmatadd.f000644 000766 000024 00000010362 10363532303 016404 0ustar00juliestaff000000 000000 SUBROUTINE ZMATADD( M, N, ALPHA, A, LDA, BETA, C, LDC ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER LDA, LDC, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), C( LDC, * ) * .. * * Purpose * ======= * * ZMATADD performs the following local matrix-matrix operation * * C := alpha * A + beta * C, * * where alpha and beta are scalars, and A and C are m by n arrays. * * Arguments * ========= * * M (local input) INTEGER * The number of rows of the array A. M >= 0. * * N (local input) INTEGER * The number of columns of the array A. N >= 0. * * ALPHA (local input) COMPLEX*16 * The scalar ALPHA. * * A (local input) COMPLEX*16 * Array, dimension (LDA,*), the array A. * * LDA (local input) INTEGER * The leading dimension of the array A, LDA >= MAX(1, M) * * BETA (local input) COMPLEX*16 * The scalar BETA. * * C (local input/local output) COMPLEX*16 * Array, dimension (LDC,*), the array C. * * LDC (local input) INTEGER * The leading dimension of the array C, LDC >= MAX(1, M) * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * * Quick return if possible. * IF( (M.EQ.0).OR.(N.EQ.0).OR.((ALPHA.EQ.ZERO).AND.(BETA.EQ.ONE)) ) $ RETURN * IF( N.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 10 I = 1, M C( I, 1 ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) 20 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN DO 30 I = 1, M C( I, 1 ) = A( I, 1 ) + C( I, 1 ) 30 CONTINUE ELSE DO 40 I = 1, M C( I, 1 ) = A( I, 1 ) + BETA*C( I, 1 ) 40 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN DO 50 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) + C( I, 1 ) 50 CONTINUE ELSE DO 60 I = 1, M C( I, 1 ) = ALPHA*A( I, 1 ) + BETA*C( I, 1 ) 60 CONTINUE END IF END IF ELSE IF( BETA.EQ.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M C( I, J ) = ZERO 70 CONTINUE 80 CONTINUE ELSE DO 100 J = 1, N DO 90 I = 1, M C( I, J ) = ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ONE ) THEN DO 120 J = 1, N DO 110 I = 1, M C( I, J ) = A( I, J ) + C( I, J ) 110 CONTINUE 120 CONTINUE ELSE DO 140 J = 1, N DO 130 I = 1, M C( I, J ) = A( I, J ) + BETA * C( I, J ) 130 CONTINUE 140 CONTINUE END IF ELSE IF( BETA.EQ.ONE ) THEN DO 160 J = 1, N DO 150 I = 1, M C( I, J ) = C( I, J ) + ALPHA * A( I, J ) 150 CONTINUE 160 CONTINUE ELSE DO 180 J = 1, N DO 170 I = 1, M C( I, J ) = ALPHA * A( I, J ) + BETA * C( I, J ) 170 CONTINUE 180 CONTINUE END IF END IF END IF * RETURN * * End of ZMATADD * END scalapack-2.0.2/TOOLS/zzdotc.f000644 000766 000024 00000001561 10363532303 016276 0ustar00juliestaff000000 000000 SUBROUTINE ZZDOTC( N, DOTC, X, INCX, Y, INCY ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, INCY, N COMPLEX*16 DOTC * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * .. * * Purpose * ======= * * ZZDOTC is a simple FORTRAN wrapper around the BLAS function * ZDOTC returning the result in the parameter list instead. * * ===================================================================== * * .. External Functions .. COMPLEX*16 ZDOTC EXTERNAL ZDOTC * .. * .. Executable Statements .. * DOTC = ZDOTC( N, X, INCX, Y, INCY ) * RETURN * * End of ZZDOTC * END scalapack-2.0.2/TOOLS/zzdotu.f000644 000766 000024 00000001561 10363532303 016320 0ustar00juliestaff000000 000000 SUBROUTINE ZZDOTU( N, DOTU, X, INCX, Y, INCY ) * * -- ScaLAPACK tools routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, INCY, N COMPLEX*16 DOTU * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * .. * * Purpose * ======= * * ZZDOTU is a simple FORTRAN wrapper around the BLAS function * ZDOTU returning the result in the parameter list instead. * * ===================================================================== * * .. External Functions .. COMPLEX*16 ZDOTU EXTERNAL ZDOTU * .. * .. Executable Statements .. * DOTU = ZDOTU( N, X, INCX, Y, INCY ) * RETURN * * End of ZZDOTU * END scalapack-2.0.2/TOOLS/LAPACK/clagge.f000644 000766 000024 00000022460 10604316307 017141 0ustar00juliestaff000000 000000 SUBROUTINE CLAGGE( M, N, KL, KU, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDA, M, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL D( * ) COMPLEX A( LDA, * ), WORK( * ) * .. * * Purpose * ======= * * CLAGGE generates a complex general m by n matrix A, by pre- and post- * multiplying a real diagonal matrix D with random unitary matrices: * A = U*D*V. The lower and upper bandwidths may then be reduced to * kl and ku by additional unitary transformations. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= KL <= M-1. * * KU (input) INTEGER * The number of nonzero superdiagonals within the band of A. * 0 <= KU <= N-1. * * D (input) REAL array, dimension (min(M,N)) * The diagonal elements of the diagonal matrix D. * * A (output) COMPLEX array, dimension (LDA,N) * The generated m by n matrix A. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= M. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) COMPLEX array, dimension (M+N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J REAL WN COMPLEX TAU, WA, WB * .. * .. External Subroutines .. EXTERNAL CGEMV, CGERC, CLACGV, CLARNV, CSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, REAL * .. * .. External Functions .. REAL SCNRM2 EXTERNAL SCNRM2 * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( KL.LT.0 .OR. KL.GT.M-1 ) THEN INFO = -3 ELSE IF( KU.LT.0 .OR. KU.GT.N-1 ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'CLAGGE', -INFO ) RETURN END IF * * initialize A to diagonal matrix * DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, MIN( M, N ) A( I, I ) = D( I ) 30 CONTINUE * * pre- and post-multiply A by random unitary matrices * DO 40 I = MIN( M, N ), 1, -1 IF( I.LT.M ) THEN * * generate random reflection * CALL CLARNV( 3, ISEED, M-I+1, WORK ) WN = SCNRM2( M-I+1, WORK, 1 ) WA = ( WN / ABS( WORK( 1 ) ) )*WORK( 1 ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL CSCAL( M-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = REAL( WB / WA ) END IF * * multiply A(i:m,i:n) by random reflection from the left * CALL CGEMV( 'Conjugate transpose', M-I+1, N-I+1, ONE, $ A( I, I ), LDA, WORK, 1, ZERO, WORK( M+1 ), 1 ) CALL CGERC( M-I+1, N-I+1, -TAU, WORK, 1, WORK( M+1 ), 1, $ A( I, I ), LDA ) END IF IF( I.LT.N ) THEN * * generate random reflection * CALL CLARNV( 3, ISEED, N-I+1, WORK ) WN = SCNRM2( N-I+1, WORK, 1 ) WA = ( WN / ABS( WORK( 1 ) ) )*WORK( 1 ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL CSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = REAL( WB / WA ) END IF * * multiply A(i:m,i:n) by random reflection from the right * CALL CGEMV( 'No transpose', M-I+1, N-I+1, ONE, A( I, I ), $ LDA, WORK, 1, ZERO, WORK( N+1 ), 1 ) CALL CGERC( M-I+1, N-I+1, -TAU, WORK( N+1 ), 1, WORK, 1, $ A( I, I ), LDA ) END IF 40 CONTINUE * * Reduce number of subdiagonals to KL and number of superdiagonals * to KU * DO 70 I = 1, MAX( M-1-KL, N-1-KU ) IF( KL.LE.KU ) THEN * * annihilate subdiagonal elements first (necessary if KL = 0) * IF( I.LE.MIN( M-1-KL, N ) ) THEN * * generate reflection to annihilate A(kl+i+1:m,i) * WN = SCNRM2( M-KL-I+1, A( KL+I, I ), 1 ) WA = ( WN / ABS( A( KL+I, I ) ) )*A( KL+I, I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( KL+I, I ) + WA CALL CSCAL( M-KL-I, ONE / WB, A( KL+I+1, I ), 1 ) A( KL+I, I ) = ONE TAU = REAL( WB / WA ) END IF * * apply reflection to A(kl+i:m,i+1:n) from the left * CALL CGEMV( 'Conjugate transpose', M-KL-I+1, N-I, ONE, $ A( KL+I, I+1 ), LDA, A( KL+I, I ), 1, ZERO, $ WORK, 1 ) CALL CGERC( M-KL-I+1, N-I, -TAU, A( KL+I, I ), 1, WORK, $ 1, A( KL+I, I+1 ), LDA ) A( KL+I, I ) = -WA END IF * IF( I.LE.MIN( N-1-KU, M ) ) THEN * * generate reflection to annihilate A(i,ku+i+1:n) * WN = SCNRM2( N-KU-I+1, A( I, KU+I ), LDA ) WA = ( WN / ABS( A( I, KU+I ) ) )*A( I, KU+I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( I, KU+I ) + WA CALL CSCAL( N-KU-I, ONE / WB, A( I, KU+I+1 ), LDA ) A( I, KU+I ) = ONE TAU = REAL( WB / WA ) END IF * * apply reflection to A(i+1:m,ku+i:n) from the right * CALL CLACGV( N-KU-I+1, A( I, KU+I ), LDA ) CALL CGEMV( 'No transpose', M-I, N-KU-I+1, ONE, $ A( I+1, KU+I ), LDA, A( I, KU+I ), LDA, ZERO, $ WORK, 1 ) CALL CGERC( M-I, N-KU-I+1, -TAU, WORK, 1, A( I, KU+I ), $ LDA, A( I+1, KU+I ), LDA ) A( I, KU+I ) = -WA END IF ELSE * * annihilate superdiagonal elements first (necessary if * KU = 0) * IF( I.LE.MIN( N-1-KU, M ) ) THEN * * generate reflection to annihilate A(i,ku+i+1:n) * WN = SCNRM2( N-KU-I+1, A( I, KU+I ), LDA ) WA = ( WN / ABS( A( I, KU+I ) ) )*A( I, KU+I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( I, KU+I ) + WA CALL CSCAL( N-KU-I, ONE / WB, A( I, KU+I+1 ), LDA ) A( I, KU+I ) = ONE TAU = REAL( WB / WA ) END IF * * apply reflection to A(i+1:m,ku+i:n) from the right * CALL CLACGV( N-KU-I+1, A( I, KU+I ), LDA ) CALL CGEMV( 'No transpose', M-I, N-KU-I+1, ONE, $ A( I+1, KU+I ), LDA, A( I, KU+I ), LDA, ZERO, $ WORK, 1 ) CALL CGERC( M-I, N-KU-I+1, -TAU, WORK, 1, A( I, KU+I ), $ LDA, A( I+1, KU+I ), LDA ) A( I, KU+I ) = -WA END IF * IF( I.LE.MIN( M-1-KL, N ) ) THEN * * generate reflection to annihilate A(kl+i+1:m,i) * WN = SCNRM2( M-KL-I+1, A( KL+I, I ), 1 ) WA = ( WN / ABS( A( KL+I, I ) ) )*A( KL+I, I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( KL+I, I ) + WA CALL CSCAL( M-KL-I, ONE / WB, A( KL+I+1, I ), 1 ) A( KL+I, I ) = ONE TAU = REAL( WB / WA ) END IF * * apply reflection to A(kl+i:m,i+1:n) from the left * CALL CGEMV( 'Conjugate transpose', M-KL-I+1, N-I, ONE, $ A( KL+I, I+1 ), LDA, A( KL+I, I ), 1, ZERO, $ WORK, 1 ) CALL CGERC( M-KL-I+1, N-I, -TAU, A( KL+I, I ), 1, WORK, $ 1, A( KL+I, I+1 ), LDA ) A( KL+I, I ) = -WA END IF END IF * DO 50 J = KL + I + 1, M A( J, I ) = ZERO 50 CONTINUE * DO 60 J = KU + I + 1, N A( I, J ) = ZERO 60 CONTINUE 70 CONTINUE RETURN * * End of CLAGGE * END scalapack-2.0.2/TOOLS/LAPACK/claghe.f000644 000766 000024 00000013472 10604316307 017145 0ustar00juliestaff000000 000000 SUBROUTINE CLAGHE( N, K, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, K, LDA, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL D( * ) COMPLEX A( LDA, * ), WORK( * ) * .. * * Purpose * ======= * * CLAGHE generates a complex hermitian matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random unitary matrix: * A = U*D*U'. The semi-bandwidth may then be reduced to k by additional * unitary transformations. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * K (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * * D (input) REAL array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (output) COMPLEX array, dimension (LDA,N) * The generated n by n hermitian matrix A (the full matrix is * stored). * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= N. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) COMPLEX array, dimension (2*N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX ZERO, ONE, HALF PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ), $ HALF = ( 0.5E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J REAL WN COMPLEX ALPHA, TAU, WA, WB * .. * .. External Subroutines .. EXTERNAL CAXPY, CGEMV, CGERC, CHEMV, CHER2, CLARNV, $ CSCAL, XERBLA * .. * .. External Functions .. REAL SCNRM2 COMPLEX CDOTC EXTERNAL SCNRM2, CDOTC * .. * .. Intrinsic Functions .. INTRINSIC ABS, CONJG, MAX, REAL * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'CLAGHE', -INFO ) RETURN END IF * * initialize lower triangle of A to diagonal matrix * DO 20 J = 1, N DO 10 I = J + 1, N A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, N A( I, I ) = D( I ) 30 CONTINUE * * Generate lower triangle of hermitian matrix * DO 40 I = N - 1, 1, -1 * * generate random reflection * CALL CLARNV( 3, ISEED, N-I+1, WORK ) WN = SCNRM2( N-I+1, WORK, 1 ) WA = ( WN / ABS( WORK( 1 ) ) )*WORK( 1 ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL CSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = REAL( WB / WA ) END IF * * apply random reflection to A(i:n,i:n) from the left * and the right * * compute y := tau * A * u * CALL CHEMV( 'Lower', N-I+1, TAU, A( I, I ), LDA, WORK, 1, ZERO, $ WORK( N+1 ), 1 ) * * compute v := y - 1/2 * tau * ( y, u ) * u * ALPHA = -HALF*TAU*CDOTC( N-I+1, WORK( N+1 ), 1, WORK, 1 ) CALL CAXPY( N-I+1, ALPHA, WORK, 1, WORK( N+1 ), 1 ) * * apply the transformation as a rank-2 update to A(i:n,i:n) * CALL CHER2( 'Lower', N-I+1, -ONE, WORK, 1, WORK( N+1 ), 1, $ A( I, I ), LDA ) 40 CONTINUE * * Reduce number of subdiagonals to K * DO 60 I = 1, N - 1 - K * * generate reflection to annihilate A(k+i+1:n,i) * WN = SCNRM2( N-K-I+1, A( K+I, I ), 1 ) WA = ( WN / ABS( A( K+I, I ) ) )*A( K+I, I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( K+I, I ) + WA CALL CSCAL( N-K-I, ONE / WB, A( K+I+1, I ), 1 ) A( K+I, I ) = ONE TAU = REAL( WB / WA ) END IF * * apply reflection to A(k+i:n,i+1:k+i-1) from the left * CALL CGEMV( 'Conjugate transpose', N-K-I+1, K-1, ONE, $ A( K+I, I+1 ), LDA, A( K+I, I ), 1, ZERO, WORK, 1 ) CALL CGERC( N-K-I+1, K-1, -TAU, A( K+I, I ), 1, WORK, 1, $ A( K+I, I+1 ), LDA ) * * apply reflection to A(k+i:n,k+i:n) from the left and the right * * compute y := tau * A * u * CALL CHEMV( 'Lower', N-K-I+1, TAU, A( K+I, K+I ), LDA, $ A( K+I, I ), 1, ZERO, WORK, 1 ) * * compute v := y - 1/2 * tau * ( y, u ) * u * ALPHA = -HALF*TAU*CDOTC( N-K-I+1, WORK, 1, A( K+I, I ), 1 ) CALL CAXPY( N-K-I+1, ALPHA, A( K+I, I ), 1, WORK, 1 ) * * apply hermitian rank-2 update to A(k+i:n,k+i:n) * CALL CHER2( 'Lower', N-K-I+1, -ONE, A( K+I, I ), 1, WORK, 1, $ A( K+I, K+I ), LDA ) * A( K+I, I ) = -WA DO 50 J = K + I + 1, N A( J, I ) = ZERO 50 CONTINUE 60 CONTINUE * * Store full hermitian matrix * DO 80 J = 1, N DO 70 I = J + 1, N A( J, I ) = CONJG( A( I, J ) ) 70 CONTINUE 80 CONTINUE RETURN * * End of CLAGHE * END scalapack-2.0.2/TOOLS/LAPACK/clagsy.f000644 000766 000024 00000014747 10604316307 017212 0ustar00juliestaff000000 000000 SUBROUTINE CLAGSY( N, K, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, K, LDA, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL D( * ) COMPLEX A( LDA, * ), WORK( * ) * .. * * Purpose * ======= * * CLAGSY generates a complex symmetric matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random unitary matrix: * A = U*D*U**T. The semi-bandwidth may then be reduced to k by * additional unitary transformations. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * K (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * * D (input) REAL array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (output) COMPLEX array, dimension (LDA,N) * The generated n by n symmetric matrix A (the full matrix is * stored). * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= N. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) COMPLEX array, dimension (2*N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX ZERO, ONE, HALF PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ), $ HALF = ( 0.5E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, II, J, JJ REAL WN COMPLEX ALPHA, TAU, WA, WB * .. * .. External Subroutines .. EXTERNAL CAXPY, CGEMV, CGERC, CLACGV, CLARNV, CSCAL, $ CSYMV, XERBLA * .. * .. External Functions .. REAL SCNRM2 COMPLEX CDOTC EXTERNAL SCNRM2, CDOTC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, REAL * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'CLAGSY', -INFO ) RETURN END IF * * initialize lower triangle of A to diagonal matrix * DO 20 J = 1, N DO 10 I = J + 1, N A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, N A( I, I ) = D( I ) 30 CONTINUE * * Generate lower triangle of symmetric matrix * DO 60 I = N - 1, 1, -1 * * generate random reflection * CALL CLARNV( 3, ISEED, N-I+1, WORK ) WN = SCNRM2( N-I+1, WORK, 1 ) WA = ( WN / ABS( WORK( 1 ) ) )*WORK( 1 ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL CSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = REAL( WB / WA ) END IF * * apply random reflection to A(i:n,i:n) from the left * and the right * * compute y := tau * A * conjg(u) * CALL CLACGV( N-I+1, WORK, 1 ) CALL CSYMV( 'Lower', N-I+1, TAU, A( I, I ), LDA, WORK, 1, ZERO, $ WORK( N+1 ), 1 ) CALL CLACGV( N-I+1, WORK, 1 ) * * compute v := y - 1/2 * tau * ( u, y ) * u * ALPHA = -HALF*TAU*CDOTC( N-I+1, WORK, 1, WORK( N+1 ), 1 ) CALL CAXPY( N-I+1, ALPHA, WORK, 1, WORK( N+1 ), 1 ) * * apply the transformation as a rank-2 update to A(i:n,i:n) * * CALL CSYR2( 'Lower', N-I+1, -ONE, WORK, 1, WORK( N+1 ), 1, * $ A( I, I ), LDA ) * DO 50 JJ = I, N DO 40 II = JJ, N A( II, JJ ) = A( II, JJ ) - $ WORK( II-I+1 )*WORK( N+JJ-I+1 ) - $ WORK( N+II-I+1 )*WORK( JJ-I+1 ) 40 CONTINUE 50 CONTINUE 60 CONTINUE * * Reduce number of subdiagonals to K * DO 100 I = 1, N - 1 - K * * generate reflection to annihilate A(k+i+1:n,i) * WN = SCNRM2( N-K-I+1, A( K+I, I ), 1 ) WA = ( WN / ABS( A( K+I, I ) ) )*A( K+I, I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( K+I, I ) + WA CALL CSCAL( N-K-I, ONE / WB, A( K+I+1, I ), 1 ) A( K+I, I ) = ONE TAU = REAL( WB / WA ) END IF * * apply reflection to A(k+i:n,i+1:k+i-1) from the left * CALL CGEMV( 'Conjugate transpose', N-K-I+1, K-1, ONE, $ A( K+I, I+1 ), LDA, A( K+I, I ), 1, ZERO, WORK, 1 ) CALL CGERC( N-K-I+1, K-1, -TAU, A( K+I, I ), 1, WORK, 1, $ A( K+I, I+1 ), LDA ) * * apply reflection to A(k+i:n,k+i:n) from the left and the right * * compute y := tau * A * conjg(u) * CALL CLACGV( N-K-I+1, A( K+I, I ), 1 ) CALL CSYMV( 'Lower', N-K-I+1, TAU, A( K+I, K+I ), LDA, $ A( K+I, I ), 1, ZERO, WORK, 1 ) CALL CLACGV( N-K-I+1, A( K+I, I ), 1 ) * * compute v := y - 1/2 * tau * ( u, y ) * u * ALPHA = -HALF*TAU*CDOTC( N-K-I+1, A( K+I, I ), 1, WORK, 1 ) CALL CAXPY( N-K-I+1, ALPHA, A( K+I, I ), 1, WORK, 1 ) * * apply symmetric rank-2 update to A(k+i:n,k+i:n) * * CALL CSYR2( 'Lower', N-K-I+1, -ONE, A( K+I, I ), 1, WORK, 1, * $ A( K+I, K+I ), LDA ) * DO 80 JJ = K + I, N DO 70 II = JJ, N A( II, JJ ) = A( II, JJ ) - A( II, I )*WORK( JJ-K-I+1 ) - $ WORK( II-K-I+1 )*A( JJ, I ) 70 CONTINUE 80 CONTINUE * A( K+I, I ) = -WA DO 90 J = K + I + 1, N A( J, I ) = ZERO 90 CONTINUE 100 CONTINUE * * Store full symmetric matrix * DO 120 J = 1, N DO 110 I = J + 1, N A( J, I ) = A( I, J ) 110 CONTINUE 120 CONTINUE RETURN * * End of CLAGSY * END scalapack-2.0.2/TOOLS/LAPACK/clarnd.f000644 000766 000024 00000005643 11654631032 017167 0ustar00juliestaff000000 000000 COMPLEX FUNCTION CLARND( IDIST, ISEED ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER IDIST * .. * .. Array Arguments .. INTEGER ISEED( 4 ) * .. * * Purpose * ======= * * CLARND returns a random complex number from a uniform or normal * distribution. * * Arguments * ========= * * IDIST (input) INTEGER * Specifies the distribution of the random numbers: * = 1: real and imaginary parts each uniform (0,1) * = 2: real and imaginary parts each uniform (-1,1) * = 3: real and imaginary parts each normal (0,1) * = 4: uniformly distributed on the disc abs(z) <= 1 * = 5: uniformly distributed on the circle abs(z) = 1 * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * Further Details * =============== * * This routine calls the auxiliary routine SLARAN to generate a random * real number from a uniform (0,1) distribution. The Box-Muller method * is used to transform numbers from a uniform to a normal distribution. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE, TWO PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) REAL TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663E+0 ) * .. * .. Local Scalars .. REAL T1, T2 * .. * .. External Functions .. REAL SLARAN EXTERNAL SLARAN * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, EXP, LOG, SQRT * .. * .. Executable Statements .. * * Generate a pair of real random numbers from a uniform (0,1) * distribution * T1 = SLARAN( ISEED ) T2 = SLARAN( ISEED ) * IF( IDIST.EQ.1 ) THEN * * real and imaginary parts each uniform (0,1) * CLARND = CMPLX( T1, T2 ) ELSE IF( IDIST.EQ.2 ) THEN * * real and imaginary parts each uniform (-1,1) * CLARND = CMPLX( TWO*T1-ONE, TWO*T2-ONE ) ELSE IF( IDIST.EQ.3 ) THEN * * real and imaginary parts each normal (0,1) * CLARND = SQRT( -TWO*LOG( T1 ) )*EXP( CMPLX( ZERO, TWOPI*T2 ) ) ELSE IF( IDIST.EQ.4 ) THEN * * uniform distribution on the unit disc abs(z) <= 1 * CLARND = SQRT( T1 )*EXP( CMPLX( ZERO, TWOPI*T2 ) ) ELSE IF( IDIST.EQ.5 ) THEN * * uniform distribution on the unit circle abs(z) = 1 * CLARND = EXP( CMPLX( ZERO, TWOPI*T2 ) ) ELSE CLARND = CMPLX(ZERO,ZERO) END IF RETURN * * End of CLARND * END scalapack-2.0.2/TOOLS/LAPACK/clarnv.f000644 000766 000024 00000007722 10363532303 017206 0ustar00juliestaff000000 000000 SUBROUTINE CLARNV( IDIST, ISEED, N, X ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. INTEGER IDIST, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX X( * ) * .. * * Purpose * ======= * * CLARNV returns a vector of n random complex numbers from a uniform or * normal distribution. * * Arguments * ========= * * IDIST (input) INTEGER * Specifies the distribution of the random numbers: * = 1: real and imaginary parts each uniform (0,1) * = 2: real and imaginary parts each uniform (-1,1) * = 3: real and imaginary parts each normal (0,1) * = 4: uniformly distributed on the disc abs(z) < 1 * = 5: uniformly distributed on the circle abs(z) = 1 * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * N (input) INTEGER * The number of random numbers to be generated. * * X (output) COMPLEX array, dimension (N) * The generated random numbers. * * Further Details * =============== * * This routine calls the auxiliary routine SLARUV to generate random * real numbers from a uniform (0,1) distribution, in batches of up to * 128 using vectorisable code. The Box-Muller method is used to * transform numbers from a uniform to a normal distribution. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE, TWO PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) INTEGER LV PARAMETER ( LV = 128 ) REAL TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663E+0 ) * .. * .. Local Scalars .. INTEGER I, IL, IV * .. * .. Local Arrays .. REAL U( LV ) * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, EXP, LOG, MIN, SQRT * .. * .. External Subroutines .. EXTERNAL SLARUV * .. * .. Executable Statements .. * DO 60 IV = 1, N, LV / 2 IL = MIN( LV / 2, N-IV+1 ) * * Call SLARUV to generate 2*IL real numbers from a uniform (0,1) * distribution (2*IL <= LV) * CALL SLARUV( ISEED, 2*IL, U ) * IF( IDIST.EQ.1 ) THEN * * Copy generated numbers * DO 10 I = 1, IL X( IV+I-1 ) = CMPLX( U( 2*I-1 ), U( 2*I ) ) 10 CONTINUE ELSE IF( IDIST.EQ.2 ) THEN * * Convert generated numbers to uniform (-1,1) distribution * DO 20 I = 1, IL X( IV+I-1 ) = CMPLX( TWO*U( 2*I-1 )-ONE, $ TWO*U( 2*I )-ONE ) 20 CONTINUE ELSE IF( IDIST.EQ.3 ) THEN * * Convert generated numbers to normal (0,1) distribution * DO 30 I = 1, IL X( IV+I-1 ) = SQRT( -TWO*LOG( U( 2*I-1 ) ) )* $ EXP( CMPLX( ZERO, TWOPI*U( 2*I ) ) ) 30 CONTINUE ELSE IF( IDIST.EQ.4 ) THEN * * Convert generated numbers to complex numbers uniformly * distributed on the unit disk * DO 40 I = 1, IL X( IV+I-1 ) = SQRT( U( 2*I-1 ) )* $ EXP( CMPLX( ZERO, TWOPI*U( 2*I ) ) ) 40 CONTINUE ELSE IF( IDIST.EQ.5 ) THEN * * Convert generated numbers to complex numbers uniformly * distributed on the unit circle * DO 50 I = 1, IL X( IV+I-1 ) = EXP( CMPLX( ZERO, TWOPI*U( 2*I ) ) ) 50 CONTINUE END IF 60 CONTINUE RETURN * * End of CLARNV * END scalapack-2.0.2/TOOLS/LAPACK/clarot.f000644 000766 000024 00000023621 10604316307 017203 0ustar00juliestaff000000 000000 SUBROUTINE CLAROT( LROWS, LLEFT, LRIGHT, NL, C, S, A, LDA, XLEFT, $ XRIGHT ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. LOGICAL LLEFT, LRIGHT, LROWS INTEGER LDA, NL COMPLEX C, S, XLEFT, XRIGHT * .. * .. Array Arguments .. COMPLEX A( * ) * .. * * Purpose * ======= * * CLAROT applies a (Givens) rotation to two adjacent rows or * columns, where one element of the first and/or last column/row * November 2006 * for use on matrices stored in some format other than GE, so * that elements of the matrix may be used or modified for which * no array element is provided. * * One example is a symmetric matrix in SB format (bandwidth=4), for * which UPLO='L': Two adjacent rows will have the format: * * row j: * * * * * . . . . * row j+1: * * * * * . . . . * * '*' indicates elements for which storage is provided, * '.' indicates elements for which no storage is provided, but * are not necessarily zero; their values are determined by * symmetry. ' ' indicates elements which are necessarily zero, * and have no storage provided. * * Those columns which have two '*'s can be handled by SROT. * Those columns which have no '*'s can be ignored, since as long * as the Givens rotations are carefully applied to preserve * symmetry, their values are determined. * Those columns which have one '*' have to be handled separately, * by using separate variables "p" and "q": * * row j: * * * * * p . . . * row j+1: q * * * * * . . . . * * The element p would have to be set correctly, then that column * is rotated, setting p to its new value. The next call to * CLAROT would rotate columns j and j+1, using p, and restore * symmetry. The element q would start out being zero, and be * made non-zero by the rotation. Later, rotations would presumably * be chosen to zero q out. * * Typical Calling Sequences: rotating the i-th and (i+1)-st rows. * ------- ------- --------- * * General dense matrix: * * CALL CLAROT(.TRUE.,.FALSE.,.FALSE., N, C,S, * A(i,1),LDA, DUMMY, DUMMY) * * General banded matrix in GB format: * * j = MAX(1, i-KL ) * NL = MIN( N, i+KU+1 ) + 1-j * CALL CLAROT( .TRUE., i-KL.GE.1, i+KU.LT.N, NL, C,S, * A(KU+i+1-j,j),LDA-1, XLEFT, XRIGHT ) * * [ note that i+1-j is just MIN(i,KL+1) ] * * Symmetric banded matrix in SY format, bandwidth K, * lower triangle only: * * j = MAX(1, i-K ) * NL = MIN( K+1, i ) + 1 * CALL CLAROT( .TRUE., i-K.GE.1, .TRUE., NL, C,S, * A(i,j), LDA, XLEFT, XRIGHT ) * * Same, but upper triangle only: * * NL = MIN( K+1, N-i ) + 1 * CALL CLAROT( .TRUE., .TRUE., i+K.LT.N, NL, C,S, * A(i,i), LDA, XLEFT, XRIGHT ) * * Symmetric banded matrix in SB format, bandwidth K, * lower triangle only: * * [ same as for SY, except:] * . . . . * A(i+1-j,j), LDA-1, XLEFT, XRIGHT ) * * [ note that i+1-j is just MIN(i,K+1) ] * * Same, but upper triangle only: * . . . * A(K+1,i), LDA-1, XLEFT, XRIGHT ) * * Rotating columns is just the transpose of rotating rows, except * for GB and SB: (rotating columns i and i+1) * * GB: * j = MAX(1, i-KU ) * NL = MIN( N, i+KL+1 ) + 1-j * CALL CLAROT( .TRUE., i-KU.GE.1, i+KL.LT.N, NL, C,S, * A(KU+j+1-i,i),LDA-1, XTOP, XBOTTM ) * * [note that KU+j+1-i is just MAX(1,KU+2-i)] * * SB: (upper triangle) * * . . . . . . * A(K+j+1-i,i),LDA-1, XTOP, XBOTTM ) * * SB: (lower triangle) * * . . . . . . * A(1,i),LDA-1, XTOP, XBOTTM ) * * Arguments * ========= * * LROWS - LOGICAL * If .TRUE., then CLAROT will rotate two rows. If .FALSE., * then it will rotate two columns. * Not modified. * * LLEFT - LOGICAL * If .TRUE., then XLEFT will be used instead of the * corresponding element of A for the first element in the * second row (if LROWS=.FALSE.) or column (if LROWS=.TRUE.) * If .FALSE., then the corresponding element of A will be * used. * Not modified. * * LRIGHT - LOGICAL * If .TRUE., then XRIGHT will be used instead of the * corresponding element of A for the last element in the * first row (if LROWS=.FALSE.) or column (if LROWS=.TRUE.) If * .FALSE., then the corresponding element of A will be used. * Not modified. * * NL - INTEGER * The length of the rows (if LROWS=.TRUE.) or columns (if * LROWS=.FALSE.) to be rotated. If XLEFT and/or XRIGHT are * used, the columns/rows they are in should be included in * NL, e.g., if LLEFT = LRIGHT = .TRUE., then NL must be at * least 2. The number of rows/columns to be rotated * exclusive of those involving XLEFT and/or XRIGHT may * not be negative, i.e., NL minus how many of LLEFT and * LRIGHT are .TRUE. must be at least zero; if not, XERBLA * will be called. * Not modified. * * C, S - COMPLEX * Specify the Givens rotation to be applied. If LROWS is * true, then the matrix ( c s ) * ( _ _ ) * (-s c ) is applied from the left; * if false, then the transpose (not conjugated) thereof is * applied from the right. Note that in contrast to the * output of CROTG or to most versions of CROT, both C and S * are complex. For a Givens rotation, |C|**2 + |S|**2 should * be 1, but this is not checked. * Not modified. * * A - COMPLEX array. * The array containing the rows/columns to be rotated. The * first element of A should be the upper left element to * be rotated. * Read and modified. * * LDA - INTEGER * The "effective" leading dimension of A. If A contains * a matrix stored in GE, HE, or SY format, then this is just * the leading dimension of A as dimensioned in the calling * routine. If A contains a matrix stored in band (GB, HB, or * SB) format, then this should be *one less* than the leading * dimension used in the calling routine. Thus, if A were * dimensioned A(LDA,*) in CLAROT, then A(1,j) would be the * j-th element in the first of the two rows to be rotated, * and A(2,j) would be the j-th in the second, regardless of * how the array may be stored in the calling routine. [A * cannot, however, actually be dimensioned thus, since for * band format, the row number may exceed LDA, which is not * legal FORTRAN.] * If LROWS=.TRUE., then LDA must be at least 1, otherwise * it must be at least NL minus the number of .TRUE. values * in XLEFT and XRIGHT. * Not modified. * * XLEFT - COMPLEX * If LLEFT is .TRUE., then XLEFT will be used and modified * instead of A(2,1) (if LROWS=.TRUE.) or A(1,2) * (if LROWS=.FALSE.). * Read and modified. * * XRIGHT - COMPLEX * If LRIGHT is .TRUE., then XRIGHT will be used and modified * instead of A(1,NL) (if LROWS=.TRUE.) or A(NL,1) * (if LROWS=.FALSE.). * Read and modified. * * ===================================================================== * * .. Local Scalars .. INTEGER IINC, INEXT, IX, IY, IYT, J, NT COMPLEX TEMPX * .. * .. Local Arrays .. COMPLEX XT( 2 ), YT( 2 ) * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG * .. * .. Executable Statements .. * * Set up indices, arrays for ends * IF( LROWS ) THEN IINC = LDA INEXT = 1 ELSE IINC = 1 INEXT = LDA END IF * IF( LLEFT ) THEN NT = 1 IX = 1 + IINC IY = 2 + LDA XT( 1 ) = A( 1 ) YT( 1 ) = XLEFT ELSE NT = 0 IX = 1 IY = 1 + INEXT END IF * IF( LRIGHT ) THEN IYT = 1 + INEXT + ( NL-1 )*IINC NT = NT + 1 XT( NT ) = XRIGHT YT( NT ) = A( IYT ) END IF * * Check for errors * IF( NL.LT.NT ) THEN CALL XERBLA( 'CLAROT', 4 ) RETURN END IF IF( LDA.LE.0 .OR. ( .NOT.LROWS .AND. LDA.LT.NL-NT ) ) THEN CALL XERBLA( 'CLAROT', 8 ) RETURN END IF * * Rotate * * CROT( NL-NT, A(IX),IINC, A(IY),IINC, C, S ) with complex C, S * DO 10 J = 0, NL - NT - 1 TEMPX = C*A( IX+J*IINC ) + S*A( IY+J*IINC ) A( IY+J*IINC ) = -CONJG( S )*A( IX+J*IINC ) + $ CONJG( C )*A( IY+J*IINC ) A( IX+J*IINC ) = TEMPX 10 CONTINUE * * CROT( NT, XT,1, YT,1, C, S ) with complex C, S * DO 20 J = 1, NT TEMPX = C*XT( J ) + S*YT( J ) YT( J ) = -CONJG( S )*XT( J ) + CONJG( C )*YT( J ) XT( J ) = TEMPX 20 CONTINUE * * Stuff values back into XLEFT, XRIGHT, etc. * IF( LLEFT ) THEN A( 1 ) = XT( 1 ) XLEFT = YT( 1 ) END IF * IF( LRIGHT ) THEN XRIGHT = XT( NT ) A( IYT ) = YT( NT ) END IF * RETURN * * End of CLAROT * END scalapack-2.0.2/TOOLS/LAPACK/clatm1.f000644 000766 000024 00000015750 10604316307 017104 0ustar00juliestaff000000 000000 SUBROUTINE CLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, N, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER IDIST, INFO, IRSIGN, MODE, N REAL COND * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX D( * ) * .. * * Purpose * ======= * * CLATM1 computes the entries of D(1..N) as specified by * MODE, COND and IRSIGN. IDIST and ISEED determine the generation * of random numbers. CLATM1 is called by CLATMR to generate * random test matrices for LAPACK programs. * * Arguments * ========= * * MODE - INTEGER * On entry describes how D is to be computed: * MODE = 0 means do not change D. * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * Not modified. * * COND - REAL * On entry, used as described under MODE above. * If used, it must be >= 1. Not modified. * * IRSIGN - INTEGER * On entry, if MODE neither -6, 0 nor 6, determines sign of * entries of D * 0 => leave entries of D unchanged * 1 => multiply each entry of D by random complex number * uniformly distributed with absolute value 1 * * IDIST - CHARACTER*1 * On entry, IDIST specifies the type of distribution to be * used to generate a random matrix . * 1 => real and imaginary parts each UNIFORM( 0, 1 ) * 2 => real and imaginary parts each UNIFORM( -1, 1 ) * 3 => real and imaginary parts each NORMAL( 0, 1 ) * 4 => complex number uniform in DISK( 0, 1 ) * Not modified. * * ISEED - INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. The random number generator uses a * linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to CLATM1 * to continue the same random number sequence. * Changed on exit. * * D - COMPLEX array, dimension ( MIN( M , N ) ) * Array to be computed according to MODE, COND and IRSIGN. * May be changed on exit if MODE is nonzero. * * N - INTEGER * Number of entries of D. Not modified. * * INFO - INTEGER * 0 => normal termination * -1 => if MODE not in range -6 to 6 * -2 => if MODE neither -6, 0 nor 6, and * IRSIGN neither 0 nor 1 * -3 => if MODE neither -6, 0 nor 6 and COND less than 1 * -4 => if MODE equals 6 or -6 and IDIST not in range 1 to 4 * -7 => if N negative * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E0 ) * .. * .. Local Scalars .. INTEGER I REAL ALPHA, TEMP COMPLEX CTEMP * .. * .. External Functions .. REAL SLARAN COMPLEX CLARND EXTERNAL SLARAN, CLARND * .. * .. External Subroutines .. EXTERNAL CLARNV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, EXP, LOG, REAL * .. * .. Executable Statements .. * * Decode and Test the input parameters. Initialize flags & seed. * INFO = 0 * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Set INFO if an error * IF( MODE.LT.-6 .OR. MODE.GT.6 ) THEN INFO = -1 ELSE IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ ( IRSIGN.NE.0 .AND. IRSIGN.NE.1 ) ) THEN INFO = -2 ELSE IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ COND.LT.ONE ) THEN INFO = -3 ELSE IF( ( MODE.EQ.6 .OR. MODE.EQ.-6 ) .AND. $ ( IDIST.LT.1 .OR. IDIST.GT.4 ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -7 END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLATM1', -INFO ) RETURN END IF * * Compute D according to COND and MODE * IF( MODE.NE.0 ) THEN GO TO ( 10, 30, 50, 70, 90, 110 )ABS( MODE ) * * One large D value: * 10 CONTINUE DO 20 I = 1, N D( I ) = ONE / COND 20 CONTINUE D( 1 ) = ONE GO TO 120 * * One small D value: * 30 CONTINUE DO 40 I = 1, N D( I ) = ONE 40 CONTINUE D( N ) = ONE / COND GO TO 120 * * Exponentially distributed D values: * 50 CONTINUE D( 1 ) = ONE IF( N.GT.1 ) THEN ALPHA = COND**( -ONE / REAL( N-1 ) ) DO 60 I = 2, N D( I ) = ALPHA**( I-1 ) 60 CONTINUE END IF GO TO 120 * * Arithmetically distributed D values: * 70 CONTINUE D( 1 ) = ONE IF( N.GT.1 ) THEN TEMP = ONE / COND ALPHA = ( ONE-TEMP ) / REAL( N-1 ) DO 80 I = 2, N D( I ) = REAL( N-I )*ALPHA + TEMP 80 CONTINUE END IF GO TO 120 * * Randomly distributed D values on ( 1/COND , 1): * 90 CONTINUE ALPHA = LOG( ONE / COND ) DO 100 I = 1, N D( I ) = EXP( ALPHA*SLARAN( ISEED ) ) 100 CONTINUE GO TO 120 * * Randomly distributed D values from IDIST * 110 CONTINUE CALL CLARNV( IDIST, ISEED, N, D ) * 120 CONTINUE * * If MODE neither -6 nor 0 nor 6, and IRSIGN = 1, assign * random signs to D * IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ IRSIGN.EQ.1 ) THEN DO 130 I = 1, N CTEMP = CLARND( 3, ISEED ) D( I ) = D( I )*( CTEMP / ABS( CTEMP ) ) 130 CONTINUE END IF * * Reverse if MODE < 0 * IF( MODE.LT.0 ) THEN DO 140 I = 1, N / 2 CTEMP = D( I ) D( I ) = D( N+1-I ) D( N+1-I ) = CTEMP 140 CONTINUE END IF * END IF * RETURN * * End of CLATM1 * END scalapack-2.0.2/TOOLS/LAPACK/clatms.f000644 000766 000024 00000124521 11654631032 017204 0ustar00juliestaff000000 000000 SUBROUTINE CLATMS( M, N, DIST, ISEED, SYM, D, MODE, COND, DMAX, $ KL, KU, PACK, A, LDA, WORK, INFO ) * * -- LAPACK test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER DIST, PACK, SYM INTEGER INFO, KL, KU, LDA, M, MODE, N REAL COND, DMAX * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL D( * ) COMPLEX A( LDA, * ), WORK( * ) * .. * * Purpose * ======= * * CLATMS generates random matrices with specified singular values * (or hermitian with specified eigenvalues) * for testing LAPACK programs. * * CLATMS operates by applying the following sequence of * operations: * * Set the diagonal to D, where D may be input or * computed according to MODE, COND, DMAX, and SYM * as described below. * * Generate a matrix with the appropriate band structure, by one * of two methods: * * Method A: * Generate a dense M x N matrix by multiplying D on the left * and the right by random unitary matrices, then: * * Reduce the bandwidth according to KL and KU, using * Householder transformations. * * Method B: * Convert the bandwidth-0 (i.e., diagonal) matrix to a * bandwidth-1 matrix using Givens rotations, "chasing" * out-of-band elements back, much as in QR; then convert * the bandwidth-1 to a bandwidth-2 matrix, etc. Note * that for reasonably small bandwidths (relative to M and * N) this requires less storage, as a dense matrix is not * generated. Also, for hermitian or symmetric matrices, * only one triangle is generated. * * Method A is chosen if the bandwidth is a large fraction of the * order of the matrix, and LDA is at least M (so a dense * matrix can be stored.) Method B is chosen if the bandwidth * is small (< 1/2 N for hermitian or symmetric, < .3 N+M for * non-symmetric), or LDA is less than M and not less than the * bandwidth. * * Pack the matrix if desired. Options specified by PACK are: * no packing * zero out upper half (if hermitian) * zero out lower half (if hermitian) * store the upper half columnwise (if hermitian or upper * triangular) * store the lower half columnwise (if hermitian or lower * triangular) * store the lower triangle in banded format (if hermitian or * lower triangular) * store the upper triangle in banded format (if hermitian or * upper triangular) * store the entire matrix in banded format * If Method B is chosen, and band format is specified, then the * matrix will be generated in the band format, so no repacking * will be necessary. * * Arguments * ========= * * M - INTEGER * The number of rows of A. Not modified. * * N - INTEGER * The number of columns of A. N must equal M if the matrix * is symmetric or hermitian (i.e., if SYM is not 'N') * Not modified. * * DIST - CHARACTER*1 * On entry, DIST specifies the type of distribution to be used * to generate the random eigen-/singular values. * 'U' => UNIFORM( 0, 1 ) ( 'U' for uniform ) * 'S' => UNIFORM( -1, 1 ) ( 'S' for symmetric ) * 'N' => NORMAL( 0, 1 ) ( 'N' for normal ) * Not modified. * * ISEED - INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. They should lie between 0 and 4095 inclusive, * and ISEED(4) should be odd. The random number generator * uses a linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to CLATMS * to continue the same random number sequence. * Changed on exit. * * SYM - CHARACTER*1 * If SYM='H', the generated matrix is hermitian, with * eigenvalues specified by D, COND, MODE, and DMAX; they * may be positive, negative, or zero. * If SYM='P', the generated matrix is hermitian, with * eigenvalues (= singular values) specified by D, COND, * MODE, and DMAX; they will not be negative. * If SYM='N', the generated matrix is nonsymmetric, with * singular values specified by D, COND, MODE, and DMAX; * they will not be negative. * If SYM='S', the generated matrix is (complex) symmetric, * with singular values specified by D, COND, MODE, and * DMAX; they will not be negative. * Not modified. * * D - REAL array, dimension ( MIN( M, N ) ) * This array is used to specify the singular values or * eigenvalues of A (see SYM, above.) If MODE=0, then D is * assumed to contain the singular/eigenvalues, otherwise * they will be computed according to MODE, COND, and DMAX, * and placed in D. * Modified if MODE is nonzero. * * MODE - INTEGER * On entry this describes how the singular/eigenvalues are to * be specified: * MODE = 0 means use D as input * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * If SYM='H', and MODE is neither 0, 6, nor -6, then * the elements of D will also be multiplied by a random * sign (i.e., +1 or -1.) * Not modified. * * COND - REAL * On entry, this is used as described under MODE above. * If used, it must be >= 1. Not modified. * * DMAX - REAL * If MODE is neither -6, 0 nor 6, the contents of D, as * computed according to MODE and COND, will be scaled by * DMAX / max(abs(D(i))); thus, the maximum absolute eigen- or * singular value (which is to say the norm) will be abs(DMAX). * Note that DMAX need not be positive: if DMAX is negative * (or zero), D will be scaled by a negative number (or zero). * Not modified. * * KL - INTEGER * This specifies the lower bandwidth of the matrix. For * example, KL=0 implies upper triangular, KL=1 implies upper * Hessenberg, and KL being at least M-1 means that the matrix * has full lower bandwidth. KL must equal KU if the matrix * is symmetric or hermitian. * Not modified. * * KU - INTEGER * This specifies the upper bandwidth of the matrix. For * example, KU=0 implies lower triangular, KU=1 implies lower * Hessenberg, and KU being at least N-1 means that the matrix * has full upper bandwidth. KL must equal KU if the matrix * is symmetric or hermitian. * Not modified. * * PACK - CHARACTER*1 * This specifies packing of matrix as follows: * 'N' => no packing * 'U' => zero out all subdiagonal entries (if symmetric * or hermitian) * 'L' => zero out all superdiagonal entries (if symmetric * or hermitian) * 'C' => store the upper triangle columnwise (only if the * matrix is symmetric, hermitian, or upper triangular) * 'R' => store the lower triangle columnwise (only if the * matrix is symmetric, hermitian, or lower triangular) * 'B' => store the lower triangle in band storage scheme * (only if the matrix is symmetric, hermitian, or * lower triangular) * 'Q' => store the upper triangle in band storage scheme * (only if the matrix is symmetric, hermitian, or * upper triangular) * 'Z' => store the entire matrix in band storage scheme * (pivoting can be provided for by using this * option to store A in the trailing rows of * the allocated storage) * * Using these options, the various LAPACK packed and banded * storage schemes can be obtained: * GB - use 'Z' * PB, SB, HB, or TB - use 'B' or 'Q' * PP, SP, HB, or TP - use 'C' or 'R' * * If two calls to CLATMS differ only in the PACK parameter, * they will generate mathematically equivalent matrices. * Not modified. * * A - COMPLEX array, dimension ( LDA, N ) * On exit A is the desired test matrix. A is first generated * in full (unpacked) form, and then packed, if so specified * by PACK. Thus, the first M elements of the first N * columns will always be modified. If PACK specifies a * packed or banded storage scheme, all LDA elements of the * first N columns will be modified; the elements of the * array which do not correspond to elements of the generated * matrix are set to zero. * Modified. * * LDA - INTEGER * LDA specifies the first dimension of A as declared in the * calling program. If PACK='N', 'U', 'L', 'C', or 'R', then * LDA must be at least M. If PACK='B' or 'Q', then LDA must * be at least MIN( KL, M-1) (which is equal to MIN(KU,N-1)). * If PACK='Z', LDA must be large enough to hold the packed * array: MIN( KU, N-1) + MIN( KL, M-1) + 1. * Not modified. * * WORK - COMPLEX array, dimension ( 3*MAX( N, M ) ) * Workspace. * Modified. * * INFO - INTEGER * Error code. On exit, INFO will be set to one of the * following values: * 0 => normal return * -1 => M negative or unequal to N and SYM='S', 'H', or 'P' * -2 => N negative * -3 => DIST illegal string * -5 => SYM illegal string * -7 => MODE not in range -6 to 6 * -8 => COND less than 1.0, and MODE neither -6, 0 nor 6 * -10 => KL negative * -11 => KU negative, or SYM is not 'N' and KU is not equal to * KL * -12 => PACK illegal string, or PACK='U' or 'L', and SYM='N'; * or PACK='C' or 'Q' and SYM='N' and KL is not zero; * or PACK='R' or 'B' and SYM='N' and KU is not zero; * or PACK='U', 'L', 'C', 'R', 'B', or 'Q', and M is not * N. * -14 => LDA is less than M, or PACK='Z' and LDA is less than * MIN(KU,N-1) + MIN(KL,M-1) + 1. * 1 => Error return from SLATM1 * 2 => Cannot scale to DMAX (max. sing. value is 0) * 3 => Error return from CLAGGE, CLAGHE or CLAGSY * * ===================================================================== * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) COMPLEX CZERO PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) REAL TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663E+0 ) * .. * .. Local Scalars .. LOGICAL CSYM, GIVENS, ILEXTR, ILTEMP, TOPDWN INTEGER I, IC, ICOL, IDIST, IENDCH, IINFO, IL, ILDA, $ IOFFG, IOFFST, IPACK, IPACKG, IR, IR1, IR2, $ IROW, IRSIGN, ISKEW, ISYM, ISYMPK, J, JC, JCH, $ JKL, JKU, JR, K, LLB, MINLDA, MNMIN, MR, NC, $ UUB REAL ALPHA, ANGLE, REALC, TEMP COMPLEX C, CT, CTEMP, DUMMY, EXTRA, S, ST * .. * .. External Functions .. LOGICAL LSAME REAL SLARND COMPLEX CLARND EXTERNAL LSAME, SLARND, CLARND * .. * .. External Subroutines .. EXTERNAL CLAGGE, CLAGHE, CLAGSY, CLAROT, CLARTG, CLASET, $ SLATM1, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, CONJG, COS, MAX, MIN, MOD, REAL, $ SIN * .. * .. Executable Statements .. * * 1) Decode and Test the input parameters. * Initialize flags & seed. * INFO = 0 * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Decode DIST * IF( LSAME( DIST, 'U' ) ) THEN IDIST = 1 ELSE IF( LSAME( DIST, 'S' ) ) THEN IDIST = 2 ELSE IF( LSAME( DIST, 'N' ) ) THEN IDIST = 3 ELSE IDIST = -1 END IF * * Decode SYM * IF( LSAME( SYM, 'N' ) ) THEN ISYM = 1 IRSIGN = 0 CSYM = .FALSE. ELSE IF( LSAME( SYM, 'P' ) ) THEN ISYM = 2 IRSIGN = 0 CSYM = .FALSE. ELSE IF( LSAME( SYM, 'S' ) ) THEN ISYM = 2 IRSIGN = 0 CSYM = .TRUE. ELSE IF( LSAME( SYM, 'H' ) ) THEN ISYM = 2 IRSIGN = 1 CSYM = .FALSE. ELSE ISYM = -1 END IF * * Decode PACK * ISYMPK = 0 IF( LSAME( PACK, 'N' ) ) THEN IPACK = 0 ELSE IF( LSAME( PACK, 'U' ) ) THEN IPACK = 1 ISYMPK = 1 ELSE IF( LSAME( PACK, 'L' ) ) THEN IPACK = 2 ISYMPK = 1 ELSE IF( LSAME( PACK, 'C' ) ) THEN IPACK = 3 ISYMPK = 2 ELSE IF( LSAME( PACK, 'R' ) ) THEN IPACK = 4 ISYMPK = 3 ELSE IF( LSAME( PACK, 'B' ) ) THEN IPACK = 5 ISYMPK = 3 ELSE IF( LSAME( PACK, 'Q' ) ) THEN IPACK = 6 ISYMPK = 2 ELSE IF( LSAME( PACK, 'Z' ) ) THEN IPACK = 7 ELSE IPACK = -1 END IF * * Set certain internal parameters * MNMIN = MIN( M, N ) LLB = MIN( KL, M-1 ) UUB = MIN( KU, N-1 ) MR = MIN( M, N+LLB ) NC = MIN( N, M+UUB ) IROW = 1 ICOL = 1 CSYM = .FALSE. * IF( IPACK.EQ.5 .OR. IPACK.EQ.6 ) THEN MINLDA = UUB + 1 ELSE IF( IPACK.EQ.7 ) THEN MINLDA = LLB + UUB + 1 ELSE MINLDA = M END IF * * Use Givens rotation method if bandwidth small enough, * or if LDA is too small to store the matrix unpacked. * GIVENS = .FALSE. IF( ISYM.EQ.1 ) THEN IF( REAL( LLB+UUB ).LT.0.3*REAL( MAX( 1, MR+NC ) ) ) $ GIVENS = .TRUE. ELSE IF( 2*LLB.LT.M ) $ GIVENS = .TRUE. END IF IF( LDA.LT.M .AND. LDA.GE.MINLDA ) $ GIVENS = .TRUE. * * Set INFO if an error * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( M.NE.N .AND. ISYM.NE.1 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( IDIST.EQ.-1 ) THEN INFO = -3 ELSE IF( ISYM.EQ.-1 ) THEN INFO = -5 ELSE IF( ABS( MODE ).GT.6 ) THEN INFO = -7 ELSE IF( ( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) .AND. COND.LT.ONE ) $ THEN INFO = -8 ELSE IF( KL.LT.0 ) THEN INFO = -10 ELSE IF( KU.LT.0 .OR. ( ISYM.NE.1 .AND. KL.NE.KU ) ) THEN INFO = -11 ELSE IF( IPACK.EQ.-1 .OR. ( ISYMPK.EQ.1 .AND. ISYM.EQ.1 ) .OR. $ ( ISYMPK.EQ.2 .AND. ISYM.EQ.1 .AND. KL.GT.0 ) .OR. $ ( ISYMPK.EQ.3 .AND. ISYM.EQ.1 .AND. KU.GT.0 ) .OR. $ ( ISYMPK.NE.0 .AND. M.NE.N ) ) THEN INFO = -12 ELSE IF( LDA.LT.MAX( 1, MINLDA ) ) THEN INFO = -14 END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLATMS', -INFO ) RETURN END IF * * Initialize random number generator * DO 10 I = 1, 4 ISEED( I ) = MOD( ABS( ISEED( I ) ), 4096 ) 10 CONTINUE * IF( MOD( ISEED( 4 ), 2 ).NE.1 ) $ ISEED( 4 ) = ISEED( 4 ) + 1 * * 2) Set up D if indicated. * * Compute D according to COND and MODE * CALL SLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, MNMIN, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 1 RETURN END IF * * Choose Top-Down if D is (apparently) increasing, * Bottom-Up if D is (apparently) decreasing. * IF( ABS( D( 1 ) ).LE.ABS( D( MNMIN ) ) ) THEN TOPDWN = .TRUE. ELSE TOPDWN = .FALSE. END IF * IF( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) THEN * * Scale by DMAX * TEMP = ABS( D( 1 ) ) DO 20 I = 2, MNMIN TEMP = MAX( TEMP, ABS( D( I ) ) ) 20 CONTINUE * IF( TEMP.GT.ZERO ) THEN ALPHA = DMAX / TEMP ELSE INFO = 2 RETURN END IF * CALL SSCAL( MNMIN, ALPHA, D, 1 ) * END IF * CALL CLASET( 'Full', LDA, N, CZERO, CZERO, A, LDA ) * * 3) Generate Banded Matrix using Givens rotations. * Also the special case of UUB=LLB=0 * * Compute Addressing constants to cover all * storage formats. Whether GE, HE, SY, GB, HB, or SB, * upper or lower triangle or both, * the (i,j)-th element is in * A( i - ISKEW*j + IOFFST, j ) * IF( IPACK.GT.4 ) THEN ILDA = LDA - 1 ISKEW = 1 IF( IPACK.GT.5 ) THEN IOFFST = UUB + 1 ELSE IOFFST = 1 END IF ELSE ILDA = LDA ISKEW = 0 IOFFST = 0 END IF * * IPACKG is the format that the matrix is generated in. If this is * different from IPACK, then the matrix must be repacked at the * end. It also signals how to compute the norm, for scaling. * IPACKG = 0 * * Diagonal Matrix -- We are done, unless it * is to be stored HP/SP/PP/TP (PACK='R' or 'C') * IF( LLB.EQ.0 .AND. UUB.EQ.0 ) THEN DO 30 J = 1, MNMIN A( ( 1-ISKEW )*J+IOFFST, J ) = CMPLX( D( J ) ) 30 CONTINUE * IF( IPACK.LE.2 .OR. IPACK.GE.5 ) $ IPACKG = IPACK * ELSE IF( GIVENS ) THEN * * Check whether to use Givens rotations, * Householder transformations, or nothing. * IF( ISYM.EQ.1 ) THEN * * Non-symmetric -- A = U D V * IF( IPACK.GT.4 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF * DO 40 J = 1, MNMIN A( ( 1-ISKEW )*J+IOFFST, J ) = CMPLX( D( J ) ) 40 CONTINUE * IF( TOPDWN ) THEN JKL = 0 DO 70 JKU = 1, UUB * * Transform from bandwidth JKL, JKU-1 to JKL, JKU * * Last row actually rotated is M * Last column actually rotated is MIN( M+JKU, N ) * DO 60 JR = 1, MIN( M+JKU, N ) + JKL - 1 EXTRA = CZERO ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE )*CLARND( 5, ISEED ) S = SIN( ANGLE )*CLARND( 5, ISEED ) ICOL = MAX( 1, JR-JKL ) IF( JR.LT.M ) THEN IL = MIN( N, JR+JKU ) + 1 - ICOL CALL CLAROT( .TRUE., JR.GT.JKL, .FALSE., IL, C, $ S, A( JR-ISKEW*ICOL+IOFFST, ICOL ), $ ILDA, EXTRA, DUMMY ) END IF * * Chase "EXTRA" back up * IR = JR IC = ICOL DO 50 JCH = JR - JKL, 1, -JKL - JKU IF( IR.LT.M ) THEN CALL CLARTG( A( IR+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), EXTRA, REALC, S, DUMMY ) DUMMY = CLARND( 5, ISEED ) C = CONJG( REALC*DUMMY ) S = CONJG( -S*DUMMY ) END IF IROW = MAX( 1, JCH-JKU ) IL = IR + 2 - IROW CTEMP = CZERO ILTEMP = JCH.GT.JKU CALL CLAROT( .FALSE., ILTEMP, .TRUE., IL, C, S, $ A( IROW-ISKEW*IC+IOFFST, IC ), $ ILDA, CTEMP, EXTRA ) IF( ILTEMP ) THEN CALL CLARTG( A( IROW+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), CTEMP, REALC, S, DUMMY ) DUMMY = CLARND( 5, ISEED ) C = CONJG( REALC*DUMMY ) S = CONJG( -S*DUMMY ) * ICOL = MAX( 1, JCH-JKU-JKL ) IL = IC + 2 - ICOL EXTRA = CZERO CALL CLAROT( .TRUE., JCH.GT.JKU+JKL, .TRUE., $ IL, C, S, A( IROW-ISKEW*ICOL+ $ IOFFST, ICOL ), ILDA, EXTRA, $ CTEMP ) IC = ICOL IR = IROW END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE * JKU = UUB DO 100 JKL = 1, LLB * * Transform from bandwidth JKL-1, JKU to JKL, JKU * DO 90 JC = 1, MIN( N+JKL, M ) + JKU - 1 EXTRA = CZERO ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE )*CLARND( 5, ISEED ) S = SIN( ANGLE )*CLARND( 5, ISEED ) IROW = MAX( 1, JC-JKU ) IF( JC.LT.N ) THEN IL = MIN( M, JC+JKL ) + 1 - IROW CALL CLAROT( .FALSE., JC.GT.JKU, .FALSE., IL, C, $ S, A( IROW-ISKEW*JC+IOFFST, JC ), $ ILDA, EXTRA, DUMMY ) END IF * * Chase "EXTRA" back up * IC = JC IR = IROW DO 80 JCH = JC - JKU, 1, -JKL - JKU IF( IC.LT.N ) THEN CALL CLARTG( A( IR+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), EXTRA, REALC, S, DUMMY ) DUMMY = CLARND( 5, ISEED ) C = CONJG( REALC*DUMMY ) S = CONJG( -S*DUMMY ) END IF ICOL = MAX( 1, JCH-JKL ) IL = IC + 2 - ICOL CTEMP = CZERO ILTEMP = JCH.GT.JKL CALL CLAROT( .TRUE., ILTEMP, .TRUE., IL, C, S, $ A( IR-ISKEW*ICOL+IOFFST, ICOL ), $ ILDA, CTEMP, EXTRA ) IF( ILTEMP ) THEN CALL CLARTG( A( IR+1-ISKEW*( ICOL+1 )+IOFFST, $ ICOL+1 ), CTEMP, REALC, S, $ DUMMY ) DUMMY = CLARND( 5, ISEED ) C = CONJG( REALC*DUMMY ) S = CONJG( -S*DUMMY ) IROW = MAX( 1, JCH-JKL-JKU ) IL = IR + 2 - IROW EXTRA = CZERO CALL CLAROT( .FALSE., JCH.GT.JKL+JKU, .TRUE., $ IL, C, S, A( IROW-ISKEW*ICOL+ $ IOFFST, ICOL ), ILDA, EXTRA, $ CTEMP ) IC = ICOL IR = IROW END IF 80 CONTINUE 90 CONTINUE 100 CONTINUE * ELSE * * Bottom-Up -- Start at the bottom right. * JKL = 0 DO 130 JKU = 1, UUB * * Transform from bandwidth JKL, JKU-1 to JKL, JKU * * First row actually rotated is M * First column actually rotated is MIN( M+JKU, N ) * IENDCH = MIN( M, N+JKL ) - 1 DO 120 JC = MIN( M+JKU, N ) - 1, 1 - JKL, -1 EXTRA = CZERO ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE )*CLARND( 5, ISEED ) S = SIN( ANGLE )*CLARND( 5, ISEED ) IROW = MAX( 1, JC-JKU+1 ) IF( JC.GT.0 ) THEN IL = MIN( M, JC+JKL+1 ) + 1 - IROW CALL CLAROT( .FALSE., .FALSE., JC+JKL.LT.M, IL, $ C, S, A( IROW-ISKEW*JC+IOFFST, $ JC ), ILDA, DUMMY, EXTRA ) END IF * * Chase "EXTRA" back down * IC = JC DO 110 JCH = JC + JKL, IENDCH, JKL + JKU ILEXTR = IC.GT.0 IF( ILEXTR ) THEN CALL CLARTG( A( JCH-ISKEW*IC+IOFFST, IC ), $ EXTRA, REALC, S, DUMMY ) DUMMY = CLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY END IF IC = MAX( 1, IC ) ICOL = MIN( N-1, JCH+JKU ) ILTEMP = JCH + JKU.LT.N CTEMP = CZERO CALL CLAROT( .TRUE., ILEXTR, ILTEMP, ICOL+2-IC, $ C, S, A( JCH-ISKEW*IC+IOFFST, IC ), $ ILDA, EXTRA, CTEMP ) IF( ILTEMP ) THEN CALL CLARTG( A( JCH-ISKEW*ICOL+IOFFST, $ ICOL ), CTEMP, REALC, S, DUMMY ) DUMMY = CLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY IL = MIN( IENDCH, JCH+JKL+JKU ) + 2 - JCH EXTRA = CZERO CALL CLAROT( .FALSE., .TRUE., $ JCH+JKL+JKU.LE.IENDCH, IL, C, S, $ A( JCH-ISKEW*ICOL+IOFFST, $ ICOL ), ILDA, CTEMP, EXTRA ) IC = ICOL END IF 110 CONTINUE 120 CONTINUE 130 CONTINUE * JKU = UUB DO 160 JKL = 1, LLB * * Transform from bandwidth JKL-1, JKU to JKL, JKU * * First row actually rotated is MIN( N+JKL, M ) * First column actually rotated is N * IENDCH = MIN( N, M+JKU ) - 1 DO 150 JR = MIN( N+JKL, M ) - 1, 1 - JKU, -1 EXTRA = CZERO ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE )*CLARND( 5, ISEED ) S = SIN( ANGLE )*CLARND( 5, ISEED ) ICOL = MAX( 1, JR-JKL+1 ) IF( JR.GT.0 ) THEN IL = MIN( N, JR+JKU+1 ) + 1 - ICOL CALL CLAROT( .TRUE., .FALSE., JR+JKU.LT.N, IL, $ C, S, A( JR-ISKEW*ICOL+IOFFST, $ ICOL ), ILDA, DUMMY, EXTRA ) END IF * * Chase "EXTRA" back down * IR = JR DO 140 JCH = JR + JKU, IENDCH, JKL + JKU ILEXTR = IR.GT.0 IF( ILEXTR ) THEN CALL CLARTG( A( IR-ISKEW*JCH+IOFFST, JCH ), $ EXTRA, REALC, S, DUMMY ) DUMMY = CLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY END IF IR = MAX( 1, IR ) IROW = MIN( M-1, JCH+JKL ) ILTEMP = JCH + JKL.LT.M CTEMP = CZERO CALL CLAROT( .FALSE., ILEXTR, ILTEMP, IROW+2-IR, $ C, S, A( IR-ISKEW*JCH+IOFFST, $ JCH ), ILDA, EXTRA, CTEMP ) IF( ILTEMP ) THEN CALL CLARTG( A( IROW-ISKEW*JCH+IOFFST, JCH ), $ CTEMP, REALC, S, DUMMY ) DUMMY = CLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY IL = MIN( IENDCH, JCH+JKL+JKU ) + 2 - JCH EXTRA = CZERO CALL CLAROT( .TRUE., .TRUE., $ JCH+JKL+JKU.LE.IENDCH, IL, C, S, $ A( IROW-ISKEW*JCH+IOFFST, JCH ), $ ILDA, CTEMP, EXTRA ) IR = IROW END IF 140 CONTINUE 150 CONTINUE 160 CONTINUE * END IF * ELSE * * Symmetric -- A = U D U' * Hermitian -- A = U D U* * IPACKG = IPACK IOFFG = IOFFST * IF( TOPDWN ) THEN * * Top-Down -- Generate Upper triangle only * IF( IPACK.GE.5 ) THEN IPACKG = 6 IOFFG = UUB + 1 ELSE IPACKG = 1 END IF * DO 170 J = 1, MNMIN A( ( 1-ISKEW )*J+IOFFG, J ) = CMPLX( D( J ) ) 170 CONTINUE * DO 200 K = 1, UUB DO 190 JC = 1, N - 1 IROW = MAX( 1, JC-K ) IL = MIN( JC+1, K+2 ) EXTRA = CZERO CTEMP = A( JC-ISKEW*( JC+1 )+IOFFG, JC+1 ) ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE )*CLARND( 5, ISEED ) S = SIN( ANGLE )*CLARND( 5, ISEED ) IF( CSYM ) THEN CT = C ST = S ELSE CTEMP = CONJG( CTEMP ) CT = CONJG( C ) ST = CONJG( S ) END IF CALL CLAROT( .FALSE., JC.GT.K, .TRUE., IL, C, S, $ A( IROW-ISKEW*JC+IOFFG, JC ), ILDA, $ EXTRA, CTEMP ) CALL CLAROT( .TRUE., .TRUE., .FALSE., $ MIN( K, N-JC )+1, CT, ST, $ A( ( 1-ISKEW )*JC+IOFFG, JC ), ILDA, $ CTEMP, DUMMY ) * * Chase EXTRA back up the matrix * ICOL = JC DO 180 JCH = JC - K, 1, -K CALL CLARTG( A( JCH+1-ISKEW*( ICOL+1 )+IOFFG, $ ICOL+1 ), EXTRA, REALC, S, DUMMY ) DUMMY = CLARND( 5, ISEED ) C = CONJG( REALC*DUMMY ) S = CONJG( -S*DUMMY ) CTEMP = A( JCH-ISKEW*( JCH+1 )+IOFFG, JCH+1 ) IF( CSYM ) THEN CT = C ST = S ELSE CTEMP = CONJG( CTEMP ) CT = CONJG( C ) ST = CONJG( S ) END IF CALL CLAROT( .TRUE., .TRUE., .TRUE., K+2, C, S, $ A( ( 1-ISKEW )*JCH+IOFFG, JCH ), $ ILDA, CTEMP, EXTRA ) IROW = MAX( 1, JCH-K ) IL = MIN( JCH+1, K+2 ) EXTRA = CZERO CALL CLAROT( .FALSE., JCH.GT.K, .TRUE., IL, CT, $ ST, A( IROW-ISKEW*JCH+IOFFG, JCH ), $ ILDA, EXTRA, CTEMP ) ICOL = JCH 180 CONTINUE 190 CONTINUE 200 CONTINUE * * If we need lower triangle, copy from upper. Note that * the order of copying is chosen to work for 'q' -> 'b' * IF( IPACK.NE.IPACKG .AND. IPACK.NE.3 ) THEN DO 230 JC = 1, N IROW = IOFFST - ISKEW*JC IF( CSYM ) THEN DO 210 JR = JC, MIN( N, JC+UUB ) A( JR+IROW, JC ) = A( JC-ISKEW*JR+IOFFG, JR ) 210 CONTINUE ELSE DO 220 JR = JC, MIN( N, JC+UUB ) A( JR+IROW, JC ) = CONJG( A( JC-ISKEW*JR+ $ IOFFG, JR ) ) 220 CONTINUE END IF 230 CONTINUE IF( IPACK.EQ.5 ) THEN DO 250 JC = N - UUB + 1, N DO 240 JR = N + 2 - JC, UUB + 1 A( JR, JC ) = CZERO 240 CONTINUE 250 CONTINUE END IF IF( IPACKG.EQ.6 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF END IF ELSE * * Bottom-Up -- Generate Lower triangle only * IF( IPACK.GE.5 ) THEN IPACKG = 5 IF( IPACK.EQ.6 ) $ IOFFG = 1 ELSE IPACKG = 2 END IF * DO 260 J = 1, MNMIN A( ( 1-ISKEW )*J+IOFFG, J ) = CMPLX( D( J ) ) 260 CONTINUE * DO 290 K = 1, UUB DO 280 JC = N - 1, 1, -1 IL = MIN( N+1-JC, K+2 ) EXTRA = CZERO CTEMP = A( 1+( 1-ISKEW )*JC+IOFFG, JC ) ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE )*CLARND( 5, ISEED ) S = SIN( ANGLE )*CLARND( 5, ISEED ) IF( CSYM ) THEN CT = C ST = S ELSE CTEMP = CONJG( CTEMP ) CT = CONJG( C ) ST = CONJG( S ) END IF CALL CLAROT( .FALSE., .TRUE., N-JC.GT.K, IL, C, S, $ A( ( 1-ISKEW )*JC+IOFFG, JC ), ILDA, $ CTEMP, EXTRA ) ICOL = MAX( 1, JC-K+1 ) CALL CLAROT( .TRUE., .FALSE., .TRUE., JC+2-ICOL, $ CT, ST, A( JC-ISKEW*ICOL+IOFFG, $ ICOL ), ILDA, DUMMY, CTEMP ) * * Chase EXTRA back down the matrix * ICOL = JC DO 270 JCH = JC + K, N - 1, K CALL CLARTG( A( JCH-ISKEW*ICOL+IOFFG, ICOL ), $ EXTRA, REALC, S, DUMMY ) DUMMY = CLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY CTEMP = A( 1+( 1-ISKEW )*JCH+IOFFG, JCH ) IF( CSYM ) THEN CT = C ST = S ELSE CTEMP = CONJG( CTEMP ) CT = CONJG( C ) ST = CONJG( S ) END IF CALL CLAROT( .TRUE., .TRUE., .TRUE., K+2, C, S, $ A( JCH-ISKEW*ICOL+IOFFG, ICOL ), $ ILDA, EXTRA, CTEMP ) IL = MIN( N+1-JCH, K+2 ) EXTRA = CZERO CALL CLAROT( .FALSE., .TRUE., N-JCH.GT.K, IL, $ CT, ST, A( ( 1-ISKEW )*JCH+IOFFG, $ JCH ), ILDA, CTEMP, EXTRA ) ICOL = JCH 270 CONTINUE 280 CONTINUE 290 CONTINUE * * If we need upper triangle, copy from lower. Note that * the order of copying is chosen to work for 'b' -> 'q' * IF( IPACK.NE.IPACKG .AND. IPACK.NE.4 ) THEN DO 320 JC = N, 1, -1 IROW = IOFFST - ISKEW*JC IF( CSYM ) THEN DO 300 JR = JC, MAX( 1, JC-UUB ), -1 A( JR+IROW, JC ) = A( JC-ISKEW*JR+IOFFG, JR ) 300 CONTINUE ELSE DO 310 JR = JC, MAX( 1, JC-UUB ), -1 A( JR+IROW, JC ) = CONJG( A( JC-ISKEW*JR+ $ IOFFG, JR ) ) 310 CONTINUE END IF 320 CONTINUE IF( IPACK.EQ.6 ) THEN DO 340 JC = 1, UUB DO 330 JR = 1, UUB + 1 - JC A( JR, JC ) = CZERO 330 CONTINUE 340 CONTINUE END IF IF( IPACKG.EQ.5 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF END IF END IF * * Ensure that the diagonal is real if Hermitian * IF( .NOT.CSYM ) THEN DO 350 JC = 1, N IROW = IOFFST + ( 1-ISKEW )*JC A( IROW, JC ) = CMPLX( REAL( A( IROW, JC ) ) ) 350 CONTINUE END IF * END IF * ELSE * * 4) Generate Banded Matrix by first * Rotating by random Unitary matrices, * then reducing the bandwidth using Householder * transformations. * * Note: we should get here only if LDA .ge. N * IF( ISYM.EQ.1 ) THEN * * Non-symmetric -- A = U D V * CALL CLAGGE( MR, NC, LLB, UUB, D, A, LDA, ISEED, WORK, $ IINFO ) ELSE * * Symmetric -- A = U D U' or * Hermitian -- A = U D U* * IF( CSYM ) THEN CALL CLAGSY( M, LLB, D, A, LDA, ISEED, WORK, IINFO ) ELSE CALL CLAGHE( M, LLB, D, A, LDA, ISEED, WORK, IINFO ) END IF END IF * IF( IINFO.NE.0 ) THEN INFO = 3 RETURN END IF END IF * * 5) Pack the matrix * IF( IPACK.NE.IPACKG ) THEN IF( IPACK.EQ.1 ) THEN * * 'U' -- Upper triangular, not packed * DO 370 J = 1, M DO 360 I = J + 1, M A( I, J ) = CZERO 360 CONTINUE 370 CONTINUE * ELSE IF( IPACK.EQ.2 ) THEN * * 'L' -- Lower triangular, not packed * DO 390 J = 2, M DO 380 I = 1, J - 1 A( I, J ) = CZERO 380 CONTINUE 390 CONTINUE * ELSE IF( IPACK.EQ.3 ) THEN * * 'C' -- Upper triangle packed Columnwise. * ICOL = 1 IROW = 0 DO 410 J = 1, M DO 400 I = 1, J IROW = IROW + 1 IF( IROW.GT.LDA ) THEN IROW = 1 ICOL = ICOL + 1 END IF A( IROW, ICOL ) = A( I, J ) 400 CONTINUE 410 CONTINUE * ELSE IF( IPACK.EQ.4 ) THEN * * 'R' -- Lower triangle packed Columnwise. * ICOL = 1 IROW = 0 DO 430 J = 1, M DO 420 I = J, M IROW = IROW + 1 IF( IROW.GT.LDA ) THEN IROW = 1 ICOL = ICOL + 1 END IF A( IROW, ICOL ) = A( I, J ) 420 CONTINUE 430 CONTINUE * ELSE IF( IPACK.GE.5 ) THEN * * 'B' -- The lower triangle is packed as a band matrix. * 'Q' -- The upper triangle is packed as a band matrix. * 'Z' -- The whole matrix is packed as a band matrix. * IF( IPACK.EQ.5 ) $ UUB = 0 IF( IPACK.EQ.6 ) $ LLB = 0 * DO 450 J = 1, UUB DO 440 I = MIN( J+LLB, M ), 1, -1 A( I-J+UUB+1, J ) = A( I, J ) 440 CONTINUE 450 CONTINUE * DO 470 J = UUB + 2, N DO 460 I = J - UUB, MIN( J+LLB, M ) A( I-J+UUB+1, J ) = A( I, J ) 460 CONTINUE 470 CONTINUE END IF * * If packed, zero out extraneous elements. * * Symmetric/Triangular Packed -- * zero out everything after A(IROW,ICOL) * IF( IPACK.EQ.3 .OR. IPACK.EQ.4 ) THEN DO 490 JC = ICOL, M DO 480 JR = IROW + 1, LDA A( JR, JC ) = CZERO 480 CONTINUE IROW = 0 490 CONTINUE * ELSE IF( IPACK.GE.5 ) THEN * * Packed Band -- * 1st row is now in A( UUB+2-j, j), zero above it * m-th row is now in A( M+UUB-j,j), zero below it * last non-zero diagonal is now in A( UUB+LLB+1,j ), * zero below it, too. * IR1 = UUB + LLB + 2 IR2 = UUB + M + 2 DO 520 JC = 1, N DO 500 JR = 1, UUB + 1 - JC A( JR, JC ) = CZERO 500 CONTINUE DO 510 JR = MAX( 1, MIN( IR1, IR2-JC ) ), LDA A( JR, JC ) = CZERO 510 CONTINUE 520 CONTINUE END IF END IF * RETURN * * End of CLATMS * END scalapack-2.0.2/TOOLS/LAPACK/CMakeLists.txt000644 000766 000024 00000000743 11656312637 020322 0ustar00juliestaff000000 000000 set (ALLAUX icopy.f) set (SCATGEN slatm1.f slaran.f slarnd.f) set (SMATGEN slatms.f slagge.f slagsy.f slarot.f) set (CMATGEN clarnv.f clatm1.f clatms.f clagge.f claghe.f clagsy.f clarot.f clarnd.f) set (DZATGEN dlatm1.f dlaran.f dlarnd.f) set (DMATGEN dlatms.f dlagge.f dlagsy.f dlarot.f) set (ZMATGEN zlarnv.f zlatm1.f zlatms.f zlagge.f zlaghe.f zlagsy.f zlarot.f zlarnd.f) set (extra_lapack ${ALLAUX} ${SCATGEN} ${SMATGEN} ${CMATGEN} ${DZATGEN} ${DMATGEN} ${ZMATGEN}) scalapack-2.0.2/TOOLS/LAPACK/dlagge.f000644 000766 000024 00000021654 10604316307 017146 0ustar00juliestaff000000 000000 SUBROUTINE DLAGGE( M, N, KL, KU, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDA, M, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION A( LDA, * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * DLAGGE generates a real general m by n matrix A, by pre- and post- * multiplying a real diagonal matrix D with random orthogonal matrices: * A = U*D*V. The lower and upper bandwidths may then be reduced to * kl and ku by additional orthogonal transformations. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= KL <= M-1. * * KU (input) INTEGER * The number of nonzero superdiagonals within the band of A. * 0 <= KU <= N-1. * * D (input) DOUBLE PRECISION array, dimension (min(M,N)) * The diagonal elements of the diagonal matrix D. * * A (output) DOUBLE PRECISION array, dimension (LDA,N) * The generated m by n matrix A. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= M. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) DOUBLE PRECISION array, dimension (M+N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J DOUBLE PRECISION TAU, WA, WB, WN * .. * .. External Subroutines .. EXTERNAL DGEMV, DGER, DLARNV, DSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SIGN * .. * .. External Functions .. DOUBLE PRECISION DNRM2 EXTERNAL DNRM2 * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( KL.LT.0 .OR. KL.GT.M-1 ) THEN INFO = -3 ELSE IF( KU.LT.0 .OR. KU.GT.N-1 ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'DLAGGE', -INFO ) RETURN END IF * * initialize A to diagonal matrix * DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, MIN( M, N ) A( I, I ) = D( I ) 30 CONTINUE * * pre- and post-multiply A by random orthogonal matrices * DO 40 I = MIN( M, N ), 1, -1 IF( I.LT.M ) THEN * * generate random reflection * CALL DLARNV( 3, ISEED, M-I+1, WORK ) WN = DNRM2( M-I+1, WORK, 1 ) WA = SIGN( WN, WORK( 1 ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL DSCAL( M-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = WB / WA END IF * * multiply A(i:m,i:n) by random reflection from the left * CALL DGEMV( 'Transpose', M-I+1, N-I+1, ONE, A( I, I ), LDA, $ WORK, 1, ZERO, WORK( M+1 ), 1 ) CALL DGER( M-I+1, N-I+1, -TAU, WORK, 1, WORK( M+1 ), 1, $ A( I, I ), LDA ) END IF IF( I.LT.N ) THEN * * generate random reflection * CALL DLARNV( 3, ISEED, N-I+1, WORK ) WN = DNRM2( N-I+1, WORK, 1 ) WA = SIGN( WN, WORK( 1 ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL DSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = WB / WA END IF * * multiply A(i:m,i:n) by random reflection from the right * CALL DGEMV( 'No transpose', M-I+1, N-I+1, ONE, A( I, I ), $ LDA, WORK, 1, ZERO, WORK( N+1 ), 1 ) CALL DGER( M-I+1, N-I+1, -TAU, WORK( N+1 ), 1, WORK, 1, $ A( I, I ), LDA ) END IF 40 CONTINUE * * Reduce number of subdiagonals to KL and number of superdiagonals * to KU * DO 70 I = 1, MAX( M-1-KL, N-1-KU ) IF( KL.LE.KU ) THEN * * annihilate subdiagonal elements first (necessary if KL = 0) * IF( I.LE.MIN( M-1-KL, N ) ) THEN * * generate reflection to annihilate A(kl+i+1:m,i) * WN = DNRM2( M-KL-I+1, A( KL+I, I ), 1 ) WA = SIGN( WN, A( KL+I, I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( KL+I, I ) + WA CALL DSCAL( M-KL-I, ONE / WB, A( KL+I+1, I ), 1 ) A( KL+I, I ) = ONE TAU = WB / WA END IF * * apply reflection to A(kl+i:m,i+1:n) from the left * CALL DGEMV( 'Transpose', M-KL-I+1, N-I, ONE, $ A( KL+I, I+1 ), LDA, A( KL+I, I ), 1, ZERO, $ WORK, 1 ) CALL DGER( M-KL-I+1, N-I, -TAU, A( KL+I, I ), 1, WORK, 1, $ A( KL+I, I+1 ), LDA ) A( KL+I, I ) = -WA END IF * IF( I.LE.MIN( N-1-KU, M ) ) THEN * * generate reflection to annihilate A(i,ku+i+1:n) * WN = DNRM2( N-KU-I+1, A( I, KU+I ), LDA ) WA = SIGN( WN, A( I, KU+I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( I, KU+I ) + WA CALL DSCAL( N-KU-I, ONE / WB, A( I, KU+I+1 ), LDA ) A( I, KU+I ) = ONE TAU = WB / WA END IF * * apply reflection to A(i+1:m,ku+i:n) from the right * CALL DGEMV( 'No transpose', M-I, N-KU-I+1, ONE, $ A( I+1, KU+I ), LDA, A( I, KU+I ), LDA, ZERO, $ WORK, 1 ) CALL DGER( M-I, N-KU-I+1, -TAU, WORK, 1, A( I, KU+I ), $ LDA, A( I+1, KU+I ), LDA ) A( I, KU+I ) = -WA END IF ELSE * * annihilate superdiagonal elements first (necessary if * KU = 0) * IF( I.LE.MIN( N-1-KU, M ) ) THEN * * generate reflection to annihilate A(i,ku+i+1:n) * WN = DNRM2( N-KU-I+1, A( I, KU+I ), LDA ) WA = SIGN( WN, A( I, KU+I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( I, KU+I ) + WA CALL DSCAL( N-KU-I, ONE / WB, A( I, KU+I+1 ), LDA ) A( I, KU+I ) = ONE TAU = WB / WA END IF * * apply reflection to A(i+1:m,ku+i:n) from the right * CALL DGEMV( 'No transpose', M-I, N-KU-I+1, ONE, $ A( I+1, KU+I ), LDA, A( I, KU+I ), LDA, ZERO, $ WORK, 1 ) CALL DGER( M-I, N-KU-I+1, -TAU, WORK, 1, A( I, KU+I ), $ LDA, A( I+1, KU+I ), LDA ) A( I, KU+I ) = -WA END IF * IF( I.LE.MIN( M-1-KL, N ) ) THEN * * generate reflection to annihilate A(kl+i+1:m,i) * WN = DNRM2( M-KL-I+1, A( KL+I, I ), 1 ) WA = SIGN( WN, A( KL+I, I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( KL+I, I ) + WA CALL DSCAL( M-KL-I, ONE / WB, A( KL+I+1, I ), 1 ) A( KL+I, I ) = ONE TAU = WB / WA END IF * * apply reflection to A(kl+i:m,i+1:n) from the left * CALL DGEMV( 'Transpose', M-KL-I+1, N-I, ONE, $ A( KL+I, I+1 ), LDA, A( KL+I, I ), 1, ZERO, $ WORK, 1 ) CALL DGER( M-KL-I+1, N-I, -TAU, A( KL+I, I ), 1, WORK, 1, $ A( KL+I, I+1 ), LDA ) A( KL+I, I ) = -WA END IF END IF * DO 50 J = KL + I + 1, M A( J, I ) = ZERO 50 CONTINUE * DO 60 J = KU + I + 1, N A( I, J ) = ZERO 60 CONTINUE 70 CONTINUE RETURN * * End of DLAGGE * END scalapack-2.0.2/TOOLS/LAPACK/dlagsy.f000644 000766 000024 00000013142 10604316307 017177 0ustar00juliestaff000000 000000 SUBROUTINE DLAGSY( N, K, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, K, LDA, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION A( LDA, * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * DLAGSY generates a real symmetric matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random orthogonal matrix: * A = U*D*U'. The semi-bandwidth may then be reduced to k by additional * orthogonal transformations. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * K (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * * D (input) DOUBLE PRECISION array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (output) DOUBLE PRECISION array, dimension (LDA,N) * The generated n by n symmetric matrix A (the full matrix is * stored). * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= N. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) DOUBLE PRECISION array, dimension (2*N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, HALF PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, HALF = 0.5D+0 ) * .. * .. Local Scalars .. INTEGER I, J DOUBLE PRECISION ALPHA, TAU, WA, WB, WN * .. * .. External Subroutines .. EXTERNAL DAXPY, DGEMV, DGER, DLARNV, DSCAL, DSYMV, $ DSYR2, XERBLA * .. * .. External Functions .. DOUBLE PRECISION DDOT, DNRM2 EXTERNAL DDOT, DNRM2 * .. * .. Intrinsic Functions .. INTRINSIC MAX, SIGN * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'DLAGSY', -INFO ) RETURN END IF * * initialize lower triangle of A to diagonal matrix * DO 20 J = 1, N DO 10 I = J + 1, N A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, N A( I, I ) = D( I ) 30 CONTINUE * * Generate lower triangle of symmetric matrix * DO 40 I = N - 1, 1, -1 * * generate random reflection * CALL DLARNV( 3, ISEED, N-I+1, WORK ) WN = DNRM2( N-I+1, WORK, 1 ) WA = SIGN( WN, WORK( 1 ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL DSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = WB / WA END IF * * apply random reflection to A(i:n,i:n) from the left * and the right * * compute y := tau * A * u * CALL DSYMV( 'Lower', N-I+1, TAU, A( I, I ), LDA, WORK, 1, ZERO, $ WORK( N+1 ), 1 ) * * compute v := y - 1/2 * tau * ( y, u ) * u * ALPHA = -HALF*TAU*DDOT( N-I+1, WORK( N+1 ), 1, WORK, 1 ) CALL DAXPY( N-I+1, ALPHA, WORK, 1, WORK( N+1 ), 1 ) * * apply the transformation as a rank-2 update to A(i:n,i:n) * CALL DSYR2( 'Lower', N-I+1, -ONE, WORK, 1, WORK( N+1 ), 1, $ A( I, I ), LDA ) 40 CONTINUE * * Reduce number of subdiagonals to K * DO 60 I = 1, N - 1 - K * * generate reflection to annihilate A(k+i+1:n,i) * WN = DNRM2( N-K-I+1, A( K+I, I ), 1 ) WA = SIGN( WN, A( K+I, I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( K+I, I ) + WA CALL DSCAL( N-K-I, ONE / WB, A( K+I+1, I ), 1 ) A( K+I, I ) = ONE TAU = WB / WA END IF * * apply reflection to A(k+i:n,i+1:k+i-1) from the left * CALL DGEMV( 'Transpose', N-K-I+1, K-1, ONE, A( K+I, I+1 ), LDA, $ A( K+I, I ), 1, ZERO, WORK, 1 ) CALL DGER( N-K-I+1, K-1, -TAU, A( K+I, I ), 1, WORK, 1, $ A( K+I, I+1 ), LDA ) * * apply reflection to A(k+i:n,k+i:n) from the left and the right * * compute y := tau * A * u * CALL DSYMV( 'Lower', N-K-I+1, TAU, A( K+I, K+I ), LDA, $ A( K+I, I ), 1, ZERO, WORK, 1 ) * * compute v := y - 1/2 * tau * ( y, u ) * u * ALPHA = -HALF*TAU*DDOT( N-K-I+1, WORK, 1, A( K+I, I ), 1 ) CALL DAXPY( N-K-I+1, ALPHA, A( K+I, I ), 1, WORK, 1 ) * * apply symmetric rank-2 update to A(k+i:n,k+i:n) * CALL DSYR2( 'Lower', N-K-I+1, -ONE, A( K+I, I ), 1, WORK, 1, $ A( K+I, K+I ), LDA ) * A( K+I, I ) = -WA DO 50 J = K + I + 1, N A( J, I ) = ZERO 50 CONTINUE 60 CONTINUE * * Store full symmetric matrix * DO 80 J = 1, N DO 70 I = J + 1, N A( J, I ) = A( I, J ) 70 CONTINUE 80 CONTINUE RETURN * * End of DLAGSY * END scalapack-2.0.2/TOOLS/LAPACK/dlaran.f000644 000766 000024 00000006274 10604316307 017165 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION DLARAN( ISEED ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Array Arguments .. INTEGER ISEED( 4 ) * .. * * Purpose * ======= * * DLARAN returns a random real number from a uniform (0,1) * distribution. * * Arguments * ========= * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * Further Details * =============== * * This routine uses a multiplicative congruential method with modulus * 2**48 and multiplier 33952834046453 (see G.S.Fishman, * 'Multiplicative congruential random number generators with modulus * 2**b: an exhaustive analysis for b = 32 and a partial analysis for * b = 48', Math. Comp. 189, pp 331-344, 1990). * * 48-bit integers are stored in 4 integer array elements with 12 bits * per element. Hence the routine is portable across machines with * integers of 32 bits or more. * * ===================================================================== * * .. Parameters .. INTEGER M1, M2, M3, M4 PARAMETER ( M1 = 494, M2 = 322, M3 = 2508, M4 = 2549 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) INTEGER IPW2 DOUBLE PRECISION R PARAMETER ( IPW2 = 4096, R = ONE / IPW2 ) * .. * .. Local Scalars .. INTEGER IT1, IT2, IT3, IT4 DOUBLE PRECISION RNDOUT * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MOD * .. * .. Executable Statements .. 10 CONTINUE * * multiply the seed by the multiplier modulo 2**48 * IT4 = ISEED( 4 )*M4 IT3 = IT4 / IPW2 IT4 = IT4 - IPW2*IT3 IT3 = IT3 + ISEED( 3 )*M4 + ISEED( 4 )*M3 IT2 = IT3 / IPW2 IT3 = IT3 - IPW2*IT2 IT2 = IT2 + ISEED( 2 )*M4 + ISEED( 3 )*M3 + ISEED( 4 )*M2 IT1 = IT2 / IPW2 IT2 = IT2 - IPW2*IT1 IT1 = IT1 + ISEED( 1 )*M4 + ISEED( 2 )*M3 + ISEED( 3 )*M2 + $ ISEED( 4 )*M1 IT1 = MOD( IT1, IPW2 ) * * return updated seed * ISEED( 1 ) = IT1 ISEED( 2 ) = IT2 ISEED( 3 ) = IT3 ISEED( 4 ) = IT4 * * convert 48-bit integer to a real number in the interval (0,1) * RNDOUT = R*( DBLE( IT1 )+R*( DBLE( IT2 )+R*( DBLE( IT3 )+R* $ ( DBLE( IT4 ) ) ) ) ) * IF (RNDOUT.EQ.1.0D+0) THEN * If a real number has n bits of precision, and the first * n bits of the 48-bit integer above happen to be all 1 (which * will occur about once every 2**n calls), then DLARAN will * be rounded to exactly 1.0. * Since DLARAN is not supposed to return exactly 0.0 or 1.0 * (and some callers of DLARAN, such as CLARND, depend on that), * the statistically correct thing to do in this situation is * simply to iterate again. * N.B. the case DLARAN = 0.0 should not be possible. * GOTO 10 END IF * DLARAN = RNDOUT RETURN * * End of DLARAN * END scalapack-2.0.2/TOOLS/LAPACK/dlarnd.f000644 000766 000024 00000004333 11654631032 017163 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION DLARND( IDIST, ISEED ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER IDIST * .. * .. Array Arguments .. INTEGER ISEED( 4 ) * .. * * Purpose * ======= * * DLARND returns a random real number from a uniform or normal * distribution. * * Arguments * ========= * * IDIST (input) INTEGER * Specifies the distribution of the random numbers: * = 1: uniform (0,1) * = 2: uniform (-1,1) * = 3: normal (0,1) * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * Further Details * =============== * * This routine calls the auxiliary routine DLARAN to generate a random * real number from a uniform (0,1) distribution. The Box-Muller method * is used to transform numbers from a uniform to a normal distribution. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0 ) DOUBLE PRECISION TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION T1, T2 * .. * .. External Functions .. DOUBLE PRECISION DLARAN EXTERNAL DLARAN * .. * .. Intrinsic Functions .. INTRINSIC COS, LOG, SQRT * .. * .. Executable Statements .. * * Generate a real random number from a uniform (0,1) distribution * T1 = DLARAN( ISEED ) * IF( IDIST.EQ.1 ) THEN * * uniform (0,1) * DLARND = T1 ELSE IF( IDIST.EQ.2 ) THEN * * uniform (-1,1) * DLARND = TWO*T1 - ONE ELSE IF( IDIST.EQ.3 ) THEN * * normal (0,1) * T2 = DLARAN( ISEED ) DLARND = SQRT( -TWO*LOG( T1 ) )*COS( TWOPI*T2 ) ELSE DLARND = ONE END IF RETURN * * End of DLARND * END scalapack-2.0.2/TOOLS/LAPACK/dlarot.f000644 000766 000024 00000022400 11654631032 017177 0ustar00juliestaff000000 000000 SUBROUTINE DLAROT( LROWS, LLEFT, LRIGHT, NL, C, S, A, LDA, XLEFT, $ XRIGHT ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. LOGICAL LLEFT, LRIGHT, LROWS INTEGER LDA, NL DOUBLE PRECISION C, S, XLEFT, XRIGHT * .. * .. Array Arguments .. DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * DLAROT applies a (Givens) rotation to two adjacent rows or * columns, where one element of the first and/or last column/row * November 2006 * for use on matrices stored in some format other than GE, so * that elements of the matrix may be used or modified for which * no array element is provided. * * One example is a symmetric matrix in SB format (bandwidth=4), for * which UPLO='L': Two adjacent rows will have the format: * * row j: * * * * * . . . . * row j+1: * * * * * . . . . * * '*' indicates elements for which storage is provided, * '.' indicates elements for which no storage is provided, but * are not necessarily zero; their values are determined by * symmetry. ' ' indicates elements which are necessarily zero, * and have no storage provided. * * Those columns which have two '*'s can be handled by DROT. * Those columns which have no '*'s can be ignored, since as long * as the Givens rotations are carefully applied to preserve * symmetry, their values are determined. * Those columns which have one '*' have to be handled separately, * by using separate variables "p" and "q": * * row j: * * * * * p . . . * row j+1: q * * * * * . . . . * * The element p would have to be set correctly, then that column * is rotated, setting p to its new value. The next call to * DLAROT would rotate columns j and j+1, using p, and restore * symmetry. The element q would start out being zero, and be * made non-zero by the rotation. Later, rotations would presumably * be chosen to zero q out. * * Typical Calling Sequences: rotating the i-th and (i+1)-st rows. * ------- ------- --------- * * General dense matrix: * * CALL DLAROT(.TRUE.,.FALSE.,.FALSE., N, C,S, * A(i,1),LDA, DUMMY, DUMMY) * * General banded matrix in GB format: * * j = MAX(1, i-KL ) * NL = MIN( N, i+KU+1 ) + 1-j * CALL DLAROT( .TRUE., i-KL.GE.1, i+KU.LT.N, NL, C,S, * A(KU+i+1-j,j),LDA-1, XLEFT, XRIGHT ) * * [ note that i+1-j is just MIN(i,KL+1) ] * * Symmetric banded matrix in SY format, bandwidth K, * lower triangle only: * * j = MAX(1, i-K ) * NL = MIN( K+1, i ) + 1 * CALL DLAROT( .TRUE., i-K.GE.1, .TRUE., NL, C,S, * A(i,j), LDA, XLEFT, XRIGHT ) * * Same, but upper triangle only: * * NL = MIN( K+1, N-i ) + 1 * CALL DLAROT( .TRUE., .TRUE., i+K.LT.N, NL, C,S, * A(i,i), LDA, XLEFT, XRIGHT ) * * Symmetric banded matrix in SB format, bandwidth K, * lower triangle only: * * [ same as for SY, except:] * . . . . * A(i+1-j,j), LDA-1, XLEFT, XRIGHT ) * * [ note that i+1-j is just MIN(i,K+1) ] * * Same, but upper triangle only: * . . . * A(K+1,i), LDA-1, XLEFT, XRIGHT ) * * Rotating columns is just the transpose of rotating rows, except * for GB and SB: (rotating columns i and i+1) * * GB: * j = MAX(1, i-KU ) * NL = MIN( N, i+KL+1 ) + 1-j * CALL DLAROT( .TRUE., i-KU.GE.1, i+KL.LT.N, NL, C,S, * A(KU+j+1-i,i),LDA-1, XTOP, XBOTTM ) * * [note that KU+j+1-i is just MAX(1,KU+2-i)] * * SB: (upper triangle) * * . . . . . . * A(K+j+1-i,i),LDA-1, XTOP, XBOTTM ) * * SB: (lower triangle) * * . . . . . . * A(1,i),LDA-1, XTOP, XBOTTM ) * * Arguments * ========= * * LROWS - LOGICAL * If .TRUE., then DLAROT will rotate two rows. If .FALSE., * then it will rotate two columns. * Not modified. * * LLEFT - LOGICAL * If .TRUE., then XLEFT will be used instead of the * corresponding element of A for the first element in the * second row (if LROWS=.FALSE.) or column (if LROWS=.TRUE.) * If .FALSE., then the corresponding element of A will be * used. * Not modified. * * LRIGHT - LOGICAL * If .TRUE., then XRIGHT will be used instead of the * corresponding element of A for the last element in the * first row (if LROWS=.FALSE.) or column (if LROWS=.TRUE.) If * .FALSE., then the corresponding element of A will be used. * Not modified. * * NL - INTEGER * The length of the rows (if LROWS=.TRUE.) or columns (if * LROWS=.FALSE.) to be rotated. If XLEFT and/or XRIGHT are * used, the columns/rows they are in should be included in * NL, e.g., if LLEFT = LRIGHT = .TRUE., then NL must be at * least 2. The number of rows/columns to be rotated * exclusive of those involving XLEFT and/or XRIGHT may * not be negative, i.e., NL minus how many of LLEFT and * LRIGHT are .TRUE. must be at least zero; if not, XERBLA * will be called. * Not modified. * * C, S - DOUBLE PRECISION * Specify the Givens rotation to be applied. If LROWS is * true, then the matrix ( c s ) * (-s c ) is applied from the left; * if false, then the transpose thereof is applied from the * right. For a Givens rotation, C**2 + S**2 should be 1, * but this is not checked. * Not modified. * * A - DOUBLE PRECISION array. * The array containing the rows/columns to be rotated. The * first element of A should be the upper left element to * be rotated. * Read and modified. * * LDA - INTEGER * The "effective" leading dimension of A. If A contains * a matrix stored in GE or SY format, then this is just * the leading dimension of A as dimensioned in the calling * routine. If A contains a matrix stored in band (GB or SB) * format, then this should be *one less* than the leading * dimension used in the calling routine. Thus, if * A were dimensioned A(LDA,*) in DLAROT, then A(1,j) would * be the j-th element in the first of the two rows * to be rotated, and A(2,j) would be the j-th in the second, * regardless of how the array may be stored in the calling * routine. [A cannot, however, actually be dimensioned thus, * since for band format, the row number may exceed LDA, which * is not legal FORTRAN.] * If LROWS=.TRUE., then LDA must be at least 1, otherwise * it must be at least NL minus the number of .TRUE. values * in XLEFT and XRIGHT. * Not modified. * * XLEFT - DOUBLE PRECISION * If LLEFT is .TRUE., then XLEFT will be used and modified * instead of A(2,1) (if LROWS=.TRUE.) or A(1,2) * (if LROWS=.FALSE.). * Read and modified. * * XRIGHT - DOUBLE PRECISION * If LRIGHT is .TRUE., then XRIGHT will be used and modified * instead of A(1,NL) (if LROWS=.TRUE.) or A(NL,1) * (if LROWS=.FALSE.). * Read and modified. * * ===================================================================== * * .. Local Scalars .. INTEGER IINC, INEXT, IX, IY, IYT, NT * .. * .. Local Arrays .. DOUBLE PRECISION XT( 2 ), YT( 2 ) * .. * .. External Subroutines .. EXTERNAL DROT, XERBLA * .. * .. Executable Statements .. * * Set up indices, arrays for ends * IF( LROWS ) THEN IINC = LDA INEXT = 1 ELSE IINC = 1 INEXT = LDA END IF * IF( LLEFT ) THEN NT = 1 IX = 1 + IINC IY = 2 + LDA XT( 1 ) = A( 1 ) YT( 1 ) = XLEFT ELSE NT = 0 IX = 1 IY = 1 + INEXT END IF * IF( LRIGHT ) THEN IYT = 1 + INEXT + ( NL-1 )*IINC NT = NT + 1 XT( NT ) = XRIGHT YT( NT ) = A( IYT ) ELSE IYT = 1 END IF * * Check for errors * IF( NL.LT.NT ) THEN CALL XERBLA( 'DLAROT', 4 ) RETURN END IF IF( LDA.LE.0 .OR. ( .NOT.LROWS .AND. LDA.LT.NL-NT ) ) THEN CALL XERBLA( 'DLAROT', 8 ) RETURN END IF * * Rotate * CALL DROT( NL-NT, A( IX ), IINC, A( IY ), IINC, C, S ) CALL DROT( NT, XT, 1, YT, 1, C, S ) * * Stuff values back into XLEFT, XRIGHT, etc. * IF( LLEFT ) THEN A( 1 ) = XT( 1 ) XLEFT = YT( 1 ) END IF * IF( LRIGHT ) THEN XRIGHT = XT( NT ) A( IYT ) = YT( NT ) END IF * RETURN * * End of DLAROT * END scalapack-2.0.2/TOOLS/LAPACK/dlatm1.f000644 000766 000024 00000015474 10604316307 017110 0ustar00juliestaff000000 000000 SUBROUTINE DLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, N, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER IDIST, INFO, IRSIGN, MODE, N DOUBLE PRECISION COND * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION D( * ) * .. * * Purpose * ======= * * DLATM1 computes the entries of D(1..N) as specified by * MODE, COND and IRSIGN. IDIST and ISEED determine the generation * of random numbers. DLATM1 is called by SLATMR to generate * random test matrices for LAPACK programs. * * Arguments * ========= * * MODE - INTEGER * On entry describes how D is to be computed: * MODE = 0 means do not change D. * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * Not modified. * * COND - DOUBLE PRECISION * On entry, used as described under MODE above. * If used, it must be >= 1. Not modified. * * IRSIGN - INTEGER * On entry, if MODE neither -6, 0 nor 6, determines sign of * entries of D * 0 => leave entries of D unchanged * 1 => multiply each entry of D by 1 or -1 with probability .5 * * IDIST - CHARACTER*1 * On entry, IDIST specifies the type of distribution to be * used to generate a random matrix . * 1 => UNIFORM( 0, 1 ) * 2 => UNIFORM( -1, 1 ) * 3 => NORMAL( 0, 1 ) * Not modified. * * ISEED - INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. The random number generator uses a * linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to DLATM1 * to continue the same random number sequence. * Changed on exit. * * D - DOUBLE PRECISION array, dimension ( MIN( M , N ) ) * Array to be computed according to MODE, COND and IRSIGN. * May be changed on exit if MODE is nonzero. * * N - INTEGER * Number of entries of D. Not modified. * * INFO - INTEGER * 0 => normal termination * -1 => if MODE not in range -6 to 6 * -2 => if MODE neither -6, 0 nor 6, and * IRSIGN neither 0 nor 1 * -3 => if MODE neither -6, 0 nor 6 and COND less than 1 * -4 => if MODE equals 6 or -6 and IDIST not in range 1 to 3 * -7 => if N negative * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D0 ) DOUBLE PRECISION HALF PARAMETER ( HALF = 0.5D0 ) * .. * .. Local Scalars .. INTEGER I DOUBLE PRECISION ALPHA, TEMP * .. * .. External Functions .. DOUBLE PRECISION DLARAN EXTERNAL DLARAN * .. * .. External Subroutines .. EXTERNAL DLARNV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, EXP, LOG * .. * .. Executable Statements .. * * Decode and Test the input parameters. Initialize flags & seed. * INFO = 0 * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Set INFO if an error * IF( MODE.LT.-6 .OR. MODE.GT.6 ) THEN INFO = -1 ELSE IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ ( IRSIGN.NE.0 .AND. IRSIGN.NE.1 ) ) THEN INFO = -2 ELSE IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ COND.LT.ONE ) THEN INFO = -3 ELSE IF( ( MODE.EQ.6 .OR. MODE.EQ.-6 ) .AND. $ ( IDIST.LT.1 .OR. IDIST.GT.3 ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -7 END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLATM1', -INFO ) RETURN END IF * * Compute D according to COND and MODE * IF( MODE.NE.0 ) THEN GO TO ( 10, 30, 50, 70, 90, 110 )ABS( MODE ) * * One large D value: * 10 CONTINUE DO 20 I = 1, N D( I ) = ONE / COND 20 CONTINUE D( 1 ) = ONE GO TO 120 * * One small D value: * 30 CONTINUE DO 40 I = 1, N D( I ) = ONE 40 CONTINUE D( N ) = ONE / COND GO TO 120 * * Exponentially distributed D values: * 50 CONTINUE D( 1 ) = ONE IF( N.GT.1 ) THEN ALPHA = COND**( -ONE / DBLE( N-1 ) ) DO 60 I = 2, N D( I ) = ALPHA**( I-1 ) 60 CONTINUE END IF GO TO 120 * * Arithmetically distributed D values: * 70 CONTINUE D( 1 ) = ONE IF( N.GT.1 ) THEN TEMP = ONE / COND ALPHA = ( ONE-TEMP ) / DBLE( N-1 ) DO 80 I = 2, N D( I ) = DBLE( N-I )*ALPHA + TEMP 80 CONTINUE END IF GO TO 120 * * Randomly distributed D values on ( 1/COND , 1): * 90 CONTINUE ALPHA = LOG( ONE / COND ) DO 100 I = 1, N D( I ) = EXP( ALPHA*DLARAN( ISEED ) ) 100 CONTINUE GO TO 120 * * Randomly distributed D values from IDIST * 110 CONTINUE CALL DLARNV( IDIST, ISEED, N, D ) * 120 CONTINUE * * If MODE neither -6 nor 0 nor 6, and IRSIGN = 1, assign * random signs to D * IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ IRSIGN.EQ.1 ) THEN DO 130 I = 1, N TEMP = DLARAN( ISEED ) IF( TEMP.GT.HALF ) $ D( I ) = -D( I ) 130 CONTINUE END IF * * Reverse if MODE < 0 * IF( MODE.LT.0 ) THEN DO 140 I = 1, N / 2 TEMP = D( I ) D( I ) = D( N+1-I ) D( N+1-I ) = TEMP 140 CONTINUE END IF * END IF * RETURN * * End of DLATM1 * END scalapack-2.0.2/TOOLS/LAPACK/dlatms.f000644 000766 000024 00000112466 11654631032 017212 0ustar00juliestaff000000 000000 SUBROUTINE DLATMS( M, N, DIST, ISEED, SYM, D, MODE, COND, DMAX, $ KL, KU, PACK, A, LDA, WORK, INFO ) * * -- LAPACK test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER DIST, PACK, SYM INTEGER INFO, KL, KU, LDA, M, MODE, N DOUBLE PRECISION COND, DMAX * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION A( LDA, * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * DLATMS generates random matrices with specified singular values * (or symmetric/hermitian with specified eigenvalues) * for testing LAPACK programs. * * DLATMS operates by applying the following sequence of * operations: * * Set the diagonal to D, where D may be input or * computed according to MODE, COND, DMAX, and SYM * as described below. * * Generate a matrix with the appropriate band structure, by one * of two methods: * * Method A: * Generate a dense M x N matrix by multiplying D on the left * and the right by random unitary matrices, then: * * Reduce the bandwidth according to KL and KU, using * Householder transformations. * * Method B: * Convert the bandwidth-0 (i.e., diagonal) matrix to a * bandwidth-1 matrix using Givens rotations, "chasing" * out-of-band elements back, much as in QR; then * convert the bandwidth-1 to a bandwidth-2 matrix, etc. * Note that for reasonably small bandwidths (relative to * M and N) this requires less storage, as a dense matrix * is not generated. Also, for symmetric matrices, only * one triangle is generated. * * Method A is chosen if the bandwidth is a large fraction of the * order of the matrix, and LDA is at least M (so a dense * matrix can be stored.) Method B is chosen if the bandwidth * is small (< 1/2 N for symmetric, < .3 N+M for * non-symmetric), or LDA is less than M and not less than the * bandwidth. * * Pack the matrix if desired. Options specified by PACK are: * no packing * zero out upper half (if symmetric) * zero out lower half (if symmetric) * store the upper half columnwise (if symmetric or upper * triangular) * store the lower half columnwise (if symmetric or lower * triangular) * store the lower triangle in banded format (if symmetric * or lower triangular) * store the upper triangle in banded format (if symmetric * or upper triangular) * store the entire matrix in banded format * If Method B is chosen, and band format is specified, then the * matrix will be generated in the band format, so no repacking * will be necessary. * * Arguments * ========= * * M - INTEGER * The number of rows of A. Not modified. * * N - INTEGER * The number of columns of A. Not modified. * * DIST - CHARACTER*1 * On entry, DIST specifies the type of distribution to be used * to generate the random eigen-/singular values. * 'U' => UNIFORM( 0, 1 ) ( 'U' for uniform ) * 'S' => UNIFORM( -1, 1 ) ( 'S' for symmetric ) * 'N' => NORMAL( 0, 1 ) ( 'N' for normal ) * Not modified. * * ISEED - INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. They should lie between 0 and 4095 inclusive, * and ISEED(4) should be odd. The random number generator * uses a linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to DLATMS * to continue the same random number sequence. * Changed on exit. * * SYM - CHARACTER*1 * If SYM='S' or 'H', the generated matrix is symmetric, with * eigenvalues specified by D, COND, MODE, and DMAX; they * may be positive, negative, or zero. * If SYM='P', the generated matrix is symmetric, with * eigenvalues (= singular values) specified by D, COND, * MODE, and DMAX; they will not be negative. * If SYM='N', the generated matrix is nonsymmetric, with * singular values specified by D, COND, MODE, and DMAX; * they will not be negative. * Not modified. * * D - DOUBLE PRECISION array, dimension ( MIN( M , N ) ) * This array is used to specify the singular values or * eigenvalues of A (see SYM, above.) If MODE=0, then D is * assumed to contain the singular/eigenvalues, otherwise * they will be computed according to MODE, COND, and DMAX, * and placed in D. * Modified if MODE is nonzero. * * MODE - INTEGER * On entry this describes how the singular/eigenvalues are to * be specified: * MODE = 0 means use D as input * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * If SYM='S' or 'H', and MODE is neither 0, 6, nor -6, then * the elements of D will also be multiplied by a random * sign (i.e., +1 or -1.) * Not modified. * * COND - DOUBLE PRECISION * On entry, this is used as described under MODE above. * If used, it must be >= 1. Not modified. * * DMAX - DOUBLE PRECISION * If MODE is neither -6, 0 nor 6, the contents of D, as * computed according to MODE and COND, will be scaled by * DMAX / max(abs(D(i))); thus, the maximum absolute eigen- or * singular value (which is to say the norm) will be abs(DMAX). * Note that DMAX need not be positive: if DMAX is negative * (or zero), D will be scaled by a negative number (or zero). * Not modified. * * KL - INTEGER * This specifies the lower bandwidth of the matrix. For * example, KL=0 implies upper triangular, KL=1 implies upper * Hessenberg, and KL being at least M-1 means that the matrix * has full lower bandwidth. KL must equal KU if the matrix * is symmetric. * Not modified. * * KU - INTEGER * This specifies the upper bandwidth of the matrix. For * example, KU=0 implies lower triangular, KU=1 implies lower * Hessenberg, and KU being at least N-1 means that the matrix * has full upper bandwidth. KL must equal KU if the matrix * is symmetric. * Not modified. * * PACK - CHARACTER*1 * This specifies packing of matrix as follows: * 'N' => no packing * 'U' => zero out all subdiagonal entries (if symmetric) * 'L' => zero out all superdiagonal entries (if symmetric) * 'C' => store the upper triangle columnwise * (only if the matrix is symmetric or upper triangular) * 'R' => store the lower triangle columnwise * (only if the matrix is symmetric or lower triangular) * 'B' => store the lower triangle in band storage scheme * (only if matrix symmetric or lower triangular) * 'Q' => store the upper triangle in band storage scheme * (only if matrix symmetric or upper triangular) * 'Z' => store the entire matrix in band storage scheme * (pivoting can be provided for by using this * option to store A in the trailing rows of * the allocated storage) * * Using these options, the various LAPACK packed and banded * storage schemes can be obtained: * GB - use 'Z' * PB, SB or TB - use 'B' or 'Q' * PP, SP or TP - use 'C' or 'R' * * If two calls to DLATMS differ only in the PACK parameter, * they will generate mathematically equivalent matrices. * Not modified. * * A - DOUBLE PRECISION array, dimension ( LDA, N ) * On exit A is the desired test matrix. A is first generated * in full (unpacked) form, and then packed, if so specified * by PACK. Thus, the first M elements of the first N * columns will always be modified. If PACK specifies a * packed or banded storage scheme, all LDA elements of the * first N columns will be modified; the elements of the * array which do not correspond to elements of the generated * matrix are set to zero. * Modified. * * LDA - INTEGER * LDA specifies the first dimension of A as declared in the * calling program. If PACK='N', 'U', 'L', 'C', or 'R', then * LDA must be at least M. If PACK='B' or 'Q', then LDA must * be at least MIN( KL, M-1) (which is equal to MIN(KU,N-1)). * If PACK='Z', LDA must be large enough to hold the packed * array: MIN( KU, N-1) + MIN( KL, M-1) + 1. * Not modified. * * WORK - DOUBLE PRECISION array, dimension ( 3*MAX( N , M ) ) * Workspace. * Modified. * * INFO - INTEGER * Error code. On exit, INFO will be set to one of the * following values: * 0 => normal return * -1 => M negative or unequal to N and SYM='S', 'H', or 'P' * -2 => N negative * -3 => DIST illegal string * -5 => SYM illegal string * -7 => MODE not in range -6 to 6 * -8 => COND less than 1.0, and MODE neither -6, 0 nor 6 * -10 => KL negative * -11 => KU negative, or SYM='S' or 'H' and KU not equal to KL * -12 => PACK illegal string, or PACK='U' or 'L', and SYM='N'; * or PACK='C' or 'Q' and SYM='N' and KL is not zero; * or PACK='R' or 'B' and SYM='N' and KU is not zero; * or PACK='U', 'L', 'C', 'R', 'B', or 'Q', and M is not * N. * -14 => LDA is less than M, or PACK='Z' and LDA is less than * MIN(KU,N-1) + MIN(KL,M-1) + 1. * 1 => Error return from DLATM1 * 2 => Cannot scale to DMAX (max. sing. value is 0) * 3 => Error return from DLAGGE or SLAGSY * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D0 ) DOUBLE PRECISION TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663D+0 ) * .. * .. Local Scalars .. LOGICAL GIVENS, ILEXTR, ILTEMP, TOPDWN INTEGER I, IC, ICOL, IDIST, IENDCH, IINFO, IL, ILDA, $ IOFFG, IOFFST, IPACK, IPACKG, IR, IR1, IR2, $ IROW, IRSIGN, ISKEW, ISYM, ISYMPK, J, JC, JCH, $ JKL, JKU, JR, K, LLB, MINLDA, MNMIN, MR, NC, $ UUB DOUBLE PRECISION ALPHA, ANGLE, C, DUMMY, EXTRA, S, TEMP * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLARND EXTERNAL LSAME, DLARND * .. * .. External Subroutines .. EXTERNAL DCOPY, DLAGGE, DLAGSY, DLAROT, DLARTG, DLASET, $ DLATM1, DSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, COS, DBLE, MAX, MIN, MOD, SIN * .. * .. Executable Statements .. * * 1) Decode and Test the input parameters. * Initialize flags & seed. * INFO = 0 * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Decode DIST * IF( LSAME( DIST, 'U' ) ) THEN IDIST = 1 ELSE IF( LSAME( DIST, 'S' ) ) THEN IDIST = 2 ELSE IF( LSAME( DIST, 'N' ) ) THEN IDIST = 3 ELSE IDIST = -1 END IF * * Decode SYM * IF( LSAME( SYM, 'N' ) ) THEN ISYM = 1 IRSIGN = 0 ELSE IF( LSAME( SYM, 'P' ) ) THEN ISYM = 2 IRSIGN = 0 ELSE IF( LSAME( SYM, 'S' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE IF( LSAME( SYM, 'H' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE ISYM = -1 END IF * * Decode PACK * ISYMPK = 0 IF( LSAME( PACK, 'N' ) ) THEN IPACK = 0 ELSE IF( LSAME( PACK, 'U' ) ) THEN IPACK = 1 ISYMPK = 1 ELSE IF( LSAME( PACK, 'L' ) ) THEN IPACK = 2 ISYMPK = 1 ELSE IF( LSAME( PACK, 'C' ) ) THEN IPACK = 3 ISYMPK = 2 ELSE IF( LSAME( PACK, 'R' ) ) THEN IPACK = 4 ISYMPK = 3 ELSE IF( LSAME( PACK, 'B' ) ) THEN IPACK = 5 ISYMPK = 3 ELSE IF( LSAME( PACK, 'Q' ) ) THEN IPACK = 6 ISYMPK = 2 ELSE IF( LSAME( PACK, 'Z' ) ) THEN IPACK = 7 ELSE IPACK = -1 END IF * * Set certain internal parameters * MNMIN = MIN( M, N ) LLB = MIN( KL, M-1 ) UUB = MIN( KU, N-1 ) MR = MIN( M, N+LLB ) NC = MIN( N, M+UUB ) IROW = 1 ICOL = 1 * IF( IPACK.EQ.5 .OR. IPACK.EQ.6 ) THEN MINLDA = UUB + 1 ELSE IF( IPACK.EQ.7 ) THEN MINLDA = LLB + UUB + 1 ELSE MINLDA = M END IF * * Use Givens rotation method if bandwidth small enough, * or if LDA is too small to store the matrix unpacked. * GIVENS = .FALSE. IF( ISYM.EQ.1 ) THEN IF( DBLE( LLB+UUB ).LT.0.3D0*DBLE( MAX( 1, MR+NC ) ) ) $ GIVENS = .TRUE. ELSE IF( 2*LLB.LT.M ) $ GIVENS = .TRUE. END IF IF( LDA.LT.M .AND. LDA.GE.MINLDA ) $ GIVENS = .TRUE. * * Set INFO if an error * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( M.NE.N .AND. ISYM.NE.1 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( IDIST.EQ.-1 ) THEN INFO = -3 ELSE IF( ISYM.EQ.-1 ) THEN INFO = -5 ELSE IF( ABS( MODE ).GT.6 ) THEN INFO = -7 ELSE IF( ( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) .AND. COND.LT.ONE ) $ THEN INFO = -8 ELSE IF( KL.LT.0 ) THEN INFO = -10 ELSE IF( KU.LT.0 .OR. ( ISYM.NE.1 .AND. KL.NE.KU ) ) THEN INFO = -11 ELSE IF( IPACK.EQ.-1 .OR. ( ISYMPK.EQ.1 .AND. ISYM.EQ.1 ) .OR. $ ( ISYMPK.EQ.2 .AND. ISYM.EQ.1 .AND. KL.GT.0 ) .OR. $ ( ISYMPK.EQ.3 .AND. ISYM.EQ.1 .AND. KU.GT.0 ) .OR. $ ( ISYMPK.NE.0 .AND. M.NE.N ) ) THEN INFO = -12 ELSE IF( LDA.LT.MAX( 1, MINLDA ) ) THEN INFO = -14 END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLATMS', -INFO ) RETURN END IF * * Initialize random number generator * DO 10 I = 1, 4 ISEED( I ) = MOD( ABS( ISEED( I ) ), 4096 ) 10 CONTINUE * IF( MOD( ISEED( 4 ), 2 ).NE.1 ) $ ISEED( 4 ) = ISEED( 4 ) + 1 * * 2) Set up D if indicated. * * Compute D according to COND and MODE * CALL DLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, MNMIN, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 1 RETURN END IF * * Choose Top-Down if D is (apparently) increasing, * Bottom-Up if D is (apparently) decreasing. * IF( ABS( D( 1 ) ).LE.ABS( D( MNMIN ) ) ) THEN TOPDWN = .TRUE. ELSE TOPDWN = .FALSE. END IF * IF( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) THEN * * Scale by DMAX * TEMP = ABS( D( 1 ) ) DO 20 I = 2, MNMIN TEMP = MAX( TEMP, ABS( D( I ) ) ) 20 CONTINUE * IF( TEMP.GT.ZERO ) THEN ALPHA = DMAX / TEMP ELSE INFO = 2 RETURN END IF * CALL DSCAL( MNMIN, ALPHA, D, 1 ) * END IF * * 3) Generate Banded Matrix using Givens rotations. * Also the special case of UUB=LLB=0 * * Compute Addressing constants to cover all * storage formats. Whether GE, SY, GB, or SB, * upper or lower triangle or both, * the (i,j)-th element is in * A( i - ISKEW*j + IOFFST, j ) * IF( IPACK.GT.4 ) THEN ILDA = LDA - 1 ISKEW = 1 IF( IPACK.GT.5 ) THEN IOFFST = UUB + 1 ELSE IOFFST = 1 END IF ELSE ILDA = LDA ISKEW = 0 IOFFST = 0 END IF * * IPACKG is the format that the matrix is generated in. If this is * different from IPACK, then the matrix must be repacked at the * end. It also signals how to compute the norm, for scaling. * IPACKG = 0 CALL DLASET( 'Full', LDA, N, ZERO, ZERO, A, LDA ) * * Diagonal Matrix -- We are done, unless it * is to be stored SP/PP/TP (PACK='R' or 'C') * IF( LLB.EQ.0 .AND. UUB.EQ.0 ) THEN CALL DCOPY( MNMIN, D, 1, A( 1-ISKEW+IOFFST, 1 ), ILDA+1 ) IF( IPACK.LE.2 .OR. IPACK.GE.5 ) $ IPACKG = IPACK * ELSE IF( GIVENS ) THEN * * Check whether to use Givens rotations, * Householder transformations, or nothing. * IF( ISYM.EQ.1 ) THEN * * Non-symmetric -- A = U D V * IF( IPACK.GT.4 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF * CALL DCOPY( MNMIN, D, 1, A( 1-ISKEW+IOFFST, 1 ), ILDA+1 ) * IF( TOPDWN ) THEN JKL = 0 DO 50 JKU = 1, UUB * * Transform from bandwidth JKL, JKU-1 to JKL, JKU * * Last row actually rotated is M * Last column actually rotated is MIN( M+JKU, N ) * DO 40 JR = 1, MIN( M+JKU, N ) + JKL - 1 EXTRA = ZERO ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) ICOL = MAX( 1, JR-JKL ) IF( JR.LT.M ) THEN IL = MIN( N, JR+JKU ) + 1 - ICOL CALL DLAROT( .TRUE., JR.GT.JKL, .FALSE., IL, C, $ S, A( JR-ISKEW*ICOL+IOFFST, ICOL ), $ ILDA, EXTRA, DUMMY ) END IF * * Chase "EXTRA" back up * IR = JR IC = ICOL DO 30 JCH = JR - JKL, 1, -JKL - JKU IF( IR.LT.M ) THEN CALL DLARTG( A( IR+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), EXTRA, C, S, DUMMY ) END IF IROW = MAX( 1, JCH-JKU ) IL = IR + 2 - IROW TEMP = ZERO ILTEMP = JCH.GT.JKU CALL DLAROT( .FALSE., ILTEMP, .TRUE., IL, C, -S, $ A( IROW-ISKEW*IC+IOFFST, IC ), $ ILDA, TEMP, EXTRA ) IF( ILTEMP ) THEN CALL DLARTG( A( IROW+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), TEMP, C, S, DUMMY ) ICOL = MAX( 1, JCH-JKU-JKL ) IL = IC + 2 - ICOL EXTRA = ZERO CALL DLAROT( .TRUE., JCH.GT.JKU+JKL, .TRUE., $ IL, C, -S, A( IROW-ISKEW*ICOL+ $ IOFFST, ICOL ), ILDA, EXTRA, $ TEMP ) IC = ICOL IR = IROW END IF 30 CONTINUE 40 CONTINUE 50 CONTINUE * JKU = UUB DO 80 JKL = 1, LLB * * Transform from bandwidth JKL-1, JKU to JKL, JKU * DO 70 JC = 1, MIN( N+JKL, M ) + JKU - 1 EXTRA = ZERO ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) IROW = MAX( 1, JC-JKU ) IF( JC.LT.N ) THEN IL = MIN( M, JC+JKL ) + 1 - IROW CALL DLAROT( .FALSE., JC.GT.JKU, .FALSE., IL, C, $ S, A( IROW-ISKEW*JC+IOFFST, JC ), $ ILDA, EXTRA, DUMMY ) END IF * * Chase "EXTRA" back up * IC = JC IR = IROW DO 60 JCH = JC - JKU, 1, -JKL - JKU IF( IC.LT.N ) THEN CALL DLARTG( A( IR+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), EXTRA, C, S, DUMMY ) END IF ICOL = MAX( 1, JCH-JKL ) IL = IC + 2 - ICOL TEMP = ZERO ILTEMP = JCH.GT.JKL CALL DLAROT( .TRUE., ILTEMP, .TRUE., IL, C, -S, $ A( IR-ISKEW*ICOL+IOFFST, ICOL ), $ ILDA, TEMP, EXTRA ) IF( ILTEMP ) THEN CALL DLARTG( A( IR+1-ISKEW*( ICOL+1 )+IOFFST, $ ICOL+1 ), TEMP, C, S, DUMMY ) IROW = MAX( 1, JCH-JKL-JKU ) IL = IR + 2 - IROW EXTRA = ZERO CALL DLAROT( .FALSE., JCH.GT.JKL+JKU, .TRUE., $ IL, C, -S, A( IROW-ISKEW*ICOL+ $ IOFFST, ICOL ), ILDA, EXTRA, $ TEMP ) IC = ICOL IR = IROW END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE * ELSE * * Bottom-Up -- Start at the bottom right. * JKL = 0 DO 110 JKU = 1, UUB * * Transform from bandwidth JKL, JKU-1 to JKL, JKU * * First row actually rotated is M * First column actually rotated is MIN( M+JKU, N ) * IENDCH = MIN( M, N+JKL ) - 1 DO 100 JC = MIN( M+JKU, N ) - 1, 1 - JKL, -1 EXTRA = ZERO ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) IROW = MAX( 1, JC-JKU+1 ) IF( JC.GT.0 ) THEN IL = MIN( M, JC+JKL+1 ) + 1 - IROW CALL DLAROT( .FALSE., .FALSE., JC+JKL.LT.M, IL, $ C, S, A( IROW-ISKEW*JC+IOFFST, $ JC ), ILDA, DUMMY, EXTRA ) END IF * * Chase "EXTRA" back down * IC = JC DO 90 JCH = JC + JKL, IENDCH, JKL + JKU ILEXTR = IC.GT.0 IF( ILEXTR ) THEN CALL DLARTG( A( JCH-ISKEW*IC+IOFFST, IC ), $ EXTRA, C, S, DUMMY ) END IF IC = MAX( 1, IC ) ICOL = MIN( N-1, JCH+JKU ) ILTEMP = JCH + JKU.LT.N TEMP = ZERO CALL DLAROT( .TRUE., ILEXTR, ILTEMP, ICOL+2-IC, $ C, S, A( JCH-ISKEW*IC+IOFFST, IC ), $ ILDA, EXTRA, TEMP ) IF( ILTEMP ) THEN CALL DLARTG( A( JCH-ISKEW*ICOL+IOFFST, $ ICOL ), TEMP, C, S, DUMMY ) IL = MIN( IENDCH, JCH+JKL+JKU ) + 2 - JCH EXTRA = ZERO CALL DLAROT( .FALSE., .TRUE., $ JCH+JKL+JKU.LE.IENDCH, IL, C, S, $ A( JCH-ISKEW*ICOL+IOFFST, $ ICOL ), ILDA, TEMP, EXTRA ) IC = ICOL END IF 90 CONTINUE 100 CONTINUE 110 CONTINUE * JKU = UUB DO 140 JKL = 1, LLB * * Transform from bandwidth JKL-1, JKU to JKL, JKU * * First row actually rotated is MIN( N+JKL, M ) * First column actually rotated is N * IENDCH = MIN( N, M+JKU ) - 1 DO 130 JR = MIN( N+JKL, M ) - 1, 1 - JKU, -1 EXTRA = ZERO ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) ICOL = MAX( 1, JR-JKL+1 ) IF( JR.GT.0 ) THEN IL = MIN( N, JR+JKU+1 ) + 1 - ICOL CALL DLAROT( .TRUE., .FALSE., JR+JKU.LT.N, IL, $ C, S, A( JR-ISKEW*ICOL+IOFFST, $ ICOL ), ILDA, DUMMY, EXTRA ) END IF * * Chase "EXTRA" back down * IR = JR DO 120 JCH = JR + JKU, IENDCH, JKL + JKU ILEXTR = IR.GT.0 IF( ILEXTR ) THEN CALL DLARTG( A( IR-ISKEW*JCH+IOFFST, JCH ), $ EXTRA, C, S, DUMMY ) END IF IR = MAX( 1, IR ) IROW = MIN( M-1, JCH+JKL ) ILTEMP = JCH + JKL.LT.M TEMP = ZERO CALL DLAROT( .FALSE., ILEXTR, ILTEMP, IROW+2-IR, $ C, S, A( IR-ISKEW*JCH+IOFFST, $ JCH ), ILDA, EXTRA, TEMP ) IF( ILTEMP ) THEN CALL DLARTG( A( IROW-ISKEW*JCH+IOFFST, JCH ), $ TEMP, C, S, DUMMY ) IL = MIN( IENDCH, JCH+JKL+JKU ) + 2 - JCH EXTRA = ZERO CALL DLAROT( .TRUE., .TRUE., $ JCH+JKL+JKU.LE.IENDCH, IL, C, S, $ A( IROW-ISKEW*JCH+IOFFST, JCH ), $ ILDA, TEMP, EXTRA ) IR = IROW END IF 120 CONTINUE 130 CONTINUE 140 CONTINUE END IF * ELSE * * Symmetric -- A = U D U' * IPACKG = IPACK IOFFG = IOFFST * IF( TOPDWN ) THEN * * Top-Down -- Generate Upper triangle only * IF( IPACK.GE.5 ) THEN IPACKG = 6 IOFFG = UUB + 1 ELSE IPACKG = 1 END IF CALL DCOPY( MNMIN, D, 1, A( 1-ISKEW+IOFFG, 1 ), ILDA+1 ) * DO 170 K = 1, UUB DO 160 JC = 1, N - 1 IROW = MAX( 1, JC-K ) IL = MIN( JC+1, K+2 ) EXTRA = ZERO TEMP = A( JC-ISKEW*( JC+1 )+IOFFG, JC+1 ) ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) CALL DLAROT( .FALSE., JC.GT.K, .TRUE., IL, C, S, $ A( IROW-ISKEW*JC+IOFFG, JC ), ILDA, $ EXTRA, TEMP ) CALL DLAROT( .TRUE., .TRUE., .FALSE., $ MIN( K, N-JC )+1, C, S, $ A( ( 1-ISKEW )*JC+IOFFG, JC ), ILDA, $ TEMP, DUMMY ) * * Chase EXTRA back up the matrix * ICOL = JC DO 150 JCH = JC - K, 1, -K CALL DLARTG( A( JCH+1-ISKEW*( ICOL+1 )+IOFFG, $ ICOL+1 ), EXTRA, C, S, DUMMY ) TEMP = A( JCH-ISKEW*( JCH+1 )+IOFFG, JCH+1 ) CALL DLAROT( .TRUE., .TRUE., .TRUE., K+2, C, -S, $ A( ( 1-ISKEW )*JCH+IOFFG, JCH ), $ ILDA, TEMP, EXTRA ) IROW = MAX( 1, JCH-K ) IL = MIN( JCH+1, K+2 ) EXTRA = ZERO CALL DLAROT( .FALSE., JCH.GT.K, .TRUE., IL, C, $ -S, A( IROW-ISKEW*JCH+IOFFG, JCH ), $ ILDA, EXTRA, TEMP ) ICOL = JCH 150 CONTINUE 160 CONTINUE 170 CONTINUE * * If we need lower triangle, copy from upper. Note that * the order of copying is chosen to work for 'q' -> 'b' * IF( IPACK.NE.IPACKG .AND. IPACK.NE.3 ) THEN DO 190 JC = 1, N IROW = IOFFST - ISKEW*JC DO 180 JR = JC, MIN( N, JC+UUB ) A( JR+IROW, JC ) = A( JC-ISKEW*JR+IOFFG, JR ) 180 CONTINUE 190 CONTINUE IF( IPACK.EQ.5 ) THEN DO 210 JC = N - UUB + 1, N DO 200 JR = N + 2 - JC, UUB + 1 A( JR, JC ) = ZERO 200 CONTINUE 210 CONTINUE END IF IF( IPACKG.EQ.6 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF END IF ELSE * * Bottom-Up -- Generate Lower triangle only * IF( IPACK.GE.5 ) THEN IPACKG = 5 IF( IPACK.EQ.6 ) $ IOFFG = 1 ELSE IPACKG = 2 END IF CALL DCOPY( MNMIN, D, 1, A( 1-ISKEW+IOFFG, 1 ), ILDA+1 ) * DO 240 K = 1, UUB DO 230 JC = N - 1, 1, -1 IL = MIN( N+1-JC, K+2 ) EXTRA = ZERO TEMP = A( 1+( 1-ISKEW )*JC+IOFFG, JC ) ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE ) S = -SIN( ANGLE ) CALL DLAROT( .FALSE., .TRUE., N-JC.GT.K, IL, C, S, $ A( ( 1-ISKEW )*JC+IOFFG, JC ), ILDA, $ TEMP, EXTRA ) ICOL = MAX( 1, JC-K+1 ) CALL DLAROT( .TRUE., .FALSE., .TRUE., JC+2-ICOL, C, $ S, A( JC-ISKEW*ICOL+IOFFG, ICOL ), $ ILDA, DUMMY, TEMP ) * * Chase EXTRA back down the matrix * ICOL = JC DO 220 JCH = JC + K, N - 1, K CALL DLARTG( A( JCH-ISKEW*ICOL+IOFFG, ICOL ), $ EXTRA, C, S, DUMMY ) TEMP = A( 1+( 1-ISKEW )*JCH+IOFFG, JCH ) CALL DLAROT( .TRUE., .TRUE., .TRUE., K+2, C, S, $ A( JCH-ISKEW*ICOL+IOFFG, ICOL ), $ ILDA, EXTRA, TEMP ) IL = MIN( N+1-JCH, K+2 ) EXTRA = ZERO CALL DLAROT( .FALSE., .TRUE., N-JCH.GT.K, IL, C, $ S, A( ( 1-ISKEW )*JCH+IOFFG, JCH ), $ ILDA, TEMP, EXTRA ) ICOL = JCH 220 CONTINUE 230 CONTINUE 240 CONTINUE * * If we need upper triangle, copy from lower. Note that * the order of copying is chosen to work for 'b' -> 'q' * IF( IPACK.NE.IPACKG .AND. IPACK.NE.4 ) THEN DO 260 JC = N, 1, -1 IROW = IOFFST - ISKEW*JC DO 250 JR = JC, MAX( 1, JC-UUB ), -1 A( JR+IROW, JC ) = A( JC-ISKEW*JR+IOFFG, JR ) 250 CONTINUE 260 CONTINUE IF( IPACK.EQ.6 ) THEN DO 280 JC = 1, UUB DO 270 JR = 1, UUB + 1 - JC A( JR, JC ) = ZERO 270 CONTINUE 280 CONTINUE END IF IF( IPACKG.EQ.5 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF END IF END IF END IF * ELSE * * 4) Generate Banded Matrix by first * Rotating by random Unitary matrices, * then reducing the bandwidth using Householder * transformations. * * Note: we should get here only if LDA .ge. N * IF( ISYM.EQ.1 ) THEN * * Non-symmetric -- A = U D V * CALL DLAGGE( MR, NC, LLB, UUB, D, A, LDA, ISEED, WORK, $ IINFO ) ELSE * * Symmetric -- A = U D U' * CALL DLAGSY( M, LLB, D, A, LDA, ISEED, WORK, IINFO ) * END IF IF( IINFO.NE.0 ) THEN INFO = 3 RETURN END IF END IF * * 5) Pack the matrix * IF( IPACK.NE.IPACKG ) THEN IF( IPACK.EQ.1 ) THEN * * 'U' -- Upper triangular, not packed * DO 300 J = 1, M DO 290 I = J + 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE * ELSE IF( IPACK.EQ.2 ) THEN * * 'L' -- Lower triangular, not packed * DO 320 J = 2, M DO 310 I = 1, J - 1 A( I, J ) = ZERO 310 CONTINUE 320 CONTINUE * ELSE IF( IPACK.EQ.3 ) THEN * * 'C' -- Upper triangle packed Columnwise. * ICOL = 1 IROW = 0 DO 340 J = 1, M DO 330 I = 1, J IROW = IROW + 1 IF( IROW.GT.LDA ) THEN IROW = 1 ICOL = ICOL + 1 END IF A( IROW, ICOL ) = A( I, J ) 330 CONTINUE 340 CONTINUE * ELSE IF( IPACK.EQ.4 ) THEN * * 'R' -- Lower triangle packed Columnwise. * ICOL = 1 IROW = 0 DO 360 J = 1, M DO 350 I = J, M IROW = IROW + 1 IF( IROW.GT.LDA ) THEN IROW = 1 ICOL = ICOL + 1 END IF A( IROW, ICOL ) = A( I, J ) 350 CONTINUE 360 CONTINUE * ELSE IF( IPACK.GE.5 ) THEN * * 'B' -- The lower triangle is packed as a band matrix. * 'Q' -- The upper triangle is packed as a band matrix. * 'Z' -- The whole matrix is packed as a band matrix. * IF( IPACK.EQ.5 ) $ UUB = 0 IF( IPACK.EQ.6 ) $ LLB = 0 * DO 380 J = 1, UUB DO 370 I = MIN( J+LLB, M ), 1, -1 A( I-J+UUB+1, J ) = A( I, J ) 370 CONTINUE 380 CONTINUE * DO 400 J = UUB + 2, N DO 390 I = J - UUB, MIN( J+LLB, M ) A( I-J+UUB+1, J ) = A( I, J ) 390 CONTINUE 400 CONTINUE END IF * * If packed, zero out extraneous elements. * * Symmetric/Triangular Packed -- * zero out everything after A(IROW,ICOL) * IF( IPACK.EQ.3 .OR. IPACK.EQ.4 ) THEN DO 420 JC = ICOL, M DO 410 JR = IROW + 1, LDA A( JR, JC ) = ZERO 410 CONTINUE IROW = 0 420 CONTINUE * ELSE IF( IPACK.GE.5 ) THEN * * Packed Band -- * 1st row is now in A( UUB+2-j, j), zero above it * m-th row is now in A( M+UUB-j,j), zero below it * last non-zero diagonal is now in A( UUB+LLB+1,j ), * zero below it, too. * IR1 = UUB + LLB + 2 IR2 = UUB + M + 2 DO 450 JC = 1, N DO 430 JR = 1, UUB + 1 - JC A( JR, JC ) = ZERO 430 CONTINUE DO 440 JR = MAX( 1, MIN( IR1, IR2-JC ) ), LDA A( JR, JC ) = ZERO 440 CONTINUE 450 CONTINUE END IF END IF * RETURN * * End of DLATMS * END scalapack-2.0.2/TOOLS/LAPACK/icopy.f000644 000766 000024 00000004344 10363532303 017041 0ustar00juliestaff000000 000000 SUBROUTINE ICOPY( N, SX, INCX, SY, INCY ) * * -- LAPACK auxiliary test routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * February 29, 1992 * * .. Scalar Arguments .. INTEGER INCX, INCY, N * .. * .. Array Arguments .. INTEGER SX( * ), SY( * ) * .. * * Purpose * ======= * * ICOPY copies an integer vector x to an integer vector y. * Uses unrolled loops for increments equal to 1. * * Arguments * ========= * * N (input) INTEGER * The length of the vectors SX and SY. * * SX (input) INTEGER array, dimension (1+(N-1)*abs(INCX)) * The vector X. * * INCX (input) INTEGER * The spacing between consecutive elements of SX. * * SY (output) INTEGER array, dimension (1+(N-1)*abs(INCY)) * The vector Y. * * INCY (input) INTEGER * The spacing between consecutive elements of SY. * * ===================================================================== * * .. Local Scalars .. INTEGER I, IX, IY, M, MP1 * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * IF( N.LE.0 ) $ RETURN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) $ GO TO 20 * * Code for unequal increments or equal increments not equal to 1 * IX = 1 IY = 1 IF( INCX.LT.0 ) $ IX = ( -N+1 )*INCX + 1 IF( INCY.LT.0 ) $ IY = ( -N+1 )*INCY + 1 DO 10 I = 1, N SY( IY ) = SX( IX ) IX = IX + INCX IY = IY + INCY 10 CONTINUE RETURN * * Code for both increments equal to 1 * * Clean-up loop * 20 CONTINUE M = MOD( N, 7 ) IF( M.EQ.0 ) $ GO TO 40 DO 30 I = 1, M SY( I ) = SX( I ) 30 CONTINUE IF( N.LT.7 ) $ RETURN 40 CONTINUE MP1 = M + 1 DO 50 I = MP1, N, 7 SY( I ) = SX( I ) SY( I+1 ) = SX( I+1 ) SY( I+2 ) = SX( I+2 ) SY( I+3 ) = SX( I+3 ) SY( I+4 ) = SX( I+4 ) SY( I+5 ) = SX( I+5 ) SY( I+6 ) = SX( I+6 ) 50 CONTINUE RETURN * * End of ICOPY * END scalapack-2.0.2/TOOLS/LAPACK/Makefile000644 000766 000024 00000003227 11654025546 017220 0ustar00juliestaff000000 000000 ############################################################################ # # Program: ScaLAPACK # # Module: Makefile # # Purpose: Tools - LAPACK Makefile # # Creation date: March 20, 1995 # # Modified: February 16, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../SLmake.inc ALLAUX = icopy.o SCATGEN = slatm1.o slaran.o slarnd.o SMATGEN = slatms.o slagge.o slagsy.o slarot.o CMATGEN = clarnv.o clatm1.o clatms.o clagge.o claghe.o clagsy.o clarot.o \ clarnd.o DZATGEN = dlatm1.o dlaran.o dlarnd.o DMATGEN = dlatms.o dlagge.o dlagsy.o dlarot.o ZMATGEN = zlarnv.o zlatm1.o zlatms.o zlagge.o zlaghe.o zlagsy.o zlarot.o \ zlarnd.o all : single complex double complex16 single: $(ALLAUX) $(SMATGEN) $(SCATGEN) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(SLASRC) $(ALLAUX) $(SCLAUX) \ $(SMATGEN) $(SCATGEN) $(RANLIB) ../../$(SCALAPACKLIB) complex: $(ALLAUX) $(CMATGEN) $(SCATGEN) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(CLASRC) $(ALLAUX) $(SCLAUX) \ $(CMATGEN) $(SCATGEN) $(RANLIB) ../../$(SCALAPACKLIB) double: $(ALLAUX) $(DMATGEN) $(DZATGEN) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(DLASRC) $(ALLAUX) $(DZLAUX) \ $(DMATGEN) $(DZATGEN) $(RANLIB) ../../$(SCALAPACKLIB) complex16: $(ALLAUX) $(ZMATGEN) $(DZATGEN) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(ZLASRC) $(ALLAUX) $(DZLAUX) \ $(ZMATGEN) $(DZATGEN) $(RANLIB) ../../$(SCALAPACKLIB) clean : rm -f *.o slamch.o: $(FC) -c $(NOOPT) slamch.f dlamch.o: $(FC) -c $(NOOPT) dlamch.f .f.o : ; $(FC) -c $(FCFLAGS) $*.f scalapack-2.0.2/TOOLS/LAPACK/slagge.f000644 000766 000024 00000021610 10604316307 017155 0ustar00juliestaff000000 000000 SUBROUTINE SLAGGE( M, N, KL, KU, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDA, M, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL A( LDA, * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * SLAGGE generates a real general m by n matrix A, by pre- and post- * multiplying a real diagonal matrix D with random orthogonal matrices: * A = U*D*V. The lower and upper bandwidths may then be reduced to * kl and ku by additional orthogonal transformations. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= KL <= M-1. * * KU (input) INTEGER * The number of nonzero superdiagonals within the band of A. * 0 <= KU <= N-1. * * D (input) REAL array, dimension (min(M,N)) * The diagonal elements of the diagonal matrix D. * * A (output) REAL array, dimension (LDA,N) * The generated m by n matrix A. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= M. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) REAL array, dimension (M+N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J REAL TAU, WA, WB, WN * .. * .. External Subroutines .. EXTERNAL SGEMV, SGER, SLARNV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SIGN * .. * .. External Functions .. REAL SNRM2 EXTERNAL SNRM2 * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( KL.LT.0 .OR. KL.GT.M-1 ) THEN INFO = -3 ELSE IF( KU.LT.0 .OR. KU.GT.N-1 ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'SLAGGE', -INFO ) RETURN END IF * * initialize A to diagonal matrix * DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, MIN( M, N ) A( I, I ) = D( I ) 30 CONTINUE * * pre- and post-multiply A by random orthogonal matrices * DO 40 I = MIN( M, N ), 1, -1 IF( I.LT.M ) THEN * * generate random reflection * CALL SLARNV( 3, ISEED, M-I+1, WORK ) WN = SNRM2( M-I+1, WORK, 1 ) WA = SIGN( WN, WORK( 1 ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL SSCAL( M-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = WB / WA END IF * * multiply A(i:m,i:n) by random reflection from the left * CALL SGEMV( 'Transpose', M-I+1, N-I+1, ONE, A( I, I ), LDA, $ WORK, 1, ZERO, WORK( M+1 ), 1 ) CALL SGER( M-I+1, N-I+1, -TAU, WORK, 1, WORK( M+1 ), 1, $ A( I, I ), LDA ) END IF IF( I.LT.N ) THEN * * generate random reflection * CALL SLARNV( 3, ISEED, N-I+1, WORK ) WN = SNRM2( N-I+1, WORK, 1 ) WA = SIGN( WN, WORK( 1 ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL SSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = WB / WA END IF * * multiply A(i:m,i:n) by random reflection from the right * CALL SGEMV( 'No transpose', M-I+1, N-I+1, ONE, A( I, I ), $ LDA, WORK, 1, ZERO, WORK( N+1 ), 1 ) CALL SGER( M-I+1, N-I+1, -TAU, WORK( N+1 ), 1, WORK, 1, $ A( I, I ), LDA ) END IF 40 CONTINUE * * Reduce number of subdiagonals to KL and number of superdiagonals * to KU * DO 70 I = 1, MAX( M-1-KL, N-1-KU ) IF( KL.LE.KU ) THEN * * annihilate subdiagonal elements first (necessary if KL = 0) * IF( I.LE.MIN( M-1-KL, N ) ) THEN * * generate reflection to annihilate A(kl+i+1:m,i) * WN = SNRM2( M-KL-I+1, A( KL+I, I ), 1 ) WA = SIGN( WN, A( KL+I, I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( KL+I, I ) + WA CALL SSCAL( M-KL-I, ONE / WB, A( KL+I+1, I ), 1 ) A( KL+I, I ) = ONE TAU = WB / WA END IF * * apply reflection to A(kl+i:m,i+1:n) from the left * CALL SGEMV( 'Transpose', M-KL-I+1, N-I, ONE, $ A( KL+I, I+1 ), LDA, A( KL+I, I ), 1, ZERO, $ WORK, 1 ) CALL SGER( M-KL-I+1, N-I, -TAU, A( KL+I, I ), 1, WORK, 1, $ A( KL+I, I+1 ), LDA ) A( KL+I, I ) = -WA END IF * IF( I.LE.MIN( N-1-KU, M ) ) THEN * * generate reflection to annihilate A(i,ku+i+1:n) * WN = SNRM2( N-KU-I+1, A( I, KU+I ), LDA ) WA = SIGN( WN, A( I, KU+I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( I, KU+I ) + WA CALL SSCAL( N-KU-I, ONE / WB, A( I, KU+I+1 ), LDA ) A( I, KU+I ) = ONE TAU = WB / WA END IF * * apply reflection to A(i+1:m,ku+i:n) from the right * CALL SGEMV( 'No transpose', M-I, N-KU-I+1, ONE, $ A( I+1, KU+I ), LDA, A( I, KU+I ), LDA, ZERO, $ WORK, 1 ) CALL SGER( M-I, N-KU-I+1, -TAU, WORK, 1, A( I, KU+I ), $ LDA, A( I+1, KU+I ), LDA ) A( I, KU+I ) = -WA END IF ELSE * * annihilate superdiagonal elements first (necessary if * KU = 0) * IF( I.LE.MIN( N-1-KU, M ) ) THEN * * generate reflection to annihilate A(i,ku+i+1:n) * WN = SNRM2( N-KU-I+1, A( I, KU+I ), LDA ) WA = SIGN( WN, A( I, KU+I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( I, KU+I ) + WA CALL SSCAL( N-KU-I, ONE / WB, A( I, KU+I+1 ), LDA ) A( I, KU+I ) = ONE TAU = WB / WA END IF * * apply reflection to A(i+1:m,ku+i:n) from the right * CALL SGEMV( 'No transpose', M-I, N-KU-I+1, ONE, $ A( I+1, KU+I ), LDA, A( I, KU+I ), LDA, ZERO, $ WORK, 1 ) CALL SGER( M-I, N-KU-I+1, -TAU, WORK, 1, A( I, KU+I ), $ LDA, A( I+1, KU+I ), LDA ) A( I, KU+I ) = -WA END IF * IF( I.LE.MIN( M-1-KL, N ) ) THEN * * generate reflection to annihilate A(kl+i+1:m,i) * WN = SNRM2( M-KL-I+1, A( KL+I, I ), 1 ) WA = SIGN( WN, A( KL+I, I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( KL+I, I ) + WA CALL SSCAL( M-KL-I, ONE / WB, A( KL+I+1, I ), 1 ) A( KL+I, I ) = ONE TAU = WB / WA END IF * * apply reflection to A(kl+i:m,i+1:n) from the left * CALL SGEMV( 'Transpose', M-KL-I+1, N-I, ONE, $ A( KL+I, I+1 ), LDA, A( KL+I, I ), 1, ZERO, $ WORK, 1 ) CALL SGER( M-KL-I+1, N-I, -TAU, A( KL+I, I ), 1, WORK, 1, $ A( KL+I, I+1 ), LDA ) A( KL+I, I ) = -WA END IF END IF * DO 50 J = KL + I + 1, M A( J, I ) = ZERO 50 CONTINUE * DO 60 J = KU + I + 1, N A( I, J ) = ZERO 60 CONTINUE 70 CONTINUE RETURN * * End of SLAGGE * END scalapack-2.0.2/TOOLS/LAPACK/slagsy.f000644 000766 000024 00000013076 10604316307 017224 0ustar00juliestaff000000 000000 SUBROUTINE SLAGSY( N, K, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, K, LDA, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL A( LDA, * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * SLAGSY generates a real symmetric matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random orthogonal matrix: * A = U*D*U'. The semi-bandwidth may then be reduced to k by additional * orthogonal transformations. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * K (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * * D (input) REAL array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (output) REAL array, dimension (LDA,N) * The generated n by n symmetric matrix A (the full matrix is * stored). * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= N. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) REAL array, dimension (2*N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE, HALF PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, HALF = 0.5E+0 ) * .. * .. Local Scalars .. INTEGER I, J REAL ALPHA, TAU, WA, WB, WN * .. * .. External Subroutines .. EXTERNAL SAXPY, SGEMV, SGER, SLARNV, SSCAL, SSYMV, $ SSYR2, XERBLA * .. * .. External Functions .. REAL SDOT, SNRM2 EXTERNAL SDOT, SNRM2 * .. * .. Intrinsic Functions .. INTRINSIC MAX, SIGN * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'SLAGSY', -INFO ) RETURN END IF * * initialize lower triangle of A to diagonal matrix * DO 20 J = 1, N DO 10 I = J + 1, N A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, N A( I, I ) = D( I ) 30 CONTINUE * * Generate lower triangle of symmetric matrix * DO 40 I = N - 1, 1, -1 * * generate random reflection * CALL SLARNV( 3, ISEED, N-I+1, WORK ) WN = SNRM2( N-I+1, WORK, 1 ) WA = SIGN( WN, WORK( 1 ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL SSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = WB / WA END IF * * apply random reflection to A(i:n,i:n) from the left * and the right * * compute y := tau * A * u * CALL SSYMV( 'Lower', N-I+1, TAU, A( I, I ), LDA, WORK, 1, ZERO, $ WORK( N+1 ), 1 ) * * compute v := y - 1/2 * tau * ( y, u ) * u * ALPHA = -HALF*TAU*SDOT( N-I+1, WORK( N+1 ), 1, WORK, 1 ) CALL SAXPY( N-I+1, ALPHA, WORK, 1, WORK( N+1 ), 1 ) * * apply the transformation as a rank-2 update to A(i:n,i:n) * CALL SSYR2( 'Lower', N-I+1, -ONE, WORK, 1, WORK( N+1 ), 1, $ A( I, I ), LDA ) 40 CONTINUE * * Reduce number of subdiagonals to K * DO 60 I = 1, N - 1 - K * * generate reflection to annihilate A(k+i+1:n,i) * WN = SNRM2( N-K-I+1, A( K+I, I ), 1 ) WA = SIGN( WN, A( K+I, I ) ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( K+I, I ) + WA CALL SSCAL( N-K-I, ONE / WB, A( K+I+1, I ), 1 ) A( K+I, I ) = ONE TAU = WB / WA END IF * * apply reflection to A(k+i:n,i+1:k+i-1) from the left * CALL SGEMV( 'Transpose', N-K-I+1, K-1, ONE, A( K+I, I+1 ), LDA, $ A( K+I, I ), 1, ZERO, WORK, 1 ) CALL SGER( N-K-I+1, K-1, -TAU, A( K+I, I ), 1, WORK, 1, $ A( K+I, I+1 ), LDA ) * * apply reflection to A(k+i:n,k+i:n) from the left and the right * * compute y := tau * A * u * CALL SSYMV( 'Lower', N-K-I+1, TAU, A( K+I, K+I ), LDA, $ A( K+I, I ), 1, ZERO, WORK, 1 ) * * compute v := y - 1/2 * tau * ( y, u ) * u * ALPHA = -HALF*TAU*SDOT( N-K-I+1, WORK, 1, A( K+I, I ), 1 ) CALL SAXPY( N-K-I+1, ALPHA, A( K+I, I ), 1, WORK, 1 ) * * apply symmetric rank-2 update to A(k+i:n,k+i:n) * CALL SSYR2( 'Lower', N-K-I+1, -ONE, A( K+I, I ), 1, WORK, 1, $ A( K+I, K+I ), LDA ) * A( K+I, I ) = -WA DO 50 J = K + I + 1, N A( J, I ) = ZERO 50 CONTINUE 60 CONTINUE * * Store full symmetric matrix * DO 80 J = 1, N DO 70 I = J + 1, N A( J, I ) = A( I, J ) 70 CONTINUE 80 CONTINUE RETURN * * End of SLAGSY * END scalapack-2.0.2/TOOLS/LAPACK/slaran.f000644 000766 000024 00000006412 10604316307 017176 0ustar00juliestaff000000 000000 REAL FUNCTION SLARAN( ISEED ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Array Arguments .. INTEGER ISEED( 4 ) * .. * * Purpose * ======= * * SLARAN returns a random real number from a uniform (0,1) * distribution. * * Arguments * ========= * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * Further Details * =============== * * This routine uses a multiplicative congruential method with modulus * 2**48 and multiplier 33952834046453 (see G.S.Fishman, * 'Multiplicative congruential random number generators with modulus * 2**b: an exhaustive analysis for b = 32 and a partial analysis for * b = 48', Math. Comp. 189, pp 331-344, 1990). * * 48-bit integers are stored in 4 integer array elements with 12 bits * per element. Hence the routine is portable across machines with * integers of 32 bits or more. * * ===================================================================== * * .. Parameters .. INTEGER M1, M2, M3, M4 PARAMETER ( M1 = 494, M2 = 322, M3 = 2508, M4 = 2549 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) INTEGER IPW2 REAL R PARAMETER ( IPW2 = 4096, R = ONE / IPW2 ) * .. * .. Local Scalars .. INTEGER IT1, IT2, IT3, IT4 REAL RNDOUT * .. * .. Intrinsic Functions .. INTRINSIC MOD, REAL * .. * .. Executable Statements .. 10 CONTINUE * * multiply the seed by the multiplier modulo 2**48 * IT4 = ISEED( 4 )*M4 IT3 = IT4 / IPW2 IT4 = IT4 - IPW2*IT3 IT3 = IT3 + ISEED( 3 )*M4 + ISEED( 4 )*M3 IT2 = IT3 / IPW2 IT3 = IT3 - IPW2*IT2 IT2 = IT2 + ISEED( 2 )*M4 + ISEED( 3 )*M3 + ISEED( 4 )*M2 IT1 = IT2 / IPW2 IT2 = IT2 - IPW2*IT1 IT1 = IT1 + ISEED( 1 )*M4 + ISEED( 2 )*M3 + ISEED( 3 )*M2 + $ ISEED( 4 )*M1 IT1 = MOD( IT1, IPW2 ) * * return updated seed * ISEED( 1 ) = IT1 ISEED( 2 ) = IT2 ISEED( 3 ) = IT3 ISEED( 4 ) = IT4 * * convert 48-bit integer to a real number in the interval (0,1) * RNDOUT = R*( REAL( IT1 )+R*( REAL( IT2 )+R*( REAL( IT3 )+R* $ ( REAL( IT4 ) ) ) ) ) * IF (RNDOUT.EQ.1.0) THEN * If a real number has n bits of precision, and the first * n bits of the 48-bit integer above happen to be all 1 (which * will occur about once every 2**n calls), then SLARAN will * be rounded to exactly 1.0. In IEEE single precision arithmetic, * this will happen relatively often since n = 24. * Since SLARAN is not supposed to return exactly 0.0 or 1.0 * (and some callers of SLARAN, such as CLARND, depend on that), * the statistically correct thing to do in this situation is * simply to iterate again. * N.B. the case SLARAN = 0.0 should not be possible. * GOTO 10 END IF * SLARAN = RNDOUT RETURN * * End of SLARAN * END scalapack-2.0.2/TOOLS/LAPACK/slarnd.f000644 000766 000024 00000004333 11654631032 017202 0ustar00juliestaff000000 000000 REAL FUNCTION SLARND( IDIST, ISEED ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER IDIST * .. * .. Array Arguments .. INTEGER ISEED( 4 ) * .. * * Purpose * ======= * * SLARND returns a random real number from a uniform or normal * distribution. * * Arguments * ========= * * IDIST (input) INTEGER * Specifies the distribution of the random numbers: * = 1: uniform (0,1) * = 2: uniform (-1,1) * = 3: normal (0,1) * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * Further Details * =============== * * This routine calls the auxiliary routine SLARAN to generate a random * real number from a uniform (0,1) distribution. The Box-Muller method * is used to transform numbers from a uniform to a normal distribution. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) REAL TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663E+0 ) * .. * .. Local Scalars .. REAL T1, T2 * .. * .. External Functions .. REAL SLARAN EXTERNAL SLARAN * .. * .. Intrinsic Functions .. INTRINSIC COS, LOG, SQRT * .. * .. Executable Statements .. * * Generate a real random number from a uniform (0,1) distribution * T1 = SLARAN( ISEED ) * IF( IDIST.EQ.1 ) THEN * * uniform (0,1) * SLARND = T1 ELSE IF( IDIST.EQ.2 ) THEN * * uniform (-1,1) * SLARND = TWO*T1 - ONE ELSE IF( IDIST.EQ.3 ) THEN * * normal (0,1) * T2 = SLARAN( ISEED ) SLARND = SQRT( -TWO*LOG( T1 ) )*COS( TWOPI*T2 ) ELSE SLARND = ONE END IF RETURN * * End of SLARND * END scalapack-2.0.2/TOOLS/LAPACK/slarot.f000644 000766 000024 00000022320 11654631032 017217 0ustar00juliestaff000000 000000 SUBROUTINE SLAROT( LROWS, LLEFT, LRIGHT, NL, C, S, A, LDA, XLEFT, $ XRIGHT ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. LOGICAL LLEFT, LRIGHT, LROWS INTEGER LDA, NL REAL C, S, XLEFT, XRIGHT * .. * .. Array Arguments .. REAL A( * ) * .. * * Purpose * ======= * * SLAROT applies a (Givens) rotation to two adjacent rows or * columns, where one element of the first and/or last column/row * November 2006 * for use on matrices stored in some format other than GE, so * that elements of the matrix may be used or modified for which * no array element is provided. * * One example is a symmetric matrix in SB format (bandwidth=4), for * which UPLO='L': Two adjacent rows will have the format: * * row j: * * * * * . . . . * row j+1: * * * * * . . . . * * '*' indicates elements for which storage is provided, * '.' indicates elements for which no storage is provided, but * are not necessarily zero; their values are determined by * symmetry. ' ' indicates elements which are necessarily zero, * and have no storage provided. * * Those columns which have two '*'s can be handled by SROT. * Those columns which have no '*'s can be ignored, since as long * as the Givens rotations are carefully applied to preserve * symmetry, their values are determined. * Those columns which have one '*' have to be handled separately, * by using separate variables "p" and "q": * * row j: * * * * * p . . . * row j+1: q * * * * * . . . . * * The element p would have to be set correctly, then that column * is rotated, setting p to its new value. The next call to * SLAROT would rotate columns j and j+1, using p, and restore * symmetry. The element q would start out being zero, and be * made non-zero by the rotation. Later, rotations would presumably * be chosen to zero q out. * * Typical Calling Sequences: rotating the i-th and (i+1)-st rows. * ------- ------- --------- * * General dense matrix: * * CALL SLAROT(.TRUE.,.FALSE.,.FALSE., N, C,S, * A(i,1),LDA, DUMMY, DUMMY) * * General banded matrix in GB format: * * j = MAX(1, i-KL ) * NL = MIN( N, i+KU+1 ) + 1-j * CALL SLAROT( .TRUE., i-KL.GE.1, i+KU.LT.N, NL, C,S, * A(KU+i+1-j,j),LDA-1, XLEFT, XRIGHT ) * * [ note that i+1-j is just MIN(i,KL+1) ] * * Symmetric banded matrix in SY format, bandwidth K, * lower triangle only: * * j = MAX(1, i-K ) * NL = MIN( K+1, i ) + 1 * CALL SLAROT( .TRUE., i-K.GE.1, .TRUE., NL, C,S, * A(i,j), LDA, XLEFT, XRIGHT ) * * Same, but upper triangle only: * * NL = MIN( K+1, N-i ) + 1 * CALL SLAROT( .TRUE., .TRUE., i+K.LT.N, NL, C,S, * A(i,i), LDA, XLEFT, XRIGHT ) * * Symmetric banded matrix in SB format, bandwidth K, * lower triangle only: * * [ same as for SY, except:] * . . . . * A(i+1-j,j), LDA-1, XLEFT, XRIGHT ) * * [ note that i+1-j is just MIN(i,K+1) ] * * Same, but upper triangle only: * . . . * A(K+1,i), LDA-1, XLEFT, XRIGHT ) * * Rotating columns is just the transpose of rotating rows, except * for GB and SB: (rotating columns i and i+1) * * GB: * j = MAX(1, i-KU ) * NL = MIN( N, i+KL+1 ) + 1-j * CALL SLAROT( .TRUE., i-KU.GE.1, i+KL.LT.N, NL, C,S, * A(KU+j+1-i,i),LDA-1, XTOP, XBOTTM ) * * [note that KU+j+1-i is just MAX(1,KU+2-i)] * * SB: (upper triangle) * * . . . . . . * A(K+j+1-i,i),LDA-1, XTOP, XBOTTM ) * * SB: (lower triangle) * * . . . . . . * A(1,i),LDA-1, XTOP, XBOTTM ) * * Arguments * ========= * * LROWS - LOGICAL * If .TRUE., then SLAROT will rotate two rows. If .FALSE., * then it will rotate two columns. * Not modified. * * LLEFT - LOGICAL * If .TRUE., then XLEFT will be used instead of the * corresponding element of A for the first element in the * second row (if LROWS=.FALSE.) or column (if LROWS=.TRUE.) * If .FALSE., then the corresponding element of A will be * used. * Not modified. * * LRIGHT - LOGICAL * If .TRUE., then XRIGHT will be used instead of the * corresponding element of A for the last element in the * first row (if LROWS=.FALSE.) or column (if LROWS=.TRUE.) If * .FALSE., then the corresponding element of A will be used. * Not modified. * * NL - INTEGER * The length of the rows (if LROWS=.TRUE.) or columns (if * LROWS=.FALSE.) to be rotated. If XLEFT and/or XRIGHT are * used, the columns/rows they are in should be included in * NL, e.g., if LLEFT = LRIGHT = .TRUE., then NL must be at * least 2. The number of rows/columns to be rotated * exclusive of those involving XLEFT and/or XRIGHT may * not be negative, i.e., NL minus how many of LLEFT and * LRIGHT are .TRUE. must be at least zero; if not, XERBLA * will be called. * Not modified. * * C, S - REAL * Specify the Givens rotation to be applied. If LROWS is * true, then the matrix ( c s ) * (-s c ) is applied from the left; * if false, then the transpose thereof is applied from the * right. For a Givens rotation, C**2 + S**2 should be 1, * but this is not checked. * Not modified. * * A - REAL array. * The array containing the rows/columns to be rotated. The * first element of A should be the upper left element to * be rotated. * Read and modified. * * LDA - INTEGER * The "effective" leading dimension of A. If A contains * a matrix stored in GE or SY format, then this is just * the leading dimension of A as dimensioned in the calling * routine. If A contains a matrix stored in band (GB or SB) * format, then this should be *one less* than the leading * dimension used in the calling routine. Thus, if * A were dimensioned A(LDA,*) in SLAROT, then A(1,j) would * be the j-th element in the first of the two rows * to be rotated, and A(2,j) would be the j-th in the second, * regardless of how the array may be stored in the calling * routine. [A cannot, however, actually be dimensioned thus, * since for band format, the row number may exceed LDA, which * is not legal FORTRAN.] * If LROWS=.TRUE., then LDA must be at least 1, otherwise * it must be at least NL minus the number of .TRUE. values * in XLEFT and XRIGHT. * Not modified. * * XLEFT - REAL * If LLEFT is .TRUE., then XLEFT will be used and modified * instead of A(2,1) (if LROWS=.TRUE.) or A(1,2) * (if LROWS=.FALSE.). * Read and modified. * * XRIGHT - REAL * If LRIGHT is .TRUE., then XRIGHT will be used and modified * instead of A(1,NL) (if LROWS=.TRUE.) or A(NL,1) * (if LROWS=.FALSE.). * Read and modified. * * ===================================================================== * * .. Local Scalars .. INTEGER IINC, INEXT, IX, IY, IYT, NT * .. * .. Local Arrays .. REAL XT( 2 ), YT( 2 ) * .. * .. External Subroutines .. EXTERNAL SROT, XERBLA * .. * .. Executable Statements .. * * Set up indices, arrays for ends * IF( LROWS ) THEN IINC = LDA INEXT = 1 ELSE IINC = 1 INEXT = LDA END IF * IF( LLEFT ) THEN NT = 1 IX = 1 + IINC IY = 2 + LDA XT( 1 ) = A( 1 ) YT( 1 ) = XLEFT ELSE NT = 0 IX = 1 IY = 1 + INEXT END IF * IF( LRIGHT ) THEN IYT = 1 + INEXT + ( NL-1 )*IINC NT = NT + 1 XT( NT ) = XRIGHT YT( NT ) = A( IYT ) ELSE IYT = 1 END IF * * Check for errors * IF( NL.LT.NT ) THEN CALL XERBLA( 'SLAROT', 4 ) RETURN END IF IF( LDA.LE.0 .OR. ( .NOT.LROWS .AND. LDA.LT.NL-NT ) ) THEN CALL XERBLA( 'SLAROT', 8 ) RETURN END IF * * Rotate * CALL SROT( NL-NT, A( IX ), IINC, A( IY ), IINC, C, S ) CALL SROT( NT, XT, 1, YT, 1, C, S ) * * Stuff values back into XLEFT, XRIGHT, etc. * IF( LLEFT ) THEN A( 1 ) = XT( 1 ) XLEFT = YT( 1 ) END IF * IF( LRIGHT ) THEN XRIGHT = XT( NT ) A( IYT ) = YT( NT ) END IF * RETURN * * End of SLAROT * END scalapack-2.0.2/TOOLS/LAPACK/slatm1.f000644 000766 000024 00000015444 10604316307 017124 0ustar00juliestaff000000 000000 SUBROUTINE SLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, N, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER IDIST, INFO, IRSIGN, MODE, N REAL COND * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL D( * ) * .. * * Purpose * ======= * * SLATM1 computes the entries of D(1..N) as specified by * MODE, COND and IRSIGN. IDIST and ISEED determine the generation * of random numbers. SLATM1 is called by SLATMR to generate * random test matrices for LAPACK programs. * * Arguments * ========= * * MODE - INTEGER * On entry describes how D is to be computed: * MODE = 0 means do not change D. * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * Not modified. * * COND - REAL * On entry, used as described under MODE above. * If used, it must be >= 1. Not modified. * * IRSIGN - INTEGER * On entry, if MODE neither -6, 0 nor 6, determines sign of * entries of D * 0 => leave entries of D unchanged * 1 => multiply each entry of D by 1 or -1 with probability .5 * * IDIST - CHARACTER*1 * On entry, IDIST specifies the type of distribution to be * used to generate a random matrix . * 1 => UNIFORM( 0, 1 ) * 2 => UNIFORM( -1, 1 ) * 3 => NORMAL( 0, 1 ) * Not modified. * * ISEED - INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. The random number generator uses a * linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to SLATM1 * to continue the same random number sequence. * Changed on exit. * * D - REAL array, dimension ( MIN( M , N ) ) * Array to be computed according to MODE, COND and IRSIGN. * May be changed on exit if MODE is nonzero. * * N - INTEGER * Number of entries of D. Not modified. * * INFO - INTEGER * 0 => normal termination * -1 => if MODE not in range -6 to 6 * -2 => if MODE neither -6, 0 nor 6, and * IRSIGN neither 0 nor 1 * -3 => if MODE neither -6, 0 nor 6 and COND less than 1 * -4 => if MODE equals 6 or -6 and IDIST not in range 1 to 3 * -7 => if N negative * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E0 ) REAL HALF PARAMETER ( HALF = 0.5E0 ) * .. * .. Local Scalars .. INTEGER I REAL ALPHA, TEMP * .. * .. External Functions .. REAL SLARAN EXTERNAL SLARAN * .. * .. External Subroutines .. EXTERNAL SLARNV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, EXP, LOG, REAL * .. * .. Executable Statements .. * * Decode and Test the input parameters. Initialize flags & seed. * INFO = 0 * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Set INFO if an error * IF( MODE.LT.-6 .OR. MODE.GT.6 ) THEN INFO = -1 ELSE IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ ( IRSIGN.NE.0 .AND. IRSIGN.NE.1 ) ) THEN INFO = -2 ELSE IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ COND.LT.ONE ) THEN INFO = -3 ELSE IF( ( MODE.EQ.6 .OR. MODE.EQ.-6 ) .AND. $ ( IDIST.LT.1 .OR. IDIST.GT.3 ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -7 END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLATM1', -INFO ) RETURN END IF * * Compute D according to COND and MODE * IF( MODE.NE.0 ) THEN GO TO ( 10, 30, 50, 70, 90, 110 )ABS( MODE ) * * One large D value: * 10 CONTINUE DO 20 I = 1, N D( I ) = ONE / COND 20 CONTINUE D( 1 ) = ONE GO TO 120 * * One small D value: * 30 CONTINUE DO 40 I = 1, N D( I ) = ONE 40 CONTINUE D( N ) = ONE / COND GO TO 120 * * Exponentially distributed D values: * 50 CONTINUE D( 1 ) = ONE IF( N.GT.1 ) THEN ALPHA = COND**( -ONE / REAL( N-1 ) ) DO 60 I = 2, N D( I ) = ALPHA**( I-1 ) 60 CONTINUE END IF GO TO 120 * * Arithmetically distributed D values: * 70 CONTINUE D( 1 ) = ONE IF( N.GT.1 ) THEN TEMP = ONE / COND ALPHA = ( ONE-TEMP ) / REAL( N-1 ) DO 80 I = 2, N D( I ) = REAL( N-I )*ALPHA + TEMP 80 CONTINUE END IF GO TO 120 * * Randomly distributed D values on ( 1/COND , 1): * 90 CONTINUE ALPHA = LOG( ONE / COND ) DO 100 I = 1, N D( I ) = EXP( ALPHA*SLARAN( ISEED ) ) 100 CONTINUE GO TO 120 * * Randomly distributed D values from IDIST * 110 CONTINUE CALL SLARNV( IDIST, ISEED, N, D ) * 120 CONTINUE * * If MODE neither -6 nor 0 nor 6, and IRSIGN = 1, assign * random signs to D * IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ IRSIGN.EQ.1 ) THEN DO 130 I = 1, N TEMP = SLARAN( ISEED ) IF( TEMP.GT.HALF ) $ D( I ) = -D( I ) 130 CONTINUE END IF * * Reverse if MODE < 0 * IF( MODE.LT.0 ) THEN DO 140 I = 1, N / 2 TEMP = D( I ) D( I ) = D( N+1-I ) D( N+1-I ) = TEMP 140 CONTINUE END IF * END IF * RETURN * * End of SLATM1 * END scalapack-2.0.2/TOOLS/LAPACK/slatms.f000644 000766 000024 00000112370 11654631032 017223 0ustar00juliestaff000000 000000 SUBROUTINE SLATMS( M, N, DIST, ISEED, SYM, D, MODE, COND, DMAX, $ KL, KU, PACK, A, LDA, WORK, INFO ) * * -- LAPACK test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER DIST, PACK, SYM INTEGER INFO, KL, KU, LDA, M, MODE, N REAL COND, DMAX * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL A( LDA, * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * SLATMS generates random matrices with specified singular values * (or symmetric/hermitian with specified eigenvalues) * for testing LAPACK programs. * * SLATMS operates by applying the following sequence of * operations: * * Set the diagonal to D, where D may be input or * computed according to MODE, COND, DMAX, and SYM * as described below. * * Generate a matrix with the appropriate band structure, by one * of two methods: * * Method A: * Generate a dense M x N matrix by multiplying D on the left * and the right by random unitary matrices, then: * * Reduce the bandwidth according to KL and KU, using * Householder transformations. * * Method B: * Convert the bandwidth-0 (i.e., diagonal) matrix to a * bandwidth-1 matrix using Givens rotations, "chasing" * out-of-band elements back, much as in QR; then * convert the bandwidth-1 to a bandwidth-2 matrix, etc. * Note that for reasonably small bandwidths (relative to * M and N) this requires less storage, as a dense matrix * is not generated. Also, for symmetric matrices, only * one triangle is generated. * * Method A is chosen if the bandwidth is a large fraction of the * order of the matrix, and LDA is at least M (so a dense * matrix can be stored.) Method B is chosen if the bandwidth * is small (< 1/2 N for symmetric, < .3 N+M for * non-symmetric), or LDA is less than M and not less than the * bandwidth. * * Pack the matrix if desired. Options specified by PACK are: * no packing * zero out upper half (if symmetric) * zero out lower half (if symmetric) * store the upper half columnwise (if symmetric or upper * triangular) * store the lower half columnwise (if symmetric or lower * triangular) * store the lower triangle in banded format (if symmetric * or lower triangular) * store the upper triangle in banded format (if symmetric * or upper triangular) * store the entire matrix in banded format * If Method B is chosen, and band format is specified, then the * matrix will be generated in the band format, so no repacking * will be necessary. * * Arguments * ========= * * M - INTEGER * The number of rows of A. Not modified. * * N - INTEGER * The number of columns of A. Not modified. * * DIST - CHARACTER*1 * On entry, DIST specifies the type of distribution to be used * to generate the random eigen-/singular values. * 'U' => UNIFORM( 0, 1 ) ( 'U' for uniform ) * 'S' => UNIFORM( -1, 1 ) ( 'S' for symmetric ) * 'N' => NORMAL( 0, 1 ) ( 'N' for normal ) * Not modified. * * ISEED - INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. They should lie between 0 and 4095 inclusive, * and ISEED(4) should be odd. The random number generator * uses a linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to SLATMS * to continue the same random number sequence. * Changed on exit. * * SYM - CHARACTER*1 * If SYM='S' or 'H', the generated matrix is symmetric, with * eigenvalues specified by D, COND, MODE, and DMAX; they * may be positive, negative, or zero. * If SYM='P', the generated matrix is symmetric, with * eigenvalues (= singular values) specified by D, COND, * MODE, and DMAX; they will not be negative. * If SYM='N', the generated matrix is nonsymmetric, with * singular values specified by D, COND, MODE, and DMAX; * they will not be negative. * Not modified. * * D - REAL array, dimension ( MIN( M , N ) ) * This array is used to specify the singular values or * eigenvalues of A (see SYM, above.) If MODE=0, then D is * assumed to contain the singular/eigenvalues, otherwise * they will be computed according to MODE, COND, and DMAX, * and placed in D. * Modified if MODE is nonzero. * * MODE - INTEGER * On entry this describes how the singular/eigenvalues are to * be specified: * MODE = 0 means use D as input * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * If SYM='S' or 'H', and MODE is neither 0, 6, nor -6, then * the elements of D will also be multiplied by a random * sign (i.e., +1 or -1.) * Not modified. * * COND - REAL * On entry, this is used as described under MODE above. * If used, it must be >= 1. Not modified. * * DMAX - REAL * If MODE is neither -6, 0 nor 6, the contents of D, as * computed according to MODE and COND, will be scaled by * DMAX / max(abs(D(i))); thus, the maximum absolute eigen- or * singular value (which is to say the norm) will be abs(DMAX). * Note that DMAX need not be positive: if DMAX is negative * (or zero), D will be scaled by a negative number (or zero). * Not modified. * * KL - INTEGER * This specifies the lower bandwidth of the matrix. For * example, KL=0 implies upper triangular, KL=1 implies upper * Hessenberg, and KL being at least M-1 means that the matrix * has full lower bandwidth. KL must equal KU if the matrix * is symmetric. * Not modified. * * KU - INTEGER * This specifies the upper bandwidth of the matrix. For * example, KU=0 implies lower triangular, KU=1 implies lower * Hessenberg, and KU being at least N-1 means that the matrix * has full upper bandwidth. KL must equal KU if the matrix * is symmetric. * Not modified. * * PACK - CHARACTER*1 * This specifies packing of matrix as follows: * 'N' => no packing * 'U' => zero out all subdiagonal entries (if symmetric) * 'L' => zero out all superdiagonal entries (if symmetric) * 'C' => store the upper triangle columnwise * (only if the matrix is symmetric or upper triangular) * 'R' => store the lower triangle columnwise * (only if the matrix is symmetric or lower triangular) * 'B' => store the lower triangle in band storage scheme * (only if matrix symmetric or lower triangular) * 'Q' => store the upper triangle in band storage scheme * (only if matrix symmetric or upper triangular) * 'Z' => store the entire matrix in band storage scheme * (pivoting can be provided for by using this * option to store A in the trailing rows of * the allocated storage) * * Using these options, the various LAPACK packed and banded * storage schemes can be obtained: * GB - use 'Z' * PB, SB or TB - use 'B' or 'Q' * PP, SP or TP - use 'C' or 'R' * * If two calls to SLATMS differ only in the PACK parameter, * they will generate mathematically equivalent matrices. * Not modified. * * A - REAL array, dimension ( LDA, N ) * On exit A is the desired test matrix. A is first generated * in full (unpacked) form, and then packed, if so specified * by PACK. Thus, the first M elements of the first N * columns will always be modified. If PACK specifies a * packed or banded storage scheme, all LDA elements of the * first N columns will be modified; the elements of the * array which do not correspond to elements of the generated * matrix are set to zero. * Modified. * * LDA - INTEGER * LDA specifies the first dimension of A as declared in the * calling program. If PACK='N', 'U', 'L', 'C', or 'R', then * LDA must be at least M. If PACK='B' or 'Q', then LDA must * be at least MIN( KL, M-1) (which is equal to MIN(KU,N-1)). * If PACK='Z', LDA must be large enough to hold the packed * array: MIN( KU, N-1) + MIN( KL, M-1) + 1. * Not modified. * * WORK - REAL array, dimension ( 3*MAX( N , M ) ) * Workspace. * Modified. * * INFO - INTEGER * Error code. On exit, INFO will be set to one of the * following values: * 0 => normal return * -1 => M negative or unequal to N and SYM='S', 'H', or 'P' * -2 => N negative * -3 => DIST illegal string * -5 => SYM illegal string * -7 => MODE not in range -6 to 6 * -8 => COND less than 1.0, and MODE neither -6, 0 nor 6 * -10 => KL negative * -11 => KU negative, or SYM='S' or 'H' and KU not equal to KL * -12 => PACK illegal string, or PACK='U' or 'L', and SYM='N'; * or PACK='C' or 'Q' and SYM='N' and KL is not zero; * or PACK='R' or 'B' and SYM='N' and KU is not zero; * or PACK='U', 'L', 'C', 'R', 'B', or 'Q', and M is not * N. * -14 => LDA is less than M, or PACK='Z' and LDA is less than * MIN(KU,N-1) + MIN(KL,M-1) + 1. * 1 => Error return from SLATM1 * 2 => Cannot scale to DMAX (max. sing. value is 0) * 3 => Error return from SLAGGE or SLAGSY * * ===================================================================== * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E0 ) REAL ONE PARAMETER ( ONE = 1.0E0 ) REAL TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663E+0 ) * .. * .. Local Scalars .. LOGICAL GIVENS, ILEXTR, ILTEMP, TOPDWN INTEGER I, IC, ICOL, IDIST, IENDCH, IINFO, IL, ILDA, $ IOFFG, IOFFST, IPACK, IPACKG, IR, IR1, IR2, $ IROW, IRSIGN, ISKEW, ISYM, ISYMPK, J, JC, JCH, $ JKL, JKU, JR, K, LLB, MINLDA, MNMIN, MR, NC, $ UUB REAL ALPHA, ANGLE, C, DUMMY, EXTRA, S, TEMP * .. * .. External Functions .. LOGICAL LSAME REAL SLARND EXTERNAL LSAME, SLARND * .. * .. External Subroutines .. EXTERNAL SCOPY, SLAGGE, SLAGSY, SLAROT, SLARTG, SLATM1, $ SLASET, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, COS, MAX, MIN, MOD, REAL, SIN * .. * .. Executable Statements .. * * 1) Decode and Test the input parameters. * Initialize flags & seed. * INFO = 0 * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Decode DIST * IF( LSAME( DIST, 'U' ) ) THEN IDIST = 1 ELSE IF( LSAME( DIST, 'S' ) ) THEN IDIST = 2 ELSE IF( LSAME( DIST, 'N' ) ) THEN IDIST = 3 ELSE IDIST = -1 END IF * * Decode SYM * IF( LSAME( SYM, 'N' ) ) THEN ISYM = 1 IRSIGN = 0 ELSE IF( LSAME( SYM, 'P' ) ) THEN ISYM = 2 IRSIGN = 0 ELSE IF( LSAME( SYM, 'S' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE IF( LSAME( SYM, 'H' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE ISYM = -1 END IF * * Decode PACK * ISYMPK = 0 IF( LSAME( PACK, 'N' ) ) THEN IPACK = 0 ELSE IF( LSAME( PACK, 'U' ) ) THEN IPACK = 1 ISYMPK = 1 ELSE IF( LSAME( PACK, 'L' ) ) THEN IPACK = 2 ISYMPK = 1 ELSE IF( LSAME( PACK, 'C' ) ) THEN IPACK = 3 ISYMPK = 2 ELSE IF( LSAME( PACK, 'R' ) ) THEN IPACK = 4 ISYMPK = 3 ELSE IF( LSAME( PACK, 'B' ) ) THEN IPACK = 5 ISYMPK = 3 ELSE IF( LSAME( PACK, 'Q' ) ) THEN IPACK = 6 ISYMPK = 2 ELSE IF( LSAME( PACK, 'Z' ) ) THEN IPACK = 7 ELSE IPACK = -1 END IF * * Set certain internal parameters * MNMIN = MIN( M, N ) LLB = MIN( KL, M-1 ) UUB = MIN( KU, N-1 ) MR = MIN( M, N+LLB ) NC = MIN( N, M+UUB ) IROW = 1 ICOL = 1 * IF( IPACK.EQ.5 .OR. IPACK.EQ.6 ) THEN MINLDA = UUB + 1 ELSE IF( IPACK.EQ.7 ) THEN MINLDA = LLB + UUB + 1 ELSE MINLDA = M END IF * * Use Givens rotation method if bandwidth small enough, * or if LDA is too small to store the matrix unpacked. * GIVENS = .FALSE. IF( ISYM.EQ.1 ) THEN IF( REAL( LLB+UUB ).LT.0.3*REAL( MAX( 1, MR+NC ) ) ) $ GIVENS = .TRUE. ELSE IF( 2*LLB.LT.M ) $ GIVENS = .TRUE. END IF IF( LDA.LT.M .AND. LDA.GE.MINLDA ) $ GIVENS = .TRUE. * * Set INFO if an error * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( M.NE.N .AND. ISYM.NE.1 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( IDIST.EQ.-1 ) THEN INFO = -3 ELSE IF( ISYM.EQ.-1 ) THEN INFO = -5 ELSE IF( ABS( MODE ).GT.6 ) THEN INFO = -7 ELSE IF( ( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) .AND. COND.LT.ONE ) $ THEN INFO = -8 ELSE IF( KL.LT.0 ) THEN INFO = -10 ELSE IF( KU.LT.0 .OR. ( ISYM.NE.1 .AND. KL.NE.KU ) ) THEN INFO = -11 ELSE IF( IPACK.EQ.-1 .OR. ( ISYMPK.EQ.1 .AND. ISYM.EQ.1 ) .OR. $ ( ISYMPK.EQ.2 .AND. ISYM.EQ.1 .AND. KL.GT.0 ) .OR. $ ( ISYMPK.EQ.3 .AND. ISYM.EQ.1 .AND. KU.GT.0 ) .OR. $ ( ISYMPK.NE.0 .AND. M.NE.N ) ) THEN INFO = -12 ELSE IF( LDA.LT.MAX( 1, MINLDA ) ) THEN INFO = -14 END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLATMS', -INFO ) RETURN END IF * * Initialize random number generator * DO 10 I = 1, 4 ISEED( I ) = MOD( ABS( ISEED( I ) ), 4096 ) 10 CONTINUE * IF( MOD( ISEED( 4 ), 2 ).NE.1 ) $ ISEED( 4 ) = ISEED( 4 ) + 1 * * 2) Set up D if indicated. * * Compute D according to COND and MODE * CALL SLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, MNMIN, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 1 RETURN END IF * * Choose Top-Down if D is (apparently) increasing, * Bottom-Up if D is (apparently) decreasing. * IF( ABS( D( 1 ) ).LE.ABS( D( MNMIN ) ) ) THEN TOPDWN = .TRUE. ELSE TOPDWN = .FALSE. END IF * IF( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) THEN * * Scale by DMAX * TEMP = ABS( D( 1 ) ) DO 20 I = 2, MNMIN TEMP = MAX( TEMP, ABS( D( I ) ) ) 20 CONTINUE * IF( TEMP.GT.ZERO ) THEN ALPHA = DMAX / TEMP ELSE INFO = 2 RETURN END IF * CALL SSCAL( MNMIN, ALPHA, D, 1 ) * END IF * * 3) Generate Banded Matrix using Givens rotations. * Also the special case of UUB=LLB=0 * * Compute Addressing constants to cover all * storage formats. Whether GE, SY, GB, or SB, * upper or lower triangle or both, * the (i,j)-th element is in * A( i - ISKEW*j + IOFFST, j ) * IF( IPACK.GT.4 ) THEN ILDA = LDA - 1 ISKEW = 1 IF( IPACK.GT.5 ) THEN IOFFST = UUB + 1 ELSE IOFFST = 1 END IF ELSE ILDA = LDA ISKEW = 0 IOFFST = 0 END IF * * IPACKG is the format that the matrix is generated in. If this is * different from IPACK, then the matrix must be repacked at the * end. It also signals how to compute the norm, for scaling. * IPACKG = 0 CALL SLASET( 'Full', LDA, N, ZERO, ZERO, A, LDA ) * * Diagonal Matrix -- We are done, unless it * is to be stored SP/PP/TP (PACK='R' or 'C') * IF( LLB.EQ.0 .AND. UUB.EQ.0 ) THEN CALL SCOPY( MNMIN, D, 1, A( 1-ISKEW+IOFFST, 1 ), ILDA+1 ) IF( IPACK.LE.2 .OR. IPACK.GE.5 ) $ IPACKG = IPACK * ELSE IF( GIVENS ) THEN * * Check whether to use Givens rotations, * Householder transformations, or nothing. * IF( ISYM.EQ.1 ) THEN * * Non-symmetric -- A = U D V * IF( IPACK.GT.4 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF * CALL SCOPY( MNMIN, D, 1, A( 1-ISKEW+IOFFST, 1 ), ILDA+1 ) * IF( TOPDWN ) THEN JKL = 0 DO 50 JKU = 1, UUB * * Transform from bandwidth JKL, JKU-1 to JKL, JKU * * Last row actually rotated is M * Last column actually rotated is MIN( M+JKU, N ) * DO 40 JR = 1, MIN( M+JKU, N ) + JKL - 1 EXTRA = ZERO ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) ICOL = MAX( 1, JR-JKL ) IF( JR.LT.M ) THEN IL = MIN( N, JR+JKU ) + 1 - ICOL CALL SLAROT( .TRUE., JR.GT.JKL, .FALSE., IL, C, $ S, A( JR-ISKEW*ICOL+IOFFST, ICOL ), $ ILDA, EXTRA, DUMMY ) END IF * * Chase "EXTRA" back up * IR = JR IC = ICOL DO 30 JCH = JR - JKL, 1, -JKL - JKU IF( IR.LT.M ) THEN CALL SLARTG( A( IR+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), EXTRA, C, S, DUMMY ) END IF IROW = MAX( 1, JCH-JKU ) IL = IR + 2 - IROW TEMP = ZERO ILTEMP = JCH.GT.JKU CALL SLAROT( .FALSE., ILTEMP, .TRUE., IL, C, -S, $ A( IROW-ISKEW*IC+IOFFST, IC ), $ ILDA, TEMP, EXTRA ) IF( ILTEMP ) THEN CALL SLARTG( A( IROW+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), TEMP, C, S, DUMMY ) ICOL = MAX( 1, JCH-JKU-JKL ) IL = IC + 2 - ICOL EXTRA = ZERO CALL SLAROT( .TRUE., JCH.GT.JKU+JKL, .TRUE., $ IL, C, -S, A( IROW-ISKEW*ICOL+ $ IOFFST, ICOL ), ILDA, EXTRA, $ TEMP ) IC = ICOL IR = IROW END IF 30 CONTINUE 40 CONTINUE 50 CONTINUE * JKU = UUB DO 80 JKL = 1, LLB * * Transform from bandwidth JKL-1, JKU to JKL, JKU * DO 70 JC = 1, MIN( N+JKL, M ) + JKU - 1 EXTRA = ZERO ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) IROW = MAX( 1, JC-JKU ) IF( JC.LT.N ) THEN IL = MIN( M, JC+JKL ) + 1 - IROW CALL SLAROT( .FALSE., JC.GT.JKU, .FALSE., IL, C, $ S, A( IROW-ISKEW*JC+IOFFST, JC ), $ ILDA, EXTRA, DUMMY ) END IF * * Chase "EXTRA" back up * IC = JC IR = IROW DO 60 JCH = JC - JKU, 1, -JKL - JKU IF( IC.LT.N ) THEN CALL SLARTG( A( IR+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), EXTRA, C, S, DUMMY ) END IF ICOL = MAX( 1, JCH-JKL ) IL = IC + 2 - ICOL TEMP = ZERO ILTEMP = JCH.GT.JKL CALL SLAROT( .TRUE., ILTEMP, .TRUE., IL, C, -S, $ A( IR-ISKEW*ICOL+IOFFST, ICOL ), $ ILDA, TEMP, EXTRA ) IF( ILTEMP ) THEN CALL SLARTG( A( IR+1-ISKEW*( ICOL+1 )+IOFFST, $ ICOL+1 ), TEMP, C, S, DUMMY ) IROW = MAX( 1, JCH-JKL-JKU ) IL = IR + 2 - IROW EXTRA = ZERO CALL SLAROT( .FALSE., JCH.GT.JKL+JKU, .TRUE., $ IL, C, -S, A( IROW-ISKEW*ICOL+ $ IOFFST, ICOL ), ILDA, EXTRA, $ TEMP ) IC = ICOL IR = IROW END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE * ELSE * * Bottom-Up -- Start at the bottom right. * JKL = 0 DO 110 JKU = 1, UUB * * Transform from bandwidth JKL, JKU-1 to JKL, JKU * * First row actually rotated is M * First column actually rotated is MIN( M+JKU, N ) * IENDCH = MIN( M, N+JKL ) - 1 DO 100 JC = MIN( M+JKU, N ) - 1, 1 - JKL, -1 EXTRA = ZERO ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) IROW = MAX( 1, JC-JKU+1 ) IF( JC.GT.0 ) THEN IL = MIN( M, JC+JKL+1 ) + 1 - IROW CALL SLAROT( .FALSE., .FALSE., JC+JKL.LT.M, IL, $ C, S, A( IROW-ISKEW*JC+IOFFST, $ JC ), ILDA, DUMMY, EXTRA ) END IF * * Chase "EXTRA" back down * IC = JC DO 90 JCH = JC + JKL, IENDCH, JKL + JKU ILEXTR = IC.GT.0 IF( ILEXTR ) THEN CALL SLARTG( A( JCH-ISKEW*IC+IOFFST, IC ), $ EXTRA, C, S, DUMMY ) END IF IC = MAX( 1, IC ) ICOL = MIN( N-1, JCH+JKU ) ILTEMP = JCH + JKU.LT.N TEMP = ZERO CALL SLAROT( .TRUE., ILEXTR, ILTEMP, ICOL+2-IC, $ C, S, A( JCH-ISKEW*IC+IOFFST, IC ), $ ILDA, EXTRA, TEMP ) IF( ILTEMP ) THEN CALL SLARTG( A( JCH-ISKEW*ICOL+IOFFST, $ ICOL ), TEMP, C, S, DUMMY ) IL = MIN( IENDCH, JCH+JKL+JKU ) + 2 - JCH EXTRA = ZERO CALL SLAROT( .FALSE., .TRUE., $ JCH+JKL+JKU.LE.IENDCH, IL, C, S, $ A( JCH-ISKEW*ICOL+IOFFST, $ ICOL ), ILDA, TEMP, EXTRA ) IC = ICOL END IF 90 CONTINUE 100 CONTINUE 110 CONTINUE * JKU = UUB DO 140 JKL = 1, LLB * * Transform from bandwidth JKL-1, JKU to JKL, JKU * * First row actually rotated is MIN( N+JKL, M ) * First column actually rotated is N * IENDCH = MIN( N, M+JKU ) - 1 DO 130 JR = MIN( N+JKL, M ) - 1, 1 - JKU, -1 EXTRA = ZERO ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) ICOL = MAX( 1, JR-JKL+1 ) IF( JR.GT.0 ) THEN IL = MIN( N, JR+JKU+1 ) + 1 - ICOL CALL SLAROT( .TRUE., .FALSE., JR+JKU.LT.N, IL, $ C, S, A( JR-ISKEW*ICOL+IOFFST, $ ICOL ), ILDA, DUMMY, EXTRA ) END IF * * Chase "EXTRA" back down * IR = JR DO 120 JCH = JR + JKU, IENDCH, JKL + JKU ILEXTR = IR.GT.0 IF( ILEXTR ) THEN CALL SLARTG( A( IR-ISKEW*JCH+IOFFST, JCH ), $ EXTRA, C, S, DUMMY ) END IF IR = MAX( 1, IR ) IROW = MIN( M-1, JCH+JKL ) ILTEMP = JCH + JKL.LT.M TEMP = ZERO CALL SLAROT( .FALSE., ILEXTR, ILTEMP, IROW+2-IR, $ C, S, A( IR-ISKEW*JCH+IOFFST, $ JCH ), ILDA, EXTRA, TEMP ) IF( ILTEMP ) THEN CALL SLARTG( A( IROW-ISKEW*JCH+IOFFST, JCH ), $ TEMP, C, S, DUMMY ) IL = MIN( IENDCH, JCH+JKL+JKU ) + 2 - JCH EXTRA = ZERO CALL SLAROT( .TRUE., .TRUE., $ JCH+JKL+JKU.LE.IENDCH, IL, C, S, $ A( IROW-ISKEW*JCH+IOFFST, JCH ), $ ILDA, TEMP, EXTRA ) IR = IROW END IF 120 CONTINUE 130 CONTINUE 140 CONTINUE END IF * ELSE * * Symmetric -- A = U D U' * IPACKG = IPACK IOFFG = IOFFST * IF( TOPDWN ) THEN * * Top-Down -- Generate Upper triangle only * IF( IPACK.GE.5 ) THEN IPACKG = 6 IOFFG = UUB + 1 ELSE IPACKG = 1 END IF CALL SCOPY( MNMIN, D, 1, A( 1-ISKEW+IOFFG, 1 ), ILDA+1 ) * DO 170 K = 1, UUB DO 160 JC = 1, N - 1 IROW = MAX( 1, JC-K ) IL = MIN( JC+1, K+2 ) EXTRA = ZERO TEMP = A( JC-ISKEW*( JC+1 )+IOFFG, JC+1 ) ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE ) S = SIN( ANGLE ) CALL SLAROT( .FALSE., JC.GT.K, .TRUE., IL, C, S, $ A( IROW-ISKEW*JC+IOFFG, JC ), ILDA, $ EXTRA, TEMP ) CALL SLAROT( .TRUE., .TRUE., .FALSE., $ MIN( K, N-JC )+1, C, S, $ A( ( 1-ISKEW )*JC+IOFFG, JC ), ILDA, $ TEMP, DUMMY ) * * Chase EXTRA back up the matrix * ICOL = JC DO 150 JCH = JC - K, 1, -K CALL SLARTG( A( JCH+1-ISKEW*( ICOL+1 )+IOFFG, $ ICOL+1 ), EXTRA, C, S, DUMMY ) TEMP = A( JCH-ISKEW*( JCH+1 )+IOFFG, JCH+1 ) CALL SLAROT( .TRUE., .TRUE., .TRUE., K+2, C, -S, $ A( ( 1-ISKEW )*JCH+IOFFG, JCH ), $ ILDA, TEMP, EXTRA ) IROW = MAX( 1, JCH-K ) IL = MIN( JCH+1, K+2 ) EXTRA = ZERO CALL SLAROT( .FALSE., JCH.GT.K, .TRUE., IL, C, $ -S, A( IROW-ISKEW*JCH+IOFFG, JCH ), $ ILDA, EXTRA, TEMP ) ICOL = JCH 150 CONTINUE 160 CONTINUE 170 CONTINUE * * If we need lower triangle, copy from upper. Note that * the order of copying is chosen to work for 'q' -> 'b' * IF( IPACK.NE.IPACKG .AND. IPACK.NE.3 ) THEN DO 190 JC = 1, N IROW = IOFFST - ISKEW*JC DO 180 JR = JC, MIN( N, JC+UUB ) A( JR+IROW, JC ) = A( JC-ISKEW*JR+IOFFG, JR ) 180 CONTINUE 190 CONTINUE IF( IPACK.EQ.5 ) THEN DO 210 JC = N - UUB + 1, N DO 200 JR = N + 2 - JC, UUB + 1 A( JR, JC ) = ZERO 200 CONTINUE 210 CONTINUE END IF IF( IPACKG.EQ.6 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF END IF ELSE * * Bottom-Up -- Generate Lower triangle only * IF( IPACK.GE.5 ) THEN IPACKG = 5 IF( IPACK.EQ.6 ) $ IOFFG = 1 ELSE IPACKG = 2 END IF CALL SCOPY( MNMIN, D, 1, A( 1-ISKEW+IOFFG, 1 ), ILDA+1 ) * DO 240 K = 1, UUB DO 230 JC = N - 1, 1, -1 IL = MIN( N+1-JC, K+2 ) EXTRA = ZERO TEMP = A( 1+( 1-ISKEW )*JC+IOFFG, JC ) ANGLE = TWOPI*SLARND( 1, ISEED ) C = COS( ANGLE ) S = -SIN( ANGLE ) CALL SLAROT( .FALSE., .TRUE., N-JC.GT.K, IL, C, S, $ A( ( 1-ISKEW )*JC+IOFFG, JC ), ILDA, $ TEMP, EXTRA ) ICOL = MAX( 1, JC-K+1 ) CALL SLAROT( .TRUE., .FALSE., .TRUE., JC+2-ICOL, C, $ S, A( JC-ISKEW*ICOL+IOFFG, ICOL ), $ ILDA, DUMMY, TEMP ) * * Chase EXTRA back down the matrix * ICOL = JC DO 220 JCH = JC + K, N - 1, K CALL SLARTG( A( JCH-ISKEW*ICOL+IOFFG, ICOL ), $ EXTRA, C, S, DUMMY ) TEMP = A( 1+( 1-ISKEW )*JCH+IOFFG, JCH ) CALL SLAROT( .TRUE., .TRUE., .TRUE., K+2, C, S, $ A( JCH-ISKEW*ICOL+IOFFG, ICOL ), $ ILDA, EXTRA, TEMP ) IL = MIN( N+1-JCH, K+2 ) EXTRA = ZERO CALL SLAROT( .FALSE., .TRUE., N-JCH.GT.K, IL, C, $ S, A( ( 1-ISKEW )*JCH+IOFFG, JCH ), $ ILDA, TEMP, EXTRA ) ICOL = JCH 220 CONTINUE 230 CONTINUE 240 CONTINUE * * If we need upper triangle, copy from lower. Note that * the order of copying is chosen to work for 'b' -> 'q' * IF( IPACK.NE.IPACKG .AND. IPACK.NE.4 ) THEN DO 260 JC = N, 1, -1 IROW = IOFFST - ISKEW*JC DO 250 JR = JC, MAX( 1, JC-UUB ), -1 A( JR+IROW, JC ) = A( JC-ISKEW*JR+IOFFG, JR ) 250 CONTINUE 260 CONTINUE IF( IPACK.EQ.6 ) THEN DO 280 JC = 1, UUB DO 270 JR = 1, UUB + 1 - JC A( JR, JC ) = ZERO 270 CONTINUE 280 CONTINUE END IF IF( IPACKG.EQ.5 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF END IF END IF END IF * ELSE * * 4) Generate Banded Matrix by first * Rotating by random Unitary matrices, * then reducing the bandwidth using Householder * transformations. * * Note: we should get here only if LDA .ge. N * IF( ISYM.EQ.1 ) THEN * * Non-symmetric -- A = U D V * CALL SLAGGE( MR, NC, LLB, UUB, D, A, LDA, ISEED, WORK, $ IINFO ) ELSE * * Symmetric -- A = U D U' * CALL SLAGSY( M, LLB, D, A, LDA, ISEED, WORK, IINFO ) * END IF IF( IINFO.NE.0 ) THEN INFO = 3 RETURN END IF END IF * * 5) Pack the matrix * IF( IPACK.NE.IPACKG ) THEN IF( IPACK.EQ.1 ) THEN * * 'U' -- Upper triangular, not packed * DO 300 J = 1, M DO 290 I = J + 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE * ELSE IF( IPACK.EQ.2 ) THEN * * 'L' -- Lower triangular, not packed * DO 320 J = 2, M DO 310 I = 1, J - 1 A( I, J ) = ZERO 310 CONTINUE 320 CONTINUE * ELSE IF( IPACK.EQ.3 ) THEN * * 'C' -- Upper triangle packed Columnwise. * ICOL = 1 IROW = 0 DO 340 J = 1, M DO 330 I = 1, J IROW = IROW + 1 IF( IROW.GT.LDA ) THEN IROW = 1 ICOL = ICOL + 1 END IF A( IROW, ICOL ) = A( I, J ) 330 CONTINUE 340 CONTINUE * ELSE IF( IPACK.EQ.4 ) THEN * * 'R' -- Lower triangle packed Columnwise. * ICOL = 1 IROW = 0 DO 360 J = 1, M DO 350 I = J, M IROW = IROW + 1 IF( IROW.GT.LDA ) THEN IROW = 1 ICOL = ICOL + 1 END IF A( IROW, ICOL ) = A( I, J ) 350 CONTINUE 360 CONTINUE * ELSE IF( IPACK.GE.5 ) THEN * * 'B' -- The lower triangle is packed as a band matrix. * 'Q' -- The upper triangle is packed as a band matrix. * 'Z' -- The whole matrix is packed as a band matrix. * IF( IPACK.EQ.5 ) $ UUB = 0 IF( IPACK.EQ.6 ) $ LLB = 0 * DO 380 J = 1, UUB DO 370 I = MIN( J+LLB, M ), 1, -1 A( I-J+UUB+1, J ) = A( I, J ) 370 CONTINUE 380 CONTINUE * DO 400 J = UUB + 2, N DO 390 I = J - UUB, MIN( J+LLB, M ) A( I-J+UUB+1, J ) = A( I, J ) 390 CONTINUE 400 CONTINUE END IF * * If packed, zero out extraneous elements. * * Symmetric/Triangular Packed -- * zero out everything after A(IROW,ICOL) * IF( IPACK.EQ.3 .OR. IPACK.EQ.4 ) THEN DO 420 JC = ICOL, M DO 410 JR = IROW + 1, LDA A( JR, JC ) = ZERO 410 CONTINUE IROW = 0 420 CONTINUE * ELSE IF( IPACK.GE.5 ) THEN * * Packed Band -- * 1st row is now in A( UUB+2-j, j), zero above it * m-th row is now in A( M+UUB-j,j), zero below it * last non-zero diagonal is now in A( UUB+LLB+1,j ), * zero below it, too. * IR1 = UUB + LLB + 2 IR2 = UUB + M + 2 DO 450 JC = 1, N DO 430 JR = 1, UUB + 1 - JC A( JR, JC ) = ZERO 430 CONTINUE DO 440 JR = MAX( 1, MIN( IR1, IR2-JC ) ), LDA A( JR, JC ) = ZERO 440 CONTINUE 450 CONTINUE END IF END IF * RETURN * * End of SLATMS * END scalapack-2.0.2/TOOLS/LAPACK/zlagge.f000644 000766 000024 00000022502 10604316307 017165 0ustar00juliestaff000000 000000 SUBROUTINE ZLAGGE( M, N, KL, KU, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDA, M, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION D( * ) COMPLEX*16 A( LDA, * ), WORK( * ) * .. * * Purpose * ======= * * ZLAGGE generates a complex general m by n matrix A, by pre- and post- * multiplying a real diagonal matrix D with random unitary matrices: * A = U*D*V. The lower and upper bandwidths may then be reduced to * kl and ku by additional unitary transformations. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= KL <= M-1. * * KU (input) INTEGER * The number of nonzero superdiagonals within the band of A. * 0 <= KU <= N-1. * * D (input) DOUBLE PRECISION array, dimension (min(M,N)) * The diagonal elements of the diagonal matrix D. * * A (output) COMPLEX*16 array, dimension (LDA,N) * The generated m by n matrix A. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= M. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) COMPLEX*16 array, dimension (M+N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J DOUBLE PRECISION WN COMPLEX*16 TAU, WA, WB * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEMV, ZGERC, ZLACGV, ZLARNV, ZSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN * .. * .. External Functions .. DOUBLE PRECISION DZNRM2 EXTERNAL DZNRM2 * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( KL.LT.0 .OR. KL.GT.M-1 ) THEN INFO = -3 ELSE IF( KU.LT.0 .OR. KU.GT.N-1 ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'ZLAGGE', -INFO ) RETURN END IF * * initialize A to diagonal matrix * DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, MIN( M, N ) A( I, I ) = D( I ) 30 CONTINUE * * pre- and post-multiply A by random unitary matrices * DO 40 I = MIN( M, N ), 1, -1 IF( I.LT.M ) THEN * * generate random reflection * CALL ZLARNV( 3, ISEED, M-I+1, WORK ) WN = DZNRM2( M-I+1, WORK, 1 ) WA = ( WN / ABS( WORK( 1 ) ) )*WORK( 1 ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL ZSCAL( M-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = DBLE( WB / WA ) END IF * * multiply A(i:m,i:n) by random reflection from the left * CALL ZGEMV( 'Conjugate transpose', M-I+1, N-I+1, ONE, $ A( I, I ), LDA, WORK, 1, ZERO, WORK( M+1 ), 1 ) CALL ZGERC( M-I+1, N-I+1, -TAU, WORK, 1, WORK( M+1 ), 1, $ A( I, I ), LDA ) END IF IF( I.LT.N ) THEN * * generate random reflection * CALL ZLARNV( 3, ISEED, N-I+1, WORK ) WN = DZNRM2( N-I+1, WORK, 1 ) WA = ( WN / ABS( WORK( 1 ) ) )*WORK( 1 ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL ZSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = DBLE( WB / WA ) END IF * * multiply A(i:m,i:n) by random reflection from the right * CALL ZGEMV( 'No transpose', M-I+1, N-I+1, ONE, A( I, I ), $ LDA, WORK, 1, ZERO, WORK( N+1 ), 1 ) CALL ZGERC( M-I+1, N-I+1, -TAU, WORK( N+1 ), 1, WORK, 1, $ A( I, I ), LDA ) END IF 40 CONTINUE * * Reduce number of subdiagonals to KL and number of superdiagonals * to KU * DO 70 I = 1, MAX( M-1-KL, N-1-KU ) IF( KL.LE.KU ) THEN * * annihilate subdiagonal elements first (necessary if KL = 0) * IF( I.LE.MIN( M-1-KL, N ) ) THEN * * generate reflection to annihilate A(kl+i+1:m,i) * WN = DZNRM2( M-KL-I+1, A( KL+I, I ), 1 ) WA = ( WN / ABS( A( KL+I, I ) ) )*A( KL+I, I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( KL+I, I ) + WA CALL ZSCAL( M-KL-I, ONE / WB, A( KL+I+1, I ), 1 ) A( KL+I, I ) = ONE TAU = DBLE( WB / WA ) END IF * * apply reflection to A(kl+i:m,i+1:n) from the left * CALL ZGEMV( 'Conjugate transpose', M-KL-I+1, N-I, ONE, $ A( KL+I, I+1 ), LDA, A( KL+I, I ), 1, ZERO, $ WORK, 1 ) CALL ZGERC( M-KL-I+1, N-I, -TAU, A( KL+I, I ), 1, WORK, $ 1, A( KL+I, I+1 ), LDA ) A( KL+I, I ) = -WA END IF * IF( I.LE.MIN( N-1-KU, M ) ) THEN * * generate reflection to annihilate A(i,ku+i+1:n) * WN = DZNRM2( N-KU-I+1, A( I, KU+I ), LDA ) WA = ( WN / ABS( A( I, KU+I ) ) )*A( I, KU+I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( I, KU+I ) + WA CALL ZSCAL( N-KU-I, ONE / WB, A( I, KU+I+1 ), LDA ) A( I, KU+I ) = ONE TAU = DBLE( WB / WA ) END IF * * apply reflection to A(i+1:m,ku+i:n) from the right * CALL ZLACGV( N-KU-I+1, A( I, KU+I ), LDA ) CALL ZGEMV( 'No transpose', M-I, N-KU-I+1, ONE, $ A( I+1, KU+I ), LDA, A( I, KU+I ), LDA, ZERO, $ WORK, 1 ) CALL ZGERC( M-I, N-KU-I+1, -TAU, WORK, 1, A( I, KU+I ), $ LDA, A( I+1, KU+I ), LDA ) A( I, KU+I ) = -WA END IF ELSE * * annihilate superdiagonal elements first (necessary if * KU = 0) * IF( I.LE.MIN( N-1-KU, M ) ) THEN * * generate reflection to annihilate A(i,ku+i+1:n) * WN = DZNRM2( N-KU-I+1, A( I, KU+I ), LDA ) WA = ( WN / ABS( A( I, KU+I ) ) )*A( I, KU+I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( I, KU+I ) + WA CALL ZSCAL( N-KU-I, ONE / WB, A( I, KU+I+1 ), LDA ) A( I, KU+I ) = ONE TAU = DBLE( WB / WA ) END IF * * apply reflection to A(i+1:m,ku+i:n) from the right * CALL ZLACGV( N-KU-I+1, A( I, KU+I ), LDA ) CALL ZGEMV( 'No transpose', M-I, N-KU-I+1, ONE, $ A( I+1, KU+I ), LDA, A( I, KU+I ), LDA, ZERO, $ WORK, 1 ) CALL ZGERC( M-I, N-KU-I+1, -TAU, WORK, 1, A( I, KU+I ), $ LDA, A( I+1, KU+I ), LDA ) A( I, KU+I ) = -WA END IF * IF( I.LE.MIN( M-1-KL, N ) ) THEN * * generate reflection to annihilate A(kl+i+1:m,i) * WN = DZNRM2( M-KL-I+1, A( KL+I, I ), 1 ) WA = ( WN / ABS( A( KL+I, I ) ) )*A( KL+I, I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( KL+I, I ) + WA CALL ZSCAL( M-KL-I, ONE / WB, A( KL+I+1, I ), 1 ) A( KL+I, I ) = ONE TAU = DBLE( WB / WA ) END IF * * apply reflection to A(kl+i:m,i+1:n) from the left * CALL ZGEMV( 'Conjugate transpose', M-KL-I+1, N-I, ONE, $ A( KL+I, I+1 ), LDA, A( KL+I, I ), 1, ZERO, $ WORK, 1 ) CALL ZGERC( M-KL-I+1, N-I, -TAU, A( KL+I, I ), 1, WORK, $ 1, A( KL+I, I+1 ), LDA ) A( KL+I, I ) = -WA END IF END IF * DO 50 J = KL + I + 1, M A( J, I ) = ZERO 50 CONTINUE * DO 60 J = KU + I + 1, N A( I, J ) = ZERO 60 CONTINUE 70 CONTINUE RETURN * * End of ZLAGGE * END scalapack-2.0.2/TOOLS/LAPACK/zlaghe.f000644 000766 000024 00000013516 10604316307 017173 0ustar00juliestaff000000 000000 SUBROUTINE ZLAGHE( N, K, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, K, LDA, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION D( * ) COMPLEX*16 A( LDA, * ), WORK( * ) * .. * * Purpose * ======= * * ZLAGHE generates a complex hermitian matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random unitary matrix: * A = U*D*U'. The semi-bandwidth may then be reduced to k by additional * unitary transformations. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * K (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * * D (input) DOUBLE PRECISION array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (output) COMPLEX*16 array, dimension (LDA,N) * The generated n by n hermitian matrix A (the full matrix is * stored). * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= N. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) COMPLEX*16 array, dimension (2*N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE, HALF PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ), $ HALF = ( 0.5D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J DOUBLE PRECISION WN COMPLEX*16 ALPHA, TAU, WA, WB * .. * .. External Subroutines .. EXTERNAL XERBLA, ZAXPY, ZGEMV, ZGERC, ZHEMV, ZHER2, $ ZLARNV, ZSCAL * .. * .. External Functions .. DOUBLE PRECISION DZNRM2 COMPLEX*16 ZDOTC EXTERNAL DZNRM2, ZDOTC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, MAX * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'ZLAGHE', -INFO ) RETURN END IF * * initialize lower triangle of A to diagonal matrix * DO 20 J = 1, N DO 10 I = J + 1, N A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, N A( I, I ) = D( I ) 30 CONTINUE * * Generate lower triangle of hermitian matrix * DO 40 I = N - 1, 1, -1 * * generate random reflection * CALL ZLARNV( 3, ISEED, N-I+1, WORK ) WN = DZNRM2( N-I+1, WORK, 1 ) WA = ( WN / ABS( WORK( 1 ) ) )*WORK( 1 ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL ZSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = DBLE( WB / WA ) END IF * * apply random reflection to A(i:n,i:n) from the left * and the right * * compute y := tau * A * u * CALL ZHEMV( 'Lower', N-I+1, TAU, A( I, I ), LDA, WORK, 1, ZERO, $ WORK( N+1 ), 1 ) * * compute v := y - 1/2 * tau * ( y, u ) * u * ALPHA = -HALF*TAU*ZDOTC( N-I+1, WORK( N+1 ), 1, WORK, 1 ) CALL ZAXPY( N-I+1, ALPHA, WORK, 1, WORK( N+1 ), 1 ) * * apply the transformation as a rank-2 update to A(i:n,i:n) * CALL ZHER2( 'Lower', N-I+1, -ONE, WORK, 1, WORK( N+1 ), 1, $ A( I, I ), LDA ) 40 CONTINUE * * Reduce number of subdiagonals to K * DO 60 I = 1, N - 1 - K * * generate reflection to annihilate A(k+i+1:n,i) * WN = DZNRM2( N-K-I+1, A( K+I, I ), 1 ) WA = ( WN / ABS( A( K+I, I ) ) )*A( K+I, I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( K+I, I ) + WA CALL ZSCAL( N-K-I, ONE / WB, A( K+I+1, I ), 1 ) A( K+I, I ) = ONE TAU = DBLE( WB / WA ) END IF * * apply reflection to A(k+i:n,i+1:k+i-1) from the left * CALL ZGEMV( 'Conjugate transpose', N-K-I+1, K-1, ONE, $ A( K+I, I+1 ), LDA, A( K+I, I ), 1, ZERO, WORK, 1 ) CALL ZGERC( N-K-I+1, K-1, -TAU, A( K+I, I ), 1, WORK, 1, $ A( K+I, I+1 ), LDA ) * * apply reflection to A(k+i:n,k+i:n) from the left and the right * * compute y := tau * A * u * CALL ZHEMV( 'Lower', N-K-I+1, TAU, A( K+I, K+I ), LDA, $ A( K+I, I ), 1, ZERO, WORK, 1 ) * * compute v := y - 1/2 * tau * ( y, u ) * u * ALPHA = -HALF*TAU*ZDOTC( N-K-I+1, WORK, 1, A( K+I, I ), 1 ) CALL ZAXPY( N-K-I+1, ALPHA, A( K+I, I ), 1, WORK, 1 ) * * apply hermitian rank-2 update to A(k+i:n,k+i:n) * CALL ZHER2( 'Lower', N-K-I+1, -ONE, A( K+I, I ), 1, WORK, 1, $ A( K+I, K+I ), LDA ) * A( K+I, I ) = -WA DO 50 J = K + I + 1, N A( J, I ) = ZERO 50 CONTINUE 60 CONTINUE * * Store full hermitian matrix * DO 80 J = 1, N DO 70 I = J + 1, N A( J, I ) = DCONJG( A( I, J ) ) 70 CONTINUE 80 CONTINUE RETURN * * End of ZLAGHE * END scalapack-2.0.2/TOOLS/LAPACK/zlagsy.f000644 000766 000024 00000014771 10604316307 017236 0ustar00juliestaff000000 000000 SUBROUTINE ZLAGSY( N, K, D, A, LDA, ISEED, WORK, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, K, LDA, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION D( * ) COMPLEX*16 A( LDA, * ), WORK( * ) * .. * * Purpose * ======= * * ZLAGSY generates a complex symmetric matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random unitary matrix: * A = U*D*U**T. The semi-bandwidth may then be reduced to k by * additional unitary transformations. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * K (input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * * D (input) DOUBLE PRECISION array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (output) COMPLEX*16 array, dimension (LDA,N) * The generated n by n symmetric matrix A (the full matrix is * stored). * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= N. * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * WORK (workspace) COMPLEX*16 array, dimension (2*N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE, HALF PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ), $ HALF = ( 0.5D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, II, J, JJ DOUBLE PRECISION WN COMPLEX*16 ALPHA, TAU, WA, WB * .. * .. External Subroutines .. EXTERNAL XERBLA, ZAXPY, ZGEMV, ZGERC, ZLACGV, ZLARNV, $ ZSCAL, ZSYMV * .. * .. External Functions .. DOUBLE PRECISION DZNRM2 COMPLEX*16 ZDOTC EXTERNAL DZNRM2, ZDOTC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX * .. * .. Executable Statements .. * * Test the input arguments * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.LT.0 ) THEN CALL XERBLA( 'ZLAGSY', -INFO ) RETURN END IF * * initialize lower triangle of A to diagonal matrix * DO 20 J = 1, N DO 10 I = J + 1, N A( I, J ) = ZERO 10 CONTINUE 20 CONTINUE DO 30 I = 1, N A( I, I ) = D( I ) 30 CONTINUE * * Generate lower triangle of symmetric matrix * DO 60 I = N - 1, 1, -1 * * generate random reflection * CALL ZLARNV( 3, ISEED, N-I+1, WORK ) WN = DZNRM2( N-I+1, WORK, 1 ) WA = ( WN / ABS( WORK( 1 ) ) )*WORK( 1 ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = WORK( 1 ) + WA CALL ZSCAL( N-I, ONE / WB, WORK( 2 ), 1 ) WORK( 1 ) = ONE TAU = DBLE( WB / WA ) END IF * * apply random reflection to A(i:n,i:n) from the left * and the right * * compute y := tau * A * conjg(u) * CALL ZLACGV( N-I+1, WORK, 1 ) CALL ZSYMV( 'Lower', N-I+1, TAU, A( I, I ), LDA, WORK, 1, ZERO, $ WORK( N+1 ), 1 ) CALL ZLACGV( N-I+1, WORK, 1 ) * * compute v := y - 1/2 * tau * ( u, y ) * u * ALPHA = -HALF*TAU*ZDOTC( N-I+1, WORK, 1, WORK( N+1 ), 1 ) CALL ZAXPY( N-I+1, ALPHA, WORK, 1, WORK( N+1 ), 1 ) * * apply the transformation as a rank-2 update to A(i:n,i:n) * * CALL ZSYR2( 'Lower', N-I+1, -ONE, WORK, 1, WORK( N+1 ), 1, * $ A( I, I ), LDA ) * DO 50 JJ = I, N DO 40 II = JJ, N A( II, JJ ) = A( II, JJ ) - $ WORK( II-I+1 )*WORK( N+JJ-I+1 ) - $ WORK( N+II-I+1 )*WORK( JJ-I+1 ) 40 CONTINUE 50 CONTINUE 60 CONTINUE * * Reduce number of subdiagonals to K * DO 100 I = 1, N - 1 - K * * generate reflection to annihilate A(k+i+1:n,i) * WN = DZNRM2( N-K-I+1, A( K+I, I ), 1 ) WA = ( WN / ABS( A( K+I, I ) ) )*A( K+I, I ) IF( WN.EQ.ZERO ) THEN TAU = ZERO ELSE WB = A( K+I, I ) + WA CALL ZSCAL( N-K-I, ONE / WB, A( K+I+1, I ), 1 ) A( K+I, I ) = ONE TAU = DBLE( WB / WA ) END IF * * apply reflection to A(k+i:n,i+1:k+i-1) from the left * CALL ZGEMV( 'Conjugate transpose', N-K-I+1, K-1, ONE, $ A( K+I, I+1 ), LDA, A( K+I, I ), 1, ZERO, WORK, 1 ) CALL ZGERC( N-K-I+1, K-1, -TAU, A( K+I, I ), 1, WORK, 1, $ A( K+I, I+1 ), LDA ) * * apply reflection to A(k+i:n,k+i:n) from the left and the right * * compute y := tau * A * conjg(u) * CALL ZLACGV( N-K-I+1, A( K+I, I ), 1 ) CALL ZSYMV( 'Lower', N-K-I+1, TAU, A( K+I, K+I ), LDA, $ A( K+I, I ), 1, ZERO, WORK, 1 ) CALL ZLACGV( N-K-I+1, A( K+I, I ), 1 ) * * compute v := y - 1/2 * tau * ( u, y ) * u * ALPHA = -HALF*TAU*ZDOTC( N-K-I+1, A( K+I, I ), 1, WORK, 1 ) CALL ZAXPY( N-K-I+1, ALPHA, A( K+I, I ), 1, WORK, 1 ) * * apply symmetric rank-2 update to A(k+i:n,k+i:n) * * CALL ZSYR2( 'Lower', N-K-I+1, -ONE, A( K+I, I ), 1, WORK, 1, * $ A( K+I, K+I ), LDA ) * DO 80 JJ = K + I, N DO 70 II = JJ, N A( II, JJ ) = A( II, JJ ) - A( II, I )*WORK( JJ-K-I+1 ) - $ WORK( II-K-I+1 )*A( JJ, I ) 70 CONTINUE 80 CONTINUE * A( K+I, I ) = -WA DO 90 J = K + I + 1, N A( J, I ) = ZERO 90 CONTINUE 100 CONTINUE * * Store full symmetric matrix * DO 120 J = 1, N DO 110 I = J + 1, N A( J, I ) = A( I, J ) 110 CONTINUE 120 CONTINUE RETURN * * End of ZLAGSY * END scalapack-2.0.2/TOOLS/LAPACK/zlarnd.f000644 000766 000024 00000005652 11654631032 017216 0ustar00juliestaff000000 000000 DOUBLE COMPLEX FUNCTION ZLARND( IDIST, ISEED ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER IDIST * .. * .. Array Arguments .. INTEGER ISEED( 4 ) * .. * * Purpose * ======= * * ZLARND returns a random complex number from a uniform or normal * distribution. * * Arguments * ========= * * IDIST (input) INTEGER * Specifies the distribution of the random numbers: * = 1: real and imaginary parts each uniform (0,1) * = 2: real and imaginary parts each uniform (-1,1) * = 3: real and imaginary parts each normal (0,1) * = 4: uniformly distributed on the disc abs(z) <= 1 * = 5: uniformly distributed on the circle abs(z) = 1 * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * Further Details * =============== * * This routine calls the auxiliary routine DLARAN to generate a random * real number from a uniform (0,1) distribution. The Box-Muller method * is used to transform numbers from a uniform to a normal distribution. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, TWO PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) DOUBLE PRECISION TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION T1, T2 * .. * .. External Functions .. DOUBLE PRECISION DLARAN EXTERNAL DLARAN * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, EXP, LOG, SQRT * .. * .. Executable Statements .. * * Generate a pair of real random numbers from a uniform (0,1) * distribution * T1 = DLARAN( ISEED ) T2 = DLARAN( ISEED ) * IF( IDIST.EQ.1 ) THEN * * real and imaginary parts each uniform (0,1) * ZLARND = DCMPLX( T1, T2 ) ELSE IF( IDIST.EQ.2 ) THEN * * real and imaginary parts each uniform (-1,1) * ZLARND = DCMPLX( TWO*T1-ONE, TWO*T2-ONE ) ELSE IF( IDIST.EQ.3 ) THEN * * real and imaginary parts each normal (0,1) * ZLARND = SQRT( -TWO*LOG( T1 ) )*EXP( DCMPLX( ZERO, TWOPI*T2 ) ) ELSE IF( IDIST.EQ.4 ) THEN * * uniform distribution on the unit disc abs(z) <= 1 * ZLARND = SQRT( T1 )*EXP( DCMPLX( ZERO, TWOPI*T2 ) ) ELSE IF( IDIST.EQ.5 ) THEN * * uniform distribution on the unit circle abs(z) = 1 * ZLARND = EXP( DCMPLX( ZERO, TWOPI*T2 ) ) ELSE ZLARND = DCMPLX(ZERO,ZERO) END IF RETURN * * End of ZLARND * END scalapack-2.0.2/TOOLS/LAPACK/zlarnv.f000644 000766 000024 00000007733 10363532303 017237 0ustar00juliestaff000000 000000 SUBROUTINE ZLARNV( IDIST, ISEED, N, X ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. INTEGER IDIST, N * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX*16 X( * ) * .. * * Purpose * ======= * * ZLARNV returns a vector of n random complex numbers from a uniform or * normal distribution. * * Arguments * ========= * * IDIST (input) INTEGER * Specifies the distribution of the random numbers: * = 1: real and imaginary parts each uniform (0,1) * = 2: real and imaginary parts each uniform (-1,1) * = 3: real and imaginary parts each normal (0,1) * = 4: uniformly distributed on the disc abs(z) < 1 * = 5: uniformly distributed on the circle abs(z) = 1 * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * N (input) INTEGER * The number of random numbers to be generated. * * X (output) COMPLEX*16 array, dimension (N) * The generated random numbers. * * Further Details * =============== * * This routine calls the auxiliary routine DLARUV to generate random * real numbers from a uniform (0,1) distribution, in batches of up to * 128 using vectorisable code. The Box-Muller method is used to * transform numbers from a uniform to a normal distribution. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, TWO PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) INTEGER LV PARAMETER ( LV = 128 ) DOUBLE PRECISION TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663D+0 ) * .. * .. Local Scalars .. INTEGER I, IL, IV * .. * .. Local Arrays .. DOUBLE PRECISION U( LV ) * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, EXP, LOG, MIN, SQRT * .. * .. External Subroutines .. EXTERNAL DLARUV * .. * .. Executable Statements .. * DO 60 IV = 1, N, LV / 2 IL = MIN( LV / 2, N-IV+1 ) * * Call DLARUV to generate 2*IL real numbers from a uniform (0,1) * distribution (2*IL <= LV) * CALL DLARUV( ISEED, 2*IL, U ) * IF( IDIST.EQ.1 ) THEN * * Copy generated numbers * DO 10 I = 1, IL X( IV+I-1 ) = DCMPLX( U( 2*I-1 ), U( 2*I ) ) 10 CONTINUE ELSE IF( IDIST.EQ.2 ) THEN * * Convert generated numbers to uniform (-1,1) distribution * DO 20 I = 1, IL X( IV+I-1 ) = DCMPLX( TWO*U( 2*I-1 )-ONE, $ TWO*U( 2*I )-ONE ) 20 CONTINUE ELSE IF( IDIST.EQ.3 ) THEN * * Convert generated numbers to normal (0,1) distribution * DO 30 I = 1, IL X( IV+I-1 ) = SQRT( -TWO*LOG( U( 2*I-1 ) ) )* $ EXP( DCMPLX( ZERO, TWOPI*U( 2*I ) ) ) 30 CONTINUE ELSE IF( IDIST.EQ.4 ) THEN * * Convert generated numbers to complex numbers uniformly * distributed on the unit disk * DO 40 I = 1, IL X( IV+I-1 ) = SQRT( U( 2*I-1 ) )* $ EXP( DCMPLX( ZERO, TWOPI*U( 2*I ) ) ) 40 CONTINUE ELSE IF( IDIST.EQ.5 ) THEN * * Convert generated numbers to complex numbers uniformly * distributed on the unit circle * DO 50 I = 1, IL X( IV+I-1 ) = EXP( DCMPLX( ZERO, TWOPI*U( 2*I ) ) ) 50 CONTINUE END IF 60 CONTINUE RETURN * * End of ZLARNV * END scalapack-2.0.2/TOOLS/LAPACK/zlarot.f000644 000766 000024 00000023642 10604316307 017235 0ustar00juliestaff000000 000000 SUBROUTINE ZLAROT( LROWS, LLEFT, LRIGHT, NL, C, S, A, LDA, XLEFT, $ XRIGHT ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. LOGICAL LLEFT, LRIGHT, LROWS INTEGER LDA, NL COMPLEX*16 C, S, XLEFT, XRIGHT * .. * .. Array Arguments .. COMPLEX*16 A( * ) * .. * * Purpose * ======= * * ZLAROT applies a (Givens) rotation to two adjacent rows or * columns, where one element of the first and/or last column/row * November 2006 * for use on matrices stored in some format other than GE, so * that elements of the matrix may be used or modified for which * no array element is provided. * * One example is a symmetric matrix in SB format (bandwidth=4), for * which UPLO='L': Two adjacent rows will have the format: * * row j: * * * * * . . . . * row j+1: * * * * * . . . . * * '*' indicates elements for which storage is provided, * '.' indicates elements for which no storage is provided, but * are not necessarily zero; their values are determined by * symmetry. ' ' indicates elements which are necessarily zero, * and have no storage provided. * * Those columns which have two '*'s can be handled by DROT. * Those columns which have no '*'s can be ignored, since as long * as the Givens rotations are carefully applied to preserve * symmetry, their values are determined. * Those columns which have one '*' have to be handled separately, * by using separate variables "p" and "q": * * row j: * * * * * p . . . * row j+1: q * * * * * . . . . * * The element p would have to be set correctly, then that column * is rotated, setting p to its new value. The next call to * ZLAROT would rotate columns j and j+1, using p, and restore * symmetry. The element q would start out being zero, and be * made non-zero by the rotation. Later, rotations would presumably * be chosen to zero q out. * * Typical Calling Sequences: rotating the i-th and (i+1)-st rows. * ------- ------- --------- * * General dense matrix: * * CALL ZLAROT(.TRUE.,.FALSE.,.FALSE., N, C,S, * A(i,1),LDA, DUMMY, DUMMY) * * General banded matrix in GB format: * * j = MAX(1, i-KL ) * NL = MIN( N, i+KU+1 ) + 1-j * CALL ZLAROT( .TRUE., i-KL.GE.1, i+KU.LT.N, NL, C,S, * A(KU+i+1-j,j),LDA-1, XLEFT, XRIGHT ) * * [ note that i+1-j is just MIN(i,KL+1) ] * * Symmetric banded matrix in SY format, bandwidth K, * lower triangle only: * * j = MAX(1, i-K ) * NL = MIN( K+1, i ) + 1 * CALL ZLAROT( .TRUE., i-K.GE.1, .TRUE., NL, C,S, * A(i,j), LDA, XLEFT, XRIGHT ) * * Same, but upper triangle only: * * NL = MIN( K+1, N-i ) + 1 * CALL ZLAROT( .TRUE., .TRUE., i+K.LT.N, NL, C,S, * A(i,i), LDA, XLEFT, XRIGHT ) * * Symmetric banded matrix in SB format, bandwidth K, * lower triangle only: * * [ same as for SY, except:] * . . . . * A(i+1-j,j), LDA-1, XLEFT, XRIGHT ) * * [ note that i+1-j is just MIN(i,K+1) ] * * Same, but upper triangle only: * . . . * A(K+1,i), LDA-1, XLEFT, XRIGHT ) * * Rotating columns is just the transpose of rotating rows, except * for GB and SB: (rotating columns i and i+1) * * GB: * j = MAX(1, i-KU ) * NL = MIN( N, i+KL+1 ) + 1-j * CALL ZLAROT( .TRUE., i-KU.GE.1, i+KL.LT.N, NL, C,S, * A(KU+j+1-i,i),LDA-1, XTOP, XBOTTM ) * * [note that KU+j+1-i is just MAX(1,KU+2-i)] * * SB: (upper triangle) * * . . . . . . * A(K+j+1-i,i),LDA-1, XTOP, XBOTTM ) * * SB: (lower triangle) * * . . . . . . * A(1,i),LDA-1, XTOP, XBOTTM ) * * Arguments * ========= * * LROWS - LOGICAL * If .TRUE., then ZLAROT will rotate two rows. If .FALSE., * then it will rotate two columns. * Not modified. * * LLEFT - LOGICAL * If .TRUE., then XLEFT will be used instead of the * corresponding element of A for the first element in the * second row (if LROWS=.FALSE.) or column (if LROWS=.TRUE.) * If .FALSE., then the corresponding element of A will be * used. * Not modified. * * LRIGHT - LOGICAL * If .TRUE., then XRIGHT will be used instead of the * corresponding element of A for the last element in the * first row (if LROWS=.FALSE.) or column (if LROWS=.TRUE.) If * .FALSE., then the corresponding element of A will be used. * Not modified. * * NL - INTEGER * The length of the rows (if LROWS=.TRUE.) or columns (if * LROWS=.FALSE.) to be rotated. If XLEFT and/or XRIGHT are * used, the columns/rows they are in should be included in * NL, e.g., if LLEFT = LRIGHT = .TRUE., then NL must be at * least 2. The number of rows/columns to be rotated * exclusive of those involving XLEFT and/or XRIGHT may * not be negative, i.e., NL minus how many of LLEFT and * LRIGHT are .TRUE. must be at least zero; if not, XERBLA * will be called. * Not modified. * * C, S - COMPLEX*16 * Specify the Givens rotation to be applied. If LROWS is * true, then the matrix ( c s ) * ( _ _ ) * (-s c ) is applied from the left; * if false, then the transpose (not conjugated) thereof is * applied from the right. Note that in contrast to the * output of ZROTG or to most versions of ZROT, both C and S * are complex. For a Givens rotation, |C|**2 + |S|**2 should * be 1, but this is not checked. * Not modified. * * A - COMPLEX*16 array. * The array containing the rows/columns to be rotated. The * first element of A should be the upper left element to * be rotated. * Read and modified. * * LDA - INTEGER * The "effective" leading dimension of A. If A contains * a matrix stored in GE, HE, or SY format, then this is just * the leading dimension of A as dimensioned in the calling * routine. If A contains a matrix stored in band (GB, HB, or * SB) format, then this should be *one less* than the leading * dimension used in the calling routine. Thus, if A were * dimensioned A(LDA,*) in ZLAROT, then A(1,j) would be the * j-th element in the first of the two rows to be rotated, * and A(2,j) would be the j-th in the second, regardless of * how the array may be stored in the calling routine. [A * cannot, however, actually be dimensioned thus, since for * band format, the row number may exceed LDA, which is not * legal FORTRAN.] * If LROWS=.TRUE., then LDA must be at least 1, otherwise * it must be at least NL minus the number of .TRUE. values * in XLEFT and XRIGHT. * Not modified. * * XLEFT - COMPLEX*16 * If LLEFT is .TRUE., then XLEFT will be used and modified * instead of A(2,1) (if LROWS=.TRUE.) or A(1,2) * (if LROWS=.FALSE.). * Read and modified. * * XRIGHT - COMPLEX*16 * If LRIGHT is .TRUE., then XRIGHT will be used and modified * instead of A(1,NL) (if LROWS=.TRUE.) or A(NL,1) * (if LROWS=.FALSE.). * Read and modified. * * ===================================================================== * * .. Local Scalars .. INTEGER IINC, INEXT, IX, IY, IYT, J, NT COMPLEX*16 TEMPX * .. * .. Local Arrays .. COMPLEX*16 XT( 2 ), YT( 2 ) * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC DCONJG * .. * .. Executable Statements .. * * Set up indices, arrays for ends * IF( LROWS ) THEN IINC = LDA INEXT = 1 ELSE IINC = 1 INEXT = LDA END IF * IF( LLEFT ) THEN NT = 1 IX = 1 + IINC IY = 2 + LDA XT( 1 ) = A( 1 ) YT( 1 ) = XLEFT ELSE NT = 0 IX = 1 IY = 1 + INEXT END IF * IF( LRIGHT ) THEN IYT = 1 + INEXT + ( NL-1 )*IINC NT = NT + 1 XT( NT ) = XRIGHT YT( NT ) = A( IYT ) END IF * * Check for errors * IF( NL.LT.NT ) THEN CALL XERBLA( 'ZLAROT', 4 ) RETURN END IF IF( LDA.LE.0 .OR. ( .NOT.LROWS .AND. LDA.LT.NL-NT ) ) THEN CALL XERBLA( 'ZLAROT', 8 ) RETURN END IF * * Rotate * * ZROT( NL-NT, A(IX),IINC, A(IY),IINC, C, S ) with complex C, S * DO 10 J = 0, NL - NT - 1 TEMPX = C*A( IX+J*IINC ) + S*A( IY+J*IINC ) A( IY+J*IINC ) = -DCONJG( S )*A( IX+J*IINC ) + $ DCONJG( C )*A( IY+J*IINC ) A( IX+J*IINC ) = TEMPX 10 CONTINUE * * ZROT( NT, XT,1, YT,1, C, S ) with complex C, S * DO 20 J = 1, NT TEMPX = C*XT( J ) + S*YT( J ) YT( J ) = -DCONJG( S )*XT( J ) + DCONJG( C )*YT( J ) XT( J ) = TEMPX 20 CONTINUE * * Stuff values back into XLEFT, XRIGHT, etc. * IF( LLEFT ) THEN A( 1 ) = XT( 1 ) XLEFT = YT( 1 ) END IF * IF( LRIGHT ) THEN XRIGHT = XT( NT ) A( IYT ) = YT( NT ) END IF * RETURN * * End of ZLAROT * END scalapack-2.0.2/TOOLS/LAPACK/zlatm1.f000644 000766 000024 00000015767 10604316307 017143 0ustar00juliestaff000000 000000 SUBROUTINE ZLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, N, INFO ) * * -- LAPACK auxiliary test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER IDIST, INFO, IRSIGN, MODE, N DOUBLE PRECISION COND * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX*16 D( * ) * .. * * Purpose * ======= * * ZLATM1 computes the entries of D(1..N) as specified by * MODE, COND and IRSIGN. IDIST and ISEED determine the generation * of random numbers. ZLATM1 is called by CLATMR to generate * random test matrices for LAPACK programs. * * Arguments * ========= * * MODE - INTEGER * On entry describes how D is to be computed: * MODE = 0 means do not change D. * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * Not modified. * * COND - DOUBLE PRECISION * On entry, used as described under MODE above. * If used, it must be >= 1. Not modified. * * IRSIGN - INTEGER * On entry, if MODE neither -6, 0 nor 6, determines sign of * entries of D * 0 => leave entries of D unchanged * 1 => multiply each entry of D by random complex number * uniformly distributed with absolute value 1 * * IDIST - CHARACTER*1 * On entry, IDIST specifies the type of distribution to be * used to generate a random matrix . * 1 => real and imaginary parts each UNIFORM( 0, 1 ) * 2 => real and imaginary parts each UNIFORM( -1, 1 ) * 3 => real and imaginary parts each NORMAL( 0, 1 ) * 4 => complex number uniform in DISK( 0, 1 ) * Not modified. * * ISEED - INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. The random number generator uses a * linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to ZLATM1 * to continue the same random number sequence. * Changed on exit. * * D - COMPLEX*16 array, dimension ( MIN( M , N ) ) * Array to be computed according to MODE, COND and IRSIGN. * May be changed on exit if MODE is nonzero. * * N - INTEGER * Number of entries of D. Not modified. * * INFO - INTEGER * 0 => normal termination * -1 => if MODE not in range -6 to 6 * -2 => if MODE neither -6, 0 nor 6, and * IRSIGN neither 0 nor 1 * -3 => if MODE neither -6, 0 nor 6 and COND less than 1 * -4 => if MODE equals 6 or -6 and IDIST not in range 1 to 4 * -7 => if N negative * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D0 ) * .. * .. Local Scalars .. INTEGER I DOUBLE PRECISION ALPHA, TEMP COMPLEX*16 CTEMP * .. * .. External Functions .. DOUBLE PRECISION DLARAN COMPLEX*16 ZLARND EXTERNAL DLARAN, ZLARND * .. * .. External Subroutines .. EXTERNAL XERBLA, ZLARNV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, EXP, LOG * .. * .. Executable Statements .. * * Decode and Test the input parameters. Initialize flags & seed. * INFO = 0 * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Set INFO if an error * IF( MODE.LT.-6 .OR. MODE.GT.6 ) THEN INFO = -1 ELSE IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ ( IRSIGN.NE.0 .AND. IRSIGN.NE.1 ) ) THEN INFO = -2 ELSE IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ COND.LT.ONE ) THEN INFO = -3 ELSE IF( ( MODE.EQ.6 .OR. MODE.EQ.-6 ) .AND. $ ( IDIST.LT.1 .OR. IDIST.GT.4 ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -7 END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLATM1', -INFO ) RETURN END IF * * Compute D according to COND and MODE * IF( MODE.NE.0 ) THEN GO TO ( 10, 30, 50, 70, 90, 110 )ABS( MODE ) * * One large D value: * 10 CONTINUE DO 20 I = 1, N D( I ) = ONE / COND 20 CONTINUE D( 1 ) = ONE GO TO 120 * * One small D value: * 30 CONTINUE DO 40 I = 1, N D( I ) = ONE 40 CONTINUE D( N ) = ONE / COND GO TO 120 * * Exponentially distributed D values: * 50 CONTINUE D( 1 ) = ONE IF( N.GT.1 ) THEN ALPHA = COND**( -ONE / DBLE( N-1 ) ) DO 60 I = 2, N D( I ) = ALPHA**( I-1 ) 60 CONTINUE END IF GO TO 120 * * Arithmetically distributed D values: * 70 CONTINUE D( 1 ) = ONE IF( N.GT.1 ) THEN TEMP = ONE / COND ALPHA = ( ONE-TEMP ) / DBLE( N-1 ) DO 80 I = 2, N D( I ) = DBLE( N-I )*ALPHA + TEMP 80 CONTINUE END IF GO TO 120 * * Randomly distributed D values on ( 1/COND , 1): * 90 CONTINUE ALPHA = LOG( ONE / COND ) DO 100 I = 1, N D( I ) = EXP( ALPHA*DLARAN( ISEED ) ) 100 CONTINUE GO TO 120 * * Randomly distributed D values from IDIST * 110 CONTINUE CALL ZLARNV( IDIST, ISEED, N, D ) * 120 CONTINUE * * If MODE neither -6 nor 0 nor 6, and IRSIGN = 1, assign * random signs to D * IF( ( MODE.NE.-6 .AND. MODE.NE.0 .AND. MODE.NE.6 ) .AND. $ IRSIGN.EQ.1 ) THEN DO 130 I = 1, N CTEMP = ZLARND( 3, ISEED ) D( I ) = D( I )*( CTEMP / ABS( CTEMP ) ) 130 CONTINUE END IF * * Reverse if MODE < 0 * IF( MODE.LT.0 ) THEN DO 140 I = 1, N / 2 CTEMP = D( I ) D( I ) = D( N+1-I ) D( N+1-I ) = CTEMP 140 CONTINUE END IF * END IF * RETURN * * End of ZLATM1 * END scalapack-2.0.2/TOOLS/LAPACK/zlatms.f000644 000766 000024 00000124634 11654631032 017240 0ustar00juliestaff000000 000000 SUBROUTINE ZLATMS( M, N, DIST, ISEED, SYM, D, MODE, COND, DMAX, $ KL, KU, PACK, A, LDA, WORK, INFO ) * * -- LAPACK test routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER DIST, PACK, SYM INTEGER INFO, KL, KU, LDA, M, MODE, N DOUBLE PRECISION COND, DMAX * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION D( * ) COMPLEX*16 A( LDA, * ), WORK( * ) * .. * * Purpose * ======= * * ZLATMS generates random matrices with specified singular values * (or hermitian with specified eigenvalues) * for testing LAPACK programs. * * ZLATMS operates by applying the following sequence of * operations: * * Set the diagonal to D, where D may be input or * computed according to MODE, COND, DMAX, and SYM * as described below. * * Generate a matrix with the appropriate band structure, by one * of two methods: * * Method A: * Generate a dense M x N matrix by multiplying D on the left * and the right by random unitary matrices, then: * * Reduce the bandwidth according to KL and KU, using * Householder transformations. * * Method B: * Convert the bandwidth-0 (i.e., diagonal) matrix to a * bandwidth-1 matrix using Givens rotations, "chasing" * out-of-band elements back, much as in QR; then convert * the bandwidth-1 to a bandwidth-2 matrix, etc. Note * that for reasonably small bandwidths (relative to M and * N) this requires less storage, as a dense matrix is not * generated. Also, for hermitian or symmetric matrices, * only one triangle is generated. * * Method A is chosen if the bandwidth is a large fraction of the * order of the matrix, and LDA is at least M (so a dense * matrix can be stored.) Method B is chosen if the bandwidth * is small (< 1/2 N for hermitian or symmetric, < .3 N+M for * non-symmetric), or LDA is less than M and not less than the * bandwidth. * * Pack the matrix if desired. Options specified by PACK are: * no packing * zero out upper half (if hermitian) * zero out lower half (if hermitian) * store the upper half columnwise (if hermitian or upper * triangular) * store the lower half columnwise (if hermitian or lower * triangular) * store the lower triangle in banded format (if hermitian or * lower triangular) * store the upper triangle in banded format (if hermitian or * upper triangular) * store the entire matrix in banded format * If Method B is chosen, and band format is specified, then the * matrix will be generated in the band format, so no repacking * will be necessary. * * Arguments * ========= * * M - INTEGER * The number of rows of A. Not modified. * * N - INTEGER * The number of columns of A. N must equal M if the matrix * is symmetric or hermitian (i.e., if SYM is not 'N') * Not modified. * * DIST - CHARACTER*1 * On entry, DIST specifies the type of distribution to be used * to generate the random eigen-/singular values. * 'U' => UNIFORM( 0, 1 ) ( 'U' for uniform ) * 'S' => UNIFORM( -1, 1 ) ( 'S' for symmetric ) * 'N' => NORMAL( 0, 1 ) ( 'N' for normal ) * Not modified. * * ISEED - INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. They should lie between 0 and 4095 inclusive, * and ISEED(4) should be odd. The random number generator * uses a linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to ZLATMS * to continue the same random number sequence. * Changed on exit. * * SYM - CHARACTER*1 * If SYM='H', the generated matrix is hermitian, with * eigenvalues specified by D, COND, MODE, and DMAX; they * may be positive, negative, or zero. * If SYM='P', the generated matrix is hermitian, with * eigenvalues (= singular values) specified by D, COND, * MODE, and DMAX; they will not be negative. * If SYM='N', the generated matrix is nonsymmetric, with * singular values specified by D, COND, MODE, and DMAX; * they will not be negative. * If SYM='S', the generated matrix is (complex) symmetric, * with singular values specified by D, COND, MODE, and * DMAX; they will not be negative. * Not modified. * * D - DOUBLE PRECISION array, dimension ( MIN( M, N ) ) * This array is used to specify the singular values or * eigenvalues of A (see SYM, above.) If MODE=0, then D is * assumed to contain the singular/eigenvalues, otherwise * they will be computed according to MODE, COND, and DMAX, * and placed in D. * Modified if MODE is nonzero. * * MODE - INTEGER * On entry this describes how the singular/eigenvalues are to * be specified: * MODE = 0 means use D as input * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * If SYM='H', and MODE is neither 0, 6, nor -6, then * the elements of D will also be multiplied by a random * sign (i.e., +1 or -1.) * Not modified. * * COND - DOUBLE PRECISION * On entry, this is used as described under MODE above. * If used, it must be >= 1. Not modified. * * DMAX - DOUBLE PRECISION * If MODE is neither -6, 0 nor 6, the contents of D, as * computed according to MODE and COND, will be scaled by * DMAX / max(abs(D(i))); thus, the maximum absolute eigen- or * singular value (which is to say the norm) will be abs(DMAX). * Note that DMAX need not be positive: if DMAX is negative * (or zero), D will be scaled by a negative number (or zero). * Not modified. * * KL - INTEGER * This specifies the lower bandwidth of the matrix. For * example, KL=0 implies upper triangular, KL=1 implies upper * Hessenberg, and KL being at least M-1 means that the matrix * has full lower bandwidth. KL must equal KU if the matrix * is symmetric or hermitian. * Not modified. * * KU - INTEGER * This specifies the upper bandwidth of the matrix. For * example, KU=0 implies lower triangular, KU=1 implies lower * Hessenberg, and KU being at least N-1 means that the matrix * has full upper bandwidth. KL must equal KU if the matrix * is symmetric or hermitian. * Not modified. * * PACK - CHARACTER*1 * This specifies packing of matrix as follows: * 'N' => no packing * 'U' => zero out all subdiagonal entries (if symmetric * or hermitian) * 'L' => zero out all superdiagonal entries (if symmetric * or hermitian) * 'C' => store the upper triangle columnwise (only if the * matrix is symmetric, hermitian, or upper triangular) * 'R' => store the lower triangle columnwise (only if the * matrix is symmetric, hermitian, or lower triangular) * 'B' => store the lower triangle in band storage scheme * (only if the matrix is symmetric, hermitian, or * lower triangular) * 'Q' => store the upper triangle in band storage scheme * (only if the matrix is symmetric, hermitian, or * upper triangular) * 'Z' => store the entire matrix in band storage scheme * (pivoting can be provided for by using this * option to store A in the trailing rows of * the allocated storage) * * Using these options, the various LAPACK packed and banded * storage schemes can be obtained: * GB - use 'Z' * PB, SB, HB, or TB - use 'B' or 'Q' * PP, SP, HB, or TP - use 'C' or 'R' * * If two calls to ZLATMS differ only in the PACK parameter, * they will generate mathematically equivalent matrices. * Not modified. * * A - COMPLEX*16 array, dimension ( LDA, N ) * On exit A is the desired test matrix. A is first generated * in full (unpacked) form, and then packed, if so specified * by PACK. Thus, the first M elements of the first N * columns will always be modified. If PACK specifies a * packed or banded storage scheme, all LDA elements of the * first N columns will be modified; the elements of the * array which do not correspond to elements of the generated * matrix are set to zero. * Modified. * * LDA - INTEGER * LDA specifies the first dimension of A as declared in the * calling program. If PACK='N', 'U', 'L', 'C', or 'R', then * LDA must be at least M. If PACK='B' or 'Q', then LDA must * be at least MIN( KL, M-1) (which is equal to MIN(KU,N-1)). * If PACK='Z', LDA must be large enough to hold the packed * array: MIN( KU, N-1) + MIN( KL, M-1) + 1. * Not modified. * * WORK - COMPLEX*16 array, dimension ( 3*MAX( N, M ) ) * Workspace. * Modified. * * INFO - INTEGER * Error code. On exit, INFO will be set to one of the * following values: * 0 => normal return * -1 => M negative or unequal to N and SYM='S', 'H', or 'P' * -2 => N negative * -3 => DIST illegal string * -5 => SYM illegal string * -7 => MODE not in range -6 to 6 * -8 => COND less than 1.0, and MODE neither -6, 0 nor 6 * -10 => KL negative * -11 => KU negative, or SYM is not 'N' and KU is not equal to * KL * -12 => PACK illegal string, or PACK='U' or 'L', and SYM='N'; * or PACK='C' or 'Q' and SYM='N' and KL is not zero; * or PACK='R' or 'B' and SYM='N' and KU is not zero; * or PACK='U', 'L', 'C', 'R', 'B', or 'Q', and M is not * N. * -14 => LDA is less than M, or PACK='Z' and LDA is less than * MIN(KU,N-1) + MIN(KL,M-1) + 1. * 1 => Error return from DLATM1 * 2 => Cannot scale to DMAX (max. sing. value is 0) * 3 => Error return from ZLAGGE, CLAGHE or CLAGSY * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) COMPLEX*16 CZERO PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) DOUBLE PRECISION TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663D+0 ) * .. * .. Local Scalars .. LOGICAL GIVENS, ILEXTR, ILTEMP, TOPDWN, ZSYM INTEGER I, IC, ICOL, IDIST, IENDCH, IINFO, IL, ILDA, $ IOFFG, IOFFST, IPACK, IPACKG, IR, IR1, IR2, $ IROW, IRSIGN, ISKEW, ISYM, ISYMPK, J, JC, JCH, $ JKL, JKU, JR, K, LLB, MINLDA, MNMIN, MR, NC, $ UUB DOUBLE PRECISION ALPHA, ANGLE, REALC, TEMP COMPLEX*16 C, CT, CTEMP, DUMMY, EXTRA, S, ST * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLARND COMPLEX*16 ZLARND EXTERNAL LSAME, DLARND, ZLARND * .. * .. External Subroutines .. EXTERNAL DLATM1, DSCAL, XERBLA, ZLAGGE, ZLAGHE, ZLAGSY, $ ZLAROT, ZLARTG, ZLASET * .. * .. Intrinsic Functions .. INTRINSIC ABS, COS, DBLE, DCMPLX, DCONJG, MAX, MIN, MOD, $ SIN * .. * .. Executable Statements .. * * 1) Decode and Test the input parameters. * Initialize flags & seed. * INFO = 0 * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Decode DIST * IF( LSAME( DIST, 'U' ) ) THEN IDIST = 1 ELSE IF( LSAME( DIST, 'S' ) ) THEN IDIST = 2 ELSE IF( LSAME( DIST, 'N' ) ) THEN IDIST = 3 ELSE IDIST = -1 END IF * * Decode SYM * IF( LSAME( SYM, 'N' ) ) THEN ISYM = 1 IRSIGN = 0 ZSYM = .FALSE. ELSE IF( LSAME( SYM, 'P' ) ) THEN ISYM = 2 IRSIGN = 0 ZSYM = .FALSE. ELSE IF( LSAME( SYM, 'S' ) ) THEN ISYM = 2 IRSIGN = 0 ZSYM = .TRUE. ELSE IF( LSAME( SYM, 'H' ) ) THEN ISYM = 2 IRSIGN = 1 ZSYM = .FALSE. ELSE ISYM = -1 END IF * * Decode PACK * ISYMPK = 0 IF( LSAME( PACK, 'N' ) ) THEN IPACK = 0 ELSE IF( LSAME( PACK, 'U' ) ) THEN IPACK = 1 ISYMPK = 1 ELSE IF( LSAME( PACK, 'L' ) ) THEN IPACK = 2 ISYMPK = 1 ELSE IF( LSAME( PACK, 'C' ) ) THEN IPACK = 3 ISYMPK = 2 ELSE IF( LSAME( PACK, 'R' ) ) THEN IPACK = 4 ISYMPK = 3 ELSE IF( LSAME( PACK, 'B' ) ) THEN IPACK = 5 ISYMPK = 3 ELSE IF( LSAME( PACK, 'Q' ) ) THEN IPACK = 6 ISYMPK = 2 ELSE IF( LSAME( PACK, 'Z' ) ) THEN IPACK = 7 ELSE IPACK = -1 END IF * * Set certain internal parameters * MNMIN = MIN( M, N ) LLB = MIN( KL, M-1 ) UUB = MIN( KU, N-1 ) MR = MIN( M, N+LLB ) NC = MIN( N, M+UUB ) IROW = 1 ICOL = 1 ZSYM = .FALSE. * IF( IPACK.EQ.5 .OR. IPACK.EQ.6 ) THEN MINLDA = UUB + 1 ELSE IF( IPACK.EQ.7 ) THEN MINLDA = LLB + UUB + 1 ELSE MINLDA = M END IF * * Use Givens rotation method if bandwidth small enough, * or if LDA is too small to store the matrix unpacked. * GIVENS = .FALSE. IF( ISYM.EQ.1 ) THEN IF( DBLE( LLB+UUB ).LT.0.3D0*DBLE( MAX( 1, MR+NC ) ) ) $ GIVENS = .TRUE. ELSE IF( 2*LLB.LT.M ) $ GIVENS = .TRUE. END IF IF( LDA.LT.M .AND. LDA.GE.MINLDA ) $ GIVENS = .TRUE. * * Set INFO if an error * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( M.NE.N .AND. ISYM.NE.1 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( IDIST.EQ.-1 ) THEN INFO = -3 ELSE IF( ISYM.EQ.-1 ) THEN INFO = -5 ELSE IF( ABS( MODE ).GT.6 ) THEN INFO = -7 ELSE IF( ( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) .AND. COND.LT.ONE ) $ THEN INFO = -8 ELSE IF( KL.LT.0 ) THEN INFO = -10 ELSE IF( KU.LT.0 .OR. ( ISYM.NE.1 .AND. KL.NE.KU ) ) THEN INFO = -11 ELSE IF( IPACK.EQ.-1 .OR. ( ISYMPK.EQ.1 .AND. ISYM.EQ.1 ) .OR. $ ( ISYMPK.EQ.2 .AND. ISYM.EQ.1 .AND. KL.GT.0 ) .OR. $ ( ISYMPK.EQ.3 .AND. ISYM.EQ.1 .AND. KU.GT.0 ) .OR. $ ( ISYMPK.NE.0 .AND. M.NE.N ) ) THEN INFO = -12 ELSE IF( LDA.LT.MAX( 1, MINLDA ) ) THEN INFO = -14 END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLATMS', -INFO ) RETURN END IF * * Initialize random number generator * DO 10 I = 1, 4 ISEED( I ) = MOD( ABS( ISEED( I ) ), 4096 ) 10 CONTINUE * IF( MOD( ISEED( 4 ), 2 ).NE.1 ) $ ISEED( 4 ) = ISEED( 4 ) + 1 * * 2) Set up D if indicated. * * Compute D according to COND and MODE * CALL DLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, MNMIN, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 1 RETURN END IF * * Choose Top-Down if D is (apparently) increasing, * Bottom-Up if D is (apparently) decreasing. * IF( ABS( D( 1 ) ).LE.ABS( D( MNMIN ) ) ) THEN TOPDWN = .TRUE. ELSE TOPDWN = .FALSE. END IF * IF( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) THEN * * Scale by DMAX * TEMP = ABS( D( 1 ) ) DO 20 I = 2, MNMIN TEMP = MAX( TEMP, ABS( D( I ) ) ) 20 CONTINUE * IF( TEMP.GT.ZERO ) THEN ALPHA = DMAX / TEMP ELSE INFO = 2 RETURN END IF * CALL DSCAL( MNMIN, ALPHA, D, 1 ) * END IF * CALL ZLASET( 'Full', LDA, N, CZERO, CZERO, A, LDA ) * * 3) Generate Banded Matrix using Givens rotations. * Also the special case of UUB=LLB=0 * * Compute Addressing constants to cover all * storage formats. Whether GE, HE, SY, GB, HB, or SB, * upper or lower triangle or both, * the (i,j)-th element is in * A( i - ISKEW*j + IOFFST, j ) * IF( IPACK.GT.4 ) THEN ILDA = LDA - 1 ISKEW = 1 IF( IPACK.GT.5 ) THEN IOFFST = UUB + 1 ELSE IOFFST = 1 END IF ELSE ILDA = LDA ISKEW = 0 IOFFST = 0 END IF * * IPACKG is the format that the matrix is generated in. If this is * different from IPACK, then the matrix must be repacked at the * end. It also signals how to compute the norm, for scaling. * IPACKG = 0 * * Diagonal Matrix -- We are done, unless it * is to be stored HP/SP/PP/TP (PACK='R' or 'C') * IF( LLB.EQ.0 .AND. UUB.EQ.0 ) THEN DO 30 J = 1, MNMIN A( ( 1-ISKEW )*J+IOFFST, J ) = DCMPLX( D( J ) ) 30 CONTINUE * IF( IPACK.LE.2 .OR. IPACK.GE.5 ) $ IPACKG = IPACK * ELSE IF( GIVENS ) THEN * * Check whether to use Givens rotations, * Householder transformations, or nothing. * IF( ISYM.EQ.1 ) THEN * * Non-symmetric -- A = U D V * IF( IPACK.GT.4 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF * DO 40 J = 1, MNMIN A( ( 1-ISKEW )*J+IOFFST, J ) = DCMPLX( D( J ) ) 40 CONTINUE * IF( TOPDWN ) THEN JKL = 0 DO 70 JKU = 1, UUB * * Transform from bandwidth JKL, JKU-1 to JKL, JKU * * Last row actually rotated is M * Last column actually rotated is MIN( M+JKU, N ) * DO 60 JR = 1, MIN( M+JKU, N ) + JKL - 1 EXTRA = CZERO ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE )*ZLARND( 5, ISEED ) S = SIN( ANGLE )*ZLARND( 5, ISEED ) ICOL = MAX( 1, JR-JKL ) IF( JR.LT.M ) THEN IL = MIN( N, JR+JKU ) + 1 - ICOL CALL ZLAROT( .TRUE., JR.GT.JKL, .FALSE., IL, C, $ S, A( JR-ISKEW*ICOL+IOFFST, ICOL ), $ ILDA, EXTRA, DUMMY ) END IF * * Chase "EXTRA" back up * IR = JR IC = ICOL DO 50 JCH = JR - JKL, 1, -JKL - JKU IF( IR.LT.M ) THEN CALL ZLARTG( A( IR+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), EXTRA, REALC, S, DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = DCONJG( REALC*DUMMY ) S = DCONJG( -S*DUMMY ) END IF IROW = MAX( 1, JCH-JKU ) IL = IR + 2 - IROW CTEMP = CZERO ILTEMP = JCH.GT.JKU CALL ZLAROT( .FALSE., ILTEMP, .TRUE., IL, C, S, $ A( IROW-ISKEW*IC+IOFFST, IC ), $ ILDA, CTEMP, EXTRA ) IF( ILTEMP ) THEN CALL ZLARTG( A( IROW+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), CTEMP, REALC, S, DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = DCONJG( REALC*DUMMY ) S = DCONJG( -S*DUMMY ) * ICOL = MAX( 1, JCH-JKU-JKL ) IL = IC + 2 - ICOL EXTRA = CZERO CALL ZLAROT( .TRUE., JCH.GT.JKU+JKL, .TRUE., $ IL, C, S, A( IROW-ISKEW*ICOL+ $ IOFFST, ICOL ), ILDA, EXTRA, $ CTEMP ) IC = ICOL IR = IROW END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE * JKU = UUB DO 100 JKL = 1, LLB * * Transform from bandwidth JKL-1, JKU to JKL, JKU * DO 90 JC = 1, MIN( N+JKL, M ) + JKU - 1 EXTRA = CZERO ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE )*ZLARND( 5, ISEED ) S = SIN( ANGLE )*ZLARND( 5, ISEED ) IROW = MAX( 1, JC-JKU ) IF( JC.LT.N ) THEN IL = MIN( M, JC+JKL ) + 1 - IROW CALL ZLAROT( .FALSE., JC.GT.JKU, .FALSE., IL, C, $ S, A( IROW-ISKEW*JC+IOFFST, JC ), $ ILDA, EXTRA, DUMMY ) END IF * * Chase "EXTRA" back up * IC = JC IR = IROW DO 80 JCH = JC - JKU, 1, -JKL - JKU IF( IC.LT.N ) THEN CALL ZLARTG( A( IR+1-ISKEW*( IC+1 )+IOFFST, $ IC+1 ), EXTRA, REALC, S, DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = DCONJG( REALC*DUMMY ) S = DCONJG( -S*DUMMY ) END IF ICOL = MAX( 1, JCH-JKL ) IL = IC + 2 - ICOL CTEMP = CZERO ILTEMP = JCH.GT.JKL CALL ZLAROT( .TRUE., ILTEMP, .TRUE., IL, C, S, $ A( IR-ISKEW*ICOL+IOFFST, ICOL ), $ ILDA, CTEMP, EXTRA ) IF( ILTEMP ) THEN CALL ZLARTG( A( IR+1-ISKEW*( ICOL+1 )+IOFFST, $ ICOL+1 ), CTEMP, REALC, S, $ DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = DCONJG( REALC*DUMMY ) S = DCONJG( -S*DUMMY ) IROW = MAX( 1, JCH-JKL-JKU ) IL = IR + 2 - IROW EXTRA = CZERO CALL ZLAROT( .FALSE., JCH.GT.JKL+JKU, .TRUE., $ IL, C, S, A( IROW-ISKEW*ICOL+ $ IOFFST, ICOL ), ILDA, EXTRA, $ CTEMP ) IC = ICOL IR = IROW END IF 80 CONTINUE 90 CONTINUE 100 CONTINUE * ELSE * * Bottom-Up -- Start at the bottom right. * JKL = 0 DO 130 JKU = 1, UUB * * Transform from bandwidth JKL, JKU-1 to JKL, JKU * * First row actually rotated is M * First column actually rotated is MIN( M+JKU, N ) * IENDCH = MIN( M, N+JKL ) - 1 DO 120 JC = MIN( M+JKU, N ) - 1, 1 - JKL, -1 EXTRA = CZERO ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE )*ZLARND( 5, ISEED ) S = SIN( ANGLE )*ZLARND( 5, ISEED ) IROW = MAX( 1, JC-JKU+1 ) IF( JC.GT.0 ) THEN IL = MIN( M, JC+JKL+1 ) + 1 - IROW CALL ZLAROT( .FALSE., .FALSE., JC+JKL.LT.M, IL, $ C, S, A( IROW-ISKEW*JC+IOFFST, $ JC ), ILDA, DUMMY, EXTRA ) END IF * * Chase "EXTRA" back down * IC = JC DO 110 JCH = JC + JKL, IENDCH, JKL + JKU ILEXTR = IC.GT.0 IF( ILEXTR ) THEN CALL ZLARTG( A( JCH-ISKEW*IC+IOFFST, IC ), $ EXTRA, REALC, S, DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY END IF IC = MAX( 1, IC ) ICOL = MIN( N-1, JCH+JKU ) ILTEMP = JCH + JKU.LT.N CTEMP = CZERO CALL ZLAROT( .TRUE., ILEXTR, ILTEMP, ICOL+2-IC, $ C, S, A( JCH-ISKEW*IC+IOFFST, IC ), $ ILDA, EXTRA, CTEMP ) IF( ILTEMP ) THEN CALL ZLARTG( A( JCH-ISKEW*ICOL+IOFFST, $ ICOL ), CTEMP, REALC, S, DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY IL = MIN( IENDCH, JCH+JKL+JKU ) + 2 - JCH EXTRA = CZERO CALL ZLAROT( .FALSE., .TRUE., $ JCH+JKL+JKU.LE.IENDCH, IL, C, S, $ A( JCH-ISKEW*ICOL+IOFFST, $ ICOL ), ILDA, CTEMP, EXTRA ) IC = ICOL END IF 110 CONTINUE 120 CONTINUE 130 CONTINUE * JKU = UUB DO 160 JKL = 1, LLB * * Transform from bandwidth JKL-1, JKU to JKL, JKU * * First row actually rotated is MIN( N+JKL, M ) * First column actually rotated is N * IENDCH = MIN( N, M+JKU ) - 1 DO 150 JR = MIN( N+JKL, M ) - 1, 1 - JKU, -1 EXTRA = CZERO ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE )*ZLARND( 5, ISEED ) S = SIN( ANGLE )*ZLARND( 5, ISEED ) ICOL = MAX( 1, JR-JKL+1 ) IF( JR.GT.0 ) THEN IL = MIN( N, JR+JKU+1 ) + 1 - ICOL CALL ZLAROT( .TRUE., .FALSE., JR+JKU.LT.N, IL, $ C, S, A( JR-ISKEW*ICOL+IOFFST, $ ICOL ), ILDA, DUMMY, EXTRA ) END IF * * Chase "EXTRA" back down * IR = JR DO 140 JCH = JR + JKU, IENDCH, JKL + JKU ILEXTR = IR.GT.0 IF( ILEXTR ) THEN CALL ZLARTG( A( IR-ISKEW*JCH+IOFFST, JCH ), $ EXTRA, REALC, S, DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY END IF IR = MAX( 1, IR ) IROW = MIN( M-1, JCH+JKL ) ILTEMP = JCH + JKL.LT.M CTEMP = CZERO CALL ZLAROT( .FALSE., ILEXTR, ILTEMP, IROW+2-IR, $ C, S, A( IR-ISKEW*JCH+IOFFST, $ JCH ), ILDA, EXTRA, CTEMP ) IF( ILTEMP ) THEN CALL ZLARTG( A( IROW-ISKEW*JCH+IOFFST, JCH ), $ CTEMP, REALC, S, DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY IL = MIN( IENDCH, JCH+JKL+JKU ) + 2 - JCH EXTRA = CZERO CALL ZLAROT( .TRUE., .TRUE., $ JCH+JKL+JKU.LE.IENDCH, IL, C, S, $ A( IROW-ISKEW*JCH+IOFFST, JCH ), $ ILDA, CTEMP, EXTRA ) IR = IROW END IF 140 CONTINUE 150 CONTINUE 160 CONTINUE * END IF * ELSE * * Symmetric -- A = U D U' * Hermitian -- A = U D U* * IPACKG = IPACK IOFFG = IOFFST * IF( TOPDWN ) THEN * * Top-Down -- Generate Upper triangle only * IF( IPACK.GE.5 ) THEN IPACKG = 6 IOFFG = UUB + 1 ELSE IPACKG = 1 END IF * DO 170 J = 1, MNMIN A( ( 1-ISKEW )*J+IOFFG, J ) = DCMPLX( D( J ) ) 170 CONTINUE * DO 200 K = 1, UUB DO 190 JC = 1, N - 1 IROW = MAX( 1, JC-K ) IL = MIN( JC+1, K+2 ) EXTRA = CZERO CTEMP = A( JC-ISKEW*( JC+1 )+IOFFG, JC+1 ) ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE )*ZLARND( 5, ISEED ) S = SIN( ANGLE )*ZLARND( 5, ISEED ) IF( ZSYM ) THEN CT = C ST = S ELSE CTEMP = DCONJG( CTEMP ) CT = DCONJG( C ) ST = DCONJG( S ) END IF CALL ZLAROT( .FALSE., JC.GT.K, .TRUE., IL, C, S, $ A( IROW-ISKEW*JC+IOFFG, JC ), ILDA, $ EXTRA, CTEMP ) CALL ZLAROT( .TRUE., .TRUE., .FALSE., $ MIN( K, N-JC )+1, CT, ST, $ A( ( 1-ISKEW )*JC+IOFFG, JC ), ILDA, $ CTEMP, DUMMY ) * * Chase EXTRA back up the matrix * ICOL = JC DO 180 JCH = JC - K, 1, -K CALL ZLARTG( A( JCH+1-ISKEW*( ICOL+1 )+IOFFG, $ ICOL+1 ), EXTRA, REALC, S, DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = DCONJG( REALC*DUMMY ) S = DCONJG( -S*DUMMY ) CTEMP = A( JCH-ISKEW*( JCH+1 )+IOFFG, JCH+1 ) IF( ZSYM ) THEN CT = C ST = S ELSE CTEMP = DCONJG( CTEMP ) CT = DCONJG( C ) ST = DCONJG( S ) END IF CALL ZLAROT( .TRUE., .TRUE., .TRUE., K+2, C, S, $ A( ( 1-ISKEW )*JCH+IOFFG, JCH ), $ ILDA, CTEMP, EXTRA ) IROW = MAX( 1, JCH-K ) IL = MIN( JCH+1, K+2 ) EXTRA = CZERO CALL ZLAROT( .FALSE., JCH.GT.K, .TRUE., IL, CT, $ ST, A( IROW-ISKEW*JCH+IOFFG, JCH ), $ ILDA, EXTRA, CTEMP ) ICOL = JCH 180 CONTINUE 190 CONTINUE 200 CONTINUE * * If we need lower triangle, copy from upper. Note that * the order of copying is chosen to work for 'q' -> 'b' * IF( IPACK.NE.IPACKG .AND. IPACK.NE.3 ) THEN DO 230 JC = 1, N IROW = IOFFST - ISKEW*JC IF( ZSYM ) THEN DO 210 JR = JC, MIN( N, JC+UUB ) A( JR+IROW, JC ) = A( JC-ISKEW*JR+IOFFG, JR ) 210 CONTINUE ELSE DO 220 JR = JC, MIN( N, JC+UUB ) A( JR+IROW, JC ) = DCONJG( A( JC-ISKEW*JR+ $ IOFFG, JR ) ) 220 CONTINUE END IF 230 CONTINUE IF( IPACK.EQ.5 ) THEN DO 250 JC = N - UUB + 1, N DO 240 JR = N + 2 - JC, UUB + 1 A( JR, JC ) = CZERO 240 CONTINUE 250 CONTINUE END IF IF( IPACKG.EQ.6 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF END IF ELSE * * Bottom-Up -- Generate Lower triangle only * IF( IPACK.GE.5 ) THEN IPACKG = 5 IF( IPACK.EQ.6 ) $ IOFFG = 1 ELSE IPACKG = 2 END IF * DO 260 J = 1, MNMIN A( ( 1-ISKEW )*J+IOFFG, J ) = DCMPLX( D( J ) ) 260 CONTINUE * DO 290 K = 1, UUB DO 280 JC = N - 1, 1, -1 IL = MIN( N+1-JC, K+2 ) EXTRA = CZERO CTEMP = A( 1+( 1-ISKEW )*JC+IOFFG, JC ) ANGLE = TWOPI*DLARND( 1, ISEED ) C = COS( ANGLE )*ZLARND( 5, ISEED ) S = SIN( ANGLE )*ZLARND( 5, ISEED ) IF( ZSYM ) THEN CT = C ST = S ELSE CTEMP = DCONJG( CTEMP ) CT = DCONJG( C ) ST = DCONJG( S ) END IF CALL ZLAROT( .FALSE., .TRUE., N-JC.GT.K, IL, C, S, $ A( ( 1-ISKEW )*JC+IOFFG, JC ), ILDA, $ CTEMP, EXTRA ) ICOL = MAX( 1, JC-K+1 ) CALL ZLAROT( .TRUE., .FALSE., .TRUE., JC+2-ICOL, $ CT, ST, A( JC-ISKEW*ICOL+IOFFG, $ ICOL ), ILDA, DUMMY, CTEMP ) * * Chase EXTRA back down the matrix * ICOL = JC DO 270 JCH = JC + K, N - 1, K CALL ZLARTG( A( JCH-ISKEW*ICOL+IOFFG, ICOL ), $ EXTRA, REALC, S, DUMMY ) DUMMY = ZLARND( 5, ISEED ) C = REALC*DUMMY S = S*DUMMY CTEMP = A( 1+( 1-ISKEW )*JCH+IOFFG, JCH ) IF( ZSYM ) THEN CT = C ST = S ELSE CTEMP = DCONJG( CTEMP ) CT = DCONJG( C ) ST = DCONJG( S ) END IF CALL ZLAROT( .TRUE., .TRUE., .TRUE., K+2, C, S, $ A( JCH-ISKEW*ICOL+IOFFG, ICOL ), $ ILDA, EXTRA, CTEMP ) IL = MIN( N+1-JCH, K+2 ) EXTRA = CZERO CALL ZLAROT( .FALSE., .TRUE., N-JCH.GT.K, IL, $ CT, ST, A( ( 1-ISKEW )*JCH+IOFFG, $ JCH ), ILDA, CTEMP, EXTRA ) ICOL = JCH 270 CONTINUE 280 CONTINUE 290 CONTINUE * * If we need upper triangle, copy from lower. Note that * the order of copying is chosen to work for 'b' -> 'q' * IF( IPACK.NE.IPACKG .AND. IPACK.NE.4 ) THEN DO 320 JC = N, 1, -1 IROW = IOFFST - ISKEW*JC IF( ZSYM ) THEN DO 300 JR = JC, MAX( 1, JC-UUB ), -1 A( JR+IROW, JC ) = A( JC-ISKEW*JR+IOFFG, JR ) 300 CONTINUE ELSE DO 310 JR = JC, MAX( 1, JC-UUB ), -1 A( JR+IROW, JC ) = DCONJG( A( JC-ISKEW*JR+ $ IOFFG, JR ) ) 310 CONTINUE END IF 320 CONTINUE IF( IPACK.EQ.6 ) THEN DO 340 JC = 1, UUB DO 330 JR = 1, UUB + 1 - JC A( JR, JC ) = CZERO 330 CONTINUE 340 CONTINUE END IF IF( IPACKG.EQ.5 ) THEN IPACKG = IPACK ELSE IPACKG = 0 END IF END IF END IF * * Ensure that the diagonal is real if Hermitian * IF( .NOT.ZSYM ) THEN DO 350 JC = 1, N IROW = IOFFST + ( 1-ISKEW )*JC A( IROW, JC ) = DCMPLX( DBLE( A( IROW, JC ) ) ) 350 CONTINUE END IF * END IF * ELSE * * 4) Generate Banded Matrix by first * Rotating by random Unitary matrices, * then reducing the bandwidth using Householder * transformations. * * Note: we should get here only if LDA .ge. N * IF( ISYM.EQ.1 ) THEN * * Non-symmetric -- A = U D V * CALL ZLAGGE( MR, NC, LLB, UUB, D, A, LDA, ISEED, WORK, $ IINFO ) ELSE * * Symmetric -- A = U D U' or * Hermitian -- A = U D U* * IF( ZSYM ) THEN CALL ZLAGSY( M, LLB, D, A, LDA, ISEED, WORK, IINFO ) ELSE CALL ZLAGHE( M, LLB, D, A, LDA, ISEED, WORK, IINFO ) END IF END IF * IF( IINFO.NE.0 ) THEN INFO = 3 RETURN END IF END IF * * 5) Pack the matrix * IF( IPACK.NE.IPACKG ) THEN IF( IPACK.EQ.1 ) THEN * * 'U' -- Upper triangular, not packed * DO 370 J = 1, M DO 360 I = J + 1, M A( I, J ) = CZERO 360 CONTINUE 370 CONTINUE * ELSE IF( IPACK.EQ.2 ) THEN * * 'L' -- Lower triangular, not packed * DO 390 J = 2, M DO 380 I = 1, J - 1 A( I, J ) = CZERO 380 CONTINUE 390 CONTINUE * ELSE IF( IPACK.EQ.3 ) THEN * * 'C' -- Upper triangle packed Columnwise. * ICOL = 1 IROW = 0 DO 410 J = 1, M DO 400 I = 1, J IROW = IROW + 1 IF( IROW.GT.LDA ) THEN IROW = 1 ICOL = ICOL + 1 END IF A( IROW, ICOL ) = A( I, J ) 400 CONTINUE 410 CONTINUE * ELSE IF( IPACK.EQ.4 ) THEN * * 'R' -- Lower triangle packed Columnwise. * ICOL = 1 IROW = 0 DO 430 J = 1, M DO 420 I = J, M IROW = IROW + 1 IF( IROW.GT.LDA ) THEN IROW = 1 ICOL = ICOL + 1 END IF A( IROW, ICOL ) = A( I, J ) 420 CONTINUE 430 CONTINUE * ELSE IF( IPACK.GE.5 ) THEN * * 'B' -- The lower triangle is packed as a band matrix. * 'Q' -- The upper triangle is packed as a band matrix. * 'Z' -- The whole matrix is packed as a band matrix. * IF( IPACK.EQ.5 ) $ UUB = 0 IF( IPACK.EQ.6 ) $ LLB = 0 * DO 450 J = 1, UUB DO 440 I = MIN( J+LLB, M ), 1, -1 A( I-J+UUB+1, J ) = A( I, J ) 440 CONTINUE 450 CONTINUE * DO 470 J = UUB + 2, N DO 460 I = J - UUB, MIN( J+LLB, M ) A( I-J+UUB+1, J ) = A( I, J ) 460 CONTINUE 470 CONTINUE END IF * * If packed, zero out extraneous elements. * * Symmetric/Triangular Packed -- * zero out everything after A(IROW,ICOL) * IF( IPACK.EQ.3 .OR. IPACK.EQ.4 ) THEN DO 490 JC = ICOL, M DO 480 JR = IROW + 1, LDA A( JR, JC ) = CZERO 480 CONTINUE IROW = 0 490 CONTINUE * ELSE IF( IPACK.GE.5 ) THEN * * Packed Band -- * 1st row is now in A( UUB+2-j, j), zero above it * m-th row is now in A( M+UUB-j,j), zero below it * last non-zero diagonal is now in A( UUB+LLB+1,j ), * zero below it, too. * IR1 = UUB + LLB + 2 IR2 = UUB + M + 2 DO 520 JC = 1, N DO 500 JR = 1, UUB + 1 - JC A( JR, JC ) = CZERO 500 CONTINUE DO 510 JR = MAX( 1, MIN( IR1, IR2-JC ) ), LDA A( JR, JC ) = CZERO 510 CONTINUE 520 CONTINUE END IF END IF * RETURN * * End of ZLATMS * END scalapack-2.0.2/TESTING/BLLT.dat000644 000766 000024 00000001661 10363532303 016257 0ustar00juliestaff000000 000000 'ScaLAPACK, Version 1.2, banded linear systems input file' 'MPI machine' '' output file name (if any) 6 device out 'U' define Lower or Upper 8 number of problem sizes 3 5 17 28 37 121 200 1023 values of N 6 number of bandwidths 1 2 4 10 31 64 values of BW 1 number of NB's -1 values of NB (-1 for automatic determination) 1 number of NRHS's (must be 1) 4 values of NRHS 1 number of NBRHS's (ignored) 1 values of NBRHS (ignored) 4 number of process grids 1 2 3 4 values of "Number of Process Columns" 3.0 threshold scalapack-2.0.2/TESTING/BLU.dat000644 000766 000024 00000001740 10363532303 016142 0ustar00juliestaff000000 000000 'ScaLAPACK, Version 1.2, banded linear systems input file' 'MPI machine' '' output file name (if any) 6 device out 'N' define transpose or not 3 number of problem sizes 3 5 17 28 37 121 200 1023 values of N 3 number of bandwidths 1 3 15 6 13 20 values of BWL 1 1 4 18 24 33 values of BWU 1 number of NB's -1 values of NB (-1 for automatic determination) 1 number of NRHS's (must be 1) 4 values of NRHS 1 number of NBRHS's (ignored) 1 values of NBRHS (ignored) 4 number of process grids 1 2 3 4 values of "Number of Process Columns" 3.0 threshold scalapack-2.0.2/TESTING/BRD.dat000644 000766 000024 00000000552 10363532303 016127 0ustar00juliestaff000000 000000 'ScaLAPACK BRD input file' 'MPI machine' 'BRD.out' output file name (if any) 6 device out 4 number of problems sizes 4 10 17 13 23 31 57 values of M 4 12 13 13 23 31 50 values of N 4 number of NB's 2 3 4 5 values of NB 4 number of processor grids (ordered pairs of P & Q) 1 2 1 4 2 3 8 values of P 1 2 4 1 3 2 1 values of Q 10.0 threshold scalapack-2.0.2/TESTING/CMakeLists.txt000644 000766 000024 00000012512 11656312637 017601 0ustar00juliestaff000000 000000 add_subdirectory(LIN) add_subdirectory(EIG) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/TESTING) file(COPY BLLT.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY BLU.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY BRD.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY EVC.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY HRD.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY INV.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY LLT.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY LS.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY LU.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY NEP.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY QR.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY SEP.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY SEPR.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY SVD.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY TRD.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) add_test(xslu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xslu) add_test(xdlu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdlu) add_test(xclu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xclu) add_test(xzlu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzlu) add_test(xsdblu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsdblu) add_test(xddblu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xddblu) add_test(xcdblu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcdblu) add_test(xzdblu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzdblu) add_test(xsdtlu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsdtlu) add_test(xddtlu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xddtlu) add_test(xcdtlu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcdtlu) add_test(xzdtlu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzdtlu) add_test(xsgblu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsgblu) add_test(xdgblu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdgblu) add_test(xcgblu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcgblu) add_test(xzgblu ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzgblu) add_test(xsllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsllt) add_test(xdllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdllt) add_test(xcllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcllt) add_test(xzllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzllt) add_test(xspbllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xspbllt) add_test(xdpbllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdpbllt) add_test(xcpbllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcpbllt) add_test(xzpbllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzpbllt) add_test(xsptllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsptllt) add_test(xdptllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdptllt) add_test(xcptllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcptllt) add_test(xzptllt ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzptllt) add_test(xsinv ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsinv) add_test(xdinv ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdinv) add_test(xcinv ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcinv) add_test(xzinv ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzinv) add_test(xsqr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsqr) add_test(xdqr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdqr) add_test(xcqr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcqr) add_test(xzqr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzqr) add_test(xsbrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsbrd) add_test(xdbrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdbrd) add_test(xcbrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcbrd) add_test(xzbrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzbrd) add_test(xshrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xshrd) add_test(xdhrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdhrd) add_test(xchrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xchrd) add_test(xzhrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzhrd) add_test(xstrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xstrd) add_test(xdtrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdtrd) add_test(xctrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xctrd) add_test(xztrd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xztrd) add_test(xssvd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xssvd) add_test(xdsvd ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdsvd) add_test(xssep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xssep) add_test(xdsep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdsep) add_test(xcsep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcsep) add_test(xzsep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzsep) add_test(xsgsep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsgsep) add_test(xdgsep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdgsep) add_test(xcgsep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcgsep) add_test(xzgsep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzgsep) add_test(xsnep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsnep) add_test(xdnep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdnep) add_test(xcnep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcnep) add_test(xznep ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xznep) add_test(xcevc ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcevc) add_test(xzevc ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzevc) add_test(xssyevr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xssyevr) add_test(xdsyevr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdsyevr) add_test(xcheevr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcheevr) add_test(xzheevr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzheevr) add_test(xshseqr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xshseqr) add_test(xdhseqr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdhseqr) scalapack-2.0.2/TESTING/EIG/000755 000766 000024 00000000000 11750301630 015426 5ustar00juliestaff000000 000000 scalapack-2.0.2/TESTING/EVC.dat000644 000766 000024 00000000771 10363532303 016140 0ustar00juliestaff000000 000000 'SCALAPACK NEP (Nonsymmetric Eigenvalue Problem) input file' 'MPI Machine' 'EVC.out' output file name (if any) 6 device out 1 number of problems sizes 100 1000 1500 2000 2500 3000 Probs 1 number of NB's 8 values of NB 4 number of process grids (ordered pairs of P & Q) 1 1 4 2 3 2 2 1 values of P 1 4 1 2 3 1 4 8 values of Q 20.0 threshold scalapack-2.0.2/TESTING/HRD.dat000644 000766 000024 00000000600 10363532303 016127 0ustar00juliestaff000000 000000 'ScaLAPACK HRD input file' 'MPI machine' 'HRD.out' output file name (if any) 6 device out 4 number of problems sizes 50 50 50 50 values of N 1 2 3 5 values of ILO 50 48 45 49 values of IHI 3 number of NB's 2 3 4 values of NB 4 number of processor grids (ordered pairs of P & Q) 1 2 1 4 2 3 8 values of P 1 2 4 1 3 2 1 values of Q 3.0 threshold scalapack-2.0.2/TESTING/INV.dat000644 000766 000024 00000000710 10363532303 016150 0ustar00juliestaff000000 000000 'ScaLAPACK, Version 1.0, Matrix Inversion Testing input file' 'MPI machine.' 'INV.out' output file name (if any) 6 device out 5 number of matrix types (next line) 'GEN' 'UTR' 'LTR' 'UPD' 'LPD' GEN, UTR, LTR, UPD, LPD 4 number of problems sizes 2 5 10 15 13 20 30 50 values of N 4 number of NB's 2 3 4 5 6 20 values of NB 4 number of process grids (ordered P & Q) 1 2 1 4 2 3 8 values of P 1 1 4 1 3 2 1 values of Q 1.0 threshold scalapack-2.0.2/TESTING/LIN/000755 000766 000024 00000000000 11750301622 015445 5ustar00juliestaff000000 000000 scalapack-2.0.2/TESTING/LLT.dat000644 000766 000024 00000001056 10363532303 016153 0ustar00juliestaff000000 000000 'ScaLAPACK, LLt factorization input file' 'MPI machine' 'LLT.out' output file name (if any) 6 device out 'U' define Lower or Upper 4 number of problems sizes 4 10 17 13 23 31 57 values of N 3 number of NB's 2 3 4 5 values of NB 3 number of NRHS's 1 3 9 28 values of NRHS 3 number of NBRHS's 1 3 5 7 values of NBRHS 4 number of process grids (ordered pairs P & Q) 1 2 1 4 2 3 8 values of P 1 2 4 1 3 2 1 values of Q 3.0 threshold T (T or F) Test Cond. Est. and Iter. Ref. Routines scalapack-2.0.2/TESTING/LS.dat000644 000766 000024 00000000667 10363532303 016045 0ustar00juliestaff000000 000000 'ScaLAPACK LS solve input file' 'MPI machine' 'LS.out' output file name (if any) 6 device out 3 number of problems sizes 15 7 31 values of M 5 21 31 values of N 2 number of NB's 2 3 5 values of NB 2 number of NRHS's 2 3 5 values of NRHS 2 number of NBRHS's 1 2 values of NBRHS 4 number of process grids (ordered pairs P & Q) 1 1 4 2 2 3 8 values of P 1 4 1 2 3 2 1 values of Q 4.0 threshold scalapack-2.0.2/TESTING/LU.dat000644 000766 000024 00000001000 10363532303 016025 0ustar00juliestaff000000 000000 'SCALAPACK, LU factorization input file' 'MPI Machine' 'LU.out' output file name (if any) 6 device out 4 number of problems sizes 4 10 17 13 23 31 57 values of M 4 12 13 13 23 31 50 values of N 3 number of NB's 2 3 4 5 values of NB 3 number of NRHS's 1 3 9 28 values of NRHS 3 Number of NBRHS's 1 3 5 7 values of NBRHS 4 number of process grids (ordered pairs of P & Q) 1 2 1 4 2 3 8 values of P 1 2 4 1 3 2 1 values of Q 1.0 threshold T (T or F) Test Cond. Est. and Iter. Ref. Routines scalapack-2.0.2/TESTING/NEP.dat000644 000766 000024 00000000756 10363532303 016150 0ustar00juliestaff000000 000000 'SCALAPACK NEP (Nonsymmetric Eigenvalue Problem) input file' 'MPI machine' 'NEP.out' output file name (if any) 6 device out 7 number of problems sizes 1 2 3 4 6 10 50 Probs 3 number of NB's 6 8 17 values of NB 2 number of process grids (ordered pairs of P & Q) 1 2 1 1 4 2 1 values of P 1 2 3 4 1 4 8 values of Q 20.0 threshold scalapack-2.0.2/TESTING/QR.dat000644 000766 000024 00000001014 11703617442 016043 0ustar00juliestaff000000 000000 'ScaLAPACK, Orthogonal factorizations input file' 'MPI machine' 'QR.out' output file name (if any) 6 device out 6 number of factorizations 'QR' 'QL' 'LQ' 'RQ' 'QP' 'TZ' factorizations: QR, QL, LQ, RQ, QP, TZ 4 number of problems sizes 2 5 13 15 13 26 30 15 values of M 2 7 8 10 17 20 30 35 values of N 4 number of blocking sizes 4 3 5 5 4 6 values of MB 4 7 3 5 8 2 values of NB 4 number of process grids (ordered pairs P & Q) 1 2 1 4 2 3 8 values of P 1 2 4 1 3 2 1 values of Q 5.0 threshold scalapack-2.0.2/TESTING/SEP.dat000644 000766 000024 00000010505 10363532303 016146 0ustar00juliestaff000000 000000 'ScaLAPACK Symmetric Eigensolver Test File' ' ' 'sep.out' output file name (if any) 6 device out (13 & 14 reserved for internal testing) 4 maximum number of processes 'N' disable pxsyev tests, recommended for heterogeneous systems. ' ' 'TEST 1 - test tiny matrices - different process configurations' 3 number of matrices 0 1 2 matrix size 1 number of uplo choices 'L' uplo choices 2 number of processor configurations (P, Q, NB) 1 1 values of P (NPROW) 2 1 values of Q (NPCOL) 1 1 values of NB 1 number of matrix types 8 matrix types (see pdseptst.f) 'N' perform subset tests? 80.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 2 - test tiny matrices - all requests' 2 number of matrices 0 1 1 number of uplo choices 'L' uplo choices 1 number of processor configurations (P, Q, NB) 1 values of P (NPROW) 2 values of Q (NPCOL) 1 values of NB 1 number of matrix types 8 matrix types (see pdseptst.f) 'Y' perform subset tests? 80.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 3 - test a small matrix - all types' 1 number of matrices 5 1 number of uplo choices 'L' uplo choices 2 number of processor configurations (P, Q, NB) 1 1 values of P (NPROW) 1 2 values of Q (NPCOL) 1 2 values of NB 22 number of matrix types 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 'N' perform subset tests? 250.0 Threshold -1 Absolute Tolerance ' ' 'TEST 4 - test a small matrix - all requests' 1 number of matrices 4 1 number of uplo choices 'L' uplo choices 2 number of processor configurations (P, Q, NB) 1 1 values of P (NPROW) 1 2 values of Q (NPCOL) 1 2 values of NB 2 number of matrix types 10 22 matrix types 'Y' perform subset tests? 250.0 Threshold -1 Absolute Tolerance ' ' 'TEST 5 - test a small matrix - all processor configurations' 1 number of matrices 6 matrix size 2 number of uplo choices 'L' 'U' uplo choices 13 number of processor configurations (P, Q, NB) 1 1 2 1 2 1 3 1 3 1 2 2 2 values of P (NPROW) 1 1 1 2 1 2 1 3 1 3 2 2 2 values of Q (NPCOL) 1 3 1 1 2 2 1 1 2 2 1 2 3 values of NB 1 number of matrix types 8 matrix types (see pdseptst.f) 'N' perform subset tests? 50.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 6 - test a medium matrix - hard matrix types' 1 number of matrices 21 1 number of uplo choices 'U' uplo choices 1 number of processor configurations (P, Q, NB) 2 values of P (NPROW) 2 values of Q (NPCOL) 8 values of NB 4 number of matrix types 9 10 21 22 'N' perform subset tests? 50.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 7 - test a medium matrix - all processor configurations' 1 number of matrices 27 1 number of uplo choices 'U' uplo choices 13 number of processor configurations (P, Q, NB) 1 1 2 1 2 1 3 1 3 1 2 2 2 values of P (NPROW) 1 1 1 2 1 2 1 3 1 3 2 2 2 values of Q (NPCOL) 1 3 1 1 2 2 1 1 2 2 1 2 3 values of NB 1 number of matrix types 10 matrix types (see pdseptst.f) 'N' perform subset tests? 50.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 8 - test a medium matrix - L and U' 1 number of matrices 24 2 number of uplo choices 'L' 'U' uplo choices 4 number of processor configurations (P, Q, NB) 1 1 3 1 values of P (NPROW) 1 2 1 4 values of Q (NPCOL) 1 3 1 1 values of NB 1 number of matrix types 22 matrix types (see pdseptst.f) 'N' perform subset tests? 20.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 9 - test one large matrix' 1 number of matrices 100 1 number of uplo choices 'U' uplo choices 1 number of processor configurations (P, Q, NB) 2 values of P (NPROW) 2 values of Q (NPCOL) 8 values of NB 1 number of matrix types 8 matrix types (see pdseptst.f) 'N' perform subset tests? 20.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'End of tests' -1 scalapack-2.0.2/TESTING/SEPR.dat000644 000766 000024 00000012113 11645634736 016306 0ustar00juliestaff000000 000000 'ScaLAPACK Symmetric Eigensolver Test File' ' ' 'sepr.out' output file name (if any) 6 device out (13 & 14 reserved for internal testing) 4 maximum number of processes 'Y' Switch set to 'Y' ' ' 'TEST 1 - test tiny matrices - different process configurations' 3 number of matrices 0 1 2 Matrix sizes 1 number of uplo choices 'L' uplo choices 1 number of processor configurations (P, Q, NB) 1 values of P (NPROW) 2 values of Q (NPCOL) 1 values of NB 1 number of matrix types 8 matrix types (see pdseprtst.f) 'N' perform subset tests? 80.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 2 - test tiny matrices - all requests' 2 number of matrices 0 1 Matrix sizes 1 number of uplo choices 'L' uplo choices 1 number of processor configurations (P, Q, NB) 1 values of P (NPROW) 2 values of Q (NPCOL) 1 values of NB 1 number of matrix types 8 matrix types (see pdseprtst.f) 'Y' perform subset tests? 80.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 3 - test a small matrix - all types' 1 number of matrices 5 Matrix sizes 1 number of uplo choices 'L' uplo choices 3 number of processor configurations (P, Q, NB) 1 1 1 values of P (NPROW) 1 2 2 values of Q (NPCOL) 1 1 2 values of NB 22 number of matrix types 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 'N' perform subset tests? 250.0 Threshold -1 Absolute Tolerance ' ' 'TEST 4 - test a small matrix - all requests' 1 number of matrices 4 Matrix sizes 1 number of uplo choices 'L' uplo choices 3 number of processor configurations (P, Q, NB) 1 1 1 values of P (NPROW) 1 2 2 values of Q (NPCOL) 1 1 2 values of NB 2 number of matrix types 10 22 matrix types 'Y' perform subset tests? 250.0 Threshold -1 Absolute Tolerance ' ' 'TEST 5 - test a small matrix - all processor configurations' 1 number of matrices 6 matrix size 2 number of uplo choices 'L' 'U' uplo choices 13 number of processor configurations (P, Q, NB) 1 1 2 1 2 1 3 1 3 1 2 2 2 values of P (NPROW) 1 1 1 2 1 2 1 3 1 3 2 2 2 values of Q (NPCOL) 1 3 1 1 2 2 1 1 2 2 1 2 3 values of NB 1 number of matrix types 8 matrix types (see pdseprtst.f) 'N' perform subset tests? 100.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 6 - test a medium matrix - hard matrix types' 1 number of matrices 21 Matrix sizes 1 number of uplo choices 'U' uplo choices 1 number of processor configurations (P, Q, NB) 2 values of P (NPROW) 2 values of Q (NPCOL) 8 values of NB 4 number of matrix types 9 10 21 22 'N' perform subset tests? 100.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 7 - test a medium matrix - all processor configurations' 1 number of matrices 27 1 number of uplo choices 'U' uplo choices 13 number of processor configurations (P, Q, NB) 1 1 2 1 2 1 3 1 3 1 2 2 2 values of P (NPROW) 1 1 1 2 1 2 1 3 1 3 2 2 2 values of Q (NPCOL) 1 3 1 1 2 2 1 1 2 2 1 2 3 values of NB 1 number of matrix types 10 matrix types (see pdseprtst.f) 'N' perform subset tests? 50.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 8 - test a medium matrix - L and U' 1 number of matrices 24 Matrix sizes 2 number of uplo choices 'L' 'U' uplo choices 4 number of processor configurations (P, Q, NB) 1 1 3 1 values of P (NPROW) 1 2 1 4 values of Q (NPCOL) 1 3 1 1 values of NB 1 number of matrix types 22 matrix types (see pdseprtst.f) 'N' perform subset tests? 20.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 9 - test one larger matrix' 1 number of matrices 100 Matrix sizes 1 number of uplo choices 'U' uplo choices 1 number of processor configurations (P, Q, NB) 2 values of P (NPROW) 2 values of Q (NPCOL) 8 values of NB 1 number of matrix types 8 matrix types (see pdseprtst.f) 'Y' perform subset tests? 150.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'TEST 10 - test one large matrix' 1 number of matrices 500 Matrix sizes 1 number of uplo choices 'U' uplo choices 7 number of processor configurations (P, Q, NB) 1 2 1 2 2 1 4 values of P (NPROW) 1 1 2 2 2 4 1 values of Q (NPCOL) 1 1 1 1 2 4 4 values of NB 1 number of matrix types 8 matrix types (see pdseprtst.f) 'Y' perform subset tests? 250.0 Threshold (* 5 for generalized tests) -1 Absolute Tolerance ' ' 'End of tests' -1 scalapack-2.0.2/TESTING/SVD.dat000644 000766 000024 00000003014 10363532303 016150 0ustar00juliestaff000000 000000 'ScaLAPACK Singular Value Decomposition input file' 6 device out 4 maxnodes ' ' 'TEST 1 - test medium matrices - all types and requests' 20.0 Threshold 1 number of matrices 100 number of rows 25 number of columns 1 number of processor configurations (P, Q, NB) 2 values of P (NPROW) 2 values of Q (NPCOL) 8 values of NB ' ' 'TEST 2 - test medium matrices - all processor configurations' 20.0 Threshold 1 number of matrices 80 number of rows 32 number of columns 1 number of processor configurations (P, Q, NB) 2 values of P (NPROW) 2 values of Q (NPCOL) 8 values of NB ' ' 'TEST 3 - test one large matrix' 15.0 Threshold 1 number of matrices 24 number of rows 8 number of columns 1 number of processor configurations (P, Q, NB) 2 values of P (NPROW) 2 values of Q (NPCOL) 8 values of NB ' ' 'End of tests' -1 scalapack-2.0.2/TESTING/TRD.dat000644 000766 000024 00000000664 10363532303 016155 0ustar00juliestaff000000 000000 'ScaLAPACK TRD computation input file' 'MPI machine' 'TRD.out' output file name 6 device out 'L' define Lower or Upper 4 number of problems sizes 16 50 6 11 21 22 23 values of N 4 number of NB's 1 2 3 4 5 values of NB 3 Number of processor grids (ordered pairs of P & Q) 1 1 4 2 1 3 1 values of P 1 4 1 2 3 1 1 values of Q 10.0 threshold scalapack-2.0.2/TESTING/LIN/CMakeLists.txt000644 000766 000024 00000015743 11656312637 020234 0ustar00juliestaff000000 000000 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/TESTING) set (smatgen psmatgen.f pmatgeninc.f) set (dmatgen pdmatgen.f pmatgeninc.f) set (cmatgen pcmatgen.f pmatgeninc.f) set (zmatgen pzmatgen.f pmatgeninc.f) set (slinchk pslaschk.f pslafchk.f) set (dlinchk pdlaschk.f pdlafchk.f) set (clinchk pclaschk.f pclafchk.f) set (zlinchk pzlaschk.f pzlafchk.f) add_executable(xslu psludriver.f psluinfo.f psgetrrv.f ${smatgen} ${slinchk}) add_executable(xdlu pdludriver.f pdluinfo.f pdgetrrv.f ${dmatgen} ${dlinchk}) add_executable(xclu pcludriver.f pcluinfo.f pcgetrrv.f ${cmatgen} ${clinchk}) add_executable(xzlu pzludriver.f pzluinfo.f pzgetrrv.f ${zmatgen} ${zlinchk}) add_executable(xsdblu psdbdriver.f psdbinfo.f psdblaschk.f psdbmv1.f psbmatgen.f ${smatgen}) add_executable(xddblu pddbdriver.f pddbinfo.f pddblaschk.f pddbmv1.f pdbmatgen.f ${dmatgen}) add_executable(xcdblu pcdbdriver.f pcdbinfo.f pcdblaschk.f pcdbmv1.f pcbmatgen.f ${cmatgen}) add_executable(xzdblu pzdbdriver.f pzdbinfo.f pzdblaschk.f pzdbmv1.f pzbmatgen.f ${zmatgen}) add_executable(xsdtlu psdtdriver.f psdtinfo.f psdtlaschk.f psdbmv1.f psbmatgen.f ${smatgen}) add_executable(xddtlu pddtdriver.f pddtinfo.f pddtlaschk.f pddbmv1.f pdbmatgen.f ${dmatgen}) add_executable(xcdtlu pcdtdriver.f pcdtinfo.f pcdtlaschk.f pcdbmv1.f pcbmatgen.f ${cmatgen}) add_executable(xzdtlu pzdtdriver.f pzdtinfo.f pzdtlaschk.f pzdbmv1.f pzbmatgen.f ${zmatgen}) add_executable(xsgblu psgbdriver.f psgbinfo.f psdblaschk.f psgbmv1.f psbmatgen.f ${smatgen}) add_executable(xdgblu pdgbdriver.f pdgbinfo.f pddblaschk.f pdgbmv1.f pdbmatgen.f ${dmatgen}) add_executable(xcgblu pcgbdriver.f pcgbinfo.f pcdblaschk.f pcgbmv1.f pcbmatgen.f ${cmatgen}) add_executable(xzgblu pzgbdriver.f pzgbinfo.f pzdblaschk.f pzgbmv1.f pzbmatgen.f ${zmatgen}) add_executable(xsllt pslltdriver.f pslltinfo.f pspotrrv.f ${smatgen} ${slinchk}) add_executable(xdllt pdlltdriver.f pdlltinfo.f pdpotrrv.f ${dmatgen} ${dlinchk}) add_executable(xcllt pclltdriver.f pclltinfo.f pcpotrrv.f ${cmatgen} ${clinchk}) add_executable(xzllt pzlltdriver.f pzlltinfo.f pzpotrrv.f ${zmatgen} ${zlinchk}) add_executable(xspbllt pspbdriver.f pspbinfo.f pspblaschk.f pspbmv1.f psbmatgen.f ${smatgen}) add_executable(xdpbllt pdpbdriver.f pdpbinfo.f pdpblaschk.f pdpbmv1.f pdbmatgen.f ${dmatgen}) add_executable(xcpbllt pcpbdriver.f pcpbinfo.f pcpblaschk.f pcpbmv1.f pcbmatgen.f ${cmatgen}) add_executable(xzpbllt pzpbdriver.f pzpbinfo.f pzpblaschk.f pzpbmv1.f pzbmatgen.f ${zmatgen}) add_executable(xsptllt psptdriver.f psptinfo.f psptlaschk.f pspbmv1.f psbmatgen.f ${smatgen}) add_executable(xdptllt pdptdriver.f pdptinfo.f pdptlaschk.f pdpbmv1.f pdbmatgen.f ${dmatgen}) add_executable(xcptllt pcptdriver.f pcptinfo.f pcptlaschk.f pcpbmv1.f pcbmatgen.f ${cmatgen}) add_executable(xzptllt pzptdriver.f pzptinfo.f pzptlaschk.f pzpbmv1.f pzbmatgen.f ${zmatgen}) add_executable(xsinv psinvdriver.f psinvinfo.f psinvchk.f ${smatgen}) add_executable(xdinv pdinvdriver.f pdinvinfo.f pdinvchk.f ${dmatgen}) add_executable(xcinv pcinvdriver.f pcinvinfo.f pcinvchk.f ${cmatgen}) add_executable(xzinv pzinvdriver.f pzinvinfo.f pzinvchk.f ${zmatgen}) add_executable(xsqr psqrdriver.f psqrinfo.f psgeqrrv.f psgeqlrv.f psgelqrv.f psgerqrv.f pstzrzrv.f pslafchk.f ${smatgen}) add_executable(xdqr pdqrdriver.f pdqrinfo.f pdgeqrrv.f pdgeqlrv.f pdgelqrv.f pdgerqrv.f pdtzrzrv.f pdlafchk.f ${dmatgen}) add_executable(xcqr pcqrdriver.f pcqrinfo.f pcgeqrrv.f pcgeqlrv.f pcgelqrv.f pcgerqrv.f pctzrzrv.f pclafchk.f ${cmatgen}) add_executable(xzqr pzqrdriver.f pzqrinfo.f pzgeqrrv.f pzgeqlrv.f pzgelqrv.f pzgerqrv.f pztzrzrv.f pzlafchk.f ${zmatgen}) add_executable(xsls pslsdriver.f pslsinfo.f psqrt13.f psqrt14.f psqrt16.f psqrt17.f ${smatgen}) add_executable(xdls pdlsdriver.f pdlsinfo.f pdqrt13.f pdqrt14.f pdqrt16.f pdqrt17.f ${dmatgen}) add_executable(xcls pclsdriver.f pclsinfo.f pcqrt13.f pcqrt14.f pcqrt16.f pcqrt17.f ${cmatgen}) add_executable(xzls pzlsdriver.f pzlsinfo.f pzqrt13.f pzqrt14.f pzqrt16.f pzqrt17.f ${zmatgen}) target_link_libraries(xslu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdlu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xclu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzlu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsdblu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xddblu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcdblu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzdblu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsdtlu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xddtlu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcdtlu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzdtlu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsgblu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdgblu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcgblu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzgblu scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xspbllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdpbllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcpbllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzpbllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsptllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdptllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcptllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzptllt scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsinv scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdinv scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcinv scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzinv scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsls scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdls scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcls scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzls scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) scalapack-2.0.2/TESTING/LIN/Makefile000644 000766 000024 00000030033 11707554745 017126 0ustar00juliestaff000000 000000 ############################################################################ # # Program: ScaLAPACK # # Module: Makefile # # Purpose: Linear Equations Testing Makefile # # Creation date: March 20, 1995 # # Modified: # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../SLmake.inc sluexe = ../xslu dluexe = ../xdlu cluexe = ../xclu zluexe = ../xzlu sdbluexe = ../xsdblu ddbluexe = ../xddblu cdbluexe = ../xcdblu zdbluexe = ../xzdblu sdtluexe = ../xsdtlu ddtluexe = ../xddtlu cdtluexe = ../xcdtlu zdtluexe = ../xzdtlu sgbluexe = ../xsgblu dgbluexe = ../xdgblu cgbluexe = ../xcgblu zgbluexe = ../xzgblu slltexe = ../xsllt dlltexe = ../xdllt clltexe = ../xcllt zlltexe = ../xzllt spblltexe = ../xspbllt dpblltexe = ../xdpbllt cpblltexe = ../xcpbllt zpblltexe = ../xzpbllt sptlltexe = ../xsptllt dptlltexe = ../xdptllt cptlltexe = ../xcptllt zptlltexe = ../xzptllt sinvexe = ../xsinv dinvexe = ../xdinv cinvexe = ../xcinv zinvexe = ../xzinv sqrexe = ../xsqr dqrexe = ../xdqr cqrexe = ../xcqr zqrexe = ../xzqr slsexe = ../xsls dlsexe = ../xdls clsexe = ../xcls zlsexe = ../xzls smatgen = psmatgen.o pmatgeninc.o dmatgen = pdmatgen.o pmatgeninc.o cmatgen = pcmatgen.o pmatgeninc.o zmatgen = pzmatgen.o pmatgeninc.o slinchk = pslaschk.o pslafchk.o dlinchk = pdlaschk.o pdlafchk.o clinchk = pclaschk.o pclafchk.o zlinchk = pzlaschk.o pzlafchk.o slu = psludriver.o psluinfo.o psgetrrv.o $(smatgen) $(slinchk) dlu = pdludriver.o pdluinfo.o pdgetrrv.o $(dmatgen) $(dlinchk) clu = pcludriver.o pcluinfo.o pcgetrrv.o $(cmatgen) $(clinchk) zlu = pzludriver.o pzluinfo.o pzgetrrv.o $(zmatgen) $(zlinchk) sdblu = psdbdriver.o psdbinfo.o psdblaschk.o psdbmv1.o psbmatgen.o $(smatgen) ddblu = pddbdriver.o pddbinfo.o pddblaschk.o pddbmv1.o pdbmatgen.o $(dmatgen) cdblu = pcdbdriver.o pcdbinfo.o pcdblaschk.o pcdbmv1.o pcbmatgen.o $(cmatgen) zdblu = pzdbdriver.o pzdbinfo.o pzdblaschk.o pzdbmv1.o pzbmatgen.o $(zmatgen) sdtlu = psdtdriver.o psdtinfo.o psdtlaschk.o psdbmv1.o psbmatgen.o $(smatgen) ddtlu = pddtdriver.o pddtinfo.o pddtlaschk.o pddbmv1.o pdbmatgen.o $(dmatgen) cdtlu = pcdtdriver.o pcdtinfo.o pcdtlaschk.o pcdbmv1.o pcbmatgen.o $(cmatgen) zdtlu = pzdtdriver.o pzdtinfo.o pzdtlaschk.o pzdbmv1.o pzbmatgen.o $(zmatgen) sgblu = psgbdriver.o psgbinfo.o psdblaschk.o psgbmv1.o psbmatgen.o $(smatgen) dgblu = pdgbdriver.o pdgbinfo.o pddblaschk.o pdgbmv1.o pdbmatgen.o $(dmatgen) cgblu = pcgbdriver.o pcgbinfo.o pcdblaschk.o pcgbmv1.o pcbmatgen.o $(cmatgen) zgblu = pzgbdriver.o pzgbinfo.o pzdblaschk.o pzgbmv1.o pzbmatgen.o $(zmatgen) sllt = pslltdriver.o pslltinfo.o pspotrrv.o $(smatgen) $(slinchk) dllt = pdlltdriver.o pdlltinfo.o pdpotrrv.o $(dmatgen) $(dlinchk) cllt = pclltdriver.o pclltinfo.o pcpotrrv.o $(cmatgen) $(clinchk) zllt = pzlltdriver.o pzlltinfo.o pzpotrrv.o $(zmatgen) $(zlinchk) spbllt = pspbdriver.o pspbinfo.o pspblaschk.o pspbmv1.o psbmatgen.o $(smatgen) dpbllt = pdpbdriver.o pdpbinfo.o pdpblaschk.o pdpbmv1.o pdbmatgen.o $(dmatgen) cpbllt = pcpbdriver.o pcpbinfo.o pcpblaschk.o pcpbmv1.o pcbmatgen.o $(cmatgen) zpbllt = pzpbdriver.o pzpbinfo.o pzpblaschk.o pzpbmv1.o pzbmatgen.o $(zmatgen) sptllt = psptdriver.o psptinfo.o psptlaschk.o pspbmv1.o psbmatgen.o $(smatgen) dptllt = pdptdriver.o pdptinfo.o pdptlaschk.o pdpbmv1.o pdbmatgen.o $(dmatgen) cptllt = pcptdriver.o pcptinfo.o pcptlaschk.o pcpbmv1.o pcbmatgen.o $(cmatgen) zptllt = pzptdriver.o pzptinfo.o pzptlaschk.o pzpbmv1.o pzbmatgen.o $(zmatgen) sinv = psinvdriver.o psinvinfo.o psinvchk.o $(smatgen) dinv = pdinvdriver.o pdinvinfo.o pdinvchk.o $(dmatgen) cinv = pcinvdriver.o pcinvinfo.o pcinvchk.o $(cmatgen) zinv = pzinvdriver.o pzinvinfo.o pzinvchk.o $(zmatgen) sqr = psqrdriver.o psqrinfo.o psgeqrrv.o psgeqlrv.o psgelqrv.o \ psgerqrv.o pstzrzrv.o pslafchk.o $(smatgen) dqr = pdqrdriver.o pdqrinfo.o pdgeqrrv.o pdgeqlrv.o pdgelqrv.o \ pdgerqrv.o pdtzrzrv.o pdlafchk.o $(dmatgen) cqr = pcqrdriver.o pcqrinfo.o pcgeqrrv.o pcgeqlrv.o pcgelqrv.o \ pcgerqrv.o pctzrzrv.o pclafchk.o $(cmatgen) zqr = pzqrdriver.o pzqrinfo.o pzgeqrrv.o pzgeqlrv.o pzgelqrv.o \ pzgerqrv.o pztzrzrv.o pzlafchk.o $(zmatgen) sls = pslsdriver.o pslsinfo.o psqrt13.o psqrt14.o psqrt16.o \ psqrt17.o $(smatgen) dls = pdlsdriver.o pdlsinfo.o pdqrt13.o pdqrt14.o pdqrt16.o \ pdqrt17.o $(dmatgen) cls = pclsdriver.o pclsinfo.o pcqrt13.o pcqrt14.o pcqrt16.o \ pcqrt17.o $(cmatgen) zls = pzlsdriver.o pzlsinfo.o pzqrt13.o pzqrt14.o pzqrt16.o \ pzqrt17.o $(zmatgen) all : single double complex complex16 single: $(sluexe) $(sdbluexe) $(sdtluexe) $(sgbluexe) $(slltexe) \ $(spblltexe) $(sptlltexe) $(sinvexe) $(sqrexe) $(slsexe) double: $(dluexe) $(ddbluexe) $(ddtluexe) $(dgbluexe) $(dlltexe) \ $(dpblltexe) $(dptlltexe) $(dinvexe) $(dqrexe) $(dlsexe) complex: $(cluexe) $(cdbluexe) $(cdtluexe) $(cgbluexe) $(clltexe) \ $(cpblltexe) $(cptlltexe) $(cinvexe) $(cqrexe) $(clsexe) complex16: $(zluexe) $(zdbluexe) $(zdtluexe) $(zgbluexe) $(zlltexe) \ $(zpblltexe) $(zptlltexe) $(zinvexe) $(zqrexe) $(zlsexe) $(sluexe): ../../$(SCALAPACKLIB) $(slu) $(FCLOADER) $(FCLOADFLAGS) -o $(sluexe) $(slu) ../../$(SCALAPACKLIB) $(LIBS) $(dluexe): ../../$(SCALAPACKLIB) $(dlu) $(FCLOADER) $(FCLOADFLAGS) -o $(dluexe) $(dlu) ../../$(SCALAPACKLIB) $(LIBS) $(cluexe): ../../$(SCALAPACKLIB) $(clu) $(FCLOADER) $(FCLOADFLAGS) -o $(cluexe) $(clu) ../../$(SCALAPACKLIB) $(LIBS) $(zluexe): ../../$(SCALAPACKLIB) $(zlu) $(FCLOADER) $(FCLOADFLAGS) -o $(zluexe) $(zlu) ../../$(SCALAPACKLIB) $(LIBS) $(sdbluexe): ../../$(SCALAPACKLIB) $(sdblu) $(FCLOADER) $(FCLOADFLAGS) -o $(sdbluexe) $(sdblu) ../../$(SCALAPACKLIB) $(LIBS) $(ddbluexe): ../../$(SCALAPACKLIB) $(ddblu) $(FCLOADER) $(FCLOADFLAGS) -o $(ddbluexe) $(ddblu) ../../$(SCALAPACKLIB) $(LIBS) $(cdbluexe): ../../$(SCALAPACKLIB) $(cdblu) $(FCLOADER) $(FCLOADFLAGS) -o $(cdbluexe) $(cdblu) ../../$(SCALAPACKLIB) $(LIBS) $(zdbluexe): ../../$(SCALAPACKLIB) $(zdblu) $(FCLOADER) $(FCLOADFLAGS) -o $(zdbluexe) $(zdblu) ../../$(SCALAPACKLIB) $(LIBS) $(sdtluexe): ../../$(SCALAPACKLIB) $(sdtlu) $(FCLOADER) $(FCLOADFLAGS) -o $(sdtluexe) $(sdtlu) ../../$(SCALAPACKLIB) $(LIBS) $(ddtluexe): ../../$(SCALAPACKLIB) $(ddtlu) $(FCLOADER) $(FCLOADFLAGS) -o $(ddtluexe) $(ddtlu) ../../$(SCALAPACKLIB) $(LIBS) $(cdtluexe): ../../$(SCALAPACKLIB) $(cdtlu) $(FCLOADER) $(FCLOADFLAGS) -o $(cdtluexe) $(cdtlu) ../../$(SCALAPACKLIB) $(LIBS) $(zdtluexe): ../../$(SCALAPACKLIB) $(zdtlu) $(FCLOADER) $(FCLOADFLAGS) -o $(zdtluexe) $(zdtlu) ../../$(SCALAPACKLIB) $(LIBS) $(sgbluexe): ../../$(SCALAPACKLIB) $(sgblu) $(FCLOADER) $(FCLOADFLAGS) -o $(sgbluexe) $(sgblu) ../../$(SCALAPACKLIB) $(LIBS) $(dgbluexe): ../../$(SCALAPACKLIB) $(dgblu) $(FCLOADER) $(FCLOADFLAGS) -o $(dgbluexe) $(dgblu) ../../$(SCALAPACKLIB) $(LIBS) $(cgbluexe): ../../$(SCALAPACKLIB) $(cgblu) $(FCLOADER) $(FCLOADFLAGS) -o $(cgbluexe) $(cgblu) ../../$(SCALAPACKLIB) $(LIBS) $(zgbluexe): ../../$(SCALAPACKLIB) $(zgblu) $(FCLOADER) $(FCLOADFLAGS) -o $(zgbluexe) $(zgblu) ../../$(SCALAPACKLIB) $(LIBS) $(slltexe): ../../$(SCALAPACKLIB) $(sllt) $(FCLOADER) $(FCLOADFLAGS) -o $(slltexe) $(sllt) ../../$(SCALAPACKLIB) $(LIBS) $(dlltexe): ../../$(SCALAPACKLIB) $(dllt) $(FCLOADER) $(FCLOADFLAGS) -o $(dlltexe) $(dllt) ../../$(SCALAPACKLIB) $(LIBS) $(clltexe): ../../$(SCALAPACKLIB) $(cllt) $(FCLOADER) $(FCLOADFLAGS) -o $(clltexe) $(cllt) ../../$(SCALAPACKLIB) $(LIBS) $(zlltexe): ../../$(SCALAPACKLIB) $(zllt) $(FCLOADER) $(FCLOADFLAGS) -o $(zlltexe) $(zllt) ../../$(SCALAPACKLIB) $(LIBS) $(spblltexe): ../../$(SCALAPACKLIB) $(spbllt) $(FCLOADER) $(FCLOADFLAGS) -o $(spblltexe) $(spbllt) ../../$(SCALAPACKLIB) $(LIBS) $(dpblltexe): ../../$(SCALAPACKLIB) $(dpbllt) $(FCLOADER) $(FCLOADFLAGS) -o $(dpblltexe) $(dpbllt) ../../$(SCALAPACKLIB) $(LIBS) $(cpblltexe): ../../$(SCALAPACKLIB) $(cpbllt) $(FCLOADER) $(FCLOADFLAGS) -o $(cpblltexe) $(cpbllt) ../../$(SCALAPACKLIB) $(LIBS) $(zpblltexe): ../../$(SCALAPACKLIB) $(zpbllt) $(FCLOADER) $(FCLOADFLAGS) -o $(zpblltexe) $(zpbllt) ../../$(SCALAPACKLIB) $(LIBS) $(sptlltexe): ../../$(SCALAPACKLIB) $(sptllt) $(FCLOADER) $(FCLOADFLAGS) -o $(sptlltexe) $(sptllt) ../../$(SCALAPACKLIB) $(LIBS) $(dptlltexe): ../../$(SCALAPACKLIB) $(dptllt) $(FCLOADER) $(FCLOADFLAGS) -o $(dptlltexe) $(dptllt) ../../$(SCALAPACKLIB) $(LIBS) $(cptlltexe): ../../$(SCALAPACKLIB) $(cptllt) $(FCLOADER) $(FCLOADFLAGS) -o $(cptlltexe) $(cptllt) ../../$(SCALAPACKLIB) $(LIBS) $(zptlltexe): ../../$(SCALAPACKLIB) $(zptllt) $(FCLOADER) $(FCLOADFLAGS) -o $(zptlltexe) $(zptllt) ../../$(SCALAPACKLIB) $(LIBS) $(sinvexe): ../../$(SCALAPACKLIB) $(sinv) $(FCLOADER) $(FCLOADFLAGS) -o $(sinvexe) $(sinv) ../../$(SCALAPACKLIB) $(LIBS) $(dinvexe): ../../$(SCALAPACKLIB) $(dinv) $(FCLOADER) $(FCLOADFLAGS) -o $(dinvexe) $(dinv) ../../$(SCALAPACKLIB) $(LIBS) $(cinvexe): ../../$(SCALAPACKLIB) $(cinv) $(FCLOADER) $(FCLOADFLAGS) -o $(cinvexe) $(cinv) ../../$(SCALAPACKLIB) $(LIBS) $(zinvexe): ../../$(SCALAPACKLIB) $(zinv) $(FCLOADER) $(FCLOADFLAGS) -o $(zinvexe) $(zinv) ../../$(SCALAPACKLIB) $(LIBS) $(sqrexe): ../../$(SCALAPACKLIB) $(sqr) $(FCLOADER) $(FCLOADFLAGS) -o $(sqrexe) $(sqr) ../../$(SCALAPACKLIB) $(LIBS) $(dqrexe): ../../$(SCALAPACKLIB) $(dqr) $(FCLOADER) $(FCLOADFLAGS) -o $(dqrexe) $(dqr) ../../$(SCALAPACKLIB) $(LIBS) $(cqrexe): ../../$(SCALAPACKLIB) $(cqr) $(FCLOADER) $(FCLOADFLAGS) -o $(cqrexe) $(cqr) ../../$(SCALAPACKLIB) $(LIBS) $(zqrexe): ../../$(SCALAPACKLIB) $(zqr) $(FCLOADER) $(FCLOADFLAGS) -o $(zqrexe) $(zqr) ../../$(SCALAPACKLIB) $(LIBS) $(slsexe): ../../$(SCALAPACKLIB) $(sls) $(FCLOADER) $(FCLOADFLAGS) -o $(slsexe) $(sls) ../../$(SCALAPACKLIB) $(LIBS) $(dlsexe): ../../$(SCALAPACKLIB) $(dls) $(FCLOADER) $(FCLOADFLAGS) -o $(dlsexe) $(dls) ../../$(SCALAPACKLIB) $(LIBS) $(clsexe): ../../$(SCALAPACKLIB) $(cls) $(FCLOADER) $(FCLOADFLAGS) -o $(clsexe) $(cls) ../../$(SCALAPACKLIB) $(LIBS) $(zlsexe): ../../$(SCALAPACKLIB) $(zls) $(FCLOADER) $(FCLOADFLAGS) -o $(zlsexe) $(zls) ../../$(SCALAPACKLIB) $(LIBS) clean : rm -f *.o psludriver.o: psludriver.f $(FC) $(FCFLAGS) -c $< pdludriver.o: pdludriver.f $(FC) $(FCFLAGS) -c $< pcludriver.o: pcludriver.f $(FC) $(FCFLAGS) -c $< pzludriver.o: pzludriver.f $(FC) $(FCFLAGS) -c $< psdbdriver.o: psdbdriver.f $(FC) $(FCFLAGS) -c $< pddbdriver.o: pddbdriver.f $(FC) $(FCFLAGS) -c $< pcdbdriver.o: pcdbdriver.f $(FC) $(FCFLAGS) -c $< pzdbdriver.o: pzdbdriver.f $(FC) $(FCFLAGS) -c $< psdtdriver.o: psdtdriver.f $(FC) $(FCFLAGS) -c $< pddtdriver.o: pddtdriver.f $(FC) $(FCFLAGS) -c $< pcdtdriver.o: pcdtdriver.f $(FC) $(FCFLAGS) -c $< pzdtdriver.o: pzdtdriver.f $(FC) $(FCFLAGS) -c $< psgbdriver.o: psgbdriver.f $(FC) $(FCFLAGS) -c $< pdgbdriver.o: pdgbdriver.f $(FC) $(FCFLAGS) -c $< pcgbdriver.o: pcgbdriver.f $(FC) $(FCFLAGS) -c $< pzgbdriver.o: pzgbdriver.f $(FC) $(FCFLAGS) -c $< pslltdriver.o: pslltdriver.f $(FC) $(FCFLAGS) -c $< pdlltdriver.o: pdlltdriver.f $(FC) $(FCFLAGS) -c $< pclltdriver.o: pclltdriver.f $(FC) $(FCFLAGS) -c $< pzlltdriver.o: pzlltdriver.f $(FC) $(FCFLAGS) -c $< pspbdriver.o: pspbdriver.f $(FC) $(FCFLAGS) -c $< pdpbdriver.o: pdpbdriver.f $(FC) $(FCFLAGS) -c $< pcpbdriver.o: pcpbdriver.f $(FC) $(FCFLAGS) -c $< pzpbdriver.o: pzpbdriver.f $(FC) $(FCFLAGS) -c $< psptdriver.o: psptdriver.f $(FC) $(FCFLAGS) -c $< pdptdriver.o: pdptdriver.f $(FC) $(FCFLAGS) -c $< pcptdriver.o: pcptdriver.f $(FC) $(FCFLAGS) -c $< pzptdriver.o: pzptdriver.f $(FC) $(FCFLAGS) -c $< psinvdriver.o: psinvdriver.f $(FC) $(FCFLAGS) -c $< pdinvdriver.o: pdinvdriver.f $(FC) $(FCFLAGS) -c $< pcinvdriver.o: pcinvdriver.f $(FC) $(FCFLAGS) -c $< pzinvdriver.o: pzinvdriver.f $(FC) $(FCFLAGS) -c $< psqrdriver.o: psqrdriver.f $(FC) $(FCFLAGS) -c $< pdqrdriver.o: pdqrdriver.f $(FC) $(FCFLAGS) -c $< pcqrdriver.o: pcqrdriver.f $(FC) $(FCFLAGS) -c $< pzqrdriver.o: pzqrdriver.f $(FC) $(FCFLAGS) -c $< pslsdriver.o: pslsdriver.f $(FC) $(FCFLAGS) -c $< pdlsdriver.o: pdlsdriver.f $(FC) $(FCFLAGS) -c $< pclsdriver.o: pclsdriver.f $(FC) $(FCFLAGS) -c $< pzlsdriver.o: pzlsdriver.f $(FC) $(FCFLAGS) -c $< .f.o : ; $(FC) -c $(FCFLAGS) $*.f scalapack-2.0.2/TESTING/LIN/pcbmatgen.f000644 000766 000024 00000017136 10363532303 017565 0ustar00juliestaff000000 000000 SUBROUTINE PCBMATGEN( ICTXT, AFORM, AFORM2, BWL, BWU, N, $ MB, NB, A, $ LDA, IAROW, IACOL, ISEED, $ MYROW, MYCOL, NPROW, NPCOL ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. * .. Scalar Arguments .. CHARACTER*1 AFORM, AFORM2 INTEGER IACOL, IAROW, ICTXT, $ ISEED, LDA, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW, BWL, BWU * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PCBMATGEN : Parallel Complex Single precision Band MATrix GENerator. * (Re)Generate a distributed Band matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'L' : A is returned as a hermitian lower * triangular matrix, and is diagonally dominant. * if AFORM = 'U' : A is returned as a hermitian upper * triangular matrix, and is diagonally dominant. * if AFORM = 'G' : A is returned as a general matrix. * if AFORM = 'T' : A is returned as a general matrix in * tridiagonal-compatible form. * * AFORM2 (global input) CHARACTER*1 * if the matrix is general: * if AFORM2 = 'D' : A is returned diagonally dominant. * if AFORM2 != 'D' : A is not returned diagonally dominant. * if the matrix is symmetric or hermitian: * if AFORM2 = 'T' : A is returned in tridiagonally-compatible * form (a transpose form). * if AFORM2 != 'T' : A is returned in banded-compatible form. * * M (global input) INTEGER * The number of nonzero rows in the generated distributed * band matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) COMPLEX, pointer into the local memory to * an array of dimension ( LDA, * ) containing the local * pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * A( DIAG_INDEX, I ) = A( DIAG_INDEX, I ) + BWL+BWU * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * This code is a simple wrapper around PCMATGEN, for band matrices. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER DIAG_INDEX, I, J, M_MATGEN, NQ, N_MATGEN, $ START_INDEX * .. * .. External Subroutines .. EXTERNAL PCMATGEN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC, LSAME * .. * .. Executable Statements .. * * IF( LSAME( AFORM, 'L' ).OR.LSAME( AFORM, 'U' ) ) THEN M_MATGEN = BWL + 1 N_MATGEN = N START_INDEX = 1 IF( LSAME( AFORM, 'L' ) ) THEN DIAG_INDEX = 1 ELSE DIAG_INDEX = BWL + 1 ENDIF ELSE M_MATGEN = BWL + BWU + 1 N_MATGEN = N DIAG_INDEX = BWU + 1 START_INDEX = 1 ENDIF * NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * * * Generate a random matrix initially * IF( LSAME( AFORM, 'T' ) .OR. $ ( LSAME( AFORM2, 'T' ) ) ) THEN * CALL PCMATGEN( ICTXT, 'T', 'N', $ N_MATGEN, M_MATGEN, $ NB, M_MATGEN, A( START_INDEX, 1 ), $ LDA, IAROW, IACOL, $ ISEED, 0, NQ, 0, M_MATGEN, $ MYCOL, MYROW, NPCOL, NPROW ) * ELSE * CALL PCMATGEN( ICTXT, 'N', 'N', $ M_MATGEN, N_MATGEN, $ M_MATGEN, NB, A( START_INDEX, 1 ), $ LDA, IAROW, IACOL, $ ISEED, 0, M_MATGEN, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Zero out padding at tops of columns * DO 1000 J=1,NB * DO 2000 I=1, LDA-M_MATGEN * * Indexing goes negative; BMATGEN assumes that space * has been preallocated above the first column as it * has to be if the matrix is to be input to * Scalapack's band solvers. * A( I-LDA+M_MATGEN, J ) = CZERO * 2000 CONTINUE * 1000 CONTINUE * ENDIF * IF( LSAME( AFORM2, 'D' ).OR. $ ( LSAME( AFORM, 'L' ).OR.LSAME( AFORM, 'U' ) ) ) THEN * * Loop over diagonal elements stored on this processor. * * DO 330 I=1, NQ IF( LSAME( AFORM, 'T' ) .OR. $ ( LSAME( AFORM2, 'T' ) ) ) THEN IF( NPROW .EQ. 1 ) THEN A( I, DIAG_INDEX ) = CMPLX( REAL( A( I, DIAG_INDEX ) ) $ + REAL( 2*( BWL+BWU+1 ) ) ) ENDIF ELSE IF( NPROW .EQ. 1 ) THEN A( DIAG_INDEX, I ) = CMPLX( REAL( A( DIAG_INDEX, I ) ) $ + REAL( 2*( BWL+BWU+1 ) ) ) ENDIF END IF 330 CONTINUE * * ELSE * * Must add elements to keep condition of matrix in check * DO 380 I=1, NQ * IF( NPROW .EQ. 1 ) THEN * IF( MOD(I+MYCOL*NB,2) .EQ. 1 ) THEN A( DIAG_INDEX+1, I ) = $ CMPLX( REAL( A( DIAG_INDEX+1, I ) ) $ + REAL( 2*( BWL+BWU+1 ) ) ) * ELSE * A( DIAG_INDEX-1, I ) = $ CMPLX( REAL( A( DIAG_INDEX-1, I ) ) $ + REAL( 2*( BWL+BWU+1 ) ) ) ENDIF * ENDIF * 380 CONTINUE * END IF * RETURN * * End of PCBMATGEN * END scalapack-2.0.2/TESTING/LIN/pcdbdriver.f000644 000766 000024 00000076301 10363532303 017750 0ustar00juliestaff000000 000000 PROGRAM PCDBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PCDBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by CDB. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM DOUBLE PRECISION array, dimension ( TOTMEM/CPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER CPLXSZ, MEMSIZ, NTESTS COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCBMATGEN, $ PCCHEKPAD, PCDBINFO, PCDBLASCHK, PCDBTRF, $ PCDBTRS, PCFILLPAD, PCMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PCLANGE EXTERNAL LSAME, NUMROC, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PCDBINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = BWLVAL( BW_NUM ) IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = BWUVAL( BW_NUM ) IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*MAX(BWL,BWU)-1)/NPCOL + 1 ) $ + MAX(BWL,BWU) NB = MAX( NB, 2*MAX(BWL,BWU) ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*MAX(BWL,BWU), N ) ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (BWL+BWU+1), (BWL+BWU+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((BWL+BWU+1)+10) IMIDPAD = 10 IPOSTPAD = ((BWL+BWU+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (BWL+BWU+1), N, $ (BWL+BWU+1), NB, 0, 0, $ ICTXT,((BWL+BWU+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((BWL+BWU+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = MAX(BWL,BWU)*MAX(BWL,BWU) * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((BWL+BWU+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PCLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PCDBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PCBMATGEN( ICTXT, 'G', 'D', BWL, BWU, N, $ (BWL+BWU+1), NB, MEM( IPA ), $ ((BWL+BWU+1)+10), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) * CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((BWL+BWU+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PCLANGE( '1', (BWL+BWU+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((BWL+BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PCDBTRF( N, BWL, BWU, MEM( IPA ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PCDBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PCCHEKPAD( ICTXT, 'PCDBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((BWL+BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (MAX(BWL,BWU)*NRHS) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PCMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PCFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PCDBTRS( TRANS, N, BWL, BWU, NRHS, MEM( IPA ), $ 1, DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PCDBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCDBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PCDBLASCHK( 'N', 'D', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl bwu + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE(BWU)) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+bwu) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE(BWU)) $ *DBLE(NRHS)) * * Multiply by 4 to get complex count * NOPS = NOPS * DBLE(4) * * Second calc to represent actual hardware speed * * 2*N_FIRST bwl*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE(BWL)*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST bwl*bwu * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE(BWL) $ *DBLE(BWU)) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB bwl*bwu flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE(BWL) $ *DBLE(BWU)) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( BWL*BWU*BWL/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*BWL*BWU*BWL) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+bwu) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE(BWU)) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * (bwl+bwu) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE(BWL)+ $ DBLE(BWU)))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * (bwl+bwu) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE(BWL)+ $ DBLE(BWU)))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*(BWL*BWU ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * BWL*BWU ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PCDBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pcdbinfo.f000644 000766 000024 00000044340 10363532303 017406 0ustar00juliestaff000000 000000 SUBROUTINE PCDBINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PCDBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'C' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'C' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCDBINFO * END scalapack-2.0.2/TESTING/LIN/pcdblaschk.f000644 000766 000024 00000026160 10363532303 017720 0ustar00juliestaff000000 000000 SUBROUTINE PCDBLASCHK( SYMM, UPLO, TRANS, N, BWL, BWU, NRHS, X, $ IX, JX, DESCX, IASEED, A, IA, JA, DESCA, $ IBSEED, ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, TRANS, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PCDBLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'H', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * TRANS if TRANS= 'C', A 'Conjugate transpose' is used as the * coefficient matrix in the solve. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,max(max(bwl,bwu)*(max(bwl,bwu)+2),NB))+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,max(max(bwl,bwu)*(max(bwl,bwu)+2),NB))+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER BW, INFO, IPPRODUCT, WORK_MIN REAL DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGAMX2D, CGEMM, CGSUM2D, $ CLASET, PBCTRAN, PCMATGEN, SGEBR2D, $ SGEBS2D, SGERV2D, SGESD2D * .. * .. External Functions .. INTEGER ICAMAX, NUMROC REAL PSLAMCH EXTERNAL ICAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'H' ) ) THEN BW = BWL WORK_MIN = MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ELSE BW = MAX(BWL, BWU) WORK_MIN = MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PCBLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (BWL+BWU+1), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'H' )) THEN CALL PCBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PCBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PCGBDCMV( BWL+BWU+1, BWL, BWU, TRANS, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), $ (MAX(BWL,BWU)+2)*MAX(BWL,BWU), INFO ) * * * Regenerate column of B * CALL PCMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PCAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PSCNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PSCNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PCBLASCHK * END scalapack-2.0.2/TESTING/LIN/pcdbmv1.f000644 000766 000024 00000076131 10607174520 017165 0ustar00juliestaff000000 000000 SUBROUTINE PCGBDCMV( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS, $ B, IB, DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCDBTRF and this is stored in AF. If a linear system * is to be solved using PCDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M, $ DU_N_N, DU_P_M, DU_P_N, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM2, IDUM3, J, JA_NEW, $ LLDA, LLDB, MAX_BW, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PCDBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PCDBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCDBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = CZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (MAX_BW+2)*MAX_BW WORK( I ) = CZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( TRANS, 'N' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * * Use main partition in each processor to multiply locally * CALL CGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, CONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, CZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL CCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWL-DL_N_N ), 1 ) * CALL CTRMV( 'U', 'N', 'N', BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL CGESD2D( ICTXT, BWL, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWL, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = CZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL CCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * CALL CTRMV( $ 'L', $ 'N', $ 'N', BWU, $ A( 1 ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 30 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL CGESD2D( ICTXT, BWU, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL CGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL CAXPY( BWL, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL CGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL CAXPY( BWU, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWU+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( TRANS, 'C' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL CLATCPY( 'L', BWU, BWU, A( OFST+1 ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to left * CALL CTRSD2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL-1 ) * ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL CLATCPY( 'U', BWL, BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to right * CALL CTRSD2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL CGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, CONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, CZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL CCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWU-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL CTRRV2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), MAX_BW, MYROW, MYCOL+1 ) * CALL CTRMV( 'U', 'N', 'N', BWU, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL CGESD2D( ICTXT, BWU, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWU, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = CZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL CCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL CTRRV2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), MAX_BW, MYROW, MYCOL-1 ) * CALL CTRMV( $ 'L', $ 'N', $ 'N', BWL, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 60 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL CGESD2D( ICTXT, BWL, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL CGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL CAXPY( BWU, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL CGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL CAXPY( BWL, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWL+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PCBhBMV1 * END scalapack-2.0.2/TESTING/LIN/pcdtdriver.f000644 000766 000024 00000076605 10363532303 020001 0ustar00juliestaff000000 000000 PROGRAM PCDTDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PCDTDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by CDT. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM DOUBLE PRECISION array, dimension ( TOTMEM/CPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER CPLXSZ, MEMSIZ, NTESTS COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCBMATGEN, $ PCCHEKPAD, PCDTINFO, PCDTLASCHK, PCDTTRF, $ PCDTTRS, PCFILLPAD, PCMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PCLANGE EXTERNAL LSAME, NUMROC, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PCDTINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = 1 IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = 1 IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*INT_ONE-1)/NPCOL + 1 ) $ + INT_ONE NB = MAX( NB, 2*INT_ONE ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*INT_ONE, N ) ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (3), (3), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((3)+10) IMIDPAD = 10 IPOSTPAD = ((3)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, N, (3), $ NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((3)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + (NB+10)*(3) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (12*NPCOL+3*NB) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 8*NPCOL * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((3)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PCLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PCDTLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,NB)+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PCBMATGEN( ICTXT, 'T', 'D', BWL, BWU, N, (3), NB, $ MEM( IPA ), NB+10, 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) CALL PCFILLPAD( ICTXT, NQ, NP, MEM( IPA-IPREPAD ), $ NB+10, IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PCLANGE( 'I', N, $ (3), MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', NQ, NP, $ MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PCDTTRF( N, MEM( IPA+2*( NB+10 ) ), $ MEM( IPA+1*( NB+10 ) ), MEM( IPA ), 1, $ DESCA, MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW ), IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PCDTTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PCCHEKPAD( ICTXT, 'PCDTTRF', NQ, $ NP, MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = 10*NPCOL+4*NRHS * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PCMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PCFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PCDTTRS( TRANS, N, NRHS, $ MEM( IPA+2*( NB+10 ) ), $ MEM( IPA+1*( NB+10 ) ), MEM( IPA ), $ 1, DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PCDTTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCDTTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * * Reset descriptor describing A to 1-by-P grid for * use in banded utility routines * CALL DESCINIT( DESCA2D, (3), N, $ (3), NB, 0, 0, $ ICTXT, (3), IERR( 1 ) ) CALL PCDTLASCHK( 'N', 'D', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl INT_ONE + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE(INT_ONE)) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+INT_ONE) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE(INT_ONE)) $ *DBLE(NRHS)) * * Multiply by 4 to get complex count * NOPS = NOPS * DBLE(4) * * Second calc to represent actual hardware speed * * 2*N_FIRST bwl*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE(BWL)*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST bwl*INT_ONE * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE(BWL) $ *DBLE(INT_ONE)) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB bwl*INT_ONE flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE(BWL) $ *DBLE(INT_ONE)) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( BWL*INT_ONE*BWL/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*BWL*INT_ONE*BWL) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+INT_ONE) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE(INT_ONE)) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * (bwl+INT_ONE) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE(BWL)+ $ DBLE(INT_ONE)))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * (bwl+INT_ONE) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE(BWL)+ $ DBLE(INT_ONE)))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*(BWL*INT_ONE ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * BWL*INT_ONE ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PCDTTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pcdtinfo.f000644 000766 000024 00000044361 10363532303 017433 0ustar00juliestaff000000 000000 SUBROUTINE PCDTINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PCDTINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW NBW = 1 IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'C' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'C' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCDTINFO * END scalapack-2.0.2/TESTING/LIN/pcdtlaschk.f000644 000766 000024 00000030670 10363532303 017743 0ustar00juliestaff000000 000000 SUBROUTINE PCDTLASCHK( SYMM, UPLO, TRANS, N, BWL, BWU, NRHS, X, $ IX, JX, DESCX, IASEED, A, IA, JA, DESCA, $ IBSEED, ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, TRANS, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PCDTLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'H', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * TRANS if TRANS= 'C', A 'Conjugate transpose' is used as the * coefficient matrix in the solve. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,NB)+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,NB)+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER I, START INTEGER BW, INFO, IPPRODUCT, WORK_MIN REAL DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGAMX2D, CGEMM, CGSUM2D, $ CLASET, PBCTRAN, PCMATGEN, SGEBR2D, $ SGEBS2D, SGERV2D, SGESD2D * .. * .. External Functions .. INTEGER ICAMAX, NUMROC REAL PSLAMCH EXTERNAL ICAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'H' ) ) THEN BW = BWL START = 1 WORK_MIN = MAX(5,NB)+2*NB ELSE BW = MAX(BWL, BWU) IF( LSAME( UPLO, 'D' )) THEN START = 1 ELSE START = 2 ENDIF WORK_MIN = MAX(5,NB)+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PCTLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (3), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'H' )) THEN CALL PCBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PCBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Matrix formed above has the diagonals shifted from what was * input to the tridiagonal routine. Shift them back. * * Send elements to neighboring processors * IF( MYCOL.GT.0 ) THEN CALL CGESD2D( ICTXT, 1, 1, A( START+2), LDA, $ MYROW, MYCOL-1 ) ENDIF * IF( MYCOL.LT.NPCOL-1 ) THEN CALL CGESD2D( ICTXT, 1, 1, $ A( START+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * * Shift local elements * DO 220 I=0,DESCA( NB_ )-1 A( START+2+(I)*LDA ) = A( START+2+(I+1)*LDA ) 220 CONTINUE * DO 230 I=DESCA( NB_ )-1,0,-1 A( START+(I+1)*LDA ) = A( START+(I)*LDA ) 230 CONTINUE * * Receive elements from neighboring processors * IF( MYCOL.LT.NPCOL-1 ) THEN CALL CGERV2D( ICTXT, 1, 1, $ A( START+2+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * IF( MYCOL.GT.0 ) THEN CALL CGERV2D( ICTXT, 1, 1, A( START), LDA, $ MYROW, MYCOL-1 ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PCGBDCMV( BWL+BWU+1, BWL, BWU, TRANS, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), $ (INT_ONE+2)*INT_ONE, INFO ) * * * Regenerate column of B * CALL PCMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PCAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PSCNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PSCNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PCTLASCHK * END scalapack-2.0.2/TESTING/LIN/pcgbdriver.f000644 000766 000024 00000100074 10363532303 017746 0ustar00juliestaff000000 000000 PROGRAM PCGBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PCGBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by CGB. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTMEM INTEGER, default = 2048. * INTMEM is the size of the integer workspace used in this * driver as input as the IPIV vector. It represents an * upper bound on NB, the blocksize of the data * distribution. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM DOUBLE PRECISION array, dimension ( TOTMEM/CPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER INTMEM PARAMETER ( INTMEM = 2048 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER CPLXSZ, MEMSIZ, NTESTS COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER IPIV(INTMEM) INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCBMATGEN, $ PCCHEKPAD, PCDBLASCHK, PCFILLPAD, PCGBINFO, $ PCGBTRF, PCGBTRS, PCMATGEN, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PCLANGE EXTERNAL LSAME, NUMROC, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PCGBINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = BWLVAL( BW_NUM ) IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = BWUVAL( BW_NUM ) IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*(BWL+BWU)-1)/NPCOL + 1 ) $ + (BWL+BWU) NB = MAX( NB, 2*(BWL+BWU) ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 * IF( NB.GT.INTMEM ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) THEN WRITE( NOUT,* )'You have chosen an ' $ ,'NB > INTMEM in the driver.' WRITE(NOUT, *)'Please edit the driver ' $ ,'and increase the value of INTMEM' ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (2*BWL+2*BWU+1), (2*BWL+2*BWU+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((2*BWL+2*BWU+1)+10) IMIDPAD = 10 IPOSTPAD = ((2*BWL+2*BWU+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (2*BWL+2*BWU+1), N, $ (2*BWL+2*BWU+1), NB, 0, 0, $ ICTXT,((2*BWL+2*BWU+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((2*BWL+2*BWU+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 1 * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((2*BWL+2*BWU+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PCLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PCDBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PCBMATGEN( ICTXT, 'G', 'N', BWL, BWU, N, $ (2*BWL+2*BWU+1), NB, MEM( IPA+BWL+BWU ), $ ((2*BWL+2*BWU+1)+10), 0, 0, IASEED, $ MYROW, MYCOL, NPROW, NPCOL ) * CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((2*BWL+2*BWU+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PCLANGE( '1', (2*BWL+2*BWU+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((2*BWL+2*BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PCGBTRF( N, BWL, BWU, MEM( IPA ), 1, DESCA, IPIV, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PCGBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PCCHEKPAD( ICTXT, 'PCGBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((2*BWL+2*BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = NRHS*(NB+2*BWL+4*BWU) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PCMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PCFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PCGBTRS( TRANS, N, BWL, BWU, NRHS, MEM( IPA ), $ 1, DESCA, IPIV, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PCGBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PCDBLASCHK( 'N', 'N', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA+BWL+BWU ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl (bwl+bwu) + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE((BWL+BWU))) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+(bwl+bwu)) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE((BWL+BWU))) $ *DBLE(NRHS)) * * Multiply by 4 to get complex count * NOPS = NOPS * DBLE(4) * * Second calc to represent actual hardware speed * * 2*N_FIRST (bwl+bwu)*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE((BWL+BWU))*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST (bwl+bwu)*(bwl+bwu) * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE((BWL+BWU)) $ *DBLE((BWL+BWU))) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB (bwl+bwu)*(bwl+bwu) flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE((BWL+BWU)) $ *DBLE((BWL+BWU))) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( (BWL+BWU)*(BWL+BWU)*(BWL+BWU)/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*(BWL+BWU)*(BWL+BWU)*(BWL+BWU)) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+(bwl+bwu)) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE((BWL+BWU))) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * ((bwl+bwu)+(bwl+bwu)) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE((BWL+BWU))+ $ DBLE((BWL+BWU))))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * ((bwl+bwu)+(bwl+bwu)) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE((BWL+BWU))+ $ DBLE((BWL+BWU))))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*((BWL+BWU)*(BWL+BWU) ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * (BWL+BWU)*(BWL+BWU) ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PCGBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pcgbinfo.f000644 000766 000024 00000044340 10363532303 017411 0ustar00juliestaff000000 000000 SUBROUTINE PCGBINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PCGBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'C' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'C' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCGBINFO * END scalapack-2.0.2/TESTING/LIN/pcgbmv1.f000644 000766 000024 00000076131 10607174520 017170 0ustar00juliestaff000000 000000 SUBROUTINE PCGBDCMV( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS, $ B, IB, DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCDBTRF and this is stored in AF. If a linear system * is to be solved using PCDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M, $ DU_N_N, DU_P_M, DU_P_N, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM2, IDUM3, J, JA_NEW, $ LLDA, LLDB, MAX_BW, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PCDBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PCDBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCDBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = CZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (MAX_BW+2)*MAX_BW WORK( I ) = CZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( TRANS, 'N' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * * Use main partition in each processor to multiply locally * CALL CGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, CONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, CZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL CCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWL-DL_N_N ), 1 ) * CALL CTRMV( 'U', 'N', 'N', BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL CGESD2D( ICTXT, BWL, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWL, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = CZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL CCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * CALL CTRMV( $ 'L', $ 'N', $ 'N', BWU, $ A( 1 ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 30 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL CGESD2D( ICTXT, BWU, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL CGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL CAXPY( BWL, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL CGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL CAXPY( BWU, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWU+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( TRANS, 'C' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL CLATCPY( 'L', BWU, BWU, A( OFST+1 ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to left * CALL CTRSD2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL-1 ) * ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL CLATCPY( 'U', BWL, BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to right * CALL CTRSD2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL CGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, CONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, CZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL CCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWU-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL CTRRV2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), MAX_BW, MYROW, MYCOL+1 ) * CALL CTRMV( 'U', 'N', 'N', BWU, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL CGESD2D( ICTXT, BWU, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWU, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = CZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL CCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL CTRRV2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), MAX_BW, MYROW, MYCOL-1 ) * CALL CTRMV( $ 'L', $ 'N', $ 'N', BWL, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 60 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL CGESD2D( ICTXT, BWL, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL CGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL CAXPY( BWU, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL CGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL CAXPY( BWL, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWL+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PCBhBMV1 * END scalapack-2.0.2/TESTING/LIN/pcgelqrv.f000644 000766 000024 00000023101 10363532303 017435 0ustar00juliestaff000000 000000 SUBROUTINE PCGELQRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGELQRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from L, Q * computed by PCGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors L and Q computed * by PCGELQF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors computed by PCGELQF. TAU * is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX array, dimension * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IL, IN, $ IPT, IPV, IPW, J, JJA, JV, K, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCLACPY, $ PCLARFB, PCLARFT, PCLASET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IL = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) * ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IL, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * CALL DESCSET( DESCV, DESCA( MB_ ), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * DO 10 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+K-I, DESCA( MB_ ) ) J = JA + I - IA JV = 1 + I - IA + ICOFF * * Compute upper triangular matrix T * CALL PCLARFT( 'Forward', 'Rowwise', N-J+JA, IB, A, I, J, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'Upper', IB, N-J+JA, A, I, J, DESCA, WORK( IPV ), $ 1, JV, DESCV ) CALL PCLASET( 'Lower', IB, N-J+JA, ZERO, ONE, WORK( IPV ), 1, $ JV, DESCV ) * * Zeroes the strict upper triangular part of sub( A ) to get * block column of L * CALL PCLASET( 'Upper', IB, N-J+JA-1, ZERO, ZERO, A, I, J+1, $ DESCA ) * * Apply block Householder transformation * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-I+IA, N-J+JA, IB, WORK( IPV ), 1, $ JV, DESCV, WORK( IPT ), A, I, J, DESCA, $ WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) * 10 CONTINUE * * Handle first block separately * IB = IN - IA + 1 * * Compute upper triangular matrix T * CALL PCLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, TAU, $ WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'Upper', IB, N, A, IA, JA, DESCA, WORK( IPV ), 1, $ ICOFF+1, DESCV ) CALL PCLASET( 'Lower', IB, N, ZERO, ONE, WORK, 1, ICOFF+1, DESCV ) * * Zeroes the strict upper triangular part of sub( A ) to get * block column of L * CALL PCLASET( 'Upper', IB, N-1, ZERO, ZERO, A, IA, JA+1, DESCA ) * * Apply block Householder transformation * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M, N, IB, WORK( IPV ), 1, ICOFF+1, DESCV, $ WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PCGELQRV * END scalapack-2.0.2/TESTING/LIN/pcgeqlrv.f000644 000766 000024 00000024071 10363532303 017444 0ustar00juliestaff000000 000000 SUBROUTINE PCGEQLRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEQLRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from L, Q * computed by PCGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors L and Q computed * by PCGEQLF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(N_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PCGEQLF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK = NB_A * ( 2*Mp0 + Nq0 + NB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IIA, IPT, IPV, IPW, IROFF, $ IV, J, JB, JJA, JN, K, MP, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCLACPY, $ PCLARFB, PCLARFT, PCLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) CALL INFOG2L( IA, JA+N-K, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPV = 1 IPT = IPV + MP * DESCA( NB_ ) IPW = IPT + DESCA( NB_ ) * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * CALL DESCSET( DESCV, M+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, MP ) ) * * Handle first block separately * IV = 1 + M - K + IROFF JB = JN - JA - N + K + 1 * * Compute upper triangular matrix T * CALL PCLARFT( 'Backward', 'Columnwise', M-N+JN-JA+1, JB, A, IA, $ JA+N-K, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'All', M-N+JN-JA+1, JB, A, IA, JA+N-K, DESCA, $ WORK( IPV ), IROFF+1, 1, DESCV ) CALL PCLASET( 'Lower', JB, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeoes the strict upper triangular part of A to get block * row of L * CALL PCLASET( 'All', M-K, JB, ZERO, ZERO, A, IA, JA+N-K, $ DESCA ) CALL PCLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA+M-K, $ JA+N-K+1, DESCA ) * * Apply block Householder transformation * CALL PCLARFB( 'Left', 'No transpose', 'Backward', 'Columnwise', $ M-N+JN-JA+1, JN-JA+1, JB, WORK( IPV ), IROFF+1, 1, $ DESCV, WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) * * Loop over the remaining column blocks * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) IV = 1 + M - N + J - JA + IROFF * * Compute upper triangular matrix T * CALL PCLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, A, IA, $ J, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'All', M-N+J+JB-JA, JB, A, IA, J, DESCA, $ WORK( IPV ), IROFF+1, 1, DESCV ) CALL PCLASET( 'Lower', JB, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeoes the strict upper triangular part of sub( A ) to get * block row of L * CALL PCLASET( 'All', M-N+J-JA, JB, ZERO, ZERO, A, IA, J, $ DESCA ) CALL PCLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA+M-N+J-JA, $ J+1, DESCA ) * * Apply block Householder transformation * CALL PCLARFB( 'Left', 'No transpose', 'Backward', 'Columnwise', $ M-N+J+JB-JA, J+JB-JA, JB, WORK( IPV ), IROFF+1, $ 1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PCGEQLRV * END scalapack-2.0.2/TESTING/LIN/pcgeqrrv.f000644 000766 000024 00000023472 10363532303 017456 0ustar00juliestaff000000 000000 SUBROUTINE PCGEQRRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEQRRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from Q, R * computed by PCGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors Q and R computed * by PCGEQRF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors computed by PCGEQRF. TAU * is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK = NB_A * ( 2*Mp0 + Nq0 + NB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, IIA, IPT, IPV, IPW, $ IROFF, IV, J, JB, JJA, JL, JN, K, MP, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCLACPY, $ PCLARFB, PCLARFT, PCLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPV = 1 IPT = IPV + MP * DESCA( NB_ ) IPW = IPT + DESCA( NB_ ) * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * K = MIN( M, N ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JL = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) * CALL DESCSET( DESCV, M+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, INDXG2P( JL, DESCA( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ), ICTXT, $ MAX( 1, MP ) ) * DO 10 J = JL, JN+1, -DESCA( NB_ ) JB = MIN( JA+K-J, DESCA( NB_ ) ) I = IA + J - JA IV = 1 + J - JA + IROFF * * Compute upper triangular matrix T * CALL PCLARFT( 'Forward', 'Columnwise', M-I+IA, JB, A, I, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'Lower', M-I+IA, JB, A, I, J, DESCA, WORK( IPV ), $ IV, 1, DESCV ) CALL PCLASET( 'Upper', M-I+IA, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeroes the strict lower triangular part of sub( A ) to get * block column of R * CALL PCLASET( 'Lower', M-I+IA-1, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Apply block Householder transformation * CALL PCLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M-I+IA, N-J+JA, JB, WORK( IPV ), IV, 1, DESCV, $ WORK( IPT ), A, I, J, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle first block separately * JB = JN - JA + 1 * * Compute upper triangular matrix T * CALL PCLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPV ), $ IROFF+1, 1, DESCV ) CALL PCLASET( 'Upper', M, JB, ZERO, ONE, WORK, IROFF+1, 1, DESCV ) * * Zeroes the strict lower triangular part of sub( A ) to get block * column of R * CALL PCLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Apply block Householder transformation * CALL PCLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', M, $ N, JB, WORK( IPV ), IROFF+1, 1, DESCV, WORK( IPT ), $ A, IA, JA, DESCA, WORK( IPW ) ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PCGEQRRV * END scalapack-2.0.2/TESTING/LIN/pcgerqrv.f000644 000766 000024 00000023754 10363532303 017461 0ustar00juliestaff000000 000000 SUBROUTINE PCGERQRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGERQRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from R, Q * computed by PCGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors R and Q computed * by PCGERQF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCr(M_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PCGERQF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IN, $ IPT, IPV, IPW, JJA, JV, K, MYCOL, MYROW, NPCOL, $ NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCLACPY, $ PCLARFB, PCLARFT, PCLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) * ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA+M-K, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * CALL DESCSET( DESCV, DESCA( MB_), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * * Handle first block separately * IB = IN - IA - M + K + 1 JV = 1 + N - K + ICOFF * * Compute upper triangular matrix T * CALL PCLARFT( 'Backward', 'Rowwise', N-M+IN-IA+1, IB, A, IA+M-K, $ JA, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'All', IB, N-M+IN-IA+1, A, IA+M-K, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) CALL PCLASET( 'Upper', IB, IB, ZERO, ONE, WORK( IPV ), 1, JV, $ DESCV ) * * Zeoes the strict lower triangular part of sub( A ) to get block * column of R * CALL PCLASET( 'All', IB, N-K, ZERO, ZERO, A, IA+M-K, JA, $ DESCA ) CALL PCLASET( 'Lower', IB-1, IB, ZERO, ZERO, A, IA+M-K+1, $ JA+N-K, DESCA ) * * Apply block Householder transformation * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', IN-IA+1, N-M+IN-IA+1, IB, WORK( IPV ), 1, $ ICOFF+1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * * Loop over the remaining row blocks * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) JV = 1 + N - M + I - IA + ICOFF * * Compute upper triangular matrix T * CALL PCLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, I, JA, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'All', IB, N-M+I+IB-IA, A, I, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) CALL PCLASET( 'Upper', IB, IB, ZERO, ONE, WORK( IPV ), 1, JV, $ DESCV ) * * Zeoes the strict Lower triangular part of sub( A ) to get * block column of R * CALL PCLASET( 'All', IB, N-M+I-IA, ZERO, ZERO, A, I, JA, $ DESCA ) CALL PCLASET( 'Lower', IB-1, IB, ZERO, ZERO, A, I+1, $ JA+N-M+I-IA, DESCA ) * * Apply block Householder transformation * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', I+IB-IA, N-M+I+IB-IA, IB, WORK( IPV ), $ 1, ICOFF+1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PCGERQRV * END scalapack-2.0.2/TESTING/LIN/pcgetrrv.f000644 000766 000024 00000030074 10363532303 017455 0ustar00juliestaff000000 000000 SUBROUTINE PCGETRRV( M, N, A, IA, JA, DESCA, IPIV, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCGETRRV reforms sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from the * triangular matrices L and U returned by PCGETRF. It multiplies * an upper triangular matrix stored in the upper triangle of sub( A ) * times the unit lower triangular matrix stored in the lower triangle. * To accomplish this, the routine basically performs the PCGETRF * routine in reverse. * * It computes L*U first, and then apply P: P*L*U => sub( A ). In the * J-th loop, the block column (or column panel), which has the lower * triangular unit matrix L is multiplied with the block row (or row * panel), which contains the upper triangular matrix U. * * ( L1 ) ( 0 0 ) ( L1*U1 L1*U2 ) * A` = L * U + A` = ( ) * (U1 U2) + ( ) = ( ) * ( L2 ) ( 0 A`) ( L2*U1 L2*U2+A` ) * * where L1 is a lower unit triangular matrix and U1 is an upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the distributed matrix sub( A ) * contains the the factors L and U from the factorization * sub( A ) = P*L*U; the unit diagonal elements of L are not * stored. On exit, the original distributed matrix sub( A ) * is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX array of dimension (LWORK) * LWORK >= MpA0 * NB_A + NqA0 * MB_A, where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * WORK is used to store a block of columns of L, and a block of * rows of U. INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IL, IPL, IPU, IROFF, J, $ JB, JL, JN, MN, MP, MYCOL, MYROW, NPCOL, NPROW * .. Local Arrays .. INTEGER DESCIP( DLEN_ ), DESCL( DLEN_ ), $ DESCU( DLEN_ ), IDUM( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PCGEMM, PCLACPY, $ PCLAPIV, PCLASET, PB_TOPGET, PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPL = 1 IPU = IPL + MP * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Define array descriptors for L and U * MN = MIN( M, N ) IL = MAX( ( ( IA+MN-2 ) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) JL = MAX( ( ( JA+MN-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+MN-1 ) IAROW = INDXG2P( IL, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) * CALL DESCSET( DESCL, IA+M-IL, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, MP ) ) * CALL DESCSET( DESCU, DESCA( MB_ ), JA+N-JL, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * CALL DESCSET( DESCIP, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ NUMROC( DESCA( M_ ), DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) + DESCA( MB_ ) ) * * DO 10 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+MN-J, DESCA( NB_ ) ) * * Copy unit lower triangular part of sub( A ) into WORK * CALL PCLACPY( 'Lower', M-IL+IA, JB, A, IL, J, DESCA, $ WORK( IPL ), 1, 1, DESCL ) CALL PCLASET( 'Upper', M-IL+IA, JB, ZERO, ONE, WORK( IPL ), $ 1, 1, DESCL ) * * Copy upper triangular part of sub( A ) into WORK(IPU) * CALL PCLACPY( 'Upper', JB, JA+N-J, A, IL, J, DESCA, $ WORK( IPU ), 1, 1, DESCU ) CALL PCLASET( 'Lower', JB-1, JA+N-J, ZERO, ZERO, $ WORK( IPU ), 2, 1, DESCU ) * * Zero the strict lower triangular piece of the current block. * CALL PCLASET( 'Lower', IA+M-IL-1, JB, ZERO, ZERO, A, IL+1, J, $ DESCA ) * * Zero the upper triangular piece of the current block. * CALL PCLASET( 'Upper', JB, JA+N-J, ZERO, ZERO, A, IL, J, $ DESCA ) * * Update the matrix sub( A ). * CALL PCGEMM( 'No transpose', 'No transpose', IA+M-IL, $ JA+N-J, JB, ONE, WORK( IPL ), 1, 1, DESCL, $ WORK( IPU ), 1, 1, DESCU, ONE, A, IL, J, DESCA ) * IL = IL - DESCA( MB_ ) DESCL( M_ ) = DESCL( M_ ) + DESCL( MB_ ) DESCL( RSRC_ ) = MOD( DESCL( RSRC_ ) + NPROW - 1, NPROW ) DESCL( CSRC_ ) = MOD( DESCL( CSRC_ ) + NPCOL - 1, NPCOL ) DESCU( N_ ) = DESCU( N_ ) + DESCU( NB_ ) DESCU( RSRC_ ) = DESCL( RSRC_ ) DESCU( CSRC_ ) = DESCL( CSRC_ ) * 10 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Copy unit lower triangular part of sub( A ) into WORK * CALL PCLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPL ), $ 1, 1, DESCL ) CALL PCLASET( 'Upper', M, JB, ZERO, ONE, WORK( IPL ), 1, 1, $ DESCL ) * * Copy upper triangular part of sub( A ) into WORK(IPU) * CALL PCLACPY( 'Upper', JB, N, A, IA, JA, DESCA, WORK( IPU ), 1, $ 1, DESCU ) CALL PCLASET( 'Lower', JB-1, N, ZERO, ZERO, WORK( IPU ), 2, 1, $ DESCU ) * * Zero the strict lower triangular piece of the current block. * CALL PCLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Zero the upper triangular piece of the current block. * CALL PCLASET( 'Upper', JB, N, ZERO, ZERO, A, IA, JA, DESCA ) * * Update the matrix sub( A ). * CALL PCGEMM( 'No transpose', 'No transpose', M, N, JB, ONE, $ WORK( IPL ), 1, 1, DESCL, WORK( IPU ), 1, 1, $ DESCU, ONE, A, IA, JA, DESCA ) * * Apply pivots so that sub( A ) = P*L*U * CALL PCLAPIV( 'Backward', 'Row', 'Col', MIN( M, N ), N, A, IA, JA, $ DESCA, IPIV, IA, 1, DESCIP, IDUM ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PCGETRRV * END scalapack-2.0.2/TESTING/LIN/pcinvchk.f000644 000766 000024 00000033777 10363532303 017443 0ustar00juliestaff000000 000000 SUBROUTINE PCINVCHK( MATTYP, N, A, IA, JA, DESCA, IASEED, ANORM, $ FRESID, RCOND, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, IASEED, JA, N REAL ANORM, FRESID, RCOND * .. * .. Array Arguments .. CHARACTER*3 MATTYP INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCINVCHK computes the scaled residual * * || sub( A ) * inv( sub( A ) ) - I || / ( || sub( A ) || * N * eps ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1). to check the result * returned by the matrix inversion routines. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MATTYP (global input) CHARACTER*3 * The type of the distributed matrix to be generated: * if MATTYP = 'GEN' then GENeral matrix, * if MATTYP = 'UTR' then Upper TRiangular matrix, * if MATTYP = 'LTR' then Lower TRiangular matrix, * if MATTYP = 'UPD' then (Upper) Hermitian Positive Definite, * if MATTYP = 'LPD' then (Lower) Hermitian Positive Definite. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of local dimension (LLD_A, LOCc(JA+N-1)). On * entry, sub( A ) contains the distributed matrix inverse * computed by PCGETRI, PCPOTRI or PCTRTRI. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * Seed for the random generation of sub( A ). * * ANORM (global input) REAL * The 1-norm of the original matrix sub( A ). * * FRESID (global output) REAL * The inversion residual. * * RCOND (global output) REAL * The condition number of the original distributed matrix. * RCOND = || sub( A ) ||.|| sub( A )^{-1} || where ||A|| * denotes the 1-norm of A. * * WORK (local workspace) COMPLEX array, dimension * MAX(2*LOCr(N_A+MOD(IA-1,MB_A))*MB_A, LDW) * where LDW is the workspace requirement for the norm computa- * tions, see PCLANGE, PCLANHE, PCLANSY and PCLANTR. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. CHARACTER AFORM, DIAG, UPLO INTEGER ICTXT, ICURCOL, ICURROW, II, IIA, IPW, IROFF, $ IW, J, JB, JJA, JN, KK, MYCOL, MYROW, NP, $ NPCOL, NPROW REAL AUXNORM, EPS, NRMINVAXA, TEMP * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCGEMM, $ PCHEMM, PCLASET, PCMATGEN, PCTRMM * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, NUMROC REAL PCLANGE, PCLANHE, PCLANTR, PSLAMCH EXTERNAL ICEIL, LSAMEN, NUMROC, PCLANGE, PCLANHE, $ PCLANSY, PCLANTR, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * EPS = PSLAMCH( DESCA( CTXT_ ), 'eps' ) * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Compute the condition number * IF( LSAMEN( 1, MATTYP( 1:1 ), 'U' ) ) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * AFORM = 'N' DIAG = 'D' AUXNORM = PCLANGE( '1', N, N, A, IA, JA, DESCA, WORK ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN * AFORM = 'N' DIAG = 'D' AUXNORM = PCLANTR( '1', UPLO, 'Non unit', N, N, A, IA, JA, $ DESCA, WORK ) ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * AFORM = 'H' DIAG = 'D' AUXNORM = PCLANHE( '1', UPLO, N, A, IA, JA, DESCA, WORK ) * END IF RCOND = ANORM*AUXNORM * * Compute inv(A)*A * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ ICURROW, ICURCOL ) * * Define array descriptor for working array WORK * IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, ICURROW, NPROW ) CALL DESCSET( DESCW, N+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), ICURROW, ICURCOL, DESCA( CTXT_ ), $ MAX( 1, NP ) ) IPW = DESCW( LLD_ ) * DESCW( NB_ ) + 1 * IF( MYROW.EQ.ICURROW ) THEN II = IROFF + 1 NP = NP - IROFF ELSE II = 1 END IF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN - JA + 1 * * Handle first block separately, regenerate a block of columns of A * IW = IROFF + 1 IF( MYCOL.EQ.ICURCOL ) THEN IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), WORK, $ DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, IIA-1, NP, $ JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) IF( LSAMEN( 3, MATTYP, 'UTR' ) ) THEN CALL PCLASET( 'Lower', N-1, JB, ZERO, ZERO, WORK, IW+1, $ 1, DESCW ) ELSE CALL PCLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, WORK, IW, $ 2, DESCW ) END IF ELSE CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), WORK( IPW ), $ DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, $ IIA-1, NP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) END IF END IF * * Multiply A^{-1}*A * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * CALL PCGEMM( 'No tranpose', 'No transpose', N, JB, N, ONE, A, $ IA, JA, DESCA, WORK( IPW ), IW, 1, DESCW, ZERO, $ WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN * CALL PCTRMM( 'Left', UPLO, 'No tranpose', 'Non unit', N, JB, $ ONE, A, IA, JA, DESCA, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * CALL PCHEMM( 'Left', UPLO, N, JB, ONE, A, IA, JA, DESCA, $ WORK(IPW), IW, 1, DESCW, ZERO, WORK, IW, 1, $ DESCW ) * END IF * * subtract the identity matrix to the diagonal block of these cols * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 10 KK = 0, JB-1 WORK( II+KK*(DESCW(LLD_)+1) ) = $ WORK( II+KK*(DESCW( LLD_ )+1) )-ONE 10 CONTINUE END IF * NRMINVAXA = PCLANGE( '1', N, JB, WORK, IW, 1, DESCW, WORK( IPW ) ) * IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJA = JJA + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) DESCW( CSRC_ ) = ICURCOL * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) * JB = MIN( N-J+JA, DESCA( NB_ ) ) * * regenerate a block of columns of A * IF( MYCOL.EQ.ICURCOL ) THEN IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCW( MB_ ), DESCW( NB_ ), $ WORK, DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), $ IASEED, IIA-1, NP, JJA-1, JB, MYROW, $ MYCOL, NPROW, NPCOL ) IF( LSAMEN( 3, MATTYP, 'UTR' ) ) THEN CALL PCLASET( 'Lower', JA+N-J-1, JB, ZERO, ZERO, $ WORK, IW+J-JA+1, 1, DESCW ) ELSE CALL PCLASET( 'All', J-JA, JB, ZERO, ZERO, WORK, IW, $ 1, DESCW ) CALL PCLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, $ WORK, IW+J-JA, 2, DESCW ) END IF ELSE CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCW( MB_ ), DESCW( NB_ ), $ WORK( IPW ), DESCW( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, NP, $ JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) END IF END IF * * Multiply A^{-1}*A * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * CALL PCGEMM( 'No tranpose', 'No transpose', N, JB, N, ONE, $ A, IA, JA, DESCA, WORK( IPW ), IW, 1, DESCW, $ ZERO, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP(2:3), 'TR' ) ) THEN * CALL PCTRMM( 'Left', UPLO, 'No tranpose', 'Non unit', N, JB, $ ONE, A, IA, JA, DESCA, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * CALL PCHEMM( 'Left', UPLO, N, JB, ONE, A, IA, JA, DESCA, $ WORK(IPW), IW, 1, DESCW, ZERO, WORK, IW, 1, $ DESCW ) * END IF * * subtract the identity matrix to the diagonal block of these * cols * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 20 KK = 0, JB-1 WORK( II+KK*(DESCW( LLD_ )+1) ) = $ WORK( II+KK*(DESCW( LLD_ )+1) ) - ONE 20 CONTINUE END IF * * Compute the 1-norm of these JB cols * TEMP = PCLANGE( '1', N, JB, WORK, IW, 1, DESCW, WORK( IPW ) ) NRMINVAXA = MAX( TEMP, NRMINVAXA ) * IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJA = JJA + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) DESCW( CSRC_ ) = ICURCOL * 30 CONTINUE * * Compute the scaled residual * FRESID = NRMINVAXA / ( N * EPS * ANORM ) * RETURN * * End of PCINVCHK * END scalapack-2.0.2/TESTING/LIN/pcinvdriver.f000644 000766 000024 00000101703 10430450246 020152 0ustar00juliestaff000000 000000 PROGRAM PCINVDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PCINVDRIVER is the main test program for the COMPLEX * SCALAPACK matrix inversion routines. This test driver computes the * inverse of different kind of matrix and tests the results. * * The program must be driven by a short data file. An annotated example * of a data file can be obtained by deleting the first 3 characters * from the following 14 lines: * 'ScaLAPACK Matrix Inversion Testing input file' * 'PVM machine.' * 'INV.out' output file name (if any) * 6 device out * 5 number of matrix types (next line) * 'GEN' 'UTR' 'LTR' 'UPD' LPD' GEN, UTR, LTR, UPD, LPD * 4 number of problems sizes * 1000 2000 3000 4000 values of N * 3 number of NB's * 4 30 35 values of NB * 2 number of process grids (ordered P & Q) * 4 2 values of P * 4 4 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ, REALSZ and CPLXSZ indicate the length in bytes on * the given platform for an integer, a single precision real * and a single precision complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, INTGSZ, MEMSIZ, NTESTS, REALSZ, TOTMEM COMPLEX PADVAL, ZERO PARAMETER ( CPLXSZ = 8, INTGSZ = 4, REALSZ = 4, $ TOTMEM = 2000000, MEMSIZ = TOTMEM / CPLXSZ, $ NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. CHARACTER UPLO CHARACTER*3 MTYP CHARACTER*6 PASSED CHARACTER*80 OUTFILE LOGICAL CHECK INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, $ IPPIV, IPREPAD, IPOSTPAD, IPIW, IPW, ITEMP, J, $ K, KTESTS, KPASS, KFAIL, KSKIP, L, LCM, LIPIV, $ LIWORK, LWORK, MYCOL, MYROW, N, NB, NGRIDS, $ NMAT, NMTYP, NNB, NOUT, NP, NPCOL, NPROCS, $ NPROW, NQ, WORKIINV, WORKINV, WORKSIZ REAL ANORM, FRESID, RCOND, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. CHARACTER*3 MATTYP( NTESTS ) INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), $ QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCCHEKPAD, $ PCFILLPAD, PCGETRF, PCGETRI, $ PCINVCHK, PCINVINFO, PCLASET, $ PCMATGEN, PCPOTRF, PCPOTRI, $ PCTRTRI, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, ILCM, NUMROC REAL PCLANGE, PCLANHE, PCLANSY, PCLANTR EXTERNAL ICEIL, ILCM, LSAMEN, NUMROC, PCLANGE, $ PCLANHE, PCLANSY, PCLANTR * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP /4*0/ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PCINVINFO( OUTFILE, NOUT, NMTYP, MATTYP, NTESTS, NMAT, NVAL, $ NTESTS, NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Loop over the different matrix types * DO 40 I = 1, NMTYP * MTYP = MATTYP( I ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a general matrix.' ELSE IF( LSAMEN( 3, MTYP, 'UTR' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is an upper triangular matrix.' ELSE IF( LSAMEN( 3, MTYP, 'LTR' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a lower triangular matrix.' ELSE IF( LSAMEN( 3, MTYP, 'UPD' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a Hermitian positive definite matrix.' WRITE( NOUT, FMT = 9986 ) $ 'Only the upper triangular part will be '// $ 'referenced.' ELSE IF( LSAMEN( 3, MTYP, 'LPD' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a Hermitian positive definite matrix.' WRITE( NOUT, FMT = 9986 ) $ 'Only the lower triangular part will be '// $ 'referenced.' END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 J = 1, NGRIDS * NPROW = PVAL( J ) NPCOL = QVAL( J ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 K = 1, NMAT * N = NVAL( K ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 L = 1, NNB * NB = NBVAL( L ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for ScaLAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 * LCM = ILCM( NPROW, NPCOL ) IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * Pivots are needed by LU factorization * IPPIV = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + $ IPREPAD LIPIV = ICEIL( INTGSZ * ( NP + NB ), CPLXSZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * LWORK = MAX( 1, NP * DESCA( NB_ ) ) WORKINV = LWORK + IPOSTPAD * * Figure the amount of workspace required by the * general matrix inversion * IF( NPROW.EQ.NPCOL ) THEN LIWORK = NQ + DESCA( NB_ ) ELSE * * change the integer workspace needed for PDGETRI * LIWORK = MAX( DESCA( NB_ ), DESCA( MB_ ) * * $ ICEIL( ICEIL( DESCA( LLD_ ), * $ DESCA( MB_ ) ), LCM / NPROW ) ) * $ + NQ LIWORK = NUMROC( DESCA( M_ ) + $ DESCA( MB_ ) * NPROW $ + MOD ( 1 - 1, DESCA( MB_ ) ), DESCA ( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ) + $ MAX ( DESCA( MB_ ) * ICEIL ( ICEIL( $ NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW, $ DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ), $ DESCA( MB_ ) ), LCM / NPROW ), DESCA( NB_ ) ) * END IF WORKIINV = ICEIL( LIWORK*INTGSZ, CPLXSZ ) + $ IPOSTPAD IPIW = IPW + WORKINV + IPREPAD WORKSIZ = WORKINV + IPREPAD + WORKIINV * ELSE * * No pivots or workspace needed for triangular or * Hermitian positive definite matrices. * IPW = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + IPREPAD WORKSIZ = 1 + IPOSTPAD * END IF * IF( CHECK ) THEN * * Figure amount of work space for the norm * computations * IF( LSAMEN( 3, MTYP, 'GEN' ).OR. $ LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN ITEMP = NQ ELSE ITEMP = 2 * NQ + NP IF( NPROW.NE.NPCOL ) THEN ITEMP = ITEMP + $ NB * ICEIL( ICEIL( NP, NB ), $ LCM / NPROW ) END IF END IF WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ ICEIL( REALSZ * ITEMP, CPLXSZ ) ) * * Figure the amount of workspace required by the * checking routine * WORKSIZ = MAX( WORKSIZ, 2 * NB * MAX( 1, NP ) ) + $ IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'inversion', $ ( IPW + WORKSIZ ) * CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( LSAMEN( 3, MTYP, 'GEN' ).OR. $ LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * Generate a general diagonally dominant matrix A * CALL PCMATGEN( ICTXT, 'N', 'D', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, $ NQ, MYROW, MYCOL, NPROW, NPCOL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * Generate a Hermitian positive definite matrix A * CALL PCMATGEN( ICTXT, 'H', 'D', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, $ NQ, MYROW, MYCOL, NPROW, NPCOL ) * END IF * * Zeros not-referenced part of A, if any. * IF( LSAMEN( 1, MTYP, 'U' ) ) THEN * UPLO = 'U' CALL PCLASET( 'Lower', N-1, N-1, ZERO, ZERO, $ MEM( IPA ), 2, 1, DESCA ) * ELSE IF( LSAMEN( 1, MTYP, 'L' ) ) THEN * UPLO = 'L' CALL PCLASET( 'Upper', N-1, N-1, ZERO, ZERO, $ MEM( IPA ), 1, 2, DESCA ) * ELSE * UPLO = 'G' * END IF * * Need 1-norm of A for checking * IF( CHECK ) THEN * CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * CALL PCFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANGE( '1', N, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKINV-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKIINV-IPOSTPAD, 1, $ MEM( IPIW-IPREPAD ), $ WORKIINV-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * ANORM = PCLANTR( '1', UPLO, 'Non unit', N, N, $ MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANTR', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANTR', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * ANORM = PCLANHE( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'SY' ) ) THEN * CALL PCFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANSY( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANSY', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD,IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'HE' ) ) THEN CALL PCFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANHE( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * END IF * END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * Perform LU factorization * CALL SLTIMER( 1 ) CALL PCGETRF( N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGETRF', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRF', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * Perform the general matrix inversion * CALL SLTIMER( 2 ) CALL PCGETRI( N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ), LWORK, $ MEM( IPIW ), LIWORK, INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGETRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRI', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRI', $ WORKIINV-IPOSTPAD, 1, $ MEM( IPIW-IPREPAD ), $ WORKIINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRI', $ WORKINV-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * Perform the general matrix inversion * CALL SLTIMER( 2 ) CALL PCTRTRI( UPLO, 'Non unit', N, MEM( IPA ), 1, $ 1, DESCA, INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCTRTRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * Perform Cholesky factorization * CALL SLTIMER( 1 ) CALL PCPOTRF( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCPOTRF', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * Perform the Hermitian positive definite matrix * inversion * CALL SLTIMER( 2 ) CALL PCPOTRI( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCPOTRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * END IF * IF( CHECK ) THEN * CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Compute fresid = || inv(A)*A-I || * CALL PCINVCHK( MTYP, N, MEM( IPA ), 1, 1, DESCA, $ IASEED, ANORM, FRESID, RCOND, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCINVCHK', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCINVCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. INFO.EQ.0 .AND. $ ( (FRESID-FRESID) .EQ. 0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 IF( INFO.GT.0 ) THEN PASSED = 'SINGUL' ELSE PASSED = 'FAILED' END IF END IF * ELSE * * Don't perform the checking, only the timing * operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * 8/3 N^3 - N^2 flops for LU factorization * NOPS = ( 8.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) - $ DBLE( N )**2 * * 16/3 N^3 for matrix inversion * NOPS = NOPS + $ ( 16.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * 4/3 N^3 + 2 N^2 for triangular matrix inversion * CTIME(1) = 0.0D+0 WTIME(1) = 0.0D+0 NOPS = ( 4.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ 2.0D+0 * ( DBLE( N )**2 ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * 4/3 N^3 + 3 N^2 flops for Cholesky factorization * NOPS = ( 4.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ 2.0D+0 * ( DBLE( N )**2 ) * * 8/3 N^3 + 5 N^2 flops for Cholesky inversion * NOPS = NOPS + $ ( 8.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ 5.0D+0 * ( DBLE( N )**2 ) * END IF * * Figure total megaflops -- factorization and * inversion, for WALL and CPU time, and print * output. * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 2 ) .GE. 0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ RCOND, FRESID, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ) + CTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ) .GE. 0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ RCOND, FRESID, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * 40 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q Fct Time Inv Time ', $ ' MFLOPS Cond Resid CHECK' ) 9994 FORMAT( '---- ----- --- ----- ----- -------- -------- ', $ '----------- ------- ------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I5, 1X, I5, 1X, F8.2, 1X, F8.2, $ 1X, F11.2, 1X, F7.1, 1X, F7.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( A ) * STOP * * End of PCINVDRIVER * END scalapack-2.0.2/TESTING/LIN/pcinvinfo.f000644 000766 000024 00000035257 10363532303 017624 0ustar00juliestaff000000 000000 SUBROUTINE PCINVINFO( SUMMRY, NOUT, NMTYP, MATTYP, LDMTYP, NMAT, $ NVAL, LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, $ PVAL, LDPVAL, QVAL, LDQVAL, THRESH, WORK, $ IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDMTYP, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NMTYP, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. CHARACTER*3 MATTYP( LDMTYP ) CHARACTER*( * ) SUMMRY INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PCINVINFO gets needed startup information for matrix inversion * tests and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMTYP (global output) INTEGER * The number of different matrix types to be tested. * * MATTYP (global output) CHARACTER*3 array of dimension of LDMTYP, * The types of matrix to be generated: * if MATTYP(i) = 'GEN' then GENeral matrix, * if MATTYP(i) = 'UTR' then Upper TRiangular matrix, * if MATTYP(i) = 'LTR' then Lower TRiangular matrix, * if MATTYP(i) = 'UPD' then (Upper) hermitian Pos. Definite, * if MATTYP(i) = 'LPD' then (Lower) hermitian Pos. Definite. * * LDMTYP (global input) INTEGER * The maximum number of different matrix types to be tested. * LDMTYP >= NMTYP. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 4, LDMTYP+LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack * all input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT, K REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAMEN REAL PSLAMCH EXTERNAL LSAMEN, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='INV.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get the matrix types to be tested * READ( NIN, FMT = * ) NMTYP IF( NMTYP.LT.1 .OR. NMTYP.GT.LDMTYP ) THEN WRITE( NOUT, FMT = 9994 ) 'nb of matrix types', LDMTYP GO TO 40 END IF READ( NIN, FMT = * ) ( MATTYP( I ), I = 1, NMTYP ) * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 40 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 40 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 40 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 40 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I ) * QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS WORK( 4 ) = NMTYP CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 DO 20 K = 1, NMTYP IF( LSAMEN( 3, MATTYP( K ), 'GEN' ) ) THEN WORK( I ) = 1 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'UTR' ) ) THEN WORK( I ) = 2 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'LTR' ) ) THEN WORK( I ) = 3 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'UPD' ) ) THEN WORK( I ) = 4 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'LPD' ) ) THEN WORK( I ) = 5 I = I + 1 END IF 20 CONTINUE * CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Matrix Inversion routines.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision Matrix Inversion '// $ 'routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Inverse residual = ||inv(A)*A - I|| '// $ '/ (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less '// $ 'than THRESH, CHECK is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'Fct time : Time in seconds to factor the'// $ ' matrix, if needed.' WRITE( NOUT, FMT = 9999 ) $ 'Inv Time : Time in seconds to inverse the'// $ ' matrix.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and inverse.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) NMTYP = WORK( 4 ) * I = NMTYP+NMAT+NNB+2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * DO 30 K = 1, NMTYP IF( WORK( K ).EQ.1 ) THEN MATTYP( K ) = 'GEN' ELSE IF( WORK( K ).EQ.2 ) THEN MATTYP( K ) = 'UTR' ELSE IF( WORK( K ).EQ.3 ) THEN MATTYP( K ) = 'LTR' ELSE IF( WORK( K ).EQ.4 ) THEN MATTYP( K ) = 'UPD' ELSE IF( WORK( K ).EQ.5 ) THEN MATTYP( K ) = 'LPD' END IF 30 CONTINUE * I = NMTYP + 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 40 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCINVINFO * END scalapack-2.0.2/TESTING/LIN/pclafchk.f000644 000766 000024 00000026755 10363532303 017407 0ustar00juliestaff000000 000000 SUBROUTINE PCLAFCHK( AFORM, DIAG, M, N, A, IA, JA, DESCA, IASEED, $ ANORM, FRESID, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER AFORM, DIAG INTEGER IA, IASEED, JA, M, N REAL ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCLAFCHK computes the residual * || sub( A ) - sub( Ao ) || / (|| sub( Ao ) ||*eps*MAX(M,N)), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A( IA:IA+M-1, JA:JA+N-1 ) and ||.|| stands for the infini- * ty norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * AFORM (global input) CHARACTER * sub( A ) is overwritten with: * - a symmetric matrix, if AFORM = 'S'; * - a Hermitian matrix, if AFORM = 'H'; * - the transpose of what would normally be generated, * if AFORM = 'T'; * - the conjugate transpose of what would normally be * generated, if AFORM = 'C'; * - otherwise a random matrix. * * DIAG (global input) CHARACTER * if DIAG = 'D' : sub( A ) is diagonally dominant. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub( A ) - sub( Ao ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * ANORM (global input) REAL * The Infinity norm of sub( A ). * * FRESID (global output) REAL * The maximum (worst) factorizational error. * * WORK (local workspace) COMPLEX array, dimension (LWORK). * LWORK >= MpA0 * NB_A, where * * IROFFA = MOD( IA-1, MB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * WORK is used to store a block of columns of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = (1.0E+0, 0.0E+0) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURCOL, ICURROW, $ II, IIA, IOFFA, IROFF, JB, JJ, JJA, JN, KK, $ LDA, LDW, LDWP1, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CMATADD, INFOG2L, PCMATGEN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSLAMCH, PCLANGE EXTERNAL ICEIL, LSAME, NUMROC, PCLANGE, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( ICTXT, 'eps' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * * Compute sub( A ) := sub( A ) - sub( Ao ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDW = MAX( 1, MP ) LDWP1 = LDW + 1 LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA - 1 )*LDA * IF( LSAME( AFORM, 'H' ) ) THEN * * Handle first block of columns separately * II = 1 ICURROW = IAROW ICURCOL = IACOL JB = JN - JA + 1 * IF( MYCOL.EQ.ICURCOL ) THEN CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) IF( MYROW.EQ.ICURROW ) THEN DO 10, KK = 0, JB-1 WORK( II+KK*LDWP1 ) = REAL( WORK( II+KK*LDWP1 ) ) 10 CONTINUE END IF CALL CMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 30, JJ = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-JJ, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ WORK, LDW, DESCA( RSRC_ ), DESCA( CSRC_ ), $ IASEED, IIA-1, MP, JJA-1, JB, MYROW, $ MYCOL, NPROW, NPCOL ) IF( MYROW.EQ.ICURROW ) THEN DO 20, KK = 0, JB-1 WORK( II+KK*LDWP1 ) = REAL( WORK( II+KK*LDWP1 ) ) 20 CONTINUE END IF CALL CMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE * ELSE * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN JB = JN-JA+1 CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL CMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB NQ = NQ - JB IOFFA = IOFFA + JB * LDA END IF * * Handle the remaning blocks of columns * DO 40 JJ = JJA, JJA+NQ-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JJA+NQ-JJ ) IOFFA = IIA + ( JJ - 1 )*LDA CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJ-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL CMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) 40 CONTINUE * END IF * * Calculate factor residual * FRESID = PCLANGE( 'I', M, N, A, IA, JA, DESCA, WORK ) / $ ( MAX( M, N ) * EPS * ANORM ) * RETURN * * End PCLAFCHK * END scalapack-2.0.2/TESTING/LIN/pclaschk.f000644 000766 000024 00000030355 10363532303 017413 0ustar00juliestaff000000 000000 SUBROUTINE PCLASCHK( SYMM, DIAG, N, NRHS, X, IX, JX, DESCX, $ IASEED, IA, JA, DESCA, IBSEED, ANORM, RESID, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, SYMM INTEGER IA, IASEED, IBSEED, IX, JA, JX, N, NRHS REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX WORK( * ), X( * ) * .. * * Purpose * ======= * * PCLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed matrix, * otherwise sub( A ) is a general distributed matrix. * * DIAG (global input) CHARACTER * If DIAG = 'D', sub( A ) is diagonally dominant. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 0. * * X (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_X,LOCc(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK >= MAX(1,Np)*NB_X + Nq*NB_X + MAX( MAX(NQ*MB_A,2*NB_X), * NB_X * NUMROC( NUMROC(N,MB_X,0,0,NPCOL), MB_X, 0, 0, LCMQ ) ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO, ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IB, ICOFF, ICTXT, ICURCOL, IDUMM, $ II, IIA, IIX, IOFFX, IPA, IPB, IPW, IPX, IROFF, $ IXCOL, IXROW, J, JBRHS, JJ, JJA, JJX, LDX, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL DIVISOR, EPS, RESID1 COMPLEX BETA * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGAMX2D, CGEMM, CGSUM2D, $ CLASET, PBCTRAN, PCMATGEN, SGEBR2D, $ SGEBS2D, SGERV2D, SGESD2D * .. * .. External Functions .. INTEGER ICAMAX, NUMROC REAL PSLAMCH EXTERNAL ICAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * LDX = MAX( 1, NP ) IPB = 1 IPX = IPB + NP * DESCX( NB_ ) IPA = IPX + NQ * DESCX( NB_ ) * IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF * ICURCOL = IXCOL * * Loop over the rhs * DO 40 J = 1, NRHS, DESCX( NB_ ) JBRHS = MIN( DESCX( NB_ ), NRHS-J+1 ) * * Transpose x from ICURCOL to all rows * IOFFX = IIX + ( JJX - 1 ) * DESCX( LLD_ ) CALL PBCTRAN( ICTXT, 'Column', 'Transpose', N, JBRHS, $ DESCX( MB_ ), X( IOFFX ), DESCX( LLD_ ), ZERO, $ WORK( IPX ), JBRHS, IXROW, ICURCOL, -1, IACOL, $ WORK( IPA ) ) * * Regenerate B in IXCOL * IF( MYCOL.EQ.ICURCOL ) THEN CALL PCMATGEN( ICTXT, 'N', 'N', DESCX( M_ ), DESCX( N_ ), $ DESCX( MB_ ), DESCX( NB_ ), WORK( IPB ), LDX, $ IXROW, IXCOL, IBSEED, IIX-1, NP, JJX-1, $ JBRHS, MYROW, MYCOL, NPROW, NPCOL ) BETA = ONE ELSE BETA = ZERO END IF * IF( NQ.GT.0 ) THEN DO 10 II = IIA, IIA+NP-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IIA+NP-II ) * * Regenerate ib rows of the matrix A(IA:IA+N-1,JA:JA+N-1). * CALL PCMATGEN( ICTXT, SYMM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ WORK( IPA ), IB, DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, II-1, IB, $ JJA-1, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Compute B <= B - A * X. * CALL CGEMM( 'No transpose', 'Transpose', IB, JBRHS, NQ, $ -ONE, WORK( IPA ), IB, WORK( IPX ), JBRHS, $ BETA, WORK( IPB+II-IIA ), LDX ) * 10 CONTINUE * ELSE IF( MYCOL.NE.ICURCOL ) THEN * CALL CLASET( 'All', NP, JBRHS, ZERO, ZERO, WORK( IPB ), $ LDX ) * END IF * * Add B rowwise to ICURCOL * CALL CGSUM2D( ICTXT, 'Row', ' ', NP, JBRHS, WORK( IPB ), LDX, $ MYROW, ICURCOL ) * IF( MYCOL.EQ.ICURCOL ) THEN * * Figure || A * X - B || & || X || * IPW = IPA + JBRHS DO 20 JJ = 0, JBRHS - 1 IF( NP.GT.0 ) THEN II = ICAMAX( NP, WORK( IPB+JJ*LDX ), 1 ) WORK( IPA+JJ ) = ABS( WORK( IPB+II-1+JJ*LDX ) ) WORK( IPW+JJ ) = ABS( X( IOFFX + ICAMAX( NP, $ X( IOFFX + JJ*DESCX( LLD_ ) ), 1 )-1+JJ* $ DESCX( LLD_ ) ) ) ELSE WORK( IPA+JJ ) = ZERO WORK( IPW+JJ ) = ZERO END IF 20 CONTINUE * * After CGAMX2D computation, * WORK(IPB) has the maximum of || Ax - b ||, and * WORK(IPX) has the maximum of || X ||. * CALL CGAMX2D( ICTXT, 'Column', ' ', 1, 2*JBRHS, $ WORK( IPA ), 1, IDUMM, IDUMM, -1, 0, ICURCOL ) * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * IF( MYROW.EQ.0 ) THEN DO 30 JJ = 0, JBRHS - 1 RESID1 = REAL( WORK( IPA+JJ ) ) / $ ( REAL( WORK( IPW+JJ ) )*DIVISOR ) IF( RESID.LT.RESID1 ) $ RESID = RESID1 30 CONTINUE IF( MYCOL.NE.0 ) $ CALL SGESD2D( ICTXT, 1, 1, RESID, 1, 0, 0 ) END IF * ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * CALL SGERV2D( ICTXT, 1, 1, RESID1, 1, 0, ICURCOL ) IF( RESID.LT.RESID1 ) $ RESID = RESID1 * END IF * IF( MYCOL.EQ.ICURCOL ) $ JJX = JJX + JBRHS ICURCOL = MOD( ICURCOL+1, NPCOL ) * 40 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, RESID, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, RESID, 1, 0, 0 ) END IF * RETURN * * End of PCLASCHK * END scalapack-2.0.2/TESTING/LIN/pclltdriver.f000644 000766 000024 00000110441 10363532303 020150 0ustar00juliestaff000000 000000 PROGRAM PCLLTDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PCLLTDRIVER is the main test program for the COMPLEX * ScaLAPACK Cholesky routines. This test driver performs an * A = L*L**H or A = U**H*U factorization and solve, and optionally * performs condition estimation and iterative refinement. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'ScaLAPACK LLt factorization input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LLT.out' output file name (if any) * 6 device out * 'U' define Lower or Upper * 1 number of problems sizes * 31 100 200 values of N * 1 number of NB's * 2 10 24 values of NB * 1 number of NRHS's * 1 values of NRHS * 1 Number of NBRHS's * 1 values of NBRHS * 1 number of process grids (ordered pairs of P & Q) * 2 values of P * 2 values of Q * 1.0 threshold * T (T or F) Test Cond. Est. and Iter. Ref. Routines * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, MEMSIZ, NTESTS, REALSZ, TOTMEM REAL ZERO COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, EST CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, IMIDPAD, $ INFO, IPA, IPA0, IPB, IPB0, IPBERR, IPFERR, $ IPREPAD, IPOSTPAD, IPW, IPW2, ITEMP, J, K, $ KFAIL, KK, KPASS, KSKIP, KTESTS, LCM, LCMQ, $ LRWORK, LWORK, LW2, MYCOL, MYRHS, MYROW, N, NB, $ NBRHS, NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, $ NPCOL, NPROCS, NPROW, NQ, NRHS, WORKSIZ REAL ANORM, ANORM1, FRESID, RCOND, SRESID, SRESID2, $ THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, DESCINIT, $ IGSUM2D, BLACS_PINFO, PCCHEKPAD, PCFILLPAD, $ PCLAFCHK, PCLASCHK, PCLLTINFO, $ PCMATGEN, PCPOCON, PCPORFS, $ PCPOTRF, PCPOTRRV, PCPOTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC REAL PCLANHE EXTERNAL ICEIL, ILCM, LSAME, NUMROC, PCLANHE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 CALL PCLLTINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NNR, NRVAL, NTESTS, NNBR, NBRVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, EST, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 50 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 50 * DO 40 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 40 END IF * DO 30 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IF( EST ) THEN IPA0 = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPA0 + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD ELSE IPW = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD END IF * * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines PCLAFCHK, PCPOTRRV, and * PCLANHE * WORKSIZ = NP * DESCA( NB_ ) * WORKSIZ = MAX( WORKSIZ, DESCA( MB_ ) * DESCA( NB_ ) ) * LCM = ILCM( NPROW, NPCOL ) ITEMP = MAX( 2, 2 * NQ ) + NP IF( NPROW.NE.NPCOL ) THEN ITEMP = ITEMP + $ NB * ICEIL( ICEIL( NP, NB ), LCM / NPROW ) END IF WORKSIZ = MAX( WORKSIZ, $ ICEIL( REALSZ * ITEMP, CPLXSZ ) ) WORKSIZ = WORKSIZ + IPOSTPAD * ELSE * WORKSIZ = IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate a Hermitian positive definite matrix A * CALL PCMATGEN( ICTXT, 'Herm', 'Diag', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANHE( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) ANORM1 = PCLANHE( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) END IF * IF( EST ) THEN CALL PCMATGEN( ICTXT, 'Herm', 'Diag', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA0 ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) IF( CHECK ) $ CALL PCFILLPAD( ICTXT, NP, NQ, $ MEM( IPA0-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform LLt factorization * CALL SLTIMER( 1 ) * CALL PCPOTRF( UPLO, N, MEM( IPA ), 1, 1, DESCA, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PCPOTRF INFO=', INFO KFAIL = KFAIL + 1 RCOND = ZERO GO TO 60 END IF * IF( CHECK ) THEN * * Check for memory overwrite in LLt factorization * CALL PCCHEKPAD( ICTXT, 'PCPOTRF', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( EST ) THEN * * Calculate workspace required for PCPOCON * LWORK = MAX( 1, 2*NP ) + $ MAX( 2, DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), $ NQ + DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LRWORK = MAX( 1, 2*NQ ) LW2 = ICEIL( LRWORK*REALSZ, CPLXSZ ) + IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'cond est', $ ( IPW2+LW2 )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 60 END IF * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Compute condition number of the matrix * CALL PCPOCON( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ ANORM1, RCOND, MEM( IPW ), LWORK, $ MEM( IPW2 ), LRWORK, INFO ) * IF( CHECK ) THEN CALL PCCHEKPAD( ICTXT, 'PCPOCON', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPOCON', $ LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPOCON', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF END IF * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 10 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB, N, NRHS, NB, NBRHS, 0, 0, $ ICTXT, MAX( 1, NP )+IMIDPAD, $ IERR( 1 ) ) * * move IPW to allow room for RHS * MYRHS = NUMROC( DESCB( N_ ), DESCB( NB_ ), MYCOL, $ DESCB( CSRC_ ), NPCOL ) IPB = IPW * IF( EST ) THEN IPB0 = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD IPFERR = IPB0 + DESCB( LLD_ )*MYRHS + IPOSTPAD $ + IPREPAD IPBERR = MYRHS + IPFERR + IPOSTPAD + IPREPAD IPW = MYRHS + IPBERR + IPOSTPAD + IPREPAD ELSE IPW = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD END IF * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines PCLASCHK * LCMQ = LCM / NPCOL WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ NQ * NBRHS + NP * NBRHS + $ MAX( MAX( NQ*NB, 2*NBRHS ), $ NBRHS * NUMROC( NUMROC(N,NB,0,0,NPCOL),NB, $ 0,0,LCMQ ) ) ) WORKSIZ = IPOSTPAD + WORKSIZ ELSE WORKSIZ = IPOSTPAD END IF * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate RHS * CALL PCMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, NPCOL ) * IF( CHECK ) $ CALL PCFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) * IF( EST ) THEN CALL PCMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB0 ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 2 ) * * Solve linear system via Cholesky factorization * CALL PCPOTRS( UPLO, N, NRHS, MEM( IPA ), 1, 1, $ DESCA, MEM( IPB ), 1, 1, DESCB, $ INFO ) * CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCPOTRS', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPOTRS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PCLASCHK( 'Herm', 'Diag', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, IBSEED, $ ANORM, SRESID, MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' END IF * IF( EST ) THEN * * Calculate workspace required for PCPORFS * LWORK = MAX( 1, 2*NP ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LRWORK = MAX( 1, NP ) LW2 = ICEIL( LRWORK*REALSZ, CPLXSZ ) + $ IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'iter ref', ( IPW2+LW2 )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, LW2-IPOSTPAD, $ 1, MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Use iterative refinement to improve the * computed solution * CALL PCPORFS( UPLO, N, NRHS, MEM( IPA0 ), $ 1, 1, DESCA, MEM( IPA ), 1, 1, $ DESCA, MEM( IPB0 ), 1, 1, $ DESCB, MEM( IPB ), 1, 1, DESCB, $ MEM( IPFERR ), MEM( IPBERR ), $ MEM( IPW ), LWORK, MEM( IPW2 ), $ LRWORK, INFO ) * * check for memory overwrite * IF( CHECK ) THEN CALL PCCHEKPAD( ICTXT, 'PCPORFS', NP, $ NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPORFS', NP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPORFS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPORFS', NP, $ MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPORFS', 1, $ MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPORFS', 1, $ MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPORFS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCPORFS', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PCLASCHK( 'Herm', 'Diag', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, $ IBSEED, ANORM, SRESID2, $ MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID2.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID2 * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 4/3 N^3 + 3 N^2 flops for LLt factorization * NOPS = 4.0D+0*(DBLE(N)**3)/3.0D+0 + $ 3.0D+0*(DBLE(N)**2) * * nrhs * 8 N^2 flops for LLt solve. * NOPS = NOPS + 8.0D+0*(DBLE(N)**2)*DBLE(NRHS) * * Calculate total megaflops -- factorization and * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ PASSED * END IF 10 CONTINUE 20 CONTINUE * IF( CHECK .AND. SRESID.GT.THRESH ) THEN * * Compute FRESID = ||A - LL'|| / (||A|| * N * eps) * CALL PCPOTRRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PCLAFCHK( 'Symm', 'Diag', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCPOTRRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * IF( IAM.EQ.0 ) THEN IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9986 ) 'L*L''', FRESID ELSE WRITE( NOUT, FMT = 9986 ) 'U''*U', FRESID END IF END IF END IF * 30 CONTINUE 40 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 50 CONTINUE * * Print ending messages and close output file * 60 CONTINUE IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UPLO N NB NRHS NBRHS P Q LLt Time ', $ 'Slv Time MFLOPS CHECK' ) 9994 FORMAT( '---- ---- ----- --- ---- ----- ---- ---- -------- ', $ '-------- -------- ------' ) 9993 FORMAT( A4, 4X, A1, 1X, I5, 1X, I3, 1X, I4, 1X, I5, 1X, I4, 1X, $ I4, 1X, F8.2, 1X, F8.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PCLLTDRIVER * END scalapack-2.0.2/TESTING/LIN/pclltinfo.f000644 000766 000024 00000042150 10363532303 017611 0ustar00juliestaff000000 000000 SUBROUTINE PCLLTINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NNR, NRVAL, LDNRVAL, NNBR, $ NBRVAL, LDNBRVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, THRESH, EST, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL EST CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PCLLTINFO get needed startup information for LLt factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * EST (global output) LOGICAL * Flag indicating if condition estimation and iterative * refinement routines are to be exercised. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 7, LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'LLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Read the flag that indicates whether to test the condition * estimation and iterative refinement routines. * READ( NIN, FMT = * ) EST * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS IF( LSAME( UPLO, 'L' ) ) THEN WORK( 6 ) = 1 ELSE WORK( 6 ) = 2 END IF IF( EST ) THEN WORK( 7 ) = 1 ELSE WORK( 7 ) = 0 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 7, 1, WORK, 7 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Ax=b by LLt factorization.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision LLt factorization '// $ 'and solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data is stored in ''Upper'// $ ''' or ''Lower'' portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'LLt time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 7, 1, WORK, 7, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) IF( WORK( 6 ).EQ.1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF IF( WORK( 7 ).EQ.1 ) THEN EST = .TRUE. ELSE EST = .FALSE. END IF * I = NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCLLTINFO * END scalapack-2.0.2/TESTING/LIN/pclsdriver.f000644 000766 000024 00000135551 10363532303 020004 0ustar00juliestaff000000 000000 PROGRAM PCLSDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * Purpose * ======= * * PCLSDRIVER is the main test program for the COMPLEX * SCALAPACK (full rank) Least Squares routines. This test driver solves * full-rank least square problems. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 17 lines: * 'ScaLapack LS solve input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LS.out' output file name (if any) * 6 device out * 4 number of problems sizes * 55 17 31 201 values of M * 5 71 31 201 values of N * 3 number of NB's * 2 3 5 values of NB * 3 number of NRHS's * 2 3 5 values of NRHS * 2 number of NBRHS's * 1 2 values of NBRHS * 7 number of process grids (ordered P & Q) * 1 2 1 4 2 3 8 values of P * 7 2 4 1 3 2 1 values of Q * 3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, MEMSIZ, NTESTS, REALSZ, TOTMEM REAL RZERO, RONE COMPLEX ONE, PADVAL, ZERO PARAMETER ( CPLXSZ = 8, REALSZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ) ) PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), RZERO = 0.0E+0, $ RONE = 1.0E+0, ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK, TPSD CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, II, IMIDPAD, $ INFO, IPA, IPB, IPOSTPAD, IPREPAD, IPW, IPW2, $ IPX, ISCALE, ITRAN, ITYPE, J, JJ, K, KFAIL, KK, $ KPASS, KSKIP, KTESTS, LCM, LCMP, LTAU, LWF, $ LWORK, LWS, M, MNP, MNRHSP, MP, MQ, MYCOL, $ MYROW, N, NB, NBRHS, NCOLS, NGRIDS, NMAT, NNB, $ NNBR, NNR, NNRHSQ, NOUT, NP, NPCOL, NPROCS, $ NPROW, NROWS, NQ, NRHS, NRHSP, NRHSQ, WORKSIZ REAL ANORM, BNORM, SRESID, THRESH DOUBLE PRECISION ADDFAC, ADDS, MULFAC, MULTS, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCW( LLD_ ), $ DESCX( DLEN_ ), IERR( 2 ), MVAL( NTESTS ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL RESULT( 2 ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCCHEKPAD, $ PCFILLPAD, PCGELS, PCGEMM, PCLACPY, $ PCLSINFO, PCMATGEN, PSCNRM2, $ PCSSCAL, PCQRT13, PCQRT16, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC REAL PCLANGE, PCQRT14, PCQRT17 EXTERNAL ICEIL, ILCM, LSAME, NUMROC, PCLANGE, $ PCQRT14, PCQRT17 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * IASEED = 100 IBSEED = 200 CALL PCLSINFO( OUTFILE, NOUT, NMAT, MVAL, NTESTS, NVAL, $ NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, NTESTS, $ NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 90 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 90 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( ( MYROW.GE.NPROW ).OR.( MYCOL.GE.NPCOL ) ) $ GO TO 90 * DO 80 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 80 END IF * * Loop over different blocking sizes * DO 70 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 70 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) MQ = NUMROC( M, NB, MYCOL, 0, NPCOL ) NP = NUMROC( N, NB, MYROW, 0, NPROW ) MNP = MAX( MP, NP ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 70 END IF * DO 60 ISCALE = 1, 3 * ITYPE = ISCALE * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPX = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPX * WORKSIZ = NQ + IPOSTPAD * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'MEMORY', $ ( IPX+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 70 END IF * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Generate the matrix A and calculate its 1-norm * CALL PCQRT13( ISCALE, M, N, MEM( IPA ), 1, 1, $ DESCA, ANORM, IASEED, MEM( IPW ) ) * IF( CHECK ) THEN CALL PCCHEKPAD( ICTXT, 'PCQRT13', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCQRT13', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * DO 50 ITRAN = 1, 2 * IF( ITRAN.EQ.1 ) THEN NROWS = M NCOLS = N TRANS = 'N' TPSD = .FALSE. ELSE NROWS = N NCOLS = M TRANS = 'C' TPSD = .TRUE. END IF * * Loop over the different values for NRHS * DO 40 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 30 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * NRHSP = NUMROC( NRHS, NBRHS, MYROW, 0, $ NPROW ) NRHSQ = NUMROC( NRHS, NBRHS, MYCOL, 0, $ NPCOL ) * * Define Array descriptor for rhs MAX(M,N)xNRHS * CALL DESCINIT( DESCX, MAX( M, N ), NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, MNP ) + IMIDPAD, $ IERR( 1 ) ) IF( TPSD ) THEN CALL DESCINIT( DESCW, M, NRHS, NB, NBRHS, $ 0, 0, ICTXT, MAX( 1, MP ) + $ IMIDPAD, IERR( 2 ) ) ELSE CALL DESCINIT( DESCW, N, NRHS, NB, NBRHS, $ 0, 0, ICTXT, MAX( 1, NP ) + $ IMIDPAD, IERR( 2 ) ) END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, $ 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Check for enough memory * IPX = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + $ IPREPAD IPW = IPX + DESCX( LLD_ )*NRHSQ + IPOSTPAD + $ IPREPAD WORKSIZ = DESCW( LLD_ )*NRHSQ + IPOSTPAD * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Generation', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate RHS * IF( TPSD ) THEN CALL PCMATGEN( ICTXT, 'No', 'No', $ DESCW( M_ ), DESCW( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), $ MEM( IPW ), DESCW( LLD_ ), $ DESCW( RSRC_ ), $ DESCW( CSRC_ ), IBSEED, 0, $ MP, 0, NRHSQ, MYROW, MYCOL, $ NPROW, NPCOL ) ELSE CALL PCMATGEN( ICTXT, 'No', 'No', $ DESCW( M_ ), DESCW( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), $ MEM( IPW ), DESCW( LLD_ ), $ DESCW( RSRC_ ), $ DESCW( CSRC_ ), IBSEED, 0, $ NP, 0, NRHSQ, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PCFILLPAD( ICTXT, MP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PCFILLPAD( ICTXT, NP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * DO 10 JJ = 1, NRHS CALL PSCNRM2( NCOLS, BNORM, MEM( IPW ), $ 1, JJ, DESCW, 1 ) IF( BNORM.GT.RZERO ) $ CALL PCSSCAL( NCOLS, RONE / BNORM, $ MEM( IPW ), 1, JJ, DESCW, $ 1 ) 10 CONTINUE * CALL PCGEMM( TRANS, 'N', NROWS, NRHS, NCOLS, $ ONE, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ), 1, 1, DESCW, ZERO, $ MEM( IPX ), 1, 1, DESCX ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'Generation', MP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'Generation', MNP, $ NRHSQ, MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PCCHEKPAD( ICTXT, 'Generation', $ MP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PCCHEKPAD( ICTXT, 'Generation', $ NP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Allocate space for copy of rhs * IPB = IPW * IF( TPSD ) THEN CALL DESCINIT( DESCB, N, NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, $ IERR( 1 ) ) ELSE CALL DESCINIT( DESCB, M, NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, $ IERR( 1 ) ) END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * IPW = IPB + DESCB( LLD_ )*NRHSQ + $ IPOSTPAD + IPREPAD * END IF * * Calculate the amount of workspace for PCGELS * IF( M.GE.N ) THEN LTAU = NUMROC( MIN(M,N), NB, MYCOL, 0, $ NPCOL ) LWF = NB * ( MP + NQ + NB ) LWS = MAX( ( NB*( NB - 1 ) ) / 2, $ ( MP + NRHSQ ) * NB ) + NB*NB ELSE LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LTAU = NUMROC( MIN(M,N), NB, MYROW, 0, $ NPROW ) LWF = NB * ( MP + NQ + NB ) LWS = MAX( ( NB*( NB - 1 ) ) / 2, ( NP + $ MAX( NQ + NUMROC( NUMROC( N, NB, 0, $ 0, NPROW ), NB, 0, 0, LCMP ), $ NRHSQ ) ) * NB ) + NB*NB END IF * LWORK = LTAU + MAX( LWF, LWS ) WORKSIZ = LWORK + IPOSTPAD * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'solve', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Make the copy of the right hand side * CALL PCLACPY( 'All', NROWS, NRHS, $ MEM( IPX ), 1, 1, DESCX, $ MEM( IPB ), 1, 1, DESCB ) * IF( TPSD ) THEN CALL PCFILLPAD( ICTXT, NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PCFILLPAD( ICTXT, MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF CALL PCFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * CALL SLBOOT( ) CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Solve the LS or overdetermined system * CALL PCGELS( TRANS, M, N, NRHS, MEM( IPA ), $ 1, 1, DESCA, MEM( IPX ), 1, 1, $ DESCX, MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGELS', MP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGELS', MNP, $ NRHSQ, MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGELS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Regenerate A in place for testing and next * iteration * CALL PCQRT13( ISCALE, M, N, MEM( IPA ), 1, 1, $ DESCA, ANORM, IASEED, $ MEM( IPW ) ) * * check the solution to rhs * IF( CHECK ) THEN * * Am I going to call PCQRT17 ? * IF( ( M.GE.N .AND. ( .NOT.TPSD ) ) .OR. $ ( M.LT.N .AND. TPSD ) ) THEN * * Call PCQRT17 first, A, X, and B remain * unchanged. Solving LS system * * Check amount of memory for PCQRT17 * IF( TPSD ) THEN WORKSIZ = NP*NRHSQ + NRHSP*MQ IPW2 = IPW + WORKSIZ WORKSIZ = WORKSIZ + $ ICEIL( REALSZ*MAX( NQ, MAX( $ MQ, NRHSQ ) ), CPLXSZ ) + $ IPOSTPAD ELSE WORKSIZ = MP*NRHSQ + NRHSP*NQ IPW2 = IPW + WORKSIZ WORKSIZ = WORKSIZ + $ ICEIL( REALSZ*MAX( NQ, $ NRHSQ ), CPLXSZ ) + $ IPOSTPAD END IF * * Check for adequate memory for problem * size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'MEMORY', ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PCFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * RESULT( 2 ) = PCQRT17( TRANS, 1, M, N, $ NRHS, $ MEM( IPA ), $ 1, 1, DESCA, $ MEM( IPX ), 1, $ 1, DESCX, $ MEM( IPB ), $ 1, 1, DESCB, $ MEM( IPW ), $ MEM( IPW2 ) ) SRESID = RESULT( 2 ) * CALL PCCHEKPAD( ICTXT, 'PCQRT17', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCQRT17', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PCCHEKPAD( ICTXT, 'PCQRT17', $ NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) ELSE CALL PCCHEKPAD( ICTXT, 'PCQRT17', $ MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF CALL PCCHEKPAD( ICTXT, 'PCQRT17', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Call PCQRT16, B will be destroyed. * IF( TPSD ) THEN WORKSIZ = MP + IPOSTPAD ELSE WORKSIZ = NQ + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'MEMORY', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PCFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PCQRT16( TRANS, M, N, NRHS, $ MEM( IPA ), 1, 1, DESCA, $ MEM( IPX ), 1, 1, DESCX, $ MEM( IPB ), 1, 1, DESCB, $ MEM( IPW ), RESULT( 1 ) ) * CALL PCCHEKPAD( ICTXT, 'PCQRT16', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCQRT16', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PCCHEKPAD( ICTXT, 'PCQRT16', $ NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) ELSE CALL PCCHEKPAD( ICTXT, 'PCQRT16', $ MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF CALL PCCHEKPAD( ICTXT, 'PCQRT16', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * * Call PCQRT14 * IF( ( M.GE.N .AND. TPSD ) .OR. $ ( M.LT.N .AND. ( .NOT.TPSD ) ) ) THEN * IPW = IPB * IF( TPSD ) THEN * NNRHSQ = NUMROC( N+NRHS, NB, MYCOL, $ 0, NPCOL ) LTAU = NUMROC( MIN( M, N+NRHS ), NB, $ MYCOL, 0, NPCOL ) LWF = NB * ( NB + MP + NNRHSQ ) WORKSIZ = MP * NNRHSQ + LTAU + LWF + $ IPOSTPAD * ELSE * MNRHSP = NUMROC( M+NRHS, NB, MYROW, $ 0, NPROW ) LTAU = NUMROC( MIN( M+NRHS, N ), NB, $ MYROW, 0, NPROW ) LWF = NB * ( NB + MNRHSP + NQ ) WORKSIZ = MNRHSP * NQ + LTAU + LWF + $ IPOSTPAD * END IF * * Check for adequate memory for problem * size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'MEMORY', ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PCFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * * Solve underdetermined system * RESULT( 2 ) = PCQRT14( TRANS, M, N, $ NRHS, $ MEM( IPA ), 1, $ 1, DESCA, $ MEM( IPX ), $ 1, 1, DESCX, $ MEM( IPW ) ) SRESID = RESULT( 2 ) * CALL PCCHEKPAD( ICTXT, 'PCQRT14', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCQRT14', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCQRT14', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Print information about the tests that * did not pass the threshold. * PASSED = 'PASSED' DO 20 II = 1, 2 IF( ( RESULT( II ).GE.THRESH ) .AND. $ ( RESULT( II )-RESULT( II ).EQ.0.0E+0 $ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 )TRANS, $ M, N, NRHS, NB, ITYPE, II, $ RESULT( II ) KFAIL = KFAIL + 1 PASSED = 'FAILED' ELSE KPASS = KPASS + 1 END IF 20 CONTINUE * ELSE * * By-pass the solve check * KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock * timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN ADDFAC = 2 MULFAC = 6 IF( M.GE.N ) THEN * * NOPS = SOPLA( 'CGEQRF', M, N, 0, 0, * NB ) + SOPLA( 'CUNMQR', M, NRHS, N, * 0, NB ) * MULTS = N*( ( ( 23.D0 / 6.D0 )+M+N / $ 2.D0 )+ N*( M-N / 3.D0 ) ) + $ N*NRHS*( 2.D0*M+2.D0-N ) ADDS = N*( ( 5.D0 / 6.D0 )+N* $ ( 1.D0 / 2.D0+( M-N / 3.D0 ) ) ) $ + N*NRHS*( 2.D0*M+1.D0-N ) ELSE * * NOPS = SOPLA( 'CGELQF', M, N, 0, 0, * NB ) + SOPLA( 'CUNMLQ', M, * NRHS, N, 0, NB ) * MULTS = M*( ( ( 29.D0 / 6.D0 )+2.D0*N-M $ / 2.D0 )+M*( N-M / 3.D0 ) ) $ + N*NRHS*( 2.D0*M+2.D0-N ) ADDS = M*( ( 5.D0 / 6.D0 )+M / 2.D0+M* $ ( N-M / 3.D0 ) ) $ + N*NRHS*( 2.D0*M+1.D0-N ) END IF NOPS = ADDFAC*ADDS + MULFAC*MULTS * * Calculate total megaflops, for WALL and * CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) $ 'WALL', TRANS, M, N, NB, NRHS, $ NBRHS, NPROW, NPCOL, WTIME( 1 ), $ TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) $ 'CPU ', TRANS, M, N, NB, NRHS, $ NBRHS, NPROW, NPCOL, CTIME( 1 ), $ TMFLOPS, PASSED END IF 30 CONTINUE 40 CONTINUE 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 90 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'Time TRANS M N NB NRHS NBRHS P Q ', $ 'LS Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- ------ ------ --- ----- ----- ----- ----- ', $ '--------- -------- ------' ) 9993 FORMAT( A4, 3X, A1, 3X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, $ I5, 1X, I5, 1X, F9.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( ' TRANS=''', A1, ''', M=', I5, ', N=', I5, ', NRHS=', I4, $ ', NB=', I4, ', type', I2, ', test(', I2, ')=', G12.5 ) * STOP * * End of PCLSDRIVER * END scalapack-2.0.2/TESTING/LIN/pclsinfo.f000644 000766 000024 00000040775 10363532303 017447 0ustar00juliestaff000000 000000 SUBROUTINE PCLSINFO( SUMMRY, NOUT, NMAT, MVAL, LDMVAL, $ NVAL, LDNVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDMVAL, LDNBRVAL, LDNBVAL, LDNRVAL, $ LDNVAL, LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, $ NNBR, NNR, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDMVAL ), NBRVAL( LDNBRVAL ), $ NBVAL( LDNBVAL ), NRVAL( LDNRVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), $ QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PCLSINFO gets needed startup information for LS solve and * transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * MVAL (global output) INTEGER array, dimension (LDNVAL) * The values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M, LDNVAL > = NMAT. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (workspace) INTEGER array of dimension >= * MAX(5,LDMVAL+LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL) * Used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='LS.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 ELSE IF( NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'M', LDMVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 5, 1, WORK, 5 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK min ||Ax-b|| by QR factorizations.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision least-square solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - QR|| / '// $ '(||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used. If CPU and WALL time' WRITE( NOUT, FMT = 9999 ) $ ' are the same, only one line '// $ 'is printed, and the label is ''BOTH''.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 ) $ 'QR time : Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 5, 1, WORK, 5, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) * I = 2*NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCLSINFO * END scalapack-2.0.2/TESTING/LIN/pcludriver.f000644 000766 000024 00000125302 11657237147 020016 0ustar00juliestaff000000 000000 PROGRAM PCLUDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======== * * PCLUDRIVER is the main test program for the COMPLEX * SCALAPACK LU routines. This test driver performs an LU factorization * and solve. If the input matrix is non-square, only the factorization * is performed. Condition estimation and iterative refinement are * optionally performed. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'SCALAPACK, Version 2.0, LU factorization input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LU.out' output file name (if any) * 6 device out * 1 number of problems sizes * 31 201 values of M * 31 201 values of N * 1 number of NB's * 2 values of NB * 1 number of NRHS's * 1 values of NRHS * 1 number of NBRHS's * 1 values of NBRHS * 1 number of process grids (ordered pairs of P & Q) * 2 1 4 2 3 8 values of P * 2 4 1 3 2 1 values of Q * 1.0 threshold * T (T or F) Test Cond. Est. and Iter. Ref. Routines * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, INTGSZ, MEMSIZ, NTESTS, REALSZ, TOTMEM REAL ZERO COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, INTGSZ = 4, REALSZ = 4, $ TOTMEM = 4000000, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, EST CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, IMIDPAD, $ INFO, IPA, IPA0, IPB, IPB0, IPBERR, IPFERR, $ IPOSTPAD, IPPIV, IPREPAD, IPW, IPW2, J, K, $ KFAIL, KK, KPASS, KSKIP, KTESTS, LCM, LCMQ, $ LIPIV, LRWORK, LWORK, LW2, M, MAXMN, $ MINMN, MP, MYCOL, MYRHS, MYROW, N, NB, NBRHS, $ NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, NPCOL, $ NPROCS, NPROW, NQ, NRHS, WORKSIZ REAL ANORM, ANORM1, FRESID, RCOND, SRESID, SRESID2, $ THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), IERR( 1 ), $ MVAL( NTESTS ), NBRVAL( NTESTS ), $ NBVAL( NTESTS ), NRVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), $ QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCCHEKPAD, $ PCFILLPAD, PCGECON, PCGERFS, $ PCGETRF, PCGETRRV, PCGETRS, $ PCLAFCHK, PCLASCHK, PCLUINFO, $ PCMATGEN, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ICEIL, ILCM, NUMROC REAL PCLANGE EXTERNAL ICEIL, ILCM, NUMROC, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 CALL PCLUINFO( OUTFILE, NOUT, NMAT, MVAL, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NNR, NRVAL, NTESTS, NNBR, NBRVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, THRESH, $ EST, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 50 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 50 * DO 40 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 40 END IF * DO 30 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IF( EST .AND. M.EQ.N ) THEN IPA0 = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPPIV = IPA0 + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD ELSE IPPIV = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD END IF LIPIV = ICEIL( INTGSZ*( MP+NB ), CPLXSZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * IF( CHECK ) THEN * * Calculate the amount of workspace required by the * checking routines PCLANGE, PCGETRRV, and * PCLAFCHK * WORKSIZ = MAX( 2, NQ ) * WORKSIZ = MAX( WORKSIZ, MP*DESCA( NB_ )+ $ NQ*DESCA( MB_ ) ) * WORKSIZ = MAX( WORKSIZ, MP * DESCA( NB_ ) ) * WORKSIZ = WORKSIZ + IPOSTPAD * ELSE * WORKSIZ = IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate matrix A of Ax = b * CALL PCMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ MP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, LIPIV, 1, MEM( IPPIV-IPREPAD ), $ LIPIV, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANGE( 'I', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) ANORM1 = PCLANGE( '1', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) END IF * IF( EST .AND. M.EQ.N ) THEN CALL PCMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA0 ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) IF( CHECK ) $ CALL PCFILLPAD( ICTXT, MP, NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform LU factorization * CALL PCGETRF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PCGETRF INFO=', INFO KFAIL = KFAIL + 1 RCOND = ZERO GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in LU factorization * CALL PCCHEKPAD( ICTXT, 'PCGETRF', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRF', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * IF( M.NE.N ) THEN * * For non-square matrices, factorization only * NRHS = 0 NBRHS = 0 * IF( CHECK ) THEN * * Compute FRESID = ||A - P*L*U|| / (||A|| * N * eps) * CALL PCGETRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ) ) CALL PCLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGETRRV', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRRV', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( (FRESID-FRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * MAXMN = MAX( M, N ) MINMN = MIN( M, N ) * * 4 M N^2 - 4/3 N^3 + 2 M N - 3 N^2 flops for LU * factorization M >= N * NOPS = 4.0D+0*DBLE(MAXMN)*(DBLE(MINMN)**2) - $ (4.0D+0 / 3.0D+0)*( DBLE( MINMN )**3 ) + $ (2.0D+0)*DBLE( MAXMN )*DBLE( MINMN ) - $ (3.0D+0)*( DBLE( MINMN )**2 ) * * Calculate total megaflops -- factorization only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * WTIME( 2 ) = 0.0D+0 IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, NB, $ NRHS, NBRHS, NPROW, NPCOL, WTIME( 1 ), $ WTIME( 2 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * CTIME( 2 ) = 0.0D+0 IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, NB, $ NRHS, NBRHS, NPROW, NPCOL, CTIME( 1 ), $ CTIME( 2 ), TMFLOPS, PASSED END IF * ELSE * * If M = N * IF( EST ) THEN * * Calculate workspace required for PCGECON * LWORK = MAX( 1, 2*NP ) + $ MAX( 2, DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), $ NQ + DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LRWORK = MAX( 1, 2*NQ ) LW2 = ICEIL( LRWORK*REALSZ, CPLXSZ ) + IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'cond est', $ ( IPW2+LW2 )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Compute condition number of the matrix * CALL PCGECON( '1', N, MEM( IPA ), 1, 1, DESCA, $ ANORM1, RCOND, MEM( IPW ), LWORK, $ MEM( IPW2 ), LRWORK, INFO ) * IF( CHECK ) THEN CALL PCCHEKPAD( ICTXT, 'PCGECON', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGECON', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGECON', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 10 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB, N, NRHS, NB, NBRHS, 0, 0, $ ICTXT, MAX( 1, NP )+IMIDPAD, $ IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * move IPW to allow room for RHS * MYRHS = NUMROC( DESCB( N_ ), DESCB( NB_ ), $ MYCOL, DESCB( CSRC_ ), NPCOL ) IPB = IPW * IF( EST ) THEN IPB0 = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD IPFERR = IPB0 + DESCB( LLD_ )*MYRHS + $ IPOSTPAD + IPREPAD IPBERR = MYRHS + IPFERR + IPOSTPAD + IPREPAD IPW = MYRHS + IPBERR + IPOSTPAD + IPREPAD ELSE IPW = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD END IF * * Set worksiz: routines requiring most workspace * is PCLASCHK * IF( CHECK ) THEN LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ NQ * NBRHS + NP * NBRHS + $ MAX( MAX( NQ*NB, 2*NBRHS ), $ NBRHS * NUMROC( NUMROC(N,NB,0,0,NPCOL),NB, $ 0,0,LCMQ ) ) ) WORKSIZ = IPOSTPAD + WORKSIZ ELSE WORKSIZ = IPOSTPAD END IF * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate RHS * CALL PCMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) * IF( CHECK ) $ CALL PCFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * IF( EST ) THEN CALL PCMATGEN( ICTXT, 'No', 'No', $ DESCB( M_ ), DESCB( N_ ), $ DESCB( MB_ ), DESCB( NB_ ), $ MEM( IPB0 ), DESCB( LLD_ ), $ DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, $ 0, MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 2 ) * * Solve linear sytem via LU factorization * CALL PCGETRS( 'No', N, NRHS, MEM( IPA ), 1, 1, $ DESCA, MEM( IPPIV ), MEM( IPB ), $ 1, 1, DESCB, INFO ) * CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGETRS', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRS', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PCLASCHK( 'No', 'N', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, IBSEED, $ ANORM, SRESID, MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * The second test is a NaN trap * IF( SRESID.LE.THRESH .AND. $ ( SRESID-SRESID ).EQ.0.0E+0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' END IF * IF( EST ) THEN * * Calculate workspace required for PCGERFS * LWORK = MAX( 1, 2*NP ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LRWORK = MAX( 1, NP ) LW2 = ICEIL( LRWORK*REALSZ, CPLXSZ ) + $ IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'iter ref', ( IPW2+LW2 )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Use iterative refinement to improve the * computed solution * CALL PCGERFS( 'No', N, NRHS, MEM( IPA0 ), 1, $ 1, DESCA, MEM( IPA ), 1, 1, $ DESCA, MEM( IPPIV ), $ MEM( IPB0 ), 1, 1, DESCB, $ MEM( IPB ), 1, 1, DESCB, $ MEM( IPFERR ), MEM( IPBERR ), $ MEM( IPW ), LWORK, MEM( IPW2 ), $ LRWORK, INFO ) * IF( CHECK ) THEN CALL PCCHEKPAD( ICTXT, 'PCGERFS', NP, $ NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGERFS', NP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGERFS', LIPIV, $ 1, MEM( IPPIV-IPREPAD ), $ LIPIV, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGERFS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGERFS', NP, $ MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGERFS', 1, $ MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGERFS', 1, $ MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGERFS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGERFS', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PCLASCHK( 'No', 'N', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, $ IBSEED, ANORM, SRESID2, $ MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID2.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID2 * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * * Gather max. of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 8/3 N^3 - N^2 flops for LU factorization * NOPS = (8.0D+0/3.0D+0)*( DBLE(N)**3 ) - $ DBLE(N)**2 * * nrhs * 8 N^2 flops for LU solve. * NOPS = NOPS + 8.0D+0*(DBLE(N)**2)*DBLE(NRHS) * * Calculate total megaflops -- factorization * and solve -- for WALL and CPU time, and print * output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) $ THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * * Print WALL time if supported * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ PASSED * * Print CPU time if supported * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) $ THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ PASSED END IF 10 CONTINUE 20 CONTINUE * IF( CHECK.AND.( SRESID.GT.THRESH ) ) THEN * * Compute fresid = ||A - P*L*U|| / (||A|| * N * eps) * CALL PCGETRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ) ) CALL PCLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGETRRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRRV', LIPIV, $ 1, MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID END IF END IF 30 CONTINUE 40 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 50 CONTINUE * * Print ending messages and close output file * 60 CONTINUE IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N NB NRHS NBRHS P Q LU Time ', $ 'Sol Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- ----- --- ---- ----- ---- ---- -------- ', $ '-------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I5, 1X, I3, 1X, I5, 1X, I4, 1X, I4, 1X, $ I4, 1X, F8.2, 1X, F8.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - P*L*U|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PCLUDRIVER * END scalapack-2.0.2/TESTING/LIN/pcluinfo.f000644 000766 000024 00000041101 10363532303 017431 0ustar00juliestaff000000 000000 SUBROUTINE PCLUINFO( SUMMRY, NOUT, NMAT, MVAL, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NNR, NRVAL, LDNRVAL, NNBR, $ NBRVAL, LDNBRVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, EST, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL EST CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDNVAL ), NBRVAL( LDNBRVAL ), $ NBVAL( LDNBVAL ), NRVAL( LDNRVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ WORK( * ) * .. * * Purpose * ======= * * PCLUINFO gets needed startup information for LU factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for M and N. * * MVAL (global output) INTEGER array, dimension (LDNVAL) * The values of M (number of rows in matrix) to run the code * with. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * M and N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * EST (global output) LOGICAL * Flag indicating if condition estimation and iterative * refinement routines are to be exercised. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 6, 2*LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL ) * Used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='LU.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Read the flag that indicates whether to test the condition * estimation and iterative refinement routines. * READ( NIN, FMT = * ) EST * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS IF( EST ) THEN WORK( 6 ) = 1 ELSE WORK( 6 ) = 0 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 6, 1, WORK, 6 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Ax=b by LU factorization.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision LU factorization '// $ 'and solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| / '// $ '(||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'LU time : Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 6, 1, WORK, 6, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) IF( WORK( 6 ).EQ.1 ) THEN EST = .TRUE. ELSE EST = .FALSE. END IF * I = 2*NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCLUINFO * END scalapack-2.0.2/TESTING/LIN/pcmatgen.f000644 000766 000024 00000046323 10363532303 017423 0ustar00juliestaff000000 000000 SUBROUTINE PCMATGEN( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PCMATGEN : Parallel Complex Single precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * otherwise a random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) COMPLEX, pointer into the local memory to * an array of dimension ( LDA, * ) containing the local * pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB REAL DUMMY * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MOD, REAL * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSRAND EXTERNAL ICEIL, NUMROC, LSAME, PSRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) * INFO = 0 IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCMATGEN', INFO ) RETURN END IF * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = 2*NPMB JUMP3 = 2*M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = 2*MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = CMPLX( PSRAND(0), PSRAND(0) ) 30 CONTINUE IF( SYMM ) THEN A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) ELSE A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), ZERO ) DUMMY = PSRAND(0) END IF DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) IF( HERM ) THEN A(IK,JK+J) = CONJG( A(IK+J,JK) ) ELSE A(IK,JK+J) = A(IK+J,JK) END IF 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = 2*NQNB JUMP3 = 2*N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = 2*NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 IF( SYMM ) THEN A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) ELSE A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ TWO*PSRAND(0) - ONE ) END IF JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = 2*NQNB JUMP3 = 2*N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = 2*NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 IF( TRAN ) THEN A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) ELSE A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ TWO*PSRAND(0) - ONE ) END IF JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSE * JUMP1 = 1 JUMP2 = 2*NPMB JUMP3 = 2*M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = 2*MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 IF( HERM ) THEN A(IK,JK+J) = CMPLX( $ ABS(REAL(A(IK,JK+J)))+2*MAXMN, ZERO ) ELSE A(IK,JK+J) = CMPLX( ABS(REAL(A(IK,JK+J)))+MAXMN, $ ABS(AIMAG(A(IK,JK+J)))+ MAXMN ) END IF IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PCMATGEN * END scalapack-2.0.2/TESTING/LIN/pcpbdriver.f000644 000766 000024 00000074174 10363532303 017772 0ustar00juliestaff000000 000000 PROGRAM PCPBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PCPBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by CPB. This test driver performs an * A = L*L**H factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM DOUBLE PRECISION array, dimension ( TOTMEM/CPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER CPLXSZ, MEMSIZ, NTESTS COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BW, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, I, $ IAM, IASEED, IBSEED, ICTXT, ICTXTB, IERR_TEMP, $ IMIDPAD, INFO, IPA, IPB, IPOSTPAD, IPREPAD, $ IPW, IPW_SIZE, IPW_SOLVE, IPW_SOLVE_SIZE, $ IP_DRIVER_W, IP_FILLIN, J, K, KFAIL, KPASS, $ KSKIP, KTESTS, MYCOL, MYRHS_SIZE, MYROW, N, NB, $ NBW, NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, $ NPCOL, NPROCS, NPROCS_REAL, NPROW, NQ, NRHS, $ N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWVAL( NTESTS ), DESCA( 7 ), DESCA2D( DLEN_ ), $ DESCB( 7 ), DESCB2D( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCBMATGEN, $ PCCHEKPAD, PCFILLPAD, PCMATGEN, PCPBINFO, $ PCPBLASCHK, PCPBTRF, PCPBTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PCLANGE EXTERNAL LSAME, NUMROC, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PCPBINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NBW, $ BWVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, $ NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BW = BWVAL( BW_NUM ) IF( BW.LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Band', 'bw', BW IERR( 1 ) = 1 END IF * IF( BW.GT.N-1 ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*BW-1)/NPCOL + 1 ) $ + BW NB = MAX( NB, 2*BW ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*BW, N ) ) THEN IERR( 1 ) = 1 ENDIF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (BW+1), (BW+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((BW+1)+10) IMIDPAD = 10 IPOSTPAD = ((BW+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (BW+1), N, $ (BW+1), NB, 0, 0, $ ICTXT,((BW+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((BW+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (NB+2*BW)*BW * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = BW*BW * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((BW+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PCLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PCPBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(BW*(BW+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PCBMATGEN( ICTXT, UPLO, 'B', BW, BW, N, (BW+1), NB, $ MEM( IPA ), ((BW+1)+10), 0, 0, IASEED, $ MYROW, MYCOL, NPROW, NPCOL ) * CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((BW+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PCLANGE( '1', (BW+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((BW+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PCPBTRF( UPLO, N, BW, MEM( IPA ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PCPBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PCCHEKPAD( ICTXT, 'PCPBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((BW+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (BW*NRHS) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PCMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PCFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PCPBTRS( UPLO, N, BW, NRHS, MEM( IPA ), 1, $ DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PCPBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCPBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PCPBLASCHK( 'H', UPLO, N, BW, BW, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * NOPS = NOPS + DBLE(BW)*( -2.D0 / 3.D0+DBLE(BW)* $ ( -1.D0+DBLE(BW)*( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( 1.D0+DBLE(BW)*( 3.D0 / $ 2.D0+DBLE(BW)*( 1.D0 / 2.D0 ) ) ) NOPS = NOPS + DBLE(BW)*( -1.D0 / 6.D0+DBLE(BW) $ *( -1.D0 /2.D0+DBLE(BW) $ *( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( DBLE(BW) / $ 2.D0*( 1.D0+DBLE(BW) ) ) * NOPS = NOPS + $ DBLE(NRHS)*( ( 2*DBLE(N)-DBLE(BW) )* $ ( DBLE(BW)+1.D0 ) )+ DBLE(NRHS)* $ ( DBLE(BW)*( 2*DBLE(N)- $ ( DBLE(BW)+1.D0 ) ) ) * * * Second calc to represent actual hardware speed * * NB bw^2 flops for LLt factorization in 1st proc * NOPS2 = ( (DBLE(N_FIRST))* DBLE(BW)**2 ) * IF ( NPROCS_REAL .GT. 1) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in last processor * NOPS2 = NOPS2 + $ 4*( (DBLE(N_LAST)*DBLE(BW)**2) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 4*( (DBLE(NB)*DBLE(BW)**2) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ ( NPROCS_REAL-1 ) * ( BW*BW*BW/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ ( NPROCS_REAL-2 ) * ( 2 * BW*BW*BW ) ENDIF * * * nrhs * 4 n_first*bw flops for LLt solve in proc 1. * NOPS2 = NOPS2 + $ ( 4.0D+0*(DBLE(N_FIRST)*DBLE(BW))*DBLE(NRHS) ) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*4 n_last*bw flops for LLt solve in last. * NOPS2 = NOPS2 + $ 2*( 4.0D+0*(DBLE(N_LAST)*DBLE(BW))*DBLE(NRHS) ) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2 * nrhs * 4 NB*bw flops for LLt solve in others. * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( 4.0D+0*(DBLE(NB)*DBLE(BW))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1 ) * ( BW*BW ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * ( 3 * BW*BW ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UL N BW NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- ---- ----- -- ---- -------- ', $ '-------- ------ ------ ------' ) 9993 FORMAT( A4, 2X, A1, 1X, I6, 1X, I3, 1X, I4, 1X, $ I5, 1X, I2, 1X, $ I4, 1X, F8.3, F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PCPBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pcpbinfo.f000644 000766 000024 00000044727 10363532303 017433 0ustar00juliestaff000000 000000 SUBROUTINE PCPBINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NBW, $ BWVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PCPBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of subdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( UPLO, 'L' ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data represents ''Upper'// $ ''' or ''Lower'' triangular portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'TRANS : Whether solve is to be done with'// $ ' ''Transpose'' of matrix A (T,C) or not (N).' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bw : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bw ', ( BWVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCPBINFO * END scalapack-2.0.2/TESTING/LIN/pcpblaschk.f000644 000766 000024 00000025513 10363532303 017735 0ustar00juliestaff000000 000000 SUBROUTINE PCPBLASCHK( SYMM, UPLO, N, BWL, BWU, NRHS, X, IX, JX, $ DESCX, IASEED, A, IA, JA, DESCA, IBSEED, $ ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PCPBLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'H', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,max(bw*(bw+2),NB))+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,max(bw*(bw+2),NB))+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER BW, INFO, IPPRODUCT, WORK_MIN REAL DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGAMX2D, CGEMM, CGSUM2D, $ CLASET, PBCTRAN, PCMATGEN, SGEBR2D, $ SGEBS2D, SGERV2D, SGESD2D * .. * .. External Functions .. INTEGER ICAMAX, NUMROC REAL PSLAMCH EXTERNAL ICAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'H' ) ) THEN BW = BWL WORK_MIN = MAX(5,MAX(BW*(BW+2),NB))+2*NB ELSE BW = MAX(BWL, BWU) WORK_MIN = MAX(5,MAX(BW*(BW+2),NB))+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PCBLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (BW+1), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'H' )) THEN CALL PCBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PCBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PCPBDCMV( BW+1, BW, UPLO, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), (BW+2)*BW, INFO ) * * * Regenerate column of B * CALL PCMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PCAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PSCNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PSCNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PCBLASCHK * END scalapack-2.0.2/TESTING/LIN/pcpbmv1.f000644 000766 000024 00000073154 10607174520 017203 0ustar00juliestaff000000 000000 SUBROUTINE PCPBDCMV( LDBW, BW, UPLO, N, A, JA, DESCA, NRHS, B, IB, $ DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCPBTRF and this is stored in AF. If a linear system * is to be solved using PCPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, $ FIRST_PROC, I, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, J, JA_NEW, LLDA, LLDB, MYCOL, $ MYROW, MY_NUM_COLS, NB, NP, NPCOL, NPROW, $ NP_SAVE, ODD_SIZE, OFST, PART_OFFSET, $ PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Pre-calculate bw^2 * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -14 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 7*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 7*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PCPBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, $ 'PCPBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1005 PARAM_CHECK( 15, 2 ) = 1004 PARAM_CHECK( 14, 2 ) = 1003 PARAM_CHECK( 13, 2 ) = 1002 PARAM_CHECK( 12, 2 ) = 1001 PARAM_CHECK( 11, 2 ) = 9 PARAM_CHECK( 10, 2 ) = 705 PARAM_CHECK( 9, 2 ) = 704 PARAM_CHECK( 8, 2 ) = 703 PARAM_CHECK( 7, 2 ) = 701 PARAM_CHECK( 6, 2 ) = 6 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCPBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = CZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (BW+2)*BW WORK( I ) = CZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( UPLO, 'L' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL CLATCPY( 'U', BW, BW, $ A( LLDA*( NUMROC_SIZE-BW )+1+BW ), $ LLDA-1, WORK( 1 ), BW ) * * Send the triangle to neighboring processor to right * CALL CTRSD2D(ICTXT, 'L', 'N', $ BW, BW, $ WORK( 1 ), $ BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL CHBMV( 'L', NUMROC_SIZE, BW, CONE, A( OFST+1 ), LLDA, $ B(PART_OFFSET+1), 1, CZERO, X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL CCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( BW*BW+1+BW-DL_N_N ), 1 ) * CALL CTRMV( 'U', 'N', 'N', BW, $ A( LLDA*( NUMROC_SIZE-BW )+1+BW ), LLDA-1, $ WORK( BW*BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( BW*BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL CGESD2D( ICTXT, BW, 1, $ WORK( BW*BW+1 ), BW, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, BW*( BW+2 ) WORK( I ) = CZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL CCOPY( DL_P_M, B( 1 ), 1, $ WORK( BW*BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL CTRRV2D(ICTXT, 'L', 'N', $ BW, BW, $ WORK( 1 ), BW, MYROW, MYCOL-1 ) * CALL CTRMV( $ 'L', $ 'N', $ 'N', BW, $ WORK( 1 ), BW, $ WORK( BW*BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DL_P_M .GT. DL_P_N ) THEN DO 30 I=1, DL_P_M-DL_P_N WORK( BW*BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL CGESD2D( ICTXT, BW, 1, WORK(BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL CGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL CAXPY( BW, CONE, $ WORK( BW*BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL CGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL CAXPY( BW, CONE, $ WORK( BW*BW+1 ), 1, $ X( NUMROC_SIZE-BW+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( UPLO, 'U' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL CLATCPY( 'L', BW, BW, A( OFST+1 ), $ LLDA-1, WORK( 1 ), BW ) * * Send the triangle to neighboring processor to left * CALL CTRSD2D(ICTXT, 'U', 'N', $ BW, BW, $ WORK( 1 ), $ BW, MYROW, MYCOL-1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL CHBMV( 'U', NUMROC_SIZE, BW, CONE, A( OFST+1 ), LLDA, $ B(PART_OFFSET+1), 1, CZERO, X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL CCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( BW*BW+1+BW-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL CTRRV2D(ICTXT, 'U', 'N', $ BW, BW, $ WORK( 1 ), BW, MYROW, MYCOL+1 ) * CALL CTRMV( 'U', 'N', 'N', BW, $ WORK( 1 ), BW, $ WORK( BW*BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( BW*BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL CGESD2D( ICTXT, BW, 1, $ WORK( BW*BW+1 ), BW, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, BW*( BW+2 ) WORK( I ) = CZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL CCOPY( DL_P_M, B( 1 ), 1, $ WORK( BW*BW+1 ), 1) * CALL CTRMV( $ 'L', $ 'N', $ 'N', BW, $ A( 1 ), LLDA-1, $ WORK( BW*BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DL_P_M .GT. DL_P_N ) THEN DO 60 I=1, DL_P_M-DL_P_N WORK( BW*BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL CGESD2D( ICTXT, BW, 1, WORK(BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL CGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL CAXPY( BW, CONE, $ WORK( BW*BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL CGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL CAXPY( BW, CONE, $ WORK( BW*BW+1 ), 1, $ X( NUMROC_SIZE-BW+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PCBhBMV1 * END scalapack-2.0.2/TESTING/LIN/pcpotrrv.f000644 000766 000024 00000027716 10363532303 017511 0ustar00juliestaff000000 000000 SUBROUTINE PCPOTRRV( UPLO, N, A, IA, JA, DESCA, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCPOTRRV recomputes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from L or U * computed by PCPOTRF. The routine performs the Cholesky factorization * in reverse. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * hermitian distributed matrix sub( A ) is stored: * stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the factors L or U of the * distributed matrix sub( A ) from the Cholesky factorization. * On exit, the original distributed matrix sub( A ) is * restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK >= MB_A*NB_A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) COMPLEX CONE, ZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IL, J, JB, JL, JN, MYCOL, $ MYROW, NPCOL, NPROW * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PCLACPY, PCLASET, $ PCHERK, PCTRMM, PB_TOPGET, PB_TOPSET * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL ICEIL, INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * UPPER = LSAME( UPLO, 'U' ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JL = MAX( ( ( JA+N-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) IL = MAX( ( ( IA+N-2 ) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) IAROW = INDXG2P( IL, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) * * Define array descriptor for working array WORK * CALL DESCSET( DESCW, DESCA( MB_ ), DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * IF ( UPPER ) THEN * * Compute A from the Cholesky factor U : A = U'*U. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'S-ring' ) * DO 10 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + U'*U * CALL PCHERK( 'Upper', 'Conjugate Transpose', JA+N-J-JB, JB, $ ONE, A, IL, J+JB, DESCA, ONE, A, IL+JB, J+JB, $ DESCA ) * * Copy current diagonal block of A into workspace * CALL PCLACPY( 'All', JB, JB, A, IL, J, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict lower triangular part of diagonal block, to make * it U1. * CALL PCLASET( 'Lower', JB-1, JB, ZERO, ZERO, A, IL+1, J, $ DESCA ) * * Update the row panel U with the triangular matrix * CALL PCTRMM( 'Left', 'Upper', 'Conjugate Transpose', $ 'Non-Unit', JB, N-J+JA, CONE, WORK, 1, 1, $ DESCW, A, IL, J, DESCA ) * * Restore the strict lower triangular part of diagonal block. * CALL PCLACPY( 'Lower', JB-1, JB, WORK, 2, 1, DESCW, A, $ IL+1, J, DESCA ) * IL = IL - DESCA( MB_ ) DESCW( RSRC_ ) = MOD( DESCW( RSRC_ ) + NPROW - 1, NPROW ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + U'*U * CALL PCHERK( 'Upper', 'Conjugate Transpose', N-JB, JB, ONE, A, $ IA, JA+JB, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PCLACPY( 'All', JB, JB, A, IA, JA, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict lower triangular part of diagonal block, to make * it U1. * CALL PCLASET( 'Lower', JB-1, JB, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Update the row panel U with the triangular matrix * CALL PCTRMM( 'Left', 'Upper', 'Conjugate Transpose', 'Non-Unit', $ JB, N, CONE, WORK, 1, 1, DESCW, A, IA, JA, DESCA ) * * Restore the strict lower triangular part of diagonal block. * CALL PCLACPY( 'Lower', JB-1, JB, WORK, 2, 1, DESCW, A, IA+1, $ JA, DESCA ) * ELSE * * Compute A from the Cholesky factor L : A = L*L'. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * DO 20 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + L*L' * CALL PCHERK( 'Lower', 'No Transpose', IA+N-IL-JB, JB, ONE, A, $ IL+JB, J, DESCA, ONE, A, IL+JB, J+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PCLACPY( 'All', JB, JB, A, IL, J, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict upper triangular part of diagonal block, to make * it L1. * CALL PCLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IL, J+1, $ DESCA ) * * Update the column panel L with the triangular matrix * CALL PCTRMM( 'Right', 'Lower', 'Conjugate transpose', $ 'Non-Unit', IA+N-IL, JB, CONE, WORK, 1, 1, $ DESCW, A, IL, J, DESCA ) * * Restore the strict upper triangular part of diagonal block. * CALL PCLACPY( 'Upper', JB, JB-1, WORK, 1, 2, DESCW, A, $ IL, J+1, DESCA ) * IL = IL - DESCA( MB_ ) DESCW( RSRC_ ) = MOD( DESCW( RSRC_ ) + NPROW - 1, NPROW ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 20 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + L*L' * CALL PCHERK( 'Lower', 'No Transpose', N-JB, JB, ONE, A, $ IA+JB, JA, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PCLACPY( 'All', JB, JB, A, IA, JA, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict upper triangular part of diagonal block, to make * it L1. * CALL PCLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA, JA+1, $ DESCA ) * * Update the column panel L with the triangular matrix * CALL PCTRMM( 'Right', 'Lower', 'Conjugate transpose', $ 'Non-Unit', N, JB, CONE, WORK, 1, 1, DESCW, A, $ IA, JA, DESCA ) * * Restore the strict upper triangular part of diagonal block. * CALL PCLACPY( 'Upper', JB, JB-1, WORK, 1, 2, DESCW, A, IA, $ JA+1, DESCA ) * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PCPOTRRV * END scalapack-2.0.2/TESTING/LIN/pcptdriver.f000644 000766 000024 00000076325 10363532303 020014 0ustar00juliestaff000000 000000 PROGRAM PCPTDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PCPTDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by CPT. This test driver performs an * A = L*L**H factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM DOUBLE PRECISION array, dimension ( TOTMEM/CPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER CPLXSZ, MEMSIZ, NTESTS COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BW, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, I, $ IAM, IASEED, IBSEED, ICTXT, ICTXTB, IERR_TEMP, $ IMIDPAD, INFO, INT_TEMP, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWVAL( NTESTS ), DESCA( 7 ), DESCA2D( DLEN_ ), $ DESCB( 7 ), DESCB2D( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCBMATGEN, $ PCCHEKPAD, PCFILLPAD, PCMATGEN, PCPTINFO, $ PCPTLASCHK, PCPTTRF, PCPTTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PCLANGE EXTERNAL LSAME, NUMROC, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PCPTINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NBW, $ BWVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, $ NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BW = 1 IF( BW.LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Band', 'bw', BW IERR( 1 ) = 1 END IF * IF( BW.GT.N-1 ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*INT_ONE-1)/NPCOL + 1 ) $ + INT_ONE NB = MAX( NB, 2*INT_ONE ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*INT_ONE, N ) ) THEN IERR( 1 ) = 1 ENDIF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (2), (2), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((2)+10) IMIDPAD = 10 IPOSTPAD = ((2)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, N, (2), $ NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((2)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + (NB+10)*(2) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (12*NPCOL + 3*NB) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 8*NPCOL * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((2)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PCLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PCPTLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,NB)+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PCBMATGEN( ICTXT, UPLO, 'T', BW, BW, N, (2), NB, $ MEM( IPA ), NB+10, 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) CALL PCFILLPAD( ICTXT, NQ, NP, MEM( IPA-IPREPAD ), $ NB+10, IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PCLANGE( 'I', N, $ (2), MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', NQ, NP, $ MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( LSAME( UPLO, 'L' ) ) THEN INT_TEMP = 0 ELSE INT_TEMP = DESCA2D( LLD_ ) ENDIF * * For SPD Tridiagonal complex matrices, diagonal is stored * as a real. Thus, compact D into half the space * DO 10 H=1, NUMROC(N,NB,MYCOL,0,NPCOL)/2 MEM( IPA+INT_TEMP+H-1 ) = MEM( IPA+INT_TEMP+2*H-2 ) $ +MEM( IPA+INT_TEMP+2*H-1 )*( 0.0E+0, 1.0E+0 ) 10 CONTINUE IF( 2*(NUMROC(N,NB,MYCOL,0,NPCOL)/2).NE. $ NUMROC(N,NB,MYCOL,0,NPCOL) ) THEN H=NUMROC(N,NB,MYCOL,0,NPCOL)/2+1 MEM( IPA+INT_TEMP+H-1 ) = MEM( IPA+INT_TEMP+2*H-2 ) ENDIF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PCPTTRF( N, MEM( IPA+INT_TEMP ), $ MEM( IPA+1*( NB+10-INT_TEMP ) ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PCPTTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PCCHEKPAD( ICTXT, 'PCPTTRF', NQ, $ NP, MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (10+2*MIN(100,NRHS))*NPCOL+4*NRHS * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PCMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PCFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PCPTTRS( UPLO, N, NRHS, MEM( IPA+INT_TEMP ), $ MEM( IPA+1*( NB+10-INT_TEMP ) ), 1, $ DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PCPTTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCPTTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * * Reset descriptor describing A to 1-by-P grid for * use in banded utility routines * CALL DESCINIT( DESCA2D, (2), N, $ (2), NB, 0, 0, $ ICTXT, (2), IERR( 1 ) ) CALL PCPTLASCHK( 'H', UPLO, N, BW, BW, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * NOPS = NOPS + DBLE(BW)*( -2.D0 / 3.D0+DBLE(BW)* $ ( -1.D0+DBLE(BW)*( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( 1.D0+DBLE(BW)*( 3.D0 / $ 2.D0+DBLE(BW)*( 1.D0 / 2.D0 ) ) ) NOPS = NOPS + DBLE(BW)*( -1.D0 / 6.D0+DBLE(BW) $ *( -1.D0 /2.D0+DBLE(BW) $ *( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( DBLE(BW) / $ 2.D0*( 1.D0+DBLE(BW) ) ) * NOPS = NOPS + $ DBLE(NRHS)*( ( 2*DBLE(N)-DBLE(BW) )* $ ( DBLE(BW)+1.D0 ) )+ DBLE(NRHS)* $ ( DBLE(BW)*( 2*DBLE(N)- $ ( DBLE(BW)+1.D0 ) ) ) * * * Second calc to represent actual hardware speed * * NB bw^2 flops for LLt factorization in 1st proc * NOPS2 = ( (DBLE(N_FIRST))* DBLE(BW)**2 ) * IF ( NPROCS_REAL .GT. 1) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in last processor * NOPS2 = NOPS2 + $ 4*( (DBLE(N_LAST)*DBLE(BW)**2) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 4*( (DBLE(NB)*DBLE(BW)**2) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ ( NPROCS_REAL-1 ) * ( BW*BW*BW/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ ( NPROCS_REAL-2 ) * ( 2 * BW*BW*BW ) ENDIF * * * nrhs * 4 n_first*bw flops for LLt solve in proc 1. * NOPS2 = NOPS2 + $ ( 4.0D+0*(DBLE(N_FIRST)*DBLE(BW))*DBLE(NRHS) ) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*4 n_last*bw flops for LLt solve in last. * NOPS2 = NOPS2 + $ 2*( 4.0D+0*(DBLE(N_LAST)*DBLE(BW))*DBLE(NRHS) ) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2 * nrhs * 4 NB*bw flops for LLt solve in others. * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( 4.0D+0*(DBLE(NB)*DBLE(BW))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1 ) * ( BW*BW ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * ( 3 * BW*BW ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UL N BW NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- ---- ----- -- ---- -------- ', $ '-------- ------ ------ ------' ) 9993 FORMAT( A4, 2X, A1, 1X, I6, 1X, I3, 1X, I4, 1X, $ I5, 1X, I2, 1X, $ I4, 1X, F8.3, F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PCPTTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pcptinfo.f000644 000766 000024 00000044750 10363532303 017451 0ustar00juliestaff000000 000000 SUBROUTINE PCPTINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NBW, $ BWVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PCPTINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of subdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW NBW = 1 IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( UPLO, 'L' ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data represents ''Upper'// $ ''' or ''Lower'' triangular portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'TRANS : Whether solve is to be done with'// $ ' ''Transpose'' of matrix A (T,C) or not (N).' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bw : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bw ', ( BWVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCPTINFO * END scalapack-2.0.2/TESTING/LIN/pcptlaschk.f000644 000766 000024 00000027451 10363532303 017762 0ustar00juliestaff000000 000000 SUBROUTINE PCPTLASCHK( SYMM, UPLO, N, BWL, BWU, NRHS, X, IX, JX, $ DESCX, IASEED, A, IA, JA, DESCA, IBSEED, $ ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PCPTLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'H', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,NB)+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,NB)+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER I, START INTEGER BW, INFO, IPPRODUCT, WORK_MIN REAL DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGAMX2D, CGEMM, CGSUM2D, $ CLASET, PBCTRAN, PCMATGEN, SGEBR2D, $ SGEBS2D, SGERV2D, SGESD2D * .. * .. External Functions .. INTEGER ICAMAX, NUMROC REAL PSLAMCH EXTERNAL ICAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'H' ) ) THEN BW = BWL START = 1 WORK_MIN = MAX(5,NB)+2*NB ELSE BW = MAX(BWL, BWU) IF( LSAME( UPLO, 'D' )) THEN START = 1 ELSE START = 2 ENDIF WORK_MIN = MAX(5,NB)+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PCTLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (2), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'H' )) THEN CALL PCBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PCBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF IF( LSAME( UPLO, 'U' ) ) THEN * * * Matrix formed above has the diagonals shifted from what was * input to the tridiagonal routine. Shift them back. * * Send elements to neighboring processors * IF( MYCOL.LT.NPCOL-1 ) THEN CALL CGESD2D( ICTXT, 1, 1, $ A( START+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * * Shift local elements * DO 230 I=DESCA( NB_ )-1,0,-1 A( START+(I+1)*LDA ) = A( START+(I)*LDA ) 230 CONTINUE * * Receive elements from neighboring processors * IF( MYCOL.GT.0 ) THEN CALL CGERV2D( ICTXT, 1, 1, A( START), LDA, $ MYROW, MYCOL-1 ) ENDIF * ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PCPBDCMV( BW+1, BW, UPLO, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), (BW+2)*BW, INFO ) * * * Regenerate column of B * CALL PCMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PCAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PSCNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PSCNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PCTLASCHK * END scalapack-2.0.2/TESTING/LIN/pcqrdriver.f000644 000766 000024 00000123775 10363532303 020015 0ustar00juliestaff000000 000000 PROGRAM PCQRDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * Purpose * ======= * * PCQRDRIVER is the main test program for the COMPLEX * SCALAPACK QR factorization routines. This test driver performs a QR * QL, LQ, RQ, QP (QR factorization with column pivoting) or TZ * (complete unitary factorization) factorization and checks the * results. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 16 lines: * 'ScaLAPACK QR factorizations input file' * 'PVM machine' * 'QR.out' output file name (if any) * 6 device out * 6 number of factorizations * 'QR' 'QL' 'LQ' 'RQ' 'QP' 'TZ' factorization: QR, QL, LQ, RQ, QP, TZ * 4 number of problems sizes * 55 17 31 201 values of M * 5 71 31 201 values of N * 3 number of MB's and NB's * 4 3 5 values of MB * 4 7 3 values of NB * 7 number of process grids (ordered P & Q) * 1 2 1 4 2 3 8 values of P * 7 2 4 1 3 2 1 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ, REALSZ and CPLXSZ indicate the length in bytes on * the given platform for an integer, a single precision real * and a single precision complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, INTGSZ, MEMSIZ, NTESTS, REALSZ, TOTMEM COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, INTGSZ = 4, REALSZ = 4, $ TOTMEM = 2000000, MEMSIZ = TOTMEM / CPLXSZ, $ NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ) ) * .. * .. Local Scalars .. CHARACTER*2 FACT CHARACTER*6 PASSED CHARACTER*7 ROUT CHARACTER*8 ROUTCHK CHARACTER*80 OUTFILE LOGICAL CHECK INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPPIV, IPREPAD, IPTAU, IPRW, IPW, J, $ K, KFAIL, KPASS, KSKIP, KTESTS, L, LIPIV, $ LRWORK, LTAU, LWORK, M, MAXMN, MB, MINMN, MNP, $ MNQ, MP, MYCOL, MYROW, N, NB, NFACT, NGRIDS, $ NMAT, NNB, NOUT, NPCOL, NPROCS, NPROW, NQ, $ WORKFCT, WORKRFCT, WORKSIZ REAL ANORM, FRESID, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Arrays .. CHARACTER*2 FACTOR( NTESTS ) INTEGER DESCA( DLEN_ ), IERR( 1 ), MBVAL( NTESTS ), $ MVAL( NTESTS ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCCHEKPAD, $ PCFILLPAD, PCGELQF, PCGELQRV, $ PCGEQLF, PCGEQLRV, PCGEQPF, $ PCQPPIV, PCGEQRF, PCGEQRRV, $ PCGERQF, PCGERQRV, PCTZRZRV, $ PCMATGEN, PCLAFCHK, PCQRINFO, $ PCTZRZF, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, NUMROC REAL PCLANGE EXTERNAL ICEIL, LSAMEN, NUMROC, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP /4*0/ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PCQRINFO( OUTFILE, NOUT, NFACT, FACTOR, NTESTS, NMAT, MVAL, $ NTESTS, NVAL, NTESTS, NNB, MBVAL, NTESTS, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Loop over the different factorization types * DO 40 I = 1, NFACT * FACT = FACTOR( I ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) IF( LSAMEN( 2, FACT, 'QR' ) ) THEN ROUT = 'PCGEQRF' ROUTCHK = 'PCGEQRRV' WRITE( NOUT, FMT = 9986 ) $ 'QR factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN ROUT = 'PCGEQLF' ROUTCHK = 'PCGEQLRV' WRITE( NOUT, FMT = 9986 ) $ 'QL factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN ROUT = 'PCGELQF' ROUTCHK = 'PCGELQRV' WRITE( NOUT, FMT = 9986 ) $ 'LQ factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN ROUT = 'PCGERQF' ROUTCHK = 'PCGERQRV' WRITE( NOUT, FMT = 9986 ) $ 'RQ factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN ROUT = 'PCGEQPF' ROUTCHK = 'PCGEQRRV' WRITE( NOUT, FMT = 9986 ) $ 'QR factorization with column pivoting tests.' ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN ROUT = 'PCTZRZF' ROUTCHK = 'PCTZRZRV' WRITE( NOUT, FMT = 9986 ) $ 'Complete unitary factorization tests.' END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 J = 1, NGRIDS * NPROW = PVAL( J ) NPCOL = QVAL( J ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 K = 1, NMAT * M = MVAL( K ) N = NVAL( K ) * * Make sure matrix information is correct * IERR(1) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 L = 1, NNB * MB = MBVAL( L ) NB = NBVAL( L ) * * Make sure mb is legal * IERR( 1 ) = 0 IF( MB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MB', 'MB', MB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * MP = NUMROC( M, MB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) MNP = NUMROC( MIN( M, N ), MB, MYROW, 0, NPROW ) MNQ = NUMROC( MIN( M, N ), NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( MB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, MB, NB, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for ScaLAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IPTAU = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + IPREPAD * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN * LTAU = MNQ IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QR * factorization * LWORK = DESCA( NB_ ) * ( MP + NQ + DESCA( NB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PCLAFCHK, PCGEQRRV and * PCLANGE * WORKSIZ = LWORK + MP*DESCA( NB_ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN * LTAU = NQ IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QL * factorization * LWORK = DESCA( NB_ ) * ( MP + NQ + DESCA( NB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PCLAFCHK, PCGEQLRV and * PCLANGE * WORKSIZ = LWORK + MP*DESCA( NB_ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN * LTAU = MNP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the LQ * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PCLAFCHK, PCGELQRV and * PCLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN * LTAU = MP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QR * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PCLAFCHK, PCGERQRV and * PCLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * LTAU = MNQ IPPIV = IPTAU + LTAU + IPOSTPAD + IPREPAD LIPIV = ICEIL( INTGSZ*NQ, CPLXSZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the * factorization i.e from IPW on. * LWORK = MAX( 3, MP + MAX( 1, NQ ) ) WORKFCT = LWORK + IPOSTPAD LRWORK = MAX( 1, 2 * NQ ) WORKRFCT = ICEIL( LRWORK*REALSZ, CPLXSZ ) + $ IPOSTPAD IPRW = IPW + WORKFCT + IPREPAD WORKSIZ = WORKFCT + IPREPAD + WORKRFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PCLAFCHK, PCGEQRRV, * PCLANGE. * WORKSIZ = MAX( WORKSIZ - IPOSTPAD, $ DESCA( NB_ )*( 2*MP + NQ + DESCA( NB_ ) ) ) + $ IPOSTPAD END IF * ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN * LTAU = MP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the TZ * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PCLAFCHK, PCTZRZRV and * PCLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ FACT // ' factorization', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PCMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need the Infinity of A for checking * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PCFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PCFILLPAD( ICTXT, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANGE( 'I', M, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PCFILLPAD( ICTXT, WORKRFCT-IPOSTPAD, 1, $ MEM( IPRW-IPREPAD ), $ WORKRFCT-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PCFILLPAD( ICTXT, WORKFCT-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKFCT-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform QR factorizations * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN CALL SLTIMER( 1 ) CALL PCGEQRF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN CALL SLTIMER( 1 ) CALL PCGEQLF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN CALL SLTIMER( 1 ) CALL PCGELQF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN CALL SLTIMER( 1 ) CALL PCGERQF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL SLTIMER( 1 ) CALL PCGEQPF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPTAU ), $ MEM( IPW ), LWORK, MEM( IPRW ), $ LRWORK, INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN CALL SLTIMER( 1 ) IF( N.GE.M ) $ CALL PCTZRZF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PCCHEKPAD( ICTXT, ROUT, MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, ROUT, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PCCHEKPAD( ICTXT, ROUT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, ROUT, WORKRFCT-IPOSTPAD, $ 1, MEM( IPRW-IPREPAD ), $ WORKRFCT-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PCCHEKPAD( ICTXT, ROUT, WORKFCT-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKFCT-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN * * Compute residual = ||A-Q*R|| / (||A||*N*eps) * CALL PCGEQRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PCLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN * * Compute residual = ||A-Q*L|| / (||A||*N*eps) * CALL PCGEQLRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PCLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN * * Compute residual = ||A-L*Q|| / (||A||*N*eps) * CALL PCGELQRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PCLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN * * Compute residual = ||A-R*Q|| / (||A||*N*eps) * CALL PCGERQRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PCLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * * Compute residual = ||AP-Q*R|| / (||A||*N*eps) * CALL PCGEQRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN * * Compute residual = ||A-T*Z|| / (||A||*N*eps) * IF( N.GE.M ) THEN CALL PCTZRZRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) END IF CALL PCLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) END IF * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, ROUTCHK, MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, ROUTCHK, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, ROUTCHK, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * CALL PCQPPIV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCQPPIV', MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCQPPIV', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PCLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCLAFCHK', MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLAFCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Test residual and detect NaN result * IF( LSAMEN( 2, FACT, 'TZ' ) .AND. N.LT.M ) THEN KSKIP = KSKIP + 1 PASSED = 'BYPASS' ELSE IF( FRESID.LE.THRESH .AND. $ (FRESID-FRESID).EQ.0.0E+0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * MINMN = MIN( M, N ) MAXMN = MAX( M, N ) * IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN IF( M.GE.N ) THEN NOPS = 0.0D+0 ELSE * * 9 ( M^2 N - M^3 ) + 13 M N - M^2 for * complete unitary factorization (M <= N). * NOPS = 9.0D+0 * ( $ DBLE( N )*( DBLE( M )**2 ) - $ DBLE( M )**3 ) + $ 13.0D+0*DBLE( N )*DBLE( M ) - $ DBLE( M )**2 END IF * ELSE * * 8 M N^2 - 8/3 N^2 + 6 M N + 8 N^2 for QR type * factorization when M >= N. * NOPS = 8.0D+0 * ( DBLE( MINMN )**2 ) * $ ( DBLE( MAXMN )-DBLE( MINMN ) / 3.0D+0 ) + $ ( 6.0D+0 * DBLE( MAXMN ) + $ 8.0D+0 * DBLE( MINMN ) ) * $ DBLE( MINMN ) END IF * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, MB, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, $ PASSED, FRESID * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, MB, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, $ PASSED, FRESID * END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * 40 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N MB NB P Q Fact Time ', $ ' MFLOPS CHECK Residual' ) 9994 FORMAT( '---- ------ ------ --- --- ----- ----- --------- ', $ '----------- ------ --------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I3, 1X, I3, 1X, I5, 1X, I5, 1X, $ F9.2, 1X, F11.2, 1X, A6, 2X, G8.1 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( A ) * STOP * * End of PCQRDRIVER * END * SUBROUTINE PCQPPIV( M, N, A, IA, JA, DESCA, IPIV ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCQPPIV applies to sub( A ) = A(IA:IA+M-1,JA:JA+N-1) the pivots * returned by PCGEQPF in reverse order for checking purposes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be permuted. On exit, the local pieces * of the distributed permuted submatrix sub( A ) * Inv( P ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension LOCc(JA+N-1). * On exit, if IPIV(I) = K, the local i-th column of sub( A )*P * was the global K-th column of sub( A ). IPIV is tied to the * distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IACOL, ICOFFA, ICTXT, IITMP, IPVT, IPCOL, $ IPROW, ITMP, J, JJ, JJA, KK, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, IGERV2D, $ IGESD2D, IGAMN2D, INFOG1L, PCSWAP * .. * .. External Functions .. INTEGER INDXL2G, NUMROC EXTERNAL INDXL2G, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG1L( JA, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), JJA, $ IACOL ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * DO 20 J = JA, JA+N-2 * IPVT = JA+N-1 ITMP = JA+N * * Find first the local minimum candidate for pivoting * CALL INFOG1L( J, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), $ JJ, IACOL ) DO 10 KK = JJ, JJA+NQ-1 IF( IPIV( KK ).LT.IPVT )THEN IITMP = KK IPVT = IPIV( KK ) END IF 10 CONTINUE * * Find the global minimum pivot * CALL IGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, IPVT, 1, IPROW, $ IPCOL, 1, -1, MYCOL ) * * Broadcast the corresponding index to the other process columns * IF( MYCOL.EQ.IPCOL ) THEN ITMP = INDXL2G( IITMP, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL IGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ITMP, 1 ) IF( IPCOL.NE.IACOL ) THEN CALL IGERV2D( ICTXT, 1, 1, IPIV( IITMP ), 1, MYROW, $ IACOL ) ELSE IF( MYCOL.EQ.IACOL ) $ IPIV( IITMP ) = IPIV( JJ ) END IF ELSE CALL IGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ITMP, 1, MYROW, $ IPCOL ) IF( MYCOL.EQ.IACOL .AND. IPCOL.NE.IACOL ) $ CALL IGESD2D( ICTXT, 1, 1, IPIV( JJ ), 1, MYROW, IPCOL ) END IF * * Swap the columns of A * CALL PCSWAP( M, A, IA, ITMP, DESCA, 1, A, IA, J, DESCA, 1 ) * 20 CONTINUE * * End of PCQPPIV * END scalapack-2.0.2/TESTING/LIN/pcqrinfo.f000644 000766 000024 00000042542 10363532303 017445 0ustar00juliestaff000000 000000 SUBROUTINE PCQRINFO( SUMMRY, NOUT, NFACT, FACTOR, LDFACT, NMAT, $ MVAL, LDMVAL, NVAL, LDNVAL, NNB, MBVAL, $ LDMBVAL, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDFACT, LDMBVAL, LDMVAL, LDNBVAL, LDNVAL, $ LDPVAL, LDQVAL, NFACT, NGRIDS, NMAT, NNB, $ NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. CHARACTER*2 FACTOR( LDFACT ) CHARACTER*(*) SUMMRY INTEGER MBVAL( LDMBVAL ), MVAL( LDMVAL ), $ NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PCQRINFO gets needed startup information for the QR factoriza- * tion routines and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NFACT (global output) INTEGER * The number of different factorization types to be tested. * * FACTOR (global output) CHARACTER*2 array of dimension of LDFACT, * The factorization types to be tested: * if FACTOR(i) = 'QR' then QR factorization, * if FACTOR(i) = 'QL' then QL factorization, * if FACTOR(i) = 'LQ' then LQ factorization, * if FACTOR(i) = 'RQ' then RQ factorization, * if FACTOR(i) = 'QP' then QR factorization with column * pivoting. * if FACTOR(i) = 'TZ' then complete unitary factorization. * * LDFACT (global input) INTEGER * The maximum number of different factorization types to be * tested. LDFACT >= NFACT. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * MVAL (global output) INTEGER array of dimension (LDNVAL), the * values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M, LDNVAL > = NMAT. * * NVAL (global output) INTEGER array of dimension (LDNVAL), the * values of N (number of columns in matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for MB and * NB. * * MBVAL (global output) INTEGER array of dimension (LDMBVAL), the * values of MB (row blocksize) to run the code with. * * LDMBVAL (global input) INTEGER * The maximum number of different values that can be used for * MB, LDMBVAL >= NNB. * * NBVAL (global output) INTEGER array of dimension (LDNBVAL), the * values of NB (column blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array of dimension (LDPVAL), the * values of P (number of process rows) to run the code with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array of dimension (LDQVAL), the * values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 4, LDFACT+LDMVAL+LDNVAL+LDMBVAL+LDNBVAL+LDPVAL+LDQVAL ) * used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT, K REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAMEN REAL PSLAMCH EXTERNAL LSAMEN, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='QR.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get the matrix types to be tested * READ( NIN, FMT = * ) NFACT IF( NFACT.LT.1 .OR. NFACT.GT.LDFACT ) THEN WRITE( NOUT, FMT = 9994 ) 'nb of factorization', LDFACT GO TO 40 END IF READ( NIN, FMT = * ) ( FACTOR( I ), I = 1, NFACT ) * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 40 ELSE IF( NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'M', LDMVAL GO TO 40 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDMBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'MB', LDMBVAL GO TO 40 ELSE IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 40 END IF READ( NIN, FMT = * ) ( MBVAL( I ), I = 1, NNB ) READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 40 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 40 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I ) * QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS WORK( 4 ) = NFACT CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 DO 20 K = 1, NFACT IF( LSAMEN( 2, FACTOR( K ), 'QR' ) ) THEN WORK( I ) = 1 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'QL' ) ) THEN WORK( I ) = 2 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'LQ' ) ) THEN WORK( I ) = 3 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'RQ' ) ) THEN WORK( I ) = 4 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'QP' ) ) THEN WORK( I ) = 5 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'TZ' ) ) THEN WORK( I ) = 6 I = I + 1 END IF 20 CONTINUE * CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, MBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK QR factorizations routines.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision QR factorizations '// $ 'routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' || A - QR || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - QL || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - LQ || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - RQ || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - QRP || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - TZ || / (|| A || * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'MB : The row blocksize of the blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The column blocksize of the blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 ) $ 'Fact Time: Time in seconds to factor the'// $ ' matrix.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Execution rate of the '// $ 'factorization.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'MB ', ( MBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) NFACT = WORK( 4 ) * I = NFACT + 2*NMAT + 2*NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * DO 30 K = 1, NFACT IF( WORK( K ).EQ.1 ) THEN FACTOR( K ) = 'QR' ELSE IF( WORK( K ).EQ.2 ) THEN FACTOR( K ) = 'QL' ELSE IF( WORK( K ).EQ.3 ) THEN FACTOR( K ) = 'LQ' ELSE IF( WORK( K ).EQ.4 ) THEN FACTOR( K ) = 'RQ' ELSE IF( WORK( K ).EQ.5 ) THEN FACTOR( K ) = 'QP' ELSE IF( WORK( K ).EQ.6 ) THEN FACTOR( K ) = 'TZ' END IF 30 CONTINUE * I = NFACT + 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, MBVAL, 1 ) I = I + NNB CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 40 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCQRINFO * END scalapack-2.0.2/TESTING/LIN/pcqrt13.f000644 000766 000024 00000020574 10363532303 017122 0ustar00juliestaff000000 000000 SUBROUTINE PCQRT13( SCALE, M, N, A, IA, JA, DESCA, NORMA, ISEED, $ WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, ISEED, JA, M, N, SCALE REAL NORMA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL WORK( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCQRT13 generates a full-rank matrix that may be scaled to have * large or small norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SCALE (global input) INTEGER * SCALE = 1: normally scaled matrix * SCALE = 2: matrix scaled up * SCALE = 3: matrix scaled down * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * NORMA (global output) REAL * The one-norm of A. * * ISEED (global input/global output) INTEGER * Seed for random number generator. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK >= Nq0, where * * ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), and * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, IIA, INFO, $ IROFFA, J, JJA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ REAL ASUM, BIGNUM, SMLNUM COMPLEX AJJ * .. * .. External Functions .. INTEGER NUMROC REAL PCLANGE, PSLAMCH EXTERNAL NUMROC, PCLANGE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PCLASCL, PCMATGEN, $ PCELGET, PCELSET, PSCASUM, $ PSLABAD * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MOD, REAL, SIGN * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * generate the matrix * IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) MP = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * CALL PCMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), A, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED, IIA-1, MP, $ JJA-1, NQ, MYROW, MYCOL, NPROW, NPCOL ) * DO 10 J = JA, JA+N-1 I = IA + J - JA IF( I.LE.IA+M-1 ) THEN CALL PSCASUM( M, ASUM, A, IA, J, DESCA, 1 ) CALL PCELGET( 'Column', ' ', AJJ, A, I, J, DESCA ) AJJ = AJJ + CMPLX( SIGN( ASUM, REAL( AJJ ) ) ) CALL PCELSET( A, I, J, DESCA, AJJ ) END IF 10 CONTINUE * * scaled versions * IF( SCALE.NE.1 ) THEN * NORMA = PCLANGE( 'M', M, N, A, IA, JA, DESCA, WORK ) SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM CALL PSLABAD( ICTXT, SMLNUM, BIGNUM ) SMLNUM = SMLNUM / PSLAMCH( ICTXT, 'Epsilon' ) BIGNUM = ONE / SMLNUM * IF( SCALE.EQ.2 ) THEN * * matrix scaled up * CALL PCLASCL( 'General', NORMA, BIGNUM, M, N, A, IA, $ JA, DESCA, INFO ) * ELSE IF( SCALE.EQ.3 ) THEN * * matrix scaled down * CALL PCLASCL( 'General', NORMA, SMLNUM, M, N, A, IA, $ JA, DESCA, INFO ) * END IF * END IF * NORMA = PCLANGE( 'One-norm', M, N, A, IA, JA, DESCA, WORK ) * RETURN * * End of PCQRT13 * END scalapack-2.0.2/TESTING/LIN/pcqrt14.f000644 000766 000024 00000034010 10363532303 017111 0ustar00juliestaff000000 000000 REAL FUNCTION PCQRT14( TRANS, M, N, NRHS, A, IA, JA, $ DESCA, X, IX, JX, DESCX, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IX, JA, JX, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX A( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PCQRT14 checks whether sub( X ) is in the row space of sub( A ) or * sub( A )', where sub( A ) denotes A( IA:IA+M-1, JA:JA+N-1 ) and * sub( X ) denotes X( IX:IX+N-1, JX:JX+NRHS-1 ) if TRANS = 'N', and * X( IX:IX+N-1, JX:JX+NRHS-1 ) otherwise. It does so by scaling both * sub( X ) and sub( A ) such that their norms are in the range * [sqrt(eps), 1/sqrt(eps)], then computing an LQ factorization of * [sub( A )',sub( X )]' (if TRANS = 'N') or a QR factorization of * [sub( A ),sub( X )] otherwise, and returning the norm of the trailing * triangle, scaled by MAX(M,N,NRHS)*eps. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * = 'N': No transpose, check for sub( X ) in the row space of * sub( A ), * = 'C': Conjugate transpose, check for sub( X ) in row space * of sub( A )'. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( X ). NRHS >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)). This array * contains the local pieces of the M-by-N distributed matrix * sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) COMPLEX pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). * On entry, this array contains the local pieces of the * N-by-NRHS distributed submatrix sub( X ) if TRANS = 'N', * and the M-by-NRHS distributed submatrix sub( X ) otherwise. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * WORK (local workspace) COMPLEX array dimension (LWORK) * If TRANS='N', LWORK >= MNRHSP * NQ + LTAU + LWF and * LWORK >= MP * NNRHSQ + LTAU + LWF otherwise, where * * IF TRANS='N', (LQ fact) * MNRHSP = NUMROC( M+NRHS+IROFFA, MB_A, MYROW, IAROW, * NPROW ) * LTAU = NUMROC( IA+MIN( M+NRHS, N )-1, MB_A, MYROW, * RSRC_A, NPROW ) * LWF = MB_A * ( MB_A + MNRHSP + NQ0 ) * ELSE (QR fact) * NNRHSQ = NUMROC( N+NRHS+ICOFFA, NB_A, MYCOL, IACOL, * NPCOL ) * LTAU = NUMROC( JA+MIN( M, N+NRHS )-1, NB_A, MYCOL, * CSRC_A, NPCOL ) * LWF = NB_A * ( NB_A + MP0 + NNRHSQ ) * END IF * * and, * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MP0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NQ0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL TPSD INTEGER IACOL, IAROW, ICOFFA, ICTXT, IDUM, IIA, INFO, $ IPTAU, IPW, IPWA, IROFFA, IWA, IWX, J, JJA, $ JWA, JWX, LDW, LWORK, MPWA, MPW, MQW, MYCOL, $ MYROW, NPCOL, NPROW, NPW, NQWA, NQW REAL ANRM, ERR, XNRM COMPLEX AMAX * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) REAL RWORK( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL PCLANGE, PSLAMCH EXTERNAL LSAME, NUMROC, PCLANGE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCMAX1, $ PCCOPY, PCGELQF, PCGEQRF, PCLACGV, $ PCLACPY, PCLASCL, PXERBLA, SGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * PCQRT14 = ZERO * IPWA = 1 IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IWA = IROFFA + 1 JWA = ICOFFA + 1 CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) MPWA = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQWA = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * INFO = 0 IF( LSAME( TRANS, 'N' ) ) THEN IF( N.LE.0 .OR. NRHS.LE.0 ) $ RETURN TPSD = .FALSE. MPW = NUMROC( M+NRHS+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) NQW = NQWA * * Assign descriptor DESCW for workspace WORK and pointers to * matrices sub( A ) and sub( X ) in workspace * IWX = IWA + M JWX = JWA LDW = MAX( 1, MPW ) CALL DESCSET( DESCW, M+NRHS+IROFFA, N+ICOFFA, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, LDW ) * ELSE IF( LSAME( TRANS, 'C' ) ) THEN IF( M.LE.0 .OR. NRHS.LE.0 ) $ RETURN TPSD = .TRUE. MPW = MPWA NQW = NUMROC( N+NRHS+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) * * Assign descriptor DESCW for workspace WORK and pointers to * matrices sub( A ) and sub( X ) in workspace * IWX = IWA JWX = JWA + N LDW = MAX( 1, MPW ) CALL DESCSET( DESCW, M+IROFFA, N+NRHS+ICOFFA, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, LDW ) ELSE CALL PXERBLA( ICTXT, 'PCQRT14', -1 ) RETURN END IF * * Copy and scale sub( A ) * IPTAU = IPWA + MPW*NQW CALL PCLACPY( 'All', M, N, A, IA, JA, DESCA, WORK( IPWA ), IWA, $ JWA, DESCW ) RWORK( 1 ) = ZERO ANRM = PCLANGE( 'M', M, N, WORK( IPWA ), IWA, JWA, DESCW, RWORK ) IF( ANRM.NE.ZERO ) $ CALL PCLASCL( 'G', ANRM, ONE, M, N, WORK( IPWA ), IWA, $ JWA, DESCW, INFO ) * * Copy sub( X ) or sub( X )' into the right place and scale it * IF( TPSD ) THEN * * Copy sub( X ) into columns jwa+n:jwa+n+nrhs-1 of work * DO 10 J = 1, NRHS CALL PCCOPY( M, X, IX, JX+J-1, DESCX, 1, WORK( IPWA ), IWX, $ JWX+J-1, DESCW, 1 ) 10 CONTINUE XNRM = PCLANGE( 'M', M, NRHS, WORK( IPWA ), IWX, JWX, DESCW, $ RWORK ) IF( XNRM.NE.ZERO ) $ CALL PCLASCL( 'G', XNRM, ONE, M, NRHS, WORK( IPWA ), IWX, $ JWX, DESCW, INFO ) * * Compute QR factorization of work(iwa:iwa+m-1,jwa:jwa+n+nrhs-1) * MQW = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPW = IPTAU + MIN( MQW, NQW ) LWORK = DESCW( NB_ ) * ( MPW + NQW + DESCW( NB_ ) ) CALL PCGEQRF( M, N+NRHS, WORK( IPWA ), IWA, JWA, DESCW, $ WORK( IPTAU ), WORK( IPW ), LWORK, INFO ) * * Compute largest entry in upper triangle of * work(iwa+n:iwa+m-1,jwa+n:jwa+n+nrhs-1) * ERR = ZERO IF( N.LT.M ) THEN DO 20 J = JWX, JWA+N+NRHS-1 CALL PCMAX1( MIN(M-N,J-JWX+1), AMAX, IDUM, WORK( IPWA ), $ IWA+N, J, DESCW, 1 ) ERR = MAX( ERR, ABS( AMAX ) ) 20 CONTINUE END IF CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, IDUM1, IDUM2, $ -1, -1, 0 ) * ELSE * * Copy sub( X )' into rows iwa+m:iwa+m+nrhs-1 of work * DO 30 J = 1, NRHS CALL PCCOPY( N, X, IX, JX+J-1, DESCX, 1, WORK( IPWA ), $ IWX+J-1, JWX, DESCW, DESCW( M_ ) ) CALL PCLACGV( N, WORK( IPWA ), IWX+J-1, JWX, DESCW, $ DESCW( M_ ) ) 30 CONTINUE * XNRM = PCLANGE( 'M', NRHS, N, WORK( IPWA ), IWX, JWX, DESCW, $ RWORK ) IF( XNRM.NE.ZERO ) $ CALL PCLASCL( 'G', XNRM, ONE, NRHS, N, WORK( IPWA ), IWX, $ JWX, DESCW, INFO ) * * Compute LQ factorization of work(iwa:iwa+m+nrhs-1,jwa:jwa+n-1) * NPW = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPW = IPTAU + MIN( MPW, NPW ) LWORK = DESCW( MB_ ) * ( MPW + NQW + DESCW( MB_ ) ) CALL PCGELQF( M+NRHS, N, WORK( IPWA ), IWA, JWA, DESCW, $ WORK( IPTAU ), WORK( IPW ), LWORK, INFO ) * * Compute largest entry in lower triangle in * work(iwa+m:iwa+m+nrhs-1,jwa+m:jwa+n-1) * ERR = ZERO DO 40 J = JWA+M, MIN( JWA+N-1, JWA+M+NRHS-1 ) CALL PCMAX1( JWA+M+NRHS-J, AMAX, IDUM, WORK( IPWA ), $ IWX+J-JWA-M, J, DESCW, 1 ) ERR = MAX( ERR, ABS( AMAX ) ) 40 CONTINUE CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, IDUM1, IDUM2, $ -1, -1, 0 ) * END IF * PCQRT14 = ERR / ( REAL( MAX( M, N, NRHS ) ) * $ PSLAMCH( ICTXT, 'Epsilon' ) ) * RETURN * * End of PCQRT14 * END scalapack-2.0.2/TESTING/LIN/pcqrt16.f000644 000766 000024 00000025367 11622500733 017133 0ustar00juliestaff000000 000000 SUBROUTINE PCQRT16( TRANS, M, N, NRHS, A, IA, JA, DESCA, X, IX, $ JX, DESCX, B, IB, JB, DESCB, RWORK, RESID ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, IX, JA, JB, JX, M, N, NRHS REAL RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) REAL RWORK( * ) COMPLEX A( * ), B( * ), X( * ) * .. * * Purpose * ======= * * PCQRT16 computes the residual for a solution of a system of linear * equations sub( A )*sub( X ) = B or sub( A' )*sub( X ) = B: * RESID = norm(B - sub( A )*sub( X ) ) / * ( max(m,n) * norm(sub( A ) ) * norm(sub( X ) ) * EPS ), * where EPS is the machine epsilon, sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), and sub( X ) denotes * X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations: * = 'N': sub( A )*sub( X ) = sub( B ) * = 'T': sub( A' )*sub( X )= sub( B ), where A' is the * transpose of sub( A ). * = 'C': sub( A' )*sub( X )= B, where A' is the conjugate * transpose of sub( A ). * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) COMPLEX pointer into the local * memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * The original M x N matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) COMPLEX pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). This * array contains the local pieces of the computed solution * distributed vectors for the system of linear equations. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * B (local input/local output) COMPLEX pointer into * the local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the distributes right hand side vectors for * the system of linear equations. On exit, sub( B ) is over- * written with the difference sub( B ) - sub( A )*sub( X ) or * sub( B ) - sub( A )'*sub( X ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * RWORK (local workspace) REAL array, dimension (LRWORK) * LWORK >= Nq0 if TRANS = 'N', and LRWORK >= Mp0 otherwise. * * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * RESID (global output) REAL * The maximum over the number of right hand sides of * norm( sub( B )- sub( A )*sub( X ) ) / * ( max(m,n) * norm( sub( A ) ) * norm( sub( X ) ) * EPS ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER ICTXT, IDUMM, J, MYCOL, MYROW, N1, N2, NPCOL, $ NPROW REAL ANORM, BNORM, EPS, XNORM * .. * .. Local Arrays .. REAL TEMP( 2 ) * .. * .. External Functions .. LOGICAL LSAME REAL PCLANGE, PSLAMCH EXTERNAL LSAME, PCLANGE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PCGEMM, PSCASUM, $ SGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick exit if M = 0 or N = 0 or NRHS = 0 * IF( M.LE.0 .OR. N.LE.0 .OR. NRHS.EQ.0 ) THEN RESID = ZERO RETURN END IF * IF( LSAME( TRANS, 'T' ) .OR. LSAME( TRANS, 'C' ) ) THEN ANORM = PCLANGE( 'I', M, N, A, IA, JA, DESCA, RWORK ) N1 = N N2 = M ELSE ANORM = PCLANGE( '1', M, N, A, IA, JA, DESCA, RWORK ) N1 = M N2 = N END IF * EPS = PSLAMCH( ICTXT, 'Epsilon' ) * * Compute B - sub( A )*sub( X ) (or B - sub( A' )*sub( X ) ) and * store in B. * CALL PCGEMM( TRANS, 'No transpose', N1, NRHS, N2, -CONE, A, IA, $ JA, DESCA, X, IX, JX, DESCX, CONE, B, IB, JB, DESCB ) * * Compute the maximum over the number of right hand sides of * norm( sub( B ) - sub( A )*sub( X ) ) / * ( max(m,n) * norm( sub( A ) ) * norm( sub( X ) ) * EPS ). * RESID = ZERO DO 10 J = 1, NRHS * CALL PSCASUM( N1, BNORM, B, IB, JB+J-1, DESCB, 1 ) CALL PSCASUM( N2, XNORM, X, IX, JX+J-1, DESCX, 1 ) * * Only the process columns owning the vector operands will have * the correct result, the other will have zero. * TEMP( 1 ) = BNORM TEMP( 2 ) = XNORM IDUMM = 0 CALL SGAMX2D( ICTXT, 'All', ' ', 2, 1, TEMP, 2, IDUMM, IDUMM, $ -1, -1, IDUMM ) BNORM = TEMP( 1 ) XNORM = TEMP( 2 ) * * Every processes have ANORM, BNORM and XNORM now. * IF( ANORM.EQ.ZERO .AND. BNORM.EQ.ZERO ) THEN RESID = ZERO ELSE IF( ANORM.LE.ZERO .OR. XNORM.LE.ZERO ) THEN RESID = ONE / EPS ELSE RESID = MAX( RESID, ( ( BNORM / ANORM ) / XNORM ) / $ ( MAX( M, N )*EPS ) ) END IF * 10 CONTINUE * RETURN * * End of PCQRT16 * END scalapack-2.0.2/TESTING/LIN/pcqrt17.f000644 000766 000024 00000033014 10363532303 017117 0ustar00juliestaff000000 000000 REAL FUNCTION PCQRT17( TRANS, IRESID, M, N, NRHS, A, $ IA, JA, DESCA, X, IX, JX, $ DESCX, B, IB, JB, DESCB, WORK, $ RWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, IRESID, IX, JA, JB, JX, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) COMPLEX A( * ), B( * ), WORK( * ), X( * ) REAL RWORK( * ) * .. * * Purpose * ======= * * PCQRT17 computes the ratio * * || R'*op( sub( A ) ) ||/(||sub( A )||*alpha*max(M,N,NRHS)*eps) * * where R = op( sub( A ) )*sub( X ) - B, op(A) is A or A', and * * alpha = ||B|| if IRESID = 1 (zero-residual problem) * alpha = ||R|| if IRESID = 2 (otherwise). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies whether or not the transpose of sub( A ) is used. * = 'N': No transpose, op( sub( A ) ) = sub( A ). * = 'C': Conjugate transpose, op( sub( A ) ) = sub( A' ). * * IRESID (global input) INTEGER * IRESID = 1 indicates zero-residual problem. * IRESID = 2 indicates non-zero residual. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * If TRANS = 'N', the number of rows of the distributed * submatrix sub( B ). * If TRANS = 'C', the number of rows of the distributed * submatrix sub( X ). * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * If TRANS = 'N', the number of rows of the distributed * submatrix sub( X ). Otherwise N is the number of rows of * the distributed submatrix sub( B ). * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrices sub( X ) and sub( B ). * NRHS >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed M-by-N * submatrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) COMPLEX pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). * If TRANS = 'N', this array contains the local pieces of the * N-by-NRHS distributed submatrix sub( X ). Otherwise, this * array contains the local pieces of the M-by-NRHS distributed * submatrix sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * B (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_B,LOCc(JB+NRHS-1)). * If TRANS='N', this array contains the local pieces of the * distributed M-by-NRHS submatrix operand sub( B ). Otherwise, * this array contains the local pieces of the distributed * N-by-NRHS submatrix operand sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * If TRANS = 'N', LWORK >= Mp0 * NRHSq0 + NRHSp0 * Nq0 * otherwise LWORK >= Np0 * NRHSq0 + NRHSp0 * Mq0 * * RWORK (local workspace) REAL array, dimension (LRWORK) * LRWORK >= Nq0, if TRANS = 'N', and LRWORK >= Mp0 otherwise. * * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Np0 = NUMROC( N+ICOFFA, NB_A, MYROW, IAROW, NPROW ), * Mq0 = NUMROC( M+IROFFA, NB_A, MYCOL, IACOL, NPCOL ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * NRHSp0 = NUMROC( NRHS+ICOFFB, NB_B, MYROW, IBROW, NPROW ), * NRHSq0 = NUMROC( NRHS+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) * .. * .. Local Scalars .. INTEGER IACOL, IBCOL, IBROW, ICOFFB, ICTXT, INFO, $ IOFFA, IROFFB, ISCL, IW, IW2, JW, JW2, MYCOL, $ NRHSQ, NRHSP, MYROW, NCOLS, NPCOL, NPROW, $ NROWS, NROWSP REAL ERR, NORMA, NORMB, NORMRS, NORMX, SMLNUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), DESCW2( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC REAL PCLANGE, PSLAMCH EXTERNAL INDXG2P, LSAME, NUMROC, PCLANGE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PCGEMM, PCLACPY, $ PCLASCL, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, REAL * .. * .. Executable Statements .. * PCQRT17 = ZERO * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 IF( LSAME( TRANS, 'N' ) ) THEN NROWS = M NCOLS = N IOFFA = MOD( JA-1, DESCA( NB_ ) ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN NROWS = N NCOLS = M IOFFA = MOD( IA-1, DESCA( MB_ ) ) ELSE CALL PXERBLA( ICTXT, 'PCQRT17', -1 ) RETURN END IF * IF( M.LE.0 .OR. N.LE.0 .OR. NRHS.LE.0 ) $ RETURN * IROFFB = MOD( IA-1, DESCA( MB_ ) ) ICOFFB = MOD( JA-1, DESCA( NB_ ) ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * NRHSQ = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) NRHSP = NUMROC( NRHS+IROFFB, DESCB( NB_ ), MYROW, IBROW, NPROW ) NROWSP = NUMROC( NROWS+IROFFB, DESCA( MB_ ), MYROW, IBROW, NPROW ) * * Assign array descriptor DESCW for workspace WORK, where DESCW * holds a copy of the distributed submatrix sub( B ) * CALL DESCSET( DESCW, NROWS+IROFFB, NRHS+ICOFFB, DESCB( MB_ ), $ DESCB( NB_ ), IBROW, IBCOL, ICTXT, MAX( 1, $ NROWSP ) ) * * Assign array descriptor DESCW2 for workspace WORK, where DESCW2 * holds a copy of the distributed submatrix sub( X**T ) * CALL DESCSET( DESCW2, NRHS+ICOFFB, NCOLS+IOFFA, DESCX( NB_ ), $ DESCX( MB_ ), IBROW, IACOL, ICTXT, MAX( 1, $ NRHSP ) ) * NORMA = PCLANGE( 'One-norm', M, N, A, IA, JA, DESCA, RWORK ) SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) SMLNUM = SMLNUM / PSLAMCH( ICTXT, 'Precision' ) ISCL = 0 * * compute residual and scale it * IW = 1 + IROFFB JW = 1 + ICOFFB CALL PCLACPY( 'All', NROWS, NRHS, B, IB, JB, DESCB, WORK, IW, JW, $ DESCW ) CALL PCGEMM( TRANS, 'No transpose', NROWS, NRHS, NCOLS, $ CMPLX( -ONE ), A, IA, JA, DESCA, X, IX, JX, DESCX, $ CMPLX( ONE ), WORK, IW, JW, DESCW ) NORMRS = PCLANGE( 'Max', NROWS, NRHS, WORK, IW, JW, DESCW, $ RWORK ) IF( NORMRS.GT.SMLNUM ) THEN ISCL = 1 CALL PCLASCL( 'General', NORMRS, ONE, NROWS, NRHS, WORK, $ IW, JW, DESCW, INFO ) END IF * * compute R'*sub( A ) * IW2 = 1 + ICOFFB JW2 = 1 + IOFFA CALL PCGEMM( 'Conjugate transpose', TRANS, NRHS, NCOLS, NROWS, $ CMPLX( ONE ), WORK, IW, JW, DESCW, A, IA, JA, DESCA, $ CMPLX( ZERO ), WORK( NROWSP*NRHSQ+1 ), IW2, JW2, $ DESCW2 ) * * compute and properly scale error * ERR = PCLANGE( 'One-norm', NRHS, NCOLS, WORK( NROWSP*NRHSQ+1 ), $ IW2, JW2, DESCW2, RWORK ) IF( NORMA.NE.ZERO ) $ ERR = ERR / NORMA * IF( ISCL.EQ.1 ) $ ERR = ERR*NORMRS * IF( IRESID.EQ.1 ) THEN NORMB = PCLANGE( 'One-norm', NROWS, NRHS, B, IB, JB, DESCB, $ RWORK ) IF( NORMB.NE.ZERO ) $ ERR = ERR / NORMB ELSE NORMX = PCLANGE( 'One-norm', NCOLS, NRHS, X, IX, JX, DESCX, $ RWORK ) IF( NORMX.NE.ZERO ) $ ERR = ERR / NORMX END IF * PCQRT17 = ERR / ( PSLAMCH( ICTXT, 'Epsilon' ) * $ REAL( MAX( M, N, NRHS ) ) ) * RETURN * * End of PCQRT17 * END scalapack-2.0.2/TESTING/LIN/pctzrzrv.f000644 000766 000024 00000024433 10363532303 017527 0ustar00juliestaff000000 000000 SUBROUTINE PCTZRZRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCTZRZRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from T, Z * computed by PCTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= M >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors T and Z computed * by PCTZRZF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCr(M_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PCTZRZF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IN, $ IPT, IPV, IPW, JJA, JM1, JV, L, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCLACPY, $ PCLARZB, PCLARZT, PCLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.LT.M ) $ RETURN * L = N - M JM1 = JA + MIN( M+1, N ) - 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * CALL DESCSET( DESCV, DESCA( MB_ ), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * * Handle first block separately * IB = IN - IA + 1 JV = ICOFF + JM1 - JA + 1 * * Compute upper triangular matrix T * CALL PCLARZT( 'Backward', 'Rowwise', L, IB, A, IA, JM1, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'All', IB, L, A, IA, JM1, DESCA, WORK( IPV ), 1, $ JV, DESCV ) * * Save temporarily strict lower part of A(IA:IA+IB-1,JA:JA+IB-1) * CALL PCLACPY( 'Lower', IB-1, IB-1, A, IA+1, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) * * Zeroes the row panel of sub( A ) to get T(IA:IN,JA:JA+N-1) * CALL PCLASET( 'All', IB, L, ZERO, ZERO, A, IA, JM1, DESCA ) CALL PCLASET( 'Lower', IB-1, IB-1, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Apply block Householder transformation * CALL PCLARZB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', IN-IA+1, N, IB, L, WORK( IPV ), 1, JV, $ DESCV, WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * * Restore strict lower part of A( IA:IA+IB-1, JA:JA+N-1 ) * CALL PCLACPY( 'Lower', IB-1, IB-1, WORK( IPV ), 1, ICOFF+1, DESCV, $ A, IA+1, JA, DESCA ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * * Loop over the remaining row blocks * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Compute upper triangular matrix T * CALL PCLARZT( 'Backward', 'Rowwise', L, IB, A, I, JM1, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PCLACPY( 'All', IB, L, A, I, JM1, DESCA, WORK( IPV ), 1, $ JV, DESCV ) * * Save temporarily strict lower part of A(I:I+IB-1,J:J+IB-1 ) * CALL PCLACPY( 'Lower', IB-1, IB-1, A, I+1, JA+I-IA, DESCA, $ WORK( IPV ), 1, ICOFF+1+I-IA, DESCV ) * * Zeoes the row panel of sub( A ) to get T(IA:I-1,JA+I-IA:JA+N-1) * CALL PCLASET( 'All', IB, L, ZERO, ZERO, A, I, JM1, DESCA ) CALL PCLASET( 'Lower', IB-1, IB-1, ZERO, ZERO, A, I+1, JA+I-IA, $ DESCA ) * * Apply block Householder transformation * CALL PCLARZB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', I+IB-IA, N-I+IA, IB, L, WORK( IPV ), $ 1, JV, DESCV, WORK( IPT ), A, IA, JA+I-IA, DESCA, $ WORK( IPW ) ) * CALL PCLACPY( 'Lower', IB-1, IB-1, WORK( IPV ), 1, $ ICOFF+1+I-IA, DESCV, A, I+1, JA+I-IA, DESCA ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PCTZRZRV * END scalapack-2.0.2/TESTING/LIN/pdbmatgen.f000644 000766 000024 00000016477 10363532303 017575 0ustar00juliestaff000000 000000 SUBROUTINE PDBMATGEN( ICTXT, AFORM, AFORM2, BWL, BWU, N, $ MB, NB, A, $ LDA, IAROW, IACOL, ISEED, $ MYROW, MYCOL, NPROW, NPCOL ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. * .. Scalar Arguments .. CHARACTER*1 AFORM, AFORM2 INTEGER IACOL, IAROW, ICTXT, $ ISEED, LDA, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW, BWL, BWU * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PDBMATGEN : Parallel Real Double precision Band MATrix GENerator. * (Re)Generate a distributed Band matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'L' : A is returned as a symmetric lower * triangular matrix, and is diagonally dominant. * if AFORM = 'U' : A is returned as a symmetric upper * triangular matrix, and is diagonally dominant. * if AFORM = 'G' : A is returned as a general matrix. * if AFORM = 'T' : A is returned as a general matrix in * tridiagonal-compatible form. * * AFORM2 (global input) CHARACTER*1 * if the matrix is general: * if AFORM2 = 'D' : A is returned diagonally dominant. * if AFORM2 != 'D' : A is not returned diagonally dominant. * if the matrix is symmetric or hermitian: * if AFORM2 = 'T' : A is returned in tridiagonally-compatible * form (a transpose form). * if AFORM2 != 'T' : A is returned in banded-compatible form. * * M (global input) INTEGER * The number of nonzero rows in the generated distributed * band matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) DOUBLE PRECISION, pointer into the local * memory to an array of dimension ( LDA, * ) containing the * local pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * A( DIAG_INDEX, I ) = A( DIAG_INDEX, I ) + BWL+BWU * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * This code is a simple wrapper around PDMATGEN, for band matrices. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER DIAG_INDEX, I, J, M_MATGEN, NQ, N_MATGEN, $ START_INDEX * .. * .. External Subroutines .. EXTERNAL PDMATGEN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC, LSAME * .. * .. Executable Statements .. * * IF( LSAME( AFORM, 'L' ).OR.LSAME( AFORM, 'U' ) ) THEN M_MATGEN = BWL + 1 N_MATGEN = N START_INDEX = 1 IF( LSAME( AFORM, 'L' ) ) THEN DIAG_INDEX = 1 ELSE DIAG_INDEX = BWL + 1 ENDIF ELSE M_MATGEN = BWL + BWU + 1 N_MATGEN = N DIAG_INDEX = BWU + 1 START_INDEX = 1 ENDIF * NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * * * Generate a random matrix initially * IF( LSAME( AFORM, 'T' ) .OR. $ ( LSAME( AFORM2, 'T' ) ) ) THEN * CALL PDMATGEN( ICTXT, 'T', 'N', $ N_MATGEN, M_MATGEN, $ NB, M_MATGEN, A( START_INDEX, 1 ), $ LDA, IAROW, IACOL, $ ISEED, 0, NQ, 0, M_MATGEN, $ MYCOL, MYROW, NPCOL, NPROW ) * ELSE * CALL PDMATGEN( ICTXT, 'N', 'N', $ M_MATGEN, N_MATGEN, $ M_MATGEN, NB, A( START_INDEX, 1 ), $ LDA, IAROW, IACOL, $ ISEED, 0, M_MATGEN, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Zero out padding at tops of columns * DO 1000 J=1,NB * DO 2000 I=1, LDA-M_MATGEN * * Indexing goes negative; BMATGEN assumes that space * has been preallocated above the first column as it * has to be if the matrix is to be input to * Scalapack's band solvers. * A( I-LDA+M_MATGEN, J ) = ZERO * 2000 CONTINUE * 1000 CONTINUE * ENDIF * IF( LSAME( AFORM2, 'D' ).OR. $ ( LSAME( AFORM, 'L' ).OR.LSAME( AFORM, 'U' ) ) ) THEN * * Loop over diagonal elements stored on this processor. * * DO 330 I=1, NQ IF( LSAME( AFORM, 'T' ) .OR. $ ( LSAME( AFORM2, 'T' ) ) ) THEN IF( NPROW .EQ. 1 ) THEN A( I, DIAG_INDEX ) = A( I, DIAG_INDEX ) $ + DBLE( BWL+BWU+1 ) ENDIF ELSE IF( NPROW .EQ. 1 ) THEN A( DIAG_INDEX, I ) = A( DIAG_INDEX, I ) $ + DBLE( BWL+BWU+1 ) ENDIF END IF 330 CONTINUE * * ELSE * * Must add elements to keep condition of matrix in check * DO 380 I=1, NQ * IF( NPROW .EQ. 1 ) THEN * IF( MOD(I+MYCOL*NB,2) .EQ. 1 ) THEN A( DIAG_INDEX+1, I ) = A( DIAG_INDEX+1, I ) $ + DBLE( BWL+BWU+1 ) * ELSE * A( DIAG_INDEX-1, I ) = A( DIAG_INDEX-1, I ) $ + DBLE( BWL+BWU+1 ) ENDIF * ENDIF * 380 CONTINUE * END IF * RETURN * * End of PDBMATGEN * END scalapack-2.0.2/TESTING/LIN/pddbdriver.f000644 000766 000024 00000075700 10363532303 017753 0ustar00juliestaff000000 000000 PROGRAM PDDBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PDDBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by DDB. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER DBLESZ, MEMSIZ, NTESTS DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0, ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), MEM( MEMSIZ ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDBMATGEN, $ PDCHEKPAD, PDDBINFO, PDDBLASCHK, PDDBTRF, $ PDDBTRS, PDFILLPAD, PDMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PDLANGE EXTERNAL LSAME, NUMROC, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PDDBINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = BWLVAL( BW_NUM ) IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = BWUVAL( BW_NUM ) IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*MAX(BWL,BWU)-1)/NPCOL + 1 ) $ + MAX(BWL,BWU) NB = MAX( NB, 2*MAX(BWL,BWU) ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*MAX(BWL,BWU), N ) ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (BWL+BWU+1), (BWL+BWU+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((BWL+BWU+1)+10) IMIDPAD = 10 IPOSTPAD = ((BWL+BWU+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (BWL+BWU+1), N, $ (BWL+BWU+1), NB, 0, 0, $ ICTXT,((BWL+BWU+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((BWL+BWU+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = MAX(BWL,BWU)*MAX(BWL,BWU) * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((BWL+BWU+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PDLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PDDBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PDBMATGEN( ICTXT, 'G', 'D', BWL, BWU, N, $ (BWL+BWU+1), NB, MEM( IPA ), $ ((BWL+BWU+1)+10), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) * CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((BWL+BWU+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PDLANGE( '1', (BWL+BWU+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((BWL+BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PDDBTRF( N, BWL, BWU, MEM( IPA ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PDDBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PDCHEKPAD( ICTXT, 'PDDBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((BWL+BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (MAX(BWL,BWU)*NRHS) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PDMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PDFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PDDBTRS( TRANS, N, BWL, BWU, NRHS, MEM( IPA ), $ 1, DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PDDBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDDBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PDDBLASCHK( 'N', 'D', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl bwu + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE(BWU)) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+bwu) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE(BWU)) $ *DBLE(NRHS)) * * Second calc to represent actual hardware speed * * 2*N_FIRST bwl*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE(BWL)*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST bwl*bwu * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE(BWL) $ *DBLE(BWU)) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB bwl*bwu flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE(BWL) $ *DBLE(BWU)) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( BWL*BWU*BWL/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*BWL*BWU*BWL) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+bwu) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE(BWU)) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * (bwl+bwu) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE(BWL)+ $ DBLE(BWU)))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * (bwl+bwu) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE(BWL)+ $ DBLE(BWU)))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*(BWL*BWU ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * BWL*BWU ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PDDBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pddbinfo.f000644 000766 000024 00000044335 10363532303 017413 0ustar00juliestaff000000 000000 SUBROUTINE PDDBINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PDDBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'T' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'T' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDDBINFO * END scalapack-2.0.2/TESTING/LIN/pddblaschk.f000644 000766 000024 00000026133 10363532303 017721 0ustar00juliestaff000000 000000 SUBROUTINE PDDBLASCHK( SYMM, UPLO, TRANS, N, BWL, BWU, NRHS, X, $ IX, JX, DESCX, IASEED, A, IA, JA, DESCA, $ IBSEED, ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, TRANS, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PDDBLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'S', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * TRANS if TRANS= 'T', A 'Transpose' is used as the * coefficient matrix in the solve. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,max(max(bwl,bwu)*(max(bwl,bwu)+2),NB))+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,max(max(bwl,bwu)*(max(bwl,bwu)+2),NB))+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER BW, INFO, IPPRODUCT, WORK_MIN DOUBLE PRECISION DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, DGEBR2D, $ DGEBS2D, DGEMM, DGERV2D, DGESD2D, $ DGSUM2D, DLASET, PBDTRAN, PDMATGEN * .. * .. External Functions .. INTEGER IDAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IDAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'S' ) ) THEN BW = BWL WORK_MIN = MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ELSE BW = MAX(BWL, BWU) WORK_MIN = MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PDBLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (BWL+BWU+1), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'S' )) THEN CALL PDBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PDBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PDGBDCMV( BWL+BWU+1, BWL, BWU, TRANS, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), $ (MAX(BWL,BWU)+2)*MAX(BWL,BWU), INFO ) * * * Regenerate column of B * CALL PDMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PDAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PDNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PDNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PDBLASCHK * END scalapack-2.0.2/TESTING/LIN/pddbmv1.f000644 000766 000024 00000076017 10607174520 017171 0ustar00juliestaff000000 000000 SUBROUTINE PDGBDCMV( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS, $ B, IB, DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDDBTRF and this is stored in AF. If a linear system * is to be solved using PDDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M, $ DU_N_N, DU_P_M, DU_P_N, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM2, IDUM3, J, JA_NEW, $ LLDA, LLDB, MAX_BW, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PDDBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PDDBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDDBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = ZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (MAX_BW+2)*MAX_BW WORK( I ) = ZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( TRANS, 'N' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * * Use main partition in each processor to multiply locally * CALL DGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, ONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, ZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL DCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWL-DL_N_N ), 1 ) * CALL DTRMV( 'U', 'N', 'N', BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL DGESD2D( ICTXT, BWL, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWL, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = ZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL DCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * CALL DTRMV( $ 'L', $ 'N', $ 'N', BWU, $ A( 1 ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 30 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL DGESD2D( ICTXT, BWU, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL DGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL DAXPY( BWL, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL DGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL DAXPY( BWU, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWU+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( TRANS, 'T' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL DLATCPY( 'L', BWU, BWU, A( OFST+1 ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to left * CALL DTRSD2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL-1 ) * ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL DLATCPY( 'U', BWL, BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to right * CALL DTRSD2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL DGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, ONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, ZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL DCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWU-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL DTRRV2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), MAX_BW, MYROW, MYCOL+1 ) * CALL DTRMV( 'U', 'N', 'N', BWU, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL DGESD2D( ICTXT, BWU, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWU, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = ZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL DCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL DTRRV2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), MAX_BW, MYROW, MYCOL-1 ) * CALL DTRMV( $ 'L', $ 'N', $ 'N', BWL, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 60 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL DGESD2D( ICTXT, BWL, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL DGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL DAXPY( BWU, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL DGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL DAXPY( BWL, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWL+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PDBsBMV1 * END scalapack-2.0.2/TESTING/LIN/pddtdriver.f000644 000766 000024 00000076204 10363532303 017775 0ustar00juliestaff000000 000000 PROGRAM PDDTDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PDDTDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by DDT. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER DBLESZ, MEMSIZ, NTESTS DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0, ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), MEM( MEMSIZ ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDBMATGEN, $ PDCHEKPAD, PDDTINFO, PDDTLASCHK, PDDTTRF, $ PDDTTRS, PDFILLPAD, PDMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PDLANGE EXTERNAL LSAME, NUMROC, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PDDTINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = 1 IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = 1 IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*INT_ONE-1)/NPCOL + 1 ) $ + INT_ONE NB = MAX( NB, 2*INT_ONE ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*INT_ONE, N ) ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (3), (3), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((3)+10) IMIDPAD = 10 IPOSTPAD = ((3)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, N, (3), $ NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((3)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + (NB+10)*(3) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (12*NPCOL+3*NB) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 8*NPCOL * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((3)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PDLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PDDTLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,NB)+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PDBMATGEN( ICTXT, 'T', 'D', BWL, BWU, N, (3), NB, $ MEM( IPA ), NB+10, 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) CALL PDFILLPAD( ICTXT, NQ, NP, MEM( IPA-IPREPAD ), $ NB+10, IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PDLANGE( 'I', N, $ (3), MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', NQ, NP, $ MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PDDTTRF( N, MEM( IPA+2*( NB+10 ) ), $ MEM( IPA+1*( NB+10 ) ), MEM( IPA ), 1, $ DESCA, MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW ), IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PDDTTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PDCHEKPAD( ICTXT, 'PDDTTRF', NQ, $ NP, MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = 10*NPCOL+4*NRHS * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PDMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PDFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PDDTTRS( TRANS, N, NRHS, $ MEM( IPA+2*( NB+10 ) ), $ MEM( IPA+1*( NB+10 ) ), MEM( IPA ), $ 1, DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PDDTTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDDTTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * * Reset descriptor describing A to 1-by-P grid for * use in banded utility routines * CALL DESCINIT( DESCA2D, (3), N, $ (3), NB, 0, 0, $ ICTXT, (3), IERR( 1 ) ) CALL PDDTLASCHK( 'N', 'D', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl INT_ONE + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE(INT_ONE)) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+INT_ONE) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE(INT_ONE)) $ *DBLE(NRHS)) * * Second calc to represent actual hardware speed * * 2*N_FIRST bwl*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE(BWL)*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST bwl*INT_ONE * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE(BWL) $ *DBLE(INT_ONE)) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB bwl*INT_ONE flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE(BWL) $ *DBLE(INT_ONE)) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( BWL*INT_ONE*BWL/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*BWL*INT_ONE*BWL) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+INT_ONE) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE(INT_ONE)) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * (bwl+INT_ONE) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE(BWL)+ $ DBLE(INT_ONE)))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * (bwl+INT_ONE) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE(BWL)+ $ DBLE(INT_ONE)))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*(BWL*INT_ONE ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * BWL*INT_ONE ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PDDTTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pddtinfo.f000644 000766 000024 00000044356 10363532303 017440 0ustar00juliestaff000000 000000 SUBROUTINE PDDTINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PDDTINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW NBW = 1 IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'T' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'T' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDDTINFO * END scalapack-2.0.2/TESTING/LIN/pddtlaschk.f000644 000766 000024 00000030643 10363532303 017744 0ustar00juliestaff000000 000000 SUBROUTINE PDDTLASCHK( SYMM, UPLO, TRANS, N, BWL, BWU, NRHS, X, $ IX, JX, DESCX, IASEED, A, IA, JA, DESCA, $ IBSEED, ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, TRANS, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PDDTLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'S', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * TRANS if TRANS= 'T', A 'Transpose' is used as the * coefficient matrix in the solve. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,NB)+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,NB)+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER I, START INTEGER BW, INFO, IPPRODUCT, WORK_MIN DOUBLE PRECISION DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, DGEBR2D, $ DGEBS2D, DGEMM, DGERV2D, DGESD2D, $ DGSUM2D, DLASET, PBDTRAN, PDMATGEN * .. * .. External Functions .. INTEGER IDAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IDAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'S' ) ) THEN BW = BWL START = 1 WORK_MIN = MAX(5,NB)+2*NB ELSE BW = MAX(BWL, BWU) IF( LSAME( UPLO, 'D' )) THEN START = 1 ELSE START = 2 ENDIF WORK_MIN = MAX(5,NB)+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PDTLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (3), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'S' )) THEN CALL PDBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PDBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Matrix formed above has the diagonals shifted from what was * input to the tridiagonal routine. Shift them back. * * Send elements to neighboring processors * IF( MYCOL.GT.0 ) THEN CALL DGESD2D( ICTXT, 1, 1, A( START+2), LDA, $ MYROW, MYCOL-1 ) ENDIF * IF( MYCOL.LT.NPCOL-1 ) THEN CALL DGESD2D( ICTXT, 1, 1, $ A( START+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * * Shift local elements * DO 220 I=0,DESCA( NB_ )-1 A( START+2+(I)*LDA ) = A( START+2+(I+1)*LDA ) 220 CONTINUE * DO 230 I=DESCA( NB_ )-1,0,-1 A( START+(I+1)*LDA ) = A( START+(I)*LDA ) 230 CONTINUE * * Receive elements from neighboring processors * IF( MYCOL.LT.NPCOL-1 ) THEN CALL DGERV2D( ICTXT, 1, 1, $ A( START+2+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * IF( MYCOL.GT.0 ) THEN CALL DGERV2D( ICTXT, 1, 1, A( START), LDA, $ MYROW, MYCOL-1 ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PDGBDCMV( BWL+BWU+1, BWL, BWU, TRANS, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), $ (INT_ONE+2)*INT_ONE, INFO ) * * * Regenerate column of B * CALL PDMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PDAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PDNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PDNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PDTLASCHK * END scalapack-2.0.2/TESTING/LIN/pdgbdriver.f000644 000766 000024 00000077473 10363532303 017767 0ustar00juliestaff000000 000000 PROGRAM PDGBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PDGBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by DGB. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTMEM INTEGER, default = 2048. * INTMEM is the size of the integer workspace used in this * driver as input as the IPIV vector. It represents an * upper bound on NB, the blocksize of the data * distribution. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER INTMEM PARAMETER ( INTMEM = 2048 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER DBLESZ, MEMSIZ, NTESTS DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0, ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER IPIV(INTMEM) INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), MEM( MEMSIZ ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDBMATGEN, $ PDCHEKPAD, PDDBLASCHK, PDFILLPAD, PDGBINFO, $ PDGBTRF, PDGBTRS, PDMATGEN, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PDLANGE EXTERNAL LSAME, NUMROC, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PDGBINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = BWLVAL( BW_NUM ) IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = BWUVAL( BW_NUM ) IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*(BWL+BWU)-1)/NPCOL + 1 ) $ + (BWL+BWU) NB = MAX( NB, 2*(BWL+BWU) ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 * IF( NB.GT.INTMEM ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) THEN WRITE( NOUT,* )'You have chosen an ' $ ,'NB > INTMEM in the driver.' WRITE(NOUT, *)'Please edit the driver ' $ ,'and increase the value of INTMEM' ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (2*BWL+2*BWU+1), (2*BWL+2*BWU+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((2*BWL+2*BWU+1)+10) IMIDPAD = 10 IPOSTPAD = ((2*BWL+2*BWU+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (2*BWL+2*BWU+1), N, $ (2*BWL+2*BWU+1), NB, 0, 0, $ ICTXT,((2*BWL+2*BWU+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((2*BWL+2*BWU+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 1 * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((2*BWL+2*BWU+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PDLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PDDBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PDBMATGEN( ICTXT, 'G', 'N', BWL, BWU, N, $ (2*BWL+2*BWU+1), NB, MEM( IPA+BWL+BWU ), $ ((2*BWL+2*BWU+1)+10), 0, 0, IASEED, $ MYROW, MYCOL, NPROW, NPCOL ) * CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((2*BWL+2*BWU+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PDLANGE( '1', (2*BWL+2*BWU+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((2*BWL+2*BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PDGBTRF( N, BWL, BWU, MEM( IPA ), 1, DESCA, IPIV, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PDGBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PDCHEKPAD( ICTXT, 'PDGBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((2*BWL+2*BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = NRHS*(NB+2*BWL+4*BWU) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PDMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PDFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PDGBTRS( TRANS, N, BWL, BWU, NRHS, MEM( IPA ), $ 1, DESCA, IPIV, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PDGBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PDDBLASCHK( 'N', 'N', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA+BWL+BWU ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl (bwl+bwu) + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE((BWL+BWU))) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+(bwl+bwu)) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE((BWL+BWU))) $ *DBLE(NRHS)) * * Second calc to represent actual hardware speed * * 2*N_FIRST (bwl+bwu)*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE((BWL+BWU))*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST (bwl+bwu)*(bwl+bwu) * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE((BWL+BWU)) $ *DBLE((BWL+BWU))) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB (bwl+bwu)*(bwl+bwu) flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE((BWL+BWU)) $ *DBLE((BWL+BWU))) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( (BWL+BWU)*(BWL+BWU)*(BWL+BWU)/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*(BWL+BWU)*(BWL+BWU)*(BWL+BWU)) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+(bwl+bwu)) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE((BWL+BWU))) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * ((bwl+bwu)+(bwl+bwu)) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE((BWL+BWU))+ $ DBLE((BWL+BWU))))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * ((bwl+bwu)+(bwl+bwu)) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE((BWL+BWU))+ $ DBLE((BWL+BWU))))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*((BWL+BWU)*(BWL+BWU) ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * (BWL+BWU)*(BWL+BWU) ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PDGBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pdgbinfo.f000644 000766 000024 00000044335 10363532303 017416 0ustar00juliestaff000000 000000 SUBROUTINE PDGBINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PDGBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'T' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'T' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDGBINFO * END scalapack-2.0.2/TESTING/LIN/pdgbmv1.f000644 000766 000024 00000076017 10607174520 017174 0ustar00juliestaff000000 000000 SUBROUTINE PDGBDCMV( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS, $ B, IB, DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDDBTRF and this is stored in AF. If a linear system * is to be solved using PDDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M, $ DU_N_N, DU_P_M, DU_P_N, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM2, IDUM3, J, JA_NEW, $ LLDA, LLDB, MAX_BW, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PDDBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PDDBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDDBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = ZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (MAX_BW+2)*MAX_BW WORK( I ) = ZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( TRANS, 'N' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * * Use main partition in each processor to multiply locally * CALL DGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, ONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, ZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL DCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWL-DL_N_N ), 1 ) * CALL DTRMV( 'U', 'N', 'N', BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL DGESD2D( ICTXT, BWL, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWL, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = ZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL DCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * CALL DTRMV( $ 'L', $ 'N', $ 'N', BWU, $ A( 1 ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 30 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL DGESD2D( ICTXT, BWU, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL DGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL DAXPY( BWL, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL DGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL DAXPY( BWU, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWU+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( TRANS, 'T' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL DLATCPY( 'L', BWU, BWU, A( OFST+1 ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to left * CALL DTRSD2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL-1 ) * ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL DLATCPY( 'U', BWL, BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to right * CALL DTRSD2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL DGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, ONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, ZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL DCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWU-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL DTRRV2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), MAX_BW, MYROW, MYCOL+1 ) * CALL DTRMV( 'U', 'N', 'N', BWU, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL DGESD2D( ICTXT, BWU, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWU, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = ZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL DCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL DTRRV2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), MAX_BW, MYROW, MYCOL-1 ) * CALL DTRMV( $ 'L', $ 'N', $ 'N', BWL, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 60 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL DGESD2D( ICTXT, BWL, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL DGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL DAXPY( BWU, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL DGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL DAXPY( BWL, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWL+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PDBsBMV1 * END scalapack-2.0.2/TESTING/LIN/pdgelqrv.f000644 000766 000024 00000022775 10363532303 017456 0ustar00juliestaff000000 000000 SUBROUTINE PDGELQRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGELQRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from L, Q * computed by PDGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors L and Q computed * by PDGELQF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors computed by PDGELQF. TAU * is tied to the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IL, IN, $ IPT, IPV, IPW, J, JJA, JV, K, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PDLACPY, $ PDLARFB, PDLARFT, PDLASET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IL = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) * ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IL, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * CALL DESCSET( DESCV, DESCA( MB_ ), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * DO 10 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+K-I, DESCA( MB_ ) ) J = JA + I - IA JV = 1 + I - IA + ICOFF * * Compute upper triangular matrix T * CALL PDLARFT( 'Forward', 'Rowwise', N-J+JA, IB, A, I, J, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'Upper', IB, N-J+JA, A, I, J, DESCA, WORK( IPV ), $ 1, JV, DESCV ) CALL PDLASET( 'Lower', IB, N-J+JA, ZERO, ONE, WORK( IPV ), 1, $ JV, DESCV ) * * Zeroes the strict upper triangular part of sub( A ) to get * block column of L * CALL PDLASET( 'Upper', IB, N-J+JA-1, ZERO, ZERO, A, I, J+1, $ DESCA ) * * Apply block Householder transformation * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', $ M-I+IA, N-J+JA, IB, WORK( IPV ), 1, JV, DESCV, $ WORK( IPT ), A, I, J, DESCA, WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) * 10 CONTINUE * * Handle first block separately * IB = IN - IA + 1 * * Compute upper triangular matrix T * CALL PDLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, TAU, $ WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'Upper', IB, N, A, IA, JA, DESCA, WORK( IPV ), 1, $ ICOFF+1, DESCV ) CALL PDLASET( 'Lower', IB, N, ZERO, ONE, WORK, 1, ICOFF+1, DESCV ) * * Zeroes the strict upper triangular part of sub( A ) to get * block column of L * CALL PDLASET( 'Upper', IB, N-1, ZERO, ZERO, A, IA, JA+1, DESCA ) * * Apply block Householder transformation * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', M, N, $ IB, WORK( IPV ), 1, ICOFF+1, DESCV, WORK( IPT ), A, $ IA, JA, DESCA, WORK( IPW ) ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PDGELQRV * END scalapack-2.0.2/TESTING/LIN/pdgeqlrv.f000644 000766 000024 00000024040 10363532303 017441 0ustar00juliestaff000000 000000 SUBROUTINE PDGEQLRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEQLRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from L, Q * computed by PDGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors L and Q computed * by PDGEQLF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION, array, dimension LOCc(N_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PDGEQLF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK = NB_A * ( 2*Mp0 + Nq0 + NB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IIA, IPT, IPV, IPW, IROFF, $ IV, J, JB, JJA, JN, K, MP, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PDLACPY, $ PDLARFB, PDLARFT, PDLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) CALL INFOG2L( IA, JA+N-K, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPV = 1 IPT = IPV + MP * DESCA( NB_ ) IPW = IPT + DESCA( NB_ ) * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * CALL DESCSET( DESCV, M+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, MP ) ) * * Handle first block separately * IV = 1 + M - K + IROFF JB = JN - JA - N + K + 1 * * Compute upper triangular matrix T * CALL PDLARFT( 'Backward', 'Columnwise', M-N+JN-JA+1, JB, A, IA, $ JA+N-K, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'All', M-N+JN-JA+1, JB, A, IA, JA+N-K, DESCA, $ WORK( IPV ), IROFF+1, 1, DESCV ) CALL PDLASET( 'Lower', JB, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeoes the strict upper triangular part of A to get block * row of L * CALL PDLASET( 'All', M-K, JB, ZERO, ZERO, A, IA, JA+N-K, $ DESCA ) CALL PDLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA+M-K, $ JA+N-K+1, DESCA ) * * Apply block Householder transformation * CALL PDLARFB( 'Left', 'No transpose', 'Backward', 'Columnwise', $ M-N+JN-JA+1, JN-JA+1, JB, WORK( IPV ), IROFF+1, 1, $ DESCV, WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) * * Loop over the remaining column blocks * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) IV = 1 + M - N + J - JA + IROFF * * Compute upper triangular matrix T * CALL PDLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, A, IA, $ J, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'All', M-N+J+JB-JA, JB, A, IA, J, DESCA, $ WORK( IPV ), IROFF+1, 1, DESCV ) CALL PDLASET( 'Lower', JB, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeoes the strict upper triangular part of sub( A ) to get * block row of L * CALL PDLASET( 'All', M-N+J-JA, JB, ZERO, ZERO, A, IA, J, $ DESCA ) CALL PDLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA+M-N+J-JA, $ J+1, DESCA ) * * Apply block Householder transformation * CALL PDLARFB( 'Left', 'No transpose', 'Backward', 'Columnwise', $ M-N+J+JB-JA, J+JB-JA, JB, WORK( IPV ), IROFF+1, $ 1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PDGEQLRV * END scalapack-2.0.2/TESTING/LIN/pdgeqrrv.f000644 000766 000024 00000023441 10363532303 017453 0ustar00juliestaff000000 000000 SUBROUTINE PDGEQRRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEQRRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from Q, R * computed by PDGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors Q and R computed * by PDGEQRF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors computed by PDGEQRF. TAU * is tied to the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK = NB_A * ( 2*Mp0 + Nq0 + NB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, IIA, IPT, IPV, IPW, $ IROFF, IV, J, JB, JJA, JL, JN, K, MP, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PDLACPY, $ PDLARFB, PDLARFT, PDLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPV = 1 IPT = IPV + MP * DESCA( NB_ ) IPW = IPT + DESCA( NB_ ) * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * K = MIN( M, N ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JL = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) * CALL DESCSET( DESCV, M+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, INDXG2P( JL, DESCA( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ), ICTXT, $ MAX( 1, MP ) ) * DO 10 J = JL, JN+1, -DESCA( NB_ ) JB = MIN( JA+K-J, DESCA( NB_ ) ) I = IA + J - JA IV = 1 + J - JA + IROFF * * Compute upper triangular matrix T * CALL PDLARFT( 'Forward', 'Columnwise', M-I+IA, JB, A, I, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'Lower', M-I+IA, JB, A, I, J, DESCA, WORK( IPV ), $ IV, 1, DESCV ) CALL PDLASET( 'Upper', M-I+IA, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeroes the strict lower triangular part of sub( A ) to get * block column of R * CALL PDLASET( 'Lower', M-I+IA-1, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Apply block Householder transformation * CALL PDLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M-I+IA, N-J+JA, JB, WORK( IPV ), IV, 1, DESCV, $ WORK( IPT ), A, I, J, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle first block separately * JB = JN - JA + 1 * * Compute upper triangular matrix T * CALL PDLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPV ), $ IROFF+1, 1, DESCV ) CALL PDLASET( 'Upper', M, JB, ZERO, ONE, WORK, IROFF+1, 1, DESCV ) * * Zeroes the strict lower triangular part of sub( A ) to get block * column of R * CALL PDLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Apply block Householder transformation * CALL PDLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', M, $ N, JB, WORK( IPV ), IROFF+1, 1, DESCV, WORK( IPT ), $ A, IA, JA, DESCA, WORK( IPW ) ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PDGEQRRV * END scalapack-2.0.2/TESTING/LIN/pdgerqrv.f000644 000766 000024 00000023653 10363532303 017460 0ustar00juliestaff000000 000000 SUBROUTINE PDGERQRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGERQRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from R, Q * computed by PDGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors R and Q computed * by PDGERQF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION, array, dimension LOCr(M_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PDGERQF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IN, $ IPT, IPV, IPW, JJA, JV, K, MYCOL, MYROW, NPCOL, $ NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PDLACPY, $ PDLARFB, PDLARFT, PDLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) * ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA+M-K, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * CALL DESCSET( DESCV, DESCA( MB_), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * * Handle first block separately * IB = IN - IA - M + K + 1 JV = 1 + N - K + ICOFF * * Compute upper triangular matrix T * CALL PDLARFT( 'Backward', 'Rowwise', N-M+IN-IA+1, IB, A, IA+M-K, $ JA, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'All', IB, N-M+IN-IA+1, A, IA+M-K, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) CALL PDLASET( 'Upper', IB, IB, ZERO, ONE, WORK( IPV ), 1, JV, $ DESCV ) * * Zeoes the strict lower triangular part of sub( A ) to get block * column of R * CALL PDLASET( 'All', IB, N-K, ZERO, ZERO, A, IA+M-K, JA, $ DESCA ) CALL PDLASET( 'Lower', IB-1, IB, ZERO, ZERO, A, IA+M-K+1, $ JA+N-K, DESCA ) * * Apply block Householder transformation * CALL PDLARFB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ IN-IA+1, N-M+IN-IA+1, IB, WORK( IPV ), 1, ICOFF+1, $ DESCV, WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * * Loop over the remaining row blocks * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) JV = 1 + N - M + I - IA + ICOFF * * Compute upper triangular matrix T * CALL PDLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, I, JA, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'All', IB, N-M+I+IB-IA, A, I, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) CALL PDLASET( 'Upper', IB, IB, ZERO, ONE, WORK( IPV ), 1, JV, $ DESCV ) * * Zeoes the strict Lower triangular part of sub( A ) to get * block column of R * CALL PDLASET( 'All', IB, N-M+I-IA, ZERO, ZERO, A, I, JA, $ DESCA ) CALL PDLASET( 'Lower', IB-1, IB, ZERO, ZERO, A, I+1, $ JA+N-M+I-IA, DESCA ) * * Apply block Householder transformation * CALL PDLARFB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ I+IB-IA, N-M+I+IB-IA, IB, WORK( IPV ), 1, $ ICOFF+1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PDGERQRV * END scalapack-2.0.2/TESTING/LIN/pdgetrrv.f000644 000766 000024 00000030116 10363532303 017453 0ustar00juliestaff000000 000000 SUBROUTINE PDGETRRV( M, N, A, IA, JA, DESCA, IPIV, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDGETRRV reforms sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from the * triangular matrices L and U returned by PDGETRF. It multiplies * an upper triangular matrix stored in the upper triangle of sub( A ) * times the unit lower triangular matrix stored in the lower triangle. * To accomplish this, the routine basically performs the PDGETRF * routine in reverse. * * It computes L*U first, and then apply P: P*L*U => sub( A ). In the * J-th loop, the block column (or column panel), which has the lower * triangular unit matrix L is multiplied with the block row (or row * panel), which contains the upper triangular matrix U. * * ( L1 ) ( 0 0 ) ( L1*U1 L1*U2 ) * A` = L * U + A` = ( ) * (U1 U2) + ( ) = ( ) * ( L2 ) ( 0 A`) ( L2*U1 L2*U2+A` ) * * where L1 is a lower unit triangular matrix and U1 is an upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the distributed matrix sub( A ) * contains the the factors L and U from the factorization * sub( A ) = P*L*U; the unit diagonal elements of L are not * stored. On exit, the original distributed matrix sub( A ) * is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array of dimension (LWORK) * LWORK >= MpA0 * NB_A + NqA0 * MB_A, where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * WORK is used to store a block of columns of L, and a block of * rows of U. INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IL, IPL, IPU, IROFF, J, $ JB, JL, JN, MN, MP, MYCOL, MYROW, NPCOL, NPROW * .. Local Arrays .. INTEGER DESCIP( DLEN_ ), DESCL( DLEN_ ), $ DESCU( DLEN_ ), IDUM( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PDGEMM, PDLACPY, $ PDLAPIV, PDLASET, PB_TOPGET, PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPL = 1 IPU = IPL + MP * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Define array descriptors for L and U * MN = MIN( M, N ) IL = MAX( ( ( IA+MN-2 ) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) JL = MAX( ( ( JA+MN-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+MN-1 ) IAROW = INDXG2P( IL, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) * CALL DESCSET( DESCL, IA+M-IL, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, MP ) ) * CALL DESCSET( DESCU, DESCA( MB_ ), JA+N-JL, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * CALL DESCSET( DESCIP, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ NUMROC( DESCA( M_ ), DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) + DESCA( MB_ ) ) * * DO 10 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+MN-J, DESCA( NB_ ) ) * * Copy unit lower triangular part of sub( A ) into WORK * CALL PDLACPY( 'Lower', M-IL+IA, JB, A, IL, J, DESCA, $ WORK( IPL ), 1, 1, DESCL ) CALL PDLASET( 'Upper', M-IL+IA, JB, ZERO, ONE, WORK( IPL ), $ 1, 1, DESCL ) * * Copy upper triangular part of sub( A ) into WORK(IPU) * CALL PDLACPY( 'Upper', JB, JA+N-J, A, IL, J, DESCA, $ WORK( IPU ), 1, 1, DESCU ) CALL PDLASET( 'Lower', JB-1, JA+N-J, ZERO, ZERO, $ WORK( IPU ), 2, 1, DESCU ) * * Zero the strict lower triangular piece of the current block. * CALL PDLASET( 'Lower', IA+M-IL-1, JB, ZERO, ZERO, A, IL+1, J, $ DESCA ) * * Zero the upper triangular piece of the current block. * CALL PDLASET( 'Upper', JB, JA+N-J, ZERO, ZERO, A, IL, J, $ DESCA ) * * Update the matrix sub( A ). * CALL PDGEMM( 'No transpose', 'No transpose', IA+M-IL, $ JA+N-J, JB, ONE, WORK( IPL ), 1, 1, DESCL, $ WORK( IPU ), 1, 1, DESCU, ONE, A, IL, J, DESCA ) * IL = IL - DESCA( MB_ ) DESCL( M_ ) = DESCL( M_ ) + DESCL( MB_ ) DESCL( RSRC_ ) = MOD( DESCL( RSRC_ ) + NPROW - 1, NPROW ) DESCL( CSRC_ ) = MOD( DESCL( CSRC_ ) + NPCOL - 1, NPCOL ) DESCU( N_ ) = DESCU( N_ ) + DESCU( NB_ ) DESCU( RSRC_ ) = DESCL( RSRC_ ) DESCU( CSRC_ ) = DESCL( CSRC_ ) * 10 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Copy unit lower triangular part of sub( A ) into WORK * CALL PDLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPL ), $ 1, 1, DESCL ) CALL PDLASET( 'Upper', M, JB, ZERO, ONE, WORK( IPL ), 1, 1, $ DESCL ) * * Copy upper triangular part of sub( A ) into WORK(IPU) * CALL PDLACPY( 'Upper', JB, N, A, IA, JA, DESCA, WORK( IPU ), 1, $ 1, DESCU ) CALL PDLASET( 'Lower', JB-1, N, ZERO, ZERO, WORK( IPU ), 2, 1, $ DESCU ) * * Zero the strict lower triangular piece of the current block. * CALL PDLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Zero the upper triangular piece of the current block. * CALL PDLASET( 'Upper', JB, N, ZERO, ZERO, A, IA, JA, DESCA ) * * Update the matrix sub( A ). * CALL PDGEMM( 'No transpose', 'No transpose', M, N, JB, ONE, $ WORK( IPL ), 1, 1, DESCL, WORK( IPU ), 1, 1, $ DESCU, ONE, A, IA, JA, DESCA ) * * Apply pivots so that sub( A ) = P*L*U * CALL PDLAPIV( 'Backward', 'Row', 'Col', MIN( M, N ), N, A, IA, JA, $ DESCA, IPIV, IA, 1, DESCIP, IDUM ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PDGETRRV * END scalapack-2.0.2/TESTING/LIN/pdinvchk.f000644 000766 000024 00000034045 10363532303 017431 0ustar00juliestaff000000 000000 SUBROUTINE PDINVCHK( MATTYP, N, A, IA, JA, DESCA, IASEED, ANORM, $ FRESID, RCOND, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, IASEED, JA, N DOUBLE PRECISION ANORM, FRESID, RCOND * .. * .. Array Arguments .. CHARACTER*3 MATTYP INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDINVCHK computes the scaled residual * * || sub( A ) * inv( sub( A ) ) - I || / ( || sub( A ) || * N * eps ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1). to check the result * returned by the matrix inversion routines. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MATTYP (global input) CHARACTER*3 * The type of the distributed matrix to be generated: * if MATTYP = 'GEN' then GENeral matrix, * if MATTYP = 'UTR' then Upper TRiangular matrix, * if MATTYP = 'LTR' then Lower TRiangular matrix, * if MATTYP = 'UPD' then (Upper) symmetric Positive Definite, * if MATTYP = 'LPD' then (Lower) symmetric Positive Definite, * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of local dimension (LLD_A, LOCc(JA+N-1)). On * entry, sub( A ) contains the distributed matrix inverse * computed by PDGETRI, PDPOTRI or PDTRTRI. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * Seed for the random generation of sub( A ). * * ANORM (global input) DOUBLE PRECISION * The 1-norm of the original matrix sub( A ). * * FRESID (global output) DOUBLE PRECISION * The inversion residual. * * RCOND (global output) DOUBLE PRECISION * The condition number of the original distributed matrix. * RCOND = || sub( A ) ||.|| sub( A )^{-1} || where ||A|| * denotes the 1-norm of A. * * WORK (local workspace) DOUBLE PRECISION array, dimension * MAX(2*LOCr(N_A+MOD(IA-1,MB_A))*MB_A, LDW) * where LDW is the workspace requirement for the norm computa- * tions, see PDLANGE, PDLANSY and PDLANTR. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. CHARACTER AFORM, DIAG, UPLO INTEGER ICTXT, ICURCOL, ICURROW, II, IIA, IPW, IROFF, $ IW, J, JB, JJA, JN, KK, MYCOL, MYROW, NP, $ NPCOL, NPROW DOUBLE PRECISION AUXNORM, EPS, NRMINVAXA, TEMP * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PDGEMM, $ PDLASET, PDMATGEN, PDSYMM, PDTRMM * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE, PDLANSY, PDLANTR EXTERNAL ICEIL, LSAMEN, NUMROC, PDLAMCH, PDLANGE, $ PDLANSY, PDLANTR * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * EPS = PDLAMCH( DESCA( CTXT_ ), 'eps' ) * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Compute the condition number * IF( LSAMEN( 1, MATTYP( 1:1 ), 'U' ) ) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * AFORM = 'N' DIAG = 'D' AUXNORM = PDLANGE( '1', N, N, A, IA, JA, DESCA, WORK ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN * AFORM = 'N' DIAG = 'D' AUXNORM = PDLANTR( '1', UPLO, 'Non unit', N, N, A, IA, JA, $ DESCA, WORK ) ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * AFORM = 'S' DIAG = 'D' AUXNORM = PDLANSY( '1', UPLO, N, A, IA, JA, DESCA, WORK ) * END IF RCOND = ANORM*AUXNORM * * Compute inv(A)*A * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ ICURROW, ICURCOL ) * * Define array descriptor for working array WORK * IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, ICURROW, NPROW ) CALL DESCSET( DESCW, N+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), ICURROW, ICURCOL, DESCA( CTXT_ ), $ MAX( 1, NP ) ) IPW = DESCW( LLD_ ) * DESCW( NB_ ) + 1 * IF( MYROW.EQ.ICURROW ) THEN II = IROFF + 1 NP = NP - IROFF ELSE II = 1 END IF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN - JA + 1 * * Handle first block separately, regenerate a block of columns of A * IW = IROFF + 1 IF( MYCOL.EQ.ICURCOL ) THEN IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN CALL PDMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), WORK, $ DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, IIA-1, NP, $ JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) IF( LSAMEN( 3, MATTYP, 'UTR' ) ) THEN CALL PDLASET( 'Lower', N-1, JB, ZERO, ZERO, WORK, IW+1, $ 1, DESCW ) ELSE CALL PDLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, WORK, IW, $ 2, DESCW ) END IF ELSE CALL PDMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), WORK( IPW ), $ DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, $ IIA-1, NP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) END IF END IF * * Multiply A^{-1}*A * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * CALL PDGEMM( 'No tranpose', 'No transpose', N, JB, N, ONE, A, $ IA, JA, DESCA, WORK( IPW ), IW, 1, DESCW, ZERO, $ WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN * CALL PDTRMM( 'Left', UPLO, 'No tranpose', 'Non unit', N, JB, $ ONE, A, IA, JA, DESCA, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * CALL PDSYMM( 'Left', UPLO, N, JB, ONE, A, IA, JA, DESCA, $ WORK( IPW ), IW, 1, DESCW, ZERO, WORK, IW, 1, $ DESCW ) * END IF * * subtract the identity matrix to the diagonal block of these cols * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 10 KK = 0, JB-1 WORK( II+KK*(DESCW(LLD_)+1) ) = $ WORK( II+KK*(DESCW( LLD_ )+1) )-ONE 10 CONTINUE END IF * NRMINVAXA = PDLANGE( '1', N, JB, WORK, IW, 1, DESCW, WORK( IPW ) ) * IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJA = JJA + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) DESCW( CSRC_ ) = ICURCOL * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) * JB = MIN( N-J+JA, DESCA( NB_ ) ) * * regenerate a block of columns of A * IF( MYCOL.EQ.ICURCOL ) THEN IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN CALL PDMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCW( MB_ ), DESCW( NB_ ), $ WORK, DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), $ IASEED, IIA-1, NP, JJA-1, JB, MYROW, $ MYCOL, NPROW, NPCOL ) IF( LSAMEN( 3, MATTYP, 'UTR' ) ) THEN CALL PDLASET( 'Lower', JA+N-J-1, JB, ZERO, ZERO, $ WORK, IW+J-JA+1, 1, DESCW ) ELSE CALL PDLASET( 'All', J-JA, JB, ZERO, ZERO, WORK, IW, $ 1, DESCW ) CALL PDLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, $ WORK, IW+J-JA, 2, DESCW ) END IF ELSE CALL PDMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCW( MB_ ), DESCW( NB_ ), $ WORK( IPW ), DESCW( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, NP, $ JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) END IF END IF * * Multiply A^{-1}*A * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * CALL PDGEMM( 'No tranpose', 'No transpose', N, JB, N, ONE, $ A, IA, JA, DESCA, WORK( IPW ), IW, 1, DESCW, $ ZERO, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP(2:3), 'TR' ) ) THEN * CALL PDTRMM( 'Left', UPLO, 'No tranpose', 'Non unit', N, JB, $ ONE, A, IA, JA, DESCA, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * CALL PDSYMM( 'Left', UPLO, N, JB, ONE, A, IA, JA, DESCA, $ WORK(IPW), IW, 1, DESCW, ZERO, WORK, IW, 1, $ DESCW ) * END IF * * subtract the identity matrix to the diagonal block of these * cols * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 20 KK = 0, JB-1 WORK( II+KK*(DESCW( LLD_ )+1) ) = $ WORK( II+KK*(DESCW( LLD_ )+1) ) - ONE 20 CONTINUE END IF * * Compute the 1-norm of these JB cols * TEMP = PDLANGE( '1', N, JB, WORK, IW, 1, DESCW, WORK( IPW ) ) NRMINVAXA = MAX( TEMP, NRMINVAXA ) * IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJA = JJA + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) DESCW( CSRC_ ) = ICURCOL * 30 CONTINUE * * Compute the scaled residual * FRESID = NRMINVAXA / ( N * EPS * ANORM ) * RETURN * * End of PDINVCHK * END scalapack-2.0.2/TESTING/LIN/pdinvdriver.f000644 000766 000024 00000077604 10430450246 020167 0ustar00juliestaff000000 000000 PROGRAM PDINVDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PDINVDRIVER is the main test program for the DOUBLE PRECISION * SCALAPACK matrix inversion routines. This test driver computes the * inverse of different kind of matrix and tests the results. * * The program must be driven by a short data file. An annotated example * of a data file can be obtained by deleting the first 3 characters * from the following 14 lines: * 'ScaLAPACK Matrix Inversion Testing input file' * 'PVM machine.' * 'INV.out' output file name (if any) * 6 device out * 5 number of matrix types (next line) * 'GEN' 'UTR' 'LTR' 'UPD' LPD' GEN, UTR, LTR, UPD, LPD * 4 number of problems sizes * 1000 2000 3000 4000 values of N * 3 number of NB's * 4 30 35 values of NB * 2 number of process grids (ordered P & Q) * 4 2 values of P * 4 4 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, INTGSZ, MEMSIZ, NTESTS, TOTMEM DOUBLE PRECISION PADVAL, ZERO PARAMETER ( DBLESZ = 8, INTGSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER UPLO CHARACTER*3 MTYP CHARACTER*6 PASSED CHARACTER*80 OUTFILE LOGICAL CHECK INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, $ IPPIV, IPREPAD, IPOSTPAD, IPIW, IPW, ITEMP, J, $ K, KTESTS, KPASS, KFAIL, KSKIP, L, LCM, LIPIV, $ LIWORK, LWORK, MYCOL, MYROW, N, NB, NGRIDS, $ NMAT, NMTYP, NNB, NOUT, NP, NPCOL, NPROCS, $ NPROW, NQ, WORKIINV, WORKINV, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, RCOND, TMFLOPS * .. * .. Local Arrays .. CHARACTER*3 MATTYP( NTESTS ) INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), $ QVAL( NTESTS ) DOUBLE PRECISION MEM( MEMSIZ ), CTIME( 2 ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDCHEKPAD, $ PDFILLPAD, PDGETRF, PDGETRI, $ PDINVCHK, PDINVINFO, PDLASET, $ PDMATGEN, PDPOTRF, PDPOTRI, $ PDTRTRI, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PDLANGE, PDLANSY, PDLANTR EXTERNAL ICEIL, ILCM, LSAMEN, NUMROC, PDLANGE, $ PDLANSY, PDLANTR * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MOD * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP /4*0/ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PDINVINFO( OUTFILE, NOUT, NMTYP, MATTYP, NTESTS, NMAT, NVAL, $ NTESTS, NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Loop over the different matrix types * DO 40 I = 1, NMTYP * MTYP = MATTYP( I ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a general matrix.' ELSE IF( LSAMEN( 3, MTYP, 'UTR' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is an upper triangular matrix.' ELSE IF( LSAMEN( 3, MTYP, 'LTR' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a lower triangular matrix.' ELSE IF( LSAMEN( 3, MTYP, 'UPD' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a symmetric positive definite matrix.' WRITE( NOUT, FMT = 9986 ) $ 'Only the upper triangular part will be '// $ 'referenced.' ELSE IF( LSAMEN( 3, MTYP, 'LPD' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a symmetric positive definite matrix.' WRITE( NOUT, FMT = 9986 ) $ 'Only the lower triangular part will be '// $ 'referenced.' END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 J = 1, NGRIDS * NPROW = PVAL( J ) NPCOL = QVAL( J ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 K = 1, NMAT * N = NVAL( K ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 L = 1, NNB * NB = NBVAL( L ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for ScaLAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 * LCM = ILCM( NPROW, NPCOL ) IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * Pivots are needed by LU factorization * IPPIV = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + $ IPREPAD LIPIV = ICEIL( INTGSZ * ( NP + NB ), DBLESZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * LWORK = MAX( 1, NP * DESCA( NB_ ) ) WORKINV = LWORK + IPOSTPAD * * Figure the amount of workspace required by the * general matrix inversion * IF( NPROW.EQ.NPCOL ) THEN LIWORK = NQ + DESCA( NB_ ) ELSE * * change the integer workspace needed for PDGETRI * LIWORK = MAX( DESCA( NB_ ), DESCA( MB_ ) * * $ ICEIL( ICEIL( DESCA( LLD_ ), * $ DESCA( MB_ ) ), LCM / NPROW ) ) * $ + NQ LIWORK = NUMROC( DESCA( M_ ) + $ DESCA( MB_ ) * NPROW $ + MOD ( 1 - 1, DESCA( MB_ ) ), DESCA ( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ) + $ MAX ( DESCA( MB_ ) * ICEIL ( ICEIL( $ NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW, $ DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ), $ DESCA( MB_ ) ), LCM / NPROW ), DESCA( NB_ ) ) * END IF WORKIINV = ICEIL( LIWORK*INTGSZ, DBLESZ ) + $ IPOSTPAD IPIW = IPW + WORKINV + IPREPAD WORKSIZ = WORKINV + IPREPAD + WORKIINV * ELSE * * No pivots or workspace needed for triangular or * symmetric positive definite matrices. * IPW = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + IPREPAD WORKSIZ = 1 + IPOSTPAD * END IF * IF( CHECK ) THEN * * Figure amount of work space for the norm * computations * IF( LSAMEN( 3, MTYP, 'GEN' ).OR. $ LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN ITEMP = NQ ELSE ITEMP = 2 * NQ + NP IF( NPROW.NE.NPCOL ) THEN ITEMP = ITEMP + $ NB * ICEIL( ICEIL( NP, NB ), $ LCM / NPROW ) END IF END IF WORKSIZ = MAX( WORKSIZ-IPOSTPAD, ITEMP ) * * Figure the amount of workspace required by the * checking routine * WORKSIZ = MAX( WORKSIZ, 2 * NB * MAX( 1, NP ) ) + $ IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'inversion', $ ( IPW + WORKSIZ ) * DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( LSAMEN( 3, MTYP, 'GEN' ).OR. $ LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * Generate a general diagonally dominant matrix A * CALL PDMATGEN( ICTXT, 'N', 'D', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, $ NQ, MYROW, MYCOL, NPROW, NPCOL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * Generate a symmetric positive definite matrix * CALL PDMATGEN( ICTXT, 'S', 'D', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, $ NQ, MYROW, MYCOL, NPROW, NPCOL ) * END IF * * Zeros not-referenced part of A, if any. * IF( LSAMEN( 1, MTYP, 'U' ) ) THEN * UPLO = 'U' CALL PDLASET( 'Lower', N-1, N-1, ZERO, ZERO, $ MEM( IPA ), 2, 1, DESCA ) * ELSE IF( LSAMEN( 1, MTYP, 'L' ) ) THEN * UPLO = 'L' CALL PDLASET( 'Upper', N-1, N-1, ZERO, ZERO, $ MEM( IPA ), 1, 2, DESCA ) * ELSE * UPLO = 'G' * END IF * * Need 1-norm of A for checking * IF( CHECK ) THEN * CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * CALL PDFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANGE( '1', N, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKINV-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKIINV-IPOSTPAD, 1, $ MEM( IPIW-IPREPAD ), $ WORKIINV-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * ANORM = PDLANTR( '1', UPLO, 'Non unit', N, N, $ MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANTR', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANTR', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * ANORM = PDLANSY( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'SY' ) ) THEN * CALL PDFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANSY( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD,IPOSTPAD, PADVAL ) * END IF * END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * Perform LU factorization * CALL SLTIMER( 1 ) CALL PDGETRF( N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGETRF', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRF', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * Perform the general matrix inversion * CALL SLTIMER( 2 ) CALL PDGETRI( N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ), LWORK, $ MEM( IPIW ), LIWORK, INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGETRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRI', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRI', $ WORKIINV-IPOSTPAD, 1, $ MEM( IPIW-IPREPAD ), $ WORKIINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRI', $ WORKINV-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * Perform the general matrix inversion * CALL SLTIMER( 2 ) CALL PDTRTRI( UPLO, 'Non unit', N, MEM( IPA ), 1, $ 1, DESCA, INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDTRTRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * Perform Cholesky factorization * CALL SLTIMER( 1 ) CALL PDPOTRF( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDPOTRF', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * Perform the symmetric positive definite matrix * inversion * CALL SLTIMER( 2 ) CALL PDPOTRI( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDPOTRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * END IF * IF( CHECK ) THEN * CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Compute fresid = || inv(A)*A-I || * CALL PDINVCHK( MTYP, N, MEM( IPA ), 1, 1, DESCA, $ IASEED, ANORM, FRESID, RCOND, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDINVCHK', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDINVCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. INFO.EQ.0 .AND. $ ( (FRESID-FRESID) .EQ. 0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 IF( INFO.GT.0 ) THEN PASSED = 'SINGUL' ELSE PASSED = 'FAILED' END IF END IF * ELSE * * Don't perform the checking, only the timing * operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * 2/3 N^3 - 1/2 N^2 flops for LU factorization * NOPS = ( 2.0D+0 / 3.0D+0 )*( DBLE( N )**3 ) - $ ( 1.0D+0 / 2.0D+0 )*( DBLE( N )**2 ) * * 4/3 N^3 - N^2 flops for inversion * NOPS = NOPS + $ ( 4.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) - $ DBLE( N )**2 * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * 1/3 N^3 + 2/3 N flops for triangular inversion * CTIME(1) = 0.0D+0 WTIME(1) = 0.0D+0 NOPS = ( 1.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ ( 2.0D+0 / 3.0D+0 ) * ( DBLE( N ) ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * 1/3 N^3 + 1/2 N^2 flops for Cholesky * factorization * NOPS = ( 1.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ ( 1.0D+0 / 2.0D+0 ) * ( DBLE( N )**2 ) * * 2/3 N^3 + 1/2 N^2 flops for Cholesky inversion * NOPS = NOPS + $ ( 2.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ ( 1.0D+0 / 2.0D+0 ) * ( DBLE( N )**2 ) * END IF * * Figure total megaflops -- factorization and * inversion, for WALL and CPU time, and print * output. * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 2 ) .GE. 0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ RCOND, FRESID, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ) + CTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ) .GE. 0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ RCOND, FRESID, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * 40 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q Fct Time Inv Time ', $ ' MFLOPS Cond Resid CHECK' ) 9994 FORMAT( '---- ----- --- ----- ----- -------- -------- ', $ '----------- ------- ------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I5, 1X, I5, 1X, F8.2, 1X, F8.2, $ 1X, F11.2, 1X, F7.1, 1X, F7.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( A ) * STOP * * End of PDINVDRIVER * END scalapack-2.0.2/TESTING/LIN/pdinvinfo.f000644 000766 000024 00000035254 10363532303 017622 0ustar00juliestaff000000 000000 SUBROUTINE PDINVINFO( SUMMRY, NOUT, NMTYP, MATTYP, LDMTYP, NMAT, $ NVAL, LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, $ PVAL, LDPVAL, QVAL, LDQVAL, THRESH, WORK, $ IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDMTYP, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NMTYP, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. CHARACTER*3 MATTYP( LDMTYP ) CHARACTER*( * ) SUMMRY INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PDINVINFO gets needed startup information for matrix inversion * tests and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMTYP (global output) INTEGER * The number of different matrix types to be tested. * * MATTYP (global output) CHARACTER*3 array of dimension of LDMTYP, * The types of matrix to be generated: * if MATTYP(i) = 'GEN' then GENeral matrix, * if MATTYP(i) = 'UTR' then Upper TRiangular matrix, * if MATTYP(i) = 'LTR' then Lower TRiangular matrix, * if MATTYP(i) = 'UPD' then (Upper) symmetric Pos. Definite, * if MATTYP(i) = 'LPD' then (Lower) symmetric Pos. Definite, * * LDMTYP (global input) INTEGER * The maximum number of different matrix types to be tested. * LDMTYP >= NMTYP. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 4, LDMTYP+LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack * all input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT, K DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAMEN DOUBLE PRECISION PDLAMCH EXTERNAL LSAMEN, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='INV.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get the matrix types to be tested * READ( NIN, FMT = * ) NMTYP IF( NMTYP.LT.1 .OR. NMTYP.GT.LDMTYP ) THEN WRITE( NOUT, FMT = 9994 ) 'nb of matrix types', LDMTYP GO TO 40 END IF READ( NIN, FMT = * ) ( MATTYP( I ), I = 1, NMTYP ) * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 40 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 40 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 40 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 40 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I ) * QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS WORK( 4 ) = NMTYP CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 DO 20 K = 1, NMTYP IF( LSAMEN( 3, MATTYP( K ), 'GEN' ) ) THEN WORK( I ) = 1 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'UTR' ) ) THEN WORK( I ) = 2 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'LTR' ) ) THEN WORK( I ) = 3 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'UPD' ) ) THEN WORK( I ) = 4 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'LPD' ) ) THEN WORK( I ) = 5 I = I + 1 END IF 20 CONTINUE * CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Matrix Inversion routines.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision Matrix Inversion '// $ 'routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Inverse residual = ||inv(A)*A - I|| '// $ '/ (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less '// $ 'than THRESH, CHECK is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'Fct time : Time in seconds to factor the'// $ ' matrix, if needed.' WRITE( NOUT, FMT = 9999 ) $ 'Inv Time : Time in seconds to inverse the'// $ ' matrix.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and inverse.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) NMTYP = WORK( 4 ) * I = NMTYP+NMAT+NNB+2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * DO 30 K = 1, NMTYP IF( WORK( K ).EQ.1 ) THEN MATTYP( K ) = 'GEN' ELSE IF( WORK( K ).EQ.2 ) THEN MATTYP( K ) = 'UTR' ELSE IF( WORK( K ).EQ.3 ) THEN MATTYP( K ) = 'LTR' ELSE IF( WORK( K ).EQ.4 ) THEN MATTYP( K ) = 'UPD' ELSE IF( WORK( K ).EQ.5 ) THEN MATTYP( K ) = 'LPD' END IF 30 CONTINUE * I = NMTYP + 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 40 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDINVINFO * END scalapack-2.0.2/TESTING/LIN/pdlafchk.f000644 000766 000024 00000022260 10363532303 017373 0ustar00juliestaff000000 000000 SUBROUTINE PDLAFCHK( AFORM, DIAG, M, N, A, IA, JA, DESCA, IASEED, $ ANORM, FRESID, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER AFORM, DIAG INTEGER IA, IASEED, JA, M, N DOUBLE PRECISION ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDLAFCHK computes the residual * || sub( A ) - sub( Ao ) || / (|| sub( Ao ) ||*eps*MAX(M,N)), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A( IA:IA+M-1, JA:JA+N-1 ) and ||.|| stands for the infini- * ty norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * AFORM (global input) CHARACTER * sub( A ) is overwritten with: * - a symmetric matrix, if AFORM = 'S'; * - a Hermitian matrix, if AFORM = 'H'; * - the transpose of what would normally be generated, * if AFORM = 'T'; * - the conjugate transpose of what would normally be * generated, if AFORM = 'C'; * - otherwise a random matrix. * * DIAG (global input) CHARACTER * if DIAG = 'D' : sub( A ) is diagonally dominant. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub( A ) - sub( Ao ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * ANORM (global input) DOUBLE PRECISION * The Infinity norm of sub( A ). * * FRESID (global output) DOUBLE PRECISION * The maximum (worst) factorizational error. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK). * LWORK >= MpA0 * NB_A, where * * IROFFA = MOD( IA-1, MB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * WORK is used to store a block of columns of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IOFFA, IROFF, $ JB, JJ, JJA, JN, LDA, LDW, MP, MYCOL, MYROW, $ NPCOL, NPROW, NQ DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DMATADD, INFOG2L, PDMATGEN * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL ICEIL, NUMROC, PDLAMCH, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( ICTXT, 'eps' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * * Compute sub( A ) := sub( A ) - sub( Ao ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDW = MAX( 1, MP ) LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA - 1 )*LDA * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN JB = JN-JA+1 CALL PDMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ MP, JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL DMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) JJA = JJA + JB NQ = NQ - JB IOFFA = IOFFA + JB * LDA END IF * * Handle the remaning blocks of columns * DO 10 JJ = JJA, JJA+NQ-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JJA+NQ-JJ ) IOFFA = IIA + ( JJ - 1 ) * LDA CALL PDMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ MP, JJ-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL DMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) 10 CONTINUE * * Calculate factor residual * FRESID = PDLANGE( 'I', M, N, A, IA, JA, DESCA, WORK ) / $ ( MAX( M, N ) * EPS * ANORM ) * RETURN * * End PDLAFCHK * END scalapack-2.0.2/TESTING/LIN/pdlaschk.f000644 000766 000024 00000030241 10363532303 017406 0ustar00juliestaff000000 000000 SUBROUTINE PDLASCHK( SYMM, DIAG, N, NRHS, X, IX, JX, DESCX, $ IASEED, IA, JA, DESCA, IBSEED, ANORM, RESID, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, SYMM INTEGER IA, IASEED, IBSEED, IX, JA, JX, N, NRHS DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION WORK( * ), X( * ) * .. * * Purpose * ======= * * PDLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed matrix, * otherwise sub( A ) is a general distributed matrix. * * DIAG (global input) CHARACTER * If DIAG = 'D', sub( A ) is diagonally dominant. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 0. * * X (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_X,LOCc(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK >= MAX(1,Np)*NB_X + Nq*NB_X + MAX( MAX(NQ*MB_A,2*NB_X), * NB_X * NUMROC( NUMROC(N,MB_X,0,0,NPCOL), MB_X, 0, 0, LCMQ ) ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IB, ICOFF, ICTXT, ICURCOL, IDUMM, $ II, IIA, IIX, IOFFX, IPA, IPB, IPW, IPX, IROFF, $ IXCOL, IXROW, J, JBRHS, JJ, JJA, JJX, LDX, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION BETA, DIVISOR, EPS, RESID1 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, DGEBR2D, $ DGEBS2D, DGEMM, DGERV2D, DGESD2D, $ DGSUM2D, DLASET, PBDTRAN, PDMATGEN * .. * .. External Functions .. INTEGER IDAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IDAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * LDX = MAX( 1, NP ) IPB = 1 IPX = IPB + NP * DESCX( NB_ ) IPA = IPX + NQ * DESCX( NB_ ) * IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF * ICURCOL = IXCOL * * Loop over the rhs * DO 40 J = 1, NRHS, DESCX( NB_ ) JBRHS = MIN( DESCX( NB_ ), NRHS-J+1 ) * * Transpose x from ICURCOL to all rows * IOFFX = IIX + ( JJX - 1 ) * DESCX( LLD_ ) CALL PBDTRAN( ICTXT, 'Column', 'Transpose', N, JBRHS, $ DESCX( MB_ ), X( IOFFX ), DESCX( LLD_ ), ZERO, $ WORK( IPX ), JBRHS, IXROW, ICURCOL, -1, IACOL, $ WORK( IPA ) ) * * Regenerate B in IXCOL * IF( MYCOL.EQ.ICURCOL ) THEN CALL PDMATGEN( ICTXT, 'N', 'N', DESCX( M_ ), DESCX( N_ ), $ DESCX( MB_ ), DESCX( NB_ ), WORK( IPB ), LDX, $ IXROW, IXCOL, IBSEED, IIX-1, NP, JJX-1, $ JBRHS, MYROW, MYCOL, NPROW, NPCOL ) BETA = ONE ELSE BETA = ZERO END IF * IF( NQ.GT.0 ) THEN DO 10 II = IIA, IIA+NP-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IIA+NP-II ) * * Regenerate ib rows of the matrix A(IA:IA+N-1,JA:JA+N-1). * CALL PDMATGEN( ICTXT, SYMM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ WORK( IPA ), IB, DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, II-1, IB, $ JJA-1, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Compute B <= B - A * X. * CALL DGEMM( 'No transpose', 'Transpose', IB, JBRHS, NQ, $ -ONE, WORK( IPA ), IB, WORK( IPX ), JBRHS, $ BETA, WORK( IPB+II-IIA ), LDX ) * 10 CONTINUE * ELSE IF( MYCOL.NE.ICURCOL ) THEN * CALL DLASET( 'All', NP, JBRHS, ZERO, ZERO, WORK( IPB ), $ LDX ) * END IF * * Add B rowwise to ICURCOL * CALL DGSUM2D( ICTXT, 'Row', ' ', NP, JBRHS, WORK( IPB ), LDX, $ MYROW, ICURCOL ) * IF( MYCOL.EQ.ICURCOL ) THEN * * Figure || A * X - B || & || X || * IPW = IPA + JBRHS DO 20 JJ = 0, JBRHS - 1 IF( NP.GT.0 ) THEN II = IDAMAX( NP, WORK( IPB+JJ*LDX ), 1 ) WORK( IPA+JJ ) = ABS( WORK( IPB+II-1+JJ*LDX ) ) WORK( IPW+JJ ) = ABS( X( IOFFX + IDAMAX( NP, $ X( IOFFX + JJ*DESCX( LLD_ ) ), 1 )-1+JJ* $ DESCX( LLD_ ) ) ) ELSE WORK( IPA+JJ ) = ZERO WORK( IPW+JJ ) = ZERO END IF 20 CONTINUE * * After DGAMX2D computation, * WORK(IPB) has the maximum of || Ax - b ||, and * WORK(IPX) has the maximum of || X ||. * CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 2*JBRHS, $ WORK( IPA ), 1, IDUMM, IDUMM, -1, 0, ICURCOL ) * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * IF( MYROW.EQ.0 ) THEN DO 30 JJ = 0, JBRHS - 1 RESID1 = WORK( IPA+JJ ) / ( WORK( IPW+JJ )*DIVISOR ) IF( RESID.LT.RESID1 ) $ RESID = RESID1 30 CONTINUE IF( MYCOL.NE.0 ) $ CALL DGESD2D( ICTXT, 1, 1, RESID, 1, 0, 0 ) END IF * ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * CALL DGERV2D( ICTXT, 1, 1, RESID1, 1, 0, ICURCOL ) IF( RESID.LT.RESID1 ) $ RESID = RESID1 * END IF * IF( MYCOL.EQ.ICURCOL ) $ JJX = JJX + JBRHS ICURCOL = MOD( ICURCOL+1, NPCOL ) * 40 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, RESID, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, RESID, 1, 0, 0 ) END IF * RETURN * * End of PDLASCHK * END scalapack-2.0.2/TESTING/LIN/pdlltdriver.f000644 000766 000024 00000110226 10363532303 020152 0ustar00juliestaff000000 000000 PROGRAM PDLLTDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PDLLTDRIVER is the main test program for the DOUBLE PRECISION * ScaLAPACK Cholesky routines. This test driver performs an * A = L*L**T or A = U**T*U factorization and solve, and optionally * performs condition estimation and iterative refinement. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'ScaLAPACK LLt factorization input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LLT.out' output file name (if any) * 6 device out * 'U' define Lower or Upper * 1 number of problems sizes * 31 100 200 values of N * 1 number of NB's * 2 10 24 values of NB * 1 number of NRHS's * 1 values of NRHS * 1 Number of NBRHS's * 1 values of NBRHS * 1 number of process grids (ordered pairs of P & Q) * 2 values of P * 2 values of Q * 1.0 threshold * T (T or F) Test Cond. Est. and Iter. Ref. Routines * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, INTGSZ, MEMSIZ, NTESTS, TOTMEM DOUBLE PRECISION PADVAL, ZERO PARAMETER ( DBLESZ = 8, INTGSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, EST CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, IMIDPAD, $ INFO, IPA, IPA0, IPB, IPB0, IPBERR, IPFERR, $ IPREPAD, IPOSTPAD, IPW, IPW2, ITEMP, J, K, $ KFAIL, KK, KPASS, KSKIP, KTESTS, LCM, LCMQ, $ LIWORK, LWORK, LW2, MYCOL, MYRHS, MYROW, N, NB, $ NBRHS, NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, $ NPCOL, NPROCS, NPROW, NQ, NRHS, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, ANORM1, FRESID, NOPS, RCOND, $ SRESID, SRESID2, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), MEM( MEMSIZ ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, DESCINIT, $ IGSUM2D, BLACS_PINFO, PDCHEKPAD, PDFILLPAD, $ PDLAFCHK, PDLASCHK, PDLLTINFO, $ PDMATGEN, PDPOCON, PDPORFS, $ PDPOTRF, PDPOTRRV, PDPOTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PDLANSY EXTERNAL ICEIL, ILCM, LSAME, NUMROC, PDLANSY * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 CALL PDLLTINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NNR, NRVAL, NTESTS, NNBR, NBRVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, EST, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 50 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 50 * DO 40 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 40 END IF * DO 30 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IF( EST ) THEN IPA0 = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPA0 + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD ELSE IPW = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD END IF * * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines PDLAFCHK, PDPOTRRV, and * PDLANSY * WORKSIZ = NP * DESCA( NB_ ) * WORKSIZ = MAX( WORKSIZ, DESCA( MB_ ) * DESCA( NB_ ) ) * LCM = ILCM( NPROW, NPCOL ) ITEMP = MAX( 2, 2 * NQ ) + NP IF( NPROW.NE.NPCOL ) THEN ITEMP = ITEMP + $ NB * ICEIL( ICEIL( NP, NB ), LCM / NPROW ) END IF WORKSIZ = MAX( WORKSIZ, ITEMP ) WORKSIZ = WORKSIZ + IPOSTPAD * ELSE * WORKSIZ = IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate a symmetric positive definite matrix A * CALL PDMATGEN( ICTXT, 'Symm', 'Diag', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANSY( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) ANORM1 = PDLANSY( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( EST ) THEN CALL PDMATGEN( ICTXT, 'Symm', 'Diag', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA0 ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) IF( CHECK ) $ CALL PDFILLPAD( ICTXT, NP, NQ, $ MEM( IPA0-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform LLt factorization * CALL SLTIMER( 1 ) * CALL PDPOTRF( UPLO, N, MEM( IPA ), 1, 1, DESCA, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PDPOTRF INFO=', INFO KFAIL = KFAIL + 1 RCOND = ZERO GO TO 60 END IF * IF( CHECK ) THEN * * Check for memory overwrite in LLt factorization * CALL PDCHEKPAD( ICTXT, 'PDPOTRF', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( EST ) THEN * * Calculate workspace required for PDPOCON * LWORK = MAX( 1, 2*NP ) + MAX( 1, 2*NQ ) + $ MAX( 2, DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), $ NQ + DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LIWORK = MAX( 1, NP ) LW2 = ICEIL( LIWORK*INTGSZ, DBLESZ ) + IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'cond est', $ ( IPW2+LW2 )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 60 END IF * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Compute condition number of the matrix * CALL PDPOCON( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ ANORM1, RCOND, MEM( IPW ), LWORK, $ MEM( IPW2 ), LIWORK, INFO ) * IF( CHECK ) THEN CALL PDCHEKPAD( ICTXT, 'PDPOCON', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPOCON', $ LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPOCON', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF END IF * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 10 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB, N, NRHS, NB, NBRHS, 0, 0, $ ICTXT, MAX( 1, NP )+IMIDPAD, $ IERR( 1 ) ) * * move IPW to allow room for RHS * MYRHS = NUMROC( DESCB( N_ ), DESCB( NB_ ), MYCOL, $ DESCB( CSRC_ ), NPCOL ) IPB = IPW * IF( EST ) THEN IPB0 = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD IPFERR = IPB0 + DESCB( LLD_ )*MYRHS + IPOSTPAD $ + IPREPAD IPBERR = MYRHS + IPFERR + IPOSTPAD + IPREPAD IPW = MYRHS + IPBERR + IPOSTPAD + IPREPAD ELSE IPW = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD END IF * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines PDLASCHK * LCMQ = LCM / NPCOL WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ NQ * NBRHS + NP * NBRHS + $ MAX( MAX( NQ*NB, 2*NBRHS ), $ NBRHS * NUMROC( NUMROC(N,NB,0,0,NPCOL),NB, $ 0,0,LCMQ ) ) ) WORKSIZ = IPOSTPAD + WORKSIZ ELSE WORKSIZ = IPOSTPAD END IF * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate RHS * CALL PDMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, NPCOL ) * IF( CHECK ) $ CALL PDFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) * IF( EST ) THEN CALL PDMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB0 ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 2 ) * * Solve linear system via Cholesky factorization * CALL PDPOTRS( UPLO, N, NRHS, MEM( IPA ), 1, 1, $ DESCA, MEM( IPB ), 1, 1, DESCB, $ INFO ) * CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDPOTRS', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPOTRS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PDLASCHK( 'Symm', 'Diag', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, IBSEED, $ ANORM, SRESID, MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' END IF * IF( EST ) THEN * * Calculate workspace required for PDPORFS * LWORK = MAX( 1, 3*NP ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LIWORK = MAX( 1, NP ) LW2 = ICEIL( LIWORK*INTGSZ, DBLESZ ) + $ IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'iter ref', ( IPW2+LW2 )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, LW2-IPOSTPAD, $ 1, MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Use iterative refinement to improve the * computed solution * CALL PDPORFS( UPLO, N, NRHS, MEM( IPA0 ), $ 1, 1, DESCA, MEM( IPA ), 1, 1, $ DESCA, MEM( IPB0 ), 1, 1, $ DESCB, MEM( IPB ), 1, 1, DESCB, $ MEM( IPFERR ), MEM( IPBERR ), $ MEM( IPW ), LWORK, MEM( IPW2 ), $ LIWORK, INFO ) * * check for memory overwrite * IF( CHECK ) THEN CALL PDCHEKPAD( ICTXT, 'PDPORFS', NP, $ NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPORFS', NP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPORFS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPORFS', NP, $ MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPORFS', 1, $ MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPORFS', 1, $ MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPORFS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDPORFS', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PDLASCHK( 'Symm', 'Diag', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, $ IBSEED, ANORM, SRESID2, $ MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID2.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID2 * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 1/3 N^3 + 1/2 N^2 flops for LLt factorization * NOPS = (DBLE(N)**3)/3.0D+0 + $ (DBLE(N)**2)/2.0D+0 * * nrhs * 2 N^2 flops for LLt solve. * NOPS = NOPS + 2.0D+0*(DBLE(N)**2)*DBLE(NRHS) * * Calculate total megaflops -- factorization and * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ PASSED * END IF 10 CONTINUE 20 CONTINUE * IF( CHECK .AND. SRESID.GT.THRESH ) THEN * * Compute FRESID = ||A - LL'|| / (||A|| * N * eps) * CALL PDPOTRRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PDLAFCHK( 'Symm', 'Diag', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDPOTRRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * IF( IAM.EQ.0 ) THEN IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9986 ) 'L*L''', FRESID ELSE WRITE( NOUT, FMT = 9986 ) 'U''*U', FRESID END IF END IF END IF * 30 CONTINUE 40 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 50 CONTINUE * * Print ending messages and close output file * 60 CONTINUE IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UPLO N NB NRHS NBRHS P Q LLt Time ', $ 'Slv Time MFLOPS CHECK' ) 9994 FORMAT( '---- ---- ----- --- ---- ----- ---- ---- -------- ', $ '-------- -------- ------' ) 9993 FORMAT( A4, 4X, A1, 1X, I5, 1X, I3, 1X, I4, 1X, I5, 1X, I4, 1X, $ I4, 1X, F8.2, 1X, F8.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PDLLTDRIVER * END scalapack-2.0.2/TESTING/LIN/pdlltinfo.f000644 000766 000024 00000042145 10363532303 017616 0ustar00juliestaff000000 000000 SUBROUTINE PDLLTINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NNR, NRVAL, LDNRVAL, NNBR, $ NBRVAL, LDNBRVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, THRESH, EST, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL EST CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PDLLTINFO get needed startup information for LLt factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * EST (global output) LOGICAL * Flag indicating if condition estimation and iterative * refinement routines are to be exercised. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 7, LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'LLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Read the flag that indicates whether to test the condition * estimation and iterative refinement routines. * READ( NIN, FMT = * ) EST * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS IF( LSAME( UPLO, 'L' ) ) THEN WORK( 6 ) = 1 ELSE WORK( 6 ) = 2 END IF IF( EST ) THEN WORK( 7 ) = 1 ELSE WORK( 7 ) = 0 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 7, 1, WORK, 7 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Ax=b by LLt factorization.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision LLt factorization '// $ 'and solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data is stored in ''Upper'// $ ''' or ''Lower'' portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'LLt time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 7, 1, WORK, 7, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) IF( WORK( 6 ).EQ.1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF IF( WORK( 7 ).EQ.1 ) THEN EST = .TRUE. ELSE EST = .FALSE. END IF * I = NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDLLTINFO * END scalapack-2.0.2/TESTING/LIN/pdlsdriver.f000644 000766 000024 00000135064 10363532303 020004 0ustar00juliestaff000000 000000 PROGRAM PDLSDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * Purpose * ======= * * PDLSDRIVER is the main test program for the DOUBLE PRECISION * SCALAPACK (full rank) Least Squares routines. This test driver solves * full-rank least square problems. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 17 lines: * 'ScaLapack LS solve input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LS.out' output file name (if any) * 6 device out * 4 number of problems sizes * 55 17 31 201 values of M * 5 71 31 201 values of N * 3 number of NB's * 2 3 5 values of NB * 3 number of NRHS's * 2 3 5 values of NRHS * 2 number of NBRHS's * 1 2 values of NBRHS * 7 number of process grids (ordered P & Q) * 1 2 1 4 2 3 8 values of P * 7 2 4 1 3 2 1 values of Q * 3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, MEMSIZ, NTESTS, TOTMEM DOUBLE PRECISION PADVAL DOUBLE PRECISION ONE, ZERO PARAMETER ( DBLESZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0 ) PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, TPSD CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, II, IMIDPAD, $ INFO, IPA, IPB, IPOSTPAD, IPREPAD, IPW, IPW2, $ IPX, ISCALE, ITRAN, ITYPE, J, JJ, K, KFAIL, KK, $ KPASS, KSKIP, KTESTS, LCM, LCMP, LTAU, LWF, $ LWORK, LWS, M, MNP, MNRHSP, MP, MQ, MYCOL, $ MYROW, N, NB, NBRHS, NCOLS, NGRIDS, NMAT, NNB, $ NNBR, NNR, NNRHSQ, NOUT, NP, NPCOL, NPROCS, $ NPROW, NROWS, NQ, NRHS, NRHSP, NRHSQ, WORKSIZ REAL THRESH DOUBLE PRECISION ADDFAC, ADDS, ANORM, BNORM, MULFAC, MULTS, $ NOPS, SRESID, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCW( LLD_ ), $ DESCX( DLEN_ ), IERR( 2 ), MVAL( NTESTS ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), MEM( MEMSIZ ), RESULT( 2 ), $ WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDCHEKPAD, $ PDFILLPAD, PDGELS, PDGEMM, PDLACPY, $ PDLSINFO, PDMATGEN, PDNRM2, PDSCAL, $ PDQRT13, PDQRT16, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PDLANGE, PDQRT14, PDQRT17 EXTERNAL ICEIL, ILCM, LSAME, NUMROC, PDLANGE, $ PDQRT14, PDQRT17 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * IASEED = 100 IBSEED = 200 CALL PDLSINFO( OUTFILE, NOUT, NMAT, MVAL, NTESTS, NVAL, $ NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, NTESTS, $ NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 90 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 90 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( ( MYROW.GE.NPROW ).OR.( MYCOL.GE.NPCOL ) ) $ GO TO 90 * DO 80 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 80 END IF * * Loop over different blocking sizes * DO 70 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 70 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) MQ = NUMROC( M, NB, MYCOL, 0, NPCOL ) NP = NUMROC( N, NB, MYROW, 0, NPROW ) MNP = MAX( MP, NP ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 70 END IF * DO 60 ISCALE = 1, 3 * ITYPE = ISCALE * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPX = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPX * WORKSIZ = NQ + IPOSTPAD * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'MEMORY', $ ( IPX+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 70 END IF * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Generate the matrix A and calculate its 1-norm * CALL PDQRT13( ISCALE, M, N, MEM( IPA ), 1, 1, $ DESCA, ANORM, IASEED, MEM( IPW ) ) * IF( CHECK ) THEN CALL PDCHEKPAD( ICTXT, 'PDQRT13', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDQRT13', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * DO 50 ITRAN = 1, 2 * IF( ITRAN.EQ.1 ) THEN NROWS = M NCOLS = N TRANS = 'N' TPSD = .FALSE. ELSE NROWS = N NCOLS = M TRANS = 'T' TPSD = .TRUE. END IF * * Loop over the different values for NRHS * DO 40 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 30 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * NRHSP = NUMROC( NRHS, NBRHS, MYROW, 0, $ NPROW ) NRHSQ = NUMROC( NRHS, NBRHS, MYCOL, 0, $ NPCOL ) * * Define Array descriptor for rhs MAX(M,N)xNRHS * CALL DESCINIT( DESCX, MAX( M, N ), NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, MNP ) + IMIDPAD, $ IERR( 1 ) ) IF( TPSD ) THEN CALL DESCINIT( DESCW, M, NRHS, NB, NBRHS, $ 0, 0, ICTXT, MAX( 1, MP ) + $ IMIDPAD, IERR( 2 ) ) ELSE CALL DESCINIT( DESCW, N, NRHS, NB, NBRHS, $ 0, 0, ICTXT, MAX( 1, NP ) + $ IMIDPAD, IERR( 2 ) ) END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, $ 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Check for enough memory * IPX = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + $ IPREPAD IPW = IPX + DESCX( LLD_ )*NRHSQ + IPOSTPAD + $ IPREPAD WORKSIZ = DESCW( LLD_ )*NRHSQ + IPOSTPAD * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Generation', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate RHS * IF( TPSD ) THEN CALL PDMATGEN( ICTXT, 'No', 'No', $ DESCW( M_ ), DESCW( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), $ MEM( IPW ), DESCW( LLD_ ), $ DESCW( RSRC_ ), $ DESCW( CSRC_ ), IBSEED, 0, $ MP, 0, NRHSQ, MYROW, MYCOL, $ NPROW, NPCOL ) ELSE CALL PDMATGEN( ICTXT, 'No', 'No', $ DESCW( M_ ), DESCW( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), $ MEM( IPW ), DESCW( LLD_ ), $ DESCW( RSRC_ ), $ DESCW( CSRC_ ), IBSEED, 0, $ NP, 0, NRHSQ, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PDFILLPAD( ICTXT, MP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PDFILLPAD( ICTXT, NP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * DO 10 JJ = 1, NRHS CALL PDNRM2( NCOLS, BNORM, MEM( IPW ), 1, $ JJ, DESCW, 1 ) IF( BNORM.GT.ZERO ) $ CALL PDSCAL( NCOLS, ONE / BNORM, $ MEM( IPW ), 1, JJ, DESCW, $ 1 ) 10 CONTINUE * CALL PDGEMM( TRANS, 'N', NROWS, NRHS, NCOLS, $ ONE, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ), 1, 1, DESCW, ZERO, $ MEM( IPX ), 1, 1, DESCX ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'Generation', MP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'Generation', MNP, $ NRHSQ, MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PDCHEKPAD( ICTXT, 'Generation', $ MP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PDCHEKPAD( ICTXT, 'Generation', $ NP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Allocate space for copy of rhs * IPB = IPW * IF( TPSD ) THEN CALL DESCINIT( DESCB, N, NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, $ IERR( 1 ) ) ELSE CALL DESCINIT( DESCB, M, NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, $ IERR( 1 ) ) END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * IPW = IPB + DESCB( LLD_ )*NRHSQ + $ IPOSTPAD + IPREPAD * END IF * * Calculate the amount of workspace for PDGELS * IF( M.GE.N ) THEN LTAU = NUMROC( MIN(M,N), NB, MYCOL, 0, $ NPCOL ) LWF = NB * ( MP + NQ + NB ) LWS = MAX( ( NB*( NB - 1 ) ) / 2, $ ( MP + NRHSQ ) * NB ) + NB*NB ELSE LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LTAU = NUMROC( MIN(M,N), NB, MYROW, 0, $ NPROW ) LWF = NB * ( MP + NQ + NB ) LWS = MAX( ( NB*( NB - 1 ) ) / 2, ( NP + $ MAX( NQ + NUMROC( NUMROC( N, NB, 0, $ 0, NPROW ), NB, 0, 0, LCMP ), $ NRHSQ ) ) * NB ) + NB*NB END IF * LWORK = LTAU + MAX( LWF, LWS ) WORKSIZ = LWORK + IPOSTPAD * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'solve', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Make the copy of the right hand side * CALL PDLACPY( 'All', NROWS, NRHS, $ MEM( IPX ), 1, 1, DESCX, $ MEM( IPB ), 1, 1, DESCB ) * IF( TPSD ) THEN CALL PDFILLPAD( ICTXT, NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PDFILLPAD( ICTXT, MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF CALL PDFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * CALL SLBOOT( ) CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Solve the LS or overdetermined system * CALL PDGELS( TRANS, M, N, NRHS, MEM( IPA ), $ 1, 1, DESCA, MEM( IPX ), 1, 1, $ DESCX, MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGELS', MP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGELS', MNP, $ NRHSQ, MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGELS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Regenerate A in place for testing and next * iteration * CALL PDQRT13( ISCALE, M, N, MEM( IPA ), 1, 1, $ DESCA, ANORM, IASEED, $ MEM( IPW ) ) * * check the solution to rhs * IF( CHECK ) THEN * * Am I going to call PDQRT17 ? * IF( ( M.GE.N .AND. ( .NOT.TPSD ) ) .OR. $ ( M.LT.N .AND. TPSD ) ) THEN * * Call PDQRT17 first, A, X, and B remain * unchanged. Solving LS system * * Check amount of memory for PDQRT17 * IF( TPSD ) THEN WORKSIZ = NP*NRHSQ + NRHSP*MQ IPW2 = IPW + WORKSIZ WORKSIZ = WORKSIZ + $ MAX( NQ, MAX( MQ, NRHSQ ) ) + $ IPOSTPAD ELSE WORKSIZ = MP*NRHSQ + NRHSP*NQ IPW2 = IPW + WORKSIZ WORKSIZ = WORKSIZ + $ MAX( NQ, NRHSQ ) + $ IPOSTPAD END IF * * Check for adequate memory for problem * size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'MEMORY', ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PDFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * RESULT( 2 ) = PDQRT17( TRANS, 1, M, N, $ NRHS, $ MEM( IPA ), $ 1, 1, DESCA, $ MEM( IPX ), 1, $ 1, DESCX, $ MEM( IPB ), $ 1, 1, DESCB, $ MEM( IPW ), $ MEM( IPW2 ) ) SRESID = RESULT( 2 ) * CALL PDCHEKPAD( ICTXT, 'PDQRT17', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDQRT17', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PDCHEKPAD( ICTXT, 'PDQRT17', $ NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) ELSE CALL PDCHEKPAD( ICTXT, 'PDQRT17', $ MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF CALL PDCHEKPAD( ICTXT, 'PDQRT17', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Call PDQRT16, B will be destroyed. * IF( TPSD ) THEN WORKSIZ = MP + IPOSTPAD ELSE WORKSIZ = NQ + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'MEMORY', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PDFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PDQRT16( TRANS, M, N, NRHS, $ MEM( IPA ), 1, 1, DESCA, $ MEM( IPX ), 1, 1, DESCX, $ MEM( IPB ), 1, 1, DESCB, $ MEM( IPW ), RESULT( 1 ) ) * CALL PDCHEKPAD( ICTXT, 'PDQRT16', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDQRT16', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PDCHEKPAD( ICTXT, 'PDQRT16', $ NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) ELSE CALL PDCHEKPAD( ICTXT, 'PDQRT16', $ MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF CALL PDCHEKPAD( ICTXT, 'PDQRT16', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * * Call PDQRT14 * IF( ( M.GE.N .AND. TPSD ) .OR. $ ( M.LT.N .AND. ( .NOT.TPSD ) ) ) THEN * IPW = IPB * IF( TPSD ) THEN * NNRHSQ = NUMROC( N+NRHS, NB, MYCOL, $ 0, NPCOL ) LTAU = NUMROC( MIN( M, N+NRHS ), NB, $ MYCOL, 0, NPCOL ) LWF = NB * ( NB + MP + NNRHSQ ) WORKSIZ = MP * NNRHSQ + LTAU + LWF + $ IPOSTPAD * ELSE * MNRHSP = NUMROC( M+NRHS, NB, MYROW, $ 0, NPROW ) LTAU = NUMROC( MIN( M+NRHS, N ), NB, $ MYROW, 0, NPROW ) LWF = NB * ( NB + MNRHSP + NQ ) WORKSIZ = MNRHSP * NQ + LTAU + LWF + $ IPOSTPAD * END IF * * Check for adequate memory for problem * size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'MEMORY', ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PDFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * * Solve underdetermined system * RESULT( 2 ) = PDQRT14( TRANS, M, N, $ NRHS, $ MEM( IPA ), 1, $ 1, DESCA, $ MEM( IPX ), $ 1, 1, DESCX, $ MEM( IPW ) ) SRESID = RESULT( 2 ) * CALL PDCHEKPAD( ICTXT, 'PDQRT14', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDQRT14', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDQRT14', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Print information about the tests that * did not pass the threshold. * PASSED = 'PASSED' DO 20 II = 1, 2 IF( ( RESULT( II ).GE.THRESH ) .AND. $ ( RESULT( II )-RESULT( II ).EQ.0.0E+0 $ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 )TRANS, $ M, N, NRHS, NB, ITYPE, II, $ RESULT( II ) KFAIL = KFAIL + 1 PASSED = 'FAILED' ELSE KPASS = KPASS + 1 END IF 20 CONTINUE * ELSE * * By-pass the solve check * KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock * timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN ADDFAC = 1 MULFAC = 1 IF( M.GE.N ) THEN * * NOPS = DOPLA( 'DGEQRF', M, N, 0, 0, * NB ) + DOPLA( 'DORMQR', M, NRHS, N, * 0, NB ) * MULTS = N*( ( ( 23.D0 / 6.D0 )+M+N / $ 2.D0 )+ N*( M-N / 3.D0 ) ) + $ N*NRHS*( 2.D0*M+2.D0-N ) ADDS = N*( ( 5.D0 / 6.D0 )+N* $ ( 1.D0 / 2.D0+( M-N / 3.D0 ) ) ) $ + N*NRHS*( 2.D0*M+1.D0-N ) ELSE * * NOPS = DOPLA( 'DGELQF', M, N, 0, 0, * NB ) + DOPLA( 'DORMLQ', M, * NRHS, N, 0, NB ) * MULTS = M*( ( ( 29.D0 / 6.D0 )+2.D0*N-M $ / 2.D0 )+M*( N-M / 3.D0 ) ) $ + N*NRHS*( 2.D0*M+2.D0-N ) ADDS = M*( ( 5.D0 / 6.D0 )+M / 2.D0+M* $ ( N-M / 3.D0 ) ) $ + N*NRHS*( 2.D0*M+1.D0-N ) END IF NOPS = ADDFAC*ADDS + MULFAC*MULTS * * Calculate total megaflops, for WALL and * CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) $ 'WALL', TRANS, M, N, NB, NRHS, $ NBRHS, NPROW, NPCOL, WTIME( 1 ), $ TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) $ 'CPU ', TRANS, M, N, NB, NRHS, $ NBRHS, NPROW, NPCOL, CTIME( 1 ), $ TMFLOPS, PASSED END IF 30 CONTINUE 40 CONTINUE 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 90 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'Time TRANS M N NB NRHS NBRHS P Q ', $ 'LS Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- ------ ------ --- ----- ----- ----- ----- ', $ '--------- -------- ------' ) 9993 FORMAT( A4, 3X, A1, 3X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, $ I5, 1X, I5, 1X, F9.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( ' TRANS=''', A1, ''', M=', I5, ', N=', I5, ', NRHS=', I4, $ ', NB=', I4, ', type', I2, ', test(', I2, ')=', G12.5 ) * STOP * * End of PDLSDRIVER * END scalapack-2.0.2/TESTING/LIN/pdlsinfo.f000644 000766 000024 00000040772 10363532303 017445 0ustar00juliestaff000000 000000 SUBROUTINE PDLSINFO( SUMMRY, NOUT, NMAT, MVAL, LDMVAL, $ NVAL, LDNVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDMVAL, LDNBRVAL, LDNBVAL, LDNRVAL, $ LDNVAL, LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, $ NNBR, NNR, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDMVAL ), NBRVAL( LDNBRVAL ), $ NBVAL( LDNBVAL ), NRVAL( LDNRVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), $ QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PDLSINFO gets needed startup information for LS solve and * transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * MVAL (global output) INTEGER array, dimension (LDNVAL) * The values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M, LDNVAL > = NMAT. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (workspace) INTEGER array of dimension >= * MAX(5,LDMVAL+LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL) * Used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='LS.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 ELSE IF( NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'M', LDMVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 5, 1, WORK, 5 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK min ||Ax-b|| by QR factorizations.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision least-square solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - QR|| / '// $ '(||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used. If CPU and WALL time' WRITE( NOUT, FMT = 9999 ) $ ' are the same, only one line '// $ 'is printed, and the label is ''BOTH''.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 ) $ 'QR time : Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 5, 1, WORK, 5, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) * I = 2*NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDLSINFO * END scalapack-2.0.2/TESTING/LIN/pdludriver.f000644 000766 000024 00000125042 11657237147 020020 0ustar00juliestaff000000 000000 PROGRAM PDLUDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======== * * PDLUDRIVER is the main test program for the DOUBLE PRECISION * SCALAPACK LU routines. This test driver performs an LU factorization * and solve. If the input matrix is non-square, only the factorization * is performed. Condition estimation and iterative refinement are * optionally performed. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'SCALAPACK, Version 2.0, LU factorization input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LU.out' output file name (if any) * 6 device out * 1 number of problems sizes * 31 201 values of M * 31 201 values of N * 1 number of NB's * 2 values of NB * 1 number of NRHS's * 1 values of NRHS * 1 number of NBRHS's * 1 values of NBRHS * 1 number of process grids (ordered pairs of P & Q) * 2 1 4 2 3 8 values of P * 2 4 1 3 2 1 values of Q * 1.0 threshold * T (T or F) Test Cond. Est. and Iter. Ref. Routines * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, INTGSZ, MEMSIZ, NTESTS, TOTMEM DOUBLE PRECISION PADVAL, ZERO PARAMETER ( DBLESZ = 8, INTGSZ = 4, TOTMEM = 4000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, EST CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, IMIDPAD, $ INFO, IPA, IPA0, IPB, IPB0, IPBERR, IPFERR, $ IPOSTPAD, IPPIV, IPREPAD, IPW, IPW2, J, K, $ KFAIL, KK, KPASS, KSKIP, KTESTS, LCM, LCMQ, $ LIPIV, LIWORK, LWORK, LW2, M, MAXMN, $ MINMN, MP, MYCOL, MYRHS, MYROW, N, NB, NBRHS, $ NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, NPCOL, $ NPROCS, NPROW, NQ, NRHS, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, ANORM1, FRESID, NOPS, RCOND, $ SRESID, SRESID2, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), IERR( 1 ), $ MVAL( NTESTS ), NBRVAL( NTESTS ), $ NBVAL( NTESTS ), NRVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), $ QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), MEM( MEMSIZ ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDCHEKPAD, $ PDFILLPAD, PDGECON, PDGERFS, $ PDGETRF, PDGETRRV, PDGETRS, $ PDLAFCHK, PDLASCHK, PDLUINFO, $ PDMATGEN, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PDLANGE EXTERNAL ICEIL, ILCM, NUMROC, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 CALL PDLUINFO( OUTFILE, NOUT, NMAT, MVAL, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NNR, NRVAL, NTESTS, NNBR, NBRVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, THRESH, $ EST, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 50 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 50 * DO 40 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 40 END IF * DO 30 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IF( EST .AND. M.EQ.N ) THEN IPA0 = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPPIV = IPA0 + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD ELSE IPPIV = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD END IF LIPIV = ICEIL( INTGSZ*( MP+NB ), DBLESZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * IF( CHECK ) THEN * * Calculate the amount of workspace required by the * checking routines PDLANGE, PDGETRRV, and * PDLAFCHK * WORKSIZ = MAX( 2, NQ ) * WORKSIZ = MAX( WORKSIZ, MP*DESCA( NB_ )+ $ NQ*DESCA( MB_ ) ) * WORKSIZ = MAX( WORKSIZ, MP * DESCA( NB_ ) ) * WORKSIZ = WORKSIZ + IPOSTPAD * ELSE * WORKSIZ = IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate matrix A of Ax = b * CALL PDMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ MP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, LIPIV, 1, MEM( IPPIV-IPREPAD ), $ LIPIV, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANGE( 'I', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) ANORM1 = PDLANGE( '1', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) END IF * IF( EST .AND. M.EQ.N ) THEN CALL PDMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA0 ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) IF( CHECK ) $ CALL PDFILLPAD( ICTXT, MP, NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform LU factorization * CALL PDGETRF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PDGETRF INFO=', INFO KFAIL = KFAIL + 1 RCOND = ZERO GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in LU factorization * CALL PDCHEKPAD( ICTXT, 'PDGETRF', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRF', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * IF( M.NE.N ) THEN * * For non-square matrices, factorization only * NRHS = 0 NBRHS = 0 * IF( CHECK ) THEN * * Compute FRESID = ||A - P*L*U|| / (||A|| * N * eps) * CALL PDGETRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ) ) CALL PDLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGETRRV', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRRV', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( (FRESID-FRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * MAXMN = MAX( M, N ) MINMN = MIN( M, N ) * * M N^2 - 1/3 N^3 - 1/2 N^2 flops for LU * factorization when M >= N * NOPS = DBLE( MAXMN )*( DBLE( MINMN )**2 ) - $ (1.0D+0 / 3.0D+0)*( DBLE( MINMN )**3 ) - $ (1.0D+0 / 2.0D+0)*( DBLE( MINMN )**2 ) * * Calculate total megaflops -- factorization only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * WTIME( 2 ) = 0.0D+0 IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, NB, $ NRHS, NBRHS, NPROW, NPCOL, WTIME( 1 ), $ WTIME( 2 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * CTIME( 2 ) = 0.0D+0 IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, NB, $ NRHS, NBRHS, NPROW, NPCOL, CTIME( 1 ), $ CTIME( 2 ), TMFLOPS, PASSED END IF * ELSE * * If M = N * IF( EST ) THEN * * Calculate workspace required for PDGECON * LWORK = MAX( 1, 2*NP ) + MAX( 1, 2*NQ ) + $ MAX( 2, DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), $ NQ + DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LIWORK = MAX( 1, NP ) LW2 = ICEIL( LIWORK*INTGSZ, DBLESZ ) + IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'cond est', $ ( IPW2+LW2 )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Compute condition number of the matrix * CALL PDGECON( '1', N, MEM( IPA ), 1, 1, DESCA, $ ANORM1, RCOND, MEM( IPW ), LWORK, $ MEM( IPW2 ), LIWORK, INFO ) * IF( CHECK ) THEN CALL PDCHEKPAD( ICTXT, 'PDGECON', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGECON', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGECON', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 10 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB, N, NRHS, NB, NBRHS, 0, 0, $ ICTXT, MAX( 1, NP )+IMIDPAD, $ IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * move IPW to allow room for RHS * MYRHS = NUMROC( DESCB( N_ ), DESCB( NB_ ), $ MYCOL, DESCB( CSRC_ ), NPCOL ) IPB = IPW * IF( EST ) THEN IPB0 = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD IPFERR = IPB0 + DESCB( LLD_ )*MYRHS + $ IPOSTPAD + IPREPAD IPBERR = MYRHS + IPFERR + IPOSTPAD + IPREPAD IPW = MYRHS + IPBERR + IPOSTPAD + IPREPAD ELSE IPW = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD END IF * * Set worksiz: routines requiring most workspace * is PDLASCHK * IF( CHECK ) THEN LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ NQ * NBRHS + NP * NBRHS + $ MAX( MAX( NQ*NB, 2*NBRHS ), $ NBRHS * NUMROC( NUMROC(N,NB,0,0,NPCOL),NB, $ 0,0,LCMQ ) ) ) WORKSIZ = IPOSTPAD + WORKSIZ ELSE WORKSIZ = IPOSTPAD END IF * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate RHS * CALL PDMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) * IF( CHECK ) $ CALL PDFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * IF( EST ) THEN CALL PDMATGEN( ICTXT, 'No', 'No', $ DESCB( M_ ), DESCB( N_ ), $ DESCB( MB_ ), DESCB( NB_ ), $ MEM( IPB0 ), DESCB( LLD_ ), $ DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, $ 0, MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 2 ) * * Solve linear sytem via LU factorization * CALL PDGETRS( 'No', N, NRHS, MEM( IPA ), 1, 1, $ DESCA, MEM( IPPIV ), MEM( IPB ), $ 1, 1, DESCB, INFO ) * CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGETRS', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRS', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PDLASCHK( 'No', 'N', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, IBSEED, $ ANORM, SRESID, MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * The second test is a NaN trap * IF( SRESID.LE.THRESH .AND. $ ( SRESID-SRESID ).EQ.0.0D+0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' END IF * IF( EST ) THEN * * Calculate workspace required for PDGERFS * LWORK = MAX( 1, 3*NP ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LIWORK = MAX( 1, NP ) LW2 = ICEIL( LIWORK*INTGSZ, DBLESZ ) + $ IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'iter ref', ( IPW2+LW2 )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Use iterative refinement to improve the * computed solution * CALL PDGERFS( 'No', N, NRHS, MEM( IPA0 ), 1, $ 1, DESCA, MEM( IPA ), 1, 1, $ DESCA, MEM( IPPIV ), $ MEM( IPB0 ), 1, 1, DESCB, $ MEM( IPB ), 1, 1, DESCB, $ MEM( IPFERR ), MEM( IPBERR ), $ MEM( IPW ), LWORK, MEM( IPW2 ), $ LIWORK, INFO ) * IF( CHECK ) THEN CALL PDCHEKPAD( ICTXT, 'PDGERFS', NP, $ NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGERFS', NP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGERFS', LIPIV, $ 1, MEM( IPPIV-IPREPAD ), $ LIPIV, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGERFS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGERFS', NP, $ MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGERFS', 1, $ MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGERFS', 1, $ MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGERFS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGERFS', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PDLASCHK( 'No', 'N', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, $ IBSEED, ANORM, SRESID2, $ MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID2.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID2 * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * * Gather max. of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 2/3 N^3 - 1/2 N^2 flops for LU factorization * NOPS = (2.0D+0/3.0D+0)*( DBLE(N)**3 ) - $ (1.0D+0/2.0D+0)*( DBLE(N)**2 ) * * nrhs * 2 N^2 flops for LU solve. * NOPS = NOPS + 2.0D+0*(DBLE(N)**2)*DBLE(NRHS) * * Calculate total megaflops -- factorization * and solve -- for WALL and CPU time, and print * output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) $ THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * * Print WALL time if supported * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ PASSED * * Print CPU time if supported * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) $ THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ PASSED END IF 10 CONTINUE 20 CONTINUE * IF( CHECK.AND.( SRESID.GT.THRESH ) ) THEN * * Compute fresid = ||A - P*L*U|| / (||A|| * N * eps) * CALL PDGETRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ) ) CALL PDLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGETRRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRRV', LIPIV, $ 1, MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID END IF END IF 30 CONTINUE 40 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 50 CONTINUE * * Print ending messages and close output file * 60 CONTINUE IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N NB NRHS NBRHS P Q LU Time ', $ 'Sol Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- ----- --- ---- ----- ---- ---- -------- ', $ '-------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I5, 1X, I3, 1X, I5, 1X, I4, 1X, I4, 1X, $ I4, 1X, F8.2, 1X, F8.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - P*L*U|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PDLUDRIVER * END scalapack-2.0.2/TESTING/LIN/pdluinfo.f000644 000766 000024 00000041076 10363532303 017445 0ustar00juliestaff000000 000000 SUBROUTINE PDLUINFO( SUMMRY, NOUT, NMAT, MVAL, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NNR, NRVAL, LDNRVAL, NNBR, $ NBRVAL, LDNBRVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, EST, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL EST CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDNVAL ), NBRVAL( LDNBRVAL ), $ NBVAL( LDNBVAL ), NRVAL( LDNRVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ WORK( * ) * .. * * Purpose * ======= * * PDLUINFO gets needed startup information for LU factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for M and N. * * MVAL (global output) INTEGER array, dimension (LDNVAL) * The values of M (number of rows in matrix) to run the code * with. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * M and N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * EST (global output) LOGICAL * Flag indicating if condition estimation and iterative * refinement routines are to be exercised. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 6, 2*LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL ) * Used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='LU.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Read the flag that indicates whether to test the condition * estimation and iterative refinement routines. * READ( NIN, FMT = * ) EST * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS IF( EST ) THEN WORK( 6 ) = 1 ELSE WORK( 6 ) = 0 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 6, 1, WORK, 6 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Ax=b by LU factorization.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision LU factorization '// $ 'and solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| / '// $ '(||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'LU time : Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 6, 1, WORK, 6, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) IF( WORK( 6 ).EQ.1 ) THEN EST = .TRUE. ELSE EST = .FALSE. END IF * I = 2*NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDLUINFO * END scalapack-2.0.2/TESTING/LIN/pdmatgen.f000644 000766 000024 00000043124 10363532303 017420 0ustar00juliestaff000000 000000 SUBROUTINE PDMATGEN( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PDMATGEN : Parallel Real Double precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * otherwise a random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) DOUBLE PRECISION, pointer into the local * memory to an array of dimension ( LDA, * ) containing the * local pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) DOUBLE PRECISION ONE, TWO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0 ) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDRAND EXTERNAL ICEIL, NUMROC, LSAME, PDRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) * INFO = 0 IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDMATGEN', INFO ) RETURN END IF * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = ONE - TWO*PDRAND(0) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = ONE - TWO*PDRAND(0) 30 CONTINUE A(IK,JK) = ONE - TWO*PDRAND(0) DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = ONE - TWO*PDRAND(0) A(IK,JK+J) = A(IK+J,JK) 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 A(IK,JK) = ONE - TWO*PDRAND(0) JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 A(IK,JK) = ONE - TWO*PDRAND(0) JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSE * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = ONE - TWO*PDRAND(0) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 A(IK,JK+J) = ABS(A(IK,JK+J)) + MAXMN IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PDMATGEN * END scalapack-2.0.2/TESTING/LIN/pdpbdriver.f000644 000766 000024 00000073750 10363532303 017772 0ustar00juliestaff000000 000000 PROGRAM PDPBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PDPBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by DPB. This test driver performs an * A = L*L**T factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER DBLESZ, MEMSIZ, NTESTS DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0, ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BW, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, I, $ IAM, IASEED, IBSEED, ICTXT, ICTXTB, IERR_TEMP, $ IMIDPAD, INFO, IPA, IPB, IPOSTPAD, IPREPAD, $ IPW, IPW_SIZE, IPW_SOLVE, IPW_SOLVE_SIZE, $ IP_DRIVER_W, IP_FILLIN, J, K, KFAIL, KPASS, $ KSKIP, KTESTS, MYCOL, MYRHS_SIZE, MYROW, N, NB, $ NBW, NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, $ NPCOL, NPROCS, NPROCS_REAL, NPROW, NQ, NRHS, $ N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWVAL( NTESTS ), DESCA( 7 ), DESCA2D( DLEN_ ), $ DESCB( 7 ), DESCB2D( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), MEM( MEMSIZ ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDBMATGEN, $ PDCHEKPAD, PDFILLPAD, PDMATGEN, PDPBINFO, $ PDPBLASCHK, PDPBTRF, PDPBTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PDLANGE EXTERNAL LSAME, NUMROC, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PDPBINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NBW, $ BWVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, $ NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BW = BWVAL( BW_NUM ) IF( BW.LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Band', 'bw', BW IERR( 1 ) = 1 END IF * IF( BW.GT.N-1 ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*BW-1)/NPCOL + 1 ) $ + BW NB = MAX( NB, 2*BW ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*BW, N ) ) THEN IERR( 1 ) = 1 ENDIF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (BW+1), (BW+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((BW+1)+10) IMIDPAD = 10 IPOSTPAD = ((BW+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (BW+1), N, $ (BW+1), NB, 0, 0, $ ICTXT,((BW+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((BW+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (NB+2*BW)*BW * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = BW*BW * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((BW+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PDLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PDPBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(BW*(BW+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PDBMATGEN( ICTXT, UPLO, 'B', BW, BW, N, (BW+1), NB, $ MEM( IPA ), ((BW+1)+10), 0, 0, IASEED, $ MYROW, MYCOL, NPROW, NPCOL ) * CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((BW+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PDLANGE( '1', (BW+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((BW+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PDPBTRF( UPLO, N, BW, MEM( IPA ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PDPBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PDCHEKPAD( ICTXT, 'PDPBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((BW+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (BW*NRHS) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PDMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PDFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PDPBTRS( UPLO, N, BW, NRHS, MEM( IPA ), 1, $ DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PDPBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDPBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PDPBLASCHK( 'S', UPLO, N, BW, BW, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * NOPS = NOPS + DBLE(BW)*( -2.D0 / 3.D0+DBLE(BW)* $ ( -1.D0+DBLE(BW)*( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( 1.D0+DBLE(BW)*( 3.D0 / $ 2.D0+DBLE(BW)*( 1.D0 / 2.D0 ) ) ) NOPS = NOPS + DBLE(BW)*( -1.D0 / 6.D0+DBLE(BW) $ *( -1.D0 /2.D0+DBLE(BW) $ *( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( DBLE(BW) / $ 2.D0*( 1.D0+DBLE(BW) ) ) * NOPS = NOPS + $ DBLE(NRHS)*( ( 2*DBLE(N)-DBLE(BW) )* $ ( DBLE(BW)+1.D0 ) )+ DBLE(NRHS)* $ ( DBLE(BW)*( 2*DBLE(N)- $ ( DBLE(BW)+1.D0 ) ) ) * * * Second calc to represent actual hardware speed * * NB bw^2 flops for LLt factorization in 1st proc * NOPS2 = ( (DBLE(N_FIRST))* DBLE(BW)**2 ) * IF ( NPROCS_REAL .GT. 1) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in last processor * NOPS2 = NOPS2 + $ 4*( (DBLE(N_LAST)*DBLE(BW)**2) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 4*( (DBLE(NB)*DBLE(BW)**2) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ ( NPROCS_REAL-1 ) * ( BW*BW*BW/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ ( NPROCS_REAL-2 ) * ( 2 * BW*BW*BW ) ENDIF * * * nrhs * 4 n_first*bw flops for LLt solve in proc 1. * NOPS2 = NOPS2 + $ ( 4.0D+0*(DBLE(N_FIRST)*DBLE(BW))*DBLE(NRHS) ) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*4 n_last*bw flops for LLt solve in last. * NOPS2 = NOPS2 + $ 2*( 4.0D+0*(DBLE(N_LAST)*DBLE(BW))*DBLE(NRHS) ) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2 * nrhs * 4 NB*bw flops for LLt solve in others. * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( 4.0D+0*(DBLE(NB)*DBLE(BW))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1 ) * ( BW*BW ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * ( 3 * BW*BW ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UL N BW NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- ---- ----- -- ---- -------- ', $ '-------- ------ ------ ------' ) 9993 FORMAT( A4, 2X, A1, 1X, I6, 1X, I3, 1X, I4, 1X, $ I5, 1X, I2, 1X, $ I4, 1X, F8.3, F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PDPBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pdpbinfo.f000644 000766 000024 00000044724 10363532303 017431 0ustar00juliestaff000000 000000 SUBROUTINE PDPBINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NBW, $ BWVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PDPBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of subdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( UPLO, 'L' ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data represents ''Upper'// $ ''' or ''Lower'' triangular portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'TRANS : Whether solve is to be done with'// $ ' ''Transpose'' of matrix A (T,C) or not (N).' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bw : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bw ', ( BWVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDPBINFO * END scalapack-2.0.2/TESTING/LIN/pdpblaschk.f000644 000766 000024 00000025500 10363532303 017732 0ustar00juliestaff000000 000000 SUBROUTINE PDPBLASCHK( SYMM, UPLO, N, BWL, BWU, NRHS, X, IX, JX, $ DESCX, IASEED, A, IA, JA, DESCA, IBSEED, $ ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PDPBLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'S', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,max(bw*(bw+2),NB))+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,max(bw*(bw+2),NB))+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER BW, INFO, IPPRODUCT, WORK_MIN DOUBLE PRECISION DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, DGEBR2D, $ DGEBS2D, DGEMM, DGERV2D, DGESD2D, $ DGSUM2D, DLASET, PBDTRAN, PDMATGEN * .. * .. External Functions .. INTEGER IDAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IDAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'S' ) ) THEN BW = BWL WORK_MIN = MAX(5,MAX(BW*(BW+2),NB))+2*NB ELSE BW = MAX(BWL, BWU) WORK_MIN = MAX(5,MAX(BW*(BW+2),NB))+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PDBLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (BW+1), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'S' )) THEN CALL PDBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PDBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PDPBDCMV( BW+1, BW, UPLO, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), (BW+2)*BW, INFO ) * * * Regenerate column of B * CALL PDMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PDAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PDNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PDNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PDBLASCHK * END scalapack-2.0.2/TESTING/LIN/pdpbmv1.f000644 000766 000024 00000072747 10607174520 017213 0ustar00juliestaff000000 000000 SUBROUTINE PDPBDCMV( LDBW, BW, UPLO, N, A, JA, DESCA, NRHS, B, IB, $ DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDPBTRF and this is stored in AF. If a linear system * is to be solved using PDPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, $ FIRST_PROC, I, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, J, JA_NEW, LLDA, LLDB, MYCOL, $ MYROW, MY_NUM_COLS, NB, NP, NPCOL, NPROW, $ NP_SAVE, ODD_SIZE, OFST, PART_OFFSET, $ PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Pre-calculate bw^2 * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -14 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 7*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 7*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PDPBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, $ 'PDPBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1005 PARAM_CHECK( 15, 2 ) = 1004 PARAM_CHECK( 14, 2 ) = 1003 PARAM_CHECK( 13, 2 ) = 1002 PARAM_CHECK( 12, 2 ) = 1001 PARAM_CHECK( 11, 2 ) = 9 PARAM_CHECK( 10, 2 ) = 705 PARAM_CHECK( 9, 2 ) = 704 PARAM_CHECK( 8, 2 ) = 703 PARAM_CHECK( 7, 2 ) = 701 PARAM_CHECK( 6, 2 ) = 6 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDPBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = ZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (BW+2)*BW WORK( I ) = ZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( UPLO, 'L' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL DLATCPY( 'U', BW, BW, $ A( LLDA*( NUMROC_SIZE-BW )+1+BW ), $ LLDA-1, WORK( 1 ), BW ) * * Send the triangle to neighboring processor to right * CALL DTRSD2D(ICTXT, 'L', 'N', $ BW, BW, $ WORK( 1 ), $ BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL DSBMV( 'L', NUMROC_SIZE, BW, ONE, A( OFST+1 ), LLDA, $ B(PART_OFFSET+1), 1, ZERO, X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL DCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( BW*BW+1+BW-DL_N_N ), 1 ) * CALL DTRMV( 'U', 'N', 'N', BW, $ A( LLDA*( NUMROC_SIZE-BW )+1+BW ), LLDA-1, $ WORK( BW*BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( BW*BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL DGESD2D( ICTXT, BW, 1, $ WORK( BW*BW+1 ), BW, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, BW*( BW+2 ) WORK( I ) = ZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL DCOPY( DL_P_M, B( 1 ), 1, $ WORK( BW*BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL DTRRV2D(ICTXT, 'L', 'N', $ BW, BW, $ WORK( 1 ), BW, MYROW, MYCOL-1 ) * CALL DTRMV( $ 'L', $ 'N', $ 'N', BW, $ WORK( 1 ), BW, $ WORK( BW*BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DL_P_M .GT. DL_P_N ) THEN DO 30 I=1, DL_P_M-DL_P_N WORK( BW*BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL DGESD2D( ICTXT, BW, 1, WORK(BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL DGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL DAXPY( BW, ONE, $ WORK( BW*BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL DGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL DAXPY( BW, ONE, $ WORK( BW*BW+1 ), 1, $ X( NUMROC_SIZE-BW+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( UPLO, 'U' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL DLATCPY( 'L', BW, BW, A( OFST+1 ), $ LLDA-1, WORK( 1 ), BW ) * * Send the triangle to neighboring processor to left * CALL DTRSD2D(ICTXT, 'U', 'N', $ BW, BW, $ WORK( 1 ), $ BW, MYROW, MYCOL-1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL DSBMV( 'U', NUMROC_SIZE, BW, ONE, A( OFST+1 ), LLDA, $ B(PART_OFFSET+1), 1, ZERO, X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL DCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( BW*BW+1+BW-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL DTRRV2D(ICTXT, 'U', 'N', $ BW, BW, $ WORK( 1 ), BW, MYROW, MYCOL+1 ) * CALL DTRMV( 'U', 'N', 'N', BW, $ WORK( 1 ), BW, $ WORK( BW*BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( BW*BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL DGESD2D( ICTXT, BW, 1, $ WORK( BW*BW+1 ), BW, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, BW*( BW+2 ) WORK( I ) = ZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL DCOPY( DL_P_M, B( 1 ), 1, $ WORK( BW*BW+1 ), 1) * CALL DTRMV( $ 'L', $ 'N', $ 'N', BW, $ A( 1 ), LLDA-1, $ WORK( BW*BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DL_P_M .GT. DL_P_N ) THEN DO 60 I=1, DL_P_M-DL_P_N WORK( BW*BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL DGESD2D( ICTXT, BW, 1, WORK(BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL DGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL DAXPY( BW, ONE, $ WORK( BW*BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL DGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL DAXPY( BW, ONE, $ WORK( BW*BW+1 ), 1, $ X( NUMROC_SIZE-BW+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PDBsBMV1 * END scalapack-2.0.2/TESTING/LIN/pdpotrrv.f000644 000766 000024 00000027365 10363532303 017512 0ustar00juliestaff000000 000000 SUBROUTINE PDPOTRRV( UPLO, N, A, IA, JA, DESCA, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDPOTRRV recomputes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from L or U * computed by PDPOTRF. The routine performs the Cholesky factorization * in reverse. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric distributed matrix sub( A ) is stored: * stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the factors L or U of the * distributed matrix sub( A ) from the Cholesky factorization. * On exit, the original distributed matrix sub( A ) is * restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK >= MB_A*NB_A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IL, J, JB, JL, JN, MYCOL, $ MYROW, NPCOL, NPROW * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PDLACPY, PDLASET, $ PDSYRK, PDTRMM, PB_TOPGET, PB_TOPSET * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL ICEIL, INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * UPPER = LSAME( UPLO, 'U' ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JL = MAX( ( ( JA+N-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) IL = MAX( ( ( IA+N-2 ) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) IAROW = INDXG2P( IL, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) * * Define array descriptor for working array WORK * CALL DESCSET( DESCW, DESCA( MB_ ), DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * IF ( UPPER ) THEN * * Compute A from the Cholesky factor U : A = U'*U. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'S-ring' ) * DO 10 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + U'*U * CALL PDSYRK( 'Upper', 'Transpose', JA+N-J-JB, JB, ONE, A, IL, $ J+JB, DESCA, ONE, A, IL+JB, J+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PDLACPY( 'All', JB, JB, A, IL, J, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict lower triangular part of diagonal block, to make * it U1. * CALL PDLASET( 'Lower', JB-1, JB, ZERO, ZERO, A, IL+1, J, $ DESCA ) * * Update the row panel U with the triangular matrix * CALL PDTRMM( 'Left', 'Upper', 'Transpose', 'Non-Unit', JB, $ N-J+JA, ONE, WORK, 1, 1, DESCW, A, IL, J, $ DESCA ) * * Restore the strict lower triangular part of diagonal block. * CALL PDLACPY( 'Lower', JB-1, JB, WORK, 2, 1, DESCW, A, $ IL+1, J, DESCA ) * IL = IL - DESCA( MB_ ) DESCW( RSRC_ ) = MOD( DESCW( RSRC_ ) + NPROW - 1, NPROW ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + U'*U * CALL PDSYRK( 'Upper', 'Transpose', N-JB, JB, ONE, A, IA, JA+JB, $ DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PDLACPY( 'All', JB, JB, A, IA, JA, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict lower triangular part of diagonal block, to make * it U1. * CALL PDLASET( 'Lower', JB-1, JB, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Update the row panel U with the triangular matrix * CALL PDTRMM( 'Left', 'Upper', 'Transpose', 'Non-Unit', JB, $ N, ONE, WORK, 1, 1, DESCW, A, IA, JA, DESCA ) * * Restore the strict lower triangular part of diagonal block. * CALL PDLACPY( 'Lower', JB-1, JB, WORK, 2, 1, DESCW, A, IA+1, $ JA, DESCA ) * ELSE * * Compute A from the Cholesky factor L : A = L*L'. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * DO 20 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + L*L' * CALL PDSYRK( 'Lower', 'No transpose', IA+N-IL-JB, JB, ONE, A, $ IL+JB, J, DESCA, ONE, A, IL+JB, J+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PDLACPY( 'All', JB, JB, A, IL, J, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict upper triangular part of diagonal block, to make * it L1. * CALL PDLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IL, J+1, $ DESCA ) * * Update the column panel L with the triangular matrix * CALL PDTRMM( 'Right', 'Lower', 'Transpose', 'Non-Unit', $ IA+N-IL, JB, ONE, WORK, 1, 1, DESCW, A, IL, $ J, DESCA ) * * Restore the strict upper triangular part of diagonal block. * CALL PDLACPY( 'Upper', JB, JB-1, WORK, 1, 2, DESCW, A, $ IL, J+1, DESCA ) * IL = IL - DESCA( MB_ ) DESCW( RSRC_ ) = MOD( DESCW( RSRC_ ) + NPROW - 1, NPROW ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 20 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + L*L' * CALL PDSYRK( 'Lower', 'No transpose', N-JB, JB, ONE, A, $ IA+JB, JA, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PDLACPY( 'All', JB, JB, A, IA, JA, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict upper triangular part of diagonal block, to make * it L1. * CALL PDLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA, JA+1, $ DESCA ) * * Update the column panel L with the triangular matrix * CALL PDTRMM( 'Right', 'Lower', 'Transpose', 'Non-Unit', N, JB, $ ONE, WORK, 1, 1, DESCW, A, IA, JA, DESCA ) * * Restore the strict upper triangular part of diagonal block. * CALL PDLACPY( 'Upper', JB, JB-1, WORK, 1, 2, DESCW, A, IA, $ JA+1, DESCA ) * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PDPOTRRV * END scalapack-2.0.2/TESTING/LIN/pdptdriver.f000644 000766 000024 00000074747 10363532303 020023 0ustar00juliestaff000000 000000 PROGRAM PDPTDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PDPTDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by DPT. This test driver performs an * A = L*L**T factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER DBLESZ, MEMSIZ, NTESTS DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0, ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BW, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, I, $ IAM, IASEED, IBSEED, ICTXT, ICTXTB, IERR_TEMP, $ IMIDPAD, INFO, INT_TEMP, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWVAL( NTESTS ), DESCA( 7 ), DESCA2D( DLEN_ ), $ DESCB( 7 ), DESCB2D( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), MEM( MEMSIZ ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDBMATGEN, $ PDCHEKPAD, PDFILLPAD, PDMATGEN, PDPTINFO, $ PDPTLASCHK, PDPTTRF, PDPTTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PDLANGE EXTERNAL LSAME, NUMROC, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PDPTINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NBW, $ BWVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, $ NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BW = 1 IF( BW.LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Band', 'bw', BW IERR( 1 ) = 1 END IF * IF( BW.GT.N-1 ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*INT_ONE-1)/NPCOL + 1 ) $ + INT_ONE NB = MAX( NB, 2*INT_ONE ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*INT_ONE, N ) ) THEN IERR( 1 ) = 1 ENDIF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (2), (2), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((2)+10) IMIDPAD = 10 IPOSTPAD = ((2)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, N, (2), $ NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((2)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + (NB+10)*(2) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (12*NPCOL + 3*NB) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 8*NPCOL * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((2)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PDLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PDPTLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,NB)+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PDBMATGEN( ICTXT, UPLO, 'T', BW, BW, N, (2), NB, $ MEM( IPA ), NB+10, 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) CALL PDFILLPAD( ICTXT, NQ, NP, MEM( IPA-IPREPAD ), $ NB+10, IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PDLANGE( 'I', N, $ (2), MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', NQ, NP, $ MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( LSAME( UPLO, 'L' ) ) THEN INT_TEMP = 0 ELSE INT_TEMP = DESCA2D( LLD_ ) ENDIF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PDPTTRF( N, MEM( IPA+INT_TEMP ), $ MEM( IPA+1*( NB+10-INT_TEMP ) ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PDPTTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PDCHEKPAD( ICTXT, 'PDPTTRF', NQ, $ NP, MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (10+2*MIN(100,NRHS))*NPCOL+4*NRHS * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PDMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PDFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PDPTTRS( N, NRHS, MEM( IPA+INT_TEMP ), $ MEM( IPA+1*( NB+10-INT_TEMP ) ), 1, $ DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PDPTTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDPTTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * * Reset descriptor describing A to 1-by-P grid for * use in banded utility routines * CALL DESCINIT( DESCA2D, (2), N, $ (2), NB, 0, 0, $ ICTXT, (2), IERR( 1 ) ) CALL PDPTLASCHK( 'S', UPLO, N, BW, BW, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * NOPS = NOPS + DBLE(BW)*( -2.D0 / 3.D0+DBLE(BW)* $ ( -1.D0+DBLE(BW)*( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( 1.D0+DBLE(BW)*( 3.D0 / $ 2.D0+DBLE(BW)*( 1.D0 / 2.D0 ) ) ) NOPS = NOPS + DBLE(BW)*( -1.D0 / 6.D0+DBLE(BW) $ *( -1.D0 /2.D0+DBLE(BW) $ *( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( DBLE(BW) / $ 2.D0*( 1.D0+DBLE(BW) ) ) * NOPS = NOPS + $ DBLE(NRHS)*( ( 2*DBLE(N)-DBLE(BW) )* $ ( DBLE(BW)+1.D0 ) )+ DBLE(NRHS)* $ ( DBLE(BW)*( 2*DBLE(N)- $ ( DBLE(BW)+1.D0 ) ) ) * * * Second calc to represent actual hardware speed * * NB bw^2 flops for LLt factorization in 1st proc * NOPS2 = ( (DBLE(N_FIRST))* DBLE(BW)**2 ) * IF ( NPROCS_REAL .GT. 1) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in last processor * NOPS2 = NOPS2 + $ 4*( (DBLE(N_LAST)*DBLE(BW)**2) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 4*( (DBLE(NB)*DBLE(BW)**2) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ ( NPROCS_REAL-1 ) * ( BW*BW*BW/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ ( NPROCS_REAL-2 ) * ( 2 * BW*BW*BW ) ENDIF * * * nrhs * 4 n_first*bw flops for LLt solve in proc 1. * NOPS2 = NOPS2 + $ ( 4.0D+0*(DBLE(N_FIRST)*DBLE(BW))*DBLE(NRHS) ) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*4 n_last*bw flops for LLt solve in last. * NOPS2 = NOPS2 + $ 2*( 4.0D+0*(DBLE(N_LAST)*DBLE(BW))*DBLE(NRHS) ) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2 * nrhs * 4 NB*bw flops for LLt solve in others. * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( 4.0D+0*(DBLE(NB)*DBLE(BW))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1 ) * ( BW*BW ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * ( 3 * BW*BW ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UL N BW NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- ---- ----- -- ---- -------- ', $ '-------- ------ ------ ------' ) 9993 FORMAT( A4, 2X, A1, 1X, I6, 1X, I3, 1X, I4, 1X, $ I5, 1X, I2, 1X, $ I4, 1X, F8.3, F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PDPTTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pdptinfo.f000644 000766 000024 00000044745 10363532303 017456 0ustar00juliestaff000000 000000 SUBROUTINE PDPTINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NBW, $ BWVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PDPTINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of subdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW NBW = 1 IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( UPLO, 'L' ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data represents ''Upper'// $ ''' or ''Lower'' triangular portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'TRANS : Whether solve is to be done with'// $ ' ''Transpose'' of matrix A (T,C) or not (N).' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bw : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bw ', ( BWVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDPTINFO * END scalapack-2.0.2/TESTING/LIN/pdptlaschk.f000644 000766 000024 00000027436 10363532303 017766 0ustar00juliestaff000000 000000 SUBROUTINE PDPTLASCHK( SYMM, UPLO, N, BWL, BWU, NRHS, X, IX, JX, $ DESCX, IASEED, A, IA, JA, DESCA, IBSEED, $ ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PDPTLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'S', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,NB)+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,NB)+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER I, START INTEGER BW, INFO, IPPRODUCT, WORK_MIN DOUBLE PRECISION DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, DGEBR2D, $ DGEBS2D, DGEMM, DGERV2D, DGESD2D, $ DGSUM2D, DLASET, PBDTRAN, PDMATGEN * .. * .. External Functions .. INTEGER IDAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IDAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'S' ) ) THEN BW = BWL START = 1 WORK_MIN = MAX(5,NB)+2*NB ELSE BW = MAX(BWL, BWU) IF( LSAME( UPLO, 'D' )) THEN START = 1 ELSE START = 2 ENDIF WORK_MIN = MAX(5,NB)+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PDTLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (2), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'S' )) THEN CALL PDBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PDBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF IF( LSAME( UPLO, 'U' ) ) THEN * * * Matrix formed above has the diagonals shifted from what was * input to the tridiagonal routine. Shift them back. * * Send elements to neighboring processors * IF( MYCOL.LT.NPCOL-1 ) THEN CALL DGESD2D( ICTXT, 1, 1, $ A( START+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * * Shift local elements * DO 230 I=DESCA( NB_ )-1,0,-1 A( START+(I+1)*LDA ) = A( START+(I)*LDA ) 230 CONTINUE * * Receive elements from neighboring processors * IF( MYCOL.GT.0 ) THEN CALL DGERV2D( ICTXT, 1, 1, A( START), LDA, $ MYROW, MYCOL-1 ) ENDIF * ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PDPBDCMV( BW+1, BW, UPLO, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), (BW+2)*BW, INFO ) * * * Regenerate column of B * CALL PDMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PDAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PDNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PDNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PDTLASCHK * END scalapack-2.0.2/TESTING/LIN/pdqrdriver.f000644 000766 000024 00000121657 10363532303 020013 0ustar00juliestaff000000 000000 PROGRAM PDQRDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * Purpose * ======= * * PDQRDRIVER is the main test program for the DOUBLE PRECISION * SCALAPACK QR factorization routines. This test driver performs a QR * QL, LQ, RQ, QP (QR factorization with column pivoting) or TZ * (complete orthogonal factorization) factorization and checks the * results. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 16 lines: * 'ScaLAPACK QR factorizations input file' * 'PVM machine' * 'QR.out' output file name (if any) * 6 device out * 6 number of factorizations * 'QR' 'QL' 'LQ' 'RQ' 'QP' 'TZ' factorization: QR, QL, LQ, RQ, QP, TZ * 4 number of problems sizes * 55 17 31 201 values of M * 5 71 31 201 values of N * 3 number of MB's and NB's * 4 3 5 values of MB * 4 7 3 values of NB * 7 number of process grids (ordered P & Q) * 1 2 1 4 2 3 8 values of P * 7 2 4 1 3 2 1 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, INTGSZ, MEMSIZ, NTESTS, TOTMEM DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, INTGSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0 ) * .. * .. Local Scalars .. CHARACTER*2 FACT CHARACTER*6 PASSED CHARACTER*7 ROUT CHARACTER*8 ROUTCHK CHARACTER*80 OUTFILE LOGICAL CHECK INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPPIV, IPREPAD, IPTAU, IPW, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, L, LIPIV, LTAU, $ LWORK, M, MAXMN, MB, MINMN, MNP, MNQ, MP, $ MYCOL, MYROW, N, NB, NFACT, NGRIDS, NMAT, NNB, $ NOUT, NPCOL, NPROCS, NPROW, NQ, WORKFCT, $ WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Arrays .. CHARACTER*2 FACTOR( NTESTS ) INTEGER DESCA( DLEN_ ), IERR( 1 ), MBVAL( NTESTS ), $ MVAL( NTESTS ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), MEM( MEMSIZ ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDCHEKPAD, $ PDFILLPAD, PDGELQF, PDGELQRV, $ PDGEQLF, PDGEQLRV, PDGEQPF, $ PDQPPIV, PDGEQRF, PDGEQRRV, $ PDGERQF, PDGERQRV, PDTZRZRV, $ PDMATGEN, PDLAFCHK, PDQRINFO, $ PDTZRZF, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLANGE EXTERNAL ICEIL, LSAMEN, NUMROC, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP /4*0/ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PDQRINFO( OUTFILE, NOUT, NFACT, FACTOR, NTESTS, NMAT, MVAL, $ NTESTS, NVAL, NTESTS, NNB, MBVAL, NTESTS, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Loop over the different factorization types * DO 40 I = 1, NFACT * FACT = FACTOR( I ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) IF( LSAMEN( 2, FACT, 'QR' ) ) THEN ROUT = 'PDGEQRF' ROUTCHK = 'PDGEQRRV' WRITE( NOUT, FMT = 9986 ) $ 'QR factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN ROUT = 'PDGEQLF' ROUTCHK = 'PDGEQLRV' WRITE( NOUT, FMT = 9986 ) $ 'QL factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN ROUT = 'PDGELQF' ROUTCHK = 'PDGELQRV' WRITE( NOUT, FMT = 9986 ) $ 'LQ factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN ROUT = 'PDGERQF' ROUTCHK = 'PDGERQRV' WRITE( NOUT, FMT = 9986 ) $ 'RQ factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN ROUT = 'PDGEQPF' ROUTCHK = 'PDGEQRRV' WRITE( NOUT, FMT = 9986 ) $ 'QR factorization with column pivoting tests.' ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN ROUT = 'PDTZRZF' ROUTCHK = 'PDTZRZRV' WRITE( NOUT, FMT = 9986 ) $ 'Complete orthogonal factorization tests.' END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 J = 1, NGRIDS * NPROW = PVAL( J ) NPCOL = QVAL( J ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 K = 1, NMAT * M = MVAL( K ) N = NVAL( K ) * * Make sure matrix information is correct * IERR(1) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 L = 1, NNB * MB = MBVAL( L ) NB = NBVAL( L ) * * Make sure mb is legal * IERR( 1 ) = 0 IF( MB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MB', 'MB', MB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * MP = NUMROC( M, MB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) MNP = NUMROC( MIN( M, N ), MB, MYROW, 0, NPROW ) MNQ = NUMROC( MIN( M, N ), NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( MB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, MB, NB, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for ScaLAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IPTAU = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + IPREPAD * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN * LTAU = MNQ IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QR * factorization * LWORK = DESCA( NB_ ) * ( MP + NQ + DESCA( NB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PDLAFCHK, PDGEQRRV and * PDLANGE * WORKSIZ = LWORK + MP*DESCA( NB_ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN * LTAU = NQ IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QL * factorization * LWORK = DESCA( NB_ ) * ( MP + NQ + DESCA( NB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PDLAFCHK, PDGEQLRV and * PDLANGE * WORKSIZ = LWORK + MP*DESCA( NB_ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN * LTAU = MNP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the LQ * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PDLAFCHK, PDGELQRV and * PDLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN * LTAU = MP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QR * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PDLAFCHK, PDGERQRV and * PDLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * LTAU = MNQ IPPIV = IPTAU + LTAU + IPOSTPAD + IPREPAD LIPIV = ICEIL( INTGSZ*NQ, DBLESZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the * factorization i.e from IPW on. * LWORK = MAX( 3, MP + MAX( 1, NQ ) ) + 2 * NQ WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PDLAFCHK, PDGEQRRV, * PDLANGE. * WORKSIZ = MAX( WORKSIZ - IPOSTPAD, $ DESCA( NB_ )*( 2*MP + NQ + DESCA( NB_ ) ) ) + $ IPOSTPAD END IF * ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN * LTAU = MP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the TZ * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PDLAFCHK, PDTZRZRV and * PDLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ FACT // ' factorization', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PDMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need the Infinity of A for checking * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PDFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PDFILLPAD( ICTXT, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANGE( 'I', M, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKFCT-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKFCT-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform QR factorizations * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN CALL SLTIMER( 1 ) CALL PDGEQRF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN CALL SLTIMER( 1 ) CALL PDGEQLF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN CALL SLTIMER( 1 ) CALL PDGELQF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN CALL SLTIMER( 1 ) CALL PDGERQF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL SLTIMER( 1 ) CALL PDGEQPF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPTAU ), $ MEM( IPW ), LWORK, INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN CALL SLTIMER( 1 ) IF( N.GE.M ) $ CALL PDTZRZF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PDCHEKPAD( ICTXT, ROUT, MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, ROUT, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PDCHEKPAD( ICTXT, ROUT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PDCHEKPAD( ICTXT, ROUT, WORKFCT-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKFCT-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN * * Compute residual = ||A-Q*R|| / (||A||*N*eps) * CALL PDGEQRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PDLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN * * Compute residual = ||A-Q*L|| / (||A||*N*eps) * CALL PDGEQLRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PDLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN * * Compute residual = ||A-L*Q|| / (||A||*N*eps) * CALL PDGELQRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PDLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN * * Compute residual = ||A-R*Q|| / (||A||*N*eps) * CALL PDGERQRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PDLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * * Compute residual = ||AP-Q*R|| / (||A||*N*eps) * CALL PDGEQRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN * * Compute residual = ||A-T*Z|| / (||A||*N*eps) * IF( N.GE.M ) THEN CALL PDTZRZRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) END IF CALL PDLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) END IF * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, ROUTCHK, MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, ROUTCHK, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, ROUTCHK, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * CALL PDQPPIV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDQPPIV', MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDQPPIV', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PDLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDLAFCHK', MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLAFCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Test residual and detect NaN result * IF( LSAMEN( 2, FACT, 'TZ' ) .AND. N.LT.M ) THEN KSKIP = KSKIP + 1 PASSED = 'BYPASS' ELSE IF( FRESID.LE.THRESH .AND. $ (FRESID-FRESID).EQ.0.0D+0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * MINMN = MIN( M, N ) MAXMN = MAX( M, N ) * IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN IF( M.GE.N ) THEN NOPS = 0.0D+0 ELSE * * 5/2 ( M^2 N - M^3 ) + 5/2 N M + 1/2 M^2 for * complete orthogonal factorization (M <= N). * NOPS = ( 5.0D+0 * ( $ DBLE( N )*( DBLE( M )**2 ) - $ DBLE( M )**3 + $ DBLE( N )*DBLE( M ) ) + $ DBLE( M )**2 ) / 2.0D+0 END IF * ELSE * * 2 M N^2 - 2/3 N^2 + M N + N^2 for QR type * factorization when M >= N. * NOPS = 2.0D+0 * ( DBLE( MINMN )**2 ) * $ ( DBLE( MAXMN )-DBLE( MINMN ) / 3.0D+0 ) + $ ( DBLE( MAXMN )+DBLE( MINMN ) )*DBLE( MINMN ) END IF * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, MB, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, $ PASSED, FRESID * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, MB, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, $ PASSED, FRESID * END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * 40 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N MB NB P Q Fact Time ', $ ' MFLOPS CHECK Residual' ) 9994 FORMAT( '---- ------ ------ --- --- ----- ----- --------- ', $ '----------- ------ --------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I3, 1X, I3, 1X, I5, 1X, I5, 1X, $ F9.2, 1X, F11.2, 1X, A6, 2X, G8.1 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( A ) * STOP * * End of PDQRDRIVER * END * SUBROUTINE PDQPPIV( M, N, A, IA, JA, DESCA, IPIV ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDQPPIV applies to sub( A ) = A(IA:IA+M-1,JA:JA+N-1) the pivots * returned by PDGEQPF in reverse order for checking purposes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be permuted. On exit, the local pieces * of the distributed permuted submatrix sub( A ) * Inv( P ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension LOCc(JA+N-1). * On exit, if IPIV(I) = K, the local i-th column of sub( A )*P * was the global K-th column of sub( A ). IPIV is tied to the * distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IACOL, ICOFFA, ICTXT, IITMP, IPVT, IPCOL, $ IPROW, ITMP, J, JJ, JJA, KK, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, IGERV2D, $ IGESD2D, IGAMN2D, INFOG1L, PDSWAP * .. * .. External Functions .. INTEGER INDXL2G, NUMROC EXTERNAL INDXL2G, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG1L( JA, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), JJA, $ IACOL ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * DO 20 J = JA, JA+N-2 * IPVT = JA+N-1 ITMP = JA+N * * Find first the local minimum candidate for pivoting * CALL INFOG1L( J, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), $ JJ, IACOL ) DO 10 KK = JJ, JJA+NQ-1 IF( IPIV( KK ).LT.IPVT )THEN IITMP = KK IPVT = IPIV( KK ) END IF 10 CONTINUE * * Find the global minimum pivot * CALL IGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, IPVT, 1, IPROW, $ IPCOL, 1, -1, MYCOL ) * * Broadcast the corresponding index to the other process columns * IF( MYCOL.EQ.IPCOL ) THEN ITMP = INDXL2G( IITMP, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL IGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ITMP, 1 ) IF( IPCOL.NE.IACOL ) THEN CALL IGERV2D( ICTXT, 1, 1, IPIV( IITMP ), 1, MYROW, $ IACOL ) ELSE IF( MYCOL.EQ.IACOL ) $ IPIV( IITMP ) = IPIV( JJ ) END IF ELSE CALL IGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ITMP, 1, MYROW, $ IPCOL ) IF( MYCOL.EQ.IACOL .AND. IPCOL.NE.IACOL ) $ CALL IGESD2D( ICTXT, 1, 1, IPIV( JJ ), 1, MYROW, IPCOL ) END IF * * Swap the columns of A * CALL PDSWAP( M, A, IA, ITMP, DESCA, 1, A, IA, J, DESCA, 1 ) * 20 CONTINUE * * End of PDQPPIV * END scalapack-2.0.2/TESTING/LIN/pdqrinfo.f000644 000766 000024 00000042542 10363532303 017446 0ustar00juliestaff000000 000000 SUBROUTINE PDQRINFO( SUMMRY, NOUT, NFACT, FACTOR, LDFACT, NMAT, $ MVAL, LDMVAL, NVAL, LDNVAL, NNB, MBVAL, $ LDMBVAL, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDFACT, LDMBVAL, LDMVAL, LDNBVAL, LDNVAL, $ LDPVAL, LDQVAL, NFACT, NGRIDS, NMAT, NNB, $ NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. CHARACTER*2 FACTOR( LDFACT ) CHARACTER*(*) SUMMRY INTEGER MBVAL( LDMBVAL ), MVAL( LDMVAL ), $ NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PDQRINFO gets needed startup information for the QR factoriza- * tion routines and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NFACT (global output) INTEGER * The number of different factorization types to be tested. * * FACTOR (global output) CHARACTER*2 array of dimension of LDFACT, * The factorization types to be tested: * if FACTOR(i) = 'QR' then QR factorization, * if FACTOR(i) = 'QL' then QL factorization, * if FACTOR(i) = 'LQ' then LQ factorization, * if FACTOR(i) = 'RQ' then RQ factorization, * if FACTOR(i) = 'QP' then QR factorization with column * pivoting. * if FACTOR(i) = 'TZ' then complete orthogonal factorization. * * LDFACT (global input) INTEGER * The maximum number of different factorization types to be * tested. LDFACT >= NFACT. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * MVAL (global output) INTEGER array of dimension (LDNVAL), the * values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M, LDNVAL > = NMAT. * * NVAL (global output) INTEGER array of dimension (LDNVAL), the * values of N (number of columns in matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for MB and * NB. * * MBVAL (global output) INTEGER array of dimension (LDMBVAL), the * values of MB (row blocksize) to run the code with. * * LDMBVAL (global input) INTEGER * The maximum number of different values that can be used for * MB, LDMBVAL >= NNB. * * NBVAL (global output) INTEGER array of dimension (LDNBVAL), the * values of NB (column blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array of dimension (LDPVAL), the * values of P (number of process rows) to run the code with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array of dimension (LDQVAL), the * values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 4, LDFACT+LDMVAL+LDNVAL+LDMBVAL+LDNBVAL+LDPVAL+LDQVAL ) * used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT, K DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAMEN DOUBLE PRECISION PDLAMCH EXTERNAL LSAMEN, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='QR.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get the matrix types to be tested * READ( NIN, FMT = * ) NFACT IF( NFACT.LT.1 .OR. NFACT.GT.LDFACT ) THEN WRITE( NOUT, FMT = 9994 ) 'nb of factorization', LDFACT GO TO 40 END IF READ( NIN, FMT = * ) ( FACTOR( I ), I = 1, NFACT ) * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 40 ELSE IF( NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'M', LDMVAL GO TO 40 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDMBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'MB', LDMBVAL GO TO 40 ELSE IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 40 END IF READ( NIN, FMT = * ) ( MBVAL( I ), I = 1, NNB ) READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 40 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 40 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I ) * QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS WORK( 4 ) = NFACT CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 DO 20 K = 1, NFACT IF( LSAMEN( 2, FACTOR( K ), 'QR' ) ) THEN WORK( I ) = 1 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'QL' ) ) THEN WORK( I ) = 2 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'LQ' ) ) THEN WORK( I ) = 3 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'RQ' ) ) THEN WORK( I ) = 4 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'QP' ) ) THEN WORK( I ) = 5 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'TZ' ) ) THEN WORK( I ) = 6 I = I + 1 END IF 20 CONTINUE * CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, MBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK QR factorizations routines.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision QR factorizations '// $ 'routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' || A - QR || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - QL || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - LQ || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - RQ || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - QRP || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - TZ || / (|| A || * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'MB : The row blocksize of the blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The column blocksize of the blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 ) $ 'Fact Time: Time in seconds to factor the'// $ ' matrix.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Execution rate of the '// $ 'factorization.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'MB ', ( MBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) NFACT = WORK( 4 ) * I = NFACT + 2*NMAT + 2*NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * DO 30 K = 1, NFACT IF( WORK( K ).EQ.1 ) THEN FACTOR( K ) = 'QR' ELSE IF( WORK( K ).EQ.2 ) THEN FACTOR( K ) = 'QL' ELSE IF( WORK( K ).EQ.3 ) THEN FACTOR( K ) = 'LQ' ELSE IF( WORK( K ).EQ.4 ) THEN FACTOR( K ) = 'RQ' ELSE IF( WORK( K ).EQ.5 ) THEN FACTOR( K ) = 'QP' ELSE IF( WORK( K ).EQ.6 ) THEN FACTOR( K ) = 'TZ' END IF 30 CONTINUE * I = NFACT + 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, MBVAL, 1 ) I = I + NNB CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 40 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDQRINFO * END scalapack-2.0.2/TESTING/LIN/pdqrt13.f000644 000766 000024 00000020500 10363532303 017110 0ustar00juliestaff000000 000000 SUBROUTINE PDQRT13( SCALE, M, N, A, IA, JA, DESCA, NORMA, ISEED, $ WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, ISEED, JA, M, N, SCALE DOUBLE PRECISION NORMA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION WORK( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDQRT13 generates a full-rank matrix that may be scaled to have * large or small norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SCALE (global input) INTEGER * SCALE = 1: normally scaled matrix * SCALE = 2: matrix scaled up * SCALE = 3: matrix scaled down * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * NORMA (global output) DOUBLE PRECISION * The one-norm of A. * * ISEED (global input/global output) INTEGER * Seed for random number generator. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK >= Nq0, where * * ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), and * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, IIA, INFO, $ IROFFA, J, JJA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ DOUBLE PRECISION AJJ, ASUM, BIGNUM, SMLNUM * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL NUMROC, PDLAMCH, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PDLABAD, PDLASCL, $ PDMATGEN, PDASUM, PDELGET, PDELSET * .. * .. Intrinsic Functions .. INTRINSIC MOD, SIGN * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * generate the matrix * IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) MP = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * CALL PDMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), A, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED, IIA-1, MP, $ JJA-1, NQ, MYROW, MYCOL, NPROW, NPCOL ) * DO 10 J = JA, JA+N-1 I = IA + J - JA IF( I.LE.IA+M-1 ) THEN CALL PDASUM( M, ASUM, A, IA, J, DESCA, 1 ) CALL PDELGET( 'Column', ' ', AJJ, A, I, J, DESCA ) AJJ = AJJ + SIGN( ASUM, AJJ ) CALL PDELSET( A, I, J, DESCA, AJJ ) END IF 10 CONTINUE * * scaled versions * IF( SCALE.NE.1 ) THEN * NORMA = PDLANGE( 'M', M, N, A, IA, JA, DESCA, WORK ) SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM CALL PDLABAD( ICTXT, SMLNUM, BIGNUM ) SMLNUM = SMLNUM / PDLAMCH( ICTXT, 'Epsilon' ) BIGNUM = ONE / SMLNUM * IF( SCALE.EQ.2 ) THEN * * matrix scaled up * CALL PDLASCL( 'General', NORMA, BIGNUM, M, N, A, IA, $ JA, DESCA, INFO ) * ELSE IF( SCALE.EQ.3 ) THEN * * matrix scaled down * CALL PDLASCL( 'General', NORMA, SMLNUM, M, N, A, IA, $ JA, DESCA, INFO ) * END IF * END IF * NORMA = PDLANGE( 'One-norm', M, N, A, IA, JA, DESCA, WORK ) * RETURN * * End of PDQRT13 * END scalapack-2.0.2/TESTING/LIN/pdqrt14.f000644 000766 000024 00000033620 10363532303 017120 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PDQRT14( TRANS, M, N, NRHS, A, IA, JA, $ DESCA, X, IX, JX, DESCX, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IX, JA, JX, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION A( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PDQRT14 checks whether sub( X ) is in the row space of sub( A ) or * sub( A )', where sub( A ) denotes A( IA:IA+M-1, JA:JA+N-1 ) and * sub( X ) denotes X( IX:IX+N-1, JX:JX+NRHS-1 ) if TRANS = 'N', and * X( IX:IX+N-1, JX:JX+NRHS-1 ) otherwise. It does so by scaling both * sub( X ) and sub( A ) such that their norms are in the range * [sqrt(eps), 1/sqrt(eps)], then computing an LQ factorization of * [sub( A )',sub( X )]' (if TRANS = 'N') or a QR factorization of * [sub( A ),sub( X )] otherwise, and returning the norm of the trailing * triangle, scaled by MAX(M,N,NRHS)*eps. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * = 'N': No transpose, check for sub( X ) in the row space of * sub( A ), * = 'T': Transpose, check for sub( X ) in row space of * sub( A )'. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( X ). NRHS >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)). This array * contains the local pieces of the M-by-N distributed matrix * sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) DOUBLE PRECISION pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). * On entry, this array contains the local pieces of the * N-by-NRHS distributed submatrix sub( X ) if TRANS = 'N', * and the M-by-NRHS distributed submatrix sub( X ) otherwise. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * If TRANS='N', LWORK >= MNRHSP * NQ + LTAU + LWF and * LWORK >= MP * NNRHSQ + LTAU + LWF otherwise, where * * IF TRANS='N', (LQ fact) * MNRHSP = NUMROC( M+NRHS+IROFFA, MB_A, MYROW, IAROW, * NPROW ) * LTAU = NUMROC( IA+MIN( M+NRHS, N )-1, MB_A, MYROW, * RSRC_A, NPROW ) * LWF = MB_A * ( MB_A + MNRHSP + NQ0 ) * ELSE (QR fact) * NNRHSQ = NUMROC( N+NRHS+ICOFFA, NB_A, MYCOL, IACOL, * NPCOL ) * LTAU = NUMROC( JA+MIN( M, N+NRHS )-1, NB_A, MYCOL, * CSRC_A, NPCOL ) * LWF = NB_A * ( NB_A + MP0 + NNRHSQ ) * END IF * * and, * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MP0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NQ0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL TPSD INTEGER IACOL, IAROW, ICOFFA, ICTXT, IDUM, IIA, INFO, $ IPTAU, IPW, IPWA, IROFFA, IWA, IWX, J, JJA, $ JWA, JWX, LDW, LWORK, MPWA, MPW, MQW, MYCOL, $ MYROW, NPCOL, NPROW, NPW, NQWA, NQW DOUBLE PRECISION AMAX, ANRM, ERR, XNRM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) DOUBLE PRECISION RWORK( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLANGE, PDLAMCH EXTERNAL LSAME, NUMROC, PDLANGE, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, DGAMX2D, INFOG2L, $ PDAMAX, PDCOPY, PDGELQF, PDGEQRF, $ PDLACPY, PDLASCL, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * PDQRT14 = ZERO * IPWA = 1 IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IWA = IROFFA + 1 JWA = ICOFFA + 1 CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) MPWA = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQWA = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * INFO = 0 IF( LSAME( TRANS, 'N' ) ) THEN IF( N.LE.0 .OR. NRHS.LE.0 ) $ RETURN TPSD = .FALSE. MPW = NUMROC( M+NRHS+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) NQW = NQWA * * Assign descriptor DESCW for workspace WORK and pointers to * matrices sub( A ) and sub( X ) in workspace * IWX = IWA + M JWX = JWA LDW = MAX( 1, MPW ) CALL DESCSET( DESCW, M+NRHS+IROFFA, N+ICOFFA, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, LDW ) * ELSE IF( LSAME( TRANS, 'T' ) ) THEN IF( M.LE.0 .OR. NRHS.LE.0 ) $ RETURN TPSD = .TRUE. MPW = MPWA NQW = NUMROC( N+NRHS+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) * * Assign descriptor DESCW for workspace WORK and pointers to * matrices sub( A ) and sub( X ) in workspace * IWX = IWA JWX = JWA + N LDW = MAX( 1, MPW ) CALL DESCSET( DESCW, M+IROFFA, N+NRHS+ICOFFA, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, LDW ) ELSE CALL PXERBLA( ICTXT, 'PDQRT14', -1 ) RETURN END IF * * Copy and scale sub( A ) * IPTAU = IPWA + MPW*NQW CALL PDLACPY( 'All', M, N, A, IA, JA, DESCA, WORK( IPWA ), IWA, $ JWA, DESCW ) RWORK( 1 ) = ZERO ANRM = PDLANGE( 'M', M, N, WORK( IPWA ), IWA, JWA, DESCW, RWORK ) IF( ANRM.NE.ZERO ) $ CALL PDLASCL( 'G', ANRM, ONE, M, N, WORK( IPWA ), IWA, $ JWA, DESCW, INFO ) * * Copy sub( X ) or sub( X )' into the right place and scale it * IF( TPSD ) THEN * * Copy sub( X ) into columns jwa+n:jwa+n+nrhs-1 of work * DO 10 J = 1, NRHS CALL PDCOPY( M, X, IX, JX+J-1, DESCX, 1, WORK( IPWA ), IWX, $ JWX+J-1, DESCW, 1 ) 10 CONTINUE XNRM = PDLANGE( 'M', M, NRHS, WORK( IPWA ), IWX, JWX, DESCW, $ RWORK ) IF( XNRM.NE.ZERO ) $ CALL PDLASCL( 'G', XNRM, ONE, M, NRHS, WORK( IPWA ), IWX, $ JWX, DESCW, INFO ) * * Compute QR factorization of work(iwa:iwa+m-1,jwa:jwa+n+nrhs-1) * MQW = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPW = IPTAU + MIN( MQW, NQW ) LWORK = DESCW( NB_ ) * ( MPW + NQW + DESCW( NB_ ) ) CALL PDGEQRF( M, N+NRHS, WORK( IPWA ), IWA, JWA, DESCW, $ WORK( IPTAU ), WORK( IPW ), LWORK, INFO ) * * Compute largest entry in upper triangle of * work(iwa+n:iwa+m-1,jwa+n:jwa+n+nrhs-1) * ERR = ZERO IF( N.LT.M ) THEN DO 20 J = JWX, JWA+N+NRHS-1 CALL PDAMAX( MIN(M-N,J-JWX+1), AMAX, IDUM, WORK( IPWA ), $ IWA+N, J, DESCW, 1 ) ERR = MAX( ERR, ABS( AMAX ) ) 20 CONTINUE END IF CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, IDUM1, IDUM2, $ -1, -1, 0 ) * ELSE * * Copy sub( X )' into rows iwa+m:iwa+m+nrhs-1 of work * DO 30 J = 1, NRHS CALL PDCOPY( N, X, IX, JX+J-1, DESCX, 1, WORK( IPWA ), $ IWX+J-1, JWX, DESCW, DESCW( M_ ) ) 30 CONTINUE * XNRM = PDLANGE( 'M', NRHS, N, WORK( IPWA ), IWX, JWX, DESCW, $ RWORK ) IF( XNRM.NE.ZERO ) $ CALL PDLASCL( 'G', XNRM, ONE, NRHS, N, WORK( IPWA ), IWX, $ JWX, DESCW, INFO ) * * Compute LQ factorization of work(iwa:iwa+m+nrhs-1,jwa:jwa+n-1) * NPW = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPW = IPTAU + MIN( MPW, NPW ) LWORK = DESCW( MB_ ) * ( MPW + NQW + DESCW( MB_ ) ) CALL PDGELQF( M+NRHS, N, WORK( IPWA ), IWA, JWA, DESCW, $ WORK( IPTAU ), WORK( IPW ), LWORK, INFO ) * * Compute largest entry in lower triangle in * work(iwa+m:iwa+m+nrhs-1,jwa+m:jwa+n-1) * ERR = ZERO DO 40 J = JWA+M, MIN( JWA+N-1, JWA+M+NRHS-1 ) CALL PDAMAX( JWA+M+NRHS-J, AMAX, IDUM, WORK( IPWA ), $ IWX+J-JWA-M, J, DESCW, 1 ) ERR = MAX( ERR, ABS( AMAX ) ) 40 CONTINUE CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, IDUM1, IDUM2, $ -1, -1, 0 ) * END IF * PDQRT14 = ERR / ( DBLE( MAX( M, N, NRHS ) ) * $ PDLAMCH( ICTXT, 'Epsilon' ) ) * RETURN * * End of PDQRT14 * END scalapack-2.0.2/TESTING/LIN/pdqrt16.f000644 000766 000024 00000025225 11622500733 017125 0ustar00juliestaff000000 000000 SUBROUTINE PDQRT16( TRANS, M, N, NRHS, A, IA, JA, DESCA, X, IX, $ JX, DESCX, B, IB, JB, DESCB, RWORK, RESID ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, IX, JA, JB, JX, M, N, NRHS DOUBLE PRECISION RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) DOUBLE PRECISION A( * ), B( * ), RWORK( * ), X( * ) * .. * * Purpose * ======= * * PDQRT16 computes the residual for a solution of a system of linear * equations sub( A )*sub( X ) = B or sub( A' )*sub( X ) = B: * RESID = norm(B - sub( A )*sub( X ) ) / * ( max(m,n) * norm(sub( A ) ) * norm(sub( X ) ) * EPS ), * where EPS is the machine epsilon, sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), and sub( X ) denotes * X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations: * = 'N': sub( A )*sub( X ) = sub( B ) * = 'T': sub( A' )*sub( X )= sub( B ), where A' is the * transpose of sub( A ). * = 'C': sub( A' )*sub( X )= B, where A' is the transpose * of sub( A ). * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) DOUBLE PRECISION pointer into the local * memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * The original M x N matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) DOUBLE PRECISION pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). This * array contains the local pieces of the computed solution * distributed vectors for the system of linear equations. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * B (local input/local output) DOUBLE PRECISION pointer into * the local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the distributes right hand side vectors for * the system of linear equations. On exit, sub( B ) is over- * written with the difference sub( B ) - sub( A )*sub( X ) or * sub( B ) - sub( A )'*sub( X ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * LWORK >= Nq0 if TRANS = 'N', and LRWORK >= Mp0 otherwise. * * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * RESID (global output) DOUBLE PRECISION * The maximum over the number of right hand sides of * norm( sub( B )- sub( A )*sub( X ) ) / * ( max(m,n) * norm( sub( A ) ) * norm( sub( X ) ) * EPS ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER ICTXT, IDUMM, J, MYCOL, MYROW, N1, N2, NPCOL, $ NPROW DOUBLE PRECISION ANORM, BNORM, EPS, XNORM * .. * .. Local Arrays .. DOUBLE PRECISION TEMP( 2 ) * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL LSAME, PDLAMCH, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PDASUM, PDGEMM * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick exit if M = 0 or N = 0 or NRHS = 0 * IF( M.LE.0 .OR. N.LE.0 .OR. NRHS.EQ.0 ) THEN RESID = ZERO RETURN END IF * IF( LSAME( TRANS, 'T' ) .OR. LSAME( TRANS, 'C' ) ) THEN ANORM = PDLANGE( 'I', M, N, A, IA, JA, DESCA, RWORK ) N1 = N N2 = M ELSE ANORM = PDLANGE( '1', M, N, A, IA, JA, DESCA, RWORK ) N1 = M N2 = N END IF * EPS = PDLAMCH( ICTXT, 'Epsilon' ) * * Compute B - sub( A )*sub( X ) (or B - sub( A' )*sub( X ) ) and * store in B. * CALL PDGEMM( TRANS, 'No transpose', N1, NRHS, N2, -ONE, A, IA, $ JA, DESCA, X, IX, JX, DESCX, ONE, B, IB, JB, DESCB ) * * Compute the maximum over the number of right hand sides of * norm( sub( B ) - sub( A )*sub( X ) ) / * ( max(m,n) * norm( sub( A ) ) * norm( sub( X ) ) * EPS ). * RESID = ZERO DO 10 J = 1, NRHS * CALL PDASUM( N1, BNORM, B, IB, JB+J-1, DESCB, 1 ) CALL PDASUM( N2, XNORM, X, IX, JX+J-1, DESCX, 1 ) * * Only the process columns owning the vector operands will have * the correct result, the other will have zero. * TEMP( 1 ) = BNORM TEMP( 2 ) = XNORM IDUMM = 0 CALL DGAMX2D( ICTXT, 'All', ' ', 2, 1, TEMP, 2, IDUMM, IDUMM, $ -1, -1, IDUMM ) BNORM = TEMP( 1 ) XNORM = TEMP( 2 ) * * Every processes have ANORM, BNORM and XNORM now. * IF( ANORM.EQ.ZERO .AND. BNORM.EQ.ZERO ) THEN RESID = ZERO ELSE IF( ANORM.LE.ZERO .OR. XNORM.LE.ZERO ) THEN RESID = ONE / EPS ELSE RESID = MAX( RESID, ( ( BNORM / ANORM ) / XNORM ) / $ ( MAX( M, N )*EPS ) ) END IF * 10 CONTINUE * RETURN * * End of PDQRT16 * END scalapack-2.0.2/TESTING/LIN/pdqrt17.f000644 000766 000024 00000032752 10363532303 017130 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PDQRT17( TRANS, IRESID, M, N, NRHS, A, $ IA, JA, DESCA, X, IX, JX, $ DESCX, B, IB, JB, DESCB, WORK, $ RWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, IRESID, IX, JA, JB, JX, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) DOUBLE PRECISION A( * ), B( * ), WORK( * ), X( * ) DOUBLE PRECISION RWORK( * ) * .. * * Purpose * ======= * * PDQRT17 computes the ratio * * || R'*op( sub( A ) ) ||/(||sub( A )||*alpha*max(M,N,NRHS)*eps) * * where R = op( sub( A ) )*sub( X ) - B, op(A) is A or A', and * * alpha = ||B|| if IRESID = 1 (zero-residual problem) * alpha = ||R|| if IRESID = 2 (otherwise). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies whether or not the transpose of sub( A ) is used. * = 'N': No transpose, op( sub( A ) ) = sub( A ). * = 'T': Transpose, op( sub( A ) ) = sub( A' ). * * IRESID (global input) INTEGER * IRESID = 1 indicates zero-residual problem. * IRESID = 2 indicates non-zero residual. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * If TRANS = 'N', the number of rows of the distributed * submatrix sub( B ). * If TRANS = 'T', the number of rows of the distributed * submatrix sub( X ). * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * If TRANS = 'N', the number of rows of the distributed * submatrix sub( X ). Otherwise N is the number of rows of * the distributed submatrix sub( B ). * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrices sub( X ) and sub( B ). * NRHS >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed M-by-N * submatrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) DOUBLE PRECISION pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). * If TRANS = 'N', this array contains the local pieces of the * N-by-NRHS distributed submatrix sub( X ). Otherwise, this * array contains the local pieces of the M-by-NRHS distributed * submatrix sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * B (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_B,LOCc(JB+NRHS-1)). * If TRANS='N', this array contains the local pieces of the * distributed M-by-NRHS submatrix operand sub( B ). Otherwise, * this array contains the local pieces of the distributed * N-by-NRHS submatrix operand sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * If TRANS = 'N', LWORK >= Mp0 * NRHSq0 + NRHSp0 * Nq0 * otherwise LWORK >= Np0 * NRHSq0 + NRHSp0 * Mq0 * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * LRWORK >= Nq0, if TRANS = 'N', and LRWORK >= Mp0 otherwise. * * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Np0 = NUMROC( N+ICOFFA, NB_A, MYROW, IAROW, NPROW ), * Mq0 = NUMROC( M+IROFFA, NB_A, MYCOL, IACOL, NPCOL ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * NRHSp0 = NUMROC( NRHS+ICOFFB, NB_B, MYROW, IBROW, NPROW ), * NRHSq0 = NUMROC( NRHS+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. INTEGER IACOL, IBCOL, IBROW, ICOFFB, ICTXT, INFO, $ IOFFA, IROFFB, ISCL, IW, IW2, JW, JW2, MYCOL, $ NRHSQ, NRHSP, MYROW, NCOLS, NPCOL, NPROW, $ NROWS, NROWSP DOUBLE PRECISION ERR, NORMA, NORMB, NORMRS, NORMX, SMLNUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), DESCW2( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL INDXG2P, LSAME, NUMROC, PDLAMCH, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PDGEMM, PDLACPY, $ PDLASCL, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Executable Statements .. * PDQRT17 = ZERO * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 IF( LSAME( TRANS, 'N' ) ) THEN NROWS = M NCOLS = N IOFFA = MOD( JA-1, DESCA( NB_ ) ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN NROWS = N NCOLS = M IOFFA = MOD( IA-1, DESCA( MB_ ) ) ELSE CALL PXERBLA( ICTXT, 'PDQRT17', -1 ) RETURN END IF * IF( M.LE.0 .OR. N.LE.0 .OR. NRHS.LE.0 ) $ RETURN * IROFFB = MOD( IA-1, DESCA( MB_ ) ) ICOFFB = MOD( JA-1, DESCA( NB_ ) ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * NRHSQ = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) NRHSP = NUMROC( NRHS+IROFFB, DESCB( NB_ ), MYROW, IBROW, NPROW ) NROWSP = NUMROC( NROWS+IROFFB, DESCA( MB_ ), MYROW, IBROW, NPROW ) * * Assign array descriptor DESCW for workspace WORK, where DESCW * holds a copy of the distributed submatrix sub( B ) * CALL DESCSET( DESCW, NROWS+IROFFB, NRHS+ICOFFB, DESCB( MB_ ), $ DESCB( NB_ ), IBROW, IBCOL, ICTXT, MAX( 1, $ NROWSP ) ) * * Assign array descriptor DESCW2 for workspace WORK, where DESCW2 * holds a copy of the distributed submatrix sub( X**T ) * CALL DESCSET( DESCW2, NRHS+ICOFFB, NCOLS+IOFFA, DESCX( NB_ ), $ DESCX( MB_ ), IBROW, IACOL, ICTXT, MAX( 1, $ NRHSP ) ) * NORMA = PDLANGE( 'One-norm', M, N, A, IA, JA, DESCA, RWORK ) SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) SMLNUM = SMLNUM / PDLAMCH( ICTXT, 'Precision' ) ISCL = 0 * * compute residual and scale it * IW = 1 + IROFFB JW = 1 + ICOFFB CALL PDLACPY( 'All', NROWS, NRHS, B, IB, JB, DESCB, WORK, IW, JW, $ DESCW ) CALL PDGEMM( TRANS, 'No transpose', NROWS, NRHS, NCOLS, -ONE, A, $ IA, JA, DESCA, X, IX, JX, DESCX, ONE, WORK, IW, JW, $ DESCW ) NORMRS = PDLANGE( 'Max', NROWS, NRHS, WORK, IW, JW, DESCW, $ RWORK ) IF( NORMRS.GT.SMLNUM ) THEN ISCL = 1 CALL PDLASCL( 'General', NORMRS, ONE, NROWS, NRHS, WORK, $ IW, JW, DESCW, INFO ) END IF * * compute R'*sub( A ) * IW2 = 1 + ICOFFB JW2 = 1 + IOFFA CALL PDGEMM( 'Transpose', TRANS, NRHS, NCOLS, NROWS, ONE, WORK, $ IW, JW, DESCW, A, IA, JA, DESCA, ZERO, $ WORK( NROWSP*NRHSQ+1 ), IW2, JW2, DESCW2 ) * * compute and properly scale error * ERR = PDLANGE( 'One-norm', NRHS, NCOLS, WORK( NROWSP*NRHSQ+1 ), $ IW2, JW2, DESCW2, RWORK ) IF( NORMA.NE.ZERO ) $ ERR = ERR / NORMA * IF( ISCL.EQ.1 ) $ ERR = ERR*NORMRS * IF( IRESID.EQ.1 ) THEN NORMB = PDLANGE( 'One-norm', NROWS, NRHS, B, IB, JB, DESCB, $ RWORK ) IF( NORMB.NE.ZERO ) $ ERR = ERR / NORMB ELSE NORMX = PDLANGE( 'One-norm', NCOLS, NRHS, X, IX, JX, DESCX, $ RWORK ) IF( NORMX.NE.ZERO ) $ ERR = ERR / NORMX END IF * PDQRT17 = ERR / ( PDLAMCH( ICTXT, 'Epsilon' ) * $ DBLE( MAX( M, N, NRHS ) ) ) * RETURN * * End of PDQRT17 * END scalapack-2.0.2/TESTING/LIN/pdtzrzrv.f000644 000766 000024 00000024426 10363532303 017532 0ustar00juliestaff000000 000000 SUBROUTINE PDTZRZRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDTZRZRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from T, Z * computed by PDTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= M >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors T and Z computed * by PDTZRZF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION, array, dimension LOCr(M_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PDTZRZF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IN, $ IPT, IPV, IPW, JJA, JM1, JV, L, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PDLACPY, $ PDLARZB, PDLARZT, PDLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.LT.M ) $ RETURN * L = N - M JM1 = JA + MIN( M+1, N ) - 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * CALL DESCSET( DESCV, DESCA( MB_ ), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * * Handle first block separately * IB = IN - IA + 1 JV = ICOFF + JM1 - JA + 1 * * Compute upper triangular matrix T * CALL PDLARZT( 'Backward', 'Rowwise', L, IB, A, IA, JM1, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'All', IB, L, A, IA, JM1, DESCA, WORK( IPV ), 1, $ JV, DESCV ) * * Save temporarily strict lower part of A(IA:IA+IB-1,JA:JA+IB-1) * CALL PDLACPY( 'Lower', IB-1, IB-1, A, IA+1, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) * * Zeroes the row panel of sub( A ) to get T(IA:IN,JA:JA+N-1) * CALL PDLASET( 'All', IB, L, ZERO, ZERO, A, IA, JM1, DESCA ) CALL PDLASET( 'Lower', IB-1, IB-1, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Apply block Householder transformation * CALL PDLARZB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ IN-IA+1, N, IB, L, WORK( IPV ), 1, JV, DESCV, $ WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * * Restore strict lower part of A( IA:IA+IB-1, JA:JA+N-1 ) * CALL PDLACPY( 'Lower', IB-1, IB-1, WORK( IPV ), 1, ICOFF+1, DESCV, $ A, IA+1, JA, DESCA ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * * Loop over the remaining row blocks * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Compute upper triangular matrix T * CALL PDLARZT( 'Backward', 'Rowwise', L, IB, A, I, JM1, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PDLACPY( 'All', IB, L, A, I, JM1, DESCA, WORK( IPV ), 1, $ JV, DESCV ) * * Save temporarily strict lower part of A(I:I+IB-1,J:J+IB-1 ) * CALL PDLACPY( 'Lower', IB-1, IB-1, A, I+1, JA+I-IA, DESCA, $ WORK( IPV ), 1, ICOFF+1+I-IA, DESCV ) * * Zeoes the row panel of sub( A ) to get T(IA:I-1,JA+I-IA:JA+N-1) * CALL PDLASET( 'All', IB, L, ZERO, ZERO, A, I, JM1, DESCA ) CALL PDLASET( 'Lower', IB-1, IB-1, ZERO, ZERO, A, I+1, JA+I-IA, $ DESCA ) * * Apply block Householder transformation * CALL PDLARZB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ I+IB-IA, N-I+IA, IB, L, WORK( IPV ), 1, JV, $ DESCV, WORK( IPT ), A, IA, JA+I-IA, DESCA, $ WORK( IPW ) ) * CALL PDLACPY( 'Lower', IB-1, IB-1, WORK( IPV ), 1, $ ICOFF+1+I-IA, DESCV, A, I+1, JA+I-IA, DESCA ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PDTZRZRV * END scalapack-2.0.2/TESTING/LIN/pmatgeninc.f000644 000766 000024 00000020031 10363532303 017736 0ustar00juliestaff000000 000000 * ===================================================================== * SUBROUTINE LADD * ===================================================================== * SUBROUTINE LADD( J, K, I ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. INTEGER I(2), J(2), K(2) * .. * * ===================================================================== * * .. Parameters .. INTEGER IPOW16, IPOW15 PARAMETER ( IPOW16=2**16, IPOW15=2**15 ) * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * I(1) = MOD( K(1)+J(1), IPOW16 ) I(2) = MOD( (K(1)+J(1)) / IPOW16+K(2)+J(2), IPOW15 ) * RETURN * * End of LADD * END * * ===================================================================== * SUBROUTINE LMUL * ===================================================================== * SUBROUTINE LMUL( K, J, I ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. INTEGER I(2), J(2), K(2) * .. * * ===================================================================== * * .. Parameters .. INTEGER IPOW15, IPOW16, IPOW30 PARAMETER ( IPOW15=2**15, IPOW16=2**16, IPOW30=2**30 ) * .. * .. Local Scalars .. INTEGER KT, LT * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * KT = K(1)*J(1) IF( KT.LT.0 ) KT = (KT+IPOW30) + IPOW30 I(1) = MOD(KT,IPOW16) LT = K(1)*J(2) + K(2)*J(1) IF( LT.LT.0 ) LT = (LT+IPOW30) + IPOW30 KT = KT/IPOW16 + LT IF( KT.LT.0 ) KT = (KT+IPOW30) + IPOW30 I(2) = MOD( KT, IPOW15 ) * RETURN * * End of LMUL * END * * ===================================================================== * SUBROUTINE XJUMPM * ===================================================================== * SUBROUTINE XJUMPM( JUMPM, MULT, IADD, IRANN, IRANM, IAM, ICM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER JUMPM * .. * .. Array Arguments .. INTEGER IADD(2), IAM(2), ICM(2), IRANM(2), IRANN(2) INTEGER MULT(2) * .. * * ===================================================================== * * .. Local Scalars .. INTEGER I * .. * .. Local Arrays .. INTEGER J(2) * .. * .. External Subroutines .. EXTERNAL LADD, LMUL * .. * .. Executable Statements .. * IF( JUMPM.GT.0 ) THEN DO 10 I = 1, 2 IAM(I) = MULT(I) ICM(I) = IADD(I) 10 CONTINUE DO 20 I = 1, JUMPM-1 CALL LMUL( IAM, MULT, J ) IAM(1) = J(1) IAM(2) = J(2) CALL LMUL( ICM, MULT, J ) CALL LADD( IADD, J, ICM ) 20 CONTINUE CALL LMUL( IRANN, IAM, J ) CALL LADD( J, ICM, IRANM ) ELSE IRANM(1) = IRANN(1) IRANM(2) = IRANN(2) END IF * RETURN * * End of XJUMPM * END * * ===================================================================== * SUBROUTINE SETRAN * ===================================================================== * SUBROUTINE SETRAN( IRAN, IA, IC ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. INTEGER IA(2), IC(2), IRAN(2) * .. * * ===================================================================== * * .. Local Scalars .. INTEGER I * .. * .. Local Arrays .. INTEGER IAS(2), ICS(2), IRAND(2) * .. * .. Common Blocks .. COMMON /RANCOM/ IRAND, IAS, ICS SAVE /RANCOM/ * .. * .. Executable Statements .. * DO 10 I = 1, 2 IRAND(I) = IRAN(I) IAS(I) = IA(I) ICS(I) = IC(I) 10 CONTINUE * RETURN * * End of SETRAN * END * * ===================================================================== * SUBROUTINE JUMPIT * ===================================================================== * SUBROUTINE JUMPIT( MULT, IADD, IRANN, IRANM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. INTEGER IADD(2), IRANM(2), IRANN(2), MULT(2) * .. * * ===================================================================== * * .. Local Arrays .. INTEGER IAS(2), ICS(2), IRAND(2), J(2) * .. * .. External Subroutines .. EXTERNAL LADD, LMUL * .. * .. Common Blocks .. COMMON /RANCOM/ IRAND, IAS, ICS SAVE /RANCOM/ * .. * .. Executable Statements .. * CALL LMUL( IRANN, MULT, J ) CALL LADD( J, IADD, IRANM ) * IRAND(1) = IRANM(1) IRAND(2) = IRANM(2) * RETURN * * End of JUMPIT * END * * ===================================================================== * REAL FUNCTION PSRAND * ===================================================================== * REAL FUNCTION PSRAND( IDUMM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * ===================================================================== * * .. Parameters .. REAL DIVFAC, POW16 PARAMETER ( DIVFAC=2.147483648E+9, POW16=6.5536E+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL LADD, LMUL * .. * .. Intrinsic Functions .. INTRINSIC REAL * .. * .. Common Blocks .. INTEGER IAS(2), ICS(2), IRAND(2) COMMON /RANCOM/ IRAND, IAS, ICS SAVE /RANCOM/ * .. * .. Executable Statements .. * PSRAND = ( REAL(IRAND(1)) + POW16 * REAL(IRAND(2)) ) / DIVFAC * CALL LMUL( IRAND, IAS, J ) CALL LADD( J, ICS, IRAND ) * RETURN * * End of PSRAND * END * * ===================================================================== * DOUBLE PRECISION FUNCTION PDRAND * ===================================================================== * DOUBLE PRECISION FUNCTION PDRAND( IDUMM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION DIVFAC, POW16 PARAMETER ( DIVFAC=2.147483648D+9, POW16=6.5536D+4 ) * .. * .. Local Arrays .. INTEGER J(2) * .. * .. External Subroutines .. EXTERNAL LADD, LMUL * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Common Blocks .. INTEGER IAS(2), ICS(2), IRAND(2) COMMON /RANCOM/ IRAND, IAS, ICS SAVE /RANCOM/ * .. * .. Executable Statements .. * PDRAND = ( DBLE(IRAND(1)) + POW16 * DBLE(IRAND(2)) ) / DIVFAC * CALL LMUL( IRAND, IAS, J ) CALL LADD( J, ICS, IRAND ) * RETURN * * End of PDRAND * END scalapack-2.0.2/TESTING/LIN/psbmatgen.f000644 000766 000024 00000016652 10363532303 017607 0ustar00juliestaff000000 000000 SUBROUTINE PSBMATGEN( ICTXT, AFORM, AFORM2, BWL, BWU, N, $ MB, NB, A, $ LDA, IAROW, IACOL, ISEED, $ MYROW, MYCOL, NPROW, NPCOL ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. * .. Scalar Arguments .. CHARACTER*1 AFORM, AFORM2 INTEGER IACOL, IAROW, ICTXT, $ ISEED, LDA, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW, BWL, BWU * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * PSBMATGEN : Parallel Real Single precision Band MATrix GENerator. * (Re)Generate a distributed Band matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'L' : A is returned as a symmetric lower * triangular matrix, and is diagonally dominant. * if AFORM = 'U' : A is returned as a symmetric upper * triangular matrix, and is diagonally dominant. * if AFORM = 'G' : A is returned as a general matrix. * if AFORM = 'T' : A is returned as a general matrix in * tridiagonal-compatible form. * * AFORM2 (global input) CHARACTER*1 * if the matrix is general: * if AFORM2 = 'D' : A is returned diagonally dominant. * if AFORM2 != 'D' : A is not returned diagonally dominant. * if the matrix is symmetric or hermitian: * if AFORM2 = 'T' : A is returned in tridiagonally-compatible * form (a transpose form). * if AFORM2 != 'T' : A is returned in banded-compatible form. * * M (global input) INTEGER * The number of nonzero rows in the generated distributed * band matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) REAL, pointer into the local memory * to an array of dimension ( LDA, * ) containing the local * pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * A( DIAG_INDEX, I ) = A( DIAG_INDEX, I ) + BWL+BWU * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * This code is a simple wrapper around PSMATGEN, for band matrices. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER DIAG_INDEX, I, J, M_MATGEN, NQ, N_MATGEN, $ START_INDEX * .. * .. External Subroutines .. EXTERNAL PSMATGEN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC, LSAME * .. * .. Executable Statements .. * * IF( LSAME( AFORM, 'L' ).OR.LSAME( AFORM, 'U' ) ) THEN M_MATGEN = BWL + 1 N_MATGEN = N START_INDEX = 1 IF( LSAME( AFORM, 'L' ) ) THEN DIAG_INDEX = 1 ELSE DIAG_INDEX = BWL + 1 ENDIF ELSE M_MATGEN = BWL + BWU + 1 N_MATGEN = N DIAG_INDEX = BWU + 1 START_INDEX = 1 ENDIF * NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * * * Generate a random matrix initially * IF( LSAME( AFORM, 'T' ) .OR. $ ( LSAME( AFORM2, 'T' ) ) ) THEN * CALL PSMATGEN( ICTXT, 'T', 'N', $ N_MATGEN, M_MATGEN, $ NB, M_MATGEN, A( START_INDEX, 1 ), $ LDA, IAROW, IACOL, $ ISEED, 0, NQ, 0, M_MATGEN, $ MYCOL, MYROW, NPCOL, NPROW ) * ELSE * CALL PSMATGEN( ICTXT, 'N', 'N', $ M_MATGEN, N_MATGEN, $ M_MATGEN, NB, A( START_INDEX, 1 ), $ LDA, IAROW, IACOL, $ ISEED, 0, M_MATGEN, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Zero out padding at tops of columns * DO 1000 J=1,NB * c write(*,*) 'LDA-M_MATGEN =',LDA-M_MATGEN DO 2000 I=1, LDA-M_MATGEN * * Indexing goes negative; BMATGEN assumes that space * has been preallocated above the first column as it * has to be if the matrix is to be input to * Scalapack's band solvers. * c write(*,*) 'I LDA M_MATGEN and J =',I,LDA,M_MATGEN, J A( I-LDA+M_MATGEN, J ) = ZERO * 2000 CONTINUE * 1000 CONTINUE * ENDIF * IF( LSAME( AFORM2, 'D' ).OR. $ ( LSAME( AFORM, 'L' ).OR.LSAME( AFORM, 'U' ) ) ) THEN * * Loop over diagonal elements stored on this processor. * * DO 330 I=1, NQ IF( LSAME( AFORM, 'T' ) .OR. $ ( LSAME( AFORM2, 'T' ) ) ) THEN IF( NPROW .EQ. 1 ) THEN A( I, DIAG_INDEX ) = A( I, DIAG_INDEX ) $ + REAL( BWL+BWU+1 ) ENDIF ELSE IF( NPROW .EQ. 1 ) THEN A( DIAG_INDEX, I ) = A( DIAG_INDEX, I ) $ + REAL( BWL+BWU+1 ) ENDIF END IF 330 CONTINUE * * ELSE * * Must add elements to keep condition of matrix in check * DO 380 I=1, NQ * IF( NPROW .EQ. 1 ) THEN * IF( MOD(I+MYCOL*NB,2) .EQ. 1 ) THEN A( DIAG_INDEX+1, I ) = A( DIAG_INDEX+1, I ) $ + REAL( BWL+BWU+1 ) * ELSE * A( DIAG_INDEX-1, I ) = A( DIAG_INDEX-1, I ) $ + REAL( BWL+BWU+1 ) ENDIF * ENDIF * 380 CONTINUE * END IF * RETURN * * End of PSBMATGEN * END scalapack-2.0.2/TESTING/LIN/psdbdriver.f000644 000766 000024 00000075645 10363532303 020002 0ustar00juliestaff000000 000000 PROGRAM PSDBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PSDBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by SDB. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER MEMSIZ, NTESTS, REALSZ REAL PADVAL PARAMETER ( REALSZ = 4, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0, ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSBMATGEN, $ PSCHEKPAD, PSDBINFO, PSDBLASCHK, PSDBTRF, $ PSDBTRS, PSFILLPAD, PSMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PSLANGE EXTERNAL LSAME, NUMROC, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PSDBINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = BWLVAL( BW_NUM ) IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = BWUVAL( BW_NUM ) IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*MAX(BWL,BWU)-1)/NPCOL + 1 ) $ + MAX(BWL,BWU) NB = MAX( NB, 2*MAX(BWL,BWU) ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*MAX(BWL,BWU), N ) ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (BWL+BWU+1), (BWL+BWU+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((BWL+BWU+1)+10) IMIDPAD = 10 IPOSTPAD = ((BWL+BWU+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (BWL+BWU+1), N, $ (BWL+BWU+1), NB, 0, 0, $ ICTXT,((BWL+BWU+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((BWL+BWU+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = MAX(BWL,BWU)*MAX(BWL,BWU) * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((BWL+BWU+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PSLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PSDBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PSBMATGEN( ICTXT, 'G', 'D', BWL, BWU, N, $ (BWL+BWU+1), NB, MEM( IPA ), $ ((BWL+BWU+1)+10), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) * CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((BWL+BWU+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PSLANGE( '1', (BWL+BWU+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((BWL+BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PSDBTRF( N, BWL, BWU, MEM( IPA ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PSDBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PSCHEKPAD( ICTXT, 'PSDBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((BWL+BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (MAX(BWL,BWU)*NRHS) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PSMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PSFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PSDBTRS( TRANS, N, BWL, BWU, NRHS, MEM( IPA ), $ 1, DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PSDBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSDBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PSDBLASCHK( 'N', 'D', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl bwu + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE(BWU)) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+bwu) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE(BWU)) $ *DBLE(NRHS)) * * Second calc to represent actual hardware speed * * 2*N_FIRST bwl*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE(BWL)*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST bwl*bwu * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE(BWL) $ *DBLE(BWU)) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB bwl*bwu flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE(BWL) $ *DBLE(BWU)) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( BWL*BWU*BWL/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*BWL*BWU*BWL) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+bwu) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE(BWU)) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * (bwl+bwu) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE(BWL)+ $ DBLE(BWU)))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * (bwl+bwu) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE(BWL)+ $ DBLE(BWU)))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*(BWL*BWU ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * BWL*BWU ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PSDBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/psdbinfo.f000644 000766 000024 00000044335 10363532303 017432 0ustar00juliestaff000000 000000 SUBROUTINE PSDBINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PSDBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'T' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'T' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSDBINFO * END scalapack-2.0.2/TESTING/LIN/psdblaschk.f000644 000766 000024 00000026053 10363532303 017741 0ustar00juliestaff000000 000000 SUBROUTINE PSDBLASCHK( SYMM, UPLO, TRANS, N, BWL, BWU, NRHS, X, $ IX, JX, DESCX, IASEED, A, IA, JA, DESCA, $ IBSEED, ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, TRANS, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) REAL A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PSDBLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'S', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * TRANS if TRANS= 'T', A 'Transpose' is used as the * coefficient matrix in the solve. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) REAL pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) REAL array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,max(max(bwl,bwu)*(max(bwl,bwu)+2),NB))+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,max(max(bwl,bwu)*(max(bwl,bwu)+2),NB))+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER BW, INFO, IPPRODUCT, WORK_MIN REAL DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PBSTRAN, PSMATGEN, $ SGAMX2D, SGEBR2D, SGEBS2D, SGEMM, $ SGERV2D, SGESD2D, SGSUM2D, SLASET * .. * .. External Functions .. INTEGER ISAMAX, NUMROC REAL PSLAMCH EXTERNAL ISAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'S' ) ) THEN BW = BWL WORK_MIN = MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ELSE BW = MAX(BWL, BWU) WORK_MIN = MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PSBLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (BWL+BWU+1), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'S' )) THEN CALL PSBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PSBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PSGBDCMV( BWL+BWU+1, BWL, BWU, TRANS, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), $ (MAX(BWL,BWU)+2)*MAX(BWL,BWU), INFO ) * * * Regenerate column of B * CALL PSMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PSAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PSNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PSNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PSBLASCHK * END scalapack-2.0.2/TESTING/LIN/psdbmv1.f000644 000766 000024 00000075753 10607174520 017216 0ustar00juliestaff000000 000000 SUBROUTINE PSGBDCMV( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS, $ B, IB, DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSDBTRF and this is stored in AF. If a linear system * is to be solved using PSDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M, $ DU_N_N, DU_P_M, DU_P_N, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM2, IDUM3, J, JA_NEW, $ LLDA, LLDB, MAX_BW, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PSDBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PSDBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSDBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = ZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (MAX_BW+2)*MAX_BW WORK( I ) = ZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( TRANS, 'N' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * * Use main partition in each processor to multiply locally * CALL SGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, ONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, ZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL SCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWL-DL_N_N ), 1 ) * CALL STRMV( 'U', 'N', 'N', BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL SGESD2D( ICTXT, BWL, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWL, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = ZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL SCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * CALL STRMV( $ 'L', $ 'N', $ 'N', BWU, $ A( 1 ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 30 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL SGESD2D( ICTXT, BWU, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL SGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL SAXPY( BWL, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL SGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL SAXPY( BWU, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWU+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( TRANS, 'T' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL SLATCPY( 'L', BWU, BWU, A( OFST+1 ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to left * CALL STRSD2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL-1 ) * ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL SLATCPY( 'U', BWL, BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to right * CALL STRSD2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL SGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, ONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, ZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL SCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWU-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL STRRV2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), MAX_BW, MYROW, MYCOL+1 ) * CALL STRMV( 'U', 'N', 'N', BWU, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL SGESD2D( ICTXT, BWU, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWU, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = ZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL SCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL STRRV2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), MAX_BW, MYROW, MYCOL-1 ) * CALL STRMV( $ 'L', $ 'N', $ 'N', BWL, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 60 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL SGESD2D( ICTXT, BWL, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL SGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL SAXPY( BWU, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL SGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL SAXPY( BWL, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWL+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PSBsBMV1 * END scalapack-2.0.2/TESTING/LIN/psdtdriver.f000644 000766 000024 00000076151 10363532303 020015 0ustar00juliestaff000000 000000 PROGRAM PSDTDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PSDTDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by SDT. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER MEMSIZ, NTESTS, REALSZ REAL PADVAL PARAMETER ( REALSZ = 4, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0, ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSBMATGEN, $ PSCHEKPAD, PSDTINFO, PSDTLASCHK, PSDTTRF, $ PSDTTRS, PSFILLPAD, PSMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PSLANGE EXTERNAL LSAME, NUMROC, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PSDTINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = 1 IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = 1 IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*INT_ONE-1)/NPCOL + 1 ) $ + INT_ONE NB = MAX( NB, 2*INT_ONE ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*INT_ONE, N ) ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (3), (3), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((3)+10) IMIDPAD = 10 IPOSTPAD = ((3)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, N, (3), $ NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((3)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + (NB+10)*(3) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (12*NPCOL+3*NB) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 8*NPCOL * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((3)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PSLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PSDTLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,NB)+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PSBMATGEN( ICTXT, 'T', 'D', BWL, BWU, N, (3), NB, $ MEM( IPA ), NB+10, 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) CALL PSFILLPAD( ICTXT, NQ, NP, MEM( IPA-IPREPAD ), $ NB+10, IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PSLANGE( 'I', N, $ (3), MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', NQ, NP, $ MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PSDTTRF( N, MEM( IPA+2*( NB+10 ) ), $ MEM( IPA+1*( NB+10 ) ), MEM( IPA ), 1, $ DESCA, MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW ), IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PSDTTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PSCHEKPAD( ICTXT, 'PSDTTRF', NQ, $ NP, MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = 10*NPCOL+4*NRHS * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PSMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PSFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PSDTTRS( TRANS, N, NRHS, $ MEM( IPA+2*( NB+10 ) ), $ MEM( IPA+1*( NB+10 ) ), MEM( IPA ), $ 1, DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PSDTTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSDTTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * * Reset descriptor describing A to 1-by-P grid for * use in banded utility routines * CALL DESCINIT( DESCA2D, (3), N, $ (3), NB, 0, 0, $ ICTXT, (3), IERR( 1 ) ) CALL PSDTLASCHK( 'N', 'D', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl INT_ONE + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE(INT_ONE)) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+INT_ONE) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE(INT_ONE)) $ *DBLE(NRHS)) * * Second calc to represent actual hardware speed * * 2*N_FIRST bwl*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE(BWL)*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST bwl*INT_ONE * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE(BWL) $ *DBLE(INT_ONE)) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB bwl*INT_ONE flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE(BWL) $ *DBLE(INT_ONE)) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( BWL*INT_ONE*BWL/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*BWL*INT_ONE*BWL) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+INT_ONE) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE(INT_ONE)) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * (bwl+INT_ONE) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE(BWL)+ $ DBLE(INT_ONE)))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * (bwl+INT_ONE) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE(BWL)+ $ DBLE(INT_ONE)))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*(BWL*INT_ONE ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * BWL*INT_ONE ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PSDTTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/psdtinfo.f000644 000766 000024 00000044356 10363532303 017457 0ustar00juliestaff000000 000000 SUBROUTINE PSDTINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PSDTINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW NBW = 1 IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'T' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'T' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSDTINFO * END scalapack-2.0.2/TESTING/LIN/psdtlaschk.f000644 000766 000024 00000030563 10363532303 017764 0ustar00juliestaff000000 000000 SUBROUTINE PSDTLASCHK( SYMM, UPLO, TRANS, N, BWL, BWU, NRHS, X, $ IX, JX, DESCX, IASEED, A, IA, JA, DESCA, $ IBSEED, ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, TRANS, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) REAL A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PSDTLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'S', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * TRANS if TRANS= 'T', A 'Transpose' is used as the * coefficient matrix in the solve. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) REAL pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) REAL array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,NB)+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,NB)+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER I, START INTEGER BW, INFO, IPPRODUCT, WORK_MIN REAL DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PBSTRAN, PSMATGEN, $ SGAMX2D, SGEBR2D, SGEBS2D, SGEMM, $ SGERV2D, SGESD2D, SGSUM2D, SLASET * .. * .. External Functions .. INTEGER ISAMAX, NUMROC REAL PSLAMCH EXTERNAL ISAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'S' ) ) THEN BW = BWL START = 1 WORK_MIN = MAX(5,NB)+2*NB ELSE BW = MAX(BWL, BWU) IF( LSAME( UPLO, 'D' )) THEN START = 1 ELSE START = 2 ENDIF WORK_MIN = MAX(5,NB)+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PSTLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (3), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'S' )) THEN CALL PSBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PSBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Matrix formed above has the diagonals shifted from what was * input to the tridiagonal routine. Shift them back. * * Send elements to neighboring processors * IF( MYCOL.GT.0 ) THEN CALL SGESD2D( ICTXT, 1, 1, A( START+2), LDA, $ MYROW, MYCOL-1 ) ENDIF * IF( MYCOL.LT.NPCOL-1 ) THEN CALL SGESD2D( ICTXT, 1, 1, $ A( START+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * * Shift local elements * DO 220 I=0,DESCA( NB_ )-1 A( START+2+(I)*LDA ) = A( START+2+(I+1)*LDA ) 220 CONTINUE * DO 230 I=DESCA( NB_ )-1,0,-1 A( START+(I+1)*LDA ) = A( START+(I)*LDA ) 230 CONTINUE * * Receive elements from neighboring processors * IF( MYCOL.LT.NPCOL-1 ) THEN CALL SGERV2D( ICTXT, 1, 1, $ A( START+2+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * IF( MYCOL.GT.0 ) THEN CALL SGERV2D( ICTXT, 1, 1, A( START), LDA, $ MYROW, MYCOL-1 ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PSGBDCMV( BWL+BWU+1, BWL, BWU, TRANS, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), $ (INT_ONE+2)*INT_ONE, INFO ) * * * Regenerate column of B * CALL PSMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PSAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PSNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PSNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PSTLASCHK * END scalapack-2.0.2/TESTING/LIN/psgbdriver.f000644 000766 000024 00000077440 10363532303 020000 0ustar00juliestaff000000 000000 PROGRAM PSGBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PSGBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by SGB. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTMEM INTEGER, default = 2048. * INTMEM is the size of the integer workspace used in this * driver as input as the IPIV vector. It represents an * upper bound on NB, the blocksize of the data * distribution. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER INTMEM PARAMETER ( INTMEM = 2048 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER MEMSIZ, NTESTS, REALSZ REAL PADVAL PARAMETER ( REALSZ = 4, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0, ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER IPIV(INTMEM) INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSBMATGEN, $ PSCHEKPAD, PSDBLASCHK, PSFILLPAD, PSGBINFO, $ PSGBTRF, PSGBTRS, PSMATGEN, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PSLANGE EXTERNAL LSAME, NUMROC, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PSGBINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = BWLVAL( BW_NUM ) IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = BWUVAL( BW_NUM ) IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*(BWL+BWU)-1)/NPCOL + 1 ) $ + (BWL+BWU) NB = MAX( NB, 2*(BWL+BWU) ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 * IF( NB.GT.INTMEM ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) THEN WRITE( NOUT,* )'You have chosen an ' $ ,'NB > INTMEM in the driver.' WRITE(NOUT, *)'Please edit the driver ' $ ,'and increase the value of INTMEM' ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (2*BWL+2*BWU+1), (2*BWL+2*BWU+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((2*BWL+2*BWU+1)+10) IMIDPAD = 10 IPOSTPAD = ((2*BWL+2*BWU+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (2*BWL+2*BWU+1), N, $ (2*BWL+2*BWU+1), NB, 0, 0, $ ICTXT,((2*BWL+2*BWU+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((2*BWL+2*BWU+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 1 * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((2*BWL+2*BWU+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PSLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PSDBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PSBMATGEN( ICTXT, 'G', 'N', BWL, BWU, N, $ (2*BWL+2*BWU+1), NB, MEM( IPA+BWL+BWU ), $ ((2*BWL+2*BWU+1)+10), 0, 0, IASEED, $ MYROW, MYCOL, NPROW, NPCOL ) * CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((2*BWL+2*BWU+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PSLANGE( '1', (2*BWL+2*BWU+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((2*BWL+2*BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PSGBTRF( N, BWL, BWU, MEM( IPA ), 1, DESCA, IPIV, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PSGBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PSCHEKPAD( ICTXT, 'PSGBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((2*BWL+2*BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = NRHS*(NB+2*BWL+4*BWU) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PSMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PSFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PSGBTRS( TRANS, N, BWL, BWU, NRHS, MEM( IPA ), $ 1, DESCA, IPIV, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PSGBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PSDBLASCHK( 'N', 'N', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA+BWL+BWU ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl (bwl+bwu) + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE((BWL+BWU))) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+(bwl+bwu)) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE((BWL+BWU))) $ *DBLE(NRHS)) * * Second calc to represent actual hardware speed * * 2*N_FIRST (bwl+bwu)*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE((BWL+BWU))*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST (bwl+bwu)*(bwl+bwu) * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE((BWL+BWU)) $ *DBLE((BWL+BWU))) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB (bwl+bwu)*(bwl+bwu) flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE((BWL+BWU)) $ *DBLE((BWL+BWU))) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( (BWL+BWU)*(BWL+BWU)*(BWL+BWU)/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*(BWL+BWU)*(BWL+BWU)*(BWL+BWU)) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+(bwl+bwu)) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE((BWL+BWU))) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * ((bwl+bwu)+(bwl+bwu)) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE((BWL+BWU))+ $ DBLE((BWL+BWU))))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * ((bwl+bwu)+(bwl+bwu)) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE((BWL+BWU))+ $ DBLE((BWL+BWU))))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*((BWL+BWU)*(BWL+BWU) ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * (BWL+BWU)*(BWL+BWU) ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PSGBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/psgbinfo.f000644 000766 000024 00000044335 10363532303 017435 0ustar00juliestaff000000 000000 SUBROUTINE PSGBINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PSGBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'T' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'T' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSGBINFO * END scalapack-2.0.2/TESTING/LIN/psgbmv1.f000644 000766 000024 00000075753 10607174520 017221 0ustar00juliestaff000000 000000 SUBROUTINE PSGBDCMV( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS, $ B, IB, DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSDBTRF and this is stored in AF. If a linear system * is to be solved using PSDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M, $ DU_N_N, DU_P_M, DU_P_N, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM2, IDUM3, J, JA_NEW, $ LLDA, LLDB, MAX_BW, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PSDBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PSDBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSDBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = ZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (MAX_BW+2)*MAX_BW WORK( I ) = ZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( TRANS, 'N' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * * Use main partition in each processor to multiply locally * CALL SGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, ONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, ZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL SCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWL-DL_N_N ), 1 ) * CALL STRMV( 'U', 'N', 'N', BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL SGESD2D( ICTXT, BWL, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWL, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = ZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL SCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * CALL STRMV( $ 'L', $ 'N', $ 'N', BWU, $ A( 1 ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 30 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL SGESD2D( ICTXT, BWU, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL SGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL SAXPY( BWL, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL SGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL SAXPY( BWU, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWU+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( TRANS, 'T' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL SLATCPY( 'L', BWU, BWU, A( OFST+1 ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to left * CALL STRSD2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL-1 ) * ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL SLATCPY( 'U', BWL, BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to right * CALL STRSD2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL SGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, ONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, ZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL SCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWU-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL STRRV2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), MAX_BW, MYROW, MYCOL+1 ) * CALL STRMV( 'U', 'N', 'N', BWU, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL SGESD2D( ICTXT, BWU, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWU, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = ZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL SCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL STRRV2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), MAX_BW, MYROW, MYCOL-1 ) * CALL STRMV( $ 'L', $ 'N', $ 'N', BWL, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 60 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL SGESD2D( ICTXT, BWL, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL SGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL SAXPY( BWU, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL SGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL SAXPY( BWL, ONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWL+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PSBsBMV1 * END scalapack-2.0.2/TESTING/LIN/psgelqrv.f000644 000766 000024 00000022731 10363532303 017465 0ustar00juliestaff000000 000000 SUBROUTINE PSGELQRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGELQRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from L, Q * computed by PSGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors L and Q computed * by PSGELQF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors computed by PSGELQF. TAU * is tied to the distributed matrix A. * * WORK (local workspace) REAL array, dimension * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IL, IN, $ IPT, IPV, IPW, J, JJA, JV, K, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSLACPY, $ PSLARFB, PSLARFT, PSLASET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IL = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) * ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IL, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * CALL DESCSET( DESCV, DESCA( MB_ ), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * DO 10 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+K-I, DESCA( MB_ ) ) J = JA + I - IA JV = 1 + I - IA + ICOFF * * Compute upper triangular matrix T * CALL PSLARFT( 'Forward', 'Rowwise', N-J+JA, IB, A, I, J, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'Upper', IB, N-J+JA, A, I, J, DESCA, WORK( IPV ), $ 1, JV, DESCV ) CALL PSLASET( 'Lower', IB, N-J+JA, ZERO, ONE, WORK( IPV ), 1, $ JV, DESCV ) * * Zeroes the strict upper triangular part of sub( A ) to get * block column of L * CALL PSLASET( 'Upper', IB, N-J+JA-1, ZERO, ZERO, A, I, J+1, $ DESCA ) * * Apply block Householder transformation * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', $ M-I+IA, N-J+JA, IB, WORK( IPV ), 1, JV, DESCV, $ WORK( IPT ), A, I, J, DESCA, WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) * 10 CONTINUE * * Handle first block separately * IB = IN - IA + 1 * * Compute upper triangular matrix T * CALL PSLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, TAU, $ WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'Upper', IB, N, A, IA, JA, DESCA, WORK( IPV ), 1, $ ICOFF+1, DESCV ) CALL PSLASET( 'Lower', IB, N, ZERO, ONE, WORK, 1, ICOFF+1, DESCV ) * * Zeroes the strict upper triangular part of sub( A ) to get * block column of L * CALL PSLASET( 'Upper', IB, N-1, ZERO, ZERO, A, IA, JA+1, DESCA ) * * Apply block Householder transformation * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', M, N, $ IB, WORK( IPV ), 1, ICOFF+1, DESCV, WORK( IPT ), A, $ IA, JA, DESCA, WORK( IPW ) ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PSGELQRV * END scalapack-2.0.2/TESTING/LIN/psgeqlrv.f000644 000766 000024 00000023774 10363532303 017475 0ustar00juliestaff000000 000000 SUBROUTINE PSGEQLRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEQLRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from L, Q * computed by PSGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors L and Q computed * by PSGEQLF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(N_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PSGEQLF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK = NB_A * ( 2*Mp0 + Nq0 + NB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IIA, IPT, IPV, IPW, IROFF, $ IV, J, JB, JJA, JN, K, MP, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSLACPY, $ PSLARFB, PSLARFT, PSLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) CALL INFOG2L( IA, JA+N-K, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPV = 1 IPT = IPV + MP * DESCA( NB_ ) IPW = IPT + DESCA( NB_ ) * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * CALL DESCSET( DESCV, M+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, MP ) ) * * Handle first block separately * IV = 1 + M - K + IROFF JB = JN - JA - N + K + 1 * * Compute upper triangular matrix T * CALL PSLARFT( 'Backward', 'Columnwise', M-N+JN-JA+1, JB, A, IA, $ JA+N-K, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'All', M-N+JN-JA+1, JB, A, IA, JA+N-K, DESCA, $ WORK( IPV ), IROFF+1, 1, DESCV ) CALL PSLASET( 'Lower', JB, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeoes the strict upper triangular part of A to get block * row of L * CALL PSLASET( 'All', M-K, JB, ZERO, ZERO, A, IA, JA+N-K, $ DESCA ) CALL PSLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA+M-K, $ JA+N-K+1, DESCA ) * * Apply block Householder transformation * CALL PSLARFB( 'Left', 'No transpose', 'Backward', 'Columnwise', $ M-N+JN-JA+1, JN-JA+1, JB, WORK( IPV ), IROFF+1, 1, $ DESCV, WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) * * Loop over the remaining column blocks * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) IV = 1 + M - N + J - JA + IROFF * * Compute upper triangular matrix T * CALL PSLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, A, IA, $ J, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'All', M-N+J+JB-JA, JB, A, IA, J, DESCA, $ WORK( IPV ), IROFF+1, 1, DESCV ) CALL PSLASET( 'Lower', JB, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeoes the strict upper triangular part of sub( A ) to get * block row of L * CALL PSLASET( 'All', M-N+J-JA, JB, ZERO, ZERO, A, IA, J, $ DESCA ) CALL PSLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA+M-N+J-JA, $ J+1, DESCA ) * * Apply block Householder transformation * CALL PSLARFB( 'Left', 'No transpose', 'Backward', 'Columnwise', $ M-N+J+JB-JA, J+JB-JA, JB, WORK( IPV ), IROFF+1, $ 1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PSGEQLRV * END scalapack-2.0.2/TESTING/LIN/psgeqrrv.f000644 000766 000024 00000023375 10363532303 017500 0ustar00juliestaff000000 000000 SUBROUTINE PSGEQRRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEQRRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from Q, R * computed by PSGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors Q and R computed * by PSGEQRF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors computed by PSGEQRF. TAU * is tied to the distributed matrix A. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK = NB_A * ( 2*Mp0 + Nq0 + NB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, IIA, IPT, IPV, IPW, $ IROFF, IV, J, JB, JJA, JL, JN, K, MP, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSLACPY, $ PSLARFB, PSLARFT, PSLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPV = 1 IPT = IPV + MP * DESCA( NB_ ) IPW = IPT + DESCA( NB_ ) * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * K = MIN( M, N ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JL = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) * CALL DESCSET( DESCV, M+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, INDXG2P( JL, DESCA( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ), ICTXT, $ MAX( 1, MP ) ) * DO 10 J = JL, JN+1, -DESCA( NB_ ) JB = MIN( JA+K-J, DESCA( NB_ ) ) I = IA + J - JA IV = 1 + J - JA + IROFF * * Compute upper triangular matrix T * CALL PSLARFT( 'Forward', 'Columnwise', M-I+IA, JB, A, I, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'Lower', M-I+IA, JB, A, I, J, DESCA, WORK( IPV ), $ IV, 1, DESCV ) CALL PSLASET( 'Upper', M-I+IA, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeroes the strict lower triangular part of sub( A ) to get * block column of R * CALL PSLASET( 'Lower', M-I+IA-1, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Apply block Householder transformation * CALL PSLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M-I+IA, N-J+JA, JB, WORK( IPV ), IV, 1, DESCV, $ WORK( IPT ), A, I, J, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle first block separately * JB = JN - JA + 1 * * Compute upper triangular matrix T * CALL PSLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPV ), $ IROFF+1, 1, DESCV ) CALL PSLASET( 'Upper', M, JB, ZERO, ONE, WORK, IROFF+1, 1, DESCV ) * * Zeroes the strict lower triangular part of sub( A ) to get block * column of R * CALL PSLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Apply block Householder transformation * CALL PSLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', M, $ N, JB, WORK( IPV ), IROFF+1, 1, DESCV, WORK( IPT ), $ A, IA, JA, DESCA, WORK( IPW ) ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PSGEQRRV * END scalapack-2.0.2/TESTING/LIN/psgerqrv.f000644 000766 000024 00000023607 10363532303 017476 0ustar00juliestaff000000 000000 SUBROUTINE PSGERQRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGERQRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from R, Q * computed by PSGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors R and Q computed * by PSGERQF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCr(M_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PSGERQF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IN, $ IPT, IPV, IPW, JJA, JV, K, MYCOL, MYROW, NPCOL, $ NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSLACPY, $ PSLARFB, PSLARFT, PSLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) * ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA+M-K, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * CALL DESCSET( DESCV, DESCA( MB_), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * * Handle first block separately * IB = IN - IA - M + K + 1 JV = 1 + N - K + ICOFF * * Compute upper triangular matrix T * CALL PSLARFT( 'Backward', 'Rowwise', N-M+IN-IA+1, IB, A, IA+M-K, $ JA, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'All', IB, N-M+IN-IA+1, A, IA+M-K, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) CALL PSLASET( 'Upper', IB, IB, ZERO, ONE, WORK( IPV ), 1, JV, $ DESCV ) * * Zeoes the strict lower triangular part of sub( A ) to get block * column of R * CALL PSLASET( 'All', IB, N-K, ZERO, ZERO, A, IA+M-K, JA, $ DESCA ) CALL PSLASET( 'Lower', IB-1, IB, ZERO, ZERO, A, IA+M-K+1, $ JA+N-K, DESCA ) * * Apply block Householder transformation * CALL PSLARFB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ IN-IA+1, N-M+IN-IA+1, IB, WORK( IPV ), 1, ICOFF+1, $ DESCV, WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * * Loop over the remaining row blocks * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) JV = 1 + N - M + I - IA + ICOFF * * Compute upper triangular matrix T * CALL PSLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, I, JA, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'All', IB, N-M+I+IB-IA, A, I, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) CALL PSLASET( 'Upper', IB, IB, ZERO, ONE, WORK( IPV ), 1, JV, $ DESCV ) * * Zeoes the strict Lower triangular part of sub( A ) to get * block column of R * CALL PSLASET( 'All', IB, N-M+I-IA, ZERO, ZERO, A, I, JA, $ DESCA ) CALL PSLASET( 'Lower', IB-1, IB, ZERO, ZERO, A, I+1, $ JA+N-M+I-IA, DESCA ) * * Apply block Householder transformation * CALL PSLARFB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ I+IB-IA, N-M+I+IB-IA, IB, WORK( IPV ), 1, $ ICOFF+1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PSGERQRV * END scalapack-2.0.2/TESTING/LIN/psgetrrv.f000644 000766 000024 00000030066 10363532303 017476 0ustar00juliestaff000000 000000 SUBROUTINE PSGETRRV( M, N, A, IA, JA, DESCA, IPIV, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSGETRRV reforms sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from the * triangular matrices L and U returned by PSGETRF. It multiplies * an upper triangular matrix stored in the upper triangle of sub( A ) * times the unit lower triangular matrix stored in the lower triangle. * To accomplish this, the routine basically performs the PSGETRF * routine in reverse. * * It computes L*U first, and then apply P: P*L*U => sub( A ). In the * J-th loop, the block column (or column panel), which has the lower * triangular unit matrix L is multiplied with the block row (or row * panel), which contains the upper triangular matrix U. * * ( L1 ) ( 0 0 ) ( L1*U1 L1*U2 ) * A` = L * U + A` = ( ) * (U1 U2) + ( ) = ( ) * ( L2 ) ( 0 A`) ( L2*U1 L2*U2+A` ) * * where L1 is a lower unit triangular matrix and U1 is an upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the distributed matrix sub( A ) * contains the the factors L and U from the factorization * sub( A ) = P*L*U; the unit diagonal elements of L are not * stored. On exit, the original distributed matrix sub( A ) * is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * WORK (local workspace) REAL array of dimension (LWORK) * LWORK >= MpA0 * NB_A + NqA0 * MB_A, where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * WORK is used to store a block of columns of L, and a block of * rows of U. INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IL, IPL, IPU, IROFF, J, $ JB, JL, JN, MN, MP, MYCOL, MYROW, NPCOL, NPROW * .. Local Arrays .. INTEGER DESCIP( DLEN_ ), DESCL( DLEN_ ), $ DESCU( DLEN_ ), IDUM( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PSGEMM, PSLACPY, $ PSLAPIV, PSLASET, PB_TOPGET, PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPL = 1 IPU = IPL + MP * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Define array descriptors for L and U * MN = MIN( M, N ) IL = MAX( ( ( IA+MN-2 ) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) JL = MAX( ( ( JA+MN-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+MN-1 ) IAROW = INDXG2P( IL, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) * CALL DESCSET( DESCL, IA+M-IL, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, MP ) ) * CALL DESCSET( DESCU, DESCA( MB_ ), JA+N-JL, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * CALL DESCSET( DESCIP, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ NUMROC( DESCA( M_ ), DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) + DESCA( MB_ ) ) * * DO 10 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+MN-J, DESCA( NB_ ) ) * * Copy unit lower triangular part of sub( A ) into WORK * CALL PSLACPY( 'Lower', M-IL+IA, JB, A, IL, J, DESCA, $ WORK( IPL ), 1, 1, DESCL ) CALL PSLASET( 'Upper', M-IL+IA, JB, ZERO, ONE, WORK( IPL ), $ 1, 1, DESCL ) * * Copy upper triangular part of sub( A ) into WORK(IPU) * CALL PSLACPY( 'Upper', JB, JA+N-J, A, IL, J, DESCA, $ WORK( IPU ), 1, 1, DESCU ) CALL PSLASET( 'Lower', JB-1, JA+N-J, ZERO, ZERO, $ WORK( IPU ), 2, 1, DESCU ) * * Zero the strict lower triangular piece of the current block. * CALL PSLASET( 'Lower', IA+M-IL-1, JB, ZERO, ZERO, A, IL+1, J, $ DESCA ) * * Zero the upper triangular piece of the current block. * CALL PSLASET( 'Upper', JB, JA+N-J, ZERO, ZERO, A, IL, J, $ DESCA ) * * Update the matrix sub( A ). * CALL PSGEMM( 'No transpose', 'No transpose', IA+M-IL, $ JA+N-J, JB, ONE, WORK( IPL ), 1, 1, DESCL, $ WORK( IPU ), 1, 1, DESCU, ONE, A, IL, J, DESCA ) * IL = IL - DESCA( MB_ ) DESCL( M_ ) = DESCL( M_ ) + DESCL( MB_ ) DESCL( RSRC_ ) = MOD( DESCL( RSRC_ ) + NPROW - 1, NPROW ) DESCL( CSRC_ ) = MOD( DESCL( CSRC_ ) + NPCOL - 1, NPCOL ) DESCU( N_ ) = DESCU( N_ ) + DESCU( NB_ ) DESCU( RSRC_ ) = DESCL( RSRC_ ) DESCU( CSRC_ ) = DESCL( CSRC_ ) * 10 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Copy unit lower triangular part of sub( A ) into WORK * CALL PSLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPL ), $ 1, 1, DESCL ) CALL PSLASET( 'Upper', M, JB, ZERO, ONE, WORK( IPL ), 1, 1, $ DESCL ) * * Copy upper triangular part of sub( A ) into WORK(IPU) * CALL PSLACPY( 'Upper', JB, N, A, IA, JA, DESCA, WORK( IPU ), 1, $ 1, DESCU ) CALL PSLASET( 'Lower', JB-1, N, ZERO, ZERO, WORK( IPU ), 2, 1, $ DESCU ) * * Zero the strict lower triangular piece of the current block. * CALL PSLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Zero the upper triangular piece of the current block. * CALL PSLASET( 'Upper', JB, N, ZERO, ZERO, A, IA, JA, DESCA ) * * Update the matrix sub( A ). * CALL PSGEMM( 'No transpose', 'No transpose', M, N, JB, ONE, $ WORK( IPL ), 1, 1, DESCL, WORK( IPU ), 1, 1, $ DESCU, ONE, A, IA, JA, DESCA ) * * Apply pivots so that sub( A ) = P*L*U * CALL PSLAPIV( 'Backward', 'Row', 'Col', MIN( M, N ), N, A, IA, JA, $ DESCA, IPIV, IA, 1, DESCIP, IDUM ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PSGETRRV * END scalapack-2.0.2/TESTING/LIN/psinvchk.f000644 000766 000024 00000033751 10363532303 017453 0ustar00juliestaff000000 000000 SUBROUTINE PSINVCHK( MATTYP, N, A, IA, JA, DESCA, IASEED, ANORM, $ FRESID, RCOND, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, IASEED, JA, N REAL ANORM, FRESID, RCOND * .. * .. Array Arguments .. CHARACTER*3 MATTYP INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSINVCHK computes the scaled residual * * || sub( A ) * inv( sub( A ) ) - I || / ( || sub( A ) || * N * eps ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1). to check the result * returned by the matrix inversion routines. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MATTYP (global input) CHARACTER*3 * The type of the distributed matrix to be generated: * if MATTYP = 'GEN' then GENeral matrix, * if MATTYP = 'UTR' then Upper TRiangular matrix, * if MATTYP = 'LTR' then Lower TRiangular matrix, * if MATTYP = 'UPD' then (Upper) symmetric Positive Definite, * if MATTYP = 'LPD' then (Lower) symmetric Positive Definite, * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) REAL pointer into the local memory * to an array of local dimension (LLD_A, LOCc(JA+N-1)). On * entry, sub( A ) contains the distributed matrix inverse * computed by PSGETRI, PSPOTRI or PSTRTRI. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * Seed for the random generation of sub( A ). * * ANORM (global input) REAL * The 1-norm of the original matrix sub( A ). * * FRESID (global output) REAL * The inversion residual. * * RCOND (global output) REAL * The condition number of the original distributed matrix. * RCOND = || sub( A ) ||.|| sub( A )^{-1} || where ||A|| * denotes the 1-norm of A. * * WORK (local workspace) REAL array, dimension * MAX(2*LOCr(N_A+MOD(IA-1,MB_A))*MB_A, LDW) * where LDW is the workspace requirement for the norm computa- * tions, see PSLANGE, PSLANSY and PSLANTR. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. CHARACTER AFORM, DIAG, UPLO INTEGER ICTXT, ICURCOL, ICURROW, II, IIA, IPW, IROFF, $ IW, J, JB, JJA, JN, KK, MYCOL, MYROW, NP, $ NPCOL, NPROW REAL AUXNORM, EPS, NRMINVAXA, TEMP * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSGEMM, $ PSLASET, PSMATGEN, PSSYMM, PSTRMM * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, NUMROC REAL PSLAMCH, PSLANGE, PSLANSY, PSLANTR EXTERNAL ICEIL, LSAMEN, NUMROC, PSLAMCH, PSLANGE, $ PSLANSY, PSLANTR * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * EPS = PSLAMCH( DESCA( CTXT_ ), 'eps' ) * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Compute the condition number * IF( LSAMEN( 1, MATTYP( 1:1 ), 'U' ) ) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * AFORM = 'N' DIAG = 'D' AUXNORM = PSLANGE( '1', N, N, A, IA, JA, DESCA, WORK ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN * AFORM = 'N' DIAG = 'D' AUXNORM = PSLANTR( '1', UPLO, 'Non unit', N, N, A, IA, JA, $ DESCA, WORK ) ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * AFORM = 'S' DIAG = 'D' AUXNORM = PSLANSY( '1', UPLO, N, A, IA, JA, DESCA, WORK ) * END IF RCOND = ANORM*AUXNORM * * Compute inv(A)*A * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ ICURROW, ICURCOL ) * * Define array descriptor for working array WORK * IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, ICURROW, NPROW ) CALL DESCSET( DESCW, N+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), ICURROW, ICURCOL, DESCA( CTXT_ ), $ MAX( 1, NP ) ) IPW = DESCW( LLD_ ) * DESCW( NB_ ) + 1 * IF( MYROW.EQ.ICURROW ) THEN II = IROFF + 1 NP = NP - IROFF ELSE II = 1 END IF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN - JA + 1 * * Handle first block separately, regenerate a block of columns of A * IW = IROFF + 1 IF( MYCOL.EQ.ICURCOL ) THEN IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN CALL PSMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), WORK, $ DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, IIA-1, NP, $ JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) IF( LSAMEN( 3, MATTYP, 'UTR' ) ) THEN CALL PSLASET( 'Lower', N-1, JB, ZERO, ZERO, WORK, IW+1, $ 1, DESCW ) ELSE CALL PSLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, WORK, IW, $ 2, DESCW ) END IF ELSE CALL PSMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), WORK( IPW ), $ DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, $ IIA-1, NP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) END IF END IF * * Multiply A^{-1}*A * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * CALL PSGEMM( 'No tranpose', 'No transpose', N, JB, N, ONE, A, $ IA, JA, DESCA, WORK( IPW ), IW, 1, DESCW, ZERO, $ WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN * CALL PSTRMM( 'Left', UPLO, 'No tranpose', 'Non unit', N, JB, $ ONE, A, IA, JA, DESCA, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * CALL PSSYMM( 'Left', UPLO, N, JB, ONE, A, IA, JA, DESCA, $ WORK( IPW ), IW, 1, DESCW, ZERO, WORK, IW, 1, $ DESCW ) * END IF * * subtract the identity matrix to the diagonal block of these cols * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 10 KK = 0, JB-1 WORK( II+KK*(DESCW(LLD_)+1) ) = $ WORK( II+KK*(DESCW( LLD_ )+1) )-ONE 10 CONTINUE END IF * NRMINVAXA = PSLANGE( '1', N, JB, WORK, IW, 1, DESCW, WORK( IPW ) ) * IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJA = JJA + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) DESCW( CSRC_ ) = ICURCOL * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) * JB = MIN( N-J+JA, DESCA( NB_ ) ) * * regenerate a block of columns of A * IF( MYCOL.EQ.ICURCOL ) THEN IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN CALL PSMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCW( MB_ ), DESCW( NB_ ), $ WORK, DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), $ IASEED, IIA-1, NP, JJA-1, JB, MYROW, $ MYCOL, NPROW, NPCOL ) IF( LSAMEN( 3, MATTYP, 'UTR' ) ) THEN CALL PSLASET( 'Lower', JA+N-J-1, JB, ZERO, ZERO, $ WORK, IW+J-JA+1, 1, DESCW ) ELSE CALL PSLASET( 'All', J-JA, JB, ZERO, ZERO, WORK, IW, $ 1, DESCW ) CALL PSLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, $ WORK, IW+J-JA, 2, DESCW ) END IF ELSE CALL PSMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCW( MB_ ), DESCW( NB_ ), $ WORK( IPW ), DESCW( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, NP, $ JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) END IF END IF * * Multiply A^{-1}*A * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * CALL PSGEMM( 'No tranpose', 'No transpose', N, JB, N, ONE, $ A, IA, JA, DESCA, WORK( IPW ), IW, 1, DESCW, $ ZERO, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP(2:3), 'TR' ) ) THEN * CALL PSTRMM( 'Left', UPLO, 'No tranpose', 'Non unit', N, JB, $ ONE, A, IA, JA, DESCA, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * CALL PSSYMM( 'Left', UPLO, N, JB, ONE, A, IA, JA, DESCA, $ WORK(IPW), IW, 1, DESCW, ZERO, WORK, IW, 1, $ DESCW ) * END IF * * subtract the identity matrix to the diagonal block of these * cols * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 20 KK = 0, JB-1 WORK( II+KK*(DESCW( LLD_ )+1) ) = $ WORK( II+KK*(DESCW( LLD_ )+1) ) - ONE 20 CONTINUE END IF * * Compute the 1-norm of these JB cols * TEMP = PSLANGE( '1', N, JB, WORK, IW, 1, DESCW, WORK( IPW ) ) NRMINVAXA = MAX( TEMP, NRMINVAXA ) * IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJA = JJA + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) DESCW( CSRC_ ) = ICURCOL * 30 CONTINUE * * Compute the scaled residual * FRESID = NRMINVAXA / ( N * EPS * ANORM ) * RETURN * * End of PSINVCHK * END scalapack-2.0.2/TESTING/LIN/psinvdriver.f000644 000766 000024 00000077577 10430450246 020217 0ustar00juliestaff000000 000000 PROGRAM PSINVDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PSINVDRIVER is the main test program for the REAL * SCALAPACK matrix inversion routines. This test driver computes the * inverse of different kind of matrix and tests the results. * * The program must be driven by a short data file. An annotated example * of a data file can be obtained by deleting the first 3 characters * from the following 14 lines: * 'ScaLAPACK Matrix Inversion Testing input file' * 'PVM machine.' * 'INV.out' output file name (if any) * 6 device out * 5 number of matrix types (next line) * 'GEN' 'UTR' 'LTR' 'UPD' LPD' GEN, UTR, LTR, UPD, LPD * 4 number of problems sizes * 1000 2000 3000 4000 values of N * 3 number of NB's * 4 30 35 values of NB * 2 number of process grids (ordered P & Q) * 4 2 values of P * 4 4 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INTGSZ, MEMSIZ, NTESTS, REALSZ, TOTMEM REAL PADVAL, ZERO PARAMETER ( INTGSZ = 4, REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER UPLO CHARACTER*3 MTYP CHARACTER*6 PASSED CHARACTER*80 OUTFILE LOGICAL CHECK INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, $ IPPIV, IPREPAD, IPOSTPAD, IPIW, IPW, ITEMP, J, $ K, KTESTS, KPASS, KFAIL, KSKIP, L, LCM, LIPIV, $ LIWORK, LWORK, MYCOL, MYROW, N, NB, NGRIDS, $ NMAT, NMTYP, NNB, NOUT, NP, NPCOL, NPROCS, $ NPROW, NQ, WORKIINV, WORKINV, WORKSIZ REAL ANORM, FRESID, RCOND, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. CHARACTER*3 MATTYP( NTESTS ) INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), $ QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSCHEKPAD, $ PSFILLPAD, PSGETRF, PSGETRI, $ PSINVCHK, PSINVINFO, PSLASET, $ PSMATGEN, PSPOTRF, PSPOTRI, $ PSTRTRI, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, ILCM, NUMROC REAL PSLANGE, PSLANSY, PSLANTR EXTERNAL ICEIL, ILCM, LSAMEN, NUMROC, PSLANGE, $ PSLANSY, PSLANTR * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP /4*0/ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PSINVINFO( OUTFILE, NOUT, NMTYP, MATTYP, NTESTS, NMAT, NVAL, $ NTESTS, NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Loop over the different matrix types * DO 40 I = 1, NMTYP * MTYP = MATTYP( I ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a general matrix.' ELSE IF( LSAMEN( 3, MTYP, 'UTR' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is an upper triangular matrix.' ELSE IF( LSAMEN( 3, MTYP, 'LTR' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a lower triangular matrix.' ELSE IF( LSAMEN( 3, MTYP, 'UPD' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a symmetric positive definite matrix.' WRITE( NOUT, FMT = 9986 ) $ 'Only the upper triangular part will be '// $ 'referenced.' ELSE IF( LSAMEN( 3, MTYP, 'LPD' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a symmetric positive definite matrix.' WRITE( NOUT, FMT = 9986 ) $ 'Only the lower triangular part will be '// $ 'referenced.' END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 J = 1, NGRIDS * NPROW = PVAL( J ) NPCOL = QVAL( J ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 K = 1, NMAT * N = NVAL( K ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 L = 1, NNB * NB = NBVAL( L ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for ScaLAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 * LCM = ILCM( NPROW, NPCOL ) IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * Pivots are needed by LU factorization * IPPIV = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + $ IPREPAD LIPIV = ICEIL( INTGSZ * ( NP + NB ), REALSZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * LWORK = MAX( 1, NP * DESCA( NB_ ) ) WORKINV = LWORK + IPOSTPAD * * Figure the amount of workspace required by the * general matrix inversion * IF( NPROW.EQ.NPCOL ) THEN LIWORK = NQ + DESCA( NB_ ) ELSE * * change the integer workspace needed for PDGETRI * LIWORK = MAX( DESCA( NB_ ), DESCA( MB_ ) * * $ ICEIL( ICEIL( DESCA( LLD_ ), * $ DESCA( MB_ ) ), LCM / NPROW ) ) * $ + NQ LIWORK = NUMROC( DESCA( M_ ) + $ DESCA( MB_ ) * NPROW $ + MOD ( 1 - 1, DESCA( MB_ ) ), DESCA ( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ) + $ MAX ( DESCA( MB_ ) * ICEIL ( ICEIL( $ NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW, $ DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ), $ DESCA( MB_ ) ), LCM / NPROW ), DESCA( NB_ ) ) * END IF WORKIINV = ICEIL( LIWORK*INTGSZ, REALSZ ) + $ IPOSTPAD IPIW = IPW + WORKINV + IPREPAD WORKSIZ = WORKINV + IPREPAD + WORKIINV * ELSE * * No pivots or workspace needed for triangular or * symmetric positive definite matrices. * IPW = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + IPREPAD WORKSIZ = 1 + IPOSTPAD * END IF * IF( CHECK ) THEN * * Figure amount of work space for the norm * computations * IF( LSAMEN( 3, MTYP, 'GEN' ).OR. $ LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN ITEMP = NQ ELSE ITEMP = 2 * NQ + NP IF( NPROW.NE.NPCOL ) THEN ITEMP = ITEMP + $ NB * ICEIL( ICEIL( NP, NB ), $ LCM / NPROW ) END IF END IF WORKSIZ = MAX( WORKSIZ-IPOSTPAD, ITEMP ) * * Figure the amount of workspace required by the * checking routine * WORKSIZ = MAX( WORKSIZ, 2 * NB * MAX( 1, NP ) ) + $ IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'inversion', $ ( IPW + WORKSIZ ) * REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( LSAMEN( 3, MTYP, 'GEN' ).OR. $ LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * Generate a general diagonally dominant matrix A * CALL PSMATGEN( ICTXT, 'N', 'D', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, $ NQ, MYROW, MYCOL, NPROW, NPCOL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * Generate a symmetric positive definite matrix * CALL PSMATGEN( ICTXT, 'S', 'D', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, $ NQ, MYROW, MYCOL, NPROW, NPCOL ) * END IF * * Zeros not-referenced part of A, if any. * IF( LSAMEN( 1, MTYP, 'U' ) ) THEN * UPLO = 'U' CALL PSLASET( 'Lower', N-1, N-1, ZERO, ZERO, $ MEM( IPA ), 2, 1, DESCA ) * ELSE IF( LSAMEN( 1, MTYP, 'L' ) ) THEN * UPLO = 'L' CALL PSLASET( 'Upper', N-1, N-1, ZERO, ZERO, $ MEM( IPA ), 1, 2, DESCA ) * ELSE * UPLO = 'G' * END IF * * Need 1-norm of A for checking * IF( CHECK ) THEN * CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * CALL PSFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANGE( '1', N, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKINV-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKIINV-IPOSTPAD, 1, $ MEM( IPIW-IPREPAD ), $ WORKIINV-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * ANORM = PSLANTR( '1', UPLO, 'Non unit', N, N, $ MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANTR', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANTR', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * ANORM = PSLANSY( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'SY' ) ) THEN * CALL PSFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANSY( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD,IPOSTPAD, PADVAL ) * END IF * END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * Perform LU factorization * CALL SLTIMER( 1 ) CALL PSGETRF( N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGETRF', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRF', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * Perform the general matrix inversion * CALL SLTIMER( 2 ) CALL PSGETRI( N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ), LWORK, $ MEM( IPIW ), LIWORK, INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGETRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRI', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRI', $ WORKIINV-IPOSTPAD, 1, $ MEM( IPIW-IPREPAD ), $ WORKIINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRI', $ WORKINV-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * Perform the general matrix inversion * CALL SLTIMER( 2 ) CALL PSTRTRI( UPLO, 'Non unit', N, MEM( IPA ), 1, $ 1, DESCA, INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSTRTRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * Perform Cholesky factorization * CALL SLTIMER( 1 ) CALL PSPOTRF( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSPOTRF', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * Perform the symmetric positive definite matrix * inversion * CALL SLTIMER( 2 ) CALL PSPOTRI( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSPOTRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * END IF * IF( CHECK ) THEN * CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Compute fresid = || inv(A)*A-I || * CALL PSINVCHK( MTYP, N, MEM( IPA ), 1, 1, DESCA, $ IASEED, ANORM, FRESID, RCOND, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSINVCHK', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSINVCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. INFO.EQ.0 .AND. $ ( (FRESID-FRESID) .EQ. 0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 IF( INFO.GT.0 ) THEN PASSED = 'SINGUL' ELSE PASSED = 'FAILED' END IF END IF * ELSE * * Don't perform the checking, only the timing * operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * 2/3 N^3 - 1/2 N^2 flops for LU factorization * NOPS = ( 2.0D+0 / 3.0D+0 )*( DBLE( N )**3 ) - $ ( 1.0D+0 / 2.0D+0 )*( DBLE( N )**2 ) * * 4/3 N^3 - N^2 flops for inversion * NOPS = NOPS + $ ( 4.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) - $ DBLE( N )**2 * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * 1/3 N^3 + 2/3 N flops for triangular inversion * CTIME(1) = 0.0D+0 WTIME(1) = 0.0D+0 NOPS = ( 1.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ ( 2.0D+0 / 3.0D+0 ) * ( DBLE( N ) ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * 1/3 N^3 + 1/2 N^2 flops for Cholesky * factorization * NOPS = ( 1.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ ( 1.0D+0 / 2.0D+0 ) * ( DBLE( N )**2 ) * * 2/3 N^3 + 1/2 N^2 flops for Cholesky inversion * NOPS = NOPS + $ ( 2.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ ( 1.0D+0 / 2.0D+0 ) * ( DBLE( N )**2 ) * END IF * * Figure total megaflops -- factorization and * inversion, for WALL and CPU time, and print * output. * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 2 ) .GE. 0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ RCOND, FRESID, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ) + CTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ) .GE. 0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ RCOND, FRESID, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * 40 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q Fct Time Inv Time ', $ ' MFLOPS Cond Resid CHECK' ) 9994 FORMAT( '---- ----- --- ----- ----- -------- -------- ', $ '----------- ------- ------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I5, 1X, I5, 1X, F8.2, 1X, F8.2, $ 1X, F11.2, 1X, F7.1, 1X, F7.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( A ) * STOP * * End of PSINVDRIVER * END scalapack-2.0.2/TESTING/LIN/psinvinfo.f000644 000766 000024 00000035254 10363532303 017641 0ustar00juliestaff000000 000000 SUBROUTINE PSINVINFO( SUMMRY, NOUT, NMTYP, MATTYP, LDMTYP, NMAT, $ NVAL, LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, $ PVAL, LDPVAL, QVAL, LDQVAL, THRESH, WORK, $ IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDMTYP, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NMTYP, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. CHARACTER*3 MATTYP( LDMTYP ) CHARACTER*( * ) SUMMRY INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PSINVINFO gets needed startup information for matrix inversion * tests and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMTYP (global output) INTEGER * The number of different matrix types to be tested. * * MATTYP (global output) CHARACTER*3 array of dimension of LDMTYP, * The types of matrix to be generated: * if MATTYP(i) = 'GEN' then GENeral matrix, * if MATTYP(i) = 'UTR' then Upper TRiangular matrix, * if MATTYP(i) = 'LTR' then Lower TRiangular matrix, * if MATTYP(i) = 'UPD' then (Upper) symmetric Pos. Definite, * if MATTYP(i) = 'LPD' then (Lower) symmetric Pos. Definite, * * LDMTYP (global input) INTEGER * The maximum number of different matrix types to be tested. * LDMTYP >= NMTYP. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 4, LDMTYP+LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack * all input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT, K REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAMEN REAL PSLAMCH EXTERNAL LSAMEN, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='INV.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get the matrix types to be tested * READ( NIN, FMT = * ) NMTYP IF( NMTYP.LT.1 .OR. NMTYP.GT.LDMTYP ) THEN WRITE( NOUT, FMT = 9994 ) 'nb of matrix types', LDMTYP GO TO 40 END IF READ( NIN, FMT = * ) ( MATTYP( I ), I = 1, NMTYP ) * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 40 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 40 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 40 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 40 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I ) * QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS WORK( 4 ) = NMTYP CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 DO 20 K = 1, NMTYP IF( LSAMEN( 3, MATTYP( K ), 'GEN' ) ) THEN WORK( I ) = 1 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'UTR' ) ) THEN WORK( I ) = 2 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'LTR' ) ) THEN WORK( I ) = 3 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'UPD' ) ) THEN WORK( I ) = 4 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'LPD' ) ) THEN WORK( I ) = 5 I = I + 1 END IF 20 CONTINUE * CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Matrix Inversion routines.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision Matrix Inversion '// $ 'routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Inverse residual = ||inv(A)*A - I|| '// $ '/ (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less '// $ 'than THRESH, CHECK is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'Fct time : Time in seconds to factor the'// $ ' matrix, if needed.' WRITE( NOUT, FMT = 9999 ) $ 'Inv Time : Time in seconds to inverse the'// $ ' matrix.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and inverse.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) NMTYP = WORK( 4 ) * I = NMTYP+NMAT+NNB+2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * DO 30 K = 1, NMTYP IF( WORK( K ).EQ.1 ) THEN MATTYP( K ) = 'GEN' ELSE IF( WORK( K ).EQ.2 ) THEN MATTYP( K ) = 'UTR' ELSE IF( WORK( K ).EQ.3 ) THEN MATTYP( K ) = 'LTR' ELSE IF( WORK( K ).EQ.4 ) THEN MATTYP( K ) = 'UPD' ELSE IF( WORK( K ).EQ.5 ) THEN MATTYP( K ) = 'LPD' END IF 30 CONTINUE * I = NMTYP + 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 40 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSINVINFO * END scalapack-2.0.2/TESTING/LIN/pslafchk.f000644 000766 000024 00000022200 10363532303 017404 0ustar00juliestaff000000 000000 SUBROUTINE PSLAFCHK( AFORM, DIAG, M, N, A, IA, JA, DESCA, IASEED, $ ANORM, FRESID, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER AFORM, DIAG INTEGER IA, IASEED, JA, M, N REAL ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSLAFCHK computes the residual * || sub( A ) - sub( Ao ) || / (|| sub( Ao ) ||*eps*MAX(M,N)), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A( IA:IA+M-1, JA:JA+N-1 ) and ||.|| stands for the infini- * ty norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * AFORM (global input) CHARACTER * sub( A ) is overwritten with: * - a symmetric matrix, if AFORM = 'S'; * - a Hermitian matrix, if AFORM = 'H'; * - the transpose of what would normally be generated, * if AFORM = 'T'; * - the conjugate transpose of what would normally be * generated, if AFORM = 'C'; * - otherwise a random matrix. * * DIAG (global input) CHARACTER * if DIAG = 'D' : sub( A ) is diagonally dominant. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub( A ) - sub( Ao ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * ANORM (global input) REAL * The Infinity norm of sub( A ). * * FRESID (global output) REAL * The maximum (worst) factorizational error. * * WORK (local workspace) REAL array, dimension (LWORK). * LWORK >= MpA0 * NB_A, where * * IROFFA = MOD( IA-1, MB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * WORK is used to store a block of columns of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IOFFA, IROFF, $ JB, JJ, JJA, JN, LDA, LDW, MP, MYCOL, MYROW, $ NPCOL, NPROW, NQ REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSMATGEN, SMATADD * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PSLAMCH, PSLANGE EXTERNAL ICEIL, NUMROC, PSLAMCH, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( ICTXT, 'eps' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * * Compute sub( A ) := sub( A ) - sub( Ao ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDW = MAX( 1, MP ) LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA - 1 )*LDA * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN JB = JN-JA+1 CALL PSMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ MP, JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL SMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) JJA = JJA + JB NQ = NQ - JB IOFFA = IOFFA + JB * LDA END IF * * Handle the remaning blocks of columns * DO 10 JJ = JJA, JJA+NQ-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JJA+NQ-JJ ) IOFFA = IIA + ( JJ - 1 ) * LDA CALL PSMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ MP, JJ-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL SMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) 10 CONTINUE * * Calculate factor residual * FRESID = PSLANGE( 'I', M, N, A, IA, JA, DESCA, WORK ) / $ ( MAX( M, N ) * EPS * ANORM ) * RETURN * * End PSLAFCHK * END scalapack-2.0.2/TESTING/LIN/pslaschk.f000644 000766 000024 00000030161 10363532303 017426 0ustar00juliestaff000000 000000 SUBROUTINE PSLASCHK( SYMM, DIAG, N, NRHS, X, IX, JX, DESCX, $ IASEED, IA, JA, DESCA, IBSEED, ANORM, RESID, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, SYMM INTEGER IA, IASEED, IBSEED, IX, JA, JX, N, NRHS REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) REAL WORK( * ), X( * ) * .. * * Purpose * ======= * * PSLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed matrix, * otherwise sub( A ) is a general distributed matrix. * * DIAG (global input) CHARACTER * If DIAG = 'D', sub( A ) is diagonally dominant. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 0. * * X (local input) REAL pointer into the local memory * to an array of dimension (LLD_X,LOCc(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK >= MAX(1,Np)*NB_X + Nq*NB_X + MAX( MAX(NQ*MB_A,2*NB_X), * NB_X * NUMROC( NUMROC(N,MB_X,0,0,NPCOL), MB_X, 0, 0, LCMQ ) ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IB, ICOFF, ICTXT, ICURCOL, IDUMM, $ II, IIA, IIX, IOFFX, IPA, IPB, IPW, IPX, IROFF, $ IXCOL, IXROW, J, JBRHS, JJ, JJA, JJX, LDX, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL BETA, DIVISOR, EPS, RESID1 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PBSTRAN, PSMATGEN, $ SGAMX2D, SGEBR2D, SGEBS2D, SGEMM, $ SGERV2D, SGESD2D, SGSUM2D, SLASET * .. * .. External Functions .. INTEGER ISAMAX, NUMROC REAL PSLAMCH EXTERNAL ISAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * LDX = MAX( 1, NP ) IPB = 1 IPX = IPB + NP * DESCX( NB_ ) IPA = IPX + NQ * DESCX( NB_ ) * IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF * ICURCOL = IXCOL * * Loop over the rhs * DO 40 J = 1, NRHS, DESCX( NB_ ) JBRHS = MIN( DESCX( NB_ ), NRHS-J+1 ) * * Transpose x from ICURCOL to all rows * IOFFX = IIX + ( JJX - 1 ) * DESCX( LLD_ ) CALL PBSTRAN( ICTXT, 'Column', 'Transpose', N, JBRHS, $ DESCX( MB_ ), X( IOFFX ), DESCX( LLD_ ), ZERO, $ WORK( IPX ), JBRHS, IXROW, ICURCOL, -1, IACOL, $ WORK( IPA ) ) * * Regenerate B in IXCOL * IF( MYCOL.EQ.ICURCOL ) THEN CALL PSMATGEN( ICTXT, 'N', 'N', DESCX( M_ ), DESCX( N_ ), $ DESCX( MB_ ), DESCX( NB_ ), WORK( IPB ), LDX, $ IXROW, IXCOL, IBSEED, IIX-1, NP, JJX-1, $ JBRHS, MYROW, MYCOL, NPROW, NPCOL ) BETA = ONE ELSE BETA = ZERO END IF * IF( NQ.GT.0 ) THEN DO 10 II = IIA, IIA+NP-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IIA+NP-II ) * * Regenerate ib rows of the matrix A(IA:IA+N-1,JA:JA+N-1). * CALL PSMATGEN( ICTXT, SYMM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ WORK( IPA ), IB, DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, II-1, IB, $ JJA-1, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Compute B <= B - A * X. * CALL SGEMM( 'No transpose', 'Transpose', IB, JBRHS, NQ, $ -ONE, WORK( IPA ), IB, WORK( IPX ), JBRHS, $ BETA, WORK( IPB+II-IIA ), LDX ) * 10 CONTINUE * ELSE IF( MYCOL.NE.ICURCOL ) THEN * CALL SLASET( 'All', NP, JBRHS, ZERO, ZERO, WORK( IPB ), $ LDX ) * END IF * * Add B rowwise to ICURCOL * CALL SGSUM2D( ICTXT, 'Row', ' ', NP, JBRHS, WORK( IPB ), LDX, $ MYROW, ICURCOL ) * IF( MYCOL.EQ.ICURCOL ) THEN * * Figure || A * X - B || & || X || * IPW = IPA + JBRHS DO 20 JJ = 0, JBRHS - 1 IF( NP.GT.0 ) THEN II = ISAMAX( NP, WORK( IPB+JJ*LDX ), 1 ) WORK( IPA+JJ ) = ABS( WORK( IPB+II-1+JJ*LDX ) ) WORK( IPW+JJ ) = ABS( X( IOFFX + ISAMAX( NP, $ X( IOFFX + JJ*DESCX( LLD_ ) ), 1 )-1+JJ* $ DESCX( LLD_ ) ) ) ELSE WORK( IPA+JJ ) = ZERO WORK( IPW+JJ ) = ZERO END IF 20 CONTINUE * * After SGAMX2D computation, * WORK(IPB) has the maximum of || Ax - b ||, and * WORK(IPX) has the maximum of || X ||. * CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 2*JBRHS, $ WORK( IPA ), 1, IDUMM, IDUMM, -1, 0, ICURCOL ) * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * IF( MYROW.EQ.0 ) THEN DO 30 JJ = 0, JBRHS - 1 RESID1 = WORK( IPA+JJ ) / ( WORK( IPW+JJ )*DIVISOR ) IF( RESID.LT.RESID1 ) $ RESID = RESID1 30 CONTINUE IF( MYCOL.NE.0 ) $ CALL SGESD2D( ICTXT, 1, 1, RESID, 1, 0, 0 ) END IF * ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * CALL SGERV2D( ICTXT, 1, 1, RESID1, 1, 0, ICURCOL ) IF( RESID.LT.RESID1 ) $ RESID = RESID1 * END IF * IF( MYCOL.EQ.ICURCOL ) $ JJX = JJX + JBRHS ICURCOL = MOD( ICURCOL+1, NPCOL ) * 40 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, RESID, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, RESID, 1, 0, 0 ) END IF * RETURN * * End of PSLASCHK * END scalapack-2.0.2/TESTING/LIN/pslltdriver.f000644 000766 000024 00000110226 10363532303 020171 0ustar00juliestaff000000 000000 PROGRAM PSLLTDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PSLLTDRIVER is the main test program for the REAL * ScaLAPACK Cholesky routines. This test driver performs an * A = L*L**T or A = U**T*U factorization and solve, and optionally * performs condition estimation and iterative refinement. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'ScaLAPACK LLt factorization input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LLT.out' output file name (if any) * 6 device out * 'U' define Lower or Upper * 1 number of problems sizes * 31 100 200 values of N * 1 number of NB's * 2 10 24 values of NB * 1 number of NRHS's * 1 values of NRHS * 1 Number of NBRHS's * 1 values of NBRHS * 1 number of process grids (ordered pairs of P & Q) * 2 values of P * 2 values of Q * 1.0 threshold * T (T or F) Test Cond. Est. and Iter. Ref. Routines * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INTGSZ, MEMSIZ, NTESTS, REALSZ, TOTMEM REAL PADVAL, ZERO PARAMETER ( INTGSZ = 4, REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, EST CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, IMIDPAD, $ INFO, IPA, IPA0, IPB, IPB0, IPBERR, IPFERR, $ IPREPAD, IPOSTPAD, IPW, IPW2, ITEMP, J, K, $ KFAIL, KK, KPASS, KSKIP, KTESTS, LCM, LCMQ, $ LIWORK, LWORK, LW2, MYCOL, MYRHS, MYROW, N, NB, $ NBRHS, NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, $ NPCOL, NPROCS, NPROW, NQ, NRHS, WORKSIZ REAL ANORM, ANORM1, FRESID, RCOND, SRESID, SRESID2, $ THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, DESCINIT, $ IGSUM2D, BLACS_PINFO, PSCHEKPAD, PSFILLPAD, $ PSLAFCHK, PSLASCHK, PSLLTINFO, $ PSMATGEN, PSPOCON, PSPORFS, $ PSPOTRF, PSPOTRRV, PSPOTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC REAL PSLANSY EXTERNAL ICEIL, ILCM, LSAME, NUMROC, PSLANSY * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 CALL PSLLTINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NNR, NRVAL, NTESTS, NNBR, NBRVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, EST, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 50 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 50 * DO 40 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 40 END IF * DO 30 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IF( EST ) THEN IPA0 = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPA0 + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD ELSE IPW = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD END IF * * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines PSLAFCHK, PSPOTRRV, and * PSLANSY * WORKSIZ = NP * DESCA( NB_ ) * WORKSIZ = MAX( WORKSIZ, DESCA( MB_ ) * DESCA( NB_ ) ) * LCM = ILCM( NPROW, NPCOL ) ITEMP = MAX( 2, 2 * NQ ) + NP IF( NPROW.NE.NPCOL ) THEN ITEMP = ITEMP + $ NB * ICEIL( ICEIL( NP, NB ), LCM / NPROW ) END IF WORKSIZ = MAX( WORKSIZ, ITEMP ) WORKSIZ = WORKSIZ + IPOSTPAD * ELSE * WORKSIZ = IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate a symmetric positive definite matrix A * CALL PSMATGEN( ICTXT, 'Symm', 'Diag', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANSY( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) ANORM1 = PSLANSY( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( EST ) THEN CALL PSMATGEN( ICTXT, 'Symm', 'Diag', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA0 ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) IF( CHECK ) $ CALL PSFILLPAD( ICTXT, NP, NQ, $ MEM( IPA0-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform LLt factorization * CALL SLTIMER( 1 ) * CALL PSPOTRF( UPLO, N, MEM( IPA ), 1, 1, DESCA, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PSPOTRF INFO=', INFO KFAIL = KFAIL + 1 RCOND = ZERO GO TO 60 END IF * IF( CHECK ) THEN * * Check for memory overwrite in LLt factorization * CALL PSCHEKPAD( ICTXT, 'PSPOTRF', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( EST ) THEN * * Calculate workspace required for PSPOCON * LWORK = MAX( 1, 2*NP ) + MAX( 1, 2*NQ ) + $ MAX( 2, DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), $ NQ + DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LIWORK = MAX( 1, NP ) LW2 = ICEIL( LIWORK*INTGSZ, REALSZ ) + IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'cond est', $ ( IPW2+LW2 )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 60 END IF * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Compute condition number of the matrix * CALL PSPOCON( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ ANORM1, RCOND, MEM( IPW ), LWORK, $ MEM( IPW2 ), LIWORK, INFO ) * IF( CHECK ) THEN CALL PSCHEKPAD( ICTXT, 'PSPOCON', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPOCON', $ LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPOCON', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF END IF * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 10 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB, N, NRHS, NB, NBRHS, 0, 0, $ ICTXT, MAX( 1, NP )+IMIDPAD, $ IERR( 1 ) ) * * move IPW to allow room for RHS * MYRHS = NUMROC( DESCB( N_ ), DESCB( NB_ ), MYCOL, $ DESCB( CSRC_ ), NPCOL ) IPB = IPW * IF( EST ) THEN IPB0 = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD IPFERR = IPB0 + DESCB( LLD_ )*MYRHS + IPOSTPAD $ + IPREPAD IPBERR = MYRHS + IPFERR + IPOSTPAD + IPREPAD IPW = MYRHS + IPBERR + IPOSTPAD + IPREPAD ELSE IPW = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD END IF * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines PSLASCHK * LCMQ = LCM / NPCOL WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ NQ * NBRHS + NP * NBRHS + $ MAX( MAX( NQ*NB, 2*NBRHS ), $ NBRHS * NUMROC( NUMROC(N,NB,0,0,NPCOL),NB, $ 0,0,LCMQ ) ) ) WORKSIZ = IPOSTPAD + WORKSIZ ELSE WORKSIZ = IPOSTPAD END IF * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate RHS * CALL PSMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, NPCOL ) * IF( CHECK ) $ CALL PSFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) * IF( EST ) THEN CALL PSMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB0 ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 2 ) * * Solve linear system via Cholesky factorization * CALL PSPOTRS( UPLO, N, NRHS, MEM( IPA ), 1, 1, $ DESCA, MEM( IPB ), 1, 1, DESCB, $ INFO ) * CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSPOTRS', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPOTRS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PSLASCHK( 'Symm', 'Diag', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, IBSEED, $ ANORM, SRESID, MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' END IF * IF( EST ) THEN * * Calculate workspace required for PSPORFS * LWORK = MAX( 1, 3*NP ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LIWORK = MAX( 1, NP ) LW2 = ICEIL( LIWORK*INTGSZ, REALSZ ) + $ IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'iter ref', ( IPW2+LW2 )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, LW2-IPOSTPAD, $ 1, MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Use iterative refinement to improve the * computed solution * CALL PSPORFS( UPLO, N, NRHS, MEM( IPA0 ), $ 1, 1, DESCA, MEM( IPA ), 1, 1, $ DESCA, MEM( IPB0 ), 1, 1, $ DESCB, MEM( IPB ), 1, 1, DESCB, $ MEM( IPFERR ), MEM( IPBERR ), $ MEM( IPW ), LWORK, MEM( IPW2 ), $ LIWORK, INFO ) * * check for memory overwrite * IF( CHECK ) THEN CALL PSCHEKPAD( ICTXT, 'PSPORFS', NP, $ NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPORFS', NP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPORFS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPORFS', NP, $ MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPORFS', 1, $ MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPORFS', 1, $ MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPORFS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSPORFS', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PSLASCHK( 'Symm', 'Diag', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, $ IBSEED, ANORM, SRESID2, $ MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID2.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID2 * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 1/3 N^3 + 1/2 N^2 flops for LLt factorization * NOPS = (DBLE(N)**3)/3.0D+0 + $ (DBLE(N)**2)/2.0D+0 * * nrhs * 2 N^2 flops for LLt solve. * NOPS = NOPS + 2.0D+0*(DBLE(N)**2)*DBLE(NRHS) * * Calculate total megaflops -- factorization and * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ PASSED * END IF 10 CONTINUE 20 CONTINUE * IF( CHECK .AND. SRESID.GT.THRESH ) THEN * * Compute FRESID = ||A - LL'|| / (||A|| * N * eps) * CALL PSPOTRRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PSLAFCHK( 'Symm', 'Diag', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSPOTRRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * IF( IAM.EQ.0 ) THEN IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9986 ) 'L*L''', FRESID ELSE WRITE( NOUT, FMT = 9986 ) 'U''*U', FRESID END IF END IF END IF * 30 CONTINUE 40 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 50 CONTINUE * * Print ending messages and close output file * 60 CONTINUE IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UPLO N NB NRHS NBRHS P Q LLt Time ', $ 'Slv Time MFLOPS CHECK' ) 9994 FORMAT( '---- ---- ----- --- ---- ----- ---- ---- -------- ', $ '-------- -------- ------' ) 9993 FORMAT( A4, 4X, A1, 1X, I5, 1X, I3, 1X, I4, 1X, I5, 1X, I4, 1X, $ I4, 1X, F8.2, 1X, F8.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PSLLTDRIVER * END scalapack-2.0.2/TESTING/LIN/pslltinfo.f000644 000766 000024 00000042145 10363532303 017635 0ustar00juliestaff000000 000000 SUBROUTINE PSLLTINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NNR, NRVAL, LDNRVAL, NNBR, $ NBRVAL, LDNBRVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, THRESH, EST, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL EST CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PSLLTINFO get needed startup information for LLt factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * EST (global output) LOGICAL * Flag indicating if condition estimation and iterative * refinement routines are to be exercised. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 7, LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'LLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Read the flag that indicates whether to test the condition * estimation and iterative refinement routines. * READ( NIN, FMT = * ) EST * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS IF( LSAME( UPLO, 'L' ) ) THEN WORK( 6 ) = 1 ELSE WORK( 6 ) = 2 END IF IF( EST ) THEN WORK( 7 ) = 1 ELSE WORK( 7 ) = 0 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 7, 1, WORK, 7 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Ax=b by LLt factorization.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision LLt factorization '// $ 'and solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data is stored in ''Upper'// $ ''' or ''Lower'' portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'LLt time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 7, 1, WORK, 7, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) IF( WORK( 6 ).EQ.1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF IF( WORK( 7 ).EQ.1 ) THEN EST = .TRUE. ELSE EST = .FALSE. END IF * I = NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSLLTINFO * END scalapack-2.0.2/TESTING/LIN/pslsdriver.f000644 000766 000024 00000135002 10363532303 020013 0ustar00juliestaff000000 000000 PROGRAM PSLSDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * Purpose * ======= * * PSLSDRIVER is the main test program for the REAL * SCALAPACK (full rank) Least Squares routines. This test driver solves * full-rank least square problems. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 17 lines: * 'ScaLapack LS solve input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LS.out' output file name (if any) * 6 device out * 4 number of problems sizes * 55 17 31 201 values of M * 5 71 31 201 values of N * 3 number of NB's * 2 3 5 values of NB * 3 number of NRHS's * 2 3 5 values of NRHS * 2 number of NBRHS's * 1 2 values of NBRHS * 7 number of process grids (ordered P & Q) * 1 2 1 4 2 3 8 values of P * 7 2 4 1 3 2 1 values of Q * 3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER MEMSIZ, NTESTS, REALSZ, TOTMEM REAL PADVAL REAL ONE, ZERO PARAMETER ( REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0 ) PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, TPSD CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, II, IMIDPAD, $ INFO, IPA, IPB, IPOSTPAD, IPREPAD, IPW, IPW2, $ IPX, ISCALE, ITRAN, ITYPE, J, JJ, K, KFAIL, KK, $ KPASS, KSKIP, KTESTS, LCM, LCMP, LTAU, LWF, $ LWORK, LWS, M, MNP, MNRHSP, MP, MQ, MYCOL, $ MYROW, N, NB, NBRHS, NCOLS, NGRIDS, NMAT, NNB, $ NNBR, NNR, NNRHSQ, NOUT, NP, NPCOL, NPROCS, $ NPROW, NROWS, NQ, NRHS, NRHSP, NRHSQ, WORKSIZ REAL ANORM, BNORM, SRESID, THRESH DOUBLE PRECISION ADDFAC, ADDS, MULFAC, MULTS, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCW( LLD_ ), $ DESCX( DLEN_ ), IERR( 2 ), MVAL( NTESTS ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ), RESULT( 2 ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSCHEKPAD, $ PSFILLPAD, PSGELS, PSGEMM, PSLACPY, $ PSLSINFO, PSMATGEN, PSNRM2, PSSCAL, $ PSQRT13, PSQRT16, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC REAL PSLANGE, PSQRT14, PSQRT17 EXTERNAL ICEIL, ILCM, LSAME, NUMROC, PSLANGE, $ PSQRT14, PSQRT17 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * IASEED = 100 IBSEED = 200 CALL PSLSINFO( OUTFILE, NOUT, NMAT, MVAL, NTESTS, NVAL, $ NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, NTESTS, $ NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 90 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 90 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( ( MYROW.GE.NPROW ).OR.( MYCOL.GE.NPCOL ) ) $ GO TO 90 * DO 80 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 80 END IF * * Loop over different blocking sizes * DO 70 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 70 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) MQ = NUMROC( M, NB, MYCOL, 0, NPCOL ) NP = NUMROC( N, NB, MYROW, 0, NPROW ) MNP = MAX( MP, NP ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 70 END IF * DO 60 ISCALE = 1, 3 * ITYPE = ISCALE * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPX = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPX * WORKSIZ = NQ + IPOSTPAD * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'MEMORY', $ ( IPX+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 70 END IF * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Generate the matrix A and calculate its 1-norm * CALL PSQRT13( ISCALE, M, N, MEM( IPA ), 1, 1, $ DESCA, ANORM, IASEED, MEM( IPW ) ) * IF( CHECK ) THEN CALL PSCHEKPAD( ICTXT, 'PSQRT13', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSQRT13', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * DO 50 ITRAN = 1, 2 * IF( ITRAN.EQ.1 ) THEN NROWS = M NCOLS = N TRANS = 'N' TPSD = .FALSE. ELSE NROWS = N NCOLS = M TRANS = 'T' TPSD = .TRUE. END IF * * Loop over the different values for NRHS * DO 40 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 30 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * NRHSP = NUMROC( NRHS, NBRHS, MYROW, 0, $ NPROW ) NRHSQ = NUMROC( NRHS, NBRHS, MYCOL, 0, $ NPCOL ) * * Define Array descriptor for rhs MAX(M,N)xNRHS * CALL DESCINIT( DESCX, MAX( M, N ), NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, MNP ) + IMIDPAD, $ IERR( 1 ) ) IF( TPSD ) THEN CALL DESCINIT( DESCW, M, NRHS, NB, NBRHS, $ 0, 0, ICTXT, MAX( 1, MP ) + $ IMIDPAD, IERR( 2 ) ) ELSE CALL DESCINIT( DESCW, N, NRHS, NB, NBRHS, $ 0, 0, ICTXT, MAX( 1, NP ) + $ IMIDPAD, IERR( 2 ) ) END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, $ 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Check for enough memory * IPX = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + $ IPREPAD IPW = IPX + DESCX( LLD_ )*NRHSQ + IPOSTPAD + $ IPREPAD WORKSIZ = DESCW( LLD_ )*NRHSQ + IPOSTPAD * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Generation', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate RHS * IF( TPSD ) THEN CALL PSMATGEN( ICTXT, 'No', 'No', $ DESCW( M_ ), DESCW( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), $ MEM( IPW ), DESCW( LLD_ ), $ DESCW( RSRC_ ), $ DESCW( CSRC_ ), IBSEED, 0, $ MP, 0, NRHSQ, MYROW, MYCOL, $ NPROW, NPCOL ) ELSE CALL PSMATGEN( ICTXT, 'No', 'No', $ DESCW( M_ ), DESCW( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), $ MEM( IPW ), DESCW( LLD_ ), $ DESCW( RSRC_ ), $ DESCW( CSRC_ ), IBSEED, 0, $ NP, 0, NRHSQ, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PSFILLPAD( ICTXT, MP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PSFILLPAD( ICTXT, NP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * DO 10 JJ = 1, NRHS CALL PSNRM2( NCOLS, BNORM, MEM( IPW ), 1, $ JJ, DESCW, 1 ) IF( BNORM.GT.ZERO ) $ CALL PSSCAL( NCOLS, ONE / BNORM, $ MEM( IPW ), 1, JJ, DESCW, $ 1 ) 10 CONTINUE * CALL PSGEMM( TRANS, 'N', NROWS, NRHS, NCOLS, $ ONE, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ), 1, 1, DESCW, ZERO, $ MEM( IPX ), 1, 1, DESCX ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'Generation', MP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'Generation', MNP, $ NRHSQ, MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PSCHEKPAD( ICTXT, 'Generation', $ MP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PSCHEKPAD( ICTXT, 'Generation', $ NP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Allocate space for copy of rhs * IPB = IPW * IF( TPSD ) THEN CALL DESCINIT( DESCB, N, NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, $ IERR( 1 ) ) ELSE CALL DESCINIT( DESCB, M, NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, $ IERR( 1 ) ) END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * IPW = IPB + DESCB( LLD_ )*NRHSQ + $ IPOSTPAD + IPREPAD * END IF * * Calculate the amount of workspace for PSGELS * IF( M.GE.N ) THEN LTAU = NUMROC( MIN(M,N), NB, MYCOL, 0, $ NPCOL ) LWF = NB * ( MP + NQ + NB ) LWS = MAX( ( NB*( NB - 1 ) ) / 2, $ ( MP + NRHSQ ) * NB ) + NB*NB ELSE LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LTAU = NUMROC( MIN(M,N), NB, MYROW, 0, $ NPROW ) LWF = NB * ( MP + NQ + NB ) LWS = MAX( ( NB*( NB - 1 ) ) / 2, ( NP + $ MAX( NQ + NUMROC( NUMROC( N, NB, 0, $ 0, NPROW ), NB, 0, 0, LCMP ), $ NRHSQ ) ) * NB ) + NB*NB END IF * LWORK = LTAU + MAX( LWF, LWS ) WORKSIZ = LWORK + IPOSTPAD * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'solve', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Make the copy of the right hand side * CALL PSLACPY( 'All', NROWS, NRHS, $ MEM( IPX ), 1, 1, DESCX, $ MEM( IPB ), 1, 1, DESCB ) * IF( TPSD ) THEN CALL PSFILLPAD( ICTXT, NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PSFILLPAD( ICTXT, MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF CALL PSFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * CALL SLBOOT( ) CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Solve the LS or overdetermined system * CALL PSGELS( TRANS, M, N, NRHS, MEM( IPA ), $ 1, 1, DESCA, MEM( IPX ), 1, 1, $ DESCX, MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGELS', MP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGELS', MNP, $ NRHSQ, MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGELS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Regenerate A in place for testing and next * iteration * CALL PSQRT13( ISCALE, M, N, MEM( IPA ), 1, 1, $ DESCA, ANORM, IASEED, $ MEM( IPW ) ) * * check the solution to rhs * IF( CHECK ) THEN * * Am I going to call PSQRT17 ? * IF( ( M.GE.N .AND. ( .NOT.TPSD ) ) .OR. $ ( M.LT.N .AND. TPSD ) ) THEN * * Call PSQRT17 first, A, X, and B remain * unchanged. Solving LS system * * Check amount of memory for PSQRT17 * IF( TPSD ) THEN WORKSIZ = NP*NRHSQ + NRHSP*MQ IPW2 = IPW + WORKSIZ WORKSIZ = WORKSIZ + $ MAX( NQ, MAX( MQ, NRHSQ ) ) + $ IPOSTPAD ELSE WORKSIZ = MP*NRHSQ + NRHSP*NQ IPW2 = IPW + WORKSIZ WORKSIZ = WORKSIZ + $ MAX( NQ, NRHSQ ) + $ IPOSTPAD END IF * * Check for adequate memory for problem * size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'MEMORY', ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PSFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * RESULT( 2 ) = PSQRT17( TRANS, 1, M, N, $ NRHS, $ MEM( IPA ), $ 1, 1, DESCA, $ MEM( IPX ), 1, $ 1, DESCX, $ MEM( IPB ), $ 1, 1, DESCB, $ MEM( IPW ), $ MEM( IPW2 ) ) SRESID = RESULT( 2 ) * CALL PSCHEKPAD( ICTXT, 'PSQRT17', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSQRT17', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PSCHEKPAD( ICTXT, 'PSQRT17', $ NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) ELSE CALL PSCHEKPAD( ICTXT, 'PSQRT17', $ MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF CALL PSCHEKPAD( ICTXT, 'PSQRT17', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Call PSQRT16, B will be destroyed. * IF( TPSD ) THEN WORKSIZ = MP + IPOSTPAD ELSE WORKSIZ = NQ + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'MEMORY', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PSFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PSQRT16( TRANS, M, N, NRHS, $ MEM( IPA ), 1, 1, DESCA, $ MEM( IPX ), 1, 1, DESCX, $ MEM( IPB ), 1, 1, DESCB, $ MEM( IPW ), RESULT( 1 ) ) * CALL PSCHEKPAD( ICTXT, 'PSQRT16', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSQRT16', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PSCHEKPAD( ICTXT, 'PSQRT16', $ NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) ELSE CALL PSCHEKPAD( ICTXT, 'PSQRT16', $ MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF CALL PSCHEKPAD( ICTXT, 'PSQRT16', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * * Call PSQRT14 * IF( ( M.GE.N .AND. TPSD ) .OR. $ ( M.LT.N .AND. ( .NOT.TPSD ) ) ) THEN * IPW = IPB * IF( TPSD ) THEN * NNRHSQ = NUMROC( N+NRHS, NB, MYCOL, $ 0, NPCOL ) LTAU = NUMROC( MIN( M, N+NRHS ), NB, $ MYCOL, 0, NPCOL ) LWF = NB * ( NB + MP + NNRHSQ ) WORKSIZ = MP * NNRHSQ + LTAU + LWF + $ IPOSTPAD * ELSE * MNRHSP = NUMROC( M+NRHS, NB, MYROW, $ 0, NPROW ) LTAU = NUMROC( MIN( M+NRHS, N ), NB, $ MYROW, 0, NPROW ) LWF = NB * ( NB + MNRHSP + NQ ) WORKSIZ = MNRHSP * NQ + LTAU + LWF + $ IPOSTPAD * END IF * * Check for adequate memory for problem * size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'MEMORY', ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PSFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * * Solve underdetermined system * RESULT( 2 ) = PSQRT14( TRANS, M, N, $ NRHS, $ MEM( IPA ), 1, $ 1, DESCA, $ MEM( IPX ), $ 1, 1, DESCX, $ MEM( IPW ) ) SRESID = RESULT( 2 ) * CALL PSCHEKPAD( ICTXT, 'PSQRT14', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSQRT14', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSQRT14', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Print information about the tests that * did not pass the threshold. * PASSED = 'PASSED' DO 20 II = 1, 2 IF( ( RESULT( II ).GE.THRESH ) .AND. $ ( RESULT( II )-RESULT( II ).EQ.0.0E+0 $ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 )TRANS, $ M, N, NRHS, NB, ITYPE, II, $ RESULT( II ) KFAIL = KFAIL + 1 PASSED = 'FAILED' ELSE KPASS = KPASS + 1 END IF 20 CONTINUE * ELSE * * By-pass the solve check * KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock * timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN ADDFAC = 1 MULFAC = 1 IF( M.GE.N ) THEN * * NOPS = SOPLA( 'SGEQRF', M, N, 0, 0, * NB ) + SOPLA( 'SORMQR', M, NRHS, N, * 0, NB ) * MULTS = N*( ( ( 23.D0 / 6.D0 )+M+N / $ 2.D0 )+ N*( M-N / 3.D0 ) ) + $ N*NRHS*( 2.D0*M+2.D0-N ) ADDS = N*( ( 5.D0 / 6.D0 )+N* $ ( 1.D0 / 2.D0+( M-N / 3.D0 ) ) ) $ + N*NRHS*( 2.D0*M+1.D0-N ) ELSE * * NOPS = SOPLA( 'SGELQF', M, N, 0, 0, * NB ) + SOPLA( 'SORMLQ', M, * NRHS, N, 0, NB ) * MULTS = M*( ( ( 29.D0 / 6.D0 )+2.D0*N-M $ / 2.D0 )+M*( N-M / 3.D0 ) ) $ + N*NRHS*( 2.D0*M+2.D0-N ) ADDS = M*( ( 5.D0 / 6.D0 )+M / 2.D0+M* $ ( N-M / 3.D0 ) ) $ + N*NRHS*( 2.D0*M+1.D0-N ) END IF NOPS = ADDFAC*ADDS + MULFAC*MULTS * * Calculate total megaflops, for WALL and * CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) $ 'WALL', TRANS, M, N, NB, NRHS, $ NBRHS, NPROW, NPCOL, WTIME( 1 ), $ TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) $ 'CPU ', TRANS, M, N, NB, NRHS, $ NBRHS, NPROW, NPCOL, CTIME( 1 ), $ TMFLOPS, PASSED END IF 30 CONTINUE 40 CONTINUE 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 90 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'Time TRANS M N NB NRHS NBRHS P Q ', $ 'LS Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- ------ ------ --- ----- ----- ----- ----- ', $ '--------- -------- ------' ) 9993 FORMAT( A4, 3X, A1, 3X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, $ I5, 1X, I5, 1X, F9.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( ' TRANS=''', A1, ''', M=', I5, ', N=', I5, ', NRHS=', I4, $ ', NB=', I4, ', type', I2, ', test(', I2, ')=', G12.5 ) * STOP * * End of PSLSDRIVER * END scalapack-2.0.2/TESTING/LIN/pslsinfo.f000644 000766 000024 00000040772 10363532303 017464 0ustar00juliestaff000000 000000 SUBROUTINE PSLSINFO( SUMMRY, NOUT, NMAT, MVAL, LDMVAL, $ NVAL, LDNVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDMVAL, LDNBRVAL, LDNBVAL, LDNRVAL, $ LDNVAL, LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, $ NNBR, NNR, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDMVAL ), NBRVAL( LDNBRVAL ), $ NBVAL( LDNBVAL ), NRVAL( LDNRVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), $ QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PSLSINFO gets needed startup information for LS solve and * transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * MVAL (global output) INTEGER array, dimension (LDNVAL) * The values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M, LDNVAL > = NMAT. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (workspace) INTEGER array of dimension >= * MAX(5,LDMVAL+LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL) * Used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='LS.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 ELSE IF( NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'M', LDMVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 5, 1, WORK, 5 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK min ||Ax-b|| by QR factorizations.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision least-square solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - QR|| / '// $ '(||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used. If CPU and WALL time' WRITE( NOUT, FMT = 9999 ) $ ' are the same, only one line '// $ 'is printed, and the label is ''BOTH''.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 ) $ 'QR time : Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 5, 1, WORK, 5, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) * I = 2*NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSLSINFO * END scalapack-2.0.2/TESTING/LIN/psludriver.f000644 000766 000024 00000125042 10363532303 020020 0ustar00juliestaff000000 000000 PROGRAM PSLUDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======== * * PSLUDRIVER is the main test program for the REAL * SCALAPACK LU routines. This test driver performs an LU factorization * and solve. If the input matrix is non-square, only the factorization * is performed. Condition estimation and iterative refinement are * optionally performed. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'SCALAPACK, Version 2.0, LU factorization input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LU.out' output file name (if any) * 6 device out * 1 number of problems sizes * 31 201 values of M * 31 201 values of N * 1 number of NB's * 2 values of NB * 1 number of NRHS's * 1 values of NRHS * 1 number of NBRHS's * 1 values of NBRHS * 1 number of process grids (ordered pairs of P & Q) * 2 1 4 2 3 8 values of P * 2 4 1 3 2 1 values of Q * 1.0 threshold * T (T or F) Test Cond. Est. and Iter. Ref. Routines * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INTGSZ, MEMSIZ, NTESTS, REALSZ, TOTMEM REAL PADVAL, ZERO PARAMETER ( INTGSZ = 4, REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, EST CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, IMIDPAD, $ INFO, IPA, IPA0, IPB, IPB0, IPBERR, IPFERR, $ IPOSTPAD, IPPIV, IPREPAD, IPW, IPW2, J, K, $ KFAIL, KK, KPASS, KSKIP, KTESTS, LCM, LCMQ, $ LIPIV, LIWORK, LWORK, LW2, M, MAXMN, $ MINMN, MP, MYCOL, MYRHS, MYROW, N, NB, NBRHS, $ NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, NPCOL, $ NPROCS, NPROW, NQ, NRHS, WORKSIZ REAL ANORM, ANORM1, FRESID, RCOND, SRESID, SRESID2, $ THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), IERR( 1 ), $ MVAL( NTESTS ), NBRVAL( NTESTS ), $ NBVAL( NTESTS ), NRVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), $ QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSCHEKPAD, $ PSFILLPAD, PSGECON, PSGERFS, $ PSGETRF, PSGETRRV, PSGETRS, $ PSLAFCHK, PSLASCHK, PSLUINFO, $ PSMATGEN, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ICEIL, ILCM, NUMROC REAL PSLANGE EXTERNAL ICEIL, ILCM, NUMROC, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 CALL PSLUINFO( OUTFILE, NOUT, NMAT, MVAL, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NNR, NRVAL, NTESTS, NNBR, NBRVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, THRESH, $ EST, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 50 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 50 * DO 40 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 40 END IF * DO 30 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IF( EST .AND. M.EQ.N ) THEN IPA0 = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPPIV = IPA0 + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD ELSE IPPIV = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD END IF LIPIV = ICEIL( INTGSZ*( MP+NB ), REALSZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * IF( CHECK ) THEN * * Calculate the amount of workspace required by the * checking routines PSLANGE, PSGETRRV, and * PSLAFCHK * WORKSIZ = MAX( 2, NQ ) * WORKSIZ = MAX( WORKSIZ, MP*DESCA( NB_ )+ $ NQ*DESCA( MB_ ) ) * WORKSIZ = MAX( WORKSIZ, MP * DESCA( NB_ ) ) * WORKSIZ = WORKSIZ + IPOSTPAD * ELSE * WORKSIZ = IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate matrix A of Ax = b * CALL PSMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ MP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, LIPIV, 1, MEM( IPPIV-IPREPAD ), $ LIPIV, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANGE( 'I', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) ANORM1 = PSLANGE( '1', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) END IF * IF( EST .AND. M.EQ.N ) THEN CALL PSMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA0 ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) IF( CHECK ) $ CALL PSFILLPAD( ICTXT, MP, NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform LU factorization * CALL PSGETRF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PSGETRF INFO=', INFO KFAIL = KFAIL + 1 RCOND = ZERO GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in LU factorization * CALL PSCHEKPAD( ICTXT, 'PSGETRF', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRF', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * IF( M.NE.N ) THEN * * For non-square matrices, factorization only * NRHS = 0 NBRHS = 0 * IF( CHECK ) THEN * * Compute FRESID = ||A - P*L*U|| / (||A|| * N * eps) * CALL PSGETRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ) ) CALL PSLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGETRRV', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRRV', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( (FRESID-FRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * MAXMN = MAX( M, N ) MINMN = MIN( M, N ) * * M N^2 - 1/3 N^3 - 1/2 N^2 flops for LU * factorization when M >= N * NOPS = DBLE( MAXMN )*( DBLE( MINMN )**2 ) - $ (1.0D+0 / 3.0D+0)*( DBLE( MINMN )**3 ) - $ (1.0D+0 / 2.0D+0)*( DBLE( MINMN )**2 ) * * Calculate total megaflops -- factorization only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * WTIME( 2 ) = 0.0D+0 IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, NB, $ NRHS, NBRHS, NPROW, NPCOL, WTIME( 1 ), $ WTIME( 2 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * CTIME( 2 ) = 0.0D+0 IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, NB, $ NRHS, NBRHS, NPROW, NPCOL, CTIME( 1 ), $ CTIME( 2 ), TMFLOPS, PASSED END IF * ELSE * * If M = N * IF( EST ) THEN * * Calculate workspace required for PSGECON * LWORK = MAX( 1, 2*NP ) + MAX( 1, 2*NQ ) + $ MAX( 2, DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), $ NQ + DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LIWORK = MAX( 1, NP ) LW2 = ICEIL( LIWORK*INTGSZ, REALSZ ) + IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'cond est', $ ( IPW2+LW2 )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Compute condition number of the matrix * CALL PSGECON( '1', N, MEM( IPA ), 1, 1, DESCA, $ ANORM1, RCOND, MEM( IPW ), LWORK, $ MEM( IPW2 ), LIWORK, INFO ) * IF( CHECK ) THEN CALL PSCHEKPAD( ICTXT, 'PSGECON', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGECON', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGECON', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 10 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB, N, NRHS, NB, NBRHS, 0, 0, $ ICTXT, MAX( 1, NP )+IMIDPAD, $ IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * move IPW to allow room for RHS * MYRHS = NUMROC( DESCB( N_ ), DESCB( NB_ ), $ MYCOL, DESCB( CSRC_ ), NPCOL ) IPB = IPW * IF( EST ) THEN IPB0 = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD IPFERR = IPB0 + DESCB( LLD_ )*MYRHS + $ IPOSTPAD + IPREPAD IPBERR = MYRHS + IPFERR + IPOSTPAD + IPREPAD IPW = MYRHS + IPBERR + IPOSTPAD + IPREPAD ELSE IPW = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD END IF * * Set worksiz: routines requiring most workspace * is PSLASCHK * IF( CHECK ) THEN LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ NQ * NBRHS + NP * NBRHS + $ MAX( MAX( NQ*NB, 2*NBRHS ), $ NBRHS * NUMROC( NUMROC(N,NB,0,0,NPCOL),NB, $ 0,0,LCMQ ) ) ) WORKSIZ = IPOSTPAD + WORKSIZ ELSE WORKSIZ = IPOSTPAD END IF * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate RHS * CALL PSMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) * IF( CHECK ) $ CALL PSFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * IF( EST ) THEN CALL PSMATGEN( ICTXT, 'No', 'No', $ DESCB( M_ ), DESCB( N_ ), $ DESCB( MB_ ), DESCB( NB_ ), $ MEM( IPB0 ), DESCB( LLD_ ), $ DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, $ 0, MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 2 ) * * Solve linear sytem via LU factorization * CALL PSGETRS( 'No', N, NRHS, MEM( IPA ), 1, 1, $ DESCA, MEM( IPPIV ), MEM( IPB ), $ 1, 1, DESCB, INFO ) * CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGETRS', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRS', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PSLASCHK( 'No', 'N', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, IBSEED, $ ANORM, SRESID, MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * The second test is a NaN trap * IF( SRESID.LE.THRESH .AND. $ ( SRESID-SRESID ).EQ.0.0E+0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' END IF * IF( EST ) THEN * * Calculate workspace required for PSGERFS * LWORK = MAX( 1, 3*NP ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LIWORK = MAX( 1, NP ) LW2 = ICEIL( LIWORK*INTGSZ, REALSZ ) + $ IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'iter ref', ( IPW2+LW2 )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Use iterative refinement to improve the * computed solution * CALL PSGERFS( 'No', N, NRHS, MEM( IPA0 ), 1, $ 1, DESCA, MEM( IPA ), 1, 1, $ DESCA, MEM( IPPIV ), $ MEM( IPB0 ), 1, 1, DESCB, $ MEM( IPB ), 1, 1, DESCB, $ MEM( IPFERR ), MEM( IPBERR ), $ MEM( IPW ), LWORK, MEM( IPW2 ), $ LIWORK, INFO ) * IF( CHECK ) THEN CALL PSCHEKPAD( ICTXT, 'PSGERFS', NP, $ NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGERFS', NP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGERFS', LIPIV, $ 1, MEM( IPPIV-IPREPAD ), $ LIPIV, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGERFS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGERFS', NP, $ MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGERFS', 1, $ MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGERFS', 1, $ MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGERFS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGERFS', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PSLASCHK( 'No', 'N', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, $ IBSEED, ANORM, SRESID2, $ MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID2.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID2 * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * * Gather max. of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 2/3 N^3 - 1/2 N^2 flops for LU factorization * NOPS = (2.0D+0/3.0D+0)*( DBLE(N)**3 ) - $ (1.0D+0/2.0D+0)*( DBLE(N)**2 ) * * nrhs * 2 N^2 flops for LU solve. * NOPS = NOPS + 2.0D+0*(DBLE(N)**2)*DBLE(NRHS) * * Calculate total megaflops -- factorization * and solve -- for WALL and CPU time, and print * output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) $ THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * * Print WALL time if supported * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ PASSED * * Print CPU time if supported * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) $ THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ PASSED END IF 10 CONTINUE 20 CONTINUE * IF( CHECK.AND.( SRESID.GT.THRESH ) ) THEN * * Compute fresid = ||A - P*L*U|| / (||A|| * N * eps) * CALL PSGETRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ) ) CALL PSLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGETRRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRRV', LIPIV, $ 1, MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID END IF END IF 30 CONTINUE 40 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 50 CONTINUE * * Print ending messages and close output file * 60 CONTINUE IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N NB NRHS NBRHS P Q LU Time ', $ 'Sol Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- ----- --- ---- ----- ---- ---- -------- ', $ '-------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I5, 1X, I3, 1X, I5, 1X, I4, 1X, I4, 1X, $ I4, 1X, F8.2, 1X, F8.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - P*L*U|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PSLUDRIVER * END scalapack-2.0.2/TESTING/LIN/psluinfo.f000644 000766 000024 00000041076 10363532303 017464 0ustar00juliestaff000000 000000 SUBROUTINE PSLUINFO( SUMMRY, NOUT, NMAT, MVAL, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NNR, NRVAL, LDNRVAL, NNBR, $ NBRVAL, LDNBRVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, EST, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL EST CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDNVAL ), NBRVAL( LDNBRVAL ), $ NBVAL( LDNBVAL ), NRVAL( LDNRVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ WORK( * ) * .. * * Purpose * ======= * * PSLUINFO gets needed startup information for LU factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for M and N. * * MVAL (global output) INTEGER array, dimension (LDNVAL) * The values of M (number of rows in matrix) to run the code * with. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * M and N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * EST (global output) LOGICAL * Flag indicating if condition estimation and iterative * refinement routines are to be exercised. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 6, 2*LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL ) * Used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='LU.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Read the flag that indicates whether to test the condition * estimation and iterative refinement routines. * READ( NIN, FMT = * ) EST * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS IF( EST ) THEN WORK( 6 ) = 1 ELSE WORK( 6 ) = 0 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 6, 1, WORK, 6 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Ax=b by LU factorization.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision LU factorization '// $ 'and solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| / '// $ '(||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'LU time : Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 6, 1, WORK, 6, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) IF( WORK( 6 ).EQ.1 ) THEN EST = .TRUE. ELSE EST = .FALSE. END IF * I = 2*NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSLUINFO * END scalapack-2.0.2/TESTING/LIN/psmatgen.f000644 000766 000024 00000043110 10363532303 017432 0ustar00juliestaff000000 000000 SUBROUTINE PSMATGEN( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * PSMATGEN : Parallel Real Single precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * otherwise a random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) REAL, pointer into the local memory * to an array of dimension ( LDA, * ) containing the local * pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSRAND EXTERNAL ICEIL, NUMROC, LSAME, PSRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) * INFO = 0 IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSMATGEN', INFO ) RETURN END IF * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = ONE - TWO*PSRAND(0) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = ONE - TWO*PSRAND(0) 30 CONTINUE A(IK,JK) = ONE - TWO*PSRAND(0) DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = ONE - TWO*PSRAND(0) A(IK,JK+J) = A(IK+J,JK) 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 A(IK,JK) = ONE - TWO*PSRAND(0) JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 A(IK,JK) = ONE - TWO*PSRAND(0) JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSE * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = ONE - TWO*PSRAND(0) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 A(IK,JK+J) = ABS(A(IK,JK+J)) + MAXMN IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PSMATGEN * END scalapack-2.0.2/TESTING/LIN/pspbdriver.f000644 000766 000024 00000073715 10363532303 020012 0ustar00juliestaff000000 000000 PROGRAM PSPBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PSPBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by SPB. This test driver performs an * A = L*L**T factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER MEMSIZ, NTESTS, REALSZ REAL PADVAL PARAMETER ( REALSZ = 4, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0, ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BW, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, I, $ IAM, IASEED, IBSEED, ICTXT, ICTXTB, IERR_TEMP, $ IMIDPAD, INFO, IPA, IPB, IPOSTPAD, IPREPAD, $ IPW, IPW_SIZE, IPW_SOLVE, IPW_SOLVE_SIZE, $ IP_DRIVER_W, IP_FILLIN, J, K, KFAIL, KPASS, $ KSKIP, KTESTS, MYCOL, MYRHS_SIZE, MYROW, N, NB, $ NBW, NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, $ NPCOL, NPROCS, NPROCS_REAL, NPROW, NQ, NRHS, $ N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWVAL( NTESTS ), DESCA( 7 ), DESCA2D( DLEN_ ), $ DESCB( 7 ), DESCB2D( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSBMATGEN, $ PSCHEKPAD, PSFILLPAD, PSMATGEN, PSPBINFO, $ PSPBLASCHK, PSPBTRF, PSPBTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PSLANGE EXTERNAL LSAME, NUMROC, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PSPBINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NBW, $ BWVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, $ NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BW = BWVAL( BW_NUM ) IF( BW.LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Band', 'bw', BW IERR( 1 ) = 1 END IF * IF( BW.GT.N-1 ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*BW-1)/NPCOL + 1 ) $ + BW NB = MAX( NB, 2*BW ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*BW, N ) ) THEN IERR( 1 ) = 1 ENDIF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (BW+1), (BW+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((BW+1)+10) IMIDPAD = 10 IPOSTPAD = ((BW+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (BW+1), N, $ (BW+1), NB, 0, 0, $ ICTXT,((BW+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((BW+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (NB+2*BW)*BW * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = BW*BW * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((BW+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PSLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PSPBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(BW*(BW+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PSBMATGEN( ICTXT, UPLO, 'B', BW, BW, N, (BW+1), NB, $ MEM( IPA ), ((BW+1)+10), 0, 0, IASEED, $ MYROW, MYCOL, NPROW, NPCOL ) * CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((BW+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PSLANGE( '1', (BW+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((BW+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PSPBTRF( UPLO, N, BW, MEM( IPA ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PSPBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PSCHEKPAD( ICTXT, 'PSPBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((BW+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (BW*NRHS) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PSMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PSFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PSPBTRS( UPLO, N, BW, NRHS, MEM( IPA ), 1, $ DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PSPBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSPBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PSPBLASCHK( 'S', UPLO, N, BW, BW, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * NOPS = NOPS + DBLE(BW)*( -2.D0 / 3.D0+DBLE(BW)* $ ( -1.D0+DBLE(BW)*( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( 1.D0+DBLE(BW)*( 3.D0 / $ 2.D0+DBLE(BW)*( 1.D0 / 2.D0 ) ) ) NOPS = NOPS + DBLE(BW)*( -1.D0 / 6.D0+DBLE(BW) $ *( -1.D0 /2.D0+DBLE(BW) $ *( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( DBLE(BW) / $ 2.D0*( 1.D0+DBLE(BW) ) ) * NOPS = NOPS + $ DBLE(NRHS)*( ( 2*DBLE(N)-DBLE(BW) )* $ ( DBLE(BW)+1.D0 ) )+ DBLE(NRHS)* $ ( DBLE(BW)*( 2*DBLE(N)- $ ( DBLE(BW)+1.D0 ) ) ) * * * Second calc to represent actual hardware speed * * NB bw^2 flops for LLt factorization in 1st proc * NOPS2 = ( (DBLE(N_FIRST))* DBLE(BW)**2 ) * IF ( NPROCS_REAL .GT. 1) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in last processor * NOPS2 = NOPS2 + $ 4*( (DBLE(N_LAST)*DBLE(BW)**2) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 4*( (DBLE(NB)*DBLE(BW)**2) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ ( NPROCS_REAL-1 ) * ( BW*BW*BW/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ ( NPROCS_REAL-2 ) * ( 2 * BW*BW*BW ) ENDIF * * * nrhs * 4 n_first*bw flops for LLt solve in proc 1. * NOPS2 = NOPS2 + $ ( 4.0D+0*(DBLE(N_FIRST)*DBLE(BW))*DBLE(NRHS) ) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*4 n_last*bw flops for LLt solve in last. * NOPS2 = NOPS2 + $ 2*( 4.0D+0*(DBLE(N_LAST)*DBLE(BW))*DBLE(NRHS) ) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2 * nrhs * 4 NB*bw flops for LLt solve in others. * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( 4.0D+0*(DBLE(NB)*DBLE(BW))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1 ) * ( BW*BW ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * ( 3 * BW*BW ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UL N BW NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- ---- ----- -- ---- -------- ', $ '-------- ------ ------ ------' ) 9993 FORMAT( A4, 2X, A1, 1X, I6, 1X, I3, 1X, I4, 1X, $ I5, 1X, I2, 1X, $ I4, 1X, F8.3, F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PSPBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pspbinfo.f000644 000766 000024 00000044724 10363532303 017450 0ustar00juliestaff000000 000000 SUBROUTINE PSPBINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NBW, $ BWVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PSPBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of subdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( UPLO, 'L' ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data represents ''Upper'// $ ''' or ''Lower'' triangular portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'TRANS : Whether solve is to be done with'// $ ' ''Transpose'' of matrix A (T,C) or not (N).' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bw : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bw ', ( BWVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSPBINFO * END scalapack-2.0.2/TESTING/LIN/pspblaschk.f000644 000766 000024 00000025420 10363532303 017752 0ustar00juliestaff000000 000000 SUBROUTINE PSPBLASCHK( SYMM, UPLO, N, BWL, BWU, NRHS, X, IX, JX, $ DESCX, IASEED, A, IA, JA, DESCA, IBSEED, $ ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) REAL A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PSPBLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'S', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) REAL pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) REAL array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,max(bw*(bw+2),NB))+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,max(bw*(bw+2),NB))+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER BW, INFO, IPPRODUCT, WORK_MIN REAL DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PBSTRAN, PSMATGEN, $ SGAMX2D, SGEBR2D, SGEBS2D, SGEMM, $ SGERV2D, SGESD2D, SGSUM2D, SLASET * .. * .. External Functions .. INTEGER ISAMAX, NUMROC REAL PSLAMCH EXTERNAL ISAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'S' ) ) THEN BW = BWL WORK_MIN = MAX(5,MAX(BW*(BW+2),NB))+2*NB ELSE BW = MAX(BWL, BWU) WORK_MIN = MAX(5,MAX(BW*(BW+2),NB))+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PSBLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (BW+1), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'S' )) THEN CALL PSBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PSBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PSPBDCMV( BW+1, BW, UPLO, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), (BW+2)*BW, INFO ) * * * Regenerate column of B * CALL PSMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PSAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PSNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PSNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PSBLASCHK * END scalapack-2.0.2/TESTING/LIN/pspbmv1.f000644 000766 000024 00000072703 10607174520 017222 0ustar00juliestaff000000 000000 SUBROUTINE PSPBDCMV( LDBW, BW, UPLO, N, A, JA, DESCA, NRHS, B, IB, $ DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSPBTRF and this is stored in AF. If a linear system * is to be solved using PSPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, $ FIRST_PROC, I, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, J, JA_NEW, LLDA, LLDB, MYCOL, $ MYROW, MY_NUM_COLS, NB, NP, NPCOL, NPROW, $ NP_SAVE, ODD_SIZE, OFST, PART_OFFSET, $ PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Pre-calculate bw^2 * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -14 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 7*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 7*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PSPBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, $ 'PSPBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1005 PARAM_CHECK( 15, 2 ) = 1004 PARAM_CHECK( 14, 2 ) = 1003 PARAM_CHECK( 13, 2 ) = 1002 PARAM_CHECK( 12, 2 ) = 1001 PARAM_CHECK( 11, 2 ) = 9 PARAM_CHECK( 10, 2 ) = 705 PARAM_CHECK( 9, 2 ) = 704 PARAM_CHECK( 8, 2 ) = 703 PARAM_CHECK( 7, 2 ) = 701 PARAM_CHECK( 6, 2 ) = 6 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSPBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = ZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (BW+2)*BW WORK( I ) = ZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( UPLO, 'L' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL SLATCPY( 'U', BW, BW, $ A( LLDA*( NUMROC_SIZE-BW )+1+BW ), $ LLDA-1, WORK( 1 ), BW ) * * Send the triangle to neighboring processor to right * CALL STRSD2D(ICTXT, 'L', 'N', $ BW, BW, $ WORK( 1 ), $ BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL SSBMV( 'L', NUMROC_SIZE, BW, ONE, A( OFST+1 ), LLDA, $ B(PART_OFFSET+1), 1, ZERO, X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL SCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( BW*BW+1+BW-DL_N_N ), 1 ) * CALL STRMV( 'U', 'N', 'N', BW, $ A( LLDA*( NUMROC_SIZE-BW )+1+BW ), LLDA-1, $ WORK( BW*BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( BW*BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL SGESD2D( ICTXT, BW, 1, $ WORK( BW*BW+1 ), BW, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, BW*( BW+2 ) WORK( I ) = ZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL SCOPY( DL_P_M, B( 1 ), 1, $ WORK( BW*BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL STRRV2D(ICTXT, 'L', 'N', $ BW, BW, $ WORK( 1 ), BW, MYROW, MYCOL-1 ) * CALL STRMV( $ 'L', $ 'N', $ 'N', BW, $ WORK( 1 ), BW, $ WORK( BW*BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DL_P_M .GT. DL_P_N ) THEN DO 30 I=1, DL_P_M-DL_P_N WORK( BW*BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL SGESD2D( ICTXT, BW, 1, WORK(BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL SGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL SAXPY( BW, ONE, $ WORK( BW*BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL SGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL SAXPY( BW, ONE, $ WORK( BW*BW+1 ), 1, $ X( NUMROC_SIZE-BW+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( UPLO, 'U' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL SLATCPY( 'L', BW, BW, A( OFST+1 ), $ LLDA-1, WORK( 1 ), BW ) * * Send the triangle to neighboring processor to left * CALL STRSD2D(ICTXT, 'U', 'N', $ BW, BW, $ WORK( 1 ), $ BW, MYROW, MYCOL-1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL SSBMV( 'U', NUMROC_SIZE, BW, ONE, A( OFST+1 ), LLDA, $ B(PART_OFFSET+1), 1, ZERO, X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL SCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( BW*BW+1+BW-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL STRRV2D(ICTXT, 'U', 'N', $ BW, BW, $ WORK( 1 ), BW, MYROW, MYCOL+1 ) * CALL STRMV( 'U', 'N', 'N', BW, $ WORK( 1 ), BW, $ WORK( BW*BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( BW*BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL SGESD2D( ICTXT, BW, 1, $ WORK( BW*BW+1 ), BW, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, BW*( BW+2 ) WORK( I ) = ZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL SCOPY( DL_P_M, B( 1 ), 1, $ WORK( BW*BW+1 ), 1) * CALL STRMV( $ 'L', $ 'N', $ 'N', BW, $ A( 1 ), LLDA-1, $ WORK( BW*BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DL_P_M .GT. DL_P_N ) THEN DO 60 I=1, DL_P_M-DL_P_N WORK( BW*BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL SGESD2D( ICTXT, BW, 1, WORK(BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL SGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL SAXPY( BW, ONE, $ WORK( BW*BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL SGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL SAXPY( BW, ONE, $ WORK( BW*BW+1 ), 1, $ X( NUMROC_SIZE-BW+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PSBsBMV1 * END scalapack-2.0.2/TESTING/LIN/pspotrrv.f000644 000766 000024 00000027335 10363532303 017526 0ustar00juliestaff000000 000000 SUBROUTINE PSPOTRRV( UPLO, N, A, IA, JA, DESCA, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSPOTRRV recomputes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from L or U * computed by PSPOTRF. The routine performs the Cholesky factorization * in reverse. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric distributed matrix sub( A ) is stored: * stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the factors L or U of the * distributed matrix sub( A ) from the Cholesky factorization. * On exit, the original distributed matrix sub( A ) is * restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK >= MB_A*NB_A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IL, J, JB, JL, JN, MYCOL, $ MYROW, NPCOL, NPROW * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PSLACPY, PSLASET, $ PSSYRK, PSTRMM, PB_TOPGET, PB_TOPSET * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL ICEIL, INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * UPPER = LSAME( UPLO, 'U' ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JL = MAX( ( ( JA+N-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) IL = MAX( ( ( IA+N-2 ) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) IAROW = INDXG2P( IL, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) * * Define array descriptor for working array WORK * CALL DESCSET( DESCW, DESCA( MB_ ), DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * IF ( UPPER ) THEN * * Compute A from the Cholesky factor U : A = U'*U. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'S-ring' ) * DO 10 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + U'*U * CALL PSSYRK( 'Upper', 'Transpose', JA+N-J-JB, JB, ONE, A, IL, $ J+JB, DESCA, ONE, A, IL+JB, J+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PSLACPY( 'All', JB, JB, A, IL, J, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict lower triangular part of diagonal block, to make * it U1. * CALL PSLASET( 'Lower', JB-1, JB, ZERO, ZERO, A, IL+1, J, $ DESCA ) * * Update the row panel U with the triangular matrix * CALL PSTRMM( 'Left', 'Upper', 'Transpose', 'Non-Unit', JB, $ N-J+JA, ONE, WORK, 1, 1, DESCW, A, IL, J, $ DESCA ) * * Restore the strict lower triangular part of diagonal block. * CALL PSLACPY( 'Lower', JB-1, JB, WORK, 2, 1, DESCW, A, $ IL+1, J, DESCA ) * IL = IL - DESCA( MB_ ) DESCW( RSRC_ ) = MOD( DESCW( RSRC_ ) + NPROW - 1, NPROW ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + U'*U * CALL PSSYRK( 'Upper', 'Transpose', N-JB, JB, ONE, A, IA, JA+JB, $ DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PSLACPY( 'All', JB, JB, A, IA, JA, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict lower triangular part of diagonal block, to make * it U1. * CALL PSLASET( 'Lower', JB-1, JB, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Update the row panel U with the triangular matrix * CALL PSTRMM( 'Left', 'Upper', 'Transpose', 'Non-Unit', JB, $ N, ONE, WORK, 1, 1, DESCW, A, IA, JA, DESCA ) * * Restore the strict lower triangular part of diagonal block. * CALL PSLACPY( 'Lower', JB-1, JB, WORK, 2, 1, DESCW, A, IA+1, $ JA, DESCA ) * ELSE * * Compute A from the Cholesky factor L : A = L*L'. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * DO 20 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + L*L' * CALL PSSYRK( 'Lower', 'No transpose', IA+N-IL-JB, JB, ONE, A, $ IL+JB, J, DESCA, ONE, A, IL+JB, J+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PSLACPY( 'All', JB, JB, A, IL, J, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict upper triangular part of diagonal block, to make * it L1. * CALL PSLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IL, J+1, $ DESCA ) * * Update the column panel L with the triangular matrix * CALL PSTRMM( 'Right', 'Lower', 'Transpose', 'Non-Unit', $ IA+N-IL, JB, ONE, WORK, 1, 1, DESCW, A, IL, $ J, DESCA ) * * Restore the strict upper triangular part of diagonal block. * CALL PSLACPY( 'Upper', JB, JB-1, WORK, 1, 2, DESCW, A, $ IL, J+1, DESCA ) * IL = IL - DESCA( MB_ ) DESCW( RSRC_ ) = MOD( DESCW( RSRC_ ) + NPROW - 1, NPROW ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 20 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + L*L' * CALL PSSYRK( 'Lower', 'No transpose', N-JB, JB, ONE, A, $ IA+JB, JA, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PSLACPY( 'All', JB, JB, A, IA, JA, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict upper triangular part of diagonal block, to make * it L1. * CALL PSLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA, JA+1, $ DESCA ) * * Update the column panel L with the triangular matrix * CALL PSTRMM( 'Right', 'Lower', 'Transpose', 'Non-Unit', N, JB, $ ONE, WORK, 1, 1, DESCW, A, IA, JA, DESCA ) * * Restore the strict upper triangular part of diagonal block. * CALL PSLACPY( 'Upper', JB, JB-1, WORK, 1, 2, DESCW, A, IA, $ JA+1, DESCA ) * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PSPOTRRV * END scalapack-2.0.2/TESTING/LIN/psptdriver.f000644 000766 000024 00000074714 10363532303 020034 0ustar00juliestaff000000 000000 PROGRAM PSPTDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PSPTDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by SPT. This test driver performs an * A = L*L**T factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * REAL ZERO INTEGER MEMSIZ, NTESTS, REALSZ REAL PADVAL PARAMETER ( REALSZ = 4, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0, ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BW, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, I, $ IAM, IASEED, IBSEED, ICTXT, ICTXTB, IERR_TEMP, $ IMIDPAD, INFO, INT_TEMP, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL ANORM, SRESID, THRESH DOUBLE PRECISION NOPS, NOPS2, TMFLOPS, TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWVAL( NTESTS ), DESCA( 7 ), DESCA2D( DLEN_ ), $ DESCB( 7 ), DESCB2D( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSBMATGEN, $ PSCHEKPAD, PSFILLPAD, PSMATGEN, PSPTINFO, $ PSPTLASCHK, PSPTTRF, PSPTTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME REAL PSLANGE EXTERNAL LSAME, NUMROC, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PSPTINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NBW, $ BWVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, $ NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BW = 1 IF( BW.LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Band', 'bw', BW IERR( 1 ) = 1 END IF * IF( BW.GT.N-1 ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*INT_ONE-1)/NPCOL + 1 ) $ + INT_ONE NB = MAX( NB, 2*INT_ONE ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*INT_ONE, N ) ) THEN IERR( 1 ) = 1 ENDIF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (2), (2), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((2)+10) IMIDPAD = 10 IPOSTPAD = ((2)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, N, (2), $ NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((2)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + (NB+10)*(2) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (12*NPCOL + 3*NB) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 8*NPCOL * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((2)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PSLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PSPTLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,NB)+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PSBMATGEN( ICTXT, UPLO, 'T', BW, BW, N, (2), NB, $ MEM( IPA ), NB+10, 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) CALL PSFILLPAD( ICTXT, NQ, NP, MEM( IPA-IPREPAD ), $ NB+10, IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PSLANGE( 'I', N, $ (2), MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', NQ, NP, $ MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( LSAME( UPLO, 'L' ) ) THEN INT_TEMP = 0 ELSE INT_TEMP = DESCA2D( LLD_ ) ENDIF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PSPTTRF( N, MEM( IPA+INT_TEMP ), $ MEM( IPA+1*( NB+10-INT_TEMP ) ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PSPTTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PSCHEKPAD( ICTXT, 'PSPTTRF', NQ, $ NP, MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (10+2*MIN(100,NRHS))*NPCOL+4*NRHS * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PSMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PSFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PSPTTRS( N, NRHS, MEM( IPA+INT_TEMP ), $ MEM( IPA+1*( NB+10-INT_TEMP ) ), 1, $ DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PSPTTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSPTTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * * Reset descriptor describing A to 1-by-P grid for * use in banded utility routines * CALL DESCINIT( DESCA2D, (2), N, $ (2), NB, 0, 0, $ ICTXT, (2), IERR( 1 ) ) CALL PSPTLASCHK( 'S', UPLO, N, BW, BW, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * NOPS = NOPS + DBLE(BW)*( -2.D0 / 3.D0+DBLE(BW)* $ ( -1.D0+DBLE(BW)*( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( 1.D0+DBLE(BW)*( 3.D0 / $ 2.D0+DBLE(BW)*( 1.D0 / 2.D0 ) ) ) NOPS = NOPS + DBLE(BW)*( -1.D0 / 6.D0+DBLE(BW) $ *( -1.D0 /2.D0+DBLE(BW) $ *( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( DBLE(BW) / $ 2.D0*( 1.D0+DBLE(BW) ) ) * NOPS = NOPS + $ DBLE(NRHS)*( ( 2*DBLE(N)-DBLE(BW) )* $ ( DBLE(BW)+1.D0 ) )+ DBLE(NRHS)* $ ( DBLE(BW)*( 2*DBLE(N)- $ ( DBLE(BW)+1.D0 ) ) ) * * * Second calc to represent actual hardware speed * * NB bw^2 flops for LLt factorization in 1st proc * NOPS2 = ( (DBLE(N_FIRST))* DBLE(BW)**2 ) * IF ( NPROCS_REAL .GT. 1) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in last processor * NOPS2 = NOPS2 + $ 4*( (DBLE(N_LAST)*DBLE(BW)**2) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 4*( (DBLE(NB)*DBLE(BW)**2) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ ( NPROCS_REAL-1 ) * ( BW*BW*BW/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ ( NPROCS_REAL-2 ) * ( 2 * BW*BW*BW ) ENDIF * * * nrhs * 4 n_first*bw flops for LLt solve in proc 1. * NOPS2 = NOPS2 + $ ( 4.0D+0*(DBLE(N_FIRST)*DBLE(BW))*DBLE(NRHS) ) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*4 n_last*bw flops for LLt solve in last. * NOPS2 = NOPS2 + $ 2*( 4.0D+0*(DBLE(N_LAST)*DBLE(BW))*DBLE(NRHS) ) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2 * nrhs * 4 NB*bw flops for LLt solve in others. * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( 4.0D+0*(DBLE(NB)*DBLE(BW))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1 ) * ( BW*BW ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * ( 3 * BW*BW ) ENDIF * * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UL N BW NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- ---- ----- -- ---- -------- ', $ '-------- ------ ------ ------' ) 9993 FORMAT( A4, 2X, A1, 1X, I6, 1X, I3, 1X, I4, 1X, $ I5, 1X, I2, 1X, $ I4, 1X, F8.3, F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PSPTTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/psptinfo.f000644 000766 000024 00000044745 10363532303 017475 0ustar00juliestaff000000 000000 SUBROUTINE PSPTINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NBW, $ BWVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PSPTINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of subdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW NBW = 1 IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( UPLO, 'L' ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data represents ''Upper'// $ ''' or ''Lower'' triangular portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'TRANS : Whether solve is to be done with'// $ ' ''Transpose'' of matrix A (T,C) or not (N).' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bw : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bw ', ( BWVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSPTINFO * END scalapack-2.0.2/TESTING/LIN/psptlaschk.f000644 000766 000024 00000027356 10363532303 020006 0ustar00juliestaff000000 000000 SUBROUTINE PSPTLASCHK( SYMM, UPLO, N, BWL, BWU, NRHS, X, IX, JX, $ DESCX, IASEED, A, IA, JA, DESCA, IBSEED, $ ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ REAL ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) REAL A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PSPTLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'S', sub( A ) is a symmetric distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'S', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) REAL pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) REAL * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) REAL * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) REAL array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,NB)+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,NB)+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER I, START INTEGER BW, INFO, IPPRODUCT, WORK_MIN REAL DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PBSTRAN, PSMATGEN, $ SGAMX2D, SGEBR2D, SGEBS2D, SGEMM, $ SGERV2D, SGESD2D, SGSUM2D, SLASET * .. * .. External Functions .. INTEGER ISAMAX, NUMROC REAL PSLAMCH EXTERNAL ISAMAX, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'S' ) ) THEN BW = BWL START = 1 WORK_MIN = MAX(5,NB)+2*NB ELSE BW = MAX(BWL, BWU) IF( LSAME( UPLO, 'D' )) THEN START = 1 ELSE START = 2 ENDIF WORK_MIN = MAX(5,NB)+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PSTLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) RESID = 0.0E+0 DIVISOR = ANORM * EPS * REAL( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (2), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'S' )) THEN CALL PSBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PSBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF IF( LSAME( UPLO, 'U' ) ) THEN * * * Matrix formed above has the diagonals shifted from what was * input to the tridiagonal routine. Shift them back. * * Send elements to neighboring processors * IF( MYCOL.LT.NPCOL-1 ) THEN CALL SGESD2D( ICTXT, 1, 1, $ A( START+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * * Shift local elements * DO 230 I=DESCA( NB_ )-1,0,-1 A( START+(I+1)*LDA ) = A( START+(I)*LDA ) 230 CONTINUE * * Receive elements from neighboring processors * IF( MYCOL.GT.0 ) THEN CALL SGERV2D( ICTXT, 1, 1, A( START), LDA, $ MYROW, MYCOL-1 ) ENDIF * ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PSPBDCMV( BW+1, BW, UPLO, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), (BW+2)*BW, INFO ) * * * Regenerate column of B * CALL PSMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PSAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PSNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PSNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PSTLASCHK * END scalapack-2.0.2/TESTING/LIN/psqrdriver.f000644 000766 000024 00000121643 10363532303 020025 0ustar00juliestaff000000 000000 PROGRAM PSQRDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * Purpose * ======= * * PSQRDRIVER is the main test program for the REAL * SCALAPACK QR factorization routines. This test driver performs a QR * QL, LQ, RQ, QP (QR factorization with column pivoting) or TZ * (complete orthogonal factorization) factorization and checks the * results. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 16 lines: * 'ScaLAPACK QR factorizations input file' * 'PVM machine' * 'QR.out' output file name (if any) * 6 device out * 6 number of factorizations * 'QR' 'QL' 'LQ' 'RQ' 'QP' 'TZ' factorization: QR, QL, LQ, RQ, QP, TZ * 4 number of problems sizes * 55 17 31 201 values of M * 5 71 31 201 values of N * 3 number of MB's and NB's * 4 3 5 values of MB * 4 7 3 values of NB * 7 number of process grids (ordered P & Q) * 1 2 1 4 2 3 8 values of P * 7 2 4 1 3 2 1 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INTGSZ, MEMSIZ, NTESTS, REALSZ, TOTMEM REAL PADVAL PARAMETER ( INTGSZ = 4, REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0 ) * .. * .. Local Scalars .. CHARACTER*2 FACT CHARACTER*6 PASSED CHARACTER*7 ROUT CHARACTER*8 ROUTCHK CHARACTER*80 OUTFILE LOGICAL CHECK INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPPIV, IPREPAD, IPTAU, IPW, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, L, LIPIV, LTAU, $ LWORK, M, MAXMN, MB, MINMN, MNP, MNQ, MP, $ MYCOL, MYROW, N, NB, NFACT, NGRIDS, NMAT, NNB, $ NOUT, NPCOL, NPROCS, NPROW, NQ, WORKFCT, $ WORKSIZ REAL ANORM, FRESID, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Arrays .. CHARACTER*2 FACTOR( NTESTS ) INTEGER DESCA( DLEN_ ), IERR( 1 ), MBVAL( NTESTS ), $ MVAL( NTESTS ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSCHEKPAD, $ PSFILLPAD, PSGELQF, PSGELQRV, $ PSGEQLF, PSGEQLRV, PSGEQPF, $ PSQPPIV, PSGEQRF, PSGEQRRV, $ PSGERQF, PSGERQRV, PSTZRZRV, $ PSMATGEN, PSLAFCHK, PSQRINFO, $ PSTZRZF, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, NUMROC REAL PSLANGE EXTERNAL ICEIL, LSAMEN, NUMROC, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP /4*0/ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PSQRINFO( OUTFILE, NOUT, NFACT, FACTOR, NTESTS, NMAT, MVAL, $ NTESTS, NVAL, NTESTS, NNB, MBVAL, NTESTS, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Loop over the different factorization types * DO 40 I = 1, NFACT * FACT = FACTOR( I ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) IF( LSAMEN( 2, FACT, 'QR' ) ) THEN ROUT = 'PSGEQRF' ROUTCHK = 'PSGEQRRV' WRITE( NOUT, FMT = 9986 ) $ 'QR factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN ROUT = 'PSGEQLF' ROUTCHK = 'PSGEQLRV' WRITE( NOUT, FMT = 9986 ) $ 'QL factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN ROUT = 'PSGELQF' ROUTCHK = 'PSGELQRV' WRITE( NOUT, FMT = 9986 ) $ 'LQ factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN ROUT = 'PSGERQF' ROUTCHK = 'PSGERQRV' WRITE( NOUT, FMT = 9986 ) $ 'RQ factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN ROUT = 'PSGEQPF' ROUTCHK = 'PSGEQRRV' WRITE( NOUT, FMT = 9986 ) $ 'QR factorization with column pivoting tests.' ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN ROUT = 'PSTZRZF' ROUTCHK = 'PSTZRZRV' WRITE( NOUT, FMT = 9986 ) $ 'Complete orthogonal factorization tests.' END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 J = 1, NGRIDS * NPROW = PVAL( J ) NPCOL = QVAL( J ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 K = 1, NMAT * M = MVAL( K ) N = NVAL( K ) * * Make sure matrix information is correct * IERR(1) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 L = 1, NNB * MB = MBVAL( L ) NB = NBVAL( L ) * * Make sure mb is legal * IERR( 1 ) = 0 IF( MB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MB', 'MB', MB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * MP = NUMROC( M, MB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) MNP = NUMROC( MIN( M, N ), MB, MYROW, 0, NPROW ) MNQ = NUMROC( MIN( M, N ), NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( MB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, MB, NB, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for ScaLAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IPTAU = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + IPREPAD * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN * LTAU = MNQ IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QR * factorization * LWORK = DESCA( NB_ ) * ( MP + NQ + DESCA( NB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PSLAFCHK, PSGEQRRV and * PSLANGE * WORKSIZ = LWORK + MP*DESCA( NB_ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN * LTAU = NQ IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QL * factorization * LWORK = DESCA( NB_ ) * ( MP + NQ + DESCA( NB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PSLAFCHK, PSGEQLRV and * PSLANGE * WORKSIZ = LWORK + MP*DESCA( NB_ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN * LTAU = MNP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the LQ * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PSLAFCHK, PSGELQRV and * PSLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN * LTAU = MP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QR * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PSLAFCHK, PSGERQRV and * PSLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * LTAU = MNQ IPPIV = IPTAU + LTAU + IPOSTPAD + IPREPAD LIPIV = ICEIL( INTGSZ*NQ, REALSZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the * factorization i.e from IPW on. * LWORK = MAX( 3, MP + MAX( 1, NQ ) ) + 2 * NQ WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PSLAFCHK, PSGEQRRV, * PSLANGE. * WORKSIZ = MAX( WORKSIZ - IPOSTPAD, $ DESCA( NB_ )*( 2*MP + NQ + DESCA( NB_ ) ) ) + $ IPOSTPAD END IF * ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN * LTAU = MP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the TZ * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PSLAFCHK, PSTZRZRV and * PSLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ FACT // ' factorization', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PSMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need the Infinity of A for checking * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PSFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PSFILLPAD( ICTXT, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANGE( 'I', M, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKFCT-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKFCT-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform QR factorizations * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN CALL SLTIMER( 1 ) CALL PSGEQRF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN CALL SLTIMER( 1 ) CALL PSGEQLF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN CALL SLTIMER( 1 ) CALL PSGELQF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN CALL SLTIMER( 1 ) CALL PSGERQF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL SLTIMER( 1 ) CALL PSGEQPF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPTAU ), $ MEM( IPW ), LWORK, INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN CALL SLTIMER( 1 ) IF( N.GE.M ) $ CALL PSTZRZF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PSCHEKPAD( ICTXT, ROUT, MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, ROUT, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PSCHEKPAD( ICTXT, ROUT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PSCHEKPAD( ICTXT, ROUT, WORKFCT-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKFCT-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN * * Compute residual = ||A-Q*R|| / (||A||*N*eps) * CALL PSGEQRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PSLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN * * Compute residual = ||A-Q*L|| / (||A||*N*eps) * CALL PSGEQLRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PSLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN * * Compute residual = ||A-L*Q|| / (||A||*N*eps) * CALL PSGELQRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PSLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN * * Compute residual = ||A-R*Q|| / (||A||*N*eps) * CALL PSGERQRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PSLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * * Compute residual = ||AP-Q*R|| / (||A||*N*eps) * CALL PSGEQRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN * * Compute residual = ||A-T*Z|| / (||A||*N*eps) * IF( N.GE.M ) THEN CALL PSTZRZRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) END IF CALL PSLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) END IF * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, ROUTCHK, MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, ROUTCHK, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, ROUTCHK, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * CALL PSQPPIV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSQPPIV', MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSQPPIV', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PSLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSLAFCHK', MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLAFCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Test residual and detect NaN result * IF( LSAMEN( 2, FACT, 'TZ' ) .AND. N.LT.M ) THEN KSKIP = KSKIP + 1 PASSED = 'BYPASS' ELSE IF( FRESID.LE.THRESH .AND. $ (FRESID-FRESID).EQ.0.0E+0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * MINMN = MIN( M, N ) MAXMN = MAX( M, N ) * IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN IF( M.GE.N ) THEN NOPS = 0.0D+0 ELSE * * 5/2 ( M^2 N - M^3 ) + 5/2 N M + 1/2 M^2 for * complete orthogonal factorization (M <= N). * NOPS = ( 5.0D+0 * ( $ DBLE( N )*( DBLE( M )**2 ) - $ DBLE( M )**3 + $ DBLE( N )*DBLE( M ) ) + $ DBLE( M )**2 ) / 2.0D+0 END IF * ELSE * * 2 M N^2 - 2/3 N^2 + M N + N^2 for QR type * factorization when M >= N. * NOPS = 2.0D+0 * ( DBLE( MINMN )**2 ) * $ ( DBLE( MAXMN )-DBLE( MINMN ) / 3.0D+0 ) + $ ( DBLE( MAXMN )+DBLE( MINMN ) )*DBLE( MINMN ) END IF * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, MB, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, $ PASSED, FRESID * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, MB, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, $ PASSED, FRESID * END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * 40 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N MB NB P Q Fact Time ', $ ' MFLOPS CHECK Residual' ) 9994 FORMAT( '---- ------ ------ --- --- ----- ----- --------- ', $ '----------- ------ --------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I3, 1X, I3, 1X, I5, 1X, I5, 1X, $ F9.2, 1X, F11.2, 1X, A6, 2X, G8.1 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( A ) * STOP * * End of PSQRDRIVER * END * SUBROUTINE PSQPPIV( M, N, A, IA, JA, DESCA, IPIV ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) REAL A( * ) * .. * * Purpose * ======= * * PSQPPIV applies to sub( A ) = A(IA:IA+M-1,JA:JA+N-1) the pivots * returned by PSGEQPF in reverse order for checking purposes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be permuted. On exit, the local pieces * of the distributed permuted submatrix sub( A ) * Inv( P ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension LOCc(JA+N-1). * On exit, if IPIV(I) = K, the local i-th column of sub( A )*P * was the global K-th column of sub( A ). IPIV is tied to the * distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IACOL, ICOFFA, ICTXT, IITMP, IPVT, IPCOL, $ IPROW, ITMP, J, JJ, JJA, KK, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, IGERV2D, $ IGESD2D, IGAMN2D, INFOG1L, PSSWAP * .. * .. External Functions .. INTEGER INDXL2G, NUMROC EXTERNAL INDXL2G, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG1L( JA, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), JJA, $ IACOL ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * DO 20 J = JA, JA+N-2 * IPVT = JA+N-1 ITMP = JA+N * * Find first the local minimum candidate for pivoting * CALL INFOG1L( J, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), $ JJ, IACOL ) DO 10 KK = JJ, JJA+NQ-1 IF( IPIV( KK ).LT.IPVT )THEN IITMP = KK IPVT = IPIV( KK ) END IF 10 CONTINUE * * Find the global minimum pivot * CALL IGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, IPVT, 1, IPROW, $ IPCOL, 1, -1, MYCOL ) * * Broadcast the corresponding index to the other process columns * IF( MYCOL.EQ.IPCOL ) THEN ITMP = INDXL2G( IITMP, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL IGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ITMP, 1 ) IF( IPCOL.NE.IACOL ) THEN CALL IGERV2D( ICTXT, 1, 1, IPIV( IITMP ), 1, MYROW, $ IACOL ) ELSE IF( MYCOL.EQ.IACOL ) $ IPIV( IITMP ) = IPIV( JJ ) END IF ELSE CALL IGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ITMP, 1, MYROW, $ IPCOL ) IF( MYCOL.EQ.IACOL .AND. IPCOL.NE.IACOL ) $ CALL IGESD2D( ICTXT, 1, 1, IPIV( JJ ), 1, MYROW, IPCOL ) END IF * * Swap the columns of A * CALL PSSWAP( M, A, IA, ITMP, DESCA, 1, A, IA, J, DESCA, 1 ) * 20 CONTINUE * * End of PSQPPIV * END scalapack-2.0.2/TESTING/LIN/psqrinfo.f000644 000766 000024 00000042542 10363532303 017465 0ustar00juliestaff000000 000000 SUBROUTINE PSQRINFO( SUMMRY, NOUT, NFACT, FACTOR, LDFACT, NMAT, $ MVAL, LDMVAL, NVAL, LDNVAL, NNB, MBVAL, $ LDMBVAL, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDFACT, LDMBVAL, LDMVAL, LDNBVAL, LDNVAL, $ LDPVAL, LDQVAL, NFACT, NGRIDS, NMAT, NNB, $ NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. CHARACTER*2 FACTOR( LDFACT ) CHARACTER*(*) SUMMRY INTEGER MBVAL( LDMBVAL ), MVAL( LDMVAL ), $ NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PSQRINFO gets needed startup information for the QR factoriza- * tion routines and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NFACT (global output) INTEGER * The number of different factorization types to be tested. * * FACTOR (global output) CHARACTER*2 array of dimension of LDFACT, * The factorization types to be tested: * if FACTOR(i) = 'QR' then QR factorization, * if FACTOR(i) = 'QL' then QL factorization, * if FACTOR(i) = 'LQ' then LQ factorization, * if FACTOR(i) = 'RQ' then RQ factorization, * if FACTOR(i) = 'QP' then QR factorization with column * pivoting. * if FACTOR(i) = 'TZ' then complete orthogonal factorization. * * LDFACT (global input) INTEGER * The maximum number of different factorization types to be * tested. LDFACT >= NFACT. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * MVAL (global output) INTEGER array of dimension (LDNVAL), the * values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M, LDNVAL > = NMAT. * * NVAL (global output) INTEGER array of dimension (LDNVAL), the * values of N (number of columns in matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for MB and * NB. * * MBVAL (global output) INTEGER array of dimension (LDMBVAL), the * values of MB (row blocksize) to run the code with. * * LDMBVAL (global input) INTEGER * The maximum number of different values that can be used for * MB, LDMBVAL >= NNB. * * NBVAL (global output) INTEGER array of dimension (LDNBVAL), the * values of NB (column blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array of dimension (LDPVAL), the * values of P (number of process rows) to run the code with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array of dimension (LDQVAL), the * values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 4, LDFACT+LDMVAL+LDNVAL+LDMBVAL+LDNBVAL+LDPVAL+LDQVAL ) * used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT, K REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAMEN REAL PSLAMCH EXTERNAL LSAMEN, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='QR.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get the matrix types to be tested * READ( NIN, FMT = * ) NFACT IF( NFACT.LT.1 .OR. NFACT.GT.LDFACT ) THEN WRITE( NOUT, FMT = 9994 ) 'nb of factorization', LDFACT GO TO 40 END IF READ( NIN, FMT = * ) ( FACTOR( I ), I = 1, NFACT ) * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 40 ELSE IF( NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'M', LDMVAL GO TO 40 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDMBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'MB', LDMBVAL GO TO 40 ELSE IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 40 END IF READ( NIN, FMT = * ) ( MBVAL( I ), I = 1, NNB ) READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 40 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 40 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I ) * QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS WORK( 4 ) = NFACT CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 DO 20 K = 1, NFACT IF( LSAMEN( 2, FACTOR( K ), 'QR' ) ) THEN WORK( I ) = 1 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'QL' ) ) THEN WORK( I ) = 2 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'LQ' ) ) THEN WORK( I ) = 3 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'RQ' ) ) THEN WORK( I ) = 4 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'QP' ) ) THEN WORK( I ) = 5 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'TZ' ) ) THEN WORK( I ) = 6 I = I + 1 END IF 20 CONTINUE * CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, MBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK QR factorizations routines.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision QR factorizations '// $ 'routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' || A - QR || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - QL || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - LQ || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - RQ || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - QRP || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - TZ || / (|| A || * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'MB : The row blocksize of the blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The column blocksize of the blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 ) $ 'Fact Time: Time in seconds to factor the'// $ ' matrix.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Execution rate of the '// $ 'factorization.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'MB ', ( MBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) NFACT = WORK( 4 ) * I = NFACT + 2*NMAT + 2*NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * DO 30 K = 1, NFACT IF( WORK( K ).EQ.1 ) THEN FACTOR( K ) = 'QR' ELSE IF( WORK( K ).EQ.2 ) THEN FACTOR( K ) = 'QL' ELSE IF( WORK( K ).EQ.3 ) THEN FACTOR( K ) = 'LQ' ELSE IF( WORK( K ).EQ.4 ) THEN FACTOR( K ) = 'RQ' ELSE IF( WORK( K ).EQ.5 ) THEN FACTOR( K ) = 'QP' ELSE IF( WORK( K ).EQ.6 ) THEN FACTOR( K ) = 'TZ' END IF 30 CONTINUE * I = NFACT + 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, MBVAL, 1 ) I = I + NNB CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 40 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSQRINFO * END scalapack-2.0.2/TESTING/LIN/psqrt13.f000644 000766 000024 00000020450 10363532303 017133 0ustar00juliestaff000000 000000 SUBROUTINE PSQRT13( SCALE, M, N, A, IA, JA, DESCA, NORMA, ISEED, $ WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, ISEED, JA, M, N, SCALE REAL NORMA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL WORK( * ) REAL A( * ) * .. * * Purpose * ======= * * PSQRT13 generates a full-rank matrix that may be scaled to have * large or small norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SCALE (global input) INTEGER * SCALE = 1: normally scaled matrix * SCALE = 2: matrix scaled up * SCALE = 3: matrix scaled down * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local output) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * NORMA (global output) REAL * The one-norm of A. * * ISEED (global input/global output) INTEGER * Seed for random number generator. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK >= Nq0, where * * ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), and * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, IIA, INFO, $ IROFFA, J, JJA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ REAL AJJ, ASUM, BIGNUM, SMLNUM * .. * .. External Functions .. INTEGER NUMROC REAL PSLAMCH, PSLANGE EXTERNAL NUMROC, PSLAMCH, PSLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSLABAD, PSLASCL, $ PSMATGEN, PSASUM, PSELGET, PSELSET * .. * .. Intrinsic Functions .. INTRINSIC MOD, SIGN * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * generate the matrix * IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) MP = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * CALL PSMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), A, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED, IIA-1, MP, $ JJA-1, NQ, MYROW, MYCOL, NPROW, NPCOL ) * DO 10 J = JA, JA+N-1 I = IA + J - JA IF( I.LE.IA+M-1 ) THEN CALL PSASUM( M, ASUM, A, IA, J, DESCA, 1 ) CALL PSELGET( 'Column', ' ', AJJ, A, I, J, DESCA ) AJJ = AJJ + SIGN( ASUM, AJJ ) CALL PSELSET( A, I, J, DESCA, AJJ ) END IF 10 CONTINUE * * scaled versions * IF( SCALE.NE.1 ) THEN * NORMA = PSLANGE( 'M', M, N, A, IA, JA, DESCA, WORK ) SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM CALL PSLABAD( ICTXT, SMLNUM, BIGNUM ) SMLNUM = SMLNUM / PSLAMCH( ICTXT, 'Epsilon' ) BIGNUM = ONE / SMLNUM * IF( SCALE.EQ.2 ) THEN * * matrix scaled up * CALL PSLASCL( 'General', NORMA, BIGNUM, M, N, A, IA, $ JA, DESCA, INFO ) * ELSE IF( SCALE.EQ.3 ) THEN * * matrix scaled down * CALL PSLASCL( 'General', NORMA, SMLNUM, M, N, A, IA, $ JA, DESCA, INFO ) * END IF * END IF * NORMA = PSLANGE( 'One-norm', M, N, A, IA, JA, DESCA, WORK ) * RETURN * * End of PSQRT13 * END scalapack-2.0.2/TESTING/LIN/psqrt14.f000644 000766 000024 00000033554 10363532303 017145 0ustar00juliestaff000000 000000 REAL FUNCTION PSQRT14( TRANS, M, N, NRHS, A, IA, JA, $ DESCA, X, IX, JX, DESCX, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IX, JA, JX, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) REAL A( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PSQRT14 checks whether sub( X ) is in the row space of sub( A ) or * sub( A )', where sub( A ) denotes A( IA:IA+M-1, JA:JA+N-1 ) and * sub( X ) denotes X( IX:IX+N-1, JX:JX+NRHS-1 ) if TRANS = 'N', and * X( IX:IX+N-1, JX:JX+NRHS-1 ) otherwise. It does so by scaling both * sub( X ) and sub( A ) such that their norms are in the range * [sqrt(eps), 1/sqrt(eps)], then computing an LQ factorization of * [sub( A )',sub( X )]' (if TRANS = 'N') or a QR factorization of * [sub( A ),sub( X )] otherwise, and returning the norm of the trailing * triangle, scaled by MAX(M,N,NRHS)*eps. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * = 'N': No transpose, check for sub( X ) in the row space of * sub( A ), * = 'T': Transpose, check for sub( X ) in row space of * sub( A )'. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( X ). NRHS >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)). This array * contains the local pieces of the M-by-N distributed matrix * sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) REAL pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). * On entry, this array contains the local pieces of the * N-by-NRHS distributed submatrix sub( X ) if TRANS = 'N', * and the M-by-NRHS distributed submatrix sub( X ) otherwise. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * WORK (local workspace) REAL array dimension (LWORK) * If TRANS='N', LWORK >= MNRHSP * NQ + LTAU + LWF and * LWORK >= MP * NNRHSQ + LTAU + LWF otherwise, where * * IF TRANS='N', (LQ fact) * MNRHSP = NUMROC( M+NRHS+IROFFA, MB_A, MYROW, IAROW, * NPROW ) * LTAU = NUMROC( IA+MIN( M+NRHS, N )-1, MB_A, MYROW, * RSRC_A, NPROW ) * LWF = MB_A * ( MB_A + MNRHSP + NQ0 ) * ELSE (QR fact) * NNRHSQ = NUMROC( N+NRHS+ICOFFA, NB_A, MYCOL, IACOL, * NPCOL ) * LTAU = NUMROC( JA+MIN( M, N+NRHS )-1, NB_A, MYCOL, * CSRC_A, NPCOL ) * LWF = NB_A * ( NB_A + MP0 + NNRHSQ ) * END IF * * and, * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MP0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NQ0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL TPSD INTEGER IACOL, IAROW, ICOFFA, ICTXT, IDUM, IIA, INFO, $ IPTAU, IPW, IPWA, IROFFA, IWA, IWX, J, JJA, $ JWA, JWX, LDW, LWORK, MPWA, MPW, MQW, MYCOL, $ MYROW, NPCOL, NPROW, NPW, NQWA, NQW REAL AMAX, ANRM, ERR, XNRM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) REAL RWORK( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL PSLANGE, PSLAMCH EXTERNAL LSAME, NUMROC, PSLANGE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSAMAX, $ PSCOPY, PSGELQF, PSGEQRF, PSLACPY, $ PSLASCL, PXERBLA, SGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * PSQRT14 = ZERO * IPWA = 1 IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IWA = IROFFA + 1 JWA = ICOFFA + 1 CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) MPWA = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQWA = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * INFO = 0 IF( LSAME( TRANS, 'N' ) ) THEN IF( N.LE.0 .OR. NRHS.LE.0 ) $ RETURN TPSD = .FALSE. MPW = NUMROC( M+NRHS+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) NQW = NQWA * * Assign descriptor DESCW for workspace WORK and pointers to * matrices sub( A ) and sub( X ) in workspace * IWX = IWA + M JWX = JWA LDW = MAX( 1, MPW ) CALL DESCSET( DESCW, M+NRHS+IROFFA, N+ICOFFA, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, LDW ) * ELSE IF( LSAME( TRANS, 'T' ) ) THEN IF( M.LE.0 .OR. NRHS.LE.0 ) $ RETURN TPSD = .TRUE. MPW = MPWA NQW = NUMROC( N+NRHS+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) * * Assign descriptor DESCW for workspace WORK and pointers to * matrices sub( A ) and sub( X ) in workspace * IWX = IWA JWX = JWA + N LDW = MAX( 1, MPW ) CALL DESCSET( DESCW, M+IROFFA, N+NRHS+ICOFFA, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, LDW ) ELSE CALL PXERBLA( ICTXT, 'PSQRT14', -1 ) RETURN END IF * * Copy and scale sub( A ) * IPTAU = IPWA + MPW*NQW CALL PSLACPY( 'All', M, N, A, IA, JA, DESCA, WORK( IPWA ), IWA, $ JWA, DESCW ) RWORK( 1 ) = ZERO ANRM = PSLANGE( 'M', M, N, WORK( IPWA ), IWA, JWA, DESCW, RWORK ) IF( ANRM.NE.ZERO ) $ CALL PSLASCL( 'G', ANRM, ONE, M, N, WORK( IPWA ), IWA, $ JWA, DESCW, INFO ) * * Copy sub( X ) or sub( X )' into the right place and scale it * IF( TPSD ) THEN * * Copy sub( X ) into columns jwa+n:jwa+n+nrhs-1 of work * DO 10 J = 1, NRHS CALL PSCOPY( M, X, IX, JX+J-1, DESCX, 1, WORK( IPWA ), IWX, $ JWX+J-1, DESCW, 1 ) 10 CONTINUE XNRM = PSLANGE( 'M', M, NRHS, WORK( IPWA ), IWX, JWX, DESCW, $ RWORK ) IF( XNRM.NE.ZERO ) $ CALL PSLASCL( 'G', XNRM, ONE, M, NRHS, WORK( IPWA ), IWX, $ JWX, DESCW, INFO ) * * Compute QR factorization of work(iwa:iwa+m-1,jwa:jwa+n+nrhs-1) * MQW = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPW = IPTAU + MIN( MQW, NQW ) LWORK = DESCW( NB_ ) * ( MPW + NQW + DESCW( NB_ ) ) CALL PSGEQRF( M, N+NRHS, WORK( IPWA ), IWA, JWA, DESCW, $ WORK( IPTAU ), WORK( IPW ), LWORK, INFO ) * * Compute largest entry in upper triangle of * work(iwa+n:iwa+m-1,jwa+n:jwa+n+nrhs-1) * ERR = ZERO IF( N.LT.M ) THEN DO 20 J = JWX, JWA+N+NRHS-1 CALL PSAMAX( MIN(M-N,J-JWX+1), AMAX, IDUM, WORK( IPWA ), $ IWA+N, J, DESCW, 1 ) ERR = MAX( ERR, ABS( AMAX ) ) 20 CONTINUE END IF CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, IDUM1, IDUM2, $ -1, -1, 0 ) * ELSE * * Copy sub( X )' into rows iwa+m:iwa+m+nrhs-1 of work * DO 30 J = 1, NRHS CALL PSCOPY( N, X, IX, JX+J-1, DESCX, 1, WORK( IPWA ), $ IWX+J-1, JWX, DESCW, DESCW( M_ ) ) 30 CONTINUE * XNRM = PSLANGE( 'M', NRHS, N, WORK( IPWA ), IWX, JWX, DESCW, $ RWORK ) IF( XNRM.NE.ZERO ) $ CALL PSLASCL( 'G', XNRM, ONE, NRHS, N, WORK( IPWA ), IWX, $ JWX, DESCW, INFO ) * * Compute LQ factorization of work(iwa:iwa+m+nrhs-1,jwa:jwa+n-1) * NPW = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPW = IPTAU + MIN( MPW, NPW ) LWORK = DESCW( MB_ ) * ( MPW + NQW + DESCW( MB_ ) ) CALL PSGELQF( M+NRHS, N, WORK( IPWA ), IWA, JWA, DESCW, $ WORK( IPTAU ), WORK( IPW ), LWORK, INFO ) * * Compute largest entry in lower triangle in * work(iwa+m:iwa+m+nrhs-1,jwa+m:jwa+n-1) * ERR = ZERO DO 40 J = JWA+M, MIN( JWA+N-1, JWA+M+NRHS-1 ) CALL PSAMAX( JWA+M+NRHS-J, AMAX, IDUM, WORK( IPWA ), $ IWX+J-JWA-M, J, DESCW, 1 ) ERR = MAX( ERR, ABS( AMAX ) ) 40 CONTINUE CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, IDUM1, IDUM2, $ -1, -1, 0 ) * END IF * PSQRT14 = ERR / ( REAL( MAX( M, N, NRHS ) ) * $ PSLAMCH( ICTXT, 'Epsilon' ) ) * RETURN * * End of PSQRT14 * END scalapack-2.0.2/TESTING/LIN/psqrt16.f000644 000766 000024 00000025131 11622500733 017140 0ustar00juliestaff000000 000000 SUBROUTINE PSQRT16( TRANS, M, N, NRHS, A, IA, JA, DESCA, X, IX, $ JX, DESCX, B, IB, JB, DESCB, RWORK, RESID ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, IX, JA, JB, JX, M, N, NRHS REAL RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) REAL A( * ), B( * ), RWORK( * ), X( * ) * .. * * Purpose * ======= * * PSQRT16 computes the residual for a solution of a system of linear * equations sub( A )*sub( X ) = B or sub( A' )*sub( X ) = B: * RESID = norm(B - sub( A )*sub( X ) ) / * ( max(m,n) * norm(sub( A ) ) * norm(sub( X ) ) * EPS ), * where EPS is the machine epsilon, sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), and sub( X ) denotes * X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations: * = 'N': sub( A )*sub( X ) = sub( B ) * = 'T': sub( A' )*sub( X )= sub( B ), where A' is the * transpose of sub( A ). * = 'C': sub( A' )*sub( X )= B, where A' is the transpose * of sub( A ). * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) REAL pointer into the local * memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * The original M x N matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) REAL pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). This * array contains the local pieces of the computed solution * distributed vectors for the system of linear equations. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * B (local input/local output) REAL pointer into * the local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the distributes right hand side vectors for * the system of linear equations. On exit, sub( B ) is over- * written with the difference sub( B ) - sub( A )*sub( X ) or * sub( B ) - sub( A )'*sub( X ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * RWORK (local workspace) REAL array, dimension (LRWORK) * LWORK >= Nq0 if TRANS = 'N', and LRWORK >= Mp0 otherwise. * * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * RESID (global output) REAL * The maximum over the number of right hand sides of * norm( sub( B )- sub( A )*sub( X ) ) / * ( max(m,n) * norm( sub( A ) ) * norm( sub( X ) ) * EPS ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER ICTXT, IDUMM, J, MYCOL, MYROW, N1, N2, NPCOL, $ NPROW REAL ANORM, BNORM, EPS, XNORM * .. * .. Local Arrays .. REAL TEMP( 2 ) * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH, PSLANGE EXTERNAL LSAME, PSLAMCH, PSLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PSASUM, PSGEMM, SGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick exit if M = 0 or N = 0 or NRHS = 0 * IF( M.LE.0 .OR. N.LE.0 .OR. NRHS.EQ.0 ) THEN RESID = ZERO RETURN END IF * IF( LSAME( TRANS, 'T' ) .OR. LSAME( TRANS, 'C' ) ) THEN ANORM = PSLANGE( 'I', M, N, A, IA, JA, DESCA, RWORK ) N1 = N N2 = M ELSE ANORM = PSLANGE( '1', M, N, A, IA, JA, DESCA, RWORK ) N1 = M N2 = N END IF * EPS = PSLAMCH( ICTXT, 'Epsilon' ) * * Compute B - sub( A )*sub( X ) (or B - sub( A' )*sub( X ) ) and * store in B. * CALL PSGEMM( TRANS, 'No transpose', N1, NRHS, N2, -ONE, A, IA, $ JA, DESCA, X, IX, JX, DESCX, ONE, B, IB, JB, DESCB ) * * Compute the maximum over the number of right hand sides of * norm( sub( B ) - sub( A )*sub( X ) ) / * ( max(m,n) * norm( sub( A ) ) * norm( sub( X ) ) * EPS ). * RESID = ZERO DO 10 J = 1, NRHS * CALL PSASUM( N1, BNORM, B, IB, JB+J-1, DESCB, 1 ) CALL PSASUM( N2, XNORM, X, IX, JX+J-1, DESCX, 1 ) * * Only the process columns owning the vector operands will have * the correct result, the other will have zero. * TEMP( 1 ) = BNORM TEMP( 2 ) = XNORM IDUMM = 0 CALL SGAMX2D( ICTXT, 'All', ' ', 2, 1, TEMP, 2, IDUMM, IDUMM, $ -1, -1, IDUMM ) BNORM = TEMP( 1 ) XNORM = TEMP( 2 ) * * Every processes have ANORM, BNORM and XNORM now. * IF( ANORM.EQ.ZERO .AND. BNORM.EQ.ZERO ) THEN RESID = ZERO ELSE IF( ANORM.LE.ZERO .OR. XNORM.LE.ZERO ) THEN RESID = ONE / EPS ELSE RESID = MAX( RESID, ( ( BNORM / ANORM ) / XNORM ) / $ ( MAX( M, N )*EPS ) ) END IF * 10 CONTINUE * RETURN * * End of PSQRT16 * END scalapack-2.0.2/TESTING/LIN/psqrt17.f000644 000766 000024 00000032656 10363532303 017152 0ustar00juliestaff000000 000000 REAL FUNCTION PSQRT17( TRANS, IRESID, M, N, NRHS, A, $ IA, JA, DESCA, X, IX, JX, $ DESCX, B, IB, JB, DESCB, WORK, $ RWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, IRESID, IX, JA, JB, JX, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) REAL A( * ), B( * ), WORK( * ), X( * ) REAL RWORK( * ) * .. * * Purpose * ======= * * PSQRT17 computes the ratio * * || R'*op( sub( A ) ) ||/(||sub( A )||*alpha*max(M,N,NRHS)*eps) * * where R = op( sub( A ) )*sub( X ) - B, op(A) is A or A', and * * alpha = ||B|| if IRESID = 1 (zero-residual problem) * alpha = ||R|| if IRESID = 2 (otherwise). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies whether or not the transpose of sub( A ) is used. * = 'N': No transpose, op( sub( A ) ) = sub( A ). * = 'T': Transpose, op( sub( A ) ) = sub( A' ). * * IRESID (global input) INTEGER * IRESID = 1 indicates zero-residual problem. * IRESID = 2 indicates non-zero residual. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * If TRANS = 'N', the number of rows of the distributed * submatrix sub( B ). * If TRANS = 'T', the number of rows of the distributed * submatrix sub( X ). * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * If TRANS = 'N', the number of rows of the distributed * submatrix sub( X ). Otherwise N is the number of rows of * the distributed submatrix sub( B ). * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrices sub( X ) and sub( B ). * NRHS >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed M-by-N * submatrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) REAL pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). * If TRANS = 'N', this array contains the local pieces of the * N-by-NRHS distributed submatrix sub( X ). Otherwise, this * array contains the local pieces of the M-by-NRHS distributed * submatrix sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * B (local input) REAL pointer into the local memory * to an array of dimension (LLD_B,LOCc(JB+NRHS-1)). * If TRANS='N', this array contains the local pieces of the * distributed M-by-NRHS submatrix operand sub( B ). Otherwise, * this array contains the local pieces of the distributed * N-by-NRHS submatrix operand sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace) REAL array, dimension (LWORK) * If TRANS = 'N', LWORK >= Mp0 * NRHSq0 + NRHSp0 * Nq0 * otherwise LWORK >= Np0 * NRHSq0 + NRHSp0 * Mq0 * * RWORK (local workspace) REAL array, dimension (LRWORK) * LRWORK >= Nq0, if TRANS = 'N', and LRWORK >= Mp0 otherwise. * * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Np0 = NUMROC( N+ICOFFA, NB_A, MYROW, IAROW, NPROW ), * Mq0 = NUMROC( M+IROFFA, NB_A, MYCOL, IACOL, NPCOL ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * NRHSp0 = NUMROC( NRHS+ICOFFB, NB_B, MYROW, IBROW, NPROW ), * NRHSq0 = NUMROC( NRHS+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) * .. * .. Local Scalars .. INTEGER IACOL, IBCOL, IBROW, ICOFFB, ICTXT, INFO, $ IOFFA, IROFFB, ISCL, IW, IW2, JW, JW2, MYCOL, $ NRHSQ, NRHSP, MYROW, NCOLS, NPCOL, NPROW, $ NROWS, NROWSP REAL ERR, NORMA, NORMB, NORMRS, NORMX, SMLNUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), DESCW2( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC REAL PSLANGE, PSLAMCH EXTERNAL INDXG2P, LSAME, NUMROC, PSLANGE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PSGEMM, PSLACPY, $ PSLASCL, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, REAL * .. * .. Executable Statements .. * PSQRT17 = ZERO * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 IF( LSAME( TRANS, 'N' ) ) THEN NROWS = M NCOLS = N IOFFA = MOD( JA-1, DESCA( NB_ ) ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN NROWS = N NCOLS = M IOFFA = MOD( IA-1, DESCA( MB_ ) ) ELSE CALL PXERBLA( ICTXT, 'PSQRT17', -1 ) RETURN END IF * IF( M.LE.0 .OR. N.LE.0 .OR. NRHS.LE.0 ) $ RETURN * IROFFB = MOD( IA-1, DESCA( MB_ ) ) ICOFFB = MOD( JA-1, DESCA( NB_ ) ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * NRHSQ = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) NRHSP = NUMROC( NRHS+IROFFB, DESCB( NB_ ), MYROW, IBROW, NPROW ) NROWSP = NUMROC( NROWS+IROFFB, DESCA( MB_ ), MYROW, IBROW, NPROW ) * * Assign array descriptor DESCW for workspace WORK, where DESCW * holds a copy of the distributed submatrix sub( B ) * CALL DESCSET( DESCW, NROWS+IROFFB, NRHS+ICOFFB, DESCB( MB_ ), $ DESCB( NB_ ), IBROW, IBCOL, ICTXT, MAX( 1, $ NROWSP ) ) * * Assign array descriptor DESCW2 for workspace WORK, where DESCW2 * holds a copy of the distributed submatrix sub( X**T ) * CALL DESCSET( DESCW2, NRHS+ICOFFB, NCOLS+IOFFA, DESCX( NB_ ), $ DESCX( MB_ ), IBROW, IACOL, ICTXT, MAX( 1, $ NRHSP ) ) * NORMA = PSLANGE( 'One-norm', M, N, A, IA, JA, DESCA, RWORK ) SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) SMLNUM = SMLNUM / PSLAMCH( ICTXT, 'Precision' ) ISCL = 0 * * compute residual and scale it * IW = 1 + IROFFB JW = 1 + ICOFFB CALL PSLACPY( 'All', NROWS, NRHS, B, IB, JB, DESCB, WORK, IW, JW, $ DESCW ) CALL PSGEMM( TRANS, 'No transpose', NROWS, NRHS, NCOLS, -ONE, A, $ IA, JA, DESCA, X, IX, JX, DESCX, ONE, WORK, IW, JW, $ DESCW ) NORMRS = PSLANGE( 'Max', NROWS, NRHS, WORK, IW, JW, DESCW, $ RWORK ) IF( NORMRS.GT.SMLNUM ) THEN ISCL = 1 CALL PSLASCL( 'General', NORMRS, ONE, NROWS, NRHS, WORK, $ IW, JW, DESCW, INFO ) END IF * * compute R'*sub( A ) * IW2 = 1 + ICOFFB JW2 = 1 + IOFFA CALL PSGEMM( 'Transpose', TRANS, NRHS, NCOLS, NROWS, ONE, WORK, $ IW, JW, DESCW, A, IA, JA, DESCA, ZERO, $ WORK( NROWSP*NRHSQ+1 ), IW2, JW2, DESCW2 ) * * compute and properly scale error * ERR = PSLANGE( 'One-norm', NRHS, NCOLS, WORK( NROWSP*NRHSQ+1 ), $ IW2, JW2, DESCW2, RWORK ) IF( NORMA.NE.ZERO ) $ ERR = ERR / NORMA * IF( ISCL.EQ.1 ) $ ERR = ERR*NORMRS * IF( IRESID.EQ.1 ) THEN NORMB = PSLANGE( 'One-norm', NROWS, NRHS, B, IB, JB, DESCB, $ RWORK ) IF( NORMB.NE.ZERO ) $ ERR = ERR / NORMB ELSE NORMX = PSLANGE( 'One-norm', NCOLS, NRHS, X, IX, JX, DESCX, $ RWORK ) IF( NORMX.NE.ZERO ) $ ERR = ERR / NORMX END IF * PSQRT17 = ERR / ( PSLAMCH( ICTXT, 'Epsilon' ) * $ REAL( MAX( M, N, NRHS ) ) ) * RETURN * * End of PSQRT17 * END scalapack-2.0.2/TESTING/LIN/pstzrzrv.f000644 000766 000024 00000024362 10363532303 017550 0ustar00juliestaff000000 000000 SUBROUTINE PSTZRZRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSTZRZRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from T, Z * computed by PSTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= M >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors T and Z computed * by PSTZRZF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCr(M_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PSTZRZF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IN, $ IPT, IPV, IPW, JJA, JM1, JV, L, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSLACPY, $ PSLARZB, PSLARZT, PSLASET, PB_TOPGET, $ PB_TOPSET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.LT.M ) $ RETURN * L = N - M JM1 = JA + MIN( M+1, N ) - 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * CALL DESCSET( DESCV, DESCA( MB_ ), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * * Handle first block separately * IB = IN - IA + 1 JV = ICOFF + JM1 - JA + 1 * * Compute upper triangular matrix T * CALL PSLARZT( 'Backward', 'Rowwise', L, IB, A, IA, JM1, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'All', IB, L, A, IA, JM1, DESCA, WORK( IPV ), 1, $ JV, DESCV ) * * Save temporarily strict lower part of A(IA:IA+IB-1,JA:JA+IB-1) * CALL PSLACPY( 'Lower', IB-1, IB-1, A, IA+1, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) * * Zeroes the row panel of sub( A ) to get T(IA:IN,JA:JA+N-1) * CALL PSLASET( 'All', IB, L, ZERO, ZERO, A, IA, JM1, DESCA ) CALL PSLASET( 'Lower', IB-1, IB-1, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Apply block Householder transformation * CALL PSLARZB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ IN-IA+1, N, IB, L, WORK( IPV ), 1, JV, DESCV, $ WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * * Restore strict lower part of A( IA:IA+IB-1, JA:JA+N-1 ) * CALL PSLACPY( 'Lower', IB-1, IB-1, WORK( IPV ), 1, ICOFF+1, DESCV, $ A, IA+1, JA, DESCA ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * * Loop over the remaining row blocks * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Compute upper triangular matrix T * CALL PSLARZT( 'Backward', 'Rowwise', L, IB, A, I, JM1, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PSLACPY( 'All', IB, L, A, I, JM1, DESCA, WORK( IPV ), 1, $ JV, DESCV ) * * Save temporarily strict lower part of A(I:I+IB-1,J:J+IB-1 ) * CALL PSLACPY( 'Lower', IB-1, IB-1, A, I+1, JA+I-IA, DESCA, $ WORK( IPV ), 1, ICOFF+1+I-IA, DESCV ) * * Zeoes the row panel of sub( A ) to get T(IA:I-1,JA+I-IA:JA+N-1) * CALL PSLASET( 'All', IB, L, ZERO, ZERO, A, I, JM1, DESCA ) CALL PSLASET( 'Lower', IB-1, IB-1, ZERO, ZERO, A, I+1, JA+I-IA, $ DESCA ) * * Apply block Householder transformation * CALL PSLARZB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ I+IB-IA, N-I+IA, IB, L, WORK( IPV ), 1, JV, $ DESCV, WORK( IPT ), A, IA, JA+I-IA, DESCA, $ WORK( IPW ) ) * CALL PSLACPY( 'Lower', IB-1, IB-1, WORK( IPV ), 1, $ ICOFF+1+I-IA, DESCV, A, I+1, JA+I-IA, DESCA ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PSTZRZRV * END scalapack-2.0.2/TESTING/LIN/pzbmatgen.f000644 000766 000024 00000017145 10363532303 017614 0ustar00juliestaff000000 000000 SUBROUTINE PZBMATGEN( ICTXT, AFORM, AFORM2, BWL, BWU, N, $ MB, NB, A, $ LDA, IAROW, IACOL, ISEED, $ MYROW, MYCOL, NPROW, NPCOL ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. * .. Scalar Arguments .. CHARACTER*1 AFORM, AFORM2 INTEGER IACOL, IAROW, ICTXT, $ ISEED, LDA, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW, BWL, BWU * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PZBMATGEN : Parallel Complex Double precision Band MATrix GENerator. * (Re)Generate a distributed Band matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'L' : A is returned as a hermitian lower * triangular matrix, and is diagonally dominant. * if AFORM = 'U' : A is returned as a hermitian upper * triangular matrix, and is diagonally dominant. * if AFORM = 'G' : A is returned as a general matrix. * if AFORM = 'T' : A is returned as a general matrix in * tridiagonal-compatible form. * * AFORM2 (global input) CHARACTER*1 * if the matrix is general: * if AFORM2 = 'D' : A is returned diagonally dominant. * if AFORM2 != 'D' : A is not returned diagonally dominant. * if the matrix is symmetric or hermitian: * if AFORM2 = 'T' : A is returned in tridiagonally-compatible * form (a transpose form). * if AFORM2 != 'T' : A is returned in banded-compatible form. * * M (global input) INTEGER * The number of nonzero rows in the generated distributed * band matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) COMPLEX*16, pointer into the local memory * to an array of dimension ( LDA, * ) containing the local * pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * A( DIAG_INDEX, I ) = A( DIAG_INDEX, I ) + BWL+BWU * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * This code is a simple wrapper around PZMATGEN, for band matrices. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER DIAG_INDEX, I, J, M_MATGEN, NQ, N_MATGEN, $ START_INDEX * .. * .. External Subroutines .. EXTERNAL PZMATGEN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC, LSAME * .. * .. Executable Statements .. * * IF( LSAME( AFORM, 'L' ).OR.LSAME( AFORM, 'U' ) ) THEN M_MATGEN = BWL + 1 N_MATGEN = N START_INDEX = 1 IF( LSAME( AFORM, 'L' ) ) THEN DIAG_INDEX = 1 ELSE DIAG_INDEX = BWL + 1 ENDIF ELSE M_MATGEN = BWL + BWU + 1 N_MATGEN = N DIAG_INDEX = BWU + 1 START_INDEX = 1 ENDIF * NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * * * Generate a random matrix initially * IF( LSAME( AFORM, 'T' ) .OR. $ ( LSAME( AFORM2, 'T' ) ) ) THEN * CALL PZMATGEN( ICTXT, 'T', 'N', $ N_MATGEN, M_MATGEN, $ NB, M_MATGEN, A( START_INDEX, 1 ), $ LDA, IAROW, IACOL, $ ISEED, 0, NQ, 0, M_MATGEN, $ MYCOL, MYROW, NPCOL, NPROW ) * ELSE * CALL PZMATGEN( ICTXT, 'N', 'N', $ M_MATGEN, N_MATGEN, $ M_MATGEN, NB, A( START_INDEX, 1 ), $ LDA, IAROW, IACOL, $ ISEED, 0, M_MATGEN, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Zero out padding at tops of columns * DO 1000 J=1,NB * DO 2000 I=1, LDA-M_MATGEN * * Indexing goes negative; BMATGEN assumes that space * has been preallocated above the first column as it * has to be if the matrix is to be input to * Scalapack's band solvers. * A( I-LDA+M_MATGEN, J ) = CZERO * 2000 CONTINUE * 1000 CONTINUE * ENDIF * IF( LSAME( AFORM2, 'D' ).OR. $ ( LSAME( AFORM, 'L' ).OR.LSAME( AFORM, 'U' ) ) ) THEN * * Loop over diagonal elements stored on this processor. * * DO 330 I=1, NQ IF( LSAME( AFORM, 'T' ) .OR. $ ( LSAME( AFORM2, 'T' ) ) ) THEN IF( NPROW .EQ. 1 ) THEN A( I, DIAG_INDEX ) = DCMPLX( DBLE( A( I, DIAG_INDEX ) ) $ + DBLE( 2*( BWL+BWU+1 ) ) ) ENDIF ELSE IF( NPROW .EQ. 1 ) THEN A( DIAG_INDEX, I ) = DCMPLX( DBLE( A( DIAG_INDEX, I ) ) $ + DBLE( 2*( BWL+BWU+1 ) ) ) ENDIF END IF 330 CONTINUE * * ELSE * * Must add elements to keep condition of matrix in check * DO 380 I=1, NQ * IF( NPROW .EQ. 1 ) THEN * IF( MOD(I+MYCOL*NB,2) .EQ. 1 ) THEN A( DIAG_INDEX+1, I ) = $ DCMPLX( DBLE( A( DIAG_INDEX+1, I ) ) $ + DBLE( 2*( BWL+BWU+1 ) ) ) * ELSE * A( DIAG_INDEX-1, I ) = $ DCMPLX( DBLE( A( DIAG_INDEX-1, I ) ) $ + DBLE( 2*( BWL+BWU+1 ) ) ) ENDIF * ENDIF * 380 CONTINUE * END IF * RETURN * * End of PZBMATGEN * END scalapack-2.0.2/TESTING/LIN/pzdbdriver.f000644 000766 000024 00000076344 10363532303 020006 0ustar00juliestaff000000 000000 PROGRAM PZDBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PZDBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by ZDB. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM/ZPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER MEMSIZ, NTESTS, ZPLXSZ COMPLEX*16 PADVAL PARAMETER ( ZPLXSZ = 16, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZBMATGEN, $ PZCHEKPAD, PZDBINFO, PZDBLASCHK, PZDBTRF, $ PZDBTRS, PZFILLPAD, PZMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PZLANGE EXTERNAL LSAME, NUMROC, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PZDBINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = BWLVAL( BW_NUM ) IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = BWUVAL( BW_NUM ) IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*MAX(BWL,BWU)-1)/NPCOL + 1 ) $ + MAX(BWL,BWU) NB = MAX( NB, 2*MAX(BWL,BWU) ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*MAX(BWL,BWU), N ) ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (BWL+BWU+1), (BWL+BWU+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((BWL+BWU+1)+10) IMIDPAD = 10 IPOSTPAD = ((BWL+BWU+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (BWL+BWU+1), N, $ (BWL+BWU+1), NB, 0, 0, $ ICTXT,((BWL+BWU+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((BWL+BWU+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = MAX(BWL,BWU)*MAX(BWL,BWU) * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((BWL+BWU+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PZLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PZDBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PZBMATGEN( ICTXT, 'G', 'D', BWL, BWU, N, $ (BWL+BWU+1), NB, MEM( IPA ), $ ((BWL+BWU+1)+10), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) * CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((BWL+BWU+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PZLANGE( '1', (BWL+BWU+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((BWL+BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PZDBTRF( N, BWL, BWU, MEM( IPA ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PZDBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PZCHEKPAD( ICTXT, 'PZDBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((BWL+BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (MAX(BWL,BWU)*NRHS) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PZMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PZFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PZDBTRS( TRANS, N, BWL, BWU, NRHS, MEM( IPA ), $ 1, DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PZDBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZDBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PZDBLASCHK( 'N', 'D', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl bwu + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE(BWU)) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+bwu) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE(BWU)) $ *DBLE(NRHS)) * * Multiply by 4 to get complex count * NOPS = NOPS * DBLE(4) * * Second calc to represent actual hardware speed * * 2*N_FIRST bwl*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE(BWL)*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST bwl*bwu * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE(BWL) $ *DBLE(BWU)) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB bwl*bwu flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE(BWL) $ *DBLE(BWU)) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( BWL*BWU*BWL/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*BWL*BWU*BWL) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+bwu) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE(BWU)) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * (bwl+bwu) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE(BWL)+ $ DBLE(BWU)))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * (bwl+bwu) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE(BWL)+ $ DBLE(BWU)))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*(BWL*BWU ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * BWL*BWU ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PZDBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pzdbinfo.f000644 000766 000024 00000044340 10363532303 017435 0ustar00juliestaff000000 000000 SUBROUTINE PZDBINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PZDBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'C' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'C' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZDBINFO * END scalapack-2.0.2/TESTING/LIN/pzdblaschk.f000644 000766 000024 00000026247 10363532303 017755 0ustar00juliestaff000000 000000 SUBROUTINE PZDBLASCHK( SYMM, UPLO, TRANS, N, BWL, BWU, NRHS, X, $ IX, JX, DESCX, IASEED, A, IA, JA, DESCA, $ IBSEED, ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, TRANS, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX*16 A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PZDBLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'H', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * TRANS if TRANS= 'C', A 'Conjugate transpose' is used as the * coefficient matrix in the solve. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,max(max(bwl,bwu)*(max(bwl,bwu)+2),NB))+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,max(max(bwl,bwu)*(max(bwl,bwu)+2),NB))+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER BW, INFO, IPPRODUCT, WORK_MIN DOUBLE PRECISION DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D, PBZTRAN, $ PZMATGEN, ZGAMX2D, ZGEMM, ZGSUM2D, $ ZLASET * .. * .. External Functions .. INTEGER IZAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IZAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'H' ) ) THEN BW = BWL WORK_MIN = MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ELSE BW = MAX(BWL, BWU) WORK_MIN = MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PZBLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (BWL+BWU+1), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'H' )) THEN CALL PZBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PZBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PZGBDCMV( BWL+BWU+1, BWL, BWU, TRANS, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), $ (MAX(BWL,BWU)+2)*MAX(BWL,BWU), INFO ) * * * Regenerate column of B * CALL PZMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PZAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PDZNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PDZNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PZBLASCHK * END scalapack-2.0.2/TESTING/LIN/pzdbmv1.f000644 000766 000024 00000076142 10607174520 017216 0ustar00juliestaff000000 000000 SUBROUTINE PZGBDCMV( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS, $ B, IB, DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZDBTRF and this is stored in AF. If a linear system * is to be solved using PZDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M, $ DU_N_N, DU_P_M, DU_P_N, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM2, IDUM3, J, JA_NEW, $ LLDA, LLDB, MAX_BW, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PZDBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PZDBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZDBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = CZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (MAX_BW+2)*MAX_BW WORK( I ) = CZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( TRANS, 'N' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * * Use main partition in each processor to multiply locally * CALL ZGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, CONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, CZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL ZCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWL-DL_N_N ), 1 ) * CALL ZTRMV( 'U', 'N', 'N', BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL ZGESD2D( ICTXT, BWL, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWL, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = CZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL ZCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * CALL ZTRMV( $ 'L', $ 'N', $ 'N', BWU, $ A( 1 ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 30 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL ZGESD2D( ICTXT, BWU, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL ZGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL ZAXPY( BWL, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL ZGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL ZAXPY( BWU, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWU+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( TRANS, 'C' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL ZLATCPY( 'L', BWU, BWU, A( OFST+1 ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to left * CALL ZTRSD2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL-1 ) * ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL ZLATCPY( 'U', BWL, BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to right * CALL ZTRSD2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL ZGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, CONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, CZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL ZCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWU-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL ZTRRV2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), MAX_BW, MYROW, MYCOL+1 ) * CALL ZTRMV( 'U', 'N', 'N', BWU, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL ZGESD2D( ICTXT, BWU, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWU, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = CZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL ZCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL ZTRRV2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), MAX_BW, MYROW, MYCOL-1 ) * CALL ZTRMV( $ 'L', $ 'N', $ 'N', BWL, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 60 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL ZGESD2D( ICTXT, BWL, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL ZGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL ZAXPY( BWU, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL ZGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL ZAXPY( BWL, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWL+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PZBhBMV1 * END scalapack-2.0.2/TESTING/LIN/pzdtdriver.f000644 000766 000024 00000076650 10363532303 020030 0ustar00juliestaff000000 000000 PROGRAM PZDTDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PZDTDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by ZDT. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM/ZPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER MEMSIZ, NTESTS, ZPLXSZ COMPLEX*16 PADVAL PARAMETER ( ZPLXSZ = 16, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZBMATGEN, $ PZCHEKPAD, PZDTINFO, PZDTLASCHK, PZDTTRF, $ PZDTTRS, PZFILLPAD, PZMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PZLANGE EXTERNAL LSAME, NUMROC, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PZDTINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = 1 IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = 1 IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*INT_ONE-1)/NPCOL + 1 ) $ + INT_ONE NB = MAX( NB, 2*INT_ONE ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*INT_ONE, N ) ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (3), (3), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((3)+10) IMIDPAD = 10 IPOSTPAD = ((3)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, N, (3), $ NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((3)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + (NB+10)*(3) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (12*NPCOL+3*NB) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 8*NPCOL * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((3)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PZLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PZDTLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,NB)+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PZBMATGEN( ICTXT, 'T', 'D', BWL, BWU, N, (3), NB, $ MEM( IPA ), NB+10, 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) CALL PZFILLPAD( ICTXT, NQ, NP, MEM( IPA-IPREPAD ), $ NB+10, IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PZLANGE( 'I', N, $ (3), MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', NQ, NP, $ MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PZDTTRF( N, MEM( IPA+2*( NB+10 ) ), $ MEM( IPA+1*( NB+10 ) ), MEM( IPA ), 1, $ DESCA, MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW ), IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PZDTTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PZCHEKPAD( ICTXT, 'PZDTTRF', NQ, $ NP, MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = 10*NPCOL+4*NRHS * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PZMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PZFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PZDTTRS( TRANS, N, NRHS, $ MEM( IPA+2*( NB+10 ) ), $ MEM( IPA+1*( NB+10 ) ), MEM( IPA ), $ 1, DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PZDTTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZDTTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * * Reset descriptor describing A to 1-by-P grid for * use in banded utility routines * CALL DESCINIT( DESCA2D, (3), N, $ (3), NB, 0, 0, $ ICTXT, (3), IERR( 1 ) ) CALL PZDTLASCHK( 'N', 'D', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl INT_ONE + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE(INT_ONE)) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+INT_ONE) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE(INT_ONE)) $ *DBLE(NRHS)) * * Multiply by 4 to get complex count * NOPS = NOPS * DBLE(4) * * Second calc to represent actual hardware speed * * 2*N_FIRST bwl*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE(BWL)*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST bwl*INT_ONE * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE(BWL) $ *DBLE(INT_ONE)) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB bwl*INT_ONE flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE(BWL) $ *DBLE(INT_ONE)) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( BWL*INT_ONE*BWL/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*BWL*INT_ONE*BWL) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+INT_ONE) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE(INT_ONE)) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * (bwl+INT_ONE) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE(BWL)+ $ DBLE(INT_ONE)))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * (bwl+INT_ONE) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE(BWL)+ $ DBLE(INT_ONE)))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*(BWL*INT_ONE ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * BWL*INT_ONE ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PZDTTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pzdtinfo.f000644 000766 000024 00000044361 10363532303 017462 0ustar00juliestaff000000 000000 SUBROUTINE PZDTINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PZDTINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW NBW = 1 IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'C' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'C' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZDTINFO * END scalapack-2.0.2/TESTING/LIN/pzdtlaschk.f000644 000766 000024 00000030757 10363532303 020000 0ustar00juliestaff000000 000000 SUBROUTINE PZDTLASCHK( SYMM, UPLO, TRANS, N, BWL, BWU, NRHS, X, $ IX, JX, DESCX, IASEED, A, IA, JA, DESCA, $ IBSEED, ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, TRANS, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX*16 A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PZDTLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'H', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * TRANS if TRANS= 'C', A 'Conjugate transpose' is used as the * coefficient matrix in the solve. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,NB)+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,NB)+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER I, START INTEGER BW, INFO, IPPRODUCT, WORK_MIN DOUBLE PRECISION DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D, PBZTRAN, $ PZMATGEN, ZGAMX2D, ZGEMM, ZGSUM2D, $ ZLASET * .. * .. External Functions .. INTEGER IZAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IZAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'H' ) ) THEN BW = BWL START = 1 WORK_MIN = MAX(5,NB)+2*NB ELSE BW = MAX(BWL, BWU) IF( LSAME( UPLO, 'D' )) THEN START = 1 ELSE START = 2 ENDIF WORK_MIN = MAX(5,NB)+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PZTLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (3), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'H' )) THEN CALL PZBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PZBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Matrix formed above has the diagonals shifted from what was * input to the tridiagonal routine. Shift them back. * * Send elements to neighboring processors * IF( MYCOL.GT.0 ) THEN CALL ZGESD2D( ICTXT, 1, 1, A( START+2), LDA, $ MYROW, MYCOL-1 ) ENDIF * IF( MYCOL.LT.NPCOL-1 ) THEN CALL ZGESD2D( ICTXT, 1, 1, $ A( START+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * * Shift local elements * DO 220 I=0,DESCA( NB_ )-1 A( START+2+(I)*LDA ) = A( START+2+(I+1)*LDA ) 220 CONTINUE * DO 230 I=DESCA( NB_ )-1,0,-1 A( START+(I+1)*LDA ) = A( START+(I)*LDA ) 230 CONTINUE * * Receive elements from neighboring processors * IF( MYCOL.LT.NPCOL-1 ) THEN CALL ZGERV2D( ICTXT, 1, 1, $ A( START+2+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * IF( MYCOL.GT.0 ) THEN CALL ZGERV2D( ICTXT, 1, 1, A( START), LDA, $ MYROW, MYCOL-1 ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PZGBDCMV( BWL+BWU+1, BWL, BWU, TRANS, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), $ (INT_ONE+2)*INT_ONE, INFO ) * * * Regenerate column of B * CALL PZMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PZAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PDZNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PDZNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PZTLASCHK * END scalapack-2.0.2/TESTING/LIN/pzgbdriver.f000644 000766 000024 00000100137 10363532303 017775 0ustar00juliestaff000000 000000 PROGRAM PZGBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PZGBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by ZGB. This test driver performs an * A = L*U factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTMEM INTEGER, default = 2048. * INTMEM is the size of the integer workspace used in this * driver as input as the IPIV vector. It represents an * upper bound on NB, the blocksize of the data * distribution. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM/ZPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER INTMEM PARAMETER ( INTMEM = 2048 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER MEMSIZ, NTESTS, ZPLXSZ COMPLEX*16 PADVAL PARAMETER ( ZPLXSZ = 16, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BWL, BWU, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, $ I, IAM, IASEED, IBSEED, ICTXT, ICTXTB, $ IERR_TEMP, IMIDPAD, INFO, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER IPIV(INTMEM) INTEGER BWLVAL( NTESTS ), BWUVAL( NTESTS ), DESCA( 7 ), $ DESCA2D( DLEN_ ), DESCB( 7 ), DESCB2D( DLEN_ ), $ IERR( 1 ), NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZBMATGEN, $ PZCHEKPAD, PZDBLASCHK, PZFILLPAD, PZGBINFO, $ PZGBTRF, PZGBTRS, PZMATGEN, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PZLANGE EXTERNAL LSAME, NUMROC, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PZGBINFO( OUTFILE, NOUT, TRANS, NMAT, NVAL, NTESTS, NBW, $ BWLVAL, BWUVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, $ NRVAL, NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, $ NTESTS, QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BWL = BWLVAL( BW_NUM ) IF( BWL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Lower Band', 'bwl', BWL IERR( 1 ) = 1 END IF * BWU = BWUVAL( BW_NUM ) IF( BWU.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Upper Band', 'bwu', BWU IERR( 1 ) = 1 END IF * IF( BWL.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * IF( BWU.GT.N-1 ) THEN IF( IAM.EQ.0 ) THEN IERR( 1 ) = 1 ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*(BWL+BWU)-1)/NPCOL + 1 ) $ + (BWL+BWU) NB = MAX( NB, 2*(BWL+BWU) ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 * IF( NB.GT.INTMEM ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) THEN WRITE( NOUT,* )'You have chosen an ' $ ,'NB > INTMEM in the driver.' WRITE(NOUT, *)'Please edit the driver ' $ ,'and increase the value of INTMEM' ENDIF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (2*BWL+2*BWU+1), (2*BWL+2*BWU+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((2*BWL+2*BWU+1)+10) IMIDPAD = 10 IPOSTPAD = ((2*BWL+2*BWU+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (2*BWL+2*BWU+1), N, $ (2*BWL+2*BWU+1), NB, 0, 0, $ ICTXT,((2*BWL+2*BWU+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((2*BWL+2*BWU+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 1 * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((2*BWL+2*BWU+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PZLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PZDBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(MAX(BWL,BWU)*(MAX(BWL,BWU)+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PZBMATGEN( ICTXT, 'G', 'N', BWL, BWU, N, $ (2*BWL+2*BWU+1), NB, MEM( IPA+BWL+BWU ), $ ((2*BWL+2*BWU+1)+10), 0, 0, IASEED, $ MYROW, MYCOL, NPROW, NPCOL ) * CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((2*BWL+2*BWU+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PZLANGE( '1', (2*BWL+2*BWU+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((2*BWL+2*BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PZGBTRF( N, BWL, BWU, MEM( IPA ), 1, DESCA, IPIV, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PZGBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PZCHEKPAD( ICTXT, 'PZGBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((2*BWL+2*BWU+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = NRHS*(NB+2*BWL+4*BWU) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PZMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PZFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PZGBTRS( TRANS, N, BWL, BWU, NRHS, MEM( IPA ), $ 1, DESCA, IPIV, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PZGBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PZDBLASCHK( 'N', 'N', TRANS, $ N, BWL, BWU, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA+BWL+BWU ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * 2 N bwl (bwl+bwu) + N (bwl) flops * for LU factorization * NOPS = 2*(DBLE(N)*DBLE(BWL)* $ DBLE((BWL+BWU))) + $ (DBLE(N)*DBLE(BWL)) * * nrhs * 2 N*(bwl+(bwl+bwu)) flops for LU solve. * NOPS = NOPS + $ 2 * (DBLE(N)*(DBLE(BWL)+DBLE((BWL+BWU))) $ *DBLE(NRHS)) * * Multiply by 4 to get complex count * NOPS = NOPS * DBLE(4) * * Second calc to represent actual hardware speed * * 2*N_FIRST (bwl+bwu)*bwu Flops for LU * factorization in proc 1 * NOPS2 = 2*( (DBLE(N_FIRST)* $ DBLE((BWL+BWU))*DBLE(BWU))) * IF ( NPROCS_REAL .GT. 1) THEN * 8 N_LAST (bwl+bwu)*(bwl+bwu) * flops for LU and spike * calc in last processor * NOPS2 = NOPS2 + $ 8*( (DBLE(N_LAST)*DBLE((BWL+BWU)) $ *DBLE((BWL+BWU))) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 8 NB (bwl+bwu)*(bwl+bwu) flops for LU and spike * calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 8*( (DBLE(NB)*DBLE((BWL+BWU)) $ *DBLE((BWL+BWU))) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-1 ) * $ ( (BWL+BWU)*(BWL+BWU)*(BWL+BWU)/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ 2*( NPROCS_REAL-2 ) * $ (2*(BWL+BWU)*(BWL+BWU)*(BWL+BWU)) ENDIF * * Solve stage * * nrhs*2 n_first* * (bwl+(bwl+bwu)) * flops for L,U solve in proc 1. * NOPS2 = NOPS2 + $ 2* $ DBLE(N_FIRST)* $ DBLE(NRHS) * $ ( DBLE(BWL)+DBLE((BWL+BWU))) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*2 n_last * ((bwl+bwu)+(bwl+bwu)) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ 4* $ (DBLE(N_LAST)*(DBLE((BWL+BWU))+ $ DBLE((BWL+BWU))))*DBLE(NRHS) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2*nrhs*2 NB * ((bwl+bwu)+(bwl+bwu)) * flops for LU solve in other procs * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( (DBLE(NB)*(DBLE((BWL+BWU))+ $ DBLE((BWL+BWU))))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1)*2*((BWL+BWU)*(BWL+BWU) ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * $ ( 6 * (BWL+BWU)*(BWL+BWU) ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', TRANS, $ N, $ BWL, BWU, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME TR N BWL BWU NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- --- ---- ----- ---- ---- -------- ', $ '-------- -------- -------- ------' ) 9993 FORMAT( A4,1X,A1,2X,I6,1X,I3,1X,I3,1X,I4,1X,I5, $ 1X,I4,1X,I4,1X,F9.3, $ F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PZGBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pzgbinfo.f000644 000766 000024 00000044340 10363532303 017440 0ustar00juliestaff000000 000000 SUBROUTINE PZGBINFO( SUMMRY, NOUT, TRANS, NMAT, NVAL, LDNVAL, NBW, $ BWLVAL, BWUVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, $ NNR, NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWLVAL( LDBWVAL),BWUVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PZGBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWLVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BWL (number of subdiagonals in matrix) to run * the code with. * BWUVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of supdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLU.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get TRANS * READ( NIN, FMT = * ) TRANS * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWLVAL( I ), I = 1, NBW ) READ( NIN, FMT = * ) ( BWUVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( TRANS, 'N' ) ) THEN WORK( I ) = 1 ELSE TRANS = 'C' WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWLVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NBW, BWUVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| /'// $ ' (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bwl, bwu : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bwl ', ( BWLVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWLVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'bwu ', ( BWUVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWUVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN TRANS = 'N' ELSE TRANS = 'C' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS I = I + NBW * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWLVAL, 1 ) I = I + NBW CALL ICOPY( NBW, WORK( I ), 1, BWUVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZGBINFO * END scalapack-2.0.2/TESTING/LIN/pzgbmv1.f000644 000766 000024 00000076142 10607174520 017221 0ustar00juliestaff000000 000000 SUBROUTINE PZGBDCMV( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS, $ B, IB, DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZDBTRF and this is stored in AF. If a linear system * is to be solved using PZDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M, $ DU_N_N, DU_P_M, DU_P_N, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM2, IDUM3, J, JA_NEW, $ LLDA, LLDB, MAX_BW, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PZDBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PZDBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZDBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = CZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (MAX_BW+2)*MAX_BW WORK( I ) = CZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( TRANS, 'N' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * * Use main partition in each processor to multiply locally * CALL ZGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, CONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, CZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL ZCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWL-DL_N_N ), 1 ) * CALL ZTRMV( 'U', 'N', 'N', BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL ZGESD2D( ICTXT, BWL, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWL, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = CZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL ZCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * CALL ZTRMV( $ 'L', $ 'N', $ 'N', BWU, $ A( 1 ), LLDA-1, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 30 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL ZGESD2D( ICTXT, BWU, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL ZGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL ZAXPY( BWL, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL ZGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL ZAXPY( BWU, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWU+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( TRANS, 'C' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) * DU_P_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) DU_P_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) * DU_N_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DU_N_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL ZLATCPY( 'L', BWU, BWU, A( OFST+1 ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to left * CALL ZTRSD2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL-1 ) * ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL ZLATCPY( 'U', BWL, BWL, $ A( LLDA*( NUMROC_SIZE-BWL )+1+BWU+BWL ), $ LLDA-1, WORK( 1 ), MAX_BW ) * * Send the triangle to neighboring processor to right * CALL ZTRSD2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), $ MAX_BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL ZGBMV( TRANS, NUMROC_SIZE, NUMROC_SIZE, BWL, BWU, CONE, $ A( OFST+1 ), LLDA, B(PART_OFFSET+1), 1, CZERO, $ X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL ZCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( MAX_BW*MAX_BW+1+BWU-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL ZTRRV2D(ICTXT, 'U', 'N', $ BWU, BWU, $ WORK( 1 ), MAX_BW, MYROW, MYCOL+1 ) * CALL ZTRMV( 'U', 'N', 'N', BWU, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( MAX_BW*MAX_BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL ZGESD2D( ICTXT, BWU, 1, $ WORK( MAX_BW*MAX_BW+1 ), BWU, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, MAX_BW*( MAX_BW+2 ) WORK( I ) = CZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL ZCOPY( DU_P_N, B( 1 ), 1, $ WORK( MAX_BW*MAX_BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL ZTRRV2D(ICTXT, 'L', 'N', $ BWL, BWL, $ WORK( 1 ), MAX_BW, MYROW, MYCOL-1 ) * CALL ZTRMV( $ 'L', $ 'N', $ 'N', BWL, $ WORK( 1 ), MAX_BW, $ WORK( MAX_BW*MAX_BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DU_P_N .GT. DU_P_M ) THEN DO 60 I=1, DU_P_N-DU_P_M WORK( MAX_BW*MAX_BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL ZGESD2D( ICTXT, BWL, 1, WORK(MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL ZGERV2D( ICTXT, BWU, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWU, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL ZAXPY( BWU, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL ZGERV2D( ICTXT, BWL, 1, WORK( MAX_BW*MAX_BW+1 ), $ BWL, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL ZAXPY( BWL, CONE, $ WORK( MAX_BW*MAX_BW+1 ), 1, $ X( NUMROC_SIZE-BWL+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PZBhBMV1 * END scalapack-2.0.2/TESTING/LIN/pzgelqrv.f000644 000766 000024 00000023112 10363532303 017466 0ustar00juliestaff000000 000000 SUBROUTINE PZGELQRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGELQRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from L, Q * computed by PZGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors L and Q computed * by PZGELQF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors computed by PZGELQF. TAU * is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IL, IN, $ IPT, IPV, IPW, J, JJA, JV, K, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PZLACPY, $ PZLARFB, PZLARFT, PZLASET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IL = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) * ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IL, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * CALL DESCSET( DESCV, DESCA( MB_ ), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * DO 10 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+K-I, DESCA( MB_ ) ) J = JA + I - IA JV = 1 + I - IA + ICOFF * * Compute upper triangular matrix T * CALL PZLARFT( 'Forward', 'Rowwise', N-J+JA, IB, A, I, J, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'Upper', IB, N-J+JA, A, I, J, DESCA, WORK( IPV ), $ 1, JV, DESCV ) CALL PZLASET( 'Lower', IB, N-J+JA, ZERO, ONE, WORK( IPV ), 1, $ JV, DESCV ) * * Zeroes the strict upper triangular part of sub( A ) to get * block column of L * CALL PZLASET( 'Upper', IB, N-J+JA-1, ZERO, ZERO, A, I, J+1, $ DESCA ) * * Apply block Householder transformation * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-I+IA, N-J+JA, IB, WORK( IPV ), 1, $ JV, DESCV, WORK( IPT ), A, I, J, DESCA, $ WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) * 10 CONTINUE * * Handle first block separately * IB = IN - IA + 1 * * Compute upper triangular matrix T * CALL PZLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, TAU, $ WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'Upper', IB, N, A, IA, JA, DESCA, WORK( IPV ), 1, $ ICOFF+1, DESCV ) CALL PZLASET( 'Lower', IB, N, ZERO, ONE, WORK, 1, ICOFF+1, DESCV ) * * Zeroes the strict upper triangular part of sub( A ) to get * block column of L * CALL PZLASET( 'Upper', IB, N-1, ZERO, ZERO, A, IA, JA+1, DESCA ) * * Apply block Householder transformation * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M, N, IB, WORK( IPV ), 1, ICOFF+1, DESCV, $ WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PZGELQRV * END scalapack-2.0.2/TESTING/LIN/pzgeqlrv.f000644 000766 000024 00000024102 10363532303 017466 0ustar00juliestaff000000 000000 SUBROUTINE PZGEQLRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEQLRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from L, Q * computed by PZGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors L and Q computed * by PZGEQLF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(N_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PZGEQLF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK = NB_A * ( 2*Mp0 + Nq0 + NB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IIA, IPT, IPV, IPW, IROFF, $ IV, J, JB, JJA, JN, K, MP, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PB_TOPGET, $ PB_TOPSET, PZLACPY, PZLARFB, PZLARFT, $ PZLASET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) CALL INFOG2L( IA, JA+N-K, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPV = 1 IPT = IPV + MP * DESCA( NB_ ) IPW = IPT + DESCA( NB_ ) * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * CALL DESCSET( DESCV, M+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, MP ) ) * * Handle first block separately * IV = 1 + M - K + IROFF JB = JN - JA - N + K + 1 * * Compute upper triangular matrix T * CALL PZLARFT( 'Backward', 'Columnwise', M-N+JN-JA+1, JB, A, IA, $ JA+N-K, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'All', M-N+JN-JA+1, JB, A, IA, JA+N-K, DESCA, $ WORK( IPV ), IROFF+1, 1, DESCV ) CALL PZLASET( 'Lower', JB, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeoes the strict upper triangular part of A to get block * row of L * CALL PZLASET( 'All', M-K, JB, ZERO, ZERO, A, IA, JA+N-K, $ DESCA ) CALL PZLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA+M-K, $ JA+N-K+1, DESCA ) * * Apply block Householder transformation * CALL PZLARFB( 'Left', 'No transpose', 'Backward', 'Columnwise', $ M-N+JN-JA+1, JN-JA+1, JB, WORK( IPV ), IROFF+1, 1, $ DESCV, WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) * * Loop over the remaining column blocks * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) IV = 1 + M - N + J - JA + IROFF * * Compute upper triangular matrix T * CALL PZLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, A, IA, $ J, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'All', M-N+J+JB-JA, JB, A, IA, J, DESCA, $ WORK( IPV ), IROFF+1, 1, DESCV ) CALL PZLASET( 'Lower', JB, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeoes the strict upper triangular part of sub( A ) to get * block row of L * CALL PZLASET( 'All', M-N+J-JA, JB, ZERO, ZERO, A, IA, J, $ DESCA ) CALL PZLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA+M-N+J-JA, $ J+1, DESCA ) * * Apply block Householder transformation * CALL PZLARFB( 'Left', 'No transpose', 'Backward', 'Columnwise', $ M-N+J+JB-JA, J+JB-JA, JB, WORK( IPV ), IROFF+1, $ 1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PZGEQLRV * END scalapack-2.0.2/TESTING/LIN/pzgeqrrv.f000644 000766 000024 00000023503 10363532303 017500 0ustar00juliestaff000000 000000 SUBROUTINE PZGEQRRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEQRRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from Q, R * computed by PZGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors Q and R computed * by PZGEQRF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors computed by PZGEQRF. TAU * is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK = NB_A * ( 2*Mp0 + Nq0 + NB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, IIA, IPT, IPV, IPW, $ IROFF, IV, J, JB, JJA, JL, JN, K, MP, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PB_TOPGET, $ PB_TOPSET, PZLACPY, PZLARFB, PZLARFT, $ PZLASET * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPV = 1 IPT = IPV + MP * DESCA( NB_ ) IPW = IPT + DESCA( NB_ ) * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * K = MIN( M, N ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JL = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) * CALL DESCSET( DESCV, M+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, INDXG2P( JL, DESCA( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ), ICTXT, $ MAX( 1, MP ) ) * DO 10 J = JL, JN+1, -DESCA( NB_ ) JB = MIN( JA+K-J, DESCA( NB_ ) ) I = IA + J - JA IV = 1 + J - JA + IROFF * * Compute upper triangular matrix T * CALL PZLARFT( 'Forward', 'Columnwise', M-I+IA, JB, A, I, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'Lower', M-I+IA, JB, A, I, J, DESCA, WORK( IPV ), $ IV, 1, DESCV ) CALL PZLASET( 'Upper', M-I+IA, JB, ZERO, ONE, WORK( IPV ), IV, $ 1, DESCV ) * * Zeroes the strict lower triangular part of sub( A ) to get * block column of R * CALL PZLASET( 'Lower', M-I+IA-1, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Apply block Householder transformation * CALL PZLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M-I+IA, N-J+JA, JB, WORK( IPV ), IV, 1, DESCV, $ WORK( IPT ), A, I, J, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle first block separately * JB = JN - JA + 1 * * Compute upper triangular matrix T * CALL PZLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPV ), $ IROFF+1, 1, DESCV ) CALL PZLASET( 'Upper', M, JB, ZERO, ONE, WORK, IROFF+1, 1, DESCV ) * * Zeroes the strict lower triangular part of sub( A ) to get block * column of R * CALL PZLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Apply block Householder transformation * CALL PZLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', M, $ N, JB, WORK( IPV ), IROFF+1, 1, DESCV, WORK( IPT ), $ A, IA, JA, DESCA, WORK( IPW ) ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PZGEQRRV * END scalapack-2.0.2/TESTING/LIN/pzgerqrv.f000644 000766 000024 00000023765 10363532303 017512 0ustar00juliestaff000000 000000 SUBROUTINE PZGERQRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGERQRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from R, Q * computed by PZGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors R and Q computed * by PZGERQF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCr(M_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PZGERQF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IN, $ IPT, IPV, IPW, JJA, JV, K, MYCOL, MYROW, NPCOL, $ NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PB_TOPGET, $ PB_TOPSET, PZLACPY, PZLARFB, PZLARFT, $ PZLASET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * K = MIN( M, N ) IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) * ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA+M-K, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * CALL DESCSET( DESCV, DESCA( MB_), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * * Handle first block separately * IB = IN - IA - M + K + 1 JV = 1 + N - K + ICOFF * * Compute upper triangular matrix T * CALL PZLARFT( 'Backward', 'Rowwise', N-M+IN-IA+1, IB, A, IA+M-K, $ JA, DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'All', IB, N-M+IN-IA+1, A, IA+M-K, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) CALL PZLASET( 'Upper', IB, IB, ZERO, ONE, WORK( IPV ), 1, JV, $ DESCV ) * * Zeoes the strict lower triangular part of sub( A ) to get block * column of R * CALL PZLASET( 'All', IB, N-K, ZERO, ZERO, A, IA+M-K, JA, $ DESCA ) CALL PZLASET( 'Lower', IB-1, IB, ZERO, ZERO, A, IA+M-K+1, $ JA+N-K, DESCA ) * * Apply block Householder transformation * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', IN-IA+1, N-M+IN-IA+1, IB, WORK( IPV ), 1, $ ICOFF+1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * * Loop over the remaining row blocks * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) JV = 1 + N - M + I - IA + ICOFF * * Compute upper triangular matrix T * CALL PZLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, I, JA, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'All', IB, N-M+I+IB-IA, A, I, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) CALL PZLASET( 'Upper', IB, IB, ZERO, ONE, WORK( IPV ), 1, JV, $ DESCV ) * * Zeoes the strict Lower triangular part of sub( A ) to get * block column of R * CALL PZLASET( 'All', IB, N-M+I-IA, ZERO, ZERO, A, I, JA, $ DESCA ) CALL PZLASET( 'Lower', IB-1, IB, ZERO, ZERO, A, I+1, $ JA+N-M+I-IA, DESCA ) * * Apply block Householder transformation * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', I+IB-IA, N-M+I+IB-IA, IB, WORK( IPV ), $ 1, ICOFF+1, DESCV, WORK( IPT ), A, IA, JA, DESCA, $ WORK( IPW ) ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PZGERQRV * END scalapack-2.0.2/TESTING/LIN/pzgetrrv.f000644 000766 000024 00000030102 10363532303 017474 0ustar00juliestaff000000 000000 SUBROUTINE PZGETRRV( M, N, A, IA, JA, DESCA, IPIV, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZGETRRV reforms sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from the * triangular matrices L and U returned by PZGETRF. It multiplies * an upper triangular matrix stored in the upper triangle of sub( A ) * times the unit lower triangular matrix stored in the lower triangle. * To accomplish this, the routine basically performs the PZGETRF * routine in reverse. * * It computes L*U first, and then apply P: P*L*U => sub( A ). In the * J-th loop, the block column (or column panel), which has the lower * triangular unit matrix L is multiplied with the block row (or row * panel), which contains the upper triangular matrix U. * * ( L1 ) ( 0 0 ) ( L1*U1 L1*U2 ) * A` = L * U + A` = ( ) * (U1 U2) + ( ) = ( ) * ( L2 ) ( 0 A`) ( L2*U1 L2*U2+A` ) * * where L1 is a lower unit triangular matrix and U1 is an upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the distributed matrix sub( A ) * contains the the factors L and U from the factorization * sub( A ) = P*L*U; the unit diagonal elements of L are not * stored. On exit, the original distributed matrix sub( A ) * is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX*16 array of dimension (LWORK) * LWORK >= MpA0 * NB_A + NqA0 * MB_A, where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * WORK is used to store a block of columns of L, and a block of * rows of U. INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IL, IPL, IPU, IROFF, J, $ JB, JL, JN, MN, MP, MYCOL, MYROW, NPCOL, NPROW * .. Local Arrays .. INTEGER DESCIP( DLEN_ ), DESCL( DLEN_ ), $ DESCU( DLEN_ ), IDUM( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PB_TOPGET, PB_TOPSET, $ PZGEMM, PZLACPY, PZLAPIV, PZLASET * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPL = 1 IPU = IPL + MP * DESCA( NB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Define array descriptors for L and U * MN = MIN( M, N ) IL = MAX( ( ( IA+MN-2 ) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) JL = MAX( ( ( JA+MN-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+MN-1 ) IAROW = INDXG2P( IL, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) * CALL DESCSET( DESCL, IA+M-IL, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, MP ) ) * CALL DESCSET( DESCU, DESCA( MB_ ), JA+N-JL, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * CALL DESCSET( DESCIP, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ NUMROC( DESCA( M_ ), DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) + DESCA( MB_ ) ) * * DO 10 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+MN-J, DESCA( NB_ ) ) * * Copy unit lower triangular part of sub( A ) into WORK * CALL PZLACPY( 'Lower', M-IL+IA, JB, A, IL, J, DESCA, $ WORK( IPL ), 1, 1, DESCL ) CALL PZLASET( 'Upper', M-IL+IA, JB, ZERO, ONE, WORK( IPL ), $ 1, 1, DESCL ) * * Copy upper triangular part of sub( A ) into WORK(IPU) * CALL PZLACPY( 'Upper', JB, JA+N-J, A, IL, J, DESCA, $ WORK( IPU ), 1, 1, DESCU ) CALL PZLASET( 'Lower', JB-1, JA+N-J, ZERO, ZERO, $ WORK( IPU ), 2, 1, DESCU ) * * Zero the strict lower triangular piece of the current block. * CALL PZLASET( 'Lower', IA+M-IL-1, JB, ZERO, ZERO, A, IL+1, J, $ DESCA ) * * Zero the upper triangular piece of the current block. * CALL PZLASET( 'Upper', JB, JA+N-J, ZERO, ZERO, A, IL, J, $ DESCA ) * * Update the matrix sub( A ). * CALL PZGEMM( 'No transpose', 'No transpose', IA+M-IL, $ JA+N-J, JB, ONE, WORK( IPL ), 1, 1, DESCL, $ WORK( IPU ), 1, 1, DESCU, ONE, A, IL, J, DESCA ) * IL = IL - DESCA( MB_ ) DESCL( M_ ) = DESCL( M_ ) + DESCL( MB_ ) DESCL( RSRC_ ) = MOD( DESCL( RSRC_ ) + NPROW - 1, NPROW ) DESCL( CSRC_ ) = MOD( DESCL( CSRC_ ) + NPCOL - 1, NPCOL ) DESCU( N_ ) = DESCU( N_ ) + DESCU( NB_ ) DESCU( RSRC_ ) = DESCL( RSRC_ ) DESCU( CSRC_ ) = DESCL( CSRC_ ) * 10 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Copy unit lower triangular part of sub( A ) into WORK * CALL PZLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPL ), $ 1, 1, DESCL ) CALL PZLASET( 'Upper', M, JB, ZERO, ONE, WORK( IPL ), 1, 1, $ DESCL ) * * Copy upper triangular part of sub( A ) into WORK(IPU) * CALL PZLACPY( 'Upper', JB, N, A, IA, JA, DESCA, WORK( IPU ), 1, $ 1, DESCU ) CALL PZLASET( 'Lower', JB-1, N, ZERO, ZERO, WORK( IPU ), 2, 1, $ DESCU ) * * Zero the strict lower triangular piece of the current block. * CALL PZLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Zero the upper triangular piece of the current block. * CALL PZLASET( 'Upper', JB, N, ZERO, ZERO, A, IA, JA, DESCA ) * * Update the matrix sub( A ). * CALL PZGEMM( 'No transpose', 'No transpose', M, N, JB, ONE, $ WORK( IPL ), 1, 1, DESCL, WORK( IPU ), 1, 1, $ DESCU, ONE, A, IA, JA, DESCA ) * * Apply pivots so that sub( A ) = P*L*U * CALL PZLAPIV( 'Backward', 'Row', 'Col', MIN( M, N ), N, A, IA, JA, $ DESCA, IPIV, IA, 1, DESCIP, IDUM ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PZGETRRV * END scalapack-2.0.2/TESTING/LIN/pzinvchk.f000644 000766 000024 00000034051 10363532303 017454 0ustar00juliestaff000000 000000 SUBROUTINE PZINVCHK( MATTYP, N, A, IA, JA, DESCA, IASEED, ANORM, $ FRESID, RCOND, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, IASEED, JA, N DOUBLE PRECISION ANORM, FRESID, RCOND * .. * .. Array Arguments .. CHARACTER*3 MATTYP INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZINVCHK computes the scaled residual * * || sub( A ) * inv( sub( A ) ) - I || / ( || sub( A ) || * N * eps ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1). to check the result * returned by the matrix inversion routines. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MATTYP (global input) CHARACTER*3 * The type of the distributed matrix to be generated: * if MATTYP = 'GEN' then GENeral matrix, * if MATTYP = 'UTR' then Upper TRiangular matrix, * if MATTYP = 'LTR' then Lower TRiangular matrix, * if MATTYP = 'UPD' then (Upper) Hermitian Positive Definite, * if MATTYP = 'LPD' then (Lower) Hermitian Positive Definite. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of local dimension (LLD_A, LOCc(JA+N-1)). On * entry, sub( A ) contains the distributed matrix inverse * computed by PZGETRI, PZPOTRI or PZTRTRI. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * Seed for the random generation of sub( A ). * * ANORM (global input) DOUBLE PRECISION * The 1-norm of the original matrix sub( A ). * * FRESID (global output) DOUBLE PRECISION * The inversion residual. * * RCOND (global output) DOUBLE PRECISION * The condition number of the original distributed matrix. * RCOND = || sub( A ) ||.|| sub( A )^{-1} || where ||A|| * denotes the 1-norm of A. * * WORK (local workspace) COMPLEX*16 array, dimension * MAX(2*LOCr(N_A+MOD(IA-1,MB_A))*MB_A, LDW) * where LDW is the workspace requirement for the norm computa- * tions, see PZLANGE, PZLANHE, PZLANSY and PZLANTR. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. CHARACTER AFORM, DIAG, UPLO INTEGER ICTXT, ICURCOL, ICURROW, II, IIA, IPW, IROFF, $ IW, J, JB, JJA, JN, KK, MYCOL, MYROW, NP, $ NPCOL, NPROW DOUBLE PRECISION AUXNORM, EPS, NRMINVAXA, TEMP * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PZGEMM, $ PZHEMM, PZLASET, PZMATGEN, PZTRMM * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE, PZLANHE, PZLANTR EXTERNAL ICEIL, LSAMEN, NUMROC, PDLAMCH, PZLANGE, $ PZLANHE, PZLANSY, PZLANTR * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * EPS = PDLAMCH( DESCA( CTXT_ ), 'eps' ) * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Compute the condition number * IF( LSAMEN( 1, MATTYP( 1:1 ), 'U' ) ) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * AFORM = 'N' DIAG = 'D' AUXNORM = PZLANGE( '1', N, N, A, IA, JA, DESCA, WORK ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN * AFORM = 'N' DIAG = 'D' AUXNORM = PZLANTR( '1', UPLO, 'Non unit', N, N, A, IA, JA, $ DESCA, WORK ) ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * AFORM = 'H' DIAG = 'D' AUXNORM = PZLANHE( '1', UPLO, N, A, IA, JA, DESCA, WORK ) * END IF RCOND = ANORM*AUXNORM * * Compute inv(A)*A * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ ICURROW, ICURCOL ) * * Define array descriptor for working array WORK * IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, ICURROW, NPROW ) CALL DESCSET( DESCW, N+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), ICURROW, ICURCOL, DESCA( CTXT_ ), $ MAX( 1, NP ) ) IPW = DESCW( LLD_ ) * DESCW( NB_ ) + 1 * IF( MYROW.EQ.ICURROW ) THEN II = IROFF + 1 NP = NP - IROFF ELSE II = 1 END IF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN - JA + 1 * * Handle first block separately, regenerate a block of columns of A * IW = IROFF + 1 IF( MYCOL.EQ.ICURCOL ) THEN IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), WORK, $ DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, IIA-1, NP, $ JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) IF( LSAMEN( 3, MATTYP, 'UTR' ) ) THEN CALL PZLASET( 'Lower', N-1, JB, ZERO, ZERO, WORK, IW+1, $ 1, DESCW ) ELSE CALL PZLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, WORK, IW, $ 2, DESCW ) END IF ELSE CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), WORK( IPW ), $ DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, $ IIA-1, NP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) END IF END IF * * Multiply A^{-1}*A * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * CALL PZGEMM( 'No tranpose', 'No transpose', N, JB, N, ONE, A, $ IA, JA, DESCA, WORK( IPW ), IW, 1, DESCW, ZERO, $ WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN * CALL PZTRMM( 'Left', UPLO, 'No tranpose', 'Non unit', N, JB, $ ONE, A, IA, JA, DESCA, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * CALL PZHEMM( 'Left', UPLO, N, JB, ONE, A, IA, JA, DESCA, $ WORK(IPW), IW, 1, DESCW, ZERO, WORK, IW, 1, $ DESCW ) * END IF * * subtract the identity matrix to the diagonal block of these cols * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 10 KK = 0, JB-1 WORK( II+KK*(DESCW(LLD_)+1) ) = $ WORK( II+KK*(DESCW( LLD_ )+1) )-ONE 10 CONTINUE END IF * NRMINVAXA = PZLANGE( '1', N, JB, WORK, IW, 1, DESCW, WORK( IPW ) ) * IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJA = JJA + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) DESCW( CSRC_ ) = ICURCOL * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) * JB = MIN( N-J+JA, DESCA( NB_ ) ) * * regenerate a block of columns of A * IF( MYCOL.EQ.ICURCOL ) THEN IF( LSAMEN( 2, MATTYP( 2:3 ), 'TR' ) ) THEN CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCW( MB_ ), DESCW( NB_ ), $ WORK, DESCW( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), $ IASEED, IIA-1, NP, JJA-1, JB, MYROW, $ MYCOL, NPROW, NPCOL ) IF( LSAMEN( 3, MATTYP, 'UTR' ) ) THEN CALL PZLASET( 'Lower', JA+N-J-1, JB, ZERO, ZERO, $ WORK, IW+J-JA+1, 1, DESCW ) ELSE CALL PZLASET( 'All', J-JA, JB, ZERO, ZERO, WORK, IW, $ 1, DESCW ) CALL PZLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, $ WORK, IW+J-JA, 2, DESCW ) END IF ELSE CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCW( MB_ ), DESCW( NB_ ), $ WORK( IPW ), DESCW( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, NP, $ JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) END IF END IF * * Multiply A^{-1}*A * IF( LSAMEN( 3, MATTYP, 'GEN' ) ) THEN * CALL PZGEMM( 'No tranpose', 'No transpose', N, JB, N, ONE, $ A, IA, JA, DESCA, WORK( IPW ), IW, 1, DESCW, $ ZERO, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP(2:3), 'TR' ) ) THEN * CALL PZTRMM( 'Left', UPLO, 'No tranpose', 'Non unit', N, JB, $ ONE, A, IA, JA, DESCA, WORK, IW, 1, DESCW ) * ELSE IF( LSAMEN( 2, MATTYP( 2:3 ), 'PD' ) ) THEN * CALL PZHEMM( 'Left', UPLO, N, JB, ONE, A, IA, JA, DESCA, $ WORK(IPW), IW, 1, DESCW, ZERO, WORK, IW, 1, $ DESCW ) * END IF * * subtract the identity matrix to the diagonal block of these * cols * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 20 KK = 0, JB-1 WORK( II+KK*(DESCW( LLD_ )+1) ) = $ WORK( II+KK*(DESCW( LLD_ )+1) ) - ONE 20 CONTINUE END IF * * Compute the 1-norm of these JB cols * TEMP = PZLANGE( '1', N, JB, WORK, IW, 1, DESCW, WORK( IPW ) ) NRMINVAXA = MAX( TEMP, NRMINVAXA ) * IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJA = JJA + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) DESCW( CSRC_ ) = ICURCOL * 30 CONTINUE * * Compute the scaled residual * FRESID = NRMINVAXA / ( N * EPS * ANORM ) * RETURN * * End of PZINVCHK * END scalapack-2.0.2/TESTING/LIN/pzinvdriver.f000644 000766 000024 00000101713 10430450246 020202 0ustar00juliestaff000000 000000 PROGRAM PZINVDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PZINVDRIVER is the main test program for the COMPLEX*16 * SCALAPACK matrix inversion routines. This test driver computes the * inverse of different kind of matrix and tests the results. * * The program must be driven by a short data file. An annotated example * of a data file can be obtained by deleting the first 3 characters * from the following 14 lines: * 'ScaLAPACK Matrix Inversion Testing input file' * 'PVM machine.' * 'INV.out' output file name (if any) * 6 device out * 5 number of matrix types (next line) * 'GEN' 'UTR' 'LTR' 'UPD' LPD' GEN, UTR, LTR, UPD, LPD * 4 number of problems sizes * 1000 2000 3000 4000 values of N * 3 number of NB's * 4 30 35 values of NB * 2 number of process grids (ordered P & Q) * 4 2 values of P * 4 4 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ, DBLESZ and ZPLXSZ indicate the length in bytes on * the given platform for an integer, a double precision real * and a double precision complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, INTGSZ, MEMSIZ, NTESTS, TOTMEM, ZPLXSZ COMPLEX*16 PADVAL, ZERO PARAMETER ( DBLESZ = 8, INTGSZ = 4, TOTMEM = 2000000, $ ZPLXSZ = 16, MEMSIZ = TOTMEM / ZPLXSZ, $ NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER UPLO CHARACTER*3 MTYP CHARACTER*6 PASSED CHARACTER*80 OUTFILE LOGICAL CHECK INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, $ IPPIV, IPREPAD, IPOSTPAD, IPIW, IPW, ITEMP, J, $ K, KTESTS, KPASS, KFAIL, KSKIP, L, LCM, LIPIV, $ LIWORK, LWORK, MYCOL, MYROW, N, NB, NGRIDS, $ NMAT, NMTYP, NNB, NOUT, NP, NPCOL, NPROCS, $ NPROW, NQ, WORKIINV, WORKINV, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, RCOND, TMFLOPS * .. * .. Local Arrays .. CHARACTER*3 MATTYP( NTESTS ) INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), $ QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZCHEKPAD, $ PZFILLPAD, PZGETRF, PZGETRI, $ PZINVCHK, PZINVINFO, PZLASET, $ PZMATGEN, PZPOTRF, PZPOTRI, $ PZTRTRI, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PZLANGE, PZLANHE, PZLANSY, PZLANTR EXTERNAL ICEIL, ILCM, LSAMEN, NUMROC, PZLANGE, $ PZLANHE, PZLANSY, PZLANTR * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP /4*0/ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PZINVINFO( OUTFILE, NOUT, NMTYP, MATTYP, NTESTS, NMAT, NVAL, $ NTESTS, NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Loop over the different matrix types * DO 40 I = 1, NMTYP * MTYP = MATTYP( I ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a general matrix.' ELSE IF( LSAMEN( 3, MTYP, 'UTR' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is an upper triangular matrix.' ELSE IF( LSAMEN( 3, MTYP, 'LTR' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a lower triangular matrix.' ELSE IF( LSAMEN( 3, MTYP, 'UPD' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a Hermitian positive definite matrix.' WRITE( NOUT, FMT = 9986 ) $ 'Only the upper triangular part will be '// $ 'referenced.' ELSE IF( LSAMEN( 3, MTYP, 'LPD' ) ) THEN WRITE( NOUT, FMT = 9986 ) $ 'A is a Hermitian positive definite matrix.' WRITE( NOUT, FMT = 9986 ) $ 'Only the lower triangular part will be '// $ 'referenced.' END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 J = 1, NGRIDS * NPROW = PVAL( J ) NPCOL = QVAL( J ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 K = 1, NMAT * N = NVAL( K ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 L = 1, NNB * NB = NBVAL( L ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for ScaLAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 * LCM = ILCM( NPROW, NPCOL ) IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * Pivots are needed by LU factorization * IPPIV = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + $ IPREPAD LIPIV = ICEIL( INTGSZ * ( NP + NB ), ZPLXSZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * LWORK = MAX( 1, NP * DESCA( NB_ ) ) WORKINV = LWORK + IPOSTPAD * * Figure the amount of workspace required by the * general matrix inversion * IF( NPROW.EQ.NPCOL ) THEN LIWORK = NQ + DESCA( NB_ ) ELSE * * change the integer workspace needed for PDGETRI * LIWORK = MAX( DESCA( NB_ ), DESCA( MB_ ) * * $ ICEIL( ICEIL( DESCA( LLD_ ), * $ DESCA( MB_ ) ), LCM / NPROW ) ) * $ + NQ LIWORK = NUMROC( DESCA( M_ ) + $ DESCA( MB_ ) * NPROW $ + MOD ( 1 - 1, DESCA( MB_ ) ), DESCA ( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ) + $ MAX ( DESCA( MB_ ) * ICEIL ( ICEIL( $ NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW, $ DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ), $ DESCA( MB_ ) ), LCM / NPROW ), DESCA( NB_ ) ) * END IF WORKIINV = ICEIL( LIWORK*INTGSZ, ZPLXSZ ) + $ IPOSTPAD IPIW = IPW + WORKINV + IPREPAD WORKSIZ = WORKINV + IPREPAD + WORKIINV * ELSE * * No pivots or workspace needed for triangular or * Hermitian positive definite matrices. * IPW = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + IPREPAD WORKSIZ = 1 + IPOSTPAD * END IF * IF( CHECK ) THEN * * Figure amount of work space for the norm * computations * IF( LSAMEN( 3, MTYP, 'GEN' ).OR. $ LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN ITEMP = NQ ELSE ITEMP = 2 * NQ + NP IF( NPROW.NE.NPCOL ) THEN ITEMP = ITEMP + $ NB * ICEIL( ICEIL( NP, NB ), $ LCM / NPROW ) END IF END IF WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ ICEIL( DBLESZ * ITEMP, ZPLXSZ ) ) * * Figure the amount of workspace required by the * checking routine * WORKSIZ = MAX( WORKSIZ, 2 * NB * MAX( 1, NP ) ) + $ IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'inversion', $ ( IPW + WORKSIZ ) * ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( LSAMEN( 3, MTYP, 'GEN' ).OR. $ LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * Generate a general diagonally dominant matrix A * CALL PZMATGEN( ICTXT, 'N', 'D', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, $ NQ, MYROW, MYCOL, NPROW, NPCOL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * Generate a Hermitian positive definite matrix A * CALL PZMATGEN( ICTXT, 'H', 'D', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, $ NQ, MYROW, MYCOL, NPROW, NPCOL ) * END IF * * Zeros not-referenced part of A, if any. * IF( LSAMEN( 1, MTYP, 'U' ) ) THEN * UPLO = 'U' CALL PZLASET( 'Lower', N-1, N-1, ZERO, ZERO, $ MEM( IPA ), 2, 1, DESCA ) * ELSE IF( LSAMEN( 1, MTYP, 'L' ) ) THEN * UPLO = 'L' CALL PZLASET( 'Upper', N-1, N-1, ZERO, ZERO, $ MEM( IPA ), 1, 2, DESCA ) * ELSE * UPLO = 'G' * END IF * * Need 1-norm of A for checking * IF( CHECK ) THEN * CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * CALL PZFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANGE( '1', N, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKINV-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKIINV-IPOSTPAD, 1, $ MEM( IPIW-IPREPAD ), $ WORKIINV-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * ANORM = PZLANTR( '1', UPLO, 'Non unit', N, N, $ MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANTR', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANTR', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * ANORM = PZLANHE( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'SY' ) ) THEN * CALL PZFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANSY( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANSY', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD,IPOSTPAD, PADVAL ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'HE' ) ) THEN CALL PZFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANHE( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * END IF * END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * Perform LU factorization * CALL SLTIMER( 1 ) CALL PZGETRF( N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGETRF', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRF', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * Perform the general matrix inversion * CALL SLTIMER( 2 ) CALL PZGETRI( N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ), LWORK, $ MEM( IPIW ), LIWORK, INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGETRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRI', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRI', $ WORKIINV-IPOSTPAD, 1, $ MEM( IPIW-IPREPAD ), $ WORKIINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRI', $ WORKINV-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKINV-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * Perform the general matrix inversion * CALL SLTIMER( 2 ) CALL PZTRTRI( UPLO, 'Non unit', N, MEM( IPA ), 1, $ 1, DESCA, INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZTRTRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * Perform Cholesky factorization * CALL SLTIMER( 1 ) CALL PZPOTRF( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZPOTRF', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * Perform the Hermitian positive definite matrix * inversion * CALL SLTIMER( 2 ) CALL PZPOTRI( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ INFO ) CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZPOTRI', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * END IF * IF( CHECK ) THEN * CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Compute fresid = || inv(A)*A-I || * CALL PZINVCHK( MTYP, N, MEM( IPA ), 1, 1, DESCA, $ IASEED, ANORM, FRESID, RCOND, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZINVCHK', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZINVCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. INFO.EQ.0 .AND. $ ( (FRESID-FRESID) .EQ. 0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 IF( INFO.GT.0 ) THEN PASSED = 'SINGUL' ELSE PASSED = 'FAILED' END IF END IF * ELSE * * Don't perform the checking, only the timing * operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( LSAMEN( 3, MTYP, 'GEN' ) ) THEN * * 8/3 N^3 - N^2 flops for LU factorization * NOPS = ( 8.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) - $ DBLE( N )**2 * * 16/3 N^3 for matrix inversion * NOPS = NOPS + $ ( 16.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'TR' ) ) THEN * * 4/3 N^3 + 2 N^2 for triangular matrix inversion * CTIME(1) = 0.0D+0 WTIME(1) = 0.0D+0 NOPS = ( 4.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ 2.0D+0 * ( DBLE( N )**2 ) * ELSE IF( LSAMEN( 2, MTYP( 2:3 ), 'PD' ) ) THEN * * 4/3 N^3 + 3 N^2 flops for Cholesky factorization * NOPS = ( 4.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ 2.0D+0 * ( DBLE( N )**2 ) * * 8/3 N^3 + 5 N^2 flops for Cholesky inversion * NOPS = NOPS + $ ( 8.0D+0 / 3.0D+0 ) * ( DBLE( N )**3 ) + $ 5.0D+0 * ( DBLE( N )**2 ) * END IF * * Figure total megaflops -- factorization and * inversion, for WALL and CPU time, and print * output. * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 2 ) .GE. 0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ RCOND, FRESID, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ) + CTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ) .GE. 0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ RCOND, FRESID, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * 40 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q Fct Time Inv Time ', $ ' MFLOPS Cond Resid CHECK' ) 9994 FORMAT( '---- ----- --- ----- ----- -------- -------- ', $ '----------- ------- ------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I5, 1X, I5, 1X, F8.2, 1X, F8.2, $ 1X, F11.2, 1X, F7.1, 1X, F7.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( A ) * STOP * * End of PZINVDRIVER * END scalapack-2.0.2/TESTING/LIN/pzinvinfo.f000644 000766 000024 00000035257 10363532303 017653 0ustar00juliestaff000000 000000 SUBROUTINE PZINVINFO( SUMMRY, NOUT, NMTYP, MATTYP, LDMTYP, NMAT, $ NVAL, LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, $ PVAL, LDPVAL, QVAL, LDQVAL, THRESH, WORK, $ IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDMTYP, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NMTYP, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. CHARACTER*3 MATTYP( LDMTYP ) CHARACTER*( * ) SUMMRY INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZINVINFO gets needed startup information for matrix inversion * tests and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMTYP (global output) INTEGER * The number of different matrix types to be tested. * * MATTYP (global output) CHARACTER*3 array of dimension of LDMTYP, * The types of matrix to be generated: * if MATTYP(i) = 'GEN' then GENeral matrix, * if MATTYP(i) = 'UTR' then Upper TRiangular matrix, * if MATTYP(i) = 'LTR' then Lower TRiangular matrix, * if MATTYP(i) = 'UPD' then (Upper) hermitian Pos. Definite, * if MATTYP(i) = 'LPD' then (Lower) hermitian Pos. Definite. * * LDMTYP (global input) INTEGER * The maximum number of different matrix types to be tested. * LDMTYP >= NMTYP. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 4, LDMTYP+LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack * all input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT, K DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAMEN DOUBLE PRECISION PDLAMCH EXTERNAL LSAMEN, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='INV.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get the matrix types to be tested * READ( NIN, FMT = * ) NMTYP IF( NMTYP.LT.1 .OR. NMTYP.GT.LDMTYP ) THEN WRITE( NOUT, FMT = 9994 ) 'nb of matrix types', LDMTYP GO TO 40 END IF READ( NIN, FMT = * ) ( MATTYP( I ), I = 1, NMTYP ) * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 40 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 40 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 40 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 40 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I ) * QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS WORK( 4 ) = NMTYP CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 DO 20 K = 1, NMTYP IF( LSAMEN( 3, MATTYP( K ), 'GEN' ) ) THEN WORK( I ) = 1 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'UTR' ) ) THEN WORK( I ) = 2 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'LTR' ) ) THEN WORK( I ) = 3 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'UPD' ) ) THEN WORK( I ) = 4 I = I + 1 ELSE IF( LSAMEN( 3, MATTYP( K ), 'LPD' ) ) THEN WORK( I ) = 5 I = I + 1 END IF 20 CONTINUE * CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Matrix Inversion routines.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision Matrix Inversion '// $ 'routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Inverse residual = ||inv(A)*A - I|| '// $ '/ (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less '// $ 'than THRESH, CHECK is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'Fct time : Time in seconds to factor the'// $ ' matrix, if needed.' WRITE( NOUT, FMT = 9999 ) $ 'Inv Time : Time in seconds to inverse the'// $ ' matrix.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and inverse.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) NMTYP = WORK( 4 ) * I = NMTYP+NMAT+NNB+2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * DO 30 K = 1, NMTYP IF( WORK( K ).EQ.1 ) THEN MATTYP( K ) = 'GEN' ELSE IF( WORK( K ).EQ.2 ) THEN MATTYP( K ) = 'UTR' ELSE IF( WORK( K ).EQ.3 ) THEN MATTYP( K ) = 'LTR' ELSE IF( WORK( K ).EQ.4 ) THEN MATTYP( K ) = 'UPD' ELSE IF( WORK( K ).EQ.5 ) THEN MATTYP( K ) = 'LPD' END IF 30 CONTINUE * I = NMTYP + 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 40 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZINVINFO * END scalapack-2.0.2/TESTING/LIN/pzlafchk.f000644 000766 000024 00000027013 10363532303 017422 0ustar00juliestaff000000 000000 SUBROUTINE PZLAFCHK( AFORM, DIAG, M, N, A, IA, JA, DESCA, IASEED, $ ANORM, FRESID, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER AFORM, DIAG INTEGER IA, IASEED, JA, M, N DOUBLE PRECISION ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZLAFCHK computes the residual * || sub( A ) - sub( Ao ) || / (|| sub( Ao ) ||*eps*MAX(M,N)), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A( IA:IA+M-1, JA:JA+N-1 ) and ||.|| stands for the infini- * ty norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * AFORM (global input) CHARACTER * sub( A ) is overwritten with: * - a symmetric matrix, if AFORM = 'S'; * - a Hermitian matrix, if AFORM = 'H'; * - the transpose of what would normally be generated, * if AFORM = 'T'; * - the conjugate transpose of what would normally be * generated, if AFORM = 'C'; * - otherwise a random matrix. * * DIAG (global input) CHARACTER * if DIAG = 'D' : sub( A ) is diagonally dominant. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub( A ) - sub( Ao ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * ANORM (global input) DOUBLE PRECISION * The Infinity norm of sub( A ). * * FRESID (global output) DOUBLE PRECISION * The maximum (worst) factorizational error. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK). * LWORK >= MpA0 * NB_A, where * * IROFFA = MOD( IA-1, MB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * WORK is used to store a block of columns of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = (1.0D+0, 0.0D+0) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURCOL, ICURROW, $ II, IIA, IOFFA, IROFF, JB, JJ, JJA, JN, KK, $ LDA, LDW, LDWP1, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PZMATGEN, ZMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL ICEIL, LSAME, NUMROC, PDLAMCH, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( ICTXT, 'eps' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * * Compute sub( A ) := sub( A ) - sub( Ao ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDW = MAX( 1, MP ) LDWP1 = LDW + 1 LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA - 1 )*LDA * IF( LSAME( AFORM, 'H' ) ) THEN * * Handle first block of columns separately * II = 1 ICURROW = IAROW ICURCOL = IACOL JB = JN - JA + 1 * IF( MYCOL.EQ.ICURCOL ) THEN CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) IF( MYROW.EQ.ICURROW ) THEN DO 10, KK = 0, JB-1 WORK( II+KK*LDWP1 ) = DBLE( WORK( II+KK*LDWP1 ) ) 10 CONTINUE END IF CALL ZMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 30, JJ = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-JJ, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ WORK, LDW, DESCA( RSRC_ ), DESCA( CSRC_ ), $ IASEED, IIA-1, MP, JJA-1, JB, MYROW, $ MYCOL, NPROW, NPCOL ) IF( MYROW.EQ.ICURROW ) THEN DO 20, KK = 0, JB-1 WORK( II+KK*LDWP1 ) = DBLE( WORK( II+KK*LDWP1 ) ) 20 CONTINUE END IF CALL ZMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE * ELSE * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN JB = JN-JA+1 CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL ZMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB NQ = NQ - JB IOFFA = IOFFA + JB * LDA END IF * * Handle the remaning blocks of columns * DO 40 JJ = JJA, JJA+NQ-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JJA+NQ-JJ ) IOFFA = IIA + ( JJ - 1 )*LDA CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJ-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL ZMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) 40 CONTINUE * END IF * * Calculate factor residual * FRESID = PZLANGE( 'I', M, N, A, IA, JA, DESCA, WORK ) / $ ( MAX( M, N ) * EPS * ANORM ) * RETURN * * End PZLAFCHK * END scalapack-2.0.2/TESTING/LIN/pzlaschk.f000644 000766 000024 00000030444 10363532303 017441 0ustar00juliestaff000000 000000 SUBROUTINE PZLASCHK( SYMM, DIAG, N, NRHS, X, IX, JX, DESCX, $ IASEED, IA, JA, DESCA, IBSEED, ANORM, RESID, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, SYMM INTEGER IA, IASEED, IBSEED, IX, JA, JX, N, NRHS DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX*16 WORK( * ), X( * ) * .. * * Purpose * ======= * * PZLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed matrix, * otherwise sub( A ) is a general distributed matrix. * * DIAG (global input) CHARACTER * If DIAG = 'D', sub( A ) is diagonally dominant. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 0. * * X (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_X,LOCc(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK >= MAX(1,Np)*NB_X + Nq*NB_X + MAX( MAX(NQ*MB_A,2*NB_X), * NB_X * NUMROC( NUMROC(N,MB_X,0,0,NPCOL), MB_X, 0, 0, LCMQ ) ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, IB, ICOFF, ICTXT, ICURCOL, IDUMM, $ II, IIA, IIX, IOFFX, IPA, IPB, IPW, IPX, IROFF, $ IXCOL, IXROW, J, JBRHS, JJ, JJA, JJX, LDX, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION DIVISOR, EPS, RESID1 COMPLEX*16 BETA * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D, PBZTRAN, $ PZMATGEN, ZGAMX2D, ZGEMM, ZGSUM2D, $ ZLASET * .. * .. External Functions .. INTEGER IZAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IZAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * LDX = MAX( 1, NP ) IPB = 1 IPX = IPB + NP * DESCX( NB_ ) IPA = IPX + NQ * DESCX( NB_ ) * IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF * ICURCOL = IXCOL * * Loop over the rhs * DO 40 J = 1, NRHS, DESCX( NB_ ) JBRHS = MIN( DESCX( NB_ ), NRHS-J+1 ) * * Transpose x from ICURCOL to all rows * IOFFX = IIX + ( JJX - 1 ) * DESCX( LLD_ ) CALL PBZTRAN( ICTXT, 'Column', 'Transpose', N, JBRHS, $ DESCX( MB_ ), X( IOFFX ), DESCX( LLD_ ), ZERO, $ WORK( IPX ), JBRHS, IXROW, ICURCOL, -1, IACOL, $ WORK( IPA ) ) * * Regenerate B in IXCOL * IF( MYCOL.EQ.ICURCOL ) THEN CALL PZMATGEN( ICTXT, 'N', 'N', DESCX( M_ ), DESCX( N_ ), $ DESCX( MB_ ), DESCX( NB_ ), WORK( IPB ), LDX, $ IXROW, IXCOL, IBSEED, IIX-1, NP, JJX-1, $ JBRHS, MYROW, MYCOL, NPROW, NPCOL ) BETA = ONE ELSE BETA = ZERO END IF * IF( NQ.GT.0 ) THEN DO 10 II = IIA, IIA+NP-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IIA+NP-II ) * * Regenerate ib rows of the matrix A(IA:IA+N-1,JA:JA+N-1). * CALL PZMATGEN( ICTXT, SYMM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ WORK( IPA ), IB, DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, II-1, IB, $ JJA-1, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Compute B <= B - A * X. * CALL ZGEMM( 'No transpose', 'Transpose', IB, JBRHS, NQ, $ -ONE, WORK( IPA ), IB, WORK( IPX ), JBRHS, $ BETA, WORK( IPB+II-IIA ), LDX ) * 10 CONTINUE * ELSE IF( MYCOL.NE.ICURCOL ) THEN * CALL ZLASET( 'All', NP, JBRHS, ZERO, ZERO, WORK( IPB ), $ LDX ) * END IF * * Add B rowwise to ICURCOL * CALL ZGSUM2D( ICTXT, 'Row', ' ', NP, JBRHS, WORK( IPB ), LDX, $ MYROW, ICURCOL ) * IF( MYCOL.EQ.ICURCOL ) THEN * * Figure || A * X - B || & || X || * IPW = IPA + JBRHS DO 20 JJ = 0, JBRHS - 1 IF( NP.GT.0 ) THEN II = IZAMAX( NP, WORK( IPB+JJ*LDX ), 1 ) WORK( IPA+JJ ) = ABS( WORK( IPB+II-1+JJ*LDX ) ) WORK( IPW+JJ ) = ABS( X( IOFFX + IZAMAX( NP, $ X( IOFFX + JJ*DESCX( LLD_ ) ), 1 )-1+JJ* $ DESCX( LLD_ ) ) ) ELSE WORK( IPA+JJ ) = ZERO WORK( IPW+JJ ) = ZERO END IF 20 CONTINUE * * After ZGAMX2D computation, * WORK(IPB) has the maximum of || Ax - b ||, and * WORK(IPX) has the maximum of || X ||. * CALL ZGAMX2D( ICTXT, 'Column', ' ', 1, 2*JBRHS, $ WORK( IPA ), 1, IDUMM, IDUMM, -1, 0, ICURCOL ) * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * IF( MYROW.EQ.0 ) THEN DO 30 JJ = 0, JBRHS - 1 RESID1 = DBLE( WORK( IPA+JJ ) ) / $ ( DBLE( WORK( IPW+JJ ) )*DIVISOR ) IF( RESID.LT.RESID1 ) $ RESID = RESID1 30 CONTINUE IF( MYCOL.NE.0 ) $ CALL DGESD2D( ICTXT, 1, 1, RESID, 1, 0, 0 ) END IF * ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * CALL DGERV2D( ICTXT, 1, 1, RESID1, 1, 0, ICURCOL ) IF( RESID.LT.RESID1 ) $ RESID = RESID1 * END IF * IF( MYCOL.EQ.ICURCOL ) $ JJX = JJX + JBRHS ICURCOL = MOD( ICURCOL+1, NPCOL ) * 40 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, RESID, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, RESID, 1, 0, 0 ) END IF * RETURN * * End of PZLASCHK * END scalapack-2.0.2/TESTING/LIN/pzlltdriver.f000644 000766 000024 00000110451 10363532303 020200 0ustar00juliestaff000000 000000 PROGRAM PZLLTDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PZLLTDRIVER is the main test program for the COMPLEX*16 * ScaLAPACK Cholesky routines. This test driver performs an * A = L*L**H or A = U**H*U factorization and solve, and optionally * performs condition estimation and iterative refinement. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'ScaLAPACK LLt factorization input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LLT.out' output file name (if any) * 6 device out * 'U' define Lower or Upper * 1 number of problems sizes * 31 100 200 values of N * 1 number of NB's * 2 10 24 values of NB * 1 number of NRHS's * 1 values of NRHS * 1 Number of NBRHS's * 1 values of NBRHS * 1 number of process grids (ordered pairs of P & Q) * 2 values of P * 2 values of Q * 1.0 threshold * T (T or F) Test Cond. Est. and Iter. Ref. Routines * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, MEMSIZ, NTESTS, TOTMEM, ZPLXSZ DOUBLE PRECISION ZERO COMPLEX*16 PADVAL PARAMETER ( DBLESZ = 8, TOTMEM = 2000000, ZPLXSZ = 16, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, EST CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, IMIDPAD, $ INFO, IPA, IPA0, IPB, IPB0, IPBERR, IPFERR, $ IPREPAD, IPOSTPAD, IPW, IPW2, ITEMP, J, K, $ KFAIL, KK, KPASS, KSKIP, KTESTS, LCM, LCMQ, $ LRWORK, LWORK, LW2, MYCOL, MYRHS, MYROW, N, NB, $ NBRHS, NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, $ NPCOL, NPROCS, NPROW, NQ, NRHS, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, ANORM1, FRESID, NOPS, RCOND, $ SRESID, SRESID2, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, DESCINIT, $ IGSUM2D, BLACS_PINFO, PZCHEKPAD, PZFILLPAD, $ PZLAFCHK, PZLASCHK, PZLLTINFO, $ PZMATGEN, PZPOCON, PZPORFS, $ PZPOTRF, PZPOTRRV, PZPOTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PZLANHE EXTERNAL ICEIL, ILCM, LSAME, NUMROC, PZLANHE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 CALL PZLLTINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NNR, NRVAL, NTESTS, NNBR, NBRVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, EST, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 50 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 50 * DO 40 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 40 END IF * DO 30 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IF( EST ) THEN IPA0 = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPA0 + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD ELSE IPW = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD END IF * * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines PZLAFCHK, PZPOTRRV, and * PZLANHE * WORKSIZ = NP * DESCA( NB_ ) * WORKSIZ = MAX( WORKSIZ, DESCA( MB_ ) * DESCA( NB_ ) ) * LCM = ILCM( NPROW, NPCOL ) ITEMP = MAX( 2, 2 * NQ ) + NP IF( NPROW.NE.NPCOL ) THEN ITEMP = ITEMP + $ NB * ICEIL( ICEIL( NP, NB ), LCM / NPROW ) END IF WORKSIZ = MAX( WORKSIZ, $ ICEIL( DBLESZ * ITEMP, ZPLXSZ ) ) WORKSIZ = WORKSIZ + IPOSTPAD * ELSE * WORKSIZ = IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate a Hermitian positive definite matrix A * CALL PZMATGEN( ICTXT, 'Herm', 'Diag', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANHE( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) ANORM1 = PZLANHE( '1', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) END IF * IF( EST ) THEN CALL PZMATGEN( ICTXT, 'Herm', 'Diag', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA0 ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) IF( CHECK ) $ CALL PZFILLPAD( ICTXT, NP, NQ, $ MEM( IPA0-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform LLt factorization * CALL SLTIMER( 1 ) * CALL PZPOTRF( UPLO, N, MEM( IPA ), 1, 1, DESCA, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PZPOTRF INFO=', INFO KFAIL = KFAIL + 1 RCOND = ZERO GO TO 60 END IF * IF( CHECK ) THEN * * Check for memory overwrite in LLt factorization * CALL PZCHEKPAD( ICTXT, 'PZPOTRF', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( EST ) THEN * * Calculate workspace required for PZPOCON * LWORK = MAX( 1, 2*NP ) + $ MAX( 2, DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), $ NQ + DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LRWORK = MAX( 1, 2*NQ ) LW2 = ICEIL( LRWORK*DBLESZ, ZPLXSZ ) + IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'cond est', $ ( IPW2+LW2 )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 60 END IF * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Compute condition number of the matrix * CALL PZPOCON( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ ANORM1, RCOND, MEM( IPW ), LWORK, $ MEM( IPW2 ), LRWORK, INFO ) * IF( CHECK ) THEN CALL PZCHEKPAD( ICTXT, 'PZPOCON', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPOCON', $ LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPOCON', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF END IF * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 10 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB, N, NRHS, NB, NBRHS, 0, 0, $ ICTXT, MAX( 1, NP )+IMIDPAD, $ IERR( 1 ) ) * * move IPW to allow room for RHS * MYRHS = NUMROC( DESCB( N_ ), DESCB( NB_ ), MYCOL, $ DESCB( CSRC_ ), NPCOL ) IPB = IPW * IF( EST ) THEN IPB0 = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD IPFERR = IPB0 + DESCB( LLD_ )*MYRHS + IPOSTPAD $ + IPREPAD IPBERR = MYRHS + IPFERR + IPOSTPAD + IPREPAD IPW = MYRHS + IPBERR + IPOSTPAD + IPREPAD ELSE IPW = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD END IF * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines PZLASCHK * LCMQ = LCM / NPCOL WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ NQ * NBRHS + NP * NBRHS + $ MAX( MAX( NQ*NB, 2*NBRHS ), $ NBRHS * NUMROC( NUMROC(N,NB,0,0,NPCOL),NB, $ 0,0,LCMQ ) ) ) WORKSIZ = IPOSTPAD + WORKSIZ ELSE WORKSIZ = IPOSTPAD END IF * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate RHS * CALL PZMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, NPCOL ) * IF( CHECK ) $ CALL PZFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) * IF( EST ) THEN CALL PZMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB0 ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 2 ) * * Solve linear system via Cholesky factorization * CALL PZPOTRS( UPLO, N, NRHS, MEM( IPA ), 1, 1, $ DESCA, MEM( IPB ), 1, 1, DESCB, $ INFO ) * CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZPOTRS', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPOTRS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PZLASCHK( 'Herm', 'Diag', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, IBSEED, $ ANORM, SRESID, MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' END IF * IF( EST ) THEN * * Calculate workspace required for PZPORFS * LWORK = MAX( 1, 2*NP ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LRWORK = MAX( 1, NP ) LW2 = ICEIL( LRWORK*DBLESZ, ZPLXSZ ) + $ IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'iter ref', ( IPW2+LW2 )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, LW2-IPOSTPAD, $ 1, MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Use iterative refinement to improve the * computed solution * CALL PZPORFS( UPLO, N, NRHS, MEM( IPA0 ), $ 1, 1, DESCA, MEM( IPA ), 1, 1, $ DESCA, MEM( IPB0 ), 1, 1, $ DESCB, MEM( IPB ), 1, 1, DESCB, $ MEM( IPFERR ), MEM( IPBERR ), $ MEM( IPW ), LWORK, MEM( IPW2 ), $ LRWORK, INFO ) * * check for memory overwrite * IF( CHECK ) THEN CALL PZCHEKPAD( ICTXT, 'PZPORFS', NP, $ NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPORFS', NP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPORFS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPORFS', NP, $ MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPORFS', 1, $ MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPORFS', 1, $ MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPORFS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZPORFS', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PZLASCHK( 'Herm', 'Diag', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, $ IBSEED, ANORM, SRESID2, $ MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID2.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID2 * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 4/3 N^3 + 3 N^2 flops for LLt factorization * NOPS = 4.0D+0*(DBLE(N)**3)/3.0D+0 + $ 3.0D+0*(DBLE(N)**2) * * nrhs * 8 N^2 flops for LLt solve. * NOPS = NOPS + 8.0D+0*(DBLE(N)**2)*DBLE(NRHS) * * Calculate total megaflops -- factorization and * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ PASSED * END IF 10 CONTINUE 20 CONTINUE * IF( CHECK .AND. SRESID.GT.THRESH ) THEN * * Compute FRESID = ||A - LL'|| / (||A|| * N * eps) * CALL PZPOTRRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZLAFCHK( 'Symm', 'Diag', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZPOTRRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * IF( IAM.EQ.0 ) THEN IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9986 ) 'L*L''', FRESID ELSE WRITE( NOUT, FMT = 9986 ) 'U''*U', FRESID END IF END IF END IF * 30 CONTINUE 40 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 50 CONTINUE * * Print ending messages and close output file * 60 CONTINUE IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UPLO N NB NRHS NBRHS P Q LLt Time ', $ 'Slv Time MFLOPS CHECK' ) 9994 FORMAT( '---- ---- ----- --- ---- ----- ---- ---- -------- ', $ '-------- -------- ------' ) 9993 FORMAT( A4, 4X, A1, 1X, I5, 1X, I3, 1X, I4, 1X, I5, 1X, I4, 1X, $ I4, 1X, F8.2, 1X, F8.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PZLLTDRIVER * END scalapack-2.0.2/TESTING/LIN/pzlltinfo.f000644 000766 000024 00000042150 10363532303 017640 0ustar00juliestaff000000 000000 SUBROUTINE PZLLTINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NNR, NRVAL, LDNRVAL, NNBR, $ NBRVAL, LDNBRVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, THRESH, EST, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL EST CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PZLLTINFO get needed startup information for LLt factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * EST (global output) LOGICAL * Flag indicating if condition estimation and iterative * refinement routines are to be exercised. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 7, LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'LLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Read the flag that indicates whether to test the condition * estimation and iterative refinement routines. * READ( NIN, FMT = * ) EST * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS IF( LSAME( UPLO, 'L' ) ) THEN WORK( 6 ) = 1 ELSE WORK( 6 ) = 2 END IF IF( EST ) THEN WORK( 7 ) = 1 ELSE WORK( 7 ) = 0 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 7, 1, WORK, 7 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Ax=b by LLt factorization.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision LLt factorization '// $ 'and solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data is stored in ''Upper'// $ ''' or ''Lower'' portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'LLt time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 7, 1, WORK, 7, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) IF( WORK( 6 ).EQ.1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF IF( WORK( 7 ).EQ.1 ) THEN EST = .TRUE. ELSE EST = .FALSE. END IF * I = NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZLLTINFO * END scalapack-2.0.2/TESTING/LIN/pzlsdriver.f000644 000766 000024 00000135561 10363532303 020034 0ustar00juliestaff000000 000000 PROGRAM PZLSDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * Purpose * ======= * * PZLSDRIVER is the main test program for the COMPLEX*16 * SCALAPACK (full rank) Least Squares routines. This test driver solves * full-rank least square problems. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 17 lines: * 'ScaLapack LS solve input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LS.out' output file name (if any) * 6 device out * 4 number of problems sizes * 55 17 31 201 values of M * 5 71 31 201 values of N * 3 number of NB's * 2 3 5 values of NB * 3 number of NRHS's * 2 3 5 values of NRHS * 2 number of NBRHS's * 1 2 values of NBRHS * 7 number of process grids (ordered P & Q) * 1 2 1 4 2 3 8 values of P * 7 2 4 1 3 2 1 values of Q * 3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, MEMSIZ, NTESTS, TOTMEM, ZPLXSZ DOUBLE PRECISION RZERO, RONE COMPLEX*16 ONE, PADVAL, ZERO PARAMETER ( DBLESZ = 8, ZPLXSZ = 16, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ) ) PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), RZERO = 0.0D+0, $ RONE = 1.0D+0, ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK, TPSD CHARACTER TRANS CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, II, IMIDPAD, $ INFO, IPA, IPB, IPOSTPAD, IPREPAD, IPW, IPW2, $ IPX, ISCALE, ITRAN, ITYPE, J, JJ, K, KFAIL, KK, $ KPASS, KSKIP, KTESTS, LCM, LCMP, LTAU, LWF, $ LWORK, LWS, M, MNP, MNRHSP, MP, MQ, MYCOL, $ MYROW, N, NB, NBRHS, NCOLS, NGRIDS, NMAT, NNB, $ NNBR, NNR, NNRHSQ, NOUT, NP, NPCOL, NPROCS, $ NPROW, NROWS, NQ, NRHS, NRHSP, NRHSQ, WORKSIZ REAL THRESH DOUBLE PRECISION ADDFAC, ADDS, ANORM, BNORM, MULFAC, MULTS, $ NOPS, SRESID, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCW( LLD_ ), $ DESCX( DLEN_ ), IERR( 2 ), MVAL( NTESTS ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), RESULT( 2 ), WTIME( 1 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZCHEKPAD, $ PZFILLPAD, PZGELS, PZGEMM, PZLACPY, $ PZLSINFO, PZMATGEN, PDZNRM2, $ PZDSCAL, PZQRT13, PZQRT16, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PZLANGE, PZQRT14, PZQRT17 EXTERNAL ICEIL, ILCM, LSAME, NUMROC, PZLANGE, $ PZQRT14, PZQRT17 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * IASEED = 100 IBSEED = 200 CALL PZLSINFO( OUTFILE, NOUT, NMAT, MVAL, NTESTS, NVAL, $ NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, NTESTS, $ NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 90 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 90 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( ( MYROW.GE.NPROW ).OR.( MYCOL.GE.NPCOL ) ) $ GO TO 90 * DO 80 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 80 END IF * * Loop over different blocking sizes * DO 70 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 70 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) MQ = NUMROC( M, NB, MYCOL, 0, NPCOL ) NP = NUMROC( N, NB, MYROW, 0, NPROW ) MNP = MAX( MP, NP ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 70 END IF * DO 60 ISCALE = 1, 3 * ITYPE = ISCALE * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPX = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPX * WORKSIZ = NQ + IPOSTPAD * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'MEMORY', $ ( IPX+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 70 END IF * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Generate the matrix A and calculate its 1-norm * CALL PZQRT13( ISCALE, M, N, MEM( IPA ), 1, 1, $ DESCA, ANORM, IASEED, MEM( IPW ) ) * IF( CHECK ) THEN CALL PZCHEKPAD( ICTXT, 'PZQRT13', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZQRT13', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * DO 50 ITRAN = 1, 2 * IF( ITRAN.EQ.1 ) THEN NROWS = M NCOLS = N TRANS = 'N' TPSD = .FALSE. ELSE NROWS = N NCOLS = M TRANS = 'C' TPSD = .TRUE. END IF * * Loop over the different values for NRHS * DO 40 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 30 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * NRHSP = NUMROC( NRHS, NBRHS, MYROW, 0, $ NPROW ) NRHSQ = NUMROC( NRHS, NBRHS, MYCOL, 0, $ NPCOL ) * * Define Array descriptor for rhs MAX(M,N)xNRHS * CALL DESCINIT( DESCX, MAX( M, N ), NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, MNP ) + IMIDPAD, $ IERR( 1 ) ) IF( TPSD ) THEN CALL DESCINIT( DESCW, M, NRHS, NB, NBRHS, $ 0, 0, ICTXT, MAX( 1, MP ) + $ IMIDPAD, IERR( 2 ) ) ELSE CALL DESCINIT( DESCW, N, NRHS, NB, NBRHS, $ 0, 0, ICTXT, MAX( 1, NP ) + $ IMIDPAD, IERR( 2 ) ) END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, $ 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Check for enough memory * IPX = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + $ IPREPAD IPW = IPX + DESCX( LLD_ )*NRHSQ + IPOSTPAD + $ IPREPAD WORKSIZ = DESCW( LLD_ )*NRHSQ + IPOSTPAD * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Generation', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate RHS * IF( TPSD ) THEN CALL PZMATGEN( ICTXT, 'No', 'No', $ DESCW( M_ ), DESCW( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), $ MEM( IPW ), DESCW( LLD_ ), $ DESCW( RSRC_ ), $ DESCW( CSRC_ ), IBSEED, 0, $ MP, 0, NRHSQ, MYROW, MYCOL, $ NPROW, NPCOL ) ELSE CALL PZMATGEN( ICTXT, 'No', 'No', $ DESCW( M_ ), DESCW( N_ ), $ DESCW( MB_ ), DESCW( NB_ ), $ MEM( IPW ), DESCW( LLD_ ), $ DESCW( RSRC_ ), $ DESCW( CSRC_ ), IBSEED, 0, $ NP, 0, NRHSQ, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PZFILLPAD( ICTXT, MP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PZFILLPAD( ICTXT, NP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * DO 10 JJ = 1, NRHS CALL PDZNRM2( NCOLS, BNORM, MEM( IPW ), $ 1, JJ, DESCW, 1 ) IF( BNORM.GT.RZERO ) $ CALL PZDSCAL( NCOLS, RONE / BNORM, $ MEM( IPW ), 1, JJ, DESCW, $ 1 ) 10 CONTINUE * CALL PZGEMM( TRANS, 'N', NROWS, NRHS, NCOLS, $ ONE, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ), 1, 1, DESCW, ZERO, $ MEM( IPX ), 1, 1, DESCX ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'Generation', MP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'Generation', MNP, $ NRHSQ, MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PZCHEKPAD( ICTXT, 'Generation', $ MP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PZCHEKPAD( ICTXT, 'Generation', $ NP, NRHSQ, $ MEM( IPW-IPREPAD ), $ DESCW( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Allocate space for copy of rhs * IPB = IPW * IF( TPSD ) THEN CALL DESCINIT( DESCB, N, NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, $ IERR( 1 ) ) ELSE CALL DESCINIT( DESCB, M, NRHS, NB, $ NBRHS, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, $ IERR( 1 ) ) END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * IPW = IPB + DESCB( LLD_ )*NRHSQ + $ IPOSTPAD + IPREPAD * END IF * * Calculate the amount of workspace for PZGELS * IF( M.GE.N ) THEN LTAU = NUMROC( MIN(M,N), NB, MYCOL, 0, $ NPCOL ) LWF = NB * ( MP + NQ + NB ) LWS = MAX( ( NB*( NB - 1 ) ) / 2, $ ( MP + NRHSQ ) * NB ) + NB*NB ELSE LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LTAU = NUMROC( MIN(M,N), NB, MYROW, 0, $ NPROW ) LWF = NB * ( MP + NQ + NB ) LWS = MAX( ( NB*( NB - 1 ) ) / 2, ( NP + $ MAX( NQ + NUMROC( NUMROC( N, NB, 0, $ 0, NPROW ), NB, 0, 0, LCMP ), $ NRHSQ ) ) * NB ) + NB*NB END IF * LWORK = LTAU + MAX( LWF, LWS ) WORKSIZ = LWORK + IPOSTPAD * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'solve', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Make the copy of the right hand side * CALL PZLACPY( 'All', NROWS, NRHS, $ MEM( IPX ), 1, 1, DESCX, $ MEM( IPB ), 1, 1, DESCB ) * IF( TPSD ) THEN CALL PZFILLPAD( ICTXT, NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) ELSE CALL PZFILLPAD( ICTXT, MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) END IF CALL PZFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * CALL SLBOOT( ) CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Solve the LS or overdetermined system * CALL PZGELS( TRANS, M, N, NRHS, MEM( IPA ), $ 1, 1, DESCA, MEM( IPX ), 1, 1, $ DESCX, MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGELS', MP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGELS', MNP, $ NRHSQ, MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGELS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Regenerate A in place for testing and next * iteration * CALL PZQRT13( ISCALE, M, N, MEM( IPA ), 1, 1, $ DESCA, ANORM, IASEED, $ MEM( IPW ) ) * * check the solution to rhs * IF( CHECK ) THEN * * Am I going to call PZQRT17 ? * IF( ( M.GE.N .AND. ( .NOT.TPSD ) ) .OR. $ ( M.LT.N .AND. TPSD ) ) THEN * * Call PZQRT17 first, A, X, and B remain * unchanged. Solving LS system * * Check amount of memory for PZQRT17 * IF( TPSD ) THEN WORKSIZ = NP*NRHSQ + NRHSP*MQ IPW2 = IPW + WORKSIZ WORKSIZ = WORKSIZ + $ ICEIL( DBLESZ*MAX( NQ, MAX( $ MQ, NRHSQ ) ), ZPLXSZ ) + $ IPOSTPAD ELSE WORKSIZ = MP*NRHSQ + NRHSP*NQ IPW2 = IPW + WORKSIZ WORKSIZ = WORKSIZ + $ ICEIL( DBLESZ*MAX( NQ, $ NRHSQ ), ZPLXSZ ) + $ IPOSTPAD END IF * * Check for adequate memory for problem * size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'MEMORY', ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PZFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * RESULT( 2 ) = PZQRT17( TRANS, 1, M, N, $ NRHS, $ MEM( IPA ), $ 1, 1, DESCA, $ MEM( IPX ), 1, $ 1, DESCX, $ MEM( IPB ), $ 1, 1, DESCB, $ MEM( IPW ), $ MEM( IPW2 ) ) SRESID = RESULT( 2 ) * CALL PZCHEKPAD( ICTXT, 'PZQRT17', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZQRT17', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PZCHEKPAD( ICTXT, 'PZQRT17', $ NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) ELSE CALL PZCHEKPAD( ICTXT, 'PZQRT17', $ MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF CALL PZCHEKPAD( ICTXT, 'PZQRT17', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Call PZQRT16, B will be destroyed. * IF( TPSD ) THEN WORKSIZ = MP + IPOSTPAD ELSE WORKSIZ = NQ + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'MEMORY', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PZFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PZQRT16( TRANS, M, N, NRHS, $ MEM( IPA ), 1, 1, DESCA, $ MEM( IPX ), 1, 1, DESCX, $ MEM( IPB ), 1, 1, DESCB, $ MEM( IPW ), RESULT( 1 ) ) * CALL PZCHEKPAD( ICTXT, 'PZQRT16', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZQRT16', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) IF( TPSD ) THEN CALL PZCHEKPAD( ICTXT, 'PZQRT16', $ NP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) ELSE CALL PZCHEKPAD( ICTXT, 'PZQRT16', $ MP, NRHSQ, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF CALL PZCHEKPAD( ICTXT, 'PZQRT16', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * * Call PZQRT14 * IF( ( M.GE.N .AND. TPSD ) .OR. $ ( M.LT.N .AND. ( .NOT.TPSD ) ) ) THEN * IPW = IPB * IF( TPSD ) THEN * NNRHSQ = NUMROC( N+NRHS, NB, MYCOL, $ 0, NPCOL ) LTAU = NUMROC( MIN( M, N+NRHS ), NB, $ MYCOL, 0, NPCOL ) LWF = NB * ( NB + MP + NNRHSQ ) WORKSIZ = MP * NNRHSQ + LTAU + LWF + $ IPOSTPAD * ELSE * MNRHSP = NUMROC( M+NRHS, NB, MYROW, $ 0, NPROW ) LTAU = NUMROC( MIN( M+NRHS, N ), NB, $ MYROW, 0, NPROW ) LWF = NB * ( NB + MNRHSP + NQ ) WORKSIZ = MNRHSP * NQ + LTAU + LWF + $ IPOSTPAD * END IF * * Check for adequate memory for problem * size * IERR( 1 ) = 0 IF( ( IPW+WORKSIZ ).GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'MEMORY', ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PZFILLPAD( ICTXT, $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) * * Solve underdetermined system * RESULT( 2 ) = PZQRT14( TRANS, M, N, $ NRHS, $ MEM( IPA ), 1, $ 1, DESCA, $ MEM( IPX ), $ 1, 1, DESCX, $ MEM( IPW ) ) SRESID = RESULT( 2 ) * CALL PZCHEKPAD( ICTXT, 'PZQRT14', $ MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZQRT14', $ MNP, NRHSQ, $ MEM( IPX-IPREPAD ), $ DESCX( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZQRT14', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF * * Print information about the tests that * did not pass the threshold. * PASSED = 'PASSED' DO 20 II = 1, 2 IF( ( RESULT( II ).GE.THRESH ) .AND. $ ( RESULT( II )-RESULT( II ).EQ.0.0E+0 $ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 )TRANS, $ M, N, NRHS, NB, ITYPE, II, $ RESULT( II ) KFAIL = KFAIL + 1 PASSED = 'FAILED' ELSE KPASS = KPASS + 1 END IF 20 CONTINUE * ELSE * * By-pass the solve check * KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock * timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN ADDFAC = 2 MULFAC = 6 IF( M.GE.N ) THEN * * NOPS = DOPLA( 'ZGEQRF', M, N, 0, 0, * NB ) + DOPLA( 'ZUNMQR', M, NRHS, N, * 0, NB ) * MULTS = N*( ( ( 23.D0 / 6.D0 )+M+N / $ 2.D0 )+ N*( M-N / 3.D0 ) ) + $ N*NRHS*( 2.D0*M+2.D0-N ) ADDS = N*( ( 5.D0 / 6.D0 )+N* $ ( 1.D0 / 2.D0+( M-N / 3.D0 ) ) ) $ + N*NRHS*( 2.D0*M+1.D0-N ) ELSE * * NOPS = DOPLA( 'ZGELQF', M, N, 0, 0, * NB ) + DOPLA( 'ZUNMLQ', M, * NRHS, N, 0, NB ) * MULTS = M*( ( ( 29.D0 / 6.D0 )+2.D0*N-M $ / 2.D0 )+M*( N-M / 3.D0 ) ) $ + N*NRHS*( 2.D0*M+2.D0-N ) ADDS = M*( ( 5.D0 / 6.D0 )+M / 2.D0+M* $ ( N-M / 3.D0 ) ) $ + N*NRHS*( 2.D0*M+1.D0-N ) END IF NOPS = ADDFAC*ADDS + MULFAC*MULTS * * Calculate total megaflops, for WALL and * CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) $ 'WALL', TRANS, M, N, NB, NRHS, $ NBRHS, NPROW, NPCOL, WTIME( 1 ), $ TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) $ 'CPU ', TRANS, M, N, NB, NRHS, $ NBRHS, NPROW, NPCOL, CTIME( 1 ), $ TMFLOPS, PASSED END IF 30 CONTINUE 40 CONTINUE 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 90 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'Time TRANS M N NB NRHS NBRHS P Q ', $ 'LS Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- ------ ------ --- ----- ----- ----- ----- ', $ '--------- -------- ------' ) 9993 FORMAT( A4, 3X, A1, 3X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, $ I5, 1X, I5, 1X, F9.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( ' TRANS=''', A1, ''', M=', I5, ', N=', I5, ', NRHS=', I4, $ ', NB=', I4, ', type', I2, ', test(', I2, ')=', G12.5 ) * STOP * * End of PZLSDRIVER * END scalapack-2.0.2/TESTING/LIN/pzlsinfo.f000644 000766 000024 00000040775 10363532303 017476 0ustar00juliestaff000000 000000 SUBROUTINE PZLSINFO( SUMMRY, NOUT, NMAT, MVAL, LDMVAL, $ NVAL, LDNVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDMVAL, LDNBRVAL, LDNBVAL, LDNRVAL, $ LDNVAL, LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, $ NNBR, NNR, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDMVAL ), NBRVAL( LDNBRVAL ), $ NBVAL( LDNBVAL ), NRVAL( LDNRVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), $ QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZLSINFO gets needed startup information for LS solve and * transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * MVAL (global output) INTEGER array, dimension (LDNVAL) * The values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M, LDNVAL > = NMAT. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (workspace) INTEGER array of dimension >= * MAX(5,LDMVAL+LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL) * Used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='LS.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 ELSE IF( NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'M', LDMVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 5, 1, WORK, 5 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK min ||Ax-b|| by QR factorizations.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision least-square solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - QR|| / '// $ '(||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used. If CPU and WALL time' WRITE( NOUT, FMT = 9999 ) $ ' are the same, only one line '// $ 'is printed, and the label is ''BOTH''.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 ) $ 'QR time : Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 5, 1, WORK, 5, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) * I = 2*NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZLSINFO * END scalapack-2.0.2/TESTING/LIN/pzludriver.f000644 000766 000024 00000125312 11657237147 020046 0ustar00juliestaff000000 000000 PROGRAM PZLUDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======== * * PZLUDRIVER is the main test program for the COMPLEX*16 * SCALAPACK LU routines. This test driver performs an LU factorization * and solve. If the input matrix is non-square, only the factorization * is performed. Condition estimation and iterative refinement are * optionally performed. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'SCALAPACK, Version 2.0, LU factorization input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'LU.out' output file name (if any) * 6 device out * 1 number of problems sizes * 31 201 values of M * 31 201 values of N * 1 number of NB's * 2 values of NB * 1 number of NRHS's * 1 values of NRHS * 1 number of NBRHS's * 1 values of NBRHS * 1 number of process grids (ordered pairs of P & Q) * 2 1 4 2 3 8 values of P * 2 4 1 3 2 1 values of Q * 1.0 threshold * T (T or F) Test Cond. Est. and Iter. Ref. Routines * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INTGSZ, DBLESZ, MEMSIZ, NTESTS, TOTMEM, ZPLXSZ DOUBLE PRECISION ZERO COMPLEX*16 PADVAL PARAMETER ( INTGSZ = 4, DBLESZ = 8, TOTMEM = 8000000, $ ZPLXSZ = 16, MEMSIZ = TOTMEM / ZPLXSZ, $ NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL CHECK, EST CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER HH, I, IAM, IASEED, IBSEED, ICTXT, IMIDPAD, $ INFO, IPA, IPA0, IPB, IPB0, IPBERR, IPFERR, $ IPOSTPAD, IPPIV, IPREPAD, IPW, IPW2, J, K, $ KFAIL, KK, KPASS, KSKIP, KTESTS, LCM, LCMQ, $ LIPIV, LRWORK, LWORK, LW2, M, MAXMN, $ MINMN, MP, MYCOL, MYRHS, MYROW, N, NB, NBRHS, $ NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, NPCOL, $ NPROCS, NPROW, NQ, NRHS, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, ANORM1, FRESID, NOPS, RCOND, $ SRESID, SRESID2, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), IERR( 1 ), $ MVAL( NTESTS ), NBRVAL( NTESTS ), $ NBVAL( NTESTS ), NRVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), $ QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZCHEKPAD, $ PZFILLPAD, PZGECON, PZGERFS, $ PZGETRF, PZGETRRV, PZGETRS, $ PZLAFCHK, PZLASCHK, PZLUINFO, $ PZMATGEN, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PZLANGE EXTERNAL ICEIL, ILCM, NUMROC, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 CALL PZLUINFO( OUTFILE, NOUT, NMAT, MVAL, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NNR, NRVAL, NTESTS, NNBR, NBRVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, THRESH, $ EST, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 50 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 50 * DO 40 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 40 END IF * DO 30 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IF( EST .AND. M.EQ.N ) THEN IPA0 = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPPIV = IPA0 + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD ELSE IPPIV = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD END IF LIPIV = ICEIL( INTGSZ*( MP+NB ), ZPLXSZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * IF( CHECK ) THEN * * Calculate the amount of workspace required by the * checking routines PZLANGE, PZGETRRV, and * PZLAFCHK * WORKSIZ = MAX( 2, NQ ) * WORKSIZ = MAX( WORKSIZ, MP*DESCA( NB_ )+ $ NQ*DESCA( MB_ ) ) * WORKSIZ = MAX( WORKSIZ, MP * DESCA( NB_ ) ) * WORKSIZ = WORKSIZ + IPOSTPAD * ELSE * WORKSIZ = IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Generate matrix A of Ax = b * CALL PZMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ MP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, LIPIV, 1, MEM( IPPIV-IPREPAD ), $ LIPIV, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANGE( 'I', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) ANORM1 = PZLANGE( '1', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) END IF * IF( EST .AND. M.EQ.N ) THEN CALL PZMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA0 ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) IF( CHECK ) $ CALL PZFILLPAD( ICTXT, MP, NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform LU factorization * CALL PZGETRF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PZGETRF INFO=', INFO KFAIL = KFAIL + 1 RCOND = ZERO GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in LU factorization * CALL PZCHEKPAD( ICTXT, 'PZGETRF', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRF', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * IF( M.NE.N ) THEN * * For non-square matrices, factorization only * NRHS = 0 NBRHS = 0 * IF( CHECK ) THEN * * Compute FRESID = ||A - P*L*U|| / (||A|| * N * eps) * CALL PZGETRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ) ) CALL PZLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGETRRV', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRRV', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( (FRESID-FRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * MAXMN = MAX( M, N ) MINMN = MIN( M, N ) * * 4 M N^2 - 4/3 N^3 + 2 M N - 3 N^2 flops for LU * factorization M >= N * NOPS = 4.0D+0*DBLE(MAXMN)*(DBLE(MINMN)**2) - $ (4.0D+0 / 3.0D+0)*( DBLE( MINMN )**3 ) + $ (2.0D+0)*DBLE( MAXMN )*DBLE( MINMN ) - $ (3.0D+0)*( DBLE( MINMN )**2 ) * * Calculate total megaflops -- factorization only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * WTIME( 2 ) = 0.0D+0 IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, NB, $ NRHS, NBRHS, NPROW, NPCOL, WTIME( 1 ), $ WTIME( 2 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * CTIME( 2 ) = 0.0D+0 IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, NB, $ NRHS, NBRHS, NPROW, NPCOL, CTIME( 1 ), $ CTIME( 2 ), TMFLOPS, PASSED END IF * ELSE * * If M = N * IF( EST ) THEN * * Calculate workspace required for PZGECON * LWORK = MAX( 1, 2*NP ) + $ MAX( 2, DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), $ NQ + DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LRWORK = MAX( 1, 2*NQ ) LW2 = ICEIL( LRWORK*DBLESZ, ZPLXSZ ) + IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'cond est', $ ( IPW2+LW2 )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Compute condition number of the matrix * CALL PZGECON( '1', N, MEM( IPA ), 1, 1, DESCA, $ ANORM1, RCOND, MEM( IPW ), LWORK, $ MEM( IPW2 ), LRWORK, INFO ) * IF( CHECK ) THEN CALL PZCHEKPAD( ICTXT, 'PZGECON', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGECON', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGECON', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * NRHS = NRVAL( HH ) * DO 10 KK = 1, NNBR * NBRHS = NBRVAL( KK ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB, N, NRHS, NB, NBRHS, 0, 0, $ ICTXT, MAX( 1, NP )+IMIDPAD, $ IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * move IPW to allow room for RHS * MYRHS = NUMROC( DESCB( N_ ), DESCB( NB_ ), $ MYCOL, DESCB( CSRC_ ), NPCOL ) IPB = IPW * IF( EST ) THEN IPB0 = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD IPFERR = IPB0 + DESCB( LLD_ )*MYRHS + $ IPOSTPAD + IPREPAD IPBERR = MYRHS + IPFERR + IPOSTPAD + IPREPAD IPW = MYRHS + IPBERR + IPOSTPAD + IPREPAD ELSE IPW = IPB + DESCB( LLD_ )*MYRHS + IPOSTPAD + $ IPREPAD END IF * * Set worksiz: routines requiring most workspace * is PZLASCHK * IF( CHECK ) THEN LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL WORKSIZ = MAX( WORKSIZ-IPOSTPAD, $ NQ * NBRHS + NP * NBRHS + $ MAX( MAX( NQ*NB, 2*NBRHS ), $ NBRHS * NUMROC( NUMROC(N,NB,0,0,NPCOL),NB, $ 0,0,LCMQ ) ) ) WORKSIZ = IPOSTPAD + WORKSIZ ELSE WORKSIZ = IPOSTPAD END IF * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate RHS * CALL PZMATGEN( ICTXT, 'No', 'No', DESCB( M_ ), $ DESCB( N_ ), DESCB( MB_ ), $ DESCB( NB_ ), MEM( IPB ), $ DESCB( LLD_ ), DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, 0, $ MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) * IF( CHECK ) $ CALL PZFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * IF( EST ) THEN CALL PZMATGEN( ICTXT, 'No', 'No', $ DESCB( M_ ), DESCB( N_ ), $ DESCB( MB_ ), DESCB( NB_ ), $ MEM( IPB0 ), DESCB( LLD_ ), $ DESCB( RSRC_ ), $ DESCB( CSRC_ ), IBSEED, 0, NP, $ 0, MYRHS, MYROW, MYCOL, NPROW, $ NPCOL ) IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, 1, MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) END IF END IF * CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 2 ) * * Solve linear sytem via LU factorization * CALL PZGETRS( 'No', N, NRHS, MEM( IPA ), 1, 1, $ DESCA, MEM( IPPIV ), MEM( IPB ), $ 1, 1, DESCB, INFO ) * CALL SLTIMER( 2 ) * IF( CHECK ) THEN * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGETRS', NP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRS', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PZLASCHK( 'No', 'N', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, IBSEED, $ ANORM, SRESID, MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * The second test is a NaN trap * IF( SRESID.LE.THRESH .AND. $ ( SRESID-SRESID ).EQ.0.0D+0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE KPASS = KPASS + 1 SRESID = SRESID - SRESID PASSED = 'BYPASS' END IF * IF( EST ) THEN * * Calculate workspace required for PZGERFS * LWORK = MAX( 1, 2*NP ) IPW2 = IPW + LWORK + IPOSTPAD + IPREPAD LRWORK = MAX( 1, NP ) LW2 = ICEIL( LRWORK*DBLESZ, ZPLXSZ ) + $ IPOSTPAD * IERR( 1 ) = 0 IF( IPW2+LW2.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'iter ref', ( IPW2+LW2 )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) $ 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, LWORK, 1, $ MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Use iterative refinement to improve the * computed solution * CALL PZGERFS( 'No', N, NRHS, MEM( IPA0 ), 1, $ 1, DESCA, MEM( IPA ), 1, 1, $ DESCA, MEM( IPPIV ), $ MEM( IPB0 ), 1, 1, DESCB, $ MEM( IPB ), 1, 1, DESCB, $ MEM( IPFERR ), MEM( IPBERR ), $ MEM( IPW ), LWORK, MEM( IPW2 ), $ LRWORK, INFO ) * IF( CHECK ) THEN CALL PZCHEKPAD( ICTXT, 'PZGERFS', NP, $ NQ, MEM( IPA0-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGERFS', NP, $ NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGERFS', LIPIV, $ 1, MEM( IPPIV-IPREPAD ), $ LIPIV, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGERFS', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGERFS', NP, $ MYRHS, $ MEM( IPB0-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGERFS', 1, $ MYRHS, $ MEM( IPFERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGERFS', 1, $ MYRHS, $ MEM( IPBERR-IPREPAD ), 1, $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGERFS', LWORK, $ 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGERFS', $ LW2-IPOSTPAD, 1, $ MEM( IPW2-IPREPAD ), $ LW2-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * CALL PZLASCHK( 'No', 'N', N, NRHS, $ MEM( IPB ), 1, 1, DESCB, $ IASEED, 1, 1, DESCA, $ IBSEED, ANORM, SRESID2, $ MEM( IPW ) ) * IF( IAM.EQ.0 .AND. SRESID2.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID2 * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZLASCHK', NP, $ MYRHS, MEM( IPB-IPREPAD ), $ DESCB( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLASCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF END IF * * Gather max. of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 8/3 N^3 - N^2 flops for LU factorization * NOPS = (8.0D+0/3.0D+0)*( DBLE(N)**3 ) - $ DBLE(N)**2 * * nrhs * 8 N^2 flops for LU solve. * NOPS = NOPS + 8.0D+0*(DBLE(N)**2)*DBLE(NRHS) * * Calculate total megaflops -- factorization * and solve -- for WALL and CPU time, and print * output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) $ THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * * Print WALL time if supported * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ PASSED * * Print CPU time if supported * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) $ THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, $ NB, NRHS, NBRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ PASSED END IF 10 CONTINUE 20 CONTINUE * IF( CHECK.AND.( SRESID.GT.THRESH ) ) THEN * * Compute fresid = ||A - P*L*U|| / (||A|| * N * eps) * CALL PZGETRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPW ) ) CALL PZLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGETRRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRRV', LIPIV, $ 1, MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGETRRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID END IF END IF 30 CONTINUE 40 CONTINUE CALL BLACS_GRIDEXIT( ICTXT ) 50 CONTINUE * * Print ending messages and close output file * 60 CONTINUE IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N NB NRHS NBRHS P Q LU Time ', $ 'Sol Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- ----- --- ---- ----- ---- ---- -------- ', $ '-------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I5, 1X, I3, 1X, I5, 1X, I4, 1X, I4, 1X, $ I4, 1X, F8.2, 1X, F8.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - P*L*U|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PZLUDRIVER * END scalapack-2.0.2/TESTING/LIN/pzluinfo.f000644 000766 000024 00000041101 10363532303 017460 0ustar00juliestaff000000 000000 SUBROUTINE PZLUINFO( SUMMRY, NOUT, NMAT, MVAL, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NNR, NRVAL, LDNRVAL, NNBR, $ NBRVAL, LDNBRVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, EST, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL EST CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDNVAL ), NBRVAL( LDNBRVAL ), $ NBVAL( LDNBVAL ), NRVAL( LDNRVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ WORK( * ) * .. * * Purpose * ======= * * PZLUINFO gets needed startup information for LU factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for M and N. * * MVAL (global output) INTEGER array, dimension (LDNVAL) * The values of M (number of rows in matrix) to run the code * with. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * M and N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * EST (global output) LOGICAL * Flag indicating if condition estimation and iterative * refinement routines are to be exercised. * * WORK (local workspace) INTEGER array of dimension >= * MAX( 6, 2*LDNVAL+LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL ) * Used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='LU.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Read the flag that indicates whether to test the condition * estimation and iterative refinement routines. * READ( NIN, FMT = * ) EST * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NNR WORK( 4 ) = NNBR WORK( 5 ) = NGRIDS IF( EST ) THEN WORK( 6 ) = 1 ELSE WORK( 6 ) = 0 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 6, 1, WORK, 6 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Ax=b by LU factorization.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex double precision LU factorization '// $ 'and solve.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LU|| / '// $ '(||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'LU time : Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 6, 1, WORK, 6, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NNR = WORK( 3 ) NNBR = WORK( 4 ) NGRIDS = WORK( 5 ) IF( WORK( 6 ).EQ.1 ) THEN EST = .TRUE. ELSE EST = .FALSE. END IF * I = 2*NMAT + NNB + NNR + NNBR + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZLUINFO * END scalapack-2.0.2/TESTING/LIN/pzmatgen.f000644 000766 000024 00000046353 10363532303 017455 0ustar00juliestaff000000 000000 SUBROUTINE PZMATGEN( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PZMATGEN : Parallel Complex Double precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * otherwise a random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) COMPLEX*16, pointer into the local memory * to an array of dimension ( LDA, * ) containing the local * pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB DOUBLE PRECISION DUMMY * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDRAND EXTERNAL ICEIL, NUMROC, LSAME, PDRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) * INFO = 0 IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZMATGEN', INFO ) RETURN END IF * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = 2*NPMB JUMP3 = 2*M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = 2*MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = DCMPLX( PDRAND(0), PDRAND(0) ) 30 CONTINUE IF( SYMM ) THEN A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) ELSE A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), ZERO ) DUMMY = PDRAND(0) END IF DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) IF( HERM ) THEN A(IK,JK+J) = DCONJG( A(IK+J,JK) ) ELSE A(IK,JK+J) = A(IK+J,JK) END IF 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = 2*NQNB JUMP3 = 2*N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = 2*NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 IF( SYMM ) THEN A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) ELSE A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ TWO*PDRAND(0) - ONE ) END IF JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = 2*NQNB JUMP3 = 2*N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = 2*NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 IF( TRAN ) THEN A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) ELSE A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ TWO*PDRAND(0) - ONE ) END IF JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSE * JUMP1 = 1 JUMP2 = 2*NPMB JUMP3 = 2*M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = 2*MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 IF( HERM ) THEN A(IK,JK+J) = DCMPLX( $ ABS(DBLE(A(IK,JK+J)))+2*MAXMN, ZERO ) ELSE A(IK,JK+J)= DCMPLX( ABS(DBLE(A(IK,JK+J)))+MAXMN, $ ABS(DIMAG(A(IK,JK+J)))+ MAXMN ) END IF IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PZMATGEN * END scalapack-2.0.2/TESTING/LIN/pzpbdriver.f000644 000766 000024 00000074237 10363532303 020021 0ustar00juliestaff000000 000000 PROGRAM PZPBDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PZPBDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by ZPB. This test driver performs an * A = L*L**H factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM/ZPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER MEMSIZ, NTESTS, ZPLXSZ COMPLEX*16 PADVAL PARAMETER ( ZPLXSZ = 16, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BW, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, I, $ IAM, IASEED, IBSEED, ICTXT, ICTXTB, IERR_TEMP, $ IMIDPAD, INFO, IPA, IPB, IPOSTPAD, IPREPAD, $ IPW, IPW_SIZE, IPW_SOLVE, IPW_SOLVE_SIZE, $ IP_DRIVER_W, IP_FILLIN, J, K, KFAIL, KPASS, $ KSKIP, KTESTS, MYCOL, MYRHS_SIZE, MYROW, N, NB, $ NBW, NGRIDS, NMAT, NNB, NNBR, NNR, NOUT, NP, $ NPCOL, NPROCS, NPROCS_REAL, NPROW, NQ, NRHS, $ N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWVAL( NTESTS ), DESCA( 7 ), DESCA2D( DLEN_ ), $ DESCB( 7 ), DESCB2D( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZBMATGEN, $ PZCHEKPAD, PZFILLPAD, PZMATGEN, PZPBINFO, $ PZPBLASCHK, PZPBTRF, PZPBTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PZLANGE EXTERNAL LSAME, NUMROC, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PZPBINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NBW, $ BWVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, $ NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BW = BWVAL( BW_NUM ) IF( BW.LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Band', 'bw', BW IERR( 1 ) = 1 END IF * IF( BW.GT.N-1 ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*BW-1)/NPCOL + 1 ) $ + BW NB = MAX( NB, 2*BW ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*BW, N ) ) THEN IERR( 1 ) = 1 ENDIF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (BW+1), (BW+1), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((BW+1)+10) IMIDPAD = 10 IPOSTPAD = ((BW+1)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, (BW+1), N, $ (BW+1), NB, 0, 0, $ ICTXT,((BW+1)+10), IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((BW+1)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + DESCA2D( LLD_ )* $ DESCA2D( NB_ ) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (NB+2*BW)*BW * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = BW*BW * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((BW+1)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PZLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PZPBLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,MAX(BW*(BW+2),NB))+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PZBMATGEN( ICTXT, UPLO, 'B', BW, BW, N, (BW+1), NB, $ MEM( IPA ), ((BW+1)+10), 0, 0, IASEED, $ MYROW, MYCOL, NPROW, NPCOL ) * CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ ((BW+1)+10), IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PZLANGE( '1', (BW+1), $ N, MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), ((BW+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PZPBTRF( UPLO, N, BW, MEM( IPA ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PZPBTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PZCHEKPAD( ICTXT, 'PZPBTRF', NP, $ NQ, MEM( IPA-IPREPAD ), ((BW+1)+10), $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (BW*NRHS) * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PZMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PZFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PZPBTRS( UPLO, N, BW, NRHS, MEM( IPA ), 1, $ DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PZPBTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZPBTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * CALL PZPBLASCHK( 'H', UPLO, N, BW, BW, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * NOPS = NOPS + DBLE(BW)*( -2.D0 / 3.D0+DBLE(BW)* $ ( -1.D0+DBLE(BW)*( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( 1.D0+DBLE(BW)*( 3.D0 / $ 2.D0+DBLE(BW)*( 1.D0 / 2.D0 ) ) ) NOPS = NOPS + DBLE(BW)*( -1.D0 / 6.D0+DBLE(BW) $ *( -1.D0 /2.D0+DBLE(BW) $ *( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( DBLE(BW) / $ 2.D0*( 1.D0+DBLE(BW) ) ) * NOPS = NOPS + $ DBLE(NRHS)*( ( 2*DBLE(N)-DBLE(BW) )* $ ( DBLE(BW)+1.D0 ) )+ DBLE(NRHS)* $ ( DBLE(BW)*( 2*DBLE(N)- $ ( DBLE(BW)+1.D0 ) ) ) * * * Second calc to represent actual hardware speed * * NB bw^2 flops for LLt factorization in 1st proc * NOPS2 = ( (DBLE(N_FIRST))* DBLE(BW)**2 ) * IF ( NPROCS_REAL .GT. 1) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in last processor * NOPS2 = NOPS2 + $ 4*( (DBLE(N_LAST)*DBLE(BW)**2) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 4*( (DBLE(NB)*DBLE(BW)**2) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ ( NPROCS_REAL-1 ) * ( BW*BW*BW/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ ( NPROCS_REAL-2 ) * ( 2 * BW*BW*BW ) ENDIF * * * nrhs * 4 n_first*bw flops for LLt solve in proc 1. * NOPS2 = NOPS2 + $ ( 4.0D+0*(DBLE(N_FIRST)*DBLE(BW))*DBLE(NRHS) ) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*4 n_last*bw flops for LLt solve in last. * NOPS2 = NOPS2 + $ 2*( 4.0D+0*(DBLE(N_LAST)*DBLE(BW))*DBLE(NRHS) ) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2 * nrhs * 4 NB*bw flops for LLt solve in others. * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( 4.0D+0*(DBLE(NB)*DBLE(BW))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1 ) * ( BW*BW ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * ( 3 * BW*BW ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UL N BW NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- ---- ----- -- ---- -------- ', $ '-------- ------ ------ ------' ) 9993 FORMAT( A4, 2X, A1, 1X, I6, 1X, I3, 1X, I4, 1X, $ I5, 1X, I2, 1X, $ I4, 1X, F8.3, F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PZPBTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pzpbinfo.f000644 000766 000024 00000044727 10363532303 017462 0ustar00juliestaff000000 000000 SUBROUTINE PZPBINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NBW, $ BWVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PZPBINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of subdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( UPLO, 'L' ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data represents ''Upper'// $ ''' or ''Lower'' triangular portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'TRANS : Whether solve is to be done with'// $ ' ''Transpose'' of matrix A (T,C) or not (N).' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bw : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bw ', ( BWVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZPBINFO * END scalapack-2.0.2/TESTING/LIN/pzpblaschk.f000644 000766 000024 00000025602 10363532303 017763 0ustar00juliestaff000000 000000 SUBROUTINE PZPBLASCHK( SYMM, UPLO, N, BWL, BWU, NRHS, X, IX, JX, $ DESCX, IASEED, A, IA, JA, DESCA, IBSEED, $ ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX*16 A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PZPBLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'H', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,max(bw*(bw+2),NB))+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,max(bw*(bw+2),NB))+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER BW, INFO, IPPRODUCT, WORK_MIN DOUBLE PRECISION DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D, PBZTRAN, $ PZMATGEN, ZGAMX2D, ZGEMM, ZGSUM2D, $ ZLASET * .. * .. External Functions .. INTEGER IZAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IZAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'H' ) ) THEN BW = BWL WORK_MIN = MAX(5,MAX(BW*(BW+2),NB))+2*NB ELSE BW = MAX(BWL, BWU) WORK_MIN = MAX(5,MAX(BW*(BW+2),NB))+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PZBLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (BW+1), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'H' )) THEN CALL PZBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PZBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PZPBDCMV( BW+1, BW, UPLO, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), (BW+2)*BW, INFO ) * * * Regenerate column of B * CALL PZMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PZAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PDZNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PDZNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PZBLASCHK * END scalapack-2.0.2/TESTING/LIN/pzpbmv1.f000644 000766 000024 00000073165 10607174520 017234 0ustar00juliestaff000000 000000 SUBROUTINE PZPBDCMV( LDBW, BW, UPLO, N, A, JA, DESCA, NRHS, B, IB, $ DESCB, X, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LDBW, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ), WORK( * ), X( * ) * .. * * * Purpose * ======= * * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZPBTRF and this is stored in AF. If a linear system * is to be solved using PZPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, $ FIRST_PROC, I, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, J, JA_NEW, LLDA, LLDB, MYCOL, $ MYROW, MY_NUM_COLS, NB, NP, NPCOL, NPROW, $ NP_SAVE, ODD_SIZE, OFST, PART_OFFSET, $ PART_SIZE, STORE_M_B, STORE_N_A INTEGER NUMROC_SIZE * .. * .. Local Arrays .. INTEGER PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * ICTXT = DESCA( CTXT_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) LLDA = DESCA( LLD_ ) STORE_N_A = DESCA( N_ ) LLDB = DESCB( LLD_ ) STORE_M_B = DESCB( M_ ) * * * Pre-calculate bw^2 * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -14 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 7*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 7*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PZPBDCMV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, $ 'PZPBDCMV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1005 PARAM_CHECK( 15, 2 ) = 1004 PARAM_CHECK( 14, 2 ) = 1003 PARAM_CHECK( 13, 2 ) = 1002 PARAM_CHECK( 12, 2 ) = 1001 PARAM_CHECK( 11, 2 ) = 9 PARAM_CHECK( 10, 2 ) = 705 PARAM_CHECK( 9, 2 ) = 704 PARAM_CHECK( 8, 2 ) = 703 PARAM_CHECK( 7, 2 ) = 701 PARAM_CHECK( 6, 2 ) = 6 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZPBDCMV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW ENDIF * * * * Zero out solution to use to accumulate answer * NUMROC_SIZE = $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL) * DO 2279 J=1,NRHS DO 4502 I=1,NUMROC_SIZE X( (J-1)*LLDB + I ) = CZERO 4502 CONTINUE 2279 CONTINUE * DO 5642 I=1, (BW+2)*BW WORK( I ) = CZERO 5642 CONTINUE * * Begin main code * * ************************************** * IF ( LSAME( UPLO, 'L' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * IF( MYCOL .LT. NPCOL-1 ) THEN * ...must send triangle in upper half of matrix to right * * Transpose triangle in preparation for sending * CALL ZLATCPY( 'U', BW, BW, $ A( LLDA*( NUMROC_SIZE-BW )+1+BW ), $ LLDA-1, WORK( 1 ), BW ) * * Send the triangle to neighboring processor to right * CALL ZTRSD2D(ICTXT, 'L', 'N', $ BW, BW, $ WORK( 1 ), $ BW, MYROW, MYCOL+1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL ZHBMV( 'L', NUMROC_SIZE, BW, CONE, A( OFST+1 ), LLDA, $ B(PART_OFFSET+1), 1, CZERO, X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL ZCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( BW*BW+1+BW-DL_N_N ), 1 ) * CALL ZTRMV( 'U', 'N', 'N', BW, $ A( LLDA*( NUMROC_SIZE-BW )+1+BW ), LLDA-1, $ WORK( BW*BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 10 I = DL_N_M-DL_N_N, DL_N_M WORK( BW*BW+I ) = 0 10 CONTINUE ENDIF * * Send the result to the neighbor * CALL ZGESD2D( ICTXT, BW, 1, $ WORK( BW*BW+1 ), BW, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 20 I=1, BW*( BW+2 ) WORK( I ) = CZERO 20 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL ZCOPY( DL_P_M, B( 1 ), 1, $ WORK( BW*BW+1 ), 1) * * Receive the triangle prior to multiplying by it. * CALL ZTRRV2D(ICTXT, 'L', 'N', $ BW, BW, $ WORK( 1 ), BW, MYROW, MYCOL-1 ) * CALL ZTRMV( $ 'L', $ 'N', $ 'N', BW, $ WORK( 1 ), BW, $ WORK( BW*BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DL_P_M .GT. DL_P_N ) THEN DO 30 I=1, DL_P_M-DL_P_N WORK( BW*BW+I ) = 0 30 CONTINUE ENDIF * * Send result back * CALL ZGESD2D( ICTXT, BW, 1, WORK(BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL ZGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL ZAXPY( BW, CONE, $ WORK( BW*BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL ZGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL ZAXPY( BW, CONE, $ WORK( BW*BW+1 ), 1, $ X( NUMROC_SIZE-BW+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * ************************************** * IF ( LSAME( UPLO, 'U' ) ) THEN * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN * DL_P_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) DL_P_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN * DL_N_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) DL_N_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * IF( MYCOL .GT. 0 ) THEN * ...must send triangle in lower half of matrix to left * * Transpose triangle in preparation for sending * CALL ZLATCPY( 'L', BW, BW, A( OFST+1 ), $ LLDA-1, WORK( 1 ), BW ) * * Send the triangle to neighboring processor to left * CALL ZTRSD2D(ICTXT, 'U', 'N', $ BW, BW, $ WORK( 1 ), $ BW, MYROW, MYCOL-1 ) * ENDIF * * Use main partition in each processor to multiply locally * CALL ZHBMV( 'U', NUMROC_SIZE, BW, CONE, A( OFST+1 ), LLDA, $ B(PART_OFFSET+1), 1, CZERO, X( PART_OFFSET+1 ), 1 ) * * * IF ( MYCOL .LT. NPCOL-1 ) THEN * * Do the multiplication of the triangle in the lower half * CALL ZCOPY( DL_N_N, $ B( NUMROC_SIZE-DL_N_N+1 ), $ 1, WORK( BW*BW+1+BW-DL_N_N ), 1 ) * * Receive the triangle prior to multiplying by it. * CALL ZTRRV2D(ICTXT, 'U', 'N', $ BW, BW, $ WORK( 1 ), BW, MYROW, MYCOL+1 ) * CALL ZTRMV( 'U', 'N', 'N', BW, $ WORK( 1 ), BW, $ WORK( BW*BW+1 ), 1) * * Zero out extraneous elements caused by TRMV if any * IF( DL_N_M .GT. DL_N_N ) THEN DO 40 I = DL_N_M-DL_N_N, DL_N_M WORK( BW*BW+I ) = 0 40 CONTINUE ENDIF * * Send the result to the neighbor * CALL ZGESD2D( ICTXT, BW, 1, $ WORK( BW*BW+1 ), BW, MYROW, MYCOL+1 ) * ENDIF * IF ( MYCOL .GT. 0 ) THEN * DO 50 I=1, BW*( BW+2 ) WORK( I ) = CZERO 50 CONTINUE * * Do the multiplication of the triangle in the upper half * * Copy vector to workspace * CALL ZCOPY( DL_P_M, B( 1 ), 1, $ WORK( BW*BW+1 ), 1) * CALL ZTRMV( $ 'L', $ 'N', $ 'N', BW, $ A( 1 ), LLDA-1, $ WORK( BW*BW+1 ), 1 ) * * Zero out extraneous results from TRMV if any * IF( DL_P_M .GT. DL_P_N ) THEN DO 60 I=1, DL_P_M-DL_P_N WORK( BW*BW+I ) = 0 60 CONTINUE ENDIF * * Send result back * CALL ZGESD2D( ICTXT, BW, 1, WORK(BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Receive vector result from neighboring processor * CALL ZGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL-1 ) * * Do addition of received vector * CALL ZAXPY( BW, CONE, $ WORK( BW*BW+1 ), 1, $ X( 1 ), 1 ) * ENDIF * * * IF( MYCOL .LT. NPCOL-1 ) THEN * * Receive returned result * CALL ZGERV2D( ICTXT, BW, 1, WORK( BW*BW+1 ), $ BW, MYROW, MYCOL+1 ) * * Do addition of received vector * CALL ZAXPY( BW, CONE, $ WORK( BW*BW+1 ), 1, $ X( NUMROC_SIZE-BW+1 ), 1) * ENDIF * * ENDIF * * End of LSAME if * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * RETURN * * End of PZBhBMV1 * END scalapack-2.0.2/TESTING/LIN/pzpotrrv.f000644 000766 000024 00000027724 10363532303 017537 0ustar00juliestaff000000 000000 SUBROUTINE PZPOTRRV( UPLO, N, A, IA, JA, DESCA, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZPOTRRV recomputes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from L or U * computed by PZPOTRF. The routine performs the Cholesky factorization * in reverse. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * hermitian distributed matrix sub( A ) is stored: * stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the factors L or U of the * distributed matrix sub( A ) from the Cholesky factorization. * On exit, the original distributed matrix sub( A ) is * restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK >= MB_A*NB_A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) COMPLEX*16 CONE, ZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IL, J, JB, JL, JN, MYCOL, $ MYROW, NPCOL, NPROW * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PB_TOPGET, PB_TOPSET, $ PZLACPY, PZLASET, PZHERK, PZTRMM * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL ICEIL, INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * UPPER = LSAME( UPLO, 'U' ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JL = MAX( ( ( JA+N-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) IL = MAX( ( ( IA+N-2 ) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) IAROW = INDXG2P( IL, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) * * Define array descriptor for working array WORK * CALL DESCSET( DESCW, DESCA( MB_ ), DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * IF ( UPPER ) THEN * * Compute A from the Cholesky factor U : A = U'*U. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'S-ring' ) * DO 10 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + U'*U * CALL PZHERK( 'Upper', 'Conjugate Transpose', JA+N-J-JB, JB, $ ONE, A, IL, J+JB, DESCA, ONE, A, IL+JB, J+JB, $ DESCA ) * * Copy current diagonal block of A into workspace * CALL PZLACPY( 'All', JB, JB, A, IL, J, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict lower triangular part of diagonal block, to make * it U1. * CALL PZLASET( 'Lower', JB-1, JB, ZERO, ZERO, A, IL+1, J, $ DESCA ) * * Update the row panel U with the triangular matrix * CALL PZTRMM( 'Left', 'Upper', 'Conjugate Transpose', $ 'Non-Unit', JB, N-J+JA, CONE, WORK, 1, 1, $ DESCW, A, IL, J, DESCA ) * * Restore the strict lower triangular part of diagonal block. * CALL PZLACPY( 'Lower', JB-1, JB, WORK, 2, 1, DESCW, A, $ IL+1, J, DESCA ) * IL = IL - DESCA( MB_ ) DESCW( RSRC_ ) = MOD( DESCW( RSRC_ ) + NPROW - 1, NPROW ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + U'*U * CALL PZHERK( 'Upper', 'Conjugate Transpose', N-JB, JB, ONE, A, $ IA, JA+JB, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PZLACPY( 'All', JB, JB, A, IA, JA, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict lower triangular part of diagonal block, to make * it U1. * CALL PZLASET( 'Lower', JB-1, JB, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Update the row panel U with the triangular matrix * CALL PZTRMM( 'Left', 'Upper', 'Conjugate Transpose', 'Non-Unit', $ JB, N, CONE, WORK, 1, 1, DESCW, A, IA, JA, DESCA ) * * Restore the strict lower triangular part of diagonal block. * CALL PZLACPY( 'Lower', JB-1, JB, WORK, 2, 1, DESCW, A, IA+1, $ JA, DESCA ) * ELSE * * Compute A from the Cholesky factor L : A = L*L'. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * DO 20 J = JL, JN+1, -DESCA( NB_ ) * JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + L*L' * CALL PZHERK( 'Lower', 'No Transpose', IA+N-IL-JB, JB, ONE, A, $ IL+JB, J, DESCA, ONE, A, IL+JB, J+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PZLACPY( 'All', JB, JB, A, IL, J, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict upper triangular part of diagonal block, to make * it L1. * CALL PZLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IL, J+1, $ DESCA ) * * Update the column panel L with the triangular matrix * CALL PZTRMM( 'Right', 'Lower', 'Conjugate transpose', $ 'Non-Unit', IA+N-IL, JB, CONE, WORK, 1, 1, $ DESCW, A, IL, J, DESCA ) * * Restore the strict upper triangular part of diagonal block. * CALL PZLACPY( 'Upper', JB, JB-1, WORK, 1, 2, DESCW, A, $ IL, J+1, DESCA ) * IL = IL - DESCA( MB_ ) DESCW( RSRC_ ) = MOD( DESCW( RSRC_ ) + NPROW - 1, NPROW ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 20 CONTINUE * * Handle first block separately * JB = MIN( JN-JA+1, DESCA( NB_ ) ) * * Update the trailing matrix, A = A + L*L' * CALL PZHERK( 'Lower', 'No Transpose', N-JB, JB, ONE, A, $ IA+JB, JA, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * * Copy current diagonal block of A into workspace * CALL PZLACPY( 'All', JB, JB, A, IA, JA, DESCA, WORK, 1, 1, $ DESCW ) * * Zero strict upper triangular part of diagonal block, to make * it L1. * CALL PZLASET( 'Upper', JB, JB-1, ZERO, ZERO, A, IA, JA+1, $ DESCA ) * * Update the column panel L with the triangular matrix * CALL PZTRMM( 'Right', 'Lower', 'Conjugate transpose', $ 'Non-Unit', N, JB, CONE, WORK, 1, 1, DESCW, A, $ IA, JA, DESCA ) * * Restore the strict upper triangular part of diagonal block. * CALL PZLACPY( 'Upper', JB, JB-1, WORK, 1, 2, DESCW, A, IA, $ JA+1, DESCA ) * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PZPOTRRV * END scalapack-2.0.2/TESTING/LIN/pzptdriver.f000644 000766 000024 00000076370 10363532303 020043 0ustar00juliestaff000000 000000 PROGRAM PZPTDRIVER * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Purpose * ======= * * PZPTDRIVER is a test program for the * ScaLAPACK Band Cholesky routines corresponding to the options * indicated by ZPT. This test driver performs an * A = L*L**H factorization * and solves a linear system with the factors for 1 or more RHS. * * The program must be driven by a short data file. * Here's an example file: *'ScaLAPACK, Version 1.2, banded linear systems input file' *'PVM.' *'' output file name (if any) *6 device out *'L' define Lower or Upper *9 number of problem sizes *1 5 17 28 37 121 200 1023 2048 3073 values of N *6 number of bandwidths *1 2 4 10 31 64 values of BW *1 number of NB's *-1 3 4 5 values of NB (-1 for automatic choice) *1 number of NRHS's (must be 1) *8 values of NRHS *1 number of NBRHS's (ignored) *1 values of NBRHS (ignored) *6 number of process grids *1 2 3 4 5 7 8 15 26 47 64 values of "Number of Process Columns" *3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 6200000. * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM/ZPLXSZ ) * All arrays used by ScaLAPACK routines are allocated from * this array and referenced by pointers. The integer IPB, * for example, is a pointer to the starting element of MEM for * the solution vector(s) B. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. INTEGER TOTMEM PARAMETER ( TOTMEM = 3000000 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * DOUBLE PRECISION ZERO INTEGER MEMSIZ, NTESTS, ZPLXSZ COMPLEX*16 PADVAL PARAMETER ( ZPLXSZ = 16, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER BW, BW_NUM, FILLIN_SIZE, FREE_PTR, H, HH, I, $ IAM, IASEED, IBSEED, ICTXT, ICTXTB, IERR_TEMP, $ IMIDPAD, INFO, INT_TEMP, IPA, IPB, IPOSTPAD, $ IPREPAD, IPW, IPW_SIZE, IPW_SOLVE, $ IPW_SOLVE_SIZE, IP_DRIVER_W, IP_FILLIN, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, MYCOL, MYRHS_SIZE, $ MYROW, N, NB, NBW, NGRIDS, NMAT, NNB, NNBR, $ NNR, NOUT, NP, NPCOL, NPROCS, NPROCS_REAL, $ NPROW, NQ, NRHS, N_FIRST, N_LAST, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, NOPS, NOPS2, SRESID, TMFLOPS, $ TMFLOPS2 * .. * .. Local Arrays .. INTEGER BWVAL( NTESTS ), DESCA( 7 ), DESCA2D( DLEN_ ), $ DESCB( 7 ), DESCB2D( DLEN_ ), IERR( 1 ), $ NBRVAL( NTESTS ), NBVAL( NTESTS ), $ NRVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZBMATGEN, $ PZCHEKPAD, PZFILLPAD, PZMATGEN, PZPTINFO, $ PZPTLASCHK, PZPTTRF, PZPTTRS, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC LOGICAL LSAME DOUBLE PRECISION PZLANGE EXTERNAL LSAME, NUMROC, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Data Statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * * * * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 IBSEED = 200 * CALL PZPTINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NBW, $ BWVAL, NTESTS, NNB, NBVAL, NTESTS, NNR, NRVAL, $ NTESTS, NNBR, NBRVAL, NTESTS, NGRIDS, PVAL, NTESTS, $ QVAL, NTESTS, THRESH, MEM, IAM, NPROCS ) * CHECK = ( THRESH.GE.0.0D+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 50 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) * * * Define transpose process grid * CALL BLACS_GET( -1, 0, ICTXTB ) CALL BLACS_GRIDINIT( ICTXTB, 'Column-major', NPCOL, NPROW ) * * Go to bottom of process grid loop if this case doesn't use my * process * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.LT.0 .OR. MYCOL.LT.0 ) THEN GO TO 50 ENDIF * DO 40 J = 1, NMAT * IERR( 1 ) = 0 * N = NVAL( J ) * * Make sure matrix information is correct * IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'size' KSKIP = KSKIP + 1 GO TO 40 END IF * * DO 45 BW_NUM = 1, NBW * IERR( 1 ) = 0 * BW = 1 IF( BW.LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Band', 'bw', BW IERR( 1 ) = 1 END IF * IF( BW.GT.N-1 ) THEN IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 45 END IF * DO 30 K = 1, NNB * IERR( 1 ) = 0 * NB = NBVAL( K ) IF( NB.LT.0 ) THEN NB =( (N-(NPCOL-1)*INT_ONE-1)/NPCOL + 1 ) $ + INT_ONE NB = MAX( NB, 2*INT_ONE ) NB = MIN( N, NB ) END IF * * Make sure NB is legal * IERR( 1 ) = 0 IF( NB.LT.MIN( 2*INT_ONE, N ) ) THEN IERR( 1 ) = 1 ENDIF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, $ -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN KSKIP = KSKIP + 1 GO TO 30 END IF * * Padding constants * NP = NUMROC( (2), (2), $ MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( CHECK ) THEN IPREPAD = ((2)+10) IMIDPAD = 10 IPOSTPAD = ((2)+10) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA2D, N, (2), $ NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCA( 1 ) = 501 DESCA( 3 ) = N DESCA( 4 ) = NB DESCA( 5 ) = 0 DESCA( 2 ) = ICTXT DESCA( 6 ) = ((2)+10) DESCA( 7 ) = 0 * IERR_TEMP = IERR( 1 ) IERR( 1 ) = 0 IERR( 1 ) = MIN( IERR( 1 ), IERR_TEMP ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 30 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * FREE_PTR = 1 IPB = 0 * * Save room for prepadding FREE_PTR = FREE_PTR + IPREPAD * IPA = FREE_PTR FREE_PTR = FREE_PTR + (NB+10)*(2) $ + IPOSTPAD * * Add memory for fillin * Fillin space needs to store: * Fillin spike: * Contribution to previous proc's diagonal block of * reduced system: * Off-diagonal block of reduced system: * Diagonal block of reduced system: * FILLIN_SIZE = $ (12*NPCOL + 3*NB) * * Claim memory for fillin * FREE_PTR = FREE_PTR + IPREPAD IP_FILLIN = FREE_PTR FREE_PTR = FREE_PTR + FILLIN_SIZE * * Workspace needed by computational routines: * IPW_SIZE = 0 * * factorization: * IPW_SIZE = 8*NPCOL * * Claim memory for IPW * IPW = FREE_PTR FREE_PTR = FREE_PTR + IPW_SIZE * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ 'divide and conquer factorization', $ (FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * * Worksize needed for LAPRNT WORKSIZ = MAX( ((2)+10), NB ) * IF( CHECK ) THEN * * Calculate the amount of workspace required by * the checking routines. * * PZLANGE WORKSIZ = MAX( WORKSIZ, DESCA2D( NB_ ) ) * * PZPTLASCHK WORKSIZ = MAX( WORKSIZ, $ MAX(5,NB)+2*NB ) END IF * FREE_PTR = FREE_PTR + IPREPAD IP_DRIVER_W = FREE_PTR FREE_PTR = FREE_PTR + WORKSIZ + IPOSTPAD * * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'factorization', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, $ 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 30 END IF * CALL PZBMATGEN( ICTXT, UPLO, 'T', BW, BW, N, (2), NB, $ MEM( IPA ), NB+10, 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) CALL PZFILLPAD( ICTXT, NQ, NP, MEM( IPA-IPREPAD ), $ NB+10, IPREPAD, IPOSTPAD, $ PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) * * Calculate norm of A for residual error-checking * IF( CHECK ) THEN * ANORM = PZLANGE( 'I', N, $ (2), MEM( IPA ), 1, 1, $ DESCA2D, MEM( IP_DRIVER_W ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', NQ, NP, $ MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), WORKSIZ, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * IF( LSAME( UPLO, 'L' ) ) THEN INT_TEMP = 0 ELSE INT_TEMP = DESCA2D( LLD_ ) ENDIF * * For SPD Tridiagonal complex matrices, diagonal is stored * as a real. Thus, compact D into half the space * DO 10 H=1, NUMROC(N,NB,MYCOL,0,NPCOL)/2 MEM( IPA+INT_TEMP+H-1 ) = MEM( IPA+INT_TEMP+2*H-2 ) $ +MEM( IPA+INT_TEMP+2*H-1 )*( 0.0D+0, 1.0D+0 ) 10 CONTINUE IF( 2*(NUMROC(N,NB,MYCOL,0,NPCOL)/2).NE. $ NUMROC(N,NB,MYCOL,0,NPCOL) ) THEN H=NUMROC(N,NB,MYCOL,0,NPCOL)/2+1 MEM( IPA+INT_TEMP+H-1 ) = MEM( IPA+INT_TEMP+2*H-2 ) ENDIF * * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform factorization * CALL SLTIMER( 1 ) * CALL PZPTTRF( N, MEM( IPA+INT_TEMP ), $ MEM( IPA+1*( NB+10-INT_TEMP ) ), 1, DESCA, $ MEM( IP_FILLIN ), FILLIN_SIZE, MEM( IPW ), $ IPW_SIZE, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) 'PZPTTRF INFO=', INFO ENDIF KFAIL = KFAIL + 1 GO TO 30 END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PZCHEKPAD( ICTXT, 'PZPTTRF', NQ, $ NP, MEM( IPA-IPREPAD ), NB+10, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * * * Loop over the different values for NRHS * DO 20 HH = 1, NNR * IERR( 1 ) = 0 * NRHS = NRVAL( HH ) * * Initialize Array Descriptor for rhs * CALL DESCINIT( DESCB2D, N, NRHS, NB, 1, 0, 0, $ ICTXTB, NB+10, IERR( 1 ) ) * * Convert this to 1D descriptor * DESCB( 1 ) = 502 DESCB( 3 ) = N DESCB( 4 ) = NB DESCB( 5 ) = 0 DESCB( 2 ) = ICTXT DESCB( 6 ) = DESCB2D( LLD_ ) DESCB( 7 ) = 0 * * reset free_ptr to reuse space for right hand sides * IF( IPB .GT. 0 ) THEN FREE_PTR = IPB ENDIF * FREE_PTR = FREE_PTR + IPREPAD IPB = FREE_PTR FREE_PTR = FREE_PTR + NRHS*DESCB2D( LLD_ ) $ + IPOSTPAD * * Allocate workspace for workspace in TRS routine: * IPW_SOLVE_SIZE = (10+2*MIN(100,NRHS))*NPCOL+4*NRHS * IPW_SOLVE = FREE_PTR FREE_PTR = FREE_PTR + IPW_SOLVE_SIZE * IERR( 1 ) = 0 IF( FREE_PTR.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'solve', $ ( FREE_PTR )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, $ IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 15 END IF * MYRHS_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Generate RHS * CALL PZMATGEN(ICTXTB, 'No', 'No', $ DESCB2D( M_ ), DESCB2D( N_ ), $ DESCB2D( MB_ ), DESCB2D( NB_ ), $ MEM( IPB ), $ DESCB2D( LLD_ ), DESCB2D( RSRC_ ), $ DESCB2D( CSRC_ ), $ IBSEED, 0, MYRHS_SIZE, 0, NRHS, MYCOL, $ MYROW, NPCOL, NPROW ) * IF( CHECK ) THEN CALL PZFILLPAD( ICTXTB, NB, NRHS, $ MEM( IPB-IPREPAD ), $ DESCB2D( LLD_ ), $ IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * CALL BLACS_BARRIER( ICTXT, 'All') CALL SLTIMER( 2 ) * * Solve linear system via factorization * CALL PZPTTRS( UPLO, N, NRHS, MEM( IPA+INT_TEMP ), $ MEM( IPA+1*( NB+10-INT_TEMP ) ), 1, $ DESCA, MEM( IPB ), 1, DESCB, $ MEM( IP_FILLIN ), FILLIN_SIZE, $ MEM( IPW_SOLVE ), IPW_SOLVE_SIZE, $ INFO ) * CALL SLTIMER( 2 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) 'PZPTTRS INFO=', INFO KFAIL = KFAIL + 1 PASSED = 'FAILED' GO TO 20 END IF * IF( CHECK ) THEN * * check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZPTTRS-work', $ WORKSIZ, 1, $ MEM( IP_DRIVER_W-IPREPAD ), $ WORKSIZ, IPREPAD, $ IPOSTPAD, PADVAL ) * * check the solution to rhs * SRESID = ZERO * * Reset descriptor describing A to 1-by-P grid for * use in banded utility routines * CALL DESCINIT( DESCA2D, (2), N, $ (2), NB, 0, 0, $ ICTXT, (2), IERR( 1 ) ) CALL PZPTLASCHK( 'H', UPLO, N, BW, BW, NRHS, $ MEM( IPB ), 1, 1, DESCB2D, $ IASEED, MEM( IPA ), 1, 1, DESCA2D, $ IBSEED, ANORM, SRESID, $ MEM( IP_DRIVER_W ), WORKSIZ ) * IF( IAM.EQ.0 ) THEN IF( SRESID.GT.THRESH ) $ WRITE( NOUT, FMT = 9985 ) SRESID END IF * * The second test is a NaN trap * IF( ( SRESID.LE.THRESH ).AND. $ ( (SRESID-SRESID).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * END IF * 15 CONTINUE * Skipped tests jump to here to print out "SKIPPED" * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 2, 1, $ WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 2, 1, $ CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * NOPS = 0 NOPS2 = 0 * N_FIRST = NB NPROCS_REAL = ( N-1 )/NB + 1 N_LAST = MOD( N-1, NB ) + 1 * * NOPS = NOPS + DBLE(BW)*( -2.D0 / 3.D0+DBLE(BW)* $ ( -1.D0+DBLE(BW)*( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( 1.D0+DBLE(BW)*( 3.D0 / $ 2.D0+DBLE(BW)*( 1.D0 / 2.D0 ) ) ) NOPS = NOPS + DBLE(BW)*( -1.D0 / 6.D0+DBLE(BW) $ *( -1.D0 /2.D0+DBLE(BW) $ *( -1.D0 / 3.D0 ) ) ) + $ DBLE(N)*( DBLE(BW) / $ 2.D0*( 1.D0+DBLE(BW) ) ) * NOPS = NOPS + $ DBLE(NRHS)*( ( 2*DBLE(N)-DBLE(BW) )* $ ( DBLE(BW)+1.D0 ) )+ DBLE(NRHS)* $ ( DBLE(BW)*( 2*DBLE(N)- $ ( DBLE(BW)+1.D0 ) ) ) * * * Second calc to represent actual hardware speed * * NB bw^2 flops for LLt factorization in 1st proc * NOPS2 = ( (DBLE(N_FIRST))* DBLE(BW)**2 ) * IF ( NPROCS_REAL .GT. 1) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in last processor * NOPS2 = NOPS2 + $ 4*( (DBLE(N_LAST)*DBLE(BW)**2) ) ENDIF * IF ( NPROCS_REAL .GT. 2) THEN * 4 NB bw^2 flops for LLt factorization and * spike calc in other processors * NOPS2 = NOPS2 + (NPROCS_REAL-2)* $ 4*( (DBLE(NB)*DBLE(BW)**2) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ ( NPROCS_REAL-1 ) * ( BW*BW*BW/3 ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ ( NPROCS_REAL-2 ) * ( 2 * BW*BW*BW ) ENDIF * * * nrhs * 4 n_first*bw flops for LLt solve in proc 1. * NOPS2 = NOPS2 + $ ( 4.0D+0*(DBLE(N_FIRST)*DBLE(BW))*DBLE(NRHS) ) * IF ( NPROCS_REAL .GT. 1 ) THEN * * 2*nrhs*4 n_last*bw flops for LLt solve in last. * NOPS2 = NOPS2 + $ 2*( 4.0D+0*(DBLE(N_LAST)*DBLE(BW))*DBLE(NRHS) ) ENDIF * IF ( NPROCS_REAL .GT. 2 ) THEN * * 2 * nrhs * 4 NB*bw flops for LLt solve in others. * NOPS2 = NOPS2 + $ ( NPROCS_REAL-2)*2* $ ( 4.0D+0*(DBLE(NB)*DBLE(BW))*DBLE(NRHS) ) ENDIF * * Reduced system * NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-1 ) * ( BW*BW ) IF( NPROCS_REAL .GT. 1 ) THEN NOPS2 = NOPS2 + $ NRHS*( NPROCS_REAL-2 ) * ( 3 * BW*BW ) ENDIF * * * Multiply by 4 to get complex count * NOPS2 = NOPS2 * DBLE(4) * * Calculate total megaflops - factorization and/or * solve -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ) + WTIME( 2 ) .GT. 0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( WTIME( 1 )+WTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( WTIME( 1 )+WTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( WTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ WTIME( 1 ), WTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 )+CTIME( 2 ).GT.0.0D+0 ) THEN TMFLOPS2 = NOPS2 / $ ( ( CTIME( 1 )+CTIME( 2 ) ) * 1.0D+6 ) ELSE TMFLOPS2 = 0.0D+0 END IF * IF( CTIME( 2 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', UPLO, $ N, $ BW, $ NB, NRHS, NPROW, NPCOL, $ CTIME( 1 ), CTIME( 2 ), TMFLOPS, $ TMFLOPS2, PASSED * END IF 20 CONTINUE * * 30 CONTINUE * NNB loop * 45 CONTINUE * BW[] loop * 40 CONTINUE * NMAT loop * CALL BLACS_GRIDEXIT( ICTXT ) CALL BLACS_GRIDEXIT( ICTXTB ) * 50 CONTINUE * NGRIDS DROPOUT 60 CONTINUE * NGRIDS loop * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UL N BW NB NRHS P Q L*U Time ', $ 'Slv Time MFLOPS MFLOP2 CHECK' ) 9994 FORMAT( '---- -- ------ --- ---- ----- -- ---- -------- ', $ '-------- ------ ------ ------' ) 9993 FORMAT( A4, 2X, A1, 1X, I6, 1X, I3, 1X, I4, 1X, $ I5, 1X, I2, 1X, $ I4, 1X, F8.3, F9.4, F9.2, F9.2, 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - ', A4, '|| / (||A|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Ax-b||/(||x||*||A||*eps*N) ', F25.7 ) * STOP * * End of PZPTTRS_DRIVER * END * scalapack-2.0.2/TESTING/LIN/pzptinfo.f000644 000766 000024 00000044750 10363532303 017500 0ustar00juliestaff000000 000000 SUBROUTINE PZPTINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NBW, $ BWVAL, LDBWVAL, NNB, NBVAL, LDNBVAL, NNR, $ NRVAL, LDNRVAL, NNBR, NBRVAL, LDNBRVAL, $ NGRIDS, PVAL, LDPVAL, QVAL, LDQVAL, THRESH, $ WORK, IAM, NPROCS ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO CHARACTER*(*) SUMMRY INTEGER IAM, $ LDBWVAL, LDNBRVAL, LDNBVAL, LDNRVAL, LDNVAL, $ LDPVAL, LDQVAL, NGRIDS, NMAT, NNB, NNBR, NBW, $ NPROCS, NNR, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER NBRVAL( LDNBRVAL ), NBVAL( LDNBVAL ), $ NRVAL( LDNRVAL ), NVAL( LDNVAL ), $ BWVAL( LDBWVAL), $ PVAL( LDPVAL ), QVAL(LDQVAL), WORK( * ) * .. * * Purpose * ======= * * PZPTINFO get needed startup information for band factorization * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * NBW (global output) INTEGER * The number of different values that can be used for @bw@. * BWVAL (global output) INTEGER array, dimension (LDNVAL) * The values of BW (number of subdiagonals in matrix) to run * the code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NNR (global output) INTEGER * The number of different values that can be used for NRHS. * * NRVAL (global output) INTEGER array, dimension(LDNRVAL) * The values of NRHS (# of Right Hand Sides) to run the code * with. * * LDNRVAL (global input) INTEGER * The maximum number of different values that can be used for * NRHS, LDNRVAL >= NNR. * * NNBR (global output) INTEGER * The number of different values that can be used for NBRHS. * * NBRVAL (global output) INTEGER array, dimension (LDNBRVAL) * The values of NBRHS (RHS blocksize) to run the code with. * * LDNBRVAL (global input) INTEGER * The maximum number of different values that can be used for * NBRHS, LDNBRVAL >= NBRVAL. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * Not used (will be returned as all 1s) since proc grid is 1D * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 8, LDNVAL+2*LDNBVAL+LDNRVAL+LDNBRVAL+LDPVAL+LDQVAL * $ +3*LDNVAL) * Used to pack input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. INTEGER I, ICTXT CHARACTER*79 USRINFO DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'BLLT.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get bandwidths * READ( NIN, FMT = * ) NBW NBW = 1 IF( NBW.LT.1 .OR. NBW.GT.LDBWVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'BW', LDBWVAL GO TO 20 END IF READ( NIN, FMT = * ) ( BWVAL( I ), I = 1, NBW ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get values of NRHS * READ( NIN, FMT = * ) NNR IF( NNR.LT.1 .OR. NNR.GT.LDNRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NRHS', LDNRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NRVAL( I ), I = 1, NNR ) * * Get values of NBRHS * READ( NIN, FMT = * ) NNBR IF( NNBR.LT.1 .OR. NNBR.GT.LDNBRVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NBRHS', LDNBRVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBRVAL( I ), I = 1, NNBR ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 20 END IF * * Processor grid must be 1D so set PVAL to 1 DO 8738 I = 1, NGRIDS PVAL( I ) = 1 8738 CONTINUE * * Get values of Q * READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) I = 1 WORK( I ) = NMAT I = I+1 WORK( I ) = NBW I = I+1 WORK( I ) = NNB I = I+1 WORK( I ) = NNR I = I+1 WORK( I ) = NNBR I = I+1 WORK( I ) = NGRIDS I = I+1 IF( LSAME( UPLO, 'L' ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 2 END IF I = I+1 * Send number of elements to be sent CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, I-1, 1 ) * Send elements CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NBW, BWVAL, 1, WORK( I ), 1 ) I = I + NBW CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNR, NRVAL, 1, WORK( I ), 1 ) I = I + NNR CALL ICOPY( NNBR, NBRVAL, 1, WORK( I ), 1 ) I = I + NNBR CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', I-1, 1, WORK, I-1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK banded linear systems.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision band matrix solve ' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Solve residual = ||Ax - b|| / '// $ '(||x|| * ||A|| * eps * N)' IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - LL''|| /'// $ ' (||A|| * eps * N)' ELSE WRITE( NOUT, FMT = 9999 ) $ ' Factorization residual = ||A - U''U|| /'// $ ' (||A|| * eps * N)' END IF WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether data represents ''Upper'// $ ''' or ''Lower'' triangular portion of array A.' WRITE( NOUT, FMT = 9999 ) $ 'TRANS : Whether solve is to be done with'// $ ' ''Transpose'' of matrix A (T,C) or not (N).' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'bw : The number of diagonals '// $ 'in the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the column panels the'// $ ' matrix A is split into. [-1 for default]' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The total number of RHS to solve'// $ ' for.' WRITE( NOUT, FMT = 9999 ) $ 'NBRHS : The number of RHS to be put on '// $ 'a column of processes before going' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'Fact time: Time in seconds to factor the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'Sol Time: Time in seconds to solve the'// $ ' system.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for factor '// $ 'and solve using sequential operation count.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOP2 : Rough estimate of speed '// $ 'using actual op count (accurate big P,N).' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL(I), I = 1, MIN(NMAT, 10) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL(I), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'bw ', ( BWVAL(I), I = 1, MIN(NBW, 10) ) IF( NBW.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( BWVAL(I), I = 11, NBW ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL(I), I = 1, MIN(NNB, 10) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL(I), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NRHS ', ( NRVAL(I), I = 1, MIN(NNR, 10) ) IF( NNR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NRVAL(I), I = 11, NNR ) WRITE( NOUT, FMT = 9996 ) $ 'NBRHS', ( NBRVAL(I), I = 1, MIN(NNBR, 10) ) IF( NNBR.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBRVAL(I), I = 11, NNBR ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL(I), I = 1, MIN(NGRIDS, 10) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL(I), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, I, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 NMAT = WORK( I ) I = I+1 NBW = WORK( I ) I = I+1 NNB = WORK( I ) I = I+1 NNR = WORK( I ) I = I+1 NNBR = WORK( I ) I = I+1 NGRIDS = WORK( I ) I = I+1 IF( WORK( I ) .EQ. 1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF I = I+1 * I = NMAT + NBW + NNB + NNR + NNBR + 2*NGRIDS * CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NBW, WORK( I ), 1, BWVAL, 1 ) I = I + NBW CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NNR, WORK( I ), 1, NRVAL, 1 ) I = I + NNR CALL ICOPY( NNBR, WORK( I ), 1, NBRVAL, 1 ) I = I + NNBR CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) * CALL BLACS_ABORT( ICTXT, 1 ) STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ': ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZPTINFO * END scalapack-2.0.2/TESTING/LIN/pzptlaschk.f000644 000766 000024 00000027540 10363532303 020010 0ustar00juliestaff000000 000000 SUBROUTINE PZPTLASCHK( SYMM, UPLO, N, BWL, BWU, NRHS, X, IX, JX, $ DESCX, IASEED, A, IA, JA, DESCA, IBSEED, $ ANORM, RESID, WORK, WORKSIZ ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SYMM, UPLO INTEGER BWL, BWU, IA, IASEED, IBSEED, $ IX, JA, JX, N, NRHS, WORKSIZ DOUBLE PRECISION ANORM, RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX*16 A( * ), WORK( * ), X( * ) * .. External Functions .. LOGICAL LSAME * .. * * Purpose * ======= * * PZPTLASCHK computes the residual * || sub( A )*sub( X ) - B || / (|| sub( A ) ||*|| sub( X ) ||*eps*N) * to check the accuracy of the factorization and solve steps in the * LU and Cholesky decompositions, where sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), sub( X ) denotes X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SYMM (global input) CHARACTER * if SYMM = 'H', sub( A ) is a hermitian distributed band * matrix, otherwise sub( A ) is a general distributed matrix. * * UPLO (global input) CHARACTER * if SYMM = 'H', then * if UPLO = 'L', the lower half of the matrix is stored * if UPLO = 'U', the upper half of the matrix is stored * if SYMM != 'S' or 'H', then * if UPLO = 'D', the matrix is stable during factorization * without interchanges * if UPLO != 'D', the matrix is general * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right-hand-sides, i.e the number of columns * of the distributed matrix sub( X ). NRHS >= 1. * * X (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_X,LOCq(JX+NRHS-1). This array * contains the local pieces of the answer vector(s) sub( X ) of * sub( A ) sub( X ) - B, split up over a column of processes. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IBSEED (global input) INTEGER * The seed number to generate the original matrix B. * * ANORM (global input) DOUBLE PRECISION * The 1-norm or infinity norm of the distributed matrix * sub( A ). * * RESID (global output) DOUBLE PRECISION * The residual error: * ||sub( A )*sub( X )-B|| / (||sub( A )||*||sub( X )||*eps*N). * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * IF SYMM='S' * LWORK >= max(5,NB)+2*NB * IF SYMM!='S' or 'H' * LWORK >= max(5,NB)+2*NB * * WORKSIZ (local input) size of WORK. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, $ IIA, IIX, IPB, IPW, $ IXCOL, IXROW, J, JJA, JJX, LDA, $ MYCOL, MYROW, NB, NP, NPCOL, NPROW, NQ INTEGER I, START INTEGER BW, INFO, IPPRODUCT, WORK_MIN DOUBLE PRECISION DIVISOR, EPS, RESID1, NORMX * .. * .. Local Arrays .. * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D, PBZTRAN, $ PZMATGEN, ZGAMX2D, ZGEMM, ZGSUM2D, $ ZLASET * .. * .. External Functions .. INTEGER IZAMAX, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL IZAMAX, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get needed initial parameters * ICTXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * IF( LSAME( SYMM, 'H' ) ) THEN BW = BWL START = 1 WORK_MIN = MAX(5,NB)+2*NB ELSE BW = MAX(BWL, BWU) IF( LSAME( UPLO, 'D' )) THEN START = 1 ELSE START = 2 ENDIF WORK_MIN = MAX(5,NB)+2*NB ENDIF * IF ( WORKSIZ .LT. WORK_MIN ) THEN CALL PXERBLA( ICTXT, 'PZTLASCHK', -18 ) RETURN END IF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) RESID = 0.0D+0 DIVISOR = ANORM * EPS * DBLE( N ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) NP = NUMROC( (2), DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IPB = 1 IPPRODUCT = 1 + DESCA( NB_ ) IPW = 1 + 2*DESCA( NB_ ) * LDA = DESCA( LLD_ ) * * Regenerate A * IF( LSAME( SYMM, 'H' )) THEN CALL PZBMATGEN( ICTXT, UPLO, 'D', BW, BW, N, BW+1, $ DESCA( NB_ ), A, DESCA( LLD_ ), 0, 0, $ IASEED, MYROW, MYCOL, NPROW, NPCOL ) ELSE * CALL PZBMATGEN( ICTXT, 'N', UPLO, BWL, BWU, N, $ DESCA( MB_ ), DESCA( NB_ ), A, $ DESCA( LLD_ ), 0, 0, IASEED, MYROW, $ MYCOL, NPROW, NPCOL ) ENDIF IF( LSAME( UPLO, 'U' ) ) THEN * * * Matrix formed above has the diagonals shifted from what was * input to the tridiagonal routine. Shift them back. * * Send elements to neighboring processors * IF( MYCOL.LT.NPCOL-1 ) THEN CALL ZGESD2D( ICTXT, 1, 1, $ A( START+( DESCA( NB_ )-1 )*LDA ), $ LDA, MYROW, MYCOL+1 ) ENDIF * * Shift local elements * DO 230 I=DESCA( NB_ )-1,0,-1 A( START+(I+1)*LDA ) = A( START+(I)*LDA ) 230 CONTINUE * * Receive elements from neighboring processors * IF( MYCOL.GT.0 ) THEN CALL ZGERV2D( ICTXT, 1, 1, A( START), LDA, $ MYROW, MYCOL-1 ) ENDIF * ENDIF * * Loop over the rhs * RESID = 0.0 * DO 40 J = 1, NRHS * * Multiply A * current column of X * * CALL PZPBDCMV( BW+1, BW, UPLO, N, A, 1, DESCA, $ 1, X( 1 + (J-1)*DESCX( LLD_ )), 1, DESCX, $ WORK( IPPRODUCT ), WORK( IPW ), (BW+2)*BW, INFO ) * * * Regenerate column of B * CALL PZMATGEN( DESCX( CTXT_ ), 'No', 'No', DESCX( M_ ), $ DESCX( N_ ), DESCX( MB_ ), DESCX( NB_ ), $ WORK( IPB ), DESCX( LLD_ ), DESCX( RSRC_ ), $ DESCX( CSRC_ ), IBSEED, 0, NQ, J-1, 1, MYCOL, $ MYROW, NPCOL, NPROW ) * * Figure || A * X - B || & || X || * CALL PZAXPY( N, -ONE, WORK( IPPRODUCT ), 1, 1, DESCX, 1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * CALL PDZNRM2( N, NORMX, $ X, 1, J, DESCX, 1 ) * CALL PDZNRM2( N, RESID1, $ WORK( IPB ), 1, 1, DESCX, 1 ) * * * Calculate residual = ||Ax-b|| / (||x||*||A||*eps*N) * RESID1 = RESID1 / ( NORMX*DIVISOR ) * RESID = MAX( RESID, RESID1 ) * 40 CONTINUE * RETURN * * End of PZTLASCHK * END scalapack-2.0.2/TESTING/LIN/pzqrdriver.f000644 000766 000024 00000124010 10363532303 020023 0ustar00juliestaff000000 000000 PROGRAM PZQRDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * Purpose * ======= * * PZQRDRIVER is the main test program for the COMPLEX*16 * SCALAPACK QR factorization routines. This test driver performs a QR * QL, LQ, RQ, QP (QR factorization with column pivoting) or TZ * (complete unitary factorization) factorization and checks the * results. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 16 lines: * 'ScaLAPACK QR factorizations input file' * 'PVM machine' * 'QR.out' output file name (if any) * 6 device out * 6 number of factorizations * 'QR' 'QL' 'LQ' 'RQ' 'QP' 'TZ' factorization: QR, QL, LQ, RQ, QP, TZ * 4 number of problems sizes * 55 17 31 201 values of M * 5 71 31 201 values of N * 3 number of MB's and NB's * 4 3 5 values of MB * 4 7 3 values of NB * 7 number of process grids (ordered P & Q) * 1 2 1 4 2 3 8 values of P * 7 2 4 1 3 2 1 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ, DBLESZ and ZPLXSZ indicate the length in bytes on * the given platform for an integer, a double precision real * and a double precision complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, INTGSZ, MEMSIZ, NTESTS, TOTMEM, ZPLXSZ COMPLEX*16 PADVAL PARAMETER ( DBLESZ = 8, INTGSZ = 4, TOTMEM = 2000000, $ ZPLXSZ = 16, MEMSIZ = TOTMEM / ZPLXSZ, $ NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER*2 FACT CHARACTER*6 PASSED CHARACTER*7 ROUT CHARACTER*8 ROUTCHK CHARACTER*80 OUTFILE LOGICAL CHECK INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPPIV, IPREPAD, IPTAU, IPRW, IPW, J, $ K, KFAIL, KPASS, KSKIP, KTESTS, L, LIPIV, $ LRWORK, LTAU, LWORK, M, MAXMN, MB, MINMN, MNP, $ MNQ, MP, MYCOL, MYROW, N, NB, NFACT, NGRIDS, $ NMAT, NNB, NOUT, NPCOL, NPROCS, NPROW, NQ, $ WORKFCT, WORKRFCT, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Arrays .. CHARACTER*2 FACTOR( NTESTS ) INTEGER DESCA( DLEN_ ), IERR( 1 ), MBVAL( NTESTS ), $ MVAL( NTESTS ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZCHEKPAD, $ PZFILLPAD, PZGELQF, PZGELQRV, $ PZGEQLF, PZGEQLRV, PZGEQPF, $ PZQPPIV, PZGEQRF, PZGEQRRV, $ PZGERQF, PZGERQRV, PZTZRZRV, $ PZMATGEN, PZLAFCHK, PZQRINFO, $ PZTZRZF, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAMEN INTEGER ICEIL, NUMROC DOUBLE PRECISION PZLANGE EXTERNAL ICEIL, LSAMEN, NUMROC, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA KTESTS, KPASS, KFAIL, KSKIP /4*0/ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PZQRINFO( OUTFILE, NOUT, NFACT, FACTOR, NTESTS, NMAT, MVAL, $ NTESTS, NVAL, NTESTS, NNB, MBVAL, NTESTS, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Loop over the different factorization types * DO 40 I = 1, NFACT * FACT = FACTOR( I ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) IF( LSAMEN( 2, FACT, 'QR' ) ) THEN ROUT = 'PZGEQRF' ROUTCHK = 'PZGEQRRV' WRITE( NOUT, FMT = 9986 ) $ 'QR factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN ROUT = 'PZGEQLF' ROUTCHK = 'PZGEQLRV' WRITE( NOUT, FMT = 9986 ) $ 'QL factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN ROUT = 'PZGELQF' ROUTCHK = 'PZGELQRV' WRITE( NOUT, FMT = 9986 ) $ 'LQ factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN ROUT = 'PZGERQF' ROUTCHK = 'PZGERQRV' WRITE( NOUT, FMT = 9986 ) $ 'RQ factorization tests.' ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN ROUT = 'PZGEQPF' ROUTCHK = 'PZGEQRRV' WRITE( NOUT, FMT = 9986 ) $ 'QR factorization with column pivoting tests.' ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN ROUT = 'PZTZRZF' ROUTCHK = 'PZTZRZRV' WRITE( NOUT, FMT = 9986 ) $ 'Complete unitary factorization tests.' END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 J = 1, NGRIDS * NPROW = PVAL( J ) NPCOL = QVAL( J ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 K = 1, NMAT * M = MVAL( K ) N = NVAL( K ) * * Make sure matrix information is correct * IERR(1) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 L = 1, NNB * MB = MBVAL( L ) NB = NBVAL( L ) * * Make sure mb is legal * IERR( 1 ) = 0 IF( MB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MB', 'MB', MB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * MP = NUMROC( M, MB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) MNP = NUMROC( MIN( M, N ), MB, MYROW, 0, NPROW ) MNQ = NUMROC( MIN( M, N ), NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( MB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, MB, NB, 0, 0, ICTXT, $ MAX( 1, MP ) + IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for ScaLAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD+1 IPTAU = IPA + DESCA( LLD_ ) * NQ + IPOSTPAD + IPREPAD * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN * LTAU = MNQ IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QR * factorization * LWORK = DESCA( NB_ ) * ( MP + NQ + DESCA( NB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZLAFCHK, PZGEQRRV and * PZLANGE * WORKSIZ = LWORK + MP*DESCA( NB_ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN * LTAU = NQ IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QL * factorization * LWORK = DESCA( NB_ ) * ( MP + NQ + DESCA( NB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZLAFCHK, PZGEQLRV and * PZLANGE * WORKSIZ = LWORK + MP*DESCA( NB_ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN * LTAU = MNP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the LQ * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZLAFCHK, PZGELQRV and * PZLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN * LTAU = MP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the QR * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZLAFCHK, PZGERQRV and * PZLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * LTAU = MNQ IPPIV = IPTAU + LTAU + IPOSTPAD + IPREPAD LIPIV = ICEIL( INTGSZ*NQ, ZPLXSZ ) IPW = IPPIV + LIPIV + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the * factorization i.e from IPW on. * LWORK = MAX( 3, MP + MAX( 1, NQ ) ) WORKFCT = LWORK + IPOSTPAD LRWORK = MAX( 1, 2 * NQ ) WORKRFCT = ICEIL( LRWORK*DBLESZ, ZPLXSZ ) + $ IPOSTPAD IPRW = IPW + WORKFCT + IPREPAD WORKSIZ = WORKFCT + IPREPAD + WORKRFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZLAFCHK, PZGEQRRV, * PZLANGE. * WORKSIZ = MAX( WORKSIZ - IPOSTPAD, $ DESCA( NB_ )*( 2*MP + NQ + DESCA( NB_ ) ) ) + $ IPOSTPAD END IF * ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN * LTAU = MP IPW = IPTAU + LTAU + IPOSTPAD + IPREPAD * * Figure the amount of workspace required by the TZ * factorization * LWORK = DESCA( MB_ ) * ( MP + NQ + DESCA( MB_ ) ) WORKFCT = LWORK + IPOSTPAD WORKSIZ = WORKFCT * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZLAFCHK, PZTZRZRV and * PZLANGE * WORKSIZ = LWORK + $ MAX( MP*DESCA( NB_ ), NQ*DESCA( MB_ ) $ ) + IPOSTPAD * END IF * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) $ FACT // ' factorization', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, $ 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PZMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need the Infinity of A for checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PZFILLPAD( ICTXT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PZFILLPAD( ICTXT, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANGE( 'I', M, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PZFILLPAD( ICTXT, WORKRFCT-IPOSTPAD, 1, $ MEM( IPRW-IPREPAD ), $ WORKRFCT-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PZFILLPAD( ICTXT, WORKFCT-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKFCT-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Perform QR factorizations * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN CALL SLTIMER( 1 ) CALL PZGEQRF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN CALL SLTIMER( 1 ) CALL PZGEQLF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN CALL SLTIMER( 1 ) CALL PZGELQF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN CALL SLTIMER( 1 ) CALL PZGERQF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL SLTIMER( 1 ) CALL PZGEQPF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ), MEM( IPTAU ), $ MEM( IPW ), LWORK, MEM( IPRW ), $ LRWORK, INFO ) CALL SLTIMER( 1 ) ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN CALL SLTIMER( 1 ) IF( N.GE.M ) $ CALL PZTZRZF( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ), LWORK, $ INFO ) CALL SLTIMER( 1 ) END IF * IF( CHECK ) THEN * * Check for memory overwrite in factorization * CALL PZCHEKPAD( ICTXT, ROUT, MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, ROUT, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) IF( LSAMEN( 2, FACT, 'QP' ) ) THEN CALL PZCHEKPAD( ICTXT, ROUT, LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, ROUT, WORKRFCT-IPOSTPAD, $ 1, MEM( IPRW-IPREPAD ), $ WORKRFCT-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF CALL PZCHEKPAD( ICTXT, ROUT, WORKFCT-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKFCT-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * IF( LSAMEN( 2, FACT, 'QR' ) ) THEN * * Compute residual = ||A-Q*R|| / (||A||*N*eps) * CALL PZGEQRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PZLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'QL' ) ) THEN * * Compute residual = ||A-Q*L|| / (||A||*N*eps) * CALL PZGEQLRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PZLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'LQ' ) ) THEN * * Compute residual = ||A-L*Q|| / (||A||*N*eps) * CALL PZGELQRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PZLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'RQ' ) ) THEN * * Compute residual = ||A-R*Q|| / (||A||*N*eps) * CALL PZGERQRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) CALL PZLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * * Compute residual = ||AP-Q*R|| / (||A||*N*eps) * CALL PZGEQRRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) ELSE IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN * * Compute residual = ||A-T*Z|| / (||A||*N*eps) * IF( N.GE.M ) THEN CALL PZTZRZRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPTAU ), MEM( IPW ) ) END IF CALL PZLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) END IF * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, ROUTCHK, MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, ROUTCHK, LTAU, 1, $ MEM( IPTAU-IPREPAD ), LTAU, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, ROUTCHK, WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * IF( LSAMEN( 2, FACT, 'QP' ) ) THEN * CALL PZQPPIV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPPIV ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZQPPIV', MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZQPPIV', LIPIV, 1, $ MEM( IPPIV-IPREPAD ), LIPIV, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PZLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, $ 1, DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZLAFCHK', MP, NQ, $ MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLAFCHK', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) END IF * * Test residual and detect NaN result * IF( LSAMEN( 2, FACT, 'TZ' ) .AND. N.LT.M ) THEN KSKIP = KSKIP + 1 PASSED = 'BYPASS' ELSE IF( FRESID.LE.THRESH .AND. $ (FRESID-FRESID).EQ.0.0D+0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * MINMN = MIN( M, N ) MAXMN = MAX( M, N ) * IF( LSAMEN( 2, FACT, 'TZ' ) ) THEN IF( M.GE.N ) THEN NOPS = 0.0D+0 ELSE * * 9 ( M^2 N - M^3 ) + 13 M N - M^2 for * complete unitary factorization (M <= N). * NOPS = 9.0D+0 * ( $ DBLE( N )*( DBLE( M )**2 ) - $ DBLE( M )**3 ) + $ 13.0D+0*DBLE( N )*DBLE( M ) - $ DBLE( M )**2 END IF * ELSE * * 8 M N^2 - 8/3 N^2 + 6 M N + 8 N^2 for QR type * factorization when M >= N. * NOPS = 8.0D+0 * ( DBLE( MINMN )**2 ) * $ ( DBLE( MAXMN )-DBLE( MINMN ) / 3.0D+0 ) + $ ( 6.0D+0 * DBLE( MAXMN ) + $ 8.0D+0 * DBLE( MINMN ) ) * $ DBLE( MINMN ) END IF * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, MB, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, $ PASSED, FRESID * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, MB, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, $ PASSED, FRESID * END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * 40 CONTINUE * * Print out ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N MB NB P Q Fact Time ', $ ' MFLOPS CHECK Residual' ) 9994 FORMAT( '---- ------ ------ --- --- ----- ----- --------- ', $ '----------- ------ --------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I3, 1X, I3, 1X, I5, 1X, I5, 1X, $ F9.2, 1X, F11.2, 1X, A6, 2X, G8.1 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( A ) * STOP * * End of PZQRDRIVER * END * SUBROUTINE PZQPPIV( M, N, A, IA, JA, DESCA, IPIV ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZQPPIV applies to sub( A ) = A(IA:IA+M-1,JA:JA+N-1) the pivots * returned by PZGEQPF in reverse order for checking purposes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be permuted. On exit, the local pieces * of the distributed permuted submatrix sub( A ) * Inv( P ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension LOCc(JA+N-1). * On exit, if IPIV(I) = K, the local i-th column of sub( A )*P * was the global K-th column of sub( A ). IPIV is tied to the * distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IACOL, ICOFFA, ICTXT, IITMP, IPVT, IPCOL, $ IPROW, ITMP, J, JJ, JJA, KK, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, IGERV2D, $ IGESD2D, IGAMN2D, INFOG1L, PZSWAP * .. * .. External Functions .. INTEGER INDXL2G, NUMROC EXTERNAL INDXL2G, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG1L( JA, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), JJA, $ IACOL ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * DO 20 J = JA, JA+N-2 * IPVT = JA+N-1 ITMP = JA+N * * Find first the local minimum candidate for pivoting * CALL INFOG1L( J, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), $ JJ, IACOL ) DO 10 KK = JJ, JJA+NQ-1 IF( IPIV( KK ).LT.IPVT )THEN IITMP = KK IPVT = IPIV( KK ) END IF 10 CONTINUE * * Find the global minimum pivot * CALL IGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, IPVT, 1, IPROW, $ IPCOL, 1, -1, MYCOL ) * * Broadcast the corresponding index to the other process columns * IF( MYCOL.EQ.IPCOL ) THEN ITMP = INDXL2G( IITMP, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL IGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ITMP, 1 ) IF( IPCOL.NE.IACOL ) THEN CALL IGERV2D( ICTXT, 1, 1, IPIV( IITMP ), 1, MYROW, $ IACOL ) ELSE IF( MYCOL.EQ.IACOL ) $ IPIV( IITMP ) = IPIV( JJ ) END IF ELSE CALL IGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ITMP, 1, MYROW, $ IPCOL ) IF( MYCOL.EQ.IACOL .AND. IPCOL.NE.IACOL ) $ CALL IGESD2D( ICTXT, 1, 1, IPIV( JJ ), 1, MYROW, IPCOL ) END IF * * Swap the columns of A * CALL PZSWAP( M, A, IA, ITMP, DESCA, 1, A, IA, J, DESCA, 1 ) * 20 CONTINUE * * End of PZQPPIV * END scalapack-2.0.2/TESTING/LIN/pzqrinfo.f000644 000766 000024 00000042542 10363532303 017474 0ustar00juliestaff000000 000000 SUBROUTINE PZQRINFO( SUMMRY, NOUT, NFACT, FACTOR, LDFACT, NMAT, $ MVAL, LDMVAL, NVAL, LDNVAL, NNB, MBVAL, $ LDMBVAL, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDFACT, LDMBVAL, LDMVAL, LDNBVAL, LDNVAL, $ LDPVAL, LDQVAL, NFACT, NGRIDS, NMAT, NNB, $ NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. CHARACTER*2 FACTOR( LDFACT ) CHARACTER*(*) SUMMRY INTEGER MBVAL( LDMBVAL ), MVAL( LDMVAL ), $ NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZQRINFO gets needed startup information for the QR factoriza- * tion routines and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NFACT (global output) INTEGER * The number of different factorization types to be tested. * * FACTOR (global output) CHARACTER*2 array of dimension of LDFACT, * The factorization types to be tested: * if FACTOR(i) = 'QR' then QR factorization, * if FACTOR(i) = 'QL' then QL factorization, * if FACTOR(i) = 'LQ' then LQ factorization, * if FACTOR(i) = 'RQ' then RQ factorization, * if FACTOR(i) = 'QP' then QR factorization with column * pivoting. * if FACTOR(i) = 'TZ' then complete unitary factorization. * * LDFACT (global input) INTEGER * The maximum number of different factorization types to be * tested. LDFACT >= NFACT. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * MVAL (global output) INTEGER array of dimension (LDNVAL), the * values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M, LDNVAL > = NMAT. * * NVAL (global output) INTEGER array of dimension (LDNVAL), the * values of N (number of columns in matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for MB and * NB. * * MBVAL (global output) INTEGER array of dimension (LDMBVAL), the * values of MB (row blocksize) to run the code with. * * LDMBVAL (global input) INTEGER * The maximum number of different values that can be used for * MB, LDMBVAL >= NNB. * * NBVAL (global output) INTEGER array of dimension (LDNBVAL), the * values of NB (column blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array of dimension (LDPVAL), the * values of P (number of process rows) to run the code with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array of dimension (LDQVAL), the * values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH, perform * factor check only if solve check fails * * WORK (local workspace) INTEGER array of dimension >= * MAX( 4, LDFACT+LDMVAL+LDNVAL+LDMBVAL+LDNBVAL+LDPVAL+LDQVAL ) * used to pack all input arrays in order to send info in one * message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT, K DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAMEN DOUBLE PRECISION PDLAMCH EXTERNAL LSAMEN, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='QR.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get the matrix types to be tested * READ( NIN, FMT = * ) NFACT IF( NFACT.LT.1 .OR. NFACT.GT.LDFACT ) THEN WRITE( NOUT, FMT = 9994 ) 'nb of factorization', LDFACT GO TO 40 END IF READ( NIN, FMT = * ) ( FACTOR( I ), I = 1, NFACT ) * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GO TO 40 ELSE IF( NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'M', LDMVAL GO TO 40 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDMBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'MB', LDMBVAL GO TO 40 ELSE IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GO TO 40 END IF READ( NIN, FMT = * ) ( MBVAL( I ), I = 1, NNB ) READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GO TO 40 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GO TO 40 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I ) * QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS WORK( 4 ) = NFACT CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 DO 20 K = 1, NFACT IF( LSAMEN( 2, FACTOR( K ), 'QR' ) ) THEN WORK( I ) = 1 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'QL' ) ) THEN WORK( I ) = 2 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'LQ' ) ) THEN WORK( I ) = 3 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'RQ' ) ) THEN WORK( I ) = 4 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'QP' ) ) THEN WORK( I ) = 5 I = I + 1 ELSE IF( LSAMEN( 2, FACTOR( K ), 'TZ' ) ) THEN WORK( I ) = 6 I = I + 1 END IF 20 CONTINUE * CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, MBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK QR factorizations routines.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision QR factorizations '// $ 'routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' || A - QR || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - QL || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - LQ || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - RQ || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - QRP || / (|| A || * eps * N) and/or' WRITE( NOUT, FMT = 9999 ) $ ' || A - TZ || / (|| A || * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the '// $ 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'MB : The row blocksize of the blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The column blocksize of the blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than'// $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 ) $ 'Fact Time: Time in seconds to factor the'// $ ' matrix.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Execution rate of the '// $ 'factorization.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 ) $ 'M ', ( MVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'MB ', ( MBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( MBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) NFACT = WORK( 4 ) * I = NFACT + 2*NMAT + 2*NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * DO 30 K = 1, NFACT IF( WORK( K ).EQ.1 ) THEN FACTOR( K ) = 'QR' ELSE IF( WORK( K ).EQ.2 ) THEN FACTOR( K ) = 'QL' ELSE IF( WORK( K ).EQ.3 ) THEN FACTOR( K ) = 'LQ' ELSE IF( WORK( K ).EQ.4 ) THEN FACTOR( K ) = 'RQ' ELSE IF( WORK( K ).EQ.5 ) THEN FACTOR( K ) = 'QP' ELSE IF( WORK( K ).EQ.6 ) THEN FACTOR( K ) = 'TZ' END IF 30 CONTINUE * I = NFACT + 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, MBVAL, 1 ) I = I + NNB CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 40 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZQRINFO * END scalapack-2.0.2/TESTING/LIN/pzqrt13.f000644 000766 000024 00000020615 10363532303 017145 0ustar00juliestaff000000 000000 SUBROUTINE PZQRT13( SCALE, M, N, A, IA, JA, DESCA, NORMA, ISEED, $ WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, ISEED, JA, M, N, SCALE DOUBLE PRECISION NORMA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION WORK( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZQRT13 generates a full-rank matrix that may be scaled to have * large or small norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SCALE (global input) INTEGER * SCALE = 1: normally scaled matrix * SCALE = 2: matrix scaled up * SCALE = 3: matrix scaled down * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * NORMA (global output) DOUBLE PRECISION * The one-norm of A. * * ISEED (global input/global output) INTEGER * Seed for random number generator. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK >= Nq0, where * * ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), and * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, IIA, INFO, $ IROFFA, J, JJA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ DOUBLE PRECISION ASUM, BIGNUM, SMLNUM COMPLEX*16 AJJ * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL NUMROC, PDLAMCH, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PDLABAD, $ PDZASUM, PZLASCL, PZMATGEN, $ PZELGET, PZELSET * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MOD, SIGN * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * generate the matrix * IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) MP = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * CALL PZMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), A, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED, IIA-1, MP, $ JJA-1, NQ, MYROW, MYCOL, NPROW, NPCOL ) * DO 10 J = JA, JA+N-1 I = IA + J - JA IF( I.LE.IA+M-1 ) THEN CALL PDZASUM( M, ASUM, A, IA, J, DESCA, 1 ) CALL PZELGET( 'Column', ' ', AJJ, A, I, J, DESCA ) AJJ = AJJ + DCMPLX( SIGN( ASUM, DBLE( AJJ ) ) ) CALL PZELSET( A, I, J, DESCA, AJJ ) END IF 10 CONTINUE * * scaled versions * IF( SCALE.NE.1 ) THEN * NORMA = PZLANGE( 'M', M, N, A, IA, JA, DESCA, WORK ) SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM CALL PDLABAD( ICTXT, SMLNUM, BIGNUM ) SMLNUM = SMLNUM / PDLAMCH( ICTXT, 'Epsilon' ) BIGNUM = ONE / SMLNUM * IF( SCALE.EQ.2 ) THEN * * matrix scaled up * CALL PZLASCL( 'General', NORMA, BIGNUM, M, N, A, IA, $ JA, DESCA, INFO ) * ELSE IF( SCALE.EQ.3 ) THEN * * matrix scaled down * CALL PZLASCL( 'General', NORMA, SMLNUM, M, N, A, IA, $ JA, DESCA, INFO ) * END IF * END IF * NORMA = PZLANGE( 'One-norm', M, N, A, IA, JA, DESCA, WORK ) * RETURN * * End of PZQRT13 * END scalapack-2.0.2/TESTING/LIN/pzqrt14.f000644 000766 000024 00000034021 10363532303 017142 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PZQRT14( TRANS, M, N, NRHS, A, IA, JA, $ DESCA, X, IX, JX, DESCX, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IX, JA, JX, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) COMPLEX*16 A( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PZQRT14 checks whether sub( X ) is in the row space of sub( A ) or * sub( A )', where sub( A ) denotes A( IA:IA+M-1, JA:JA+N-1 ) and * sub( X ) denotes X( IX:IX+N-1, JX:JX+NRHS-1 ) if TRANS = 'N', and * X( IX:IX+N-1, JX:JX+NRHS-1 ) otherwise. It does so by scaling both * sub( X ) and sub( A ) such that their norms are in the range * [sqrt(eps), 1/sqrt(eps)], then computing an LQ factorization of * [sub( A )',sub( X )]' (if TRANS = 'N') or a QR factorization of * [sub( A ),sub( X )] otherwise, and returning the norm of the trailing * triangle, scaled by MAX(M,N,NRHS)*eps. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * = 'N': No transpose, check for sub( X ) in the row space of * sub( A ), * = 'C': Conjugate transpose, check for sub( X ) in row space * of sub( A )'. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( X ). NRHS >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)). This array * contains the local pieces of the M-by-N distributed matrix * sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) COMPLEX*16 pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). * On entry, this array contains the local pieces of the * N-by-NRHS distributed submatrix sub( X ) if TRANS = 'N', * and the M-by-NRHS distributed submatrix sub( X ) otherwise. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * WORK (local workspace) COMPLEX*16 array dimension (LWORK) * If TRANS='N', LWORK >= MNRHSP * NQ + LTAU + LWF and * LWORK >= MP * NNRHSQ + LTAU + LWF otherwise, where * * IF TRANS='N', (LQ fact) * MNRHSP = NUMROC( M+NRHS+IROFFA, MB_A, MYROW, IAROW, * NPROW ) * LTAU = NUMROC( IA+MIN( M+NRHS, N )-1, MB_A, MYROW, * RSRC_A, NPROW ) * LWF = MB_A * ( MB_A + MNRHSP + NQ0 ) * ELSE (QR fact) * NNRHSQ = NUMROC( N+NRHS+ICOFFA, NB_A, MYCOL, IACOL, * NPCOL ) * LTAU = NUMROC( JA+MIN( M, N+NRHS )-1, NB_A, MYCOL, * CSRC_A, NPCOL ) * LWF = NB_A * ( NB_A + MP0 + NNRHSQ ) * END IF * * and, * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MP0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NQ0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL TPSD INTEGER IACOL, IAROW, ICOFFA, ICTXT, IDUM, IIA, INFO, $ IPTAU, IPW, IPWA, IROFFA, IWA, IWX, J, JJA, $ JWA, JWX, LDW, LWORK, MPWA, MPW, MQW, MYCOL, $ MYROW, NPCOL, NPROW, NPW, NQWA, NQW DOUBLE PRECISION ANRM, ERR, XNRM COMPLEX*16 AMAX * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) DOUBLE PRECISION RWORK( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL LSAME, NUMROC, PDLAMCH, PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, DGAMX2D, INFOG2L, $ PXERBLA, PZMAX1, PZCOPY, PZGELQF, $ PZGEQRF, PZLACGV, PZLACPY, PZLASCL * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * PZQRT14 = ZERO * IPWA = 1 IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IWA = IROFFA + 1 JWA = ICOFFA + 1 CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) MPWA = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQWA = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * INFO = 0 IF( LSAME( TRANS, 'N' ) ) THEN IF( N.LE.0 .OR. NRHS.LE.0 ) $ RETURN TPSD = .FALSE. MPW = NUMROC( M+NRHS+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) NQW = NQWA * * Assign descriptor DESCW for workspace WORK and pointers to * matrices sub( A ) and sub( X ) in workspace * IWX = IWA + M JWX = JWA LDW = MAX( 1, MPW ) CALL DESCSET( DESCW, M+NRHS+IROFFA, N+ICOFFA, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, LDW ) * ELSE IF( LSAME( TRANS, 'C' ) ) THEN IF( M.LE.0 .OR. NRHS.LE.0 ) $ RETURN TPSD = .TRUE. MPW = MPWA NQW = NUMROC( N+NRHS+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) * * Assign descriptor DESCW for workspace WORK and pointers to * matrices sub( A ) and sub( X ) in workspace * IWX = IWA JWX = JWA + N LDW = MAX( 1, MPW ) CALL DESCSET( DESCW, M+IROFFA, N+NRHS+ICOFFA, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, LDW ) ELSE CALL PXERBLA( ICTXT, 'PZQRT14', -1 ) RETURN END IF * * Copy and scale sub( A ) * IPTAU = IPWA + MPW*NQW CALL PZLACPY( 'All', M, N, A, IA, JA, DESCA, WORK( IPWA ), IWA, $ JWA, DESCW ) RWORK( 1 ) = ZERO ANRM = PZLANGE( 'M', M, N, WORK( IPWA ), IWA, JWA, DESCW, RWORK ) IF( ANRM.NE.ZERO ) $ CALL PZLASCL( 'G', ANRM, ONE, M, N, WORK( IPWA ), IWA, $ JWA, DESCW, INFO ) * * Copy sub( X ) or sub( X )' into the right place and scale it * IF( TPSD ) THEN * * Copy sub( X ) into columns jwa+n:jwa+n+nrhs-1 of work * DO 10 J = 1, NRHS CALL PZCOPY( M, X, IX, JX+J-1, DESCX, 1, WORK( IPWA ), IWX, $ JWX+J-1, DESCW, 1 ) 10 CONTINUE XNRM = PZLANGE( 'M', M, NRHS, WORK( IPWA ), IWX, JWX, DESCW, $ RWORK ) IF( XNRM.NE.ZERO ) $ CALL PZLASCL( 'G', XNRM, ONE, M, NRHS, WORK( IPWA ), IWX, $ JWX, DESCW, INFO ) * * Compute QR factorization of work(iwa:iwa+m-1,jwa:jwa+n+nrhs-1) * MQW = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPW = IPTAU + MIN( MQW, NQW ) LWORK = DESCW( NB_ ) * ( MPW + NQW + DESCW( NB_ ) ) CALL PZGEQRF( M, N+NRHS, WORK( IPWA ), IWA, JWA, DESCW, $ WORK( IPTAU ), WORK( IPW ), LWORK, INFO ) * * Compute largest entry in upper triangle of * work(iwa+n:iwa+m-1,jwa+n:jwa+n+nrhs-1) * ERR = ZERO IF( N.LT.M ) THEN DO 20 J = JWX, JWA+N+NRHS-1 CALL PZMAX1( MIN(M-N,J-JWX+1), AMAX, IDUM, WORK( IPWA ), $ IWA+N, J, DESCW, 1 ) ERR = MAX( ERR, ABS( AMAX ) ) 20 CONTINUE END IF CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, IDUM1, IDUM2, $ -1, -1, 0 ) * ELSE * * Copy sub( X )' into rows iwa+m:iwa+m+nrhs-1 of work * DO 30 J = 1, NRHS CALL PZCOPY( N, X, IX, JX+J-1, DESCX, 1, WORK( IPWA ), $ IWX+J-1, JWX, DESCW, DESCW( M_ ) ) CALL PZLACGV( N, WORK( IPWA ), IWX+J-1, JWX, DESCW, $ DESCW( M_ ) ) 30 CONTINUE * XNRM = PZLANGE( 'M', NRHS, N, WORK( IPWA ), IWX, JWX, DESCW, $ RWORK ) IF( XNRM.NE.ZERO ) $ CALL PZLASCL( 'G', XNRM, ONE, NRHS, N, WORK( IPWA ), IWX, $ JWX, DESCW, INFO ) * * Compute LQ factorization of work(iwa:iwa+m+nrhs-1,jwa:jwa+n-1) * NPW = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IPW = IPTAU + MIN( MPW, NPW ) LWORK = DESCW( MB_ ) * ( MPW + NQW + DESCW( MB_ ) ) CALL PZGELQF( M+NRHS, N, WORK( IPWA ), IWA, JWA, DESCW, $ WORK( IPTAU ), WORK( IPW ), LWORK, INFO ) * * Compute largest entry in lower triangle in * work(iwa+m:iwa+m+nrhs-1,jwa+m:jwa+n-1) * ERR = ZERO DO 40 J = JWA+M, MIN( JWA+N-1, JWA+M+NRHS-1 ) CALL PZMAX1( JWA+M+NRHS-J, AMAX, IDUM, WORK( IPWA ), $ IWX+J-JWA-M, J, DESCW, 1 ) ERR = MAX( ERR, ABS( AMAX ) ) 40 CONTINUE CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, IDUM1, IDUM2, $ -1, -1, 0 ) * END IF * PZQRT14 = ERR / ( DBLE( MAX( M, N, NRHS ) ) * $ PDLAMCH( ICTXT, 'Epsilon' ) ) * RETURN * * End of PZQRT14 * END scalapack-2.0.2/TESTING/LIN/pzqrt16.f000644 000766 000024 00000025430 11622500733 017151 0ustar00juliestaff000000 000000 SUBROUTINE PZQRT16( TRANS, M, N, NRHS, A, IA, JA, DESCA, X, IX, $ JX, DESCX, B, IB, JB, DESCB, RWORK, RESID ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, IX, JA, JB, JX, M, N, NRHS DOUBLE PRECISION RESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) DOUBLE PRECISION RWORK( * ) COMPLEX*16 A( * ), B( * ), X( * ) * .. * * Purpose * ======= * * PZQRT16 computes the residual for a solution of a system of linear * equations sub( A )*sub( X ) = B or sub( A' )*sub( X ) = B: * RESID = norm(B - sub( A )*sub( X ) ) / * ( max(m,n) * norm(sub( A ) ) * norm(sub( X ) ) * EPS ), * where EPS is the machine epsilon, sub( A ) denotes * A(IA:IA+N-1,JA,JA+N-1), and sub( X ) denotes * X(IX:IX+N-1, JX:JX+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations: * = 'N': sub( A )*sub( X ) = sub( B ) * = 'T': sub( A' )*sub( X )= sub( B ), where A' is the * transpose of sub( A ). * = 'C': sub( A' )*sub( X )= B, where A' is the conjugate * transpose of sub( A ). * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) COMPLEX*16 pointer into the local * memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * The original M x N matrix A. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) COMPLEX*16 pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). This * array contains the local pieces of the computed solution * distributed vectors for the system of linear equations. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * B (local input/local output) COMPLEX*16 pointer into * the local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the distributes right hand side vectors for * the system of linear equations. On exit, sub( B ) is over- * written with the difference sub( B ) - sub( A )*sub( X ) or * sub( B ) - sub( A )'*sub( X ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * LWORK >= Nq0 if TRANS = 'N', and LRWORK >= Mp0 otherwise. * * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * RESID (global output) DOUBLE PRECISION * The maximum over the number of right hand sides of * norm( sub( B )- sub( A )*sub( X ) ) / * ( max(m,n) * norm( sub( A ) ) * norm( sub( X ) ) * EPS ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CONE PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER ICTXT, IDUMM, J, MYCOL, MYROW, N1, N2, NPCOL, $ NPROW DOUBLE PRECISION ANORM, BNORM, EPS, XNORM * .. * .. Local Arrays .. DOUBLE PRECISION TEMP( 2 ) * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL LSAME, PDLAMCH, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PDZASUM, $ PZGEMM * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick exit if M = 0 or N = 0 or NRHS = 0 * IF( M.LE.0 .OR. N.LE.0 .OR. NRHS.EQ.0 ) THEN RESID = ZERO RETURN END IF * IF( LSAME( TRANS, 'T' ) .OR. LSAME( TRANS, 'C' ) ) THEN ANORM = PZLANGE( 'I', M, N, A, IA, JA, DESCA, RWORK ) N1 = N N2 = M ELSE ANORM = PZLANGE( '1', M, N, A, IA, JA, DESCA, RWORK ) N1 = M N2 = N END IF * EPS = PDLAMCH( ICTXT, 'Epsilon' ) * * Compute B - sub( A )*sub( X ) (or B - sub( A' )*sub( X ) ) and * store in B. * CALL PZGEMM( TRANS, 'No transpose', N1, NRHS, N2, -CONE, A, IA, $ JA, DESCA, X, IX, JX, DESCX, CONE, B, IB, JB, DESCB ) * * Compute the maximum over the number of right hand sides of * norm( sub( B ) - sub( A )*sub( X ) ) / * ( max(m,n) * norm( sub( A ) ) * norm( sub( X ) ) * EPS ). * RESID = ZERO DO 10 J = 1, NRHS * CALL PDZASUM( N1, BNORM, B, IB, JB+J-1, DESCB, 1 ) CALL PDZASUM( N2, XNORM, X, IX, JX+J-1, DESCX, 1 ) * * Only the process columns owning the vector operands will have * the correct result, the other will have zero. * TEMP( 1 ) = BNORM TEMP( 2 ) = XNORM IDUMM = 0 CALL DGAMX2D( ICTXT, 'All', ' ', 2, 1, TEMP, 2, IDUMM, IDUMM, $ -1, -1, IDUMM ) BNORM = TEMP( 1 ) XNORM = TEMP( 2 ) * * Every processes have ANORM, BNORM and XNORM now. * IF( ANORM.EQ.ZERO .AND. BNORM.EQ.ZERO ) THEN RESID = ZERO ELSE IF( ANORM.LE.ZERO .OR. XNORM.LE.ZERO ) THEN RESID = ONE / EPS ELSE RESID = MAX( RESID, ( ( BNORM / ANORM ) / XNORM ) / $ ( MAX( M, N )*EPS ) ) END IF * 10 CONTINUE * RETURN * * End of PZQRT16 * END scalapack-2.0.2/TESTING/LIN/pzqrt17.f000644 000766 000024 00000033051 10363532303 017147 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PZQRT17( TRANS, IRESID, M, N, NRHS, A, $ IA, JA, DESCA, X, IX, JX, $ DESCX, B, IB, JB, DESCB, WORK, $ RWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, IRESID, IX, JA, JB, JX, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) COMPLEX*16 A( * ), B( * ), WORK( * ), X( * ) DOUBLE PRECISION RWORK( * ) * .. * * Purpose * ======= * * PZQRT17 computes the ratio * * || R'*op( sub( A ) ) ||/(||sub( A )||*alpha*max(M,N,NRHS)*eps) * * where R = op( sub( A ) )*sub( X ) - B, op(A) is A or A', and * * alpha = ||B|| if IRESID = 1 (zero-residual problem) * alpha = ||R|| if IRESID = 2 (otherwise). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies whether or not the transpose of sub( A ) is used. * = 'N': No transpose, op( sub( A ) ) = sub( A ). * = 'C': Conjugate transpose, op( sub( A ) ) = sub( A' ). * * IRESID (global input) INTEGER * IRESID = 1 indicates zero-residual problem. * IRESID = 2 indicates non-zero residual. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * If TRANS = 'N', the number of rows of the distributed * submatrix sub( B ). * If TRANS = 'C', the number of rows of the distributed * submatrix sub( X ). * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * If TRANS = 'N', the number of rows of the distributed * submatrix sub( X ). Otherwise N is the number of rows of * the distributed submatrix sub( B ). * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrices sub( X ) and sub( B ). * NRHS >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed M-by-N * submatrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input) COMPLEX*16 pointer into the local * memory to an array of dimension (LLD_X,LOCc(JX+NRHS-1)). * If TRANS = 'N', this array contains the local pieces of the * N-by-NRHS distributed submatrix sub( X ). Otherwise, this * array contains the local pieces of the M-by-NRHS distributed * submatrix sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * B (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_B,LOCc(JB+NRHS-1)). * If TRANS='N', this array contains the local pieces of the * distributed M-by-NRHS submatrix operand sub( B ). Otherwise, * this array contains the local pieces of the distributed * N-by-NRHS submatrix operand sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * If TRANS = 'N', LWORK >= Mp0 * NRHSq0 + NRHSp0 * Nq0 * otherwise LWORK >= Np0 * NRHSq0 + NRHSp0 * Mq0 * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * LRWORK >= Nq0, if TRANS = 'N', and LRWORK >= Mp0 otherwise. * * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Np0 = NUMROC( N+ICOFFA, NB_A, MYROW, IAROW, NPROW ), * Mq0 = NUMROC( M+IROFFA, NB_A, MYCOL, IACOL, NPCOL ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * NRHSp0 = NUMROC( NRHS+ICOFFB, NB_B, MYROW, IBROW, NPROW ), * NRHSq0 = NUMROC( NRHS+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. INTEGER IACOL, IBCOL, IBROW, ICOFFB, ICTXT, INFO, $ IOFFA, IROFFB, ISCL, IW, IW2, JW, JW2, MYCOL, $ NRHSQ, NRHSP, MYROW, NCOLS, NPCOL, NPROW, $ NROWS, NROWSP DOUBLE PRECISION ERR, NORMA, NORMB, NORMRS, NORMX, SMLNUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), DESCW2( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL INDXG2P, LSAME, NUMROC, PDLAMCH, PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, PZGEMM, PZLACPY, $ PZLASCL, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX * .. * .. Executable Statements .. * PZQRT17 = ZERO * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 IF( LSAME( TRANS, 'N' ) ) THEN NROWS = M NCOLS = N IOFFA = MOD( JA-1, DESCA( NB_ ) ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN NROWS = N NCOLS = M IOFFA = MOD( IA-1, DESCA( MB_ ) ) ELSE CALL PXERBLA( ICTXT, 'PZQRT17', -1 ) RETURN END IF * IF( M.LE.0 .OR. N.LE.0 .OR. NRHS.LE.0 ) $ RETURN * IROFFB = MOD( IA-1, DESCA( MB_ ) ) ICOFFB = MOD( JA-1, DESCA( NB_ ) ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * NRHSQ = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) NRHSP = NUMROC( NRHS+IROFFB, DESCB( NB_ ), MYROW, IBROW, NPROW ) NROWSP = NUMROC( NROWS+IROFFB, DESCA( MB_ ), MYROW, IBROW, NPROW ) * * Assign array descriptor DESCW for workspace WORK, where DESCW * holds a copy of the distributed submatrix sub( B ) * CALL DESCSET( DESCW, NROWS+IROFFB, NRHS+ICOFFB, DESCB( MB_ ), $ DESCB( NB_ ), IBROW, IBCOL, ICTXT, MAX( 1, $ NROWSP ) ) * * Assign array descriptor DESCW2 for workspace WORK, where DESCW2 * holds a copy of the distributed submatrix sub( X**T ) * CALL DESCSET( DESCW2, NRHS+ICOFFB, NCOLS+IOFFA, DESCX( NB_ ), $ DESCX( MB_ ), IBROW, IACOL, ICTXT, MAX( 1, $ NRHSP ) ) * NORMA = PZLANGE( 'One-norm', M, N, A, IA, JA, DESCA, RWORK ) SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) SMLNUM = SMLNUM / PDLAMCH( ICTXT, 'Precision' ) ISCL = 0 * * compute residual and scale it * IW = 1 + IROFFB JW = 1 + ICOFFB CALL PZLACPY( 'All', NROWS, NRHS, B, IB, JB, DESCB, WORK, IW, JW, $ DESCW ) CALL PZGEMM( TRANS, 'No transpose', NROWS, NRHS, NCOLS, $ DCMPLX( -ONE ), A, IA, JA, DESCA, X, IX, JX, DESCX, $ DCMPLX( ONE ), WORK, IW, JW, DESCW ) NORMRS = PZLANGE( 'Max', NROWS, NRHS, WORK, IW, JW, DESCW, $ RWORK ) IF( NORMRS.GT.SMLNUM ) THEN ISCL = 1 CALL PZLASCL( 'General', NORMRS, ONE, NROWS, NRHS, WORK, $ IW, JW, DESCW, INFO ) END IF * * compute R'*sub( A ) * IW2 = 1 + ICOFFB JW2 = 1 + IOFFA CALL PZGEMM( 'Conjugate transpose', TRANS, NRHS, NCOLS, NROWS, $ DCMPLX( ONE ), WORK, IW, JW, DESCW, A, IA, JA, DESCA, $ DCMPLX( ZERO ), WORK( NROWSP*NRHSQ+1 ), IW2, JW2, $ DESCW2 ) * * compute and properly scale error * ERR = PZLANGE( 'One-norm', NRHS, NCOLS, WORK( NROWSP*NRHSQ+1 ), $ IW2, JW2, DESCW2, RWORK ) IF( NORMA.NE.ZERO ) $ ERR = ERR / NORMA * IF( ISCL.EQ.1 ) $ ERR = ERR*NORMRS * IF( IRESID.EQ.1 ) THEN NORMB = PZLANGE( 'One-norm', NROWS, NRHS, B, IB, JB, DESCB, $ RWORK ) IF( NORMB.NE.ZERO ) $ ERR = ERR / NORMB ELSE NORMX = PZLANGE( 'One-norm', NCOLS, NRHS, X, IX, JX, DESCX, $ RWORK ) IF( NORMX.NE.ZERO ) $ ERR = ERR / NORMX END IF * PZQRT17 = ERR / ( PDLAMCH( ICTXT, 'Epsilon' ) * $ DBLE( MAX( M, N, NRHS ) ) ) * RETURN * * End of PZQRT17 * END scalapack-2.0.2/TESTING/LIN/pztzrzrv.f000644 000766 000024 00000024444 10363532303 017560 0ustar00juliestaff000000 000000 SUBROUTINE PZTZRZRV( M, N, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZTZRZRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from T, Z * computed by PZTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= M >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, sub( A ) contains the the factors T and Z computed * by PZTZRZF. On exit, the original matrix is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCr(M_A). * This array contains the scalar factors TAU of the elementary * reflectors computed by PZTZRZF. TAU is tied to the dis- * tributed matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK = MB_A * ( Mp0 + 2*Nq0 + MB_A ), where * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ) * NB_A, * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ) * MB_A, * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), * NPROW ), * IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), * NPCOL ), * and NUMROC, INDXG2P are ScaLAPACK tool functions. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICOFF, ICTXT, IIA, IN, $ IPT, IPV, IPW, JJA, JM1, JV, L, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PB_TOPGET, $ PB_TOPSET, PZLACPY, PZLARZB, PZLARZT, $ PZLASET * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.LT.M ) $ RETURN * L = N - M JM1 = JA + MIN( M+1, N ) - 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IPV = 1 IPT = IPV + NQ * DESCA( MB_ ) IPW = IPT + DESCA( MB_ ) * DESCA( MB_ ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * CALL DESCSET( DESCV, DESCA( MB_ ), N + ICOFF, DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * * Handle first block separately * IB = IN - IA + 1 JV = ICOFF + JM1 - JA + 1 * * Compute upper triangular matrix T * CALL PZLARZT( 'Backward', 'Rowwise', L, IB, A, IA, JM1, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'All', IB, L, A, IA, JM1, DESCA, WORK( IPV ), 1, $ JV, DESCV ) * * Save temporarily strict lower part of A(IA:IA+IB-1,JA:JA+IB-1) * CALL PZLACPY( 'Lower', IB-1, IB-1, A, IA+1, JA, DESCA, $ WORK( IPV ), 1, ICOFF+1, DESCV ) * * Zeroes the row panel of sub( A ) to get T(IA:IN,JA:JA+N-1) * CALL PZLASET( 'All', IB, L, ZERO, ZERO, A, IA, JM1, DESCA ) CALL PZLASET( 'Lower', IB-1, IB-1, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Apply block Householder transformation * CALL PZLARZB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', IN-IA+1, N, IB, L, WORK( IPV ), 1, JV, $ DESCV, WORK( IPT ), A, IA, JA, DESCA, WORK( IPW ) ) * * Restore strict lower part of A( IA:IA+IB-1, JA:JA+N-1 ) * CALL PZLACPY( 'Lower', IB-1, IB-1, WORK( IPV ), 1, ICOFF+1, DESCV, $ A, IA+1, JA, DESCA ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * * Loop over the remaining row blocks * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Compute upper triangular matrix T * CALL PZLARZT( 'Backward', 'Rowwise', L, IB, A, I, JM1, DESCA, $ TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace * CALL PZLACPY( 'All', IB, L, A, I, JM1, DESCA, WORK( IPV ), 1, $ JV, DESCV ) * * Save temporarily strict lower part of A(I:I+IB-1,J:J+IB-1 ) * CALL PZLACPY( 'Lower', IB-1, IB-1, A, I+1, JA+I-IA, DESCA, $ WORK( IPV ), 1, ICOFF+1+I-IA, DESCV ) * * Zeoes the row panel of sub( A ) to get T(IA:I-1,JA+I-IA:JA+N-1) * CALL PZLASET( 'All', IB, L, ZERO, ZERO, A, I, JM1, DESCA ) CALL PZLASET( 'Lower', IB-1, IB-1, ZERO, ZERO, A, I+1, JA+I-IA, $ DESCA ) * * Apply block Householder transformation * CALL PZLARZB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', I+IB-IA, N-I+IA, IB, L, WORK( IPV ), $ 1, JV, DESCV, WORK( IPT ), A, IA, JA+I-IA, DESCA, $ WORK( IPW ) ) * CALL PZLACPY( 'Lower', IB-1, IB-1, WORK( IPV ), 1, $ ICOFF+1+I-IA, DESCV, A, I+1, JA+I-IA, DESCA ) * DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + 1, NPROW ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PZTZRZRV * END scalapack-2.0.2/TESTING/EIG/CMakeLists.txt000644 000766 000024 00000015525 11656312637 020214 0ustar00juliestaff000000 000000 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/TESTING) set (smatgen psmatgen.f pmatgeninc.f) set (dmatgen pdmatgen.f pmatgeninc.f) set (cmatgen pcmatgen.f pmatgeninc.f) set (zmatgen pzmatgen.f pmatgeninc.f) add_executable(xsbrd psbrddriver.f psbrdinfo.f psgebdrv.f pslafchk.f ${smatgen}) add_executable(xdbrd pdbrddriver.f pdbrdinfo.f pdgebdrv.f pdlafchk.f ${dmatgen}) add_executable(xcbrd pcbrddriver.f pcbrdinfo.f pcgebdrv.f pclafchk.f ${cmatgen}) add_executable(xzbrd pzbrddriver.f pzbrdinfo.f pzgebdrv.f pzlafchk.f ${zmatgen}) add_executable(xshrd pshrddriver.f pshrdinfo.f psgehdrv.f pslafchk.f ${smatgen}) add_executable(xdhrd pdhrddriver.f pdhrdinfo.f pdgehdrv.f pdlafchk.f ${dmatgen}) add_executable(xchrd pchrddriver.f pchrdinfo.f pcgehdrv.f pclafchk.f ${cmatgen}) add_executable(xzhrd pzhrddriver.f pzhrdinfo.f pzgehdrv.f pzlafchk.f ${zmatgen}) add_executable(xstrd pstrddriver.f psttrdtester.f pslatran.f pstrdinfo.f pssytdrv.f pslafchk.f xpjlaenv.f ${smatgen}) add_executable(xdtrd pdtrddriver.f pdttrdtester.f pdlatran.f pdtrdinfo.f pdsytdrv.f pdlafchk.f xpjlaenv.f ${dmatgen}) add_executable(xctrd pctrddriver.f pcttrdtester.f pclatran.f pctrdinfo.f pchetdrv.f pclafchk.f xpjlaenv.f ${cmatgen}) add_executable(xztrd pztrddriver.f pzttrdtester.f pzlatran.f pztrdinfo.f pzhetdrv.f pzlafchk.f xpjlaenv.f ${zmatgen}) add_executable(xssvd pssvddriver.f pslagge.f pssvdchk.f pssvdcmp.f pssvdtst.f ${smatgen}) add_executable(xdsvd pdsvddriver.f pdlagge.f pdsvdchk.f pdsvdcmp.f pdsvdtst.f ${dmatgen}) add_executable(xssep psseptst.f pssepsubtst.f pssepchk.f pssepqtq.f pslagsy.f pslatms.f pslasizesep.f pslasizesyevx.f pssepdriver.f pssepreq.f pssepinfo.f pslasizesyev.f pssqpsubtst.f pslasizesqp.f pssdpsubtst.f ${smatgen}) add_executable(xdsep pdseptst.f pdsepsubtst.f pdsepchk.f pdsepqtq.f pdlagsy.f pdlatms.f pdlasizesep.f pdlasizesyevx.f pdsepdriver.f pdsepreq.f pdsepinfo.f pdlasizesyev.f pdsqpsubtst.f pdlasizesqp.f pdsdpsubtst.f ${dmatgen}) add_executable(xcsep pcseptst.f pcsepsubtst.f pcsepchk.f pcsepqtq.f pclagsy.f pclatms.f pclasizesep.f pclasizeheevx.f pcsepdriver.f pcsepreq.f pssepinfo.f pcsdpsubtst.f ${cmatgen}) add_executable(xzsep pzseptst.f pzsepsubtst.f pzsepchk.f pzsepqtq.f pzlagsy.f pzlatms.f pzlasizesep.f pzlasizeheevx.f pzsepdriver.f pzsepreq.f pdsepinfo.f pzsdpsubtst.f ${zmatgen}) add_executable(xsgsep psgseptst.f psgsepsubtst.f psgsepchk.f pslagsy.f pslatms.f pslasizesyevx.f pslasizegsep.f pslasizesep.f psgsepdriver.f psgsepreq.f pssepinfo.f ${smatgen}) add_executable(xdgsep pdgseptst.f pdgsepsubtst.f pdgsepchk.f pdlagsy.f pdlatms.f pdlasizesyevx.f pdlasizegsep.f pdlasizesep.f pdgsepdriver.f pdgsepreq.f pdsepinfo.f ${dmatgen}) add_executable(xcgsep pcgseptst.f pcgsepsubtst.f pcgsepchk.f pclagsy.f pclatms.f pclasizegsep.f pclasizeheevx.f pclasizesep.f pcgsepdriver.f pcgsepreq.f pssepinfo.f ${cmatgen}) add_executable(xzgsep pzgseptst.f pzgsepsubtst.f pzgsepchk.f pzlagsy.f pzlatms.f pzlasizegsep.f pzlasizeheevx.f pzlasizesep.f pzgsepdriver.f pzgsepreq.f pdsepinfo.f ${zmatgen}) add_executable(xsnep psnepdriver.f psnepinfo.f psnepfchk.f ${smatgen}) add_executable(xdnep pdnepdriver.f pdnepinfo.f pdnepfchk.f ${dmatgen}) add_executable(xcnep pcnepdriver.f pcnepinfo.f pcnepfchk.f ${cmatgen}) add_executable(xznep pznepdriver.f pznepinfo.f pznepfchk.f ${zmatgen}) add_executable(xcevc pcevcdriver.f pcevcinfo.f pcget22.f ${cmatgen}) add_executable(xzevc pzevcdriver.f pzevcinfo.f pzget22.f ${zmatgen}) add_executable(xssyevr pslasizesepr.f pslasizesyevr.f psseprdriver.f psseprreq.f psseprsubtst.f pssepchk.f pssepqtq.f pslatms.f psseprtst.f pssepinfo.f pslagsy.f pslasizesep.f ${smatgen}) add_executable(xdsyevr pdlasizesepr.f pdlasizesyevr.f pdseprdriver.f pdseprreq.f pdseprsubtst.f pdsepchk.f pdsepqtq.f pdlatms.f pdseprtst.f pdsepinfo.f pdlagsy.f pdlasizesep.f ${dmatgen}) add_executable(xcheevr pclasizesepr.f pclasizeheevr.f pcseprdriver.f pcseprreq.f pcseprsubtst.f pcsepchk.f pcsepqtq.f pclatms.f pcseprtst.f pssepinfo.f pclagsy.f pclasizesep.f ${cmatgen}) add_executable(xzheevr pzlasizesepr.f pzlasizeheevr.f pzseprdriver.f pzseprreq.f pzseprsubtst.f pzsepchk.f pzsepqtq.f pzlatms.f pzseprtst.f pdsepinfo.f pzlagsy.f pzlasizesep.f ${zmatgen}) add_executable(xshseqr pshseqrdriver.f psmatgen2.f ${cmatgen}) add_executable(xdhseqr pdhseqrdriver.f pdmatgen2.f ${cmatgen}) target_link_libraries(xsbrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdbrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcbrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzbrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xshrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdhrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xchrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzhrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xstrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdtrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xctrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xztrd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xssvd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdsvd scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xssep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdsep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcsep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzsep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsgsep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdgsep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcgsep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzgsep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xsnep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdnep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcnep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xznep scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcevc scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzevc scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xssyevr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdsyevr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xcheevr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xzheevr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xshseqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(xdhseqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) scalapack-2.0.2/TESTING/EIG/listing000644 000766 000024 00000000674 10363532303 017033 0ustar00juliestaff000000 000000 * orthogonal matrix Q, the Hessenberg matrix, and the array TAU returned INTEGER CONTEXTC, CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, DOUBLE PRECISION ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, * orthogonal matrix Q, the Hessenberg matrix, and the array TAU returned INTEGER CONTEXTC, CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, REAL ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, scalapack-2.0.2/TESTING/EIG/Makefile000644 000766 000024 00000023326 11654025546 017110 0ustar00juliestaff000000 000000 ############################################################################ # # Program: ScaLAPACK # # Module: Makefile # # Purpose: Eigenroutine Testing Makefile # # Creation date: February 20, 2000 # # Modified: November, 2011 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../SLmake.inc shrdexe = ../xshrd dhrdexe = ../xdhrd chrdexe = ../xchrd zhrdexe = ../xzhrd strdexe = ../xstrd dtrdexe = ../xdtrd ctrdexe = ../xctrd ztrdexe = ../xztrd sbrdexe = ../xsbrd dbrdexe = ../xdbrd cbrdexe = ../xcbrd zbrdexe = ../xzbrd ssepexe = ../xssep dsepexe = ../xdsep csepexe = ../xcsep zsepexe = ../xzsep sgsepexe = ../xsgsep dgsepexe = ../xdgsep cgsepexe = ../xcgsep zgsepexe = ../xzgsep ssvdexe = ../xssvd dsvdexe = ../xdsvd snepexe = ../xsnep dnepexe = ../xdnep cnepexe = ../xcnep znepexe = ../xznep cevcexe = ../xcevc zevcexe = ../xzevc ssyevrexe = ../xssyevr dsyevrexe = ../xdsyevr cheevrexe = ../xcheevr zheevrexe = ../xzheevr shseqrexe = ../xshseqr dhseqrexe = ../xdhseqr smatgen = psmatgen.o pmatgeninc.o dmatgen = pdmatgen.o pmatgeninc.o cmatgen = pcmatgen.o pmatgeninc.o zmatgen = pzmatgen.o pmatgeninc.o sbrd = psbrddriver.o psbrdinfo.o psgebdrv.o pslafchk.o $(smatgen) dbrd = pdbrddriver.o pdbrdinfo.o pdgebdrv.o pdlafchk.o $(dmatgen) cbrd = pcbrddriver.o pcbrdinfo.o pcgebdrv.o pclafchk.o $(cmatgen) zbrd = pzbrddriver.o pzbrdinfo.o pzgebdrv.o pzlafchk.o $(zmatgen) shrd = pshrddriver.o pshrdinfo.o psgehdrv.o pslafchk.o $(smatgen) dhrd = pdhrddriver.o pdhrdinfo.o pdgehdrv.o pdlafchk.o $(dmatgen) chrd = pchrddriver.o pchrdinfo.o pcgehdrv.o pclafchk.o $(cmatgen) zhrd = pzhrddriver.o pzhrdinfo.o pzgehdrv.o pzlafchk.o $(zmatgen) strd = pstrddriver.o psttrdtester.o pslatran.o pstrdinfo.o pssytdrv.o \ pslafchk.o xpjlaenv.o $(smatgen) dtrd = pdtrddriver.o pdttrdtester.o pdlatran.o pdtrdinfo.o pdsytdrv.o \ pdlafchk.o xpjlaenv.o $(dmatgen) ctrd = pctrddriver.o pcttrdtester.o pclatran.o pctrdinfo.o pchetdrv.o \ pclafchk.o xpjlaenv.o $(cmatgen) ztrd = pztrddriver.o pzttrdtester.o pzlatran.o pztrdinfo.o pzhetdrv.o \ pzlafchk.o xpjlaenv.o $(zmatgen) ssvd = pssvddriver.o pslagge.o pssvdchk.o pssvdcmp.o pssvdtst.o $(smatgen) dsvd = pdsvddriver.o pdlagge.o pdsvdchk.o pdsvdcmp.o pdsvdtst.o $(dmatgen) ssep = psseptst.o pssepsubtst.o pssepchk.o pssepqtq.o pslagsy.o \ pslatms.o pslasizesep.o pslasizesyevx.o pssepdriver.o \ pssepreq.o pssepinfo.o pslasizesyev.o \ pssqpsubtst.o pslasizesqp.o pssdpsubtst.o $(smatgen) dsep = pdseptst.o pdsepsubtst.o pdsepchk.o pdsepqtq.o pdlagsy.o \ pdlatms.o pdlasizesep.o pdlasizesyevx.o pdsepdriver.o \ pdsepreq.o pdsepinfo.o pdlasizesyev.o \ pdsqpsubtst.o pdlasizesqp.o pdsdpsubtst.o $(dmatgen) csep = pcseptst.o pcsepsubtst.o pcsepchk.o pcsepqtq.o pclagsy.o \ pclatms.o pclasizesep.o pclasizeheevx.o pcsepdriver.o \ pcsepreq.o pssepinfo.o pcsdpsubtst.o $(cmatgen) zsep = pzseptst.o pzsepsubtst.o pzsepchk.o pzsepqtq.o pzlagsy.o \ pzlatms.o pzlasizesep.o pzlasizeheevx.o pzsepdriver.o \ pzsepreq.o pdsepinfo.o pzsdpsubtst.o $(zmatgen) sgsep = psgseptst.o psgsepsubtst.o psgsepchk.o pslagsy.o \ pslatms.o pslasizesyevx.o pslasizegsep.o pslasizesep.o \ psgsepdriver.o psgsepreq.o pssepinfo.o $(smatgen) dgsep = pdgseptst.o pdgsepsubtst.o pdgsepchk.o pdlagsy.o \ pdlatms.o pdlasizesyevx.o pdlasizegsep.o pdlasizesep.o \ pdgsepdriver.o pdgsepreq.o pdsepinfo.o $(dmatgen) cgsep = pcgseptst.o pcgsepsubtst.o pcgsepchk.o pclagsy.o \ pclatms.o pclasizegsep.o pclasizeheevx.o pclasizesep.o \ pcgsepdriver.o pcgsepreq.o pssepinfo.o $(cmatgen) zgsep = pzgseptst.o pzgsepsubtst.o pzgsepchk.o pzlagsy.o \ pzlatms.o pzlasizegsep.o pzlasizeheevx.o pzlasizesep.o \ pzgsepdriver.o pzgsepreq.o pdsepinfo.o $(zmatgen) snep = psnepdriver.o psnepinfo.o psnepfchk.o $(smatgen) dnep = pdnepdriver.o pdnepinfo.o pdnepfchk.o $(dmatgen) cnep = pcnepdriver.o pcnepinfo.o pcnepfchk.o $(cmatgen) znep = pznepdriver.o pznepinfo.o pznepfchk.o $(zmatgen) cevc = pcevcdriver.o pcevcinfo.o pcget22.o $(cmatgen) zevc = pzevcdriver.o pzevcinfo.o pzget22.o $(zmatgen) ssyevr = pslasizesepr.o pslasizesyevr.o psseprdriver.o psseprreq.o psseprsubtst.o \ pssepchk.o pssepqtq.o pslatms.o psseprtst.o pssepinfo.o pslagsy.o pslasizesep.o $(smatgen) dsyevr = pdlasizesepr.o pdlasizesyevr.o pdseprdriver.o pdseprreq.o pdseprsubtst.o \ pdsepchk.o pdsepqtq.o pdlatms.o pdseprtst.o pdsepinfo.o pdlagsy.o pdlasizesep.o $(dmatgen) cheevr = pclasizesepr.o pclasizeheevr.o pcseprdriver.o pcseprreq.o pcseprsubtst.o \ pcsepchk.o pcsepqtq.o pclatms.o pcseprtst.o pssepinfo.o pclagsy.o pclasizesep.o $(cmatgen) zheevr = pzlasizesepr.o pzlasizeheevr.o pzseprdriver.o pzseprreq.o pzseprsubtst.o \ pzsepchk.o pzsepqtq.o pzlatms.o pzseprtst.o pdsepinfo.o pzlagsy.o pzlasizesep.o $(zmatgen) shseqr = pshseqrdriver.o psmatgen2.o $(smatgen) dhseqr = pdhseqrdriver.o pdmatgen2.o $(dmatgen) all : single double complex complex16 single: $(shrdexe) $(strdexe) $(sbrdexe) $(ssepexe) $(sgsepexe) $(snepexe) $(ssvdexe) $(ssyevrexe) $(shseqrexe) double: $(dhrdexe) $(dtrdexe) $(dbrdexe) $(dsepexe) $(dgsepexe) $(dnepexe) $(dsvdexe) $(dsyevrexe) $(dhseqrexe) complex: $(chrdexe) $(ctrdexe) $(cbrdexe) $(csepexe) $(cgsepexe) $(cnepexe) $(cevcexe) $(cheevrexe) complex16: $(zhrdexe) $(ztrdexe) $(zbrdexe) $(zsepexe) $(zgsepexe) $(znepexe) $(zevcexe) $(zheevrexe) $(sbrdexe) : ../../$(SCALAPACKLIB) $(sbrd) $(FCLOADER) $(FCLOADFLAGS) -o $(sbrdexe) $(sbrd) ../../$(SCALAPACKLIB) $(LIBS) $(dbrdexe) : ../../$(SCALAPACKLIB) $(dbrd) $(FCLOADER) $(FCLOADFLAGS) -o $(dbrdexe) $(dbrd) ../../$(SCALAPACKLIB) $(LIBS) $(cbrdexe) : ../../$(SCALAPACKLIB) $(cbrd) $(FCLOADER) $(FCLOADFLAGS) -o $(cbrdexe) $(cbrd) ../../$(SCALAPACKLIB) $(LIBS) $(zbrdexe) : ../../$(SCALAPACKLIB) $(zbrd) $(FCLOADER) $(FCLOADFLAGS) -o $(zbrdexe) $(zbrd) ../../$(SCALAPACKLIB) $(LIBS) $(shrdexe) : ../../$(SCALAPACKLIB) $(shrd) $(FCLOADER) $(FCLOADFLAGS) -o $(shrdexe) $(shrd) ../../$(SCALAPACKLIB) $(LIBS) $(dhrdexe) : ../../$(SCALAPACKLIB) $(dhrd) $(FCLOADER) $(FCLOADFLAGS) -o $(dhrdexe) $(dhrd) ../../$(SCALAPACKLIB) $(LIBS) $(chrdexe) : ../../$(SCALAPACKLIB) $(chrd) $(FCLOADER) $(FCLOADFLAGS) -o $(chrdexe) $(chrd) ../../$(SCALAPACKLIB) $(LIBS) $(zhrdexe) : ../../$(SCALAPACKLIB) $(zhrd) $(FCLOADER) $(FCLOADFLAGS) -o $(zhrdexe) $(zhrd) ../../$(SCALAPACKLIB) $(LIBS) $(strdexe) : ../../$(SCALAPACKLIB) $(strd) $(FCLOADER) $(FCLOADFLAGS) -o $(strdexe) $(strd) ../../$(SCALAPACKLIB) $(LIBS) $(dtrdexe) : ../../$(SCALAPACKLIB) $(dtrd) $(FCLOADER) $(FCLOADFLAGS) -o $(dtrdexe) $(dtrd) ../../$(SCALAPACKLIB) $(LIBS) $(ctrdexe) : ../../$(SCALAPACKLIB) $(ctrd) $(FCLOADER) $(FCLOADFLAGS) -o $(ctrdexe) $(ctrd) ../../$(SCALAPACKLIB) $(LIBS) $(ztrdexe) : ../../$(SCALAPACKLIB) $(ztrd) $(FCLOADER) $(FCLOADFLAGS) -o $(ztrdexe) $(ztrd) ../../$(SCALAPACKLIB) $(LIBS) $(ssvdexe) : ../../$(SCALAPACKLIB) $(ssvd) $(FCLOADER) $(FCLOADFLAGS) -o $(ssvdexe) $(ssvd) ../../$(SCALAPACKLIB) $(LIBS) $(dsvdexe) : ../../$(SCALAPACKLIB) $(dsvd) $(FCLOADER) $(FCLOADFLAGS) -o $(dsvdexe) $(dsvd) ../../$(SCALAPACKLIB) $(LIBS) $(ssepexe) : ../../$(SCALAPACKLIB) $(ssep) $(FCLOADER) $(FCLOADFLAGS) -o $(ssepexe) $(ssep) ../../$(SCALAPACKLIB) $(LIBS) $(dsepexe) : ../../$(SCALAPACKLIB) $(dsep) $(FCLOADER) $(FCLOADFLAGS) -o $(dsepexe) $(dsep) ../../$(SCALAPACKLIB) $(LIBS) $(csepexe) : ../../$(SCALAPACKLIB) $(csep) $(FCLOADER) $(FCLOADFLAGS) -o $(csepexe) $(csep) ../../$(SCALAPACKLIB) $(LIBS) $(zsepexe) : ../../$(SCALAPACKLIB) $(zsep) $(FCLOADER) $(FCLOADFLAGS) -o $(zsepexe) $(zsep) ../../$(SCALAPACKLIB) $(LIBS) $(sgsepexe) : ../../$(SCALAPACKLIB) $(sgsep) $(FCLOADER) $(FCLOADFLAGS) -o $(sgsepexe) $(sgsep) ../../$(SCALAPACKLIB) $(LIBS) $(dgsepexe) : ../../$(SCALAPACKLIB) $(dgsep) $(FCLOADER) $(FCLOADFLAGS) -o $(dgsepexe) $(dgsep) ../../$(SCALAPACKLIB) $(LIBS) $(cgsepexe) : ../../$(SCALAPACKLIB) $(cgsep) $(FCLOADER) $(FCLOADFLAGS) -o $(cgsepexe) $(cgsep) ../../$(SCALAPACKLIB) $(LIBS) $(zgsepexe) : ../../$(SCALAPACKLIB) $(zgsep) $(FCLOADER) $(FCLOADFLAGS) -o $(zgsepexe) $(zgsep) ../../$(SCALAPACKLIB) $(LIBS) $(snepexe) : ../../$(SCALAPACKLIB) $(snep) $(FCLOADER) $(FCLOADFLAGS) -o $(snepexe) $(snep) ../../$(SCALAPACKLIB) $(LIBS) $(dnepexe) : ../../$(SCALAPACKLIB) $(dnep) $(FCLOADER) $(FCLOADFLAGS) -o $(dnepexe) $(dnep) ../../$(SCALAPACKLIB) $(LIBS) $(cnepexe) : ../../$(SCALAPACKLIB) $(cnep) $(FCLOADER) $(FCLOADFLAGS) -o $(cnepexe) $(cnep) ../../$(SCALAPACKLIB) $(LIBS) $(znepexe) : ../../$(SCALAPACKLIB) $(znep) $(FCLOADER) $(FCLOADFLAGS) -o $(znepexe) $(znep) ../../$(SCALAPACKLIB) $(LIBS) $(cevcexe): ../../$(SCALAPACKLIB) $(cevc) $(FCLOADER) $(FCLOADFLAGS) -o $(cevcexe) $(cevc) ../../$(SCALAPACKLIB) $(LIBS) $(zevcexe): ../../$(SCALAPACKLIB) $(zevc) $(FCLOADER) $(FCLOADFLAGS) -o $(zevcexe) $(zevc) ../../$(SCALAPACKLIB) $(LIBS) $(ssyevrexe): ../../$(SCALAPACKLIB) $(ssyevr) $(FCLOADER) $(FCLOADFLAGS) -o $(ssyevrexe) $(ssyevr) ../../$(SCALAPACKLIB) $(LIBS) $(dsyevrexe): ../../$(SCALAPACKLIB) $(dsyevr) $(FCLOADER) $(FCLOADFLAGS) -o $(dsyevrexe) $(dsyevr) ../../$(SCALAPACKLIB) $(LIBS) $(cheevrexe): ../../$(SCALAPACKLIB) $(cheevr) $(FCLOADER) $(FCLOADFLAGS) -o $(cheevrexe) $(cheevr) ../../$(SCALAPACKLIB) $(LIBS) $(zheevrexe): ../../$(SCALAPACKLIB) $(zheevr) $(FCLOADER) $(FCLOADFLAGS) -o $(zheevrexe) $(zheevr) ../../$(SCALAPACKLIB) $(LIBS) $(shseqrexe): ../../$(SCALAPACKLIB) $(shseqr) $(FCLOADER) $(FCLOADFLAGS) -o $(shseqrexe) $(shseqr) ../../$(SCALAPACKLIB) $(LIBS) $(dhseqrexe): ../../$(SCALAPACKLIB) $(dhseqr) $(FCLOADER) $(FCLOADFLAGS) -o $(dhseqrexe) $(dhseqr) ../../$(SCALAPACKLIB) $(LIBS) clean: rm -f *.o .f.o: $(FC) -c $(FCFLAGS) $< scalapack-2.0.2/TESTING/EIG/pcbrddriver.f000644 000766 000024 00000050064 10363532303 020112 0ustar00juliestaff000000 000000 PROGRAM PCBRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * Purpose * ======= * * PCBRDDRIVER is the main test program for the COMPLEX * ScaLAPACK BRD (bidiagonal reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 13 lines: * 'ScaLAPACK BRD computation input file' * 'PVM machine' * 'BRD.out' output file name * 6 device out * 3 number of problems sizes * 16 20 18 values of M * 16 18 20 values of N * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 1.0 threshold * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, MEMSIZ, NTESTS, TOTMEM, REALSZ COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, TOTMEM = 2000000, REALSZ = 8, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPTP, IPTQ, IPW, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LWORK, M, MAXMN, $ MINMN, MNP, MNQ, MP, MYCOL, MYROW, N, NB, $ NDIAG, NGRIDS, NMAT, NNB, NOFFD, NOUT, NPCOL, $ NPROCS, NPROW, NQ, WORKBRD, WORKSIZ REAL ANORM, FRESID, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ MVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCCHEKPAD, $ PCBRDINFO, PCFILLPAD, PCLAFCHK, $ PCMATGEN, PCGEBDRV, PCGEBRD, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PCLANGE EXTERNAL ICEIL, NUMROC, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PCBRDINFO( OUTFILE, NOUT, NMAT, MVAL, NTESTS, NVAL, NTESTS, $ NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * * Go to bottom of loop if this case doesn't use my process * DO 20 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) MNP = NUMROC( MIN( M, N ), NB, MYROW, 0, NPROW ) MNQ = NUMROC( MIN( M, N ), NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP )+IMIDPAD, IERR( 1 ) ) * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IF( M.GE.N ) THEN NDIAG = MNQ NOFFD = MNP NDIAG = ICEIL( REALSZ*NDIAG, CPLXSZ ) NOFFD = ICEIL( REALSZ*NOFFD, CPLXSZ ) ELSE NDIAG = MNP NOFFD = NUMROC( MIN( M, N )-1, NB, MYCOL, 0, NPCOL ) NDIAG = ICEIL( REALSZ*NDIAG, CPLXSZ ) NOFFD = ICEIL( REALSZ*NOFFD, CPLXSZ ) END IF * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPTQ = IPE + NOFFD + IPOSTPAD + IPREPAD IPTP = IPTQ + MNQ + IPOSTPAD + IPREPAD IPW = IPTP + MNP + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * LWORK = NB*( MP+NQ+1 ) + NQ WORKBRD = LWORK + IPOSTPAD WORKSIZ = WORKBRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN WORKSIZ = MAX( LWORK, 2*NB*( MP+NQ+NB ) ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Bidiagonal reduction', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PCMATGEN( ICTXT, 'No', 'No', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, MNQ, 1, MEM( IPTQ-IPREPAD ), $ MNQ, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, MNP, 1, MEM( IPTP-IPREPAD ), $ MNP, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANGE( 'I', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKBRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKBRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to bidiagonal form * CALL PCGEBRD( M, N, MEM( IPA ), 1, 1, DESCA, MEM( IPD ), $ MEM( IPE ), MEM( IPTQ ), MEM( IPTP ), $ MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGEBRD', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEBRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEBRD', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEBRD', MNQ, 1, $ MEM( IPTQ-IPREPAD ), MNQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEBRD', MNP, 1, $ MEM( IPTP-IPREPAD ), MNP, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEBRD', WORKBRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKBRD-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A-Q*B*P|| / (||A|| * N * eps) * CALL PCGEBDRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPTQ ), $ MEM( IPTP ), MEM( IPW ), IERR( 1 ) ) CALL PCLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGEBDRV', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEBDRV', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEBDRV', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEBDRV', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ.0.0E+0 $ .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID * KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * ) $ 'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * BRD requires 32/3 N^3 floating point operations * MAXMN = MAX( M, N ) MINMN = MIN( M, N ) NOPS = 16.0D+0 * DBLE( MINMN ) * DBLE( MINMN ) * $ ( DBLE( MAXMN ) - DBLE( MINMN ) / 3.0D+0 ) NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, FRESID, PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N NB P Q BRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ------ ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, F9.2, 1X, $ F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*B*P|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PCBRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pcbrdinfo.f000644 000766 000024 00000032176 10363532303 017556 0ustar00juliestaff000000 000000 SUBROUTINE PCBRDINFO( SUMMRY, NOUT, NMAT, MVAL, LDMVAL, NVAL, $ LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDMVAL, LDNBVAL, LDNVAL, LDPVAL, $ LDQVAL, NGRIDS, NMAT, NNB, NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDMVAL ), NBVAL( LDNBVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), $ QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PCBRDINFO get needed startup information for the bidiagonal * reduction and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, output to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for M & N. * * MVAL (global output) INTEGER array, dimension (LDMVAL) * The values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M. LDMVAL >= NMAT. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N. LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * MAX( 5, LDMVAL+LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack * all input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, IGEBR2D, IGEBS2D, $ SCOPY * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'BRD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( UNIT = NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get values of M, N * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1. .OR. NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'M', LDMVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 1, 3, WORK, 1 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Bidiagonal reduction' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision bidiagonal ' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - Q B P''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less'// $ ' than THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'BRD time : Time in seconds to reduce the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for '// $ 'the bidiagonal reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9995 ) $ 'M ', ( MVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( MVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9995 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9999 ) ' ' WRITE( NOUT, FMT = 9996 ) EPS WRITE( NOUT, FMT = 9993 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 3, WORK, 1, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = 2*NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 CONTINUE WRITE( NOUT, FMT = 9998 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9997 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9996 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9995 FORMAT( 2X, A5, ': ', 10I6 ) 9994 FORMAT( ' ', 10I6 ) 9993 FORMAT( 'Routines pass computational tests if scaled residual is', $ ' less than ', G12.5 ) * * End of PCBRDINFO * END scalapack-2.0.2/TESTING/EIG/pcevcdriver.f000644 000766 000024 00000053715 10602576752 020142 0ustar00juliestaff000000 000000 PROGRAM PCEVCDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * June, 2000 * * Purpose * ======= * * PCEVCDRIVER is the main test program for the COMPLEX * SCALAPACK PCTREVC routine. This test driver performs a right and * left eigenvector calculation of a triangular matrix followed by * a residual checks of the calcuated eigenvectors. * * The program must be driven by a short data file and uses the same * input file as the PCNEPDRIVER. An annotated example of a data file * can be obtained by deleting the first 3 characters from the following * 18 lines: * 'SCALAPACK, Version 1.8, NEP (Nonsymmetric EigenProblem) input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'NEP.out' output file name (if any) * 6 device out * 8 number of problems sizes * 1 2 3 4 6 10 100 200 vales of N * 3 number of NB's * 6 20 40 values of NB * 4 number of process grids (ordered pairs of P & Q) * 1 2 1 4 values of P * 1 2 4 1 values of Q * 20.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * CPLXSZ INTEGER, default = 8 bytes. * CPLXSZ indicate the length in bytes on the given platform * for a single precision complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * Further Details * ============== * * Contributed by Mark Fahey, June, 2000 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, TOTMEM, MEMSIZ, NTESTS PARAMETER ( CPLXSZ = 8, TOTMEM = 200000000, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20 ) COMPLEX PADVAL, ZERO, ONE PARAMETER ( PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICOL, ICTXT, II, III, IMIDPAD, $ INFO, IPA, IPOSTPAD, IPREPAD, IPVL, IPVR, IPW, $ IPWR, IPC, IROW, J, JJ, JJJ, K, KFAIL, KPASS, $ KSKIP, KTESTS, LDA, LDZ, LWORK, M, MYCOL, $ MYROW, N, NB, NGRIDS, NMAT, NNB, NOUT, NP, $ NPCOL, NPROCS, NPROW, NQ, WORKSIZ REAL THRESH REAL ANORM, FRESID, QRESID DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. LOGICAL SELECT( 1 ) INTEGER DESCA( DLEN_ ), DESCZ( DLEN_ ), IERR( 2 ), $ NBVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL RESULT( 2 ), RWORK( 5000 ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, INFOG2L, $ PCCHEKPAD, PCEVCINFO, PCFILLPAD, PCGET22, $ PCLASET, PCMATGEN, PCTREVC, SLBOOT, SLCOMBINE, $ SLTIMER, CGSUM2D * .. * .. External Functions .. INTEGER ILCM, NUMROC REAL PCLANHS EXTERNAL ILCM, NUMROC, PCLANHS * .. * .. Intrinsic Functions .. INTRINSIC REAL, MAX, MIN * .. * .. Data statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PCEVCINFO( OUTFILE, NOUT, NMAT, NVAL, NTESTS, NNB, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 40 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 40 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 40 * DO 30 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 30 END IF * DO 20 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.6 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 20 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) IPREPAD = IPREPAD + 1000 IMIDPAD = IMIDPAD + 1000 IPOSTPAD = IPOSTPAD + 1000 ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Initialize the array descriptor for the matrix Z * CALL DESCINIT( DESCZ, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 2 ) ) * LDA = DESCA( LLD_ ) LDZ = DESCZ( LLD_ ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 20 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAE+1 ) * IPA = IPREPAD + 1 IPC = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWR = IPC + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPVL = IPWR + N + IPOSTPAD + IPREPAD IPVR = IPVL + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPVR + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD III = N / NB IF( III*NB.LT.N ) $ III = III + 1 III = 7*III / ILCM( NPROW, NPCOL ) * * LWORK = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, III ) LWORK = LWORK + MAX( 2*N, ( 8*ILCM( NPROW, NPCOL )+2 )** $ 2 ) * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PCEVCFCHK and PCLANHS * WORKSIZ = LWORK + MAX( NP*DESCA( NB_ ), $ DESCA( MB_ )*NQ ) + IPOSTPAD * ELSE * WORKSIZ = LWORK + IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Schur reduction', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 20 END IF * * Generate matrix Z = In * CALL PCLASET( 'All', N, N, ZERO, ONE, MEM( IPC ), 1, 1, $ DESCZ ) CALL PCLASET( 'All', N, N, ZERO, ZERO, MEM( IPVR ), 1, 1, $ DESCZ ) CALL PCLASET( 'All', N, N, ZERO, ZERO, MEM( IPVL ), 1, 1, $ DESCZ ) * * Generate matrix A upper Hessenberg * CALL PCMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) CALL PCLASET( 'Lower', MAX( 0, N-1 ), MAX( 0, N-1 ), $ ZERO, ZERO, MEM( IPA ), MIN( N, 2 ), 1, $ DESCA ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPVR-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPVL-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPC-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANHS( 'I', N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANHS', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANHS', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PCFILLPAD( ICTXT, N, 1, MEM( IPWR-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, PADVAL ) * END IF * * Set eigenvalues from diagonal * DO 10 JJJ = 1, N CALL INFOG2L( JJJ, JJJ, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW.EQ.II .AND. MYCOL.EQ.JJ ) THEN MEM( IPWR-1+JJJ ) = MEM( IPA-1+( ICOL-1 )*LDA+ $ IROW ) ELSE MEM( IPWR-1+JJJ ) = ZERO END IF 10 CONTINUE CALL CGSUM2D( ICTXT, 'All', ' ', N, 1, MEM( IPWR ), N, $ -1, -1 ) * SELECT( 1 ) = .TRUE. CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform eigenvector calculation * CALL PCTREVC( 'B', 'A', SELECT, N, MEM( IPA ), DESCA, $ MEM( IPVL ), DESCZ, MEM( IPVR ), DESCZ, N, $ M, MEM( IPW ), RWORK, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * )'PCTREVC INFO=', INFO KFAIL = KFAIL + 1 GO TO 20 END IF * IF( CHECK ) THEN * * Check for memory overwrite in NEP factorization * CALL PCCHEKPAD( ICTXT, 'PCTREVC (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCTREVC (VR)', NP, NQ, $ MEM( IPVR-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCTREVC (VL)', NP, NQ, $ MEM( IPVL-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCTREVC (WR)', N, 1, $ MEM( IPWR-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCTREVC (WORK)', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || T * Z - Z * D || / ( N*|| T ||*EPS ) * FRESID = 0.0E+0 QRESID = 0.0E+0 CALL PCGET22( 'N', 'N', 'N', N, MEM( IPA ), DESCA, $ MEM( IPVR ), DESCZ, MEM( IPWR ), $ MEM( IPC ), DESCZ, RWORK, RESULT ) FRESID = RESULT( 1 ) QRESID = RESULT( 2 ) * * Compute || T^H * L - L * D^H || / ( N*|| T ||*EPS ) * CALL PCGET22( 'C', 'N', 'C', N, MEM( IPA ), DESCA, $ MEM( IPVL ), DESCZ, MEM( IPWR ), $ MEM( IPC ), DESCZ, RWORK, RESULT ) FRESID = MAX( FRESID, RESULT( 1 ) ) QRESID = MAX( QRESID, RESULT( 2 ) ) * CALL PCCHEKPAD( ICTXT, 'PCGET22 (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGET22 (VR)', NP, NQ, $ MEM( IPVR-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGET22 (VL)', NP, NQ, $ MEM( IPVL-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGET22 (Z)', NP, NQ, $ MEM( IPC-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( ( FRESID-FRESID ).EQ.0.0E+0 ) .AND. $ ( QRESID.LE.THRESH ) .AND. $ ( ( QRESID-QRESID ).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 )FRESID WRITE( NOUT, FMT = 9985 )QRESID END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID QRESID = QRESID - QRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 2 N^2 flops for PxTREVC * NOPS = 2.0D+0*DBLE( N )**2 * * Calculate total megaflops -- eigenvector calc only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, PASSED END IF * 20 CONTINUE * 30 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 40 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q NEP Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- --- ---- ---- -------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, F8.2, $ 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||H*Z - Z*D|| / (||T|| * N * eps) = ', G25.7 ) 9985 FORMAT( 'max_j(max|Z(j)| - 1) / ( N * eps ) ', G25.7 ) * STOP * * End of PCEVCDRIVER * END scalapack-2.0.2/TESTING/EIG/pcevcinfo.f000644 000766 000024 00000030034 10363532303 017553 0ustar00juliestaff000000 000000 SUBROUTINE PCEVCINFO( SUMMRY, NOUT, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March, 2000 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, NGRIDS, $ NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PCEVCINFO gets needed startup information for PCTREVC driver * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (the order of the matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH * * WORK (local workspace) INTEGER array of dimension >= * MAX( 3, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ), used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Further Details * =============== * * Implemented by: M. Fahey, June 2000 * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'EVC.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * )NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 )'N', LDNVAL GO TO 30 END IF READ( NIN, FMT = * )( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * )NNB IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 )'NB', LDNBVAL GO TO 30 END IF READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) * DO 10 I = 1, NNB IF( NBVAL( I ).LT.6 ) THEN WRITE( NOUT, FMT = 9992 )NBVAL( I ) GO TO 30 END IF 10 CONTINUE * * Get number of grids * READ( NIN, FMT = * )NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDPVAL GO TO 30 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDQVAL GO TO 30 END IF * * Get values of P and Q * READ( NIN, FMT = * )( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * )( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * )THRESH * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 20 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 20 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK nonsymmetric eigenvector calculation.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'Tests of the parallel ' // $ 'complex single precision eigenvector calculation.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Residual = max( ||T*R-R*D||/(||H||*eps*N)' // $ ' , ||T^H*L-L*D^H||/(||H||*eps*N) )' WRITE( NOUT, FMT = 9999 ) $ ' Normalization residual = max(max_j(max|R(j)|-1),' // $ ' max_j(max|L(j)|-1))/(eps*N)' WRITE( NOUT, FMT = 9999 )'The matrix A is randomly ' // $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'An explanation of the input/output ' $ // 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or ' // $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the ' // 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the' // $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than' // $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'NEP time : Time in seconds to decompose the ' // ' matrix' WRITE( NOUT, FMT = 9999 )'MFLOPS : Rate of execution ' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 )'N ', $ ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 )'NB ', $ ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 )'P ', $ ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 )'Q ', $ ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 )EPS WRITE( NOUT, FMT = 9998 )THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 30 CONTINUE WRITE( NOUT, FMT = 9993 ) CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9992 FORMAT( ' Blocking size too small at ', I2, ' must be >=6.' ) * * End of PCEVCINFO * END scalapack-2.0.2/TESTING/EIG/pcgebdrv.f000644 000766 000024 00000047215 10363532303 017404 0ustar00juliestaff000000 000000 SUBROUTINE PCGEBDRV( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INFO, IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL D( * ), E( * ) COMPLEX A( * ), TAUP( * ), TAUQ( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEBDRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from sub( A ), * Q, P returned by PCGEBRD: * * sub( A ) := Q * B * P'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of sub( A ) * as returned by PCGEBRD. On exit, the original distribu- * ted matrix sub( A ) is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local input) REAL array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local input) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local input) COMPLEX array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUQ is * tied to the distributed matrix A. See Further Details. * * TAUP (local input) COMPLEX array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix P. TAUP is * tied to the distributed matrix A. See Further Details. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK >= 2*NB*( MP + NQ + NB ) * * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MP = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * On exit, if INFO <> 0, a discrepancy has been found between * the diagonal and off-diagonal elements of A and the copies * contained in the arrays D and E. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL REIGHT, RZERO PARAMETER ( REIGHT = 8.0E+0, RZERO = 0.0E+0 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, IIA, IL, IPTP, IPTQ, $ IPV, IPW, IPWK, IOFF, IV, J, JB, JJA, JL, JV, $ K, MN, MP, MYCOL, MYROW, NB, NPCOL, NPROW, NQ REAL ADDBND, D2, E2 COMPLEX D1, E1 * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCV( DLEN_ ), $ DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, IGSUM2D, INFOG2L, $ PCLACPY, PCLARFB, PCLARFT, PCLASET, $ PCELGET, PSELGET * .. * .. External Functions .. INTEGER INDXG2P, NUMROC REAL PSLAMCH EXTERNAL INDXG2P, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 NB = DESCA( MB_ ) IOFF = MOD( IA-1, NB ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IOFF, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+IOFF, NB, MYCOL, IACOL, NPCOL ) IPV = 1 IPW = IPV + MP*NB IPTP = IPW + NQ*NB IPTQ = IPTP + NB*NB IPWK = IPTQ + NB*NB * IV = 1 JV = 1 MN = MIN( M, N ) IL = MAX( ( (IA+MN-2) / NB )*NB + 1, IA ) JL = MAX( ( (JA+MN-2) / NB )*NB + 1, JA ) IAROW = INDXG2P( IL, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCV, IA+M-IL, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCW, NB, JA+N-JL, NB, NB, IAROW, IACOL, ICTXT, $ NB ) * ADDBND = REIGHT * PSLAMCH( ICTXT, 'eps' ) * * When A is an upper bidiagonal form * IF( M.GE.N ) THEN * CALL DESCSET( DESCD, 1, JA+MN-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MN-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) * DO 10 J = 0, MN-1 D1 = ZERO E1 = ZERO D2 = RZERO E2 = RZERO CALL PSELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PCELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(MN-1) ) THEN CALL PSELGET( ' ', ' ', E2, E, IA+J, 1, DESCE ) CALL PCELGET( 'Rowwise', ' ', E1, A, IA+J, JA+J+1, $ DESCA ) END IF * IF( ( ABS( D1 - CMPLX( D2 ) ).GT.( ABS( D2 )*ADDBND ) ) .OR. $ ( ABS( E1 - CMPLX( E2 ) ).GT.( ABS( E2 )*ADDBND ) ) ) $ INFO = INFO + 1 10 CONTINUE * DO 20 J = JL, JA+NB-IOFF, -NB JB = MIN( JA+N-J, NB ) I = IA + J - JA K = I - IA + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PCLARFT( 'Forward', 'Columnwise', M-K+1, JB, A, I, J, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'Lower', M-K+1, JB, A, I, J, DESCA, $ WORK( IPV ), IV, JV, DESCV ) CALL PCLASET( 'Upper', M-K+1, JB, ZERO, ONE, WORK( IPV ), $ IV, JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PCLASET( 'Lower', M-K, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PCLARFT( 'Forward', 'Rowwise', N-K, JB, A, I, J+1, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'Upper', JB, N-K, A, I, J+1, DESCA, $ WORK( IPW ), IV, JV+1, DESCW ) CALL PCLASET( 'Lower', JB, N-K, ZERO, ONE, WORK( IPW ), IV, $ JV+1, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PCLASET( 'Upper', JB, N-K-1, ZERO, ZERO, A, I, J+2, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PCLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-K+1, N-K+1, JB, WORK( IPV ), $ IV, JV, DESCV, WORK( IPTQ ), A, I, J, DESCA, $ WORK( IPWK ) ) * * Apply block Householder transformation from Right. * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-K+1, N-K, JB, WORK( IPW ), IV, $ JV+1, DESCW, WORK( IPTP ), A, I, J+1, DESCA, $ WORK( IPWK ) ) * DESCV( M_ ) = DESCV( M_ ) + NB DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCW( N_ ) = DESCW( N_ ) + NB DESCW( RSRC_ ) = DESCV( RSRC_ ) DESCW( CSRC_ ) = DESCV( CSRC_ ) * 20 CONTINUE * * Handle first block separately * JB = MIN( N, NB - IOFF ) IV = IOFF + 1 JV = IOFF + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PCLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPV ), $ IV, JV, DESCV ) CALL PCLASET( 'Upper', M, JB, ZERO, ONE, WORK( IPV ), IV, JV, $ DESCV ) * * Zero out the strict lower triangular part of A. * CALL PCLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PCLARFT( 'Forward', 'Rowwise', N-1, JB, A, IA, JA+1, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'Upper', JB, N-1, A, IA, JA+1, DESCA, $ WORK( IPW ), IV, JV+1, DESCW ) CALL PCLASET( 'Lower', JB, N-1, ZERO, ONE, WORK( IPW ), IV, $ JV+1, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PCLASET( 'Upper', JB, N-2, ZERO, ZERO, A, IA, JA+2, $ DESCA ) * * Apply block Householder transformation from left. * CALL PCLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M, N, JB, WORK( IPV ), IV, JV, DESCV, $ WORK( IPTQ ), A, IA, JA, DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from right. * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M, N-1, JB, WORK( IPW ), IV, JV+1, $ DESCW, WORK( IPTP ), A, IA, JA+1, DESCA, $ WORK( IPWK ) ) * ELSE * CALL DESCSET( DESCD, IA+MN-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MN-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 30 J = 0, MN-1 D1 = ZERO E1 = ZERO D2 = RZERO E2 = RZERO CALL PSELGET( ' ', ' ', D2, D, IA+J, 1, DESCD ) CALL PCELGET( 'Rowwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(MN-1) ) THEN CALL PSELGET( ' ', ' ', E2, E, 1, JA+J, DESCE ) CALL PCELGET( 'Columnwise', ' ', E1, A, IA+J+1, JA+J, $ DESCA ) END IF * IF( ( ABS( D1 - CMPLX( D2 ) ).GT.( ABS( D2 )*ADDBND ) ) .OR. $ ( ABS( E1 - CMPLX( E2 ) ).GT.( ABS( E2 )*ADDBND ) ) ) $ INFO = INFO + 1 30 CONTINUE * DO 40 I = IL, IA+NB-IOFF, -NB JB = MIN( IA+M-I, NB ) J = JA + I - IA K = J - JA + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PCLARFT( 'Forward', 'Columnwise', M-K, JB, A, I+1, J, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'Lower', M-K, JB, A, I+1, J, DESCA, $ WORK( IPV ), IV+1, JV, DESCV ) CALL PCLASET( 'Upper', M-K, JB, ZERO, ONE, WORK( IPV ), $ IV+1, JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PCLASET( 'Lower', M-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PCLARFT( 'Forward', 'Rowwise', N-K+1, JB, A, I, J, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'Upper', JB, N-K+1, A, I, J, DESCA, $ WORK( IPW ), IV, JV, DESCW ) CALL PCLASET( 'Lower', JB, N-K+1, ZERO, ONE, WORK( IPW ), $ IV, JV, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PCLASET( 'Upper', JB, N-K, ZERO, ZERO, A, I, J+1, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PCLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-K, N-K+1, JB, WORK( IPV ), $ IV+1, JV, DESCV, WORK( IPTQ ), A, I+1, J, $ DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from Right. * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-K+1, N-K+1, JB, WORK( IPW ), IV, $ JV, DESCW, WORK( IPTP ), A, I, J, DESCA, $ WORK( IPWK ) ) * DESCV( M_ ) = DESCV( M_ ) + NB DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCW( N_ ) = DESCW( N_ ) + NB DESCW( RSRC_ ) = DESCV( RSRC_ ) DESCW( CSRC_ ) = DESCV( CSRC_ ) * 40 CONTINUE * * Handle first block separately * JB = MIN( M, NB - IOFF ) IV = IOFF + 1 JV = IOFF + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PCLARFT( 'Forward', 'Columnwise', M-1, JB, A, IA+1, JA, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'Lower', M-1, JB, A, IA+1, JA, DESCA, $ WORK( IPV ), IV+1, JV, DESCV ) CALL PCLASET( 'Upper', M-1, JB, ZERO, ONE, WORK( IPV ), IV+1, $ JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PCLASET( 'Lower', M-2, JB, ZERO, ZERO, A, IA+2, JA, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PCLARFT( 'Forward', 'Rowwise', N, JB, A, IA, JA, DESCA, $ TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'Upper', JB, N, A, IA, JA, DESCA, WORK( IPW ), $ IV, JV, DESCW ) CALL PCLASET( 'Lower', JB, N, ZERO, ONE, WORK( IPW ), IV, JV, $ DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PCLASET( 'Upper', JB, N-1, ZERO, ZERO, A, IA, JA+1, $ DESCA ) * * Apply block Householder transformation from left * CALL PCLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M-1, N, JB, WORK( IPV ), IV+1, JV, DESCV, $ WORK( IPTQ ), A, IA+1, JA, DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from right * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M, N, JB, WORK( IPW ), IV, JV, DESCW, $ WORK( IPTP ), A, IA, JA, DESCA, WORK( IPWK ) ) END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * RETURN * * End of PCGEBDRV * END scalapack-2.0.2/TESTING/EIG/pcgehdrv.f000644 000766 000024 00000017747 10602576752 017435 0ustar00juliestaff000000 000000 SUBROUTINE PCGEHDRV( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEHDRV computes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from the * unitary matrix Q, the Hessenberg matrix, and the array TAU returned * by PCGEHRD: * sub( A ) := Q * H * Q' * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows and columns 1:ILO-1 and IHI+1:N. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) reduced to Hessenberg * form by PCGEHRD. The upper triangle and the first sub- * diagonal of sub( A ) contain the upper Hessenberg matrix H, * and the elements below the first subdiagonal, with the array * TAU, represent the unitary matrix Q as a product of * elementary reflectors. On exit, the original distributed * N-by-N matrix sub( A ) is recovered. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors returned by * PCGEHRD. TAU is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX array, dimension (LWORK). * LWORK >= NB*NB + NB*IHLP + MAX[ NB*( IHLP+INLQ ), * NB*( IHLQ + MAX[ IHIP, * IHLP+NUMROC( NUMROC( IHI-ILO+LOFF+1, NB, 0, 0, * NPCOL ), NB, 0, 0, LCMQ ) ] ) ] * * where NB = MB_A = NB_A, * LCM is the least common multiple of NPROW and NPCOL, * LCM = ILCM( NPROW, NPCOL ), LCMQ = LCM / NPCOL, * * IROFFA = MOD( IA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * * ILROW = INDXG2P( IA+ILO-1, NB, MYROW, RSRC_A, NPROW ), * ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, CSRC_A, NPCOL ), * IHLP = NUMROC( IHI-ILO+IROFFA+1, NB, MYROW, ILROW, NPROW ), * IHLQ = NUMROC( IHI-ILO+IROFFA+1, NB, MYCOL, ILCOL, NPCOL ), * INLQ = NUMROC( N-ILO+IROFFA+1, NB, MYCOL, ILCOL, NPCOL ). * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, IHLP, II, IOFF, IPT, $ IPV, IPW, IV, J, JB, JJ, JL, K, MYCOL, MYROW, $ NB, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCLARFB, $ PCLARFT, PCLACPY, PCLASET * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( IHI-ILO.LE.0 ) $ RETURN * NB = DESCA( MB_ ) IOFF = MOD( IA+ILO-2, NB ) CALL INFOG2L( IA+ILO-1, JA+ILO-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, IAROW, IACOL ) IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, IAROW, NPROW ) * IPT = 1 IPV = IPT + NB * NB IPW = IPV + IHLP * NB JL = MAX( ( ( JA+IHI-2 ) / NB ) * NB + 1, JA + ILO - 1 ) CALL DESCSET( DESCV, IHI-ILO+IOFF+1, NB, NB, NB, IAROW, $ INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ), ICTXT, MAX( 1, IHLP ) ) * DO 10 J = JL, ILO+JA+NB-IOFF-1, -NB JB = MIN( JA+IHI-J-1, NB ) I = IA + J - JA K = I - IA + 1 IV = K - ILO + IOFF + 1 * * Compute upper triangular matrix T from TAU. * CALL PCLARFT( 'Forward', 'Columnwise', IHI-K, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'All', IHI-K, JB, A, I+1, J, DESCA, WORK( IPV ), $ IV+1, 1, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PCLASET( 'Lower', IHI-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PCLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ IHI-K, N-K+1, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, I+1, J, DESCA, WORK( IPW ) ) * * Apply block Householder transformation from Right. * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Columnwise', IHI, IHI-K, JB, WORK( IPV ), IV+1, $ 1, DESCV, WORK( IPT ), A, IA, J+1, DESCA, $ WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle the first block separately * IV = IOFF + 1 I = IA + ILO - 1 J = JA + ILO - 1 JB = MIN( NB-IOFF, JA+IHI-J-1 ) * * Compute upper triangular matrix T from TAU. * CALL PCLARFT( 'Forward', 'Columnwise', IHI-ILO, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace. * CALL PCLACPY( 'All', IHI-ILO, JB, A, I+1, J, DESCA, WORK( IPV ), $ IV+1, 1, DESCV ) * * Zero out the strict lower triangular part of A. * IF( IHI-ILO.GT.0 ) $ CALL PCLASET( 'Lower', IHI-ILO-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PCLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ IHI-ILO, N-ILO+1, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, I+1, J, DESCA, WORK( IPW ) ) * * Apply block Householder transformation from Right. * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Columnwise', IHI, IHI-ILO, JB, WORK( IPV ), IV+1, $ 1, DESCV, WORK( IPT ), A, IA, J+1, DESCA, $ WORK( IPW ) ) * RETURN * * End of PCGEHDRV * END scalapack-2.0.2/TESTING/EIG/pcget22.f000644 000766 000024 00000022444 10363532303 017053 0ustar00juliestaff000000 000000 SUBROUTINE PCGET22( TRANSA, TRANSE, TRANSW, N, A, DESCA, E, DESCE, $ W, WORK, DESCW, RWORK, RESULT ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * .. Scalar Arguments .. CHARACTER TRANSA, TRANSE, TRANSW INTEGER N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCE( * ), DESCW( * ) REAL RESULT( 2 ), RWORK( * ) COMPLEX A( * ), E( * ), W( * ), WORK( * ) * .. * * Purpose * ======= * * PCGET22 does an eigenvector check. * * The basic test is: * * RESULT(1) = | A E - E W | / ( |A| |E| ulp ) * * using the 1-norm. It also tests the normalization of E: * * RESULT(2) = max | m-norm(E(j)) - 1 | / ( n ulp ) * j * * where E(j) is the j-th eigenvector, and m-norm is the max-norm of a * vector. The max-norm of a complex n-vector x in this case is the * maximum of |re(x(i)| + |im(x(i)| over i = 1, ..., n. * * Arguments * ========== * * TRANSA (input) CHARACTER*1 * Specifies whether or not A is transposed. * = 'N': No transpose * = 'T': Transpose * = 'C': Conjugate transpose * * TRANSE (input) CHARACTER*1 * Specifies whether or not E is transposed. * = 'N': No transpose, eigenvectors are in columns of E * = 'T': Transpose, eigenvectors are in rows of E * = 'C': Conjugate transpose, eigenvectors are in rows of E * * TRANSW (input) CHARACTER*1 * Specifies whether or not W is transposed. * = 'N': No transpose * = 'T': Transpose, same as TRANSW = 'N' * = 'C': Conjugate transpose, use -WI(j) instead of WI(j) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input) COMPLEX array, dimension (*) * The matrix whose eigenvectors are in E. * * DESCA (input) INTEGER array, dimension(*) * * E (input) COMPLEX array, dimension (*) * The matrix of eigenvectors. If TRANSE = 'N', the eigenvectors * are stored in the columns of E, if TRANSE = 'T' or 'C', the * eigenvectors are stored in the rows of E. * * DESCE (input) INTEGER array, dimension(*) * * W (input) COMPLEX array, dimension (N) * The eigenvalues of A. * * WORK (workspace) COMPLEX array, dimension (*) * DESCW (input) INTEGER array, dimension(*) * * RWORK (workspace) REAL array, dimension (N) * * RESULT (output) REAL array, dimension (2) * RESULT(1) = | A E - E W | / ( |A| |E| ulp ) * RESULT(2) = max | m-norm(E(j)) - 1 | / ( n ulp ) * j * Further Details * =============== * * Contributed by Mark Fahey, June, 2000 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. CHARACTER NORMA, NORME INTEGER ICOL, II, IROW, ITRNSE, ITRNSW, J, JCOL, JJ, $ JROW, JVEC, LDA, LDE, LDW, MB, MYCOL, MYROW, $ NB, NPCOL, NPROW, CONTXT, RA, CA, RSRC, CSRC REAL ANORM, ENORM, ENRMAX, ENRMIN, ERRNRM, TEMP1, $ ULP, UNFL COMPLEX CDUM, WTEMP * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH, PCLANGE EXTERNAL LSAME, PSLAMCH, PCLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGAMN2D, SGAMX2D, INFOG2L, $ PCAXPY, PCGEMM, PCLASET * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. * * Initialize RESULT (in case N=0) * RESULT( 1 ) = ZERO RESULT( 2 ) = ZERO IF( N.LE.0 ) $ RETURN * CONTXT = DESCA( CTXT_ ) RSRC = DESCA( RSRC_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) MB = DESCA( MB_ ) LDA = DESCA( LLD_ ) LDE = DESCE( LLD_ ) LDW = DESCW( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * UNFL = PSLAMCH( CONTXT, 'Safe minimum' ) ULP = PSLAMCH( CONTXT, 'Precision' ) * ITRNSE = 0 ITRNSW = 0 NORMA = 'O' NORME = 'O' * IF( LSAME( TRANSA, 'T' ) .OR. LSAME( TRANSA, 'C' ) ) THEN NORMA = 'I' END IF * IF( LSAME( TRANSE, 'T' ) ) THEN ITRNSE = 1 NORME = 'I' ELSE IF( LSAME( TRANSE, 'C' ) ) THEN ITRNSE = 2 NORME = 'I' END IF * IF( LSAME( TRANSW, 'C' ) ) THEN ITRNSW = 1 END IF * * Normalization of E: * ENRMIN = ONE / ULP ENRMAX = ZERO IF( ITRNSE.EQ.0 ) THEN DO 20 JVEC = 1, N TEMP1 = ZERO DO 10 J = 1, N CALL INFOG2L( J, JVEC, DESCE, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN TEMP1 = MAX( TEMP1, CABS1( E( ( ICOL-1 )*LDE+ $ IROW ) ) ) END IF 10 CONTINUE IF( MYCOL.EQ.JJ ) THEN CALL SGAMX2D( CONTXT, 'Col', ' ', 1, 1, TEMP1, 1, RA, CA, $ -1, -1, -1 ) ENRMIN = MIN( ENRMIN, TEMP1 ) ENRMAX = MAX( ENRMAX, TEMP1 ) END IF 20 CONTINUE CALL SGAMX2D( CONTXT, 'Row', ' ', 1, 1, ENRMAX, 1, RA, CA, -1, $ -1, -1 ) CALL SGAMN2D( CONTXT, 'Row', ' ', 1, 1, ENRMIN, 1, RA, CA, -1, $ -1, -1 ) ELSE DO 40 J = 1, N TEMP1 = ZERO DO 30 JVEC = 1, N CALL INFOG2L( J, JVEC, DESCE, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN TEMP1 = MAX( TEMP1, CABS1( E( ( ICOL-1 )*LDE+ $ IROW ) ) ) END IF 30 CONTINUE IF( MYROW.EQ.II ) THEN CALL SGAMX2D( CONTXT, 'Row', ' ', 1, 1, TEMP1, 1, RA, CA, $ -1, -1, -1 ) ENRMIN = MIN( ENRMIN, TEMP1 ) ENRMAX = MAX( ENRMAX, TEMP1 ) END IF 40 CONTINUE CALL SGAMX2D( CONTXT, 'Row', ' ', 1, 1, ENRMAX, 1, RA, CA, -1, $ -1, -1 ) CALL SGAMN2D( CONTXT, 'Row', ' ', 1, 1, ENRMIN, 1, RA, CA, -1, $ -1, -1 ) END IF * * Norm of A: * ANORM = MAX( PCLANGE( NORMA, N, N, A, 1, 1, DESCA, RWORK ), UNFL ) * * Norm of E: * ENORM = MAX( PCLANGE( NORME, N, N, E, 1, 1, DESCE, RWORK ), ULP ) * * Norm of error: * * Error = AE - EW * CALL PCLASET( 'Full', N, N, CZERO, CZERO, WORK, 1, 1, DESCW ) * DO 60 JCOL = 1, N IF( ITRNSW.EQ.0 ) THEN WTEMP = W( JCOL ) ELSE WTEMP = CONJG( W( JCOL ) ) END IF * IF( ITRNSE.EQ.0 ) THEN CALL PCAXPY( N, WTEMP, E, 1, JCOL, DESCE, 1, WORK, 1, JCOL, $ DESCW, 1 ) ELSE IF( ITRNSE.EQ.1 ) THEN CALL PCAXPY( N, WTEMP, E, JCOL, 1, DESCE, N, WORK, 1, JCOL, $ DESCW, 1 ) ELSE CALL PCAXPY( N, CONJG( WTEMP ), E, JCOL, 1, DESCE, N, WORK, $ 1, JCOL, DESCW, 1 ) DO 50 JROW = 1, N CALL INFOG2L( JROW, JCOL, DESCW, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN WORK( ( JCOL-1 )*LDW+JROW ) $ = CONJG( WORK( ( JCOL-1 )*LDW+JROW ) ) END IF 50 CONTINUE END IF 60 CONTINUE * CALL PCGEMM( TRANSA, TRANSE, N, N, N, CONE, A, 1, 1, DESCA, E, 1, $ 1, DESCE, -CONE, WORK, 1, 1, DESCW ) * ERRNRM = PCLANGE( 'One', N, N, WORK, 1, 1, DESCW, RWORK ) / ENORM * * Compute RESULT(1) (avoiding under/overflow) * IF( ANORM.GT.ERRNRM ) THEN RESULT( 1 ) = ( ERRNRM / ANORM ) / ULP ELSE IF( ANORM.LT.ONE ) THEN RESULT( 1 ) = ( MIN( ERRNRM, ANORM ) / ANORM ) / ULP ELSE RESULT( 1 ) = MIN( ERRNRM / ANORM, ONE ) / ULP END IF END IF * * Compute RESULT(2) : the normalization error in E. * RESULT( 2 ) = MAX( ABS( ENRMAX-ONE ), ABS( ENRMIN-ONE ) ) / $ ( REAL( N )*ULP ) * RETURN * * End of PCGET22 * END scalapack-2.0.2/TESTING/EIG/pcgsepchk.f000644 000766 000024 00000031030 10363532303 017543 0ustar00juliestaff000000 000000 * * SUBROUTINE PCGSEPCHK( IBTYPE, MS, NV, A, IA, JA, DESCA, B, IB, JB, $ DESCB, THRESH, Q, IQ, JQ, DESCQ, C, IC, JC, $ DESCC, W, WORK, LWORK, TSTNRM, RESULT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 15, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, IBTYPE, IC, IQ, JA, JB, JC, JQ, LWORK, $ MS, NV, RESULT REAL THRESH, TSTNRM * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCB( * ), DESCC( * ), DESCQ( * ) REAL W( * ), WORK( * ) COMPLEX A( * ), B( * ), C( * ), Q( * ) * .. * * * Purpose * ======= * * PCGSEPCHK checks a decomposition of the form * * A Q = B Q D or * A B Q = Q D or * B A Q = Q D * * where A is a symmetric matrix, B is * symmetric positive definite, Q is orthogonal, and D is diagonal. * * One of the following test ratios is computed: * * IBTYPE = 1: TSTNRM = | A Q - B Q D | / ( |A| |Q| n ulp ) * * IBTYPE = 2: TSTNRM = | A B Q - Q D | / ( |A| |Q| n ulp ) * * IBTYPE = 3: TSTNRM = | B A Q - Q D | / ( |A| |Q| n ulp ) * * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * MP = number of local rows in A, B and Q * MQ = number of local columns in A * NQ = number of local columns in B and Q * * IBTYPE (input) INTEGER * The form of the symmetric generalized eigenproblem. * = 1: A*Q = (lambda)*B*Q * = 2: A*B*Q = (lambda)*Q * = 3: B*A*Q = (lambda)*Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in A, B, C and Q * Also, the number of columns in A * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q. * * A (local input) REAL pointer to an * array in local memory of dimension (LLD_A, LOCc(JA+N-1)). * This array contains the local pieces of the M-by-N * distributed test matrix A * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix A. * * B (local input) REAL pointer to an * array in local memory of dimension (LLD_B, LOCc(JB+N-1)). * This array contains the local pieces of the M-by-N * distributed test matrix B * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix B. * * THRESH (input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) REAL array * global dimension (MS, NV), * local dimension (DESCA( DLEN_ ), NQ) * * Contains the eigenvectors as computed by PSSYEVX * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix Q. * * C (local workspace) REAL array, * global dimension (MS, NV), * local dimension (DESCA( DLEN_ ), MQ) * * Accumulator for computing AQ -QL * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix C. * * W (global input) REAL array, dimension (NV) * * Contains the computed eigenvalues * * WORK (local workspace) REAL array, * dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * TSTNRM (global output) REAL * * RESULT (global output) INTEGER * 0 if the test passes * 1 if the test fails * * .. Local Scalars .. * INTEGER I, INFO, MYCOL, MYROW, NPCOL, NPROW, NQ REAL ANORM, ULP * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) COMPLEX CONE, CNEGONE, CZERO PARAMETER ( CONE = 1.0E+0, CNEGONE = -1.0E+0, $ CZERO = 0.0E+0 ) * .. * .. External Functions .. INTEGER NUMROC REAL PCLANGE, SLAMCH EXTERNAL NUMROC, PCLANGE, SLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCGEMM, PCSSCAL, $ PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * RESULT = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 CALL CHK1MAT( MS, 1, MS, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( MS, 1, MS, 2, IB, JB, DESCB, 11, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IQ, JQ, DESCQ, 16, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IB, JB, DESCB, 20, INFO ) * IF( INFO.EQ.0 ) THEN * NQ = NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IF( IQ.NE.1 ) THEN INFO = -14 ELSE IF( JQ.NE.1 ) THEN INFO = -15 ELSE IF( IA.NE.1 ) THEN INFO = -5 ELSE IF( JA.NE.1 ) THEN INFO = -6 ELSE IF( IB.NE.1 ) THEN INFO = -9 ELSE IF( JB.NE.1 ) THEN INFO = -10 ELSE IF( LWORK.LT.NQ ) THEN INFO = -23 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PCGSEPCHK', -INFO ) RETURN END IF * RESULT = 0 ULP = SLAMCH( 'Epsilon' ) * * Compute product of Max-norms of A and Q. * ANORM = PCLANGE( 'M', MS, MS, A, IA, JA, DESCA, WORK )* $ PCLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) IF( ANORM.EQ.ZERO ) $ ANORM = ONE * IF( IBTYPE.EQ.1 ) THEN * * Norm of AQ - BQD * * C = AQ * CALL PCGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 10 I = 1, NV CALL PCSSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 10 CONTINUE * * C = C - BQ (i.e. AQ-BQD) * CALL PCGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, Q, $ IQ, JQ, DESCQ, CNEGONE, C, IC, JC, DESCC ) * TSTNRM = ( PCLANGE( 'M', MS, NV, C, IC, JC, DESCC, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * * ELSE IF( IBTYPE.EQ.2 ) THEN * * Norm of ABQ - QD * * * C = BQ * CALL PCGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 20 I = 1, NV CALL PCSSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 20 CONTINUE * * Q = AC - Q * CALL PCGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, C, $ IC, JC, DESCC, CNEGONE, Q, IQ, JQ, DESCQ ) * TSTNRM = ( PCLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * ELSE IF( IBTYPE.EQ.3 ) THEN * * Norm of BAQ - QD * * * C = AQ * CALL PCGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 30 I = 1, NV CALL PCSSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 30 CONTINUE * * Q = BC - Q * CALL PCGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, C, $ IC, JC, DESCC, CNEGONE, Q, IQ, JQ, DESCQ ) * TSTNRM = ( PCLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * END IF * IF( TSTNRM.GT.THRESH .OR. ( TSTNRM-TSTNRM.NE.0.0E0 ) ) THEN RESULT = 1 END IF RETURN * * End of PCGSEPCHK * END scalapack-2.0.2/TESTING/EIG/pcgsepdriver.f000644 000766 000024 00000022547 10363532303 020306 0ustar00juliestaff000000 000000 * * PROGRAM PCGSEPDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Parallel COMPLEX Hermitian eigenproblem test driver * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * The full tester requires approximately (5 n + 5 n^2/p + slop) * COMPLEX words and 6*n integer words. * So, TOTMEM should be set to at least 1.1 * 8 * (5n + 5n^2/p) * * WHAT WE TEST * ============ * * This routine tests PCHEGVX, the expert driver for the parallel * Hermitian eigenvalue problem. We would like to cover all * possible combinations of: matrix size, process configuration * (nprow and npcol), block size (nb), matrix type (??), range * of eigenvalue (all, by value, by position), sorting options, * and upper vs. lower storage. * * We intend to provide two types of test input files, an * installation test and a thorough test. * * We also intend that the reports be meaningful. Our input file * will allow multiple requests where each request is a cross product * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, CPLXSZ, NIN PARAMETER ( TOTMEM = 2000000, CPLXSZ = 8, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / CPLXSZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Functions .. REAL SLAMCH EXTERNAL SLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PCGSEPREQ, PSLACHKIEEE, PSLASNBT * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PSLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) END IF * CALL PSLACHKIEEE( ISIEEE, SLAMCH( 'O' ), SLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) END IF GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Hermitian Eigendecomposition routines.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'generalized ' // 'Hermitian eigenvalue routine: PCHEGVX.' WRITE( NOUT, FMT = 9999 )'A scaled residual check, ' // $ 'will be computed' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ ' : the QTQ norm is allowed to exceed THRESH' // $ ' for those eigenvectors' WRITE( NOUT, FMT = 9999 )' : which could not be ' // $ 'reorthogonalized for lack of workspace.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see pCGSEPtst.f).' WRITE( NOUT, FMT = 9999 ) $ 'IBTYPE : Generalized eigenproblem type' // $ ' (see pCHEGVx.f)' WRITE( NOUT, FMT = 9999 )'SUB : Subtests ' // $ '(see pCGSEPtst).f' WRITE( NOUT, FMT = 9999 )'CHK : The scaled residual' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PCGSEPREQ( NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, NSKIPPED, $ NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * 9999 FORMAT( A ) 9998 FORMAT( ' I am about to check to make sure that overflow' ) 9997 FORMAT( ' is handled in the ieee default manner. If this' ) 9996 FORMAT( ' is the last output you see, you should assume' ) 9995 FORMAT( ' that overflow caused a floating point exception.' ) 9994 FORMAT( ' In that case, we recommend that you add -DNO_IEEE' ) 9993 FORMAT( ' to the CDEFS line in SLmake.inc.' ) 9992 FORMAT( ' Alternatively, you could set CDEFS in SLmake.inc ' ) 9991 FORMAT( ' to enable the default ieee behaviour, However, this' ) 9990 FORMAT( ' may result in good or very bad performance.' ) 9989 FORMAT( ' Either signed zeroes or signed infinities ' ) 9988 FORMAT( ' work incorrectly or your system. Change your' ) 9987 FORMAT( ' SLmake.inc as suggested above.' ) * 9986 FORMAT( ' Your system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP IBTYPE SUB WALL CPU ', $ ' CHK CHECK' ) 9978 FORMAT( ' ----- --- --- --- --- ------ --- -------- --------', $ ' --------- -----' ) * * End of PCGSEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pcgsepreq.f000644 000766 000024 00000025540 10602576752 017612 0ustar00juliestaff000000 000000 * * SUBROUTINE PCGSEPREQ( NIN, MEM, MEMSIZE, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX MEM( MEMSIZE ) * .. * * Purpose * ======= * * PCGSEPREQ performs one request from the input file 'SEP.dat' * A request is the cross product of the specifications in the * input file. PCGSEPREQ prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEP.dat' * * MEM (local input) COMPLEX ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL FIVE PARAMETER ( FIVE = 5.0E+0 ) INTEGER CPLXSZ, INTGSZ PARAMETER ( CPLXSZ = 8, INTGSZ = 4 ) INTEGER REALSZ PARAMETER ( REALSZ = 4 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IBTYPE, IMIDPAD, INITCON, $ IPOSTPAD, IPREPAD, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, LDA, LLRWORK, MATSIZE, MATTYPE, $ MYCOL, MYROW, N, NB, NIBTYPES, NMATSIZES, $ NMATTYPES, NNODES, NP, NPCOL, NPCONFIGS, NPROW, $ NQ, NUPLOS, ORDER, PCONFIG, PTRA, PTRB, $ PTRCOPYA, PTRCOPYB, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRRWORK, PTRW, PTRW2, PTRWORK, PTRZ, $ RES, RSIZECHK, RSIZEHEEVX, RSIZEQTQ, $ RSIZESUBTST, RSIZETST, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST, UPLO REAL ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL LSAME, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PCGSEPTST, PCLASIZEGSEP, PSSEPINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PSSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * * Thresholds must be bigger for the generalized problem. * THRESH = THRESH*FIVE * DO 50 MATSIZE = 1, NMATSIZES * DO 40 PCONFIG = 1, NPCONFIGS * DO 30 MATTYPE = 1, NMATTYPES * DO 20 UPLO = 1, NUPLOS IF( LSAME( SUBTESTS, 'Y' ) ) THEN NIBTYPES = 3 ELSE NIBTYPES = 1 END IF DO 10 IBTYPE = 1, NIBTYPES * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, $ NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, $ MYROW, MYCOL ) IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PCLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, $ RSIZEQTQ, RSIZECHK, $ SIZEHEEVX, RSIZEHEEVX, $ ISIZEHEEVX, SIZESUBTST, $ RSIZESUBTST, ISIZESUBTST, $ SIZETST, RSIZETST, $ ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYB = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRB = PTRCOPYB + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRB + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, CPLXSZ / REALSZ ) PTRWORK = PTRW2 + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, CPLXSZ / REALSZ ) PTRGAP = PTRWORK + SIZETST + IPREPAD + $ IPOSTPAD PTRIFAIL = PTRGAP + ICEIL( NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, $ CPLXSZ / REALSZ ) PTRICLUS = PTRIFAIL + $ ICEIL( N+IPREPAD+IPOSTPAD, $ CPLXSZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, CPLXSZ / INTGSZ ) PTRRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, CPLXSZ / INTGSZ ) LLRWORK = ( MEMSIZE - PTRRWORK - IPOSTPAD - $ IPREPAD + 1 )* ( CPLXSZ / REALSZ ) NTESTS = NTESTS + 1 IF( LLRWORK.LT.RSIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PCGSEPTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), $ IBTYPE, SUBTESTS, THRESH, $ N, ABSTOL, ISEED, $ MEM( PTRA ), $ MEM( PTRCOPYA ), $ MEM( PTRB ), $ MEM( PTRCOPYB ), $ MEM( PTRZ ), LDA, $ MEM( PTRW ), MEM( PTRW2 ), $ MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ SIZETST, MEM( PTRRWORK ), $ LLRWORK, MEM( PTRIWRK ), $ ISIZETST, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * ) $ ' pCGSEPREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE 50 CONTINUE END IF * * RETURN * * End of PCDGSEPREQ * END scalapack-2.0.2/TESTING/EIG/pcgsepsubtst.f000644 000766 000024 00000072506 10363532303 020337 0ustar00juliestaff000000 000000 * * SUBROUTINE PCGSEPSUBTST( WKNOWN, IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, IA, JA, DESCA, WIN, WNEW, $ IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, RWORK, LRWORK, LWORK1, $ IWORK, LIWORK, RESULT, TSTNRM, QTQNRM, $ NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IBTYPE, IL, IPOSTPAD, IPREPAD, IU, JA, $ LIWORK, LRWORK, LWORK, LWORK1, N, NOUT, RESULT REAL ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) REAL GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) COMPLEX A( * ), B( * ), COPYA( * ), COPYB( * ), $ WORK( * ), Z( * ) * .. * * Purpose * ======= * * PCGSEPSUBTST calls PCHEGVX and then tests the output of * PCHEGVX * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < THRESH * |QT * Q - I| / eps * norm(A) < THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PCHEGVX when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PCGSEPSUBTST * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call to PCGSEPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PCSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) COMPLEX array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PCHEGVX for a description of block cyclic layout. * The test matrix, which is then modified by PCHEGVX * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) COMPLEX array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * B (local workspace) COMPLEX array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. * The B test matrix, which is then modified by PCHEGVX * * COPYB (local input) COMPLEX array, dim (N, N) * COPYB is used to hold an identical copy of the array B * identical in both form and content to B * * Z (local workspace) COMPLEX array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PCGSEPCHK and PCSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) REAL array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) REAL array, dimension (N) * The eigenvalues as copmuted by this call to PCHEGVX * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) REAL array, * dimension (NPROW*NPCOL) * * WORK (local workspace) COMPLEX array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * RWORK (local workspace) REAL array, dimension (LRWORK) * RWORK has already been padded front and back, * use RWORK(1+IPREPAD) * * LRWORK (local input) INTEGER * The actual length of the array RWORK after padding. * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PCHEGVX * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PCHEGVX * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) REAL * |AQ- QL| / |A|*N*EPS * * QTQNRM (global output) REAL * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285E+0, FIVE = 5.0E+0, $ NEGONE = -1.0E+0 ) COMPLEX CPADVAL PARAMETER ( CPADVAL = ( 13.989E+0, 1.93E+0 ) ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ RSIZECHK, RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, $ RSIZETST, SIZEHEEVX, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZESUBTST, SIZETMS, SIZETST, VALSIZE, $ VECSIZE REAL EPS, ERROR, MAXERROR, MAXVU, MINERROR, MINVL, $ NORMWIN, OLDVL, OLDVU, ORFAC, SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), DSEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH EXTERNAL LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLACPY, DESCINIT, IGAMN2D, $ IGAMX2D, PCCHEKPAD, PCELSET, PCFILLPAD, $ PCGSEPCHK, PCHEGVX, PCLASIZEGSEP, $ PCLASIZEHEEVX, PICHEKPAD, PIFILLPAD, PSCHEKPAD, $ PSFILLPAD, SGAMN2D, SGAMX2D, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PCLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PSLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 RWORK( I+IPREPAD ) = 14.3E+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE DO 30 I = 1, LWORK, 1 WORK( I+IPREPAD ) = ( 15.63E+0, 1.1E+0 ) 30 CONTINUE * DO 40 I = 1, N WNEW( I+IPREPAD ) = 3.14159E+0 40 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL MINIL = 1 MAXIU = 0 DO 50 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 50 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 160 RESULT = 0 * * * DSEED is not used in this call to PCLASIZEHEEVX, the * following line just makes ftnchek happy. * DSEED( 1 ) = 1 * CALL PCLASIZEHEEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL CLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL CLACPY( 'A', NP, NQ, COPYB, DESCA( LLD_ ), B( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PCFILLPAD( DESCA( CTXT_ ), NP, NQ, B, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, CPADVAL+1.0E+2 ) * CALL PCFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, CPADVAL ) * CALL PCFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, CPADVAL+1.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), LWORK1, 1, RWORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0E+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PCFILLPAD( DESCA( CTXT_ ), LWORK, 1, WORK, LWORK, IPREPAD, $ IPOSTPAD, CPADVAL+4.1E+0 ) * * Make sure that PCHEGVX does not cheat (i.e. use answers * already computed.) * DO 70 I = 1, N, 1 DO 60 J = 1, MAXEIGS, 1 CALL PCELSET( Z( 1+IPREPAD ), I, J, DESCA, $ ( 13.0E+0, 1.34E+0 ) ) 60 CONTINUE 70 CONTINUE * ORFAC = -1.0E+0 * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PCHEGVX( IBTYPE, JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, $ JA, DESCA, B( 1+IPREPAD ), IA, JA, DESCA, VL, VU, $ IL, IU, ABSTOL, M, NZ, WNEW( 1+IPREPAD ), ORFAC, $ Z( 1+IPREPAD ), IA, JA, DESCA, WORK( 1+IPREPAD ), $ SIZEHEEVX, RWORK( 1+IPREPAD ), LWORK1, $ IWORK( 1+IPREPAD ), LIWORK, IFAIL( 1+IPREPAD ), $ ICLUSTR( 1+IPREPAD ), GAP( 1+IPREPAD ), INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEGVX-B', NP, NQ, B, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ CPADVAL+1.0E+2 ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEGVX-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, CPADVAL ) * CALL PCCHEKPAD( DESCZ( CTXT_ ), 'PCHEGVX-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ CPADVAL+1.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEGVX-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEGVX-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEGVX-rWORK', LWORK1, 1, $ RWORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEGVX-WORK', LWORK, 1, WORK, $ LWORK, IPREPAD, IPOSTPAD, CPADVAL+4.1E+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEGVX-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEGVX-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEGVX-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * * Since we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9992 ) RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Make sure that different processes return the same eigenvalues * DO 80 I = 1, M RWORK( I ) = WNEW( I+IPREPAD ) RWORK( I+M ) = WNEW( I+IPREPAD ) 80 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, RWORK, M, $ 1, 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ RWORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 90 I = 1, M * IF( RESULT.EQ.0 .AND. ( ABS( RWORK( I )-RWORK( M+ $ I ) ).GT.FIVE*EPS*ABS( RWORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 90 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 100 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 110 NCLUSTERS = NCLUSTERS + 1 100 CONTINUE 110 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 120 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 120 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * * DO 130 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 130 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 160 * * Note that a couple key variables get redefined in PCGSEPCHK * as described by this table: * * PCGSEPTST name PCGSEPCHK name * ------------- ------------- * COPYA A * Z Q * B B * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the residual check * CALL PSFILLPAD( DESCA( CTXT_ ), RSIZECHK, 1, RWORK, $ RSIZECHK, IPREPAD, IPOSTPAD, 4.3E+0 ) * CALL PCGSEPCHK( IBTYPE, N, NZ, COPYA, IA, JA, DESCA, COPYB, $ IA, JA, DESCA, THRESH, Z( 1+IPREPAD ), IA, $ JA, DESCZ, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), RWORK( 1+IPREPAD ), $ RSIZECHK, TSTNRM, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCGSEPCHK-rWORK', RSIZECHK, $ 1, RWORK, RSIZECHK, IPREPAD, IPOSTPAD, $ 4.3E+0 ) * IF( RES.NE.0 ) $ RESULT = 1 END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN ) THEN * * Set up MYIL if necessary * MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 150 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 140 I = 1, M ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 140 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 150 CONTINUE * * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what PxHEGVX computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 160 CONTINUE * * RETURN * 9999 FORMAT( 'PCHEGVX returned INFO=', I7 ) 9998 FORMAT( 'PCSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PCGSEPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PCHEGVX returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PCHEGVX' ) 9981 FORMAT( 'NZ altered by PCHEGVX with JOBZ=N' ) * * End of PCGSEPSUBTST * END scalapack-2.0.2/TESTING/EIG/pcgseptst.f000644 000766 000024 00000122675 11622500733 017631 0ustar00juliestaff000000 000000 * * SUBROUTINE PCGSEPTST( DESCA, UPLO, N, MATTYPE, IBTYPE, SUBTESTS, $ THRESH, ORDER, ABSTOL, ISEED, A, COPYA, B, $ COPYB, Z, LDA, WIN, WNEW, IFAIL, ICLUSTR, $ GAP, IPREPAD, IPOSTPAD, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, NOUT, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SUBTESTS, UPLO INTEGER IBTYPE, INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, $ LRWORK, LWORK, MATTYPE, N, NOUT, ORDER REAL ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) REAL GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) COMPLEX A( LDA, * ), B( LDA, * ), COPYA( LDA, * ), $ COPYB( LDA, * ), WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PCGSEPTST builds a random matrix A, and a well conditioned * matrix B, runs PCHEGVX() to compute the eigenvalues * and eigenvectors and then calls PCHEGVCHK to compute * the residual. * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) Hermitian matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PCSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) COMPLEX array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * The test matrix, which is then modified by PCHEGVX * * COPYA (local workspace) COMPLEX array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * B (local workspace) COMPLEX array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. * The B test matrix, which is then modified by PCHEGVX * * COPYB (local workspace) COMPLEX array, dim (N, N) * COPYB is used to hold an identical copy of the array B * identical in both form and content to B * * Z (local workspace) COMPLEX array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PCGSEPCHK * * W (local workspace) REAL array, dimension (N) * On normal exit from PCHEGVX, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * * WORK (local workspace) COMPLEX array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PCLASIZEGSEP * * RWORK (local workspace) COMPLEX array, dimension (LWORK) * * LRWORK (local input) INTEGER * The length of the array WORK. LRWORK >= RSIZETST as * returned by PCLASIZEGSEP * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PCLASIZEGSEP * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE, TEN, HALF PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TEN = 10.0E+0, $ HALF = 0.5E+0 ) COMPLEX PADVAL PARAMETER ( PADVAL = ( 19.25E+0, 1.1E+1 ) ) COMPLEX CZERO PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IINFO, IL, IMODE, IN, INDD, $ INDRWORK, INDWORK, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, ITYPE, IU, J, LHEEVXSIZE, LLRWORK, $ LLWORK, MAXSIZE, MYCOL, MYROW, NB, NGEN, NLOC, $ NNODES, NP, NPCOL, NPROW, NQ, RES, RSIZECHK, $ RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, RSIZETST, $ SIZEHEEVX, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZESUBTST, SIZETMS, SIZETST, VALSIZE, VECSIZE REAL ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH, SLARAN EXTERNAL LSAME, NUMROC, PSLAMCH, SLARAN * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, CLATMS, IGAMX2D, $ IGEBR2D, IGEBS2D, PCCHEKPAD, PCELSET, $ PCFILLPAD, PCGSEPSUBTST, PCLASET, PCLASIZEGSEP, $ PCLASIZEHEEVX, PCLATMS, PCMATGEN, SLABAD, $ SLASRT, SLCOMBINE * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, MAX, MIN, MOD, REAL, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 PASSED = 'PASSED ' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Make sure that we have enough memory * * CALL PCLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) * IF( LRWORK.LT.RSIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDRWORK = INDD + N INDWORK = 1 LLWORK = LWORK - INDWORK + 1 LLRWORK = LRWORK - INDRWORK + 1 * ULP = PSLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PSLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL SLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / REAL( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log Hermitian, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random Hermitian * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * * Zero * * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N RWORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PCLASET( 'All', N, N, CZERO, CZERO, COPYA, 1, 1, $ DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N RWORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PCLASET( 'All', N, N, CZERO, CONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0E+0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0E+0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * Hermitian, eigenvalues specified * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * Hermitian, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PCMATGEN( DESCA( CTXT_ ), 'H', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0E+0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0E+0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PCLASET( 'All', N, N, CZERO, CZERO, COPYA, 1, 1, $ DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( SLARAN( ISEED )*REAL( NLOC ) ), N-NGEN ) * CALL CLATMS( IN, IN, 'S', ISEED, 'P', RWORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PCELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PCELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PCELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PCELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2, ... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 RWORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0E+0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * * * WKNOWN ... NOT SET, GUESS A DEFAULT * WKNOWN = .TRUE. ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL SLASRT( 'I', N, RWORK( INDD ), IINFO ) * * Create the B matrix * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.3E+0 ) * ANORM = ONE * * Update ISEED so that {CLAGSY creates a different Q * ISEED( 4 ) = MOD( ISEED( 4 )+257, 4096 ) ISEED( 3 ) = MOD( ISEED( 3 )+192, 4096 ) ISEED( 2 ) = MOD( ISEED( 2 )+35, 4096 ) ISEED( 1 ) = MOD( ISEED( 1 )+128, 4096 ) CALL PCLATMS( N, N, 'S', ISEED, 'P', RWORK( INDD ), 3, TEN, $ ANORM, N, N, 'N', COPYB, 1, 1, DESCA, ORDER, $ WORK( INDWORK+IPREPAD ), SIZETMS, IINFO ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS5-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.3E+0 ) * * * These values aren't actually used, but they make ftncheck happy. * IL = -1 IU = -2 VL = ONE VU = -ONE * CALL PCLASIZEHEEVX( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, RWORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) * LHEEVXSIZE = MIN( MAXSIZE, LRWORK ) WKNOWN = .FALSE. * CALL PCGSEPSUBTST( WKNOWN, IBTYPE, 'v', 'a', UPLO, N, VL, VU, $ IL, IU, THRESH, ABSTOL, A, COPYA, B, COPYB, $ Z, 1, 1, DESCA, RWORK( INDD ), WIN, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK( INDRWORK ), $ LLRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * * * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'V', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'V' RANGE = 'A' CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 1' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'V', RANGE = 'A', random memory * IF( INFO.EQ.0 ) THEN JOBZ = 'V' RANGE = 'A' CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 2' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * We use PCLASIZEHEEVX to choose IL and IU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'I', maximum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PCLASIZEHEEVX to choose IL and IU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = MAXSIZE * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF * * Subtest 6: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PCLASIZEHEEVX to choose IL and IU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 6' INFO = 1 END IF END IF * * Subtest 7: JOBZ = 'V', RANGE = 'I', random memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PCLASIZEHEEVX to choose IL and IU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) LHEEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 7' INFO = 1 END IF END IF * * Subtest 8: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 8' INFO = 1 END IF END IF * * Subtest 9: JOBZ = 'V', RANGE = 'V', maximum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = MAXSIZE * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 9' INFO = 1 END IF END IF * * Subtest 10: JOBZ = 'V', RANGE = 'V', * minimum memory required for eigenvectors * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest10' INFO = 1 END IF END IF * * Subtest 11: JOBZ = 'V', RANGE = 'V', * random memory (enough for all eigenvectors * but not enough to guarantee orthogonality * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest11' INFO = 1 END IF END IF * * Subtest 12: JOBZ = 'V', RANGE = 'V', * miniimum memory required for eigenvalues only * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest12' INFO = 1 END IF END IF * * Subtest 13: JOBZ = 'V', RANGE = 'V', * random memory (more than minimum required * for eigenvalues, less than required for vectors) * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * * CALL PCGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest13' INFO = 1 END IF END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9984 )IBTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, CTIME( 1 ), MAXTSTNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS END IF END IF * 120 CONTINUE * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, F8.2, 1X, F8.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, 8X, 1X, F8.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, F8.2, 1X, F8.2, 11X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, 8X, 1X, F8.2, 11X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 22X, 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) 9984 FORMAT( ' IBTYPE=', I8 ) 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) 9980 FORMAT( ' Increase TOTMEM in PCGSEPDRIVER' ) * * End of PCGSEPTST * END scalapack-2.0.2/TESTING/EIG/pchetdrv.f000644 000766 000024 00000041470 10363532303 017424 0ustar00juliestaff000000 000000 SUBROUTINE PCHETDRV( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL D( * ), E( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCHETDRV computes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from Q, the * Hermitian tridiagonal matrix T (or D and E), and TAU, which were * computed by PCHETRD: sub( A ) := Q * T * Q'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of sub( A ). On entry, * if UPLO='U', the diagonal and first superdiagonal of sub( A ) * have the corresponding elements of the tridiagonal matrix T, * and the elements above the first superdiagonal, with the * array TAU, represent the unitary matrix Q as a product of * elementary reflectors, and the strictly lower triangular part * of sub( A ) is not referenced. If UPLO='L', the diagonal and * first subdiagonal of sub( A ) have the corresponding elements * of the tridiagonal matrix T, and the elements below the first * subdiagonal, with the array TAU, represent the unitary * matrix Q as a product of elementary reflectors, and the * strictly upper triangular part of sub( A ) is not referenced. * On exit, if UPLO = 'U', the upper triangular part of the * distributed Hermitian matrix sub( A ) is recovered. * If UPLO='L', the lower triangular part of the distributed * Hermitian matrix sub( A ) is recovered. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local input) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local input) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local input) COMPLEX, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK >= 2 * NB *( NB + NP ) * * where NB = MB_A = NB_A, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * On exit, if INFO <> 0, a discrepancy has been found between * the diagonal and off-diagonal elements of A and the copies * contained in the arrays D and E. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL REIGHT, RONE, RZERO PARAMETER ( REIGHT = 8.0E+0, RONE = 1.0E+0, $ RZERO = 0.0E+0 ) COMPLEX HALF, ONE, ZERO PARAMETER ( HALF = ( 0.5E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, IACOL, IAROW, ICTXT, II, IPT, IPV, IPX, $ IPY, J, JB, JJ, JL, K, MYCOL, MYROW, NB, NP, $ NPCOL, NPROW REAL ADDBND, D2, E2 COMPLEX D1, E1 * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCV( DLEN_ ), $ DESCT( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC REAL PSLAMCH EXTERNAL INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, IGSUM2D, $ PCELGET, PCGEMM, PCHEMM, $ PCHER2K, PCLACPY, PCLARFT, $ PCLASET, PCTRMM, PSELGET * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, MAX, MIN, MOD * .. * .. Executable statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 NB = DESCA( MB_ ) UPPER = LSAME( UPLO, 'U' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) * IPT = 1 IPV = NB * NB + IPT IPX = NB * NP + IPV IPY = NB * NP + IPX * CALL DESCSET( DESCD, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * ADDBND = REIGHT * PSLAMCH( ICTXT, 'eps' ) * IF( UPPER ) THEN * CALL DESCSET( DESCE, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 10 J = 0, N-1 D1 = ZERO E1 = ZERO D2 = RZERO E2 = RZERO CALL PSELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PCELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(N-1) ) THEN CALL PSELGET( ' ', ' ', E2, E, 1, JA+J+1, DESCE ) CALL PCELGET( 'Columnwise', ' ', E1, A, IA+J, JA+J+1, $ DESCA ) END IF * IF( ( ABS( D1 - CMPLX( D2 ) ).GT.( ABS( D2 )*ADDBND ) ) .OR. $ ( ABS( E1 - CMPLX( E2 ) ).GT.( ABS( E2 )*ADDBND ) ) ) $ INFO = INFO + 1 10 CONTINUE * * Compute the upper triangle of sub( A ). * CALL DESCSET( DESCV, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCT, NB, NB, NB, NB, IAROW, IACOL, ICTXT, NB ) * DO 20 K = 0, N-1, NB JB = MIN( NB, N-K ) I = IA + K J = JA + K * * Compute the lower triangular matrix T. * CALL PCLARFT( 'Backward', 'Columnwise', K+JB-1, JB, A, IA, $ J, DESCA, TAU, WORK( IPT ), WORK( IPV ) ) * * Copy Householder vectors into WORK( IPV ). * CALL PCLACPY( 'All', K+JB-1, JB, A, IA, J, DESCA, $ WORK( IPV ), 1, 1, DESCV ) * IF( K.GT.0 ) THEN CALL PCLASET( 'Lower', JB+1, JB, ZERO, ONE, WORK( IPV ), $ K, 1, DESCV ) ELSE CALL PCLASET( 'Lower', JB, JB-1, ZERO, ONE, WORK( IPV ), $ 1, 2, DESCV ) CALL PCLASET( 'Ge', JB, 1, ZERO, ZERO, WORK( IPV ), 1, $ 1, DESCV ) END IF * * Zero out the strict upper triangular part of A. * IF( K.GT.0 ) THEN CALL PCLASET( 'Ge', K-1, JB, ZERO, ZERO, A, IA, J, $ DESCA ) CALL PCLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, A, I-1, $ J+1, DESCA ) ELSE IF( JB.GT.1 ) THEN CALL PCLASET( 'Upper', JB-2, JB-2, ZERO, ZERO, A, IA, $ J+2, DESCA ) END IF * * (1) X := A * V * T' * CALL PCHEMM( 'Left', 'Upper', K+JB, JB, ONE, A, IA, JA, $ DESCA, WORK( IPV ), 1, 1, DESCV, ZERO, $ WORK( IPX ), 1, 1, DESCV ) CALL PCTRMM( 'Right', 'Lower', 'Conjugate transpose', $ 'Non-Unit', K+JB, JB, ONE, WORK( IPT ), 1, 1, $ DESCT, WORK( IPX ), 1, 1, DESCV ) * * (2) X := X - 1/2 * V * (T * V' * X) * CALL PCGEMM( 'Conjugate transpose', 'No transpose', JB, JB, $ K+JB, ONE, WORK( IPV ), 1, 1, DESCV, $ WORK( IPX ), 1, 1, DESCV, ZERO, WORK( IPY ), $ 1, 1, DESCT ) CALL PCTRMM( 'Left', 'Lower', 'No transpose', 'Non-Unit', $ JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPY ), 1, 1, DESCT ) CALL PCGEMM( 'No tranpose', 'No transpose', K+JB, JB, JB, $ -HALF, WORK( IPV ), 1, 1, DESCV, WORK( IPY ), $ 1, 1, DESCT, ONE, WORK( IPX ), 1, 1, DESCV ) * * (3) A := A - X * V' - V * X' * CALL PCHER2K( 'Upper', 'No transpose', K+JB, JB, -ONE, $ WORK( IPV ), 1, 1, DESCV, WORK( IPX ), 1, 1, $ DESCV, RONE, A, IA, JA, DESCA ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) DESCT( CSRC_ ) = MOD( DESCT( CSRC_ ) + 1, NPCOL ) * 20 CONTINUE * ELSE * CALL DESCSET( DESCE, 1, JA+N-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 30 J = 0, N-1 D1 = ZERO E1 = ZERO D2 = RZERO E2 = RZERO CALL PSELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PCELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(N-1) ) THEN CALL PSELGET( ' ', ' ', E2, E, 1, JA+J, DESCE ) CALL PCELGET( 'Columnwise', ' ', E1, A, IA+J+1, JA+J, $ DESCA ) END IF * IF( ( ABS( D1 - CMPLX( D2 ) ).GT.( ABS( D2 )*ADDBND ) ) .OR. $ ( ABS( E1 - CMPLX( E2 ) ).GT.( ABS( E2 )*ADDBND ) ) ) $ INFO = INFO + 1 30 CONTINUE * * Compute the lower triangle of sub( A ). * JL = MAX( ( ( JA+N-2 ) / NB ) * NB + 1, JA ) IACOL = INDXG2P( JL, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCV, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCT, NB, NB, NB, NB, INDXG2P( IA+JL-JA+1, NB, $ MYROW, DESCA( RSRC_ ), NPROW ), IACOL, ICTXT, $ NB ) * DO 40 J = JL, JA, -NB K = J - JA + 1 I = IA + K - 1 JB = MIN( N-K+1, NB ) * * Compute upper triangular matrix T from TAU. * CALL PCLARFT( 'Forward', 'Columnwise', N-K, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPV ) ) * * Copy Householder vectors into WORK( IPV ). * CALL PCLACPY( 'Lower', N-K, JB, A, I+1, J, DESCA, $ WORK( IPV ), K+1, 1, DESCV ) CALL PCLASET( 'Upper', N-K, JB, ZERO, ONE, WORK( IPV ), $ K+1, 1, DESCV ) CALL PCLASET( 'Ge', 1, JB, ZERO, ZERO, WORK( IPV ), K, 1, $ DESCV ) * * Zero out the strict lower triangular part of A. * CALL PCLASET( 'Lower', N-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * (1) X := A * V * T' * CALL PCHEMM( 'Left', 'Lower', N-K+1, JB, ONE, A, I, J, $ DESCA, WORK( IPV ), K, 1, DESCV, ZERO, $ WORK( IPX ), K, 1, DESCV ) CALL PCTRMM( 'Right', 'Upper', 'Conjugate transpose', $ 'Non-Unit', N-K+1, JB, ONE, WORK( IPT ), 1, 1, $ DESCT, WORK( IPX ), K, 1, DESCV ) * * (2) X := X - 1/2 * V * (T * V' * X) * CALL PCGEMM( 'Conjugate transpose', 'No transpose', JB, JB, $ N-K+1, ONE, WORK( IPV ), K, 1, DESCV, $ WORK( IPX ), K, 1, DESCV, ZERO, WORK( IPY ), $ 1, 1, DESCT ) CALL PCTRMM( 'Left', 'Upper', 'No transpose', 'Non-Unit', $ JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPY ), 1, 1, DESCT ) CALL PCGEMM( 'No transpose', 'No transpose', N-K+1, JB, JB, $ -HALF, WORK( IPV ), K, 1, DESCV, WORK( IPY ), $ 1, 1, DESCT, ONE, WORK( IPX ), K, 1, DESCV ) * * (3) A := A - X * V' - V * X' * CALL PCHER2K( 'Lower', 'No tranpose', N-K+1, JB, -ONE, $ WORK( IPV ), K, 1, DESCV, WORK( IPX ), K, 1, $ DESCV, RONE, A, I, J, DESCA ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCT( RSRC_ ) = MOD( DESCT( RSRC_ ) + NPROW - 1, NPROW ) DESCT( CSRC_ ) = MOD( DESCT( CSRC_ ) + NPCOL - 1, NPCOL ) * 40 CONTINUE * END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * RETURN * * End of PCHETDRV * END scalapack-2.0.2/TESTING/EIG/pchrddriver.f000644 000766 000024 00000045435 10363532303 020126 0ustar00juliestaff000000 000000 PROGRAM PCHRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * Purpose * ======= * * PCHRDDRIVER is the main test program for the COMPLEX * ScaLAPACK HRD (Hessenberg Reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 14 lines: * 'ScaLAPACK HRD input file' * 'PVM machine' * 'HRD.out' output file name (if any) * 6 device out * 2 number of problems sizes * 100 101 values of N * 2 1 values of ILO * 99 101 values of IHI * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, MEMSIZ, NTESTS, TOTMEM COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IHI, IHIP, IHLP, IHLQ, $ ILCOL, ILO, ILROW, INFO, INLQ, IMIDPAD, IPA, $ IPT, IPW, IPOSTPAD, IPREPAD, ITEMP, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LCM, LCMQ, LOFF, $ LWORK, MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, $ NPROCS, NOUT, NP, NPCOL, NPROW, NQ, WORKHRD, $ WORKSIZ REAL ANORM, FRESID, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), NVHI( NTESTS ), NVLO( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINIT, BLACS_GRIDINFO, $ DESCINIT, IGSUM2D, BLACS_PINFO, PCFILLPAD, $ PCLAFCHK, PCGEHDRV, PCGEHRD, $ PCHRDINFO, PCMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ILCM, INDXG2P, NUMROC REAL PCLANGE EXTERNAL ILCM, INDXG2P, NUMROC, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PCHRDINFO( OUTFILE, NOUT, NMAT, NVAL, NVLO, NVHI, NTESTS, $ NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GOTO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) ILO = NVLO( J ) IHI = NVHI( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPT = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * IHIP = NUMROC( IHI, NB, MYROW, DESCA( RSRC_ ), NPROW ) LOFF = MOD( ILO-1, NB ) ILROW = INDXG2P( ILO, NB, MYROW, DESCA( RSRC_ ), NPROW ) ILCOL = INDXG2P( ILO, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) IHLP = NUMROC( IHI-ILO+LOFF+1, NB, MYROW, ILROW, NPROW ) INLQ = NUMROC( N-ILO+LOFF+1, NB, MYCOL, ILCOL, NPCOL ) LWORK = NB*( NB + MAX( IHIP+1, IHLP+INLQ ) ) WORKHRD = LWORK + IPOSTPAD WORKSIZ = WORKHRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IHLQ = NUMROC( IHI-ILO+LOFF+1, NB, MYCOL, ILCOL, $ NPCOL ) ITEMP = NB*MAX( IHLP+INLQ, IHLQ+MAX( IHIP, $ IHLP+NUMROC( NUMROC( IHI-ILO+LOFF+1, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ) ) ) WORKSIZ = MAX( NB*NB + NB*IHLP + ITEMP, NB * NP ) + $ IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Hessenberg reduction', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate A * CALL PCMATGEN( ICTXT, 'No', 'No', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), $ IASEED, 0, NP, 0, NQ, MYROW, MYCOL, $ NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), $ NQ, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANGE( 'I', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKHRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKHRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce Hessenberg form * CALL PCGEHRD( N, ILO, IHI, MEM( IPA ), 1, 1, DESCA, $ MEM( IPT ), MEM( IPW ), LWORK, INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGEHRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEHRD', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEHRD', WORKHRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKHRD-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - Q H Q'|| / (||A||*N*eps) * CALL PCGEHDRV( N, ILO, IHI, MEM( IPA ), 1, 1, DESCA, $ MEM( IPT ), MEM( IPW ) ) CALL PCLAFCHK( 'No', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCGEHDRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEHDRV', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCGEHDRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ.0.0E+0 ) $ THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather max. of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * HRD requires 40/3 * N^3 floating point ops. (flops) * more precisely, * HRD requires 16/3*(IHI-ILO)^3+8*IHI*(IHI-ILO)^2 flops * NOPS = DBLE( IHI-ILO ) NOPS = NOPS * NOPS * $ ( 8.0D0*DBLE( IHI ) + (16.0D0/3.0D0)*NOPS ) NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', N, ILO, IHI, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, FRESID, $ PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', N, ILO, IHI, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, FRESID, $ PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N ILO IHI NB P Q HRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ------ ------ ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, $ F9.2, 1X, F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*H*Q''|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PCHRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pchrdinfo.f000644 000766 000024 00000032377 10363532303 017567 0ustar00juliestaff000000 000000 SUBROUTINE PCHRDINFO( SUMMRY, NOUT, NMAT, NVAL, NVLO, NVHI, $ LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ NVHI( LDNVAL ), NVLO( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PCHRDINFO get the needed startup information for the Hessenberg * reduction tests and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, output to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for * N, IHI & ILO. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of rows & columns in matrix). * * NVLO (global output) INTEGER array, dimension (LDNVAL) * The values of ILO. * * NVHI (global output) INTEGER array, dimension (LDNVAL) * The values of IHI. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, ILO and IHI. LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * 3*LDNVAL+LDNBVAL+2*LDPVAL. Used to pack all input arrays * in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'HRD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = * ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( UNIT = NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1. .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'N', LDNVAL GO TO 20 END IF * * Get values of N, ILO, IHI * READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVLO( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVHI( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 1, 3, WORK, 1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVLO, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVHI, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS -1 CALL IGEBS2D( ICTXT, 'All', ' ', 1, I, WORK, 1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Reduction routine to Hessenberg form.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision Hessenberg ' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - Q H Q''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'HRD time : Time in seconds to compute HRD ' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for HRD ' // $ 'reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9995 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'ILO ', ( NVLO( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVLO( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'IHI ', ( NVHI( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVHI( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9995 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) EPS WRITE( NOUT, FMT = 9993 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 3, WORK, 1, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = 3*NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) * I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVLO, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVHI, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 CONTINUE WRITE( NOUT, FMT = 9998 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' ILLEGAL INPUT IN FILE ', 40A, '. ABORTING RUN.' ) 9997 FORMAT( ' NUMBER OF VALUES OF ', 5A, $ ' IS LESS THAN 1 OR GREATER ', 'THAN ', I2 ) 9996 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9995 FORMAT( 2X, A5, ': ', 10I6 ) 9994 FORMAT( ' ', 10I6 ) 9993 FORMAT( 'Routines pass computational tests if scaled residual is', $ ' less than ', G14.7 ) * * End of PCHRDINFO * END scalapack-2.0.2/TESTING/EIG/pclafchk.f000644 000766 000024 00000026755 10363532303 017371 0ustar00juliestaff000000 000000 SUBROUTINE PCLAFCHK( AFORM, DIAG, M, N, A, IA, JA, DESCA, IASEED, $ ANORM, FRESID, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER AFORM, DIAG INTEGER IA, IASEED, JA, M, N REAL ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCLAFCHK computes the residual * || sub( A ) - sub( Ao ) || / (|| sub( Ao ) ||*eps*MAX(M,N)), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A( IA:IA+M-1, JA:JA+N-1 ) and ||.|| stands for the infini- * ty norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * AFORM (global input) CHARACTER * sub( A ) is overwritten with: * - a symmetric matrix, if AFORM = 'S'; * - a Hermitian matrix, if AFORM = 'H'; * - the transpose of what would normally be generated, * if AFORM = 'T'; * - the conjugate transpose of what would normally be * generated, if AFORM = 'C'; * - otherwise a random matrix. * * DIAG (global input) CHARACTER * if DIAG = 'D' : sub( A ) is diagonally dominant. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub( A ) - sub( Ao ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * ANORM (global input) REAL * The Infinity norm of sub( A ). * * FRESID (global output) REAL * The maximum (worst) factorizational error. * * WORK (local workspace) COMPLEX array, dimension (LWORK). * LWORK >= MpA0 * NB_A, where * * IROFFA = MOD( IA-1, MB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * WORK is used to store a block of columns of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = (1.0E+0, 0.0E+0) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURCOL, ICURROW, $ II, IIA, IOFFA, IROFF, JB, JJ, JJA, JN, KK, $ LDA, LDW, LDWP1, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CMATADD, INFOG2L, PCMATGEN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSLAMCH, PCLANGE EXTERNAL ICEIL, LSAME, NUMROC, PCLANGE, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( ICTXT, 'eps' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * * Compute sub( A ) := sub( A ) - sub( Ao ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDW = MAX( 1, MP ) LDWP1 = LDW + 1 LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA - 1 )*LDA * IF( LSAME( AFORM, 'H' ) ) THEN * * Handle first block of columns separately * II = 1 ICURROW = IAROW ICURCOL = IACOL JB = JN - JA + 1 * IF( MYCOL.EQ.ICURCOL ) THEN CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) IF( MYROW.EQ.ICURROW ) THEN DO 10, KK = 0, JB-1 WORK( II+KK*LDWP1 ) = REAL( WORK( II+KK*LDWP1 ) ) 10 CONTINUE END IF CALL CMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 30, JJ = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-JJ, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ WORK, LDW, DESCA( RSRC_ ), DESCA( CSRC_ ), $ IASEED, IIA-1, MP, JJA-1, JB, MYROW, $ MYCOL, NPROW, NPCOL ) IF( MYROW.EQ.ICURROW ) THEN DO 20, KK = 0, JB-1 WORK( II+KK*LDWP1 ) = REAL( WORK( II+KK*LDWP1 ) ) 20 CONTINUE END IF CALL CMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE * ELSE * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN JB = JN-JA+1 CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL CMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB NQ = NQ - JB IOFFA = IOFFA + JB * LDA END IF * * Handle the remaning blocks of columns * DO 40 JJ = JJA, JJA+NQ-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JJA+NQ-JJ ) IOFFA = IIA + ( JJ - 1 )*LDA CALL PCMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJ-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL CMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) 40 CONTINUE * END IF * * Calculate factor residual * FRESID = PCLANGE( 'I', M, N, A, IA, JA, DESCA, WORK ) / $ ( MAX( M, N ) * EPS * ANORM ) * RETURN * * End PCLAFCHK * END scalapack-2.0.2/TESTING/EIG/pclagsy.f000644 000766 000024 00000025721 10363532303 017250 0ustar00juliestaff000000 000000 * * SUBROUTINE PCLAGHE( N, K, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, INFO ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, N, ORDER * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) REAL D( * ) COMPLEX A( * ), WORK( * ) * .. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Purpose * ======= * * PCLAGHE generates a real Hermitian matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random orthogonal matrix: * A = U*D*U'. * * This is just a quick implementation which will be replaced in the * future. The random orthogonal matrix is computed by creating a * random matrix and running QR on it. This requires vastly more * computation than necessary, but not significantly more communication * than is used in the rest of this rouinte, and hence is not that much * slower than an efficient solution. * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix A. N >= 0. * * K (global input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * ### K must be 0 or N-1, 0 < K < N-1 is not supported yet. * * D (global input) COMPLEX array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (local output) COMPLEX array * Global dimension (N, N), local dimension (NP, NQ) * The generated n by n Hermitian matrix A (the full matrix is * stored). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated and will remain identical on * all processes in the context. * * ORDER (global input) INTEGER * Number of reflectors in the matrix Q * At present, ORDER .NE. N is not supported * * WORK (local workspace) COMPLEX array, dimension (LWORK) * * LWORK (local input) INTEGER dimension of WORK * LWORK >= SIZETMS as returned by PCLASIZESEP * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX CZERO PARAMETER ( CZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER CSRC_A, I, IACOL, IAROW, ICOFFA, II, IIROW, $ INDAA, INDTAU, INDWORK, IPOSTPAD, IPREPAD, $ IROFFA, ISIZEHEEVX, ISIZESUBTST, ISIZETST, $ JJCOL, LDAA, LII, LIII, LJJ, LJJJ, LWMIN, MAXI, $ MB_A, MYCOL, MYROW, NB_A, NP, NPCOL, NPROW, NQ, $ RSIZECHK, RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, $ RSIZETST, RSRC_A, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST, SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, CLASET, PCGEQRF, $ PCLASIZESEP, PCMATGEN, PCUNMQR, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. * INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Initialize grid information * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * * Check LWORK * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) END IF * LDAA = DESCA( LLD_ ) MB_A = DESCA( MB_ ) NB_A = DESCA( NB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) NP = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ) IPREPAD = 0 IPOSTPAD = 0 CALL PCLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) LWMIN = SIZETMS * * Test the input arguments * IF( INFO.EQ.0 ) THEN IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( N.NE.ORDER ) THEN INFO = -9 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( INFO.LT.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PCLAGHE', -INFO ) RETURN END IF * INDAA = 1 INDTAU = INDAA + LDAA*MAX( 1, NQ ) INDWORK = INDTAU + MAX( 1, NQ ) * IF( K.NE.0 ) THEN CALL CLASET( 'A', LDAA, NQ, CZERO, CZERO, WORK( INDAA ), LDAA ) * * * Build a random matrix * * CALL PCMATGEN( DESCA( CTXT_ ), 'N', 'N', N, ORDER, $ DESCA( MB_ ), DESCA( NB_ ), WORK( INDAA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ ISEED( 1 ), 0, NP, 0, NQ, MYROW, MYCOL, NPROW, $ NPCOL ) CALL PCGEQRF( N, ORDER, WORK( INDAA ), IA, JA, DESCA, $ WORK( INDTAU ), WORK( INDWORK ), SIZEQRF, INFO ) * END IF * * Build a diagonal matrix A with the eigenvalues specified in D * CALL CLASET( 'A', NP, NQ, CZERO, CZERO, A, DESCA( LLD_ ) ) * IIROW = 0 JJCOL = 0 LII = 1 LJJ = 1 * DO 20 II = 1, N, DESCA( MB_ ) MAXI = MIN( N, II+DESCA( MB_ )-1 ) IF( ( MYROW.EQ.IIROW ) .AND. ( MYCOL.EQ.JJCOL ) ) THEN LIII = LII LJJJ = LJJ DO 10 I = II, MAXI A( LIII+( LJJJ-1 )*DESCA( LLD_ ) ) = D( I ) LIII = LIII + 1 LJJJ = LJJJ + 1 10 CONTINUE END IF IF( MYROW.EQ.IIROW ) $ LII = LII + DESCA( MB_ ) IF( MYCOL.EQ.JJCOL ) $ LJJ = LJJ + DESCA( MB_ ) IIROW = MOD( IIROW+1, NPROW ) JJCOL = MOD( JJCOL+1, NPCOL ) 20 CONTINUE * * A = Q * A * IF( K.NE.0 ) THEN * CALL PCUNMQR( 'L', 'Conjugate transpose', N, N, ORDER, $ WORK( INDAA ), IA, JA, DESCA, WORK( INDTAU ), A, $ IA, JA, DESCA, WORK( INDWORK ), SIZEMQRLEFT, $ INFO ) * * * A = A * Q' * * CALL PCUNMQR( 'R', 'N', N, N, ORDER, WORK( INDAA ), IA, JA, $ DESCA, WORK( INDTAU ), A, IA, JA, DESCA, $ WORK( INDWORK ), SIZEMQRRIGHT, INFO ) * END IF * * End of PCLAGHE * END scalapack-2.0.2/TESTING/EIG/pclasizegsep.f000644 000766 000024 00000013241 10363532303 020271 0ustar00juliestaff000000 000000 SUBROUTINE PCLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, $ ISIZEHEEVX, SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, RSIZETST, $ ISIZETST ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, RSIZECHK, RSIZEHEEVX, RSIZEQTQ, $ RSIZESUBTST, RSIZETST, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * PCLASIZEGSEP computes the amount of memory needed by * ======= * * PCLASIZEGSEP computes the amount of memory needed by * various GSEP test routines, as well as HEGVX itself * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PCHEGVX * * SIZEMQRLEFT LWORK for the 1st PCUNMQR call in PCLAGHE * * SIZEMQRRIGHT LWORK for the 2nd PCUNMQR call in PCLAGHE * * SIZEQRF LWORK for PCGEQRF in PCLAGHE * * SIZETMS LWORK for PCLATMS * * RSIZEQTQ LWORK for PCSEPQTQ (nexer complex) * * RSIZECHK LWORK for PCGSEPCHK * * SIZEHEEVX LWORK for PCHEGVX * * RSIZEHEEVX LRWORK for PCHEGVX * * ISIZEHEEVX LIWORK for PCHEGVX * * SIZESUBTST LWORK for PCSUBTST * * RSIZESUBTST LRWORK for PCSUBTST * * ISIZESUBTST LIWORK for PCSUBTST * * SIZETST LWORK for PCTST * * RSIZETST LRWORK for PCTST * * ISIZETST LIWORK for PCTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ANB, CSRC_A, IACOL, IAROW, ICOFFA, ICTXT, $ IROFFA, LCM, LCMQ, LDA, MQ0, MYCOL, MYROW, N, $ NB, NEIG, NHEGST_LWOPT, NHETRD_LWOPT, NN, NNP, $ NP, NP0, NPCOL, NPROW, NPS, NQ, NQ0, RSRC_A, $ SIZECHK, SIZEQTQ, SQNPC * .. * .. External Functions .. * INTEGER ICEIL, ILCM, INDXG2P, NUMROC, PJLAENV EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC, PJLAENV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, SQRT * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 0 SIZECHK = 0 RSIZEQTQ = 0 RSIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZEHEEVX = N + ( NP0+MQ0+NB )*NB RSIZEHEEVX = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZEHEEVX = 6*NNP * ICTXT = DESCA( CTXT_ ) ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS * NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) NHEGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB SIZEHEEVX = MAX( SIZEHEEVX, N+NHETRD_LWOPT, NHEGST_LWOPT ) * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZEHEEVX ) + $ IPREPAD + IPOSTPAD RSIZESUBTST = MAX( RSIZEHEEVX, RSIZEQTQ, RSIZECHK ) + IPREPAD + $ IPOSTPAD ISIZESUBTST = ISIZEHEEVX + IPREPAD + IPOSTPAD * * * Allow room for A, COPYA and Z and WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Room for DIAG, WIN, WNEW, GAP and RWORK * RSIZETST = 4*( N+IPREPAD+IPOSTPAD ) + RSIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PCHEGVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pclasizeheevr.f000644 000766 000024 00000014543 11623527140 020455 0ustar00juliestaff000000 000000 SUBROUTINE PCLASIZEHEEVR( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER RANGE INTEGER IL, IU, MAXSIZE, N, VALSIZE, VECSIZE REAL VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) REAL WIN( * ) * .. * * Purpose * ======= * * PCLASIZEHEEVR computes the amount of memory needed by PCHEEVR * to ensure: * 1) Orthogonal Eigenvectors * 2) Eigenpairs with small residual norms * * Arguments * ========= * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * VL (global input/output ) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VL is set * to a random value near an entry in WIN * * VU (global input/output ) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VU is set * to a random value near an entry in WIN * * IL (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * If IL < 0, RANGE='I' and WKNOWN = .TRUE., IL is set * to a random value from 1 to N * * IU (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * If IU < 0, RANGE='I' and WKNOWN = .TRUE., IU is set * to a random value from IL to N * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * ISEED is not touched unless IL, IU, VL or VU are modified. * * WIN (global input) REAL array, dimension (N) * If WKNOWN=1, WIN contains the eigenvalues of the matrix. * * MAXSIZE (global output) INTEGER * Workspace required to guarantee that PCHEEVR will return * orthogonal eigenvectors. IF WKNOWN=0, MAXSIZE is set to a * a value which guarantees orthogonality no matter what the * spectrum is. If WKNOWN=1, MAXSIZE is set to a value which * guarantees orthogonality on a matrix with eigenvalues given * by WIN. * * VECSIZE (global output) INTEGER * Workspace required to guarantee that PCHEEVR * will compute eigenvectors. * * VALSIZE (global output) INTEGER * Workspace required to guarantee that PCHEEVR * will compute eigenvalues. * * * .. Parameters .. INTEGER CTXT_, MB_ PARAMETER ( CTXT_ = 2, MB_ = 5 ) REAL TWENTY PARAMETER ( TWENTY = 20.0E0 ) * .. * .. Local Scalars .. * INTEGER ILMIN, IUMAX, $ MQ0, MYCOL, MYIL, MYIU, MYROW, NB, NEIG, NN, $ NP0, NPCOL, NPROW REAL ANORM, EPS, SAFMIN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL SLARAN, PSLAMCH EXTERNAL LSAME, ICEIL, NUMROC, SLARAN, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, INT, MAX * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe Minimum' ) NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) VALSIZE = 3 + 5*N + MAX( 12*NN, NB*( NP0+1 ) ) IF( WKNOWN ) THEN ANORM = SAFMIN / EPS IF( N.GE.1 ) $ ANORM = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), ANORM ) IF( LSAME( RANGE, 'I' ) ) THEN IF( IL.LT.0 ) $ IL = INT( SLARAN( ISEED )*REAL( N ) ) + 1 IF( IU.LT.0 ) $ IU = INT( SLARAN( ISEED )*REAL( N-IL ) ) + IL IF( N.EQ.0 ) $ IU = 0 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IF( VL.GT.VU ) THEN MYIL = INT( SLARAN( ISEED )*REAL( N ) ) + 1 MYIU = INT( SLARAN( ISEED )*REAL( N-MYIL ) ) + MYIL VL = WIN( MYIL ) - TWENTY*EPS*ABS( WIN( MYIL ) ) VU = WIN( MYIU ) + TWENTY*EPS*ABS( WIN( MYIU ) ) VU = MAX( VU, VL+EPS*TWENTY*ABS( VL )+SAFMIN ) END IF END IF * END IF IF( LSAME( RANGE, 'V' ) ) THEN * We do not know how many eigenvalues will be computed ILMIN = 1 IUMAX = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN ILMIN = IL IUMAX = IU ELSE IF( LSAME( RANGE, 'A' ) ) THEN ILMIN = 1 IUMAX = N END IF * NEIG = IUMAX - ILMIN + 1 * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * VECSIZE = 3 + 5*N + MAX( 18*NN, NP0*MQ0+2*NB*NB ) + $ (2 + ICEIL( NEIG, NPROW*NPCOL ))*NN VALSIZE = MAX(3, VALSIZE) VECSIZE = MAX(3, VECSIZE) MAXSIZE = VECSIZE * RETURN * * End of PCLASIZEHEEVR * END scalapack-2.0.2/TESTING/EIG/pclasizeheevx.f000644 000766 000024 00000017147 10363532303 020463 0ustar00juliestaff000000 000000 * * SUBROUTINE PCLASIZEHEEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER RANGE INTEGER IL, IU, MAXSIZE, N, VALSIZE, VECSIZE REAL VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) REAL WIN( * ) * .. * * Purpose * ======= * * PCLASIZEHEEVX computes the amount of memory needed by PCHEEVX * to ensure: * 1) Orthogonal Eigenvectors * 2) Eigenvectors * 3) Eigenvalues * * Arguments * ========= * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * VL (global input/output ) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VL is set * to a random value near an entry in WIN * * VU (global input/output ) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VU is set * to a random value near an entry in WIN * * IL (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * If IL < 0, RANGE='I' and WKNOWN = .TRUE., IL is set * to a random value from 1 to N * * IU (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * If IU < 0, RANGE='I' and WKNOWN = .TRUE., IU is set * to a random value from IL to N * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * ISEED is not touched unless IL, IU, VL or VU are modified. * * WIN (global input) REAL array, dimension (N) * If WKNOWN=1, WIN contains the eigenvalues of the matrix. * * MAXSIZE (global output) INTEGER * Workspace required to guarantee that PCHEEVX will return * orthogonal eigenvectors. IF WKNOWN=0, MAXSIZE is set to a * a value which guarantees orthogonality no matter what the * spectrum is. If WKNOWN=1, MAXSIZE is set to a value which * guarantees orthogonality on a matrix with eigenvalues given * by WIN. * * VECSIZE (global output) INTEGER * Workspace required to guarantee that PCHEEVX * will compute eigenvectors. * * VALSIZE (global output) INTEGER * Workspace required to guarantee that PCHEEVX * will compute eigenvalues. * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL TWENTY PARAMETER ( TWENTY = 20.0E0 ) * .. * .. Local Scalars .. * INTEGER CLUSTERSIZE, I, ILMIN, IUMAX, MAXCLUSTERSIZE, $ MQ0, MYCOL, MYIL, MYIU, MYROW, NB, NEIG, NN, $ NP0, NPCOL, NPROW REAL ANORM, EPS, ORFAC, SAFMIN, VLMIN, VUMAX * .. * .. External Functions .. * * LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSLAMCH, SLARAN EXTERNAL LSAME, ICEIL, NUMROC, PSLAMCH, SLARAN * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, MAX, REAL * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * ORFAC = 1.0E-3 * * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe Minimum' ) NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * VALSIZE = 5*NN + 4*N * IF( WKNOWN ) THEN ANORM = SAFMIN / EPS IF( N.GE.1 ) $ ANORM = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), ANORM ) * IF( LSAME( RANGE, 'I' ) ) THEN IF( IL.LT.0 ) $ IL = INT( SLARAN( ISEED )*REAL( N ) ) + 1 IF( IU.LT.0 ) $ IU = INT( SLARAN( ISEED )*REAL( N-IL ) ) + IL IF( N.EQ.0 ) $ IU = 0 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IF( VL.GT.VU ) THEN MYIL = INT( SLARAN( ISEED )*REAL( N ) ) + 1 MYIU = INT( SLARAN( ISEED )*REAL( N-MYIL ) ) + MYIL VL = WIN( MYIL ) + TWENTY*EPS*ABS( WIN( MYIL ) ) VU = WIN( MYIU ) + TWENTY*EPS*ABS( WIN( MYIU ) ) VU = MAX( VU, VL+EPS*TWENTY*ABS( VL )+SAFMIN ) END IF END IF * END IF IF( LSAME( RANGE, 'V' ) ) THEN * * Compute ILMIN, IUMAX (based on VL, VU and WIN) * IF( WKNOWN ) THEN VLMIN = VL - TWENTY*EPS*ANORM VUMAX = VU + TWENTY*EPS*ANORM ILMIN = 1 IUMAX = 0 DO 10 I = 1, N IF( WIN( I ).LT.VLMIN ) $ ILMIN = ILMIN + 1 IF( WIN( I ).LT.VUMAX ) $ IUMAX = IUMAX + 1 10 CONTINUE ELSE ILMIN = 1 IUMAX = N END IF ELSE IF( LSAME( RANGE, 'I' ) ) THEN ILMIN = IL IUMAX = IU ELSE IF( LSAME( RANGE, 'A' ) ) THEN ILMIN = 1 IUMAX = N END IF * NEIG = IUMAX - ILMIN + 1 * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) VECSIZE = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN * IF( WKNOWN ) THEN CLUSTERSIZE = 1 MAXCLUSTERSIZE = 1 DO 20 I = ILMIN + 1, IUMAX IF( ( WIN( I )-WIN( I-1 ) ).LT.ORFAC*2*ANORM ) THEN CLUSTERSIZE = CLUSTERSIZE + 1 IF( CLUSTERSIZE.GT.MAXCLUSTERSIZE ) $ MAXCLUSTERSIZE = CLUSTERSIZE ELSE CLUSTERSIZE = 1 END IF 20 CONTINUE IF( CLUSTERSIZE.GT.MAXCLUSTERSIZE ) $ MAXCLUSTERSIZE = CLUSTERSIZE ELSE MAXCLUSTERSIZE = N END IF * MAXSIZE = VECSIZE + MAX( ( MAXCLUSTERSIZE-1 ), 0 )*N * * RETURN * * End of PCLASIZEHEEVX * END scalapack-2.0.2/TESTING/EIG/pclasizesep.f000644 000766 000024 00000013506 10363532303 020126 0ustar00juliestaff000000 000000 SUBROUTINE PCLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, $ ISIZEHEEVX, SIZEHEEVD, RSIZEHEEVD, $ ISIZEHEEVD, SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, RSIZETST, ISIZETST ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZEHEEVD, ISIZEHEEVX, $ ISIZESUBTST, ISIZETST, RSIZECHK, RSIZEHEEVD, $ RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, RSIZETST, $ SIZEHEEVD, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PCLASIZESEP computes the amount of memory needed by * various SEP test routines, as well as HEEVX itself * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PCHEEVX * * SIZEMQRLEFT LWORK for the 1st PCUNMQR call in PCLAGHE * * SIZEMQRRIGHT LWORK for the 2nd PCUNMQR call in PCLAGHE * * SIZEQRF LWORK for PCGEQRF in PCLAGHE * * SIZETMS LWORK for PCLATMS * * RSIZEQTQ LWORK for PCSEPQTQ (nexer complex) * * RSIZECHK LWORK for PCSEPCHK * * SIZEHEEVX LWORK for PCHEEVX * * RSIZEHEEVX LRWORK for PCHEEVX * * ISIZEHEEVX LIWORK for PCHEEVX * * SIZEHEEVD LWORK for PCHEEVD * * RSIZEHEEVD LRWORK for PCHEEVD * * ISIZEHEEVD LIWORK for PCHEEVD * * SIZESUBTST LWORK for PCSUBTST * * RSIZESUBTST LRWORK for PCSUBTST * * ISIZESUBTST LIWORK for PCSUBTST * * SIZETST LWORK for PCTST * * RSIZETST LRWORK for PCTST * * ISIZETST LIWORK for PCTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ANB, CSRC_A, IACOL, IAROW, ICOFFA, ICTXT, $ IROFFA, LCM, LCMQ, LDA, MQ0, MYCOL, MYROW, N, $ NB, NEIG, NHETRD_LWOPT, NN, NNP, NP, NP0, $ NPCOL, NPROW, NPS, NQ, RSRC_A, SIZECHK, $ SIZEQTQ, SQNPC * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC, PJLAENV EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC, PJLAENV * .. ** .. Executable Statements .. * This is just to keep ftnchek happy * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, REAL, SQRT * .. * .. Executable Statements .. IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 0 SIZECHK = 0 RSIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) RSIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZEHEEVX = N + ( NP0+MQ0+NB )*NB RSIZEHEEVX = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZEHEEVX = 6*NNP * ICTXT = DESCA( CTXT_ ) ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( REAL( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS * SIZEHEEVX = MAX( SIZEHEEVX, N+NHETRD_LWOPT ) * SIZEHEEVD = SIZEHEEVX RSIZEHEEVD = 7*N + 3*NP0*MQ0 ISIZEHEEVD = 7*N + 8*NPCOL + 2 SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZEHEEVX, $ SIZEHEEVD ) + IPREPAD + IPOSTPAD RSIZESUBTST = MAX( RSIZEHEEVX, RSIZEHEEVD, RSIZEQTQ, RSIZECHK ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = MAX( ISIZEHEEVX, ISIZEHEEVD ) + IPREPAD + IPOSTPAD * * Allow room for A, COPYA and Z and WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Room for DIAG, WIN, WNEW, GAP and RWORK * RSIZETST = 4*( N+IPREPAD+IPOSTPAD ) + RSIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PCHEEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pclasizesepr.f000644 000766 000024 00000012422 11623527140 020307 0ustar00juliestaff000000 000000 SUBROUTINE PCLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEHEEVR, RSIZEHEEVR, $ ISIZEHEEVR, SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, RSIZETST, $ ISIZETST ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZEHEEVR, ISIZESUBTST, $ ISIZETST, RSIZEHEEVR, RSIZESUBTST, RSIZETST, $ SIZECHK, SIZEHEEVR, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZETMS, SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * * Purpose * ======= * * PCLASIZESEPR computes the amount of memory needed by * various SEPR test routines, as well as PCHEEVR itself. * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor for dense matrix. * * SIZEMQRLEFT LWORK for the 1st PCUNMQR call in PCLAGHE * * SIZEMQRRIGHT LWORK for the 2nd PCUNMQR call in PCLAGHE * * SIZEQRF LWORK for PCGEQRF in PCLAGHE * * SIZETMS LWORK for PCLATMS * * SIZEQTQ LWORK for PCSEPQTQ * * SIZECHK LWORK for PCSEPCHK * * SIZEHEEVR LWORK for PCHEEVR * * RSIZEHEEVR LRWORK for PCHEEVR * * ISIZEHEEVR LIWORK for PCHEEVR * * SIZESUBTST LWORK for PCSEPRSUBTST * * RSIZESUBTST LRWORK for PCSEPRSUBTST * * ISIZESUBTST LIWORK for PCSEPRSUBTST * * SIZETST LWORK for PCSEPRTST * * RSIZETST LRWORK for PCSEPRTST * * ISIZETST LIWORK for PCSEPRTST * * * .. Parameters .. INTEGER CTXT_, M_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( $ CTXT_ = 2, M_ = 3, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, LCM, $ LCMQ, LDA, MQ0, MYCOL, MYROW, N, NB, NEIG, NN, $ NNP, NP, NP0, NPCOL, NPROW, NQ, RSRC_A INTEGER ANB, ICTXT, NHETRD_LWOPT, NPS, SQNPC * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC INTEGER PJLAENV EXTERNAL PJLAENV * * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, SQRT * .. * .. Executable Statements .. * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) + 1 NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) NNP = MAX( N, NPROW*NPCOL+1, 4 ) * * SIZEHEEVR = 1+N + ( NP0+MQ0+NB )*NB SIZEHEEVR = MAX(3, SIZEHEEVR) RSIZEHEEVR = 1 + 5*N + MAX( 18*NN, NP0*MQ0+2*NB*NB ) + $ (2 + ICEIL( NEIG, NPROW*NPCOL ))*NN RSIZEHEEVR = MAX(3, RSIZEHEEVR) * ISIZEHEEVR = 12*NNP + 2*N * ICTXT = DESCA( CTXT_ ) ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS SIZEHEEVR = MAX( SIZEHEEVR, N + NHETRD_LWOPT ) * SIZESUBTST = MAX( SIZETMS, SIZEHEEVR ) + $ IPREPAD + IPOSTPAD RSIZESUBTST = MAX( SIZEQTQ, SIZECHK, RSIZEHEEVR ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZEHEEVR + IPREPAD + IPOSTPAD * * Allow room for A, COPYA, Z, WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for DIAG, WIN, WNEW, GAP, RWORK * RSIZETST = 4*( N+IPREPAD+IPOSTPAD ) + RSIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK * (only needed for PCHEEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * * RETURN END scalapack-2.0.2/TESTING/EIG/pclatms.f000644 000766 000024 00000032603 10363532303 017246 0ustar00juliestaff000000 000000 * * SUBROUTINE PCLATMS( M, N, DIST, ISEED, SYM, D, MODE, COND, DMAX, $ KL, KU, PACK, A, IA, JA, DESCA, ORDER, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIST, PACK, SYM INTEGER IA, INFO, JA, KL, KU, LWORK, M, MODE, N, ORDER REAL COND, DMAX * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) REAL D( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCLATMS generates random Hermitian matrices with specified * eigenvalues for testing SCALAPACK programs. * * PCLATMS operates by applying the following sequence of * operations: * * Set the diagonal to D, where D may be input or * computed according to MODE, COND, DMAX, and SYM * as described below. * * Generate a dense M x N matrix by multiplying D on the left * and the right by random unitary matrices, then: * * Reduce the bandwidth according to KL and KU, using * Householder transformations. * ### bandwidth reduction NOT SUPPORTED ### * * Arguments * ========= * * M - (global input) INTEGER * The number of rows of A. Not modified. * * N - (global input) INTEGER * The number of columns of A. Not modified. * ### M .ne. N unsupported * * DIST - (global input) CHARACTER*1 * On entry, DIST specifies the type of distribution to be used * to generate the random eigen-/singular values. * 'U' => UNIFORM( 0, 1 ) ( 'U' for uniform ) * 'S' => UNIFORM( -1, 1 ) ( 'S' for symmetric ) * 'N' => NORMAL( 0, 1 ) ( 'N' for normal ) * Not modified. * * ISEED - (global input) INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. They should lie between 0 and 4095 inclusive, * and ISEED(4) should be odd. The random number generator * uses a linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to CLATMS * to continue the same random number sequence. * Changed on exit. * * SYM - (global input) CHARACTER*1 * If SYM='S' or 'H', the generated matrix is Hermitian, with * eigenvalues specified by D, COND, MODE, and DMAX; they * may be positive, negative, or zero. * If SYM='P', the generated matrix is Hermitian, with * eigenvalues (= singular values) specified by D, COND, * MODE, and DMAX; they will not be negative. * If SYM='N', the generated matrix is nonsymmetric, with * singular values specified by D, COND, MODE, and DMAX; * they will not be negative. * ### SYM = 'N' NOT SUPPORTED ### * Not modified. * * D - (local input/output) REAL array, * dimension ( MIN( M , N ) ) * This array is used to specify the singular values or * eigenvalues of A (see SYM, above.) If MODE=0, then D is * assumed to contain the singular/eigenvalues, otherwise * they will be computed according to MODE, COND, and DMAX, * and placed in D. * Modified if MODE is nonzero. * * MODE - (global input) INTEGER * On entry this describes how the singular/eigenvalues are to * be specified: * MODE = 0 means use D as input * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * If SYM='S' or 'H', and MODE is neither 0, 6, nor -6, then * the elements of D will also be multiplied by a random * sign (i.e., +1 or -1.) * Not modified. * * COND - (global input) REAL * On entry, this is used as described under MODE above. * If used, it must be >= 1. Not modified. * * DMAX - (global input) REAL * If MODE is neither -6, 0 nor 6, the contents of D, as * computed according to MODE and COND, will be scaled by * DMAX / max(abs(D(i))); thus, the maximum absolute eigen- or * singular value (which is to say the norm) will be abs(DMAX). * Note that DMAX need not be positive: if DMAX is negative * (or zero), D will be scaled by a negative number (or zero). * Not modified. * * KL - (global input) INTEGER * This specifies the lower bandwidth of the matrix. For * example, KL=0 implies upper triangular, KL=1 implies upper * Hessenberg, and KL being at least M-1 means that the matrix * has full lower bandwidth. KL must equal KU if the matrix * is Hermitian. * Not modified. * ### 1 <= KL < N-1 is NOT SUPPORTED ### * * KU - (global input) INTEGER * This specifies the upper bandwidth of the matrix. For * example, KU=0 implies lower triangular, KU=1 implies lower * Hessenberg, and KU being at least N-1 means that the matrix * has full upper bandwidth. KL must equal KU if the matrix * is Hermitian. * Not modified. * ### 1 <= KU < N-1 is NOT SUPPORTED ### * * PACK - (global input) CHARACTER*1 * This specifies packing of matrix as follows: * 'N' => no packing * ### PACK must be 'N' all other options NOT SUPPORTED ### * * A - (local output) COMPLEX array * Global dimension (M, N), local dimension (MP, NQ) * On exit A is the desired test matrix. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ORDER - (input) INTEGER * The number of reflectors used to define the orthogonal * matrix Q. A = Q * D * Q' * Higher ORDER requires more computation and communication. * * WORK - (local input/output) COMPLEX array, * dimension (LWORK) * * LWORK - (local input) INTEGER dimension of WORK * LWORK >= SIZETMS as returned by PCLASIZESEP * * INFO - (global output) INTEGER * Error code. On exit, INFO will be set to one of the * following values: * 0 => normal return * -1 => M negative or unequal to N and SYM='S', 'H', or 'P' * -2 => N negative * -3 => DIST illegal string * -5 => SYM illegal string * -7 => MODE not in range -6 to 6 * -8 => COND less than 1.0, and MODE neither -6, 0 nor 6 * -10 => KL negative * -11 => KU negative, or SYM='S' or 'H' and KU not equal to KL * -16 => DESCA is inconsistent * -17 => ORDER not in the range 0 to N inclusive * 1 => Error return from SLATM1 * 2 => Cannot scale to DMAX (max. sing. value is 0) * 3 => Error return from PCLAGHE * *----------------------------------------------------------------------- * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IDIST, IINFO, IPACK, IRSIGN, ISYM, LLB, $ MNMIN, MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL ALPHA, TEMP * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, CLASET, PCHK1MAT, $ PCLAGHE, PXERBLA, SLATM1, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * 1) Decode and Test the input parameters. * Initialize flags & seed. * * INFO = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IF( ( MYROW.GE.NPROW .OR. MYROW.LT.0 ) .OR. $ ( MYCOL.GE.NPCOL .OR. MYCOL.LT.0 ) )RETURN * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Decode DIST * IF( LSAME( DIST, 'U' ) ) THEN IDIST = 1 ELSE IF( LSAME( DIST, 'S' ) ) THEN IDIST = 2 ELSE IF( LSAME( DIST, 'N' ) ) THEN IDIST = 3 ELSE IDIST = -1 END IF * * Decode SYM * IF( LSAME( SYM, 'N' ) ) THEN ISYM = 1 IRSIGN = 0 ELSE IF( LSAME( SYM, 'P' ) ) THEN ISYM = 2 IRSIGN = 0 ELSE IF( LSAME( SYM, 'S' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE IF( LSAME( SYM, 'H' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE ISYM = -1 END IF * * Decode PACK * IF( LSAME( PACK, 'N' ) ) THEN IPACK = 0 ELSE IPACK = 1 END IF * * Set certain internal parameters * MNMIN = MIN( M, N ) LLB = MIN( KL, M-1 ) * IF( ORDER.EQ.0 ) $ ORDER = N * * Set INFO if an error * IF( NPROW.EQ.-1 ) THEN INFO = -( 1600+CTXT_ ) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 16, INFO ) IF( INFO.EQ.0 ) THEN IF( M.NE.N .AND. ISYM.NE.1 ) THEN INFO = -2 ELSE IF( IDIST.EQ.-1 ) THEN INFO = -3 ELSE IF( ISYM.EQ.-1 ) THEN INFO = -5 ELSE IF( ABS( MODE ).GT.6 ) THEN INFO = -7 ELSE IF( ( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) .AND. COND.LT. $ ONE ) THEN INFO = -8 ELSE IF( KL.LT.0 ) THEN INFO = -10 ELSE IF( KU.LT.0 .OR. ( ISYM.NE.1 .AND. KL.NE.KU ) ) THEN INFO = -11 ELSE IF( ( ORDER.LT.0 ) .OR. ( ORDER.GT.N ) ) THEN INFO = -17 END IF END IF CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 16, 0, IDUM1, IDUM2, $ INFO ) END IF * * Check for unsupported features * IF( ISYM.NE.2 ) THEN INFO = -5 ELSE IF( IPACK.NE.0 ) THEN INFO = -12 ELSE IF( KL.GT.0 .AND. KL.LT.M-1 ) THEN INFO = -10 ELSE IF( KU.GT.0 .AND. KU.LT.N-1 ) THEN INFO = -11 ELSE IF( LLB.NE.0 .AND. LLB.NE.M-1 ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PCLATMS', -INFO ) RETURN END IF * * Initialize random number generator * DO 10 I = 1, 4 ISEED( I ) = MOD( ABS( ISEED( I ) ), 4096 ) 10 CONTINUE * IF( MOD( ISEED( 4 ), 2 ).NE.1 ) $ ISEED( 4 ) = ISEED( 4 ) + 1 * * 2) Set up D if indicated. * * Compute D according to COND and MODE * CALL SLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, MNMIN, IINFO ) * IF( IINFO.NE.0 ) THEN INFO = 1 RETURN END IF * * IF( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) THEN * * Scale by DMAX * TEMP = ABS( D( 1 ) ) DO 20 I = 2, MNMIN TEMP = MAX( TEMP, ABS( D( I ) ) ) 20 CONTINUE * IF( TEMP.GT.ZERO ) THEN ALPHA = DMAX / TEMP ELSE INFO = 2 RETURN END IF * CALL SSCAL( MNMIN, ALPHA, D, 1 ) * END IF * CALL CLASET( 'A', NP, NQ, CZERO, CZERO, A, DESCA( LLD_ ) ) * * Hermitian -- A = U D U' * CALL PCLAGHE( M, LLB, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, IINFO ) * RETURN * * End of PCLATMS * END scalapack-2.0.2/TESTING/EIG/pclatran.f000644 000766 000024 00000015675 10363532303 017421 0ustar00juliestaff000000 000000 SUBROUTINE PCLATRAN( N, NB, A, IA, JA, DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IA, JA, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * * ======= * * PCLATRAN transpose a lower triangular matrix on to the upper * triangular portion of the same matrix. * * This is an auxiliary routine called by PCHETRD. * * Notes * ===== * * IA must equal 1 * JA must equal 1 * DESCA( MB_ ) must equal 1 * DESCA( NB_ ) must equal 1 * DESCA( RSRC_ ) must equal 1 * DESCA( CSRC_ ) must equal 1 * * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix to be transposed. * * NB (global input) INTEGER * The number of rows and columns to be transposed with each * message sent. NB has no impact on the result, it is striclty * a performance tuning parameter. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). On entry, the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix. On exit, the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is undefined (and may have been modified). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * Must be equal to 1. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * Must be equal to 1. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * DESCA( MB_ ) must equal 1 * DESCA( NB_ ) must equal 1 * DESCA( ICTXT_ ) must point to a square process grid * i.e. one where NPROW is equal to NPCOL * * WORK (local workspace) COMPLEX*16 array, dimension ( LWORK ) * * Where: * LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, IRECV, ISEND, J, JJ, JRECV, JSEND, $ LDA, MAXIRECV, MAXISEND, MAXJRECV, MAXJSEND, $ MINIRECV, MINISEND, MINJRECV, MINJSEND, MYCOL, $ MYROW, NP, NPCOL, NPROW, NQ, RECVNB, SENDNB, $ STARTCOL, STARTROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CTRRV2D, CTRSD2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, MIN * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Further details * * Because the processor grid is square each process needs only send * data to its transpose process. (Likewsie it need only receive * data from its transpose process.) Because the data decomposition * is cyclic, the local portion of the array is triangular. * * This routine requires that the data be buffered (i.e. copied) * on the sending process (because of the triangular shape) and * unbuffered on the receiving process. Hence, two local memory to * memory copies are performed within the communications routines * followed by a memory to memory copy outside of the communications * routines. It would be nice to avoid having back to back memory * to memory copies (as we do presently on the receiving processor). * This could be done by packaging the data ourselves in the sender * and then unpacking it directly into the matrix. However, this * code seems cleaner and so since this routine is not a significant * performance bottleneck we have left it this way. * * * * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * NP = NUMROC( N, 1, MYROW, 0, NPROW ) NQ = NUMROC( N, 1, MYCOL, 0, NPCOL ) * * IF( MYROW.EQ.MYCOL ) THEN * DO 20 J = 1, NP DO 10 I = J + 1, NQ A( J+( I-1 )*LDA ) = CONJG( A( I+( J-1 )*LDA ) ) 10 CONTINUE 20 CONTINUE * ELSE IF( MYROW.GT.MYCOL ) THEN STARTROW = 1 STARTCOL = 2 ELSE IF( MYROW.EQ.MYCOL ) THEN STARTROW = 2 STARTCOL = 2 ELSE STARTROW = 2 STARTCOL = 1 END IF END IF * DO 50 JJ = 1, MAX( NP, NQ ), NB MINJSEND = STARTCOL + JJ - 1 MINJRECV = STARTROW + JJ - 1 MAXJSEND = MIN( MINJSEND+NB-1, NQ ) MAXJRECV = MIN( MINJRECV+NB-1, NP ) * SENDNB = MAXJSEND - MINJSEND + 1 RECVNB = MAXJRECV - MINJRECV + 1 * MINISEND = 1 MINIRECV = 1 MAXISEND = MIN( NP, JJ+SENDNB-1 ) MAXIRECV = MIN( NQ, JJ+RECVNB-1 ) * ISEND = MAXISEND - MINISEND + 1 IRECV = MAXIRECV - MINIRECV + 1 JSEND = MAXJSEND - MINJSEND + 1 JRECV = MAXJRECV - MINJRECV + 1 * * * DO 40 J = MINJRECV, MAXJRECV DO 30 I = MINIRECV, MAXIRECV + J - MAXJRECV WORK( I+( J-MINJRECV )*IRECV ) $ = CONJG( A( J+( I-1 )*LDA ) ) 30 CONTINUE 40 CONTINUE * IF( IRECV.GT.0 .AND. JRECV.GT.0 ) $ CALL CTRSD2D( ICTXT, 'U', 'N', IRECV, JRECV, WORK, IRECV, $ MYCOL, MYROW ) * IF( ISEND.GT.0 .AND. JSEND.GT.0 ) $ CALL CTRRV2D( ICTXT, 'U', 'N', ISEND, JSEND, $ A( MINISEND+( MINJSEND-1 )*LDA ), LDA, $ MYCOL, MYROW ) * * 50 CONTINUE * END IF * RETURN * * End of PCLATRD * END scalapack-2.0.2/TESTING/EIG/pcmatgen.f000644 000766 000024 00000046323 10363532303 017405 0ustar00juliestaff000000 000000 SUBROUTINE PCMATGEN( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PCMATGEN : Parallel Complex Single precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * otherwise a random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) COMPLEX, pointer into the local memory to * an array of dimension ( LDA, * ) containing the local * pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB REAL DUMMY * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MOD, REAL * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSRAND EXTERNAL ICEIL, NUMROC, LSAME, PSRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) * INFO = 0 IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCMATGEN', INFO ) RETURN END IF * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = 2*NPMB JUMP3 = 2*M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = 2*MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = CMPLX( PSRAND(0), PSRAND(0) ) 30 CONTINUE IF( SYMM ) THEN A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) ELSE A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), ZERO ) DUMMY = PSRAND(0) END IF DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) IF( HERM ) THEN A(IK,JK+J) = CONJG( A(IK+J,JK) ) ELSE A(IK,JK+J) = A(IK+J,JK) END IF 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = 2*NQNB JUMP3 = 2*N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = 2*NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 IF( SYMM ) THEN A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) ELSE A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ TWO*PSRAND(0) - ONE ) END IF JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = 2*NQNB JUMP3 = 2*N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = 2*NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 IF( TRAN ) THEN A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) ELSE A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ TWO*PSRAND(0) - ONE ) END IF JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSE * JUMP1 = 1 JUMP2 = 2*NPMB JUMP3 = 2*M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = 2*MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = CMPLX( ONE - TWO*PSRAND(0), $ ONE - TWO*PSRAND(0) ) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 IF( HERM ) THEN A(IK,JK+J) = CMPLX( $ ABS(REAL(A(IK,JK+J)))+2*MAXMN, ZERO ) ELSE A(IK,JK+J) = CMPLX( ABS(REAL(A(IK,JK+J)))+MAXMN, $ ABS(AIMAG(A(IK,JK+J)))+ MAXMN ) END IF IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PCMATGEN * END scalapack-2.0.2/TESTING/EIG/pcnepdriver.f000644 000766 000024 00000050310 10363532303 020117 0ustar00juliestaff000000 000000 PROGRAM PCNEPDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March, 2000 * * Purpose * ======= * * PCNEPDRIVER is the main test program for the COMPLEX * SCALAPACK NEP routines. This test driver performs a Schur * decomposition followed by residual check of a Hessenberg matrix. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'SCALAPACK, Version 1.4, NEP (Nonsymmetric EigenProblem) input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'NEP.out' output file name (if any) * 6 device out * 8 number of problems sizes * 1 2 3 4 6 10 100 200 vales of N * 3 number of NB's * 6 20 40 values of NB * 4 number of process grids (ordered pairs of P & Q) * 1 2 1 4 values of P * 1 2 4 1 values of Q * 20.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * CPLXSZ INTEGER, default = 8 bytes. * CPLXSZ indicate the length in bytes on the given platform * for a single precision complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * Further Details * =============== * * Contributed by Mark Fahey, March 2000. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, TOTMEM, MEMSIZ, NTESTS PARAMETER ( CPLXSZ = 16, TOTMEM = 200000000, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20 ) COMPLEX PADVAL, ZERO, ONE PARAMETER ( PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, III, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPREPAD, IPW, IPWR, IPC, J, K, KFAIL, $ KPASS, KSKIP, KTESTS, LDA, LDWORK, LDZ, LWORK, $ MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, NOUT, $ NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ REAL THRESH REAL ANORM, FRESID, QRESID, ZNORM DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCZ( DLEN_ ), IDUM( 1 ), $ IERR( 2 ), NBVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCCHEKPAD, $ PCFILLPAD, PCGEMM, PCLAHQR, PCLASET, PCMATGEN, $ PCNEPFCHK, PCNEPINFO, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER ILCM, NUMROC REAL PSLAMCH, PCLANGE, PCLANHS EXTERNAL ILCM, NUMROC, PSLAMCH, PCLANGE, PCLANHS * .. * .. Intrinsic Functions .. INTRINSIC REAL, MAX, MIN * .. * .. Data statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PCNEPINFO( OUTFILE, NOUT, NMAT, NVAL, NTESTS, NNB, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.6 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) IPREPAD = IPREPAD + 1000 IMIDPAD = IMIDPAD + 1000 IPOSTPAD = IPOSTPAD + 1000 ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Initialize the array descriptor for the matrix Z * CALL DESCINIT( DESCZ, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 2 ) ) * LDA = DESCA( LLD_ ) LDZ = DESCZ( LLD_ ) LDWORK = DESCZ( LLD_ ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAE+1 ) * IPA = IPREPAD + 1 IPC = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWR = IPC + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPWR + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD III = N / NB IF( III*NB.LT.N ) $ III = III + 1 III = 7*III / ILCM( NPROW, NPCOL ) * * LWORK = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, III ) LWORK = LWORK + MAX( 2*N, ( 8*ILCM( NPROW, NPCOL )+2 )** $ 2 ) * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PCNEPFCHK and PCLANHS * WORKSIZ = LWORK + MAX( NP*DESCA( NB_ ), $ DESCA( MB_ )*NQ ) + IPOSTPAD * ELSE * WORKSIZ = LWORK + IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Schur reduction', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate matrix Z = In * CALL PCLASET( 'All', N, N, ZERO, ONE, MEM( IPC ), 1, 1, $ DESCZ ) * * Generate matrix A upper Hessenberg * CALL PCMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) CALL PCLASET( 'Lower', MAX( 0, N-2 ), MAX( 0, N-2 ), $ ZERO, ZERO, MEM( IPA ), MIN( N, 3 ), 1, $ DESCA ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPC-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANHS( 'I', N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANHS', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANHS', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PCFILLPAD( ICTXT, N, 1, MEM( IPWR-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, PADVAL ) * END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform NEP factorization * CALL PCLAHQR( .TRUE., .TRUE., N, 1, N, MEM( IPA ), DESCA, $ MEM( IPWR ), 1, N, MEM( IPC ), DESCZ, $ MEM( IPW ), LWORK, IDUM, 0, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * )'PCLAHQR INFO=', INFO KFAIL = KFAIL + 1 GO TO 10 END IF * IF( CHECK ) THEN * * Check for memory overwrite in NEP factorization * CALL PCCHEKPAD( ICTXT, 'PCLAHQR (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLAHQR (Z)', NP, NQ, $ MEM( IPC-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLAHQR (WR)', N, 1, $ MEM( IPWR-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLAHQR (WORK)', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || Z * H * Z**T - H0 || / ( N*|| H0 ||*EPS ) * CALL PCNEPFCHK( N, MEM( IPA ), 1, 1, DESCA, IASEED, $ MEM( IPC ), 1, 1, DESCZ, ANORM, $ FRESID, MEM( IPW ) ) * CALL PCCHEKPAD( ICTXT, 'PCNEPFCHK (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCNEPFCHK (Z)', NP, NQ, $ MEM( IPC-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCNEPFCHK (WORK)', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || (Z**T)*Z - In ||_1 * CALL PCLASET( 'All', N, N, ZERO, ONE, MEM( IPA ), 1, $ 1, DESCA ) CALL PCGEMM( 'Cong Tran', 'No transpose', N, N, N, $ -ONE, MEM( IPC ), 1, 1, DESCZ, $ MEM( IPC ), 1, 1, DESCZ, ONE, MEM( IPA ), $ 1, 1, DESCA ) ZNORM = PCLANGE( '1', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) QRESID = ZNORM / ( REAL( N )*PSLAMCH( ICTXT, 'P' ) ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( ( FRESID-FRESID ).EQ.0.0E+0 ) .AND. $ ( QRESID.LE.THRESH ) .AND. $ ( ( QRESID-QRESID ).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 )FRESID WRITE( NOUT, FMT = 9985 )QRESID END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID QRESID = QRESID - QRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 18 N^3 flops for PxLAHQR * NOPS = 18.0D+0*DBLE( N )**3 * * Calculate total megaflops -- factorization only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q NEP Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- --- ---- ---- -------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, F8.2, $ 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||H - Q*S*Q^T|| / (||H|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Q^T*Q - I|| / ( N * eps ) ', G25.7 ) * STOP * * End of PCNEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pcnepfchk.f000644 000766 000024 00000026167 10363532303 017554 0ustar00juliestaff000000 000000 SUBROUTINE PCNEPFCHK( N, A, IA, JA, DESCA, IASEED, Z, IZ, JZ, $ DESCZ, ANORM, FRESID, WORK ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March, 2000 * * .. Scalar Arguments .. INTEGER IA, IASEED, IZ, JA, JZ, N REAL ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) COMPLEX A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PCNEPFCHK computes the residual * || sub(Z)*sub( A )*sub(Z)**T - sub( Ao ) || / (||sub( Ao )||*eps*N), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A(IA:IA+M-1,JA:JA+N-1), sub( Z ) = Z(IZ:IZ+N-1,JZ:JZ+N-1) * and ||.|| stands for the infinity norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The order of sub( A ) and sub( Z ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub(Z)*sub( A )*sub(Z)**T - sub( Ao ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * Z (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_Z,LOCc(JZ+N-1)). On entry, this * array contains the local pieces of the N-by-N distributed * matrix sub( Z ). * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * ANORM (global input) REAL * The Infinity norm of sub( A ). * * FRESID (global output) REAL * The maximum (worst) factorizational error. * * WORK (local workspace) COMPLEX array, dimension (LWORK). * LWORK >= MAX( NpA0 * NB_A, MB_A * NqA0 ) where * * IROFFA = MOD( IA-1, MB_A ), * ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * WORK is used to store a block of rows and a block of columns * of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * Further Details * =============== * * Contributed by Mark Fahey, March, 2000. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, IB, ICTXT, IIA, IOFFA, IROFF, $ IW, J, JB, JJA, JN, LDA, LDW, MYCOL, MYROW, NP, $ NPCOL, NPROW REAL EPS * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCGEMM, $ PCLACPY, PCLASET, PCMATGEN, CMATADD * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PSLAMCH, PCLANGE EXTERNAL ICEIL, NUMROC, PSLAMCH, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( ICTXT, 'eps' ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF LDW = MAX( 1, NP ) * * First compute H <- H * Z**T * CALL DESCSET( DESCW, DESCA( MB_ ), N, DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * DO 10 I = IA, IA + N - 1, DESCA( MB_ ) IB = MIN( IA+N-I, DESCA( MB_ ) ) * CALL PCLACPY( 'All', IB, N, A, I, JA, DESCA, WORK, 1, 1, $ DESCW ) CALL PCGEMM( 'No transpose', 'Cong Tran', IB, N, N, ONE, WORK, $ 1, 1, DESCW, Z, IZ, JZ, DESCZ, ZERO, A, I, JA, $ DESCA ) * DESCW( RSRC_ ) = MOD( DESCW( RSRC_ )+1, NPROW ) * 10 CONTINUE * * Then compute H <- Z * H = Z * H0 * Z**T * CALL DESCSET( DESCW, N, DESCA( NB_ ), DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, LDW ) * DO 20 J = JA, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * CALL PCLACPY( 'All', N, JB, A, IA, J, DESCA, WORK, 1, 1, $ DESCW ) CALL PCGEMM( 'No transpose', 'No transpose', N, JB, N, ONE, Z, $ IZ, JZ, DESCZ, WORK, 1, 1, DESCW, ZERO, A, IA, J, $ DESCA ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * 20 CONTINUE * * Compute H - H0 * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA-1 )*LDA IW = 1 JB = JN - JA + 1 DESCW( CSRC_ ) = IACOL * * Handle first block of columns separately * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PCMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ NP, JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL PCLASET( 'Lower', MAX( 0, N-2 ), JB, ZERO, ZERO, WORK, $ MIN( IW+2, N ), 1, DESCW ) CALL CMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF * IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * DO 30 J = JN + 1, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PCMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, NP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL PCLASET( 'Lower', MAX( 0, N-IW-1 ), JB, ZERO, ZERO, $ WORK, MIN( N, IW+2 ), 1, DESCW ) CALL CMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) 30 CONTINUE * * Calculate factor residual * FRESID = PCLANGE( 'I', N, N, A, IA, JA, DESCA, WORK ) / $ ( N*EPS*ANORM ) * RETURN * * End PCNEPFCHK * END scalapack-2.0.2/TESTING/EIG/pcnepinfo.f000644 000766 000024 00000027760 10363532303 017574 0ustar00juliestaff000000 000000 SUBROUTINE PCNEPINFO( SUMMRY, NOUT, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March, 2000 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, NGRIDS, $ NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PCNEPINFO gets needed startup information for PCHSEQR drivers * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (the order of the matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH * * WORK (local workspace) INTEGER array of dimension >= * MAX( 3, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ), used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Further Details * =============== * * Implemented by: M. Fahey, June 2000 * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a complex * single precision. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'NEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * )NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 )'N', LDNVAL GO TO 30 END IF READ( NIN, FMT = * )( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * )NNB IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 )'NB', LDNBVAL GO TO 30 END IF READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) * DO 10 I = 1, NNB IF( NBVAL( I ).LT.6 ) THEN WRITE( NOUT, FMT = 9992 )NBVAL( I ) GO TO 30 END IF 10 CONTINUE * * Get number of grids * READ( NIN, FMT = * )NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDPVAL GO TO 30 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDQVAL GO TO 30 END IF * * Get values of P and Q * READ( NIN, FMT = * )( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * )( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * )THRESH * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 20 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 20 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK QSQ^H by Schur Decomposition.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'Tests of the parallel ' // $ 'complex single precision Schur decomposition.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Residual = ||H-QSQ^H|| / ' // $ '(||H|| * eps * N )' WRITE( NOUT, FMT = 9999 ) $ ' Orthogonality residual = ||I - Q^HQ|| / ' // '( eps * N )' WRITE( NOUT, FMT = 9999 )'The matrix A is randomly ' // $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'An explanation of the input/output ' $ // 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or ' // $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the ' // 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the' // $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than' // $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'NEP time : Time in seconds to decompose the ' // ' matrix' WRITE( NOUT, FMT = 9999 )'MFLOPS : Rate of execution ' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 )'N ', $ ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 )'NB ', $ ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 )'P ', $ ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 )'Q ', $ ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 )EPS WRITE( NOUT, FMT = 9998 )THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 30 CONTINUE WRITE( NOUT, FMT = 9993 ) CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9992 FORMAT( ' Blocking size too small at ', I2, ' must be >=6.' ) * * End of PCNEPINFO * END scalapack-2.0.2/TESTING/EIG/pcrptseptst.f000644 000766 000024 00000005235 10363532303 020177 0ustar00juliestaff000000 000000 PROGRAM PCRPTSEPTST * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * Repeat parallel Hermitian eigenproblem test * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER MAXN, LWORK, LIWORK PARAMETER ( MAXN = 200, LWORK = 500000, $ LIWORK = 6*MAXN+4 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS, UPLO INTEGER CONTEXT, IAM, INFO, IPOSTPAD, IPREPAD, LDA, $ MATTYPE, N, NB, NPCOL, NPROCS, NPROW REAL ABSTOL, THRESH * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), ICLUSTR( MAXN ), IFAIL( MAXN ), $ ISEED( 4 ), IWORK( LIWORK ) REAL GAP( MAXN ), RWORK( LWORK ), WIN( MAXN ), $ WNEW( MAXN ) COMPLEX A( MAXN*MAXN ), COPYA( MAXN*MAXN ), $ WORK( LWORK ), Z( MAXN*MAXN ) * .. * * * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDINIT, $ BLACS_PINFO, BLACS_SETUP, DESCINIT, PCSEPTST * .. * .. Executable Statements .. * IPREPAD = 3 IPOSTPAD = 3 LDA = MAXN * * These lines should be replaced by the output from pxSEPdriver * * ISEED( 1 ) = 2312 ISEED( 2 ) = 3709 ISEED( 3 ) = 666 ISEED( 4 ) = 3371 UPLO = 'U' SUBTESTS = 'Y' N = 33 NPROW = 2 NPCOL = 2 NB = 4 MATTYPE = 9 * note: the printout often makes a mess of ABSTOL ABSTOL = 0.1175494351E-37 THRESH = .350000E+01 * CALL BLACS_PINFO( IAM, NPROCS ) IF( NPROCS.LT.1 ) THEN * NPROCS = NPROW*NPCOL CALL BLACS_SETUP( IAM, NPROCS ) END IF CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, CONTEXT, LDA, INFO ) * CALL PCSEPTST( DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, N, $ ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, WORK, $ LWORK-IPREPAD-IPOSTPAD, RWORK, $ LWORK-IPREPAD-IPOSTPAD, IWORK, $ LIWORK-IPREPAD-IPOSTPAD, 6, INFO ) * * * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * * CALL BLACS_EXIT( 0 ) STOP * * * * End of PCRPTSEPTST * END scalapack-2.0.2/TESTING/EIG/pcsdpsubtst.f000644 000766 000024 00000040110 10363532303 020151 0ustar00juliestaff000000 000000 SUBROUTINE PCSDPSUBTST( WKNOWN, UPLO, N, THRESH, ABSTOL, A, COPYA, $ Z, IA, JA, DESCA, WIN, WNEW, IPREPAD, $ IPOSTPAD, WORK, LWORK, RWORK, LRWORK, $ LWORK1, IWORK, LIWORK, RESULT, TSTNRM, $ QTQNRM, NOUT ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * February 28, 2000 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER UPLO INTEGER IA, IPOSTPAD, IPREPAD, JA, LIWORK, LRWORK, $ LWORK, LWORK1, N, NOUT, RESULT REAL ABSTOL, QTQNRM, THRESH, TSTNRM * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) REAL RWORK( * ), WIN( * ), WNEW( * ) COMPLEX A( * ), COPYA( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PCSDPSUBTST calls PCHEEVD and then tests the output of * PCHEEVD * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PCHEEVD when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PCSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) COMPLEX array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PCHEEVD for a description of block cyclic layout. * The test matrix, which is then modified by PCHEEVD * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) COMPLEX array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) COMPLEX array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PCSEPCHK and PCSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) REAL array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) REAL array, dimension (N) * The eigenvalues as copmuted by this call to PCHEEVD * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * WORK (local workspace) COMPLEX array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * RWORK (local workspace) REAL array, dimension (LRWORK) * RWORK has already been padded front and back, * use RWORK(1+IPREPAD) * * LRWORK (local input) INTEGER * The actual length of the array RWORK after padding. * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PCHEEVD * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PCHEEVD * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) REAL * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) REAL * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285E+0, FIVE = 5.0E+0, $ NEGONE = -1.0E+0 ) COMPLEX CPADVAL PARAMETER ( CPADVAL = ( 13.989E+0, 1.93E+0 ) ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) COMPLEX CZERO, CONE, CNEGONE PARAMETER ( CZERO = 0.0E+0, CONE = 1.0E+0, $ CNEGONE = -1.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IAM, INFO, ISIZEHEEVD, ISIZEHEEVX, $ ISIZESUBTST, ISIZETST, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ, RES, RSIZECHK, RSIZEHEEVD, $ RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, RSIZETST, $ SIZEHEEVD, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST REAL EPS, EPSNORMA, ERROR, MAXERROR, MINERROR, NORM, $ NORMWIN, SAFMIN, ULP * .. * .. Local Arrays .. INTEGER ITMP( 2 ) * .. * .. External Functions .. * INTEGER NUMROC REAL PCLANGE, PCLANHE, PSLAMCH EXTERNAL NUMROC, PCLANGE, PCLANHE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLACPY, IGAMN2D, IGAMX2D, $ PCCHEKPAD, PCFILLPAD, PCGEMM, PCHEEVD, PCLASET, $ PCLASIZESEP, PCSEPCHK, PICHEKPAD, PIFILLPAD, $ PSCHEKPAD, PSFILLPAD, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, REAL * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL PCLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, SIZESUBTST, $ RSIZESUBTST, ISIZESUBTST, SIZETST, RSIZETST, $ ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PSLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1+IPREPAD ) ), $ ABS( WIN( N+IPREPAD ) ), NORMWIN ) * DO 10 I = 1, LWORK1, 1 RWORK( I+IPREPAD ) = 14.3E+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I ) = 14 20 CONTINUE DO 30 I = 1, LWORK, 1 WORK( I+IPREPAD ) = ( 15.63E+0, 1.1E+0 ) 30 CONTINUE * DO 40 I = 1, N WNEW( I+IPREPAD ) = 3.14159E+0 40 CONTINUE * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 60 RESULT = 0 * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL CLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PCFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, CPADVAL ) * CALL PCFILLPAD( DESCA( CTXT_ ), NP, NQ, Z, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, CPADVAL+1.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), LWORK1, 1, RWORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0E+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PCFILLPAD( DESCA( CTXT_ ), LWORK, 1, WORK, LWORK, IPREPAD, $ IPOSTPAD, CPADVAL+4.1E+0 ) * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) * CALL PCHEEVD( 'V', UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), SIZEHEEVD, RWORK( 1+IPREPAD ), $ LWORK1, IWORK( 1+IPREPAD ), LIWORK, INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEEVD-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, CPADVAL ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEEVD-Z', NP, NQ, Z, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ CPADVAL+1.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEEVD-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEEVD-rWORK', LWORK1, 1, $ RWORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEEVD-WORK', LWORK, 1, WORK, $ LWORK, IPREPAD, IPOSTPAD, CPADVAL+4.1E+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEEVD-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * * Check INFO * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PCLANHE( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ RWORK )*EPS END IF * * Note that a couple key variables get redefined in PCSEPCHK * as described by this table: * * PCSEPTST name PCSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * Perform the |AQ - QE| test * CALL PSFILLPAD( DESCA( CTXT_ ), RSIZECHK, 1, RWORK, RSIZECHK, $ IPREPAD, IPOSTPAD, 4.3E+0 ) * CALL PCSEPCHK( N, N, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCA, A( 1+IPREPAD ), $ IA, JA, DESCA, WNEW( 1+IPREPAD ), $ RWORK( 1+IPREPAD ), RSIZECHK, TSTNRM, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCSDPCHK-rWORK', RSIZECHK, 1, $ RWORK, RSIZECHK, IPREPAD, IPOSTPAD, 4.3E+0 ) * IF( RES.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9995 ) END IF * * Perform the |QTQ - I| test * CALL PSFILLPAD( DESCA( CTXT_ ), RSIZEQTQ, 1, RWORK, RSIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3E+0 ) * * RES = 0 ULP = PSLAMCH( DESCA( CTXT_ ), 'P' ) CALL PCLASET( 'A', N, N, CZERO, CONE, A( 1+IPREPAD ), IA, JA, $ DESCA ) CALL PCGEMM( 'Conjugate transpose', 'N', N, N, N, CNEGONE, $ Z( 1+IPREPAD ), IA, JA, DESCA, Z( 1+IPREPAD ), IA, $ JA, DESCA, CONE, A( 1+IPREPAD ), IA, JA, DESCA ) NORM = PCLANGE( '1', N, N, A( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ) ) QTQNRM = NORM / ( REAL( MAX( N, 1 ) )*ULP ) IF( QTQNRM.GT.THRESH ) THEN RES = 1 END IF CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCSEPQTQ-rWORK', RSIZEQTQ, 1, $ RWORK, RSIZEQTQ, IPREPAD, IPOSTPAD, 4.3E+0 ) * IF( RES.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9994 ) END IF * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN .AND. N.GT.0 ) THEN * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN MAXERROR = 0.0 * DO 50 I = 1, N ERROR = ABS( WIN( I+IPREPAD )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 50 CONTINUE MINERROR = MIN( MAXERROR, MINERROR ) * IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF * * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 60 CONTINUE * RETURN * 9999 FORMAT( 'PCHEEVD returned INFO=', I7 ) 9998 FORMAT( 'PCSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PCSDPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PCHEEVD returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'PCHEEVD failed the |AQ -QE| test' ) 9994 FORMAT( 'PCHEEVD failed the |QTQ -I| test' ) * * End of PCSDPSUBTST * END scalapack-2.0.2/TESTING/EIG/pcsepchk.f000644 000766 000024 00000024114 11750130340 017375 0ustar00juliestaff000000 000000 * * SUBROUTINE PCSEPCHK( MS, NV, A, IA, JA, DESCA, EPSNORMA, THRESH, $ Q, IQ, JQ, DESCQ, C, IC, JC, DESCC, W, WORK, $ LWORK, TSTNRM, RESULT ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER IA, IC, IQ, JA, JC, JQ, LWORK, MS, NV, RESULT REAL EPSNORMA, THRESH, TSTNRM * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCC( * ), DESCQ( * ) REAL W( * ), WORK( * ) COMPLEX A( * ), C( * ), Q( * ) * .. * * Purpose * ======= * * Compute |AQ- QL| / (EPSNORMA * N) * where EPSNORMA = (abstol + eps)*norm(A) when called by pdsqpsubtst. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * MP = number of local rows in A, C and Q * MQ = number of local columns in A * NQ = number of local columns in C and Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in A, C and Q * Also, the number of global columns in A * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q. * * A (local input) COMPLEX pointer to an * array in local memory of dimension (LLD_A, LOCc(JA+N-1)). * This array contains the local pieces of the MS-by-MS * distributed test matrix A * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * EPSNORMA (input) REAL * abstol + eps * inf.norm(A) * Abstol is absolute tolerence for the eigenvalues and is set * in the calling routines, pdsepsubtst and pdsqpsubtst. * * THRESH (input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) COMPLEX array * global dimension (MS, NV), local dimension (DESCA(DLEN_), NQ) * * Contains the eigenvectors as computed by PCHEEVX * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Q. * * C (local workspace) COMPLEX array, * global dimension (NV, NV), local dimension (DESCA(DLEN_), MQ) * * Accumulator for computing AQ -QL * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * W (global input) REAL array, dimension (NV) * * Contains the computed eigenvalues * * WORK (local workspace) REAL array, * dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * TSTNRM (global output) REAL * |AQ- QL| / ( EPSNROMA * MS ) * * RESULT (global output) INTEGER * 0 if the test passes i.e. * |AQ -QL| / (abstol + eps * norm(A) ) <= n* THRESH * 1 if the test fails i.e. * |AQ -QL| / (abstol + eps * norm(A) ) > n * THRESH * * .. Local Scalars .. * INTEGER INFO, J, LOCALCOL, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ, PCOL REAL NORM * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, NEGONE PARAMETER ( ONE = 1.0E+0, NEGONE = -1.0E+0 ) * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC REAL PCLANGE EXTERNAL INDXG2L, INDXG2P, NUMROC, PCLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, CLACPY, CSSCAL, $ PCGEMM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * RESULT = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 CALL CHK1MAT( MS, 1, MS, 1, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IQ, JQ, DESCQ, 12, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IC, JC, DESCC, 16, INFO ) * IF( INFO.EQ.0 ) THEN * MP = NUMROC( MS, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IF( IQ.NE.1 ) THEN INFO = -10 ELSE IF( JQ.NE.1 ) THEN INFO = -11 ELSE IF( IA.NE.1 ) THEN INFO = -4 ELSE IF( JA.NE.1 ) THEN INFO = -5 ELSE IF( IC.NE.1 ) THEN INFO = -14 ELSE IF( JC.NE.1 ) THEN INFO = -15 ELSE IF( LWORK.LT.NQ ) THEN INFO = -19 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PCSEPCHK', -INFO ) RETURN END IF * * C = Q * W * CALL CLACPY( 'A', MP, NQ, Q, DESCQ( LLD_ ), C, DESCC( LLD_ ) ) * * DO 10 J = 1, NV PCOL = INDXG2P( J, DESCC( NB_ ), 0, 0, NPCOL ) LOCALCOL = INDXG2L( J, DESCC( NB_ ), 0, 0, NPCOL ) * IF( MYCOL.EQ.PCOL ) THEN CALL CSSCAL( MP, W( J ), C( ( LOCALCOL-1 )*DESCC( LLD_ )+ $ 1 ), 1 ) END IF 10 CONTINUE * * * C = C - A * Q * CALL PCGEMM( 'N', 'N', MS, NV, MS, NEGONE, A, 1, 1, DESCA, Q, 1, $ 1, DESCQ, ONE, C, 1, 1, DESCC ) * * Compute the norm of C * * NORM = PCLANGE( 'M', MS, NV, C, 1, 1, DESCC, WORK ) * * TSTNRM = NORM / EPSNORMA / MAX( MS, 1 ) * IF( TSTNRM.GT.THRESH .OR. ( TSTNRM-TSTNRM.NE.0.0E0 ) ) THEN RESULT = 1 END IF * * RETURN * * End of PCSEPCHK * END scalapack-2.0.2/TESTING/EIG/pcsepdriver.f000644 000766 000024 00000023563 10363532303 020136 0ustar00juliestaff000000 000000 * * PROGRAM PCSEPDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Parallel COMPLEX Hermitian eigenproblem test driver * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * The full tester requires approximately (5 n + 5 n^2/p + slop) * COMPLEX words and 6*n integer words. * So, TOTMEM should be set to at least 1.1 * 8 * (5n + 5n^2/p) * * WHAT WE TEST * ============ * * This routine tests PCHEEVX, the expert driver for the parallel * Hermitian eigenvalue problem. We would like to cover all * possible combinations of: matrix size, process configuration * (nprow and npcol), block size (nb), matrix type (??), range * of eigenvalue (all, by value, by position), sorting options, * and upper vs. lower storage. * * We intend to provide two types of test input files, an * installation test and a thorough test. * * We also intend that the reports be meaningful. Our input file * will allow multiple requests where each request is a cross product * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, CPLXSZ, NIN PARAMETER ( TOTMEM = 2000000, CPLXSZ = 8, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / CPLXSZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Functions .. REAL SLAMCH EXTERNAL SLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PCSEPREQ, PSLACHKIEEE, PSLASNBT * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PSLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) END IF * CALL PSLACHKIEEE( ISIEEE, SLAMCH( 'O' ), SLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) END IF GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Hermitian Eigendecomposition routines.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'Hermitian eigenvalue routine: PCHEEVX.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 )' ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )' ||Q^T*Q - I|| ' // '/ (N * eps)' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ ' : the QTQ norm is allowed to exceed THRESH' // $ ' for those eigenvectors' WRITE( NOUT, FMT = 9999 )' : which could not be ' // $ 'reorthogonalized for lack of workspace.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see pCSEPtst.f).' WRITE( NOUT, FMT = 9999 )'SUB : Subtests ' // $ '(see pCSEPtst).f' WRITE( NOUT, FMT = 9999 )'CHK : ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )'QTQ : ||Q^T*Q - I||/ (N * eps)' WRITE( NOUT, FMT = 9999 ) $ ' : when the adjusted QTQ exceeds THRESH', $ ' the adjusted QTQ norm is printed' WRITE( NOUT, FMT = 9999 ) $ ' : otherwise the true QTQ norm is printed' WRITE( NOUT, FMT = 9999 ) $ 'If NT>1, CHK and QTQ are the max over all ' // $ 'eigen request tests' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PCSEPREQ( NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, NSKIPPED, $ NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * 9999 FORMAT( A ) 9998 FORMAT( ' I am about to check to make sure that overflow' ) 9997 FORMAT( ' is handled in the ieee default manner. If this' ) 9996 FORMAT( ' is the last output you see, you should assume' ) 9995 FORMAT( ' that overflow caused a floating point exception.' ) 9994 FORMAT( ' In that case, we recommend that you add -DNO_IEEE' ) 9993 FORMAT( ' to the CDEFS line in SLmake.inc.' ) 9992 FORMAT( ' Alternatively, you could set CDEFS in SLmake.inc ' ) 9991 FORMAT( ' to enable the default ieee behaviour, However, this' ) 9990 FORMAT( ' may result in good or very bad performance.' ) 9989 FORMAT( ' Either signed zeroes or signed infinities ' ) 9988 FORMAT( ' work incorrectly or your system. Change your' ) 9987 FORMAT( ' SLmake.inc as suggested above.' ) * 9986 FORMAT( ' Your system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP SUB WALL CPU ', $ ' CHK QTQ CHECK' ) 9978 FORMAT( ' ----- --- --- --- --- --- -------- --------', $ ' --------- --------- -----' ) * * End of PCSEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pcsepqtq.f000644 000766 000024 00000025312 10363532303 017442 0ustar00juliestaff000000 000000 * * SUBROUTINE PCSEPQTQ( MS, NV, THRESH, Q, IQ, JQ, DESCQ, C, IC, JC, $ DESCC, PROCDIST, ICLUSTR, GAP, WORK, LWORK, $ QTQNRM, INFO, RES ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IC, INFO, IQ, JC, JQ, LWORK, MS, NV, RES REAL QTQNRM, THRESH * .. * .. Array Arguments .. * INTEGER DESCC( * ), DESCQ( * ), ICLUSTR( * ), $ PROCDIST( * ) REAL GAP( * ), WORK( * ) COMPLEX C( * ), Q( * ) * .. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Purpose * ======= * * Compute |I - QT * Q| / (ulp * n) * * Arguments * ========= * * NP = number of local rows in C * NQ = number of local columns in C and Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in Q * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) COMPLEX array, * global dimension (MS, NV), local dimension (LDQ, NQ) * * Contains the eigenvectors as computed by PCSTEIN * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Q. * * C (local workspace) COMPLEX array, * global dimension (NV, NV), local dimension (DESCC(DLEN_), NQ) * * Accumulator for computing I - QT * Q * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * W (input) REAL array, dimension (NV) * All procesors have an identical copy of W() * * Contains the computed eigenvalues * * PROCDIST (global input) INTEGER array dimension (NPROW*NPCOL+1) * Identifies which eigenvectors are the last to be computed * by a given process * * ICLUSTR (global input) INTEGER array dimension (2*P) * This input array contains indices of eigenvectors * corresponding to a cluster of eigenvalues that could not be * orthogonalized due to insufficient workspace. * This should be the output of PCSTEIN. * * GAP (global input) REAL array, dimension (P) * This input array contains the gap between eigenvalues whose * eigenvectors could not be orthogonalized. * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= 2 + MAX( DESCC( MB_ ), 2 )*( 2*NP0+MQ0 ) * Where: * NP0 = NUMROC( NV, DESCC( MB_ ), 0, 0, NPROW ) * MQ0 = NUMROC( NV, DESCC( NB_ ), 0, 0, NPCOL ) * * QTQNRM (global output) REAL * |QTQ -I| / EPS * * RES (global output) INTEGER * 0 if the test passes i.e. |I - QT * Q| / (ulp * n) <= THRESH * 1 if the test fails i.e. |I - QT * Q| / (ulp * n) > THRESH * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO, ONE, NEGONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, $ NEGONE = -1.0E+0 ) * .. * .. Intrinsic Functions .. * INTRINSIC CMPLX, MAX, REAL * .. * .. Local Scalars .. INTEGER CLUSTER, FIRSTP, IMAX, IMIN, JMAX, JMIN, LWMIN, $ MQ0, MYCOL, MYROW, NEXTP, NP0, NPCOL, NPROW REAL NORM, QTQNRM2, ULP * .. * .. External Functions .. INTEGER NUMROC REAL PCLANGE, PSLAMCH EXTERNAL NUMROC, PCLANGE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCGEMM, PCLASET, $ PCMATADD, PXERBLA * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * RES = 0 ULP = PSLAMCH( DESCC( CTXT_ ), 'P' ) * CALL BLACS_GRIDINFO( DESCC( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 CALL CHK1MAT( MS, 1, MS, 2, IQ, JQ, DESCQ, 7, INFO ) CALL CHK1MAT( NV, 1, MS, 2, IC, JC, DESCC, 11, INFO ) * IF( INFO.EQ.0 ) THEN NP0 = NUMROC( NV, DESCC( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( NV, DESCC( NB_ ), 0, 0, NPCOL ) * LWMIN = 2 + MAX( DESCC( MB_ ), 2 )*( 2*NP0+MQ0 ) * IF( IQ.NE.1 ) THEN INFO = -5 ELSE IF( JQ.NE.1 ) THEN INFO = -6 ELSE IF( IC.NE.1 ) THEN INFO = -9 ELSE IF( JC.NE.1 ) THEN INFO = -10 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -16 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCC( CTXT_ ), 'PCSEPQTQ', -INFO ) RETURN END IF * * C = Identity matrix * CALL PCLASET( 'A', NV, NV, ZERO, ONE, C, IC, JC, DESCC ) * * C = C - QT * Q * IF( NV*MS.GT.0 ) THEN CALL PCGEMM( 'Conjugate transpose', 'N', NV, NV, MS, NEGONE, Q, $ 1, 1, DESCQ, Q, 1, 1, DESCQ, ONE, C, 1, 1, DESCC ) END IF * * Allow for poorly orthogonalized eigenvectors for large clusters * NORM = PCLANGE( '1', NV, NV, C, 1, 1, DESCC, WORK ) QTQNRM = NORM / ( REAL( MAX( MS, 1 ) )*ULP ) * CLUSTER = 1 10 CONTINUE DO 20 FIRSTP = 1, NPROW*NPCOL IF( PROCDIST( FIRSTP ).GE.ICLUSTR( 2*( CLUSTER-1 )+1 ) ) $ GO TO 30 20 CONTINUE 30 CONTINUE * IMIN = ICLUSTR( 2*CLUSTER-1 ) JMAX = ICLUSTR( 2*CLUSTER ) * * IF( IMIN.EQ.0 ) $ GO TO 60 * DO 40 NEXTP = FIRSTP, NPROW*NPCOL IMAX = PROCDIST( NEXTP ) JMIN = IMAX + 1 * * CALL PCMATADD( IMAX-IMIN+1, JMAX-JMIN+1, ZERO, C, IMIN, JMIN, $ DESCC, CMPLX( GAP( CLUSTER ) / 0.01E+0 ), C, $ IMIN, JMIN, DESCC ) CALL PCMATADD( JMAX-JMIN+1, IMAX-IMIN+1, ZERO, C, JMIN, IMIN, $ DESCC, CMPLX( GAP( CLUSTER ) / 0.01E+0 ), C, $ JMIN, IMIN, DESCC ) IMIN = IMAX * IF( ICLUSTR( 2*CLUSTER ).LT.PROCDIST( NEXTP+1 ) ) $ GO TO 50 40 CONTINUE 50 CONTINUE * CLUSTER = CLUSTER + 1 GO TO 10 60 CONTINUE * * Compute the norm of C * NORM = PCLANGE( '1', NV, NV, C, 1, 1, DESCC, WORK ) * QTQNRM2 = NORM / ( REAL( MAX( MS, 1 ) )*ULP ) * IF( QTQNRM2.GT.THRESH ) THEN RES = 1 QTQNRM = QTQNRM2 END IF RETURN * * End of PCSEPQTQ * END scalapack-2.0.2/TESTING/EIG/pcseprdriver.f000644 000766 000024 00000021335 11623527140 020316 0ustar00juliestaff000000 000000 PROGRAM PCSEPRDRIVER * * Parallel COMPLEX symmetric eigenproblem test driver for PCSYEVR * IMPLICIT NONE * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. REALSZ * indicates the length in bytes on the given platform for a number, * real for SINGLE/DOUBLE PRECISION, and complex for COMPLEX/COMPLEX*16. * For example, on a standard system, the length of a * REAL is 8, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * TESTS PERFORMED * =============== * * This routine performs tests for combinations of: matrix size, process * configuration (nprow and npcol), block size (nb), * matrix type, range of eigenvalue (all, by value, by index), * and upper vs. lower storage. * * It returns an error message when heterogeneity is detected. * * The input file allows multiple requests where each one is * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, REALSZ, NIN PARAMETER ( TOTMEM = 100000000, REALSZ = 8, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / REALSZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Functions .. REAL SLAMCH EXTERNAL SLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PSLACHKIEEE, PSLASNBT, PCSEPRREQ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEPR.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PSLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) END IF * CALL PSLACHKIEEE( ISIEEE, SLAMCH( 'O' ), SLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'Test ScaLAPACK symmetric eigendecomposition routine.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'symmetric eigenvalue routine: PCSYEVR.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 )' ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )' ||Q^T*Q - I|| ' // '/ (N * eps)' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT = PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see PCSEPRTST).' WRITE( NOUT, FMT = 9999 )'SUB : Subtests (Y/N).' WRITE( NOUT, FMT = 9999 )'WALL : Wallclock time.' WRITE( NOUT, FMT = 9999 )'CPU : CPU time.' WRITE( NOUT, FMT = 9999 )'CHK : ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )'QTQ : ||Q^T*Q - I||/ (N * eps)' WRITE( NOUT, FMT = 9999 ) $ ' : when the adjusted QTQ norm exceeds THRESH', $ ' it is printed,' WRITE( NOUT, FMT = 9999 ) $ ' otherwise the true QTQ norm is printed.' WRITE( NOUT, FMT = 9999 ) $ ' : If more than one test is done, CHK and QTQ ' WRITE( NOUT, FMT = 9999 ) $ ' are the max over all eigentests performed.' WRITE( NOUT, FMT = 9999 ) $ 'TEST : EVR - testing PCSYEVR' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PCSEPRREQ( HETERO, NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * c CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', ' ') * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * 9999 FORMAT( A ) 9997 FORMAT( 'Check if overflow is handled in ieee default manner.' ) 9996 FORMAT( 'If this is the last output you see, you should assume') 9995 FORMAT( 'that overflow caused a floating point exception.' ) * 9986 FORMAT( 'Test ok. The system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP SUB WALL CPU ', $ ' CHK QTQ CHECK TEST' ) 9978 FORMAT( ' ----- --- --- --- --- --- -------- --------', $ ' --------- --------- ----- ----' ) * * End of PCSEPRDRIVER * END scalapack-2.0.2/TESTING/EIG/pcsepreq.f000644 000766 000024 00000023145 10363532303 017426 0ustar00juliestaff000000 000000 * * SUBROUTINE PCSEPREQ( NIN, MEM, MEMSIZE, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX MEM( MEMSIZE ) * .. * * Purpose * ======= * * PCSEPREQ performs one request from the input file 'SEP.dat' * A request is the cross product of the specifications in the * input file. PCSEPREQ prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEP.dat' * * MEM (local input) COMPLEX ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, INTGSZ PARAMETER ( CPLXSZ = 8, INTGSZ = 4 ) INTEGER REALSZ PARAMETER ( REALSZ = 4 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IMIDPAD, INITCON, IPOSTPAD, $ IPREPAD, ISIZEHEEVX, ISIZESUBTST, ISIZETST, $ LDA, LLRWORK, MATSIZE, MATTYPE, MYCOL, MYROW, $ N, NB, NMATSIZES, NMATTYPES, NNODES, NP, NPCOL, $ NPCONFIGS, NPROW, NQ, NUPLOS, ORDER, PCONFIG, $ PTRA, PTRCOPYA, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRRWORK, PTRW, PTRW2, PTRWORK, PTRZ, $ RES, RSIZECHK, RSIZEHEEVX, RSIZEQTQ, $ RSIZESUBTST, RSIZETST, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST, UPLO,SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD REAL ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PCLASIZESEP, PCSEPTST, PSSEPINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PSSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * * DO 40 MATSIZE = 1, NMATSIZES * DO 30 PCONFIG = 1, NPCONFIGS * DO 20 MATTYPE = 1, NMATTYPES * DO 10 UPLO = 1, NUPLOS * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, $ MYCOL ) IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PCLASIZESEP( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, $ RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, $ SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, CPLXSZ / REALSZ ) PTRWORK = PTRW2 + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, CPLXSZ / REALSZ ) PTRGAP = PTRWORK + SIZETST + IPREPAD + IPOSTPAD PTRIFAIL = PTRGAP + ICEIL( NPROW*NPCOL+IPREPAD+ $ IPOSTPAD, CPLXSZ / REALSZ ) PTRICLUS = PTRIFAIL + ICEIL( N+IPREPAD+IPOSTPAD, $ CPLXSZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, CPLXSZ / INTGSZ ) PTRRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, CPLXSZ / INTGSZ ) LLRWORK = ( MEMSIZE-PTRRWORK+1 )*CPLXSZ / REALSZ C LLRWORK = ( MEMSIZE-PTRRWORK-IPREPAD-IPOSTPAD+1) C $ *CPLXSZ / REALSZ NTESTS = NTESTS + 1 IF( LLRWORK.LT.RSIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PCSEPTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), SUBTESTS, $ THRESH, N, ABSTOL, ISEED, $ MEM( PTRA ), MEM( PTRCOPYA ), $ MEM( PTRZ ), LDA, MEM( PTRW ), $ MEM( PTRW2 ), MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ SIZETST, MEM( PTRRWORK ), $ LLRWORK, MEM( PTRIWRK ), $ ISIZETST, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT=*)'pCSEPREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE END IF * * RETURN * * End of PCDSEPREQ * END scalapack-2.0.2/TESTING/EIG/pcseprreq.f000644 000766 000024 00000021755 11623527140 017620 0ustar00juliestaff000000 000000 SUBROUTINE PCSEPRREQ( HETERO, NIN, MEM, MEMSIZE, NOUT, ISEED, $ NTESTS, NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER HETERO INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX MEM( MEMSIZE ) * * Purpose * ======= * * PCSEPRREQ performs one request from the input file 'SEPR.dat' * A request is the cross product of the specifications in the * input file. It prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEPR.dat' * * MEM (local input ) COMPLEX ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER REALSZ, INTGSZ PARAMETER ( REALSZ = 4, INTGSZ = 4 ) INTEGER KMPXSZ PARAMETER ( KMPXSZ = 8 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IMIDPAD, INITCON, IPOSTPAD, $ IPREPAD, ISIZESUBTST, ISIZEEVR, ISIZETST, $ LDA, LLWORK, MATSIZE, MATTYPE, MYCOL, MYROW, N, $ NB, NMATSIZES, NMATTYPES, NNODES, NP, NPCOL, $ NPCONFIGS, NPROW, NQ, NUPLOS, ORDER, PCONFIG, $ PTRA, PTRCOPYA, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRW, PTRW2, PTRWORK, PTRZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZEEVR, $ SIZETMS, SIZETST, UPLO INTEGER PTRRWORK, RSIZEEVR, RSIZESUBTST, RSIZETST * REAL ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PCLASIZESEPR, PSSEPINFO, PCSEPRTST * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PSSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * DO 40 MATSIZE = 1, NMATSIZES * DO 30 PCONFIG = 1, NPCONFIGS * DO 20 MATTYPE = 1, NMATTYPES * DO 10 UPLO = 1, NUPLOS * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, $ MYCOL ) * IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PCLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, RSIZEEVR, $ ISIZEEVR, SIZESUBTST, $ RSIZESUBTST, ISIZESUBTST, $ SIZETST, RSIZETST, ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, KMPXSZ / REALSZ ) PTRWORK = PTRW2 + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, KMPXSZ / REALSZ ) PTRGAP = PTRWORK + SIZETST + IPREPAD + IPOSTPAD PTRIFAIL = PTRGAP + ICEIL( NPROW*NPCOL+IPREPAD+ $ IPOSTPAD, KMPXSZ / REALSZ ) PTRICLUS = PTRIFAIL + ICEIL( N+IPREPAD+IPOSTPAD, $ KMPXSZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, KMPXSZ / INTGSZ ) PTRRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, KMPXSZ / INTGSZ ) LLWORK = ( MEMSIZE-PTRRWORK+1 )*KMPXSZ / REALSZ NTESTS = NTESTS + 1 IF( LLWORK.LT.RSIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PCSEPRTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), SUBTESTS, $ THRESH, N, ABSTOL, ISEED, $ MEM( PTRA ), MEM( PTRCOPYA ), $ MEM( PTRZ ), LDA, MEM( PTRW ), $ MEM( PTRW2 ), MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ SIZETST, MEM( PTRRWORK ), $ LLWORK, MEM( PTRIWRK ), $ ISIZETST, HETERO, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * )' PCSEPRREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of PCSEPRREQ * END scalapack-2.0.2/TESTING/EIG/pcseprsubtst.f000644 000766 000024 00000072335 11623527140 020355 0ustar00juliestaff000000 000000 SUBROUTINE PCSEPRSUBTST( WKNOWN, JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, IA, JA, $ DESCA, WIN, WNEW, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK, LWORK, RWORK, $ LRWORK, LWORK1, IWORK, LIWORK, RESULT, $ TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, IPOSTPAD, IPREPAD, IU, JA, LIWORK, $ LWORK, LWORK1, N, NOUT, RESULT REAL ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU INTEGER LRWORK * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) COMPLEX A( * ), COPYA( * ), WORK( * ), Z( * ) REAL GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) * .. * * Purpose * ======= * * PCSEPRSUBTST calls PCSYEVR and then tests its output. * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues computed. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 100 or 250. In particular, * it should not depend on the size of the matrix. * It must be at least zero. * * ABSTOL (global input) REAL * The absolute tolerance for the residual test. * * A (local workspace) COMPLEX array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * The test matrix, which is subsequently overwritten. * A is distributed in a 2D-block cyclic manner over both rows * and columns. * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) COMPLEX array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) COMPLEX array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PCSEPCHK and PCSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) REAL array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) REAL array, dimension (N) * The computed eigenvalues. * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) REAL array, * dimension (NPROW*NPCOL) * * WORK (local workspace) COMPLEX array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * RWORK has already been padded front and back, * use RWORK(1+IPREPAD) * * LRWORK (local input) INTEGER * The actual length of the array RWORK after padding. * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to the eigensolver. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call. * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) REAL * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) REAL * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER DLEN_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( DLEN_ = 9, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285E0, FIVE = 5.0E0, $ NEGONE = -1.0E0 ) COMPLEX ZPADVAL PARAMETER ( ZPADVAL = ( 13.989E0, 1.93E0 ) ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZESUBTST, ISIZEEVR, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZEEVR, SIZETMS, $ SIZETST, VALSIZE, VECSIZE INTEGER RSIZEEVR, RSIZESUBTST, RSIZETST REAL EPS, EPSNORMA, ERROR, MAXERROR, MAXVU, $ MINERROR, MINVL, NORMWIN, OLDVL, OLDVU, $ SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), ISEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH, PCLANHE EXTERNAL LSAME, NUMROC, PSLAMCH, PCLANHE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLACPY, DESCINIT, IGAMN2D, $ IGAMX2D, PCCHEKPAD, PCELSET, PCFILLPAD, $ PCHEEVR, PCLASIZEHEEVR, PCLASIZESEPR, PCSEPCHK, $ PCSEPQTQ, PICHEKPAD, PIFILLPAD, PSCHEKPAD, $ PSFILLPAD, SGAMN2D, SGAMX2D, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * CALL PCLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, RSIZEEVR, ISIZEEVR, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, $ SIZETST, RSIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PSLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that no information from previous calls is used * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 RWORK( I+IPREPAD ) = 14.3E0 10 CONTINUE * DO 15 I = 1, LWORK, 1 WORK( I+IPREPAD ) = ( 15.63E0, 1.1E0 ) 15 CONTINUE * DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159E0 30 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF (LSAME( RANGE, 'V' ) ) THEN * WRITE(*,*) 'VL VU = ', VL, ' ', VU END IF IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL * WRITE(*,*) 'MINVL = ', MINVL, ' MAXVU = ', MAXVU * WRITE(*,*) 'WIN = ', WIN( 1 ) MINIL = 1 MAXIU = 0 DO 40 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 40 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * ISEED( 1 ) = 1 * CALL PCLASIZEHEEVR( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL CLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PCFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, ZPADVAL ) * CALL PCFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, ZPADVAL+1.0E0 ) * * WRITE(*,*) ' NP = ', NP, ' MQ = ', MQ, ' LDZ = ', DESCZ( LLD_ ), * $ ' IPREPAD = ', IPREPAD, ' IPOSTPAD = ', IPOSTPAD, * $ ' MAXEIGS = ', MAXEIGS * WRITE(*,*) ' PADZ( 1 ) = ', Z( 1 ), ' PADZ( 2 ) = ', Z( 2 ), * $ ' PADZ( 3 ) = ', Z( 3 ), ' PADZ( 4 ) = ', Z( 4 ) * CALL PSFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0E0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0E0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), LWORK1, 1, RWORK,LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0E0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PCFILLPAD( DESCA( CTXT_ ), LWORK, 1, WORK, LWORK, IPREPAD, $ IPOSTPAD, ZPADVAL+4.1E0 ) * * Make sure that PCHEEVR does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, MAXEIGS, 1 CALL PCELSET( Z( 1+IPREPAD ), I, J, DESCA, $ ( 13.0E0, 1.34E0 ) ) 50 CONTINUE 60 CONTINUE * * Reset and start the timer * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) ********************************* * * Main call to PCHEEVR * CALL PCHEEVR( JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ VL, VU, IL, IU, M, NZ, WNEW( 1+IPREPAD ), $ Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), SIZEEVR, $ RWORK( 1+IPREPAD ), LWORK1, $ IWORK( 1+IPREPAD ), LIWORK, INFO ) * ********************************* * * Stop timer * CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * * Indicate that there are no unresolved clusters. * This is necessary so that the tester * (adapted from the one originally made for PSSYEVX) * works correctly. ICLUSTR( 1+IPREPAD ) = 0 * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEEVR-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, ZPADVAL ) * CALL PCCHEKPAD( DESCZ( CTXT_ ), 'PCHEEVR-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ ZPADVAL+1.0E0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEEVR-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0E0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEEVR-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0E0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEEVR-RWORK',LWORK1, 1, $ RWORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0E0 ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEEVR-WORK',LWORK, 1, $ WORK, LWORK, IPREPAD, IPOSTPAD, $ ZPADVAL+4.1E0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEEVR-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEEVR-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEEVR-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * If we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PCLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * Check INFO * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) WRITE( NOUT,*) 'M = ', M, '\n', 'N = ', N RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) WRITE( NOUT,*) 'IL = ', IL, ' IU = ', IU, ' M = ', M END IF RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Ensure that different processes return the same eigenvalues * DO 70 I = 1, M RWORK( I ) = WNEW( I+IPREPAD ) RWORK( I+M ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, RWORK, M, $ 1, 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ RWORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 80 I = 1, M IF( RESULT.EQ.0 .AND. ( ABS( RWORK( I )-RWORK( M+ $ I ) ).GT.FIVE*EPS*ABS( RWORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 80 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 90 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 100 NCLUSTERS = NCLUSTERS + 1 90 CONTINUE 100 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 110 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 110 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * DO 120 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 120 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 150 * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PCLANHE( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ RWORK )*EPS END IF * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |A Z - Z W| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, RWORK,SIZECHK, $ IPREPAD, IPOSTPAD, 4.3E0 ) * CALL PCSEPCHK( N, NZ, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), RWORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCSEPCHK-RWORK',SIZECHK, 1, $ RWORK,SIZECHK, IPREPAD, IPOSTPAD, 4.3E0 ) * IF( RES.NE.0 ) $ RESULT = 1 * * Perform the |QTQ - I| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1,RWORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3E0 ) * * CALL PCSEPQTQ( N, NZ, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1+IPREPAD+1 ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ),RWORK( IPREPAD+1 ), SIZEQTQ, $ QTQNRM, INFO, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSEPQTQ-RWORK',SIZEQTQ, 1, $ RWORK,SIZEQTQ, IPREPAD, IPOSTPAD, 4.3E0 ) * IF( RES.NE.0 ) $ RESULT = 1 * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that the right eigenvalues have been obtained * IF( WKNOWN ) THEN * Set up MYIL if necessary MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 140 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 130 I = 1, M * WRITE(*,*) 'WIN WNEW = ', WIN( I+MYIL-1 ), * $ WNEW( I+IPREPAD ) ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 130 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 140 CONTINUE * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what was computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * RETURN * 9999 FORMAT( 'PCHEEVR returned INFO=', I7 ) 9998 FORMAT( 'PCSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PCSEPRSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PCHEEVR returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PCHEEVR' ) 9981 FORMAT( 'NZ altered by PCHEEVR with JOBZ=N' ) * * End of PCSEPRSUBTST * END scalapack-2.0.2/TESTING/EIG/pcseprtst.f000644 000766 000024 00000073761 11623527140 017647 0ustar00juliestaff000000 000000 SUBROUTINE PCSEPRTST(DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, $ ORDER, ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, HETERO, NOUT, INFO ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER HETERO, SUBTESTS, UPLO INTEGER INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, LWORK, $ MATTYPE, N, NOUT, ORDER INTEGER LRWORK REAL ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) REAL GAP( * ), WIN( * ), WNEW( * ), RWORK( * ) COMPLEX A( LDA, * ), COPYA( LDA, * ), $ WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PCSEPRTST builds a random matrix and runs PCHEEVR to * compute the eigenvalues and eigenvectors. Then it performs two tests * to determine if the result is good enough. The two tests are: * |AQ -QL| / (abstol + ulp * norm(A) ) * and * |QT * Q - I| / ulp * norm(A) * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) A matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) REAL * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) COMPLEX array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * The test matrix, which is then overwritten. * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * * COPYA (local workspace) COMPLEX array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * Z (local workspace) COMPLEX array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PCSEPCHK and PCSEPQTQ * * W (local workspace) REAL array, dimension (N) * On normal exit, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * Not used, only for backward compatibility * * WORK (local workspace) COMPLEX array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PCLASIZESEPR * * RWORK (local workspace) REAL array, dimension (LRWORK) * * LRWORK (local input) INTEGER * The length of the array WORK. LRWORK >= RSIZETST as * returned by P@(CRPF)LASIZESEPR * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PCLASIZESEPR * * HETERO (input) INTEGER * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER CTXT_, MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( CTXT_ = 2, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL HALF, ONE, TEN, ZERO PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, $ TEN = 10.0E0, HALF = 0.5E0 ) COMPLEX PADVAL PARAMETER ( PADVAL = ( 19.25E0, 1.1E1 ) ) COMPLEX ZZERO PARAMETER ( ZZERO = ( 0.0E0, 0.0E0 ) ) COMPLEX ZONE PARAMETER ( ZONE = ( 1.0E0, 0.0E0 ) ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IHETERO, IINFO, IL, IMODE, IN, $ INDD, INDWORK, ISIZESUBTST, ISIZEEVR, $ ISIZETST, ITYPE, IU, J, LLWORK, LEVRSIZE, $ MAXSIZE, MYCOL, MYROW, NB, NGEN, NLOC, $ NNODES, NP, NPCOL, NPROW, NQ, RES, SIZECHK, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, $ SIZESUBTST, SIZEEVR, SIZETMS, $ SIZETST, VALSIZE, VECSIZE INTEGER INDRWORK, LLRWORK, RSIZEEVR, RSIZESUBTST, $ RSIZETST REAL ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL SLARAN, PSLAMCH EXTERNAL SLARAN, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, CLATMS, IGAMX2D, $ IGEBR2D, IGEBS2D, PCCHEKPAD, PCELSET, $ PCFILLPAD, PCLASET, PCLASIZEHEEVR, $ PCLASIZESEPR, PCLATMS, PCMATGEN, PCSEPRSUBTST, $ SLABAD, SLASRT, SLCOMBINE * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, INT, MAX, MIN, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * INFO = 0 PASSED = 'PASSED EVR' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * Distribute HETERO across processes * IF( IAM.EQ.0 ) THEN IF( LSAME( HETERO, 'Y' ) ) THEN IHETERO = 2 ELSE IHETERO = 1 END IF CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1, 0, 0 ) END IF IF( IHETERO.EQ.2 ) THEN HETERO = 'Y' ELSE HETERO = 'N' END IF * * Make sure that there is enough memory * CALL PCLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, RSIZEEVR, ISIZEEVR, $ SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, RSIZETST, ISIZETST ) IF( LRWORK.LT.RSIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDRWORK = INDD + N INDWORK = 1 LLWORK = LWORK - INDWORK + 1 LLRWORK = LRWORK - INDRWORK + 1 * ULP = PSLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PSLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL SLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / REAL( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log Hermitian, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random Hermitian * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N RWORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PCLASET( 'All', N, N,ZZERO,ZZERO, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N RWORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PCLASET( 'All', N, N,ZZERO,ZONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0E0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S',RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0E0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * Hermitian, eigenvalues specified * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0E0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S',RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0E0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * Hermitian, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PCMATGEN( DESCA( CTXT_ ), 'H', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0E0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S',RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0E0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PCLASET( 'All', N, N,ZZERO,ZZERO, COPYA, 1, 1, DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( SLARAN( ISEED )*REAL( NLOC ) ), N-NGEN ) * CALL CLATMS( IN, IN, 'S', ISEED, 'P',RWORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PCELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PCELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PCELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PCELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2,... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 RWORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0E0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S',RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0E0 ) * ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL SLASRT( 'I', N,RWORK( INDD ), IINFO ) * CALL PCLASIZEHEEVR( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED,RWORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) LEVRSIZE = MIN( MAXSIZE, LLRWORK ) * CALL PCSEPRSUBTST( WKNOWN, 'v', 'a', UPLO, N, VL, VU, IL, IU, $ THRESH, ABSTOL, A, COPYA, Z, 1, 1, DESCA, $ RWORK( INDD ), WIN, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK( INDWORK ), LLWORK, $ RWORK( INDRWORK ), LLRWORK, $ LEVRSIZE, IWORK, ISIZEEVR, RES, TSTNRM, $ QTQNRM, NOUT ) * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PCLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PCSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 1' INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * Use PCLASIZEHEEVR to choose IL and IU. * CALL PCLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PCSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 2' INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PCLASIZEHEEVR to choose IL and IU for us. * CALL PCLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VECSIZE * CALL PCSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PCLASIZEHEEVR to choose IL and IU for us. * CALL PCLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PCSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVR to choose VL and VU for us. * CALL PCLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VECSIZE * CALL PCSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF END IF * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 .AND. .FALSE. ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF C WRITE(*,*)'************************************************' END IF * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, $ F8.2, 1X, F8.2, 1X, G9.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 21X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 21X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 32X, $ 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) C 9984 FORMAT( ' IBTYPE=', I8 ) C 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) C 9980 FORMAT( ' Increase TOTMEM in PCSEPRDRIVER' ) * * End of PCSEPRTST * END scalapack-2.0.2/TESTING/EIG/pcsepsubtst.f000644 000766 000024 00000072535 10363532303 020172 0ustar00juliestaff000000 000000 * * SUBROUTINE PCSEPSUBTST( WKNOWN, JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, IA, JA, $ DESCA, WIN, WNEW, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK, LWORK, RWORK, $ LRWORK, LWORK1, IWORK, LIWORK, RESULT, $ TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, IPOSTPAD, IPREPAD, IU, JA, LIWORK, $ LRWORK, LWORK, LWORK1, N, NOUT, RESULT REAL ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) REAL GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) COMPLEX A( * ), COPYA( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PCSEPSUBTST calls PCHEEVX and then tests the output of * PCHEEVX * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PCHEEVX when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PCSEPSUBTST * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call to PCSEPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PCSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) COMPLEX array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PCHEEVX for a description of block cyclic layout. * The test matrix, which is then modified by PCHEEVX * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) COMPLEX array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) COMPLEX array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PCSEPCHK and PCSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) REAL array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) REAL array, dimension (N) * The eigenvalues as copmuted by this call to PCHEEVX * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) REAL array, * dimension (NPROW*NPCOL) * * WORK (local workspace) COMPLEX array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * RWORK (local workspace) REAL array, dimension (LRWORK) * RWORK has already been padded front and back, * use RWORK(1+IPREPAD) * * LRWORK (local input) INTEGER * The actual length of the array RWORK after padding. * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PCHEEVX * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PCHEEVX * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) REAL * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) REAL * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285E+0, FIVE = 5.0E+0, $ NEGONE = -1.0E+0 ) COMPLEX CPADVAL PARAMETER ( CPADVAL = ( 13.989E+0, 1.93E+0 ) ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ RSIZECHK, RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, $ RSIZETST, SIZEHEEVX, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZESUBTST, SIZETMS, SIZETST, VALSIZE, $ VECSIZE, SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD REAL EPS, EPSNORMA, ERROR, MAXERROR, MAXVU, $ MINERROR, MINVL, NORMWIN, OLDVL, OLDVU, ORFAC, $ SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), DSEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC REAL PCLANHE, PSLAMCH EXTERNAL LSAME, NUMROC, PCLANHE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLACPY, DESCINIT, IGAMN2D, $ IGAMX2D, PCCHEKPAD, PCELSET, PCFILLPAD, $ PCHEEVX, PCLASIZEHEEVX, PCLASIZESEP, PCSEPCHK, $ PCSEPQTQ, PICHEKPAD, PIFILLPAD, PSCHEKPAD, $ PSFILLPAD, SGAMN2D, SGAMX2D, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PCLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PSLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 RWORK( I+IPREPAD ) = 14.3E+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE DO 30 I = 1, LWORK, 1 WORK( I+IPREPAD ) = ( 15.63E+0, 1.1E+0 ) 30 CONTINUE * DO 40 I = 1, N WNEW( I+IPREPAD ) = 3.14159E+0 40 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL MINIL = 1 MAXIU = 0 DO 50 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 50 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 160 RESULT = 0 * * * DSEED is not used in this call to PCLASIZEHEEVX, the * following line just makes ftnchek happy. * DSEED( 1 ) = 1 * CALL PCLASIZEHEEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL CLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PCFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, CPADVAL ) * CALL PCFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, CPADVAL+1.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), LWORK1, 1, RWORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0E+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PCFILLPAD( DESCA( CTXT_ ), LWORK, 1, WORK, LWORK, IPREPAD, $ IPOSTPAD, CPADVAL+4.1E+0 ) * * Make sure that PCHEEVX does not cheat (i.e. use answers * already computed.) * DO 70 I = 1, N, 1 DO 60 J = 1, MAXEIGS, 1 CALL PCELSET( Z( 1+IPREPAD ), I, J, DESCA, $ ( 13.0E+0, 1.34E+0 ) ) 60 CONTINUE 70 CONTINUE * ORFAC = -1.0E+0 * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PCHEEVX( JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ VL, VU, IL, IU, ABSTOL, M, NZ, WNEW( 1+IPREPAD ), $ ORFAC, Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), SIZEHEEVX, RWORK( 1+IPREPAD ), $ LWORK1, IWORK( 1+IPREPAD ), LIWORK, $ IFAIL( 1+IPREPAD ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEEVX-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, CPADVAL ) * CALL PCCHEKPAD( DESCZ( CTXT_ ), 'PCHEEVX-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ CPADVAL+1.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEEVX-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEEVX-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCHEEVX-rWORK', LWORK1, 1, $ RWORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCHEEVX-WORK', LWORK, 1, WORK, $ LWORK, IPREPAD, IPOSTPAD, CPADVAL+4.1E+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEEVX-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEEVX-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PCHEEVX-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * * Since we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9992 ) RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Make sure that different processes return the same eigenvalues * DO 80 I = 1, M RWORK( I ) = WNEW( I+IPREPAD ) RWORK( I+M ) = WNEW( I+IPREPAD ) 80 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, RWORK, M, $ 1, 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ RWORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 90 I = 1, M * IF( RESULT.EQ.0 .AND. ( ABS( RWORK( I )-RWORK( M+ $ I ) ).GT.FIVE*EPS*ABS( RWORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 90 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 100 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 110 NCLUSTERS = NCLUSTERS + 1 100 CONTINUE 110 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 120 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 120 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * * DO 130 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 130 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 160 * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PCLANHE( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ RWORK )*EPS END IF * * Note that a couple key variables get redefined in PCSEPCHK * as described by this table: * * PCSEPTST name PCSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |AQ - QE| test * CALL PSFILLPAD( DESCA( CTXT_ ), RSIZECHK, 1, RWORK, $ RSIZECHK, IPREPAD, IPOSTPAD, 4.3E+0 ) * CALL PCSEPCHK( N, NZ, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), RWORK( 1+IPREPAD ), $ RSIZECHK, TSTNRM, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCSEPCHK-rWORK', RSIZECHK, $ 1, RWORK, RSIZECHK, IPREPAD, IPOSTPAD, $ 4.3E+0 ) * IF( RES.NE.0 ) $ RESULT = 1 * * Perform the |QTQ - I| test * CALL PSFILLPAD( DESCA( CTXT_ ), RSIZEQTQ, 1, RWORK, $ RSIZEQTQ, IPREPAD, IPOSTPAD, 4.3E+0 ) * * CALL PCSEPQTQ( N, NZ, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1+IPREPAD+1 ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), RWORK( IPREPAD+1 ), $ RSIZEQTQ, QTQNRM, INFO, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PCSEPQTQ-rWORK', RSIZEQTQ, $ 1, RWORK, RSIZEQTQ, IPREPAD, IPOSTPAD, $ 4.3E+0 ) * IF( RES.NE.0 ) $ RESULT = 1 * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN ) THEN * * Set up MYIL if necessary * MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 150 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 140 I = 1, M ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 140 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 150 CONTINUE * * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what PxHEEVX computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 160 CONTINUE * * RETURN * 9999 FORMAT( 'PCHEEVX returned INFO=', I7 ) 9998 FORMAT( 'PCSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PCSEPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PCHEEVX returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PCHEEVX' ) 9981 FORMAT( 'NZ altered by PCHEEVX with JOBZ=N' ) * * End of PCSEPSUBTST * END scalapack-2.0.2/TESTING/EIG/pcseptst.f000644 000766 000024 00000125334 11622500733 017455 0ustar00juliestaff000000 000000 * SUBROUTINE PCSEPTST( DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, $ ORDER, ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, RWORK, LRWORK, IWORK, LIWORK, $ NOUT, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 15, 2002 * * .. Scalar Arguments .. CHARACTER SUBTESTS, UPLO INTEGER INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, LRWORK, $ LWORK, MATTYPE, N, NOUT, ORDER REAL ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) REAL GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) COMPLEX A( LDA, * ), COPYA( LDA, * ), WORK( * ), $ Z( LDA, * ) * .. * * Purpose * ======= * * PCSEPTST builds a random matrix, runs PCHEEVX() to * compute the eigenvalues * and eigenvectors and then performs two tests to * determine if the result * is good enough. The two tests are: * |AQ -QL| / (abstol + ulp * norm(A) ) * and * |QT * Q - I| / ulp * norm(A) * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) Hermitian matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PCSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) COMPLEX array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * The test matrix, which is then modified by PCHEEVX * * COPYA (local workspace) COMPLEX array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * Z (local workspace) COMPLEX array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PCSEPCHK and PCSEPQTQ * * W (local workspace) REAL array, dimension (N) * On normal exit from PCHEEVX, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * * WORK (local workspace) COMPLEX array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PCLASIZESEP * * RWORK (local workspace) COMPLEX array, dimension (LWORK) * * LRWORK (local input) INTEGER * The length of the array WORK. LRWORK >= RSIZETST as * returned by PCLASIZESEP * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PCLASIZESEP * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE, TEN, HALF PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TEN = 10.0E+0, $ HALF = 0.5E+0 ) COMPLEX PADVAL PARAMETER ( PADVAL = ( 19.25E+0, 1.1E+1 ) ) COMPLEX CZERO PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IINFO, IL, IMODE, IN, INDD, $ INDRWORK, INDWORK, ISIZEHEEVD, ISIZEHEEVX, $ ISIZESUBTST, ISIZETST, ITYPE, IU, J, $ LHEEVDSIZE, LHEEVXSIZE, LLRWORK, LLWORK, $ MAXSIZE, MYCOL, MYROW, NB, NGEN, NLOC, NNODES, $ NP, NP0, NPCOL, NPROW, NQ, NQ0, RES, RSIZECHK, $ RSIZEHEEVD, RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, $ RSIZETST, SIZEHEEVD, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST, VALSIZE, VECSIZE REAL ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH, SLARAN EXTERNAL LSAME, NUMROC, PSLAMCH, SLARAN * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, CLATMS, IGAMX2D, $ IGEBR2D, IGEBS2D, PCCHEKPAD, PCELSET, $ PCFILLPAD, PCLASET, PCLASIZEHEEVX, PCLASIZESEP, $ PCLATMS, PCMATGEN, PCSDPSUBTST, PCSEPSUBTST, $ SLABAD, SLASRT, SLCOMBINE * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, MAX, MIN, REAL, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 PASSED = 'PASSED EEVX' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Make sure that we have enough memory * * CALL PCLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, SIZESUBTST, $ RSIZESUBTST, ISIZESUBTST, SIZETST, RSIZETST, $ ISIZETST ) * IF( LRWORK.LT.RSIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDRWORK = INDD + N INDWORK = 1 LLWORK = LWORK - INDWORK + 1 LLRWORK = LRWORK - INDRWORK + 1 * ULP = PSLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PSLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL SLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / REAL( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log Hermitian, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random Hermitian * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * * Zero * * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N RWORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PCLASET( 'All', N, N, CZERO, CZERO, COPYA, 1, 1, $ DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N RWORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PCLASET( 'All', N, N, CZERO, CONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0E+0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0E+0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * Hermitian, eigenvalues specified * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * Hermitian, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PCMATGEN( DESCA( CTXT_ ), 'H', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0E+0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0E+0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PCLASET( 'All', N, N, CZERO, CZERO, COPYA, 1, 1, $ DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( SLARAN( ISEED )*REAL( NLOC ) ), N-NGEN ) * CALL CLATMS( IN, IN, 'S', ISEED, 'P', RWORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PCELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PCELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PCELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PCELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2, ... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 RWORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * * CALL PCFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0E+0 ) * CALL PCLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PCCHEKPAD( DESCA( CTXT_ ), 'PCLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * * * WKNOWN ... NOT SET, GUESS A DEFAULT * WKNOWN = .TRUE. ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL SLASRT( 'I', N, RWORK( INDD ), IINFO ) * * * These values aren't actually used, but they make ftncheck happy. * IL = -1 IU = -2 VL = ONE VU = -ONE * CALL PCLASIZEHEEVX( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, RWORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) * LHEEVXSIZE = MIN( MAXSIZE, LLRWORK ) * CALL PCSEPSUBTST( WKNOWN, 'v', 'a', UPLO, N, VL, VU, IL, IU, $ THRESH, ABSTOL, A, COPYA, Z, 1, 1, DESCA, $ RWORK( INDD ), WIN, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK( INDWORK ), LLWORK, $ RWORK( INDRWORK ), LLRWORK, LHEEVXSIZE, $ IWORK, ISIZEHEEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * * * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'V', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'V' RANGE = 'A' CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 1' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'V', RANGE = 'A', random memory * IF( INFO.EQ.0 ) THEN JOBZ = 'V' RANGE = 'A' CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 2' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * We use PCLASIZEHEEVX to choose IL and IU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'I', maximum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PCLASIZEHEEVX to choose IL and IU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = MAXSIZE * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF * * Subtest 6: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PCLASIZEHEEVX to choose IL and IU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 6' INFO = 1 END IF END IF * * Subtest 7: JOBZ = 'V', RANGE = 'I', random memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PCLASIZEHEEVX to choose IL and IU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) LHEEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 7' INFO = 1 END IF END IF * * Subtest 8: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 8' INFO = 1 END IF END IF * * Subtest 9: JOBZ = 'V', RANGE = 'V', maximum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = MAXSIZE * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 9' INFO = 1 END IF END IF * * Subtest 10: JOBZ = 'V', RANGE = 'V', * minimum memory required for eigenvectors * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest10' INFO = 1 END IF END IF * * Subtest 11: JOBZ = 'V', RANGE = 'V', * random memory (enough for all eigenvectors * but not enough to guarantee orthogonality * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest11' INFO = 1 END IF END IF * * Subtest 12: JOBZ = 'V', RANGE = 'V', * miniimum memory required for eigenvalues only * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest12' INFO = 1 END IF END IF * * Subtest 13: JOBZ = 'V', RANGE = 'V', * random memory (more than minimum required * for eigenvalues, less than required for vectors) * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PCLASIZEHEEVX to choose VL and VU for us. * CALL PCLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * * CALL PCSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest13' INFO = 1 END IF END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF * * Now that PCHEEVX been tested, we check PCHEEVD * PASSED = 'PASSED EEVD' * * PCHEEVD test1: * IF( INFO.EQ.0 ) THEN * NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( MAX( N, 1 ), NB, 0, 0, NPCOL ) LHEEVDSIZE = 1 + 9*N + 3*NP0*NQ0 ISIZEHEEVD = MAX( 1, 2+7*N+8*NPCOL ) * CALL PCSDPSUBTST( WKNOWN, UPLO, N, THRESH, ABSTOL, A, COPYA, Z, $ 1, 1, DESCA, WIN, WNEW, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVDSIZE, IWORK, ISIZEHEEVD, RES, TSTNRM, $ QTQNRM, NOUT ) * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM IF( RES.NE.0 ) THEN PASSED = 'FAILED EEVD' INFO = 1 END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF 120 CONTINUE * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 21X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 21X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 32X, $ 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) 9984 FORMAT( ' IBTYPE=', I8 ) 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) 9980 FORMAT( ' Increase TOTMEM in PCSEPDRIVER' ) * * End of PCSEPTST * END scalapack-2.0.2/TESTING/EIG/pctrddriver.f000644 000766 000024 00000047174 10363532303 020144 0ustar00juliestaff000000 000000 PROGRAM PCTRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * Purpose * ======== * * PCTRDDRIVER is the main test program for the COMPLEX * SCALAPACK TRD (symmetric tridiagonal reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 13 lines: * 'ScaLAPACK TRD computation input file' * 'PVM machine' * 'TRD.out' output file name * 6 device out * 'L' define Lower or Upper * 3 number of problems sizes * 5 31 201 values of N * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * CPLXSZ INTEGER, default = 8 bytes. * INTGSZ and CPLXSZ indicate the length in bytes on the * given platform for an integer and a single precision * complex. * MEM COMPLEX array, dimension ( TOTMEM / CPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER CPLXSZ, REALSZ, TOTMEM, MEMSIZ, NTESTS COMPLEX PADVAL PARAMETER ( CPLXSZ = 8, REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / CPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPT, IPW, ITEMP, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LCM, LWORK, MYCOL, $ MYROW, N, NB, NDIAG, NGRIDS, NMAT, NNB, NOFFD, $ NOUT, NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ, $ WORKTRD REAL ANORM, FRESID, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), QVAL( NTESTS ) COMPLEX MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PCCHEKPAD, $ PCFILLPAD, PCHETDRV, PCHETRD, PCLAFCHK, $ PCMATGEN, PCTRDINFO, PCTTRDTESTER, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC REAL PCLANHE EXTERNAL LSAME, ICEIL, ILCM, NUMROC, PCLANHE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )STOP * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PCTRDINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * NDIAG = NQ IF( LSAME( UPLO, 'U' ) ) THEN NOFFD = NQ ELSE NOFFD = NUMROC( N-1, NB, MYCOL, 0, NPCOL ) END IF NDIAG = ICEIL( REALSZ*NDIAG, CPLXSZ ) NOFFD = ICEIL( REALSZ*NOFFD, CPLXSZ ) * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPT = IPE + NOFFD + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * LWORK = MAX( NB*( NP+1 ), 3*NB ) WORKTRD = LWORK + IPOSTPAD WORKSIZ = WORKTRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN ITEMP = 2*NQ + NP IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) ITEMP = NB*ICEIL( ICEIL( NP, NB ), LCM / NPROW ) + $ ITEMP END IF ITEMP = MAX( ICEIL( REALSZ*ITEMP, CPLXSZ ), $ 2*( NB+NP )*NB ) WORKSIZ = MAX( LWORK, ITEMP ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Tridiagonal reduction', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PCMATGEN( ICTXT, 'Hemm', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), NQ, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANHE( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to symmetric tridiagonal form * CALL PCHETRD( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCHETRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETRD', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETRD', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETRD', WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - QTQ'|| / (||A|| * N * eps) * CALL PCHETDRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), IERR( 1 ) ) CALL PCLAFCHK( 'Hemm', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCHETDRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETDRV', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETDRV', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETDRV', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ. $ 0.0E+0 .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 )FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * )'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * TRD requires 16/3 N^3 floating point operations * NOPS = DBLE( N ) * NOPS = ( 4.0D+0 / 3.0D+0 )*NOPS**3 NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', UPLO, N, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', UPLO, N, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, FRESID, PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * CALL PCTTRDTESTER( IAM, NPROCS, CHECK, NOUT, THRESH, NVAL, NMAT, $ MEM, TOTMEM, KPASS, KFAIL, KSKIP ) * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UPLO N NB P Q TRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ---- ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, A4, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, F9.2, 1X, $ F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*T*Q''|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PCTRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pctrdinfo.f000644 000766 000024 00000032244 10363532303 017574 0ustar00juliestaff000000 000000 SUBROUTINE PCTRDINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NNB, NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY*(*) INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PCTRDINFO gets needed startup information for the Hermitian * tridiagonal reduction and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * MAX( 5, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL PSLAMCH, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='TRD.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GOTO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GOTO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GOTO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GOTO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS IF( LSAME( UPLO, 'L' ) ) THEN WORK( 4 ) = 1 ELSE WORK( 4 ) = 2 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Reduction Routine to Hermitian '// $ 'tridiagonal form.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex single precision Hermitian '// $ 'tridiagonal' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - QTQ''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether the ''Upper'' or ''Low'// $ 'er'' part of A is to be referenced.' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less'// $ 'than THRESH, CHECK is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'TRD time : Time in seconds to reduce the'// $ ' matrix to tridiagonal form.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for '// $ 'Hermitian tridiagonal reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) IF( WORK( 4 ).EQ.1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PCTRDINFO * END scalapack-2.0.2/TESTING/EIG/pcttrdtester.f000644 000766 000024 00000062166 10363532303 020341 0ustar00juliestaff000000 000000 SUBROUTINE PCTTRDTESTER( IAM, NPROCS, CHECK, NOUT, THRESH, NVAL, $ NMAT, MEM, TOTMEM, KPASS, KFAIL, KSKIP ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * February 24, 2000 * * .. Scalar Arguments .. LOGICAL CHECK INTEGER IAM, KFAIL, KPASS, KSKIP, NMAT, NOUT, NPROCS, $ TOTMEM REAL THRESH * .. * .. Array Arguments .. INTEGER NVAL( * ) COMPLEX MEM( * ) * .. * * Purpose * ======= * * PCTTRDTESTER tests PCHETTRD * * Arguments * ========= * * IAM (local input) INTEGER * The local process number * * NPROCS (global input) INTEGER * The number of processors * * CHECK (global input) LOGICAL * Specifies whether the user wants to check the answer * * NOUT (local input) INTEGER * File descriptor * * THRESH (global input) REAL * Acceptable error threshold * * NVAL (global input) INTEGER array dimension NMAT * The matrix sizes to test * * NMAT (global input) INTEGER * The number of matrix sizes to test * * MEM (local input) COMPLEX array dimension MEMSIZ * Where: * MEMSIZ = TOTMEM / CPLXSZ * * TOTMEM (global input) INTEGER * Number of bytes in MEM * * KPASS (local input/output) INTEGER * The number of tests which passed. Only relevant on * processor 0. * * KFAIL (local input/output) INTEGER * The number of tests which failed. Only relevant on * processor 0. * * KSKIP (local input/output) INTEGER * The number of tests which were skipped. Only relevant on * processor 0. * * ================================================================ * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER REALSZ, CPLXSZ COMPLEX PADVAL PARAMETER ( REALSZ = 4, CPLXSZ = 8, $ PADVAL = ( -9923.0E+0, -9924.0E+0 ) ) INTEGER TIMETESTS PARAMETER ( TIMETESTS = 11 ) INTEGER TESTS PARAMETER ( TESTS = 8 ) INTEGER MINTIMEN PARAMETER ( MINTIMEN = 8 ) * .. * .. Local Scalars .. LOGICAL TIME CHARACTER UPLO CHARACTER*6 PASSED INTEGER DUMMY, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPT, IPW, ITEMP, J, K, $ LCM, LWMIN, MAXTESTS, MEMSIZ, MYCOL, MYROW, N, $ NB, NDIAG, NGRIDS, NN, NOFFD, NP, NPCOL, NPROW, $ NPS, NQ, SPLITSTIMED, WORKSIZ, WORKTRD REAL ANORM, FRESID DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER ANBTEST( TESTS ), ANBTIME( TIMETESTS ), $ BALTEST( TESTS ), BALTIME( TIMETESTS ), $ DESCA( DLEN_ ), DESCD( DLEN_ ), IERR( 1 ), $ INTERTEST( TESTS ), INTERTIME( TIMETESTS ), $ PNBTEST( TESTS ), PNBTIME( TIMETESTS ), $ TWOGEMMTEST( TESTS ), TWOGEMMTIME( TIMETESTS ) DOUBLE PRECISION CTIME( 100 ), WTIME( 100 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, DESCINIT, $ IGEBR2D, IGEBS2D, IGSUM2D, PCCHEKPAD, $ PCFILLPAD, PCHETDRV, PCHETTRD, PCLAFCHK, $ PCLATRAN, PCMATGEN, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC, PJLAENV REAL PCLANHE EXTERNAL LSAME, ICEIL, ILCM, NUMROC, PJLAENV, PCLANHE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, REAL, SQRT * .. * * .. Scalars in Common .. INTEGER ANB, BALANCED, BCKBLOCK, GSTBLOCK, INTERLEAVE, $ LLTBLOCK, MINSZ, PNB, TIMEINTERNALS, TIMING, $ TRSBLOCK, TWOGEMMS * .. * .. Common blocks .. COMMON / BLOCKSIZES / GSTBLOCK, LLTBLOCK, BCKBLOCK, $ TRSBLOCK COMMON / MINSIZE / MINSZ COMMON / PJLAENVTIMING / TIMING COMMON / TAILOREDOPTS / PNB, ANB, INTERLEAVE, $ BALANCED, TWOGEMMS COMMON / TIMECONTROL / TIMEINTERNALS * .. * .. Data statements .. DATA BALTIME / 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0 / DATA INTERTIME / 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1 / DATA TWOGEMMTIME / 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0 / DATA ANBTIME / 16, 16, 16, 16, 16, 8, 8, 32, 32, 16, $ 16 / DATA PNBTIME / 32, 32, 32, 32, 32, 32, 32, 32, 32, $ 16, 64 / DATA BALTEST / 0, 0, 0, 0, 1, 1, 1, 1 / DATA INTERTEST / 0, 0, 1, 1, 0, 0, 1, 1 / DATA TWOGEMMTEST / 0, 1, 0, 1, 0, 1, 0, 1 / DATA ANBTEST / 1, 2, 3, 16, 1, 2, 3, 16 / DATA PNBTEST / 1, 16, 8, 1, 16, 8, 1, 16 / * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * IASEED = 100 SPLITSTIMED = 0 NB = 1 UPLO = 'L' MEMSIZ = TOTMEM / CPLXSZ * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * NGRIDS = INT( SQRT( REAL( NPROCS ) ) ) * DO 30 NN = 1, NGRIDS * NPROW = NN NPCOL = NN IERR( 1 ) = 0 * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * IF( N.GT.MINTIMEN ) THEN * * For timing tests, we perform one or two extra tests. * Both of these extra tests are performed with the * default values for the performance tuning parameters. * The second extra test (which is only performed if * split times are non-zero) is performed with timeinternals * set to 1 (which forces barrier syncs between many * phases of the computation). * TIME = .TRUE. MAXTESTS = TIMETESTS + 2 ELSE TIME = .FALSE. MAXTESTS = TESTS END IF * * DO 10 K = 1, MAXTESTS TIMEINTERNALS = 0 IF( TIME ) THEN IF( K.GE.MAXTESTS-1 ) THEN * * For the last two timings, we let pjlaenv set * the execution path values. These dummy * initializations aren't really necessary, * but they illustrate the fact that these values are * set in xpjlaenv. The dummy call to pjlaenv * has the side effect of setting ANB. * MINSZ = -13 BALANCED = -13 INTERLEAVE = -13 TWOGEMMS = -13 ANB = -13 PNB = -13 TIMING = 1 DUMMY = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, $ 0, 0 ) IF( K.EQ.MAXTESTS ) $ TIMEINTERNALS = 1 ELSE TIMING = 0 MINSZ = 1 BALANCED = BALTIME( K ) INTERLEAVE = INTERTIME( K ) TWOGEMMS = TWOGEMMTIME( K ) ANB = ANBTIME( K ) PNB = PNBTIME( K ) END IF ELSE TIMING = 0 MINSZ = 1 BALANCED = BALTEST( K ) INTERLEAVE = INTERTEST( K ) TWOGEMMS = TWOGEMMTEST( K ) ANB = ANBTEST( K ) PNB = PNBTEST( K ) END IF * * Skip the last test (with timeinternals = 1) if * PCHETTRD is not collecting the split times. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, SPLITSTIMED, $ 1 ) ELSE CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, SPLITSTIMED, 1, $ 0, 0 ) END IF * * IF( SPLITSTIMED.EQ.0 .AND. K.EQ.MAXTESTS ) $ GO TO 10 * * The following hack tests to make sure that PNB need not * be the same on all processes. (Provided that PNB is set * to 1 in the TRD.dat file.) * IF( PNB.EQ.1 ) $ PNB = 1 + IAM * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * CALL DESCINIT( DESCD, 1, N, NB, NB, 0, 0, ICTXT, 1, $ INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * NDIAG = NQ IF( LSAME( UPLO, 'U' ) ) THEN NOFFD = NQ ELSE NOFFD = NUMROC( N-1, NB, MYCOL, 0, NPCOL ) END IF NDIAG = ICEIL( REALSZ*NDIAG, CPLXSZ ) NOFFD = ICEIL( REALSZ*NOFFD, CPLXSZ ) * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPT = IPE + NOFFD + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS * WORKTRD = LWMIN + IPOSTPAD WORKSIZ = WORKTRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN ITEMP = 2*NQ + NP IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) ITEMP = NB*ICEIL( ICEIL( NP, NB ), LCM / NPROW ) + $ ITEMP END IF ITEMP = MAX( ICEIL( REALSZ*ITEMP, CPLXSZ ), $ 2*( NB+NP )*NB ) WORKSIZ = MAX( LWMIN, ITEMP ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Tridiagonal reduction', $ ( IPW+WORKSIZ )*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * * * Generate the matrix A * CALL PCMATGEN( ICTXT, 'Hemm', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PCFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), NQ, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PCLANHE( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCLANHE', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCFILLPAD( ICTXT, WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to symmetric tridiagonal form * CALL PCHETTRD( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), LWMIN, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCHETTRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETTRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PCCHEKPAD( ICTXT, 'PCHETTRDc', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETTRDd', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETTRDe', WORKTRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKTRD-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PCFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - QTQ'|| / (||A|| * N * eps) * CALL PCHETDRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), IERR( 1 ) ) * * TTRD does not preserve the upper triangular part of A. * The following call to PCLATRAN means that we only * check the lower triangular part of A - QTQ' * CALL PCLATRAN( N, 1, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PCLAFCHK( 'Hemm', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PCCHEKPAD( ICTXT, 'PCHETDRVf', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETDRVg', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETDRVh', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PCCHEKPAD( ICTXT, 'PCHETDRVi', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ. $ 0.0E+0 .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9991 )FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' * * END IF * * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * )'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 50, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 50, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * TRD requires 16/3 N^3 floating point operations * NOPS = DBLE( N ) NOPS = ( 16.0D+0 / 3.0D+0 )*NOPS**3 NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9992 )'WALL', N, INTERLEAVE, $ TWOGEMMS, BALANCED, ANB, PNB, NPROW*NPCOL, $ WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9992 )'CPU ', N, INTERLEAVE, $ TWOGEMMS, BALANCED, ANB, PNB, NPROW*NPCOL, $ CTIME( 1 ), TMFLOPS, FRESID, PASSED * * * If split times were collected (in PCHEttrd.f), print * them out. * IF( WTIME( 13 )+WTIME( 15 )+WTIME( 16 ).GT.0.0D+0 .OR. $ CTIME( 13 )+CTIME( 15 )+CTIME( 16 ).GT.0.0D+0 ) $ THEN SPLITSTIMED = 1 END IF IF( SPLITSTIMED.EQ.1 ) THEN WRITE( NOUT, FMT = 9990 )WTIME( 10 ), WTIME( 11 ), $ WTIME( 12 ), WTIME( 13 ), WTIME( 14 ), $ WTIME( 15 ) WRITE( NOUT, FMT = 9989 )WTIME( 16 ), WTIME( 17 ), $ WTIME( 18 ), WTIME( 19 ), WTIME( 20 ), $ WTIME( 21 ) * WRITE( NOUT, FMT = 9988 )CTIME( 10 ), CTIME( 11 ), $ CTIME( 12 ), CTIME( 13 ), CTIME( 14 ), $ CTIME( 15 ) WRITE( NOUT, FMT = 9987 )CTIME( 16 ), CTIME( 17 ), $ CTIME( 18 ), CTIME( 19 ), CTIME( 20 ), $ CTIME( 21 ) WRITE( NOUT, FMT = 9986 )N, NPROW*NPCOL, PNB, ANB, $ INTERLEAVE, BALANCED, TWOGEMMS, TIMEINTERNALS END IF END IF 10 CONTINUE 20 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN IF( SPLITSTIMED.EQ.1 ) THEN WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = 9974 ) WRITE( NOUT, FMT = 9973 ) END IF END IF * * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE RETURN * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) * 9995 FORMAT( 'PCHETTRD, tailored reduction to tridiagonal form, test.' $ ) 9994 FORMAT( 'TIME N int 2gm bal anb pnb prcs TRD Time ', $ ' MFLOPS Residual CHECK' ) 9993 FORMAT( '---- ---- --- --- --- --- --- ---- -------- ', $ '----------- -------- ------' ) 9992 FORMAT( A4, 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 1X, $ I5, 1X, F9.2, 1X, F11.2, 1X, F8.2, 1X, A6 ) 9991 FORMAT( '||A - Q*T*Q''|| / (||A|| * N * eps) = ', G25.7 ) 9990 FORMAT( 'wsplit1=[wsplit1;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9989 FORMAT( 'wsplit2=[wsplit2;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9988 FORMAT( 'csplit1=[csplit1;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9987 FORMAT( 'csplit2=[csplit2;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9986 FORMAT( 'size_opts=[size_opts;', I4, 1X, I4, 1X, I4, 1X, I4, 1X, $ I4, 1X, I4, 1X, I4, 1X, I4, 1X, ' ];' ) 9985 FORMAT( 'N=1; NPROCS=2; PNB=3; ANB=4; INTERLEAVE=5; BALANCED=6;', $ ' TWOGEMMS=7; TIMEINTERNALS=8;' ) 9984 FORMAT( 'S1_OVERHEAD = 1; % Should be mainly cost of barrier' ) 9983 FORMAT( 'S1_BARRIER = 2; % Cost of barrier' ) 9982 FORMAT( 'S1_UPDCURCOL = 3; % Update the current column' ) 9981 FORMAT( 'S1_HOUSE = 4; % Compute the householder vector' ) 9980 FORMAT( 'S1_SPREAD = 5; % Spread across' ) 9979 FORMAT( 'S1_TRANSPOSE = 6; % Transpose' ) 9978 FORMAT( 'S2_UPDCURBLK = 1; % Update the current block column' ) 9977 FORMAT( 'S2_TRMVT = 2; % TRMVT v = A * h; vt = ht * A'' ' ) 9976 FORMAT( 'S2_UPD_V = 3; % v = v + V * HT * h + H * VT * h ' ) 9975 FORMAT( 'S2_TRANS_SUM = 4; % v = v + vt'' ' ) 9974 FORMAT( 'S2_DOT = 5; % c = v'' * h ' ) 9973 FORMAT( 'S2_R2K = 6; % A = A - v * h'' - h * v'' ' ) * * * End of PCTTRDTESTER * END scalapack-2.0.2/TESTING/EIG/pdbrddriver.f000644 000766 000024 00000047431 10363532303 020117 0ustar00juliestaff000000 000000 PROGRAM PDBRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * Purpose * ======= * * PDBRDDRIVER is the main test program for the DOUBLE PRECISION * ScaLAPACK BRD (bidiagonal reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 13 lines: * 'ScaLAPACK BRD computation input file' * 'PVM machine' * 'BRD.out' output file name * 6 device out * 3 number of problems sizes * 16 20 18 values of M * 16 18 20 values of N * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 1.0 threshold * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, MEMSIZ, NTESTS, TOTMEM DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPTP, IPTQ, IPW, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LWORK, M, MAXMN, $ MINMN, MNP, MNQ, MP, MYCOL, MYROW, N, NB, $ NDIAG, NGRIDS, NMAT, NNB, NOFFD, NOUT, NPCOL, $ NPROCS, NPROW, NQ, WORKBRD, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ MVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), MEM( MEMSIZ ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDCHEKPAD, $ PDBRDINFO, PDFILLPAD, PDLAFCHK, $ PDMATGEN, PDGEBDRV, PDGEBRD, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLANGE EXTERNAL ICEIL, NUMROC, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PDBRDINFO( OUTFILE, NOUT, NMAT, MVAL, NTESTS, NVAL, NTESTS, $ NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * * Go to bottom of loop if this case doesn't use my process * DO 20 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) MNP = NUMROC( MIN( M, N ), NB, MYROW, 0, NPROW ) MNQ = NUMROC( MIN( M, N ), NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP )+IMIDPAD, IERR( 1 ) ) * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IF( M.GE.N ) THEN NDIAG = MNQ NOFFD = MNP ELSE NDIAG = MNP NOFFD = NUMROC( MIN( M, N )-1, NB, MYCOL, 0, NPCOL ) END IF * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPTQ = IPE + NOFFD + IPOSTPAD + IPREPAD IPTP = IPTQ + MNQ + IPOSTPAD + IPREPAD IPW = IPTP + MNP + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * LWORK = NB*( MP+NQ+1 ) + NQ WORKBRD = LWORK + IPOSTPAD WORKSIZ = WORKBRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN WORKSIZ = MAX( LWORK, 2*NB*( MP+NQ+NB ) ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Bidiagonal reduction', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PDMATGEN( ICTXT, 'No', 'No', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, MNQ, 1, MEM( IPTQ-IPREPAD ), $ MNQ, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, MNP, 1, MEM( IPTP-IPREPAD ), $ MNP, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANGE( 'I', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKBRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKBRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to bidiagonal form * CALL PDGEBRD( M, N, MEM( IPA ), 1, 1, DESCA, MEM( IPD ), $ MEM( IPE ), MEM( IPTQ ), MEM( IPTP ), $ MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGEBRD', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEBRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEBRD', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEBRD', MNQ, 1, $ MEM( IPTQ-IPREPAD ), MNQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEBRD', MNP, 1, $ MEM( IPTP-IPREPAD ), MNP, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEBRD', WORKBRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKBRD-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A-Q*B*P|| / (||A|| * N * eps) * CALL PDGEBDRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPTQ ), $ MEM( IPTP ), MEM( IPW ), IERR( 1 ) ) CALL PDLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGEBDRV', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEBDRV', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEBDRV', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEBDRV', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ.0.0D+0 $ .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID * KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * ) $ 'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * BRD requires 8/3 N^3 floating point operations * MAXMN = MAX( M, N ) MINMN = MIN( M, N ) NOPS = 4.0D+0 * DBLE( MINMN ) * DBLE( MINMN ) * $ ( DBLE( MAXMN ) - DBLE( MINMN ) / 3.0D+0 ) NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, FRESID, PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N NB P Q BRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ------ ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, F9.2, 1X, $ F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*B*P|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PDBRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pdbrdinfo.f000644 000766 000024 00000032173 10363532303 017554 0ustar00juliestaff000000 000000 SUBROUTINE PDBRDINFO( SUMMRY, NOUT, NMAT, MVAL, LDMVAL, NVAL, $ LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDMVAL, LDNBVAL, LDNVAL, LDPVAL, $ LDQVAL, NGRIDS, NMAT, NNB, NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDMVAL ), NBVAL( LDNBVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), $ QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PDBRDINFO get needed startup information for the bidiagonal * reduction and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, output to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for M & N. * * MVAL (global output) INTEGER array, dimension (LDMVAL) * The values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M. LDMVAL >= NMAT. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N. LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * MAX( 5, LDMVAL+LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack * all input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, IGEBR2D, IGEBS2D, $ SCOPY * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'BRD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( UNIT = NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get values of M, N * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1. .OR. NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'M', LDMVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 1, 3, WORK, 1 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Bidiagonal reduction' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision bidiagonal ' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - Q B P''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less'// $ ' than THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'BRD time : Time in seconds to reduce the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for '// $ 'the bidiagonal reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9995 ) $ 'M ', ( MVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( MVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9995 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9999 ) ' ' WRITE( NOUT, FMT = 9996 ) EPS WRITE( NOUT, FMT = 9993 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 3, WORK, 1, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = 2*NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 CONTINUE WRITE( NOUT, FMT = 9998 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9997 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9996 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9995 FORMAT( 2X, A5, ': ', 10I6 ) 9994 FORMAT( ' ', 10I6 ) 9993 FORMAT( 'Routines pass computational tests if scaled residual is', $ ' less than ', G12.5 ) * * End of PDBRDINFO * END scalapack-2.0.2/TESTING/EIG/pdgebdrv.f000644 000766 000024 00000046630 10363532303 017405 0ustar00juliestaff000000 000000 SUBROUTINE PDGEBDRV( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INFO, IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAUP( * ), TAUQ( * ), $ WORK( * ) * .. * * Purpose * ======= * * PDGEBDRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from sub( A ), * Q, P returned by PDGEBRD: * * sub( A ) := Q * B * P'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of sub( A ) * as returned by PDGEBRD. On exit, the original distribu- * ted matrix sub( A ) is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local input) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local input) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local input) DOUBLE PRECISION array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix Q. TAUQ * is tied to the distributed matrix A. See Further Details. * * TAUP (local input) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix P. TAUP * is tied to the distributed matrix A. See Further Details. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK >= 2*NB*( MP + NQ + NB ) * * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MP = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * On exit, if INFO <> 0, a discrepancy has been found between * the diagonal and off-diagonal elements of A and the copies * contained in the arrays D and E. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION EIGHT, ONE, ZERO PARAMETER ( EIGHT = 8.0D+0, ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, IIA, IL, IPTP, IPTQ, $ IPV, IPW, IPWK, IOFF, IV, J, JB, JJA, JL, JV, $ K, MN, MP, MYCOL, MYROW, NB, NPCOL, NPROW, NQ DOUBLE PRECISION ADDBND, D1, D2, E1, E2 * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCV( DLEN_ ), $ DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, IGSUM2D, INFOG2L, $ PDLACPY, PDLARFB, PDLARFT, PDLASET, $ PDELGET * .. * .. External Functions .. INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL INDXG2P, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 NB = DESCA( MB_ ) IOFF = MOD( IA-1, NB ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IOFF, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+IOFF, NB, MYCOL, IACOL, NPCOL ) IPV = 1 IPW = IPV + MP*NB IPTP = IPW + NQ*NB IPTQ = IPTP + NB*NB IPWK = IPTQ + NB*NB * IV = 1 JV = 1 MN = MIN( M, N ) IL = MAX( ( (IA+MN-2) / NB )*NB + 1, IA ) JL = MAX( ( (JA+MN-2) / NB )*NB + 1, JA ) IAROW = INDXG2P( IL, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCV, IA+M-IL, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCW, NB, JA+N-JL, NB, NB, IAROW, IACOL, ICTXT, $ NB ) * ADDBND = EIGHT * PDLAMCH( ICTXT, 'eps' ) * * When A is an upper bidiagonal form * IF( M.GE.N ) THEN * CALL DESCSET( DESCD, 1, JA+MN-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MN-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) * DO 10 J = 0, MN-1 D1 = ZERO E1 = ZERO D2 = ZERO E2 = ZERO CALL PDELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PDELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(MN-1) ) THEN CALL PDELGET( ' ', ' ', E2, E, IA+J, 1, DESCE ) CALL PDELGET( 'Rowwise', ' ', E1, A, IA+J, JA+J+1, $ DESCA ) END IF * IF( ( ABS( D1 - D2 ).GT.( ABS( D2 ) * ADDBND ) ) .OR. $ ( ABS( E1 - E2 ).GT.( ABS( E2 ) * ADDBND ) ) ) $ INFO = INFO + 1 10 CONTINUE * DO 20 J = JL, JA+NB-IOFF, -NB JB = MIN( JA+N-J, NB ) I = IA + J - JA K = I - IA + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PDLARFT( 'Forward', 'Columnwise', M-K+1, JB, A, I, J, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'Lower', M-K+1, JB, A, I, J, DESCA, $ WORK( IPV ), IV, JV, DESCV ) CALL PDLASET( 'Upper', M-K+1, JB, ZERO, ONE, WORK( IPV ), $ IV, JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PDLASET( 'Lower', M-K, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PDLARFT( 'Forward', 'Rowwise', N-K, JB, A, I, J+1, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'Upper', JB, N-K, A, I, J+1, DESCA, $ WORK( IPW ), IV, JV+1, DESCW ) CALL PDLASET( 'Lower', JB, N-K, ZERO, ONE, WORK( IPW ), IV, $ JV+1, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PDLASET( 'Upper', JB, N-K-1, ZERO, ZERO, A, I, J+2, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PDLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-K+1, N-K+1, JB, WORK( IPV ), $ IV, JV, DESCV, WORK( IPTQ ), A, I, J, DESCA, $ WORK( IPWK ) ) * * Apply block Householder transformation from Right. * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', $ M-K+1, N-K, JB, WORK( IPW ), IV, JV+1, DESCW, $ WORK( IPTP ), A, I, J+1, DESCA, WORK( IPWK ) ) * DESCV( M_ ) = DESCV( M_ ) + NB DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCW( N_ ) = DESCW( N_ ) + NB DESCW( RSRC_ ) = DESCV( RSRC_ ) DESCW( CSRC_ ) = DESCV( CSRC_ ) * 20 CONTINUE * * Handle first block separately * JB = MIN( N, NB - IOFF ) IV = IOFF + 1 JV = IOFF + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PDLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPV ), $ IV, JV, DESCV ) CALL PDLASET( 'Upper', M, JB, ZERO, ONE, WORK( IPV ), IV, JV, $ DESCV ) * * Zero out the strict lower triangular part of A. * CALL PDLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PDLARFT( 'Forward', 'Rowwise', N-1, JB, A, IA, JA+1, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'Upper', JB, N-1, A, IA, JA+1, DESCA, $ WORK( IPW ), IV, JV+1, DESCW ) CALL PDLASET( 'Lower', JB, N-1, ZERO, ONE, WORK( IPW ), IV, $ JV+1, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PDLASET( 'Upper', JB, N-2, ZERO, ZERO, A, IA, JA+2, $ DESCA ) * * Apply block Householder transformation from left. * CALL PDLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M, N, JB, WORK( IPV ), IV, JV, DESCV, $ WORK( IPTQ ), A, IA, JA, DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from right. * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', M, $ N-1, JB, WORK( IPW ), IV, JV+1, DESCW, $ WORK( IPTP ), A, IA, JA+1, DESCA, WORK( IPWK ) ) * ELSE * CALL DESCSET( DESCD, IA+MN-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MN-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 30 J = 0, MN-1 D1 = ZERO E1 = ZERO D2 = ZERO E2 = ZERO CALL PDELGET( ' ', ' ', D2, D, IA+J, 1, DESCD ) CALL PDELGET( 'Rowwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(MN-1) ) THEN CALL PDELGET( ' ', ' ', E2, E, 1, JA+J, DESCE ) CALL PDELGET( 'Columnwise', ' ', E1, A, IA+J+1, JA+J, $ DESCA ) END IF * IF( ( ABS( D1 - D2 ).GT.( ABS( D2 ) * ADDBND ) ) .OR. $ ( ABS( E1 - E2 ).GT.( ABS( E2 ) * ADDBND ) ) ) $ INFO = INFO + 1 30 CONTINUE * DO 40 I = IL, IA+NB-IOFF, -NB JB = MIN( IA+M-I, NB ) J = JA + I - IA K = J - JA + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PDLARFT( 'Forward', 'Columnwise', M-K, JB, A, I+1, J, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'Lower', M-K, JB, A, I+1, J, DESCA, $ WORK( IPV ), IV+1, JV, DESCV ) CALL PDLASET( 'Upper', M-K, JB, ZERO, ONE, WORK( IPV ), $ IV+1, JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PDLASET( 'Lower', M-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PDLARFT( 'Forward', 'Rowwise', N-K+1, JB, A, I, J, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'Upper', JB, N-K+1, A, I, J, DESCA, $ WORK( IPW ), IV, JV, DESCW ) CALL PDLASET( 'Lower', JB, N-K+1, ZERO, ONE, WORK( IPW ), $ IV, JV, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PDLASET( 'Upper', JB, N-K, ZERO, ZERO, A, I, J+1, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PDLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-K, N-K+1, JB, WORK( IPV ), $ IV+1, JV, DESCV, WORK( IPTQ ), A, I+1, J, $ DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from Right. * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', $ M-K+1, N-K+1, JB, WORK( IPW ), IV, JV, DESCW, $ WORK( IPTP ), A, I, J, DESCA, WORK( IPWK ) ) * DESCV( M_ ) = DESCV( M_ ) + NB DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCW( N_ ) = DESCW( N_ ) + NB DESCW( RSRC_ ) = DESCV( RSRC_ ) DESCW( CSRC_ ) = DESCV( CSRC_ ) * 40 CONTINUE * * Handle first block separately * JB = MIN( M, NB - IOFF ) IV = IOFF + 1 JV = IOFF + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PDLARFT( 'Forward', 'Columnwise', M-1, JB, A, IA+1, JA, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'Lower', M-1, JB, A, IA+1, JA, DESCA, $ WORK( IPV ), IV+1, JV, DESCV ) CALL PDLASET( 'Upper', M-1, JB, ZERO, ONE, WORK( IPV ), IV+1, $ JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PDLASET( 'Lower', M-2, JB, ZERO, ZERO, A, IA+2, JA, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PDLARFT( 'Forward', 'Rowwise', N, JB, A, IA, JA, DESCA, $ TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'Upper', JB, N, A, IA, JA, DESCA, WORK( IPW ), $ IV, JV, DESCW ) CALL PDLASET( 'Lower', JB, N, ZERO, ONE, WORK( IPW ), IV, JV, $ DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PDLASET( 'Upper', JB, N-1, ZERO, ZERO, A, IA, JA+1, $ DESCA ) * * Apply block Householder transformation from left * CALL PDLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M-1, N, JB, WORK( IPV ), IV+1, JV, DESCV, $ WORK( IPTQ ), A, IA+1, JA, DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from right * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', M, N, $ JB, WORK( IPW ), IV, JV, DESCW, WORK( IPTP ), $ A, IA, JA, DESCA, WORK( IPWK ) ) END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * RETURN * * End of PDGEBDRV * END scalapack-2.0.2/TESTING/EIG/pdgehdrv.f000644 000766 000024 00000017676 10602576752 017437 0ustar00juliestaff000000 000000 SUBROUTINE PDGEHDRV( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEHDRV computes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from the * orthogonal matrix Q, the Hessenberg matrix, and the array TAU * returned by PDGEHRD: * sub( A ) := Q * H * Q' * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows and columns 1:ILO-1 and IHI+1:N. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) reduced to Hessenberg * form by PDGEHRD. The upper triangle and the first sub- * diagonal of sub( A ) contain the upper Hessenberg matrix H, * and the elements below the first subdiagonal, with the array * TAU, represent the orthogonal matrix Q as a product of * elementary reflectors. On exit, the original distributed * N-by-N matrix sub( A ) is recovered. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors returned by * PDGEHRD. TAU is tied to the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK). * LWORK >= NB*NB + NB*IHLP + MAX[ NB*( IHLP+INLQ ), * NB*( IHLQ + MAX[ IHIP, * IHLP+NUMROC( NUMROC( IHI-ILO+LOFF+1, NB, 0, 0, * NPCOL ), NB, 0, 0, LCMQ ) ] ) ] * * where NB = MB_A = NB_A, * LCM is the least common multiple of NPROW and NPCOL, * LCM = ILCM( NPROW, NPCOL ), LCMQ = LCM / NPCOL, * * IROFFA = MOD( IA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * * ILROW = INDXG2P( IA+ILO-1, NB, MYROW, RSRC_A, NPROW ), * ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, CSRC_A, NPCOL ), * IHLP = NUMROC( IHI-ILO+IROFFA+1, NB, MYROW, ILROW, NPROW ), * IHLQ = NUMROC( IHI-ILO+IROFFA+1, NB, MYCOL, ILCOL, NPCOL ), * INLQ = NUMROC( N-ILO+IROFFA+1, NB, MYCOL, ILCOL, NPCOL ). * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, IHLP, II, IOFF, IPT, $ IPV, IPW, IV, J, JB, JJ, JL, K, MYCOL, MYROW, $ NB, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PDLARFB, $ PDLARFT, PDLACPY, PDLASET * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( IHI-ILO.LE.0 ) $ RETURN * NB = DESCA( MB_ ) IOFF = MOD( IA+ILO-2, NB ) CALL INFOG2L( IA+ILO-1, JA+ILO-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, IAROW, IACOL ) IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, IAROW, NPROW ) * IPT = 1 IPV = IPT + NB * NB IPW = IPV + IHLP * NB JL = MAX( ( ( JA+IHI-2 ) / NB ) * NB + 1, JA + ILO - 1 ) CALL DESCSET( DESCV, IHI-ILO+IOFF+1, NB, NB, NB, IAROW, $ INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ), ICTXT, MAX( 1, IHLP ) ) * DO 10 J = JL, ILO+JA+NB-IOFF-1, -NB JB = MIN( JA+IHI-J-1, NB ) I = IA + J - JA K = I - IA + 1 IV = K - ILO + IOFF + 1 * * Compute upper triangular matrix T from TAU. * CALL PDLARFT( 'Forward', 'Columnwise', IHI-K, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'All', IHI-K, JB, A, I+1, J, DESCA, WORK( IPV ), $ IV+1, 1, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PDLASET( 'Lower', IHI-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PDLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ IHI-K, N-K+1, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, I+1, J, DESCA, WORK( IPW ) ) * * Apply block Householder transformation from Right. * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Columnwise', $ IHI, IHI-K, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, IA, J+1, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle the first block separately * IV = IOFF + 1 I = IA + ILO - 1 J = JA + ILO - 1 JB = MIN( NB-IOFF, JA+IHI-J-1 ) * * Compute upper triangular matrix T from TAU. * CALL PDLARFT( 'Forward', 'Columnwise', IHI-ILO, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace. * CALL PDLACPY( 'All', IHI-ILO, JB, A, I+1, J, DESCA, WORK( IPV ), $ IV+1, 1, DESCV ) * * Zero out the strict lower triangular part of A. * IF( IHI-ILO.GT.0 ) $ CALL PDLASET( 'Lower', IHI-ILO-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PDLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ IHI-ILO, N-ILO+1, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, I+1, J, DESCA, WORK( IPW ) ) * * Apply block Householder transformation from Right. * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Columnwise', IHI, $ IHI-ILO, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, IA, J+1, DESCA, WORK( IPW ) ) * RETURN * * End of PDGEHDRV * END scalapack-2.0.2/TESTING/EIG/pdgrptseptst.f000644 000766 000024 00000005306 10363532303 020346 0ustar00juliestaff000000 000000 * * PROGRAM PDRPTGSEPTST * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Repeat generalized parallel symmetric eigenproblem test * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER MAXN, LWORK, LIWORK PARAMETER ( MAXN = 200, LWORK = 500000, $ LIWORK = 6*MAXN+4 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS, UPLO INTEGER CONTEXT, IAM, IBTYPE, INFO, IPOSTPAD, IPREPAD, $ LDA, MATTYPE, N, NB, NPCOL, NPROCS, NPROW DOUBLE PRECISION ABSTOL, THRESH * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), ICLUSTR( MAXN ), IFAIL( MAXN ), $ ISEED( 4 ), IWORK( LIWORK ) DOUBLE PRECISION A( MAXN*MAXN ), B( MAXN, MAXN ), $ COPYA( MAXN*MAXN ), COPYB( MAXN, MAXN ), $ GAP( MAXN ), WIN( MAXN ), WNEW( MAXN ), $ WORK( LWORK ), Z( MAXN*MAXN ) * .. * * * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDINIT, $ BLACS_PINFO, BLACS_SETUP, DESCINIT, PDGSEPTST * .. * .. Executable Statements .. * IPREPAD = 3 IPOSTPAD = 3 LDA = MAXN * * These lines should be replaced by the output from pxGSEPdriver * * ISEED( 1 ) = 2312 ISEED( 2 ) = 3709 ISEED( 3 ) = 666 ISEED( 4 ) = 3371 UPLO = 'U' SUBTESTS = 'Y' N = 33 NPROW = 2 NPCOL = 2 NB = 4 MATTYPE = 9 IBTYPE = 1 * note: the printout often makes a mess of ABSTOL ABSTOL = 0.1175494351D-37 THRESH = .350000D+01 * CALL BLACS_PINFO( IAM, NPROCS ) IF( NPROCS.LT.1 ) THEN * NPROCS = NPROW*NPCOL CALL BLACS_SETUP( IAM, NPROCS ) END IF CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, CONTEXT, LDA, INFO ) * CALL PDGSEPTST( DESCA, UPLO, N, MATTYPE, IBTYPE, SUBTESTS, THRESH, $ N, ABSTOL, ISEED, A, COPYA, B, COPYB, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK-IPREPAD-IPOSTPAD, IWORK, $ LIWORK-IPREPAD-IPOSTPAD, 6, INFO ) * * * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * * CALL BLACS_EXIT( 0 ) STOP * * * * End of PDRPTGSEPTST * END scalapack-2.0.2/TESTING/EIG/pdgsepchk.f000644 000766 000024 00000031025 10363532303 017550 0ustar00juliestaff000000 000000 * * SUBROUTINE PDGSEPCHK( IBTYPE, MS, NV, A, IA, JA, DESCA, B, IB, JB, $ DESCB, THRESH, Q, IQ, JQ, DESCQ, C, IC, JC, $ DESCC, W, WORK, LWORK, TSTNRM, RESULT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 15, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, IBTYPE, IC, IQ, JA, JB, JC, JQ, LWORK, $ MS, NV, RESULT DOUBLE PRECISION THRESH, TSTNRM * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCB( * ), DESCC( * ), DESCQ( * ) DOUBLE PRECISION A( * ), B( * ), C( * ), Q( * ), W( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PDGSEPCHK checks a decomposition of the form * * A Q = B Q D or * A B Q = Q D or * B A Q = Q D * * where A is a symmetric matrix, B is * symmetric positive definite, Q is orthogonal, and D is diagonal. * * One of the following test ratios is computed: * * IBTYPE = 1: TSTNRM = | A Q - B Q D | / ( |A| |Q| n ulp ) * * IBTYPE = 2: TSTNRM = | A B Q - Q D | / ( |A| |Q| n ulp ) * * IBTYPE = 3: TSTNRM = | B A Q - Q D | / ( |A| |Q| n ulp ) * * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * MP = number of local rows in A, B and Q * MQ = number of local columns in A * NQ = number of local columns in B and Q * * IBTYPE (input) INTEGER * The form of the symmetric generalized eigenproblem. * = 1: A*Q = (lambda)*B*Q * = 2: A*B*Q = (lambda)*Q * = 3: B*A*Q = (lambda)*Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in A, B, C and Q * Also, the number of columns in A * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q. * * A (local input) REAL pointer to an * array in local memory of dimension (LLD_A, LOCc(JA+N-1)). * This array contains the local pieces of the M-by-N * distributed test matrix A * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix A. * * B (local input) REAL pointer to an * array in local memory of dimension (LLD_B, LOCc(JB+N-1)). * This array contains the local pieces of the M-by-N * distributed test matrix B * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix B. * * THRESH (input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) REAL array * global dimension (MS, NV), * local dimension (DESCA( DLEN_ ), NQ) * * Contains the eigenvectors as computed by PSSYEVX * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix Q. * * C (local workspace) REAL array, * global dimension (MS, NV), * local dimension (DESCA( DLEN_ ), MQ) * * Accumulator for computing AQ -QL * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix C. * * W (global input) REAL array, dimension (NV) * * Contains the computed eigenvalues * * WORK (local workspace) REAL array, * dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * TSTNRM (global output) REAL * * RESULT (global output) INTEGER * 0 if the test passes * 1 if the test fails * * .. Local Scalars .. * INTEGER I, INFO, MYCOL, MYROW, NPCOL, NPROW, NQ DOUBLE PRECISION ANORM, ULP * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) DOUBLE PRECISION CONE, CNEGONE, CZERO PARAMETER ( CONE = 1.0D+0, CNEGONE = -1.0D+0, $ CZERO = 0.0D+0 ) * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION DLAMCH, PDLANGE EXTERNAL NUMROC, DLAMCH, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PDGEMM, PDSCAL, $ PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * RESULT = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 CALL CHK1MAT( MS, 1, MS, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( MS, 1, MS, 2, IB, JB, DESCB, 11, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IQ, JQ, DESCQ, 16, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IB, JB, DESCB, 20, INFO ) * IF( INFO.EQ.0 ) THEN * NQ = NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IF( IQ.NE.1 ) THEN INFO = -14 ELSE IF( JQ.NE.1 ) THEN INFO = -15 ELSE IF( IA.NE.1 ) THEN INFO = -5 ELSE IF( JA.NE.1 ) THEN INFO = -6 ELSE IF( IB.NE.1 ) THEN INFO = -9 ELSE IF( JB.NE.1 ) THEN INFO = -10 ELSE IF( LWORK.LT.NQ ) THEN INFO = -23 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PDGSEPCHK', -INFO ) RETURN END IF * RESULT = 0 ULP = DLAMCH( 'Epsilon' ) * * Compute product of Max-norms of A and Q. * ANORM = PDLANGE( 'M', MS, MS, A, IA, JA, DESCA, WORK )* $ PDLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) IF( ANORM.EQ.ZERO ) $ ANORM = ONE * IF( IBTYPE.EQ.1 ) THEN * * Norm of AQ - BQD * * C = AQ * CALL PDGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 10 I = 1, NV CALL PDSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 10 CONTINUE * * C = C - BQ (i.e. AQ-BQD) * CALL PDGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, Q, $ IQ, JQ, DESCQ, CNEGONE, C, IC, JC, DESCC ) * TSTNRM = ( PDLANGE( 'M', MS, NV, C, IC, JC, DESCC, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * * ELSE IF( IBTYPE.EQ.2 ) THEN * * Norm of ABQ - QD * * * C = BQ * CALL PDGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 20 I = 1, NV CALL PDSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 20 CONTINUE * * Q = AC - Q * CALL PDGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, C, $ IC, JC, DESCC, CNEGONE, Q, IQ, JQ, DESCQ ) * TSTNRM = ( PDLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * ELSE IF( IBTYPE.EQ.3 ) THEN * * Norm of BAQ - QD * * * C = AQ * CALL PDGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 30 I = 1, NV CALL PDSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 30 CONTINUE * * Q = BC - Q * CALL PDGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, C, $ IC, JC, DESCC, CNEGONE, Q, IQ, JQ, DESCQ ) * TSTNRM = ( PDLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * END IF * IF( TSTNRM.GT.THRESH .OR. ( TSTNRM-TSTNRM.NE.0.0D0 ) ) THEN RESULT = 1 END IF RETURN * * End of PDGSEPCHK * END scalapack-2.0.2/TESTING/EIG/pdgsepdriver.f000644 000766 000024 00000023377 10363532303 020311 0ustar00juliestaff000000 000000 * * PROGRAM PDGSEPDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Parallel DOUBLE PRECISION symmetric eigenproblem test driver * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. INTSIZ and DBLSIZ * indicate the length in bytes on the given platform for an integer * and a double precision real. * For example, on our system with 8 MB of memory, TOTMEM=6500000 * (leaves 1.5 MB for OS, code, BLACS buffer, etc), the length of a * DOUBLE is 8, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * The full tester requires approximately (5 n + 5 n^2/p + slop) * DOUBLE PRECISION words and 6*n integer words. * So, TOTMEM should be set to at least 1.1 * 8 * (5n + 5n^2/p) * * WHAT WE TEST * ============ * * This routine tests PDSYGVX, the expert driver for the parallel * symmetric eigenvalue problem. We would like to cover all * possible combinations of: matrix size, process configuration * (nprow and npcol), block size (nb), matrix type (??), range * of eigenvalue (all, by value, by position), sorting options, * and upper vs. lower storage. * * We intend to provide two types of test input files, an * installation test and a thorough test. * * We also intend that the reports be meaningful. Our input file * will allow multiple requests where each request is a cross product * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, DBLESZ, NIN PARAMETER ( TOTMEM = 2000000, DBLESZ = 8, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / DBLESZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) DOUBLE PRECISION MEM( MEMSIZ ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PDGSEPREQ, PDLACHKIEEE, PDLASNBT * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PDLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) END IF * CALL PDLACHKIEEE( ISIEEE, DLAMCH( 'O' ), DLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) END IF GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK symmetric Eigendecomposition routines.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'generalized ' // 'symmetric eigenvalue routine: PDSYGVX.' WRITE( NOUT, FMT = 9999 )'A scaled residual check, ' // $ 'will be computed' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ ' : the QTQ norm is allowed to exceed THRESH' // $ ' for those eigenvectors' WRITE( NOUT, FMT = 9999 )' : which could not be ' // $ 'reorthogonalized for lack of workspace.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see pDGSEPtst.f).' WRITE( NOUT, FMT = 9999 ) $ 'IBTYPE : Generalized eigenproblem type' // $ ' (see pDSYGVx.f)' WRITE( NOUT, FMT = 9999 )'SUB : Subtests ' // $ '(see pDGSEPtst).f' WRITE( NOUT, FMT = 9999 )'CHK : The scaled residual' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PDGSEPREQ( NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, NSKIPPED, $ NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * 9999 FORMAT( A ) 9998 FORMAT( ' I am about to check to make sure that overflow' ) 9997 FORMAT( ' is handled in the ieee default manner. If this' ) 9996 FORMAT( ' is the last output you see, you should assume' ) 9995 FORMAT( ' that overflow caused a floating point exception.' ) 9994 FORMAT( ' In that case, we recommend that you add -DNO_IEEE' ) 9993 FORMAT( ' to the CDEFS line in SLmake.inc.' ) 9992 FORMAT( ' Alternatively, you could set CDEFS in SLmake.inc ' ) 9991 FORMAT( ' to enable the default ieee behaviour, However, this' ) 9990 FORMAT( ' may result in good or very bad performance.' ) 9989 FORMAT( ' Either signed zeroes or signed infinities ' ) 9988 FORMAT( ' work incorrectly or your system. Change your' ) 9987 FORMAT( ' SLmake.inc as suggested above.' ) * 9986 FORMAT( ' Your system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP IBTYPE SUB WALL CPU ', $ ' CHK CHECK' ) 9978 FORMAT( ' ----- --- --- --- --- ------ --- -------- --------', $ ' --------- -----' ) * * End of PDGSEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pdgsepreq.f000644 000766 000024 00000024370 10602576752 017613 0ustar00juliestaff000000 000000 * * SUBROUTINE PDGSEPREQ( NIN, MEM, MEMSIZE, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION MEM( MEMSIZE ) * .. * * Purpose * ======= * * PDGSEPREQ performs one request from the input file 'SEP.dat' * A request is the cross product of the specifications in the * input file. PDGSEPREQ prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEP.dat' * * MEM (local input) DOUBLE PRECISION ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION FIVE PARAMETER ( FIVE = 5.0D+0 ) INTEGER DBLESZ, INTGSZ PARAMETER ( DBLESZ = 8, INTGSZ = 4 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IBTYPE, IMIDPAD, INITCON, $ IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, LDA, LLWORK, MATSIZE, MATTYPE, MYCOL, $ MYROW, N, NB, NIBTYPES, NMATSIZES, NMATTYPES, $ NNODES, NP, NPCOL, NPCONFIGS, NPROW, NQ, $ NUPLOS, ORDER, PCONFIG, PTRA, PTRB, PTRCOPYA, $ PTRCOPYB, PTRGAP, PTRICLUS, PTRIFAIL, PTRIWRK, $ PTRW, PTRW2, PTRWORK, PTRZ, RES, SIZECHK, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, $ SIZESUBTST, SIZESYEVX, SIZETMS, SIZETST, UPLO DOUBLE PRECISION ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL LSAME, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PDGSEPTST, PDLASIZEGSEP, PDSEPINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PDSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * * Thresholds must be bigger for the generalized problem. * THRESH = THRESH*FIVE * DO 50 MATSIZE = 1, NMATSIZES * DO 40 PCONFIG = 1, NPCONFIGS * DO 30 MATTYPE = 1, NMATTYPES * DO 20 UPLO = 1, NUPLOS IF( LSAME( SUBTESTS, 'Y' ) ) THEN NIBTYPES = 3 ELSE NIBTYPES = 1 END IF DO 10 IBTYPE = 1, NIBTYPES * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, $ NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, $ MYROW, MYCOL ) IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PDLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, $ ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, $ ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYB = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRB = PTRCOPYB + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRB + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + MAX( N, 1 ) + IPREPAD + $ IPOSTPAD PTRGAP = PTRW2 + MAX( N, 1 ) + IPREPAD + $ IPOSTPAD PTRIFAIL = PTRGAP + NPROW*NPCOL + IPREPAD + $ IPOSTPAD PTRICLUS = PTRIFAIL + $ ICEIL( N+IPREPAD+IPOSTPAD, $ DBLESZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, DBLESZ / INTGSZ ) PTRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, DBLESZ / INTGSZ ) LLWORK = MEMSIZE - PTRWORK - IPOSTPAD - $ IPREPAD + 1 NTESTS = NTESTS + 1 IF( LLWORK.LT.SIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PDGSEPTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), $ IBTYPE, SUBTESTS, THRESH, $ N, ABSTOL, ISEED, $ MEM( PTRA ), $ MEM( PTRCOPYA ), $ MEM( PTRB ), $ MEM( PTRCOPYB ), $ MEM( PTRZ ), LDA, $ MEM( PTRW ), MEM( PTRW2 ), $ MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ LLWORK, MEM( PTRIWRK ), $ ISIZETST, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * ) $ ' pDGSEPREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE 50 CONTINUE END IF * * RETURN * * End of PDDGSEPREQ * END scalapack-2.0.2/TESTING/EIG/pdgsepsubtst.f000644 000766 000024 00000070770 10363532303 020341 0ustar00juliestaff000000 000000 * * SUBROUTINE PDGSEPSUBTST( WKNOWN, IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, IA, JA, DESCA, WIN, WNEW, $ IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, LWORK1, IWORK, LIWORK, $ RESULT, TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IBTYPE, IL, IPOSTPAD, IPREPAD, IU, JA, $ LIWORK, LWORK, LWORK1, N, NOUT, RESULT DOUBLE PRECISION ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) DOUBLE PRECISION A( * ), B( * ), COPYA( * ), COPYB( * ), $ GAP( * ), WIN( * ), WNEW( * ), WORK( * ), $ Z( * ) * .. * * Purpose * ======= * * PDGSEPSUBTST calls PDSYGVX and then tests the output of * PDSYGVX * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < THRESH * |QT * Q - I| / eps * norm(A) < THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PDSYGVX when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PDGSEPSUBTST * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call to PDGSEPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PDSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) DOUBLE PRECISION array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PDSYGVX for a description of block cyclic layout. * The test matrix, which is then modified by PDSYGVX * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) DOUBLE PRECISION array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * B (local workspace) DOUBLE PRECISION array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. * The B test matrix, which is then modified by PDSYGVX * * COPYB (local input) DOUBLE PRECISION array, dim (N, N) * COPYB is used to hold an identical copy of the array B * identical in both form and content to B * * Z (local workspace) DOUBLE PRECISION array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PDGSEPCHK and PDSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) DOUBLE PRECISION array, dimension (N) * The eigenvalues as copmuted by this call to PDSYGVX * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PDSYGVX * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PDSYGVX * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / |A|*N*EPS * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285D+0, FIVE = 5.0D+0, $ NEGONE = -1.0D+0 ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEVX, SIZETMS, $ SIZETST, VALSIZE, VECSIZE DOUBLE PRECISION EPS, ERROR, MAXERROR, MAXVU, MINERROR, MINVL, $ NORMWIN, OLDVL, OLDVU, ORFAC, SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), DSEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, DGAMN2D, DGAMX2D, $ DLACPY, IGAMN2D, IGAMX2D, PDCHEKPAD, PDELSET, $ PDFILLPAD, PDGSEPCHK, PDLASIZEGSEP, $ PDLASIZESYEVX, PDSYGVX, PICHEKPAD, PIFILLPAD, $ SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PDLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PDLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3D+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159D+0 30 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL MINIL = 1 MAXIU = 0 DO 40 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 40 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * * * DSEED is not used in this call to PDLASIZESYEVX, the * following line just makes ftnchek happy. * DSEED( 1 ) = 1 * CALL PDLASIZESYEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL DLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL DLACPY( 'A', NP, NQ, COPYB, DESCA( LLD_ ), B( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PDFILLPAD( DESCA( CTXT_ ), NP, NQ, B, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0D+2 ) * CALL PDFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0D+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * * Make sure that PDSYGVX does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, MAXEIGS, 1 CALL PDELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0D+0 ) 50 CONTINUE 60 CONTINUE * ORFAC = -1.0D+0 * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PDSYGVX( IBTYPE, JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, $ JA, DESCA, B( 1+IPREPAD ), IA, JA, DESCA, VL, VU, $ IL, IU, ABSTOL, M, NZ, WNEW( 1+IPREPAD ), ORFAC, $ Z( 1+IPREPAD ), IA, JA, DESCA, WORK( 1+IPREPAD ), $ LWORK1, IWORK( 1+IPREPAD ), LIWORK, $ IFAIL( 1+IPREPAD ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYGVX-B', NP, NQ, B, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0D+2 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYGVX-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PDCHEKPAD( DESCZ( CTXT_ ), 'PDSYGVX-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYGVX-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYGVX-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYGVX-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PDSYGVX-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PDSYGVX-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PDSYGVX-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * * Since we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9992 ) RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Make sure that different processes return the same eigenvalues * DO 70 I = 1, M WORK( I ) = WNEW( I+IPREPAD ) WORK( I+M ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, WORK, M, 1, $ 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ WORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 80 I = 1, M * IF( RESULT.EQ.0 .AND. ( ABS( WORK( I )-WORK( M+ $ I ) ).GT.FIVE*EPS*ABS( WORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 80 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 90 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 100 NCLUSTERS = NCLUSTERS + 1 90 CONTINUE 100 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 110 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 110 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * * DO 120 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 120 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 150 * * Note that a couple key variables get redefined in PDGSEPCHK * as described by this table: * * PDGSEPTST name PDGSEPCHK name * ------------- ------------- * COPYA A * Z Q * B B * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the residual check * CALL PDFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3D+0 ) * CALL PDGSEPCHK( IBTYPE, N, NZ, COPYA, IA, JA, DESCA, COPYB, $ IA, JA, DESCA, THRESH, Z( 1+IPREPAD ), IA, $ JA, DESCZ, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDGSEPCHK-WORK', SIZECHK, $ 1, WORK, SIZECHK, IPREPAD, IPOSTPAD, $ 4.3D+0 ) * IF( RES.NE.0 ) $ RESULT = 1 END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN ) THEN * * Set up MYIL if necessary * MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 140 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 130 I = 1, M ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 130 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 140 CONTINUE * * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what PxSYGVX computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * * RETURN * 9999 FORMAT( 'PDSYGVX returned INFO=', I7 ) 9998 FORMAT( 'PDSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PDGSEPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PDSYGVX returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PDSYGVX' ) 9981 FORMAT( 'NZ altered by PDSYGVX with JOBZ=N' ) * * End of PDGSEPSUBTST * END scalapack-2.0.2/TESTING/EIG/pdgseptst.f000644 000766 000024 00000121473 11622500733 017625 0ustar00juliestaff000000 000000 * * SUBROUTINE PDGSEPTST( DESCA, UPLO, N, MATTYPE, IBTYPE, SUBTESTS, $ THRESH, ORDER, ABSTOL, ISEED, A, COPYA, B, $ COPYB, Z, LDA, WIN, WNEW, IFAIL, ICLUSTR, $ GAP, IPREPAD, IPOSTPAD, WORK, LWORK, IWORK, $ LIWORK, NOUT, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SUBTESTS, UPLO INTEGER IBTYPE, INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, $ LWORK, MATTYPE, N, NOUT, ORDER DOUBLE PRECISION ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) DOUBLE PRECISION A( LDA, * ), B( LDA, * ), COPYA( LDA, * ), $ COPYB( LDA, * ), GAP( * ), WIN( * ), WNEW( * ), $ WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PDGSEPTST builds a random matrix A, and a well conditioned * matrix B, runs PDSYGVX() to compute the eigenvalues * and eigenvectors and then calls PDSYGVCHK to compute * the residual. * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) symmetric matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PDSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) DOUBLE PRECISION array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * The test matrix, which is then modified by PDSYGVX * * COPYA (local workspace) DOUBLE PRECISION array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * B (local workspace) DOUBLE PRECISION array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. * The B test matrix, which is then modified by PDSYGVX * * COPYB (local workspace) DOUBLE PRECISION array, dim (N, N) * COPYB is used to hold an identical copy of the array B * identical in both form and content to B * * Z (local workspace) DOUBLE PRECISION array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PDGSEPCHK * * W (local workspace) DOUBLE PRECISION array, dimension (N) * On normal exit from PDSYGVX, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PDLASIZEGSEP * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PDLASIZEGSEP * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE, TEN, HALF PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TEN = 10.0D+0, $ HALF = 0.5D+0 ) DOUBLE PRECISION PADVAL PARAMETER ( PADVAL = 19.25D+0 ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IINFO, IL, IMODE, IN, INDD, $ INDWORK, ISIZESUBTST, ISIZESYEVX, ISIZETST, $ ITYPE, IU, J, LLWORK, LSYEVXSIZE, MAXSIZE, $ MYCOL, MYROW, NB, NGEN, NLOC, NNODES, NP, $ NPCOL, NPROW, NQ, RES, SIZECHK, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, SIZESUBTST, $ SIZESYEVX, SIZETMS, SIZETST, VALSIZE, VECSIZE DOUBLE PRECISION ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL LSAME, NUMROC, DLARAN, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, DLABAD, DLASRT, $ DLATMS, IGAMX2D, IGEBR2D, IGEBS2D, PDCHEKPAD, $ PDELSET, PDFILLPAD, PDGSEPSUBTST, PDLASET, $ PDLASIZEGSEP, PDLASIZESYEVX, PDLATMS, PDMATGEN, $ SLCOMBINE * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX, MIN, MOD, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 PASSED = 'PASSED ' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Make sure that we have enough memory * * CALL PDLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * IF( LWORK.LT.SIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDWORK = INDD + N LLWORK = LWORK - INDWORK + 1 * ULP = PDLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PDLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL DLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / DBLE( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log symmetric, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random symmetric * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * * Zero * * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N WORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PDLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N WORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PDLASET( 'All', N, N, ZERO, ONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0D+0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0D+0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * symmetric, eigenvalues specified * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * symmetric, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PDMATGEN( DESCA( CTXT_ ), 'S', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0D+0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0D+0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PDLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( DLARAN( ISEED )*DBLE( NLOC ) ), N-NGEN ) * CALL DLATMS( IN, IN, 'S', ISEED, 'P', WORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PDELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PDELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PDELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PDELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2, ... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 WORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0D+0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * * * WKNOWN ... NOT SET, GUESS A DEFAULT * WKNOWN = .TRUE. ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL DLASRT( 'I', N, WORK( INDD ), IINFO ) * * Create the B matrix * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.3D+0 ) * ANORM = ONE * * Update ISEED so that {DLAGSY creates a different Q * ISEED( 4 ) = MOD( ISEED( 4 )+257, 4096 ) ISEED( 3 ) = MOD( ISEED( 3 )+192, 4096 ) ISEED( 2 ) = MOD( ISEED( 2 )+35, 4096 ) ISEED( 1 ) = MOD( ISEED( 1 )+128, 4096 ) CALL PDLATMS( N, N, 'S', ISEED, 'P', WORK( INDD ), 3, TEN, $ ANORM, N, N, 'N', COPYB, 1, 1, DESCA, ORDER, $ WORK( INDWORK+IPREPAD ), SIZETMS, IINFO ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS5-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.3D+0 ) * * * These values aren't actually used, but they make ftncheck happy. * IL = -1 IU = -2 VL = ONE VU = -ONE * CALL PDLASIZESYEVX( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, WORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) * LSYEVXSIZE = MIN( MAXSIZE, LWORK ) WKNOWN = .FALSE. * CALL PDGSEPSUBTST( WKNOWN, IBTYPE, 'v', 'a', UPLO, N, VL, VU, $ IL, IU, THRESH, ABSTOL, A, COPYA, B, COPYB, $ Z, 1, 1, DESCA, WORK( INDD ), WIN, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, IWORK, $ ISIZESYEVX, RES, TSTNRM, QTQNRM, NOUT ) * * * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'V', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'V' RANGE = 'A' CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 1' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'V', RANGE = 'A', random memory * IF( INFO.EQ.0 ) THEN JOBZ = 'V' RANGE = 'A' CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 2' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * We use PDLASIZESYEVX to choose IL and IU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'I', maximum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PDLASIZESYEVX to choose IL and IU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = MAXSIZE * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF * * Subtest 6: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PDLASIZESYEVX to choose IL and IU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 6' INFO = 1 END IF END IF * * Subtest 7: JOBZ = 'V', RANGE = 'I', random memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PDLASIZESYEVX to choose IL and IU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) LSYEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 7' INFO = 1 END IF END IF * * Subtest 8: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 8' INFO = 1 END IF END IF * * Subtest 9: JOBZ = 'V', RANGE = 'V', maximum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = MAXSIZE * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 9' INFO = 1 END IF END IF * * Subtest 10: JOBZ = 'V', RANGE = 'V', * minimum memory required for eigenvectors * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest10' INFO = 1 END IF END IF * * Subtest 11: JOBZ = 'V', RANGE = 'V', * random memory (enough for all eigenvectors * but not enough to guarantee orthogonality * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest11' INFO = 1 END IF END IF * * Subtest 12: JOBZ = 'V', RANGE = 'V', * miniimum memory required for eigenvalues only * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest12' INFO = 1 END IF END IF * * Subtest 13: JOBZ = 'V', RANGE = 'V', * random memory (more than minimum required * for eigenvalues, less than required for vectors) * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE + INT( DLARAN( ISEED )* $ DBLE( VECSIZE-VALSIZE ) ) * CALL PDGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest13' INFO = 1 END IF END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9984 )IBTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, CTIME( 1 ), MAXTSTNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS END IF END IF * 120 CONTINUE * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, F8.2, 1X, F8.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, 8X, 1X, F8.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, F8.2, 1X, F8.2, 11X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, 8X, 1X, F8.2, 11X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 22X, 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) 9984 FORMAT( ' IBTYPE=', I8 ) 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) 9980 FORMAT( ' Increase TOTMEM in PDGSEPDRIVER' ) * * End of PDGSEPTST * END scalapack-2.0.2/TESTING/EIG/pdhrddriver.f000644 000766 000024 00000045370 10363532303 020125 0ustar00juliestaff000000 000000 PROGRAM PDHRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * Purpose * ======= * * PDHRDDRIVER is the main test program for the DOUBLE PRECISION * ScaLAPACK HRD (Hessenberg Reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 14 lines: * 'ScaLAPACK HRD input file' * 'PVM machine' * 'HRD.out' output file name (if any) * 6 device out * 2 number of problems sizes * 100 101 values of N * 2 1 values of ILO * 99 101 values of IHI * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, MEMSIZ, NTESTS, TOTMEM DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IHI, IHIP, IHLP, IHLQ, $ ILCOL, ILO, ILROW, INFO, INLQ, IMIDPAD, IPA, $ IPT, IPW, IPOSTPAD, IPREPAD, ITEMP, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LCM, LCMQ, LOFF, $ LWORK, MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, $ NPROCS, NOUT, NP, NPCOL, NPROW, NQ, WORKHRD, $ WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), NVHI( NTESTS ), NVLO( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), MEM( MEMSIZ ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINIT, BLACS_GRIDINFO, $ DESCINIT, IGSUM2D, BLACS_PINFO, PDFILLPAD, $ PDLAFCHK, PDGEHDRV, PDGEHRD, $ PDHRDINFO, PDMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ILCM, INDXG2P, NUMROC DOUBLE PRECISION PDLANGE EXTERNAL ILCM, INDXG2P, NUMROC, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PDHRDINFO( OUTFILE, NOUT, NMAT, NVAL, NVLO, NVHI, NTESTS, $ NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GOTO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) ILO = NVLO( J ) IHI = NVHI( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPT = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * IHIP = NUMROC( IHI, NB, MYROW, DESCA( RSRC_ ), NPROW ) LOFF = MOD( ILO-1, NB ) ILROW = INDXG2P( ILO, NB, MYROW, DESCA( RSRC_ ), NPROW ) ILCOL = INDXG2P( ILO, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) IHLP = NUMROC( IHI-ILO+LOFF+1, NB, MYROW, ILROW, NPROW ) INLQ = NUMROC( N-ILO+LOFF+1, NB, MYCOL, ILCOL, NPCOL ) LWORK = NB*( NB + MAX( IHIP+1, IHLP+INLQ ) ) WORKHRD = LWORK + IPOSTPAD WORKSIZ = WORKHRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IHLQ = NUMROC( IHI-ILO+LOFF+1, NB, MYCOL, ILCOL, $ NPCOL ) ITEMP = NB*MAX( IHLP+INLQ, IHLQ+MAX( IHIP, $ IHLP+NUMROC( NUMROC( IHI-ILO+LOFF+1, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ) ) ) WORKSIZ = MAX( NB*NB + NB*IHLP + ITEMP, NB * NP ) + $ IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Hessenberg reduction', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate A * CALL PDMATGEN( ICTXT, 'No', 'No', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), $ IASEED, 0, NP, 0, NQ, MYROW, MYCOL, $ NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), $ NQ, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANGE( 'I', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKHRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKHRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce Hessenberg form * CALL PDGEHRD( N, ILO, IHI, MEM( IPA ), 1, 1, DESCA, $ MEM( IPT ), MEM( IPW ), LWORK, INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGEHRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEHRD', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEHRD', WORKHRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKHRD-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - Q H Q'|| / (||A||*N*eps) * CALL PDGEHDRV( N, ILO, IHI, MEM( IPA ), 1, 1, DESCA, $ MEM( IPT ), MEM( IPW ) ) CALL PDLAFCHK( 'No', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDGEHDRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEHDRV', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDGEHDRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ.0.0D+0 ) $ THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather max. of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * HRD requires 10/3 * N^3 floating point ops. (flops) * more precisely, * HRD requires 4/3*(IHI-ILO)^3 + 2*IHI*(IHI-ILO)^2 flops * NOPS = DBLE( IHI-ILO ) NOPS = NOPS * NOPS * $ ( 2.0D0*DBLE( IHI ) + (4.0D0/3.0D0)*NOPS ) NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', N, ILO, IHI, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, FRESID, $ PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', N, ILO, IHI, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, FRESID, $ PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N ILO IHI NB P Q HRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ------ ------ ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, $ F9.2, 1X, F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*H*Q''|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PDHRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pdhrdinfo.f000644 000766 000024 00000032374 10363532303 017565 0ustar00juliestaff000000 000000 SUBROUTINE PDHRDINFO( SUMMRY, NOUT, NMAT, NVAL, NVLO, NVHI, $ LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ NVHI( LDNVAL ), NVLO( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PDHRDINFO get the needed startup information for the Hessenberg * reduction tests and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, output to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for * N, IHI & ILO. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of rows & columns in matrix). * * NVLO (global output) INTEGER array, dimension (LDNVAL) * The values of ILO. * * NVHI (global output) INTEGER array, dimension (LDNVAL) * The values of IHI. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, ILO and IHI. LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * 3*LDNVAL+LDNBVAL+2*LDPVAL. Used to pack all input arrays * in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'HRD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = * ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( UNIT = NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1. .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'N', LDNVAL GO TO 20 END IF * * Get values of N, ILO, IHI * READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVLO( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVHI( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 1, 3, WORK, 1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVLO, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVHI, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS -1 CALL IGEBS2D( ICTXT, 'All', ' ', 1, I, WORK, 1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Reduction routine to Hessenberg form.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision Hessenberg ' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - Q H Q''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'HRD time : Time in seconds to compute HRD ' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for HRD ' // $ 'reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9995 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'ILO ', ( NVLO( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVLO( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'IHI ', ( NVHI( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVHI( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9995 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) EPS WRITE( NOUT, FMT = 9993 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 3, WORK, 1, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = 3*NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) * I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVLO, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVHI, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 CONTINUE WRITE( NOUT, FMT = 9998 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' ILLEGAL INPUT IN FILE ', 40A, '. ABORTING RUN.' ) 9997 FORMAT( ' NUMBER OF VALUES OF ', 5A, $ ' IS LESS THAN 1 OR GREATER ', 'THAN ', I2 ) 9996 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9995 FORMAT( 2X, A5, ': ', 10I6 ) 9994 FORMAT( ' ', 10I6 ) 9993 FORMAT( 'Routines pass computational tests if scaled residual is', $ ' less than ', G14.7 ) * * End of PDHRDINFO * END scalapack-2.0.2/TESTING/EIG/pdhseqrdriver.f000644 000766 000024 00000051317 11656313237 020501 0ustar00juliestaff000000 000000 *********************************************************************** * Test program for ScaLAPACK-style routine PDHSEQR * *********************************************************************** * * Contributor: Robert Granat and Meiyue Shao * This version is of Feb 2011. * PROGRAM PDHSEQRDRIVER * * Declarations * IMPLICIT NONE * ...Parameters... LOGICAL BALANCE, COMPHESS, COMPRESI, $ COMPORTH LOGICAL DEBUG, PRN, TIMESTEPS, BARR, $ UNI_LAPACK INTEGER SLV_MIN, SLV_MAX PARAMETER ( DEBUG = .FALSE., $ PRN = .FALSE., $ TIMESTEPS = .TRUE., $ COMPHESS = .TRUE., $ COMPRESI = .TRUE., $ COMPORTH = .TRUE., $ BALANCE = .TRUE., $ BARR = .FALSE., * Solver: 1-PDLAQR1, 2-PDHSEQR. $ SLV_MIN = 2, SLV_MAX = 2, $ UNI_LAPACK = .TRUE. ) INTEGER N, NB, ARSRC, ACSRC PARAMETER ( * Problem size. $ N = 500, NB = 50, * What processor should hold the first element in A? $ ARSRC = 0, ACSRC = 0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DPALLOC, INTALLC INTEGER DPSIZ, INTSZ, NOUT, IZERO PARAMETER ( DPSIZ = 8, DPALLOC = 8 000 000, $ INTSZ = 4, INTALLC = 8 000 000, $ NOUT = 6, IZERO = 0) DOUBLE PRECISION ZERO, ONE, TWO PARAMETER ( ZERO = 0.0D+00, ONE = 1.0D+00, TWO = 2.0D+00 ) * * ...Local Scalars... INTEGER ICTXT, IAM, NPROW, NPCOL, MYROW, MYCOL, $ SYS_NPROCS, NPROCS, AROWS, ACOLS, TEMP_ICTXT INTEGER THREADS INTEGER INFO, KTOP, KBOT, ILO, IHI, I INTEGER IPA, IPACPY, IPQ, WR1, WI1, WR2, WI2, IPW1, $ IPW2, IPIW INTEGER TOTIT, SWEEPS, TOTNS, HESS DOUBLE PRECISION EPS, THRESH DOUBLE PRECISION STAMP, TOTTIME, T_BA, T_GEN, T_HS, T_SCH, T_QR, $ T_RES, ITPEREIG, SWPSPEIG, NSPEIG, SPEEDUP, $ EFFICIENCY DOUBLE PRECISION RNORM, ANORM, R1, ORTH, O1, O2, DPDUM, ELEM1, $ ELEM2, ELEM3, EDIFF INTEGER SOLVER CHARACTER*6 PASSED * * ...Local Arrays... INTEGER DESCA( DLEN_ ), DESCQ( DLEN_ ), DESCVEC( DLEN_ ) DOUBLE PRECISION SCALE( N ) DOUBLE PRECISION, ALLOCATABLE :: MEM(:) INTEGER, ALLOCATABLE :: IMEM(:) * * ...Intrinsic Functions... INTRINSIC INT, DBLE, SQRT, MAX, MIN * * ...External Functions... INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE, MPI_WTIME EXTERNAL BLACS_PINFO, BLACS_GET, BLACS_GRIDINIT, $ BLACS_GRIDINFO, BLACS_GRIDEXIT, BLACS_EXIT EXTERNAL NUMROC, PDLAMCH, PDLASET, PDGEHRD, PDLANGE EXTERNAL DGEBAL, DGEHRD EXTERNAL MPI_WTIME EXTERNAL PDGEBAL EXTERNAL PDMATGEN2 * * ...Executable statements... * CALL BLACS_PINFO( IAM, SYS_NPROCS ) NPROW = INT( SQRT( DBLE(SYS_NPROCS) ) ) NPCOL = SYS_NPROCS / NPROW CALL BLACS_GET( 0, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, '2D', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) c print*, iam, ictxt, myrow, mycol c IF ( ( MYROW.GE.NPROW ) .OR. ( MYCOL.GE.NPCOL ) ) GO TO 777 IF ( ICTXT.LT.0 ) GO TO 777 * * Read out the number of underlying threads and set stack size in * kilobytes. * THRESH = 30.0 TOTTIME = MPI_WTIME() T_GEN = 0.0D+00 T_RES = 0.0D+00 T_SCH = 0.0D+00 * * Allocate and Init memory with zeros. * INFO = 0 ALLOCATE ( MEM( DPALLOC ), STAT = INFO ) IF( INFO.NE.0 ) THEN WRITE(*,*) '% Could not allocate MEM. INFO = ', INFO GO TO 777 END IF ALLOCATE ( IMEM( INTALLC ), STAT = INFO ) IF( INFO.NE.0 ) THEN WRITE(*,*) '% Could not allocate IMEM. INFO = ', INFO GO TO 777 END IF MEM( 1:DPALLOC ) = ZERO IMEM( 1:INTALLC ) = IZERO * * Get machine epsilon. * EPS = PDLAMCH( ICTXT, 'Epsilon' ) * * Print welcoming message. * IF( IAM.EQ.0 ) THEN WRITE(*,*) WRITE(*,*) 'ScaLAPACK Test for PDHSEQR' WRITE(*,*) WRITE(*,*) 'epsilon = ', EPS WRITE(*,*) 'threshold = ', THRESH WRITE(*,*) WRITE(*,*) 'Residual and Orthogonality Residual computed by:' WRITE(*,*) WRITE(*,*) 'Residual = ', $ ' || T - Q^T*A*Q ||_F / ( ||A||_F * eps * sqrt(N) )' WRITE(*,*) WRITE(*,*) 'Orthogonality = ', $ ' MAX( || I - Q^T*Q ||_F, || I - Q*Q^T ||_F ) / ', $ ' (eps * N)' WRITE(*,*) WRITE(*,*) $ 'Test passes if both residuals are less then threshold' WRITE( NOUT, * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) END IF * * Loop over problem parameters. * DO KTOP = 1, 1 DO KBOT = N, N DO SOLVER = SLV_MAX, SLV_MIN, -1 * * Set INFO to zero for this run. * INFO = 0 NPROCS = NPROW*NPCOL TEMP_ICTXT = ICTXT * * Count the number of rows and columns of current problem * for the current block sizes and grid properties. * STAMP = MPI_WTIME() AROWS = NUMROC( N, NB, MYROW, 0, NPROW ) ACOLS = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Set up matrix descriptors. * IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Set up descriptors...' IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') CALL DESCINIT( DESCA, N, N, NB, NB, MIN(ARSRC,NPROW-1), $ MIN(NPCOL-1,ACSRC), TEMP_ICTXT, MAX(1, AROWS), INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% DESCINIT DESCA failed, INFO =", INFO GO TO 999 END IF CALL DESCINIT( DESCQ, N, N, NB, NB, MIN(ARSRC,NPROW-1), $ MIN(NPCOL-1,ACSRC), TEMP_ICTXT, MAX(1, AROWS), INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% DESCINIT DESCQ failed, INFO =", INFO GO TO 999 END IF CALL DESCINIT( DESCVEC, N, 1, N, 1, MIN(ARSRC,NPROW-1), $ MIN(NPCOL-1,ACSRC), TEMP_ICTXT, N, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% DESCINIT DESCVEC failed, INFO =", INFO GO TO 999 END IF * * Assign pointer for ScaLAPACK arrays - first set DP memory. * IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Assign pointers...' IPA = 1 IPACPY = IPA + DESCA( LLD_ ) * ACOLS IPQ = IPACPY + DESCA( LLD_ ) * ACOLS WR1 = IPQ + DESCQ( LLD_ ) * ACOLS WI1 = WR1 + N WR2 = WI1 + N WI2 = WR2 + N IPW1 = WI2 + N IPW2 = IPW1 + DESCA( LLD_ ) * ACOLS IF( DEBUG ) WRITE(*,*) '% (IPW2,DPALLOC):', IPW2, DPALLOC * PRINT*, '%', IPA, IPACPY, IPQ, WR1, WI1, WR2, WI2, IPW1, IPW2 IF( IPW2+DESCA(LLD_)*ACOLS .GT. DPALLOC+1 ) THEN WRITE(*,*) '% Not enough DP memory!' GO TO 999 END IF * * Then set integer memory pointers. * IPIW = 1 * * Generate testproblem. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') CALL PDLASET( 'All over', N, N, ZERO, ONE, MEM(IPQ), 1, 1, $ DESCQ ) CALL PDMATGEN2( TEMP_ICTXT, 'Random', 'NoDiagDominant', $ N, N, NB, NB, MEM(IPA), DESCA( LLD_ ), 0, 0, 7, 0, $ AROWS, 0, ACOLS, MYROW, MYCOL, NPROW, NPCOL ) IF( .NOT. COMPHESS ) THEN CALL PDLASET( 'Lower triangular', N-2, N-2, ZERO, ZERO, $ MEM(IPA), 3, 1, DESCA ) CALL PDLASET( 'All over', N, N, ZERO, ONE, MEM(IPQ), $ 1, 1, DESCQ ) IF( KTOP.GT.1 ) $ CALL PDLASET( 'Lower triangular', KTOP-1, KTOP-1, $ ZERO, ZERO, MEM(IPA), 2, 1, DESCQ ) IF( KBOT.LT.N ) $ CALL PDLASET( 'Lower triangular', N-KBOT, N-KBOT, $ ZERO, ZERO, MEM(IPA), KBOT+1, KBOT, DESCQ ) END IF * * Do balancing if general matrix. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') T_BA = MPI_WTIME() IF( COMPHESS .AND. BALANCE ) THEN IF( NPROCS.EQ.1 .AND. SOLVER.NE.2 .AND. UNI_LAPACK ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == dgebal ==' CALL DGEBAL( 'Both', N, MEM(IPA), DESCA(LLD_), ILO, $ IHI, SCALE, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% DGEBAL failed, INFO =", INFO GO TO 999 END IF ELSE IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdgebal ==' CALL PDGEBAL( 'Both', N, MEM(IPA), DESCA, ILO, IHI, $ SCALE, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% PDGEBAL failed, INFO =", INFO GO TO 999 END IF END IF ELSEIF( COMPHESS ) THEN ILO = 1 IHI = N ELSE ILO = KTOP IHI = KBOT END IF T_BA = MPI_WTIME() - T_BA c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Balancing took in seconds:',T_BA IF( DEBUG ) WRITE(*,*) '% #', IAM, ': ILO,IHI=',ILO,IHI * * Make a copy of A. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Copy matrix A' CALL PDLACPY( 'All', N, N, MEM(IPA), 1, 1, DESCA, MEM(IPACPY), $ 1, 1, DESCA ) * * Print matrices to screen in debugging mode. * IF( PRN ) $ CALL PDLAPRNT( N, N, MEM(IPACPY), 1, 1, DESCA, 0, 0, $ 'A', NOUT, MEM(IPW1) ) T_GEN = T_GEN + MPI_WTIME() - STAMP - T_BA c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Generation took in seconds:',T_GEN * * Only compute the Hessenberg form if necessary. * T_HS = MPI_WTIME() IF( .NOT. COMPHESS ) GO TO 30 * * Reduce A to Hessenberg form. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') IF( DEBUG ) WRITE(*,*) '% #', IAM, $ ': Reduce to Hessenberg form...N=',N, ILO,IHI * PRINT*, '% PDGEHRD: IPW2,MEM(IPW2)', IPW2, MEM(IPW2) IF( NPROCS.EQ.1 .AND. SOLVER.NE.2 .AND. UNI_LAPACK ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == dgehrd ==' CALL DGEHRD( N, ILO, IHI, MEM(IPA), DESCA(LLD_), $ MEM(IPW1), MEM(IPW2), -1, INFO ) IF (DPALLOC-IPW2.LT.MEM(IPW2)) THEN WRITE(*,*) "% Not enough memory for DGEHRD" GO TO 999 END IF CALL DGEHRD( N, ILO, IHI, MEM(IPA), DESCA(LLD_), $ MEM(IPW1), MEM(IPW2), DPALLOC-IPW2, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% DGEHRD failed, INFO =", INFO GO TO 999 END IF ELSE IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdgehrd ==' CALL PDGEHRD( N, ILO, IHI, MEM(IPA), 1, 1, DESCA, MEM(IPW1), $ MEM(IPW2), -1, INFO ) IF (DPALLOC-IPW2.LT.MEM(IPW2)) THEN WRITE(*,*) "% Not enough memory for PDGEHRD" GO TO 999 END IF CALL PDGEHRD( N, ILO, IHI, MEM(IPA), 1, 1, DESCA, MEM(IPW1), $ MEM(IPW2), DPALLOC-IPW2, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% PDGEHRD failed, INFO =", INFO GO TO 999 END IF END IF * * Form Q explicitly. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') IF( DEBUG ) WRITE(*,*) '% #', IAM, ':Form Q explicitly' * PRINT*, '% PDORMHR: IPW2,MEM(IPW2)', IPW2, MEM(IPW2) IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdormhr ==' CALL PDORMHR( 'L', 'N', N, N, ILO, IHI, MEM(IPA), 1, 1, $ DESCA, MEM(IPW1), MEM(IPQ), 1, 1, DESCQ, MEM(IPW2), $ -1, INFO ) IF (DPALLOC-IPW2.LT.MEM(IPW2)) THEN WRITE(*,*) "% Not enough memory for PDORMHR" GO TO 999 END IF CALL PDORMHR( 'L', 'N', N, N, ILO, IHI, MEM(IPA), 1, 1, $ DESCA, MEM(IPW1), MEM(IPQ), 1, 1, DESCQ, MEM(IPW2), $ DPALLOC-IPW2, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% PDORMHR failed, INFO =", INFO GO TO 999 END IF * * Extract the upper Hessenberg part of A. * CALL PDLASET( 'Lower triangular', N-2, N-2, ZERO, ZERO, $ MEM(IPA), 3, 1, DESCA ) * * Print reduced matrix A in debugging mode. * IF( PRN ) THEN CALL PDLAPRNT( N, N, MEM(IPA), 1, 1, DESCA, 0, 0, 'H', NOUT, $ MEM(IPW1) ) CALL PDLAPRNT( N, N, MEM(IPQ), 1, 1, DESCQ, 0, 0, 'Q', NOUT, $ MEM(IPW1) ) END IF * 30 CONTINUE T_HS = MPI_WTIME() - T_HS c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Hessenberg took in seconds:',T_HS * * Compute the real Schur form of the Hessenberg matrix A. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') T_QR = MPI_WTIME() IF( SOLVER.EQ.1 ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdlaqr1 ==' * PRINT*, '% PDLAQR1: IPW1,MEM(IPW1)', IPW1, MEM(IPW1) CALL PDLAQR1( .TRUE., .TRUE., N, ILO, IHI, MEM(IPA), DESCA, $ MEM(WR1), MEM(WI1), ILO, IHI, MEM(IPQ), DESCQ, $ MEM(IPW1), -1, IMEM, -1, INFO ) IF (DPALLOC-IPW1.LT.MEM(IPW1)) THEN WRITE(*,*) "% Not enough DP memory for PDLAQR1" GO TO 999 END IF IF (INTALLC.LT.IMEM(1)) THEN WRITE(*,*) "% Not enough INT memory for PDLAQR1" GO TO 999 END IF CALL PDLAQR1( .TRUE., .TRUE., N, ILO, IHI, MEM(IPA), DESCA, $ MEM(WR1), MEM(WI1), ILO, IHI, MEM(IPQ), DESCQ, $ MEM(IPW1), DPALLOC-IPW1+1, IMEM, INTALLC, INFO ) IF (INFO.NE.0) THEN WRITE(*,*) "% PDLAQR1: INFO =", INFO END IF ELSEIF( SOLVER.EQ.2 ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdhseqr ==' * PRINT*, '% PDHSEQR: IPW1,MEM(IPW1)', IPW1, MEM(IPW1) IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') CALL PDHSEQR( 'Schur', 'Vectors', N, ILO, IHI, MEM(IPA), $ DESCA, MEM(WR2), MEM(WI2), MEM(IPQ), DESCQ, MEM(IPW1), $ -1, IMEM, -1, INFO ) IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') IF (DPALLOC-IPW1.LT.MEM(IPW1)) THEN WRITE(*,*) "% Not enough DP memory for PDHSEQR" GO TO 999 END IF IF (INTALLC.LT.IMEM(1)) THEN WRITE(*,*) "% Not enough INT memory for PDHSEQR" GO TO 999 END IF IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') CALL PDHSEQR( 'Schur', 'Vectors', N, ILO, IHI, MEM(IPA), $ DESCA, MEM(WR2), MEM(WI2), MEM(IPQ), DESCQ, MEM(IPW1), $ DPALLOC-IPW1+1, IMEM, INTALLC, INFO ) IF (INFO.NE.0) THEN WRITE(*,*) "% PDHSEQR: INFO =", INFO END IF ELSE WRITE(*,*) '% ERROR: Illegal SOLVER number!' GO TO 999 END IF T_QR = MPI_WTIME() - T_QR c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% QR-algorithm took in seconds:',T_QR T_SCH = T_SCH + T_QR + T_HS + T_BA * TOTIT = IMEM(1) * SWEEPS = IMEM(2) * TOTNS = IMEM(3) ITPEREIG = DBLE(TOTIT) / DBLE(N) SWPSPEIG = DBLE(SWEEPS) / DBLE(N) NSPEIG = DBLE(TOTNS) / DBLE(N) * * Print reduced matrix A in debugging mode. * IF( PRN ) THEN CALL PDLAPRNT( N, N, MEM(IPA), 1, 1, DESCA, 0, 0, 'T', $ NOUT, MEM(IPW1) ) CALL PDLAPRNT( N, N, MEM(IPQ), 1, 1, DESCQ, 0, 0, 'Z', $ NOUT, MEM(IPW1) ) END IF * * Check that returned Schur form is really a quasi-triangular * matrix. * HESS = 0 DO I = 1, N-1 IF( I.GT.1 ) THEN CALL PDELGET( 'All', '1-Tree', ELEM1, MEM(IPA), I, I-1, $ DESCA ) ELSE ELEM1 = ZERO END IF CALL PDELGET( 'All', '1-Tree', ELEM2, MEM(IPA), I+1, I, $ DESCA ) IF( I.LT.N-1 ) THEN CALL PDELGET( 'All', '1-Tree', ELEM3, MEM(IPA), I+2, I+1, $ DESCA ) ELSE ELEM3 = ZERO END IF IF( ELEM2.NE.ZERO .AND. ABS(ELEM1)+ABS(ELEM2)+ABS(ELEM3).GT. $ ABS(ELEM2) ) HESS = HESS + 1 END DO * * Compute residual norms and other results: * * 1) RNORM = || T - Q'*A*Q ||_F / ||A||_F * 2) ORTH = MAX( || I - Q'*Q ||_F, || I - Q*Q' ||_F ) / * (epsilon*N) * STAMP = MPI_WTIME() IF( COMPRESI ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Compute residuals 1' IF( DEBUG ) WRITE(*,*) '% #', IAM, ': pdgemm 3' CALL PDGEMM( 'N', 'N', N, N, N, ONE, MEM(IPACPY), 1, 1, $ DESCA, MEM(IPQ), 1, 1, DESCQ, ZERO, MEM(IPW1), 1, 1, $ DESCA ) IF( DEBUG ) WRITE(*,*) '% #', IAM, ': pdgemm 4' IF( DEBUG ) WRITE(*,*) '% #', IAM, ': N=',N IF( DEBUG ) WRITE(*,*) '% #', IAM, ': DESCA=',DESCA(1:DLEN_) IF( DEBUG ) WRITE(*,*) '% #', IAM, ': DESCQ=',DESCQ(1:DLEN_) CALL PDGEMM( 'T', 'N', N, N, N, -ONE, MEM(IPQ), 1, 1, $ DESCQ, MEM(IPW1), 1, 1, DESCA, ONE, MEM(IPA), 1, 1, $ DESCA ) R1 = PDLANGE( 'Frobenius', N, N, MEM(IPA), 1, 1, DESCA, $ DPDUM ) ANORM = PDLANGE( 'Frobenius', N, N, MEM(IPACPY), 1, 1, $ DESCA, DPDUM ) IF( ANORM.GT.ZERO )THEN RNORM = R1 / (ANORM*EPS*SQRT(DBLE(N))) ELSE RNORM = R1 END IF ELSE RNORM = 0.0D0 END IF * IF( COMPORTH ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Compute residuals 2' CALL PDLASET( 'All', N, N, ZERO, ONE, MEM(IPW1), 1, 1, $ DESCQ ) CALL PDLACPY( 'All', N, N, MEM(IPQ), 1, 1, DESCQ, MEM(IPW2), $ 1, 1, DESCQ ) CALL PDGEMM( 'T', 'N', N, N, N, -ONE, MEM(IPQ), 1, 1, DESCQ, $ MEM(IPW2), 1, 1, DESCQ, ONE, MEM(IPW1), 1, 1, DESCQ ) O1 = PDLANGE( 'Frobenius', N, N, MEM(IPW1), 1, 1, DESCQ, $ DPDUM ) CALL PDLASET( 'All', N, N, ZERO, ONE, MEM(IPW1), 1, 1, $ DESCQ ) CALL PDGEMM( 'N', 'T', N, N, N, -ONE, MEM(IPQ), 1, 1, DESCQ, $ MEM(IPW2), 1, 1, DESCQ, ONE, MEM(IPW1), 1, 1, DESCQ ) O2 = PDLANGE( 'Frobenius', N, N, MEM(IPW1), 1, 1, DESCQ, $ DPDUM ) ORTH = MAX(O1,O2) / (EPS*DBLE(N)) ELSE ORTH = 0.0D0 END IF * T_RES = T_RES + MPI_WTIME() - STAMP c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Residuals took in seconds:',T_RES TOTTIME = MPI_WTIME() - TOTTIME c IF( IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Total execution time in seconds:', TOTTIME * * * Print results to screen. * IF( (ORTH.GT.THRESH).OR.(RNORM.GT.THRESH) ) THEN PASSED = 'FAILED' ELSE PASSED = 'PASSED' END IF IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Print results...' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9993 ) N, NB, NPROW, NPCOL, T_QR, PASSED END IF CALL BLACS_BARRIER( ICTXT, 'All' ) END DO END DO END DO 999 CONTINUE * * Deallocate MEM and IMEM. * DEALLOCATE( MEM, IMEM ) * CALL BLACS_GRIDEXIT( ICTXT ) * 777 CONTINUE * CALL BLACS_EXIT( 0 ) * * Format specifications. * 6666 FORMAT(A2,A3,A6,A4,A5,A6,A3,A3,A3,A9,A9,A9,A8,A8,A9,A9,A9,A9,A9, $ A9,A9,A9,A9,A9,A9,A5,A5,A8,A5,A5) 7777 FORMAT(A2,I3,I6,I4,I5,I6,I3,I3,I3,F9.2,F9.2,F9.2,F8.2,F8.2,F9.2, $ F9.2,F9.2,F9.2,F9.2,F9.2,F9.2,F9.2,E9.2,E9.2,E9.2,I5,I5, $ F8.4,I5,I5,A2) 9995 FORMAT( ' N NB P Q QR Time CHECK' ) 9994 FORMAT( '----- --- ---- ---- -------- ------' ) 9993 FORMAT( I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, A6 ) * END scalapack-2.0.2/TESTING/EIG/pdlafchk.f000644 000766 000024 00000022260 10363532303 017355 0ustar00juliestaff000000 000000 SUBROUTINE PDLAFCHK( AFORM, DIAG, M, N, A, IA, JA, DESCA, IASEED, $ ANORM, FRESID, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER AFORM, DIAG INTEGER IA, IASEED, JA, M, N DOUBLE PRECISION ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDLAFCHK computes the residual * || sub( A ) - sub( Ao ) || / (|| sub( Ao ) ||*eps*MAX(M,N)), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A( IA:IA+M-1, JA:JA+N-1 ) and ||.|| stands for the infini- * ty norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * AFORM (global input) CHARACTER * sub( A ) is overwritten with: * - a symmetric matrix, if AFORM = 'S'; * - a Hermitian matrix, if AFORM = 'H'; * - the transpose of what would normally be generated, * if AFORM = 'T'; * - the conjugate transpose of what would normally be * generated, if AFORM = 'C'; * - otherwise a random matrix. * * DIAG (global input) CHARACTER * if DIAG = 'D' : sub( A ) is diagonally dominant. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub( A ) - sub( Ao ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * ANORM (global input) DOUBLE PRECISION * The Infinity norm of sub( A ). * * FRESID (global output) DOUBLE PRECISION * The maximum (worst) factorizational error. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK). * LWORK >= MpA0 * NB_A, where * * IROFFA = MOD( IA-1, MB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * WORK is used to store a block of columns of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IOFFA, IROFF, $ JB, JJ, JJA, JN, LDA, LDW, MP, MYCOL, MYROW, $ NPCOL, NPROW, NQ DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DMATADD, INFOG2L, PDMATGEN * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL ICEIL, NUMROC, PDLAMCH, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( ICTXT, 'eps' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * * Compute sub( A ) := sub( A ) - sub( Ao ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDW = MAX( 1, MP ) LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA - 1 )*LDA * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN JB = JN-JA+1 CALL PDMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ MP, JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL DMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) JJA = JJA + JB NQ = NQ - JB IOFFA = IOFFA + JB * LDA END IF * * Handle the remaning blocks of columns * DO 10 JJ = JJA, JJA+NQ-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JJA+NQ-JJ ) IOFFA = IIA + ( JJ - 1 ) * LDA CALL PDMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ MP, JJ-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL DMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) 10 CONTINUE * * Calculate factor residual * FRESID = PDLANGE( 'I', M, N, A, IA, JA, DESCA, WORK ) / $ ( MAX( M, N ) * EPS * ANORM ) * RETURN * * End PDLAFCHK * END scalapack-2.0.2/TESTING/EIG/pdlagge.f000644 000766 000024 00000030753 10363532303 017212 0ustar00juliestaff000000 000000 SUBROUTINE PDLAGGE( M, N, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N, ORDER * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) DOUBLE PRECISION A( * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * PDLAGGE generates a real symmetric matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random orthogonal * matrices: A = U*D*VT. * * This is just a quick implementation which will be replaced in the * future. The random matrix A1(m,n) is generated and random left * orthogonal matrix U(m,m) is obtained by running QR on A1: * A1(m,n) = U(m,m)*R, * where U(m,m) is a product of min(m,n) Householder rotations. * Afterwards the space of A1 is reused for a second random matrix * A2(m,n), which is used to obtain the right orthogonal matrix VT(n,n) * by running LQ on A2: * A2(m,n) = L*VT(n,n). * This requires vastly more computation than necessary, but not * significantly more communication than is used in the rest of this * routine, and hence is not that much slower than an efficient * solution. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * Number of rows of the matrix A. M >= 0. * * N (global input) INTEGER * Number of columns of matrix A. N >= 0. * * D (local input) DOUBLE PRECISION array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (local output) DOUBLE PRECISION array * Global dimension (M, N), local dimension (MP, NQ) * * IA (global input) INTEGER * The global row index of the submatrix of the distributed * matrix A to operate on. * * JA (global input) INTEGER * The global column index of the submatrix of the distributed * matrix A to operate on. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix A. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. On exit, the seed is updated and will remain identical * on all processes in the context. * * ORDER (global input) INTEGER * Number of reflectors in the matrix Q * At present, ORDER .NE. N is not supported * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER dimension of WORK * LWORK >= MAX( QR_WORK, LQ_WORK ) * QR_WORK = LDAA*MAX( 1, NQ ) + 200 + MAX( 1, DTAU1 ) + * MAX( SIZEMQRLEFT, SIZEQRF) * LQ_WORK = LDAA*MAX( 1, NQ ) + 200 + MAX( 1, DTAU2) + * MAX( SIZEMLQRIGHT, SIZEQRF ) * Where: * LDAA = DESCA( LLD_ ) * MB_A = DESCA( MB_ ) * NB_A = DESCA( NB_ ) * RSRC_A = DESCA( RSRC_ ) * CSRC_A = DESCA( CSRC_ ) * LCM = ILCM( NPROW, NPCOL ) * LCMQ = LCM / NPCOL * IROFFA = MOD( IA-1, MB_A ) * ICOFFA = MOD( JA-1, NB_A ) * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ) * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) * MP = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ) * NQ = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ) * DTAU1 = NUMROC( JA + SIZE- 1, NB_A, MYCOL, IACOL, NPROW ) * DTAU2 = NUMROC( IA + SIZE- 1, MB_A, MYROW, IAROW, NPROW ) * SIZEMQRLEFT = MAX( (MB_A*(MB_A-1))/2, ( MP + NQ ) * MB_A ) * + ( MP + NB_A ) * NB_A * SIZEMLQRIGHT = MAX( (MB_A*(MB_A-1))/2, (MP + NQ)*MB_A ) + * MB_A * MB_A * SIZEQRF = NB_A*NP + MB_A*NQ + NB_A*NB_A * * INFO (local output) INTEGER * * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER CSRC_A, DTAU1, DTAU2, I, IACOL, IAROW, ICOFFA, $ IROFFA, LCM, LCMQ, LDAA, LQ_WORK, LWMIN, MB_A, $ MP, MYCOL, MYROW, NB_A, NPCOL, NPROW, NQ, $ PTR2AA, PTR2TAU, PTR2WORK, QR_WORK, RSRC_A, $ SIZE, SIZELQF, SIZEMLQRIGHT, SIZEMQRLEFT, $ SIZEQRF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PDELSET, PDGELQF, $ PDGEQRF, PDLASET, PDMATGEN, PDORMLQ, PDORMQR, $ PXERBLA * .. * .. External Functions .. INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*DLEN_*DTYPE_*M_*N_.LT.0 )RETURN * * Initialize grid information. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * * Check LWORK. * INFO = 0 SIZE = MIN( M, N ) IF( NPROW.EQ.-1 ) THEN INFO = -607 ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 8, INFO ) END IF * Calculation of a minimum workspace. LDAA = DESCA( LLD_ ) MB_A = DESCA( MB_ ) NB_A = DESCA( NB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) DTAU1 = NUMROC( JA+SIZE-1, NB_A, MYCOL, IACOL, NPCOL ) DTAU2 = NUMROC( IA+SIZE-1, MB_A, MYROW, IAROW, NPROW ) MP = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ) * SIZEMQRLEFT = MAX( ( MB_A*( MB_A-1 ) ) / 2, ( MP+NQ )*MB_A ) + $ ( MP+NB_A )*NB_A SIZEMLQRIGHT = MAX( ( MB_A*( MB_A-1 ) ) / 2, ( MP+NQ )*MB_A ) + $ MB_A*MB_A SIZEQRF = NB_A*MP + MB_A*NQ + NB_A*NB_A + 100 SIZELQF = NB_A*( MP+NQ+NB_A ) + 100 * QR_WORK = LDAA*MAX( 1, NQ ) + 200 + MAX( 1, DTAU1 ) + $ MAX( SIZEMQRLEFT, SIZEQRF ) LQ_WORK = LDAA*MAX( 1, NQ ) + 200 + MAX( 1, DTAU2 ) + $ MAX( SIZEMLQRIGHT, SIZELQF ) LWMIN = MAX( QR_WORK, LQ_WORK ) WORK( 1 ) = LWMIN IF( LWORK.EQ.-1 ) $ GO TO 20 * * Test the input arguments. * IF( INFO.EQ.0 ) THEN IF( SIZE.NE.ORDER ) THEN INFO = -9 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( INFO.LT.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PDLAGGE', -INFO ) RETURN END IF * * Build a diagonal matrix A with the eigenvalues specified in D. * CALL PDLASET( 'Full', M, N, ZERO, ZERO, A, IA, JA, DESCA ) DO 10 I = 1, SIZE CALL PDELSET( A, I, I, DESCA, D( I ) ) 10 CONTINUE * * Local dimension of array TAU in tis case is LOCc(JA+MIN(M,N)-1). * PTR2AA = 2 PTR2TAU = PTR2AA + LDAA*MAX( 1, NQ ) + 100 PTR2WORK = PTR2TAU + MAX( 1, DTAU1 ) + 100 * CALL PDLASET( 'All', M, N, ZERO, ZERO, WORK( PTR2AA ), IA, JA, $ DESCA ) * * Build a random matrix AA1. * CALL PDMATGEN( DESCA( CTXT_ ), 'N', 'N', M, N, DESCA( MB_ ), $ DESCA( NB_ ), WORK( PTR2AA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), 0, MP, $ 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Produce QR decomposition AA1 -> U*R. * CALL PDGEQRF( M, N, WORK( PTR2AA ), IA, JA, DESCA, $ WORK( PTR2TAU ), WORK( PTR2WORK ), SIZEQRF, INFO ) * * A = U*A. * CALL PDORMQR( 'L', 'N', M, N, SIZE, WORK( PTR2AA ), IA, JA, DESCA, $ WORK( PTR2TAU ), A, IA, JA, DESCA, WORK( PTR2WORK ), $ SIZEMQRLEFT, INFO ) * * Reinitialize pointer to WORK array. Dimension of array TAU in * this case is LOCr(IA+MIN(M,N)-1). * PTR2WORK = PTR2TAU + MAX( 1, DTAU2 ) + 100 * * Use the same workspace to generate a random matrix AA2. * CALL PDMATGEN( DESCA( CTXT_ ), 'N', 'N', M, N, DESCA( MB_ ), $ DESCA( NB_ ), WORK( PTR2AA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 2 ), 0, MP, $ 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Produce LQ decomposition of random matrix AA2 -> L*VT. * CALL PDGELQF( M, N, WORK( PTR2AA ), IA, JA, DESCA, $ WORK( PTR2TAU ), WORK( PTR2WORK ), SIZELQF, INFO ) * * Calculate A = A*VT. * CALL PDORMLQ( 'R', 'N', M, N, SIZE, WORK( PTR2AA ), IA, JA, DESCA, $ WORK( PTR2TAU ), A, IA, JA, DESCA, WORK( PTR2WORK ), $ SIZEMLQRIGHT, INFO ) * * End of PDLAGGE * 20 CONTINUE RETURN END scalapack-2.0.2/TESTING/EIG/pdlagsy.f000644 000766 000024 00000025324 10363532303 017250 0ustar00juliestaff000000 000000 * * SUBROUTINE PDLAGSY( N, K, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, INFO ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, N, ORDER * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) DOUBLE PRECISION A( * ), D( * ), WORK( * ) * .. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Purpose * ======= * * PDLAGSY generates a real symmetric matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random orthogonal matrix: * A = U*D*U'. * * This is just a quick implementation which will be replaced in the * future. The random orthogonal matrix is computed by creating a * random matrix and running QR on it. This requires vastly more * computation than necessary, but not significantly more communication * than is used in the rest of this rouinte, and hence is not that much * slower than an efficient solution. * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix A. N >= 0. * * K (global input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * ### K must be 0 or N-1, 0 < K < N-1 is not supported yet. * * D (global input) DOUBLE PRECISION array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (local output) DOUBLE PRECISION array * Global dimension (N, N), local dimension (NP, NQ) * The generated n by n symmetric matrix A (the full matrix is * stored). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated and will remain identical on * all processes in the context. * * ORDER (global input) INTEGER * Number of reflectors in the matrix Q * At present, ORDER .NE. N is not supported * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER dimension of WORK * LWORK >= SIZETMS as returned by PDLASIZESEP * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER CSRC_A, I, IACOL, IAROW, ICOFFA, II, IIROW, $ INDAA, INDTAU, INDWORK, IPOSTPAD, IPREPAD, $ IROFFA, ISIZESUBTST, ISIZESYEVX, ISIZETST, $ JJCOL, LDAA, LII, LIII, LJJ, LJJJ, LWMIN, MAXI, $ MB_A, MYCOL, MYROW, NB_A, NP, NPCOL, NPROW, NQ, $ RSRC_A, SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZESYEVX, $ SIZETMS, SIZETST * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DLASET, PDGEQRF, $ PDLASIZESEP, PDMATGEN, PDORMQR, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. * INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Initialize grid information * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * * Check LWORK * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) END IF * LDAA = DESCA( LLD_ ) MB_A = DESCA( MB_ ) NB_A = DESCA( NB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) NP = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ) IPREPAD = 0 IPOSTPAD = 0 CALL PDLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) LWMIN = SIZETMS * * Test the input arguments * IF( INFO.EQ.0 ) THEN IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( N.NE.ORDER ) THEN INFO = -9 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( INFO.LT.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PDLAGSY', -INFO ) RETURN END IF * INDAA = 1 INDTAU = INDAA + LDAA*MAX( 1, NQ ) INDWORK = INDTAU + MAX( 1, NQ ) * IF( K.NE.0 ) THEN CALL DLASET( 'A', LDAA, NQ, ZERO, ZERO, WORK( INDAA ), LDAA ) * * * Build a random matrix * * CALL PDMATGEN( DESCA( CTXT_ ), 'N', 'N', N, ORDER, $ DESCA( MB_ ), DESCA( NB_ ), WORK( INDAA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ ISEED( 1 ), 0, NP, 0, NQ, MYROW, MYCOL, NPROW, $ NPCOL ) CALL PDGEQRF( N, ORDER, WORK( INDAA ), IA, JA, DESCA, $ WORK( INDTAU ), WORK( INDWORK ), SIZEQRF, INFO ) * END IF * * Build a diagonal matrix A with the eigenvalues specified in D * CALL DLASET( 'A', NP, NQ, ZERO, ZERO, A, DESCA( LLD_ ) ) * IIROW = 0 JJCOL = 0 LII = 1 LJJ = 1 * DO 20 II = 1, N, DESCA( MB_ ) MAXI = MIN( N, II+DESCA( MB_ )-1 ) IF( ( MYROW.EQ.IIROW ) .AND. ( MYCOL.EQ.JJCOL ) ) THEN LIII = LII LJJJ = LJJ DO 10 I = II, MAXI A( LIII+( LJJJ-1 )*DESCA( LLD_ ) ) = D( I ) LIII = LIII + 1 LJJJ = LJJJ + 1 10 CONTINUE END IF IF( MYROW.EQ.IIROW ) $ LII = LII + DESCA( MB_ ) IF( MYCOL.EQ.JJCOL ) $ LJJ = LJJ + DESCA( MB_ ) IIROW = MOD( IIROW+1, NPROW ) JJCOL = MOD( JJCOL+1, NPCOL ) 20 CONTINUE * * A = Q * A * IF( K.NE.0 ) THEN * CALL PDORMQR( 'L', 'Transpose', N, N, ORDER, WORK( INDAA ), IA, $ JA, DESCA, WORK( INDTAU ), A, IA, JA, DESCA, $ WORK( INDWORK ), SIZEMQRLEFT, INFO ) * * * A = A * Q' * * CALL PDORMQR( 'R', 'N', N, N, ORDER, WORK( INDAA ), IA, JA, $ DESCA, WORK( INDTAU ), A, IA, JA, DESCA, $ WORK( INDWORK ), SIZEMQRRIGHT, INFO ) * END IF * * End of PDLAGSY * END scalapack-2.0.2/TESTING/EIG/pdlasizegsep.f000644 000766 000024 00000011113 10363532303 020266 0ustar00juliestaff000000 000000 * * SUBROUTINE PDLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, $ SIZESUBTST, ISIZESUBTST, SIZETST, $ ISIZETST ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZESYEVX, $ SIZETMS, SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PDLASIZEGSEP computes the amount of memory needed by * various GSEP test routines, as well as SYGVX itself * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PDSYGVX * * SIZEMQRLEFT LWORK for the 1st PDORMQR call in PDLAGSY * * SIZEMQRRIGHT LWORK for the 2nd PDORMQR call in PDLAGSY * * SIZEQRF LWORK for PDGEQRF in PDLAGSY * * SIZETMS LWORK for PDLATMS * * SIZEQTQ LWORK for PDSEPQTQ (nexer complex) * * SIZECHK LWORK for PDGSEPCHK * * SIZESYEVX LWORK for PDSYGVX * * ISIZESYEVX LIWORK for PDSYGVX * * SIZESUBTST LWORK for PDSUBTST * * ISIZESUBTST LIWORK for PDSUBTST * * SIZETST LWORK for PDTST * * ISIZETST LIWORK for PDTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, LCM, $ LCMQ, LDA, MQ0, MYCOL, MYROW, N, NB, NEIG, NN, $ NNP, NP, NP0, NPCOL, NPROW, NQ, RSRC_A * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * .. ** .. Executable Statements .. * This is just to keep ftnchek happy * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 0 SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZESYEVX = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZESYEVX = 6*NNP * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZESYEVX ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZESYEVX + IPREPAD + IPOSTPAD * * * Allow room for A, COPYA and Z and DIAG, WIN, WNEW, GAP, WORK * SIZETST = 5*( LDA*NP+IPREPAD+IPOSTPAD ) + $ 4*( N+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PDSYGVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pdlasizesep.f000644 000766 000024 00000011152 10363532303 020122 0ustar00juliestaff000000 000000 * * SUBROUTINE PDLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, $ SIZESUBTST, ISIZESUBTST, SIZETST, $ ISIZETST ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZESYEVX, $ SIZETMS, SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PDLASIZESEP computes the amount of memory needed by * various SEP test routines, as well as SYEVX itself * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PDSYEVX * * SIZEMQRLEFT LWORK for the 1st PDORMQR call in PDLAGSY * * SIZEMQRRIGHT LWORK for the 2nd PDORMQR call in PDLAGSY * * SIZEQRF LWORK for PDGEQRF in PDLAGSY * * SIZETMS LWORK for PDLATMS * * SIZEQTQ LWORK for PDSEPQTQ (nexer complex) * * SIZECHK LWORK for PDSEPCHK * * SIZESYEVX LWORK for PDSYEVX * * ISIZESYEVX LIWORK for PDSYEVX * * SIZESUBTST LWORK for PDSUBTST * * ISIZESUBTST LIWORK for PDSUBTST * * SIZETST LWORK for PDTST * * ISIZETST LIWORK for PDTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, LCM, $ LCMQ, LDA, MQ0, MYCOL, MYROW, N, NB, NEIG, NN, $ NNP, NP, NP0, NPCOL, NPROW, NQ, RSRC_A * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * .. ** .. Executable Statements .. * This is just to keep ftnchek happy * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZESYEVX = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZESYEVX = 6*NNP * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZESYEVX ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZESYEVX + IPREPAD + IPOSTPAD * * * Allow room for A, COPYA and Z and DIAG, WIN, WNEW, GAP, WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + $ 4*( N+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PDSYEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pdlasizesepr.f000644 000766 000024 00000010651 11623527140 020312 0ustar00juliestaff000000 000000 SUBROUTINE PDLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVR, ISIZESYEVR, $ SIZESUBTST, ISIZESUBTST, SIZETST, $ ISIZETST ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVR, $ ISIZETST, SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZESYEVR, $ SIZETMS, SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * * Purpose * ======= * * PDLASIZESEPR computes the amount of memory needed by * various SEPR test routines, as well as PDSYEVR itself. * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor for dense matrix. * * SIZEMQRLEFT LWORK for the 1st PDORMQR call in PDLAGSY * * SIZEMQRRIGHT LWORK for the 2nd PDORMQR call in PDLAGSY * * SIZEQRF LWORK for PDGEQRF in PDLAGSY * * SIZETMS LWORK for PDLATMS * * SIZEQTQ LWORK for PDSEPQTQ * * SIZECHK LWORK for PDSEPCHK * * SIZESYEVR LWORK for PDSYEVR * * ISIZESYEVR LIWORK for PDSYEVR * * SIZESUBTST LWORK for PDSEPRSUBTST * * ISIZESUBTST LIWORK for PDSEPRSUBTST * * SIZETST LWORK for PDSEPRTST * * ISIZETST LIWORK for PDSEPRTST * * * .. Parameters .. INTEGER CTXT_, M_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( $ CTXT_ = 2, M_ = 3, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, LCM, $ LCMQ, LDA, MQ0, MYCOL, MYROW, N, NB, NEIG, NN, $ NNP, NP, NP0, NPCOL, NPROW, NQ, RSRC_A * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) + 1 NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) NNP = MAX( N, NPROW*NPCOL+1, 4 ) * * SIZESYEVR = 1 + 5*N + MAX( 18*NN, NP0*MQ0+2*NB*NB ) + $ (2 + ICEIL( NEIG, NPROW*NPCOL ))*NN SIZESYEVR = MAX(3, SIZESYEVR) * ISIZESYEVR = 12*NNP + 2*N * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZESYEVR ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZESYEVR + IPREPAD + IPOSTPAD * * Allow room for A, COPYA and Z and DIAG, WIN, WNEW, GAP, WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + $ 4*( N+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK * (only needed for PDSYEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * * RETURN END scalapack-2.0.2/TESTING/EIG/pdlasizesqp.f000644 000766 000024 00000013247 10363532303 020145 0ustar00juliestaff000000 000000 SUBROUTINE PDLASIZESQP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, $ SIZESYEV, SIZESYEVD, ISIZESYEVD, $ SIZESUBTST, ISIZESUBTST, $ SIZETST, ISIZETST ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * February 23, 2000 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVD, $ ISIZESYEVX, ISIZETST, SIZECHK, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, SIZESUBTST, $ SIZESYEV, SIZESYEVD, SIZESYEVX, SIZETMS, $ SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PDLASIZESQP computes the amount of memory needed by * various SEP test routines, as well as PDYEVX and PDSYEV * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PDSYEVX or PDSYEV * * SIZEMQRLEFT LWORK for the 1st PDORMQR call in PDLAGSY * * SIZEMQRRIGHT LWORK for the 2nd PDORMQR call in PDLAGSY * * SIZEQRF LWORK for PDGEQRF in PDLAGSY * * SIZETMS LWORK for PDLATMS * * SIZEQTQ LWORK for PDSEPQTQ (nexer complex) * * SIZECHK LWORK for PDSEPCHK * * SIZESYEVX LWORK for PDSYEVX * * ISIZESYEVX LIWORK for PDSYEVX * * SIZESYEV LWORK for PDSYEV * * SIZESYEVD LWORK for PSSYEVD * * ISIZESYEVD LIWORK for PSSYEVD * * SIZESUBTST LWORK for PDSUBTST * * ISIZESUBTST LIWORK for PDSUBTST * * SIZETST LWORK for PDTST * * ISIZETST LIWORK for PDTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTEXTC, CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, $ LCM, LCMQ, LDA, LDC, MQ0, MYCOL, MYPCOLC, $ MYPROWC, MYROW, N, NB, NEIG, NN, NNP, NP, $ NPCOLC, NPROWC, NP0, NPCOL, NPROW, NQ, RSRC_A * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC, SL_GRIDRESHAPE EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC, SL_GRIDRESHAPE * .. * .. Executable Statements .. * * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_GRIDEXIT * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZESYEVX = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZESYEVX = 6*NNP * * Allow room for the new context created in PDSYEV * CONTEXTC = SL_GRIDRESHAPE( DESCA( CTXT_ ), 0, 1, 1, $ NPROW*NPCOL, 1 ) CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, $ MYPCOLC ) LDC = MAX( 1, NUMROC( N, NB, MYPROWC, 0, NPROW*NPCOL ) ) SIZESYEV = 5*N + MAX( 2*NP0 + MQ0 + NB*NN , 2*NN-2 ) + N*LDC CALL BLACS_GRIDEXIT( CONTEXTC ) * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) NN = MAX( N, NB, 2 ) NNP = 3*N + MAX( NB*( NP+1 ), 3*NB ) SIZESYEVD = MAX( NNP, 1+6*N+2*NP*NQ ) + 2*N ISIZESYEVD = 2+7*N+8*NPCOL * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZESYEVX, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZESYEV ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZESYEVX + IPREPAD + IPOSTPAD * * * Allow room for A, COPYA and Z and DIAG, WIN, WNEW, GAP, WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + $ 4*( N+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PDSYEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pdlasizesyev.f000644 000766 000024 00000010272 10363532303 020323 0ustar00juliestaff000000 000000 * * SUBROUTINE PDLASIZESYEV( JOBZ, N, DESCA, MINSIZE ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER JOBZ INTEGER MINSIZE, N * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PDLASIZESYEV computes the amount of memory needed by PDSYEV * to calculate: * 1) Eigenvectors and eigenvalues if JOBZ = 'V' * 2) Eigenvalues only if JOBZ = 'N' * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * MINSIZE (global output) INTEGER * Workspace required for PDSYEV to: * 1) Eigenvectors and eigenvalues if JOBZ = 'V' * 2) Eigenvalues only if JOBZ = 'N' * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL WANTZ INTEGER CONTEXTC, CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, $ LCM, LCMQ, LDC, MQ0, MYCOL, MYPCOLC, MYPROWC, $ MYROW, NB, NN, NP, NP0, NPCOL, NPCOLC, NPROCS, $ NPROW, NPROWC, NQ, NRC, QRMEM, RSRC_A, $ SIZEMQRLEFT, SIZEMQRRIGHT * .. * .. External Functions .. * * LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC, SL_GRIDRESHAPE EXTERNAL ILCM, INDXG2P, LSAME, NUMROC, SL_GRIDRESHAPE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_GRIDEXIT * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) NB = DESCA( MB_ ) N = DESCA( M_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( NN, NB, 0, 0, NPCOL ) NPROCS = NPROW*NPCOL WANTZ = LSAME( JOBZ, 'V' ) LDC = 0 * * Create the new context that is used in PDSYEV * IF( WANTZ ) THEN CONTEXTC = SL_GRIDRESHAPE( DESCA( CTXT_ ), 0, 1, 1, NPROCS, 1 ) CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, $ MYPCOLC ) NRC = NUMROC( N, NB, MYPROWC, 0, NPROCS) LDC = MAX( 1, NRC ) CALL BLACS_GRIDEXIT( CONTEXTC ) END IF * * Compute the total amount of space needed * IF( WANTZ ) THEN QRMEM = 5*N + MAX( 2*NP0 +MQ0 + NB*NN, 2*NN-2 ) + N*LDC MINSIZE = MAX ( SIZEMQRLEFT, SIZEMQRRIGHT, QRMEM ) ELSE MINSIZE = 5*N + 2*NP0 +MQ0 + NB*NN END IF * RETURN * * End of PDLASIZESYEV * END scalapack-2.0.2/TESTING/EIG/pdlasizesyevr.f000644 000766 000024 00000014543 11623527140 020515 0ustar00juliestaff000000 000000 SUBROUTINE PDLASIZESYEVR( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER RANGE INTEGER IL, IU, MAXSIZE, N, VALSIZE, VECSIZE DOUBLE PRECISION VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) DOUBLE PRECISION WIN( * ) * .. * * Purpose * ======= * * PDLASIZESYEVR computes the amount of memory needed by PDSYEVR * to ensure: * 1) Orthogonal Eigenvectors * 2) Eigenpairs with small residual norms * * Arguments * ========= * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * VL (global input/output ) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VL is set * to a random value near an entry in WIN * * VU (global input/output ) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VU is set * to a random value near an entry in WIN * * IL (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * If IL < 0, RANGE='I' and WKNOWN = .TRUE., IL is set * to a random value from 1 to N * * IU (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * If IU < 0, RANGE='I' and WKNOWN = .TRUE., IU is set * to a random value from IL to N * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * ISEED is not touched unless IL, IU, VL or VU are modified. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If WKNOWN=1, WIN contains the eigenvalues of the matrix. * * MAXSIZE (global output) INTEGER * Workspace required to guarantee that PDSYEVR will return * orthogonal eigenvectors. IF WKNOWN=0, MAXSIZE is set to a * a value which guarantees orthogonality no matter what the * spectrum is. If WKNOWN=1, MAXSIZE is set to a value which * guarantees orthogonality on a matrix with eigenvalues given * by WIN. * * VECSIZE (global output) INTEGER * Workspace required to guarantee that PDSYEVR * will compute eigenvectors. * * VALSIZE (global output) INTEGER * Workspace required to guarantee that PDSYEVR * will compute eigenvalues. * * * .. Parameters .. INTEGER CTXT_, MB_ PARAMETER ( CTXT_ = 2, MB_ = 5 ) DOUBLE PRECISION TWENTY PARAMETER ( TWENTY = 20.0D0 ) * .. * .. Local Scalars .. * INTEGER ILMIN, IUMAX, $ MQ0, MYCOL, MYIL, MYIU, MYROW, NB, NEIG, NN, $ NP0, NPCOL, NPROW DOUBLE PRECISION ANORM, EPS, SAFMIN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL LSAME, ICEIL, NUMROC, DLARAN, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe Minimum' ) NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) VALSIZE = 3 + 5*N + MAX( 12*NN, NB*( NP0+1 ) ) IF( WKNOWN ) THEN ANORM = SAFMIN / EPS IF( N.GE.1 ) $ ANORM = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), ANORM ) IF( LSAME( RANGE, 'I' ) ) THEN IF( IL.LT.0 ) $ IL = INT( DLARAN( ISEED )*DBLE( N ) ) + 1 IF( IU.LT.0 ) $ IU = INT( DLARAN( ISEED )*DBLE( N-IL ) ) + IL IF( N.EQ.0 ) $ IU = 0 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IF( VL.GT.VU ) THEN MYIL = INT( DLARAN( ISEED )*DBLE( N ) ) + 1 MYIU = INT( DLARAN( ISEED )*DBLE( N-MYIL ) ) + MYIL VL = WIN( MYIL ) - TWENTY*EPS*ABS( WIN( MYIL ) ) VU = WIN( MYIU ) + TWENTY*EPS*ABS( WIN( MYIU ) ) VU = MAX( VU, VL+EPS*TWENTY*ABS( VL )+SAFMIN ) END IF END IF * END IF IF( LSAME( RANGE, 'V' ) ) THEN * We do not know how many eigenvalues will be computed ILMIN = 1 IUMAX = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN ILMIN = IL IUMAX = IU ELSE IF( LSAME( RANGE, 'A' ) ) THEN ILMIN = 1 IUMAX = N END IF * NEIG = IUMAX - ILMIN + 1 * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * VECSIZE = 3 + 5*N + MAX( 18*NN, NP0*MQ0+2*NB*NB ) + $ (2 + ICEIL( NEIG, NPROW*NPCOL ))*NN VALSIZE = MAX(3, VALSIZE) VECSIZE = MAX(3, VECSIZE) MAXSIZE = VECSIZE * RETURN * * End of PDLASIZESYEVR * END scalapack-2.0.2/TESTING/EIG/pdlasizesyevx.f000644 000766 000024 00000017250 10363532303 020516 0ustar00juliestaff000000 000000 * * SUBROUTINE PDLASIZESYEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER RANGE INTEGER IL, IU, MAXSIZE, N, VALSIZE, VECSIZE DOUBLE PRECISION VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) DOUBLE PRECISION WIN( * ) * .. * * Purpose * ======= * * PDLASIZESYEVX computes the amount of memory needed by PDSYEVX * to ensure: * 1) Orthogonal Eigenvectors * 2) Eigenvectors * 3) Eigenvalues * * Arguments * ========= * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * VL (global input/output ) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VL is set * to a random value near an entry in WIN * * VU (global input/output ) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VU is set * to a random value near an entry in WIN * * IL (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * If IL < 0, RANGE='I' and WKNOWN = .TRUE., IL is set * to a random value from 1 to N * * IU (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * If IU < 0, RANGE='I' and WKNOWN = .TRUE., IU is set * to a random value from IL to N * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * ISEED is not touched unless IL, IU, VL or VU are modified. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If WKNOWN=1, WIN contains the eigenvalues of the matrix. * * MAXSIZE (global output) INTEGER * Workspace required to guarantee that PDSYEVX will return * orthogonal eigenvectors. IF WKNOWN=0, MAXSIZE is set to a * a value which guarantees orthogonality no matter what the * spectrum is. If WKNOWN=1, MAXSIZE is set to a value which * guarantees orthogonality on a matrix with eigenvalues given * by WIN. * * VECSIZE (global output) INTEGER * Workspace required to guarantee that PDSYEVX * will compute eigenvectors. * * VALSIZE (global output) INTEGER * Workspace required to guarantee that PDSYEVX * will compute eigenvalues. * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION TWENTY PARAMETER ( TWENTY = 20.0D0 ) * .. * .. Local Scalars .. * INTEGER CLUSTERSIZE, I, ILMIN, IUMAX, MAXCLUSTERSIZE, $ MQ0, MYCOL, MYIL, MYIU, MYROW, NB, NEIG, NN, $ NP0, NPCOL, NPROW DOUBLE PRECISION ANORM, EPS, ORFAC, SAFMIN, VLMIN, VUMAX * .. * .. External Functions .. * * LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL LSAME, ICEIL, NUMROC, DLARAN, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * ORFAC = 1.0D-3 * * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe Minimum' ) NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * VALSIZE = 5*N + MAX( 5*NN, NB*( NP0+1 ) ) * IF( WKNOWN ) THEN ANORM = SAFMIN / EPS IF( N.GE.1 ) $ ANORM = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), ANORM ) * IF( LSAME( RANGE, 'I' ) ) THEN IF( IL.LT.0 ) $ IL = INT( DLARAN( ISEED )*DBLE( N ) ) + 1 IF( IU.LT.0 ) $ IU = INT( DLARAN( ISEED )*DBLE( N-IL ) ) + IL IF( N.EQ.0 ) $ IU = 0 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IF( VL.GT.VU ) THEN MYIL = INT( DLARAN( ISEED )*DBLE( N ) ) + 1 MYIU = INT( DLARAN( ISEED )*DBLE( N-MYIL ) ) + MYIL VL = WIN( MYIL ) + TWENTY*EPS*ABS( WIN( MYIL ) ) VU = WIN( MYIU ) + TWENTY*EPS*ABS( WIN( MYIU ) ) VU = MAX( VU, VL+EPS*TWENTY*ABS( VL )+SAFMIN ) END IF END IF * END IF IF( LSAME( RANGE, 'V' ) ) THEN * * Compute ILMIN, IUMAX (based on VL, VU and WIN) * IF( WKNOWN ) THEN VLMIN = VL - TWENTY*EPS*ANORM VUMAX = VU + TWENTY*EPS*ANORM ILMIN = 1 IUMAX = 0 DO 10 I = 1, N IF( WIN( I ).LT.VLMIN ) $ ILMIN = ILMIN + 1 IF( WIN( I ).LT.VUMAX ) $ IUMAX = IUMAX + 1 10 CONTINUE ELSE ILMIN = 1 IUMAX = N END IF ELSE IF( LSAME( RANGE, 'I' ) ) THEN ILMIN = IL IUMAX = IU ELSE IF( LSAME( RANGE, 'A' ) ) THEN ILMIN = 1 IUMAX = N END IF * NEIG = IUMAX - ILMIN + 1 * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) VECSIZE = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN * IF( WKNOWN ) THEN CLUSTERSIZE = 1 MAXCLUSTERSIZE = 1 DO 20 I = ILMIN + 1, IUMAX IF( ( WIN( I )-WIN( I-1 ) ).LT.ORFAC*2*ANORM ) THEN CLUSTERSIZE = CLUSTERSIZE + 1 IF( CLUSTERSIZE.GT.MAXCLUSTERSIZE ) $ MAXCLUSTERSIZE = CLUSTERSIZE ELSE CLUSTERSIZE = 1 END IF 20 CONTINUE IF( CLUSTERSIZE.GT.MAXCLUSTERSIZE ) $ MAXCLUSTERSIZE = CLUSTERSIZE ELSE MAXCLUSTERSIZE = N END IF * MAXSIZE = VECSIZE + MAX( ( MAXCLUSTERSIZE-1 ), 0 )*N * * RETURN * * End of PDLASIZESYEVX * END scalapack-2.0.2/TESTING/EIG/pdlatms.f000644 000766 000024 00000032510 10363532303 017244 0ustar00juliestaff000000 000000 * * SUBROUTINE PDLATMS( M, N, DIST, ISEED, SYM, D, MODE, COND, DMAX, $ KL, KU, PACK, A, IA, JA, DESCA, ORDER, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIST, PACK, SYM INTEGER IA, INFO, JA, KL, KU, LWORK, M, MODE, N, ORDER DOUBLE PRECISION COND, DMAX * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) DOUBLE PRECISION A( * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * PDLATMS generates random symmetric matrices with specified * eigenvalues for testing SCALAPACK programs. * * PDLATMS operates by applying the following sequence of * operations: * * Set the diagonal to D, where D may be input or * computed according to MODE, COND, DMAX, and SYM * as described below. * * Generate a dense M x N matrix by multiplying D on the left * and the right by random unitary matrices, then: * * Reduce the bandwidth according to KL and KU, using * Householder transformations. * ### bandwidth reduction NOT SUPPORTED ### * * Arguments * ========= * * M - (global input) INTEGER * The number of rows of A. Not modified. * * N - (global input) INTEGER * The number of columns of A. Not modified. * ### M .ne. N unsupported * * DIST - (global input) CHARACTER*1 * On entry, DIST specifies the type of distribution to be used * to generate the random eigen-/singular values. * 'U' => UNIFORM( 0, 1 ) ( 'U' for uniform ) * 'S' => UNIFORM( -1, 1 ) ( 'S' for symmetric ) * 'N' => NORMAL( 0, 1 ) ( 'N' for normal ) * Not modified. * * ISEED - (global input) INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. They should lie between 0 and 4095 inclusive, * and ISEED(4) should be odd. The random number generator * uses a linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to DLATMS * to continue the same random number sequence. * Changed on exit. * * SYM - (global input) CHARACTER*1 * If SYM='S' or 'H', the generated matrix is symmetric, with * eigenvalues specified by D, COND, MODE, and DMAX; they * may be positive, negative, or zero. * If SYM='P', the generated matrix is symmetric, with * eigenvalues (= singular values) specified by D, COND, * MODE, and DMAX; they will not be negative. * If SYM='N', the generated matrix is nonsymmetric, with * singular values specified by D, COND, MODE, and DMAX; * they will not be negative. * ### SYM = 'N' NOT SUPPORTED ### * Not modified. * * D - (local input/output) DOUBLE PRECISION array, * dimension ( MIN( M , N ) ) * This array is used to specify the singular values or * eigenvalues of A (see SYM, above.) If MODE=0, then D is * assumed to contain the singular/eigenvalues, otherwise * they will be computed according to MODE, COND, and DMAX, * and placed in D. * Modified if MODE is nonzero. * * MODE - (global input) INTEGER * On entry this describes how the singular/eigenvalues are to * be specified: * MODE = 0 means use D as input * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * If SYM='S' or 'H', and MODE is neither 0, 6, nor -6, then * the elements of D will also be multiplied by a random * sign (i.e., +1 or -1.) * Not modified. * * COND - (global input) DOUBLE PRECISION * On entry, this is used as described under MODE above. * If used, it must be >= 1. Not modified. * * DMAX - (global input) DOUBLE PRECISION * If MODE is neither -6, 0 nor 6, the contents of D, as * computed according to MODE and COND, will be scaled by * DMAX / max(abs(D(i))); thus, the maximum absolute eigen- or * singular value (which is to say the norm) will be abs(DMAX). * Note that DMAX need not be positive: if DMAX is negative * (or zero), D will be scaled by a negative number (or zero). * Not modified. * * KL - (global input) INTEGER * This specifies the lower bandwidth of the matrix. For * example, KL=0 implies upper triangular, KL=1 implies upper * Hessenberg, and KL being at least M-1 means that the matrix * has full lower bandwidth. KL must equal KU if the matrix * is symmetric. * Not modified. * ### 1 <= KL < N-1 is NOT SUPPORTED ### * * KU - (global input) INTEGER * This specifies the upper bandwidth of the matrix. For * example, KU=0 implies lower triangular, KU=1 implies lower * Hessenberg, and KU being at least N-1 means that the matrix * has full upper bandwidth. KL must equal KU if the matrix * is symmetric. * Not modified. * ### 1 <= KU < N-1 is NOT SUPPORTED ### * * PACK - (global input) CHARACTER*1 * This specifies packing of matrix as follows: * 'N' => no packing * ### PACK must be 'N' all other options NOT SUPPORTED ### * * A - (local output) DOUBLE PRECISION array * Global dimension (M, N), local dimension (MP, NQ) * On exit A is the desired test matrix. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ORDER - (input) INTEGER * The number of reflectors used to define the orthogonal * matrix Q. A = Q * D * Q' * Higher ORDER requires more computation and communication. * * WORK - (local input/output) DOUBLE PRECISION array, * dimension (LWORK) * * LWORK - (local input) INTEGER dimension of WORK * LWORK >= SIZETMS as returned by PDLASIZESEP * * INFO - (global output) INTEGER * Error code. On exit, INFO will be set to one of the * following values: * 0 => normal return * -1 => M negative or unequal to N and SYM='S', 'H', or 'P' * -2 => N negative * -3 => DIST illegal string * -5 => SYM illegal string * -7 => MODE not in range -6 to 6 * -8 => COND less than 1.0, and MODE neither -6, 0 nor 6 * -10 => KL negative * -11 => KU negative, or SYM='S' or 'H' and KU not equal to KL * -16 => DESCA is inconsistent * -17 => ORDER not in the range 0 to N inclusive * 1 => Error return from DLATM1 * 2 => Cannot scale to DMAX (max. sing. value is 0) * 3 => Error return from PDLAGSY * *----------------------------------------------------------------------- * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IDIST, IINFO, IPACK, IRSIGN, ISYM, LLB, $ MNMIN, MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION ALPHA, TEMP * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DLASET, DLATM1, DSCAL, $ PCHK1MAT, PDLAGSY, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * 1) Decode and Test the input parameters. * Initialize flags & seed. * * INFO = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IF( ( MYROW.GE.NPROW .OR. MYROW.LT.0 ) .OR. $ ( MYCOL.GE.NPCOL .OR. MYCOL.LT.0 ) )RETURN * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Decode DIST * IF( LSAME( DIST, 'U' ) ) THEN IDIST = 1 ELSE IF( LSAME( DIST, 'S' ) ) THEN IDIST = 2 ELSE IF( LSAME( DIST, 'N' ) ) THEN IDIST = 3 ELSE IDIST = -1 END IF * * Decode SYM * IF( LSAME( SYM, 'N' ) ) THEN ISYM = 1 IRSIGN = 0 ELSE IF( LSAME( SYM, 'P' ) ) THEN ISYM = 2 IRSIGN = 0 ELSE IF( LSAME( SYM, 'S' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE IF( LSAME( SYM, 'H' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE ISYM = -1 END IF * * Decode PACK * IF( LSAME( PACK, 'N' ) ) THEN IPACK = 0 ELSE IPACK = 1 END IF * * Set certain internal parameters * MNMIN = MIN( M, N ) LLB = MIN( KL, M-1 ) * IF( ORDER.EQ.0 ) $ ORDER = N * * Set INFO if an error * IF( NPROW.EQ.-1 ) THEN INFO = -( 1600+CTXT_ ) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 16, INFO ) IF( INFO.EQ.0 ) THEN IF( M.NE.N .AND. ISYM.NE.1 ) THEN INFO = -2 ELSE IF( IDIST.EQ.-1 ) THEN INFO = -3 ELSE IF( ISYM.EQ.-1 ) THEN INFO = -5 ELSE IF( ABS( MODE ).GT.6 ) THEN INFO = -7 ELSE IF( ( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) .AND. COND.LT. $ ONE ) THEN INFO = -8 ELSE IF( KL.LT.0 ) THEN INFO = -10 ELSE IF( KU.LT.0 .OR. ( ISYM.NE.1 .AND. KL.NE.KU ) ) THEN INFO = -11 ELSE IF( ( ORDER.LT.0 ) .OR. ( ORDER.GT.N ) ) THEN INFO = -17 END IF END IF CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 16, 0, IDUM1, IDUM2, $ INFO ) END IF * * Check for unsupported features * IF( ISYM.NE.2 ) THEN INFO = -5 ELSE IF( IPACK.NE.0 ) THEN INFO = -12 ELSE IF( KL.GT.0 .AND. KL.LT.M-1 ) THEN INFO = -10 ELSE IF( KU.GT.0 .AND. KU.LT.N-1 ) THEN INFO = -11 ELSE IF( LLB.NE.0 .AND. LLB.NE.M-1 ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PDLATMS', -INFO ) RETURN END IF * * Initialize random number generator * DO 10 I = 1, 4 ISEED( I ) = MOD( ABS( ISEED( I ) ), 4096 ) 10 CONTINUE * IF( MOD( ISEED( 4 ), 2 ).NE.1 ) $ ISEED( 4 ) = ISEED( 4 ) + 1 * * 2) Set up D if indicated. * * Compute D according to COND and MODE * CALL DLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, MNMIN, IINFO ) * IF( IINFO.NE.0 ) THEN INFO = 1 RETURN END IF * * IF( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) THEN * * Scale by DMAX * TEMP = ABS( D( 1 ) ) DO 20 I = 2, MNMIN TEMP = MAX( TEMP, ABS( D( I ) ) ) 20 CONTINUE * IF( TEMP.GT.ZERO ) THEN ALPHA = DMAX / TEMP ELSE INFO = 2 RETURN END IF * CALL DSCAL( MNMIN, ALPHA, D, 1 ) * END IF * CALL DLASET( 'A', NP, NQ, ZERO, ZERO, A, DESCA( LLD_ ) ) * * symmetric -- A = U D U' * CALL PDLAGSY( M, LLB, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, IINFO ) * RETURN * * End of PDLATMS * END scalapack-2.0.2/TESTING/EIG/pdlatran.f000644 000766 000024 00000015617 10363532303 017416 0ustar00juliestaff000000 000000 SUBROUTINE PDLATRAN( N, NB, A, IA, JA, DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IA, JA, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * * ======= * * PDLATRAN transpose a lower triangular matrix on to the upper * triangular portion of the same matrix. * * This is an auxiliary routine called by PDSYTRD. * * Notes * ===== * * IA must equal 1 * JA must equal 1 * DESCA( MB_ ) must equal 1 * DESCA( NB_ ) must equal 1 * DESCA( RSRC_ ) must equal 1 * DESCA( CSRC_ ) must equal 1 * * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix to be transposed. * * NB (global input) INTEGER * The number of rows and columns to be transposed with each * message sent. NB has no impact on the result, it is striclty * a performance tuning parameter. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). On entry, the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix. On exit, the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is undefined (and may have been modified). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * Must be equal to 1. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * Must be equal to 1. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * DESCA( MB_ ) must equal 1 * DESCA( NB_ ) must equal 1 * DESCA( ICTXT_ ) must point to a square process grid * i.e. one where NPROW is equal to NPCOL * * WORK (local workspace) COMPLEX*16 array, dimension ( LWORK ) * * Where: * LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, IRECV, ISEND, J, JJ, JRECV, JSEND, $ LDA, MAXIRECV, MAXISEND, MAXJRECV, MAXJSEND, $ MINIRECV, MINISEND, MINJRECV, MINJSEND, MYCOL, $ MYROW, NP, NPCOL, NPROW, NQ, RECVNB, SENDNB, $ STARTCOL, STARTROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DTRRV2D, DTRSD2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Further details * * Because the processor grid is square each process needs only send * data to its transpose process. (Likewsie it need only receive * data from its transpose process.) Because the data decomposition * is cyclic, the local portion of the array is triangular. * * This routine requires that the data be buffered (i.e. copied) * on the sending process (because of the triangular shape) and * unbuffered on the receiving process. Hence, two local memory to * memory copies are performed within the communications routines * followed by a memory to memory copy outside of the communications * routines. It would be nice to avoid having back to back memory * to memory copies (as we do presently on the receiving processor). * This could be done by packaging the data ourselves in the sender * and then unpacking it directly into the matrix. However, this * code seems cleaner and so since this routine is not a significant * performance bottleneck we have left it this way. * * * * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * NP = NUMROC( N, 1, MYROW, 0, NPROW ) NQ = NUMROC( N, 1, MYCOL, 0, NPCOL ) * * IF( MYROW.EQ.MYCOL ) THEN * DO 20 J = 1, NP DO 10 I = J + 1, NQ A( J+( I-1 )*LDA ) = A( I+( J-1 )*LDA ) 10 CONTINUE 20 CONTINUE * ELSE IF( MYROW.GT.MYCOL ) THEN STARTROW = 1 STARTCOL = 2 ELSE IF( MYROW.EQ.MYCOL ) THEN STARTROW = 2 STARTCOL = 2 ELSE STARTROW = 2 STARTCOL = 1 END IF END IF * DO 50 JJ = 1, MAX( NP, NQ ), NB MINJSEND = STARTCOL + JJ - 1 MINJRECV = STARTROW + JJ - 1 MAXJSEND = MIN( MINJSEND+NB-1, NQ ) MAXJRECV = MIN( MINJRECV+NB-1, NP ) * SENDNB = MAXJSEND - MINJSEND + 1 RECVNB = MAXJRECV - MINJRECV + 1 * MINISEND = 1 MINIRECV = 1 MAXISEND = MIN( NP, JJ+SENDNB-1 ) MAXIRECV = MIN( NQ, JJ+RECVNB-1 ) * ISEND = MAXISEND - MINISEND + 1 IRECV = MAXIRECV - MINIRECV + 1 JSEND = MAXJSEND - MINJSEND + 1 JRECV = MAXJRECV - MINJRECV + 1 * * * DO 40 J = MINJRECV, MAXJRECV DO 30 I = MINIRECV, MAXIRECV + J - MAXJRECV WORK( I+( J-MINJRECV )*IRECV ) = A( J+( I-1 )*LDA ) 30 CONTINUE 40 CONTINUE * IF( IRECV.GT.0 .AND. JRECV.GT.0 ) $ CALL DTRSD2D( ICTXT, 'U', 'N', IRECV, JRECV, WORK, IRECV, $ MYCOL, MYROW ) * IF( ISEND.GT.0 .AND. JSEND.GT.0 ) $ CALL DTRRV2D( ICTXT, 'U', 'N', ISEND, JSEND, $ A( MINISEND+( MINJSEND-1 )*LDA ), LDA, $ MYCOL, MYROW ) * * 50 CONTINUE * END IF * RETURN * * End of PDLATRD * END scalapack-2.0.2/TESTING/EIG/pdmatgen.f000644 000766 000024 00000043124 10363532303 017402 0ustar00juliestaff000000 000000 SUBROUTINE PDMATGEN( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PDMATGEN : Parallel Real Double precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * otherwise a random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) DOUBLE PRECISION, pointer into the local * memory to an array of dimension ( LDA, * ) containing the * local pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) DOUBLE PRECISION ONE, TWO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0 ) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDRAND EXTERNAL ICEIL, NUMROC, LSAME, PDRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) * INFO = 0 IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDMATGEN', INFO ) RETURN END IF * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = ONE - TWO*PDRAND(0) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = ONE - TWO*PDRAND(0) 30 CONTINUE A(IK,JK) = ONE - TWO*PDRAND(0) DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = ONE - TWO*PDRAND(0) A(IK,JK+J) = A(IK+J,JK) 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 A(IK,JK) = ONE - TWO*PDRAND(0) JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 A(IK,JK) = ONE - TWO*PDRAND(0) JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSE * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = ONE - TWO*PDRAND(0) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 A(IK,JK+J) = ABS(A(IK,JK+J)) + MAXMN IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PDMATGEN * END scalapack-2.0.2/TESTING/EIG/pdmatgen2.f000644 000766 000024 00000055541 11654025546 017504 0ustar00juliestaff000000 000000 SUBROUTINE PDMATGEN2( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * * Modified version by K. L. Dackland (U added) * Modified version by Peter Poromaa, Heavy DIAG * Modified version by Robert Granat, R(andom) added * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PDMATGEN2 : Parallel Real Double precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'U' : A returned is an Upper triangular matrix. * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * if AFORM = 'R' A random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) DOUBLE PRECISION, pointer into the local * memory to an array of dimension ( LDA, * ) containing the * local pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN, UPPR, RANDOM INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDRAND EXTERNAL ICEIL, NUMROC, LSAME, PDRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) UPPR = LSAME( AFORM, 'U' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) RANDOM = LSAME( AFORM, 'R' ) * INFO = 0 IF( .NOT.( UPPR.OR.SYMM.OR.HERM.OR.TRAN.OR.RANDOM ) .AND. $ .NOT.LSAME( AFORM, 'C' ) .AND. $ .NOT.LSAME( AFORM, 'N' ) ) THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( UPPR.OR.SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDMATGEN2', INFO ) RETURN END IF MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = ONE - TWO*PDRAND(0) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = ONE - TWO*PDRAND(0) 30 CONTINUE A(IK,JK) = ONE - TWO*PDRAND(0) DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = ONE - TWO*PDRAND(0) A(IK,JK+J) = A(IK+J,JK) 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 A(IK,JK) = ONE - TWO*PDRAND(0) JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * Generate an upper triangular matrix. * ELSE IF ( UPPR ) THEN JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 1000 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 1000 CONTINUE * JK = 1 DO 8000 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 7000 I = 1, NB IF( JK .GT. ICNUM ) GO TO 8000 * IK = 1 DO 5000 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 6000 DO 3000 J = 1, I-1 A(IK,JK) = ONE - TWO*PDRAND(0) 3000 CONTINUE A(IK,JK) = ONE - TWO*PDRAND(0) DO 4000 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 6000 A(IK,JK+J) = ONE - TWO*PDRAND(0) 4000 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 5000 CONTINUE * 6000 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 7000 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 8000 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 1110 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 1110 CONTINUE * IK = 1 DO 1500 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 1400 J = 1, MB IF( IK .GT. IRNUM ) GO TO 1600 JK = 1 DO 1200 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 1100 I = 1, NB IF( JK .GT. ICNUM ) GO TO 1300 A(IK,JK) = ONE - TWO*PDRAND(0) JK = JK + 1 1100 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 1200 CONTINUE * 1300 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 1400 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 1500 CONTINUE 1600 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 A(IK,JK) = ONE - TWO*PDRAND(0) JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSEIF( RANDOM ) THEN * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = ONE - TWO*PDRAND(0) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 A(IK,JK+J) = ABS(A(IK,JK+J)) + MAXMN IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PDMATGEN2 * END scalapack-2.0.2/TESTING/EIG/pdnepdriver.f000644 000766 000024 00000050507 10363532303 020130 0ustar00juliestaff000000 000000 PROGRAM PDNEPDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PDNEPDRIVER is the main test program for the DOUBLE PRECISION * SCALAPACK NEP routines. This test driver performs a Schur * decomposition followed by residual check of a Hessenberg matrix. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'SCALAPACK, Version 1.4, NEP (Nonsymmetric EigenProblem) input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'NEP.out' output file name (if any) * 6 device out * 8 number of problems sizes * 1 2 3 4 6 10 100 200 vales of N * 3 number of NB's * 6 20 40 values of NB * 4 number of process grids (ordered pairs of P & Q) * 1 2 1 4 values of P * 1 2 4 1 values of Q * 20.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * DBLESZ INTEGER, default = 8 bytes. * DBLESZ indicate the length in bytes on the given platform * for a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, TOTMEM, MEMSIZ, NTESTS DOUBLE PRECISION PADVAL, ZERO, ONE PARAMETER ( DBLESZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0, ZERO = 0.0D+0, $ ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, III, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPREPAD, IPW, IPWI, IPWR, IPZ, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LDA, LDZ, LWORK, $ MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, NOUT, $ NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, QRESID, TMFLOPS, ZNORM * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCZ( DLEN_ ), IERR( 2 ), $ IDUM( 1 ), NBVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), MEM( MEMSIZ ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDCHEKPAD, $ PDFILLPAD, PDGEMM, PDLAHQR, PDLASET, PDMATGEN, $ PDNEPFCHK, PDNEPINFO, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE, PDLANHS EXTERNAL ILCM, NUMROC, PDLAMCH, PDLANGE, PDLANHS * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PDNEPINFO( OUTFILE, NOUT, NMAT, NVAL, NTESTS, NNB, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.6 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) IPREPAD = IPREPAD + 1000 IMIDPAD = IMIDPAD + 1000 IPOSTPAD = IPOSTPAD + 1000 ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Initialize the array descriptor for the matrix Z * CALL DESCINIT( DESCZ, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 2 ) ) * LDA = DESCA( LLD_ ) LDZ = DESCZ( LLD_ ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPZ = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWR = IPZ + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWI = IPWR + N + IPOSTPAD + IPREPAD IPW = IPWI + N + IPOSTPAD + IPREPAD III = N / NB IF( III*NB.LT.N ) $ III = III + 1 III = 7*III / ILCM( NPROW, NPCOL ) * * LWORK = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, III ) LWORK = LWORK + MAX(2*N, (8*ILCM(NPROW,NPCOL)+2)**2 ) * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PDNEPFCHK and PDLANHS * WORKSIZ = LWORK + MAX( NP*DESCA( NB_ ), $ DESCA( MB_ )*NQ ) + IPOSTPAD * ELSE * WORKSIZ = LWORK + IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Schur reduction', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate matrix Z = In * CALL PDLASET( 'All', N, N, ZERO, ONE, MEM( IPZ ), 1, 1, $ DESCZ ) * * Generate matrix A upper Hessenberg * CALL PDMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) CALL PDLASET( 'Lower', MAX( 0, N-2 ), MAX( 0, N-2 ), $ ZERO, ZERO, MEM( IPA ), MIN( N, 3 ), 1, $ DESCA ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPZ-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANHS( 'I', N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANHS', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANHS', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PDFILLPAD( ICTXT, N, 1, MEM( IPWR-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, N, 1, MEM( IPWI-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, PADVAL ) * END IF * CALL SLBOOT( ) CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform NEP factorization * CALL PDLAHQR( .TRUE., .TRUE., N, 1, N, MEM( IPA ), DESCA, $ MEM( IPWR ), MEM( IPWI ), 1, N, MEM( IPZ ), $ DESCZ, MEM( IPW ), LWORK, IDUM, 0, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * )'PDLAHQR INFO=', INFO KFAIL = KFAIL + 1 GO TO 10 END IF * IF( CHECK ) THEN * * Check for memory overwrite in NEP factorization * CALL PDCHEKPAD( ICTXT, 'PDLAHQR (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLAHQR (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLAHQR (WR)', N, 1, $ MEM( IPWR-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLAHQR (WI)', N, 1, $ MEM( IPWI-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLAHQR (WORK)', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || Z * H * Z**T - H0 || / ( N*|| H0 ||*EPS ) * CALL PDNEPFCHK( N, MEM( IPA ), 1, 1, DESCA, IASEED, $ MEM( IPZ ), 1, 1, DESCZ, ANORM, $ FRESID, MEM( IPW ) ) * CALL PDCHEKPAD( ICTXT, 'PDNEPFCHK (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDNEPFCHK (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDNEPFCHK (WORK)', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || (Z**T)*Z - In ||_1 * CALL PDLASET( 'All', N, N, ZERO, ONE, MEM( IPA ), 1, $ 1, DESCA ) CALL PDGEMM( 'Transpose', 'No transpose', N, N, N, $ -ONE, MEM( IPZ ), 1, 1, DESCZ, $ MEM( IPZ ), 1, 1, DESCZ, ONE, MEM( IPA ), $ 1, 1, DESCA ) ZNORM = PDLANGE( '1', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) QRESID = ZNORM / ( DBLE( N )*PDLAMCH( ICTXT, 'P' ) ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( ( FRESID-FRESID ).EQ.0.0D+0 ) .AND. $ ( QRESID.LE.THRESH ) .AND. $ ( ( QRESID-QRESID ).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 )FRESID WRITE( NOUT, FMT = 9985 )QRESID END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID QRESID = QRESID - QRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 18 N^3 flops for PxLAHQR * NOPS = 18.0D+0*DBLE( N )**3 * * Calculate total megaflops -- factorization only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q NEP Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- --- ---- ---- -------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, F8.2, $ 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||H - Q*S*Q^T|| / (||H|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Q^T*Q - I|| / ( N * eps ) ', G25.7 ) * STOP * * End of PDNEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pdnepfchk.f000644 000766 000024 00000026050 10363532303 017544 0ustar00juliestaff000000 000000 SUBROUTINE PDNEPFCHK( N, A, IA, JA, DESCA, IASEED, Z, IZ, JZ, $ DESCZ, ANORM, FRESID, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IASEED, IZ, JA, JZ, N DOUBLE PRECISION ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) DOUBLE PRECISION A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDNEPFCHK computes the residual * || sub(Z)*sub( A )*sub(Z)**T - sub( Ao ) || / (||sub( Ao )||*eps*N), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A(IA:IA+M-1,JA:JA+N-1), sub( Z ) = Z(IZ:IZ+N-1,JZ:JZ+N-1) * and ||.|| stands for the infinity norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The order of sub( A ) and sub( Z ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub(Z)*sub( A )*sub(Z)**T - sub( Ao ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * Z (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_Z,LOCc(JZ+N-1)). On entry, this * array contains the local pieces of the N-by-N distributed * matrix sub( Z ). * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * ANORM (global input) DOUBLE PRECISION * The Infinity norm of sub( A ). * * FRESID (global output) DOUBLE PRECISION * The maximum (worst) factorizational error. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK). * LWORK >= MAX( NpA0 * NB_A, MB_A * NqA0 ) where * * IROFFA = MOD( IA-1, MB_A ), * ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * WORK is used to store a block of rows and a block of columns * of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, IB, ICTXT, IIA, IOFFA, IROFF, $ IW, J, JB, JJA, JN, LDA, LDW, MYCOL, MYROW, NP, $ NPCOL, NPROW DOUBLE PRECISION EPS * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, DMATADD, INFOG2L, $ PDGEMM, PDLACPY, PDLASET, PDMATGEN * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL ICEIL, NUMROC, PDLAMCH, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( ICTXT, 'eps' ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF LDW = MAX( 1, NP ) * * First compute H <- H * Z**T * CALL DESCSET( DESCW, DESCA( MB_ ), N, DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * DO 10 I = IA, IA + N - 1, DESCA( MB_ ) IB = MIN( IA+N-I, DESCA( MB_ ) ) * CALL PDLACPY( 'All', IB, N, A, I, JA, DESCA, WORK, 1, 1, $ DESCW ) CALL PDGEMM( 'No transpose', 'Transpose', IB, N, N, ONE, WORK, $ 1, 1, DESCW, Z, IZ, JZ, DESCZ, ZERO, A, I, JA, $ DESCA ) * DESCW( RSRC_ ) = MOD( DESCW( RSRC_ )+1, NPROW ) * 10 CONTINUE * * Then compute H <- Z * H = Z * H0 * Z**T * CALL DESCSET( DESCW, N, DESCA( NB_ ), DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, LDW ) * DO 20 J = JA, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * CALL PDLACPY( 'All', N, JB, A, IA, J, DESCA, WORK, 1, 1, $ DESCW ) CALL PDGEMM( 'No transpose', 'No transpose', N, JB, N, ONE, Z, $ IZ, JZ, DESCZ, WORK, 1, 1, DESCW, ZERO, A, IA, J, $ DESCA ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * 20 CONTINUE * * Compute H - H0 * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA-1 )*LDA IW = 1 JB = JN - JA + 1 DESCW( CSRC_ ) = IACOL * * Handle first block of columns separately * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PDMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ NP, JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL PDLASET( 'Lower', MAX( 0, N-2 ), JB, ZERO, ZERO, WORK, $ MIN( IW+2, N ), 1, DESCW ) CALL DMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF * IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * DO 30 J = JN + 1, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PDMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, NP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL PDLASET( 'Lower', MAX( 0, N-IW-1 ), JB, ZERO, ZERO, $ WORK, MIN( N, IW+2 ), 1, DESCW ) CALL DMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) 30 CONTINUE * * Calculate factor residual * FRESID = PDLANGE( 'I', N, N, A, IA, JA, DESCA, WORK ) / $ ( N*EPS*ANORM ) * RETURN * * End PDNEPFCHK * END scalapack-2.0.2/TESTING/EIG/pdnepinfo.f000644 000766 000024 00000027677 10363532303 017604 0ustar00juliestaff000000 000000 SUBROUTINE PDNEPINFO( SUMMRY, NOUT, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, NGRIDS, $ NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PDNEPINFO gets needed startup information for PDHSEQR drivers * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (the order of the matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH * * WORK (local workspace) INTEGER array of dimension >= * MAX( 3, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ), used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * * Implemented by: G. Henry, May 10, 1996 * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'NEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * )NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 )'N', LDNVAL GO TO 30 END IF READ( NIN, FMT = * )( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * )NNB IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 )'NB', LDNBVAL GO TO 30 END IF READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) * DO 10 I = 1, NNB IF( NBVAL( I ).LT.6 ) THEN WRITE( NOUT, FMT = 9992 )NBVAL( I ) GO TO 30 END IF 10 CONTINUE * * Get number of grids * READ( NIN, FMT = * )NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDPVAL GO TO 30 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDQVAL GO TO 30 END IF * * Get values of P and Q * READ( NIN, FMT = * )( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * )( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * )THRESH * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 20 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 20 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK QSQ^T by Schur Decomposition.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'Tests of the parallel ' // $ 'real double precision Schur decomposition.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Residual = ||H-QSQ^T|| / ' // $ '(||H|| * eps * N )' WRITE( NOUT, FMT = 9999 ) $ ' Orthogonality residual = ||I - Q^TQ|| / ' // '( eps * N )' WRITE( NOUT, FMT = 9999 )'The matrix A is randomly ' // $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'An explanation of the input/output ' $ // 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or ' // $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the ' // 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the' // $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than' // $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'NEP time : Time in seconds to decompose the ' // ' matrix' WRITE( NOUT, FMT = 9999 )'MFLOPS : Rate of execution ' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 )'N ', $ ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 )'NB ', $ ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 )'P ', $ ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 )'Q ', $ ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 )EPS WRITE( NOUT, FMT = 9998 )THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 30 CONTINUE WRITE( NOUT, FMT = 9993 ) CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9992 FORMAT( ' Blocking size too small at ', I2, ' must be >=6.' ) * * End of PDNEPINFO * END scalapack-2.0.2/TESTING/EIG/pdrptseptst.f000644 000766 000024 00000005261 10363532303 020177 0ustar00juliestaff000000 000000 * * PROGRAM PDRPTSEPTST * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Repeat parallel symmetric eigenproblem test * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER MAXN, LWORK, LIWORK PARAMETER ( MAXN = 200, LWORK = 500000, $ LIWORK = 6*MAXN+4 ) * .. * .. Local Scalars .. CHARACTER HETERO, SUBTESTS, UPLO INTEGER CONTEXT, IAM, INFO, IPOSTPAD, IPREPAD, LDA, $ MATTYPE, N, NB, NPCOL, NPROCS, NPROW DOUBLE PRECISION ABSTOL, THRESH * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), ICLUSTR( MAXN ), IFAIL( MAXN ), $ ISEED( 4 ), IWORK( LIWORK ) DOUBLE PRECISION A( MAXN*MAXN ), COPYA( MAXN*MAXN ), $ GAP( MAXN ), WIN( MAXN ), WNEW( MAXN ), $ WORK( LWORK ), Z( MAXN*MAXN ) * .. * * * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDINIT, $ BLACS_PINFO, BLACS_SETUP, DESCINIT, PDSEPTST * .. * .. Executable Statements .. * IPREPAD = 3 IPOSTPAD = 3 LDA = MAXN * * Set HETERO to 'Y' if you want to turn off the PxSYEV tests * HETERO = 'N' * * These lines should be replaced by the output from pxSEPdriver * * ISEED( 1 ) = 2312 ISEED( 2 ) = 3709 ISEED( 3 ) = 666 ISEED( 4 ) = 3371 UPLO = 'U' SUBTESTS = 'Y' N = 33 NPROW = 2 NPCOL = 2 NB = 4 MATTYPE = 9 * note: the printout often makes a mess of ABSTOL ABSTOL = 0.1175494351D-37 THRESH = .350000D+01 * CALL BLACS_PINFO( IAM, NPROCS ) IF( NPROCS.LT.1 ) THEN * NPROCS = NPROW*NPCOL CALL BLACS_SETUP( IAM, NPROCS ) END IF CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, CONTEXT, LDA, INFO ) * CALL PDSEPTST( DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, N, $ ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, WORK, $ LWORK-IPREPAD-IPOSTPAD, IWORK, $ LIWORK-IPREPAD-IPOSTPAD, HETERO, 6, INFO ) * * * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * * CALL BLACS_EXIT( 0 ) STOP * * * * End of PDRPTSEPTST * END scalapack-2.0.2/TESTING/EIG/pdsdpsubtst.f000644 000766 000024 00000040560 10363532303 020163 0ustar00juliestaff000000 000000 SUBROUTINE PDSDPSUBTST( WKNOWN, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, IA, JA, DESCA, WIN, WNEW, $ IPREPAD, IPOSTPAD, WORK, LWORK, LWORK1, $ IWORK, LIWORK, $ RESULT, TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 16, 2000 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER UPLO INTEGER IA, IPOSTPAD, IPREPAD, JA, LWORK, LWORK1, N, $ NOUT, RESULT, LIWORK DOUBLE PRECISION ABSTOL, QTQNRM, THRESH, TSTNRM * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) DOUBLE PRECISION A( * ), COPYA( * ), WIN( * ), WNEW( * ), $ WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDSDPSUBTST calls PDSYEVD and then tests the output of * PDSYEVD * The following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PDSYEVD when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. * * A (local workspace) DOUBLE PRECISION array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PDSYEVD for a description of block cyclic layout. * The test matrix, which is then modified by PDSYEVD * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) DOUBLE PRECISION array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) DOUBLE PRECISION array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PDSEPCHK and PDSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) DOUBLE PRECISION array, dimension (N) * The eigenvalues as computed by this call to PDSYEVD. * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PDSYEVD * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PDSYEVD * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DT_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION FIVE, NEGONE, PADVAL, ZERO PARAMETER ( PADVAL = 13.5285D+0, FIVE = 5.0D+0, $ NEGONE = -1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IAM, INFO, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, J, MINSIZE, MQ, MYCOL, MYROW, $ NP, NPCOL, NPROW, NQ, RESAQ, RESQTQ, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEV, SIZESYEVX, $ SIZETMS, SIZETST, SIZESYEVD, ISIZESYEVD, $ TRILWMIN DOUBLE PRECISION EPS, EPSNORMA, ERROR, MAXERROR, MINERROR, $ NORMWIN, SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PDLANSY EXTERNAL LSAME, NUMROC, PDLAMCH, PDLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, IGAMN2D, IGAMX2D, $ PDCHEKPAD, PDELSET, PDFILLPAD, PDLASIZESQP, $ PDSEPCHK, PDSEPQTQ, PDSYEVD, DGAMN2D, $ DGAMX2D, DLACPY, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DT_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PDLASIZESQP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESYEV, $ SIZESYEVD, ISIZESYEVD, SIZESUBTST, ISIZESUBTST, $ SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PDLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1+IPREPAD ) ), $ ABS( WIN( N+IPREPAD ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3D+0 10 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159D+0 30 CONTINUE * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * Find the amount of workspace needed with or without eigenvectors. * TRILWMIN = 3*N + MAX( DESCA( NB_ )*( NP+1 ), 3*DESCA( NB_ ) ) MINSIZE = MAX( 1 + 6*N + 2*NP*NQ, TRILWMIN ) + 2*N * CALL DLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PDFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0D+0 ) * * Make sure that PDSYEVD does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, N, 1 CALL PDELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0D+0 ) 50 CONTINUE 60 CONTINUE * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PDSYEVD( 'V', UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), LWORK1, IWORK( 1+IPREPAD ), $ LIWORK, INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVD-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PDCHEKPAD( DESCZ( CTXT_ ), 'PDSYEVD-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVD-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVD-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )INFO IF( INFO.EQ.(N+1) ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 END IF ELSE IF( INFO.EQ.14 .AND. LWORK1.GE.MINSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * IF( RESULT.EQ.0 .OR. INFO.GT.N ) THEN * * Make sure that different processes return the same eigenvalues. * This is a more exhaustive check that provided by PDSYEVD. * DO 70 I = 1, N WORK( I ) = WNEW( I+IPREPAD ) WORK( I+N ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', N, 1, WORK, N, 1, $ 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', N, 1, $ WORK( 1+N ), N, 1, 1, -1, -1, 0 ) * DO 80 I = 1, N * IF( ABS( WORK( I )-WORK( N+I ) ).GT.ZERO ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 GO TO 90 END IF 80 CONTINUE 90 CONTINUE END IF * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PDLANSY( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ WORK )*EPS END IF * * Note that a couple key variables get redefined in PDSEPCHK * as described by this table: * * PDSEPTST name PDSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * * * Perform the |AQ - QE| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3D+0 ) * RESAQ = 0 * CALL PDSEPCHK( N, N, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RESAQ ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSEPCHK-WORK', SIZECHK, 1, $ WORK, SIZECHK, IPREPAD, IPOSTPAD, 4.3D+0 ) * IF( RESAQ.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9993 ) END IF * * Perform the |QTQ - I| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1, WORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3D+0 ) * RESQTQ = 0 * * DO 40 I = 1, 2 IWORK( IPREPAD + I ) = 0 40 CONTINUE CALL PDSEPQTQ( N, N, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1 ), IWORK( 1 ), WORK( 1 ), $ WORK( IPREPAD+1 ), SIZEQTQ, QTQNRM, INFO, $ RESQTQ ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSEPQTQ-WORK', SIZEQTQ, 1, $ WORK, SIZEQTQ, IPREPAD, IPOSTPAD, 4.3D+0 ) * IF( RESQTQ.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9992 ) END IF * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF ENDIF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN .AND. N.GT.0 ) THEN * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN MAXERROR = 0 * DO 140 I = 1, N ERROR = ABS( WIN( I+IPREPAD )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 140 CONTINUE MINERROR = MIN( MAXERROR, MINERROR ) * IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * * RETURN * 9999 FORMAT( 'PDSYEVD returned INFO=', I7 ) 9998 FORMAT( 'PDSEPQTQ in PDSDPSUBTST returned INFO=', I7 ) 9997 FORMAT( 'PDSDPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PDSYEVD returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'Different processes return different eigenvalues' ) 9994 FORMAT( 'Heterogeneity detected by PDSYEVD' ) 9993 FORMAT( 'PDSYEVD failed the |AQ -QE| test' ) 9992 FORMAT( 'PDSYEVD failed the |QTQ -I| test' ) * * End of PDSDPSUBTST * END scalapack-2.0.2/TESTING/EIG/pdsepchk.f000644 000766 000024 00000024207 11750130340 017401 0ustar00juliestaff000000 000000 * * SUBROUTINE PDSEPCHK( MS, NV, A, IA, JA, DESCA, EPSNORMA, THRESH, $ Q, IQ, JQ, DESCQ, C, IC, JC, DESCC, W, WORK, $ LWORK, TSTNRM, RESULT ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER IA, IC, IQ, JA, JC, JQ, LWORK, MS, NV, RESULT DOUBLE PRECISION EPSNORMA, THRESH, TSTNRM * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCC( * ), DESCQ( * ) DOUBLE PRECISION A( * ), C( * ), Q( * ), W( * ), WORK( * ) * .. * * Purpose * ======= * * Compute |AQ- QL| / (EPSNORMA * N) * where EPSNORMA = (abstol + eps)*norm(A) when called by pdsqpsubtst. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * MP = number of local rows in A, C and Q * MQ = number of local columns in A * NQ = number of local columns in C and Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in A, C and Q * Also, the number of global columns in A * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q. * * A (local input) DOUBLE PRECISION pointer to an * array in local memory of dimension (LLD_A, LOCc(JA+N-1)). * This array contains the local pieces of the MS-by-MS * distributed test matrix A * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * EPSNORMA (input) DOUBLE PRECISION * abstol + eps * inf.norm(A) * Abstol is absolute tolerence for the eigenvalues and is set * in the calling routines, pdsepsubtst and pdsqpsubtst. * * THRESH (input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) DOUBLE PRECISION array * global dimension (MS, NV), local dimension (DESCA(DLEN_), NQ) * * Contains the eigenvectors as computed by PDSYEVX * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Q. * * C (local workspace) DOUBLE PRECISION array, * global dimension (NV, NV), local dimension (DESCA(DLEN_), MQ) * * Accumulator for computing AQ -QL * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * W (global input) DOUBLE PRECISION array, dimension (NV) * * Contains the computed eigenvalues * * WORK (local workspace) DOUBLE PRECISION array, * dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / ( EPSNROMA * MS ) * * RESULT (global output) INTEGER * 0 if the test passes i.e. * |AQ -QL| / (abstol + eps * norm(A) ) <= n* THRESH * 1 if the test fails i.e. * |AQ -QL| / (abstol + eps * norm(A) ) > n * THRESH * * .. Local Scalars .. * INTEGER INFO, J, LOCALCOL, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ, PCOL DOUBLE PRECISION NORM * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, NEGONE PARAMETER ( ONE = 1.0D+0, NEGONE = -1.0D+0 ) * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC DOUBLE PRECISION PDLANGE EXTERNAL INDXG2L, INDXG2P, NUMROC, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DLACPY, DSCAL, PDGEMM, $ PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * RESULT = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 CALL CHK1MAT( MS, 1, MS, 1, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IQ, JQ, DESCQ, 12, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IC, JC, DESCC, 16, INFO ) * IF( INFO.EQ.0 ) THEN * MP = NUMROC( MS, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IF( IQ.NE.1 ) THEN INFO = -10 ELSE IF( JQ.NE.1 ) THEN INFO = -11 ELSE IF( IA.NE.1 ) THEN INFO = -4 ELSE IF( JA.NE.1 ) THEN INFO = -5 ELSE IF( IC.NE.1 ) THEN INFO = -14 ELSE IF( JC.NE.1 ) THEN INFO = -15 ELSE IF( LWORK.LT.NQ ) THEN INFO = -19 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PDSEPCHK', -INFO ) RETURN END IF * * C = Q * W * CALL DLACPY( 'A', MP, NQ, Q, DESCQ( LLD_ ), C, DESCC( LLD_ ) ) * * DO 10 J = 1, NV PCOL = INDXG2P( J, DESCC( NB_ ), 0, 0, NPCOL ) LOCALCOL = INDXG2L( J, DESCC( NB_ ), 0, 0, NPCOL ) * IF( MYCOL.EQ.PCOL ) THEN CALL DSCAL( MP, W( J ), C( ( LOCALCOL-1 )*DESCC( LLD_ )+1 ), $ 1 ) END IF 10 CONTINUE * * * C = C - A * Q * CALL PDGEMM( 'N', 'N', MS, NV, MS, NEGONE, A, 1, 1, DESCA, Q, 1, $ 1, DESCQ, ONE, C, 1, 1, DESCC ) * * Compute the norm of C * * NORM = PDLANGE( 'M', MS, NV, C, 1, 1, DESCC, WORK ) * * TSTNRM = NORM / EPSNORMA / MAX( MS, 1 ) * IF( TSTNRM.GT.THRESH .OR. ( TSTNRM-TSTNRM.NE.0.0D0 ) ) THEN RESULT = 1 END IF * * RETURN * * End of PDSEPCHK * END scalapack-2.0.2/TESTING/EIG/pdsepdriver.f000644 000766 000024 00000025261 10363532303 020134 0ustar00juliestaff000000 000000 * * PROGRAM PDSEPDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Parallel DOUBLE PRECISION symmetric eigenproblem test driver * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. INTSIZ and DBLSIZ * indicate the length in bytes on the given platform for an integer * and a double precision real. * For example, on our system with 8 MB of memory, TOTMEM=6500000 * (leaves 1.5 MB for OS, code, BLACS buffer, etc), the length of a * DOUBLE is 8, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * The full tester requires approximately (5 n + 5 n^2/p + slop) * DOUBLE PRECISION words and 6*n integer words. * So, TOTMEM should be set to at least 1.1 * 8 * (5n + 5n^2/p) * * WHAT WE TEST * ============ * * This routine tests PDSYEVX, the expert driver for the parallel * symmetric eigenvalue problem, PDSYEV and PDSYEVD. We would like * to cover all possible combinations of: matrix size, process * configuration (nprow and npcol), block size (nb), * matrix type (??), range of eigenvalue (all, by value, * by position), sorting options, and upper vs. lower storage. * * As PDSYEV returns an error message when heterogeneity is detected, * the PDSYEV tests can be suppressed by changing the appropiate * entry in the input file. * * We intend to provide two types of test input files, an * installation test and a thorough test. * * We also intend that the reports be meaningful. Our input file * will allow multiple requests where each request is a cross product * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, DBLESZ, NIN PARAMETER ( TOTMEM = 2000000, DBLESZ = 8, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / DBLESZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) DOUBLE PRECISION MEM( MEMSIZ ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PDLACHKIEEE, PDLASNBT, PDSEPREQ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PDLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) END IF * CALL PDLACHKIEEE( ISIEEE, DLAMCH( 'O' ), DLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) END IF GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK symmetric Eigendecomposition routines.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'symmetric eigenvalue routine: PDSYEVX & '// $ ' PDSYEV & PDSYEVD.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 )' ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )' ||Q^T*Q - I|| ' // '/ (N * eps)' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ ' : the QTQ norm is allowed to exceed THRESH' // $ ' for those eigenvectors' WRITE( NOUT, FMT = 9999 )' : which could not be ' // $ 'reorthogonalized for lack of workspace.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see PDSEPtst.f).' WRITE( NOUT, FMT = 9999 )'SUB : Subtests ' // $ '(see PDSEPtst).f' WRITE( NOUT, FMT = 9999 )'CHK : ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )'QTQ : ||Q^T*Q - I||/ (N * eps)' WRITE( NOUT, FMT = 9999 ) $ ' : when the adjusted QTQ exceeds THRESH', $ ' the adjusted QTQ norm is printed' WRITE( NOUT, FMT = 9999 ) $ ' : otherwise the true QTQ norm is printed' WRITE( NOUT, FMT = 9999 ) $ ' If NT>1, CHK and QTQ are the max over all ' // $ 'eigen request tests' WRITE( NOUT, FMT = 9999 ) $ 'TEST : EVX - testing PDSYEVX, EV - testing PDSYEV, '// $ 'EVD - testing PDSYEVD' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PDSEPREQ( HETERO, NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * c CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', ' ') * * * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * 9999 FORMAT( A ) 9998 FORMAT( ' I am about to check to make sure that overflow' ) 9997 FORMAT( ' is handled in the ieee default manner. If this' ) 9996 FORMAT( ' is the last output you see, you should assume' ) 9995 FORMAT( ' that overflow caused a floating point exception.' ) 9994 FORMAT( ' In that case, we recommend that you add -DNO_IEEE' ) 9993 FORMAT( ' to the CDEFS line in SLmake.inc.' ) 9992 FORMAT( ' Alternatively, you could set CDEFS in SLmake.inc ' ) 9991 FORMAT( ' to enable the default ieee behaviour, However, this' ) 9990 FORMAT( ' may result in good or very bad performance.' ) 9989 FORMAT( ' Either signed zeroes or signed infinities ' ) 9988 FORMAT( ' work incorrectly or your system. Change your' ) 9987 FORMAT( ' SLmake.inc as suggested above.' ) * 9986 FORMAT( ' Your system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP SUB WALL CPU ', $ ' CHK QTQ CHECK TEST' ) 9978 FORMAT( ' ----- --- --- --- --- --- -------- --------', $ ' --------- --------- ----- ----' ) * * End of PDSEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pdsepinfo.f000644 000766 000024 00000030550 11622500733 017572 0ustar00juliestaff000000 000000 * * SUBROUTINE PDSEPINFO( CONTEXT, IAM, NIN, NOUT, MAXSETSIZE, $ NMATSIZES, MATSIZES, NUPLOS, UPLOS, $ NPCONFIGS, NPROWS, NPCOLS, NBS, NMATTYPES, $ MATTYPES, MAXTYPE, SUBTESTS, THRESH, ORDER, $ ABSTOL, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PDSEPINFO reads the input test data file (INFILE), copies the * information therein to all processes and returns this information * in the corresponding parameters. * * Arguments * ========= * * CONTEXT (global input) INTEGER * BLACS Context * * IAM (local input) INTEGER * process number. * IAM.EQ.0 on the proceesor that performs I/O * * NIN (global input) INTEGER * The unit number of the input file. * * NOUT (global output) INTEGER * The unit number for output file. * if NOUT = 6, ouput to screen, * if NOUT = 0, output to stderr * Only defined for process 0. * * MAXSETSIZE (global output) INTEGER * Maximum set size. Size of the following arrays: * MATSIZES, MATTYPES, NBS, NPCOLS, NPROWS * * NMATSIZES (global output) INTEGER * Number of matrix sizes to test * * MATSIZES (global output) INTEGER array dimension MAXSETSIZE * Matrix sizes to test * * NUPLOS (global output) INTEGER * Number of UPLO values to test * * UPLOS (global output) CHARACTER*1 array dimension 2 * Values of UPLO to test * * NPCONFIGS (global output) INTEGER * Number of process configuratins (NPROW, NPCOL, NB) * * NPROWS (global output) INTEGER array dimension MAXSETSIZE * Values of NPROW to test * * NPCOLS (global output) INTEGER array dimension MAXSETSIZE * Values of NPCOL to test * * NBS (global output) INTEGER array dimension MAXSETSIZE * Values of NB to test * * NMATTYPES (global output) INTEGER * Number of matrix types to test * * MATTYPES (global output) INTEGER array dimension MAXSETSIZE * Matrix types to test * Refer to PDSEPTST for a complete description of the * supported matrix types. * * MAXTYPE (global input) INTEGER * Maximum allowed matrix type * * SUBTESTS (global output) CHARACTER * 'N' = Do not perform subtests * 'Y' = Perfrom subtests * * * THRESH (global output) @(tupc) * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * ( THRESH is set to 1/10 of the value defined in the .dat * file when NOUT = 13. THRESH is set to 1/20 of the value * defined in the .dat file when NOUT = 14. This allows us * to specify more stringent criteria for our internal testing ) * * ORDER (global output) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global output) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PDSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * If ( ABSTOL .EQ. 0 in SEP.dat, it is set to * 2.0 * PDLAMCH( 'u' ) in this routine. * * INFO (global output) INTEGER * 0 = normal return * -1 = end of file * -2 = incorrrect data specification * * .. Scalar Arguments .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, INFO, MAXSETSIZE, MAXTYPE, NIN, $ NMATSIZES, NMATTYPES, NOUT, NPCONFIGS, NUPLOS, $ ORDER DOUBLE PRECISION ABSTOL, THRESH * .. * .. Array Arguments .. CHARACTER UPLOS( 2 ) INTEGER MATSIZES( MAXSETSIZE ), MATTYPES( MAXSETSIZE ), $ NBS( MAXSETSIZE ), NPCOLS( MAXSETSIZE ), $ NPROWS( MAXSETSIZE ) * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION TWO, TEN, TWENTY PARAMETER ( TWO = 2.0D0, TEN = 10.0D0, TWENTY = 20.0D0 ) * .. * .. Local Scalars .. CHARACTER*80 TESTSUMMRY INTEGER I, ISUBTESTS * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * * .. External Subroutines .. EXTERNAL DGEBR2D, DGEBS2D, IGEBR2D, IGEBS2D * .. * * .. Local Arrays .. INTEGER IUPLOS( 2 ) * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 IF( IAM.EQ.0 ) THEN READ( NIN, FMT = 9997 )TESTSUMMRY TESTSUMMRY = ' ' READ( NIN, FMT = 9997 )TESTSUMMRY WRITE( NOUT, FMT = 9997 )TESTSUMMRY END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NMATSIZES CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NMATSIZES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NMATSIZES, 1, 0, 0 ) END IF IF( NMATSIZES.EQ.-1 ) THEN INFO = -1 GO TO 70 END IF IF( NMATSIZES.LT.1 .OR. NMATSIZES.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'Matrix size', NMATSIZES, 1, $ MAXSETSIZE END IF INFO = -2 GO TO 70 END IF * * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( MATSIZES( I ), I = 1, NMATSIZES ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NMATSIZES, MATSIZES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NMATSIZES, MATSIZES, 1, $ 0, 0 ) END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NUPLOS CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NUPLOS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NUPLOS, 1, 0, 0 ) END IF IF( NUPLOS.LT.1 .OR. NUPLOS.GT.2 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'# of UPLOs', NUPLOS, 1, 2 END IF INFO = -2 GO TO 70 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( UPLOS( I ), I = 1, NUPLOS ) DO 10 I = 1, NUPLOS IF( LSAME( UPLOS( I ), 'L' ) ) THEN IUPLOS( I ) = 1 ELSE IUPLOS( I ) = 2 END IF 10 CONTINUE CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NUPLOS, IUPLOS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NUPLOS, IUPLOS, 1, 0, 0 ) END IF DO 20 I = 1, NUPLOS IF( IUPLOS( I ).EQ.1 ) THEN UPLOS( I ) = 'L' ELSE UPLOS( I ) = 'U' END IF 20 CONTINUE * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NPCONFIGS CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NPCONFIGS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NPCONFIGS, 1, 0, 0 ) END IF IF( NPCONFIGS.LT.1 .OR. NPCONFIGS.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'# proc configs', NPCONFIGS, 1, $ MAXSETSIZE END IF INFO = -2 GO TO 70 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NPROWS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPROWS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPROWS, 1, 0, $ 0 ) END IF DO 30 I = 1, NPCONFIGS IF( NPROWS( I ).LE.0 ) $ INFO = -2 30 CONTINUE IF( INFO.EQ.-2 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9996 )' NPROW' END IF GO TO 70 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NPCOLS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPCOLS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPCOLS, 1, 0, $ 0 ) END IF DO 40 I = 1, NPCONFIGS IF( NPCOLS( I ).LE.0 ) $ INFO = -2 40 CONTINUE IF( INFO.EQ.-2 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9996 )' NPCOL' END IF GO TO 70 END IF * * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NBS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NBS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NBS, 1, 0, 0 ) END IF DO 50 I = 1, NPCONFIGS IF( NBS( I ).LE.0 ) $ INFO = -2 50 CONTINUE IF( INFO.EQ.-2 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9996 )' NB' END IF GO TO 70 END IF * * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NMATTYPES CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NMATTYPES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NMATTYPES, 1, 0, 0 ) END IF IF( NMATTYPES.LT.1 .OR. NMATTYPES.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'matrix types', NMATTYPES, 1, $ MAXSETSIZE END IF INFO = -2 GO TO 70 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( MATTYPES( I ), I = 1, NMATTYPES ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NMATTYPES, MATTYPES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NMATTYPES, MATTYPES, 1, $ 0, 0 ) END IF * DO 60 I = 1, NMATTYPES IF( MATTYPES( I ).LT.1 .OR. MATTYPES( I ).GT.MAXTYPE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'matrix type', MATTYPES( I ), $ 1, MAXTYPE END IF MATTYPES( I ) = 1 END IF 60 CONTINUE * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )SUBTESTS IF( LSAME( SUBTESTS, 'Y' ) ) THEN ISUBTESTS = 2 ELSE ISUBTESTS = 1 END IF CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, ISUBTESTS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, ISUBTESTS, 1, 0, 0 ) END IF IF( ISUBTESTS.EQ.2 ) THEN SUBTESTS = 'Y' ELSE SUBTESTS = 'N' END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )THRESH IF( NOUT.EQ.13 ) $ THRESH = THRESH / TEN IF( NOUT.EQ.14 ) $ THRESH = THRESH / TWENTY CALL DGEBS2D( CONTEXT, 'All', ' ', 1, 1, THRESH, 1 ) ELSE CALL DGEBR2D( CONTEXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) END IF * ORDER = 0 * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )ABSTOL CALL DGEBS2D( CONTEXT, 'All', ' ', 1, 1, ABSTOL, 1 ) ELSE CALL DGEBR2D( CONTEXT, 'All', ' ', 1, 1, ABSTOL, 1, 0, 0 ) END IF IF( ABSTOL.LT.0 ) $ ABSTOL = TWO*PDLAMCH( CONTEXT, 'U' ) * INFO = 0 * 70 CONTINUE RETURN * 9999 FORMAT( A20, ' is:', I5, ' must be between:', I5, ' and', I5 ) 9998 FORMAT( A20, ' is:', I5, ' must be:', I5, ' or', I5 ) 9997 FORMAT( A ) 9996 FORMAT( A20, ' must be positive' ) * * End of PDSEPINFO * END scalapack-2.0.2/TESTING/EIG/pdsepqtq.f000644 000766 000024 00000025335 10363532303 017450 0ustar00juliestaff000000 000000 * * SUBROUTINE PDSEPQTQ( MS, NV, THRESH, Q, IQ, JQ, DESCQ, C, IC, JC, $ DESCC, PROCDIST, ICLUSTR, GAP, WORK, LWORK, $ QTQNRM, INFO, RES ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IC, INFO, IQ, JC, JQ, LWORK, MS, NV, RES DOUBLE PRECISION QTQNRM, THRESH * .. * .. Array Arguments .. * INTEGER DESCC( * ), DESCQ( * ), ICLUSTR( * ), $ PROCDIST( * ) DOUBLE PRECISION C( * ), GAP( * ), Q( * ), WORK( * ) * .. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Purpose * ======= * * Compute |I - QT * Q| / (ulp * n) * * Arguments * ========= * * NP = number of local rows in C * NQ = number of local columns in C and Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in Q * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) DOUBLE PRECISION array, * global dimension (MS, NV), local dimension (LDQ, NQ) * * Contains the eigenvectors as computed by PDSTEIN * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Q. * * C (local workspace) DOUBLE PRECISION array, * global dimension (NV, NV), local dimension (DESCC(DLEN_), NQ) * * Accumulator for computing I - QT * Q * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * W (input) DOUBLE PRECISION array, dimension (NV) * All procesors have an identical copy of W() * * Contains the computed eigenvalues * * PROCDIST (global input) INTEGER array dimension (NPROW*NPCOL+1) * Identifies which eigenvectors are the last to be computed * by a given process * * ICLUSTR (global input) INTEGER array dimension (2*P) * This input array contains indices of eigenvectors * corresponding to a cluster of eigenvalues that could not be * orthogonalized due to insufficient workspace. * This should be the output of PDSTEIN. * * GAP (global input) DOUBLE PRECISION array, dimension (P) * This input array contains the gap between eigenvalues whose * eigenvectors could not be orthogonalized. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= 2 + MAX( DESCC( MB_ ), 2 )*( 2*NP0+MQ0 ) * Where: * NP0 = NUMROC( NV, DESCC( MB_ ), 0, 0, NPROW ) * MQ0 = NUMROC( NV, DESCC( NB_ ), 0, 0, NPCOL ) * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / EPS * * RES (global output) INTEGER * 0 if the test passes i.e. |I - QT * Q| / (ulp * n) <= THRESH * 1 if the test fails i.e. |I - QT * Q| / (ulp * n) > THRESH * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE, NEGONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, $ NEGONE = -1.0D+0 ) * .. * .. Intrinsic Functions .. * INTRINSIC DBLE, MAX * .. * .. Local Scalars .. INTEGER CLUSTER, FIRSTP, IMAX, IMIN, JMAX, JMIN, LWMIN, $ MQ0, MYCOL, MYROW, NEXTP, NP0, NPCOL, NPROW DOUBLE PRECISION NORM, QTQNRM2, ULP * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL NUMROC, PDLAMCH, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PDGEMM, PDLASET, $ PDMATADD, PXERBLA * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * RES = 0 ULP = PDLAMCH( DESCC( CTXT_ ), 'P' ) * CALL BLACS_GRIDINFO( DESCC( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 CALL CHK1MAT( MS, 1, MS, 2, IQ, JQ, DESCQ, 7, INFO ) CALL CHK1MAT( NV, 1, MS, 2, IC, JC, DESCC, 11, INFO ) * IF( INFO.EQ.0 ) THEN NP0 = NUMROC( NV, DESCC( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( NV, DESCC( NB_ ), 0, 0, NPCOL ) * LWMIN = 2 + MAX( DESCC( MB_ ), 2 )*( 2*NP0+MQ0 ) * IF( IQ.NE.1 ) THEN INFO = -5 ELSE IF( JQ.NE.1 ) THEN INFO = -6 ELSE IF( IC.NE.1 ) THEN INFO = -9 ELSE IF( JC.NE.1 ) THEN INFO = -10 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -16 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCC( CTXT_ ), 'PDSEPQTQ', -INFO ) RETURN END IF * * C = Identity matrix * CALL PDLASET( 'A', NV, NV, ZERO, ONE, C, IC, JC, DESCC ) * * C = C - QT * Q * IF( NV*MS.GT.0 ) THEN CALL PDGEMM( 'Transpose', 'N', NV, NV, MS, NEGONE, Q, 1, 1, $ DESCQ, Q, 1, 1, DESCQ, ONE, C, 1, 1, DESCC ) END IF * * Allow for poorly orthogonalized eigenvectors for large clusters * NORM = PDLANGE( '1', NV, NV, C, 1, 1, DESCC, WORK ) QTQNRM = NORM / ( DBLE( MAX( MS, 1 ) )*ULP ) * CLUSTER = 1 10 CONTINUE DO 20 FIRSTP = 1, NPROW*NPCOL IF( PROCDIST( FIRSTP ).GE.ICLUSTR( 2*( CLUSTER-1 )+1 ) ) $ GO TO 30 20 CONTINUE 30 CONTINUE * IMIN = ICLUSTR( 2*CLUSTER-1 ) JMAX = ICLUSTR( 2*CLUSTER ) * * IF( IMIN.EQ.0 ) $ GO TO 60 * DO 40 NEXTP = FIRSTP, NPROW*NPCOL IMAX = PROCDIST( NEXTP ) JMIN = IMAX + 1 * * CALL PDMATADD( IMAX-IMIN+1, JMAX-JMIN+1, ZERO, C, IMIN, JMIN, $ DESCC, GAP( CLUSTER ) / 0.01D+0, C, IMIN, JMIN, $ DESCC ) CALL PDMATADD( JMAX-JMIN+1, IMAX-IMIN+1, ZERO, C, JMIN, IMIN, $ DESCC, GAP( CLUSTER ) / 0.01D+0, C, JMIN, IMIN, $ DESCC ) IMIN = IMAX * IF( ICLUSTR( 2*CLUSTER ).LT.PROCDIST( NEXTP+1 ) ) $ GO TO 50 40 CONTINUE 50 CONTINUE * CLUSTER = CLUSTER + 1 GO TO 10 60 CONTINUE * * Compute the norm of C * NORM = PDLANGE( '1', NV, NV, C, 1, 1, DESCC, WORK ) * QTQNRM2 = NORM / ( DBLE( MAX( MS, 1 ) )*ULP ) * IF( QTQNRM2.GT.THRESH ) THEN RES = 1 QTQNRM = QTQNRM2 END IF RETURN * * End of PDSEPQTQ * END scalapack-2.0.2/TESTING/EIG/pdseprdriver.f000644 000766 000024 00000021335 11623527140 020317 0ustar00juliestaff000000 000000 PROGRAM PDSEPRDRIVER * * Parallel DOUBLE PRECISION symmetric eigenproblem test driver for PDSYEVR * IMPLICIT NONE * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. DBLESZ * indicates the length in bytes on the given platform for a number, * real for SINGLE/DOUBLE PRECISION, and complex for COMPLEX/COMPLEX*16. * For example, on a standard system, the length of a * DBLE is 8, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * TESTS PERFORMED * =============== * * This routine performs tests for combinations of: matrix size, process * configuration (nprow and npcol), block size (nb), * matrix type, range of eigenvalue (all, by value, by index), * and upper vs. lower storage. * * It returns an error message when heterogeneity is detected. * * The input file allows multiple requests where each one is * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, DBLESZ, NIN PARAMETER ( TOTMEM = 100000000, DBLESZ = 8, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / DBLESZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) DOUBLE PRECISION MEM( MEMSIZ ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PDLACHKIEEE, PDLASNBT, PDSEPRREQ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEPR.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PDLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) END IF * CALL PDLACHKIEEE( ISIEEE, DLAMCH( 'O' ), DLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'Test ScaLAPACK symmetric eigendecomposition routine.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'symmetric eigenvalue routine: PDSYEVR.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 )' ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )' ||Q^T*Q - I|| ' // '/ (N * eps)' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT = PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see PDSEPRTST).' WRITE( NOUT, FMT = 9999 )'SUB : Subtests (Y/N).' WRITE( NOUT, FMT = 9999 )'WALL : Wallclock time.' WRITE( NOUT, FMT = 9999 )'CPU : CPU time.' WRITE( NOUT, FMT = 9999 )'CHK : ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )'QTQ : ||Q^T*Q - I||/ (N * eps)' WRITE( NOUT, FMT = 9999 ) $ ' : when the adjusted QTQ norm exceeds THRESH', $ ' it is printed,' WRITE( NOUT, FMT = 9999 ) $ ' otherwise the true QTQ norm is printed.' WRITE( NOUT, FMT = 9999 ) $ ' : If more than one test is done, CHK and QTQ ' WRITE( NOUT, FMT = 9999 ) $ ' are the max over all eigentests performed.' WRITE( NOUT, FMT = 9999 ) $ 'TEST : EVR - testing PDSYEVR' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PDSEPRREQ( HETERO, NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * c CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', ' ') * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * 9999 FORMAT( A ) 9997 FORMAT( 'Check if overflow is handled in ieee default manner.' ) 9996 FORMAT( 'If this is the last output you see, you should assume') 9995 FORMAT( 'that overflow caused a floating point exception.' ) * 9986 FORMAT( 'Test ok. The system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP SUB WALL CPU ', $ ' CHK QTQ CHECK TEST' ) 9978 FORMAT( ' ----- --- --- --- --- --- -------- --------', $ ' --------- --------- ----- ----' ) * * End of PDSEPRDRIVER * END scalapack-2.0.2/TESTING/EIG/pdsepreq.f000644 000766 000024 00000022255 10363532303 017430 0ustar00juliestaff000000 000000 * * SUBROUTINE PDSEPREQ( HETERO, NIN, MEM, MEMSIZE, NOUT, ISEED, $ NTESTS, NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER HETERO INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION MEM( MEMSIZE ) * .. * * Purpose * ======= * * PDSEPREQ performs one request from the input file 'SEP.dat' * A request is the cross product of the specifications in the * input file. PDSEPREQ prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEP.dat' * * MEM (local input ) DOUBLE PRECISION ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, INTGSZ PARAMETER ( DBLESZ = 8, INTGSZ = 4 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IMIDPAD, INITCON, IPOSTPAD, $ IPREPAD, ISIZESUBTST, ISIZESYEVX, ISIZETST, $ LDA, LLWORK, MATSIZE, MATTYPE, MYCOL, MYROW, N, $ NB, NMATSIZES, NMATTYPES, NNODES, NP, NPCOL, $ NPCONFIGS, NPROW, NQ, NUPLOS, ORDER, PCONFIG, $ PTRA, PTRCOPYA, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRW, PTRW2, PTRWORK, PTRZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEV, SIZESYEVX, $ SIZETMS, SIZETST, UPLO, ISIZESYEVD, SIZESYEVD DOUBLE PRECISION ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PDLASIZESQP, PDSEPINFO, PDSEPTST * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PDSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * * DO 40 MATSIZE = 1, NMATSIZES * DO 30 PCONFIG = 1, NPCONFIGS * DO 20 MATTYPE = 1, NMATTYPES * DO 10 UPLO = 1, NUPLOS * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, $ MYCOL ) * IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PDLASIZESQP( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, $ ISIZESYEVX, SIZESYEV, $ SIZESYEVD, ISIZESYEVD, $ SIZESUBTST, ISIZESUBTST, $ SIZETST, ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + MAX( N, 1 ) + IPREPAD + IPOSTPAD PTRGAP = PTRW2 + MAX( N, 1 ) + IPREPAD + $ IPOSTPAD PTRIFAIL = PTRGAP + NPROW*NPCOL + IPREPAD + $ IPOSTPAD PTRICLUS = PTRIFAIL + ICEIL( N+IPREPAD+IPOSTPAD, $ DBLESZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, DBLESZ / INTGSZ ) PTRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, DBLESZ / INTGSZ ) LLWORK = MEMSIZE - PTRWORK + 1 C LLWORK = MEMSIZE - PTRWORK - IPREPAD - C $ IPOSTPAD + 1 NTESTS = NTESTS + 1 IF( LLWORK.LT.SIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PDSEPTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), SUBTESTS, $ THRESH, N, ABSTOL, ISEED, $ MEM( PTRA ), MEM( PTRCOPYA ), $ MEM( PTRZ ), LDA, MEM( PTRW ), $ MEM( PTRW2 ), MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ LLWORK, MEM( PTRIWRK ), $ ISIZETST, HETERO, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * )' PDSEPREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE END IF * * RETURN * * End of PDDSEPREQ * END scalapack-2.0.2/TESTING/EIG/pdseprreq.f000644 000766 000024 00000021016 11623527140 017607 0ustar00juliestaff000000 000000 SUBROUTINE PDSEPRREQ( HETERO, NIN, MEM, MEMSIZE, NOUT, ISEED, $ NTESTS, NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER HETERO INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) DOUBLE PRECISION MEM( MEMSIZE ) * * Purpose * ======= * * PDSEPRREQ performs one request from the input file 'SEPR.dat' * A request is the cross product of the specifications in the * input file. It prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEPR.dat' * * MEM (local input ) DOUBLE PRECISION ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER DBLESZ, INTGSZ PARAMETER ( DBLESZ = 8, INTGSZ = 4 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IMIDPAD, INITCON, IPOSTPAD, $ IPREPAD, ISIZESUBTST, ISIZEEVR, ISIZETST, $ LDA, LLWORK, MATSIZE, MATTYPE, MYCOL, MYROW, N, $ NB, NMATSIZES, NMATTYPES, NNODES, NP, NPCOL, $ NPCONFIGS, NPROW, NQ, NUPLOS, ORDER, PCONFIG, $ PTRA, PTRCOPYA, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRW, PTRW2, PTRWORK, PTRZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZEEVR, $ SIZETMS, SIZETST, UPLO * DOUBLE PRECISION ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PDLASIZESEPR, PDSEPINFO, PDSEPRTST * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PDSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * DO 40 MATSIZE = 1, NMATSIZES * DO 30 PCONFIG = 1, NPCONFIGS * DO 20 MATTYPE = 1, NMATTYPES * DO 10 UPLO = 1, NUPLOS * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, $ MYCOL ) * IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PDLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, ISIZEEVR, $ SIZESUBTST, ISIZESUBTST, $ SIZETST, ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + MAX( N, 1 ) + IPREPAD + IPOSTPAD PTRGAP = PTRW2 + MAX( N, 1 ) + IPREPAD + $ IPOSTPAD PTRIFAIL = PTRGAP + NPROW*NPCOL + IPREPAD + $ IPOSTPAD PTRICLUS = PTRIFAIL + ICEIL( N+IPREPAD+IPOSTPAD, $ DBLESZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, DBLESZ / INTGSZ ) PTRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, DBLESZ / INTGSZ ) LLWORK = MEMSIZE - PTRWORK + 1 NTESTS = NTESTS + 1 IF( LLWORK.LT.SIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PDSEPRTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), SUBTESTS, $ THRESH, N, ABSTOL, ISEED, $ MEM( PTRA ), MEM( PTRCOPYA ), $ MEM( PTRZ ), LDA, MEM( PTRW ), $ MEM( PTRW2 ), MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ LLWORK, MEM( PTRIWRK ), $ ISIZETST, HETERO, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * )' PDSEPRREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of PDSEPRREQ * END scalapack-2.0.2/TESTING/EIG/pdseprsubtst.f000644 000766 000024 00000070321 11623527140 020347 0ustar00juliestaff000000 000000 SUBROUTINE PDSEPRSUBTST( WKNOWN, JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, IA, JA, $ DESCA, WIN, WNEW, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK, LWORK, LWORK1, $ IWORK, LIWORK, RESULT, TSTNRM, QTQNRM, $ NOUT ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, IPOSTPAD, IPREPAD, IU, JA, LIWORK, $ LWORK, LWORK1, N, NOUT, RESULT DOUBLE PRECISION ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) DOUBLE PRECISION A( * ), COPYA( * ), GAP( * ), WIN( * ), $ WNEW( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDSEPRSUBTST calls PDSYEVR and then tests its output. * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues computed. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 100 or 250. In particular, * it should not depend on the size of the matrix. * It must be at least zero. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the residual test. * * A (local workspace) DOUBLE PRECISION array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * The test matrix, which is subsequently overwritten. * A is distributed in a 2D-block cyclic manner over both rows * and columns. * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) DOUBLE PRECISION array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) DOUBLE PRECISION array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PDSEPCHK and PDSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) DOUBLE PRECISION array, dimension (N) * The computed eigenvalues. * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to the eigensolver. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call. * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER DLEN_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( DLEN_ = 9, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285D0, FIVE = 5.0D0, $ NEGONE = -1.0D0 ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZESUBTST, ISIZEEVR, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZEEVR, SIZETMS, $ SIZETST, VALSIZE, VECSIZE DOUBLE PRECISION EPS, EPSNORMA, ERROR, MAXERROR, MAXVU, $ MINERROR, MINVL, NORMWIN, OLDVL, OLDVU, $ SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), ISEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PDLANSY EXTERNAL LSAME, NUMROC, PDLAMCH, PDLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, DGAMN2D, DGAMX2D, $ DLACPY, IGAMN2D, IGAMX2D, PDCHEKPAD, PDELSET, $ PDFILLPAD, PDLASIZESEPR, PDLASIZESYEVR, $ PDSEPCHK, PDSEPQTQ, PDSYEVR, PICHEKPAD, $ PIFILLPAD, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * CALL PDLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, ISIZEEVR, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PDLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that no information from previous calls is used * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3D0 10 CONTINUE * DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159D0 30 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF (LSAME( RANGE, 'V' ) ) THEN * WRITE(*,*) 'VL VU = ', VL, ' ', VU END IF IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL * WRITE(*,*) 'MINVL = ', MINVL, ' MAXVU = ', MAXVU * WRITE(*,*) 'WIN = ', WIN( 1 ) MINIL = 1 MAXIU = 0 DO 40 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 40 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * ISEED( 1 ) = 1 * CALL PDLASIZESYEVR( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL DLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PDFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0D0 ) * * WRITE(*,*) ' NP = ', NP, ' MQ = ', MQ, ' LDZ = ', DESCZ( LLD_ ), * $ ' IPREPAD = ', IPREPAD, ' IPOSTPAD = ', IPOSTPAD, * $ ' MAXEIGS = ', MAXEIGS * WRITE(*,*) ' PADZ( 1 ) = ', Z( 1 ), ' PADZ( 2 ) = ', Z( 2 ), * $ ' PADZ( 3 ) = ', Z( 3 ), ' PADZ( 4 ) = ', Z( 4 ) * CALL PDFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0D0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0D0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0D0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * * Make sure that PDSYEVR does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, MAXEIGS, 1 CALL PDELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0D0 ) 50 CONTINUE 60 CONTINUE * * Reset and start the timer * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) ********************************* * * Main call to PDSYEVR * CALL PDSYEVR( JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ VL, VU, IL, IU, M, NZ, WNEW( 1+IPREPAD ), $ Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), LWORK1, IWORK( 1+IPREPAD ), $ LIWORK, INFO ) * ********************************* * * Stop timer * CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * * Indicate that there are no unresolved clusters. * This is necessary so that the tester * (adapted from the one originally made for PDSYEVX) * works correctly. ICLUSTR( 1+IPREPAD ) = 0 * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVR-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PDCHEKPAD( DESCZ( CTXT_ ), 'PDSYEVR-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0D0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVR-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0D0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVR-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0D0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVR-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0D0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PDSYEVR-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PDSYEVR-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PDSYEVR-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * If we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PDLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * Check INFO * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) WRITE( NOUT,*) 'M = ', M, '\n', 'N = ', N RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) WRITE( NOUT,*) 'IL = ', IL, ' IU = ', IU, ' M = ', M END IF RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Ensure that different processes return the same eigenvalues * DO 70 I = 1, M WORK( I ) = WNEW( I+IPREPAD ) WORK( I+M ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, WORK, M, 1, $ 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ WORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 80 I = 1, M IF( RESULT.EQ.0 .AND. ( ABS( WORK( I )-WORK( M+ $ I ) ).GT.FIVE*EPS*ABS( WORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 80 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 90 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 100 NCLUSTERS = NCLUSTERS + 1 90 CONTINUE 100 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 110 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 110 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * DO 120 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 120 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 150 * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PDLANSY( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ WORK )*EPS END IF * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |A Z - Z W| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3D0 ) * CALL PDSEPCHK( N, NZ, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSEPCHK-WORK', SIZECHK, 1, $ WORK, SIZECHK, IPREPAD, IPOSTPAD, 4.3D0 ) * IF( RES.NE.0 ) $ RESULT = 1 * * Perform the |QTQ - I| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1, WORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3D0 ) * * CALL PDSEPQTQ( N, NZ, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1+IPREPAD+1 ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), WORK( IPREPAD+1 ), SIZEQTQ, $ QTQNRM, INFO, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSEPQTQ-WORK', SIZEQTQ, 1, $ WORK, SIZEQTQ, IPREPAD, IPOSTPAD, 4.3D0 ) * IF( RES.NE.0 ) $ RESULT = 1 * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that the right eigenvalues have been obtained * IF( WKNOWN ) THEN * Set up MYIL if necessary MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 140 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 130 I = 1, M * WRITE(*,*) 'WIN WNEW = ', WIN( I+MYIL-1 ), * $ WNEW( I+IPREPAD ) ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 130 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 140 CONTINUE * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what was computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * RETURN * 9999 FORMAT( 'PDSYEVR returned INFO=', I7 ) 9998 FORMAT( 'PDSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PDSEPRSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PDSYEVR returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PDSYEVR' ) 9981 FORMAT( 'NZ altered by PDSYEVR with JOBZ=N' ) * * End of PDSEPRSUBTST * END scalapack-2.0.2/TESTING/EIG/pdseprtst.f000644 000766 000024 00000072031 11623527140 017635 0ustar00juliestaff000000 000000 SUBROUTINE PDSEPRTST(DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, $ ORDER, ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, $ IWORK, LIWORK, HETERO, NOUT, INFO ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER HETERO, SUBTESTS, UPLO INTEGER INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, LWORK, $ MATTYPE, N, NOUT, ORDER DOUBLE PRECISION ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) DOUBLE PRECISION A( LDA, * ), COPYA( LDA, * ), GAP( * ), $ WIN( * ), WNEW( * ), WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PDSEPRTST builds a random matrix and runs PDSYEVR to * compute the eigenvalues and eigenvectors. Then it performs two tests * to determine if the result is good enough. The two tests are: * |AQ -QL| / (abstol + ulp * norm(A) ) * and * |QT * Q - I| / ulp * norm(A) * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) A matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) DOUBLE PRECISION * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) DOUBLE PRECISION array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * The test matrix, which is then overwritten. * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * * COPYA (local workspace) DOUBLE PRECISION array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * Z (local workspace) DOUBLE PRECISION array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PDSEPCHK and PDSEPQTQ * * W (local workspace) DOUBLE PRECISION array, dimension (N) * On normal exit, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * Not used, only for backward compatibility * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PDLASIZESEPR * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PDLASIZESEPR * * HETERO (input) INTEGER * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER CTXT_, MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( CTXT_ = 2, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION HALF, ONE, TEN, ZERO PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, $ TEN = 10.0D0, HALF = 0.5D0 ) DOUBLE PRECISION PADVAL PARAMETER ( PADVAL = 19.25D0 ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IHETERO, IINFO, IL, IMODE, IN, $ INDD, INDWORK, ISIZESUBTST, ISIZEEVR, $ ISIZETST, ITYPE, IU, J, LLWORK, LEVRSIZE, $ MAXSIZE, MYCOL, MYROW, NB, NGEN, NLOC, $ NNODES, NP, NPCOL, NPROW, NQ, RES, SIZECHK, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, $ SIZESUBTST, SIZEEVR, SIZETMS, $ SIZETST, VALSIZE, VECSIZE DOUBLE PRECISION ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL DLARAN, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, DLABAD, DLASRT, $ DLATMS, IGAMX2D, IGEBR2D, IGEBS2D, PDCHEKPAD, $ PDELSET, PDFILLPAD, PDLASET, PDLASIZESEPR, $ PDLASIZESYEVR, PDLATMS, PDMATGEN, PDSEPRSUBTST, $ SLCOMBINE * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX, MIN, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * INFO = 0 PASSED = 'PASSED EVR' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * Distribute HETERO across processes * IF( IAM.EQ.0 ) THEN IF( LSAME( HETERO, 'Y' ) ) THEN IHETERO = 2 ELSE IHETERO = 1 END IF CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1, 0, 0 ) END IF IF( IHETERO.EQ.2 ) THEN HETERO = 'Y' ELSE HETERO = 'N' END IF * * Make sure that there is enough memory * CALL PDLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, ISIZEEVR, $ SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) IF( LWORK.LT.SIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDWORK = INDD + N LLWORK = LWORK - INDWORK + 1 * ULP = PDLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PDLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL DLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / DBLE( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log symmetric, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random symmetric * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N WORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PDLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N WORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PDLASET( 'All', N, N, ZERO, ONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0D0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0D0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * symmetric, eigenvalues specified * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0D0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0D0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * symmetric, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PDMATGEN( DESCA( CTXT_ ), 'S', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0D0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0D0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PDLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( DLARAN( ISEED )*DBLE( NLOC ) ), N-NGEN ) * CALL DLATMS( IN, IN, 'S', ISEED, 'P', WORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PDELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PDELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PDELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PDELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2,... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 WORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0D0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0D0 ) * ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL DLASRT( 'I', N, WORK( INDD ), IINFO ) * CALL PDLASIZESYEVR( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, WORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) LEVRSIZE = MIN( MAXSIZE, LLWORK ) * CALL PDSEPRSUBTST( WKNOWN, 'v', 'a', UPLO, N, VL, VU, IL, IU, $ THRESH, ABSTOL, A, COPYA, Z, 1, 1, DESCA, $ WORK( INDD ), WIN, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK( INDWORK ), LLWORK, $ LEVRSIZE, IWORK, ISIZEEVR, RES, TSTNRM, $ QTQNRM, NOUT ) * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PDLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PDSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 1' INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * Use PDLASIZESYEVR to choose IL and IU. * CALL PDLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PDSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 2' INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PDLASIZESYEVR to choose IL and IU for us. * CALL PDLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VECSIZE * CALL PDSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PDLASIZESYEVR to choose IL and IU for us. * CALL PDLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PDSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVR to choose VL and VU for us. * CALL PDLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VECSIZE * CALL PDSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF END IF * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 .AND. .FALSE. ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF C WRITE(*,*)'************************************************' END IF * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, $ F8.2, 1X, F8.2, 1X, G9.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 21X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 21X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 32X, $ 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) C 9984 FORMAT( ' IBTYPE=', I8 ) C 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) C 9980 FORMAT( ' Increase TOTMEM in PDSEPRDRIVER' ) * * End of PDSEPRTST * END scalapack-2.0.2/TESTING/EIG/pdsepsubtst.f000644 000766 000024 00000070544 10363532303 020171 0ustar00juliestaff000000 000000 SUBROUTINE PDSEPSUBTST( WKNOWN, JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, IA, JA, $ DESCA, WIN, WNEW, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK, LWORK, LWORK1, $ IWORK, LIWORK, RESULT, TSTNRM, QTQNRM, $ NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 15, 2000 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, IPOSTPAD, IPREPAD, IU, JA, LIWORK, $ LWORK, LWORK1, N, NOUT, RESULT DOUBLE PRECISION ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) DOUBLE PRECISION A( * ), COPYA( * ), GAP( * ), WIN( * ), $ WNEW( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDSEPSUBTST calls PDSYEVX and then tests the output of * PDSYEVX * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PDSYEVX when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PDSEPSUBTST * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call to PDSEPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PDSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) DOUBLE PRECISION array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PDSYEVX for a description of block cyclic layout. * The test matrix, which is then modified by PDSYEVX * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) DOUBLE PRECISION array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) DOUBLE PRECISION array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PDSEPCHK and PDSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) DOUBLE PRECISION array, dimension (N) * The eigenvalues as copmuted by this call to PDSYEVX * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PDSYEVX * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PDSYEVX * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285D+0, FIVE = 5.0D+0, $ NEGONE = -1.0D+0 ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEVX, SIZETMS, $ SIZETST, VALSIZE, VECSIZE DOUBLE PRECISION EPS, EPSNORMA, ERROR, MAXERROR, MAXVU, $ MINERROR, MINVL, NORMWIN, OLDVL, OLDVU, ORFAC, $ SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), DSEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PDLANSY EXTERNAL LSAME, NUMROC, PDLAMCH, PDLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, DGAMN2D, DGAMX2D, $ DLACPY, IGAMN2D, IGAMX2D, PDCHEKPAD, PDELSET, $ PDFILLPAD, PDLASIZESEP, PDLASIZESYEVX, $ PDSEPCHK, PDSEPQTQ, PDSYEVX, PICHEKPAD, $ PIFILLPAD, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PDLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PDLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3D+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159D+0 30 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL MINIL = 1 MAXIU = 0 DO 40 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 40 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * * * DSEED is not used in this call to PDLASIZESYEVX, the * following line just makes ftnchek happy. * DSEED( 1 ) = 1 * CALL PDLASIZESYEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL DLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PDFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0D+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * * Make sure that PDSYEVX does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, MAXEIGS, 1 CALL PDELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0D+0 ) 50 CONTINUE 60 CONTINUE * ORFAC = -1.0D+0 * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PDSYEVX( JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ VL, VU, IL, IU, ABSTOL, M, NZ, WNEW( 1+IPREPAD ), $ ORFAC, Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), LWORK1, IWORK( 1+IPREPAD ), $ LIWORK, IFAIL( 1+IPREPAD ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVX-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PDCHEKPAD( DESCZ( CTXT_ ), 'PDSYEVX-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVX-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVX-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEVX-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PDSYEVX-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PDSYEVX-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PDSYEVX-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * * Since we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9992 ) RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Make sure that different processes return the same eigenvalues * DO 70 I = 1, M WORK( I ) = WNEW( I+IPREPAD ) WORK( I+M ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, WORK, M, 1, $ 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ WORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 80 I = 1, M * IF( RESULT.EQ.0 .AND. ( ABS( WORK( I )-WORK( M+ $ I ) ).GT.FIVE*EPS*ABS( WORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 80 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 90 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 100 NCLUSTERS = NCLUSTERS + 1 90 CONTINUE 100 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 110 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 110 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * * DO 120 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 120 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 150 * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PDLANSY( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ WORK )*EPS END IF * * Note that a couple key variables get redefined in PDSEPCHK * as described by this table: * * PDSEPTST name PDSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |AQ - QE| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3D+0 ) * CALL PDSEPCHK( N, NZ, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSEPCHK-WORK', SIZECHK, 1, $ WORK, SIZECHK, IPREPAD, IPOSTPAD, 4.3D+0 ) * IF( RES.NE.0 ) $ RESULT = 1 * * Perform the |QTQ - I| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1, WORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3D+0 ) * * CALL PDSEPQTQ( N, NZ, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1+IPREPAD+1 ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), WORK( IPREPAD+1 ), SIZEQTQ, $ QTQNRM, INFO, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSEPQTQ-WORK', SIZEQTQ, 1, $ WORK, SIZEQTQ, IPREPAD, IPOSTPAD, 4.3D+0 ) * IF( RES.NE.0 ) $ RESULT = 1 * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN ) THEN * * Set up MYIL if necessary * MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 140 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 130 I = 1, M ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 130 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 140 CONTINUE * * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what PxSYEVX computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * * RETURN * 9999 FORMAT( 'PDSYEVX returned INFO=', I7 ) 9998 FORMAT( 'PDSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PDSEPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PDSYEVX returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PDSYEVX' ) 9981 FORMAT( 'NZ altered by PDSYEVX with JOBZ=N' ) * * End of PDSEPSUBTST * END scalapack-2.0.2/TESTING/EIG/pdseptst.f000644 000766 000024 00000135361 11622500733 017457 0ustar00juliestaff000000 000000 SUBROUTINE PDSEPTST( DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, $ ORDER, ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, IWORK, LIWORK, HETERO, NOUT, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * .. Scalar Arguments .. CHARACTER HETERO, SUBTESTS, UPLO INTEGER INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, LWORK, $ MATTYPE, N, NOUT, ORDER DOUBLE PRECISION ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) DOUBLE PRECISION A( LDA, * ), COPYA( LDA, * ), GAP( * ), $ WIN( * ), WNEW( * ), WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PDSEPTST builds a random matrix, runs PDSYEVX and PDSYEV to * compute the eigenvalues * and eigenvectors and then performs two tests to * determine if the result * is good enough. The two tests are: * |AQ -QL| / (abstol + ulp * norm(A) ) * and * |QT * Q - I| / ulp * norm(A) * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) symmetric matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PDSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) DOUBLE PRECISION array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * The test matrix, which is then modified by PDSYEVX * * COPYA (local workspace) DOUBLE PRECISION array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * Z (local workspace) DOUBLE PRECISION array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PDSEPCHK and PDSEPQTQ * * W (local workspace) DOUBLE PRECISION array, dimension (N) * On normal exit from PDSYEVX, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PDLASIZESQP * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PDLASIZESQP * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DT_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION HALF, ONE, TEN, ZERO PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, $ TEN = 10.0D+0, HALF = 0.5D+0 ) DOUBLE PRECISION PADVAL PARAMETER ( PADVAL = 19.25D+0 ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IHETERO, IINFO, IL, IMODE, IN, $ INDD, INDWORK, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, ITYPE, IU, J, LLWORK, LSYEVXSIZE, $ MAXSIZE, MINSIZE, MYCOL, MYROW, NB, NGEN, NLOC, $ NNODES, NP, NPCOL, NPROW, NQ, RES, SIZECHK, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, $ SIZESUBTST, SIZESYEV, SIZESYEVX, SIZETMS, $ SIZETST, VALSIZE, VECSIZE, ISIZESYEVD,SIZESYEVD DOUBLE PRECISION ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL DLARAN, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, DLABAD, DLASRT, $ DLATMS, IGAMX2D, IGEBR2D, IGEBS2D, PDCHEKPAD, $ PDELSET, PDFILLPAD, PDLASET, PDLASIZESQP, $ PDLASIZESYEVX, PDLATMS, PDMATGEN, PDSEPSUBTST, $ PDSQPSUBTST, PDSYEV, SLCOMBINE * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX, MIN, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DT_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 PASSED = 'PASSED EVX' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * Distribute HETERO across processes * IF( IAM.EQ.0 ) THEN IF( LSAME( HETERO, 'Y' ) ) THEN IHETERO = 2 ELSE IHETERO = 1 END IF CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1, 0, 0 ) END IF IF( IHETERO.EQ.2 ) THEN HETERO = 'Y' ELSE HETERO = 'N' END IF * * Make sure that we have enough memory * CALL PDLASIZESQP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESYEV, $ SIZESYEVD, ISIZESYEVD, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * IF( LWORK.LT.SIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDWORK = INDD + N LLWORK = LWORK - INDWORK + 1 * ULP = PDLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PDLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL DLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / DBLE( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log symmetric, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random symmetric * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * * Zero * * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N WORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PDLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N WORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PDLASET( 'All', N, N, ZERO, ONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0D+0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0D+0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * symmetric, eigenvalues specified * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * symmetric, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PDMATGEN( DESCA( CTXT_ ), 'S', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0D+0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0D+0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PDLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( DLARAN( ISEED )*DBLE( NLOC ) ), N-NGEN ) * CALL DLATMS( IN, IN, 'S', ISEED, 'P', WORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PDELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PDELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PDELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PDELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2, ... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 WORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * * CALL PDFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0D+0 ) * CALL PDLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * * * WKNOWN ... NOT SET, GUESS A DEFAULT * WKNOWN = .TRUE. ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL DLASRT( 'I', N, WORK( INDD ), IINFO ) * * * These values aren't actually used, but they make ftncheck happy. * IL = -1 IU = -2 VL = ONE VU = -ONE * CALL PDLASIZESYEVX( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, WORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) * LSYEVXSIZE = MIN( MAXSIZE, LLWORK ) * CALL PDSEPSUBTST( WKNOWN, 'v', 'a', UPLO, N, VL, VU, IL, IU, $ THRESH, ABSTOL, A, COPYA, Z, 1, 1, DESCA, $ WORK( INDD ), WIN, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, TSTNRM, $ QTQNRM, NOUT ) * * * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'V', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'V' RANGE = 'A' CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 1' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'V', RANGE = 'A', random memory * IF( INFO.EQ.0 ) THEN JOBZ = 'V' RANGE = 'A' CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 2' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * We use PDLASIZESYEVX to choose IL and IU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'I', maximum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PDLASIZESYEVX to choose IL and IU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = MAXSIZE * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF * * Subtest 6: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PDLASIZESYEVX to choose IL and IU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 6' INFO = 1 END IF END IF * * Subtest 7: JOBZ = 'V', RANGE = 'I', random memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PDLASIZESYEVX to choose IL and IU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) LSYEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 7' INFO = 1 END IF END IF * * Subtest 8: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 8' INFO = 1 END IF END IF * * Subtest 9: JOBZ = 'V', RANGE = 'V', maximum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = MAXSIZE * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 9' INFO = 1 END IF END IF * * Subtest 10: JOBZ = 'V', RANGE = 'V', * minimum memory required for eigenvectors * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest10' INFO = 1 END IF END IF * * Subtest 11: JOBZ = 'V', RANGE = 'V', * random memory (enough for all eigenvectors * but not enough to guarantee orthogonality * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest11' INFO = 1 END IF END IF * * Subtest 12: JOBZ = 'V', RANGE = 'V', * miniimum memory required for eigenvalues only * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest12' INFO = 1 END IF END IF * * Subtest 13: JOBZ = 'V', RANGE = 'V', * random memory (more than minimum required * for eigenvalues, less than required for vectors) * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PDLASIZESYEVX to choose VL and VU for us. * CALL PDLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE + INT( DLARAN( ISEED )* $ DBLE( VECSIZE-VALSIZE ) ) * CALL PDSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest13' INFO = 1 END IF END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF * * Now that PDSYEVX been tested, we check PDSYEV if we are a * homogeneous machine. * IF( LSAME( HETERO, 'N' ) .AND. LSAME( SUBTESTS, 'N' ) ) THEN PASSED = 'PASSED EV' * * PDSYEV test1: * JOBZ = 'N', eigenvalues only * IF( INFO.NE.0 ) THEN * * If the EVX tests fail, we do not perform the EV tests * PASSED = 'SKIPPED EV' ELSE JOBZ = 'N' * CALL PDSYEV( JOBZ, UPLO, N, A, 1, 1, DESCA, $ WORK( INDWORK ), Z, 1, 1, DESCA, $ WORK( INDWORK ), -1, INFO ) MINSIZE = INT( WORK( INDWORK ) ) * CALL PDSQPSUBTST( WKNOWN, JOBZ, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, 1, 1, DESCA, WIN, WNEW, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ MINSIZE, RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAIL EV test1' INFO = 1 END IF END IF * * PDSYEV test2: * JOBZ = 'V', eigenvalues and eigenvectors * IF( INFO.EQ.0 ) THEN JOBZ = 'V' * CALL PDSYEV( JOBZ, UPLO, N, A, 1, 1, DESCA, $ WORK( INDWORK ), Z, 1, 1, DESCA, $ WORK( INDWORK ), -1, INFO ) MINSIZE = INT( WORK( INDWORK ) ) * CALL PDSQPSUBTST( WKNOWN, JOBZ, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, 1, 1, DESCA, WIN, WNEW, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ MINSIZE, RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAIL EV test2' INFO = 1 END IF END IF IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, $ PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF ENDIF * * Now that PDSYEV been tested, we check PDSYEVD if we are a * homogeneous machine. * IF( LSAME( HETERO, 'N' ) .AND. LSAME( SUBTESTS, 'N' ) ) THEN PASSED = 'PASSED EVD' * * PDSYEVD test1: * IF( INFO.NE.0 ) THEN * * If the EV tests fail, we do not perform the EVD tests * PASSED = 'SKIPPED EVD' ELSE * NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) MINSIZE = MAX( 1+6*N+2*NP*NQ, $ 3*N + MAX( NB*( NP+1 ), 3*NB ) ) + 2*N * CALL PDSDPSUBTST( WKNOWN, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, 1, 1, DESCA, WIN, WNEW, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ MINSIZE, IWORK, ISIZESYEVD, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAIL EVD test1' INFO = 1 END IF END IF IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), TSTNRM, $ QTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, $ PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF END IF RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, $ F8.2, 1X, F8.2, 1X, G9.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 21X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 21X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 32X, $ 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) 9984 FORMAT( ' IBTYPE=', I8 ) 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) 9980 FORMAT( ' Increase TOTMEM in PDSEPDRIVER' ) * * End of PDSEPTST * END scalapack-2.0.2/TESTING/EIG/pdsqpsubtst.f000644 000766 000024 00000040763 10363532303 020205 0ustar00juliestaff000000 000000 * * SUBROUTINE PDSQPSUBTST( WKNOWN, JOBZ, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, IA, JA, DESCA, WIN, WNEW, $ IPREPAD, IPOSTPAD, WORK, LWORK, LWORK1, $ RESULT, TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, UPLO INTEGER IA, IPOSTPAD, IPREPAD, JA, LWORK, LWORK1, N, $ NOUT, RESULT DOUBLE PRECISION ABSTOL, QTQNRM, THRESH, TSTNRM * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), COPYA( * ), WIN( * ), WNEW( * ), $ WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDSQPSUBTST calls PDSYEV and then tests the output of * PDSYEV * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PDSYEV when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PDSQPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. * * A (local workspace) DOUBLE PRECISION array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PDSYEV for a description of block cyclic layout. * The test matrix, which is then modified by PDSYEV * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) DOUBLE PRECISION array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) DOUBLE PRECISION array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PDSEPCHK and PDSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) DOUBLE PRECISION array, dimension (N) * The eigenvalues as computed by this call to PDSYEV. * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PDSYEV * * RESULT (global output) INTEGER * The result of this call to PDSYEV * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DT_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION FIVE, NEGONE, PADVAL, ZERO PARAMETER ( PADVAL = 13.5285D+0, FIVE = 5.0D+0, $ NEGONE = -1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IAM, INFO, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, J, EIGS, MINSIZE, MQ, MYCOL, MYROW, $ NP, NPCOL, NPROW, NQ, RESAQ, RESQTQ, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEV, SIZESYEVX, $ SIZETMS, SIZETST,SIZESYEVD, ISIZESYEVD DOUBLE PRECISION EPS, EPSNORMA, ERROR, MAXERROR, MINERROR, $ NORMWIN, SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), ITMP( 2 ), $ IWORK( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PDLANSY EXTERNAL LSAME, NUMROC, PDLAMCH, PDLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, DGAMN2D, DGAMX2D, $ DLACPY, IGAMN2D, IGAMX2D, PDCHEKPAD, PDELSET, $ PDFILLPAD, PDLASIZESQP, PDSEPCHK, PDSEPQTQ, $ PDSYEV, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DT_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PDLASIZESQP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESYEV, $ SIZESYEVD, ISIZESYEVD, SIZESUBTST, ISIZESUBTST, $ SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PDLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1+IPREPAD ) ), $ ABS( WIN( N+IPREPAD ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3D+0 10 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159D+0 30 CONTINUE * DO 40 I = 1, 2 IWORK( I ) = 0 40 CONTINUE * IF( LSAME( JOBZ, 'N' ) ) THEN EIGS = 0 ELSE EIGS = N END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( EIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * Find the amount of workspace needed with or without eigenvectors. * CALL PDLASIZESYEV( JOBZ, N, DESCA, MINSIZE ) * CALL DLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PDFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0D+0 ) * * Make sure that PDSYEV does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, EIGS, 1 CALL PDELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0D+0 ) 50 CONTINUE 60 CONTINUE * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PDSYEV( JOBZ, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), LWORK1, INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEV-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PDCHEKPAD( DESCZ( CTXT_ ), 'PDSYEV-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEV-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSYEV-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )INFO IF( INFO.EQ.(N+1) ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 END IF ELSE IF( INFO.EQ.14 .AND. LWORK1.GE.MINSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * IF( RESULT.EQ.0 .OR. INFO.GT.N ) THEN * * Make sure that different processes return the same eigenvalues. * This is a more exhaustive check that provided by PDSYEV. * DO 70 I = 1, N WORK( I ) = WNEW( I+IPREPAD ) WORK( I+N ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', N, 1, WORK, N, 1, $ 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', N, 1, $ WORK( 1+N ), N, 1, 1, -1, -1, 0 ) * DO 80 I = 1, N * IF( ABS( WORK( I )-WORK( N+I ) ).GT.ZERO ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 GO TO 90 END IF 80 CONTINUE 90 CONTINUE END IF * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PDLANSY( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ WORK )*EPS END IF * * Note that a couple key variables get redefined in PDSEPCHK * as described by this table: * * PDSEPTST name PDSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |AQ - QE| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3D+0 ) * RESAQ = 0 * CALL PDSEPCHK( N, N, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RESAQ ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSEPCHK-WORK', SIZECHK, 1, $ WORK, SIZECHK, IPREPAD, IPOSTPAD, 4.3D+0 ) * IF( RESAQ.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9993 ) END IF * * Perform the |QTQ - I| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1, WORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3D+0 ) * RESQTQ = 0 * CALL PDSEPQTQ( N, N, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1 ), IWORK( 1 ), WORK( 1 ), $ WORK( IPREPAD+1 ), SIZEQTQ, QTQNRM, INFO, $ RESQTQ ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSEPQTQ-WORK', SIZEQTQ, 1, $ WORK, SIZEQTQ, IPREPAD, IPOSTPAD, 4.3D+0 ) * IF( RESQTQ.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9992 ) END IF * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN .AND. N.GT.0 ) THEN * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN MAXERROR = 0 * DO 140 I = 1, N ERROR = ABS( WIN( I+IPREPAD )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 140 CONTINUE MINERROR = MIN( MAXERROR, MINERROR ) * IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * * RETURN * 9999 FORMAT( 'PDSYEV returned INFO=', I7 ) 9998 FORMAT( 'PDSEPQTQ in PDSQPSUBTST returned INFO=', I7 ) 9997 FORMAT( 'PDSQPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PDSYEV returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'Different processes return different eigenvalues' ) 9994 FORMAT( 'Heterogeneity detected by PDSYEV' ) 9993 FORMAT( 'PDSYEV failed the |AQ -QE| test' ) 9992 FORMAT( 'PDSYEV failed the |QTQ -I| test' ) * * End of PDSQPSUBTST * END scalapack-2.0.2/TESTING/EIG/pdsvdchk.f000644 000766 000024 00000033013 10363532303 017405 0ustar00juliestaff000000 000000 SUBROUTINE PDSVDCHK( M, N, A, IA, JA, DESCA, U, IU, JU, DESCU, VT, $ IVT, JVT, DESCVT, S, THRESH, WORK, LWORK, $ RESULT, CHK, MTM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IU, IVT, JA, JU, JVT, LWORK, M, N DOUBLE PRECISION CHK, MTM, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCU( * ), DESCVT( * ), $ RESULT( * ) DOUBLE PRECISION A( * ), S( * ), U( * ), VT( * ), WORK( * ) * .. * * Purpose * ======= * * For given two-dimensional matrices A, U, VT, and one-dimensional * array D compute the following four tests: * * (1) | A - U*diag(S) VT | / ( |A| max(M,N) ulp ) * * (2) | I - U'*U | / ( M ulp ) * * (3) | I - VT*VT' | / ( N ulp ), * * (4) S contains SIZE = MIN( M, N ) nonnegative values in * decreasing order. * It then compares result of computations (1)-(3) * with TRESH and returns results of comparisons and test (4) in * RESULT(I). When the i-th test fails, value of RESULT( I ) is set * to 1. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MP = number of local rows in A and U * NQ = number of local columns in A and VT * SIZEP = number of local rows in VT * SIZEQ = number of local columns in U * * M (global input) INTEGER * Matrix size. * The number of global rows in A and U and * * N (global input) INTEGER * The number of global columns in A and VT. * * A (input) block cyclic distributed DOUBLE PRECISION array, * global dimension (M, N), local dimension (DESCA( DLEN_ ), NQ) * Contains the original test matrix. * * IA (global input) INTEGER * The global row index of the submatrix of the distributed * matrix A to operate on. * * JA (global input) INTEGER * The global column index of the submatrix of the distributed * matrix A to operate on. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix A. * * U (local input) DOUBLE PRECISION array * global dimension (M, SIZE), local dimension * (DESCU( DLEN_ ), SIZEQ) * Contains left singular vectors of matrix A. * * IU (global input) INTEGER * The global row index of the submatrix of the distributed * matrix U to operate on. * * JU (global input) INTEGER * The global column index of the submatrix of the distributed * matrix U to operate on. * * DESCU (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix U. * * VT (local input) DOUBLE PRECISION array * global dimension (SIZE, N), local dimension * (DESCVT( DLEN_ ), NQ) * Contains right singular vectors of matrix A. * * IVT (global input) INTEGER * The global row index of the submatrix of the distributed * matrix VT to operate on. * * JVT (global input) INTEGER * The global column index of the submatrix of the distributed * matrix VT to operate on. * * DESCVT (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix VT. * * S (global input) DOUBLE PRECISION array, dimension (SIZE) * Contains the computed singular values * * THRESH (input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= 1 + SIZEQ*SIZEP + MAX[WORK(pdlange(size,size)), * WORK(pdlange(m,n))], * where * SIZEQ = NUMROC( SIZE, DESCU( NB_ ), MYCOL, 0, NPCOL ), * SIZEP = NUMROC( SIZE, DESCVT( MB_ ), MYROW, 0, NPROW ), * and worekspaces required to call pdlange are * WORK(pdlange(size,size)) < MAX(SIZEQ0,2) < SIZEB +2, * WORK(pdlange(m,n)) < MAX(NQ0,2) < SIZEB +2, * SIZEB = MAX(M, N) * Finally, upper limit on required workspace is * LWORK > 1 + SIZEQ*SIZEP + SIZEB + 2 * * RESULT (global input/output) INTEGER array. Four first elements of * the array are set to 0 or 1 depending on passing four * respective tests ( see above in Purpose ). The elements of * RESULT are set to * 0 if the test passes i.e. * | A - U*diag(S)*VT | / ( |A| max(M,N) ulp ) <= THRESH * 1 if the test fails i.e. * | A - U*diag(S)*VT | / ( |A| max(M,N) ulp ) > THRESH * * CHK (global output) DOUBLE PRECISION * value of the | A - U*diag(S) VT | / ( |A| max(M,N) ulp ) * * MTM (global output) DOUBLE PRECISION * maximum of the two values: * | I - U'*U | / ( M ulp ) and | I - VT*VT' | / ( N ulp ) * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE, MONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, MONE = -1.0D0 ) * .. * .. Local Scalars .. INTEGER I, INFO, LDR, LOCALCOL, LWMIN, MP, MX, MYCOL, $ MYROW, NPCOL, NPROW, NQ, PCOL, PTRR, PTRWORK, $ SIZE, SIZEP, SIZEPOS, SIZEQ DOUBLE PRECISION FIRST, NORMA, NORMAI, NORMU, NORMVT, SECOND, $ THRESHA, ULP * .. * .. Local Arrays .. INTEGER DESCR( DLEN_ ) * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL INDXG2L, INDXG2P, NUMROC, PDLAMCH, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCINIT, DSCAL, $ PDELSET, PDGEMM, PDLASET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*DTYPE_*M_*N_*RSRC_.LT.0 ) RETURN * * Test the input parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 SIZE = MIN( M, N ) * * Sizepos is a number of parameters to pdsvdchk plus one. It's used * for the error reporting. * SIZEPOS = 22 IF( NPROW.EQ.-1 ) THEN INFO = -607 ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( M, 1, SIZE, SIZEPOS, IU, JU, DESCU, 10, INFO ) CALL CHK1MAT( SIZE, SIZEPOS, N, 2, IVT, JVT, DESCVT, 14, INFO ) END IF * IF( INFO.EQ.0 ) THEN * * Calculate workspace * MP = NUMROC( M, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) SIZEP = NUMROC( SIZE, DESCVT( MB_ ), MYROW, 0, NPROW ) SIZEQ = NUMROC( SIZE, DESCU( NB_ ), MYCOL, 0, NPCOL ) MX = MAX( SIZEQ, NQ ) LWMIN = 2 + SIZEQ*SIZEP + MAX( 2, MX ) WORK( 1 ) = LWMIN IF( LWORK.EQ.-1 ) $ GO TO 40 IF( LWORK.LT.LWMIN ) THEN INFO = -18 ELSE IF( THRESH.LE.0 ) THEN INFO = -16 END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PDSVDCHK', -INFO ) RETURN END IF * LDR = MAX( 1, SIZEP ) ULP = PDLAMCH( DESCA( CTXT_ ), 'P' ) NORMAI = PDLANGE( '1', M, N, A, IA, JA, DESCA, WORK ) * * Allocate array R of global dimension SIZE x SIZE for testing * PTRR = 2 PTRWORK = PTRR + SIZEQ*SIZEP * CALL DESCINIT( DESCR, SIZE, SIZE, DESCVT( MB_ ), DESCU( NB_ ), 0, $ 0, DESCA( CTXT_ ), LDR, INFO ) * * Test 2. Form identity matrix R and make check norm(U'*U - I ) * CALL PDLASET( 'Full', SIZE, SIZE, ZERO, ONE, WORK( PTRR ), 1, 1, $ DESCR ) CALL PDGEMM( 'T', 'N', SIZE, SIZE, M, ONE, U, IU, JU, DESCU, U, $ IU, JU, DESCU, MONE, WORK( PTRR ), 1, 1, DESCR ) * NORMU = PDLANGE( '1', SIZE, SIZE, WORK( PTRR ), 1, 1, DESCR, $ WORK( PTRWORK ) ) * NORMU = NORMU / ULP / SIZE / THRESH IF( NORMU.GT.1. ) $ RESULT( 2 ) = 1 * * Test3. Form identity matrix R and check norm(VT*VT' - I ) * CALL PDLASET( 'Full', SIZE, SIZE, ZERO, ONE, WORK( PTRR ), 1, 1, $ DESCR ) CALL PDGEMM( 'N', 'T', SIZE, SIZE, N, ONE, VT, IVT, JVT, DESCVT, $ VT, IVT, JVT, DESCVT, MONE, WORK( PTRR ), $ 1, 1, DESCR ) NORMVT = PDLANGE( '1', SIZE, SIZE, WORK( PTRR ), 1, 1, DESCR, $ WORK( PTRWORK ) ) * NORMVT = NORMVT / ULP / SIZE / THRESH IF( NORMVT.GT.1. ) $ RESULT( 3 ) = 1 * MTM = MAX( NORMVT, NORMU )*THRESH * * Test 1. * Initialize R = diag( S ) * CALL PDLASET( 'Full', SIZE, SIZE, ZERO, ZERO, WORK( PTRR ), 1, 1, $ DESCR ) * DO 10 I = 1, SIZE CALL PDELSET( WORK( PTRR ), I, I, DESCR, S( I ) ) 10 CONTINUE * * Calculate U = U*R * DO 20 I = 1, SIZE PCOL = INDXG2P( I, DESCU( NB_ ), 0, 0, NPCOL ) LOCALCOL = INDXG2L( I, DESCU( NB_ ), 0, 0, NPCOL ) IF( MYCOL.EQ.PCOL ) THEN CALL DSCAL( MP, S( I ), U( ( LOCALCOL-1 )*DESCU( LLD_ )+1 ), $ 1 ) END IF 20 CONTINUE * * Calculate A = U*VT - A * CALL PDGEMM( 'N', 'N', M, N, SIZE, ONE, U, IU, JU, DESCU, VT, $ IVT, JVT, DESCVT, MONE, A, IA, JA, DESCA ) * NORMA = PDLANGE( '1', M, N, A, IA, JA, DESCA, WORK( PTRWORK ) ) THRESHA = NORMAI*MAX( M, N )*ULP*THRESH * IF( NORMA.GT.THRESHA ) $ RESULT( 1 ) = 1 * IF( THRESHA.EQ.0 ) THEN CHK = 0.0D0 ELSE CHK = NORMA / THRESHA*THRESH END IF * * Test 4. * DO 30 I = 1, SIZE - 1 FIRST = S( I ) SECOND = S( I+1 ) IF( FIRST.LT.SECOND ) $ RESULT( 4 ) = 1 30 CONTINUE 40 CONTINUE RETURN END scalapack-2.0.2/TESTING/EIG/pdsvdcmp.f000644 000766 000024 00000026751 10363532303 017432 0ustar00juliestaff000000 000000 SUBROUTINE PDSVDCMP( M, N, JOBTYPE, S, SC, U, UC, IU, JU, DESCU, $ VT, VTC, IVT, JVT, DESCVT, THRESH, RESULT, $ DELTA, WORK, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IU, IVT, JOBTYPE, JU, JVT, LWORK, M, N DOUBLE PRECISION DELTA, THRESH * .. * .. Array Arguments .. INTEGER DESCU( * ), DESCVT( * ), RESULT( * ) DOUBLE PRECISION S( * ), SC( * ), U( * ), UC( * ), VT( * ), $ VTC( * ), WORK( * ) * .. * * Purpose * ======== * Testing how accurately "full" and "partial" decomposition options * provided by PDGESVD correspond to each other. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========== * * M (global input) INTEGER * Number of rows of the distributed matrix, for which * SVD was calculated * * N (global input) INTEGER * Number of columns of the distributed matrix, for which * SVD was calculated * * JOBTYPE (global input) INTEGER * Depending on the value of this parameter, * the following comparisons are performed: * * JOBTYPE | COMPARISON * ------------------------------------------- * 2 | | U - UC | / ( M ulp ) > THRESH, * 3 | | VT - VTC | / ( N ulp ) > THRESH * * In addition, for JOBTYPE = 2:4 comparison * | S1 - S2 | / ( SIZE ulp |S| ) > THRESH * is performed. Positive result of any of the comparisons * typically indicates erroneous computations and sets * to one corresponding element of array RESULT * * S (global input) DOUBLE PRECISION array of singular values * calculated for JOBTYPE equal to 1 * * SC (global input) DOUBLE PRECISION array of singular values * calculated for JOBTYPE nonequal to 1 * * U (local input) DOUBLE PRECISION array of left singular * vectors calculated for JOBTYPE equal to 1, local * dimension (MP, SIZEQ), global dimension (M, SIZE) * * UC (local input) DOUBLE PRECISION array of left singular * vectors calculated for JOBTYPE non equal to 1, local * dimension (MP, SIZEQ), global dimension (M, SIZE) * * IU (global input) INTEGER * The row index in the global array U indicating the first * row of sub( U ). * * JU (global input) INTEGER * The column index in the global array U indicating the * first column of sub( U ). * * DESCU (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix U and UC * * V (local input) DOUBLE PRECISION array of right singular * vectors calculated for JOBTYPE equal to 1, local * dimension (SIZEP, NQ), global dimension (SIZE, N) * * VC (local input) DOUBLE PRECISION array of right singular * vectors calculated for JOBTYPE non equal to 1, local * dimension (SIZEP, NQ), global dimension (SIZE, N) * * IVT (global input) INTEGER * The row index in the global array VT indicating the first * row of sub( VT ). * * JVT (global input) INTEGER * The column index in the global array VT indicating the * first column of sub( VT ). * * DESCVT (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix VT and * VTC * * THRESH (global input) DOUBLE PRECISION * The threshold value for the test ratios. A result is * included in the output file if RESULT >= THRESH. The test * ratios are scaled to be O(1), so THRESH should be a small * multiple of 1, e.g., 10 or 100. To have every test ratio * printed, use THRESH = 0. * * RESULT (global input/output) INTEGER array. * Every nonzero entry corresponds to erroneous computation. * * DELTA (global output) DOUBLE PRECISION * maximum of the available of the following three values * | U - UC | / ( M ulp THRESH ), * | VT - VT | / ( N ulp THRESH ), * | S1 - S2 | / ( SIZE ulp |S| THRESH ) * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COLPTR, I, INFO, J, LWMIN, MYCOL, MYROW, NPCOL, $ NPROW, NQ, RESULTS, SIZE, SIZEPOS, SIZEQ DOUBLE PRECISION ACCUR, CMP, NORMDIFS, NORMDIFU, NORMDIFV, $ NORMS, ULP * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION DLANGE, PDLAMCH, PDLANGE EXTERNAL NUMROC, DLANGE, PDLAMCH, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*DLEN_*DTYPE_*MB_*M_*N_*RSRC_.LT.0 ) $ RETURN * RESULTS = 0 NORMDIFS = 0 NORMDIFU = 0 NORMDIFV = 0 SIZE = MIN( M, N ) * * Sizepos is a number of parameters to pdsvdcmp plus one. It's used * for the error reporting. * SIZEPOS = 17 INFO = 0 CALL BLACS_GRIDINFO( DESCU( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IF( NPROW.EQ.-1 ) THEN INFO = -607 ELSE CALL CHK1MAT( M, 1, SIZE, SIZEPOS, 1, 1, DESCU, 8, INFO ) CALL CHK1MAT( SIZE, SIZEPOS, N, 2, 1, 1, DESCVT, 11, INFO ) END IF * IF( INFO.EQ.0 ) THEN * * Calculate workspace. * SIZEQ = NUMROC( SIZE, DESCU( NB_ ), MYCOL, 0, NPCOL ) NQ = NUMROC( N, DESCVT( NB_ ), MYCOL, 0, NPCOL ) LWMIN = MAX( SIZEQ, NQ ) + 4 WORK( 1 ) = LWMIN IF( LWORK.EQ.-1 ) $ GO TO 60 IF( LWORK.LT.LWMIN ) THEN INFO = -16 ELSE IF( THRESH.LE.0 ) THEN INFO = -12 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCU( CTXT_ ), 'PDSVDCMP', -INFO ) RETURN END IF * ULP = PDLAMCH( DESCU( CTXT_ ), 'P' ) * * Make comparison of singular values. * NORMS = DLANGE( '1', SIZE, 1, S, SIZE, WORK ) DO 10 I = 1, SIZE SC( I ) = S( I ) - SC( I ) 10 CONTINUE * NORMDIFS = DLANGE( '1', SIZE, 1, SC, SIZE, WORK ) ACCUR = ULP*SIZE*NORMS*THRESH * IF( NORMDIFS.GT.ACCUR ) $ RESULTS = 1 IF( NORMDIFS.EQ.0 .AND. ACCUR.EQ.0 ) THEN NORMDIFS = 0 ELSE NORMDIFS = NORMDIFS / ACCUR END IF * IF( JOBTYPE.EQ.2 ) THEN * RESULT( 5 ) = RESULTS ACCUR = ULP*M*THRESH DO 30 J = 1, SIZEQ COLPTR = DESCU( LLD_ )*( J-1 ) DO 20 I = 1, DESCU( LLD_ ) UC( I+COLPTR ) = U( I+COLPTR ) - UC( I+COLPTR ) 20 CONTINUE 30 CONTINUE * NORMDIFU = PDLANGE( '1', M, SIZE, UC, IU, JU, DESCU, WORK ) * IF( NORMDIFU.GE.ACCUR ) $ RESULT( 6 ) = 1 IF( NORMDIFU.EQ.0 .AND. ACCUR.EQ.0 ) THEN NORMDIFU = 0 ELSE NORMDIFU = NORMDIFU / ACCUR END IF * ELSE IF( JOBTYPE.EQ.3 ) THEN * RESULT( 7 ) = RESULTS ACCUR = ULP*N*THRESH DO 50 J = 1, NQ COLPTR = DESCVT( LLD_ )*( J-1 ) DO 40 I = 1, DESCVT( LLD_ ) VTC( I+COLPTR ) = VT( I+COLPTR ) - VTC( I+COLPTR ) 40 CONTINUE 50 CONTINUE * NORMDIFV = PDLANGE( '1', SIZE, N, VTC, IVT, JVT, DESCVT, WORK ) * IF( NORMDIFV.GE.ACCUR ) $ RESULT( 8 ) = 1 * IF( NORMDIFV.EQ.0 .AND. ACCUR.EQ.0 ) THEN NORMDIFV = 0 ELSE NORMDIFV = NORMDIFV / ACCUR END IF * ELSE IF( JOBTYPE.EQ.4 ) THEN * RESULT( 9 ) = RESULTS * END IF * CMP = MAX( NORMDIFV, NORMDIFU ) DELTA = MAX( CMP, NORMDIFS ) * 60 CONTINUE * * End of PDSVDCMP * RETURN END scalapack-2.0.2/TESTING/EIG/pdsvddriver.f000644 000766 000024 00000026016 10363532303 020140 0ustar00juliestaff000000 000000 PROGRAM PDSVDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======== * * Parallel Double precision singular value decomposition test driver. * * INPUT: * ===== * This routine tests PDGESVD, the parallel singular value * decomposition solver. We would like to cover possible combinations * of: matrix size, process configuration (nprow and npcol), block * size (nb), matrix type, and workspace available. * * Current format of the input file SVD.dat lists the following: * device out * Threshold * number of matrices * number of rows for every matrix * number of columns for every matrix * number of process configurations (P, Q, NB) * values of P (NPROW) for every configuration * values of Q (NPCOL) for every configuration * values of NB for every configuration. * Here threshold is an integer constant with a value between 1 and * 100, which meaning is explained in comments to PDSVDTST. * * WHAT IT DOES: * ============ * PSVDDRIVER checks floating-point arithmetic and parameters * provided by the user in initialization file SVD.dat. It reads and * broadcasts to all process parameters required to run actual testing * code PSVDTST. In case all tests are successful it tells you so. For * the actual "meat" of the tests see comments to PSVDTST. * *======================================================================= * * .. Local Scalars .. CHARACTER*80 SUMMARY INTEGER CONTEXT, ERR, I, IAM, J, K, LWORK, MAXNODES, $ NMATSIZES, NOUT, NPCONFIGS, NPROCS DOUBLE PRECISION THRESH * .. * .. Parameters .. INTEGER MAXSETSIZE, NIN, DBLSIZ, TOTMEM, MEMSIZ PARAMETER ( MAXSETSIZE = 50, NIN = 11, DBLSIZ = 8, $ TOTMEM = 2000000, MEMSIZ = TOTMEM / DBLSIZ ) * .. * .. Local Arrays .. INTEGER ISEED( 4 ), MM( MAXSETSIZE ), $ NBS( MAXSETSIZE ), NN( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ), $ RESULT( 9 ) DOUBLE PRECISION WORK( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ DGEBR2D, DGEBS2D, IGEBR2D, IGEBS2D, PDSVDTST * .. * .. Executable Statements .. * * Get starting information. * CALL BLACS_PINFO( IAM, NPROCS ) * * Open file and skip data header; read output device. * IF( IAM.EQ.0 ) THEN OPEN( UNIT = NIN, FILE = 'SVD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMARY READ( NIN, FMT = * )NOUT READ( NIN, FMT = * )MAXNODES END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * * Initialize variables, arrays, and grids. * ERR = 0 NMATSIZES = 0 NPCONFIGS = 0 LWORK = MEMSIZ ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) WRITE( NOUT, FMT = 9986 ) WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = 9974 ) WRITE( NOUT, FMT = 9973 ) WRITE( NOUT, FMT = 9972 ) WRITE( NOUT, FMT = 9971 ) WRITE( NOUT, FMT = 9970 ) WRITE( NOUT, FMT = 9969 ) WRITE( NOUT, FMT = 9968 ) WRITE( NOUT, FMT = 9967 ) WRITE( NOUT, FMT = 9966 ) WRITE( NOUT, FMT = 9965 ) END IF * * Process 0 reads values in input file and broadcasts them to * all other processes. * 10 CONTINUE IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )SUMMARY READ( NIN, FMT = * )SUMMARY READ( NIN, FMT = * )THRESH WRITE( NOUT, FMT = 9965 )SUMMARY CALL DGEBS2D( CONTEXT, 'All', ' ', 1, 1, THRESH, 1 ) ELSE CALL DGEBR2D( CONTEXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) END IF IF( THRESH.EQ.-1 ) THEN GO TO 80 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NMATSIZES CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NMATSIZES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NMATSIZES, 1, 0, 0 ) END IF * Deal with error IF( NMATSIZES.LT.1 .OR. NMATSIZES.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'Matrix size', NMATSIZES, 1, $ MAXSETSIZE END IF ERR = -1 GO TO 80 END IF * * Read array of MATSIZES. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( MM( I ), I = 1, NMATSIZES ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NMATSIZES, MM, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NMATSIZES, MM, 1, 0, 0 ) END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NN( I ), I = 1, NMATSIZES ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NMATSIZES, NN, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NMATSIZES, NN, 1, 0, 0 ) END IF * * Read and broadcast NPCONFIGS. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NPCONFIGS CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NPCONFIGS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NPCONFIGS, 1, 0, 0 ) END IF * Deal with error IF( NPCONFIGS.LT.1 .OR. NPCONFIGS.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'# proc configs', NPCONFIGS, 1, $ MAXSETSIZE END IF ERR = -1 GO TO 80 END IF * * Read and broadcast array of NPROWS. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NPROWS( I ), I = 1, NPCONFIGS ) * CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPROWS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPROWS, 1, 0, $ 0 ) END IF * Deal with error DO 20 I = 1, NPCONFIGS IF( NPROWS( I ).LE.0 ) $ ERR = -1 20 CONTINUE IF( ERR.EQ.-1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 )' NPROW' END IF GO TO 80 END IF * * Read and broadcast array of NPCOLS. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NPCOLS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPCOLS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPCOLS, 1, 0, $ 0 ) END IF * * Deal with error. * DO 30 I = 1, NPCONFIGS IF( NPCOLS( I ).LE.0 ) $ ERR = -1 30 CONTINUE IF( ERR.EQ.-1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 )' NPCOL' END IF GO TO 80 END IF * * Read and broadcast array of NBs. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NBS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NBS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NBS, 1, 0, 0 ) END IF * Deal with error DO 40 I = 1, NPCONFIGS IF( NBS( I ).LE.0 ) $ ERR = -1 40 CONTINUE IF( ERR.EQ.-1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 )' NB' END IF GO TO 80 END IF * DO 70 J = 1, NMATSIZES DO 60 I = 1, NPCONFIGS * DO 50 K = 1, 9 RESULT( K ) = 0 50 CONTINUE CALL PDSVDTST( MM( J ), NN( J ), NPROWS( I ), NPCOLS( I ), $ NBS( I ), ISEED, THRESH, WORK, RESULT, LWORK, $ NOUT ) * 60 CONTINUE 70 CONTINUE * GO TO 10 * 80 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * End of PDSVDDRIVER * 9999 FORMAT( A20, ' is:', I5, ' must be between:', I5, ' and', I5 ) 9998 FORMAT( A ) 9997 FORMAT( A20, ' must be positive' ) 9996 FORMAT( A ) 9995 FORMAT( 'M = ', I5, ' N = ', I5, ' NPOW = ', I5, 'NPCOL = ', I5, $ ' NB = ', I5 ) * 9994 FORMAT( 'Test #', I5, 'for this configuration has failed' ) 9993 FORMAT( 'All test passed for this configuration' ) 9992 FORMAT( ' ' ) 9991 FORMAT( 'Running tests of the parallel singular value ', $ 'decomposition routine: PDGESVD' ) 9990 FORMAT( 'The following scaled residual checks will be', $ 'computed:' ) 9989 FORMAT( ' || A - U*diag(S)*VT ||/( ||A||*max(M,N)*ulp )' ) 9988 FORMAT( ' || I - UT*U ||/( M*ulp )' ) 9987 FORMAT( ' || I - VT*V ||/( N*ulp )' ) 9986 FORMAT( ' ' ) 9985 FORMAT( 'An explanation of the input/output parameters', $ ' follows:' ) 9984 FORMAT( 'RESULT : passed; or an indication of which', $ ' jobtype test failed' ) 9983 FORMAT( 'M : The number of rows of the matrix A.' ) 9982 FORMAT( 'N : The number of columns of the matrix A.' ) 9981 FORMAT( 'P : The number of process rows.' ) 9980 FORMAT( 'Q : The number of process columns.' ) 9979 FORMAT( 'NB : The size of the square blocks the', $ ' matrix A is split into.' ) 9978 FORMAT( 'THRESH : If a residual value is less than ', $ ' THRESH, RESULT is flagged as PASSED.' ) 9977 FORMAT( 'MTYPE : matrix type (see pdsvdtst.f).' ) 9976 FORMAT( 'CHK : || A - U*diag(S)*VT ||/( ||A||', $ '*max(M,N)*ulp )' ) 9975 FORMAT( 'MTM : maximum of two values:',/, $ ' || I - UT*U ||/( M*ulp ) and', $ ' || I - VT*V ||/( N*ulp )' ) 9974 FORMAT( 'DELTA : maximum of three values:',/, $ ' || U - UC ||/( M*ulp*THRESH ),' ) 9973 FORMAT( ' || VT - VTC ||/( N*ulp*THRESH ), and' ) 9972 FORMAT( ' || S - SC || / ( SIZE*ulp*|S|*THRESH ), ' ) 9971 FORMAT( ' where UC, VTC, SC are singular vectors ', $ 'and values' ) 9970 FORMAT( ' for JOBTYPE.NE.1 (see pdsvdcmp.f) ' ) 9969 FORMAT( 'HET : P if heterogeneity was detected by PDGESVD' ) 9968 FORMAT( ' T if detected by the PDSVSTST, N if', $ ' undetected' ) 9967 FORMAT( ' ' ) 9966 FORMAT( 'RESULT WALL CPU M N P Q', $ ' NB MTYPE CHK MTM DELTA HET' ) 9965 FORMAT( A ) END scalapack-2.0.2/TESTING/EIG/pdsvdtst.f000644 000766 000024 00000056130 11750131523 017457 0ustar00juliestaff000000 000000 SUBROUTINE PDSVDTST( M, N, NPROW, NPCOL, NB, ISEED, THRESH, WORK, $ RESULT, LWORK, NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER LWORK, M, N, NB, NOUT, NPCOL, NPROW DOUBLE PRECISION THRESH * .. * .. Array Arguments .. INTEGER ISEED( 4 ), RESULT( 9 ) DOUBLE PRECISION WORK( * ) * .. * * Purpose * ======= * * PDSVDTST checks the singular value decomposition (SVD) routine * PDGESVD. PDGESVD factors A = U diag(S) VT, where U and VT are * orthogonal and diag(S) is diagonal with the entries of the array * S on its diagonal. The entries of S are the singular values, stored * in decreasing order. U and VT can be optionally not computed, * computed and overwritten on A, or computed partially. * * A is M by N. Let SIZE = min( M, N ). S has dimension SIZE by SIZE. * U is M by SIZE and VT is SIZE by N. PDGESVD optionally calculates * U and VT, depending on the values of its parameters JOBU and JOBVT. * There are four possible combinations of "job" parameters for a call * to PDGESVD, that correspond to four values of internal index JOBTYPE. * The table below shows the mapping between "job" parameters of * PDGESVD and respective values of the index JOBTYPE together * with matrices computed for each type of the job. * * * | JOBU = 'V' | JOBU = 'N' * ---------- ------------------------------------------- * JOBVT = 'V'| JOBTYPE = 1 | JOBTYPE = 3 * | U1, S1, VT1 | S3, VT3 * ---------- ------------------------------------------ * JOBVT = 'N'| JOBTYPE = 2 | JOBTYPE = 4 * | U2, S2 | S4 * * * When PDSVDTST is called, a number of matrix "types" are specified. * For each type of matrix, and for the minimal workspace as well as * for larger than minimal workspace an M x N matrix "A" with known * singular values is generated and used to test the SVD routines. * For each matrix, A will be factored as A = U diag(S) VT and the * following 9 tests computed: * * (1) | A - U1 diag(S1) VT1 | / ( |A| max(M,N) ulp ) * * (2) | I - U1'U1 | / ( M ulp ) * * (3) | I - VT1 VT1' | / ( N ulp ), * * (4) S1 contains SIZE nonnegative values in decreasing order. * (Return 0 if true, 1/ULP if false.) * * (5) | S1 - S2 | / ( SIZE ulp |S| ) * * (6) | U1 - U2 | / ( M ulp ) * * (7) | S1 - S3 | / ( SIZE ulp |S| ) * * (8) | VT1 - VT3 | / ( N ulp ) * * (9) | S1 - S4 | / ( SIZE ulp |S| ) * * Currently, the list of possible matrix types is: * * (1) The zero matrix. * * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP. * (ULP = (first number larger than 1) - 1 ) * * (4) A matrix of the form U D VT, where U, VT are orthogonal and * D has evenly spaced entries 1, ..., ULP. * * (5) Same as (4), but multiplied by SQRT( overflow threshold ) * * (6) Same as (4), but multiplied by SQRT( underflow threshold ) * * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========== * * M (global input) INTEGER dimension * The value of the matrix row dimension. * * N (global input) INTEGER dimension * The value of the matrix column dimension. * * NPROW (global input) INTEGER * Number of process rows * * NPCOL (global input) INTEGER * Number of process columns * * NB (global input) INTEGER * The block size of the matrix A. NB >=1. * * ISEED (global input/local output) INTEGER array, dimension (4) * On entry, the seed of the random number generator. The array * elements should be between 0 and 4095; if not they will be * reduced mod 4096. Also, ISEED(4) must be odd. * On exit, ISEED is changed and can be used in the next call to * SDRVBD to continue the same random number sequence. * * THRESH (global input) DOUBLE PRECISION * The threshold value for the test ratios. A result is * included in the output file if RESULT >= THRESH. The test * ratios are scaled to be O(1), so THRESH should be a small * multiple of 1, e.g., 10 or 100. To have every test ratio * printed, use THRESH = 0. * * RESULT (global input/output) INTEGER array of dimension 9. Initially * RESULT( I ) = 0. On the output, RESULT ( I ) = 1 if test I * ( see above ) wasn't passed. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER * Dimension of the array WORK. It is defined as follows * LWORK = 1 + 2*LDA*NQ + 3*SIZE + * MAX(WPDLAGGE, LDU*SIZEQ + LDVT*NQ + MAX(LDU*SIZEQ, LDVT*NQ) * + WPDGESVD + MAX( WPDSVDCHK, WPDSVDCMP)), * where WPDLAGGE, WPDGESVD, WPDSVDCHK, WPDSVDCMP are amounts * of workspace required respectively by PDLAGGE, PDGESVD, * PDSVDCHK, PDSVDCMP. * Here * LDA = NUMROC( M, NB, MYROW, 0, NPROW ), LDU = LDA, * LDVT = NUMROC( SIZE, NB, MYROW, 0, NPROW ), * NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ), * SIZEQ = NUMROC( SIZE, NB, MYCOL, 0, NPCOL ). * Values of the variables WPDLAGGE, WPDGESVD, WPDSVDCHK, * WPDSVDCMP are found by "dummy" calls to * the respective routines. In every "dummy" call, variable * LWORK is set to -1, thus causing respective routine * immediately return required workspace in WORK(1) without * executing any calculations * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_, NTYPES PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, NTYPES = 6 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. CHARACTER HETERO, JOBU, JOBVT INTEGER CONTEXT, DINFO, I, IA, IAM, INFO, ITYPE, IU, $ IVT, JA, JOBTYPE, JU, JVT, LDA, LDU, LDVT, $ LLWORK, LWMIN, MYCOL, MYROW, NNODES, NQ, PASS, $ PTRA, PTRAC, PTRD, PTRWORK, PTRS, PTRSC, PTRU, $ PTRUC, PTRVT, PTRVTC, SETHET, SIZE, SIZEQ, $ WPDGESVD, WPDLAGGE, WPDSVDCHK, WPDSVDCMP DOUBLE PRECISION CHK, DELTA, H, MTM, OVFL, RTOVFL, RTUNFL, ULP, $ UNFL * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ BLACS_SET, $ DESCINIT, DGAMN2D, DGAMX2D, DLABAD, DSCAL, $ IGAMN2D, IGAMX2D, IGEBR2D, IGEBS2D, PDELSET, $ PDGESVD, PDLACPY, PDLAGGE, PDLASET, PDSVDCHK, $ PDSVDCMP, PXERBLA, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL NUMROC, PDLAMCH * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCU( DLEN_ ), $ DESCVT( DLEN_ ), ITMP( 2 ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, MAX, MIN, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*DTYPE_*LLD_*MB_*M_*NB_*N_*RSRC_.LT.0 ) $ RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * If this process is not a part of the contex, bail out now. * IF( ( MYROW.GE.NPROW ) .OR. ( MYROW.LT.0 ) .OR. $ ( MYCOL.GE.NPCOL ) .OR. ( MYCOL.LT.0 ) )GO TO 110 CALL BLACS_SET( CONTEXT, 15, 1 ) INFO = 0 * * Check input parameters. * IF( M.LE.0 ) THEN INFO = -1 ELSE IF( N.LE.0 ) THEN INFO = -2 ELSE IF( NPROW.LE.0 ) THEN INFO = -3 ELSE IF( NPCOL.LE.0 ) THEN INFO = -4 ELSE IF( NB.LE.0 ) THEN INFO = -5 ELSE IF( THRESH.LE.0 ) THEN INFO = -7 END IF * SIZE = MIN( M, N ) * * Initialize matrix descriptors. * IA = 1 JA = 1 IU = 1 JU = 1 IVT = 1 JVT = 1 * LDA = NUMROC( M, NB, MYROW, 0, NPROW ) LDA = MAX( 1, LDA ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) LDU = LDA SIZEQ = NUMROC( SIZE, NB, MYCOL, 0, NPCOL ) LDVT = NUMROC( SIZE, NB, MYROW, 0, NPROW ) LDVT = MAX( 1, LDVT ) CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, CONTEXT, LDA, DINFO ) CALL DESCINIT( DESCU, M, SIZE, NB, NB, 0, 0, CONTEXT, LDU, DINFO ) CALL DESCINIT( DESCVT, SIZE, N, NB, NB, 0, 0, CONTEXT, LDVT, $ DINFO ) * * Set some pointers to work array in order to do "dummy" calls. * PTRA = 2 PTRAC = PTRA + LDA*NQ PTRD = PTRAC + LDA*NQ PTRS = PTRD + SIZE PTRSC = PTRS + SIZE PTRWORK = PTRSC + SIZE * PTRU = PTRWORK PTRVT = PTRWORK PTRUC = PTRWORK PTRVTC = PTRWORK * * "Dummy" calls -- return required workspace in work(1) without * any calculation. * CALL PDLAGGE( M, N, WORK( PTRD ), WORK( PTRA ), IA, JA, DESCA, $ ISEED, SIZE, WORK( PTRWORK ), -1, DINFO ) WPDLAGGE = INT( WORK( PTRWORK ) ) * CALL PDGESVD( 'V', 'V', M, N, WORK( PTRA ), IA, JA, DESCA, $ WORK( PTRS ), WORK( PTRU ), IU, JU, DESCU, $ WORK( PTRVT ), IVT, JVT, DESCVT, $ WORK( PTRWORK ), -1, DINFO ) WPDGESVD = INT( WORK( PTRWORK ) ) * CALL PDSVDCHK( M, N, WORK( PTRAC ), IA, JA, DESCA, WORK( PTRUC ), $ IU, JU, DESCU, WORK( PTRVT ), IVT, JVT, DESCVT, $ WORK( PTRS ), THRESH, WORK( PTRWORK ), -1, $ RESULT, CHK, MTM ) WPDSVDCHK = INT( WORK( PTRWORK ) ) * CALL PDSVDCMP( M, N, 1, WORK( PTRS ), WORK( PTRSC ), WORK( PTRU ), $ WORK( PTRUC ), IU, JU, DESCU, WORK( PTRVT ), $ WORK( PTRVTC ), IVT, JVT, DESCVT, THRESH, $ RESULT, DELTA, WORK( PTRWORK ), -1 ) WPDSVDCMP = INT( WORK( PTRWORK ) ) * * Calculation of workspace at last. * LWMIN = 1 + 2*LDA*NQ + 3*SIZE + $ MAX( WPDLAGGE, LDU*SIZEQ+LDVT*NQ+MAX( LDU*SIZEQ, $ LDVT*NQ )+WPDGESVD+MAX( WPDSVDCHK, WPDSVDCMP ) ) WORK( 1 ) = LWMIN * * If this is a "dummy" call, return. * IF( LWORK.EQ.-1 ) $ GO TO 120 IF( INFO.EQ.0 ) THEN IF( LWORK.LT.LWMIN ) THEN INFO = -10 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PDSVDTST', -INFO ) RETURN END IF * ULP = PDLAMCH( CONTEXT, 'P' ) UNFL = PDLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL DLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF * * Loop over matrix types. * DO 100 ITYPE = 1, NTYPES * PASS = 0 SETHET = 0 PTRWORK = PTRSC + SIZE LLWORK = LWORK - PTRWORK + 1 * * Compute A. * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix. * DO 10 I = 1, SIZE WORK( PTRD+I-1 ) = ZERO 10 CONTINUE * CALL PDLASET( 'All', M, N, ZERO, ZERO, WORK( PTRA ), $ IA, JA, DESCA ) * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix. * DO 20 I = 1, SIZE WORK( PTRD+I-1 ) = ONE 20 CONTINUE * CALL PDLASET( 'All', M, N, ZERO, ONE, WORK( PTRA ), $ IA, JA, DESCA ) * ELSE IF( ITYPE.GT.2 ) THEN * * Preset Singular Values. * IF( SIZE.NE.1 ) THEN H = ( ULP-1 ) / ( SIZE-1 ) DO 30 I = 1, SIZE WORK( PTRD+I-1 ) = 1 + H*( I-1 ) 30 CONTINUE ELSE WORK( PTRD ) = 1 END IF * IF( ITYPE.EQ.3 ) THEN * * Diagonal Matrix with specified singular values. * CALL PDLASET( 'All', M, N, ZERO, ZERO, WORK( PTRA ), $ IA, JA, DESCA ) * DO 40 I = 1, SIZE CALL PDELSET( WORK( PTRA ), I, I, DESCA, $ WORK( PTRD+I-1 ) ) 40 CONTINUE * ELSE IF( ITYPE.EQ.4 ) THEN * * General matrix with specified singular values. * CALL PDLAGGE( M, N, WORK( PTRD ), WORK( PTRA ), IA, JA, $ DESCA, ISEED, SIZE, WORK( PTRWORK ), $ LLWORK, INFO ) * ELSE IF( ITYPE.EQ.5 ) THEN * * Singular values scaled by overflow. * CALL DSCAL( SIZE, RTOVFL, WORK( PTRD ), 1 ) * CALL PDLAGGE( M, N, WORK( PTRD ), WORK( PTRA ), IA, JA, $ DESCA, ISEED, SIZE, WORK( PTRWORK ), $ LLWORK, INFO ) * ELSE IF( ITYPE.EQ.6 ) THEN * * Singular values scaled by underflow. * CALL DSCAL( SIZE, RTUNFL, WORK( PTRD ), 1 ) CALL PDLAGGE( M, N, WORK( PTRD ), WORK( PTRA ), IA, JA, $ DESCA, ISEED, SIZE, WORK( PTRWORK ), $ LLWORK, INFO ) * END IF * END IF * * Set mapping between JOBTYPE and calling parameters of * PDGESVD, reset pointers to WORK array to save space. * DO 80 JOBTYPE = 1, 4 * IF( JOBTYPE.EQ.1 ) THEN JOBU = 'V' JOBVT = 'V' PTRVT = PTRU + LDU*SIZEQ PTRUC = PTRVT + LDVT*NQ PTRWORK = PTRUC + LDU*SIZEQ LLWORK = LWORK - PTRWORK + 1 ELSE IF( JOBTYPE.EQ.2 ) THEN JOBU = 'V' JOBVT = 'N' ELSE IF( JOBTYPE.EQ.3 ) THEN JOBU = 'N' JOBVT = 'V' PTRVTC = PTRUC PTRWORK = PTRVTC + LDVT*NQ LLWORK = LWORK - PTRWORK + 1 ELSE IF( JOBTYPE.EQ.4 ) THEN JOBU = 'N' JOBVT = 'N' PTRWORK = PTRUC LLWORK = LWORK - PTRWORK + 1 END IF * * Duplicate matrix A. * CALL PDLACPY( 'A', M, N, WORK( PTRA ), IA, JA, DESCA, $ WORK( PTRAC ), IA, JA, DESCA ) * * Test SVD calculation with minimum amount of workspace * calculated earlier. * IF( JOBTYPE.EQ.1 ) THEN * * Run SVD. * CALL SLBOOT CALL BLACS_BARRIER( CONTEXT, 'All' ) CALL SLTIMER( 1 ) * CALL PDGESVD( JOBU, JOBVT, M, N, WORK( PTRAC ), IA, JA, $ DESCA, WORK( PTRS ), WORK( PTRU ), IU, JU, $ DESCU, WORK( PTRVT ), IVT, JVT, DESCVT, $ WORK( PTRWORK ), WPDGESVD, INFO ) * CALL SLTIMER( 1 ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 1, 1, CTIME ) * * Check INFO. Different INFO for different processes mean * something went wrong. * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, $ 1, -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), $ 1, 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' GO TO 120 END IF END IF * * If INFO is negative PXERBLA tells you. So the only thing * is to check for positive INFO -- detected heterogeneous * system. * IF( INFO.EQ.( SIZE+1 ) ) THEN HETERO = 'P' SETHET = 1 END IF * * If INFO was fine do more exhaustive check. * IF( INFO.EQ.ZERO ) THEN * DO 50 I = 1, SIZE WORK( I+PTRWORK ) = WORK( I+PTRS-1 ) WORK( I+SIZE+PTRWORK ) = WORK( I+PTRS-1 ) 50 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', SIZE, 1, $ WORK( 1+PTRWORK ), SIZE, 1, 1, -1, -1, $ 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', SIZE, 1, $ WORK( 1+SIZE+PTRWORK ), SIZE, 1, 1, -1, $ -1, 0 ) * DO 60 I = 1, SIZE IF( ABS( WORK( I+PTRWORK )-WORK( SIZE+I+ $ PTRWORK ) ).GT.ZERO ) THEN WRITE( NOUT, FMT = * )'I= ', I, ' MIN=', $ WORK( I+PTRWORK ), ' MAX=', $ WORK( SIZE+I+PTRWORK ) HETERO = 'T' SETHET = 1 GO TO 70 END IF * 60 CONTINUE 70 CONTINUE * END IF * IF( SETHET.NE.1 ) $ HETERO = 'N' * * Need to copy A again since AC was overwritten by PDGESVD. * CALL PDLACPY( 'A', M, N, WORK( PTRA ), IA, JA, DESCA, $ WORK( PTRAC ), IA, JA, DESCA ) * * PDSVDCHK overwrites U. So before the call to PDSVDCHK * U is copied to UC and a pointer to UC is passed to * PDSVDCHK. * CALL PDLACPY( 'A', M, SIZE, WORK( PTRU ), IU, JU, DESCU, $ WORK( PTRUC ), IU, JU, DESCU ) * * Run tests 1 - 4. * CALL PDSVDCHK( M, N, WORK( PTRAC ), IA, JA, DESCA, $ WORK( PTRUC ), IU, JU, DESCU, $ WORK( PTRVT ), IVT, JVT, DESCVT, $ WORK( PTRS ), THRESH, WORK( PTRWORK ), $ LLWORK, RESULT, CHK, MTM ) * ELSE * * Once again test PDGESVD with min workspace. * CALL PDGESVD( JOBU, JOBVT, M, N, WORK( PTRAC ), IA, JA, $ DESCA, WORK( PTRSC ), WORK( PTRUC ), IU, $ JU, DESCU, WORK( PTRVTC ), IVT, JVT, $ DESCVT, WORK( PTRWORK ), WPDGESVD, INFO ) * CALL PDSVDCMP( M, N, JOBTYPE, WORK( PTRS ), $ WORK( PTRSC ), WORK( PTRU ), $ WORK( PTRUC ), IU, JU, DESCU, $ WORK( PTRVT ), WORK( PTRVTC ), IVT, JVT, $ DESCVT, THRESH, RESULT, DELTA, $ WORK( PTRWORK ), LLWORK ) * END IF * 80 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN DO 90 I = 1, 9 IF( RESULT( I ).EQ.1 ) THEN PASS = 1 WRITE( NOUT, FMT = * )'Test I = ', I, 'has failed' WRITE( NOUT, FMT = * )' ' END IF 90 CONTINUE IF( PASS.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'Passed', WTIME( 1 ), $ CTIME( 1 ), M, N, NPROW, NPCOL, NB, ITYPE, CHK, MTM, $ DELTA, HETERO END IF END IF 100 CONTINUE CALL BLACS_GRIDEXIT( CONTEXT ) 110 CONTINUE * 9999 FORMAT( A6, 2E10.3, 2I6, 2I4, I5, I6, 3F6.2, 4X, A1 ) 120 CONTINUE * * End of PDSVDTST * RETURN END scalapack-2.0.2/TESTING/EIG/pdsytdrv.f000644 000766 000024 00000040762 10363532303 017467 0ustar00juliestaff000000 000000 SUBROUTINE PDSYTDRV( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDSYTDRV computes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from Q, the * symmetric tridiagonal matrix T (or D and E), and TAU, which were * computed by PDSYTRD: sub( A ) := Q * T * Q'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of sub( A ). On entry, * if UPLO='U', the diagonal and first superdiagonal of sub( A ) * have the corresponding elements of the tridiagonal matrix T, * and the elements above the first superdiagonal, with the * array TAU, represent the orthogonal matrix Q as a product of * elementary reflectors, and the strictly lower triangular part * of sub( A ) is not referenced. If UPLO='L', the diagonal and * first subdiagonal of sub( A ) have the corresponding elements * of the tridiagonal matrix T, and the elements below the first * subdiagonal, with the array TAU, represent the orthogonal * matrix Q as a product of elementary reflectors, and the * strictly upper triangular part of sub( A ) is not referenced. * On exit, if UPLO = 'U', the upper triangular part of the * distributed symmetric matrix sub( A ) is recovered. * If UPLO='L', the lower triangular part of the distributed * symmetric matrix sub( A ) is recovered. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local input) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local input) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local input) DOUBLE PRECISION, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK >= 2 * NB *( NB + NP ) * * where NB = MB_A = NB_A, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * On exit, if INFO <> 0, a discrepancy has been found between * the diagonal and off-diagonal elements of A and the copies * contained in the arrays D and E. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION EIGHT, HALF, ONE, ZERO PARAMETER ( EIGHT = 8.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, IACOL, IAROW, ICTXT, II, IPT, IPV, IPX, $ IPY, J, JB, JJ, JL, K, MYCOL, MYROW, NB, NP, $ NPCOL, NPROW DOUBLE PRECISION ADDBND, D1, D2, E1, E2 * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCV( DLEN_ ), $ DESCT( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, IGSUM2D, $ PDELGET, PDGEMM, PDLACPY, $ PDLARFT, PDLASET, PDSYMM, $ PDSYR2K, PDTRMM * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 NB = DESCA( MB_ ) UPPER = LSAME( UPLO, 'U' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) * IPT = 1 IPV = NB * NB + IPT IPX = NB * NP + IPV IPY = NB * NP + IPX * CALL DESCSET( DESCD, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * ADDBND = EIGHT * PDLAMCH( ICTXT, 'eps' ) * IF( UPPER ) THEN * CALL DESCSET( DESCE, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 10 J = 0, N-1 D1 = ZERO E1 = ZERO D2 = ZERO E2 = ZERO CALL PDELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PDELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(N-1) ) THEN CALL PDELGET( ' ', ' ', E2, E, 1, JA+J+1, DESCE ) CALL PDELGET( 'Columnwise', ' ', E1, A, IA+J, JA+J+1, $ DESCA ) END IF * IF( ( ABS( D1 - D2 ).GT.( ABS( D2 ) * ADDBND ) ) .OR. $ ( ABS( E1 - E2 ).GT.( ABS( E2 ) * ADDBND ) ) ) $ INFO = INFO + 1 10 CONTINUE * * Compute the upper triangle of sub( A ). * CALL DESCSET( DESCV, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCT, NB, NB, NB, NB, IAROW, IACOL, ICTXT, NB ) * DO 20 K = 0, N-1, NB JB = MIN( NB, N-K ) I = IA + K J = JA + K * * Compute the lower triangular matrix T. * CALL PDLARFT( 'Backward', 'Columnwise', K+JB-1, JB, A, IA, $ J, DESCA, TAU, WORK( IPT ), WORK( IPV ) ) * * Copy Householder vectors into WORK( IPV ). * CALL PDLACPY( 'All', K+JB-1, JB, A, IA, J, DESCA, $ WORK( IPV ), 1, 1, DESCV ) * IF( K.GT.0 ) THEN CALL PDLASET( 'Lower', JB+1, JB, ZERO, ONE, WORK( IPV ), $ K, 1, DESCV ) ELSE CALL PDLASET( 'Lower', JB, JB-1, ZERO, ONE, WORK( IPV ), $ 1, 2, DESCV ) CALL PDLASET( 'Ge', JB, 1, ZERO, ZERO, WORK( IPV ), 1, $ 1, DESCV ) END IF * * Zero out the strict upper triangular part of A. * IF( K.GT.0 ) THEN CALL PDLASET( 'Ge', K-1, JB, ZERO, ZERO, A, IA, J, $ DESCA ) CALL PDLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, A, I-1, $ J+1, DESCA ) ELSE IF( JB.GT.1 ) THEN CALL PDLASET( 'Upper', JB-2, JB-2, ZERO, ZERO, A, IA, $ J+2, DESCA ) END IF * * (1) X := A * V * T' * CALL PDSYMM( 'Left', 'Upper', K+JB, JB, ONE, A, IA, JA, $ DESCA, WORK( IPV ), 1, 1, DESCV, ZERO, $ WORK( IPX ), 1, 1, DESCV ) CALL PDTRMM( 'Right', 'Lower', 'Transpose', 'Non-Unit', $ K+JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPX ), 1, 1, DESCV ) * * (2) X := X - 1/2 * V * (T * V' * X) * CALL PDGEMM( 'Transpose', 'No transpose', JB, JB, K+JB, ONE, $ WORK( IPV ), 1, 1, DESCV, WORK( IPX ), 1, 1, $ DESCV, ZERO, WORK( IPY ), 1, 1, DESCT ) CALL PDTRMM( 'Left', 'Lower', 'No transpose', 'Non-Unit', $ JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPY ), 1, 1, DESCT ) CALL PDGEMM( 'No tranpose', 'No transpose', K+JB, JB, JB, $ -HALF, WORK( IPV ), 1, 1, DESCV, WORK( IPY ), $ 1, 1, DESCT, ONE, WORK( IPX ), 1, 1, DESCV ) * * (3) A := A - X * V' - V * X' * CALL PDSYR2K( 'Upper', 'No transpose', K+JB, JB, -ONE, $ WORK( IPV ), 1, 1, DESCV, WORK( IPX ), 1, 1, $ DESCV, ONE, A, IA, JA, DESCA ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) DESCT( CSRC_ ) = MOD( DESCT( CSRC_ ) + 1, NPCOL ) * 20 CONTINUE * ELSE * CALL DESCSET( DESCE, 1, JA+N-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 30 J = 0, N-1 D1 = ZERO E1 = ZERO D2 = ZERO E2 = ZERO CALL PDELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PDELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(N-1) ) THEN CALL PDELGET( ' ', ' ', E2, E, 1, JA+J, DESCE ) CALL PDELGET( 'Columnwise', ' ', E1, A, IA+J+1, JA+J, $ DESCA ) END IF * IF( ( ABS( D1 - D2 ).GT.( ABS( D2 ) * ADDBND ) ) .OR. $ ( ABS( E1 - E2 ).GT.( ABS( E2 ) * ADDBND ) ) ) $ INFO = INFO + 1 30 CONTINUE * * Compute the lower triangle of sub( A ). * JL = MAX( ( ( JA+N-2 ) / NB ) * NB + 1, JA ) IACOL = INDXG2P( JL, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCV, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCT, NB, NB, NB, NB, INDXG2P( IA+JL-JA+1, NB, $ MYROW, DESCA( RSRC_ ), NPROW ), IACOL, ICTXT, $ NB ) * DO 40 J = JL, JA, -NB K = J - JA + 1 I = IA + K - 1 JB = MIN( N-K+1, NB ) * * Compute upper triangular matrix T from TAU. * CALL PDLARFT( 'Forward', 'Columnwise', N-K, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPV ) ) * * Copy Householder vectors into WORK( IPV ). * CALL PDLACPY( 'Lower', N-K, JB, A, I+1, J, DESCA, $ WORK( IPV ), K+1, 1, DESCV ) CALL PDLASET( 'Upper', N-K, JB, ZERO, ONE, WORK( IPV ), $ K+1, 1, DESCV ) CALL PDLASET( 'Ge', 1, JB, ZERO, ZERO, WORK( IPV ), K, 1, $ DESCV ) * * Zero out the strict lower triangular part of A. * CALL PDLASET( 'Lower', N-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * (1) X := A * V * T' * CALL PDSYMM( 'Left', 'Lower', N-K+1, JB, ONE, A, I, J, $ DESCA, WORK( IPV ), K, 1, DESCV, ZERO, $ WORK( IPX ), K, 1, DESCV ) CALL PDTRMM( 'Right', 'Upper', 'Transpose', 'Non-Unit', $ N-K+1, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPX ), K, 1, DESCV ) * * (2) X := X - 1/2 * V * (T * V' * X) * CALL PDGEMM( 'Transpose', 'No transpose', JB, JB, N-K+1, $ ONE, WORK( IPV ), K, 1, DESCV, WORK( IPX ), $ K, 1, DESCV, ZERO, WORK( IPY ), 1, 1, DESCT ) CALL PDTRMM( 'Left', 'Upper', 'No transpose', 'Non-Unit', $ JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPY ), 1, 1, DESCT ) CALL PDGEMM( 'No transpose', 'No transpose', N-K+1, JB, JB, $ -HALF, WORK( IPV ), K, 1, DESCV, WORK( IPY ), $ 1, 1, DESCT, ONE, WORK( IPX ), K, 1, DESCV ) * * (3) A := A - X * V' - V * X' * CALL PDSYR2K( 'Lower', 'No tranpose', N-K+1, JB, -ONE, $ WORK( IPV ), K, 1, DESCV, WORK( IPX ), K, 1, $ DESCV, ONE, A, I, J, DESCA ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCT( RSRC_ ) = MOD( DESCT( RSRC_ ) + NPROW - 1, NPROW ) DESCT( CSRC_ ) = MOD( DESCT( CSRC_ ) + NPCOL - 1, NPCOL ) * 40 CONTINUE * END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * RETURN * * End of PDSYTDRV * END scalapack-2.0.2/TESTING/EIG/pdtrddriver.f000644 000766 000024 00000046646 10363532303 020150 0ustar00juliestaff000000 000000 PROGRAM PDTRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * Purpose * ======== * * PDTRDDRIVER is the main test program for the DOUBLE PRECISION * SCALAPACK TRD (symmetric tridiagonal reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 13 lines: * 'ScaLAPACK TRD computation input file' * 'PVM machine' * 'TRD.out' output file name * 6 device out * 'L' define Lower or Upper * 3 number of problems sizes * 5 31 201 values of N * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * DBLESZ INTEGER, default = 8 bytes. * INTGSZ and DBLESZ indicate the length in bytes on the * given platform for an integer and a double precision real. * MEM DOUBLE PRECISION array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, TOTMEM, MEMSIZ, NTESTS DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20, $ PADVAL = -9923.0D+0 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPT, IPW, ITEMP, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LCM, LWORK, MYCOL, $ MYROW, N, NB, NDIAG, NGRIDS, NMAT, NNB, NOFFD, $ NOUT, NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ, $ WORKTRD REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), MEM( MEMSIZ ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PDCHEKPAD, $ PDFILLPAD, PDLAFCHK, PDMATGEN, PDSYTDRV, $ PDSYTRD, PDTRDINFO, PDTTRDTESTER, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PDLANSY EXTERNAL LSAME, ICEIL, ILCM, NUMROC, PDLANSY * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )STOP * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PDTRDINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * NDIAG = NQ IF( LSAME( UPLO, 'U' ) ) THEN NOFFD = NQ ELSE NOFFD = NUMROC( N-1, NB, MYCOL, 0, NPCOL ) END IF * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPT = IPE + NOFFD + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * LWORK = MAX( NB*( NP+1 ), 3*NB ) WORKTRD = LWORK + IPOSTPAD WORKSIZ = WORKTRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN ITEMP = 2*NQ + NP IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) ITEMP = NB*ICEIL( ICEIL( NP, NB ), LCM / NPROW ) + $ ITEMP END IF ITEMP = MAX( ITEMP, 2*( NB+NP )*NB ) WORKSIZ = MAX( LWORK, ITEMP ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Tridiagonal reduction', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PDMATGEN( ICTXT, 'Symm', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), NQ, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANSY( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to symmetric tridiagonal form * CALL PDSYTRD( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDSYTRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTRD', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTRD', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTRD', WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - QTQ'|| / (||A|| * N * eps) * CALL PDSYTDRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), IERR( 1 ) ) CALL PDLAFCHK( 'Symm', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDSYTDRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTDRV', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTDRV', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTDRV', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ. $ 0.0D+0 .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 )FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * )'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * TRD requires 4/3 N^3 floating point operations * NOPS = DBLE( N ) * NOPS = ( 4.0D+0 / 3.0D+0 )*NOPS**3 NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', UPLO, N, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', UPLO, N, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, FRESID, PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * CALL PDTTRDTESTER( IAM, NPROCS, CHECK, NOUT, THRESH, NVAL, NMAT, $ MEM, TOTMEM, KPASS, KFAIL, KSKIP ) * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UPLO N NB P Q TRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ---- ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, A4, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, F9.2, 1X, $ F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*T*Q''|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PDTRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pdtrdinfo.f000644 000766 000024 00000032241 10363532303 017572 0ustar00juliestaff000000 000000 SUBROUTINE PDTRDINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NNB, NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY*(*) INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PDTRDINFO gets needed startup information for the symmetric * tridiagonal reduction and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * MAX( 5, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='TRD.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GOTO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GOTO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GOTO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GOTO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS IF( LSAME( UPLO, 'L' ) ) THEN WORK( 4 ) = 1 ELSE WORK( 4 ) = 2 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Reduction Routine to symmetric '// $ 'tridiagonal form.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real double precision symmetric '// $ 'tridiagonal' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - QTQ''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether the ''Upper'' or ''Low'// $ 'er'' part of A is to be referenced.' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less'// $ 'than THRESH, CHECK is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'TRD time : Time in seconds to reduce the'// $ ' matrix to tridiagonal form.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for '// $ 'symmetric tridiagonal reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) IF( WORK( 4 ).EQ.1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDTRDINFO * END scalapack-2.0.2/TESTING/EIG/pdttrdtester.f000644 000766 000024 00000061622 10363532303 020336 0ustar00juliestaff000000 000000 SUBROUTINE PDTTRDTESTER( IAM, NPROCS, CHECK, NOUT, THRESH, NVAL, $ NMAT, MEM, TOTMEM, KPASS, KFAIL, KSKIP ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * February 24, 2000 * * .. Scalar Arguments .. LOGICAL CHECK INTEGER IAM, KFAIL, KPASS, KSKIP, NMAT, NOUT, NPROCS, $ TOTMEM REAL THRESH * .. * .. Array Arguments .. INTEGER NVAL( * ) DOUBLE PRECISION MEM( * ) * .. * * Purpose * ======= * * PDTTRDTESTER tests PDSYTTRD * * Arguments * ========= * * IAM (local input) INTEGER * The local process number * * NPROCS (global input) INTEGER * The number of processors * * CHECK (global input) LOGICAL * Specifies whether the user wants to check the answer * * NOUT (local input) INTEGER * File descriptor * * THRESH (global input) DOUBLE PRECISION * Acceptable error threshold * * NVAL (global input) INTEGER array dimension NMAT * The matrix sizes to test * * NMAT (global input) INTEGER * The number of matrix sizes to test * * MEM (local input) DOUBLE PRECISION array dimension MEMSIZ * Where: * MEMSIZ = TOTMEM / DBLESZ * * TOTMEM (global input) INTEGER * Number of bytes in MEM * * KPASS (local input/output) INTEGER * The number of tests which passed. Only relevant on * processor 0. * * KFAIL (local input/output) INTEGER * The number of tests which failed. Only relevant on * processor 0. * * KSKIP (local input/output) INTEGER * The number of tests which were skipped. Only relevant on * processor 0. * * ================================================================ * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ DOUBLE PRECISION PADVAL PARAMETER ( DBLESZ = 8, PADVAL = -9923.0D+0 ) INTEGER TIMETESTS PARAMETER ( TIMETESTS = 11 ) INTEGER TESTS PARAMETER ( TESTS = 8 ) INTEGER MINTIMEN PARAMETER ( MINTIMEN = 8 ) * .. * .. Local Scalars .. LOGICAL TIME CHARACTER UPLO CHARACTER*6 PASSED INTEGER DUMMY, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPT, IPW, ITEMP, J, K, $ LCM, LWMIN, MAXTESTS, MEMSIZ, MYCOL, MYROW, N, $ NB, NDIAG, NGRIDS, NN, NOFFD, NP, NPCOL, NPROW, $ NPS, NQ, SPLITSTIMED, WORKSIZ, WORKTRD DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER ANBTEST( TESTS ), ANBTIME( TIMETESTS ), $ BALTEST( TESTS ), BALTIME( TIMETESTS ), $ DESCA( DLEN_ ), DESCD( DLEN_ ), IERR( 1 ), $ INTERTEST( TESTS ), INTERTIME( TIMETESTS ), $ PNBTEST( TESTS ), PNBTIME( TIMETESTS ), $ TWOGEMMTEST( TESTS ), TWOGEMMTIME( TIMETESTS ) DOUBLE PRECISION CTIME( 100 ), WTIME( 100 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, DESCINIT, $ IGEBR2D, IGEBS2D, IGSUM2D, PDCHEKPAD, $ PDFILLPAD, PDLAFCHK, PDLATRAN, PDMATGEN, $ PDSYTDRV, PDSYTTRD, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC, PJLAENV DOUBLE PRECISION PDLANSY EXTERNAL LSAME, ICEIL, ILCM, NUMROC, PJLAENV, PDLANSY * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, SQRT * .. * * .. Scalars in Common .. INTEGER ANB, BALANCED, BCKBLOCK, GSTBLOCK, INTERLEAVE, $ LLTBLOCK, MINSZ, PNB, TIMEINTERNALS, TIMING, $ TRSBLOCK, TWOGEMMS * .. * .. Common blocks .. COMMON / BLOCKSIZES / GSTBLOCK, LLTBLOCK, BCKBLOCK, $ TRSBLOCK COMMON / MINSIZE / MINSZ COMMON / PJLAENVTIMING / TIMING COMMON / TAILOREDOPTS / PNB, ANB, INTERLEAVE, $ BALANCED, TWOGEMMS COMMON / TIMECONTROL / TIMEINTERNALS * .. * .. Data statements .. DATA BALTIME / 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0 / DATA INTERTIME / 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1 / DATA TWOGEMMTIME / 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0 / DATA ANBTIME / 16, 16, 16, 16, 16, 8, 8, 32, 32, 16, $ 16 / DATA PNBTIME / 32, 32, 32, 32, 32, 32, 32, 32, 32, $ 16, 64 / DATA BALTEST / 0, 0, 0, 0, 1, 1, 1, 1 / DATA INTERTEST / 0, 0, 1, 1, 0, 0, 1, 1 / DATA TWOGEMMTEST / 0, 1, 0, 1, 0, 1, 0, 1 / DATA ANBTEST / 1, 2, 3, 16, 1, 2, 3, 16 / DATA PNBTEST / 1, 16, 8, 1, 16, 8, 1, 16 / * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * IASEED = 0 SPLITSTIMED = 0 NB = 1 UPLO = 'L' MEMSIZ = TOTMEM / DBLESZ * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * NGRIDS = INT( SQRT( DBLE( NPROCS ) ) ) * DO 30 NN = 1, NGRIDS * NPROW = NN NPCOL = NN IERR( 1 ) = 0 * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * IF( N.GT.MINTIMEN ) THEN * * For timing tests, we perform one or two extra tests. * Both of these extra tests are performed with the * default values for the performance tuning parameters. * The second extra test (which is only performed if * split times are non-zero) is performed with timeinternals * set to 1 (which forces barrier syncs between many * phases of the computation). * TIME = .TRUE. MAXTESTS = TIMETESTS + 2 ELSE TIME = .FALSE. MAXTESTS = TESTS END IF * * DO 10 K = 1, MAXTESTS TIMEINTERNALS = 0 IF( TIME ) THEN IF( K.GE.MAXTESTS-1 ) THEN * * For the last two timings, we let pjlaenv set * the execution path values. These dummy * initializations aren't really necessary, * but they illustrate the fact that these values are * set in xpjlaenv. The dummy call to pjlaenv * has the side effect of setting ANB. * MINSZ = -13 BALANCED = -13 INTERLEAVE = -13 TWOGEMMS = -13 ANB = -13 PNB = -13 TIMING = 1 DUMMY = PJLAENV( ICTXT, 3, 'PDSYTTRD', 'L', 0, 0, $ 0, 0 ) IF( K.EQ.MAXTESTS ) $ TIMEINTERNALS = 1 ELSE TIMING = 0 MINSZ = 1 BALANCED = BALTIME( K ) INTERLEAVE = INTERTIME( K ) TWOGEMMS = TWOGEMMTIME( K ) ANB = ANBTIME( K ) PNB = PNBTIME( K ) END IF ELSE TIMING = 0 MINSZ = 1 BALANCED = BALTEST( K ) INTERLEAVE = INTERTEST( K ) TWOGEMMS = TWOGEMMTEST( K ) ANB = ANBTEST( K ) PNB = PNBTEST( K ) END IF * * Skip the last test (with timeinternals = 1) if * PDSYTTRD is not collecting the split times. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, SPLITSTIMED, $ 1 ) ELSE CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, SPLITSTIMED, 1, $ 0, 0 ) END IF * * IF( SPLITSTIMED.EQ.0 .AND. K.EQ.MAXTESTS ) $ GO TO 10 * * The following hack tests to make sure that PNB need not * be the same on all processes. (Provided that PNB is set * to 1 in the TRD.dat file.) * IF( PNB.EQ.1 ) $ PNB = 1 + IAM * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * CALL DESCINIT( DESCD, 1, N, NB, NB, 0, 0, ICTXT, 1, $ INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * NDIAG = NQ IF( LSAME( UPLO, 'U' ) ) THEN NOFFD = NQ ELSE NOFFD = NUMROC( N-1, NB, MYCOL, 0, NPCOL ) END IF * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPT = IPE + NOFFD + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS * WORKTRD = LWMIN + IPOSTPAD WORKSIZ = WORKTRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN ITEMP = 2*NQ + NP IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) ITEMP = NB*ICEIL( ICEIL( NP, NB ), LCM / NPROW ) + $ ITEMP END IF ITEMP = MAX( ITEMP, 2*( NB+NP )*NB ) WORKSIZ = MAX( LWMIN, ITEMP ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Tridiagonal reduction', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * * * Generate the matrix A * CALL PDMATGEN( ICTXT, 'Hemm', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PDFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), NQ, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PDLANSY( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDLANSY', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDFILLPAD( ICTXT, WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to symmetric tridiagonal form * CALL PDSYTTRD( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), LWMIN, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDSYTTRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTTRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PDCHEKPAD( ICTXT, 'PDSYTTRDc', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTTRDd', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTTRDe', WORKTRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKTRD-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PDFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - QTQ'|| / (||A|| * N * eps) * CALL PDSYTDRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), IERR( 1 ) ) * * TTRD does not preserve the upper triangular part of A. * The following call to PDLATRAN means that we only * check the lower triangular part of A - QTQ' * CALL PDLATRAN( N, 1, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PDLAFCHK( 'Hemm', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PDCHEKPAD( ICTXT, 'PDSYTDRVf', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTDRVg', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTDRVh', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PDCHEKPAD( ICTXT, 'PDSYTDRVi', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ. $ 0.0D+0 .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9991 )FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' * * END IF * * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * )'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 50, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 50, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * TRD requires 16/3 N^3 floating point operations * NOPS = DBLE( N ) NOPS = ( 16.0D+0 / 3.0D+0 )*NOPS**3 NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9992 )'WALL', N, INTERLEAVE, $ TWOGEMMS, BALANCED, ANB, PNB, NPROW*NPCOL, $ WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9992 )'CPU ', N, INTERLEAVE, $ TWOGEMMS, BALANCED, ANB, PNB, NPROW*NPCOL, $ CTIME( 1 ), TMFLOPS, FRESID, PASSED * * * If split times were collected (in PDSYttrd.f), print * them out. * IF( WTIME( 13 )+WTIME( 15 )+WTIME( 16 ).GT.0.0D+0 .OR. $ CTIME( 13 )+CTIME( 15 )+CTIME( 16 ).GT.0.0D+0 ) $ THEN SPLITSTIMED = 1 END IF IF( SPLITSTIMED.EQ.1 ) THEN WRITE( NOUT, FMT = 9990 )WTIME( 10 ), WTIME( 11 ), $ WTIME( 12 ), WTIME( 13 ), WTIME( 14 ), $ WTIME( 15 ) WRITE( NOUT, FMT = 9989 )WTIME( 16 ), WTIME( 17 ), $ WTIME( 18 ), WTIME( 19 ), WTIME( 20 ), $ WTIME( 21 ) * WRITE( NOUT, FMT = 9988 )CTIME( 10 ), CTIME( 11 ), $ CTIME( 12 ), CTIME( 13 ), CTIME( 14 ), $ CTIME( 15 ) WRITE( NOUT, FMT = 9987 )CTIME( 16 ), CTIME( 17 ), $ CTIME( 18 ), CTIME( 19 ), CTIME( 20 ), $ CTIME( 21 ) WRITE( NOUT, FMT = 9986 )N, NPROW*NPCOL, PNB, ANB, $ INTERLEAVE, BALANCED, TWOGEMMS, TIMEINTERNALS END IF END IF 10 CONTINUE 20 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN IF( SPLITSTIMED.EQ.1 ) THEN WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = 9974 ) WRITE( NOUT, FMT = 9973 ) END IF END IF * * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE RETURN * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) * 9995 FORMAT( 'PDSYTTRD, tailored reduction to tridiagonal form, test.' $ ) 9994 FORMAT( 'TIME N int 2gm bal anb pnb prcs TRD Time ', $ ' MFLOPS Residual CHECK' ) 9993 FORMAT( '---- ---- --- --- --- --- --- ---- -------- ', $ '----------- -------- ------' ) 9992 FORMAT( A4, 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 1X, $ I5, 1X, F9.2, 1X, F11.2, 1X, F8.2, 1X, A6 ) 9991 FORMAT( '||A - Q*T*Q''|| / (||A|| * N * eps) = ', G25.7 ) 9990 FORMAT( 'wsplit1=[wsplit1;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9989 FORMAT( 'wsplit2=[wsplit2;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9988 FORMAT( 'csplit1=[csplit1;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9987 FORMAT( 'csplit2=[csplit2;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9986 FORMAT( 'size_opts=[size_opts;', I4, 1X, I4, 1X, I4, 1X, I4, 1X, $ I4, 1X, I4, 1X, I4, 1X, I4, 1X, ' ];' ) 9985 FORMAT( 'N=1; NPROCS=2; PNB=3; ANB=4; INTERLEAVE=5; BALANCED=6;', $ ' TWOGEMMS=7; TIMEINTERNALS=8;' ) 9984 FORMAT( 'S1_OVERHEAD = 1; % Should be mainly cost of barrier' ) 9983 FORMAT( 'S1_BARRIER = 2; % Cost of barrier' ) 9982 FORMAT( 'S1_UPDCURCOL = 3; % Update the current column' ) 9981 FORMAT( 'S1_HOUSE = 4; % Compute the householder vector' ) 9980 FORMAT( 'S1_SPREAD = 5; % Spread across' ) 9979 FORMAT( 'S1_TRANSPOSE = 6; % Transpose' ) 9978 FORMAT( 'S2_UPDCURBLK = 1; % Update the current block column' ) 9977 FORMAT( 'S2_TRMVT = 2; % TRMVT v = A * h; vt = ht * A'' ' ) 9976 FORMAT( 'S2_UPD_V = 3; % v = v + V * HT * h + H * VT * h ' ) 9975 FORMAT( 'S2_TRANS_SUM = 4; % v = v + vt'' ' ) 9974 FORMAT( 'S2_DOT = 5; % c = v'' * h ' ) 9973 FORMAT( 'S2_R2K = 6; % A = A - v * h'' - h * v'' ' ) * * * End of PDTTRDTESTER * END scalapack-2.0.2/TESTING/EIG/pmatgeninc.f000644 000766 000024 00000020031 10363532303 017720 0ustar00juliestaff000000 000000 * ===================================================================== * SUBROUTINE LADD * ===================================================================== * SUBROUTINE LADD( J, K, I ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. INTEGER I(2), J(2), K(2) * .. * * ===================================================================== * * .. Parameters .. INTEGER IPOW16, IPOW15 PARAMETER ( IPOW16=2**16, IPOW15=2**15 ) * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * I(1) = MOD( K(1)+J(1), IPOW16 ) I(2) = MOD( (K(1)+J(1)) / IPOW16+K(2)+J(2), IPOW15 ) * RETURN * * End of LADD * END * * ===================================================================== * SUBROUTINE LMUL * ===================================================================== * SUBROUTINE LMUL( K, J, I ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. INTEGER I(2), J(2), K(2) * .. * * ===================================================================== * * .. Parameters .. INTEGER IPOW15, IPOW16, IPOW30 PARAMETER ( IPOW15=2**15, IPOW16=2**16, IPOW30=2**30 ) * .. * .. Local Scalars .. INTEGER KT, LT * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * KT = K(1)*J(1) IF( KT.LT.0 ) KT = (KT+IPOW30) + IPOW30 I(1) = MOD(KT,IPOW16) LT = K(1)*J(2) + K(2)*J(1) IF( LT.LT.0 ) LT = (LT+IPOW30) + IPOW30 KT = KT/IPOW16 + LT IF( KT.LT.0 ) KT = (KT+IPOW30) + IPOW30 I(2) = MOD( KT, IPOW15 ) * RETURN * * End of LMUL * END * * ===================================================================== * SUBROUTINE XJUMPM * ===================================================================== * SUBROUTINE XJUMPM( JUMPM, MULT, IADD, IRANN, IRANM, IAM, ICM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER JUMPM * .. * .. Array Arguments .. INTEGER IADD(2), IAM(2), ICM(2), IRANM(2), IRANN(2) INTEGER MULT(2) * .. * * ===================================================================== * * .. Local Scalars .. INTEGER I * .. * .. Local Arrays .. INTEGER J(2) * .. * .. External Subroutines .. EXTERNAL LADD, LMUL * .. * .. Executable Statements .. * IF( JUMPM.GT.0 ) THEN DO 10 I = 1, 2 IAM(I) = MULT(I) ICM(I) = IADD(I) 10 CONTINUE DO 20 I = 1, JUMPM-1 CALL LMUL( IAM, MULT, J ) IAM(1) = J(1) IAM(2) = J(2) CALL LMUL( ICM, MULT, J ) CALL LADD( IADD, J, ICM ) 20 CONTINUE CALL LMUL( IRANN, IAM, J ) CALL LADD( J, ICM, IRANM ) ELSE IRANM(1) = IRANN(1) IRANM(2) = IRANN(2) END IF * RETURN * * End of XJUMPM * END * * ===================================================================== * SUBROUTINE SETRAN * ===================================================================== * SUBROUTINE SETRAN( IRAN, IA, IC ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. INTEGER IA(2), IC(2), IRAN(2) * .. * * ===================================================================== * * .. Local Scalars .. INTEGER I * .. * .. Local Arrays .. INTEGER IAS(2), ICS(2), IRAND(2) * .. * .. Common Blocks .. COMMON /RANCOM/ IRAND, IAS, ICS SAVE /RANCOM/ * .. * .. Executable Statements .. * DO 10 I = 1, 2 IRAND(I) = IRAN(I) IAS(I) = IA(I) ICS(I) = IC(I) 10 CONTINUE * RETURN * * End of SETRAN * END * * ===================================================================== * SUBROUTINE JUMPIT * ===================================================================== * SUBROUTINE JUMPIT( MULT, IADD, IRANN, IRANM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. INTEGER IADD(2), IRANM(2), IRANN(2), MULT(2) * .. * * ===================================================================== * * .. Local Arrays .. INTEGER IAS(2), ICS(2), IRAND(2), J(2) * .. * .. External Subroutines .. EXTERNAL LADD, LMUL * .. * .. Common Blocks .. COMMON /RANCOM/ IRAND, IAS, ICS SAVE /RANCOM/ * .. * .. Executable Statements .. * CALL LMUL( IRANN, MULT, J ) CALL LADD( J, IADD, IRANM ) * IRAND(1) = IRANM(1) IRAND(2) = IRANM(2) * RETURN * * End of JUMPIT * END * * ===================================================================== * REAL FUNCTION PSRAND * ===================================================================== * REAL FUNCTION PSRAND( IDUMM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * ===================================================================== * * .. Parameters .. REAL DIVFAC, POW16 PARAMETER ( DIVFAC=2.147483648E+9, POW16=6.5536E+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL LADD, LMUL * .. * .. Intrinsic Functions .. INTRINSIC REAL * .. * .. Common Blocks .. INTEGER IAS(2), ICS(2), IRAND(2) COMMON /RANCOM/ IRAND, IAS, ICS SAVE /RANCOM/ * .. * .. Executable Statements .. * PSRAND = ( REAL(IRAND(1)) + POW16 * REAL(IRAND(2)) ) / DIVFAC * CALL LMUL( IRAND, IAS, J ) CALL LADD( J, ICS, IRAND ) * RETURN * * End of PSRAND * END * * ===================================================================== * DOUBLE PRECISION FUNCTION PDRAND * ===================================================================== * DOUBLE PRECISION FUNCTION PDRAND( IDUMM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION DIVFAC, POW16 PARAMETER ( DIVFAC=2.147483648D+9, POW16=6.5536D+4 ) * .. * .. Local Arrays .. INTEGER J(2) * .. * .. External Subroutines .. EXTERNAL LADD, LMUL * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Common Blocks .. INTEGER IAS(2), ICS(2), IRAND(2) COMMON /RANCOM/ IRAND, IAS, ICS SAVE /RANCOM/ * .. * .. Executable Statements .. * PDRAND = ( DBLE(IRAND(1)) + POW16 * DBLE(IRAND(2)) ) / DIVFAC * CALL LMUL( IRAND, IAS, J ) CALL LADD( J, ICS, IRAND ) * RETURN * * End of PDRAND * END scalapack-2.0.2/TESTING/EIG/psbrddriver.f000644 000766 000024 00000047431 10363532303 020136 0ustar00juliestaff000000 000000 PROGRAM PSBRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * Purpose * ======= * * PSBRDDRIVER is the main test program for the REAL * ScaLAPACK BRD (bidiagonal reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 13 lines: * 'ScaLAPACK BRD computation input file' * 'PVM machine' * 'BRD.out' output file name * 6 device out * 3 number of problems sizes * 16 20 18 values of M * 16 18 20 values of N * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 1.0 threshold * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER MEMSIZ, NTESTS, REALSZ, TOTMEM REAL PADVAL PARAMETER ( REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPTP, IPTQ, IPW, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LWORK, M, MAXMN, $ MINMN, MNP, MNQ, MP, MYCOL, MYROW, N, NB, $ NDIAG, NGRIDS, NMAT, NNB, NOFFD, NOUT, NPCOL, $ NPROCS, NPROW, NQ, WORKBRD, WORKSIZ REAL ANORM, FRESID, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ MVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSCHEKPAD, $ PSBRDINFO, PSFILLPAD, PSLAFCHK, $ PSMATGEN, PSGEBDRV, PSGEBRD, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PSLANGE EXTERNAL ICEIL, NUMROC, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PSBRDINFO( OUTFILE, NOUT, NMAT, MVAL, NTESTS, NVAL, NTESTS, $ NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * * Go to bottom of loop if this case doesn't use my process * DO 20 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) MNP = NUMROC( MIN( M, N ), NB, MYROW, 0, NPROW ) MNQ = NUMROC( MIN( M, N ), NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP )+IMIDPAD, IERR( 1 ) ) * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IF( M.GE.N ) THEN NDIAG = MNQ NOFFD = MNP ELSE NDIAG = MNP NOFFD = NUMROC( MIN( M, N )-1, NB, MYCOL, 0, NPCOL ) END IF * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPTQ = IPE + NOFFD + IPOSTPAD + IPREPAD IPTP = IPTQ + MNQ + IPOSTPAD + IPREPAD IPW = IPTP + MNP + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * LWORK = NB*( MP+NQ+1 ) + NQ WORKBRD = LWORK + IPOSTPAD WORKSIZ = WORKBRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN WORKSIZ = MAX( LWORK, 2*NB*( MP+NQ+NB ) ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Bidiagonal reduction', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PSMATGEN( ICTXT, 'No', 'No', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, MNQ, 1, MEM( IPTQ-IPREPAD ), $ MNQ, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, MNP, 1, MEM( IPTP-IPREPAD ), $ MNP, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANGE( 'I', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKBRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKBRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to bidiagonal form * CALL PSGEBRD( M, N, MEM( IPA ), 1, 1, DESCA, MEM( IPD ), $ MEM( IPE ), MEM( IPTQ ), MEM( IPTP ), $ MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGEBRD', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEBRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEBRD', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEBRD', MNQ, 1, $ MEM( IPTQ-IPREPAD ), MNQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEBRD', MNP, 1, $ MEM( IPTP-IPREPAD ), MNP, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEBRD', WORKBRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKBRD-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A-Q*B*P|| / (||A|| * N * eps) * CALL PSGEBDRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPTQ ), $ MEM( IPTP ), MEM( IPW ), IERR( 1 ) ) CALL PSLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGEBDRV', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEBDRV', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEBDRV', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEBDRV', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ.0.0E+0 $ .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID * KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * ) $ 'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * BRD requires 8/3 N^3 floating point operations * MAXMN = MAX( M, N ) MINMN = MIN( M, N ) NOPS = 4.0D+0 * DBLE( MINMN ) * DBLE( MINMN ) * $ ( DBLE( MAXMN ) - DBLE( MINMN ) / 3.0D+0 ) NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, FRESID, PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N NB P Q BRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ------ ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, F9.2, 1X, $ F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*B*P|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PSBRDDRIVER * END scalapack-2.0.2/TESTING/EIG/psbrdinfo.f000644 000766 000024 00000032173 10363532303 017573 0ustar00juliestaff000000 000000 SUBROUTINE PSBRDINFO( SUMMRY, NOUT, NMAT, MVAL, LDMVAL, NVAL, $ LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDMVAL, LDNBVAL, LDNVAL, LDPVAL, $ LDQVAL, NGRIDS, NMAT, NNB, NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDMVAL ), NBVAL( LDNBVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), $ QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PSBRDINFO get needed startup information for the bidiagonal * reduction and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, output to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for M & N. * * MVAL (global output) INTEGER array, dimension (LDMVAL) * The values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M. LDMVAL >= NMAT. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N. LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * MAX( 5, LDMVAL+LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack * all input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, IGEBR2D, IGEBS2D, $ SCOPY * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'BRD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( UNIT = NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get values of M, N * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1. .OR. NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'M', LDMVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 1, 3, WORK, 1 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Bidiagonal reduction' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision bidiagonal ' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - Q B P''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less'// $ ' than THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'BRD time : Time in seconds to reduce the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for '// $ 'the bidiagonal reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9995 ) $ 'M ', ( MVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( MVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9995 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9999 ) ' ' WRITE( NOUT, FMT = 9996 ) EPS WRITE( NOUT, FMT = 9993 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 3, WORK, 1, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = 2*NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 CONTINUE WRITE( NOUT, FMT = 9998 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9997 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9996 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9995 FORMAT( 2X, A5, ': ', 10I6 ) 9994 FORMAT( ' ', 10I6 ) 9993 FORMAT( 'Routines pass computational tests if scaled residual is', $ ' less than ', G12.5 ) * * End of PSBRDINFO * END scalapack-2.0.2/TESTING/EIG/psgebdrv.f000644 000766 000024 00000046520 10363532303 017422 0ustar00juliestaff000000 000000 SUBROUTINE PSGEBDRV( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INFO, IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), D( * ), E( * ), TAUP( * ), TAUQ( * ), $ WORK( * ) * .. * * Purpose * ======= * * PSGEBDRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from sub( A ), * Q, P returned by PSGEBRD: * * sub( A ) := Q * B * P'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of sub( A ) * as returned by PSGEBRD. On exit, the original distribu- * ted matrix sub( A ) is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local input) REAL array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local input) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local input) REAL array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix Q. TAUQ * is tied to the distributed matrix A. See Further Details. * * TAUP (local input) REAL array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix P. TAUP * is tied to the distributed matrix A. See Further Details. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK >= 2*NB*( MP + NQ + NB ) * * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MP = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * On exit, if INFO <> 0, a discrepancy has been found between * the diagonal and off-diagonal elements of A and the copies * contained in the arrays D and E. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL EIGHT, ONE, ZERO PARAMETER ( EIGHT = 8.0E+0, ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, IIA, IL, IPTP, IPTQ, $ IPV, IPW, IPWK, IOFF, IV, J, JB, JJA, JL, JV, $ K, MN, MP, MYCOL, MYROW, NB, NPCOL, NPROW, NQ REAL ADDBND, D1, D2, E1, E2 * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCV( DLEN_ ), $ DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, IGSUM2D, INFOG2L, $ PSLACPY, PSLARFB, PSLARFT, PSLASET, $ PSELGET * .. * .. External Functions .. INTEGER INDXG2P, NUMROC REAL PSLAMCH EXTERNAL INDXG2P, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 NB = DESCA( MB_ ) IOFF = MOD( IA-1, NB ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IOFF, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+IOFF, NB, MYCOL, IACOL, NPCOL ) IPV = 1 IPW = IPV + MP*NB IPTP = IPW + NQ*NB IPTQ = IPTP + NB*NB IPWK = IPTQ + NB*NB * IV = 1 JV = 1 MN = MIN( M, N ) IL = MAX( ( (IA+MN-2) / NB )*NB + 1, IA ) JL = MAX( ( (JA+MN-2) / NB )*NB + 1, JA ) IAROW = INDXG2P( IL, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCV, IA+M-IL, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCW, NB, JA+N-JL, NB, NB, IAROW, IACOL, ICTXT, $ NB ) * ADDBND = EIGHT * PSLAMCH( ICTXT, 'eps' ) * * When A is an upper bidiagonal form * IF( M.GE.N ) THEN * CALL DESCSET( DESCD, 1, JA+MN-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MN-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) * DO 10 J = 0, MN-1 D1 = ZERO E1 = ZERO D2 = ZERO E2 = ZERO CALL PSELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PSELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(MN-1) ) THEN CALL PSELGET( ' ', ' ', E2, E, IA+J, 1, DESCE ) CALL PSELGET( 'Rowwise', ' ', E1, A, IA+J, JA+J+1, $ DESCA ) END IF * IF( ( ABS( D1 - D2 ).GT.( ABS( D2 ) * ADDBND ) ) .OR. $ ( ABS( E1 - E2 ).GT.( ABS( E2 ) * ADDBND ) ) ) $ INFO = INFO + 1 10 CONTINUE * DO 20 J = JL, JA+NB-IOFF, -NB JB = MIN( JA+N-J, NB ) I = IA + J - JA K = I - IA + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PSLARFT( 'Forward', 'Columnwise', M-K+1, JB, A, I, J, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'Lower', M-K+1, JB, A, I, J, DESCA, $ WORK( IPV ), IV, JV, DESCV ) CALL PSLASET( 'Upper', M-K+1, JB, ZERO, ONE, WORK( IPV ), $ IV, JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PSLASET( 'Lower', M-K, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PSLARFT( 'Forward', 'Rowwise', N-K, JB, A, I, J+1, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'Upper', JB, N-K, A, I, J+1, DESCA, $ WORK( IPW ), IV, JV+1, DESCW ) CALL PSLASET( 'Lower', JB, N-K, ZERO, ONE, WORK( IPW ), IV, $ JV+1, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PSLASET( 'Upper', JB, N-K-1, ZERO, ZERO, A, I, J+2, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PSLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-K+1, N-K+1, JB, WORK( IPV ), $ IV, JV, DESCV, WORK( IPTQ ), A, I, J, DESCA, $ WORK( IPWK ) ) * * Apply block Householder transformation from Right. * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', $ M-K+1, N-K, JB, WORK( IPW ), IV, JV+1, DESCW, $ WORK( IPTP ), A, I, J+1, DESCA, WORK( IPWK ) ) * DESCV( M_ ) = DESCV( M_ ) + NB DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCW( N_ ) = DESCW( N_ ) + NB DESCW( RSRC_ ) = DESCV( RSRC_ ) DESCW( CSRC_ ) = DESCV( CSRC_ ) * 20 CONTINUE * * Handle first block separately * JB = MIN( N, NB - IOFF ) IV = IOFF + 1 JV = IOFF + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PSLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPV ), $ IV, JV, DESCV ) CALL PSLASET( 'Upper', M, JB, ZERO, ONE, WORK( IPV ), IV, JV, $ DESCV ) * * Zero out the strict lower triangular part of A. * CALL PSLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PSLARFT( 'Forward', 'Rowwise', N-1, JB, A, IA, JA+1, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'Upper', JB, N-1, A, IA, JA+1, DESCA, $ WORK( IPW ), IV, JV+1, DESCW ) CALL PSLASET( 'Lower', JB, N-1, ZERO, ONE, WORK( IPW ), IV, $ JV+1, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PSLASET( 'Upper', JB, N-2, ZERO, ZERO, A, IA, JA+2, $ DESCA ) * * Apply block Householder transformation from left. * CALL PSLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M, N, JB, WORK( IPV ), IV, JV, DESCV, $ WORK( IPTQ ), A, IA, JA, DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from right. * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', M, $ N-1, JB, WORK( IPW ), IV, JV+1, DESCW, $ WORK( IPTP ), A, IA, JA+1, DESCA, WORK( IPWK ) ) * ELSE * CALL DESCSET( DESCD, IA+MN-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MN-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 30 J = 0, MN-1 D1 = ZERO E1 = ZERO D2 = ZERO E2 = ZERO CALL PSELGET( ' ', ' ', D2, D, IA+J, 1, DESCD ) CALL PSELGET( 'Rowwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(MN-1) ) THEN CALL PSELGET( ' ', ' ', E2, E, 1, JA+J, DESCE ) CALL PSELGET( 'Columnwise', ' ', E1, A, IA+J+1, JA+J, $ DESCA ) END IF * IF( ( ABS( D1 - D2 ).GT.( ABS( D2 ) * ADDBND ) ) .OR. $ ( ABS( E1 - E2 ).GT.( ABS( E2 ) * ADDBND ) ) ) $ INFO = INFO + 1 30 CONTINUE * DO 40 I = IL, IA+NB-IOFF, -NB JB = MIN( IA+M-I, NB ) J = JA + I - IA K = J - JA + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PSLARFT( 'Forward', 'Columnwise', M-K, JB, A, I+1, J, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'Lower', M-K, JB, A, I+1, J, DESCA, $ WORK( IPV ), IV+1, JV, DESCV ) CALL PSLASET( 'Upper', M-K, JB, ZERO, ONE, WORK( IPV ), $ IV+1, JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PSLASET( 'Lower', M-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PSLARFT( 'Forward', 'Rowwise', N-K+1, JB, A, I, J, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'Upper', JB, N-K+1, A, I, J, DESCA, $ WORK( IPW ), IV, JV, DESCW ) CALL PSLASET( 'Lower', JB, N-K+1, ZERO, ONE, WORK( IPW ), $ IV, JV, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PSLASET( 'Upper', JB, N-K, ZERO, ZERO, A, I, J+1, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PSLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-K, N-K+1, JB, WORK( IPV ), $ IV+1, JV, DESCV, WORK( IPTQ ), A, I+1, J, $ DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from Right. * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', $ M-K+1, N-K+1, JB, WORK( IPW ), IV, JV, DESCW, $ WORK( IPTP ), A, I, J, DESCA, WORK( IPWK ) ) * DESCV( M_ ) = DESCV( M_ ) + NB DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCW( N_ ) = DESCW( N_ ) + NB DESCW( RSRC_ ) = DESCV( RSRC_ ) DESCW( CSRC_ ) = DESCV( CSRC_ ) * 40 CONTINUE * * Handle first block separately * JB = MIN( M, NB - IOFF ) IV = IOFF + 1 JV = IOFF + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PSLARFT( 'Forward', 'Columnwise', M-1, JB, A, IA+1, JA, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'Lower', M-1, JB, A, IA+1, JA, DESCA, $ WORK( IPV ), IV+1, JV, DESCV ) CALL PSLASET( 'Upper', M-1, JB, ZERO, ONE, WORK( IPV ), IV+1, $ JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PSLASET( 'Lower', M-2, JB, ZERO, ZERO, A, IA+2, JA, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PSLARFT( 'Forward', 'Rowwise', N, JB, A, IA, JA, DESCA, $ TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'Upper', JB, N, A, IA, JA, DESCA, WORK( IPW ), $ IV, JV, DESCW ) CALL PSLASET( 'Lower', JB, N, ZERO, ONE, WORK( IPW ), IV, JV, $ DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PSLASET( 'Upper', JB, N-1, ZERO, ZERO, A, IA, JA+1, $ DESCA ) * * Apply block Householder transformation from left * CALL PSLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M-1, N, JB, WORK( IPV ), IV+1, JV, DESCV, $ WORK( IPTQ ), A, IA+1, JA, DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from right * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', M, N, $ JB, WORK( IPW ), IV, JV, DESCW, WORK( IPTP ), $ A, IA, JA, DESCA, WORK( IPWK ) ) END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * RETURN * * End of PSGEBDRV * END scalapack-2.0.2/TESTING/EIG/psgehdrv.f000644 000766 000024 00000017632 10602576752 017446 0ustar00juliestaff000000 000000 SUBROUTINE PSGEHDRV( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 28, 2001 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEHDRV computes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from the * orthogonal matrix Q, the Hessenberg matrix, and the array TAU * returned by PSGEHRD: * sub( A ) := Q * H * Q' * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows and columns 1:ILO-1 and IHI+1:N. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) reduced to Hessenberg * form by PSGEHRD. The upper triangle and the first sub- * diagonal of sub( A ) contain the upper Hessenberg matrix H, * and the elements below the first subdiagonal, with the array * TAU, represent the orthogonal matrix Q as a product of * elementary reflectors. On exit, the original distributed * N-by-N matrix sub( A ) is recovered. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors returned by * PSGEHRD. TAU is tied to the distributed matrix A. * * WORK (local workspace) REAL array, dimension (LWORK). * LWORK >= NB*NB + NB*IHLP + MAX[ NB*( IHLP+INLQ ), * NB*( IHLQ + MAX[ IHIP, * IHLP+NUMROC( NUMROC( IHI-ILO+LOFF+1, NB, 0, 0, * NPCOL ), NB, 0, 0, LCMQ ) ] ) ] * * where NB = MB_A = NB_A, * LCM is the least common multiple of NPROW and NPCOL, * LCM = ILCM( NPROW, NPCOL ), LCMQ = LCM / NPCOL, * * IROFFA = MOD( IA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * * ILROW = INDXG2P( IA+ILO-1, NB, MYROW, RSRC_A, NPROW ), * ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, CSRC_A, NPCOL ), * IHLP = NUMROC( IHI-ILO+IROFFA+1, NB, MYROW, ILROW, NPROW ), * IHLQ = NUMROC( IHI-ILO+IROFFA+1, NB, MYCOL, ILCOL, NPCOL ), * INLQ = NUMROC( N-ILO+IROFFA+1, NB, MYCOL, ILCOL, NPCOL ). * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, IHLP, II, IOFF, IPT, $ IPV, IPW, IV, J, JB, JJ, JL, K, MYCOL, MYROW, $ NB, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSLARFB, $ PSLARFT, PSLACPY, PSLASET * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( IHI-ILO.LE.0 ) $ RETURN * NB = DESCA( MB_ ) IOFF = MOD( IA+ILO-2, NB ) CALL INFOG2L( IA+ILO-1, JA+ILO-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, IAROW, IACOL ) IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, IAROW, NPROW ) * IPT = 1 IPV = IPT + NB * NB IPW = IPV + IHLP * NB JL = MAX( ( ( JA+IHI-2 ) / NB ) * NB + 1, JA + ILO - 1 ) CALL DESCSET( DESCV, IHI-ILO+IOFF+1, NB, NB, NB, IAROW, $ INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ), ICTXT, MAX( 1, IHLP ) ) * DO 10 J = JL, ILO+JA+NB-IOFF-1, -NB JB = MIN( JA+IHI-J-1, NB ) I = IA + J - JA K = I - IA + 1 IV = K - ILO + IOFF + 1 * * Compute upper triangular matrix T from TAU. * CALL PSLARFT( 'Forward', 'Columnwise', IHI-K, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'All', IHI-K, JB, A, I+1, J, DESCA, WORK( IPV ), $ IV+1, 1, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PSLASET( 'Lower', IHI-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PSLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ IHI-K, N-K+1, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, I+1, J, DESCA, WORK( IPW ) ) * * Apply block Householder transformation from Right. * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Columnwise', $ IHI, IHI-K, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, IA, J+1, DESCA, WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle the first block separately * IV = IOFF + 1 I = IA + ILO - 1 J = JA + ILO - 1 JB = MIN( NB-IOFF, JA+IHI-J-1 ) * * Compute upper triangular matrix T from TAU. * CALL PSLARFT( 'Forward', 'Columnwise', IHI-ILO, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace. * CALL PSLACPY( 'All', IHI-ILO, JB, A, I+1, J, DESCA, WORK( IPV ), $ IV+1, 1, DESCV ) * * Zero out the strict lower triangular part of A. * IF( IHI-ILO.GT.0 ) $ CALL PSLASET( 'Lower', IHI-ILO-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PSLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ IHI-ILO, N-ILO+1, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, I+1, J, DESCA, WORK( IPW ) ) * * Apply block Householder transformation from Right. * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Columnwise', IHI, $ IHI-ILO, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, IA, J+1, DESCA, WORK( IPW ) ) * RETURN * * End of PSGEHDRV * END scalapack-2.0.2/TESTING/EIG/psgrptseptst.f000644 000766 000024 00000005301 10363532303 020360 0ustar00juliestaff000000 000000 PROGRAM PSRPTGSEPTST * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * Repeat generalized parallel symmetric eigenproblem test * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER MAXN, LWORK, LIWORK PARAMETER ( MAXN = 200, LWORK = 500000, $ LIWORK = 6*MAXN+4 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS, UPLO INTEGER CONTEXT, IAM, IBTYPE, INFO, IPOSTPAD, IPREPAD, $ LDA, MATTYPE, N, NB, NPCOL, NPROCS, NPROW REAL ABSTOL, THRESH * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), ICLUSTR( MAXN ), IFAIL( MAXN ), $ ISEED( 4 ), IWORK( LIWORK ) REAL A( MAXN*MAXN ), B( MAXN, MAXN ), $ COPYA( MAXN*MAXN ), COPYB( MAXN, MAXN ), $ GAP( MAXN ), WIN( MAXN ), WNEW( MAXN ), $ WORK( LWORK ), Z( MAXN*MAXN ) * .. * * * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDINIT, $ BLACS_PINFO, BLACS_SETUP, DESCINIT, PSGSEPTST * .. * .. Executable Statements .. * IPREPAD = 3 IPOSTPAD = 3 LDA = MAXN * * These lines should be replaced by the output from pxGSEPdriver * * ISEED( 1 ) = 2312 ISEED( 2 ) = 3709 ISEED( 3 ) = 666 ISEED( 4 ) = 3371 UPLO = 'U' SUBTESTS = 'Y' N = 33 NPROW = 2 NPCOL = 2 NB = 4 MATTYPE = 9 IBTYPE = 1 * note: the printout often makes a mess of ABSTOL ABSTOL = 0.1175494351E-37 THRESH = .350000E+01 * CALL BLACS_PINFO( IAM, NPROCS ) IF( NPROCS.LT.1 ) THEN * NPROCS = NPROW*NPCOL CALL BLACS_SETUP( IAM, NPROCS ) END IF CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, CONTEXT, LDA, INFO ) * CALL PSGSEPTST( DESCA, UPLO, N, MATTYPE, IBTYPE, SUBTESTS, THRESH, $ N, ABSTOL, ISEED, A, COPYA, B, COPYB, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK-IPREPAD-IPOSTPAD, IWORK, $ LIWORK-IPREPAD-IPOSTPAD, 6, INFO ) * * * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * * CALL BLACS_EXIT( 0 ) STOP * * * * End of PSRPTGSEPTST * END scalapack-2.0.2/TESTING/EIG/psgsepchk.f000644 000766 000024 00000031025 10363532303 017567 0ustar00juliestaff000000 000000 * * SUBROUTINE PSGSEPCHK( IBTYPE, MS, NV, A, IA, JA, DESCA, B, IB, JB, $ DESCB, THRESH, Q, IQ, JQ, DESCQ, C, IC, JC, $ DESCC, W, WORK, LWORK, TSTNRM, RESULT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 15, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, IBTYPE, IC, IQ, JA, JB, JC, JQ, LWORK, $ MS, NV, RESULT REAL THRESH, TSTNRM * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCB( * ), DESCC( * ), DESCQ( * ) REAL A( * ), B( * ), C( * ), Q( * ), W( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PSGSEPCHK checks a decomposition of the form * * A Q = B Q D or * A B Q = Q D or * B A Q = Q D * * where A is a symmetric matrix, B is * symmetric positive definite, Q is orthogonal, and D is diagonal. * * One of the following test ratios is computed: * * IBTYPE = 1: TSTNRM = | A Q - B Q D | / ( |A| |Q| n ulp ) * * IBTYPE = 2: TSTNRM = | A B Q - Q D | / ( |A| |Q| n ulp ) * * IBTYPE = 3: TSTNRM = | B A Q - Q D | / ( |A| |Q| n ulp ) * * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * MP = number of local rows in A, B and Q * MQ = number of local columns in A * NQ = number of local columns in B and Q * * IBTYPE (input) INTEGER * The form of the symmetric generalized eigenproblem. * = 1: A*Q = (lambda)*B*Q * = 2: A*B*Q = (lambda)*Q * = 3: B*A*Q = (lambda)*Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in A, B, C and Q * Also, the number of columns in A * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q. * * A (local input) REAL pointer to an * array in local memory of dimension (LLD_A, LOCc(JA+N-1)). * This array contains the local pieces of the M-by-N * distributed test matrix A * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix A. * * B (local input) REAL pointer to an * array in local memory of dimension (LLD_B, LOCc(JB+N-1)). * This array contains the local pieces of the M-by-N * distributed test matrix B * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix B. * * THRESH (input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) REAL array * global dimension (MS, NV), * local dimension (DESCA( DLEN_ ), NQ) * * Contains the eigenvectors as computed by PSSYEVX * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix Q. * * C (local workspace) REAL array, * global dimension (MS, NV), * local dimension (DESCA( DLEN_ ), MQ) * * Accumulator for computing AQ -QL * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix C. * * W (global input) REAL array, dimension (NV) * * Contains the computed eigenvalues * * WORK (local workspace) REAL array, * dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * TSTNRM (global output) REAL * * RESULT (global output) INTEGER * 0 if the test passes * 1 if the test fails * * .. Local Scalars .. * INTEGER I, INFO, MYCOL, MYROW, NPCOL, NPROW, NQ REAL ANORM, ULP * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) REAL CONE, CNEGONE, CZERO PARAMETER ( CONE = 1.0E+0, CNEGONE = -1.0E+0, $ CZERO = 0.0E+0 ) * .. * .. External Functions .. INTEGER NUMROC REAL PSLANGE, SLAMCH EXTERNAL NUMROC, PSLANGE, SLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PSGEMM, PSSCAL, $ PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * RESULT = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 CALL CHK1MAT( MS, 1, MS, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( MS, 1, MS, 2, IB, JB, DESCB, 11, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IQ, JQ, DESCQ, 16, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IB, JB, DESCB, 20, INFO ) * IF( INFO.EQ.0 ) THEN * NQ = NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IF( IQ.NE.1 ) THEN INFO = -14 ELSE IF( JQ.NE.1 ) THEN INFO = -15 ELSE IF( IA.NE.1 ) THEN INFO = -5 ELSE IF( JA.NE.1 ) THEN INFO = -6 ELSE IF( IB.NE.1 ) THEN INFO = -9 ELSE IF( JB.NE.1 ) THEN INFO = -10 ELSE IF( LWORK.LT.NQ ) THEN INFO = -23 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PSGSEPCHK', -INFO ) RETURN END IF * RESULT = 0 ULP = SLAMCH( 'Epsilon' ) * * Compute product of Max-norms of A and Q. * ANORM = PSLANGE( 'M', MS, MS, A, IA, JA, DESCA, WORK )* $ PSLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) IF( ANORM.EQ.ZERO ) $ ANORM = ONE * IF( IBTYPE.EQ.1 ) THEN * * Norm of AQ - BQD * * C = AQ * CALL PSGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 10 I = 1, NV CALL PSSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 10 CONTINUE * * C = C - BQ (i.e. AQ-BQD) * CALL PSGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, Q, $ IQ, JQ, DESCQ, CNEGONE, C, IC, JC, DESCC ) * TSTNRM = ( PSLANGE( 'M', MS, NV, C, IC, JC, DESCC, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * * ELSE IF( IBTYPE.EQ.2 ) THEN * * Norm of ABQ - QD * * * C = BQ * CALL PSGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 20 I = 1, NV CALL PSSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 20 CONTINUE * * Q = AC - Q * CALL PSGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, C, $ IC, JC, DESCC, CNEGONE, Q, IQ, JQ, DESCQ ) * TSTNRM = ( PSLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * ELSE IF( IBTYPE.EQ.3 ) THEN * * Norm of BAQ - QD * * * C = AQ * CALL PSGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 30 I = 1, NV CALL PSSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 30 CONTINUE * * Q = BC - Q * CALL PSGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, C, $ IC, JC, DESCC, CNEGONE, Q, IQ, JQ, DESCQ ) * TSTNRM = ( PSLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * END IF * IF( TSTNRM.GT.THRESH .OR. ( TSTNRM-TSTNRM.NE.0.0E0 ) ) THEN RESULT = 1 END IF RETURN * * End of PSGSEPCHK * END scalapack-2.0.2/TESTING/EIG/psgsepdriver.f000644 000766 000024 00000023345 10363532303 020323 0ustar00juliestaff000000 000000 * * PROGRAM PSGSEPDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Parallel REAL symmetric eigenproblem test driver * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. INTSIZ and REALSZ * indicate the length in bytes on the given platform for an integer * and a single precision real. * For example, on our system with 8 MB of memory, TOTMEM=6500000 * (leaves 1.5 MB for OS, code, BLACS buffer, etc), the length of a * REAL is 8, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * The full tester requires approximately (5 n + 5 n^2/p + slop) * REAL words and 6*n integer words. * So, TOTMEM should be set to at least 1.1 * 8 * (5n + 5n^2/p) * * WHAT WE TEST * ============ * * This routine tests PSSYGVX, the expert driver for the parallel * symmetric eigenvalue problem. We would like to cover all * possible combinations of: matrix size, process configuration * (nprow and npcol), block size (nb), matrix type (??), range * of eigenvalue (all, by value, by position), sorting options, * and upper vs. lower storage. * * We intend to provide two types of test input files, an * installation test and a thorough test. * * We also intend that the reports be meaningful. Our input file * will allow multiple requests where each request is a cross product * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, REALSZ, NIN PARAMETER ( TOTMEM = 2000000, REALSZ = 4, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / REALSZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) REAL MEM( MEMSIZ ) * .. * .. External Functions .. REAL SLAMCH EXTERNAL SLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PSGSEPREQ, PSLACHKIEEE, PSLASNBT * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PSLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) END IF * CALL PSLACHKIEEE( ISIEEE, SLAMCH( 'O' ), SLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) END IF GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK symmetric Eigendecomposition routines.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'generalized ' // 'symmetric eigenvalue routine: PSSYGVX.' WRITE( NOUT, FMT = 9999 )'A scaled residual check, ' // $ 'will be computed' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ ' : the QTQ norm is allowed to exceed THRESH' // $ ' for those eigenvectors' WRITE( NOUT, FMT = 9999 )' : which could not be ' // $ 'reorthogonalized for lack of workspace.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see pSGSEPtst.f).' WRITE( NOUT, FMT = 9999 ) $ 'IBTYPE : Generalized eigenproblem type' // $ ' (see pSSYGVx.f)' WRITE( NOUT, FMT = 9999 )'SUB : Subtests ' // $ '(see pSGSEPtst).f' WRITE( NOUT, FMT = 9999 )'CHK : The scaled residual' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PSGSEPREQ( NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, NSKIPPED, $ NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * 9999 FORMAT( A ) 9998 FORMAT( ' I am about to check to make sure that overflow' ) 9997 FORMAT( ' is handled in the ieee default manner. If this' ) 9996 FORMAT( ' is the last output you see, you should assume' ) 9995 FORMAT( ' that overflow caused a floating point exception.' ) 9994 FORMAT( ' In that case, we recommend that you add -DNO_IEEE' ) 9993 FORMAT( ' to the CDEFS line in SLmake.inc.' ) 9992 FORMAT( ' Alternatively, you could set CDEFS in SLmake.inc ' ) 9991 FORMAT( ' to enable the default ieee behaviour, However, this' ) 9990 FORMAT( ' may result in good or very bad performance.' ) 9989 FORMAT( ' Either signed zeroes or signed infinities ' ) 9988 FORMAT( ' work incorrectly or your system. Change your' ) 9987 FORMAT( ' SLmake.inc as suggested above.' ) * 9986 FORMAT( ' Your system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP IBTYPE SUB WALL CPU ', $ ' CHK CHECK' ) 9978 FORMAT( ' ----- --- --- --- --- ------ --- -------- --------', $ ' --------- -----' ) * * End of PSGSEPDRIVER * END scalapack-2.0.2/TESTING/EIG/psgsepreq.f000644 000766 000024 00000024354 10602576752 017634 0ustar00juliestaff000000 000000 * * SUBROUTINE PSGSEPREQ( NIN, MEM, MEMSIZE, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL MEM( MEMSIZE ) * .. * * Purpose * ======= * * PSGSEPREQ performs one request from the input file 'SEP.dat' * A request is the cross product of the specifications in the * input file. PSGSEPREQ prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEP.dat' * * MEM (local input) REAL ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL FIVE PARAMETER ( FIVE = 5.0E+0 ) INTEGER REALSZ, INTGSZ PARAMETER ( REALSZ = 4, INTGSZ = 4 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IBTYPE, IMIDPAD, INITCON, $ IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, LDA, LLWORK, MATSIZE, MATTYPE, MYCOL, $ MYROW, N, NB, NIBTYPES, NMATSIZES, NMATTYPES, $ NNODES, NP, NPCOL, NPCONFIGS, NPROW, NQ, $ NUPLOS, ORDER, PCONFIG, PTRA, PTRB, PTRCOPYA, $ PTRCOPYB, PTRGAP, PTRICLUS, PTRIFAIL, PTRIWRK, $ PTRW, PTRW2, PTRWORK, PTRZ, RES, SIZECHK, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, $ SIZESUBTST, SIZESYEVX, SIZETMS, SIZETST, UPLO REAL ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL LSAME, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PSGSEPTST, PSLASIZEGSEP, PSSEPINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PSSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * * Thresholds must be bigger for the generalized problem. * THRESH = THRESH*FIVE * DO 50 MATSIZE = 1, NMATSIZES * DO 40 PCONFIG = 1, NPCONFIGS * DO 30 MATTYPE = 1, NMATTYPES * DO 20 UPLO = 1, NUPLOS IF( LSAME( SUBTESTS, 'Y' ) ) THEN NIBTYPES = 3 ELSE NIBTYPES = 1 END IF DO 10 IBTYPE = 1, NIBTYPES * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, $ NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, $ MYROW, MYCOL ) IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PSLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, $ ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, $ ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYB = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRB = PTRCOPYB + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRB + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + MAX( N, 1 ) + IPREPAD + $ IPOSTPAD PTRGAP = PTRW2 + MAX( N, 1 ) + IPREPAD + $ IPOSTPAD PTRIFAIL = PTRGAP + NPROW*NPCOL + IPREPAD + $ IPOSTPAD PTRICLUS = PTRIFAIL + $ ICEIL( N+IPREPAD+IPOSTPAD, $ REALSZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, REALSZ / INTGSZ ) PTRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, REALSZ / INTGSZ ) LLWORK = MEMSIZE - PTRWORK - IPOSTPAD - $ IPREPAD + 1 NTESTS = NTESTS + 1 IF( LLWORK.LT.SIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PSGSEPTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), $ IBTYPE, SUBTESTS, THRESH, $ N, ABSTOL, ISEED, $ MEM( PTRA ), $ MEM( PTRCOPYA ), $ MEM( PTRB ), $ MEM( PTRCOPYB ), $ MEM( PTRZ ), LDA, $ MEM( PTRW ), MEM( PTRW2 ), $ MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ LLWORK, MEM( PTRIWRK ), $ ISIZETST, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * ) $ ' pSGSEPREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE 50 CONTINUE END IF * * RETURN * * End of PSDGSEPREQ * END scalapack-2.0.2/TESTING/EIG/psgsepsubtst.f000644 000766 000024 00000070504 10363532303 020353 0ustar00juliestaff000000 000000 * * SUBROUTINE PSGSEPSUBTST( WKNOWN, IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, IA, JA, DESCA, WIN, WNEW, $ IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, LWORK1, IWORK, LIWORK, $ RESULT, TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IBTYPE, IL, IPOSTPAD, IPREPAD, IU, JA, $ LIWORK, LWORK, LWORK1, N, NOUT, RESULT REAL ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) REAL A( * ), B( * ), COPYA( * ), COPYB( * ), $ GAP( * ), WIN( * ), WNEW( * ), WORK( * ), $ Z( * ) * .. * * Purpose * ======= * * PSGSEPSUBTST calls PSSYGVX and then tests the output of * PSSYGVX * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < THRESH * |QT * Q - I| / eps * norm(A) < THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PSSYGVX when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PSGSEPSUBTST * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call to PSGSEPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PSSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) REAL array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PSSYGVX for a description of block cyclic layout. * The test matrix, which is then modified by PSSYGVX * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) REAL array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * B (local workspace) REAL array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. * The B test matrix, which is then modified by PSSYGVX * * COPYB (local input) REAL array, dim (N, N) * COPYB is used to hold an identical copy of the array B * identical in both form and content to B * * Z (local workspace) REAL array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PSGSEPCHK and PSSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) REAL array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) REAL array, dimension (N) * The eigenvalues as copmuted by this call to PSSYGVX * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) REAL array, * dimension (NPROW*NPCOL) * * WORK (local workspace) REAL array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PSSYGVX * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PSSYGVX * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) REAL * |AQ- QL| / |A|*N*EPS * * QTQNRM (global output) REAL * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285E+0, FIVE = 5.0E+0, $ NEGONE = -1.0E+0 ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEVX, SIZETMS, $ SIZETST, VALSIZE, VECSIZE REAL EPS, ERROR, MAXERROR, MAXVU, MINERROR, MINVL, $ NORMWIN, OLDVL, OLDVU, ORFAC, SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), DSEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH EXTERNAL LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, IGAMN2D, IGAMX2D, $ PICHEKPAD, PIFILLPAD, PSCHEKPAD, PSELSET, $ PSFILLPAD, PSGSEPCHK, PSLASIZEGSEP, $ PSLASIZESYEVX, PSSYGVX, SGAMN2D, SGAMX2D, $ SLACPY, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PSLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PSLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3E+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159E+0 30 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL MINIL = 1 MAXIU = 0 DO 40 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 40 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * * * DSEED is not used in this call to PSLASIZESYEVX, the * following line just makes ftnchek happy. * DSEED( 1 ) = 1 * CALL PSLASIZESYEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL SLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL SLACPY( 'A', NP, NQ, COPYB, DESCA( LLD_ ), B( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PSFILLPAD( DESCA( CTXT_ ), NP, NQ, B, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0E+2 ) * CALL PSFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0E+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * * Make sure that PSSYGVX does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, MAXEIGS, 1 CALL PSELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0E+0 ) 50 CONTINUE 60 CONTINUE * ORFAC = -1.0E+0 * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PSSYGVX( IBTYPE, JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, $ JA, DESCA, B( 1+IPREPAD ), IA, JA, DESCA, VL, VU, $ IL, IU, ABSTOL, M, NZ, WNEW( 1+IPREPAD ), ORFAC, $ Z( 1+IPREPAD ), IA, JA, DESCA, WORK( 1+IPREPAD ), $ LWORK1, IWORK( 1+IPREPAD ), LIWORK, $ IFAIL( 1+IPREPAD ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYGVX-B', NP, NQ, B, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0E+2 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYGVX-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PSCHEKPAD( DESCZ( CTXT_ ), 'PSSYGVX-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYGVX-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYGVX-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYGVX-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PSSYGVX-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PSSYGVX-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PSSYGVX-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * * Since we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9992 ) RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Make sure that different processes return the same eigenvalues * DO 70 I = 1, M WORK( I ) = WNEW( I+IPREPAD ) WORK( I+M ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, WORK, M, 1, $ 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ WORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 80 I = 1, M * IF( RESULT.EQ.0 .AND. ( ABS( WORK( I )-WORK( M+ $ I ) ).GT.FIVE*EPS*ABS( WORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 80 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 90 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 100 NCLUSTERS = NCLUSTERS + 1 90 CONTINUE 100 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 110 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 110 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * * DO 120 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 120 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 150 * * Note that a couple key variables get redefined in PSGSEPCHK * as described by this table: * * PSGSEPTST name PSGSEPCHK name * ------------- ------------- * COPYA A * Z Q * B B * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the residual check * CALL PSFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3E+0 ) * CALL PSGSEPCHK( IBTYPE, N, NZ, COPYA, IA, JA, DESCA, COPYB, $ IA, JA, DESCA, THRESH, Z( 1+IPREPAD ), IA, $ JA, DESCZ, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSGSEPCHK-WORK', SIZECHK, $ 1, WORK, SIZECHK, IPREPAD, IPOSTPAD, $ 4.3E+0 ) * IF( RES.NE.0 ) $ RESULT = 1 END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN ) THEN * * Set up MYIL if necessary * MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 140 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 130 I = 1, M ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 130 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 140 CONTINUE * * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what PxSYGVX computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * * RETURN * 9999 FORMAT( 'PSSYGVX returned INFO=', I7 ) 9998 FORMAT( 'PSSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PSGSEPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PSSYGVX returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PSSYGVX' ) 9981 FORMAT( 'NZ altered by PSSYGVX with JOBZ=N' ) * * End of PSGSEPSUBTST * END scalapack-2.0.2/TESTING/EIG/psgseptst.f000644 000766 000024 00000121317 11622500733 017641 0ustar00juliestaff000000 000000 * * SUBROUTINE PSGSEPTST( DESCA, UPLO, N, MATTYPE, IBTYPE, SUBTESTS, $ THRESH, ORDER, ABSTOL, ISEED, A, COPYA, B, $ COPYB, Z, LDA, WIN, WNEW, IFAIL, ICLUSTR, $ GAP, IPREPAD, IPOSTPAD, WORK, LWORK, IWORK, $ LIWORK, NOUT, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SUBTESTS, UPLO INTEGER IBTYPE, INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, $ LWORK, MATTYPE, N, NOUT, ORDER REAL ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) REAL A( LDA, * ), B( LDA, * ), COPYA( LDA, * ), $ COPYB( LDA, * ), GAP( * ), WIN( * ), WNEW( * ), $ WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PSGSEPTST builds a random matrix A, and a well conditioned * matrix B, runs PSSYGVX() to compute the eigenvalues * and eigenvectors and then calls PSSYGVCHK to compute * the residual. * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) symmetric matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PSSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) REAL array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * The test matrix, which is then modified by PSSYGVX * * COPYA (local workspace) REAL array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * B (local workspace) REAL array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. * The B test matrix, which is then modified by PSSYGVX * * COPYB (local workspace) REAL array, dim (N, N) * COPYB is used to hold an identical copy of the array B * identical in both form and content to B * * Z (local workspace) REAL array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PSGSEPCHK * * W (local workspace) REAL array, dimension (N) * On normal exit from PSSYGVX, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PSLASIZEGSEP * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PSLASIZEGSEP * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE, TEN, HALF PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TEN = 10.0E+0, $ HALF = 0.5E+0 ) REAL PADVAL PARAMETER ( PADVAL = 19.25E+0 ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IINFO, IL, IMODE, IN, INDD, $ INDWORK, ISIZESUBTST, ISIZESYEVX, ISIZETST, $ ITYPE, IU, J, LLWORK, LSYEVXSIZE, MAXSIZE, $ MYCOL, MYROW, NB, NGEN, NLOC, NNODES, NP, $ NPCOL, NPROW, NQ, RES, SIZECHK, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, SIZESUBTST, $ SIZESYEVX, SIZETMS, SIZETST, VALSIZE, VECSIZE REAL ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH, SLARAN EXTERNAL LSAME, NUMROC, PSLAMCH, SLARAN * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, IGAMX2D, IGEBR2D, $ IGEBS2D, PSCHEKPAD, PSELSET, PSFILLPAD, $ PSGSEPSUBTST, PSLASET, PSLASIZEGSEP, $ PSLASIZESYEVX, PSLATMS, PSMATGEN, SLABAD, $ SLASRT, SLATMS, SLCOMBINE * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, MAX, MIN, MOD, REAL, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 PASSED = 'PASSED ' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Make sure that we have enough memory * * CALL PSLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * IF( LWORK.LT.SIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDWORK = INDD + N LLWORK = LWORK - INDWORK + 1 * ULP = PSLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PSLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL SLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / REAL( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log symmetric, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random symmetric * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * * Zero * * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N WORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PSLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N WORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PSLASET( 'All', N, N, ZERO, ONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0E+0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0E+0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * symmetric, eigenvalues specified * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * symmetric, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PSMATGEN( DESCA( CTXT_ ), 'S', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0E+0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0E+0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PSLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( SLARAN( ISEED )*REAL( NLOC ) ), N-NGEN ) * CALL SLATMS( IN, IN, 'S', ISEED, 'P', WORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PSELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PSELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PSELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PSELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2, ... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 WORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0E+0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * * * WKNOWN ... NOT SET, GUESS A DEFAULT * WKNOWN = .TRUE. ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL SLASRT( 'I', N, WORK( INDD ), IINFO ) * * Create the B matrix * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.3E+0 ) * ANORM = ONE * * Update ISEED so that {SLAGSY creates a different Q * ISEED( 4 ) = MOD( ISEED( 4 )+257, 4096 ) ISEED( 3 ) = MOD( ISEED( 3 )+192, 4096 ) ISEED( 2 ) = MOD( ISEED( 2 )+35, 4096 ) ISEED( 1 ) = MOD( ISEED( 1 )+128, 4096 ) CALL PSLATMS( N, N, 'S', ISEED, 'P', WORK( INDD ), 3, TEN, $ ANORM, N, N, 'N', COPYB, 1, 1, DESCA, ORDER, $ WORK( INDWORK+IPREPAD ), SIZETMS, IINFO ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS5-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.3E+0 ) * * * These values aren't actually used, but they make ftncheck happy. * IL = -1 IU = -2 VL = ONE VU = -ONE * CALL PSLASIZESYEVX( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, WORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) * LSYEVXSIZE = MIN( MAXSIZE, LWORK ) WKNOWN = .FALSE. * CALL PSGSEPSUBTST( WKNOWN, IBTYPE, 'v', 'a', UPLO, N, VL, VU, $ IL, IU, THRESH, ABSTOL, A, COPYA, B, COPYB, $ Z, 1, 1, DESCA, WORK( INDD ), WIN, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, IWORK, $ ISIZESYEVX, RES, TSTNRM, QTQNRM, NOUT ) * * * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'V', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'V' RANGE = 'A' CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 1' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'V', RANGE = 'A', random memory * IF( INFO.EQ.0 ) THEN JOBZ = 'V' RANGE = 'A' CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 2' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * We use PSLASIZESYEVX to choose IL and IU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'I', maximum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PSLASIZESYEVX to choose IL and IU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = MAXSIZE * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF * * Subtest 6: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PSLASIZESYEVX to choose IL and IU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 6' INFO = 1 END IF END IF * * Subtest 7: JOBZ = 'V', RANGE = 'I', random memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PSLASIZESYEVX to choose IL and IU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) LSYEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 7' INFO = 1 END IF END IF * * Subtest 8: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 8' INFO = 1 END IF END IF * * Subtest 9: JOBZ = 'V', RANGE = 'V', maximum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = MAXSIZE * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 9' INFO = 1 END IF END IF * * Subtest 10: JOBZ = 'V', RANGE = 'V', * minimum memory required for eigenvectors * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest10' INFO = 1 END IF END IF * * Subtest 11: JOBZ = 'V', RANGE = 'V', * random memory (enough for all eigenvectors * but not enough to guarantee orthogonality * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest11' INFO = 1 END IF END IF * * Subtest 12: JOBZ = 'V', RANGE = 'V', * miniimum memory required for eigenvalues only * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest12' INFO = 1 END IF END IF * * Subtest 13: JOBZ = 'V', RANGE = 'V', * random memory (more than minimum required * for eigenvalues, less than required for vectors) * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE + INT( SLARAN( ISEED )* $ REAL( VECSIZE-VALSIZE ) ) * CALL PSGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest13' INFO = 1 END IF END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9984 )IBTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, CTIME( 1 ), MAXTSTNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS END IF END IF * 120 CONTINUE * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, F8.2, 1X, F8.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, 8X, 1X, F8.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, F8.2, 1X, F8.2, 11X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, 8X, 1X, F8.2, 11X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 22X, 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) 9984 FORMAT( ' IBTYPE=', I8 ) 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) 9980 FORMAT( ' Increase TOTMEM in PSGSEPDRIVER' ) * * End of PSGSEPTST * END scalapack-2.0.2/TESTING/EIG/pshrddriver.f000644 000766 000024 00000045370 10363532303 020144 0ustar00juliestaff000000 000000 PROGRAM PSHRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * Purpose * ======= * * PSHRDDRIVER is the main test program for the REAL * ScaLAPACK HRD (Hessenberg Reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 14 lines: * 'ScaLAPACK HRD input file' * 'PVM machine' * 'HRD.out' output file name (if any) * 6 device out * 2 number of problems sizes * 100 101 values of N * 2 1 values of ILO * 99 101 values of IHI * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER MEMSIZ, NTESTS, REALSZ, TOTMEM REAL PADVAL PARAMETER ( REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IHI, IHIP, IHLP, IHLQ, $ ILCOL, ILO, ILROW, INFO, INLQ, IMIDPAD, IPA, $ IPT, IPW, IPOSTPAD, IPREPAD, ITEMP, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LCM, LCMQ, LOFF, $ LWORK, MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, $ NPROCS, NOUT, NP, NPCOL, NPROW, NQ, WORKHRD, $ WORKSIZ REAL ANORM, FRESID, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), NVHI( NTESTS ), NVLO( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINIT, BLACS_GRIDINFO, $ DESCINIT, IGSUM2D, BLACS_PINFO, PSFILLPAD, $ PSLAFCHK, PSGEHDRV, PSGEHRD, $ PSHRDINFO, PSMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ILCM, INDXG2P, NUMROC REAL PSLANGE EXTERNAL ILCM, INDXG2P, NUMROC, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PSHRDINFO( OUTFILE, NOUT, NMAT, NVAL, NVLO, NVHI, NTESTS, $ NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GOTO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) ILO = NVLO( J ) IHI = NVHI( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPT = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * IHIP = NUMROC( IHI, NB, MYROW, DESCA( RSRC_ ), NPROW ) LOFF = MOD( ILO-1, NB ) ILROW = INDXG2P( ILO, NB, MYROW, DESCA( RSRC_ ), NPROW ) ILCOL = INDXG2P( ILO, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) IHLP = NUMROC( IHI-ILO+LOFF+1, NB, MYROW, ILROW, NPROW ) INLQ = NUMROC( N-ILO+LOFF+1, NB, MYCOL, ILCOL, NPCOL ) LWORK = NB*( NB + MAX( IHIP+1, IHLP+INLQ ) ) WORKHRD = LWORK + IPOSTPAD WORKSIZ = WORKHRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IHLQ = NUMROC( IHI-ILO+LOFF+1, NB, MYCOL, ILCOL, $ NPCOL ) ITEMP = NB*MAX( IHLP+INLQ, IHLQ+MAX( IHIP, $ IHLP+NUMROC( NUMROC( IHI-ILO+LOFF+1, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ) ) ) WORKSIZ = MAX( NB*NB + NB*IHLP + ITEMP, NB * NP ) + $ IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Hessenberg reduction', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate A * CALL PSMATGEN( ICTXT, 'No', 'No', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), $ IASEED, 0, NP, 0, NQ, MYROW, MYCOL, $ NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), $ NQ, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANGE( 'I', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKHRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKHRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce Hessenberg form * CALL PSGEHRD( N, ILO, IHI, MEM( IPA ), 1, 1, DESCA, $ MEM( IPT ), MEM( IPW ), LWORK, INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGEHRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEHRD', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEHRD', WORKHRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKHRD-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - Q H Q'|| / (||A||*N*eps) * CALL PSGEHDRV( N, ILO, IHI, MEM( IPA ), 1, 1, DESCA, $ MEM( IPT ), MEM( IPW ) ) CALL PSLAFCHK( 'No', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSGEHDRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEHDRV', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSGEHDRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ.0.0E+0 ) $ THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather max. of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * HRD requires 10/3 * N^3 floating point ops. (flops) * more precisely, * HRD requires 4/3*(IHI-ILO)^3 + 2*IHI*(IHI-ILO)^2 flops * NOPS = DBLE( IHI-ILO ) NOPS = NOPS * NOPS * $ ( 2.0D0*DBLE( IHI ) + (4.0D0/3.0D0)*NOPS ) NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', N, ILO, IHI, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, FRESID, $ PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', N, ILO, IHI, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, FRESID, $ PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N ILO IHI NB P Q HRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ------ ------ ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, $ F9.2, 1X, F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*H*Q''|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PSHRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pshrdinfo.f000644 000766 000024 00000032374 10363532303 017604 0ustar00juliestaff000000 000000 SUBROUTINE PSHRDINFO( SUMMRY, NOUT, NMAT, NVAL, NVLO, NVHI, $ LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ NVHI( LDNVAL ), NVLO( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PSHRDINFO get the needed startup information for the Hessenberg * reduction tests and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, output to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for * N, IHI & ILO. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of rows & columns in matrix). * * NVLO (global output) INTEGER array, dimension (LDNVAL) * The values of ILO. * * NVHI (global output) INTEGER array, dimension (LDNVAL) * The values of IHI. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, ILO and IHI. LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * 3*LDNVAL+LDNBVAL+2*LDPVAL. Used to pack all input arrays * in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'HRD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = * ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( UNIT = NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1. .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'N', LDNVAL GO TO 20 END IF * * Get values of N, ILO, IHI * READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVLO( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVHI( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 1, 3, WORK, 1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVLO, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVHI, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS -1 CALL IGEBS2D( ICTXT, 'All', ' ', 1, I, WORK, 1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Reduction routine to Hessenberg form.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision Hessenberg ' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - Q H Q''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'HRD time : Time in seconds to compute HRD ' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for HRD ' // $ 'reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9995 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'ILO ', ( NVLO( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVLO( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'IHI ', ( NVHI( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVHI( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9995 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) EPS WRITE( NOUT, FMT = 9993 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 3, WORK, 1, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = 3*NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) * I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVLO, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVHI, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 CONTINUE WRITE( NOUT, FMT = 9998 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' ILLEGAL INPUT IN FILE ', 40A, '. ABORTING RUN.' ) 9997 FORMAT( ' NUMBER OF VALUES OF ', 5A, $ ' IS LESS THAN 1 OR GREATER ', 'THAN ', I2 ) 9996 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9995 FORMAT( 2X, A5, ': ', 10I6 ) 9994 FORMAT( ' ', 10I6 ) 9993 FORMAT( 'Routines pass computational tests if scaled residual is', $ ' less than ', G14.7 ) * * End of PSHRDINFO * END scalapack-2.0.2/TESTING/EIG/pshseqrdriver.f000644 000766 000024 00000051330 11656313237 020513 0ustar00juliestaff000000 000000 *********************************************************************** * Test program for ScaLAPACK-style routine PSHSEQR * *********************************************************************** * * Contributor: Robert Granat and Meiyue Shao * This version is of Feb 2011. * PROGRAM PSHSEQRDRIVER * * Declarations * IMPLICIT NONE * ...Parameters... LOGICAL BALANCE, COMPHESS, COMPRESI, $ COMPORTH LOGICAL DEBUG, PRN, TIMESTEPS, BARR, $ UNI_LAPACK INTEGER SLV_MIN, SLV_MAX PARAMETER ( DEBUG = .FALSE., $ PRN = .FALSE., $ TIMESTEPS = .TRUE., $ COMPHESS = .TRUE., $ COMPRESI = .TRUE., $ COMPORTH = .TRUE., $ BALANCE = .TRUE., $ BARR = .FALSE., * Solver: 1-PSLAQR1, 2-PSHSEQR. $ SLV_MIN = 2, SLV_MAX = 2, $ UNI_LAPACK = .TRUE. ) INTEGER N, NB, ARSRC, ACSRC PARAMETER ( * Problem size. $ N = 500, NB = 50, * What processor should hold the first element in A? $ ARSRC = 0, ACSRC = 0 ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DPALLOC, INTALLC INTEGER DPSIZ, INTSZ, NOUT, IZERO PARAMETER ( DPSIZ = 8, DPALLOC = 8 000 000, $ INTSZ = 4, INTALLC = 8 000 000, $ NOUT = 6, IZERO = 0) REAL ZERO, ONE, TWO PARAMETER ( ZERO = 0.0, ONE = 1.0, TWO = 2.0 ) * * ...Local Scalars... INTEGER ICTXT, IAM, NPROW, NPCOL, MYROW, MYCOL, $ SYS_NPROCS, NPROCS, AROWS, ACOLS, TEMP_ICTXT INTEGER THREADS INTEGER INFO, KTOP, KBOT, ILO, IHI, I INTEGER IPA, IPACPY, IPQ, WR1, WI1, WR2, WI2, IPW1, $ IPW2, IPIW INTEGER TOTIT, SWEEPS, TOTNS, HESS REAL EPS, THRESH DOUBLE PRECISION STAMP, TOTTIME, T_BA, T_GEN, T_HS, T_SCH, T_QR, $ T_RES, ITPEREIG, SWPSPEIG, NSPEIG, SPEEDUP, $ EFFICIENCY REAL RNORM, ANORM, R1, ORTH, O1, O2, DPDUM, ELEM1, $ ELEM2, ELEM3, EDIFF INTEGER SOLVER CHARACTER*6 PASSED * * ...Local Arrays... INTEGER DESCA( DLEN_ ), DESCQ( DLEN_ ), DESCVEC( DLEN_ ) REAL SCALE( N ) REAL, ALLOCATABLE :: MEM(:) INTEGER, ALLOCATABLE :: IMEM(:) * * ...Intrinsic Functions... INTRINSIC INT, FLOAT, SQRT, MAX, MIN * * ...External Functions... INTEGER NUMROC REAL PSLAMCH, PSLANGE DOUBLE PRECISION MPI_WTIME EXTERNAL BLACS_PINFO, BLACS_GET, BLACS_GRIDINIT, $ BLACS_GRIDINFO, BLACS_GRIDEXIT, BLACS_EXIT EXTERNAL NUMROC, PSLAMCH, PSLASET, PSGEHRD, PSLANGE EXTERNAL SGEBAL, SGEHRD EXTERNAL MPI_WTIME EXTERNAL PSGEBAL EXTERNAL PSMATGEN2 * * ...Executable statements... * CALL BLACS_PINFO( IAM, SYS_NPROCS ) NPROW = INT( SQRT( FLOAT(SYS_NPROCS) ) ) NPCOL = SYS_NPROCS / NPROW CALL BLACS_GET( 0, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, '2D', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) c print*, iam, ictxt, myrow, mycol c IF ( ( MYROW.GE.NPROW ) .OR. ( MYCOL.GE.NPCOL ) ) GO TO 777 IF ( ICTXT.LT.0 ) GO TO 777 * * Read out the number of underlying threads and set stack size in * kilobytes. * THRESH = 30.0 TOTTIME = MPI_WTIME() T_GEN = 0.0D+00 T_RES = 0.0D+00 T_SCH = 0.0D+00 * * Allocate and Init memory with zeros. * INFO = 0 ALLOCATE ( MEM( DPALLOC ), STAT = INFO ) IF( INFO.NE.0 ) THEN WRITE(*,*) '% Could not allocate MEM. INFO = ', INFO GO TO 777 END IF ALLOCATE ( IMEM( INTALLC ), STAT = INFO ) IF( INFO.NE.0 ) THEN WRITE(*,*) '% Could not allocate IMEM. INFO = ', INFO GO TO 777 END IF MEM( 1:DPALLOC ) = ZERO IMEM( 1:INTALLC ) = IZERO * * Get machine epsilon. * EPS = PSLAMCH( ICTXT, 'Epsilon' ) * * Print welcoming message. * IF( IAM.EQ.0 ) THEN WRITE(*,*) WRITE(*,*) 'ScaLAPACK Test for PSHSEQR' WRITE(*,*) WRITE(*,*) 'epsilon = ', EPS WRITE(*,*) 'threshold = ', THRESH WRITE(*,*) WRITE(*,*) 'Residual and Orthogonality Residual computed by:' WRITE(*,*) WRITE(*,*) 'Residual = ', $ ' || T - Q^T*A*Q ||_F / ( ||A||_F * eps * sqrt(N) )' WRITE(*,*) WRITE(*,*) 'Orthogonality = ', $ ' MAX( || I - Q^T*Q ||_F, || I - Q*Q^T ||_F ) / ', $ ' (eps * N)' WRITE(*,*) WRITE(*,*) $ 'Test passes if both residuals are less then threshold' WRITE( NOUT, * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) END IF * * Loop over problem parameters. * DO KTOP = 1, 1 DO KBOT = N, N DO SOLVER = SLV_MAX, SLV_MIN, -1 * * Set INFO to zero for this run. * INFO = 0 NPROCS = NPROW*NPCOL TEMP_ICTXT = ICTXT * * Count the number of rows and columns of current problem * for the current block sizes and grid properties. * STAMP = MPI_WTIME() AROWS = NUMROC( N, NB, MYROW, 0, NPROW ) ACOLS = NUMROC( N, NB, MYCOL, 0, NPCOL ) * * Set up matrix descriptors. * IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Set up descriptors...' IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') CALL DESCINIT( DESCA, N, N, NB, NB, MIN(ARSRC,NPROW-1), $ MIN(NPCOL-1,ACSRC), TEMP_ICTXT, MAX(1, AROWS), INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% DESCINIT DESCA failed, INFO =", INFO GO TO 999 END IF CALL DESCINIT( DESCQ, N, N, NB, NB, MIN(ARSRC,NPROW-1), $ MIN(NPCOL-1,ACSRC), TEMP_ICTXT, MAX(1, AROWS), INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% DESCINIT DESCQ failed, INFO =", INFO GO TO 999 END IF CALL DESCINIT( DESCVEC, N, 1, N, 1, MIN(ARSRC,NPROW-1), $ MIN(NPCOL-1,ACSRC), TEMP_ICTXT, N, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% DESCINIT DESCVEC failed, INFO =", INFO GO TO 999 END IF * * Assign pointer for ScaLAPACK arrays - first set DP memory. * IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Assign pointers...' IPA = 1 IPACPY = IPA + DESCA( LLD_ ) * ACOLS IPQ = IPACPY + DESCA( LLD_ ) * ACOLS WR1 = IPQ + DESCQ( LLD_ ) * ACOLS WI1 = WR1 + N WR2 = WI1 + N WI2 = WR2 + N IPW1 = WI2 + N IPW2 = IPW1 + DESCA( LLD_ ) * ACOLS IF( DEBUG ) WRITE(*,*) '% (IPW2,DPALLOC):', IPW2, DPALLOC * PRINT*, '%', IPA, IPACPY, IPQ, WR1, WI1, WR2, WI2, IPW1, IPW2 IF( IPW2+DESCA(LLD_)*ACOLS .GT. DPALLOC+1 ) THEN WRITE(*,*) '% Not enough DP memory!' GO TO 999 END IF * * Then set integer memory pointers. * IPIW = 1 * * Generate testproblem. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') CALL PSLASET( 'All over', N, N, ZERO, ONE, MEM(IPQ), 1, 1, $ DESCQ ) CALL PSMATGEN2( TEMP_ICTXT, 'Random', 'NoDiagDominant', $ N, N, NB, NB, MEM(IPA), DESCA( LLD_ ), 0, 0, 7, 0, $ AROWS, 0, ACOLS, MYROW, MYCOL, NPROW, NPCOL ) IF( .NOT. COMPHESS ) THEN CALL PSLASET( 'Lower triangular', N-2, N-2, ZERO, ZERO, $ MEM(IPA), 3, 1, DESCA ) CALL PSLASET( 'All over', N, N, ZERO, ONE, MEM(IPQ), $ 1, 1, DESCQ ) IF( KTOP.GT.1 ) $ CALL PSLASET( 'Lower triangular', KTOP-1, KTOP-1, $ ZERO, ZERO, MEM(IPA), 2, 1, DESCQ ) IF( KBOT.LT.N ) $ CALL PSLASET( 'Lower triangular', N-KBOT, N-KBOT, $ ZERO, ZERO, MEM(IPA), KBOT+1, KBOT, DESCQ ) END IF * * Do balancing if general matrix. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') T_BA = MPI_WTIME() IF( COMPHESS .AND. BALANCE ) THEN IF( NPROCS.EQ.1 .AND. SOLVER.NE.2 .AND. UNI_LAPACK ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == dgebal ==' CALL SGEBAL( 'Both', N, MEM(IPA), DESCA(LLD_), ILO, $ IHI, SCALE, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% SGEBAL failed, INFO =", INFO GO TO 999 END IF ELSE IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdgebal ==' CALL PSGEBAL( 'Both', N, MEM(IPA), DESCA, ILO, IHI, $ SCALE, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% PSGEBAL failed, INFO =", INFO GO TO 999 END IF END IF ELSEIF( COMPHESS ) THEN ILO = 1 IHI = N ELSE ILO = KTOP IHI = KBOT END IF T_BA = MPI_WTIME() - T_BA c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Balancing took in seconds:',T_BA IF( DEBUG ) WRITE(*,*) '% #', IAM, ': ILO,IHI=',ILO,IHI * * Make a copy of A. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Copy matrix A' CALL PSLACPY( 'All', N, N, MEM(IPA), 1, 1, DESCA, MEM(IPACPY), $ 1, 1, DESCA ) * * Print matrices to screen in debugging mode. * IF( PRN ) $ CALL PSLAPRNT( N, N, MEM(IPACPY), 1, 1, DESCA, 0, 0, $ 'A', NOUT, MEM(IPW1) ) T_GEN = T_GEN + MPI_WTIME() - STAMP - T_BA c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Generation took in seconds:',T_GEN * * Only compute the Hessenberg form if necessary. * T_HS = MPI_WTIME() IF( .NOT. COMPHESS ) GO TO 30 * * Reduce A to Hessenberg form. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') IF( DEBUG ) WRITE(*,*) '% #', IAM, $ ': Reduce to Hessenberg form...N=',N, ILO,IHI * PRINT*, '% PSGEHRD: IPW2,MEM(IPW2)', IPW2, MEM(IPW2) IF( NPROCS.EQ.1 .AND. SOLVER.NE.2 .AND. UNI_LAPACK ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == dgehrd ==' CALL SGEHRD( N, ILO, IHI, MEM(IPA), DESCA(LLD_), $ MEM(IPW1), MEM(IPW2), -1, INFO ) IF (DPALLOC-IPW2.LT.MEM(IPW2)) THEN WRITE(*,*) "% Not enough memory for SGEHRD" GO TO 999 END IF CALL SGEHRD( N, ILO, IHI, MEM(IPA), DESCA(LLD_), $ MEM(IPW1), MEM(IPW2), DPALLOC-IPW2, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% SGEHRD failed, INFO =", INFO GO TO 999 END IF ELSE IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdgehrd ==' CALL PSGEHRD( N, ILO, IHI, MEM(IPA), 1, 1, DESCA, MEM(IPW1), $ MEM(IPW2), -1, INFO ) IF (DPALLOC-IPW2.LT.MEM(IPW2)) THEN WRITE(*,*) "% Not enough memory for PSGEHRD" GO TO 999 END IF CALL PSGEHRD( N, ILO, IHI, MEM(IPA), 1, 1, DESCA, MEM(IPW1), $ MEM(IPW2), DPALLOC-IPW2, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% PSGEHRD failed, INFO =", INFO GO TO 999 END IF END IF * * Form Q explicitly. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') IF( DEBUG ) WRITE(*,*) '% #', IAM, ':Form Q explicitly' * PRINT*, '% PSORMHR: IPW2,MEM(IPW2)', IPW2, MEM(IPW2) IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdormhr ==' CALL PSORMHR( 'L', 'N', N, N, ILO, IHI, MEM(IPA), 1, 1, $ DESCA, MEM(IPW1), MEM(IPQ), 1, 1, DESCQ, MEM(IPW2), $ -1, INFO ) IF (DPALLOC-IPW2.LT.MEM(IPW2)) THEN WRITE(*,*) "% Not enough memory for PSORMHR" GO TO 999 END IF CALL PSORMHR( 'L', 'N', N, N, ILO, IHI, MEM(IPA), 1, 1, $ DESCA, MEM(IPW1), MEM(IPQ), 1, 1, DESCQ, MEM(IPW2), $ DPALLOC-IPW2, INFO ) IF ( INFO.NE.0 ) THEN WRITE(*,*) "% PSORMHR failed, INFO =", INFO GO TO 999 END IF * * Extract the upper Hessenberg part of A. * CALL PSLASET( 'Lower triangular', N-2, N-2, ZERO, ZERO, $ MEM(IPA), 3, 1, DESCA ) * * Print reduced matrix A in debugging mode. * IF( PRN ) THEN CALL PSLAPRNT( N, N, MEM(IPA), 1, 1, DESCA, 0, 0, 'H', NOUT, $ MEM(IPW1) ) CALL PSLAPRNT( N, N, MEM(IPQ), 1, 1, DESCQ, 0, 0, 'Q', NOUT, $ MEM(IPW1) ) END IF * 30 CONTINUE T_HS = MPI_WTIME() - T_HS c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Hessenberg took in seconds:',T_HS * * Compute the real Schur form of the Hessenberg matrix A. * IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') T_QR = MPI_WTIME() IF( SOLVER.EQ.1 ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdlaqr1 ==' * PRINT*, '% PSLAQR1: IPW1,MEM(IPW1)', IPW1, MEM(IPW1) CALL PSLAQR1( .TRUE., .TRUE., N, ILO, IHI, MEM(IPA), DESCA, $ MEM(WR1), MEM(WI1), ILO, IHI, MEM(IPQ), DESCQ, $ MEM(IPW1), -1, IMEM, -1, INFO ) IF (DPALLOC-IPW1.LT.MEM(IPW1)) THEN WRITE(*,*) "% Not enough DP memory for PSLAQR1" GO TO 999 END IF IF (INTALLC.LT.IMEM(1)) THEN WRITE(*,*) "% Not enough INT memory for PSLAQR1" GO TO 999 END IF CALL PSLAQR1( .TRUE., .TRUE., N, ILO, IHI, MEM(IPA), DESCA, $ MEM(WR1), MEM(WI1), ILO, IHI, MEM(IPQ), DESCQ, $ MEM(IPW1), DPALLOC-IPW1+1, IMEM, INTALLC, INFO ) IF (INFO.NE.0) THEN WRITE(*,*) "% PSLAQR1: INFO =", INFO END IF ELSEIF( SOLVER.EQ.2 ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': == pdhseqr ==' * PRINT*, '% PSHSEQR: IPW1,MEM(IPW1)', IPW1, MEM(IPW1) IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') CALL PSHSEQR( 'Schur', 'Vectors', N, ILO, IHI, MEM(IPA), $ DESCA, MEM(WR2), MEM(WI2), MEM(IPQ), DESCQ, MEM(IPW1), $ -1, IMEM, -1, INFO ) IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') IF (DPALLOC-IPW1.LT.MEM(IPW1)) THEN WRITE(*,*) "% Not enough DP memory for PSHSEQR" GO TO 999 END IF IF (INTALLC.LT.IMEM(1)) THEN WRITE(*,*) "% Not enough INT memory for PSHSEQR" GO TO 999 END IF IF( BARR ) CALL BLACS_BARRIER(ICTXT, 'A') CALL PSHSEQR( 'Schur', 'Vectors', N, ILO, IHI, MEM(IPA), $ DESCA, MEM(WR2), MEM(WI2), MEM(IPQ), DESCQ, MEM(IPW1), $ DPALLOC-IPW1+1, IMEM, INTALLC, INFO ) IF (INFO.NE.0) THEN WRITE(*,*) "% PSHSEQR: INFO =", INFO END IF ELSE WRITE(*,*) '% ERROR: Illegal SOLVER number!' GO TO 999 END IF T_QR = MPI_WTIME() - T_QR c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% QR-algorithm took in seconds:',T_QR T_SCH = T_SCH + T_QR + T_HS + T_BA * TOTIT = IMEM(1) * SWEEPS = IMEM(2) * TOTNS = IMEM(3) ITPEREIG = FLOAT(TOTIT) / FLOAT(N) SWPSPEIG = FLOAT(SWEEPS) / FLOAT(N) NSPEIG = FLOAT(TOTNS) / FLOAT(N) * * Print reduced matrix A in debugging mode. * IF( PRN ) THEN CALL PSLAPRNT( N, N, MEM(IPA), 1, 1, DESCA, 0, 0, 'T', $ NOUT, MEM(IPW1) ) CALL PSLAPRNT( N, N, MEM(IPQ), 1, 1, DESCQ, 0, 0, 'Z', $ NOUT, MEM(IPW1) ) END IF * * Check that returned Schur form is really a quasi-triangular * matrix. * HESS = 0 DO I = 1, N-1 IF( I.GT.1 ) THEN CALL PSELGET( 'All', '1-Tree', ELEM1, MEM(IPA), I, I-1, $ DESCA ) ELSE ELEM1 = ZERO END IF CALL PSELGET( 'All', '1-Tree', ELEM2, MEM(IPA), I+1, I, $ DESCA ) IF( I.LT.N-1 ) THEN CALL PSELGET( 'All', '1-Tree', ELEM3, MEM(IPA), I+2, I+1, $ DESCA ) ELSE ELEM3 = ZERO END IF IF( ELEM2.NE.ZERO .AND. ABS(ELEM1)+ABS(ELEM2)+ABS(ELEM3).GT. $ ABS(ELEM2) ) HESS = HESS + 1 END DO * * Compute residual norms and other results: * * 1) RNORM = || T - Q'*A*Q ||_F / ||A||_F * 2) ORTH = MAX( || I - Q'*Q ||_F, || I - Q*Q' ||_F ) / * (epsilon*N) * STAMP = MPI_WTIME() IF( COMPRESI ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Compute residuals 1' IF( DEBUG ) WRITE(*,*) '% #', IAM, ': pdgemm 3' CALL PSGEMM( 'N', 'N', N, N, N, ONE, MEM(IPACPY), 1, 1, $ DESCA, MEM(IPQ), 1, 1, DESCQ, ZERO, MEM(IPW1), 1, 1, $ DESCA ) IF( DEBUG ) WRITE(*,*) '% #', IAM, ': pdgemm 4' IF( DEBUG ) WRITE(*,*) '% #', IAM, ': N=',N IF( DEBUG ) WRITE(*,*) '% #', IAM, ': DESCA=',DESCA(1:DLEN_) IF( DEBUG ) WRITE(*,*) '% #', IAM, ': DESCQ=',DESCQ(1:DLEN_) CALL PSGEMM( 'T', 'N', N, N, N, -ONE, MEM(IPQ), 1, 1, $ DESCQ, MEM(IPW1), 1, 1, DESCA, ONE, MEM(IPA), 1, 1, $ DESCA ) R1 = PSLANGE( 'Frobenius', N, N, MEM(IPA), 1, 1, DESCA, $ DPDUM ) ANORM = PSLANGE( 'Frobenius', N, N, MEM(IPACPY), 1, 1, $ DESCA, DPDUM ) IF( ANORM.GT.ZERO )THEN RNORM = R1 / (ANORM*EPS*SQRT(FLOAT(N))) ELSE RNORM = R1 END IF ELSE RNORM = 0.0D0 END IF * IF( COMPORTH ) THEN IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Compute residuals 2' CALL PSLASET( 'All', N, N, ZERO, ONE, MEM(IPW1), 1, 1, $ DESCQ ) CALL PSLACPY( 'All', N, N, MEM(IPQ), 1, 1, DESCQ, MEM(IPW2), $ 1, 1, DESCQ ) CALL PSGEMM( 'T', 'N', N, N, N, -ONE, MEM(IPQ), 1, 1, DESCQ, $ MEM(IPW2), 1, 1, DESCQ, ONE, MEM(IPW1), 1, 1, DESCQ ) O1 = PSLANGE( 'Frobenius', N, N, MEM(IPW1), 1, 1, DESCQ, $ DPDUM ) CALL PSLASET( 'All', N, N, ZERO, ONE, MEM(IPW1), 1, 1, $ DESCQ ) CALL PSGEMM( 'N', 'T', N, N, N, -ONE, MEM(IPQ), 1, 1, DESCQ, $ MEM(IPW2), 1, 1, DESCQ, ONE, MEM(IPW1), 1, 1, DESCQ ) O2 = PSLANGE( 'Frobenius', N, N, MEM(IPW1), 1, 1, DESCQ, $ DPDUM ) ORTH = MAX(O1,O2) / (EPS*FLOAT(N)) ELSE ORTH = 0.0D0 END IF * T_RES = T_RES + MPI_WTIME() - STAMP c IF( TIMESTEPS.AND.IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Residuals took in seconds:',T_RES TOTTIME = MPI_WTIME() - TOTTIME c IF( IAM.EQ.0 ) WRITE(*,*) c $ ' %%% Total execution time in seconds:', TOTTIME * * * Print results to screen. * IF( (ORTH.GT.THRESH).OR.(RNORM.GT.THRESH) ) THEN PASSED = 'FAILED' ELSE PASSED = 'PASSED' END IF IF( DEBUG ) WRITE(*,*) '% #', IAM, ': Print results...' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9993 ) N, NB, NPROW, NPCOL, T_QR, PASSED END IF CALL BLACS_BARRIER( ICTXT, 'All' ) END DO END DO END DO 999 CONTINUE * * Deallocate MEM and IMEM. * DEALLOCATE( MEM, IMEM ) * CALL BLACS_GRIDEXIT( ICTXT ) * 777 CONTINUE * CALL BLACS_EXIT( 0 ) * * Format specifications. * 6666 FORMAT(A2,A3,A6,A4,A5,A6,A3,A3,A3,A9,A9,A9,A8,A8,A9,A9,A9,A9,A9, $ A9,A9,A9,A9,A9,A9,A5,A5,A8,A5,A5) 7777 FORMAT(A2,I3,I6,I4,I5,I6,I3,I3,I3,F9.2,F9.2,F9.2,F8.2,F8.2,F9.2, $ F9.2,F9.2,F9.2,F9.2,F9.2,F9.2,F9.2,E9.2,E9.2,E9.2,I5,I5, $ F8.4,I5,I5,A2) 9995 FORMAT( ' N NB P Q QR Time CHECK' ) 9994 FORMAT( '----- --- ---- ---- -------- ------' ) 9993 FORMAT( I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, A6 ) * END scalapack-2.0.2/TESTING/EIG/pslafchk.f000644 000766 000024 00000022200 10363532303 017366 0ustar00juliestaff000000 000000 SUBROUTINE PSLAFCHK( AFORM, DIAG, M, N, A, IA, JA, DESCA, IASEED, $ ANORM, FRESID, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER AFORM, DIAG INTEGER IA, IASEED, JA, M, N REAL ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSLAFCHK computes the residual * || sub( A ) - sub( Ao ) || / (|| sub( Ao ) ||*eps*MAX(M,N)), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A( IA:IA+M-1, JA:JA+N-1 ) and ||.|| stands for the infini- * ty norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * AFORM (global input) CHARACTER * sub( A ) is overwritten with: * - a symmetric matrix, if AFORM = 'S'; * - a Hermitian matrix, if AFORM = 'H'; * - the transpose of what would normally be generated, * if AFORM = 'T'; * - the conjugate transpose of what would normally be * generated, if AFORM = 'C'; * - otherwise a random matrix. * * DIAG (global input) CHARACTER * if DIAG = 'D' : sub( A ) is diagonally dominant. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub( A ) - sub( Ao ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * ANORM (global input) REAL * The Infinity norm of sub( A ). * * FRESID (global output) REAL * The maximum (worst) factorizational error. * * WORK (local workspace) REAL array, dimension (LWORK). * LWORK >= MpA0 * NB_A, where * * IROFFA = MOD( IA-1, MB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * WORK is used to store a block of columns of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IOFFA, IROFF, $ JB, JJ, JJA, JN, LDA, LDW, MP, MYCOL, MYROW, $ NPCOL, NPROW, NQ REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSMATGEN, SMATADD * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PSLAMCH, PSLANGE EXTERNAL ICEIL, NUMROC, PSLAMCH, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( ICTXT, 'eps' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * * Compute sub( A ) := sub( A ) - sub( Ao ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDW = MAX( 1, MP ) LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA - 1 )*LDA * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN JB = JN-JA+1 CALL PSMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ MP, JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL SMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) JJA = JJA + JB NQ = NQ - JB IOFFA = IOFFA + JB * LDA END IF * * Handle the remaning blocks of columns * DO 10 JJ = JJA, JJA+NQ-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JJA+NQ-JJ ) IOFFA = IIA + ( JJ - 1 ) * LDA CALL PSMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ MP, JJ-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL SMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) 10 CONTINUE * * Calculate factor residual * FRESID = PSLANGE( 'I', M, N, A, IA, JA, DESCA, WORK ) / $ ( MAX( M, N ) * EPS * ANORM ) * RETURN * * End PSLAFCHK * END scalapack-2.0.2/TESTING/EIG/pslagge.f000644 000766 000024 00000030763 10363532303 017232 0ustar00juliestaff000000 000000 SUBROUTINE PSLAGGE( M, N, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N, ORDER * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) REAL A( * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * PSLAGGE generates a real symmetric matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random orthogonal * matrices: A = U*D*VT. * * This is just a quick implementation which will be replaced in the * future. The random matrix A1(m,n) is generated and random left * orthogonal matrix U(m,m) is obtained by running QR on A1: * A1(m,n) = U(m,m)*R, * where U(m,m) is a product of min(m,n) Householder rotations. * Afterwards the space of A1 is reused for a second random matrix * A2(m,n), which is used to obtain the right orthogonal matrix VT(n,n) * by running LQ on A2: * A2(m,n) = L*VT(n,n). * This requires vastly more computation than necessary, but not * significantly more communication than is used in the rest of this * routine, and hence is not that much slower than an efficient * solution. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * Number of rows of the matrix A. M >= 0. * * N (global input) INTEGER * Number of columns of matrix A. N >= 0. * * D (local input) REAL array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (local output) REAL array * Global dimension (M, N), local dimension (MP, NQ) * * IA (global input) INTEGER * The global row index of the submatrix of the distributed * matrix A to operate on. * * JA (global input) INTEGER * The global column index of the submatrix of the distributed * matrix A to operate on. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix A. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. On exit, the seed is updated and will remain identical * on all processes in the context. * * ORDER (global input) INTEGER * Number of reflectors in the matrix Q * At present, ORDER .NE. N is not supported * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER dimension of WORK * LWORK >= MAX( QR_WORK, LQ_WORK ) * QR_WORK = LDAA*MAX( 1, NQ ) + 200 + MAX( 1, DTAU1 ) + * MAX( SIZEMQRLEFT, SIZEQRF) * LQ_WORK = LDAA*MAX( 1, NQ ) + 200 + MAX( 1, DTAU2) + * MAX( SIZEMLQRIGHT, SIZEQRF ) * Where: * LDAA = DESCA( LLD_ ) * MB_A = DESCA( MB_ ) * NB_A = DESCA( NB_ ) * RSRC_A = DESCA( RSRC_ ) * CSRC_A = DESCA( CSRC_ ) * LCM = ILCM( NPROW, NPCOL ) * LCMQ = LCM / NPCOL * IROFFA = MOD( IA-1, MB_A ) * ICOFFA = MOD( JA-1, NB_A ) * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ) * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) * MP = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ) * NQ = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ) * DTAU1 = NUMROC( JA + SIZE- 1, NB_A, MYCOL, IACOL, NPROW ) * DTAU2 = NUMROC( IA + SIZE- 1, MB_A, MYROW, IAROW, NPROW ) * SIZEMQRLEFT = MAX( (MB_A*(MB_A-1))/2, ( MP + NQ ) * MB_A ) * + ( MP + NB_A ) * NB_A * SIZEMLQRIGHT = MAX( (MB_A*(MB_A-1))/2, (MP + NQ)*MB_A ) + * MB_A * MB_A * SIZEQRF = NB_A*NP + MB_A*NQ + NB_A*NB_A * * INFO (local output) INTEGER * * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER CSRC_A, DTAU1, DTAU2, I, IACOL, IAROW, ICOFFA, $ IROFFA, LCM, LCMQ, LDAA, LQ_WORK, LWMIN, MB_A, $ MP, MYCOL, MYROW, NB_A, NPCOL, NPROW, NQ, $ PTR2AA, PTR2TAU, PTR2WORK, QR_WORK, RSRC_A, $ SIZE, SIZELQF, SIZEMLQRIGHT, SIZEMQRLEFT, $ SIZEQRF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PSELSET, PSGELQF, $ PSGEQRF, PSLASET, PSMATGEN, PSORMLQ, PSORMQR, $ PXERBLA * .. * .. External Functions .. INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*DLEN_*DTYPE_*M_*N_.LT.0 )RETURN * * Initialize grid information. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * * Check LWORK. * INFO = 0 SIZE = MIN( M, N ) IF( NPROW.EQ.-1 ) THEN INFO = -607 ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 8, INFO ) END IF * Calculation of a minimum workspace. LDAA = DESCA( LLD_ ) MB_A = DESCA( MB_ ) NB_A = DESCA( NB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) DTAU1 = NUMROC( JA+SIZE-1, NB_A, MYCOL, IACOL, NPCOL ) DTAU2 = NUMROC( IA+SIZE-1, MB_A, MYROW, IAROW, NPROW ) MP = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ) * SIZEMQRLEFT = MAX( ( MB_A*( MB_A-1 ) ) / 2, ( MP+NQ )*MB_A ) + $ ( MP+NB_A )*NB_A SIZEMLQRIGHT = MAX( ( MB_A*( MB_A-1 ) ) / 2, ( MP+NQ )*MB_A ) + $ MB_A*MB_A SIZEQRF = NB_A*MP + MB_A*NQ + NB_A*NB_A + 100 SIZELQF = NB_A*( MP+NQ+NB_A ) + 100 * QR_WORK = LDAA*MAX( 1, NQ ) + 200 + MAX( 1, DTAU1 ) + $ MAX( SIZEMQRLEFT, SIZEQRF ) LQ_WORK = LDAA*MAX( 1, NQ ) + 200 + MAX( 1, DTAU2 ) + $ MAX( SIZEMLQRIGHT, SIZELQF ) LWMIN = MAX( QR_WORK, LQ_WORK ) WORK( 1 ) = LWMIN IF( LWORK.EQ.-1 ) $ GO TO 20 * * Test the input arguments. * IF( INFO.EQ.0 ) THEN IF( SIZE.NE.ORDER ) THEN INFO = -9 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( INFO.LT.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PSLAGGE', -INFO ) RETURN END IF * * Build a diagonal matrix A with the eigenvalues specified in D. * CALL PSLASET( 'Full', M, N, ZERO, ZERO, A, IA, JA, DESCA ) DO 10 I = 1, SIZE CALL PSELSET( A, I, I, DESCA, D( I ) ) 10 CONTINUE * * Local dimension of array TAU in tis case is LOCc(JA+MIN(M,N)-1). * PTR2AA = 2 PTR2TAU = PTR2AA + LDAA*MAX( 1, NQ ) + 100 PTR2WORK = PTR2TAU + MAX( 1, DTAU1 ) + 100 * CALL PSLASET( 'All', M, N, ZERO, ZERO, WORK( PTR2AA ), IA, JA, $ DESCA ) * * Build a random matrix AA1. * CALL PSMATGEN( DESCA( CTXT_ ), 'N', 'N', M, N, DESCA( MB_ ), $ DESCA( NB_ ), WORK( PTR2AA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), 0, MP, $ 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Produce QR decomposition AA1 -> U*R. * CALL PSGEQRF( M, N, WORK( PTR2AA ), IA, JA, DESCA, $ WORK( PTR2TAU ), WORK( PTR2WORK ), SIZEQRF, INFO ) * * A = U*A. * CALL PSORMQR( 'L', 'N', M, N, SIZE, WORK( PTR2AA ), IA, JA, DESCA, $ WORK( PTR2TAU ), A, IA, JA, DESCA, WORK( PTR2WORK ), $ SIZEMQRLEFT, INFO ) * * Reinitialize pointer to WORK array. Dimension of array TAU in * this case is LOCr(IA+MIN(M,N)-1). * PTR2WORK = PTR2TAU + MAX( 1, DTAU2 ) + 100 * * Use the same workspace to generate a random matrix AA2. * CALL PSMATGEN( DESCA( CTXT_ ), 'N', 'N', M, N, DESCA( MB_ ), $ DESCA( NB_ ), WORK( PTR2AA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 2 ), 0, MP, $ 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) * * Produce LQ decomposition of random matrix AA2 -> L*VT. * CALL PSGELQF( M, N, WORK( PTR2AA ), IA, JA, DESCA, $ WORK( PTR2TAU ), WORK( PTR2WORK ), SIZELQF, INFO ) * * Calculate A = A*VT. * CALL PSORMLQ( 'R', 'N', M, N, SIZE, WORK( PTR2AA ), IA, JA, DESCA, $ WORK( PTR2TAU ), A, IA, JA, DESCA, WORK( PTR2WORK ), $ SIZEMLQRIGHT, INFO ) * * End of PSLAGGE * 20 CONTINUE RETURN END scalapack-2.0.2/TESTING/EIG/pslagsy.f000644 000766 000024 00000025260 10363532303 017266 0ustar00juliestaff000000 000000 * * SUBROUTINE PSLAGSY( N, K, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, INFO ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, N, ORDER * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) REAL A( * ), D( * ), WORK( * ) * .. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Purpose * ======= * * PSLAGSY generates a real symmetric matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random orthogonal matrix: * A = U*D*U'. * * This is just a quick implementation which will be replaced in the * future. The random orthogonal matrix is computed by creating a * random matrix and running QR on it. This requires vastly more * computation than necessary, but not significantly more communication * than is used in the rest of this rouinte, and hence is not that much * slower than an efficient solution. * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix A. N >= 0. * * K (global input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * ### K must be 0 or N-1, 0 < K < N-1 is not supported yet. * * D (global input) REAL array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (local output) REAL array * Global dimension (N, N), local dimension (NP, NQ) * The generated n by n symmetric matrix A (the full matrix is * stored). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated and will remain identical on * all processes in the context. * * ORDER (global input) INTEGER * Number of reflectors in the matrix Q * At present, ORDER .NE. N is not supported * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER dimension of WORK * LWORK >= SIZETMS as returned by PSLASIZESEP * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER CSRC_A, I, IACOL, IAROW, ICOFFA, II, IIROW, $ INDAA, INDTAU, INDWORK, IPOSTPAD, IPREPAD, $ IROFFA, ISIZESUBTST, ISIZESYEVX, ISIZETST, $ JJCOL, LDAA, LII, LIII, LJJ, LJJJ, LWMIN, MAXI, $ MB_A, MYCOL, MYROW, NB_A, NP, NPCOL, NPROW, NQ, $ RSRC_A, SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZESYEVX, $ SIZETMS, SIZETST * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PSGEQRF, PSLASIZESEP, $ PSMATGEN, PSORMQR, PXERBLA, SLASET * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. * INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Initialize grid information * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * * Check LWORK * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) END IF * LDAA = DESCA( LLD_ ) MB_A = DESCA( MB_ ) NB_A = DESCA( NB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) NP = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ) IPREPAD = 0 IPOSTPAD = 0 CALL PSLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) LWMIN = SIZETMS * * Test the input arguments * IF( INFO.EQ.0 ) THEN IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( N.NE.ORDER ) THEN INFO = -9 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( INFO.LT.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PSLAGSY', -INFO ) RETURN END IF * INDAA = 1 INDTAU = INDAA + LDAA*MAX( 1, NQ ) INDWORK = INDTAU + MAX( 1, NQ ) * IF( K.NE.0 ) THEN CALL SLASET( 'A', LDAA, NQ, ZERO, ZERO, WORK( INDAA ), LDAA ) * * * Build a random matrix * * CALL PSMATGEN( DESCA( CTXT_ ), 'N', 'N', N, ORDER, $ DESCA( MB_ ), DESCA( NB_ ), WORK( INDAA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ ISEED( 1 ), 0, NP, 0, NQ, MYROW, MYCOL, NPROW, $ NPCOL ) CALL PSGEQRF( N, ORDER, WORK( INDAA ), IA, JA, DESCA, $ WORK( INDTAU ), WORK( INDWORK ), SIZEQRF, INFO ) * END IF * * Build a diagonal matrix A with the eigenvalues specified in D * CALL SLASET( 'A', NP, NQ, ZERO, ZERO, A, DESCA( LLD_ ) ) * IIROW = 0 JJCOL = 0 LII = 1 LJJ = 1 * DO 20 II = 1, N, DESCA( MB_ ) MAXI = MIN( N, II+DESCA( MB_ )-1 ) IF( ( MYROW.EQ.IIROW ) .AND. ( MYCOL.EQ.JJCOL ) ) THEN LIII = LII LJJJ = LJJ DO 10 I = II, MAXI A( LIII+( LJJJ-1 )*DESCA( LLD_ ) ) = D( I ) LIII = LIII + 1 LJJJ = LJJJ + 1 10 CONTINUE END IF IF( MYROW.EQ.IIROW ) $ LII = LII + DESCA( MB_ ) IF( MYCOL.EQ.JJCOL ) $ LJJ = LJJ + DESCA( MB_ ) IIROW = MOD( IIROW+1, NPROW ) JJCOL = MOD( JJCOL+1, NPCOL ) 20 CONTINUE * * A = Q * A * IF( K.NE.0 ) THEN * CALL PSORMQR( 'L', 'Transpose', N, N, ORDER, WORK( INDAA ), IA, $ JA, DESCA, WORK( INDTAU ), A, IA, JA, DESCA, $ WORK( INDWORK ), SIZEMQRLEFT, INFO ) * * * A = A * Q' * * CALL PSORMQR( 'R', 'N', N, N, ORDER, WORK( INDAA ), IA, JA, $ DESCA, WORK( INDTAU ), A, IA, JA, DESCA, $ WORK( INDWORK ), SIZEMQRRIGHT, INFO ) * END IF * * End of PSLAGSY * END scalapack-2.0.2/TESTING/EIG/pslasizegsep.f000644 000766 000024 00000011113 10363532303 020305 0ustar00juliestaff000000 000000 * * SUBROUTINE PSLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, $ SIZESUBTST, ISIZESUBTST, SIZETST, $ ISIZETST ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZESYEVX, $ SIZETMS, SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PSLASIZEGSEP computes the amount of memory needed by * various GSEP test routines, as well as SYGVX itself * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PSSYGVX * * SIZEMQRLEFT LWORK for the 1st PSORMQR call in PSLAGSY * * SIZEMQRRIGHT LWORK for the 2nd PSORMQR call in PSLAGSY * * SIZEQRF LWORK for PSGEQRF in PSLAGSY * * SIZETMS LWORK for PSLATMS * * SIZEQTQ LWORK for PSSEPQTQ (nexer complex) * * SIZECHK LWORK for PSGSEPCHK * * SIZESYEVX LWORK for PSSYGVX * * ISIZESYEVX LIWORK for PSSYGVX * * SIZESUBTST LWORK for PSSUBTST * * ISIZESUBTST LIWORK for PSSUBTST * * SIZETST LWORK for PSTST * * ISIZETST LIWORK for PSTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, LCM, $ LCMQ, LDA, MQ0, MYCOL, MYROW, N, NB, NEIG, NN, $ NNP, NP, NP0, NPCOL, NPROW, NQ, RSRC_A * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * .. ** .. Executable Statements .. * This is just to keep ftnchek happy * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 0 SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZESYEVX = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZESYEVX = 6*NNP * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZESYEVX ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZESYEVX + IPREPAD + IPOSTPAD * * * Allow room for A, COPYA and Z and DIAG, WIN, WNEW, GAP, WORK * SIZETST = 5*( LDA*NP+IPREPAD+IPOSTPAD ) + $ 4*( N+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PSSYGVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pslasizesep.f000644 000766 000024 00000011152 10363532303 020141 0ustar00juliestaff000000 000000 * * SUBROUTINE PSLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, $ SIZESUBTST, ISIZESUBTST, SIZETST, $ ISIZETST ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZESYEVX, $ SIZETMS, SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PSLASIZESEP computes the amount of memory needed by * various SEP test routines, as well as SYEVX itself * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PSSYEVX * * SIZEMQRLEFT LWORK for the 1st PSORMQR call in PSLAGSY * * SIZEMQRRIGHT LWORK for the 2nd PSORMQR call in PSLAGSY * * SIZEQRF LWORK for PSGEQRF in PSLAGSY * * SIZETMS LWORK for PSLATMS * * SIZEQTQ LWORK for PSSEPQTQ (nexer complex) * * SIZECHK LWORK for PSSEPCHK * * SIZESYEVX LWORK for PSSYEVX * * ISIZESYEVX LIWORK for PSSYEVX * * SIZESUBTST LWORK for PSSUBTST * * ISIZESUBTST LIWORK for PSSUBTST * * SIZETST LWORK for PSTST * * ISIZETST LIWORK for PSTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, LCM, $ LCMQ, LDA, MQ0, MYCOL, MYROW, N, NB, NEIG, NN, $ NNP, NP, NP0, NPCOL, NPROW, NQ, RSRC_A * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * .. ** .. Executable Statements .. * This is just to keep ftnchek happy * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZESYEVX = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZESYEVX = 6*NNP * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZESYEVX ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZESYEVX + IPREPAD + IPOSTPAD * * * Allow room for A, COPYA and Z and DIAG, WIN, WNEW, GAP, WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + $ 4*( N+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PSSYEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pslasizesepr.f000644 000766 000024 00000010651 11623527140 020331 0ustar00juliestaff000000 000000 SUBROUTINE PSLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVR, ISIZESYEVR, $ SIZESUBTST, ISIZESUBTST, SIZETST, $ ISIZETST ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVR, $ ISIZETST, SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZESYEVR, $ SIZETMS, SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * * Purpose * ======= * * PSLASIZESEPR computes the amount of memory needed by * various SEPR test routines, as well as PSSYEVR itself. * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor for dense matrix. * * SIZEMQRLEFT LWORK for the 1st PSORMQR call in PSLAGSY * * SIZEMQRRIGHT LWORK for the 2nd PSORMQR call in PSLAGSY * * SIZEQRF LWORK for PSGEQRF in PSLAGSY * * SIZETMS LWORK for PSLATMS * * SIZEQTQ LWORK for PSSEPQTQ * * SIZECHK LWORK for PSSEPCHK * * SIZESYEVR LWORK for PSSYEVR * * ISIZESYEVR LIWORK for PSSYEVR * * SIZESUBTST LWORK for PSSEPRSUBTST * * ISIZESUBTST LIWORK for PSSEPRSUBTST * * SIZETST LWORK for PSSEPRTST * * ISIZETST LIWORK for PSSEPRTST * * * .. Parameters .. INTEGER CTXT_, M_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( $ CTXT_ = 2, M_ = 3, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, LCM, $ LCMQ, LDA, MQ0, MYCOL, MYROW, N, NB, NEIG, NN, $ NNP, NP, NP0, NPCOL, NPROW, NQ, RSRC_A * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) + 1 NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) NNP = MAX( N, NPROW*NPCOL+1, 4 ) * * SIZESYEVR = 1 + 5*N + MAX( 18*NN, NP0*MQ0+2*NB*NB ) + $ (2 + ICEIL( NEIG, NPROW*NPCOL ))*NN SIZESYEVR = MAX(3, SIZESYEVR) * ISIZESYEVR = 12*NNP + 2*N * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZESYEVR ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZESYEVR + IPREPAD + IPOSTPAD * * Allow room for A, COPYA and Z and DIAG, WIN, WNEW, GAP, WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + $ 4*( N+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK * (only needed for PSSYEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * * RETURN END scalapack-2.0.2/TESTING/EIG/pslasizesqp.f000644 000766 000024 00000013246 10363532303 020163 0ustar00juliestaff000000 000000 SUBROUTINE PSLASIZESQP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, $ SIZESYEV, SIZESYEVD, ISIZESYEVD, $ SIZESUBTST, ISIZESUBTST, $ SIZETST, ISIZETST ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * February 23, 2000 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZESYEV, $ SIZESYEVX, SIZETMS, SIZETST, $ SIZESYEVD, ISIZESYEVD * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PSLASIZESQP computes the amount of memory needed by * various SEP test routines, as well as PSYEVX and PSSYEV * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PSSYEVX or PSSYEV * * SIZEMQRLEFT LWORK for the 1st PSORMQR call in PSLAGSY * * SIZEMQRRIGHT LWORK for the 2nd PSORMQR call in PSLAGSY * * SIZEQRF LWORK for PSGEQRF in PSLAGSY * * SIZETMS LWORK for PSLATMS * * SIZEQTQ LWORK for PSSEPQTQ (nexer complex) * * SIZECHK LWORK for PSSEPCHK * * SIZESYEVX LWORK for PSSYEVX * * ISIZESYEVX LIWORK for PSSYEVX * * SIZESYEV LWORK for PSSYEV * * SIZESYEVD LWORK for PSSYEVD * * ISIZESYEVD LIWORK for PSSYEVD * * SIZESUBTST LWORK for PSSUBTST * * ISIZESUBTST LIWORK for PSSUBTST * * SIZETST LWORK for PSTST * * ISIZETST LIWORK for PSTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTEXTC, CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, $ LCM, LCMQ, LDA, LDC, MQ0, MYCOL, MYPCOLC, $ MYPROWC, MYROW, N, NB, NEIG, NN, NNP, NP, $ NPCOLC, NPROWC, NP0, NPCOL, NPROW, NQ, RSRC_A * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC, SL_GRIDRESHAPE EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC, SL_GRIDRESHAPE * .. * .. Executable Statements .. * * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_GRIDEXIT * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZESYEVX = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZESYEVX = 6*NNP * * Allow room for the new context created in PSSYEV * CONTEXTC = SL_GRIDRESHAPE( DESCA( CTXT_ ), 0, 1, 1, $ NPROW*NPCOL, 1 ) CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, $ MYPCOLC ) LDC = MAX( 1, NUMROC( N, NB, MYPROWC, 0, NPROW*NPCOL ) ) SIZESYEV = 5*N + MAX( 2*NP0 + MQ0 + NB*NN , 2*NN-2 ) + N*LDC CALL BLACS_GRIDEXIT( CONTEXTC ) * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) NN = MAX( N, NB, 2 ) NNP = 3*N + MAX( NB*( NP+1 ), 3*NB ) SIZESYEVD = MAX( NNP, 1+6*N+2*NP*NQ ) + 2*N ISIZESYEVD = 2+7*N+8*NPCOL * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZESYEVX, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZESYEV ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZESYEVX + IPREPAD + IPOSTPAD * * * Allow room for A, COPYA and Z and DIAG, WIN, WNEW, GAP, WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + $ 4*( N+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PSSYEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pslasizesyev.f000644 000766 000024 00000010272 10363532303 020342 0ustar00juliestaff000000 000000 * * SUBROUTINE PSLASIZESYEV( JOBZ, N, DESCA, MINSIZE ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER JOBZ INTEGER MINSIZE, N * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PSLASIZESYEV computes the amount of memory needed by PSSYEV * to calculate: * 1) Eigenvectors and eigenvalues if JOBZ = 'V' * 2) Eigenvalues only if JOBZ = 'N' * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * MINSIZE (global output) INTEGER * Workspace required for PSSYEV to: * 1) Eigenvectors and eigenvalues if JOBZ = 'V' * 2) Eigenvalues only if JOBZ = 'N' * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL WANTZ INTEGER CONTEXTC, CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, $ LCM, LCMQ, LDC, MQ0, MYCOL, MYPCOLC, MYPROWC, $ MYROW, NB, NN, NP, NP0, NPCOL, NPCOLC, NPROCS, $ NPROW, NPROWC, NQ, NRC, QRMEM, RSRC_A, $ SIZEMQRLEFT, SIZEMQRRIGHT * .. * .. External Functions .. * * LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC, SL_GRIDRESHAPE EXTERNAL ILCM, INDXG2P, LSAME, NUMROC, SL_GRIDRESHAPE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_GRIDEXIT * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) NB = DESCA( MB_ ) N = DESCA( M_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( NN, NB, 0, 0, NPCOL ) NPROCS = NPROW*NPCOL WANTZ = LSAME( JOBZ, 'V' ) LDC = 0 * * Create the new context that is used in PSSYEV * IF( WANTZ ) THEN CONTEXTC = SL_GRIDRESHAPE( DESCA( CTXT_ ), 0, 1, 1, NPROCS, 1 ) CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, $ MYPCOLC ) NRC = NUMROC( N, NB, MYPROWC, 0, NPROCS) LDC = MAX( 1, NRC ) CALL BLACS_GRIDEXIT( CONTEXTC ) END IF * * Compute the total amount of space needed * IF( WANTZ ) THEN QRMEM = 5*N + MAX( 2*NP0 +MQ0 + NB*NN, 2*NN-2 ) + N*LDC MINSIZE = MAX ( SIZEMQRLEFT, SIZEMQRRIGHT, QRMEM ) ELSE MINSIZE = 5*N + 2*NP0 +MQ0 + NB*NN END IF * RETURN * * End of PSLASIZESYEV * END scalapack-2.0.2/TESTING/EIG/pslasizesyevr.f000644 000766 000024 00000014543 11623527140 020534 0ustar00juliestaff000000 000000 SUBROUTINE PSLASIZESYEVR( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER RANGE INTEGER IL, IU, MAXSIZE, N, VALSIZE, VECSIZE REAL VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) REAL WIN( * ) * .. * * Purpose * ======= * * PSLASIZESYEVR computes the amount of memory needed by PSSYEVR * to ensure: * 1) Orthogonal Eigenvectors * 2) Eigenpairs with small residual norms * * Arguments * ========= * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * VL (global input/output ) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VL is set * to a random value near an entry in WIN * * VU (global input/output ) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VU is set * to a random value near an entry in WIN * * IL (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * If IL < 0, RANGE='I' and WKNOWN = .TRUE., IL is set * to a random value from 1 to N * * IU (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * If IU < 0, RANGE='I' and WKNOWN = .TRUE., IU is set * to a random value from IL to N * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * ISEED is not touched unless IL, IU, VL or VU are modified. * * WIN (global input) REAL array, dimension (N) * If WKNOWN=1, WIN contains the eigenvalues of the matrix. * * MAXSIZE (global output) INTEGER * Workspace required to guarantee that PSSYEVR will return * orthogonal eigenvectors. IF WKNOWN=0, MAXSIZE is set to a * a value which guarantees orthogonality no matter what the * spectrum is. If WKNOWN=1, MAXSIZE is set to a value which * guarantees orthogonality on a matrix with eigenvalues given * by WIN. * * VECSIZE (global output) INTEGER * Workspace required to guarantee that PSSYEVR * will compute eigenvectors. * * VALSIZE (global output) INTEGER * Workspace required to guarantee that PSSYEVR * will compute eigenvalues. * * * .. Parameters .. INTEGER CTXT_, MB_ PARAMETER ( CTXT_ = 2, MB_ = 5 ) REAL TWENTY PARAMETER ( TWENTY = 20.0E0 ) * .. * .. Local Scalars .. * INTEGER ILMIN, IUMAX, $ MQ0, MYCOL, MYIL, MYIU, MYROW, NB, NEIG, NN, $ NP0, NPCOL, NPROW REAL ANORM, EPS, SAFMIN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL SLARAN, PSLAMCH EXTERNAL LSAME, ICEIL, NUMROC, SLARAN, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, INT, MAX * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe Minimum' ) NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) VALSIZE = 3 + 5*N + MAX( 12*NN, NB*( NP0+1 ) ) IF( WKNOWN ) THEN ANORM = SAFMIN / EPS IF( N.GE.1 ) $ ANORM = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), ANORM ) IF( LSAME( RANGE, 'I' ) ) THEN IF( IL.LT.0 ) $ IL = INT( SLARAN( ISEED )*REAL( N ) ) + 1 IF( IU.LT.0 ) $ IU = INT( SLARAN( ISEED )*REAL( N-IL ) ) + IL IF( N.EQ.0 ) $ IU = 0 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IF( VL.GT.VU ) THEN MYIL = INT( SLARAN( ISEED )*REAL( N ) ) + 1 MYIU = INT( SLARAN( ISEED )*REAL( N-MYIL ) ) + MYIL VL = WIN( MYIL ) - TWENTY*EPS*ABS( WIN( MYIL ) ) VU = WIN( MYIU ) + TWENTY*EPS*ABS( WIN( MYIU ) ) VU = MAX( VU, VL+EPS*TWENTY*ABS( VL )+SAFMIN ) END IF END IF * END IF IF( LSAME( RANGE, 'V' ) ) THEN * We do not know how many eigenvalues will be computed ILMIN = 1 IUMAX = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN ILMIN = IL IUMAX = IU ELSE IF( LSAME( RANGE, 'A' ) ) THEN ILMIN = 1 IUMAX = N END IF * NEIG = IUMAX - ILMIN + 1 * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * VECSIZE = 3 + 5*N + MAX( 18*NN, NP0*MQ0+2*NB*NB ) + $ (2 + ICEIL( NEIG, NPROW*NPCOL ))*NN VALSIZE = MAX(3, VALSIZE) VECSIZE = MAX(3, VECSIZE) MAXSIZE = VECSIZE * RETURN * * End of PSLASIZESYEVR * END scalapack-2.0.2/TESTING/EIG/pslasizesyevx.f000644 000766 000024 00000017204 10363532303 020534 0ustar00juliestaff000000 000000 * * SUBROUTINE PSLASIZESYEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER RANGE INTEGER IL, IU, MAXSIZE, N, VALSIZE, VECSIZE REAL VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) REAL WIN( * ) * .. * * Purpose * ======= * * PSLASIZESYEVX computes the amount of memory needed by PSSYEVX * to ensure: * 1) Orthogonal Eigenvectors * 2) Eigenvectors * 3) Eigenvalues * * Arguments * ========= * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * VL (global input/output ) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VL is set * to a random value near an entry in WIN * * VU (global input/output ) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VU is set * to a random value near an entry in WIN * * IL (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * If IL < 0, RANGE='I' and WKNOWN = .TRUE., IL is set * to a random value from 1 to N * * IU (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * If IU < 0, RANGE='I' and WKNOWN = .TRUE., IU is set * to a random value from IL to N * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * ISEED is not touched unless IL, IU, VL or VU are modified. * * WIN (global input) REAL array, dimension (N) * If WKNOWN=1, WIN contains the eigenvalues of the matrix. * * MAXSIZE (global output) INTEGER * Workspace required to guarantee that PSSYEVX will return * orthogonal eigenvectors. IF WKNOWN=0, MAXSIZE is set to a * a value which guarantees orthogonality no matter what the * spectrum is. If WKNOWN=1, MAXSIZE is set to a value which * guarantees orthogonality on a matrix with eigenvalues given * by WIN. * * VECSIZE (global output) INTEGER * Workspace required to guarantee that PSSYEVX * will compute eigenvectors. * * VALSIZE (global output) INTEGER * Workspace required to guarantee that PSSYEVX * will compute eigenvalues. * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL TWENTY PARAMETER ( TWENTY = 20.0E0 ) * .. * .. Local Scalars .. * INTEGER CLUSTERSIZE, I, ILMIN, IUMAX, MAXCLUSTERSIZE, $ MQ0, MYCOL, MYIL, MYIU, MYROW, NB, NEIG, NN, $ NP0, NPCOL, NPROW REAL ANORM, EPS, ORFAC, SAFMIN, VLMIN, VUMAX * .. * .. External Functions .. * * LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSLAMCH, SLARAN EXTERNAL LSAME, ICEIL, NUMROC, PSLAMCH, SLARAN * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, MAX, REAL * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * ORFAC = 1.0E-3 * * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe Minimum' ) NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * VALSIZE = 5*N + MAX( 5*NN, NB*( NP0+1 ) ) * IF( WKNOWN ) THEN ANORM = SAFMIN / EPS IF( N.GE.1 ) $ ANORM = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), ANORM ) * IF( LSAME( RANGE, 'I' ) ) THEN IF( IL.LT.0 ) $ IL = INT( SLARAN( ISEED )*REAL( N ) ) + 1 IF( IU.LT.0 ) $ IU = INT( SLARAN( ISEED )*REAL( N-IL ) ) + IL IF( N.EQ.0 ) $ IU = 0 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IF( VL.GT.VU ) THEN MYIL = INT( SLARAN( ISEED )*REAL( N ) ) + 1 MYIU = INT( SLARAN( ISEED )*REAL( N-MYIL ) ) + MYIL VL = WIN( MYIL ) + TWENTY*EPS*ABS( WIN( MYIL ) ) VU = WIN( MYIU ) + TWENTY*EPS*ABS( WIN( MYIU ) ) VU = MAX( VU, VL+EPS*TWENTY*ABS( VL )+SAFMIN ) END IF END IF * END IF IF( LSAME( RANGE, 'V' ) ) THEN * * Compute ILMIN, IUMAX (based on VL, VU and WIN) * IF( WKNOWN ) THEN VLMIN = VL - TWENTY*EPS*ANORM VUMAX = VU + TWENTY*EPS*ANORM ILMIN = 1 IUMAX = 0 DO 10 I = 1, N IF( WIN( I ).LT.VLMIN ) $ ILMIN = ILMIN + 1 IF( WIN( I ).LT.VUMAX ) $ IUMAX = IUMAX + 1 10 CONTINUE ELSE ILMIN = 1 IUMAX = N END IF ELSE IF( LSAME( RANGE, 'I' ) ) THEN ILMIN = IL IUMAX = IU ELSE IF( LSAME( RANGE, 'A' ) ) THEN ILMIN = 1 IUMAX = N END IF * NEIG = IUMAX - ILMIN + 1 * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) VECSIZE = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN * IF( WKNOWN ) THEN CLUSTERSIZE = 1 MAXCLUSTERSIZE = 1 DO 20 I = ILMIN + 1, IUMAX IF( ( WIN( I )-WIN( I-1 ) ).LT.ORFAC*2*ANORM ) THEN CLUSTERSIZE = CLUSTERSIZE + 1 IF( CLUSTERSIZE.GT.MAXCLUSTERSIZE ) $ MAXCLUSTERSIZE = CLUSTERSIZE ELSE CLUSTERSIZE = 1 END IF 20 CONTINUE IF( CLUSTERSIZE.GT.MAXCLUSTERSIZE ) $ MAXCLUSTERSIZE = CLUSTERSIZE ELSE MAXCLUSTERSIZE = N END IF * MAXSIZE = VECSIZE + MAX( ( MAXCLUSTERSIZE-1 ), 0 )*N * * RETURN * * End of PSLASIZESYEVX * END scalapack-2.0.2/TESTING/EIG/pslatms.f000644 000766 000024 00000032414 10363532303 017266 0ustar00juliestaff000000 000000 * * SUBROUTINE PSLATMS( M, N, DIST, ISEED, SYM, D, MODE, COND, DMAX, $ KL, KU, PACK, A, IA, JA, DESCA, ORDER, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIST, PACK, SYM INTEGER IA, INFO, JA, KL, KU, LWORK, M, MODE, N, ORDER REAL COND, DMAX * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) REAL A( * ), D( * ), WORK( * ) * .. * * Purpose * ======= * * PSLATMS generates random symmetric matrices with specified * eigenvalues for testing SCALAPACK programs. * * PSLATMS operates by applying the following sequence of * operations: * * Set the diagonal to D, where D may be input or * computed according to MODE, COND, DMAX, and SYM * as described below. * * Generate a dense M x N matrix by multiplying D on the left * and the right by random unitary matrices, then: * * Reduce the bandwidth according to KL and KU, using * Householder transformations. * ### bandwidth reduction NOT SUPPORTED ### * * Arguments * ========= * * M - (global input) INTEGER * The number of rows of A. Not modified. * * N - (global input) INTEGER * The number of columns of A. Not modified. * ### M .ne. N unsupported * * DIST - (global input) CHARACTER*1 * On entry, DIST specifies the type of distribution to be used * to generate the random eigen-/singular values. * 'U' => UNIFORM( 0, 1 ) ( 'U' for uniform ) * 'S' => UNIFORM( -1, 1 ) ( 'S' for symmetric ) * 'N' => NORMAL( 0, 1 ) ( 'N' for normal ) * Not modified. * * ISEED - (global input) INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. They should lie between 0 and 4095 inclusive, * and ISEED(4) should be odd. The random number generator * uses a linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to SLATMS * to continue the same random number sequence. * Changed on exit. * * SYM - (global input) CHARACTER*1 * If SYM='S' or 'H', the generated matrix is symmetric, with * eigenvalues specified by D, COND, MODE, and DMAX; they * may be positive, negative, or zero. * If SYM='P', the generated matrix is symmetric, with * eigenvalues (= singular values) specified by D, COND, * MODE, and DMAX; they will not be negative. * If SYM='N', the generated matrix is nonsymmetric, with * singular values specified by D, COND, MODE, and DMAX; * they will not be negative. * ### SYM = 'N' NOT SUPPORTED ### * Not modified. * * D - (local input/output) REAL array, * dimension ( MIN( M , N ) ) * This array is used to specify the singular values or * eigenvalues of A (see SYM, above.) If MODE=0, then D is * assumed to contain the singular/eigenvalues, otherwise * they will be computed according to MODE, COND, and DMAX, * and placed in D. * Modified if MODE is nonzero. * * MODE - (global input) INTEGER * On entry this describes how the singular/eigenvalues are to * be specified: * MODE = 0 means use D as input * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * If SYM='S' or 'H', and MODE is neither 0, 6, nor -6, then * the elements of D will also be multiplied by a random * sign (i.e., +1 or -1.) * Not modified. * * COND - (global input) REAL * On entry, this is used as described under MODE above. * If used, it must be >= 1. Not modified. * * DMAX - (global input) REAL * If MODE is neither -6, 0 nor 6, the contents of D, as * computed according to MODE and COND, will be scaled by * DMAX / max(abs(D(i))); thus, the maximum absolute eigen- or * singular value (which is to say the norm) will be abs(DMAX). * Note that DMAX need not be positive: if DMAX is negative * (or zero), D will be scaled by a negative number (or zero). * Not modified. * * KL - (global input) INTEGER * This specifies the lower bandwidth of the matrix. For * example, KL=0 implies upper triangular, KL=1 implies upper * Hessenberg, and KL being at least M-1 means that the matrix * has full lower bandwidth. KL must equal KU if the matrix * is symmetric. * Not modified. * ### 1 <= KL < N-1 is NOT SUPPORTED ### * * KU - (global input) INTEGER * This specifies the upper bandwidth of the matrix. For * example, KU=0 implies lower triangular, KU=1 implies lower * Hessenberg, and KU being at least N-1 means that the matrix * has full upper bandwidth. KL must equal KU if the matrix * is symmetric. * Not modified. * ### 1 <= KU < N-1 is NOT SUPPORTED ### * * PACK - (global input) CHARACTER*1 * This specifies packing of matrix as follows: * 'N' => no packing * ### PACK must be 'N' all other options NOT SUPPORTED ### * * A - (local output) REAL array * Global dimension (M, N), local dimension (MP, NQ) * On exit A is the desired test matrix. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ORDER - (input) INTEGER * The number of reflectors used to define the orthogonal * matrix Q. A = Q * D * Q' * Higher ORDER requires more computation and communication. * * WORK - (local input/output) REAL array, * dimension (LWORK) * * LWORK - (local input) INTEGER dimension of WORK * LWORK >= SIZETMS as returned by PSLASIZESEP * * INFO - (global output) INTEGER * Error code. On exit, INFO will be set to one of the * following values: * 0 => normal return * -1 => M negative or unequal to N and SYM='S', 'H', or 'P' * -2 => N negative * -3 => DIST illegal string * -5 => SYM illegal string * -7 => MODE not in range -6 to 6 * -8 => COND less than 1.0, and MODE neither -6, 0 nor 6 * -10 => KL negative * -11 => KU negative, or SYM='S' or 'H' and KU not equal to KL * -16 => DESCA is inconsistent * -17 => ORDER not in the range 0 to N inclusive * 1 => Error return from SLATM1 * 2 => Cannot scale to DMAX (max. sing. value is 0) * 3 => Error return from PSLAGSY * *----------------------------------------------------------------------- * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IDIST, IINFO, IPACK, IRSIGN, ISYM, LLB, $ MNMIN, MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL ALPHA, TEMP * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSLAGSY, $ PXERBLA, SLASET, SLATM1, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * 1) Decode and Test the input parameters. * Initialize flags & seed. * * INFO = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IF( ( MYROW.GE.NPROW .OR. MYROW.LT.0 ) .OR. $ ( MYCOL.GE.NPCOL .OR. MYCOL.LT.0 ) )RETURN * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Decode DIST * IF( LSAME( DIST, 'U' ) ) THEN IDIST = 1 ELSE IF( LSAME( DIST, 'S' ) ) THEN IDIST = 2 ELSE IF( LSAME( DIST, 'N' ) ) THEN IDIST = 3 ELSE IDIST = -1 END IF * * Decode SYM * IF( LSAME( SYM, 'N' ) ) THEN ISYM = 1 IRSIGN = 0 ELSE IF( LSAME( SYM, 'P' ) ) THEN ISYM = 2 IRSIGN = 0 ELSE IF( LSAME( SYM, 'S' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE IF( LSAME( SYM, 'H' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE ISYM = -1 END IF * * Decode PACK * IF( LSAME( PACK, 'N' ) ) THEN IPACK = 0 ELSE IPACK = 1 END IF * * Set certain internal parameters * MNMIN = MIN( M, N ) LLB = MIN( KL, M-1 ) * IF( ORDER.EQ.0 ) $ ORDER = N * * Set INFO if an error * IF( NPROW.EQ.-1 ) THEN INFO = -( 1600+CTXT_ ) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 16, INFO ) IF( INFO.EQ.0 ) THEN IF( M.NE.N .AND. ISYM.NE.1 ) THEN INFO = -2 ELSE IF( IDIST.EQ.-1 ) THEN INFO = -3 ELSE IF( ISYM.EQ.-1 ) THEN INFO = -5 ELSE IF( ABS( MODE ).GT.6 ) THEN INFO = -7 ELSE IF( ( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) .AND. COND.LT. $ ONE ) THEN INFO = -8 ELSE IF( KL.LT.0 ) THEN INFO = -10 ELSE IF( KU.LT.0 .OR. ( ISYM.NE.1 .AND. KL.NE.KU ) ) THEN INFO = -11 ELSE IF( ( ORDER.LT.0 ) .OR. ( ORDER.GT.N ) ) THEN INFO = -17 END IF END IF CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 16, 0, IDUM1, IDUM2, $ INFO ) END IF * * Check for unsupported features * IF( ISYM.NE.2 ) THEN INFO = -5 ELSE IF( IPACK.NE.0 ) THEN INFO = -12 ELSE IF( KL.GT.0 .AND. KL.LT.M-1 ) THEN INFO = -10 ELSE IF( KU.GT.0 .AND. KU.LT.N-1 ) THEN INFO = -11 ELSE IF( LLB.NE.0 .AND. LLB.NE.M-1 ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PSLATMS', -INFO ) RETURN END IF * * Initialize random number generator * DO 10 I = 1, 4 ISEED( I ) = MOD( ABS( ISEED( I ) ), 4096 ) 10 CONTINUE * IF( MOD( ISEED( 4 ), 2 ).NE.1 ) $ ISEED( 4 ) = ISEED( 4 ) + 1 * * 2) Set up D if indicated. * * Compute D according to COND and MODE * CALL SLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, MNMIN, IINFO ) * IF( IINFO.NE.0 ) THEN INFO = 1 RETURN END IF * * IF( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) THEN * * Scale by DMAX * TEMP = ABS( D( 1 ) ) DO 20 I = 2, MNMIN TEMP = MAX( TEMP, ABS( D( I ) ) ) 20 CONTINUE * IF( TEMP.GT.ZERO ) THEN ALPHA = DMAX / TEMP ELSE INFO = 2 RETURN END IF * CALL SSCAL( MNMIN, ALPHA, D, 1 ) * END IF * CALL SLASET( 'A', NP, NQ, ZERO, ZERO, A, DESCA( LLD_ ) ) * * symmetric -- A = U D U' * CALL PSLAGSY( M, LLB, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, IINFO ) * RETURN * * End of PSLATMS * END scalapack-2.0.2/TESTING/EIG/pslatran.f000644 000766 000024 00000015617 10363532303 017435 0ustar00juliestaff000000 000000 SUBROUTINE PSLATRAN( N, NB, A, IA, JA, DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IA, JA, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * * ======= * * PSLATRAN transpose a lower triangular matrix on to the upper * triangular portion of the same matrix. * * This is an auxiliary routine called by PSSYTRD. * * Notes * ===== * * IA must equal 1 * JA must equal 1 * DESCA( MB_ ) must equal 1 * DESCA( NB_ ) must equal 1 * DESCA( RSRC_ ) must equal 1 * DESCA( CSRC_ ) must equal 1 * * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix to be transposed. * * NB (global input) INTEGER * The number of rows and columns to be transposed with each * message sent. NB has no impact on the result, it is striclty * a performance tuning parameter. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). On entry, the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix. On exit, the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is undefined (and may have been modified). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * Must be equal to 1. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * Must be equal to 1. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * DESCA( MB_ ) must equal 1 * DESCA( NB_ ) must equal 1 * DESCA( ICTXT_ ) must point to a square process grid * i.e. one where NPROW is equal to NPCOL * * WORK (local workspace) COMPLEX*16 array, dimension ( LWORK ) * * Where: * LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, IRECV, ISEND, J, JJ, JRECV, JSEND, $ LDA, MAXIRECV, MAXISEND, MAXJRECV, MAXJSEND, $ MINIRECV, MINISEND, MINJRECV, MINJSEND, MYCOL, $ MYROW, NP, NPCOL, NPROW, NQ, RECVNB, SENDNB, $ STARTCOL, STARTROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, STRRV2D, STRSD2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Further details * * Because the processor grid is square each process needs only send * data to its transpose process. (Likewsie it need only receive * data from its transpose process.) Because the data decomposition * is cyclic, the local portion of the array is triangular. * * This routine requires that the data be buffered (i.e. copied) * on the sending process (because of the triangular shape) and * unbuffered on the receiving process. Hence, two local memory to * memory copies are performed within the communications routines * followed by a memory to memory copy outside of the communications * routines. It would be nice to avoid having back to back memory * to memory copies (as we do presently on the receiving processor). * This could be done by packaging the data ourselves in the sender * and then unpacking it directly into the matrix. However, this * code seems cleaner and so since this routine is not a significant * performance bottleneck we have left it this way. * * * * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * NP = NUMROC( N, 1, MYROW, 0, NPROW ) NQ = NUMROC( N, 1, MYCOL, 0, NPCOL ) * * IF( MYROW.EQ.MYCOL ) THEN * DO 20 J = 1, NP DO 10 I = J + 1, NQ A( J+( I-1 )*LDA ) = A( I+( J-1 )*LDA ) 10 CONTINUE 20 CONTINUE * ELSE IF( MYROW.GT.MYCOL ) THEN STARTROW = 1 STARTCOL = 2 ELSE IF( MYROW.EQ.MYCOL ) THEN STARTROW = 2 STARTCOL = 2 ELSE STARTROW = 2 STARTCOL = 1 END IF END IF * DO 50 JJ = 1, MAX( NP, NQ ), NB MINJSEND = STARTCOL + JJ - 1 MINJRECV = STARTROW + JJ - 1 MAXJSEND = MIN( MINJSEND+NB-1, NQ ) MAXJRECV = MIN( MINJRECV+NB-1, NP ) * SENDNB = MAXJSEND - MINJSEND + 1 RECVNB = MAXJRECV - MINJRECV + 1 * MINISEND = 1 MINIRECV = 1 MAXISEND = MIN( NP, JJ+SENDNB-1 ) MAXIRECV = MIN( NQ, JJ+RECVNB-1 ) * ISEND = MAXISEND - MINISEND + 1 IRECV = MAXIRECV - MINIRECV + 1 JSEND = MAXJSEND - MINJSEND + 1 JRECV = MAXJRECV - MINJRECV + 1 * * * DO 40 J = MINJRECV, MAXJRECV DO 30 I = MINIRECV, MAXIRECV + J - MAXJRECV WORK( I+( J-MINJRECV )*IRECV ) = A( J+( I-1 )*LDA ) 30 CONTINUE 40 CONTINUE * IF( IRECV.GT.0 .AND. JRECV.GT.0 ) $ CALL STRSD2D( ICTXT, 'U', 'N', IRECV, JRECV, WORK, IRECV, $ MYCOL, MYROW ) * IF( ISEND.GT.0 .AND. JSEND.GT.0 ) $ CALL STRRV2D( ICTXT, 'U', 'N', ISEND, JSEND, $ A( MINISEND+( MINJSEND-1 )*LDA ), LDA, $ MYCOL, MYROW ) * * 50 CONTINUE * END IF * RETURN * * End of PSLATRD * END scalapack-2.0.2/TESTING/EIG/psmatgen.f000644 000766 000024 00000043110 10363532303 017414 0ustar00juliestaff000000 000000 SUBROUTINE PSMATGEN( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * PSMATGEN : Parallel Real Single precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * otherwise a random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) REAL, pointer into the local memory * to an array of dimension ( LDA, * ) containing the local * pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSRAND EXTERNAL ICEIL, NUMROC, LSAME, PSRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) * INFO = 0 IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSMATGEN', INFO ) RETURN END IF * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = ONE - TWO*PSRAND(0) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = ONE - TWO*PSRAND(0) 30 CONTINUE A(IK,JK) = ONE - TWO*PSRAND(0) DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = ONE - TWO*PSRAND(0) A(IK,JK+J) = A(IK+J,JK) 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 A(IK,JK) = ONE - TWO*PSRAND(0) JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 A(IK,JK) = ONE - TWO*PSRAND(0) JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSE * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = ONE - TWO*PSRAND(0) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 A(IK,JK+J) = ABS(A(IK,JK+J)) + MAXMN IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PSMATGEN * END scalapack-2.0.2/TESTING/EIG/psmatgen2.f000644 000766 000024 00000055477 11654025546 017533 0ustar00juliestaff000000 000000 SUBROUTINE PSMATGEN2( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * * Modified version by K. L. Dackland (U added) * Modified version by Peter Poromaa, Heavy DIAG * Modified version by Robert Granat, R(andom) added * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * PSMATGEN2 : Parallel Real Double precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'U' : A returned is an Upper triangular matrix. * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * if AFORM = 'R' A random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) REAL, pointer into the local * memory to an array of dimension ( LDA, * ) containing the * local pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0, TWO = 2.0, ZERO = 0.0) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN, UPPR, RANDOM INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSRAND EXTERNAL ICEIL, NUMROC, LSAME, PSRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) UPPR = LSAME( AFORM, 'U' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) RANDOM = LSAME( AFORM, 'R' ) * INFO = 0 IF( .NOT.( UPPR.OR.SYMM.OR.HERM.OR.TRAN.OR.RANDOM ) .AND. $ .NOT.LSAME( AFORM, 'C' ) .AND. $ .NOT.LSAME( AFORM, 'N' ) ) THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( UPPR.OR.SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSMATGEN2', INFO ) RETURN END IF MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = ONE - TWO*PSRAND(0) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = ONE - TWO*PSRAND(0) 30 CONTINUE A(IK,JK) = ONE - TWO*PSRAND(0) DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = ONE - TWO*PSRAND(0) A(IK,JK+J) = A(IK+J,JK) 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 A(IK,JK) = ONE - TWO*PSRAND(0) JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * Generate an upper triangular matrix. * ELSE IF ( UPPR ) THEN JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 1000 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 1000 CONTINUE * JK = 1 DO 8000 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 7000 I = 1, NB IF( JK .GT. ICNUM ) GO TO 8000 * IK = 1 DO 5000 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 6000 DO 3000 J = 1, I-1 A(IK,JK) = ONE - TWO*PSRAND(0) 3000 CONTINUE A(IK,JK) = ONE - TWO*PSRAND(0) DO 4000 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 6000 A(IK,JK+J) = ONE - TWO*PSRAND(0) 4000 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 5000 CONTINUE * 6000 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 7000 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 8000 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 1110 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 1110 CONTINUE * IK = 1 DO 1500 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 1400 J = 1, MB IF( IK .GT. IRNUM ) GO TO 1600 JK = 1 DO 1200 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 1100 I = 1, NB IF( JK .GT. ICNUM ) GO TO 1300 A(IK,JK) = ONE - TWO*PSRAND(0) JK = JK + 1 1100 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 1200 CONTINUE * 1300 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 1400 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 1500 CONTINUE 1600 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = NQNB JUMP3 = N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 A(IK,JK) = ONE - TWO*PSRAND(0) JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSEIF( RANDOM ) THEN * JUMP1 = 1 JUMP2 = NPMB JUMP3 = M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = ONE - TWO*PSRAND(0) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 A(IK,JK+J) = ABS(A(IK,JK+J)) + MAXMN IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PSMATGEN2 * END scalapack-2.0.2/TESTING/EIG/psnepdriver.f000644 000766 000024 00000050552 10363532303 020147 0ustar00juliestaff000000 000000 PROGRAM PSNEPDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PSNEPDRIVER is the main test program for the REAL * SCALAPACK NEP routines. This test driver performs a Schur * decomposition followed by residual check of a Hessenberg matrix. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'SCALAPACK, Version 1.4, NEP (Nonsymmetric EigenProblem) input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'NEP.out' output file name (if any) * 6 device out * 8 number of problems sizes * 1 2 3 4 6 10 100 200 vales of N * 3 number of NB's * 6 20 40 values of NB * 4 number of process grids (ordered pairs of P & Q) * 1 2 1 4 values of P * 1 2 4 1 values of Q * 20.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * REALSZ INTEGER, default = 4 bytes. * REALSZ indicate the length in bytes on the given platform * for a real element. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER REALSZ, TOTMEM, MEMSIZ, NTESTS REAL PADVAL, ZERO, ONE PARAMETER ( REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0, ZERO = 0.0E+0, $ ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, III, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPREPAD, IPW, IPWI, IPWR, IPZ, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LDA, LDZ, LWORK, $ MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, NOUT, $ NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ REAL THRESH REAL ANORM, FRESID, QRESID, ZNORM DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCZ( DLEN_ ), IERR( 2 ), $ IDUM( 1 ), NBVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM ( MEMSIZ ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSCHEKPAD, $ PSFILLPAD, PSGEMM, PSLAHQR, PSLASET, PSMATGEN, $ PSNEPFCHK, PSNEPINFO, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER ILCM, NUMROC REAL PSLAMCH, PSLANGE, PSLANHS EXTERNAL ILCM, NUMROC, PSLAMCH, PSLANGE, PSLANHS * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, REAL * .. * .. Data statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PSNEPINFO( OUTFILE, NOUT, NMAT, NVAL, NTESTS, NNB, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.6 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) IPREPAD = IPREPAD + 1000 IMIDPAD = IMIDPAD + 1000 IPOSTPAD = IPOSTPAD + 1000 ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Initialize the array descriptor for the matrix Z * CALL DESCINIT( DESCZ, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 2 ) ) * LDA = DESCA( LLD_ ) LDZ = DESCZ( LLD_ ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPZ = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWR = IPZ + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWI = IPWR + N + IPOSTPAD + IPREPAD IPW = IPWI + N + IPOSTPAD + IPREPAD III = N / NB IF( III*NB.LT.N ) $ III = III + 1 III = 7*III / ILCM( NPROW, NPCOL ) * * LWORK = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, III ) LWORK = LWORK + MAX(2*N, (8*ILCM(NPROW,NPCOL)+2)**2 ) * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PSNEPFCHK and PSLANHS * WORKSIZ = LWORK + MAX( NP*DESCA( NB_ ), $ DESCA( MB_ )*NQ ) + IPOSTPAD * ELSE * WORKSIZ = LWORK + IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Schur reduction', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate matrix Z = In * CALL PSLASET( 'All', N, N, ZERO, ONE, MEM( IPZ ), 1, 1, $ DESCZ ) * * Generate matrix A upper Hessenberg * CALL PSMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) CALL PSLASET( 'Lower', MAX( 0, N-2 ), MAX( 0, N-2 ), $ ZERO, ZERO, MEM( IPA ), MIN( N, 3 ), 1, $ DESCA ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPZ-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANHS( 'I', N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANHS', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANHS', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PSFILLPAD( ICTXT, N, 1, MEM( IPWR-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, N, 1, MEM( IPWI-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, PADVAL ) * END IF * CALL SLBOOT( ) CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform NEP factorization * CALL PSLAHQR( .TRUE., .TRUE., N, 1, N, MEM( IPA ), DESCA, $ MEM( IPWR ), MEM( IPWI ), 1, N, MEM( IPZ ), $ DESCZ, MEM( IPW ), LWORK, IDUM, 0, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * )'PSLAHQR INFO=', INFO KFAIL = KFAIL + 1 GO TO 10 END IF * IF( CHECK ) THEN * * Check for memory overwrite in NEP factorization * CALL PSCHEKPAD( ICTXT, 'PSLAHQR (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLAHQR (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLAHQR (WR)', N, 1, $ MEM( IPWR-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLAHQR (WI)', N, 1, $ MEM( IPWI-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLAHQR (WORK)', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || Z * H * Z**T - H0 || / ( N*|| H0 ||*EPS ) * CALL PSNEPFCHK( N, MEM( IPA ), 1, 1, DESCA, IASEED, $ MEM( IPZ ), 1, 1, DESCZ, ANORM, $ FRESID, MEM( IPW ) ) * CALL PSCHEKPAD( ICTXT, 'PSNEPFCHK (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSNEPFCHK (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSNEPFCHK (WORK)', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || (Z**T)*Z - In ||_1 * CALL PSLASET( 'All', N, N, ZERO, ONE, MEM( IPA ), 1, $ 1, DESCA ) CALL PSGEMM( 'Transpose', 'No transpose', N, N, N, $ -ONE, MEM( IPZ ), 1, 1, DESCZ, $ MEM( IPZ ), 1, 1, DESCZ, ONE, MEM( IPA ), $ 1, 1, DESCA ) ZNORM = PSLANGE( '1', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) QRESID = ZNORM / ( REAL( N )*PSLAMCH( ICTXT, 'P' ) ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( ( FRESID-FRESID ).EQ.0.0E+0 ) .AND. $ ( QRESID.LE.THRESH ) .AND. $ ( ( QRESID-QRESID ).EQ.0.0E+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 )FRESID WRITE( NOUT, FMT = 9985 )QRESID END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID QRESID = QRESID - QRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 18 N^3 flops for PxLAHQR * NOPS = 18.0D+0*DBLE( N )**3 * * Calculate total megaflops -- factorization only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q NEP Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- --- ---- ---- -------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, F8.2, $ 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||H - Q*S*Q^T|| / (||H|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Q^T*Q - I|| / ( N * eps ) ', G25.7 ) * STOP * * End of PSNEPDRIVER * END scalapack-2.0.2/TESTING/EIG/psnepfchk.f000644 000766 000024 00000025754 10363532303 017575 0ustar00juliestaff000000 000000 SUBROUTINE PSNEPFCHK( N, A, IA, JA, DESCA, IASEED, Z, IZ, JZ, $ DESCZ, ANORM, FRESID, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IASEED, IZ, JA, JZ, N REAL ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) REAL A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSNEPFCHK computes the residual * || sub(Z)*sub( A )*sub(Z)**T - sub( Ao ) || / (||sub( Ao )||*eps*N), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A(IA:IA+M-1,JA:JA+N-1), sub( Z ) = Z(IZ:IZ+N-1,JZ:JZ+N-1) * and ||.|| stands for the infinity norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The order of sub( A ) and sub( Z ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub(Z)*sub( A )*sub(Z)**T - sub( Ao ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * Z (local input) REAL pointer into the local memory * to an array of dimension (LLD_Z,LOCc(JZ+N-1)). On entry, this * array contains the local pieces of the N-by-N distributed * matrix sub( Z ). * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * ANORM (global input) REAL * The Infinity norm of sub( A ). * * FRESID (global output) REAL * The maximum (worst) factorizational error. * * WORK (local workspace) REAL array, dimension (LWORK). * LWORK >= MAX( NpA0 * NB_A, MB_A * NqA0 ) where * * IROFFA = MOD( IA-1, MB_A ), * ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * WORK is used to store a block of rows and a block of columns * of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, IB, ICTXT, IIA, IOFFA, IROFF, $ IW, J, JB, JJA, JN, LDA, LDW, MYCOL, MYROW, NP, $ NPCOL, NPROW REAL EPS * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, SMATADD, INFOG2L, $ PSGEMM, PSLACPY, PSLASET, PSMATGEN * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PSLAMCH, PSLANGE EXTERNAL ICEIL, NUMROC, PSLAMCH, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PSLAMCH( ICTXT, 'eps' ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF LDW = MAX( 1, NP ) * * First compute H <- H * Z**T * CALL DESCSET( DESCW, DESCA( MB_ ), N, DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * DO 10 I = IA, IA + N - 1, DESCA( MB_ ) IB = MIN( IA+N-I, DESCA( MB_ ) ) * CALL PSLACPY( 'All', IB, N, A, I, JA, DESCA, WORK, 1, 1, $ DESCW ) CALL PSGEMM( 'No transpose', 'Transpose', IB, N, N, ONE, WORK, $ 1, 1, DESCW, Z, IZ, JZ, DESCZ, ZERO, A, I, JA, $ DESCA ) * DESCW( RSRC_ ) = MOD( DESCW( RSRC_ )+1, NPROW ) * 10 CONTINUE * * Then compute H <- Z * H = Z * H0 * Z**T * CALL DESCSET( DESCW, N, DESCA( NB_ ), DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, LDW ) * DO 20 J = JA, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * CALL PSLACPY( 'All', N, JB, A, IA, J, DESCA, WORK, 1, 1, $ DESCW ) CALL PSGEMM( 'No transpose', 'No transpose', N, JB, N, ONE, Z, $ IZ, JZ, DESCZ, WORK, 1, 1, DESCW, ZERO, A, IA, J, $ DESCA ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * 20 CONTINUE * * Compute H - H0 * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA-1 )*LDA IW = 1 JB = JN - JA + 1 DESCW( CSRC_ ) = IACOL * * Handle first block of columns separately * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PSMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ NP, JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL PSLASET( 'Lower', MAX( 0, N-2 ), JB, ZERO, ZERO, WORK, $ MIN( IW+2, N ), 1, DESCW ) CALL SMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF * IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * DO 30 J = JN + 1, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PSMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, NP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL PSLASET( 'Lower', MAX( 0, N-IW-1 ), JB, ZERO, ZERO, $ WORK, MIN( N, IW+2 ), 1, DESCW ) CALL SMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) 30 CONTINUE * * Calculate factor residual * FRESID = PSLANGE( 'I', N, N, A, IA, JA, DESCA, WORK ) / $ ( N*EPS*ANORM ) * RETURN * * End PSNEPFCHK * END scalapack-2.0.2/TESTING/EIG/psnepinfo.f000644 000766 000024 00000027677 10363532303 017623 0ustar00juliestaff000000 000000 SUBROUTINE PSNEPINFO( SUMMRY, NOUT, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, NGRIDS, $ NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PSNEPINFO gets needed startup information for PDHSEQR drivers * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (the order of the matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH * * WORK (local workspace) INTEGER array of dimension >= * MAX( 3, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ), used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * * Implemented by: G. Henry, May 10, 1996 * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'NEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * )NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 )'N', LDNVAL GO TO 30 END IF READ( NIN, FMT = * )( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * )NNB IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 )'NB', LDNBVAL GO TO 30 END IF READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) * DO 10 I = 1, NNB IF( NBVAL( I ).LT.6 ) THEN WRITE( NOUT, FMT = 9992 )NBVAL( I ) GO TO 30 END IF 10 CONTINUE * * Get number of grids * READ( NIN, FMT = * )NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDPVAL GO TO 30 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDQVAL GO TO 30 END IF * * Get values of P and Q * READ( NIN, FMT = * )( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * )( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * )THRESH * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 20 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 20 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK QSQ^T by Schur Decomposition.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'Tests of the parallel ' // $ 'real single precision Schur decomposition.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Residual = ||H-QSQ^T|| / ' // $ '(||H|| * eps * N )' WRITE( NOUT, FMT = 9999 ) $ ' Orthogonality residual = ||I - Q^TQ|| / ' // '( eps * N )' WRITE( NOUT, FMT = 9999 )'The matrix A is randomly ' // $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'An explanation of the input/output ' $ // 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or ' // $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the ' // 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the' // $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than' // $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'NEP time : Time in seconds to decompose the ' // ' matrix' WRITE( NOUT, FMT = 9999 )'MFLOPS : Rate of execution ' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 )'N ', $ ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 )'NB ', $ ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 )'P ', $ ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 )'Q ', $ ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 )EPS WRITE( NOUT, FMT = 9998 )THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 30 CONTINUE WRITE( NOUT, FMT = 9993 ) CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9992 FORMAT( ' Blocking size too small at ', I2, ' must be >=6.' ) * * End of PSNEPINFO * END scalapack-2.0.2/TESTING/EIG/psrptseptst.f000644 000766 000024 00000005261 10363532303 020216 0ustar00juliestaff000000 000000 * * PROGRAM PSRPTSEPTST * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Repeat parallel symmetric eigenproblem test * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER MAXN, LWORK, LIWORK PARAMETER ( MAXN = 200, LWORK = 500000, $ LIWORK = 6*MAXN+4 ) * .. * .. Local Scalars .. CHARACTER HETERO, SUBTESTS, UPLO INTEGER CONTEXT, IAM, INFO, IPOSTPAD, IPREPAD, LDA, $ MATTYPE, N, NB, NPCOL, NPROCS, NPROW REAL ABSTOL, THRESH * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), ICLUSTR( MAXN ), IFAIL( MAXN ), $ ISEED( 4 ), IWORK( LIWORK ) REAL A( MAXN*MAXN ), COPYA( MAXN*MAXN ), $ GAP( MAXN ), WIN( MAXN ), WNEW( MAXN ), $ WORK( LWORK ), Z( MAXN*MAXN ) * .. * * * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDINIT, $ BLACS_PINFO, BLACS_SETUP, DESCINIT, PSSEPTST * .. * .. Executable Statements .. * IPREPAD = 3 IPOSTPAD = 3 LDA = MAXN * * Set HETERO to 'Y' if you want to turn off the PxSYEV tests * HETERO = 'N' * * These lines should be replaced by the output from pxSEPdriver * * ISEED( 1 ) = 2312 ISEED( 2 ) = 3709 ISEED( 3 ) = 666 ISEED( 4 ) = 3371 UPLO = 'U' SUBTESTS = 'Y' N = 33 NPROW = 2 NPCOL = 2 NB = 4 MATTYPE = 9 * note: the printout often makes a mess of ABSTOL ABSTOL = 0.1175494351E-37 THRESH = .350000E+01 * CALL BLACS_PINFO( IAM, NPROCS ) IF( NPROCS.LT.1 ) THEN * NPROCS = NPROW*NPCOL CALL BLACS_SETUP( IAM, NPROCS ) END IF CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, CONTEXT, LDA, INFO ) * CALL PSSEPTST( DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, N, $ ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, WORK, $ LWORK-IPREPAD-IPOSTPAD, IWORK, $ LIWORK-IPREPAD-IPOSTPAD, HETERO, 6, INFO ) * * * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * * CALL BLACS_EXIT( 0 ) STOP * * * * End of PSRPTSEPTST * END scalapack-2.0.2/TESTING/EIG/pssdpsubtst.f000644 000766 000024 00000040446 10363532303 020205 0ustar00juliestaff000000 000000 SUBROUTINE PSSDPSUBTST( WKNOWN, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, IA, JA, DESCA, WIN, WNEW, $ IPREPAD, IPOSTPAD, WORK, LWORK, LWORK1, $ IWORK, LIWORK, $ RESULT, TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 16, 2000 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER UPLO INTEGER IA, IPOSTPAD, IPREPAD, JA, LWORK, LWORK1, N, $ NOUT, RESULT, LIWORK REAL ABSTOL, QTQNRM, THRESH, TSTNRM * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) REAL A( * ), COPYA( * ), WIN( * ), WNEW( * ), $ WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSSDPSUBTST calls PSSYEVD and then tests the output of * PSSYEVD * The following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PSSYEVD when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. * * A (local workspace) REAL array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PSSYEVD for a description of block cyclic layout. * The test matrix, which is then modified by PSSYEVD * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) REAL array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) REAL array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PSSEPCHK and PSSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) REAL array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) REAL array, dimension (N) * The eigenvalues as computed by this call to PSSYEVD. * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * WORK (local workspace) REAL array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PSSYEVD * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PSSYEVD * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) REAL * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) REAL * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DT_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL FIVE, NEGONE, PADVAL, ZERO PARAMETER ( PADVAL = 13.5285E+0, FIVE = 5.0E+0, $ NEGONE = -1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IAM, INFO, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, J, MINSIZE, MQ, MYCOL, MYROW, $ NP, NPCOL, NPROW, NQ, RESAQ, RESQTQ, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEV, SIZESYEVX, $ SIZETMS, SIZETST, SIZESYEVD, ISIZESYEVD, $ TRILWMIN REAL EPS, EPSNORMA, ERROR, MAXERROR, MINERROR, $ NORMWIN, SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH, PSLANSY EXTERNAL LSAME, NUMROC, PSLAMCH, PSLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, IGAMN2D, IGAMX2D, $ PSCHEKPAD, PSELSET, PSFILLPAD, PSLASIZESQP, $ PSSEPCHK, PSSEPQTQ, PSSYEVD, SGAMN2D, SGAMX2D, $ SLACPY, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DT_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PSLASIZESQP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESYEV, $ SIZESYEVD, ISIZESYEVD, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PSLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1+IPREPAD ) ), $ ABS( WIN( N+IPREPAD ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3E+0 10 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159E+0 30 CONTINUE * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * Find the amount of workspace needed with or without eigenvectors. * TRILWMIN = 3*N + MAX( DESCA( NB_ )*( NP+1 ), 3*DESCA( NB_ ) ) MINSIZE = MAX( 1 + 6*N + 2*NP*NQ, TRILWMIN ) + 2*N * CALL SLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PSFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0E+0 ) * * Make sure that PSSYEVD does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, N, 1 CALL PSELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0E+0 ) 50 CONTINUE 60 CONTINUE * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PSSYEVD( 'V', UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), LWORK1, IWORK( 1+IPREPAD ), $ LIWORK, INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVD-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PSCHEKPAD( DESCZ( CTXT_ ), 'PSSYEVD-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVD-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVD-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )INFO IF( INFO.EQ.(N+1) ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 END IF ELSE IF( INFO.EQ.14 .AND. LWORK1.GE.MINSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * IF( RESULT.EQ.0 .OR. INFO.GT.N ) THEN * * Make sure that different processes return the same eigenvalues. * This is a more exhaustive check that provided by PSSYEVD. * DO 70 I = 1, N WORK( I ) = WNEW( I+IPREPAD ) WORK( I+N ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', N, 1, WORK, N, 1, $ 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', N, 1, $ WORK( 1+N ), N, 1, 1, -1, -1, 0 ) * DO 80 I = 1, N * IF( ABS( WORK( I )-WORK( N+I ) ).GT.ZERO ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 GO TO 90 END IF 80 CONTINUE 90 CONTINUE END IF * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PSLANSY( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ WORK )*EPS END IF * * Note that a couple key variables get redefined in PSSEPCHK * as described by this table: * * PSSEPTST name PSSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * * * Perform the |AQ - QE| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3E+0 ) * RESAQ = 0 * CALL PSSEPCHK( N, N, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RESAQ ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSEPCHK-WORK', SIZECHK, 1, $ WORK, SIZECHK, IPREPAD, IPOSTPAD, 4.3E+0 ) * IF( RESAQ.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9993 ) END IF * * Perform the |QTQ - I| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1, WORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3E+0 ) * RESQTQ = 0 * * DO 40 I = 1, 2 IWORK( IPREPAD + I ) = 0 40 CONTINUE CALL PSSEPQTQ( N, N, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1 ), IWORK( 1 ), WORK( 1 ), $ WORK( IPREPAD+1 ), SIZEQTQ, QTQNRM, INFO, $ RESQTQ ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSEPQTQ-WORK', SIZEQTQ, 1, $ WORK, SIZEQTQ, IPREPAD, IPOSTPAD, 4.3E+0 ) * IF( RESQTQ.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9992 ) END IF * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF ENDIF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN .AND. N.GT.0 ) THEN * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN MAXERROR = 0 * cc CALL SLASRT( 'I', N,WNEW( IPREPAD +1 ), INFO ) c DO 140 I = 1, N ERROR = ABS( WIN( I+IPREPAD )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 140 CONTINUE MINERROR = MIN( MAXERROR, MINERROR ) * IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * * RETURN * 9999 FORMAT( 'PSSYEVD returned INFO=', I7 ) 9998 FORMAT( 'PSSEPQTQ in PSSDPSUBTST returned INFO=', I7 ) 9997 FORMAT( 'PSSDPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PSSYEVD returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'Different processes return different eigenvalues' ) 9994 FORMAT( 'Heterogeneity detected by PSSYEVD' ) 9993 FORMAT( 'PSSYEVD failed the |AQ -QE| test' ) 9992 FORMAT( 'PSSYEVD failed the |QTQ -I| test' ) * * End of PSSDPSUBTST * END scalapack-2.0.2/TESTING/EIG/pssepchk.f000644 000766 000024 00000024047 11750130340 017422 0ustar00juliestaff000000 000000 * * SUBROUTINE PSSEPCHK( MS, NV, A, IA, JA, DESCA, EPSNORMA, THRESH, $ Q, IQ, JQ, DESCQ, C, IC, JC, DESCC, W, WORK, $ LWORK, TSTNRM, RESULT ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER IA, IC, IQ, JA, JC, JQ, LWORK, MS, NV, RESULT REAL EPSNORMA, THRESH, TSTNRM * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCC( * ), DESCQ( * ) REAL A( * ), C( * ), Q( * ), W( * ), WORK( * ) * .. * * Purpose * ======= * * Compute |AQ- QL| / (EPSNORMA * N) * where EPSNORMA = (abstol + eps)*norm(A) when called by pdsqpsubtst. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * MP = number of local rows in A, C and Q * MQ = number of local columns in A * NQ = number of local columns in C and Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in A, C and Q * Also, the number of global columns in A * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q. * * A (local input) REAL pointer to an * array in local memory of dimension (LLD_A, LOCc(JA+N-1)). * This array contains the local pieces of the MS-by-MS * distributed test matrix A * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * EPSNORMA (input) REAL * abstol + eps * inf.norm(A) * Abstol is absolute tolerence for the eigenvalues and is set * in the calling routines, pdsepsubtst and pdsqpsubtst. * * THRESH (input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) REAL array * global dimension (MS, NV), local dimension (DESCA(DLEN_), NQ) * * Contains the eigenvectors as computed by PSSYEVX * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Q. * * C (local workspace) REAL array, * global dimension (NV, NV), local dimension (DESCA(DLEN_), MQ) * * Accumulator for computing AQ -QL * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * W (global input) REAL array, dimension (NV) * * Contains the computed eigenvalues * * WORK (local workspace) REAL array, * dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * TSTNRM (global output) REAL * |AQ- QL| / ( EPSNROMA * MS ) * * RESULT (global output) INTEGER * 0 if the test passes i.e. * |AQ -QL| / (abstol + eps * norm(A) ) <= n* THRESH * 1 if the test fails i.e. * |AQ -QL| / (abstol + eps * norm(A) ) > n * THRESH * * .. Local Scalars .. * INTEGER INFO, J, LOCALCOL, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ, PCOL REAL NORM * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, NEGONE PARAMETER ( ONE = 1.0E+0, NEGONE = -1.0E+0 ) * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC REAL PSLANGE EXTERNAL INDXG2L, INDXG2P, NUMROC, PSLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PSGEMM, PXERBLA, $ SLACPY, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * RESULT = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 CALL CHK1MAT( MS, 1, MS, 1, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IQ, JQ, DESCQ, 12, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IC, JC, DESCC, 16, INFO ) * IF( INFO.EQ.0 ) THEN * MP = NUMROC( MS, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IF( IQ.NE.1 ) THEN INFO = -10 ELSE IF( JQ.NE.1 ) THEN INFO = -11 ELSE IF( IA.NE.1 ) THEN INFO = -4 ELSE IF( JA.NE.1 ) THEN INFO = -5 ELSE IF( IC.NE.1 ) THEN INFO = -14 ELSE IF( JC.NE.1 ) THEN INFO = -15 ELSE IF( LWORK.LT.NQ ) THEN INFO = -19 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PSSEPCHK', -INFO ) RETURN END IF * * C = Q * W * CALL SLACPY( 'A', MP, NQ, Q, DESCQ( LLD_ ), C, DESCC( LLD_ ) ) * * DO 10 J = 1, NV PCOL = INDXG2P( J, DESCC( NB_ ), 0, 0, NPCOL ) LOCALCOL = INDXG2L( J, DESCC( NB_ ), 0, 0, NPCOL ) * IF( MYCOL.EQ.PCOL ) THEN CALL SSCAL( MP, W( J ), C( ( LOCALCOL-1 )*DESCC( LLD_ )+1 ), $ 1 ) END IF 10 CONTINUE * * * C = C - A * Q * CALL PSGEMM( 'N', 'N', MS, NV, MS, NEGONE, A, 1, 1, DESCA, Q, 1, $ 1, DESCQ, ONE, C, 1, 1, DESCC ) * * Compute the norm of C * * NORM = PSLANGE( 'M', MS, NV, C, 1, 1, DESCC, WORK ) * * TSTNRM = NORM / EPSNORMA / MAX( MS, 1 ) * IF( TSTNRM.GT.THRESH .OR. ( TSTNRM-TSTNRM.NE.0.0E0 ) ) THEN RESULT = 1 END IF * * RETURN * * End of PSSEPCHK * END scalapack-2.0.2/TESTING/EIG/pssepdriver.f000644 000766 000024 00000025157 10363532303 020157 0ustar00juliestaff000000 000000 * * PROGRAM PSSEPDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Parallel REAL symmetric eigenproblem test driver * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. INTSIZ and REALIZ * indicate the length in bytes on the given platform for an integer * and a double precision real. * For example, on our system with 8 MB of memory, TOTMEM=6500000 * (leaves 1.5 MB for OS, code, BLACS buffer, etc), the length of a * REAL is 8, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * The full tester requires approximately (5 n + 5 n^2/p + slop) * REAL words and 6*n integer words. * So, TOTMEM should be set to at least 1.1 * 8 * (5n + 5n^2/p) * * WHAT WE TEST * ============ * * This routine tests PSSYEVX, the expert driver for the parallel * symmetric eigenvalue problem, and PSSYEV. We would like to cover * all possible combinations of: matrix size, process configuration * (nprow and npcol), block size (nb), matrix type (??), range * of eigenvalue (all, by value, by position), sorting options, * and upper vs. lower storage. * * As PSSYEV returns an error message when heterogeneity is detected, * the PSSYEV tests can be suppressed by changing the appropiate * entry in the input file. * * We intend to provide two types of test input files, an * installation test and a thorough test. * * We also intend that the reports be meaningful. Our input file * will allow multiple requests where each request is a cross product * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, REALSZ, NIN PARAMETER ( TOTMEM = 2000000, REALSZ = 8, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / REALSZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) REAL MEM( MEMSIZ ) * .. * .. External Functions .. REAL SLAMCH EXTERNAL SLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PSLACHKIEEE, PSLASNBT, PSSEPREQ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PSLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) END IF * CALL PSLACHKIEEE( ISIEEE, SLAMCH( 'O' ), SLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) END IF GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK symmetric Eigendecomposition routines.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'symmetric eigenvalue routine: PSSYEVX & PSSYEV & ' // $ ' PSSYEVD.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 )' ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )' ||Q^T*Q - I|| ' // '/ (N * eps)' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ ' : the QTQ norm is allowed to exceed THRESH' // $ ' for those eigenvectors' WRITE( NOUT, FMT = 9999 )' : which could not be ' // $ 'reorthogonalized for lack of workspace.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see PSSEPtst.f).' WRITE( NOUT, FMT = 9999 )'SUB : Subtests ' // $ '(see PSSEPtst).f' WRITE( NOUT, FMT = 9999 )'CHK : ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )'QTQ : ||Q^T*Q - I||/ (N * eps)' WRITE( NOUT, FMT = 9999 ) $ ' : when the adjusted QTQ exceeds THRESH', $ ' the adjusted QTQ norm is printed' WRITE( NOUT, FMT = 9999 ) $ ' : otherwise the true QTQ norm is printed' WRITE( NOUT, FMT = 9999 ) $ ' If NT>1, CHK and QTQ are the max over all ' // $ 'eigen request tests' WRITE( NOUT, FMT = 9999 ) $ 'TEST : EVX - testing PSSYEVX, EV - testing PSSYEV' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 1454 ISEED( 2 ) = 3834 ISEED( 3 ) = 2203 ISEED( 4 ) = 583 * CALL PSSEPREQ( HETERO, NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * 9999 FORMAT( A ) 9998 FORMAT( ' I am about to check to make sure that overflow' ) 9997 FORMAT( ' is handled in the ieee default manner. If this' ) 9996 FORMAT( ' is the last output you see, you should assume' ) 9995 FORMAT( ' that overflow caused a floating point exception.' ) 9994 FORMAT( ' In that case, we recommend that you add -DNO_IEEE' ) 9993 FORMAT( ' to the CDEFS line in SLmake.inc.' ) 9992 FORMAT( ' Alternatively, you could set CDEFS in SLmake.inc ' ) 9991 FORMAT( ' to enable the default ieee behaviour, However, this' ) 9990 FORMAT( ' may result in good or very bad performance.' ) 9989 FORMAT( ' Either signed zeroes or signed infinities ' ) 9988 FORMAT( ' work incorrectly or your system. Change your' ) 9987 FORMAT( ' SLmake.inc as suggested above.' ) * 9986 FORMAT( ' Your system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP SUB WALL CPU ', $ ' CHK QTQ CHECK TEST' ) 9978 FORMAT( ' ----- --- --- --- --- --- -------- --------', $ ' --------- --------- ----- ----' ) * * End of PSSEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pssepinfo.f000644 000766 000024 00000030565 11622500733 017617 0ustar00juliestaff000000 000000 * * SUBROUTINE PSSEPINFO( CONTEXT, IAM, NIN, NOUT, MAXSETSIZE, $ NMATSIZES, MATSIZES, NUPLOS, UPLOS, $ NPCONFIGS, NPROWS, NPCOLS, NBS, NMATTYPES, $ MATTYPES, MAXTYPE, SUBTESTS, THRESH, ORDER, $ ABSTOL, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======= * * PSSEPINFO reads the input test data file (INFILE), copies the * information therein to all processes and returns this information * in the corresponding parameters. * * Arguments * ========= * * CONTEXT (global input) INTEGER * BLACS Context * * IAM (local input) INTEGER * process number. * IAM.EQ.0 on the proceesor that performs I/O * * NIN (global input) INTEGER * The unit number of the input file. * * NOUT (global output) INTEGER * The unit number for output file. * if NOUT = 6, ouput to screen, * if NOUT = 0, output to stderr * Only defined for process 0. * * MAXSETSIZE (global output) INTEGER * Maximum set size. Size of the following arrays: * MATSIZES, MATTYPES, NBS, NPCOLS, NPROWS * * NMATSIZES (global output) INTEGER * Number of matrix sizes to test * * MATSIZES (global output) INTEGER array dimension MAXSETSIZE * Matrix sizes to test * * NUPLOS (global output) INTEGER * Number of UPLO values to test * * UPLOS (global output) CHARACTER*1 array dimension 2 * Values of UPLO to test * * NPCONFIGS (global output) INTEGER * Number of process configuratins (NPROW, NPCOL, NB) * * NPROWS (global output) INTEGER array dimension MAXSETSIZE * Values of NPROW to test * * NPCOLS (global output) INTEGER array dimension MAXSETSIZE * Values of NPCOL to test * * NBS (global output) INTEGER array dimension MAXSETSIZE * Values of NB to test * * NMATTYPES (global output) INTEGER * Number of matrix types to test * * MATTYPES (global output) INTEGER array dimension MAXSETSIZE * Matrix types to test * Refer to PSSEPTST for a complete description of the * supported matrix types. * * MAXTYPE (global input) INTEGER * Maximum allowed matrix type * * SUBTESTS (global output) CHARACTER * 'N' = Do not perform subtests * 'Y' = Perfrom subtests * * * THRESH (global output) @(tupc) * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * ( THRESH is set to 1/10 of the value defined in the .dat * file when NOUT = 13. THRESH is set to 1/20 of the value * defined in the .dat file when NOUT = 14. This allows us * to specify more stringent criteria for our internal testing ) * * ORDER (global output) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global output) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PSSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * If ( ABSTOL .EQ. 0 in SEP.dat, it is set to * 2.0 * PSLAMCH( 'u' ) in this routine. * * INFO (global output) INTEGER * 0 = normal return * -1 = end of file * -2 = incorrrect data specification * * .. Scalar Arguments .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, INFO, MAXSETSIZE, MAXTYPE, NIN, $ NMATSIZES, NMATTYPES, NOUT, NPCONFIGS, NUPLOS, $ ORDER REAL ABSTOL, THRESH * .. * .. Array Arguments .. CHARACTER UPLOS( 2 ) INTEGER MATSIZES( MAXSETSIZE ), MATTYPES( MAXSETSIZE ), $ NBS( MAXSETSIZE ), NPCOLS( MAXSETSIZE ), $ NPROWS( MAXSETSIZE ) * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL TWO, TEN, TWENTY PARAMETER ( TWO = 2.0E0, TEN = 10.0E0, TWENTY = 20.0E0 ) * .. * .. Local Scalars .. CHARACTER*80 TESTSUMMRY INTEGER I, ISUBTESTS * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * * .. External Subroutines .. EXTERNAL IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * * .. Local Arrays .. INTEGER IUPLOS( 2 ) * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = 9997 )TESTSUMMRY TESTSUMMRY = ' ' READ( NIN, FMT = 9997 )TESTSUMMRY WRITE( NOUT, FMT = 9997 )TESTSUMMRY END IF * * assign a default INFO = 0 * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NMATSIZES CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NMATSIZES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NMATSIZES, 1, 0, 0 ) END IF IF( NMATSIZES.EQ.-1 ) THEN INFO = -1 GO TO 70 END IF IF( NMATSIZES.LT.1 .OR. NMATSIZES.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'Matrix size', NMATSIZES, 1, $ MAXSETSIZE END IF INFO = -2 GO TO 70 END IF * * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( MATSIZES( I ), I = 1, NMATSIZES ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NMATSIZES, MATSIZES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NMATSIZES, MATSIZES, 1, $ 0, 0 ) END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NUPLOS CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NUPLOS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NUPLOS, 1, 0, 0 ) END IF IF( NUPLOS.LT.1 .OR. NUPLOS.GT.2 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'# of UPLOs', NUPLOS, 1, 2 END IF INFO = -2 GO TO 70 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( UPLOS( I ), I = 1, NUPLOS ) DO 10 I = 1, NUPLOS IF( LSAME( UPLOS( I ), 'L' ) ) THEN IUPLOS( I ) = 1 ELSE IUPLOS( I ) = 2 END IF 10 CONTINUE CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NUPLOS, IUPLOS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NUPLOS, IUPLOS, 1, 0, 0 ) END IF DO 20 I = 1, NUPLOS IF( IUPLOS( I ).EQ.1 ) THEN UPLOS( I ) = 'L' ELSE UPLOS( I ) = 'U' END IF 20 CONTINUE * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NPCONFIGS CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NPCONFIGS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NPCONFIGS, 1, 0, 0 ) END IF IF( NPCONFIGS.LT.1 .OR. NPCONFIGS.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'# proc configs', NPCONFIGS, 1, $ MAXSETSIZE END IF INFO = -2 GO TO 70 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NPROWS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPROWS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPROWS, 1, 0, $ 0 ) END IF DO 30 I = 1, NPCONFIGS IF( NPROWS( I ).LE.0 ) $ INFO = -2 30 CONTINUE IF( INFO.EQ.-2 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9996 )' NPROW' END IF GO TO 70 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NPCOLS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPCOLS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPCOLS, 1, 0, $ 0 ) END IF DO 40 I = 1, NPCONFIGS IF( NPCOLS( I ).LE.0 ) $ INFO = -2 40 CONTINUE IF( INFO.EQ.-2 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9996 )' NPCOL' END IF GO TO 70 END IF * * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NBS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NBS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NBS, 1, 0, 0 ) END IF DO 50 I = 1, NPCONFIGS IF( NBS( I ).LE.0 ) $ INFO = -2 50 CONTINUE IF( INFO.EQ.-2 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9996 )' NB' END IF GO TO 70 END IF * * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NMATTYPES CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NMATTYPES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NMATTYPES, 1, 0, 0 ) END IF IF( NMATTYPES.LT.1 .OR. NMATTYPES.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'matrix types', NMATTYPES, 1, $ MAXSETSIZE END IF INFO = -2 GO TO 70 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( MATTYPES( I ), I = 1, NMATTYPES ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NMATTYPES, MATTYPES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NMATTYPES, MATTYPES, 1, $ 0, 0 ) END IF * DO 60 I = 1, NMATTYPES IF( MATTYPES( I ).LT.1 .OR. MATTYPES( I ).GT.MAXTYPE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'matrix type', MATTYPES( I ), $ 1, MAXTYPE END IF MATTYPES( I ) = 1 END IF 60 CONTINUE * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )SUBTESTS IF( LSAME( SUBTESTS, 'Y' ) ) THEN ISUBTESTS = 2 ELSE ISUBTESTS = 1 END IF CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, ISUBTESTS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, ISUBTESTS, 1, 0, 0 ) END IF IF( ISUBTESTS.EQ.2 ) THEN SUBTESTS = 'Y' ELSE SUBTESTS = 'N' END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )THRESH IF( NOUT.EQ.13 ) $ THRESH = THRESH / TEN IF( NOUT.EQ.14 ) $ THRESH = THRESH / TWENTY CALL SGEBS2D( CONTEXT, 'All', ' ', 1, 1, THRESH, 1 ) ELSE CALL SGEBR2D( CONTEXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) END IF * ORDER = 0 * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )ABSTOL CALL SGEBS2D( CONTEXT, 'All', ' ', 1, 1, ABSTOL, 1 ) ELSE CALL SGEBR2D( CONTEXT, 'All', ' ', 1, 1, ABSTOL, 1, 0, 0 ) END IF IF( ABSTOL.LT.0 ) $ ABSTOL = TWO*PSLAMCH( CONTEXT, 'U' ) * INFO = 0 * 70 CONTINUE RETURN * 9999 FORMAT( A20, ' is:', I5, ' must be between:', I5, ' and', I5 ) 9998 FORMAT( A20, ' is:', I5, ' must be:', I5, ' or', I5 ) 9997 FORMAT( A ) 9996 FORMAT( A20, ' must be positive' ) * * End of PSSEPINFO * END scalapack-2.0.2/TESTING/EIG/pssepqtq.f000644 000766 000024 00000025211 10363532303 017460 0ustar00juliestaff000000 000000 * * SUBROUTINE PSSEPQTQ( MS, NV, THRESH, Q, IQ, JQ, DESCQ, C, IC, JC, $ DESCC, PROCDIST, ICLUSTR, GAP, WORK, LWORK, $ QTQNRM, INFO, RES ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IC, INFO, IQ, JC, JQ, LWORK, MS, NV, RES REAL QTQNRM, THRESH * .. * .. Array Arguments .. * INTEGER DESCC( * ), DESCQ( * ), ICLUSTR( * ), $ PROCDIST( * ) REAL C( * ), GAP( * ), Q( * ), WORK( * ) * .. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Purpose * ======= * * Compute |I - QT * Q| / (ulp * n) * * Arguments * ========= * * NP = number of local rows in C * NQ = number of local columns in C and Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in Q * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) REAL array, * global dimension (MS, NV), local dimension (LDQ, NQ) * * Contains the eigenvectors as computed by PSSTEIN * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Q. * * C (local workspace) REAL array, * global dimension (NV, NV), local dimension (DESCC(DLEN_), NQ) * * Accumulator for computing I - QT * Q * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * W (input) REAL array, dimension (NV) * All procesors have an identical copy of W() * * Contains the computed eigenvalues * * PROCDIST (global input) INTEGER array dimension (NPROW*NPCOL+1) * Identifies which eigenvectors are the last to be computed * by a given process * * ICLUSTR (global input) INTEGER array dimension (2*P) * This input array contains indices of eigenvectors * corresponding to a cluster of eigenvalues that could not be * orthogonalized due to insufficient workspace. * This should be the output of PSSTEIN. * * GAP (global input) REAL array, dimension (P) * This input array contains the gap between eigenvalues whose * eigenvectors could not be orthogonalized. * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= 2 + MAX( DESCC( MB_ ), 2 )*( 2*NP0+MQ0 ) * Where: * NP0 = NUMROC( NV, DESCC( MB_ ), 0, 0, NPROW ) * MQ0 = NUMROC( NV, DESCC( NB_ ), 0, 0, NPCOL ) * * QTQNRM (global output) REAL * |QTQ -I| / EPS * * RES (global output) INTEGER * 0 if the test passes i.e. |I - QT * Q| / (ulp * n) <= THRESH * 1 if the test fails i.e. |I - QT * Q| / (ulp * n) > THRESH * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE, NEGONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, $ NEGONE = -1.0E+0 ) * .. * .. Intrinsic Functions .. * INTRINSIC MAX, REAL * .. * .. Local Scalars .. INTEGER CLUSTER, FIRSTP, IMAX, IMIN, JMAX, JMIN, LWMIN, $ MQ0, MYCOL, MYROW, NEXTP, NP0, NPCOL, NPROW REAL NORM, QTQNRM2, ULP * .. * .. External Functions .. INTEGER NUMROC REAL PSLAMCH, PSLANGE EXTERNAL NUMROC, PSLAMCH, PSLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PSGEMM, PSLASET, $ PSMATADD, PXERBLA * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * RES = 0 ULP = PSLAMCH( DESCC( CTXT_ ), 'P' ) * CALL BLACS_GRIDINFO( DESCC( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 CALL CHK1MAT( MS, 1, MS, 2, IQ, JQ, DESCQ, 7, INFO ) CALL CHK1MAT( NV, 1, MS, 2, IC, JC, DESCC, 11, INFO ) * IF( INFO.EQ.0 ) THEN NP0 = NUMROC( NV, DESCC( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( NV, DESCC( NB_ ), 0, 0, NPCOL ) * LWMIN = 2 + MAX( DESCC( MB_ ), 2 )*( 2*NP0+MQ0 ) * IF( IQ.NE.1 ) THEN INFO = -5 ELSE IF( JQ.NE.1 ) THEN INFO = -6 ELSE IF( IC.NE.1 ) THEN INFO = -9 ELSE IF( JC.NE.1 ) THEN INFO = -10 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -16 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCC( CTXT_ ), 'PSSEPQTQ', -INFO ) RETURN END IF * * C = Identity matrix * CALL PSLASET( 'A', NV, NV, ZERO, ONE, C, IC, JC, DESCC ) * * C = C - QT * Q * IF( NV*MS.GT.0 ) THEN CALL PSGEMM( 'Transpose', 'N', NV, NV, MS, NEGONE, Q, 1, 1, $ DESCQ, Q, 1, 1, DESCQ, ONE, C, 1, 1, DESCC ) END IF * * Allow for poorly orthogonalized eigenvectors for large clusters * NORM = PSLANGE( '1', NV, NV, C, 1, 1, DESCC, WORK ) QTQNRM = NORM / ( REAL( MAX( MS, 1 ) )*ULP ) * CLUSTER = 1 10 CONTINUE DO 20 FIRSTP = 1, NPROW*NPCOL IF( PROCDIST( FIRSTP ).GE.ICLUSTR( 2*( CLUSTER-1 )+1 ) ) $ GO TO 30 20 CONTINUE 30 CONTINUE * IMIN = ICLUSTR( 2*CLUSTER-1 ) JMAX = ICLUSTR( 2*CLUSTER ) * * IF( IMIN.EQ.0 ) $ GO TO 60 * DO 40 NEXTP = FIRSTP, NPROW*NPCOL IMAX = PROCDIST( NEXTP ) JMIN = IMAX + 1 * * CALL PSMATADD( IMAX-IMIN+1, JMAX-JMIN+1, ZERO, C, IMIN, JMIN, $ DESCC, GAP( CLUSTER ) / 0.01E+0, C, IMIN, JMIN, $ DESCC ) CALL PSMATADD( JMAX-JMIN+1, IMAX-IMIN+1, ZERO, C, JMIN, IMIN, $ DESCC, GAP( CLUSTER ) / 0.01E+0, C, JMIN, IMIN, $ DESCC ) IMIN = IMAX * IF( ICLUSTR( 2*CLUSTER ).LT.PROCDIST( NEXTP+1 ) ) $ GO TO 50 40 CONTINUE 50 CONTINUE * CLUSTER = CLUSTER + 1 GO TO 10 60 CONTINUE * * Compute the norm of C * NORM = PSLANGE( '1', NV, NV, C, 1, 1, DESCC, WORK ) * QTQNRM2 = NORM / ( REAL( MAX( MS, 1 ) )*ULP ) * IF( QTQNRM2.GT.THRESH ) THEN RES = 1 QTQNRM = QTQNRM2 END IF RETURN * * End of PSSEPQTQ * END scalapack-2.0.2/TESTING/EIG/psseprdriver.f000644 000766 000024 00000021335 11623527140 020336 0ustar00juliestaff000000 000000 PROGRAM PSSEPRDRIVER * * Parallel REAL symmetric eigenproblem test driver for PSSYEVR * IMPLICIT NONE * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. REALSZ * indicates the length in bytes on the given platform for a number, * real for SINGLE/DOUBLE PRECISION, and complex for COMPLEX/COMPLEX*16. * For example, on a standard system, the length of a * REAL is 4, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * TESTS PERFORMED * =============== * * This routine performs tests for combinations of: matrix size, process * configuration (nprow and npcol), block size (nb), * matrix type, range of eigenvalue (all, by value, by index), * and upper vs. lower storage. * * It returns an error message when heterogeneity is detected. * * The input file allows multiple requests where each one is * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, REALSZ, NIN PARAMETER ( TOTMEM = 100000000, REALSZ = 4, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / REALSZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) REAL MEM( MEMSIZ ) * .. * .. External Functions .. REAL SLAMCH EXTERNAL SLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PSLACHKIEEE, PSLASNBT, PSSEPRREQ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEPR.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PSLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) END IF * CALL PSLACHKIEEE( ISIEEE, SLAMCH( 'O' ), SLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'Test ScaLAPACK symmetric eigendecomposition routine.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'symmetric eigenvalue routine: PSSYEVR.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 )' ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )' ||Q^T*Q - I|| ' // '/ (N * eps)' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT = PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see PSSEPRTST).' WRITE( NOUT, FMT = 9999 )'SUB : Subtests (Y/N).' WRITE( NOUT, FMT = 9999 )'WALL : Wallclock time.' WRITE( NOUT, FMT = 9999 )'CPU : CPU time.' WRITE( NOUT, FMT = 9999 )'CHK : ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )'QTQ : ||Q^T*Q - I||/ (N * eps)' WRITE( NOUT, FMT = 9999 ) $ ' : when the adjusted QTQ norm exceeds THRESH', $ ' it is printed,' WRITE( NOUT, FMT = 9999 ) $ ' otherwise the true QTQ norm is printed.' WRITE( NOUT, FMT = 9999 ) $ ' : If more than one test is done, CHK and QTQ ' WRITE( NOUT, FMT = 9999 ) $ ' are the max over all eigentests performed.' WRITE( NOUT, FMT = 9999 ) $ 'TEST : EVR - testing PSSYEVR' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PSSEPRREQ( HETERO, NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * c CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', ' ') * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * 9999 FORMAT( A ) 9997 FORMAT( 'Check if overflow is handled in ieee default manner.' ) 9996 FORMAT( 'If this is the last output you see, you should assume') 9995 FORMAT( 'that overflow caused a floating point exception.' ) * 9986 FORMAT( 'Test ok. The system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP SUB WALL CPU ', $ ' CHK QTQ CHECK TEST' ) 9978 FORMAT( ' ----- --- --- --- --- --- -------- --------', $ ' --------- --------- ----- ----' ) * * End of PSSEPRDRIVER * END scalapack-2.0.2/TESTING/EIG/pssepreq.f000644 000766 000024 00000022237 10363532303 017447 0ustar00juliestaff000000 000000 * * SUBROUTINE PSSEPREQ( HETERO, NIN, MEM, MEMSIZE, NOUT, ISEED, $ NTESTS, NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER HETERO INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL MEM( MEMSIZE ) * .. * * Purpose * ======= * * PSSEPREQ performs one request from the input file 'SEP.dat' * A request is the cross product of the specifications in the * input file. PSSEPREQ prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEP.dat' * * MEM (local input) REAL ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER REALSZ, INTGSZ PARAMETER ( REALSZ = 4, INTGSZ = 4 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IMIDPAD, INITCON, IPOSTPAD, $ IPREPAD, ISIZESUBTST, ISIZESYEVX, ISIZETST, $ LDA, LLWORK, MATSIZE, MATTYPE, MYCOL, MYROW, N, $ NB, NMATSIZES, NMATTYPES, NNODES, NP, NPCOL, $ NPCONFIGS, NPROW, NQ, NUPLOS, ORDER, PCONFIG, $ PTRA, PTRCOPYA, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRW, PTRW2, PTRWORK, PTRZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEV, SIZESYEVX, $ SIZETMS, SIZETST, UPLO, SIZESYEVD, ISIZESYEVD REAL ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PSLASIZESQP, PSSEPINFO, PSSEPTST * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PSSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * * DO 40 MATSIZE = 1, NMATSIZES * DO 30 PCONFIG = 1, NPCONFIGS * DO 20 MATTYPE = 1, NMATTYPES * DO 10 UPLO = 1, NUPLOS * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, $ MYCOL ) * IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PSLASIZESQP( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, $ ISIZESYEVX, SIZESYEV, $ SIZESYEVD, ISIZESYEVD, $ SIZESUBTST, ISIZESUBTST, $ SIZETST, ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + MAX( N, 1 ) + IPREPAD + IPOSTPAD PTRGAP = PTRW2 + MAX( N, 1 ) + IPREPAD + $ IPOSTPAD PTRIFAIL = PTRGAP + NPROW*NPCOL + IPREPAD + $ IPOSTPAD PTRICLUS = PTRIFAIL + ICEIL( N+IPREPAD+IPOSTPAD, $ REALSZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, REALSZ / INTGSZ ) PTRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, REALSZ / INTGSZ ) LLWORK = MEMSIZE - PTRWORK + 1 C LLWORK = MEMSIZE - PTRWORK - IPREPAD - C $ IPOSTPAD + 1 NTESTS = NTESTS + 1 IF( LLWORK.LT.SIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PSSEPTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), SUBTESTS, $ THRESH, N, ABSTOL, ISEED, $ MEM( PTRA ), MEM( PTRCOPYA ), $ MEM( PTRZ ), LDA, MEM( PTRW ), $ MEM( PTRW2 ), MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ LLWORK, MEM( PTRIWRK ), $ ISIZETST, HETERO, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * )' PSSEPREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE END IF * * RETURN * * End of PSSEPREQ * END scalapack-2.0.2/TESTING/EIG/psseprreq.f000644 000766 000024 00000021016 11623527140 017626 0ustar00juliestaff000000 000000 SUBROUTINE PSSEPRREQ( HETERO, NIN, MEM, MEMSIZE, NOUT, ISEED, $ NTESTS, NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER HETERO INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) REAL MEM( MEMSIZE ) * * Purpose * ======= * * PSSEPRREQ performs one request from the input file 'SEPR.dat' * A request is the cross product of the specifications in the * input file. It prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEPR.dat' * * MEM (local input ) REAL ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER REALSZ, INTGSZ PARAMETER ( REALSZ = 4, INTGSZ = 4 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IMIDPAD, INITCON, IPOSTPAD, $ IPREPAD, ISIZESUBTST, ISIZEEVR, ISIZETST, $ LDA, LLWORK, MATSIZE, MATTYPE, MYCOL, MYROW, N, $ NB, NMATSIZES, NMATTYPES, NNODES, NP, NPCOL, $ NPCONFIGS, NPROW, NQ, NUPLOS, ORDER, PCONFIG, $ PTRA, PTRCOPYA, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRW, PTRW2, PTRWORK, PTRZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZEEVR, $ SIZETMS, SIZETST, UPLO * REAL ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PSLASIZESEPR, PSSEPINFO, PSSEPRTST * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PSSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * DO 40 MATSIZE = 1, NMATSIZES * DO 30 PCONFIG = 1, NPCONFIGS * DO 20 MATTYPE = 1, NMATTYPES * DO 10 UPLO = 1, NUPLOS * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, $ MYCOL ) * IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PSLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, ISIZEEVR, $ SIZESUBTST, ISIZESUBTST, $ SIZETST, ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + MAX( N, 1 ) + IPREPAD + IPOSTPAD PTRGAP = PTRW2 + MAX( N, 1 ) + IPREPAD + $ IPOSTPAD PTRIFAIL = PTRGAP + NPROW*NPCOL + IPREPAD + $ IPOSTPAD PTRICLUS = PTRIFAIL + ICEIL( N+IPREPAD+IPOSTPAD, $ REALSZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, REALSZ / INTGSZ ) PTRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, REALSZ / INTGSZ ) LLWORK = MEMSIZE - PTRWORK + 1 NTESTS = NTESTS + 1 IF( LLWORK.LT.SIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PSSEPRTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), SUBTESTS, $ THRESH, N, ABSTOL, ISEED, $ MEM( PTRA ), MEM( PTRCOPYA ), $ MEM( PTRZ ), LDA, MEM( PTRW ), $ MEM( PTRW2 ), MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ LLWORK, MEM( PTRIWRK ), $ ISIZETST, HETERO, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * )' PSSEPRREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of PSSEPRREQ * END scalapack-2.0.2/TESTING/EIG/psseprsubtst.f000644 000766 000024 00000070321 11623527140 020366 0ustar00juliestaff000000 000000 SUBROUTINE PSSEPRSUBTST( WKNOWN, JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, IA, JA, $ DESCA, WIN, WNEW, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK, LWORK, LWORK1, $ IWORK, LIWORK, RESULT, TSTNRM, QTQNRM, $ NOUT ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, IPOSTPAD, IPREPAD, IU, JA, LIWORK, $ LWORK, LWORK1, N, NOUT, RESULT REAL ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) REAL A( * ), COPYA( * ), GAP( * ), WIN( * ), $ WNEW( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSSEPRSUBTST calls PSSYEVR and then tests its output. * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues computed. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 100 or 250. In particular, * it should not depend on the size of the matrix. * It must be at least zero. * * ABSTOL (global input) REAL * The absolute tolerance for the residual test. * * A (local workspace) REAL array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * The test matrix, which is subsequently overwritten. * A is distributed in a 2D-block cyclic manner over both rows * and columns. * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) REAL array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) REAL array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PSSEPCHK and PSSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) REAL array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) REAL array, dimension (N) * The computed eigenvalues. * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) REAL array, * dimension (NPROW*NPCOL) * * WORK (local workspace) REAL array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to the eigensolver. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call. * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) REAL * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) REAL * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER DLEN_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( DLEN_ = 9, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285E0, FIVE = 5.0E0, $ NEGONE = -1.0E0 ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZESUBTST, ISIZEEVR, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZEEVR, SIZETMS, $ SIZETST, VALSIZE, VECSIZE REAL EPS, EPSNORMA, ERROR, MAXERROR, MAXVU, $ MINERROR, MINVL, NORMWIN, OLDVL, OLDVU, $ SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), ISEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH, PSLANSY EXTERNAL LSAME, NUMROC, PSLAMCH, PSLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, IGAMN2D, IGAMX2D, $ PICHEKPAD, PIFILLPAD, PSCHEKPAD, PSELSET, $ PSFILLPAD, PSLASIZESEPR, PSLASIZESYEVR, $ PSSEPCHK, PSSEPQTQ, PSSYEVR, SGAMN2D, SGAMX2D, $ SLACPY, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * CALL PSLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, ISIZEEVR, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PSLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that no information from previous calls is used * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3E0 10 CONTINUE * DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159E0 30 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF (LSAME( RANGE, 'V' ) ) THEN * WRITE(*,*) 'VL VU = ', VL, ' ', VU END IF IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL * WRITE(*,*) 'MINVL = ', MINVL, ' MAXVU = ', MAXVU * WRITE(*,*) 'WIN = ', WIN( 1 ) MINIL = 1 MAXIU = 0 DO 40 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 40 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * ISEED( 1 ) = 1 * CALL PSLASIZESYEVR( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL SLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PSFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0E0 ) * * WRITE(*,*) ' NP = ', NP, ' MQ = ', MQ, ' LDZ = ', DESCZ( LLD_ ), * $ ' IPREPAD = ', IPREPAD, ' IPOSTPAD = ', IPOSTPAD, * $ ' MAXEIGS = ', MAXEIGS * WRITE(*,*) ' PADZ( 1 ) = ', Z( 1 ), ' PADZ( 2 ) = ', Z( 2 ), * $ ' PADZ( 3 ) = ', Z( 3 ), ' PADZ( 4 ) = ', Z( 4 ) * CALL PSFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0E0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0E0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0E0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * * Make sure that PSSYEVR does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, MAXEIGS, 1 CALL PSELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0E0 ) 50 CONTINUE 60 CONTINUE * * Reset and start the timer * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) ********************************* * * Main call to PSSYEVR * CALL PSSYEVR( JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ VL, VU, IL, IU, M, NZ, WNEW( 1+IPREPAD ), $ Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), LWORK1, IWORK( 1+IPREPAD ), $ LIWORK, INFO ) * ********************************* * * Stop timer * CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * * Indicate that there are no unresolved clusters. * This is necessary so that the tester * (adapted from the one originally made for PSSYEVX) * works correctly. ICLUSTR( 1+IPREPAD ) = 0 * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVR-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PSCHEKPAD( DESCZ( CTXT_ ), 'PSSYEVR-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0E0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVR-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0E0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVR-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0E0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVR-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0E0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PSSYEVR-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PSSYEVR-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PSSYEVR-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * If we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PSLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * Check INFO * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) WRITE( NOUT,*) 'M = ', M, '\n', 'N = ', N RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) WRITE( NOUT,*) 'IL = ', IL, ' IU = ', IU, ' M = ', M END IF RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Ensure that different processes return the same eigenvalues * DO 70 I = 1, M WORK( I ) = WNEW( I+IPREPAD ) WORK( I+M ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, WORK, M, 1, $ 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ WORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 80 I = 1, M IF( RESULT.EQ.0 .AND. ( ABS( WORK( I )-WORK( M+ $ I ) ).GT.FIVE*EPS*ABS( WORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 80 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 90 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 100 NCLUSTERS = NCLUSTERS + 1 90 CONTINUE 100 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 110 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 110 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * DO 120 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 120 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 150 * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PSLANSY( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ WORK )*EPS END IF * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |A Z - Z W| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3E0 ) * CALL PSSEPCHK( N, NZ, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSEPCHK-WORK', SIZECHK, 1, $ WORK, SIZECHK, IPREPAD, IPOSTPAD, 4.3E0 ) * IF( RES.NE.0 ) $ RESULT = 1 * * Perform the |QTQ - I| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1, WORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3E0 ) * * CALL PSSEPQTQ( N, NZ, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1+IPREPAD+1 ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), WORK( IPREPAD+1 ), SIZEQTQ, $ QTQNRM, INFO, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSEPQTQ-WORK', SIZEQTQ, 1, $ WORK, SIZEQTQ, IPREPAD, IPOSTPAD, 4.3E0 ) * IF( RES.NE.0 ) $ RESULT = 1 * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that the right eigenvalues have been obtained * IF( WKNOWN ) THEN * Set up MYIL if necessary MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 140 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 130 I = 1, M * WRITE(*,*) 'WIN WNEW = ', WIN( I+MYIL-1 ), * $ WNEW( I+IPREPAD ) ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 130 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 140 CONTINUE * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what was computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * RETURN * 9999 FORMAT( 'PSSYEVR returned INFO=', I7 ) 9998 FORMAT( 'PSSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PSSEPRSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PSSYEVR returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PSSYEVR' ) 9981 FORMAT( 'NZ altered by PSSYEVR with JOBZ=N' ) * * End of PSSEPRSUBTST * END scalapack-2.0.2/TESTING/EIG/psseprtst.f000644 000766 000024 00000072031 11623527140 017654 0ustar00juliestaff000000 000000 SUBROUTINE PSSEPRTST(DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, $ ORDER, ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, $ IWORK, LIWORK, HETERO, NOUT, INFO ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER HETERO, SUBTESTS, UPLO INTEGER INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, LWORK, $ MATTYPE, N, NOUT, ORDER REAL ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) REAL A( LDA, * ), COPYA( LDA, * ), GAP( * ), $ WIN( * ), WNEW( * ), WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PSSEPRTST builds a random matrix and runs PSSYEVR to * compute the eigenvalues and eigenvectors. Then it performs two tests * to determine if the result is good enough. The two tests are: * |AQ -QL| / (abstol + ulp * norm(A) ) * and * |QT * Q - I| / ulp * norm(A) * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) A matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) REAL * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) REAL array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * The test matrix, which is then overwritten. * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * * COPYA (local workspace) REAL array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * Z (local workspace) REAL array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PSSEPCHK and PSSEPQTQ * * W (local workspace) REAL array, dimension (N) * On normal exit, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * Not used, only for backward compatibility * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PSLASIZESEPR * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PSLASIZESEPR * * HETERO (input) INTEGER * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER CTXT_, MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( CTXT_ = 2, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL HALF, ONE, TEN, ZERO PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, $ TEN = 10.0E0, HALF = 0.5E0 ) REAL PADVAL PARAMETER ( PADVAL = 19.25E0 ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IHETERO, IINFO, IL, IMODE, IN, $ INDD, INDWORK, ISIZESUBTST, ISIZEEVR, $ ISIZETST, ITYPE, IU, J, LLWORK, LEVRSIZE, $ MAXSIZE, MYCOL, MYROW, NB, NGEN, NLOC, $ NNODES, NP, NPCOL, NPROW, NQ, RES, SIZECHK, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, $ SIZESUBTST, SIZEEVR, SIZETMS, $ SIZETST, VALSIZE, VECSIZE REAL ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL SLARAN, PSLAMCH EXTERNAL SLARAN, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, IGAMX2D, IGEBR2D, $ IGEBS2D, PSCHEKPAD, PSELSET, PSFILLPAD, $ PSLASET, PSLASIZESEPR, PSLASIZESYEVR, PSLATMS, $ PSMATGEN, PSSEPRSUBTST, SLABAD, SLASRT, SLATMS, $ SLCOMBINE * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, INT, MAX, MIN, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * INFO = 0 PASSED = 'PASSED EVR' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * Distribute HETERO across processes * IF( IAM.EQ.0 ) THEN IF( LSAME( HETERO, 'Y' ) ) THEN IHETERO = 2 ELSE IHETERO = 1 END IF CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1, 0, 0 ) END IF IF( IHETERO.EQ.2 ) THEN HETERO = 'Y' ELSE HETERO = 'N' END IF * * Make sure that there is enough memory * CALL PSLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, ISIZEEVR, $ SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) IF( LWORK.LT.SIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDWORK = INDD + N LLWORK = LWORK - INDWORK + 1 * ULP = PSLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PSLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL SLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / REAL( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log symmetric, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random symmetric * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N WORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PSLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N WORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PSLASET( 'All', N, N, ZERO, ONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0E0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0E0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * symmetric, eigenvalues specified * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0E0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0E0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * symmetric, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PSMATGEN( DESCA( CTXT_ ), 'S', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0E0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0E0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PSLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( SLARAN( ISEED )*REAL( NLOC ) ), N-NGEN ) * CALL SLATMS( IN, IN, 'S', ISEED, 'P', WORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PSELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PSELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PSELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PSELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2,... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 WORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0E0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0E0 ) * ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL SLASRT( 'I', N, WORK( INDD ), IINFO ) * CALL PSLASIZESYEVR( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, WORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) LEVRSIZE = MIN( MAXSIZE, LLWORK ) * CALL PSSEPRSUBTST( WKNOWN, 'v', 'a', UPLO, N, VL, VU, IL, IU, $ THRESH, ABSTOL, A, COPYA, Z, 1, 1, DESCA, $ WORK( INDD ), WIN, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK( INDWORK ), LLWORK, $ LEVRSIZE, IWORK, ISIZEEVR, RES, TSTNRM, $ QTQNRM, NOUT ) * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PSLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PSSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 1' INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * Use PSLASIZESYEVR to choose IL and IU. * CALL PSLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PSSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 2' INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PSLASIZESYEVR to choose IL and IU for us. * CALL PSLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VECSIZE * CALL PSSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PSLASIZESYEVR to choose IL and IU for us. * CALL PSLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PSSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVR to choose VL and VU for us. * CALL PSLASIZESYEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VECSIZE * CALL PSSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF END IF * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 .AND. .FALSE. ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF C WRITE(*,*)'************************************************' END IF * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, $ F8.2, 1X, F8.2, 1X, G9.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 21X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 21X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 32X, $ 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) C 9984 FORMAT( ' IBTYPE=', I8 ) C 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) C 9980 FORMAT( ' Increase TOTMEM in PSSEPRDRIVER' ) * * End of PSSEPRTST * END scalapack-2.0.2/TESTING/EIG/pssepsubtst.f000644 000766 000024 00000067451 10363532303 020213 0ustar00juliestaff000000 000000 SUBROUTINE PSSEPSUBTST( WKNOWN, JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, IA, JA, $ DESCA, WIN, WNEW, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK, LWORK, LWORK1, $ IWORK, LIWORK, RESULT, TSTNRM, QTQNRM, $ NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 15, 2000 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, IPOSTPAD, IPREPAD, IU, JA, LIWORK, $ LWORK, LWORK1, N, NOUT, RESULT REAL ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) REAL A( * ), COPYA( * ), GAP( * ), WIN( * ), $ WNEW( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSSEPSUBTST calls PSSYEVX and then tests the output of * PSSYEVX * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PSSYEVX when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PSSEPSUBTST * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call to PSSEPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PSSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) REAL array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PSSYEVX for a description of block cyclic layout. * The test matrix, which is then modified by PSSYEVX * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) REAL array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) REAL array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PSSEPCHK and PSSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) REAL array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) REAL array, dimension (N) * The eigenvalues as copmuted by this call to PSSYEVX * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) REAL array, * dimension (NPROW*NPCOL) * * WORK (local workspace) REAL array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PSSYEVX * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PSSYEVX * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) REAL * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) REAL * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285E+0, FIVE = 5.0E+0, $ NEGONE = -1.0E+0 ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. INTEGER I, IAM, INDIWRK, INFO, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEVX, SIZETMS, $ SIZETST, VALSIZE, VECSIZE REAL EPS, EPSNORMA, ERROR, MAXERROR, MAXVU, $ MINERROR, MINVL, NORMWIN, OLDVL, OLDVU, ORFAC, $ SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), DSEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH, PSLANSY EXTERNAL LSAME, NUMROC, PSLAMCH, PSLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, IGAMN2D, IGAMX2D, $ PICHEKPAD, PIFILLPAD, PSCHEKPAD, PSELSET, $ PSFILLPAD, PSLASIZESEP, PSLASIZESYEVX, $ PSSEPCHK, PSSEPQTQ, PSSYEVX, SGAMN2D, SGAMX2D, $ SLACPY, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PSLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESUBTST, $ ISIZESUBTST, SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PSLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe min' ) NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * * Make sure that we aren't using information from previous calls * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3E+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159E+0 30 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL MINIL = 1 MAXIU = 0 DO 40 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 40 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * * * DSEED is not used in this call to PSLASIZESYEVX, the * following line just makes ftnchek happy. * DSEED( 1 ) = 1 * CALL PSLASIZESYEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL SLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PSFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0E+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * * Make sure that PSSYEVX does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, MAXEIGS, 1 CALL PSELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0E+0 ) 50 CONTINUE 60 CONTINUE * ORFAC = -1.0E+0 * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PSSYEVX( JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ VL, VU, IL, IU, ABSTOL, M, NZ, WNEW( 1+IPREPAD ), $ ORFAC, Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), LWORK1, IWORK( 1+IPREPAD ), $ LIWORK, IFAIL( 1+IPREPAD ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVX-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PSCHEKPAD( DESCZ( CTXT_ ), 'PSSYEVX-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVX-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVX-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEVX-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PSSYEVX-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PSSYEVX-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PSSYEVX-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * * Since we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9992 ) RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Make sure that different processes return the same eigenvalues * DO 70 I = 1, M WORK( I ) = WNEW( I+IPREPAD ) WORK( I+M ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, WORK, M, 1, $ 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ WORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 80 I = 1, M IF( RESULT.EQ.0 .AND. WORK( I ).NE.WORK( M+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 80 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 90 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 100 NCLUSTERS = NCLUSTERS + 1 90 CONTINUE 100 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 110 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 110 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * * DO 120 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 120 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 150 * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PSLANSY( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ WORK )*EPS END IF * * Note that a couple key variables get redefined in PSSEPCHK * as described by this table: * * PSSEPTST name PSSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |AQ - QE| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3E+0 ) * CALL PSSEPCHK( N, NZ, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSEPCHK-WORK', SIZECHK, 1, $ WORK, SIZECHK, IPREPAD, IPOSTPAD, 4.3E+0 ) * IF( RES.NE.0 ) $ RESULT = 1 * * Perform the |QTQ - I| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1, WORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3E+0 ) * * CALL PSSEPQTQ( N, NZ, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1+IPREPAD+1 ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), WORK( IPREPAD+1 ), SIZEQTQ, $ QTQNRM, INFO, RES ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSEPQTQ-WORK', SIZEQTQ, 1, $ WORK, SIZEQTQ, IPREPAD, IPOSTPAD, 4.3E+0 ) * IF( RES.NE.0 ) $ RESULT = 1 * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN ) THEN * * Set up MYIL if necessary * MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 140 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * IF( .NOT.LSAME( RANGE, 'V' ) .OR. $ ( MYIL.EQ.1 .OR. ( WIN( MYIL-1 ).LT.VL+NORMWIN*FIVE* $ THRESH*EPS ) ) ) THEN IF( .NOT.LSAME( RANGE, 'V' ) .OR. $ ( MYIL.EQ.N-M+1 .OR. ( WIN( MYIL+M ).GT.VU- $ NORMWIN*FIVE*THRESH*EPS ) ) ) THEN * * Make sure that the eigenvalues that we report are OK * DO 130 I = 1, M ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 130 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 140 CONTINUE * * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what PxSYEVX computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * * RETURN * 9999 FORMAT( 'PSSYEVX returned INFO=', I7 ) 9998 FORMAT( 'PSSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PSSEPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PSSYEVX returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PSSYEVX' ) 9981 FORMAT( 'NZ altered by PSSYEVX with JOBZ=N' ) * * End of PSSEPSUBTST * END scalapack-2.0.2/TESTING/EIG/psseptst.f000644 000766 000024 00000135232 11622500733 017473 0ustar00juliestaff000000 000000 SUBROUTINE PSSEPTST( DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, $ ORDER, ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, IWORK, LIWORK, HETERO, NOUT, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * .. Scalar Arguments .. CHARACTER HETERO, SUBTESTS, UPLO INTEGER INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, LWORK, $ MATTYPE, N, NOUT, ORDER REAL ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) REAL A( LDA, * ), COPYA( LDA, * ), GAP( * ), $ WIN( * ), WNEW( * ), WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PSSEPTST builds a random matrix, runs PSSYEVX and PSSYEV to * compute the eigenvalues * and eigenvectors and then performs two tests to * determine if the result * is good enough. The two tests are: * |AQ -QL| / (abstol + ulp * norm(A) ) * and * |QT * Q - I| / ulp * norm(A) * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) symmetric matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PSSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) REAL array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * The test matrix, which is then modified by PSSYEVX * * COPYA (local workspace) REAL array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * Z (local workspace) REAL array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PSSEPCHK and PSSEPQTQ * * W (local workspace) REAL array, dimension (N) * On normal exit from PSSYEVX, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PSLASIZESQP * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PSLASIZESQP * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DT_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL HALF, ONE, TEN, ZERO PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, $ TEN = 10.0E+0, HALF = 0.5E+0 ) REAL PADVAL PARAMETER ( PADVAL = 19.25E+0 ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IHETERO, IINFO, IL, IMODE, IN, $ INDD, INDWORK, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, ITYPE, IU, J, LLWORK, LSYEVXSIZE, $ MAXSIZE, MINSIZE, MYCOL, MYROW, NB, NGEN, NLOC, $ NNODES, NP, NPCOL, NPROW, NQ, RES, SIZECHK, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, $ SIZESUBTST, SIZESYEV, SIZESYEVX, SIZETMS, $ SIZETST, VALSIZE, VECSIZE,ISIZESYEVD, SIZESYEVD REAL ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH, SLARAN EXTERNAL LSAME, NUMROC, PSLAMCH, SLARAN * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, IGAMX2D, IGEBR2D, $ IGEBS2D, PSCHEKPAD, PSELSET, PSFILLPAD, $ PSLASET, PSLASIZESQP, PSLASIZESYEVX, PSLATMS, $ PSMATGEN, PSSEPSUBTST, PSSQPSUBTST, PSSYEV, $ SLABAD, SLASRT, SLATMS, SLCOMBINE * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, INT, MAX, MIN, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DT_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 PASSED = 'PASSED EVX' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * Distribute HETERO across processes * IF( IAM.EQ.0 ) THEN IF( LSAME( HETERO, 'Y' ) ) THEN IHETERO = 2 ELSE IHETERO = 1 END IF CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1, 0, 0 ) END IF IF( IHETERO.EQ.2 ) THEN HETERO = 'Y' ELSE HETERO = 'N' END IF * * Make sure that we have enough memory * CALL PSLASIZESQP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESYEV, $ SIZESYEVD, ISIZESYEVD, $ SIZESUBTST, ISIZESUBTST, SIZETST, ISIZETST ) * IF( LWORK.LT.SIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDWORK = INDD + N LLWORK = LWORK - INDWORK + 1 * ULP = PSLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PSLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL SLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / REAL( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log symmetric, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random symmetric * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * * Zero * * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N WORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PSLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N WORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PSLASET( 'All', N, N, ZERO, ONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0E+0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0E+0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * symmetric, eigenvalues specified * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * symmetric, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PSMATGEN( DESCA( CTXT_ ), 'S', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0E+0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0E+0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PSLASET( 'All', N, N, ZERO, ZERO, COPYA, 1, 1, DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( SLARAN( ISEED )*REAL( NLOC ) ), N-NGEN ) * CALL SLATMS( IN, IN, 'S', ISEED, 'P', WORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PSELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PSELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PSELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PSELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2, ... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 WORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * * CALL PSFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0E+0 ) * CALL PSLATMS( N, N, 'S', ISEED, 'S', WORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * * * WKNOWN ... NOT SET, GUESS A DEFAULT * WKNOWN = .TRUE. ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL SLASRT( 'I', N, WORK( INDD ), IINFO ) * * * These values aren't actually used, but they make ftncheck happy. * IL = -1 IU = -2 VL = ONE VU = -ONE * CALL PSLASIZESYEVX( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, WORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) * LSYEVXSIZE = MIN( MAXSIZE, LLWORK ) * CALL PSSEPSUBTST( WKNOWN, 'v', 'a', UPLO, N, VL, VU, IL, IU, $ THRESH, ABSTOL, A, COPYA, Z, 1, 1, DESCA, $ WORK( INDD ), WIN, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK( INDWORK ), LLWORK, $ LSYEVXSIZE, IWORK, ISIZESYEVX, RES, TSTNRM, $ QTQNRM, NOUT ) * * * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'V', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'V' RANGE = 'A' CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 1' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'V', RANGE = 'A', random memory * IF( INFO.EQ.0 ) THEN JOBZ = 'V' RANGE = 'A' CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 2' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * We use PSLASIZESYEVX to choose IL and IU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'I', maximum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PSLASIZESYEVX to choose IL and IU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = MAXSIZE * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF * * Subtest 6: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PSLASIZESYEVX to choose IL and IU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 6' INFO = 1 END IF END IF * * Subtest 7: JOBZ = 'V', RANGE = 'I', random memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PSLASIZESYEVX to choose IL and IU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) LSYEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 7' INFO = 1 END IF END IF * * Subtest 8: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 8' INFO = 1 END IF END IF * * Subtest 9: JOBZ = 'V', RANGE = 'V', maximum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = MAXSIZE * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 9' INFO = 1 END IF END IF * * Subtest 10: JOBZ = 'V', RANGE = 'V', * minimum memory required for eigenvectors * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest10' INFO = 1 END IF END IF * * Subtest 11: JOBZ = 'V', RANGE = 'V', * random memory (enough for all eigenvectors * but not enough to guarantee orthogonality * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VECSIZE + INT( SLARAN( ISEED )* $ REAL( MAXSIZE-VECSIZE ) ) * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest11' INFO = 1 END IF END IF * * Subtest 12: JOBZ = 'V', RANGE = 'V', * miniimum memory required for eigenvalues only * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest12' INFO = 1 END IF END IF * * Subtest 13: JOBZ = 'V', RANGE = 'V', * random memory (more than minimum required * for eigenvalues, less than required for vectors) * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PSLASIZESYEVX to choose VL and VU for us. * CALL PSLASIZESYEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LSYEVXSIZE = VALSIZE + INT( SLARAN( ISEED )* $ REAL( VECSIZE-VALSIZE ) ) * CALL PSSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, LSYEVXSIZE, $ IWORK, ISIZESYEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest13' INFO = 1 END IF END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF * * Now that PSSYEVX been tested, we check PSSYEV if we are a * homogeneous machine. * IF( LSAME( HETERO, 'N' ) .AND. LSAME( SUBTESTS, 'N' ) ) THEN PASSED = 'PASSED EV' * * PSSYEV test1: * JOBZ = 'N', eigenvalues only * IF( INFO.NE.0 ) THEN * * If the EVX tests fail, we do not perform the EV tests * PASSED = 'SKIPPED EV' ELSE JOBZ = 'N' * CALL PSSYEV( JOBZ, UPLO, N, A, 1, 1, DESCA, $ WORK( INDWORK ), Z, 1, 1, DESCA, $ WORK( INDWORK ), -1, INFO ) MINSIZE = INT( WORK( INDWORK ) ) * CALL PSSQPSUBTST( WKNOWN, JOBZ, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, 1, 1, DESCA, WIN, WNEW, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ MINSIZE, RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAIL EV test1' INFO = 1 END IF END IF * * PSSYEV test2: * JOBZ = 'V', eigenvalues and eigenvectors * IF( INFO.EQ.0 ) THEN JOBZ = 'V' * CALL PSSYEV( JOBZ, UPLO, N, A, 1, 1, DESCA, $ WORK( INDWORK ), Z, 1, 1, DESCA, $ WORK( INDWORK ), -1, INFO ) MINSIZE = INT( WORK( INDWORK ) ) * CALL PSSQPSUBTST( WKNOWN, JOBZ, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, 1, 1, DESCA, WIN, WNEW, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ MINSIZE, RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAIL EV test2' INFO = 1 END IF END IF IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, $ PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF END IF * * Now that PSSYEV been tested, we check PSSYEVD if we are a * homogeneous machine. * IF( LSAME( HETERO, 'N' ) .AND. LSAME( SUBTESTS, 'N' ) ) THEN PASSED = 'PASSED EVD' * * PSSYEVD test1: * IF( INFO.NE.0 ) THEN * * If the EV tests fail, we do not perform the EVD tests * PASSED = 'SKIPPED EVD' ELSE * NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) MINSIZE = MAX( 1+6*N+2*NP*NQ, $ 3*N + MAX( NB*( NP+1 ), 3*NB ) ) + 2*N * CALL PSSDPSUBTST( WKNOWN, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, 1, 1, DESCA, WIN, WNEW, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, $ MINSIZE, IWORK, ISIZESYEVD, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAIL EVD test1' INFO = 1 END IF END IF IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), TSTNRM, $ QTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, $ PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF END IF RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, $ F8.2, 1X, F8.2, 1X, G9.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 21X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 21X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 32X, $ 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) 9984 FORMAT( ' IBTYPE=', I8 ) 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) 9980 FORMAT( ' Increase TOTMEM in PSSEPDRIVER' ) * * End of PSSEPTST * END scalapack-2.0.2/TESTING/EIG/pssqpsubtst.f000644 000766 000024 00000040573 10363532303 020223 0ustar00juliestaff000000 000000 * * SUBROUTINE PSSQPSUBTST( WKNOWN, JOBZ, UPLO, N, THRESH, ABSTOL, A, $ COPYA, Z, IA, JA, DESCA, WIN, WNEW, $ IPREPAD, IPOSTPAD, WORK, LWORK, LWORK1, $ RESULT, TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, UPLO INTEGER IA, IPOSTPAD, IPREPAD, JA, LWORK, LWORK1, N, $ NOUT, RESULT REAL ABSTOL, QTQNRM, THRESH, TSTNRM * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), COPYA( * ), WIN( * ), WNEW( * ), $ WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSSQPSUBTST calls PSSYEV and then tests the output of * PSSYEV * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PSSYEV when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PSSQPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * THRESH (global input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. * * A (local workspace) REAL array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PSSYEV for a description of block cyclic layout. * The test matrix, which is then modified by PSSYEV * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) REAL array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) REAL array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PSSEPCHK and PSSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) REAL array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) REAL array, dimension (N) * The eigenvalues as computed by this call to PSSYEV. * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * WORK (local workspace) REAL array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PSSYEV * * RESULT (global output) INTEGER * The result of this call to PSSYEV * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) REAL * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) REAL * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DT_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL FIVE, NEGONE, PADVAL, ZERO PARAMETER ( PADVAL = 13.5285E+0, FIVE = 5.0E+0, $ NEGONE = -1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IAM, INFO, ISIZESUBTST, ISIZESYEVX, $ ISIZETST, J, EIGS, MINSIZE, MQ, MYCOL, MYROW, $ NP, NPCOL, NPROW, NQ, RESAQ, RESQTQ, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZESYEV, SIZESYEVX, $ SIZETMS, SIZETST, SIZESYEVD, ISIZESYEVD REAL EPS, EPSNORMA, ERROR, MAXERROR, MINERROR, $ NORMWIN, SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), ITMP( 2 ), $ IWORK( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH, PSLANSY EXTERNAL LSAME, NUMROC, PSLAMCH, PSLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, IGAMN2D, IGAMX2D, $ PSCHEKPAD, PSELSET, PSFILLPAD, PSLASIZESQP, $ PSSEPCHK, PSSEPQTQ, PSSYEV, SGAMN2D, SGAMX2D, $ SLACPY, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DT_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PSLASIZESQP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZESYEVX, ISIZESYEVX, SIZESYEV, $ SIZESYEVD, ISIZESYEVD, SIZESUBTST, ISIZESUBTST, $ SIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PSLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1+IPREPAD ) ), $ ABS( WIN( N+IPREPAD ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * DO 10 I = 1, LWORK1, 1 WORK( I+IPREPAD ) = 14.3E+0 10 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159E+0 30 CONTINUE * DO 40 I = 1, 2 IWORK( I ) = 0 40 CONTINUE * IF( LSAME( JOBZ, 'N' ) ) THEN EIGS = 0 ELSE EIGS = N END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( EIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * Find the amount of workspace needed with or without eigenvectors. * CALL PSLASIZESYEV( JOBZ, N, DESCA, MINSIZE ) * CALL SLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PSFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, PADVAL+1.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0E+0 ) * CALL PSFILLPAD( DESCA( CTXT_ ), LWORK1, 1, WORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0E+0 ) * * Make sure that PSSYEV does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, EIGS, 1 CALL PSELSET( Z( 1+IPREPAD ), I, J, DESCA, 13.0E+0 ) 50 CONTINUE 60 CONTINUE * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PSSYEV( JOBZ, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), LWORK1, INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEV-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, PADVAL ) * CALL PSCHEKPAD( DESCZ( CTXT_ ), 'PSSYEV-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL+1.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEV-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0E+0 ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSYEV-WORK', LWORK1, 1, $ WORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0E+0 ) * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )INFO IF( INFO.EQ.(N+1) ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 END IF ELSE IF( INFO.EQ.14 .AND. LWORK1.GE.MINSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * IF( RESULT.EQ.0 .OR. INFO.GT.N ) THEN * * Make sure that different processes return the same eigenvalues. * This is a more exhaustive check that provided by PSSYEV. * DO 70 I = 1, N WORK( I ) = WNEW( I+IPREPAD ) WORK( I+N ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', N, 1, WORK, N, 1, $ 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', N, 1, $ WORK( 1+N ), N, 1, 1, -1, -1, 0 ) * DO 80 I = 1, N * IF( ABS( WORK( I )-WORK( N+I ) ).GT.ZERO ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 GO TO 90 END IF 80 CONTINUE 90 CONTINUE END IF * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PSLANSY( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ WORK )*EPS END IF * * Note that a couple key variables get redefined in PSSEPCHK * as described by this table: * * PSSEPTST name PSSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |AQ - QE| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, WORK, SIZECHK, $ IPREPAD, IPOSTPAD, 4.3E+0 ) * RESAQ = 0 * CALL PSSEPCHK( N, N, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), WORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RESAQ ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSEPCHK-WORK', SIZECHK, 1, $ WORK, SIZECHK, IPREPAD, IPOSTPAD, 4.3E+0 ) * IF( RESAQ.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9993 ) END IF * * Perform the |QTQ - I| test * CALL PSFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1, WORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3E+0 ) * RESQTQ = 0 * CALL PSSEPQTQ( N, N, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1 ), IWORK( 1 ), WORK( 1 ), $ WORK( IPREPAD+1 ), SIZEQTQ, QTQNRM, INFO, $ RESQTQ ) * CALL PSCHEKPAD( DESCA( CTXT_ ), 'PSSEPQTQ-WORK', SIZEQTQ, 1, $ WORK, SIZEQTQ, IPREPAD, IPOSTPAD, 4.3E+0 ) * IF( RESQTQ.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9992 ) END IF * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN .AND. N.GT.0 ) THEN * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN MAXERROR = 0 * DO 140 I = 1, N ERROR = ABS( WIN( I+IPREPAD )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 140 CONTINUE MINERROR = MIN( MAXERROR, MINERROR ) * IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * * RETURN * 9999 FORMAT( 'PSSYEV returned INFO=', I7 ) 9998 FORMAT( 'PSSEPQTQ in PSSQPSUBTST returned INFO=', I7 ) 9997 FORMAT( 'PSSQPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PSSYEV returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'Different processes return different eigenvalues' ) 9994 FORMAT( 'Heterogeneity detected by PSSYEV' ) 9993 FORMAT( 'PSSYEV failed the |AQ -QE| test' ) 9992 FORMAT( 'PSSYEV failed the |QTQ -I| test' ) * * End of PSSQPSUBTST * END scalapack-2.0.2/TESTING/EIG/pssvdchk.f000644 000766 000024 00000032612 10363532303 017430 0ustar00juliestaff000000 000000 SUBROUTINE PSSVDCHK( M, N, A, IA, JA, DESCA, U, IU, JU, DESCU, VT, $ IVT, JVT, DESCVT, S, THRESH, WORK, LWORK, $ RESULT, CHK, MTM ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IU, IVT, JA, JU, JVT, LWORK, M, N REAL CHK, MTM, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCU( * ), DESCVT( * ), $ RESULT( * ) REAL A( * ), S( * ), U( * ), VT( * ), WORK( * ) * .. * * Purpose * ======= * * For given two-dimensional matrices A, U, VT, and one-dimensional * array D compute the following four tests: * * (1) | A - U*diag(S) VT | / ( |A| max(M,N) ulp ) * * (2) | I - U'*U | / ( M ulp ) * * (3) | I - VT*VT' | / ( N ulp ), * * (4) S contains SIZE = MIN( M, N ) nonnegative values in * decreasing order. * It then compares result of computations (1)-(3) * with TRESH and returns results of comparisons and test (4) in * RESULT(I). When the i-th test fails, value of RESULT( I ) is set * to 1. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MP = number of local rows in A and U * NQ = number of local columns in A and VT * SIZEP = number of local rows in VT * SIZEQ = number of local columns in U * * M (global input) INTEGER * Matrix size. * The number of global rows in A and U and * * N (global input) INTEGER * The number of global columns in A and VT. * * A (input) block cyclic distributed REAL array, * global dimension (M, N), local dimension (DESCA( DLEN_ ), NQ) * Contains the original test matrix. * * IA (global input) INTEGER * The global row index of the submatrix of the distributed * matrix A to operate on. * * JA (global input) INTEGER * The global column index of the submatrix of the distributed * matrix A to operate on. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix A. * * U (local input) REAL array * global dimension (M, SIZE), local dimension * (DESCU( DLEN_ ), SIZEQ) * Contains left singular vectors of matrix A. * * IU (global input) INTEGER * The global row index of the submatrix of the distributed * matrix U to operate on. * * JU (global input) INTEGER * The global column index of the submatrix of the distributed * matrix U to operate on. * * DESCU (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix U. * * VT (local input) REAL array * global dimension (SIZE, N), local dimension * (DESCVT( DLEN_ ), NQ) * Contains right singular vectors of matrix A. * * IVT (global input) INTEGER * The global row index of the submatrix of the distributed * matrix VT to operate on. * * JVT (global input) INTEGER * The global column index of the submatrix of the distributed * matrix VT to operate on. * * DESCVT (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix VT. * * S (global input) REAL array, dimension (SIZE) * Contains the computed singular values * * THRESH (input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= 1 + SIZEQ*SIZEP + MAX[WORK(pdlange(size,size)), * WORK(pdlange(m,n))], * where * SIZEQ = NUMROC( SIZE, DESCU( NB_ ), MYCOL, 0, NPCOL ), * SIZEP = NUMROC( SIZE, DESCVT( MB_ ), MYROW, 0, NPROW ), * and worekspaces required to call pdlange are * WORK(pdlange(size,size)) < MAX(SIZEQ0,2) < SIZEB +2, * WORK(pdlange(m,n)) < MAX(NQ0,2) < SIZEB +2, * SIZEB = MAX(M, N) * Finally, upper limit on required workspace is * LWORK > 1 + SIZEQ*SIZEP + SIZEB + 2 * * RESULT (global input/output) INTEGER array. Four first elements of * the array are set to 0 or 1 depending on passing four * respective tests ( see above in Purpose ). The elements of * RESULT are set to * 0 if the test passes i.e. * | A - U*diag(S)*VT | / ( |A| max(M,N) ulp ) <= THRESH * 1 if the test fails i.e. * | A - U*diag(S)*VT | / ( |A| max(M,N) ulp ) > THRESH * * CHK (global output) REAL * value of the | A - U*diag(S) VT | / ( |A| max(M,N) ulp ) * * MTM (global output) REAL * maximum of the two values: * | I - U'*U | / ( M ulp ) and | I - VT*VT' | / ( N ulp ) * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE, MONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, MONE = -1.0E0 ) * .. * .. Local Scalars .. INTEGER I, INFO, LDR, LOCALCOL, LWMIN, MP, MX, MYCOL, $ MYROW, NPCOL, NPROW, NQ, PCOL, PTRR, PTRWORK, $ SIZE, SIZEP, SIZEPOS, SIZEQ REAL FIRST, NORMA, NORMAI, NORMU, NORMVT, SECOND, $ THRESHA, ULP * .. * .. Local Arrays .. INTEGER DESCR( DLEN_ ) * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC REAL PSLAMCH, PSLANGE EXTERNAL INDXG2L, INDXG2P, NUMROC, PSLAMCH, PSLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCINIT, SSCAL, $ PSELSET, PSGEMM, PSLASET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 SIZE = MIN( M, N ) * * Sizepos is a number of parameters to pdsvdchk plus one. It's used * for the error reporting. * SIZEPOS = 22 IF( NPROW.EQ.-1 ) THEN INFO = -607 ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( M, 1, SIZE, SIZEPOS, IU, JU, DESCU, 10, INFO ) CALL CHK1MAT( SIZE, SIZEPOS, N, 2, IVT, JVT, DESCVT, 14, INFO ) END IF * IF( INFO.EQ.0 ) THEN * * Calculate workspace * MP = NUMROC( M, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) SIZEP = NUMROC( SIZE, DESCVT( MB_ ), MYROW, 0, NPROW ) SIZEQ = NUMROC( SIZE, DESCU( NB_ ), MYCOL, 0, NPCOL ) MX = MAX( SIZEQ, NQ ) LWMIN = 2 + SIZEQ*SIZEP + MAX( 2, MX ) WORK( 1 ) = LWMIN IF( LWORK.EQ.-1 ) $ GO TO 40 IF( LWORK.LT.LWMIN ) THEN INFO = -18 ELSE IF( THRESH.LE.0 ) THEN INFO = -16 END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PSSVDCHK', -INFO ) RETURN END IF * LDR = MAX( 1, SIZEP ) ULP = PSLAMCH( DESCA( CTXT_ ), 'P' ) NORMAI = PSLANGE( '1', M, N, A, 1, 1, DESCA, WORK ) * * Allocate array R of global dimension SIZE x SIZE for testing * PTRR = 2 PTRWORK = PTRR + SIZEQ*SIZEP * CALL DESCINIT( DESCR, SIZE, SIZE, DESCVT( MB_ ), DESCU( NB_ ), 0, $ 0, DESCA( CTXT_ ), LDR, INFO ) * * Test 2. Form identity matrix R and make check norm(U'*U - I ) * CALL PSLASET( 'Full', SIZE, SIZE, ZERO, ONE, WORK( PTRR ), 1, 1, $ DESCR ) CALL PSGEMM( 'T', 'N', SIZE, SIZE, M, ONE, U, 1, 1, DESCU, U, 1, $ 1, DESCU, MONE, WORK( PTRR ), 1, 1, DESCR ) * NORMU = PSLANGE( '1', SIZE, SIZE, WORK( PTRR ), 1, 1, DESCR, $ WORK( PTRWORK ) ) * NORMU = NORMU / ULP / SIZE / THRESH IF( NORMU.GT.1. ) $ RESULT( 2 ) = 1 * * Test3. Form identity matrix R and check norm(VT*VT' - I ) * CALL PSLASET( 'Full', SIZE, SIZE, ZERO, ONE, WORK( PTRR ), 1, 1, $ DESCR ) CALL PSGEMM( 'N', 'T', SIZE, SIZE, N, ONE, VT, 1, 1, DESCVT, VT, $ 1, 1, DESCVT, MONE, WORK( PTRR ), 1, 1, DESCR ) NORMVT = PSLANGE( '1', SIZE, SIZE, WORK( PTRR ), 1, 1, DESCR, $ WORK( PTRWORK ) ) * NORMVT = NORMVT / ULP / SIZE / THRESH IF( NORMVT.GT.1. ) $ RESULT( 3 ) = 1 * MTM = MAX( NORMVT, NORMU )*THRESH * * Test 1. * Initialize R = diag( S ) * CALL PSLASET( 'Full', SIZE, SIZE, ZERO, ZERO, WORK( PTRR ), 1, 1, $ DESCR ) * DO 10 I = 1, SIZE CALL PSELSET( WORK( PTRR ), I, I, DESCR, S( I ) ) 10 CONTINUE * * Calculate U = U*R * DO 20 I = 1, SIZE PCOL = INDXG2P( I, DESCU( NB_ ), 0, 0, NPCOL ) LOCALCOL = INDXG2L( I, DESCU( NB_ ), 0, 0, NPCOL ) IF( MYCOL.EQ.PCOL ) THEN CALL SSCAL( MP, S( I ), U( ( LOCALCOL-1 )*DESCU( LLD_ )+1 ), $ 1 ) END IF 20 CONTINUE * * Calculate A = U*VT - A * CALL PSGEMM( 'N', 'N', M, N, SIZE, ONE, U, 1, 1, DESCU, VT, 1, 1, $ DESCVT, MONE, A, 1, 1, DESCA ) * NORMA = PSLANGE( '1', M, N, A, 1, 1, DESCA, WORK( PTRWORK ) ) THRESHA = NORMAI*MAX( M, N )*ULP*THRESH * IF( NORMA.GT.THRESHA ) $ RESULT( 1 ) = 1 * IF( THRESHA.EQ.0 ) THEN CHK = 0.0E0 ELSE CHK = NORMA / THRESHA*THRESH END IF * * Test 4. * DO 30 I = 1, SIZE - 1 FIRST = S( I ) SECOND = S( I+1 ) IF( FIRST.LT.SECOND ) $ RESULT( 4 ) = 1 30 CONTINUE 40 CONTINUE RETURN END scalapack-2.0.2/TESTING/EIG/pssvdcmp.f000644 000766 000024 00000026743 10363532303 017452 0ustar00juliestaff000000 000000 SUBROUTINE PSSVDCMP( M, N, JOBTYPE, S, SC, U, UC, IU, JU, DESCU, $ VT, VTC, IVT, JVT, DESCVT, THRESH, RESULT, $ DELTA, WORK, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IU, IVT, JOBTYPE, JU, JVT, LWORK, M, N REAL DELTA, THRESH * .. * .. Array Arguments .. INTEGER DESCU( * ), DESCVT( * ), RESULT( * ) REAL S( * ), SC( * ), U( * ), UC( * ), VT( * ), $ VTC( * ), WORK( * ) * .. * * Purpose * ======== * Testing how accurately "full" and "partial" decomposition options * provided by PSGESVD correspond to each other. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========== * * M (global input) INTEGER * Number of rows of the distributed matrix, for which * SVD was calculated * * N (global input) INTEGER * Number of columns of the distributed matrix, for which * SVD was calculated * * JOBTYPE (global input) INTEGER * Depending on the value of this parameter, * the following comparisons are performed: * * JOBTYPE | COMPARISON * ------------------------------------------- * 2 | | U - UC | / ( M ulp ) > THRESH, * 3 | | VT - VTC | / ( N ulp ) > THRESH * * In addition, for JOBTYPE = 2:4 comparison * | S1 - S2 | / ( SIZE ulp |S| ) > THRESH * is performed. Positive result of any of the comparisons * typically indicates erroneous computations and sets * to one corresponding element of array RESULT * * S (global input) REAL array of singular values * calculated for JOBTYPE equal to 1 * * SC (global input) REAL array of singular values * calculated for JOBTYPE nonequal to 1 * * U (local input) REAL array of left singular * vectors calculated for JOBTYPE equal to 1, local * dimension (MP, SIZEQ), global dimension (M, SIZE) * * UC (local input) REAL array of left singular * vectors calculated for JOBTYPE non equal to 1, local * dimension (MP, SIZEQ), global dimension (M, SIZE) * * IU (global input) INTEGER * The row index in the global array U indicating the first * row of sub( U ). * * JU (global input) INTEGER * The column index in the global array U indicating the * first column of sub( U ). * * DESCU (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix U and UC * * V (local input) REAL array of right singular * vectors calculated for JOBTYPE equal to 1, local * dimension (SIZEP, NQ), global dimension (SIZE, N) * * VC (local input) REAL array of right singular * vectors calculated for JOBTYPE non equal to 1, local * dimension (SIZEP, NQ), global dimension (SIZE, N) * * IVT (global input) INTEGER * The row index in the global array VT indicating the first * row of sub( VT ). * * JVT (global input) INTEGER * The column index in the global array VT indicating the * first column of sub( VT ). * * DESCVT (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix VT and * VTC * * THRESH (global input) REAL * The threshold value for the test ratios. A result is * included in the output file if RESULT >= THRESH. The test * ratios are scaled to be O(1), so THRESH should be a small * multiple of 1, e.g., 10 or 100. To have every test ratio * printed, use THRESH = 0. * * RESULT (global input/output) INTEGER array. * Every nonzero entry corresponds to erroneous computation. * * DELTA (global output) REAL * maximum of the available of the following three values * | U - UC | / ( M ulp THRESH ), * | VT - VT | / ( N ulp THRESH ), * | S1 - S2 | / ( SIZE ulp |S| THRESH ) * * WORK (local workspace/output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * ====================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COLPTR, I, INFO, J, LWMIN, MYCOL, MYROW, NPCOL, $ NPROW, NQ, RESULTS, SIZE, SIZEPOS, SIZEQ REAL ACCUR, CMP, NORMDIFS, NORMDIFU, NORMDIFV, $ NORMS, ULP * .. * .. External Functions .. INTEGER NUMROC REAL SLANGE, PSLAMCH, PSLANGE EXTERNAL NUMROC, SLANGE, PSLAMCH, PSLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*DLEN_*DTYPE_*MB_*M_*N_*RSRC_.LT.0 ) $ RETURN * RESULTS = 0 NORMDIFS = 0 NORMDIFU = 0 NORMDIFV = 0 SIZE = MIN( M, N ) * * Sizepos is a number of parameters to pdsvdcmp plus one. It's used * for the error reporting. * SIZEPOS = 17 INFO = 0 CALL BLACS_GRIDINFO( DESCU( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IF( NPROW.EQ.-1 ) THEN INFO = -607 ELSE CALL CHK1MAT( M, 1, SIZE, SIZEPOS, 1, 1, DESCU, 8, INFO ) CALL CHK1MAT( SIZE, SIZEPOS, N, 2, 1, 1, DESCVT, 11, INFO ) END IF * IF( INFO.EQ.0 ) THEN * * Calculate workspace. * SIZEQ = NUMROC( SIZE, DESCU( NB_ ), MYCOL, 0, NPCOL ) NQ = NUMROC( N, DESCVT( NB_ ), MYCOL, 0, NPCOL ) LWMIN = MAX( SIZEQ, NQ ) + 4 WORK( 1 ) = LWMIN IF( LWORK.EQ.-1 ) $ GO TO 60 IF( LWORK.LT.LWMIN ) THEN INFO = -16 ELSE IF( THRESH.LE.0 ) THEN INFO = -12 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCU( CTXT_ ), 'PSSVDCMP', -INFO ) RETURN END IF * ULP = PSLAMCH( DESCU( CTXT_ ), 'P' ) * * Make comparison of singular values. * NORMS = SLANGE( '1', SIZE, 1, S, SIZE, WORK ) DO 10 I = 1, SIZE SC( I ) = S( I ) - SC( I ) 10 CONTINUE * NORMDIFS = SLANGE( '1', SIZE, 1, SC, SIZE, WORK ) ACCUR = ULP*SIZE*NORMS*THRESH * IF( NORMDIFS.GT.ACCUR ) $ RESULTS = 1 IF( NORMDIFS.EQ.0 .AND. ACCUR.EQ.0 ) THEN NORMDIFS = 0 ELSE NORMDIFS = NORMDIFS / ACCUR END IF * IF( JOBTYPE.EQ.2 ) THEN * RESULT( 5 ) = RESULTS ACCUR = ULP*M*THRESH DO 30 J = 1, SIZEQ COLPTR = DESCU( LLD_ )*( J-1 ) DO 20 I = 1, DESCU( LLD_ ) UC( I+COLPTR ) = U( I+COLPTR ) - UC( I+COLPTR ) 20 CONTINUE 30 CONTINUE * NORMDIFU = PSLANGE( '1', M, SIZE, UC, IU, JU, DESCU, WORK ) * IF( NORMDIFU.GE.ACCUR ) $ RESULT( 6 ) = 1 IF( NORMDIFU.EQ.0 .AND. ACCUR.EQ.0 ) THEN NORMDIFU = 0 ELSE NORMDIFU = NORMDIFU / ACCUR END IF * ELSE IF( JOBTYPE.EQ.3 ) THEN * RESULT( 7 ) = RESULTS ACCUR = ULP*N*THRESH DO 50 J = 1, NQ COLPTR = DESCVT( LLD_ )*( J-1 ) DO 40 I = 1, DESCVT( LLD_ ) VTC( I+COLPTR ) = VT( I+COLPTR ) - VTC( I+COLPTR ) 40 CONTINUE 50 CONTINUE * NORMDIFV = PSLANGE( '1', SIZE, N, VTC, IVT, JVT, DESCVT, WORK ) * IF( NORMDIFV.GE.ACCUR ) $ RESULT( 8 ) = 1 * IF( NORMDIFV.EQ.0 .AND. ACCUR.EQ.0 ) THEN NORMDIFV = 0 ELSE NORMDIFV = NORMDIFV / ACCUR END IF * ELSE IF( JOBTYPE.EQ.4 ) THEN * RESULT( 9 ) = RESULTS * END IF * CMP = MAX( NORMDIFV, NORMDIFU ) DELTA = MAX( CMP, NORMDIFS ) * 60 CONTINUE * * End of PSSVDCMP * RETURN END scalapack-2.0.2/TESTING/EIG/pssvddriver.f000644 000766 000024 00000026010 10363532303 020151 0ustar00juliestaff000000 000000 PROGRAM PSSVDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Purpose * ======== * * Parallel Real singular value decomposition test driver. * * INPUT: * ===== * This routine tests PDGESVD, the parallel singular value * decomposition solver. We would like to cover possible combinations * of: matrix size, process configuration (nprow and npcol), block * size (nb), matrix type, and workspace available. * * Current format of the input file SVD.dat lists the following: * device out * Threshold * number of matrices * number of rows for every matrix * number of columns for every matrix * number of process configurations (P, Q, NB) * values of P (NPROW) for every configuration * values of Q (NPCOL) for every configuration * values of NB for every configuration. * Here threshold is an integer constant with a value between 1 and * 100, which meaning is explained in comments to PSSVDTST. * * WHAT IT DOES: * ============ * PSVDDRIVER checks floating-point arithmetic and parameters * provided by the user in initialization file SVD.dat. It reads and * broadcasts to all process parameters required to run actual testing * code PSVDTST. In case all tests are successful it tells you so. For * the actual "meat" of the tests see comments to PSVDTST. * *======================================================================= * * .. Local Scalars .. CHARACTER*80 SUMMARY INTEGER CONTEXT, ERR, I, IAM, J, K, LWORK, MAXNODES, $ NMATSIZES, NOUT, NPCONFIGS, NPROCS REAL THRESH * .. * .. Parameters .. INTEGER MAXSETSIZE, NIN, DBLSIZ, TOTMEM, MEMSIZ PARAMETER ( MAXSETSIZE = 50, NIN = 11, DBLSIZ = 8, $ TOTMEM = 2000000, MEMSIZ = TOTMEM / DBLSIZ ) * .. * .. Local Arrays .. INTEGER ISEED( 4 ), MM( MAXSETSIZE ), $ NBS( MAXSETSIZE ), NN( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ), $ RESULT( 9 ) REAL WORK( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ SGEBR2D, SGEBS2D, IGEBR2D, IGEBS2D, PSSVDTST * .. * .. Executable Statements .. * * Get starting information. * CALL BLACS_PINFO( IAM, NPROCS ) * * Open file and skip data header; read output device. * IF( IAM.EQ.0 ) THEN OPEN( UNIT = NIN, FILE = 'SVD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMARY READ( NIN, FMT = * )NOUT READ( NIN, FMT = * )MAXNODES END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * * Initialize variables, arrays, and grids. * ERR = 0 NMATSIZES = 0 NPCONFIGS = 0 LWORK = MEMSIZ ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) WRITE( NOUT, FMT = 9986 ) WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = 9974 ) WRITE( NOUT, FMT = 9973 ) WRITE( NOUT, FMT = 9972 ) WRITE( NOUT, FMT = 9971 ) WRITE( NOUT, FMT = 9970 ) WRITE( NOUT, FMT = 9969 ) WRITE( NOUT, FMT = 9968 ) WRITE( NOUT, FMT = 9967 ) WRITE( NOUT, FMT = 9966 ) WRITE( NOUT, FMT = 9965 ) END IF * * Process 0 reads values in input file and broadcasts them to * all other processes. * 10 CONTINUE IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )SUMMARY READ( NIN, FMT = * )SUMMARY READ( NIN, FMT = * )THRESH WRITE( NOUT, FMT = 9965 )SUMMARY CALL SGEBS2D( CONTEXT, 'All', ' ', 1, 1, THRESH, 1 ) ELSE CALL SGEBR2D( CONTEXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) END IF IF( THRESH.EQ.-1 ) THEN GO TO 80 END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NMATSIZES CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NMATSIZES, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NMATSIZES, 1, 0, 0 ) END IF * Deal with error IF( NMATSIZES.LT.1 .OR. NMATSIZES.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'Matrix size', NMATSIZES, 1, $ MAXSETSIZE END IF ERR = -1 GO TO 80 END IF * * Read array of MATSIZES. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( MM( I ), I = 1, NMATSIZES ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NMATSIZES, MM, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NMATSIZES, MM, 1, 0, 0 ) END IF * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NN( I ), I = 1, NMATSIZES ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NMATSIZES, NN, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NMATSIZES, NN, 1, 0, 0 ) END IF * * Read and broadcast NPCONFIGS. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )NPCONFIGS CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, NPCONFIGS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, NPCONFIGS, 1, 0, 0 ) END IF * Deal with error IF( NPCONFIGS.LT.1 .OR. NPCONFIGS.GT.MAXSETSIZE ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'# proc configs', NPCONFIGS, 1, $ MAXSETSIZE END IF ERR = -1 GO TO 80 END IF * * Read and broadcast array of NPROWS. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NPROWS( I ), I = 1, NPCONFIGS ) * CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPROWS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPROWS, 1, 0, $ 0 ) END IF * Deal with error DO 20 I = 1, NPCONFIGS IF( NPROWS( I ).LE.0 ) $ ERR = -1 20 CONTINUE IF( ERR.EQ.-1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 )' NPROW' END IF GO TO 80 END IF * * Read and broadcast array of NPCOLS. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NPCOLS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPCOLS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NPCOLS, 1, 0, $ 0 ) END IF * * Deal with error. * DO 30 I = 1, NPCONFIGS IF( NPCOLS( I ).LE.0 ) $ ERR = -1 30 CONTINUE IF( ERR.EQ.-1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 )' NPCOL' END IF GO TO 80 END IF * * Read and broadcast array of NBs. * IF( IAM.EQ.0 ) THEN READ( NIN, FMT = * )( NBS( I ), I = 1, NPCONFIGS ) CALL IGEBS2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NBS, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, NPCONFIGS, NBS, 1, 0, 0 ) END IF * Deal with error DO 40 I = 1, NPCONFIGS IF( NBS( I ).LE.0 ) $ ERR = -1 40 CONTINUE IF( ERR.EQ.-1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 )' NB' END IF GO TO 80 END IF * DO 70 J = 1, NMATSIZES DO 60 I = 1, NPCONFIGS * DO 50 K = 1, 9 RESULT( K ) = 0 50 CONTINUE CALL PSSVDTST( MM( J ), NN( J ), NPROWS( I ), NPCOLS( I ), $ NBS( I ), ISEED, THRESH, WORK, RESULT, LWORK, $ NOUT ) * 60 CONTINUE 70 CONTINUE * GO TO 10 * 80 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * End of PSSVDDRIVER * 9999 FORMAT( A20, ' is:', I5, ' must be between:', I5, ' and', I5 ) 9998 FORMAT( A ) 9997 FORMAT( A20, ' must be positive' ) 9996 FORMAT( A ) 9995 FORMAT( 'M = ', I5, ' N = ', I5, ' NPOW = ', I5, 'NPCOL = ', I5, $ ' NB = ', I5 ) * 9994 FORMAT( 'Test #', I5, 'for this configuration has failed' ) 9993 FORMAT( 'All test passed for this configuration' ) 9992 FORMAT( ' ' ) 9991 FORMAT( 'Running tests of the parallel singular value ', $ 'decomposition routine: PSGESVD' ) 9990 FORMAT( 'The following scaled residual checks will be', $ 'computed:' ) 9989 FORMAT( ' || A - U*diag(S)*VT ||/( ||A||*max(M,N)*ulp )' ) 9988 FORMAT( ' || I - UT*U ||/( M*ulp )' ) 9987 FORMAT( ' || I - VT*V ||/( N*ulp )' ) 9986 FORMAT( ' ' ) 9985 FORMAT( 'An explanation of the input/output parameters', $ ' follows:' ) 9984 FORMAT( 'RESULT : passed; or an indication of which', $ ' jobtype test failed' ) 9983 FORMAT( 'M : The number of rows of the matrix A.' ) 9982 FORMAT( 'N : The number of columns of the matrix A.' ) 9981 FORMAT( 'P : The number of process rows.' ) 9980 FORMAT( 'Q : The number of process columns.' ) 9979 FORMAT( 'NB : The size of the square blocks the', $ ' matrix A is split into.' ) 9978 FORMAT( 'THRESH : If a residual value is less than ', $ ' THRESH, RESULT is flagged as PASSED.' ) 9977 FORMAT( 'MTYPE : matrix type (see pssvdtst.f).' ) 9976 FORMAT( 'CHK : || A - U*diag(S)*VT ||/( ||A||', $ '*max(M,N)*ulp )' ) 9975 FORMAT( 'MTM : maximum of two values:',/, $ ' || I - UT*U ||/( M*ulp ) and', $ ' || I - VT*V ||/( N*ulp )' ) 9974 FORMAT( 'DELTA : maximum of three values:',/, $ ' || U - UC ||/( M*ulp*THRESH ),' ) 9973 FORMAT( ' || VT - VTC ||/( N*ulp*THRESH ), and' ) 9972 FORMAT( ' || S - SC || / ( SIZE*ulp*|S|*THRESH ), ' ) 9971 FORMAT( ' where UC, VTC, SC are singular vectors ', $ 'and values' ) 9970 FORMAT( ' for JOBTYPE.NE.1 (see pdsvdcmp.f) ' ) 9969 FORMAT( 'HET : P if heterogeneity was detected by PDGESVD' ) 9968 FORMAT( ' T if detected by the PDSVSTST, N if', $ ' undetected' ) 9967 FORMAT( ' ' ) 9966 FORMAT( 'RESULT WALL CPU M N P Q', $ ' NB MTYPE CHK MTM DELTA HET' ) 9965 FORMAT( A ) END scalapack-2.0.2/TESTING/EIG/pssvdtst.f000644 000766 000024 00000056102 11645634736 017516 0ustar00juliestaff000000 000000 SUBROUTINE PSSVDTST( M, N, NPROW, NPCOL, NB, ISEED, THRESH, WORK, $ RESULT, LWORK, NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER LWORK, M, N, NB, NOUT, NPCOL, NPROW REAL THRESH * .. * .. Array Arguments .. INTEGER ISEED( 4 ), RESULT( 9 ) REAL WORK( * ) * .. * * Purpose * ======= * * PSSVDTST checks the singular value decomposition (SVD) routine * PSGESVD. PSGESVD factors A = U diag(S) VT, where U and VT are * orthogonal and diag(S) is diagonal with the entries of the array * S on its diagonal. The entries of S are the singular values, stored * in decreasing order. U and VT can be optionally not computed, * computed and overwritten on A, or computed partially. * * A is M by N. Let SIZE = min( M, N ). S has dimension SIZE by SIZE. * U is M by SIZE and VT is SIZE by N. PDGESVD optionally calculates * U and VT, depending on the values of its parameters JOBU and JOBVT. * There are four possible combinations of "job" parameters for a call * to PDGESVD, that correspond to four values of internal index JOBTYPE. * The table below shows the mapping between "job" parameters of * PDGESVD and respective values of the index JOBTYPE together * with matrices computed for each type of the job. * * * | JOBU = 'V' | JOBU = 'N' * ---------- ------------------------------------------- * JOBVT = 'V'| JOBTYPE = 1 | JOBTYPE = 3 * | U1, S1, VT1 | S3, VT3 * ---------- ------------------------------------------ * JOBVT = 'N'| JOBTYPE = 2 | JOBTYPE = 4 * | U2, S2 | S4 * * * When PSSVDTST is called, a number of matrix "types" are specified. * For each type of matrix, and for the minimal workspace as well as * for larger than minimal workspace an M x N matrix "A" with known * singular values is generated and used to test the SVD routines. * For each matrix, A will be factored as A = U diag(S) VT and the * following 9 tests computed: * * (1) | A - U1 diag(S1) VT1 | / ( |A| max(M,N) ulp ) * * (2) | I - U1'U1 | / ( M ulp ) * * (3) | I - VT1 VT1' | / ( N ulp ), * * (4) S1 contains SIZE nonnegative values in decreasing order. * (Return 0 if true, 1/ULP if false.) * * (5) | S1 - S2 | / ( SIZE ulp |S| ) * * (6) | U1 - U2 | / ( M ulp ) * * (7) | S1 - S3 | / ( SIZE ulp |S| ) * * (8) | VT1 - VT3 | / ( N ulp ) * * (9) | S1 - S4 | / ( SIZE ulp |S| ) * * Currently, the list of possible matrix types is: * * (1) The zero matrix. * * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP. * (ULP = (first number larger than 1) - 1 ) * * (4) A matrix of the form U D VT, where U, VT are orthogonal and * D has evenly spaced entries 1, ..., ULP. * * (5) Same as (4), but multiplied by SQRT( overflow threshold ) * * (6) Same as (4), but multiplied by SQRT( underflow threshold ) * * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========== * * M (global input) INTEGER dimension * The value of the matrix row dimension. * * N (global input) INTEGER dimension * The value of the matrix column dimension. * * NPROW (global input) INTEGER * Number of process rows * * NPCOL (global input) INTEGER * Number of process columns * * NB (global input) INTEGER * The block size of the matrix A. NB >=1. * * ISEED (global input/local output) INTEGER array, dimension (4) * On entry, the seed of the random number generator. The array * elements should be between 0 and 4095; if not they will be * reduced mod 4096. Also, ISEED(4) must be odd. * On exit, ISEED is changed and can be used in the next call to * SDRVBD to continue the same random number sequence. * * THRESH (global input) REAL * The threshold value for the test ratios. A result is * included in the output file if RESULT >= THRESH. The test * ratios are scaled to be O(1), so THRESH should be a small * multiple of 1, e.g., 10 or 100. To have every test ratio * printed, use THRESH = 0. * * RESULT (global input/output) INTEGER array of dimension 9. Initially * RESULT( I ) = 0. On the output, RESULT ( I ) = 1 if test I * ( see above ) wasn't passed. * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER * Dimension of the array WORK. It is defined as follows * LWORK = 1 + 2*LDA*NQ + 3*SIZE + * MAX(WPSLAGGE, LDU*SIZEQ + LDVT*NQ + MAX(LDU*SIZEQ, LDVT*NQ) * + WPSGESVD + MAX( WPSSVDCHK, WPSSVDCMP)), * where WPSLAGGE, WPSGESVD, WPSSVDCHK, WPSSVDCMP are amounts * of workspace required respectively by PSLAGGE, PSGESVD, * PSSVDCHK, PSSVDCMP. * Here * LDA = NUMROC( M, NB, MYROW, 0, NPROW ), LDU = LDA, * LDVT = NUMROC( SIZE, NB, MYROW, 0, NPROW ), * NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ), * SIZEQ = NUMROC( SIZE, NB, MYCOL, 0, NPCOL ). * Values of the variables WPSLAGGE, WPSGESVD, WPSSVDCHK, * WPSSVDCMP are found by "dummy" calls to * the respective routines. In every "dummy" call, variable * LWORK is set to -1, thus causing respective routine * immediately return required workspace in WORK(1) without * executing any calculations * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_, NTYPES PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, NTYPES = 6 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) * .. * .. Local Scalars .. CHARACTER HETERO, JOBU, JOBVT INTEGER CONTEXT, DINFO, I, IA, IAM, INFO, ITYPE, IU, $ IVT, JA, JOBTYPE, JU, JVT, LDA, LDU, LDVT, $ LLWORK, LWMIN, MYCOL, MYROW, NNODES, NQ, PASS, $ PTRA, PTRAC, PTRD, PTRS, PTRSC, PTRU, PTRUC, $ PTRVT, PTRVTC, PTRWORK, SETHET, SIZE, SIZEQ, $ WPSGESVD, WPSLAGGE, WPSSVDCHK, WPSSVDCMP REAL CHK, DELTA, H, MTM, OVFL, RTOVFL, RTUNFL, ULP, $ UNFL * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ BLACS_SET, $ DESCINIT, SGAMN2D, SGAMX2D, SLABAD, SSCAL, $ IGAMN2D, IGAMX2D, IGEBR2D, IGEBS2D, PSELSET, $ PSGESVD, PSLACPY, PSLAGGE, PSLASET, PSSVDCHK, $ PSSVDCMP, PXERBLA, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER NUMROC REAL PSLAMCH EXTERNAL NUMROC, PSLAMCH * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCU( DLEN_ ), $ DESCVT( DLEN_ ), ITMP( 2 ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, MAX, MIN, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*DTYPE_*LLD_*MB_*M_*NB_*N_*RSRC_.LT.0 ) $ RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * If this process is not a part of the contex, bail out now. * IF( ( MYROW.GE.NPROW ) .OR. ( MYROW.LT.0 ) .OR. $ ( MYCOL.GE.NPCOL ) .OR. ( MYCOL.LT.0 ) )GO TO 110 CALL BLACS_SET( CONTEXT, 15, 1 ) INFO = 0 * * Check input parameters. * IF( M.LE.0 ) THEN INFO = -1 ELSE IF( N.LE.0 ) THEN INFO = -2 ELSE IF( NPROW.LE.0 ) THEN INFO = -3 ELSE IF( NPCOL.LE.0 ) THEN INFO = -4 ELSE IF( NB.LE.0 ) THEN INFO = -5 ELSE IF( THRESH.LE.0 ) THEN INFO = -7 END IF * SIZE = MIN( M, N ) * * Initialize matrix descriptors. * IA = 1 JA = 1 IU = 1 JU = 1 IVT = 1 JVT = 1 * LDA = NUMROC( M, NB, MYROW, 0, NPROW ) LDA = MAX( 1, LDA ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) LDU = LDA SIZEQ = NUMROC( SIZE, NB, MYCOL, 0, NPCOL ) LDVT = NUMROC( SIZE, NB, MYROW, 0, NPROW ) LDVT = MAX( 1, LDVT ) CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, CONTEXT, LDA, DINFO ) CALL DESCINIT( DESCU, M, SIZE, NB, NB, 0, 0, CONTEXT, LDU, DINFO ) CALL DESCINIT( DESCVT, SIZE, N, NB, NB, 0, 0, CONTEXT, LDVT, $ DINFO ) * * Set some pointers to work array in order to do "dummy" calls. * PTRA = 2 PTRAC = PTRA + LDA*NQ PTRD = PTRAC + LDA*NQ PTRS = PTRD + SIZE PTRSC = PTRS + SIZE PTRWORK = PTRSC + SIZE * PTRU = PTRWORK PTRVT = PTRWORK PTRUC = PTRWORK PTRVTC = PTRWORK * * "Dummy" calls -- return required workspace in work(1) without * any calculation. * CALL PSLAGGE( M, N, WORK( PTRD ), WORK( PTRA ), IA, JA, DESCA, $ ISEED, SIZE, WORK( PTRWORK ), -1, DINFO ) WPSLAGGE = INT( WORK( PTRWORK ) ) * CALL PSGESVD( 'V', 'V', M, N, WORK( PTRA ), IA, JA, DESCA, $ WORK( PTRS ), WORK( PTRU ), IU, JU, DESCU, $ WORK( PTRVT ), IVT, JVT, DESCVT, $ WORK( PTRWORK ), -1, DINFO ) WPSGESVD = INT( WORK( PTRWORK ) ) * CALL PSSVDCHK( M, N, WORK( PTRAC ), IA, JA, DESCA, WORK( PTRUC ), $ IU, JU, DESCU, WORK( PTRVT ), IVT, JVT, DESCVT, $ WORK( PTRS ), THRESH, WORK( PTRWORK ), -1, $ RESULT, CHK, MTM ) WPSSVDCHK = INT( WORK( PTRWORK ) ) * CALL PSSVDCMP( M, N, 1, WORK( PTRS ), WORK( PTRSC ), WORK( PTRU ), $ WORK( PTRUC ), IU, JU, DESCU, WORK( PTRVT ), $ WORK( PTRVTC ), IVT, JVT, DESCVT, THRESH, $ RESULT, DELTA, WORK( PTRWORK ), -1 ) WPSSVDCMP = INT( WORK( PTRWORK ) ) * * Calculation of workspace at last. * LWMIN = 1 + 2*LDA*NQ + 3*SIZE + $ MAX( WPSLAGGE, LDU*SIZEQ+LDVT*NQ+MAX( LDU*SIZEQ, $ LDVT*NQ )+WPSGESVD+MAX( WPSSVDCHK, WPSSVDCMP ) ) WORK( 1 ) = LWMIN * * If this is a "dummy" call, return. * IF( LWORK.EQ.-1 ) $ GO TO 120 IF( INFO.EQ.0 ) THEN IF( LWORK.LT.LWMIN ) THEN INFO = -10 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PSSVDTST', -INFO ) RETURN END IF * ULP = PSLAMCH( CONTEXT, 'P' ) UNFL = PSLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL SLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF * * Loop over matrix types. * DO 100 ITYPE = 1, NTYPES * PASS = 0 SETHET = 0 PTRWORK = PTRSC + SIZE LLWORK = LWORK - PTRWORK + 1 * * Compute A. * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix. * DO 10 I = 1, SIZE WORK( PTRD+I-1 ) = ZERO 10 CONTINUE * CALL PSLASET( 'All', M, N, ZERO, ZERO, WORK( PTRA ), $ IA, JA, DESCA ) * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix. * DO 20 I = 1, SIZE WORK( PTRD+I-1 ) = ONE 20 CONTINUE * CALL PSLASET( 'All', M, N, ZERO, ONE, WORK( PTRA ), $ IA, JA, DESCA ) * ELSE IF( ITYPE.GT.2 ) THEN * * Preset Singular Values. * IF( SIZE.NE.1 ) THEN H = ( ULP-1 ) / ( SIZE-1 ) DO 30 I = 1, SIZE WORK( PTRD+I-1 ) = 1 + H*( I-1 ) 30 CONTINUE ELSE WORK( PTRD ) = 1 END IF * IF( ITYPE.EQ.3 ) THEN * * Diagonal Matrix with specified singular values. * CALL PSLASET( 'All', M, N, ZERO, ZERO, WORK( PTRA ), $ IA, JA, DESCA ) * DO 40 I = 1, SIZE CALL PSELSET( WORK( PTRA ), I, I, DESCA, $ WORK( PTRD+I-1 ) ) 40 CONTINUE * ELSE IF( ITYPE.EQ.4 ) THEN * * General matrix with specified singular values. * CALL PSLAGGE( M, N, WORK( PTRD ), WORK( PTRA ), IA, JA, $ DESCA, ISEED, SIZE, WORK( PTRWORK ), $ LLWORK, INFO ) * ELSE IF( ITYPE.EQ.5 ) THEN * * Singular values scaled by overflow. * CALL SSCAL( SIZE, RTOVFL, WORK( PTRD ), 1 ) * CALL PSLAGGE( M, N, WORK( PTRD ), WORK( PTRA ), IA, JA, $ DESCA, ISEED, SIZE, WORK( PTRWORK ), $ LLWORK, INFO ) * ELSE IF( ITYPE.EQ.6 ) THEN * * Singular values scaled by underflow. * CALL SSCAL( SIZE, RTUNFL, WORK( PTRD ), 1 ) CALL PSLAGGE( M, N, WORK( PTRD ), WORK( PTRA ), IA, JA, $ DESCA, ISEED, SIZE, WORK( PTRWORK ), $ LLWORK, INFO ) * END IF * END IF * * Set mapping between JOBTYPE and calling parameters of * PSGESVD, reset pointers to WORK array to save space. * DO 80 JOBTYPE = 1, 4 * IF( JOBTYPE.EQ.1 ) THEN JOBU = 'V' JOBVT = 'V' PTRVT = PTRU + LDU*SIZEQ PTRUC = PTRVT + LDVT*NQ PTRWORK = PTRUC + LDU*SIZEQ LLWORK = LWORK - PTRWORK + 1 ELSE IF( JOBTYPE.EQ.2 ) THEN JOBU = 'V' JOBVT = 'N' ELSE IF( JOBTYPE.EQ.3 ) THEN JOBU = 'N' JOBVT = 'V' PTRVTC = PTRUC PTRWORK = PTRVTC + LDVT*NQ LLWORK = LWORK - PTRWORK + 1 ELSE IF( JOBTYPE.EQ.4 ) THEN JOBU = 'N' JOBVT = 'N' PTRWORK = PTRUC LLWORK = LWORK - PTRWORK + 1 END IF * * Duplicate matrix A. * CALL PSLACPY( 'A', M, N, WORK( PTRA ), IA, JA, DESCA, $ WORK( PTRAC ), IA, JA, DESCA ) * * Test SVD calculation with minimum amount of workspace * calculated earlier. * IF( JOBTYPE.EQ.1 ) THEN * * Run SVD. CALL SLBOOT CALL BLACS_BARRIER( CONTEXT, 'All' ) CALL SLTIMER( 1 ) * CALL PSGESVD( JOBU, JOBVT, M, N, WORK( PTRAC ), IA, JA, $ DESCA, WORK( PTRS ), WORK( PTRU ), IU, JU, $ DESCU, WORK( PTRVT ), IVT, JVT, DESCVT, $ WORK( PTRWORK ), WPSGESVD, INFO ) * CALL SLTIMER( 1 ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 1, 1, CTIME ) * * Check INFO. Different INFO for different processes mean * something went wrong. * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, $ 1, -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), $ 1, 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' GO TO 120 END IF END IF * * If INFO is negative PXERBLA tells you. So the only thing * is to check for positive INFO -- detected heterogeneous * system. * IF( INFO.EQ.( SIZE+1 ) ) THEN HETERO = 'P' SETHET = 1 END IF * * If INFO was fine do more exhaustive check. * IF( INFO.EQ.ZERO ) THEN * DO 50 I = 1, SIZE WORK( I+PTRWORK ) = WORK( I+PTRS-1 ) WORK( I+SIZE+PTRWORK ) = WORK( I+PTRS-1 ) 50 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', SIZE, 1, $ WORK( 1+PTRWORK ), SIZE, 1, 1, -1, -1, $ 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', SIZE, 1, $ WORK( 1+SIZE+PTRWORK ), SIZE, 1, 1, -1, $ -1, 0 ) * DO 60 I = 1, SIZE IF( ABS( WORK( I+PTRWORK )-WORK( SIZE+I+ $ PTRWORK ) ).GT.ZERO ) THEN WRITE( NOUT, FMT = * )'I= ', I, ' MIN=', $ WORK( I+PTRWORK ), ' MAX=', $ WORK( SIZE+I+PTRWORK ) HETERO = 'T' SETHET = 1 GO TO 70 END IF * 60 CONTINUE 70 CONTINUE * END IF * IF( SETHET.NE.1 ) $ HETERO = 'N' * * After PSGESVD AC got screwed up -- need to copy again. * CALL PSLACPY( 'A', M, N, WORK( PTRA ), IA, JA, DESCA, $ WORK( PTRAC ), IA, JA, DESCA ) * * PSSVDCHK screws up U. So before the call to PSSVDCHK * U is copied to UC and a pointer to UC is passed to * PSSVDCHK. * CALL PSLACPY( 'A', M, SIZE, WORK( PTRU ), IU, JU, DESCU, $ WORK( PTRUC ), IU, JU, DESCU ) * * Run tests 1 - 4. * CALL PSSVDCHK( M, N, WORK( PTRAC ), IA, JA, DESCA, $ WORK( PTRUC ), IU, JU, DESCU, $ WORK( PTRVT ), IVT, JVT, DESCVT, $ WORK( PTRS ), THRESH, WORK( PTRWORK ), $ LLWORK, RESULT, CHK, MTM ) * ELSE * * Once again test PSGESVD with min workspace. * CALL PSGESVD( JOBU, JOBVT, M, N, WORK( PTRAC ), IA, JA, $ DESCA, WORK( PTRSC ), WORK( PTRUC ), IU, $ JU, DESCU, WORK( PTRVTC ), IVT, JVT, $ DESCVT, WORK( PTRWORK ), WPSGESVD, INFO ) * CALL PSSVDCMP( M, N, JOBTYPE, WORK( PTRS ), $ WORK( PTRSC ), WORK( PTRU ), $ WORK( PTRUC ), IU, JU, DESCU, $ WORK( PTRVT ), WORK( PTRVTC ), IVT, JVT, $ DESCVT, THRESH, RESULT, DELTA, $ WORK( PTRWORK ), LLWORK ) * END IF * 80 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN DO 90 I = 1, 9 IF( RESULT( I ).EQ.1 ) THEN PASS = 1 WRITE( NOUT, FMT = * )'Test I = ', I, 'has failed' WRITE( NOUT, FMT = * )' ' END IF 90 CONTINUE IF( PASS.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 )'Passed', WTIME( 1 ), $ CTIME( 1 ), M, N, NPROW, NPCOL, NB, ITYPE, CHK, MTM, $ DELTA, HETERO END IF END IF 100 CONTINUE CALL BLACS_GRIDEXIT( CONTEXT ) 110 CONTINUE * 9999 FORMAT( A6, 2E10.3, 2I6, 2I4, I5, I6, 3F6.2, 4X, A1 ) 120 CONTINUE * * End of PSSVDTST * RETURN END scalapack-2.0.2/TESTING/EIG/pssytdrv.f000644 000766 000024 00000040666 10363532303 017511 0ustar00juliestaff000000 000000 SUBROUTINE PSSYTDRV( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSSYTDRV computes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from Q, the * symmetric tridiagonal matrix T (or D and E), and TAU, which were * computed by PSSYTRD: sub( A ) := Q * T * Q'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of sub( A ). On entry, * if UPLO='U', the diagonal and first superdiagonal of sub( A ) * have the corresponding elements of the tridiagonal matrix T, * and the elements above the first superdiagonal, with the * array TAU, represent the orthogonal matrix Q as a product of * elementary reflectors, and the strictly lower triangular part * of sub( A ) is not referenced. If UPLO='L', the diagonal and * first subdiagonal of sub( A ) have the corresponding elements * of the tridiagonal matrix T, and the elements below the first * subdiagonal, with the array TAU, represent the orthogonal * matrix Q as a product of elementary reflectors, and the * strictly upper triangular part of sub( A ) is not referenced. * On exit, if UPLO = 'U', the upper triangular part of the * distributed symmetric matrix sub( A ) is recovered. * If UPLO='L', the lower triangular part of the distributed * symmetric matrix sub( A ) is recovered. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local input) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local input) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local input) REAL, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK >= 2 * NB *( NB + NP ) * * where NB = MB_A = NB_A, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * On exit, if INFO <> 0, a discrepancy has been found between * the diagonal and off-diagonal elements of A and the copies * contained in the arrays D and E. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL EIGHT, HALF, ONE, ZERO PARAMETER ( EIGHT = 8.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, IACOL, IAROW, ICTXT, II, IPT, IPV, IPX, $ IPY, J, JB, JJ, JL, K, MYCOL, MYROW, NB, NP, $ NPCOL, NPROW REAL ADDBND, D1, D2, E1, E2 * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCV( DLEN_ ), $ DESCT( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC REAL PSLAMCH EXTERNAL INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, IGSUM2D, $ PSELGET, PSGEMM, PSLACPY, $ PSLARFT, PSLASET, PSSYMM, $ PSSYR2K, PSTRMM * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 NB = DESCA( MB_ ) UPPER = LSAME( UPLO, 'U' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) * IPT = 1 IPV = NB * NB + IPT IPX = NB * NP + IPV IPY = NB * NP + IPX * CALL DESCSET( DESCD, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * ADDBND = EIGHT * PSLAMCH( ICTXT, 'eps' ) * IF( UPPER ) THEN * CALL DESCSET( DESCE, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 10 J = 0, N-1 D1 = ZERO E1 = ZERO D2 = ZERO E2 = ZERO CALL PSELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PSELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(N-1) ) THEN CALL PSELGET( ' ', ' ', E2, E, 1, JA+J+1, DESCE ) CALL PSELGET( 'Columnwise', ' ', E1, A, IA+J, JA+J+1, $ DESCA ) END IF * IF( ( ABS( D1 - D2 ).GT.( ABS( D2 ) * ADDBND ) ) .OR. $ ( ABS( E1 - E2 ).GT.( ABS( E2 ) * ADDBND ) ) ) $ INFO = INFO + 1 10 CONTINUE * * Compute the upper triangle of sub( A ). * CALL DESCSET( DESCV, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCT, NB, NB, NB, NB, IAROW, IACOL, ICTXT, NB ) * DO 20 K = 0, N-1, NB JB = MIN( NB, N-K ) I = IA + K J = JA + K * * Compute the lower triangular matrix T. * CALL PSLARFT( 'Backward', 'Columnwise', K+JB-1, JB, A, IA, $ J, DESCA, TAU, WORK( IPT ), WORK( IPV ) ) * * Copy Householder vectors into WORK( IPV ). * CALL PSLACPY( 'All', K+JB-1, JB, A, IA, J, DESCA, $ WORK( IPV ), 1, 1, DESCV ) * IF( K.GT.0 ) THEN CALL PSLASET( 'Lower', JB+1, JB, ZERO, ONE, WORK( IPV ), $ K, 1, DESCV ) ELSE CALL PSLASET( 'Lower', JB, JB-1, ZERO, ONE, WORK( IPV ), $ 1, 2, DESCV ) CALL PSLASET( 'Ge', JB, 1, ZERO, ZERO, WORK( IPV ), 1, $ 1, DESCV ) END IF * * Zero out the strict upper triangular part of A. * IF( K.GT.0 ) THEN CALL PSLASET( 'Ge', K-1, JB, ZERO, ZERO, A, IA, J, $ DESCA ) CALL PSLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, A, I-1, $ J+1, DESCA ) ELSE IF( JB.GT.1 ) THEN CALL PSLASET( 'Upper', JB-2, JB-2, ZERO, ZERO, A, IA, $ J+2, DESCA ) END IF * * (1) X := A * V * T' * CALL PSSYMM( 'Left', 'Upper', K+JB, JB, ONE, A, IA, JA, $ DESCA, WORK( IPV ), 1, 1, DESCV, ZERO, $ WORK( IPX ), 1, 1, DESCV ) CALL PSTRMM( 'Right', 'Lower', 'Transpose', 'Non-Unit', $ K+JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPX ), 1, 1, DESCV ) * * (2) X := X - 1/2 * V * (T * V' * X) * CALL PSGEMM( 'Transpose', 'No transpose', JB, JB, K+JB, ONE, $ WORK( IPV ), 1, 1, DESCV, WORK( IPX ), 1, 1, $ DESCV, ZERO, WORK( IPY ), 1, 1, DESCT ) CALL PSTRMM( 'Left', 'Lower', 'No transpose', 'Non-Unit', $ JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPY ), 1, 1, DESCT ) CALL PSGEMM( 'No tranpose', 'No transpose', K+JB, JB, JB, $ -HALF, WORK( IPV ), 1, 1, DESCV, WORK( IPY ), $ 1, 1, DESCT, ONE, WORK( IPX ), 1, 1, DESCV ) * * (3) A := A - X * V' - V * X' * CALL PSSYR2K( 'Upper', 'No transpose', K+JB, JB, -ONE, $ WORK( IPV ), 1, 1, DESCV, WORK( IPX ), 1, 1, $ DESCV, ONE, A, IA, JA, DESCA ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) DESCT( CSRC_ ) = MOD( DESCT( CSRC_ ) + 1, NPCOL ) * 20 CONTINUE * ELSE * CALL DESCSET( DESCE, 1, JA+N-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 30 J = 0, N-1 D1 = ZERO E1 = ZERO D2 = ZERO E2 = ZERO CALL PSELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PSELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(N-1) ) THEN CALL PSELGET( ' ', ' ', E2, E, 1, JA+J, DESCE ) CALL PSELGET( 'Columnwise', ' ', E1, A, IA+J+1, JA+J, $ DESCA ) END IF * IF( ( ABS( D1 - D2 ).GT.( ABS( D2 ) * ADDBND ) ) .OR. $ ( ABS( E1 - E2 ).GT.( ABS( E2 ) * ADDBND ) ) ) $ INFO = INFO + 1 30 CONTINUE * * Compute the lower triangle of sub( A ). * JL = MAX( ( ( JA+N-2 ) / NB ) * NB + 1, JA ) IACOL = INDXG2P( JL, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCV, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCT, NB, NB, NB, NB, INDXG2P( IA+JL-JA+1, NB, $ MYROW, DESCA( RSRC_ ), NPROW ), IACOL, ICTXT, $ NB ) * DO 40 J = JL, JA, -NB K = J - JA + 1 I = IA + K - 1 JB = MIN( N-K+1, NB ) * * Compute upper triangular matrix T from TAU. * CALL PSLARFT( 'Forward', 'Columnwise', N-K, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPV ) ) * * Copy Householder vectors into WORK( IPV ). * CALL PSLACPY( 'Lower', N-K, JB, A, I+1, J, DESCA, $ WORK( IPV ), K+1, 1, DESCV ) CALL PSLASET( 'Upper', N-K, JB, ZERO, ONE, WORK( IPV ), $ K+1, 1, DESCV ) CALL PSLASET( 'Ge', 1, JB, ZERO, ZERO, WORK( IPV ), K, 1, $ DESCV ) * * Zero out the strict lower triangular part of A. * CALL PSLASET( 'Lower', N-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * (1) X := A * V * T' * CALL PSSYMM( 'Left', 'Lower', N-K+1, JB, ONE, A, I, J, $ DESCA, WORK( IPV ), K, 1, DESCV, ZERO, $ WORK( IPX ), K, 1, DESCV ) CALL PSTRMM( 'Right', 'Upper', 'Transpose', 'Non-Unit', $ N-K+1, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPX ), K, 1, DESCV ) * * (2) X := X - 1/2 * V * (T * V' * X) * CALL PSGEMM( 'Transpose', 'No transpose', JB, JB, N-K+1, $ ONE, WORK( IPV ), K, 1, DESCV, WORK( IPX ), $ K, 1, DESCV, ZERO, WORK( IPY ), 1, 1, DESCT ) CALL PSTRMM( 'Left', 'Upper', 'No transpose', 'Non-Unit', $ JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPY ), 1, 1, DESCT ) CALL PSGEMM( 'No transpose', 'No transpose', N-K+1, JB, JB, $ -HALF, WORK( IPV ), K, 1, DESCV, WORK( IPY ), $ 1, 1, DESCT, ONE, WORK( IPX ), K, 1, DESCV ) * * (3) A := A - X * V' - V * X' * CALL PSSYR2K( 'Lower', 'No tranpose', N-K+1, JB, -ONE, $ WORK( IPV ), K, 1, DESCV, WORK( IPX ), K, 1, $ DESCV, ONE, A, I, J, DESCA ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCT( RSRC_ ) = MOD( DESCT( RSRC_ ) + NPROW - 1, NPROW ) DESCT( CSRC_ ) = MOD( DESCT( CSRC_ ) + NPCOL - 1, NPCOL ) * 40 CONTINUE * END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * RETURN * * End of PSSYTDRV * END scalapack-2.0.2/TESTING/EIG/pstrddriver.f000644 000766 000024 00000046646 10363532303 020167 0ustar00juliestaff000000 000000 PROGRAM PSTRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * Purpose * ======== * * PSTRDDRIVER is the main test program for the REAL * SCALAPACK TRD (symmetric tridiagonal reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 13 lines: * 'ScaLAPACK TRD computation input file' * 'PVM machine' * 'TRD.out' output file name * 6 device out * 'L' define Lower or Upper * 3 number of problems sizes * 5 31 201 values of N * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * REALSZ INTEGER, default = 4 bytes. * INTGSZ and REALSZ indicate the length in bytes on the * given platform for an integer and a single precision real. * MEM REAL array, dimension ( TOTMEM / REALSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER REALSZ, TOTMEM, MEMSIZ, NTESTS REAL PADVAL PARAMETER ( REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, NTESTS = 20, $ PADVAL = -9923.0E+0 ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPT, IPW, ITEMP, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LCM, LWORK, MYCOL, $ MYROW, N, NB, NDIAG, NGRIDS, NMAT, NNB, NOFFD, $ NOUT, NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ, $ WORKTRD REAL ANORM, FRESID, THRESH DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), QVAL( NTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PSCHEKPAD, $ PSFILLPAD, PSLAFCHK, PSMATGEN, PSSYTDRV, $ PSSYTRD, PSTRDINFO, PSTTRDTESTER, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC REAL PSLANSY EXTERNAL LSAME, ICEIL, ILCM, NUMROC, PSLANSY * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )STOP * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PSTRDINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * NDIAG = NQ IF( LSAME( UPLO, 'U' ) ) THEN NOFFD = NQ ELSE NOFFD = NUMROC( N-1, NB, MYCOL, 0, NPCOL ) END IF * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPT = IPE + NOFFD + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * LWORK = MAX( NB*( NP+1 ), 3*NB ) WORKTRD = LWORK + IPOSTPAD WORKSIZ = WORKTRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN ITEMP = 2*NQ + NP IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) ITEMP = NB*ICEIL( ICEIL( NP, NB ), LCM / NPROW ) + $ ITEMP END IF ITEMP = MAX( ITEMP, 2*( NB+NP )*NB ) WORKSIZ = MAX( LWORK, ITEMP ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Tridiagonal reduction', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PSMATGEN( ICTXT, 'Symm', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), NQ, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANSY( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to symmetric tridiagonal form * CALL PSSYTRD( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSSYTRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTRD', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTRD', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTRD', WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - QTQ'|| / (||A|| * N * eps) * CALL PSSYTDRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), IERR( 1 ) ) CALL PSLAFCHK( 'Symm', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSSYTDRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTDRV', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTDRV', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTDRV', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ. $ 0.0E+0 .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 )FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * )'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * TRD requires 4/3 N^3 floating point operations * NOPS = DBLE( N ) * NOPS = ( 4.0D+0 / 3.0D+0 )*NOPS**3 NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', UPLO, N, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', UPLO, N, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, FRESID, PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * CALL PSTTRDTESTER( IAM, NPROCS, CHECK, NOUT, THRESH, NVAL, NMAT, $ MEM, TOTMEM, KPASS, KFAIL, KSKIP ) * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UPLO N NB P Q TRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ---- ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, A4, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, F9.2, 1X, $ F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*T*Q''|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PSTRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pstrdinfo.f000644 000766 000024 00000032241 10363532303 017611 0ustar00juliestaff000000 000000 SUBROUTINE PSTRDINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NNB, NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY*(*) INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PSTRDINFO gets needed startup information for the symmetric * tridiagonal reduction and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * MAX( 5, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT REAL EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL PSLAMCH, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='TRD.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GOTO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GOTO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GOTO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GOTO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS IF( LSAME( UPLO, 'L' ) ) THEN WORK( 4 ) = 1 ELSE WORK( 4 ) = 2 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Reduction Routine to symmetric '// $ 'tridiagonal form.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'real single precision symmetric '// $ 'tridiagonal' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - QTQ''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether the ''Upper'' or ''Low'// $ 'er'' part of A is to be referenced.' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less'// $ 'than THRESH, CHECK is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'TRD time : Time in seconds to reduce the'// $ ' matrix to tridiagonal form.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for '// $ 'symmetric tridiagonal reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) IF( WORK( 4 ).EQ.1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PSTRDINFO * END scalapack-2.0.2/TESTING/EIG/psttrdtester.f000644 000766 000024 00000061630 10363532303 020354 0ustar00juliestaff000000 000000 SUBROUTINE PSTTRDTESTER( IAM, NPROCS, CHECK, NOUT, THRESH, NVAL, $ NMAT, MEM, TOTMEM, KPASS, KFAIL, KSKIP ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * February 24, 2000 * * .. Scalar Arguments .. LOGICAL CHECK INTEGER IAM, KFAIL, KPASS, KSKIP, NMAT, NOUT, NPROCS, $ TOTMEM REAL THRESH * .. * .. Array Arguments .. INTEGER NVAL( * ) REAL MEM( * ) * .. * * Purpose * ======= * * PSTTRDTESTER tests PSSYTTRD * * Arguments * ========= * * IAM (local input) INTEGER * The local process number * * NPROCS (global input) INTEGER * The number of processors * * CHECK (global input) LOGICAL * Specifies whether the user wants to check the answer * * NOUT (local input) INTEGER * File descriptor * * THRESH (global input) REAL * Acceptable error threshold * * NVAL (global input) INTEGER array dimension NMAT * The matrix sizes to test * * NMAT (global input) INTEGER * The number of matrix sizes to test * * MEM (local input) REAL array dimension MEMSIZ * Where: * MEMSIZ = TOTMEM / REALSZ * * TOTMEM (global input) INTEGER * Number of bytes in MEM * * KPASS (local input/output) INTEGER * The number of tests which passed. Only relevant on * processor 0. * * KFAIL (local input/output) INTEGER * The number of tests which failed. Only relevant on * processor 0. * * KSKIP (local input/output) INTEGER * The number of tests which were skipped. Only relevant on * processor 0. * * ================================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER REALSZ REAL PADVAL PARAMETER ( REALSZ = 4, PADVAL = -9923.0E+0 ) INTEGER TIMETESTS PARAMETER ( TIMETESTS = 11 ) INTEGER TESTS PARAMETER ( TESTS = 8 ) INTEGER MINTIMEN PARAMETER ( MINTIMEN = 8 ) * .. * .. Local Scalars .. LOGICAL TIME CHARACTER UPLO CHARACTER*6 PASSED INTEGER DUMMY, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPT, IPW, ITEMP, J, K, $ LCM, LWMIN, MAXTESTS, MEMSIZ, MYCOL, MYROW, N, $ NB, NDIAG, NGRIDS, NN, NOFFD, NP, NPCOL, NPROW, $ NPS, NQ, SPLITSTIMED, WORKSIZ, WORKTRD REAL ANORM, FRESID DOUBLE PRECISION NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER ANBTEST( TESTS ), ANBTIME( TIMETESTS ), $ BALTEST( TESTS ), BALTIME( TIMETESTS ), $ DESCA( DLEN_ ), DESCD( DLEN_ ), IERR( 1 ), $ INTERTEST( TESTS ), INTERTIME( TIMETESTS ), $ PNBTEST( TESTS ), PNBTIME( TIMETESTS ), $ TWOGEMMTEST( TESTS ), TWOGEMMTIME( TIMETESTS ) DOUBLE PRECISION CTIME( 100 ), WTIME( 100 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, DESCINIT, $ IGEBR2D, IGEBS2D, IGSUM2D, PSCHEKPAD, $ PSFILLPAD, PSLAFCHK, PSLATRAN, PSMATGEN, $ PSSYTDRV, PSSYTTRD, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC, PJLAENV REAL PSLANSY EXTERNAL LSAME, ICEIL, ILCM, NUMROC, PJLAENV, PSLANSY * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, REAL, SQRT * .. * * .. Scalars in Common .. INTEGER ANB, BALANCED, BCKBLOCK, GSTBLOCK, INTERLEAVE, $ LLTBLOCK, MINSZ, PNB, TIMEINTERNALS, TIMING, $ TRSBLOCK, TWOGEMMS * .. * .. Common blocks .. COMMON / BLOCKSIZES / GSTBLOCK, LLTBLOCK, BCKBLOCK, $ TRSBLOCK COMMON / MINSIZE / MINSZ COMMON / PJLAENVTIMING / TIMING COMMON / TAILOREDOPTS / PNB, ANB, INTERLEAVE, $ BALANCED, TWOGEMMS COMMON / TIMECONTROL / TIMEINTERNALS * .. * .. Data statements .. DATA BALTIME / 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0 / DATA INTERTIME / 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1 / DATA TWOGEMMTIME / 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0 / DATA ANBTIME / 16, 16, 16, 16, 16, 8, 8, 32, 32, 16, $ 16 / DATA PNBTIME / 32, 32, 32, 32, 32, 32, 32, 32, 32, $ 16, 64 / DATA BALTEST / 0, 0, 0, 0, 1, 1, 1, 1 / DATA INTERTEST / 0, 0, 1, 1, 0, 0, 1, 1 / DATA TWOGEMMTEST / 0, 1, 0, 1, 0, 1, 0, 1 / DATA ANBTEST / 1, 2, 3, 16, 1, 2, 3, 16 / DATA PNBTEST / 1, 16, 8, 1, 16, 8, 1, 16 / * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * IASEED = 100 SPLITSTIMED = 0 NB = 1 UPLO = 'L' MEMSIZ = TOTMEM / REALSZ * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * NGRIDS = INT( SQRT( REAL( NPROCS ) ) ) * DO 30 NN = 1, NGRIDS * NPROW = NN NPCOL = NN IERR( 1 ) = 0 * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * IF( N.GT.MINTIMEN ) THEN * * For timing tests, we perform one or two extra tests. * Both of these extra tests are performed with the * default values for the performance tuning parameters. * The second extra test (which is only performed if * split times are non-zero) is performed with timeinternals * set to 1 (which forces barrier syncs between many * phases of the computation). * TIME = .TRUE. MAXTESTS = TIMETESTS + 2 ELSE TIME = .FALSE. MAXTESTS = TESTS END IF * * DO 10 K = 1, MAXTESTS TIMEINTERNALS = 0 IF( TIME ) THEN IF( K.GE.MAXTESTS-1 ) THEN * * For the last two timings, we let pjlaenv set * the execution path values. These dummy * initializations aren't really necessary, * but they illustrate the fact that these values are * set in xpjlaenv. The dummy call to pjlaenv * has the side effect of setting ANB. * MINSZ = -13 BALANCED = -13 INTERLEAVE = -13 TWOGEMMS = -13 ANB = -13 PNB = -13 TIMING = 1 DUMMY = PJLAENV( ICTXT, 3, 'PSSYTTRD', 'L', 0, 0, $ 0, 0 ) IF( K.EQ.MAXTESTS ) $ TIMEINTERNALS = 1 ELSE TIMING = 0 MINSZ = 1 BALANCED = BALTIME( K ) INTERLEAVE = INTERTIME( K ) TWOGEMMS = TWOGEMMTIME( K ) ANB = ANBTIME( K ) PNB = PNBTIME( K ) END IF ELSE TIMING = 0 MINSZ = 1 BALANCED = BALTEST( K ) INTERLEAVE = INTERTEST( K ) TWOGEMMS = TWOGEMMTEST( K ) ANB = ANBTEST( K ) PNB = PNBTEST( K ) END IF * * Skip the last test (with timeinternals = 1) if * PSSYTTRD is not collecting the split times. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, SPLITSTIMED, $ 1 ) ELSE CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, SPLITSTIMED, 1, $ 0, 0 ) END IF * * IF( SPLITSTIMED.EQ.0 .AND. K.EQ.MAXTESTS ) $ GO TO 10 * * The following hack tests to make sure that PNB need not * be the same on all processes. (Provided that PNB is set * to 1 in the TRD.dat file.) * IF( PNB.EQ.1 ) $ PNB = 1 + IAM * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * CALL DESCINIT( DESCD, 1, N, NB, NB, 0, 0, ICTXT, 1, $ INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * NDIAG = NQ IF( LSAME( UPLO, 'U' ) ) THEN NOFFD = NQ ELSE NOFFD = NUMROC( N-1, NB, MYCOL, 0, NPCOL ) END IF * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPT = IPE + NOFFD + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS * WORKTRD = LWMIN + IPOSTPAD WORKSIZ = WORKTRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN ITEMP = 2*NQ + NP IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) ITEMP = NB*ICEIL( ICEIL( NP, NB ), LCM / NPROW ) + $ ITEMP END IF ITEMP = MAX( ITEMP, 2*( NB+NP )*NB ) WORKSIZ = MAX( LWMIN, ITEMP ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Tridiagonal reduction', $ ( IPW+WORKSIZ )*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * * * Generate the matrix A * CALL PSMATGEN( ICTXT, 'Hemm', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PSFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), NQ, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PSLANSY( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSLANSY', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSFILLPAD( ICTXT, WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to symmetric tridiagonal form * CALL PSSYTTRD( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), LWMIN, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSSYTTRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTTRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PSCHEKPAD( ICTXT, 'PSSYTTRDc', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTTRDd', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTTRDe', WORKTRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKTRD-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PSFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - QTQ'|| / (||A|| * N * eps) * CALL PSSYTDRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), IERR( 1 ) ) * * TTRD does not preserve the upper triangular part of A. * The following call to PSLATRAN means that we only * check the lower triangular part of A - QTQ' * CALL PSLATRAN( N, 1, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PSLAFCHK( 'Hemm', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PSCHEKPAD( ICTXT, 'PSSYTDRVf', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTDRVg', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTDRVh', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PSCHEKPAD( ICTXT, 'PSSYTDRVi', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ. $ 0.0E+0 .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9991 )FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' * * END IF * * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * )'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 50, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 50, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * TRD requires 16/3 N^3 floating point operations * NOPS = DBLE( N ) NOPS = ( 16.0D+0 / 3.0D+0 )*NOPS**3 NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9992 )'WALL', N, INTERLEAVE, $ TWOGEMMS, BALANCED, ANB, PNB, NPROW*NPCOL, $ WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9992 )'CPU ', N, INTERLEAVE, $ TWOGEMMS, BALANCED, ANB, PNB, NPROW*NPCOL, $ CTIME( 1 ), TMFLOPS, FRESID, PASSED * * * If split times were collected (in PSSYttrd.f), print * them out. * IF( WTIME( 13 )+WTIME( 15 )+WTIME( 16 ).GT.0.0D+0 .OR. $ CTIME( 13 )+CTIME( 15 )+CTIME( 16 ).GT.0.0D+0 ) $ THEN SPLITSTIMED = 1 END IF IF( SPLITSTIMED.EQ.1 ) THEN WRITE( NOUT, FMT = 9990 )WTIME( 10 ), WTIME( 11 ), $ WTIME( 12 ), WTIME( 13 ), WTIME( 14 ), $ WTIME( 15 ) WRITE( NOUT, FMT = 9989 )WTIME( 16 ), WTIME( 17 ), $ WTIME( 18 ), WTIME( 19 ), WTIME( 20 ), $ WTIME( 21 ) * WRITE( NOUT, FMT = 9988 )CTIME( 10 ), CTIME( 11 ), $ CTIME( 12 ), CTIME( 13 ), CTIME( 14 ), $ CTIME( 15 ) WRITE( NOUT, FMT = 9987 )CTIME( 16 ), CTIME( 17 ), $ CTIME( 18 ), CTIME( 19 ), CTIME( 20 ), $ CTIME( 21 ) WRITE( NOUT, FMT = 9986 )N, NPROW*NPCOL, PNB, ANB, $ INTERLEAVE, BALANCED, TWOGEMMS, TIMEINTERNALS END IF END IF 10 CONTINUE 20 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN IF( SPLITSTIMED.EQ.1 ) THEN WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = 9974 ) WRITE( NOUT, FMT = 9973 ) END IF END IF * * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE RETURN * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) * 9995 FORMAT( 'PSSYTTRD, tailored reduction to tridiagonal form, test.' $ ) 9994 FORMAT( 'TIME N int 2gm bal anb pnb prcs TRD Time ', $ ' MFLOPS Residual CHECK' ) 9993 FORMAT( '---- ---- --- --- --- --- --- ---- -------- ', $ '----------- -------- ------' ) 9992 FORMAT( A4, 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 1X, $ I5, 1X, F9.2, 1X, F11.2, 1X, F8.2, 1X, A6 ) 9991 FORMAT( '||A - Q*T*Q''|| / (||A|| * N * eps) = ', G25.7 ) 9990 FORMAT( 'wsplit1=[wsplit1;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9989 FORMAT( 'wsplit2=[wsplit2;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9988 FORMAT( 'csplit1=[csplit1;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9987 FORMAT( 'csplit2=[csplit2;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9986 FORMAT( 'size_opts=[size_opts;', I4, 1X, I4, 1X, I4, 1X, I4, 1X, $ I4, 1X, I4, 1X, I4, 1X, I4, 1X, ' ];' ) 9985 FORMAT( 'N=1; NPROCS=2; PNB=3; ANB=4; INTERLEAVE=5; BALANCED=6;', $ ' TWOGEMMS=7; TIMEINTERNALS=8;' ) 9984 FORMAT( 'S1_OVERHEAD = 1; % Should be mainly cost of barrier' ) 9983 FORMAT( 'S1_BARRIER = 2; % Cost of barrier' ) 9982 FORMAT( 'S1_UPDCURCOL = 3; % Update the current column' ) 9981 FORMAT( 'S1_HOUSE = 4; % Compute the householder vector' ) 9980 FORMAT( 'S1_SPREAD = 5; % Spread across' ) 9979 FORMAT( 'S1_TRANSPOSE = 6; % Transpose' ) 9978 FORMAT( 'S2_UPDCURBLK = 1; % Update the current block column' ) 9977 FORMAT( 'S2_TRMVT = 2; % TRMVT v = A * h; vt = ht * A'' ' ) 9976 FORMAT( 'S2_UPD_V = 3; % v = v + V * HT * h + H * VT * h ' ) 9975 FORMAT( 'S2_TRANS_SUM = 4; % v = v + vt'' ' ) 9974 FORMAT( 'S2_DOT = 5; % c = v'' * h ' ) 9973 FORMAT( 'S2_R2K = 6; % A = A - v * h'' - h * v'' ' ) * * * End of PSTTRDTESTER * END scalapack-2.0.2/TESTING/EIG/pzbrddriver.f000644 000766 000024 00000050074 10363532303 020142 0ustar00juliestaff000000 000000 PROGRAM PZBRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * Purpose * ======= * * PZBRDDRIVER is the main test program for the COMPLEX*16 * ScaLAPACK BRD (bidiagonal reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 13 lines: * 'ScaLAPACK BRD computation input file' * 'PVM machine' * 'BRD.out' output file name * 6 device out * 3 number of problems sizes * 16 20 18 values of M * 16 18 20 values of N * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 1.0 threshold * * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER MEMSIZ, NTESTS, TOTMEM, ZPLXSZ, DBLESZ COMPLEX*16 PADVAL PARAMETER ( TOTMEM = 2000000, ZPLXSZ = 16, DBLESZ = 8, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPTP, IPTQ, IPW, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LWORK, M, MAXMN, $ MINMN, MNP, MNQ, MP, MYCOL, MYROW, N, NB, $ NDIAG, NGRIDS, NMAT, NNB, NOFFD, NOUT, NPCOL, $ NPROCS, NPROW, NQ, WORKBRD, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ MVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZCHEKPAD, $ PZBRDINFO, PZFILLPAD, PZLAFCHK, $ PZMATGEN, PZGEBDRV, PZGEBRD, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PZLANGE EXTERNAL ICEIL, NUMROC, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PZBRDINFO( OUTFILE, NOUT, NMAT, MVAL, NTESTS, NVAL, NTESTS, $ NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * * Go to bottom of loop if this case doesn't use my process * DO 20 J = 1, NMAT * M = MVAL( J ) N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( M.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'M', M IERR( 1 ) = 1 ELSE IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * MP = NUMROC( M, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) MNP = NUMROC( MIN( M, N ), NB, MYROW, 0, NPROW ) MNQ = NUMROC( MIN( M, N ), NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, MP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, M, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, MP )+IMIDPAD, IERR( 1 ) ) * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IF( M.GE.N ) THEN NDIAG = MNQ NOFFD = MNP NDIAG = ICEIL( DBLESZ*NDIAG, ZPLXSZ ) NOFFD = ICEIL( DBLESZ*NOFFD, ZPLXSZ ) ELSE NDIAG = MNP NOFFD = NUMROC( MIN( M, N )-1, NB, MYCOL, 0, NPCOL ) NDIAG = ICEIL( DBLESZ*NDIAG, ZPLXSZ ) NOFFD = ICEIL( DBLESZ*NOFFD, ZPLXSZ ) END IF * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPTQ = IPE + NOFFD + IPOSTPAD + IPREPAD IPTP = IPTQ + MNQ + IPOSTPAD + IPREPAD IPW = IPTP + MNP + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * LWORK = NB*( MP+NQ+1 ) + NQ WORKBRD = LWORK + IPOSTPAD WORKSIZ = WORKBRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN WORKSIZ = MAX( LWORK, 2*NB*( MP+NQ+NB ) ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Bidiagonal reduction', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PZMATGEN( ICTXT, 'No', 'No', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, MP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, MP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, MNQ, 1, MEM( IPTQ-IPREPAD ), $ MNQ, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, MNP, 1, MEM( IPTP-IPREPAD ), $ MNP, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANGE( 'I', M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKBRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKBRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to bidiagonal form * CALL PZGEBRD( M, N, MEM( IPA ), 1, 1, DESCA, MEM( IPD ), $ MEM( IPE ), MEM( IPTQ ), MEM( IPTP ), $ MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGEBRD', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEBRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEBRD', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEBRD', MNQ, 1, $ MEM( IPTQ-IPREPAD ), MNQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEBRD', MNP, 1, $ MEM( IPTP-IPREPAD ), MNP, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEBRD', WORKBRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKBRD-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A-Q*B*P|| / (||A|| * N * eps) * CALL PZGEBDRV( M, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPTQ ), $ MEM( IPTP ), MEM( IPW ), IERR( 1 ) ) CALL PZLAFCHK( 'No', 'No', M, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGEBDRV', MP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEBDRV', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEBDRV', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEBDRV', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ.0.0D+0 $ .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID * KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * ) $ 'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * BRD requires 32/3 N^3 floating point operations * MAXMN = MAX( M, N ) MINMN = MIN( M, N ) NOPS = 16.0D+0 * DBLE( MINMN ) * DBLE( MINMN ) * $ ( DBLE( MAXMN ) - DBLE( MINMN ) / 3.0D+0 ) NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', M, N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', M, N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, FRESID, PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME M N NB P Q BRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ------ ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, F9.2, 1X, $ F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*B*P|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PZBRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pzbrdinfo.f000644 000766 000024 00000032202 10363532303 017573 0ustar00juliestaff000000 000000 SUBROUTINE PZBRDINFO( SUMMRY, NOUT, NMAT, MVAL, LDMVAL, NVAL, $ LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 27, 2000 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDMVAL, LDNBVAL, LDNVAL, LDPVAL, $ LDQVAL, NGRIDS, NMAT, NNB, NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. INTEGER MVAL( LDMVAL ), NBVAL( LDNBVAL ), $ NVAL( LDNVAL ), PVAL( LDPVAL ), $ QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZBRDINFO get needed startup information for the bidiagonal * reduction and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, output to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for M & N. * * MVAL (global output) INTEGER array, dimension (LDMVAL) * The values of M (number of rows in matrix) to run the code * with. * * LDMVAL (global input) INTEGER * The maximum number of different values that can be used for * M. LDMVAL >= NMAT. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N. LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * MAX( 5, LDMVAL+LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack * all input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, IGEBR2D, IGEBS2D, $ SCOPY * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'BRD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( UNIT = NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get values of M, N * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1. .OR. NMAT.GT.LDMVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'M', LDMVAL GO TO 20 END IF READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 1, 3, WORK, 1 ) * I = 1 CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Bidiagonal reduction' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex double precision bidiagonal ' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - Q B P''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'M : The number of rows '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less'// $ ' than THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'BRD time : Time in seconds to reduce the'// $ ' matrix' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for '// $ 'the bidiagonal reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9995 ) $ 'M ', ( MVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( MVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9995 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9999 ) ' ' WRITE( NOUT, FMT = 9996 ) EPS WRITE( NOUT, FMT = 9993 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 3, WORK, 1, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = 2*NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 CONTINUE WRITE( NOUT, FMT = 9998 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9997 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9996 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9995 FORMAT( 2X, A5, ': ', 10I6 ) 9994 FORMAT( ' ', 10I6 ) 9993 FORMAT( 'Routines pass computational tests if scaled residual is', $ ' less than ', G12.5 ) * * End of PZBRDINFO * END scalapack-2.0.2/TESTING/EIG/pzevcdriver.f000644 000766 000024 00000053677 10602576752 020200 0ustar00juliestaff000000 000000 PROGRAM PZEVCDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * June, 2000 * * Purpose * ======= * * PZEVCDRIVER is the main test program for the COMPLEX*16 * SCALAPACK PZTREVC routine. This test driver performs a right and * left eigenvector calculation of a triangular matrix followed by * a residual checks of the calcuated eigenvectors. * * The program must be driven by a short data file and uses the same * input file as the PZNEPDRIVER. An annotated example of a data file * can be obtained by deleting the first 3 characters from the following * 18 lines: * 'SCALAPACK, Version 1.8, NEP (Nonsymmetric EigenProblem) input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'NEP.out' output file name (if any) * 6 device out * 8 number of problems sizes * 1 2 3 4 6 10 100 200 vales of N * 3 number of NB's * 6 20 40 values of NB * 4 number of process grids (ordered pairs of P & Q) * 1 2 1 4 values of P * 1 2 4 1 values of Q * 20.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * ZPLXSZ INTEGER, default = 16 bytes. * ZPLXSZ indicate the length in bytes on the given platform * for a double precision complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * Further Details * ============== * * Contributed by Mark Fahey, June, 2000 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ZPLXSZ, TOTMEM, MEMSIZ, NTESTS PARAMETER ( ZPLXSZ = 16, TOTMEM = 200000000, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20 ) COMPLEX*16 PADVAL, ZERO, ONE PARAMETER ( PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICOL, ICTXT, II, III, IMIDPAD, $ INFO, IPA, IPOSTPAD, IPREPAD, IPVL, IPVR, IPW, $ IPWR, IPZ, IROW, J, JJ, JJJ, K, KFAIL, KPASS, $ KSKIP, KTESTS, LDA, LDZ, LWORK, M, MYCOL, $ MYROW, N, NB, NGRIDS, NMAT, NNB, NOUT, NP, $ NPCOL, NPROCS, NPROW, NQ, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, QRESID, TMFLOPS * .. * .. Local Arrays .. LOGICAL SELECT( 1 ) INTEGER DESCA( DLEN_ ), DESCZ( DLEN_ ), IERR( 2 ), $ NBVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), RESULT( 2 ), RWORK( 5000 ), $ WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, INFOG2L, $ PZCHEKPAD, PZEVCINFO, PZFILLPAD, PZGET22, $ PZLASET, PZMATGEN, PZTREVC, SLBOOT, SLCOMBINE, $ SLTIMER, ZGSUM2D * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PZLANHS EXTERNAL ILCM, NUMROC, PZLANHS * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PZEVCINFO( OUTFILE, NOUT, NMAT, NVAL, NTESTS, NNB, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 40 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 40 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 40 * DO 30 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 30 END IF * DO 20 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.6 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 20 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) IPREPAD = IPREPAD + 1000 IMIDPAD = IMIDPAD + 1000 IPOSTPAD = IPOSTPAD + 1000 ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Initialize the array descriptor for the matrix Z * CALL DESCINIT( DESCZ, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 2 ) ) * LDA = DESCA( LLD_ ) LDZ = DESCZ( LLD_ ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 20 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPZ = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWR = IPZ + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPVL = IPWR + N + IPOSTPAD + IPREPAD IPVR = IPVL + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPVR + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD III = N / NB IF( III*NB.LT.N ) $ III = III + 1 III = 7*III / ILCM( NPROW, NPCOL ) * * LWORK = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, III ) LWORK = LWORK + MAX( 2*N, ( 8*ILCM( NPROW, NPCOL )+2 )** $ 2 ) * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZEVCFCHK and PZLANHS * WORKSIZ = LWORK + MAX( NP*DESCA( NB_ ), $ DESCA( MB_ )*NQ ) + IPOSTPAD * ELSE * WORKSIZ = LWORK + IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Schur reduction', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 20 END IF * * Generate matrix Z = In * CALL PZLASET( 'All', N, N, ZERO, ONE, MEM( IPZ ), 1, 1, $ DESCZ ) CALL PZLASET( 'All', N, N, ZERO, ZERO, MEM( IPVR ), 1, 1, $ DESCZ ) CALL PZLASET( 'All', N, N, ZERO, ZERO, MEM( IPVL ), 1, 1, $ DESCZ ) * * Generate matrix A upper Hessenberg * CALL PZMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) CALL PZLASET( 'Lower', MAX( 0, N-1 ), MAX( 0, N-1 ), $ ZERO, ZERO, MEM( IPA ), MIN( N, 2 ), 1, $ DESCA ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPVR-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPVL-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPZ-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANHS( 'I', N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANHS', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANHS', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, N, 1, MEM( IPWR-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, PADVAL ) * END IF * * Set eigenvalues from diagonal * DO 10 JJJ = 1, N CALL INFOG2L( JJJ, JJJ, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW.EQ.II .AND. MYCOL.EQ.JJ ) THEN MEM( IPWR-1+JJJ ) = MEM( IPA-1+( ICOL-1 )*LDA+ $ IROW ) ELSE MEM( IPWR-1+JJJ ) = ZERO END IF 10 CONTINUE CALL ZGSUM2D( ICTXT, 'All', ' ', N, 1, MEM( IPWR ), N, $ -1, -1 ) * SELECT( 1 ) = .TRUE. CALL SLBOOT CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform eigenvector calculation * CALL PZTREVC( 'B', 'A', SELECT, N, MEM( IPA ), DESCA, $ MEM( IPVL ), DESCZ, MEM( IPVR ), DESCZ, N, $ M, MEM( IPW ), RWORK, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * )'PZTREVC INFO=', INFO KFAIL = KFAIL + 1 GO TO 20 END IF * IF( CHECK ) THEN * * Check for memory overwrite in NEP factorization * CALL PZCHEKPAD( ICTXT, 'PZTREVC (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZTREVC (VR)', NP, NQ, $ MEM( IPVR-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZTREVC (VL)', NP, NQ, $ MEM( IPVL-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZTREVC (WR)', N, 1, $ MEM( IPWR-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZTREVC (WORK)', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || T * Z - Z * D || / ( N*|| T ||*EPS ) * FRESID = 0.0D+0 QRESID = 0.0D+0 CALL PZGET22( 'N', 'N', 'N', N, MEM( IPA ), DESCA, $ MEM( IPVR ), DESCZ, MEM( IPWR ), $ MEM( IPZ ), DESCZ, RWORK, RESULT ) FRESID = RESULT( 1 ) QRESID = RESULT( 2 ) * * Compute || T^H * L - L * D^H || / ( N*|| T ||*EPS ) * CALL PZGET22( 'C', 'N', 'C', N, MEM( IPA ), DESCA, $ MEM( IPVL ), DESCZ, MEM( IPWR ), $ MEM( IPZ ), DESCZ, RWORK, RESULT ) FRESID = MAX( FRESID, RESULT( 1 ) ) QRESID = MAX( QRESID, RESULT( 2 ) ) * CALL PZCHEKPAD( ICTXT, 'PZGET22 (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGET22 (VR)', NP, NQ, $ MEM( IPVR-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGET22 (VL)', NP, NQ, $ MEM( IPVL-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGET22 (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( ( FRESID-FRESID ).EQ.0.0D+0 ) .AND. $ ( QRESID.LE.THRESH ) .AND. $ ( ( QRESID-QRESID ).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 )FRESID WRITE( NOUT, FMT = 9985 )QRESID END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID QRESID = QRESID - QRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 2 N^2 flops for PxTREVC * NOPS = 2.0D+0*DBLE( N )**2 * * Calculate total megaflops -- eigenvector calc only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, PASSED END IF * 20 CONTINUE * 30 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 40 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q NEP Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- --- ---- ---- -------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, F8.2, $ 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||H*Z - Z*D|| / (||T|| * N * eps) = ', G25.7 ) 9985 FORMAT( 'max_j(max|Z(j)| - 1) / ( N * eps ) ', G25.7 ) * STOP * * End of PZEVCDRIVER * END scalapack-2.0.2/TESTING/EIG/pzevcinfo.f000644 000766 000024 00000030050 10363532303 017600 0ustar00juliestaff000000 000000 SUBROUTINE PZEVCINFO( SUMMRY, NOUT, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March, 2000 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, NGRIDS, $ NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZEVCINFO gets needed startup information for PZTREVC driver * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (the order of the matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH * * WORK (local workspace) INTEGER array of dimension >= * MAX( 3, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ), used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Further Details * =============== * * Implemented by: M. Fahey, June 2000 * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'EVC.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * )NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 )'N', LDNVAL GO TO 30 END IF READ( NIN, FMT = * )( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * )NNB IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 )'NB', LDNBVAL GO TO 30 END IF READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) * DO 10 I = 1, NNB IF( NBVAL( I ).LT.6 ) THEN WRITE( NOUT, FMT = 9992 )NBVAL( I ) GO TO 30 END IF 10 CONTINUE * * Get number of grids * READ( NIN, FMT = * )NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDPVAL GO TO 30 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDQVAL GO TO 30 END IF * * Get values of P and Q * READ( NIN, FMT = * )( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * )( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * )THRESH * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 20 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 20 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK nonsymmetric eigenvector calculation.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'Tests of the parallel ' // $ 'complex double precision eigenvector calculation.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Residual = max( ||T*R-R*D||/(||H||*eps*N)' // $ ' , ||T^H*L-L*D^H||/(||H||*eps*N) )' WRITE( NOUT, FMT = 9999 ) $ ' Normalization residual = max(max_j(max|R(j)|-1),' // $ ' max_j(max|L(j)|-1))/(eps*N)' WRITE( NOUT, FMT = 9999 )'The matrix A is randomly ' // $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'An explanation of the input/output ' $ // 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or ' // $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the ' // 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the' // $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than' // $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'NEP time : Time in seconds to decompose the ' // ' matrix' WRITE( NOUT, FMT = 9999 )'MFLOPS : Rate of execution ' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 )'N ', $ ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 )'NB ', $ ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 )'P ', $ ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 )'Q ', $ ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 )EPS WRITE( NOUT, FMT = 9998 )THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 30 CONTINUE WRITE( NOUT, FMT = 9993 ) CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9992 FORMAT( ' Blocking size too small at ', I2, ' must be >=6.' ) * * End of PZEVCINFO * END scalapack-2.0.2/TESTING/EIG/pzgebdrv.f000644 000766 000024 00000047256 10363532303 017440 0ustar00juliestaff000000 000000 SUBROUTINE PZGEBDRV( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INFO, IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION D( * ), E( * ) COMPLEX*16 A( * ), TAUP( * ), TAUQ( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEBDRV computes sub( A ) = A(IA:IA+M-1,JA:JA+N-1) from sub( A ), * Q, P returned by PZGEBRD: * * sub( A ) := Q * B * P'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of sub( A ) * as returned by PZGEBRD. On exit, the original distribu- * ted matrix sub( A ) is restored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local input) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local input) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local input) COMPLEX*16 array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUQ is * tied to the distributed matrix A. See Further Details. * * TAUP (local input) COMPLEX*16 array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix P. TAUP is * tied to the distributed matrix A. See Further Details. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK >= 2*NB*( MP + NQ + NB ) * * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MP = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * On exit, if INFO <> 0, a discrepancy has been found between * the diagonal and off-diagonal elements of A and the copies * contained in the arrays D and E. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION REIGHT, RZERO PARAMETER ( REIGHT = 8.0D+0, RZERO = 0.0D+0 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, IIA, IL, IPTP, IPTQ, $ IPV, IPW, IPWK, IOFF, IV, J, JB, JJA, JL, JV, $ K, MN, MP, MYCOL, MYROW, NB, NPCOL, NPROW, NQ DOUBLE PRECISION ADDBND, D2, E2 COMPLEX*16 D1, E1 * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCV( DLEN_ ), $ DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, IGSUM2D, INFOG2L, $ PDELGET, PZLACPY, PZLARFB, PZLARFT, $ PZLASET, PZELGET * .. * .. External Functions .. INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL INDXG2P, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 NB = DESCA( MB_ ) IOFF = MOD( IA-1, NB ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IOFF, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+IOFF, NB, MYCOL, IACOL, NPCOL ) IPV = 1 IPW = IPV + MP*NB IPTP = IPW + NQ*NB IPTQ = IPTP + NB*NB IPWK = IPTQ + NB*NB * IV = 1 JV = 1 MN = MIN( M, N ) IL = MAX( ( (IA+MN-2) / NB )*NB + 1, IA ) JL = MAX( ( (JA+MN-2) / NB )*NB + 1, JA ) IAROW = INDXG2P( IL, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JL, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCV, IA+M-IL, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCW, NB, JA+N-JL, NB, NB, IAROW, IACOL, ICTXT, $ NB ) * ADDBND = REIGHT * PDLAMCH( ICTXT, 'eps' ) * * When A is an upper bidiagonal form * IF( M.GE.N ) THEN * CALL DESCSET( DESCD, 1, JA+MN-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MN-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) * DO 10 J = 0, MN-1 D1 = ZERO E1 = ZERO D2 = RZERO E2 = RZERO CALL PDELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PZELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(MN-1) ) THEN CALL PDELGET( ' ', ' ', E2, E, IA+J, 1, DESCE ) CALL PZELGET( 'Rowwise', ' ', E1, A, IA+J, JA+J+1, $ DESCA ) END IF * IF( ( ABS( D1-DCMPLX( D2 ) ).GT.( ABS( D2 )*ADDBND ) ) .OR. $ ( ABS( E1-DCMPLX( E2 ) ).GT.( ABS( E2 )*ADDBND ) ) ) $ INFO = INFO + 1 10 CONTINUE * DO 20 J = JL, JA+NB-IOFF, -NB JB = MIN( JA+N-J, NB ) I = IA + J - JA K = I - IA + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PZLARFT( 'Forward', 'Columnwise', M-K+1, JB, A, I, J, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'Lower', M-K+1, JB, A, I, J, DESCA, $ WORK( IPV ), IV, JV, DESCV ) CALL PZLASET( 'Upper', M-K+1, JB, ZERO, ONE, WORK( IPV ), $ IV, JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PZLASET( 'Lower', M-K, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PZLARFT( 'Forward', 'Rowwise', N-K, JB, A, I, J+1, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'Upper', JB, N-K, A, I, J+1, DESCA, $ WORK( IPW ), IV, JV+1, DESCW ) CALL PZLASET( 'Lower', JB, N-K, ZERO, ONE, WORK( IPW ), IV, $ JV+1, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PZLASET( 'Upper', JB, N-K-1, ZERO, ZERO, A, I, J+2, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PZLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-K+1, N-K+1, JB, WORK( IPV ), $ IV, JV, DESCV, WORK( IPTQ ), A, I, J, DESCA, $ WORK( IPWK ) ) * * Apply block Householder transformation from Right. * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-K+1, N-K, JB, WORK( IPW ), IV, $ JV+1, DESCW, WORK( IPTP ), A, I, J+1, DESCA, $ WORK( IPWK ) ) * DESCV( M_ ) = DESCV( M_ ) + NB DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCW( N_ ) = DESCW( N_ ) + NB DESCW( RSRC_ ) = DESCV( RSRC_ ) DESCW( CSRC_ ) = DESCV( CSRC_ ) * 20 CONTINUE * * Handle first block separately * JB = MIN( N, NB - IOFF ) IV = IOFF + 1 JV = IOFF + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PZLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'Lower', M, JB, A, IA, JA, DESCA, WORK( IPV ), $ IV, JV, DESCV ) CALL PZLASET( 'Upper', M, JB, ZERO, ONE, WORK( IPV ), IV, JV, $ DESCV ) * * Zero out the strict lower triangular part of A. * CALL PZLASET( 'Lower', M-1, JB, ZERO, ZERO, A, IA+1, JA, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PZLARFT( 'Forward', 'Rowwise', N-1, JB, A, IA, JA+1, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'Upper', JB, N-1, A, IA, JA+1, DESCA, $ WORK( IPW ), IV, JV+1, DESCW ) CALL PZLASET( 'Lower', JB, N-1, ZERO, ONE, WORK( IPW ), IV, $ JV+1, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PZLASET( 'Upper', JB, N-2, ZERO, ZERO, A, IA, JA+2, $ DESCA ) * * Apply block Householder transformation from left. * CALL PZLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M, N, JB, WORK( IPV ), IV, JV, DESCV, $ WORK( IPTQ ), A, IA, JA, DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from right. * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M, N-1, JB, WORK( IPW ), IV, JV+1, $ DESCW, WORK( IPTP ), A, IA, JA+1, DESCA, $ WORK( IPWK ) ) * ELSE * CALL DESCSET( DESCD, IA+MN-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MN-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 30 J = 0, MN-1 D1 = ZERO E1 = ZERO D2 = RZERO E2 = RZERO CALL PDELGET( ' ', ' ', D2, D, IA+J, 1, DESCD ) CALL PZELGET( 'Rowwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(MN-1) ) THEN CALL PDELGET( ' ', ' ', E2, E, 1, JA+J, DESCE ) CALL PZELGET( 'Columnwise', ' ', E1, A, IA+J+1, JA+J, $ DESCA ) END IF * IF( ( ABS( D1-DCMPLX( D2 ) ).GT.( ABS( D2 )*ADDBND ) ) .OR. $ ( ABS( E1-DCMPLX( E2 ) ).GT.( ABS( E2 )*ADDBND ) ) ) $ INFO = INFO + 1 30 CONTINUE * DO 40 I = IL, IA+NB-IOFF, -NB JB = MIN( IA+M-I, NB ) J = JA + I - IA K = J - JA + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PZLARFT( 'Forward', 'Columnwise', M-K, JB, A, I+1, J, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'Lower', M-K, JB, A, I+1, J, DESCA, $ WORK( IPV ), IV+1, JV, DESCV ) CALL PZLASET( 'Upper', M-K, JB, ZERO, ONE, WORK( IPV ), $ IV+1, JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PZLASET( 'Lower', M-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PZLARFT( 'Forward', 'Rowwise', N-K+1, JB, A, I, J, $ DESCA, TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'Upper', JB, N-K+1, A, I, J, DESCA, $ WORK( IPW ), IV, JV, DESCW ) CALL PZLASET( 'Lower', JB, N-K+1, ZERO, ONE, WORK( IPW ), $ IV, JV, DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PZLASET( 'Upper', JB, N-K, ZERO, ZERO, A, I, J+1, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PZLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-K, N-K+1, JB, WORK( IPV ), $ IV+1, JV, DESCV, WORK( IPTQ ), A, I+1, J, $ DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from Right. * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-K+1, N-K+1, JB, WORK( IPW ), IV, $ JV, DESCW, WORK( IPTP ), A, I, J, DESCA, $ WORK( IPWK ) ) * DESCV( M_ ) = DESCV( M_ ) + NB DESCV( RSRC_ ) = MOD( DESCV( RSRC_ ) + NPROW - 1, NPROW ) DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCW( N_ ) = DESCW( N_ ) + NB DESCW( RSRC_ ) = DESCV( RSRC_ ) DESCW( CSRC_ ) = DESCV( CSRC_ ) * 40 CONTINUE * * Handle first block separately * JB = MIN( M, NB - IOFF ) IV = IOFF + 1 JV = IOFF + 1 * * Compute upper triangular matrix TQ from TAUQ. * CALL PZLARFT( 'Forward', 'Columnwise', M-1, JB, A, IA+1, JA, $ DESCA, TAUQ, WORK( IPTQ ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'Lower', M-1, JB, A, IA+1, JA, DESCA, $ WORK( IPV ), IV+1, JV, DESCV ) CALL PZLASET( 'Upper', M-1, JB, ZERO, ONE, WORK( IPV ), IV+1, $ JV, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PZLASET( 'Lower', M-2, JB, ZERO, ZERO, A, IA+2, JA, $ DESCA ) * * Compute upper triangular matrix TP from TAUP. * CALL PZLARFT( 'Forward', 'Rowwise', N, JB, A, IA, JA, DESCA, $ TAUP, WORK( IPTP ), WORK( IPWK ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'Upper', JB, N, A, IA, JA, DESCA, WORK( IPW ), $ IV, JV, DESCW ) CALL PZLASET( 'Lower', JB, N, ZERO, ONE, WORK( IPW ), IV, JV, $ DESCW ) * * Zero out the strict+1 upper triangular part of A. * CALL PZLASET( 'Upper', JB, N-1, ZERO, ZERO, A, IA, JA+1, $ DESCA ) * * Apply block Householder transformation from left * CALL PZLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M-1, N, JB, WORK( IPV ), IV+1, JV, DESCV, $ WORK( IPTQ ), A, IA+1, JA, DESCA, WORK( IPWK ) ) * * Apply block Householder transformation from right * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M, N, JB, WORK( IPW ), IV, JV, DESCW, $ WORK( IPTP ), A, IA, JA, DESCA, WORK( IPWK ) ) END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * RETURN * * End of PZGEBDRV * END scalapack-2.0.2/TESTING/EIG/pzgehdrv.f000644 000766 000024 00000017760 10602576752 017457 0ustar00juliestaff000000 000000 SUBROUTINE PZGEHDRV( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEHDRV computes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from the * unitary matrix Q, the Hessenberg matrix, and the array TAU returned * by PZGEHRD: * sub( A ) := Q * H * Q' * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows and columns 1:ILO-1 and IHI+1:N. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) reduced to Hessenberg * form by PZGEHRD. The upper triangle and the first sub- * diagonal of sub( A ) contain the upper Hessenberg matrix H, * and the elements below the first subdiagonal, with the array * TAU, represent the unitary matrix Q as a product of * elementary reflectors. On exit, the original distributed * N-by-N matrix sub( A ) is recovered. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16 array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors returned by * PZGEHRD. TAU is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK). * LWORK >= NB*NB + NB*IHLP + MAX[ NB*( IHLP+INLQ ), * NB*( IHLQ + MAX[ IHIP, * IHLP+NUMROC( NUMROC( IHI-ILO+LOFF+1, NB, 0, 0, * NPCOL ), NB, 0, 0, LCMQ ) ] ) ] * * where NB = MB_A = NB_A, * LCM is the least common multiple of NPROW and NPCOL, * LCM = ILCM( NPROW, NPCOL ), LCMQ = LCM / NPCOL, * * IROFFA = MOD( IA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * * ILROW = INDXG2P( IA+ILO-1, NB, MYROW, RSRC_A, NPROW ), * ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, CSRC_A, NPCOL ), * IHLP = NUMROC( IHI-ILO+IROFFA+1, NB, MYROW, ILROW, NPROW ), * IHLQ = NUMROC( IHI-ILO+IROFFA+1, NB, MYCOL, ILCOL, NPCOL ), * INLQ = NUMROC( N-ILO+IROFFA+1, NB, MYCOL, ILCOL, NPCOL ). * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, IHLP, II, IOFF, IPT, $ IPV, IPW, IV, J, JB, JJ, JL, K, MYCOL, MYROW, $ NB, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PZLARFB, $ PZLARFT, PZLACPY, PZLASET * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( IHI-ILO.LE.0 ) $ RETURN * NB = DESCA( MB_ ) IOFF = MOD( IA+ILO-2, NB ) CALL INFOG2L( IA+ILO-1, JA+ILO-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, IAROW, IACOL ) IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, IAROW, NPROW ) * IPT = 1 IPV = IPT + NB * NB IPW = IPV + IHLP * NB JL = MAX( ( ( JA+IHI-2 ) / NB ) * NB + 1, JA + ILO - 1 ) CALL DESCSET( DESCV, IHI-ILO+IOFF+1, NB, NB, NB, IAROW, $ INDXG2P( JL, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ), ICTXT, MAX( 1, IHLP ) ) * DO 10 J = JL, ILO+JA+NB-IOFF-1, -NB JB = MIN( JA+IHI-J-1, NB ) I = IA + J - JA K = I - IA + 1 IV = K - ILO + IOFF + 1 * * Compute upper triangular matrix T from TAU. * CALL PZLARFT( 'Forward', 'Columnwise', IHI-K, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'All', IHI-K, JB, A, I+1, J, DESCA, WORK( IPV ), $ IV+1, 1, DESCV ) * * Zero out the strict lower triangular part of A. * CALL PZLASET( 'Lower', IHI-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PZLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ IHI-K, N-K+1, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, I+1, J, DESCA, WORK( IPW ) ) * * Apply block Householder transformation from Right. * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Columnwise', IHI, IHI-K, JB, WORK( IPV ), IV+1, $ 1, DESCV, WORK( IPT ), A, IA, J+1, DESCA, $ WORK( IPW ) ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle the first block separately * IV = IOFF + 1 I = IA + ILO - 1 J = JA + ILO - 1 JB = MIN( NB-IOFF, JA+IHI-J-1 ) * * Compute upper triangular matrix T from TAU. * CALL PZLARFT( 'Forward', 'Columnwise', IHI-ILO, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPW ) ) * * Copy Householder vectors into workspace. * CALL PZLACPY( 'All', IHI-ILO, JB, A, I+1, J, DESCA, WORK( IPV ), $ IV+1, 1, DESCV ) * * Zero out the strict lower triangular part of A. * IF( IHI-ILO.GT.0 ) $ CALL PZLASET( 'Lower', IHI-ILO-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * Apply block Householder transformation from Left. * CALL PZLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ IHI-ILO, N-ILO+1, JB, WORK( IPV ), IV+1, 1, DESCV, $ WORK( IPT ), A, I+1, J, DESCA, WORK( IPW ) ) * * Apply block Householder transformation from Right. * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Columnwise', IHI, IHI-ILO, JB, WORK( IPV ), IV+1, $ 1, DESCV, WORK( IPT ), A, IA, J+1, DESCA, $ WORK( IPW ) ) * RETURN * * End of PZGEHDRV * END scalapack-2.0.2/TESTING/EIG/pzget22.f000644 000766 000024 00000022514 10363532303 017100 0ustar00juliestaff000000 000000 SUBROUTINE PZGET22( TRANSA, TRANSE, TRANSW, N, A, DESCA, E, DESCE, $ W, WORK, DESCW, RWORK, RESULT ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * .. Scalar Arguments .. CHARACTER TRANSA, TRANSE, TRANSW INTEGER N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCE( * ), DESCW( * ) DOUBLE PRECISION RESULT( 2 ), RWORK( * ) COMPLEX*16 A( * ), E( * ), W( * ), WORK( * ) * .. * * Purpose * ======= * * PZGET22 does an eigenvector check. * * The basic test is: * * RESULT(1) = | A E - E W | / ( |A| |E| ulp ) * * using the 1-norm. It also tests the normalization of E: * * RESULT(2) = max | m-norm(E(j)) - 1 | / ( n ulp ) * j * * where E(j) is the j-th eigenvector, and m-norm is the max-norm of a * vector. The max-norm of a complex n-vector x in this case is the * maximum of |re(x(i)| + |im(x(i)| over i = 1, ..., n. * * Arguments * ========== * * TRANSA (input) CHARACTER*1 * Specifies whether or not A is transposed. * = 'N': No transpose * = 'T': Transpose * = 'C': Conjugate transpose * * TRANSE (input) CHARACTER*1 * Specifies whether or not E is transposed. * = 'N': No transpose, eigenvectors are in columns of E * = 'T': Transpose, eigenvectors are in rows of E * = 'C': Conjugate transpose, eigenvectors are in rows of E * * TRANSW (input) CHARACTER*1 * Specifies whether or not W is transposed. * = 'N': No transpose * = 'T': Transpose, same as TRANSW = 'N' * = 'C': Conjugate transpose, use -WI(j) instead of WI(j) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input) COMPLEX*16 array, dimension (*) * The matrix whose eigenvectors are in E. * * DESCA (input) INTEGER array, dimension(*) * * E (input) COMPLEX*16 array, dimension (*) * The matrix of eigenvectors. If TRANSE = 'N', the eigenvectors * are stored in the columns of E, if TRANSE = 'T' or 'C', the * eigenvectors are stored in the rows of E. * * DESCE (input) INTEGER array, dimension(*) * * W (input) COMPLEX*16 array, dimension (N) * The eigenvalues of A. * * WORK (workspace) COMPLEX*16 array, dimension (*) * DESCW (input) INTEGER array, dimension(*) * * RWORK (workspace) DOUBLE PRECISION array, dimension (N) * * RESULT (output) DOUBLE PRECISION array, dimension (2) * RESULT(1) = | A E - E W | / ( |A| |E| ulp ) * RESULT(2) = max | m-norm(E(j)) - 1 | / ( n ulp ) * j * Further Details * =============== * * Contributed by Mark Fahey, June, 2000 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER NORMA, NORME INTEGER ICOL, II, IROW, ITRNSE, ITRNSW, J, JCOL, JJ, $ JROW, JVEC, LDA, LDE, LDW, MB, MYCOL, MYROW, $ NB, NPCOL, NPROW, CONTXT, CA, CSRC, RA, RSRC DOUBLE PRECISION ANORM, ENORM, ENRMAX, ENRMIN, ERRNRM, TEMP1, $ ULP, UNFL COMPLEX*16 CDUM, WTEMP * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL LSAME, PDLAMCH, PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMN2D, DGAMX2D, INFOG2L, $ PZAXPY, PZGEMM, PZLASET * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * * Initialize RESULT (in case N=0) * RESULT( 1 ) = ZERO RESULT( 2 ) = ZERO IF( N.LE.0 ) $ RETURN * CONTXT = DESCA( CTXT_ ) RSRC = DESCA( RSRC_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) MB = DESCA( MB_ ) LDA = DESCA( LLD_ ) LDE = DESCE( LLD_ ) LDW = DESCW( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * UNFL = PDLAMCH( CONTXT, 'Safe minimum' ) ULP = PDLAMCH( CONTXT, 'Precision' ) * ITRNSE = 0 ITRNSW = 0 NORMA = 'O' NORME = 'O' * IF( LSAME( TRANSA, 'T' ) .OR. LSAME( TRANSA, 'C' ) ) THEN NORMA = 'I' END IF * IF( LSAME( TRANSE, 'T' ) ) THEN ITRNSE = 1 NORME = 'I' ELSE IF( LSAME( TRANSE, 'C' ) ) THEN ITRNSE = 2 NORME = 'I' END IF * IF( LSAME( TRANSW, 'C' ) ) THEN ITRNSW = 1 END IF * * Normalization of E: * ENRMIN = ONE / ULP ENRMAX = ZERO IF( ITRNSE.EQ.0 ) THEN DO 20 JVEC = 1, N TEMP1 = ZERO DO 10 J = 1, N CALL INFOG2L( J, JVEC, DESCE, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN TEMP1 = MAX( TEMP1, CABS1( E( ( ICOL-1 )*LDE+ $ IROW ) ) ) END IF 10 CONTINUE IF( MYCOL.EQ.JJ ) THEN CALL DGAMX2D( CONTXT, 'Col', ' ', 1, 1, TEMP1, 1, RA, CA, $ -1, -1, -1 ) ENRMIN = MIN( ENRMIN, TEMP1 ) ENRMAX = MAX( ENRMAX, TEMP1 ) END IF 20 CONTINUE CALL DGAMX2D( CONTXT, 'Row', ' ', 1, 1, ENRMAX, 1, RA, CA, -1, $ -1, -1 ) CALL DGAMN2D( CONTXT, 'Row', ' ', 1, 1, ENRMIN, 1, RA, CA, -1, $ -1, -1 ) ELSE DO 40 J = 1, N TEMP1 = ZERO DO 30 JVEC = 1, N CALL INFOG2L( J, JVEC, DESCE, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN TEMP1 = MAX( TEMP1, CABS1( E( ( ICOL-1 )*LDE+ $ IROW ) ) ) END IF 30 CONTINUE IF( MYROW.EQ.II ) THEN CALL DGAMX2D( CONTXT, 'Row', ' ', 1, 1, TEMP1, 1, RA, CA, $ -1, -1, -1 ) ENRMIN = MIN( ENRMIN, TEMP1 ) ENRMAX = MAX( ENRMAX, TEMP1 ) END IF 40 CONTINUE CALL DGAMX2D( CONTXT, 'Row', ' ', 1, 1, ENRMAX, 1, RA, CA, -1, $ -1, -1 ) CALL DGAMN2D( CONTXT, 'Row', ' ', 1, 1, ENRMIN, 1, RA, CA, -1, $ -1, -1 ) END IF * * Norm of A: * ANORM = MAX( PZLANGE( NORMA, N, N, A, 1, 1, DESCA, RWORK ), UNFL ) * * Norm of E: * ENORM = MAX( PZLANGE( NORME, N, N, E, 1, 1, DESCE, RWORK ), ULP ) * * Norm of error: * * Error = AE - EW * CALL PZLASET( 'Full', N, N, CZERO, CZERO, WORK, 1, 1, DESCW ) * DO 60 JCOL = 1, N IF( ITRNSW.EQ.0 ) THEN WTEMP = W( JCOL ) ELSE WTEMP = DCONJG( W( JCOL ) ) END IF * IF( ITRNSE.EQ.0 ) THEN CALL PZAXPY( N, WTEMP, E, 1, JCOL, DESCE, 1, WORK, 1, JCOL, $ DESCW, 1 ) ELSE IF( ITRNSE.EQ.1 ) THEN CALL PZAXPY( N, WTEMP, E, JCOL, 1, DESCE, N, WORK, 1, JCOL, $ DESCW, 1 ) ELSE CALL PZAXPY( N, DCONJG( WTEMP ), E, JCOL, 1, DESCE, N, WORK, $ 1, JCOL, DESCW, 1 ) DO 50 JROW = 1, N CALL INFOG2L( JROW, JCOL, DESCW, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN WORK( ( JCOL-1 )*LDW+JROW ) $ = DCONJG( WORK( ( JCOL-1 )*LDW+JROW ) ) END IF 50 CONTINUE END IF 60 CONTINUE * CALL PZGEMM( TRANSA, TRANSE, N, N, N, CONE, A, 1, 1, DESCA, E, 1, $ 1, DESCE, -CONE, WORK, 1, 1, DESCW ) * ERRNRM = PZLANGE( 'One', N, N, WORK, 1, 1, DESCW, RWORK ) / ENORM * * Compute RESULT(1) (avoiding under/overflow) * IF( ANORM.GT.ERRNRM ) THEN RESULT( 1 ) = ( ERRNRM / ANORM ) / ULP ELSE IF( ANORM.LT.ONE ) THEN RESULT( 1 ) = ( MIN( ERRNRM, ANORM ) / ANORM ) / ULP ELSE RESULT( 1 ) = MIN( ERRNRM / ANORM, ONE ) / ULP END IF END IF * * Compute RESULT(2) : the normalization error in E. * RESULT( 2 ) = MAX( ABS( ENRMAX-ONE ), ABS( ENRMIN-ONE ) ) / $ ( DBLE( N )*ULP ) * RETURN * * End of PZGET22 * END scalapack-2.0.2/TESTING/EIG/pzgsepchk.f000644 000766 000024 00000031030 10363532303 017572 0ustar00juliestaff000000 000000 * * SUBROUTINE PZGSEPCHK( IBTYPE, MS, NV, A, IA, JA, DESCA, B, IB, JB, $ DESCB, THRESH, Q, IQ, JQ, DESCQ, C, IC, JC, $ DESCC, W, WORK, LWORK, TSTNRM, RESULT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 15, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, IBTYPE, IC, IQ, JA, JB, JC, JQ, LWORK, $ MS, NV, RESULT DOUBLE PRECISION THRESH, TSTNRM * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCB( * ), DESCC( * ), DESCQ( * ) DOUBLE PRECISION W( * ), WORK( * ) COMPLEX*16 A( * ), B( * ), C( * ), Q( * ) * .. * * * Purpose * ======= * * PZGSEPCHK checks a decomposition of the form * * A Q = B Q D or * A B Q = Q D or * B A Q = Q D * * where A is a symmetric matrix, B is * symmetric positive definite, Q is orthogonal, and D is diagonal. * * One of the following test ratios is computed: * * IBTYPE = 1: TSTNRM = | A Q - B Q D | / ( |A| |Q| n ulp ) * * IBTYPE = 2: TSTNRM = | A B Q - Q D | / ( |A| |Q| n ulp ) * * IBTYPE = 3: TSTNRM = | B A Q - Q D | / ( |A| |Q| n ulp ) * * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * MP = number of local rows in A, B and Q * MQ = number of local columns in A * NQ = number of local columns in B and Q * * IBTYPE (input) INTEGER * The form of the symmetric generalized eigenproblem. * = 1: A*Q = (lambda)*B*Q * = 2: A*B*Q = (lambda)*Q * = 3: B*A*Q = (lambda)*Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in A, B, C and Q * Also, the number of columns in A * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q. * * A (local input) REAL pointer to an * array in local memory of dimension (LLD_A, LOCc(JA+N-1)). * This array contains the local pieces of the M-by-N * distributed test matrix A * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix A. * * B (local input) REAL pointer to an * array in local memory of dimension (LLD_B, LOCc(JB+N-1)). * This array contains the local pieces of the M-by-N * distributed test matrix B * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix B. * * THRESH (input) REAL * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) REAL array * global dimension (MS, NV), * local dimension (DESCA( DLEN_ ), NQ) * * Contains the eigenvectors as computed by PSSYEVX * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix Q. * * C (local workspace) REAL array, * global dimension (MS, NV), * local dimension (DESCA( DLEN_ ), MQ) * * Accumulator for computing AQ -QL * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix C. * * W (global input) REAL array, dimension (NV) * * Contains the computed eigenvalues * * WORK (local workspace) REAL array, * dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * TSTNRM (global output) REAL * * RESULT (global output) INTEGER * 0 if the test passes * 1 if the test fails * * .. Local Scalars .. * INTEGER I, INFO, MYCOL, MYROW, NPCOL, NPROW, NQ DOUBLE PRECISION ANORM, ULP * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) COMPLEX*16 CONE, CNEGONE, CZERO PARAMETER ( CONE = 1.0D+0, CNEGONE = -1.0D+0, $ CZERO = 0.0D+0 ) * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION DLAMCH, PZLANGE EXTERNAL NUMROC, DLAMCH, PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PXERBLA, PZDSCAL, $ PZGEMM * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * RESULT = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 CALL CHK1MAT( MS, 1, MS, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( MS, 1, MS, 2, IB, JB, DESCB, 11, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IQ, JQ, DESCQ, 16, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IB, JB, DESCB, 20, INFO ) * IF( INFO.EQ.0 ) THEN * NQ = NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IF( IQ.NE.1 ) THEN INFO = -14 ELSE IF( JQ.NE.1 ) THEN INFO = -15 ELSE IF( IA.NE.1 ) THEN INFO = -5 ELSE IF( JA.NE.1 ) THEN INFO = -6 ELSE IF( IB.NE.1 ) THEN INFO = -9 ELSE IF( JB.NE.1 ) THEN INFO = -10 ELSE IF( LWORK.LT.NQ ) THEN INFO = -23 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PZGSEPCHK', -INFO ) RETURN END IF * RESULT = 0 ULP = DLAMCH( 'Epsilon' ) * * Compute product of Max-norms of A and Q. * ANORM = PZLANGE( 'M', MS, MS, A, IA, JA, DESCA, WORK )* $ PZLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) IF( ANORM.EQ.ZERO ) $ ANORM = ONE * IF( IBTYPE.EQ.1 ) THEN * * Norm of AQ - BQD * * C = AQ * CALL PZGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 10 I = 1, NV CALL PZDSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 10 CONTINUE * * C = C - BQ (i.e. AQ-BQD) * CALL PZGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, Q, $ IQ, JQ, DESCQ, CNEGONE, C, IC, JC, DESCC ) * TSTNRM = ( PZLANGE( 'M', MS, NV, C, IC, JC, DESCC, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * * ELSE IF( IBTYPE.EQ.2 ) THEN * * Norm of ABQ - QD * * * C = BQ * CALL PZGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 20 I = 1, NV CALL PZDSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 20 CONTINUE * * Q = AC - Q * CALL PZGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, C, $ IC, JC, DESCC, CNEGONE, Q, IQ, JQ, DESCQ ) * TSTNRM = ( PZLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * ELSE IF( IBTYPE.EQ.3 ) THEN * * Norm of BAQ - QD * * * C = AQ * CALL PZGEMM( 'N', 'N', MS, NV, MS, CONE, A, IA, JA, DESCA, Q, $ IQ, JQ, DESCQ, CZERO, C, IC, JC, DESCC ) * * Q = QD * DO 30 I = 1, NV CALL PZDSCAL( MS, W( I ), Q, IQ, JQ+I-1, DESCQ, 1 ) 30 CONTINUE * * Q = BC - Q * CALL PZGEMM( 'N', 'N', MS, NV, MS, CONE, B, IB, JB, DESCB, C, $ IC, JC, DESCC, CNEGONE, Q, IQ, JQ, DESCQ ) * TSTNRM = ( PZLANGE( 'M', MS, NV, Q, IQ, JQ, DESCQ, WORK ) / $ ANORM ) / ( MAX( MS, 1 )*ULP ) * END IF * IF( TSTNRM.GT.THRESH .OR. ( TSTNRM-TSTNRM.NE.0.0D0 ) ) THEN RESULT = 1 END IF RETURN * * End of PZGSEPCHK * END scalapack-2.0.2/TESTING/EIG/pzgsepdriver.f000644 000766 000024 00000023364 10363532303 020333 0ustar00juliestaff000000 000000 * * PROGRAM PZGSEPDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Parallel COMPLEX*16 Hermitian eigenproblem test driver * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. INTSIZ and DBLSIZ * indicate the length in bytes on the given platform for an integer * and a double precision real. * For example, on our system with 8 MB of memory, TOTMEM=6500000 * (leaves 1.5 MB for OS, code, BLACS buffer, etc), the length of a * DOUBLE is 8, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * The full tester requires approximately (5 n + 5 n^2/p + slop) * COMPLEX*16 words and 6*n integer words. * So, TOTMEM should be set to at least 1.1 * 8 * (5n + 5n^2/p) * * WHAT WE TEST * ============ * * This routine tests PZHEGVX, the expert driver for the parallel * Hermitian eigenvalue problem. We would like to cover all * possible combinations of: matrix size, process configuration * (nprow and npcol), block size (nb), matrix type (??), range * of eigenvalue (all, by value, by position), sorting options, * and upper vs. lower storage. * * We intend to provide two types of test input files, an * installation test and a thorough test. * * We also intend that the reports be meaningful. Our input file * will allow multiple requests where each request is a cross product * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, ZPLXSZ, NIN PARAMETER ( TOTMEM = 2000000, ZPLXSZ = 16, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / ZPLXSZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PDLACHKIEEE, PDLASNBT, PZGSEPREQ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PDLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) END IF * CALL PDLACHKIEEE( ISIEEE, DLAMCH( 'O' ), DLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) END IF GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Hermitian Eigendecomposition routines.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'generalized ' // 'Hermitian eigenvalue routine: PZHEGVX.' WRITE( NOUT, FMT = 9999 )'A scaled residual check, ' // $ 'will be computed' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ ' : the QTQ norm is allowed to exceed THRESH' // $ ' for those eigenvectors' WRITE( NOUT, FMT = 9999 )' : which could not be ' // $ 'reorthogonalized for lack of workspace.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see pZGSEPtst.f).' WRITE( NOUT, FMT = 9999 ) $ 'IBTYPE : Generalized eigenproblem type' // $ ' (see pZHEGVx.f)' WRITE( NOUT, FMT = 9999 )'SUB : Subtests ' // $ '(see pZGSEPtst).f' WRITE( NOUT, FMT = 9999 )'CHK : The scaled residual' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PZGSEPREQ( NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, NSKIPPED, $ NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * 9999 FORMAT( A ) 9998 FORMAT( ' I am about to check to make sure that overflow' ) 9997 FORMAT( ' is handled in the ieee default manner. If this' ) 9996 FORMAT( ' is the last output you see, you should assume' ) 9995 FORMAT( ' that overflow caused a floating point exception.' ) 9994 FORMAT( ' In that case, we recommend that you add -DNO_IEEE' ) 9993 FORMAT( ' to the CDEFS line in SLmake.inc.' ) 9992 FORMAT( ' Alternatively, you could set CDEFS in SLmake.inc ' ) 9991 FORMAT( ' to enable the default ieee behaviour, However, this' ) 9990 FORMAT( ' may result in good or very bad performance.' ) 9989 FORMAT( ' Either signed zeroes or signed infinities ' ) 9988 FORMAT( ' work incorrectly or your system. Change your' ) 9987 FORMAT( ' SLmake.inc as suggested above.' ) * 9986 FORMAT( ' Your system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP IBTYPE SUB WALL CPU ', $ ' CHK CHECK' ) 9978 FORMAT( ' ----- --- --- --- --- ------ --- -------- --------', $ ' --------- -----' ) * * End of PZGSEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pzgsepreq.f000644 000766 000024 00000025544 10602576752 017645 0ustar00juliestaff000000 000000 * * SUBROUTINE PZGSEPREQ( NIN, MEM, MEMSIZE, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX*16 MEM( MEMSIZE ) * .. * * Purpose * ======= * * PZGSEPREQ performs one request from the input file 'SEP.dat' * A request is the cross product of the specifications in the * input file. PZGSEPREQ prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEP.dat' * * MEM (local input) COMPLEX*16 ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION FIVE PARAMETER ( FIVE = 5.0D+0 ) INTEGER ZPLXSZ, INTGSZ PARAMETER ( ZPLXSZ = 16, INTGSZ = 4 ) INTEGER DBLESZ PARAMETER ( DBLESZ = 8 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IBTYPE, IMIDPAD, INITCON, $ IPOSTPAD, IPREPAD, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, LDA, LLRWORK, MATSIZE, MATTYPE, $ MYCOL, MYROW, N, NB, NIBTYPES, NMATSIZES, $ NMATTYPES, NNODES, NP, NPCOL, NPCONFIGS, NPROW, $ NQ, NUPLOS, ORDER, PCONFIG, PTRA, PTRB, $ PTRCOPYA, PTRCOPYB, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRRWORK, PTRW, PTRW2, PTRWORK, PTRZ, $ RES, RSIZECHK, RSIZEHEEVX, RSIZEQTQ, $ RSIZESUBTST, RSIZETST, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST, UPLO DOUBLE PRECISION ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL LSAME, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PDSEPINFO, PZGSEPTST, PZLASIZEGSEP * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PDSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * * Thresholds must be bigger for the generalized problem. * THRESH = THRESH*FIVE * DO 50 MATSIZE = 1, NMATSIZES * DO 40 PCONFIG = 1, NPCONFIGS * DO 30 MATTYPE = 1, NMATTYPES * DO 20 UPLO = 1, NUPLOS IF( LSAME( SUBTESTS, 'Y' ) ) THEN NIBTYPES = 3 ELSE NIBTYPES = 1 END IF DO 10 IBTYPE = 1, NIBTYPES * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, $ NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, $ MYROW, MYCOL ) IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PZLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, $ RSIZEQTQ, RSIZECHK, $ SIZEHEEVX, RSIZEHEEVX, $ ISIZEHEEVX, SIZESUBTST, $ RSIZESUBTST, ISIZESUBTST, $ SIZETST, RSIZETST, $ ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYB = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRB = PTRCOPYB + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRB + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, ZPLXSZ / DBLESZ ) PTRWORK = PTRW2 + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, ZPLXSZ / DBLESZ ) PTRGAP = PTRWORK + SIZETST + IPREPAD + $ IPOSTPAD PTRIFAIL = PTRGAP + ICEIL( NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, $ ZPLXSZ / DBLESZ ) PTRICLUS = PTRIFAIL + $ ICEIL( N+IPREPAD+IPOSTPAD, $ ZPLXSZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, ZPLXSZ / INTGSZ ) PTRRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, ZPLXSZ / INTGSZ ) LLRWORK = ( MEMSIZE - PTRRWORK - IPOSTPAD - $ IPREPAD + 1 )* ( ZPLXSZ / DBLESZ ) NTESTS = NTESTS + 1 IF( LLRWORK.LT.RSIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PZGSEPTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), $ IBTYPE, SUBTESTS, THRESH, $ N, ABSTOL, ISEED, $ MEM( PTRA ), $ MEM( PTRCOPYA ), $ MEM( PTRB ), $ MEM( PTRCOPYB ), $ MEM( PTRZ ), LDA, $ MEM( PTRW ), MEM( PTRW2 ), $ MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ SIZETST, MEM( PTRRWORK ), $ LLRWORK, MEM( PTRIWRK ), $ ISIZETST, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * ) $ ' pZGSEPREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE 50 CONTINUE END IF * * RETURN * * End of PZDGSEPREQ * END scalapack-2.0.2/TESTING/EIG/pzgsepsubtst.f000644 000766 000024 00000072720 10363532303 020364 0ustar00juliestaff000000 000000 * * SUBROUTINE PZGSEPSUBTST( WKNOWN, IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, IA, JA, DESCA, WIN, WNEW, $ IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, RWORK, LRWORK, LWORK1, $ IWORK, LIWORK, RESULT, TSTNRM, QTQNRM, $ NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IBTYPE, IL, IPOSTPAD, IPREPAD, IU, JA, $ LIWORK, LRWORK, LWORK, LWORK1, N, NOUT, RESULT DOUBLE PRECISION ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) DOUBLE PRECISION GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) COMPLEX*16 A( * ), B( * ), COPYA( * ), COPYB( * ), $ WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZGSEPSUBTST calls PZHEGVX and then tests the output of * PZHEGVX * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < THRESH * |QT * Q - I| / eps * norm(A) < THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PZHEGVX when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PZGSEPSUBTST * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call to PZGSEPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PZSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) COMPLEX*16 array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PZHEGVX for a description of block cyclic layout. * The test matrix, which is then modified by PZHEGVX * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) COMPLEX*16 array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * B (local workspace) COMPLEX*16 array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. * The B test matrix, which is then modified by PZHEGVX * * COPYB (local input) COMPLEX*16 array, dim (N, N) * COPYB is used to hold an identical copy of the array B * identical in both form and content to B * * Z (local workspace) COMPLEX*16 array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PZGSEPCHK and PZSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) DOUBLE PRECISION array, dimension (N) * The eigenvalues as copmuted by this call to PZHEGVX * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * RWORK has already been padded front and back, * use RWORK(1+IPREPAD) * * LRWORK (local input) INTEGER * The actual length of the array RWORK after padding. * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PZHEGVX * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PZHEGVX * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / |A|*N*EPS * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285D+0, FIVE = 5.0D+0, $ NEGONE = -1.0D+0 ) COMPLEX*16 ZPADVAL PARAMETER ( ZPADVAL = ( 13.989D+0, 1.93D+0 ) ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ RSIZECHK, RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, $ RSIZETST, SIZEHEEVX, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZESUBTST, SIZETMS, SIZETST, VALSIZE, $ VECSIZE DOUBLE PRECISION EPS, ERROR, MAXERROR, MAXVU, MINERROR, MINVL, $ NORMWIN, OLDVL, OLDVU, ORFAC, SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), DSEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, DGAMN2D, DGAMX2D, $ IGAMN2D, IGAMX2D, PDCHEKPAD, PDFILLPAD, $ PICHEKPAD, PIFILLPAD, PZCHEKPAD, PZELSET, $ PZFILLPAD, PZGSEPCHK, PZHEGVX, PZLASIZEGSEP, $ PZLASIZEHEEVX, SLBOOT, SLTIMER, ZLACPY * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PZLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PDLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 RWORK( I+IPREPAD ) = 14.3D+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE DO 30 I = 1, LWORK, 1 WORK( I+IPREPAD ) = ( 15.63D+0, 1.1D+0 ) 30 CONTINUE * DO 40 I = 1, N WNEW( I+IPREPAD ) = 3.14159D+0 40 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL MINIL = 1 MAXIU = 0 DO 50 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 50 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 160 RESULT = 0 * * * DSEED is not used in this call to PZLASIZEHEEVX, the * following line just makes ftnchek happy. * DSEED( 1 ) = 1 * CALL PZLASIZEHEEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL ZLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL ZLACPY( 'A', NP, NQ, COPYB, DESCA( LLD_ ), B( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PZFILLPAD( DESCA( CTXT_ ), NP, NQ, B, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, ZPADVAL+1.0D+2 ) * CALL PZFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, ZPADVAL ) * CALL PZFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, ZPADVAL+1.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), LWORK1, 1, RWORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0D+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PZFILLPAD( DESCA( CTXT_ ), LWORK, 1, WORK, LWORK, IPREPAD, $ IPOSTPAD, ZPADVAL+4.1D+0 ) * * Make sure that PZHEGVX does not cheat (i.e. use answers * already computed.) * DO 70 I = 1, N, 1 DO 60 J = 1, MAXEIGS, 1 CALL PZELSET( Z( 1+IPREPAD ), I, J, DESCA, $ ( 13.0D+0, 1.34D+0 ) ) 60 CONTINUE 70 CONTINUE * ORFAC = -1.0D+0 * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PZHEGVX( IBTYPE, JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, $ JA, DESCA, B( 1+IPREPAD ), IA, JA, DESCA, VL, VU, $ IL, IU, ABSTOL, M, NZ, WNEW( 1+IPREPAD ), ORFAC, $ Z( 1+IPREPAD ), IA, JA, DESCA, WORK( 1+IPREPAD ), $ SIZEHEEVX, RWORK( 1+IPREPAD ), LWORK1, $ IWORK( 1+IPREPAD ), LIWORK, IFAIL( 1+IPREPAD ), $ ICLUSTR( 1+IPREPAD ), GAP( 1+IPREPAD ), INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEGVX-B', NP, NQ, B, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ ZPADVAL+1.0D+2 ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEGVX-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, ZPADVAL ) * CALL PZCHEKPAD( DESCZ( CTXT_ ), 'PZHEGVX-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ ZPADVAL+1.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEGVX-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEGVX-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEGVX-rWORK', LWORK1, 1, $ RWORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEGVX-WORK', LWORK, 1, WORK, $ LWORK, IPREPAD, IPOSTPAD, ZPADVAL+4.1D+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEGVX-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEGVX-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEGVX-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * * Since we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9992 ) RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Make sure that different processes return the same eigenvalues * DO 80 I = 1, M RWORK( I ) = WNEW( I+IPREPAD ) RWORK( I+M ) = WNEW( I+IPREPAD ) 80 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, RWORK, M, $ 1, 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ RWORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 90 I = 1, M * IF( RESULT.EQ.0 .AND. ( ABS( RWORK( I )-RWORK( M+ $ I ) ).GT.FIVE*EPS*ABS( RWORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 90 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 100 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 110 NCLUSTERS = NCLUSTERS + 1 100 CONTINUE 110 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 120 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 120 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * * DO 130 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 130 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 160 * * Note that a couple key variables get redefined in PZGSEPCHK * as described by this table: * * PZGSEPTST name PZGSEPCHK name * ------------- ------------- * COPYA A * Z Q * B B * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the residual check * CALL PDFILLPAD( DESCA( CTXT_ ), RSIZECHK, 1, RWORK, $ RSIZECHK, IPREPAD, IPOSTPAD, 4.3D+0 ) * CALL PZGSEPCHK( IBTYPE, N, NZ, COPYA, IA, JA, DESCA, COPYB, $ IA, JA, DESCA, THRESH, Z( 1+IPREPAD ), IA, $ JA, DESCZ, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), RWORK( 1+IPREPAD ), $ RSIZECHK, TSTNRM, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZGSEPCHK-rWORK', RSIZECHK, $ 1, RWORK, RSIZECHK, IPREPAD, IPOSTPAD, $ 4.3D+0 ) * IF( RES.NE.0 ) $ RESULT = 1 END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN ) THEN * * Set up MYIL if necessary * MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 150 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 140 I = 1, M ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 140 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 150 CONTINUE * * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what PxHEGVX computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 160 CONTINUE * * RETURN * 9999 FORMAT( 'PZHEGVX returned INFO=', I7 ) 9998 FORMAT( 'PZSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PZGSEPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PZHEGVX returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PZHEGVX' ) 9981 FORMAT( 'NZ altered by PZHEGVX with JOBZ=N' ) * * End of PZGSEPSUBTST * END scalapack-2.0.2/TESTING/EIG/pzgseptst.f000644 000766 000024 00000122765 11622500733 017660 0ustar00juliestaff000000 000000 * * SUBROUTINE PZGSEPTST( DESCA, UPLO, N, MATTYPE, IBTYPE, SUBTESTS, $ THRESH, ORDER, ABSTOL, ISEED, A, COPYA, B, $ COPYB, Z, LDA, WIN, WNEW, IFAIL, ICLUSTR, $ GAP, IPREPAD, IPOSTPAD, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, NOUT, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER SUBTESTS, UPLO INTEGER IBTYPE, INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, $ LRWORK, LWORK, MATTYPE, N, NOUT, ORDER DOUBLE PRECISION ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) DOUBLE PRECISION GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) COMPLEX*16 A( LDA, * ), B( LDA, * ), COPYA( LDA, * ), $ COPYB( LDA, * ), WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PZGSEPTST builds a random matrix A, and a well conditioned * matrix B, runs PZHEGVX() to compute the eigenvalues * and eigenvectors and then calls PZHEGVCHK to compute * the residual. * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) Hermitian matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PZSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) COMPLEX*16 array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * The test matrix, which is then modified by PZHEGVX * * COPYA (local workspace) COMPLEX*16 array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * B (local workspace) COMPLEX*16 array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. * The B test matrix, which is then modified by PZHEGVX * * COPYB (local workspace) COMPLEX*16 array, dim (N, N) * COPYB is used to hold an identical copy of the array B * identical in both form and content to B * * Z (local workspace) COMPLEX*16 array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PZGSEPCHK * * W (local workspace) DOUBLE PRECISION array, dimension (N) * On normal exit from PZHEGVX, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PZLASIZEGSEP * * RWORK (local workspace) COMPLEX*16 array, dimension (LWORK) * * LRWORK (local input) INTEGER * The length of the array WORK. LRWORK >= RSIZETST as * returned by PZLASIZEGSEP * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PZLASIZEGSEP * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE, TEN, HALF PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TEN = 10.0D+0, $ HALF = 0.5D+0 ) COMPLEX*16 PADVAL PARAMETER ( PADVAL = ( 19.25D+0, 1.1D+1 ) ) COMPLEX*16 ZZERO PARAMETER ( ZZERO = ( 0.0D+0, 0.0D+0 ) ) COMPLEX*16 ZONE PARAMETER ( ZONE = ( 1.0D+0, 0.0D+0 ) ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IINFO, IL, IMODE, IN, INDD, $ INDRWORK, INDWORK, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, ITYPE, IU, J, LHEEVXSIZE, LLRWORK, $ LLWORK, MAXSIZE, MYCOL, MYROW, NB, NGEN, NLOC, $ NNODES, NP, NPCOL, NPROW, NQ, RES, RSIZECHK, $ RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, RSIZETST, $ SIZEHEEVX, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZESUBTST, SIZETMS, SIZETST, VALSIZE, VECSIZE DOUBLE PRECISION ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL LSAME, NUMROC, DLARAN, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, DLABAD, DLASRT, $ IGAMX2D, IGEBR2D, IGEBS2D, PZCHEKPAD, PZELSET, $ PZFILLPAD, PZGSEPSUBTST, PZLASET, PZLASIZEGSEP, $ PZLASIZEHEEVX, PZLATMS, PZMATGEN, SLCOMBINE, $ ZLATMS * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX, MIN, MOD, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 PASSED = 'PASSED ' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Make sure that we have enough memory * * CALL PZLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) * IF( LRWORK.LT.RSIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDRWORK = INDD + N INDWORK = 1 LLWORK = LWORK - INDWORK + 1 LLRWORK = LRWORK - INDRWORK + 1 * ULP = PDLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PDLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL DLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / DBLE( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log Hermitian, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random Hermitian * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * * Zero * * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N RWORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PZLASET( 'All', N, N, ZZERO, ZZERO, COPYA, 1, 1, $ DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N RWORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PZLASET( 'All', N, N, ZZERO, ZONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0D+0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0D+0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * Hermitian, eigenvalues specified * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * Hermitian, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PZMATGEN( DESCA( CTXT_ ), 'H', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0D+0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0D+0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PZLASET( 'All', N, N, ZZERO, ZZERO, COPYA, 1, 1, $ DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( DLARAN( ISEED )*DBLE( NLOC ) ), N-NGEN ) * CALL ZLATMS( IN, IN, 'S', ISEED, 'P', RWORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PZELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PZELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PZELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PZELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2, ... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 RWORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0D+0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * * * WKNOWN ... NOT SET, GUESS A DEFAULT * WKNOWN = .TRUE. ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL DLASRT( 'I', N, RWORK( INDD ), IINFO ) * * Create the B matrix * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.3D+0 ) * ANORM = ONE * * Update ISEED so that {ZLAGSY creates a different Q * ISEED( 4 ) = MOD( ISEED( 4 )+257, 4096 ) ISEED( 3 ) = MOD( ISEED( 3 )+192, 4096 ) ISEED( 2 ) = MOD( ISEED( 2 )+35, 4096 ) ISEED( 1 ) = MOD( ISEED( 1 )+128, 4096 ) CALL PZLATMS( N, N, 'S', ISEED, 'P', RWORK( INDD ), 3, TEN, $ ANORM, N, N, 'N', COPYB, 1, 1, DESCA, ORDER, $ WORK( INDWORK+IPREPAD ), SIZETMS, IINFO ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS5-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.3D+0 ) * * * These values aren't actually used, but they make ftncheck happy. * IL = -1 IU = -2 VL = ONE VU = -ONE * CALL PZLASIZEHEEVX( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, RWORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) * LHEEVXSIZE = MIN( MAXSIZE, LRWORK ) WKNOWN = .FALSE. * CALL PZGSEPSUBTST( WKNOWN, IBTYPE, 'v', 'a', UPLO, N, VL, VU, $ IL, IU, THRESH, ABSTOL, A, COPYA, B, COPYB, $ Z, 1, 1, DESCA, RWORK( INDD ), WIN, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK( INDRWORK ), $ LLRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * * * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'V', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'V' RANGE = 'A' CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 1' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'V', RANGE = 'A', random memory * IF( INFO.EQ.0 ) THEN JOBZ = 'V' RANGE = 'A' CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 2' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * We use PZLASIZEHEEVX to choose IL and IU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'I', maximum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PZLASIZEHEEVX to choose IL and IU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = MAXSIZE * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF * * Subtest 6: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PZLASIZEHEEVX to choose IL and IU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 6' INFO = 1 END IF END IF * * Subtest 7: JOBZ = 'V', RANGE = 'I', random memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PZLASIZEHEEVX to choose IL and IU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) LHEEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 7' INFO = 1 END IF END IF * * Subtest 8: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 8' INFO = 1 END IF END IF * * Subtest 9: JOBZ = 'V', RANGE = 'V', maximum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = MAXSIZE * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 9' INFO = 1 END IF END IF * * Subtest 10: JOBZ = 'V', RANGE = 'V', * minimum memory required for eigenvectors * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest10' INFO = 1 END IF END IF * * Subtest 11: JOBZ = 'V', RANGE = 'V', * random memory (enough for all eigenvectors * but not enough to guarantee orthogonality * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest11' INFO = 1 END IF END IF * * Subtest 12: JOBZ = 'V', RANGE = 'V', * miniimum memory required for eigenvalues only * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest12' INFO = 1 END IF END IF * * Subtest 13: JOBZ = 'V', RANGE = 'V', * random memory (more than minimum required * for eigenvalues, less than required for vectors) * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * * CALL PZGSEPSUBTST( .TRUE., IBTYPE, JOBZ, RANGE, UPLO, N, VL, $ VU, IL, IU, THRESH, ABSTOL, A, COPYA, B, $ COPYB, Z, 1, 1, DESCA, WIN( 1+IPREPAD ), $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, $ IPOSTPAD, WORK( INDWORK ), LLWORK, RWORK, $ LRWORK, LHEEVXSIZE, IWORK, ISIZEHEEVX, $ RES, TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest13' INFO = 1 END IF END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9984 )IBTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, CTIME( 1 ), MAXTSTNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ IBTYPE, SUBTESTS END IF END IF * 120 CONTINUE * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, F8.2, 1X, F8.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, 8X, 1X, F8.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, F8.2, 1X, F8.2, 11X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 1X, 8X, 1X, F8.2, 11X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, I3, 4X, A1, $ 22X, 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) 9984 FORMAT( ' IBTYPE=', I8 ) 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) 9980 FORMAT( ' Increase TOTMEM in PZGSEPDRIVER' ) * * End of PZGSEPTST * END scalapack-2.0.2/TESTING/EIG/pzhetdrv.f000644 000766 000024 00000041526 10363532303 017455 0ustar00juliestaff000000 000000 SUBROUTINE PZHETDRV( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION D( * ), E( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZHETDRV computes sub( A ) = A(IA:IA+N-1,JA:JA+N-1) from Q, the * Hermitian tridiagonal matrix T (or D and E), and TAU, which were * computed by PZHETRD: sub( A ) := Q * T * Q'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of sub( A ). On entry, * if UPLO='U', the diagonal and first superdiagonal of sub( A ) * have the corresponding elements of the tridiagonal matrix T, * and the elements above the first superdiagonal, with the * array TAU, represent the unitary matrix Q as a product of * elementary reflectors, and the strictly lower triangular part * of sub( A ) is not referenced. If UPLO='L', the diagonal and * first subdiagonal of sub( A ) have the corresponding elements * of the tridiagonal matrix T, and the elements below the first * subdiagonal, with the array TAU, represent the unitary * matrix Q as a product of elementary reflectors, and the * strictly upper triangular part of sub( A ) is not referenced. * On exit, if UPLO = 'U', the upper triangular part of the * distributed Hermitian matrix sub( A ) is recovered. * If UPLO='L', the lower triangular part of the distributed * Hermitian matrix sub( A ) is recovered. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local input) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local input) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK >= 2 * NB *( NB + NP ) * * where NB = MB_A = NB_A, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * On exit, if INFO <> 0, a discrepancy has been found between * the diagonal and off-diagonal elements of A and the copies * contained in the arrays D and E. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION REIGHT, RONE, RZERO PARAMETER ( REIGHT = 8.0D+0, RONE = 1.0D+0, $ RZERO = 0.0D+0 ) COMPLEX*16 HALF, ONE, ZERO PARAMETER ( HALF = ( 0.5D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, IACOL, IAROW, ICTXT, II, IPT, IPV, IPX, $ IPY, J, JB, JJ, JL, K, MYCOL, MYROW, NB, NP, $ NPCOL, NPROW DOUBLE PRECISION ADDBND, D2, E2 COMPLEX*16 D1, E1 * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCV( DLEN_ ), $ DESCT( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, IGSUM2D, $ PDELGET, PZELGET, PZGEMM, $ PZHEMM, PZHER2K, PZLACPY, $ PZLARFT, PZLASET, PZTRMM * .. * .. Intrinsic Functions .. INTRINSIC ABS, DCMPLX, MAX, MIN, MOD * .. * .. Executable statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 NB = DESCA( MB_ ) UPPER = LSAME( UPLO, 'U' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) * IPT = 1 IPV = NB * NB + IPT IPX = NB * NP + IPV IPY = NB * NP + IPX * CALL DESCSET( DESCD, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * ADDBND = REIGHT * PDLAMCH( ICTXT, 'eps' ) * IF( UPPER ) THEN * CALL DESCSET( DESCE, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 10 J = 0, N-1 D1 = ZERO E1 = ZERO D2 = RZERO E2 = RZERO CALL PDELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PZELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(N-1) ) THEN CALL PDELGET( ' ', ' ', E2, E, 1, JA+J+1, DESCE ) CALL PZELGET( 'Columnwise', ' ', E1, A, IA+J, JA+J+1, $ DESCA ) END IF * IF( ( ABS( D1-DCMPLX( D2 ) ).GT.( ABS( D2 )*ADDBND ) ) .OR. $ ( ABS( E1-DCMPLX( E2 ) ).GT.( ABS( E2 )*ADDBND ) ) ) $ INFO = INFO + 1 10 CONTINUE * * Compute the upper triangle of sub( A ). * CALL DESCSET( DESCV, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCT, NB, NB, NB, NB, IAROW, IACOL, ICTXT, NB ) * DO 20 K = 0, N-1, NB JB = MIN( NB, N-K ) I = IA + K J = JA + K * * Compute the lower triangular matrix T. * CALL PZLARFT( 'Backward', 'Columnwise', K+JB-1, JB, A, IA, $ J, DESCA, TAU, WORK( IPT ), WORK( IPV ) ) * * Copy Householder vectors into WORK( IPV ). * CALL PZLACPY( 'All', K+JB-1, JB, A, IA, J, DESCA, $ WORK( IPV ), 1, 1, DESCV ) * IF( K.GT.0 ) THEN CALL PZLASET( 'Lower', JB+1, JB, ZERO, ONE, WORK( IPV ), $ K, 1, DESCV ) ELSE CALL PZLASET( 'Lower', JB, JB-1, ZERO, ONE, WORK( IPV ), $ 1, 2, DESCV ) CALL PZLASET( 'Ge', JB, 1, ZERO, ZERO, WORK( IPV ), 1, $ 1, DESCV ) END IF * * Zero out the strict upper triangular part of A. * IF( K.GT.0 ) THEN CALL PZLASET( 'Ge', K-1, JB, ZERO, ZERO, A, IA, J, $ DESCA ) CALL PZLASET( 'Upper', JB-1, JB-1, ZERO, ZERO, A, I-1, $ J+1, DESCA ) ELSE IF( JB.GT.1 ) THEN CALL PZLASET( 'Upper', JB-2, JB-2, ZERO, ZERO, A, IA, $ J+2, DESCA ) END IF * * (1) X := A * V * T' * CALL PZHEMM( 'Left', 'Upper', K+JB, JB, ONE, A, IA, JA, $ DESCA, WORK( IPV ), 1, 1, DESCV, ZERO, $ WORK( IPX ), 1, 1, DESCV ) CALL PZTRMM( 'Right', 'Lower', 'Conjugate transpose', $ 'Non-Unit', K+JB, JB, ONE, WORK( IPT ), 1, 1, $ DESCT, WORK( IPX ), 1, 1, DESCV ) * * (2) X := X - 1/2 * V * (T * V' * X) * CALL PZGEMM( 'Conjugate transpose', 'No transpose', JB, JB, $ K+JB, ONE, WORK( IPV ), 1, 1, DESCV, $ WORK( IPX ), 1, 1, DESCV, ZERO, WORK( IPY ), $ 1, 1, DESCT ) CALL PZTRMM( 'Left', 'Lower', 'No transpose', 'Non-Unit', $ JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPY ), 1, 1, DESCT ) CALL PZGEMM( 'No tranpose', 'No transpose', K+JB, JB, JB, $ -HALF, WORK( IPV ), 1, 1, DESCV, WORK( IPY ), $ 1, 1, DESCT, ONE, WORK( IPX ), 1, 1, DESCV ) * * (3) A := A - X * V' - V * X' * CALL PZHER2K( 'Upper', 'No transpose', K+JB, JB, -ONE, $ WORK( IPV ), 1, 1, DESCV, WORK( IPX ), 1, 1, $ DESCV, RONE, A, IA, JA, DESCA ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + 1, NPCOL ) DESCT( CSRC_ ) = MOD( DESCT( CSRC_ ) + 1, NPCOL ) * 20 CONTINUE * ELSE * CALL DESCSET( DESCE, 1, JA+N-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * DO 30 J = 0, N-1 D1 = ZERO E1 = ZERO D2 = RZERO E2 = RZERO CALL PDELGET( ' ', ' ', D2, D, 1, JA+J, DESCD ) CALL PZELGET( 'Columnwise', ' ', D1, A, IA+J, JA+J, DESCA ) IF( J.LT.(N-1) ) THEN CALL PDELGET( ' ', ' ', E2, E, 1, JA+J, DESCE ) CALL PZELGET( 'Columnwise', ' ', E1, A, IA+J+1, JA+J, $ DESCA ) END IF * IF( ( ABS( D1-DCMPLX( D2 ) ).GT.( ABS( D2 )*ADDBND ) ) .OR. $ ( ABS( E1-DCMPLX( E2 ) ).GT.( ABS( E2 )*ADDBND ) ) ) $ INFO = INFO + 1 30 CONTINUE * * Compute the lower triangle of sub( A ). * JL = MAX( ( ( JA+N-2 ) / NB ) * NB + 1, JA ) IACOL = INDXG2P( JL, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCV, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCT, NB, NB, NB, NB, INDXG2P( IA+JL-JA+1, NB, $ MYROW, DESCA( RSRC_ ), NPROW ), IACOL, ICTXT, $ NB ) * DO 40 J = JL, JA, -NB K = J - JA + 1 I = IA + K - 1 JB = MIN( N-K+1, NB ) * * Compute upper triangular matrix T from TAU. * CALL PZLARFT( 'Forward', 'Columnwise', N-K, JB, A, I+1, J, $ DESCA, TAU, WORK( IPT ), WORK( IPV ) ) * * Copy Householder vectors into WORK( IPV ). * CALL PZLACPY( 'Lower', N-K, JB, A, I+1, J, DESCA, $ WORK( IPV ), K+1, 1, DESCV ) CALL PZLASET( 'Upper', N-K, JB, ZERO, ONE, WORK( IPV ), $ K+1, 1, DESCV ) CALL PZLASET( 'Ge', 1, JB, ZERO, ZERO, WORK( IPV ), K, 1, $ DESCV ) * * Zero out the strict lower triangular part of A. * CALL PZLASET( 'Lower', N-K-1, JB, ZERO, ZERO, A, I+2, J, $ DESCA ) * * (1) X := A * V * T' * CALL PZHEMM( 'Left', 'Lower', N-K+1, JB, ONE, A, I, J, $ DESCA, WORK( IPV ), K, 1, DESCV, ZERO, $ WORK( IPX ), K, 1, DESCV ) CALL PZTRMM( 'Right', 'Upper', 'Conjugate transpose', $ 'Non-Unit', N-K+1, JB, ONE, WORK( IPT ), 1, 1, $ DESCT, WORK( IPX ), K, 1, DESCV ) * * (2) X := X - 1/2 * V * (T * V' * X) * CALL PZGEMM( 'Conjugate transpose', 'No transpose', JB, JB, $ N-K+1, ONE, WORK( IPV ), K, 1, DESCV, $ WORK( IPX ), K, 1, DESCV, ZERO, WORK( IPY ), $ 1, 1, DESCT ) CALL PZTRMM( 'Left', 'Upper', 'No transpose', 'Non-Unit', $ JB, JB, ONE, WORK( IPT ), 1, 1, DESCT, $ WORK( IPY ), 1, 1, DESCT ) CALL PZGEMM( 'No transpose', 'No transpose', N-K+1, JB, JB, $ -HALF, WORK( IPV ), K, 1, DESCV, WORK( IPY ), $ 1, 1, DESCT, ONE, WORK( IPX ), K, 1, DESCV ) * * (3) A := A - X * V' - V * X' * CALL PZHER2K( 'Lower', 'No tranpose', N-K+1, JB, -ONE, $ WORK( IPV ), K, 1, DESCV, WORK( IPX ), K, 1, $ DESCV, RONE, A, I, J, DESCA ) * DESCV( CSRC_ ) = MOD( DESCV( CSRC_ ) + NPCOL - 1, NPCOL ) DESCT( RSRC_ ) = MOD( DESCT( RSRC_ ) + NPROW - 1, NPROW ) DESCT( CSRC_ ) = MOD( DESCT( CSRC_ ) + NPCOL - 1, NPCOL ) * 40 CONTINUE * END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * RETURN * * End of PZHETDRV * END scalapack-2.0.2/TESTING/EIG/pzhrddriver.f000644 000766 000024 00000045445 10363532303 020156 0ustar00juliestaff000000 000000 PROGRAM PZHRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * Purpose * ======= * * PZHRDDRIVER is the main test program for the COMPLEX*16 * ScaLAPACK HRD (Hessenberg Reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 14 lines: * 'ScaLAPACK HRD input file' * 'PVM machine' * 'HRD.out' output file name (if any) * 6 device out * 2 number of problems sizes * 100 101 values of N * 2 1 values of ILO * 99 101 values of IHI * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 3.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER MEMSIZ, NTESTS, TOTMEM, ZPLXSZ COMPLEX*16 PADVAL PARAMETER ( TOTMEM = 2000000, ZPLXSZ = 16, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IHI, IHIP, IHLP, IHLQ, $ ILCOL, ILO, ILROW, INFO, INLQ, IMIDPAD, IPA, $ IPT, IPW, IPOSTPAD, IPREPAD, ITEMP, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LCM, LCMQ, LOFF, $ LWORK, MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, $ NPROCS, NOUT, NP, NPCOL, NPROW, NQ, WORKHRD, $ WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), NVHI( NTESTS ), NVLO( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINIT, BLACS_GRIDINFO, $ DESCINIT, IGSUM2D, BLACS_PINFO, PZFILLPAD, $ PZLAFCHK, PZGEHDRV, PZGEHRD, $ PZHRDINFO, PZMATGEN, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. INTEGER ILCM, INDXG2P, NUMROC DOUBLE PRECISION PZLANGE EXTERNAL ILCM, INDXG2P, NUMROC, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PZHRDINFO( OUTFILE, NOUT, NMAT, NVAL, NVLO, NVHI, NTESTS, $ NNB, NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, $ NTESTS, THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GOTO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) ILO = NVLO( J ) IHI = NVHI( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP ) + IMIDPAD, INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPT = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * IHIP = NUMROC( IHI, NB, MYROW, DESCA( RSRC_ ), NPROW ) LOFF = MOD( ILO-1, NB ) ILROW = INDXG2P( ILO, NB, MYROW, DESCA( RSRC_ ), NPROW ) ILCOL = INDXG2P( ILO, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) IHLP = NUMROC( IHI-ILO+LOFF+1, NB, MYROW, ILROW, NPROW ) INLQ = NUMROC( N-ILO+LOFF+1, NB, MYCOL, ILCOL, NPCOL ) LWORK = NB*( NB + MAX( IHIP+1, IHLP+INLQ ) ) WORKHRD = LWORK + IPOSTPAD WORKSIZ = WORKHRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IHLQ = NUMROC( IHI-ILO+LOFF+1, NB, MYCOL, ILCOL, $ NPCOL ) ITEMP = NB*MAX( IHLP+INLQ, IHLQ+MAX( IHIP, $ IHLP+NUMROC( NUMROC( IHI-ILO+LOFF+1, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ) ) ) WORKSIZ = MAX( NB*NB + NB*IHLP + ITEMP, NB * NP ) + $ IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) 'Hessenberg reduction', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate A * CALL PZMATGEN( ICTXT, 'No', 'No', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), $ IASEED, 0, NP, 0, NQ, MYROW, MYCOL, $ NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), $ NQ, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANGE( 'I', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANGE', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKHRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKHRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce Hessenberg form * CALL PZGEHRD( N, ILO, IHI, MEM( IPA ), 1, 1, DESCA, $ MEM( IPT ), MEM( IPW ), LWORK, INFO ) CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGEHRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEHRD', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEHRD', WORKHRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKHRD-IPOSTPAD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - Q H Q'|| / (||A||*N*eps) * CALL PZGEHDRV( N, ILO, IHI, MEM( IPA ), 1, 1, DESCA, $ MEM( IPT ), MEM( IPW ) ) CALL PZLAFCHK( 'No', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZGEHDRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEHDRV', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGEHDRV', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ.0.0D+0 ) $ THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather max. of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * HRD requires 40/3 * N^3 floating point ops. (flops) * more precisely, * HRD requires 16/3*(IHI-ILO)^3+8*IHI*(IHI-ILO)^2 flops * NOPS = DBLE( IHI-ILO ) NOPS = NOPS * NOPS * $ ( 8.0D0*DBLE( IHI ) + (16.0D0/3.0D0)*NOPS ) NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'WALL', N, ILO, IHI, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, FRESID, $ PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 ) 'CPU ', N, ILO, IHI, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, FRESID, $ PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 ) KPASS WRITE( NOUT, FMT = 9989 ) KFAIL ELSE WRITE( NOUT, FMT = 9990 ) KPASS END IF WRITE( NOUT, FMT = 9988 ) KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N ILO IHI NB P Q HRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ------ ------ ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, I6, 1X, I6, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, $ F9.2, 1X, F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*H*Q''|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PZHRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pzhrdinfo.f000644 000766 000024 00000032402 10363532303 017603 0ustar00juliestaff000000 000000 SUBROUTINE PZHRDINFO( SUMMRY, NOUT, NMAT, NVAL, NVLO, NVHI, $ LDNVAL, NNB, NBVAL, LDNBVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, THRESH, WORK, IAM, $ NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 27, 2000 * * .. Scalar Arguments .. INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ NVHI( LDNVAL ), NVLO( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZHRDINFO get the needed startup information for the Hessenberg * reduction tests and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, output to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for * N, IHI & ILO. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of rows & columns in matrix). * * NVLO (global output) INTEGER array, dimension (LDNVAL) * The values of ILO. * * NVHI (global output) INTEGER array, dimension (LDNVAL) * The values of IHI. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, ILO and IHI. LDNVAL >= NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * 3*LDNVAL+LDNBVAL+2*LDPVAL. Used to pack all input arrays * in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'HRD.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = * ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( UNIT = NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1. .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'N', LDNVAL GO TO 20 END IF * * Get values of N, ILO, IHI * READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVLO( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVHI( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'NB', LDNBVAL GO TO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDPVAL GO TO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9997 ) 'Grids', LDQVAL GO TO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 1, 3, WORK, 1 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVLO, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVHI, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS -1 CALL IGEBS2D( ICTXT, 'All', ' ', 1, I, WORK, 1 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Reduction routine to Hessenberg form.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex double precision Hessenberg ' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - Q H Q''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ ' on to the next column of processes.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'HRD time : Time in seconds to compute HRD ' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for HRD ' // $ 'reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9995 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'ILO ', ( NVLO( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVLO( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'IHI ', ( NVHI( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NVHI( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9995 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9995 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9995 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9994 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) EPS WRITE( NOUT, FMT = 9993 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 1, 3, WORK, 1, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = 3*NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', 1, I, WORK, 1, 0, 0 ) * I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVLO, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVHI, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 CONTINUE WRITE( NOUT, FMT = 9998 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' ILLEGAL INPUT IN FILE ', 40A, '. ABORTING RUN.' ) 9997 FORMAT( ' NUMBER OF VALUES OF ', 5A, $ ' IS LESS THAN 1 OR GREATER ', 'THAN ', I2 ) 9996 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9995 FORMAT( 2X, A5, ': ', 10I6 ) 9994 FORMAT( ' ', 10I6 ) 9993 FORMAT( 'Routines pass computational tests if scaled residual is', $ ' less than ', G14.7 ) * * End of PZHRDINFO * END scalapack-2.0.2/TESTING/EIG/pzlafchk.f000644 000766 000024 00000027013 10363532303 017404 0ustar00juliestaff000000 000000 SUBROUTINE PZLAFCHK( AFORM, DIAG, M, N, A, IA, JA, DESCA, IASEED, $ ANORM, FRESID, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER AFORM, DIAG INTEGER IA, IASEED, JA, M, N DOUBLE PRECISION ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZLAFCHK computes the residual * || sub( A ) - sub( Ao ) || / (|| sub( Ao ) ||*eps*MAX(M,N)), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A( IA:IA+M-1, JA:JA+N-1 ) and ||.|| stands for the infini- * ty norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * AFORM (global input) CHARACTER * sub( A ) is overwritten with: * - a symmetric matrix, if AFORM = 'S'; * - a Hermitian matrix, if AFORM = 'H'; * - the transpose of what would normally be generated, * if AFORM = 'T'; * - the conjugate transpose of what would normally be * generated, if AFORM = 'C'; * - otherwise a random matrix. * * DIAG (global input) CHARACTER * if DIAG = 'D' : sub( A ) is diagonally dominant. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub( A ) - sub( Ao ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * ANORM (global input) DOUBLE PRECISION * The Infinity norm of sub( A ). * * FRESID (global output) DOUBLE PRECISION * The maximum (worst) factorizational error. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK). * LWORK >= MpA0 * NB_A, where * * IROFFA = MOD( IA-1, MB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * WORK is used to store a block of columns of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = (1.0D+0, 0.0D+0) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURCOL, ICURROW, $ II, IIA, IOFFA, IROFF, JB, JJ, JJA, JN, KK, $ LDA, LDW, LDWP1, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PZMATGEN, ZMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL ICEIL, LSAME, NUMROC, PDLAMCH, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( ICTXT, 'eps' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * * Compute sub( A ) := sub( A ) - sub( Ao ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP-IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDW = MAX( 1, MP ) LDWP1 = LDW + 1 LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA - 1 )*LDA * IF( LSAME( AFORM, 'H' ) ) THEN * * Handle first block of columns separately * II = 1 ICURROW = IAROW ICURCOL = IACOL JB = JN - JA + 1 * IF( MYCOL.EQ.ICURCOL ) THEN CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) IF( MYROW.EQ.ICURROW ) THEN DO 10, KK = 0, JB-1 WORK( II+KK*LDWP1 ) = DBLE( WORK( II+KK*LDWP1 ) ) 10 CONTINUE END IF CALL ZMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 30, JJ = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-JJ, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ WORK, LDW, DESCA( RSRC_ ), DESCA( CSRC_ ), $ IASEED, IIA-1, MP, JJA-1, JB, MYROW, $ MYCOL, NPROW, NPCOL ) IF( MYROW.EQ.ICURROW ) THEN DO 20, KK = 0, JB-1 WORK( II+KK*LDWP1 ) = DBLE( WORK( II+KK*LDWP1 ) ) 20 CONTINUE END IF CALL ZMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE * ELSE * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN JB = JN-JA+1 CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL ZMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB NQ = NQ - JB IOFFA = IOFFA + JB * LDA END IF * * Handle the remaning blocks of columns * DO 40 JJ = JJA, JJA+NQ-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JJA+NQ-JJ ) IOFFA = IIA + ( JJ - 1 )*LDA CALL PZMATGEN( ICTXT, AFORM, DIAG, DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, MP, JJ-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL ZMATADD( MP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) 40 CONTINUE * END IF * * Calculate factor residual * FRESID = PZLANGE( 'I', M, N, A, IA, JA, DESCA, WORK ) / $ ( MAX( M, N ) * EPS * ANORM ) * RETURN * * End PZLAFCHK * END scalapack-2.0.2/TESTING/EIG/pzlagsy.f000644 000766 000024 00000025732 10363532303 017301 0ustar00juliestaff000000 000000 * * SUBROUTINE PZLAGHE( N, K, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, INFO ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, N, ORDER * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) DOUBLE PRECISION D( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Purpose * ======= * * PZLAGHE generates a real Hermitian matrix A, by pre- and post- * multiplying a real diagonal matrix D with a random orthogonal matrix: * A = U*D*U'. * * This is just a quick implementation which will be replaced in the * future. The random orthogonal matrix is computed by creating a * random matrix and running QR on it. This requires vastly more * computation than necessary, but not significantly more communication * than is used in the rest of this rouinte, and hence is not that much * slower than an efficient solution. * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix A. N >= 0. * * K (global input) INTEGER * The number of nonzero subdiagonals within the band of A. * 0 <= K <= N-1. * ### K must be 0 or N-1, 0 < K < N-1 is not supported yet. * * D (global input) COMPLEX*16 array, dimension (N) * The diagonal elements of the diagonal matrix D. * * A (local output) COMPLEX*16 array * Global dimension (N, N), local dimension (NP, NQ) * The generated n by n Hermitian matrix A (the full matrix is * stored). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated and will remain identical on * all processes in the context. * * ORDER (global input) INTEGER * Number of reflectors in the matrix Q * At present, ORDER .NE. N is not supported * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * * LWORK (local input) INTEGER dimension of WORK * LWORK >= SIZETMS as returned by PZLASIZESEP * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZZERO PARAMETER ( ZZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER CSRC_A, I, IACOL, IAROW, ICOFFA, II, IIROW, $ INDAA, INDTAU, INDWORK, IPOSTPAD, IPREPAD, $ IROFFA, ISIZEHEEVX, ISIZESUBTST, ISIZETST, $ JJCOL, LDAA, LII, LIII, LJJ, LJJJ, LWMIN, MAXI, $ MB_A, MYCOL, MYROW, NB_A, NP, NPCOL, NPROW, NQ, $ RSIZECHK, RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, $ RSIZETST, RSRC_A, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST,SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PXERBLA, PZGEQRF, $ PZLASIZESEP, PZMATGEN, PZUNMQR, ZLASET * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. * INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Initialize grid information * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * * Check LWORK * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) END IF * LDAA = DESCA( LLD_ ) MB_A = DESCA( MB_ ) NB_A = DESCA( NB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) NP = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ) IPREPAD = 0 IPOSTPAD = 0 CALL PZLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) LWMIN = SIZETMS * * Test the input arguments * IF( INFO.EQ.0 ) THEN IF( K.LT.0 .OR. K.GT.N-1 ) THEN INFO = -2 ELSE IF( N.NE.ORDER ) THEN INFO = -9 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( INFO.LT.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PZLAGHE', -INFO ) RETURN END IF * INDAA = 1 INDTAU = INDAA + LDAA*MAX( 1, NQ ) INDWORK = INDTAU + MAX( 1, NQ ) * IF( K.NE.0 ) THEN CALL ZLASET( 'A', LDAA, NQ, ZZERO, ZZERO, WORK( INDAA ), LDAA ) * * * Build a random matrix * * CALL PZMATGEN( DESCA( CTXT_ ), 'N', 'N', N, ORDER, $ DESCA( MB_ ), DESCA( NB_ ), WORK( INDAA ), $ DESCA( LLD_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ ISEED( 1 ), 0, NP, 0, NQ, MYROW, MYCOL, NPROW, $ NPCOL ) CALL PZGEQRF( N, ORDER, WORK( INDAA ), IA, JA, DESCA, $ WORK( INDTAU ), WORK( INDWORK ), SIZEQRF, INFO ) * END IF * * Build a diagonal matrix A with the eigenvalues specified in D * CALL ZLASET( 'A', NP, NQ, ZZERO, ZZERO, A, DESCA( LLD_ ) ) * IIROW = 0 JJCOL = 0 LII = 1 LJJ = 1 * DO 20 II = 1, N, DESCA( MB_ ) MAXI = MIN( N, II+DESCA( MB_ )-1 ) IF( ( MYROW.EQ.IIROW ) .AND. ( MYCOL.EQ.JJCOL ) ) THEN LIII = LII LJJJ = LJJ DO 10 I = II, MAXI A( LIII+( LJJJ-1 )*DESCA( LLD_ ) ) = D( I ) LIII = LIII + 1 LJJJ = LJJJ + 1 10 CONTINUE END IF IF( MYROW.EQ.IIROW ) $ LII = LII + DESCA( MB_ ) IF( MYCOL.EQ.JJCOL ) $ LJJ = LJJ + DESCA( MB_ ) IIROW = MOD( IIROW+1, NPROW ) JJCOL = MOD( JJCOL+1, NPCOL ) 20 CONTINUE * * A = Q * A * IF( K.NE.0 ) THEN * CALL PZUNMQR( 'L', 'Conjugate transpose', N, N, ORDER, $ WORK( INDAA ), IA, JA, DESCA, WORK( INDTAU ), A, $ IA, JA, DESCA, WORK( INDWORK ), SIZEMQRLEFT, $ INFO ) * * * A = A * Q' * * CALL PZUNMQR( 'R', 'N', N, N, ORDER, WORK( INDAA ), IA, JA, $ DESCA, WORK( INDTAU ), A, IA, JA, DESCA, $ WORK( INDWORK ), SIZEMQRRIGHT, INFO ) * END IF * * End of PZLAGHE * END scalapack-2.0.2/TESTING/EIG/pzlasizegsep.f000644 000766 000024 00000013241 10363532303 020320 0ustar00juliestaff000000 000000 SUBROUTINE PZLASIZEGSEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, $ ISIZEHEEVX, SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, RSIZETST, $ ISIZETST ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, RSIZECHK, RSIZEHEEVX, RSIZEQTQ, $ RSIZESUBTST, RSIZETST, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * PZLASIZEGSEP computes the amount of memory needed by * ======= * * PZLASIZEGSEP computes the amount of memory needed by * various GSEP test routines, as well as HEGVX itself * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PZHEGVX * * SIZEMQRLEFT LWORK for the 1st PZUNMQR call in PZLAGHE * * SIZEMQRRIGHT LWORK for the 2nd PZUNMQR call in PZLAGHE * * SIZEQRF LWORK for PZGEQRF in PZLAGHE * * SIZETMS LWORK for PZLATMS * * RSIZEQTQ LWORK for PZSEPQTQ (nexer complex) * * RSIZECHK LWORK for PZGSEPCHK * * SIZEHEEVX LWORK for PZHEGVX * * RSIZEHEEVX LRWORK for PZHEGVX * * ISIZEHEEVX LIWORK for PZHEGVX * * SIZESUBTST LWORK for PZSUBTST * * RSIZESUBTST LRWORK for PZSUBTST * * ISIZESUBTST LIWORK for PZSUBTST * * SIZETST LWORK for PZTST * * RSIZETST LRWORK for PZTST * * ISIZETST LIWORK for PZTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ANB, CSRC_A, IACOL, IAROW, ICOFFA, ICTXT, $ IROFFA, LCM, LCMQ, LDA, MQ0, MYCOL, MYROW, N, $ NB, NEIG, NHEGST_LWOPT, NHETRD_LWOPT, NN, NNP, $ NP, NP0, NPCOL, NPROW, NPS, NQ, NQ0, RSRC_A, $ SIZECHK, SIZEQTQ, SQNPC * .. * .. External Functions .. * INTEGER ICEIL, ILCM, INDXG2P, NUMROC, PJLAENV EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC, PJLAENV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, SQRT * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 0 SIZECHK = 0 RSIZEQTQ = 0 RSIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZEHEEVX = N + ( NP0+MQ0+NB )*NB RSIZEHEEVX = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZEHEEVX = 6*NNP * ICTXT = DESCA( CTXT_ ) ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS * NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) NHEGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB SIZEHEEVX = MAX( SIZEHEEVX, N+NHETRD_LWOPT, NHEGST_LWOPT ) * SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZEHEEVX ) + $ IPREPAD + IPOSTPAD RSIZESUBTST = MAX( RSIZEHEEVX, RSIZEQTQ, RSIZECHK ) + IPREPAD + $ IPOSTPAD ISIZESUBTST = ISIZEHEEVX + IPREPAD + IPOSTPAD * * * Allow room for A, COPYA and Z and WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Room for DIAG, WIN, WNEW, GAP and RWORK * RSIZETST = 4*( N+IPREPAD+IPOSTPAD ) + RSIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PZHEGVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pzlasizeheevr.f000644 000766 000024 00000014543 11623527140 020504 0ustar00juliestaff000000 000000 SUBROUTINE PZLASIZEHEEVR( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER RANGE INTEGER IL, IU, MAXSIZE, N, VALSIZE, VECSIZE DOUBLE PRECISION VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) DOUBLE PRECISION WIN( * ) * .. * * Purpose * ======= * * PZLASIZEHEEVR computes the amount of memory needed by PZHEEVR * to ensure: * 1) Orthogonal Eigenvectors * 2) Eigenpairs with small residual norms * * Arguments * ========= * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * VL (global input/output ) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VL is set * to a random value near an entry in WIN * * VU (global input/output ) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VU is set * to a random value near an entry in WIN * * IL (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * If IL < 0, RANGE='I' and WKNOWN = .TRUE., IL is set * to a random value from 1 to N * * IU (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * If IU < 0, RANGE='I' and WKNOWN = .TRUE., IU is set * to a random value from IL to N * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * ISEED is not touched unless IL, IU, VL or VU are modified. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If WKNOWN=1, WIN contains the eigenvalues of the matrix. * * MAXSIZE (global output) INTEGER * Workspace required to guarantee that PZHEEVR will return * orthogonal eigenvectors. IF WKNOWN=0, MAXSIZE is set to a * a value which guarantees orthogonality no matter what the * spectrum is. If WKNOWN=1, MAXSIZE is set to a value which * guarantees orthogonality on a matrix with eigenvalues given * by WIN. * * VECSIZE (global output) INTEGER * Workspace required to guarantee that PZHEEVR * will compute eigenvectors. * * VALSIZE (global output) INTEGER * Workspace required to guarantee that PZHEEVR * will compute eigenvalues. * * * .. Parameters .. INTEGER CTXT_, MB_ PARAMETER ( CTXT_ = 2, MB_ = 5 ) DOUBLE PRECISION TWENTY PARAMETER ( TWENTY = 20.0D0 ) * .. * .. Local Scalars .. * INTEGER ILMIN, IUMAX, $ MQ0, MYCOL, MYIL, MYIU, MYROW, NB, NEIG, NN, $ NP0, NPCOL, NPROW DOUBLE PRECISION ANORM, EPS, SAFMIN * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL LSAME, ICEIL, NUMROC, DLARAN, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe Minimum' ) NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) VALSIZE = 3 + 5*N + MAX( 12*NN, NB*( NP0+1 ) ) IF( WKNOWN ) THEN ANORM = SAFMIN / EPS IF( N.GE.1 ) $ ANORM = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), ANORM ) IF( LSAME( RANGE, 'I' ) ) THEN IF( IL.LT.0 ) $ IL = INT( DLARAN( ISEED )*DBLE( N ) ) + 1 IF( IU.LT.0 ) $ IU = INT( DLARAN( ISEED )*DBLE( N-IL ) ) + IL IF( N.EQ.0 ) $ IU = 0 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IF( VL.GT.VU ) THEN MYIL = INT( DLARAN( ISEED )*DBLE( N ) ) + 1 MYIU = INT( DLARAN( ISEED )*DBLE( N-MYIL ) ) + MYIL VL = WIN( MYIL ) - TWENTY*EPS*ABS( WIN( MYIL ) ) VU = WIN( MYIU ) + TWENTY*EPS*ABS( WIN( MYIU ) ) VU = MAX( VU, VL+EPS*TWENTY*ABS( VL )+SAFMIN ) END IF END IF * END IF IF( LSAME( RANGE, 'V' ) ) THEN * We do not know how many eigenvalues will be computed ILMIN = 1 IUMAX = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN ILMIN = IL IUMAX = IU ELSE IF( LSAME( RANGE, 'A' ) ) THEN ILMIN = 1 IUMAX = N END IF * NEIG = IUMAX - ILMIN + 1 * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * VECSIZE = 3 + 5*N + MAX( 18*NN, NP0*MQ0+2*NB*NB ) + $ (2 + ICEIL( NEIG, NPROW*NPCOL ))*NN VALSIZE = MAX(3, VALSIZE) VECSIZE = MAX(3, VECSIZE) MAXSIZE = VECSIZE * RETURN * * End of PZLASIZEHEEVR * END scalapack-2.0.2/TESTING/EIG/pzlasizeheevx.f000644 000766 000024 00000017213 10363532303 020504 0ustar00juliestaff000000 000000 * * SUBROUTINE PZLASIZEHEEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER RANGE INTEGER IL, IU, MAXSIZE, N, VALSIZE, VECSIZE DOUBLE PRECISION VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) DOUBLE PRECISION WIN( * ) * .. * * Purpose * ======= * * PZLASIZEHEEVX computes the amount of memory needed by PZHEEVX * to ensure: * 1) Orthogonal Eigenvectors * 2) Eigenvectors * 3) Eigenvalues * * Arguments * ========= * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * * VL (global input/output ) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VL is set * to a random value near an entry in WIN * * VU (global input/output ) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * If VL > VU, RANGE='V' and WKNOWN = .TRUE., VU is set * to a random value near an entry in WIN * * IL (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * If IL < 0, RANGE='I' and WKNOWN = .TRUE., IL is set * to a random value from 1 to N * * IU (global input/output ) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * If IU < 0, RANGE='I' and WKNOWN = .TRUE., IU is set * to a random value from IL to N * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * ISEED is not touched unless IL, IU, VL or VU are modified. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If WKNOWN=1, WIN contains the eigenvalues of the matrix. * * MAXSIZE (global output) INTEGER * Workspace required to guarantee that PZHEEVX will return * orthogonal eigenvectors. IF WKNOWN=0, MAXSIZE is set to a * a value which guarantees orthogonality no matter what the * spectrum is. If WKNOWN=1, MAXSIZE is set to a value which * guarantees orthogonality on a matrix with eigenvalues given * by WIN. * * VECSIZE (global output) INTEGER * Workspace required to guarantee that PZHEEVX * will compute eigenvectors. * * VALSIZE (global output) INTEGER * Workspace required to guarantee that PZHEEVX * will compute eigenvalues. * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION TWENTY PARAMETER ( TWENTY = 20.0D0 ) * .. * .. Local Scalars .. * INTEGER CLUSTERSIZE, I, ILMIN, IUMAX, MAXCLUSTERSIZE, $ MQ0, MYCOL, MYIL, MYIU, MYROW, NB, NEIG, NN, $ NP0, NPCOL, NPROW DOUBLE PRECISION ANORM, EPS, ORFAC, SAFMIN, VLMIN, VUMAX * .. * .. External Functions .. * * LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL LSAME, ICEIL, NUMROC, DLARAN, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * ORFAC = 1.0D-3 * * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe Minimum' ) NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * VALSIZE = 5*NN + 4*N * IF( WKNOWN ) THEN ANORM = SAFMIN / EPS IF( N.GE.1 ) $ ANORM = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), ANORM ) * IF( LSAME( RANGE, 'I' ) ) THEN IF( IL.LT.0 ) $ IL = INT( DLARAN( ISEED )*DBLE( N ) ) + 1 IF( IU.LT.0 ) $ IU = INT( DLARAN( ISEED )*DBLE( N-IL ) ) + IL IF( N.EQ.0 ) $ IU = 0 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IF( VL.GT.VU ) THEN MYIL = INT( DLARAN( ISEED )*DBLE( N ) ) + 1 MYIU = INT( DLARAN( ISEED )*DBLE( N-MYIL ) ) + MYIL VL = WIN( MYIL ) + TWENTY*EPS*ABS( WIN( MYIL ) ) VU = WIN( MYIU ) + TWENTY*EPS*ABS( WIN( MYIU ) ) VU = MAX( VU, VL+EPS*TWENTY*ABS( VL )+SAFMIN ) END IF END IF * END IF IF( LSAME( RANGE, 'V' ) ) THEN * * Compute ILMIN, IUMAX (based on VL, VU and WIN) * IF( WKNOWN ) THEN VLMIN = VL - TWENTY*EPS*ANORM VUMAX = VU + TWENTY*EPS*ANORM ILMIN = 1 IUMAX = 0 DO 10 I = 1, N IF( WIN( I ).LT.VLMIN ) $ ILMIN = ILMIN + 1 IF( WIN( I ).LT.VUMAX ) $ IUMAX = IUMAX + 1 10 CONTINUE ELSE ILMIN = 1 IUMAX = N END IF ELSE IF( LSAME( RANGE, 'I' ) ) THEN ILMIN = IL IUMAX = IU ELSE IF( LSAME( RANGE, 'A' ) ) THEN ILMIN = 1 IUMAX = N END IF * NEIG = IUMAX - ILMIN + 1 * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) VECSIZE = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN * IF( WKNOWN ) THEN CLUSTERSIZE = 1 MAXCLUSTERSIZE = 1 DO 20 I = ILMIN + 1, IUMAX IF( ( WIN( I )-WIN( I-1 ) ).LT.ORFAC*2*ANORM ) THEN CLUSTERSIZE = CLUSTERSIZE + 1 IF( CLUSTERSIZE.GT.MAXCLUSTERSIZE ) $ MAXCLUSTERSIZE = CLUSTERSIZE ELSE CLUSTERSIZE = 1 END IF 20 CONTINUE IF( CLUSTERSIZE.GT.MAXCLUSTERSIZE ) $ MAXCLUSTERSIZE = CLUSTERSIZE ELSE MAXCLUSTERSIZE = N END IF * MAXSIZE = VECSIZE + MAX( ( MAXCLUSTERSIZE-1 ), 0 )*N * * RETURN * * End of PZLASIZEHEEVX * END scalapack-2.0.2/TESTING/EIG/pzlasizesep.f000644 000766 000024 00000013510 10363532303 020150 0ustar00juliestaff000000 000000 SUBROUTINE PZLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, $ ISIZEHEEVX, SIZEHEEVD, RSIZEHEEVD, $ ISIZEHEEVD, SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, RSIZETST, ISIZETST ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZEHEEVD, ISIZEHEEVX, $ ISIZESUBTST, ISIZETST, RSIZECHK, RSIZEHEEVD, $ RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, RSIZETST, $ SIZEHEEVD, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PZLASIZESEP computes the amount of memory needed by * various SEP test routines, as well as HEEVX itself * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor as passed to PZHEEVX * * SIZEMQRLEFT LWORK for the 1st PZUNMQR call in PZLAGHE * * SIZEMQRRIGHT LWORK for the 2nd PZUNMQR call in PZLAGHE * * SIZEQRF LWORK for PZGEQRF in PZLAGHE * * SIZETMS LWORK for PZLATMS * * RSIZEQTQ LWORK for PZSEPQTQ (nexer complex) * * RSIZECHK LWORK for PZSEPCHK * * SIZEHEEVX LWORK for PZHEEVX * * RSIZEHEEVX LRWORK for PZHEEVX * * ISIZEHEEVX LIWORK for PZHEEVX * * SIZEHEEVD LWORK for PCHEEVD * * RSIZEHEEVD LRWORK for PCHEEVD * * ISIZEHEEVD LIWORK for PCHEEVD * * SIZESUBTST LWORK for PZSUBTST * * RSIZESUBTST LRWORK for PZSUBTST * * ISIZESUBTST LIWORK for PZSUBTST * * SIZETST LWORK for PZTST * * RSIZETST LRWORK for PZTST * * ISIZETST LIWORK for PZTST * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ANB, CSRC_A, IACOL, IAROW, ICOFFA, ICTXT, $ IROFFA, LCM, LCMQ, LDA, MQ0, MYCOL, MYROW, N, $ NB, NEIG, NHETRD_LWOPT, NN, NNP, NP, NP0, $ NPCOL, NPROW, NPS, NQ, RSRC_A, SIZECHK, $ SIZEQTQ, SQNPC * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC, PJLAENV EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC, PJLAENV * .. ** .. Executable Statements .. * This is just to keep ftnchek happy * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, SQRT * .. * .. Executable Statements .. IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 0 SIZECHK = 0 RSIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) RSIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) SIZEHEEVX = N + ( NP0+MQ0+NB )*NB RSIZEHEEVX = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN NNP = MAX( N, NPROW*NPCOL+1, 4 ) ISIZEHEEVX = 6*NNP * ICTXT = DESCA( CTXT_ ) ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS * SIZEHEEVX = MAX( SIZEHEEVX, N+NHETRD_LWOPT ) * SIZEHEEVD = SIZEHEEVX RSIZEHEEVD = 7*N + 3*NP0*MQ0 ISIZEHEEVD = 7*N + 8*NPCOL + 2 SIZESUBTST = MAX( SIZETMS, SIZEQTQ, SIZECHK, SIZEHEEVX, $ SIZEHEEVD ) + IPREPAD + IPOSTPAD RSIZESUBTST = MAX( RSIZEHEEVX, RSIZEHEEVD, RSIZEQTQ, RSIZECHK ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = MAX( ISIZEHEEVX, ISIZEHEEVD ) + IPREPAD + IPOSTPAD * * * Allow room for A, COPYA and Z and WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Room for DIAG, WIN, WNEW, GAP and RWORK * RSIZETST = 4*( N+IPREPAD+IPOSTPAD ) + RSIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK (all in PZHEEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * RETURN END scalapack-2.0.2/TESTING/EIG/pzlasizesepr.f000644 000766 000024 00000012422 11623527140 020336 0ustar00juliestaff000000 000000 SUBROUTINE PZLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEHEEVR, RSIZEHEEVR, $ ISIZEHEEVR, SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, RSIZETST, $ ISIZETST ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IPOSTPAD, IPREPAD, ISIZEHEEVR, ISIZESUBTST, $ ISIZETST, RSIZEHEEVR, RSIZESUBTST, RSIZETST, $ SIZECHK, SIZEHEEVR, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZEQTQ, SIZESUBTST, SIZETMS, SIZETST * .. * .. Array Arguments .. INTEGER DESCA( * ) * * Purpose * ======= * * PZLASIZESEPR computes the amount of memory needed by * various SEPR test routines, as well as PZHEEVR itself. * * Arguments * ========= * * DESCA (global input) INTEGER array dimension ( DLEN_ ) * Array descriptor for dense matrix. * * SIZEMQRLEFT LWORK for the 1st PZUNMQR call in PZLAGHE * * SIZEMQRRIGHT LWORK for the 2nd PZUNMQR call in PZLAGHE * * SIZEQRF LWORK for PZGEQRF in PZLAGHE * * SIZETMS LWORK for PZLATMS * * SIZEQTQ LWORK for PZSEPQTQ * * SIZECHK LWORK for PZSEPCHK * * SIZEHEEVR LWORK for PZHEEVR * * RSIZEHEEVR LRWORK for PZHEEVR * * ISIZEHEEVR LIWORK for PZHEEVR * * SIZESUBTST LWORK for PZSEPRSUBTST * * RSIZESUBTST LRWORK for PZSEPRSUBTST * * ISIZESUBTST LIWORK for PZSEPRSUBTST * * SIZETST LWORK for PZSEPRTST * * RSIZETST LRWORK for PZSEPRTST * * ISIZETST LIWORK for PZSEPRTST * * * .. Parameters .. INTEGER CTXT_, M_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( $ CTXT_ = 2, M_ = 3, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC_A, IACOL, IAROW, ICOFFA, IROFFA, LCM, $ LCMQ, LDA, MQ0, MYCOL, MYROW, N, NB, NEIG, NN, $ NNP, NP, NP0, NPCOL, NPROW, NQ, RSRC_A INTEGER ANB, ICTXT, NHETRD_LWOPT, NPS, SQNPC * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC INTEGER PJLAENV EXTERNAL PJLAENV * * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, SQRT * .. * .. Executable Statements .. * N = DESCA( M_ ) NB = DESCA( MB_ ) RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) * LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL IROFFA = 0 ICOFFA = 0 IAROW = INDXG2P( 1, NB, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, NB, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) SIZEMQRLEFT = MAX( ( NB*( NB-1 ) ) / 2, ( NP+NQ )*NB ) + NB*NB SIZEMQRRIGHT = MAX( ( NB*( NB-1 ) ) / 2, $ ( NQ+MAX( NP+NUMROC( NUMROC( N+ICOFFA, NB, 0, 0, $ NPCOL ), NB, 0, 0, LCMQ ), NP ) )*NB ) + NB*NB SIZEQRF = NB*NP + NB*NQ + NB*NB SIZETMS = ( LDA+1 )*MAX( 1, NQ ) + $ MAX( SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF ) * NP0 = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) SIZEQTQ = 2 + MAX( DESCA( MB_ ), 2 )*( 2*NP0+MQ0 ) SIZECHK = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * NEIG = N NN = MAX( N, NB, 2 ) + 1 NP0 = NUMROC( NN, NB, 0, 0, NPROW ) MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) NNP = MAX( N, NPROW*NPCOL+1, 4 ) * * SIZEHEEVR = 1+N + ( NP0+MQ0+NB )*NB SIZEHEEVR = MAX(3, SIZEHEEVR) RSIZEHEEVR = 1 + 5*N + MAX( 18*NN, NP0*MQ0+2*NB*NB ) + $ (2 + ICEIL( NEIG, NPROW*NPCOL ))*NN RSIZEHEEVR = MAX(3, RSIZEHEEVR) * ISIZEHEEVR = 12*NNP + 2*N * ICTXT = DESCA( CTXT_ ) ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS SIZEHEEVR = MAX( SIZEHEEVR, N + NHETRD_LWOPT ) * SIZESUBTST = MAX( SIZETMS, SIZEHEEVR ) + $ IPREPAD + IPOSTPAD RSIZESUBTST = MAX( SIZEQTQ, SIZECHK, RSIZEHEEVR ) + $ IPREPAD + IPOSTPAD ISIZESUBTST = ISIZEHEEVR + IPREPAD + IPOSTPAD * * Allow room for A, COPYA, Z, WORK * SIZETST = 3*( LDA*NP+IPREPAD+IPOSTPAD ) + SIZESUBTST * * Allow room for DIAG, WIN, WNEW, GAP, RWORK * RSIZETST = 4*( N+IPREPAD+IPOSTPAD ) + RSIZESUBTST * * Allow room for IFAIL, ICLUSTR, and IWORK * (only needed for PZHEEVX) * ISIZETST = N + 2*NPROW*NPCOL + 2*( IPREPAD+IPOSTPAD ) + $ ISIZESUBTST * * RETURN END scalapack-2.0.2/TESTING/EIG/pzlatms.f000644 000766 000024 00000032655 10363532303 017304 0ustar00juliestaff000000 000000 * * SUBROUTINE PZLATMS( M, N, DIST, ISEED, SYM, D, MODE, COND, DMAX, $ KL, KU, PACK, A, IA, JA, DESCA, ORDER, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIST, PACK, SYM INTEGER IA, INFO, JA, KL, KU, LWORK, M, MODE, N, ORDER DOUBLE PRECISION COND, DMAX * .. * .. Array Arguments .. INTEGER DESCA( * ), ISEED( 4 ) DOUBLE PRECISION D( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZLATMS generates random Hermitian matrices with specified * eigenvalues for testing SCALAPACK programs. * * PZLATMS operates by applying the following sequence of * operations: * * Set the diagonal to D, where D may be input or * computed according to MODE, COND, DMAX, and SYM * as described below. * * Generate a dense M x N matrix by multiplying D on the left * and the right by random unitary matrices, then: * * Reduce the bandwidth according to KL and KU, using * Householder transformations. * ### bandwidth reduction NOT SUPPORTED ### * * Arguments * ========= * * M - (global input) INTEGER * The number of rows of A. Not modified. * * N - (global input) INTEGER * The number of columns of A. Not modified. * ### M .ne. N unsupported * * DIST - (global input) CHARACTER*1 * On entry, DIST specifies the type of distribution to be used * to generate the random eigen-/singular values. * 'U' => UNIFORM( 0, 1 ) ( 'U' for uniform ) * 'S' => UNIFORM( -1, 1 ) ( 'S' for symmetric ) * 'N' => NORMAL( 0, 1 ) ( 'N' for normal ) * Not modified. * * ISEED - (global input) INTEGER array, dimension ( 4 ) * On entry ISEED specifies the seed of the random number * generator. They should lie between 0 and 4095 inclusive, * and ISEED(4) should be odd. The random number generator * uses a linear congruential sequence limited to small * integers, and so should produce machine independent * random numbers. The values of ISEED are changed on * exit, and can be used in the next call to ZLATMS * to continue the same random number sequence. * Changed on exit. * * SYM - (global input) CHARACTER*1 * If SYM='S' or 'H', the generated matrix is Hermitian, with * eigenvalues specified by D, COND, MODE, and DMAX; they * may be positive, negative, or zero. * If SYM='P', the generated matrix is Hermitian, with * eigenvalues (= singular values) specified by D, COND, * MODE, and DMAX; they will not be negative. * If SYM='N', the generated matrix is nonsymmetric, with * singular values specified by D, COND, MODE, and DMAX; * they will not be negative. * ### SYM = 'N' NOT SUPPORTED ### * Not modified. * * D - (local input/output) DOUBLE PRECISION array, * dimension ( MIN( M , N ) ) * This array is used to specify the singular values or * eigenvalues of A (see SYM, above.) If MODE=0, then D is * assumed to contain the singular/eigenvalues, otherwise * they will be computed according to MODE, COND, and DMAX, * and placed in D. * Modified if MODE is nonzero. * * MODE - (global input) INTEGER * On entry this describes how the singular/eigenvalues are to * be specified: * MODE = 0 means use D as input * MODE = 1 sets D(1)=1 and D(2:N)=1.0/COND * MODE = 2 sets D(1:N-1)=1 and D(N)=1.0/COND * MODE = 3 sets D(I)=COND**(-(I-1)/(N-1)) * MODE = 4 sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) * MODE = 5 sets D to random numbers in the range * ( 1/COND , 1 ) such that their logarithms * are uniformly distributed. * MODE = 6 set D to random numbers from same distribution * as the rest of the matrix. * MODE < 0 has the same meaning as ABS(MODE), except that * the order of the elements of D is reversed. * Thus if MODE is positive, D has entries ranging from * 1 to 1/COND, if negative, from 1/COND to 1, * If SYM='S' or 'H', and MODE is neither 0, 6, nor -6, then * the elements of D will also be multiplied by a random * sign (i.e., +1 or -1.) * Not modified. * * COND - (global input) DOUBLE PRECISION * On entry, this is used as described under MODE above. * If used, it must be >= 1. Not modified. * * DMAX - (global input) DOUBLE PRECISION * If MODE is neither -6, 0 nor 6, the contents of D, as * computed according to MODE and COND, will be scaled by * DMAX / max(abs(D(i))); thus, the maximum absolute eigen- or * singular value (which is to say the norm) will be abs(DMAX). * Note that DMAX need not be positive: if DMAX is negative * (or zero), D will be scaled by a negative number (or zero). * Not modified. * * KL - (global input) INTEGER * This specifies the lower bandwidth of the matrix. For * example, KL=0 implies upper triangular, KL=1 implies upper * Hessenberg, and KL being at least M-1 means that the matrix * has full lower bandwidth. KL must equal KU if the matrix * is Hermitian. * Not modified. * ### 1 <= KL < N-1 is NOT SUPPORTED ### * * KU - (global input) INTEGER * This specifies the upper bandwidth of the matrix. For * example, KU=0 implies lower triangular, KU=1 implies lower * Hessenberg, and KU being at least N-1 means that the matrix * has full upper bandwidth. KL must equal KU if the matrix * is Hermitian. * Not modified. * ### 1 <= KU < N-1 is NOT SUPPORTED ### * * PACK - (global input) CHARACTER*1 * This specifies packing of matrix as follows: * 'N' => no packing * ### PACK must be 'N' all other options NOT SUPPORTED ### * * A - (local output) COMPLEX*16 array * Global dimension (M, N), local dimension (MP, NQ) * On exit A is the desired test matrix. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ORDER - (input) INTEGER * The number of reflectors used to define the orthogonal * matrix Q. A = Q * D * Q' * Higher ORDER requires more computation and communication. * * WORK - (local input/output) COMPLEX*16 array, * dimension (LWORK) * * LWORK - (local input) INTEGER dimension of WORK * LWORK >= SIZETMS as returned by PZLASIZESEP * * INFO - (global output) INTEGER * Error code. On exit, INFO will be set to one of the * following values: * 0 => normal return * -1 => M negative or unequal to N and SYM='S', 'H', or 'P' * -2 => N negative * -3 => DIST illegal string * -5 => SYM illegal string * -7 => MODE not in range -6 to 6 * -8 => COND less than 1.0, and MODE neither -6, 0 nor 6 * -10 => KL negative * -11 => KU negative, or SYM='S' or 'H' and KU not equal to KL * -16 => DESCA is inconsistent * -17 => ORDER not in the range 0 to N inclusive * 1 => Error return from DLATM1 * 2 => Cannot scale to DMAX (max. sing. value is 0) * 3 => Error return from PZLAGHE * *----------------------------------------------------------------------- * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 ZZERO PARAMETER ( ZZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IDIST, IINFO, IPACK, IRSIGN, ISYM, LLB, $ MNMIN, MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION ALPHA, TEMP * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DLATM1, DSCAL, $ PCHK1MAT, PXERBLA, PZLAGHE, ZLASET * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * 1) Decode and Test the input parameters. * Initialize flags & seed. * * INFO = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IF( ( MYROW.GE.NPROW .OR. MYROW.LT.0 ) .OR. $ ( MYCOL.GE.NPCOL .OR. MYCOL.LT.0 ) )RETURN * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Decode DIST * IF( LSAME( DIST, 'U' ) ) THEN IDIST = 1 ELSE IF( LSAME( DIST, 'S' ) ) THEN IDIST = 2 ELSE IF( LSAME( DIST, 'N' ) ) THEN IDIST = 3 ELSE IDIST = -1 END IF * * Decode SYM * IF( LSAME( SYM, 'N' ) ) THEN ISYM = 1 IRSIGN = 0 ELSE IF( LSAME( SYM, 'P' ) ) THEN ISYM = 2 IRSIGN = 0 ELSE IF( LSAME( SYM, 'S' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE IF( LSAME( SYM, 'H' ) ) THEN ISYM = 2 IRSIGN = 1 ELSE ISYM = -1 END IF * * Decode PACK * IF( LSAME( PACK, 'N' ) ) THEN IPACK = 0 ELSE IPACK = 1 END IF * * Set certain internal parameters * MNMIN = MIN( M, N ) LLB = MIN( KL, M-1 ) * IF( ORDER.EQ.0 ) $ ORDER = N * * Set INFO if an error * IF( NPROW.EQ.-1 ) THEN INFO = -( 1600+CTXT_ ) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 16, INFO ) IF( INFO.EQ.0 ) THEN IF( M.NE.N .AND. ISYM.NE.1 ) THEN INFO = -2 ELSE IF( IDIST.EQ.-1 ) THEN INFO = -3 ELSE IF( ISYM.EQ.-1 ) THEN INFO = -5 ELSE IF( ABS( MODE ).GT.6 ) THEN INFO = -7 ELSE IF( ( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) .AND. COND.LT. $ ONE ) THEN INFO = -8 ELSE IF( KL.LT.0 ) THEN INFO = -10 ELSE IF( KU.LT.0 .OR. ( ISYM.NE.1 .AND. KL.NE.KU ) ) THEN INFO = -11 ELSE IF( ( ORDER.LT.0 ) .OR. ( ORDER.GT.N ) ) THEN INFO = -17 END IF END IF CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 16, 0, IDUM1, IDUM2, $ INFO ) END IF * * Check for unsupported features * IF( ISYM.NE.2 ) THEN INFO = -5 ELSE IF( IPACK.NE.0 ) THEN INFO = -12 ELSE IF( KL.GT.0 .AND. KL.LT.M-1 ) THEN INFO = -10 ELSE IF( KU.GT.0 .AND. KU.LT.N-1 ) THEN INFO = -11 ELSE IF( LLB.NE.0 .AND. LLB.NE.M-1 ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PZLATMS', -INFO ) RETURN END IF * * Initialize random number generator * DO 10 I = 1, 4 ISEED( I ) = MOD( ABS( ISEED( I ) ), 4096 ) 10 CONTINUE * IF( MOD( ISEED( 4 ), 2 ).NE.1 ) $ ISEED( 4 ) = ISEED( 4 ) + 1 * * 2) Set up D if indicated. * * Compute D according to COND and MODE * CALL DLATM1( MODE, COND, IRSIGN, IDIST, ISEED, D, MNMIN, IINFO ) * IF( IINFO.NE.0 ) THEN INFO = 1 RETURN END IF * * IF( MODE.NE.0 .AND. ABS( MODE ).NE.6 ) THEN * * Scale by DMAX * TEMP = ABS( D( 1 ) ) DO 20 I = 2, MNMIN TEMP = MAX( TEMP, ABS( D( I ) ) ) 20 CONTINUE * IF( TEMP.GT.ZERO ) THEN ALPHA = DMAX / TEMP ELSE INFO = 2 RETURN END IF * CALL DSCAL( MNMIN, ALPHA, D, 1 ) * END IF * CALL ZLASET( 'A', NP, NQ, ZZERO, ZZERO, A, DESCA( LLD_ ) ) * * Hermitian -- A = U D U' * CALL PZLAGHE( M, LLB, D, A, IA, JA, DESCA, ISEED, ORDER, WORK, $ LWORK, IINFO ) * RETURN * * End of PZLATMS * END scalapack-2.0.2/TESTING/EIG/pzlatran.f000644 000766 000024 00000015700 10363532303 017435 0ustar00juliestaff000000 000000 SUBROUTINE PZLATRAN( N, NB, A, IA, JA, DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IA, JA, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * * ======= * * PZLATRAN transpose a lower triangular matrix on to the upper * triangular portion of the same matrix. * * This is an auxiliary routine called by PZHETRD. * * Notes * ===== * * IA must equal 1 * JA must equal 1 * DESCA( MB_ ) must equal 1 * DESCA( NB_ ) must equal 1 * DESCA( RSRC_ ) must equal 1 * DESCA( CSRC_ ) must equal 1 * * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix to be transposed. * * NB (global input) INTEGER * The number of rows and columns to be transposed with each * message sent. NB has no impact on the result, it is striclty * a performance tuning parameter. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). On entry, the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix. On exit, the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is undefined (and may have been modified). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * Must be equal to 1. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * Must be equal to 1. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * DESCA( MB_ ) must equal 1 * DESCA( NB_ ) must equal 1 * DESCA( ICTXT_ ) must point to a square process grid * i.e. one where NPROW is equal to NPCOL * * WORK (local workspace) COMPLEX*16 array, dimension ( LWORK ) * * Where: * LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, IRECV, ISEND, J, JJ, JRECV, JSEND, $ LDA, MAXIRECV, MAXISEND, MAXJRECV, MAXJSEND, $ MINIRECV, MINISEND, MINJRECV, MINJSEND, MYCOL, $ MYROW, NP, NPCOL, NPROW, NQ, RECVNB, SENDNB, $ STARTCOL, STARTROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, ZTRRV2D, ZTRSD2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX, MIN * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Further details * * Because the processor grid is square each process needs only send * data to its transpose process. (Likewsie it need only receive * data from its transpose process.) Because the data decomposition * is cyclic, the local portion of the array is triangular. * * This routine requires that the data be buffered (i.e. copied) * on the sending process (because of the triangular shape) and * unbuffered on the receiving process. Hence, two local memory to * memory copies are performed within the communications routines * followed by a memory to memory copy outside of the communications * routines. It would be nice to avoid having back to back memory * to memory copies (as we do presently on the receiving processor). * This could be done by packaging the data ourselves in the sender * and then unpacking it directly into the matrix. However, this * code seems cleaner and so since this routine is not a significant * performance bottleneck we have left it this way. * * * * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * NP = NUMROC( N, 1, MYROW, 0, NPROW ) NQ = NUMROC( N, 1, MYCOL, 0, NPCOL ) * * IF( MYROW.EQ.MYCOL ) THEN * DO 20 J = 1, NP DO 10 I = J + 1, NQ A( J+( I-1 )*LDA ) = DCONJG( A( I+( J-1 )*LDA ) ) 10 CONTINUE 20 CONTINUE * ELSE IF( MYROW.GT.MYCOL ) THEN STARTROW = 1 STARTCOL = 2 ELSE IF( MYROW.EQ.MYCOL ) THEN STARTROW = 2 STARTCOL = 2 ELSE STARTROW = 2 STARTCOL = 1 END IF END IF * DO 50 JJ = 1, MAX( NP, NQ ), NB MINJSEND = STARTCOL + JJ - 1 MINJRECV = STARTROW + JJ - 1 MAXJSEND = MIN( MINJSEND+NB-1, NQ ) MAXJRECV = MIN( MINJRECV+NB-1, NP ) * SENDNB = MAXJSEND - MINJSEND + 1 RECVNB = MAXJRECV - MINJRECV + 1 * MINISEND = 1 MINIRECV = 1 MAXISEND = MIN( NP, JJ+SENDNB-1 ) MAXIRECV = MIN( NQ, JJ+RECVNB-1 ) * ISEND = MAXISEND - MINISEND + 1 IRECV = MAXIRECV - MINIRECV + 1 JSEND = MAXJSEND - MINJSEND + 1 JRECV = MAXJRECV - MINJRECV + 1 * * * DO 40 J = MINJRECV, MAXJRECV DO 30 I = MINIRECV, MAXIRECV + J - MAXJRECV WORK( I+( J-MINJRECV )*IRECV ) $ = DCONJG( A( J+( I-1 )*LDA ) ) 30 CONTINUE 40 CONTINUE * IF( IRECV.GT.0 .AND. JRECV.GT.0 ) $ CALL ZTRSD2D( ICTXT, 'U', 'N', IRECV, JRECV, WORK, IRECV, $ MYCOL, MYROW ) * IF( ISEND.GT.0 .AND. JSEND.GT.0 ) $ CALL ZTRRV2D( ICTXT, 'U', 'N', ISEND, JSEND, $ A( MINISEND+( MINJSEND-1 )*LDA ), LDA, $ MYCOL, MYROW ) * * 50 CONTINUE * END IF * RETURN * * End of PZLATRD * END scalapack-2.0.2/TESTING/EIG/pzmatgen.f000644 000766 000024 00000046353 10363532303 017437 0ustar00juliestaff000000 000000 SUBROUTINE PZMATGEN( ICTXT, AFORM, DIAG, M, N, MB, NB, A, LDA, $ IAROW, IACOL, ISEED, IROFF, IRNUM, ICOFF, $ ICNUM, MYROW, MYCOL, NPROW, NPCOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER*1 AFORM, DIAG INTEGER IACOL, IAROW, ICNUM, ICOFF, ICTXT, IRNUM, $ IROFF, ISEED, LDA, M, MB, MYCOL, MYROW, N, $ NB, NPCOL, NPROW * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PZMATGEN : Parallel Complex Double precision MATrix GENerator. * Generate (or regenerate) a distributed matrix A (or sub-matrix of A). * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation. The context itself is global. * * AFORM (global input) CHARACTER*1 * if AFORM = 'S' : A is returned is a symmetric matrix. * if AFORM = 'H' : A is returned is a Hermitian matrix. * if AFORM = 'T' : A is overwritten with the transpose of * what would normally be generated. * if AFORM = 'C' : A is overwritten with the conjugate trans- * pose of what would normally be generated. * otherwise a random matrix is generated. * * DIAG (global input) CHARACTER*1 * if DIAG = 'D' : A is diagonally dominant. * * M (global input) INTEGER * The number of rows in the generated distributed matrix. * * N (global input) INTEGER * The number of columns in the generated distributed * matrix. * * MB (global input) INTEGER * The row blocking factor of the distributed matrix A. * * NB (global input) INTEGER * The column blocking factor of the distributed matrix A. * * A (local output) COMPLEX*16, pointer into the local memory * to an array of dimension ( LDA, * ) containing the local * pieces of the distributed matrix. * * LDA (local input) INTEGER * The leading dimension of the array containing the local * pieces of the distributed matrix A. * * IAROW (global input) INTEGER * The row processor coordinate which holds the first block * of the distributed matrix A. * * IACOL (global input) INTEGER * The column processor coordinate which holds the first * block of the distributed matrix A. * * ISEED (global input) INTEGER * The seed number to generate the distributed matrix A. * * IROFF (local input) INTEGER * The number of local rows of A that have already been * generated. It should be a multiple of MB. * * IRNUM (local input) INTEGER * The number of local rows to be generated. * * ICOFF (local input) INTEGER * The number of local columns of A that have already been * generated. It should be a multiple of NB. * * ICNUM (local input) INTEGER * The number of local columns to be generated. * * MYROW (local input) INTEGER * The row process coordinate of the calling process. * * MYCOL (local input) INTEGER * The column process coordinate of the calling process. * * NPROW (global input) INTEGER * The number of process rows in the grid. * * NPCOL (global input) INTEGER * The number of process columns in the grid. * * Notes * ===== * * The code is originally developed by David Walker, ORNL, * and modified by Jaeyoung Choi, ORNL. * * Reference: G. Fox et al. * Section 12.3 of "Solving problems on concurrent processors Vol. I" * * ===================================================================== * * .. Parameters .. INTEGER MULT0, MULT1, IADD0, IADD1 PARAMETER ( MULT0=20077, MULT1=16838, IADD0=12345, $ IADD1=0 ) DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL SYMM, HERM, TRAN INTEGER I, IC, IK, INFO, IOFFC, IOFFR, IR, J, JK, $ JUMP1, JUMP2, JUMP3, JUMP4, JUMP5, JUMP6, $ JUMP7, MAXMN, MEND, MOFF, MP, MRCOL, MRROW, $ NEND, NOFF, NPMB, NQ, NQNB DOUBLE PRECISION DUMMY * .. * .. Local Arrays .. INTEGER IADD(2), IA1(2), IA2(2), IA3(2), IA4(2), $ IA5(2), IB1(2), IB2(2), IB3(2), IC1(2), IC2(2), $ IC3(2), IC4(2), IC5(2), IRAN1(2), IRAN2(2), $ IRAN3(2), IRAN4(2), ITMP1(2), ITMP2(2), $ ITMP3(2), JSEED(2), MULT(2) * .. * .. External Subroutines .. EXTERNAL JUMPIT, PXERBLA, SETRAN, XJUMPM * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDRAND EXTERNAL ICEIL, NUMROC, LSAME, PDRAND * .. * .. Executable Statements .. * * Test the input arguments * MP = NUMROC( M, MB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) TRAN = LSAME( AFORM, 'T' ) * INFO = 0 IF( .NOT.LSAME( DIAG, 'D' ) .AND. $ .NOT.LSAME( DIAG, 'N' ) ) THEN INFO = 3 ELSE IF( SYMM.OR.HERM ) THEN IF( M.NE.N ) THEN INFO = 5 ELSE IF( MB.NE.NB ) THEN INFO = 7 END IF ELSE IF( M.LT.0 ) THEN INFO = 4 ELSE IF( N.LT.0 ) THEN INFO = 5 ELSE IF( MB.LT.1 ) THEN INFO = 6 ELSE IF( NB.LT.1 ) THEN INFO = 7 ELSE IF( LDA.LT.0 ) THEN INFO = 9 ELSE IF( ( IAROW.LT.0 ).OR.( IAROW.GE.NPROW ) ) THEN INFO = 10 ELSE IF( ( IACOL.LT.0 ).OR.( IACOL.GE.NPCOL ) ) THEN INFO = 11 ELSE IF( MOD(IROFF,MB).GT.0 ) THEN INFO = 13 ELSE IF( IRNUM.GT.(MP-IROFF) ) THEN INFO = 14 ELSE IF( MOD(ICOFF,NB).GT.0 ) THEN INFO = 15 ELSE IF( ICNUM.GT.(NQ-ICOFF) ) THEN INFO = 16 ELSE IF( ( MYROW.LT.0 ).OR.( MYROW.GE.NPROW ) ) THEN INFO = 17 ELSE IF( ( MYCOL.LT.0 ).OR.( MYCOL.GE.NPCOL ) ) THEN INFO = 18 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZMATGEN', INFO ) RETURN END IF * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) NPMB = NPROW * MB NQNB = NPCOL * NB MOFF = IROFF / MB NOFF = ICOFF / NB MEND = ICEIL(IRNUM, MB) + MOFF NEND = ICEIL(ICNUM, NB) + NOFF * MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * * Symmetric or Hermitian matrix will be generated. * IF( SYMM.OR.HERM ) THEN * * First, generate the lower triangular part (with diagonal block) * JUMP1 = 1 JUMP2 = 2*NPMB JUMP3 = 2*M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = 2*MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 10 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 10 CONTINUE * JK = 1 DO 80 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 70 I = 1, NB IF( JK .GT. ICNUM ) GO TO 90 * IK = 1 DO 50 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB * IF( IOFFR .GT. IOFFC ) THEN DO 20 J = 1, MB IF( IK .GT. IRNUM ) GO TO 60 A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) IK = IK + 1 20 CONTINUE * ELSE IF( IOFFC .EQ. IOFFR ) THEN IK = IK + I - 1 IF( IK .GT. IRNUM ) GO TO 60 DO 30 J = 1, I-1 A(IK,JK) = DCMPLX( PDRAND(0), PDRAND(0) ) 30 CONTINUE IF( SYMM ) THEN A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) ELSE A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), ZERO ) DUMMY = PDRAND(0) END IF DO 40 J = 1, MB-I IF( IK+J .GT. IRNUM ) GO TO 60 A(IK+J,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) IF( HERM ) THEN A(IK,JK+J) = DCONJG( A(IK+J,JK) ) ELSE A(IK,JK+J) = A(IK+J,JK) END IF 40 CONTINUE IK = IK + MB - I + 1 ELSE IK = IK + MB END IF * CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 50 CONTINUE * 60 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 70 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 80 CONTINUE * * Next, generate the upper triangular part. * 90 CONTINUE MULT(1) = MULT0 MULT(2) = MULT1 IADD(1) = IADD0 IADD(2) = IADD1 JSEED(1) = ISEED JSEED(2) = 0 * JUMP1 = 1 JUMP2 = 2*NQNB JUMP3 = 2*N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = 2*NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 100 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 100 CONTINUE * IK = 1 DO 150 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 140 J = 1, MB IF( IK .GT. IRNUM ) GO TO 160 JK = 1 DO 120 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IF( IOFFC .GT. IOFFR ) THEN DO 110 I = 1, NB IF( JK .GT. ICNUM ) GO TO 130 IF( SYMM ) THEN A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) ELSE A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ TWO*PDRAND(0) - ONE ) END IF JK = JK + 1 110 CONTINUE ELSE JK = JK + NB END IF CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 120 CONTINUE * 130 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 140 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 150 CONTINUE 160 CONTINUE * * (Conjugate) Transposed matrix A will be generated. * ELSE IF( TRAN .OR. LSAME( AFORM, 'C' ) ) THEN * JUMP1 = 1 JUMP2 = 2*NQNB JUMP3 = 2*N JUMP4 = NPMB JUMP5 = MB JUMP6 = MRROW JUMP7 = 2*NB*MRCOL * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( MOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( NOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 170 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 170 CONTINUE * IK = 1 DO 220 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 210 J = 1, MB IF( IK .GT. IRNUM ) GO TO 230 JK = 1 DO 190 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 180 I = 1, NB IF( JK .GT. ICNUM ) GO TO 200 IF( TRAN ) THEN A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) ELSE A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ TWO*PDRAND(0) - ONE ) END IF JK = JK + 1 180 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 190 CONTINUE * 200 CONTINUE IK = IK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 210 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 220 CONTINUE 230 CONTINUE * * A random matrix is generated. * ELSE * JUMP1 = 1 JUMP2 = 2*NPMB JUMP3 = 2*M JUMP4 = NQNB JUMP5 = NB JUMP6 = MRCOL JUMP7 = 2*MB*MRROW * CALL XJUMPM( JUMP1, MULT, IADD, JSEED, IRAN1, IA1, IC1 ) CALL XJUMPM( JUMP2, MULT, IADD, IRAN1, ITMP1, IA2, IC2 ) CALL XJUMPM( JUMP3, MULT, IADD, IRAN1, ITMP1, IA3, IC3 ) CALL XJUMPM( JUMP4, IA3, IC3, IRAN1, ITMP1, IA4, IC4 ) CALL XJUMPM( JUMP5, IA3, IC3, IRAN1, ITMP1, IA5, IC5 ) CALL XJUMPM( JUMP6, IA5, IC5, IRAN1, ITMP3, ITMP1, ITMP2 ) CALL XJUMPM( JUMP7, MULT, IADD, ITMP3, IRAN1, ITMP1, ITMP2 ) CALL XJUMPM( NOFF, IA4, IC4, IRAN1, ITMP1, ITMP2, ITMP3 ) CALL XJUMPM( MOFF, IA2, IC2, ITMP1, IRAN1, ITMP2, ITMP3 ) CALL SETRAN( IRAN1, IA1, IC1 ) * DO 240 I = 1, 2 IB1(I) = IRAN1(I) IB2(I) = IRAN1(I) IB3(I) = IRAN1(I) 240 CONTINUE * JK = 1 DO 290 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB DO 280 I = 1, NB IF( JK .GT. ICNUM ) GO TO 300 IK = 1 DO 260 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB DO 250 J = 1, MB IF( IK .GT. IRNUM ) GO TO 270 A(IK,JK) = DCMPLX( ONE - TWO*PDRAND(0), $ ONE - TWO*PDRAND(0) ) IK = IK + 1 250 CONTINUE CALL JUMPIT( IA2, IC2, IB1, IRAN2 ) IB1(1) = IRAN2(1) IB1(2) = IRAN2(2) 260 CONTINUE * 270 CONTINUE JK = JK + 1 CALL JUMPIT( IA3, IC3, IB2, IRAN3 ) IB1(1) = IRAN3(1) IB1(2) = IRAN3(2) IB2(1) = IRAN3(1) IB2(2) = IRAN3(2) 280 CONTINUE * CALL JUMPIT( IA4, IC4, IB3, IRAN4 ) IB1(1) = IRAN4(1) IB1(2) = IRAN4(2) IB2(1) = IRAN4(1) IB2(2) = IRAN4(2) IB3(1) = IRAN4(1) IB3(2) = IRAN4(2) 290 CONTINUE 300 CONTINUE END IF * * Diagonally dominant matrix will be generated. * IF( LSAME( DIAG, 'D' ) ) THEN IF( MB.NE.NB ) THEN WRITE(*,*) 'Diagonally dominant matrices with rowNB not'// $ ' equal colNB is not supported!' RETURN END IF * MAXMN = MAX(M, N) JK = 1 DO 340 IC = NOFF+1, NEND IOFFC = ((IC-1)*NPCOL+MRCOL) * NB IK = 1 DO 320 IR = MOFF+1, MEND IOFFR = ((IR-1)*NPROW+MRROW) * MB IF( IOFFC.EQ.IOFFR ) THEN DO 310 J = 0, MB-1 IF( IK .GT. IRNUM ) GO TO 330 IF( HERM ) THEN A(IK,JK+J) = DCMPLX( $ ABS(DBLE(A(IK,JK+J)))+2*MAXMN, ZERO ) ELSE A(IK,JK+J)= DCMPLX( ABS(DBLE(A(IK,JK+J)))+MAXMN, $ ABS(DIMAG(A(IK,JK+J)))+ MAXMN ) END IF IK = IK + 1 310 CONTINUE ELSE IK = IK + MB END IF 320 CONTINUE 330 CONTINUE JK = JK + NB 340 CONTINUE END IF * RETURN * * End of PZMATGEN * END scalapack-2.0.2/TESTING/EIG/pznepdriver.f000644 000766 000024 00000050267 10363532303 020161 0ustar00juliestaff000000 000000 PROGRAM PZNEPDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March, 2000 * * Purpose * ======= * * PZNEPDRIVER is the main test program for the COMPLEX*16 * SCALAPACK NEP routines. This test driver performs a Schur * decomposition followed by residual check of a Hessenberg matrix. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'SCALAPACK, Version 1.4, NEP (Nonsymmetric EigenProblem) input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'NEP.out' output file name (if any) * 6 device out * 8 number of problems sizes * 1 2 3 4 6 10 100 200 vales of N * 3 number of NB's * 6 20 40 values of NB * 4 number of process grids (ordered pairs of P & Q) * 1 2 1 4 values of P * 1 2 4 1 values of Q * 20.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * ZPLXSZ INTEGER, default = 16 bytes. * ZPLXSZ indicate the length in bytes on the given platform * for a double precision complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * Further Details * =============== * * Contributed by Mark Fahey, March 2000. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ZPLXSZ, TOTMEM, MEMSIZ, NTESTS PARAMETER ( ZPLXSZ = 16, TOTMEM = 200000000, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20 ) COMPLEX*16 PADVAL, ZERO, ONE PARAMETER ( PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, III, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPREPAD, IPW, IPWR, IPZ, J, K, KFAIL, $ KPASS, KSKIP, KTESTS, LDA, LDWORK, LDZ, LWORK, $ MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, NOUT, $ NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, QRESID, TMFLOPS, ZNORM * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCZ( DLEN_ ), IDUM( 1 ), $ IERR( 2 ), NBVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZCHEKPAD, $ PZFILLPAD, PZGEMM, PZLAHQR, PZLASET, PZMATGEN, $ PZNEPFCHK, PZNEPINFO, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE, PZLANHS EXTERNAL ILCM, NUMROC, PDLAMCH, PZLANGE, PZLANHS * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PZNEPINFO( OUTFILE, NOUT, NMAT, NVAL, NTESTS, NNB, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.6 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) IPREPAD = IPREPAD + 1000 IMIDPAD = IMIDPAD + 1000 IPOSTPAD = IPOSTPAD + 1000 ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Initialize the array descriptor for the matrix Z * CALL DESCINIT( DESCZ, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 2 ) ) * LDA = DESCA( LLD_ ) LDZ = DESCZ( LLD_ ) LDWORK = DESCZ( LLD_ ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPZ = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWR = IPZ + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPWR + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD III = N / NB IF( III*NB.LT.N ) $ III = III + 1 III = 7*III / ILCM( NPROW, NPCOL ) * * LWORK = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, III ) LWORK = LWORK + MAX( 2*N, ( 8*ILCM( NPROW, NPCOL )+2 )** $ 2 ) * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZNEPFCHK and PZLANHS * WORKSIZ = LWORK + MAX( NP*DESCA( NB_ ), $ DESCA( MB_ )*NQ ) + IPOSTPAD * ELSE * WORKSIZ = LWORK + IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Schur reduction', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate matrix Z = In * CALL PZLASET( 'All', N, N, ZERO, ONE, MEM( IPZ ), 1, 1, $ DESCZ ) * * Generate matrix A upper Hessenberg * CALL PZMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) CALL PZLASET( 'Lower', MAX( 0, N-2 ), MAX( 0, N-2 ), $ ZERO, ZERO, MEM( IPA ), MIN( N, 3 ), 1, $ DESCA ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPZ-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANHS( 'I', N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANHS', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANHS', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, N, 1, MEM( IPWR-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, PADVAL ) * END IF * CALL SLBOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform NEP factorization * CALL PZLAHQR( .TRUE., .TRUE., N, 1, N, MEM( IPA ), DESCA, $ MEM( IPWR ), 1, N, MEM( IPZ ), DESCZ, $ MEM( IPW ), LWORK, IDUM, 0, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * )'PZLAHQR INFO=', INFO KFAIL = KFAIL + 1 GO TO 10 END IF * IF( CHECK ) THEN * * Check for memory overwrite in NEP factorization * CALL PZCHEKPAD( ICTXT, 'PZLAHQR (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLAHQR (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLAHQR (WR)', N, 1, $ MEM( IPWR-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLAHQR (WORK)', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || Z * H * Z**T - H0 || / ( N*|| H0 ||*EPS ) * CALL PZNEPFCHK( N, MEM( IPA ), 1, 1, DESCA, IASEED, $ MEM( IPZ ), 1, 1, DESCZ, ANORM, $ FRESID, MEM( IPW ) ) * CALL PZCHEKPAD( ICTXT, 'PZNEPFCHK (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZNEPFCHK (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZNEPFCHK (WORK)', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || (Z**T)*Z - In ||_1 * CALL PZLASET( 'All', N, N, ZERO, ONE, MEM( IPA ), 1, $ 1, DESCA ) CALL PZGEMM( 'Cong Tran', 'No transpose', N, N, N, $ -ONE, MEM( IPZ ), 1, 1, DESCZ, $ MEM( IPZ ), 1, 1, DESCZ, ONE, MEM( IPA ), $ 1, 1, DESCA ) ZNORM = PZLANGE( '1', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) QRESID = ZNORM / ( DBLE( N )*PDLAMCH( ICTXT, 'P' ) ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( ( FRESID-FRESID ).EQ.0.0D+0 ) .AND. $ ( QRESID.LE.THRESH ) .AND. $ ( ( QRESID-QRESID ).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 )FRESID WRITE( NOUT, FMT = 9985 )QRESID END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID QRESID = QRESID - QRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 18 N^3 flops for PxLAHQR * NOPS = 18.0D+0*DBLE( N )**3 * * Calculate total megaflops -- factorization only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q NEP Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- --- ---- ---- -------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, F8.2, $ 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||H - Q*S*Q^T|| / (||H|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Q^T*Q - I|| / ( N * eps ) ', G25.7 ) * STOP * * End of PZNEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pznepfchk.f000644 000766 000024 00000026244 10363532303 017577 0ustar00juliestaff000000 000000 SUBROUTINE PZNEPFCHK( N, A, IA, JA, DESCA, IASEED, Z, IZ, JZ, $ DESCZ, ANORM, FRESID, WORK ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March, 2000 * * .. Scalar Arguments .. INTEGER IA, IASEED, IZ, JA, JZ, N DOUBLE PRECISION ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) COMPLEX*16 A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZNEPFCHK computes the residual * || sub(Z)*sub( A )*sub(Z)**T - sub( Ao ) || / (||sub( Ao )||*eps*N), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A(IA:IA+M-1,JA:JA+N-1), sub( Z ) = Z(IZ:IZ+N-1,JZ:JZ+N-1) * and ||.|| stands for the infinity norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The order of sub( A ) and sub( Z ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub(Z)*sub( A )*sub(Z)**T - sub( Ao ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * Z (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_Z,LOCc(JZ+N-1)). On entry, this * array contains the local pieces of the N-by-N distributed * matrix sub( Z ). * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * ANORM (global input) DOUBLE PRECISION * The Infinity norm of sub( A ). * * FRESID (global output) DOUBLE PRECISION * The maximum (worst) factorizational error. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK). * LWORK >= MAX( NpA0 * NB_A, MB_A * NqA0 ) where * * IROFFA = MOD( IA-1, MB_A ), * ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * WORK is used to store a block of rows and a block of columns * of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * Further Details * =============== * * Contributed by Mark Fahey, March, 2000. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, IB, ICTXT, IIA, IOFFA, IROFF, $ IW, J, JB, JJA, JN, LDA, LDW, MYCOL, MYROW, NP, $ NPCOL, NPROW DOUBLE PRECISION EPS * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PZGEMM, $ PZLACPY, PZLASET, PZMATGEN, ZMATADD * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL ICEIL, NUMROC, PDLAMCH, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( ICTXT, 'eps' ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF LDW = MAX( 1, NP ) * * First compute H <- H * Z**T * CALL DESCSET( DESCW, DESCA( MB_ ), N, DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * DO 10 I = IA, IA + N - 1, DESCA( MB_ ) IB = MIN( IA+N-I, DESCA( MB_ ) ) * CALL PZLACPY( 'All', IB, N, A, I, JA, DESCA, WORK, 1, 1, $ DESCW ) CALL PZGEMM( 'No transpose', 'Cong Tran', IB, N, N, ONE, WORK, $ 1, 1, DESCW, Z, IZ, JZ, DESCZ, ZERO, A, I, JA, $ DESCA ) * DESCW( RSRC_ ) = MOD( DESCW( RSRC_ )+1, NPROW ) * 10 CONTINUE * * Then compute H <- Z * H = Z * H0 * Z**T * CALL DESCSET( DESCW, N, DESCA( NB_ ), DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, LDW ) * DO 20 J = JA, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * CALL PZLACPY( 'All', N, JB, A, IA, J, DESCA, WORK, 1, 1, $ DESCW ) CALL PZGEMM( 'No transpose', 'No transpose', N, JB, N, ONE, Z, $ IZ, JZ, DESCZ, WORK, 1, 1, DESCW, ZERO, A, IA, J, $ DESCA ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * 20 CONTINUE * * Compute H - H0 * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA-1 )*LDA IW = 1 JB = JN - JA + 1 DESCW( CSRC_ ) = IACOL * * Handle first block of columns separately * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PZMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ NP, JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL PZLASET( 'Lower', MAX( 0, N-2 ), JB, ZERO, ZERO, WORK, $ MIN( IW+2, N ), 1, DESCW ) CALL ZMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF * IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * DO 30 J = JN + 1, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PZMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, NP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL PZLASET( 'Lower', MAX( 0, N-IW-1 ), JB, ZERO, ZERO, $ WORK, MIN( N, IW+2 ), 1, DESCW ) CALL ZMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) 30 CONTINUE * * Calculate factor residual * FRESID = PZLANGE( 'I', N, N, A, IA, JA, DESCA, WORK ) / $ ( N*EPS*ANORM ) * RETURN * * End PZNEPFCHK * END scalapack-2.0.2/TESTING/EIG/pznepinfo.f000644 000766 000024 00000027760 10363532303 017623 0ustar00juliestaff000000 000000 SUBROUTINE PZNEPINFO( SUMMRY, NOUT, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March, 2000 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, NGRIDS, $ NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZNEPINFO gets needed startup information for PZHSEQR drivers * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (the order of the matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH * * WORK (local workspace) INTEGER array of dimension >= * MAX( 3, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ), used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Further Details * =============== * * Implemented by: M. Fahey, June 2000 * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a complex * single precision. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'NEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * )NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 )'N', LDNVAL GO TO 30 END IF READ( NIN, FMT = * )( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * )NNB IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 )'NB', LDNBVAL GO TO 30 END IF READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) * DO 10 I = 1, NNB IF( NBVAL( I ).LT.6 ) THEN WRITE( NOUT, FMT = 9992 )NBVAL( I ) GO TO 30 END IF 10 CONTINUE * * Get number of grids * READ( NIN, FMT = * )NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDPVAL GO TO 30 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDQVAL GO TO 30 END IF * * Get values of P and Q * READ( NIN, FMT = * )( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * )( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * )THRESH * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 20 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 20 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK QSQ^H by Schur Decomposition.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'Tests of the parallel ' // $ 'complex double precision Schur decomposition.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Residual = ||H-QSQ^H|| / ' // $ '(||H|| * eps * N )' WRITE( NOUT, FMT = 9999 ) $ ' Orthogonality residual = ||I - Q^HQ|| / ' // '( eps * N )' WRITE( NOUT, FMT = 9999 )'The matrix A is randomly ' // $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'An explanation of the input/output ' $ // 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or ' // $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the ' // 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the' // $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than' // $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'NEP time : Time in seconds to decompose the ' // ' matrix' WRITE( NOUT, FMT = 9999 )'MFLOPS : Rate of execution ' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 )'N ', $ ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 )'NB ', $ ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 )'P ', $ ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 )'Q ', $ ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 )EPS WRITE( NOUT, FMT = 9998 )THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 30 CONTINUE WRITE( NOUT, FMT = 9993 ) CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9992 FORMAT( ' Blocking size too small at ', I2, ' must be >=6.' ) * * End of PZNEPINFO * END scalapack-2.0.2/TESTING/EIG/pzrptseptst.f000644 000766 000024 00000005243 10363532303 020225 0ustar00juliestaff000000 000000 * * PROGRAM PZRPTSEPTST * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * Repeat parallel Hermitian eigenproblem test * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER MAXN, LWORK, LIWORK PARAMETER ( MAXN = 200, LWORK = 500000, $ LIWORK = 6*MAXN+4 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS, UPLO INTEGER CONTEXT, IAM, INFO, IPOSTPAD, IPREPAD, LDA, $ MATTYPE, N, NB, NPCOL, NPROCS, NPROW DOUBLE PRECISION ABSTOL, THRESH * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), ICLUSTR( MAXN ), IFAIL( MAXN ), $ ISEED( 4 ), IWORK( LIWORK ) DOUBLE PRECISION GAP( MAXN ), RWORK( LWORK ), WIN( MAXN ), $ WNEW( MAXN ) COMPLEX*16 A( MAXN*MAXN ), COPYA( MAXN*MAXN ), $ WORK( LWORK ), Z( MAXN*MAXN ) * .. * * * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDINIT, $ BLACS_PINFO, BLACS_SETUP, DESCINIT, PZSEPTST * .. * .. Executable Statements .. * IPREPAD = 3 IPOSTPAD = 3 LDA = MAXN * * These lines should be replaced by the output from pxSEPdriver * * ISEED( 1 ) = 2312 ISEED( 2 ) = 3709 ISEED( 3 ) = 666 ISEED( 4 ) = 3371 UPLO = 'U' SUBTESTS = 'Y' N = 33 NPROW = 2 NPCOL = 2 NB = 4 MATTYPE = 9 * note: the printout often makes a mess of ABSTOL ABSTOL = 0.1175494351D-37 THRESH = .350000D+01 * CALL BLACS_PINFO( IAM, NPROCS ) IF( NPROCS.LT.1 ) THEN * NPROCS = NPROW*NPCOL CALL BLACS_SETUP( IAM, NPROCS ) END IF CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, CONTEXT, LDA, INFO ) * CALL PZSEPTST( DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, N, $ ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, WORK, $ LWORK-IPREPAD-IPOSTPAD, RWORK, $ LWORK-IPREPAD-IPOSTPAD, IWORK, $ LIWORK-IPREPAD-IPOSTPAD, 6, INFO ) * * * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * * CALL BLACS_EXIT( 0 ) STOP * * * * End of PZRPTSEPTST * END scalapack-2.0.2/TESTING/EIG/pzsdpsubtst.f000644 000766 000024 00000040306 10607176122 020213 0ustar00juliestaff000000 000000 SUBROUTINE PZSDPSUBTST( WKNOWN, UPLO, N, THRESH, ABSTOL, A, COPYA, $ Z, IA, JA, DESCA, WIN, WNEW, IPREPAD, $ IPOSTPAD, WORK, LWORK, RWORK, LRWORK, $ LWORK1, IWORK, LIWORK, RESULT, TSTNRM, $ QTQNRM, NOUT ) * * -- ScaLAPACK testing routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * February 28, 2000 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER UPLO INTEGER IA, IPOSTPAD, IPREPAD, JA, LIWORK, LRWORK, $ LWORK, LWORK1, N, NOUT, RESULT DOUBLE PRECISION ABSTOL, QTQNRM, THRESH, TSTNRM * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) DOUBLE PRECISION RWORK( * ), WIN( * ), WNEW( * ) COMPLEX*16 A( * ), COPYA( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZSDPSUBTST calls PZHEEVD and then tests the output of * PZHEEVD * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PZHEEVD when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PZSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) COMPLEX*16 array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PZHEEVD for a description of block cyclic layout. * The test matrix, which is then modified by PZHEEVD * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) COMPLEX*16 array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) COMPLEX*16 array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PZSEPCHK and PZSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) DOUBLE PRECISION array, dimension (N) * The eigenvalues as copmuted by this call to PZHEEVD * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * RWORK has already been padded front and back, * use RWORK(1+IPREPAD) * * LRWORK (local input) INTEGER * The actual length of the array RWORK after padding. * * LWORK1 (local input) INTEGER * The amount of DOUBLE PRECISION workspace to pass to PZHEEVD * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PZHEEVD * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285D+0, FIVE = 5.0D+0, $ NEGONE = -1.0D+0 ) COMPLEX*16 CPADVAL PARAMETER ( CPADVAL = ( 13.989D+0, 1.93D+0 ) ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) COMPLEX*16 CZERO, CONE, CNEGONE PARAMETER ( CZERO = 0.0D+0, CONE = 1.0D+0, $ CNEGONE = -1.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IAM, INFO, ISIZEHEEVD, ISIZEHEEVX, $ ISIZESUBTST, ISIZETST, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ, RES, RSIZECHK, RSIZEHEEVD, $ RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, RSIZETST, $ SIZEHEEVD, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST DOUBLE PRECISION EPS, EPSNORMA, ERROR, MAXERROR, MINERROR, NORM, $ NORMWIN, SAFMIN, ULP * .. * .. Local Arrays .. INTEGER ITMP( 2 ) * .. * .. External Functions .. * INTEGER NUMROC DOUBLE PRECISION PZLANGE, PZLANHE, PDLAMCH EXTERNAL NUMROC, PZLANGE, PZLANHE, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, ZLACPY, IGAMN2D, IGAMX2D, $ PZCHEKPAD, PZFILLPAD, PZGEMM, PZHEEVD, PZLASET, $ PZLASIZESEP, PZSEPCHK, PICHEKPAD, PIFILLPAD, $ PDCHEKPAD, PDFILLPAD, SLBOOT, SLTIMER * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, DBLE * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL PZLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, SIZESUBTST, $ RSIZESUBTST, ISIZESUBTST, SIZETST, RSIZETST, $ ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PDLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1+IPREPAD ) ), $ ABS( WIN( N+IPREPAD ) ), NORMWIN ) * DO 10 I = 1, LWORK1, 1 RWORK( I+IPREPAD ) = 14.3D+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I ) = 14 20 CONTINUE DO 30 I = 1, LWORK, 1 WORK( I+IPREPAD ) = ( 15.63D+0, 1.1D+0 ) 30 CONTINUE * DO 40 I = 1, N WNEW( I+IPREPAD ) = 3.14159D+0 40 CONTINUE * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 60 RESULT = 0 * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL ZLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PZFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, CPADVAL ) * CALL PZFILLPAD( DESCA( CTXT_ ), NP, NQ, Z, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, CPADVAL+1.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), LWORK1, 1, RWORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0D+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PZFILLPAD( DESCA( CTXT_ ), LWORK, 1, WORK, LWORK, IPREPAD, $ IPOSTPAD, CPADVAL+4.1D+0 ) * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) * CALL PZHEEVD( 'V', UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), SIZEHEEVD, RWORK( 1+IPREPAD ), $ LWORK1, IWORK( 1+IPREPAD ), LIWORK, INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEEVD-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, CPADVAL ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEEVD-Z', NP, NQ, Z, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ CPADVAL+1.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEEVD-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEEVD-rWORK', LWORK1, 1, $ RWORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEEVD-WORK', LWORK, 1, WORK, $ LWORK, IPREPAD, IPOSTPAD, CPADVAL+4.1D+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEEVD-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * * Check INFO * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PZLANHE( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ RWORK )*EPS END IF * * Note that a couple key variables get redefined in PZSEPCHK * as described by this table: * * PZSEPTST name PZSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * Perform the |AQ - QE| test * CALL PDFILLPAD( DESCA( CTXT_ ), RSIZECHK, 1, RWORK, RSIZECHK, $ IPREPAD, IPOSTPAD, 4.3D+0 ) * CALL PZSEPCHK( N, N, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCA, A( 1+IPREPAD ), $ IA, JA, DESCA, WNEW( 1+IPREPAD ), $ RWORK( 1+IPREPAD ), RSIZECHK, TSTNRM, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZSDPCHK-rWORK', RSIZECHK, 1, $ RWORK, RSIZECHK, IPREPAD, IPOSTPAD, 4.3D+0 ) * IF( RES.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9995 ) END IF * * Perform the |QTQ - I| test * CALL PDFILLPAD( DESCA( CTXT_ ), RSIZEQTQ, 1, RWORK, RSIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3D+0 ) * * RES = 0 ULP = PDLAMCH( DESCA( CTXT_ ), 'P' ) CALL PZLASET( 'A', N, N, CZERO, CONE, A( 1+IPREPAD ), IA, JA, $ DESCA ) CALL PZGEMM( 'Conjugate transpose', 'N', N, N, N, CNEGONE, $ Z( 1+IPREPAD ), IA, JA, DESCA, Z( 1+IPREPAD ), IA, $ JA, DESCA, CONE, A( 1+IPREPAD ), IA, JA, DESCA ) NORM = PZLANGE( '1', N, N, A( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ) ) QTQNRM = NORM / ( DBLE( MAX( N, 1 ) )*ULP ) IF( QTQNRM.GT.THRESH ) THEN RES = 1 END IF CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZSEPQTQ-rWORK', RSIZEQTQ, 1, $ RWORK, RSIZEQTQ, IPREPAD, IPOSTPAD, 4.3D+0 ) * IF( RES.NE.0 ) THEN RESULT = 1 WRITE( NOUT, FMT = 9994 ) END IF * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN .AND. N.GT.0 ) THEN * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN MAXERROR = 0.0D+00 * DO 50 I = 1, N ERROR = ABS( WIN( I+IPREPAD )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 50 CONTINUE MINERROR = MIN( MAXERROR, MINERROR ) * IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF * * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 60 CONTINUE * RETURN * 9999 FORMAT( 'PZHEEVD returned INFO=', I7 ) 9998 FORMAT( 'PZSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PZSDPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PZHEEVD returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'PZHEEVD failed the |AQ -QE| test' ) 9994 FORMAT( 'PZHEEVD failed the |QTQ -I| test' ) * * End of PZSDPSUBTST * END scalapack-2.0.2/TESTING/EIG/pzsepchk.f000644 000766 000024 00000024221 11750130340 017423 0ustar00juliestaff000000 000000 * * SUBROUTINE PZSEPCHK( MS, NV, A, IA, JA, DESCA, EPSNORMA, THRESH, $ Q, IQ, JQ, DESCQ, C, IC, JC, DESCC, W, WORK, $ LWORK, TSTNRM, RESULT ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER IA, IC, IQ, JA, JC, JQ, LWORK, MS, NV, RESULT DOUBLE PRECISION EPSNORMA, THRESH, TSTNRM * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCC( * ), DESCQ( * ) DOUBLE PRECISION W( * ), WORK( * ) COMPLEX*16 A( * ), C( * ), Q( * ) * .. * * Purpose * ======= * * Compute |AQ- QL| / (EPSNORMA * N) * where EPSNORMA = (abstol + eps)*norm(A) when called by pdsqpsubtst. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * MP = number of local rows in A, C and Q * MQ = number of local columns in A * NQ = number of local columns in C and Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in A, C and Q * Also, the number of global columns in A * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q. * * A (local input) COMPLEX*16 pointer to an * array in local memory of dimension (LLD_A, LOCc(JA+N-1)). * This array contains the local pieces of the MS-by-MS * distributed test matrix A * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * EPSNORMA (input) DOUBLE PRECISION * abstol + eps * inf.norm(A) * Abstol is absolute tolerence for the eigenvalues and is set * in the calling routines, pdsepsubtst and pdsqpsubtst. * * THRESH (input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) COMPLEX*16 array * global dimension (MS, NV), local dimension (DESCA(DLEN_), NQ) * * Contains the eigenvectors as computed by PZHEEVX * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Q. * * C (local workspace) COMPLEX*16 array, * global dimension (NV, NV), local dimension (DESCA(DLEN_), MQ) * * Accumulator for computing AQ -QL * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * W (global input) DOUBLE PRECISION array, dimension (NV) * * Contains the computed eigenvalues * * WORK (local workspace) DOUBLE PRECISION array, * dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / ( EPSNROMA * MS ) * * RESULT (global output) INTEGER * 0 if the test passes i.e. * |AQ -QL| / (abstol + eps * norm(A) ) <= n* THRESH * 1 if the test fails i.e. * |AQ -QL| / (abstol + eps * norm(A) ) > n * THRESH * * .. Local Scalars .. * INTEGER INFO, J, LOCALCOL, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ, PCOL DOUBLE PRECISION NORM * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, NEGONE PARAMETER ( ONE = 1.0D+0, NEGONE = -1.0D+0 ) * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC DOUBLE PRECISION PZLANGE EXTERNAL INDXG2L, INDXG2P, NUMROC, PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PXERBLA, PZGEMM, $ ZDSCAL, ZLACPY * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * RESULT = 0 * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 CALL CHK1MAT( MS, 1, MS, 1, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IQ, JQ, DESCQ, 12, INFO ) CALL CHK1MAT( MS, 1, NV, 2, IC, JC, DESCC, 16, INFO ) * IF( INFO.EQ.0 ) THEN * MP = NUMROC( MS, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( NV, DESCA( NB_ ), MYCOL, 0, NPCOL ) * IF( IQ.NE.1 ) THEN INFO = -10 ELSE IF( JQ.NE.1 ) THEN INFO = -11 ELSE IF( IA.NE.1 ) THEN INFO = -4 ELSE IF( JA.NE.1 ) THEN INFO = -5 ELSE IF( IC.NE.1 ) THEN INFO = -14 ELSE IF( JC.NE.1 ) THEN INFO = -15 ELSE IF( LWORK.LT.NQ ) THEN INFO = -19 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PZSEPCHK', -INFO ) RETURN END IF * * C = Q * W * CALL ZLACPY( 'A', MP, NQ, Q, DESCQ( LLD_ ), C, DESCC( LLD_ ) ) * * DO 10 J = 1, NV PCOL = INDXG2P( J, DESCC( NB_ ), 0, 0, NPCOL ) LOCALCOL = INDXG2L( J, DESCC( NB_ ), 0, 0, NPCOL ) * IF( MYCOL.EQ.PCOL ) THEN CALL ZDSCAL( MP, W( J ), C( ( LOCALCOL-1 )*DESCC( LLD_ )+ $ 1 ), 1 ) END IF 10 CONTINUE * * * C = C - A * Q * CALL PZGEMM( 'N', 'N', MS, NV, MS, NEGONE, A, 1, 1, DESCA, Q, 1, $ 1, DESCQ, ONE, C, 1, 1, DESCC ) * * Compute the norm of C * * NORM = PZLANGE( 'M', MS, NV, C, 1, 1, DESCC, WORK ) * * TSTNRM = NORM / EPSNORMA / MAX( MS, 1 ) * IF( TSTNRM.GT.THRESH .OR. ( TSTNRM-TSTNRM.NE.0.0D0 ) ) THEN RESULT = 1 END IF * * RETURN * * End of PZSEPCHK * END scalapack-2.0.2/TESTING/EIG/pzsepdriver.f000644 000766 000024 00000024400 10363532303 020154 0ustar00juliestaff000000 000000 * * PROGRAM PZSEPDRIVER * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * Parallel COMPLEX*16 Hermitian eigenproblem test driver * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. INTSIZ and DBLSIZ * indicate the length in bytes on the given platform for an integer * and a double precision real. * For example, on our system with 8 MB of memory, TOTMEM=6500000 * (leaves 1.5 MB for OS, code, BLACS buffer, etc), the length of a * DOUBLE is 8, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * The full tester requires approximately (5 n + 5 n^2/p + slop) * COMPLEX*16 words and 6*n integer words. * So, TOTMEM should be set to at least 1.1 * 8 * (5n + 5n^2/p) * * WHAT WE TEST * ============ * * This routine tests PZHEEVX, the expert driver for the parallel * Hermitian eigenvalue problem. We would like to cover all * possible combinations of: matrix size, process configuration * (nprow and npcol), block size (nb), matrix type (??), range * of eigenvalue (all, by value, by position), sorting options, * and upper vs. lower storage. * * We intend to provide two types of test input files, an * installation test and a thorough test. * * We also intend that the reports be meaningful. Our input file * will allow multiple requests where each request is a cross product * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, ZPLXSZ, NIN PARAMETER ( TOTMEM = 2000000, ZPLXSZ = 16, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / ZPLXSZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PDLACHKIEEE, PDLASNBT, PZSEPREQ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PDLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9991 ) WRITE( NOUT, FMT = 9990 ) END IF * CALL PDLACHKIEEE( ISIEEE, DLAMCH( 'O' ), DLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9987 ) END IF GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK Hermitian Eigendecomposition routines.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'Hermitian eigenvalue routine: PZHEEVX.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 )' ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )' ||Q^T*Q - I|| ' // '/ (N * eps)' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ ' : the QTQ norm is allowed to exceed THRESH' // $ ' for those eigenvectors' WRITE( NOUT, FMT = 9999 )' : which could not be ' // $ 'reorthogonalized for lack of workspace.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see pZSEPtst.f).' WRITE( NOUT, FMT = 9999 )'SUB : Subtests ' // $ '(see pZSEPtst).f' WRITE( NOUT, FMT = 9999 )'CHK : ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )'QTQ : ||Q^T*Q - I||/ (N * eps)' WRITE( NOUT, FMT = 9999 ) $ ' : when the adjusted QTQ exceeds THRESH', $ ' the adjusted QTQ norm is printed' WRITE( NOUT, FMT = 9999 ) $ ' : otherwise the true QTQ norm is printed' WRITE( NOUT, FMT = 9999 ) $ 'If NT>1, CHK and QTQ are the max over all ' // $ 'eigen request tests' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PZSEPREQ( NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, NSKIPPED, $ NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * * CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', '') * * * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * * 9999 FORMAT( A ) 9998 FORMAT( ' I am about to check to make sure that overflow' ) 9997 FORMAT( ' is handled in the ieee default manner. If this' ) 9996 FORMAT( ' is the last output you see, you should assume' ) 9995 FORMAT( ' that overflow caused a floating point exception.' ) 9994 FORMAT( ' In that case, we recommend that you add -DNO_IEEE' ) 9993 FORMAT( ' to the CDEFS line in SLmake.inc.' ) 9992 FORMAT( ' Alternatively, you could set CDEFS in SLmake.inc ' ) 9991 FORMAT( ' to enable the default ieee behaviour, However, this' ) 9990 FORMAT( ' may result in good or very bad performance.' ) 9989 FORMAT( ' Either signed zeroes or signed infinities ' ) 9988 FORMAT( ' work incorrectly or your system. Change your' ) 9987 FORMAT( ' SLmake.inc as suggested above.' ) * 9986 FORMAT( ' Your system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP SUB WALL CPU ', $ ' CHK QTQ CHECK' ) 9978 FORMAT( ' ----- --- --- --- --- --- -------- --------', $ ' --------- --------- -----' ) * * End of PZSEPDRIVER * END scalapack-2.0.2/TESTING/EIG/pzsepqtq.f000644 000766 000024 00000025417 10363532303 017477 0ustar00juliestaff000000 000000 * * SUBROUTINE PZSEPQTQ( MS, NV, THRESH, Q, IQ, JQ, DESCQ, C, IC, JC, $ DESCC, PROCDIST, ICLUSTR, GAP, WORK, LWORK, $ QTQNRM, INFO, RES ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IC, INFO, IQ, JC, JQ, LWORK, MS, NV, RES DOUBLE PRECISION QTQNRM, THRESH * .. * .. Array Arguments .. * INTEGER DESCC( * ), DESCQ( * ), ICLUSTR( * ), $ PROCDIST( * ) DOUBLE PRECISION GAP( * ), WORK( * ) COMPLEX*16 C( * ), Q( * ) * .. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Purpose * ======= * * Compute |I - QT * Q| / (ulp * n) * * Arguments * ========= * * NP = number of local rows in C * NQ = number of local columns in C and Q * * MS (global input) INTEGER * Matrix size. * The number of global rows in Q * * NV (global input) INTEGER * Number of eigenvectors * The number of global columns in C and Q * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * Q (local input) COMPLEX*16 array, * global dimension (MS, NV), local dimension (LDQ, NQ) * * Contains the eigenvectors as computed by PZSTEIN * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Q. * * C (local workspace) COMPLEX*16 array, * global dimension (NV, NV), local dimension (DESCC(DLEN_), NQ) * * Accumulator for computing I - QT * Q * * IC (global input) INTEGER * C's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JC (global input) INTEGER * C's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * W (input) DOUBLE PRECISION array, dimension (NV) * All procesors have an identical copy of W() * * Contains the computed eigenvalues * * PROCDIST (global input) INTEGER array dimension (NPROW*NPCOL+1) * Identifies which eigenvectors are the last to be computed * by a given process * * ICLUSTR (global input) INTEGER array dimension (2*P) * This input array contains indices of eigenvectors * corresponding to a cluster of eigenvalues that could not be * orthogonalized due to insufficient workspace. * This should be the output of PZSTEIN. * * GAP (global input) DOUBLE PRECISION array, dimension (P) * This input array contains the gap between eigenvalues whose * eigenvectors could not be orthogonalized. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. * LWORK >= 2 + MAX( DESCC( MB_ ), 2 )*( 2*NP0+MQ0 ) * Where: * NP0 = NUMROC( NV, DESCC( MB_ ), 0, 0, NPROW ) * MQ0 = NUMROC( NV, DESCC( NB_ ), 0, 0, NPCOL ) * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / EPS * * RES (global output) INTEGER * 0 if the test passes i.e. |I - QT * Q| / (ulp * n) <= THRESH * 1 if the test fails i.e. |I - QT * Q| / (ulp * n) > THRESH * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO, ONE, NEGONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, $ NEGONE = -1.0D+0 ) * .. * .. Intrinsic Functions .. * INTRINSIC DBLE, DCMPLX, MAX * .. * .. Local Scalars .. INTEGER CLUSTER, FIRSTP, IMAX, IMIN, JMAX, JMIN, LWMIN, $ MQ0, MYCOL, MYROW, NEXTP, NP0, NPCOL, NPROW DOUBLE PRECISION NORM, QTQNRM2, ULP * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL NUMROC, PDLAMCH, PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PXERBLA, PZGEMM, $ PZLASET, PZMATADD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * RES = 0 ULP = PDLAMCH( DESCC( CTXT_ ), 'P' ) * CALL BLACS_GRIDINFO( DESCC( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 CALL CHK1MAT( MS, 1, MS, 2, IQ, JQ, DESCQ, 7, INFO ) CALL CHK1MAT( NV, 1, MS, 2, IC, JC, DESCC, 11, INFO ) * IF( INFO.EQ.0 ) THEN NP0 = NUMROC( NV, DESCC( MB_ ), 0, 0, NPROW ) MQ0 = NUMROC( NV, DESCC( NB_ ), 0, 0, NPCOL ) * LWMIN = 2 + MAX( DESCC( MB_ ), 2 )*( 2*NP0+MQ0 ) * IF( IQ.NE.1 ) THEN INFO = -5 ELSE IF( JQ.NE.1 ) THEN INFO = -6 ELSE IF( IC.NE.1 ) THEN INFO = -9 ELSE IF( JC.NE.1 ) THEN INFO = -10 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -16 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCC( CTXT_ ), 'PZSEPQTQ', -INFO ) RETURN END IF * * C = Identity matrix * CALL PZLASET( 'A', NV, NV, ZERO, ONE, C, IC, JC, DESCC ) * * C = C - QT * Q * IF( NV*MS.GT.0 ) THEN CALL PZGEMM( 'Conjugate transpose', 'N', NV, NV, MS, NEGONE, Q, $ 1, 1, DESCQ, Q, 1, 1, DESCQ, ONE, C, 1, 1, DESCC ) END IF * * Allow for poorly orthogonalized eigenvectors for large clusters * NORM = PZLANGE( '1', NV, NV, C, 1, 1, DESCC, WORK ) QTQNRM = NORM / ( DBLE( MAX( MS, 1 ) )*ULP ) * CLUSTER = 1 10 CONTINUE DO 20 FIRSTP = 1, NPROW*NPCOL IF( PROCDIST( FIRSTP ).GE.ICLUSTR( 2*( CLUSTER-1 )+1 ) ) $ GO TO 30 20 CONTINUE 30 CONTINUE * IMIN = ICLUSTR( 2*CLUSTER-1 ) JMAX = ICLUSTR( 2*CLUSTER ) * * IF( IMIN.EQ.0 ) $ GO TO 60 * DO 40 NEXTP = FIRSTP, NPROW*NPCOL IMAX = PROCDIST( NEXTP ) JMIN = IMAX + 1 * * CALL PZMATADD( IMAX-IMIN+1, JMAX-JMIN+1, ZERO, C, IMIN, JMIN, $ DESCC, DCMPLX( GAP( CLUSTER ) / 0.01D+0 ), C, $ IMIN, JMIN, DESCC ) CALL PZMATADD( JMAX-JMIN+1, IMAX-IMIN+1, ZERO, C, JMIN, IMIN, $ DESCC, DCMPLX( GAP( CLUSTER ) / 0.01D+0 ), C, $ JMIN, IMIN, DESCC ) IMIN = IMAX * IF( ICLUSTR( 2*CLUSTER ).LT.PROCDIST( NEXTP+1 ) ) $ GO TO 50 40 CONTINUE 50 CONTINUE * CLUSTER = CLUSTER + 1 GO TO 10 60 CONTINUE * * Compute the norm of C * NORM = PZLANGE( '1', NV, NV, C, 1, 1, DESCC, WORK ) * QTQNRM2 = NORM / ( DBLE( MAX( MS, 1 ) )*ULP ) * IF( QTQNRM2.GT.THRESH ) THEN RES = 1 QTQNRM = QTQNRM2 END IF RETURN * * End of PZSEPQTQ * END scalapack-2.0.2/TESTING/EIG/pzseprdriver.f000644 000766 000024 00000021337 11623527140 020347 0ustar00juliestaff000000 000000 PROGRAM PZSEPRDRIVER * * Parallel COMPLEX*16 symmetric eigenproblem test driver for PZSYEVR * IMPLICIT NONE * * The user should modify TOTMEM to indicate the maximum amount of * memory in bytes her system has. Remember to leave room in memory * for operating system, the BLACS buffer, etc. DBLESZ * indicates the length in bytes on the given platform for a number, * real for SINGLE/DOUBLE PRECISION, and complex for COMPLEX/COMPLEX*16. * For example, on a standard system, the length of a * DBLE is 16, and an integer takes up 4 bytes. Some playing around * to discover what the maximum value you can set MEMSIZ to may be * required. * All arrays used by factorization and solve are allocated out of * big array called MEM. * * TESTS PERFORMED * =============== * * This routine performs tests for combinations of: matrix size, process * configuration (nprow and npcol), block size (nb), * matrix type, range of eigenvalue (all, by value, by index), * and upper vs. lower storage. * * It returns an error message when heterogeneity is detected. * * The input file allows multiple requests where each one is * of the following sets: * matrix sizes: n * process configuration triples: nprow, npcol, nb * matrix types: * eigenvalue requests: all, by value, by position * storage (upper vs. lower): uplo * * TERMS: * Request - means a set of tests, which is the cross product of * a set of specifications from the input file. * Test - one element in the cross product, i.e. a specific input * size and type, process configuration, etc. * * .. Parameters .. * INTEGER TOTMEM, DBLESZ, NIN PARAMETER ( TOTMEM = 100000000, DBLESZ = 16, NIN = 11 ) INTEGER MEMSIZ PARAMETER ( MEMSIZ = TOTMEM / DBLESZ ) * .. * .. Local Scalars .. CHARACTER HETERO CHARACTER*80 SUMMRY, USRINFO INTEGER CONTEXT, IAM, INFO, ISIEEE, MAXNODES, NNOCHECK, $ NOUT, NPASSED, NPROCS, NSKIPPED, NTESTS * .. * .. Local Arrays .. * INTEGER ISEED( 4 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH * .. * .. External Subroutines .. * EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_PINFO, BLACS_SETUP, $ IGAMN2D, PDLACHKIEEE, PDLASNBT, PZSEPRREQ * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) * * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( UNIT = NIN, FILE = 'SEPR.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) READ( NIN, FMT = * )MAXNODES READ( NIN, FMT = * )HETERO END IF * IF( NPROCS.LT.1 ) THEN CALL BLACS_SETUP( IAM, MAXNODES ) NPROCS = MAXNODES END IF * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', 1, NPROCS ) * CALL PDLASNBT( ISIEEE ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ( ISIEEE.NE.0 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = 9996 ) WRITE( NOUT, FMT = 9995 ) END IF * CALL PDLACHKIEEE( ISIEEE, DLAMCH( 'O' ), DLAMCH( 'U' ) ) * CALL IGAMN2D( CONTEXT, 'a', ' ', 1, 1, ISIEEE, 1, 1, 1, -1, -1, $ 0 ) * IF( ISIEEE.EQ.0 ) THEN GO TO 20 END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 ) END IF * END IF * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) $ 'Test ScaLAPACK symmetric eigendecomposition routine.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = 9999 )' ' WRITE( NOUT, FMT = 9999 )'Running tests of the parallel ' // $ 'symmetric eigenvalue routine: PZSYEVR.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 )' ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )' ||Q^T*Q - I|| ' // '/ (N * eps)' WRITE( NOUT, FMT = 9999 ) WRITE( NOUT, FMT = 9999 )'An explanation of the ' // $ 'input/output parameters follows:' WRITE( NOUT, FMT = 9999 )'RESULT : passed; or ' // $ 'an indication of which eigen request test failed' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns ' // $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks' // $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less ' // $ 'than THRESH, RESULT = PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'TYP : matrix type (see PZSEPRTST).' WRITE( NOUT, FMT = 9999 )'SUB : Subtests (Y/N).' WRITE( NOUT, FMT = 9999 )'WALL : Wallclock time.' WRITE( NOUT, FMT = 9999 )'CPU : CPU time.' WRITE( NOUT, FMT = 9999 )'CHK : ||AQ - QL|| ' // $ '/ ((abstol + ||A|| * eps) * N)' WRITE( NOUT, FMT = 9999 )'QTQ : ||Q^T*Q - I||/ (N * eps)' WRITE( NOUT, FMT = 9999 ) $ ' : when the adjusted QTQ norm exceeds THRESH', $ ' it is printed,' WRITE( NOUT, FMT = 9999 ) $ ' otherwise the true QTQ norm is printed.' WRITE( NOUT, FMT = 9999 ) $ ' : If more than one test is done, CHK and QTQ ' WRITE( NOUT, FMT = 9999 ) $ ' are the max over all eigentests performed.' WRITE( NOUT, FMT = 9999 ) $ 'TEST : EVR - testing PZSYEVR' WRITE( NOUT, FMT = 9999 )' ' END IF * NTESTS = 0 NPASSED = 0 NSKIPPED = 0 NNOCHECK = 0 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) END IF * 10 CONTINUE * ISEED( 1 ) = 139 ISEED( 2 ) = 1139 ISEED( 3 ) = 2139 ISEED( 4 ) = 3139 * CALL PZSEPRREQ( HETERO, NIN, MEM, MEMSIZ, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) IF( INFO.EQ.0 ) $ GO TO 10 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9985 )NTESTS WRITE( NOUT, FMT = 9984 )NPASSED WRITE( NOUT, FMT = 9983 )NNOCHECK WRITE( NOUT, FMT = 9982 )NSKIPPED WRITE( NOUT, FMT = 9981 )NTESTS - NPASSED - NSKIPPED - $ NNOCHECK WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF * * Uncomment this line on SUN systems to avoid the useless print out * c CALL IEEE_FLAGS( 'clear', 'exception', 'underflow', ' ') * 20 CONTINUE IF( IAM.EQ.0 ) THEN CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_GRIDEXIT( CONTEXT ) * CALL BLACS_EXIT( 0 ) STOP * 9999 FORMAT( A ) 9997 FORMAT( 'Check if overflow is handled in ieee default manner.' ) 9996 FORMAT( 'If this is the last output you see, you should assume') 9995 FORMAT( 'that overflow caused a floating point exception.' ) * 9986 FORMAT( 'Test ok. The system appears to handle ieee overflow.' ) * 9985 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9984 FORMAT( I5, ' tests completed and passed residual checks.' ) 9983 FORMAT( I5, ' tests completed without checking.' ) 9982 FORMAT( I5, ' tests skipped for lack of memory.' ) 9981 FORMAT( I5, ' tests completed and failed.' ) 9980 FORMAT( 'END OF TESTS.' ) 9979 FORMAT( ' N NB P Q TYP SUB WALL CPU ', $ ' CHK QTQ CHECK TEST' ) 9978 FORMAT( ' ----- --- --- --- --- --- -------- --------', $ ' --------- --------- ----- ----' ) * * End of PZSEPRDRIVER * END scalapack-2.0.2/TESTING/EIG/pzsepreq.f000644 000766 000024 00000023262 10363532303 017455 0ustar00juliestaff000000 000000 * * SUBROUTINE PZSEPREQ( NIN, MEM, MEMSIZE, NOUT, ISEED, NTESTS, $ NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX*16 MEM( MEMSIZE ) * .. * * Purpose * ======= * * PZSEPREQ performs one request from the input file 'SEP.dat' * A request is the cross product of the specifications in the * input file. PZSEPREQ prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEP.dat' * * MEM (local input) COMPLEX*16 ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ZPLXSZ, INTGSZ PARAMETER ( ZPLXSZ = 16, INTGSZ = 4 ) INTEGER DBLESZ PARAMETER ( DBLESZ = 8 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IMIDPAD, INITCON, IPOSTPAD, $ IPREPAD, ISIZEHEEVX, ISIZESUBTST, ISIZETST, $ LDA, LLRWORK, MATSIZE, MATTYPE, MYCOL, MYROW, $ N, NB, NMATSIZES, NMATTYPES, NNODES, NP, NPCOL, $ NPCONFIGS, NPROW, NQ, NUPLOS, ORDER, PCONFIG, $ PTRA, PTRCOPYA, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRRWORK, PTRW, PTRW2, PTRWORK, PTRZ, $ RES, RSIZECHK, RSIZEHEEVX, RSIZEQTQ, $ RSIZESUBTST, RSIZETST, SIZEHEEVX, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZESUBTST, SIZETMS, $ SIZETST, UPLO, SIZEHEEVD, RSIZEHEEVD, $ ISIZEHEEVD DOUBLE PRECISION ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PDSEPINFO, PZLASIZESEP, PZSEPTST * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PDSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * * DO 40 MATSIZE = 1, NMATSIZES * DO 30 PCONFIG = 1, NPCONFIGS * DO 20 MATTYPE = 1, NMATTYPES * DO 10 UPLO = 1, NUPLOS * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, $ MYCOL ) IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PZLASIZESEP( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, $ RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, $ ISIZEHEEVD, $ SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, ZPLXSZ / DBLESZ ) PTRWORK = PTRW2 + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, ZPLXSZ / DBLESZ ) PTRGAP = PTRWORK + SIZETST + IPREPAD + IPOSTPAD PTRIFAIL = PTRGAP + ICEIL( NPROW*NPCOL+IPREPAD+ $ IPOSTPAD, ZPLXSZ / DBLESZ ) PTRICLUS = PTRIFAIL + ICEIL( N+IPREPAD+IPOSTPAD, $ ZPLXSZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, ZPLXSZ / INTGSZ ) PTRRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, ZPLXSZ / INTGSZ ) LLRWORK = ( MEMSIZE-PTRRWORK+1 )*ZPLXSZ / DBLESZ C LLRWORK = ( MEMSIZE-PTRRWORK-IPREPAD-IPOSTPAD+1) C $ *ZPLXSZ / DBLESZ NTESTS = NTESTS + 1 IF( LLRWORK.LT.RSIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PZSEPTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), SUBTESTS, $ THRESH, N, ABSTOL, ISEED, $ MEM( PTRA ), MEM( PTRCOPYA ), $ MEM( PTRZ ), LDA, MEM( PTRW ), $ MEM( PTRW2 ), MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ SIZETST, MEM( PTRRWORK ), $ LLRWORK, MEM( PTRIWRK ), $ ISIZETST, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT=* )' PZSEPREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE END IF * * RETURN * * End of PZDSEPREQ * END scalapack-2.0.2/TESTING/EIG/pzseprreq.f000644 000766 000024 00000021756 11623527140 017650 0ustar00juliestaff000000 000000 SUBROUTINE PZSEPRREQ( HETERO, NIN, MEM, MEMSIZE, NOUT, ISEED, $ NTESTS, NSKIPPED, NNOCHECK, NPASSED, INFO ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER HETERO INTEGER INFO, MEMSIZE, NIN, NNOCHECK, NOUT, NPASSED, $ NSKIPPED, NTESTS * .. * .. Array Arguments .. INTEGER ISEED( 4 ) COMPLEX*16 MEM( MEMSIZE ) * * Purpose * ======= * * PZSEPRREQ performs one request from the input file 'SEPR.dat' * A request is the cross product of the specifications in the * input file. It prints one line per test. * * Arguments * ========= * * NIN (local input) INTEGER * The unit number for the input file 'SEPR.dat' * * MEM (local input ) COMPLEX*16 ARRAY, dimension MEMSIZE * Array encompassing the available single precision memory * * MEMSIZE (local input) INTEGER * Size of MEM array * * NOUT (local input) INTEGER * The unit number for output file. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10 * NOUT = 14, output to file, divide thresh by 20 * Only used on node 0. * NOUT = 13, 14 allow the threshold to be tighter for our * internal testing which means that when a user reports * a threshold error, it is more likely to be significant. * * ISEED (global input/output) INTEGER array, dimension 4 * Random number generator seed * * NTESTS (global input/output) INTEGER * NTESTS = NTESTS + tests requested * * NSKIPPED (global input/output) INTEGER * NSKIPPED = NSKIPPED + tests skipped * * NNOCHECK (global input/output) INTEGER * NNOCHECK = NNOCHECK + tests completed but not checked * * NPASSED (global input/output) INTEGER * NPASSED = NPASSED + tests which passed all checks * * INFO (global output) INTEGER * 0 = test request ran * -1 = end of file * -2 = incorrect .dat file * * .. Parameters .. * INTEGER DLEN_ PARAMETER ( DLEN_ = 9 ) INTEGER DBLESZ, INTGSZ PARAMETER ( DBLESZ = 8, INTGSZ = 4 ) INTEGER KMPXSZ PARAMETER ( KMPXSZ = 16 ) INTEGER MAXSETSIZE PARAMETER ( MAXSETSIZE = 50 ) * .. * .. Local Scalars .. CHARACTER SUBTESTS INTEGER CONTEXT, IAM, IMIDPAD, INITCON, IPOSTPAD, $ IPREPAD, ISIZESUBTST, ISIZEEVR, ISIZETST, $ LDA, LLWORK, MATSIZE, MATTYPE, MYCOL, MYROW, N, $ NB, NMATSIZES, NMATTYPES, NNODES, NP, NPCOL, $ NPCONFIGS, NPROW, NQ, NUPLOS, ORDER, PCONFIG, $ PTRA, PTRCOPYA, PTRGAP, PTRICLUS, PTRIFAIL, $ PTRIWRK, PTRW, PTRW2, PTRWORK, PTRZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZEEVR, $ SIZETMS, SIZETST, UPLO INTEGER PTRRWORK, RSIZEEVR, RSIZESUBTST, RSIZETST * DOUBLE PRECISION ABSTOL, THRESH * .. * .. Local Arrays .. CHARACTER UPLOS( 2 ) INTEGER DESCA( DLEN_ ), MATSIZES( MAXSETSIZE ), $ MATTYPES( MAXSETSIZE ), NBS( MAXSETSIZE ), $ NPCOLS( MAXSETSIZE ), NPROWS( MAXSETSIZE ) * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, PZLASIZESEPR, PDSEPINFO, PZSEPRTST * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GET( -1, 0, INITCON ) CALL BLACS_GRIDINIT( INITCON, 'R', 1, NNODES ) * CALL PDSEPINFO( INITCON, IAM, NIN, NOUT, MAXSETSIZE, NMATSIZES, $ MATSIZES, NUPLOS, UPLOS, NPCONFIGS, NPROWS, $ NPCOLS, NBS, NMATTYPES, MATTYPES, 22, SUBTESTS, $ THRESH, ORDER, ABSTOL, INFO ) * CALL BLACS_GRIDEXIT( INITCON ) * IF( INFO.EQ.0 ) THEN * DO 40 MATSIZE = 1, NMATSIZES * DO 30 PCONFIG = 1, NPCONFIGS * DO 20 MATTYPE = 1, NMATTYPES * DO 10 UPLO = 1, NUPLOS * N = MATSIZES( MATSIZE ) ORDER = N * NPROW = NPROWS( PCONFIG ) NPCOL = NPCOLS( PCONFIG ) NB = NBS( PCONFIG ) * NP = NUMROC( N, NB, 0, 0, NPROW ) NQ = NUMROC( N, NB, 0, 0, NPCOL ) IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) * LDA = MAX( NP, 1 ) + IMIDPAD * CALL BLACS_GET( -1, 0, CONTEXT ) CALL BLACS_GRIDINIT( CONTEXT, 'R', NPROW, NPCOL ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, $ MYCOL ) * IF( MYROW.GE.0 ) THEN CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, $ CONTEXT, LDA, INFO ) CALL PZLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, $ SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, RSIZEEVR, $ ISIZEEVR, SIZESUBTST, $ RSIZESUBTST, ISIZESUBTST, $ SIZETST, RSIZETST, ISIZETST ) * PTRA = 1 PTRZ = PTRA + LDA*NQ + IPREPAD + IPOSTPAD PTRCOPYA = PTRZ + LDA*NQ + IPREPAD + IPOSTPAD PTRW = PTRCOPYA + LDA*NQ + IPREPAD + IPOSTPAD PTRW2 = PTRW + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, KMPXSZ / DBLESZ ) PTRWORK = PTRW2 + ICEIL( MAX( N, 1 )+IPREPAD+ $ IPOSTPAD, KMPXSZ / DBLESZ ) PTRGAP = PTRWORK + SIZETST + IPREPAD + IPOSTPAD PTRIFAIL = PTRGAP + ICEIL( NPROW*NPCOL+IPREPAD+ $ IPOSTPAD, KMPXSZ / DBLESZ ) PTRICLUS = PTRIFAIL + ICEIL( N+IPREPAD+IPOSTPAD, $ KMPXSZ / INTGSZ ) PTRIWRK = PTRICLUS + ICEIL( 2*NPROW*NPCOL+ $ IPREPAD+IPOSTPAD, KMPXSZ / INTGSZ ) PTRRWORK = PTRIWRK + ICEIL( ISIZETST+IPREPAD+ $ IPOSTPAD, KMPXSZ / INTGSZ ) LLWORK = ( MEMSIZE-PTRRWORK+1 )*KMPXSZ / DBLESZ NTESTS = NTESTS + 1 IF( LLWORK.LT.RSIZETST ) THEN NSKIPPED = NSKIPPED + 1 ELSE CALL PZSEPRTST( DESCA, UPLOS( UPLO ), N, $ MATTYPES( MATTYPE ), SUBTESTS, $ THRESH, N, ABSTOL, ISEED, $ MEM( PTRA ), MEM( PTRCOPYA ), $ MEM( PTRZ ), LDA, MEM( PTRW ), $ MEM( PTRW2 ), MEM( PTRIFAIL ), $ MEM( PTRICLUS ), $ MEM( PTRGAP ), IPREPAD, $ IPOSTPAD, MEM( PTRWORK ), $ SIZETST, MEM( PTRRWORK ), $ LLWORK, MEM( PTRIWRK ), $ ISIZETST, HETERO, NOUT, RES ) * IF( RES.EQ.0 ) THEN NPASSED = NPASSED + 1 ELSE IF( RES.EQ.2 ) THEN NNOCHECK = NNOCHECK + 1 ELSE IF( RES.EQ.3 ) THEN NSKIPPED = NSKIPPED + 1 WRITE( NOUT, FMT = * )' PZSEPRREQ failed' CALL BLACS_ABORT( CONTEXT, -1 ) END IF END IF CALL BLACS_GRIDEXIT( CONTEXT ) END IF 10 CONTINUE 20 CONTINUE 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of PZSEPRREQ * END scalapack-2.0.2/TESTING/EIG/pzseprsubtst.f000644 000766 000024 00000072366 11623527140 020410 0ustar00juliestaff000000 000000 SUBROUTINE PZSEPRSUBTST( WKNOWN, JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, IA, JA, $ DESCA, WIN, WNEW, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK, LWORK, RWORK, $ LRWORK, LWORK1, IWORK, LIWORK, RESULT, $ TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, IPOSTPAD, IPREPAD, IU, JA, LIWORK, $ LWORK, LWORK1, N, NOUT, RESULT DOUBLE PRECISION ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU INTEGER LRWORK * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) COMPLEX*16 A( * ), COPYA( * ), WORK( * ), Z( * ) DOUBLE PRECISION GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) * .. * * Purpose * ======= * * PZSEPRSUBTST calls PZSYEVR and then tests its output. * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues computed. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 100 or 250. In particular, * it should not depend on the size of the matrix. * It must be at least zero. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the residual test. * * A (local workspace) COMPLEX*16 array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * The test matrix, which is subsequently overwritten. * A is distributed in a 2D-block cyclic manner over both rows * and columns. * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) COMPLEX*16 array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) COMPLEX*16 array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PZSEPCHK and PZSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) DOUBLE PRECISION array, dimension (N) * The computed eigenvalues. * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * RWORK has already been padded front and back, * use RWORK(1+IPREPAD) * * LRWORK (local input) INTEGER * The actual length of the array RWORK after padding. * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to the eigensolver. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call. * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER DLEN_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( DLEN_ = 9, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285D0, FIVE = 5.0D0, $ NEGONE = -1.0D0 ) COMPLEX*16 ZPADVAL PARAMETER ( ZPADVAL = ( 13.989D0, 1.93D0 ) ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZESUBTST, ISIZEEVR, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ SIZECHK, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZEQTQ, SIZESUBTST, SIZEEVR, SIZETMS, $ SIZETST, VALSIZE, VECSIZE INTEGER RSIZEEVR, RSIZESUBTST, RSIZETST DOUBLE PRECISION EPS, EPSNORMA, ERROR, MAXERROR, MAXVU, $ MINERROR, MINVL, NORMWIN, OLDVL, OLDVU, $ SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), ISEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PZLANHE EXTERNAL LSAME, NUMROC, PDLAMCH, PZLANHE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, DGAMN2D, DGAMX2D, $ IGAMN2D, IGAMX2D, PDCHEKPAD, PDFILLPAD, $ PICHEKPAD, PIFILLPAD, PZCHEKPAD, PZELSET, $ PZFILLPAD, PZHEEVR, PZLASIZEHEEVR, $ PZLASIZESEPR, PZSEPCHK, PZSEPQTQ, SLBOOT, $ SLTIMER, ZLACPY * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * CALL PZLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, RSIZEEVR, ISIZEEVR, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, $ SIZETST, RSIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PDLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that no information from previous calls is used * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 RWORK( I+IPREPAD ) = 14.3D0 10 CONTINUE * DO 15 I = 1, LWORK, 1 WORK( I+IPREPAD ) = ( 15.63D0, 1.1D0 ) 15 CONTINUE * DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE * DO 30 I = 1, N WNEW( I+IPREPAD ) = 3.14159D0 30 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF (LSAME( RANGE, 'V' ) ) THEN * WRITE(*,*) 'VL VU = ', VL, ' ', VU END IF IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL * WRITE(*,*) 'MINVL = ', MINVL, ' MAXVU = ', MAXVU * WRITE(*,*) 'WIN = ', WIN( 1 ) MINIL = 1 MAXIU = 0 DO 40 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 40 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 150 RESULT = 0 * ISEED( 1 ) = 1 * CALL PZLASIZEHEEVR( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL ZLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PZFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, ZPADVAL ) * CALL PZFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, ZPADVAL+1.0D0 ) * * WRITE(*,*) ' NP = ', NP, ' MQ = ', MQ, ' LDZ = ', DESCZ( LLD_ ), * $ ' IPREPAD = ', IPREPAD, ' IPOSTPAD = ', IPOSTPAD, * $ ' MAXEIGS = ', MAXEIGS * WRITE(*,*) ' PADZ( 1 ) = ', Z( 1 ), ' PADZ( 2 ) = ', Z( 2 ), * $ ' PADZ( 3 ) = ', Z( 3 ), ' PADZ( 4 ) = ', Z( 4 ) * CALL PDFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0D0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0D0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), LWORK1, 1, RWORK,LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0D0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PZFILLPAD( DESCA( CTXT_ ), LWORK, 1, WORK, LWORK, IPREPAD, $ IPOSTPAD, ZPADVAL+4.1D0 ) * * Make sure that PZHEEVR does not cheat (i.e. use answers * already computed.) * DO 60 I = 1, N, 1 DO 50 J = 1, MAXEIGS, 1 CALL PZELSET( Z( 1+IPREPAD ), I, J, DESCA, $ ( 13.0D0, 1.34D0 ) ) 50 CONTINUE 60 CONTINUE * * Reset and start the timer * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) ********************************* * * Main call to PZHEEVR * CALL PZHEEVR( JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ VL, VU, IL, IU, M, NZ, WNEW( 1+IPREPAD ), $ Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), SIZEEVR, $ RWORK( 1+IPREPAD ), LWORK1, $ IWORK( 1+IPREPAD ), LIWORK, INFO ) * ********************************* * * Stop timer * CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * * Indicate that there are no unresolved clusters. * This is necessary so that the tester * (adapted from the one originally made for PDSYEVX) * works correctly. ICLUSTR( 1+IPREPAD ) = 0 * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEEVR-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, ZPADVAL ) * CALL PZCHEKPAD( DESCZ( CTXT_ ), 'PZHEEVR-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ ZPADVAL+1.0D0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEEVR-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0D0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEEVR-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0D0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEEVR-RWORK',LWORK1, 1, $ RWORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0D0 ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEEVR-WORK',LWORK, 1, $ WORK, LWORK, IPREPAD, IPOSTPAD, $ ZPADVAL+4.1D0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEEVR-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEEVR-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEEVR-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * If we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PZLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * Check INFO * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) WRITE( NOUT,*) 'M = ', M, '\n', 'N = ', N RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) WRITE( NOUT,*) 'IL = ', IL, ' IU = ', IU, ' M = ', M END IF RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Ensure that different processes return the same eigenvalues * DO 70 I = 1, M RWORK( I ) = WNEW( I+IPREPAD ) RWORK( I+M ) = WNEW( I+IPREPAD ) 70 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, RWORK, M, $ 1, 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ RWORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 80 I = 1, M IF( RESULT.EQ.0 .AND. ( ABS( RWORK( I )-RWORK( M+ $ I ) ).GT.FIVE*EPS*ABS( RWORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 80 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 90 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 100 NCLUSTERS = NCLUSTERS + 1 90 CONTINUE 100 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 110 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 110 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * DO 120 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 120 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 150 * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PZLANHE( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ RWORK )*EPS END IF * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |A Z - Z W| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZECHK, 1, RWORK,SIZECHK, $ IPREPAD, IPOSTPAD, 4.3D0 ) * CALL PZSEPCHK( N, NZ, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), RWORK( 1+IPREPAD ), $ SIZECHK, TSTNRM, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZSEPCHK-RWORK',SIZECHK, 1, $ RWORK,SIZECHK, IPREPAD, IPOSTPAD, 4.3D0 ) * IF( RES.NE.0 ) $ RESULT = 1 * * Perform the |QTQ - I| test * CALL PDFILLPAD( DESCA( CTXT_ ), SIZEQTQ, 1,RWORK, SIZEQTQ, $ IPREPAD, IPOSTPAD, 4.3D0 ) * * CALL PZSEPQTQ( N, NZ, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1+IPREPAD+1 ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ),RWORK( IPREPAD+1 ), SIZEQTQ, $ QTQNRM, INFO, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PDSEPQTQ-RWORK',SIZEQTQ, 1, $ RWORK,SIZEQTQ, IPREPAD, IPOSTPAD, 4.3D0 ) * IF( RES.NE.0 ) $ RESULT = 1 * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that the right eigenvalues have been obtained * IF( WKNOWN ) THEN * Set up MYIL if necessary MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 140 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 130 I = 1, M * WRITE(*,*) 'WIN WNEW = ', WIN( I+MYIL-1 ), * $ WNEW( I+IPREPAD ) ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 130 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 140 CONTINUE * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what was computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 150 CONTINUE * RETURN * 9999 FORMAT( 'PZHEEVR returned INFO=', I7 ) 9998 FORMAT( 'PZSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PZSEPRSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PZHEEVR returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PZHEEVR' ) 9981 FORMAT( 'NZ altered by PZHEEVR with JOBZ=N' ) * * End of PZSEPRSUBTST * END scalapack-2.0.2/TESTING/EIG/pzseprtst.f000644 000766 000024 00000073761 11623527140 017676 0ustar00juliestaff000000 000000 SUBROUTINE PZSEPRTST(DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, $ ORDER, ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, HETERO, NOUT, INFO ) * * -- ScaLAPACK routine (@(MODE)version *TBA*) -- * University of California, Berkeley and * University of Tennessee, Knoxville. * October 21, 2006 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER HETERO, SUBTESTS, UPLO INTEGER INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, LWORK, $ MATTYPE, N, NOUT, ORDER INTEGER LRWORK DOUBLE PRECISION ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) DOUBLE PRECISION GAP( * ), WIN( * ), WNEW( * ), RWORK( * ) COMPLEX*16 A( LDA, * ), COPYA( LDA, * ), $ WORK( * ), Z( LDA, * ) * .. * * Purpose * ======= * * PZSEPRTST builds a random matrix and runs PZHEEVR to * compute the eigenvalues and eigenvectors. Then it performs two tests * to determine if the result is good enough. The two tests are: * |AQ -QL| / (abstol + ulp * norm(A) ) * and * |QT * Q - I| / ulp * norm(A) * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) A matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) DOUBLE PRECISION * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) COMPLEX*16 array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * The test matrix, which is then overwritten. * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * * COPYA (local workspace) COMPLEX*16 array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * Z (local workspace) COMPLEX*16 array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PZSEPCHK and PZSEPQTQ * * W (local workspace) DOUBLE PRECISION array, dimension (N) * On normal exit, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * Not used, only for backward compatibility * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PZLASIZESEPR * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * * LRWORK (local input) INTEGER * The length of the array WORK. LRWORK >= RSIZETST as * returned by P@(CRPF)LASIZESEPR * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PZLASIZESEPR * * HETERO (input) INTEGER * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER CTXT_, MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( CTXT_ = 2, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION HALF, ONE, TEN, ZERO PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, $ TEN = 10.0D0, HALF = 0.5D0 ) COMPLEX*16 PADVAL PARAMETER ( PADVAL = ( 19.25D0, 1.1D1 ) ) COMPLEX*16 ZZERO PARAMETER ( ZZERO = ( 0.0D0, 0.0D0 ) ) COMPLEX*16 ZONE PARAMETER ( ZONE = ( 1.0D0, 0.0D0 ) ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IHETERO, IINFO, IL, IMODE, IN, $ INDD, INDWORK, ISIZESUBTST, ISIZEEVR, $ ISIZETST, ITYPE, IU, J, LLWORK, LEVRSIZE, $ MAXSIZE, MYCOL, MYROW, NB, NGEN, NLOC, $ NNODES, NP, NPCOL, NPROW, NQ, RES, SIZECHK, $ SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, SIZEQTQ, $ SIZESUBTST, SIZEEVR, SIZETMS, $ SIZETST, VALSIZE, VECSIZE INTEGER INDRWORK, LLRWORK, RSIZEEVR, RSIZESUBTST, $ RSIZETST DOUBLE PRECISION ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL DLARAN, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, DLABAD, DLASRT, $ IGAMX2D, IGEBR2D, IGEBS2D, PZCHEKPAD, PZELSET, $ PZFILLPAD, PZLASET, PZLASIZEHEEVR, $ PZLASIZESEPR, PZLATMS, PZMATGEN, PZSEPRSUBTST, $ SLCOMBINE, ZLATMS * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX, MIN, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * INFO = 0 PASSED = 'PASSED EVR' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * Distribute HETERO across processes * IF( IAM.EQ.0 ) THEN IF( LSAME( HETERO, 'Y' ) ) THEN IHETERO = 2 ELSE IHETERO = 1 END IF CALL IGEBS2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1 ) ELSE CALL IGEBR2D( CONTEXT, 'All', ' ', 1, 1, IHETERO, 1, 0, 0 ) END IF IF( IHETERO.EQ.2 ) THEN HETERO = 'Y' ELSE HETERO = 'N' END IF * * Make sure that there is enough memory * CALL PZLASIZESEPR( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, SIZEQTQ, $ SIZECHK, SIZEEVR, RSIZEEVR, ISIZEEVR, $ SIZESUBTST, RSIZESUBTST, $ ISIZESUBTST, SIZETST, RSIZETST, ISIZETST ) IF( LRWORK.LT.RSIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDRWORK = INDD + N INDWORK = 1 LLWORK = LWORK - INDWORK + 1 LLRWORK = LRWORK - INDRWORK + 1 * ULP = PDLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PDLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL DLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / DBLE( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log Hermitian, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random Hermitian * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N RWORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PZLASET( 'All', N, N,ZZERO,ZZERO, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N RWORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PZLASET( 'All', N, N,ZZERO,ZONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0D0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S',RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0D0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * Hermitian, eigenvalues specified * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0D0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S',RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0D0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * Hermitian, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PZMATGEN( DESCA( CTXT_ ), 'H', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0D0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S',RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0D0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PZLASET( 'All', N, N,ZZERO,ZZERO, COPYA, 1, 1, DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( DLARAN( ISEED )*DBLE( NLOC ) ), N-NGEN ) * CALL ZLATMS( IN, IN, 'S', ISEED, 'P',RWORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PZELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PZELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PZELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PZELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2,... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 RWORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0D0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S',RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0D0 ) * ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL DLASRT( 'I', N,RWORK( INDD ), IINFO ) * CALL PZLASIZEHEEVR( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED,RWORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) LEVRSIZE = MIN( MAXSIZE, LLRWORK ) * CALL PZSEPRSUBTST( WKNOWN, 'v', 'a', UPLO, N, VL, VU, IL, IU, $ THRESH, ABSTOL, A, COPYA, Z, 1, 1, DESCA, $ RWORK( INDD ), WIN, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK( INDWORK ), LLWORK, $ RWORK( INDRWORK ), LLRWORK, $ LEVRSIZE, IWORK, ISIZEEVR, RES, TSTNRM, $ QTQNRM, NOUT ) * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PZLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PZSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 1' INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * Use PZLASIZEHEEVR to choose IL and IU. * CALL PZLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PZSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 2' INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PZLASIZEHEEVR to choose IL and IU for us. * CALL PZLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VECSIZE * CALL PZSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PZLASIZEHEEVR to choose IL and IU for us. * CALL PZLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VALSIZE * CALL PZSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVR to choose VL and VU for us. * CALL PZLASIZEHEEVR( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LEVRSIZE = VECSIZE * CALL PZSEPRSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, $ RWORK, LRWORK, LEVRSIZE, $ IWORK, ISIZEEVR, RES, TSTNRM, QTQNRM, $ NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF END IF * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 .AND. .FALSE. ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF C WRITE(*,*)'************************************************' END IF * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, $ F8.2, 1X, F8.2, 1X, G9.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 21X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 21X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 32X, $ 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) C 9984 FORMAT( ' IBTYPE=', I8 ) C 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) C 9980 FORMAT( ' Increase TOTMEM in PZSEPRDRIVER' ) * * End of PZSEPRTST * END scalapack-2.0.2/TESTING/EIG/pzsepsubtst.f000644 000766 000024 00000072742 10363532303 020221 0ustar00juliestaff000000 000000 * * SUBROUTINE PZSEPSUBTST( WKNOWN, JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, IA, JA, $ DESCA, WIN, WNEW, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK, LWORK, RWORK, $ LRWORK, LWORK1, IWORK, LIWORK, RESULT, $ TSTNRM, QTQNRM, NOUT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, IPOSTPAD, IPREPAD, IU, JA, LIWORK, $ LRWORK, LWORK, LWORK1, N, NOUT, RESULT DOUBLE PRECISION ABSTOL, QTQNRM, THRESH, TSTNRM, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ IWORK( * ) DOUBLE PRECISION GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) COMPLEX*16 A( * ), COPYA( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZSEPSUBTST calls PZHEEVX and then tests the output of * PZHEEVX * If JOBZ = 'V' then the following two tests are performed: * |AQ -QL| / (abstol + eps * norm(A) ) < N*THRESH * |QT * Q - I| / eps * norm(A) < N*THRESH * If WKNOWN then * we check to make sure that the eigenvalues match expectations * i.e. |WIN - WNEW(1+IPREPAD)| / (eps * |WIN|) < THRESH * where WIN is the array of eigenvalues as computed by * PZHEEVX when eigenvectors are requested * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * WKNOWN (global input) INTEGER * .FALSE.: WIN does not contain the eigenvalues * .TRUE.: WIN does contain the eigenvalues * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * Must be 'V' on first call to PZSEPSUBTST * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * Must be 'A' on first call to PZSEPSUBTST * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PZSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * A (local workspace) COMPLEX*16 array * global dimension (N, N), local dimension (DESCA(DLEN_), NQ) * A is distributed in a block cyclic manner over both rows * and columns. * See PZHEEVX for a description of block cyclic layout. * The test matrix, which is then modified by PZHEEVX * A has already been padded front and back, use A(1+IPREPAD) * * COPYA (local input) COMPLEX*16 array, dimension(N*N) * COPYA holds a copy of the original matrix A * identical in both form and content to A * * Z (local workspace) COMPLEX*16 array, dim (N*N) * Z is distributed in the same manner as A * Z contains the eigenvector matrix * Z is used as workspace by the test routines * PZSEPCHK and PZSEPQTQ. * Z has already been padded front and back, use Z(1+IPREPAD) * * IA (global input) INTEGER * On entry, IA specifies the global row index of the submatrix * of the global matrix A, COPYA and Z to operate on. * * JA (global input) INTEGER * On entry, IA specifies the global column index of the submat * of the global matrix A, COPYA and Z to operate on. * * DESCA (global/local input) INTEGER array of dimension 8 * The array descriptor for the matrix A, COPYA and Z. * * WIN (global input) DOUBLE PRECISION array, dimension (N) * If .not. WKNOWN, WIN is ignored on input * Otherwise, WIN() is taken as the standard by which the * eigenvalues are to be compared against. * * WNEW (global workspace) DOUBLE PRECISION array, dimension (N) * The eigenvalues as copmuted by this call to PZHEEVX * If JOBZ <> 'V' or RANGE <> 'A' these eigenvalues are * compared against those in WIN(). * WNEW has already been padded front and back, * use WNEW(1+IPREPAD) * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If INFO > 0 on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * IFAIL has already been padded front and back, * use IFAIL(1+IPREPAD) * * ICLUSTR (global workspace) integer array, dimension (2*NPROW*NPCOL) * * GAP (global workspace) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * WORK has already been padded front and back, * use WORK(1+IPREPAD) * * LWORK (local input) INTEGER * The actual length of the array WORK after padding. * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * RWORK has already been padded front and back, * use RWORK(1+IPREPAD) * * LRWORK (local input) INTEGER * The actual length of the array RWORK after padding. * * LWORK1 (local input) INTEGER * The amount of real workspace to pass to PZHEEVX * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * IWORK has already been padded front and back, * use IWORK(1+IPREPAD) * * LIWORK (local input) INTEGER * The length of the array IWORK after padding. * * RESULT (global output) INTEGER * The result of this call to PZHEEVX * RESULT = -3 => This process did not participate * RESULT = 0 => All tests passed * RESULT = 1 => ONe or more tests failed * * TSTNRM (global output) DOUBLE PRECISION * |AQ- QL| / (ABSTOL+EPS*|A|)*N * * QTQNRM (global output) DOUBLE PRECISION * |QTQ -I| / N*EPS * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION PADVAL, FIVE, NEGONE PARAMETER ( PADVAL = 13.5285D+0, FIVE = 5.0D+0, $ NEGONE = -1.0D+0 ) COMPLEX*16 ZPADVAL PARAMETER ( ZPADVAL = ( 13.989D+0, 1.93D+0 ) ) INTEGER IPADVAL PARAMETER ( IPADVAL = 927 ) * .. * .. Local Scalars .. LOGICAL MISSLARGEST, MISSSMALLEST INTEGER I, IAM, INDIWRK, INFO, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, J, M, MAXEIGS, MAXIL, MAXIU, MAXSIZE, $ MINIL, MQ, MYCOL, MYIL, MYROW, NCLUSTERS, NP, $ NPCOL, NPROW, NQ, NZ, OLDIL, OLDIU, OLDNZ, RES, $ RSIZECHK, RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, $ RSIZETST, SIZEHEEVX, SIZEMQRLEFT, SIZEMQRRIGHT, $ SIZEQRF, SIZESUBTST, SIZETMS, SIZETST, VALSIZE, $ VECSIZE, SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD DOUBLE PRECISION EPS, EPSNORMA, ERROR, MAXERROR, MAXVU, $ MINERROR, MINVL, NORMWIN, OLDVL, OLDVU, ORFAC, $ SAFMIN * .. * .. Local Arrays .. INTEGER DESCZ( DLEN_ ), DSEED( 4 ), ITMP( 2 ) * .. * .. External Functions .. * LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH, PZLANHE EXTERNAL LSAME, NUMROC, PDLAMCH, PZLANHE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, DGAMN2D, DGAMX2D, $ IGAMN2D, IGAMX2D, PDCHEKPAD, PDFILLPAD, $ PICHEKPAD, PIFILLPAD, PZCHEKPAD, PZELSET, $ PZFILLPAD, PZHEEVX, PZLASIZEHEEVX, PZLASIZESEP, $ PZSEPCHK, PZSEPQTQ, SLBOOT, SLTIMER, ZLACPY * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL PZLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) * TSTNRM = NEGONE QTQNRM = NEGONE EPS = PDLAMCH( DESCA( CTXT_ ), 'Eps' ) SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe min' ) * NORMWIN = SAFMIN / EPS IF( N.GE.1 ) $ NORMWIN = MAX( ABS( WIN( 1 ) ), ABS( WIN( N ) ), NORMWIN ) * * Make sure that we aren't using information from previous calls * NZ = -13 OLDNZ = NZ OLDIL = IL OLDIU = IU OLDVL = VL OLDVU = VU * DO 10 I = 1, LWORK1, 1 RWORK( I+IPREPAD ) = 14.3D+0 10 CONTINUE DO 20 I = 1, LIWORK, 1 IWORK( I+IPREPAD ) = 14 20 CONTINUE DO 30 I = 1, LWORK, 1 WORK( I+IPREPAD ) = ( 15.63D+0, 1.1D+0 ) 30 CONTINUE * DO 40 I = 1, N WNEW( I+IPREPAD ) = 3.14159D+0 40 CONTINUE * ICLUSTR( 1+IPREPAD ) = 139 * IF( LSAME( JOBZ, 'N' ) ) THEN MAXEIGS = 0 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MAXEIGS = N ELSE IF( LSAME( RANGE, 'I' ) ) THEN MAXEIGS = IU - IL + 1 ELSE MINVL = VL - NORMWIN*FIVE*EPS - ABSTOL MAXVU = VU + NORMWIN*FIVE*EPS + ABSTOL MINIL = 1 MAXIU = 0 DO 50 I = 1, N IF( WIN( I ).LT.MINVL ) $ MINIL = MINIL + 1 IF( WIN( I ).LE.MAXVU ) $ MAXIU = MAXIU + 1 50 CONTINUE * MAXEIGS = MAXIU - MINIL + 1 END IF END IF * * CALL DESCINIT( DESCZ, DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), DESCA( RSRC_ ), DESCA( CSRC_ ), $ DESCA( CTXT_ ), DESCA( LLD_ ), INFO ) * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INDIWRK = 1 + IPREPAD + NPROW*NPCOL + 1 * IAM = 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ IAM = 0 * * If this process is not involved in this test, bail out now * RESULT = -3 IF( MYROW.GE.NPROW .OR. MYROW.LT.0 ) $ GO TO 160 RESULT = 0 * * * DSEED is not used in this call to PZLASIZEHEEVX, the * following line just makes ftnchek happy. * DSEED( 1 ) = 1 * CALL PZLASIZEHEEVX( WKNOWN, RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WIN, MAXSIZE, VECSIZE, VALSIZE ) * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) MQ = NUMROC( MAXEIGS, DESCA( NB_ ), MYCOL, 0, NPCOL ) * CALL ZLACPY( 'A', NP, NQ, COPYA, DESCA( LLD_ ), A( 1+IPREPAD ), $ DESCA( LLD_ ) ) * CALL PZFILLPAD( DESCA( CTXT_ ), NP, NQ, A, DESCA( LLD_ ), IPREPAD, $ IPOSTPAD, ZPADVAL ) * CALL PZFILLPAD( DESCZ( CTXT_ ), NP, MQ, Z, DESCZ( LLD_ ), IPREPAD, $ IPOSTPAD, ZPADVAL+1.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), N, 1, WNEW, N, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), NPROW*NPCOL, 1, GAP, NPROW*NPCOL, $ IPREPAD, IPOSTPAD, PADVAL+3.0D+0 ) * CALL PDFILLPAD( DESCA( CTXT_ ), LWORK1, 1, RWORK, LWORK1, IPREPAD, $ IPOSTPAD, PADVAL+4.0D+0 ) * CALL PIFILLPAD( DESCA( CTXT_ ), LIWORK, 1, IWORK, LIWORK, IPREPAD, $ IPOSTPAD, IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), N, 1, IFAIL, N, IPREPAD, IPOSTPAD, $ IPADVAL ) * CALL PIFILLPAD( DESCA( CTXT_ ), 2*NPROW*NPCOL, 1, ICLUSTR, $ 2*NPROW*NPCOL, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PZFILLPAD( DESCA( CTXT_ ), LWORK, 1, WORK, LWORK, IPREPAD, $ IPOSTPAD, ZPADVAL+4.1D+0 ) * * Make sure that PZHEEVX does not cheat (i.e. use answers * already computed.) * DO 70 I = 1, N, 1 DO 60 J = 1, MAXEIGS, 1 CALL PZELSET( Z( 1+IPREPAD ), I, J, DESCA, $ ( 13.0D+0, 1.34D+0 ) ) 60 CONTINUE 70 CONTINUE * ORFAC = -1.0D+0 * CALL SLBOOT CALL SLTIMER( 1 ) CALL SLTIMER( 6 ) CALL PZHEEVX( JOBZ, RANGE, UPLO, N, A( 1+IPREPAD ), IA, JA, DESCA, $ VL, VU, IL, IU, ABSTOL, M, NZ, WNEW( 1+IPREPAD ), $ ORFAC, Z( 1+IPREPAD ), IA, JA, DESCA, $ WORK( 1+IPREPAD ), SIZEHEEVX, RWORK( 1+IPREPAD ), $ LWORK1, IWORK( 1+IPREPAD ), LIWORK, $ IFAIL( 1+IPREPAD ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), INFO ) CALL SLTIMER( 6 ) CALL SLTIMER( 1 ) * IF( THRESH.LE.0 ) THEN RESULT = 0 ELSE CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEEVX-A', NP, NQ, A, $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, ZPADVAL ) * CALL PZCHEKPAD( DESCZ( CTXT_ ), 'PZHEEVX-Z', NP, MQ, Z, $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ ZPADVAL+1.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEEVX-WNEW', N, 1, WNEW, N, $ IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEEVX-GAP', NPROW*NPCOL, 1, $ GAP, NPROW*NPCOL, IPREPAD, IPOSTPAD, $ PADVAL+3.0D+0 ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZHEEVX-rWORK', LWORK1, 1, $ RWORK, LWORK1, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZHEEVX-WORK', LWORK, 1, WORK, $ LWORK, IPREPAD, IPOSTPAD, ZPADVAL+4.1D+0 ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEEVX-IWORK', LIWORK, 1, $ IWORK, LIWORK, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEEVX-IFAIL', N, 1, IFAIL, $ N, IPREPAD, IPOSTPAD, IPADVAL ) * CALL PICHEKPAD( DESCA( CTXT_ ), 'PZHEEVX-ICLUSTR', $ 2*NPROW*NPCOL, 1, ICLUSTR, 2*NPROW*NPCOL, $ IPREPAD, IPOSTPAD, IPADVAL ) * * * Since we now know the spectrum, we can potentially reduce MAXSIZE. * IF( LSAME( RANGE, 'A' ) ) THEN CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ DSEED, WNEW( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) END IF * * * Check INFO * * * Make sure that all processes return the same value of INFO * ITMP( 1 ) = INFO ITMP( 2 ) = INFO * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, 1, $ 1, -1, -1, 0 ) * * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * ) $ 'Different processes return different INFO' RESULT = 1 ELSE IF( MOD( INFO, 2 ).EQ.1 .OR. INFO.GT.7 .OR. INFO.LT.0 ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )INFO RESULT = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 .AND. LWORK1.GE.MAXSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 .AND. LWORK1.GE.VECSIZE ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )INFO RESULT = 1 END IF * * IF( LSAME( JOBZ, 'V' ) .AND. ( ICLUSTR( 1+IPREPAD ).NE. $ 0 ) .AND. ( MOD( INFO / 2, 2 ).NE.1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) RESULT = 1 END IF * * Check M * IF( ( M.LT.0 ) .OR. ( M.GT.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'A' ) .AND. ( M.NE.N ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) RESULT = 1 ELSE IF( LSAME( RANGE, 'I' ) .AND. ( M.NE.IU-IL+1 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9992 ) RESULT = 1 ELSE IF( LSAME( JOBZ, 'V' ) .AND. $ ( .NOT.( LSAME( RANGE, 'V' ) ) ) .AND. ( M.NE.NZ ) ) $ THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) RESULT = 1 END IF * * Check NZ * IF( LSAME( JOBZ, 'V' ) ) THEN IF( LSAME( RANGE, 'V' ) ) THEN IF( NZ.GT.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) RESULT = 1 END IF IF( NZ.LT.M .AND. MOD( INFO / 4, 2 ).NE.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9989 ) RESULT = 1 END IF ELSE IF( NZ.NE.M ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) RESULT = 1 END IF END IF END IF IF( RESULT.EQ.0 ) THEN * * Make sure that all processes return the same # of eigenvalues * ITMP( 1 ) = M ITMP( 2 ) = M * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) RESULT = 1 ELSE * * Make sure that different processes return the same eigenvalues * DO 80 I = 1, M RWORK( I ) = WNEW( I+IPREPAD ) RWORK( I+M ) = WNEW( I+IPREPAD ) 80 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', M, 1, RWORK, M, $ 1, 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', M, 1, $ RWORK( 1+M ), M, 1, 1, -1, -1, 0 ) * DO 90 I = 1, M * IF( RESULT.EQ.0 .AND. ( ABS( RWORK( I )-RWORK( M+ $ I ) ).GT.FIVE*EPS*ABS( RWORK( I ) ) ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) RESULT = 1 END IF 90 CONTINUE END IF END IF * * Make sure that all processes return the same # of clusters * IF( LSAME( JOBZ, 'V' ) ) THEN NCLUSTERS = 0 DO 100 I = 0, NPROW*NPCOL - 1 IF( ICLUSTR( 1+IPREPAD+2*I ).EQ.0 ) $ GO TO 110 NCLUSTERS = NCLUSTERS + 1 100 CONTINUE 110 CONTINUE ITMP( 1 ) = NCLUSTERS ITMP( 2 ) = NCLUSTERS * CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP, 1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, ITMP( 2 ), 1, $ 1, 1, -1, -1, 0 ) * IF( ITMP( 1 ).NE.ITMP( 2 ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) RESULT = 1 ELSE * * Make sure that different processes return the same clusters * DO 120 I = 1, NCLUSTERS IWORK( INDIWRK+I ) = ICLUSTR( I+IPREPAD ) IWORK( INDIWRK+I+NCLUSTERS ) = ICLUSTR( I+IPREPAD ) 120 CONTINUE CALL IGAMN2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1 ), NCLUSTERS*2+1, 1, 1, $ -1, -1, 0 ) CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', NCLUSTERS*2+1, 1, $ IWORK( INDIWRK+1+NCLUSTERS ), $ NCLUSTERS*2+1, 1, 1, -1, -1, 0 ) * * DO 130 I = 1, NCLUSTERS IF( RESULT.EQ.0 .AND. IWORK( INDIWRK+I ).NE. $ IWORK( INDIWRK+NCLUSTERS+I ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) RESULT = 1 END IF 130 CONTINUE * IF( ICLUSTR( 1+IPREPAD+NCLUSTERS*2 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) RESULT = 1 END IF END IF END IF * * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, $ -1, -1, 0 ) IF( RESULT.NE.0 ) $ GO TO 160 * * Compute eps * norm(A) * IF( N.EQ.0 ) THEN EPSNORMA = EPS ELSE EPSNORMA = PZLANHE( 'I', UPLO, N, COPYA, IA, JA, DESCA, $ RWORK )*EPS END IF * * Note that a couple key variables get redefined in PZSEPCHK * as described by this table: * * PZSEPTST name PZSEPCHK name * ------------- ------------- * COPYA A * Z Q * A C * * IF( LSAME( JOBZ, 'V' ) ) THEN * * Perform the |AQ - QE| test * CALL PDFILLPAD( DESCA( CTXT_ ), RSIZECHK, 1, RWORK, $ RSIZECHK, IPREPAD, IPOSTPAD, 4.3D+0 ) * CALL PZSEPCHK( N, NZ, COPYA, IA, JA, DESCA, $ MAX( ABSTOL+EPSNORMA, SAFMIN ), THRESH, $ Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ WNEW( 1+IPREPAD ), RWORK( 1+IPREPAD ), $ RSIZECHK, TSTNRM, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZSEPCHK-rWORK', RSIZECHK, $ 1, RWORK, RSIZECHK, IPREPAD, IPOSTPAD, $ 4.3D+0 ) * IF( RES.NE.0 ) $ RESULT = 1 * * Perform the |QTQ - I| test * CALL PDFILLPAD( DESCA( CTXT_ ), RSIZEQTQ, 1, RWORK, $ RSIZEQTQ, IPREPAD, IPOSTPAD, 4.3D+0 ) * * CALL PZSEPQTQ( N, NZ, THRESH, Z( 1+IPREPAD ), IA, JA, DESCZ, $ A( 1+IPREPAD ), IA, JA, DESCA, $ IWORK( 1+IPREPAD+1 ), ICLUSTR( 1+IPREPAD ), $ GAP( 1+IPREPAD ), RWORK( IPREPAD+1 ), $ RSIZEQTQ, QTQNRM, INFO, RES ) * CALL PDCHEKPAD( DESCA( CTXT_ ), 'PZSEPQTQ-rWORK', RSIZEQTQ, $ 1, RWORK, RSIZEQTQ, IPREPAD, IPOSTPAD, $ 4.3D+0 ) * IF( RES.NE.0 ) $ RESULT = 1 * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )INFO RESULT = 1 END IF END IF * * Check to make sure that we have the right eigenvalues * IF( WKNOWN ) THEN * * Set up MYIL if necessary * MYIL = IL * IF( LSAME( RANGE, 'V' ) ) THEN MYIL = 1 MINIL = 1 MAXIL = N - M + 1 ELSE IF( LSAME( RANGE, 'A' ) ) THEN MYIL = 1 END IF MINIL = MYIL MAXIL = MYIL END IF * * Find the largest difference between the computed * and expected eigenvalues * MINERROR = NORMWIN * DO 150 MYIL = MINIL, MAXIL MAXERROR = 0 * * Make sure that we aren't skipping any important eigenvalues * MISSSMALLEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.1 ) ) $ MISSSMALLEST = .FALSE. IF( MISSSMALLEST .AND. ( WIN( MYIL-1 ).LT.VL+NORMWIN* $ FIVE*THRESH*EPS ) )MISSSMALLEST = .FALSE. MISSLARGEST = .TRUE. IF( .NOT.LSAME( RANGE, 'V' ) .OR. ( MYIL.EQ.MAXIL ) ) $ MISSLARGEST = .FALSE. IF( MISSLARGEST .AND. ( WIN( MYIL+M ).GT.VU-NORMWIN*FIVE* $ THRESH*EPS ) )MISSLARGEST = .FALSE. IF( .NOT.MISSSMALLEST ) THEN IF( .NOT.MISSLARGEST ) THEN * * Make sure that the eigenvalues that we report are OK * DO 140 I = 1, M ERROR = ABS( WIN( I+MYIL-1 )-WNEW( I+IPREPAD ) ) MAXERROR = MAX( MAXERROR, ERROR ) 140 CONTINUE * MINERROR = MIN( MAXERROR, MINERROR ) END IF END IF 150 CONTINUE * * * If JOBZ = 'V' and RANGE='A', we might be comparing * against our estimate of what the eigenvalues ought to * be, rather than comparing against what PxHEEVX computed * last time around, so we have to be more generous. * IF( LSAME( JOBZ, 'V' ) .AND. LSAME( RANGE, 'A' ) ) THEN IF( MINERROR.GT.NORMWIN*FIVE*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF ELSE IF( MINERROR.GT.NORMWIN*FIVE*THRESH*EPS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )MINERROR, NORMWIN RESULT = 1 END IF END IF END IF * * * Make sure that the IL, IU, VL and VU were not altered * IF( IL.NE.OLDIL .OR. IU.NE.OLDIU .OR. VL.NE.OLDVL .OR. VU.NE. $ OLDVU ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) RESULT = 1 END IF * IF( LSAME( JOBZ, 'N' ) .AND. ( NZ.NE.OLDNZ ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9981 ) RESULT = 1 END IF * END IF * * All processes should report the same result * CALL IGAMX2D( DESCA( CTXT_ ), 'a', ' ', 1, 1, RESULT, 1, 1, 1, -1, $ -1, 0 ) * 160 CONTINUE * * RETURN * 9999 FORMAT( 'PZHEEVX returned INFO=', I7 ) 9998 FORMAT( 'PZSEPQTQ returned INFO=', I7 ) 9997 FORMAT( 'PZSEPSUBTST minerror =', D11.2, ' normwin=', D11.2 ) 9996 FORMAT( 'PZHEEVX returned INFO=', I7, $ ' despite adequate workspace' ) 9995 FORMAT( 'ICLUSTR(1).NE.0 but mod(INFO/2,2).NE.1' ) 9994 FORMAT( 'M not in the range 0 to N' ) 9993 FORMAT( 'M not equal to N' ) 9992 FORMAT( 'M not equal to IU-IL+1' ) 9991 FORMAT( 'M not equal to NZ' ) 9990 FORMAT( 'NZ > M' ) 9989 FORMAT( 'NZ < M' ) 9988 FORMAT( 'NZ not equal to M' ) 9987 FORMAT( 'Different processes return different values for M' ) 9986 FORMAT( 'Different processes return different eigenvalues' ) 9985 FORMAT( 'Different processes return ', $ 'different numbers of clusters' ) 9984 FORMAT( 'Different processes return different clusters' ) 9983 FORMAT( 'ICLUSTR not zero terminated' ) 9982 FORMAT( 'IL, IU, VL or VU altered by PZHEEVX' ) 9981 FORMAT( 'NZ altered by PZHEEVX with JOBZ=N' ) * * End of PZSEPSUBTST * END scalapack-2.0.2/TESTING/EIG/pzseptst.f000644 000766 000024 00000125413 11622500733 017502 0ustar00juliestaff000000 000000 * * SUBROUTINE PZSEPTST( DESCA, UPLO, N, MATTYPE, SUBTESTS, THRESH, $ ORDER, ABSTOL, ISEED, A, COPYA, Z, LDA, WIN, $ WNEW, IFAIL, ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK, LWORK, RWORK, LRWORK, IWORK, LIWORK, $ NOUT, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 15, 2002 * * .. Scalar Arguments .. CHARACTER SUBTESTS, UPLO INTEGER INFO, IPOSTPAD, IPREPAD, LDA, LIWORK, LRWORK, $ LWORK, MATTYPE, N, NOUT, ORDER DOUBLE PRECISION ABSTOL, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), ICLUSTR( * ), IFAIL( * ), $ ISEED( 4 ), IWORK( * ) DOUBLE PRECISION GAP( * ), RWORK( * ), WIN( * ), WNEW( * ) COMPLEX*16 A( LDA, * ), COPYA( LDA, * ), WORK( * ), $ Z( LDA, * ) * .. * * Purpose * ======= * * PZSEPTST builds a random matrix, runs PZHEEVX() to * compute the eigenvalues * and eigenvectors and then performs two tests to * determine if the result * is good enough. The two tests are: * |AQ -QL| / (abstol + ulp * norm(A) ) * and * |QT * Q - I| / ulp * norm(A) * * The random matrix built depends upon the following parameters: * N, NB, ISEED, ORDER * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * DESCA (global and local input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrices * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * Size of the matrix to be tested. (global size) * * MATTYPE (global input) INTEGER * Matrix type * Currently, the list of possible types is: * * (1) The zero matrix. * (2) The identity matrix. * * (3) A diagonal matrix with evenly spaced entries * 1, ..., ULP and random signs. * (ULP = (first number larger than 1) - 1 ) * (4) A diagonal matrix with geometrically spaced entries * 1, ..., ULP and random signs. * (5) A diagonal matrix with "clustered" entries 1, ULP, ..., ULP * and random signs. * * (6) Same as (4), but multiplied by SQRT( overflow threshold ) * (7) Same as (4), but multiplied by SQRT( underflow threshold ) * * (8) A matrix of the form U' D U, where U is orthogonal and * D has evenly spaced entries 1, ..., ULP with random signs * on the diagonal. * * (9) A matrix of the form U' D U, where U is orthogonal and * D has geometrically spaced entries 1, ..., ULP with random * signs on the diagonal. * * (10) A matrix of the form U' D U, where U is orthogonal and * D has "clustered" entries 1, ULP,..., ULP with random * signs on the diagonal. * * (11) Same as (8), but multiplied by SQRT( overflow threshold ) * (12) Same as (8), but multiplied by SQRT( underflow threshold ) * * (13) Hermitian matrix with random entries chosen from (-1,1). * (14) Same as (13), but multiplied by SQRT( overflow threshold ) * (15) Same as (13), but multiplied by SQRT( underflow threshold ) * (16) Same as (8), but diagonal elements are all positive. * (17) Same as (9), but diagonal elements are all positive. * (18) Same as (10), but diagonal elements are all positive. * (19) Same as (16), but multiplied by SQRT( overflow threshold ) * (20) Same as (16), but multiplied by SQRT( underflow threshold ) * (21) A tridiagonal matrix that is a direct sum of smaller diagonally * dominant submatrices. Each unreduced submatrix has geometrically * spaced diagonal entries 1, ..., ULP. * (22) A matrix of the form U' D U, where U is orthogonal and * D has ceil(lgN) "clusters" at 0,1,2,...,ceil(lgN)-1. The * size of the cluster at the value I is 2^I. * * SUBTESTS (global input) CHARACTER*1 * 'Y' - Perform subset tests * 'N' - Do not perform subset tests * * THRESH (global input) DOUBLE PRECISION * A test will count as "failed" if the "error", computed as * described below, exceeds THRESH. Note that the error * is scaled to be O(1), so THRESH should be a reasonably * small multiple of 1, e.g., 10 or 100. In particular, * it should not depend on the precision (single vs. double) * or the size of the matrix. It must be at least zero. * * ORDER (global input) INTEGER * Number of reflectors used in test matrix creation. * If ORDER is large, it will * take more time to create the test matrices but they will * be closer to random. * ORDER .lt. N not implemented * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An * eigenvalue is considered to be located if it has * been determined to lie in an interval whose width * is "abstol" or less. If "abstol" is less than or equal * to zero, then ulp*|T| will be used, where |T| is * the 1-norm of the matrix. If eigenvectors are * desired later by inverse iteration ("PZSTEIN"), * "abstol" MUST NOT be bigger than ulp*|T|. * * For the purposes of this test, ABSTOL=0.0 is fine. * THis test does not test for high relative accuracy. * * ISEED (global input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * A (local workspace) COMPLEX*16 array, dim (N*N) * global dimension (N, N), local dimension (LDA, NQ) * A is distributed in a block cyclic manner over both rows * and columns. The actual location of a particular element * in A is controlled by the values of NPROW, NPCOL, and NB. * The test matrix, which is then modified by PZHEEVX * * COPYA (local workspace) COMPLEX*16 array, dim (N, N) * COPYA is used to hold an identical copy of the array A * identical in both form and content to A * * Z (local workspace) COMPLEX*16 array, dim (N*N) * Z is distributed in the same manner as A * Z is used as workspace by the test routines * PZSEPCHK and PZSEPQTQ * * W (local workspace) DOUBLE PRECISION array, dimension (N) * On normal exit from PZHEEVX, the first M entries * contain the selected eigenvalues in ascending order. * * IFAIL (global workspace) INTEGER array, dimension (N) * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * * LWORK (local input) INTEGER * The length of the array WORK. LWORK >= SIZETST as * returned by PZLASIZESEP * * RWORK (local workspace) COMPLEX*16 array, dimension (LWORK) * * LRWORK (local input) INTEGER * The length of the array WORK. LRWORK >= RSIZETST as * returned by PZLASIZESEP * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the array IWORK. LIWORK >= ISIZETST as * returned by PZLASIZESEP * * NOUT (local input) INTEGER * The unit number for output file. Only used on node 0. * NOUT = 6, output to screen, * NOUT = 0, output to stderr. * NOUT = 13, output to file, divide thresh by 10.0 * NOUT = 14, output to file, divide thresh by 20.0 * (This hack allows us to test more stringently internally * so that when errors on found on other computers they will * be serious enough to warrant our attention.) * * INFO (global output) INTEGER * -3 This process is not involved * 0 Test succeeded (passed |AQ -QL| and |QT*Q - I| tests) * 1 At least one test failed * 2 Residual test were not performed, thresh <= 0.0 * 3 Test was skipped because of inadequate memory space * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE, TEN, HALF PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TEN = 10.0D+0, $ HALF = 0.5D+0 ) COMPLEX*16 PADVAL PARAMETER ( PADVAL = ( 19.25D+0, 1.1D+1 ) ) COMPLEX*16 ZZERO PARAMETER ( ZZERO = ( 0.0D+0, 0.0D+0 ) ) COMPLEX*16 ZONE PARAMETER ( ZONE = ( 1.0D+0, 0.0D+0 ) ) INTEGER MAXTYP PARAMETER ( MAXTYP = 22 ) * .. * * .. Local Scalars .. LOGICAL WKNOWN CHARACTER JOBZ, RANGE CHARACTER*14 PASSED INTEGER CONTEXT, I, IAM, IINFO, IL, IMODE, IN, INDD, $ INDRWORK, INDWORK, ISIZEHEEVX, ISIZESUBTST, $ ISIZETST, ITYPE, IU, J, LHEEVXSIZE, LLRWORK, $ LLWORK, MAXSIZE, MYCOL, MYROW, NB, NGEN, NLOC, $ NNODES, NP, NPCOL, NPROW, NQ, RES, RSIZECHK, $ RSIZEHEEVX, RSIZEQTQ, RSIZESUBTST, RSIZETST, $ SIZEHEEVX, SIZEMQRLEFT, SIZEMQRRIGHT, SIZEQRF, $ SIZESUBTST, SIZETMS, SIZETST, VALSIZE, VECSIZE, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, NQ0, NP0, $ LHEEVDSIZE DOUBLE PRECISION ANINV, ANORM, COND, MAXQTQNRM, MAXTSTNRM, OVFL, $ QTQNRM, RTOVFL, RTUNFL, TEMP1, TSTNRM, ULP, $ ULPINV, UNFL, VL, VU * .. * .. Local Arrays .. INTEGER ISEEDIN( 4 ), KMAGN( MAXTYP ), KMODE( MAXTYP ), $ KTYPE( MAXTYP ) DOUBLE PRECISION CTIME( 10 ), WTIME( 10 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION DLARAN, PDLAMCH EXTERNAL LSAME, NUMROC, DLARAN, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, DLABAD, DLASRT, $ IGAMX2D, IGEBR2D, IGEBS2D, PZCHEKPAD, PZELSET, $ PZFILLPAD, PZLASET, PZLASIZEHEEVX, PZLASIZESEP, $ PZLATMS, PZMATGEN, PZSEPSUBTST, SLCOMBINE, $ ZLATMS * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX, MIN, SQRT * .. * .. Data statements .. DATA KTYPE / 1, 2, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 8, $ 8, 8, 9, 9, 9, 9, 9, 10, 11 / DATA KMAGN / 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 2, 3, 1, $ 2, 3, 1, 1, 1, 2, 3, 1, 1 / DATA KMODE / 0, 0, 4, 3, 1, 4, 4, 4, 3, 1, 4, 4, 0, $ 0, 0, 4, 3, 1, 4, 4, 3, 0 / * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 PASSED = 'PASSED EVX' CONTEXT = DESCA( CTXT_ ) NB = DESCA( NB_ ) * CALL BLACS_PINFO( IAM, NNODES ) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Make sure that we have enough memory * * CALL PZLASIZESEP( DESCA, IPREPAD, IPOSTPAD, SIZEMQRLEFT, $ SIZEMQRRIGHT, SIZEQRF, SIZETMS, RSIZEQTQ, $ RSIZECHK, SIZEHEEVX, RSIZEHEEVX, ISIZEHEEVX, $ SIZEHEEVD, RSIZEHEEVD, ISIZEHEEVD, $ SIZESUBTST, RSIZESUBTST, ISIZESUBTST, SIZETST, $ RSIZETST, ISIZETST ) * IF( LRWORK.LT.RSIZETST ) THEN INFO = 3 END IF * CALL IGAMX2D( CONTEXT, 'a', ' ', 1, 1, INFO, 1, 1, 1, -1, -1, 0 ) * IF( INFO.EQ.0 ) THEN * INDD = 1 INDRWORK = INDD + N INDWORK = 1 LLWORK = LWORK - INDWORK + 1 LLRWORK = LRWORK - INDRWORK + 1 * ULP = PDLAMCH( CONTEXT, 'P' ) ULPINV = ONE / ULP UNFL = PDLAMCH( CONTEXT, 'Safe min' ) OVFL = ONE / UNFL CALL DLABAD( UNFL, OVFL ) RTUNFL = SQRT( UNFL ) RTOVFL = SQRT( OVFL ) ANINV = ONE / DBLE( MAX( 1, N ) ) * * This ensures that everyone starts out with the same seed. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4 ) ELSE CALL IGEBR2D( CONTEXT, 'a', ' ', 4, 1, ISEED, 4, 0, 0 ) END IF ISEEDIN( 1 ) = ISEED( 1 ) ISEEDIN( 2 ) = ISEED( 2 ) ISEEDIN( 3 ) = ISEED( 3 ) ISEEDIN( 4 ) = ISEED( 4 ) * * Compute the matrix A * * Control parameters: * * KMAGN KMODE KTYPE * =1 O(1) clustered 1 zero * =2 large clustered 2 identity * =3 small exponential (none) * =4 arithmetic diagonal, (w/ eigenvalues) * =5 random log Hermitian, w/ eigenvalues * =6 random (none) * =7 random diagonal * =8 random Hermitian * =9 positive definite * =10 block diagonal with tridiagonal blocks * =11 Geometrically sized clusters. * ITYPE = KTYPE( MATTYPE ) IMODE = KMODE( MATTYPE ) * * Compute norm * GO TO ( 10, 20, 30 )KMAGN( MATTYPE ) * 10 CONTINUE ANORM = ONE GO TO 40 * 20 CONTINUE ANORM = ( RTOVFL*ULP )*ANINV GO TO 40 * 30 CONTINUE ANORM = RTUNFL*N*ULPINV GO TO 40 * 40 CONTINUE IF( MATTYPE.LE.15 ) THEN COND = ULPINV ELSE COND = ULPINV*ANINV / TEN END IF * * Special Matrices * * Zero * * IF( ITYPE.EQ.1 ) THEN * * Zero Matrix * DO 50 I = 1, N RWORK( INDD+I-1 ) = ZERO 50 CONTINUE CALL PZLASET( 'All', N, N, ZZERO, ZZERO, COPYA, 1, 1, $ DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.2 ) THEN * * Identity Matrix * DO 60 I = 1, N RWORK( INDD+I-1 ) = ONE 60 CONTINUE CALL PZLASET( 'All', N, N, ZZERO, ZONE, COPYA, 1, 1, DESCA ) WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.4 ) THEN * * Diagonal Matrix, [Eigen]values Specified * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+1.0D+0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) WKNOWN = .TRUE. * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS1-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+1.0D+0 ) * ELSE IF( ITYPE.EQ.5 ) THEN * * Hermitian, eigenvalues specified * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+2.0D+0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS2-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+2.0D+0 ) * WKNOWN = .TRUE. * ELSE IF( ITYPE.EQ.8 ) THEN * * Hermitian, random eigenvalues * NP = NUMROC( N, DESCA( MB_ ), MYROW, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), MYCOL, 0, NPCOL ) CALL PZMATGEN( DESCA( CTXT_ ), 'H', 'N', N, N, DESCA( MB_ ), $ DESCA( NB_ ), COPYA, DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), ISEED( 1 ), $ 0, NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) INFO = 0 WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.9 ) THEN * * Positive definite, eigenvalues specified. * * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+3.0D+0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, N, N, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * WKNOWN = .TRUE. * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS3-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+3.0D+0 ) * ELSE IF( ITYPE.EQ.10 ) THEN * * Block diagonal matrix with each block being a positive * definite tridiagonal submatrix. * CALL PZLASET( 'All', N, N, ZZERO, ZZERO, COPYA, 1, 1, $ DESCA ) NP = NUMROC( N, DESCA( MB_ ), 0, 0, NPROW ) NQ = NUMROC( N, DESCA( NB_ ), 0, 0, NPCOL ) NLOC = MIN( NP, NQ ) NGEN = 0 70 CONTINUE * IF( NGEN.LT.N ) THEN IN = MIN( 1+INT( DLARAN( ISEED )*DBLE( NLOC ) ), N-NGEN ) * CALL ZLATMS( IN, IN, 'S', ISEED, 'P', RWORK( INDD ), $ IMODE, COND, ANORM, 1, 1, 'N', A, LDA, $ WORK( INDWORK ), IINFO ) * DO 80 I = 2, IN TEMP1 = ABS( A( I-1, I ) ) / $ SQRT( ABS( A( I-1, I-1 )*A( I, I ) ) ) IF( TEMP1.GT.HALF ) THEN A( I-1, I ) = HALF*SQRT( ABS( A( I-1, I-1 )*A( I, $ I ) ) ) A( I, I-1 ) = A( I-1, I ) END IF 80 CONTINUE CALL PZELSET( COPYA, NGEN+1, NGEN+1, DESCA, A( 1, 1 ) ) DO 90 I = 2, IN CALL PZELSET( COPYA, NGEN+I, NGEN+I, DESCA, $ A( I, I ) ) CALL PZELSET( COPYA, NGEN+I-1, NGEN+I, DESCA, $ A( I-1, I ) ) CALL PZELSET( COPYA, NGEN+I, NGEN+I-1, DESCA, $ A( I, I-1 ) ) 90 CONTINUE NGEN = NGEN + IN GO TO 70 END IF WKNOWN = .FALSE. * ELSE IF( ITYPE.EQ.11 ) THEN * * Geometrically sized clusters. Eigenvalues: 0,1,1,2,2,2,2, ... * NGEN = 0 J = 1 TEMP1 = ZERO 100 CONTINUE IF( NGEN.LT.N ) THEN IN = MIN( J, N-NGEN ) DO 110 I = 0, IN - 1 RWORK( INDD+NGEN+I ) = TEMP1 110 CONTINUE TEMP1 = TEMP1 + ONE J = 2*J NGEN = NGEN + IN GO TO 100 END IF * * CALL PZFILLPAD( DESCA( CTXT_ ), SIZETMS, 1, WORK( INDWORK ), $ SIZETMS, IPREPAD, IPOSTPAD, PADVAL+4.0D+0 ) * CALL PZLATMS( N, N, 'S', ISEED, 'S', RWORK( INDD ), IMODE, $ COND, ANORM, 0, 0, 'N', COPYA, 1, 1, DESCA, $ ORDER, WORK( INDWORK+IPREPAD ), SIZETMS, $ IINFO ) * CALL PZCHEKPAD( DESCA( CTXT_ ), 'PZLATMS4-WORK', SIZETMS, 1, $ WORK( INDWORK ), SIZETMS, IPREPAD, IPOSTPAD, $ PADVAL+4.0D+0 ) * * * WKNOWN ... NOT SET, GUESS A DEFAULT * WKNOWN = .TRUE. ELSE IINFO = 1 END IF * IF( WKNOWN ) $ CALL DLASRT( 'I', N, RWORK( INDD ), IINFO ) * * * These values aren't actually used, but they make ftncheck happy. * IL = -1 IU = -2 VL = ONE VU = -ONE * CALL PZLASIZEHEEVX( WKNOWN, 'A', N, DESCA, VL, VU, IL, IU, $ ISEED, RWORK( INDD ), MAXSIZE, VECSIZE, $ VALSIZE ) * LHEEVXSIZE = MIN( MAXSIZE, LLRWORK ) * CALL PZSEPSUBTST( WKNOWN, 'v', 'a', UPLO, N, VL, VU, IL, IU, $ THRESH, ABSTOL, A, COPYA, Z, 1, 1, DESCA, $ RWORK( INDD ), WIN, IFAIL, ICLUSTR, GAP, $ IPREPAD, IPOSTPAD, WORK( INDWORK ), LLWORK, $ RWORK( INDRWORK ), LLRWORK, LHEEVXSIZE, $ IWORK, ISIZEHEEVX, RES, TSTNRM, QTQNRM, $ NOUT ) * * * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * res =0 IF( THRESH.LE.ZERO ) THEN PASSED = 'SKIPPED ' INFO = 2 ELSE IF( RES.NE.0 ) THEN PASSED = 'FAILED ' INFO = 1 END IF END IF * IF( THRESH.GT.ZERO .AND. LSAME( SUBTESTS, 'Y' ) ) THEN * * Subtest 1: JOBZ = 'V', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'V' RANGE = 'A' CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 1' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 2: JOBZ = 'V', RANGE = 'A', random memory * IF( INFO.EQ.0 ) THEN JOBZ = 'V' RANGE = 'A' CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN PASSED = 'FAILED stest 2' MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) INFO = 1 END IF END IF * * Subtest 3: JOBZ = 'N', RANGE = 'A', minimum memory * IF( INFO.EQ.0 ) THEN * JOBZ = 'N' RANGE = 'A' CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 3' INFO = 1 END IF END IF * * Subtest 4: JOBZ = 'N', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'N' RANGE = 'I' * * We use PZLASIZEHEEVX to choose IL and IU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 4' INFO = 1 END IF END IF * * Subtest 5: JOBZ = 'V', RANGE = 'I', maximum memory * IF( INFO.EQ.0 ) THEN * IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PZLASIZEHEEVX to choose IL and IU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = MAXSIZE * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 5' INFO = 1 END IF END IF * * Subtest 6: JOBZ = 'V', RANGE = 'I', minimum memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PZLASIZEHEEVX to choose IL and IU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 6' INFO = 1 END IF END IF * * Subtest 7: JOBZ = 'V', RANGE = 'I', random memory * IF( INFO.EQ.0 ) THEN IL = -1 IU = -1 JOBZ = 'V' RANGE = 'I' * * We use PZLASIZEHEEVX to choose IL and IU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) LHEEVXSIZE = VECSIZE + INT( DLARAN( ISEED )* $ DBLE( MAXSIZE-VECSIZE ) ) * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 7' INFO = 1 END IF END IF * * Subtest 8: JOBZ = 'N', RANGE = 'V', minimum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'N' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 8' INFO = 1 END IF END IF * * Subtest 9: JOBZ = 'V', RANGE = 'V', maximum memory * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = MAXSIZE * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest 9' INFO = 1 END IF END IF * * Subtest 10: JOBZ = 'V', RANGE = 'V', * minimum memory required for eigenvectors * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VECSIZE * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest10' INFO = 1 END IF END IF * * Subtest 11: JOBZ = 'V', RANGE = 'V', * random memory (enough for all eigenvectors * but not enough to guarantee orthogonality * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest11' INFO = 1 END IF END IF * * Subtest 12: JOBZ = 'V', RANGE = 'V', * miniimum memory required for eigenvalues only * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * LHEEVXSIZE = VALSIZE * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest12' INFO = 1 END IF END IF * * Subtest 13: JOBZ = 'V', RANGE = 'V', * random memory (more than minimum required * for eigenvalues, less than required for vectors) * IF( INFO.EQ.0 ) THEN VL = ONE VU = -ONE JOBZ = 'V' RANGE = 'V' * * We use PZLASIZEHEEVX to choose VL and VU for us. * CALL PZLASIZEHEEVX( .TRUE., RANGE, N, DESCA, VL, VU, IL, IU, $ ISEED, WIN( 1+IPREPAD ), MAXSIZE, $ VECSIZE, VALSIZE ) * * CALL PZSEPSUBTST( .TRUE., JOBZ, RANGE, UPLO, N, VL, VU, IL, $ IU, THRESH, ABSTOL, A, COPYA, Z, 1, 1, $ DESCA, WIN( 1+IPREPAD ), WNEW, IFAIL, $ ICLUSTR, GAP, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVXSIZE, IWORK, ISIZEHEEVX, RES, $ TSTNRM, QTQNRM, NOUT ) * IF( RES.NE.0 ) THEN MAXTSTNRM = MAX( TSTNRM, MAXTSTNRM ) MAXQTQNRM = MAX( QTQNRM, MAXQTQNRM ) PASSED = 'FAILED stest13' INFO = 1 END IF END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF * * Now that PZHEEVX been tested, we check PZHEEVD * PASSED = 'PASSED EEVD' * * PZHEEVD test1: * IF( INFO.EQ.0 ) THEN * NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( MAX( N, 1 ), NB, 0, 0, NPCOL ) LHEEVDSIZE = 1 + 9*N + 3*NP0*NQ0 ISIZEHEEVD = MAX( 1, 2+7*N+8*NPCOL ) * CALL PZSDPSUBTST( WKNOWN, UPLO, N, THRESH, ABSTOL, A, COPYA, Z, $ 1, 1, DESCA, WIN, WNEW, IPREPAD, IPOSTPAD, $ WORK( INDWORK ), LLWORK, RWORK, LRWORK, $ LHEEVDSIZE, IWORK, ISIZEHEEVD, RES, TSTNRM, $ QTQNRM, NOUT ) * MAXTSTNRM = TSTNRM MAXQTQNRM = QTQNRM * IF( RES.NE.0 ) THEN PASSED = 'FAILED EEVD' INFO = 1 END IF END IF * * * CALL IGAMX2D( CONTEXT, 'All', ' ', 1, 1, INFO, 1, -1, -1, -1, -1, $ -1 ) * IF( INFO.EQ.1 ) THEN IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9994 )'C ' WRITE( NOUT, FMT = 9993 )ISEEDIN( 1 ) WRITE( NOUT, FMT = 9992 )ISEEDIN( 2 ) WRITE( NOUT, FMT = 9991 )ISEEDIN( 3 ) WRITE( NOUT, FMT = 9990 )ISEEDIN( 4 ) IF( LSAME( UPLO, 'L' ) ) THEN WRITE( NOUT, FMT = 9994 )' UPLO= ''L'' ' ELSE WRITE( NOUT, FMT = 9994 )' UPLO= ''U'' ' END IF IF( LSAME( SUBTESTS, 'Y' ) ) THEN WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''Y'' ' ELSE WRITE( NOUT, FMT = 9994 )' SUBTESTS= ''N'' ' END IF WRITE( NOUT, FMT = 9989 )N WRITE( NOUT, FMT = 9988 )NPROW WRITE( NOUT, FMT = 9987 )NPCOL WRITE( NOUT, FMT = 9986 )NB WRITE( NOUT, FMT = 9985 )MATTYPE WRITE( NOUT, FMT = 9982 )ABSTOL WRITE( NOUT, FMT = 9981 )THRESH WRITE( NOUT, FMT = 9994 )'C ' END IF END IF * CALL SLCOMBINE( CONTEXT, 'All', '>', 'W', 6, 1, WTIME ) CALL SLCOMBINE( CONTEXT, 'All', '>', 'C', 6, 1, CTIME ) IF( IAM.EQ.0 ) THEN IF( INFO.EQ.0 .OR. INFO.EQ.1 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9999 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ), MAXTSTNRM, $ MAXQTQNRM, PASSED ELSE WRITE( NOUT, FMT = 9998 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ), MAXTSTNRM, MAXQTQNRM, PASSED END IF ELSE IF( INFO.EQ.2 ) THEN IF( WTIME( 1 ).GE.0.0 ) THEN WRITE( NOUT, FMT = 9997 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, WTIME( 1 ), CTIME( 1 ) ELSE WRITE( NOUT, FMT = 9996 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS, CTIME( 1 ) END IF ELSE IF( INFO.EQ.3 ) THEN WRITE( NOUT, FMT = 9995 )N, NB, NPROW, NPCOL, MATTYPE, $ SUBTESTS END IF END IF 120 CONTINUE * RETURN 9999 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, 1X, A14 ) 9998 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 1X, G9.2, 1X, G9.2, A14 ) 9997 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, F8.2, $ 1X, F8.2, 21X, 'Bypassed' ) 9996 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 1X, 8X, $ 1X, F8.2, 21X, 'Bypassed' ) 9995 FORMAT( 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 3X, A1, 32X, $ 'Bad MEMORY parameters' ) 9994 FORMAT( A ) 9993 FORMAT( ' ISEED( 1 ) =', I8 ) 9992 FORMAT( ' ISEED( 2 ) =', I8 ) 9991 FORMAT( ' ISEED( 3 ) =', I8 ) 9990 FORMAT( ' ISEED( 4 ) =', I8 ) 9989 FORMAT( ' N=', I8 ) 9988 FORMAT( ' NPROW=', I8 ) 9987 FORMAT( ' NPCOL=', I8 ) 9986 FORMAT( ' NB=', I8 ) 9985 FORMAT( ' MATTYPE=', I8 ) 9984 FORMAT( ' IBTYPE=', I8 ) 9983 FORMAT( ' SUBTESTS=', A1 ) 9982 FORMAT( ' ABSTOL=', D16.6 ) 9981 FORMAT( ' THRESH=', D16.6 ) 9980 FORMAT( ' Increase TOTMEM in PZSEPDRIVER' ) * * End of PZSEPTST * END scalapack-2.0.2/TESTING/EIG/pztrddriver.f000644 000766 000024 00000047204 10363532303 020165 0ustar00juliestaff000000 000000 PROGRAM PZTRDDRIVER * * -- ScaLAPACK testing driver (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * Purpose * ======== * * PZTRDDRIVER is the main test program for the COMPLEX*16 * SCALAPACK TRD (symmetric tridiagonal reduction) routines. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 13 lines: * 'ScaLAPACK TRD computation input file' * 'PVM machine' * 'TRD.out' output file name * 6 device out * 'L' define Lower or Upper * 3 number of problems sizes * 5 31 201 values of N * 3 number of NB's * 2 3 5 values of NB * 7 number of process grids (ordered pairs of P & Q) * 1 2 1 4 2 3 8 values of P * 1 2 4 1 3 2 1 values of Q * 1.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 2000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * INTGSZ INTEGER, default = 4 bytes. * ZPLXSZ INTEGER, default = 16 bytes. * INTGSZ and ZPLXSZ indicate the length in bytes on the * given platform for an integer and a double precision * complex. * MEM COMPLEX*16 array, dimension ( TOTMEM / ZPLXSZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, TOTMEM, ZPLXSZ, MEMSIZ, NTESTS COMPLEX*16 PADVAL PARAMETER ( DBLESZ = 8, TOTMEM = 2000000, ZPLXSZ = 16, $ MEMSIZ = TOTMEM / ZPLXSZ, NTESTS = 20, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER UPLO CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPT, IPW, ITEMP, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LCM, LWORK, MYCOL, $ MYROW, N, NB, NDIAG, NGRIDS, NMAT, NNB, NOFFD, $ NOUT, NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ, $ WORKTRD REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), IERR( 1 ), NBVAL( NTESTS ), $ NVAL( NTESTS ), PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZCHEKPAD, $ PZFILLPAD, PZHETDRV, PZHETRD, PZLAFCHK, $ PZMATGEN, PZTRDINFO, PZTTRDTESTER, SLBOOT, $ SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC DOUBLE PRECISION PZLANHE EXTERNAL LSAME, ICEIL, ILCM, NUMROC, PZLANHE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Data statements .. DATA KTESTS, KPASS, KFAIL, KSKIP / 4*0 / * .. * .. Executable Statements .. * IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )STOP * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PZTRDINFO( OUTFILE, NOUT, UPLO, NMAT, NVAL, NTESTS, NNB, $ NBVAL, NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.1 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * NDIAG = NQ IF( LSAME( UPLO, 'U' ) ) THEN NOFFD = NQ ELSE NOFFD = NUMROC( N-1, NB, MYCOL, 0, NPCOL ) END IF NDIAG = ICEIL( DBLESZ*NDIAG, ZPLXSZ ) NOFFD = ICEIL( DBLESZ*NOFFD, ZPLXSZ ) * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPT = IPE + NOFFD + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * LWORK = MAX( NB*( NP+1 ), 3*NB ) WORKTRD = LWORK + IPOSTPAD WORKSIZ = WORKTRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN ITEMP = 2*NQ + NP IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) ITEMP = NB*ICEIL( ICEIL( NP, NB ), LCM / NPROW ) + $ ITEMP END IF ITEMP = MAX( ICEIL( DBLESZ*ITEMP, ZPLXSZ ), $ 2*( NB+NP )*NB ) WORKSIZ = MAX( LWORK, ITEMP ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Tridiagonal reduction', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate the matrix A * CALL PZMATGEN( ICTXT, 'Hemm', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), NQ, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANHE( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to symmetric tridiagonal form * CALL PZHETRD( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), LWORK, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZHETRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETRD', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETRD', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETRD', WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - QTQ'|| / (||A|| * N * eps) * CALL PZHETDRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), IERR( 1 ) ) CALL PZLAFCHK( 'Hemm', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZHETDRV', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETDRV', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETDRV', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETDRV', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ. $ 0.0D+0 .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9986 )FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * )'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * TRD requires 16/3 N^3 floating point operations * NOPS = DBLE( N ) * NOPS = ( 4.0D+0 / 3.0D+0 )*NOPS**3 NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', UPLO, N, NB, $ NPROW, NPCOL, WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', UPLO, N, NB, $ NPROW, NPCOL, CTIME( 1 ), TMFLOPS, FRESID, PASSED END IF 10 CONTINUE 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE * CALL PZTTRDTESTER( IAM, NPROCS, CHECK, NOUT, THRESH, NVAL, NMAT, $ MEM, TOTMEM, KPASS, KFAIL, KSKIP ) * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME UPLO N NB P Q TRD Time ', $ ' MFLOPS Residual CHECK' ) 9994 FORMAT( '---- ---- ------ --- ----- ----- --------- ', $ '----------- -------- ------' ) 9993 FORMAT( A4, 1X, A4, 1X, I6, 1X, I3, 1X, I5, 1X, I5, 1X, F9.2, 1X, $ F11.2, 1X, F8.2, 1X, A6 ) 9992 FORMAT( 'Finished', I4, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||A - Q*T*Q''|| / (||A|| * N * eps) = ', G25.7 ) * STOP * * End of PZTRDDRIVER * END scalapack-2.0.2/TESTING/EIG/pztrdinfo.f000644 000766 000024 00000032247 10363532303 017626 0ustar00juliestaff000000 000000 SUBROUTINE PZTRDINFO( SUMMRY, NOUT, UPLO, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 27, 2000 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, $ NGRIDS, NMAT, NNB, NPROCS, NOUT REAL THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY*(*) INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZTRDINFO gets needed startup information for the Hermitian * tridiagonal reduction and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * UPLO (global output) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (number of columns in matrix) to run the * code with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * = 0 : Perform no error checking * > 0 : report all residuals greater than THRESH. * * WORK (local workspace) INTEGER array, dimension >= * MAX( 5, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ). Used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * Note * ==== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='TRD.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get UPLO * READ( NIN, FMT = * ) UPLO * * Get number of matrices and their dimensions * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'N', LDNVAL GOTO 20 END IF READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * ) NNB IF( NNB.LT.1 .OR. NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'NB', LDNBVAL GOTO 20 END IF READ( NIN, FMT = * ) ( NBVAL( I ), I = 1, NNB ) * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDPVAL GOTO 20 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 ) 'Grids', LDQVAL GOTO 20 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * ) THRESH * * Close input file * CLOSE( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 10 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 10 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS IF( LSAME( UPLO, 'L' ) ) THEN WORK( 4 ) = 1 ELSE WORK( 4 ) = 2 END IF CALL IGEBS2D( ICTXT, 'All', ' ', 4, 1, WORK, 4 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK Reduction Routine to Hermitian '// $ 'tridiagonal form.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the parallel '// $ 'complex double precision Hermitian '// $ 'tridiagonal' WRITE( NOUT, FMT = 9999 ) 'reduction routines.' WRITE( NOUT, FMT = 9999 ) $ 'The following scaled residual '// $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' ||A - QTQ''|| / (||A|| * eps * N)' WRITE( NOUT, FMT = 9999 ) $ 'The matrix A is randomly '// $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'UPLO : Whether the ''Upper'' or ''Low'// $ 'er'' part of A is to be referenced.' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or '// $ 'CPU time was used.' WRITE( NOUT, FMT = 9999 ) $ 'N : The number of rows and columns '// $ 'of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks'// $ ' the matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less'// $ 'than THRESH, CHECK is flagged as PASSED.' WRITE( NOUT, FMT = 9999 ) $ 'TRD time : Time in seconds to reduce the'// $ ' matrix to tridiagonal form.' WRITE( NOUT, FMT = 9999 ) $ 'MFLOPS : Rate of execution for '// $ 'Hermitian tridiagonal reduction.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9999 ) $ ' UPLO : '//UPLO WRITE( NOUT, FMT = 9996 ) $ 'N ', ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 ) $ 'NB ', ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 ) $ 'P ', ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 ) $ 'Q ', ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 ) ( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) EPS WRITE( NOUT, FMT = 9998 ) THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * all processes have needed startup information * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 4, 1, WORK, 4, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) IF( WORK( 4 ).EQ.1 ) THEN UPLO = 'L' ELSE UPLO = 'U' END IF * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9993 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PZTRDINFO * END scalapack-2.0.2/TESTING/EIG/pzttrdtester.f000644 000766 000024 00000062150 10363532303 020361 0ustar00juliestaff000000 000000 SUBROUTINE PZTTRDTESTER( IAM, NPROCS, CHECK, NOUT, THRESH, NVAL, $ NMAT, MEM, TOTMEM, KPASS, KFAIL, KSKIP ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * February 24, 2000 * * .. Scalar Arguments .. LOGICAL CHECK INTEGER IAM, KFAIL, KPASS, KSKIP, NMAT, NOUT, NPROCS, $ TOTMEM REAL THRESH * .. * .. Array Arguments .. INTEGER NVAL( * ) COMPLEX*16 MEM( * ) * .. * * Purpose * ======= * * PZTTRDTESTER tests PZHETTRD * * Arguments * ========= * * IAM (local input) INTEGER * The local process number * * NPROCS (global input) INTEGER * The number of processors * * CHECK (global input) LOGICAL * Specifies whether the user wants to check the answer * * NOUT (local input) INTEGER * File descriptor * * THRESH (global input) DOUBLE PRECISION * Acceptable error threshold * * NVAL (global input) INTEGER array dimension NMAT * The matrix sizes to test * * NMAT (global input) INTEGER * The number of matrix sizes to test * * MEM (local input) COMPLEX*16 array dimension MEMSIZ * Where: * MEMSIZ = TOTMEM / ZPLXSZ * * TOTMEM (global input) INTEGER * Number of bytes in MEM * * KPASS (local input/output) INTEGER * The number of tests which passed. Only relevant on * processor 0. * * KFAIL (local input/output) INTEGER * The number of tests which failed. Only relevant on * processor 0. * * KSKIP (local input/output) INTEGER * The number of tests which were skipped. Only relevant on * processor 0. * * ================================================================ * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, ZPLXSZ COMPLEX*16 PADVAL PARAMETER ( DBLESZ = 8, ZPLXSZ = 16, $ PADVAL = ( -9923.0D+0, -9924.0D+0 ) ) INTEGER TIMETESTS PARAMETER ( TIMETESTS = 11 ) INTEGER TESTS PARAMETER ( TESTS = 8 ) INTEGER MINTIMEN PARAMETER ( MINTIMEN = 8 ) * .. * .. Local Scalars .. LOGICAL TIME CHARACTER UPLO CHARACTER*6 PASSED INTEGER DUMMY, IASEED, ICTXT, IMIDPAD, INFO, IPA, IPD, $ IPE, IPOSTPAD, IPREPAD, IPT, IPW, ITEMP, J, K, $ LCM, LWMIN, MAXTESTS, MEMSIZ, MYCOL, MYROW, N, $ NB, NDIAG, NGRIDS, NN, NOFFD, NP, NPCOL, NPROW, $ NPS, NQ, SPLITSTIMED, WORKSIZ, WORKTRD DOUBLE PRECISION ANORM, FRESID, NOPS, TMFLOPS * .. * .. Local Arrays .. INTEGER ANBTEST( TESTS ), ANBTIME( TIMETESTS ), $ BALTEST( TESTS ), BALTIME( TIMETESTS ), $ DESCA( DLEN_ ), DESCD( DLEN_ ), IERR( 1 ), $ INTERTEST( TESTS ), INTERTIME( TIMETESTS ), $ PNBTEST( TESTS ), PNBTIME( TIMETESTS ), $ TWOGEMMTEST( TESTS ), TWOGEMMTIME( TIMETESTS ) DOUBLE PRECISION CTIME( 100 ), WTIME( 100 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, DESCINIT, $ IGEBR2D, IGEBS2D, IGSUM2D, PZCHEKPAD, $ PZFILLPAD, PZHETDRV, PZHETTRD, PZLAFCHK, $ PZLATRAN, PZMATGEN, SLBOOT, SLCOMBINE, SLTIMER * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, NUMROC, PJLAENV DOUBLE PRECISION PZLANHE EXTERNAL LSAME, ICEIL, ILCM, NUMROC, PJLAENV, PZLANHE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, SQRT * .. * * .. Scalars in Common .. INTEGER ANB, BALANCED, BCKBLOCK, GSTBLOCK, INTERLEAVE, $ LLTBLOCK, MINSZ, PNB, TIMEINTERNALS, TIMING, $ TRSBLOCK, TWOGEMMS * .. * .. Common blocks .. COMMON / BLOCKSIZES / GSTBLOCK, LLTBLOCK, BCKBLOCK, $ TRSBLOCK COMMON / MINSIZE / MINSZ COMMON / PJLAENVTIMING / TIMING COMMON / TAILOREDOPTS / PNB, ANB, INTERLEAVE, $ BALANCED, TWOGEMMS COMMON / TIMECONTROL / TIMEINTERNALS * .. * .. Data statements .. DATA BALTIME / 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0 / DATA INTERTIME / 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1 / DATA TWOGEMMTIME / 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0 / DATA ANBTIME / 16, 16, 16, 16, 16, 8, 8, 32, 32, 16, $ 16 / DATA PNBTIME / 32, 32, 32, 32, 32, 32, 32, 32, 32, $ 16, 64 / DATA BALTEST / 0, 0, 0, 0, 1, 1, 1, 1 / DATA INTERTEST / 0, 0, 1, 1, 0, 0, 1, 1 / DATA TWOGEMMTEST / 0, 1, 0, 1, 0, 1, 0, 1 / DATA ANBTEST / 1, 2, 3, 16, 1, 2, 3, 16 / DATA PNBTEST / 1, 16, 8, 1, 16, 8, 1, 16 / * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * IASEED = 100 SPLITSTIMED = 0 NB = 1 UPLO = 'L' MEMSIZ = TOTMEM / ZPLXSZ * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * NGRIDS = INT( SQRT( DBLE( NPROCS ) ) ) * DO 30 NN = 1, NGRIDS * NPROW = NN NPCOL = NN IERR( 1 ) = 0 * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of loop if this case doesn't use my process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Make sure no one had error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * * Loop over different blocking sizes * IF( N.GT.MINTIMEN ) THEN * * For timing tests, we perform one or two extra tests. * Both of these extra tests are performed with the * default values for the performance tuning parameters. * The second extra test (which is only performed if * split times are non-zero) is performed with timeinternals * set to 1 (which forces barrier syncs between many * phases of the computation). * TIME = .TRUE. MAXTESTS = TIMETESTS + 2 ELSE TIME = .FALSE. MAXTESTS = TESTS END IF * * DO 10 K = 1, MAXTESTS TIMEINTERNALS = 0 IF( TIME ) THEN IF( K.GE.MAXTESTS-1 ) THEN * * For the last two timings, we let pjlaenv set * the execution path values. These dummy * initializations aren't really necessary, * but they illustrate the fact that these values are * set in xpjlaenv. The dummy call to pjlaenv * has the side effect of setting ANB. * MINSZ = -13 BALANCED = -13 INTERLEAVE = -13 TWOGEMMS = -13 ANB = -13 PNB = -13 TIMING = 1 DUMMY = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, $ 0, 0 ) IF( K.EQ.MAXTESTS ) $ TIMEINTERNALS = 1 ELSE TIMING = 0 MINSZ = 1 BALANCED = BALTIME( K ) INTERLEAVE = INTERTIME( K ) TWOGEMMS = TWOGEMMTIME( K ) ANB = ANBTIME( K ) PNB = PNBTIME( K ) END IF ELSE TIMING = 0 MINSZ = 1 BALANCED = BALTEST( K ) INTERLEAVE = INTERTEST( K ) TWOGEMMS = TWOGEMMTEST( K ) ANB = ANBTEST( K ) PNB = PNBTEST( K ) END IF * * Skip the last test (with timeinternals = 1) if * PZHETTRD is not collecting the split times. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'All', ' ', 1, 1, SPLITSTIMED, $ 1 ) ELSE CALL IGEBR2D( ICTXT, 'All', ' ', 1, 1, SPLITSTIMED, 1, $ 0, 0 ) END IF * * IF( SPLITSTIMED.EQ.0 .AND. K.EQ.MAXTESTS ) $ GO TO 10 * * The following hack tests to make sure that PNB need not * be the same on all processes. (Provided that PNB is set * to 1 in the TRD.dat file.) * IF( PNB.EQ.1 ) $ PNB = 1 + IAM * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * CALL DESCINIT( DESCD, 1, N, NB, NB, 0, 0, ICTXT, 1, $ INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * NDIAG = NQ IF( LSAME( UPLO, 'U' ) ) THEN NOFFD = NQ ELSE NOFFD = NUMROC( N-1, NB, MYCOL, 0, NPCOL ) END IF NDIAG = ICEIL( DBLESZ*NDIAG, ZPLXSZ ) NOFFD = ICEIL( DBLESZ*NOFFD, ZPLXSZ ) * IPA = IPREPAD + 1 IPD = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPE = IPD + NDIAG + IPOSTPAD + IPREPAD IPT = IPE + NOFFD + IPOSTPAD + IPREPAD IPW = IPT + NQ + IPOSTPAD + IPREPAD * * Calculate the amount of workspace required for the * reduction * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS * WORKTRD = LWMIN + IPOSTPAD WORKSIZ = WORKTRD * * Figure the amount of workspace required by the check * IF( CHECK ) THEN ITEMP = 2*NQ + NP IF( NPROW.NE.NPCOL ) THEN LCM = ILCM( NPROW, NPCOL ) ITEMP = NB*ICEIL( ICEIL( NP, NB ), LCM / NPROW ) + $ ITEMP END IF ITEMP = MAX( ICEIL( DBLESZ*ITEMP, ZPLXSZ ), $ 2*( NB+NP )*NB ) WORKSIZ = MAX( LWMIN, ITEMP ) + IPOSTPAD END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Tridiagonal reduction', $ ( IPW+WORKSIZ )*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * * * Generate the matrix A * CALL PZMATGEN( ICTXT, 'Hemm', 'N', DESCA( M_ ), $ DESCA( N_ ), DESCA( MB_ ), DESCA( NB_ ), $ MEM( IPA ), DESCA( LLD_ ), DESCA( RSRC_ ), $ DESCA( CSRC_ ), IASEED, 0, NP, 0, NQ, $ MYROW, MYCOL, NPROW, NPCOL ) * * * Need Infinity-norm of A for checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NDIAG, 1, MEM( IPD-IPREPAD ), $ NDIAG, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, NOFFD, 1, MEM( IPE-IPREPAD ), $ NOFFD, IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, NQ, 1, MEM( IPT-IPREPAD ), NQ, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANHE( 'I', UPLO, N, MEM( IPA ), 1, 1, $ DESCA, MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANHE', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, WORKTRD-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKTRD-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) END IF * CALL SLBOOT CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Reduce to symmetric tridiagonal form * CALL PZHETTRD( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), LWMIN, INFO ) * CALL SLTIMER( 1 ) * IF( CHECK ) THEN * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZHETTRD', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETTRD', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PZCHEKPAD( ICTXT, 'PZHETTRDc', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETTRDd', NQ, 1, $ MEM( IPT-IPREPAD ), NQ, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETTRDe', WORKTRD-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKTRD-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute fctres = ||A - QTQ'|| / (||A|| * N * eps) * CALL PZHETDRV( UPLO, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPD ), MEM( IPE ), MEM( IPT ), $ MEM( IPW ), IERR( 1 ) ) * * TTRD does not preserve the upper triangular part of A. * The following call to PZLATRAN means that we only * check the lower triangular part of A - QTQ' * CALL PZLATRAN( N, 1, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZLAFCHK( 'Hemm', 'No', N, N, MEM( IPA ), 1, 1, $ DESCA, IASEED, ANORM, FRESID, $ MEM( IPW ) ) * * Check for memory overwrite * CALL PZCHEKPAD( ICTXT, 'PZHETDRVf', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETDRVg', NDIAG, 1, $ MEM( IPD-IPREPAD ), NDIAG, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETDRVh', NOFFD, 1, $ MEM( IPE-IPREPAD ), NOFFD, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZHETDRVi', WORKSIZ-IPOSTPAD, $ 1, MEM( IPW-IPREPAD ), $ WORKSIZ-IPOSTPAD, IPREPAD, IPOSTPAD, $ PADVAL ) * * Test residual and detect NaN result * IF( FRESID.LE.THRESH .AND. FRESID-FRESID.EQ. $ 0.0D+0 .AND. IERR( 1 ).EQ.0 ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9991 )FRESID KFAIL = KFAIL + 1 PASSED = 'FAILED' * * END IF * * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 .AND. IERR( 1 ).NE.0 ) $ WRITE( NOUT, FMT = * )'D or E copies incorrect ...' ELSE * * Don't perform the checking, only the timing operation * KPASS = KPASS + 1 FRESID = FRESID - FRESID PASSED = 'BYPASS' END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 50, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 50, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * TRD requires 16/3 N^3 floating point operations * NOPS = DBLE( N ) NOPS = ( 16.0D+0 / 3.0D+0 )*NOPS**3 NOPS = NOPS / 1.0D+6 * * Print WALL time * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / WTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9992 )'WALL', N, INTERLEAVE, $ TWOGEMMS, BALANCED, ANB, PNB, NPROW*NPCOL, $ WTIME( 1 ), TMFLOPS, FRESID, PASSED * * Print CPU time * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / CTIME( 1 ) ELSE TMFLOPS = 0.0D+0 END IF IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9992 )'CPU ', N, INTERLEAVE, $ TWOGEMMS, BALANCED, ANB, PNB, NPROW*NPCOL, $ CTIME( 1 ), TMFLOPS, FRESID, PASSED * * * If split times were collected (in PZHEttrd.f), print * them out. * IF( WTIME( 13 )+WTIME( 15 )+WTIME( 16 ).GT.0.0D+0 .OR. $ CTIME( 13 )+CTIME( 15 )+CTIME( 16 ).GT.0.0D+0 ) $ THEN SPLITSTIMED = 1 END IF IF( SPLITSTIMED.EQ.1 ) THEN WRITE( NOUT, FMT = 9990 )WTIME( 10 ), WTIME( 11 ), $ WTIME( 12 ), WTIME( 13 ), WTIME( 14 ), $ WTIME( 15 ) WRITE( NOUT, FMT = 9989 )WTIME( 16 ), WTIME( 17 ), $ WTIME( 18 ), WTIME( 19 ), WTIME( 20 ), $ WTIME( 21 ) * WRITE( NOUT, FMT = 9988 )CTIME( 10 ), CTIME( 11 ), $ CTIME( 12 ), CTIME( 13 ), CTIME( 14 ), $ CTIME( 15 ) WRITE( NOUT, FMT = 9987 )CTIME( 16 ), CTIME( 17 ), $ CTIME( 18 ), CTIME( 19 ), CTIME( 20 ), $ CTIME( 21 ) WRITE( NOUT, FMT = 9986 )N, NPROW*NPCOL, PNB, ANB, $ INTERLEAVE, BALANCED, TWOGEMMS, TIMEINTERNALS END IF END IF 10 CONTINUE 20 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN IF( SPLITSTIMED.EQ.1 ) THEN WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = 9974 ) WRITE( NOUT, FMT = 9973 ) END IF END IF * * CALL BLACS_GRIDEXIT( ICTXT ) 30 CONTINUE RETURN * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) * 9995 FORMAT( 'PZHETTRD, tailored reduction to tridiagonal form, test.' $ ) 9994 FORMAT( 'TIME N int 2gm bal anb pnb prcs TRD Time ', $ ' MFLOPS Residual CHECK' ) 9993 FORMAT( '---- ---- --- --- --- --- --- ---- -------- ', $ '----------- -------- ------' ) 9992 FORMAT( A4, 1X, I5, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 1X, I3, 1X, $ I5, 1X, F9.2, 1X, F11.2, 1X, F8.2, 1X, A6 ) 9991 FORMAT( '||A - Q*T*Q''|| / (||A|| * N * eps) = ', G25.7 ) 9990 FORMAT( 'wsplit1=[wsplit1;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9989 FORMAT( 'wsplit2=[wsplit2;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9988 FORMAT( 'csplit1=[csplit1;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9987 FORMAT( 'csplit2=[csplit2;', F9.2, 1X, F9.2, 1X, F9.2, 1X, F9.2, $ 1X, F9.2, 1X, F9.2, ' ];' ) 9986 FORMAT( 'size_opts=[size_opts;', I4, 1X, I4, 1X, I4, 1X, I4, 1X, $ I4, 1X, I4, 1X, I4, 1X, I4, 1X, ' ];' ) 9985 FORMAT( 'N=1; NPROCS=2; PNB=3; ANB=4; INTERLEAVE=5; BALANCED=6;', $ ' TWOGEMMS=7; TIMEINTERNALS=8;' ) 9984 FORMAT( 'S1_OVERHEAD = 1; % Should be mainly cost of barrier' ) 9983 FORMAT( 'S1_BARRIER = 2; % Cost of barrier' ) 9982 FORMAT( 'S1_UPDCURCOL = 3; % Update the current column' ) 9981 FORMAT( 'S1_HOUSE = 4; % Compute the householder vector' ) 9980 FORMAT( 'S1_SPREAD = 5; % Spread across' ) 9979 FORMAT( 'S1_TRANSPOSE = 6; % Transpose' ) 9978 FORMAT( 'S2_UPDCURBLK = 1; % Update the current block column' ) 9977 FORMAT( 'S2_TRMVT = 2; % TRMVT v = A * h; vt = ht * A'' ' ) 9976 FORMAT( 'S2_UPD_V = 3; % v = v + V * HT * h + H * VT * h ' ) 9975 FORMAT( 'S2_TRANS_SUM = 4; % v = v + vt'' ' ) 9974 FORMAT( 'S2_DOT = 5; % c = v'' * h ' ) 9973 FORMAT( 'S2_R2K = 6; % A = A - v * h'' - h * v'' ' ) * * * End of PZTTRDTESTER * END scalapack-2.0.2/TESTING/EIG/xpjlaenv.f000644 000766 000024 00000034401 11622500733 017431 0ustar00juliestaff000000 000000 INTEGER FUNCTION PJLAENV( ICTXT, ISPEC, NAME, OPTS, N1, $ N2, N3, N4 ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 2, 2000 * * .. Scalar Arguments .. CHARACTER*( * ) NAME, OPTS INTEGER ICTXT, ISPEC, N1, N2, N3, N4 * .. * * xpjlaenv.f versus pjlaenv.f * =========================== * * xpjlaenv.f is used during testing to allow the timer/tester to * control pjlaenv's return values by setting common variables. * xpjlaenv.f guarantees that the return value is the same as the * corresponding value in common. xpjlaenv.f either reads values * from common and uses them as return values or it writes the * return value to common. Either way, xpjlaenv.f's return * value and the correpsonding value in common will always match. * * When the common variable "TIMING" is set, the other common * variables are set to the values returned by xpjlaenv.f, else * xpjlaenv.f returns the values as set in common. * * Purpose * * ======= * * PJLAENV is called from the ScaLAPACK symmetric and Hermitian * tailored eigen-routines to choose * problem-dependent parameters for the local environment. See ISPEC * for a description of the parameters. * * This version provides a set of parameters which should give good, * but not optimal, performance on many of the currently available * computers. Users are encouraged to modify this subroutine to set * the tuning parameters for their particular machine using the option * and problem size information in the arguments. * * This routine will not function correctly if it is converted to all * lower case. Converting it to all upper case is allowed. * * Arguments * ========= * * ISPEC (global input) INTEGER * Specifies the parameter to be returned as the value of * PJLAENV. * = 1: the data layout blocksize; * = 2: the panel blocking factor; * = 3: the algorithmic blocking factor; * = 4: execution path control; * = 5: maximum size for direct call to the LAPACK routine * * NAME (global input) CHARACTER*(*) * The name of the calling subroutine, in either upper case or * lower case. * * OPTS (global input) CHARACTER*(*) * The character options to the subroutine NAME, concatenated * into a single character string. For example, UPLO = 'U', * TRANS = 'T', and DIAG = 'N' for a triangular routine would * be specified as OPTS = 'UTN'. * * N1 (global input) INTEGER * N2 (global input) INTEGER * N3 (global input) INTEGER * N4 (global input) INTEGER * Problem dimensions for the subroutine NAME; these may not all * be required. * * At present, only N1 is used, and it (N1) is used only for * 'TTRD' * * (PJLAENV) (global or local output) INTEGER * >= 0: the value of the parameter specified by ISPEC * < 0: if PJLAENV = -k, the k-th argument had an illegal * value. * * Most parameters set via a call to PJLAENV must be identical * on all processors and hence PJLAENV will return the same * value to all procesors (i.e. global output). However some, * in particular, the panel blocking factor can be different * on each processor and hence PJLAENV can return different * values on different processors (i.e. local output). * * Further Details * =============== * * The following conventions have been used when calling PJLAENV from * the ScaLAPACK routines: * 1) OPTS is a concatenation of all of the character options to * subroutine NAME, in the same order that they appear in the * argument list for NAME, even if they are not used in determining * the value of the parameter specified by ISPEC. * 2) The problem dimensions N1, N2, N3, N4 are specified in the order * that they appear in the argument list for NAME. N1 is used * first, N2 second, and so on, and unused problem dimensions are * passed a value of -1. * 3) The parameter value returned by PJLAENV is checked for validity * in the calling subroutine. For example, PJLAENV is used to * retrieve the optimal blocksize for STRTRI as follows: * * NB = PJLAENV( 1, 'STRTRI', UPLO // DIAG, N, -1, -1, -1 ) * IF( NB.LE.1 ) NB = MAX( 1, N ) * * PJLAENV is patterned after ILAENV and keeps the same interface in * anticipation of future needs, even though PJLAENV is only sparsely * used at present in ScaLAPACK. Most ScaLAPACK codes use the input * data layout blocking factor as the algorithmic blocking factor - * hence there is no need or opportunity to set the algorithmic or * data decomposition blocking factor. * * pXYYtevx.f and pXYYtgvx.f and pXYYttrd.f are the only codes which * call PJLAENV in this release. pXYYtevx.f and pXYYtgvx.f redistribute * the data to the best data layout for each transformation. pXYYttrd.f * uses a data layout blocking factor of 1 and a * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL CNAME, GLOBAL, SNAME, TIME CHARACTER C1 CHARACTER*2 C2, C4 CHARACTER*3 C3 CHARACTER*8 SUBNAM INTEGER I, IC, IDUMM, IZ, MSZ, NB * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR * .. * * * .. Scalars in Common .. INTEGER ANB, BALANCED, BCKBLOCK, GSTBLOCK, INTERLEAVE, $ LLTBLOCK, MINSZ, PNB, TIMING, TRSBLOCK, $ TWOGEMMS * .. * .. External Subroutines .. EXTERNAL IGAMX2D * .. * .. Common blocks .. COMMON / BLOCKSIZES / GSTBLOCK, LLTBLOCK, BCKBLOCK, $ TRSBLOCK COMMON / MINSIZE / MINSZ COMMON / PJLAENVTIMING / TIMING COMMON / TAILOREDOPTS / PNB, ANB, INTERLEAVE, $ BALANCED, TWOGEMMS * .. * .. Executable Statements .. * TIME = ( TIMING.EQ.1 ) * * GO TO ( 10, 10, 10, 10, 10 )ISPEC * * Invalid value for ISPEC * PJLAENV = -1 RETURN * 10 CONTINUE * * Convert NAME to upper case if the first character is lower case. * PJLAENV = 1 SUBNAM = NAME IC = ICHAR( SUBNAM( 1: 1 ) ) IZ = ICHAR( 'Z' ) IF( IZ.EQ.100 .OR. IZ.EQ.122 ) THEN * * ASCII character set * IF( IC.GE.97 .AND. IC.LE.122 ) THEN SUBNAM( 1: 1 ) = CHAR( IC-32 ) DO 20 I = 2, 6 IC = ICHAR( SUBNAM( I: I ) ) IF( IC.GE.97 .AND. IC.LE.122 ) $ SUBNAM( I: I ) = CHAR( IC-32 ) 20 CONTINUE END IF * ELSE IF( IZ.EQ.233 .OR. IZ.EQ.169 ) THEN * * EBCDIC character set * IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR. $ ( IC.GE.145 .AND. IC.LE.153 ) .OR. $ ( IC.GE.162 .AND. IC.LE.169 ) ) THEN SUBNAM( 1: 1 ) = CHAR( IC+64 ) DO 30 I = 2, 6 IC = ICHAR( SUBNAM( I: I ) ) IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR. $ ( IC.GE.145 .AND. IC.LE.153 ) .OR. $ ( IC.GE.162 .AND. IC.LE.169 ) )SUBNAM( I: $ I ) = CHAR( IC+64 ) 30 CONTINUE END IF * ELSE IF( IZ.EQ.218 .OR. IZ.EQ.250 ) THEN * * Prime machines: ASCII+128 * IF( IC.GE.225 .AND. IC.LE.250 ) THEN SUBNAM( 1: 1 ) = CHAR( IC-32 ) DO 40 I = 2, 6 IC = ICHAR( SUBNAM( I: I ) ) IF( IC.GE.225 .AND. IC.LE.250 ) $ SUBNAM( I: I ) = CHAR( IC-32 ) 40 CONTINUE END IF END IF * C1 = SUBNAM( 2: 2 ) SNAME = C1.EQ.'S' .OR. C1.EQ.'D' CNAME = C1.EQ.'C' .OR. C1.EQ.'Z' IF( .NOT.( CNAME .OR. SNAME ) ) $ RETURN C2 = SUBNAM( 3: 4 ) C3 = SUBNAM( 5: 7 ) C4 = C3( 2: 3 ) * * This is to keep ftnchek happy * IF( ( N2+N3+N4 )*0.NE.0 ) THEN C4 = OPTS C3 = C4 END IF * GO TO ( 50, 60, 70, 80, 90 )ISPEC * 50 CONTINUE * * ISPEC = 1: data layout block size * (global - all processes must use the same value) * * In these examples, separate code is provided for setting NB for * real and complex. We assume that NB will take the same value in * single or double precision. * NB = 1 * IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'LLT' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF IF( TIME ) THEN LLTBLOCK = NB ELSE NB = LLTBLOCK IF( NB.LE.0 ) THEN PRINT *, 'xpjlaenv.f ERROR common variable LLTBLOCK', $ ' may be unitialized' c CALL EXIT( 13 ) STOP END IF END IF ELSE IF( C3.EQ.'TTR' ) THEN IF( SNAME ) THEN NB = 1 ELSE NB = 1 END IF ELSE IF( C3.EQ.'GST' ) THEN IF( SNAME ) THEN NB = 32 ELSE NB = 32 END IF IF( TIME ) THEN GSTBLOCK = NB ELSE NB = GSTBLOCK IF( NB.LE.0 ) THEN PRINT *, 'xpjlaenv.f ERROR common variable GSTBLOCK', $ ' may be unitialized' c CALL EXIT( 13 ) STOP END IF END IF ELSE IF( C3.EQ.'BCK' ) THEN IF( SNAME ) THEN NB = 32 ELSE NB = 32 END IF IF( TIME ) THEN BCKBLOCK = NB ELSE NB = BCKBLOCK IF( NB.LE.0 ) THEN PRINT *, 'xpjlaenv.f ERROR common variable BCKBLOCK', $ ' may be unitialized' c CALL EXIT( 13 ) STOP END IF END IF ELSE IF( C3.EQ.'TRS' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF IF( TIME ) THEN TRSBLOCK = NB ELSE NB = TRSBLOCK IF( NB.LE.0 ) THEN PRINT *, 'xpjlaenv.f ERROR common variable TRSBLOCK', $ ' may be unitialized' c CALL EXIT( 13 ) STOP END IF END IF END IF END IF * * PJLAENV = NB GLOBAL = .TRUE. GO TO 100 * 60 CONTINUE * * ISPEC = 2: panel blocking factor (Used only in PxyyTTRD) * (local - different processes may use different values) * NB = 16 IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'TTR' ) THEN IF( SNAME ) THEN NB = 32 ELSE NB = 32 END IF END IF END IF IF( TIME ) THEN PNB = NB ELSE NB = PNB IF( NB.LE.0 ) THEN PRINT *, 'xpjlaenv.f ERROR common variable PNB', $ ' may be unitialized' c CALL EXIT( 13 ) STOP END IF END IF PJLAENV = NB GLOBAL = .FALSE. GO TO 100 * * 70 CONTINUE * * ISPEC = 3: algorithmic blocking factor (Used only in PxyyTTRD) * (global - all processes must use the same value) * NB = 16 NB = 1 IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'TTR' ) THEN IF( SNAME ) THEN NB = 16 ELSE NB = 16 END IF END IF END IF IF( TIME ) THEN ANB = NB ELSE NB = ANB IF( NB.LE.0 ) THEN PRINT *, 'xpjlaenv.f ERROR common variable ANB', $ ' may be unitialized' c CALL EXIT( 13 ) STOP END IF END IF PJLAENV = NB GLOBAL = .TRUE. GO TO 100 * 80 CONTINUE * * ISPEC = 4: Execution path options (Used only in PxyyTTRD) * (global - all processes must use the same value) * PJLAENV = -4 IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'TTR' ) THEN * V and H interleaved (default is not interleaved) IF( N1.EQ.1 ) THEN PJLAENV = 1 IF( TIME ) THEN INTERLEAVE = PJLAENV ELSE PJLAENV = INTERLEAVE END IF END IF * * Two ZGEMMs (default is one ZGEMM) IF( N1.EQ.2 ) THEN PJLAENV = 0 IF( TIME ) THEN TWOGEMMS = PJLAENV ELSE PJLAENV = TWOGEMMS END IF END IF * Balanced Update (default is minimum communication update) IF( N1.EQ.3 ) THEN PJLAENV = 0 IF( TIME ) THEN BALANCED = PJLAENV ELSE PJLAENV = BALANCED END IF END IF END IF END IF GLOBAL = .TRUE. GO TO 100 * 90 CONTINUE * * ISPEC = 5: Minimum size to justify call to parallel code * (global - all processes must use the same value) * MSZ = 0 IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'TTR' ) THEN IF( SNAME ) THEN MSZ = 100 ELSE MSZ = 100 END IF END IF END IF IF( TIME ) THEN MINSZ = MSZ ELSE MSZ = MINSZ END IF PJLAENV = MSZ GLOBAL = .TRUE. GO TO 100 * 100 CONTINUE * IF( GLOBAL ) THEN IDUMM = 0 CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, PJLAENV, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) END IF * * * RETURN * * End of PJLAENV * END scalapack-2.0.2/SRC/bdlaapp.f000644 000766 000024 00000011443 11654534541 016125 0ustar00juliestaff000000 000000 SUBROUTINE BDLAAPP( ISIDE, M, N, NB, A, LDA, NITRAF, ITRAF, $ DTRAF, WORK ) IMPLICIT NONE * * .. Scalar Arguments .. INTEGER ISIDE, LDA, M, N, NB, NITRAF * .. * .. Array Arguments .. INTEGER ITRAF( * ) DOUBLE PRECISION A( LDA, * ), DTRAF( * ), WORK( * ) * * * Purpose * ======= * * BDLAAPP computes * * B = Q**T * A or B = A * Q, * * where A is an M-by-N matrix and Q is an orthogonal matrix represented * by the parameters in the arrays ITRAF and DTRAF as described in * BDTREXC. * * This is an auxiliary routine called by BDTRSEN. * * Arguments * ========= * * ISIDE (input) INTEGER * Specifies whether Q multiplies A from the left or right as * follows: * = 0: compute B = Q**T * A; * = 1: compute B = A * Q. * * M (input) INTEGER * The number of rows of A. * * N (input) INTEGER * The number of columns of A. * * NB (input) INTEGER * If ISIDE = 0, the Q is applied block column-wise to the rows * of A and NB specifies the maximal width of the block columns. * If ISIDE = 1, this variable is not referenced. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the matrix A. * On exit, A is overwritten by B. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * NITRAF (input) INTEGER * Length of the array ITRAF. NITRAF >= 0. * * ITRAF (input) INTEGER array, length NITRAF * List of parameters for representing the transformation * matrix Q, see BDTREXC. * * DTRAF (output) DOUBLE PRECISION array, length k, where * List of parameters for representing the transformation * matrix Q, see BDTREXC. * * WORK (workspace) DOUBLE PRECISION array, dimension (N) * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IT, J, NNB, PD DOUBLE PRECISION TAU * .. * .. External Subroutines .. EXTERNAL DLARFX, DROT * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Quick return if possible. * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * IF( ISIDE.EQ.0 ) THEN * * Apply Q from left. * DO 20 J = 1, N, NB PD = 1 NNB = MIN( NB, N - J + 1 ) DO 10 I = 1, NITRAF IT = ITRAF(I) IF( IT.LE.M ) THEN * * Apply Givens rotation. * CALL DROT( NNB, A(IT,J), LDA, A(IT+1,J), LDA, $ DTRAF(PD), DTRAF(PD+1) ) PD = PD + 2 ELSE IF( IT.LE.2*M ) THEN * * Apply Householder reflector of first kind. * TAU = DTRAF(PD) DTRAF(PD) = ONE CALL DLARFX( 'Left', 3, NNB, DTRAF(PD), TAU, $ A(IT-M,J), LDA, WORK ) DTRAF(PD) = TAU PD = PD + 3 ELSE * * Apply Householder reflector of second kind. * TAU = DTRAF(PD+2) DTRAF(PD+2) = ONE CALL DLARFX( 'Left', 3, NNB, DTRAF(PD), TAU, $ A(IT-2*M,J), LDA, WORK ) DTRAF(PD+2) = TAU PD = PD + 3 END IF 10 CONTINUE 20 CONTINUE ELSE PD = 1 DO 30 I = 1, NITRAF IT = ITRAF(I) IF( IT.LE.N ) THEN * * Apply Givens rotation. * CALL DROT( M, A(1,IT), 1, A(1,IT+1), 1, DTRAF(PD), $ DTRAF(PD+1) ) PD = PD + 2 ELSE IF( IT.LE.2*N ) THEN * * Apply Householder reflector of first kind. * TAU = DTRAF(PD) DTRAF(PD) = ONE CALL DLARFX( 'Right', M, 3, DTRAF(PD), TAU, A(1,IT-N), $ LDA, WORK ) DTRAF(PD) = TAU PD = PD + 3 ELSE * * Apply Householder reflector of second kind. * TAU = DTRAF(PD+2) DTRAF(PD+2) = ONE CALL DLARFX( 'Right', M, 3, DTRAF(PD), TAU, A(1,IT-2*N), $ LDA, WORK ) DTRAF(PD+2) = TAU PD = PD + 3 END IF 30 CONTINUE END IF * RETURN * * End of BDLAAPP * END scalapack-2.0.2/SRC/bdlaexc.f000644 000766 000024 00000025546 11750130340 016117 0ustar00juliestaff000000 000000 SUBROUTINE BDLAEXC( N, T, LDT, J1, N1, N2, ITRAF, DTRAF, WORK, $ INFO ) IMPLICIT NONE * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER INFO, J1, LDT, N, N1, N2 * .. * .. Array Arguments .. INTEGER ITRAF( * ) DOUBLE PRECISION DTRAF( * ), T( LDT, * ), WORK( * ) * .. * * Purpose * ======= * * BDLAEXC swaps adjacent diagonal blocks T11 and T22 of order 1 or 2 in * an upper quasi-triangular matrix T by an orthogonal similarity * transformation. * * In contrast to the LAPACK routine DLAEXC, the orthogonal * transformation matrix Q is not explicitly constructed but * represented by paramaters contained in the arrays ITRAF and DTRAF, * see the description of BDTREXC for more details. * * T must be in Schur canonical form, that is, block upper triangular * with 1-by-1 and 2-by-2 diagonal blocks; each 2-by-2 diagonal block * has its diagonal elemnts equal and its off-diagonal elements of * opposite sign. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix T. N >= 0. * * T (input/output) DOUBLE PRECISION array, dimension (LDT,N) * On entry, the upper quasi-triangular matrix T, in Schur * canonical form. * On exit, the updated matrix T, again in Schur canonical form. * * LDT (input) INTEGER * The leading dimension of the array T. LDT >= max(1,N). * * J1 (input) INTEGER * The index of the first row of the first block T11. * * N1 (input) INTEGER * The order of the first block T11. N1 = 0, 1 or 2. * * N2 (input) INTEGER * The order of the second block T22. N2 = 0, 1 or 2. * * ITRAF (output) INTEGER array, length k, where * k = 1, if N1+N2 = 2; * k = 2, if N1+N2 = 3; * k = 4, if N1+N2 = 4. * List of parameters for representing the transformation * matrix Q, see BDTREXC. * * DTRAF (output) DOUBLE PRECISION array, length k, where * k = 2, if N1+N2 = 2; * k = 5, if N1+N2 = 3; * k = 10, if N1+N2 = 4. * List of parameters for representing the transformation * matrix Q, see BDTREXC. * * WORK (workspace) DOUBLE PRECISION array, dimension (N) * * INFO (output) INTEGER * = 0: successful exit * = 1: the transformed matrix T would be too far from Schur * form; the blocks are not swapped and T and Q are * unchanged. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) DOUBLE PRECISION TEN PARAMETER ( TEN = 1.0D+1 ) INTEGER LDD, LDX PARAMETER ( LDD = 4, LDX = 2 ) * .. * .. Local Scalars .. INTEGER IERR, J2, J3, J4, K, LD, LI, ND DOUBLE PRECISION CS, DNORM, EPS, SCALE, SMLNUM, SN, T11, T22, $ T33, TAU, TAU1, TAU2, TEMP, THRESH, WI1, WI2, $ WR1, WR2, XNORM * .. * .. Local Arrays .. DOUBLE PRECISION D( LDD, 4 ), X( LDX, 2 ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLANGE EXTERNAL DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DLAMOV, DLANV2, DLARFG, DLARFX, DLARTG, DLASY2, $ DROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( N.EQ.0 .OR. N1.EQ.0 .OR. N2.EQ.0 ) $ RETURN IF( J1+N1.GT.N ) $ RETURN * J2 = J1 + 1 J3 = J1 + 2 J4 = J1 + 3 * IF( N1.EQ.1 .AND. N2.EQ.1 ) THEN * * Swap two 1-by-1 blocks. * T11 = T( J1, J1 ) T22 = T( J2, J2 ) * * Determine the transformation to perform the interchange. * CALL DLARTG( T( J1, J2 ), T22-T11, CS, SN, TEMP ) * * Apply transformation to the matrix T. * IF( J3.LE.N ) $ CALL DROT( N-J1-1, T( J1, J3 ), LDT, T( J2, J3 ), LDT, CS, $ SN ) CALL DROT( J1-1, T( 1, J1 ), 1, T( 1, J2 ), 1, CS, SN ) * T( J1, J1 ) = T22 T( J2, J2 ) = T11 * ITRAF( 1 ) = J1 DTRAF( 1 ) = CS DTRAF( 2 ) = SN * ELSE * * Swapping involves at least one 2-by-2 block. * * Copy the diagonal block of order N1+N2 to the local array D * and compute its norm. * ND = N1 + N2 CALL DLAMOV( 'Full', ND, ND, T( J1, J1 ), LDT, D, LDD ) DNORM = DLANGE( 'Max', ND, ND, D, LDD, WORK ) * * Compute machine-dependent threshold for test for accepting * swap. * EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) / EPS THRESH = MAX( TEN*EPS*DNORM, SMLNUM ) * * Solve T11*X - X*T22 = scale*T12 for X. * CALL DLASY2( .FALSE., .FALSE., -1, N1, N2, D, LDD, $ D( N1+1, N1+1 ), LDD, D( 1, N1+1 ), LDD, SCALE, X, $ LDX, XNORM, IERR ) * * Swap the adjacent diagonal blocks. * K = N1 + N1 + N2 - 3 GO TO ( 10, 20, 30 )K * 10 CONTINUE * * N1 = 1, N2 = 2: generate elementary reflector H so that: * * ( scale, X11, X12 ) H = ( 0, 0, * ) * DTRAF( 1 ) = SCALE DTRAF( 2 ) = X( 1, 1 ) DTRAF( 3 ) = X( 1, 2 ) CALL DLARFG( 3, DTRAF( 3 ), DTRAF, 1, TAU ) DTRAF( 3 ) = ONE T11 = T( J1, J1 ) * * Perform swap provisionally on diagonal block in D. * CALL DLARFX( 'Left', 3, 3, DTRAF, TAU, D, LDD, WORK ) CALL DLARFX( 'Right', 3, 3, DTRAF, TAU, D, LDD, WORK ) * * Test whether to reject swap. * IF( MAX( ABS( D( 3, 1 ) ), ABS( D( 3, 2 ) ), ABS( D( 3, $ 3 )-T11 ) ).GT.THRESH )GO TO 50 * * Accept swap: apply transformation to the entire matrix T. * CALL DLARFX( 'Left', 3, N-J1+1, DTRAF, TAU, T( J1, J1 ), LDT, $ WORK ) CALL DLARFX( 'Right', J2, 3, DTRAF, TAU, T( 1, J1 ), LDT, $ WORK ) * T( J3, J1 ) = ZERO T( J3, J2 ) = ZERO T( J3, J3 ) = T11 * ITRAF( 1 ) = 2*N + J1 LI = 2 DTRAF( 3 ) = TAU LD = 4 GO TO 40 * 20 CONTINUE * * N1 = 2, N2 = 1: generate elementary reflector H so that: * * H ( -X11 ) = ( * ) * ( -X21 ) = ( 0 ) * ( scale ) = ( 0 ) * DTRAF( 1 ) = -X( 1, 1 ) DTRAF( 2 ) = -X( 2, 1 ) DTRAF( 3 ) = SCALE CALL DLARFG( 3, DTRAF( 1 ), DTRAF( 2 ), 1, TAU ) DTRAF( 1 ) = ONE T33 = T( J3, J3 ) * * Perform swap provisionally on diagonal block in D. * CALL DLARFX( 'Left', 3, 3, DTRAF, TAU, D, LDD, WORK ) CALL DLARFX( 'Right', 3, 3, DTRAF, TAU, D, LDD, WORK ) * * Test whether to reject swap. * IF( MAX( ABS( D( 2, 1 ) ), ABS( D( 3, 1 ) ), ABS( D( 1, $ 1 )-T33 ) ).GT.THRESH )GO TO 50 * * Accept swap: apply transformation to the entire matrix T. * CALL DLARFX( 'Right', J3, 3, DTRAF, TAU, T( 1, J1 ), LDT, $ WORK ) CALL DLARFX( 'Left', 3, N-J1, DTRAF, TAU, T( J1, J2 ), LDT, $ WORK ) * T( J1, J1 ) = T33 T( J2, J1 ) = ZERO T( J3, J1 ) = ZERO * ITRAF( 1 ) = N + J1 LI = 2 DTRAF( 1 ) = TAU LD = 4 GO TO 40 * 30 CONTINUE * * N1 = 2, N2 = 2: generate elementary reflectors H(1) and H(2) so * that: * * H(2) H(1) ( -X11 -X12 ) = ( * * ) * ( -X21 -X22 ) ( 0 * ) * ( scale 0 ) ( 0 0 ) * ( 0 scale ) ( 0 0 ) * DTRAF( 1 ) = -X( 1, 1 ) DTRAF( 2 ) = -X( 2, 1 ) DTRAF( 3 ) = SCALE CALL DLARFG( 3, DTRAF( 1 ), DTRAF( 2 ), 1, TAU1 ) DTRAF( 1 ) = ONE * TEMP = -TAU1*( X( 1, 2 )+DTRAF( 2 )*X( 2, 2 ) ) DTRAF( 4 ) = -TEMP*DTRAF( 2 ) - X( 2, 2 ) DTRAF( 5 ) = -TEMP*DTRAF( 3 ) DTRAF( 6 ) = SCALE CALL DLARFG( 3, DTRAF( 4 ), DTRAF( 5 ), 1, TAU2 ) DTRAF( 4 ) = ONE * * Perform swap provisionally on diagonal block in D. * CALL DLARFX( 'Left', 3, 4, DTRAF, TAU1, D, LDD, WORK ) CALL DLARFX( 'Right', 4, 3, DTRAF, TAU1, D, LDD, WORK ) CALL DLARFX( 'Left', 3, 4, DTRAF( 4 ), TAU2, D( 2, 1 ), LDD, $ WORK ) CALL DLARFX( 'Right', 4, 3, DTRAF( 4 ), TAU2, D( 1, 2 ), LDD, $ WORK ) * * Test whether to reject swap. * IF( MAX( ABS( D( 3, 1 ) ), ABS( D( 3, 2 ) ), ABS( D( 4, 1 ) ), $ ABS( D( 4, 2 ) ) ).GT.THRESH )GO TO 50 * * Accept swap: apply transformation to the entire matrix T. * CALL DLARFX( 'Left', 3, N-J1+1, DTRAF, TAU1, T( J1, J1 ), LDT, $ WORK ) CALL DLARFX( 'Right', J4, 3, DTRAF, TAU1, T( 1, J1 ), LDT, $ WORK ) CALL DLARFX( 'Left', 3, N-J1+1, DTRAF( 4 ), TAU2, T( J2, J1 ), $ LDT, WORK ) CALL DLARFX( 'Right', J4, 3, DTRAF( 4 ), TAU2, T( 1, J2 ), LDT, $ WORK ) * T( J3, J1 ) = ZERO T( J3, J2 ) = ZERO T( J4, J1 ) = ZERO T( J4, J2 ) = ZERO * ITRAF( 1 ) = N + J1 ITRAF( 2 ) = N + J2 LI = 3 DTRAF( 1 ) = TAU1 DTRAF( 4 ) = TAU2 LD = 7 GO TO 40 * 40 CONTINUE * IF( N2.EQ.2 ) THEN * * Standardize new 2-by-2 block T11 * CALL DLANV2( T( J1, J1 ), T( J1, J2 ), T( J2, J1 ), $ T( J2, J2 ), WR1, WI1, WR2, WI2, CS, SN ) CALL DROT( N-J1-1, T( J1, J1+2 ), LDT, T( J2, J1+2 ), LDT, $ CS, SN ) CALL DROT( J1-1, T( 1, J1 ), 1, T( 1, J2 ), 1, CS, SN ) ITRAF( LI ) = J1 LI = LI + 1 DTRAF( LD ) = CS DTRAF( LD+1 ) = SN LD = LD + 2 END IF * IF( N1.EQ.2 ) THEN * * Standardize new 2-by-2 block T22 * J3 = J1 + N2 J4 = J3 + 1 CALL DLANV2( T( J3, J3 ), T( J3, J4 ), T( J4, J3 ), $ T( J4, J4 ), WR1, WI1, WR2, WI2, CS, SN ) IF( J3+2.LE.N ) $ CALL DROT( N-J3-1, T( J3, J3+2 ), LDT, T( J4, J3+2 ), $ LDT, CS, SN ) CALL DROT( J3-1, T( 1, J3 ), 1, T( 1, J4 ), 1, CS, SN ) ITRAF( LI ) = J3 DTRAF( LD ) = CS DTRAF( LD+1 ) = SN END IF * END IF RETURN * * Exit with INFO = 1 if swap was rejected. * 50 CONTINUE INFO = 1 RETURN * * End of BDLAEXC * END scalapack-2.0.2/SRC/bdtrexc.f000644 000766 000024 00000044221 11654025546 016155 0ustar00juliestaff000000 000000 SUBROUTINE BDTREXC( N, T, LDT, IFST, ILST, NITRAF, ITRAF, $ NDTRAF, DTRAF, WORK, INFO ) IMPLICIT NONE * * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. INTEGER IFST, ILST, INFO, LDT, N, NDTRAF, NITRAF * .. * .. Array Arguments .. INTEGER ITRAF( * ) DOUBLE PRECISION DTRAF( * ), T( LDT, * ), WORK( * ) * .. * * Purpose * ======= * * BDTREXC reorders the real Schur factorization of a real matrix * A = Q*T*Q**T, so that the diagonal block of T with row index IFST is * moved to row ILST. * * The real Schur form T is reordered by an orthogonal similarity * transformation Z**T*T*Z. In contrast to the LAPACK routine DTREXC, * the orthogonal matrix Z is not explicitly constructed but * represented by paramaters contained in the arrays ITRAF and DTRAF, * see further details. * * T must be in Schur canonical form (as returned by DHSEQR), that is, * block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each * 2-by-2 diagonal block has its diagonal elements equal and its * off-diagonal elements of opposite sign. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix T. N >= 0. * * T (input/output) DOUBLE PRECISION array, dimension (LDT,N) * On entry, the upper quasi-triangular matrix T, in Schur * Schur canonical form. * On exit, the reordered upper quasi-triangular matrix, again * in Schur canonical form. * * LDT (input) INTEGER * The leading dimension of the array T. LDT >= max(1,N). * * IFST (input/output) INTEGER * ILST (input/output) INTEGER * Specify the reordering of the diagonal blocks of T. * The block with row index IFST is moved to row ILST, by a * sequence of transpositions between adjacent blocks. * On exit, if IFST pointed on entry to the second row of a * 2-by-2 block, it is changed to point to the first row; ILST * always points to the first row of the block in its final * position (which may differ from its input value by +1 or -1). * 1 <= IFST <= N; 1 <= ILST <= N. * * NITRAF (input/output) INTEGER * On entry, length of the array ITRAF. * As a minimum requirement, NITRAF >= max(1,|ILST-IFST|). * If there are 2-by-2 blocks in T then NITRAF must be larger; * a safe choice is NITRAF >= max(1,2*|ILST-IFST|). * On exit, actual length of the array ITRAF. * * ITRAF (output) INTEGER array, length NITRAF * List of parameters for representing the transformation * matrix Z, see further details. * * NDTRAF (input/output) INTEGER * On entry, length of the array DTRAF. * As a minimum requirement, NDTRAF >= max(1,2*|ILST-IFST|). * If there are 2-by-2 blocks in T then NDTRAF must be larger; * a safe choice is NDTRAF >= max(1,5*|ILST-IFST|). * On exit, actual length of the array DTRAF. * * DTRAF (output) DOUBLE PRECISION array, length NDTRAF * List of parameters for representing the transformation * matrix Z, see further details. * * WORK (workspace) DOUBLE PRECISION array, dimension (N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * = 1: two adjacent blocks were too close to swap (the problem * is very ill-conditioned); T may have been partially * reordered, and ILST points to the first row of the * current position of the block being moved. * = 2: the 2 by 2 block to be reordered split into two 1 by 1 * blocks and the second block failed to swap with an * adjacent block. ILST points to the first row of the * current position of the whole block being moved. * * Further Details * =============== * * The orthogonal transformation matrix Z is a product of NITRAF * elementary orthogonal transformations. The parameters defining these * transformations are stored in the arrays ITRAF and DTRAF as follows: * * Consider the i-th transformation acting on rows/columns POS, * POS+1, ... If this transformation is * * (1) a Givens rotation with cosine C and sine S then * * ITRAF(i) = POS, * DTRAF(j) = C, DTRAF(j+1) = S; * * (2) a Householder reflector H = I - tau * v * v' with * v = [ 1; v2; v3 ] then * * ITRAF(i) = N + POS, * DTRAF(j) = tau, DTRAF(j+1) = v2, DTRAF(j+2) = v3; * * (3) a Householder reflector H = I - tau * v * v' with * v = [ v1; v2; 1 ] then * * ITRAF(i) = 2*N + POS, * DTRAF(j) = v1, DTRAF(j+1) = v2, DTRAF(j+2) = tau; * * Note that the parameters in DTRAF are stored consecutively. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER DLNGTH(3), ILNGTH(3) * .. * .. Local Scalars .. INTEGER CDTRAF, CITRAF, LDTRAF, LITRAF, HERE, I, NBF, $ NBL, NBNEXT * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL BDLAEXC, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Data Statements .. c DATA ( ILNGTH(I), I = 1, 3 ) / 1, 2, 4 / c DATA ( DLNGTH(I), I = 1, 3 ) / 2, 5, 10 / DATA ILNGTH(1)/1/, ILNGTH(2)/2/, ILNGTH(3)/4/ DATA DLNGTH(1)/2/, DLNGTH(2)/5/, DLNGTH(3)/10/ * .. * .. Executable Statements .. * * Decode and test the input arguments. * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN INFO = -3 ELSE IF( IFST.LT.1 .OR. IFST.GT.N ) THEN INFO = -4 ELSE IF( ILST.LT.1 .OR. ILST.GT.N ) THEN INFO = -5 ELSE IF ( NITRAF.LT.MAX( 1, ABS( ILST-IFST ) ) ) THEN INFO = -6 ELSE IF ( NDTRAF.LT.MAX( 1, 2*ABS( ILST-IFST ) ) ) THEN INFO = -8 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DTREXC', -INFO ) RETURN END IF * * Quick return if possible * IF( N.LE.1 ) $ RETURN CITRAF = 1 CDTRAF = 1 * * Determine the first row of specified block * and find out it is 1 by 1 or 2 by 2. * IF( IFST.GT.1 ) THEN IF( T( IFST, IFST-1 ).NE.ZERO ) $ IFST = IFST - 1 END IF NBF = 1 IF( IFST.LT.N ) THEN IF( T( IFST+1, IFST ).NE.ZERO ) $ NBF = 2 END IF * * Determine the first row of the final block * and find out it is 1 by 1 or 2 by 2. * IF( ILST.GT.1 ) THEN IF( T( ILST, ILST-1 ).NE.ZERO ) $ ILST = ILST - 1 END IF NBL = 1 IF( ILST.LT.N ) THEN IF( T( ILST+1, ILST ).NE.ZERO ) $ NBL = 2 END IF * IF( IFST.EQ.ILST ) $ RETURN * IF( IFST.LT.ILST ) THEN * * Update ILST * IF( NBF.EQ.2 .AND. NBL.EQ.1 ) $ ILST = ILST - 1 IF( NBF.EQ.1 .AND. NBL.EQ.2 ) $ ILST = ILST + 1 * HERE = IFST * 10 CONTINUE * * Swap block with next one below * IF( NBF.EQ.1 .OR. NBF.EQ.2 ) THEN * * Current block either 1 by 1 or 2 by 2 * NBNEXT = 1 IF( HERE+NBF+1.LE.N ) THEN IF( T( HERE+NBF+1, HERE+NBF ).NE.ZERO ) $ NBNEXT = 2 END IF * LITRAF = ILNGTH(NBF+NBNEXT-1) LDTRAF = DLNGTH(NBF+NBNEXT-1) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE, NBF, NBNEXT, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE + NBNEXT * * Test if 2 by 2 block breaks into two 1 by 1 blocks * IF( NBF.EQ.2 ) THEN IF( T( HERE+1, HERE ).EQ.ZERO ) $ NBF = 3 END IF * ELSE * * Current block consists of two 1 by 1 blocks each of which * must be swapped individually * NBNEXT = 1 IF( HERE+3.LE.N ) THEN IF( T( HERE+3, HERE+2 ).NE.ZERO ) $ NBNEXT = 2 END IF LITRAF = ILNGTH(NBNEXT) LDTRAF = DLNGTH(NBNEXT) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE+1, 1, NBNEXT, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF * IF( NBNEXT.EQ.1 ) THEN * * Swap two 1 by 1 blocks, no problems possible * LITRAF = ILNGTH(1) LDTRAF = DLNGTH(1) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE, 1, NBNEXT, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE + 1 ELSE * * Recompute NBNEXT in case 2 by 2 split * IF( T( HERE+2, HERE+1 ).EQ.ZERO ) $ NBNEXT = 1 IF( NBNEXT.EQ.2 ) THEN * * 2 by 2 Block did not split * LITRAF = ILNGTH(2) LDTRAF = DLNGTH(2) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE, 1, NBNEXT, $ ITRAF(CITRAF), DTRAF(CDTRAF), WORK, $ INFO ) IF( INFO.NE.0 ) THEN INFO = 2 ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE + 2 ELSE * * 2 by 2 Block did split * LITRAF = ILNGTH(1) LDTRAF = DLNGTH(1) IF( CITRAF+2*LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+2*LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE, 1, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF CALL BDLAEXC( N, T, LDT, HERE+1, 1, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE + 2 END IF END IF END IF IF( HERE.LT.ILST ) $ GO TO 10 * ELSE * HERE = IFST 20 CONTINUE * * Swap block with next one above * IF( NBF.EQ.1 .OR. NBF.EQ.2 ) THEN * * Current block either 1 by 1 or 2 by 2 * NBNEXT = 1 IF( HERE.GE.3 ) THEN IF( T( HERE-1, HERE-2 ).NE.ZERO ) $ NBNEXT = 2 END IF * LITRAF = ILNGTH(NBF+NBNEXT-1) LDTRAF = DLNGTH(NBF+NBNEXT-1) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE-NBNEXT, NBNEXT, NBF, $ ITRAF(CITRAF), DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE - NBNEXT * * Test if 2 by 2 block breaks into two 1 by 1 blocks * IF( NBF.EQ.2 ) THEN IF( T( HERE+1, HERE ).EQ.ZERO ) $ NBF = 3 END IF * ELSE * * Current block consists of two 1 by 1 blocks each of which * must be swapped individually * NBNEXT = 1 IF( HERE.GE.3 ) THEN IF( T( HERE-1, HERE-2 ).NE.ZERO ) $ NBNEXT = 2 END IF LITRAF = ILNGTH(NBNEXT) LDTRAF = DLNGTH(NBNEXT) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE-NBNEXT, NBNEXT, 1, $ ITRAF(CITRAF), DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF * IF( NBNEXT.EQ.1 ) THEN * * Swap two 1 by 1 blocks, no problems possible * LITRAF = ILNGTH(1) LDTRAF = DLNGTH(1) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE, NBNEXT, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE - 1 ELSE * * Recompute NBNEXT in case 2 by 2 split * IF( T( HERE, HERE-1 ).EQ.ZERO ) $ NBNEXT = 1 IF( NBNEXT.EQ.2 ) THEN * * 2 by 2 Block did not split * LITRAF = ILNGTH(2) LDTRAF = DLNGTH(2) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE-1, 2, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN INFO = 2 ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE - 2 ELSE * * 2 by 2 Block did split * LITRAF = ILNGTH(1) LDTRAF = DLNGTH(1) IF( CITRAF+2*LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF IF( CDTRAF+2*LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BDTREXC', -INFO ) RETURN END IF CALL BDLAEXC( N, T, LDT, HERE, 1, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF CALL BDLAEXC( N, T, LDT, HERE-1, 1, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE - 2 END IF END IF END IF IF( HERE.GT.ILST ) $ GO TO 20 END IF ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 * RETURN * * End of BDTREXC * END scalapack-2.0.2/SRC/bslaapp.f000644 000766 000024 00000011443 11654534541 016144 0ustar00juliestaff000000 000000 SUBROUTINE BSLAAPP( ISIDE, M, N, NB, A, LDA, NITRAF, ITRAF, $ DTRAF, WORK ) IMPLICIT NONE * * .. Scalar Arguments .. INTEGER ISIDE, LDA, M, N, NB, NITRAF * .. * .. Array Arguments .. INTEGER ITRAF( * ) REAL A( LDA, * ), DTRAF( * ), WORK( * ) * * * Purpose * ======= * * BSLAAPP computes * * B = Q**T * A or B = A * Q, * * where A is an M-by-N matrix and Q is an orthogonal matrix represented * by the parameters in the arrays ITRAF and DTRAF as described in * BSTREXC. * * This is an auxiliary routine called by BDTRSEN. * * Arguments * ========= * * ISIDE (input) INTEGER * Specifies whether Q multiplies A from the left or right as * follows: * = 0: compute B = Q**T * A; * = 1: compute B = A * Q. * * M (input) INTEGER * The number of rows of A. * * N (input) INTEGER * The number of columns of A. * * NB (input) INTEGER * If ISIDE = 0, the Q is applied block column-wise to the rows * of A and NB specifies the maximal width of the block columns. * If ISIDE = 1, this variable is not referenced. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the matrix A. * On exit, A is overwritten by B. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * NITRAF (input) INTEGER * Length of the array ITRAF. NITRAF >= 0. * * ITRAF (input) INTEGER array, length NITRAF * List of parameters for representing the transformation * matrix Q, see BSTREXC. * * DTRAF (output) REAL array, length k, where * List of parameters for representing the transformation * matrix Q, see BSTREXC. * * WORK (workspace) REAL array, dimension (N) * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IT, J, NNB, PD REAL TAU * .. * .. External Subroutines .. EXTERNAL SLARFX, SROT * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Quick return if possible. * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * IF( ISIDE.EQ.0 ) THEN * * Apply Q from left. * DO 20 J = 1, N, NB PD = 1 NNB = MIN( NB, N - J + 1 ) DO 10 I = 1, NITRAF IT = ITRAF(I) IF( IT.LE.M ) THEN * * Apply Givens rotation. * CALL SROT( NNB, A(IT,J), LDA, A(IT+1,J), LDA, $ DTRAF(PD), DTRAF(PD+1) ) PD = PD + 2 ELSE IF( IT.LE.2*M ) THEN * * Apply Householder reflector of first kind. * TAU = DTRAF(PD) DTRAF(PD) = ONE CALL SLARFX( 'Left', 3, NNB, DTRAF(PD), TAU, $ A(IT-M,J), LDA, WORK ) DTRAF(PD) = TAU PD = PD + 3 ELSE * * Apply Householder reflector of second kind. * TAU = DTRAF(PD+2) DTRAF(PD+2) = ONE CALL SLARFX( 'Left', 3, NNB, DTRAF(PD), TAU, $ A(IT-2*M,J), LDA, WORK ) DTRAF(PD+2) = TAU PD = PD + 3 END IF 10 CONTINUE 20 CONTINUE ELSE PD = 1 DO 30 I = 1, NITRAF IT = ITRAF(I) IF( IT.LE.N ) THEN * * Apply Givens rotation. * CALL SROT( M, A(1,IT), 1, A(1,IT+1), 1, DTRAF(PD), $ DTRAF(PD+1) ) PD = PD + 2 ELSE IF( IT.LE.2*N ) THEN * * Apply Householder reflector of first kind. * TAU = DTRAF(PD) DTRAF(PD) = ONE CALL SLARFX( 'Right', M, 3, DTRAF(PD), TAU, A(1,IT-N), $ LDA, WORK ) DTRAF(PD) = TAU PD = PD + 3 ELSE * * Apply Householder reflector of second kind. * TAU = DTRAF(PD+2) DTRAF(PD+2) = ONE CALL SLARFX( 'Right', M, 3, DTRAF(PD), TAU, A(1,IT-2*N), $ LDA, WORK ) DTRAF(PD+2) = TAU PD = PD + 3 END IF 30 CONTINUE END IF * RETURN * * End of BSLAAPP * END scalapack-2.0.2/SRC/bslaexc.f000644 000766 000024 00000025544 11750130340 016134 0ustar00juliestaff000000 000000 SUBROUTINE BSLAEXC( N, T, LDT, J1, N1, N2, ITRAF, DTRAF, WORK, $ INFO ) IMPLICIT NONE * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER INFO, J1, LDT, N, N1, N2 * .. * .. Array Arguments .. INTEGER ITRAF( * ) REAL DTRAF( * ), T( LDT, * ), WORK( * ) * .. * * Purpose * ======= * * BSLAEXC swaps adjacent diagonal blocks T11 and T22 of order 1 or 2 in * an upper quasi-triangular matrix T by an orthogonal similarity * transformation. * * In contrast to the LAPACK routine DLAEXC, the orthogonal * transformation matrix Q is not explicitly constructed but * represented by paramaters contained in the arrays ITRAF and DTRAF, * see the description of BSTREXC for more details. * * T must be in Schur canonical form, that is, block upper triangular * with 1-by-1 and 2-by-2 diagonal blocks; each 2-by-2 diagonal block * has its diagonal elemnts equal and its off-diagonal elements of * opposite sign. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix T. N >= 0. * * T (input/output) REAL array, dimension (LDT,N) * On entry, the upper quasi-triangular matrix T, in Schur * canonical form. * On exit, the updated matrix T, again in Schur canonical form. * * LDT (input) INTEGER * The leading dimension of the array T. LDT >= max(1,N). * * J1 (input) INTEGER * The index of the first row of the first block T11. * * N1 (input) INTEGER * The order of the first block T11. N1 = 0, 1 or 2. * * N2 (input) INTEGER * The order of the second block T22. N2 = 0, 1 or 2. * * ITRAF (output) INTEGER array, length k, where * k = 1, if N1+N2 = 2; * k = 2, if N1+N2 = 3; * k = 4, if N1+N2 = 4. * List of parameters for representing the transformation * matrix Q, see BSTREXC. * * DTRAF (output) REAL array, length k, where * k = 2, if N1+N2 = 2; * k = 5, if N1+N2 = 3; * k = 10, if N1+N2 = 4. * List of parameters for representing the transformation * matrix Q, see BSTREXC. * * WORK (workspace) REAL array, dimension (N) * * INFO (output) INTEGER * = 0: successful exit * = 1: the transformed matrix T would be too far from Schur * form; the blocks are not swapped and T and Q are * unchanged. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) REAL TEN PARAMETER ( TEN = 10.0 ) INTEGER LDD, LDX PARAMETER ( LDD = 4, LDX = 2 ) * .. * .. Local Scalars .. INTEGER IERR, J2, J3, J4, K, LD, LI, ND REAL CS, DNORM, EPS, SCALE, SMLNUM, SN, T11, T22, $ T33, TAU, TAU1, TAU2, TEMP, THRESH, WI1, WI2, $ WR1, WR2, XNORM * .. * .. Local Arrays .. REAL D( LDD, 4 ), X( LDX, 2 ) * .. * .. External Functions .. REAL SLAMCH, SLANGE EXTERNAL SLAMCH, SLANGE * .. * .. External Subroutines .. EXTERNAL SLAMOV, SLANV2, SLARFG, SLARFX, SLARTG, SLASY2, $ SROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( N.EQ.0 .OR. N1.EQ.0 .OR. N2.EQ.0 ) $ RETURN IF( J1+N1.GT.N ) $ RETURN * J2 = J1 + 1 J3 = J1 + 2 J4 = J1 + 3 * IF( N1.EQ.1 .AND. N2.EQ.1 ) THEN * * Swap two 1-by-1 blocks. * T11 = T( J1, J1 ) T22 = T( J2, J2 ) * * Determine the transformation to perform the interchange. * CALL SLARTG( T( J1, J2 ), T22-T11, CS, SN, TEMP ) * * Apply transformation to the matrix T. * IF( J3.LE.N ) $ CALL SROT( N-J1-1, T( J1, J3 ), LDT, T( J2, J3 ), LDT, CS, $ SN ) CALL SROT( J1-1, T( 1, J1 ), 1, T( 1, J2 ), 1, CS, SN ) * T( J1, J1 ) = T22 T( J2, J2 ) = T11 * ITRAF( 1 ) = J1 DTRAF( 1 ) = CS DTRAF( 2 ) = SN * ELSE * * Swapping involves at least one 2-by-2 block. * * Copy the diagonal block of order N1+N2 to the local array D * and compute its norm. * ND = N1 + N2 CALL SLAMOV( 'Full', ND, ND, T( J1, J1 ), LDT, D, LDD ) DNORM = SLANGE( 'Max', ND, ND, D, LDD, WORK ) * * Compute machine-dependent threshold for test for accepting * swap. * EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) / EPS THRESH = MAX( TEN*EPS*DNORM, SMLNUM ) * * Solve T11*X - X*T22 = scale*T12 for X. * CALL SLASY2( .FALSE., .FALSE., -1, N1, N2, D, LDD, $ D( N1+1, N1+1 ), LDD, D( 1, N1+1 ), LDD, SCALE, X, $ LDX, XNORM, IERR ) * * Swap the adjacent diagonal blocks. * K = N1 + N1 + N2 - 3 GO TO ( 10, 20, 30 )K * 10 CONTINUE * * N1 = 1, N2 = 2: generate elementary reflector H so that: * * ( scale, X11, X12 ) H = ( 0, 0, * ) * DTRAF( 1 ) = SCALE DTRAF( 2 ) = X( 1, 1 ) DTRAF( 3 ) = X( 1, 2 ) CALL SLARFG( 3, DTRAF( 3 ), DTRAF, 1, TAU ) DTRAF( 3 ) = ONE T11 = T( J1, J1 ) * * Perform swap provisionally on diagonal block in D. * CALL SLARFX( 'Left', 3, 3, DTRAF, TAU, D, LDD, WORK ) CALL SLARFX( 'Right', 3, 3, DTRAF, TAU, D, LDD, WORK ) * * Test whether to reject swap. * IF( MAX( ABS( D( 3, 1 ) ), ABS( D( 3, 2 ) ), ABS( D( 3, $ 3 )-T11 ) ).GT.THRESH )GO TO 50 * * Accept swap: apply transformation to the entire matrix T. * CALL SLARFX( 'Left', 3, N-J1+1, DTRAF, TAU, T( J1, J1 ), LDT, $ WORK ) CALL SLARFX( 'Right', J2, 3, DTRAF, TAU, T( 1, J1 ), LDT, $ WORK ) * T( J3, J1 ) = ZERO T( J3, J2 ) = ZERO T( J3, J3 ) = T11 * ITRAF( 1 ) = 2*N + J1 LI = 2 DTRAF( 3 ) = TAU LD = 4 GO TO 40 * 20 CONTINUE * * N1 = 2, N2 = 1: generate elementary reflector H so that: * * H ( -X11 ) = ( * ) * ( -X21 ) = ( 0 ) * ( scale ) = ( 0 ) * DTRAF( 1 ) = -X( 1, 1 ) DTRAF( 2 ) = -X( 2, 1 ) DTRAF( 3 ) = SCALE CALL SLARFG( 3, DTRAF( 1 ), DTRAF( 2 ), 1, TAU ) DTRAF( 1 ) = ONE T33 = T( J3, J3 ) * * Perform swap provisionally on diagonal block in D. * CALL SLARFX( 'Left', 3, 3, DTRAF, TAU, D, LDD, WORK ) CALL SLARFX( 'Right', 3, 3, DTRAF, TAU, D, LDD, WORK ) * * Test whether to reject swap. * IF( MAX( ABS( D( 2, 1 ) ), ABS( D( 3, 1 ) ), ABS( D( 1, $ 1 )-T33 ) ).GT.THRESH )GO TO 50 * * Accept swap: apply transformation to the entire matrix T. * CALL SLARFX( 'Right', J3, 3, DTRAF, TAU, T( 1, J1 ), LDT, $ WORK ) CALL SLARFX( 'Left', 3, N-J1, DTRAF, TAU, T( J1, J2 ), LDT, $ WORK ) * T( J1, J1 ) = T33 T( J2, J1 ) = ZERO T( J3, J1 ) = ZERO * ITRAF( 1 ) = N + J1 LI = 2 DTRAF( 1 ) = TAU LD = 4 GO TO 40 * 30 CONTINUE * * N1 = 2, N2 = 2: generate elementary reflectors H(1) and H(2) so * that: * * H(2) H(1) ( -X11 -X12 ) = ( * * ) * ( -X21 -X22 ) ( 0 * ) * ( scale 0 ) ( 0 0 ) * ( 0 scale ) ( 0 0 ) * DTRAF( 1 ) = -X( 1, 1 ) DTRAF( 2 ) = -X( 2, 1 ) DTRAF( 3 ) = SCALE CALL SLARFG( 3, DTRAF( 1 ), DTRAF( 2 ), 1, TAU1 ) DTRAF( 1 ) = ONE * TEMP = -TAU1*( X( 1, 2 )+DTRAF( 2 )*X( 2, 2 ) ) DTRAF( 4 ) = -TEMP*DTRAF( 2 ) - X( 2, 2 ) DTRAF( 5 ) = -TEMP*DTRAF( 3 ) DTRAF( 6 ) = SCALE CALL SLARFG( 3, DTRAF( 4 ), DTRAF( 5 ), 1, TAU2 ) DTRAF( 4 ) = ONE * * Perform swap provisionally on diagonal block in D. * CALL SLARFX( 'Left', 3, 4, DTRAF, TAU1, D, LDD, WORK ) CALL SLARFX( 'Right', 4, 3, DTRAF, TAU1, D, LDD, WORK ) CALL SLARFX( 'Left', 3, 4, DTRAF( 4 ), TAU2, D( 2, 1 ), LDD, $ WORK ) CALL SLARFX( 'Right', 4, 3, DTRAF( 4 ), TAU2, D( 1, 2 ), LDD, $ WORK ) * * Test whether to reject swap. * IF( MAX( ABS( D( 3, 1 ) ), ABS( D( 3, 2 ) ), ABS( D( 4, 1 ) ), $ ABS( D( 4, 2 ) ) ).GT.THRESH )GO TO 50 * * Accept swap: apply transformation to the entire matrix T. * CALL SLARFX( 'Left', 3, N-J1+1, DTRAF, TAU1, T( J1, J1 ), LDT, $ WORK ) CALL SLARFX( 'Right', J4, 3, DTRAF, TAU1, T( 1, J1 ), LDT, $ WORK ) CALL SLARFX( 'Left', 3, N-J1+1, DTRAF( 4 ), TAU2, T( J2, J1 ), $ LDT, WORK ) CALL SLARFX( 'Right', J4, 3, DTRAF( 4 ), TAU2, T( 1, J2 ), LDT, $ WORK ) * T( J3, J1 ) = ZERO T( J3, J2 ) = ZERO T( J4, J1 ) = ZERO T( J4, J2 ) = ZERO * ITRAF( 1 ) = N + J1 ITRAF( 2 ) = N + J2 LI = 3 DTRAF( 1 ) = TAU1 DTRAF( 4 ) = TAU2 LD = 7 GO TO 40 * 40 CONTINUE * IF( N2.EQ.2 ) THEN * * Standardize new 2-by-2 block T11 * CALL SLANV2( T( J1, J1 ), T( J1, J2 ), T( J2, J1 ), $ T( J2, J2 ), WR1, WI1, WR2, WI2, CS, SN ) CALL SROT( N-J1-1, T( J1, J1+2 ), LDT, T( J2, J1+2 ), LDT, $ CS, SN ) CALL SROT( J1-1, T( 1, J1 ), 1, T( 1, J2 ), 1, CS, SN ) ITRAF( LI ) = J1 LI = LI + 1 DTRAF( LD ) = CS DTRAF( LD+1 ) = SN LD = LD + 2 END IF * IF( N1.EQ.2 ) THEN * * Standardize new 2-by-2 block T22 * J3 = J1 + N2 J4 = J3 + 1 CALL SLANV2( T( J3, J3 ), T( J3, J4 ), T( J4, J3 ), $ T( J4, J4 ), WR1, WI1, WR2, WI2, CS, SN ) IF( J3+2.LE.N ) $ CALL SROT( N-J3-1, T( J3, J3+2 ), LDT, T( J4, J3+2 ), $ LDT, CS, SN ) CALL SROT( J3-1, T( 1, J3 ), 1, T( 1, J4 ), 1, CS, SN ) ITRAF( LI ) = J3 DTRAF( LD ) = CS DTRAF( LD+1 ) = SN END IF * END IF RETURN * * Exit with INFO = 1 if swap was rejected. * 50 CONTINUE INFO = 1 RETURN * * End of BSLAEXC * END scalapack-2.0.2/SRC/bstrexc.f000644 000766 000024 00000044221 11654534541 016174 0ustar00juliestaff000000 000000 SUBROUTINE BSTREXC( N, T, LDT, IFST, ILST, NITRAF, ITRAF, $ NDTRAF, DTRAF, WORK, INFO ) IMPLICIT NONE * * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. INTEGER IFST, ILST, INFO, LDT, N, NDTRAF, NITRAF * .. * .. Array Arguments .. INTEGER ITRAF( * ) REAL DTRAF( * ), T( LDT, * ), WORK( * ) * .. * * Purpose * ======= * * BSTREXC reorders the real Schur factorization of a real matrix * A = Q*T*Q**T, so that the diagonal block of T with row index IFST is * moved to row ILST. * * The real Schur form T is reordered by an orthogonal similarity * transformation Z**T*T*Z. In contrast to the LAPACK routine DTREXC, * the orthogonal matrix Z is not explicitly constructed but * represented by paramaters contained in the arrays ITRAF and DTRAF, * see further details. * * T must be in Schur canonical form (as returned by DHSEQR), that is, * block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each * 2-by-2 diagonal block has its diagonal elements equal and its * off-diagonal elements of opposite sign. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix T. N >= 0. * * T (input/output) REAL array, dimension (LDT,N) * On entry, the upper quasi-triangular matrix T, in Schur * Schur canonical form. * On exit, the reordered upper quasi-triangular matrix, again * in Schur canonical form. * * LDT (input) INTEGER * The leading dimension of the array T. LDT >= max(1,N). * * IFST (input/output) INTEGER * ILST (input/output) INTEGER * Specify the reordering of the diagonal blocks of T. * The block with row index IFST is moved to row ILST, by a * sequence of transpositions between adjacent blocks. * On exit, if IFST pointed on entry to the second row of a * 2-by-2 block, it is changed to point to the first row; ILST * always points to the first row of the block in its final * position (which may differ from its input value by +1 or -1). * 1 <= IFST <= N; 1 <= ILST <= N. * * NITRAF (input/output) INTEGER * On entry, length of the array ITRAF. * As a minimum requirement, NITRAF >= max(1,|ILST-IFST|). * If there are 2-by-2 blocks in T then NITRAF must be larger; * a safe choice is NITRAF >= max(1,2*|ILST-IFST|). * On exit, actual length of the array ITRAF. * * ITRAF (output) INTEGER array, length NITRAF * List of parameters for representing the transformation * matrix Z, see further details. * * NDTRAF (input/output) INTEGER * On entry, length of the array DTRAF. * As a minimum requirement, NDTRAF >= max(1,2*|ILST-IFST|). * If there are 2-by-2 blocks in T then NDTRAF must be larger; * a safe choice is NDTRAF >= max(1,5*|ILST-IFST|). * On exit, actual length of the array DTRAF. * * DTRAF (output) REAL array, length NDTRAF * List of parameters for representing the transformation * matrix Z, see further details. * * WORK (workspace) REAL array, dimension (N) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * = 1: two adjacent blocks were too close to swap (the problem * is very ill-conditioned); T may have been partially * reordered, and ILST points to the first row of the * current position of the block being moved. * = 2: the 2 by 2 block to be reordered split into two 1 by 1 * blocks and the second block failed to swap with an * adjacent block. ILST points to the first row of the * current position of the whole block being moved. * * Further Details * =============== * * The orthogonal transformation matrix Z is a product of NITRAF * elementary orthogonal transformations. The parameters defining these * transformations are stored in the arrays ITRAF and DTRAF as follows: * * Consider the i-th transformation acting on rows/columns POS, * POS+1, ... If this transformation is * * (1) a Givens rotation with cosine C and sine S then * * ITRAF(i) = POS, * DTRAF(j) = C, DTRAF(j+1) = S; * * (2) a Householder reflector H = I - tau * v * v' with * v = [ 1; v2; v3 ] then * * ITRAF(i) = N + POS, * DTRAF(j) = tau, DTRAF(j+1) = v2, DTRAF(j+2) = v3; * * (3) a Householder reflector H = I - tau * v * v' with * v = [ v1; v2; 1 ] then * * ITRAF(i) = 2*N + POS, * DTRAF(j) = v1, DTRAF(j+1) = v2, DTRAF(j+2) = tau; * * Note that the parameters in DTRAF are stored consecutively. * * ===================================================================== * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER DLNGTH(3), ILNGTH(3) * .. * .. Local Scalars .. INTEGER CDTRAF, CITRAF, LDTRAF, LITRAF, HERE, I, NBF, $ NBL, NBNEXT * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL BSLAEXC, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Data Statements .. c DATA ( ILNGTH(I), I = 1, 3 ) / 1, 2, 4 / c DATA ( DLNGTH(I), I = 1, 3 ) / 2, 5, 10 / DATA ILNGTH(1)/1/, ILNGTH(2)/2/, ILNGTH(3)/4/ DATA DLNGTH(1)/2/, DLNGTH(2)/5/, DLNGTH(3)/10/ * .. * .. Executable Statements .. * * Decode and test the input arguments. * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN INFO = -3 ELSE IF( IFST.LT.1 .OR. IFST.GT.N ) THEN INFO = -4 ELSE IF( ILST.LT.1 .OR. ILST.GT.N ) THEN INFO = -5 ELSE IF ( NITRAF.LT.MAX( 1, ABS( ILST-IFST ) ) ) THEN INFO = -6 ELSE IF ( NDTRAF.LT.MAX( 1, 2*ABS( ILST-IFST ) ) ) THEN INFO = -8 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DTREXC', -INFO ) RETURN END IF * * Quick return if possible * IF( N.LE.1 ) $ RETURN CITRAF = 1 CDTRAF = 1 * * Determine the first row of specified block * and find out it is 1 by 1 or 2 by 2. * IF( IFST.GT.1 ) THEN IF( T( IFST, IFST-1 ).NE.ZERO ) $ IFST = IFST - 1 END IF NBF = 1 IF( IFST.LT.N ) THEN IF( T( IFST+1, IFST ).NE.ZERO ) $ NBF = 2 END IF * * Determine the first row of the final block * and find out it is 1 by 1 or 2 by 2. * IF( ILST.GT.1 ) THEN IF( T( ILST, ILST-1 ).NE.ZERO ) $ ILST = ILST - 1 END IF NBL = 1 IF( ILST.LT.N ) THEN IF( T( ILST+1, ILST ).NE.ZERO ) $ NBL = 2 END IF * IF( IFST.EQ.ILST ) $ RETURN * IF( IFST.LT.ILST ) THEN * * Update ILST * IF( NBF.EQ.2 .AND. NBL.EQ.1 ) $ ILST = ILST - 1 IF( NBF.EQ.1 .AND. NBL.EQ.2 ) $ ILST = ILST + 1 * HERE = IFST * 10 CONTINUE * * Swap block with next one below * IF( NBF.EQ.1 .OR. NBF.EQ.2 ) THEN * * Current block either 1 by 1 or 2 by 2 * NBNEXT = 1 IF( HERE+NBF+1.LE.N ) THEN IF( T( HERE+NBF+1, HERE+NBF ).NE.ZERO ) $ NBNEXT = 2 END IF * LITRAF = ILNGTH(NBF+NBNEXT-1) LDTRAF = DLNGTH(NBF+NBNEXT-1) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE, NBF, NBNEXT, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE + NBNEXT * * Test if 2 by 2 block breaks into two 1 by 1 blocks * IF( NBF.EQ.2 ) THEN IF( T( HERE+1, HERE ).EQ.ZERO ) $ NBF = 3 END IF * ELSE * * Current block consists of two 1 by 1 blocks each of which * must be swapped individually * NBNEXT = 1 IF( HERE+3.LE.N ) THEN IF( T( HERE+3, HERE+2 ).NE.ZERO ) $ NBNEXT = 2 END IF LITRAF = ILNGTH(NBNEXT) LDTRAF = DLNGTH(NBNEXT) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE+1, 1, NBNEXT, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF * IF( NBNEXT.EQ.1 ) THEN * * Swap two 1 by 1 blocks, no problems possible * LITRAF = ILNGTH(1) LDTRAF = DLNGTH(1) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE, 1, NBNEXT, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE + 1 ELSE * * Recompute NBNEXT in case 2 by 2 split * IF( T( HERE+2, HERE+1 ).EQ.ZERO ) $ NBNEXT = 1 IF( NBNEXT.EQ.2 ) THEN * * 2 by 2 Block did not split * LITRAF = ILNGTH(2) LDTRAF = DLNGTH(2) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE, 1, NBNEXT, $ ITRAF(CITRAF), DTRAF(CDTRAF), WORK, $ INFO ) IF( INFO.NE.0 ) THEN INFO = 2 ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE + 2 ELSE * * 2 by 2 Block did split * LITRAF = ILNGTH(1) LDTRAF = DLNGTH(1) IF( CITRAF+2*LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+2*LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE, 1, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF CALL BSLAEXC( N, T, LDT, HERE+1, 1, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE + 2 END IF END IF END IF IF( HERE.LT.ILST ) $ GO TO 10 * ELSE * HERE = IFST 20 CONTINUE * * Swap block with next one above * IF( NBF.EQ.1 .OR. NBF.EQ.2 ) THEN * * Current block either 1 by 1 or 2 by 2 * NBNEXT = 1 IF( HERE.GE.3 ) THEN IF( T( HERE-1, HERE-2 ).NE.ZERO ) $ NBNEXT = 2 END IF * LITRAF = ILNGTH(NBF+NBNEXT-1) LDTRAF = DLNGTH(NBF+NBNEXT-1) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE-NBNEXT, NBNEXT, NBF, $ ITRAF(CITRAF), DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE - NBNEXT * * Test if 2 by 2 block breaks into two 1 by 1 blocks * IF( NBF.EQ.2 ) THEN IF( T( HERE+1, HERE ).EQ.ZERO ) $ NBF = 3 END IF * ELSE * * Current block consists of two 1 by 1 blocks each of which * must be swapped individually * NBNEXT = 1 IF( HERE.GE.3 ) THEN IF( T( HERE-1, HERE-2 ).NE.ZERO ) $ NBNEXT = 2 END IF LITRAF = ILNGTH(NBNEXT) LDTRAF = DLNGTH(NBNEXT) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE-NBNEXT, NBNEXT, 1, $ ITRAF(CITRAF), DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF * IF( NBNEXT.EQ.1 ) THEN * * Swap two 1 by 1 blocks, no problems possible * LITRAF = ILNGTH(1) LDTRAF = DLNGTH(1) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE, NBNEXT, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE - 1 ELSE * * Recompute NBNEXT in case 2 by 2 split * IF( T( HERE, HERE-1 ).EQ.ZERO ) $ NBNEXT = 1 IF( NBNEXT.EQ.2 ) THEN * * 2 by 2 Block did not split * LITRAF = ILNGTH(2) LDTRAF = DLNGTH(2) IF( CITRAF+LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE-1, 2, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) IF( INFO.NE.0 ) THEN INFO = 2 ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 RETURN END IF CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE - 2 ELSE * * 2 by 2 Block did split * LITRAF = ILNGTH(1) LDTRAF = DLNGTH(1) IF( CITRAF+2*LITRAF-1.GT.NITRAF ) THEN INFO = -6 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF IF( CDTRAF+2*LDTRAF-1.GT.NDTRAF ) THEN INFO = -8 CALL XERBLA( 'BSTREXC', -INFO ) RETURN END IF CALL BSLAEXC( N, T, LDT, HERE, 1, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF CALL BSLAEXC( N, T, LDT, HERE-1, 1, 1, ITRAF(CITRAF), $ DTRAF(CDTRAF), WORK, INFO ) CITRAF = CITRAF + LITRAF CDTRAF = CDTRAF + LDTRAF HERE = HERE - 2 END IF END IF END IF IF( HERE.GT.ILST ) $ GO TO 20 END IF ILST = HERE NITRAF = CITRAF - 1 NDTRAF = CDTRAF - 1 * RETURN * * End of BSTREXC * END scalapack-2.0.2/SRC/cdbtf2.f000644 000766 000024 00000012602 11657111056 015657 0ustar00juliestaff000000 000000 SUBROUTINE CDBTF2( M, N, KL, KU, AB, LDAB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Modified by Andrew J. Cleary in November, 96 from: * -- LAPACK auxiliary routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * August 6, 1991 * * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDAB, M, N * .. * .. Array Arguments .. COMPLEX AB( LDAB, * ) * .. * * Purpose * ======= * * Cdbtrf computes an LU factorization of a real m-by-n band matrix A * without using partial pivoting with row interchanges. * * This is the unblocked version of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of subdiagonals within the band of A. KL >= 0. * * KU (input) INTEGER * The number of superdiagonals within the band of A. KU >= 0. * * AB (input/output) COMPLEX array, dimension (LDAB,N) * On entry, the matrix A in band storage, in rows KL+1 to * 2*KL+KU+1; rows 1 to KL of the array need not be set. * The j-th column of A is stored in the j-th column of the * array AB as follows: * AB(kl+ku+1+i-j,j) = A(i,j) for max(1,j-ku)<=i<=min(m,j+kl) * * On exit, details of the factorization: U is stored as an * upper triangular band matrix with KL+KU superdiagonals in * rows 1 to KL+KU+1, and the multipliers used during the * factorization are stored in rows KL+KU+2 to 2*KL+KU+1. * See below for further details. * * LDAB (input) INTEGER * The leading dimension of the array AB. LDAB >= 2*KL+KU+1. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = +i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * Further Details * =============== * * The band storage scheme is illustrated by the following example, when * M = N = 6, KL = 2, KU = 1: * * On entry: On exit: * * * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 * a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 * a21 a32 a43 a54 a65 * m21 m32 m43 m54 m65 * * a31 a42 a53 a64 * * m31 m42 m53 m64 * * * * Array elements marked * are not used by the routine; elements marked * + need not be set on entry, but are required by the routine to store * elements of U, because of fill-in resulting from the row * interchanges. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER J, JP, JU, KM, KV * .. * .. External Functions .. INTEGER ISAMAX EXTERNAL ISAMAX * .. * .. External Subroutines .. EXTERNAL CGERU, CSCAL, CSWAP * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * KV is the number of superdiagonals in the factor U, allowing for * fill-in. * KV = KU * * Test the input parameters. * INFO = 0 *ECA IF( M.LT.0 ) THEN *ECA INFO = -1 *ECA ELSE IF( N.LT.0 ) THEN *ECA INFO = -2 *ECA ELSE IF( KL.LT.0 ) THEN *ECA INFO = -3 *ECA ELSE IF( KU.LT.0 ) THEN *ECA INFO = -4 *ECA ELSE IF( LDAB.LT.KL+KV+1 ) THEN *ECA INFO = -6 *ECA END IF *ECA IF( INFO.NE.0 ) THEN *ECA CALL XERBLA( 'CDBTF2', -INFO ) *ECA RETURN *ECA END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Gaussian elimination without partial pivoting * * JU is the index of the last column affected by the current stage * of the factorization. * JU = 1 * DO 40 J = 1, MIN( M, N ) * * Test for singularity. KM is the number of * subdiagonal elements in the current column. * KM = MIN( KL, M-J ) JP = 1 IF( AB( KV+1, J ).NE.ZERO ) THEN JU = MAX( JU, MIN( J+KU, N ) ) * IF( KM.GT.0 ) THEN * * Compute multipliers. * CALL CSCAL( KM, ONE / AB( KU+1, J ), AB( KU+2, J ), 1 ) * * Update trailing submatrix within the band. * IF( JU.GT.J ) THEN CALL CGERU( KM, JU-J, -CONE, AB( KU+2, J ), 1, $ AB( KU, J+1 ), LDAB-1, AB( KU+1, J+1 ), $ LDAB-1 ) END IF END IF ELSE * IF( INFO.EQ.0 ) $ INFO = J END IF 40 CONTINUE RETURN * * End of CDBTF2 * END scalapack-2.0.2/SRC/cdbtrf.f000644 000766 000024 00000025351 11657111056 015764 0ustar00juliestaff000000 000000 SUBROUTINE CDBTRF( M, N, KL, KU, AB, LDAB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * August, 1996. * Modified from CGBTRF: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * August 6, 1991 * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDAB, M, N * .. * .. Array Arguments .. COMPLEX AB( LDAB, * ) * .. * * Purpose * ======= * * Cdbtrf computes an LU factorization of a real m-by-n band matrix A * without using partial pivoting or row interchanges. * * This is the blocked version of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of subdiagonals within the band of A. KL >= 0. * * KU (input) INTEGER * The number of superdiagonals within the band of A. KU >= 0. * * AB (input/output) REAL array, dimension (LDAB,N) * On entry, the matrix A in band storage, in rows KL+1 to * 2*KL+KU+1; rows 1 to KL of the array need not be set. * The j-th column of A is stored in the j-th column of the * array AB as follows: * AB(kl+ku+1+i-j,j) = A(i,j) for max(1,j-ku)<=i<=min(m,j+kl) * * On exit, details of the factorization: U is stored as an * upper triangular band matrix with KL+KU superdiagonals in * rows 1 to KL+KU+1, and the multipliers used during the * factorization are stored in rows KL+KU+2 to 2*KL+KU+1. * See below for further details. * * LDAB (input) INTEGER * The leading dimension of the array AB. LDAB >= 2*KL+KU+1. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = +i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * Further Details * =============== * * The band storage scheme is illustrated by the following example, when * M = N = 6, KL = 2, KU = 1: * * On entry: On exit: * * * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 * a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 * a21 a32 a43 a54 a65 * m21 m32 m43 m54 m65 * * a31 a42 a53 a64 * * m31 m42 m53 m64 * * * * Array elements marked * are not used by the routine. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER NBMAX, LDWORK PARAMETER ( NBMAX = 64, LDWORK = NBMAX+1 ) * .. * .. Local Scalars .. INTEGER I, I2, I3, II, J, J2, J3, JB, JJ, JM, JP, $ JU, KM, KV, NB, NW * .. * .. Local Arrays .. COMPLEX WORK13( LDWORK, NBMAX ), $ WORK31( LDWORK, NBMAX ) * .. * .. External Functions .. INTEGER ILAENV, ISAMAX EXTERNAL ILAENV, ISAMAX * .. * .. External Subroutines .. EXTERNAL CCOPY, CDBTF2, CGEMM, CGERU, CSCAL, $ CSWAP, CTRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * KV is the number of superdiagonals in the factor U * KV = KU * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( KL.LT.0 ) THEN INFO = -3 ELSE IF( KU.LT.0 ) THEN INFO = -4 ELSE IF( LDAB.LT.MIN( MIN( KL+KV+1,M ),N ) ) THEN INFO = -6 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CDBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Determine the block size for this environment * NB = ILAENV( 1, 'CDBTRF', ' ', M, N, KL, KU ) * * The block size must not exceed the limit set by the size of the * local arrays WORK13 and WORK31. * NB = MIN( NB, NBMAX ) * IF( NB.LE.1 .OR. NB.GT.KL ) THEN * * Use unblocked code * CALL CDBTF2( M, N, KL, KU, AB, LDAB, INFO ) ELSE * * Use blocked code * * Zero the superdiagonal elements of the work array WORK13 * DO 20 J = 1, NB DO 10 I = 1, J - 1 WORK13( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * * Zero the subdiagonal elements of the work array WORK31 * DO 40 J = 1, NB DO 30 I = J + 1, NB WORK31( I, J ) = ZERO 30 CONTINUE 40 CONTINUE * * JU is the index of the last column affected by the current * stage of the factorization * JU = 1 * DO 180 J = 1, MIN( M, N ), NB JB = MIN( NB, MIN( M, N )-J+1 ) * * The active part of the matrix is partitioned * * A11 A12 A13 * A21 A22 A23 * A31 A32 A33 * * Here A11, A21 and A31 denote the current block of JB columns * which is about to be factorized. The number of rows in the * partitioning are JB, I2, I3 respectively, and the numbers * of columns are JB, J2, J3. The superdiagonal elements of A13 * and the subdiagonal elements of A31 lie outside the band. * I2 = MIN( KL-JB, M-J-JB+1 ) I3 = MIN( JB, M-J-KL+1 ) * * J2 and J3 are computed after JU has been updated. * * Factorize the current block of JB columns * DO 80 JJ = J, J + JB - 1 * * Find pivot and test for singularity. KM is the number of * subdiagonal elements in the current column. * KM = MIN( KL, M-JJ ) JP = 1 IF( AB( KV+JP, JJ ).NE.ZERO ) THEN JU = MAX( JU, MIN( JJ+KU+JP-1, N ) ) * * Compute multipliers * CALL CSCAL( KM, ONE / AB( KV+1, JJ ), AB( KV+2, JJ ), $ 1 ) * * Update trailing submatrix within the band and within * the current block. JM is the index of the last column * which needs to be updated. * JM = MIN( JU, J+JB-1 ) IF( JM.GT.JJ ) THEN CALL CGERU( KM, JM-JJ, -CONE, AB( KV+2, JJ ), 1, $ AB( KV, JJ+1 ), LDAB-1, $ AB( KV+1, JJ+1 ), LDAB-1 ) END IF END IF * * Copy current column of A31 into the work array WORK31 * NW = MIN( JJ-J+1, I3 ) IF( NW.GT.0 ) $ CALL CCOPY( NW, AB( KV+KL+1-JJ+J, JJ ), 1, $ WORK31( 1, JJ-J+1 ), 1 ) 80 CONTINUE IF( J+JB.LE.N ) THEN * * Apply the row interchanges to the other blocks. * J2 = MIN( JU-J+1, KV ) - JB J3 = MAX( 0, JU-J-KV+1 ) * * Update the relevant part of the trailing submatrix * IF( J2.GT.0 ) THEN * * Update A12 * CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', $ JB, J2, CONE, AB( KV+1, J ), LDAB-1, $ AB( KV+1-JB, J+JB ), LDAB-1 ) * IF( I2.GT.0 ) THEN * * Update A22 * CALL CGEMM( 'No transpose', 'No transpose', I2, J2, $ JB, -CONE, AB( KV+1+JB, J ), LDAB-1, $ AB( KV+1-JB, J+JB ), LDAB-1, CONE, $ AB( KV+1, J+JB ), LDAB-1 ) END IF * IF( I3.GT.0 ) THEN * * Update A32 * CALL CGEMM( 'No transpose', 'No transpose', I3, J2, $ JB, -CONE, WORK31, LDWORK, $ AB( KV+1-JB, J+JB ), LDAB-1, CONE, $ AB( KV+KL+1-JB, J+JB ), LDAB-1 ) END IF END IF * IF( J3.GT.0 ) THEN * * Copy the lower triangle of A13 into the work array * WORK13 * DO 130 JJ = 1, J3 DO 120 II = JJ, JB WORK13( II, JJ ) = AB( II-JJ+1, JJ+J+KV-1 ) 120 CONTINUE 130 CONTINUE * * Update A13 in the work array * CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', $ JB, J3, CONE, AB( KV+1, J ), LDAB-1, $ WORK13, LDWORK ) * IF( I2.GT.0 ) THEN * * Update A23 * CALL CGEMM( 'No transpose', 'No transpose', I2, J3, $ JB, -CONE, AB( KV+1+JB, J ), LDAB-1, $ WORK13, LDWORK, CONE, AB( 1+JB, J+KV ), $ LDAB-1 ) END IF * IF( I3.GT.0 ) THEN * * Update A33 * CALL CGEMM( 'No transpose', 'No transpose', I3, J3, $ JB, -CONE, WORK31, LDWORK, WORK13, $ LDWORK, CONE, AB( 1+KL, J+KV ), LDAB-1 ) END IF * * Copy the lower triangle of A13 back into place * DO 150 JJ = 1, J3 DO 140 II = JJ, JB AB( II-JJ+1, JJ+J+KV-1 ) = WORK13( II, JJ ) 140 CONTINUE 150 CONTINUE END IF ELSE END IF * * copy the upper triangle of A31 back into place * DO 170 JJ = J + JB - 1, J, -1 * * Copy the current column of A31 back into place * NW = MIN( I3, JJ-J+1 ) IF( NW.GT.0 ) $ CALL CCOPY( NW, WORK31( 1, JJ-J+1 ), 1, $ AB( KV+KL+1-JJ+J, JJ ), 1 ) 170 CONTINUE 180 CONTINUE END IF * RETURN * * End of CDBTRF * END scalapack-2.0.2/SRC/cdttrf.f000644 000766 000024 00000006514 11657111056 016006 0ustar00juliestaff000000 000000 SUBROUTINE CDTTRF( N, DL, D, DU, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, November 1996. * Modified from CGTTRF: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. INTEGER INFO, N * .. * .. Array Arguments .. COMPLEX D( * ), DL( * ), DU( * ) * .. * * Purpose * ======= * * CDTTRF computes an LU factorization of a complex tridiagonal matrix A * using elimination without partial pivoting. * * The factorization has the form * A = L * U * where L is a product of unit lower bidiagonal * matrices and U is upper triangular with nonzeros in only the main * diagonal and first superdiagonal. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * DL (input/output) COMPLEX array, dimension (N-1) * On entry, DL must contain the (n-1) subdiagonal elements of * A. * On exit, DL is overwritten by the (n-1) multipliers that * define the matrix L from the LU factorization of A. * * D (input/output) COMPLEX array, dimension (N) * On entry, D must contain the diagonal elements of A. * On exit, D is overwritten by the n diagonal elements of the * upper triangular matrix U from the LU factorization of A. * * DU (input/output) COMPLEX array, dimension (N-1) * On entry, DU must contain the (n-1) superdiagonal elements * of A. * On exit, DU is overwritten by the (n-1) elements of the first * superdiagonal of U. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Local Scalars .. INTEGER I COMPLEX FACT * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Parameters .. COMPLEX CZERO PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Executable Statements .. * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 CALL XERBLA( 'CDTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * DO 20 I = 1, N - 1 IF( DL( I ).EQ.CZERO ) THEN * * Subdiagonal is zero, no elimination is required. * IF( D( I ).EQ.CZERO .AND. INFO.EQ.0 ) $ INFO = I ELSE * FACT = DL( I ) / D( I ) DL( I ) = FACT D( I+1 ) = D( I+1 ) - FACT*DU( I ) END IF 20 CONTINUE IF( D( N ).EQ.CZERO .AND. INFO.EQ.0 ) THEN INFO = N RETURN END IF * RETURN * * End of CDTTRF * END scalapack-2.0.2/SRC/cdttrsv.f000644 000766 000024 00000013353 11657111056 016210 0ustar00juliestaff000000 000000 SUBROUTINE CDTTRSV( UPLO, TRANS, N, NRHS, DL, D, DU, $ B, LDB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * August, 1996. * Modified from CGTTRS: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. CHARACTER UPLO, TRANS INTEGER INFO, LDB, N, NRHS * .. * .. Array Arguments .. COMPLEX B( LDB, * ), D( * ), DL( * ), DU( * ) * .. * * Purpose * ======= * * CDTTRSV solves one of the systems of equations * L * X = B, L**T * X = B, or L**H * X = B, * U * X = B, U**T * X = B, or U**H * X = B, * with factors of the tridiagonal matrix A from the LU factorization * computed by CDTTRF. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether to solve with L or U. * * TRANS (input) CHARACTER * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * DL (input) COMPLEX array, dimension (N-1) * The (n-1) multipliers that define the matrix L from the * LU factorization of A. * * D (input) COMPLEX array, dimension (N) * The n diagonal elements of the upper triangular matrix U from * the LU factorization of A. * * DU (input) COMPLEX array, dimension (N-1) * The (n-1) elements of the first superdiagonal of U. * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, B is overwritten by the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Local Scalars .. LOGICAL LOWER, NOTRAN INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) LOWER = LSAME( UPLO, 'L' ) IF( .NOT.LOWER .AND. .NOT.LSAME( UPLO, 'U' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( N, 1 ) ) THEN INFO = -9 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CDTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( NOTRAN ) THEN * IF( LOWER ) THEN * Solve L*X = B, overwriting B with X. * DO 35 J = 1, NRHS * * Solve L*x = b. * DO 10 I = 1, N - 1 B( I+1, J ) = B( I+1, J ) - DL( I )*B( I, J ) 10 CONTINUE 35 CONTINUE * ELSE * Solve U*x = b. * DO 30 J = 1, NRHS B( N, J ) = B( N, J ) / D( N ) IF( N.GT.1 ) $ B( N-1, J ) = ( B( N-1, J )-DU( N-1 )*B( N, J ) ) / $ D( N-1 ) DO 20 I = N - 2, 1, -1 B( I, J ) = ( B( I, J )-DU( I )*B( I+1, J ) ) / D( I ) 20 CONTINUE 30 CONTINUE * ENDIF * ELSE IF( LSAME( TRANS, 'T' ) ) THEN * IF( .NOT. LOWER ) THEN * Solve U**T * X = B, overwriting B with X. * DO 65 J = 1, NRHS * * Solve U**T * x = b. * B( 1, J ) = B( 1, J ) / D( 1 ) IF( N.GT.1 ) $ B( 2, J ) = ( B( 2, J )-DU( 1 )*B( 1, J ) ) / D( 2 ) DO 40 I = 3, N B( I, J ) = ( B( I, J )-DU( I-1 )*B( I-1, J ) ) / D( I ) 40 CONTINUE 65 CONTINUE * ELSE * * Solve L**T * X = B, overwriting B with X. DO 60 J = 1, NRHS * * Solve L**T * x = b. * DO 50 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - DL( I )*B( I+1, J ) 50 CONTINUE 60 CONTINUE ENDIF ELSE * IF( .NOT. LOWER ) THEN * Solve U**H * X = B, overwriting B with X. * DO 95 J = 1, NRHS * * Solve U**H * x = b. * B( 1, J ) = B( 1, J ) / CONJG( D( 1 ) ) IF( N.GT.1 ) $ B( 2, J ) = ( B( 2, J )-CONJG( DU( 1 ) )*B( 1, J ) ) / $ CONJG( D( 2 ) ) DO 70 I = 3, N B( I, J ) = ( B( I, J ) $ -CONJG( DU( I-1 ) )*B( I-1, J ) ) / $ CONJG( D( I ) ) 70 CONTINUE 95 CONTINUE * ELSE * * Solve L**H * X = B, overwriting B with X. DO 90 J = 1, NRHS * * Solve L**H * x = b. * DO 80 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - CONJG( DL( I ) )*B( I+1, J ) 80 CONTINUE 90 CONTINUE ENDIF END IF * * End of CDTTRSV * END scalapack-2.0.2/SRC/clahqr2.f000644 000766 000024 00000034475 10363532303 016056 0ustar00juliestaff000000 000000 SUBROUTINE CLAHQR2( WANTT, WANTZ, N, ILO, IHI, H, LDH, W, ILOZ, $ IHIZ, Z, LDZ, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 22, 2000 * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, INFO, LDH, LDZ, N * .. * .. Array Arguments .. COMPLEX H( LDH, * ), W( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * CLAHQR2 is an auxiliary routine called by CHSEQR to update the * eigenvalues and Schur decomposition already computed by CHSEQR, by * dealing with the Hessenberg submatrix in rows and columns ILO to IHI. * This version of CLAHQR (not the standard LAPACK version) uses a * double-shift algorithm (like LAPACK's SLAHQR). * Unlike the standard LAPACK convention, this does not assume the * subdiagonal is real, nor does it work to preserve this quality if * given. * * Arguments * ========= * * WANTT (input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (input) INTEGER * The order of the matrix H. N >= 0. * * ILO (input) INTEGER * IHI (input) INTEGER * It is assumed that H is already upper triangular in rows and * columns IHI+1:N, and that H(ILO,ILO-1) = 0 (unless ILO = 1). * CLAHQR works primarily with the Hessenberg submatrix in rows * and columns ILO to IHI, but applies transformations to all of * H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * H (input/output) COMPLEX array, dimension (LDH,N) * On entry, the upper Hessenberg matrix H. * On exit, if WANTT is .TRUE., H is upper triangular in rows * and columns ILO:IHI. If WANTT is .FALSE., the contents of H * are unspecified on exit. * * LDH (input) INTEGER * The leading dimension of the array H. LDH >= max(1,N). * * W (output) COMPLEX array, dimension (N) * The computed eigenvalues ILO to IHI are stored in the * corresponding elements of W. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in H, with W(i) = H(i,i). * * ILOZ (input) INTEGER * IHIZ (input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (input/output) COMPLEX array, dimension (LDZ,N) * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations, and on exit Z has been updated; * transformations are applied only to the submatrix * Z(ILOZ:IHIZ,ILO:IHI). If WANTZ is .FALSE., Z is not * referenced. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * > 0: if INFO = i, CLAHQR failed to compute all the * eigenvalues ILO to IHI in a total of 30*(IHI-ILO+1) * iterations; elements i+1:ihi of W contain those * eigenvalues which have been successfully computed. * * Further Details * =============== * * Modified by Mark R. Fahey, June, 2000 * * ===================================================================== * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0E+0, RONE = 1.0E+0 ) REAL DAT1, DAT2 PARAMETER ( DAT1 = 0.75E+0, DAT2 = -0.4375E+0 ) * .. * .. Local Scalars .. INTEGER I, I1, I2, ITN, ITS, J, K, L, M, NH, NR, NZ REAL CS, OVFL, S, SMLNUM, TST1, ULP, UNFL COMPLEX CDUM, H00, H10, H11, H12, H21, H22, H33, H33S, $ H43H34, H44, H44S, SN, SUM, T1, T2, T3, V1, V2, $ V3 * .. * .. Local Arrays .. REAL RWORK( 1 ) COMPLEX V( 3 ) * .. * .. External Functions .. REAL SLAMCH, CLANHS EXTERNAL SLAMCH, CLANHS * .. * .. External Subroutines .. EXTERNAL SLABAD, CCOPY, CLANV2, CLARFG, CROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( N.EQ.0 ) $ RETURN IF( ILO.EQ.IHI ) THEN W( ILO ) = H( ILO, ILO ) RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * * Set machine-dependent constants for the stopping criterion. * If norm(H) <= sqrt(OVFL), overflow should not occur. * UNFL = SLAMCH( 'Safe minimum' ) OVFL = RONE / UNFL CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Precision' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = 30*NH * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of 1 or 2. Each iteration of the loop works * with the active submatrix in rows and columns L to I. * Eigenvalues I+1 to IHI have already converged. Either L = ILO, or * H(L,L-1) is negligible so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 150 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 130 ITS = 0, ITN * * Look for a single small subdiagonal element. * DO 20 K = I, L + 1, -1 TST1 = CABS1( H( K-1, K-1 ) ) + CABS1( H( K, K ) ) IF( TST1.EQ.RZERO ) $ TST1 = CLANHS( '1', I-L+1, H( L, L ), LDH, RWORK ) IF( CABS1( H( K, K-1 ) ).LE.MAX( ULP*TST1, SMLNUM ) ) $ GO TO 30 20 CONTINUE 30 CONTINUE L = K IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * H( L, L-1 ) = ZERO END IF * * Exit from loop if a submatrix of order 1 or 2 has split off. * IF( L.GE.I-1 ) $ GO TO 140 * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * IF( ITS.EQ.10 .OR. ITS.EQ.20 ) THEN * * Exceptional shift. * * S = ABS( REAL( H( I,I-1 ) ) ) + ABS( REAL( H( I-1,I-2 ) ) ) S = CABS1( H( I, I-1 ) ) + CABS1( H( I-1, I-2 ) ) H44 = DAT1*S H33 = H44 H43H34 = DAT2*S*S ELSE * * Prepare to use Wilkinson's shift. * H44 = H( I, I ) H33 = H( I-1, I-1 ) H43H34 = H( I, I-1 )*H( I-1, I ) END IF * * Look for two consecutive small subdiagonal elements. * DO 40 M = I - 2, L, -1 * * Determine the effect of starting the double-shift QR * iteration at row M, and see if this would make H(M,M-1) * negligible. * H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S = CABS1( V1 ) + CABS1( V2 ) + ABS( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 IF( M.EQ.L ) $ GO TO 50 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).LE.ULP*TST1 ) $ GO TO 50 40 CONTINUE 50 CONTINUE * * Double-shift QR step * DO 120 K = M, I - 1 * * The first iteration of this loop determines a reflection G * from the vector V and applies it from left and right to H, * thus creating a nonzero bulge below the subdiagonal. * * Each subsequent iteration determines a reflection G to * restore the Hessenberg form in the (K-1)th column, and thus * chases the bulge one step toward the bottom of the active * submatrix. NR is the order of G * NR = MIN( 3, I-K+1 ) IF( K.GT.M ) $ CALL CCOPY( NR, H( K, K-1 ), 1, V, 1 ) CALL CLARFG( NR, V( 1 ), V( 2 ), 1, T1 ) IF( K.GT.M ) THEN H( K, K-1 ) = V( 1 ) H( K+1, K-1 ) = ZERO IF( K.LT.I-1 ) $ H( K+2, K-1 ) = ZERO ELSE IF( M.GT.L ) THEN * The real double-shift code uses H( K, K-1 ) = -H( K, K-1 ) * instead of the following. H( K, K-1 ) = H( K, K-1 ) - CONJG( T1 )*H( K, K-1 ) END IF V2 = V( 2 ) T2 = T1*V2 IF( NR.EQ.3 ) THEN V3 = V( 3 ) T3 = T1*V3 * * Apply G from the left to transform the rows of the matrix * in columns K to I2. * DO 60 J = K, I2 SUM = CONJG( T1 )*H( K, J ) + $ CONJG( T2 )*H( K+1, J ) + $ CONJG( T3 )*H( K+2, J ) H( K, J ) = H( K, J ) - SUM H( K+1, J ) = H( K+1, J ) - SUM*V2 H( K+2, J ) = H( K+2, J ) - SUM*V3 60 CONTINUE * * Apply G from the right to transform the columns of the * matrix in rows I1 to min(K+3,I). * DO 70 J = I1, MIN( K+3, I ) SUM = T1*H( J, K ) + T2*H( J, K+1 ) + T3*H( J, K+2 ) H( J, K ) = H( J, K ) - SUM H( J, K+1 ) = H( J, K+1 ) - SUM*CONJG( V2 ) H( J, K+2 ) = H( J, K+2 ) - SUM*CONJG( V3 ) 70 CONTINUE * IF( WANTZ ) THEN * * Accumulate transformations in the matrix Z * DO 80 J = ILOZ, IHIZ SUM = T1*Z( J, K ) + T2*Z( J, K+1 ) + $ T3*Z( J, K+2 ) Z( J, K ) = Z( J, K ) - SUM Z( J, K+1 ) = Z( J, K+1 ) - SUM*CONJG( V2 ) Z( J, K+2 ) = Z( J, K+2 ) - SUM*CONJG( V3 ) 80 CONTINUE END IF ELSE IF( NR.EQ.2 ) THEN * * Apply G from the left to transform the rows of the matrix * in columns K to I2. * DO 90 J = K, I2 SUM = CONJG( T1 )*H( K, J ) + $ CONJG( T2 )*H( K+1, J ) H( K, J ) = H( K, J ) - SUM H( K+1, J ) = H( K+1, J ) - SUM*V2 90 CONTINUE * * Apply G from the right to transform the columns of the * matrix in rows I1 to min(K+2,I). * DO 100 J = I1, MIN( K+2, I ) SUM = T1*H( J, K ) + T2*H( J, K+1 ) H( J, K ) = H( J, K ) - SUM H( J, K+1 ) = H( J, K+1 ) - SUM*CONJG( V2 ) 100 CONTINUE * IF( WANTZ ) THEN * * Accumulate transformations in the matrix Z * DO 110 J = ILOZ, IHIZ SUM = T1*Z( J, K ) + T2*Z( J, K+1 ) Z( J, K ) = Z( J, K ) - SUM Z( J, K+1 ) = Z( J, K+1 ) - SUM*CONJG( V2 ) 110 CONTINUE END IF END IF * * Since at the start of the QR step we have for M > L * H( K, K-1 ) = H( K, K-1 ) - CONJG( T1 )*H( K, K-1 ) * then we don't need to do the following * IF( K.EQ.M .AND. M.GT.L ) THEN * If the QR step was started at row M > L because two * consecutive small subdiagonals were found, then H(M,M-1) * must also be updated by a factor of (1-T1). * TEMP = ONE - T1 * H( m, m-1 ) = H( m, m-1 )*CONJG( TEMP ) * END IF 120 CONTINUE * * Ensure that H(I,I-1) is real. * 130 CONTINUE * * Failure to converge in remaining number of iterations * INFO = I RETURN * 140 CONTINUE * IF( L.EQ.I ) THEN * * H(I,I-1) is negligible: one eigenvalue has converged. * W( I ) = H( I, I ) * ELSE IF( L.EQ.I-1 ) THEN * * H(I-1,I-2) is negligible: a pair of eigenvalues have converged. * * Transform the 2-by-2 submatrix to standard Schur form, * and compute and store the eigenvalues. * CALL CLANV2( H( I-1, I-1 ), H( I-1, I ), H( I, I-1 ), $ H( I, I ), W( I-1 ), W( I ), CS, SN ) * IF( WANTT ) THEN * * Apply the transformation to the rest of H. * IF( I2.GT.I ) $ CALL CROT( I2-I, H( I-1, I+1 ), LDH, H( I, I+1 ), LDH, $ CS, SN ) CALL CROT( I-I1-1, H( I1, I-1 ), 1, H( I1, I ), 1, CS, $ CONJG( SN ) ) END IF IF( WANTZ ) THEN * * Apply the transformation to Z. * CALL CROT( NZ, Z( ILOZ, I-1 ), 1, Z( ILOZ, I ), 1, CS, $ CONJG( SN ) ) END IF * END IF * * Decrement number of remaining iterations, and return to start of * the main loop with new value of I. * ITN = ITN - ITS I = L - 1 GO TO 10 * 150 CONTINUE RETURN * * End of CLAHQR2 * END scalapack-2.0.2/SRC/clamov.c000644 000766 000024 00000000254 11745567264 016007 0ustar00juliestaff000000 000000 // // clamov.c // // Written by Lee Killough 04/19/2012 // #define TYPE complex #define FUNC "CLAMOV" #define LAMOV clamov_ #define LACPY clacpy_ #include "lamov.h" scalapack-2.0.2/SRC/clamsh.f000644 000766 000024 00000022723 10363532303 015762 0ustar00juliestaff000000 000000 SUBROUTINE CLAMSH( S, LDS, NBULGE, JBLK, H, LDH, N, ULP ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * May 28, 1999 * * .. Scalar Arguments .. INTEGER JBLK, LDH, LDS, N, NBULGE REAL ULP * .. * .. Array Arguments .. COMPLEX H( LDH, * ), S( LDS, * ) * .. * * Purpose * ======= * * CLAMSH sends multiple shifts through a small (single node) matrix to * see how consecutive small subdiagonal elements are modified by * subsequent shifts in an effort to maximize the number of bulges * that can be sent through. * CLAMSH should only be called when there are multiple shifts/bulges * (NBULGE > 1) and the first shift is starting in the middle of an * unreduced Hessenberg matrix because of two or more consecutive * small subdiagonal elements. * * Arguments * ========= * * S (local input/output) COMPLEX array, ( LDS,* ) * On entry, the matrix of shifts. Only the 2x2 diagonal of S * is referenced. It is assumed that S has JBLK double shifts * (size 2). * On exit, the data is rearranged in the best order for * applying. * * LDS (local input) INTEGER * On entry, the leading dimension of S. Unchanged on exit. * 1 < NBULGE <= JBLK <= LDS/2 * * NBULGE (local input/output) INTEGER * On entry, the number of bulges to send through H ( >1 ). * NBULGE should be less than the maximum determined (JBLK). * 1 < NBULGE <= JBLK <= LDS/2 * On exit, the maximum number of bulges that can be sent * through. * * JBLK (local input) INTEGER * On entry, the number of shifts determined for S. * Unchanged on exit. * * H (local input/output) COMPLEX array ( LDH,N ) * On entry, the local matrix to apply the shifts on. * H should be aligned so that the starting row is 2. * On exit, the data is destroyed. * * LDH (local input) INTEGER * On entry, the leading dimension of H. Unchanged on exit. * * N (local input) INTEGER * On entry, the size of H. If all the bulges are expected to * go through, N should be at least 4*NBULGE+2. * Otherwise, NBULGE may be reduced by this routine. * * ULP (local input) REAL * On entry, machine precision * Unchanged on exit. * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. REAL RONE, TEN PARAMETER ( RONE = 1.0E+0, TEN = 10.0E+0 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IBULGE, IVAL, J, K, M, NR REAL DVAL, S1, TST1 COMPLEX CDUM, H00, H10, H11, H12, H21, H22, H33, H33S, $ H43H34, H44, H44S, SUM, T1, T2, T3, V1, V2, V3 * .. * .. Local Arrays .. COMPLEX V( 3 ) * .. * .. External Subroutines .. EXTERNAL CCOPY, CLARFG * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. * M = 2 DO 50 IBULGE = 1, NBULGE H44 = S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2 ) H33 = S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+1 ) H43H34 = S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+2 )* $ S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1 ) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).GT.ULP*TST1 ) THEN * Find minimum DVAL = ( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ) ) / $ ( ULP*TST1 ) IVAL = IBULGE DO 10 I = IBULGE + 1, NBULGE H44 = S( 2*JBLK-2*I+2, 2*JBLK-2*I+2 ) H33 = S( 2*JBLK-2*I+1, 2*JBLK-2*I+1 ) H43H34 = S( 2*JBLK-2*I+1, 2*JBLK-2*I+2 )* $ S( 2*JBLK-2*I+2, 2*JBLK-2*I+1 ) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF( ( DVAL.GT.( CABS1( H10 )*( CABS1( V2 )+ $ CABS1( V3 ) ) ) / ( ULP*TST1 ) ) .AND. $ ( DVAL.GT.RONE ) ) THEN DVAL = ( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ) ) / $ ( ULP*TST1 ) IVAL = I END IF 10 CONTINUE IF( ( DVAL.LT.TEN ) .AND. ( IVAL.NE.IBULGE ) ) THEN H44 = S( 2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+2 ) H33 = S( 2*JBLK-2*IVAL+1, 2*JBLK-2*IVAL+1 ) H43H34 = S( 2*JBLK-2*IVAL+1, 2*JBLK-2*IVAL+2 ) H10 = S( 2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1 ) S( 2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+2 ) = S( 2*JBLK-2* $ IBULGE+2, 2*JBLK-2*IBULGE+2 ) S( 2*JBLK-2*IVAL+1, 2*JBLK-2*IVAL+1 ) = S( 2*JBLK-2* $ IBULGE+1, 2*JBLK-2*IBULGE+1 ) S( 2*JBLK-2*IVAL+1, 2*JBLK-2*IVAL+2 ) = S( 2*JBLK-2* $ IBULGE+1, 2*JBLK-2*IBULGE+2 ) S( 2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1 ) = S( 2*JBLK-2* $ IBULGE+2, 2*JBLK-2*IBULGE+1 ) S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2 ) = H44 S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+1 ) = H33 S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+2 ) = H43H34 S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1 ) = H10 END IF H44 = S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2 ) H33 = S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+1 ) H43H34 = S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+2 )* $ S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1 ) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) END IF IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).GT.TEN*ULP*TST1 ) $ THEN * IBULGE better not be 1 here or we have a bug! NBULGE = MAX( IBULGE-1, 1 ) RETURN END IF DO 40 K = M, N - 1 NR = MIN( 3, N-K+1 ) IF( K.GT.M ) $ CALL CCOPY( NR, H( K, K-1 ), 1, V, 1 ) CALL CLARFG( NR, V( 1 ), V( 2 ), 1, T1 ) IF( K.GT.M ) THEN H( K, K-1 ) = V( 1 ) H( K+1, K-1 ) = ZERO IF( K.LT.N-1 ) $ H( K+2, K-1 ) = ZERO ELSE * H(m,m-1) must be updated, * H( K, K-1 ) = H( K, K-1 ) - CONJG( T1 )*H( K, K-1 ) END IF V2 = V( 2 ) T2 = T1*V2 IF( NR.EQ.3 ) THEN V3 = V( 3 ) T3 = T1*V3 DO 20 J = K, N SUM = CONJG( T1 )*H( K, J ) + $ CONJG( T2 )*H( K+1, J ) + $ CONJG( T3 )*H( K+2, J ) H( K, J ) = H( K, J ) - SUM H( K+1, J ) = H( K+1, J ) - SUM*V2 H( K+2, J ) = H( K+2, J ) - SUM*V3 20 CONTINUE DO 30 J = 1, MIN( K+3, N ) SUM = T1*H( J, K ) + T2*H( J, K+1 ) + T3*H( J, K+2 ) H( J, K ) = H( J, K ) - SUM H( J, K+1 ) = H( J, K+1 ) - SUM*CONJG( V2 ) H( J, K+2 ) = H( J, K+2 ) - SUM*CONJG( V3 ) 30 CONTINUE END IF 40 CONTINUE 50 CONTINUE * RETURN * * End of CLAMSH * END scalapack-2.0.2/SRC/clanv2.f000644 000766 000024 00000006672 10363532303 015705 0ustar00juliestaff000000 000000 SUBROUTINE CLANV2( A, B, C, D, RT1, RT2, CS, SN ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * May 28, 1999 * * .. Scalar Arguments .. REAL CS COMPLEX A, B, C, D, RT1, RT2, SN * .. * * Purpose * ======= * * CLANV2 computes the Schur factorization of a complex 2-by-2 * nonhermitian matrix in standard form: * * [ A B ] = [ CS -SN ] [ AA BB ] [ CS SN ] * [ C D ] [ SN CS ] [ 0 DD ] [-SN CS ] * * Arguments * ========= * * A (input/output) COMPLEX * B (input/output) COMPLEX * C (input/output) COMPLEX * D (input/output) COMPLEX * On entry, the elements of the input matrix. * On exit, they are overwritten by the elements of the * standardised Schur form. * * RT1 (output) COMPLEX * RT2 (output) COMPLEX * The two eigenvalues. * * CS (output) REAL * SN (output) COMPLEX * Parameters of the rotation matrix. * * Further Details * =============== * * Implemented by Mark R. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. REAL RZERO, HALF, RONE PARAMETER ( RZERO = 0.0E+0, HALF = 0.5E+0, $ RONE = 1.0E+0 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. COMPLEX AA, BB, DD, T, TEMP, TEMP2, U, X, Y * .. * .. External Functions .. COMPLEX CLADIV EXTERNAL CLADIV * .. * .. External Subroutines .. EXTERNAL CLARTG * .. * .. Intrinsic Functions .. INTRINSIC REAL, CMPLX, CONJG, AIMAG, SQRT * .. * .. Executable Statements .. * * Initialize CS and SN * CS = RONE SN = ZERO * IF( C.EQ.ZERO ) THEN GO TO 10 * ELSE IF( B.EQ.ZERO ) THEN * * Swap rows and columns * CS = RZERO SN = ONE TEMP = D D = A A = TEMP B = -C C = ZERO GO TO 10 ELSE IF( ( A-D ).EQ.ZERO ) THEN TEMP = SQRT( B*C ) A = A + TEMP D = D - TEMP IF( ( B+C ).EQ.ZERO ) THEN CS = SQRT( HALF ) SN = CMPLX( RZERO, RONE )*CS ELSE TEMP = SQRT( B+C ) TEMP2 = CLADIV( SQRT( B ), TEMP ) CS = REAL( TEMP2 ) SN = CLADIV( SQRT( C ), TEMP ) END IF B = B - C C = ZERO GO TO 10 ELSE * * Compute eigenvalue closest to D * T = D U = B*C X = HALF*( A-T ) Y = SQRT( X*X+U ) IF( REAL( X )*REAL( Y )+AIMAG( X )*AIMAG( Y ).LT.RZERO ) $ Y = -Y T = T - CLADIV( U, ( X+Y ) ) * * Do one QR step with exact shift T - resulting 2 x 2 in * triangular form. * CALL CLARTG( A-T, C, CS, SN, AA ) * D = D - T BB = CS*B + SN*D DD = -CONJG( SN )*B + CS*D * A = AA*CS + BB*CONJG( SN ) + T B = -AA*SN + BB*CS C = ZERO D = T * END IF * 10 CONTINUE * * Store eigenvalues in RT1 and RT2. * RT1 = A RT2 = D RETURN * * End of CLANV2 * END scalapack-2.0.2/SRC/claref.f000644 000766 000024 00000031503 10363532303 015743 0ustar00juliestaff000000 000000 SUBROUTINE CLAREF( TYPE, A, LDA, WANTZ, Z, LDZ, BLOCK, IROW1, $ ICOL1, ISTART, ISTOP, ITMP1, ITMP2, LILOZ, $ LIHIZ, VECS, V2, V3, T1, T2, T3 ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * May 28, 1999 * * .. Scalar Arguments .. LOGICAL BLOCK, WANTZ CHARACTER TYPE INTEGER ICOL1, IROW1, ISTART, ISTOP, ITMP1, ITMP2, LDA, $ LDZ, LIHIZ, LILOZ COMPLEX T1, T2, T3, V2, V3 * .. * .. Array Arguments .. COMPLEX A( LDA, * ), VECS( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * CLAREF applies one or several Householder reflectors of size 3 * to one or two matrices (if column is specified) on either their * rows or columns. * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * If 'R': Apply reflectors to the rows of the matrix * (apply from left) * Otherwise: Apply reflectors to the columns of the matrix * Unchanged on exit. * * A (global input/output) COMPLEX array, (LDA,*) * On entry, the matrix to receive the reflections. * The updated matrix on exit. * * LDA (local input) INTEGER * On entry, the leading dimension of A. Unchanged on exit. * * WANTZ (global input) LOGICAL * If .TRUE., then apply any column reflections to Z as well. * If .FALSE., then do no additional work on Z. * * Z (global input/output) COMPLEX array, (LDZ,*) * On entry, the second matrix to receive column reflections. * This is changed only if WANTZ is set. * * LDZ (local input) INTEGER * On entry, the leading dimension of Z. Unchanged on exit. * * BLOCK (global input) LOGICAL * If .TRUE., then apply several reflectors at once and read * their data from the VECS array. * If .FALSE., apply the single reflector given by V2, V3, * T1, T2, and T3. * * IROW1 (local input/output) INTEGER * On entry, the local row element of A. * Undefined on output. * * * ICOL1 (local input/output) INTEGER * On entry, the local column element of A. * Undefined on output. * * ISTART (global input) INTEGER * Specifies the "number" of the first reflector. This is * used as an index into VECS if BLOCK is set. * ISTART is ignored if BLOCK is .FALSE.. * * ISTOP (global input) INTEGER * Specifies the "number" of the last reflector. This is * used as an index into VECS if BLOCK is set. * ISTOP is ignored if BLOCK is .FALSE.. * * ITMP1 (local input) INTEGER * Starting range into A. For rows, this is the local * first column. For columns, this is the local first row. * * ITMP2 (local input) INTEGER * Ending range into A. For rows, this is the local last * column. For columns, this is the local last row. * * LILOZ * LIHIZ (local input) INTEGER * These serve the same purpose as ITMP1,ITMP2 but for Z * when WANTZ is set. * * VECS (global input) COMPLEX array of size 3*N (matrix size) * This holds the size 3 reflectors one after another and this * is only accessed when BLOCK is .TRUE. * * V2 * V3 * T1 * T2 * T3 (global input/output) COMPLEX * This holds information on a single size 3 Householder * reflector and is read when BLOCK is .FALSE., and * overwritten when BLOCK is .TRUE. * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Local Scalars .. INTEGER J, K COMPLEX A1, A11, A2, A22, A3, A4, A5, B1, B2, B3, B4, $ B5, H11, H22, SUM, SUM1, SUM2, SUM3, T12, T13, $ T22, T23, T32, T33, TMP1, TMP2, TMP3, V22, V23, $ V32, V33 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MOD * .. * .. Executable Statements .. * IF( LSAME( TYPE, 'R' ) ) THEN IF( BLOCK ) THEN DO 30 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 10 J = ITMP1, ITMP2 - MOD( ITMP2-ITMP1+1, 2 ), 2 A1 = A( IROW1, J ) A2 = A( IROW1+1, J ) A3 = A( IROW1+2, J ) A4 = A( IROW1+3, J ) A5 = A( IROW1+4, J ) B1 = A( IROW1, J+1 ) B2 = A( IROW1+1, J+1 ) B3 = A( IROW1+2, J+1 ) B4 = A( IROW1+3, J+1 ) B5 = A( IROW1+4, J+1 ) SUM1 = CONJG( T1 )*A1 + CONJG( T2 )*A2 + $ CONJG( T3 )*A3 A( IROW1, J ) = A1 - SUM1 H11 = A2 - SUM1*V2 H22 = A3 - SUM1*V3 TMP1 = CONJG( T1 )*B1 + CONJG( T2 )*B2 + $ CONJG( T3 )*B3 A( IROW1, J+1 ) = B1 - TMP1 A11 = B2 - TMP1*V2 A22 = B3 - TMP1*V3 SUM2 = CONJG( T12 )*H11 + CONJG( T22 )*H22 + $ CONJG( T32 )*A4 A( IROW1+1, J ) = H11 - SUM2 H11 = H22 - SUM2*V22 H22 = A4 - SUM2*V32 TMP2 = CONJG( T12 )*A11 + CONJG( T22 )*A22 + $ CONJG( T32 )*B4 A( IROW1+1, J+1 ) = A11 - TMP2 A11 = A22 - TMP2*V22 A22 = B4 - TMP2*V32 SUM3 = CONJG( T13 )*H11 + CONJG( T23 )*H22 + $ CONJG( T33 )*A5 A( IROW1+2, J ) = H11 - SUM3 A( IROW1+3, J ) = H22 - SUM3*V23 A( IROW1+4, J ) = A5 - SUM3*V33 TMP3 = CONJG( T13 )*A11 + CONJG( T23 )*A22 + $ CONJG( T33 )*B5 A( IROW1+2, J+1 ) = A11 - TMP3 A( IROW1+3, J+1 ) = A22 - TMP3*V23 A( IROW1+4, J+1 ) = B5 - TMP3*V33 10 CONTINUE DO 20 J = ITMP2 - MOD( ITMP2-ITMP1+1, 2 ) + 1, ITMP2 SUM = CONJG( T1 )*A( IROW1, J ) + $ CONJG( T2 )*A( IROW1+1, J ) + $ CONJG( T3 )*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM H11 = A( IROW1+1, J ) - SUM*V2 H22 = A( IROW1+2, J ) - SUM*V3 SUM = CONJG( T12 )*H11 + CONJG( T22 )*H22 + $ CONJG( T32 )*A( IROW1+3, J ) A( IROW1+1, J ) = H11 - SUM H11 = H22 - SUM*V22 H22 = A( IROW1+3, J ) - SUM*V32 SUM = CONJG( T13 )*H11 + CONJG( T23 )*H22 + $ CONJG( T33 )*A( IROW1+4, J ) A( IROW1+2, J ) = H11 - SUM A( IROW1+3, J ) = H22 - SUM*V23 A( IROW1+4, J ) = A( IROW1+4, J ) - SUM*V33 20 CONTINUE IROW1 = IROW1 + 3 30 CONTINUE DO 50 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 40 J = ITMP1, ITMP2 SUM = CONJG( T1 )*A( IROW1, J ) + $ CONJG( T2 )*A( IROW1+1, J ) + $ CONJG( T3 )*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*V2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*V3 40 CONTINUE IROW1 = IROW1 + 1 50 CONTINUE ELSE DO 60 J = ITMP1, ITMP2 SUM = CONJG( T1 )*A( IROW1, J ) + $ CONJG( T2 )*A( IROW1+1, J ) + $ CONJG( T3 )*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*V2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*V3 60 CONTINUE END IF ELSE * * Do column transforms * IF( BLOCK ) THEN DO 90 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 70 J = ITMP1, ITMP2 SUM = T1*A( J, ICOL1 ) + T2*A( J, ICOL1+1 ) + $ T3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM H11 = A( J, ICOL1+1 ) - SUM*CONJG( V2 ) H22 = A( J, ICOL1+2 ) - SUM*CONJG( V3 ) SUM = T12*H11 + T22*H22 + T32*A( J, ICOL1+3 ) A( J, ICOL1+1 ) = H11 - SUM H11 = H22 - SUM*CONJG( V22 ) H22 = A( J, ICOL1+3 ) - SUM*CONJG( V32 ) SUM = T13*H11 + T23*H22 + T33*A( J, ICOL1+4 ) A( J, ICOL1+2 ) = H11 - SUM A( J, ICOL1+3 ) = H22 - SUM*CONJG( V23 ) A( J, ICOL1+4 ) = A( J, ICOL1+4 ) - SUM*CONJG( V33 ) 70 CONTINUE IF( WANTZ ) THEN DO 80 J = LILOZ, LIHIZ SUM = T1*Z( J, ICOL1 ) + T2*Z( J, ICOL1+1 ) + $ T3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM H11 = Z( J, ICOL1+1 ) - SUM*CONJG( V2 ) H22 = Z( J, ICOL1+2 ) - SUM*CONJG( V3 ) SUM = T12*H11 + T22*H22 + T32*Z( J, ICOL1+3 ) Z( J, ICOL1+1 ) = H11 - SUM H11 = H22 - SUM*CONJG( V22 ) H22 = Z( J, ICOL1+3 ) - SUM*CONJG( V32 ) SUM = T13*H11 + T23*H22 + T33*Z( J, ICOL1+4 ) Z( J, ICOL1+2 ) = H11 - SUM Z( J, ICOL1+3 ) = H22 - SUM*CONJG( V23 ) Z( J, ICOL1+4 ) = Z( J, ICOL1+4 ) - $ SUM*CONJG( V33 ) 80 CONTINUE END IF ICOL1 = ICOL1 + 3 90 CONTINUE DO 120 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 100 J = ITMP1, ITMP2 SUM = T1*A( J, ICOL1 ) + T2*A( J, ICOL1+1 ) + $ T3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*CONJG( V2 ) A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*CONJG( V3 ) 100 CONTINUE IF( WANTZ ) THEN DO 110 J = LILOZ, LIHIZ SUM = T1*Z( J, ICOL1 ) + T2*Z( J, ICOL1+1 ) + $ T3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM Z( J, ICOL1+1 ) = Z( J, ICOL1+1 ) - $ SUM*CONJG( V2 ) Z( J, ICOL1+2 ) = Z( J, ICOL1+2 ) - $ SUM*CONJG( V3 ) 110 CONTINUE END IF ICOL1 = ICOL1 + 1 120 CONTINUE ELSE DO 130 J = ITMP1, ITMP2 SUM = T1*A( J, ICOL1 ) + T2*A( J, ICOL1+1 ) + $ T3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*CONJG( V2 ) A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*CONJG( V3 ) 130 CONTINUE END IF END IF RETURN * * End of CLAREF * END scalapack-2.0.2/SRC/CMakeLists.txt000644 000766 000024 00000017264 11745555630 017125 0ustar00juliestaff000000 000000 set (ALLAUX pjlaenv.f pilaenvx.f piparmq.f pilaver.f pmpim2.f pmpcol.f) set (ALLAUX-C pbchkvect.c getpbbuf.c pcrot.c pslaiect.c pdlaiect.c pzrot.c slamov.c clamov.c dlamov.c zlamov.c) set (SCLAUX pslabad.f pslaed0.f pslaed1.f pslaed2.f pslaed3.f pslaedz.f pslamch.f pslared1d.f pslasrt.f psstebz.f psstedc.f slapst.f slasrt2.f sstein2.f slar1va.f slarrb2.f slarrd2.f slarre2.f slarre2a.f slarrf2.f slarrv2.f sstegr2.f sstegr2a.f sstegr2b.f) set (DZLAUX pdlabad.f pdlaed0.f pdlaed1.f pdlaed2.f pdlaed3.f pdlaedz.f pdlamch.f pdlared1d.f pdlasrt.f pdstebz.f pdstedc.f dlapst.f dlasrt2.f dstein2.f dlar1va.f dlarrb2.f dlarrd2.f dlarre2.f dlarre2a.f dlarrf2.f dlarrv2.f dstegr2.f dstegr2a.f dstegr2b.f) set (SLASRC psdbsv.f psdbtrf.f psdbtrs.f psdbtrsv.f psdtsv.f psdttrf.f psdttrs.f psdttrsv.f psgbsv.f psgbtrf.f psgbtrs.f psgebd2.f psgebrd.f psgecon.f psgeequ.f psgehd2.f psgehrd.f psgelq2.f psgelqf.f psgels.f psgeql2.f psgeqlf.f psgeqpf.f psgeqr2.f psgeqrf.f psgerfs.f psgerq2.f psgerqf.f psgesv.f psgesvd.f psgesvx.f psgetf2.f psgetrf.f psgetri.f psgetrs.f psggqrf.f psggrqf.f pslabrd.f pslacon.f pslacp2.f pslacpy.f pslahrd.f pslange.f pslanhs.f pslansy.f pslantr.f pslapiv.f pslapv2.f pslaqge.f pslaqsy.f pslarf.f pslarfb.f pslarfg.f pslarft.f pslase2.f pslaset.f pslascl.f pslassq.f pslaswp.f pslatra.f pslatrd.f pslatrs.f pslauu2.f pslauum.f psorg2l.f psorg2r.f psorgl2.f psorglq.f psorgql.f psorgqr.f psorgr2.f psorgrq.f psorm2l.f psorm2r.f psormbr.f psormhr.f psorml2.f psormlq.f psormql.f psormqr.f psormr2.f psormrq.f psormtr.f pspocon.f pspbsv.f pspbtrf.f pspbtrs.f pspbtrsv.f psptsv.f pspttrf.f pspttrs.f pspttrsv.f pspoequ.f psporfs.f psposv.f psposvx.f pspotf2.f pspotrf.f pspotri.f pspotrs.f psrscl.f psstein.f pssyev.f pssyevd.f pssyevx.f pssygs2.f pssygst.f pssygvx.f pssyngst.f pssyntrd.f pssyttrd.f pssytd2.f pssytrd.f pstrti2.f pstrtri.f pstrtrs.f pslaevswp.f pslarzb.f pslarzt.f pslarz.f pslatrz.f pstzrzf.f psormr3.f psormrz.f pslahqr.f pslaconsb.f pslacp3.f pslawil.f pslasmsub.f pslared2d.f pslamr1d.f slaref.f slamsh.f slasorte.f ssteqr2.f sdbtf2.f sdbtrf.f sdttrf.f sdttrsv.f spttrsv.f strmvt.f pssyevr.f bslaapp.f bslaexc.f bstrexc.f pstrord.f pstrsen.f psgebal.f pshseqr.f pslamve.f pslaqr0.f pslaqr1.f pslaqr2.f pslaqr3.f pslaqr4.f pslaqr5.f psrot.f slaqr6.f) set (CLASRC pcdbsv.f pcdbtrf.f pcdbtrs.f pcdbtrsv.f pcdtsv.f pcdttrf.f pcdttrs.f pcdttrsv.f pcgbsv.f pcgbtrf.f pcgbtrs.f pcgebd2.f pcgebrd.f pcgecon.f pcgeequ.f pcgehd2.f pcgehrd.f pcgelq2.f pcgelqf.f pcgels.f pcgeql2.f pcgeqlf.f pcgeqpf.f pcgeqr2.f pcgeqrf.f pcgerfs.f pcgerq2.f pcgerqf.f pcgesv.f pcgesvd.f pcgesvx.f pcgetf2.f pcgetrf.f pcgetri.f pcgetrs.f pcggqrf.f pcggrqf.f pcheev.f pcheevd.f pcheevx.f pchegs2.f pchegst.f pchegvx.f pchengst.f pchentrd.f pchettrd.f pchetd2.f pchetrd.f pclabrd.f pclacon.f pclacgv.f pclacp2.f pclacpy.f pclahrd.f pclahqr.f pclaconsb.f pclasmsub.f pclacp3.f pclawil.f pclange.f pclanhe.f pclanhs.f pclansy.f pclantr.f pclapiv.f pclapv2.f pclaqge.f pclaqsy.f pclarf.f pclarfb.f pclarfc.f pclarfg.f pclarft.f pclascl.f pclase2.f pclaset.f pclassq.f pclaswp.f pclatra.f pclatrd.f pclatrs.f pclauu2.f pclauum.f pcpocon.f pcpoequ.f pcporfs.f pcposv.f pcpbsv.f pcpbtrf.f pcpbtrs.f pcpbtrsv.f pcptsv.f pcpttrf.f pcpttrs.f pcpttrsv.f pcposvx.f pcpotf2.f pcpotrf.f pcpotri.f pcpotrs.f pcsrscl.f pcstein.f pctrevc.f pctrti2.f pctrtri.f pctrtrs.f pcung2l.f pcung2r.f pcungl2.f pcunglq.f pcungql.f pcungqr.f pcungr2.f pcungrq.f pcunm2l.f pcunm2r.f pcunmbr.f pcunmhr.f pcunml2.f pcunmlq.f pcunmql.f pcunmqr.f pcunmr2.f pcunmrq.f pcunmtr.f pclaevswp.f pclarzb.f pclarzt.f pclarz.f pclarzc.f pclatrz.f pctzrzf.f pclattrs.f pcunmr3.f pcunmrz.f pcmax1.f pscsum1.f pclamr1d.f cdbtf2.f cdbtrf.f cdttrf.f cdttrsv.f cpttrsv.f csteqr2.f ctrmvt.f clamsh.f claref.f clanv2.f clahqr2.f pcheevr.f) set (DLASRC pddbsv.f pddbtrf.f pddbtrs.f pddbtrsv.f pddtsv.f pddttrf.f pddttrs.f pddttrsv.f pdgbsv.f pdgbtrf.f pdgbtrs.f pdgebd2.f pdgebrd.f pdgecon.f pdgeequ.f pdgehd2.f pdgehrd.f pdgelq2.f pdgelqf.f pdgels.f pdgeql2.f pdgeqlf.f pdgeqpf.f pdgeqr2.f pdgeqrf.f pdgerfs.f pdgerq2.f pdgerqf.f pdgesv.f pdgesvd.f pdgesvx.f pdgetf2.f pdgetrf.f pdgetri.f pdgetrs.f pdggqrf.f pdggrqf.f pdlabrd.f pdlacon.f pdlacp2.f pdlacpy.f pdlahrd.f pdlange.f pdlanhs.f pdlansy.f pdlantr.f pdlapiv.f pdlapv2.f pdlaqge.f pdlaqsy.f pdlarf.f pdlarfb.f pdlarfg.f pdlarft.f pdlase2.f pdlaset.f pdlascl.f pdlassq.f pdlaswp.f pdlatra.f pdlatrd.f pdlatrs.f pdlauu2.f pdlauum.f pdorg2l.f pdorg2r.f pdorgl2.f pdorglq.f pdorgql.f pdorgqr.f pdorgr2.f pdorgrq.f pdorm2l.f pdorm2r.f pdormbr.f pdormhr.f pdorml2.f pdormlq.f pdormql.f pdormqr.f pdormr2.f pdormrq.f pdormtr.f pdpocon.f pdpbsv.f pdpbtrf.f pdpbtrs.f pdpbtrsv.f pdptsv.f pdpttrf.f pdpttrs.f pdpttrsv.f pdpoequ.f pdporfs.f pdposv.f pdposvx.f pdpotf2.f pdpotrf.f pdpotri.f pdpotrs.f pdrscl.f pdstein.f pdsyev.f pdsyevd.f pdsyevx.f pdsygs2.f pdsygst.f pdsygvx.f pdsyngst.f pdsyntrd.f pdsyttrd.f pdsytd2.f pdsytrd.f pdtrti2.f pdtrtri.f pdtrtrs.f pdlaevswp.f pdlarzb.f pdlarzt.f pdlarz.f pdlatrz.f pdtzrzf.f pdormr3.f pdormrz.f pdlahqr.f pdlaconsb.f pdlacp3.f pdlawil.f pdlasmsub.f pdlared2d.f pdlamr1d.f dlaref.f dlamsh.f dlasorte.f dsteqr2.f ddbtf2.f ddbtrf.f ddttrf.f ddttrsv.f dpttrsv.f dtrmvt.f pdsyevr.f bdlaapp.f bdlaexc.f bdtrexc.f dlaqr6.f pdtrord.f pdtrsen.f pdgebal.f pdhseqr.f pdlamve.f pdlaqr0.f pdlaqr1.f pdlaqr2.f pdlaqr3.f pdlaqr4.f pdlaqr5.f pdrot.f) set (ZLASRC pzdbsv.f pzdbtrf.f pzdbtrs.f pzdbtrsv.f pzdtsv.f pzdttrf.f pzdttrs.f pzdttrsv.f pzgbsv.f pzgbtrf.f pzgbtrs.f pzgebd2.f pzgebrd.f pzgecon.f pzgeequ.f pzgehd2.f pzgehrd.f pzgelq2.f pzgelqf.f pzgels.f pzgeql2.f pzgeqlf.f pzgeqpf.f pzgeqr2.f pzgeqrf.f pzgerfs.f pzgerq2.f pzgerqf.f pzgesv.f pzgesvd.f pzgesvx.f pzgetf2.f pzgetrf.f pzgetri.f pzgetrs.f pzggqrf.f pzggrqf.f pzheev.f pzheevd.f pzheevx.f pzhegs2.f pzhegst.f pzhegvx.f pzhengst.f pzhentrd.f pzhettrd.f pzhetd2.f pzhetrd.f pzlabrd.f pzlacon.f pzlacgv.f pzlacp2.f pzlacpy.f pzlahrd.f pzlahqr.f pzlaconsb.f pzlasmsub.f pzlacp3.f pzlawil.f pzlange.f pzlanhe.f pzlanhs.f pzlansy.f pzlantr.f pzlapiv.f pzlapv2.f pzlaqge.f pzlaqsy.f pzlarf.f pzlarfb.f pzlarfc.f pzlarfg.f pzlarft.f pzlascl.f pzlase2.f pzlaset.f pzlassq.f pzlaswp.f pzlatra.f pzlatrd.f pzlattrs.f pzlatrs.f pzlauu2.f pzlauum.f pzpocon.f pzpoequ.f pzporfs.f pzposv.f pzpbsv.f pzpbtrf.f pzpbtrs.f pzpbtrsv.f pzptsv.f pzpttrf.f pzpttrs.f pzpttrsv.f pzposvx.f pzpotf2.f pzpotrf.f pzpotri.f pzpotrs.f pzdrscl.f pzstein.f pztrevc.f pztrti2.f pztrtri.f pztrtrs.f pzung2l.f pzung2r.f pzungl2.f pzunglq.f pzungql.f pzungqr.f pzungr2.f pzungrq.f pzunm2l.f pzunm2r.f pzunmbr.f pzunmhr.f pzunml2.f pzunmlq.f pzunmql.f pzunmqr.f pzunmr2.f pzunmrq.f pzunmtr.f pzlaevswp.f pzlarzb.f pzlarzt.f pzlarz.f pzlarzc.f pzlatrz.f pztzrzf.f pzunmr3.f pzunmrz.f pzmax1.f pdzsum1.f pzlamr1d.f zdbtf2.f zdbtrf.f zdttrf.f zdttrsv.f zpttrsv.f zsteqr2.f ztrmvt.f zlamsh.f zlaref.f zlanv2.f zlahqr2.f pzheevr.f) set(src ${ALLAUX} ${SCLAUX} ${DZLAUX} ${SLASRC} ${CLASRC} ${DLASRC} ${ZLASRC} ) set(src-C ${ALLAUX-C} ) scalapack-2.0.2/SRC/cpttrsv.f000644 000766 000024 00000011412 11657111056 016216 0ustar00juliestaff000000 000000 SUBROUTINE CPTTRSV( UPLO, TRANS, N, NRHS, D, E, B, LDB, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * November, 1996. * Modified from CPTTRS: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. CHARACTER UPLO, TRANS INTEGER INFO, LDB, N, NRHS * .. * .. Array Arguments .. REAL D( * ) COMPLEX B( LDB, * ), E( * ) * .. * * Purpose * ======= * * CPTTRSV solves one of the triangular systems * L * X = B, or L**H * X = B, * U * X = B, or U**H * X = B, * where L or U is the Cholesky factor of a Hermitian positive * definite tridiagonal matrix A such that * A = U**H*D*U or A = L*D*L**H (computed by CPTTRF). * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the superdiagonal or the subdiagonal * of the tridiagonal matrix A is stored and the form of the * factorization: * = 'U': E is the superdiagonal of U, and A = U'*D*U; * = 'L': E is the subdiagonal of L, and A = L*D*L'. * (The two forms are equivalent if A is real.) * * TRANS (input) CHARACTER * Specifies the form of the system of equations: * = 'N': L * X = B (No transpose) * = 'N': L * X = B (No transpose) * = 'C': U**H * X = B (Conjugate transpose) * = 'C': L**H * X = B (Conjugate transpose) * * N (input) INTEGER * The order of the tridiagonal matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * D (input) REAL array, dimension (N) * The n diagonal elements of the diagonal matrix D from the * factorization computed by CPTTRF. * * E (input) COMPLEX array, dimension (N-1) * The (n-1) off-diagonal elements of the unit bidiagonal * factor U or L from the factorization computed by CPTTRF * (see UPLO). * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Local Scalars .. LOGICAL NOTRAN, UPPER INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * * Test the input arguments. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CPTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * IF( .NOT.NOTRAN ) THEN * DO 30 J = 1, NRHS * * Solve U**T (or H) * x = b. * DO 10 I = 2, N B( I, J ) = B( I, J ) - B( I-1, J )*CONJG( E( I-1 ) ) 10 CONTINUE 30 CONTINUE * ELSE * DO 35 J = 1, NRHS * * Solve U * x = b. * DO 20 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - B( I+1, J )*E( I ) 20 CONTINUE 35 CONTINUE ENDIF * ELSE * IF( NOTRAN ) THEN * DO 60 J = 1, NRHS * * Solve L * x = b. * DO 40 I = 2, N B( I, J ) = B( I, J ) - B( I-1, J )*E( I-1 ) 40 CONTINUE 60 CONTINUE * ELSE * DO 65 J = 1, NRHS * * Solve L**H * x = b. * DO 50 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - $ B( I+1, J )*CONJG( E( I ) ) 50 CONTINUE 65 CONTINUE ENDIF * END IF * RETURN * * End of CPTTRS * END scalapack-2.0.2/SRC/csteqr2.f000644 000766 000024 00000044530 10363532303 016076 0ustar00juliestaff000000 000000 SUBROUTINE CSTEQR2( COMPZ, N, D, E, Z, LDZ, NR, WORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * November 15, 1997 * * .. Scalar Arguments .. CHARACTER COMPZ INTEGER INFO, LDZ, N, NR * .. * .. Array Arguments .. REAL D( * ), E( * ), WORK( * ) COMPLEX Z( LDZ, * ) * .. * * Purpose * ======= * * CSTEQR2 is a modified version of LAPACK routine CSTEQR. * CSTEQR2 computes all eigenvalues and, optionally, eigenvectors of a * symmetric tridiagonal matrix using the implicit QL or QR method. * CSTEQR2 is modified from CSTEQR to allow each ScaLAPACK process * running CSTEQR2 to perform updates on a distributed matrix Q. * Proper usage of CSTEQR2 can be gleaned from * examination of ScaLAPACK's * PCHEEV. * CSTEQR2 incorporates changes attributed to Greg Henry. * * Arguments * ========= * * COMPZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only. * = 'I': Compute eigenvalues and eigenvectors of the * tridiagonal matrix. Z must be initialized to the * identity matrix by PCLASET or CLASET prior * to entering this subroutine. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) REAL array, dimension (N) * On entry, the diagonal elements of the tridiagonal matrix. * On exit, if INFO = 0, the eigenvalues in ascending order. * * E (input/output) REAL array, dimension (N-1) * On entry, the (n-1) subdiagonal elements of the tridiagonal * matrix. * On exit, E has been destroyed. * * Z (local input/local output) COMPLEX array, global * dimension (N, N), local dimension (LDZ, NR). * On entry, if COMPZ = 'V', then Z contains the orthogonal * matrix used in the reduction to tridiagonal form. * On exit, if INFO = 0, then if COMPZ = 'V', Z contains the * orthonormal eigenvectors of the original symmetric matrix, * and if COMPZ = 'I', Z contains the orthonormal eigenvectors * of the symmetric tridiagonal matrix. * If COMPZ = 'N', then Z is not referenced. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * eigenvectors are desired, then LDZ >= max(1,N). * * NR (input) INTEGER * NR = MAX(1, NUMROC( N, NB, MYPROW, 0, NPROCS ) ). * If COMPZ = 'N', then NR is not referenced. * * WORK (workspace) REAL array, dimension (max(1,2*N-2)) * If COMPZ = 'N', then WORK is not referenced. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: the algorithm has failed to find all the eigenvalues in * a total of 30*N iterations; if INFO = i, then i * elements of E have not converged to zero; on exit, D * and E contain the elements of a symmetric tridiagonal * matrix which is orthogonally similar to the original * matrix. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE, TWO, THREE, HALF PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, TWO = 2.0E0, $ THREE = 3.0E0, HALF = 0.5E0 ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E0, 1.0E0 ) ) INTEGER MAXIT, NMAXLOOK PARAMETER ( MAXIT = 30, NMAXLOOK = 15 ) * .. * .. Local Scalars .. INTEGER I, ICOMPZ, II, ILAST, ISCALE, J, JTOT, K, L, $ L1, LEND, LENDM1, LENDP1, LENDSV, LM1, LSV, M, $ MM, MM1, NLOOK, NM1, NMAXIT REAL ANORM, B, C, EPS, EPS2, F, G, GP, OLDEL, OLDGP, $ OLDRP, P, R, RP, RT1, RT2, S, SAFMAX, SAFMIN, $ SSFMAX, SSFMIN, TST, TST1 * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH, SLANST, SLAPY2 EXTERNAL LSAME, SLAMCH, SLANST, SLAPY2 * .. * .. External Subroutines .. EXTERNAL CLASR, CSWAP, SLAEV2, SLARTG, SLASCL, SSTERF, $ XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SIGN, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * ILAST = 0 INFO = 0 * IF( LSAME( COMPZ, 'N' ) ) THEN ICOMPZ = 0 ELSEIF( LSAME( COMPZ, 'I' ) ) THEN ICOMPZ = 1 ELSE ICOMPZ = -1 ENDIF IF( ICOMPZ.LT.0 ) THEN INFO = -1 ELSEIF( N.LT.0 ) THEN INFO = -2 ELSEIF( ICOMPZ.GT.0 .AND. LDZ.LT.MAX( 1, NR ) ) THEN INFO = -6 ENDIF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CSTEQR2', -INFO ) RETURN ENDIF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * If eigenvectors aren't not desired, this is faster * IF( ICOMPZ.EQ.0 ) THEN CALL SSTERF( N, D, E, INFO ) RETURN ENDIF * IF( N.EQ.1 ) THEN Z( 1, 1 ) = CONE RETURN ENDIF * * Determine the unit roundoff and over/underflow thresholds. * EPS = SLAMCH( 'E' ) EPS2 = EPS**2 SAFMIN = SLAMCH( 'S' ) SAFMAX = ONE / SAFMIN SSFMAX = SQRT( SAFMAX ) / THREE SSFMIN = SQRT( SAFMIN ) / EPS2 * * Compute the eigenvalues and eigenvectors of the tridiagonal * matrix. * NMAXIT = N*MAXIT JTOT = 0 * * Determine where the matrix splits and choose QL or QR iteration * for each block, according to whether top or bottom diagonal * element is smaller. * L1 = 1 NM1 = N - 1 * 10 CONTINUE IF( L1.GT.N ) $ GOTO 220 IF( L1.GT.1 ) $ E( L1-1 ) = ZERO IF( L1.LE.NM1 ) THEN DO 20 M = L1, NM1 TST = ABS( E( M ) ) IF( TST.EQ.ZERO ) $ GOTO 30 IF( TST.LE.( SQRT( ABS( D( M ) ) )*SQRT( ABS( D( M+ $ 1 ) ) ) )*EPS ) THEN E( M ) = ZERO GOTO 30 ENDIF 20 CONTINUE ENDIF M = N * 30 CONTINUE L = L1 LSV = L LEND = M LENDSV = LEND L1 = M + 1 IF( LEND.EQ.L ) $ GOTO 10 * * Scale submatrix in rows and columns L to LEND * ANORM = SLANST( 'I', LEND-L+1, D( L ), E( L ) ) ISCALE = 0 IF( ANORM.EQ.ZERO ) $ GOTO 10 IF( ANORM.GT.SSFMAX ) THEN ISCALE = 1 CALL SLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L+1, 1, D( L ), N, $ INFO ) CALL SLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L, 1, E( L ), N, $ INFO ) ELSEIF( ANORM.LT.SSFMIN ) THEN ISCALE = 2 CALL SLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L+1, 1, D( L ), N, $ INFO ) CALL SLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L, 1, E( L ), N, $ INFO ) ENDIF * * Choose between QL and QR iteration * IF( ABS( D( LEND ) ).LT.ABS( D( L ) ) ) THEN LEND = LSV L = LENDSV ENDIF * IF( LEND.GT.L ) THEN * * QL Iteration * * Look for small subdiagonal element. * 40 CONTINUE IF( L.NE.LEND ) THEN LENDM1 = LEND - 1 DO 50 M = L, LENDM1 TST = ABS( E( M ) )**2 IF( TST.LE.( EPS2*ABS( D( M ) ) )*ABS( D( M+1 ) )+ $ SAFMIN )GOTO 60 50 CONTINUE ENDIF * M = LEND * 60 CONTINUE IF( M.LT.LEND ) $ E( M ) = ZERO P = D( L ) IF( M.EQ.L ) $ GOTO 110 * * If remaining matrix is 2-by-2, use SLAE2 or SLAEV2 * to compute its eigensystem. * IF( M.EQ.L+1 ) THEN CALL SLAEV2( D( L ), E( L ), D( L+1 ), RT1, RT2, C, S ) WORK( L ) = C WORK( N-1+L ) = S CALL CLASR( 'R', 'V', 'B', NR, 2, WORK( L ), WORK( N-1+L ), $ Z( 1, L ), LDZ ) D( L ) = RT1 D( L+1 ) = RT2 E( L ) = ZERO L = L + 2 IF( L.LE.LEND ) $ GOTO 40 GOTO 200 ENDIF * IF( JTOT.EQ.NMAXIT ) $ GOTO 200 JTOT = JTOT + 1 * * Form shift. * G = ( D( L+1 )-P ) / ( TWO*E( L ) ) R = SLAPY2( G, ONE ) G = D( M ) - P + ( E( L ) / ( G+SIGN( R, G ) ) ) * IF( ICOMPZ.EQ.0 ) THEN * Do not do a lookahead! GOTO 90 ENDIF * OLDEL = ABS( E( L ) ) GP = G RP = R TST = ABS( E( L ) )**2 TST = TST / ( ( EPS2*ABS( D( L ) ) )*ABS( D( L+1 ) )+SAFMIN ) * NLOOK = 1 IF( ( TST.GT.ONE ) .AND. ( NLOOK.LE.NMAXLOOK ) ) THEN 70 CONTINUE * * This is the lookahead loop, going until we have * convergence or too many steps have been taken. * S = ONE C = ONE P = ZERO MM1 = M - 1 DO 80 I = MM1, L, -1 F = S*E( I ) B = C*E( I ) CALL SLARTG( GP, F, C, S, RP ) GP = D( I+1 ) - P RP = ( D( I )-GP )*S + TWO*C*B P = S*RP IF( I.NE.L ) $ GP = C*RP - B 80 CONTINUE OLDGP = GP OLDRP = RP * Find GP & RP for the next iteration IF( ABS( C*OLDRP-B ).GT.SAFMIN ) THEN GP = ( ( OLDGP+P )-( D( L )-P ) ) / ( TWO*( C*OLDRP-B ) ) ELSE * * Goto put in by G. Henry to fix ALPHA problem * GOTO 90 * GP = ( ( OLDGP+P )-( D( L )-P ) ) / * $ ( TWO*( C*OLDRP-B )+SAFMIN ) ENDIF RP = SLAPY2( GP, ONE ) GP = D( M ) - ( D( L )-P ) + $ ( ( C*OLDRP-B ) / ( GP+SIGN( RP, GP ) ) ) TST1 = TST TST = ABS( C*OLDRP-B )**2 TST = TST / ( ( EPS2*ABS( D( L )-P ) )*ABS( OLDGP+P )+ $ SAFMIN ) * Make sure that we are making progress IF( ABS( C*OLDRP-B ).GT.0.9E0*OLDEL ) THEN IF( ABS( C*OLDRP-B ).GT.OLDEL ) THEN GP = G RP = R ENDIF TST = HALF ELSE OLDEL = ABS( C*OLDRP-B ) ENDIF NLOOK = NLOOK + 1 IF( ( TST.GT.ONE ) .AND. ( NLOOK.LE.NMAXLOOK ) ) $ GOTO 70 ENDIF * IF( ( TST.LE.ONE ) .AND. ( TST.NE.HALF ) .AND. $ ( ABS( P ).LT.EPS*ABS( D( L ) ) ) .AND. $ ( ILAST.EQ.L ) .AND. ( ABS( E( L ) )**2.LE.10000.0E0* $ ( ( EPS2*ABS( D( L ) ) )*ABS( D( L+1 ) )+SAFMIN ) ) ) THEN * * Skip the current step: the subdiagonal info is just noise. * M = L E( M ) = ZERO P = D( L ) JTOT = JTOT - 1 GOTO 110 ENDIF G = GP R = RP * * Lookahead over * 90 CONTINUE * S = ONE C = ONE P = ZERO * * Inner loop * MM1 = M - 1 DO 100 I = MM1, L, -1 F = S*E( I ) B = C*E( I ) CALL SLARTG( G, F, C, S, R ) IF( I.NE.M-1 ) $ E( I+1 ) = R G = D( I+1 ) - P R = ( D( I )-G )*S + TWO*C*B P = S*R D( I+1 ) = G + P G = C*R - B * * If eigenvectors are desired, then save rotations. * WORK( I ) = C WORK( N-1+I ) = -S * 100 CONTINUE * * If eigenvectors are desired, then apply saved rotations. * MM = M - L + 1 CALL CLASR( 'R', 'V', 'B', NR, MM, WORK( L ), WORK( N-1+L ), $ Z( 1, L ), LDZ ) * D( L ) = D( L ) - P E( L ) = G ILAST = L GOTO 40 * * Eigenvalue found. * 110 CONTINUE D( L ) = P * L = L + 1 IF( L.LE.LEND ) $ GOTO 40 GOTO 200 * ELSE * * QR Iteration * * Look for small superdiagonal element. * 120 CONTINUE IF( L.NE.LEND ) THEN LENDP1 = LEND + 1 DO 130 M = L, LENDP1, -1 TST = ABS( E( M-1 ) )**2 IF( TST.LE.( EPS2*ABS( D( M ) ) )*ABS( D( M-1 ) )+ $ SAFMIN )GOTO 140 130 CONTINUE ENDIF * M = LEND * 140 CONTINUE IF( M.GT.LEND ) $ E( M-1 ) = ZERO P = D( L ) IF( M.EQ.L ) $ GOTO 190 * * If remaining matrix is 2-by-2, use SLAE2 or SLAEV2 * to compute its eigensystem. * IF( M.EQ.L-1 ) THEN CALL SLAEV2( D( L-1 ), E( L-1 ), D( L ), RT1, RT2, C, S ) WORK( M ) = C WORK( N-1+M ) = S CALL CLASR( 'R', 'V', 'F', NR, 2, WORK( M ), WORK( N-1+M ), $ Z( 1, L-1 ), LDZ ) D( L-1 ) = RT1 D( L ) = RT2 E( L-1 ) = ZERO L = L - 2 IF( L.GE.LEND ) $ GOTO 120 GOTO 200 ENDIF * IF( JTOT.EQ.NMAXIT ) $ GOTO 200 JTOT = JTOT + 1 * * Form shift. * G = ( D( L-1 )-P ) / ( TWO*E( L-1 ) ) R = SLAPY2( G, ONE ) G = D( M ) - P + ( E( L-1 ) / ( G+SIGN( R, G ) ) ) * IF( ICOMPZ.EQ.0 ) THEN * Do not do a lookahead! GOTO 170 ENDIF * OLDEL = ABS( E( L-1 ) ) GP = G RP = R TST = ABS( E( L-1 ) )**2 TST = TST / ( ( EPS2*ABS( D( L ) ) )*ABS( D( L-1 ) )+SAFMIN ) NLOOK = 1 IF( ( TST.GT.ONE ) .AND. ( NLOOK.LE.NMAXLOOK ) ) THEN 150 CONTINUE * * This is the lookahead loop, going until we have * convergence or too many steps have been taken. * S = ONE C = ONE P = ZERO * * Inner loop * LM1 = L - 1 DO 160 I = M, LM1 F = S*E( I ) B = C*E( I ) CALL SLARTG( GP, F, C, S, RP ) GP = D( I ) - P RP = ( D( I+1 )-GP )*S + TWO*C*B P = S*RP IF( I.LT.LM1 ) $ GP = C*RP - B 160 CONTINUE OLDGP = GP OLDRP = RP * Find GP & RP for the next iteration IF( ABS( C*OLDRP-B ).GT.SAFMIN ) THEN GP = ( ( OLDGP+P )-( D( L )-P ) ) / ( TWO*( C*OLDRP-B ) ) ELSE * * Goto put in by G. Henry to fix ALPHA problem * GOTO 170 * GP = ( ( OLDGP+P )-( D( L )-P ) ) / * $ ( TWO*( C*OLDRP-B )+SAFMIN ) ENDIF RP = SLAPY2( GP, ONE ) GP = D( M ) - ( D( L )-P ) + $ ( ( C*OLDRP-B ) / ( GP+SIGN( RP, GP ) ) ) TST1 = TST TST = ABS( ( C*OLDRP-B ) )**2 TST = TST / ( ( EPS2*ABS( D( L )-P ) )*ABS( OLDGP+P )+ $ SAFMIN ) * Make sure that we are making progress IF( ABS( C*OLDRP-B ).GT.0.9E0*OLDEL ) THEN IF( ABS( C*OLDRP-B ).GT.OLDEL ) THEN GP = G RP = R ENDIF TST = HALF ELSE OLDEL = ABS( C*OLDRP-B ) ENDIF NLOOK = NLOOK + 1 IF( ( TST.GT.ONE ) .AND. ( NLOOK.LE.NMAXLOOK ) ) $ GOTO 150 ENDIF IF( ( TST.LE.ONE ) .AND. ( TST.NE.HALF ) .AND. $ ( ABS( P ).LT.EPS*ABS( D( L ) ) ) .AND. $ ( ILAST.EQ.L ) .AND. ( ABS( E( L-1 ) )**2.LE.10000.0E0* $ ( ( EPS2*ABS( D( L-1 ) ) )*ABS( D( L ) )+SAFMIN ) ) ) THEN * * Skip the current step: the subdiagonal info is just noise. * M = L E( M-1 ) = ZERO P = D( L ) JTOT = JTOT - 1 GOTO 190 ENDIF * G = GP R = RP * * Lookahead over * 170 CONTINUE * S = ONE C = ONE P = ZERO DO 180 I = M, LM1 F = S*E( I ) B = C*E( I ) CALL SLARTG( G, F, C, S, R ) IF( I.NE.M ) $ E( I-1 ) = R G = D( I ) - P R = ( D( I+1 )-G )*S + TWO*C*B P = S*R D( I ) = G + P G = C*R - B * * If eigenvectors are desired, then save rotations. * WORK( I ) = C WORK( N-1+I ) = S * 180 CONTINUE * * If eigenvectors are desired, then apply saved rotations. * MM = L - M + 1 CALL CLASR( 'R', 'V', 'F', NR, MM, WORK( M ), WORK( N-1+M ), $ Z( 1, M ), LDZ ) * D( L ) = D( L ) - P E( LM1 ) = G ILAST = L GOTO 120 * * Eigenvalue found. * 190 CONTINUE D( L ) = P * L = L - 1 IF( L.GE.LEND ) $ GOTO 120 GOTO 200 * ENDIF * * Undo scaling if necessary * 200 CONTINUE IF( ISCALE.EQ.1 ) THEN CALL SLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV+1, 1, $ D( LSV ), N, INFO ) CALL SLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV, 1, E( LSV ), $ N, INFO ) ELSEIF( ISCALE.EQ.2 ) THEN CALL SLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV+1, 1, $ D( LSV ), N, INFO ) CALL SLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV, 1, E( LSV ), $ N, INFO ) ENDIF * * Check for no convergence to an eigenvalue after a total * of N*MAXIT iterations. * IF( JTOT.LT.NMAXIT ) $ GOTO 10 DO 210 I = 1, N - 1 IF( E( I ).NE.ZERO ) $ INFO = INFO + 1 210 CONTINUE GOTO 250 * * Order eigenvalues and eigenvectors. * 220 CONTINUE * * Use Selection Sort to minimize swaps of eigenvectors * DO 240 II = 2, N I = II - 1 K = I P = D( I ) DO 230 J = II, N IF( D( J ).LT.P ) THEN K = J P = D( J ) ENDIF 230 CONTINUE IF( K.NE.I ) THEN D( K ) = D( I ) D( I ) = P CALL CSWAP( NR, Z( 1, I ), 1, Z( 1, K ), 1 ) ENDIF 240 CONTINUE * 250 CONTINUE * WRITE( *, FMT = * )'JTOT', JTOT RETURN * * End of SSTEQR2 * END scalapack-2.0.2/SRC/ctrmvt.f000644 000766 000024 00000011154 10363532303 016026 0ustar00juliestaff000000 000000 SUBROUTINE CTRMVT( UPLO, N, T, LDT, X, INCX, Y, INCY, W, INCW, Z, $ INCZ ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCW, INCX, INCY, INCZ, LDT, N * .. * .. Array Arguments .. COMPLEX T( LDT, * ), W( * ), X( * ), Y( * ), Z( * ) * .. * * Purpose * ======= * * CTRMVT performs the matrix-vector operations * * x := conjg( T' ) *y, and w := T *z, * * where x is an n element vector and T is an n by n * upper or lower triangular matrix. * * Arguments * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * T - COMPLEX array of DIMENSION ( LDT, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array T must contain the upper * triangular matrix and the strictly lower triangular part of * T is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array T must contain the lower * triangular matrix and the strictly upper triangular part of * T is not referenced. * * LDT - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * On exit, X = T' * y * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * W - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCW ) ). * On exit, W = T * z * * INCW - INTEGER. * On entry, INCW specifies the increment for the elements of * W. INCW must not be zero. * Unchanged on exit. * * Z - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCZ ) ). * Before entry, the incremented array Z must contain the n * element vector z. Unchanged on exit. * * INCY - INTEGER. * On entrz, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * * .. Local Scalars .. INTEGER INFO * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL CCOPY, CTRMV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN INFO = 4 ELSE IF( INCW.EQ.0 ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 10 ELSE IF( INCZ.EQ.0 ) THEN INFO = 12 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CTRMVT', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * * IF( INCX.NE.1 .OR. INCY.NE.1 .OR. INCW.NE.1 .OR. INCZ.NE.1 .OR. $ .TRUE. ) THEN CALL CCOPY( N, Y, INCY, X, INCX ) CALL CTRMV( UPLO, 'C', 'N', N, T, LDT, X, INCX ) CALL CCOPY( N, Z, INCZ, W, INCW ) CALL CTRMV( UPLO, 'N', 'N', N, T, LDT, W, INCW ) RETURN END IF * RETURN * * End of CTRMVT. * END scalapack-2.0.2/SRC/ddbtf2.f000644 000766 000024 00000012353 11657111056 015663 0ustar00juliestaff000000 000000 SUBROUTINE DDBTF2( M, N, KL, KU, AB, LDAB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Modified by Andrew J. Cleary in November, 96 from: * -- LAPACK auxiliary routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * August 6, 1991 * * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDAB, M, N * .. * .. Array Arguments .. DOUBLE PRECISION AB( LDAB, * ) * .. * * Purpose * ======= * * Ddbtrf computes an LU factorization of a real m-by-n band matrix A * without using partial pivoting with row interchanges. * * This is the unblocked version of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of subdiagonals within the band of A. KL >= 0. * * KU (input) INTEGER * The number of superdiagonals within the band of A. KU >= 0. * * AB (input/output) DOUBLE PRECISION array, dimension (LDAB,N) * On entry, the matrix A in band storage, in rows KL+1 to * 2*KL+KU+1; rows 1 to KL of the array need not be set. * The j-th column of A is stored in the j-th column of the * array AB as follows: * AB(kl+ku+1+i-j,j) = A(i,j) for max(1,j-ku)<=i<=min(m,j+kl) * * On exit, details of the factorization: U is stored as an * upper triangular band matrix with KL+KU superdiagonals in * rows 1 to KL+KU+1, and the multipliers used during the * factorization are stored in rows KL+KU+2 to 2*KL+KU+1. * See below for further details. * * LDAB (input) INTEGER * The leading dimension of the array AB. LDAB >= 2*KL+KU+1. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = +i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * Further Details * =============== * * The band storage scheme is illustrated by the following example, when * M = N = 6, KL = 2, KU = 1: * * On entry: On exit: * * * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 * a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 * a21 a32 a43 a54 a65 * m21 m32 m43 m54 m65 * * a31 a42 a53 a64 * * m31 m42 m53 m64 * * * * Array elements marked * are not used by the routine; elements marked * + need not be set on entry, but are required by the routine to store * elements of U, because of fill-in resulting from the row * interchanges. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER J, JP, JU, KM, KV * .. * .. External Functions .. INTEGER ISAMAX EXTERNAL ISAMAX * .. * .. External Subroutines .. EXTERNAL DGER, DSCAL, DSWAP * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * KV is the number of superdiagonals in the factor U, allowing for * fill-in. * KV = KU * * Test the input parameters. * INFO = 0 *ECA IF( M.LT.0 ) THEN *ECA INFO = -1 *ECA ELSE IF( N.LT.0 ) THEN *ECA INFO = -2 *ECA ELSE IF( KL.LT.0 ) THEN *ECA INFO = -3 *ECA ELSE IF( KU.LT.0 ) THEN *ECA INFO = -4 *ECA ELSE IF( LDAB.LT.KL+KV+1 ) THEN *ECA INFO = -6 *ECA END IF *ECA IF( INFO.NE.0 ) THEN *ECA CALL XERBLA( 'DDBTF2', -INFO ) *ECA RETURN *ECA END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Gaussian elimination without partial pivoting * * JU is the index of the last column affected by the current stage * of the factorization. * JU = 1 * DO 40 J = 1, MIN( M, N ) * * Test for singularity. KM is the number of * subdiagonal elements in the current column. * KM = MIN( KL, M-J ) JP = 1 IF( AB( KV+1, J ).NE.ZERO ) THEN JU = MAX( JU, MIN( J+KU, N ) ) * IF( KM.GT.0 ) THEN * * Compute multipliers. * CALL DSCAL( KM, ONE / AB( KU+1, J ), AB( KU+2, J ), 1 ) * * Update trailing submatrix within the band. * IF( JU.GT.J ) THEN CALL DGER( KM, JU-J, -ONE, AB( KU+2, J ), 1, $ AB( KU, J+1 ), LDAB-1, AB( KU+1, J+1 ), $ LDAB-1 ) END IF END IF ELSE * IF( INFO.EQ.0 ) $ INFO = J END IF 40 CONTINUE RETURN * * End of DDBTF2 * END scalapack-2.0.2/SRC/ddbtrf.f000644 000766 000024 00000025110 11657111056 015756 0ustar00juliestaff000000 000000 SUBROUTINE DDBTRF( M, N, KL, KU, AB, LDAB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * August, 1996. * Modified from DGBTRF: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * August 6, 1991 * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDAB, M, N * .. * .. Array Arguments .. DOUBLE PRECISION AB( LDAB, * ) * .. * * Purpose * ======= * * Ddbtrf computes an LU factorization of a real m-by-n band matrix A * without using partial pivoting or row interchanges. * * This is the blocked version of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of subdiagonals within the band of A. KL >= 0. * * KU (input) INTEGER * The number of superdiagonals within the band of A. KU >= 0. * * AB (input/output) REAL array, dimension (LDAB,N) * On entry, the matrix A in band storage, in rows KL+1 to * 2*KL+KU+1; rows 1 to KL of the array need not be set. * The j-th column of A is stored in the j-th column of the * array AB as follows: * AB(kl+ku+1+i-j,j) = A(i,j) for max(1,j-ku)<=i<=min(m,j+kl) * * On exit, details of the factorization: U is stored as an * upper triangular band matrix with KL+KU superdiagonals in * rows 1 to KL+KU+1, and the multipliers used during the * factorization are stored in rows KL+KU+2 to 2*KL+KU+1. * See below for further details. * * LDAB (input) INTEGER * The leading dimension of the array AB. LDAB >= 2*KL+KU+1. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = +i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * Further Details * =============== * * The band storage scheme is illustrated by the following example, when * M = N = 6, KL = 2, KU = 1: * * On entry: On exit: * * * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 * a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 * a21 a32 a43 a54 a65 * m21 m32 m43 m54 m65 * * a31 a42 a53 a64 * * m31 m42 m53 m64 * * * * Array elements marked * are not used by the routine. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) INTEGER NBMAX, LDWORK PARAMETER ( NBMAX = 64, LDWORK = NBMAX+1 ) * .. * .. Local Scalars .. INTEGER I, I2, I3, II, J, J2, J3, JB, JJ, JM, JP, $ JU, KM, KV, NB, NW * .. * .. Local Arrays .. DOUBLE PRECISION WORK13( LDWORK, NBMAX ), $ WORK31( LDWORK, NBMAX ) * .. * .. External Functions .. INTEGER ILAENV, ISAMAX EXTERNAL ILAENV, ISAMAX * .. * .. External Subroutines .. EXTERNAL DCOPY, DDBTF2, DGEMM, DGER, DSCAL, $ DSWAP, DTRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * KV is the number of superdiagonals in the factor U * KV = KU * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( KL.LT.0 ) THEN INFO = -3 ELSE IF( KU.LT.0 ) THEN INFO = -4 ELSE IF( LDAB.LT.MIN( MIN( KL+KV+1,M ),N ) ) THEN INFO = -6 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DDBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Determine the block size for this environment * NB = ILAENV( 1, 'DDBTRF', ' ', M, N, KL, KU ) * * The block size must not exceed the limit set by the size of the * local arrays WORK13 and WORK31. * NB = MIN( NB, NBMAX ) * IF( NB.LE.1 .OR. NB.GT.KL ) THEN * * Use unblocked code * CALL DDBTF2( M, N, KL, KU, AB, LDAB, INFO ) ELSE * * Use blocked code * * Zero the superdiagonal elements of the work array WORK13 * DO 20 J = 1, NB DO 10 I = 1, J - 1 WORK13( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * * Zero the subdiagonal elements of the work array WORK31 * DO 40 J = 1, NB DO 30 I = J + 1, NB WORK31( I, J ) = ZERO 30 CONTINUE 40 CONTINUE * * JU is the index of the last column affected by the current * stage of the factorization * JU = 1 * DO 180 J = 1, MIN( M, N ), NB JB = MIN( NB, MIN( M, N )-J+1 ) * * The active part of the matrix is partitioned * * A11 A12 A13 * A21 A22 A23 * A31 A32 A33 * * Here A11, A21 and A31 denote the current block of JB columns * which is about to be factorized. The number of rows in the * partitioning are JB, I2, I3 respectively, and the numbers * of columns are JB, J2, J3. The superdiagonal elements of A13 * and the subdiagonal elements of A31 lie outside the band. * I2 = MIN( KL-JB, M-J-JB+1 ) I3 = MIN( JB, M-J-KL+1 ) * * J2 and J3 are computed after JU has been updated. * * Factorize the current block of JB columns * DO 80 JJ = J, J + JB - 1 * * Find pivot and test for singularity. KM is the number of * subdiagonal elements in the current column. * KM = MIN( KL, M-JJ ) JP = 1 IF( AB( KV+JP, JJ ).NE.ZERO ) THEN JU = MAX( JU, MIN( JJ+KU+JP-1, N ) ) * * Compute multipliers * CALL DSCAL( KM, ONE / AB( KV+1, JJ ), AB( KV+2, JJ ), $ 1 ) * * Update trailing submatrix within the band and within * the current block. JM is the index of the last column * which needs to be updated. * JM = MIN( JU, J+JB-1 ) IF( JM.GT.JJ ) THEN CALL DGER( KM, JM-JJ, -ONE, AB( KV+2, JJ ), 1, $ AB( KV, JJ+1 ), LDAB-1, $ AB( KV+1, JJ+1 ), LDAB-1 ) END IF END IF * * Copy current column of A31 into the work array WORK31 * NW = MIN( JJ-J+1, I3 ) IF( NW.GT.0 ) $ CALL DCOPY( NW, AB( KV+KL+1-JJ+J, JJ ), 1, $ WORK31( 1, JJ-J+1 ), 1 ) 80 CONTINUE IF( J+JB.LE.N ) THEN * * Apply the row interchanges to the other blocks. * J2 = MIN( JU-J+1, KV ) - JB J3 = MAX( 0, JU-J-KV+1 ) * * Update the relevant part of the trailing submatrix * IF( J2.GT.0 ) THEN * * Update A12 * CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', $ JB, J2, ONE, AB( KV+1, J ), LDAB-1, $ AB( KV+1-JB, J+JB ), LDAB-1 ) * IF( I2.GT.0 ) THEN * * Update A22 * CALL DGEMM( 'No transpose', 'No transpose', I2, J2, $ JB, -ONE, AB( KV+1+JB, J ), LDAB-1, $ AB( KV+1-JB, J+JB ), LDAB-1, ONE, $ AB( KV+1, J+JB ), LDAB-1 ) END IF * IF( I3.GT.0 ) THEN * * Update A32 * CALL DGEMM( 'No transpose', 'No transpose', I3, J2, $ JB, -ONE, WORK31, LDWORK, $ AB( KV+1-JB, J+JB ), LDAB-1, ONE, $ AB( KV+KL+1-JB, J+JB ), LDAB-1 ) END IF END IF * IF( J3.GT.0 ) THEN * * Copy the lower triangle of A13 into the work array * WORK13 * DO 130 JJ = 1, J3 DO 120 II = JJ, JB WORK13( II, JJ ) = AB( II-JJ+1, JJ+J+KV-1 ) 120 CONTINUE 130 CONTINUE * * Update A13 in the work array * CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', $ JB, J3, ONE, AB( KV+1, J ), LDAB-1, $ WORK13, LDWORK ) * IF( I2.GT.0 ) THEN * * Update A23 * CALL DGEMM( 'No transpose', 'No transpose', I2, J3, $ JB, -ONE, AB( KV+1+JB, J ), LDAB-1, $ WORK13, LDWORK, ONE, AB( 1+JB, J+KV ), $ LDAB-1 ) END IF * IF( I3.GT.0 ) THEN * * Update A33 * CALL DGEMM( 'No transpose', 'No transpose', I3, J3, $ JB, -ONE, WORK31, LDWORK, WORK13, $ LDWORK, ONE, AB( 1+KL, J+KV ), LDAB-1 ) END IF * * Copy the lower triangle of A13 back into place * DO 150 JJ = 1, J3 DO 140 II = JJ, JB AB( II-JJ+1, JJ+J+KV-1 ) = WORK13( II, JJ ) 140 CONTINUE 150 CONTINUE END IF ELSE END IF * * copy the upper triangle of A31 back into place * DO 170 JJ = J + JB - 1, J, -1 * * Copy the current column of A31 back into place * NW = MIN( I3, JJ-J+1 ) IF( NW.GT.0 ) $ CALL DCOPY( NW, WORK31( 1, JJ-J+1 ), 1, $ AB( KV+KL+1-JJ+J, JJ ), 1 ) 170 CONTINUE 180 CONTINUE END IF * RETURN * * End of DDBTRF * END scalapack-2.0.2/SRC/ddttrf.f000644 000766 000024 00000006473 11657111056 016013 0ustar00juliestaff000000 000000 SUBROUTINE DDTTRF( N, DL, D, DU, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, November 1996. * Modified from DGTTRF: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. INTEGER INFO, N * .. * .. Array Arguments .. DOUBLE PRECISION D( * ), DL( * ), DU( * ) * .. * * Purpose * ======= * * DDTTRF computes an LU factorization of a complex tridiagonal matrix A * using elimination without partial pivoting. * * The factorization has the form * A = L * U * where L is a product of unit lower bidiagonal * matrices and U is upper triangular with nonzeros in only the main * diagonal and first superdiagonal. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * DL (input/output) COMPLEX array, dimension (N-1) * On entry, DL must contain the (n-1) subdiagonal elements of * A. * On exit, DL is overwritten by the (n-1) multipliers that * define the matrix L from the LU factorization of A. * * D (input/output) COMPLEX array, dimension (N) * On entry, D must contain the diagonal elements of A. * On exit, D is overwritten by the n diagonal elements of the * upper triangular matrix U from the LU factorization of A. * * DU (input/output) COMPLEX array, dimension (N-1) * On entry, DU must contain the (n-1) superdiagonal elements * of A. * On exit, DU is overwritten by the (n-1) elements of the first * superdiagonal of U. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Local Scalars .. INTEGER I DOUBLE PRECISION FACT * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Executable Statements .. * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 CALL XERBLA( 'DDTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * DO 20 I = 1, N - 1 IF( DL( I ).EQ.ZERO ) THEN * * Subdiagonal is zero, no elimination is required. * IF( D( I ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I ELSE * FACT = DL( I ) / D( I ) DL( I ) = FACT D( I+1 ) = D( I+1 ) - FACT*DU( I ) END IF 20 CONTINUE IF( D( N ).EQ.ZERO .AND. INFO.EQ.0 ) THEN INFO = N RETURN END IF * RETURN * * End of DDTTRF * END scalapack-2.0.2/SRC/ddttrsv.f000644 000766 000024 00000011565 11657111056 016214 0ustar00juliestaff000000 000000 SUBROUTINE DDTTRSV( UPLO, TRANS, N, NRHS, DL, D, DU, $ B, LDB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * August, 1996. * Modified from DGTTRS: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. CHARACTER UPLO, TRANS INTEGER INFO, LDB, N, NRHS * .. * .. Array Arguments .. DOUBLE PRECISION B( LDB, * ), D( * ), DL( * ), DU( * ) * .. * * Purpose * ======= * * DDTTRSV solves one of the systems of equations * L * X = B, L**T * X = B, or L**H * X = B, * U * X = B, U**T * X = B, or U**H * X = B, * with factors of the tridiagonal matrix A from the LU factorization * computed by DDTTRF. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether to solve with L or U. * * TRANS (input) CHARACTER * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * DL (input) COMPLEX array, dimension (N-1) * The (n-1) multipliers that define the matrix L from the * LU factorization of A. * * D (input) COMPLEX array, dimension (N) * The n diagonal elements of the upper triangular matrix U from * the LU factorization of A. * * DU (input) COMPLEX array, dimension (N-1) * The (n-1) elements of the first superdiagonal of U. * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, B is overwritten by the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Local Scalars .. LOGICAL LOWER, NOTRAN INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) LOWER = LSAME( UPLO, 'L' ) IF( .NOT.LOWER .AND. .NOT.LSAME( UPLO, 'U' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( N, 1 ) ) THEN INFO = -9 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DDTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( NOTRAN ) THEN * IF( LOWER ) THEN * Solve L*X = B, overwriting B with X. * DO 35 J = 1, NRHS * * Solve L*x = b. * DO 10 I = 1, N - 1 B( I+1, J ) = B( I+1, J ) - DL( I )*B( I, J ) 10 CONTINUE 35 CONTINUE * ELSE * Solve U*x = b. * DO 30 J = 1, NRHS B( N, J ) = B( N, J ) / D( N ) IF( N.GT.1 ) $ B( N-1, J ) = ( B( N-1, J )-DU( N-1 )*B( N, J ) ) / $ D( N-1 ) DO 20 I = N - 2, 1, -1 B( I, J ) = ( B( I, J )-DU( I )*B( I+1, J ) ) / D( I ) 20 CONTINUE 30 CONTINUE * ENDIF * ELSE * IF( .NOT. LOWER ) THEN * Solve U**T * X = B, overwriting B with X. * DO 65 J = 1, NRHS * * Solve U**T * x = b. * B( 1, J ) = B( 1, J ) / D( 1 ) IF( N.GT.1 ) $ B( 2, J ) = ( B( 2, J )-DU( 1 )*B( 1, J ) ) / D( 2 ) DO 40 I = 3, N B( I, J ) = ( B( I, J )-DU( I-1 )*B( I-1, J ) ) / D( I ) 40 CONTINUE 65 CONTINUE * ELSE * * Solve L**T * X = B, overwriting B with X. DO 60 J = 1, NRHS * * Solve L**T * x = b. * DO 50 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - DL( I )*B( I+1, J ) 50 CONTINUE 60 CONTINUE ENDIF END IF * * End of DDTTRSV * END scalapack-2.0.2/SRC/dlamov.c000644 000766 000024 00000000253 11745567264 016007 0ustar00juliestaff000000 000000 // // dlamov.c // // Written by Lee Killough 04/19/2012 // #define TYPE double #define FUNC "DLAMOV" #define LAMOV dlamov_ #define LACPY dlacpy_ #include "lamov.h" scalapack-2.0.2/SRC/dlamsh.f000644 000766 000024 00000021237 10363532303 015762 0ustar00juliestaff000000 000000 SUBROUTINE DLAMSH ( S, LDS, NBULGE, JBLK, H, LDH, N, ULP ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER LDS, NBULGE, JBLK, LDH, N DOUBLE PRECISION ULP * .. * .. Array Arguments .. DOUBLE PRECISION S(LDS,*), H(LDH,*) * .. * * Purpose * ======= * * DLAMSH sends multiple shifts through a small (single node) matrix to * see how consecutive small subdiagonal elements are modified by * subsequent shifts in an effort to maximize the number of bulges * that can be sent through. * DLAMSH should only be called when there are multiple shifts/bulges * (NBULGE > 1) and the first shift is starting in the middle of an * unreduced Hessenberg matrix because of two or more consecutive small * subdiagonal elements. * * Arguments * ========= * * S (local input/output) DOUBLE PRECISION array, (LDS,*) * On entry, the matrix of shifts. Only the 2x2 diagonal of S is * referenced. It is assumed that S has JBLK double shifts * (size 2). * On exit, the data is rearranged in the best order for * applying. * * LDS (local input) INTEGER * On entry, the leading dimension of S. Unchanged on exit. * 1 < NBULGE <= JBLK <= LDS/2 * * NBULGE (local input/output) INTEGER * On entry, the number of bulges to send through H ( >1 ). * NBULGE should be less than the maximum determined (JBLK). * 1 < NBULGE <= JBLK <= LDS/2 * On exit, the maximum number of bulges that can be sent * through. * * JBLK (local input) INTEGER * On entry, the number of shifts determined for S. * Unchanged on exit. * * H (local input/output) DOUBLE PRECISION array (LDH,N) * On entry, the local matrix to apply the shifts on. * H should be aligned so that the starting row is 2. * On exit, the data is destroyed. * * LDS (local input) INTEGER * On entry, the leading dimension of S. Unchanged on exit. * * N (local input) INTEGER * On entry, the size of H. If all the bulges are expected to * go through, N should be at least 4*NBULGE+2. * Otherwise, NBULGE may be reduced by this routine. * * ULP (local input) DOUBLE PRECISION * On entry, machine precision * Unchanged on exit. * * Implemented by: G. Henry, May 1, 1997 * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, TEN PARAMETER ( ZERO = 0.0D+0, TEN = 10.0D+0 ) * .. * .. Local Scalars .. INTEGER K, IBULGE, M, NR, J, IVAL, I DOUBLE PRECISION H44, H33, H43H34, H11, H22, H21, H12, H44S, $ H33S, V1, V2, V3, H00, H10, TST1, T1, T2, T3, $ SUM, S1, DVAL * .. * .. Local Arrays .. DOUBLE PRECISION V(3) * .. * .. External Subroutines .. EXTERNAL DLARFG, DCOPY * .. * .. Intrinsic Functions .. INTRINSIC MAX, ABS * .. * .. Executable Statements .. * M = 2 DO 10 IBULGE = 1, NBULGE H44 = S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2) H33 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) H43H34 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2)* $ S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = ABS( V1 )*( ABS( H00 )+ABS( H11 )+ABS( H22 ) ) IF( ABS( H10 )*( ABS( V2 )+ABS( V3 ) ).GT.ULP*TST1 ) THEN * Find minimum DVAL = (ABS(H10)*(ABS(V2)+ABS(V3))) / (ULP*TST1) IVAL = IBULGE DO 15 I = IBULGE+1, NBULGE H44 = S(2*JBLK-2*I+2, 2*JBLK-2*I+2) H33 = S(2*JBLK-2*I+1,2*JBLK-2*I+1) H43H34 = S(2*JBLK-2*I+1,2*JBLK-2*I+2)* $ S(2*JBLK-2*I+2, 2*JBLK-2*I+1) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = ABS( V1 )*( ABS( H00 )+ABS( H11 )+ABS( H22 ) ) IF ( (DVAL.GT.(ABS(H10)*(ABS(V2)+ABS(V3)))/(ULP*TST1)) $ .AND. ( DVAL .GT. 1.D0 ) ) THEN DVAL = (ABS(H10)*(ABS(V2)+ABS(V3))) / (ULP*TST1) IVAL = I END IF 15 CONTINUE IF ( (DVAL .LT. TEN) .AND. (IVAL .NE. IBULGE) ) THEN H44 = S(2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+2) H33 = S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+1) H43H34 = S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+2) H10 = S(2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1) S(2*JBLK-2*IVAL+2,2*JBLK-2*IVAL+2) = $ S(2*JBLK-2*IBULGE+2,2*JBLK-2*IBULGE+2) S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+1) = $ S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+2) = $ S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2) S(2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1) = $ S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2) = H44 S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) = H33 S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2) = H43H34 S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) = H10 END IF H44 = S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2) H33 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) H43H34 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2)* $ S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = ABS( V1 )*( ABS( H00 )+ABS( H11 )+ABS( H22 ) ) END IF IF( ABS( H10 )*( ABS( V2 )+ABS( V3 ) ).GT.TEN*ULP*TST1 ) THEN * IBULGE better not be 1 here or we have a bug! NBULGE = MAX(IBULGE -1,1) RETURN END IF DO 120 K = M, N - 1 NR = MIN( 3, N-K+1 ) IF( K.GT.M ) $ CALL DCOPY( NR, H( K, K-1 ), 1, V, 1 ) CALL DLARFG( NR, V( 1 ), V( 2 ), 1, T1 ) IF( K.GT.M ) THEN H( K, K-1 ) = V( 1 ) H( K+1, K-1 ) = ZERO IF( K.LT.N-1 ) $ H( K+2, K-1 ) = ZERO ELSE H( K, K-1 ) = -H( K, K-1 ) END IF V2 = V( 2 ) T2 = T1*V2 IF( NR.EQ.3 ) THEN V3 = V( 3 ) T3 = T1*V3 DO 60 J = K, N SUM = H( K, J ) + V2*H( K+1, J ) + V3*H( K+2, J ) H( K, J ) = H( K, J ) - SUM*T1 H( K+1, J ) = H( K+1, J ) - SUM*T2 H( K+2, J ) = H( K+2, J ) - SUM*T3 60 CONTINUE DO 70 J = 1, MIN( K+3, N ) SUM = H( J, K ) + V2*H( J, K+1 ) + V3*H( J, K+2 ) H( J, K ) = H( J, K ) - SUM*T1 H( J, K+1 ) = H( J, K+1 ) - SUM*T2 H( J, K+2 ) = H( J, K+2 ) - SUM*T3 70 CONTINUE END IF 120 CONTINUE 10 CONTINUE * RETURN END scalapack-2.0.2/SRC/dlapst.f000644 000766 000024 00000015020 10363532303 015772 0ustar00juliestaff000000 000000 SUBROUTINE DLAPST( ID, N, D, INDX, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER ID INTEGER INFO, N * .. * .. Array Arguments .. INTEGER INDX( * ) DOUBLE PRECISION D( * ) * .. * * Purpose * ======= * DLAPST is a modified version of the LAPACK routine DLASRT. * * Define a permutation INDX that sorts the numbers in D * in increasing order (if ID = 'I') or * in decreasing order (if ID = 'D' ). * * Use Quick Sort, reverting to Insertion sort on arrays of * size <= 20. Dimension of STACK limits N to about 2**32. * * Arguments * ========= * * ID (input) CHARACTER*1 * = 'I': sort D in increasing order; * = 'D': sort D in decreasing order. * * N (input) INTEGER * The length of the array D. * * D (input) DOUBLE PRECISION array, dimension (N) * The array to be sorted. * * INDX (ouput) INTEGER array, dimension (N). * The permutation which sorts the array D. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. INTEGER SELECT PARAMETER ( SELECT = 20 ) * .. * .. Local Scalars .. INTEGER DIR, ENDD, I, ITMP, J, START, STKPNT DOUBLE PRECISION D1, D2, D3, DMNMX * .. * .. Local Arrays .. INTEGER STACK( 2, 32 ) * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input paramters. * INFO = 0 DIR = -1 IF( LSAME( ID, 'D' ) ) THEN DIR = 0 ELSE IF( LSAME( ID, 'I' ) ) THEN DIR = 1 END IF IF( DIR.EQ.-1 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAPST', -INFO ) RETURN END IF * * Quick return if possible * IF( N.LE.1 ) $ RETURN * DO 10 I = 1, N INDX( I ) = I 10 CONTINUE * STKPNT = 1 STACK( 1, 1 ) = 1 STACK( 2, 1 ) = N 20 CONTINUE START = STACK( 1, STKPNT ) ENDD = STACK( 2, STKPNT ) STKPNT = STKPNT - 1 IF( ENDD-START.LE.SELECT .AND. ENDD-START.GT.0 ) THEN * * Do Insertion sort on D( START:ENDD ) * IF( DIR.EQ.0 ) THEN * * Sort into decreasing order * DO 40 I = START + 1, ENDD DO 30 J = I, START + 1, -1 IF( D( INDX( J ) ).GT.D( INDX( J-1 ) ) ) THEN ITMP = INDX( J ) INDX( J ) = INDX( J-1 ) INDX( J-1 ) = ITMP ELSE GO TO 40 END IF 30 CONTINUE 40 CONTINUE * ELSE * * Sort into increasing order * DO 60 I = START + 1, ENDD DO 50 J = I, START + 1, -1 IF( D( INDX( J ) ).LT.D( INDX( J-1 ) ) ) THEN ITMP = INDX( J ) INDX( J ) = INDX( J-1 ) INDX( J-1 ) = ITMP ELSE GO TO 60 END IF 50 CONTINUE 60 CONTINUE * END IF * ELSE IF( ENDD-START.GT.SELECT ) THEN * * Partition D( START:ENDD ) and stack parts, largest one first * * Choose partition entry as median of 3 * D1 = D( INDX( START ) ) D2 = D( INDX( ENDD ) ) I = ( START+ENDD ) / 2 D3 = D( INDX( I ) ) IF( D1.LT.D2 ) THEN IF( D3.LT.D1 ) THEN DMNMX = D1 ELSE IF( D3.LT.D2 ) THEN DMNMX = D3 ELSE DMNMX = D2 END IF ELSE IF( D3.LT.D2 ) THEN DMNMX = D2 ELSE IF( D3.LT.D1 ) THEN DMNMX = D3 ELSE DMNMX = D1 END IF END IF * IF( DIR.EQ.0 ) THEN * * Sort into decreasing order * I = START - 1 J = ENDD + 1 70 CONTINUE 80 CONTINUE J = J - 1 IF( D( INDX( J ) ).LT.DMNMX ) $ GO TO 80 90 CONTINUE I = I + 1 IF( D( INDX( I ) ).GT.DMNMX ) $ GO TO 90 IF( I.LT.J ) THEN ITMP = INDX( I ) INDX( I ) = INDX( J ) INDX( J ) = ITMP GO TO 70 END IF IF( J-START.GT.ENDD-J-1 ) THEN STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD ELSE STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J END IF ELSE * * Sort into increasing order * I = START - 1 J = ENDD + 1 100 CONTINUE 110 CONTINUE J = J - 1 IF( D( INDX( J ) ).GT.DMNMX ) $ GO TO 110 120 CONTINUE I = I + 1 IF( D( INDX( I ) ).LT.DMNMX ) $ GO TO 120 IF( I.LT.J ) THEN ITMP = INDX( I ) INDX( I ) = INDX( J ) INDX( J ) = ITMP GO TO 100 END IF IF( J-START.GT.ENDD-J-1 ) THEN STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD ELSE STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J END IF END IF END IF IF( STKPNT.GT.0 ) $ GO TO 20 RETURN * * End of DLAPST * END scalapack-2.0.2/SRC/dlaqr6.f000644 000766 000024 00000102106 11750130340 015672 0ustar00juliestaff000000 000000 SUBROUTINE DLAQR6( JOB, WANTT, WANTZ, KACC22, N, KTOP, KBOT, $ NSHFTS, SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, $ V, LDV, U, LDU, NV, WV, LDWV, NH, WH, LDWH ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER JOB INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV, $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. DOUBLE PRECISION H( LDH, * ), SI( * ), SR( * ), U( LDU, * ), $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ), $ Z( LDZ, * ) * .. * * This auxiliary subroutine called by PDLAQR5 performs a * single small-bulge multi-shift QR sweep, moving the chain * of bulges from top to bottom in the submatrix * H(KTOP:KBOT,KTOP:KBOT), collecting the transformations in the * matrix HV *or* accumulating the transformations in the matrix * Z (see below). * * This is a modified version of DLAQR5 from LAPACK 3.1. * * ====================================================================== * * JOB (input) character scalar * Set the kind of job to do in DLAQR6, as follows: * JOB = 'I': Introduce and chase bulges in submatrix * JOB = 'C': Chase bulges from top to bottom of submatrix * JOB = 'O': Chase bulges off submatrix * * WANTT (input) logical scalar * WANTT = .true. if the quasi-triangular Schur factor * is being computed. WANTT is set to .false. otherwise. * * WANTZ (input) logical scalar * WANTZ = .true. if the orthogonal Schur factor is being * computed. WANTZ is set to .false. otherwise. * * KACC22 (input) integer with value 0, 1, or 2. * Specifies the computation mode of far-from-diagonal * orthogonal updates. * = 0: DLAQR6 does not accumulate reflections and does not * use matrix-matrix multiply to update far-from-diagonal * matrix entries. * = 1: DLAQR6 accumulates reflections and uses matrix-matrix * multiply to update the far-from-diagonal matrix entries. * = 2: DLAQR6 accumulates reflections, uses matrix-matrix * multiply to update the far-from-diagonal matrix entries, * and takes advantage of 2-by-2 block structure during * matrix multiplies. * * N (input) integer scalar * N is the order of the Hessenberg matrix H upon which this * subroutine operates. * * KTOP (input) integer scalar * KBOT (input) integer scalar * These are the first and last rows and columns of an * isolated diagonal block upon which the QR sweep is to be * applied. It is assumed without a check that * either KTOP = 1 or H(KTOP,KTOP-1) = 0 * and * either KBOT = N or H(KBOT+1,KBOT) = 0. * * NSHFTS (input) integer scalar * NSHFTS gives the number of simultaneous shifts. NSHFTS * must be positive and even. * * SR (input) DOUBLE PRECISION array of size (NSHFTS) * SI (input) DOUBLE PRECISION array of size (NSHFTS) * SR contains the real parts and SI contains the imaginary * parts of the NSHFTS shifts of origin that define the * multi-shift QR sweep. * * H (input/output) DOUBLE PRECISION array of size (LDH,N) * On input H contains a Hessenberg matrix. On output a * multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied * to the isolated diagonal block in rows and columns KTOP * through KBOT. * * LDH (input) integer scalar * LDH is the leading dimension of H just as declared in the * calling procedure. LDH.GE.MAX(1,N). * * ILOZ (input) INTEGER * IHIZ (input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N * * Z (input/output) DOUBLE PRECISION array of size (LDZ,IHI) * If WANTZ = .TRUE., then the QR Sweep orthogonal * similarity transformation is accumulated into * Z(ILOZ:IHIZ,ILO:IHI) from the right. * If WANTZ = .FALSE., then Z is unreferenced. * * LDZ (input) integer scalar * LDA is the leading dimension of Z just as declared in * the calling procedure. LDZ.GE.N. * * V (workspace) DOUBLE PRECISION array of size (LDV,NSHFTS/2) * * LDV (input) integer scalar * LDV is the leading dimension of V as declared in the * calling procedure. LDV.GE.3. * * U (workspace) DOUBLE PRECISION array of size * (LDU,3*NSHFTS-3) * * LDU (input) integer scalar * LDU is the leading dimension of U just as declared in the * in the calling subroutine. LDU.GE.3*NSHFTS-3. * * NH (input) integer scalar * NH is the number of columns in array WH available for * workspace. NH.GE.1 is required for usage of this * workspace, otherwise the updates of the far-from-diagonal * elements will be updated without level 3 BLAS. * * WH (workspace) DOUBLE PRECISION array of size (LDWH,NH) * * LDWH (input) integer scalar * Leading dimension of WH just as declared in the * calling procedure. LDWH.GE.3*NSHFTS-3. * * NV (input) integer scalar * NV is the number of rows in WV agailable for workspace. * NV.GE.1 is required for usage of this * workspace, otherwise the updates of the far-from-diagonal * elements will be updated without level 3 BLAS. * * WV (workspace) DOUBLE PRECISION array of size * (LDWV,3*NSHFTS-3) * * LDWV (input) integer scalar * LDWV is the leading dimension of WV as declared in the * in the calling subroutine. LDWV.GE.NV. * * * ================================================================ * Based on contributions by * Karen Braman and Ralph Byers, Department of Mathematics, * University of Kansas, USA * * Robert Granat, Department of Computing Science and HPC2N, * Umea University, Sweden * * ============================================================ * Reference: * * K. Braman, R. Byers and R. Mathias, The Multi-Shift QR * Algorithm Part I: Maintaining Well Focused Shifts, and * Level 3 Performance, SIAM Journal of Matrix Analysis, * volume 23, pages 929--947, 2002. * * ============================================================ * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0d0, ONE = 1.0d0 ) * .. * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, $ NS, NU, SINCOL, EINCOL, UINCOL, IPHV, CHUNK, $ THREADS, JLEN2, JCOL2, GCHUNK, JROW2, MAXCHUNK LOGICAL ACCUM, BLK22, BMP22, INTRO, CHASE, OFF, ALL * .. * .. External Functions .. LOGICAL LSAME INTEGER PILAENVX DOUBLE PRECISION DLAMCH EXTERNAL LSAME, DLAMCH, PILAENVX * .. * .. Intrinsic Functions .. * INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Local Arrays .. DOUBLE PRECISION VT( 3 ) * .. * .. External Subroutines .. EXTERNAL DGEMM, DLABAD, DLAMOV, DLAQR1, DLARFG, DLASET, $ DTRMM * .. * .. Executable Statements .. * * ==== If there are no shifts, then there is nothing to do. ==== * IF( NSHFTS.LT.2 ) $ RETURN * * ==== If the active block is empty or 1-by-1, then there * . is nothing to do. ==== * IF( KTOP.GE.KBOT ) $ RETURN THREADS = 1 * * ==== Shuffle shifts into pairs of real shifts and pairs * . of complex conjugate shifts assuming complex * . conjugate shifts are already adjacent to one * . another. ==== * DO 10 I = 1, NSHFTS - 2, 2 IF( SI( I ).NE.-SI( I+1 ) ) THEN * SWAP = SR( I ) SR( I ) = SR( I+1 ) SR( I+1 ) = SR( I+2 ) SR( I+2 ) = SWAP * SWAP = SI( I ) SI( I ) = SI( I+1 ) SI( I+1 ) = SI( I+2 ) SI( I+2 ) = SWAP END IF 10 CONTINUE * * ==== NSHFTS is supposed to be even, but if it is odd, * . then simply reduce it by one. The shuffle above * . ensures that the dropped shift is real and that * . the remaining shifts are paired. ==== * NS = NSHFTS - MOD( NSHFTS, 2 ) * * ==== Machine constants for deflation ==== * SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N ) / ULP ) * * ==== Use accumulated reflections to update far-from-diagonal * . entries ? This is only performed if both NH and NV is * greater than 1. ==== * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) ACCUM = ACCUM .AND. NH.GE.1 .AND. NV.GE.1 * * ==== If so, exploit the 2-by-2 block structure? ==== * BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) * * ==== Decode JOB ==== * ALL = LSAME( JOB, 'A' ) IF( .NOT. ALL ) $ INTRO = LSAME( JOB, 'I' ) IF( .NOT. ALL .AND. .NOT. INTRO ) $ CHASE = LSAME( JOB, 'C' ) IF( .NOT. ALL .AND. .NOT. INTRO .AND. .NOT. CHASE ) THEN OFF = LSAME( JOB, 'O' ) IF( .NOT. OFF ) $ RETURN END IF * * ==== clear trash ==== * IF( INTRO.OR.ALL .AND. KTOP+2.LE.KBOT ) $ H( KTOP+2, KTOP ) = ZERO * * ==== NBMPS = number of 2-shift bulges in the chain ==== * NBMPS = NS / 2 * * ==== KDU = width of slab ==== * KDU = 6*NBMPS - 3 * * Set loop limits for bulge-chasing depending on working mode * IF( ALL ) THEN SINCOL = 3*( 1-NBMPS ) + KTOP - 1 EINCOL = KBOT - 2 UINCOL = 3*NBMPS - 2 ELSEIF( INTRO ) THEN SINCOL = 3*( 1-NBMPS ) + KTOP - 1 EINCOL = KBOT - 3*NBMPS - 1 UINCOL = 3*NBMPS - 2 ELSEIF( CHASE ) THEN SINCOL = KTOP EINCOL = KBOT - 3*NBMPS - 1 UINCOL = 3*NBMPS - 2 ELSEIF( OFF ) THEN SINCOL = KTOP EINCOL = KBOT - 2 UINCOL = 3*NBMPS - 2 END IF IPHV = 0 * * ==== Create and/or chase chains of NBMPS bulges ==== * DO 220 INCOL = SINCOL, EINCOL, UINCOL NDCOL = MIN( INCOL + KDU, EINCOL ) IF( ACCUM ) $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge * . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The * . following loop chases a 3*NBMPS column long chain of * . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * DO 150 KRCOL = INCOL, MIN( EINCOL, INCOL+3*NBMPS-3, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small * . 2-by-2 bulge, if there is room. The inactive bulges * . (if any) must wait until the active bulges have moved * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) M22 = MBOT + 1 BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * DO 20 M = MTOP, MBOT K = KRCOL + 3*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), $ V( 1, M ) ) ALPHA = V( 1, M ) CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) * * ==== A Bulge may collapse because of vigilant * . deflation or destructive underflow. In the * . underflow case, try the two-small-subdiagonals * . trick to try to reinflate the bulge. ==== * IF( H( K+3, K ).NE.ZERO .OR. H( K+3, K+1 ).NE. $ ZERO .OR. H( K+3, K+2 ).EQ.ZERO ) THEN * * ==== Typical case: not collapsed (yet). ==== * H( K+1, K ) = BETA H( K+2, K ) = ZERO H( K+3, K ) = ZERO ELSE * * ==== Atypical case: collapsed. Attempt to * . reintroduce ignoring H(K+1,K) and H(K+2,K). * . If the fill resulting from the new * . reflector is too large, then abandon it. * . Otherwise, use the new one. ==== * CALL DLAQR1( 3, H( K+1, K+1 ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), $ VT ) ALPHA = VT( 1 ) CALL DLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) ) REFSUM = VT( 1 )*( H( K+1, K )+VT( 2 )* $ H( K+2, K ) ) * IF( ABS( H( K+2, K )-REFSUM*VT( 2 ) )+ $ ABS( REFSUM*VT( 3 ) ).GT.ULP* $ ( ABS( H( K, K ) )+ABS( H( K+1, $ K+1 ) )+ABS( H( K+2, K+2 ) ) ) ) THEN * * ==== Starting a new bulge here would * . create non-negligible fill. Use * . the old one with trepidation. ==== * H( K+1, K ) = BETA H( K+2, K ) = ZERO H( K+3, K ) = ZERO ELSE * * ==== Stating a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== * H( K+1, K ) = H( K+1, K ) - REFSUM H( K+2, K ) = ZERO H( K+3, K ) = ZERO V( 1, M ) = VT( 1 ) V( 2, M ) = VT( 2 ) V( 3, M ) = VT( 3 ) END IF END IF END IF 20 CONTINUE * * ==== Generate a 2-by-2 reflection, if needed. ==== * K = KRCOL + 3*( M22-1 ) IF( BMP22 ) THEN IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), $ V( 1, M22 ) ) BETA = V( 1, M22 ) CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) ELSE BETA = H( K+1, K ) V( 2, M22 ) = H( K+2, K ) CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) H( K+1, K ) = BETA H( K+2, K ) = ZERO END IF ELSE * * ==== Initialize V(1,M22) here to avoid possible undefined * . variable problems later. ==== * V( 1, M22 ) = ZERO END IF * * ==== Multiply H by reflections from the left ==== * IF( ACCUM ) THEN JBOT = MIN( MAX(INCOL+KDU,NDCOL), KBOT ) ELSE IF( WANTT ) THEN JBOT = N ELSE JBOT = KBOT END IF DO 40 J = MAX( KTOP, KRCOL ), JBOT MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) DO 30 M = MTOP, MEND K = KRCOL + 3*( M-1 ) REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) H( K+1, J ) = H( K+1, J ) - REFSUM H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) 30 CONTINUE 40 CONTINUE IF( BMP22 ) THEN K = KRCOL + 3*( M22-1 ) DO 50 J = MAX( K+1, KTOP ), JBOT REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* $ H( K+2, J ) ) H( K+1, J ) = H( K+1, J ) - REFSUM H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) 50 CONTINUE END IF * * ==== Multiply H by reflections from the right. * . Delay filling in the last row until the * . vigilant deflation check is complete. ==== * IF( ACCUM ) THEN JTOP = MAX( KTOP, INCOL ) ELSE IF( WANTT ) THEN JTOP = 1 ELSE JTOP = KTOP END IF DO 90 M = MTOP, MBOT IF( V( 1, M ).NE.ZERO ) THEN K = KRCOL + 3*( M-1 ) DO 60 J = JTOP, MIN( KBOT, K+3 ) REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) H( J, K+1 ) = H( J, K+1 ) - REFSUM H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) 60 CONTINUE * IF( ACCUM ) THEN * * ==== Accumulate U. (If necessary, update Z later * . with with an efficient matrix-matrix * . multiply.) ==== * KMS = K - INCOL DO 70 J = MAX( 1, KTOP-INCOL ), KDU REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) 70 CONTINUE ELSE IF( WANTZ ) THEN * * ==== U is not accumulated, so update Z * . now by multiplying by reflections * . from the right. ==== * DO 80 J = ILOZ, IHIZ REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) Z( J, K+1 ) = Z( J, K+1 ) - REFSUM Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) 80 CONTINUE END IF END IF 90 CONTINUE * * ==== Special case: 2-by-2 reflection (if needed) ==== * K = KRCOL + 3*( M22-1 ) IF( BMP22 ) THEN IF( V( 1, M22 ).NE.ZERO ) THEN DO 100 J = JTOP, MIN( KBOT, K+3 ) REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* $ H( J, K+2 ) ) H( J, K+1 ) = H( J, K+1 ) - REFSUM H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) 100 CONTINUE * IF( ACCUM ) THEN KMS = K - INCOL DO 110 J = MAX( 1, KTOP-INCOL ), KDU REFSUM = V( 1, M22 )*( U( J, KMS+1 ) + $ V( 2, M22 )*U( J, KMS+2 ) ) U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM U( J, KMS+2 ) = U( J, KMS+2 ) - $ REFSUM*V( 2, M22 ) 110 CONTINUE ELSE IF( WANTZ ) THEN DO 120 J = ILOZ, IHIZ REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* $ Z( J, K+2 ) ) Z( J, K+1 ) = Z( J, K+1 ) - REFSUM Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) 120 CONTINUE END IF END IF END IF * * ==== Vigilant deflation check ==== * MSTART = MTOP IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) $ MSTART = MSTART + 1 MEND = MBOT IF( BMP22 ) $ MEND = MEND + 1 IF( KRCOL.EQ.KBOT-2 ) $ MEND = MEND + 1 DO 130 M = MSTART, MEND K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals * . criterion and the Ahues & Tisseur (LAWN 122, 1997) * . criteria both be satisfied. The latter improves * . accuracy in some examples. Falling back on an * . alternate convergence criterion when TST1 or TST2 * . is zero (as done here) is traditional but probably * . unnecessary. ==== * IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN IF( K.GE.KTOP+1 ) $ TST1 = TST1 + ABS( H( K, K-1 ) ) IF( K.GE.KTOP+2 ) $ TST1 = TST1 + ABS( H( K, K-2 ) ) IF( K.GE.KTOP+3 ) $ TST1 = TST1 + ABS( H( K, K-3 ) ) IF( K.LE.KBOT-2 ) $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) IF( K.LE.KBOT-3 ) $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) IF( K.LE.KBOT-4 ) $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) END IF IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) $ THEN H12 = MAX( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) H21 = MIN( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) H11 = MAX( ABS( H( K+1, K+1 ) ), $ ABS( H( K, K )-H( K+1, K+1 ) ) ) H22 = MIN( ABS( H( K+1, K+1 ) ), $ ABS( H( K, K )-H( K+1, K+1 ) ) ) SCL = H11 + H12 TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF 130 CONTINUE * * ==== Fill in the last row of each bulge. ==== * MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) DO 140 M = MTOP, MEND K = KRCOL + 3*( M-1 ) REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) H( K+4, K+1 ) = -REFSUM H( K+4, K+2 ) = -REFSUM*V( 2, M ) H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) 140 CONTINUE * * ==== End of near-the-diagonal bulge chase. ==== * 150 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as * . well. ==== * IF( ACCUM ) THEN IF( WANTT ) THEN JTOP = 1 JBOT = N ELSE JTOP = KTOP JBOT = KBOT END IF K1 = MAX( 1, KTOP-INCOL ) NU = ( KDU-MAX( 0, MAX(INCOL+KDU,NDCOL)-KBOT ) ) - K1 + 1 IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) .OR. $ NU.LT.KDU ) THEN * * ==== Updates not exploiting the 2-by-2 block * . structure of U. K1 and NU keep track of * . the location and size of U in the special * . cases of introducing bulges and chasing * . bulges off the bottom. In these special * . cases and in case the number of shifts * . is NS = 2, there is no 2-by-2 block * . structure to exploit. ==== * * ==== Horizontal Multiply ==== * DO 160 JCOL = MIN(MAX(INCOL+KDU,NDCOL),KBOT)+ 1, JBOT, NH JLEN = MIN( NH, JBOT-JCOL+1 ) CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, $ LDWH ) CALL DLAMOV( 'ALL', NU, JLEN, WH, LDWH, $ H( INCOL+K1, JCOL ), LDH ) 160 CONTINUE * * ==== Vertical multiply ==== * DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLAMOV( 'ALL', JLEN, NU, WV, LDWV, $ H( JROW, INCOL+K1 ), LDH ) 170 CONTINUE * * ==== Z multiply (also vertical) ==== * IF( WANTZ ) THEN DO 180 JROW = ILOZ, IHIZ, NV JLEN = MIN( NV, IHIZ-JROW+1 ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLAMOV( 'ALL', JLEN, NU, WV, LDWV, $ Z( JROW, INCOL+K1 ), LDZ ) 180 CONTINUE END IF ELSE * * ==== Updates exploiting U's 2-by-2 block structure. * . (I2, I4, J2, J4 are the last rows and columns * . of the blocks.) ==== * I2 = ( KDU+1 ) / 2 I4 = KDU J2 = I4 - I2 J4 = KDU * * ==== KZS and KNZ deal with the band of zeros * . along the diagonal of one of the triangular * . blocks. ==== * KZS = ( J4-J2 ) - ( NS+1 ) KNZ = NS + 1 * * ==== Horizontal multiply ==== * DO 190 JCOL = MIN(MAX(INCOL+KDU,NDCOL),KBOT)+ 1, JBOT, NH JLEN = MIN( NH, JBOT-JCOL+1 ) * * ==== Copy bottom of H to top+KZS of scratch ==== * (The first KZS rows get multiplied by zero.) ==== * CALL DLAMOV( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), $ LDH, WH( KZS+1, 1 ), LDWH ) CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) * * ==== Multiply by U21' ==== * CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), $ LDWH ) * * ==== Multiply top of H by U11' ==== * CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) * * ==== Copy top of H to bottom of WH ==== * CALL DLAMOV( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, $ WH( I2+1, 1 ), LDWH ) * * ==== Multiply by U21' ==== * CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) * * ==== Multiply by U22 ==== * CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, $ U( J2+1, I2+1 ), LDU, $ H( INCOL+1+J2, JCOL ), LDH, ONE, $ WH( I2+1, 1 ), LDWH ) * * ==== Copy it back ==== * CALL DLAMOV( 'ALL', KDU, JLEN, WH, LDWH, $ H( INCOL+1, JCOL ), LDH ) 190 CONTINUE * * ==== Vertical multiply ==== * DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) * * ==== Copy right of H to scratch (the first KZS * . columns get multiplied by zero) ==== * CALL DLAMOV( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), $ LDH, WV( 1, 1+KZS ), LDWV ) CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) * * ==== Multiply by U21 ==== * CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), $ LDWV ) * * ==== Multiply by U11 ==== * CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, $ LDWV ) * * ==== Copy left of H to right of scratch ==== * CALL DLAMOV( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, $ WV( 1, 1+I2 ), LDWV ) * * ==== Multiply by U21 ==== * CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) * * ==== Multiply by U22 ==== * CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, $ H( JROW, INCOL+1+J2 ), LDH, $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), $ LDWV ) * * ==== Copy it back ==== * CALL DLAMOV( 'ALL', JLEN, KDU, WV, LDWV, $ H( JROW, INCOL+1 ), LDH ) 200 CONTINUE * * ==== Multiply Z (also vertical) ==== * IF( WANTZ ) THEN DO 210 JROW = ILOZ, IHIZ, NV JLEN = MIN( NV, IHIZ-JROW+1 ) * * ==== Copy right of Z to left of scratch (first * . KZS columns get multiplied by zero) ==== * CALL DLAMOV( 'ALL', JLEN, KNZ, $ Z( JROW, INCOL+1+J2 ), LDZ, $ WV( 1, 1+KZS ), LDWV ) * * ==== Multiply by U12 ==== * CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, $ LDWV ) CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), $ LDWV ) * * ==== Multiply by U11 ==== * CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, $ WV, LDWV ) * * ==== Copy left of Z to right of scratch ==== * CALL DLAMOV( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), $ LDZ, WV( 1, 1+I2 ), LDWV ) * * ==== Multiply by U21 ==== * CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), $ LDWV ) * * ==== Multiply by U22 ==== * CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, $ Z( JROW, INCOL+1+J2 ), LDZ, $ U( J2+1, I2+1 ), LDU, ONE, $ WV( 1, 1+I2 ), LDWV ) * * ==== Copy the result back to Z ==== * CALL DLAMOV( 'ALL', JLEN, KDU, WV, LDWV, $ Z( JROW, INCOL+1 ), LDZ ) 210 CONTINUE END IF END IF END IF 220 CONTINUE * * ==== Clear out workspaces and return. ==== * IF( N.GE.5 ) $ CALL DLASET( 'Lower', N-4, N-4, ZERO, ZERO, H(5,1), LDH ) * * ==== End of DLAQR6 ==== * END scalapack-2.0.2/SRC/dlar1va.f000644 000766 000024 00000033001 11657111056 016041 0ustar00juliestaff000000 000000 SUBROUTINE DLAR1VA(N, B1, BN, LAMBDA, D, L, LD, LLD, $ PIVMIN, GAPTOL, Z, WANTNC, NEGCNT, ZTZ, MINGMA, $ R, ISUPPZ, NRMINV, RESID, RQCORR, WORK ) * IMPLICIT NONE * * -- ScaLAPACK computational routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ of Colorado Denver * July 4, 2010 * * .. Scalar Arguments .. LOGICAL WANTNC INTEGER B1, BN, N, NEGCNT, R DOUBLE PRECISION GAPTOL, LAMBDA, MINGMA, NRMINV, PIVMIN, RESID, $ RQCORR, ZTZ * .. * .. Array Arguments .. INTEGER ISUPPZ( * ) DOUBLE PRECISION D( * ), L( * ), LD( * ), LLD( * ), $ WORK( * ) DOUBLE PRECISION Z( * ) * * Purpose * ======= * * DLAR1VA computes the (scaled) r-th column of the inverse of * the sumbmatrix in rows B1 through BN of the tridiagonal matrix * L D L^T - sigma I. When sigma is close to an eigenvalue, the * computed vector is an accurate eigenvector. Usually, r corresponds * to the index where the eigenvector is largest in magnitude. * The following steps accomplish this computation : * (a) Stationary qd transform, L D L^T - sigma I = L(+) D(+) L(+)^T, * (b) Progressive qd transform, L D L^T - sigma I = U(-) D(-) U(-)^T, * (c) Computation of the diagonal elements of the inverse of * L D L^T - sigma I by combining the above transforms, and choosing * r as the index where the diagonal of the inverse is (one of the) * largest in magnitude. * (d) Computation of the (scaled) r-th column of the inverse using the * twisted factorization obtained by combining the top part of the * the stationary and the bottom part of the progressive transform. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix L D L^T. * * B1 (input) INTEGER * First index of the submatrix of L D L^T. * * BN (input) INTEGER * Last index of the submatrix of L D L^T. * * LAMBDA (input) DOUBLE PRECISION * The shift. In order to compute an accurate eigenvector, * LAMBDA should be a good approximation to an eigenvalue * of L D L^T. * * L (input) DOUBLE PRECISION array, dimension (N-1) * The (n-1) subdiagonal elements of the unit bidiagonal matrix * L, in elements 1 to N-1. * * D (input) DOUBLE PRECISION array, dimension (N) * The n diagonal elements of the diagonal matrix D. * * LD (input) DOUBLE PRECISION array, dimension (N-1) * The n-1 elements L(i)*D(i). * * LLD (input) DOUBLE PRECISION array, dimension (N-1) * The n-1 elements L(i)*L(i)*D(i). * * PIVMIN (input) DOUBLE PRECISION * The minimum pivot in the Sturm sequence. * * GAPTOL (input) DOUBLE PRECISION * Tolerance that indicates when eigenvector entries are negligible * w.r.t. their contribution to the residual. * * Z (input/output) DOUBLE PRECISION array, dimension (N) * On input, all entries of Z must be set to 0. * On output, Z contains the (scaled) r-th column of the * inverse. The scaling is such that Z(R) equals 1. * * WANTNC (input) LOGICAL * Specifies whether NEGCNT has to be computed. * * NEGCNT (output) INTEGER * If WANTNC is .TRUE. then NEGCNT = the number of pivots < pivmin * in the matrix factorization L D L^T, and NEGCNT = -1 otherwise. * * ZTZ (output) DOUBLE PRECISION * The square of the 2-norm of Z. * * MINGMA (output) DOUBLE PRECISION * The reciprocal of the largest (in magnitude) diagonal * element of the inverse of L D L^T - sigma I. * * R (input/output) INTEGER * The twist index for the twisted factorization used to * compute Z. * On input, 0 <= R <= N. If R is input as 0, R is set to * the index where (L D L^T - sigma I)^{-1} is largest * in magnitude. If 1 <= R <= N, R is unchanged. * On output, R contains the twist index used to compute Z. * Ideally, R designates the position of the maximum entry in the * eigenvector. * * ISUPPZ (output) INTEGER array, dimension (2) * The support of the vector in Z, i.e., the vector Z is * nonzero only in elements ISUPPZ(1) through ISUPPZ( 2 ). * * NRMINV (output) DOUBLE PRECISION * NRMINV = 1/SQRT( ZTZ ) * * RESID (output) DOUBLE PRECISION * The residual of the FP vector. * RESID = ABS( MINGMA )/SQRT( ZTZ ) * * RQCORR (output) DOUBLE PRECISION * The Rayleigh Quotient correction to LAMBDA. * RQCORR = MINGMA*TMP * * WORK (workspace) DOUBLE PRECISION array, dimension (4*N) * * Further Details * =============== * * Based on contributions by * Beresford Parlett, University of California, Berkeley, USA * Jim Demmel, University of California, Berkeley, USA * Inderjit Dhillon, University of Texas, Austin, USA * Osni Marques, LBNL/NERSC, USA * Christof Voemel, University of California, Berkeley, USA * * ===================================================================== * * .. Parameters .. INTEGER BLKLEN PARAMETER ( BLKLEN = 16 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. LOGICAL SAWNAN1, SAWNAN2 INTEGER BI, I, INDLPL, INDP, INDS, INDUMN, NB, NEG1, $ NEG2, NX, R1, R2, TO DOUBLE PRECISION ABSZCUR, ABSZPREV, DMINUS, DPLUS, EPS, $ S, TMP, ZPREV * .. * .. External Functions .. LOGICAL DISNAN DOUBLE PRECISION DLAMCH EXTERNAL DISNAN, DLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, DBLE * .. * .. Executable Statements .. * EPS = DLAMCH( 'Precision' ) IF( R.EQ.0 ) THEN R1 = B1 R2 = BN ELSE R1 = R R2 = R END IF * Storage for LPLUS INDLPL = 0 * Storage for UMINUS INDUMN = N INDS = 2*N + 1 INDP = 3*N + 1 IF( B1.EQ.1 ) THEN WORK( INDS ) = ZERO ELSE WORK( INDS+B1-1 ) = LLD( B1-1 ) END IF * * Compute the stationary transform (using the differential form) * until the index R2. * SAWNAN1 = .FALSE. NEG1 = 0 S = WORK( INDS+B1-1 ) - LAMBDA DO 50 I = B1, R1 - 1 DPLUS = D( I ) + S WORK( INDLPL+I ) = LD( I ) / DPLUS IF(DPLUS.LT.ZERO) NEG1 = NEG1 + 1 WORK( INDS+I ) = S*WORK( INDLPL+I )*L( I ) S = WORK( INDS+I ) - LAMBDA 50 CONTINUE SAWNAN1 = DISNAN( S ) IF( SAWNAN1 ) GOTO 60 DO 51 I = R1, R2 - 1 DPLUS = D( I ) + S WORK( INDLPL+I ) = LD( I ) / DPLUS WORK( INDS+I ) = S*WORK( INDLPL+I )*L( I ) S = WORK( INDS+I ) - LAMBDA 51 CONTINUE SAWNAN1 = DISNAN( S ) * 60 CONTINUE IF( SAWNAN1 ) THEN * Runs a slower version of the above loop if a NaN is detected NEG1 = 0 S = WORK( INDS+B1-1 ) - LAMBDA DO 70 I = B1, R1 - 1 DPLUS = D( I ) + S IF(ABS(DPLUS).LT.PIVMIN) DPLUS = -PIVMIN WORK( INDLPL+I ) = LD( I ) / DPLUS IF(DPLUS.LT.ZERO) NEG1 = NEG1 + 1 WORK( INDS+I ) = S*WORK( INDLPL+I )*L( I ) IF( WORK( INDLPL+I ).EQ.ZERO ) $ WORK( INDS+I ) = LLD( I ) S = WORK( INDS+I ) - LAMBDA 70 CONTINUE DO 71 I = R1, R2 - 1 DPLUS = D( I ) + S IF(ABS(DPLUS).LT.PIVMIN) DPLUS = -PIVMIN WORK( INDLPL+I ) = LD( I ) / DPLUS WORK( INDS+I ) = S*WORK( INDLPL+I )*L( I ) IF( WORK( INDLPL+I ).EQ.ZERO ) $ WORK( INDS+I ) = LLD( I ) S = WORK( INDS+I ) - LAMBDA 71 CONTINUE END IF * * Compute the progressive transform (using the differential form) * until the index R1 * SAWNAN2 = .FALSE. NEG2 = 0 WORK( INDP+BN-1 ) = D( BN ) - LAMBDA DO 80 I = BN - 1, R1, -1 DMINUS = LLD( I ) + WORK( INDP+I ) TMP = D( I ) / DMINUS IF(DMINUS.LT.ZERO) NEG2 = NEG2 + 1 WORK( INDUMN+I ) = L( I )*TMP WORK( INDP+I-1 ) = WORK( INDP+I )*TMP - LAMBDA 80 CONTINUE TMP = WORK( INDP+R1-1 ) SAWNAN2 = DISNAN( TMP ) IF( SAWNAN2 ) THEN * Runs a slower version of the above loop if a NaN is detected NEG2 = 0 DO 100 I = BN-1, R1, -1 DMINUS = LLD( I ) + WORK( INDP+I ) IF(ABS(DMINUS).LT.PIVMIN) DMINUS = -PIVMIN TMP = D( I ) / DMINUS IF(DMINUS.LT.ZERO) NEG2 = NEG2 + 1 WORK( INDUMN+I ) = L( I )*TMP WORK( INDP+I-1 ) = WORK( INDP+I )*TMP - LAMBDA IF( TMP.EQ.ZERO ) $ WORK( INDP+I-1 ) = D( I ) - LAMBDA 100 CONTINUE END IF * * Find the index (from R1 to R2) of the largest (in magnitude) * diagonal element of the inverse * MINGMA = WORK( INDS+R1-1 ) + WORK( INDP+R1-1 ) IF( MINGMA.LT.ZERO ) NEG1 = NEG1 + 1 IF( WANTNC ) THEN NEGCNT = NEG1 + NEG2 ELSE NEGCNT = -1 ENDIF IF( ABS(MINGMA).EQ.ZERO ) $ MINGMA = EPS*WORK( INDS+R1-1 ) R = R1 DO 110 I = R1, R2 - 1 TMP = WORK( INDS+I ) + WORK( INDP+I ) IF( TMP.EQ.ZERO ) $ TMP = EPS*WORK( INDS+I ) IF( ABS( TMP ).LE.ABS( MINGMA ) ) THEN MINGMA = TMP R = I + 1 END IF 110 CONTINUE * * Compute the FP vector: solve N^T v = e_r * ISUPPZ( 1 ) = B1 ISUPPZ( 2 ) = BN Z( R ) = ONE ZTZ = ONE * * Compute the FP vector upwards from R * NB = INT((R-B1)/BLKLEN) NX = R-NB*BLKLEN IF( .NOT.SAWNAN1 ) THEN DO 210 BI = R-1, NX, -BLKLEN TO = BI-BLKLEN+1 DO 205 I = BI, TO, -1 Z( I ) = -( WORK(INDLPL+I)*Z(I+1) ) ZTZ = ZTZ + Z( I )*Z( I ) 205 CONTINUE IF( ABS(Z(TO)).LT.EPS .AND. $ ABS(Z(TO+1)).LT.EPS ) THEN ISUPPZ(1) = TO GOTO 220 ENDIF 210 CONTINUE DO 215 I = NX-1, B1, -1 Z( I ) = -( WORK(INDLPL+I)*Z(I+1) ) ZTZ = ZTZ + Z( I )*Z( I ) 215 CONTINUE 220 CONTINUE ELSE * Run slower loop if NaN occurred. DO 230 BI = R-1, NX, -BLKLEN TO = BI-BLKLEN+1 DO 225 I = BI, TO, -1 IF( Z( I+1 ).EQ.ZERO ) THEN Z( I ) = -( LD( I+1 ) / LD( I ) )*Z( I+2 ) ELSE Z( I ) = -( WORK( INDLPL+I )*Z( I+1 ) ) END IF ZTZ = ZTZ + Z( I )*Z( I ) 225 CONTINUE IF( ABS(Z(TO)).LT.EPS .AND. $ ABS(Z(TO+1)).LT.EPS ) THEN ISUPPZ(1) = TO GOTO 240 ENDIF 230 CONTINUE DO 235 I = NX-1, B1, -1 IF( Z( I+1 ).EQ.ZERO ) THEN Z( I ) = -( LD( I+1 ) / LD( I ) )*Z( I+2 ) ELSE Z( I ) = -( WORK( INDLPL+I )*Z( I+1 ) ) END IF ZTZ = ZTZ + Z( I )*Z( I ) 235 CONTINUE 240 CONTINUE ENDIF DO 245 I= B1, (ISUPPZ(1)-1) Z(I) = ZERO 245 CONTINUE * Compute the FP vector downwards from R in blocks of size BLKLEN IF( .NOT.SAWNAN2 ) THEN DO 260 BI = R+1, BN, BLKLEN TO = BI+BLKLEN-1 IF ( TO.LE.BN ) THEN DO 250 I = BI, TO Z(I) = -(WORK(INDUMN+I-1)*Z(I-1)) ZTZ = ZTZ + Z( I )*Z( I ) 250 CONTINUE IF( ABS(Z(TO)).LE.EPS .AND. $ ABS(Z(TO-1)).LE.EPS ) THEN ISUPPZ(2) = TO GOTO 265 ENDIF ELSE DO 255 I = BI, BN Z(I) = -(WORK(INDUMN+I-1)*Z(I-1)) ZTZ = ZTZ + Z( I )*Z( I ) 255 CONTINUE ENDIF 260 CONTINUE 265 CONTINUE ELSE * Run slower loop if NaN occurred. DO 280 BI = R+1, BN, BLKLEN TO = BI+BLKLEN-1 IF ( TO.LE.BN ) THEN DO 270 I = BI, TO ZPREV = Z(I-1) ABSZPREV = ABS(ZPREV) IF( ZPREV.NE.ZERO ) THEN Z(I)= -(WORK(INDUMN+I-1)*ZPREV) ELSE Z(I)= -(LD(I-2)/LD(I-1))*Z(I-2) END IF ABSZCUR = ABS(Z(I)) ZTZ = ZTZ + ABSZCUR**2 270 CONTINUE IF( ABSZCUR.LT.EPS .AND. $ ABSZPREV.LT.EPS ) THEN ISUPPZ(2) = I GOTO 285 ENDIF ELSE DO 275 I = BI, BN ZPREV = Z(I-1) ABSZPREV = ABS(ZPREV) IF( ZPREV.NE.ZERO ) THEN Z(I)= -(WORK(INDUMN+I-1)*ZPREV) ELSE Z(I)= -(LD(I-2)/LD(I-1))*Z(I-2) END IF ABSZCUR = ABS(Z(I)) ZTZ = ZTZ + ABSZCUR**2 275 CONTINUE ENDIF 280 CONTINUE 285 CONTINUE END IF DO 290 I= ISUPPZ(2)+1,BN Z(I) = ZERO 290 CONTINUE * * Compute quantities for convergence test * TMP = ONE / ZTZ NRMINV = SQRT( TMP ) RESID = ABS( MINGMA )*NRMINV RQCORR = MINGMA*TMP * RETURN * * End of DLAR1VA * END scalapack-2.0.2/SRC/dlaref.f000644 000766 000024 00000030353 11642700517 015753 0ustar00juliestaff000000 000000 SUBROUTINE DLAREF( TYPE, A, LDA, WANTZ, Z, LDZ, BLOCK, IROW1, $ ICOL1, ISTART, ISTOP, ITMP1, ITMP2, LILOZ, $ LIHIZ, VECS, V2, V3, T1, T2, T3 ) IMPLICIT NONE * * -- ScaLAPACK auxiliary routine (version 1.5) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL BLOCK, WANTZ CHARACTER TYPE INTEGER ICOL1, IROW1, ISTART, ISTOP, ITMP1, ITMP2, LDA, $ LDZ, LIHIZ, LILOZ DOUBLE PRECISION T1, T2, T3, V2, V3 * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), VECS( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * DLAREF applies one or several Householder reflectors of size 3 * to one or two matrices (if column is specified) on either their * rows or columns. * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * If 'R': Apply reflectors to the rows of the matrix * (apply from left) * Otherwise: Apply reflectors to the columns of the matrix * Unchanged on exit. * * A (global input/output) DOUBLE PRECISION array, (LDA,*) * On entry, the matrix to receive the reflections. * The updated matrix on exit. * * LDA (local input) INTEGER * On entry, the leading dimension of A. Unchanged on exit. * * WANTZ (global input) LOGICAL * If .TRUE., then apply any column reflections to Z as well. * If .FALSE., then do no additional work on Z. * * Z (global input/output) DOUBLE PRECISION array, (LDZ,*) * On entry, the second matrix to receive column reflections. * This is changed only if WANTZ is set. * * LDZ (local input) INTEGER * On entry, the leading dimension of Z. Unchanged on exit. * * BLOCK (global input) LOGICAL * If .TRUE., then apply several reflectors at once and read * their data from the VECS array. * If .FALSE., apply the single reflector given by V2, V3, * T1, T2, and T3. * * IROW1 (local input/output) INTEGER * On entry, the local row element of A. * Undefined on output. * * * ICOL1 (local input/output) INTEGER * On entry, the local column element of A. * Undefined on output. * * ISTART (global input) INTEGER * Specifies the "number" of the first reflector. This is * used as an index into VECS if BLOCK is set. * ISTART is ignored if BLOCK is .FALSE.. * * ISTOP (global input) INTEGER * Specifies the "number" of the last reflector. This is * used as an index into VECS if BLOCK is set. * ISTOP is ignored if BLOCK is .FALSE.. * * ITMP1 (local input) INTEGER * Starting range into A. For rows, this is the local * first column. For columns, this is the local first row. * * ITMP2 (local input) INTEGER * Ending range into A. For rows, this is the local last * column. For columns, this is the local last row. * * LILOZ * LIHIZ (local input) INTEGER * These serve the same purpose as ITMP1,ITMP2 but for Z * when WANTZ is set. * * VECS (global input) DOUBLE PRECISION array of size 3*N (matrix * size) * This holds the size 3 reflectors one after another and this * is only accessed when BLOCK is .TRUE. * * V2 * V3 * T1 * T2 * T3 (global input/output) DOUBLE PRECISION * This holds information on a single size 3 Householder * reflector and is read when BLOCK is .FALSE., and * overwritten when BLOCK is .TRUE. * * Implemented by: G. Henry, May 1, 1997 * * ===================================================================== * * .. Local Scalars .. INTEGER J, K DOUBLE PRECISION H11, H22, SUM, T12, T13, T22, T23, T32, T33, $ V22, V23, V32, V33, A1, A2, A3, A4, A5, B1, $ B2, B3, B4, B5, TMP1, TMP2, TMP3, SUM1, SUM2, $ SUM3, A11, A22 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * IF( LSAME( TYPE, 'R' ) ) THEN IF( BLOCK ) THEN DO 30 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 10 J = ITMP1, ITMP2-MOD(ITMP2-ITMP1+1,2), 2 A1 = A ( IROW1 , J ) A2 = A ( IROW1+1, J ) A3 = A ( IROW1+2, J ) A4 = A ( IROW1+3, J ) A5 = A ( IROW1+4, J ) B1 = A ( IROW1 , J+1 ) B2 = A ( IROW1+1, J+1 ) B3 = A ( IROW1+2, J+1 ) B4 = A ( IROW1+3, J+1 ) B5 = A ( IROW1+4, J+1 ) SUM1 = A1 + V2*A2 + V3*A3 A( IROW1 , J ) = A1 - SUM1 * T1 H11 = A2 - SUM1 * T2 H22 = A3 - SUM1 * T3 TMP1 = B1 + V2*B2 + V3*B3 A( IROW1 , J+1 ) = B1 - TMP1 * T1 A11 = B2 - TMP1 * T2 A22 = B3 - TMP1 * T3 SUM2 = H11 + V22*H22 + V32*A4 A( IROW1+1, J ) = H11 - SUM2 * T12 H11 = H22 - SUM2 * T22 H22 = A4 - SUM2 * T32 TMP2 = A11 + V22*A22 + V32*B4 A( IROW1+1, J+1 ) = A11 - TMP2 * T12 A11 = A22 - TMP2 * T22 A22 = B4 - TMP2 * T32 SUM3 = H11 + V23*H22 + V33*A5 A( IROW1+2, J ) = H11 - SUM3 * T13 A( IROW1+3, J ) = H22 - SUM3 * T23 A( IROW1+4, J ) = A5 - SUM3 * T33 TMP3 = A11 + V23*A22 + V33*B5 A( IROW1+2, J+1 ) = A11 - TMP3 * T13 A( IROW1+3, J+1 ) = A22 - TMP3 * T23 A( IROW1+4, J+1 ) = B5 - TMP3 * T33 10 CONTINUE DO 20 J = ITMP2-MOD(ITMP2-ITMP1+1,2)+1, ITMP2 SUM = A( IROW1, J ) + V2*A( IROW1+1, J ) + $ V3*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM*T1 H11 = A( IROW1+1, J ) - SUM*T2 H22 = A( IROW1+2, J ) - SUM*T3 SUM = H11 + V22*H22 + V32*A( IROW1+3, J ) A( IROW1+1, J ) = H11 - SUM*T12 H11 = H22 - SUM*T22 H22 = A( IROW1+3, J ) - SUM*T32 SUM = H11 + V23*H22 + V33*A( IROW1+4, J ) A( IROW1+2, J ) = H11 - SUM*T13 A( IROW1+3, J ) = H22 - SUM*T23 A( IROW1+4, J ) = A( IROW1+4, J ) - SUM*T33 20 CONTINUE IROW1 = IROW1 + 3 30 CONTINUE DO 50 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 40 J = ITMP1, ITMP2 SUM = A( IROW1, J ) + V2*A( IROW1+1, J ) + $ V3*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM*T1 A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*T2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*T3 40 CONTINUE IROW1 = IROW1 + 1 50 CONTINUE ELSE DO 60 J = ITMP1, ITMP2 SUM = A( IROW1, J ) + V2*A( IROW1+1, J ) + $ V3*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM*T1 A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*T2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*T3 60 CONTINUE END IF ELSE * * Do column transforms * IF( BLOCK ) THEN DO 90 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 70 J = ITMP1, ITMP2 SUM = A( J, ICOL1 ) + V2*A( J, ICOL1+1 ) + $ V3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM*T1 H11 = A( J, ICOL1+1 ) - SUM*T2 H22 = A( J, ICOL1+2 ) - SUM*T3 SUM = H11 + V22*H22 + V32*A( J, ICOL1+3 ) A( J, ICOL1+1 ) = H11 - SUM*T12 H11 = H22 - SUM*T22 H22 = A( J, ICOL1+3 ) - SUM*T32 SUM = H11 + V23*H22 + V33*A( J, ICOL1+4 ) A( J, ICOL1+2 ) = H11 - SUM*T13 A( J, ICOL1+3 ) = H22 - SUM*T23 A( J, ICOL1+4 ) = A( J, ICOL1+4 ) - SUM*T33 70 CONTINUE IF( WANTZ ) THEN DO 80 J = LILOZ, LIHIZ SUM = Z( J, ICOL1 ) + V2*Z( J, ICOL1+1 ) + $ V3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM*T1 H11 = Z( J, ICOL1+1 ) - SUM*T2 H22 = Z( J, ICOL1+2 ) - SUM*T3 SUM = H11 + V22*H22 + V32*Z( J, ICOL1+3 ) Z( J, ICOL1+1 ) = H11 - SUM*T12 H11 = H22 - SUM*T22 H22 = Z( J, ICOL1+3 ) - SUM*T32 SUM = H11 + V23*H22 + V33*Z( J, ICOL1+4 ) Z( J, ICOL1+2 ) = H11 - SUM*T13 Z( J, ICOL1+3 ) = H22 - SUM*T23 Z( J, ICOL1+4 ) = Z( J, ICOL1+4 ) - SUM*T33 80 CONTINUE END IF ICOL1 = ICOL1 + 3 90 CONTINUE DO 120 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 100 J = ITMP1, ITMP2 SUM = A( J, ICOL1 ) + V2*A( J, ICOL1+1 ) + $ V3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM*T1 A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*T2 A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*T3 100 CONTINUE IF( WANTZ ) THEN DO 110 J = LILOZ, LIHIZ SUM = Z( J, ICOL1 ) + V2*Z( J, ICOL1+1 ) + $ V3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM*T1 Z( J, ICOL1+1 ) = Z( J, ICOL1+1 ) - SUM*T2 Z( J, ICOL1+2 ) = Z( J, ICOL1+2 ) - SUM*T3 110 CONTINUE END IF ICOL1 = ICOL1 + 1 120 CONTINUE ELSE DO 130 J = ITMP1, ITMP2 SUM = A( J, ICOL1 ) + V2*A( J, ICOL1+1 ) + $ V3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM*T1 A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*T2 A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*T3 130 CONTINUE END IF END IF RETURN * * End of DLAREF * END scalapack-2.0.2/SRC/dlarrb2.f000644 000766 000024 00000047070 11657111056 016052 0ustar00juliestaff000000 000000 SUBROUTINE DLARRB2( N, D, LLD, IFIRST, ILAST, RTOL1, $ RTOL2, OFFSET, W, WGAP, WERR, WORK, IWORK, $ PIVMIN, LGPVMN, LGSPDM, TWIST, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IFIRST, ILAST, INFO, N, OFFSET, TWIST DOUBLE PRECISION LGPVMN, LGSPDM, PIVMIN, $ RTOL1, RTOL2 * .. * .. Array Arguments .. INTEGER IWORK( * ) DOUBLE PRECISION D( * ), LLD( * ), W( * ), $ WERR( * ), WGAP( * ), WORK( * ) * .. * * Purpose * ======= * * Given the relatively robust representation(RRR) L D L^T, DLARRB2 * does "limited" bisection to refine the eigenvalues of L D L^T, * W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial * guesses for these eigenvalues are input in W, the corresponding estimate * of the error in these guesses and their gaps are input in WERR * and WGAP, respectively. During bisection, intervals * [left, right] are maintained by storing their mid-points and * semi-widths in the arrays W and WERR respectively. * * NOTE: * There are very few minor differences between DLARRB from LAPACK * and this current subroutine DLARRB2. * The most important reason for creating this nearly identical copy * is profiling: in the ScaLAPACK MRRR algorithm, eigenvalue computation * using DLARRB2 is used for refinement in the construction of * the representation tree, as opposed to the initial computation of the * eigenvalues for the root RRR which uses DLARRB. When profiling, * this allows an easy quantification of refinement work vs. computing * eigenvalues of the root. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix. * * D (input) DOUBLE PRECISION array, dimension (N) * The N diagonal elements of the diagonal matrix D. * * LLD (input) DOUBLE PRECISION array, dimension (N-1) * The (N-1) elements L(i)*L(i)*D(i). * * IFIRST (input) INTEGER * The index of the first eigenvalue to be computed. * * ILAST (input) INTEGER * The index of the last eigenvalue to be computed. * * RTOL1 (input) DOUBLE PRECISION * RTOL2 (input) DOUBLE PRECISION * Tolerance for the convergence of the bisection intervals. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) * where GAP is the (estimated) distance to the nearest * eigenvalue. * * OFFSET (input) INTEGER * Offset for the arrays W, WGAP and WERR, i.e., the IFIRST-OFFSET * through ILAST-OFFSET elements of these arrays are to be used. * * W (input/output) DOUBLE PRECISION array, dimension (N) * On input, W( IFIRST-OFFSET ) through W( ILAST-OFFSET ) are * estimates of the eigenvalues of L D L^T indexed IFIRST through ILAST. * On output, these estimates are refined. * * WGAP (input/output) DOUBLE PRECISION array, dimension (N-1) * On input, the (estimated) gaps between consecutive * eigenvalues of L D L^T, i.e., WGAP(I-OFFSET) is the gap between * eigenvalues I and I+1. Note that if IFIRST.EQ.ILAST * then WGAP(IFIRST-OFFSET) must be set to ZERO. * On output, these gaps are refined. * * WERR (input/output) DOUBLE PRECISION array, dimension (N) * On input, WERR( IFIRST-OFFSET ) through WERR( ILAST-OFFSET ) are * the errors in the estimates of the corresponding elements in W. * On output, these errors are refined. * * WORK (workspace) DOUBLE PRECISION array, dimension (4*N) * Workspace. * * IWORK (workspace) INTEGER array, dimension (2*N) * Workspace. * * PIVMIN (input) DOUBLE PRECISION * The minimum pivot in the sturm sequence. * * LGPVMN (input) DOUBLE PRECISION * Logarithm of PIVMIN, precomputed. * * LGSPDM (input) DOUBLE PRECISION * Logarithm of the spectral diameter, precomputed. * * TWIST (input) INTEGER * The twist index for the twisted factorization that is used * for the negcount. * TWIST = N: Compute negcount from L D L^T - LAMBDA I = L+ D+ L+^T * TWIST = 1: Compute negcount from L D L^T - LAMBDA I = U- D- U-^T * TWIST = R: Compute negcount from L D L^T - LAMBDA I = N(r) D(r) N(r) * * INFO (output) INTEGER * Error flag. * * .. Parameters .. DOUBLE PRECISION ZERO, TWO, HALF PARAMETER ( ZERO = 0.0D0, TWO = 2.0D0, $ HALF = 0.5D0 ) INTEGER MAXITR * .. * .. Local Scalars .. INTEGER I, I1, II, INDLLD, IP, ITER, J, K, NEGCNT, $ NEXT, NINT, OLNINT, PREV, R DOUBLE PRECISION BACK, CVRGD, GAP, LEFT, LGAP, MID, MNWDTH, $ RGAP, RIGHT, SAVGAP, TMP, WIDTH LOGICAL PARANOID * .. * .. External Functions .. LOGICAL DISNAN DOUBLE PRECISION DLAMCH INTEGER DLANEG2A EXTERNAL DISNAN, DLAMCH, $ DLANEG2A * * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * INFO = 0 * * Turn on paranoid check for rounding errors * invalidating uncertainty intervals of eigenvalues * PARANOID = .TRUE. * MAXITR = INT( ( LGSPDM - LGPVMN ) / LOG( TWO ) ) + 2 MNWDTH = TWO * PIVMIN * R = TWIST * INDLLD = 2*N DO 5 J = 1, N-1 I=2*J WORK(INDLLD+I-1) = D(J) WORK(INDLLD+I) = LLD(J) 5 CONTINUE WORK(INDLLD+2*N-1) = D(N) * IF((R.LT.1).OR.(R.GT.N)) R = N * * Initialize unconverged intervals in [ WORK(2*I-1), WORK(2*I) ]. * The Sturm Count, Count( WORK(2*I-1) ) is arranged to be I-1, while * Count( WORK(2*I) ) is stored in IWORK( 2*I ). The integer IWORK( 2*I-1 ) * for an unconverged interval is set to the index of the next unconverged * interval, and is -1 or 0 for a converged interval. Thus a linked * list of unconverged intervals is set up. * I1 = IFIRST * The number of unconverged intervals NINT = 0 * The last unconverged interval found PREV = 0 RGAP = WGAP( I1-OFFSET ) DO 75 I = I1, ILAST K = 2*I II = I - OFFSET LEFT = W( II ) - WERR( II ) RIGHT = W( II ) + WERR( II ) LGAP = RGAP RGAP = WGAP( II ) GAP = MIN( LGAP, RGAP ) IF((ABS(LEFT).LE.16*PIVMIN).OR.(ABS(RIGHT).LE.16*PIVMIN)) $ THEN INFO = -1 RETURN ENDIF IF( PARANOID ) THEN * Make sure that [LEFT,RIGHT] contains the desired eigenvalue * Compute negcount from dstqds facto L+D+L+^T = L D L^T - LEFT * * Do while( NEGCNT(LEFT).GT.I-1 ) * BACK = WERR( II ) 20 CONTINUE NEGCNT = DLANEG2A( N, WORK(INDLLD+1), LEFT, PIVMIN, R ) IF( NEGCNT.GT.I-1 ) THEN LEFT = LEFT - BACK BACK = TWO*BACK GO TO 20 END IF * * Do while( NEGCNT(RIGHT).LT.I ) * Compute negcount from dstqds facto L+D+L+^T = L D L^T - RIGHT * BACK = WERR( II ) 50 CONTINUE NEGCNT = DLANEG2A( N, WORK(INDLLD+1),RIGHT, PIVMIN, R ) IF( NEGCNT.LT.I ) THEN RIGHT = RIGHT + BACK BACK = TWO*BACK GO TO 50 END IF ENDIF WIDTH = HALF*ABS( LEFT - RIGHT ) TMP = MAX( ABS( LEFT ), ABS( RIGHT ) ) CVRGD = MAX(RTOL1*GAP,RTOL2*TMP) IF( WIDTH.LE.CVRGD .OR. WIDTH.LE.MNWDTH ) THEN * This interval has already converged and does not need refinement. * (Note that the gaps might change through refining the * eigenvalues, however, they can only get bigger.) * Remove it from the list. IWORK( K-1 ) = -1 * Make sure that I1 always points to the first unconverged interval IF((I.EQ.I1).AND.(I.LT.ILAST)) I1 = I + 1 IF((PREV.GE.I1).AND.(I.LE.ILAST)) IWORK( 2*PREV-1 ) = I + 1 ELSE * unconverged interval found PREV = I NINT = NINT + 1 IWORK( K-1 ) = I + 1 IWORK( K ) = NEGCNT END IF WORK( K-1 ) = LEFT WORK( K ) = RIGHT 75 CONTINUE * * Do while( NINT.GT.0 ), i.e. there are still unconverged intervals * and while (ITER.LT.MAXITR) * ITER = 0 80 CONTINUE PREV = I1 - 1 I = I1 OLNINT = NINT DO 100 IP = 1, OLNINT K = 2*I II = I - OFFSET RGAP = WGAP( II ) LGAP = RGAP IF(II.GT.1) LGAP = WGAP( II-1 ) GAP = MIN( LGAP, RGAP ) NEXT = IWORK( K-1 ) LEFT = WORK( K-1 ) RIGHT = WORK( K ) MID = HALF*( LEFT + RIGHT ) * semiwidth of interval WIDTH = RIGHT - MID TMP = MAX( ABS( LEFT ), ABS( RIGHT ) ) CVRGD = MAX(RTOL1*GAP,RTOL2*TMP) IF( ( WIDTH.LE.CVRGD ) .OR. ( WIDTH.LE.MNWDTH ).OR. $ ( ITER.EQ.MAXITR ) )THEN * reduce number of unconverged intervals NINT = NINT - 1 * Mark interval as converged. IWORK( K-1 ) = 0 IF( I1.EQ.I ) THEN I1 = NEXT ELSE * Prev holds the last unconverged interval previously examined IF(PREV.GE.I1) IWORK( 2*PREV-1 ) = NEXT END IF I = NEXT GO TO 100 END IF PREV = I * * Perform one bisection step * NEGCNT = DLANEG2A( N, WORK(INDLLD+1), MID, PIVMIN, R ) IF( NEGCNT.LE.I-1 ) THEN WORK( K-1 ) = MID ELSE WORK( K ) = MID END IF I = NEXT 100 CONTINUE ITER = ITER + 1 * do another loop if there are still unconverged intervals * However, in the last iteration, all intervals are accepted * since this is the best we can do. IF( ( NINT.GT.0 ).AND.(ITER.LE.MAXITR) ) GO TO 80 * * * At this point, all the intervals have converged * * save this gap to restore it after the loop SAVGAP = WGAP( ILAST-OFFSET ) * LEFT = WORK( 2*IFIRST-1 ) DO 110 I = IFIRST, ILAST K = 2*I II = I - OFFSET * RIGHT is the right boundary of this current interval RIGHT = WORK( K ) * All intervals marked by '0' have been refined. IF( IWORK( K-1 ).EQ.0 ) THEN W( II ) = HALF*( LEFT+RIGHT ) WERR( II ) = RIGHT - W( II ) END IF * Left is the boundary of the next interval LEFT = WORK( K +1 ) WGAP( II ) = MAX( ZERO, LEFT - RIGHT ) 110 CONTINUE * restore the last gap which was overwritten by garbage WGAP( ILAST-OFFSET ) = SAVGAP RETURN * * End of DLARRB2 * END * * * FUNCTION DLANEG2( N, D, LLD, SIGMA, PIVMIN, R ) * IMPLICIT NONE * INTEGER DLANEG2 * * .. Scalar Arguments .. INTEGER N, R DOUBLE PRECISION PIVMIN, SIGMA * .. * .. Array Arguments .. DOUBLE PRECISION D( * ), LLD( * ) * DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) INTEGER BLKLEN PARAMETER ( BLKLEN = 2048 ) * .. * .. Local Scalars .. INTEGER BJ, J, NEG1, NEG2, NEGCNT, TO DOUBLE PRECISION DMINUS, DPLUS, GAMMA, P, S, T, TMP, XSAV LOGICAL SAWNAN * .. * .. External Functions .. LOGICAL DISNAN EXTERNAL DISNAN NEGCNT = 0 * * I) upper part: L D L^T - SIGMA I = L+ D+ L+^T * run dstqds block-wise to avoid excessive work when NaNs occur * S = ZERO DO 210 BJ = 1, R-1, BLKLEN NEG1 = 0 XSAV = S TO = BJ+BLKLEN-1 IF ( TO.LE.R-1 ) THEN DO 21 J = BJ, TO T = S - SIGMA DPLUS = D( J ) + T IF( DPLUS.LT.ZERO ) NEG1=NEG1 + 1 S = T*LLD( J ) / DPLUS 21 CONTINUE ELSE DO 22 J = BJ, R-1 T = S - SIGMA DPLUS = D( J ) + T IF( DPLUS.LT.ZERO ) NEG1=NEG1 + 1 S = T*LLD( J ) / DPLUS 22 CONTINUE ENDIF SAWNAN = DISNAN( S ) * IF( SAWNAN ) THEN NEG1 = 0 S = XSAV TO = BJ+BLKLEN-1 IF ( TO.LE.R-1 ) THEN DO 23 J = BJ, TO T = S - SIGMA DPLUS = D( J ) + T IF(ABS(DPLUS).LT.PIVMIN) $ DPLUS = -PIVMIN TMP = LLD( J ) / DPLUS IF( DPLUS.LT.ZERO ) $ NEG1 = NEG1 + 1 S = T*TMP IF( TMP.EQ.ZERO ) S = LLD( J ) 23 CONTINUE ELSE DO 24 J = BJ, R-1 T = S - SIGMA DPLUS = D( J ) + T IF(ABS(DPLUS).LT.PIVMIN) $ DPLUS = -PIVMIN TMP = LLD( J ) / DPLUS IF( DPLUS.LT.ZERO ) NEG1=NEG1+1 S = T*TMP IF( TMP.EQ.ZERO ) S = LLD( J ) 24 CONTINUE ENDIF END IF NEGCNT = NEGCNT + NEG1 210 CONTINUE * * II) lower part: L D L^T - SIGMA I = U- D- U-^T * P = D( N ) - SIGMA DO 230 BJ = N-1, R, -BLKLEN NEG2 = 0 XSAV = P TO = BJ-BLKLEN+1 IF ( TO.GE.R ) THEN DO 25 J = BJ, TO, -1 DMINUS = LLD( J ) + P IF( DMINUS.LT.ZERO ) NEG2=NEG2+1 TMP = P / DMINUS P = TMP * D( J ) - SIGMA 25 CONTINUE ELSE DO 26 J = BJ, R, -1 DMINUS = LLD( J ) + P IF( DMINUS.LT.ZERO ) NEG2=NEG2+1 TMP = P / DMINUS P = TMP * D( J ) - SIGMA 26 CONTINUE ENDIF SAWNAN = DISNAN( P ) * IF( SAWNAN ) THEN NEG2 = 0 P = XSAV TO = BJ-BLKLEN+1 IF ( TO.GE.R ) THEN DO 27 J = BJ, TO, -1 DMINUS = LLD( J ) + P IF(ABS(DMINUS).LT.PIVMIN) $ DMINUS = -PIVMIN TMP = D( J ) / DMINUS IF( DMINUS.LT.ZERO ) $ NEG2 = NEG2 + 1 P = P*TMP - SIGMA IF( TMP.EQ.ZERO ) $ P = D( J ) - SIGMA 27 CONTINUE ELSE DO 28 J = BJ, R, -1 DMINUS = LLD( J ) + P IF(ABS(DMINUS).LT.PIVMIN) $ DMINUS = -PIVMIN TMP = D( J ) / DMINUS IF( DMINUS.LT.ZERO ) $ NEG2 = NEG2 + 1 P = P*TMP - SIGMA IF( TMP.EQ.ZERO ) $ P = D( J ) - SIGMA 28 CONTINUE ENDIF END IF NEGCNT = NEGCNT + NEG2 230 CONTINUE * * III) Twist index * GAMMA = S + P IF( GAMMA.LT.ZERO ) NEGCNT = NEGCNT+1 DLANEG2 = NEGCNT END * * * FUNCTION DLANEG2A( N, DLLD, SIGMA, PIVMIN, R ) * IMPLICIT NONE * INTEGER DLANEG2A * * .. Scalar Arguments .. INTEGER N, R DOUBLE PRECISION PIVMIN, SIGMA * .. * .. Array Arguments .. DOUBLE PRECISION DLLD( * ) * DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) INTEGER BLKLEN PARAMETER ( BLKLEN = 512 ) * * .. * .. Intrinsic Functions .. INTRINSIC INT * .. * .. Local Scalars .. INTEGER BJ, I, J, NB, NEG1, NEG2, NEGCNT, NX DOUBLE PRECISION DMINUS, DPLUS, GAMMA, P, S, T, TMP, XSAV LOGICAL SAWNAN * .. * .. External Functions .. LOGICAL DISNAN EXTERNAL DISNAN NEGCNT = 0 * * I) upper part: L D L^T - SIGMA I = L+ D+ L+^T * run dstqds block-wise to avoid excessive work when NaNs occur, * first in chunks of size BLKLEN and then the remainder * NB = INT((R-1)/BLKLEN) NX = NB*BLKLEN S = ZERO DO 210 BJ = 1, NX, BLKLEN NEG1 = 0 XSAV = S DO 21 J = BJ, BJ+BLKLEN-1 I = 2*J T = S - SIGMA DPLUS = DLLD( I-1 ) + T IF( DPLUS.LT.ZERO ) NEG1=NEG1 + 1 S = T*DLLD( I ) / DPLUS 21 CONTINUE SAWNAN = DISNAN( S ) * IF( SAWNAN ) THEN NEG1 = 0 S = XSAV DO 23 J = BJ, BJ+BLKLEN-1 I = 2*J T = S - SIGMA DPLUS = DLLD( I-1 ) + T IF(ABS(DPLUS).LT.PIVMIN) $ DPLUS = -PIVMIN TMP = DLLD( I ) / DPLUS IF( DPLUS.LT.ZERO ) $ NEG1 = NEG1 + 1 S = T*TMP IF( TMP.EQ.ZERO ) S = DLLD( I ) 23 CONTINUE END IF NEGCNT = NEGCNT + NEG1 210 CONTINUE * NEG1 = 0 XSAV = S DO 22 J = NX+1, R-1 I = 2*J T = S - SIGMA DPLUS = DLLD( I-1 ) + T IF( DPLUS.LT.ZERO ) NEG1=NEG1 + 1 S = T*DLLD( I ) / DPLUS 22 CONTINUE SAWNAN = DISNAN( S ) * IF( SAWNAN ) THEN NEG1 = 0 S = XSAV DO 24 J = NX+1, R-1 I = 2*J T = S - SIGMA DPLUS = DLLD( I-1 ) + T IF(ABS(DPLUS).LT.PIVMIN) $ DPLUS = -PIVMIN TMP = DLLD( I ) / DPLUS IF( DPLUS.LT.ZERO ) NEG1=NEG1+1 S = T*TMP IF( TMP.EQ.ZERO ) S = DLLD( I ) 24 CONTINUE ENDIF NEGCNT = NEGCNT + NEG1 * * II) lower part: L D L^T - SIGMA I = U- D- U-^T * NB = INT((N-R)/BLKLEN) NX = N-NB*BLKLEN P = DLLD( 2*N-1 ) - SIGMA DO 230 BJ = N-1, NX, -BLKLEN NEG2 = 0 XSAV = P DO 25 J = BJ, BJ-BLKLEN+1, -1 I = 2*J DMINUS = DLLD( I ) + P IF( DMINUS.LT.ZERO ) NEG2=NEG2+1 TMP = P / DMINUS P = TMP * DLLD( I-1 ) - SIGMA 25 CONTINUE SAWNAN = DISNAN( P ) * IF( SAWNAN ) THEN NEG2 = 0 P = XSAV DO 27 J = BJ, BJ-BLKLEN+1, -1 I = 2*J DMINUS = DLLD( I ) + P IF(ABS(DMINUS).LT.PIVMIN) $ DMINUS = -PIVMIN TMP = DLLD( I-1 ) / DMINUS IF( DMINUS.LT.ZERO ) $ NEG2 = NEG2 + 1 P = P*TMP - SIGMA IF( TMP.EQ.ZERO ) $ P = DLLD( I-1 ) - SIGMA 27 CONTINUE END IF NEGCNT = NEGCNT + NEG2 230 CONTINUE NEG2 = 0 XSAV = P DO 26 J = NX-1, R, -1 I = 2*J DMINUS = DLLD( I ) + P IF( DMINUS.LT.ZERO ) NEG2=NEG2+1 TMP = P / DMINUS P = TMP * DLLD( I-1 ) - SIGMA 26 CONTINUE SAWNAN = DISNAN( P ) * IF( SAWNAN ) THEN NEG2 = 0 P = XSAV DO 28 J = NX-1, R, -1 I = 2*J DMINUS = DLLD( I ) + P IF(ABS(DMINUS).LT.PIVMIN) $ DMINUS = -PIVMIN TMP = DLLD( I-1 ) / DMINUS IF( DMINUS.LT.ZERO ) $ NEG2 = NEG2 + 1 P = P*TMP - SIGMA IF( TMP.EQ.ZERO ) $ P = DLLD( I-1 ) - SIGMA 28 CONTINUE END IF NEGCNT = NEGCNT + NEG2 * * III) Twist index * GAMMA = S + P IF( GAMMA.LT.ZERO ) NEGCNT = NEGCNT+1 DLANEG2A = NEGCNT END scalapack-2.0.2/SRC/dlarrd2.f000644 000766 000024 00000060025 11657111056 016047 0ustar00juliestaff000000 000000 SUBROUTINE DLARRD2( RANGE, ORDER, N, VL, VU, IL, IU, GERS, $ RELTOL, D, E, E2, PIVMIN, NSPLIT, ISPLIT, $ M, W, WERR, WL, WU, IBLOCK, INDEXW, $ WORK, IWORK, DOL, DOU, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ of Colorado Denver * July 4, 2010 * * .. Scalar Arguments .. CHARACTER ORDER, RANGE INTEGER DOL, DOU, IL, INFO, IU, M, N, NSPLIT DOUBLE PRECISION PIVMIN, RELTOL, VL, VU, WL, WU * .. * .. Array Arguments .. INTEGER IBLOCK( * ), INDEXW( * ), $ ISPLIT( * ), IWORK( * ) DOUBLE PRECISION D( * ), E( * ), E2( * ), $ GERS( * ), W( * ), WERR( * ), WORK( * ) * .. * * Purpose * ======= * * DLARRD2 computes the eigenvalues of a symmetric tridiagonal * matrix T to limited initial accuracy. This is an auxiliary code to be * called from DLARRE2A. * * DLARRD2 has been created using the LAPACK code DLARRD * which itself stems from DSTEBZ. The motivation for creating * DLARRD2 is efficiency: When computing eigenvalues in parallel * and the input tridiagonal matrix splits into blocks, DLARRD2 * can skip over blocks which contain none of the eigenvalues from * DOL to DOU for which the processor responsible. In extreme cases (such * as large matrices consisting of many blocks of small size, e.g. 2x2, * the gain can be substantial. * * Arguments * ========= * * RANGE (input) CHARACTER * = 'A': ("All") all eigenvalues will be found. * = 'V': ("Value") all eigenvalues in the half-open interval * (VL, VU] will be found. * = 'I': ("Index") the IL-th through IU-th eigenvalues (of the * entire matrix) will be found. * * ORDER (input) CHARACTER * = 'B': ("By Block") the eigenvalues will be grouped by * split-off block (see IBLOCK, ISPLIT) and * ordered from smallest to largest within * the block. * = 'E': ("Entire matrix") * the eigenvalues for the entire matrix * will be ordered from smallest to * largest. * * N (input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * VL (input) DOUBLE PRECISION * VU (input) DOUBLE PRECISION * If RANGE='V', the lower and upper bounds of the interval to * be searched for eigenvalues. Eigenvalues less than or equal * to VL, or greater than VU, will not be returned. VL < VU. * Not referenced if RANGE = 'A' or 'I'. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. * Not referenced if RANGE = 'A' or 'V'. * * GERS (input) DOUBLE PRECISION array, dimension (2*N) * The N Gerschgorin intervals (the i-th Gerschgorin interval * is (GERS(2*i-1), GERS(2*i)). * * RELTOL (input) DOUBLE PRECISION * The minimum relative width of an interval. When an interval * is narrower than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be * sufficiently small, i.e., converged. Note: this should * always be at least radix*machine epsilon. * * D (input) DOUBLE PRECISION array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. * * E (input) DOUBLE PRECISION array, dimension (N-1) * The (n-1) off-diagonal elements of the tridiagonal matrix T. * * E2 (input) DOUBLE PRECISION array, dimension (N-1) * The (n-1) squared off-diagonal elements of the tridiagonal matrix T. * * PIVMIN (input) DOUBLE PRECISION * The minimum pivot allowed in the sturm sequence for T. * * NSPLIT (input) INTEGER * The number of diagonal blocks in the matrix T. * 1 <= NSPLIT <= N. * * ISPLIT (input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N. * (Only the first NSPLIT elements will actually be used, but * since the user cannot know a priori what value NSPLIT will * have, N words must be reserved for ISPLIT.) * * M (output) INTEGER * The actual number of eigenvalues found. 0 <= M <= N. * (See also the description of INFO=2,3.) * * W (output) DOUBLE PRECISION array, dimension (N) * On exit, the first M elements of W will contain the * eigenvalue approximations. DLARRD2 computes an interval * I_j = (a_j, b_j] that includes eigenvalue j. The eigenvalue * approximation is given as the interval midpoint * W(j)= ( a_j + b_j)/2. The corresponding error is bounded by * WERR(j) = abs( a_j - b_j)/2 * * WERR (output) DOUBLE PRECISION array, dimension (N) * The error bound on the corresponding eigenvalue approximation * in W. * * WL (output) DOUBLE PRECISION * WU (output) DOUBLE PRECISION * The interval (WL, WU] contains all the wanted eigenvalues. * If RANGE='V', then WL=VL and WU=VU. * If RANGE='A', then WL and WU are the global Gerschgorin bounds * on the spectrum. * If RANGE='I', then WL and WU are computed by DLAEBZ from the * index range specified. * * IBLOCK (output) INTEGER array, dimension (N) * At each row/column j where E(j) is zero or small, the * matrix T is considered to split into a block diagonal * matrix. On exit, if INFO = 0, IBLOCK(i) specifies to which * block (from 1 to the number of blocks) the eigenvalue W(i) * belongs. (DLARRD2 may use the remaining N-M elements as * workspace.) * * INDEXW (output) INTEGER array, dimension (N) * The indices of the eigenvalues within each block (submatrix); * for example, INDEXW(i)= j and IBLOCK(i)=k imply that the * i-th eigenvalue W(i) is the j-th eigenvalue in block k. * * WORK (workspace) DOUBLE PRECISION array, dimension (4*N) * * IWORK (workspace) INTEGER array, dimension (3*N) * * DOL (input) INTEGER * DOU (input) INTEGER * If the user wants to work on only a selected part of the * representation tree, he can specify an index range DOL:DOU. * Otherwise, the setting DOL=1, DOU=N should be applied. * Note that DOL and DOU refer to the order in which the eigenvalues * are stored in W. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: some or all of the eigenvalues failed to converge or * were not computed: * =1 or 3: Bisection failed to converge for some * eigenvalues; these eigenvalues are flagged by a * negative block number. The effect is that the * eigenvalues may not be as accurate as the * absolute and relative tolerances. This is * generally caused by unexpectedly inaccurate * arithmetic. * =2 or 3: RANGE='I' only: Not all of the eigenvalues * IL:IU were found. * Effect: M < IU+1-IL * Cause: non-monotonic arithmetic, causing the * Sturm sequence to be non-monotonic. * Cure: recalculate, using RANGE='A', and pick * out eigenvalues IL:IU. In some cases, * increasing the PARAMETER "FUDGE" may * make things work. * = 4: RANGE='I', and the Gershgorin interval * initially used was too small. No eigenvalues * were computed. * Probable cause: your machine has sloppy * floating-point arithmetic. * Cure: Increase the PARAMETER "FUDGE", * recompile, and try again. * * Internal Parameters * =================== * * FUDGE DOUBLE PRECISION, default = 2 originally, increased to 10. * A "fudge factor" to widen the Gershgorin intervals. Ideally, * a value of 1 should work, but on machines with sloppy * arithmetic, this needs to be larger. The default for * publicly released versions should be large enough to handle * the worst machine around. Note that this has no effect * on accuracy of the solution. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, TWO, HALF, FUDGE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, $ TWO = 2.0D0, HALF = ONE/TWO, $ FUDGE = 10.0D0 ) * .. * .. Local Scalars .. LOGICAL NCNVRG, TOOFEW INTEGER I, IB, IBEGIN, IDISCL, IDISCU, IE, IEND, IINFO, $ IM, IN, IOFF, IORDER, IOUT, IRANGE, ITMAX, $ ITMP1, ITMP2, IW, IWOFF, J, JBLK, JDISC, JE, $ JEE, NB, NWL, NWU DOUBLE PRECISION ATOLI, EPS, GL, GU, RTOLI, SPDIAM, TMP1, TMP2, $ TNORM, UFLOW, WKILL, WLU, WUL * .. * .. Local Arrays .. INTEGER IDUMMA( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV DOUBLE PRECISION DLAMCH EXTERNAL LSAME, ILAENV, DLAMCH * .. * .. External Subroutines .. EXTERNAL DLAEBZ * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN, SQRT * .. * .. Executable Statements .. * INFO = 0 * * Decode RANGE * IF( LSAME( RANGE, 'A' ) ) THEN IRANGE = 1 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IRANGE = 2 ELSE IF( LSAME( RANGE, 'I' ) ) THEN IRANGE = 3 ELSE IRANGE = 0 END IF * * Decode ORDER * IF( LSAME( ORDER, 'B' ) ) THEN IORDER = 2 ELSE IF( LSAME( ORDER, 'E' ) ) THEN IORDER = 1 ELSE IORDER = 0 END IF * * Check for Errors * IF( IRANGE.LE.0 ) THEN INFO = -1 ELSE IF( IORDER.LE.0 ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IRANGE.EQ.2 ) THEN IF( VL.GE.VU ) $ INFO = -5 ELSE IF( IRANGE.EQ.3 .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -6 ELSE IF( IRANGE.EQ.3 .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -7 END IF * IF( INFO.NE.0 ) THEN RETURN END IF * Initialize error flags INFO = 0 NCNVRG = .FALSE. TOOFEW = .FALSE. * Quick return if possible M = 0 IF( N.EQ.0 ) RETURN * Simplification: IF( IRANGE.EQ.3 .AND. IL.EQ.1 .AND. IU.EQ.N ) IRANGE = 1 * Get machine constants EPS = DLAMCH( 'P' ) UFLOW = DLAMCH( 'U' ) * Special Case when N=1 * Treat case of 1x1 matrix for quick return IF( N.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR. $ ((IRANGE.EQ.2).AND.(D(1).GT.VL).AND.(D(1).LE.VU)).OR. $ ((IRANGE.EQ.3).AND.(IL.EQ.1).AND.(IU.EQ.1)) ) THEN M = 1 W(1) = D(1) * The computation error of the eigenvalue is zero WERR(1) = ZERO IBLOCK( 1 ) = 1 INDEXW( 1 ) = 1 ENDIF RETURN END IF * NB is the minimum vector length for vector bisection, or 0 * if only scalar is to be done. NB = ILAENV( 1, 'DSTEBZ', ' ', N, -1, -1, -1 ) IF( NB.LE.1 ) NB = 0 * Find global spectral radius GL = D(1) GU = D(1) DO 5 I = 1,N GL = MIN( GL, GERS( 2*I - 1)) GU = MAX( GU, GERS(2*I) ) 5 CONTINUE * Compute global Gerschgorin bounds and spectral diameter TNORM = MAX( ABS( GL ), ABS( GU ) ) GL = GL - FUDGE*TNORM*EPS*N - FUDGE*TWO*PIVMIN GU = GU + FUDGE*TNORM*EPS*N + FUDGE*TWO*PIVMIN SPDIAM = GU - GL * Input arguments for DLAEBZ: * The relative tolerance. An interval (a,b] lies within * "relative tolerance" if b-a < RELTOL*max(|a|,|b|), RTOLI = RELTOL ATOLI = FUDGE*TWO*UFLOW + FUDGE*TWO*PIVMIN IF( IRANGE.EQ.3 ) THEN * RANGE='I': Compute an interval containing eigenvalues * IL through IU. The initial interval [GL,GU] from the global * Gerschgorin bounds GL and GU is refined by DLAEBZ. ITMAX = INT( ( LOG( TNORM+PIVMIN )-LOG( PIVMIN ) ) / $ LOG( TWO ) ) + 2 WORK( N+1 ) = GL WORK( N+2 ) = GL WORK( N+3 ) = GU WORK( N+4 ) = GU WORK( N+5 ) = GL WORK( N+6 ) = GU IWORK( 1 ) = -1 IWORK( 2 ) = -1 IWORK( 3 ) = N + 1 IWORK( 4 ) = N + 1 IWORK( 5 ) = IL - 1 IWORK( 6 ) = IU * CALL DLAEBZ( 3, ITMAX, N, 2, 2, NB, ATOLI, RTOLI, PIVMIN, $ D, E, E2, IWORK( 5 ), WORK( N+1 ), WORK( N+5 ), IOUT, $ IWORK, W, IBLOCK, IINFO ) IF( IINFO .NE. 0 ) THEN INFO = IINFO RETURN END IF * On exit, output intervals may not be ordered by ascending negcount IF( IWORK( 6 ).EQ.IU ) THEN WL = WORK( N+1 ) WLU = WORK( N+3 ) NWL = IWORK( 1 ) WU = WORK( N+4 ) WUL = WORK( N+2 ) NWU = IWORK( 4 ) ELSE WL = WORK( N+2 ) WLU = WORK( N+4 ) NWL = IWORK( 2 ) WU = WORK( N+3 ) WUL = WORK( N+1 ) NWU = IWORK( 3 ) END IF * On exit, the interval [WL, WLU] contains a value with negcount NWL, * and [WUL, WU] contains a value with negcount NWU. IF( NWL.LT.0 .OR. NWL.GE.N .OR. NWU.LT.1 .OR. NWU.GT.N ) THEN INFO = 4 RETURN END IF ELSEIF( IRANGE.EQ.2 ) THEN WL = VL WU = VU ELSEIF( IRANGE.EQ.1 ) THEN WL = GL WU = GU ENDIF * Find Eigenvalues -- Loop Over blocks and recompute NWL and NWU. * NWL accumulates the number of eigenvalues .le. WL, * NWU accumulates the number of eigenvalues .le. WU M = 0 IEND = 0 INFO = 0 NWL = 0 NWU = 0 * DO 70 JBLK = 1, NSPLIT IOFF = IEND IBEGIN = IOFF + 1 IEND = ISPLIT( JBLK ) IN = IEND - IOFF * IF( IRANGE.EQ.1 ) THEN IF( (IEND.LT.DOL).OR.(IBEGIN.GT.DOU) ) THEN * the local block contains none of eigenvalues that matter * to this processor NWU = NWU + IN DO 30 J = 1, IN M = M + 1 IBLOCK( M ) = JBLK 30 CONTINUE GO TO 70 END IF END IF IF( IN.EQ.1 ) THEN * 1x1 block IF( WL.GE.D( IBEGIN )-PIVMIN ) $ NWL = NWL + 1 IF( WU.GE.D( IBEGIN )-PIVMIN ) $ NWU = NWU + 1 IF( IRANGE.EQ.1 .OR. ( WL.LT.D( IBEGIN )-PIVMIN .AND. WU.GE. $ D( IBEGIN )-PIVMIN ) ) THEN M = M + 1 W( M ) = D( IBEGIN ) WERR(M) = ZERO * The gap for a single block doesn't matter for the later * algorithm and is assigned an arbitrary large value IBLOCK( M ) = JBLK INDEXW( M ) = 1 END IF ELSE * General Case - block of size IN > 2 * Compute local Gerschgorin interval and use it as the initial * interval for DLAEBZ GU = D( IBEGIN ) GL = D( IBEGIN ) TMP1 = ZERO DO 40 J = IBEGIN, IEND GL = MIN( GL, GERS( 2*J - 1)) GU = MAX( GU, GERS(2*J) ) 40 CONTINUE SPDIAM = GU - GL GL = GL - FUDGE*TNORM*EPS*IN - FUDGE*PIVMIN GU = GU + FUDGE*TNORM*EPS*IN + FUDGE*PIVMIN * IF( IRANGE.GT.1 ) THEN IF( GU.LT.WL ) THEN * the local block contains none of the wanted eigenvalues NWL = NWL + IN NWU = NWU + IN GO TO 70 END IF * refine search interval if possible, only range (WL,WU] matters GL = MAX( GL, WL ) GU = MIN( GU, WU ) IF( GL.GE.GU ) $ GO TO 70 END IF * Find negcount of initial interval boundaries GL and GU WORK( N+1 ) = GL WORK( N+IN+1 ) = GU CALL DLAEBZ( 1, 0, IN, IN, 1, NB, ATOLI, RTOLI, PIVMIN, $ D( IBEGIN ), E( IBEGIN ), E2( IBEGIN ), $ IDUMMA, WORK( N+1 ), WORK( N+2*IN+1 ), IM, $ IWORK, W( M+1 ), IBLOCK( M+1 ), IINFO ) IF( IINFO .NE. 0 ) THEN INFO = IINFO RETURN END IF * NWL = NWL + IWORK( 1 ) NWU = NWU + IWORK( IN+1 ) IWOFF = M - IWORK( 1 ) * Compute Eigenvalues ITMAX = INT( ( LOG( GU-GL+PIVMIN )-LOG( PIVMIN ) ) / $ LOG( TWO ) ) + 2 CALL DLAEBZ( 2, ITMAX, IN, IN, 1, NB, ATOLI, RTOLI, PIVMIN, $ D( IBEGIN ), E( IBEGIN ), E2( IBEGIN ), $ IDUMMA, WORK( N+1 ), WORK( N+2*IN+1 ), IOUT, $ IWORK, W( M+1 ), IBLOCK( M+1 ), IINFO ) IF( IINFO .NE. 0 ) THEN INFO = IINFO RETURN END IF * * Copy eigenvalues into W and IBLOCK * Use -JBLK for block number for unconverged eigenvalues. * Loop over the number of output intervals from DLAEBZ DO 60 J = 1, IOUT * eigenvalue approximation is middle point of interval TMP1 = HALF*( WORK( J+N )+WORK( J+IN+N ) ) * semi length of error interval TMP2 = HALF*ABS( WORK( J+N )-WORK( J+IN+N ) ) IF( J.GT.IOUT-IINFO ) THEN * Flag non-convergence. NCNVRG = .TRUE. IB = -JBLK ELSE IB = JBLK END IF DO 50 JE = IWORK( J ) + 1 + IWOFF, $ IWORK( J+IN ) + IWOFF W( JE ) = TMP1 WERR( JE ) = TMP2 INDEXW( JE ) = JE - IWOFF IBLOCK( JE ) = IB 50 CONTINUE 60 CONTINUE * M = M + IM END IF 70 CONTINUE * If RANGE='I', then (WL,WU) contains eigenvalues NWL+1,...,NWU * If NWL+1 < IL or NWU > IU, discard extra eigenvalues. IF( IRANGE.EQ.3 ) THEN IDISCL = IL - 1 - NWL IDISCU = NWU - IU * IF( IDISCL.GT.0 ) THEN IM = 0 DO 80 JE = 1, M * Remove some of the smallest eigenvalues from the left so that * at the end IDISCL =0. Move all eigenvalues up to the left. IF( W( JE ).LE.WLU .AND. IDISCL.GT.0 ) THEN IDISCL = IDISCL - 1 ELSE IM = IM + 1 W( IM ) = W( JE ) WERR( IM ) = WERR( JE ) INDEXW( IM ) = INDEXW( JE ) IBLOCK( IM ) = IBLOCK( JE ) END IF 80 CONTINUE M = IM END IF IF( IDISCU.GT.0 ) THEN * Remove some of the largest eigenvalues from the right so that * at the end IDISCU =0. Move all eigenvalues up to the left. IM=M+1 DO 81 JE = M, 1, -1 IF( W( JE ).GE.WUL .AND. IDISCU.GT.0 ) THEN IDISCU = IDISCU - 1 ELSE IM = IM - 1 W( IM ) = W( JE ) WERR( IM ) = WERR( JE ) INDEXW( IM ) = INDEXW( JE ) IBLOCK( IM ) = IBLOCK( JE ) END IF 81 CONTINUE JEE = 0 DO 82 JE = IM, M JEE = JEE + 1 W( JEE ) = W( JE ) WERR( JEE ) = WERR( JE ) INDEXW( JEE ) = INDEXW( JE ) IBLOCK( JEE ) = IBLOCK( JE ) 82 CONTINUE M = M-IM+1 END IF IF( IDISCL.GT.0 .OR. IDISCU.GT.0 ) THEN * Code to deal with effects of bad arithmetic. (If N(w) is * monotone non-decreasing, this should never happen.) * Some low eigenvalues to be discarded are not in (WL,WLU], * or high eigenvalues to be discarded are not in (WUL,WU] * so just kill off the smallest IDISCL/largest IDISCU * eigenvalues, by marking the corresponding IBLOCK = 0 IF( IDISCL.GT.0 ) THEN WKILL = WU DO 100 JDISC = 1, IDISCL IW = 0 DO 90 JE = 1, M IF( IBLOCK( JE ).NE.0 .AND. $ ( W( JE ).LT.WKILL .OR. IW.EQ.0 ) ) THEN IW = JE WKILL = W( JE ) END IF 90 CONTINUE IBLOCK( IW ) = 0 100 CONTINUE END IF IF( IDISCU.GT.0 ) THEN WKILL = WL DO 120 JDISC = 1, IDISCU IW = 0 DO 110 JE = 1, M IF( IBLOCK( JE ).NE.0 .AND. $ ( W( JE ).GE.WKILL .OR. IW.EQ.0 ) ) THEN IW = JE WKILL = W( JE ) END IF 110 CONTINUE IBLOCK( IW ) = 0 120 CONTINUE END IF * Now erase all eigenvalues with IBLOCK set to zero IM = 0 DO 130 JE = 1, M IF( IBLOCK( JE ).NE.0 ) THEN IM = IM + 1 W( IM ) = W( JE ) WERR( IM ) = WERR( JE ) INDEXW( IM ) = INDEXW( JE ) IBLOCK( IM ) = IBLOCK( JE ) END IF 130 CONTINUE M = IM END IF IF( IDISCL.LT.0 .OR. IDISCU.LT.0 ) THEN TOOFEW = .TRUE. END IF END IF * IF(( IRANGE.EQ.1 .AND. M.NE.N ).OR. $ ( IRANGE.EQ.3 .AND. M.NE.IU-IL+1 ) ) THEN TOOFEW = .TRUE. END IF * If ORDER='B',(IBLOCK = 2), do nothing the eigenvalues are already sorted * by block. * If ORDER='E',(IBLOCK = 1), sort the eigenvalues from smallest to largest IF( IORDER.EQ.1 .AND. NSPLIT.GT.1 ) THEN DO 150 JE = 1, M - 1 IE = 0 TMP1 = W( JE ) DO 140 J = JE + 1, M IF( W( J ).LT.TMP1 ) THEN IE = J TMP1 = W( J ) END IF 140 CONTINUE IF( IE.NE.0 ) THEN TMP2 = WERR( IE ) ITMP1 = IBLOCK( IE ) ITMP2 = INDEXW( IE ) W( IE ) = W( JE ) WERR( IE ) = WERR( JE ) IBLOCK( IE ) = IBLOCK( JE ) INDEXW( IE ) = INDEXW( JE ) W( JE ) = TMP1 WERR( JE ) = TMP2 IBLOCK( JE ) = ITMP1 INDEXW( JE ) = ITMP2 END IF 150 CONTINUE END IF * INFO = 0 IF( NCNVRG ) $ INFO = INFO + 1 IF( TOOFEW ) $ INFO = INFO + 2 RETURN * * End of DLARRD2 * END scalapack-2.0.2/SRC/dlarre2.f000644 000766 000024 00000070777 11657111056 016067 0ustar00juliestaff000000 000000 SUBROUTINE DLARRE2( RANGE, N, VL, VU, IL, IU, D, E, E2, $ RTOL1, RTOL2, SPLTOL, NSPLIT, ISPLIT, $ M, DOL, DOU, $ W, WERR, WGAP, IBLOCK, INDEXW, GERS, PIVMIN, $ WORK, IWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ of Colorado Denver * July 4, 2010 * * .. Scalar Arguments .. CHARACTER RANGE INTEGER DOL, DOU, IL, INFO, IU, M, N, NSPLIT DOUBLE PRECISION PIVMIN, RTOL1, RTOL2, SPLTOL, VL, VU * .. * .. Array Arguments .. INTEGER IBLOCK( * ), ISPLIT( * ), IWORK( * ), $ INDEXW( * ) DOUBLE PRECISION D( * ), E( * ), E2( * ), GERS( * ), $ W( * ),WERR( * ), WGAP( * ), WORK( * ) * * Purpose * ======= * * To find the desired eigenvalues of a given real symmetric * tridiagonal matrix T, DLARRE2 sets, via DLARRA, * "small" off-diagonal elements to zero. For each block T_i, it finds * (a) a suitable shift at one end of the block's spectrum, * (b) the root RRR, T_i - sigma_i I = L_i D_i L_i^T, and * (c) eigenvalues of each L_i D_i L_i^T. * The representations and eigenvalues found are then returned to * DSTEGR2 to compute the eigenvectors T. * * DLARRE2 is more suitable for parallel computation than the * original LAPACK code for computing the root RRR and its eigenvalues. * When computing eigenvalues in parallel and the input tridiagonal * matrix splits into blocks, DLARRE2 * can skip over blocks which contain none of the eigenvalues from * DOL to DOU for which the processor responsible. In extreme cases (such * as large matrices consisting of many blocks of small size, e.g. 2x2, * the gain can be substantial. * * Arguments * ========= * * RANGE (input) CHARACTER * = 'A': ("All") all eigenvalues will be found. * = 'V': ("Value") all eigenvalues in the half-open interval * (VL, VU] will be found. * = 'I': ("Index") the IL-th through IU-th eigenvalues (of the * entire matrix) will be found. * * N (input) INTEGER * The order of the matrix. N > 0. * * VL (input/output) DOUBLE PRECISION * VU (input/output) DOUBLE PRECISION * If RANGE='V', the lower and upper bounds for the eigenvalues. * Eigenvalues less than or equal to VL, or greater than VU, * will not be returned. VL < VU. * If RANGE='I' or ='A', DLARRE2 computes bounds on the desired * part of the spectrum. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the N diagonal elements of the tridiagonal * matrix T. * On exit, the N diagonal elements of the diagonal * matrices D_i. * * E (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the first (N-1) entries contain the subdiagonal * elements of the tridiagonal matrix T; E(N) need not be set. * On exit, E contains the subdiagonal elements of the unit * bidiagonal matrices L_i. The entries E( ISPLIT( I ) ), * 1 <= I <= NSPLIT, contain the base points sigma_i on output. * * E2 (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the first (N-1) entries contain the SQUARES of the * subdiagonal elements of the tridiagonal matrix T; * E2(N) need not be set. * On exit, the entries E2( ISPLIT( I ) ), * 1 <= I <= NSPLIT, have been set to zero * * RTOL1 (input) DOUBLE PRECISION * RTOL2 (input) DOUBLE PRECISION * Parameters for bisection. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) * * SPLTOL (input) DOUBLE PRECISION * The threshold for splitting. * * NSPLIT (output) INTEGER * The number of blocks T splits into. 1 <= NSPLIT <= N. * * ISPLIT (output) INTEGER array, dimension (N) * The splitting points, at which T breaks up into blocks. * The first block consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N. * * M (output) INTEGER * The total number of eigenvalues (of all L_i D_i L_i^T) * found. * * DOL (input) INTEGER * DOU (input) INTEGER * If the user wants to work on only a selected part of the * representation tree, he can specify an index range DOL:DOU. * Otherwise, the setting DOL=1, DOU=N should be applied. * Note that DOL and DOU refer to the order in which the eigenvalues * are stored in W. * * W (output) DOUBLE PRECISION array, dimension (N) * The first M elements contain the eigenvalues. The * eigenvalues of each of the blocks, L_i D_i L_i^T, are * sorted in ascending order ( DLARRE2 may use the * remaining N-M elements as workspace). * Note that immediately after exiting this routine, only * the eigenvalues from position DOL:DOU in W might be * reliable on this processor * when the eigenvalue computation is done in parallel. * * WERR (output) DOUBLE PRECISION array, dimension (N) * The error bound on the corresponding eigenvalue in W. * Note that immediately after exiting this routine, only * the uncertainties from position DOL:DOU in WERR might be * reliable on this processor * when the eigenvalue computation is done in parallel. * * WGAP (output) DOUBLE PRECISION array, dimension (N) * The separation from the right neighbor eigenvalue in W. * The gap is only with respect to the eigenvalues of the same block * as each block has its own representation tree. * Exception: at the right end of a block we store the left gap * Note that immediately after exiting this routine, only * the gaps from position DOL:DOU in WGAP might be * reliable on this processor * when the eigenvalue computation is done in parallel. * * IBLOCK (output) INTEGER array, dimension (N) * The indices of the blocks (submatrices) associated with the * corresponding eigenvalues in W; IBLOCK(i)=1 if eigenvalue * W(i) belongs to the first block from the top, =2 if W(i) * belongs to the second block, etc. * * INDEXW (output) INTEGER array, dimension (N) * The indices of the eigenvalues within each block (submatrix); * for example, INDEXW(i)= 10 and IBLOCK(i)=2 imply that the * i-th eigenvalue W(i) is the 10-th eigenvalue in block 2 * * GERS (output) DOUBLE PRECISION array, dimension (2*N) * The N Gerschgorin intervals (the i-th Gerschgorin interval * is (GERS(2*i-1), GERS(2*i)). * * PIVMIN (output) DOUBLE PRECISION * The minimum pivot in the sturm sequence for T. * * WORK (workspace) DOUBLE PRECISION array, dimension (6*N) * Workspace. * * IWORK (workspace) INTEGER array, dimension (5*N) * Workspace. * * INFO (output) INTEGER * = 0: successful exit * > 0: A problem occured in DLARRE2. * < 0: One of the called subroutines signaled an internal problem. * Needs inspection of the corresponding parameter IINFO * for further information. * * =-1: Problem in DLARRD. * = 2: No base representation could be found in MAXTRY iterations. * Increasing MAXTRY and recompilation might be a remedy. * =-3: Problem in DLARRB when computing the refined root * representation for DLASQ2. * =-4: Problem in DLARRB when preforming bisection on the * desired part of the spectrum. * =-5: Problem in DLASQ2. * =-6: Problem in DLASQ2. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION FAC, FOUR, FOURTH, FUDGE, HALF, HNDRD, $ MAXGROWTH, ONE, PERT, TWO, ZERO PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, $ TWO = 2.0D0, FOUR=4.0D0, $ HNDRD = 100.0D0, $ PERT = 8.0D0, $ HALF = ONE/TWO, FOURTH = ONE/FOUR, FAC= HALF, $ MAXGROWTH = 64.0D0, FUDGE = 2.0D0 ) INTEGER MAXTRY PARAMETER ( MAXTRY = 6 ) * .. * .. Local Scalars .. LOGICAL FORCEB, NOREP, RNDPRT, USEDQD INTEGER CNT, CNT1, CNT2, I, IBEGIN, IDUM, IEND, IINFO, $ IN, INDL, INDU, IRANGE, J, JBLK, MB, MM, $ WBEGIN, WEND DOUBLE PRECISION AVGAP, BSRTOL, CLWDTH, DMAX, DPIVOT, EABS, $ EMAX, EOLD, EPS, GL, GU, ISLEFT, ISRGHT, RTL, $ RTOL, S1, S2, SAFMIN, SGNDEF, SIGMA, SPDIAM, $ TAU, TMP, TMP1 * .. * .. Local Arrays .. INTEGER ISEED( 4 ) * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH, LSAME * .. * .. External Subroutines .. EXTERNAL DCOPY, DLARNV, DLARRA, DLARRB, DLARRC, $ DLARRD, DLASQ2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * INFO = 0 * Dis-/Enable a small random perturbation of the root representation RNDPRT = .TRUE. * * Decode RANGE * IF( LSAME( RANGE, 'A' ) ) THEN IRANGE = 1 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IRANGE = 2 ELSE IF( LSAME( RANGE, 'I' ) ) THEN IRANGE = 3 END IF M = 0 * Get machine constants SAFMIN = DLAMCH( 'S' ) EPS = DLAMCH( 'P' ) * Set parameters RTL = SQRT(EPS) BSRTOL = 1.0D-1 * Treat case of 1x1 matrix for quick return IF( N.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR. $ ((IRANGE.EQ.2).AND.(D(1).GT.VL).AND.(D(1).LE.VU)).OR. $ ((IRANGE.EQ.3).AND.(IL.EQ.1).AND.(IU.EQ.1)) ) THEN M = 1 W(1) = D(1) * The computation error of the eigenvalue is zero WERR(1) = ZERO WGAP(1) = ZERO IBLOCK( 1 ) = 1 INDEXW( 1 ) = 1 GERS(1) = D( 1 ) GERS(2) = D( 1 ) ENDIF * store the shift for the initial RRR, which is zero in this case E(1) = ZERO RETURN END IF * General case: tridiagonal matrix of order > 1 * * Init WERR, WGAP. Compute Gerschgorin intervals and spectral diameter. * Compute maximum off-diagonal entry and pivmin. GL = D(1) GU = D(1) EOLD = ZERO EMAX = ZERO E(N) = ZERO DO 5 I = 1,N WERR(I) = ZERO WGAP(I) = ZERO EABS = ABS( E(I) ) IF( EABS .GE. EMAX ) THEN EMAX = EABS END IF TMP1 = EABS + EOLD GERS( 2*I-1) = D(I) - TMP1 GL = MIN( GL, GERS( 2*I - 1)) GERS( 2*I ) = D(I) + TMP1 GU = MAX( GU, GERS(2*I) ) EOLD = EABS 5 CONTINUE * The minimum pivot allowed in the sturm sequence for T PIVMIN = SAFMIN * MAX( ONE, EMAX**2 ) * Compute spectral diameter. The Gerschgorin bounds give an * estimate that is wrong by at most a factor of SQRT(2) SPDIAM = GU - GL * Compute splitting points CALL DLARRA( N, D, E, E2, SPLTOL, SPDIAM, $ NSPLIT, ISPLIT, IINFO ) * Can force use of bisection instead of faster DQDS FORCEB = .FALSE. IF( (IRANGE.EQ.1) .AND. (.NOT. FORCEB) ) THEN * Set interval [VL,VU] that contains all eigenvalues VL = GL VU = GU ELSE * We call DLARRD to find crude approximations to the eigenvalues * in the desired range. In case IRANGE = 3, we also obtain the * interval (VL,VU] that contains all the wanted eigenvalues. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.RTOL*MAX(ABS(LEFT),ABS(RIGHT)) * DLARRD needs a WORK of size 4*N, IWORK of size 3*N CALL DLARRD( RANGE, 'B', N, VL, VU, IL, IU, GERS, $ BSRTOL, D, E, E2, PIVMIN, NSPLIT, ISPLIT, $ MM, W, WERR, VL, VU, IBLOCK, INDEXW, $ WORK, IWORK, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF * Make sure that the entries M+1 to N in W, WERR, IBLOCK, INDEXW are 0 DO 14 I = MM+1,N W( I ) = ZERO WERR( I ) = ZERO IBLOCK( I ) = 0 INDEXW( I ) = 0 14 CONTINUE END IF *** * Loop over unreduced blocks IBEGIN = 1 WBEGIN = 1 DO 170 JBLK = 1, NSPLIT IEND = ISPLIT( JBLK ) IN = IEND - IBEGIN + 1 * 1 X 1 block IF( IN.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR.( (IRANGE.EQ.2).AND. $ ( D( IBEGIN ).GT.VL ).AND.( D( IBEGIN ).LE.VU ) ) $ .OR. ( (IRANGE.EQ.3).AND.(IBLOCK(WBEGIN).EQ.JBLK)) $ ) THEN M = M + 1 W( M ) = D( IBEGIN ) WERR(M) = ZERO * The gap for a single block doesn't matter for the later * algorithm and is assigned an arbitrary large value WGAP(M) = ZERO IBLOCK( M ) = JBLK INDEXW( M ) = 1 WBEGIN = WBEGIN + 1 ENDIF * E( IEND ) holds the shift for the initial RRR E( IEND ) = ZERO IBEGIN = IEND + 1 GO TO 170 END IF * * Blocks of size larger than 1x1 * * E( IEND ) will hold the shift for the initial RRR, for now set it =0 E( IEND ) = ZERO * * Find local outer bounds GL,GU for the block GL = D(IBEGIN) GU = D(IBEGIN) DO 15 I = IBEGIN , IEND GL = MIN( GERS( 2*I-1 ), GL ) GU = MAX( GERS( 2*I ), GU ) 15 CONTINUE SPDIAM = GU - GL IF(.NOT. ((IRANGE.EQ.1).AND.(.NOT.FORCEB)) ) THEN * Count the number of eigenvalues in the current block. MB = 0 DO 20 I = WBEGIN,MM IF( IBLOCK(I).EQ.JBLK ) THEN MB = MB+1 ELSE GOTO 21 ENDIF 20 CONTINUE 21 CONTINUE IF( MB.EQ.0) THEN * No eigenvalue in the current block lies in the desired range * E( IEND ) holds the shift for the initial RRR E( IEND ) = ZERO IBEGIN = IEND + 1 GO TO 170 ELSE * Decide whether dqds or bisection is more efficient USEDQD = ( (MB .GT. FAC*IN) .AND. (.NOT.FORCEB) ) WEND = WBEGIN + MB - 1 * Calculate gaps for the current block * In later stages, when representations for individual * eigenvalues are different, we use SIGMA = E( IEND ). SIGMA = ZERO DO 30 I = WBEGIN, WEND - 1 WGAP( I ) = MAX( ZERO, $ W(I+1)-WERR(I+1) - (W(I)+WERR(I)) ) 30 CONTINUE WGAP( WEND ) = MAX( ZERO, $ VU - SIGMA - (W( WEND )+WERR( WEND ))) * Find local index of the first and last desired evalue. INDL = INDEXW(WBEGIN) INDU = INDEXW( WEND ) ENDIF ELSE * MB = number of eigenvalues to compute MB = IN WEND = WBEGIN + MB - 1 INDL = 1 INDU = IN ENDIF IF( (WEND.LT.DOL).OR.(WBEGIN.GT.DOU) ) THEN * if this subblock contains no desired eigenvalues, * skip the computation of this representation tree IBEGIN = IEND + 1 WBEGIN = WEND + 1 M = M + INDU - INDL + 1 GO TO 170 END IF * Find approximations to the extremal eigenvalues of the block CALL DLARRK( IN, 1, GL, GU, D(IBEGIN), $ E2(IBEGIN), PIVMIN, RTL, TMP, TMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF ISLEFT = MAX(GL, TMP - TMP1 $ - HNDRD * EPS* ABS(TMP - TMP1)) CALL DLARRK( IN, IN, GL, GU, D(IBEGIN), $ E2(IBEGIN), PIVMIN, RTL, TMP, TMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF ISRGHT = MIN(GU, TMP + TMP1 $ + HNDRD * EPS * ABS(TMP + TMP1)) IF(( (IRANGE.EQ.1) .AND. (.NOT. FORCEB) ).OR.USEDQD) THEN * Case of DQDS * Improve the estimate of the spectral diameter SPDIAM = ISRGHT - ISLEFT ELSE * Case of bisection * Find approximations to the wanted extremal eigenvalues ISLEFT = MAX(GL, W(WBEGIN) - WERR(WBEGIN) $ - HNDRD * EPS*ABS(W(WBEGIN)- WERR(WBEGIN) )) ISRGHT = MIN(GU,W(WEND) + WERR(WEND) $ + HNDRD * EPS * ABS(W(WEND)+ WERR(WEND))) ENDIF * Decide whether the base representation for the current block * L_JBLK D_JBLK L_JBLK^T = T_JBLK - sigma_JBLK I * should be on the left or the right end of the current block. * The strategy is to shift to the end which is "more populated" * Furthermore, decide whether to use DQDS for the computation of * the eigenvalue approximations at the end of DLARRE2 or bisection. * dqds is chosen if all eigenvalues are desired or the number of * eigenvalues to be computed is large compared to the blocksize. IF( ( IRANGE.EQ.1 ) .AND. (.NOT.FORCEB) ) THEN * If all the eigenvalues have to be computed, we use dqd USEDQD = .TRUE. * INDL is the local index of the first eigenvalue to compute INDL = 1 INDU = IN * MB = number of eigenvalues to compute MB = IN WEND = WBEGIN + MB - 1 * Define 1/4 and 3/4 points of the spectrum S1 = ISLEFT + FOURTH * SPDIAM S2 = ISRGHT - FOURTH * SPDIAM ELSE * DLARRD has computed IBLOCK and INDEXW for each eigenvalue * approximation. * choose sigma IF( USEDQD ) THEN S1 = ISLEFT + FOURTH * SPDIAM S2 = ISRGHT - FOURTH * SPDIAM ELSE TMP = MIN(ISRGHT,VU) - MAX(ISLEFT,VL) S1 = MAX(ISLEFT,VL) + FOURTH * TMP S2 = MIN(ISRGHT,VU) - FOURTH * TMP ENDIF ENDIF * Compute the negcount at the 1/4 and 3/4 points IF(MB.GT.1) THEN CALL DLARRC( 'T', IN, S1, S2, D(IBEGIN), $ E(IBEGIN), PIVMIN, CNT, CNT1, CNT2, IINFO) ENDIF IF(MB.EQ.1) THEN SIGMA = GL SGNDEF = ONE ELSEIF( CNT1 - INDL .GE. INDU - CNT2 ) THEN IF( ( IRANGE.EQ.1 ) .AND. (.NOT.FORCEB) ) THEN SIGMA = MAX(ISLEFT,GL) ELSEIF( USEDQD ) THEN * use Gerschgorin bound as shift to get pos def matrix * for dqds SIGMA = ISLEFT ELSE * use approximation of the first desired eigenvalue of the * block as shift SIGMA = MAX(ISLEFT,VL) ENDIF SGNDEF = ONE ELSE IF( ( IRANGE.EQ.1 ) .AND. (.NOT.FORCEB) ) THEN SIGMA = MIN(ISRGHT,GU) ELSEIF( USEDQD ) THEN * use Gerschgorin bound as shift to get neg def matrix * for dqds SIGMA = ISRGHT ELSE * use approximation of the first desired eigenvalue of the * block as shift SIGMA = MIN(ISRGHT,VU) ENDIF SGNDEF = -ONE ENDIF * An initial SIGMA has been chosen that will be used for computing * T - SIGMA I = L D L^T * Define the increment TAU of the shift in case the initial shift * needs to be refined to obtain a factorization with not too much * element growth. IF( USEDQD ) THEN TAU = SPDIAM*EPS*N + TWO*PIVMIN TAU = MAX(TAU,EPS*ABS(SIGMA)) ELSE IF(MB.GT.1) THEN CLWDTH = W(WEND) + WERR(WEND) - W(WBEGIN) - WERR(WBEGIN) AVGAP = ABS(CLWDTH / DBLE(WEND-WBEGIN)) IF( SGNDEF.EQ.ONE ) THEN TAU = HALF*MAX(WGAP(WBEGIN),AVGAP) TAU = MAX(TAU,WERR(WBEGIN)) ELSE TAU = HALF*MAX(WGAP(WEND-1),AVGAP) TAU = MAX(TAU,WERR(WEND)) ENDIF ELSE TAU = WERR(WBEGIN) ENDIF ENDIF * DO 80 IDUM = 1, MAXTRY * Compute L D L^T factorization of tridiagonal matrix T - sigma I. * Store D in WORK(1:IN), L in WORK(IN+1:2*IN), and reciprocals of * pivots in WORK(2*IN+1:3*IN) DPIVOT = D( IBEGIN ) - SIGMA WORK( 1 ) = DPIVOT DMAX = ABS( WORK(1) ) J = IBEGIN DO 70 I = 1, IN - 1 WORK( 2*IN+I ) = ONE / WORK( I ) TMP = E( J )*WORK( 2*IN+I ) WORK( IN+I ) = TMP DPIVOT = ( D( J+1 )-SIGMA ) - TMP*E( J ) WORK( I+1 ) = DPIVOT DMAX = MAX( DMAX, ABS(DPIVOT) ) J = J + 1 70 CONTINUE * check for element growth IF( DMAX .GT. MAXGROWTH*SPDIAM ) THEN NOREP = .TRUE. ELSE NOREP = .FALSE. ENDIF IF(NOREP) THEN * Note that in the case of IRANGE=1, we use the Gerschgorin * shift which makes the matrix definite. So we should end up * here really only in the case of IRANGE = 2,3 IF( IDUM.EQ.MAXTRY-1 ) THEN IF( SGNDEF.EQ.ONE ) THEN * The fudged Gerschgorin shift should succeed SIGMA = $ GL - FUDGE*SPDIAM*EPS*N - FUDGE*TWO*PIVMIN ELSE SIGMA = $ GU + FUDGE*SPDIAM*EPS*N + FUDGE*TWO*PIVMIN END IF ELSE SIGMA = SIGMA - SGNDEF * TAU TAU = TWO * TAU END IF ELSE * an initial RRR is found GO TO 83 END IF 80 CONTINUE * if the program reaches this point, no base representation could be * found in MAXTRY iterations. INFO = 2 RETURN 83 CONTINUE * At this point, we have found an initial base representation * T - SIGMA I = L D L^T with not too much element growth. * Store the shift. E( IEND ) = SIGMA * Store D and L. CALL DCOPY( IN, WORK, 1, D( IBEGIN ), 1 ) CALL DCOPY( IN-1, WORK( IN+1 ), 1, E( IBEGIN ), 1 ) IF(RNDPRT .AND. MB.GT.1 ) THEN * * Perturb each entry of the base representation by a small * (but random) relative amount to overcome difficulties with * glued matrices. * DO 122 I = 1, 4 ISEED( I ) = 1 122 CONTINUE CALL DLARNV(2, ISEED, 2*IN-1, WORK(1)) DO 125 I = 1,IN-1 D(IBEGIN+I-1) = D(IBEGIN+I-1)*(ONE+EPS*PERT*WORK(I)) E(IBEGIN+I-1) = E(IBEGIN+I-1)*(ONE+EPS*PERT*WORK(IN+I)) 125 CONTINUE D(IEND) = D(IEND)*(ONE+EPS*FOUR*WORK(IN)) * ENDIF * * Don't update the Gerschgorin intervals because keeping track * of the updates would be too much work in DLARRV. * We update W instead and use it to locate the proper Gerschgorin * intervals. * Compute the required eigenvalues of L D L' by bisection or dqds IF ( .NOT.USEDQD ) THEN * If DLARRD has been used, shift the eigenvalue approximations * according to their representation. This is necessary for * a uniform DLARRV since dqds computes eigenvalues of the * shifted representation. In DLARRV, W will always hold the * UNshifted eigenvalue approximation. DO 134 J=WBEGIN,WEND W(J) = W(J) - SIGMA WERR(J) = WERR(J) + ABS(W(J)) * EPS 134 CONTINUE * call DLARRB to reduce eigenvalue error of the approximations * from DLARRD DO 135 I = IBEGIN, IEND-1 WORK( I ) = D( I ) * E( I )**2 135 CONTINUE * use bisection to find EV from INDL to INDU CALL DLARRB(IN, D(IBEGIN), WORK(IBEGIN), $ INDL, INDU, RTOL1, RTOL2, INDL-1, $ W(WBEGIN), WGAP(WBEGIN), WERR(WBEGIN), $ WORK( 2*N+1 ), IWORK, PIVMIN, SPDIAM, $ IN, IINFO ) IF( IINFO .NE. 0 ) THEN INFO = -4 RETURN END IF * DLARRB computes all gaps correctly except for the last one * Record distance to VU/GU WGAP( WEND ) = MAX( ZERO, $ ( VU-SIGMA ) - ( W( WEND ) + WERR( WEND ) ) ) DO 138 I = INDL, INDU M = M + 1 IBLOCK(M) = JBLK INDEXW(M) = I 138 CONTINUE ELSE * Call dqds to get all eigs (and then possibly delete unwanted * eigenvalues). * Note that dqds finds the eigenvalues of the L D L^T representation * of T to high relative accuracy. High relative accuracy * might be lost when the shift of the RRR is subtracted to obtain * the eigenvalues of T. However, T is not guaranteed to define its * eigenvalues to high relative accuracy anyway. * Set RTOL to the order of the tolerance used in DLASQ2 * This is an ESTIMATED error, the worst case bound is 4*N*EPS * which is usually too large and requires unnecessary work to be * done by bisection when computing the eigenvectors RTOL = LOG(DBLE(IN)) * FOUR * EPS J = IBEGIN DO 140 I = 1, IN - 1 WORK( 2*I-1 ) = ABS( D( J ) ) WORK( 2*I ) = E( J )*E( J )*WORK( 2*I-1 ) J = J + 1 140 CONTINUE WORK( 2*IN-1 ) = ABS( D( IEND ) ) WORK( 2*IN ) = ZERO CALL DLASQ2( IN, WORK, IINFO ) IF( IINFO .NE. 0 ) THEN * If IINFO = -5 then an index is part of a tight cluster * and should be changed. The index is in IWORK(1) and the * gap is in WORK(N+1) INFO = -5 RETURN ELSE * Test that all eigenvalues are positive as expected DO 149 I = 1, IN IF( WORK( I ).LT.ZERO ) THEN INFO = -6 RETURN ENDIF 149 CONTINUE END IF IF( SGNDEF.GT.ZERO ) THEN DO 150 I = INDL, INDU M = M + 1 W( M ) = WORK( IN-I+1 ) IBLOCK( M ) = JBLK INDEXW( M ) = I 150 CONTINUE ELSE DO 160 I = INDL, INDU M = M + 1 W( M ) = -WORK( I ) IBLOCK( M ) = JBLK INDEXW( M ) = I 160 CONTINUE END IF DO 165 I = M - MB + 1, M * the value of RTOL below should be the tolerance in DLASQ2 WERR( I ) = RTOL * ABS( W(I) ) 165 CONTINUE DO 166 I = M - MB + 1, M - 1 * compute the right gap between the intervals WGAP( I ) = MAX( ZERO, $ W(I+1)-WERR(I+1) - (W(I)+WERR(I)) ) 166 CONTINUE WGAP( M ) = MAX( ZERO, $ ( VU-SIGMA ) - ( W( M ) + WERR( M ) ) ) END IF * proceed with next block IBEGIN = IEND + 1 WBEGIN = WEND + 1 170 CONTINUE * RETURN * * end of DLARRE2 * END scalapack-2.0.2/SRC/dlarre2a.f000644 000766 000024 00000066772 11657111056 016230 0ustar00juliestaff000000 000000 SUBROUTINE DLARRE2A( RANGE, N, VL, VU, IL, IU, D, E, E2, $ RTOL1, RTOL2, SPLTOL, NSPLIT, ISPLIT, $ M, DOL, DOU, NEEDIL, NEEDIU, $ W, WERR, WGAP, IBLOCK, INDEXW, GERS, $ SDIAM, PIVMIN, WORK, IWORK, MINRGP, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER RANGE INTEGER DOL, DOU, IL, INFO, IU, M, N, NSPLIT, $ NEEDIL, NEEDIU DOUBLE PRECISION MINRGP, PIVMIN, RTOL1, RTOL2, SPLTOL, VL, VU * .. * .. Array Arguments .. INTEGER IBLOCK( * ), ISPLIT( * ), IWORK( * ), $ INDEXW( * ) DOUBLE PRECISION D( * ), E( * ), E2( * ), GERS( * ), $ SDIAM( * ), W( * ),WERR( * ), $ WGAP( * ), WORK( * ) * * Purpose * ======= * * To find the desired eigenvalues of a given real symmetric * tridiagonal matrix T, DLARRE2 sets any "small" off-diagonal * elements to zero, and for each unreduced block T_i, it finds * (a) a suitable shift at one end of the block's spectrum, * (b) the base representation, T_i - sigma_i I = L_i D_i L_i^T, and * (c) eigenvalues of each L_i D_i L_i^T. * * NOTE: * The algorithm obtains a crude picture of all the wanted eigenvalues * (as selected by RANGE). However, to reduce work and improve scalability, * only the eigenvalues DOL to DOU are refined. Furthermore, if the matrix * splits into blocks, RRRs for blocks that do not contain eigenvalues * from DOL to DOU are skipped. * The DQDS algorithm (subroutine DLASQ2) is not used, unlike in * the sequential case. Instead, eigenvalues are computed in parallel to some * figures using bisection. * * Arguments * ========= * * RANGE (input) CHARACTER * = 'A': ("All") all eigenvalues will be found. * = 'V': ("Value") all eigenvalues in the half-open interval * (VL, VU] will be found. * = 'I': ("Index") the IL-th through IU-th eigenvalues (of the * entire matrix) will be found. * * N (input) INTEGER * The order of the matrix. N > 0. * * VL (input/output) DOUBLE PRECISION * VU (input/output) DOUBLE PRECISION * If RANGE='V', the lower and upper bounds for the eigenvalues. * Eigenvalues less than or equal to VL, or greater than VU, * will not be returned. VL < VU. * If RANGE='I' or ='A', DLARRE2A computes bounds on the desired * part of the spectrum. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the N diagonal elements of the tridiagonal * matrix T. * On exit, the N diagonal elements of the diagonal * matrices D_i. * * E (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the first (N-1) entries contain the subdiagonal * elements of the tridiagonal matrix T; E(N) need not be set. * On exit, E contains the subdiagonal elements of the unit * bidiagonal matrices L_i. The entries E( ISPLIT( I ) ), * 1 <= I <= NSPLIT, contain the base points sigma_i on output. * * E2 (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the first (N-1) entries contain the SQUARES of the * subdiagonal elements of the tridiagonal matrix T; * E2(N) need not be set. * On exit, the entries E2( ISPLIT( I ) ), * 1 <= I <= NSPLIT, have been set to zero * * RTOL1 (input) DOUBLE PRECISION * RTOL2 (input) DOUBLE PRECISION * Parameters for bisection. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) * * SPLTOL (input) DOUBLE PRECISION * The threshold for splitting. * * NSPLIT (output) INTEGER * The number of blocks T splits into. 1 <= NSPLIT <= N. * * ISPLIT (output) INTEGER array, dimension (N) * The splitting points, at which T breaks up into blocks. * The first block consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N. * * M (output) INTEGER * The total number of eigenvalues (of all L_i D_i L_i^T) * found. * * DOL (input) INTEGER * DOU (input) INTEGER * If the user wants to work on only a selected part of the * representation tree, he can specify an index range DOL:DOU. * Otherwise, the setting DOL=1, DOU=N should be applied. * Note that DOL and DOU refer to the order in which the eigenvalues * are stored in W. * * NEEDIL (output) INTEGER * NEEDIU (output) INTEGER * The indices of the leftmost and rightmost eigenvalues * of the root node RRR which are * needed to accurately compute the relevant part of the * representation tree. * * W (output) DOUBLE PRECISION array, dimension (N) * The first M elements contain the eigenvalues. The * eigenvalues of each of the blocks, L_i D_i L_i^T, are * sorted in ascending order ( DLARRE2A may use the * remaining N-M elements as workspace). * Note that immediately after exiting this routine, only * the eigenvalues from position DOL:DOU in W are * reliable on this processor * because the eigenvalue computation is done in parallel. * * WERR (output) DOUBLE PRECISION array, dimension (N) * The error bound on the corresponding eigenvalue in W. * Note that immediately after exiting this routine, only * the uncertainties from position DOL:DOU in WERR are * reliable on this processor * because the eigenvalue computation is done in parallel. * * WGAP (output) DOUBLE PRECISION array, dimension (N) * The separation from the right neighbor eigenvalue in W. * The gap is only with respect to the eigenvalues of the same block * as each block has its own representation tree. * Exception: at the right end of a block we store the left gap * Note that immediately after exiting this routine, only * the gaps from position DOL:DOU in WGAP are * reliable on this processor * because the eigenvalue computation is done in parallel. * * IBLOCK (output) INTEGER array, dimension (N) * The indices of the blocks (submatrices) associated with the * corresponding eigenvalues in W; IBLOCK(i)=1 if eigenvalue * W(i) belongs to the first block from the top, =2 if W(i) * belongs to the second block, etc. * * INDEXW (output) INTEGER array, dimension (N) * The indices of the eigenvalues within each block (submatrix); * for example, INDEXW(i)= 10 and IBLOCK(i)=2 imply that the * i-th eigenvalue W(i) is the 10-th eigenvalue in block 2 * * GERS (output) DOUBLE PRECISION array, dimension (2*N) * The N Gerschgorin intervals (the i-th Gerschgorin interval * is (GERS(2*i-1), GERS(2*i)). * * PIVMIN (output) DOUBLE PRECISION * The minimum pivot in the sturm sequence for T. * * WORK (workspace) DOUBLE PRECISION array, dimension (6*N) * Workspace. * * IWORK (workspace) INTEGER array, dimension (5*N) * Workspace. * * MINRGP (input) DOUBLE PRECISION * The minimum relativ gap threshold to decide whether an eigenvalue * or a cluster boundary is reached. * * INFO (output) INTEGER * = 0: successful exit * > 0: A problem occured in DLARRE2A. * < 0: One of the called subroutines signaled an internal problem. * Needs inspection of the corresponding parameter IINFO * for further information. * * =-1: Problem in DLARRD2. * = 2: No base representation could be found in MAXTRY iterations. * Increasing MAXTRY and recompilation might be a remedy. * =-3: Problem in DLARRB2 when computing the refined root * representation * =-4: Problem in DLARRB2 when preforming bisection on the * desired part of the spectrum. * = -9 Problem: M < DOU-DOL+1, that is the code found fewer * eigenvalues than it was supposed to * * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION FAC, FOUR, FOURTH, FUDGE, HALF, HNDRD, $ MAXGROWTH, ONE, PERT, TWO, ZERO PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, $ TWO = 2.0D0, FOUR=4.0D0, $ HNDRD = 100.0D0, $ PERT = 8.0D0, $ HALF = ONE/TWO, FOURTH = ONE/FOUR, FAC= HALF, $ MAXGROWTH = 64.0D0, FUDGE = 2.0D0 ) INTEGER MAXTRY PARAMETER ( MAXTRY = 6 ) * .. * .. Local Scalars .. LOGICAL NOREP, RNDPRT, USEDQD INTEGER CNT, CNT1, CNT2, I, IBEGIN, IDUM, IEND, IINFO, $ IN, INDL, INDU, IRANGE, J, JBLK, MB, MM, $ MYINDL, MYINDU, MYWBEG, MYWEND, WBEGIN, WEND DOUBLE PRECISION AVGAP, BSRTOL, CLWDTH, DMAX, DPIVOT, EABS, $ EMAX, EOLD, EPS, GL, GU, ISLEFT, ISRGHT, $ LGPVMN, LGSPDM, RTL, S1, S2, SAFMIN, SGNDEF, $ SIGMA, SPDIAM, TAU, TMP, TMP1, TMP2 * .. * .. Local Arrays .. INTEGER ISEED( 4 ) * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH, LSAME * .. * .. External Subroutines .. EXTERNAL DCOPY, DLARNV, DLARRA, DLARRB2, $ DLARRC, DLARRD2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * INFO = 0 * Dis-/Enable a small random perturbation of the root representation RNDPRT = .TRUE. * * Decode RANGE * IF( LSAME( RANGE, 'A' ) ) THEN IRANGE = 1 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IRANGE = 2 ELSE IF( LSAME( RANGE, 'I' ) ) THEN IRANGE = 3 END IF M = 0 * Get machine constants SAFMIN = DLAMCH( 'S' ) EPS = DLAMCH( 'P' ) * Set parameters RTL = SQRT(EPS) BSRTOL = 1.0D-1 * Treat case of 1x1 matrix for quick return IF( N.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR. $ ((IRANGE.EQ.2).AND.(D(1).GT.VL).AND.(D(1).LE.VU)).OR. $ ((IRANGE.EQ.3).AND.(IL.EQ.1).AND.(IU.EQ.1)) ) THEN M = 1 W(1) = D(1) * The computation error of the eigenvalue is zero WERR(1) = ZERO WGAP(1) = ZERO IBLOCK( 1 ) = 1 INDEXW( 1 ) = 1 GERS(1) = D( 1 ) GERS(2) = D( 1 ) ENDIF * store the shift for the initial RRR, which is zero in this case E(1) = ZERO RETURN END IF * General case: tridiagonal matrix of order > 1 * Init WERR, WGAP. DO 1 I =1,N WERR(I) = ZERO 1 CONTINUE DO 2 I =1,N WGAP(I) = ZERO 2 CONTINUE * Compute Gerschgorin intervals and spectral diameter. * Compute maximum off-diagonal entry and pivmin. GL = D(1) GU = D(1) EOLD = ZERO EMAX = ZERO E(N) = ZERO DO 5 I = 1,N EABS = ABS( E(I) ) IF( EABS .GE. EMAX ) THEN EMAX = EABS END IF TMP = EABS + EOLD EOLD = EABS TMP1 = D(I) - TMP TMP2 = D(I) + TMP GL = MIN( GL, TMP1 ) GU = MAX( GU, TMP2 ) GERS( 2*I-1) = TMP1 GERS( 2*I ) = TMP2 5 CONTINUE * The minimum pivot allowed in the sturm sequence for T PIVMIN = SAFMIN * MAX( ONE, EMAX**2 ) * Compute spectral diameter. The Gerschgorin bounds give an * estimate that is wrong by at most a factor of SQRT(2) SPDIAM = GU - GL * Compute splitting points CALL DLARRA( N, D, E, E2, SPLTOL, SPDIAM, $ NSPLIT, ISPLIT, IINFO ) IF( IRANGE.EQ.1 ) THEN * Set interval [VL,VU] that contains all eigenvalues VL = GL VU = GU ENDIF * We call DLARRD2 to find crude approximations to the eigenvalues * in the desired range. In case IRANGE = 3, we also obtain the * interval (VL,VU] that contains all the wanted eigenvalues. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.RTOL*MAX(ABS(LEFT),ABS(RIGHT)) * DLARRD2 needs a WORK of size 4*N, IWORK of size 3*N CALL DLARRD2( RANGE, 'B', N, VL, VU, IL, IU, GERS, $ BSRTOL, D, E, E2, PIVMIN, NSPLIT, ISPLIT, $ MM, W, WERR, VL, VU, IBLOCK, INDEXW, $ WORK, IWORK, DOL, DOU, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF * Make sure that the entries M+1 to N in W, WERR, IBLOCK, INDEXW are 0 DO 14 I = MM+1,N W( I ) = ZERO WERR( I ) = ZERO IBLOCK( I ) = 0 INDEXW( I ) = 0 14 CONTINUE *** * Loop over unreduced blocks IBEGIN = 1 WBEGIN = 1 DO 170 JBLK = 1, NSPLIT IEND = ISPLIT( JBLK ) IN = IEND - IBEGIN + 1 * 1 X 1 block IF( IN.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR.( (IRANGE.EQ.2).AND. $ ( D( IBEGIN ).GT.VL ).AND.( D( IBEGIN ).LE.VU ) ) $ .OR. ( (IRANGE.EQ.3).AND.(IBLOCK(WBEGIN).EQ.JBLK)) $ ) THEN M = M + 1 W( M ) = D( IBEGIN ) WERR(M) = ZERO * The gap for a single block doesn't matter for the later * algorithm and is assigned an arbitrary large value WGAP(M) = ZERO IBLOCK( M ) = JBLK INDEXW( M ) = 1 WBEGIN = WBEGIN + 1 ENDIF * E( IEND ) holds the shift for the initial RRR E( IEND ) = ZERO IBEGIN = IEND + 1 GO TO 170 END IF * * Blocks of size larger than 1x1 * * E( IEND ) will hold the shift for the initial RRR, for now set it =0 E( IEND ) = ZERO IF( ( IRANGE.EQ.1 ) .OR. $ ((IRANGE.EQ.3).AND.(IL.EQ.1.AND.IU.EQ.N)) ) THEN * MB = number of eigenvalues to compute MB = IN WEND = WBEGIN + MB - 1 INDL = 1 INDU = IN ELSE * Count the number of eigenvalues in the current block. MB = 0 DO 20 I = WBEGIN,MM IF( IBLOCK(I).EQ.JBLK ) THEN MB = MB+1 ELSE GOTO 21 ENDIF 20 CONTINUE 21 CONTINUE IF( MB.EQ.0) THEN * No eigenvalue in the current block lies in the desired range * E( IEND ) holds the shift for the initial RRR E( IEND ) = ZERO IBEGIN = IEND + 1 GO TO 170 ENDIF * WEND = WBEGIN + MB - 1 * Find local index of the first and last desired evalue. INDL = INDEXW(WBEGIN) INDU = INDEXW( WEND ) ENDIF * IF( (WEND.LT.DOL).OR.(WBEGIN.GT.DOU) ) THEN * if this subblock contains no desired eigenvalues, * skip the computation of this representation tree IBEGIN = IEND + 1 WBEGIN = WEND + 1 M = M + MB GO TO 170 END IF * IF(.NOT. ( IRANGE.EQ.1 ) ) THEN * At this point, the sequential code decides * whether dqds or bisection is more efficient. * Note: in the parallel code, we do not use dqds. * However, we do not change the shift strategy * if USEDQD is TRUE, then the same shift is used as for * the sequential code when it uses dqds. * USEDQD = ( MB .GT. FAC*IN ) * * Calculate gaps for the current block * In later stages, when representations for individual * eigenvalues are different, we use SIGMA = E( IEND ). SIGMA = ZERO DO 30 I = WBEGIN, WEND - 1 WGAP( I ) = MAX( ZERO, $ W(I+1)-WERR(I+1) - (W(I)+WERR(I)) ) 30 CONTINUE WGAP( WEND ) = MAX( ZERO, $ VU - SIGMA - (W( WEND )+WERR( WEND ))) ENDIF * * Find local outer bounds GL,GU for the block GL = D(IBEGIN) GU = D(IBEGIN) DO 15 I = IBEGIN , IEND GL = MIN( GERS( 2*I-1 ), GL ) GU = MAX( GERS( 2*I ), GU ) 15 CONTINUE SPDIAM = GU - GL * Save local spectral diameter for later use SDIAM(JBLK) = SPDIAM * Find approximations to the extremal eigenvalues of the block CALL DLARRK( IN, 1, GL, GU, D(IBEGIN), $ E2(IBEGIN), PIVMIN, RTL, TMP, TMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF ISLEFT = MAX(GL, TMP - TMP1 $ - HNDRD * EPS* ABS(TMP - TMP1)) CALL DLARRK( IN, IN, GL, GU, D(IBEGIN), $ E2(IBEGIN), PIVMIN, RTL, TMP, TMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF ISRGHT = MIN(GU, TMP + TMP1 $ + HNDRD * EPS * ABS(TMP + TMP1)) IF( ( IRANGE.EQ.1 ).OR.USEDQD ) THEN * Case of DQDS shift * Improve the estimate of the spectral diameter SPDIAM = ISRGHT - ISLEFT ELSE * Case of bisection * Find approximations to the wanted extremal eigenvalues ISLEFT = MAX(GL, W(WBEGIN) - WERR(WBEGIN) $ - HNDRD * EPS*ABS(W(WBEGIN)- WERR(WBEGIN) )) ISRGHT = MIN(GU,W(WEND) + WERR(WEND) $ + HNDRD * EPS * ABS(W(WEND)+ WERR(WEND))) ENDIF * Decide whether the base representation for the current block * L_JBLK D_JBLK L_JBLK^T = T_JBLK - sigma_JBLK I * should be on the left or the right end of the current block. * The strategy is to shift to the end which is "more populated" IF( IRANGE.EQ.1 ) THEN * If all the eigenvalues have to be computed, we use dqd USEDQD = .TRUE. * INDL is the local index of the first eigenvalue to compute INDL = 1 INDU = IN * MB = number of eigenvalues to compute MB = IN WEND = WBEGIN + MB - 1 * Define 1/4 and 3/4 points of the spectrum S1 = ISLEFT + FOURTH * SPDIAM S2 = ISRGHT - FOURTH * SPDIAM ELSE * DLARRD2 has computed IBLOCK and INDEXW for each eigenvalue * approximation. * choose sigma IF( USEDQD ) THEN S1 = ISLEFT + FOURTH * SPDIAM S2 = ISRGHT - FOURTH * SPDIAM ELSE TMP = MIN(ISRGHT,VU) - MAX(ISLEFT,VL) S1 = MAX(ISLEFT,VL) + FOURTH * TMP S2 = MIN(ISRGHT,VU) - FOURTH * TMP ENDIF ENDIF * Compute the negcount at the 1/4 and 3/4 points IF(MB.GT.2) THEN CALL DLARRC( 'T', IN, S1, S2, D(IBEGIN), $ E(IBEGIN), PIVMIN, CNT, CNT1, CNT2, IINFO) ENDIF IF(MB.LE.2) THEN SIGMA = GL SGNDEF = ONE ELSEIF( CNT1 - INDL .GE. INDU - CNT2 ) THEN IF( IRANGE.EQ.1 ) THEN SIGMA = MAX(ISLEFT,GL) ELSEIF( USEDQD ) THEN * use Gerschgorin bound as shift to get pos def matrix SIGMA = ISLEFT ELSE * use approximation of the first desired eigenvalue of the * block as shift SIGMA = MAX(ISLEFT,VL) ENDIF SGNDEF = ONE ELSE IF( IRANGE.EQ.1 ) THEN SIGMA = MIN(ISRGHT,GU) ELSEIF( USEDQD ) THEN * use Gerschgorin bound as shift to get neg def matrix * for dqds SIGMA = ISRGHT ELSE * use approximation of the first desired eigenvalue of the * block as shift SIGMA = MIN(ISRGHT,VU) ENDIF SGNDEF = -ONE ENDIF * An initial SIGMA has been chosen that will be used for computing * T - SIGMA I = L D L^T * Define the increment TAU of the shift in case the initial shift * needs to be refined to obtain a factorization with not too much * element growth. IF( USEDQD ) THEN TAU = SPDIAM*EPS*N + TWO*PIVMIN TAU = MAX(TAU,EPS*ABS(SIGMA)) ELSE IF(MB.GT.1) THEN CLWDTH = W(WEND) + WERR(WEND) - W(WBEGIN) - WERR(WBEGIN) AVGAP = ABS(CLWDTH / DBLE(WEND-WBEGIN)) IF( SGNDEF.EQ.ONE ) THEN TAU = HALF*MAX(WGAP(WBEGIN),AVGAP) TAU = MAX(TAU,WERR(WBEGIN)) ELSE TAU = HALF*MAX(WGAP(WEND-1),AVGAP) TAU = MAX(TAU,WERR(WEND)) ENDIF ELSE TAU = WERR(WBEGIN) ENDIF ENDIF * DO 80 IDUM = 1, MAXTRY * Compute L D L^T factorization of tridiagonal matrix T - sigma I. * Store D in WORK(1:IN), L in WORK(IN+1:2*IN), and reciprocals of * pivots in WORK(2*IN+1:3*IN) DPIVOT = D( IBEGIN ) - SIGMA WORK( 1 ) = DPIVOT DMAX = ABS( WORK(1) ) J = IBEGIN DO 70 I = 1, IN - 1 WORK( 2*IN+I ) = ONE / WORK( I ) TMP = E( J )*WORK( 2*IN+I ) WORK( IN+I ) = TMP DPIVOT = ( D( J+1 )-SIGMA ) - TMP*E( J ) WORK( I+1 ) = DPIVOT DMAX = MAX( DMAX, ABS(DPIVOT) ) J = J + 1 70 CONTINUE * check for element growth IF( DMAX .GT. MAXGROWTH*SPDIAM ) THEN NOREP = .TRUE. ELSE NOREP = .FALSE. ENDIF IF(NOREP) THEN * Note that in the case of IRANGE=1, we use the Gerschgorin * shift which makes the matrix definite. So we should end up * here really only in the case of IRANGE = 2,3 IF( IDUM.EQ.MAXTRY-1 ) THEN IF( SGNDEF.EQ.ONE ) THEN * The fudged Gerschgorin shift should succeed SIGMA = $ GL - FUDGE*SPDIAM*EPS*N - FUDGE*TWO*PIVMIN ELSE SIGMA = $ GU + FUDGE*SPDIAM*EPS*N + FUDGE*TWO*PIVMIN END IF ELSE SIGMA = SIGMA - SGNDEF * TAU TAU = TWO * TAU END IF ELSE * an initial RRR is found GO TO 83 END IF 80 CONTINUE * if the program reaches this point, no base representation could be * found in MAXTRY iterations. INFO = 2 RETURN 83 CONTINUE * At this point, we have found an initial base representation * T - SIGMA I = L D L^T with not too much element growth. * Store the shift. E( IEND ) = SIGMA * Store D and L. CALL DCOPY( IN, WORK, 1, D( IBEGIN ), 1 ) CALL DCOPY( IN-1, WORK( IN+1 ), 1, E( IBEGIN ), 1 ) IF(RNDPRT .AND. MB.GT.1 ) THEN * * Perturb each entry of the base representation by a small * (but random) relative amount to overcome difficulties with * glued matrices. * DO 122 I = 1, 4 ISEED( I ) = 1 122 CONTINUE CALL DLARNV(2, ISEED, 2*IN-1, WORK(1)) DO 125 I = 1,IN-1 D(IBEGIN+I-1) = D(IBEGIN+I-1)*(ONE+EPS*PERT*WORK(2*I-1)) E(IBEGIN+I-1) = E(IBEGIN+I-1)*(ONE+EPS*PERT*WORK(2*I)) 125 CONTINUE D(IEND) = D(IEND)*(ONE+EPS*PERT*WORK(2*IN-1)) * ENDIF * * Compute the required eigenvalues of L D L' by bisection * Shift the eigenvalue approximations * according to the shift of their representation. DO 134 J=WBEGIN,WEND W(J) = W(J) - SIGMA WERR(J) = WERR(J) + ABS(W(J)) * EPS 134 CONTINUE * call DLARRB2 to reduce eigenvalue error of the approximations * from DLARRD2 DO 135 I = IBEGIN, IEND-1 WORK( I ) = D( I ) * E( I )**2 135 CONTINUE * use bisection to find EV from INDL to INDU INDL = INDEXW( WBEGIN ) INDU = INDEXW( WEND ) * * Indicate that the current block contains eigenvalues that * are potentially needed later. * NEEDIL = MIN(NEEDIL,WBEGIN) NEEDIU = MAX(NEEDIU,WEND) * * For the parallel distributed case, only compute * those eigenvalues that have to be computed as indicated by DOL, DOU * MYWBEG = MAX(WBEGIN,DOL) MYWEND = MIN(WEND,DOU) * IF(MYWBEG.GT.WBEGIN) THEN * This is the leftmost block containing wanted eigenvalues * as well as unwanted ones. To save on communication, * check if NEEDIL can be increased even further: * on the left end, only the eigenvalues of the cluster * including MYWBEG are needed DO 136 I = WBEGIN, MYWBEG-1 IF ( WGAP(I).GE.MINRGP*ABS(W(I)) ) THEN NEEDIL = MAX(I+1,NEEDIL) ENDIF 136 CONTINUE ENDIF IF(MYWEND.LT.WEND) THEN * This is the rightmost block containing wanted eigenvalues * as well as unwanted ones. To save on communication, * Check if NEEDIU can be decreased even further. DO 137 I = MYWEND,WEND-1 IF ( WGAP(I).GE.MINRGP*ABS(W(I)) ) THEN NEEDIU = MIN(I,NEEDIU) GOTO 138 ENDIF 137 CONTINUE 138 CONTINUE ENDIF * * Only compute eigenvalues from MYINDL to MYINDU * instead of INDL to INDU * MYINDL = INDEXW( MYWBEG ) MYINDU = INDEXW( MYWEND ) * LGPVMN = LOG( PIVMIN ) LGSPDM = LOG( SPDIAM + PIVMIN ) CALL DLARRB2(IN, D(IBEGIN), WORK(IBEGIN), $ MYINDL, MYINDU, RTOL1, RTOL2, MYINDL-1, $ W(MYWBEG), WGAP(MYWBEG), WERR(MYWBEG), $ WORK( 2*N+1 ), IWORK, PIVMIN, $ LGPVMN, LGSPDM, IN, IINFO ) IF( IINFO .NE. 0 ) THEN INFO = -4 RETURN END IF * DLARRB2 computes all gaps correctly except for the last one * Record distance to VU/GU WGAP( WEND ) = MAX( ZERO, $ ( VU-SIGMA ) - ( W( WEND ) + WERR( WEND ) ) ) DO 140 I = INDL, INDU M = M + 1 IBLOCK(M) = JBLK INDEXW(M) = I 140 CONTINUE * * proceed with next block IBEGIN = IEND + 1 WBEGIN = WEND + 1 170 CONTINUE * IF (M.LT.DOU-DOL+1) THEN INFO = -9 ENDIF RETURN * * end of DLARRE2A * END scalapack-2.0.2/SRC/dlarrf2.f000644 000766 000024 00000026215 11657111056 016054 0ustar00juliestaff000000 000000 SUBROUTINE DLARRF2( N, D, L, LD, CLSTRT, CLEND, $ CLMID1, CLMID2, W, WGAP, WERR, TRYMID, $ SPDIAM, CLGAPL, CLGAPR, PIVMIN, SIGMA, $ DPLUS, LPLUS, WORK, INFO ) * * -- ScaLAPACK computational routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER CLSTRT, CLEND, CLMID1, CLMID2, INFO, N DOUBLE PRECISION CLGAPL, CLGAPR, PIVMIN, SIGMA, SPDIAM LOGICAL TRYMID * .. * .. Array Arguments .. DOUBLE PRECISION D( * ), DPLUS( * ), L( * ), LD( * ), $ LPLUS( * ), W( * ), WGAP( * ), WERR( * ), WORK( * ) * .. * * Purpose * ======= * * Given the initial representation L D L^T and its cluster of close * eigenvalues (in a relative measure), W( CLSTRT ), W( CLSTRT+1 ), ... * W( CLEND ), DLARRF2 finds a new relatively robust representation * L D L^T - SIGMA I = L(+) D(+) L(+)^T such that at least one of the * eigenvalues of L(+) D(+) L(+)^T is relatively isolated. * * This is an enhanced version of DLARRF that also tries shifts in * the middle of the cluster, should there be a large gap, in order to * break large clusters into at least two pieces. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix (subblock, if the matrix splitted). * * D (input) DOUBLE PRECISION array, dimension (N) * The N diagonal elements of the diagonal matrix D. * * L (input) DOUBLE PRECISION array, dimension (N-1) * The (N-1) subdiagonal elements of the unit bidiagonal * matrix L. * * LD (input) DOUBLE PRECISION array, dimension (N-1) * The (N-1) elements L(i)*D(i). * * CLSTRT (input) INTEGER * The index of the first eigenvalue in the cluster. * * CLEND (input) INTEGER * The index of the last eigenvalue in the cluster. * * CLMID1,2(input) INTEGER * The index of a middle eigenvalue pair with large gap * * W (input) DOUBLE PRECISION array, dimension >= (CLEND-CLSTRT+1) * The eigenvalue APPROXIMATIONS of L D L^T in ascending order. * W( CLSTRT ) through W( CLEND ) form the cluster of relatively * close eigenalues. * * WGAP (input/output) DOUBLE PRECISION array, dimension >= (CLEND-CLSTRT+1) * The separation from the right neighbor eigenvalue in W. * * WERR (input) DOUBLE PRECISION array, dimension >= (CLEND-CLSTRT+1) * WERR contain the semiwidth of the uncertainty * interval of the corresponding eigenvalue APPROXIMATION in W * * SPDIAM (input) estimate of the spectral diameter obtained from the * Gerschgorin intervals * * CLGAPL, CLGAPR (input) absolute gap on each end of the cluster. * Set by the calling routine to protect against shifts too close * to eigenvalues outside the cluster. * * PIVMIN (input) DOUBLE PRECISION * The minimum pivot allowed in the sturm sequence. * * SIGMA (output) DOUBLE PRECISION * The shift used to form L(+) D(+) L(+)^T. * * DPLUS (output) DOUBLE PRECISION array, dimension (N) * The N diagonal elements of the diagonal matrix D(+). * * LPLUS (output) DOUBLE PRECISION array, dimension (N-1) * The first (N-1) elements of LPLUS contain the subdiagonal * elements of the unit bidiagonal matrix L(+). * * WORK (workspace) DOUBLE PRECISION array, dimension (2*N) * Workspace. * * Further Details * =============== * * Based on contributions by * Beresford Parlett, University of California, Berkeley, USA * Jim Demmel, University of California, Berkeley, USA * Inderjit Dhillon, University of Texas, Austin, USA * Osni Marques, LBNL/NERSC, USA * Christof Voemel, University of California, Berkeley, USA * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION FOUR, MAXGROWTH1, MAXGROWTH2, ONE, QUART, TWO PARAMETER ( ONE = 1.0D0, TWO = 2.0D0, $ FOUR = 4.0D0, QUART = 0.25D0, $ MAXGROWTH1 = 8.D0, $ MAXGROWTH2 = 8.D0 ) * .. * .. Local Scalars .. LOGICAL DORRR1, NOFAIL, SAWNAN1, SAWNAN2, TRYRRR1 INTEGER BI,I,J,KTRY,KTRYMAX,SLEFT,SRIGHT,SMID,SHIFT PARAMETER ( KTRYMAX = 1, SMID =0, SLEFT = 1, SRIGHT = 2 ) * DSTQDS loops will be blocked to detect NaNs earlier if they occur INTEGER BLKLEN PARAMETER ( BLKLEN = 512 ) DOUBLE PRECISION AVGAP, BESTSHIFT, CLWDTH, EPS, FACT, FAIL, $ FAIL2, GROWTHBOUND, LDELTA, LDMAX, LEASTGROWTH, $ LSIGMA, MAX1, MAX2, MINGAP, MSIGMA1, MSIGMA2, $ OLDP, PROD, RDELTA, RDMAX, RRR1, RRR2, RSIGMA, $ S, TMP, ZNM2 * .. * .. External Functions .. LOGICAL DISNAN DOUBLE PRECISION DLAMCH EXTERNAL DISNAN, DLAMCH * .. * .. External Subroutines .. EXTERNAL DCOPY * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * INFO = 0 FACT = DBLE(2**KTRYMAX) EPS = DLAMCH( 'Precision' ) SHIFT = 0 * Decide whether the code should accept the best among all * representations despite large element growth or signal INFO=1 NOFAIL = .TRUE. * * Compute the average gap length of the cluster CLWDTH = ABS(W(CLEND)-W(CLSTRT)) + WERR(CLEND) + WERR(CLSTRT) AVGAP = CLWDTH / DBLE(CLEND-CLSTRT) MINGAP = MIN(CLGAPL, CLGAPR) * Initial values for shifts to both ends of cluster LSIGMA = MIN(W( CLSTRT ),W( CLEND )) - WERR( CLSTRT ) RSIGMA = MAX(W( CLSTRT ),W( CLEND )) + WERR( CLEND ) MSIGMA1 = W( CLMID1 ) + WERR( CLMID1 ) MSIGMA2 = W( CLMID2 ) - WERR( CLMID2 ) * Use a small fudge to make sure that we really shift to the outside LSIGMA = LSIGMA - ABS(LSIGMA)* FOUR * EPS RSIGMA = RSIGMA + ABS(RSIGMA)* FOUR * EPS * Compute upper bounds for how much to back off the initial shifts LDMAX = QUART * MINGAP + TWO * PIVMIN RDMAX = QUART * MINGAP + TWO * PIVMIN LDELTA = MAX(AVGAP,WGAP( CLSTRT ))/FACT RDELTA = MAX(AVGAP,WGAP( CLEND-1 ))/FACT * * Initialize the record of the best representation found * S = DLAMCH( 'S' ) LEASTGROWTH = ONE / S FAIL = DBLE(N-1)*MINGAP/(SPDIAM*EPS) FAIL2 = DBLE(N-1)*MINGAP/(SPDIAM*SQRT(EPS)) GROWTHBOUND = MAXGROWTH1*SPDIAM * * Set default best shift * BESTSHIFT = LSIGMA IF(.NOT.TRYMID) GOTO 4 * * Try shifts in the middle * SHIFT = SMID DO 3 J=1,2 SAWNAN1 = .FALSE. IF(J.EQ.1) THEN * Try left middle point SIGMA = MSIGMA1 ELSE * Try left middle point SIGMA = MSIGMA2 ENDIF S = -SIGMA DPLUS( 1 ) = D( 1 ) + S MAX1 = ABS( DPLUS( 1 ) ) DO 2 BI = 1, N-1, BLKLEN DO 1 I = BI, MIN( BI+BLKLEN-1, N-1) LPLUS( I ) = LD( I ) / DPLUS( I ) S = S*LPLUS( I )*L( I ) - SIGMA DPLUS( I+1 ) = D( I+1 ) + S MAX1 = MAX( MAX1,ABS(DPLUS(I+1)) ) 1 CONTINUE SAWNAN1=SAWNAN1 .OR. DISNAN(MAX1) IF (SAWNAN1) GOTO 3 2 CONTINUE IF( .NOT.SAWNAN1 ) THEN IF( MAX1.LE.GROWTHBOUND ) THEN GOTO 100 ELSE IF( MAX1.LE.LEASTGROWTH ) THEN LEASTGROWTH = MAX1 BESTSHIFT = SIGMA ENDIF ENDIF 3 CONTINUE 4 CONTINUE * * Shifts in the middle not tried or not succeeded * Find best shift on the outside of the cluster * * while (KTRY <= KTRYMAX) KTRY = 0 * * * 5 CONTINUE * Compute element growth when shifting to both ends of the cluster * accept shift if there is no element growth at one of the two ends * Left end SAWNAN1 = .FALSE. S = -LSIGMA DPLUS( 1 ) = D( 1 ) + S MAX1 = ABS( DPLUS( 1 ) ) DO 12 BI = 1, N-1, BLKLEN DO 11 I = BI, MIN( BI+BLKLEN-1, N-1) LPLUS( I ) = LD( I ) / DPLUS( I ) S = S*LPLUS( I )*L( I ) - LSIGMA DPLUS( I+1 ) = D( I+1 ) + S MAX1 = MAX( MAX1,ABS(DPLUS(I+1)) ) 11 CONTINUE SAWNAN1=SAWNAN1 .OR. DISNAN(MAX1) IF (SAWNAN1) GOTO 13 12 CONTINUE IF( .NOT.SAWNAN1 ) THEN IF( MAX1.LE.GROWTHBOUND ) THEN SIGMA = LSIGMA SHIFT = SLEFT GOTO 100 ELSE IF( MAX1.LE.LEASTGROWTH ) THEN LEASTGROWTH = MAX1 BESTSHIFT = LSIGMA ENDIF ENDIF 13 CONTINUE * Right end SAWNAN2 = .FALSE. S = -RSIGMA WORK( 1 ) = D( 1 ) + S MAX2 = ABS( WORK( 1 ) ) DO 22 BI = 1, N-1, BLKLEN DO 21 I = BI, MIN( BI+BLKLEN-1, N-1) WORK( N+I ) = LD( I ) / WORK( I ) S = S*WORK( N+I )*L( I ) - RSIGMA WORK( I+1 ) = D( I+1 ) + S MAX2 = MAX( MAX2,ABS(WORK(I+1)) ) 21 CONTINUE SAWNAN2=SAWNAN2 .OR. DISNAN(MAX2) IF (SAWNAN2) GOTO 23 22 CONTINUE IF( .NOT.SAWNAN2 ) THEN IF( MAX2.LE.GROWTHBOUND ) THEN SIGMA = RSIGMA SHIFT = SRIGHT GOTO 100 ELSE IF( MAX2.LE.LEASTGROWTH ) THEN LEASTGROWTH = MAX2 BESTSHIFT = RSIGMA ENDIF ENDIF 23 CONTINUE * If we are at this point, both shifts led to too much element growth 50 CONTINUE IF (KTRY.LT.KTRYMAX) THEN * If we are here, both shifts failed also the RRR test. * Back off to the outside LSIGMA = MAX( LSIGMA - LDELTA, $ LSIGMA - LDMAX) RSIGMA = MIN( RSIGMA + RDELTA, $ RSIGMA + RDMAX ) LDELTA = TWO * LDELTA RDELTA = TWO * RDELTA * Ensure that we do not back off too much of the initial shifts LDELTA = MIN(LDMAX,LDELTA) RDELTA = MIN(RDMAX,RDELTA) KTRY = KTRY + 1 GOTO 5 ELSE * None of the representations investigated satisfied our * criteria. Take the best one we found. IF((LEASTGROWTH.LT.FAIL).OR.NOFAIL) THEN LSIGMA = BESTSHIFT SAWNAN1 = .FALSE. S = -LSIGMA DPLUS( 1 ) = D( 1 ) + S DO 6 I = 1, N - 1 LPLUS( I ) = LD( I ) / DPLUS( I ) S = S*LPLUS( I )*L( I ) - LSIGMA DPLUS( I+1 ) = D( I+1 ) + S IF(ABS(DPLUS(I+1)).LT.PIVMIN) THEN DPLUS(I+1) = -PIVMIN ENDIF 6 CONTINUE SIGMA = LSIGMA SHIFT = SLEFT GOTO 100 ELSE INFO = 1 RETURN ENDIF END IF 100 CONTINUE IF (SHIFT.EQ.SLEFT .OR. SHIFT.EQ.SMID ) THEN ELSEIF (SHIFT.EQ.SRIGHT) THEN * store new L and D back into DPLUS, LPLUS CALL DCOPY( N, WORK, 1, DPLUS, 1 ) CALL DCOPY( N-1, WORK(N+1), 1, LPLUS, 1 ) ENDIF RETURN * * End of DLARRF2 * END scalapack-2.0.2/SRC/dlarrv2.f000644 000766 000024 00000143304 11657111056 016073 0ustar00juliestaff000000 000000 SUBROUTINE DLARRV2( N, VL, VU, D, L, PIVMIN, $ ISPLIT, M, DOL, DOU, NEEDIL, NEEDIU, $ MINRGP, RTOL1, RTOL2, W, WERR, WGAP, $ IBLOCK, INDEXW, GERS, SDIAM, $ Z, LDZ, ISUPPZ, $ WORK, IWORK, VSTART, FINISH, $ MAXCLS, NDEPTH, PARITY, ZOFFSET, INFO ) * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER DOL, DOU, INFO, LDZ, M, N, MAXCLS, $ NDEPTH, NEEDIL, NEEDIU, PARITY, ZOFFSET DOUBLE PRECISION MINRGP, PIVMIN, RTOL1, RTOL2, VL, VU LOGICAL VSTART, FINISH * .. * .. Array Arguments .. INTEGER IBLOCK( * ), INDEXW( * ), ISPLIT( * ), $ ISUPPZ( * ), IWORK( * ) DOUBLE PRECISION D( * ), GERS( * ), L( * ), SDIAM( * ), $ W( * ), WERR( * ), $ WGAP( * ), WORK( * ) DOUBLE PRECISION Z( LDZ, * ) * * Purpose * ======= * * DLARRV2 computes the eigenvectors of the tridiagonal matrix * T = L D L^T given L, D and APPROXIMATIONS to the eigenvalues of L D L^T. * The input eigenvalues should have been computed by DLARRE2A * or by precious calls to DLARRV2. * * The major difference between the parallel and the sequential construction * of the representation tree is that in the parallel case, not all eigenvalues * of a given cluster might be computed locally. Other processors might "own" * and refine part of an eigenvalue cluster. This is crucial for scalability. * Thus there might be communication necessary before the current level of the * representation tree can be parsed. * * Please note: * 1. The calling sequence has two additional INTEGER parameters, * DOL and DOU, that should satisfy M>=DOU>=DOL>=1. * These parameters are only relevant for the case JOBZ = 'V'. * DLARRV2 ONLY computes the eigenVECTORS * corresponding to eigenvalues DOL through DOU in W. (That is, * instead of computing the eigenvectors belonging to W(1) * through W(M), only the eigenvectors belonging to eigenvalues * W(DOL) through W(DOU) are computed. In this case, only the * eigenvalues DOL:DOU are guaranteed to be accurately refined * to all figures by Rayleigh-Quotient iteration. * * 2. The additional arguments VSTART, FINISH, NDEPTH, PARITY, ZOFFSET * are included as a thread-safe implementation equivalent to SAVE variables. * These variables store details about the local representation tree which is * computed layerwise. For scalability reasons, eigenvalues belonging to the * locally relevant representation tree might be computed on other processors. * These need to be communicated before the inspection of the RRRs can proceed * on any given layer. * Note that only when the variable FINISH is true, the computation has ended * All eigenpairs between DOL and DOU have been computed. M is set = DOU - DOL + 1. * * 3. DLARRV2 needs more workspace in Z than the sequential DLARRV. * It is used to store the conformal embedding of the local representation tree. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix. N >= 0. * * VL (input) DOUBLE PRECISION * VU (input) DOUBLE PRECISION * Lower and upper bounds of the interval that contains the desired * eigenvalues. VL < VU. Needed to compute gaps on the left or right * end of the extremal eigenvalues in the desired RANGE. * VU is currently not used but kept as parameter in case needed. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the N diagonal elements of the diagonal matrix D. * On exit, D is overwritten. * * L (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the (N-1) subdiagonal elements of the unit * bidiagonal matrix L are in elements 1 to N-1 of L * (if the matrix is not splitted.) At the end of each block * is stored the corresponding shift as given by DLARRE. * On exit, L is overwritten. * * PIVMIN (in) DOUBLE PRECISION * The minimum pivot allowed in the sturm sequence. * * ISPLIT (input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into blocks. * The first block consists of rows/columns 1 to * ISPLIT( 1 ), the second of rows/columns ISPLIT( 1 )+1 * through ISPLIT( 2 ), etc. * * M (input) INTEGER * The total number of input eigenvalues. 0 <= M <= N. * * DOL (input) INTEGER * DOU (input) INTEGER * If the user wants to compute only selected eigenvectors from all * the eigenvalues supplied, he can specify an index range DOL:DOU. * Or else the setting DOL=1, DOU=M should be applied. * Note that DOL and DOU refer to the order in which the eigenvalues * are stored in W. * If the user wants to compute only selected eigenpairs, then * the columns DOL-1 to DOU+1 of the eigenvector space Z contain the * computed eigenvectors. All other columns of Z are set to zero. * If DOL > 1, then Z(:,DOL-1-ZOFFSET) is used. * If DOU < M, then Z(:,DOU+1-ZOFFSET) is used. * * * NEEDIL (input/output) INTEGER * NEEDIU (input/output) INTEGER * Describe which are the left and right outermost eigenvalues * that still need to be included in the computation. These indices * indicate whether eigenvalues from other processors are needed to * correctly compute the conformally embedded representation tree. * When DOL<=NEEDIL<=NEEDIU<=DOU, all required eigenvalues are local * to the processor and no communication is required to compute its * part of the representation tree. * * MINRGP (input) DOUBLE PRECISION * The minimum relativ gap threshold to decide whether an eigenvalue * or a cluster boundary is reached. * * RTOL1 (input) DOUBLE PRECISION * RTOL2 (input) DOUBLE PRECISION * Parameters for bisection. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) * * W (input/output) DOUBLE PRECISION array, dimension (N) * The first M elements of W contain the APPROXIMATE eigenvalues for * which eigenvectors are to be computed. The eigenvalues * should be grouped by split-off block and ordered from * smallest to largest within the block. (The output array * W from DSTEGR2A is expected here.) Furthermore, they are with * respect to the shift of the corresponding root representation * for their block. On exit, * W holds those UNshifted eigenvalues * for which eigenvectors have already been computed. * * WERR (input/output) DOUBLE PRECISION array, dimension (N) * The first M elements contain the semiwidth of the uncertainty * interval of the corresponding eigenvalue in W * * WGAP (input/output) DOUBLE PRECISION array, dimension (N) * The separation from the right neighbor eigenvalue in W. * * IBLOCK (input) INTEGER array, dimension (N) * The indices of the blocks (submatrices) associated with the * corresponding eigenvalues in W; IBLOCK(i)=1 if eigenvalue * W(i) belongs to the first block from the top, =2 if W(i) * belongs to the second block, etc. * * INDEXW (input) INTEGER array, dimension (N) * The indices of the eigenvalues within each block (submatrix); * for example, INDEXW(i)= 10 and IBLOCK(i)=2 imply that the * i-th eigenvalue W(i) is the 10-th eigenvalue in the second block. * * GERS (input) DOUBLE PRECISION array, dimension (2*N) * The N Gerschgorin intervals (the i-th Gerschgorin interval * is (GERS(2*i-1), GERS(2*i)). The Gerschgorin intervals should * be computed from the original UNshifted matrix. * Currently NOT used but kept as parameter in case it becomes * needed in the future. * * SDIAM (input) DOUBLE PRECISION array, dimension (N) * The spectral diameters for all unreduced blocks. * * Z (output) DOUBLE PRECISION array, dimension (LDZ, max(1,M) ) * If INFO = 0, the first M columns of Z contain the * orthonormal eigenvectors of the matrix T * corresponding to the input eigenvalues, with the i-th * column of Z holding the eigenvector associated with W(i). * In the distributed version, only a subset of columns * is accessed, see DOL,DOU and ZOFFSET. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * JOBZ = 'V', LDZ >= max(1,N). * * ISUPPZ (output) INTEGER array, dimension ( 2*max(1,M) ) * The support of the eigenvectors in Z, i.e., the indices * indicating the nonzero elements in Z. The I-th eigenvector * is nonzero only in elements ISUPPZ( 2*I-1 ) through * ISUPPZ( 2*I ). * * WORK (workspace) DOUBLE PRECISION array, dimension (12*N) * * IWORK (workspace) INTEGER array, dimension (7*N) * * VSTART (input/output) LOGICAL * .TRUE. on initialization, set to .FALSE. afterwards. * * FINISH (input/output) LOGICAL * A flag that indicates whether all eigenpairs have been computed. * * MAXCLS (input/output) INTEGER * The largest cluster worked on by this processor in the * representation tree. * * NDEPTH (input/output) INTEGER * The current depth of the representation tree. Set to * zero on initial pass, changed when the deeper levels of * the representation tree are generated. * * PARITY (input/output) INTEGER * An internal parameter needed for the storage of the * clusters on the current level of the representation tree. * * ZOFFSET (input) INTEGER * Offset for storing the eigenpairs when Z is distributed * in 1D-cyclic fashion. * * INFO (output) INTEGER * = 0: successful exit * * > 0: A problem occured in DLARRV2. * < 0: One of the called subroutines signaled an internal problem. * Needs inspection of the corresponding parameter IINFO * for further information. * * =-1: Problem in DLARRB2 when refining a child's eigenvalues. * =-2: Problem in DLARRF2 when computing the RRR of a child. * When a child is inside a tight cluster, it can be difficult * to find an RRR. A partial remedy from the user's point of * view is to make the parameter MINRGP smaller and recompile. * However, as the orthogonality of the computed vectors is * proportional to 1/MINRGP, the user should be aware that * he might be trading in precision when he decreases MINRGP. * =-3: Problem in DLARRB2 when refining a single eigenvalue * after the Rayleigh correction was rejected. * = 5: The Rayleigh Quotient Iteration failed to converge to * full accuracy in MAXITR steps. * * ===================================================================== * * .. Parameters .. INTEGER MAXITR, USE30, USE31, USE32A, USE32B PARAMETER ( MAXITR = 10, USE30=30, USE31=31, $ USE32A=3210, USE32B = 3211 ) DOUBLE PRECISION ZERO, ONE, TWO, THREE, FOUR, HALF PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, $ TWO = 2.0D0, THREE = 3.0D0, $ FOUR = 4.0D0, HALF = 0.5D0) * .. * .. Local Arrays .. INTEGER SPLACE( 4 ) * .. * .. Local Scalars .. LOGICAL DELREF, ESKIP, NEEDBS, ONLYLC, STP2II, TRYMID, $ TRYRQC, USEDBS, USEDRQ INTEGER I, IBEGIN, IEND, II, IINCLS, IINDC1, IINDC2, $ IINDWK, IINFO, IM, IN, INDEIG, INDLD, INDLLD, $ INDWRK, ISUPMN, ISUPMX, ITER, ITMP1, ITWIST, J, $ JBLK, K, KK, MINIWSIZE, MINWSIZE, MYWFST, $ MYWLST, NCLUS, NEGCNT, NEWCLS, NEWFST, NEWFTT, $ NEWLST, NEWSIZ, OFFSET, OLDCLS, OLDFST, OLDIEN, $ OLDLST, OLDNCL, P, Q, VRTREE, WBEGIN, WEND, $ WINDEX, WINDMN, WINDPL, ZFROM, ZINDEX, ZTO, $ ZUSEDL, ZUSEDU, ZUSEDW DOUBLE PRECISION AVGAP, BSTRES, BSTW, ENUFGP, EPS, FUDGE, GAP, $ GAPTOL, LAMBDA, LEFT, LGAP, LGPVMN, LGSPDM, $ LOG_IN, MGAP, MINGMA, MYERR, NRMINV, NXTERR, $ ORTOL, RESID, RGAP, RIGHT, RLTL30, RQCORR, $ RQTOL, SAVEGP, SGNDEF, SIGMA, SPDIAM, SSIGMA, $ TAU, TMP, TOL, ZTZ * .. * .. External Functions .. DOUBLE PRECISION DLAMCH DOUBLE PRECISION DDOT, DNRM2 EXTERNAL DDOT, DLAMCH, DNRM2 * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DLAR1VA, DLARRB2, $ DLARRF2, DLASET, DSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, SQRT * .. * .. Executable Statements .. * .. INFO = 0 * The first N entries of WORK are reserved for the eigenvalues INDLD = N+1 INDLLD= 2*N+1 INDWRK= 3*N+1 MINWSIZE = 12 * N * IWORK(IINCLS+JBLK) holds the number of clusters on the current level * of the reptree for block JBLK IINCLS = 0 * IWORK(IINDC1+1:IINC2+N) are used to store the clusters of the current * layer and the one above. IINDC1 = N IINDC2 = 2*N IINDWK = 3*N + 1 MINIWSIZE = 7 * N EPS = DLAMCH( 'Precision' ) RQTOL = TWO * EPS TRYRQC = .TRUE. * Decide which representation tree criterion to use * USE30 = Lapack 3.0 criterion * USE31 = LAPACK 3.1 criterion * USE32A = two criteria, determines singletons with USE31, and groups with avgap. * USE32B = two criteria, determines singletons with USE31, and groups with USE30. VRTREE = USE32A * LGPVMN = LOG( PIVMIN ) IF(VSTART) THEN * * PREPROCESSING, DONE ONLY IN THE FIRST CALL * VSTART = .FALSE. * MAXCLS = 1 * Set delayed eigenvalue refinement * In order to enable more parallelism, refinement * must be done immediately and cannot be delayed until * the next representation tree level. DELREF = .FALSE. DO 1 I= 1,MINWSIZE WORK( I ) = ZERO 1 CONTINUE DO 2 I= 1,MINIWSIZE IWORK( I ) = 0 2 CONTINUE ZUSEDL = 1 IF(DOL.GT.1) THEN * Set lower bound for use of Z ZUSEDL = DOL-1 ENDIF ZUSEDU = M IF(DOU.LT.M) THEN * Set lower bound for use of Z ZUSEDU = DOU+1 ENDIF * The width of the part of Z that is used ZUSEDW = ZUSEDU - ZUSEDL + 1 * CALL DLASET( 'Full', N, ZUSEDW, ZERO, ZERO, $ Z(1,(ZUSEDL-ZOFFSET)), LDZ ) * Initialize NDEPTH, the current depth of the representation tree NDEPTH = 0 * Initialize parity PARITY = 1 * Go through blocks, initialize data structures IBEGIN = 1 WBEGIN = 1 DO 10 JBLK = 1, IBLOCK( M ) IEND = ISPLIT( JBLK ) SIGMA = L( IEND ) WEND = WBEGIN - 1 3 CONTINUE IF( WEND.LT.M ) THEN IF( IBLOCK( WEND+1 ).EQ.JBLK ) THEN WEND = WEND + 1 GO TO 3 END IF END IF IF( WEND.LT.WBEGIN ) THEN IWORK( IINCLS + JBLK ) = 0 IBEGIN = IEND + 1 GO TO 10 ELSEIF( (WEND.LT.DOL).OR.(WBEGIN.GT.DOU) ) THEN IWORK( IINCLS + JBLK ) = 0 IBEGIN = IEND + 1 WBEGIN = WEND + 1 GO TO 10 END IF * The number of eigenvalues in the current block IM = WEND - WBEGIN + 1 * This is for a 1x1 block IF( IBEGIN.EQ.IEND ) THEN IWORK( IINCLS + JBLK ) = 0 Z( IBEGIN, (WBEGIN-ZOFFSET) ) = ONE ISUPPZ( 2*WBEGIN-1 ) = IBEGIN ISUPPZ( 2*WBEGIN ) = IBEGIN W( WBEGIN ) = W( WBEGIN ) + SIGMA WORK( WBEGIN ) = W( WBEGIN ) IBEGIN = IEND + 1 WBEGIN = WBEGIN + 1 GO TO 10 END IF CALL DCOPY( IM, W( WBEGIN ), 1, & WORK( WBEGIN ), 1 ) * We store in W the eigenvalue approximations w.r.t. the original * matrix T. DO 5 I=1,IM W(WBEGIN+I-1) = W(WBEGIN+I-1)+SIGMA 5 CONTINUE * Initialize cluster counter for this block IWORK( IINCLS + JBLK ) = 1 IWORK( IINDC1+IBEGIN ) = 1 IWORK( IINDC1+IBEGIN+1 ) = IM * IBEGIN = IEND + 1 WBEGIN = WEND + 1 10 CONTINUE * ENDIF * Init NEEDIL and NEEDIU NEEDIL = DOU NEEDIU = DOL * Here starts the main loop * Only one pass through the loop is done until no collaboration * with other processors is needed. 40 CONTINUE PARITY = 1 - PARITY * For each block, build next level of representation tree * if there are still remaining clusters IBEGIN = 1 WBEGIN = 1 DO 170 JBLK = 1, IBLOCK( M ) IEND = ISPLIT( JBLK ) SIGMA = L( IEND ) * Find the eigenvectors of the submatrix indexed IBEGIN * through IEND. IF(M.EQ.N) THEN * all eigenpairs are computed WEND = IEND ELSE * count how many wanted eigenpairs are in this block WEND = WBEGIN - 1 15 CONTINUE IF( WEND.LT.M ) THEN IF( IBLOCK( WEND+1 ).EQ.JBLK ) THEN WEND = WEND + 1 GO TO 15 END IF END IF ENDIF OLDNCL = IWORK( IINCLS + JBLK ) IF( OLDNCL.EQ.0 ) THEN IBEGIN = IEND + 1 WBEGIN = WEND + 1 GO TO 170 END IF * OLDIEN is the last index of the previous block OLDIEN = IBEGIN - 1 * Calculate the size of the current block IN = IEND - IBEGIN + 1 * The number of eigenvalues in the current block IM = WEND - WBEGIN + 1 * Find local spectral diameter of the block SPDIAM = SDIAM(JBLK) LGSPDM = LOG( SPDIAM + PIVMIN ) * Compute ORTOL parameter, similar to DSTEIN ORTOL = SPDIAM*1.0D-3 * Compute average gap AVGAP = SPDIAM/DBLE(IN-1) * Compute the minimum of average gap and ORTOL parameter * This can used as a lower bound for acceptable separation * between eigenvalues ENUFGP = MIN(ORTOL,AVGAP) * Any 1x1 block has been treated before * loop while( OLDNCLS.GT.0 ) * generate the next representation tree level for the current block IF( OLDNCL.GT.0 ) THEN * This is a crude protection against infinitely deep trees IF( NDEPTH.GT.M ) THEN INFO = -2 RETURN ENDIF * breadth first processing of the current level of the representation * tree: OLDNCL = number of clusters on current level * NCLUS is the number of clusters for the next level of the reptree * reset NCLUS to count the number of child clusters NCLUS = 0 * LOG_IN = LOG(DBLE(IN)) * RLTL30 = MIN( 1.0D-2, ONE / DBLE( IN ) ) * IF( PARITY.EQ.0 ) THEN OLDCLS = IINDC1+IBEGIN-1 NEWCLS = IINDC2+IBEGIN-1 ELSE OLDCLS = IINDC2+IBEGIN-1 NEWCLS = IINDC1+IBEGIN-1 END IF * Process the clusters on the current level DO 150 I = 1, OLDNCL J = OLDCLS + 2*I * OLDFST, OLDLST = first, last index of current cluster. * cluster indices start with 1 and are relative * to WBEGIN when accessing W, WGAP, WERR, Z OLDFST = IWORK( J-1 ) OLDLST = IWORK( J ) IF( NDEPTH.GT.0 ) THEN * Retrieve relatively robust representation (RRR) of cluster * that has been computed at the previous level * The RRR is stored in Z and overwritten once the eigenvectors * have been computed or when the cluster is refined IF((DOL.EQ.1).AND.(DOU.EQ.M)) THEN * Get representation from location of the leftmost evalue * of the cluster J = WBEGIN + OLDFST - 1 ELSE IF(WBEGIN+OLDFST-1.LT.DOL) THEN * Get representation from the left end of Z array J = DOL - 1 ELSEIF(WBEGIN+OLDFST-1.GT.DOU) THEN * Get representation from the right end of Z array J = DOU ELSE J = WBEGIN + OLDFST - 1 ENDIF ENDIF CALL DCOPY( IN, Z( IBEGIN, (J-ZOFFSET) ), $ 1, D( IBEGIN ), 1 ) CALL DCOPY( IN-1, Z( IBEGIN, (J+1-ZOFFSET) ), $ 1, L( IBEGIN ),1 ) SIGMA = Z( IEND, (J+1-ZOFFSET) ) * Set the corresponding entries in Z to zero CALL DLASET( 'Full', IN, 2, ZERO, ZERO, $ Z( IBEGIN, (J-ZOFFSET) ), LDZ ) END IF * Compute DL and DLL of current RRR DO 50 J = IBEGIN, IEND-1 TMP = D( J )*L( J ) WORK( INDLD-1+J ) = TMP WORK( INDLLD-1+J ) = TMP*L( J ) 50 CONTINUE IF( NDEPTH.GT.0 .AND. DELREF ) THEN * P and Q are index of the first and last eigenvalue to compute * within the current block P = INDEXW( WBEGIN-1+OLDFST ) Q = INDEXW( WBEGIN-1+OLDLST ) * Offset for the arrays WORK, WGAP and WERR, i.e., th P-OFFSET * thru' Q-OFFSET elements of these arrays are to be used. C OFFSET = P-OLDFST OFFSET = INDEXW( WBEGIN ) - 1 * perform limited bisection (if necessary) to get approximate * eigenvalues to the precision needed. CALL DLARRB2( IN, D( IBEGIN ), $ WORK(INDLLD+IBEGIN-1), $ P, Q, RTOL1, RTOL2, OFFSET, $ WORK(WBEGIN),WGAP(WBEGIN),WERR(WBEGIN), $ WORK( INDWRK ), IWORK( IINDWK ), $ PIVMIN, LGPVMN, LGSPDM, IN, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF * We also recompute the extremal gaps. W holds all eigenvalues * of the unshifted matrix and must be used for computation * of WGAP, the entries of WORK might stem from RRRs with * different shifts. The gaps from WBEGIN-1+OLDFST to * WBEGIN-1+OLDLST are correctly computed in DLARRB2. * However, we only allow the gaps to become greater since * this is what should happen when we decrease WERR IF( OLDFST.GT.1) THEN WGAP( WBEGIN+OLDFST-2 ) = $ MAX(WGAP(WBEGIN+OLDFST-2), $ W(WBEGIN+OLDFST-1)-WERR(WBEGIN+OLDFST-1) $ - W(WBEGIN+OLDFST-2)-WERR(WBEGIN+OLDFST-2) ) ENDIF IF( WBEGIN + OLDLST -1 .LT. WEND ) THEN WGAP( WBEGIN+OLDLST-1 ) = $ MAX(WGAP(WBEGIN+OLDLST-1), $ W(WBEGIN+OLDLST)-WERR(WBEGIN+OLDLST) $ - W(WBEGIN+OLDLST-1)-WERR(WBEGIN+OLDLST-1) ) ENDIF * Each time the eigenvalues in WORK get refined, we store * the newly found approximation with all shifts applied in W DO 53 J=OLDFST,OLDLST W(WBEGIN+J-1) = WORK(WBEGIN+J-1)+SIGMA 53 CONTINUE ELSEIF( (NDEPTH.EQ.0) .OR. (.NOT.DELREF) ) THEN * Some of the eigenvalues might have been computed on * other processors * Recompute gaps for this cluster * (all eigenvalues have the same * representation, i.e. the same shift, so this is easy) DO 54 J = OLDFST, OLDLST-1 MYERR = WERR(WBEGIN + J - 1) NXTERR = WERR(WBEGIN + J ) WGAP(WBEGIN+J-1) = MAX(WGAP(WBEGIN+J-1), $ ( WORK(WBEGIN+J) - NXTERR ) $ - ( WORK(WBEGIN+J-1) + MYERR ) $ ) 54 CONTINUE END IF * * Process the current node. * NEWFST = OLDFST DO 140 J = OLDFST, OLDLST IF( J.EQ.OLDLST ) THEN * we are at the right end of the cluster, this is also the * boundary of the child cluster NEWLST = J ELSE IF (VRTREE.EQ.USE30) THEN IF(WGAP( WBEGIN + J -1).GE. $ RLTL30 * ABS(WORK(WBEGIN + J -1)) ) THEN * the right relgap is big enough by the Lapack 3.0 criterion NEWLST = J ELSE * inside a child cluster, the relative gap is not * big enough. GOTO 140 ENDIF ELSE IF (VRTREE.EQ.USE31) THEN IF ( WGAP( WBEGIN + J -1).GE. $ MINRGP* ABS( WORK(WBEGIN + J -1) ) ) THEN * the right relgap is big enough by the Lapack 3.1 criterion * (NEWFST,..,NEWLST) is well separated from the following NEWLST = J ELSE * inside a child cluster, the relative gap is not * big enough. GOTO 140 ENDIF ELSE IF (VRTREE.EQ.USE32A) THEN IF( (J.EQ.OLDFST).AND.( WGAP(WBEGIN+J-1).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ) ) THEN * the right relgap is big enough by the Lapack 3.1 criterion * Found a singleton NEWLST = J ELSE IF( (J.GT.OLDFST).AND.(J.EQ.NEWFST).AND. $ ( WGAP(WBEGIN+J-2).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ).AND. $ ( WGAP(WBEGIN+J-1).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ) $ ) THEN * Found a singleton NEWLST = J ELSE IF( (J.GT.NEWFST).AND.WGAP(WBEGIN+J-1).GE. $ (MINRGP*ABS(WORK(WBEGIN+J-1)) ) ) $ THEN * the right relgap is big enough by the Lapack 3.1 criterion NEWLST = J ELSE IF((J.GT.NEWFST).AND.(J+1.LT.OLDLST).AND. $ (WGAP(WBEGIN+J-1).GE.ENUFGP)) $ THEN * the right gap is bigger than ENUFGP * Care needs to be taken with this criterion to make * sure it does not create a remaining `false' singleton NEWLST = J ELSE * inside a child cluster, the relative gap is not * big enough. GOTO 140 ENDIF ELSE IF (VRTREE.EQ.USE32B) THEN IF( (J.EQ.OLDFST).AND.( WGAP(WBEGIN+J-1).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ) ) THEN * the right relgap is big enough by the Lapack 3.1 criterion * Found a singleton NEWLST = J ELSE IF( (J.GT.OLDFST).AND.(J.EQ.NEWFST).AND. $ ( WGAP(WBEGIN+J-2).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ).AND. $ ( WGAP(WBEGIN+J-1).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ) $ ) THEN * Found a singleton NEWLST = J ELSE IF( (J.GT.NEWFST).AND.WGAP(WBEGIN+J-1).GE. $ (MINRGP*ABS(WORK(WBEGIN+J-1)) ) ) $ THEN * the right relgap is big enough by the Lapack 3.1 criterion NEWLST = J ELSE IF((J.GT.NEWFST).AND.(J+1.LT.OLDLST).AND. $ (WGAP( WBEGIN + J -1).GE. $ RLTL30 * ABS(WORK(WBEGIN + J -1)) )) $ THEN * the right relgap is big enough by the Lapack 3.0 criterion * Care needs to be taken with this criterion to make * sure it does not create a remaining `false' singleton NEWLST = J ELSE * inside a child cluster, the relative gap is not * big enough. GOTO 140 ENDIF END IF END IF * Compute size of child cluster found NEWSIZ = NEWLST - NEWFST + 1 MAXCLS = MAX( NEWSIZ, MAXCLS ) * NEWFTT is the place in Z where the new RRR or the computed * eigenvector is to be stored IF((DOL.EQ.1).AND.(DOU.EQ.M)) THEN * Store representation at location of the leftmost evalue * of the cluster NEWFTT = WBEGIN + NEWFST - 1 ELSE IF(WBEGIN+NEWFST-1.LT.DOL) THEN * Store representation at the left end of Z array NEWFTT = DOL - 1 ELSEIF(WBEGIN+NEWFST-1.GT.DOU) THEN * Store representation at the right end of Z array NEWFTT = DOU ELSE NEWFTT = WBEGIN + NEWFST - 1 ENDIF ENDIF * FOR 1D-DISTRIBUTED Z, COMPUTE NEWFTT SHIFTED BY ZOFFSET NEWFTT = NEWFTT - ZOFFSET IF( NEWSIZ.GT.1) THEN * * Current child is not a singleton but a cluster. * * IF((WBEGIN+NEWLST-1.LT.DOL).OR. $ (WBEGIN+NEWFST-1.GT.DOU)) THEN * if the cluster contains no desired eigenvalues * skip the computation of that branch of the rep. tree GOTO 139 ENDIF * Compute left and right cluster gap. * IF( NEWFST.EQ.1 ) THEN LGAP = MAX( ZERO, $ W(WBEGIN)-WERR(WBEGIN) - VL ) ELSE LGAP = WGAP( WBEGIN+NEWFST-2 ) ENDIF RGAP = WGAP( WBEGIN+NEWLST-1 ) * * For larger clusters, record the largest gap observed * somewhere near the middle of the cluster as a possible * alternative position for a shift when TRYMID is TRUE * MGAP = ZERO IF(NEWSIZ.GE.50) THEN KK = NEWFST DO 545 K =NEWFST+NEWSIZ/3,NEWLST-NEWSIZ/3 IF(MGAP.LT.WGAP( WBEGIN+K-1 )) THEN KK = K MGAP = WGAP( WBEGIN+K-1 ) ENDIF 545 CONTINUE ENDIF * * Record the left- and right-most eigenvalues needed * for the next level of the representation tree NEEDIL = MIN(NEEDIL,WBEGIN+NEWFST-1) NEEDIU = MAX(NEEDIU,WBEGIN+NEWLST-1) * * Check if middle gap is large enough to shift there * GAP = MIN(LGAP,RGAP) TRYMID = (MGAP.GT.GAP) SPLACE(1) = NEWFST SPLACE(2) = NEWLST IF(TRYMID) THEN SPLACE(3) = KK SPLACE(4) = KK+1 ELSE SPLACE(3) = NEWFST SPLACE(4) = NEWLST ENDIF * * Compute left- and rightmost eigenvalue of child * to high precision in order to shift as close * as possible and obtain as large relative gaps * as possible * DO 55 K =1,4 P = INDEXW( WBEGIN-1+SPLACE(K) ) OFFSET = INDEXW( WBEGIN ) - 1 CALL DLARRB2( IN, D(IBEGIN), $ WORK( INDLLD+IBEGIN-1 ),P,P, $ RQTOL, RQTOL, OFFSET, $ WORK(WBEGIN),WGAP(WBEGIN), $ WERR(WBEGIN),WORK( INDWRK ), $ IWORK( IINDWK ), $ PIVMIN, LGPVMN, LGSPDM, IN, IINFO ) 55 CONTINUE * * Compute RRR of child cluster. * Note that the new RRR is stored in Z * C DLARRF2 needs LWORK = 2*N CALL DLARRF2( IN, D( IBEGIN ), L( IBEGIN ), $ WORK(INDLD+IBEGIN-1), $ SPLACE(1), SPLACE(2), $ SPLACE(3), SPLACE(4), WORK(WBEGIN), $ WGAP(WBEGIN), WERR(WBEGIN), TRYMID, $ SPDIAM, LGAP, RGAP, PIVMIN, TAU, $ Z( IBEGIN, NEWFTT ), $ Z( IBEGIN, NEWFTT+1 ), $ WORK( INDWRK ), IINFO ) IF( IINFO.EQ.0 ) THEN * a new RRR for the cluster was found by DLARRF2 * update shift and store it SSIGMA = SIGMA + TAU Z( IEND, NEWFTT+1 ) = SSIGMA * WORK() are the midpoints and WERR() the semi-width * Note that the entries in W are unchanged. DO 116 K = NEWFST, NEWLST FUDGE = $ THREE*EPS*ABS(WORK(WBEGIN+K-1)) WORK( WBEGIN + K - 1 ) = $ WORK( WBEGIN + K - 1) - TAU FUDGE = FUDGE + $ FOUR*EPS*ABS(WORK(WBEGIN+K-1)) * Fudge errors WERR( WBEGIN + K - 1 ) = $ WERR( WBEGIN + K - 1 ) + FUDGE 116 CONTINUE NCLUS = NCLUS + 1 K = NEWCLS + 2*NCLUS IWORK( K-1 ) = NEWFST IWORK( K ) = NEWLST * IF(.NOT.DELREF) THEN ONLYLC = .TRUE. * IF(ONLYLC) THEN MYWFST = MAX(WBEGIN-1+NEWFST,DOL-1) MYWLST = MIN(WBEGIN-1+NEWLST,DOU+1) ELSE MYWFST = WBEGIN-1+NEWFST MYWLST = WBEGIN-1+NEWLST ENDIF * Compute LLD of new RRR DO 5000 K = IBEGIN, IEND-1 WORK( INDWRK-1+K ) = $ Z(K,NEWFTT)* $ (Z(K,NEWFTT+1)**2) 5000 CONTINUE * P and Q are index of the first and last * eigenvalue to compute within the new cluster P = INDEXW( MYWFST ) Q = INDEXW( MYWLST ) * Offset for the arrays WORK, WGAP and WERR OFFSET = INDEXW( WBEGIN ) - 1 * perform limited bisection (if necessary) to get approximate * eigenvalues to the precision needed. CALL DLARRB2( IN, $ Z(IBEGIN, NEWFTT ), $ WORK(INDWRK+IBEGIN-1), $ P, Q, RTOL1, RTOL2, OFFSET, $ WORK(WBEGIN),WGAP(WBEGIN),WERR(WBEGIN), $ WORK( INDWRK+N ), IWORK( IINDWK ), $ PIVMIN, LGPVMN, LGSPDM, IN, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF * Each time the eigenvalues in WORK get refined, we store * the newly found approximation with all shifts applied in W DO 5003 K=NEWFST,NEWLST W(WBEGIN+K-1) = WORK(WBEGIN+K-1)+SSIGMA 5003 CONTINUE ENDIF * ELSE INFO = -2 RETURN ENDIF ELSE * * Compute eigenvector of singleton * ITER = 0 * TOL = FOUR * LOG_IN * EPS * K = NEWFST WINDEX = WBEGIN + K - 1 ZINDEX = WINDEX - ZOFFSET WINDMN = MAX(WINDEX - 1,1) WINDPL = MIN(WINDEX + 1,M) LAMBDA = WORK( WINDEX ) * Check if eigenvector computation is to be skipped IF((WINDEX.LT.DOL).OR. $ (WINDEX.GT.DOU)) THEN ESKIP = .TRUE. GOTO 125 ELSE ESKIP = .FALSE. ENDIF LEFT = WORK( WINDEX ) - WERR( WINDEX ) RIGHT = WORK( WINDEX ) + WERR( WINDEX ) INDEIG = INDEXW( WINDEX ) IF( K .EQ. 1) THEN LGAP = EPS*MAX(ABS(LEFT),ABS(RIGHT)) ELSE LGAP = WGAP(WINDMN) ENDIF IF( K .EQ. IM) THEN RGAP = EPS*MAX(ABS(LEFT),ABS(RIGHT)) ELSE RGAP = WGAP(WINDEX) ENDIF GAP = MIN( LGAP, RGAP ) IF(( K .EQ. 1).OR.(K .EQ. IM)) THEN GAPTOL = ZERO ELSE GAPTOL = GAP * EPS ENDIF ISUPMN = IN ISUPMX = 1 * Update WGAP so that it holds the minimum gap * to the left or the right. This is crucial in the * case where bisection is used to ensure that the * eigenvalue is refined up to the required precision. * The correct value is restored afterwards. SAVEGP = WGAP(WINDEX) WGAP(WINDEX) = GAP * We want to use the Rayleigh Quotient Correction * as often as possible since it converges quadratically * when we are close enough to the desired eigenvalue. * However, the Rayleigh Quotient can have the wrong sign * and lead us away from the desired eigenvalue. In this * case, the best we can do is to use bisection. USEDBS = .FALSE. USEDRQ = .FALSE. * Bisection is initially turned off unless it is forced NEEDBS = .NOT.TRYRQC * Reset ITWIST ITWIST = 0 120 CONTINUE * Check if bisection should be used to refine eigenvalue IF(NEEDBS) THEN * Take the bisection as new iterate USEDBS = .TRUE. * Temporary copy of twist index needed ITMP1 = ITWIST OFFSET = INDEXW( WBEGIN ) - 1 CALL DLARRB2( IN, D(IBEGIN), $ WORK(INDLLD+IBEGIN-1),INDEIG,INDEIG, $ ZERO, TWO*EPS, OFFSET, $ WORK(WBEGIN),WGAP(WBEGIN), $ WERR(WBEGIN),WORK( INDWRK ), $ IWORK( IINDWK ), $ PIVMIN, LGPVMN, LGSPDM, ITMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -3 RETURN ENDIF LAMBDA = WORK( WINDEX ) * Reset twist index from inaccurate LAMBDA to * force computation of true MINGMA ITWIST = 0 ENDIF * Given LAMBDA, compute the eigenvector. CALL DLAR1VA( IN, 1, IN, LAMBDA, D(IBEGIN), $ L( IBEGIN ), WORK(INDLD+IBEGIN-1), $ WORK(INDLLD+IBEGIN-1), $ PIVMIN, GAPTOL, Z( IBEGIN, ZINDEX), $ .NOT.USEDBS, NEGCNT, ZTZ, MINGMA, $ ITWIST, ISUPPZ( 2*WINDEX-1 ), $ NRMINV, RESID, RQCORR, WORK( INDWRK ) ) IF(ITER .EQ. 0) THEN BSTRES = RESID BSTW = LAMBDA ELSEIF(RESID.LT.BSTRES) THEN BSTRES = RESID BSTW = LAMBDA ENDIF ISUPMN = MIN(ISUPMN,ISUPPZ( 2*WINDEX-1 )) ISUPMX = MAX(ISUPMX,ISUPPZ( 2*WINDEX )) ITER = ITER + 1 * * Convergence test for Rayleigh-Quotient iteration * (omitted when Bisection has been used) * IF( RESID.GT.TOL*GAP .AND. ABS( RQCORR ).GT. $ RQTOL*ABS( LAMBDA ) .AND. .NOT. USEDBS) $ THEN * We need to check that the RQCORR update doesn't * move the eigenvalue away from the desired one and * towards a neighbor. -> protection with bisection IF(INDEIG.LE.NEGCNT) THEN * The wanted eigenvalue lies to the left SGNDEF = -ONE ELSE * The wanted eigenvalue lies to the right SGNDEF = ONE ENDIF * We only use the RQCORR if it improves the * the iterate reasonably. IF( ( RQCORR*SGNDEF.GE.ZERO ) $ .AND.( LAMBDA + RQCORR.LE. RIGHT) $ .AND.( LAMBDA + RQCORR.GE. LEFT) $ ) THEN USEDRQ = .TRUE. * Store new midpoint of bisection interval in WORK IF(SGNDEF.EQ.ONE) THEN * The current LAMBDA is on the left of the true * eigenvalue LEFT = LAMBDA ELSE * The current LAMBDA is on the right of the true * eigenvalue RIGHT = LAMBDA ENDIF WORK( WINDEX ) = $ HALF * (RIGHT + LEFT) * Take RQCORR since it has the correct sign and * improves the iterate reasonably LAMBDA = LAMBDA + RQCORR * Update width of error interval WERR( WINDEX ) = $ HALF * (RIGHT-LEFT) ELSE NEEDBS = .TRUE. ENDIF IF(RIGHT-LEFT.LT.RQTOL*ABS(LAMBDA)) THEN * The eigenvalue is computed to bisection accuracy * compute eigenvector and stop USEDBS = .TRUE. GOTO 120 ELSEIF( ITER.LT.MAXITR ) THEN GOTO 120 ELSEIF( ITER.EQ.MAXITR ) THEN NEEDBS = .TRUE. GOTO 120 ELSE INFO = 5 RETURN END IF ELSE STP2II = .FALSE. IF(USEDRQ .AND. USEDBS .AND. $ BSTRES.LE.RESID) THEN LAMBDA = BSTW STP2II = .TRUE. ENDIF IF (STP2II) THEN CALL DLAR1VA( IN, 1, IN, LAMBDA, $ D( IBEGIN ), L( IBEGIN ), $ WORK(INDLD+IBEGIN-1), $ WORK(INDLLD+IBEGIN-1), $ PIVMIN, GAPTOL, $ Z( IBEGIN, ZINDEX ), $ .NOT.USEDBS, NEGCNT, ZTZ, MINGMA, $ ITWIST, $ ISUPPZ( 2*WINDEX-1 ), $ NRMINV, RESID, RQCORR, WORK( INDWRK ) ) ENDIF WORK( WINDEX ) = LAMBDA END IF * * Compute FP-vector support w.r.t. whole matrix * ISUPPZ( 2*WINDEX-1 ) = ISUPPZ( 2*WINDEX-1 )+OLDIEN ISUPPZ( 2*WINDEX ) = ISUPPZ( 2*WINDEX )+OLDIEN ZFROM = ISUPPZ( 2*WINDEX-1 ) ZTO = ISUPPZ( 2*WINDEX ) ISUPMN = ISUPMN + OLDIEN ISUPMX = ISUPMX + OLDIEN * Ensure vector is ok if support in the RQI has changed IF(ISUPMN.LT.ZFROM) THEN DO 122 II = ISUPMN,ZFROM-1 Z( II, ZINDEX ) = ZERO 122 CONTINUE ENDIF IF(ISUPMX.GT.ZTO) THEN DO 123 II = ZTO+1,ISUPMX Z( II, ZINDEX ) = ZERO 123 CONTINUE ENDIF CALL DSCAL( ZTO-ZFROM+1, NRMINV, $ Z( ZFROM, ZINDEX ), 1 ) 125 CONTINUE * Update W W( WINDEX ) = LAMBDA+SIGMA * Recompute the gaps on the left and right * But only allow them to become larger and not * smaller (which can only happen through "bad" * cancellation and doesn't reflect the theory * where the initial gaps are underestimated due * to WERR being too crude.) IF(.NOT.ESKIP) THEN IF( K.GT.1) THEN WGAP( WINDMN ) = MAX( WGAP(WINDMN), $ W(WINDEX)-WERR(WINDEX) $ - W(WINDMN)-WERR(WINDMN) ) ENDIF IF( WINDEX.LT.WEND ) THEN WGAP( WINDEX ) = MAX( SAVEGP, $ W( WINDPL )-WERR( WINDPL ) $ - W( WINDEX )-WERR( WINDEX) ) ENDIF ENDIF ENDIF * here ends the code for the current child * 139 CONTINUE * Proceed to any remaining child nodes NEWFST = J + 1 140 CONTINUE 150 CONTINUE * Store number of clusters IWORK( IINCLS + JBLK ) = NCLUS * END IF IBEGIN = IEND + 1 WBEGIN = WEND + 1 170 CONTINUE * * Check if everything is done: no clusters left for * this processor in any block * FINISH = .TRUE. DO 180 JBLK = 1, IBLOCK( M ) FINISH = FINISH .AND. (IWORK(IINCLS + JBLK).EQ.0) 180 CONTINUE IF(.NOT.FINISH) THEN NDEPTH = NDEPTH + 1 IF((NEEDIL.GE.DOL).AND.(NEEDIU.LE.DOU)) THEN * Once this processor's part of the * representation tree consists exclusively of eigenvalues * between DOL and DOU, it can work independently from all * others. GOTO 40 ENDIF ENDIF * RETURN * * End of DLARRV2 * END scalapack-2.0.2/SRC/dlasorte.f000644 000766 000024 00000010037 10602576752 016337 0ustar00juliestaff000000 000000 SUBROUTINE DLASORTE( S, LDS, J, OUT, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER INFO, J, LDS * .. * .. Array Arguments .. DOUBLE PRECISION OUT( J, * ), S( LDS, * ) * .. * * Purpose * ======= * * DLASORTE sorts eigenpairs so that real eigenpairs are together and * complex are together. This way one can employ 2x2 shifts easily * since every 2nd subdiagonal is guaranteed to be zero. * This routine does no parallel work. * * Arguments * ========= * * S (local input/output) DOUBLE PRECISION array, dimension LDS * On entry, a matrix already in Schur form. * On exit, the diagonal blocks of S have been rewritten to pair * the eigenvalues. The resulting matrix is no longer * similar to the input. * * LDS (local input) INTEGER * On entry, the leading dimension of the local array S. * Unchanged on exit. * * J (local input) INTEGER * On entry, the order of the matrix S. * Unchanged on exit. * * OUT (local input/output) DOUBLE PRECISION array, dimension Jx2 * This is the work buffer required by this routine. * * INFO (local input) INTEGER * This is set if the input matrix had an odd number of real * eigenvalues and things couldn't be paired or if the input * matrix S was not originally in Schur form. * 0 indicates successful completion. * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER BOT, I, LAST, TOP * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * LAST = J TOP = 1 BOT = J INFO = 0 DO 10 I = J - 1, 1, -1 IF( S( I+1, I ).EQ.ZERO ) THEN IF( LAST-I.EQ.2 ) THEN OUT( BOT-1, 1 ) = S( I+1, I+1 ) OUT( BOT, 2 ) = S( I+2, I+2 ) OUT( BOT-1, 2 ) = S( I+1, I+2 ) OUT( BOT, 1 ) = S( I+2, I+1 ) BOT = BOT - 2 END IF IF( LAST-I.EQ.1 ) THEN IF( MOD( TOP, 2 ).EQ.1 ) THEN * * FIRST OF A PAIR * IF( ( I.EQ.J-1 ) .OR. ( I.EQ.1 ) ) THEN OUT( TOP, 1 ) = S( I+1, I+1 ) ELSE OUT( TOP, 1 ) = S( I+1, I+1 ) END IF OUT( TOP, 2 ) = ZERO ELSE * * SECOND OF A PAIR * IF( ( I.EQ.J-1 ) .OR. ( I.EQ.1 ) ) THEN OUT( TOP, 2 ) = S( I+1, I+1 ) ELSE OUT( TOP, 2 ) = S( I+1, I+1 ) END IF OUT( TOP, 1 ) = ZERO END IF TOP = TOP + 1 END IF IF( LAST-I.GT.2 ) THEN INFO = I RETURN END IF LAST = I END IF 10 CONTINUE IF( LAST.EQ.2 ) THEN * * GRAB LAST DOUBLE PAIR * OUT( BOT-1, 1 ) = S( 1, 1 ) OUT( BOT, 2 ) = S( 2, 2 ) OUT( BOT-1, 2 ) = S( 1, 2 ) OUT( BOT, 1 ) = S( 2, 1 ) BOT = BOT - 2 END IF IF( LAST.EQ.1 .and. mod(top, 2) .eq. 0 ) THEN * * GRAB SECOND PART OF LAST PAIR * OUT(TOP, 2) = s(1,1) OUT(TOP, 1) = zero TOP = TOP + 1 END IF IF( TOP-1.NE.BOT ) THEN INFO = -BOT RETURN END IF * * Overwrite the S diagonals * DO 20 I = 1, J, 2 S( I, I ) = OUT( I, 1 ) S( I+1, I ) = OUT( I+1, 1 ) S( I, I+1 ) = OUT( I, 2 ) S( I+1, I+1 ) = OUT( I+1, 2 ) 20 CONTINUE * RETURN * * End of DLASORTE * END scalapack-2.0.2/SRC/dlasrt2.f000644 000766 000024 00000016143 10363532303 016065 0ustar00juliestaff000000 000000 * * SUBROUTINE DLASRT2( ID, N, D, KEY, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER ID INTEGER INFO, N * .. * .. Array Arguments .. INTEGER KEY( * ) DOUBLE PRECISION D( * ) * .. * * Purpose * ======= * * Sort the numbers in D in increasing order (if ID = 'I') or * in decreasing order (if ID = 'D' ). * * Use Quick Sort, reverting to Insertion sort on arrays of * size <= 20. Dimension of STACK limits N to about 2**32. * * Arguments * ========= * * ID (input) CHARACTER*1 * = 'I': sort D in increasing order; * = 'D': sort D in decreasing order. * * N (input) INTEGER * The length of the array D. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the array to be sorted. * On exit, D has been sorted into increasing order * (D(1) <= ... <= D(N) ) or into decreasing order * (D(1) >= ... >= D(N) ), depending on ID. * * KEY (input/output) INTEGER array, dimension (N) * On entry, KEY contains a key to each of the entries in D() * Typically, KEY(I) = I for all I * On exit, KEY is permuted in exactly the same manner as * D() was permuted from input to output * Therefore, if KEY(I) = I for all I upon input, then * D_out(I) = D_in(KEY(I)) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. INTEGER SELECT PARAMETER ( SELECT = 20 ) * .. * .. Local Scalars .. INTEGER DIR, ENDD, I, J, START, STKPNT, TMPKEY DOUBLE PRECISION D1, D2, D3, DMNMX, TMP * .. * .. Local Arrays .. INTEGER STACK( 2, 32 ) * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input paramters. * * INFO = 0 DIR = -1 IF( LSAME( ID, 'D' ) ) THEN DIR = 0 ELSE IF( LSAME( ID, 'I' ) ) THEN DIR = 1 END IF IF( DIR.EQ.-1 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLASRT2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.LE.1 ) $ RETURN * STKPNT = 1 STACK( 1, 1 ) = 1 STACK( 2, 1 ) = N 10 CONTINUE START = STACK( 1, STKPNT ) ENDD = STACK( 2, STKPNT ) STKPNT = STKPNT - 1 IF( ENDD-START.GT.0 ) THEN * * Do Insertion sort on D( START:ENDD ) * IF( DIR.EQ.0 ) THEN * * Sort into decreasing order * DO 30 I = START + 1, ENDD DO 20 J = I, START + 1, -1 IF( D( J ).GT.D( J-1 ) ) THEN DMNMX = D( J ) D( J ) = D( J-1 ) D( J-1 ) = DMNMX TMPKEY = KEY( J ) KEY( J ) = KEY( J-1 ) KEY( J-1 ) = TMPKEY ELSE GO TO 30 END IF 20 CONTINUE 30 CONTINUE * ELSE * * Sort into increasing order * DO 50 I = START + 1, ENDD DO 40 J = I, START + 1, -1 IF( D( J ).LT.D( J-1 ) ) THEN DMNMX = D( J ) D( J ) = D( J-1 ) D( J-1 ) = DMNMX TMPKEY = KEY( J ) KEY( J ) = KEY( J-1 ) KEY( J-1 ) = TMPKEY ELSE GO TO 50 END IF 40 CONTINUE 50 CONTINUE * END IF * ELSE IF( ENDD-START.GT.SELECT ) THEN * * Partition D( START:ENDD ) and stack parts, largest one first * * Choose partition entry as median of 3 * D1 = D( START ) D2 = D( ENDD ) I = ( START+ENDD ) / 2 D3 = D( I ) IF( D1.LT.D2 ) THEN IF( D3.LT.D1 ) THEN DMNMX = D1 ELSE IF( D3.LT.D2 ) THEN DMNMX = D3 ELSE DMNMX = D2 END IF ELSE IF( D3.LT.D2 ) THEN DMNMX = D2 ELSE IF( D3.LT.D1 ) THEN DMNMX = D3 ELSE DMNMX = D1 END IF END IF * IF( DIR.EQ.0 ) THEN * * Sort into decreasing order * I = START - 1 J = ENDD + 1 60 CONTINUE 70 CONTINUE J = J - 1 IF( D( J ).LT.DMNMX ) $ GO TO 70 80 CONTINUE I = I + 1 IF( D( I ).GT.DMNMX ) $ GO TO 80 IF( I.LT.J ) THEN TMP = D( I ) D( I ) = D( J ) D( J ) = TMP TMPKEY = KEY( J ) KEY( J ) = KEY( I ) KEY( I ) = TMPKEY GO TO 60 END IF IF( J-START.GT.ENDD-J-1 ) THEN STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD ELSE STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J END IF ELSE * * Sort into increasing order * I = START - 1 J = ENDD + 1 90 CONTINUE 100 CONTINUE J = J - 1 IF( D( J ).GT.DMNMX ) $ GO TO 100 110 CONTINUE I = I + 1 IF( D( I ).LT.DMNMX ) $ GO TO 110 IF( I.LT.J ) THEN TMP = D( I ) D( I ) = D( J ) D( J ) = TMP TMPKEY = KEY( J ) KEY( J ) = KEY( I ) KEY( I ) = TMPKEY GO TO 90 END IF IF( J-START.GT.ENDD-J-1 ) THEN STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD ELSE STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J END IF END IF END IF IF( STKPNT.GT.0 ) $ GO TO 10 * * RETURN * * End of DLASRT2 * END scalapack-2.0.2/SRC/dpttrsv.f000644 000766 000024 00000007113 11657111056 016222 0ustar00juliestaff000000 000000 SUBROUTINE DPTTRSV( TRANS, N, NRHS, D, E, B, LDB, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * November, 1996. * Modified from DPTTRS: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. CHARACTER TRANS INTEGER INFO, LDB, N, NRHS * .. * .. Array Arguments .. DOUBLE PRECISION D( * ) DOUBLE PRECISION B( LDB, * ), E( * ) * .. * * Purpose * ======= * * DPTTRSV solves one of the triangular systems * L**T* X = B, or L * X = B, * where L is the Cholesky factor of a Hermitian positive * definite tridiagonal matrix A such that * A = L*D*L**H (computed by DPTTRF). * * Arguments * ========= * * TRANS (input) CHARACTER * Specifies the form of the system of equations: * = 'N': L * X = B (No transpose) * = 'T': L**T * X = B (Transpose) * * N (input) INTEGER * The order of the tridiagonal matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * D (input) REAL array, dimension (N) * The n diagonal elements of the diagonal matrix D from the * factorization computed by DPTTRF. * * E (input) COMPLEX array, dimension (N-1) * The (n-1) off-diagonal elements of the unit bidiagonal * factor U or L from the factorization computed by DPTTRF * (see UPLO). * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Local Scalars .. LOGICAL NOTRAN INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input arguments. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DPTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN IF( NOTRAN ) THEN * DO 60 J = 1, NRHS * * Solve L * x = b. * DO 40 I = 2, N B( I, J ) = B( I, J ) - B( I-1, J )*E( I-1 ) 40 CONTINUE 60 CONTINUE * ELSE * DO 65 J = 1, NRHS * * Solve L**H * x = b. * DO 50 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - $ B( I+1, J )*( E( I ) ) 50 CONTINUE 65 CONTINUE ENDIF * RETURN * * End of DPTTRS * END scalapack-2.0.2/SRC/dstegr2.f000644 000766 000024 00000044472 11657111056 016077 0ustar00juliestaff000000 000000 SUBROUTINE DSTEGR2( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, $ M, W, Z, LDZ, NZC, ISUPPZ, WORK, LWORK, IWORK, $ LIWORK, DOL, DOU, ZOFFSET, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE INTEGER DOL, DOU, IL, INFO, IU, $ LDZ, NZC, LIWORK, LWORK, M, N, ZOFFSET DOUBLE PRECISION VL, VU * .. * .. Array Arguments .. INTEGER ISUPPZ( * ), IWORK( * ) DOUBLE PRECISION D( * ), E( * ), W( * ), WORK( * ) DOUBLE PRECISION Z( LDZ, * ) * .. * * Purpose * ======= * * DSTEGR2 computes selected eigenvalues and, optionally, eigenvectors * of a real symmetric tridiagonal matrix T. It is invoked in the * ScaLAPACK MRRR driver PDSYEVR and the corresponding Hermitian * version either when only eigenvalues are to be computed, or when only * a single processor is used (the sequential-like case). * * DSTEGR2 has been adapted from LAPACK's DSTEGR. Please note the * following crucial changes. * * 1. The calling sequence has two additional INTEGER parameters, * DOL and DOU, that should satisfy M>=DOU>=DOL>=1. * DSTEGR2 ONLY computes the eigenpairs * corresponding to eigenvalues DOL through DOU in W. (That is, * instead of computing the eigenpairs belonging to W(1) * through W(M), only the eigenvectors belonging to eigenvalues * W(DOL) through W(DOU) are computed. In this case, only the * eigenvalues DOL:DOU are guaranteed to be fully accurate. * * 2. M is NOT the number of eigenvalues specified by RANGE, but is * M = DOU - DOL + 1. This concerns the case where only eigenvalues * are computed, but on more than one processor. Thus, in this case * M refers to the number of eigenvalues computed on this processor. * * 3. The arrays W and Z might not contain all the wanted eigenpairs * locally, instead this information is distributed over other * processors. * * Arguments * ========= * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the half-open interval (VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the N diagonal elements of the tridiagonal matrix * T. On exit, D is overwritten. * * E (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the (N-1) subdiagonal elements of the tridiagonal * matrix T in elements 1 to N-1 of E. E(N) need not be set on * input, but is used internally as workspace. * On exit, E is overwritten. * * VL (input) DOUBLE PRECISION * VU (input) DOUBLE PRECISION * If RANGE='V', the lower and upper bounds of the interval to * be searched for eigenvalues. VL < VU. * Not referenced if RANGE = 'A' or 'I'. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N, if N > 0. * Not referenced if RANGE = 'A' or 'V'. * * M (output) INTEGER * Globally summed over all processors, M equals * the total number of eigenvalues found. 0 <= M <= N. * If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. * The local output equals M = DOU - DOL + 1. * * W (output) DOUBLE PRECISION array, dimension (N) * The first M elements contain the selected eigenvalues in * ascending order. Note that immediately after exiting this * routine, only the eigenvalues from * position DOL:DOU are to reliable on this processor * because the eigenvalue computation is done in parallel. * Other processors will hold reliable information on other * parts of the W array. This information is communicated in * the ScaLAPACK driver. * * Z (output) DOUBLE PRECISION array, dimension (LDZ, max(1,M) ) * If JOBZ = 'V', and if INFO = 0, then the first M columns of Z * contain some of the orthonormal eigenvectors of the matrix T * corresponding to the selected eigenvalues, with the i-th * column of Z holding the eigenvector associated with W(i). * If JOBZ = 'N', then Z is not referenced. * Note: the user must ensure that at least max(1,M) columns are * supplied in the array Z; if RANGE = 'V', the exact value of M * is not known in advance and can be computed with a workspace * query by setting NZC = -1, see below. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * JOBZ = 'V', then LDZ >= max(1,N). * * NZC (input) INTEGER * The number of eigenvectors to be held in the array Z. * If RANGE = 'A', then NZC >= max(1,N). * If RANGE = 'V', then NZC >= the number of eigenvalues in (VL,VU]. * If RANGE = 'I', then NZC >= IU-IL+1. * If NZC = -1, then a workspace query is assumed; the * routine calculates the number of columns of the array Z that * are needed to hold the eigenvectors. * This value is returned as the first entry of the Z array, and * no error message related to NZC is issued. * * ISUPPZ (output) INTEGER ARRAY, dimension ( 2*max(1,M) ) * The support of the eigenvectors in Z, i.e., the indices * indicating the nonzero elements in Z. The i-th computed eigenvector * is nonzero only in elements ISUPPZ( 2*i-1 ) through * ISUPPZ( 2*i ). This is relevant in the case when the matrix * is split. ISUPPZ is only set if N>2. * * WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal * (and minimal) LWORK. * * LWORK (input) INTEGER * The dimension of the array WORK. LWORK >= max(1,18*N) * if JOBZ = 'V', and LWORK >= max(1,12*N) if JOBZ = 'N'. * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued. * * IWORK (workspace/output) INTEGER array, dimension (LIWORK) * On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. LIWORK >= max(1,10*N) * if the eigenvectors are desired, and LIWORK >= max(1,8*N) * if only the eigenvalues are to be computed. * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued. * * DOL (input) INTEGER * DOU (input) INTEGER * From the eigenvalues W(1:M), only eigenvectors * Z(:,DOL) to Z(:,DOU) are computed. * If DOL > 1, then Z(:,DOL-1-ZOFFSET) is used and overwritten. * If DOU < M, then Z(:,DOU+1-ZOFFSET) is used and overwritten. * * ZOFFSET (input) INTEGER * Offset for storing the eigenpairs when Z is distributed * in 1D-cyclic fashion * * INFO (output) INTEGER * On exit, INFO * = 0: successful exit * other:if INFO = -i, the i-th argument had an illegal value * if INFO = 10X, internal error in DLARRE2, * if INFO = 20X, internal error in DLARRV. * Here, the digit X = ABS( IINFO ) < 10, where IINFO is * the nonzero error code returned by DLARRE2 or * DLARRV, respectively. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, FOUR, MINRGP PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, $ FOUR = 4.0D0, $ MINRGP = 1.0D-3 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY INTEGER I, IIL, IINDBL, IINDW, IINDWK, IINFO, IINSPL, $ IIU, INDE2, INDERR, INDGP, INDGRS, INDWRK, $ ITMP, ITMP2, J, JJ, LIWMIN, LWMIN, NSPLIT, $ NZCMIN DOUBLE PRECISION BIGNUM, EPS, PIVMIN, RMAX, RMIN, RTOL1, RTOL2, $ SAFMIN, SCALE, SMLNUM, THRESH, TMP, TNRM, WL, $ WU * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANST EXTERNAL LSAME, DLAMCH, DLANST * .. * .. External Subroutines .. EXTERNAL DCOPY, DLAE2, DLAEV2, DLARRC, DLARRE2, $ DLARRV, DLASRT, DSCAL, DSWAP * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * WANTZ = LSAME( JOBZ, 'V' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) * DSTEGR2 needs WORK of size 6*N, IWORK of size 3*N. * In addition, DLARRE2 needs WORK of size 6*N, IWORK of size 5*N. * Furthermore, DLARRV needs WORK of size 12*N, IWORK of size 7*N. IF( WANTZ ) THEN LWMIN = 18*N LIWMIN = 10*N ELSE * need less workspace if only the eigenvalues are wanted LWMIN = 12*N LIWMIN = 8*N ENDIF WL = ZERO WU = ZERO IIL = 0 IIU = 0 IF( VALEIG ) THEN * We do not reference VL, VU in the cases RANGE = 'I','A' * The interval (WL, WU] contains all the wanted eigenvalues. * It is either given by the user or computed in DLARRE2. WL = VL WU = VU ELSEIF( INDEIG ) THEN * We do not reference IL, IU in the cases RANGE = 'V','A' IIL = IL IIU = IU ENDIF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( VALEIG .AND. N.GT.0 .AND. WU.LE.WL ) THEN INFO = -7 ELSE IF( INDEIG .AND. ( IIL.LT.1 .OR. IIL.GT.N ) ) THEN INFO = -8 ELSE IF( INDEIG .AND. ( IIU.LT.IIL .OR. IIU.GT.N ) ) THEN INFO = -9 ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN INFO = -13 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * * Get machine constants. * SAFMIN = DLAMCH( 'Safe minimum' ) EPS = DLAMCH( 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * IF( INFO.EQ.0 ) THEN WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * IF( WANTZ .AND. ALLEIG ) THEN NZCMIN = N IIL = 1 IIU = N ELSE IF( WANTZ .AND. VALEIG ) THEN CALL DLARRC( 'T', N, VL, VU, D, E, SAFMIN, $ NZCMIN, ITMP, ITMP2, INFO ) IIL = ITMP+1 IIU = ITMP2 ELSE IF( WANTZ .AND. INDEIG ) THEN NZCMIN = IIU-IIL+1 ELSE * WANTZ .EQ. FALSE. NZCMIN = 0 ENDIF IF( ZQUERY .AND. INFO.EQ.0 ) THEN Z( 1,1 ) = NZCMIN ELSE IF( NZC.LT.NZCMIN .AND. .NOT.ZQUERY ) THEN INFO = -14 END IF END IF IF ( WANTZ ) THEN IF ( DOL.LT.1 .OR. DOL.GT.NZCMIN ) THEN INFO = -20 ENDIF IF ( DOU.LT.1 .OR. DOU.GT.NZCMIN .OR. DOU.LT.DOL) THEN INFO = -21 ENDIF ENDIF IF( INFO.NE.0 ) THEN * C Disable sequential error handler C for parallel case C CALL XERBLA( 'DSTEGR2', -INFO ) * RETURN ELSE IF( LQUERY .OR. ZQUERY ) THEN RETURN END IF * * Quick return if possible * M = 0 IF( N.EQ.0 ) $ RETURN * IF( N.EQ.1 ) THEN IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = D( 1 ) ELSE IF( WL.LT.D( 1 ) .AND. WU.GE.D( 1 ) ) THEN M = 1 W( 1 ) = D( 1 ) END IF END IF IF( WANTZ ) $ Z( 1, 1 ) = ONE RETURN END IF * INDGRS = 1 INDERR = 2*N + 1 INDGP = 3*N + 1 INDE2 = 5*N + 1 INDWRK = 6*N + 1 * IINSPL = 1 IINDBL = N + 1 IINDW = 2*N + 1 IINDWK = 3*N + 1 * * Scale matrix to allowable range, if necessary. * SCALE = ONE TNRM = DLANST( 'M', N, D, E ) IF( TNRM.GT.ZERO .AND. TNRM.LT.RMIN ) THEN SCALE = RMIN / TNRM ELSE IF( TNRM.GT.RMAX ) THEN SCALE = RMAX / TNRM END IF IF( SCALE.NE.ONE ) THEN CALL DSCAL( N, SCALE, D, 1 ) CALL DSCAL( N-1, SCALE, E, 1 ) TNRM = TNRM*SCALE IF( VALEIG ) THEN * If eigenvalues in interval have to be found, * scale (WL, WU] accordingly WL = WL*SCALE WU = WU*SCALE ENDIF END IF * * Compute the desired eigenvalues of the tridiagonal after splitting * into smaller subblocks if the corresponding off-diagonal elements * are small * THRESH is the splitting parameter for DLARRE2 * A negative THRESH forces the old splitting criterion based on the * size of the off-diagonal. A positive THRESH switches to splitting * which preserves relative accuracy. * IINFO = -1 * Set the splitting criterion IF (IINFO.EQ.0) THEN THRESH = EPS ELSE THRESH = -EPS ENDIF * * Store the squares of the offdiagonal values of T DO 5 J = 1, N-1 WORK( INDE2+J-1 ) = E(J)**2 5 CONTINUE * Set the tolerance parameters for bisection IF( .NOT.WANTZ ) THEN * DLARRE2 computes the eigenvalues to full precision. RTOL1 = FOUR * EPS RTOL2 = FOUR * EPS ELSE * DLARRE2 computes the eigenvalues to less than full precision. * DLARRV will refine the eigenvalue approximations, and we can * need less accurate initial bisection in DLARRE2. * Note: these settings do only affect the subset case and DLARRE2 RTOL1 = SQRT(EPS) RTOL2 = MAX( SQRT(EPS)*5.0D-3, FOUR * EPS ) ENDIF CALL DLARRE2( RANGE, N, WL, WU, IIL, IIU, D, E, $ WORK(INDE2), RTOL1, RTOL2, THRESH, NSPLIT, $ IWORK( IINSPL ), M, DOL, DOU, $ W, WORK( INDERR ), $ WORK( INDGP ), IWORK( IINDBL ), $ IWORK( IINDW ), WORK( INDGRS ), PIVMIN, $ WORK( INDWRK ), IWORK( IINDWK ), IINFO ) IF( IINFO.NE.0 ) THEN INFO = 100 + ABS( IINFO ) RETURN END IF * Note that if RANGE .NE. 'V', DLARRE2 computes bounds on the desired * part of the spectrum. All desired eigenvalues are contained in * (WL,WU] IF( WANTZ ) THEN * * Compute the desired eigenvectors corresponding to the computed * eigenvalues * CALL DLARRV( N, WL, WU, D, E, $ PIVMIN, IWORK( IINSPL ), M, $ DOL, DOU, MINRGP, RTOL1, RTOL2, $ W, WORK( INDERR ), WORK( INDGP ), IWORK( IINDBL ), $ IWORK( IINDW ), WORK( INDGRS ), Z, LDZ, $ ISUPPZ, WORK( INDWRK ), IWORK( IINDWK ), IINFO ) IF( IINFO.NE.0 ) THEN INFO = 200 + ABS( IINFO ) RETURN END IF ELSE * DLARRE2 computes eigenvalues of the (shifted) root representation * DLARRV returns the eigenvalues of the unshifted matrix. * However, if the eigenvectors are not desired by the user, we need * to apply the corresponding shifts from DLARRE2 to obtain the * eigenvalues of the original matrix. DO 20 J = 1, M ITMP = IWORK( IINDBL+J-1 ) W( J ) = W( J ) + E( IWORK( IINSPL+ITMP-1 ) ) 20 CONTINUE END IF * * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( SCALE.NE.ONE ) THEN CALL DSCAL( M, ONE / SCALE, W, 1 ) END IF * * Correct M if needed * IF ( WANTZ ) THEN IF( DOL.NE.1 .OR. DOU.NE.M ) THEN M = DOU - DOL +1 ENDIF ENDIF * * If eigenvalues are not in increasing order, then sort them, * possibly along with eigenvectors. * IF( NSPLIT.GT.1 ) THEN IF( .NOT. WANTZ ) THEN CALL DLASRT( 'I', DOU - DOL +1, W(DOL), IINFO ) IF( IINFO.NE.0 ) THEN INFO = 3 RETURN END IF ELSE DO 60 J = DOL, DOU - 1 I = 0 TMP = W( J ) DO 50 JJ = J + 1, M IF( W( JJ ).LT.TMP ) THEN I = JJ TMP = W( JJ ) END IF 50 CONTINUE IF( I.NE.0 ) THEN W( I ) = W( J ) W( J ) = TMP IF( WANTZ ) THEN CALL DSWAP( N, Z( 1, I-ZOFFSET ), $ 1, Z( 1, J-ZOFFSET ), 1 ) ITMP = ISUPPZ( 2*I-1 ) ISUPPZ( 2*I-1 ) = ISUPPZ( 2*J-1 ) ISUPPZ( 2*J-1 ) = ITMP ITMP = ISUPPZ( 2*I ) ISUPPZ( 2*I ) = ISUPPZ( 2*J ) ISUPPZ( 2*J ) = ITMP END IF END IF 60 CONTINUE END IF ENDIF * WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN RETURN * * End of DSTEGR2 * END scalapack-2.0.2/SRC/dstegr2a.f000644 000766 000024 00000041354 11657111056 016234 0ustar00juliestaff000000 000000 SUBROUTINE DSTEGR2A( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, $ M, W, Z, LDZ, NZC, WORK, LWORK, IWORK, $ LIWORK, DOL, DOU, NEEDIL, NEEDIU, $ INDERR, NSPLIT, PIVMIN, SCALE, WL, WU, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE INTEGER DOL, DOU, IL, INDERR, INFO, IU, LDZ, LIWORK, $ LWORK, M, N, NEEDIL, NEEDIU, NSPLIT, NZC DOUBLE PRECISION PIVMIN, SCALE, VL, VU, WL, WU * .. * .. Array Arguments .. INTEGER IWORK( * ) DOUBLE PRECISION D( * ), E( * ), W( * ), WORK( * ) DOUBLE PRECISION Z( LDZ, * ) * .. * * Purpose * ======= * * DSTEGR2A computes selected eigenvalues and initial representations. * needed for eigenvector computations in DSTEGR2B. It is invoked in the * ScaLAPACK MRRR driver PDSYEVR and the corresponding Hermitian * version when both eigenvalues and eigenvectors are computed in parallel. * on multiple processors. For this case, DSTEGR2A implements the FIRST * part of the MRRR algorithm, parallel eigenvalue computation and finding * the root RRR. At the end of DSTEGR2A, * other processors might have a part of the spectrum that is needed to * continue the computation locally. Once this eigenvalue information has * been received by the processor, the computation can then proceed by calling * the SECOND part of the parallel MRRR algorithm, DSTEGR2B. * * Please note: * 1. The calling sequence has two additional INTEGER parameters, * (compared to LAPACK's DSTEGR), these are * DOL and DOU and should satisfy M>=DOU>=DOL>=1. * These parameters are only relevant for the case JOBZ = 'V'. * * Globally invoked over all processors, DSTEGR2A computes * ALL the eigenVALUES specified by RANGE. * RANGE= 'A': all eigenvalues will be found. * = 'V': all eigenvalues in (VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * DSTEGR2A LOCALLY only computes the eigenvalues * corresponding to eigenvalues DOL through DOU in W. (That is, * instead of computing the eigenvectors belonging to W(1) * through W(M), only the eigenvectors belonging to eigenvalues * W(DOL) through W(DOU) are computed. In this case, only the * eigenvalues DOL:DOU are guaranteed to be fully accurate. * * 2. M is NOT the number of eigenvalues specified by RANGE, but it is * M = DOU - DOL + 1. Instead, M refers to the number of eigenvalues computed on * this processor. * * 3. While no eigenvectors are computed in DSTEGR2A itself (this is * done later in DSTEGR2B), the interface * If JOBZ = 'V' then, depending on RANGE and DOL, DOU, DSTEGR2A * might need more workspace in Z then the original DSTEGR. * In particular, the arrays W and Z might not contain all the wanted eigenpairs * locally, instead this information is distributed over other * processors. * * Arguments * ========= * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the half-open interval (VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the N diagonal elements of the tridiagonal matrix * T. On exit, D is overwritten. * * E (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the (N-1) subdiagonal elements of the tridiagonal * matrix T in elements 1 to N-1 of E. E(N) need not be set on * input, but is used internally as workspace. * On exit, E is overwritten. * * VL (input) DOUBLE PRECISION * VU (input) DOUBLE PRECISION * If RANGE='V', the lower and upper bounds of the interval to * be searched for eigenvalues. VL < VU. * Not referenced if RANGE = 'A' or 'I'. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N, if N > 0. * Not referenced if RANGE = 'A' or 'V'. * * M (output) INTEGER * Globally summed over all processors, M equals * the total number of eigenvalues found. 0 <= M <= N. * If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. * The local output equals M = DOU - DOL + 1. * * W (output) DOUBLE PRECISION array, dimension (N) * The first M elements contain approximations to the selected * eigenvalues in ascending order. Note that immediately after * exiting this routine, only the eigenvalues from * position DOL:DOU are to reliable on this processor * because the eigenvalue computation is done in parallel. * The other entries outside DOL:DOU are very crude preliminary * approximations. Other processors hold reliable information on * these other parts of the W array. * This information is communicated in the ScaLAPACK driver. * * Z (output) DOUBLE PRECISION array, dimension (LDZ, max(1,M) ) * DSTEGR2A does not compute eigenvectors, this is done * in DSTEGR2B. The argument Z as well as all related * other arguments only appear to keep the interface consistent * and to signal to the user that this subroutine is meant to * be used when eigenvectors are computed. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * JOBZ = 'V', then LDZ >= max(1,N). * * NZC (input) INTEGER * The number of eigenvectors to be held in the array Z. * If RANGE = 'A', then NZC >= max(1,N). * If RANGE = 'V', then NZC >= the number of eigenvalues in (VL,VU]. * If RANGE = 'I', then NZC >= IU-IL+1. * If NZC = -1, then a workspace query is assumed; the * routine calculates the number of columns of the array Z that * are needed to hold the eigenvectors. * This value is returned as the first entry of the Z array, and * no error message related to NZC is issued. * * WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal * (and minimal) LWORK. * * LWORK (input) INTEGER * The dimension of the array WORK. LWORK >= max(1,18*N) * if JOBZ = 'V', and LWORK >= max(1,12*N) if JOBZ = 'N'. * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued. * * IWORK (workspace/output) INTEGER array, dimension (LIWORK) * On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. LIWORK >= max(1,10*N) * if the eigenvectors are desired, and LIWORK >= max(1,8*N) * if only the eigenvalues are to be computed. * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued. * * DOL (input) INTEGER * DOU (input) INTEGER * From all the eigenvalues W(1:M), only eigenvalues * W(DOL:DOU) are computed. * * NEEDIL (output) INTEGER * NEEDIU (output) INTEGER * The indices of the leftmost and rightmost eigenvalues * needed to accurately compute the relevant part of the * representation tree. This information can be used to * find out which processors have the relevant eigenvalue * information needed so that it can be communicated. * * INDERR (output) INTEGER * INDERR points to the place in the work space where * the eigenvalue uncertainties (errors) are stored. * * NSPLIT (output) INTEGER * The number of blocks T splits into. 1 <= NSPLIT <= N. * * PIVMIN (output) DOUBLE PRECISION * The minimum pivot in the sturm sequence for T. * * SCALE (output) DOUBLE PRECISION * The scaling factor for the tridiagonal T. * * WL (output) DOUBLE PRECISION * WU (output) DOUBLE PRECISION * The interval (WL, WU] contains all the wanted eigenvalues. * It is either given by the user or computed in DLARRE2A. * * INFO (output) INTEGER * On exit, INFO * = 0: successful exit * other:if INFO = -i, the i-th argument had an illegal value * if INFO = 10X, internal error in DLARRE2A, * Here, the digit X = ABS( IINFO ) < 10, where IINFO is * the nonzero error code returned by DLARRE2A. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, FOUR, MINRGP PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, $ FOUR = 4.0D0, $ MINRGP = 1.0D-3 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY INTEGER IIL, IINDBL, IINDW, IINDWK, IINFO, IINSPL, IIU, $ INDE2, INDGP, INDGRS, INDSDM, INDWRK, ITMP, $ ITMP2, J, LIWMIN, LWMIN, NZCMIN DOUBLE PRECISION BIGNUM, EPS, RMAX, RMIN, RTOL1, RTOL2, SAFMIN, $ SMLNUM, THRESH, TNRM * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANST EXTERNAL LSAME, DLAMCH, DLANST * .. * .. External Subroutines .. EXTERNAL DLARRC, DLARRE2A, DSCAL * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * WANTZ = LSAME( JOBZ, 'V' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) * DSTEGR2A needs WORK of size 6*N, IWORK of size 3*N. * In addition, DLARRE2A needs WORK of size 6*N, IWORK of size 5*N. * Furthermore, DLARRV2 needs WORK of size 12*N, IWORK of size 7*N. * Workspace is kept consistent with DSTEGR2B even though * DLARRV2 is not called here. IF( WANTZ ) THEN LWMIN = 18*N LIWMIN = 10*N ELSE * need less workspace if only the eigenvalues are wanted LWMIN = 12*N LIWMIN = 8*N ENDIF WL = ZERO WU = ZERO IIL = 0 IIU = 0 IF( VALEIG ) THEN * We do not reference VL, VU in the cases RANGE = 'I','A' * The interval (WL, WU] contains all the wanted eigenvalues. * It is either given by the user or computed in DLARRE2A. WL = VL WU = VU ELSEIF( INDEIG ) THEN * We do not reference IL, IU in the cases RANGE = 'V','A' IIL = IL IIU = IU ENDIF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( VALEIG .AND. N.GT.0 .AND. WU.LE.WL ) THEN INFO = -7 ELSE IF( INDEIG .AND. ( IIL.LT.1 .OR. IIL.GT.N ) ) THEN INFO = -8 ELSE IF( INDEIG .AND. ( IIU.LT.IIL .OR. IIU.GT.N ) ) THEN INFO = -9 ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN INFO = -13 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * * Get machine constants. * SAFMIN = DLAMCH( 'Safe minimum' ) EPS = DLAMCH( 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * IF( INFO.EQ.0 ) THEN WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * IF( WANTZ .AND. ALLEIG ) THEN NZCMIN = N IIL = 1 IIU = N ELSE IF( WANTZ .AND. VALEIG ) THEN CALL DLARRC( 'T', N, VL, VU, D, E, SAFMIN, $ NZCMIN, ITMP, ITMP2, INFO ) IIL = ITMP+1 IIU = ITMP2 ELSE IF( WANTZ .AND. INDEIG ) THEN NZCMIN = IIU-IIL+1 ELSE * WANTZ .EQ. FALSE. NZCMIN = 0 ENDIF IF( ZQUERY .AND. INFO.EQ.0 ) THEN Z( 1,1 ) = NZCMIN ELSE IF( NZC.LT.NZCMIN .AND. .NOT.ZQUERY ) THEN INFO = -14 END IF END IF IF ( WANTZ ) THEN IF ( DOL.LT.1 .OR. DOL.GT.NZCMIN ) THEN INFO = -20 ENDIF IF ( DOU.LT.1 .OR. DOU.GT.NZCMIN .OR. DOU.LT.DOL) THEN INFO = -21 ENDIF ENDIF IF( INFO.NE.0 ) THEN * C Disable sequential error handler C for parallel case C CALL XERBLA( 'DSTEGR2A', -INFO ) * RETURN ELSE IF( LQUERY .OR. ZQUERY ) THEN RETURN END IF * Initialize NEEDIL and NEEDIU, these values are changed in DLARRE2A NEEDIL = DOU NEEDIU = DOL * * Quick return if possible * M = 0 IF( N.EQ.0 ) $ RETURN * IF( N.EQ.1 ) THEN IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = D( 1 ) ELSE IF( WL.LT.D( 1 ) .AND. WU.GE.D( 1 ) ) THEN M = 1 W( 1 ) = D( 1 ) END IF END IF IF( WANTZ ) $ Z( 1, 1 ) = ONE RETURN END IF * INDGRS = 1 INDERR = 2*N + 1 INDGP = 3*N + 1 INDSDM = 4*N + 1 INDE2 = 5*N + 1 INDWRK = 6*N + 1 * IINSPL = 1 IINDBL = N + 1 IINDW = 2*N + 1 IINDWK = 3*N + 1 * * Scale matrix to allowable range, if necessary. * SCALE = ONE TNRM = DLANST( 'M', N, D, E ) IF( TNRM.GT.ZERO .AND. TNRM.LT.RMIN ) THEN SCALE = RMIN / TNRM ELSE IF( TNRM.GT.RMAX ) THEN SCALE = RMAX / TNRM END IF IF( SCALE.NE.ONE ) THEN CALL DSCAL( N, SCALE, D, 1 ) CALL DSCAL( N-1, SCALE, E, 1 ) TNRM = TNRM*SCALE IF( VALEIG ) THEN * If eigenvalues in interval have to be found, * scale (WL, WU] accordingly WL = WL*SCALE WU = WU*SCALE ENDIF END IF * * Compute the desired eigenvalues of the tridiagonal after splitting * into smaller subblocks if the corresponding off-diagonal elements * are small * THRESH is the splitting parameter for DLARRA in DLARRE2A * A negative THRESH forces the old splitting criterion based on the * size of the off-diagonal. THRESH = -EPS IINFO = 0 * Store the squares of the offdiagonal values of T DO 5 J = 1, N-1 WORK( INDE2+J-1 ) = E(J)**2 5 CONTINUE * Set the tolerance parameters for bisection IF( .NOT.WANTZ ) THEN * DLARRE2A computes the eigenvalues to full precision. RTOL1 = FOUR * EPS RTOL2 = FOUR * EPS ELSE * DLARRE2A computes the eigenvalues to less than full precision. * DLARRV2 will refine the eigenvalue approximations, and we can * need less accurate initial bisection in DLARRE2A. RTOL1 = FOUR*SQRT(EPS) RTOL2 = MAX( SQRT(EPS)*5.0D-3, FOUR * EPS ) ENDIF CALL DLARRE2A( RANGE, N, WL, WU, IIL, IIU, D, E, $ WORK(INDE2), RTOL1, RTOL2, THRESH, NSPLIT, $ IWORK( IINSPL ), M, DOL, DOU, NEEDIL, NEEDIU, $ W, WORK( INDERR ), $ WORK( INDGP ), IWORK( IINDBL ), $ IWORK( IINDW ), WORK( INDGRS ), $ WORK( INDSDM ), PIVMIN, $ WORK( INDWRK ), IWORK( IINDWK ), $ MINRGP, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 100 + ABS( IINFO ) RETURN END IF * Note that if RANGE .NE. 'V', DLARRE2A computes bounds on the desired * part of the spectrum. All desired eigenvalues are contained in * (WL,WU] RETURN * * End of DSTEGR2A * END scalapack-2.0.2/SRC/dstegr2b.f000644 000766 000024 00000031245 11657111056 016233 0ustar00juliestaff000000 000000 SUBROUTINE DSTEGR2B( JOBZ, N, D, E, $ M, W, Z, LDZ, NZC, ISUPPZ, WORK, LWORK, IWORK, $ LIWORK, DOL, DOU, NEEDIL, NEEDIU, $ INDWLC, PIVMIN, SCALE, WL, WU, $ VSTART, FINISH, MAXCLS, $ NDEPTH, PARITY, ZOFFSET, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER JOBZ INTEGER DOL, DOU, INDWLC, INFO, LDZ, LIWORK, LWORK, M, $ MAXCLS, N, NDEPTH, NEEDIL, NEEDIU, NZC, PARITY, $ ZOFFSET DOUBLE PRECISION PIVMIN, SCALE, WL, WU LOGICAL VSTART, FINISH * .. * .. Array Arguments .. INTEGER ISUPPZ( * ), IWORK( * ) DOUBLE PRECISION D( * ), E( * ), W( * ), WORK( * ) DOUBLE PRECISION Z( LDZ, * ) * .. * * Purpose * ======= * * DSTEGR2B should only be called after a call to DSTEGR2A. * From eigenvalues and initial representations computed by DSTEGR2A, * DSTEGR2B computes the selected eigenvalues and eigenvectors * of the real symmetric tridiagonal matrix in parallel * on multiple processors. It is potentially invoked multiple times * on a given processor because the locally relevant representation tree * might depend on spectral information that is "owned" by other processors * and might need to be communicated. * * Please note: * 1. The calling sequence has two additional INTEGER parameters, * DOL and DOU, that should satisfy M>=DOU>=DOL>=1. * These parameters are only relevant for the case JOBZ = 'V'. * DSTEGR2B ONLY computes the eigenVECTORS * corresponding to eigenvalues DOL through DOU in W. (That is, * instead of computing the eigenvectors belonging to W(1) * through W(M), only the eigenvectors belonging to eigenvalues * W(DOL) through W(DOU) are computed. In this case, only the * eigenvalues DOL:DOU are guaranteed to be accurately refined * to all figures by Rayleigh-Quotient iteration. * * 2. The additional arguments VSTART, FINISH, NDEPTH, PARITY, ZOFFSET * are included as a thread-safe implementation equivalent to SAVE variables. * These variables store details about the local representation tree which is * computed layerwise. For scalability reasons, eigenvalues belonging to the * locally relevant representation tree might be computed on other processors. * These need to be communicated before the inspection of the RRRs can proceed * on any given layer. * Note that only when the variable FINISH is true, the computation has ended * All eigenpairs between DOL and DOU have been computed. M is set = DOU - DOL + 1. * * 3. DSTEGR2B needs more workspace in Z than the sequential DSTEGR. * It is used to store the conformal embedding of the local representation tree. * * Arguments * ========= * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the N diagonal elements of the tridiagonal matrix * T. On exit, D is overwritten. * * E (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the (N-1) subdiagonal elements of the tridiagonal * matrix T in elements 1 to N-1 of E. E(N) need not be set on * input, but is used internally as workspace. * On exit, E is overwritten. * * M (input) INTEGER * The total number of eigenvalues found * in DSTEGR2A. 0 <= M <= N. * * W (input) DOUBLE PRECISION array, dimension (N) * The first M elements contain approximations to the selected * eigenvalues in ascending order. Note that only the eigenvalues from * the locally relevant part of the representation tree, that is * all the clusters that include eigenvalues from DOL:DOU, are reliable * on this processor. (It does not need to know about any others anyway.) * * Z (output) DOUBLE PRECISION array, dimension (LDZ, max(1,M) ) * If JOBZ = 'V', and if INFO = 0, then * a subset of the first M columns of Z * contain the orthonormal eigenvectors of the matrix T * corresponding to the selected eigenvalues, with the i-th * column of Z holding the eigenvector associated with W(i). * See DOL, DOU for more information. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * JOBZ = 'V', then LDZ >= max(1,N). * * NZC (input) INTEGER * The number of eigenvectors to be held in the array Z. * * ISUPPZ (output) INTEGER ARRAY, dimension ( 2*max(1,M) ) * The support of the eigenvectors in Z, i.e., the indices * indicating the nonzero elements in Z. The i-th computed eigenvector * is nonzero only in elements ISUPPZ( 2*i-1 ) through * ISUPPZ( 2*i ). This is relevant in the case when the matrix * is split. ISUPPZ is only set if N>2. * * WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal * (and minimal) LWORK. * * LWORK (input) INTEGER * The dimension of the array WORK. LWORK >= max(1,18*N) * if JOBZ = 'V', and LWORK >= max(1,12*N) if JOBZ = 'N'. * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued. * * IWORK (workspace/output) INTEGER array, dimension (LIWORK) * On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. LIWORK >= max(1,10*N) * if the eigenvectors are desired, and LIWORK >= max(1,8*N) * if only the eigenvalues are to be computed. * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued. * * DOL (input) INTEGER * DOU (input) INTEGER * From the eigenvalues W(1:M), only eigenvectors * Z(:,DOL) to Z(:,DOU) are computed. * If DOL > 1, then Z(:,DOL-1-ZOFFSET) is used and overwritten. * If DOU < M, then Z(:,DOU+1-ZOFFSET) is used and overwritten. * * NEEDIL (input/output) INTEGER * NEEDIU (input/output) INTEGER * Describes which are the left and right outermost eigenvalues * still to be computed. Initially computed by DLARRE2A, * modified in the course of the algorithm. * * INDWLC (output) DOUBLE PRECISION * Pointer into the workspace, location where the local * eigenvalue representations are stored. ("Local eigenvalues" * are those relative to the individual shifts of the RRRs.) * * PIVMIN (input) DOUBLE PRECISION * The minimum pivot in the sturm sequence for T. * * SCALE (input) DOUBLE PRECISION * The scaling factor for T. Used for unscaling the eigenvalues * at the very end of the algorithm. * * WL (input) DOUBLE PRECISION * WU (input) DOUBLE PRECISION * The interval (WL, WU] contains all the wanted eigenvalues. * * VSTART (input/output) LOGICAL * .TRUE. on initialization, set to .FALSE. afterwards. * * FINISH (input/output) LOGICAL * indicates whether all eigenpairs have been computed * * MAXCLS (input/output) INTEGER * The largest cluster worked on by this processor in the * representation tree. * * NDEPTH (input/output) INTEGER * The current depth of the representation tree. Set to * zero on initial pass, changed when the deeper levels of * the representation tree are generated. * * PARITY (input/output) INTEGER * An internal parameter needed for the storage of the * clusters on the current level of the representation tree. * * ZOFFSET (input) INTEGER * Offset for storing the eigenpairs when Z is distributed * in 1D-cyclic fashion * * INFO (output) INTEGER * On exit, INFO * = 0: successful exit * other:if INFO = -i, the i-th argument had an illegal value * if INFO = 20X, internal error in DLARRV2. * Here, the digit X = ABS( IINFO ) < 10, where IINFO is * the nonzero error code returned by DLARRV2. * * .. Parameters .. DOUBLE PRECISION ONE, FOUR, MINRGP PARAMETER ( ONE = 1.0D0, $ FOUR = 4.0D0, $ MINRGP = 1.0D-3 ) * .. * .. Local Scalars .. LOGICAL LQUERY, WANTZ, ZQUERY INTEGER IINDBL, IINDW, IINDWK, IINFO, IINSPL, INDERR, $ INDGP, INDGRS, INDSDM, INDWRK, ITMP, J, LIWMIN, $ LWMIN DOUBLE PRECISION EPS, RTOL1, RTOL2 * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANST EXTERNAL LSAME, DLAMCH, DLANST * .. * .. External Subroutines .. EXTERNAL DLARRV2, DSCAL * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * WANTZ = LSAME( JOBZ, 'V' ) * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) * DSTEGR2B needs WORK of size 6*N, IWORK of size 3*N. * In addition, DLARRE2A needed WORK of size 6*N, IWORK of size 5*N. * Workspace is kept consistent even though DLARRE2A is not called here. * Furthermore, DLARRV2 needs WORK of size 12*N, IWORK of size 7*N. IF( WANTZ ) THEN LWMIN = 18*N LIWMIN = 10*N ELSE * need less workspace if only the eigenvalues are wanted LWMIN = 12*N LIWMIN = 8*N ENDIF * INFO = 0 * * Get machine constants. * EPS = DLAMCH( 'Precision' ) * IF( (N.EQ.0).OR.(N.EQ.1) ) THEN FINISH = .TRUE. RETURN ENDIF IF(ZQUERY.OR.LQUERY) $ RETURN * INDGRS = 1 INDERR = 2*N + 1 INDGP = 3*N + 1 INDSDM = 4*N + 1 INDWRK = 6*N + 1 INDWLC = INDWRK * IINSPL = 1 IINDBL = N + 1 IINDW = 2*N + 1 IINDWK = 3*N + 1 * Set the tolerance parameters for bisection RTOL1 = FOUR*SQRT(EPS) RTOL2 = MAX( SQRT(EPS)*5.0D-3, FOUR * EPS ) IF( WANTZ ) THEN * * Compute the desired eigenvectors corresponding to the computed * eigenvalues * CALL DLARRV2( N, WL, WU, D, E, $ PIVMIN, IWORK( IINSPL ), M, $ DOL, DOU, NEEDIL, NEEDIU, MINRGP, RTOL1, RTOL2, $ W, WORK( INDERR ), WORK( INDGP ), IWORK( IINDBL ), $ IWORK( IINDW ), WORK( INDGRS ), $ WORK( INDSDM ), Z, LDZ, $ ISUPPZ, WORK( INDWRK ), IWORK( IINDWK ), $ VSTART, FINISH, $ MAXCLS, NDEPTH, PARITY, ZOFFSET, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 200 + ABS( IINFO ) RETURN END IF * ELSE * DLARRE2A computed eigenvalues of the (shifted) root representation * DLARRV2 returns the eigenvalues of the unshifted matrix. * However, if the eigenvectors are not desired by the user, we need * to apply the corresponding shifts from DLARRE2A to obtain the * eigenvalues of the original matrix. DO 30 J = 1, M ITMP = IWORK( IINDBL+J-1 ) W( J ) = W( J ) + E( IWORK( IINSPL+ITMP-1 ) ) 30 CONTINUE * FINISH = .TRUE. * END IF * IF(FINISH) THEN * All eigenpairs have been computed * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( SCALE.NE.ONE ) THEN CALL DSCAL( M, ONE / SCALE, W, 1 ) END IF * * Correct M if needed * IF ( WANTZ ) THEN IF( DOL.NE.1 .OR. DOU.NE.M ) THEN M = DOU - DOL +1 ENDIF ENDIF * * No sorting of eigenpairs is done here, done later in the * calling subroutine * WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN ENDIF RETURN * * End of DSTEGR2B * END scalapack-2.0.2/SRC/dstein2.f000644 000766 000024 00000026505 10363532303 016065 0ustar00juliestaff000000 000000 * * SUBROUTINE DSTEIN2( N, D, E, M, W, IBLOCK, ISPLIT, ORFAC, Z, LDZ, $ WORK, IWORK, IFAIL, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INFO, LDZ, M, N DOUBLE PRECISION ORFAC * .. * .. Array Arguments .. INTEGER IBLOCK( * ), IFAIL( * ), ISPLIT( * ), $ IWORK( * ) DOUBLE PRECISION D( * ), E( * ), W( * ), WORK( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * DSTEIN2 computes the eigenvectors of a real symmetric tridiagonal * matrix T corresponding to specified eigenvalues, using inverse * iteration. * * The maximum number of iterations allowed for each eigenvector is * specified by an internal parameter MAXITS (currently set to 5). * * Arguments * ========= * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input) DOUBLE PRECISION array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. * * E (input) DOUBLE PRECISION array, dimension (N) * The (n-1) subdiagonal elements of the tridiagonal matrix * T, in elements 1 to N-1. E(N) need not be set. * * M (input) INTEGER * The number of eigenvectors to be found. 0 <= M <= N. * * W (input) DOUBLE PRECISION array, dimension (N) * The first M elements of W contain the eigenvalues for * which eigenvectors are to be computed. The eigenvalues * should be grouped by split-off block and ordered from * smallest to largest within the block. ( The output array * W from DSTEBZ with ORDER = 'B' is expected here. ) * * IBLOCK (input) INTEGER array, dimension (N) * The submatrix indices associated with the corresponding * eigenvalues in W; IBLOCK(i)=1 if eigenvalue W(i) belongs to * the first submatrix from the top, =2 if W(i) belongs to * the second submatrix, etc. ( The output array IBLOCK * from DSTEBZ is expected here. ) * * ISPLIT (input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to * ISPLIT( 1 ), the second of rows/columns ISPLIT( 1 )+1 * through ISPLIT( 2 ), etc. * ( The output array ISPLIT from DSTEBZ is expected here. ) * * ORFAC (input) DOUBLE PRECISION * ORFAC specifies which eigenvectors should be * orthogonalized. Eigenvectors that correspond to eigenvalues * which are within ORFAC*||T|| of each other are to be * orthogonalized. * * Z (output) DOUBLE PRECISION array, dimension (LDZ, M) * The computed eigenvectors. The eigenvector associated * with the eigenvalue W(i) is stored in the i-th column of * Z. Any vector which fails to converge is set to its current * iterate after MAXITS iterations. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= max(1,N). * * WORK (workspace) DOUBLE PRECISION array, dimension (5*N) * * IWORK (workspace) INTEGER array, dimension (N) * * IFAIL (output) INTEGER array, dimension (M) * On normal exit, all elements of IFAIL are zero. * If one or more eigenvectors fail to converge after * MAXITS iterations, then their indices are stored in * array IFAIL. * * INFO (output) INTEGER * = 0: successful exit. * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, then i eigenvectors failed to converge * in MAXITS iterations. Their indices are stored in * array IFAIL. * * Internal Parameters * =================== * * MAXITS INTEGER, default = 5 * The maximum number of iterations performed. * * EXTRA INTEGER, default = 2 * The number of iterations performed after norm growth * criterion is satisfied, should be at least 1. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, TEN, ODM1 PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TEN = 1.0D+1, $ ODM1 = 1.0D-1 ) INTEGER MAXITS, EXTRA PARAMETER ( MAXITS = 5, EXTRA = 2 ) * .. * .. Local Scalars .. INTEGER B1, BLKSIZ, BN, GPIND, I, IINFO, INDRV1, $ INDRV2, INDRV3, INDRV4, INDRV5, ITS, J, J1, $ JBLK, JMAX, NBLK, NRMCHK DOUBLE PRECISION EPS, EPS1, NRM, ONENRM, ORTOL, PERTOL, SCL, $ SEP, STPCRT, TOL, XJ, XJM, ZTR * .. * .. Local Arrays .. INTEGER ISEED( 4 ) * .. * .. External Functions .. INTEGER IDAMAX DOUBLE PRECISION DASUM, DDOT, DLAMCH, DNRM2 EXTERNAL IDAMAX, DASUM, DDOT, DLAMCH, DNRM2 * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DLAGTF, DLAGTS, DLARNV, DSCAL, $ XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 DO 10 I = 1, M IFAIL( I ) = 0 10 CONTINUE * IF( N.LT.0 ) THEN INFO = -1 ELSE IF( M.LT.0 .OR. M.GT.N ) THEN INFO = -4 ELSE IF( ORFAC.LT.ZERO ) THEN INFO = -8 ELSE IF( LDZ.LT.MAX( 1, N ) ) THEN INFO = -10 ELSE DO 20 J = 2, M IF( IBLOCK( J ).LT.IBLOCK( J-1 ) ) THEN INFO = -6 GO TO 30 END IF IF( IBLOCK( J ).EQ.IBLOCK( J-1 ) .AND. W( J ).LT.W( J-1 ) ) $ THEN INFO = -5 GO TO 30 END IF 20 CONTINUE 30 CONTINUE END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSTEIN2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) THEN RETURN ELSE IF( N.EQ.1 ) THEN Z( 1, 1 ) = ONE RETURN END IF * * Get machine constants. * EPS = DLAMCH( 'Precision' ) * * Initialize seed for random number generator DLARNV. * DO 40 I = 1, 4 ISEED( I ) = 1 40 CONTINUE * * Initialize pointers. * INDRV1 = 0 INDRV2 = INDRV1 + N INDRV3 = INDRV2 + N INDRV4 = INDRV3 + N INDRV5 = INDRV4 + N * * Compute eigenvectors of matrix blocks. * J1 = 1 DO 160 NBLK = 1, IBLOCK( M ) * * Find starting and ending indices of block nblk. * IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) BLKSIZ = BN - B1 + 1 IF( BLKSIZ.EQ.1 ) $ GO TO 60 GPIND = J1 * * Compute reorthogonalization criterion and stopping criterion. * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 50 I = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( I ) )+ABS( E( I-1 ) )+ $ ABS( E( I ) ) ) 50 CONTINUE ORTOL = ORFAC*ONENRM * STPCRT = SQRT( ODM1 / BLKSIZ ) * * Loop through eigenvalues of block nblk. * 60 CONTINUE JBLK = 0 DO 150 J = J1, M IF( IBLOCK( J ).NE.NBLK ) THEN J1 = J GO TO 160 END IF JBLK = JBLK + 1 XJ = W( J ) * * Skip all the work if the block size is one. * IF( BLKSIZ.EQ.1 ) THEN WORK( INDRV1+1 ) = ONE GO TO 120 END IF * * If eigenvalues j and j-1 are too close, add a relatively * small perturbation. * IF( JBLK.GT.1 ) THEN EPS1 = ABS( EPS*XJ ) PERTOL = TEN*EPS1 SEP = XJ - XJM IF( SEP.LT.PERTOL ) $ XJ = XJM + PERTOL END IF * ITS = 0 NRMCHK = 0 * * Get random starting vector. * CALL DLARNV( 2, ISEED, BLKSIZ, WORK( INDRV1+1 ) ) * * Copy the matrix T so it won't be destroyed in factorization. * CALL DCOPY( BLKSIZ, D( B1 ), 1, WORK( INDRV4+1 ), 1 ) CALL DCOPY( BLKSIZ-1, E( B1 ), 1, WORK( INDRV2+2 ), 1 ) CALL DCOPY( BLKSIZ-1, E( B1 ), 1, WORK( INDRV3+1 ), 1 ) * * Compute LU factors with partial pivoting ( PT = LU ) * TOL = ZERO CALL DLAGTF( BLKSIZ, WORK( INDRV4+1 ), XJ, WORK( INDRV2+2 ), $ WORK( INDRV3+1 ), TOL, WORK( INDRV5+1 ), IWORK, $ IINFO ) * * Update iteration count. * 70 CONTINUE ITS = ITS + 1 IF( ITS.GT.MAXITS ) $ GO TO 100 * * Normalize and scale the righthand side vector Pb. * SCL = BLKSIZ*ONENRM*MAX( EPS, $ ABS( WORK( INDRV4+BLKSIZ ) ) ) / $ DASUM( BLKSIZ, WORK( INDRV1+1 ), 1 ) CALL DSCAL( BLKSIZ, SCL, WORK( INDRV1+1 ), 1 ) * * Solve the system LU = Pb. * CALL DLAGTS( -1, BLKSIZ, WORK( INDRV4+1 ), WORK( INDRV2+2 ), $ WORK( INDRV3+1 ), WORK( INDRV5+1 ), IWORK, $ WORK( INDRV1+1 ), TOL, IINFO ) * * Reorthogonalize by modified Gram-Schmidt if eigenvalues are * close enough. * IF( JBLK.EQ.1 ) $ GO TO 90 IF( ABS( XJ-XJM ).GT.ORTOL ) $ GPIND = J * IF( GPIND.NE.J ) THEN DO 80 I = GPIND, J - 1 ZTR = -DDOT( BLKSIZ, WORK( INDRV1+1 ), 1, Z( B1, I ), $ 1 ) CALL DAXPY( BLKSIZ, ZTR, Z( B1, I ), 1, $ WORK( INDRV1+1 ), 1 ) 80 CONTINUE END IF * * Check the infinity norm of the iterate. * 90 CONTINUE JMAX = IDAMAX( BLKSIZ, WORK( INDRV1+1 ), 1 ) NRM = ABS( WORK( INDRV1+JMAX ) ) * * Continue for additional iterations after norm reaches * stopping criterion. * IF( NRM.LT.STPCRT ) $ GO TO 70 NRMCHK = NRMCHK + 1 IF( NRMCHK.LT.EXTRA+1 ) $ GO TO 70 * GO TO 110 * * If stopping criterion was not satisfied, update info and * store eigenvector number in array ifail. * 100 CONTINUE INFO = INFO + 1 IFAIL( INFO ) = J * * Accept iterate as jth eigenvector. * 110 CONTINUE SCL = ONE / DNRM2( BLKSIZ, WORK( INDRV1+1 ), 1 ) JMAX = IDAMAX( BLKSIZ, WORK( INDRV1+1 ), 1 ) IF( WORK( INDRV1+JMAX ).LT.ZERO ) $ SCL = -SCL CALL DSCAL( BLKSIZ, SCL, WORK( INDRV1+1 ), 1 ) 120 CONTINUE DO 130 I = 1, N Z( I, J ) = ZERO 130 CONTINUE DO 140 I = 1, BLKSIZ Z( B1+I-1, J ) = WORK( INDRV1+I ) 140 CONTINUE * * Save the shift to check eigenvalue spacing at next * iteration. * XJM = XJ * 150 CONTINUE 160 CONTINUE * RETURN * * End of DSTEIN2 * END scalapack-2.0.2/SRC/dsteqr2.f000644 000766 000024 00000033007 10363532303 016074 0ustar00juliestaff000000 000000 SUBROUTINE DSTEQR2( COMPZ, N, D, E, Z, LDZ, NR, WORK, INFO ) * * -- LAPACK routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER COMPZ INTEGER INFO, LDZ, N, NR * .. * .. Array Arguments .. DOUBLE PRECISION D( * ), E( * ), WORK( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * DSTEQR2 is a modified version of LAPACK routine DSTEQR. * DSTEQR2 computes all eigenvalues and, optionally, eigenvectors of a * symmetric tridiagonal matrix using the implicit QL or QR method. * DSTEQR2 is modified from DSTEQR to allow each ScaLAPACK process * running DSTEQR2 to perform updates on a distributed matrix Q. * Proper usage of DSTEQR2 can be gleaned from examination of ScaLAPACK's * PDSYEV. * * Arguments * ========= * * COMPZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only. * = 'I': Compute eigenvalues and eigenvectors of the * tridiagonal matrix. Z must be initialized to the * identity matrix by PDLASET or DLASET prior to entering * this subroutine. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the diagonal elements of the tridiagonal matrix. * On exit, if INFO = 0, the eigenvalues in ascending order. * * E (input/output) DOUBLE PRECISION array, dimension (N-1) * On entry, the (n-1) subdiagonal elements of the tridiagonal * matrix. * On exit, E has been destroyed. * * Z (local input/local output) DOUBLE PRECISION array, global * dimension (N, N), local dimension (LDZ, NR). * On entry, if COMPZ = 'V', then Z contains the orthogonal * matrix used in the reduction to tridiagonal form. * On exit, if INFO = 0, then if COMPZ = 'V', Z contains the * orthonormal eigenvectors of the original symmetric matrix, * and if COMPZ = 'I', Z contains the orthonormal eigenvectors * of the symmetric tridiagonal matrix. * If COMPZ = 'N', then Z is not referenced. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * eigenvectors are desired, then LDZ >= max(1,N). * * NR (input) INTEGER * NR = MAX(1, NUMROC( N, NB, MYPROW, 0, NPROCS ) ). * If COMPZ = 'N', then NR is not referenced. * * WORK (workspace) DOUBLE PRECISION array, dimension (max(1,2*N-2)) * If COMPZ = 'N', then WORK is not referenced. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: the algorithm has failed to find all the eigenvalues in * a total of 30*N iterations; if INFO = i, then i * elements of E have not converged to zero; on exit, D * and E contain the elements of a symmetric tridiagonal * matrix which is orthogonally similar to the original * matrix. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, TWO, THREE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, TWO = 2.0D0, $ THREE = 3.0D0 ) INTEGER MAXIT PARAMETER ( MAXIT = 30 ) * .. * .. Local Scalars .. INTEGER I, ICOMPZ, II, ISCALE, J, JTOT, K, L, L1, LEND, $ LENDM1, LENDP1, LENDSV, LM1, LSV, M, MM, MM1, $ NM1, NMAXIT DOUBLE PRECISION ANORM, B, C, EPS, EPS2, F, G, P, R, RT1, RT2, $ S, SAFMAX, SAFMIN, SSFMAX, SSFMIN, TST * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANST, DLAPY2 EXTERNAL LSAME, DLAMCH, DLANST, DLAPY2 * .. * .. External Subroutines .. EXTERNAL DLAE2, DLAEV2, DLARTG, DLASCL, DLASR, $ DLASRT, DSWAP, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SIGN, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 * IF( LSAME( COMPZ, 'N' ) ) THEN ICOMPZ = 0 ELSE IF( LSAME( COMPZ, 'I' ) ) THEN ICOMPZ = 1 ELSE ICOMPZ = -1 END IF IF( ICOMPZ.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( ICOMPZ.GT.0 .AND. LDZ.LT.MAX( 1, NR ) ) THEN INFO = -6 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSTEQR2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( N.EQ.1 ) THEN IF( ICOMPZ.EQ.1 ) $ Z( 1, 1 ) = ONE RETURN END IF * * Determine the unit roundoff and over/underflow thresholds. * EPS = DLAMCH( 'E' ) EPS2 = EPS**2 SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN SSFMAX = SQRT( SAFMAX ) / THREE SSFMIN = SQRT( SAFMIN ) / EPS2 * * Compute the eigenvalues and eigenvectors of the tridiagonal * matrix. * NMAXIT = N*MAXIT JTOT = 0 * * Determine where the matrix splits and choose QL or QR iteration * for each block, according to whether top or bottom diagonal * element is smaller. * L1 = 1 NM1 = N - 1 * 10 CONTINUE IF( L1.GT.N ) $ GO TO 160 IF( L1.GT.1 ) $ E( L1-1 ) = ZERO IF( L1.LE.NM1 ) THEN DO 20 M = L1, NM1 TST = ABS( E( M ) ) IF( TST.EQ.ZERO ) $ GO TO 30 IF( TST.LE.( SQRT( ABS( D( M ) ) )*SQRT( ABS( D( M+ $ 1 ) ) ) )*EPS ) THEN E( M ) = ZERO GO TO 30 END IF 20 CONTINUE END IF M = N * 30 CONTINUE L = L1 LSV = L LEND = M LENDSV = LEND L1 = M + 1 IF( LEND.EQ.L ) $ GO TO 10 * * Scale submatrix in rows and columns L to LEND * ANORM = DLANST( 'I', LEND-L+1, D( L ), E( L ) ) ISCALE = 0 IF( ANORM.EQ.ZERO ) $ GO TO 10 IF( ANORM.GT.SSFMAX ) THEN ISCALE = 1 CALL DLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L+1, 1, D( L ), N, $ INFO ) CALL DLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L, 1, E( L ), N, $ INFO ) ELSE IF( ANORM.LT.SSFMIN ) THEN ISCALE = 2 CALL DLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L+1, 1, D( L ), N, $ INFO ) CALL DLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L, 1, E( L ), N, $ INFO ) END IF * * Choose between QL and QR iteration * IF( ABS( D( LEND ) ).LT.ABS( D( L ) ) ) THEN LEND = LSV L = LENDSV END IF * IF( LEND.GT.L ) THEN * * QL Iteration * * Look for small subdiagonal element. * 40 CONTINUE IF( L.NE.LEND ) THEN LENDM1 = LEND - 1 DO 50 M = L, LENDM1 TST = ABS( E( M ) )**2 IF( TST.LE.( EPS2*ABS( D( M ) ) )*ABS( D( M+1 ) )+ $ SAFMIN )GO TO 60 50 CONTINUE END IF * M = LEND * 60 CONTINUE IF( M.LT.LEND ) $ E( M ) = ZERO P = D( L ) IF( M.EQ.L ) $ GO TO 80 * * If remaining matrix is 2-by-2, use DLAE2 or SLAEV2 * to compute its eigensystem. * IF( M.EQ.L+1 ) THEN IF( ICOMPZ.GT.0 ) THEN CALL DLAEV2( D( L ), E( L ), D( L+1 ), RT1, RT2, C, S ) WORK( L ) = C WORK( N-1+L ) = S CALL DLASR( 'R', 'V', 'B', NR, 2, WORK( L ), $ WORK( N-1+L ), Z( 1, L ), LDZ ) ELSE CALL DLAE2( D( L ), E( L ), D( L+1 ), RT1, RT2 ) END IF D( L ) = RT1 D( L+1 ) = RT2 E( L ) = ZERO L = L + 2 IF( L.LE.LEND ) $ GO TO 40 GO TO 140 END IF * IF( JTOT.EQ.NMAXIT ) $ GO TO 140 JTOT = JTOT + 1 * * Form shift. * G = ( D( L+1 )-P ) / ( TWO*E( L ) ) R = DLAPY2( G, ONE ) G = D( M ) - P + ( E( L ) / ( G+SIGN( R, G ) ) ) * S = ONE C = ONE P = ZERO * * Inner loop * MM1 = M - 1 DO 70 I = MM1, L, -1 F = S*E( I ) B = C*E( I ) CALL DLARTG( G, F, C, S, R ) IF( I.NE.M-1 ) $ E( I+1 ) = R G = D( I+1 ) - P R = ( D( I )-G )*S + TWO*C*B P = S*R D( I+1 ) = G + P G = C*R - B * * If eigenvectors are desired, then save rotations. * IF( ICOMPZ.GT.0 ) THEN WORK( I ) = C WORK( N-1+I ) = -S END IF * 70 CONTINUE * * If eigenvectors are desired, then apply saved rotations. * IF( ICOMPZ.GT.0 ) THEN MM = M - L + 1 CALL DLASR( 'R', 'V', 'B', NR, MM, WORK( L ), WORK( N-1+L ), $ Z( 1, L ), LDZ ) END IF * D( L ) = D( L ) - P E( L ) = G GO TO 40 * * Eigenvalue found. * 80 CONTINUE D( L ) = P * L = L + 1 IF( L.LE.LEND ) $ GO TO 40 GO TO 140 * ELSE * * QR Iteration * * Look for small superdiagonal element. * 90 CONTINUE IF( L.NE.LEND ) THEN LENDP1 = LEND + 1 DO 100 M = L, LENDP1, -1 TST = ABS( E( M-1 ) )**2 IF( TST.LE.( EPS2*ABS( D( M ) ) )*ABS( D( M-1 ) )+ $ SAFMIN )GO TO 110 100 CONTINUE END IF * M = LEND * 110 CONTINUE IF( M.GT.LEND ) $ E( M-1 ) = ZERO P = D( L ) IF( M.EQ.L ) $ GO TO 130 * * If remaining matrix is 2-by-2, use DLAE2 or SLAEV2 * to compute its eigensystem. * IF( M.EQ.L-1 ) THEN IF( ICOMPZ.GT.0 ) THEN CALL DLAEV2( D( L-1 ), E( L-1 ), D( L ), RT1, RT2, C, S ) WORK( M ) = C WORK( N-1+M ) = S CALL DLASR( 'R', 'V', 'F', NR, 2, WORK( M ), $ WORK( N-1+M ), Z( 1, L-1 ), LDZ ) ELSE CALL DLAE2( D( L-1 ), E( L-1 ), D( L ), RT1, RT2 ) END IF D( L-1 ) = RT1 D( L ) = RT2 E( L-1 ) = ZERO L = L - 2 IF( L.GE.LEND ) $ GO TO 90 GO TO 140 END IF * IF( JTOT.EQ.NMAXIT ) $ GO TO 140 JTOT = JTOT + 1 * * Form shift. * G = ( D( L-1 )-P ) / ( TWO*E( L-1 ) ) R = DLAPY2( G, ONE ) G = D( M ) - P + ( E( L-1 ) / ( G+SIGN( R, G ) ) ) * S = ONE C = ONE P = ZERO * * Inner loop * LM1 = L - 1 DO 120 I = M, LM1 F = S*E( I ) B = C*E( I ) CALL DLARTG( G, F, C, S, R ) IF( I.NE.M ) $ E( I-1 ) = R G = D( I ) - P R = ( D( I+1 )-G )*S + TWO*C*B P = S*R D( I ) = G + P G = C*R - B * * If eigenvectors are desired, then save rotations. * IF( ICOMPZ.GT.0 ) THEN WORK( I ) = C WORK( N-1+I ) = S END IF * 120 CONTINUE * * If eigenvectors are desired, then apply saved rotations. * IF( ICOMPZ.GT.0 ) THEN MM = L - M + 1 CALL DLASR( 'R', 'V', 'F', NR, MM, WORK( M ), WORK( N-1+M ), $ Z( 1, M ), LDZ ) END IF * D( L ) = D( L ) - P E( LM1 ) = G GO TO 90 * * Eigenvalue found. * 130 CONTINUE D( L ) = P * L = L - 1 IF( L.GE.LEND ) $ GO TO 90 GO TO 140 * END IF * * Undo scaling if necessary * 140 CONTINUE IF( ISCALE.EQ.1 ) THEN CALL DLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV+1, 1, $ D( LSV ), N, INFO ) CALL DLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV, 1, E( LSV ), $ N, INFO ) ELSE IF( ISCALE.EQ.2 ) THEN CALL DLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV+1, 1, $ D( LSV ), N, INFO ) CALL DLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV, 1, E( LSV ), $ N, INFO ) END IF * * Check for no convergence to an eigenvalue after a total * of N*MAXIT iterations. * IF( JTOT.LT.NMAXIT ) $ GO TO 10 DO 150 I = 1, N - 1 IF( E( I ).NE.ZERO ) $ INFO = INFO + 1 150 CONTINUE GO TO 190 * * Order eigenvalues and eigenvectors. * 160 CONTINUE IF( ICOMPZ.EQ.0 ) THEN * * Use Quick Sort * CALL DLASRT( 'I', N, D, INFO ) * ELSE * * Use Selection Sort to minimize swaps of eigenvectors * DO 180 II = 2, N I = II - 1 K = I P = D( I ) DO 170 J = II, N IF( D( J ).LT.P ) THEN K = J P = D( J ) END IF 170 CONTINUE IF( K.NE.I ) THEN D( K ) = D( I ) D( I ) = P CALL DSWAP( NR, Z( 1, I ), 1, Z( 1, K ), 1 ) END IF 180 CONTINUE END IF * 190 CONTINUE RETURN * * End of DSTEQR2 * END scalapack-2.0.2/SRC/dtrmvt.f000644 000766 000024 00000011217 10363532303 016027 0ustar00juliestaff000000 000000 SUBROUTINE DTRMVT( UPLO, N, T, LDT, X, INCX, Y, INCY, W, INCW, Z, $ INCZ ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCW, INCX, INCY, INCZ, LDT, N * .. * .. Array Arguments .. DOUBLE PRECISION T( LDT, * ), W( * ), X( * ), Y( * ), Z( * ) * .. * * Purpose * ======= * * DTRMVT performs the matrix-vector operations * * x := T' *y, and w := T *z, * * where x is an n element vector and T is an n by n * upper or lower triangular matrix. * * Arguments * ========= * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * T - DOUBLE PRECISION array of DIMENSION ( LDT, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array T must contain the upper * triangular matrix and the strictly lower triangular part of * T is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array T must contain the lower * triangular matrix and the strictly upper triangular part of * T is not referenced. * * LDT - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * On exit, X = T' * y * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * W - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCW ) ). * On exit, W = T * z * * INCW - INTEGER. * On entry, INCW specifies the increment for the elements of * W. INCW must not be zero. * Unchanged on exit. * * Z - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCZ ) ). * Before entry, the incremented array Z must contain the n * element vector z. Unchanged on exit. * * INCY - INTEGER. * On entrz, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * * .. Local Scalars .. INTEGER INFO * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DCOPY, DTRMV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN INFO = 4 ELSE IF( INCW.EQ.0 ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 10 ELSE IF( INCZ.EQ.0 ) THEN INFO = 12 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DTRMVT', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * * IF( INCX.NE.1 .OR. INCY.NE.1 .OR. INCW.NE.1 .OR. INCZ.NE.1 .OR. $ .TRUE. ) THEN CALL DCOPY( N, Y, INCY, X, INCX ) CALL DTRMV( UPLO, 'C', 'N', N, T, LDT, X, INCX ) CALL DCOPY( N, Z, INCZ, W, INCW ) CALL DTRMV( UPLO, 'N', 'N', N, T, LDT, W, INCW ) RETURN END IF * RETURN * * End of DTRMVT. * END scalapack-2.0.2/SRC/getpbbuf.c000644 000766 000024 00000002154 10363532303 016302 0ustar00juliestaff000000 000000 #include "tools.h" char * getpbbuf( mess, length ) /* * .. Scalar Arguments .. */ int length; /* * .. Array Arguments .. */ char * mess; { /* * Purpose * ======= * * getpbbuf returns a pointer to a working buffer of size length alloca- * ted for the PBLAS routines. * * ====================================================================== * * .. Local Scalars .. */ static char * pblasbuf = NULL; static int pbbuflen = 0, mone = -1; /* .. * .. External Functions .. */ void blacs_abort_(); /* .. * .. Executable Statements .. */ if( length >= 0 ) { if( length > pbbuflen ) { if( pblasbuf ) free( pblasbuf ); pblasbuf = (char *) malloc((unsigned)length); if( !pblasbuf ) { fprintf( stderr, "PBLAS %s ERROR: Memory allocation failed\n", mess ); blacs_abort_( &mone, &mone ); } pbbuflen = length; } } else if( pblasbuf ) { free( pblasbuf ); pblasbuf = NULL; pbbuflen = 0; } return( pblasbuf ); } scalapack-2.0.2/SRC/lamov.h000644 000766 000024 00000004632 11745567264 015655 0ustar00juliestaff000000 000000 // // lamov.h // // Written by Lee Killough 04/19/2012 // #include "pblas.h" #include extern void xerbla_(const char *, const F_INTG_FCT *, size_t); void LACPY(const char *UPLO, const F_INTG_FCT *M, const F_INTG_FCT *N, const TYPE *A, const F_INTG_FCT *LDA, TYPE *B, const F_INTG_FCT *LDB); void LAMOV(const char *UPLO, const F_INTG_FCT *M, const F_INTG_FCT *N, const TYPE *A, const F_INTG_FCT *LDA, TYPE *B, const F_INTG_FCT *LDB) { const F_INTG_FCT m = *M; const F_INTG_FCT n = *N; const F_INTG_FCT lda = *LDA; const F_INTG_FCT ldb = *LDB; if (B + m-1 + ldb*(n-1) < A || A + m-1 + lda*(n-1) < B) { LACPY(UPLO, M, N, A, LDA, B, LDB); } else if (lda != ldb) { TYPE *tmp = malloc(sizeof(*A) * m * n); if (!tmp) { F_INTG_FCT info = -1; const char func[] = FUNC; xerbla_(func, &info, sizeof func); } else { LACPY(UPLO, M, N, A, LDA, tmp, &m); LACPY(UPLO, M, N, tmp, &m, B, LDB); free(tmp); } } else { F_INTG_FCT i, j; switch (toupper(*UPLO)) { case 'U': if (A > B) { for (j=0; j=0;) for (i=j=0;) B[i+ldb*j] = A[i+lda*j]; } break; case 'L': if (A > B) { for (j=0; j=0;) for (i=m; --i>=j;) B[i+ldb*j] = A[i+lda*j]; } break; default: if (A > B) { for (j=0; j=0;) for (i=m; --i>=0;) B[i+ldb*j] = A[i+lda*j]; } break; } } } scalapack-2.0.2/SRC/Makefile000644 000766 000024 00000024603 11745552113 016011 0ustar00juliestaff000000 000000 ############################################################################ # # Program: ScaLAPACK # # Module: Makefile # # Purpose: SRC Makefile # # Creation date: March 20, 1995 # # Modified: # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../SLmake.inc ####################################################################### # This is the makefile to create a library for ScaLAPACK. # The files are organized as follows: # ALLAUX -- Auxiliary routines called from all precisions # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX # DZLAUX -- Auxiliary routines called from both DOUBLE PRECISION # and COMPLEX*16 # SLASRC -- Single precision real ScaLAPACK routines # CLASRC -- Single precision complex ScaLAPACK routines # DLASRC -- Double precision real ScaLAPACK routines # ZLASRC -- Double precision complex ScaLAPACK routines # # The library can be set up to include routines for any combination # of the four precisions. First, modify the ARCH, ARCHFLAGS, RANLIB, # FC and FCFLAGS definitions in ../SLmake.inc to match your library # archiver, compiler and the options to be used. # Then to create or add to the library, enter make followed by one or # more of the precisions desired. Some examples: # make single # make single complex # make single double complex complex16 # Alternatively, the command # make # without any arguments creates a library of all four precisions. # The name of the library is defined by ../$(SCALAPACKLIB) in # ../SLmake.inc and is created at the next higher directory level. # # To remove the object files after the library is created, enter # make clean # ####################################################################### ALLAUX = pjlaenv.o pbchkvect.o getpbbuf.o pilaenvx.o piparmq.o pilaver.o pmpim2.o pmpcol.o SCLAUX = \ pslabad.o pslaed0.o pslaed1.o pslaed2.o pslaed3.o pslaedz.o pslaiect.o \ pslamch.o pslared1d.o pslasrt.o psstebz.o psstedc.o slapst.o slasrt2.o \ sstein2.o \ slar1va.o slarrb2.o slarrd2.o slarre2.o slarre2a.o slarrf2.o \ slarrv2.o sstegr2.o sstegr2a.o sstegr2b.o slamov.o clamov.o DZLAUX = \ pdlabad.o pdlaed0.o pdlaed1.o pdlaed2.o pdlaed3.o pdlaedz.o pdlaiect.o \ pdlamch.o pdlared1d.o pdlasrt.o pdstebz.o pdstedc.o dlapst.o dlasrt2.o \ dstein2.o \ dlar1va.o dlarrb2.o dlarrd2.o dlarre2.o dlarre2a.o dlarrf2.o \ dlarrv2.o dstegr2.o dstegr2a.o dstegr2b.o dlamov.o zlamov.o SLASRC = \ psdbsv.o psdbtrf.o psdbtrs.o psdbtrsv.o \ psdtsv.o psdttrf.o psdttrs.o psdttrsv.o \ psgbsv.o psgbtrf.o psgbtrs.o \ psgebd2.o psgebrd.o psgecon.o psgeequ.o psgehd2.o psgehrd.o \ psgelq2.o psgelqf.o psgels.o psgeql2.o psgeqlf.o psgeqpf.o psgeqr2.o \ psgeqrf.o psgerfs.o psgerq2.o psgerqf.o \ psgesv.o psgesvd.o psgesvx.o psgetf2.o psgetrf.o psgetri.o psgetrs.o \ psggqrf.o psggrqf.o pslabrd.o \ pslacon.o pslacp2.o pslacpy.o pslahrd.o pslange.o pslanhs.o pslansy.o \ pslantr.o pslapiv.o pslapv2.o pslaqge.o pslaqsy.o pslarf.o pslarfb.o \ pslarfg.o pslarft.o pslase2.o pslaset.o pslascl.o pslassq.o pslaswp.o \ pslatra.o pslatrd.o pslatrs.o pslauu2.o pslauum.o psorg2l.o psorg2r.o \ psorgl2.o psorglq.o psorgql.o psorgqr.o psorgr2.o \ psorgrq.o psorm2l.o psorm2r.o psormbr.o psormhr.o psorml2.o \ psormlq.o psormql.o psormqr.o psormr2.o psormrq.o psormtr.o pspocon.o \ pspbsv.o pspbtrf.o pspbtrs.o pspbtrsv.o \ psptsv.o pspttrf.o pspttrs.o pspttrsv.o \ pspoequ.o psporfs.o psposv.o psposvx.o pspotf2.o pspotrf.o pspotri.o \ pspotrs.o psrscl.o psstein.o pssyev.o pssyevd.o pssyevx.o \ pssygs2.o pssygst.o \ pssygvx.o pssyngst.o pssyntrd.o pssyttrd.o pssytd2.o pssytrd.o \ pstrti2.o pstrtri.o pstrtrs.o \ pslaevswp.o \ pslarzb.o pslarzt.o pslarz.o pslatrz.o pstzrzf.o psormr3.o psormrz.o \ pslahqr.o pslaconsb.o pslacp3.o pslawil.o \ pslasmsub.o pslared2d.o pslamr1d.o slaref.o slamsh.o slasorte.o ssteqr2.o \ sdbtf2.o sdbtrf.o sdttrf.o sdttrsv.o spttrsv.o strmvt.o pssyevr.o \ bslaapp.o bslaexc.o bstrexc.o pstrord.o pstrsen.o psgebal.o pshseqr.o \ pslamve.o pslaqr0.o pslaqr1.o pslaqr2.o pslaqr3.o pslaqr4.o pslaqr5.o \ psrot.o slaqr6.o CLASRC = \ pcdbsv.o pcdbtrf.o pcdbtrs.o pcdbtrsv.o \ pcdtsv.o pcdttrf.o pcdttrs.o pcdttrsv.o \ pcgbsv.o pcgbtrf.o pcgbtrs.o \ pcgebd2.o pcgebrd.o pcgecon.o pcgeequ.o pcgehd2.o pcgehrd.o \ pcgelq2.o pcgelqf.o pcgels.o pcgeql2.o pcgeqlf.o pcgeqpf.o pcgeqr2.o \ pcgeqrf.o pcgerfs.o pcgerq2.o pcgerqf.o \ pcgesv.o pcgesvd.o pcgesvx.o pcgetf2.o pcgetrf.o pcgetri.o pcgetrs.o \ pcggqrf.o \ pcggrqf.o pcheev.o pcheevd.o pcheevx.o pchegs2.o pchegst.o pchegvx.o \ pchengst.o pchentrd.o pchettrd.o pchetd2.o \ pchetrd.o pclabrd.o pclacon.o pclacgv.o pclacp2.o pclacpy.o pclahrd.o \ pclahqr.o pclaconsb.o pclasmsub.o pclacp3.o pclawil.o pcrot.o \ pclange.o pclanhe.o pclanhs.o pclansy.o pclantr.o pclapiv.o pclapv2.o \ pclaqge.o pclaqsy.o pclarf.o pclarfb.o pclarfc.o pclarfg.o pclarft.o \ pclascl.o pclase2.o pclaset.o pclassq.o pclaswp.o pclatra.o pclatrd.o \ pclatrs.o pclauu2.o pclauum.o pcpocon.o pcpoequ.o pcporfs.o pcposv.o \ pcpbsv.o pcpbtrf.o pcpbtrs.o pcpbtrsv.o \ pcptsv.o pcpttrf.o pcpttrs.o pcpttrsv.o \ pcposvx.o pcpotf2.o pcpotrf.o pcpotri.o pcpotrs.o pcsrscl.o pcstein.o \ pctrevc.o pctrti2.o pctrtri.o pctrtrs.o pcung2l.o pcung2r.o \ pcungl2.o pcunglq.o pcungql.o pcungqr.o pcungr2.o pcungrq.o \ pcunm2l.o pcunm2r.o pcunmbr.o pcunmhr.o pcunml2.o pcunmlq.o \ pcunmql.o pcunmqr.o pcunmr2.o pcunmrq.o pcunmtr.o \ pclaevswp.o \ pclarzb.o pclarzt.o pclarz.o pclarzc.o pclatrz.o pctzrzf.o \ pclattrs.o \ pcunmr3.o pcunmrz.o pcmax1.o pscsum1.o pclamr1d.o \ cdbtf2.o cdbtrf.o cdttrf.o cdttrsv.o cpttrsv.o csteqr2.o ctrmvt.o \ clamsh.o claref.o clanv2.o clahqr2.o pcheevr.o DLASRC = \ pddbsv.o pddbtrf.o pddbtrs.o pddbtrsv.o \ pddtsv.o pddttrf.o pddttrs.o pddttrsv.o \ pdgbsv.o pdgbtrf.o pdgbtrs.o \ pdgebd2.o pdgebrd.o pdgecon.o pdgeequ.o pdgehd2.o pdgehrd.o \ pdgelq2.o pdgelqf.o pdgels.o pdgeql2.o pdgeqlf.o pdgeqpf.o pdgeqr2.o \ pdgeqrf.o pdgerfs.o pdgerq2.o pdgerqf.o \ pdgesv.o pdgesvd.o pdgesvx.o pdgetf2.o pdgetrf.o pdgetri.o pdgetrs.o \ pdggqrf.o pdggrqf.o pdlabrd.o \ pdlacon.o pdlacp2.o pdlacpy.o pdlahrd.o pdlange.o pdlanhs.o pdlansy.o \ pdlantr.o pdlapiv.o pdlapv2.o pdlaqge.o pdlaqsy.o pdlarf.o pdlarfb.o \ pdlarfg.o pdlarft.o pdlase2.o pdlaset.o pdlascl.o pdlassq.o pdlaswp.o \ pdlatra.o pdlatrd.o pdlatrs.o pdlauu2.o pdlauum.o pdorg2l.o pdorg2r.o \ pdorgl2.o pdorglq.o pdorgql.o pdorgqr.o pdorgr2.o \ pdorgrq.o pdorm2l.o pdorm2r.o pdormbr.o pdormhr.o pdorml2.o \ pdormlq.o pdormql.o pdormqr.o pdormr2.o pdormrq.o pdormtr.o pdpocon.o \ pdpbsv.o pdpbtrf.o pdpbtrs.o pdpbtrsv.o \ pdptsv.o pdpttrf.o pdpttrs.o pdpttrsv.o \ pdpoequ.o pdporfs.o pdposv.o pdposvx.o pdpotf2.o pdpotrf.o pdpotri.o \ pdpotrs.o pdrscl.o pdstein.o pdsyev.o pdsyevd.o pdsyevx.o \ pdsygs2.o pdsygst.o \ pdsygvx.o pdsyngst.o pdsyntrd.o pdsyttrd.o pdsytd2.o pdsytrd.o pdtrti2.o \ pdtrtri.o pdtrtrs.o \ pdlaevswp.o \ pdlarzb.o pdlarzt.o pdlarz.o pdlatrz.o pdtzrzf.o pdormr3.o pdormrz.o \ pdlahqr.o pdlaconsb.o pdlacp3.o pdlawil.o \ pdlasmsub.o pdlared2d.o pdlamr1d.o dlaref.o dlamsh.o dlasorte.o dsteqr2.o \ ddbtf2.o ddbtrf.o ddttrf.o ddttrsv.o dpttrsv.o dtrmvt.o pdsyevr.o \ bdlaapp.o bdlaexc.o bdtrexc.o dlaqr6.o pdtrord.o \ pdtrsen.o pdgebal.o pdhseqr.o pdlamve.o pdlaqr0.o pdlaqr1.o pdlaqr2.o \ pdlaqr3.o pdlaqr4.o pdlaqr5.o pdrot.o ZLASRC = \ pzdbsv.o pzdbtrf.o pzdbtrs.o pzdbtrsv.o \ pzdtsv.o pzdttrf.o pzdttrs.o pzdttrsv.o \ pzgbsv.o pzgbtrf.o pzgbtrs.o \ pzgebd2.o pzgebrd.o pzgecon.o pzgeequ.o pzgehd2.o pzgehrd.o \ pzgelq2.o pzgelqf.o pzgels.o pzgeql2.o pzgeqlf.o pzgeqpf.o pzgeqr2.o \ pzgeqrf.o pzgerfs.o pzgerq2.o pzgerqf.o \ pzgesv.o pzgesvd.o pzgesvx.o pzgetf2.o pzgetrf.o pzgetri.o pzgetrs.o \ pzggqrf.o \ pzggrqf.o pzheev.o pzheevd.o pzheevx.o pzhegs2.o pzhegst.o pzhegvx.o \ pzhengst.o pzhentrd.o pzhettrd.o pzhetd2.o \ pzhetrd.o pzlabrd.o pzlacon.o pzlacgv.o pzlacp2.o pzlacpy.o pzlahrd.o \ pzlahqr.o pzlaconsb.o pzlasmsub.o pzlacp3.o pzlawil.o pzrot.o \ pzlange.o pzlanhe.o pzlanhs.o pzlansy.o pzlantr.o pzlapiv.o pzlapv2.o \ pzlaqge.o pzlaqsy.o pzlarf.o pzlarfb.o pzlarfc.o pzlarfg.o pzlarft.o \ pzlascl.o pzlase2.o pzlaset.o pzlassq.o pzlaswp.o pzlatra.o pzlatrd.o \ pzlattrs.o \ pzlatrs.o pzlauu2.o pzlauum.o pzpocon.o pzpoequ.o pzporfs.o pzposv.o \ pzpbsv.o pzpbtrf.o pzpbtrs.o pzpbtrsv.o \ pzptsv.o pzpttrf.o pzpttrs.o pzpttrsv.o \ pzposvx.o pzpotf2.o pzpotrf.o pzpotri.o pzpotrs.o pzdrscl.o pzstein.o \ pztrevc.o pztrti2.o pztrtri.o pztrtrs.o pzung2l.o pzung2r.o \ pzungl2.o pzunglq.o pzungql.o pzungqr.o pzungr2.o pzungrq.o \ pzunm2l.o pzunm2r.o pzunmbr.o pzunmhr.o pzunml2.o pzunmlq.o \ pzunmql.o pzunmqr.o pzunmr2.o pzunmrq.o pzunmtr.o \ pzlaevswp.o \ pzlarzb.o pzlarzt.o pzlarz.o pzlarzc.o pzlatrz.o pztzrzf.o \ pzunmr3.o pzunmrz.o pzmax1.o pdzsum1.o pzlamr1d.o \ zdbtf2.o zdbtrf.o zdttrf.o zdttrsv.o zpttrsv.o zsteqr2.o ztrmvt.o \ zlamsh.o zlaref.o zlanv2.o zlahqr2.o pzheevr.o all: single complex double complex16 single: $(SLASRC) $(SCLAUX) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../$(SCALAPACKLIB) $(SLASRC) $(SCLAUX) \ $(ALLAUX) $(RANLIB) ../$(SCALAPACKLIB) complex: $(CLASRC) $(SCLAUX) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../$(SCALAPACKLIB) $(CLASRC) $(SCLAUX) \ $(ALLAUX) $(RANLIB) ../$(SCALAPACKLIB) double: $(DLASRC) $(DZLAUX) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../$(SCALAPACKLIB) $(DLASRC) $(DZLAUX) \ $(ALLAUX) $(RANLIB) ../$(SCALAPACKLIB) complex16: $(ZLASRC) $(DZLAUX) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../$(SCALAPACKLIB) $(ZLASRC) $(DZLAUX) \ $(ALLAUX) $(RANLIB) ../$(SCALAPACKLIB) slamov.o: slamov.c dlamov.o: dlamov.c clamov.o: clamov.c zlamov.o: zlamov.c slamov.o dlamov.o clamov.o zlamov.o: lamov.h $(CC) -c $(CFLAGS) $(CDEFS) $(@:.o=.c) -o $@ clean : rm -f *.o .f.o : $(FC) -c $(FCFLAGS) $*.f .c.o : $(CC) -c $(CDEFS) $(CCFLAGS) $*.c scalapack-2.0.2/SRC/pbchkvect.c000644 000766 000024 00000020521 10363532303 016453 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 1.5) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 17, 1995 * * --------------------------------------------------------------------- */ /* * Include files */ #include "tools.h" void pbchkvect( n, npos0, ix, jx, desc_X, incx, dpos0, iix, jjx, ixrow, ixcol, nprow, npcol, myrow, mycol, info ) /* * .. Scalar Arguments .. */ int dpos0, * iix, incx, * info, ix, * ixcol, * ixrow, * jjx, jx, myrow, mycol, npcol, nprow, n, npos0; /* * .. Array Arguments .. */ int desc_X[]; { /* * * Purpose * ======= * * pbchkvect checks the validity of a descriptor vector DESCX, the * related global indexes IX, JX and the global increment INCX. It also * computes the starting local indexes (IIX,JJX) corresponding to the * submatrix starting globally at the entry pointed by (IX,JX). * Moreover, this routine returns the coordinates in the grid of the * process owning the global matrix entry of indexes (IX,JX), namely * (IXROW,IXCOL). The routine prevents out-of-bound memory access * by performing the appropriate MIN operation on iix and JJX. Finally, * if an inconsistency is found among its parameters IX, JX, DESCX and * INCX, the routine returns an error code in info. * * Arguments * ========= * * N (global input) INTEGER * The length of the vector X being operated on. * * NPOS0 (global input) INTEGER * Where in the calling routine's parameter list N appears. * * IX (global input) INTEGER * X's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JX (global input) INTEGER * X's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * DPOS0 (global input) INTEGER * Where in the calling routine's parameter list DESCX * appears. Note that we assume IX and JX are respectively 2 * and 1 entries behind DESCX, and INCX is 1 entry after DESCX. * * IIX (local output) pointer to INTEGER * The local rows starting index of the submatrix. * * JJX (local output) pointer to INTEGER * The local columns starting index of the submatrix. * * IXROW (global output) pointer to INTEGER * The row coordinate of the process that possesses the first * row and column of the submatrix. * * IXCOL (global output) pointer to INTEGER * The column coordinate of the process that possesses the * first row and column of the submatrix. * * NPROW (global input) INTEGER * The total number of process rows over which the distributed * matrix is distributed. * * NPCOL (global input) INTEGER * The total number of process columns over which the * distributed matrix is distributed. * * MYROW (local input) INTEGER * The row coordinate of the process calling this routine. * * MYCOL (local input) INTEGER * The column coordinate of the process calling this routine. * * INFO (local input/local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. */ #define DESCMULT 100 #define BIGNUM 10000 /* .. * .. Local Scalars .. */ int descpos, ExtraColBlock, ExtraRowBlock, icpos, ixpos, jxpos, MyColBlock, MyColDist, MyRowBlock, MyRowDist, NColBlock, np, npos, nq, NRowBlock; /* .. * .. External Functions .. */ F_INTG_FCT numroc_(); /* * .. Executable Statements .. */ if( *info >= 0 ) *info = BIGNUM; else if( *info < -DESCMULT ) *info = -(*info); else *info = -(*info) * DESCMULT; /* * Figure where in parameter list each parameter was, factoring in * descriptor multiplier */ npos = npos0 * DESCMULT; ixpos = ( dpos0 - 2 ) * DESCMULT; jxpos = ( dpos0 - 1 ) * DESCMULT; icpos = ( dpos0 + 1 ) * DESCMULT; descpos = dpos0 * DESCMULT + 1; /* * Check that we have a legal descriptor type */ if(desc_X[DT_] != BLOCK_CYCLIC_2D) *info = MIN( *info, descpos + DT_ ); /* * Check that matrix values make sense from local viewpoint */ if( n < 0 ) *info = MIN( *info, npos ); else if( ix < 1 ) *info = MIN( *info, ixpos ); else if( jx < 1 ) *info = MIN( *info, jxpos ); else if( desc_X[MB_] < 1 ) *info = MIN( *info, descpos + MB_ ); else if( desc_X[NB_] < 1 ) *info = MIN( *info, descpos + NB_ ); else if( ( desc_X[RSRC_] < 0 ) || ( desc_X[RSRC_] >= nprow ) ) *info = MIN( *info, descpos + RSRC_ ); else if( ( desc_X[CSRC_] < 0 ) || ( desc_X[CSRC_] >= npcol ) ) *info = MIN( *info, descpos + CSRC_ ); else if( incx != 1 && incx != desc_X[M_] ) *info = MIN( *info, icpos ); else if( desc_X[LLD_] < 1 ) *info = MIN( *info, descpos + LLD_ ); if( n == 0 ) { /* * NULL matrix, relax some checks */ if( desc_X[M_] < 0 ) *info = MIN( *info, descpos + M_ ); if( desc_X[N_] < 0 ) *info = MIN( *info, descpos + N_ ); } else { /* * more rigorous checks for non-degenerate matrices */ if( desc_X[M_] < 1 ) *info = MIN( *info, descpos + M_ ); else if( desc_X[N_] < 1 ) *info = MIN( *info, descpos + N_ ); else if( ( incx == desc_X[M_] ) && ( jx+n-1 > desc_X[N_] ) ) *info = MIN( *info, jxpos ); else if( ( incx == 1 ) && ( incx != desc_X[M_] ) && ( ix+n-1 > desc_X[M_] ) ) *info = MIN( *info, ixpos ); else { if( ix > desc_X[M_] ) *info = MIN( *info, ixpos ); else if( jx > desc_X[N_] ) *info = MIN( *info, jxpos ); } } /* * Retrieve local information for vector X, and prepare output: * set info = 0 if no error, and divide by DESCMULT if error is not * in a descriptor entry. */ if( *info == BIGNUM ) { MyRowDist = ( myrow + nprow - desc_X[RSRC_] ) % nprow; MyColDist = ( mycol + npcol - desc_X[CSRC_] ) % npcol; NRowBlock = desc_X[M_] / desc_X[MB_]; NColBlock = desc_X[N_] / desc_X[NB_]; np = ( NRowBlock / nprow ) * desc_X[MB_]; nq = ( NColBlock / npcol ) * desc_X[NB_]; ExtraRowBlock = NRowBlock % nprow; ExtraColBlock = NColBlock % npcol; ix--; jx--; MyRowBlock = ix / desc_X[MB_]; MyColBlock = jx / desc_X[NB_]; *ixrow = ( MyRowBlock + desc_X[RSRC_] ) % nprow; *ixcol = ( MyColBlock + desc_X[CSRC_] ) % npcol; *iix = ( MyRowBlock / nprow + 1 ) * desc_X[MB_] + 1; *jjx = ( MyColBlock / npcol + 1 ) * desc_X[NB_] + 1; if( MyRowDist >= ( MyRowBlock % nprow ) ) { if( myrow == *ixrow ) *iix += ix % desc_X[MB_]; *iix -= desc_X[MB_]; } if( MyRowDist < ExtraRowBlock ) np += desc_X[MB_]; else if( MyRowDist == ExtraRowBlock ) np += ( desc_X[M_] % desc_X[MB_] ); np = MAX( 1, np ); if( MyColDist >= ( MyColBlock % npcol ) ) { if( mycol == *ixcol ) *jjx += jx % desc_X[NB_]; *jjx -= desc_X[NB_]; } if( MyColDist < ExtraColBlock ) nq += desc_X[NB_]; else if( MyColDist == ExtraColBlock ) nq += ( desc_X[N_] % desc_X[NB_] ); nq = MAX( 1, nq ); *iix = MIN( *iix, np ); *jjx = MIN( *jjx, nq ); if( desc_X[LLD_] < np ) { if( numroc_(&desc_X[N_], &desc_X[NB_], &mycol, &desc_X[CSRC_], &npcol) ) *info = -( descpos + LLD_ ); else *info = 0; } else *info = 0; } else if( *info % DESCMULT == 0 ) { *info = -(*info) / DESCMULT; } else { *info = -(*info); } } scalapack-2.0.2/SRC/pblas.h000644 000766 000024 00000106267 11745552113 015632 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- ScaLAPACK routine (version 1.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 17, 1996 * * --------------------------------------------------------------------- */ /* * This file includes the standard C libraries, as well as system * dependent include files. All PBLAS routines include this file. */ /* * ======================================================================== * Machine Specific PBLAS macros * ======================================================================== */ #define _HAL_ 0 #define _T3D_ 1 #ifdef T3D #define _MACH_ _T3D_ #endif #ifndef _MACH_ #define _MACH_ _HAL_ #endif /* * ======================================================================== * Include files * ======================================================================== */ #include #include #if( _MACH_ == _T3D_ ) #include #endif /* * ======================================================================== * FORTRAN <-> C interface * ======================================================================== * * These macros define how the PBLAS will be called. _F2C_ADD_ assumes * that they will be called by FORTRAN, which expects C routines to have * an underscore postfixed to the name (Suns, and Intel machines expect * this). _F2C_NOCHANGE indicates that FORTRAN will be calling, and that * it expects the name called by FORTRAN to be identical to that compiled * by the C (RS6K's do this). _F2C_UPCASE says it expects C routines * called by FORTRAN to be in all upcase (CRAY wants this). * _F2C_F77ISF2C indicates that the fortran "compiler" in use is * actually f2c, a FORTRAN to C converter. */ #define _F2C_ADD_ 0 #define _F2C_NOCHANGE 1 #define _F2C_UPCASE 2 #define _F2C_F77ISF2C 3 #ifdef UpCase #define _F2C_CALL_ _F2C_UPCASE #endif #ifdef NoChange #define _F2C_CALL_ _F2C_NOCHANGE #endif #ifdef Add_ #define _F2C_CALL_ _F2C_ADD_ #endif #ifdef f77IsF2C #define _F2C_CALL_ _F2C_F77ISF2C #endif #ifndef _F2C_CALL_ #define _F2C_CALL_ _F2C_ADD_ #endif /* * ======================================================================== * TYPE DEFINITIONS AND CONVERSION UTILITIES * ======================================================================== */ typedef struct { float re, im; } complex; typedef struct { double re, im; } complex16; #if( _MACH_ == _T3D_ ) #define float double /* Type of character argument in a FORTRAN call */ #define F_CHAR _fcd /* Character conversion utilities */ #define F2C_CHAR(a) ( _fcdtocp( (a) ) ) #define C2F_CHAR(a) ( _cptofcd( (a), 1 ) ) /* Type of FORTRAN functions */ #define F_VOID_FCT void fortran /* Subroutine */ #define F_INTG_FCT int fortran /* INTEGER function */ #define F_DBLE_FCT double fortran /* DOUBLE PRECISION function */ #else /* Type of character argument in a FORTRAN call */ typedef char * F_CHAR; /* Character conversion utilities */ #define F2C_CHAR(a) (a) #define C2F_CHAR(a) (a) /* Type of FORTRAN functions */ #define F_VOID_FCT void /* Subroutine */ #define F_INTG_FCT int /* INTEGER function */ #define F_DBLE_FCT double /* DOUBLE PRECISION function */ #endif /* * ======================================================================== * #DEFINE MACRO CONSTANTS * ======================================================================== */ #define DLEN_ 9 /* Length of a descriptor */ #define DT_ 0 /* Descriptor Type */ #define CTXT_ 1 /* BLACS context */ #define M_ 2 /* Global Number of Rows */ #define N_ 3 /* Global Number of Columns */ #define MB_ 4 /* Row Blocking Size */ #define NB_ 5 /* Column Blocking Size */ #define RSRC_ 6 /* Starting Processor Row */ #define CSRC_ 7 /* Starting Processor Column */ #define LLD_ 8 /* Local Leading Dimension */ /* * Descriptor types */ #define BLOCK_CYCLIC_2D 1 #define BLOCK_CYCLIC_INB_2D 2 #define BROADCAST "B" /* Blacs operation definitions */ #define COMBINE "C" #define ALL "A" /* Scope definitions */ #define COLUMN "C" #define ROW "R" #define TOPDEF " " /* Default BLACS topology, PB-BLAS routines */ #define CTOPDEF ' ' #define TOPGET "!" #define YES "Y" #define NO "N" #define MULLENFAC 2 #define ONE 1.0 #define ZERO 0.0 /* * ======================================================================== * PREPROCESSOR MACRO FUNCTIONS USED FOR OPTIMIZATION & CONVENIENCE * ======================================================================== */ #define ABS(a) (((a) < 0) ? -(a) : (a)) #define MIN(a,b) (((a) < (b)) ? (a) : (b)) #define MAX(a,b) (((a) > (b)) ? (a) : (b)) #define CEIL(a,b) ( ((a)+(b)-1) / (b) ) #define Mlowcase(C) ( ((C) > 64 && (C) < 91) ? (C) | 32 : (C) ) #define Mupcase(C) ( ((C) > 96 && (C) < 123) ? (C) & 0xDF : (C) ) #define INDXG2L( iglob, nb, iproc, isrcproc, nprocs )\ ( (nb) * ( ( (iglob)-1) / ( (nb) * (nprocs) ) ) +\ ( ( (iglob) - 1 ) % (nb) ) + 1 ) #define INDXL2G( iloc, nb, iproc, isrcproc, nprocs )\ ( (nprocs) * (nb) * ( ( (iloc) - 1 ) / (nb) ) +\ ( ( (iloc) - 1 ) % (nb) ) +\ ( ( (nprocs) + (iproc) - (isrcproc) ) % (nprocs) ) * (nb) + 1 ) #define INDXG2P( iglob, nb, iproc, isrcproc, nprocs ) \ ( ( (isrcproc) + ( (iglob) - 1 ) / (nb) ) % (nprocs) ) #define MYROC0( nblocks, n, nb, nprocs )\ ( ( (nblocks) % (nprocs) ) ? ( ( (nblocks) / (nprocs) ) * (nb) + (nb) )\ : ( ( (nblocks) / (nprocs) )* (nb) + ( (n) % (nb) ) ) ) #if( _F2C_CALL_ == _F2C_ADD_ ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in). * No redefinition necessary to have following FORTRAN to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void pdgemm_(...) * * This is the default. */ #endif #if( _F2C_CALL_ == _F2C_UPCASE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in) * following FORTRAN to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void PDGEMM(...) */ /* TOOLS */ #define ilcm_ ILCM #define infog2l_ INFOG2L #define numroc_ NUMROC #define pstreecomb_ PSTREECOMB #define pdtreecomb_ PDTREECOMB #define pctreecomb_ PCTREECOMB #define pztreecomb_ PZTREECOMB #define scombamax_ SCOMBAMAX #define dcombamax_ DCOMBAMAX #define ccombamax_ CCOMBAMAX #define zcombamax_ ZCOMBAMAX #define scombnrm2_ SCOMBNRM2 #define dcombnrm2_ DCOMBNRM2 #define dlamov_ DLAMOV #define slamov_ SLAMOV #define clamov_ CLAMOV #define zlamov_ ZLAMOV #define dlacpy_ DLACPY #define slacpy_ SLACPY #define clacpy_ CLACPY #define zlacpy_ ZLACPY #define xerbla_ XERBLA /* BLACS */ #define blacs_abort_ BLACS_ABORT #define blacs_gridinfo_ BLACS_GRIDINFO #define igesd2d_ IGESD2D #define igebs2d_ IGEBS2D #define itrsd2d_ ITRSD2D #define itrbs2d_ ITRBS2D #define igerv2d_ IGERV2D #define igebr2d_ IGEBR2D #define itrrv2d_ ITRRV2D #define itrbr2d_ ITRBR2D #define igamx2d_ IGAMX2D #define igamn2d_ IGAMN2D #define igsum2d_ IGSUM2D #define sgesd2d_ SGESD2D #define sgebs2d_ SGEBS2D #define strsd2d_ STRSD2D #define strbs2d_ STRBS2D #define sgerv2d_ SGERV2D #define sgebr2d_ SGEBR2D #define strrv2d_ STRRV2D #define strbr2d_ STRBR2D #define sgamx2d_ SGAMX2D #define sgamn2d_ SGAMN2D #define sgsum2d_ SGSUM2D #define dgesd2d_ DGESD2D #define dgebs2d_ DGEBS2D #define dtrsd2d_ DTRSD2D #define dtrbs2d_ DTRBS2D #define dgerv2d_ DGERV2D #define dgebr2d_ DGEBR2D #define dtrrv2d_ DTRRV2D #define dtrbr2d_ DTRBR2D #define dgamx2d_ DGAMX2D #define dgamn2d_ DGAMN2D #define dgsum2d_ DGSUM2D #define cgesd2d_ CGESD2D #define cgebs2d_ CGEBS2D #define ctrsd2d_ CTRSD2D #define ctrbs2d_ CTRBS2D #define cgerv2d_ CGERV2D #define cgebr2d_ CGEBR2D #define ctrrv2d_ CTRRV2D #define ctrbr2d_ CTRBR2D #define cgamx2d_ CGAMX2D #define cgamn2d_ CGAMN2D #define cgsum2d_ CGSUM2D #define zgesd2d_ ZGESD2D #define zgebs2d_ ZGEBS2D #define ztrsd2d_ ZTRSD2D #define ztrbs2d_ ZTRBS2D #define zgerv2d_ ZGERV2D #define zgebr2d_ ZGEBR2D #define ztrrv2d_ ZTRRV2D #define ztrbr2d_ ZTRBR2D #define zgamx2d_ ZGAMX2D #define zgamn2d_ ZGAMN2D #define zgsum2d_ ZGSUM2D /* Level-1 BLAS */ #define srotg_ SROTG #define srotmg_ SROTMG #define srot_ SROT #define srotm_ SROTM #define sswap_ SSWAP #define sscal_ SSCAL #define scopy_ SCOPY #define saxpy_ SAXPY #define ssdot_ SSDOT #define isamax_ ISAMAX #define drotg_ DROTG #define drotmg_ DROTMG #define drot_ DROT #define drotm_ DROTM #define dswap_ DSWAP #define dscal_ DSCAL #define dcopy_ DCOPY #define daxpy_ DAXPY #define dddot_ DDDOT #define dnrm2_ DNRM2 #define dsnrm2_ DSNRM2 #define dasum_ DASUM #define dsasum_ DSASUM #define idamax_ IDAMAX #define cswap_ CSWAP #define cscal_ CSCAL #define csscal_ CSSCAL #define ccopy_ CCOPY #define caxpy_ CAXPY #define ccdotu_ CCDOTU #define ccdotc_ CCDOTC #define icamax_ ICAMAX #define zswap_ ZSWAP #define zscal_ ZSCAL #define zdscal_ ZDSCAL #define zcopy_ ZCOPY #define zaxpy_ ZAXPY #define zzdotu_ ZZDOTU #define zzdotc_ ZZDOTC #define dscnrm2_ DSCNRM2 #define dznrm2_ DZNRM2 #define dscasum_ DSCASUM #define dzasum_ DZASUM #define izamax_ IZAMAX /* Level-2 BLAS */ #define sgemv_ SGEMV #define ssymv_ SSYMV #define strmv_ STRMV #define strsv_ STRSV #define sger_ SGER #define ssyr_ SSYR #define ssyr2_ SSYR2 #define dgemv_ DGEMV #define dsymv_ DSYMV #define dtrmv_ DTRMV #define dtrsv_ DTRSV #define dger_ DGER #define dsyr_ DSYR #define dsyr2_ DSYR2 #define cgemv_ CGEMV #define chemv_ CHEMV #define ctrmv_ CTRMV #define ctrsv_ CTRSV #define cgeru_ CGERU #define cgerc_ CGERC #define cher_ CHER #define cher2_ CHER2 #define zgemv_ ZGEMV #define zhemv_ ZHEMV #define ztrmv_ ZTRMV #define ztrsv_ ZTRSV #define zgeru_ ZGERU #define zgerc_ ZGERC #define zher_ ZHER #define zher2_ ZHER2 /* Level-3 BLAS */ #define sgemm_ SGEMM #define ssymm_ SSYMM #define ssyrk_ SSYRK #define ssyr2k_ SSYR2K #define strmm_ STRMM #define strsm_ STRSM #define dgemm_ DGEMM #define dsymm_ DSYMM #define dsyrk_ DSYRK #define dsyr2k_ DSYR2K #define dtrmm_ DTRMM #define dtrsm_ DTRSM #define cgemm_ CGEMM #define chemm_ CHEMM #define csymm_ CSYMM #define csyrk_ CSYRK #define cherk_ CHERK #define csyr2k_ CSYR2K #define cher2k_ CHER2K #define ctrmm_ CTRMM #define ctrsm_ CTRSM #define zgemm_ ZGEMM #define zhemm_ ZHEMM #define zsymm_ ZSYMM #define zsyrk_ ZSYRK #define zherk_ ZHERK #define zsyr2k_ ZSYR2K #define zher2k_ ZHER2K #define ztrmm_ ZTRMM #define ztrsm_ ZTRSM /* absolute value auxiliary PBLAS */ #define psatrmv_ PSATRMV #define pdatrmv_ PDATRMV #define pcatrmv_ PCATRMV #define pzatrmv_ PZATRMV #define psagemv_ PSAGEMV #define pdagemv_ PDAGEMV #define pcagemv_ PCAGEMV #define pzagemv_ PZAGEMV #define psasymv_ PSASYMV #define pdasymv_ PDASYMV #define pcahemv_ PCAHEMV #define pzahemv_ PZAHEMV /* Auxiliary PB-BLAS */ #define pbcmatadd_ PBCMATADD #define pbdmatadd_ PBDMATADD #define pbsmatadd_ PBSMATADD #define pbzmatadd_ PBZMATADD /* Level-2 PBBLAS */ #define pbcgemv_ PBCGEMV #define pbcgeru_ PBCGERU #define pbcgerc_ PBCGERC #define pbchemv_ PBCHEMV #define pbcher_ PBCHER #define pbcher2_ PBCHER2 #define pbctrmv_ PBCTRMV #define pbctrnv_ PBCTRNV #define pbctrsv_ PBCTRSV #define pbdgemv_ PBDGEMV #define pbdger_ PBDGER #define pbdsymv_ PBDSYMV #define pbdsyr_ PBDSYR #define pbdsyr2_ PBDSYR2 #define pbdtrmv_ PBDTRMV #define pbdtrnv_ PBDTRNV #define pbdtrsv_ PBDTRSV #define pbsgemv_ PBSGEMV #define pbsger_ PBSGER #define pbssymv_ PBSSYMV #define pbssyr_ PBSSYR #define pbssyr2_ PBSSYR2 #define pbstrmv_ PBSTRMV #define pbstrnv_ PBSTRNV #define pbstrsv_ PBSTRSV #define pbzgemv_ PBZGEMV #define pbzgeru_ PBZGERU #define pbzgerc_ PBZGERC #define pbzhemv_ PBZHEMV #define pbzher_ PBZHER #define pbzher2_ PBZHER2 #define pbztrmv_ PBZTRMV #define pbztrnv_ PBZTRNV #define pbztrsv_ PBZTRSV /* Level-3 PBBLAS */ #define pbcgemm_ PBCGEMM #define pbchemm_ PBCHEMM #define pbcher2k_ PBCHER2K #define pbcherk_ PBCHERK #define pbcsymm_ PBCSYMM #define pbcsyr2k_ PBCSYR2K #define pbcsyrk_ PBCSYRK #define pbctrmm_ PBCTRMM #define pbctrsm_ PBCTRSM #define pbctran_ PBCTRAN #define pbdgemm_ PBDGEMM #define pbdsymm_ PBDSYMM #define pbdsyr2k_ PBDSYR2K #define pbdsyrk_ PBDSYRK #define pbdtrmm_ PBDTRMM #define pbdtrsm_ PBDTRSM #define pbdtran_ PBDTRAN #define pbsgemm_ PBSGEMM #define pbssymm_ PBSSYMM #define pbssyr2k_ PBSSYR2K #define pbssyrk_ PBSSYRK #define pbstrmm_ PBSTRMM #define pbstrsm_ PBSTRSM #define pbstran_ PBSTRAN #define pbzgemm_ PBZGEMM #define pbzhemm_ PBZHEMM #define pbzher2k_ PBZHER2K #define pbzherk_ PBZHERK #define pbzsymm_ PBZSYMM #define pbzsyr2k_ PBZSYR2K #define pbzsyrk_ PBZSYRK #define pbztrmm_ PBZTRMM #define pbztrsm_ PBZTRSM #define pbztran_ PBZTRAN /* Auxilliary PBLAS */ #define pberror_ PBERROR #define pb_freebuf_ PB_FREEBUF #define pb_topget_ PB_TOPGET #define pb_topset_ PB_TOPSET /* Level-1 PBLAS */ #define psrotg_ PSROTG #define psrotmg_ PSROTMG #define psrot_ PSROT #define psrotm_ PSROTM #define psswap_ PSSWAP #define psscal_ PSSCAL #define pscopy_ PSCOPY #define psaxpy_ PSAXPY #define psdot_ PSDOT #define psnrm2_ PSNRM2 #define psasum_ PSASUM #define psamax_ PSAMAX #define pdrotg_ PDROTG #define pdrotmg_ PDROTMG #define pdrot_ PDROT #define pdrotm_ PDROTM #define pdswap_ PDSWAP #define pdscal_ PDSCAL #define pdcopy_ PDCOPY #define pdaxpy_ PDAXPY #define pddot_ PDDOT #define pdnrm2_ PDNRM2 #define pdasum_ PDASUM #define pdamax_ PDAMAX #define pcswap_ PCSWAP #define pcscal_ PCSCAL #define pcsscal_ PCSSCAL #define pccopy_ PCCOPY #define pcaxpy_ PCAXPY #define pcdotu_ PCDOTU #define pcdotc_ PCDOTC #define pscnrm2_ PSCNRM2 #define pscasum_ PSCASUM #define pcamax_ PCAMAX #define pcrot_ PCROT #define crot_ CROT #define pzswap_ PZSWAP #define pzscal_ PZSCAL #define pzdscal_ PZDSCAL #define pzcopy_ PZCOPY #define pzaxpy_ PZAXPY #define pzdotu_ PZDOTU #define pzdotc_ PZDOTC #define pdznrm2_ PDZNRM2 #define pdzasum_ PDZASUM #define pzamax_ PZAMAX #define pzrot_ PZROT #define zrot_ ZROT /* Level-2 PBLAS */ #define pcgemv_ PCGEMV #define pcgeru_ PCGERU #define pcgerc_ PCGERC #define pchemv_ PCHEMV #define pcher_ PCHER #define pcher2_ PCHER2 #define pctrmv_ PCTRMV #define pctrsv_ PCTRSV #define pdgemv_ PDGEMV #define pdger_ PDGER #define pdsymv_ PDSYMV #define pdsyr_ PDSYR #define pdsyr2_ PDSYR2 #define pdtrmv_ PDTRMV #define pdtrsv_ PDTRSV #define psgemv_ PSGEMV #define psger_ PSGER #define pssymv_ PSSYMV #define pssyr_ PSSYR #define pssyr2_ PSSYR2 #define pstrmv_ PSTRMV #define pstrsv_ PSTRSV #define pzgemv_ PZGEMV #define pzgeru_ PZGERU #define pzgerc_ PZGERC #define pzhemv_ PZHEMV #define pzher_ PZHER #define pzher2_ PZHER2 #define pztrmv_ PZTRMV #define pztrsv_ PZTRSV /* Level-3 PBLAS */ #define pcgemm_ PCGEMM #define pchemm_ PCHEMM #define pcher2k_ PCHER2K #define pcherk_ PCHERK #define pcsymm_ PCSYMM #define pcsyr2k_ PCSYR2K #define pcsyrk_ PCSYRK #define pctrmm_ PCTRMM #define pctrsm_ PCTRSM #define pctranu_ PCTRANU #define pctranc_ PCTRANC #define pdgemm_ PDGEMM #define pdsymm_ PDSYMM #define pdsyr2k_ PDSYR2K #define pdsyrk_ PDSYRK #define pdtrmm_ PDTRMM #define pdtrsm_ PDTRSM #define pdtran_ PDTRAN #define psgemm_ PSGEMM #define pssymm_ PSSYMM #define pssyr2k_ PSSYR2K #define pssyrk_ PSSYRK #define pstrmm_ PSTRMM #define pstrsm_ PSTRSM #define pstran_ PSTRAN #define pzgemm_ PZGEMM #define pzhemm_ PZHEMM #define pzher2k_ PZHER2K #define pzherk_ PZHERK #define pzsymm_ PZSYMM #define pzsyr2k_ PZSYR2K #define pzsyrk_ PZSYRK #define pztrmm_ PZTRMM #define pztrsm_ PZTRSM #define pztranu_ PZTRANU #define pztranc_ PZTRANC #endif #if( _F2C_CALL_ == _F2C_NOCHANGE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in) * for following FORTRAN to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void pdgemm(...) */ /* TOOLS */ #define ilcm_ ilcm #define infog2l_ infog2l #define numroc_ numroc #define pstreecomb_ pstreecomb #define pdtreecomb_ pdtreecomb #define pctreecomb_ pctreecomb #define pztreecomb_ pztreecomb #define scombamax_ scombamax #define dcombamax_ dcombamax #define ccombamax_ ccombamax #define zcombamax_ zcombamax #define scombnrm2_ scombnrm2 #define dcombnrm2_ dcombnrm2 #define dlamov_ dlamov #define slamov_ slamov #define clamov_ clamov #define zlamov_ zlamov #define dlacpy_ dlacpy #define slacpy_ slacpy #define clacpy_ clacpy #define zlacpy_ zlacpy #define xerbla_ xerbla /* BLACS */ #define blacs_abort_ blacs_abort #define blacs_gridinfo_ blacs_gridinfo #define igesd2d_ igesd2d #define igebs2d_ igebs2d #define itrsd2d_ itrsd2d #define itrbs2d_ itrbs2d #define igerv2d_ igerv2d #define igebr2d_ igebr2d #define itrrv2d_ itrrv2d #define itrbr2d_ itrbr2d #define igamx2d_ igamx2d #define igamn2d_ igamn2d #define igsum2d_ igsum2d #define sgesd2d_ sgesd2d #define sgebs2d_ sgebs2d #define strsd2d_ strsd2d #define strbs2d_ strbs2d #define sgerv2d_ sgerv2d #define sgebr2d_ sgebr2d #define strrv2d_ strrv2d #define strbr2d_ strbr2d #define sgamx2d_ sgamx2d #define sgamn2d_ sgamn2d #define sgsum2d_ sgsum2d #define dgesd2d_ dgesd2d #define dgebs2d_ dgebs2d #define dtrsd2d_ dtrsd2d #define dtrbs2d_ dtrbs2d #define dgerv2d_ dgerv2d #define dgebr2d_ dgebr2d #define dtrrv2d_ dtrrv2d #define dtrbr2d_ dtrbr2d #define dgamx2d_ dgamx2d #define dgamn2d_ dgamn2d #define dgsum2d_ dgsum2d #define cgesd2d_ cgesd2d #define cgebs2d_ cgebs2d #define ctrsd2d_ ctrsd2d #define ctrbs2d_ ctrbs2d #define cgerv2d_ cgerv2d #define cgebr2d_ cgebr2d #define ctrrv2d_ ctrrv2d #define ctrbr2d_ ctrbr2d #define cgamx2d_ cgamx2d #define cgamn2d_ cgamn2d #define cgsum2d_ cgsum2d #define zgesd2d_ zgesd2d #define zgebs2d_ zgebs2d #define ztrsd2d_ ztrsd2d #define ztrbs2d_ ztrbs2d #define zgerv2d_ zgerv2d #define zgebr2d_ zgebr2d #define ztrrv2d_ ztrrv2d #define ztrbr2d_ ztrbr2d #define zgamx2d_ zgamx2d #define zgamn2d_ zgamn2d #define zgsum2d_ zgsum2d /* Level-1 BLAS */ #define srotg_ srotg #define srotmg_ srotmg #define srot_ srot #define srotm_ srotm #define sswap_ sswap #define sscal_ sscal #define scopy_ scopy #define saxpy_ saxpy #define ssdot_ ssdot #define isamax_ isamax #define drotg_ drotg #define drotmg_ drotmg #define drot_ drot #define drotm_ drotm #define dswap_ dswap #define dscal_ dscal #define dcopy_ dcopy #define daxpy_ daxpy #define dddot_ dddot #define dnrm2_ dnrm2 #define dsnrm2_ dsnrm2 #define dasum_ dasum #define dsasum_ dsasum #define idamax_ idamax #define cswap_ cswap #define cscal_ cscal #define csscal_ csscal #define ccopy_ ccopy #define caxpy_ caxpy #define ccdotu_ ccdotu #define ccdotc_ ccdotc #define icamax_ icamax #define zswap_ zswap #define zscal_ zscal #define zdscal_ zdscal #define zcopy_ zcopy #define zaxpy_ zaxpy #define zzdotu_ zzdotu #define zzdotc_ zzdotc #define dscnrm2_ dscnrm2 #define dznrm2_ dznrm2 #define dscasum_ dscasum #define dzasum_ dzasum #define izamax_ izamax /* Level-2 BLAS */ #define sgemv_ sgemv #define ssymv_ ssymv #define strmv_ strmv #define strsv_ strsv #define sger_ sger #define ssyr_ ssyr #define ssyr2_ ssyr2 #define dgemv_ dgemv #define dsymv_ dsymv #define dtrmv_ dtrmv #define dtrsv_ dtrsv #define dger_ dger #define dsyr_ dsyr #define dsyr2_ dsyr2 #define cgemv_ cgemv #define chemv_ chemv #define ctrmv_ ctrmv #define ctrsv_ ctrsv #define cgeru_ cgeru #define cgerc_ cgerc #define cher_ cher #define cher2_ cher2 #define zgemv_ zgemv #define zhemv_ zhemv #define ztrmv_ ztrmv #define ztrsv_ ztrsv #define zgeru_ zgeru #define zgerc_ zgerc #define zher_ zher #define zher2_ zher2 /* Level-3 BLAS */ #define sgemm_ sgemm #define ssymm_ ssymm #define ssyrk_ ssyrk #define ssyr2k_ ssyr2k #define strmm_ strmm #define strsm_ strsm #define dgemm_ dgemm #define dsymm_ dsymm #define dsyrk_ dsyrk #define dsyr2k_ dsyr2k #define dtrmm_ dtrmm #define dtrsm_ dtrsm #define cgemm_ cgemm #define chemm_ chemm #define csymm_ csymm #define csyrk_ csyrk #define cherk_ cherk #define csyr2k_ csyr2k #define cher2k_ cher2k #define ctrmm_ ctrmm #define ctrsm_ ctrsm #define zgemm_ zgemm #define zhemm_ zhemm #define zsymm_ zsymm #define zsyrk_ zsyrk #define zherk_ zherk #define zsyr2k_ zsyr2k #define zher2k_ zher2k #define ztrmm_ ztrmm #define ztrsm_ ztrsm /* absolute value auxiliary PBLAS */ #define psatrmv_ psatrmv #define pdatrmv_ pdatrmv #define pcatrmv_ pcatrmv #define pzatrmv_ pzatrmv #define psagemv_ psagemv #define pdagemv_ pdagemv #define pcagemv_ pcagemv #define pzagemv_ pzagemv #define psasymv_ psasymv #define pdasymv_ pdasymv #define pcahemv_ pcahemv #define pzahemv_ pzahemv /* Auxiliary PB-BLAS */ #define pbcmatadd_ pbcmatadd #define pbdmatadd_ pbdmatadd #define pbsmatadd_ pbsmatadd #define pbzmatadd_ pbzmatadd /* Level-2 PBBLAS */ #define pbcgemv_ pbcgemv #define pbcgeru_ pbcgeru #define pbcgerc_ pbcgerc #define pbchemv_ pbchemv #define pbcher_ pbcher #define pbcher2_ pbcher2 #define pbctrmv_ pbctrmv #define pbctrnv_ pbctrnv #define pbctrsv_ pbctrsv #define pbdgemv_ pbdgemv #define pbdger_ pbdger #define pbdsymv_ pbdsymv #define pbdsyr_ pbdsyr #define pbdsyr2_ pbdsyr2 #define pbdtrmv_ pbdtrmv #define pbdtrnv_ pbdtrnv #define pbdtrsv_ pbdtrsv #define pbsgemv_ pbsgemv #define pbsger_ pbsger #define pbssymv_ pbssymv #define pbssyr_ pbssyr #define pbssyr2_ pbssyr2 #define pbstrmv_ pbstrmv #define pbstrnv_ pbstrnv #define pbstrsv_ pbstrsv #define pbzgemv_ pbzgemv #define pbzgeru_ pbzgeru #define pbzgerc_ pbzgerc #define pbzhemv_ pbzhemv #define pbzher_ pbzher #define pbzher2_ pbzher2 #define pbztrmv_ pbztrmv #define pbztrnv_ pbztrnv #define pbztrsv_ pbztrsv /* Level-3 PBBLAS */ #define pbcgemm_ pbcgemm #define pbchemm_ pbchemm #define pbcher2k_ pbcher2k #define pbcherk_ pbcherk #define pbcsymm_ pbcsymm #define pbcsyr2k_ pbcsyr2k #define pbcsyrk_ pbcsyrk #define pbctrmm_ pbctrmm #define pbctrsm_ pbctrsm #define pbctran_ pbctran #define pbdgemm_ pbdgemm #define pbdsymm_ pbdsymm #define pbdsyr2k_ pbdsyr2k #define pbdsyrk_ pbdsyrk #define pbdtrmm_ pbdtrmm #define pbdtrsm_ pbdtrsm #define pbdtran_ pbdtran #define pbsgemm_ pbsgemm #define pbssymm_ pbssymm #define pbssyr2k_ pbssyr2k #define pbssyrk_ pbssyrk #define pbstrmm_ pbstrmm #define pbstrsm_ pbstrsm #define pbstran_ pbstran #define pbzgemm_ pbzgemm #define pbzhemm_ pbzhemm #define pbzher2k_ pbzher2k #define pbzherk_ pbzherk #define pbzsymm_ pbzsymm #define pbzsyr2k_ pbzsyr2k #define pbzsyrk_ pbzsyrk #define pbztrmm_ pbztrmm #define pbztrsm_ pbztrsm #define pbztran_ pbztran /* Auxilliary PBLAS */ #define pberror_ pberror #define pb_freebuf_ pb_freebuf #define pb_topget_ pb_topget #define pb_topset_ pb_topset /* Level-1 PBLAS */ #define psrotg_ psrotg #define psrotmg_ psrotmg #define psrot_ psrot #define psrotm_ psrotm #define psswap_ psswap #define psscal_ psscal #define pscopy_ pscopy #define psaxpy_ psaxpy #define psdot_ psdot #define psnrm2_ psnrm2 #define psasum_ psasum #define psamax_ psamax #define pdrotg_ pdrotg #define pdrotmg_ pdrotmg #define pdrot_ pdrot #define pdrotm_ pdrotm #define pdswap_ pdswap #define pdscal_ pdscal #define pdcopy_ pdcopy #define pdaxpy_ pdaxpy #define pddot_ pddot #define pdnrm2_ pdnrm2 #define pdasum_ pdasum #define pdamax_ pdamax #define pcswap_ pcswap #define pcscal_ pcscal #define pcsscal_ pcsscal #define pccopy_ pccopy #define pcaxpy_ pcaxpy #define pcdotu_ pcdotu #define pcdotc_ pcdotc #define pscnrm2_ pscnrm2 #define pscasum_ pscasum #define pcamax_ pcamax #define pcrot_ pcrot #define crot_ crot #define pzswap_ pzswap #define pzscal_ pzscal #define pzdscal_ pzdscal #define pzcopy_ pzcopy #define pzaxpy_ pzaxpy #define pzdotu_ pzdotu #define pzdotc_ pzdotc #define pdznrm2_ pdznrm2 #define pdzasum_ pdzasum #define pzamax_ pzamax #define pzrot_ pzrot #define zrot_ zrot /* Level-2 PBLAS */ #define pcgemv_ pcgemv #define pcgeru_ pcgeru #define pcgerc_ pcgerc #define pchemv_ pchemv #define pcher_ pcher #define pcher2_ pcher2 #define pctrmv_ pctrmv #define pctrsv_ pctrsv #define pdgemv_ pdgemv #define pdger_ pdger #define pdsymv_ pdsymv #define pdsyr_ pdsyr #define pdsyr2_ pdsyr2 #define pdtrmv_ pdtrmv #define pdtrsv_ pdtrsv #define psgemv_ psgemv #define psger_ psger #define pssymv_ pssymv #define pssyr_ pssyr #define pssyr2_ pssyr2 #define pstrmv_ pstrmv #define pstrsv_ pstrsv #define pzgemv_ pzgemv #define pzgeru_ pzgeru #define pzgerc_ pzgerc #define pzhemv_ pzhemv #define pzher_ pzher #define pzher2_ pzher2 #define pztrmv_ pztrmv #define pztrsv_ pztrsv /* Level-3 PBLAS */ #define pcgemm_ pcgemm #define pchemm_ pchemm #define pcher2k_ pcher2k #define pcherk_ pcherk #define pcsymm_ pcsymm #define pcsyr2k_ pcsyr2k #define pcsyrk_ pcsyrk #define pctrmm_ pctrmm #define pctrsm_ pctrsm #define pctranu_ pctranu #define pctranc_ pctranc #define pdgemm_ pdgemm #define pdsymm_ pdsymm #define pdsyr2k_ pdsyr2k #define pdsyrk_ pdsyrk #define pdtrmm_ pdtrmm #define pdtrsm_ pdtrsm #define pdtran_ pdtran #define psgemm_ psgemm #define pssymm_ pssymm #define pssyr2k_ pssyr2k #define pssyrk_ pssyrk #define pstrmm_ pstrmm #define pstrsm_ pstrsm #define pstran_ pstran #define pzgemm_ pzgemm #define pzhemm_ pzhemm #define pzher2k_ pzher2k #define pzherk_ pzherk #define pzsymm_ pzsymm #define pzsyr2k_ pzsyr2k #define pzsyrk_ pzsyrk #define pztrmm_ pztrmm #define pztrsm_ pztrsm #define pztranu_ pztranu #define pztranc_ pztranc #endif #if( _F2C_CALL_ == _F2C_F77ISF2C ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in) * for systems where the fortran "compiler" is actually f2c (a Fortran * to C conversion utility). */ /* * Initialization routines */ #define blacs_pinfo_ blacs_pinfo__ #define blacs_setup_ blacs_setup__ #define blacs_set_ blacs_set__ #define blacs_get_ blacs_get__ #define blacs_gridinit_ blacs_gridinit__ #define blacs_gridmap_ blacs_gridmap__ /* * Destruction routines */ #define blacs_freebuff_ blacs_freebuff__ #define blacs_gridexit_ blacs_gridexit__ #define blacs_abort_ blacs_abort__ #define blacs_exit_ blacs_exit__ /* * Informational & misc. */ #define blacs_gridinfo_ blacs_gridinfo__ #define blacs_pnum_ blacs_pnum__ #define blacs_pcoord_ blacs_pcoord__ #define blacs_barrier_ blacs_barrier__ #endif scalapack-2.0.2/SRC/pcdbsv.f000644 000766 000024 00000045353 10363532303 016000 0ustar00juliestaff000000 000000 SUBROUTINE PCDBSV( N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER BWL, BWU, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PCDBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * banded diagonally dominant-like distributed * matrix with bandwidth BWL, BWU. * * Gaussian elimination without pivoting * is used to factor a reordering * of the matrix into L U. * * See PCDBTRF and PCDBTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * +max((max(bwl,bwu)*NRHS), max(bwl,bwu)*max(bwl,bwu)) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PCDBTRF, PCDBTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PCDBTRF and PCDBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PCDBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * * Factor the matrix * CALL PCDBTRF( N, BWL, BWU, A, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PCDBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PCDBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCDBSV', -INFO ) RETURN END IF * RETURN * * End of PCDBSV * END scalapack-2.0.2/SRC/pcdbtrf.f000644 000766 000024 00000126077 11750130340 016142 0ustar00juliestaff000000 000000 SUBROUTINE PCDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), AF( * ), WORK( * ) * .. * * * Purpose * ======= * * PCDBTRF computes a LU factorization * of an N-by-N complex banded * diagonally dominant-like distributed matrix * with bandwidth BWL, BWU: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PCDBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = L U * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCDBTRF and this is stored in AF. If a linear system * is to be solved using PCDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * max(bwl,bwu)*max(bwl,bwu) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN, $ LEVEL_DIST, LLDA, MAX_BW, MBW2, MYCOL, MYROW, $ MY_NUM_COLS, NB, NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, OFST, PART_OFFSET, PART_SIZE, $ PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE, $ STORE_N_A, UP_PREV_TRI_SIZE_M, $ UP_PREV_TRI_SIZE_N, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ CAXPY, CGEMM, CGERV2D, CGESD2D, CLAMOV, $ CLATCPY, CPBTRF, CPOTRF, CSYRK, CTBTRS, CTRMM, $ CTRRV2D, CTRSD2D, CTRSM, CTRTRS, DESC_CONVERT, $ GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) MBW2 = MAX_BW * MAX_BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LWORK .LT. -1) THEN INFO = -10 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -1 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -2 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 6*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, $ 'PCDBTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PCDBTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * IF( LAF .LT. LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PCDBTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = MAX(BWL,BWU)*MAX(BWL,BWU) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, $ 'PCDBTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BWU PARAM_CHECK( 3, 1 ) = BWL PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 10 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCDBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * Offset to workspace for Upper triangular factor * WORK_U = BWU*ODD_SIZE + 3*MBW2 * * * Zero out space for fillin * DO 10 I=1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * * Zero out space for work * DO 20 I=1, WORK_SIZE_MIN WORK( I ) = CZERO 20 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN PREV_TRI_SIZE_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) PREV_TRI_SIZE_N=MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .GT. 0 ) THEN UP_PREV_TRI_SIZE_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) UP_PREV_TRI_SIZE_N=MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN NEXT_TRI_SIZE_M=MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) NEXT_TRI_SIZE_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF ( MYCOL .LT. NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL CTRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, $ A( OFST+(MY_NUM_COLS-BWL)*LLDA+(BWL+BWU+1) ), $ LLDA-1, 0, MYCOL+1 ) * ENDIF * * * Factor main partition A_i = L_i {U_i} in each processor * CALL CDBTRF( ODD_SIZE, ODD_SIZE, BWL, BWU, A( OFST + 1), $ LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1500 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * * Apply factorization to lower connection block BL_i * conjugate transpose the connection block in preparation. * Apply factorization to upper connection block BU_i * Move the connection block in preparation. * CALL CLATCPY( 'U', BWL, BWL, $ A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )), $ LLDA-1, AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), $ MAX_BW ) CALL CLAMOV( 'L', BWU, BWU, A( ( OFST+1+ODD_SIZE*LLDA ) ), $ LLDA-1, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ), $ MAX_BW ) * * Perform the triangular system solve {L_i}{{BU'}_i} = {B_i} * CALL CTBTRS( 'L', 'N', 'U', BWU, BWL, BWU, $ A( OFST+BWU+1+(ODD_SIZE-BWU )*LLDA ), LLDA, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ), $ MAX_BW, INFO ) * * Perform the triangular solve {U_i}^C{BL'}_i^C = {BL_i}^C * CALL CTBTRS( 'U', 'C', 'N', BWL, BWU, BWL, $ A( OFST+1+(ODD_SIZE-BWL)*LLDA ), LLDA, $ AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW, $ INFO ) * * conjugate transpose resulting block to its location * in main storage. * CALL CLATCPY( 'L', BWL, BWL, $ AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW, $ A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )), $ LLDA-1 ) * * Move the resulting block back to its location in main storage. * CALL CLAMOV( 'L', BWU, BWU, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ), $ MAX_BW, A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i} * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL CGEMM( 'C', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE , $ AF( ODD_SIZE*BWU+2*MBW2+1), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1), MAX_BW, CONE, $ A( OFST+ODD_SIZE*LLDA+1+BWU ), LLDA-1 ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1500 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for * off_diagonal block of reduced system. * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL CTRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M, $ PREV_TRI_SIZE_N, AF( WORK_U+1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF (INFO.EQ.0) THEN * * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ . * CALL CTBTRS( 'L', 'N', 'U', ODD_SIZE, BWL, BWL, $ A( OFST + BWU+1 ), LLDA, AF( WORK_U+1 ), $ ODD_SIZE, INFO ) * * * Calculate the "spike" fillin, ${U_i}^C {{GL}_i}^C = {DU_i}^C$ * * * Copy D block into AF storage for solve. * CALL CLATCPY( 'L', UP_PREV_TRI_SIZE_N, UP_PREV_TRI_SIZE_M, $ A( OFST+1 ), LLDA-1, AF( 1 ), ODD_SIZE ) * CALL CTBTRS( 'U', 'C', 'N', ODD_SIZE, BWU, BWU, $ A( OFST + 1 ), LLDA, $ AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = GL_i{GU_i} * * * Zero out space in case result is smaller than storage block * DO 30 I=1, MBW2 AF( ODD_SIZE*BWU+2*MBW2+I ) = CZERO 30 CONTINUE * CALL CGEMM( 'C', 'N', BWU, BWL, ODD_SIZE, $ -CONE, AF( 1 ), ODD_SIZE, $ AF( WORK_U+1 ), ODD_SIZE, CZERO, $ AF( 1+MAX(0,BWL-BWU)+ODD_SIZE*BWU+ $ (2*MAX_BW+MAX(0,BWU-BWL))*MAX_BW), $ MAX_BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0, $ MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * * Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage * as per requirements of BLAS routine CTRMM. * Since we have GU_i stored, * conjugate transpose HU_i to HU_i^C. * CALL CLATCPY( 'N', BWL, BWL, $ AF( WORK_U+ODD_SIZE-BWL+1 ), ODD_SIZE, $ AF( (ODD_SIZE)*BWU+1+(MAX_BW-BWL) ), $ MAX_BW ) * CALL CTRMM( 'R', 'U', 'C', 'N', BWL, BWL, -CONE, $ A( ( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA ) ), $ LLDA-1, AF( (ODD_SIZE)*BWU+1+(MAX_BW-BWL) ), $ MAX_BW ) * * * Copy matrix HL_i (the last bwu rows of GL_i^C) to AFU store * as per requirements of BLAS routine CTRMM. * Since we have GL_i^C stored, * conjugate transpose HL_i^C to HL_i. * CALL CLATCPY( 'N', BWU, BWU, $ AF( ODD_SIZE-BWU+1 ), ODD_SIZE, $ AF( WORK_U+(ODD_SIZE)*BWL+1+MAX_BW-BWU ), $ MAX_BW ) * CALL CTRMM( 'R', 'L', 'N', 'N', BWU, BWU, -CONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ AF( WORK_U+(ODD_SIZE)*BWL+1+MAX_BW-BWU ), $ MAX_BW ) * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+1 ), $ MAX_BW, 0, MYCOL-1 ) * CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), $ MAX_BW, 0, MYCOL-1 ) * ENDIF * * Copy last diagonal block into AF storage for subsequent * operations. * CALL CLAMOV( 'N', MAX_BW, MAX_BW, $ A( OFST+ODD_SIZE*LLDA+BWU+1 ), $ LLDA-1, AF( ODD_SIZE*BWU+MBW2+1 ), $ MAX_BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ MAX_BW, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * CALL CAXPY( MBW2, CONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ 1, AF( ODD_SIZE*BWU+MBW2+1 ), 1 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL CGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * CALL CAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BWU+MBW2+1 ), 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL CGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * CALL CAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BWU+MBW2+1 ), 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL CDBTRF( MAX_BW, MAX_BW, MIN( MAX_BW-1, BWL ), $ MIN( MAX_BW-1, BWU ), AF( ODD_SIZE*BWU+MBW2+1 $ -( MIN( MAX_BW-1, BWU ))), MAX_BW+1, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL CLAMOV( 'N', MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ), $ MAX_BW, AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), $ MAX_BW ) * CALL CLAMOV( 'N', MAX_BW, MAX_BW, AF( WORK_U+ODD_SIZE*BWL+1 ), $ MAX_BW, AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL CGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+1 ), $ MAX_BW, 0, COMM_PROC ) * CALL CGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), $ MAX_BW, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL CTBTRS( $ 'L', 'N', 'U', BWU, MIN( BWL, BWU-1 ), BWU, $ AF( ODD_SIZE*BWU+ $ MBW2+1+(MAX_BW+1)*(MAX_BW-BWU)), MAX_BW+1, $ AF( WORK_U+ODD_SIZE*BWL+1+MAX_BW-BWU), MAX_BW, INFO ) * * Modify lower off_diagonal block with diagonal block * * CALL CTBTRS( $ 'U', 'C', 'N', BWL, MIN( BWU, BWL-1 ), BWL, $ AF( ODD_SIZE*BWU+ $ MBW2+1-MIN( BWU, BWL-1 )+(MAX_BW+1)*(MAX_BW-BWL)), MAX_BW+1, $ AF( ODD_SIZE*BWU+1+MAX_BW-BWL), MAX_BW, INFO ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'C', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE, $ AF( (ODD_SIZE)*BWU+1 ), MAX_BW, $ AF( WORK_U+(ODD_SIZE)*BWL+1 ), MAX_BW, CZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL CGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST/2 ) * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL CGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * * * Since CTBTRS has no "left-right" option, we must transpose * CALL CLATCPY( 'N', MAX_BW, MAX_BW, AF( WORK_U+ODD_SIZE*BWL+ $ 2*MBW2+1), MAX_BW, WORK( 1 ), MAX_BW ) * CALL CTBTRS( $ 'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), BWL, $ AF( ODD_SIZE*BWU+MBW2+1), MAX_BW+1, $ WORK( 1+MAX_BW*(MAX_BW-BWL) ), MAX_BW, INFO ) * * Transpose back * CALL CLATCPY( $ 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+ $ 2*MBW2+1), MAX_BW ) * * * * Since CTBTRS has no "left-right" option, we must transpose * CALL CLATCPY( 'N', MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+ $ 2*MBW2+1), MAX_BW, WORK( 1 ), MAX_BW ) * CALL CTBTRS( $ 'U', 'C', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), BWU, $ AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, MAX_BW-1 )), MAX_BW+1, $ WORK( 1+MAX_BW*(MAX_BW-BWU) ), MAX_BW, INFO ) * * Transpose back * CALL CLATCPY( $ 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ AF( ODD_SIZE*BWU+ $ 2*MBW2+1), MAX_BW ) * * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL CGEMM( 'N', 'C', MAX_BW, MAX_BW, MAX_BW, -CONE, $ AF( (ODD_SIZE)*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+(ODD_SIZE)*BWL+2*MBW2+1 ), MAX_BW, $ CZERO, WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL CGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ AF( ODD_SIZE*BWU+1 ), MAX_BW, CZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to offdiagonal block's owning processor. * CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ 0, COMM_PROC ) * CALL CGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, CZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to offdiagonal block's owning processor. * CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * * 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PCDBTRF * END scalapack-2.0.2/SRC/pcdbtrs.f000644 000766 000024 00000064603 10363532303 016157 0ustar00juliestaff000000 000000 SUBROUTINE PCDBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 7, 2001 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PCDBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PCDBTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * banded diagonally dominant-like distributed * matrix with bandwidth BWL, BWU. * * Routine PCDBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCDBTRF and this is stored in AF. If a linear system * is to be solved using PCDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (max(bwl,bwu)*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NP_SAVE, PART_OFFSET, $ RETURN_CODE, STORE_M_B, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, $ PCDBTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -5 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PCDBTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PCDBTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ (MAX(BWL,BWU)*NRHS) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, $ 'PCDBTRS: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCDBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( TRANS, 'N' ) ) THEN * CALL PCDBTRSV( 'L', 'N', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PCDBTRSV( 'U', 'C', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ENDIF * * Call backsolve routine * IF( LSAME( TRANS, 'C' ) ) THEN * CALL PCDBTRSV( 'L', 'C', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PCDBTRSV( 'U', 'N', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ENDIF 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PCDBTRS * END scalapack-2.0.2/SRC/pcdbtrsv.f000644 000766 000024 00000145123 11750130340 016336 0ustar00juliestaff000000 000000 SUBROUTINE PCDBTRSV( UPLO, TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PCDBTRSV solves a banded triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a banded * triangular matrix factor produced by the * Gaussian elimination code PC@(dom_pre)BTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H * is dictated by the user by the parameter TRANS. * * Routine PCDBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCDBTRF and this is stored in AF. If a linear system * is to be solved using PCDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (max(bwl,bwu)*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MAX_BW, MBW2, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 18, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ CGEMM, CGERV2D, CGESD2D, CLAMOV, CMATADD, $ CTBTRS, CTRMM, CTRTRS, DESC_CONVERT, GLOBCHK, $ PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 9*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 12*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 12*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 12*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 12*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) MBW2 = MAX_BW * MAX_BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -2 END IF * IF( LWORK .LT. -1) THEN INFO = -16 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -3 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 9*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -4 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -5 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 9*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 9*100 + 4 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 12*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 12*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -6 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -8 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 9*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PCDBTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 9*100+4 ) CALL PXERBLA( ICTXT, $ 'PCDBTRSV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ MAX(BWL,BWU)*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, $ 'PCDBTRSV: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 18, 1 ) = DESCB(5) PARAM_CHECK( 17, 1 ) = DESCB(4) PARAM_CHECK( 16, 1 ) = DESCB(3) PARAM_CHECK( 15, 1 ) = DESCB(2) PARAM_CHECK( 14, 1 ) = DESCB(1) PARAM_CHECK( 13, 1 ) = IB PARAM_CHECK( 12, 1 ) = DESCA(5) PARAM_CHECK( 11, 1 ) = DESCA(4) PARAM_CHECK( 10, 1 ) = DESCA(3) PARAM_CHECK( 9, 1 ) = DESCA(1) PARAM_CHECK( 8, 1 ) = JA PARAM_CHECK( 7, 1 ) = NRHS PARAM_CHECK( 6, 1 ) = BWU PARAM_CHECK( 5, 1 ) = BWL PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 18, 2 ) = 1205 PARAM_CHECK( 17, 2 ) = 1204 PARAM_CHECK( 16, 2 ) = 1203 PARAM_CHECK( 15, 2 ) = 1202 PARAM_CHECK( 14, 2 ) = 1201 PARAM_CHECK( 13, 2 ) = 11 PARAM_CHECK( 12, 2 ) = 905 PARAM_CHECK( 11, 2 ) = 904 PARAM_CHECK( 10, 2 ) = 903 PARAM_CHECK( 9, 2 ) = 901 PARAM_CHECK( 8, 2 ) = 8 PARAM_CHECK( 7, 2 ) = 6 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 16 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 18, PARAM_CHECK, 18, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCDBTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * Offset to workspace for Upper triangular factor * WORK_U = BWU*ODD_SIZE + 3*MBW2 * * * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * IF ( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL CTBTRS( UPLO, 'N', 'U', ODD_SIZE, $ BWL, NRHS, $ A( OFST+1+BWU ), LLDA, $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL CLAMOV( 'N', BWL, NRHS, $ B( PART_OFFSET+ODD_SIZE-BWL+1), LLDB, $ WORK( 1 ), MAX_BW ) * CALL CTRMM( 'L', 'U', 'N', 'N', BWL, NRHS, -CONE, $ A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )), $ LLDA-1, WORK( 1 ), MAX_BW ) * CALL CMATADD( BWL, NRHS, CONE, WORK( 1 ), MAX_BW, $ CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * Clear garbage out of workspace block * DO 10 IDUM1=1, WORK_SIZE_MIN WORK( IDUM1 )=0.0 10 CONTINUE * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL CGEMM( 'C', 'N', BWU, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+MAX_BW-BWU ), MAX_BW ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL CMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * CALL CMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * CALL CMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL CTBTRS( 'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), NRHS, $ AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'C', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( (ODD_SIZE)*BWU+1 ), $ MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL CGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 27 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 26 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 27 26 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL CGEMM( 'C', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ MAX_BW, $ WORK( 1 ), $ MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( (ODD_SIZE)*BWU+1 ), $ MAX_BW, $ WORK( 1 ), $ MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL CTBTRS( 'L', 'C', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), NRHS, $ AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 22 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 21 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 22 21 CONTINUE * [End of GOTO Loop] * 24 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL CGEMM( 'N', 'N', ODD_SIZE, NRHS, BWU, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+MAX_BW-BWU ), MAX_BW, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL CLAMOV( 'N', BWL, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB, $ WORK( 1+MAX_BW-BWL ), MAX_BW ) * CALL CTRMM( 'L', 'U', 'C', 'N', BWL, NRHS, -CONE, $ A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )), $ LLDA-1, WORK( 1+MAX_BW-BWL ), MAX_BW ) * CALL CMATADD( BWL, NRHS, CONE, WORK( 1+MAX_BW-BWL ), MAX_BW, $ CONE, B( PART_OFFSET+ODD_SIZE-BWL+1 ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL CTBTRS( UPLO, 'C', 'U', ODD_SIZE, $ BWL, NRHS, $ A( OFST+1+BWU ), $ LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF ( LSAME( TRANS, 'C' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL CTBTRS( UPLO, 'C', 'N', ODD_SIZE, $ BWU, NRHS, $ A( OFST+1 ), LLDA, $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL CLAMOV( 'N', BWU, NRHS, $ B( PART_OFFSET+ODD_SIZE-BWU+1), LLDB, $ WORK( 1 ), MAX_BW ) * CALL CTRMM( 'L', 'L', 'C', 'N', BWU, NRHS, -CONE, $ A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1, WORK( 1 ), $ MAX_BW ) * CALL CMATADD( BWU, NRHS, CONE, WORK( 1 ), MAX_BW, $ CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * Clear garbage out of workspace block * DO 20 IDUM1=1, WORK_SIZE_MIN WORK( IDUM1 )=0.0 20 CONTINUE * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL CGEMM( 'C', 'N', BWL, NRHS, ODD_SIZE, -CONE, $ AF( WORK_U+1 ), ODD_SIZE, B( PART_OFFSET+1 ), $ LLDB, CZERO, WORK( 1+MAX_BW-BWL ), MAX_BW ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL CMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 44 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 42 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 41 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * CALL CMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * CALL CMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 42 41 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL CTBTRS( 'U', 'C', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), NRHS, $ AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, MAX_BW-1 ) ), $ MAX_BW+1, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'C', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( WORK_U+(ODD_SIZE)*BWL+1 ), $ MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL CGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), $ MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 44 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 54 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 57 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 56 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 57 56 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL CGEMM( 'C', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), $ MAX_BW, $ WORK( 1 ), $ MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( WORK_U+(ODD_SIZE)*BWL+1 ), $ MAX_BW, $ WORK( 1 ), $ MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL CTBTRS( 'U', 'N', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), NRHS, $ AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, MAX_BW-1 ) ), $ MAX_BW+1, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 52 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 51 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 52 51 CONTINUE * [End of GOTO Loop] * 54 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL CGEMM( 'N', 'N', ODD_SIZE, NRHS, BWL, -CONE, $ AF( WORK_U+1 ), ODD_SIZE, WORK( 1+MAX_BW-BWL ), $ MAX_BW, CONE, B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL CLAMOV( 'N', BWU, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB, $ WORK( 1+MAX_BW-BWU ), MAX_BW+BWL ) * CALL CTRMM( 'L', 'L', 'N', 'N', BWU, NRHS, -CONE, $ A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1, $ WORK( 1+MAX_BW-BWU ), MAX_BW+BWL ) * CALL CMATADD( BWU, NRHS, CONE, WORK( 1+MAX_BW-BWU ), $ MAX_BW+BWL, CONE, $ B( PART_OFFSET+ODD_SIZE-BWU+1 ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL CTBTRS( UPLO, 'N', 'N', ODD_SIZE, $ BWU, NRHS, $ A( OFST+1 ), $ LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ENDIF * End of "IF( LSAME( UPLO, 'L' ) )"... 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PCDBTRSV * END scalapack-2.0.2/SRC/pcdtsv.f000644 000766 000024 00000046360 10363532303 016021 0ustar00juliestaff000000 000000 SUBROUTINE PCDTSV( N, NRHS, DL, D, DU, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX B( * ), D( * ), DL( * ), DU( * ), WORK( * ) * .. * * * Purpose * ======= * * PCDTSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * tridiagonal diagonally dominant-like distributed * matrix. * * Gaussian elimination without pivoting * is used to factor a reordering * of the matrix into L U. * * See PCDTTRF and PCDTTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) COMPLEX pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) COMPLEX pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) COMPLEX pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (12*NPCOL+3*NB) * +max(10*NPCOL+4*NRHS, 8*NPCOL) * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PCDTTRF, PCDTTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PCDTTRF and PCDTTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSEIF( DESCA( DTYPE_ ) .EQ. 502 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PCDTSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (12*NPCOL+3*NB) * * Factor the matrix * CALL PCDTTRF( N, DL, D, DU, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PCDTSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PCDTTRS( 'N', N, NRHS, DL, D, DU, JA, DESCA, B, IB, DESCB, $ WORK, MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCDTSV', -INFO ) RETURN END IF * RETURN * * End of PCDTSV * END scalapack-2.0.2/SRC/pcdttrf.f000644 000766 000024 00000107411 11750130340 016153 0ustar00juliestaff000000 000000 SUBROUTINE PCDTTRF( N, DL, D, DU, JA, DESCA, AF, LAF, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX AF( * ), D( * ), DL( * ), DU( * ), WORK( * ) * .. * * * Purpose * ======= * * PCDTTRF computes a LU factorization * of an N-by-N complex tridiagonal * diagonally dominant-like distributed matrix * A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PCDTTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = L U * * where U is a tridiagonal upper triangular matrix and L is tridiagonal * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * DL (local input/local output) COMPLEX pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) COMPLEX pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) COMPLEX pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCDTTRF and this is stored in AF. If a linear system * is to be solved using PCDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 8*NPCOL * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN, $ LEVEL_DIST, LLDA, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_N_A, $ TEMP, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ CAXPY, CGEMM, CGERV2D, CGESD2D, CLAMOV, $ CLATCPY, CPBTRF, CPOTRF, CSYRK, CTBTRS, CTRMM, $ CTRRV2D, CTRSD2D, CTRSM, CTRTRS, DESC_CONVERT, $ GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC COMPLEX CDOTC EXTERNAL CDOTC, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LWORK .LT. -1) THEN INFO = -10 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -1 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, $ 'PCDTTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PCDTTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = (12*NPCOL+3*NB) * IF( LAF .LT. LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PCDTTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = 8*NPCOL * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, $ 'PCDTTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 7, 1 ) = DESCA(5) PARAM_CHECK( 6, 1 ) = DESCA(4) PARAM_CHECK( 5, 1 ) = DESCA(3) PARAM_CHECK( 4, 1 ) = DESCA(1) PARAM_CHECK( 3, 1 ) = JA PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 7, 2 ) = 605 PARAM_CHECK( 6, 2 ) = 604 PARAM_CHECK( 5, 2 ) = 603 PARAM_CHECK( 4, 2 ) = 601 PARAM_CHECK( 3, 2 ) = 5 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 10 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 7, PARAM_CHECK, 7, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCDTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * Offset to workspace for Upper triangular factor * WORK_U = INT_ONE*ODD_SIZE + 3 * * * Zero out space for fillin * DO 10 I=1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * IF ( MYCOL .LT. NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL CTRSD2D( ICTXT, 'U', 'N', 1, 1, $ DU( PART_OFFSET+ODD_SIZE+1 ), LLDA-1, 0, $ MYCOL+1 ) * ENDIF * * * Factor main partition A_i = L_i {U_i} in each processor * CALL CDTTRF( ODD_SIZE, DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1500 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * * Apply factorization to lower connection block BL_i * Apply factorization to upper connection block BU_i * * * Perform the triangular solve {U_i}^C{BL'}_i^C = {BL_i}^C * * DL( PART_OFFSET+ODD_SIZE+1 ) = $ ( DL( PART_OFFSET+ODD_SIZE+1 ) ) $ / ( D( PART_OFFSET+ODD_SIZE ) ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i} * * D( PART_OFFSET+ODD_SIZE+1 ) = D( PART_OFFSET+ODD_SIZE+1 )- $ DL( PART_OFFSET+ODD_SIZE+1 )*DU( PART_OFFSET+ODD_SIZE ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1500 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * * Move entry that causes spike to auxiliary storage * AF( WORK_U+1 ) = ( DL( PART_OFFSET+1 ) ) * IF (INFO.EQ.0) THEN * * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ . * CALL CDTTRSV( 'L', 'N', ODD_SIZE, INT_ONE, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), AF( WORK_U+1 ), ODD_SIZE, $ INFO ) * * * Calculate the "spike" fillin, ${U_i}^C {{GL}_i}^C = {DU_i}^C$ * CALL CTRRV2D( ICTXT, 'U', 'N', 1, 1, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * AF( 1 ) = CONJG( AF( 1 ) ) * CALL CDTTRSV( 'U', 'C', ODD_SIZE, INT_ONE, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), $ AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = GL_i{GU_i} * AF( ODD_SIZE+3 ) = -CONE * $ CDOTC( ODD_SIZE, AF( 1 ), 1, AF( WORK_U+1 ), 1 ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+3 ), $ INT_ONE, 0, MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * AF( ODD_SIZE+1 ) = -CONE $ * CONJG( DL( PART_OFFSET+ODD_SIZE+1 ) $ * AF( WORK_U+ODD_SIZE ) ) * * AF(WORK_U+(ODD_SIZE)+1 ) = -CONE $ * DU( PART_OFFSET+ODD_SIZE ) $ * CONJG( AF( ODD_SIZE ) ) * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, $ AF( WORK_U+ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * ENDIF * * Copy last diagonal block into AF storage for subsequent * operations. * AF( ODD_SIZE+2 ) = $ CMPLX( D( PART_OFFSET+ODD_SIZE+1 ) ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+AF( ODD_SIZE+3 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+WORK( 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+WORK( 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... IF( AF( ODD_SIZE+2 ) .EQ. CZERO ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * AF( WORK_U+ODD_SIZE+3 ) = AF( ODD_SIZE+1 ) * AF( ODD_SIZE+3 ) = AF( WORK_U+ODD_SIZE+1 ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( WORK_U+ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify lower off_diagonal block with diagonal block * * AF( ODD_SIZE+1 ) = AF( ODD_SIZE+1 ) $ / CONJG( AF( ODD_SIZE+2 ) ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * WORK( 1 ) = -ONE*CONJG( AF( ODD_SIZE+1 ) )* $ AF( WORK_U+(ODD_SIZE)+1 ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( WORK_U+ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST/2 ) * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * AF( ODD_SIZE+3 ) = AF( ODD_SIZE+3 ) $ / ( AF( ODD_SIZE+2 ) ) * * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 ) $ *CONJG( AF( WORK_U+ODD_SIZE+3 ) ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * WORK( 1 ) = -ONE*AF( WORK_U+ODD_SIZE+3 ) $ * AF( ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 ) $ * AF( WORK_U+ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * * 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PCDTTRF * END scalapack-2.0.2/SRC/pcdttrs.f000644 000766 000024 00000067307 10363532303 016205 0ustar00juliestaff000000 000000 SUBROUTINE PCDTTRS( TRANS, N, NRHS, DL, D, DU, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 7, 2001 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX AF( * ), B( * ), D( * ), DL( * ), DU( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PCDTTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PCDTTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * tridiagonal diagonally dominant-like distributed * matrix. * * Routine PCDTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) COMPLEX pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) COMPLEX pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) COMPLEX pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCDTTRF and this is stored in AF. If a linear system * is to be solved using PCDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 10*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ MY_NUM_COLS, NB, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, PART_OFFSET, PART_SIZE, $ RETURN_CODE, STORE_M_B, STORE_N_A, TEMP, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 15, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, $ PCDTTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC COMPLEX CDOTC EXTERNAL CDOTC, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -3 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PCDTTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PCDTTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ 10*NPCOL+4*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, $ 'PCDTTRS: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 15, 1 ) = DESCB(5) PARAM_CHECK( 14, 1 ) = DESCB(4) PARAM_CHECK( 13, 1 ) = DESCB(3) PARAM_CHECK( 12, 1 ) = DESCB(2) PARAM_CHECK( 11, 1 ) = DESCB(1) PARAM_CHECK( 10, 1 ) = IB PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = NRHS PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 15, 2 ) = 1105 PARAM_CHECK( 14, 2 ) = 1104 PARAM_CHECK( 13, 2 ) = 1103 PARAM_CHECK( 12, 2 ) = 1102 PARAM_CHECK( 11, 2 ) = 1101 PARAM_CHECK( 10, 2 ) = 10 PARAM_CHECK( 9, 2 ) = 805 PARAM_CHECK( 8, 2 ) = 804 PARAM_CHECK( 7, 2 ) = 803 PARAM_CHECK( 6, 2 ) = 801 PARAM_CHECK( 5, 2 ) = 7 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 15, PARAM_CHECK, 15, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCDTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( TRANS, 'N' ) ) THEN * CALL PCDTTRSV( 'L', 'N', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ELSE * CALL PCDTTRSV( 'U', 'C', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ENDIF * * Call backsolve routine * IF( LSAME( TRANS, 'C' ) ) THEN * CALL PCDTTRSV( 'L', 'C', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ELSE * CALL PCDTTRSV( 'U', 'N', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ENDIF 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PCDTTRS * END scalapack-2.0.2/SRC/pcdttrsv.f000644 000766 000024 00000142103 11750130340 016353 0ustar00juliestaff000000 000000 SUBROUTINE PCDTTRSV( UPLO, TRANS, N, NRHS, DL, D, DU, JA, DESCA, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX AF( * ), B( * ), D( * ), DL( * ), DU( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PCDTTRSV solves a tridiagonal triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a tridiagonal * triangular matrix factor produced by the * Gaussian elimination code PC@(dom_pre)TTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H * is dictated by the user by the parameter TRANS. * * Routine PCDTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) COMPLEX pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) COMPLEX pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) COMPLEX pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCDTTRF and this is stored in AF. If a linear system * is to be solved using PCDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 10*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_M_B, STORE_N_A, $ TEMP, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ CGEMM, CGERV2D, CGESD2D, CLAMOV, CMATADD, $ CTBTRS, CTRMM, CTRTRS, DESC_CONVERT, GLOBCHK, $ PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC COMPLEX CDOTC EXTERNAL CDOTC, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 9*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 12*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 12*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 12*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 12*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -2 END IF * IF( LWORK .LT. -1) THEN INFO = -16 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -3 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 9*100 + 6 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 12*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 12*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -4 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -8 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 9*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PCDTTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 9*100+4 ) CALL PXERBLA( ICTXT, $ 'PCDTTRSV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ INT_ONE*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, $ 'PCDTTRSV: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1205 PARAM_CHECK( 15, 2 ) = 1204 PARAM_CHECK( 14, 2 ) = 1203 PARAM_CHECK( 13, 2 ) = 1202 PARAM_CHECK( 12, 2 ) = 1201 PARAM_CHECK( 11, 2 ) = 11 PARAM_CHECK( 10, 2 ) = 905 PARAM_CHECK( 9, 2 ) = 904 PARAM_CHECK( 8, 2 ) = 903 PARAM_CHECK( 7, 2 ) = 901 PARAM_CHECK( 6, 2 ) = 8 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 16 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCDTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * Offset to workspace for Upper triangular factor * WORK_U = INT_ONE*ODD_SIZE + 3 * * * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * IF ( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL CDTTRSV( UPLO, 'N', ODD_SIZE, NRHS, DL( PART_OFFSET+2 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL CAXPY( NRHS, -DL( PART_OFFSET+ODD_SIZE+1 ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, ODD_SIZE, -CONE, $ AF( 1 ), ODD_SIZE, B( PART_OFFSET+1 ), LLDB, $ CZERO, WORK( 1+INT_ONE-INT_ONE ), INT_ONE ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL CTBTRS( 'L', 'N', 'U', INT_ONE, MIN( INT_ONE, INT_ONE-1 ), $ NRHS, AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*INT_ONE+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL CGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*INT_ONE+2+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 27 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 26 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 27 26 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*INT_ONE+2+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*INT_ONE+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL CTBTRS( 'L', 'C', 'U', INT_ONE, MIN( INT_ONE, INT_ONE-1 ), $ NRHS, AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 22 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 21 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 22 21 CONTINUE * [End of GOTO Loop] * 24 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL CGEMM( 'N', 'N', ODD_SIZE, NRHS, INT_ONE, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+INT_ONE-INT_ONE ), INT_ONE, $ CONE, B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL CAXPY( NRHS, -CONJG( DL( PART_OFFSET+ODD_SIZE+1 ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL CDTTRSV( UPLO, 'C', ODD_SIZE, NRHS, DL( PART_OFFSET+2 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), $ B( PART_OFFSET+1 ), LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF ( LSAME( TRANS, 'C' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL CDTTRSV( UPLO, 'C', ODD_SIZE, NRHS, DL( PART_OFFSET+2 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL CAXPY( NRHS, -CONJG( DU( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, ODD_SIZE, -CONE, $ AF( WORK_U+1 ), ODD_SIZE, B( PART_OFFSET+1 ), $ LLDB, CZERO, WORK( 1+INT_ONE-INT_ONE ), $ INT_ONE ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 44 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 42 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 41 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 42 41 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL CTBTRS( 'U', 'C', 'N', INT_ONE, MIN( INT_ONE, INT_ONE-1 ), $ NRHS, AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+(ODD_SIZE)*INT_ONE+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL CGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+ODD_SIZE*INT_ONE+2+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 44 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 54 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 57 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 56 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 57 56 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+ODD_SIZE*INT_ONE+2+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+(ODD_SIZE)*INT_ONE+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL CTBTRS( 'U', 'N', 'N', INT_ONE, MIN( INT_ONE, INT_ONE-1 ), $ NRHS, AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 52 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 51 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 52 51 CONTINUE * [End of GOTO Loop] * 54 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL CGEMM( 'N', 'N', ODD_SIZE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+1 ), ODD_SIZE, $ WORK( 1+INT_ONE-INT_ONE ), INT_ONE, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL CAXPY( NRHS, -( DU( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL CDTTRSV( UPLO, 'N', ODD_SIZE, NRHS, DU( PART_OFFSET+2 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), $ B( PART_OFFSET+1 ), LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ENDIF * End of "IF( LSAME( UPLO, 'L' ) )"... 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PCDTTRSV * END scalapack-2.0.2/SRC/pcgbsv.f000644 000766 000024 00000045617 10363532303 016006 0ustar00juliestaff000000 000000 SUBROUTINE PCGBSV( N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, B, IB, $ DESCB, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER BWL, BWU, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) COMPLEX A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PCGBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * banded distributed * matrix with bandwidth BWL, BWU. * * Gaussian elimination with pivoting * is used to factor a reordering * of the matrix into P L U. * * See PCGBTRF and PCGBTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * +max(NRHS*(NB+2*bwl+4*bwu), 1) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * nonsingular, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * nonsingular, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PCGBTRF, PCGBTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PCGBTRF and PCGBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PCGBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * * Factor the matrix * CALL PCGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PCGBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PCGBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, B, IB, $ DESCB, WORK, MIN( LWORK, WS_FACTOR ), $ WORK( 1+WS_FACTOR), LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGBSV', -INFO ) RETURN END IF * RETURN * * End of PCGBSV * END scalapack-2.0.2/SRC/pcgbtrf.f000644 000766 000024 00000107636 11750130340 016145 0ustar00juliestaff000000 000000 SUBROUTINE PCGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, AF, LAF, $ WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX A( * ), AF( * ), WORK( * ) * .. * * * Purpose * ======= * * PCGBTRF computes a LU factorization * of an N-by-N complex banded * distributed matrix * with bandwidth BWL, BWU: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PCGBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) Q = L U * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P and Q are permutation matrices. * The matrix Q represents reordering of columns * for parallelism's sake, while P represents * reordering of rows for numerical stability using * classic partial pivoting. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCGBTRF and this is stored in AF. If a linear system * is to be solved using PCGBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 1 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * nonsingular, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * nonsingular, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Implemented for ScaLAPACK by: * Andrew J. Cleary, Livermore National Lab and University of Tenn., * and Markus Hegland, Australian Natonal University. Feb., 1997. * Based on code written by : Peter Arbenz, ETH Zurich, 1996. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER APTR, BBPTR, BIPTR, BM, BM1, BM2, BMN, BN, BW, $ CSRC, DBPTR, FIRST_PROC, I, ICTXT, ICTXT_NEW, $ ICTXT_SAVE, IDUM3, J, JA_NEW, JPTR, L, LAF_MIN, $ LBWL, LBWU, LDB, LDBB, LLDA, LM, LMJ, LN, LNJ, $ LPTR, MYCOL, MYROW, MY_NUM_COLS, NB, NEICOL, $ NP, NPACT, NPCOL, NPROW, NPSTR, NP_SAVE, NRHS, $ ODD_N, ODD_SIZE, ODPTR, OFST, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, CAXPY, CGEMM, $ CGERV2D, CGESD2D, CLAMOV, CLATCPY, CPBTRF, $ CPOTRF, CSYRK, CTBTRS, CTRMM, CTRRV2D, CTRSD2D, $ CTRSM, CTRTRS, DESC_CONVERT, GLOBCHK, PXERBLA, $ RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LWORK .LT. -1) THEN INFO = -11 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -1 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -2 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (2*BWL+2*BWU+1) ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 6*100 + 4 ) ENDIF * BW = BWU+BWL * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, $ 'PCGBTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.(BWL+BWU+1) )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PCGBTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * IF( LAF .LT. LAF_MIN ) THEN INFO = -9 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PCGBTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = 1 * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -11 * put minimum value of work into work( 1 ) WORK( 1 ) = WORK_SIZE_MIN CALL PXERBLA( ICTXT, $ 'PCGBTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BWU PARAM_CHECK( 3, 1 ) = BWL PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 11 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCGBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * DO 9 J = 1, ODD_SIZE DO 8 I = 1, BW A( I+(J-1)*LLDA ) = CZERO 8 CONTINUE 9 CONTINUE * * Begin main code * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Transfer triangle B_i of local matrix to next processor * for fillin. Overlap the send with the factorization of A_i. * IF (MYCOL .LE. NPCOL-2) THEN * * The last processor does not need to send anything. * BIPTR = location of triangle B_i in memory BIPTR = (NB-BW)*LLDA + 2*BW+1 * CALL CTRSD2D( ICTXT, 'U', 'N', $ MIN( BW, BWU+NUMROC( N, NB, MYCOL+1, 0, NPCOL ) ), $ BW, A(BIPTR), LLDA-1, 0, MYCOL+1) * ENDIF * * Factor main partition P_i A_i = L_i U_i on each processor * * LBWL, LBWU: lower and upper bandwidth of local solver * Note that for MYCOL > 0 one has lower triangular blocks! * LM is the number of rows which is usually NB except for * MYCOL = 0 where it is BWU less and MYCOL=NPCOL-1 where it * is NR+BWU where NR is the number of columns on the last processor * Finally APTR is the pointer to the first element of A. As LAPACK * has a slightly different matrix format than Scalapack the pointer * has to be adjusted on processor MYCOL=0. * IF (MYCOL .NE. 0) THEN LBWL = BW LBWU = 0 APTR = 1 ELSE LBWL = BWL LBWU = BWU APTR = 1+BWU ENDIF * IF (MYCOL .NE. NPCOL-1) THEN LM = NB - LBWU LN = NB - BW ELSE IF (MYCOL .NE. 0) THEN LM = ODD_SIZE + BWU LN = MAX(ODD_SIZE-BW,0) ELSE LM = N LN = MAX( N-BW, 0 ) ENDIF * IF (LN .GT. 0) THEN * CALL CGBTRF(LM,LN, LBWL,LBWU, A(APTR),LLDA, IPIV, INFO) * IF( INFO.NE.0 ) THEN INFO = INFO + NB*MYCOL GO TO 90 END IF * NRHS = BW LDB = LLDA-1 * * Update the last BW columns of A_i (code modified from CGBTRS) * * Only the eliminations of unknowns > LN-BW have an effect on * the last BW columns. Loop over them... * DO 23 J = MAX(LN-BW+1,1), LN * LMJ = MIN( LBWL, LM-J ) LNJ = MIN( BW, J+BW-LN+APTR-1 ) * L = IPIV( J ) * JPTR = J-(LN+1)+2*BW+1-LBWL + LN*LLDA * IF( L.NE.J ) THEN * * Element (L,LN+1) is swapped with element (J,LN+1) etc * Furthermore, the elements in the same row are LDB=LLDA-1 apart * The complicated formulas are to cope with the banded * data format: * LPTR = L-(LN+1)+2*BW+1-LBWL + LN*LLDA * CALL CSWAP( LNJ, A(LPTR),LDB, A(JPTR), LDB ) * ENDIF * * LPTR is the pointer to the beginning of the * coefficients of L * LPTR = BW+1+APTR + (J-1)*LLDA * CALL CGERU(LMJ,LNJ,-CONE, A(LPTR),1, A(JPTR),LDB, $ A(JPTR+1),LDB) 23 CONTINUE * ENDIF * * Compute spike fill-in, L_i F_i = P_i B_{i-1} * * Receive triangle B_{i-1} from previous processor * IF (MYCOL .GT. 0) THEN * CALL CTRRV2D( ICTXT, 'U', 'N', MIN(BW, LM), BW, AF( 1 ), $ LM, 0, MYCOL-1) * * * Permutation and forward elimination (triang. solve) * DO 24 J = 1, LN * LMJ = MIN( LBWL, LM-J ) L = IPIV( J ) * IF( L .NE. J ) THEN * CALL CSWAP(NRHS, AF(L), LM, AF(J), LM ) ENDIF * LPTR = BW+1+APTR + (J-1)*LLDA * CALL CGERU( LMJ,NRHS, -CONE, A(LPTR),1, $ AF(J), LM, AF(J+1), LM) * 24 CONTINUE * ENDIF * 90 CONTINUE * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Define the initial dimensions of the diagonal blocks * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW * IF (MYCOL .NE. NPCOL-1) THEN BM = BW - LBWU BN = BW ELSE BM = MIN(BW,ODD_SIZE) + BWU BN = MIN(BW,ODD_SIZE) ENDIF * * Pointer to first element of block bidiagonal matrix in AF * Leading dimension of block bidiagonal system * BBPTR = (NB+BWU)*BW + 1 LDBB = 2*BW + BWU * * Copy from A and AF into block bidiagonal matrix (tail of AF) * * DBPTR = Pointer to diagonal blocks in A DBPTR = BW+1 + LBWU + LN*LLDA * CALL CLAMOV('G',BM,BN, A(DBPTR),LLDA-1, $ AF(BBPTR + BW*LDBB),LDBB) * * Zero out any junk entries that were copied * DO 870 J=1, BM DO 880 I=J+LBWL, BM-1 AF( BBPTR+BW*LDBB+(J-1)*LDBB+I ) = CZERO 880 CONTINUE 870 CONTINUE * IF (MYCOL .NE. 0) THEN * * ODPTR = Pointer to offdiagonal blocks in A * ODPTR = LM-BM+1 CALL CLAMOV('G',BM,BW, AF(ODPTR),LM, $ AF(BBPTR +2*BW*LDBB),LDBB) ENDIF * IF (NPCOL.EQ.1) THEN * * In this case the loop over the levels will not be * performed. CALL CGETRF( N-LN, N-LN, AF(BBPTR+BW*LDBB), LDBB, $ IPIV(LN+1), INFO) * ENDIF * * Loop over levels ... only occurs if npcol > 1 * * The two integers NPACT (nu. of active processors) and NPSTR * (stride between active processors) are used to control the * loop. * NPACT = NPCOL NPSTR = 1 * * Begin loop over levels * 200 IF (NPACT .LE. 1) GOTO 300 * * Test if processor is active * IF (MOD(MYCOL,NPSTR) .EQ. 0) THEN * * Send/Receive blocks * * IF (MOD(MYCOL,2*NPSTR) .EQ. 0) THEN * * This node will potentially do more work later * NEICOL = MYCOL + NPSTR * IF (NEICOL/NPSTR .LT. NPACT-1) THEN BMN = BW ELSE IF (NEICOL/NPSTR .EQ. NPACT-1) THEN ODD_N = NUMROC(N, NB, NPCOL-1, 0, NPCOL) BMN = MIN(BW,ODD_N) + BWU ELSE * * Last processor skips to next level GOTO 250 ENDIF * * BM1 = M for 1st block on proc pair, BM2 2nd block * BM1 = BM BM2 = BMN * IF (NEICOL/NPSTR .LE. NPACT-1 )THEN * CALL CGESD2D( ICTXT, BM, 2*BW, AF(BBPTR+BW*LDBB), $ LDBB, 0, NEICOL ) * CALL CGERV2D( ICTXT, BMN, 2*BW, AF(BBPTR+BM), $ LDBB, 0, NEICOL) * IF( NPACT .EQ. 2 ) THEN * * Copy diagonal block to align whole system * CALL CLAMOV( 'G', BMN, BW, AF( BBPTR+BM ), $ LDBB, AF( BBPTR+2*BW*LDBB+BM ), LDBB ) ENDIF * ENDIF * ELSE * * This node stops work after this stage -- an extra copy * is required to make the odd and even frontal matrices * look identical * NEICOL = MYCOL - NPSTR * IF (NEICOL .EQ. 0) THEN BMN = BW - BWU ELSE BMN = BW ENDIF * BM1 = BMN BM2 = BM * CALL CGESD2D( ICTXT, BM, 2*BW, AF(BBPTR+BW*LDBB), $ LDBB, 0, NEICOL ) * CALL CLAMOV('G',BM, 2*BW, AF(BBPTR+BW*LDBB),LDBB, $ AF(BBPTR+BMN),LDBB) * DO 31 J=BBPTR+2*BW*LDBB, BBPTR+3*BW*LDBB-1, LDBB DO 32 I=0, LDBB-1 AF(I+J) = CZERO 32 CONTINUE 31 CONTINUE * CALL CGERV2D( ICTXT, BMN, 2*BW, AF(BBPTR+BW*LDBB), $ LDBB, 0, NEICOL) * IF( NPACT .EQ. 2 ) THEN * * Copy diagonal block to align whole system * CALL CLAMOV( 'G', BM, BW, AF( BBPTR+BMN ), $ LDBB, AF( BBPTR+2*BW*LDBB+BMN ), LDBB ) ENDIF * ENDIF * * LU factorization with partial pivoting * IF (NPACT .NE. 2) THEN * CALL CGETRF(BM+BMN, BW, AF(BBPTR+BW*LDBB), LDBB, $ IPIV(LN+1), INFO) * * Backsolve left side * DO 301 J=BBPTR,BBPTR+BW*LDBB-1, LDBB DO 302 I=0, BM1-1 AF(I+J) = CZERO 302 CONTINUE 301 CONTINUE * CALL CLASWP(BW, AF(BBPTR), LDBB, 1, BW, $ IPIV(LN+1), 1) * CALL CTRSM('L','L','N','U', BW, BW, CONE, $ AF(BBPTR+BW*LDBB), LDBB, AF(BBPTR), LDBB) * * Use partial factors to update remainder * CALL CGEMM( 'N', 'N', BM+BMN-BW, BW, BW, $ -CONE, AF(BBPTR+BW*LDBB+BW), LDBB, $ AF( BBPTR ), LDBB, CONE, $ AF( BBPTR+BW ), LDBB ) * * Backsolve right side * NRHS = BW * CALL CLASWP(NRHS, AF(BBPTR+2*BW*LDBB), LDBB, 1, BW, $ IPIV(LN+1), 1) * CALL CTRSM('L','L','N','U', BW, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, AF(BBPTR+2*BW*LDBB), LDBB) * * Use partial factors to update remainder * CALL CGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, $ -CONE, AF(BBPTR+BW*LDBB+BW), LDBB, $ AF( BBPTR+2*BW*LDBB ), LDBB, CONE, $ AF( BBPTR+2*BW*LDBB+BW ), LDBB ) * * * Test if processor is active in next round * IF (MOD(MYCOL,2*NPSTR) .EQ. 0) THEN * * Reset BM * BM = BM1+BM2-BW * * Local copying in the block bidiagonal area * * CALL CLAMOV('G',BM,BW, $ AF(BBPTR+BW), $ LDBB, AF(BBPTR+BW*LDBB), LDBB) CALL CLAMOV('G',BM,BW, $ AF(BBPTR+2*BW*LDBB+BW), $ LDBB, AF(BBPTR+2*BW*LDBB), LDBB) * * Zero out space that held original copy * DO 1020 J=0, BW-1 DO 1021 I=0, BM-1 AF(BBPTR+2*BW*LDBB+BW+J*LDBB+I) = CZERO 1021 CONTINUE 1020 CONTINUE * ENDIF * ELSE * * Factor the final 2 by 2 block matrix * CALL CGETRF(BM+BMN,BM+BMN, AF(BBPTR+BW*LDBB), LDBB, $ IPIV(LN+1), INFO) ENDIF * ENDIF * * Last processor in an odd-sized NPACT skips to here * 250 CONTINUE * NPACT = (NPACT + 1)/2 NPSTR = NPSTR * 2 GOTO 200 * 300 CONTINUE * End loop over levels * 1000 CONTINUE * If error was found in Phase 1, processors jump here. * * Free BLACS space used to hold standard-form grid. * ICTXT = ICTXT_SAVE IF( ICTXT.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 1234 CONTINUE * If this processor did not hold part of the grid it * jumps here. * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PCGBTRF * END * scalapack-2.0.2/SRC/pcgbtrs.f000644 000766 000024 00000112724 11750130340 016154 0ustar00juliestaff000000 000000 SUBROUTINE PCGBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWU, BWL, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV(*) COMPLEX A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PCGBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PCGBTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * banded distributed * matrix with bandwidth BWL, BWU. * * Routine PCGBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCGBTRF and this is stored in AF. If a linear system * is to be solved using PCGBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * NRHS*(NB+2*bwl+4*bwu) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Implemented for ScaLAPACK by: * Andrew J. Cleary, Livermore National Lab and University of Tenn., * and Marbwus Hegland, Australian Natonal University. Feb., 1997. * Based on code written by : Peter Arbenz, ETH Zurich, 1996. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC, $ FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, J, JA_NEW, L, LBWL, LBWU, LDBB, $ LDW, LLDA, LLDB, LM, LMJ, LN, LPTR, MYCOL, $ MYROW, NB, NEICOL, NP, NPACT, NPCOL, NPROW, $ NPSTR, NP_SAVE, ODD_SIZE, PART_OFFSET, $ RECOVERY_VAL, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN, WPTR * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, PXERBLA, $ RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -16 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (2*BWL+2*BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * BW = BWU+BWL * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -5 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PCGBTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.(BWL+BWU+1) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PCGBTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check worksize * WORK_SIZE_MIN = NRHS*(NB+2*BWL+4*BWU) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, $ 'PCGBTRS: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 16 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCGBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * * * Begin main code * * Move data into workspace - communicate/copy (overlap) * IF (MYCOL .LT. NPCOL-1) THEN CALL CGESD2D( ICTXT, BWU, NRHS, B(NB-BWU+1), LLDB, $ 0, MYCOL + 1) ENDIF * IF (MYCOL .LT. NPCOL-1) THEN LM = NB-BWU ELSE LM = NB ENDIF * IF (MYCOL .GT. 0) THEN WPTR = BWU+1 ELSE WPTR = 1 ENDIF * LDW = NB+BWU + 2*BW+BWU * CALL CLAMOV( 'G', LM, NRHS, B(1), LLDB, WORK( WPTR ), LDW ) * * Zero out rest of work * DO 1501 J=1, NRHS DO 1502 L=WPTR+LM, LDW WORK( (J-1)*LDW+L ) = CZERO 1502 CONTINUE 1501 CONTINUE * IF (MYCOL .GT. 0) THEN CALL CGERV2D( ICTXT, BWU, NRHS, WORK(1), LDW, $ 0, MYCOL-1) ENDIF * ******************************************************************** * PHASE 1: Local computation phase -- Solve L*X = B ******************************************************************** * * Size of main (or odd) partition in each processor * ODD_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF (MYCOL .NE. 0) THEN LBWL = BW LBWU = 0 APTR = 1 ELSE LBWL = BWL LBWU = BWU APTR = 1+BWU ENDIF * IF (MYCOL .NE. NPCOL-1) THEN LM = NB - LBWU LN = NB - BW ELSE IF (MYCOL .NE. 0) THEN LM = ODD_SIZE + BWU LN = MAX(ODD_SIZE-BW,0) ELSE LM = N LN = MAX( N-BW, 0 ) ENDIF * DO 21 J = 1, LN * LMJ = MIN(LBWL,LM-J) L = IPIV( J ) * IF( L.NE.J ) THEN CALL CSWAP(NRHS, WORK(L), LDW, WORK(J), LDW) ENDIF * LPTR = BW+1 + (J-1)*LLDA + APTR * CALL CGERU(LMJ,NRHS,-CONE, A(LPTR),1, WORK(J),LDW, $ WORK(J+1),LDW) * 21 CONTINUE * ******************************************************************** * PHASE 2: Global computation phase -- Solve L*X = B ******************************************************************** * * Define the initial dimensions of the diagonal blocks * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW * IF (MYCOL .NE. NPCOL-1) THEN BM = BW - LBWU BN = BW ELSE BM = MIN(BW,ODD_SIZE) + BWU BN = MIN(BW,ODD_SIZE) ENDIF * * Pointer to first element of block bidiagonal matrix in AF * Leading dimension of block bidiagonal system * BBPTR = (NB+BWU)*BW + 1 LDBB = 2*BW + BWU * IF (NPCOL.EQ.1) THEN * * In this case the loop over the levels will not be * performed. CALL CGETRS( 'N', N-LN, NRHS, AF(BBPTR+BW*LDBB), LDBB, $ IPIV(LN+1), WORK( LN+1 ), LDW, INFO) * ENDIF * * Loop over levels ... * * The two integers NPACT (nu. of active processors) and NPSTR * (stride between active processors) is used to control the * loop. * NPACT = NPCOL NPSTR = 1 * * Begin loop over levels 200 IF (NPACT .LE. 1) GOTO 300 * * Test if processor is active IF (MOD(MYCOL,NPSTR) .EQ. 0) THEN * * Send/Receive blocks * IF (MOD(MYCOL,2*NPSTR) .EQ. 0) THEN * NEICOL = MYCOL + NPSTR * IF (NEICOL/NPSTR .LE. NPACT-1) THEN * IF (NEICOL/NPSTR .LT. NPACT-1) THEN BMN = BW ELSE BMN = MIN(BW,NUMROC(N, NB, NEICOL, 0, NPCOL))+BWU ENDIF * CALL CGESD2D( ICTXT, BM, NRHS, $ WORK(LN+1), LDW, 0, NEICOL ) * IF( NPACT .NE. 2 )THEN * * Receive answers back from partner processor * CALL CGERV2D(ICTXT, BM+BMN-BW, NRHS, $ WORK( LN+1 ), LDW, 0, NEICOL ) * BM = BM+BMN-BW * ENDIF * ENDIF * ELSE * NEICOL = MYCOL - NPSTR * IF (NEICOL .EQ. 0) THEN BMN = BW - BWU ELSE BMN = BW ENDIF * CALL CLAMOV( 'G', BM, NRHS, WORK(LN+1), LDW, $ WORK(NB+BWU+BMN+1), LDW ) * CALL CGERV2D( ICTXT, BMN, NRHS, WORK( NB+BWU+1 ), $ LDW, 0, NEICOL ) * * and do the permutations and eliminations * IF (NPACT .NE. 2) THEN * * Solve locally for BW variables * CALL CLASWP( NRHS, WORK(NB+BWU+1), LDW, 1, BW, $ IPIV(LN+1), 1) * CALL CTRSM('L','L','N','U', BW, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW) * * Use soln just calculated to update RHS * CALL CGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, $ -CONE, AF(BBPTR+BW*LDBB+BW), LDBB, $ WORK(NB+BWU+1), LDW, $ CONE, WORK(NB+BWU+1+BW), LDW ) * * Give answers back to partner processor * CALL CGESD2D( ICTXT, BM+BMN-BW, NRHS, $ WORK(NB+BWU+1+BW), LDW, 0, NEICOL ) * ELSE * * Finish up calculations for final level * CALL CLASWP( NRHS, WORK(NB+BWU+1), LDW, 1, BM+BMN, $ IPIV(LN+1), 1) * CALL CTRSM('L','L','N','U', BM+BMN, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW) ENDIF * ENDIF * NPACT = (NPACT + 1)/2 NPSTR = NPSTR * 2 GOTO 200 * ENDIF * 300 CONTINUE * * ************************************** * BACKSOLVE ******************************************************************** * PHASE 2: Global computation phase -- Solve U*Y = X ******************************************************************** * IF (NPCOL.EQ.1) THEN * * In this case the loop over the levels will not be * performed. * In fact, the backsolve portion was done in the call to * CGETRS in the frontsolve. * ENDIF * * Compute variable needed to reverse loop structure in * reduced system. * RECOVERY_VAL = NPACT*NPSTR - NPCOL * * Loop over levels * Terminal values of NPACT and NPSTR from frontsolve are used * 2200 IF( NPACT .GE. NPCOL ) GOTO 2300 * NPSTR = NPSTR/2 * NPACT = NPACT*2 * * Have to adjust npact for non-power-of-2 * NPACT = NPACT-MOD( (RECOVERY_VAL/NPSTR), 2 ) * * Find size of submatrix in this proc at this level * IF( MYCOL/NPSTR .LT. NPACT-1 ) THEN BN = BW ELSE BN = MIN(BW, NUMROC(N, NB, NPCOL-1, 0, NPCOL) ) ENDIF * * If this processor is even in this level... * IF( MOD( MYCOL, 2*NPSTR ) .EQ. 0 ) THEN * NEICOL = MYCOL+NPSTR * IF( NEICOL/NPSTR .LE. NPACT-1 ) THEN * IF( NEICOL/NPSTR .LT. NPACT-1 ) THEN BMN = BW BNN = BW ELSE BMN = MIN(BW,NUMROC(N, NB, NEICOL, 0, NPCOL))+BWU BNN = MIN(BW, NUMROC(N, NB, NEICOL, 0, NPCOL) ) ENDIF * IF( NPACT .GT. 2 ) THEN * CALL CGESD2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * CALL CGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * ELSE * CALL CGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * ENDIF * ENDIF * ELSE * This processor is odd on this level * NEICOL = MYCOL - NPSTR * IF (NEICOL .EQ. 0) THEN BMN = BW - BWU ELSE BMN = BW ENDIF * IF( NEICOL .LT. NPCOL-1 ) THEN BNN = BW ELSE BNN = MIN(BW, NUMROC(N, NB, NEICOL, 0, NPCOL) ) ENDIF * IF( NPACT .GT. 2 ) THEN * * Move RHS to make room for received solutions * CALL CLAMOV( 'G', BW, NRHS, WORK(NB+BWU+1), $ LDW, WORK(NB+BWU+BW+1), LDW ) * CALL CGERV2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * CALL CGEMM( 'N', 'N', BW, NRHS, BN, $ -CONE, AF(BBPTR), LDBB, $ WORK(LN+1), LDW, $ CONE, WORK(NB+BWU+BW+1), LDW ) * * IF( MYCOL .GT. NPSTR ) THEN * CALL CGEMM( 'N', 'N', BW, NRHS, BW, $ -CONE, AF(BBPTR+2*BW*LDBB), LDBB, $ WORK(LN+BW+1), LDW, $ CONE, WORK(NB+BWU+BW+1), LDW ) * ENDIF * CALL CTRSM('L','U','N','N', BW, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+BW+1), LDW) * * Send new solution to neighbor * CALL CGESD2D( ICTXT, BW, NRHS, $ WORK( NB+BWU+BW+1 ), LDW, 0, NEICOL ) * * Copy new solution into expected place * CALL CLAMOV( 'G', BW, NRHS, WORK(NB+BWU+1+BW), $ LDW, WORK(LN+BW+1), LDW ) * ELSE * * Solve with local diagonal block * CALL CTRSM( 'L','U','N','N', BN+BNN, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW) * * Send new solution to neighbor * CALL CGESD2D( ICTXT, BW, NRHS, $ WORK(NB+BWU+1), LDW, 0, NEICOL ) * * Shift solutions into expected positions * CALL CLAMOV( 'G', BNN+BN-BW, NRHS, WORK(NB+BWU+1+BW), $ LDW, WORK(LN+1), LDW ) * * IF( (NB+BWU+1) .NE. (LN+1+BW) ) THEN * * Copy one row at a time since spaces may overlap * DO 1064 J=1, BW CALL CCOPY( NRHS, WORK(NB+BWU+J), LDW, $ WORK(LN+BW+J), LDW ) 1064 CONTINUE * ENDIF * ENDIF * ENDIF * GOTO 2200 * 2300 CONTINUE * End of loop over levels * ******************************************************************** * PHASE 1: (Almost) Local computation phase -- Solve U*Y = X ******************************************************************** * * Reset BM to value it had before reduced system frontsolve... * IF (MYCOL .NE. NPCOL-1) THEN BM = BW - LBWU ELSE BM = MIN(BW,ODD_SIZE) + BWU ENDIF * * First metastep is to account for the fillin blocks AF * IF( MYCOL .LT. NPCOL-1 ) THEN * CALL CGESD2D( ICTXT, BW, NRHS, WORK( NB-BW+1 ), $ LDW, 0, MYCOL+1 ) * ENDIF * IF( MYCOL .GT. 0 ) THEN * CALL CGERV2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ), $ LDW, 0, MYCOL-1 ) * * Modify local right hand sides with received rhs's * CALL CGEMM( 'N', 'N', LM-BM, NRHS, BW, -CONE, $ AF( 1 ), LM, WORK( NB+BWU+1 ), LDW, CONE, $ WORK( 1 ), LDW ) * ENDIF * DO 2021 J = LN, 1, -1 * LMJ = MIN( BW, ODD_SIZE-1 ) * LPTR = BW-1+J*LLDA+APTR * * In the following, the TRANS=T option is used to reverse * the order of multiplication, not as a true transpose * CALL CGEMV( 'T', LMJ, NRHS, -CONE, WORK( J+1), LDW, $ A( LPTR ), LLDA-1, CONE, WORK( J ), LDW ) * * Divide by diagonal element * CALL CSCAL( NRHS, CONE/A( LPTR-LLDA+1 ), $ WORK( J ), LDW ) 2021 CONTINUE * * * CALL CLAMOV( 'G', ODD_SIZE, NRHS, WORK( 1 ), LDW, $ B( 1 ), LLDB ) * * Free BLACS space used to hold standard-form grid. * ICTXT = ICTXT_SAVE IF( ICTXT .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * NP = NP_SAVE * * Output worksize * WORK( 1 ) = WORK_SIZE_MIN * RETURN * * End of PCGBTRS * END scalapack-2.0.2/SRC/pcgebd2.f000644 000766 000024 00000043542 10363532303 016023 0ustar00juliestaff000000 000000 SUBROUTINE PCGEBD2( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL D( * ), E( * ) COMPLEX A( * ), TAUP( * ), TAUQ( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEBD2 reduces a complex general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper or lower bidiagonal * form B by an unitary transformation: Q' * sub( A ) * P = B. * * If M >= N, B is upper bidiagonal; if M < N, B is lower bidiagonal. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ). On exit, if M >= N, * the diagonal and the first superdiagonal of sub( A ) are * overwritten with the upper bidiagonal matrix B; the elements * below the diagonal, with the array TAUQ, represent the * unitary matrix Q as a product of elementary reflectors, and * the elements above the first superdiagonal, with the array * TAUP, represent the orthogonal matrix P as a product of * elementary reflectors. If M < N, the diagonal and the first * subdiagonal are overwritten with the lower bidiagonal * matrix B; the elements below the first subdiagonal, with the * array TAUQ, represent the unitary matrix Q as a product of * elementary reflectors, and the elements above the diagonal, * with the array TAUP, represent the orthogonal matrix P as a * product of elementary reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) COMPLEX array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUQ is * tied to the distributed matrix A. See Further Details. * * TAUP (local output) COMPLEX array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix P. TAUP is * tied to the distributed matrix A. See Further Details. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( MpA0, NqA0 ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ) * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+IROFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * If m >= n, * * Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors; * v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1); * u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in * A(ia+i-1,ja+i+1:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, * * Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors; * v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); * u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in * A(ia+i-1,ja+i:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) * ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) * ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) * ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) * ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) * ( v1 v2 v3 v4 v5 ) * * where d and e denote diagonal and off-diagonal elements of B, vi * denotes an element of the vector defining H(i), and ui an element of * the vector defining G(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, II, IROFFA, J, $ JJ, K, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 COMPLEX ALPHA * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CGEBR2D, $ CGEBS2D, CHK1MAT, CLARFG, DESCSET, INFOG2L, $ PCELSET, PCLACGV, PCLARF, PCLARFC, $ PCLARFG, PSELSET, PXERBLA, SGEBR2D, $ SGEBS2D * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Test the input parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) LWMIN = MAX( MPA0, NQA0 ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IROFFA.NE.ICOFFA ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEBD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) * IF( M.EQ.1 .AND. N.EQ.1 ) THEN IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN I = II+(JJ-1)*DESCA( LLD_ ) CALL CLARFG( 1, A( I ), A( I ), 1, TAUQ( JJ ) ) D( JJ ) = REAL( A( I ) ) CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, D( JJ ), $ 1 ) CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TAUQ( JJ ), $ 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, D( JJ ), $ 1, IAROW, IACOL ) CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAUQ( JJ ), $ 1, IAROW, IACOL ) END IF END IF IF( MYROW.EQ.IAROW ) $ TAUP( II ) = ZERO RETURN END IF * ALPHA = ZERO * IF( M.GE.N ) THEN * * Reduce to upper bidiagonal form * CALL DESCSET( DESCD, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) DO 10 K = 1, N I = IA + K - 1 J = JA + K - 1 * * Generate elementary reflector H(j) to annihilate * A(ia+i:ia+m-1,j) * CALL PCLARFG( M-K+1, ALPHA, I, J, A, MIN( I+1, M+IA-1 ), $ J, DESCA, 1, TAUQ ) CALL PSELSET( D, 1, J, DESCD, REAL( ALPHA ) ) CALL PCELSET( A, I, J, DESCA, ONE ) * * Apply H(i) to A(i:ia+m-1,i+1:ja+n-1) from the left * CALL PCLARFC( 'Left', M-K+1, N-K, A, I, J, DESCA, 1, TAUQ, $ A, I, J+1, DESCA, WORK ) CALL PCELSET( A, I, J, DESCA, CMPLX( REAL( ALPHA ) ) ) * IF( K.LT.N ) THEN * * Generate elementary reflector G(i) to annihilate * A(i,ja+j+1:ja+n-1) * CALL PCLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PCLARFG( N-K, ALPHA, I, J+1, A, I, $ MIN( J+2, JA+N-1 ), DESCA, DESCA( M_ ), $ TAUP ) CALL PSELSET( E, I, 1, DESCE, REAL( ALPHA ) ) CALL PCELSET( A, I, J+1, DESCA, ONE ) * * Apply G(i) to A(i+1:ia+m-1,i+1:ja+n-1) from the right * CALL PCLARF( 'Right', M-K, N-K, A, I, J+1, DESCA, $ DESCA( M_ ), TAUP, A, I+1, J+1, DESCA, $ WORK ) CALL PCELSET( A, I, J+1, DESCA, CMPLX( REAL( ALPHA ) ) ) CALL PCLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) ELSE CALL PCELSET( TAUP, I, 1, DESCE, ZERO ) END IF 10 CONTINUE * ELSE * * Reduce to lower bidiagonal form * CALL DESCSET( DESCD, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) DO 20 K = 1, M I = IA + K - 1 J = JA + K - 1 * * Generate elementary reflector G(i) to annihilate * A(i,ja+j:ja+n-1) * CALL PCLACGV( N-K+1, A, I, J, DESCA, DESCA( M_ ) ) CALL PCLARFG( N-K+1, ALPHA, I, J, A, I, $ MIN( J+1, JA+N-1 ), DESCA, DESCA( M_ ), TAUP ) CALL PSELSET( D, I, 1, DESCD, REAL( ALPHA ) ) CALL PCELSET( A, I, J, DESCA, ONE ) * * Apply G(i) to A(i:ia+m-1,j:ja+n-1) from the right * CALL PCLARF( 'Right', M-K, N-K+1, A, I, J, DESCA, $ DESCA( M_ ), TAUP, A, MIN( I+1, IA+M-1 ), J, $ DESCA, WORK ) CALL PCELSET( A, I, J, DESCA, CMPLX( REAL( ALPHA ) ) ) CALL PCLACGV( N-K+1, A, I, J, DESCA, DESCA( M_ ) ) * IF( K.LT.M ) THEN * * Generate elementary reflector H(i) to annihilate * A(i+2:ia+m-1,j) * CALL PCLARFG( M-K, ALPHA, I+1, J, A, $ MIN( I+2, IA+M-1 ), J, DESCA, 1, TAUQ ) CALL PSELSET( E, 1, J, DESCE, REAL( ALPHA ) ) CALL PCELSET( A, I+1, J, DESCA, ONE ) * * Apply H(i) to A(i+1:ia+m-1,j+1:ja+n-1) from the left * CALL PCLARFC( 'Left', M-K, N-K, A, I+1, J, DESCA, 1, $ TAUQ, A, I+1, J+1, DESCA, WORK ) CALL PCELSET( A, I+1, J, DESCA, CMPLX( REAL( ALPHA ) ) ) ELSE CALL PCELSET( TAUQ, 1, J, DESCE, ZERO ) END IF 20 CONTINUE END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGEBD2 * END scalapack-2.0.2/SRC/pcgebrd.f000644 000766 000024 00000040203 10363532303 016112 0ustar00juliestaff000000 000000 SUBROUTINE PCGEBRD( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL D( * ), E( * ) COMPLEX A( * ), TAUP( * ), TAUQ( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEBRD reduces a complex general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper or lower bidiagonal * form B by an unitary transformation: Q' * sub( A ) * P = B. * * If M >= N, B is upper bidiagonal; if M < N, B is lower bidiagonal. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ). On exit, if M >= N, * the diagonal and the first superdiagonal of sub( A ) are * overwritten with the upper bidiagonal matrix B; the elements * below the diagonal, with the array TAUQ, represent the * unitary matrix Q as a product of elementary reflectors, and * the elements above the first superdiagonal, with the array * TAUP, represent the orthogonal matrix P as a product of * elementary reflectors. If M < N, the diagonal and the first * subdiagonal are overwritten with the lower bidiagonal * matrix B; the elements below the first subdiagonal, with the * array TAUQ, represent the unitary matrix Q as a product of * elementary reflectors, and the elements above the diagonal, * with the array TAUP, represent the orthogonal matrix P as a * product of elementary reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) COMPLEX array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUQ is * tied to the distributed matrix A. See Further Details. * * TAUP (local output) COMPLEX array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix P. TAUP is * tied to the distributed matrix A. See Further Details. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB*( MpA0 + NqA0 + 1 ) + NqA0 * * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * If m >= n, * * Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors; * v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1); * u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in * A(ia+i-1,ja+i+1:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, * * Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors; * v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); * u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in * A(ia+i-1,ja+i:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) * ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) * ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) * ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) * ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) * ( v1 v2 v3 v4 v5 ) * * where d and e denote diagonal and off-diagonal elements of B, vi * denotes an element of the vector defining H(i), and ui an element of * the vector defining G(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICTXT, IINFO, IOFF, IPW, IPY, $ IW, J, JB, JS, JW, K, L, LWMIN, MN, MP, MYCOL, $ MYROW, NB, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCWX( DLEN_ ), DESCWY( DLEN_ ), IDUM1( 1 ), $ IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PCELSET, PCGEBD2, PCGEMM, PCLABRD, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCA( MB_ ) IOFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) MP = NUMROC( M+IOFF, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+IOFF, NB, MYCOL, IACOL, NPCOL ) LWMIN = NB*( MP+NQ+1 ) + NQ * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IOFF.NE.MOD( JA-1, DESCA( NB_ ) ) ) THEN INFO = -5 ELSE IF( NB.NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 12 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEBRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * MN = MIN( M, N ) IF( MN.EQ.0 ) $ RETURN * * Initialize parameters. * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPY = MP * NB + 1 IPW = NQ * NB + IPY * CALL DESCSET( DESCWX, M+IOFF, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCWY, NB, N+IOFF, NB, NB, IAROW, IACOL, ICTXT, $ NB ) * MP = NUMROC( M+IA-1, NB, MYROW, DESCA( RSRC_ ), NPROW ) NQ = NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) K = 1 JB = NB - IOFF IW = IOFF + 1 JW = IOFF + 1 * DO 10 L = 1, MN+IOFF-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce rows and columns i:i+nb-1 to bidiagonal form and return * the matrices X and Y which are needed to update the unreduced * part of the matrix. * CALL PCLABRD( M-K+1, N-K+1, JB, A, I, J, DESCA, D, E, TAUQ, $ TAUP, WORK, IW, JW, DESCWX, WORK( IPY ), IW, $ JW, DESCWY, WORK( IPW ) ) * * Update the trailing submatrix A(i+nb:ia+m-1,j+nb:ja+n-1), using * an update of the form A := A - V*Y' - X*U'. * CALL PCGEMM( 'No transpose', 'No transpose', M-K-JB+1, $ N-K-JB+1, JB, -ONE, A, I+JB, J, DESCA, $ WORK( IPY ), IW, JW+JB, DESCWY, ONE, A, I+JB, $ J+JB, DESCA ) CALL PCGEMM( 'No transpose', 'No transpose', M-K-JB+1, $ N-K-JB+1, JB, -ONE, WORK, IW+JB, JW, DESCWX, A, I, $ J+JB, DESCA, ONE, A, I+JB, J+JB, DESCA ) * * Copy last off-diagonal elements of B back into sub( A ). * IF( M.GE.N ) THEN JS = MIN( INDXG2L( I+JB-1, NB, 0, DESCA( RSRC_ ), NPROW ), $ MP ) IF( JS.GT.0 ) $ CALL PCELSET( A, I+JB-1, J+JB, DESCA, CMPLX( E( JS ) ) ) ELSE JS = MIN( INDXG2L( J+JB-1, NB, 0, DESCA( CSRC_ ), NPCOL ), $ NQ ) IF( JS.GT.0 ) $ CALL PCELSET( A, I+JB, J+JB-1, DESCA, CMPLX( E( JS ) ) ) END IF * K = K + JB JB = NB IW = 1 JW = 1 DESCWX( M_ ) = DESCWX( M_ ) - JB DESCWX( RSRC_ ) = MOD( DESCWX( RSRC_ ) + 1, NPROW ) DESCWX( CSRC_ ) = MOD( DESCWX( CSRC_ ) + 1, NPCOL ) DESCWY( N_ ) = DESCWY( N_ ) - JB DESCWY( RSRC_ ) = MOD( DESCWY( RSRC_ ) + 1, NPROW ) DESCWY( CSRC_ ) = MOD( DESCWY( CSRC_ ) + 1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the remainder of the matrix. * CALL PCGEBD2( M-K+1, N-K+1, A, IA+K-1, JA+K-1, DESCA, D, E, TAUQ, $ TAUP, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGEBRD * END scalapack-2.0.2/SRC/pcgecon.f000644 000766 000024 00000037455 11252745702 016150 0ustar00juliestaff000000 000000 SUBROUTINE PCGECON( NORM, N, A, IA, JA, DESCA, ANORM, RCOND, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, INFO, JA, LRWORK, LWORK, N REAL ANORM, RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL RWORK( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCGECON estimates the reciprocal of the condition number of a general * distributed complex matrix A(IA:IA+N-1,JA:JA+N-1), in either the * 1-norm or the infinity-norm, using the LU factorization computed by * PCGETRF. * * An estimate is obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), and * the reciprocal of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies whether the 1-norm condition number or the * infinity-norm condition number is required: * = '1' or 'O': 1-norm * = 'I': Infinity-norm * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). On entry, * this array contains the local pieces of the factors L and U * from the factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U; the * unit diagonal elements of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ANORM (global input) REAL * If NORM = '1' or 'O', the 1-norm of the original distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * If NORM = 'I', the infinity-norm of the original distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * * RCOND (global output) REAL * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + * MAX( 2, MAX(NB_A*CEIL(NPROW-1,NPCOL),LOCc(N+MOD(JA-1,NB_A)) + * NB_A*CEIL(NPCOL-1,NPROW)) ). * * LOCr and LOCc values can be computed using the ScaLAPACK * tool function NUMROC; NPROW and NPCOL can be determined by * calling the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) REAL array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= MAX( 1, 2*LOCc(N+MOD(JA-1,NB_A)) ). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, ONENRM CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPNL, IPNU, $ IPV, IPW, IPX, IROFF, IV, IX, IXX, JJA, JV, JX, $ KASE, KASE1, LRWMIN, LWMIN, MYCOL, MYROW, NP, $ NPCOL, NPMOD, NPROW, NQ, NQMOD REAL AINVNM, SCALE, SL, SMLNUM, SU COMPLEX WMAX, ZDUM * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, CGEBR2D, $ CGEBS2D, INFOG2L, PCAMAX, PCHK1MAT, $ PCLATRS, PCLACON, PCSRSCL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, ICHAR, MAX, MOD, REAL * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600 + CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN ONENRM = NORM.EQ.'1' .OR. LSAME( NORM, 'O' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = REAL( LWMIN ) LRWMIN = MAX( 1, 2*NQMOD ) RWORK( 1 ) = REAL( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.ONENRM .AND. .NOT.LSAME( NORM, 'I' ) ) THEN INFO = -1 ELSE IF( ANORM.LT.ZERO ) THEN INFO = -7 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF * IF( ONENRM ) THEN IDUM1( 1 ) = ICHAR( '1' ) ELSE IDUM1( 1 ) = ICHAR( 'I' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 IF( LRWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 12 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGECON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * RCOND = ZERO IF( N.EQ.0 ) THEN RCOND = ONE RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN ELSE IF( N.EQ.1 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPW = IPV + NP IPNL = 1 IPNU = IPNL + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Estimate the norm of inv(A). * AINVNM = ZERO NORMIN = 'N' IF( ONENRM ) THEN KASE1 = 1 ELSE KASE1 = 2 END IF KASE = 0 * 10 CONTINUE CALL PCLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), IX, JX, $ DESCX, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( KASE.EQ.KASE1 ) THEN * * Multiply by inv(L). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( 'Lower', 'No transpose', 'Unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, RWORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL * * Multiply by inv(U). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( 'Upper', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, RWORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(U'). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( 'Upper', 'Conjugate transpose', 'Non-unit', $ NORMIN, N, A, IA, JA, DESCA, WORK( IPX ), IX, $ JX, DESCX, SU, RWORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL * * Multiply by inv(L'). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( 'Lower', 'Conjugate transpose', 'Unit', $ NORMIN, N, A, IA, JA, DESCA, WORK( IPX ), $ IX, JX, DESCX, SL, RWORK( IPNL ), $ WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF * * Divide X by 1/(SL*SU) if doing so will not cause overflow. * SCALE = SL*SU NORMIN = 'Y' IF( SCALE.NE.ONE ) THEN CALL PCAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL CGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1 ) ELSE CALL CGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1, $ IAROW, MYCOL ) END IF END IF IF( SCALE.LT.CABS1( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PCSRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / AINVNM ) / ANORM * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PCGECON * END scalapack-2.0.2/SRC/pcgeequ.f000644 000766 000024 00000033021 10363532303 016135 0ustar00juliestaff000000 000000 SUBROUTINE PCGEEQU( M, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N REAL AMAX, COLCND, ROWCND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL C( * ), R( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCGEEQU computes row and column scalings intended to equilibrate an * M-by-N distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA:JA+N-1) and * reduce its condition number. R returns the row scale factors and C * the column scale factors, chosen to try to make the largest entry in * each row and column of the distributed matrix B with elements * B(i,j) = R(i) * A(i,j) * C(j) have absolute value 1. * * R(i) and C(j) are restricted to be between SMLNUM = smallest safe * number and BIGNUM = largest safe number. Use of these scaling * factors is not guaranteed to reduce the condition number of * sub( A ) but works well in practice. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ), the * local pieces of the M-by-N distributed matrix whose * equilibration factors are to be computed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * R (local output) REAL array, dimension LOCr(M_A) * If INFO = 0 or INFO > IA+M-1, R(IA:IA+M-1) contains the row * scale factors for sub( A ). R is aligned with the distributed * matrix A, and replicated across every process column. R is * tied to the distributed matrix A. * * C (local output) REAL array, dimension LOCc(N_A) * If INFO = 0, C(JA:JA+N-1) contains the column scale factors * for sub( A ). C is aligned with the distributed matrix A, and * replicated down every process row. C is tied to the distri- * buted matrix A. * * ROWCND (global output) REAL * If INFO = 0 or INFO > IA+M-1, ROWCND contains the ratio of * the smallest R(i) to the largest R(i) (IA <= i <= IA+M-1). * If ROWCND >= 0.1 and AMAX is neither too large nor too small, * it is not worth scaling by R(IA:IA+M-1). * * COLCND (global output) REAL * If INFO = 0, COLCND contains the ratio of the smallest C(j) * to the largest C(j) (JA <= j <= JA+N-1). If COLCND >= 0.1, it * is not worth scaling by C(JA:JA+N-1). * * AMAX (global output) REAL * Absolute value of largest distributed matrix element. If * AMAX is very close to overflow or very close to underflow, * the matrix should be scaled. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, and i is * <= M: the i-th row of the distributed matrix sub( A ) * is exactly zero, * > M: the (i-M)-th column of the distributed * matrix sub( A ) is exactly zero. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IDUMM, IIA, $ IOFFA, IROFF, J, JJA, LDA, MP, MYCOL, MYROW, $ NPCOL, NPROW, NQ REAL BIGNUM, RCMAX, RCMIN, SMLNUM COMPLEX ZDUM * .. * .. Local Arrays .. INTEGER DESCC( DLEN_ ), DESCR( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, IGAMX2D, $ INFOG2L, PCHK1MAT, PB_TOPGET, PXERBLA, SGAMN2D, $ SGAMX2D * .. * .. External Functions .. INTEGER INDXL2G, NUMROC REAL PSLAMCH EXTERNAL INDXL2G, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, MIN, MOD, REAL * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 0, IDUMM, IDUMM, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEEQU', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) THEN ROWCND = ONE COLCND = ONE AMAX = ZERO RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * * Get machine constants and local indexes. * SMLNUM = PSLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * * Assign descriptors for R and C arrays * CALL DESCSET( DESCR, M, 1, DESCA( MB_ ), 1, 0, 0, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCC, 1, N, 1, DESCA( NB_ ), 0, 0, ICTXT, 1 ) * * Compute row scale factors. * DO 10 I = IIA, IIA+MP-1 R( I ) = ZERO 10 CONTINUE * * Find the maximum element in each row. * IOFFA = (JJA-1)*LDA DO 30 J = JJA, JJA+NQ-1 DO 20 I = IIA, IIA+MP-1 R( I ) = MAX( R( I ), CABS1( A( IOFFA + I ) ) ) 20 CONTINUE IOFFA = IOFFA + LDA 30 CONTINUE CALL SGAMX2D( ICTXT, 'Rowwise', ROWCTOP, MP, 1, R( IIA ), $ MAX( 1, MP ), IDUMM, IDUMM, -1, -1, MYCOL ) * * Find the maximum and minimum scale factors. * RCMIN = BIGNUM RCMAX = ZERO DO 40 I = IIA, IIA+MP-1 RCMAX = MAX( RCMAX, R( I ) ) RCMIN = MIN( RCMIN, R( I ) ) 40 CONTINUE CALL SGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMAX, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) CALL SGAMN2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMIN, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) AMAX = RCMAX * IF( RCMIN.EQ.ZERO ) THEN * * Find the first zero scale factor and return an error code. * DO 50 I = IIA, IIA+MP-1 IF( R( I ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = INDXL2G( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) - IA + 1 50 CONTINUE CALL IGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, INFO, 1, $ IDUMM, IDUMM, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN ELSE * * Invert the scale factors. * DO 60 I = IIA, IIA+MP-1 R( I ) = ONE / MIN( MAX( R( I ), SMLNUM ), BIGNUM ) 60 CONTINUE * * Compute ROWCND = min(R(I)) / max(R(I)) * ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) * END IF * * Compute column scale factors * DO 70 J = JJA, JJA+NQ-1 C( J ) = ZERO 70 CONTINUE * * Find the maximum element in each column, * assuming the row scaling computed above. * IOFFA = (JJA-1)*LDA DO 90 J = JJA, JJA+NQ-1 DO 80 I = IIA, IIA+MP-1 C( J ) = MAX( C( J ), CABS1( A( IOFFA + I ) )*R( I ) ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE CALL SGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, NQ, C( JJA ), $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) * * Find the maximum and minimum scale factors. * RCMIN = BIGNUM RCMAX = ZERO DO 100 J = JJA, JJA+NQ-1 RCMIN = MIN( RCMIN, C( J ) ) RCMAX = MAX( RCMAX, C( J ) ) 100 CONTINUE CALL SGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMAX, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) CALL SGAMN2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMIN, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) * IF( RCMIN.EQ.ZERO ) THEN * * Find the first zero scale factor and return an error code. * DO 110 J = JJA, JJA+NQ-1 IF( C( J ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = M + INDXL2G( J, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) - JA + 1 110 CONTINUE CALL IGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, INFO, 1, $ IDUMM, IDUMM, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN ELSE * * Invert the scale factors. * DO 120 J = JJA, JJA+NQ-1 C( J ) = ONE / MIN( MAX( C( J ), SMLNUM ), BIGNUM ) 120 CONTINUE * * Compute COLCND = min(C(J)) / max(C(J)) * COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) * END IF * RETURN * * End of PCGEEQU * END scalapack-2.0.2/SRC/pcgehd2.f000644 000766 000024 00000026773 10363532303 016040 0ustar00juliestaff000000 000000 SUBROUTINE PCGEHD2( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEHD2 reduces a complex general distributed matrix sub( A ) * to upper Hessenberg form H by an unitary similarity transformation: * Q' * sub( A ) * Q = H, where * sub( A ) = A(IA+N-1:IA+N-1,JA+N-1:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows IA:IA+ILO-2 and IA+IHI:IA+N-1 and columns JA:JA+JLO-2 * and JA+JHI:JA+N-1. See Further Details. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) to be reduced. On exit, * the upper triangle and the first subdiagonal of sub( A ) are * overwritten with the upper Hessenberg matrix H, and the ele- * ments below the first subdiagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). Elements JA:JA+ILO-2 and JA+IHI:JA+N-2 of TAU are * set to zero. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB + MAX( NpA0, NB ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of (ihi-ilo) elementary * reflectors * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on * exit in A(ia+ilo+i:ia+ihi-1,ja+ilo+i-2), and tau in TAU(ja+ilo+i-2). * * The contents of A(IA:IA+N-1,JA:JA+N-1) are illustrated by the follo- * wing example, with n = 7, ilo = 2 and ihi = 6: * * on entry on exit * * ( a a a a a a a ) ( a a h h h h a ) * ( a a a a a a ) ( a h h h h a ) * ( a a a a a a ) ( h h h h h h ) * ( a a a a a a ) ( v2 h h h h h ) * ( a a a a a a ) ( v2 v3 h h h h ) * ( a a a a a a ) ( v2 v3 v4 h h h ) * ( a ) ( a ) * * where a denotes an element of the original matrix sub( A ), h denotes * a modified element of the upper Hessenberg matrix H, and vi denotes * an element of the vector defining H(ja+ilo+i-2). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IAROW, ICOFFA, ICTXT, IROFFA, J, K, LWMIN, $ MYCOL, MYROW, NPA0, NPCOL, NPROW COMPLEX AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PCELSET, $ PCLARF, PCLARFC, PCLARFG, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) NPA0 = NUMROC( IHI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = DESCA( NB_ ) + MAX( NPA0, DESCA( NB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -2 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEHD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * DO 10 K = ILO, IHI-1 I = IA + K - 1 J = JA + K - 1 * * Compute elementary reflector H(j) to annihilate * A(i+2:ihi+ia-1,j) * CALL PCLARFG( IHI-K, AII, I+1, J, A, MIN( I+2, N+IA-1 ), J, $ DESCA, 1, TAU ) CALL PCELSET( A, I+1, J, DESCA, ONE ) * * Apply H(k) to A(ia:ihi+ia-1,j+1:ihi+ja-1) from the right * CALL PCLARF( 'Right', IHI, IHI-K, A, I+1, J, DESCA, 1, TAU, A, $ IA, J+1, DESCA, WORK ) * * Apply H(j) to A(i+1:ia+ihi-1,j+1:ja+n-1) from the left * CALL PCLARFC( 'Left', IHI-K, N-K, A, I+1, J, DESCA, 1, TAU, A, $ I+1, J+1, DESCA, WORK ) * CALL PCELSET( A, I+1, J, DESCA, AII ) 10 CONTINUE * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGEHD2 * END scalapack-2.0.2/SRC/pcgehrd.f000644 000766 000024 00000035564 10363532303 016136 0ustar00juliestaff000000 000000 SUBROUTINE PCGEHRD( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEHRD reduces a complex general distributed matrix sub( A ) * to upper Hessenberg form H by an unitary similarity transformation: * Q' * sub( A ) * Q = H, where * sub( A ) = A(IA+N-1:IA+N-1,JA+N-1:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows IA:IA+ILO-2 and IA+IHI:IA+N-1 and columns JA:JA+ILO-2 * and JA+IHI:JA+N-1. See Further Details. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) to be reduced. On exit, * the upper triangle and the first subdiagonal of sub( A ) are * overwritten with the upper Hessenberg matrix H, and the ele- * ments below the first subdiagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). Elements JA:JA+ILO-2 and JA+IHI:JA+N-2 of TAU are * set to zero. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB*NB + NB*MAX( IHIP+1, IHLP+INLQ ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ), * ICOFFA = MOD( JA-1, NB ), IOFF = MOD( IA+ILO-2, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * ILROW = INDXG2P( IA+ILO-1, NB, MYROW, RSRC_A, NPROW ), * IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, ILROW, NPROW ), * ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, CSRC_A, NPCOL ), * INLQ = NUMROC( N-ILO+IOFF+1, NB, MYCOL, ILCOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of (ihi-ilo) elementary * reflectors * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:I) = 0, v(I+1) = 1 and v(IHI+1:N) = 0; v(I+2:IHI) is stored on * exit in A(IA+ILO+I:IA+IHI-1,JA+ILO+I-2), and tau in TAU(JA+ILO+I-2). * * The contents of A(IA:IA+N-1,JA:JA+N-1) are illustrated by the follow- * ing example, with N = 7, ILO = 2 and IHI = 6: * * on entry on exit * * ( a a a a a a a ) ( a a h h h h a ) * ( a a a a a a ) ( a h h h h a ) * ( a a a a a a ) ( h h h h h h ) * ( a a a a a a ) ( v2 h h h h h ) * ( a a a a a a ) ( v2 v3 h h h h ) * ( a a a a a a ) ( v2 v3 v4 h h h ) * ( a ) ( a ) * * where a denotes an element of the original matrix sub( A ), H denotes * a modified element of the upper Hessenberg matrix H, and vi denotes * an element of the vector defining H(JA+ILO+I-2). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, IB, ICOFFA, ICTXT, IHIP, $ IHLP, IIA, IINFO, ILCOL, ILROW, IMCOL, INLQ, $ IOFF, IPT, IPW, IPY, IROFFA, J, JJ, JJA, JY, $ K, L, LWMIN, MYCOL, MYROW, NB, NPCOL, NPROW, $ NQ COMPLEX EI * .. * .. Local Arrays .. INTEGER DESCY( DLEN_ ), IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG1L, $ INFOG2L, PCGEMM, PCGEHD2, PCHK1MAT, $ PCLAHRD, PCLARFB, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, NB ) ICOFFA = MOD( JA-1, NB ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ) IOFF = MOD( IA+ILO-2, NB ) ILROW = INDXG2P( IA+ILO-1, NB, MYROW, DESCA( RSRC_ ), $ NPROW ) IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, ILROW, NPROW ) ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) INLQ = NUMROC( N-ILO+IOFF+1, NB, MYCOL, ILCOL, NPCOL ) LWMIN = NB*( NB + MAX( IHIP+1, IHLP+INLQ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -2 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA .OR. IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = ILO IDUM2( 1 ) = 2 IDUM1( 2 ) = IHI IDUM2( 2 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 10 CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEHRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Set elements JA:JA+ILO-2 and JA+JHI-1:JA+N-2 of TAU to zero. * NQ = NUMROC( JA+N-2, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL INFOG1L( JA+ILO-2, NB, NPCOL, MYCOL, DESCA( CSRC_ ), JJ, $ IMCOL ) DO 10 J = JJA, MIN( JJ, NQ ) TAU( J ) = ZERO 10 CONTINUE * CALL INFOG1L( JA+IHI-1, NB, NPCOL, MYCOL, DESCA( CSRC_ ), JJ, $ IMCOL ) DO 20 J = JJ, NQ TAU( J ) = ZERO 20 CONTINUE * * Quick return if possible * IF( IHI-ILO.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPT = 1 IPY = IPT + NB * NB IPW = IPY + IHIP * NB CALL DESCSET( DESCY, IHI+IROFFA, NB, NB, NB, IAROW, ILCOL, ICTXT, $ MAX( 1, IHIP ) ) * K = ILO IB = NB - IOFF JY = IOFF + 1 * * Loop over remaining block of columns * DO 30 L = 1, IHI-ILO+IOFF-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns j:j+ib-1 to Hessenberg form, returning the * matrices V and T of the block reflector H = I - V*T*V' * which performs the reduction, and also the matrix Y = A*V*T * CALL PCLAHRD( IHI, K, IB, A, IA, J, DESCA, TAU, WORK( IPT ), $ WORK( IPY ), 1, JY, DESCY, WORK( IPW ) ) * * Apply the block reflector H to A(ia:ia+ihi-1,j+ib:ja+ihi-1) * from the right, computing A := A - Y * V'. * V(i+ib,ib-1) must be set to 1. * CALL PCELSET2( EI, A, I+IB, J+IB-1, DESCA, ONE ) CALL PCGEMM( 'No transpose', 'Conjugate transpose', IHI, $ IHI-K-IB+1, IB, -ONE, WORK( IPY ), 1, JY, DESCY, $ A, I+IB, J, DESCA, ONE, A, IA, J+IB, DESCA ) CALL PCELSET( A, I+IB, J+IB-1, DESCA, EI ) * * Apply the block reflector H to A(i+1:ia+ihi-1,j+ib:ja+n-1) from * the left * CALL PCLARFB( 'Left', 'Conjugate transpose', 'Forward', $ 'Columnwise', IHI-K, N-K-IB+1, IB, A, I+1, J, $ DESCA, WORK( IPT ), A, I+1, J+IB, DESCA, $ WORK( IPY ) ) * K = K + IB IB = NB JY = 1 DESCY( CSRC_ ) = MOD( DESCY( CSRC_ ) + 1, NPCOL ) * 30 CONTINUE * * Use unblocked code to reduce the rest of the matrix * CALL PCGEHD2( N, K, IHI, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGEHRD * END scalapack-2.0.2/SRC/pcgelq2.f000644 000766 000024 00000025071 10363532303 016047 0ustar00juliestaff000000 000000 SUBROUTINE PCGELQ2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGELQ2 computes a LQ factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = L * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and below the diagonal of sub( A ) contain the M by min(M,N) * lower trapezoidal matrix L (L is lower triangular if M <= N); * the elements above the diagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * of the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia+k-1)' H(ia+k-2)' . . . H(ia)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in * A(ia+i-1,ja+i:ja+n-1), and tau in TAU(ia+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, J, K, LWMIN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ COMPLEX AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PCELSET, $ PCLACGV, PCLARF, PCLARFG, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQ + MAX( 1, MP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGELQ2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * K = MIN( M, N ) DO 10 I = IA, IA+K-1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * A(i,j+1:ja+n-1) * CALL PCLACGV( N-J+JA, A, I, J, DESCA, DESCA( M_ ) ) CALL PCLARFG( N-J+JA, AII, I, J, A, I, MIN( J+1, JA+N-1 ), $ DESCA, DESCA( M_ ), TAU ) * IF( I.LT.IA+M-1 ) THEN * * Apply H(i) to A(i+1:ia+m-1,j:ja+n-1) from the right * CALL PCELSET( A, I, J, DESCA, ONE ) CALL PCLARF( 'Right', M-I+IA-1, N-J+JA, A, I, J, DESCA, $ DESCA( M_ ), TAU, A, I+1, J, DESCA, WORK ) END IF CALL PCELSET( A, I, J, DESCA, AII ) CALL PCLACGV( N-J+JA, A, I, J, DESCA, DESCA( M_ ) ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGELQ2 * END scalapack-2.0.2/SRC/pcgelqf.f000644 000766 000024 00000027474 10363532303 016144 0ustar00juliestaff000000 000000 SUBROUTINE PCGELQF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGELQF computes a LQ factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = L * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and below the diagonal of sub( A ) contain the M by min(M,N) * lower trapezoidal matrix L (L is lower triangular if M <= N); * the elements above the diagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * of the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia+k-1)' H(ia+k-2)' . . . H(ia)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in * A(ia+i-1,ja+i:ja+n-1), and tau in TAU(ia+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IN, IPW, $ IROFF, J, K, LWMIN, MP0, MYCOL, MYROW, NPCOL, $ NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCGELQ2, $ PCLARFB, PCLARFT, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * * Handle the first block of rows separately * IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IB = IN - IA + 1 * * Compute the LQ factorization of the first block A(ia:in:ja:ja+n-1) * CALL PCGELQ2( IB, N, A, IA, JA, DESCA, TAU, WORK, LWORK, IINFO ) * IF( IA+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(ia) H(ia+1) . . . H(in) * CALL PCLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia+ib:ia+m-1,ja:ja+n-1) from the right * CALL PCLARFB( 'Right', 'No transpose', 'Forward', 'Rowwise', $ M-IB, N, IB, A, IA, JA, DESCA, WORK, A, IA+IB, $ JA, DESCA, WORK( IPW ) ) END IF * * Loop over the remaining blocks of rows * DO 10 I = IN+1, IA+K-1, DESCA( MB_ ) IB = MIN( K-I+IA, DESCA( MB_ ) ) J = JA + I - IA * * Compute the LQ factorization of the current block * A(i:i+ib-1:j:ja+n-1) * CALL PCGELQ2( IB, N-I+IA, A, I, J, DESCA, TAU, WORK, LWORK, $ IINFO ) * IF( I+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PCLARFT( 'Forward', 'Rowwise', N-I+IA, IB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(i+ib:ia+m-1,j:ja+n-1) from the right * CALL PCLARFB( 'Right', 'No transpose', 'Forward', 'Rowwise', $ M-I-IB+IA, N-J+JA, IB, A, I, J, DESCA, WORK, $ A, I+IB, J, DESCA, WORK( IPW ) ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGELQF * END scalapack-2.0.2/SRC/pcgels.f000644 000766 000024 00000055126 11312467374 016005 0ustar00juliestaff000000 000000 SUBROUTINE PCGELS( TRANS, M, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ), WORK( * ) * .. * * Purpose * ======= * * PCGELS solves overdetermined or underdetermined complex linear * systems involving an M-by-N matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1), * or its conjugate-transpose, using a QR or LQ factorization of * sub( A ). It is assumed that sub( A ) has full rank. * * The following options are provided: * * 1. If TRANS = 'N' and m >= n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || sub( B ) - sub( A )*X ||. * * 2. If TRANS = 'N' and m < n: find the minimum norm solution of * an underdetermined system sub( A ) * X = sub( B ). * * 3. If TRANS = 'C' and m >= n: find the minimum norm solution of * an undetermined system sub( A )**H * X = sub( B ). * * 4. If TRANS = 'C' and m < n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || sub( B ) - sub( A )**H * X ||. * * where sub( B ) denotes B( IB:IB+M-1, JB:JB+NRHS-1 ) when TRANS = 'N' * and B( IB:IB+N-1, JB:JB+NRHS-1 ) otherwise. Several right hand side * vectors b and solution vectors x can be handled in a single call; * When TRANS = 'N', the solution vectors are stored as the columns of * the N-by-NRHS right hand side matrix sub( B ) and the M-by-NRHS * right hand side matrix sub( B ) otherwise. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER * = 'N': the linear system involves sub( A ); * = 'C': the linear system involves sub( A )**H. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of * rows of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e. the number of columns * of the distributed submatrices sub( B ) and X. NRHS >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of local dimension * ( LLD_A, LOCc(JA+N-1) ). On entry, the M-by-N matrix A. * if M >= N, sub( A ) is overwritten by details of its QR * factorization as returned by PCGEQRF; * if M < N, sub( A ) is overwritten by details of its LQ * factorization as returned by PCGELQF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX pointer into the * local memory to an array of local dimension * (LLD_B, LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the distributed matrix B of right hand side * vectors, stored columnwise; * sub( B ) is M-by-NRHS if TRANS='N', and N-by-NRHS otherwise. * On exit, sub( B ) is overwritten by the solution vectors, * stored columnwise: if TRANS = 'N' and M >= N, rows 1 to N * of sub( B ) contain the least squares solution vectors; the * residual sum of squares for the solution in each column is * given by the sum of squares of elements N+1 to M in that * column; if TRANS = 'N' and M < N, rows 1 to N of sub( B ) * contain the minimum norm solution vectors; if TRANS = 'C' * and M >= N, rows 1 to M of sub( B ) contain the minimum norm * solution vectors; if TRANS = 'C' and M < N, rows 1 to M of * sub( B ) contain the least squares solution vectors; the * residual sum of squares for the solution in each column is * given by the sum of squares of elements M+1 to N in that * column. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= LTAU + MAX( LWF, LWS ) where * If M >= N, then * LTAU = NUMROC( JA+MIN(M,N)-1, NB_A, MYCOL, CSRC_A, NPCOL ), * LWF = NB_A * ( MpA0 + NqA0 + NB_A ) * LWS = MAX( (NB_A*(NB_A-1))/2, (NRHSqB0 + MpB0)*NB_A ) + * NB_A * NB_A * Else * LTAU = NUMROC( IA+MIN(M,N)-1, MB_A, MYROW, RSRC_A, NPROW ), * LWF = MB_A * ( MpA0 + NqA0 + MB_A ) * LWS = MAX( (MB_A*(MB_A-1))/2, ( NpB0 + MAX( NqA0 + * NUMROC( NUMROC( N+IROFFB, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NRHSqB0 ) )*MB_A ) + * MB_A * MB_A * End if * * where LCMP = LCM / NPROW with LCM = ILCM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * MpB0 = NUMROC( M+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NpB0 = NUMROC( N+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NRHSqB0 = NUMROC( NRHS+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, TPSD INTEGER BROW, IACOL, IAROW, IASCL, IBCOL, IBROW, IBSCL, $ ICOFFA, ICOFFB, ICTXT, IPW, IROFFA, IROFFB, $ LCM, LCMP, LTAU, LWF, LWMIN, LWS, MPA0, MPB0, $ MYCOL, MYROW, NPB0, NPCOL, NPROW, NQA0, $ NRHSQB0, SCLLEN REAL ANRM, BIGNUM, BNRM, SMLNUM * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) REAL RWORK( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM INTEGER INDXG2P, NUMROC REAL PCLANGE, PSLAMCH EXTERNAL ILCM, INDXG2P, LSAME, NUMROC, PCLANGE, $ PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCGELQF, $ PCGEQRF, PSLABAD, PCLASCL, PCLASET, $ PCTRSM, PCUNMLQ, PCUNMQR, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 800 + CTXT_ ) ELSE CALL CHK1MAT( M, 2, N, 3, IA, JA, DESCA, 8, INFO ) IF ( M .GE. N ) THEN CALL CHK1MAT( M, 2, NRHS, 4, IB, JB, DESCB, 12, INFO ) ELSE CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 12, INFO ) ENDIF IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( IA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( IB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) NRHSQB0 = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, $ NPCOL ) IF( M.GE.N ) THEN MPB0 = NUMROC( M+IROFFB, DESCB( MB_ ), MYROW, IBROW, $ NPROW ) LTAU = NUMROC( JA+MIN(M,N)-1, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) LWF = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) LWS = MAX( ( DESCA( NB_ )*( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPB0 + NRHSQB0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ )*DESCA( NB_ ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW NPB0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, $ NPROW ) LTAU = NUMROC( IA+MIN(M,N)-1, DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) LWF = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) LWS = MAX( ( DESCA( MB_ )*( DESCA( MB_ ) - 1 ) ) / 2, $ ( NPB0 + MAX( NQA0 + NUMROC( NUMROC( N+IROFFB, $ DESCA( MB_ ), 0, 0, NPROW ), DESCA( MB_ ), 0, 0, $ LCMP ), NRHSQB0 ) )*DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF LWMIN = LTAU + MAX( LWF, LWS ) WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) * TPSD = .TRUE. IF( LSAME( TRANS, 'N' ) ) $ TPSD = .FALSE. * IF( .NOT.( LSAME( TRANS, 'N' ) .OR. $ LSAME( TRANS, 'C' ) ) ) THEN INFO = -1 ELSE IF( M.LT.0 ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( M.GE.N .AND. IROFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( M.GE.N .AND. IAROW.NE.IBROW ) THEN INFO = -10 ELSE IF( M.LT.N .AND. ICOFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( M.GE.N .AND. DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1200 + MB_ ) ELSE IF( M.LT.N .AND. DESCA( NB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1200 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1200 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF END IF * IF( .NOT.TPSD ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 14 CALL PCHK2MAT( M, 2, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, 4, $ IB, JB, DESCB, 12, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGELS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( MIN( M, N, NRHS ).EQ.0 ) THEN CALL PCLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, $ IB, JB, DESCB ) RETURN END IF * * Get machine parameters * SMLNUM = PSLAMCH( ICTXT, 'S' ) SMLNUM = SMLNUM / PSLAMCH( ICTXT, 'P' ) BIGNUM = ONE / SMLNUM CALL PSLABAD( ICTXT, SMLNUM, BIGNUM ) * * Scale A, B if max entry outside range [SMLNUM,BIGNUM] * ANRM = PCLANGE( 'M', M, N, A, IA, JA, DESCA, RWORK ) IASCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN * * Scale matrix norm up to SMLNUM * CALL PCLASCL( 'G', ANRM, SMLNUM, M, N, A, IA, JA, DESCA, $ INFO ) IASCL = 1 ELSE IF( ANRM.GT.BIGNUM ) THEN * * Scale matrix norm down to BIGNUM * CALL PCLASCL( 'G', ANRM, BIGNUM, M, N, A, IA, JA, DESCA, $ INFO ) IASCL = 2 ELSE IF( ANRM.EQ.ZERO ) THEN * * Matrix all zero. Return zero solution. * CALL PCLASET( 'F', MAX( M, N ), NRHS, CZERO, CZERO, B, IB, $ JB, DESCB ) GO TO 10 END IF * BROW = M IF( TPSD ) $ BROW = N * BNRM = PCLANGE( 'M', BROW, NRHS, B, IB, JB, DESCB, RWORK ) * IBSCL = 0 IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN * * Scale matrix norm up to SMLNUM * CALL PCLASCL( 'G', BNRM, SMLNUM, BROW, NRHS, B, IB, JB, $ DESCB, INFO ) IBSCL = 1 ELSE IF( BNRM.GT.BIGNUM ) THEN * * Scale matrix norm down to BIGNUM * CALL PCLASCL( 'G', BNRM, BIGNUM, BROW, NRHS, B, IB, JB, $ DESCB, INFO ) IBSCL = 2 END IF * IPW = LTAU + 1 * IF( M.GE.N ) THEN * * compute QR factorization of A * CALL PCGEQRF( M, N, A, IA, JA, DESCA, WORK, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least N, optimally N*NB * IF( .NOT.TPSD ) THEN * * Least-Squares Problem min || A * X - B || * * B(IB:IB+M-1,JB:JB+NRHS-1) := Q' * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PCUNMQR( 'Left', 'Conjugate transpose', M, NRHS, N, A, $ IA, JA, DESCA, WORK, B, IB, JB, DESCB, $ WORK( IPW ), LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * * B(IB:IB+N-1,JB:JB+NRHS-1) := inv(R) * * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PCTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, CONE, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * SCLLEN = N * ELSE * * Overdetermined system of equations sub( A )' * X = sub( B ) * * sub( B ) := inv(R') * sub( B ) * CALL PCTRSM( 'Left', 'Upper', 'Conjugate transpose', $ 'Non-unit', N, NRHS, CONE, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) * * B(IB+N:IB+M-1,JB:JB+NRHS-1) = ZERO * CALL PCLASET( 'All', M-N, NRHS, CZERO, CZERO, B, IB+N, JB, $ DESCB ) * * B(IB:IB+M-1,JB:JB+NRHS-1) := Q(1:N,:) * * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PCUNMQR( 'Left', 'No transpose', M, NRHS, N, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * SCLLEN = M * END IF * ELSE * * Compute LQ factorization of sub( A ) * CALL PCGELQF( M, N, A, IA, JA, DESCA, WORK, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least M, optimally M*NB. * IF( .NOT.TPSD ) THEN * * underdetermined system of equations sub( A ) * X = sub( B ) * * B(IB:IB+M-1,JB:JB+NRHS-1) := inv(L) * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PCTRSM( 'Left', 'Lower', 'No transpose', 'Non-unit', M, $ NRHS, CONE, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * B(IB+M:IB+N-1,JB:JB+NRHS-1) = 0 * CALL PCLASET( 'All', N-M, NRHS, CZERO, CZERO, B, IB+M, JB, $ DESCB ) * * B(IB:IB+N-1,JB:JB+NRHS-1) := Q(1:N,:)' * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PCUNMLQ( 'Left', 'Conjugate transpose', N, NRHS, M, A, $ IA, JA, DESCA, WORK, B, IB, JB, DESCB, $ WORK( IPW ), LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * SCLLEN = N * ELSE * * overdetermined system min || A' * X - B || * * B(IB:IB+N-1,JB:JB+NRHS-1) := Q * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PCUNMLQ( 'Left', 'No transpose', N, NRHS, M, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * * B(IB:IB+M-1,JB:JB+NRHS-1) := inv(L') * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PCTRSM( 'Left', 'Lower', 'Conjugate transpose', $ 'Non-unit', M, NRHS, CONE, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) * SCLLEN = M * END IF * END IF * * Undo scaling * IF( IASCL.EQ.1 ) THEN CALL PCLASCL( 'G', ANRM, SMLNUM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) ELSE IF( IASCL.EQ.2 ) THEN CALL PCLASCL( 'G', ANRM, BIGNUM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) END IF IF( IBSCL.EQ.1 ) THEN CALL PCLASCL( 'G', SMLNUM, BNRM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) ELSE IF( IBSCL.EQ.2 ) THEN CALL PCLASCL( 'G', BIGNUM, BNRM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) END IF * 10 CONTINUE * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGELS * END scalapack-2.0.2/SRC/pcgeql2.f000644 000766 000024 00000030126 10363532303 016044 0ustar00juliestaff000000 000000 SUBROUTINE PCGEQL2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEQL2 computes a QL factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * L. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M >= N, the * lower triangle of the distributed submatrix * A( IA+M-N:IA+M-1, JA:JA+N-1 ) contains the N-by-N lower * triangular matrix L; if M <= N, the elements on and below * the (N-M)-th superdiagonal contain the M by N lower * trapezoidal matrix L; the remaining elements, with the * array TAU, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension LOCc(JA+N-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Mp0 + MAX( 1, Nq0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja+k-1) . . . H(ja+1) H(ja), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in * A(ia:ia+m-k+i-2,ja+n-k+i-1), and tau in TAU(ja+n-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICTXT, II, J, JJ, K, LWMIN, $ MP, MYCOL, MYROW, NPCOL, NPROW, NQ COMPLEX AJJ, ALPHA * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, CGEBR2D, $ CGEBS2D, CLARFG, CSCAL, $ INFOG2L, PCELSET, PCLARFC, $ PCLARFG, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MP + MAX( 1, NQ ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEQL2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( DESCA( M_ ).EQ.1 ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) IACOL = INDXG2P( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN I = II+(JJ+NQ-2)*DESCA( LLD_ ) AJJ = A( I ) CALL CLARFG( 1, AJJ, A( I ), 1, TAU( JJ+NQ-1 ) ) IF( N.GT.1 ) THEN ALPHA = ONE - CONJG( TAU( JJ+NQ-1 ) ) CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1 ) CALL CSCAL( NQ-1, ALPHA, A( II+(JJ-1)*DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ+NQ-1 ), 1 ) A( I ) = AJJ ELSE IF( N.GT.1 ) THEN CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, IAROW, IACOL ) CALL CSCAL( NQ, ALPHA, A( II+(JJ-1)*DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ+NQ-1 ), 1, IAROW, IACOL ) END IF * ELSE * K = MIN( M, N ) DO 10 J = JA+K-1, JA, -1 I = IA + J - JA * * Generate elementary reflector H(j) to annihilate * A(ia:i+m-k-1,j+n-k) * CALL PCLARFG( M-K+I-IA+1, AJJ, M-K+I, N-K+J, A, IA, $ N-K+J, DESCA, 1, TAU ) * * Apply H(j)' to A(ia:i+m-k,ja:j+n-k-1) from the left * CALL PCELSET( A, I+M-K, J+N-K, DESCA, ONE ) CALL PCLARFC( 'Left', M-K+I-IA+1, N-K+J-JA, A, IA, N-K+J, $ DESCA, 1, TAU, A, IA, JA, DESCA, WORK ) CALL PCELSET( A, I+M-K, J+N-K, DESCA, AJJ ) * 10 CONTINUE * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGEQL2 * END scalapack-2.0.2/SRC/pcgeqlf.f000644 000766 000024 00000027333 10363532303 016136 0ustar00juliestaff000000 000000 SUBROUTINE PCGEQLF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEQLF computes a QL factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * L. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M >= N, the * lower triangle of the distributed submatrix * A( IA+M-N:IA+M-1, JA:JA+N-1 ) contains the N-by-N lower * triangular matrix L; if M <= N, the elements on and below * the (N-M)-th superdiagonal contain the M by N lower * trapezoidal matrix L; the remaining elements, with the * array TAU, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension LOCc(JA+N-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( Mp0 + Nq0 + NB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja+k-1) . . . H(ja+1) H(ja), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in * A(ia:ia+m-k+i-2,ja+n-k+i-1), and tau in TAU(ja+n-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, IINFO, ICTXT, IPW, J, JB, JL, JN, $ K, LWMIN, MP0, MU, MYCOL, MYROW, NPCOL, NPROW, $ NQ0, NU * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCGEQL2, $ PCLARFB, PCLARFT, PB_TOPGET, PB_TOPSET, $ PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MP0 + NQ0 + DESCA( NB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEQLF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JL = MAX( ( (JA+N-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( JL.GE.JN+1 ) THEN * * Use blocked code initially * DO 10 J = JL, JN+1, -DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Compute the QL factorization of the current block * A(ia:ia+m-n+j+jb-ja-1,j:j+jb-1) * CALL PCGEQL2( M-N+J+JB-JA, JB, A, IA, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * IF( J.GT.JA ) THEN * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PCLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:ia+m-n+j+jb-ja-1,ja:j-1) from the * left * CALL PCLARFB( 'Left', 'Conjugate transpose', 'Backward', $ 'Columnwise', M-N+J+JB-JA, J-JA, JB, A, IA, $ J, DESCA, WORK, A, IA, JA, DESCA, $ WORK( IPW ) ) END IF * 10 CONTINUE * MU = M - N + JN - JA + 1 NU = JN - JA + 1 * ELSE * MU = M NU = N * END IF * * Use unblocked code to factor the last or only block * IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL PCGEQL2( MU, NU, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGEQLF * END scalapack-2.0.2/SRC/pcgeqpf.f000644 000766 000024 00000052341 10363532303 016137 0ustar00juliestaff000000 000000 SUBROUTINE PCGEQPF( M, N, A, IA, JA, DESCA, IPIV, TAU, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 14, 2000 * * .. Scalar Arguments .. INTEGER IA, JA, INFO, LRWORK, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) REAL RWORK( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEQPF computes a QR factorization with column pivoting of a * M-by-N distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1): * * sub( A ) * P = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension LOCc(JA+N-1). * On exit, if IPIV(I) = K, the local i-th column of sub( A )*P * was the global K-th column of sub( A ). IPIV is tied to the * distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX(3,Mp0 + Nq0). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) REAL array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCc(JA+N-1)+Nq0. * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * LOCc(JA+N-1) = NUMROC( JA+N-1, NB_A, MYCOL, CSRC_A, NPCOL ) * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(1) H(2) . . . H(n) * * Each H(i) has the form * * H = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in * A(ia+i-1:ia+m-1,ja+i-1). * * The matrix P is represented in jpvt as follows: If * jpvt(j) = i * then the jth column of P is the ith canonical unit vector. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFF, ICTXT, ICURROW, $ ICURCOL, II, IIA, IOFFA, IPCOL, IROFF, ITEMP, $ J, JB, JJ, JJA, JJPVT, JN, KB, K, KK, KSTART, $ KSTEP, LDA, LL, LRWMIN, LWMIN, MN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ, NQ0, PVT REAL TEMP, TEMP2 COMPLEX AJJ, ALPHA * .. * .. Local Arrays .. INTEGER DESCN( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CCOPY, CGEBR2D, CGEBS2D, $ CGERV2D, CGESD2D, CHK1MAT, CLARFG, $ CSWAP, DESCSET, IGERV2D, IGESD2D, INFOG1L, $ INFOG2L, PCELSET, PCHK1MAT, PCLARFC, $ PCLARFG, PSAMAX, PSCNRM2, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, CONJG, IFIX, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) NQ0 = NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) LWMIN = MAX( 3, MP + NQ ) LRWMIN = NQ0 + NQ * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) RWORK( 1 ) = REAL( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 10 IF( LRWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 12 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEQPF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF MN = MIN( M, N ) * * Initialize the array of pivots * LDA = DESCA( LLD_ ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) KSTEP = NPCOL * DESCA( NB_ ) * IF( MYCOL.EQ.IACOL ) THEN * * Handle first block separately * JB = JN - JA + 1 DO 10 LL = JJA, JJA+JB-1 IPIV( LL ) = JA + LL - JJA 10 CONTINUE KSTART = JN + KSTEP - DESCA( NB_ ) * * Loop over remaining block of columns * DO 30 KK = JJA+JB, JJA+NQ-1, DESCA( NB_ ) KB = MIN( JJA+NQ-KK, DESCA( NB_ ) ) DO 20 LL = KK, KK+KB-1 IPIV( LL ) = KSTART+LL-KK+1 20 CONTINUE KSTART = KSTART + KSTEP 30 CONTINUE ELSE KSTART = JN + ( MOD( MYCOL-IACOL+NPCOL, NPCOL )-1 )* $ DESCA( NB_ ) DO 50 KK = JJA, JJA+NQ-1, DESCA( NB_ ) KB = MIN( JJA+NQ-KK, DESCA( NB_ ) ) DO 40 LL = KK, KK+KB-1 IPIV( LL ) = KSTART+LL-KK+1 40 CONTINUE KSTART = KSTART + KSTEP 50 CONTINUE END IF * * Initialize partial column norms, handle first block separately * CALL DESCSET( DESCN, 1, DESCA( N_ ), 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), ICTXT, 1 ) * JJ = JJA IF( MYCOL.EQ.IACOL ) THEN DO 60 KK = 0, JB-1 CALL PSCNRM2( M, RWORK( JJ+KK ), A, IA, JA+KK, DESCA, 1 ) RWORK( NQ+JJ+KK ) = RWORK( JJ+KK ) 60 CONTINUE JJ = JJ + JB END IF ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining blocks of columns * DO 80 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 70 KK = 0, JB-1 CALL PSCNRM2( M, RWORK( JJ+KK ), A, IA, J+KK, DESCA, 1 ) RWORK( NQ+JJ+KK ) = RWORK( JJ+KK ) 70 CONTINUE JJ = JJ + JB END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) 80 CONTINUE * * Compute factorization * DO 120 J = JA, JA+MN-1 I = IA + J - JA * CALL INFOG1L( J, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), $ JJ, ICURCOL ) K = JA + N - J IF( K.GT.1 ) THEN CALL PSAMAX( K, TEMP, PVT, RWORK, 1, J, DESCN, $ DESCN( M_ ) ) ELSE PVT = J END IF IF( J.NE.PVT ) THEN CALL INFOG1L( PVT, DESCA( NB_ ), NPCOL, MYCOL, $ DESCA( CSRC_ ), JJPVT, IPCOL ) IF( ICURCOL.EQ.IPCOL ) THEN IF( MYCOL.EQ.ICURCOL ) THEN CALL CSWAP( MP, A( IIA+(JJ-1)*LDA ), 1, $ A( IIA+(JJPVT-1)*LDA ), 1 ) ITEMP = IPIV( JJPVT ) IPIV( JJPVT ) = IPIV( JJ ) IPIV( JJ ) = ITEMP RWORK( JJPVT ) = RWORK( JJ ) RWORK( NQ+JJPVT ) = RWORK( NQ+JJ ) END IF ELSE IF( MYCOL.EQ.ICURCOL ) THEN * CALL CGESD2D( ICTXT, MP, 1, A( IIA+(JJ-1)*LDA ), LDA, $ MYROW, IPCOL ) WORK( 1 ) = CMPLX( REAL( IPIV( JJ ) ) ) WORK( 2 ) = CMPLX( RWORK( JJ ) ) WORK( 3 ) = CMPLX( RWORK( JJ + NQ ) ) CALL CGESD2D( ICTXT, 3, 1, WORK, 3, MYROW, IPCOL ) * CALL CGERV2D( ICTXT, MP, 1, A( IIA+(JJ-1)*LDA ), LDA, $ MYROW, IPCOL ) CALL IGERV2D( ICTXT, 1, 1, IPIV( JJ ), 1, MYROW, $ IPCOL ) * ELSE IF( MYCOL.EQ.IPCOL ) THEN * CALL CGESD2D( ICTXT, MP, 1, A( IIA+(JJPVT-1)*LDA ), $ LDA, MYROW, ICURCOL ) CALL IGESD2D( ICTXT, 1, 1, IPIV( JJPVT ), 1, MYROW, $ ICURCOL ) * CALL CGERV2D( ICTXT, MP, 1, A( IIA+(JJPVT-1)*LDA ), $ LDA, MYROW, ICURCOL ) CALL CGERV2D( ICTXT, 3, 1, WORK, 3, MYROW, ICURCOL ) IPIV( JJPVT ) = IFIX( REAL( WORK( 1 ) ) ) RWORK( JJPVT ) = REAL( WORK( 2 ) ) RWORK( JJPVT+NQ ) = REAL( WORK( 3 ) ) * END IF * END IF * END IF * * Generate elementary reflector H(i) * CALL INFOG1L( I, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ II, ICURROW ) IF( DESCA( M_ ).EQ.1 ) THEN IF( MYROW.EQ.ICURROW ) THEN IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*DESCA( LLD_ ) AJJ = A( IOFFA ) CALL CLARFG( 1, AJJ, A( IOFFA ), 1, TAU( JJ ) ) IF( N.GT.1 ) THEN ALPHA = CMPLX( ONE ) - CONJG( TAU( JJ ) ) CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1 ) CALL CSCAL( NQ-JJ, ALPHA, A( IOFFA+DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ ), 1 ) A( IOFFA ) = AJJ ELSE IF( N.GT.1 ) THEN CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, ICURROW, ICURCOL ) CALL CSCAL( NQ-JJ+1, ALPHA, A( I ), DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.ICURCOL ) THEN CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), $ 1, ICURROW, ICURCOL ) END IF * ELSE * CALL PCLARFG( M-J+JA, AJJ, I, J, A, MIN( I+1, IA+M-1 ), J, $ DESCA, 1, TAU ) IF( J.LT.JA+N-1 ) THEN * * Apply H(i) to A(ia+j-ja:ia+m-1,j+1:ja+n-1) from the left * CALL PCELSET( A, I, J, DESCA, CMPLX( ONE ) ) CALL PCLARFC( 'Left', M-J+JA, JA+N-1-J, A, I, J, DESCA, $ 1, TAU, A, I, J+1, DESCA, WORK ) END IF CALL PCELSET( A, I, J, DESCA, AJJ ) * END IF * * Update partial columns norms * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 IF( MOD( J, DESCA( NB_ ) ).EQ.0 ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) IF( (JJA+NQ-JJ).GT.0 ) THEN IF( MYROW.EQ.ICURROW ) THEN CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, JJA+NQ-JJ, $ A( II+( MIN( JJA+NQ-1, JJ )-1 )*LDA ), $ LDA ) CALL CCOPY( JJA+NQ-JJ, A( II+( MIN( JJA+NQ-1, JJ ) $ -1)*LDA ), LDA, WORK( MIN( JJA+NQ-1, JJ ) ), $ 1 ) ELSE CALL CGEBR2D( ICTXT, 'Columnwise', ' ', JJA+NQ-JJ, 1, $ WORK( MIN( JJA+NQ-1, JJ ) ), MAX( 1, NQ ), $ ICURROW, MYCOL ) END IF END IF * JN = MIN( ICEIL( J+1, DESCA( NB_ ) ) * DESCA( NB_ ), $ JA + N - 1 ) IF( MYCOL.EQ.ICURCOL ) THEN DO 90 LL = JJ, JJ + JN - J - 1 IF( RWORK( LL ).NE.ZERO ) THEN TEMP = ONE-( ABS( WORK( LL ) ) / RWORK( LL ) )**2 TEMP = MAX( TEMP, ZERO ) TEMP2 = ONE + 0.05E+0*TEMP* $ ( RWORK( LL ) / RWORK( NQ+LL ) )**2 IF( TEMP2.EQ.ONE ) THEN IF( IA+M-1.GT.I ) THEN CALL PSCNRM2( IA+M-I-1, RWORK( LL ), A, $ I+1, J+LL-JJ, DESCA, 1 ) RWORK( NQ+LL ) = RWORK( LL ) ELSE RWORK( LL ) = ZERO RWORK( NQ+LL ) = ZERO END IF ELSE RWORK( LL ) = RWORK( LL ) * SQRT( TEMP ) END IF END IF 90 CONTINUE JJ = JJ + JN - J END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 110 K = JN+1, JA+N-1, DESCA( NB_ ) KB = MIN( JA+N-K, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 100 LL = JJ, JJ+KB-1 IF( RWORK(LL).NE.ZERO ) THEN TEMP = ONE-( ABS( WORK( LL ) ) / RWORK( LL ) )**2 TEMP = MAX( TEMP, ZERO ) TEMP2 = ONE + 0.05E+0*TEMP* $ ( RWORK( LL ) / RWORK( NQ+LL ) )**2 IF( TEMP2.EQ.ONE ) THEN IF( IA+M-1.GT.I ) THEN CALL PSCNRM2( IA+M-I-1, RWORK( LL ), A, $ I+1, K+LL-JJ, DESCA, 1 ) RWORK( NQ+LL ) = RWORK( LL ) ELSE RWORK( LL ) = ZERO RWORK( NQ+LL ) = ZERO END IF ELSE RWORK( LL ) = RWORK( LL ) * SQRT( TEMP ) END IF END IF 100 CONTINUE JJ = JJ + KB END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * 110 CONTINUE * 120 CONTINUE * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) RWORK( 1 ) = REAL( LRWMIN ) * RETURN * * End of PCGEQPF * END scalapack-2.0.2/SRC/pcgeqr2.f000644 000766 000024 00000027546 10363532303 016066 0ustar00juliestaff000000 000000 SUBROUTINE PCGEQR2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEQR2 computes a QR factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Mp0 + MAX( 1, Nq0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(m,n). * * Each H(i) has the form * * H(j) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1), and tau in TAU(ja+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, II, IACOL, IAROW, ICTXT, J, JJ, K, LWMIN, $ MP, MYCOL, MYROW, NPCOL, NPROW, NQ COMPLEX AJJ, ALPHA * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CGEBR2D, $ CGEBS2D, CHK1MAT, CLARFG, CSCAL, $ INFOG2L, PCELSET, PCLARFC, $ PCLARFG, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MP + MAX( 1, NQ ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEQR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) I = II+(JJ-1)*DESCA( LLD_ ) IF( MYCOL.EQ.IACOL ) THEN AJJ = A( I ) CALL CLARFG( 1, AJJ, A( I ), 1, TAU( JJ ) ) IF( N.GT.1 ) THEN ALPHA = ONE - CONJG( TAU( JJ ) ) CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1 ) CALL CSCAL( NQ-JJ, ALPHA, A( I+DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), $ 1 ) A( I ) = AJJ ELSE IF( N.GT.1 ) THEN CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, IAROW, IACOL ) CALL CSCAL( NQ-JJ+1, ALPHA, A( I ), DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), 1, $ IAROW, IACOL ) END IF * ELSE * K = MIN( M, N ) DO 10 J = JA, JA+K-1 I = IA + J - JA * * Generate elementary reflector H(j) to annihilate * A(i+1:ia+m-1,j) * CALL PCLARFG( M-J+JA, AJJ, I, J, A, MIN( I+1, IA+M-1 ), J, $ DESCA, 1, TAU ) IF( J.LT.JA+N-1 ) THEN * * Apply H(j)' to A(i:ia+m-1,j+1:ja+n-1) from the left * CALL PCELSET( A, I, J, DESCA, ONE ) * CALL PCLARFC( 'Left', M-J+JA, N-J+JA-1, A, I, J, DESCA, $ 1, TAU, A, I, J+1, DESCA, WORK ) END IF CALL PCELSET( A, I, J, DESCA, AJJ ) * 10 CONTINUE * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGEQR2 * END scalapack-2.0.2/SRC/pcgeqrf.f000644 000766 000024 00000027517 10363532303 016150 0ustar00juliestaff000000 000000 SUBROUTINE PCGEQRF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGEQRF computes a QR factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( Mp0 + Nq0 + NB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(m,n). * * Each H(i) has the form * * H(j) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1), and tau in TAU(ja+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IINFO, IPW, J, $ JB, JN, K, LWMIN, MP0, MYCOL, MYROW, NPCOL, $ NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCGEQR2, $ PCLARFB, PCLARFT, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MP0 + NQ0 + DESCA( NB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGEQRF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JB = JN - JA + 1 * * Compute the QR factorization of the first block A(ia:ia+m-1,ja:jn) * CALL PCGEQR2( M, JB, A, IA, JA, DESCA, TAU, WORK, LWORK, IINFO ) * IF( JA+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(ja) H(ja+1) . . . H(jn) * CALL PCLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:ia+m-1,ja+jb:ja+n-1) from the left * CALL PCLARFB( 'Left', 'Conjugate transpose', 'Forward', $ 'Columnwise', M, N-JB, JB, A, IA, JA, DESCA, $ WORK, A, IA, JA+JB, DESCA, WORK( IPW ) ) END IF * * Loop over the remaining blocks of columns * DO 10 J = JN+1, JA+K-1, DESCA( NB_ ) JB = MIN( K-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Compute the QR factorization of the current block * A(i:ia+m-1,j:j+jb-1) * CALL PCGEQR2( M-J+JA, JB, A, I, J, DESCA, TAU, WORK, LWORK, $ IINFO ) * IF( J+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PCLARFT( 'Forward', 'Columnwise', M-J+JA, JB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(i:ia+m-1,j+jb:ja+n-1) from the left * CALL PCLARFB( 'Left', 'Conjugate transpose', 'Forward', $ 'Columnwise', M-J+JA, N-J-JB+JA, JB, A, I, J, $ DESCA, WORK, A, I, J+JB, DESCA, WORK( IPW ) ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGEQRF * END scalapack-2.0.2/SRC/pcgerfs.f000644 000766 000024 00000102650 10363532303 016142 0ustar00juliestaff000000 000000 SUBROUTINE PCGERFS( TRANS, N, NRHS, A, IA, JA, DESCA, AF, IAF, $ JAF, DESCAF, IPIV, B, IB, JB, DESCB, X, IX, $ JX, DESCX, FERR, BERR, WORK, LWORK, RWORK, $ LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IAF, IB, IX, INFO, JA, JAF, JB, JX, $ LRWORK, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IPIV( * ) REAL BERR( * ), FERR( * ), RWORK( * ) COMPLEX A( * ), AF( * ), B( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PCGERFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solutions. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations. * = 'N': sub( A ) * sub( X ) = sub( B ) (No transpose) * = 'T': sub( A )**T * sub( X ) = sub( B ) (Transpose) * = 'C': sub( A )**H * sub( X ) = sub( B ) * (Conjugate transpose) * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) COMPLEX pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input) COMPLEX pointer into the local * memory to an array of local dimension (LLD_AF,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * factors of the matrix sub( A ) = P * L * U as computed by * PCGETRF. * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * IPIV (local input) INTEGER array of dimension LOCr(M_AF)+MB_AF. * This array contains the pivoting information as computed * by PCGETRF. IPIV(i) -> The global row local row i * was swapped with. This array is tied to the distributed * matrix A. * * B (local input) COMPLEX pointer into the local * memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1)). This array contains the local * pieces of the distributed matrix of right hand sides * sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input and output) COMPLEX pointer into the * local memory to an array of local dimension * (LLD_X,LOCc(JX+NRHS-1)). On entry, this array contains * the local pieces of the distributed matrix solution * sub( X ). On exit, the improved solution vectors. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). * The estimated forward error bound for each solution vector * of sub( X ). If XTRUE is the true solution corresponding * to sub( X ), FERR is an estimated upper bound for the * magnitude of the largest element in (sub( X ) - XTRUE) * divided by the magnitude of the largest element in sub( X ). * The estimate is as reliable as the estimate for RCOND, and * is almost always a slight overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr( N + MOD(IA-1,MB_A) ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) REAL array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCr( N + MOD(IB-1,MB_B) ). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices op( A ) and op( AF ) (respectively * sub( X ) and sub( B ) ) should be distributed the same way on the * same processes. These conditions ensure that sub( A ) and sub( AF ) * (resp. sub( X ) and sub( B ) ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( AF ), sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IAF, DESCAF( MB_ ) ) = f( JAF, DESCAF( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) REAL ZERO, RONE, TWO, THREE PARAMETER ( ZERO = 0.0E+0, RONE = 1.0E+0, TWO = 2.0E+0, $ THREE = 3.0E+0 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOTRAN CHARACTER TRANSN, TRANST INTEGER COUNT, IACOL, IAFCOL, IAFROW, IAROW, IXBCOL, $ IXBROW, IXCOL, IXROW, ICOFFA, ICOFFAF, ICOFFB, $ ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, IIW, $ IOFFXB, IPB, IPR, IPV, IROFFA, IROFFAF, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LRWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ REAL EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN COMPLEX ZDUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, CHK1MAT, $ DESCSET, INFOG2L, PCAGEMV, PCAXPY, PCHK2MAT, $ PCCOPY, PCGEMV, PCGETRS, PCLACON, $ PXERBLA, SGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * NOTRAN = LSAME( TRANS, 'N' ) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, N, 2, IAF, JAF, DESCAF, 11, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 16, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IX, JX, DESCX, 20, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFAF = MOD( JAF-1, DESCAF( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFCOL = INDXG2P( JAF, DESCAF( NB_ ), MYCOL, $ DESCAF( CSRC_ ), NPCOL ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 2 * NPMOD LRWMIN = NPMOD WORK( 1 ) = CMPLX( REAL( LWMIN ) ) RWORK( 1 ) = REAL( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( ( .NOT.NOTRAN ) .AND. ( .NOT.LSAME( TRANS, 'T' ) ) .AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700 + NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCAF( MB_ ) ) THEN INFO = -( 1100 + MB_ ) ELSE IF( IROFFAF.NE.0 .OR. IAROW.NE.IAFROW ) THEN INFO = -9 ELSE IF( DESCA( NB_ ).NE.DESCAF( NB_ ) ) THEN INFO = -( 1100 + NB_ ) ELSE IF( ICOFFAF.NE.0 .OR. IACOL.NE.IAFCOL ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -( 1100 + CTXT_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -14 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1600 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1600 + CTXT_ ) ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 2000 + MB_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -18 ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 2000 + NB_ ) ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -19 ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 2000 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -24 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -26 END IF END IF * IF( NOTRAN ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 1 ) = ICHAR( 'T' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 IDUM1( 2 ) = N IDUM2( 2 ) = 2 IDUM1( 3 ) = NRHS IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 24 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 26 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, N, 2, IAF, $ JAF, DESCAF, 11, 5, IDUM1, IDUM2, INFO ) CALL PCHK2MAT( N, 2, NRHS, 3, IB, JB, DESCB, 16, N, 2, NRHS, 3, $ IX, JX, DESCX, 20, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGERFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * IF( NOTRAN ) THEN TRANSN = 'N' TRANST = 'C' ELSE TRANSN = 'C' TRANST = 'N' END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = 1 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = 1 + maximum number of nonzero entries in each row of sub( A ) * NZ = N + 1 EPS = PSLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) ) * DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 100 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 20 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X), * where op(sub(A)) = sub(A), or sub(A)' (A**T or A**H), * depending on TRANS. * CALL PCCOPY( N, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PCGEMV( TRANS, N, N, -ONE, A, IA, JA, DESCA, X, IX, $ JX+K, DESCX, 1, ONE, WORK( IPR ), IW, JW, $ DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIXB, IIXB + NP - 1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 30 CONTINUE END IF END IF * CALL PCAGEMV( TRANS, N, N, RONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, RONE, RWORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 40 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during the * last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PCGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PCAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, IX, $ JX+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 20 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(op(sub(A))))* * ( abs(R) + NZ*EPS*( * abs(op(sub(A)))*abs(sub(X))+abs(sub(B)))))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(sub(A))) is the inverse of op(sub(A)) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(op(sub(A)))*abs(sub(X)) + abs(sub(B)) is less than * SAFE2. * * Use PCLACON to estimate the infinity-norm of the matrix * inv(op(sub(A))) * diag(W), where * W = abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 50 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 50 CONTINUE END IF END IF * KASE = 0 60 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL CGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PCLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(sub(A))'). * CALL PCGETRS( TRANST, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF ELSE * * Multiply by inv(op(sub(A)))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 80 CONTINUE END IF END IF * CALL PCGETRS( TRANSN, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 60 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 90 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 90 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 100 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 200 J = JN+1, JB+NRHS-1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 190 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 110 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X), * where op(sub(A)) = sub(A), or sub(A)' (A**T or A**H), * depending on TRANS. * CALL PCCOPY( N, B, IB, J+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PCGEMV( TRANS, N, N, -ONE, A, IA, JA, DESCA, X, $ IX, J+K, DESCX, 1, ONE, WORK( IPR ), IW, JW, $ DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) (abs(R(i))/(abs(op(sub(A)))*abs(sub(X)) + * abs(sub(B)))(i)) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIXB, IIXB+NP-1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 120 CONTINUE END IF END IF * CALL PCAGEMV( TRANS, N, N, RONE, A, IA, JA, DESCA, X, IX, $ J+K, DESCX, 1, RONE, RWORK( IPB ), IW, JW, $ DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 )THEN DO 130 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 130 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.ICURCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during * the last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. $ COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PCGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PCAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, $ IX, J+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 110 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(op(sub(A))))* * ( abs(R) + NZ*EPS*( * abs(op(sub(A)))*abs(sub(X))+abs(sub(B)))))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(sub(A))) is the inverse of op(sub(A)) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(op(sub(A)))*abs(sub(X)) + abs(sub(B)) is less than * SAFE2. * * Use PCLACON to estimate the infinity-norm of the matrix * inv(op(sub(A))) * diag(W), where * W = abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 140 CONTINUE END IF END IF * KASE = 0 150 CONTINUE IF( MYCOL.EQ.ICURCOL ) THEN CALL CGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, ICURCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PCLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = ICURCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(sub(A))'). * CALL PCGETRS( TRANST, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 160 CONTINUE END IF END IF ELSE * * Multiply by inv(op(sub(A)))*diag(W). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 170 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 170 CONTINUE END IF END IF * CALL PCGETRS( TRANSN, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, $ INFO ) END IF GO TO 150 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 180 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 180 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, $ 1, IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 190 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 200 CONTINUE * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) RWORK( 1 ) = REAL( LRWMIN ) * RETURN * * End of PCGERFS * END scalapack-2.0.2/SRC/pcgerq2.f000644 000766 000024 00000025160 10363532303 016054 0ustar00juliestaff000000 000000 SUBROUTINE PCGERQ2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGERQ2 computes a RQ factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = R * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAU, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia)' H(ia+1)' . . . H(ia+k-1)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; conjg(v(1:n-k+i-1)) is stored on * exit in A(ia+m-k+i-1,ja:ja+n-k+i-2), and tau in TAU(ia+m-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, J, K, LWMIN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ COMPLEX AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PCELSET, PCLACGV, PCLARF, PCLARFG, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQ + MAX( 1, MP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGERQ2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * K = MIN( M, N ) DO 10 I = IA+K-1, IA, -1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * A(i+m-k,ja:j+n-k-1) * CALL PCLACGV( N-K+J-JA+1, A, I+M-K, JA, DESCA, DESCA( M_ ) ) CALL PCLARFG( N-K+J-JA+1, AII, I+M-K, J+N-K, A, I+M-K, JA, $ DESCA, DESCA( M_ ), TAU ) * * Apply H(i) to A(ia:i+m-k-1,ja:j+n-k) from the right * CALL PCELSET( A, I+M-K, J+N-K, DESCA, ONE ) CALL PCLARF( 'Right', M-K+I-IA, N-K+J-JA+1, A, M-K+I, JA, $ DESCA, DESCA( M_ ), TAU, A, IA, JA, DESCA, WORK ) CALL PCELSET( A, I+M-K, J+N-K, DESCA, AII ) CALL PCLACGV( N-K+J-JA+1, A, I+M-K, JA, DESCA, DESCA( M_ ) ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGERQ2 * END scalapack-2.0.2/SRC/pcgerqf.f000644 000766 000024 00000027231 10363532303 016141 0ustar00juliestaff000000 000000 SUBROUTINE PCGERQF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCGERQF computes a RQ factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = R * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAU, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia)' H(ia+1)' . . . H(ia+k-1)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; conjg(v(1:n-k+i-1)) is stored on * exit in A(ia+m-k+i-1,ja:ja+n-k+i-2), and tau in TAU(ia+m-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IL, IN, IPW, $ K, LWMIN, MP0, MU, MYCOL, MYROW, NPCOL, NPROW, $ NQ0, NU * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCGERQ2, $ PCLARFB, PCLARFT, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGERQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IL = MAX( ( (IA+M-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * IF( IL.GE.IN+1 ) THEN * * Use blocked code initially * DO 10 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Compute the RQ factorization of the current block * A(i:i+ib-1,ja:ja+n-m+i+ib-ia-1) * CALL PCGERQ2( IB, N-M+I+IB-IA, A, I, JA, DESCA, TAU, WORK, $ LWORK, IINFO ) * IF( I.GT.IA ) THEN * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PCLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, $ I, JA, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:i-1,ja:ja+n-m+i+ib-ia-1) from the * right * CALL PCLARFB( 'Right', 'No transpose', 'Backward', $ 'Rowwise', I-IA, N-M+I+IB-IA, IB, A, I, JA, $ DESCA, WORK, A, IA, JA, DESCA, $ WORK( IPW ) ) END IF * 10 CONTINUE * MU = IN - IA + 1 NU = N - M + IN - IA + 1 * ELSE * MU = M NU = N * END IF * * Use unblocked code to factor the last or only block * IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL PCGERQ2( MU, NU, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCGERQF * END scalapack-2.0.2/SRC/pcgesv.f000644 000766 000024 00000023110 10367447133 016003 0ustar00juliestaff000000 000000 SUBROUTINE PCGESV( N, NRHS, A, IA, JA, DESCA, IPIV, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * Jan 30, 2006 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) COMPLEX A( * ), B( * ) * .. * * Purpose * ======= * * PCGESV computes the solution to a complex system of linear equations * * sub( A ) * X = sub( B ), * * where sub( A ) = A(IA:IA+N-1,JA:JA+N-1) is an N-by-N distributed * matrix and X and sub( B ) = B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS * distributed matrices. * * The LU decomposition with partial pivoting and row interchanges is * used to factor sub( A ) as sub( A ) = P * L * U, where P is a permu- * tation matrix, L is unit lower triangular, and U is upper triangular. * L and U are stored in sub( A ). The factored form of sub( A ) is then * used to solve the system of equations sub( A ) * X = sub( B ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the N-by-N distributed matrix * sub( A ) to be factored. On exit, this array contains the * local pieces of the factors L and U from the factorization * sub( A ) = P*L*U; the unit diagonal elements of L are not * stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * B (local input/local output) COMPLEX pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, the right hand side * distributed matrix sub( B ). On exit, if INFO = 0, sub( B ) * is overwritten by the solution distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, so the solution could not be * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCGETRF, $ PCGETRS, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P EXTERNAL INDXG2P * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( N, 1, NRHS, 2, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IF( IROFFA.NE.0 ) THEN INFO = -4 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( IBROW.NE.IAROW .OR. ICOFFA.NE.IROFFB ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1100+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(1100+CTXT_) END IF END IF CALL PCHK2MAT( N, 1, N, 1, IA, JA, DESCA, 6, N, 1, NRHS, 2, $ IB, JB, DESCB, 11, 0, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGESV', -INFO ) RETURN END IF * * Compute the LU factorization of sub( A ). * CALL PCGETRF( N, N, A, IA, JA, DESCA, IPIV, INFO ) * IF( INFO.EQ.0 ) THEN * * Solve the system sub( A ) * X = sub( B ), overwriting sub( B ) * with X. * CALL PCGETRS( 'No transpose', N, NRHS, A, IA, JA, DESCA, IPIV, $ B, IB, JB, DESCB, INFO ) * END IF * RETURN * * End of PCGESV * END scalapack-2.0.2/SRC/pcgesvd.f000644 000766 000024 00000055740 10377355407 016170 0ustar00juliestaff000000 000000 SUBROUTINE PCGESVD(JOBU,JOBVT,M,N,A,IA,JA,DESCA,S,U,IU,JU,DESCU, + VT,IVT,JVT,DESCVT,WORK,LWORK,RWORK,INFO) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Oak Ridge National Laboratory * and Univ. of California Berkeley. * Jan 2006 * * .. Scalar Arguments .. CHARACTER JOBU,JOBVT INTEGER IA,INFO,IU,IVT,JA,JU,JVT,LWORK,M,N * .. * .. Array Arguments .. INTEGER DESCA(*),DESCU(*),DESCVT(*) COMPLEX A(*),U(*),VT(*),WORK(*) REAL S(*) REAL RWORK(*) * .. * * Purpose * ======= * * PCGESVD computes the singular value decomposition (SVD) of an * M-by-N matrix A, optionally computing the left and/or right * singular vectors. The SVD is written as * * A = U * SIGMA * transpose(V) * * where SIGMA is an M-by-N matrix which is zero except for its * min(M,N) diagonal elements, U is an M-by-M orthogonal matrix, and * V is an N-by-N orthogonal matrix. The diagonal elements of SIGMA * are the singular values of A and the columns of U and V are the * corresponding right and left singular vectors, respectively. The * singular values are returned in array S in decreasing order and * only the first min(M,N) columns of U and rows of VT = V**T are * computed. * * Notes * ===== * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, and * assume that its process grid has dimension r x c. LOCr( K ) denotes * the number of elements of K that a process would receive if K were * distributed over the r processes of its process column. Similarly, * LOCc( K ) denotes the number of elements of K that a process would * receive if K were distributed over the c processes of its process * row. The values of LOCr() and LOCc() may be determined via a call * to the ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MP = number of local rows in A and U * NQ = number of local columns in A and VT * SIZE = min( M, N ) * SIZEQ = number of local columns in U * SIZEP = number of local rows in VT * * JOBU (global input) CHARACTER*1 * Specifies options for computing U: * = 'V': the first SIZE columns of U (the left singular * vectors) are returned in the array U; * = 'N': no columns of U (no left singular vectors) are * computed. * * JOBVT (global input) CHARACTER*1 * Specifies options for computing V**T: * = 'V': the first SIZE rows of V**T (the right singular * vectors) are returned in the array VT; * = 'N': no rows of V**T (no right singular vectors) are * computed. * * M (global input) INTEGER * The number of rows of the input matrix A. M >= 0. * * N (global input) INTEGER * The number of columns of the input matrix A. N >= 0. * * A (local input/workspace) block cyclic COMPLEX * array, * global dimension (M, N), local dimension (MP, NQ) * On exit, the contents of A are destroyed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix A. * * S (global output) REAL array, dimension SIZE * The singular values of A, sorted so that S(i) >= S(i+1). * * U (local output) COMPLEX array, local dimension * (MP, SIZEQ), global dimension (M, SIZE) * if JOBU = 'V', U contains the first min(m,n) columns of U * if JOBU = 'N', U is not referenced. * * IU (global input) INTEGER * The row index in the global array U indicating the first * row of sub( U ). * * JU (global input) INTEGER * The column index in the global array U indicating the * first column of sub( U ). * * DESCU (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix U. * * VT (local output) COMPLEX array, local dimension * (SIZEP, NQ), global dimension (SIZE, N). * If JOBVT = 'V', VT contains the first SIZE rows of * V**T. If JOBVT = 'N', VT is not referenced. * * IVT (global input) INTEGER * The row index in the global array VT indicating the first * row of sub( VT ). * * JVT (global input) INTEGER * The column index in the global array VT indicating the * first column of sub( VT ). * * DESCVT (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix VT. * * WORK (local workspace/output) COMPLEX array, dimension * (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * LWORK >= 1 + 2*SIZEB + MAX(WATOBD, WBDTOSVD), * * where SIZEB = MAX(M,N), and WATOBD and WBDTOSVD refer, * respectively, to the workspace required to bidiagonalize * the matrix A and to go from the bidiagonal matrix to the * singular value decomposition U*S*VT. * * For WATOBD, the following holds: * * WATOBD = MAX(MAX(WPCLANGE,WPCGEBRD), * MAX(WPCLARED2D,WP(pre)LARED1D)), * * where WPCLANGE, WPCLARED1D, WPCLARED2D, WPCGEBRD are the * workspaces required respectively for the subprograms * PCLANGE, PSLARED1D, PSLARED2D, PCGEBRD. Using the * standard notation * * MP = NUMROC( M, MB, MYROW, DESCA( CTXT_ ), NPROW), * NQ = NUMROC( N, NB, MYCOL, DESCA( LLD_ ), NPCOL), * * the workspaces required for the above subprograms are * * WPCLANGE = MP, * WPSLARED1D = NQ0, * WPSLARED2D = MP0, * WPCGEBRD = NB*(MP + NQ + 1) + NQ, * * where NQ0 and MP0 refer, respectively, to the values obtained * at MYCOL = 0 and MYROW = 0. In general, the upper limit for * the workspace is given by a workspace required on * processor (0,0): * * WATOBD <= NB*(MP0 + NQ0 + 1) + NQ0. * * In case of a homogeneous process grid this upper limit can * be used as an estimate of the minimum workspace for every * processor. * * For WBDTOSVD, the following holds: * * WBDTOSVD = SIZE*(WANTU*NRU + WANTVT*NCVT) + * MAX(WCBDSQR, * MAX(WANTU*WPCORMBRQLN, WANTVT*WPCORMBRPRT)), * * where * * 1, if left(right) singular vectors are wanted * WANTU(WANTVT) = * 0, otherwise * * and WCBDSQR, WPCORMBRQLN and WPCORMBRPRT refer respectively * to the workspace required for the subprograms CBDSQR, * PCUNMBR(QLN), and PCUNMBR(PRT), where QLN and PRT are the * values of the arguments VECT, SIDE, and TRANS in the call * to PCUNMBR. NRU is equal to the local number of rows of * the matrix U when distributed 1-dimensional "column" of * processes. Analogously, NCVT is equal to the local number * of columns of the matrix VT when distributed across * 1-dimensional "row" of processes. Calling the LAPACK * procedure CBDSQR requires * * WCBDSQR = MAX(1, 4*SIZE ) * * on every processor. Finally, * * WPCORMBRQLN = MAX( (NB*(NB-1))/2, (SIZEQ+MP)*NB)+NB*NB, * WPCORMBRPRT = MAX( (MB*(MB-1))/2, (SIZEP+NQ)*MB )+MB*MB, * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the work array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * RWORK (workspace) REAL array, dimension (1+4*SIZEB) * On exit, if INFO = 0, RWORK(1) returns the necessary size * for RWORK. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if CBDSQR did not converge * If INFO = MIN(M,N) + 1, then PCGESVD has detected * heterogeneity by finding that eigenvalues were not * identical across the process grid. In this case, the * accuracy of the results from PCGESVD cannot be * guaranteed. * * ===================================================================== * * The results of PCGEBRD, and therefore PCGESVD, may vary slightly * from run to run with the same input data. If repeatability is an * issue, call BLACS_SET with the appropriate option after defining * the process grid. * * Alignment requirements * ====================== * * The routine PCGESVD inherits the same alignement requirement as * the routine PCGEBRD, namely: * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * * ===================================================================== * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D,DLEN_,DTYPE_,CTXT_,M_,N_,MB_,NB_,RSRC_, + CSRC_,LLD_,ITHVAL PARAMETER (BLOCK_CYCLIC_2D=1,DLEN_=9,DTYPE_=1,CTXT_=2,M_=3,N_=4, + MB_=5,NB_=6,RSRC_=7,CSRC_=8,LLD_=9,ITHVAL=10) COMPLEX ZERO,ONE PARAMETER (ZERO= ((0.0E+0,0.0E+0)),ONE= ((1.0E+0,0.0E+0))) REAL DZERO,DONE PARAMETER (DZERO=0.0D+0,DONE=1.0D+0) * .. * .. Local Scalars .. CHARACTER UPLO INTEGER CONTEXTC,CONTEXTR,I,INDD,INDD2,INDE,INDE2,INDTAUP,INDTAUQ, + INDU,INDV,INDWORK,IOFFD,IOFFE,ISCALE,J,K,LDU,LDVT,LLWORK, + LWMIN,MAXIM,MB,MP,MYPCOL,MYPCOLC,MYPCOLR,MYPROW,MYPROWC, + MYPROWR,NB,NCVT,NPCOL,NPCOLC,NPCOLR,NPROCS,NPROW,NPROWC, + NPROWR,NQ,NRU,SIZE,SIZEB,SIZEP,SIZEPOS,SIZEQ,WANTU,WANTVT, + WATOBD,WBDTOSVD,WCBDSQR,WPCGEBRD,WPCLANGE,WPCORMBRPRT, + WPCORMBRQLN REAL ANRM,BIGNUM,EPS,RMAX,RMIN,SAFMIN,SIGMA,SMLNUM * .. * .. Local Arrays .. INTEGER DESCTU(DLEN_),DESCTVT(DLEN_),IDUM1(3),IDUM2(3) REAL C(1,1) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH,PCLANGE EXTERNAL LSAME,NUMROC,PDLAMCH,PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GET,BLACS_GRIDEXIT,BLACS_GRIDINFO,BLACS_GRIDINIT, + CHK1MAT,CBDSQR,DESCINIT,SGAMN2D,SGAMX2D,SSCAL,IGAMX2D, + IGEBR2D,IGEBS2D,PCHK1MAT,PCGEBRD,PCGEMR2D,PSLARED1D, + PSLARED2D,PCLASCL,PCLASET,PCUNMBR,PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX,MIN,SQRT,REAL INTRINSIC CMPLX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF (BLOCK_CYCLIC_2D*DTYPE_*LLD_*MB_*M_*NB_*N_.LT.0) RETURN * CALL BLACS_GRIDINFO(DESCA(CTXT_),NPROW,NPCOL,MYPROW,MYPCOL) ISCALE = 0 INFO = 0 * IF (NPROW.EQ.-1) THEN INFO = - (800+CTXT_) ELSE * SIZE = MIN(M,N) SIZEB = MAX(M,N) NPROCS = NPROW*NPCOL IF (M.GE.N) THEN IOFFD = JA - 1 IOFFE = IA - 1 SIZEPOS = 1 ELSE IOFFD = IA - 1 IOFFE = JA - 1 SIZEPOS = 3 END IF * IF (LSAME(JOBU,'V')) THEN WANTU = 1 ELSE WANTU = 0 END IF IF (LSAME(JOBVT,'V')) THEN WANTVT = 1 ELSE WANTVT = 0 END IF * CALL CHK1MAT(M,3,N,4,IA,JA,DESCA,8,INFO) IF (WANTU.EQ.1) THEN CALL CHK1MAT(M,3,SIZE,SIZEPOS,IU,JU,DESCU,13,INFO) END IF IF (WANTVT.EQ.1) THEN CALL CHK1MAT(SIZE,SIZEPOS,N,4,IVT,JVT,DESCVT,17,INFO) END IF CALL IGAMX2D(DESCA(CTXT_),'A',' ',1,1,INFO,1,1,1,-1,-1,0) * IF (INFO.EQ.0) THEN * * Set up pointers into the WORK array. * INDD = 2 INDE = INDD + SIZEB + IOFFD INDD2 = INDE + SIZEB + IOFFE INDE2 = INDD2 + SIZEB + IOFFD * INDTAUQ = 2 INDTAUP = INDTAUQ + SIZEB + JA - 1 INDWORK = INDTAUP + SIZEB + IA - 1 LLWORK = LWORK - INDWORK + 1 * * Initialize contexts for "column" and "row" process matrices. * CALL BLACS_GET(DESCA(CTXT_),10,CONTEXTC) CALL BLACS_GRIDINIT(CONTEXTC,'R',NPROCS,1) CALL BLACS_GRIDINFO(CONTEXTC,NPROWC,NPCOLC,MYPROWC, + MYPCOLC) CALL BLACS_GET(DESCA(CTXT_),10,CONTEXTR) CALL BLACS_GRIDINIT(CONTEXTR,'R',1,NPROCS) CALL BLACS_GRIDINFO(CONTEXTR,NPROWR,NPCOLR,MYPROWR, + MYPCOLR) * * Set local dimensions of matrices (this is for MB=NB=1). * NRU = NUMROC(M,1,MYPROWC,0,NPROCS) NCVT = NUMROC(N,1,MYPCOLR,0,NPROCS) NB = DESCA(NB_) MB = DESCA(MB_) MP = NUMROC(M,MB,MYPROW,DESCA(RSRC_),NPROW) NQ = NUMROC(N,NB,MYPCOL,DESCA(CSRC_),NPCOL) IF (WANTVT.EQ.1) THEN SIZEP = NUMROC(SIZE,DESCVT(MB_),MYPROW,DESCVT(RSRC_), + NPROW) ELSE SIZEP = 0 END IF IF (WANTU.EQ.1) THEN SIZEQ = NUMROC(SIZE,DESCU(NB_),MYPCOL,DESCU(CSRC_), + NPCOL) ELSE SIZEQ = 0 END IF * * Transmit MAX(NQ0, MP0). * IF (MYPROW.EQ.0 .AND. MYPCOL.EQ.0) THEN MAXIM = MAX(NQ,MP) CALL IGEBS2D(DESCA(CTXT_),'All',' ',1,1,MAXIM,1) ELSE CALL IGEBR2D(DESCA(CTXT_),'All',' ',1,1,MAXIM,1,0,0) END IF * WPCLANGE = MP WPCGEBRD = NB* (MP+NQ+1) + NQ WATOBD = MAX(MAX(WPCLANGE,WPCGEBRD),MAXIM) * WCBDSQR = MAX(1,4*SIZE) WPCORMBRQLN = MAX((NB* (NB-1))/2, (SIZEQ+MP)*NB) + NB*NB WPCORMBRPRT = MAX((MB* (MB-1))/2, (SIZEP+NQ)*MB) + MB*MB WBDTOSVD = SIZE* (WANTU*NRU+WANTVT*NCVT) + + MAX(WCBDSQR,MAX(WANTU*WPCORMBRQLN, + WANTVT*WPCORMBRPRT)) * * Finally, calculate required workspace. * LWMIN = 1 + 2*SIZEB + MAX(WATOBD,WBDTOSVD) WORK(1) = CMPLX(LWMIN,0D+00) RWORK(1) = REAL(1+4*SIZEB) * IF (WANTU.NE.1 .AND. .NOT. (LSAME(JOBU,'N'))) THEN INFO = -1 ELSE IF (WANTVT.NE.1 .AND. .NOT. (LSAME(JOBVT,'N'))) THEN INFO = -2 ELSE IF (LWORK.LT.LWMIN .AND. LWORK.NE.-1) THEN INFO = -19 END IF * END IF * IDUM1(1) = WANTU IDUM1(2) = WANTVT IF (LWORK.EQ.-1) THEN IDUM1(3) = -1 ELSE IDUM1(3) = 1 END IF IDUM2(1) = 1 IDUM2(2) = 2 IDUM2(3) = 19 CALL PCHK1MAT(M,3,N,4,IA,JA,DESCA,8,3,IDUM1,IDUM2,INFO) IF (INFO.EQ.0) THEN IF (WANTU.EQ.1) THEN CALL PCHK1MAT(M,3,SIZE,4,IU,JU,DESCU,13,0,IDUM1,IDUM2, + INFO) END IF IF (WANTVT.EQ.1) THEN CALL PCHK1MAT(SIZE,3,N,4,IVT,JVT,DESCVT,17,0,IDUM1, + IDUM2,INFO) END IF END IF * END IF * IF (INFO.NE.0) THEN CALL PXERBLA(DESCA(CTXT_),'PCGESVD',-INFO) RETURN ELSE IF (LWORK.EQ.-1) THEN GO TO 40 END IF * * Quick return if possible. * IF (M.LE.0 .OR. N.LE.0) GO TO 40 * * Get machine constants. * SAFMIN = PSLAMCH(DESCA(CTXT_),'Safe minimum') EPS = PSLAMCH(DESCA(CTXT_),'Precision') SMLNUM = SAFMIN/EPS BIGNUM = DONE/SMLNUM RMIN = SQRT(SMLNUM) RMAX = MIN(SQRT(BIGNUM),DONE/SQRT(SQRT(SAFMIN))) * * Scale matrix to allowable range, if necessary. * ANRM = PCLANGE('1',M,N,A,IA,JA,DESCA,WORK(INDWORK)) IF (ANRM.GT.DZERO .AND. ANRM.LT.RMIN) THEN ISCALE = 1 SIGMA = RMIN/ANRM ELSE IF (ANRM.GT.RMAX) THEN ISCALE = 1 SIGMA = RMAX/ANRM END IF * IF (ISCALE.EQ.1) THEN CALL PCLASCL('G',DONE,SIGMA,M,N,A,IA,JA,DESCA,INFO) END IF * CALL PCGEBRD(M,N,A,IA,JA,DESCA,RWORK(INDD),RWORK(INDE), + WORK(INDTAUQ),WORK(INDTAUP),WORK(INDWORK),LLWORK, + INFO) * * Copy D and E to all processes. * Array D is in local array of dimension: * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * Array E is in local array of dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * IF (M.GE.N) THEN * Distribute D CALL PSLARED1D(N+IOFFD,IA,JA,DESCA,RWORK(INDD),RWORK(INDD2), + WORK(INDWORK),LLWORK) * Distribute E CALL PSLARED2D(M+IOFFE,IA,JA,DESCA,RWORK(INDE),RWORK(INDE2), + WORK(INDWORK),LLWORK) ELSE * Distribute D CALL PSLARED2D(M+IOFFD,IA,JA,DESCA,RWORK(INDD),RWORK(INDD2), + WORK(INDWORK),LLWORK) * Distribute E CALL PSLARED1D(N+IOFFE,IA,JA,DESCA,RWORK(INDE),RWORK(INDE2), + WORK(INDWORK),LLWORK) END IF * * Prepare for calling PCBDSQR. * IF (M.GE.N) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * INDU = INDWORK INDV = INDU + SIZE*NRU*WANTU INDWORK = INDV + SIZE*NCVT*WANTVT * LDU = MAX(1,NRU) LDVT = MAX(1,SIZE) * CALL DESCINIT(DESCTU,M,SIZE,1,1,0,0,CONTEXTC,LDU,INFO) CALL DESCINIT(DESCTVT,SIZE,N,1,1,0,0,CONTEXTR,LDVT,INFO) * IF (WANTU.EQ.1) THEN CALL PCLASET('Full',M,SIZE,ZERO,ONE,WORK(INDU),1,1,DESCTU) ELSE NRU = 0 END IF * IF (WANTVT.EQ.1) THEN CALL PCLASET('Full',SIZE,N,ZERO,ONE,WORK(INDV),1,1,DESCTVT) ELSE NCVT = 0 END IF * CALL CBDSQR(UPLO,SIZE,NCVT,NRU,0,RWORK(INDD2+IOFFD), + RWORK(INDE2+IOFFE),WORK(INDV),SIZE,WORK(INDU),LDU,C,1, + WORK(INDWORK),INFO) * * Redistribute elements of U and VT in the block-cyclic fashion. * IF (WANTU.EQ.1) CALL PCGEMR2D(M,SIZE,WORK(INDU),1,1,DESCTU,U,IU, + JU,DESCU,DESCU(CTXT_)) * IF (WANTVT.EQ.1) CALL PCGEMR2D(SIZE,N,WORK(INDV),1,1,DESCTVT,VT, + IVT,JVT,DESCVT,DESCVT(CTXT_)) * * Set to ZERO "non-square" elements of the larger matrices U, VT. * IF (M.GT.N .AND. WANTU.EQ.1) THEN CALL PCLASET('Full',M-SIZE,SIZE,ZERO,ZERO,U,IA+SIZE,JU,DESCU) ELSE IF (N.GT.M .AND. WANTVT.EQ.1) THEN CALL PCLASET('Full',SIZE,N-SIZE,ZERO,ZERO,VT,IVT,JVT+SIZE, + DESCVT) END IF * * Multiply Householder rotations from bidiagonalized matrix. * IF (WANTU.EQ.1) CALL PCUNMBR('Q','L','N',M,SIZE,N,A,IA,JA,DESCA, + WORK(INDTAUQ),U,IU,JU,DESCU, + WORK(INDWORK),LLWORK,INFO) * IF (WANTVT.EQ.1) CALL PCUNMBR('P','R','C',SIZE,N,M,A,IA,JA,DESCA, + WORK(INDTAUP),VT,IVT,JVT,DESCVT, + WORK(INDWORK),LLWORK,INFO) * * Copy singular values into output array S. * DO 10 I = 1,SIZE S(I) = RWORK(INDD2+IOFFD+I-1) 10 CONTINUE * * If matrix was scaled, then rescale singular values appropriately. * IF (ISCALE.EQ.1) THEN CALL SSCAL(SIZE,ONE/SIGMA,S,1) END IF * * Compare every ith eigenvalue, or all if there are only a few, * across the process grid to check for heterogeneity. * IF (SIZE.LE.ITHVAL) THEN J = SIZE K = 1 ELSE J = SIZE/ITHVAL K = ITHVAL END IF * DO 20 I = 1,J RWORK(I+INDE) = S((I-1)*K+1) RWORK(I+INDD2) = S((I-1)*K+1) 20 CONTINUE * CALL SGAMN2D(DESCA(CTXT_),'a',' ',J,1,RWORK(1+INDE),J,1,1,-1,-1,0) CALL SGAMX2D(DESCA(CTXT_),'a',' ',J,1,RWORK(1+INDD2),J,1,1,-1,-1, + 0) * DO 30 I = 1,J IF ((RWORK(I+INDE)-RWORK(I+INDD2)).NE.DZERO) THEN INFO = SIZE + 1 END IF 30 CONTINUE * 40 CONTINUE * CALL BLACS_GRIDEXIT(CONTEXTC) CALL BLACS_GRIDEXIT(CONTEXTR) * * End of PCGESVD * RETURN END scalapack-2.0.2/SRC/pcgesvx.f000644 000766 000024 00000104655 10363532303 016177 0ustar00juliestaff000000 000000 SUBROUTINE PCGESVX( FACT, TRANS, N, NRHS, A, IA, JA, DESCA, AF, $ IAF, JAF, DESCAF, IPIV, EQUED, R, C, B, IB, $ JB, DESCB, X, IX, JX, DESCX, RCOND, FERR, $ BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER EQUED, FACT, TRANS INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, LRWORK, $ LWORK, N, NRHS REAL RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IPIV( * ) REAL BERR( * ), C( * ), FERR( * ), R( * ), $ RWORK( * ) COMPLEX A( * ), AF( * ), B( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PCGESVX uses the LU factorization to compute the solution to a * complex system of linear equations * * A(IA:IA+N-1,JA:JA+N-1) * X = B(IB:IB+N-1,JB:JB+NRHS-1), * * where A(IA:IA+N-1,JA:JA+N-1) is an N-by-N matrix and X and * B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS matrices. * * Error bounds on the solution and a condition estimate are also * provided. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Description * =========== * * In the following description, A denotes A(IA:IA+N-1,JA:JA+N-1), * B denotes B(IB:IB+N-1,JB:JB+NRHS-1) and X denotes * X(IX:IX+N-1,JX:JX+NRHS-1). * * The following steps are performed: * * 1. If FACT = 'E', real scaling factors are computed to equilibrate * the system: * TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B * TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B * TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') * or diag(C)*B (if TRANS = 'T' or 'C'). * * 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the * matrix A (after equilibration if FACT = 'E') as * A = P * L * U, * where P is a permutation matrix, L is a unit lower triangular * matrix, and U is upper triangular. * * 3. The factored form of A is used to estimate the condition number * of the matrix A. If the reciprocal of the condition number is * less than machine precision, steps 4-6 are skipped. * * 4. The system of equations is solved for X using the factored form * of A. * * 5. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 6. If FACT = 'E' and equilibration was used, the matrix X is * premultiplied by diag(C) (if TRANS = 'N') or diag(R) (if * TRANS = 'T' or 'C') so that it solves the original system * before equilibration. * * Arguments * ========= * * FACT (global input) CHARACTER * Specifies whether or not the factored form of the matrix * A(IA:IA+N-1,JA:JA+N-1) is supplied on entry, and if not, * whether the matrix A(IA:IA+N-1,JA:JA+N-1) should be * equilibrated before it is factored. * = 'F': On entry, AF(IAF:IAF+N-1,JAF:JAF+N-1) and IPIV con- * tain the factored form of A(IA:IA+N-1,JA:JA+N-1). * If EQUED is not 'N', the matrix * A(IA:IA+N-1,JA:JA+N-1) has been equilibrated with * scaling factors given by R and C. * A(IA:IA+N-1,JA:JA+N-1), AF(IAF:IAF+N-1,JAF:JAF+N-1), * and IPIV are not modified. * = 'N': The matrix A(IA:IA+N-1,JA:JA+N-1) will be copied to * AF(IAF:IAF+N-1,JAF:JAF+N-1) and factored. * = 'E': The matrix A(IA:IA+N-1,JA:JA+N-1) will be equili- * brated if necessary, then copied to * AF(IAF:IAF+N-1,JAF:JAF+N-1) and factored. * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': A(IA:IA+N-1,JA:JA+N-1) * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (No transpose) * = 'T': A(IA:IA+N-1,JA:JA+N-1)**T * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (Transpose) * = 'C': A(IA:IA+N-1,JA:JA+N-1)**H * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (Conjugate transpose) * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * NRHS (global input) INTEGER * The number of right-hand sides, i.e., the number of columns * of the distributed submatrices B(IB:IB+N-1,JB:JB+NRHS-1) and * X(IX:IX+N-1,JX:JX+NRHS-1). NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * the local memory to an array of local dimension * (LLD_A,LOCc(JA+N-1)). On entry, the N-by-N matrix * A(IA:IA+N-1,JA:JA+N-1). If FACT = 'F' and EQUED is not 'N', * then A(IA:IA+N-1,JA:JA+N-1) must have been equilibrated by * the scaling factors in R and/or C. A(IA:IA+N-1,JA:JA+N-1) is * not modified if FACT = 'F' or 'N', or if FACT = 'E' and * EQUED = 'N' on exit. * * On exit, if EQUED .ne. 'N', A(IA:IA+N-1,JA:JA+N-1) is scaled * as follows: * EQUED = 'R': A(IA:IA+N-1,JA:JA+N-1) := * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * EQUED = 'C': A(IA:IA+N-1,JA:JA+N-1) := * A(IA:IA+N-1,JA:JA+N-1) * diag(C) * EQUED = 'B': A(IA:IA+N-1,JA:JA+N-1) := * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * diag(C). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input or local output) COMPLEX pointer * into the local memory to an array of local dimension * (LLD_AF,LOCc(JA+N-1)). If FACT = 'F', then * AF(IAF:IAF+N-1,JAF:JAF+N-1) is an input argument and on * entry contains the factors L and U from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U as computed by PCGETRF. * If EQUED .ne. 'N', then AF is the factored form of the * equilibrated matrix A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'N', then AF(IAF:IAF+N-1,JAF:JAF+N-1) is an output * argument and on exit returns the factors L and U from the * factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the original * matrix A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'E', then AF(IAF:IAF+N-1,JAF:JAF+N-1) is an output * argument and on exit returns the factors L and U from the * factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the equili- * brated matrix A(IA:IA+N-1,JA:JA+N-1) (see the description of * A(IA:IA+N-1,JA:JA+N-1) for the form of the equilibrated * matrix). * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * IPIV (local input or local output) INTEGER array, dimension * LOCr(M_A)+MB_A. If FACT = 'F', then IPIV is an input argu- * ment and on entry contains the pivot indices from the fac- * torization A(IA:IA+N-1,JA:JA+N-1) = P*L*U as computed by * PCGETRF; IPIV(i) -> The global row local row i was * swapped with. This array must be aligned with * A( IA:IA+N-1, * ). * * If FACT = 'N', then IPIV is an output argument and on exit * contains the pivot indices from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the original matrix * A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'E', then IPIV is an output argument and on exit * contains the pivot indices from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the equilibrated matrix * A(IA:IA+N-1,JA:JA+N-1). * * EQUED (global input or global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration (always true if FACT = 'N'). * = 'R': Row equilibration, i.e., A(IA:IA+N-1,JA:JA+N-1) has * been premultiplied by diag(R). * = 'C': Column equilibration, i.e., A(IA:IA+N-1,JA:JA+N-1) * has been postmultiplied by diag(C). * = 'B': Both row and column equilibration, i.e., * A(IA:IA+N-1,JA:JA+N-1) has been replaced by * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * diag(C). * EQUED is an input variable if FACT = 'F'; otherwise, it is an * output variable. * * R (local input or local output) REAL array, * dimension LOCr(M_A). * The row scale factors for A(IA:IA+N-1,JA:JA+N-1). * If EQUED = 'R' or 'B', A(IA:IA+N-1,JA:JA+N-1) is multiplied * on the left by diag(R); if EQUED='N' or 'C', R is not acces- * sed. R is an input variable if FACT = 'F'; otherwise, R is * an output variable. * If FACT = 'F' and EQUED = 'R' or 'B', each element of R must * be positive. * R is replicated in every process column, and is aligned * with the distributed matrix A. * * C (local input or local output) REAL array, * dimension LOCc(N_A). * The column scale factors for A(IA:IA+N-1,JA:JA+N-1). * If EQUED = 'C' or 'B', A(IA:IA+N-1,JA:JA+N-1) is multiplied * on the right by diag(C); if EQUED = 'N' or 'R', C is not * accessed. C is an input variable if FACT = 'F'; otherwise, * C is an output variable. If FACT = 'F' and EQUED = 'C' or * 'B', each element of C must be positive. * C is replicated in every process row, and is aligned with * the distributed matrix A. * * B (local input/local output) COMPLEX pointer * into the local memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1) ). On entry, the N-by-NRHS right-hand * side matrix B(IB:IB+N-1,JB:JB+NRHS-1). On exit, if * EQUED = 'N', B(IB:IB+N-1,JB:JB+NRHS-1) is not modified; if * TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by * diag(R)*B(IB:IB+N-1,JB:JB+NRHS-1); if TRANS = 'T' or 'C' * and EQUED = 'C' or 'B', B(IB:IB+N-1,JB:JB+NRHS-1) is over- * written by diag(C)*B(IB:IB+N-1,JB:JB+NRHS-1). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input/local output) COMPLEX pointer * into the local memory to an array of local dimension * (LLD_X, LOCc(JX+NRHS-1)). If INFO = 0, the N-by-NRHS * solution matrix X(IX:IX+N-1,JX:JX+NRHS-1) to the original * system of equations. Note that A(IA:IA+N-1,JA:JA+N-1) and * B(IB:IB+N-1,JB:JB+NRHS-1) are modified on exit if * EQUED .ne. 'N', and the solution to the equilibrated system * is inv(diag(C))*X(IX:IX+N-1,JX:JX+NRHS-1) if TRANS = 'N' * and EQUED = 'C' or 'B', or * inv(diag(R))*X(IX:IX+N-1,JX:JX+NRHS-1) if TRANS = 'T' or 'C' * and EQUED = 'R' or 'B'. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * RCOND (global output) REAL * The estimate of the reciprocal condition number of the matrix * A(IA:IA+N-1,JA:JA+N-1) after equilibration (if done). If * RCOND is less than the machine precision (in particular, if * RCOND = 0), the matrix is singular to working precision. * This condition is indicated by a return code of INFO > 0. * * FERR (local output) REAL array, dimension LOCc(N_B) * The estimated forward error bounds for each solution vector * X(j) (the j-th column of the solution matrix * X(IX:IX+N-1,JX:JX+NRHS-1). If XTRUE is the true solution, * FERR(j) bounds the magnitude of the largest entry in * (X(j) - XTRUE) divided by the magnitude of the largest entry * in X(j). The estimate is as reliable as the estimate for * RCOND, and is almost always a slight overestimate of the * true error. FERR is replicated in every process row, and is * aligned with the matrices B and X. * * BERR (local output) REAL array, dimension LOCc(N_B). * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any entry of A(IA:IA+N-1,JA:JA+N-1) or * B(IB:IB+N-1,JB:JB+NRHS-1) that makes X(j) an exact solution). * BERR is replicated in every process row, and is aligned * with the matrices B and X. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = MAX( PCGECON( LWORK ), PCGERFS( LWORK ) ) * + LOCr( N_A ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) REAL array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK = 2*LOCc(N_A). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, and i is * <= N: U(IA+I-1,IA+I-1) is exactly zero. The * factorization has been completed, but the * factor U is exactly singular, so the solution * and error bounds could not be computed. * = N+1: RCOND is less than machine precision. The * factorization has been completed, but the * matrix is singular to working precision, and * the solution and error bounds have not been * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLEQU, EQUIL, LQUERY, NOFACT, NOTRAN, ROWEQU CHARACTER NORM INTEGER CONWRK, I, IACOL, IAROW, IAFROW, IBROW, IBCOL, $ ICOFFA, ICOFFB, ICOFFX, ICTXT, IDUMM, $ IIA, IIB, IIX, $ INFEQU, IROFFA, IROFFAF, IROFFB, $ IROFFX, IXCOL, IXROW, J, JJA, JJB, JJX, $ LCM, LCMQ, $ LRWMIN, LWMIN, MYCOL, MYROW, NP, NPCOL, NPROW, $ NQ, NQB, NRHSQ, RFSWRK REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, $ ROWCND, SMLNUM * .. * .. Local Arrays .. INTEGER CDESC( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ INFOG2L, PCGECON, PCGEEQU, PCGERFS, $ PCGETRF, PCGETRS, PCLACPY, $ PCLAQGE, PSCOPY, PXERBLA, SGEBR2D, $ SGEBS2D, SGAMN2D, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC REAL PSLAMCH, PCLANGE EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC, PCLANGE, $ PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(800+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 8, INFO ) IF( LSAME( FACT, 'F' ) ) $ CALL CHK1MAT( N, 3, N, 3, IAF, JAF, DESCAF, 12, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 20, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IX, JX, DESCX, 24, INFO ) NOFACT = LSAME( FACT, 'N' ) EQUIL = LSAME( FACT, 'E' ) NOTRAN = LSAME( TRANS, 'N' ) IF( NOFACT .OR. EQUIL ) THEN EQUED = 'N' ROWEQU = .FALSE. COLEQU = .FALSE. ELSE ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM END IF IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IIA, JJA, IAROW, IACOL ) NP = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFFA NQB = ICEIL( N+IROFFA, DESCA( NB_ )*NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL CONWRK = 2*NP + 2*NQ + MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQ + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) RFSWRK = 3*NP IF( LSAME( TRANS, 'N' ) ) THEN RFSWRK = RFSWRK + NP + NQ + $ ICEIL( NQB, LCMQ )*DESCA( NB_ ) ELSE IF( LSAME( TRANS, 'T' ).OR.LSAME( TRANS, 'C' ) ) THEN RFSWRK = RFSWRK + NP + NQ END IF LWMIN = MAX( CONWRK, RFSWRK ) LRWMIN = MAX( 2*NQ, NP ) RWORK( 1 ) = REAL( LRWMIN ) IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. $ .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT. LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( ICOFFA.NE.0 .OR. IROFFA.NE.ICOFFA ) THEN INFO = -7 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(800+NB_) ELSE IF( IAFROW.NE.IAROW ) THEN INFO = -10 ELSE IF( IROFFAF.NE.0 ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -(1200+CTXT_) ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN INFO = -13 ELSE IF( ROWEQU ) THEN RCMIN = BIGNUM RCMAX = ZERO DO 10 J = IIA, IIA + NP - 1 RCMIN = MIN( RCMIN, R( J ) ) RCMAX = MAX( RCMAX, R( J ) ) 10 CONTINUE CALL SGAMN2D( ICTXT, 'Columnwise', ' ', 1, 1, RCMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, RCMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( RCMIN.LE.ZERO ) THEN INFO = -14 ELSE IF( N.GT.0 ) THEN ROWCND = MAX( RCMIN, SMLNUM ) / $ MIN( RCMAX, BIGNUM ) ELSE ROWCND = ONE END IF END IF IF( COLEQU .AND. INFO.EQ.0 ) THEN RCMIN = BIGNUM RCMAX = ZERO DO 20 J = JJA, JJA+NQ-1 RCMIN = MIN( RCMIN, C( J ) ) RCMAX = MAX( RCMAX, C( J ) ) 20 CONTINUE CALL SGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, RCMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, RCMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( RCMIN.LE.ZERO ) THEN INFO = -15 ELSE IF( N.GT.0 ) THEN COLCND = MAX( RCMIN, SMLNUM ) / $ MIN( RCMAX, BIGNUM ) ELSE COLCND = ONE END IF END IF END IF END IF * WORK( 1 ) = REAL( LWMIN ) RWORK( 1 ) = REAL( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IBROW.NE.IAROW ) THEN INFO = -18 ELSE IF( IXROW.NE.IBROW ) THEN INFO = -22 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2000+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(2000+CTXT_) ELSE IF( DESCX( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2400+NB_) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -(2400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -29 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -31 END IF IDUM1( 1 ) = ICHAR( FACT ) IDUM2( 1 ) = 1 IDUM1( 2 ) = ICHAR( TRANS ) IDUM2( 2 ) = 2 IF( LSAME( FACT, 'F' ) ) THEN IDUM1( 3 ) = ICHAR( EQUED ) IDUM2( 3 ) = 14 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 29 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 31 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, $ NRHS, 4, IB, JB, DESCB, 20, 5, IDUM1, $ IDUM2, INFO ) ELSE IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 29 IF( LRWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 31 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, $ NRHS, 4, IB, JB, DESCB, 20, 4, IDUM1, $ IDUM2, INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGESVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * IF( EQUIL ) THEN * * Compute row and column scalings to equilibrate the matrix A. * CALL PCGEEQU( N, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, INFEQU ) IF( INFEQU.EQ.0 ) THEN * * Equilibrate the matrix. * CALL PCLAQGE( N, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, EQUED ) ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) END IF END IF * * Scale the right-hand side. * CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, $ JJB, IBROW, IBCOL ) NP = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) IF( MYROW.EQ.IBROW ) $ NP = NP-IROFFB IF( MYCOL.EQ.IBCOL ) $ NRHSQ = NRHSQ-ICOFFB * IF( NOTRAN ) THEN IF( ROWEQU ) THEN DO 40 J = JJB, JJB+NRHSQ-1 DO 30 I = IIB, IIB+NP-1 B( I+( J-1 )*DESCB( LLD_ ) ) = R( I )* $ B( I+( J-1 )*DESCB( LLD_ ) ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( COLEQU ) THEN * * Transpose the Column scale factors * CALL DESCSET( CDESC, 1, N+ICOFFA, 1, DESCA( NB_ ), MYROW, $ IACOL, ICTXT, 1 ) CALL PSCOPY( N, C, 1, JA, CDESC, CDESC( LLD_ ), RWORK, IB, JB, $ DESCB, 1 ) IF( MYCOL.EQ.IBCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, RWORK( IIB ), $ DESCB( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, RWORK( IIB ), $ DESCB( LLD_ ), MYROW, IBCOL ) END IF DO 60 J = JJB, JJB+NRHSQ-1 DO 50 I = IIB, IIB+NP-1 B( I+( J-1 )*DESCB( LLD_ ) ) = RWORK( I )* $ B( I+( J-1 )*DESCB( LLD_ ) ) 50 CONTINUE 60 CONTINUE END IF * IF( NOFACT.OR.EQUIL ) THEN * * Compute the LU factorization of A. * CALL PCLACPY( 'Full', N, N, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF ) CALL PCGETRF( N, N, AF, IAF, JAF, DESCAF, IPIV, INFO ) * * Return if INFO is non-zero. * IF( INFO.NE.0 ) THEN IF( INFO.GT.0 ) $ RCOND = ZERO RETURN END IF END IF * * Compute the norm of the matrix A. * IF( NOTRAN ) THEN NORM = '1' ELSE NORM = 'I' END IF ANORM = PCLANGE( NORM, N, N, A, IA, JA, DESCA, RWORK ) * * Compute the reciprocal of the condition number of A. * CALL PCGECON( NORM, N, AF, IAF, JAF, DESCAF, ANORM, RCOND, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * Return if the matrix is singular to working precision. * IF( RCOND.LT.PSLAMCH( ICTXT, 'Epsilon' ) ) THEN INFO = IA + N RETURN END IF * * Compute the solution matrix X. * CALL PCLACPY( 'Full', N, NRHS, B, IB, JB, DESCB, X, IX, JX, $ DESCX ) CALL PCGETRS( TRANS, N, NRHS, AF, IAF, JAF, DESCAF, IPIV, X, IX, $ JX, DESCX, INFO ) * * Use iterative refinement to improve the computed solution and * compute error bounds and backward error estimates for it. * CALL PCGERFS( TRANS, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, IPIV, B, IB, JB, DESCB, X, IX, JX, DESCX, $ FERR, BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * Transform the solution matrix X to a solution of the original * system. * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) NP = NUMROC( N+IROFFX, DESCX( MB_ ), MYROW, IXROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFFX, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYROW.EQ.IBROW ) $ NP = NP-IROFFX IF( MYCOL.EQ.IBCOL ) $ NRHSQ = NRHSQ-ICOFFX * IF( NOTRAN ) THEN IF( COLEQU ) THEN * * Transpose the column scaling factors * CALL DESCSET( CDESC, 1, N+ICOFFA, 1, DESCA( NB_ ), MYROW, $ IACOL, ICTXT, 1 ) CALL PSCOPY( N, C, 1, JA, CDESC, CDESC( LLD_ ), RWORK, IX, $ JX, DESCX, 1 ) IF( MYCOL.EQ.IBCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, $ RWORK( IIX ), DESCX( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, $ RWORK( IIX ), DESCX( LLD_ ), MYROW, $ IBCOL ) END IF * DO 80 J = JJX, JJX+NRHSQ-1 DO 70 I = IIX, IIX+NP-1 X( I+( J-1 )*DESCX( LLD_ ) ) = RWORK( I )* $ X( I+( J-1 )*DESCX( LLD_ ) ) 70 CONTINUE 80 CONTINUE DO 90 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / COLCND 90 CONTINUE END IF ELSE IF( ROWEQU ) THEN DO 110 J = JJX, JJX+NRHSQ-1 DO 100 I = IIX, IIX+NP-1 X( I+( J-1 )*DESCX( LLD_ ) ) = R( I )* $ X( I+( J-1 )*DESCX( LLD_ ) ) 100 CONTINUE 110 CONTINUE DO 120 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / ROWCND 120 CONTINUE END IF * WORK( 1 ) = REAL( LWMIN ) RWORK( 1 ) = REAL( LRWMIN ) * RETURN * * End of PCGESVX * END scalapack-2.0.2/SRC/pcgetf2.f000644 000766 000024 00000022641 10363532303 016044 0ustar00juliestaff000000 000000 SUBROUTINE PCGETF2( M, N, A, IA, JA, DESCA, IPIV, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCGETF2 computes an LU factorization of a general M-by-N * distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) using * partial pivoting with row interchanges. * * The factorization has the form sub( A ) = P * L * U, where P is a * permutation matrix, L is lower triangular with unit diagonal * elements (lower trapezoidal if m > n), and U is upper triangular * (upper trapezoidal if m < n). * * This is the right-looking Parallel Level 2 BLAS version of the * algorithm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires N <= NB_A-MOD(JA-1, NB_A) and square block * decomposition ( MB_A = NB_A ). * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). * NB_A-MOD(JA-1, NB_A) >= N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ). On exit, this array contains * the local pieces of the factors L and U from the factoriza- * tion sub( A ) = P*L*U; the unit diagonal elements of L are * not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, and division by zero will occur if * it is used to solve a system of equations. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER ROWBTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IIA, IROFF, J, $ JJA, MN, MYCOL, MYROW, NPCOL, NPROW COMPLEX GMAX * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, IGEBR2D, $ IGEBS2D, INFOG2L, PCAMAX, PCGERU, $ PCSCAL, PCSWAP, PB_TOPGET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( N+ICOFF.GT.DESCA( NB_ ) ) THEN INFO = -2 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGETF2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * MN = MIN( M, N ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) * IF( MYCOL.EQ.IACOL ) THEN DO 10 J = JA, JA+MN-1 I = IA + J - JA * * Find pivot and test for singularity. * CALL PCAMAX( M-J+JA, GMAX, IPIV( IIA+J-JA ), A, I, J, $ DESCA, 1 ) IF( GMAX.NE.ZERO ) THEN * * Apply the row interchanges to columns JA:JA+N-1 * CALL PCSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, $ IPIV( IIA+J-JA ), JA, DESCA, DESCA( M_ ) ) * * Compute elements I+1:IA+M-1 of J-th column. * IF( J-JA+1.LT.M ) $ CALL PCSCAL( M-J+JA-1, ONE / GMAX, A, I+1, J, $ DESCA, 1 ) ELSE IF( INFO.EQ.0 ) THEN INFO = J - JA + 1 END IF * * Update trailing submatrix * IF( J-JA+1.LT.MN ) THEN CALL PCGERU( M-J+JA-1, N-J+JA-1, -ONE, A, I+1, J, DESCA, $ 1, A, I, J+1, DESCA, DESCA( M_ ), A, I+1, $ J+1, DESCA ) END IF 10 CONTINUE * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, MN, 1, IPIV( IIA ), $ MN ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, MN, 1, IPIV( IIA ), $ MN, MYROW, IACOL ) * END IF * RETURN * * End of PCGETF2 * END scalapack-2.0.2/SRC/pcgetrf.f000644 000766 000024 00000026446 10363532303 016153 0ustar00juliestaff000000 000000 SUBROUTINE PCGETRF( M, N, A, IA, JA, DESCA, IPIV, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCGETRF computes an LU factorization of a general M-by-N distributed * matrix sub( A ) = (IA:IA+M-1,JA:JA+N-1) using partial pivoting with * row interchanges. * * The factorization has the form sub( A ) = P * L * U, where P is a * permutation matrix, L is lower triangular with unit diagonal ele- * ments (lower trapezoidal if m > n), and U is upper triangular * (upper trapezoidal if m < n). L and U are stored in sub( A ). * * This is the right-looking Parallel Level 3 BLAS version of the * algorithm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be factored. On exit, this * array contains the local pieces of the factors L and U from * the factorization sub( A ) = P*L*U; the unit diagonal ele- * ments of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, and division by zero will occur if * it is used to solve a system of equations. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, COLCTOP, ROWBTOP INTEGER I, ICOFF, ICTXT, IINFO, IN, IROFF, J, JB, JN, $ MN, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMN2D, PCHK1MAT, $ PB_TOPGET, PB_TOPSET, PCGEMM, PCGETF2, $ PCLASWP, PCTRSM, PXERBLA * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 0, IDUM1, $ IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGETRF', -INFO ) RETURN END IF * * Quick return if possible * IF( DESCA( M_ ).EQ.1 ) THEN IPIV( 1 ) = 1 RETURN ELSE IF( M.EQ.0 .OR. N.EQ.0 ) THEN RETURN END IF * * Split-ring topology for the communication along process rows * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', ' ' ) * * Handle the first block of columns separately * MN = MIN( M, N ) IN = MIN( ICEIL( IA, DESCA( MB_ ) )*DESCA( MB_ ), IA+M-1 ) JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+MN-1 ) JB = JN - JA + 1 * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL PCGETF2( M, JB, A, IA, JA, DESCA, IPIV, INFO ) * IF( JB+1.LE.N ) THEN * * Apply interchanges to columns JN+1:JA+N-1. * CALL PCLASWP( 'Forward', 'Rows', N-JB, A, IA, JN+1, DESCA, $ IA, IN, IPIV ) * * Compute block row of U. * CALL PCTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-JB, ONE, A, IA, JA, DESCA, A, IA, JN+1, DESCA ) * IF( JB+1.LE.M ) THEN * * Update trailing submatrix. * CALL PCGEMM( 'No transpose', 'No transpose', M-JB, N-JB, JB, $ -ONE, A, IN+1, JA, DESCA, A, IA, JN+1, DESCA, $ ONE, A, IN+1, JN+1, DESCA ) * END IF END IF * * Loop over the remaining blocks of columns. * DO 10 J = JN+1, JA+MN-1, DESCA( NB_ ) JB = MIN( MN-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL PCGETF2( M-J+JA, JB, A, I, J, DESCA, IPIV, IINFO ) * IF( INFO.EQ.0 .AND. IINFO.GT.0 ) $ INFO = IINFO + J - JA * * Apply interchanges to columns JA:J-JA. * CALL PCLASWP( 'Forward', 'Rowwise', J-JA, A, IA, JA, DESCA, $ I, I+JB-1, IPIV ) * IF( J-JA+JB+1.LE.N ) THEN * * Apply interchanges to columns J+JB:JA+N-1. * CALL PCLASWP( 'Forward', 'Rowwise', N-J-JB+JA, A, IA, J+JB, $ DESCA, I, I+JB-1, IPIV ) * * Compute block row of U. * CALL PCTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-J-JB+JA, ONE, A, I, J, DESCA, A, I, J+JB, $ DESCA ) * IF( J-JA+JB+1.LE.M ) THEN * * Update trailing submatrix. * CALL PCGEMM( 'No transpose', 'No transpose', M-J-JB+JA, $ N-J-JB+JA, JB, -ONE, A, I+JB, J, DESCA, A, $ I, J+JB, DESCA, ONE, A, I+JB, J+JB, DESCA ) * END IF END IF * 10 CONTINUE * IF( INFO.EQ.0 ) $ INFO = MN + 1 CALL IGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, INFO, 1, IDUM1, IDUM2, $ -1, -1, MYCOL ) IF( INFO.EQ.MN+1 ) $ INFO = 0 * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * RETURN * * End of PCGETRF * END scalapack-2.0.2/SRC/pcgetri.f000644 000766 000024 00000035173 10430435051 016150 0ustar00juliestaff000000 000000 SUBROUTINE PCGETRI( N, A, IA, JA, DESCA, IPIV, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7.4) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * v1.7.4: May 10, 2006 * v1.7: May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ), IWORK( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCGETRI computes the inverse of a distributed matrix using the LU * factorization computed by PCGETRF. This method inverts U and then * computes the inverse of sub( A ) = A(IA:IA+N-1,JA:JA+N-1) denoted * InvA by solving the system InvA*L = inv(U) for InvA. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the L and U obtained by the * factorization sub( A ) = P*L*U computed by PCGETRF. On * exit, if INFO = 0, sub( A ) contains the inverse of the * original distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension LOCr(M_A)+MB_A * keeps track of the pivoting information. IPIV(i) is the * global row index the local row i was swapped with. This * array is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = LOCr(N+MOD(IA-1,MB_A))*NB_A. WORK is used to keep a * copy of at most an entire column block of sub( A ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK used as workspace for * physically transposing the pivots. * LIWORK is local input and must be at least * if NPROW == NPCOL then * LIWORK = LOCc( N_A + MOD(JA-1, NB_A) ) + NB_A, * else * LIWORK = LOCc( N_A + MOD(JA-1, NB_A) ) + * MAX( CEIL(CEIL(LOCr(M_A)/MB_A)/(LCM/NPROW)), * NB_A ) * where LCM is the least common multiple of process * rows and columns (NPROW and NPCOL). * end if * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,IA+K-1) is exactly zero; the * matrix is singular and its inverse could not be * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IROFF, IW, J, $ JB, JN, LCM, LIWMIN, LWMIN, MP, MYCOL, MYROW, $ NN, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PCGEMM, PCLACPY, PCLASET, PCLAPIV, $ PCTRSM, PCTRTRI, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(500+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) LWMIN = NP * DESCA( NB_ ) * MP = NUMROC( DESCA( M_ ), DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) NQ = NUMROC( DESCA( N_ ), DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) IF( NPROW.EQ.NPCOL ) THEN LIWMIN = NQ + DESCA( NB_ ) ELSE * * Use the formula for the workspace given in PxLAPIV * to compute the minimum size LIWORK for IWORK * * The formula in PxLAPIV is * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + * MB_P * CEIL( CEIL(LOCr(M_P)/MB_P) / (LCM/NPROW) ) * * where * M_P is the global length of the pivot vector * MP = DESCA( M_ ) + DESCA( MB_ ) * NPROW * I_P is IA * I_P = IA * MB_P is the block size use for the block cyclic distribution of the * pivot vector * MB_P = DESCA (MB_ ) * LOCc ( . ) * NUMROC ( . , DESCA ( NB_ ), MYCOL, DESCA ( CSRC_ ), NPCOL ) * LOCr ( . ) * NUMROC ( . , DESCA ( MB_ ), MYROW, DESCA ( RSRC_ ), NPROW ) * CEIL ( X / Y ) * ICEIL( X, Y ) * LCM * LCM = ILCM( NPROW, NPCOL ) * LCM = ILCM( NPROW, NPCOL ) LIWMIN = NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW $ + MOD ( IA - 1, DESCA( MB_ ) ), DESCA ( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ) + $ MAX ( DESCA( MB_ ) * ICEIL ( ICEIL( $ NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW, $ DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ), $ DESCA( MB_ ) ), LCM / NPROW ), DESCA( NB_ ) ) * END IF * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -4 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(500+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -8 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 8 IF( LIWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGETRI', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Form inv(U). If INFO > 0 from PCTRTRI, then U is singular, * and the inverse is not computed. * CALL PCTRTRI( 'Upper', 'Non-unit', N, A, IA, JA, DESCA, INFO ) IF( INFO.GT.0 ) $ RETURN * * Define array descriptor for working array WORK * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) NN = ( ( JA+N-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1 IACOL = INDXG2P( NN, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCW, N+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, NP ) ) IW = IROFF + 1 * * Solve the equation inv(A)*L=inv(U) for inv(A) using blocked code. * DO 10 J = NN, JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * * Copy current block column of L to WORK and replace with zeros. * CALL PCLACPY( 'Lower', JA+N-1-J, JB, A, I+1, J, DESCA, $ WORK, IW+J-JA+1, 1, DESCW ) CALL PCLASET( 'Lower', JA+N-1-J, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Compute current block column of inv(A). * IF( J+JB.LE.JA+N-1 ) $ CALL PCGEMM( 'No transpose', 'No transpose', N, JB, $ JA+N-J-JB, -ONE, A, IA, J+JB, DESCA, WORK, $ IW+J+JB-JA, 1, DESCW, ONE, A, IA, J, DESCA ) CALL PCTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, $ ONE, WORK, IW+J-JA, 1, DESCW, A, IA, J, DESCA ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle the last block of columns separately * JB = JN-JA+1 * * Copy current block column of L to WORK and replace with zeros. * CALL PCLACPY( 'Lower', N-1, JB, A, IA+1, JA, DESCA, WORK, IW+1, $ 1, DESCW ) CALL PCLASET( 'Lower', N-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Compute current block column of inv(A). * IF( JA+JB.LE.JA+N-1 ) $ CALL PCGEMM( 'No transpose', 'No transpose', N, JB, $ N-JB, -ONE, A, IA, JA+JB, DESCA, WORK, IW+JB, 1, $ DESCW, ONE, A, IA, JA, DESCA ) CALL PCTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, $ ONE, WORK, IW, 1, DESCW, A, IA, JA, DESCA ) * * Use the row pivots and apply them to the columns of the global * matrix. * CALL DESCSET( DESCW, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ MP+DESCA( MB_ ) ) CALL PCLAPIV( 'Backward', 'Columns', 'Column', N, N, A, IA, $ JA, DESCA, IPIV, IA, 1, DESCW, IWORK ) * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PCGETRI * END scalapack-2.0.2/SRC/pcgetrs.f000644 000766 000024 00000026102 10363532303 016155 0ustar00juliestaff000000 000000 SUBROUTINE PCGETRS( TRANS, N, NRHS, A, IA, JA, DESCA, IPIV, B, $ IB, JB, DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) COMPLEX A( * ), B( * ) * .. * * Purpose * ======= * * PCGETRS solves a system of distributed linear equations * * op( sub( A ) ) * X = sub( B ) * * with a general N-by-N distributed matrix sub( A ) using the LU * factorization computed by PCGETRF. * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), op( A ) = A, A**T or A**H * and sub( B ) denotes B(IB:IB+N-1,JB:JB+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block data decomposition ( MB_A=NB_A ). * * Arguments * ========= * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': sub( A ) * X = sub( B ) (No transpose) * = 'T': sub( A )**T * X = sub( B ) (Transpose) * = 'C': sub( A )**H * X = sub( B ) (Conjugate transpose) * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) COMPLEX pointer into the local * memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the factors * L and U from the factorization sub( A ) = P*L*U; the unit * diagonal elements of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * B (local input/local output) COMPLEX pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, the right hand sides * sub( B ). On exit, sub( B ) is overwritten by the solution * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCIP( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ PCLAPIV, PCTRSM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE NOTRAN = LSAME( TRANS, 'N' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1200+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(1200+CTXT_) END IF END IF IF( NOTRAN ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 1 ) = ICHAR( 'T' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, 3, $ IB, JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGETRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * CALL DESCSET( DESCIP, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ DESCA( MB_ ) + NUMROC( DESCA( M_ ), DESCA( MB_ ), $ MYROW, DESCA( RSRC_ ), NPROW ) ) * IF( NOTRAN ) THEN * * Solve sub( A ) * X = sub( B ). * * Apply row interchanges to the right hand sides. * CALL PCLAPIV( 'Forward', 'Row', 'Col', N, NRHS, B, IB, JB, $ DESCB, IPIV, IA, 1, DESCIP, IDUM1 ) * * Solve L*X = sub( B ), overwriting sub( B ) with X. * CALL PCTRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve U*X = sub( B ), overwriting sub( B ) with X. * CALL PCTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) ELSE * * Solve sub( A )' * X = sub( B ). * * Solve U'*X = sub( B ), overwriting sub( B ) with X. * CALL PCTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve L'*X = sub( B ), overwriting sub( B ) with X. * CALL PCTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, ONE, $ A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Apply row interchanges to the solution vectors. * CALL PCLAPIV( 'Backward', 'Row', 'Col', N, NRHS, B, IB, JB, $ DESCB, IPIV, IA, 1, DESCIP, IDUM1 ) * END IF * RETURN * * End of PCGETRS * END scalapack-2.0.2/SRC/pcggqrf.f000644 000766 000024 00000036306 10363532303 016146 0ustar00juliestaff000000 000000 SUBROUTINE PCGGQRF( N, M, P, A, IA, JA, DESCA, TAUA, B, IB, JB, $ DESCB, TAUB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, P * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ), TAUA( * ), TAUB( * ), WORK( * ) * .. * * Purpose * ======= * * PCGGQRF computes a generalized QR factorization of * an N-by-M matrix sub( A ) = A(IA:IA+N-1,JA:JA+M-1) and * an N-by-P matrix sub( B ) = B(IB:IB+N-1,JB:JB+P-1): * * sub( A ) = Q*R, sub( B ) = Q*T*Z, * * where Q is an N-by-N unitary matrix, Z is a P-by-P unitary matrix, * and R and T assume one of the forms: * * if N >= M, R = ( R11 ) M , or if N < M, R = ( R11 R12 ) N, * ( 0 ) N-M N M-N * M * * where R11 is upper triangular, and * * if N <= P, T = ( 0 T12 ) N, or if N > P, T = ( T11 ) N-P, * P-N N ( T21 ) P * P * * where T12 or T21 is upper triangular. * * In particular, if sub( B ) is square and nonsingular, the GQR * factorization of sub( A ) and sub( B ) implicitly gives the QR * factorization of inv( sub( B ) )* sub( A ): * * inv( sub( B ) )*sub( A )= Z'*(inv(T)*R) * * where inv( sub( B ) ) denotes the inverse of the matrix sub( B ), * and Z' denotes the conjugate transpose of matrix Z. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrices sub( A ) and sub( B ). N >= 0. * * M (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). M >= 0. * * P (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( B ). P >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+M-1)). * On entry, the local pieces of the N-by-M distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(N,M) by M * upper trapezoidal matrix R (R is upper triangular if N >= M); * the elements below the diagonal, with the array TAUA, * represent the unitary matrix Q as a product of min(N,M) * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAUA (local output) COMPLEX, array, dimension * LOCc(JA+MIN(N,M)-1). This array contains the scalar factors * TAUA of the elementary reflectors which represent the unitary * matrix Q. TAUA is tied to the distributed matrix A. (see * Further Details). * * B (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+P-1)). * On entry, the local pieces of the N-by-P distributed matrix * sub( B ) which is to be factored. On exit, if N <= P, the * upper triangle of B(IB:IB+N-1,JB+P-N:JB+P-1) contains the * N by N upper triangular matrix T; if N > P, the elements on * and above the (N-P)-th subdiagonal contain the N by P upper * trapezoidal matrix T; the remaining elements, with the array * TAUB, represent the unitary matrix Z as a product of * elementary reflectors (see Further Details). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * TAUB (local output) COMPLEX, array, dimension LOCr(IB+N-1) * This array contains the scalar factors of the elementary * reflectors which represent the unitary matrix Z. TAUB is * tied to the distributed matrix B (see Further Details). * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB_A * ( NpA0 + MqA0 + NB_A ), * MAX( (NB_A*(NB_A-1))/2, (PqB0 + NpB0)*NB_A ) + * NB_A * NB_A, * MB_B * ( NpB0 + PqB0 + MB_B ) ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * NpB0 = NUMROC( N+IROFFB, MB_B, MYROW, IBROW, NPROW ), * PqB0 = NUMROC( P+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(n,m). * * Each H(i) has the form * * H(i) = I - taua * v * v' * * where taua is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in * A(ia+i:ia+n-1,ja+i-1), and taua in TAUA(ja+i-1). * To form Q explicitly, use ScaLAPACK subroutine PCUNGQR. * To use Q to update another matrix, use ScaLAPACK subroutine PCUNMQR. * * The matrix Z is represented as a product of elementary reflectors * * Z = H(ib)' H(ib+1)' . . . H(ib+k-1)', where k = min(n,p). * * Each H(i) has the form * * H(i) = I - taub * v * v' * * where taub is a complex scalar, and v is a complex vector with * v(p-k+i+1:p) = 0 and v(p-k+i) = 1; conjg(v(1:p-k+i-1)) is stored on * exit in B(ib+n-k+i-1,jb:jb+p-k+i-2), and taub in TAUB(ib+n-k+i-1). * To form Z explicitly, use ScaLAPACK subroutine PCUNGRQ. * To use Z to update another matrix, use ScaLAPACK subroutine PCUNMRQ. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ) and sub( B ) must verify some * alignment properties, namely the following expression should be true: * * ( MB_A.EQ.MB_B .AND. IROFFA.EQ.IROFFB .AND. IAROW.EQ.IBROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, LWMIN, MQA0, MYCOL, $ MYROW, NPA0, NPB0, NPCOL, NPROW, PQB0 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCGEQRF, PCGERQF, $ PCHK2MAT, PCUNMQR, PXERBLA * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, INT, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -707 ELSE CALL CHK1MAT( N, 1, M, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 1, P, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) NPB0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) PQB0 = NUMROC( P+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) LWMIN = MAX( DESCA( NB_ ) * ( NPA0 + MQA0 + DESCA( NB_ ) ), $ MAX( MAX( ( DESCA( NB_ )*( DESCA( NB_ ) - 1 ) ) / 2, $ ( PQB0 + NPB0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ), $ DESCB( MB_ ) * ( NPB0 + PQB0 + DESCB( MB_ ) ) ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IAROW.NE.IBROW .OR. IROFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -1203 ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -1207 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 15 CALL PCHK2MAT( N, 1, M, 2, IA, JA, DESCA, 7, N, 1, P, 3, IB, $ JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGGQRF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * QR factorization of N-by-M matrix sub( A ): sub( A ) = Q*R * CALL PCGEQRF( N, M, A, IA, JA, DESCA, TAUA, WORK, LWORK, INFO ) LWMIN = INT( WORK( 1 ) ) * * Update sub( B ) := Q'*sub( B ). * CALL PCUNMQR( 'Left', 'Conjugate Transpose', N, P, MIN( N, M ), A, $ IA, JA, DESCA, TAUA, B, IB, JB, DESCB, WORK, LWORK, $ INFO ) LWMIN = MIN( LWMIN, INT( WORK( 1 ) ) ) * * RQ factorization of N-by-P matrix sub( B ): sub( B ) = T*Z. * CALL PCGERQF( N, P, B, IB, JB, DESCB, TAUB, WORK, LWORK, INFO ) WORK( 1 ) = CMPLX( REAL( MAX( LWMIN, INT( WORK( 1 ) ) ) ) ) * RETURN * * End of PCGGQRF * END scalapack-2.0.2/SRC/pcggrqf.f000644 000766 000024 00000036314 10363532303 016145 0ustar00juliestaff000000 000000 SUBROUTINE PCGGRQF( M, P, N, A, IA, JA, DESCA, TAUA, B, IB, JB, $ DESCB, TAUB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, P * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ), TAUA( * ), TAUB( * ), WORK( * ) * .. * * Purpose * ======= * * PCGGRQF computes a generalized RQ factorization of * an M-by-N matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) * and a P-by-N matrix sub( B ) = B(IB:IB+P-1,JB:JB+N-1): * * sub( A ) = R*Q, sub( B ) = Z*T*Q, * * where Q is an N-by-N unitary matrix, Z is a P-by-P unitary matrix, * and R and T assume one of the forms: * * if M <= N, R = ( 0 R12 ) M, or if M > N, R = ( R11 ) M-N, * N-M M ( R21 ) N * N * * where R12 or R21 is upper triangular, and * * if P >= N, T = ( T11 ) N , or if P < N, T = ( T11 T12 ) P, * ( 0 ) P-N P N-P * N * * where T11 is upper triangular. * * In particular, if sub( B ) is square and nonsingular, the GRQ * factorization of sub( A ) and sub( B ) implicitly gives the RQ * factorization of sub( A )*inv( sub( B ) ): * * sub( A )*inv( sub( B ) ) = (R*inv(T))*Z' * * where inv( sub( B ) ) denotes the inverse of the matrix sub( B ), * and Z' denotes the conjugate transpose of matrix Z. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of * rows of the distributed submatrix sub( A ). M >= 0. * * P (global input) INTEGER * The number of rows to be operated on i.e the number of * rows of the distributed submatrix sub( B ). P >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrices sub( A ) and sub( B ). * N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAUA, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAUA (local output) COMPLEX, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUA is * tied to the distributed matrix A (see Further Details). * * B (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+N-1)). * On entry, the local pieces of the P-by-N distributed matrix * sub( B ) which is to be factored. On exit, the elements on * and above the diagonal of sub( B ) contain the min(P,N) by N * upper trapezoidal matrix T (T is upper triangular if P >= N); * the elements below the diagonal, with the array TAUB, * represent the unitary matrix Z as a product of elementary * reflectors (see Further Details). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * TAUB (local output) COMPLEX, array, dimension * LOCc(JB+MIN(P,N)-1). This array contains the scalar factors * TAUB of the elementary reflectors which represent the unitary * matrix Z. TAUB is tied to the distributed matrix B (see * Further Details). * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( MB_A * ( MpA0 + NqA0 + MB_A ), * MAX( (MB_A*(MB_A-1))/2, (PpB0 + NqB0)*MB_A ) + * MB_A * MB_A, * NB_B * ( PpB0 + NqB0 + NB_B ) ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * PpB0 = NUMROC( P+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NqB0 = NUMROC( N+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia)' H(ia+1)' . . . H(ia+k-1)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - taua * v * v' * * where taua is a complex scalar, and v is a complex vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; conjg(v(1:n-k+i-1)) is stored on * exit in A(ia+m-k+i-1,ja:ja+n-k+i-2), and taua in TAUA(ia+m-k+i-1). * To form Q explicitly, use ScaLAPACK subroutine PCUNGRQ. * To use Q to update another matrix, use ScaLAPACK subroutine PCUNMRQ. * * The matrix Z is represented as a product of elementary reflectors * * Z = H(jb) H(jb+1) . . . H(jb+k-1), where k = min(p,n). * * Each H(i) has the form * * H(i) = I - taub * v * v' * * where taub is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:p) is stored on exit in * B(ib+i:ib+p-1,jb+i-1), and taub in TAUB(jb+i-1). * To form Z explicitly, use ScaLAPACK subroutine PCUNGQR. * To use Z to update another matrix, use ScaLAPACK subroutine PCUNMQR. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ) and sub( B ) must verify some * alignment properties, namely the following expression should be true: * * ( NB_A.EQ.NB_B .AND. ICOFFA.EQ.ICOFFB .AND. IACOL.EQ.IBCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. Local Scalars .. LOGICAL LQUERY INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, LWMIN, MPA0, MYCOL, $ MYROW, NPCOL, NPROW, NQA0, NQB0, PPB0 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCGEQRF, PCGERQF, $ PCHK2MAT, PCUNMRQ, PXERBLA * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, INT, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -707 ELSE CALL CHK1MAT( M, 1, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( P, 2, N, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) PPB0 = NUMROC( P+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) NQB0 = NUMROC( N+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) LWMIN = MAX( DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ), $ MAX( MAX( ( DESCA( MB_ )*( DESCA( MB_ ) - 1 ) ) / 2, $ ( PPB0 + NQB0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ), $ DESCB( NB_ ) * ( PPB0 + NQB0 + DESCB( NB_ ) ) ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IACOL.NE.IBCOL .OR. ICOFFA.NE.ICOFFB ) THEN INFO = -11 ELSE IF( DESCA( NB_ ).NE.DESCB( NB_ ) ) THEN INFO = -1204 ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -1207 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 15 CALL PCHK2MAT( M, 1, N, 3, IA, JA, DESCA, 7, P, 2, N, 3, IB, $ JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCGGRQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * RQ factorization of M-by-N matrix sub( A ): sub( A ) = R*Q * CALL PCGERQF( M, N, A, IA, JA, DESCA, TAUA, WORK, LWORK, INFO ) LWMIN = INT( WORK( 1 ) ) * * Update sub( B ) := sub( B )*Q' * CALL PCUNMRQ( 'Right', 'Conjugate Transpose', P, N, MIN( M, N ), $ A, MAX( IA, IA+M-N ), JA, DESCA, TAUA, B, IB, JB, $ DESCB, WORK, LWORK, INFO ) LWMIN = MAX( LWMIN, INT( WORK( 1 ) ) ) * * QR factorization of P-by-N matrix sub( B ): sub( B ) = Z*T * CALL PCGEQRF( P, N, B, IB, JB, DESCB, TAUB, WORK, LWORK, INFO ) WORK( 1 ) = CMPLX( REAL( MAX( LWMIN, INT( WORK( 1 ) ) ) ) ) * RETURN * * End of PCGGRQF * END scalapack-2.0.2/SRC/pcheev.f000644 000766 000024 00000055532 11640165744 016003 0ustar00juliestaff000000 000000 SUBROUTINE PCHEEV( JOBZ, UPLO, N, A, IA, JA, DESCA, W, Z, IZ, JZ, $ DESCZ, WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * .. Scalar Arguments .. CHARACTER JOBZ, UPLO INTEGER IA, INFO, IZ, JA, JZ, LRWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) REAL RWORK( * ), W( * ) COMPLEX A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PCHEEV computes selected eigenvalues and, optionally, eigenvectors * of a real Hermitian matrix A by calling the recommended sequence * of ScaLAPACK routines. * * In its present form, PCHEEV assumes a homogeneous system and makes * only spot checks of the consistency of the eigenvalues across the * different processes. Because of this, it is possible that a * heterogeneous system may return incorrect results without any error * messages. * * Notes * ===== * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distributed * matrix A. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of A. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of A. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of the * distributed matrix A. * LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic COMPLEX array, * global dimension (N, N), local dimension ( LLD_A, * LOCc(JA+N-1) ) * * On entry, the Hermitian matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the Hermitian matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * Hermitian matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PCHEEV cannot guarantee * correct error reporting. * * W (global output) REAL array, dimension (N) * If INFO=0, the eigenvalues in ascending order. * * Z (local output) COMPLEX array, * global dimension (N, N), * local dimension (LLD_Z, LOCc(JZ+N-1)) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX array, * dimension (LWORK) * On output, WORK(1) returns the workspace needed to guarantee * completion. If the input parameters are incorrect, WORK(1) * may also be incorrect. * * If JOBZ='N' WORK(1) = minimal workspace for eigenvalues only. * If JOBZ='V' WORK(1) = minimal workspace required to * generate all the eigenvectors. * * * LWORK (local input) INTEGER * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= MAX( NB*( NP0+1 ), 3 ) +3*N * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required: * LWORK >= (NP0 + NQ0 + NB)*NB + 3*N + N^2 * * Variable definitions: * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * NQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) * * If LWORK = -1, the LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the WORK array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * RWORK (local workspace/output) COMPLEX array, * dimension (LRWORK) * On output RWORK(1) returns the * REAL workspace needed to * guarantee completion. If the input parameters are incorrect, * RWORK(1) may also be incorrect. * * LRWORK (local input) INTEGER * Size of RWORK array. * If eigenvectors are desired (JOBZ = 'V') then * LRWORK >= 2*N + 2*N-2 * If eigenvectors are not desired (JOBZ = 'N') then * LRWORK >= 2*N * * If LRWORK = -1, the LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the RWORK array. The required workspace is returned * as the first element of RWORK and no error message is issued * by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = 1 through N, the i(th) eigenvalue did not * converge in CSTEQR2 after a total of 30*N iterations. * If INFO = N+1, then PCHEEV has detected heterogeneity * by finding that eigenvalues were not identical across * the process grid. In this case, the accuracy of * the results from PCHEEV cannot be guaranteed. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * ( MB_A.EQ.NB_A.EQ.MB_Z .AND. IROFFA.EQ.IROFFZ .AND. IROFFA.EQ.0 .AND. * IAROW.EQ.IZROW ) * where * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * Version 1.4 limitations: * DESCA(MB_) = DESCA(NB_) * DESCA(M_) = DESCZ(M_) * DESCA(N_) = DESCZ(N_) * DESCA(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) INTEGER ITHVAL PARAMETER ( ITHVAL = 10 ) * .. * .. Local Scalars .. LOGICAL LOWER, WANTZ INTEGER CONTEXTC, CSRC_A, I, IACOL, IAROW, ICOFFA, $ IINFO, INDD, INDE, INDRD, INDRE, INDRWORK, $ INDTAU, INDWORK, INDWORK2, IROFFA, IROFFZ, $ ISCALE, IZROW, J, K, LDC, LLRWORK, LLWORK, $ LRMIN, LRWMIN, LWMIN, MB_A, MB_Z, MYCOL, $ MYPCOLC, MYPROWC, MYROW, NB, NB_A, NB_Z, NP0, $ NPCOL, NPCOLC, NPROCS, NPROW, NPROWC, NQ0, NRC, $ RSIZECSTEQR2, RSRC_A, RSRC_Z, SIZECSTEQR2, $ SIZEPCHETRD, SIZEPCUNMTR REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, $ SMLNUM * .. * .. Local Arrays .. INTEGER DESCQR( 10 ), IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC, SL_GRIDRESHAPE REAL PCLANHE, PSLAMCH EXTERNAL LSAME, INDXG2P, NUMROC, SL_GRIDRESHAPE, $ PCLANHE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, CHK1MAT, $ CSTEQR2, DESCINIT, PCELGET, PCGEMR2D, PCHETRD, $ PCHK1MAT, PCHK2MAT, PCLASCL, PCLASET, PCUNMTR, $ PXERBLA, SCOPY, SGAMN2D, SGAMX2D, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, ICHAR, INT, MAX, MIN, MOD, REAL, $ SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return * IF( N.EQ.0 ) $ RETURN * * Test the input arguments. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 * * Initialize pointer to some safe value * INDTAU = 1 INDD = 1 INDE = 1 INDWORK = 1 INDWORK2 = 1 * INDRE = 1 INDRD = 1 INDRWORK = 1 * WANTZ = LSAME( JOBZ, 'V' ) IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE IF( WANTZ ) THEN IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) END IF END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 3, N, 3, IZ, JZ, DESCZ, 12, INFO ) * IF( INFO.EQ.0 ) THEN * * Get machine constants. * SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe minimum' ) EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * NPROCS = NPROW*NPCOL NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A LOWER = LSAME( UPLO, 'L' ) * RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( 1, NB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, MB_A, MYCOL, CSRC_A, NPCOL ) NP0 = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) IF( WANTZ ) THEN NB_Z = DESCZ( NB_ ) MB_Z = DESCZ( MB_ ) RSRC_Z = DESCZ( RSRC_ ) IROFFZ = MOD( IZ-1, MB_A ) IZROW = INDXG2P( 1, NB_A, MYROW, RSRC_Z, NPROW ) ELSE IROFFZ = 0 IZROW = 0 END IF * * COMPLEX work space for PCHETRD * CALL PCHETRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDE ), WORK( INDTAU ), $ WORK( INDWORK ), -1, IINFO ) SIZEPCHETRD = INT( ABS( WORK( 1 ) ) ) * * COMPLEX work space for PCUNMTR * IF( WANTZ ) THEN CALL PCUNMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), -1, IINFO ) SIZEPCUNMTR = INT( ABS( WORK( 1 ) ) ) ELSE SIZEPCUNMTR = 0 END IF * * REAL work space for CSTEQR2 * IF( WANTZ ) THEN RSIZECSTEQR2 = MIN( 1, 2*N-2 ) ELSE RSIZECSTEQR2 = 0 END IF * * Initialize the context of the single column distributed * matrix required by CSTEQR2. This specific distribution * allows each process to do 1/pth of the work updating matrix * Q during CSTEQR2 and achieve some parallelization to an * otherwise serial subroutine. * LDC = 0 IF( WANTZ ) THEN CONTEXTC = SL_GRIDRESHAPE( DESCA( CTXT_ ), 0, 1, 1, $ NPROCS, 1 ) CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, $ MYPCOLC ) NRC = NUMROC( N, NB_A, MYPROWC, 0, NPROCS ) LDC = MAX( 1, NRC ) CALL DESCINIT( DESCQR, N, N, NB, NB, 0, 0, CONTEXTC, LDC, $ INFO ) END IF * * COMPLEX work space for CSTEQR2 * IF( WANTZ ) THEN SIZECSTEQR2 = N*LDC ELSE SIZECSTEQR2 = 0 END IF * * Set up pointers into the WORK array * INDTAU = 1 INDD = INDTAU + N INDE = INDD + N INDWORK = INDE + N INDWORK2 = INDWORK + N*LDC LLWORK = LWORK - INDWORK + 1 * * Set up pointers into the RWORK array * INDRE = 1 INDRD = INDRE + N INDRWORK = INDRD + N LLRWORK = LRWORK - INDRWORK + 1 * * Compute the total amount of space needed * LRWMIN = 2*N + RSIZECSTEQR2 LWMIN = 3*N + MAX( SIZEPCHETRD, SIZEPCUNMTR, SIZECSTEQR2 ) * END IF IF( INFO.EQ.0 ) THEN IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -14 ELSE IF( LRWORK.LT.LRWMIN .AND. LRWORK.NE.-1 ) THEN INFO = -16 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( WANTZ ) THEN IF( IROFFA.NE.IROFFZ ) THEN INFO = -10 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -10 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 1200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 1200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 1200+RSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) END IF END IF END IF IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 3 IF( WANTZ ) THEN CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IZ, $ JZ, DESCZ, 12, 3, IDUM1, IDUM2, INFO ) ELSE CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 3, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = CMPLX( LWMIN ) RWORK( 1 ) = REAL( LRWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PCHEEV', -INFO ) IF( WANTZ ) $ CALL BLACS_GRIDEXIT( CONTEXTC ) RETURN ELSE IF( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) THEN IF( WANTZ ) $ CALL BLACS_GRIDEXIT( CONTEXTC ) RETURN END IF * * Scale matrix to allowable range, if necessary. * ISCALE = 0 * ANRM = PCLANHE( 'M', UPLO, N, A, IA, JA, DESCA, $ RWORK( INDRWORK ) ) * * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM END IF * IF( ISCALE.EQ.1 ) THEN CALL PCLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) END IF * * Reduce Hermitian matrix to tridiagonal form. * CALL PCHETRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * Copy the values of D, E to all processes. * DO 10 I = 1, N CALL PCELGET( 'A', ' ', WORK( INDD+I-1 ), A, I+IA-1, I+JA-1, $ DESCA ) RWORK( INDRD+I-1 ) = REAL( WORK( INDD+I-1 ) ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PCELGET( 'A', ' ', WORK( INDE+I-1 ), A, I+IA-1, I+JA, $ DESCA ) RWORK( INDRE+I-1 ) = REAL( WORK( INDE+I-1 ) ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PCELGET( 'A', ' ', WORK( INDE+I-1 ), A, I+IA, I+JA-1, $ DESCA ) RWORK( INDRE+I-1 ) = REAL( WORK( INDE+I-1 ) ) 30 CONTINUE END IF * IF( WANTZ ) THEN * CALL PCLASET( 'Full', N, N, CZERO, CONE, WORK( INDWORK ), 1, 1, $ DESCQR ) * * CSTEQR2 is a modified version of LAPACK's CSTEQR. The * modifications allow each process to perform partial updates * to matrix Q. * CALL CSTEQR2( 'I', N, RWORK( INDRD ), RWORK( INDRE ), $ WORK( INDWORK ), LDC, NRC, RWORK( INDRWORK ), $ INFO ) * CALL PCGEMR2D( N, N, WORK( INDWORK ), 1, 1, DESCQR, Z, IA, JA, $ DESCZ, CONTEXTC ) * CALL PCUNMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) * ELSE * CALL CSTEQR2( 'N', N, RWORK( INDRD ), RWORK( INDRE ), $ WORK( INDWORK ), 1, 1, RWORK( INDRWORK ), INFO ) END IF * * Copy eigenvalues from workspace to output array * CALL SCOPY( N, RWORK( INDD ), 1, W, 1 ) * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL SSCAL( N, ONE / SIGMA, W, 1 ) END IF * WORK( 1 ) = REAL( LWMIN ) * * Free up resources * IF( WANTZ ) THEN CALL BLACS_GRIDEXIT( CONTEXTC ) END IF * * Compare every ith eigenvalue, or all if there are only a few, * across the process grid to check for heterogeneity. * IF( N.LE.ITHVAL ) THEN J = N K = 1 ELSE J = N / ITHVAL K = ITHVAL END IF * LRMIN = INT( RWORK( 1 ) ) INDTAU = 0 INDE = INDTAU + J DO 40 I = 1, J RWORK( I+INDTAU ) = W( ( I-1 )*K+1 ) RWORK( I+INDE ) = W( ( I-1 )*K+1 ) 40 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'All', ' ', J, 1, RWORK( 1+INDTAU ), $ J, 1, 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'All', ' ', J, 1, RWORK( 1+INDE ), $ J, 1, 1, -1, -1, 0 ) * DO 50 I = 1, J IF( INFO.EQ.0 .AND. ( RWORK( I+INDTAU )-RWORK( I+INDE ).NE. $ ZERO ) ) THEN INFO = N + 1 END IF 50 CONTINUE RWORK( 1 ) = LRMIN * RETURN * * End of PCHEEV * END scalapack-2.0.2/SRC/pcheevd.f000644 000766 000024 00000037262 10363532303 016135 0ustar00juliestaff000000 000000 SUBROUTINE PCHEEVD( JOBZ, UPLO, N, A, IA, JA, DESCA, W, Z, IZ, JZ, $ DESCZ, WORK, LWORK, RWORK, LRWORK, IWORK, $ LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 25, 2002 * * .. Scalar Arguments .. CHARACTER JOBZ, UPLO INTEGER IA, INFO, IZ, JA, JZ, LIWORK, LRWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) REAL RWORK( * ), W( * ) COMPLEX A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PCHEEVD computes all the eigenvalues and eigenvectors of a Hermitian * matrix A by using a divide and conquer algorithm. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; (NOT IMPLEMENTED YET) * = 'V': Compute eigenvalues and eigenvectors. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic COMPLEX array, * global dimension (N, N), local dimension ( LLD_A, * LOCc(JA+N-1) ) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PCHEEVD cannot guarantee * correct error reporting. * * W (global output) REAL array, dimension (N) * If INFO=0, the eigenvalues in ascending order. * * Z (local output) COMPLEX array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * Z contains the orthonormal eigenvectors of the matrix A. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX array, * dimension (LWORK) * On output, WORK(1) returns the workspace needed for the * computation. * * LWORK (local input) INTEGER * If eigenvectors are requested: * LWORK = N + ( NP0 + MQ0 + NB ) * NB, * with NP0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine calculates the size for all * work arrays. Each of these values is returned in the first * entry of the corresponding work array, and no error message * is issued by PXERBLA. * * RWORK (local workspace/output) REAL array, * dimension (LRWORK) * On output RWORK(1) returns the real workspace needed to * guarantee completion. If the input parameters are incorrect, * RWORK(1) may also be incorrect. * * LRWORK (local input) INTEGER * Size of RWORK array. * LRWORK >= 1 + 9*N + 3*NP*NQ, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) * NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * On output IWORK(1) returns the integer workspace needed. * * LIWORK (input) INTEGER * The dimension of the array IWORK. * LIWORK = 7*N + 8*NPCOL + 2 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = 1 through N, the i(th) eigenvalue did not * converge in PSLAED3. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ), sub( Z ) must verify * some alignment properties, namely the following expression * should be true: * ( MB_A.EQ.NB_A.EQ.MB_Z.EQ.NB_Z .AND. IROFFA.EQ.ICOFFA .AND. * IROFFA.EQ.0 .AND.IROFFA.EQ.IROFFZ. AND. IAROW.EQ.IZROW) * with IROFFA = MOD( IA-1, MB_A ) * and ICOFFA = MOD( JA-1, NB_A ). * * Further Details * ======= ======= * * Contributed by Francoise Tisseur, University of Manchester. * * Reference: F. Tisseur and J. Dongarra, "A Parallel Divide and * Conquer Algorithm for the Symmetric Eigenvalue Problem * on Distributed Memory Architectures", * SIAM J. Sci. Comput., 6:20 (1999), pp. 2223--2236. * (see also LAPACK Working Note 132) * http://www.netlib.org/lapack/lawns/lawn132.ps * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LOWER, LQUERY INTEGER CSRC_A, I, IACOL, IAROW, ICOFFA, IINFO, IIZ, $ INDD, INDE, INDE2, INDRWORK, INDTAU, INDWORK, $ INDZ, IPR, IPZ, IROFFA, IROFFZ, ISCALE, IZCOL, $ IZROW, J, JJZ, LDR, LDZ, LIWMIN, LLRWORK, $ LLWORK, LRWMIN, LWMIN, MB_A, MYCOL, MYROW, NB, $ NB_A, NN, NP0, NPCOL, NPROW, NQ, NQ0, OFFSET, $ RSRC_A REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, $ SMLNUM * .. * .. Local Arrays .. INTEGER DESCRZ( 9 ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC REAL PCLANHE, PSLAMCH EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC, PCLANHE, $ PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCINIT, INFOG2L, $ PCELGET, PCHETRD, PCHK2MAT, PCLASCL, PCLASET, $ PCUNMTR, PSLARED1D, PSLASET, PSSTEDC, PXERBLA, $ SSCAL * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 * * Quick return * IF( N.EQ.0 ) $ RETURN * * Test the input arguments. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IZ, JZ, DESCZ, 12, INFO ) IF( INFO.EQ.0 ) THEN LOWER = LSAME( UPLO, 'L' ) NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( IA, NB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( JA, MB_A, MYCOL, CSRC_A, NPCOL ) NP0 = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ0 = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) IROFFZ = MOD( IZ-1, MB_A ) CALL INFOG2L( IZ, JZ, DESCZ, NPROW, NPCOL, MYROW, MYCOL, $ IIZ, JJZ, IZROW, IZCOL ) LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * * Compute the total amount of space needed * NN = MAX( N, NB, 2 ) NQ = NUMROC( NN, NB, 0, 0, NPCOL ) LWMIN = N + ( NP0+NQ+NB )*NB LRWMIN = 1 + 9*N + 3*NP0*NQ0 LIWMIN = 7*N + 8*NPCOL + 2 WORK( 1 ) = CMPLX( LWMIN ) RWORK( 1 ) = REAL( LRWMIN ) IWORK( 1 ) = LIWMIN IF( .NOT.LSAME( JOBZ, 'V' ) ) THEN INFO = -1 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -14 ELSE IF( LRWORK.LT.LRWMIN .AND. LRWORK.NE.-1 ) THEN INFO = -16 ELSE IF( IROFFA.NE.0 ) THEN INFO = -4 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFA.NE.IROFFZ ) THEN INFO = -10 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -10 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 1200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 1200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 1200+RSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) END IF END IF IF( LOWER ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'U' ) END IF IDUM2( 1 ) = 2 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 14 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IZ, $ JZ, DESCZ, 12, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PCHEEVD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Get machine constants. * SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe minimum' ) EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * * Set up pointers into the WORK array * INDTAU = 1 INDWORK = INDTAU + N LLWORK = LWORK - INDWORK + 1 * * Set up pointers into the RWORK array * INDE = 1 INDD = INDE + N INDE2 = INDD + N INDRWORK = INDE2 + N LLRWORK = LRWORK - INDRWORK + 1 * * Scale matrix to allowable range, if necessary. * ISCALE = 0 * ANRM = PCLANHE( 'M', UPLO, N, A, IA, JA, DESCA, $ RWORK( INDRWORK ) ) * * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM END IF * IF( ISCALE.EQ.1 ) THEN CALL PCLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) END IF * * Reduce Hermitian matrix to tridiagonal form. * CALL PCHETRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDE2 ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * Copy the values of D, E to all processes * * Here PxLARED1D is used to redistribute the tridiagonal matrix. * PxLARED1D, however, doesn't yet work with arbritary matrix * distributions so we have PxELGET as a backup. * OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. RSRC_A.EQ.0 .AND. CSRC_A.EQ.0 ) $ THEN CALL PSLARED1D( N, IA, JA, DESCA, RWORK( INDD ), W, $ RWORK( INDRWORK ), LLRWORK ) * CALL PSLARED1D( N, IA, JA, DESCA, RWORK( INDE2 ), $ RWORK( INDE ), RWORK( INDRWORK ), LLRWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PCELGET( 'A', ' ', WORK( INDWORK ), A, I+IA-1, I+JA-1, $ DESCA ) W( I ) = REAL( WORK( INDWORK ) ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PCELGET( 'A', ' ', WORK( INDWORK ), A, I+IA-1, I+JA, $ DESCA ) RWORK( INDE+I-1 ) = REAL( WORK( INDWORK ) ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PCELGET( 'A', ' ', WORK( INDWORK ), A, I+IA, I+JA-1, $ DESCA ) RWORK( INDE+I-1 ) = REAL( WORK( INDWORK ) ) 30 CONTINUE END IF END IF * * Call PSSTEDC to compute eigenvalues and eigenvectors. * INDZ = INDE + N INDRWORK = INDZ + NP0*NQ0 LLRWORK = LRWORK - INDRWORK + 1 LDR = MAX( 1, NP0 ) CALL DESCINIT( DESCRZ, DESCZ( M_ ), DESCZ( N_ ), DESCZ( MB_ ), $ DESCZ( NB_ ), DESCZ( RSRC_ ), DESCZ( CSRC_ ), $ DESCZ( CTXT_ ), LDR, INFO ) CALL PCLASET( 'Full', N, N, CZERO, CONE, Z, IZ, JZ, DESCZ ) CALL PSLASET( 'Full', N, N, ZERO, ONE, RWORK( INDZ ), 1, 1, $ DESCRZ ) CALL PSSTEDC( 'I', N, W, RWORK( INDE+OFFSET ), RWORK( INDZ ), IZ, $ JZ, DESCRZ, RWORK( INDRWORK ), LLRWORK, IWORK, $ LIWORK, IINFO ) * LDZ = DESCZ( LLD_ ) LDR = DESCRZ( LLD_ ) IIZ = INDXG2L( IZ, NB, MYROW, MYROW, NPROW ) JJZ = INDXG2L( JZ, NB, MYCOL, MYCOL, NPCOL ) IPZ = IIZ + ( JJZ-1 )*LDZ IPR = INDZ - 1 + IIZ + ( JJZ-1 )*LDR DO 50 J = 0, NQ0 - 1 DO 40 I = 0, NP0 - 1 Z( IPZ+I+J*LDZ ) = RWORK( IPR+I+J*LDR ) 40 CONTINUE 50 CONTINUE * * Z = Q * Z * CALL PCUNMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, WORK( INDWORK ), $ LLWORK, IINFO ) * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL SSCAL( N, ONE / SIGMA, W, 1 ) END IF * WORK( 1 ) = CMPLX( LWMIN ) RWORK( 1 ) = REAL( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PCHEEVD * END scalapack-2.0.2/SRC/pcheevr.f000644 000766 000024 00000133745 11750130340 016152 0ustar00juliestaff000000 000000 SUBROUTINE PCHEEVR( JOBZ, RANGE, UPLO, N, A, IA, JA, $ DESCA, VL, VU, IL, IU, M, NZ, W, Z, IZ, $ JZ, DESCZ, $ WORK, LWORK, RWORK, LRWORK, IWORK, LIWORK, $ INFO ) IMPLICIT NONE * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, INFO, IU, IZ, JA, JZ, LIWORK, LRWORK, $ LWORK, M, N, NZ REAL VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) REAL W( * ), RWORK( * ) COMPLEX A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PCHEEVR computes selected eigenvalues and, optionally, eigenvectors * of a complex Hermitian matrix A distributed in 2D blockcyclic format * by calling the recommended sequence of ScaLAPACK routines. * * First, the matrix A is reduced to real symmetric tridiagonal form. * Then, the eigenproblem is solved using the parallel MRRR algorithm. * Last, if eigenvectors have been computed, a backtransformation is done. * * Upon successful completion, each processor stores a copy of all computed * eigenvalues in W. The eigenvector matrix Z is stored in * 2D blockcyclic format distributed over all processors. * * For constructive feedback and comments, please contact cvoemel@lbl.gov * C. Voemel * * * Arguments * ========= * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0 * * A (local input/workspace) 2D block cyclic COMPLEX array, * global dimension (N, N), * local dimension ( LLD_A, LOCc(JA+N-1) ) * (see Notes below for more detailed explanation of 2d arrays) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * (The ScaLAPACK descriptor length is DLEN_ = 9.) * The array descriptor for the distributed matrix A. * The descriptor stores details about the 2D block-cyclic * storage, see the notes below. * If DESCA is incorrect, PCHEEVR cannot work correctly. * Also note the array alignment requirements specified below * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A'. * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M * * W (global output) REAL array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * Z (local output) COMPLEX array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX array, * dimension (LWORK) * WORK(1) returns workspace adequate workspace to allow * optimal performance. * * LWORK (local input) INTEGER * Size of WORK array, must be at least 3. * If only eigenvalues are requested: * LWORK >= N + MAX( NB * ( NP00 + 1 ), NB * 3 ) * If eigenvectors are requested: * LWORK >= N + ( NP00 + MQ00 + NB ) * NB * For definitions of NP00 & MQ00, see LRWORK. * * For optimal performance, greater workspace is needed, i.e. * LWORK >= MAX( LWORK, NHETRD_LWORK ) * Where LWORK is as defined above, and * NHETRD_LWORK = N + 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 1 ) * NPS * * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) * SQNPC = SQRT( REAL( NPROW * NPCOL ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the * optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * NOTE THAT FOR OPTIMAL PERFORMANCE, LWOPT IS RETURNED * (THE OPTIMUM WORKSPACE) RATHER THAN THE MINIMUM NECESSARY * WORKSPACE LWMIN WHEN A WORKSPACE QUERY IS ISSUED. * FOR VERY SMALL MATRICES, LWOPT >> LWMIN. * * RWORK (local workspace/output) REAL array, * dimension (LRWORK) * On return, RWORK(1) contains the optimal amount of * workspace required for efficient execution. * if JOBZ='N' RWORK(1) = optimal amount of workspace * required to compute the eigenvalues. * if JOBZ='V' RWORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors. * * LRWORK (local input) INTEGER * Size of RWORK, must be at least 3. * See below for definitions of variables used to define LRWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LRWORK >= 2 + 5 * N + MAX( 12 * N, NB * ( NP00 + 1 ) ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required is: * LRWORK >= 2 + 5 * N + MAX( 18*N, NP00 * MQ00 + 2 * NB * NB ) + * (2 + ICEIL( NEIG, NPROW*NPCOL))*N * * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP00 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ00 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance for all work arrays. Each of * these values is returned in the first entry of the * corresponding work arrays, and no error message is issued by * PXERBLA. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * * Let NNP = MAX( N, NPROW*NPCOL + 1, 4 ). Then: * LIWORK >= 12*NNP + 2*N when the eigenvectors are desired * LIWORK >= 10*NNP + 2*N when only the eigenvalues have to be computed * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * PCHEEVR assumes IEEE 754 standard compliant arithmetic. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and Z(IZ:IZ+M-1,JZ:JZ+N-1) * must satisfy the following alignment properties: * * 1.Identical (quadratic) dimension: * DESCA(M_) = DESCZ(M_) = DESCA(N_) = DESCZ(N_) * 2.Quadratic conformal blocking: * DESCA(MB_) = DESCA(NB_) = DESCZ(MB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * 3.MOD( IA-1, MB_A ) = MOD( IZ-1, MB_Z ) = 0 * 4.IAROW = IZROW * * * .. Parameters .. INTEGER CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_ PARAMETER ( CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8 ) REAL ZERO PARAMETER ( ZERO = 0.0E0 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, COLBRT, DOBCST, FINISH, FIRST, INDEIG, $ LOWER, LQUERY, VALEIG, VSTART, WANTZ INTEGER ANB, DOL, DOU, DSTCOL, DSTROW, EIGCNT, FRSTCL, $ I, IAROW, ICTXT, IIL, IINDERR, IINDWLC, IINFO, $ IIU, IM, INDD, INDD2, INDE, INDE2, INDERR, $ INDILU, INDRTAU, INDRW, INDRWORK, INDTAU, $ INDWLC, INDWORK, IPIL, IPIU, IPROC, IZROW, $ LASTCL, LENGTHI, LENGTHI2, LIWMIN, LLRWORK, $ LLWORK, LRWMIN, LRWOPT, LWMIN, LWOPT, MAXCLS, $ MQ00, MYCOL, MYIL, MYIU, MYPROC, MYROW, MZ, NB, $ NDEPTH, NEEDIL, NEEDIU, NHETRD_LWOPT, NNP, $ NP00, NPCOL, NPROCS, NPROW, NPS, NSPLIT, $ OFFSET, PARITY, RLENGTHI, RLENGTHI2, RSTARTI, $ SIZE1, SIZE2, SQNPC, SRCCOL, SRCROW, STARTI, $ ZOFFSET REAL PIVMIN, SAFMIN, SCALE, VLL, VUU, WL, $ WU * * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PJLAENV, $ PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGEBR2D, IGEBS2D, $ IGERV2D, IGESD2D, IGSUM2D, PCELGET, PCHENTRD, $ PCHK1MAT, PCHK2MAT, PCLAEVSWP, PCUNMTR, $ PSLARED1D, PXERBLA, SCOPY, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D, SLARRC, SLASRT2, $ SSTEGR2A, SSTEGR2B, SSTEGR2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, ICHAR, INT, MAX, MIN, MOD, REAL, $ SQRT * .. * .. Executable Statements .. * INFO = 0 *********************************************************************** * * Decode character arguments to find out what the code should do * *********************************************************************** WANTZ = LSAME( JOBZ, 'V' ) LOWER = LSAME( UPLO, 'L' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) *********************************************************************** * * GET MACHINE PARAMETERS * *********************************************************************** ICTXT = DESCA( CTXT_ ) SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) *********************************************************************** * * Set up pointers into the (complex) WORK array * *********************************************************************** INDTAU = 1 INDWORK = INDTAU + N LLWORK = LWORK - INDWORK + 1 *********************************************************************** * * Set up pointers into the RWORK array * *********************************************************************** INDRTAU = 1 INDD = INDRTAU + N INDE = INDD + N + 1 INDD2 = INDE + N + 1 INDE2 = INDD2 + N INDRWORK = INDE2 + N LLRWORK = LRWORK - INDRWORK + 1 *********************************************************************** * * BLACS PROCESSOR GRID SETUP * *********************************************************************** CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW * NPCOL MYPROC = MYROW * NPCOL + MYCOL IF( NPROW.EQ.-1 ) THEN INFO = -( 800+CTXT_ ) ELSE IF( WANTZ ) THEN IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF *********************************************************************** * * COMPUTE REAL WORKSPACE * *********************************************************************** IF ( ALLEIG ) THEN MZ = N ELSE IF ( INDEIG ) THEN MZ = IU - IL + 1 ELSE * Take upper bound for VALEIG case MZ = N END IF * NB = DESCA( NB_ ) NP00 = NUMROC( N, NB, 0, 0, NPROW ) MQ00 = NUMROC( MZ, NB, 0, 0, NPCOL ) IF ( WANTZ ) THEN INDRW = INDRWORK + MAX(18*N, NP00*MQ00 + 2*NB*NB) LRWMIN = INDRW - 1 + (ICEIL(MZ, NPROCS) + 2)*N LWMIN = N + MAX((NP00 + MQ00 + NB) * NB, 3 * NB) ELSE INDRW = INDRWORK + 12*N LRWMIN = INDRW - 1 LWMIN = N + MAX( NB*( NP00 + 1 ), 3 * NB ) END IF * The code that validates the input requires 3 workspace entries LRWMIN = MAX(3, LRWMIN) LRWOPT = LRWMIN LWMIN = MAX(3, LWMIN) LWOPT = LWMIN * ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( REAL( NPROCS ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS LWOPT = MAX( LWOPT, N+NHETRD_LWOPT ) * SIZE1 = INDRW - INDRWORK *********************************************************************** * * COMPUTE INTEGER WORKSPACE * *********************************************************************** NNP = MAX( N, NPROCS+1, 4 ) IF ( WANTZ ) THEN LIWMIN = 12*NNP + 2*N ELSE LIWMIN = 10*NNP + 2*N END IF *********************************************************************** * * Set up pointers into the IWORK array * *********************************************************************** * Pointer to eigenpair distribution over processors INDILU = LIWMIN - 2*NPROCS + 1 SIZE2 = INDILU - 2*N *********************************************************************** * * Test the input arguments. * *********************************************************************** IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 21, INFO ) * IF( INFO.EQ.0 ) THEN IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -3 ELSE IF( MOD( IA-1, DESCA( MB_ ) ).NE.0 ) THEN INFO = -6 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -10 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -11 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N )) $ THEN INFO = -12 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -21 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -25 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 800+NB_ ) END IF IF( WANTZ ) THEN IAROW = INDXG2P( 1, DESCA( NB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) IZROW = INDXG2P( 1, DESCA( NB_ ), MYROW, $ DESCZ( RSRC_ ), NPROW ) IF( IAROW.NE.IZROW ) THEN INFO = -19 ELSE IF( MOD( IA-1, DESCA( MB_ ) ).NE. $ MOD( IZ-1, DESCZ( MB_ ) ) ) THEN INFO = -19 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2100+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2100+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2100+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2100+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2100+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2100+CSRC_ ) ELSE IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( ALLEIG ) THEN IDUM1( 3 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 3 ) = ICHAR( 'I' ) ELSE IDUM1( 3 ) = ICHAR( 'V' ) END IF IDUM2( 3 ) = 3 IF( LQUERY ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 4 IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 8, N, 4, N, 4,IZ, $ JZ, DESCZ, 21, 4, IDUM1, IDUM2, INFO ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 4, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = CMPLX( LWOPT ) RWORK( 1 ) = REAL( LRWOPT ) IWORK( 1 ) = LIWMIN END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHEEVR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF *********************************************************************** * * Quick return if possible * *********************************************************************** IF( N.EQ.0 ) THEN IF( WANTZ ) THEN NZ = 0 END IF M = 0 WORK( 1 ) = CMPLX( LWOPT ) RWORK( 1 ) = REAL( LRWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF IF( VALEIG ) THEN VLL = VL VUU = VU ELSE VLL = ZERO VUU = ZERO END IF * * No scaling done here, leave this to MRRR kernel. * Scale tridiagonal rather than full matrix. * *********************************************************************** * * REDUCE MATRIX TO REAL SYMMETRIC TRIDIAGONAL FORM. * *********************************************************************** CALL PCHENTRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, RWORK( INDRWORK ), LLRWORK,IINFO ) IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'PCHENTRD', -IINFO ) RETURN END IF *********************************************************************** * * DISTRIBUTE TRIDIAGONAL TO ALL PROCESSORS * *********************************************************************** OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. $ DESCA( RSRC_ ).EQ.0 .AND. DESCA( CSRC_ ).EQ.0 ) $ THEN CALL PSLARED1D( N, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDD2 ), RWORK( INDRWORK ), LLRWORK ) * CALL PSLARED1D( N, IA, JA, DESCA, RWORK( INDE ), $ RWORK( INDE2 ), RWORK( INDRWORK ), LLRWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PCELGET( 'A', ' ', WORK( INDWORK ), A, $ I+IA-1, I+JA-1, DESCA ) RWORK( INDD2+I-1 ) = REAL( WORK( INDWORK ) ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PCELGET( 'A', ' ', WORK( INDWORK ), A, $ I+IA-1, I+JA, DESCA ) RWORK( INDE2+I-1 ) = REAL( WORK( INDWORK ) ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PCELGET( 'A', ' ', WORK( INDWORK ), A, $ I+IA, I+JA-1, DESCA ) RWORK( INDE2+I-1 ) = REAL( WORK( INDWORK ) ) 30 CONTINUE END IF END IF *********************************************************************** * * SET IIL, IIU * *********************************************************************** IF ( ALLEIG ) THEN IIL = 1 IIU = N ELSE IF ( INDEIG ) THEN IIL = IL IIU = IU ELSE IF ( VALEIG ) THEN CALL SLARRC('T', N, VLL, VUU, RWORK( INDD2 ), $ RWORK( INDE2 + OFFSET ), SAFMIN, EIGCNT, IIL, IIU, INFO) * Refine upper bound N that was taken MZ = EIGCNT IIL = IIL + 1 ENDIF IF(MZ.EQ.0) THEN M = 0 IF( WANTZ ) THEN NZ = 0 END IF WORK( 1 ) = REAL( LWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF MYIL = 0 MYIU = 0 M = 0 IM = 0 *********************************************************************** * * COMPUTE WORK ASSIGNMENTS * *********************************************************************** * * Each processor computes the work assignments for all processors * CALL PMPIM2( IIL, IIU, NPROCS, $ IWORK(INDILU), IWORK(INDILU+NPROCS) ) * * Find local work assignment * MYIL = IWORK(INDILU+MYPROC) MYIU = IWORK(INDILU+NPROCS+MYPROC) ZOFFSET = MAX(0, MYIL - IIL - 1) FIRST = ( MYIL .EQ. IIL ) *********************************************************************** * * CALLS TO MRRR KERNEL * *********************************************************************** IF(.NOT.WANTZ) THEN * * Compute eigenvalues only. * IINFO = 0 IF ( MYIL.GT.0 ) THEN DOL = 1 DOU = MYIU - MYIL + 1 CALL SSTEGR2( JOBZ, 'I', N, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), VLL, VUU, MYIL, MYIU, $ IM, W( 1 ), RWORK( INDRW ), N, $ MYIU - MYIL + 1, $ IWORK( 1 ), RWORK( INDRWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, $ DOL, DOU, ZOFFSET, IINFO ) * SSTEGR2 zeroes out the entire W array, so we can't just give * it the part of W we need. So here we copy the W entries into * their correct location DO 49 I = 1, IM W( MYIL-IIL+I ) = W( I ) 49 CONTINUE * W( MYIL ) is at W( MYIL - IIL + 1 ) * W( X ) is at W(X - IIL + 1 ) END IF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'SSTEGR2', -IINFO ) RETURN END IF ELSEIF ( WANTZ .AND. NPROCS.EQ.1 ) THEN * * Compute eigenvalues and -vectors, but only on one processor * IINFO = 0 IF ( MYIL.GT.0 ) THEN DOL = MYIL - IIL + 1 DOU = MYIU - IIL + 1 CALL SSTEGR2( JOBZ, 'I', N, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), VLL, VUU, IIL, IIU, $ IM, W( 1 ), RWORK( INDRW ), N, $ N, $ IWORK( 1 ), RWORK( INDRWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, DOU, $ ZOFFSET, IINFO ) ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'SSTEGR2', -IINFO ) RETURN END IF ELSEIF ( WANTZ ) THEN * Compute representations in parallel. * Share eigenvalue computation for root between all processors * Then compute the eigenvectors. IINFO = 0 * Part 1. compute root representations and root eigenvalues IF ( MYIL.GT.0 ) THEN DOL = MYIL - IIL + 1 DOU = MYIU - IIL + 1 CALL SSTEGR2A( JOBZ, 'I', N, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), VLL, VUU, IIL, IIU, $ IM, W( 1 ), RWORK( INDRW ), N, $ N, RWORK( INDRWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, $ DOU, NEEDIL, NEEDIU, $ INDERR, NSPLIT, PIVMIN, SCALE, WL, WU, $ IINFO ) ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'SSTEGR2A', -IINFO ) RETURN END IF * * The second part of parallel MRRR, the representation tree * construction begins. Upon successful completion, the * eigenvectors have been computed. This is indicated by * the flag FINISH. * VSTART = .TRUE. FINISH = (MYIL.LE.0) C Part 2. Share eigenvalues and uncertainties between all processors IINDERR = INDRWORK + INDERR - 1 * * * There are currently two ways to communicate eigenvalue information * using the BLACS. * 1.) BROADCAST * 2.) POINT2POINT between collaborators (those processors working * jointly on a cluster. * For efficiency, BROADCAST has been disabled. * At a later stage, other more efficient communication algorithms * might be implemented, e. g. group or tree-based communication. DOBCST = .FALSE. IF(DOBCST) THEN * First gather everything on the first processor. * Then use BROADCAST-based communication DO 45 I = 2, NPROCS IF (MYPROC .EQ. (I - 1)) THEN DSTROW = 0 DSTCOL = 0 STARTI = DOL IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF (( STARTI.GE.1 ) .AND. ( LENGTHI.GE.1 )) THEN LENGTHI2 = 2*LENGTHI * Copy eigenvalues into communication buffer CALL SCOPY(LENGTHI,W( STARTI ),1, $ RWORK( INDD ), 1) * Copy uncertainties into communication buffer CALL SCOPY(LENGTHI,RWORK(IINDERR+STARTI-1),1, $ RWORK( INDD+LENGTHI ), 1) * send buffer CALL SGESD2D( ICTXT, LENGTHI2, $ 1, RWORK( INDD ), LENGTHI2, $ DSTROW, DSTCOL ) END IF ELSE IF (MYPROC .EQ. 0) THEN SRCROW = (I-1) / NPCOL SRCCOL = MOD(I-1, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) STARTI = IWORK(1) LENGTHI = IWORK(2) IF (( STARTI.GE.1 ) .AND. ( LENGTHI.GE.1 )) THEN LENGTHI2 = 2*LENGTHI * receive buffer CALL SGERV2D( ICTXT, LENGTHI2, 1, $ RWORK(INDD), LENGTHI2, SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL SCOPY( LENGTHI, RWORK(INDD), 1, $ W( STARTI ), 1) * copy uncertainties (errors) from communication buffer CALL SCOPY(LENGTHI,RWORK(INDD+LENGTHI),1, $ RWORK( IINDERR+STARTI-1 ), 1) END IF END IF 45 CONTINUE LENGTHI = IIU - IIL + 1 LENGTHI2 = LENGTHI * 2 IF (MYPROC .EQ. 0) THEN * Broadcast eigenvalues and errors to all processors CALL SCOPY(LENGTHI,W ,1, RWORK( INDD ), 1) CALL SCOPY(LENGTHI,RWORK( IINDERR ),1, $ RWORK( INDD+LENGTHI ), 1) CALL SGEBS2D( ICTXT, 'A', ' ', LENGTHI2, 1, $ RWORK(INDD), LENGTHI2 ) ELSE SRCROW = 0 SRCCOL = 0 CALL SGEBR2D( ICTXT, 'A', ' ', LENGTHI2, 1, $ RWORK(INDD), LENGTHI2, SRCROW, SRCCOL ) CALL SCOPY( LENGTHI, RWORK(INDD), 1, W, 1) CALL SCOPY(LENGTHI,RWORK(INDD+LENGTHI),1, $ RWORK( IINDERR ), 1) END IF ELSE * Enable point2point communication between collaborators * Find collaborators of MYPROC IF( (NPROCS.GT.1).AND.(MYIL.GT.0) ) THEN CALL PMPCOL( MYPROC, NPROCS, IIL, NEEDIL, NEEDIU, $ IWORK(INDILU), IWORK(INDILU+NPROCS), $ COLBRT, FRSTCL, LASTCL ) ELSE COLBRT = .FALSE. ENDIF IF(COLBRT) THEN * If the processor collaborates with others, * communicate information. DO 47 IPROC = FRSTCL, LASTCL IF (MYPROC .EQ. IPROC) THEN STARTI = DOL IWORK(1) = STARTI LENGTHI = MYIU - MYIL + 1 IWORK(2) = LENGTHI IF ((STARTI.GE.1) .AND. (LENGTHI.GE.1)) THEN * Copy eigenvalues into communication buffer CALL SCOPY(LENGTHI,W( STARTI ),1, $ RWORK(INDD), 1) * Copy uncertainties into communication buffer CALL SCOPY(LENGTHI, $ RWORK( IINDERR+STARTI-1 ),1, $ RWORK(INDD+LENGTHI), 1) ENDIF DO 46 I = FRSTCL, LASTCL IF(I.EQ.MYPROC) GOTO 46 DSTROW = I/ NPCOL DSTCOL = MOD(I, NPCOL) CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1) .AND. (LENGTHI.GE.1)) THEN LENGTHI2 = 2*LENGTHI * send buffer CALL SGESD2D( ICTXT, LENGTHI2, $ 1, RWORK(INDD), LENGTHI2, $ DSTROW, DSTCOL ) END IF 46 CONTINUE ELSE SRCROW = IPROC / NPCOL SRCCOL = MOD(IPROC, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) RSTARTI = IWORK(1) RLENGTHI = IWORK(2) IF ((RSTARTI.GE.1 ) .AND. (RLENGTHI.GE.1 )) THEN RLENGTHI2 = 2*RLENGTHI CALL SGERV2D( ICTXT, RLENGTHI2, 1, $ RWORK(INDE), RLENGTHI2, $ SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL SCOPY( RLENGTHI,RWORK(INDE), 1, $ W( RSTARTI ), 1) * copy uncertainties (errors) from communication buffer CALL SCOPY(RLENGTHI,RWORK(INDE+RLENGTHI),1, $ RWORK( IINDERR+RSTARTI-1 ), 1) END IF END IF 47 CONTINUE ENDIF ENDIF * Part 3. Compute representation tree and eigenvectors. * What follows is a loop in which the tree * is constructed in parallel from top to bottom, * on level at a time, until all eigenvectors * have been computed. * 100 CONTINUE IF ( MYIL.GT.0 ) THEN CALL SSTEGR2B( JOBZ, N, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), $ IM, W( 1 ), RWORK( INDRW ), N, N, $ IWORK( 1 ), RWORK( INDRWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, $ DOU, NEEDIL, NEEDIU, INDWLC, $ PIVMIN, SCALE, WL, WU, $ VSTART, FINISH, $ MAXCLS, NDEPTH, PARITY, ZOFFSET, IINFO ) IINDWLC = INDRWORK + INDWLC - 1 IF(.NOT.FINISH) THEN IF((NEEDIL.LT.DOL).OR.(NEEDIU.GT.DOU)) THEN CALL PMPCOL( MYPROC, NPROCS, IIL, NEEDIL, NEEDIU, $ IWORK(INDILU), IWORK(INDILU+NPROCS), $ COLBRT, FRSTCL, LASTCL ) ELSE COLBRT = .FALSE. FRSTCL = MYPROC LASTCL = MYPROC ENDIF * * Check if this processor collaborates, i.e. * communication is needed. * IF(COLBRT) THEN DO 147 IPROC = FRSTCL, LASTCL IF (MYPROC .EQ. IPROC) THEN STARTI = DOL IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN * Copy eigenvalues into communication buffer CALL SCOPY(LENGTHI, $ RWORK( IINDWLC+STARTI-1 ),1, $ RWORK(INDD), 1) * Copy uncertainties into communication buffer CALL SCOPY(LENGTHI, $ RWORK( IINDERR+STARTI-1 ),1, $ RWORK(INDD+LENGTHI), 1) ENDIF DO 146 I = FRSTCL, LASTCL IF(I.EQ.MYPROC) GOTO 146 DSTROW = I/ NPCOL DSTCOL = MOD(I, NPCOL) CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN LENGTHI2 = 2*LENGTHI * send buffer CALL SGESD2D( ICTXT, LENGTHI2, $ 1, RWORK(INDD), LENGTHI2, $ DSTROW, DSTCOL ) END IF 146 CONTINUE ELSE SRCROW = IPROC / NPCOL SRCCOL = MOD(IPROC, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) RSTARTI = IWORK(1) RLENGTHI = IWORK(2) IF ((RSTARTI.GE.1).AND.(RLENGTHI.GE.1)) THEN RLENGTHI2 = 2*RLENGTHI CALL SGERV2D( ICTXT,RLENGTHI2, 1, $ RWORK(INDE),RLENGTHI2, $ SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL SCOPY(RLENGTHI,RWORK(INDE), 1, $ RWORK( IINDWLC+RSTARTI-1 ), 1) * copy uncertainties (errors) from communication buffer CALL SCOPY(RLENGTHI,RWORK(INDE+RLENGTHI), $ 1,RWORK( IINDERR+RSTARTI-1 ), 1) END IF END IF 147 CONTINUE ENDIF GOTO 100 ENDIF ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'SSTEGR2B', -IINFO ) RETURN END IF * ENDIF * *********************************************************************** * * MAIN PART ENDS HERE * *********************************************************************** * *********************************************************************** * * ALLGATHER: EACH PROCESSOR SENDS ITS EIGENVALUES TO THE FIRST ONE, * THEN THE FIRST PROCESSOR BROADCASTS ALL EIGENVALUES * *********************************************************************** DO 50 I = 2, NPROCS IF (MYPROC .EQ. (I - 1)) THEN DSTROW = 0 DSTCOL = 0 STARTI = MYIL - IIL + 1 IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN CALL SGESD2D( ICTXT, LENGTHI, $ 1, W( STARTI ), LENGTHI, $ DSTROW, DSTCOL ) ENDIF ELSE IF (MYPROC .EQ. 0) THEN SRCROW = (I-1) / NPCOL SRCCOL = MOD(I-1, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) STARTI = IWORK(1) LENGTHI = IWORK(2) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN CALL SGERV2D( ICTXT, LENGTHI, 1, $ W( STARTI ), LENGTHI, SRCROW, SRCCOL ) ENDIF ENDIF 50 CONTINUE * Accumulate M from all processors M = IM CALL IGSUM2D( ICTXT, 'A', ' ', 1, 1, M, 1, -1, -1 ) * Broadcast eigenvalues to all processors IF (MYPROC .EQ. 0) THEN * Send eigenvalues CALL SGEBS2D( ICTXT, 'A', ' ', M, 1, W, M ) ELSE SRCROW = 0 SRCCOL = 0 CALL SGEBR2D( ICTXT, 'A', ' ', M, 1, $ W, M, SRCROW, SRCCOL ) END IF * * Sort the eigenvalues and keep permutation in IWORK to * sort the eigenvectors accordingly * DO 160 I = 1, M IWORK( NPROCS+1+I ) = I 160 CONTINUE CALL SLASRT2( 'I', M, W, IWORK( NPROCS+2 ), IINFO ) IF (IINFO.NE.0) THEN CALL PXERBLA( ICTXT, 'SLASRT2', -IINFO ) RETURN END IF *********************************************************************** * * TRANSFORM Z FROM 1D WORKSPACE INTO 2D BLOCKCYCLIC STORAGE * *********************************************************************** IF ( WANTZ ) THEN DO 170 I = 1, M IWORK( M+NPROCS+1+IWORK( NPROCS+1+I ) ) = I 170 CONTINUE * Store NVS in IWORK(1:NPROCS+1) for PCLAEVSWP IWORK( 1 ) = 0 DO 180 I = 1, NPROCS * Find IL and IU for processor i-1 * Has already been computed by PMPIM2 and stored IPIL = IWORK(INDILU+I-1) IPIU = IWORK(INDILU+NPROCS+I-1) IF (IPIL .EQ. 0) THEN IWORK( I + 1 ) = IWORK( I ) ELSE IWORK( I + 1 ) = IWORK( I ) + IPIU - IPIL + 1 ENDIF 180 CONTINUE IF ( FIRST ) THEN CALL PCLAEVSWP(N, RWORK( INDRW ), N, Z, IZ, JZ, $ DESCZ, IWORK( 1 ), IWORK( NPROCS+M+2 ), RWORK( INDRWORK ), $ SIZE1 ) ELSE CALL PCLAEVSWP(N, RWORK( INDRW + N ), N, Z, IZ, JZ, $ DESCZ, IWORK( 1 ), IWORK( NPROCS+M+2 ), RWORK( INDRWORK ), $ SIZE1 ) END IF * NZ = M * *********************************************************************** * * Compute eigenvectors of A from eigenvectors of T * *********************************************************************** IF( NZ.GT.0 ) THEN CALL PCUNMTR( 'L', UPLO, 'N', N, NZ, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) END IF IF (IINFO.NE.0) THEN CALL PXERBLA( ICTXT, 'PCUNMTR', -IINFO ) RETURN END IF * END IF * WORK( 1 ) = CMPLX( LWOPT ) RWORK( 1 ) = REAL( LRWOPT ) IWORK( 1 ) = LIWMIN RETURN * * End of PCHEEVR * END scalapack-2.0.2/SRC/pcheevx.f000644 000766 000024 00000116647 11605326344 016174 0ustar00juliestaff000000 000000 SUBROUTINE PCHEEVX( JOBZ, RANGE, UPLO, N, A, IA, JA, DESCA, VL, $ VU, IL, IU, ABSTOL, M, NZ, W, ORFAC, Z, IZ, $ JZ, DESCZ, WORK, LWORK, RWORK, LRWORK, IWORK, $ LIWORK, IFAIL, ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, INFO, IU, IZ, JA, JZ, LIWORK, LRWORK, $ LWORK, M, N, NZ REAL ABSTOL, ORFAC, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), ICLUSTR( * ), $ IFAIL( * ), IWORK( * ) REAL GAP( * ), RWORK( * ), W( * ) COMPLEX A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PCHEEVX computes selected eigenvalues and, optionally, eigenvectors * of a complex hermitian matrix A by calling the recommended sequence * of ScaLAPACK routines. Eigenvalues/vectors can be selected by * specifying a range of values or a range of indices for the desired * eigenvalues. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * PCHEEVX assumes IEEE 754 standard compliant arithmetic. To port * to a system which does not have IEEE 754 arithmetic, modify * the appropriate SLmake.inc file to include the compiler switch * -DNO_IEEE. This switch only affects the compilation of pslaiect.c. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic COMPLEX array, * global dimension (N, N), * local dimension ( LLD_A, LOCc(JA+N-1) ) * * On entry, the Hermitian matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the Hermitian matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * Hermitian matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PCHEEVX cannot guarantee * correct error reporting. * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * ABSTOL (global input) REAL * If JOBZ='V', setting ABSTOL to PSLAMCH( CONTEXT, 'U') yields * the most orthogonal eigenvectors. * * The absolute error tolerance for the eigenvalues. * An approximate eigenvalue is accepted as converged * when it is determined to lie in an interval [a,b] * of width less than or equal to * * ABSTOL + EPS * max( |a|,|b| ) , * * where EPS is the machine precision. If ABSTOL is less than * or equal to zero, then EPS*norm(T) will be used in its place, * where norm(T) is the 1-norm of the tridiagonal matrix * obtained by reducing A to tridiagonal form. * * Eigenvalues will be computed most accurately when ABSTOL is * set to twice the underflow threshold 2*PSLAMCH('S') not zero. * If this routine returns with ((MOD(INFO,2).NE.0) .OR. * (MOD(INFO/8,2).NE.0)), indicating that some eigenvalues or * eigenvectors did not converge, try setting ABSTOL to * 2*PSLAMCH('S'). * * See "Computing Small Singular Values of Bidiagonal Matrices * with Guaranteed High Relative Accuracy," by Demmel and * Kahan, LAPACK Working Note #3. * * See "On the correctness of Parallel Bisection in Floating * Point" by Demmel, Dhillon and Ren, LAPACK Working Note #70 * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M unless the user supplies * insufficient space and PCHEEVX is not able to detect this * before beginning computation. To get all the eigenvectors * requested, the user must supply both sufficient * space to hold the eigenvectors in Z (M .LE. DESCZ(N_)) * and sufficient workspace to compute them. (See LWORK below.) * PCHEEVX is always able to detect insufficient space without * computation unless RANGE .EQ. 'V'. * * W (global output) REAL array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * ORFAC (global input) REAL * Specifies which eigenvectors should be reorthogonalized. * Eigenvectors that correspond to eigenvalues which are within * tol=ORFAC*norm(A) of each other are to be reorthogonalized. * However, if the workspace is insufficient (see LWORK), * tol may be decreased until all eigenvectors to be * reorthogonalized can be stored in one process. * No reorthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) COMPLEX array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. If an eigenvector * fails to converge, then that column of Z contains the latest * approximation to the eigenvector, and the index of the * eigenvector is returned in IFAIL. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX array, * dimension (LWORK) * WORK(1) returns workspace adequate workspace to allow * optimal performance. * * LWORK (local input) INTEGER * Size of WORK array. If only eigenvalues are requested: * LWORK >= N + MAX( NB * ( NP0 + 1 ), 3 ) * If eigenvectors are requested: * LWORK >= N + ( NP0 + MQ0 + NB ) * NB * with NQ0 = NUMROC( NN, NB, 0, 0, NPCOL ). * * For optimal performance, greater workspace is needed, i.e. * LWORK >= MAX( LWORK, NHETRD_LWORK ) * Where LWORK is as defined above, and * NHETRD_LWORK = N + 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 1 ) * NPS * * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) * SQNPC = SQRT( DBLE( NPROW * NPCOL ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the * optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/output) REAL array, * dimension max(3,LRWORK) * On return, WORK(1) contains the optimal amount of * workspace required for efficient execution. * if JOBZ='N' RWORK(1) = optimal amount of workspace * required to compute eigenvalues efficiently * if JOBZ='V' RWORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors * efficiently with no guarantee on orthogonality. * If RANGE='V', it is assumed that all eigenvectors * may be required. * * LRWORK (local input) INTEGER * Size of RWORK * See below for definitions of variables used to define LRWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LRWORK >= 5 * NN + 4 * N * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * LRWORK >= 4*N + MAX( 5*NN, NP0 * MQ0 ) + * ICEIL( NEIG, NPROW*NPCOL)*NN * * The computed eigenvectors may not be orthogonal if the * minimal workspace is supplied and ORFAC is too small. * If you want to guarantee orthogonality (at the cost * of potentially poor performance) you should add * the following to LRWORK: * (CLUSTERSIZE-1)*N * where CLUSTERSIZE is the number of eigenvalues in the * largest cluster, where a cluster is defined as a set of * close eigenvalues: { W(K),...,W(K+CLUSTERSIZE-1) | * W(J+1) <= W(J) + ORFAC*2*norm(A) } * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * When LRWORK is too small: * If LRWORK is too small to guarantee orthogonality, * PCHEEVX attempts to maintain orthogonality in * the clusters with the smallest * spacing between the eigenvalues. * If LRWORK is too small to compute all the eigenvectors * requested, no computation is performed and INFO=-25 * is returned. Note that when RANGE='V', PCHEEVX does * not know how many eigenvectors are requested until * the eigenvalues are computed. Therefore, when RANGE='V' * and as long as LRWORK is large enough to allow PCHEEVX to * compute the eigenvalues, PCHEEVX will compute the * eigenvalues and as many eigenvectors as it can. * * Relationship between workspace, orthogonality & performance: * If CLUSTERSIZE >= N/SQRT(NPROW*NPCOL), then providing * enough space to compute all the eigenvectors * orthogonally will cause serious degradation in * performance. In the limit (i.e. CLUSTERSIZE = N-1) * PCSTEIN will perform no better than CSTEIN on 1 * processor. * For CLUSTERSIZE = N/SQRT(NPROW*NPCOL) reorthogonalizing * all eigenvectors will increase the total execution time * by a factor of 2 or more. * For CLUSTERSIZE > N/SQRT(NPROW*NPCOL) execution time will * grow as the square of the cluster size, all other factors * remaining equal and assuming enough workspace. Less * workspace means less reorthogonalization but faster * execution. * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance for all work arrays. Each of * these values is returned in the first entry of the * corresponding work arrays, and no error message is issued by * PXERBLA. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * LIWORK >= 6 * NNP * Where: * NNP = MAX( N, NPROW*NPCOL + 1, 4 ) * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If (MOD(INFO,2).NE.0) on exit, then * IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * * ICLUSTR (global output) integer array, dimension (2*NPROW*NPCOL) * This array contains indices of eigenvectors corresponding to * a cluster of eigenvalues that could not be reorthogonalized * due to insufficient workspace (see LWORK, ORFAC and INFO). * Eigenvectors corresponding to clusters of eigenvalues indexed * ICLUSTR(2*I-1) to ICLUSTR(2*I), could not be * reorthogonalized due to lack of workspace. Hence the * eigenvectors corresponding to these clusters may not be * orthogonal. ICLUSTR() is a zero terminated array. * (ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0) if and only if * K is the number of clusters * ICLUSTR is not referenced if JOBZ = 'N' * * GAP (global output) REAL array, * dimension (NPROW*NPCOL) * This array contains the gap between eigenvalues whose * eigenvectors could not be reorthogonalized. The output * values in this array correspond to the clusters indicated * by the array ICLUSTR. As a result, the dot product between * eigenvectors correspoding to the I^th cluster may be as high * as ( C * n ) / GAP(I) where C is a small constant. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: if (MOD(INFO,2).NE.0), then one or more eigenvectors * failed to converge. Their indices are stored * in IFAIL. Ensure ABSTOL=2.0*PSLAMCH( 'U' ) * Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding * to one or more clusters of eigenvalues could not be * reorthogonalized because of insufficient workspace. * The indices of the clusters are stored in the array * ICLUSTR. * if (MOD(INFO/4,2).NE.0), then space limit prevented * PCHEEVX from computing all of the eigenvectors * between VL and VU. The number of eigenvectors * computed is returned in NZ. * if (MOD(INFO/8,2).NE.0), then PCSTEBZ failed to compute * eigenvalues. Ensure ABSTOL=2.0*PSLAMCH( 'U' ) * Send e-mail to scalapack@cs.utk.edu * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * ( MB_A.EQ.NB_A.EQ.MB_Z .AND. IROFFA.EQ.IROFFZ .AND. IROFFA.EQ.0 .AND. * IAROW.EQ.IZROW ) * where * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * Differences between PCHEEVX and CHEEVX * ====================================== * * A, LDA -> A, IA, JA, DESCA * Z, LDZ -> Z, IZ, JZ, DESCZ * WORKSPACE needs are larger for PCHEEVX. * LIWORK parameter added * * ORFAC, ICLUSTER() and GAP() parameters added * meaning of INFO is changed * * Functional differences: * PCHEEVX does not promise orthogonality for eigenvectors associated * with tighly clustered eigenvalues. * PCHEEVX does not reorthogonalize eigenvectors * that are on different processes. The extent of reorthogonalization * is controlled by the input parameter LWORK. * * Version 1.4 limitations: * DESCA(MB_) = DESCA(NB_) * DESCA(M_) = DESCZ(M_) * DESCA(N_) = DESCZ(N_) * DESCA(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE, TEN, FIVE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TEN = 10.0E+0, $ FIVE = 5.0E+0 ) INTEGER IERREIN, IERRCLS, IERRSPC, IERREBZ PARAMETER ( IERREIN = 1, IERRCLS = 2, IERRSPC = 4, $ IERREBZ = 8 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, QUICKRETURN, $ VALEIG, WANTZ CHARACTER ORDER INTEGER ANB, CSRC_A, I, IAROW, ICOFFA, ICTXT, IINFO, $ INDD, INDD2, INDE, INDE2, INDIBL, INDISP, $ INDRWORK, INDTAU, INDWORK, IROFFA, IROFFZ, $ ISCALE, ISIZESTEBZ, ISIZESTEIN, IZROW, $ LALLWORK, LIWMIN, LLRWORK, LLWORK, LRWMIN, $ LRWOPT, LWMIN, LWOPT, MAXEIGS, MB_A, MQ0, $ MYCOL, MYROW, NB, NB_A, NEIG, NHETRD_LWOPT, NN, $ NNP, NP0, NPCOL, NPROCS, NPROW, NPS, NQ0, $ NSPLIT, NZZ, OFFSET, RSRC_A, RSRC_Z, SIZEHEEVX, $ SIZESTEIN, SQNPC REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, $ SIGMA, SMLNUM, VLL, VUU * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV REAL PCLANHE, PSLAMCH EXTERNAL LSAME, ICEIL, INDXG2P, NUMROC, PJLAENV, $ PCLANHE, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMN2D, PCELGET, $ PCHENTRD, PCHK1MAT, PCHK2MAT, PCLASCL, PCSTEIN, $ PCUNMTR, PSLARED1D, PSSTEBZ, PXERBLA, SGEBR2D, $ SGEBS2D, SLASRT, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, DBLE, ICHAR, INT, MAX, MIN, MOD, $ REAL, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * QUICKRETURN = ( N.EQ.0 ) * * Test the input arguments. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 * WANTZ = LSAME( JOBZ, 'V' ) IF( NPROW.EQ.-1 ) THEN INFO = -( 800+CTXT_ ) ELSE IF( WANTZ ) THEN IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 21, INFO ) * IF( INFO.EQ.0 ) THEN * * Get machine constants. * SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) EPS = PSLAMCH( ICTXT, 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * NPROCS = NPROW*NPCOL LOWER = LSAME( UPLO, 'L' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) * * Set up pointers into the WORK array * INDTAU = 1 INDWORK = INDTAU + N LLWORK = LWORK - INDWORK + 1 * * Set up pointers into the RWORK array * INDE = 1 INDD = INDE + N INDD2 = INDD + N INDE2 = INDD2 + N INDRWORK = INDE2 + N LLRWORK = LRWORK - INDRWORK + 1 * * Set up pointers into the IWORK array * ISIZESTEIN = 3*N + NPROCS + 1 ISIZESTEBZ = MAX( 4*N, 14, NPROCS ) INDIBL = ( MAX( ISIZESTEIN, ISIZESTEBZ ) ) + 1 INDISP = INDIBL + N * * Compute the total amount of space needed * LQUERY = .FALSE. IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) $ LQUERY = .TRUE. * NNP = MAX( N, NPROCS+1, 4 ) LIWMIN = 6*NNP * NPROCS = NPROW*NPCOL NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A NN = MAX( N, NB, 2 ) * RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( 1, NB_A, MYROW, RSRC_A, NPROW ) NP0 = NUMROC( N+IROFFA, NB, 0, 0, NPROW ) MQ0 = NUMROC( N+ICOFFA, NB, 0, 0, NPCOL ) IF( WANTZ ) THEN RSRC_Z = DESCZ( RSRC_ ) IROFFZ = MOD( IZ-1, MB_A ) IZROW = INDXG2P( 1, NB_A, MYROW, RSRC_Z, NPROW ) ELSE IROFFZ = 0 IZROW = 0 END IF * IF( ( .NOT.WANTZ ) .OR. ( VALEIG .AND. ( .NOT.LQUERY ) ) ) $ THEN LWMIN = N + MAX( NB*( NP0+1 ), 3 ) LWOPT = LWMIN LRWMIN = 5*NN + 4*N IF( WANTZ ) THEN MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) LRWOPT = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( N, NPROW*NPCOL )*NN ELSE LRWOPT = LRWMIN END IF NEIG = 0 ELSE IF( ALLEIG .OR. VALEIG ) THEN NEIG = N ELSE IF( INDEIG ) THEN NEIG = IU - IL + 1 END IF MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) NQ0 = NUMROC( NN, NB, 0, 0, NPCOL ) LWMIN = N + ( NP0+NQ0+NB )*NB LRWMIN = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN LRWOPT = LRWMIN LWOPT = LWMIN * END IF * * Compute how much workspace is needed to use the * new TRD code * ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS LWOPT = MAX( LWOPT, N+NHETRD_LWOPT ) * END IF IF( INFO.EQ.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN RWORK( 1 ) = ABSTOL IF( VALEIG ) THEN RWORK( 2 ) = VL RWORK( 3 ) = VU ELSE RWORK( 2 ) = ZERO RWORK( 3 ) = ZERO END IF CALL SGEBS2D( ICTXT, 'ALL', ' ', 3, 1, RWORK, 3 ) ELSE CALL SGEBR2D( ICTXT, 'ALL', ' ', 3, 1, RWORK, 3, 0, 0 ) END IF IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -3 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -10 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -11 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -12 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -23 ELSE IF( LRWORK.LT.LRWMIN .AND. LRWORK.NE.-1 ) THEN INFO = -25 ELSE IF( LIWORK.LT.LIWMIN .AND. LIWORK.NE.-1 ) THEN INFO = -27 ELSE IF( VALEIG .AND. ( ABS( RWORK( 2 )-VL ).GT.FIVE*EPS* $ ABS( VL ) ) ) THEN INFO = -9 ELSE IF( VALEIG .AND. ( ABS( RWORK( 3 )-VU ).GT.FIVE*EPS* $ ABS( VU ) ) ) THEN INFO = -10 ELSE IF( ABS( RWORK( 1 )-ABSTOL ).GT.FIVE*EPS* $ ABS( ABSTOL ) ) THEN INFO = -13 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 800+NB_ ) END IF IF( WANTZ ) THEN IF( IROFFA.NE.IROFFZ ) THEN INFO = -19 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -19 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2100+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2100+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2100+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2100+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2100+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2100+CSRC_ ) ELSE IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF END IF IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( ALLEIG ) THEN IDUM1( 3 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 3 ) = ICHAR( 'I' ) ELSE IDUM1( 3 ) = ICHAR( 'V' ) END IF IDUM2( 3 ) = 3 IF( LQUERY ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 4 IF( WANTZ ) THEN CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 8, N, 4, N, 4, IZ, $ JZ, DESCZ, 21, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 4, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = CMPLX( LWOPT ) RWORK( 1 ) = REAL( LRWOPT ) IWORK( 1 ) = LIWMIN END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHEEVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( QUICKRETURN ) THEN IF( WANTZ ) THEN NZ = 0 ICLUSTR( 1 ) = 0 END IF M = 0 WORK( 1 ) = CMPLX( LWOPT ) RWORK( 1 ) = REAL( LRWMIN ) IWORK( 1 ) = LIWMIN RETURN END IF * * Scale matrix to allowable range, if necessary. * ABSTLL = ABSTOL ISCALE = 0 IF( VALEIG ) THEN VLL = VL VUU = VU ELSE VLL = ZERO VUU = ZERO END IF * ANRM = PCLANHE( 'M', UPLO, N, A, IA, JA, DESCA, $ RWORK( INDRWORK ) ) * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ANRM = ANRM*SIGMA ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM ANRM = ANRM*SIGMA END IF * IF( ISCALE.EQ.1 ) THEN CALL PCLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) IF( ABSTOL.GT.0 ) $ ABSTLL = ABSTOL*SIGMA IF( VALEIG ) THEN VLL = VL*SIGMA VUU = VU*SIGMA IF( VUU.EQ.VLL ) THEN VUU = VUU + 2*MAX( ABS( VUU )*EPS, SAFMIN ) END IF END IF END IF * * Call PCHENTRD to reduce Hermitian matrix to tridiagonal form. * LALLWORK = LLRWORK * CALL PCHENTRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, RWORK( INDRWORK ), LLRWORK, IINFO ) * * * Copy the values of D, E to all processes * * Here PxLARED1D is used to redistribute the tridiagonal matrix. * PxLARED1D, however, doesn't yet work with arbritary matrix * distributions so we have PxELGET as a backup. * OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. RSRC_A.EQ.0 .AND. CSRC_A.EQ.0 ) $ THEN CALL PSLARED1D( N, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDD2 ), RWORK( INDRWORK ), LLRWORK ) * CALL PSLARED1D( N, IA, JA, DESCA, RWORK( INDE ), $ RWORK( INDE2 ), RWORK( INDRWORK ), LLRWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PCELGET( 'A', ' ', WORK( INDD2+I-1 ), A, I+IA-1, $ I+JA-1, DESCA ) RWORK( INDD2+I-1 ) = REAL( WORK( INDD2+I-1 ) ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PCELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA-1, $ I+JA, DESCA ) RWORK( INDE2+I-1 ) = REAL( WORK( INDE2+I-1 ) ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PCELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA, $ I+JA-1, DESCA ) RWORK( INDE2+I-1 ) = REAL( WORK( INDE2+I-1 ) ) 30 CONTINUE END IF END IF * * Call PSSTEBZ and, if eigenvectors are desired, PCSTEIN. * IF( WANTZ ) THEN ORDER = 'B' ELSE ORDER = 'E' END IF * CALL PSSTEBZ( ICTXT, RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, $ RWORK( INDD2 ), RWORK( INDE2+OFFSET ), M, NSPLIT, W, $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWORK ), $ LLRWORK, IWORK( 1 ), ISIZESTEBZ, IINFO ) * * * IF PSSTEBZ fails, the error propogates to INFO, but * we do not propogate the eigenvalue(s) which failed because: * 1) This should never happen if the user specifies * ABSTOL = 2 * PSLAMCH( 'U' ) * 2) PSSTEIN will confirm/deny whether the eigenvalues are * close enough. * IF( IINFO.NE.0 ) THEN INFO = INFO + IERREBZ DO 40 I = 1, M IWORK( INDIBL+I-1 ) = ABS( IWORK( INDIBL+I-1 ) ) 40 CONTINUE END IF IF( WANTZ ) THEN * IF( VALEIG ) THEN * * Compute the maximum number of eigenvalues that we can * compute in the * workspace that we have, and that we can store in Z. * * Loop through the possibilities looking for the largest * NZ that we can feed to PCSTEIN and PCUNMTR * * Since all processes must end up with the same value * of NZ, we first compute the minimum of LALLWORK * CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LALLWORK, 1, 1, 1, -1, $ -1, -1 ) * MAXEIGS = DESCZ( N_ ) * DO 50 NZ = MIN( MAXEIGS, M ), 0, -1 MQ0 = NUMROC( NZ, NB, 0, 0, NPCOL ) SIZESTEIN = ICEIL( NZ, NPROCS )*N + MAX( 5*N, NP0*MQ0 ) SIZEHEEVX = SIZESTEIN IF( SIZEHEEVX.LE.LALLWORK ) $ GO TO 60 50 CONTINUE 60 CONTINUE ELSE NZ = M END IF NZ = MAX( NZ, 0 ) IF( NZ.NE.M ) THEN INFO = INFO + IERRSPC * DO 70 I = 1, M IFAIL( I ) = 0 70 CONTINUE * * The following code handles a rare special case * - NZ .NE. M means that we don't have enough room to store * all the vectors. * - NSPLIT .GT. 1 means that the matrix split * In this case, we cannot simply take the first NZ eigenvalues * because PSSTEBZ sorts the eigenvalues by block when * a split occurs. So, we have to make another call to * PSSTEBZ with a new upper limit - VUU. * IF( NSPLIT.GT.1 ) THEN CALL SLASRT( 'I', M, W, IINFO ) NZZ = 0 IF( NZ.GT.0 ) THEN * VUU = W( NZ ) - TEN*( EPS*ANRM+SAFMIN ) IF( VLL.GE.VUU ) THEN NZZ = 0 ELSE CALL PSSTEBZ( ICTXT, RANGE, ORDER, N, VLL, VUU, IL, $ IU, ABSTLL, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), NZZ, NSPLIT, $ W, IWORK( INDIBL ), IWORK( INDISP ), $ RWORK( INDRWORK ), LLRWORK, $ IWORK( 1 ), ISIZESTEBZ, IINFO ) END IF * IF( MOD( INFO / IERREBZ, 1 ).EQ.0 ) THEN IF( NZZ.GT.NZ .OR. IINFO.NE.0 ) THEN INFO = INFO + IERREBZ END IF END IF END IF NZ = MIN( NZ, NZZ ) * END IF END IF CALL PCSTEIN( N, RWORK( INDD2 ), RWORK( INDE2+OFFSET ), NZ, W, $ IWORK( INDIBL ), IWORK( INDISP ), ORFAC, Z, IZ, $ JZ, DESCZ, RWORK( INDRWORK ), LALLWORK, $ IWORK( 1 ), ISIZESTEIN, IFAIL, ICLUSTR, GAP, $ IINFO ) * IF( IINFO.GE.NZ+1 ) $ INFO = INFO + IERRCLS IF( MOD( IINFO, NZ+1 ).NE.0 ) $ INFO = INFO + IERREIN * * Z = Q * Z * * IF( NZ.GT.0 ) THEN CALL PCUNMTR( 'L', UPLO, 'N', N, NZ, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) END IF * END IF * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL SSCAL( M, ONE / SIGMA, W, 1 ) END IF * WORK( 1 ) = CMPLX( LWOPT ) RWORK( 1 ) = REAL( LRWOPT ) IWORK( 1 ) = LIWMIN * RETURN * * End of PCHEEVX * END scalapack-2.0.2/SRC/pchegs2.f000644 000766 000024 00000037553 10363532303 016055 0ustar00juliestaff000000 000000 * * SUBROUTINE PCHEGS2( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ) * .. * * Purpose * ======= * * PCHEGS2 reduces a complex Hermitian-definite generalized eigenproblem * to standard form. * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**H or L**H*sub( A )*L. * * sub( B ) must have been previously factorized as U**H*U or L*L**H by * PCPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H); * = 2 or 3: compute U*sub( A )*U**H or L**H*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**H*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**H. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PCPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, HALF PARAMETER ( ONE = 1.0E+0, HALF = 0.5E+0 ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IIA, IIB, IOFFA, IOFFB, IROFFA, IROFFB, $ JJA, JJB, K, LDA, LDB, MYCOL, MYROW, NPCOL, $ NPROW REAL AKK, BKK COMPLEX CT * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GRIDINFO, CAXPY, CHER2, $ CHK1MAT, CLACGV, CSSCAL, CTRMV, CTRSV, INFOG2L, $ PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MOD, REAL * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL LSAME, INDXG2P * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( N+ICOFFA.GT.DESCA( NB_ ) ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHEGS2', -INFO ) CALL BLACS_EXIT( ICTXT ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. ( MYROW.NE.IAROW .OR. MYCOL.NE.IACOL ) ) $ RETURN * * Compute local information * LDA = DESCA( LLD_ ) LDB = DESCB( LLD_ ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIB, JJB, $ IBROW, IBCOL ) * IF( IBTYPE.EQ.1 ) THEN * IF( UPPER ) THEN * IOFFA = IIA + JJA*LDA IOFFB = IIB + JJB*LDB * * Compute inv(U')*sub( A )*inv(U) * DO 10 K = 1, N * * Update the upper triangle of * A(ia+k-1:ia+n-a,ia+k-1:ia+n-1) * AKK = REAL( A( IOFFA-LDA ) ) BKK = REAL( B( IOFFB-LDB ) ) AKK = AKK / BKK**2 A( IOFFA-LDA ) = AKK IF( K.LT.N ) THEN CALL CSSCAL( N-K, ONE / BKK, A( IOFFA ), LDA ) CT = -HALF*AKK CALL CLACGV( N-K, A( IOFFA ), LDA ) CALL CLACGV( N-K, B( IOFFB ), LDB ) CALL CAXPY( N-K, CT, B( IOFFB ), LDB, A( IOFFA ), $ LDA ) CALL CHER2( UPLO, N-K, -CONE, A( IOFFA ), LDA, $ B( IOFFB ), LDB, A( IOFFA+1 ), LDA ) CALL CAXPY( N-K, CT, B( IOFFB ), LDB, A( IOFFA ), $ LDA ) CALL CLACGV( N-K, B( IOFFB ), LDB ) CALL CTRSV( UPLO, 'Conjugate transpose', 'Non-unit', $ N-K, B( IOFFB+1 ), LDB, A( IOFFA ), LDA ) CALL CLACGV( N-K, A( IOFFA ), LDA ) END IF * * A( IOFFA ) -> A( K, K+1 ) * B( IOFFB ) -> B( K, K+1 ) * IOFFA = IOFFA + LDA + 1 IOFFB = IOFFB + LDB + 1 * 10 CONTINUE * ELSE * IOFFA = IIA + 1 + ( JJA-1 )*LDA IOFFB = IIB + 1 + ( JJB-1 )*LDB * * Compute inv(L)*sub( A )*inv(L') * DO 20 K = 1, N * * Update the lower triangle of * A(ia+k-1:ia+n-a,ia+k-1:ia+n-1) * AKK = REAL( A( IOFFA-1 ) ) BKK = REAL( B( IOFFB-1 ) ) AKK = AKK / BKK**2 A( IOFFA-1 ) = AKK * IF( K.LT.N ) THEN CALL CSSCAL( N-K, ONE / BKK, A( IOFFA ), 1 ) CT = -HALF*AKK CALL CAXPY( N-K, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL CHER2( UPLO, N-K, -CONE, A( IOFFA ), 1, $ B( IOFFB ), 1, A( IOFFA+LDA ), LDA ) CALL CAXPY( N-K, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL CTRSV( UPLO, 'No transpose', 'Non-unit', N-K, $ B( IOFFB+LDB ), LDB, A( IOFFA ), 1 ) END IF * * A( IOFFA ) -> A( K+1, K ) * B( IOFFB ) -> B( K+1, K ) * IOFFA = IOFFA + LDA + 1 IOFFB = IOFFB + LDB + 1 * 20 CONTINUE * END IF * ELSE * IF( UPPER ) THEN * IOFFA = IIA + ( JJA-1 )*LDA IOFFB = IIB + ( JJB-1 )*LDB * * Compute U*sub( A )*U' * DO 30 K = 1, N * * Update the upper triangle of A(ia:ia+k-1,ja:ja+k-1) * AKK = REAL( A( IOFFA+K-1 ) ) BKK = REAL( B( IOFFB+K-1 ) ) CALL CTRMV( UPLO, 'No transpose', 'Non-unit', K-1, $ B( IIB+( JJB-1 )*LDB ), LDB, A( IOFFA ), 1 ) CT = HALF*AKK CALL CAXPY( K-1, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL CHER2( UPLO, K-1, CONE, A( IOFFA ), 1, B( IOFFB ), $ 1, A( IIA+( JJA-1 )*LDA ), LDA ) CALL CAXPY( K-1, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL CSSCAL( K-1, BKK, A( IOFFA ), 1 ) A( IOFFA+K-1 ) = AKK*BKK**2 * * A( IOFFA ) -> A( 1, K ) * B( IOFFB ) -> B( 1, K ) * IOFFA = IOFFA + LDA IOFFB = IOFFB + LDB * 30 CONTINUE * ELSE * IOFFA = IIA + ( JJA-1 )*LDA IOFFB = IIB + ( JJB-1 )*LDB * * Compute L'*sub( A )*L * DO 40 K = 1, N * * Update the lower triangle of A(ia:ia+k-1,ja:ja+k-1) * AKK = REAL( A( IOFFA+( K-1 )*LDA ) ) BKK = REAL( B( IOFFB+( K-1 )*LDB ) ) CALL CLACGV( K-1, A( IOFFA ), LDA ) CALL CTRMV( UPLO, 'Conjugate transpose', 'Non-unit', K-1, $ B( IIB+( JJB-1 )*LDB ), LDB, A( IOFFA ), $ LDA ) CT = HALF*AKK CALL CLACGV( K-1, B( IOFFB ), LDB ) CALL CAXPY( K-1, CT, B( IOFFB ), LDB, A( IOFFA ), LDA ) CALL CHER2( UPLO, K-1, CONE, A( IOFFA ), LDA, B( IOFFB ), $ LDB, A( IIA+( JJA-1 )*LDA ), LDA ) CALL CAXPY( K-1, CT, B( IOFFB ), LDB, A( IOFFA ), LDA ) CALL CLACGV( K-1, B( IOFFB ), LDB ) CALL CSSCAL( K-1, BKK, A( IOFFA ), LDA ) CALL CLACGV( K-1, A( IOFFA ), LDA ) A( IOFFA+( K-1 )*LDA ) = AKK*BKK**2 * * A( IOFFA ) -> A( K, 1 ) * B( IOFFB ) -> B( K, 1 ) * IOFFA = IOFFA + 1 IOFFB = IOFFB + 1 * 40 CONTINUE * END IF * END IF * RETURN * * End of PCHEGS2 * END scalapack-2.0.2/SRC/pchegst.f000644 000766 000024 00000042317 10363532303 016151 0ustar00juliestaff000000 000000 * * SUBROUTINE PCHEGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, N REAL SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ) * .. * * Purpose * ======= * * PCHEGST reduces a complex Hermitian-definite generalized eigenproblem * to standard form. * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**H or L**H*sub( A )*L. * * sub( B ) must have been previously factorized as U**H*U or L*L**H by * PCPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H); * = 2 or 3: compute U*sub( A )*U**H or L**H*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**H*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**H. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PCPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * SCALE (global output) REAL * Amount by which the eigenvalues should be scaled to * compensate for the scaling performed in this routine. * At present, SCALE is always returned as 1.0, it is * returned here to allow for future enhancement. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) COMPLEX CONE, HALF PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), $ HALF = ( 0.5E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, K, KB, MYCOL, MYROW, NB, $ NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHEGS2, PCHEMM, $ PCHER2K, PCHK2MAT, PCTRMM, PCTRSM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL LSAME, ICEIL, INDXG2P * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * SCALE = ONE * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IB, $ JB, DESCB, 11, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHEGST', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( IBTYPE.EQ.1 ) THEN IF( UPPER ) THEN * * Compute inv(U')*sub( A )*inv(U) * K = 1 NB = DESCA( NB_ ) KB = MIN( ICEIL( JA, NB )*NB, JA+N-1 ) - JA + 1 * 10 CONTINUE * * Update the upper triangle of A(ia+k-1:ia+n-1,ja+k-1:ja+n-1) * CALL PCHEGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, IB+K-1, DESCB, INFO ) IF( K+KB.LE.N ) THEN CALL PCTRSM( 'Left', UPLO, 'Conjugate Transpose', $ 'Non-unit', KB, N-K-KB+1, CONE, B, IB+K-1, $ JB+K-1, DESCB, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PCHEMM( 'Left', UPLO, KB, N-K-KB+1, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K-1, JB+K+KB-1, $ DESCB, CONE, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PCHER2K( UPLO, 'Conjugate Transpose', N-K-KB+1, KB, $ -CONE, A, IA+K-1, JA+K+KB-1, DESCA, B, $ IB+K-1, JB+K+KB-1, DESCB, ONE, A, $ IA+K+KB-1, JA+K+KB-1, DESCA ) CALL PCHEMM( 'Left', UPLO, KB, N-K-KB+1, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K-1, JB+K+KB-1, $ DESCB, CONE, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PCTRSM( 'Right', UPLO, 'No transpose', 'Non-unit', $ KB, N-K-KB+1, CONE, B, IB+K+KB-1, JB+K+KB-1, $ DESCB, A, IA+K-1, JA+K+KB-1, DESCA ) END IF K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 10 * ELSE * * Compute inv(L)*sub( A )*inv(L') * K = 1 NB = DESCA( MB_ ) KB = MIN( ICEIL( IA, NB )*NB, IA+N-1 ) - IA + 1 * 20 CONTINUE * * Update the lower triangle of A(ia+k-1:ia+n-1,ja+k-1:ja+n-1) * CALL PCHEGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) IF( K+KB.LE.N ) THEN CALL PCTRSM( 'Right', UPLO, 'Conjugate transpose', $ 'Non-unit', N-K-KB+1, KB, CONE, B, IB+K-1, $ JB+K-1, DESCB, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PCHEMM( 'Right', UPLO, N-K-KB+1, KB, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K+KB-1, JB+K-1, $ DESCB, CONE, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PCHER2K( UPLO, 'No transpose', N-K-KB+1, KB, -CONE, $ A, IA+K+KB-1, JA+K-1, DESCA, B, IB+K+KB-1, $ JB+K-1, DESCB, ONE, A, IA+K+KB-1, $ JA+K+KB-1, DESCA ) CALL PCHEMM( 'Right', UPLO, N-K-KB+1, KB, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K+KB-1, JB+K-1, $ DESCB, CONE, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PCTRSM( 'Left', UPLO, 'No transpose', 'Non-unit', $ N-K-KB+1, KB, CONE, B, IB+K+KB-1, JB+K+KB-1, $ DESCB, A, IA+K+KB-1, JA+K-1, DESCA ) END IF K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 20 * END IF * ELSE * IF( UPPER ) THEN * * Compute U*sub( A )*U' * K = 1 NB = DESCA( NB_ ) KB = MIN( ICEIL( JA, NB )*NB, JA+N-1 ) - JA + 1 * 30 CONTINUE * * Update the upper triangle of A(ia:ia+k+kb-2,ja:ja+k+kb-2) * CALL PCTRMM( 'Left', UPLO, 'No transpose', 'Non-unit', K-1, $ KB, CONE, B, IB, JB, DESCB, A, IA, JA+K-1, $ DESCA ) CALL PCHEMM( 'Right', UPLO, K-1, KB, HALF, A, IA+K-1, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, CONE, A, $ IA, JA+K-1, DESCA ) CALL PCHER2K( UPLO, 'No transpose', K-1, KB, CONE, A, IA, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, ONE, A, $ IA, JA, DESCA ) CALL PCHEMM( 'Right', UPLO, K-1, KB, HALF, A, IA+K-1, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, CONE, A, $ IA, JA+K-1, DESCA ) CALL PCTRMM( 'Right', UPLO, 'Conjugate transpose', $ 'Non-unit', K-1, KB, CONE, B, IB+K-1, JB+K-1, $ DESCB, A, IA, JA+K-1, DESCA ) CALL PCHEGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) * K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 30 * ELSE * * Compute L'*sub( A )*L * K = 1 NB = DESCA( MB_ ) KB = MIN( ICEIL( IA, NB )*NB, IA+N-1 ) - IA + 1 * 40 CONTINUE * * Update the lower triangle of A(ia:ia+k+kb-2,ja:ja+k+kb-2) * CALL PCTRMM( 'Right', UPLO, 'No transpose', 'Non-unit', KB, $ K-1, CONE, B, IB, JB, DESCB, A, IA+K-1, JA, $ DESCA ) CALL PCHEMM( 'Left', UPLO, KB, K-1, HALF, A, IA+K-1, JA+K-1, $ DESCA, B, IB+K-1, JB, DESCB, CONE, A, IA+K-1, $ JA, DESCA ) CALL PCHER2K( UPLO, 'Conjugate transpose', K-1, KB, CONE, A, $ IA+K-1, JA, DESCA, B, IB+K-1, JB, DESCB, ONE, $ A, IA, JA, DESCA ) CALL PCHEMM( 'Left', UPLO, KB, K-1, HALF, A, IA+K-1, JA+K-1, $ DESCA, B, IB+K-1, JB, DESCB, CONE, A, IA+K-1, $ JA, DESCA ) CALL PCTRMM( 'Left', UPLO, 'Conjugate transpose', $ 'Non-unit', KB, K-1, CONE, B, IB+K-1, JB+K-1, $ DESCB, A, IA+K-1, JA, DESCA ) CALL PCHEGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) * K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 40 * END IF * END IF * RETURN * * End of PCHEGST * END scalapack-2.0.2/SRC/pchegvx.f000644 000766 000024 00000105753 10377154001 016164 0ustar00juliestaff000000 000000 SUBROUTINE PCHEGVX( IBTYPE, JOBZ, RANGE, UPLO, N, A, IA, JA, $ DESCA, B, IB, JB, DESCB, VL, VU, IL, IU, $ ABSTOL, M, NZ, W, ORFAC, Z, IZ, JZ, DESCZ, $ WORK, LWORK, RWORK, LRWORK, IWORK, LIWORK, $ IFAIL, ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IB, IBTYPE, IL, INFO, IU, IZ, JA, JB, JZ, $ LIWORK, LRWORK, LWORK, M, N, NZ REAL ABSTOL, ORFAC, VL, VU * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCB( * ), DESCZ( * ), $ ICLUSTR( * ), IFAIL( * ), IWORK( * ) REAL GAP( * ), RWORK( * ), W( * ) COMPLEX A( * ), B( * ), WORK( * ), Z( * ) * .. * * Purpose * * ======= * * PCHEGVX computes all the eigenvalues, and optionally, * the eigenvectors * of a complex generalized Hermitian-definite eigenproblem, of the form * sub( A )*x=(lambda)*sub( B )*x, sub( A )*sub( B )x=(lambda)*x, or * sub( B )*sub( A )*x=(lambda)*x. * Here sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ) is assumed to be * Hermitian, and sub( B ) denoting B( IB:IB+N-1, JB:JB+N-1 ) is assumed * to be Hermitian positive definite. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * JOBZ (global input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * = 'U': Upper triangles of sub( A ) and sub( B ) are stored; * = 'L': Lower triangles of sub( A ) and sub( B ) are stored. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix. * * On exit, if JOBZ = 'V', then if INFO = 0, sub( A ) contains * the distributed matrix Z of eigenvectors. The eigenvectors * are normalized as follows: * if IBTYPE = 1 or 2, Z**H*sub( B )*Z = I; * if IBTYPE = 3, Z**H*inv( sub( B ) )*Z = I. * If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') * or the lower triangle (if UPLO='L') of sub( A ), including * the diagonal, is destroyed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PCHEGVX cannot guarantee * correct error reporting. * * B (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( B ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( B ) contains * the upper triangular part of the matrix. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( B ) contains * the lower triangular part of the matrix. * * On exit, if INFO <= N, the part of sub( B ) containing the * matrix is overwritten by the triangular factor U or L from * the Cholesky factorization sub( B ) = U**H*U or * sub( B ) = L*L**H. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * DESCB( CTXT_ ) must equal DESCA( CTXT_ ) * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * ABSTOL (global input) REAL * If JOBZ='V', setting ABSTOL to PSLAMCH( CONTEXT, 'U') yields * the most orthogonal eigenvectors. * * The absolute error tolerance for the eigenvalues. * An approximate eigenvalue is accepted as converged * when it is determined to lie in an interval [a,b] * of width less than or equal to * * ABSTOL + EPS * max( |a|,|b| ) , * * where EPS is the machine precision. If ABSTOL is less than * or equal to zero, then EPS*norm(T) will be used in its place, * where norm(T) is the 1-norm of the tridiagonal matrix * obtained by reducing A to tridiagonal form. * * Eigenvalues will be computed most accurately when ABSTOL is * set to twice the underflow threshold 2*PSLAMCH('S') not zero. * If this routine returns with ((MOD(INFO,2).NE.0) .OR. * (MOD(INFO/8,2).NE.0)), indicating that some eigenvalues or * eigenvectors did not converge, try setting ABSTOL to * 2*PSLAMCH('S'). * * See "Computing Small Singular Values of Bidiagonal Matrices * with Guaranteed High Relative Accuracy," by Demmel and * Kahan, LAPACK Working Note #3. * * See "On the correctness of Parallel Bisection in Floating * Point" by Demmel, Dhillon and Ren, LAPACK Working Note #70 * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M unless the user supplies * insufficient space and PCHEGVX is not able to detect this * before beginning computation. To get all the eigenvectors * requested, the user must supply both sufficient * space to hold the eigenvectors in Z (M .LE. DESCZ(N_)) * and sufficient workspace to compute them. (See LWORK below.) * PCHEGVX is always able to detect insufficient space without * computation unless RANGE .EQ. 'V'. * * W (global output) REAL array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * ORFAC (global input) REAL * Specifies which eigenvectors should be reorthogonalized. * Eigenvectors that correspond to eigenvalues which are within * tol=ORFAC*norm(A) of each other are to be reorthogonalized. * However, if the workspace is insufficient (see LWORK), * tol may be decreased until all eigenvectors to be * reorthogonalized can be stored in one process. * No reorthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) COMPLEX array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. If an eigenvector * fails to converge, then that column of Z contains the latest * approximation to the eigenvector, and the index of the * eigenvector is returned in IFAIL. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * The row index in the global array Z indicating the first * row of sub( Z ). * * JZ (global input) INTEGER * The column index in the global array Z indicating the * first column of sub( Z ). * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX array, * dimension (LWORK) * WORK(1) returns the optimal workspace. * * LWORK (local input) INTEGER * Size of WORK array. If only eigenvalues are requested: * LWORK >= N + MAX( NB * ( NP0 + 1 ), 3 ) * If eigenvectors are requested: * LWORK >= N + ( NP0 + MQ0 + NB ) * NB * with NQ0 = NUMROC( NN, NB, 0, 0, NPCOL ). * * For optimal performance, greater workspace is needed, i.e. * LWORK >= MAX( LWORK, N + NHETRD_LWOPT, * NHEGST_LWOPT ) * Where LWORK is as defined above, and * NHETRD_LWORK = 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 1 ) * NPS * NHEGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB * * NB = DESCA( MB_ ) * NP0 = NUMROC( N, NB, 0, 0, NPROW ) * NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) * SQNPC = SQRT( DBLE( NPROW * NPCOL ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the optimal * size for all work arrays. Each of these values is returned * in the first entry of the correspondingwork array, and no * error message is issued by PXERBLA. * * RWORK (local workspace/output) REAL array, * dimension max(3,LRWORK) * On return, RWORK(1) contains the amount of workspace * required for optimal efficiency * if JOBZ='N' RWORK(1) = optimal amount of workspace * required to compute eigenvalues efficiently * if JOBZ='V' RWORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors * efficiently with no guarantee on orthogonality. * If RANGE='V', it is assumed that all eigenvectors * may be required when computing optimal workspace. * * LRWORK (local input) INTEGER * Size of RWORK * See below for definitions of variables used to define LRWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LRWORK >= 5 * NN + 4 * N * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * LRWORK >= 4*N + MAX( 5*NN, NP0 * MQ0 ) + * ICEIL( NEIG, NPROW*NPCOL)*NN * * The computed eigenvectors may not be orthogonal if the * minimal workspace is supplied and ORFAC is too small. * If you want to guarantee orthogonality (at the cost * of potentially poor performance) you should add * the following to LRWORK: * (CLUSTERSIZE-1)*N * where CLUSTERSIZE is the number of eigenvalues in the * largest cluster, where a cluster is defined as a set of * close eigenvalues: { W(K),...,W(K+CLUSTERSIZE-1) | * W(J+1) <= W(J) + ORFAC*2*norm(A) } * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = DESCZ( MB_ ) = * DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * When LRWORK is too small: * If LRWORK is too small to guarantee orthogonality, * PCHEGVX attempts to maintain orthogonality in * the clusters with the smallest * spacing between the eigenvalues. * If LRWORK is too small to compute all the eigenvectors * requested, no computation is performed and INFO=-25 * is returned. Note that when RANGE='V', PCHEGVX does * not know how many eigenvectors are requested until * the eigenvalues are computed. Therefore, when RANGE='V' * and as long as LRWORK is large enough to allow PCHEGVX to * compute the eigenvalues, PCHEGVX will compute the * eigenvalues and as many eigenvectors as it can. * * Relationship between workspace, orthogonality & performance: * If CLUSTERSIZE >= N/SQRT(NPROW*NPCOL), then providing * enough space to compute all the eigenvectors * orthogonally will cause serious degradation in * performance. In the limit (i.e. CLUSTERSIZE = N-1) * PCSTEIN will perform no better than CSTEIN on 1 processor. * For CLUSTERSIZE = N/SQRT(NPROW*NPCOL) reorthogonalizing * all eigenvectors will increase the total execution time * by a factor of 2 or more. * For CLUSTERSIZE > N/SQRT(NPROW*NPCOL) execution time will * grow as the square of the cluster size, all other factors * remaining equal and assuming enough workspace. Less * workspace means less reorthogonalization but faster * execution. * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * LIWORK >= 6 * NNP * Where: * NNP = MAX( N, NPROW*NPCOL + 1, 4 ) * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (output) INTEGER array, dimension (N) * IFAIL provides additional information when INFO .NE. 0 * If (MOD(INFO/16,2).NE.0) then IFAIL(1) indicates the order of * the smallest minor which is not positive definite. * If (MOD(INFO,2).NE.0) on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * * If neither of the above error conditions hold and JOBZ = 'V', * then the first M elements of IFAIL are set to zero. * * ICLUSTR (global output) integer array, dimension (2*NPROW*NPCOL) * This array contains indices of eigenvectors corresponding to * a cluster of eigenvalues that could not be reorthogonalized * due to insufficient workspace (see LWORK, ORFAC and INFO). * Eigenvectors corresponding to clusters of eigenvalues indexed * ICLUSTR(2*I-1) to ICLUSTR(2*I), could not be * reorthogonalized due to lack of workspace. Hence the * eigenvectors corresponding to these clusters may not be * orthogonal. ICLUSTR() is a zero terminated array. * (ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0) if and only if * K is the number of clusters * ICLUSTR is not referenced if JOBZ = 'N' * * GAP (global output) REAL array, * dimension (NPROW*NPCOL) * This array contains the gap between eigenvalues whose * eigenvectors could not be reorthogonalized. The output * values in this array correspond to the clusters indicated * by the array ICLUSTR. As a result, the dot product between * eigenvectors correspoding to the I^th cluster may be as high * as ( C * n ) / GAP(I) where C is a small constant. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: if (MOD(INFO,2).NE.0), then one or more eigenvectors * failed to converge. Their indices are stored * in IFAIL. Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding * to one or more clusters of eigenvalues could not be * reorthogonalized because of insufficient workspace. * The indices of the clusters are stored in the array * ICLUSTR. * if (MOD(INFO/4,2).NE.0), then space limit prevented * PCHEGVX from computing all of the eigenvectors * between VL and VU. The number of eigenvectors * computed is returned in NZ. * if (MOD(INFO/8,2).NE.0), then PCSTEBZ failed to * compute eigenvalues. * Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/16,2).NE.0), then B was not positive * definite. IFAIL(1) indicates the order of * the smallest minor which is not positive definite. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*), C(IC:IC+M-1,JC:JC+N-1), * and B( IB:IB+N-1, JB:JB+N-1 ) must verify some alignment properties, * namely the following expressions should be true: * * DESCA(MB_) = DESCA(NB_) * IA = IB = IZ * JA = IB = JZ * DESCA(M_) = DESCB(M_) =DESCZ(M_) * DESCA(N_) = DESCB(N_)= DESCZ(N_) * DESCA(MB_) = DESCB(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCB(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCB(RSRC_) = DESCZ(RSRC_) * DESCA(CSRC_) = DESCB(CSRC_) = DESCZ(CSRC_) * MOD( IA-1, DESCA( MB_ ) ) = 0 * MOD( JA-1, DESCA( NB_ ) ) = 0 * MOD( IB-1, DESCB( MB_ ) ) = 0 * MOD( JB-1, DESCB( NB_ ) ) = 0 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = 1.0E+0 ) REAL FIVE, ZERO PARAMETER ( FIVE = 5.0E+0, ZERO = 0.0E+0 ) INTEGER IERRNPD PARAMETER ( IERRNPD = 16 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LQUERY, UPPER, VALEIG, WANTZ CHARACTER TRANS INTEGER ANB, IACOL, IAROW, IBCOL, IBROW, ICOFFA, $ ICOFFB, ICTXT, IROFFA, IROFFB, LIWMIN, LRWMIN, $ LRWOPT, LWMIN, LWOPT, MQ0, MYCOL, MYROW, NB, $ NEIG, NHEGST_LWOPT, NHETRD_LWOPT, NN, NP0, $ NPCOL, NPROW, NPS, NQ0, SQNPC REAL EPS, SCALE * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV REAL PSLAMCH EXTERNAL LSAME, ICEIL, INDXG2P, NUMROC, PJLAENV, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHEEVX, PCHENGST, $ PCHK1MAT, PCHK2MAT, PCPOTRF, PCTRMM, PCTRSM, $ PXERBLA, SGEBR2D, SGEBS2D, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, DBLE, ICHAR, INT, MAX, MIN, MOD, $ REAL, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 900+CTXT_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2600+CTXT_ ) ELSE * * Get machine constants. * EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) * WANTZ = LSAME( JOBZ, 'V' ) UPPER = LSAME( UPLO, 'U' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, N, 4, IB, JB, DESCB, 13, INFO ) CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 26, INFO ) IF( INFO.EQ.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN RWORK( 1 ) = ABSTOL IF( VALEIG ) THEN RWORK( 2 ) = VL RWORK( 3 ) = VU ELSE RWORK( 2 ) = ZERO RWORK( 3 ) = ZERO END IF CALL SGEBS2D( DESCA( CTXT_ ), 'ALL', ' ', 3, 1, RWORK, $ 3 ) ELSE CALL SGEBR2D( DESCA( CTXT_ ), 'ALL', ' ', 3, 1, RWORK, 3, $ 0, 0 ) END IF IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) * * Compute the total amount of space needed * LQUERY = .FALSE. IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) $ LQUERY = .TRUE. * LIWMIN = 6*MAX( N, ( NPROW*NPCOL )+1, 4 ) * NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * IF( ( .NOT.WANTZ ) .OR. ( VALEIG .AND. ( .NOT.LQUERY ) ) ) $ THEN LWMIN = N + MAX( NB*( NP0+1 ), 3 ) LWOPT = LWMIN LRWMIN = 5*NN + 4*N IF( WANTZ ) THEN MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) LRWOPT = 4*N + MAX( 5*NN, NP0*MQ0 ) ELSE LRWOPT = LRWMIN END IF NEIG = 0 ELSE IF( ALLEIG .OR. VALEIG ) THEN NEIG = N ELSE IF( INDEIG ) THEN NEIG = IU - IL + 1 END IF MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) LWMIN = N + ( NP0+MQ0+NB )*NB LWOPT = LWMIN LRWMIN = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN LRWOPT = LRWMIN * END IF * * Compute how much workspace is needed to use the * new TRD and GST algorithms * ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS NB = DESCA( MB_ ) NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) NHEGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB LWOPT = MAX( LWOPT, N+NHETRD_LWOPT, NHEGST_LWOPT ) * * Version 1.0 Limitations * IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -2 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -3 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -5 ELSE IF( IROFFA.NE.0 ) THEN INFO = -7 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -8 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 900+NB_ ) ELSE IF( DESCA( M_ ).NE.DESCB( M_ ) ) THEN INFO = -( 1300+M_ ) ELSE IF( DESCA( N_ ).NE.DESCB( N_ ) ) THEN INFO = -( 1300+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1300+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCB( NB_ ) ) THEN INFO = -( 1300+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCB( RSRC_ ) ) THEN INFO = -( 1300+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCB( CSRC_ ) ) THEN INFO = -( 1300+CSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2200+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2200+CSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2200+CTXT_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -11 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -12 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -15 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -16 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -17 ELSE IF( VALEIG .AND. ( ABS( RWORK( 2 )-VL ).GT.FIVE*EPS* $ ABS( VL ) ) ) THEN INFO = -14 ELSE IF( VALEIG .AND. ( ABS( RWORK( 3 )-VU ).GT.FIVE*EPS* $ ABS( VU ) ) ) THEN INFO = -15 ELSE IF( ABS( RWORK( 1 )-ABSTOL ).GT.FIVE*EPS* $ ABS( ABSTOL ) ) THEN INFO = -18 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -28 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -30 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -32 END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( WANTZ ) THEN IDUM1( 2 ) = ICHAR( 'V' ) ELSE IDUM1( 2 ) = ICHAR( 'N' ) END IF IDUM2( 2 ) = 2 IF( UPPER ) THEN IDUM1( 3 ) = ICHAR( 'U' ) ELSE IDUM1( 3 ) = ICHAR( 'L' ) END IF IDUM2( 3 ) = 3 IF( ALLEIG ) THEN IDUM1( 4 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 4 ) = ICHAR( 'I' ) ELSE IDUM1( 4 ) = ICHAR( 'V' ) END IF IDUM2( 4 ) = 4 IF( LQUERY ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 5 CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 9, N, 4, N, 4, IB, $ JB, DESCB, 13, 5, IDUM1, IDUM2, INFO ) CALL PCHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 26, 0, IDUM1, IDUM2, $ INFO ) END IF * IWORK( 1 ) = LIWMIN WORK( 1 ) = CMPLX( REAL( LWOPT ) ) RWORK( 1 ) = REAL( LRWOPT ) * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHEGVX ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Form a Cholesky factorization of sub( B ). * CALL PCPOTRF( UPLO, N, B, IB, JB, DESCB, INFO ) IF( INFO.NE.0 ) THEN IWORK( 1 ) = LIWMIN WORK( 1 ) = CMPLX( REAL( LWOPT ) ) RWORK( 1 ) = REAL( LRWOPT ) IFAIL( 1 ) = INFO INFO = IERRNPD RETURN END IF * * Transform problem to standard eigenvalue problem and solve. * CALL PCHENGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, WORK, LWORK, INFO ) CALL PCHEEVX( JOBZ, RANGE, UPLO, N, A, IA, JA, DESCA, VL, VU, IL, $ IU, ABSTOL, M, NZ, W, ORFAC, Z, IZ, JZ, DESCZ, WORK, $ LWORK, RWORK, LRWORK, IWORK, LIWORK, IFAIL, ICLUSTR, $ GAP, INFO ) * IF( WANTZ ) THEN * * Backtransform eigenvectors to the original problem. * NEIG = M IF( IBTYPE.EQ.1 .OR. IBTYPE.EQ.2 ) THEN * * For sub( A )*x=(lambda)*sub( B )*x and * sub( A )*sub( B )*x=(lambda)*x; backtransform eigenvectors: * x = inv(L)'*y or inv(U)*y * IF( UPPER ) THEN TRANS = 'N' ELSE TRANS = 'C' END IF * CALL PCTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, $ B, IB, JB, DESCB, Z, IZ, JZ, DESCZ ) * ELSE IF( IBTYPE.EQ.3 ) THEN * * For sub( B )*sub( A )*x=(lambda)*x; * backtransform eigenvectors: x = L*y or U'*y * IF( UPPER ) THEN TRANS = 'C' ELSE TRANS = 'N' END IF * CALL PCTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, $ B, IB, JB, DESCB, Z, IZ, JZ, DESCZ ) END IF END IF * IF( SCALE.NE.ONE ) THEN CALL SSCAL( N, SCALE, W, 1 ) END IF * IWORK( 1 ) = LIWMIN WORK( 1 ) = CMPLX( REAL( LWOPT ) ) RWORK( 1 ) = REAL( LRWOPT ) RETURN * * End of PCHEGVX * END scalapack-2.0.2/SRC/pchengst.f000644 000766 000024 00000041700 10363532303 016322 0ustar00juliestaff000000 000000 SUBROUTINE PCHENGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, LWORK, N REAL SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ), WORK( * ) * .. * * Purpose * * ======= * * PCHENGST reduces a complex Hermitian-definite generalized * eigenproblem to standard form. * * PCHENGST performs the same function as PCHEGST, but is based on * rank 2K updates, which are faster and more scalable than * triangular solves (the basis of PCHENGST). * * PCHENGST calls PCHEGST when UPLO='U', hence PCHENGST provides * improved performance only when UPLO='L', IBTYPE=1. * * PCHENGST also calls PCHEGST when insufficient workspace is * provided, hence PCHENGST provides improved * performance only when LWORK >= 2 * NP0 * NB + NQ0 * NB + NB * NB * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**H or L**H*sub( A )*L. * * sub( B ) must have been previously factorized as U**H*U or L*L**H by * PCPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H); * = 2 or 3: compute U*sub( A )*U**H or L**H*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**H*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**H. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PCPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * SCALE (global output) REAL * Amount by which the eigenvalues should be scaled to * compensate for the scaling performed in this routine. * At present, SCALE is always returned as 1.0, it is * returned here to allow for future enhancement. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP0 +1 ), 3 * NB ) * * When IBTYPE = 1 and UPLO = 'L', PCHENGST provides improved * performance when LWORK >= 2 * NP0 * NB + NQ0 * NB + NB * NB * * where NB = MB_A = NB_A, * NP0 = NUMROC( N, NB, 0, 0, NPROW ), * NQ0 = NUMROC( N, NB, 0, 0, NPROW ), * * NUMROC ia a ScaLAPACK tool functions * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the * optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * * .. Parameters .. COMPLEX ONEHALF, ONE, MONE REAL RONE PARAMETER ( ONEHALF = ( 0.5E0, 0.0E0 ), $ ONE = ( 1.0E0, 0.0E0 ), $ MONE = ( -1.0E0, 0.0E0 ), RONE = 1.0E0 ) INTEGER DLEN_, CTXT_, MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( DLEN_ = 9, CTXT_ = 2, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER I, IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, INDAA, INDG, INDR, INDRT, IROFFA, $ IROFFB, J, K, KB, LWMIN, LWOPT, MYCOL, MYROW, $ NB, NP0, NPCOL, NPK, NPROW, NQ0, POSTK * .. * .. Local Arrays .. INTEGER DESCAA( DLEN_ ), DESCG( DLEN_ ), $ DESCR( DLEN_ ), DESCRT( DLEN_ ), IDUM1( 2 ), $ IDUM2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL LSAME, INDXG2P, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCGEMM, $ PCHEGST, PCHEMM, PCHER2K, PCHK2MAT, PCLACPY, $ PCTRSM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) SCALE = 1.0E0 * NB = DESCA( MB_ ) * * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) LWMIN = MAX( NB*( NP0+1 ), 3*NB ) IF( IBTYPE.EQ.1 .AND. .NOT.UPPER ) THEN LWOPT = 2*NP0*NB + NQ0*NB + NB*NB ELSE LWOPT = LWMIN END IF WORK( 1 ) = CMPLX( REAL( LWOPT ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IB, $ JB, DESCB, 11, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHENGST', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * IF( IBTYPE.NE.1 .OR. UPPER .OR. LWORK.LT.LWOPT ) THEN CALL PCHEGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, INFO ) RETURN END IF * CALL DESCSET( DESCG, N, NB, NB, NB, IAROW, IACOL, ICTXT, NP0 ) CALL DESCSET( DESCR, N, NB, NB, NB, IAROW, IACOL, ICTXT, NP0 ) CALL DESCSET( DESCRT, NB, N, NB, NB, IAROW, IACOL, ICTXT, NB ) CALL DESCSET( DESCAA, NB, NB, NB, NB, IAROW, IACOL, ICTXT, NB ) * INDG = 1 INDR = INDG + DESCG( LLD_ )*NB INDAA = INDR + DESCR( LLD_ )*NB INDRT = INDAA + DESCAA( LLD_ )*NB * DO 30 K = 1, N, NB * KB = MIN( N-K+1, NB ) POSTK = K + KB NPK = N - POSTK + 1 * * CALL PCLACPY( 'A', N-POSTK+1, KB, B, POSTK+IB-1, K+JB-1, DESCB, $ WORK( INDG ), POSTK, 1, DESCG ) CALL PCLACPY( 'A', N-POSTK+1, KB, A, POSTK+IA-1, K+JA-1, DESCA, $ WORK( INDR ), POSTK, 1, DESCR ) CALL PCLACPY( 'A', KB, K-1, A, K+IA-1, JA, DESCA, $ WORK( INDRT ), 1, 1, DESCRT ) * CALL PCLACPY( 'L', KB, KB, A, K+IA-1, K+JA-1, DESCA, $ WORK( INDR ), K, 1, DESCR ) CALL PCTRSM( 'Right', 'L', 'N', 'N', NPK, KB, MONE, B, K+IB-1, $ K+JB-1, DESCB, WORK( INDG ), POSTK, 1, DESCG ) * CALL PCHEMM( 'Right', 'L', NPK, KB, ONEHALF, A, K+IA-1, K+JA-1, $ DESCA, WORK( INDG ), POSTK, 1, DESCG, ONE, $ WORK( INDR ), POSTK, 1, DESCR ) * CALL PCHER2K( 'Lower', 'No T', NPK, KB, ONE, WORK( INDG ), $ POSTK, 1, DESCG, WORK( INDR ), POSTK, 1, DESCR, $ RONE, A, POSTK+IA-1, POSTK+JA-1, DESCA ) * CALL PCGEMM( 'No T', 'No Conj', NPK, K-1, KB, ONE, $ WORK( INDG ), POSTK, 1, DESCG, WORK( INDRT ), 1, $ 1, DESCRT, ONE, A, POSTK+IA-1, JA, DESCA ) * CALL PCHEMM( 'Right', 'L', NPK, KB, ONE, WORK( INDR ), K, 1, $ DESCR, WORK( INDG ), POSTK, 1, DESCG, ONE, A, $ POSTK+IA-1, K+JA-1, DESCA ) * CALL PCTRSM( 'Left', 'Lower', 'No Conj', 'Non-unit', KB, K-1, $ ONE, B, K+IB-1, K+JB-1, DESCB, A, K+IA-1, JA, $ DESCA ) * CALL PCLACPY( 'L', KB, KB, A, K+IA-1, K+JA-1, DESCA, $ WORK( INDAA ), 1, 1, DESCAA ) * IF( MYROW.EQ.DESCAA( RSRC_ ) .AND. MYCOL.EQ.DESCAA( CSRC_ ) ) $ THEN DO 20 I = 1, KB DO 10 J = 1, I WORK( INDAA+J-1+( I-1 )*DESCAA( LLD_ ) ) $ = CONJG( WORK( INDAA+I-1+( J-1 )*DESCAA( LLD_ ) ) ) 10 CONTINUE 20 CONTINUE END IF * CALL PCTRSM( 'Left', 'Lower', 'No Conj', 'Non-unit', KB, KB, $ ONE, B, K+IB-1, K+JB-1, DESCB, WORK( INDAA ), 1, $ 1, DESCAA ) * CALL PCTRSM( 'Right', 'Lower', 'Conj', 'Non-unit', KB, KB, ONE, $ B, K+IB-1, K+JB-1, DESCB, WORK( INDAA ), 1, 1, $ DESCAA ) * CALL PCLACPY( 'L', KB, KB, WORK( INDAA ), 1, 1, DESCAA, A, $ K+IA-1, K+JA-1, DESCA ) * CALL PCTRSM( 'Right', 'Lower', 'Conj', 'Non-unit', NPK, KB, $ ONE, B, K+IB-1, K+JB-1, DESCB, A, POSTK+IA-1, $ K+JA-1, DESCA ) * DESCR( CSRC_ ) = MOD( DESCR( CSRC_ )+1, NPCOL ) DESCG( CSRC_ ) = MOD( DESCG( CSRC_ )+1, NPCOL ) DESCRT( RSRC_ ) = MOD( DESCRT( RSRC_ )+1, NPROW ) DESCAA( RSRC_ ) = MOD( DESCAA( RSRC_ )+1, NPROW ) DESCAA( CSRC_ ) = MOD( DESCAA( CSRC_ )+1, NPCOL ) 30 CONTINUE * WORK( 1 ) = CMPLX( REAL( LWOPT ) ) * RETURN END scalapack-2.0.2/SRC/pchentrd.f000644 000766 000024 00000053157 10363532303 016327 0ustar00juliestaff000000 000000 SUBROUTINE PCHENTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LRWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL D( * ), E( * ), RWORK( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * Bugs * ==== * * * Support for UPLO='U' is limited to calling the old, slow, PCHETRD * code. * * * Purpose * * ======= * * PCHENTRD is a prototype version of PCHETRD which uses tailored * codes (either the serial, CHETRD, or the parallel code, PCHETTRD) * when the workspace provided by the user is adequate. * * * PCHENTRD reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Features * ======== * * PCHENTRD is faster than PCHETRD on almost all matrices, * particularly small ones (i.e. N < 500 * sqrt(P) ), provided that * enough workspace is available to use the tailored codes. * * The tailored codes provide performance that is essentially * independent of the input data layout. * * The tailored codes place no restrictions on IA, JA, MB or NB. * At present, IA, JA, MB and NB are restricted to those values allowed * by PCHETRD to keep the interface simple. These restrictions are * documented below. (Search for "restrictions".) * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK( 1 ) returns the optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP +1 ), 3 * NB ) * * For optimal performance, greater workspace is needed, i.e. * LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS + 4 ) * NPS * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) * SQNPC = INT( SQRT( REAL( NPROW * NPCOL ) ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * * RWORK (local workspace/local output) COMPLEX array, * dimension (LRWORK) * On exit, RWORK( 1 ) returns the optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= 1 * * For optimal performance, greater workspace is needed, i.e. * LRWORK >= MAX( 2 * N ) * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA .AND. IROFFA.EQ.0 ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER COLCTOP, ROWCTOP INTEGER ANB, CTXTB, I, IACOL, IAROW, ICOFFA, ICTXT, $ IINFO, INDB, INDRD, INDRE, INDTAU, INDW, IPW, $ IROFFA, J, JB, JX, K, KK, LLRWORK, LLWORK, $ LRWMIN, LWMIN, MINSZ, MYCOL, MYCOLB, MYROW, $ MYROWB, NB, NP, NPCOL, NPCOLB, NPROW, NPROWB, $ NPS, NQ, ONEPMIN, ONEPRMIN, SQNPC, TTLRWMIN, $ TTLWMIN * .. * .. Local Arrays .. INTEGER DESCB( DLEN_ ), DESCW( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, CHETRD, CHK1MAT, DESCSET, $ IGAMN2D, PCELSET, PCHER2K, PCHETD2, PCHETTRD, $ PCHK1MAT, PCLAMR1D, PCLATRD, PCTRMR2D, $ PSLAMR1D, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC, PJLAENV EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC, PJLAENV * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, INT, MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = MAX( 1, NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) LWMIN = MAX( ( NP+1 )*NB, 3*NB ) ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) MINSZ = PJLAENV( ICTXT, 5, 'PCHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( REAL( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) TTLWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS LRWMIN = 1 TTLRWMIN = 2*NPS * WORK( 1 ) = CMPLX( REAL( TTLWMIN ) ) RWORK( 1 ) = REAL( TTLRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 * * The following two restrictions are not necessary provided * that either of the tailored codes are used. * ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 600+NB_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 11 IF( LRWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 13 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHENTRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * ONEPMIN = N*N + 3*N + 1 LLWORK = LWORK CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LLWORK, 1, 1, -1, -1, -1, $ -1 ) * ONEPRMIN = 2*N LLRWORK = LRWORK CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LLRWORK, 1, 1, -1, -1, -1, $ -1 ) * * * Use the serial, LAPACK, code: CTRD on small matrices if we * we have enough space. * NPROWB = 0 IF( ( N.LT.MINSZ .OR. SQNPC.EQ.1 ) .AND. LLWORK.GE.ONEPMIN .AND. $ LLRWORK.GE.ONEPRMIN .AND. .NOT.UPPER ) THEN NPROWB = 1 NPS = N ELSE IF( LLWORK.GE.TTLWMIN .AND. LLRWORK.GE.TTLRWMIN .AND. .NOT. $ UPPER ) THEN NPROWB = SQNPC END IF END IF * IF( NPROWB.GE.1 ) THEN NPCOLB = NPROWB SQNPC = NPROWB INDB = 1 INDRD = 1 INDRE = INDRD + NPS INDTAU = INDB + NPS*NPS INDW = INDTAU + NPS LLWORK = LLWORK - INDW + 1 * CALL BLACS_GET( ICTXT, 10, CTXTB ) CALL BLACS_GRIDINIT( CTXTB, 'Row major', SQNPC, SQNPC ) CALL BLACS_GRIDINFO( CTXTB, NPROWB, NPCOLB, MYROWB, MYCOLB ) CALL DESCSET( DESCB, N, N, 1, 1, 0, 0, CTXTB, NPS ) * CALL PCTRMR2D( UPLO, 'N', N, N, A, IA, JA, DESCA, WORK( INDB ), $ 1, 1, DESCB, ICTXT ) * * * Only those processors in context CTXTB are needed for a while * IF( NPROWB.GT.0 ) THEN * IF( NPROWB.EQ.1 ) THEN CALL CHETRD( UPLO, N, WORK( INDB ), NPS, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), $ WORK( INDW ), LLWORK, INFO ) ELSE * CALL PCHETTRD( 'L', N, WORK( INDB ), 1, 1, DESCB, $ RWORK( INDRD ), RWORK( INDRE ), $ WORK( INDTAU ), WORK( INDW ), LLWORK, $ INFO ) * END IF END IF * * All processors participate in moving the data back to the * way that PCHENTRD expects it. * CALL PSLAMR1D( N-1, RWORK( INDRE ), 1, 1, DESCB, E, 1, JA, $ DESCA ) * CALL PSLAMR1D( N, RWORK( INDRD ), 1, 1, DESCB, D, 1, JA, $ DESCA ) * CALL PCLAMR1D( N, WORK( INDTAU ), 1, 1, DESCB, TAU, 1, JA, $ DESCA ) * CALL PCTRMR2D( UPLO, 'N', N, N, WORK( INDB ), 1, 1, DESCB, A, $ IA, JA, DESCA, ICTXT ) * IF( MYROWB.GE.0 ) $ CALL BLACS_GRIDEXIT( CTXTB ) * ELSE * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPW = NP*NB + 1 * IF( UPPER ) THEN * * Reduce the upper triangle of sub( A ). * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, $ INDXG2P( JA+N-KK, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ), ICTXT, MAX( 1, NP ) ) * DO 10 K = N - KK + 1, NB + 1, -NB JB = MIN( N-K+1, NB ) I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part of * the matrix * CALL PCLATRD( UPLO, K+JB-1, JB, A, IA, JA, DESCA, D, E, $ TAU, WORK, 1, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(IA:I-1,JA:J-1), using an * update of the form: * A(IA:I-1,JA:J-1) := A(IA:I-1,JA:J-1) - V*W' - W*V' * CALL PCHER2K( UPLO, 'No transpose', K-1, JB, -CONE, A, $ IA, J, DESCA, WORK, 1, 1, DESCW, ONE, A, $ IA, JA, DESCA ) * * Copy last superdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J, NB, 0, IACOL, NPCOL ), NQ ) CALL PCELSET( A, I-1, J, DESCA, CMPLX( E( JX ) ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+NPCOL-1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PCHETD2( UPLO, MIN( N, NB ), A, IA, JA, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) * ELSE * * Reduce the lower triangle of sub( A ) * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) * DO 20 K = 1, N - NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part * of the matrix * CALL PCLATRD( UPLO, N-K+1, NB, A, I, J, DESCA, D, E, TAU, $ WORK, K, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(I+NB:IA+N-1,I+NB:IA+N-1), * using an update of the form: A(I+NB:IA+N-1,I+NB:IA+N-1) := * A(I+NB:IA+N-1,I+NB:IA+N-1) - V*W' - W*V' * CALL PCHER2K( UPLO, 'No transpose', N-K-NB+1, NB, -CONE, $ A, I+NB, J, DESCA, WORK, K+NB, 1, DESCW, $ ONE, A, I+NB, J+NB, DESCA ) * * Copy last subdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J+NB-1, NB, 0, IACOL, NPCOL ), NQ ) CALL PCELSET( A, I+NB, J+NB-1, DESCA, CMPLX( E( JX ) ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * 20 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PCHETD2( UPLO, KK, A, IA+K-1, JA+K-1, DESCA, D, E, TAU, $ WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * END IF * WORK( 1 ) = CMPLX( REAL( TTLWMIN ) ) RWORK( 1 ) = REAL( TTLRWMIN ) * RETURN * * End of PCHENTRD * END scalapack-2.0.2/SRC/pchetd2.f000644 000766 000024 00000043063 10363532303 016044 0ustar00juliestaff000000 000000 SUBROUTINE PCHETD2( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL D( * ), E( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCHETD2 reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*N. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX HALF, ONE, ZERO PARAMETER ( HALF = ( 0.5E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER IACOL, IAROW, ICOFFA, ICTXT, II, IK, IROFFA, J, $ JJ, JK, JN, LDA, LWMIN, MYCOL, MYROW, NPCOL, $ NPROW COMPLEX ALPHA, TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CAXPY, CGEBR2D, $ CGEBS2D, CHK1MAT, CHEMV, $ CHER2, CLARFG, INFOG2L, PXERBLA * .. * .. External Functions .. LOGICAL LSAME COMPLEX CDOTC EXTERNAL LSAME, CDOTC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) LWMIN = 3 * N * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHETD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Compute local information * LDA = DESCA( LLD_ ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) * IF( UPPER ) THEN * * Process(IAROW, IACOL) owns block to be reduced * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Reduce the upper triangle of sub( A ) * IK = II+N-1+(JJ+N-2)*LDA A( IK ) = REAL( A( IK ) ) DO 10 J = N-1, 1, -1 IK = II + J - 1 JK = JJ + J - 1 * * Generate elementary reflector H(i) = I - tau * v * v' * to annihilate A(IA:IA+J-1,JA:JA+J-1) * ALPHA = A( IK+JK*LDA ) CALL CLARFG( J, ALPHA, A( II+JK*LDA ), 1, TAUI ) E( JK+1 ) = REAL( ALPHA ) * IF( TAUI.NE.ZERO ) THEN * * Apply H(i) from both sides to * A(IA:IA+J-1,JA:JA+J-1) * A( IK+JK*LDA ) = ONE * * Compute x := tau * A * v storing x in TAU(1:i) * CALL CHEMV( UPLO, J, TAUI, A( II+(JJ-1)*LDA ), $ LDA, A( II+JK*LDA ), 1, ZERO, $ TAU( JJ ), 1 ) * * Compute w := x - 1/2 * tau * (x'*v) * v * ALPHA = -HALF*TAUI*CDOTC( J, TAU( JJ ), 1, $ A( II+JK*LDA ), 1 ) CALL CAXPY( J, ALPHA, A( II+JK*LDA ), 1, $ TAU( JJ ), 1 ) * * Apply the transformation as a rank-2 update: * A := A - v * w' - w * v' * CALL CHER2( UPLO, J, -ONE, A( II+JK*LDA ), 1, $ TAU( JJ ), 1, A( II+(JJ-1)*LDA ), $ LDA ) END IF * * Copy D, E, TAU to broadcast them columnwise. * A( IK+JK*LDA ) = CMPLX( E( JK+1 ) ) D( JK+1 ) = REAL( A( IK+1+JK*LDA ) ) WORK( J+1 ) = CMPLX( D( JK+1 ) ) WORK( N+J+1 ) = CMPLX( E( JK+1 ) ) TAU( JK+1 ) = TAUI WORK( 2*N+J+1 ) = TAU( JK+1 ) * 10 CONTINUE D( JJ ) = REAL( A( II+(JJ-1)*LDA ) ) WORK( 1 ) = CMPLX( D( JJ ) ) WORK( N+1 ) = ZERO WORK( 2*N+1 ) = ZERO * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1 ) * ELSE CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1, $ IAROW, IACOL ) DO 20 J = 2, N JN = JJ + J - 1 D( JN ) = REAL( WORK( J ) ) E( JN ) = REAL( WORK( N+J ) ) TAU( JN ) = WORK( 2*N+J ) 20 CONTINUE D( JJ ) = REAL( WORK( 1 ) ) END IF END IF * ELSE * * Process (IAROW, IACOL) owns block to be factorized * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Reduce the lower triangle of sub( A ) * A( II+(JJ-1)*LDA ) = REAL( A( II+(JJ-1)*LDA ) ) DO 30 J = 1, N - 1 IK = II + J - 1 JK = JJ + J - 1 * * Generate elementary reflector H(i) = I - tau * v * v' * to annihilate A(IA+J-JA+2:IA+N-1,JA+J-1) * ALPHA = A( IK+1+(JK-1)*LDA ) CALL CLARFG( N-J, ALPHA, A( IK+2+(JK-1)*LDA ), 1, $ TAUI ) E( JK ) = REAL( ALPHA ) * IF( TAUI.NE.ZERO ) THEN * * Apply H(i) from both sides to * A(IA+J-JA+1:IA+N-1,JA+J+1:JA+N-1) * A( IK+1+(JK-1)*LDA ) = ONE * * Compute x := tau * A * v storing y in TAU(i:n-1) * CALL CHEMV( UPLO, N-J, TAUI, A( IK+1+JK*LDA ), $ LDA, A( IK+1+(JK-1)*LDA ), 1, $ ZERO, TAU( JK ), 1 ) * * Compute w := x - 1/2 * tau * (x'*v) * v * ALPHA = -HALF*TAUI*CDOTC( N-J, TAU( JK ), 1, $ A( IK+1+(JK-1)*LDA ), 1 ) CALL CAXPY( N-J, ALPHA, A( IK+1+(JK-1)*LDA ), $ 1, TAU( JK ), 1 ) * * Apply the transformation as a rank-2 update: * A := A - v * w' - w * v' * CALL CHER2( UPLO, N-J, -ONE, $ A( IK+1+(JK-1)*LDA ), 1, $ TAU( JK ), 1, A( IK+1+JK*LDA ), $ LDA ) END IF * * Copy D(JK), E(JK), TAU(JK) to broadcast them * columnwise. * A( IK+1+(JK-1)*LDA ) = CMPLX( E( JK ) ) D( JK ) = REAL( A( IK+(JK-1)*LDA ) ) WORK( J ) = CMPLX( D( JK ) ) WORK( N+J ) = CMPLX( E( JK ) ) TAU( JK ) = TAUI WORK( 2*N+J ) = TAU( JK ) 30 CONTINUE JN = JJ + N - 1 D( JN ) = REAL( A( II+N-1+(JN-1)*LDA ) ) WORK( N ) = CMPLX( D( JN ) ) TAU( JN ) = ZERO WORK( 2*N ) = ZERO * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK, $ 1 ) * ELSE CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK, $ 1, IAROW, IACOL ) DO 40 J = 1, N - 1 JN = JJ + J - 1 D( JN ) = REAL( WORK( J ) ) E( JN ) = REAL( WORK( N+J ) ) TAU( JN ) = WORK( 2*N+J ) 40 CONTINUE JN = JJ + N - 1 D( JN ) = REAL( WORK( N ) ) TAU( JN ) = ZERO END IF END IF END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCHETD2 * END scalapack-2.0.2/SRC/pchetrd.f000644 000766 000024 00000040336 10363532303 016144 0ustar00juliestaff000000 000000 SUBROUTINE PCHETRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL D( * ), E( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCHETRD reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP +1 ), 3 * NB ) * * where NB = MB_A = NB_A, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA .AND. IROFFA.EQ.0 ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, IINFO, IPW, $ IROFFA, J, JB, JX, K, KK, LWMIN, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHER2K, $ PCHETD2, PCHK1MAT, PCLATRD, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = MAX( 1, NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) LWMIN = MAX( (NP+1)*NB, 3*NB ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 11 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHETRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPW = NP * NB + 1 * IF( UPPER ) THEN * * Reduce the upper triangle of sub( A ). * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, INDXG2P( JA+N-KK, $ NB, MYCOL, DESCA( CSRC_ ), NPCOL ), ICTXT, $ MAX( 1, NP ) ) * DO 10 K = N-KK+1, NB+1, -NB JB = MIN( N-K+1, NB ) I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part of * the matrix * CALL PCLATRD( UPLO, K+JB-1, JB, A, IA, JA, DESCA, D, E, TAU, $ WORK, 1, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(IA:I-1,JA:J-1), using an * update of the form: * A(IA:I-1,JA:J-1) := A(IA:I-1,JA:J-1) - V*W' - W*V' * CALL PCHER2K( UPLO, 'No transpose', K-1, JB, -CONE, A, IA, $ J, DESCA, WORK, 1, 1, DESCW, ONE, A, IA, JA, $ DESCA ) * * Copy last superdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J, NB, 0, IACOL, NPCOL ), NQ ) CALL PCELSET( A, I-1, J, DESCA, CMPLX( E( JX ) ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PCHETD2( UPLO, MIN( N, NB ), A, IA, JA, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) * ELSE * * Reduce the lower triangle of sub( A ) * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) * DO 20 K = 1, N-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part * of the matrix * CALL PCLATRD( UPLO, N-K+1, NB, A, I, J, DESCA, D, E, TAU, $ WORK, K, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(I+NB:IA+N-1,I+NB:IA+N-1), * using an update of the form: A(I+NB:IA+N-1,I+NB:IA+N-1) := * A(I+NB:IA+N-1,I+NB:IA+N-1) - V*W' - W*V' * CALL PCHER2K( UPLO, 'No transpose', N-K-NB+1, NB, -CONE, A, $ I+NB, J, DESCA, WORK, K+NB, 1, DESCW, ONE, A, $ I+NB, J+NB, DESCA ) * * Copy last subdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J+NB-1, NB, 0, IACOL, NPCOL ), NQ ) CALL PCELSET( A, I+NB, J+NB-1, DESCA, CMPLX( E( JX ) ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + 1, NPCOL ) * 20 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PCHETD2( UPLO, KK, A, IA+K-1, JA+K-1, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCHETRD * END scalapack-2.0.2/SRC/pchettrd.f000644 000766 000024 00000123277 11750130340 016332 0ustar00juliestaff000000 000000 SUBROUTINE PCHETTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL D( * ), E( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * * ======= * * PCHETTRD reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding * process and memory location. * * Let A be a generic term for any 2D block cyclicly distributed * array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- ----------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, * indicating the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to * distribute the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to * distribute the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the * first row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCp(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCp( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCq( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes * of its process row. * The values of LOCp() and LOCq() may be determined via a call to * the ScaLAPACK tool function, NUMROC: * LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCq(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dim LOCq(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dim LOCq(JA+N-1) * if UPLO = 'U', LOCq(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCq(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal workspace * * LWORK (local input) INTEGER * The dimension of the array WORK. * LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + NPS * Where: * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) * ANB = PJLAENV( DESCA( CTXT_ ), 3, 'PCHETTRD', 'L', 0, 0, * 0, 0 ) * * NUMROC is a ScaLAPACK tool function; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of * elementary reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of * elementary reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and * vi denotes an element of the vector defining H(i). * * Data storage requirements * ========================= * * PCHETTRD is not intended to be called directly. All users are * encourage to call PCHETRD which will then call PCHETTRD if * appropriate. A must be in cyclic format (i.e. MB = NB = 1), * the process grid must be square ( i.e. NPROW = NPCOL ) and * only lower triangular storage is supported. * * Local variables * =============== * * PCHETTRD uses five local arrays: * WORK ( InV ) dimension ( NP, ANB+1): array V * WORK ( InH ) dimension ( NP, ANB+1): array H * WORK ( InVT ) dimension ( NQ, ANB+1): transpose of the array V * WORK ( InHT ) dimension ( NQ, ANB+1): transpose of the array H * WORK ( InVTT ) dimension ( NQ, 1): transpose of the array VT * * Arrays V and H are replicated across all processor columns. * Arrays V^T and H^T are replicated across all processor rows. * * WORK ( InVT ), or V^T, is stored as a tall skinny * array ( NQ x ANB-1 ) for efficiency. Since only the lower * triangular portion of A is updated, Av is computed as: * tril(A) * v + v^T * tril(A,-1). This is performed as * two local triangular matrix-vector multiplications (both in * MVR2) followed by a transpose and a sum across the columns. * In the local computation, WORK( InVT ) is used to compute * tril(A) * v and WORK( InV ) is used to compute * v^T * tril(A,-1) * * The following variables are global indices into A: * INDEX: The current global row and column number. * MAXINDEX: The global row and column for the first row and * column in the trailing block of A. * LIIB, LIJB: The first row, column in * * The following variables point into the arrays A, V, H, V^T, H^T: * BINDEX =INDEX-MININDEX: The column index in V, H, V^T, H^T. * LII: local index I: The local row number for row INDEX * LIJ: local index J: The local column number for column INDEX * LIIP1: local index I+1: The local row number for row INDEX+1 * LIJP1: local index J+1: The local col number for col INDEX+1 * LTLI: lower triangular local index I: The local row for the * upper left entry in tril( A(INDEX, INDEX) ) * LTLIP1: lower triangular local index I+1: The local row for the * upper left entry in tril( A(INDEX+1, INDEX+1) ) * * Details: The distinction between LII and LTLI (and between * LIIP1 and LTLIP1) is subtle. Within the current processor * column (i.e. MYCOL .eq. CURCOL) they are the same. However, * on some processors, A( LII, LIJ ) points to an element * above the diagonal, on these processors, LTLI = LII+1. * * The following variables give the number of rows and/or columns * in various matrices: * NP: The number of local rows in A( 1:N, 1:N ) * NQ: The number of local columns in A( 1:N, 1:N ) * NPM0: The number of local rows in A( INDEX:N, INDEX:N ) * NQM0: The number of local columns in A( INDEX:N, INDEX:N ) * NPM1: The number of local rows in A( INDEX+1:N, INDEX:N ) * NQM1: The number of local columns in A( INDEX+1:N, INDEX:N ) * LTNM0: The number of local rows & columns in * tril( A( INDEX:N, INDEX:N ) ) * LTNM1: The number of local rows & columns in * tril( A( INDEX+1:N, INDEX+1:N ) ) * NOTE: LTNM0 == LTNM1 on all processors except the diagonal * processors, i.e. those where MYCOL == MYROW. * * Invariants: * NP = NPM0 + LII - 1 * NQ = NQM0 + LIJ - 1 * NP = NPM1 + LIIP1 - 1 * NQ = NQM1 + LIJP1 - 1 * NP = LTLI + LTNM0 - 1 * NP = LTLIP1 + LTNM1 - 1 * * Temporary variables. The following variables are used within * a few lines after they are set and do hold state from one loop * iteration to the next: * * The matrix A: * The matrix A does not hold the same values that it would * in an unblocked code nor the values that it would hold in * in a blocked code. * * The value of A is confusing. It is easiest to state the * difference between trueA and A at the point that MVR2 is called, * so we will start there. * * Let trueA be the value that A would * have at a given point in an unblocked code and A * be the value that A has in this code at the same point. * * At the time of the call to MVR2, * trueA = A + V' * H + H' * V * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ). * * At the bottom of the inner loop, * trueA = A + V' * H + H' * V + v' * h + h' * v * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ) and * v = V( liip1:N, BINDEX+1 ) and * h = H( liip1:N, BINDEX+1 ) * * At the top of the loop, BINDEX gets incremented, hence: * trueA = A + V' * H + H' * V + v' * h + h' * v * where H = H( MAXINDEX:N, 1:BINDEX-1 ) and * V = V( MAXINDEX:N, 1:BINDEX-1 ) and * v = V( liip1:N, BINDEX ) and * h = H( liip1:N, BINDEX ) * * * A gets updated at the bottom of the outer loop * After this update, trueA = A + v' * h + h' * v * where v = V( liip1:N, BINDEX ) and * h = H( liip1:N, BINDEX ) and BINDEX = 0 * Indeed, the previous loop invariant as stated above for the * top of the loop still holds, but with BINDEX = 0, H and V * are null matrices. * * After the current column of A is updated, * trueA( INDEX, INDEX:N ) = A( INDEX, INDEX:N ) * the rest of A is untouched. * * After the current block column of A is updated, * trueA = A + V' * H + H' * V * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ) * * This brings us back to the point at which mvr2 is called. * * * Details of the parallelization: * * We delay spreading v across to all processor columns (which * would naturally happen at the bottom of the loop) in order to * combine the spread of v( : , i-1 ) with the spread of h( : , i ) * * In order to compute h( :, i ), we must update A( :, i ) * which means that the processor column owning A( :, i ) must * have: c, tau, v( i, i ) and h( i, i ). * * The traditional * way of computing v (and the one used in pzlatrd.f and * zlatrd.f) is: * v = tau * v * c = v' * h * alpha = - tau * c / 2 * v = v + alpha * h * However, the traditional way of computing v requires that tau * be broadcast to all processors in the current column (to compute * v = tau * v) and then a sum-to-all is required (to * compute v' * h ). We use the following formula instead: * c = v' * h * v = tau * ( v - c * tau' * h / 2 ) * The above formula allows tau to be spread down in the * same call to SGSUM2D which performs the sum-to-all of c. * * The computation of v, which could be performed in any processor * column (or other procesor subsets), is performed in the * processor column that owns A( :, i+1 ) so that A( :, i+1 ) * can be updated prior to spreading v across. * * We keep the block column of A up-to-date to minimize the * work required in updating the current column of A. Updating * the block column of A is reasonably load balanced whereas * updating the current column of A is not (only the current * processor column is involved). * * In the following overview of the steps performed, M in the * margin indicates message traffic and C indicates O(n^2 nb/sqrt(p)) * or more flops per processor. * * Inner loop: * A( index:n, index ) -= ( v * ht(bindex) + h * vt( bindex) ) *M h = house( A(index:n, index) ) *M Spread v, h across *M vt = v^T; ht = h^T * A( index+1:n, index+1:maxindex ) -= * ( v * ht(index+1:maxindex) + h *vt(index+1:maxindex) ) *C v = tril(A) * h; vt = ht * tril(A,-1) *MorC v = v - H*V*h - V*H*h *M v = v + vt^T *M c = v' * h * v = tau * ( v - c * tau' * h / 2 ) *C A = A - H*V - V*H * * * * ================================================================= * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E0 ) COMPLEX Z_ONE, Z_NEGONE, Z_ZERO PARAMETER ( Z_ONE = 1.0E0, Z_NEGONE = -1.0E0, $ Z_ZERO = 0.0E0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * * * .. Local Scalars .. * * LOGICAL BALANCED, INTERLEAVE, TWOGEMMS, UPPER INTEGER ANB, BINDEX, CURCOL, CURROW, I, ICTXT, INDEX, $ INDEXA, INDEXINH, INDEXINV, INH, INHB, INHT, $ INHTB, INTMP, INV, INVB, INVT, INVTB, J, LDA, $ LDV, LDZG, LII, LIIB, LIIP1, LIJ, LIJB, LIJP1, $ LTLIP1, LTNM1, LWMIN, MAXINDEX, MININDEX, $ MYCOL, MYFIRSTROW, MYROW, MYSETNUM, NBZG, NP, $ NPB, NPCOL, NPM0, NPM1, NPROW, NPS, NPSET, NQ, $ NQB, NQM1, NUMROWS, NXTCOL, NXTROW, PBMAX, $ PBMIN, PBSIZE, PNB, ROWSPERPROC REAL NORM, SAFMAX, SAFMIN COMPLEX ALPHA, BETA, C, ONEOVERBETA, TOPH, TOPNV, $ TOPTAU, TOPV, TTOPH, TTOPV * .. * .. Local Arrays .. * * * * INTEGER IDUM1( 1 ), IDUM2( 1 ) REAL DTMP( 5 ) COMPLEX CC( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, CGEMM, CGEMV, $ CGERV2D, CGESD2D, CGSUM2D, CHK1MAT, CLAMOV, $ CSCAL, CTRMVT, PCHK1MAT, PSTREECOMB, PXERBLA, $ SCOMBNRM2, SGEBR2D, SGEBS2D, SGSUM2D * .. * .. External Functions .. * LOGICAL LSAME INTEGER ICEIL, NUMROC, PJLAENV REAL PSLAMCH, SCNRM2 EXTERNAL LSAME, ICEIL, NUMROC, PJLAENV, PSLAMCH, SCNRM2 * .. * .. Intrinsic Functions .. INTRINSIC AIMAG, CMPLX, CONJG, ICHAR, MAX, MIN, MOD, $ REAL, SIGN, SQRT * .. * * * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * * * Further details * =============== * * At the top of the loop, v and nh have been computed but not * spread across. Hence, A is out-of-date even after the * rank 2k update. Furthermore, we compute the next v before * nh is spread across. * * I claim that if we used a sum-to-all on NV, by summing CC within * each column, that we could compute NV locally and could avoid * spreading V across. Bruce claims that sum-to-all can be made * to cost no more than sum-to-one on the Paragon. If that is * true, this would be a win. But, * the BLACS sum-to-all is just a sum-to-one followed by a broadcast, * and hence the present scheme is better for now. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * SAFMAX = SQRT( PSLAMCH( ICTXT, 'O' ) ) / N SAFMIN = SQRT( PSLAMCH( ICTXT, 'S' ) ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE * * Here we set execution options for PCHETTRD * PNB = PJLAENV( ICTXT, 2, 'PCHETTRD', 'L', 0, 0, 0, 0 ) ANB = PJLAENV( ICTXT, 3, 'PCHETTRD', 'L', 0, 0, 0, 0 ) * INTERLEAVE = ( PJLAENV( ICTXT, 4, 'PCHETTRD', 'L', 1, 0, 0, $ 0 ).EQ.1 ) TWOGEMMS = ( PJLAENV( ICTXT, 4, 'PCHETTRD', 'L', 2, 0, 0, $ 0 ).EQ.1 ) BALANCED = ( PJLAENV( ICTXT, 4, 'PCHETTRD', 'L', 3, 0, 0, $ 0 ).EQ.1 ) * CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) * * UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 .AND. DESCA( NB_ ).NE.1 ) $ INFO = 600 + NB_ IF( INFO.EQ.0 ) THEN * * * Here is the arithmetic: * Let maxnpq = max( np, nq, 2 * ANB ) * LDV = 4 * max( np, nq ) + 2 * LWMIN = 2 * ( ANB + 1 ) * LDV + MAX( np, 2 * ANB ) * = 2 * ( ANB + 1 ) * ( 4 * NPS + 2 ) + NPS * * This overestimates memory requirements when ANB > NP/2 * Memory requirements are lower when interleave = .false. * Hence, we could have two sets of memory requirements, * one for interleave and one for * * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS * WORK( 1 ) = CMPLX( LWMIN ) IF( .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IA.NE.1 ) THEN INFO = -4 ELSE IF( JA.NE.1 ) THEN INFO = -5 ELSE IF( NPROW.NE.NPCOL ) THEN INFO = -( 600+CTXT_ ) ELSE IF( DESCA( DTYPE_ ).NE.1 ) THEN INFO = -( 600+DTYPE_ ) ELSE IF( DESCA( MB_ ).NE.1 ) THEN INFO = -( 600+MB_ ) ELSE IF( DESCA( NB_ ).NE.1 ) THEN INFO = -( 600+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.0 ) THEN INFO = -( 600+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.0 ) THEN INFO = -( 600+CSRC_ ) ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 * CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHETTRD', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * * Reduce the lower triangle of sub( A ) NP = NUMROC( N, 1, MYROW, 0, NPROW ) NQ = NUMROC( N, 1, MYCOL, 0, NPCOL ) * NXTROW = 0 NXTCOL = 0 * LIIP1 = 1 LIJP1 = 1 NPM1 = NP NQM1 = NQ * LDA = DESCA( LLD_ ) ICTXT = DESCA( CTXT_ ) * * * * Miscellaneous details: * Put tau, D and E in the right places * Check signs * Place all the arrays in WORK, control their placement * in memory. * * * * Loop invariants * A(LIIP1, LIJ) points to the first element of A(I+1,J) * NPM1,NQM1 = the number of rows, cols in A( LII+1:N,LIJ+1:N ) * A(LII:N,LIJ:N) is one step out of date. * proc( CURROW, CURCOL ) owns A(LII,LIJ) * proc( NXTROW, CURCOL ) owns A(LIIP1,LIJ) * INH = 1 * IF( INTERLEAVE ) THEN * * H and V are interleaved to minimize memory movement * LDV has to be twice as large to accomodate interleaving. * In addition, LDV is doubled again to allow v, h and * toptau to be spreaad across and transposed in a * single communication operation with minimum memory * movement. * * We could reduce LDV back to 2*MAX(NPM1,NQM1) * by increasing the memory movement required in * the spread and transpose of v, h and toptau. * However, since the non-interleaved path already * provides a mear minimum memory requirement option, * we did not provide this additional path. * LDV = 4*( MAX( NPM1, NQM1 ) ) + 2 * INH = 1 * INV = INH + LDV / 2 INVT = INH + ( ANB+1 )*LDV * INHT = INVT + LDV / 2 INTMP = INVT + LDV*( ANB+1 ) * ELSE LDV = MAX( NPM1, NQM1 ) * INHT = INH + LDV*( ANB+1 ) INV = INHT + LDV*( ANB+1 ) * * The code works without this +1, but only because of a * coincidence. Without the +1, WORK(INVT) gets trashed, but * WORK(INVT) is only used once and when it is used, it is * multiplied by WORK( INH ) which is zero. Hence, the fact * that WORK(INVT) is trashed has no effect. * INVT = INV + LDV*( ANB+1 ) + 1 INTMP = INVT + LDV*( 2*ANB ) * END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCHETTRD', -INFO ) WORK( 1 ) = CMPLX( LWMIN ) RETURN END IF * * * The satisfies the loop invariant: trueA = A - V * HT - H * VT, * (where V, H, VT and HT all have BINDEX+1 rows/columns) * the first ANB times through the loop. * * * * Setting either ( InH and InHT ) or InV to Z_ZERO * is adequate except in the face of NaNs. * * DO 10 I = 1, NP WORK( INH+I-1 ) = Z_ZERO WORK( INV+I-1 ) = Z_ZERO 10 CONTINUE DO 20 I = 1, NQ WORK( INHT+I-1 ) = Z_ZERO 20 CONTINUE * * * TOPNV = Z_ZERO * LTLIP1 = LIJP1 LTNM1 = NPM1 IF( MYCOL.GT.MYROW ) THEN LTLIP1 = LTLIP1 + 1 LTNM1 = LTNM1 - 1 END IF * * DO 210 MININDEX = 1, N - 1, ANB * * MAXINDEX = MIN( MININDEX+ANB-1, N ) LIJB = NUMROC( MAXINDEX, 1, MYCOL, 0, NPCOL ) + 1 LIIB = NUMROC( MAXINDEX, 1, MYROW, 0, NPROW ) + 1 * NQB = NQ - LIJB + 1 NPB = NP - LIIB + 1 INHTB = INHT + LIJB - 1 INVTB = INVT + LIJB - 1 INHB = INH + LIIB - 1 INVB = INV + LIIB - 1 * * * * DO 160 INDEX = MININDEX, MIN( MAXINDEX, N-1 ) * BINDEX = INDEX - MININDEX * CURROW = NXTROW CURCOL = NXTCOL * NXTROW = MOD( CURROW+1, NPROW ) NXTCOL = MOD( CURCOL+1, NPCOL ) * LII = LIIP1 LIJ = LIJP1 NPM0 = NPM1 * IF( MYROW.EQ.CURROW ) THEN NPM1 = NPM1 - 1 LIIP1 = LIIP1 + 1 END IF IF( MYCOL.EQ.CURCOL ) THEN NQM1 = NQM1 - 1 LIJP1 = LIJP1 + 1 LTLIP1 = LTLIP1 + 1 LTNM1 = LTNM1 - 1 END IF * * * * * V = NV, VT = NVT, H = NH, HT = NHT * * * Update the current column of A * * IF( MYCOL.EQ.CURCOL ) THEN * INDEXA = LII + ( LIJ-1 )*LDA INDEXINV = INV + LII - 1 + ( BINDEX-1 )*LDV INDEXINH = INH + LII - 1 + ( BINDEX-1 )*LDV TTOPH = CONJG( WORK( INHT+LIJ-1+BINDEX*LDV ) ) TTOPV = CONJG( TOPNV ) * IF( INDEX.GT.1 ) THEN DO 30 I = 0, NPM0 - 1 * A( INDEXA+I ) = A( INDEXA+I ) A( INDEXA+I ) = A( INDEXA+I ) - $ WORK( INDEXINV+LDV+I )*TTOPH - $ WORK( INDEXINH+LDV+I )*TTOPV 30 CONTINUE END IF * * END IF * * IF( MYCOL.EQ.CURCOL ) THEN * * Compute the householder vector * IF( MYROW.EQ.CURROW ) THEN DTMP( 2 ) = REAL( A( LII+( LIJ-1 )*LDA ) ) ELSE DTMP( 2 ) = ZERO END IF IF( MYROW.EQ.NXTROW ) THEN DTMP( 3 ) = REAL( A( LIIP1+( LIJ-1 )*LDA ) ) DTMP( 4 ) = AIMAG( A( LIIP1+( LIJ-1 )*LDA ) ) ELSE DTMP( 3 ) = ZERO DTMP( 4 ) = ZERO END IF * NORM = SCNRM2( NPM1, A( LIIP1+( LIJ-1 )*LDA ), 1 ) DTMP( 1 ) = NORM * * IF DTMP(5) = 1.0, NORM is too large and might cause * overflow, hence PSTREECOMB must be called. IF DTMP(5) * is zero on output, DTMP(1) can be trusted. * DTMP( 5 ) = ZERO IF( DTMP( 1 ).GE.SAFMAX .OR. DTMP( 1 ).LT.SAFMIN ) THEN DTMP( 5 ) = ONE DTMP( 1 ) = ZERO END IF * DTMP( 1 ) = DTMP( 1 )*DTMP( 1 ) CALL SGSUM2D( ICTXT, 'C', ' ', 5, 1, DTMP, 5, -1, $ CURCOL ) IF( DTMP( 5 ).EQ.ZERO ) THEN DTMP( 1 ) = SQRT( DTMP( 1 ) ) ELSE DTMP( 1 ) = NORM CALL PSTREECOMB( ICTXT, 'C', 1, DTMP, -1, MYCOL, $ SCOMBNRM2 ) END IF * NORM = DTMP( 1 ) * D( LIJ ) = DTMP( 2 ) IF( MYROW.EQ.CURROW .AND. MYCOL.EQ.CURCOL ) THEN A( LII+( LIJ-1 )*LDA ) = CMPLX( D( LIJ ), ZERO ) END IF * * ALPHA = CMPLX( DTMP( 3 ), DTMP( 4 ) ) * NORM = SIGN( NORM, REAL( ALPHA ) ) * IF( NORM.EQ.ZERO ) THEN TOPTAU = ZERO ELSE BETA = NORM + ALPHA TOPTAU = BETA / NORM ONEOVERBETA = 1.0E0 / BETA * CALL CSCAL( NPM1, ONEOVERBETA, $ A( LIIP1+( LIJ-1 )*LDA ), 1 ) END IF * IF( MYROW.EQ.NXTROW ) THEN A( LIIP1+( LIJ-1 )*LDA ) = Z_ONE END IF * TAU( LIJ ) = TOPTAU E( LIJ ) = -NORM * END IF * * * Spread v, nh, toptau across * DO 40 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+I ) = A( LIIP1+I+ $ ( LIJ-1 )*LDA ) 40 CONTINUE * IF( MYCOL.EQ.CURCOL ) THEN WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 ) = TOPTAU CALL CGEBS2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), $ NPM1+NPM1+1 ) ELSE CALL CGEBR2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), $ NPM1+NPM1+1, MYROW, CURCOL ) TOPTAU = WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 ) END IF DO 50 I = 0, NPM1 - 1 WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1- $ 1+BINDEX*LDV+NPM1+I ) 50 CONTINUE * IF( INDEX.LT.N ) THEN IF( MYROW.EQ.NXTROW .AND. MYCOL.EQ.CURCOL ) $ A( LIIP1+( LIJ-1 )*LDA ) = E( LIJ ) END IF * * Transpose v, nh * * IF( MYROW.EQ.MYCOL ) THEN DO 60 I = 0, NPM1 + NPM1 WORK( INVT+LIJP1-1+BINDEX*LDV+I ) = WORK( INV+LIIP1-1+ $ BINDEX*LDV+I ) 60 CONTINUE ELSE CALL CGESD2D( ICTXT, NPM1+NPM1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), NPM1+NPM1, $ MYCOL, MYROW ) CALL CGERV2D( ICTXT, NQM1+NQM1, 1, $ WORK( INVT+LIJP1-1+BINDEX*LDV ), NQM1+NQM1, $ MYCOL, MYROW ) END IF * DO 70 I = 0, NQM1 - 1 WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV+I ) = WORK( INVT+ $ LIJP1-1+BINDEX*LDV+NQM1+I ) 70 CONTINUE * * * Update the current block column of A * IF( INDEX.GT.1 ) THEN DO 90 J = LIJP1, LIJB - 1 DO 80 I = 0, NPM1 - 1 * A( LIIP1+I+( J-1 )*LDA ) = A( LIIP1+I+( J-1 )*LDA ) $ - WORK( INV+LIIP1-1+BINDEX*LDV+I )* $ CONJG( WORK( INHT+J-1+BINDEX*LDV ) ) - $ WORK( INH+LIIP1-1+BINDEX*LDV+I )* $ CONJG( WORK( INVT+J-1+BINDEX*LDV ) ) 80 CONTINUE 90 CONTINUE END IF * * * * Compute NV = A * NHT; NVT = A * NH * * These two lines are necessary because these elements * are not always involved in the calls to CTRMVT * for two reasons: * 1) On diagonal processors, the call to TRMVT * involves only LTNM1-1 elements * 2) On some processes, NQM1 < LTM1 or LIIP1 < LTLIP1 * and when the results are combined across all processes, * uninitialized values may be included. WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ) = Z_ZERO WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+NQM1-1 ) = Z_ZERO * * IF( MYROW.EQ.MYCOL ) THEN IF( LTNM1.GT.1 ) THEN CALL CTRMVT( 'L', LTNM1-1, $ A( LTLIP1+1+( LIJP1-1 )*LDA ), LDA, $ WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ), 1, $ WORK( INH+LTLIP1+1-1+( BINDEX+1 )*LDV ), $ 1, WORK( INV+LTLIP1+1-1+( BINDEX+1 )* $ LDV ), 1, WORK( INHT+LIJP1-1+( BINDEX+ $ 1 )*LDV ), 1 ) END IF DO 100 I = 1, LTNM1 WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV ) $ = WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV ) + $ A( LTLIP1+I-1+( LIJP1+I-1-1 )*LDA )* $ WORK( INH+LTLIP1+I-1-1+( BINDEX+1 )*LDV ) 100 CONTINUE ELSE IF( LTNM1.GT.0 ) $ CALL CTRMVT( 'L', LTNM1, A( LTLIP1+( LIJP1-1 )*LDA ), $ LDA, WORK( INVT+LIJP1-1+( BINDEX+1 )* $ LDV ), 1, WORK( INH+LTLIP1-1+( BINDEX+ $ 1 )*LDV ), 1, WORK( INV+LTLIP1-1+ $ ( BINDEX+1 )*LDV ), 1, $ WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV ), $ 1 ) * END IF * * * We take advantage of the fact that: * A * sum( B ) = sum ( A * B ) for matrices A,B * * trueA = A + V * HT + H * VT * hence: (trueA)v = Av' + V * HT * v + H * VT * v * VT * v = sum_p_in_NPROW ( VTp * v ) * H * VT * v = H * sum (VTp * v) = sum ( H * VTp * v ) * * v = v + V * HT * h + H * VT * h * * * * tmp = HT * nh1 DO 110 I = 1, 2*( BINDEX+1 ) WORK( INTMP-1+I ) = 0 110 CONTINUE * IF( BALANCED ) THEN NPSET = NPROW MYSETNUM = MYROW ROWSPERPROC = ICEIL( NQB, NPSET ) MYFIRSTROW = MIN( NQB+1, 1+ROWSPERPROC*MYSETNUM ) NUMROWS = MIN( ROWSPERPROC, NQB-MYFIRSTROW+1 ) * * * tmp = HT * v * CALL CGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE, $ WORK( INHTB+MYFIRSTROW-1 ), LDV, $ WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ), $ 1, Z_ZERO, WORK( INTMP ), 1 ) * tmp2 = VT * v CALL CGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE, $ WORK( INVTB+MYFIRSTROW-1 ), LDV, $ WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ), $ 1, Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 ) * * CALL CGSUM2D( ICTXT, 'C', ' ', 2*( BINDEX+1 ), 1, $ WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 ) ELSE * tmp = HT * v * CALL CGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INHTB ), $ LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1, $ Z_ZERO, WORK( INTMP ), 1 ) * tmp2 = VT * v CALL CGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INVTB ), $ LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1, $ Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 ) * END IF * * * IF( BALANCED ) THEN MYSETNUM = MYCOL * ROWSPERPROC = ICEIL( NPB, NPSET ) MYFIRSTROW = MIN( NPB+1, 1+ROWSPERPROC*MYSETNUM ) NUMROWS = MIN( ROWSPERPROC, NPB-MYFIRSTROW+1 ) * CALL CGSUM2D( ICTXT, 'R', ' ', 2*( BINDEX+1 ), 1, $ WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 ) * * * v = v + V * tmp IF( INDEX.GT.1. ) THEN CALL CGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE, $ WORK( INVB+MYFIRSTROW-1 ), LDV, $ WORK( INTMP ), 1, Z_ONE, $ WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )* $ LDV ), 1 ) * * v = v + H * tmp2 CALL CGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE, $ WORK( INHB+MYFIRSTROW-1 ), LDV, $ WORK( INTMP+BINDEX+1 ), 1, Z_ONE, $ WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )* $ LDV ), 1 ) END IF * ELSE * v = v + V * tmp CALL CGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INVB ), $ LDV, WORK( INTMP ), 1, Z_ONE, $ WORK( INVB+( BINDEX+1 )*LDV ), 1 ) * * * v = v + H * tmp2 CALL CGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INHB ), $ LDV, WORK( INTMP+BINDEX+1 ), 1, Z_ONE, $ WORK( INVB+( BINDEX+1 )*LDV ), 1 ) * END IF * * * Transpose NV and add it back into NVT * IF( MYROW.EQ.MYCOL ) THEN DO 120 I = 0, NQM1 - 1 WORK( INTMP+I ) = WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+ $ I ) 120 CONTINUE ELSE CALL CGESD2D( ICTXT, NQM1, 1, $ WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ), $ NQM1, MYCOL, MYROW ) CALL CGERV2D( ICTXT, NPM1, 1, WORK( INTMP ), NPM1, MYCOL, $ MYROW ) * END IF DO 130 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1- $ 1+( BINDEX+1 )*LDV+I ) + WORK( INTMP+I ) 130 CONTINUE * * Sum-to-one NV rowwise (within a row) * CALL CGSUM2D( ICTXT, 'R', ' ', NPM1, 1, $ WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ), NPM1, $ MYROW, NXTCOL ) * * * Dot product c = NV * NH * Sum-to-all c within next processor column * * IF( MYCOL.EQ.NXTCOL ) THEN CC( 1 ) = Z_ZERO DO 140 I = 0, NPM1 - 1 CC( 1 ) = CC( 1 ) + CONJG( WORK( INV+LIIP1-1+( BINDEX+ $ 1 )*LDV+I ) )*WORK( INH+LIIP1-1+ $ ( BINDEX+1 )*LDV+I ) 140 CONTINUE IF( MYROW.EQ.NXTROW ) THEN CC( 2 ) = WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ) CC( 3 ) = WORK( INH+LIIP1-1+( BINDEX+1 )*LDV ) ELSE CC( 2 ) = Z_ZERO CC( 3 ) = Z_ZERO END IF CALL CGSUM2D( ICTXT, 'C', ' ', 3, 1, CC, 3, -1, NXTCOL ) * TOPV = CC( 2 ) C = CC( 1 ) TOPH = CC( 3 ) * TOPNV = TOPTAU*( TOPV-C*CONJG( TOPTAU ) / 2*TOPH ) * * * Compute V = Tau * (V - C * Tau' / 2 * H ) * * DO 150 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = TOPTAU* $ ( WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I )-C* $ CONJG( TOPTAU ) / 2*WORK( INH+LIIP1-1+( BINDEX+1 )* $ LDV+I ) ) 150 CONTINUE * END IF * * 160 CONTINUE * * * Perform the rank2k update * IF( MAXINDEX.LT.N ) THEN * DO 170 I = 0, NPM1 - 1 WORK( INTMP+I ) = WORK( INH+LIIP1-1+ANB*LDV+I ) 170 CONTINUE * * * IF( .NOT.TWOGEMMS ) THEN IF( INTERLEAVE ) THEN LDZG = LDV / 2 ELSE CALL CLAMOV( 'A', LTNM1, ANB, WORK( INHT+LIJP1-1 ), $ LDV, WORK( INVT+LIJP1-1+ANB*LDV ), LDV ) * CALL CLAMOV( 'A', LTNM1, ANB, WORK( INV+LTLIP1-1 ), $ LDV, WORK( INH+LTLIP1-1+ANB*LDV ), LDV ) LDZG = LDV END IF NBZG = ANB*2 ELSE LDZG = LDV NBZG = ANB END IF * * DO 180 PBMIN = 1, LTNM1, PNB * PBSIZE = MIN( PNB, LTNM1-PBMIN+1 ) PBMAX = MIN( LTNM1, PBMIN+PNB-1 ) CALL CGEMM( 'N', 'C', PBSIZE, PBMAX, NBZG, Z_NEGONE, $ WORK( INH+LTLIP1-1+PBMIN-1 ), LDZG, $ WORK( INVT+LIJP1-1 ), LDZG, Z_ONE, $ A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA ) IF( TWOGEMMS ) THEN CALL CGEMM( 'N', 'C', PBSIZE, PBMAX, ANB, Z_NEGONE, $ WORK( INV+LTLIP1-1+PBMIN-1 ), LDZG, $ WORK( INHT+LIJP1-1 ), LDZG, Z_ONE, $ A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA ) END IF 180 CONTINUE * * * DO 190 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+I ) = WORK( INV+LIIP1-1+ANB*LDV+I ) WORK( INH+LIIP1-1+I ) = WORK( INTMP+I ) 190 CONTINUE DO 200 I = 0, NQM1 - 1 WORK( INHT+LIJP1-1+I ) = WORK( INHT+LIJP1-1+ANB*LDV+I ) 200 CONTINUE * * END IF * * End of the update A code * 210 CONTINUE * IF( MYCOL.EQ.NXTCOL ) THEN IF( MYROW.EQ.NXTROW ) THEN * D( NQ ) = REAL( A( NP+( NQ-1 )*LDA ) ) A( NP+( NQ-1 )*LDA ) = D( NQ ) * CALL SGEBS2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1 ) ELSE CALL SGEBR2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1, NXTROW, $ NXTCOL ) END IF END IF * * * * WORK( 1 ) = CMPLX( LWMIN ) RETURN * * End of PCHETTRD * * END scalapack-2.0.2/SRC/pclabrd.f000644 000766 000024 00000055346 10363532303 016131 0ustar00juliestaff000000 000000 SUBROUTINE PCLABRD( M, N, NB, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ X, IX, JX, DESCX, Y, IY, JY, DESCY, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IX, IY, JA, JX, JY, M, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL D( * ), E( * ) COMPLEX A( * ), TAUP( * ), TAUQ( * ), X( * ), Y( * ), $ WORK( * ) * .. * * Purpose * ======= * * PCLABRD reduces the first NB rows and columns of a complex general * M-by-N distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper * or lower bidiagonal form by an unitary transformation Q' * A * P, and * returns the matrices X and Y which are needed to apply the transfor- * mation to the unreduced part of sub( A ). * * If M >= N, sub( A ) is reduced to upper bidiagonal form; if M < N, to * lower bidiagonal form. * * This is an auxiliary routine called by PCGEBRD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NB (global input) INTEGER * The number of leading rows and columns of sub( A ) to be * reduced. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ) to be reduced. On exit, * the first NB rows and columns of the matrix are overwritten; * the rest of the distributed matrix sub( A ) is unchanged. * If m >= n, elements on and below the diagonal in the first NB * columns, with the array TAUQ, represent the unitary * matrix Q as a product of elementary reflectors; and * elements above the diagonal in the first NB rows, with the * array TAUP, represent the unitary matrix P as a product * of elementary reflectors. * If m < n, elements below the diagonal in the first NB * columns, with the array TAUQ, represent the unitary * matrix Q as a product of elementary reflectors, and * elements on and above the diagonal in the first NB rows, * with the array TAUP, represent the unitary matrix P as * a product of elementary reflectors. * See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(ia+i-1,ja+i-1). D is tied to the distributed * matrix A. * * E (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(ia+i-1,ja+i) for i = 1,2,...,n-1; * if m < n, E(i) = A(ia+i,ja+i-1) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) COMPLEX array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUQ is * tied to the distributed matrix A. See Further Details. * * TAUP (local output) COMPLEX array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix P. TAUP is * tied to the distributed matrix A. See Further Details. * * X (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_X,NB). On exit, the local * pieces of the distributed M-by-NB matrix * X(IX:IX+M-1,JX:JX+NB-1) required to update the unreduced * part of sub( A ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * Y (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_Y,NB). On exit, the local * pieces of the distributed N-by-NB matrix * Y(IY:IY+N-1,JY:JY+NB-1) required to update the unreduced * part of sub( A ). * * IY (global input) INTEGER * The row index in the global array Y indicating the first * row of sub( Y ). * * JY (global input) INTEGER * The column index in the global array Y indicating the * first column of sub( Y ). * * DESCY (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Y. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK >= NB_A + NQ, with * * NQ = NUMROC( N+MOD( IA-1, NB_Y ), NB_Y, MYCOL, IACOL, NPCOL ) * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors. * * If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in * A(ia+i-1:ia+m-1,ja+i-1); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is * stored on exit in A(ia+i-1,ja+i:ja+n-1); tauq is stored in * TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); u(1:i-1) = 0, u(i) = 1, and u(i:n) is * stored on exit in A(ia+i-1,ja+i:ja+n-1); tauq is stored in * TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The elements of the vectors v and u together form the m-by-nb matrix * V and the nb-by-n matrix U' which are needed, with X and Y, to apply * the transformation to the unreduced part of the matrix, using a block * update of the form: sub( A ) := sub( A ) - V*Y' - X*U'. * * The contents of sub( A ) on exit are illustrated by the following * examples with nb = 2: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) * ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) * ( v1 v2 a a a ) ( v1 1 a a a a ) * ( v1 v2 a a a ) ( v1 v2 a a a a ) * ( v1 v2 a a a ) ( v1 v2 a a a a ) * ( v1 v2 a a a ) * * where a denotes an element of the original matrix which is unchanged, * vi denotes an element of the vector defining H(i), and ui an element * of the vector defining G(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, IPY, IW, J, JJ, $ JWY, K, MYCOL, MYROW, NPCOL, NPROW COMPLEX ALPHA, TAU INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), $ DESCTP( DLEN_ ), DESCTQ( DLEN_ ), $ DESCW( DLEN_ ), DESCWY( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCCOPY, $ PCELGET, PCELSET, PCGEMV, PCLACGV, $ PCLARFG, PCSCAL, PSELSET * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) IPY = DESCA( MB_ ) + 1 IW = MOD( IA-1, DESCA( NB_ ) ) + 1 ALPHA = ZERO * CALL DESCSET( DESCWY, 1, N+MOD( IA-1, DESCY( NB_ ) ), 1, $ DESCA( NB_ ), IAROW, IACOL, ICTXT, 1 ) CALL DESCSET( DESCW, DESCA( MB_ ), 1, DESCA( MB_ ), 1, IAROW, $ IACOL, ICTXT, DESCA( MB_ ) ) CALL DESCSET( DESCTQ, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), IAROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCTP, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), IACOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) * IF( M.GE.N ) THEN * * Reduce to upper bidiagonal form * CALL DESCSET( DESCD, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) DO 10 K = 1, NB I = IA + K - 1 J = JA + K - 1 JWY = IW + K * * Update A(i:ia+m-1,j) * IF( K.GT.1 ) THEN CALL PCGEMV( 'No transpose', M-K+1, K-1, -ONE, A, I, JA, $ DESCA, Y, IY, JY+K-1, DESCY, 1, ONE, A, I, $ J, DESCA, 1 ) CALL PCGEMV( 'No transpose', M-K+1, K-1, -ONE, X, IX+K-1, $ JX, DESCX, A, IA, J, DESCA, 1, ONE, A, I, J, $ DESCA, 1 ) CALL PCELSET( A, I-1, J, DESCA, ALPHA ) END IF * * Generate reflection Q(i) to annihilate A(i+1:ia+m-1,j) * CALL PCLARFG( M-K+1, ALPHA, I, J, A, I+1, J, DESCA, 1, $ TAUQ ) CALL PSELSET( D, 1, J, DESCD, REAL( ALPHA ) ) CALL PCELSET( A, I, J, DESCA, ONE ) * * Compute Y(IA+I:IA+N-1,J) * CALL PCGEMV( 'Conjugate transpose', M-K+1, N-K, ONE, A, I, $ J+1, DESCA, A, I, J, DESCA, 1, ZERO, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PCGEMV( 'Conjugate transpose', M-K+1, K-1, ONE, A, I, $ JA, DESCA, A, I, J, DESCA, 1, ZERO, WORK, IW, $ 1, DESCW, 1 ) CALL PCGEMV( 'Conjugate transpose', K-1, N-K, -ONE, Y, IY, $ JY+K, DESCY, WORK, IW, 1, DESCW, 1, ONE, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PCGEMV( 'Conjugate transpose', M-K+1, K-1, ONE, X, $ IX+K-1, JX, DESCX, A, I, J, DESCA, 1, ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PCGEMV( 'Conjugate transpose', K-1, N-K, -ONE, A, IA, $ J+1, DESCA, WORK, IW, 1, DESCW, 1, ONE, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) * CALL PCELGET( 'Rowwise', ' ', TAU, TAUQ, 1, J, DESCTQ ) CALL PCSCAL( N-K, TAU, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PCLACGV( N-K, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PCCOPY( N-K, WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ), $ Y, IY+K-1, JY+K, DESCY, DESCY( M_ ) ) * * Update A(i,j+1:ja+n-1) * CALL PCLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PCLACGV( K, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCGEMV( 'Conjugate transpose', K, N-K, -ONE, Y, IY, $ JY+K, DESCY, A, I, JA, DESCA, DESCA( M_ ), ONE, $ A, I, J+1, DESCA, DESCA( M_ ) ) CALL PCLACGV( K, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCLACGV( K-1, X, IX+K-1, JX, DESCX, DESCX( M_ ) ) CALL PCGEMV( 'Conjugate transpose', K-1, N-K, -ONE, A, IA, $ J+1, DESCA, X, IX+K-1, JX, DESCX, DESCX( M_ ), $ ONE, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PCLACGV( K-1, X, IX+K-1, JX, DESCX, DESCX( M_ ) ) CALL PCELSET( A, I, J, DESCA, CMPLX( REAL( ALPHA ) ) ) * * Generate reflection P(i) to annihilate A(i,j+2:ja+n-1) * CALL PCLARFG( N-K, ALPHA, I, J+1, A, I, $ MIN( J+2, N+JA-1 ), DESCA, DESCA( M_ ), TAUP ) CALL PSELSET( E, I, 1, DESCE, REAL( ALPHA ) ) CALL PCELSET( A, I, J+1, DESCA, ONE ) * * Compute X(I+1:IA+M-1,J) * CALL PCGEMV( 'No transpose', M-K, N-K, ONE, A, I+1, J+1, $ DESCA, A, I, J+1, DESCA, DESCA( M_ ), ZERO, X, $ IX+K, JX+K-1, DESCX, 1 ) CALL PCGEMV( 'No transpose', K, N-K, ONE, Y, IY, JY+K, $ DESCY, A, I, J+1, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PCGEMV( 'No transpose', M-K, K, -ONE, A, I+1, JA, $ DESCA, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) CALL PCGEMV( 'No transpose', K-1, N-K, ONE, A, IA, J+1, $ DESCA, A, I, J+1, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PCGEMV( 'No transpose', M-K, K-1, -ONE, X, IX+K, JX, $ DESCX, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) * CALL PCELGET( 'Columnwise', ' ', TAU, TAUP, I, 1, DESCTP ) CALL PCSCAL( M-K, TAU, X, IX+K, JX+K-1, DESCX, 1 ) CALL PCLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) 10 CONTINUE * ELSE * * Reduce to lower bidiagonal form * CALL DESCSET( DESCD, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) DO 20 K = 1, NB I = IA + K - 1 J = JA + K - 1 JWY = IW + K * * Update A(i,j:ja+n-1) * CALL PCLACGV( N-K+1, A, I, J, DESCA, DESCA( M_ ) ) IF( K.GT.1 ) THEN CALL PCLACGV( K-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCGEMV( 'Conjugate transpose', K-1, N-K+1, -ONE, Y, $ IY, JY+K-1, DESCY, A, I, JA, DESCA, $ DESCA( M_ ), ONE, A, I, J, DESCA, $ DESCA( M_ ) ) CALL PCLACGV( K-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCLACGV( K-1, X, IX+K-1, JX, DESCX, DESCX( M_ ) ) CALL PCGEMV( 'Conjugate transpose', K-1, N-K+1, -ONE, A, $ IA, J, DESCA, X, IX+K-1, JX, DESCX, $ DESCX( M_ ), ONE, A, I, J, DESCA, $ DESCA( M_ ) ) CALL PCLACGV( K-1, X, IX+K-1, JX, DESCX, DESCX( M_ ) ) CALL PCELSET( A, I, J-1, DESCA, CMPLX( REAL( ALPHA ) ) ) END IF * * Generate reflection P(i) to annihilate A(i,j+1:ja+n-1) * CALL PCLARFG( N-K+1, ALPHA, I, J, A, I, J+1, DESCA, $ DESCA( M_ ), TAUP ) CALL PSELSET( D, I, 1, DESCD, REAL( ALPHA ) ) CALL PCELSET( A, I, J, DESCA, ONE ) * * Compute X(i+1:ia+m-1,j) * CALL PCGEMV( 'No transpose', M-K, N-K+1, ONE, A, I+1, J, $ DESCA, A, I, J, DESCA, DESCA( M_ ), ZERO, X, $ IX+K, JX+K-1, DESCX, 1 ) CALL PCGEMV( 'No transpose', K-1, N-K+1, ONE, Y, IY, JY+K-1, $ DESCY, A, I, J, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PCGEMV( 'No transpose', M-K, K-1, -ONE, A, I+1, JA, $ DESCA, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) CALL PCGEMV( 'No transpose', K-1, N-K+1, ONE, A, IA, J, $ DESCA, A, I, J, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PCGEMV( 'No transpose', M-K, K-1, -ONE, X, IX+K, JX, $ DESCX, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) * CALL PCELGET( 'Columnwise', ' ', TAU, TAUP, I, 1, DESCTP ) CALL PCSCAL( M-K, TAU, X, IX+K, JX+K-1, DESCX, 1 ) CALL PCLACGV( N-K+1, A, I, J, DESCA, DESCA( M_ ) ) * * Update A(i+1:ia+m-1,j) * CALL PCGEMV( 'No transpose', M-K, K-1, -ONE, A, I+1, JA, $ DESCA, Y, IY, JY+K-1, DESCY, 1, ONE, A, I+1, J, $ DESCA, 1 ) CALL PCGEMV( 'No transpose', M-K, K, -ONE, X, IX+K, JX, $ DESCX, A, IA, J, DESCA, 1, ONE, A, I+1, J, $ DESCA, 1 ) CALL PCELSET( A, I, J, DESCA, ALPHA ) * * Generate reflection Q(i) to annihilate A(i+2:ia+m-1,j) * CALL PCLARFG( M-K, ALPHA, I+1, J, A, MIN( I+2, M+IA-1 ), $ J, DESCA, 1, TAUQ ) CALL PSELSET( E, 1, J, DESCE, REAL( ALPHA ) ) CALL PCELSET( A, I+1, J, DESCA, ONE ) * * Compute Y(ia+i:ia+n-1,j) * CALL PCGEMV( 'Conjugate transpose', M-K, N-K, ONE, A, I+1, $ J+1, DESCA, A, I+1, J, DESCA, 1, ZERO, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PCGEMV( 'Conjugate transpose', M-K, K-1, ONE, A, I+1, $ JA, DESCA, A, I+1, J, DESCA, 1, ZERO, WORK, IW, $ 1, DESCW, 1 ) CALL PCGEMV( 'Conjugate transpose', K-1, N-K, -ONE, Y, IY, $ JY+K, DESCY, WORK, IW, 1, DESCW, 1, ONE, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PCGEMV( 'Conjugate transpose', M-K, K, ONE, X, IX+K, $ JX, DESCX, A, I+1, J, DESCA, 1, ZERO, WORK, IW, $ 1, DESCW, 1 ) CALL PCGEMV( 'Conjugate transpose', K, N-K, -ONE, A, IA, $ J+1, DESCA, WORK, IW, 1, DESCW, 1, ONE, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) * CALL PCELGET( 'Rowwise', ' ', TAU, TAUQ, 1, J, DESCTQ ) CALL PCSCAL( N-K, TAU, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PCLACGV( N-K, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PCCOPY( N-K, WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ), $ Y, IY+K-1, JY+K, DESCY, DESCY( M_ ) ) 20 CONTINUE END IF * RETURN * * End of PCLABRD * END scalapack-2.0.2/SRC/pclacgv.f000644 000766 000024 00000015404 10363532303 016130 0ustar00juliestaff000000 000000 SUBROUTINE PCLACGV( N, X, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX X( * ) * .. * * Purpose * ======= * * PCLACGV conjugates a complex vector of length N, sub( X ), where * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = DESCX( M_ ) and * X(IX:IX+N-1,JX) if INCX = 1, and * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vector sub( X ). * * X (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_X,*). * On entry the vector to be conjugated * x( i ) = X(IX+(JX-1)*M_X +(i-1)*INCX ), 1 <= i <= N. * On exit the conjugated vector. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICOFFX, ICTXT, IIX, IOFFX, IROFFX, IXCOL, $ IXROW, JJX, LDX, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) * LDX = DESCX( LLD_ ) IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is rowwise distributed. * IF( MYROW.NE.IXROW ) $ RETURN ICOFFX = MOD( JX-1, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFFX, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ - ICOFFX * IF( NQ.GT.0 ) THEN IOFFX = IIX+(JJX-1)*LDX DO 10 I = 1, NQ X( IOFFX ) = CONJG( X( IOFFX ) ) IOFFX = IOFFX + LDX 10 CONTINUE END IF * ELSE IF( INCX.EQ.1 ) THEN * * sub( X ) is columnwise distributed. * IF( MYCOL.NE.IXCOL ) $ RETURN IROFFX = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFFX, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFFX * IF( NP.GT.0 ) THEN IOFFX = IIX+(JJX-1)*LDX DO 20 I = IOFFX, IOFFX+NP-1 X( I ) = CONJG( X( I ) ) 20 CONTINUE END IF * END IF * RETURN * * End of PCLACGV * END scalapack-2.0.2/SRC/pclacon.f000644 000766 000024 00000032340 10363532303 016126 0ustar00juliestaff000000 000000 SUBROUTINE PCLACON( N, V, IV, JV, DESCV, X, IX, JX, DESCX, EST, $ KASE ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IV, IX, JV, JX, KASE, N REAL EST * .. * .. Array Arguments .. INTEGER DESCV( * ), DESCX( * ) COMPLEX V( * ), X( * ) * .. * * Purpose * ======= * * PCLACON estimates the 1-norm of a square, complex distributed matrix * A. Reverse communication is used for evaluating matrix-vector * products. X and V are aligned with the distributed matrix A, this * information is implicitly contained within IV, IX, DESCV, and DESCX. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vectors V and X. N >= 0. * * V (local workspace) COMPLEX pointer into the local * memory to an array of dimension LOCr(N+MOD(IV-1,MB_V)). On * the final return, V = A*W, where EST = norm(V)/norm(W) * (W is not returned). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * X (local input/local output) COMPLEX pointer into the * local memory to an array of dimension * LOCr(N+MOD(IX-1,MB_X)). On an intermediate return, X * should be overwritten by * A * X, if KASE=1, * A' * X, if KASE=2, * where A' is the conjugate transpose of A, and PCLACON must * be re-called with all the other parameters unchanged. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * * EST (global output) REAL * An estimate (a lower bound) for norm(A). * * KASE (local input/local output) INTEGER * On the initial call to PCLACON, KASE should be 0. * On an intermediate return, KASE will be 1 or 2, indicating * whether X should be overwritten by A * X or A' * X. * On the final return from PCLACON, KASE will again be 0. * * Further Details * =============== * * The serial version CLACON has been contributed by Nick Higham, * University of Manchester. It was originally named SONEST, dated * March 16, 1988. * * Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of * a real or complex matrix, with applications to condition estimation", * ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, ICTXT, IIVX, IMAXROW, IOFFVX, IROFF, ITER, $ IVXCOL, IVXROW, J, JLAST, JJVX, JUMP, K, $ MYCOL, MYROW, NP, NPCOL, NPROW REAL ALTSGN, ESTOLD, SAFMIN, TEMP COMPLEX JLMAX, XMAX * .. * .. Local Arrays .. COMPLEX WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CCOPY, CGEBR2D, CGEBS2D, $ INFOG2L, PCELGET, PCMAX1, $ PSCSUM1, SGEBR2D, SGEBS2D * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, INDXL2G, NUMROC REAL PSLAMCH EXTERNAL INDXG2L, INDXG2P, INDXL2G, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, REAL * .. * .. Save statement .. SAVE * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIVX, JJVX, IVXROW, IVXCOL ) IF( MYCOL.NE.IVXCOL ) $ RETURN IROFF = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IVXROW, NPROW ) IF( MYROW.EQ.IVXROW ) $ NP = NP - IROFF IOFFVX = IIVX + (JJVX-1)*DESCX( LLD_ ) * SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) IF( KASE.EQ.0 ) THEN DO 10 I = IOFFVX, IOFFVX+NP-1 X( I ) = CMPLX( ONE / REAL( N ) ) 10 CONTINUE KASE = 1 JUMP = 1 RETURN END IF * GO TO ( 20, 40, 70, 90, 120 )JUMP * * ................ ENTRY (JUMP = 1) * FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X * 20 CONTINUE IF( N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN V( IOFFVX ) = X( IOFFVX ) EST = ABS( V( IOFFVX ) ) CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF * ... QUIT GO TO 130 END IF CALL PSCSUM1( N, EST, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF END IF * DO 30 I = IOFFVX, IOFFVX+NP-1 IF( ABS( X( I ) ).GT.SAFMIN ) THEN X( I ) = X( I ) / CMPLX( ABS( X( I ) ) ) ELSE X( I ) = CONE END IF 30 CONTINUE KASE = 2 JUMP = 2 RETURN * * ................ ENTRY (JUMP = 2) * FIRST ITERATION. X HAS BEEN OVERWRITTEN BY CTRANS(A)*X * 40 CONTINUE CALL PCMAX1( N, XMAX, J, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN WORK( 1 ) = XMAX WORK( 2 ) = CMPLX( REAL( J ) ) CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2 ) ELSE CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2, $ IVXROW, MYCOL ) XMAX = WORK( 1 ) J = NINT( REAL( WORK( 2 ) ) ) END IF END IF ITER = 2 * * MAIN LOOP - ITERATIONS 2, 3,...,ITMAX * 50 CONTINUE DO 60 I = IOFFVX, IOFFVX+NP-1 X( I ) = CZERO 60 CONTINUE IMAXROW = INDXG2P( J, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) IF( MYROW.EQ.IMAXROW ) THEN I = INDXG2L( J, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) X( I ) = CONE END IF KASE = 1 JUMP = 3 RETURN * * ................ ENTRY (JUMP = 3) * X HAS BEEN OVERWRITTEN BY A*X * 70 CONTINUE CALL CCOPY( NP, X( IOFFVX ), 1, V( IOFFVX ), 1 ) ESTOLD = EST CALL PSCSUM1( N, EST, V, IV, JV, DESCV, 1 ) IF( DESCV( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF END IF * * TEST FOR CYCLING IF( EST.LE.ESTOLD ) $ GO TO 100 * DO 80 I = IOFFVX, IOFFVX+NP-1 IF( ABS( X( I ) ).GT.SAFMIN ) THEN X( I ) = X( I ) / CMPLX( ABS( X( I ) ) ) ELSE X( I ) = CONE END IF 80 CONTINUE KASE = 2 JUMP = 4 RETURN * * ................ ENTRY (JUMP = 4) * X HAS BEEN OVERWRITTEN BY CTRANS(A)*X * 90 CONTINUE JLAST = J CALL PCMAX1( N, XMAX, J, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN WORK( 1 ) = XMAX WORK( 2 ) = CMPLX( REAL( J ) ) CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2 ) ELSE CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2, $ IVXROW, MYCOL ) XMAX = WORK( 1 ) J = NINT( REAL( WORK( 2 ) ) ) END IF END IF CALL PCELGET( 'Columnwise', ' ', JLMAX, X, JLAST, JX, DESCX ) IF( ( REAL( JLMAX ).NE.ABS( REAL( XMAX ) ) ).AND. $ ( ITER.LT.ITMAX ) ) THEN ITER = ITER + 1 GO TO 50 END IF * * ITERATION COMPLETE. FINAL STAGE. * 100 CONTINUE DO 110 I = IOFFVX, IOFFVX+NP-1 K = INDXL2G( I-IOFFVX+IIVX, DESCX( MB_ ), MYROW, $ DESCX( RSRC_ ), NPROW )-IX+1 IF( MOD( K, 2 ).EQ.0 ) THEN ALTSGN = -ONE ELSE ALTSGN = ONE END IF X( I ) = CMPLX( ALTSGN*( ONE+REAL( K-1 ) / REAL( N-1 ) ) ) 110 CONTINUE KASE = 1 JUMP = 5 RETURN * * ................ ENTRY (JUMP = 5) * X HAS BEEN OVERWRITTEN BY A*X * 120 CONTINUE CALL PSCSUM1( N, TEMP, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TEMP, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TEMP, 1, $ IVXROW, MYCOL ) END IF END IF TEMP = TWO*( TEMP / REAL( 3*N ) ) IF( TEMP.GT.EST ) THEN CALL CCOPY( NP, X( IOFFVX ), 1, V( IOFFVX ), 1 ) EST = TEMP END IF * 130 CONTINUE KASE = 0 * RETURN * * End of PCLACON * END scalapack-2.0.2/SRC/pclaconsb.f000644 000766 000024 00000051124 10363532303 016454 0ustar00juliestaff000000 000000 SUBROUTINE PCLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, BUF, $ LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. INTEGER I, L, LWORK, M COMPLEX H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), BUF( * ) * .. * * Purpose * ======= * * PCLACONSB looks for two consecutive small subdiagonal elements by * seeing the effect of starting a double shift QR iteration * given by H44, H33, & H43H34 and see if this would make a * subdiagonal negligible. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) COMPLEX array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * M (global output) INTEGER * On exit, this yields the starting location of the QR double * shift. This will satisfy: L <= M <= I-2. * * H44 * H33 * H43H34 (global input) COMPLEX * These three values are for the double shift QR iteration. * * BUF (local output) COMPLEX array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 7*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Logic: * ====== * * Two consecutive small subdiagonal elements will stall * convergence of a double shift if their product is small * relatively even if each is not very small. Thus it is * necessary to scan the "tridiagonal portion of the matrix." In * the LAPACK algorithm ZLAHQR, a loop of M goes from I-2 down to * L and examines * H(m,m),H(m+1,m+1),H(m+1,m),H(m,m+1),H(m-1,m-1),H(m,m-1), and * H(m+2,m-1). Since these elements may be on separate * processors, the first major loop (10) goes over the tridiagonal * and has each node store whatever values of the 7 it has that * the node owning H(m,m) does not. This will occur on a border * and can happen in no more than 3 locations per block assuming * square blocks. There are 5 buffers that each node stores these * values: a buffer to send diagonally down and right, a buffer * to send up, a buffer to send left, a buffer to send diagonally * up and left and a buffer to send right. Each of these buffers * is actually stored in one buffer BUF where BUF(ISTR1+1) starts * the first buffer, BUF(ISTR2+1) starts the second, etc.. After * the values are stored, if there are any values that a node * needs, they will be sent and received. Then the next major * loop passes over the data and searches for two consecutive * small subdiagonals. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IBUF1, IBUF2, IBUF3, IBUF4, $ IBUF5, ICOL1, II, IRCV1, IRCV2, IRCV3, IRCV4, $ IRCV5, IROW1, ISRC, ISTR1, ISTR2, ISTR3, ISTR4, $ ISTR5, JJ, JSRC, LDA, LEFT, MODKM1, MYCOL, $ MYROW, NPCOL, NPROW, NUM, RIGHT, UP REAL S, TST1, ULP COMPLEX CDUM, H00, H10, H11, H12, H21, H22, H33S, H44S, $ V1, V2, V3 * .. * .. External Functions .. INTEGER ILCM REAL PSLAMCH EXTERNAL ILCM, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, INFOG2L, PXERBLA, $ CGERV2D, CGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, AIMAG, MOD * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) ULP = PSLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 starts at BUF(ISTR1+1) and will contain IBUF1 elements * BUFFER2 starts at BUF(ISTR2+1) and will contain IBUF2 elements * BUFFER3 starts at BUF(ISTR3+1) and will contain IBUF3 elements * BUFFER4 starts at BUF(ISTR4+1) and will contain IBUF4 elements * BUFFER5 starts at BUF(ISTR5+1) and will contain IBUF5 elements * ISTR1 = 0 ISTR2 = ( ( I-L-1 ) / HBL ) IF( ISTR2*HBL.LT.( I-L-1 ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.7*ISTR2 ) THEN CALL PXERBLA( CONTXT, 'PCLACONSB', 10 ) RETURN END IF ISTR3 = 3*ISTR2 ISTR4 = ISTR3 + ISTR2 ISTR5 = ISTR3 + ISTR3 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) * * Copy our relevant pieces of triadiagonal that we owe into * 5 buffers to send to whomever owns H(M,M) as M moves diagonally * up the tridiagonal * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 IRCV1 = 0 IRCV2 = 0 IRCV3 = 0 IRCV4 = 0 IRCV5 = 0 DO 10 M = I - 2, L, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M-1,M-1) and send it diagonal down * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( M-1, M-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M ,M-1) and send it right * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF5 = IBUF5 + 1 BUF( ISTR5+IBUF5 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+1,M) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( MYROW.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M ,M+1) and send it left * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF3 = IBUF3 + 1 BUF( ISTR3+IBUF3 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M+1,M+1) & H(M+2,M+1) and send it * diagonally up * IF( ( UP.NE.MYROW ) .OR. ( LEFT.NE.MYCOL ) ) THEN CALL INFOG2L( M+1, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF4 = IBUF4 + 2 BUF( ISTR4+IBUF4-1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) BUF( ISTR4+IBUF4 ) = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+2,M+1) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * Add up the receives * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( M.GT.L ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M-1,M-1) from diagonal up * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) .AND. ( M.GT.L ) ) $ THEN * * We must receive H(M ,M-1) from left * IRCV5 = IRCV5 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+1,M ) from down * IRCV2 = IRCV2 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPCOL.GT.1 ) ) THEN * * We must receive H(M ,M+1) from right * IRCV3 = IRCV3 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M+1:M+2,M+1) from diagonal down * IRCV4 = IRCV4 + 2 END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+2,M+1) from down * IRCV2 = IRCV2 + 1 END IF END IF * * Possibly change owners (occurs only when MOD(M-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * * Send data on to the appropriate node if there is any data to send * IF( IBUF1.GT.0 ) THEN CALL CGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL CGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, UP, $ MYCOL ) END IF IF( IBUF3.GT.0 ) THEN CALL CGESD2D( CONTXT, IBUF3, 1, BUF( ISTR3+1 ), IBUF3, MYROW, $ LEFT ) END IF IF( IBUF4.GT.0 ) THEN CALL CGESD2D( CONTXT, IBUF4, 1, BUF( ISTR4+1 ), IBUF4, UP, $ LEFT ) END IF IF( IBUF5.GT.0 ) THEN CALL CGESD2D( CONTXT, IBUF5, 1, BUF( ISTR5+1 ), IBUF5, MYROW, $ RIGHT ) END IF * * Receive appropriate data if there is any * IF( IRCV1.GT.0 ) THEN CALL CGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL CGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, DOWN, $ MYCOL ) END IF IF( IRCV3.GT.0 ) THEN CALL CGERV2D( CONTXT, IRCV3, 1, BUF( ISTR3+1 ), IRCV3, MYROW, $ RIGHT ) END IF IF( IRCV4.GT.0 ) THEN CALL CGERV2D( CONTXT, IRCV4, 1, BUF( ISTR4+1 ), IRCV4, DOWN, $ RIGHT ) END IF IF( IRCV5.GT.0 ) THEN CALL CGERV2D( CONTXT, IRCV5, 1, BUF( ISTR5+1 ), IRCV5, MYROW, $ LEFT ) END IF * * Start main loop * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. $ ( MODKM1.NE.HBL-1 ) ) THEN CALL INFOG2L( I-2, I-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) END IF * * Look for two consecutive small subdiagonal elements. * DO 20 M = I - 2, L, -1 * * Determine the effect of starting the double-shift QR * iteration at row M, and see if this would make H(M,M-1) * negligible. * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H00 = BUF( ISTR1+IBUF1 ) ELSE H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF5 = IBUF5 + 1 H10 = BUF( ISTR5+IBUF5 ) ELSE H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF END IF IF( MODKM1.EQ.HBL-1 ) THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) H11 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( NUM.GT.1 ) THEN IBUF4 = IBUF4 + 2 H22 = BUF( ISTR4+IBUF4-1 ) V3 = BUF( ISTR4+IBUF4 ) ELSE H22 = A( ICOL1*LDA+IROW1+1 ) V3 = A( ( ICOL1+1 )*LDA+IROW1+1 ) END IF IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 H21 = BUF( ISTR2+IBUF2 ) ELSE H21 = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF3 = IBUF3 + 1 H12 = BUF( ISTR3+IBUF3 ) ELSE H12 = A( ICOL1*LDA+IROW1 ) END IF IF( M.GT.L ) THEN H00 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF * * Adjust ICOL1 for next iteration where MODKM1=HBL-2 * ICOL1 = ICOL1 + 1 END IF IF( MODKM1.EQ.HBL-2 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 V3 = BUF( ISTR2+IBUF2 ) ELSE V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) END IF H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.LT.HBL-2 ) .AND. ( MODKM1.GT.0 ) ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S IF( M.EQ.L ) $ GO TO 30 TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).LE.ULP*TST1 ) $ GO TO 30 * * Slide indices diagonally up one for next iteration * IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF IF( M.EQ.L ) THEN * * Stop regardless of which node we are * GO TO 30 END IF * * Possibly change owners if on border * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 20 CONTINUE 30 CONTINUE * CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, M, 1, L, L, -1, -1, -1 ) * RETURN * * End of PCLACONSB * END scalapack-2.0.2/SRC/pclacp2.f000644 000766 000024 00000037402 11750130340 016033 0ustar00juliestaff000000 000000 SUBROUTINE PCLACP2( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ) * .. * * Purpose * ======= * * PCLACP2 copies all or part of a distributed matrix A to another * distributed matrix B. No communication is performed, PCLACP2 * performs a local copy sub( A ) := sub( B ), where sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1) and sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * PCLACP2 requires that only dimension of the matrix operands is * distributed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ) set as follows: * * if UPLO = 'U', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=j, 1<=j<=N; * if UPLO = 'L', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * j<=i<=M, 1<=j<=N; * otherwise, B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=M, 1<=j<=N. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER HEIGHT, IACOL, IAROW, IBASE, IBCOL, IBROW, $ ICOFFA, IIA, IIAA, IIB, IIBB, IIBEGA, IIBEGB, $ IIENDA, IINXTA, IINXTB, ILEFT, IRIGHT, IROFFA, $ ITOP, JJA, JJAA, JJB, JJBB, JJBEGA, JJBEGB, $ JJENDA, JJNXTA, JJNXTB, LDA, LDB, MBA, MP, $ MPAA, MYCOL, MYDIST, MYROW, NBA, NPCOL, NPROW, $ NQ, NQAA, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLAMOV, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, JJB, $ IBROW, IBCOL ) * MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, MBA ) ICOFFA = MOD( JA-1, NBA ) LDB = DESCB( LLD_ ) * IF( N.LE.( NBA-ICOFFA ) ) THEN * * It is assumed that the local columns JJA:JJA+N-1 of the matrix * A are in the same process column (IACOL). * * N * JJA JJA+N-1 * / --------------------- \ * IROFFA| | | | * \ |...................| | ( IAROW ) * IIA |x | | MBA = DESCA( MB_ ) * | x | | * |--x----------------| / * | x | * | x | ITOP * | x | | * | x | /-------\ * |-------x-----------| |-------x-----------| * | x | | x | * | x | | x | * | x | | x | * | x | | x | * |------------x------| |------------x------| * | x | \____________/ * | x | | * | x | IBASE * | x | * |-----------------x-| Local picture * | x| * | | * | | * | | * |-------------------| * | | * . . * . . * . (IACOL) . * IF( MYCOL.EQ.IACOL ) THEN * MP = NUMROC( M+IROFFA, MBA, MYROW, IAROW, NPROW ) IF( MP.LE.0 ) $ RETURN IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA MYDIST = MOD( MYROW-IAROW+NPROW, NPROW ) ITOP = MYDIST * MBA - IROFFA * IF( LSAME( UPLO, 'U' ) ) THEN * ITOP = MAX( 0, ITOP ) IIBEGA = IIA IIENDA = IIA + MP - 1 IINXTA = MIN( ICEIL( IIBEGA, MBA ) * MBA, IIENDA ) IIBEGB = IIB IINXTB = IIBEGB + IINXTA - IIBEGA * 10 CONTINUE IF( ( N-ITOP ).GT.0 ) THEN CALL CLAMOV( UPLO, IINXTA-IIBEGA+1, N-ITOP, $ A( IIBEGA+(JJA+ITOP-1)*LDA ), LDA, $ B( IIBEGB+(JJB+ITOP-1)*LDB ), LDB ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IIBEGA = IINXTA + 1 IINXTA = MIN( IINXTA+MBA, IIENDA ) IIBEGB = IINXTB + 1 IINXTB = IIBEGB + IINXTA - IIBEGA GO TO 10 END IF * ELSE IF( LSAME( UPLO, 'L' ) ) THEN * MPAA = MP IIAA = IIA JJAA = JJA IIBB = IIB JJBB = JJB IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 20 CONTINUE IF( JJAA.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL CLAMOV( 'All', MPAA, ITOP-JJAA+JJA, $ A( IIAA+(JJAA-1)*LDA ), LDA, $ B( IIBB+(JJBB-1)*LDB ), LDB ) CALL CLAMOV( UPLO, MPAA, HEIGHT, $ A( IIAA+(JJA+ITOP-1)*LDA ), LDA, $ B( IIBB+(JJB+ITOP-1)*LDB ), LDB ) MPAA = MAX( 0, MPAA - HEIGHT ) IIAA = IIAA + HEIGHT JJAA = JJA + IBASE IIBB = IIBB + HEIGHT JJBB = JJB + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 20 END IF * ELSE * CALL CLAMOV( 'All', MP, N, A( IIA+(JJA-1)*LDA ), $ LDA, B( IIB+(JJB-1)*LDB ), LDB ) * END IF * END IF * ELSE IF( M.LE.( MBA-IROFFA ) ) THEN * * It is assumed that the local rows IIA:IIA+M-1 of the matrix A * are in the same process row (IAROW). * * ICOFFA * / \JJA * IIA ------------------ .... -------- * | .x | | | / | | \ * | . x | | | ILEFT| | | | * | . x | | | | | | * | . x | | \ x | | * | . |x | | |x | | IRIGHT * | . | x | | | x | | * (IAROW) | . | x | | | x | | * | . | x| | | x| | * | . | x | | x / * | . | |x | | | * | . | | x | | | * | . | | x | | | * | . | | x| | | * IIA+M-1 ------------------ .... ------- * NB_A * (IACOL) Local picture * IF( MYROW.EQ.IAROW ) THEN * NQ = NUMROC( N+ICOFFA, NBA, MYCOL, IACOL, NPCOL ) IF( NQ.LE.0 ) $ RETURN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA MYDIST = MOD( MYCOL-IACOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBA - ICOFFA * IF( LSAME( UPLO, 'L' ) ) THEN * ILEFT = MAX( 0, ILEFT ) JJBEGA = JJA JJENDA = JJA + NQ - 1 JJNXTA = MIN( ICEIL( JJBEGA, NBA ) * NBA, JJENDA ) JJBEGB = JJB JJNXTB = JJBEGB + JJNXTA - JJBEGA * 30 CONTINUE IF( ( M-ILEFT ).GT.0 ) THEN CALL CLAMOV( UPLO, M-ILEFT, JJNXTA-JJBEGA+1, $ A( IIA+ILEFT+(JJBEGA-1)*LDA ), LDA, $ B( IIB+ILEFT+(JJBEGB-1)*LDB ), LDB ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA JJBEGA = JJNXTA +1 JJNXTA = MIN( JJNXTA+NBA, JJENDA ) JJBEGB = JJNXTB +1 JJNXTB = JJBEGB + JJNXTA - JJBEGA GO TO 30 END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * NQAA = NQ IIAA = IIA JJAA = JJA IIBB = IIB JJBB = JJB IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 40 CONTINUE IF( IIAA.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL CLAMOV( 'All', ILEFT-IIAA+IIA, NQAA, $ A( IIAA+(JJAA-1)*LDA ), LDA, $ B( IIBB+(JJBB-1)*LDB ), LDB ) CALL CLAMOV( UPLO, WIDE, NQAA, $ A( IIA+ILEFT+(JJAA-1)*LDA ), LDA, $ B( IIB+ILEFT+(JJBB-1)*LDB ), LDB ) NQAA = MAX( 0, NQAA - WIDE ) IIAA = IIA + IRIGHT JJAA = JJAA + WIDE IIBB = IIB + IRIGHT JJBB = JJBB + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 40 END IF * ELSE * CALL CLAMOV( 'All', M, NQ, A( IIA+(JJA-1)*LDA ), $ LDA, B( IIB+(JJB-1)*LDB ), LDB ) * END IF * END IF * END IF * RETURN * * End of PCLACP2 * END scalapack-2.0.2/SRC/pclacp3.f000644 000766 000024 00000030230 10363532303 016030 0ustar00juliestaff000000 000000 SUBROUTINE PCLACP3( M, I, A, DESCA, B, LDB, II, JJ, REV ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. INTEGER I, II, JJ, LDB, M, REV * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), B( LDB, * ) * .. * * Purpose * ======= * * PCLACP3 is an auxiliary routine that copies from a global parallel * array into a local replicated array or vise versa. Notice that * the entire submatrix that is copied gets placed on one node or * more. The receiving node can be specified precisely, or all nodes * can receive, or just one row or column of nodes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * M is the order of the square submatrix that is copied. * M >= 0. * Unchanged on exit * * I (global input) INTEGER * A(I,I) is the global location that the copying starts from. * Unchanged on exit. * * A (global input/output) COMPLEX array, dimension * (DESCA(LLD_),*) * On entry, the parallel matrix to be copied into or from. * On exit, if REV=1, the copied data. * Unchanged on exit if REV=0. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/output) COMPLEX array of size (LDB,M) * If REV=0, this is the global portion of the array * A(I:I+M-1,I:I+M-1). * If REV=1, this is the unchanged on exit. * * LDB (local input) INTEGER * The leading dimension of B. * * II (global input) INTEGER * By using REV 0 & 1, data can be sent out and returned again. * If REV=0, then II is destination row index for the node(s) * receiving the replicated B. * If II>=0,JJ>=0, then node (II,JJ) receives the data * If II=-1,JJ>=0, then all rows in column JJ receive the * data * If II>=0,JJ=-1, then all cols in row II receive the data * If II=-1,JJ=-1, then all nodes receive the data * If REV<>0, then II is the source row index for the node(s) * sending the replicated B. * * JJ (global input) INTEGER * Similar description as II above * * REV (global input) INTEGER * Use REV = 0 to send global A into locally replicated B * (on node (II,JJ)). * Use REV <> 0 to send locally replicated B from node (II,JJ) * to its owner (which changes depending on its location in * A) into the global A. * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER COL, CONTXT, HBL, ICOL1, ICOL2, IDI, IDJ, IFIN, $ III, IROW1, IROW2, ISTOP, ISTOPI, ISTOPJ, ITMP, $ JJJ, LDA, MYCOL, MYROW, NPCOL, NPROW, ROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG1L, CGEBR2D, CGEBS2D, $ CGERV2D, CGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.LE.0 ) $ RETURN * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( REV.EQ.0 ) THEN DO 20 IDI = 1, M DO 10 IDJ = 1, M B( IDI, IDJ ) = ZERO 10 CONTINUE 20 CONTINUE END IF * IFIN = I + M - 1 * IF( MOD( I+HBL, HBL ).NE.0 ) THEN ISTOP = MIN( I+HBL-MOD( I+HBL, HBL ), IFIN ) ELSE ISTOP = I END IF IDJ = I ISTOPJ = ISTOP IF( IDJ.LE.IFIN ) THEN 30 CONTINUE IDI = I ISTOPI = ISTOP IF( IDI.LE.IFIN ) THEN 40 CONTINUE ROW = MOD( ( IDI-1 ) / HBL, NPROW ) COL = MOD( ( IDJ-1 ) / HBL, NPCOL ) CALL INFOG1L( IDI, HBL, NPROW, ROW, 0, IROW1, ITMP ) IROW2 = NUMROC( ISTOPI, HBL, ROW, 0, NPROW ) CALL INFOG1L( IDJ, HBL, NPCOL, COL, 0, ICOL1, ITMP ) ICOL2 = NUMROC( ISTOPJ, HBL, COL, 0, NPCOL ) IF( ( MYROW.EQ.ROW ) .AND. ( MYCOL.EQ.COL ) ) THEN IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to everyone * IF( REV.EQ.0 ) THEN CALL CGEBS2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.NE.-1 ) ) THEN * * Send the message to Column MYCOL which better be JJ * IF( REV.EQ.0 ) THEN CALL CGEBS2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to Row MYROW which better be II * IF( REV.EQ.0 ) THEN CALL CGEBS2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.NE.-1 ) .AND. $ ( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) ) THEN * * Recv/Send the message to (II,JJ) * IF( REV.EQ.0 ) THEN CALL CGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ A( ( ICOL1-1 )*LDA+IROW1 ), LDA, II, $ JJ ) ELSE CALL CGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, II, JJ ) END IF END IF IF( REV.EQ.0 ) THEN DO 60 JJJ = ICOL1, ICOL2 DO 50 III = IROW1, IROW2 B( IDI+III-IROW1+1-I, IDJ+JJJ-ICOL1+1-I ) $ = A( ( JJJ-1 )*LDA+III ) 50 CONTINUE 60 CONTINUE ELSE DO 80 JJJ = ICOL1, ICOL2 DO 70 III = IROW1, IROW2 A( ( JJJ-1 )*LDA+III ) = B( IDI+III-IROW1+1-I, $ IDJ+JJJ-ICOL1+1-I ) 70 CONTINUE 80 CONTINUE END IF ELSE IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL CGEBR2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL CGEBR2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL CGEBR2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL CGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) ELSE CALL CGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) * CALL CGESD2D(CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, * $ A((ICOL1-1)*LDA+IROW1),LDA, ROW, COL) END IF END IF END IF IDI = ISTOPI + 1 ISTOPI = MIN( ISTOPI+HBL, IFIN ) IF( IDI.LE.IFIN ) $ GO TO 40 END IF IDJ = ISTOPJ + 1 ISTOPJ = MIN( ISTOPJ+HBL, IFIN ) IF( IDJ.LE.IFIN ) $ GO TO 30 END IF RETURN * * End of PCLACP3 * END scalapack-2.0.2/SRC/pclacpy.f000644 000766 000024 00000022362 10363532303 016145 0ustar00juliestaff000000 000000 SUBROUTINE PCLACPY( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ) * .. * * Purpose * ======= * * PCLACPY copies all or part of a distributed matrix A to another * distributed matrix B. No communication is performed, PCLACPY * performs a local copy sub( A ) := sub( B ), where sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1) and sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ) set as follows: * * if UPLO = 'U', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=j, 1<=j<=N; * if UPLO = 'L', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * j<=i<=M, 1<=j<=N; * otherwise, B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=M, 1<=j<=N. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, IAA, IBB, IBLK, IN, ITMP, J, JAA, JBB, $ JBLK, JN, JTMP * .. * .. External Subroutines .. EXTERNAL PCLACP2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) * IF( M.LE.( DESCA( MB_ ) - MOD( IA-1, DESCA( MB_ ) ) ) .OR. $ N.LE.( DESCA( NB_ ) - MOD( JA-1, DESCA( NB_ ) ) ) ) THEN CALL PCLACP2( UPLO, M, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) ELSE * IF( LSAME( UPLO, 'U' ) ) THEN CALL PCLACP2( UPLO, IN-IA+1, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) IBB = IB + ITMP JBB = JB + ITMP JAA = JA + ITMP CALL PCLACP2( UPLO, IBLK, N-ITMP, A, I, JAA, DESCA, $ B, IBB, JBB, DESCB ) 10 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN CALL PCLACP2( UPLO, M, JN-JA+1, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) IBB = IB + JTMP JBB = JB + JTMP IAA = IA + JTMP CALL PCLACP2( UPLO, M-JTMP, JBLK, A, IAA, J, DESCA, $ B, IBB, JBB, DESCB ) 20 CONTINUE ELSE IF( M.LE.N ) THEN CALL PCLACP2( UPLO, IN-IA+1, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) IBB = IB+ITMP CALL PCLACP2( UPLO, IBLK, N, A, I, JA, DESCA, $ B, IBB, JB, DESCB ) 30 CONTINUE ELSE CALL PCLACP2( UPLO, M, JN-JA+1, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) JBB = JB+JTMP CALL PCLACP2( UPLO, M, JBLK, A, IA, J, DESCA, $ B, IB, JBB, DESCB ) 40 CONTINUE END IF END IF * END IF * RETURN * * End of PCLACPY * END scalapack-2.0.2/SRC/pclaevswp.f000644 000766 000024 00000025732 10363532303 016522 0ustar00juliestaff000000 000000 * * SUBROUTINE PCLAEVSWP( N, ZIN, LDZI, Z, IZ, JZ, DESCZ, NVS, KEY, $ RWORK, LRWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 15, 1997 * * .. Scalar Arguments .. INTEGER IZ, JZ, LDZI, LRWORK, N * .. * .. Array Arguments .. INTEGER DESCZ( * ), KEY( * ), NVS( * ) REAL RWORK( * ), ZIN( LDZI, * ) COMPLEX Z( * ) * .. * * Purpose * ======= * * PCLAEVSWP moves the eigenvectors (potentially unsorted) from * where they are computed, to a ScaLAPACK standard block cyclic * array, sorted so that the corresponding eigenvalues are sorted. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * N (global input) INTEGER * The order of the matrix A. N >= 0. * * ZIN (local input) REAL array, * dimension ( LDZI, NVS(iam) ) * The eigenvectors on input. Each eigenvector resides entirely * in one process. Each process holds a contiguous set of * NVS(iam) eigenvectors. The first eigenvector which the * process holds is: sum for i=[0,iam-1) of NVS(i) * * LDZI (locl input) INTEGER * leading dimension of the ZIN array * * Z (local output) COMPLEX array * global dimension (N, N), local dimension (DESCZ(DLEN_), NQ) * The eigenvectors on output. The eigenvectors are distributed * in a block cyclic manner in both dimensions, with a * block size of NB. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * NVS (global input) INTEGER array, dimension( nprocs+1 ) * nvs(i) = number of processes * number of eigenvectors held by processes [0,i-1) * nvs(1) = number of eigen vectors held by [0,1-1) == 0 * nvs(nprocs+1) = number of eigen vectors held by [0,nprocs) == * total number of eigenvectors * * KEY (global input) INTEGER array, dimension( N ) * Indicates the actual index (after sorting) for each of the * eigenvectors. * * RWORK (local workspace) REAL array, dimension (LRWORK) * * LRWORK (local input) INTEGER dimension of RWORK * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CYCLIC_I, CYCLIC_J, DIST, I, IAM, II, INCII, J, $ MAXI, MAXII, MINI, MINII, MYCOL, MYROW, NB, $ NBUFSIZE, NPCOL, NPROCS, NPROW, PCOL, RECVCOL, $ RECVFROM, RECVROW, SENDCOL, SENDROW, SENDTO * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P EXTERNAL INDXG2L, INDXG2P * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGERV2D, SGESD2D * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW + MYCOL*NPROW IAM = MYROW*NPCOL + MYCOL * NB = DESCZ( MB_ ) * NPROCS = NPROW*NPCOL * * If PxSTEIN operates on a sub-matrix of a global matrix, the * key [] that contains the indicies of the eigenvectors is refe- * renced to the dimensions of the sub-matrix and not the global * distrubited matrix. Because of this, PxLAEVSWP will incorrectly * map the eigenvectors to the global eigenvector matrix, Z, unless * the key[] elements are shifted as below. * DO 10 J = DESCZ( N_ ), 1, -1 KEY( J ) = KEY( J-JZ+1 ) + ( JZ-1 ) 10 CONTINUE * DO 110 DIST = 0, NPROCS - 1 * SENDTO = MOD( IAM+DIST, NPROCS ) RECVFROM = MOD( NPROCS+IAM-DIST, NPROCS ) * SENDROW = MOD( SENDTO, NPROW ) SENDCOL = SENDTO / NPROW RECVROW = MOD( RECVFROM, NPROW ) RECVCOL = RECVFROM / NPROW * SENDROW = SENDTO / NPCOL SENDCOL = MOD( SENDTO, NPCOL ) RECVROW = RECVFROM / NPCOL RECVCOL = MOD( RECVFROM, NPCOL ) * * Figure out what I have that process "sendto" wants * NBUFSIZE = 0 * * We are looping through the eigenvectors that I presently own. * DO 40 J = NVS( 1+IAM ) + JZ, NVS( 1+IAM+1 ) + JZ - 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( SENDCOL.EQ.PCOL ) THEN MINII = MOD( SENDROW+DESCZ( RSRC_ ), NPROW )* $ DESCZ( MB_ ) + 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 30 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) MAXI = MIN( II+DESCZ( MB_ )-1, N+IZ-1 ) DO 20 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 RWORK( NBUFSIZE ) = ZIN( I+1-IZ, $ J-NVS( 1+IAM )+1-JZ ) 20 CONTINUE 30 CONTINUE END IF 40 CONTINUE * * IF( MYROW.NE.SENDROW .OR. MYCOL.NE.SENDCOL ) $ CALL SGESD2D( DESCZ( CTXT_ ), NBUFSIZE, 1, RWORK, NBUFSIZE, $ SENDROW, SENDCOL ) * * * Figure out what process "recvfrom" has that I want * NBUFSIZE = 0 DO 70 J = NVS( 1+RECVFROM ) + JZ, $ NVS( 1+RECVFROM+1 ) + JZ - 1, 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.PCOL ) THEN MINII = MOD( MYROW+DESCZ( RSRC_ ), NPROW )*DESCZ( MB_ ) + $ 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 60 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) MAXI = MIN( II+NB-1, N+IZ-1 ) DO 50 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 50 CONTINUE 60 CONTINUE END IF 70 CONTINUE * * * IF( MYROW.NE.RECVROW .OR. MYCOL.NE.RECVCOL ) $ CALL SGERV2D( DESCZ( CTXT_ ), 1, NBUFSIZE, RWORK, 1, $ RECVROW, RECVCOL ) * NBUFSIZE = 0 DO 100 J = NVS( 1+RECVFROM ) + JZ, $ NVS( 1+RECVFROM+1 ) + JZ - 1, 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.PCOL ) THEN CYCLIC_J = INDXG2L( KEY( J ), DESCZ( MB_ ), -1, -1, $ NPCOL ) CYCLIC_I = 1 MINII = MOD( MYROW+DESCZ( RSRC_ ), NPROW )*DESCZ( MB_ ) + $ 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 90 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) CYCLIC_I = INDXG2L( MINI, DESCZ( MB_ ), -1, -1, $ NPROW ) MAXI = MIN( II+NB-1, N+IZ-1 ) DO 80 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 Z( CYCLIC_I+( CYCLIC_J-1 )*DESCZ( LLD_ ) ) $ = CMPLX( RWORK( NBUFSIZE ) ) CYCLIC_I = CYCLIC_I + 1 80 CONTINUE 90 CONTINUE END IF 100 CONTINUE * 110 CONTINUE RETURN * * End of PCLAEVSWP * END scalapack-2.0.2/SRC/pclahqr.f000644 000766 000024 00000333215 10602576752 016162 0ustar00juliestaff000000 000000 SUBROUTINE PCLAHQR( WANTT, WANTZ, N, ILO, IHI, A, DESCA, W, ILOZ, $ IHIZ, Z, DESCZ, WORK, LWORK, IWORK, ILWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7.3) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * 1.7.3: March 22, 2006 * modification suggested by Mark Fahey and Greg Henry * 1.7.0: July 31, 2001 * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, ILWORK, INFO, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) COMPLEX A( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PCLAHQR is an auxiliary routine used to find the Schur decomposition * and or eigenvalues of a matrix already in Hessenberg form from * cols ILO to IHI. * If Z = I, and WANTT=WANTZ=.TRUE., H gets replaced with Z'HZ, * with Z'Z=I, and H in Schur form. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCp(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCp( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCq( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCp() and LOCq() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix A (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that A is already upper quasi-triangular in * rows and columns IHI+1:N, and that A(ILO,ILO-1) = 0 (unless * ILO = 1). PCLAHQR works primarily with the Hessenberg * submatrix in rows and columns ILO to IHI, but applies * transformations to all of H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * A (global input/output) COMPLEX array, dimension * (DESCA(LLD_),*) * On entry, the upper Hessenberg matrix A. * On exit, if WANTT is .TRUE., A is upper triangular in rows * and columns ILO:IHI. If WANTT is .FALSE., the contents of * A are unspecified on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * W (global replicated output) COMPLEX array, dimension (N) * The computed eigenvalues ILO to IHI are stored in the * corresponding elements of W. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in A. A may be returned with * larger diagonal blocks until the next release. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (global input/output) COMPLEX array. * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations accumulated by PCHSEQR, and on * exit Z has been updated; transformations are applied only to * the submatrix Z(ILOZ:IHIZ,ILO:IHI). * If WANTZ is .FALSE., Z is not referenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local output) COMPLEX array of size LWORK * (Unless LWORK=-1, in which case WORK must be at least size 1) * * LWORK (local input) INTEGER * WORK(LWORK) is a local array and LWORK is assumed big enough * so that LWORK >= 3*N + * MAX( 2*MAX(DESCZ(LLD_),DESCA(LLD_)) + 2*LOCq(N), * 7*Ceil(N/HBL)/LCM(NPROW,NPCOL)) + * MAX( 2*N, (8*LCM(NPROW,NPCOL)+2)**2 ) * If LWORK=-1, then WORK(1) gets set to the above number and * the code returns immediately. * * IWORK (global and local input) INTEGER array of size ILWORK * This will hold some of the IBLK integer arrays. * This is held as a place holder for a future release. * Currently unreferenced. * * ILWORK (local input) INTEGER * This will hold the size of the IWORK array. * This is held as a place holder for a future release. * Currently unreferenced. * * INFO (global output) INTEGER * < 0: parameter number -INFO incorrect or inconsistent * = 0: successful exit * > 0: PCLAHQR failed to compute all the eigenvalues ILO to IHI * in a total of 30*(IHI-ILO+1) iterations; if INFO = i, * elements i+1:ihi of W contains those eigenvalues * which have been successfully computed. * * Logic: * This algorithm is very similar to SLAHQR. Unlike SLAHQR, * instead of sending one double shift through the largest * unreduced submatrix, this algorithm sends multiple double shifts * and spaces them apart so that there can be parallelism across * several processor row/columns. Another critical difference is * that this algorithm aggregrates multiple transforms together in * order to apply them in a block fashion. * * Important Local Variables: * IBLK = The maximum number of bulges that can be computed. * Currently fixed. Future releases this won't be fixed. * HBL = The square block size (HBL=DESCA(MB_)=DESCA(NB_)) * ROTN = The number of transforms to block together * NBULGE = The number of bulges that will be attempted on the * current submatrix. * IBULGE = The current number of bulges started. * K1(*),K2(*) = The current bulge loops from K1(*) to K2(*). * * Subroutines: * From LAPACK, this routine calls: * CLAHQR -> Serial QR used to determine shifts and * eigenvalues * CLARFG -> Determine the Householder transforms * * This ScaLAPACK, this routine calls: * PCLACONSB -> To determine where to start each iteration * CLAMSH -> Sends multiple shifts through a small * submatrix to see how the consecutive * subdiagonals change (if PCLACONSB indicates * we can start a run in the middle) * PCLAWIL -> Given the shift, get the transformation * PCLACP3 -> Parallel array to local replicated array copy * & back. * CLAREF -> Row/column reflector applier. Core routine * here. * PCLASMSUB -> Finds negligible subdiagonal elements. * * Current Notes and/or Restrictions: * 1.) This code requires the distributed block size to be square * and at least six (6); unlike simpler codes like LU, this * algorithm is extremely sensitive to block size. Unwise * choices of too small a block size can lead to bad * performance. * 2.) This code requires A and Z to be distributed identically * and have identical contxts. A future version may allow Z to * have a different contxt to 1D row map it to all nodes (so no * communication on Z is necessary.) * 3.) This code does not currently block the initial transforms * so that none of the rows or columns for any bulge are * completed until all are started. To offset pipeline * start-up it is recommended that at least 2*LCM(NPROW,NPCOL) * bulges are used (if possible) * 4.) The maximum number of bulges currently supported is fixed at * 32. In future versions this will be limited only by the * incoming WORK and IWORK array. * 5.) The matrix A must be in upper Hessenberg form. If elements * below the subdiagonal are nonzero, the resulting transforms * may be nonsimilar. This is also true with the LAPACK * routine CLAHQR. * 6.) For this release, this code has only been tested for * RSRC_=CSRC_=0, but it has been written for the general case. * 7.) Currently, all the eigenvalues are distributed to all the * nodes. Future releases will probably distribute the * eigenvalues by the column partitioning. * 8.) The internals of this routine are subject to change. * 9.) To optimize this for your architecture, try tuning CLAREF. * 10.) This code has only been tested for WANTZ = .TRUE. and may * behave unpredictably for WANTZ set to .FALSE. * * Further Details * =============== * * Contributed by Mark Fahey, June, 2000. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL RONE PARAMETER ( RONE = 1.0E+0 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ) ) REAL CONST PARAMETER ( CONST = 1.50E+0 ) INTEGER IBLK PARAMETER ( IBLK = 32 ) * .. * .. Local Scalars .. LOGICAL SKIP INTEGER CONTXT, DOWN, HBL, I, I1, I2, IAFIRST, IBULGE, $ ICBUF, ICOL, ICOL1, ICOL2, IDIA, IERR, II, $ IRBUF, IROW, IROW1, IROW2, ISPEC, ISTART, $ ISTARTCOL, ISTARTROW, ISTOP, ISUB, ISUP, $ ITERMAX, ITMP1, ITMP2, ITN, ITS, IZBUF, J, $ JAFIRST, JBLK, JJ, K, KI, L, LCMRC, LDA, LDZ, $ LEFT, LIHIH, LIHIZ, LILOH, LILOZ, LOCALI1, $ LOCALI2, LOCALK, LOCALM, M, MODKM1, MYCOL, $ MYROW, NBULGE, NH, NODE, NPCOL, NPROW, NQ, NR, $ NUM, NZ, RIGHT, ROTN, UP, VECSIDX REAL CS, OVFL, S, SMLNUM, ULP, UNFL COMPLEX CDUM, H10, H11, H22, H33, H43H34, H44, SN, SUM, $ T1, T1COPY, T2, T3, V1SAVE, V2, V2SAVE, V3, $ V3SAVE * .. * .. Local Arrays .. INTEGER ICURCOL( IBLK ), ICURROW( IBLK ), K1( IBLK ), $ K2( IBLK ), KCOL( IBLK ), KP2COL( IBLK ), $ KP2ROW( IBLK ), KROW( IBLK ) COMPLEX S1( 2*IBLK, 2*IBLK ), SMALLA( 6, 6, IBLK ), $ VCOPY( 3 ) * .. * .. External Functions .. INTEGER ILCM, NUMROC REAL PSLAMCH EXTERNAL ILCM, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMN2D, IGEBR2D, IGEBS2D, $ INFOG1L, INFOG2L, PSLABAD, PXERBLA, PCLACONSB, $ PCLACP3, PCLASMSUB, PCLAWIL, PCROT, CCOPY, $ CGEBR2D, CGEBS2D, CGERV2D, CGESD2D, CGSUM2D, $ CLAHQR2, CLAMSH, CLANV2, CLAREF, CLARFG * .. * .. Intrinsic Functions .. * INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN, MOD * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. * INFO = 0 * ITERMAX = 30*( IHI-ILO+1 ) IF( N.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL NUM = NPROW*NPCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) LCMRC = ILCM( NPROW, NPCOL ) IF( ( NPROW.LE.3 ) .OR. ( NPCOL.LE.3 ) ) THEN SKIP = .TRUE. ELSE SKIP = .FALSE. END IF * * Determine the number of columns we have so we can check workspace * NQ = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) JJ = N / HBL IF( JJ*HBL.LT.N ) $ JJ = JJ + 1 JJ = 7*JJ / LCMRC JJ = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, JJ ) JJ = JJ + MAX( 2*N, ( 8*LCMRC+2 )**2 ) IF( LWORK.EQ.-1 ) THEN WORK( 1 ) = JJ RETURN END IF IF( LWORK.LT.JJ ) THEN INFO = -14 END IF IF( DESCZ( CTXT_ ).NE.DESCA( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) END IF IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1300+NB_ ) END IF IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1300+MB_ ) END IF IF( ( DESCA( RSRC_ ).NE.0 ) .OR. ( DESCA( CSRC_ ).NE.0 ) ) THEN INFO = -( 700+RSRC_ ) END IF IF( ( DESCZ( RSRC_ ).NE.0 ) .OR. ( DESCZ( CSRC_ ).NE.0 ) ) THEN INFO = -( 1300+RSRC_ ) END IF IF( ( ILO.GT.N ) .OR. ( ILO.LT.1 ) ) THEN INFO = -4 END IF IF( ( IHI.GT.N ) .OR. ( IHI.LT.1 ) ) THEN INFO = -5 END IF IF( HBL.LT.5 ) THEN INFO = -( 700+MB_ ) END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PCLAHQR', -INFO ) RETURN END IF * * Set work array indices * VECSIDX = 0 IDIA = 3*N ISUB = 3*N ISUP = 3*N IRBUF = 3*N ICBUF = 3*N IZBUF = 5*N * * Find a value for ROTN * ROTN = HBL / 3 ROTN = MIN( ROTN, HBL-2 ) ROTN = MAX( ROTN, 1 ) * IF( ILO.EQ.IHI ) THEN CALL INFOG2L( ILO, ILO, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN W( ILO ) = A( ( ICOL-1 )*LDA+IROW ) ELSE W( ILO ) = ZERO END IF RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * CALL INFOG1L( ILOZ, HBL, NPROW, MYROW, IAFIRST, LILOZ, LIHIZ ) LIHIZ = NUMROC( IHIZ, HBL, MYROW, IAFIRST, NPROW ) * * Set machine-dependent constants for the stopping criterion. * If NORM(H) <= SQRT(OVFL), overflow should not occur. * UNFL = PSLAMCH( CONTXT, 'SAFE MINIMUM' ) OVFL = RONE / UNFL CALL PSLABAD( CONTXT, UNFL, OVFL ) ULP = PSLAMCH( CONTXT, 'PRECISION' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = ITERMAX * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of our schur block size (<=2*IBLK). Each * iteration of the loop works with the active submatrix in rows * and columns L to I. Eigenvalues I+1 to IHI have already * converged. Either L = ILO or the global A(L,L-1) is negligible * so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 570 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 540 ITS = 0, ITN * * Look for a single small subdiagonal element. * CALL PCLASMSUB( A, DESCA, I, L, K, SMLNUM, WORK( IRBUF+1 ), $ LWORK-IRBUF ) L = K * IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * CALL INFOG2L( L, L-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN A( ( ICOL-1 )*LDA+IROW ) = ZERO END IF WORK( ISUB+L-1 ) = ZERO END IF * * Exit from loop if a submatrix of order 1 or 2 has split off. * IF( WANTT ) THEN * For Schur form, use 2x2 blocks IF( L.GE.I-1 ) THEN GO TO 550 END IF ELSE * If we don't want the Schur form, use bigger blocks. IF( L.GE.I-( 2*IBLK-1 ) ) THEN GO TO 550 END IF END IF * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * * Copy submatrix of size 2*JBLK and prepare to do generalized * Wilkinson shift or an exceptional shift * JBLK = MIN( IBLK, ( ( I-L+1 ) / 2 )-1 ) IF( JBLK.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * JBLK = JBLK - MOD( JBLK, LCMRC ) END IF JBLK = MIN( JBLK, 2*LCMRC ) JBLK = MAX( JBLK, 1 ) * CALL PCLACP3( 2*JBLK, I-2*JBLK+1, A, DESCA, S1, 2*IBLK, -1, -1, $ 0 ) IF( ( ITS.EQ.20 .OR. ITS.EQ.40 ) .AND. ( JBLK.GT.1 ) ) THEN * * Exceptional shift. * DO 20 II = 2*JBLK, 2, -1 S1( II, II ) = CONST*( CABS1( S1( II, II ) )+ $ CABS1( S1( II, II-1 ) ) ) S1( II, II-1 ) = ZERO S1( II-1, II ) = ZERO 20 CONTINUE S1( 1, 1 ) = CONST*CABS1( S1( 1, 1 ) ) ELSE CALL CLAHQR2( .FALSE., .FALSE., 2*JBLK, 1, 2*JBLK, S1, $ 2*IBLK, WORK( IRBUF+1 ), 1, 2*JBLK, Z, LDZ, $ IERR ) * * Prepare to use Wilkinson's double shift * H44 = S1( 2*JBLK, 2*JBLK ) H33 = S1( 2*JBLK-1, 2*JBLK-1 ) H43H34 = S1( 2*JBLK-1, 2*JBLK )*S1( 2*JBLK, 2*JBLK-1 ) * END IF * * Look for two consecutive small subdiagonal elements: * PCLACONSB is the routine that does this. * CALL PCLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, $ WORK( IRBUF+1 ), LWORK-IRBUF ) * * Double-shift QR step * * NBULGE is the number of bulges that will be attempted * ISTOP = MIN( M+ROTN-1-MOD( M-( M / HBL )*HBL-1, ROTN ), I-2 ) ISTOP = MIN( ISTOP, M+HBL-3-MOD( M-1, HBL ) ) ISTOP = MIN( ISTOP, I2-2 ) ISTOP = MAX( ISTOP, M ) NBULGE = ( I-1-ISTOP ) / HBL * * Do not exceed maximum determined. * NBULGE = MIN( NBULGE, JBLK ) IF( NBULGE.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * NBULGE = NBULGE - MOD( NBULGE, LCMRC ) END IF NBULGE = MAX( NBULGE, 1 ) * * If we are starting in the middle because of consecutive small * subdiagonal elements, we need to see how many bulges we * can send through without breaking the consecutive small * subdiagonal property. * IF( ( NBULGE.GT.1 ) .AND. ( M.GT.L ) ) THEN * * Copy a chunk of elements from global A(M-1:,M-1:) * CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ITMP1, ITMP2 ) II = MIN( 4*NBULGE+2, N-M+2 ) CALL PCLACP3( II, M-1, A, DESCA, WORK( IRBUF+1 ), II, ITMP1, $ ITMP2, 0 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN * * Find a new NBULGE based on the bulges we have. * CALL CLAMSH( S1, 2*IBLK, NBULGE, JBLK, WORK( IRBUF+1 ), $ II, II, ULP ) IF( NUM.GT.1 ) THEN CALL IGEBS2D( CONTXT, 'ALL', ' ', 1, 1, NBULGE, 1 ) END IF ELSE * * Everyone needs to receive the new NBULGE * CALL IGEBR2D( CONTXT, 'ALL', ' ', 1, 1, NBULGE, 1, ITMP1, $ ITMP2 ) END IF END IF * * IBULGE is the number of bulges going so far * IBULGE = 1 * * "A" row defs : main row transforms from LOCALK to LOCALI2 * CALL INFOG1L( M, HBL, NPCOL, MYCOL, JAFIRST, ITMP1, LOCALK ) LOCALK = NQ CALL INFOG1L( 1, HBL, NPCOL, MYCOL, JAFIRST, ICOL1, LOCALI2 ) LOCALI2 = NUMROC( I2, HBL, MYCOL, JAFIRST, NPCOL ) * * "A" col defs : main col transforms from LOCALI1 to LOCALM * CALL INFOG1L( I1, HBL, NPROW, MYROW, IAFIRST, LOCALI1, ICOL1 ) CALL INFOG1L( 1, HBL, NPROW, MYROW, IAFIRST, LOCALM, ICOL1 ) ICOL1 = NUMROC( MIN( M+3, I ), HBL, MYROW, IAFIRST, NPROW ) * * Which row & column will start the bulges * ISTARTROW = MOD( ( M+1 ) / HBL, NPROW ) + IAFIRST ISTARTCOL = MOD( ( M+1 ) / HBL, NPCOL ) + JAFIRST * CALL INFOG1L( M, HBL, NPROW, MYROW, IAFIRST, II, ITMP2 ) CALL INFOG1L( M, HBL, NPCOL, MYCOL, JAFIRST, JJ, ITMP2 ) CALL INFOG1L( 1, HBL, NPROW, MYROW, IAFIRST, ISTOP, $ KP2ROW( 1 ) ) KP2ROW( 1 ) = NUMROC( M+2, HBL, MYROW, IAFIRST, NPROW ) CALL INFOG1L( 1, HBL, NPCOL, MYCOL, JAFIRST, ISTOP, $ KP2COL( 1 ) ) KP2COL( 1 ) = NUMROC( M+2, HBL, MYCOL, JAFIRST, NPCOL ) * * Set all values for bulges. All bulges are stored in * intermediate steps as loops over KI. Their current "task" * over the global M to I-1 values is always K1(KI) to K2(KI). * However, because there are many bulges, K1(KI) & K2(KI) might * go past that range while later bulges (KI+1,KI+2,etc..) are * finishing up. Even if ROTN=1, in order to minimize border * communication sometimes K1(KI)=HBL-2 & K2(KI)=HBL-1 so both * border messages can be handled at once. * * Rules: * If MOD(K1(KI)-1,HBL) < HBL-2 then MOD(K2(KI)-1,HBL)= 0. * * K (global input) INTEGER * The offset for the reduction. Elements below the k-th * subdiagonal in the first NB columns are reduced to zero. * * NB (global input) INTEGER * The number of columns to be reduced. * * A (local input/local output) COMPLEX pointer into * the local memory to an array of dimension (LLD_A, * LOCc(JA+N-K)). On entry, this array contains the the local * pieces of the N-by-(N-K+1) general distributed matrix * A(IA:IA+N-1,JA:JA+N-K). On exit, the elements on and above * the k-th subdiagonal in the first NB columns are overwritten * with the corresponding elements of the reduced distributed * matrix; the elements below the k-th subdiagonal, with the * array TAU, represent the matrix Q as a product of elementary * reflectors. The other columns of A(IA:IA+N-1,JA:JA+N-K) are * unchanged. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). TAU is tied to the distributed matrix A. * * T (local output) COMPLEX array, dimension (NB_A,NB_A) * The upper triangular matrix T. * * Y (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_Y,NB_A). On exit, this array * contains the local pieces of the N-by-NB distributed * matrix Y. LLD_Y >= LOCr(IA+N-1). * * IY (global input) INTEGER * The row index in the global array Y indicating the first * row of sub( Y ). * * JY (global input) INTEGER * The column index in the global array Y indicating the * first column of sub( Y ). * * DESCY (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Y. * * WORK (local workspace) COMPLEX array, dimension (NB) * * Further Details * =============== * * The matrix Q is represented as a product of nb elementary reflectors * * Q = H(1) H(2) . . . H(nb). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in * A(ia+i+k:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The elements of the vectors v together form the (n-k+1)-by-nb matrix * V which is needed, with T and Y, to apply the transformation to the * unreduced part of the matrix, using an update of the form: * A(ia:ia+n-1,ja:ja+n-k) := (I-V*T*V')*(A(ia:ia+n-1,ja:ja+n-k)-Y*V'). * * The contents of A(ia:ia+n-1,ja:ja+n-k) on exit are illustrated by the * following example with n = 7, k = 3 and nb = 2: * * ( a h a a a ) * ( a h a a a ) * ( a h a a a ) * ( h h a a a ) * ( v1 h a a a ) * ( v1 v2 a a a ) * ( v1 v2 a a a ) * * where a denotes an element of the original matrix * A(ia:ia+n-1,ja:ja+n-k), h denotes a modified element of the upper * Hessenberg matrix H, and vi denotes an element of the vector * defining H(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL IPROC INTEGER I, IACOL, IAROW, ICTXT, IOFF, II, J, JJ, JL, $ JT, JW, L, MYROW, MYCOL, NPCOL, NPROW, NQ COMPLEX EI * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CAXPY, CCOPY, CSCAL, $ CTRMV, DESCSET, INFOG2L, PCELSET, $ PCGEMV, PCLACGV, PCLARFG, PCSCAL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.1 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA+K, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) * IPROC = ( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) NQ = NUMROC( N+JA-1, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - IOFF * EI = ZERO JW = IOFF + 1 CALL DESCSET( DESCW, 1, DESCA( MB_ ), 1, DESCA( MB_ ), IAROW, $ IACOL, ICTXT, 1 ) * DO 10 L = 1, NB I = IA + K + L - 2 J = JA + L - 1 * IF( L.GT.1 ) THEN * * Update A(ia:ia+n-1,j) * * Compute i-th column of A - Y * V' * CALL PCLACGV( L-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCGEMV( 'No transpose', N, L-1, -ONE, Y, IY, JY, DESCY, $ A, I, JA, DESCA, DESCA( M_ ), ONE, A, IA, J, $ DESCA, 1 ) CALL PCLACGV( L-1, A, I, JA, DESCA, DESCA( M_ ) ) * * Apply I - V * T' * V' to this column (call it b) from the * left, using the last column of T as workspace * * Let V = ( V1 ) and b = ( b1 ) (first I-1 rows) * ( V2 ) ( b2 ) * * where V1 is unit lower triangular * * w := V1' * b1 * IF( IPROC ) THEN CALL CCOPY( L-1, A( (JJ+L-2)*DESCA( LLD_ )+II ), 1, $ WORK( JW ), 1 ) CALL CTRMV( 'Lower', 'Conjugate transpose', 'Unit', L-1, $ A( (JJ-1)*DESCA( LLD_ )+II ), DESCA( LLD_ ), $ WORK( JW ), 1 ) END IF * * w := w + V2'*b2 * CALL PCGEMV( 'Conjugate transpose', N-K-L+1, L-1, ONE, A, $ I+1, JA, DESCA, A, I+1, J, DESCA, 1, ONE, WORK, $ 1, JW, DESCW, DESCW( M_ ) ) * * w := T'*w * IF( IPROC ) $ CALL CTRMV( 'Upper', 'Conjugate transpose', 'Non-unit', $ L-1, T, DESCA( NB_ ), WORK( JW ), 1 ) * * b2 := b2 - V2*w * CALL PCGEMV( 'No transpose', N-K-L+1, L-1, -ONE, A, I+1, JA, $ DESCA, WORK, 1, JW, DESCW, DESCW( M_ ), ONE, $ A, I+1, J, DESCA, 1 ) * * b1 := b1 - V1*w * IF( IPROC ) THEN CALL CTRMV( 'Lower', 'No transpose', 'Unit', L-1, $ A( (JJ-1)*DESCA( LLD_ )+II ), DESCA( LLD_ ), $ WORK( JW ), 1 ) CALL CAXPY( L-1, -ONE, WORK( JW ), 1, $ A( ( JJ+L-2 )*DESCA( LLD_ )+II ), 1 ) END IF CALL PCELSET( A, I, J-1, DESCA, EI ) END IF * * Generate the elementary reflector H(i) to annihilate * A(ia+k+i:ia+n-1,j) * CALL PCLARFG( N-K-L+1, EI, I+1, J, A, MIN( I+2, N+IA-1 ), J, $ DESCA, 1, TAU ) CALL PCELSET( A, I+1, J, DESCA, ONE ) * * Compute Y(iy:y+n-1,jy+l-1) * CALL PCGEMV( 'No transpose', N, N-K-L+1, ONE, A, IA, J+1, $ DESCA, A, I+1, J, DESCA, 1, ZERO, Y, IY, JY+L-1, $ DESCY, 1 ) CALL PCGEMV( 'Conjugate transpose', N-K-L+1, L-1, ONE, A, I+1, $ JA, DESCA, A, I+1, J, DESCA, 1, ZERO, WORK, 1, JW, $ DESCW, DESCW( M_ ) ) CALL PCGEMV( 'No transpose', N, L-1, -ONE, Y, IY, JY, DESCY, $ WORK, 1, JW, DESCW, DESCW( M_ ), ONE, Y, IY, $ JY+L-1, DESCY, 1 ) JL = MIN( JJ+L-1, JA+NQ-1 ) CALL PCSCAL( N, TAU( JL ), Y, IY, JY+L-1, DESCY, 1 ) * * Compute T(1:i,i) * IF( IPROC ) THEN JT = ( L-1 ) * DESCA( NB_ ) CALL CSCAL( L-1, -TAU( JL ), WORK( JW ), 1 ) CALL CCOPY( L-1, WORK( JW ), 1, T( JT+1 ), 1 ) CALL CTRMV( 'Upper', 'No transpose', 'Non-unit', L-1, T, $ DESCA( NB_ ), T( JT+1 ), 1 ) T( JT+L ) = TAU( JL ) END IF 10 CONTINUE * CALL PCELSET( A, K+NB+IA-1, J, DESCA, EI ) * RETURN * * End of PCLAHRD * END scalapack-2.0.2/SRC/pclamr1d.f000644 000766 000024 00000010671 10363532303 016215 0ustar00juliestaff000000 000000 SUBROUTINE PCLAMR1D( N, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IA, IB, JA, JB, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ) * .. * * Bugs * ==== * * I am not sure that this works correctly when IB and JB are not equal * to 1. Indeed, I suspect that IB should always be set to 1 or ignored * with 1 used in its place. * * PCLAMR1D has not been tested except withint the contect of * PCHEPTRD, the prototype reduction to tridiagonal form code. * * Purpose * * ======= * * PCLAMR1D redistributes a one-dimensional row vector from one data * decomposition to another. * * This is an auxiliary routine called by PCHETRD to redistribute D, E * and TAU. * * Notes * ===== * * Although all processes call PCGEMR2D, only the processes that own * the first column of A send data and only processes that own the * first column of B receive data. The calls to CGEBS2D/CGEBR2D * spread the data down. * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix to be transposed. * * A (local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LOCc(JA+N-1)). * On output, A is replicated across all processes in * this processor column. * * IA (global input) INTEGER * A's global row index, which points to the beginning of * the submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LOCc(JB+N-1)). * * IB (global input) INTEGER * B's global row index, NOT USED * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace) COMPLEX*16 array, dimension ( LWORK ) * * LWORK (local input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCAA( DLEN_ ), DESCBB( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, PCGEMR2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return if possible * IF( N.LE.0 ) $ RETURN * DO 10 I = 1, DLEN_ DESCAA( I ) = DESCA( I ) DESCBB( I ) = DESCB( I ) 10 CONTINUE * DESCAA( M_ ) = 1 DESCBB( M_ ) = 1 DESCAA( LLD_ ) = 1 DESCBB( LLD_ ) = 1 * ICTXT = DESCB( CTXT_ ) CALL PCGEMR2D( 1, N, A, IA, JA, DESCAA, B, IB, JB, DESCBB, ICTXT ) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NQ = NUMROC( N, DESCB( NB_ ), MYCOL, 0, NPCOL ) * IF( MYROW.EQ.0 ) THEN CALL CGEBS2D( ICTXT, 'C', ' ', NQ, 1, B, NQ ) ELSE CALL CGEBR2D( ICTXT, 'C', ' ', NQ, 1, B, NQ, 0, MYCOL ) END IF * RETURN * * End of PCLAMR1D * END scalapack-2.0.2/SRC/pclange.f000644 000766 000024 00000026647 10363532303 016135 0ustar00juliestaff000000 000000 REAL FUNCTION PCLANGE( NORM, M, N, A, IA, JA, DESCA, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL WORK( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLANGE returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * distributed matrix sub( A ) = A(IA:IA+M-1, JA:JA+N-1). * * PCLANGE returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+M-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PCLANGE as described * above. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). When M = 0, PCLANGE * is set to zero. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). When N = 0, * PCLANGE is set to zero. N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, ICOFF, IOFFA, $ IROFF, J, JJ, LDA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ REAL SCALE, SUM, VALUE * .. * .. Local Arrays .. REAL RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLASSQ, INFOG2L, SCOMBSSQ, $ SGEBR2D, SGEBS2D, SGAMX2D, SGSUM2D, $ PSTREECOMB * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, NUMROC EXTERNAL LSAME, ISAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * IF( MIN( M, N ).EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO IF( NQ.GT.0 .AND. MP.GT.0 ) THEN IOFFA = (JJ-1)*LDA DO 20 J = JJ, JJ+NQ-1 DO 10 I = II, MP+II-1 VALUE = MAX( VALUE, ABS( A( IOFFA+I ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE END IF CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, J, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * * Find norm1( sub( A ) ). * IF( NQ.GT.0 ) THEN IOFFA = ( JJ - 1 ) * LDA DO 40 J = JJ, JJ+NQ-1 SUM = ZERO IF( MP.GT.0 ) THEN DO 30 I = II, MP+II-1 SUM = SUM + ABS( A( IOFFA+I ) ) 30 CONTINUE END IF IOFFA = IOFFA + LDA WORK( J-JJ+1 ) = SUM 40 CONTINUE END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( ISAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, J, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * * Find normI( sub( A ) ). * IF( MP.GT.0 ) THEN IOFFA = II + ( JJ - 1 ) * LDA DO 60 I = II, II+MP-1 SUM = ZERO IF( NQ.GT.0 ) THEN DO 50 J = IOFFA, IOFFA + NQ*LDA - 1, LDA SUM = SUM + ABS( A( J ) ) 50 CONTINUE END IF WORK( I-II+1 ) = SUM IOFFA = IOFFA + 1 60 CONTINUE END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, WORK, MAX( 1, MP ), $ MYROW, 0 ) * * Find maximum sum of rows for supnorm * IF( MYCOL.EQ.0 ) THEN IF( MP.GT.0 ) THEN VALUE = WORK( ISAMAX( MP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, I, $ J, -1, 0, 0 ) END IF * ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE IOFFA = II + ( JJ - 1 ) * LDA IF( NQ.GT.0 ) THEN DO 70 J = IOFFA, IOFFA + NQ*LDA - 1, LDA CALL CLASSQ( MP, A( J ), 1, SCALE, SUM ) 70 CONTINUE END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PSTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, SCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PCLANGE = VALUE * RETURN * * End of PCLANGE * END scalapack-2.0.2/SRC/pclanhe.f000644 000766 000024 00000102437 10363532303 016126 0ustar00juliestaff000000 000000 REAL FUNCTION PCLANHE( NORM, UPLO, N, A, IA, JA, $ DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL WORK( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLANHE returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * complex hermitian distributed matrix sub(A) = A(IA:IA+N-1,JA:JA+N-1). * * PCLANHE returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PCLANHE as described * above. * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * hermitian matrix sub( A ) is to be referenced. * = 'U': Upper triangular part of sub( A ) is referenced, * = 'L': Lower triangular part of sub( A ) is referenced. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PCLANHE is set to zero. N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the hermitian distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular matrix which norm is * to be computed, and the strictly lower triangular part of * this matrix is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular matrix which norm is to be computed, and the * strictly upper triangular part of sub( A ) is not referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * 2*Nq0+Np0+LDW if NORM = '1', 'O', 'o', 'I' or 'i', * where LDW is given by: * IF( NPROW.NE.NPCOL ) THEN * LDW = MB_A*CEIL(CEIL(Np0/MB_A)/(LCM/NPROW)) * ELSE * LDW = 0 * END IF * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * * where LCM is the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) and CEIL denotes the ceiling * operation (ICEIL). * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * ICEIL, ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IAROW, IACOL, IB, ICOFF, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, IROFF, ICSR, ICSR0, $ IOFFA, IRSC, IRSC0, IRSR, IRSR0, JJ, JJA, K, $ LDA, LL, MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL ABSA, SCALE, SUM, VALUE * .. * .. Local Arrays .. REAL RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLASSQ, PSCOL2ROW, $ PSTREECOMB, SAXPY, SCOMBSSQ, $ SGAMX2D, SGSUM2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ISAMAX, NUMROC EXTERNAL ICEIL, ISAMAX, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * * Get grid parameters and local indexes. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) ICSR = 1 IRSR = ICSR + NQ IRSC = IRSR + NQ IF( MYROW.EQ.IAROW ) THEN IRSC0 = IRSC + IROFF NP = NP - IROFF ELSE IRSC0 = IRSC END IF IF( MYCOL.EQ.IACOL ) THEN ICSR0 = ICSR + ICOFF IRSR0 = IRSR + ICOFF NQ = NQ - ICOFF ELSE ICSR0 = ICSR IRSR0 = IRSR END IF IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+N-1 ) LDA = DESCA( LLD_ ) * * If the matrix is Hermitian, we address only a triangular portion * of the matrix. A sum of row (column) i of the complete matrix * can be obtained by adding along row i and column i of the the * triangular matrix, stopping/starting at the diagonal, which is * the point of reflection. The pictures below demonstrate this. * In the following code, the row sums created by --- rows below are * refered to as ROWSUMS, and the column sums shown by | are refered * to as COLSUMS. Infinity-norm = 1-norm = ROWSUMS+COLSUMS. * * UPLO = 'U' UPLO = 'L' * ____i______ ___________ * |\ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \|------| i i|---\ | * | \ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * |__________\| |___|______\| * i * * II, JJ : local indices into array A * ICURROW : process row containing diagonal block * ICURCOL : process column containing diagonal block * IRSC0 : pointer to part of work used to store the ROWSUMS while * they are stored along a process column * IRSR0 : pointer to part of work used to store the ROWSUMS after * they have been transposed to be along a process row * II = IIA JJ = JJA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 20 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 10 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 10 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 20 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 40 K = II, II+IB-1 IF( MYCOL.EQ.IACOL ) THEN IF( JJ.LE.JJA+NQ-1 ) THEN VALUE = MAX( VALUE, $ ABS( REAL( A( K+(JJ-1)*LDA ) ) ) ) DO 30 LL = JJ*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 30 CONTINUE END IF ELSE IF( JJ.LE.JJA+NQ-1 ) THEN DO 35 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 35 CONTINUE END IF END IF IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 40 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining rows/columns of the matrix. * DO 90 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 60 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 50 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 50 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 60 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 80 K = II, II+IB-1 IF( MYCOL.EQ.ICURCOL ) THEN IF( JJ.LE.JJA+NQ-1 ) THEN VALUE = MAX( VALUE, $ ABS( REAL( A( K+(JJ-1)*LDA ) ) ) ) DO 70 LL = JJ*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 70 CONTINUE END IF ELSE IF( JJ.LE.JJA+NQ-1 ) THEN DO 75 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 75 CONTINUE END IF END IF IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 80 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 90 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 110 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( MYROW.EQ.IAROW ) THEN IF( II.LE.IIA+NP-1 ) THEN VALUE = MAX( VALUE, ABS( REAL( A( II+K ) ) ) ) DO 100 LL = II+1, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 100 CONTINUE END IF ELSE IF( II.LE.IIA+NP-1 ) THEN DO 105 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 105 CONTINUE END IF END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 110 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 130 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 120 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 120 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 130 CONTINUE ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 180 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 150 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( MYROW.EQ.ICURROW ) THEN IF( II.LE.IIA+NP-1 ) THEN VALUE = MAX( VALUE, $ ABS( REAL( A( II+K ) ) ) ) DO 140 LL = II+1, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 140 CONTINUE END IF ELSE IF( II.LE.IIA+NP-1 ) THEN DO 145 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 145 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 150 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 170 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 160 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 160 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 170 CONTINUE ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * END IF * * Gather the result on process (IAROW,IACOL). * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, K, -1, $ IAROW, IACOL ) * ELSE IF( LSAME( NORM, 'I' ) .OR. LSAME( NORM, 'O' ) .OR. $ NORM.EQ.'1' ) THEN * * Find normI( sub( A ) ) ( = norm1( sub( A ) ), since sub( A ) is * hermitian). * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 200 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 190 LL = IIA, II-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 190 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 200 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 220 K = II, II+IB-1 SUM = ZERO IF( MYCOL.EQ.IACOL ) THEN IF( JJA+NQ.GT.JJ ) THEN SUM = ABS( REAL( A( K+(JJ-1)*LDA ) ) ) DO 210 LL = JJ*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 210 CONTINUE END IF ELSE IF( JJA+NQ.GT.JJ ) THEN DO 215 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 215 CONTINUE END IF END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 220 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining rows/columns of global matrix. * DO 270 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 240 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 230 LL = IIA, II-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 230 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 240 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 260 K = II, II+IB-1 SUM = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( JJA+NQ.GT.JJ ) THEN SUM = ABS( REAL( A( K+(JJ-1)*LDA ) ) ) DO 250 LL = JJ*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 250 CONTINUE END IF ELSE IF( JJA+NQ.GT.JJ ) THEN DO 255 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 255 CONTINUE END IF END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 260 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 270 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = (JJ-1)*LDA DO 290 K = 0, IB-1 SUM = ZERO IF( MYROW.EQ.IAROW ) THEN IF( IIA+NP.GT.II ) THEN SUM = ABS( REAL( A( IOFFA+II ) ) ) DO 280 LL = II+1, IIA+NP-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 280 CONTINUE END IF ELSE DO 285 LL = II, IIA+NP-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 285 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 290 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 310 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 300 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 300 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 310 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 360 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 330 K = 0, IB-1 SUM = ZERO IF( MYROW.EQ.ICURROW ) THEN IF( IIA+NP.GT.II ) THEN SUM = ABS( REAL( A( II+IOFFA ) ) ) DO 320 LL = II+1, IIA+NP-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 320 CONTINUE ELSE IF( II.EQ.IIA+NP-1 ) THEN SUM = ABS( REAL( A( II+IOFFA ) ) ) END IF ELSE DO 325 LL = II, IIA+NP-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 325 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 330 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 350 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 340 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 340 CONTINUE END IF WORK(K-IIA+IRSC0) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 350 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 360 CONTINUE END IF * * After calls to SGSUM2D, process row 0 will have global * COLSUMS and process column 0 will have global ROWSUMS. * Transpose ROWSUMS and add to COLSUMS to get global row/column * sum, the max of which is the infinity or 1 norm. * IF( MYCOL.EQ.IACOL ) $ NQ = NQ + ICOFF CALL SGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK( ICSR ), 1, $ IAROW, MYCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP + IROFF CALL SGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IRSC ), $ MAX( 1, NP ), MYROW, IACOL ) * CALL PSCOL2ROW( ICTXT, N, 1, DESCA( MB_ ), WORK( IRSC ), $ MAX( 1, NP ), WORK( IRSR ), MAX( 1, NQ ), $ IAROW, IACOL, IAROW, IACOL, WORK( IRSC+NP ) ) * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF CALL SAXPY( NQ, ONE, WORK( IRSR0 ), 1, WORK( ICSR0 ), 1 ) IF( NQ.LT.1 ) THEN VALUE = ZERO ELSE VALUE = WORK( ISAMAX( NQ, WORK( ICSR0 ), 1 ) ) END IF CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, K, $ -1, IAROW, IACOL ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE * * Add off-diagonal entries, first * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 370 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL CLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) CALL CLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) THEN IF( REAL( A( II+K ) ).NE.ZERO ) THEN ABSA = ABS( REAL( A( II+K ) ) ) IF( SCALE.LT.ABSA ) THEN SUM = ONE + SUM * ( SCALE / ABSA )**2 SCALE = ABSA ELSE SUM = SUM + ( ABSA / SCALE )**2 END IF END IF II = II + 1 END IF 370 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 390 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 380 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL CLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) CALL CLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) THEN IF( REAL( A( II+K ) ).NE.ZERO ) THEN ABSA = ABS( REAL( A( II+K ) ) ) IF( SCALE.LT.ABSA ) THEN SUM = ONE + SUM*( SCALE / ABSA )**2 SCALE = ABSA ELSE SUM = SUM + ( ABSA / SCALE )**2 END IF END IF II = II + 1 END IF 380 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 400 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( MYROW.EQ.IAROW ) THEN IF( REAL( A( II+K ) ).NE.ZERO ) THEN ABSA = ABS( REAL( A( II+K ) ) ) IF( SCALE.LT.ABSA ) THEN SUM = ONE + SUM * ( SCALE / ABSA )**2 SCALE = ABSA ELSE SUM = SUM + ( ABSA / SCALE )**2 END IF END IF II = II + 1 END IF CALL CLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) CALL CLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 400 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 420 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 410 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( MYROW.EQ.ICURROW ) THEN IF( REAL( A( II+K ) ).NE.ZERO ) THEN ABSA = ABS( REAL( A( II+K ) ) ) IF( SCALE.LT.ABSA ) THEN SUM = ONE + SUM * ( SCALE / ABSA )**2 SCALE = ABSA ELSE SUM = SUM + ( ABSA / SCALE )**2 END IF END IF II = II + 1 END IF CALL CLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) CALL CLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 410 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 420 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM * CALL PSTREECOMB( ICTXT, 'All', 2, RWORK, IAROW, IACOL, $ SCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to the other processes * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, IAROW, $ IACOL ) END IF * PCLANHE = VALUE * RETURN * * End of PCLANHE * END scalapack-2.0.2/SRC/pclanhs.f000644 000766 000024 00000062506 10363532303 016146 0ustar00juliestaff000000 000000 REAL FUNCTION PCLANHS( NORM, N, A, IA, JA, DESCA, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL WORK( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLANHS returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * Hessenberg distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * PCLANHS returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PCLANHS as described * above. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PCLANHS is set to zero. N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, II, IIA, ICOFF, INXTROW, $ IOFFA, IROFF, J, JB, JJ, JJA, JN, KK, LDA, LL, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL SCALE, SUM, VALUE * .. * .. Local Arrays .. REAL RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLASSQ, INFOG2L, PSTREECOMB, $ SCOMBSSQ, SGEBR2D, SGEBS2D, $ SGAMX2D, SGSUM2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ISAMAX, NUMROC EXTERNAL LSAME, ICEIL, ISAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) IOFFA = ( JJA - 1 ) * LDA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * VALUE = ZERO * * Find max(abs(A(i,j))). * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 20 LL = JJ, JJ+JB-1 DO 10 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 50 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 40 LL = JJ, JJ+JB-1 DO 30 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 50 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 70 LL = JJ, JJ + JB -1 DO 60 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 60 CONTINUE IOFFA = IOFFA + LDA 70 CONTINUE ELSE DO 90 LL = JJ, JJ+JB-1 DO 80 KK = IIA, MIN( II-1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ VALUE = MAX( VALUE, ABS( A( II+(JJ+JB-2)*LDA ) ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 140 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 110 LL = JJ, JJ + JB -1 DO 100 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE ELSE DO 130 LL = JJ, JJ + JB -1 DO 120 KK = IIA, MIN( II-1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 120 CONTINUE IOFFA = IOFFA + LDA 130 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ VALUE = MAX( VALUE, $ ABS( A( II+(JJ+JB-2)*LDA ) ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 140 CONTINUE * END IF * * Gather the intermediate results to process (0,0). * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, KK, LL, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * VALUE = ZERO II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 160 LL = JJ, JJ+JB-1 SUM = ZERO DO 150 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 150 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 160 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 190 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 180 LL = JJ, JJ+JB-1 SUM = ZERO DO 170 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 170 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 180 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 190 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 210 LL = JJ, JJ + JB -1 SUM = ZERO DO 200 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 200 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 210 CONTINUE ELSE DO 230 LL = JJ, JJ + JB -1 SUM = ZERO DO 220 KK = IIA, MIN( II-1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 220 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 230 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( JJ+JB-JJA ) = WORK( JJ+JB-JJA ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 280 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 250 LL = JJ, JJ + JB -1 SUM = ZERO DO 240 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 240 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 250 CONTINUE ELSE DO 270 LL = JJ, JJ + JB -1 SUM = ZERO DO 260 KK = IIA, MIN( II-1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 260 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 270 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( JJ+JB-JJA ) = WORK( JJ+JB-JJA ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 280 CONTINUE * END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( ISAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, KK, LL, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * DO 290 KK = IIA, IIA+NP-1 WORK( KK ) = ZERO 290 CONTINUE * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 310 LL = JJ, JJ+JB-1 DO 300 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 300 CONTINUE IOFFA = IOFFA + LDA 310 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 340 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 330 LL = JJ, JJ+JB-1 DO 320 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 320 CONTINUE IOFFA = IOFFA + LDA 330 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 340 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 360 LL = JJ, JJ + JB -1 DO 350 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 350 CONTINUE IOFFA = IOFFA + LDA 360 CONTINUE ELSE DO 380 LL = JJ, JJ + JB -1 DO 370 KK = IIA, MIN( II-1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 370 CONTINUE IOFFA = IOFFA + LDA 380 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( II-IIA+1 ) = WORK( II-IIA+1 ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 430 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 400 LL = JJ, JJ + JB -1 DO 390 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 390 CONTINUE IOFFA = IOFFA + LDA 400 CONTINUE ELSE DO 420 LL = JJ, JJ + JB -1 DO 410 KK = IIA, MIN( II-1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS(A(IOFFA+KK)) 410 CONTINUE IOFFA = IOFFA + LDA 420 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( II-IIA+1 ) = WORK( II-IIA+1 ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 430 CONTINUE * END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK, MAX( 1, NP ), $ MYROW, 0 ) * * Find maximum sum of rows for Infinity-norm * IF( MYCOL.EQ.0 ) THEN IF( NP.GT.0 ) THEN VALUE = WORK( ISAMAX( NP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, KK, $ LL, -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * SCALE = ZERO SUM = ONE II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 440 LL = JJ, JJ+JB-1 CALL CLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 440 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 460 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 450 LL = JJ, JJ+JB-1 CALL CLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 450 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 460 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 470 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 470 CONTINUE ELSE DO 480 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II-1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 480 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ CALL CLASSQ( 1, A( II+(JJ+JB-2)*LDA ), 1, $ SCALE, SUM ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 510 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 490 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 490 CONTINUE ELSE DO 500 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II-1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 500 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ CALL CLASSQ( 1, A( II+(JJ+JB-2)*LDA ), 1, $ SCALE, SUM ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 510 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PSTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, SCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PCLANHS = VALUE * RETURN * * End of PCLANHS * END scalapack-2.0.2/SRC/pclansy.f000644 000766 000024 00000070257 10363532303 016171 0ustar00juliestaff000000 000000 REAL FUNCTION PCLANSY( NORM, UPLO, N, A, IA, JA, $ DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL WORK( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLANSY returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * real symmetric distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * PCLANSY returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PCLANSY as described * above. * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is to be referenced. * = 'U': Upper triangular part of sub( A ) is referenced, * = 'L': Lower triangular part of sub( A ) is referenced. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PCLANSY is set to zero. N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the symmetric distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular matrix which norm is * to be computed, and the strictly lower triangular part of * this matrix is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular matrix which norm is to be computed, and the * strictly upper triangular part of sub( A ) is not referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * 2*Nq0+Np0+LDW if NORM = '1', 'O', 'o', 'I' or 'i', * where LDW is given by: * IF( NPROW.NE.NPCOL ) THEN * LDW = MB_A*CEIL(CEIL(Np0/MB_A)/(LCM/NPROW)) * ELSE * LDW = 0 * END IF * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * * where LCM is the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) and CEIL denotes the ceiling * operation (ICEIL). * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * ICEIL, ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IAROW, IACOL, IB, ICOFF, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, IROFF, ICSR, ICSR0, $ IOFFA, IRSC, IRSC0, IRSR, IRSR0, JJ, JJA, K, $ LDA, LL, MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL SCALE, SUM, VALUE * .. * .. Local Arrays .. REAL RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLASSQ, PSCOL2ROW, $ PSTREECOMB, SAXPY, SCOMBSSQ, $ SGAMX2D, SGSUM2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ISAMAX, NUMROC EXTERNAL ICEIL, ISAMAX, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters and local indexes. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) ICSR = 1 IRSR = ICSR + NQ IRSC = IRSR + NQ IF( MYROW.EQ.IAROW ) THEN IRSC0 = IRSC + IROFF NP = NP - IROFF ELSE IRSC0 = IRSC END IF IF( MYCOL.EQ.IACOL ) THEN ICSR0 = ICSR + ICOFF IRSR0 = IRSR + ICOFF NQ = NQ - ICOFF ELSE ICSR0 = ICSR IRSR0 = IRSR END IF IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+N-1 ) LDA = DESCA( LLD_ ) * * If the matrix is symmetric, we address only a triangular portion * of the matrix. A sum of row (column) i of the complete matrix * can be obtained by adding along row i and column i of the the * triangular matrix, stopping/starting at the diagonal, which is * the point of reflection. The pictures below demonstrate this. * In the following code, the row sums created by --- rows below are * refered to as ROWSUMS, and the column sums shown by | are refered * to as COLSUMS. Infinity-norm = 1-norm = ROWSUMS+COLSUMS. * * UPLO = 'U' UPLO = 'L' * ____i______ ___________ * |\ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \|------| i i|---\ | * | \ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * |__________\| |___|______\| * i * * II, JJ : local indices into array A * ICURROW : process row containing diagonal block * ICURCOL : process column containing diagonal block * IRSC0 : pointer to part of work used to store the ROWSUMS while * they are stored along a process column * IRSR0 : pointer to part of work used to store the ROWSUMS after * they have been transposed to be along a process row * II = IIA JJ = JJA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 20 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 10 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 10 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 20 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 40 K = II, II+IB-1 IF( JJ.LE.JJA+NQ-1 ) THEN DO 30 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 30 CONTINUE END IF IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 40 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining rows/columns of the matrix. * DO 90 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 60 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 50 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 50 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 60 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 80 K = II, II+IB-1 IF( JJ.LE.JJA+NQ-1 ) THEN DO 70 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 70 CONTINUE END IF IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 80 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 90 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 110 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.LE.IIA+NP-1 ) THEN DO 100 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 100 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 110 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 130 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 120 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 120 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 130 CONTINUE ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 180 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 150 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.LE.IIA+NP-1 ) THEN DO 140 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 140 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 150 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 170 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 160 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 160 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 170 CONTINUE ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * END IF * * Gather the result on process (IAROW,IACOL). * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, K, -1, $ IAROW, IACOL ) * ELSE IF( LSAME( NORM, 'I' ) .OR. LSAME( NORM, 'O' ) .OR. $ NORM.EQ.'1' ) THEN * * Find normI( sub( A ) ) ( = norm1( sub( A ) ), since sub( A ) is * symmetric). * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 200 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 190 LL = IIA, II-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 190 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 200 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 220 K = II, II+IB-1 SUM = ZERO IF( JJA+NQ.GT.JJ ) THEN DO 210 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 210 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 220 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining rows/columns of global matrix. * DO 270 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 240 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 230 LL = IIA, II-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 230 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 240 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 260 K = II, II+IB-1 SUM = ZERO IF( JJA+NQ.GT.JJ ) THEN DO 250 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 250 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 260 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 270 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = (JJ-1)*LDA DO 290 K = 0, IB-1 SUM = ZERO IF( IIA+NP.GT.II ) THEN DO 280 LL = II, IIA+NP-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 280 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 290 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 310 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 300 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 300 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 310 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 360 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 330 K = 0, IB-1 SUM = ZERO IF( IIA+NP.GT.II ) THEN DO 320 LL = II, IIA+NP-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 320 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 330 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 350 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 340 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 340 CONTINUE END IF WORK(K-IIA+IRSC0) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 350 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 360 CONTINUE END IF * * After calls to SGSUM2D, process row 0 will have global * COLSUMS and process column 0 will have global ROWSUMS. * Transpose ROWSUMS and add to COLSUMS to get global row/column * sum, the max of which is the infinity or 1 norm. * IF( MYCOL.EQ.IACOL ) $ NQ = NQ + ICOFF CALL SGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK( ICSR ), 1, $ IAROW, MYCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP + IROFF CALL SGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IRSC ), $ MAX( 1, NP ), MYROW, IACOL ) * CALL PSCOL2ROW( ICTXT, N, 1, DESCA( MB_ ), WORK( IRSC ), $ MAX( 1, NP ), WORK( IRSR ), MAX( 1, NQ ), $ IAROW, IACOL, IAROW, IACOL, WORK( IRSC+NP ) ) * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF CALL SAXPY( NQ, ONE, WORK( IRSR0 ), 1, WORK( ICSR0 ), 1 ) IF( NQ.LT.1 ) THEN VALUE = ZERO ELSE VALUE = WORK( ISAMAX( NQ, WORK( ICSR0 ), 1 ) ) END IF CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, K, $ -1, IAROW, IACOL ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE * * Add off-diagonal entries, first * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 370 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL CLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) $ II = II + 1 CALL CLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) 370 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 390 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 380 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL CLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) $ II = II + 1 CALL CLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) 380 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 400 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL CLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) $ II = II + 1 CALL CLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 400 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 420 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 410 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL CLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) $ II = II + 1 CALL CLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 410 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 420 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM * CALL PSTREECOMB( ICTXT, 'All', 2, RWORK, IAROW, IACOL, $ SCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to the other processes * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, IAROW, $ IACOL ) END IF * PCLANSY = VALUE * RETURN * * End of PCLANSY * END scalapack-2.0.2/SRC/pclantr.f000644 000766 000024 00000110627 10363532303 016157 0ustar00juliestaff000000 000000 REAL FUNCTION PCLANTR( NORM, UPLO, DIAG, M, N, A, $ IA, JA, DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL WORK( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLANTR returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * trapezoidal or triangular distributed matrix sub( A ) denoting * A(IA:IA+M-1, JA:JA+N-1). * * PCLANTR returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with ia <= i <= ia+m-1, * ( and ja <= j <= ja+n-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PCLANTR as described * above. * * UPLO (global input) CHARACTER * Specifies whether the matrix sub( A ) is upper or lower * trapezoidal. * = 'U': Upper trapezoidal * = 'L': Lower trapezoidal * Note that sub( A ) is triangular instead of trapezoidal * if M = N. * * DIAG (global input) CHARACTER * Specifies whether or not the distributed matrix sub( A ) has * unit diagonal. * = 'N': Non-unit diagonal * = 'U': Unit diagonal * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). When M = 0, PCLANTR is * set to zero. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). When N = 0, * PCLANTR is set to zero. N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL UDIAG INTEGER IACOL, IAROW, ICTXT, II, IIA, ICOFF, IOFFA, $ IROFF, J, JB, JJ, JJA, JN, KK, LDA, LL, MP, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL SCALE, SUM, VALUE * .. * .. Local Arrays .. REAL RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLASSQ, INFOG2L, PSTREECOMB, $ SCOMBSSQ, SGEBR2D, SGEBS2D, $ SGAMX2D, SGSUM2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ISAMAX, NUMROC EXTERNAL LSAME, ICEIL, ISAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * UDIAG = LSAME( DIAG, 'U' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) IOFFA = ( JJA - 1 ) * LDA * IF( MIN( M, N ).EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * IF( UDIAG ) THEN VALUE = ONE ELSE VALUE = ZERO END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 20 LL = JJ, JJ + JB -1 DO 10 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE ELSE DO 40 LL = JJ, JJ + JB -1 DO 30 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE END IF ELSE DO 60 LL = JJ, JJ + JB -1 DO 50 KK = IIA, MIN( II-1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 80 LL = JJ, JJ + JB -1 DO 70 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 70 CONTINUE IOFFA = IOFFA + LDA 80 CONTINUE ELSE DO 100 LL = JJ, JJ + JB -1 DO 90 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 90 CONTINUE IOFFA = IOFFA + LDA 100 CONTINUE END IF ELSE DO 120 LL = JJ, JJ + JB -1 DO 110 KK = IIA, MIN( II-1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 110 CONTINUE IOFFA = IOFFA + LDA 120 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 130 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 150 LL = JJ, JJ + JB -1 DO 140 KK = II+LL-JJ+1, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 140 CONTINUE IOFFA = IOFFA + LDA 150 CONTINUE ELSE DO 170 LL = JJ, JJ + JB -1 DO 160 KK = II+LL-JJ, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 160 CONTINUE IOFFA = IOFFA + LDA 170 CONTINUE END IF ELSE DO 190 LL = JJ, JJ + JB -1 DO 180 KK = II, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 180 CONTINUE IOFFA = IOFFA + LDA 190 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 260 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 210 LL = JJ, JJ + JB -1 DO 200 KK = II+LL-JJ+1, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 200 CONTINUE IOFFA = IOFFA + LDA 210 CONTINUE ELSE DO 230 LL = JJ, JJ + JB -1 DO 220 KK = II+LL-JJ, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 220 CONTINUE IOFFA = IOFFA + LDA 230 CONTINUE END IF ELSE DO 250 LL = JJ, JJ + JB -1 DO 240 KK = II, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 240 CONTINUE IOFFA = IOFFA + LDA 250 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 260 CONTINUE * END IF * * Gather the intermediate results to process (0,0). * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, KK, LL, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 280 LL = JJ, JJ + JB -1 SUM = ONE DO 270 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 270 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 280 CONTINUE ELSE DO 300 LL = JJ, JJ + JB -1 SUM = ZERO DO 290 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 290 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 300 CONTINUE END IF ELSE DO 320 LL = JJ, JJ + JB -1 SUM = ZERO DO 310 KK = IIA, MIN( II-1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 310 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 320 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 390 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 340 LL = JJ, JJ + JB -1 SUM = ONE DO 330 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 330 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 340 CONTINUE ELSE DO 360 LL = JJ, JJ + JB -1 SUM = ZERO DO 350 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 350 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 360 CONTINUE END IF ELSE DO 380 LL = JJ, JJ + JB -1 SUM = ZERO DO 370 KK = IIA, MIN( II-1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 370 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 380 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 410 LL = JJ, JJ + JB -1 SUM = ONE DO 400 KK = II+LL-JJ+1, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 400 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 410 CONTINUE ELSE DO 430 LL = JJ, JJ + JB -1 SUM = ZERO DO 420 KK = II+LL-JJ, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 420 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 430 CONTINUE END IF ELSE DO 450 LL = JJ, JJ + JB -1 SUM = ZERO DO 440 KK = II, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 440 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 450 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 520 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 470 LL = JJ, JJ + JB -1 SUM = ONE DO 460 KK = II+LL-JJ+1, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 460 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 470 CONTINUE ELSE DO 490 LL = JJ, JJ + JB -1 SUM = ZERO DO 480 KK = II+LL-JJ, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 480 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 490 CONTINUE END IF ELSE DO 510 LL = JJ, JJ + JB -1 SUM = ZERO DO 500 KK = II, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 500 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 510 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 520 CONTINUE * END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( ISAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, KK, LL, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * IF( LSAME( UPLO, 'U' ) ) THEN IF( UDIAG ) THEN DO 530 KK = IIA, IIA+MP-1 WORK( KK ) = ONE 530 CONTINUE ELSE DO 540 KK = IIA, IIA+MP-1 WORK( KK ) = ZERO 540 CONTINUE END IF ELSE IF( UDIAG ) THEN NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF DO 550 KK = IIA, IIA+NP-1 WORK( KK ) = ONE 550 CONTINUE DO 560 KK = IIA+NP, IIA+MP-1 WORK( KK ) = ZERO 560 CONTINUE ELSE DO 570 KK = IIA, IIA+MP-1 WORK( KK ) = ZERO 570 CONTINUE END IF END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 590 LL = JJ, JJ + JB -1 DO 580 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 580 CONTINUE IOFFA = IOFFA + LDA 590 CONTINUE ELSE DO 610 LL = JJ, JJ + JB -1 DO 600 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 600 CONTINUE IOFFA = IOFFA + LDA 610 CONTINUE END IF ELSE DO 630 LL = JJ, JJ + JB -1 DO 620 KK = IIA, MIN( II-1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 620 CONTINUE IOFFA = IOFFA + LDA 630 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 700 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 650 LL = JJ, JJ + JB -1 DO 640 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 640 CONTINUE IOFFA = IOFFA + LDA 650 CONTINUE ELSE DO 670 LL = JJ, JJ + JB -1 DO 660 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 660 CONTINUE IOFFA = IOFFA + LDA 670 CONTINUE END IF ELSE DO 690 LL = JJ, JJ + JB -1 DO 680 KK = IIA, MIN( II-1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 680 CONTINUE IOFFA = IOFFA + LDA 690 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 700 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 720 LL = JJ, JJ + JB -1 DO 710 KK = II+LL-JJ+1, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 710 CONTINUE IOFFA = IOFFA + LDA 720 CONTINUE ELSE DO 740 LL = JJ, JJ + JB -1 DO 730 KK = II+LL-JJ, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 730 CONTINUE IOFFA = IOFFA + LDA 740 CONTINUE END IF ELSE DO 760 LL = JJ, JJ + JB -1 DO 750 KK = II, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 750 CONTINUE IOFFA = IOFFA + LDA 760 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 830 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 780 LL = JJ, JJ + JB -1 DO 770 KK = II+LL-JJ+1, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 770 CONTINUE IOFFA = IOFFA + LDA 780 CONTINUE ELSE DO 800 LL = JJ, JJ + JB -1 DO 790 KK = II+LL-JJ, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 790 CONTINUE IOFFA = IOFFA + LDA 800 CONTINUE END IF ELSE DO 820 LL = JJ, JJ + JB -1 DO 810 KK = II, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 810 CONTINUE IOFFA = IOFFA + LDA 820 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 830 CONTINUE * END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, WORK, MAX( 1, MP ), $ MYROW, 0 ) * * Find maximum sum of rows for Infinity-norm * IF( MYCOL.EQ.0 ) THEN IF( MP.GT.0 ) THEN VALUE = WORK( ISAMAX( MP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, KK, $ LL, -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * IF( UDIAG ) THEN SCALE = ONE SUM = REAL( MIN( M, N ) ) / REAL( NPROW*NPCOL ) ELSE SCALE = ZERO SUM = ONE END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 840 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II+LL-JJ, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 840 CONTINUE ELSE DO 850 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II+LL-JJ+1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 850 CONTINUE END IF ELSE DO 860 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II-1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 860 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 900 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 870 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II+LL-JJ+1, IIA+MP-1 )- $ IIA+1, A( IIA+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 870 CONTINUE ELSE DO 880 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II+LL-JJ, IIA+MP-1 )- $ IIA+1, A( IIA+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 880 CONTINUE END IF ELSE DO 890 LL = JJ, JJ + JB -1 CALL CLASSQ( MIN( II-1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 890 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 900 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 910 LL = JJ, JJ + JB -1 CALL CLASSQ( IIA+MP-(II+LL-JJ+1), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 910 CONTINUE ELSE DO 920 LL = JJ, JJ + JB -1 CALL CLASSQ( IIA+MP-(II+LL-JJ), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 920 CONTINUE END IF ELSE DO 930 LL = JJ, JJ + JB -1 CALL CLASSQ( IIA+MP-II, A( II+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 930 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 970 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 940 LL = JJ, JJ + JB -1 CALL CLASSQ( IIA+MP-(II+LL-JJ+1), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 940 CONTINUE ELSE DO 950 LL = JJ, JJ + JB -1 CALL CLASSQ( IIA+MP-(II+LL-JJ), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 950 CONTINUE END IF ELSE DO 960 LL = JJ, JJ + JB -1 CALL CLASSQ( IIA+MP-II, A( II+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 960 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 970 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PSTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, SCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to every process in the grid. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PCLANTR = VALUE * RETURN * * End of PCLANTR * END scalapack-2.0.2/SRC/pclapiv.f000644 000766 000024 00000033644 10363532303 016155 0ustar00juliestaff000000 000000 SUBROUTINE PCLAPIV( DIREC, ROWCOL, PIVROC, M, N, A, IA, JA, $ DESCA, IPIV, IP, JP, DESCIP, IWORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER*1 DIREC, PIVROC, ROWCOL INTEGER IA, IP, JA, JP, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCIP( * ), IPIV( * ), IWORK( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLAPIV applies either P (permutation matrix indicated by IPIV) * or inv( P ) to a general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1), resulting in row or column * pivoting. The pivot vector may be distributed across a process row * or a column. The pivot vector should be aligned with the distributed * matrix A. This routine will transpose the pivot vector if necessary. * For example if the row pivots should be applied to the columns of * sub( A ), pass ROWCOL='C' and PIVROC='C'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Restrictions * ============ * * IPIV must always be a distributed vector (not a matrix). Thus: * IF( ROWPIV .EQ. 'C' ) THEN * JP must be 1 * ELSE * IP must be 1 * END IF * * The following restrictions apply when IPIV must be transposed: * IF( ROWPIV.EQ.'C' .AND. PIVROC.EQ.'C') THEN * DESCIP(MB_) must equal DESCA(NB_) * ELSE IF( ROWPIV.EQ.'R" .AND. PIVROC.EQ.'R') THEN * DESCIP(NB_) must equal DESCA(MB_) * END IF * * Arguments * ========= * * DIREC (global input) CHARACTER*1 * Specifies in which order the permutation is applied: * = 'F' (Forward) Applies pivots Forward from top of matrix. * Computes P*sub( A ). * = 'B' (Backward) Applies pivots Backward from bottom of * matrix. Computes inv( P )*sub( A ). * * ROWCOL (global input) CHARACTER*1 * Specifies if the rows or columns are to be permuted: * = 'R' Rows will be permuted, * = 'C' Columns will be permuted. * * PIVROC (global input) CHARACTER*1 * Specifies whether IPIV is distributed over a process row * or column: * = 'R' IPIV distributed over a process row * = 'C' IPIV distributed over a process column * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of * rows of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * distributed submatrix sub( A ) to which the row or column * interchanges will be applied. On exit, the local pieces * of the permuted distributed submatrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension (LIPIV) where LIPIV is * when ROWCOL='R' or 'r': * >= LOCr( IA+M-1 ) + MB_A if PIVROC='C' or 'c', * >= LOCc( M + MOD(JP-1,NB_P) ) if PIVROC='R' or 'r', and, * when ROWCOL='C' or 'c': * >= LOCr( N + MOD(IP-1,MB_P) ) if PIVROC='C' or 'c', * >= LOCc( JA+N-1 ) + NB_A if PIVROC='R' or 'r'. * This array contains the pivoting information. IPIV(i) is the * global row (column), local row (column) i was swapped with. * When ROWCOL='R' or 'r' and PIVROC='C' or 'c', or ROWCOL='C' * or 'c' and PIVROC='R' or 'r', the last piece of this array of * size MB_A (resp. NB_A) is used as workspace. In those cases, * this array is tied to the distributed matrix A. * * IP (global input) INTEGER * The row index in the global array P indicating the first * row of sub( P ). * * JP (global input) INTEGER * The column index in the global array P indicating the * first column of sub( P ). * * DESCIP (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed vector IPIV. * * IWORK (local workspace) INTEGER array, dimension (LDW) * where LDW is equal to the workspace necessary for * transposition, and the storage of the tranposed IPIV: * * Let LCM be the least common multiple of NPROW and NPCOL. * IF( ROWCOL.EQ.'R' .AND. PIVROC.EQ.'R' ) THEN * IF( NPROW.EQ.NPCOL ) THEN * LDW = LOCr( N_P + MOD(JP-1, NB_P) ) + NB_P * ELSE * LDW = LOCr( N_P + MOD(JP-1, NB_P) ) + * NB_P * CEIL( CEIL(LOCc(N_P)/NB_P) / (LCM/NPCOL) ) * END IF * ELSE IF( ROWCOL.EQ.'C' .AND. PIVROC.EQ.'C' ) THEN * IF( NPROW.EQ.NPCOL ) THEN * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + MB_P * ELSE * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + * MB_P * CEIL( CEIL(LOCr(M_P)/MB_P) / (LCM/NPROW) ) * END IF * ELSE * IWORK is not referenced. * END IF * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL ROWPVT INTEGER I, ICTXT, ICURCOL, ICURROW, IIP, ITMP, IPT, $ JJP, JPT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCPT( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, $ INFOG2L, PCLAPV2, PICOL2ROW, PIROW2COL * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC, INDXG2P EXTERNAL LSAME, NUMROC, INDXG2P * .. * .. Intrinsic Functions .. INTRINSIC MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) ROWPVT = LSAME( ROWCOL, 'R' ) * * If we're pivoting the rows of sub( A ) * IF( ROWPVT ) THEN IF( M.LE.1 .OR. N.LT.1 ) $ RETURN * * If the pivot vector is already distributed correctly * IF( LSAME( PIVROC, 'C' ) ) THEN CALL PCLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * Otherwise, we must redistribute IPIV to match PCLAPV2 * ELSE * * Take IPIV distributed over row 0, and store it in * iwork, distributed over column 0 * IPT = MOD( JP-1, DESCA(MB_) ) DESCPT(M_) = M + IPT + NPROW*DESCA(MB_) DESCPT(N_) = 1 DESCPT(MB_) = DESCA(MB_) DESCPT(NB_) = 1 DESCPT(RSRC_) = INDXG2P( IA, DESCA(MB_), IA, DESCA(RSRC_), $ NPROW ) DESCPT(CSRC_) = MYCOL DESCPT(CTXT_) = ICTXT DESCPT(LLD_) = NUMROC( DESCPT(M_), DESCPT(MB_), MYROW, $ DESCPT(RSRC_), NPROW ) ITMP = NUMROC( DESCIP(N_), DESCIP(NB_), MYCOL, $ DESCIP(CSRC_), NPCOL ) CALL INFOG2L( IP, JP-IPT, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) CALL PIROW2COL( ICTXT, M+IPT, 1, DESCIP(NB_), IPIV(JJP), $ ITMP, IWORK, DESCPT(LLD_), 0, ICURCOL, $ DESCPT(RSRC_), $ MYCOL, IWORK(DESCPT(LLD_)-DESCPT(MB_)+1) ) * * Send column-distributed pivots to all columns * ITMP = DESCPT(LLD_) - DESCPT(MB_) IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'Row', ' ', ITMP, 1, IWORK, ITMP ) ELSE CALL IGEBR2D( ICTXT, 'Row', ' ', ITMP, 1, IWORK, ITMP, $ MYROW, 0 ) END IF * * Adjust pivots so they are relative to the start of IWORK, * not IPIV * IPT = IPT + 1 DO 10 I = 1, ITMP IWORK(I) = IWORK(I) - JP + IPT 10 CONTINUE CALL PCLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IWORK, $ IPT, 1, DESCPT ) END IF * * Otherwise, we're pivoting the columns of sub( A ) * ELSE IF( M.LT.1 .OR. N.LE.1 ) $ RETURN * * If the pivot vector is already distributed correctly * IF( LSAME( PIVROC, 'R' ) ) THEN CALL PCLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * Otherwise, we must redistribute IPIV to match PCLAPV2 * ELSE * * Take IPIV distributed over column 0, and store it in * iwork, distributed over row 0 * JPT = MOD( IP-1, DESCA(NB_) ) DESCPT(M_) = 1 DESCPT(N_) = N + JPT + NPCOL*DESCA(NB_) DESCPT(MB_) = 1 DESCPT(NB_) = DESCA(NB_) DESCPT(RSRC_) = MYROW DESCPT(CSRC_) = INDXG2P( JA, DESCA(NB_), JA, DESCA(CSRC_), $ NPCOL ) DESCPT(CTXT_) = ICTXT DESCPT(LLD_) = 1 CALL INFOG2L( IP-JPT, JP, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) ITMP = NUMROC( N+JPT, DESCPT(NB_), MYCOL, DESCPT(CSRC_), $ NPCOL ) CALL PICOL2ROW( ICTXT, N+JPT, 1, DESCIP(MB_), IPIV(IIP), $ DESCIP(LLD_), IWORK, MAX(1, ITMP), ICURROW, $ 0, 0, DESCPT(CSRC_), IWORK(ITMP+1) ) * * Send row-distributed pivots to all rows * IF( MYROW.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'Column', ' ', ITMP, 1, IWORK, $ ITMP ) ELSE CALL IGEBR2D( ICTXT, 'Column', ' ', ITMP, 1, IWORK, $ ITMP, 0, MYCOL ) END IF * * Adjust pivots so they are relative to the start of IWORK, * not IPIV * JPT = JPT + 1 DO 20 I = 1, ITMP IWORK(I) = IWORK(I) - IP + JPT 20 CONTINUE CALL PCLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IWORK, $ 1, JPT, DESCPT ) END IF END IF * RETURN * * End of PCLAPIV * END scalapack-2.0.2/SRC/pclapv2.f000644 000766 000024 00000036730 10363532303 016065 0ustar00juliestaff000000 000000 SUBROUTINE PCLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIREC, ROWCOL INTEGER IA, IP, JA, JP, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCIP( * ), IPIV( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLAPV2 applies either P (permutation matrix indicated by IPIV) * or inv( P ) to a M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1), resulting in row or column pivoting. The * pivot vector should be aligned with the distributed matrix A. For * pivoting the rows of sub( A ), IPIV should be distributed along a * process column and replicated over all process rows. Similarly, * IPIV should be distributed along a process row and replicated over * all process columns for column pivoting. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIREC (global input) CHARACTER * Specifies in which order the permutation is applied: * = 'F' (Forward) Applies pivots Forward from top of matrix. * Computes P * sub( A ); * = 'B' (Backward) Applies pivots Backward from bottom of * matrix. Computes inv( P ) * sub( A ). * * ROWCOL (global input) CHARACTER * Specifies if the rows or columns are to be permuted: * = 'R' Rows will be permuted, * = 'C' Columns will be permuted. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this local array contains the local pieces of the * distributed matrix sub( A ) to which the row or columns * interchanges will be applied. On exit, this array contains * the local pieces of the permuted distributed matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (input) INTEGER array, dimension >= LOCr(M_A)+MB_A if * ROWCOL = 'R', LOCc(N_A)+NB_A otherwise. It contains * the pivoting information. IPIV(i) is the global row (column), * local row (column) i was swapped with. The last piece of the * array of size MB_A (resp. NB_A) is used as workspace. IPIV is * tied to the distributed matrix A. * * IP (global input) INTEGER * IPIV's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JP (global input) INTEGER * IPIV's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCIP (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix IPIV. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL FORWRD, ROWPVT INTEGER I, IB, ICTXT, ICURCOL, ICURROW, IIP, IP1, ITMP, $ IPVWRK, J, JB, JJP, JP1, K, MA, MBA, MYCOL, $ MYROW, NBA, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBS2D, IGEBR2D, INFOG2L, $ PCSWAP * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * ROWPVT = LSAME( ROWCOL, 'R' ) IF( ROWPVT ) THEN IF( M.LE.1 .OR. N.LT.1 ) $ RETURN ELSE IF( M.LT.1 .OR. N.LE.1 ) $ RETURN END IF FORWRD = LSAME( DIREC, 'F' ) * * * Get grid and matrix parameters * MA = DESCA( M_ ) MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If I'm applying pivots from beginning to end (e.g., repeating * pivoting done earlier). Thus this section computes P * sub( A ). * IF( FORWRD ) THEN CALL INFOG2L( IP, JP, DESCIP, NPROW, NPCOL, MYROW, MYCOL, $ IIP, JJP, ICURROW, ICURCOL ) * * If I'm pivoting the rows of sub( A ) * IF( ROWPVT ) THEN IPVWRK = NUMROC( DESCIP( M_ ), DESCIP( MB_ ), MYROW, $ DESCIP( RSRC_ ), NPROW ) + 1 - $ DESCIP( MB_ ) * * Loop over rows of sub( A ) * I = IA IB = MIN( M, ICEIL( IA, MBA ) * MBA - IA + 1 ) 10 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process column * IF( MYROW.EQ.ICURROW ) THEN CALL IGEBS2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IIP ), IB ) ITMP = IIP IIP = IIP + IB ELSE ITMP = IPVWRK CALL IGEBR2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( ITMP ), IB, ICURROW, MYCOL ) END IF * * Pivot the block of rows * DO 20 K = I, I+IB-1 IP1 = IPIV( ITMP ) - IP + IA IF( IP1.NE.K ) $ CALL PCSWAP( N, A, K, JA, DESCA, MA, A, IP1, JA, $ DESCA, MA ) ITMP = ITMP + 1 20 CONTINUE * * Go on to next row of processes, increment row counter, * and figure number of rows to pivot next * ICURROW = MOD( ICURROW+1, NPROW ) I = I + IB IB = MIN( MBA, M-I+IA ) IF( IB .GT. 0 ) GOTO 10 * * If I am pivoting the columns of sub( A ) * ELSE IPVWRK = NUMROC( DESCIP( N_ ), DESCIP( NB_ ), MYCOL, $ DESCIP( CSRC_ ), NPCOL ) + 1 - $ DESCIP( NB_ ) * * Loop over columns of sub( A ) * J = JA JB = MIN( N, ICEIL( JA, NBA ) * NBA - JA + 1 ) 30 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process row * IF( MYCOL.EQ.ICURCOL ) THEN CALL IGEBS2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( JJP ), JB ) ITMP = JJP JJP = JJP + JB ELSE ITMP = IPVWRK CALL IGEBR2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( ITMP ), JB, MYROW, ICURCOL ) END IF * * Pivot the block of columns * DO 40 K = J, J+JB-1 JP1 = IPIV( ITMP ) - JP + JA IF( JP1.NE.K ) $ CALL PCSWAP( M, A, IA, K, DESCA, 1, A, IA, JP1, $ DESCA, 1 ) ITMP = ITMP + 1 40 CONTINUE * * Go on to next column of processes, increment column * counter, and figure number of columns to pivot next * ICURCOL = MOD( ICURCOL+1, NPCOL ) J = J + JB JB = MIN( NBA, N-J+JA ) IF( JB .GT. 0 ) GOTO 30 END IF * * If I want to apply pivots in reverse order, i.e. reversing * pivoting done earlier. Thus this section computes * inv( P ) * sub( A ). * ELSE * * If I'm pivoting the rows of sub( A ) * IF( ROWPVT ) THEN CALL INFOG2L( IP+M-1, JP, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) * IPVWRK = NUMROC( DESCIP( M_ ), DESCIP( MB_ ), MYROW, $ DESCIP( RSRC_ ), NPROW ) + 1 - $ DESCIP( MB_ ) * * If I'm not in the current process row, my IIP points out * past end of pivot vector (since I don't own a piece of the * last row). Adjust IIP so it points at last pivot entry. * IF( MYROW.NE.ICURROW ) IIP = IIP - 1 * * Loop over rows in reverse order, starting at last row * I = IA + M - 1 IB = MOD( I, MBA ) IF( IB .EQ. 0 ) IB = MBA IB = MIN( IB, M ) 50 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process column * IF( MYROW.EQ.ICURROW ) THEN ITMP = IIP IIP = IIP - IB CALL IGEBS2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IIP+1 ), IB ) ELSE CALL IGEBR2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IPVWRK ), IB, ICURROW, MYCOL ) ITMP = IPVWRK + IB - 1 END IF * * Pivot the block of rows * DO 60 K = I, I-IB+1, -1 IP1 = IPIV( ITMP ) - IP + IA IF( IP1.NE.K ) $ CALL PCSWAP( N, A, K, JA, DESCA, MA, A, IP1, JA, $ DESCA, MA ) ITMP = ITMP - 1 60 CONTINUE * * Go to previous row of processes, decrement row counter, * and figure number of rows to be pivoted next * ICURROW = MOD( NPROW+ICURROW-1, NPROW ) I = I - IB IB = MIN( MBA, I-IA+1 ) IF( IB .GT. 0 ) GOTO 50 * * Otherwise, I'm pivoting the columns of sub( A ) * ELSE CALL INFOG2L( IP, JP+N-1, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) IPVWRK = NUMROC( DESCIP( N_ ), DESCIP( NB_ ), MYCOL, $ DESCIP( CSRC_ ), NPCOL ) + 1 - $ DESCIP( NB_ ) * * If I'm not in the current process column, my JJP points out * past end of pivot vector (since I don't own a piece of the * last column). Adjust JJP so it points at last pivot entry. * IF( MYCOL.NE.ICURCOL ) JJP = JJP - 1 * * Loop over columns in reverse order starting at last column * J = JA + N - 1 JB = MOD( J, NBA ) IF( JB .EQ. 0 ) JB = NBA JB = MIN( JB, N ) 70 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process row * IF( MYCOL.EQ.ICURCOL ) THEN ITMP = JJP JJP = JJP - JB CALL IGEBS2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( JJP+1 ), JB ) ELSE CALL IGEBR2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( IPVWRK ), JB, MYROW, ICURCOL ) ITMP = IPVWRK + JB - 1 END IF * * Pivot a block of columns * DO 80 K = J, J-JB+1, -1 JP1 = IPIV( ITMP ) - JP + JA IF( JP1.NE.K ) $ CALL PCSWAP( M, A, IA, K, DESCA, 1, A, IA, JP1, $ DESCA, 1 ) ITMP = ITMP - 1 80 CONTINUE * * Go to previous row of processes, decrement row counter, * and figure number of rows to be pivoted next * ICURCOL = MOD( NPCOL+ICURCOL-1, NPCOL ) J = J - JB JB = MIN( NBA, J-JA+1 ) IF( JB .GT. 0 ) GOTO 70 END IF * END IF * RETURN * * End PCLAPV2 * END scalapack-2.0.2/SRC/pclaqge.f000644 000766 000024 00000023311 10363532303 016121 0ustar00juliestaff000000 000000 SUBROUTINE PCLAQGE( M, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, EQUED ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER EQUED INTEGER IA, JA, M, N REAL AMAX, COLCND, ROWCND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL C( * ), R( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLAQGE equilibrates a general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) using the row and scaling * factors in the vectors R and C. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)) * containing on entry the M-by-N matrix sub( A ). On exit, * the equilibrated distributed matrix. See EQUED for the * form of the equilibrated distributed submatrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * R (local input) REAL array, dimension LOCr(M_A) * The row scale factors for sub( A ). R is aligned with the * distributed matrix A, and replicated across every process * column. R is tied to the distributed matrix A. * * C (local input) REAL array, dimension LOCc(N_A) * The column scale factors of sub( A ). C is aligned with the * distributed matrix A, and replicated down every process * row. C is tied to the distributed matrix A. * * ROWCND (global input) REAL * The global ratio of the smallest R(i) to the largest R(i), * IA <= i <= IA+M-1. * * COLCND (global input) REAL * The global ratio of the smallest C(i) to the largest C(i), * JA <= j <= JA+N-1. * * AMAX (global input) REAL * Absolute value of largest distributed submatrix entry. * * EQUED (global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration * = 'R': Row equilibration, i.e., sub( A ) has been pre- * multiplied by diag(R(IA:IA+M-1)), * = 'C': Column equilibration, i.e., sub( A ) has been post- * multiplied by diag(C(JA:JA+N-1)), * = 'B': Both row and column equilibration, i.e., sub( A ) * has been replaced by * diag(R(IA:IA+M-1)) * sub( A ) * diag(C(JA:JA+N-1)). * * Internal Parameters * =================== * * THRESH is a threshold value used to decide if row or column scaling * should be done based on the ratio of the row or column scaling * factors. If ROWCND < THRESH, row scaling is done, and if * COLCND < THRESH, column scaling is done. * * LARGE and SMALL are threshold values used to decide if row scaling * should be done based on the absolute size of the largest matrix * element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, THRESH PARAMETER ( ONE = 1.0E+0, THRESH = 0.1E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IIA, IOFFA, $ IROFF, J, JJA, LDA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ REAL CJ, LARGE, SMALL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC REAL PSLAMCH EXTERNAL NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) THEN EQUED = 'N' RETURN END IF * * Get grid parameters and compute local indexes * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * * Initialize LARGE and SMALL. * SMALL = PSLAMCH( ICTXT, 'Safe minimum' ) / $ PSLAMCH( ICTXT, 'Precision' ) LARGE = ONE / SMALL * IF( ROWCND.GE.THRESH .AND. AMAX.GE.SMALL .AND. AMAX.LE.LARGE ) $ THEN * * No row scaling * IF( COLCND.GE.THRESH ) THEN * * No column scaling * EQUED = 'N' * ELSE * * Column scaling * IOFFA = (JJA-1)*LDA DO 20 J = JJA, JJA+NQ-1 CJ = C( J ) DO 10 I = IIA, IIA+MP-1 A( IOFFA + I ) = CJ*A( IOFFA + I ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE EQUED = 'C' END IF * ELSE IF( COLCND.GE.THRESH ) THEN * * Row scaling, no column scaling * IOFFA = (JJA-1)*LDA DO 40 J = JJA, JJA+NQ-1 DO 30 I = IIA, IIA+MP-1 A( IOFFA + I ) = R( I )*A( IOFFA + I ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE EQUED = 'R' * ELSE * * Row and column scaling * IOFFA = (JJA-1)*LDA DO 60 J = JJA, JJA+NQ-1 CJ = C( J ) DO 50 I = IIA, IIA+MP-1 A( IOFFA + I ) = CJ*R( I )*A( IOFFA + I ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE EQUED = 'B' * END IF * RETURN * * End of PCLAQGE * END scalapack-2.0.2/SRC/pclaqsy.f000644 000766 000024 00000032036 10363532303 016165 0ustar00juliestaff000000 000000 SUBROUTINE PCLAQSY( UPLO, N, A, IA, JA, DESCA, SR, SC, SCOND, $ AMAX, EQUED ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER EQUED, UPLO INTEGER IA, JA, N REAL AMAX, SCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL SC( * ), SR( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLAQSY equilibrates a symmetric distributed matrix * sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the scaling factors in the * vectors SR and SC. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric distributed matrix sub( A ) is to be referenced: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (input/output) COMPLEX pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the distributed symmetric * matrix sub( A ). If UPLO = 'U', the leading N-by-N upper * triangular part of sub( A ) contains the upper triangular * part of the matrix, and the strictly lower triangular part * of sub( A ) is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular part of the matrix, and the strictly upper trian- * gular part of sub( A ) is not referenced. * On exit, if EQUED = 'Y', the equilibrated matrix: * diag(SR(IA:IA+N-1)) * sub( A ) * diag(SC(JA:JA+N-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * SR (local input) REAL array, dimension LOCr(M_A) * The scale factors for A(IA:IA+M-1,JA:JA+N-1). SR is aligned * with the distributed matrix A, and replicated across every * process column. SR is tied to the distributed matrix A. * * SC (local input) REAL array, dimension LOCc(N_A) * The scale factors for sub( A ). SC is aligned with the dis- * tributed matrix A, and replicated down every process row. * SC is tied to the distributed matrix A. * * SCOND (global input) REAL * Ratio of the smallest SR(i) (respectively SC(j)) to the * largest SR(i) (respectively SC(j)), with IA <= i <= IA+N-1 * and JA <= j <= JA+N-1. * * AMAX (global input) REAL * Absolute value of the largest distributed submatrix entry. * * EQUED (output) CHARACTER*1 * Specifies whether or not equilibration was done. * = 'N': No equilibration. * = 'Y': Equilibration was done, i.e., sub( A ) has been re- * placed by: * diag(SR(IA:IA+N-1)) * sub( A ) * diag(SC(JA:JA+N-1)). * * Internal Parameters * =================== * * THRESH is a threshold value used to decide if scaling should be done * based on the ratio of the scaling factors. If SCOND < THRESH, * scaling is done. * * LARGE and SMALL are threshold values used to decide if scaling should * be done based on the absolute size of the largest matrix element. * If AMAX > LARGE or AMAX < SMALL, scaling is done. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, THRESH PARAMETER ( ONE = 1.0E+0, THRESH = 0.1E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, II, IIA, IOFFA, IROFF, J, $ JB, JJ, JJA, JN, KK, LDA, LL, MYCOL, MYROW, NP, $ NPCOL, NPROW REAL CJ, LARGE, SMALL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSLAMCH EXTERNAL ICEIL, LSAME, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) THEN EQUED = 'N' RETURN END IF * * Get grid parameters and compute local indexes * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) LDA = DESCA( LLD_ ) * * Initialize LARGE and SMALL. * SMALL = PSLAMCH( ICTXT, 'Safe minimum' ) / $ PSLAMCH( ICTXT, 'Precision' ) LARGE = ONE / SMALL * IF( SCOND.GE.THRESH .AND. AMAX.GE.SMALL .AND. AMAX.LE.LARGE ) THEN * * No equilibration * EQUED = 'N' * ELSE * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Replace A by diag(S) * A * diag(S). * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangle of A(IA:IA+N-1,JA:JA+N-1) is stored. * Handle first block separately * IOFFA = (JJ-1)*LDA IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 20 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 10 KK = IIA, II+LL-JJ+1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE ELSE IOFFA = IOFFA + JB*LDA END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 70 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 40 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 30 KK = IIA, II+LL-JJ+1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE ELSE DO 60 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 50 KK = IIA, II-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 70 CONTINUE * ELSE * * Lower triangle of A(IA:IA+N-1,JA:JA+N-1) is stored. * Handle first block separately * IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFF * IOFFA = (JJ-1)*LDA IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 90 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 80 KK = II+LL-JJ, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE ELSE DO 110 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 100 KK = II, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 160 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 130 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 120 KK = II+LL-JJ, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 120 CONTINUE IOFFA = IOFFA + LDA 130 CONTINUE ELSE DO 150 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 140 KK = II, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 140 CONTINUE IOFFA = IOFFA + LDA 150 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 160 CONTINUE * END IF * EQUED = 'Y' * END IF * RETURN * * End of PCLAQSY * END scalapack-2.0.2/SRC/pclarf.f000644 000766 000024 00000070136 10363532303 015763 0ustar00juliestaff000000 000000 SUBROUTINE PCLARF( SIDE, M, N, V, IV, JV, DESCV, INCV, TAU, $ C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PCLARF applies a complex elementary reflector Q to a complex M-by-N * distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), from either the * left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a complex scalar and v is a complex vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also have the first row of sub( C ). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also have the first column of sub( C ) and * MOD(JV-1,NB_V) must be equal to MOD(JC-1,NB_C), if INCV = 1 only the * last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q * sub( C ), * = 'R': form sub( C ) * Q. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * V (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+M-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+M-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+N-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+N-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) COMPLEX, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q * sub( C ) if SIDE = 'L', or * sub( C ) * Q if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL, ICOFF, ICROW, ICTXT, IIC, IIV, IOFFC, $ IOFFV, IPW, IROFF, IVCOL, IVROW, JJC, JJV, LDC, $ LDV, MYCOL, MYROW, MP, NCC, NCV, NPCOL, NPROW, $ NQ, RDEST COMPLEX TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CCOPY, CGEBR2D, CGEBS2D, $ CGEMV, CGERC, CGERV2D, CGESD2D, $ CGSUM2D, CLASET, INFOG2L, PB_TOPGET, $ PBCTRNV * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) JJC = MIN( JJC, NCC ) JJV = MIN( JJV, NCV ) IOFFC = IIC+(JJC-1)*LDC IOFFV = IIV+(JJV-1)*LDV * IROFF = MOD( IC-1, DESCC( MB_ ) ) ICOFF = MOD( JC-1, DESCC( NB_ ) ) MP = NUMROC( M+IROFF, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYROW.EQ.ICROW ) $ MP = MP - IROFF IF( MYCOL.EQ.ICCOL ) $ NQ = NQ - ICOFF * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFF) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFF) ) * IF( LSAME( SIDE, 'L' ) ) THEN * IF( CRBLCK ) THEN RDEST = ICROW ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V * IPW = MP+1 CALL PBCTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL CGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * TAULOC = TAU( JJV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MP, NQ, $ ONE, C( IOFFC ), LDC, V( IOFFV ), 1, $ ZERO, WORK, 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK, MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK, MAX( 1, NQ ), RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL CGERC( MP, NQ, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL CCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL CGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL ) * ELSE IF( MYCOL.EQ.ICCOL ) THEN * IPW = MP+1 CALL CGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MP, NQ, $ ONE, C( IOFFC ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL CGERC( MP, NQ, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V * IPW = MP+1 CALL PBCTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN IF( IOFFC.GT.0 ) $ CALL CGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( IOFFC.GT.0 ) $ CALL CGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL CCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK(IPW) = TAU( JJV ) CALL CGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( JJV ) * ELSE * IPW = MP+1 CALL CGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN IF( IOFFC.GT.0 ) $ CALL CGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( IOFFC.GT.0 ) $ CALL CGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW ) THEN * TAULOC = TAU( IIV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, V( IOFFV ), LDV, $ ZERO, WORK, 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, $ WORK, MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK, MAX( 1, MP ), RDEST, ICCOL ) * * sub( C ) := sub( C ) - w * v' * IF( IOFFV.GT.0 .AND. IOFFC.GT.0 ) $ CALL CGERC( MP, NQ, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC ), $ LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW * IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 CALL CCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL CGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW ) THEN * IPW = NQ+1 CALL CGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V * IPW = NQ+1 CALL PBCTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = TAU( JJV ) * ELSE * CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 IF( IOFFV.GT.0 ) $ CALL CCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL CGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( IIV ) * ELSE * IPW = NQ+1 CALL CGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * IF( IOFFC.GT.0 ) $ CALL CGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V * IPW = NQ+1 CALL PBCTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = TAU( JJV ) * ELSE * CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, WORK( IPW ), $ MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, 1, $ C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PCLARF * END scalapack-2.0.2/SRC/pclarfb.f000644 000766 000024 00000104135 11750130340 016116 0ustar00juliestaff000000 000000 SUBROUTINE PCLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, IV, $ JV, DESCV, T, C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, DIRECT, STOREV INTEGER IC, IV, JC, JV, K, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX C( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PCLARFB applies a complex block reflector Q or its conjugate * transpose Q**H to a complex M-by-N distributed matrix sub( C ) * denoting C(IC:IC+M-1,JC:JC+N-1), from the left or the right. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * DIRECT (global input) CHARACTER * Indicates how Q is formed from a product of elementary * reflectors * = 'F': Q = H(1) H(2) . . . H(k) (Forward) * = 'B': Q = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Indicates how the vectors which define the elementary * reflectors are stored: * = 'C': Columnwise * = 'R': Rowwise * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The order of the matrix T (= the number of elementary * reflectors whose product defines the block reflector). * * V (local input) COMPLEX pointer into the local memory * to an array of dimension ( LLD_V, LOCc(JV+K-1) ) if * STOREV = 'C', ( LLD_V, LOCc(JV+M-1)) if STOREV = 'R' and * SIDE = 'L', ( LLD_V, LOCc(JV+N-1) ) if STOREV = 'R' and * SIDE = 'R'. It contains the local pieces of the distributed * vectors V representing the Householder transformation. * See further details. * If STOREV = 'C' and SIDE = 'L', LLD_V >= MAX(1,LOCr(IV+M-1)); * if STOREV = 'C' and SIDE = 'R', LLD_V >= MAX(1,LOCr(IV+N-1)); * if STOREV = 'R', LLD_V >= LOCr(IV+K-1). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * T (local input) COMPLEX array, dimension MB_V by MB_V * if STOREV = 'R' and NB_V by NB_V if STOREV = 'C'. The trian- * gular matrix T in the representation of the block reflector. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the M-by-N distributed matrix sub( C ). On exit, * sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) or * sub( C )*Q or sub( C )*Q'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * If STOREV = 'C', * if SIDE = 'L', * LWORK >= ( NqC0 + MpC0 ) * K * else if SIDE = 'R', * LWORK >= ( NqC0 + MAX( NpV0 + NUMROC( NUMROC( N+ICOFFC, * NB_V, 0, 0, NPCOL ), NB_V, 0, 0, LCMQ ), * MpC0 ) ) * K * end if * else if STOREV = 'R', * if SIDE = 'L', * LWORK >= ( MpC0 + MAX( MqV0 + NUMROC( NUMROC( M+IROFFC, * MB_V, 0, 0, NPROW ), MB_V, 0, 0, LCMP ), * NqC0 ) ) * K * else if SIDE = 'R', * LWORK >= ( MpC0 + NqC0 ) * K * end if * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFV = MOD( IV-1, MB_V ), ICOFFV = MOD( JV-1, NB_V ), * IVROW = INDXG2P( IV, MB_V, MYROW, RSRC_V, NPROW ), * IVCOL = INDXG2P( JV, NB_V, MYCOL, CSRC_V, NPCOL ), * MqV0 = NUMROC( M+ICOFFV, NB_V, MYCOL, IVCOL, NPCOL ), * NpV0 = NUMROC( N+IROFFV, MB_V, MYROW, IVROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NpC0 = NUMROC( N+ICOFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If STOREV = 'Columnwise' * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if STOREV = 'Rowwise' * If SIDE = 'Left', * ( NB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL FORWARD CHARACTER COLBTOP, ROWBTOP, TRANST, UPLO INTEGER HEIGHT, IBASE, ICCOL, ICOFFC, ICOFFV, ICROW, $ ICTXT, II, IIBEG, IIC, IIEND, IINXT, IIV, $ ILASTCOL, ILASTROW, ILEFT, IOFF, IOFFC, IOFFV, $ IPT, IPV, IPW, IPW1, IRIGHT, IROFFC, IROFFV, $ ITOP, IVCOL, IVROW, JJ, JJBEG, JJC, JJEND, $ JJNXT, JJV, KP, KQ, LDC, LDV, LV, LW, MBV, MPC, $ MPC0, MQV, MQV0, MYCOL, MYDIST, MYROW, NBV, $ NPV, NPV0, NPCOL, NPROW, NQC, NQC0, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D,CGEMM, $ CGSUM2D, CLAMOV, CLASET, CTRBR2D, $ CTRBS2D, CTRMM, INFOG1L, INFOG2L, PB_TOPGET, $ PBCTRAN * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 .OR. K.LE.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( TRANS, 'N' ) ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF FORWARD = LSAME( DIRECT, 'F' ) IF( FORWARD ) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) MBV = DESCV( MB_ ) NBV = DESCV( NB_ ) IROFFV = MOD( IV-1, MBV ) ICOFFV = MOD( JV-1, NBV ) MPC = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYCOL.EQ.ICCOL ) $ NQC = NQC - ICOFFC IF( MYROW.EQ.ICROW ) $ MPC = MPC - IROFFC JJC = MIN( JJC, MAX( 1, JJC+NQC-1 ) ) JJV = MIN( JJV, MAX( 1, NUMROC( DESCV( N_ ), NBV, MYCOL, $ DESCV( CSRC_ ), NPCOL ) ) ) IOFFC = IIC + ( JJC-1 ) * LDC IOFFV = IIV + ( JJV-1 ) * LDV * IF( LSAME( STOREV, 'C' ) ) THEN * * V is stored columnwise * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * Locally V( IOFFV ) is MPV x K, C( IOFFC ) is MPC x NQC * WORK( IPV ) is MPC x K = V( IOFFV ), MPC = MPV * WORK( IPW ) is NQC x K = C( IOFFC )' * V( IOFFV ) * IPV = 1 IPW = IPV + MPC * K LV = MAX( 1, MPC ) LW = MAX( 1, NQC ) * * Broadcast V to the other process columns. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN CALL CGEBS2D( ICTXT, 'Rowwise', ROWBTOP, MPC, K, $ V( IOFFV ), LDV ) IF( MYROW.EQ.IVROW ) $ CALL CTRBS2D( ICTXT, 'Rowwise', ROWBTOP, UPLO, $ 'Non unit', K, K, T, NBV ) CALL CLAMOV( 'All', MPC, K, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ROWBTOP, MPC, K, $ WORK( IPV ), LV, MYROW, IVCOL ) IF( MYROW.EQ.IVROW ) $ CALL CTRBR2D( ICTXT, 'Rowwise', ROWBTOP, UPLO, $ 'Non unit', K, K, T, NBV, MYROW, IVCOL ) END IF * IF( FORWARD ) THEN * * WORK(IPV) = ( V1 ) where V1 is unit lower triangular, * ( V2 ) zeroes upper triangular part of V1 * MYDIST = MOD( MYROW-IVROW+NPROW, NPROW ) ITOP = MAX( 0, MYDIST*MBV - IROFFV ) IIBEG = IIV IIEND = IIBEG + MPC - 1 IINXT = MIN( ICEIL( IIBEG, MBV )*MBV, IIEND ) * 10 CONTINUE IF( K-ITOP .GT.0 ) THEN CALL CLASET( 'Upper', IINXT-IIBEG+1, K-ITOP, ZERO, $ ONE, WORK( IPV+IIBEG-IIV+ITOP*LV ), LV ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IROFFV IIBEG = IINXT + 1 IINXT = MIN( IINXT+MBV, IIEND ) GO TO 10 END IF * ELSE * * WORK(IPV) = ( V1 ) where V2 is unit upper triangular, * ( V2 ) zeroes lower triangular part of V2 * JJ = JJV IOFF = MOD( IV+M-K-1, MBV ) CALL INFOG1L( IV+M-K, MBV, NPROW, MYROW, DESCV( RSRC_ ), $ II, ILASTROW ) KP = NUMROC( K+IOFF, MBV, MYROW, ILASTROW, NPROW ) IF( MYROW.EQ.ILASTROW ) $ KP = KP - IOFF MYDIST = MOD( MYROW-ILASTROW+NPROW, NPROW ) ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP+MBV, K ) ITOP = MIN( MAX( 0, ITOP ), K ) * 20 CONTINUE IF( JJ.LE.( JJV+K-1 ) ) THEN HEIGHT = IBASE - ITOP CALL CLASET( 'All', KP, ITOP-JJ+JJV, ZERO, ZERO, $ WORK( IPV+II-IIV+(JJ-JJV)*LV ), LV ) CALL CLASET( 'Lower', KP, HEIGHT, ZERO, ONE, $ WORK( IPV+II-IIV+ITOP*LV ), LV ) KP = MAX( 0, KP - HEIGHT ) II = II + HEIGHT JJ = JJV + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP + MBV, K ) ITOP = MIN( ITOP, K ) GO TO 20 END IF * END IF * * WORK( IPW ) = C( IOFFC )' * V (NQC x MPC x K) -> NQC x K * IF( MPC.GT.0 ) THEN CALL CGEMM( 'Conjugate transpose', 'No transpose', NQC, $ K, MPC, ONE, C( IOFFC ), LDC, WORK( IPV ), LV, $ ZERO, WORK( IPW ), LW ) ELSE CALL CLASET( 'All', NQC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * CALL CTRMM( 'Right', UPLO, TRANST, 'Non unit', NQC, K, $ ONE, T, NBV, WORK( IPW ), LW ) CALL CGEBS2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW ) ELSE CALL CGEBR2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C C - V * W' * C( IOFFC ) = C( IOFFC ) - WORK( IPV ) * WORK( IPW )' * MPC x NQC MPC x K K x NQC * CALL CGEMM( 'No transpose', 'Conjugate transpose', MPC, NQC, $ K, -ONE, WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC ), LDC ) * ELSE * * Form sub( C )*Q or sub( C )*Q' * * ICOFFC = IROFFV is required by the current transposition * routine PBCTRAN * NPV0 = NUMROC( N+IROFFV, MBV, MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) THEN NPV = NPV0 - IROFFV ELSE NPV = NPV0 END IF IF( MYCOL.EQ.ICCOL ) THEN NQC0 = NQC + ICOFFC ELSE NQC0 = NQC END IF * * Locally V( IOFFV ) is NPV x K C( IOFFC ) is MPC x NQC * WORK( IPV ) is K x NQC0 = [ . V( IOFFV ) ]' * WORK( IPW ) is NPV0 x K = [ . V( IOFFV )' ]' * WORK( IPT ) is the workspace for PBCTRAN * IPV = 1 IPW = IPV + K * NQC0 IPT = IPW + NPV0 * K LV = MAX( 1, K ) LW = MAX( 1, NPV0 ) * IF( MYCOL.EQ.IVCOL ) THEN IF( MYROW.EQ.IVROW ) THEN CALL CLASET( 'All', IROFFV, K, ZERO, ZERO, $ WORK( IPW ), LW ) IPW1 = IPW + IROFFV CALL CLAMOV( 'All', NPV, K, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) ELSE IPW1 = IPW CALL CLAMOV( 'All', NPV, K, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) END IF * IF( FORWARD ) THEN * * WORK(IPW) = ( . V1' V2' )' where V1 is unit lower * triangular, zeroes upper triangular part of V1 * MYDIST = MOD( MYROW-IVROW+NPROW, NPROW ) ITOP = MAX( 0, MYDIST*MBV - IROFFV ) IIBEG = IIV IIEND = IIBEG + NPV - 1 IINXT = MIN( ICEIL( IIBEG, MBV )*MBV, IIEND ) * 30 CONTINUE IF( ( K-ITOP ).GT.0 ) THEN CALL CLASET( 'Upper', IINXT-IIBEG+1, K-ITOP, ZERO, $ ONE, WORK( IPW1+IIBEG-IIV+ITOP*LW ), $ LW ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IROFFV IIBEG = IINXT + 1 IINXT = MIN( IINXT+MBV, IIEND ) GO TO 30 END IF * ELSE * * WORK( IPW ) = ( . V1' V2' )' where V2 is unit upper * triangular, zeroes lower triangular part of V2. * JJ = JJV CALL INFOG1L( IV+N-K, MBV, NPROW, MYROW, $ DESCV( RSRC_ ), II, ILASTROW ) IOFF = MOD( IV+N-K-1, MBV ) KP = NUMROC( K+IOFF, MBV, MYROW, ILASTROW, NPROW ) IF( MYROW.EQ.ILASTROW ) $ KP = KP - IOFF MYDIST = MOD( MYROW-ILASTROW+NPROW, NPROW ) ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP+MBV, K ) ITOP = MIN( MAX( 0, ITOP ), K ) * 40 CONTINUE IF( JJ.LE.( JJV+K-1 ) ) THEN HEIGHT = IBASE - ITOP CALL CLASET( 'All', KP, ITOP-JJ+JJV, ZERO, ZERO, $ WORK( IPW1+II-IIV+(JJ-JJV)*LW ), LW ) CALL CLASET( 'Lower', KP, HEIGHT, ZERO, ONE, $ WORK( IPW1+II-IIV+ITOP*LW ), LW ) KP = MAX( 0, KP - HEIGHT ) II = II + HEIGHT JJ = JJV + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP + MBV, K ) ITOP = MIN( ITOP, K ) GO TO 40 END IF END IF END IF * CALL PBCTRAN( ICTXT, 'Columnwise', 'Conjugate transpose', $ N+IROFFV, K, MBV, WORK( IPW ), LW, ZERO, $ WORK( IPV ), LV, IVROW, IVCOL, -1, ICCOL, $ WORK( IPT ) ) * * WORK( IPV ) = ( . V' ) -> WORK( IPV ) = V' is K x NQC * IF( MYCOL.EQ.ICCOL ) $ IPV = IPV + ICOFFC * LV * * WORK( IPW ) becomes MPC x K = C( IOFFC ) * V * WORK( IPW ) = C( IOFFC ) * V (MPC x NQC x K) -> MPC x K * LW = MAX( 1, MPC ) * IF( NQC.GT.0 ) THEN CALL CGEMM( 'No transpose', 'Conjugate transpose', MPC, $ K, NQC, ONE, C( IOFFC ), LDC, WORK( IPV ), $ LV, ZERO, WORK( IPW ), LW ) ELSE CALL CLASET( 'All', MPC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN IF( MYROW.EQ.IVROW ) THEN * * Broadcast the block reflector to the other rows. * CALL CTRBS2D( ICTXT, 'Columnwise', ' ', UPLO, $ 'Non unit', K, K, T, NBV ) ELSE CALL CTRBR2D( ICTXT, 'Columnwise', ' ', UPLO, $ 'Non unit', K, K, T, NBV, IVROW, MYCOL ) END IF CALL CTRMM( 'Right', UPLO, TRANS, 'Non unit', MPC, K, $ ONE, T, NBV, WORK( IPW ), LW ) * CALL CGEBS2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C C - W * V' * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC x NQC MPC x K K x NQC * CALL CGEMM( 'No transpose', 'No transpose', MPC, NQC, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC ), LDC ) END IF * ELSE * * V is stored rowwise * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * IROFFC = ICOFFV is required by the current transposition * routine PBCTRAN * MQV0 = NUMROC( M+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN MQV = MQV0 - ICOFFV ELSE MQV = MQV0 END IF IF( MYROW.EQ.ICROW ) THEN MPC0 = MPC + IROFFC ELSE MPC0 = MPC END IF * * Locally V( IOFFV ) is K x MQV, C( IOFFC ) is MPC x NQC * WORK( IPV ) is MPC0 x K = [ . V( IOFFV ) ]' * WORK( IPW ) is K x MQV0 = [ . V( IOFFV ) ] * WORK( IPT ) is the workspace for PBCTRAN * IPV = 1 IPW = IPV + MPC0 * K IPT = IPW + K * MQV0 LV = MAX( 1, MPC0 ) LW = MAX( 1, K ) * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN CALL CLASET( 'All', K, ICOFFV, ZERO, ZERO, $ WORK( IPW ), LW ) IPW1 = IPW + ICOFFV * LW CALL CLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) ELSE IPW1 = IPW CALL CLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) END IF * IF( FORWARD ) THEN * * WORK( IPW ) = ( . V1 V2 ) where V1 is unit upper * triangular, zeroes lower triangular part of V1 * MYDIST = MOD( MYCOL-IVCOL+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBV - ICOFFV ) JJBEG = JJV JJEND = JJV + MQV - 1 JJNXT = MIN( ICEIL( JJBEG, NBV ) * NBV, JJEND ) * 50 CONTINUE IF( ( K-ILEFT ).GT.0 ) THEN CALL CLASET( 'Lower', K-ILEFT, JJNXT-JJBEG+1, ZERO, $ ONE, $ WORK( IPW1+ILEFT+(JJBEG-JJV)*LW ), $ LW ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - ICOFFV JJBEG = JJNXT + 1 JJNXT = MIN( JJNXT+NBV, JJEND ) GO TO 50 END IF * ELSE * * WORK( IPW ) = ( . V1 V2 ) where V2 is unit lower * triangular, zeroes upper triangular part of V2. * II = IIV CALL INFOG1L( JV+M-K, NBV, NPCOL, MYCOL, $ DESCV( CSRC_ ), JJ, ILASTCOL ) IOFF = MOD( JV+M-K-1, NBV ) KQ = NUMROC( K+IOFF, NBV, MYCOL, ILASTCOL, NPCOL ) IF( MYCOL.EQ.ILASTCOL ) $ KQ = KQ - IOFF MYDIST = MOD( MYCOL-ILASTCOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT+NBV, K ) ILEFT = MIN( MAX( 0, ILEFT ), K ) * 60 CONTINUE IF( II.LE.( IIV+K-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL CLASET( 'All', ILEFT-II+IIV, KQ, ZERO, ZERO, $ WORK( IPW1+II-IIV+(JJ-JJV)*LW ), LW ) CALL CLASET( 'Upper', WIDE, KQ, ZERO, ONE, $ WORK( IPW1+ILEFT+(JJ-JJV)*LW ), LW ) KQ = MAX( 0, KQ - WIDE ) II = IIV + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT + NBV, K ) ILEFT = MIN( ILEFT, K ) GO TO 60 END IF END IF END IF * * WORK( IPV ) = WORK( IPW )' (replicated) is MPC0 x K * CALL PBCTRAN( ICTXT, 'Rowwise', 'Conjugate transpose', K, $ M+ICOFFV, NBV, WORK( IPW ), LW, ZERO, $ WORK( IPV ), LV, IVROW, IVCOL, ICROW, -1, $ WORK( IPT ) ) * * WORK( IPV ) = ( . V )' -> WORK( IPV ) = V' is MPC x K * IF( MYROW.EQ.ICROW ) $ IPV = IPV + IROFFC * * WORK( IPW ) becomes NQC x K = C( IOFFC )' * V' * WORK( IPW ) = C( IOFFC )' * V' (NQC x MPC x K) -> NQC x K * LW = MAX( 1, NQC ) * IF( MPC.GT.0 ) THEN CALL CGEMM( 'Conjugate transpose', 'No transpose', NQC, $ K, MPC, ONE, C( IOFFC ), LDC, WORK( IPV ), $ LV, ZERO, WORK( IPW ), LW ) ELSE CALL CLASET( 'All', NQC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN * * Broadcast the block reflector to the other columns. * CALL CTRBS2D( ICTXT, 'Rowwise', ' ', UPLO, 'Non unit', $ K, K, T, MBV ) ELSE CALL CTRBR2D( ICTXT, 'Rowwise', ' ', UPLO, 'Non unit', $ K, K, T, MBV, MYROW, IVCOL ) END IF CALL CTRMM( 'Right', UPLO, TRANST, 'Non unit', NQC, K, $ ONE, T, MBV, WORK( IPW ), LW ) * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW ) ELSE CALL CGEBR2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C C - V' * W' * C( IOFFC ) = C( IOFFC ) - WORK( IPV ) * WORK( IPW )' * MPC x NQC MPC x K K x NQC * CALL CGEMM( 'No transpose', 'Conjugate transpose', MPC, NQC, $ K, -ONE, WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC ), LDC ) * ELSE * * Form Q*sub( C ) or Q'*sub( C ) * * Locally V( IOFFV ) is K x NQV, C( IOFFC ) is MPC x NQC * WORK( IPV ) is K x NQV = V( IOFFV ), NQV = NQC * WORK( IPW ) is MPC x K = C( IOFFC ) * V( IOFFV )' * IPV = 1 IPW = IPV + K * NQC LV = MAX( 1, K ) LW = MAX( 1, MPC ) * * Broadcast V to the other process rows. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) IF( MYROW.EQ.IVROW ) THEN CALL CGEBS2D( ICTXT, 'Columnwise', COLBTOP, K, NQC, $ V( IOFFV ), LDV ) IF( MYCOL.EQ.IVCOL ) $ CALL CTRBS2D( ICTXT, 'Columnwise', COLBTOP, UPLO, $ 'Non unit', K, K, T, MBV ) CALL CLAMOV( 'All', K, NQC, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL CGEBR2D( ICTXT, 'Columnwise', COLBTOP, K, NQC, $ WORK( IPV ), LV, IVROW, MYCOL ) IF( MYCOL.EQ.IVCOL ) $ CALL CTRBR2D( ICTXT, 'Columnwise', COLBTOP, UPLO, $ 'Non unit', K, K, T, MBV, IVROW, MYCOL ) END IF * IF( FORWARD ) THEN * * WORK(IPW) = ( V1 V2 ) where V1 is unit upper * triangular, zeroes lower triangular part of V1 * MYDIST = MOD( MYCOL-IVCOL+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBV - ICOFFV ) JJBEG = JJV JJEND = JJV + NQC - 1 JJNXT = MIN( ICEIL( JJBEG, NBV ) * NBV, JJEND ) * 70 CONTINUE IF( ( K-ILEFT ).GT.0 ) THEN CALL CLASET( 'Lower', K-ILEFT, JJNXT-JJBEG+1, ZERO, $ ONE, WORK( IPV+ILEFT+(JJBEG-JJV)*LV ), $ LV ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - ICOFFV JJBEG = JJNXT + 1 JJNXT = MIN( JJNXT+NBV, JJEND ) GO TO 70 END IF * ELSE * * WORK( IPW ) = ( . V1 V2 ) where V2 is unit lower * triangular, zeroes upper triangular part of V2. * II = IIV CALL INFOG1L( JV+N-K, NBV, NPCOL, MYCOL, DESCV( CSRC_ ), $ JJ, ILASTCOL ) IOFF = MOD( JV+N-K-1, NBV ) KQ = NUMROC( K+IOFF, NBV, MYCOL, ILASTCOL, NPCOL ) IF( MYCOL.EQ.ILASTCOL ) $ KQ = KQ - IOFF MYDIST = MOD( MYCOL-ILASTCOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT+NBV, K ) ILEFT = MIN( MAX( 0, ILEFT ), K ) * 80 CONTINUE IF( II.LE.( IIV+K-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL CLASET( 'All', ILEFT-II+IIV, KQ, ZERO, ZERO, $ WORK( IPV+II-IIV+(JJ-JJV)*LV ), LV ) CALL CLASET( 'Upper', WIDE, KQ, ZERO, ONE, $ WORK( IPV+ILEFT+(JJ-JJV)*LV ), LV ) KQ = MAX( 0, KQ - WIDE ) II = IIV + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT + NBV, K ) ILEFT = MIN( ILEFT, K ) GO TO 80 END IF * END IF * * WORK( IPV ) is K x NQC = V = V( IOFFV ) * WORK( IPW ) = C( IOFFC ) * V' (MPC x NQC x K) -> MPC x K * IF( NQC.GT.0 ) THEN CALL CGEMM( 'No transpose', 'Conjugate transpose', MPC, $ K, NQC, ONE, C( IOFFC ), LDC, WORK( IPV ), $ LV, ZERO, WORK( IPW ), LW ) ELSE CALL CLASET( 'All', MPC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN CALL CTRMM( 'Right', UPLO, TRANS, 'Non unit', MPC, K, $ ONE, T, MBV, WORK( IPW ), LW ) CALL CGEBS2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C C - W * V * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC x NQC MPC x K K x NQC * CALL CGEMM( 'No transpose', 'No transpose', MPC, NQC, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC ), LDC ) * END IF * END IF * RETURN * * End of PCLARFB * END scalapack-2.0.2/SRC/pclarfc.f000644 000766 000024 00000070050 10363532303 016121 0ustar00juliestaff000000 000000 SUBROUTINE PCLARFC( SIDE, M, N, V, IV, JV, DESCV, INCV, TAU, $ C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PCLARFC applies a complex elementary reflector Q**H to a * complex M-by-N distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), * from either the left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a complex scalar and v is a complex vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also have the first row of sub( C ). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also have the first column of sub( C ) and * MOD(JV-1,NB_V) must be equal to MOD(JC-1,NB_C), if INCV = 1 only the * last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q**H * sub( C ), * = 'R': form sub( C ) * Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * V (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+M-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+M-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+N-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+N-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) COMPLEX, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q**H * sub( C ) if SIDE = 'L', or * sub( C ) * Q**H if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL, ICOFF, ICROW, ICTXT, IIC, IIV, IOFFC, $ IOFFV, IPW, IROFF, IVCOL, IVROW, JJC, JJV, LDC, $ LDV, MYCOL, MYROW, MP, NCC, NCV, NPCOL, NPROW, $ NQ, RDEST COMPLEX TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CCOPY, CGEBR2D, CGEBS2D, $ CGEMV, CGERC, CGERV2D, CGESD2D, $ CGSUM2D, CLASET, INFOG2L, PB_TOPGET, $ PBCTRNV * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) JJC = MIN( JJC, NCC ) JJV = MIN( JJV, NCV ) IOFFC = IIC+(JJC-1)*LDC IOFFV = IIV+(JJV-1)*LDV * IROFF = MOD( IC-1, DESCC( MB_ ) ) ICOFF = MOD( JC-1, DESCC( NB_ ) ) MP = NUMROC( M+IROFF, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYROW.EQ.ICROW ) $ MP = MP - IROFF IF( MYCOL.EQ.ICCOL ) $ NQ = NQ - ICOFF * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFF) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFF) ) * IF( LSAME( SIDE, 'L' ) ) THEN * IF( CRBLCK ) THEN RDEST = ICROW ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V * IPW = MP+1 CALL PBCTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = CONJG( TAU( IIV ) ) * ELSE * CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) TAULOC = CONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL CGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * TAULOC = CONJG( TAU( JJV ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MP, NQ, $ ONE, C( IOFFC ), LDC, V( IOFFV ), 1, $ ZERO, WORK, 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK, MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK, MAX( 1, NQ ), RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL CGERC( MP, NQ, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL CCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL CGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL ) * ELSE IF( MYCOL.EQ.ICCOL ) THEN * IPW = MP+1 CALL CGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = CONJG( WORK( IPW ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MP, NQ, $ ONE, C( IOFFC ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL CGERC( MP, NQ, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V * IPW = MP+1 CALL PBCTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = CONJG( TAU( IIV ) ) * ELSE * CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) TAULOC = CONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL CGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), 1, $ C( IOFFC ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL CCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK(IPW) = TAU( JJV ) CALL CGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = CONJG( TAU( JJV ) ) * ELSE * IPW = MP+1 CALL CGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = CONJG( WORK( IPW ) ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL CGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), 1, $ C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW ) THEN * TAULOC = CONJG( TAU( IIV ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, V( IOFFV ), LDV, $ ZERO, WORK, 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, $ WORK, MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK, MAX( 1, MP ), RDEST, ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MP, NQ, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW * IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 CALL CCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL CGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW ) THEN * IPW = NQ+1 CALL CGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = CONJG( WORK( IPW ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V * IPW = NQ+1 CALL PBCTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = CONJG( TAU( JJV ) ) * ELSE * CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) TAULOC = CONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 CALL CCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL CGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = CONJG( TAU( IIV ) ) * ELSE * IPW = NQ+1 CALL CGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = CONJG( WORK( IPW ) ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, 1, $ C( IOFFC ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V * IPW = NQ+1 CALL PBCTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = CONJG( TAU( JJV ) ) * ELSE * CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) TAULOC = CONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL CGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MP, 1, ZERO, ZERO, WORK( IPW ), $ MAX( 1, MP ) ) END IF CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, 1, $ C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PCLARFC * END scalapack-2.0.2/SRC/pclarfg.f000644 000766 000024 00000024325 10363532303 016131 0ustar00juliestaff000000 000000 SUBROUTINE PCLARFG( N, ALPHA, IAX, JAX, X, IX, JX, DESCX, INCX, $ TAU ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAX, INCX, IX, JAX, JX, N COMPLEX ALPHA * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX TAU( * ), X( * ) * .. * * Purpose * ======= * * PCLARFG generates a complex elementary reflector H of order n, such * that * * H * sub( X ) = H * ( x(iax,jax) ) = ( alpha ), H' * H = I. * ( x ) ( 0 ) * * where alpha is a real scalar, and sub( X ) is an (N-1)-element * complex distributed vector X(IX:IX+N-2,JX) if INCX = 1 and * X(IX,JX:JX+N-2) if INCX = DESCX(M_). H is represented in the form * * H = I - tau * ( 1 ) * ( 1 v' ) , * ( v ) * * where tau is a complex scalar and v is a complex (N-1)-element * vector. Note that H is not Hermitian. * * If the elements of sub( X ) are all zero and X(IAX,JAX) is real, * then tau = 0 and H is taken to be the unit matrix. * * Otherwise 1 <= real(tau) <= 2 and abs(tau-1) <= 1. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) INTEGER * The global order of the elementary reflector. N >= 0. * * ALPHA (local output) COMPLEX * On exit, alpha is computed in the process scope having the * vector sub( X ). * * IAX (global input) INTEGER * The global row index in X of X(IAX,JAX). * * JAX (global input) INTEGER * The global column index in X of X(IAX,JAX). * * X (local input/local output) COMPLEX, pointer into the * local memory to an array of dimension (LLD_X,*). This array * contains the local pieces of the distributed vector sub( X ). * Before entry, the incremented array sub( X ) must contain * the vector x. On exit, it is overwritten with the vector v. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * TAU (local output) COMPLEX, array, dimension LOCc(JX) * if INCX = 1, and LOCr(IX) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER ICTXT, IIAX, INDXTAU, IXCOL, IXROW, J, JJAX, $ KNT, MYCOL, MYROW, NPCOL, NPROW REAL ALPHI, ALPHR, BETA, RSAFMN, SAFMIN, XNORM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, PCSCAL, $ PCSSCAL, INFOG2L, PSCNRM2 * .. * .. External Functions .. REAL SLAMCH, SLAPY3 COMPLEX CLADIV EXTERNAL CLADIV, SLAPY3, SLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, REAL, SIGN * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is distributed across a process row. * CALL INFOG2L( IX, JAX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIAX, JJAX, IXROW, IXCOL ) * IF( MYROW.NE.IXROW ) $ RETURN * * Broadcast X(IAX,JAX) across the process row. * IF( MYCOL.EQ.IXCOL ) THEN J = IIAX+(JJAX-1)*DESCX( LLD_ ) CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, X( J ), 1 ) ALPHA = X( J ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1, $ MYROW, IXCOL ) END IF * INDXTAU = IIAX * ELSE * * sub( X ) is distributed across a process column. * CALL INFOG2L( IAX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIAX, JJAX, IXROW, IXCOL ) * IF( MYCOL.NE.IXCOL ) $ RETURN * * Broadcast X(IAX,JAX) across the process column. * IF( MYROW.EQ.IXROW ) THEN J = IIAX+(JJAX-1)*DESCX( LLD_ ) CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, X( J ), 1 ) ALPHA = X( J ) ELSE CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, ALPHA, 1, $ IXROW, MYCOL ) END IF * INDXTAU = JJAX * END IF * IF( N.LE.0 ) THEN TAU( INDXTAU ) = ZERO RETURN END IF * CALL PSCNRM2( N-1, XNORM, X, IX, JX, DESCX, INCX ) ALPHR = REAL( ALPHA ) ALPHI = AIMAG( ALPHA ) * IF( XNORM.EQ.ZERO .AND. ALPHI.EQ.ZERO ) THEN * * H = I * TAU( INDXTAU ) = ZERO * ELSE * * General case * BETA = -SIGN( SLAPY3( ALPHR, ALPHI, XNORM ), ALPHR ) SAFMIN = SLAMCH( 'S' ) RSAFMN = ONE / SAFMIN IF( ABS( BETA ).LT.SAFMIN ) THEN * * XNORM, BETA may be inaccurate; scale X and recompute them * KNT = 0 10 CONTINUE KNT = KNT + 1 CALL PCSSCAL( N-1, RSAFMN, X, IX, JX, DESCX, INCX ) BETA = BETA*RSAFMN ALPHI = ALPHI*RSAFMN ALPHR = ALPHR*RSAFMN IF( ABS( BETA ).LT.SAFMIN ) $ GO TO 10 * * New BETA is at most 1, at least SAFMIN * CALL PSCNRM2( N-1, XNORM, X, IX, JX, DESCX, INCX ) ALPHA = CMPLX( ALPHR, ALPHI ) BETA = -SIGN( SLAPY3( ALPHR, ALPHI, XNORM ), ALPHR ) TAU( INDXTAU ) = CMPLX( ( BETA-ALPHR ) / BETA, $ -ALPHI / BETA ) ALPHA = CLADIV( CMPLX( ONE ), ALPHA-BETA ) CALL PCSCAL( N-1, ALPHA, X, IX, JX, DESCX, INCX ) * * If ALPHA is subnormal, it may lose relative accuracy * ALPHA = BETA DO 20 J = 1, KNT ALPHA = ALPHA*SAFMIN 20 CONTINUE ELSE TAU( INDXTAU ) = CMPLX( ( BETA-ALPHR ) / BETA, $ -ALPHI / BETA ) ALPHA = CLADIV( CMPLX( ONE ), ALPHA-BETA ) CALL PCSCAL( N-1, ALPHA, X, IX, JX, DESCX, INCX ) ALPHA = BETA END IF END IF * RETURN * * End of PCLARFG * END scalapack-2.0.2/SRC/pclarft.f000644 000766 000024 00000045056 10363532303 016152 0ustar00juliestaff000000 000000 SUBROUTINE PCLARFT( DIRECT, STOREV, N, K, V, IV, JV, DESCV, TAU, $ T, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIRECT, STOREV INTEGER IV, JV, K, N * .. * .. Array Arguments .. INTEGER DESCV( * ) COMPLEX TAU( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PCLARFT forms the triangular factor T of a complex block reflector H * of order n, which is defined as a product of k elementary reflectors. * * If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; * * If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. * * If STOREV = 'C', the vector which defines the elementary reflector * H(i) is stored in the i-th column of the distributed matrix V, and * * H = I - V * T * V' * * If STOREV = 'R', the vector which defines the elementary reflector * H(i) is stored in the i-th row of the distributed matrix V, and * * H = I - V' * T * V * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIRECT (global input) CHARACTER*1 * Specifies the order in which the elementary reflectors are * multiplied to form the block reflector: * = 'F': H = H(1) H(2) . . . H(k) (Forward) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER*1 * Specifies how the vectors which define the elementary * reflectors are stored (see also Further Details): * = 'C': columnwise * = 'R': rowwise * * N (global input) INTEGER * The order of the block reflector H. N >= 0. * * K (global input) INTEGER * The order of the triangular factor T (= the number of * elementary reflectors). 1 <= K <= MB_V (= NB_V). * * V (input/output) COMPLEX pointer into the local memory * to an array of local dimension (LOCr(IV+N-1),LOCc(JV+K-1)) * if STOREV = 'C', and (LOCr(IV+K-1),LOCc(JV+N-1)) if * STOREV = 'R'. The distributed matrix V contains the * Householder vectors. See further details. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * TAU (local input) COMPLEX, array, dimension LOCr(IV+K-1) * if INCV = M_V, and LOCc(JV+K-1) otherwise. This array * contains the Householder scalars related to the Householder * vectors. TAU is tied to the distributed matrix V. * * T (local output) COMPLEX array, dimension (NB_V,NB_V) * if STOREV = 'Col', and (MB_V,MB_V) otherwise. It contains * the k-by-k triangular factor of the block reflector asso- * ciated with V. If DIRECT = 'F', T is upper triangular; * if DIRECT = 'B', T is lower triangular. * * WORK (local workspace) COMPLEX array, * dimension (K*(K-1)/2) * * Further Details * =============== * * The shape of the matrix V and the storage of the vectors which define * the H(i) is best illustrated by the following example with n = 5 and * k = 3. The elements equal to 1 are not stored; the corresponding * array elements are modified but restored on exit. The rest of the * array is not used. * * DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': * * V( IV:IV+N-1, ( 1 ) V( IV:IV+K-1, ( 1 v1 v1 v1 v1 ) * JV:JV+K-1 ) = ( v1 1 ) JV:JV+N-1 ) = ( 1 v2 v2 v2 ) * ( v1 v2 1 ) ( 1 v3 v3 ) * ( v1 v2 v3 ) * ( v1 v2 v3 ) * * DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': * * V( IV:IV+N-1, ( v1 v2 v3 ) V( IV:IV+K-1, ( v1 v1 1 ) * JV:JV+K-1 ) = ( v1 v2 v3 ) JV:JV+N-1 ) = ( v2 v2 v2 1 ) * ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) * ( 1 v3 ) * ( 1 ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL FORWARD INTEGER ICOFF, ICTXT, II, IIV, IROFF, IVCOL, IVROW, $ ITMP0, ITMP1, IW, JJ, JJV, LDV, MICOL, MIROW, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ COMPLEX VII * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CCOPY, CGEMV, CGSUM2D, $ CLACGV, CLASET, CTRMV, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 .OR. K.LE.0 ) $ RETURN * ICTXT = DESCV( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * FORWARD = LSAME( DIRECT, 'F' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, $ IIV, JJV, IVROW, IVCOL ) * IF( LSAME( STOREV, 'C' ) .AND. MYCOL.EQ.IVCOL ) THEN * IW = 1 LDV = DESCV( LLD_ ) IROFF = MOD( IV-1, DESCV( MB_ ) ) * IF( FORWARD ) THEN * * DIRECT = 'Forward', STOREV = 'Columnwise' * NP = NUMROC( N+IROFF, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) THEN NP = NP - IROFF II = IIV + 1 ELSE II = IIV END IF IF( IROFF+1.EQ.DESCV( MB_ ) ) THEN MIROW = MOD( IVROW+1, NPROW ) ELSE MIROW = IVROW END IF ITMP0 = 0 * DO 10 JJ = JJV+1, JJV+K-1 * IF( MYROW.EQ.MIROW ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( jv+i-1 ) * * V(iv+i-1:iv+n-1,jv:jv+i-2)' * V(iv+i-1:iv+n-1,jv+i-1) * ITMP0 = ITMP0 + 1 IF( NP-II+IIV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', NP-II+IIV, ITMP0, $ -TAU( JJ ), V( II+(JJV-1)*LDV ), LDV, $ V( II+(JJ-1)*LDV ), 1, ZERO, $ WORK( IW ), 1 ) ELSE CALL CLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF * IW = IW + ITMP0 IF( MYROW.EQ.MIROW ) THEN V( II+(JJ-1)*LDV ) = VII II = II + 1 END IF * IF( MOD( IV+ITMP0, DESCV( MB_ ) ).EQ.0 ) $ MIROW = MOD( MIROW+1, NPROW ) * 10 CONTINUE * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', IW-1, 1, WORK, IW-1, $ IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * IW = 1 ITMP0 = 0 ITMP1 = 1 * T( ITMP1 ) = TAU( JJV ) * DO 20 JJ = JJV+1, JJV+K-1 * * T(1:j-1,j) = T(1:j-1,1:j-1) * T(1:j-1,j) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 + DESCV( NB_ ) CALL CCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL CTRMV( 'Upper', 'No transpose', 'Non-unit', $ ITMP0, T, DESCV( NB_ ), T( ITMP1 ), 1 ) T(ITMP1+ITMP0) = TAU( JJ ) * 20 CONTINUE * END IF * ELSE * * DIRECT = 'Backward', STOREV = 'Columnwise' * NP = NUMROC( N+IROFF-1, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ NP = NP - IROFF MIROW = INDXG2P( IV+N-2, DESCV( MB_ ), MYROW, $ DESCV( RSRC_ ), NPROW ) II = IIV + NP - 1 ITMP0 = 0 * DO 30 JJ = JJV+K-2, JJV, -1 * IF( MYROW.EQ.MIROW ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( jv+i-1 ) * * V(iv:iv+n-k+i-1,jv+i:jv+k-1)' * V(iv:iv+n-k+i-1,jv+i-1) * ITMP0 = ITMP0 + 1 IF( II-IIV+1.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', II-IIV+1, ITMP0, $ -TAU( JJ ), V( IIV+JJ*LDV ), LDV, $ V( IIV+(JJ-1)*LDV ), 1, ZERO, $ WORK( IW ), 1 ) ELSE CALL CLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF * IW = IW + ITMP0 IF( MYROW.EQ.MIROW ) THEN V( II+(JJ-1)*LDV ) = VII II = II - 1 END IF * IF( MOD( IV+N-ITMP0-2, DESCV(MB_) ).EQ.0 ) $ MIROW = MOD( MIROW+NPROW-1, NPROW ) * 30 CONTINUE * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', IW-1, 1, WORK, IW-1, $ IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( NB_ ) * T( ITMP1-1 ) = TAU( JJV+K-1 ) * DO 40 JJ = JJV+K-2, JJV, -1 * * T(j+1:k,j) = T(j+1:k,j+1:k) * T(j+1:k,j) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( NB_ ) - 1 CALL CCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL CTRMV( 'Lower', 'No transpose', 'Non-unit', $ ITMP0, T( ITMP1+DESCV( NB_ ) ), $ DESCV( NB_ ), T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( JJ ) * 40 CONTINUE * END IF * END IF * ELSE IF( LSAME( STOREV, 'R' ) .AND. MYROW.EQ.IVROW ) THEN * IW = 1 LDV = DESCV( LLD_ ) ICOFF = MOD( JV-1, DESCV( NB_ ) ) * IF( FORWARD ) THEN * * DIRECT = 'Forward', STOREV = 'Rowwise' * NQ = NUMROC( N+ICOFF, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN NQ = NQ - ICOFF JJ = JJV + 1 ELSE JJ = JJV END IF IF( ICOFF+1.EQ.DESCV( NB_ ) ) THEN MICOL = MOD( IVCOL+1, NPCOL ) ELSE MICOL = IVCOL END IF ITMP0 = 0 * DO 50 II = IIV+1, IIV+K-1 * IF( MYCOL.EQ.MICOL ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( iv+i-1 ) * * V(iv+i-1,jv+i-1:jv+n-1) * V(iv:iv+i-2,jv+i-1:jv+n-1)' * ITMP0 = ITMP0 + 1 IF( NQ-JJ+JJV.GT.0 ) THEN CALL CLACGV( NQ-JJ+JJV, V( II+(JJ-1)*LDV ), LDV ) CALL CGEMV( 'No transpose', ITMP0, NQ-JJ+JJV, $ -TAU(II), V( IIV+(JJ-1)*LDV ), LDV, $ V( II+(JJ-1)*LDV ), LDV, ZERO, $ WORK( IW ), 1 ) CALL CLACGV( NQ-JJ+JJV, V( II+(JJ-1)*LDV ), LDV ) ELSE CALL CLASET( 'All', ITMP0, 1, ZERO, ZERO, $ WORK( IW ), ITMP0 ) END IF * IW = IW + ITMP0 IF( MYCOL.EQ.MICOL ) THEN V( II+(JJ-1)*LDV ) = VII JJ = JJ + 1 END IF * IF( MOD( JV+ITMP0, DESCV( NB_ ) ).EQ.0 ) $ MICOL = MOD( MICOL+1, NPCOL ) * 50 CONTINUE * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = 1 * T( ITMP1 ) = TAU( IIV ) * DO 60 II = IIV+1, IIV+K-1 * * T(1:i-1,i) = T(1:i-1,1:i-1) * T(1:i-1,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 + DESCV( MB_ ) CALL CCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL CTRMV( 'Upper', 'No transpose', 'Non-unit', $ ITMP0, T, DESCV( MB_ ), T( ITMP1 ), 1 ) T( ITMP1+ITMP0 ) = TAU( II ) * 60 CONTINUE * END IF * ELSE * * DIRECT = 'Backward', STOREV = 'Rowwise' * NQ = NUMROC( N+ICOFF-1, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQ = NQ - ICOFF MICOL = INDXG2P( JV+N-2, DESCV( NB_ ), MYCOL, $ DESCV( CSRC_ ), NPCOL ) JJ = JJV + NQ - 1 ITMP0 = 0 * DO 70 II = IIV+K-2, IIV, -1 * IF( MYCOL.EQ.MICOL ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(i+1:k,i) = -tau( iv+i-1 ) * * V(iv+i:iv+k-1,jv:jv+n-k+i-1)' * V(iv+i-1,jv:jv+n-k+i-1)' * ITMP0 = ITMP0 + 1 IF( JJ-JJV+1.GT.0 ) THEN CALL CLACGV( JJ-JJV+1, V( II+(JJV-1)*LDV ), LDV ) CALL CGEMV( 'No transpose', ITMP0, JJ-JJV+1, $ -TAU( II ), V( II+1+(JJV-1)*LDV ), LDV, $ V( II+(JJV-1)*LDV ), LDV, ZERO, $ WORK( IW ), 1 ) CALL CLACGV( JJ-JJV+1, V( II+(JJV-1)*LDV ), LDV ) ELSE CALL CLASET( 'All', ITMP0, 1, ZERO, ZERO, $ WORK( IW ), ITMP0 ) END IF * IW = IW + ITMP0 IF( MYCOL.EQ.MICOL ) THEN V( II+(JJ-1)*LDV ) = VII JJ = JJ - 1 END IF * IF( MOD( JV+N-ITMP0-2, DESCV( NB_ ) ).EQ.0 ) $ MICOL = MOD( MICOL+NPCOL-1, NPCOL ) * 70 CONTINUE * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( MB_ ) * T( ITMP1-1 ) = TAU( IIV+K-1 ) * DO 80 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = T(i+1:k,i+1:k) * T(i+1:k,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( MB_ ) - 1 CALL CCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL CTRMV( 'Lower', 'No transpose', 'Non-unit', $ ITMP0, T( ITMP1+DESCV( MB_ ) ), $ DESCV( MB_ ), T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( II ) * 80 CONTINUE * END IF * END IF * END IF * RETURN * * End of PCLARFT * END scalapack-2.0.2/SRC/pclarz.f000644 000766 000024 00000102102 10363532303 015774 0ustar00juliestaff000000 000000 SUBROUTINE PCLARZ( SIDE, M, N, L, V, IV, JV, DESCV, INCV, TAU, C, $ IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PCLARZ applies a complex elementary reflector Q to a complex M-by-N * distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), from either the * left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a complex scalar and v is a complex vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Q is a product of k elementary reflectors as returned by PCTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also own C(IC+M-L,JC:JC+N-1). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC+N-L-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also own C(IC:IC+M-1,JC+N-L) and * MOD(JV-1,NB_V) must be equal to MOD(JC+N-L-1,NB_C), if INCV = 1 only * the last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q * sub( C ), * = 'R': form sub( C ) * Q. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+L-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+L-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) COMPLEX, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q * sub( C ) if SIDE = 'L', or * sub( C ) * Q if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK, LEFT CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIC1, IIC2, IIV, IOFFC1, $ IOFFC2, IOFFV, IPW, IROFFC1, IROFFC2, IROFFV, $ IVCOL, IVROW, JJC1, JJC2, JJV, LDC, LDV, MPC2, $ MPV, MYCOL, MYROW, NCC, NCV, NPCOL, NPROW, $ NQC2, NQV, RDEST COMPLEX TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CAXPY, CCOPY, CGEBR2D, $ CGEBS2D, CGEMV, CGERC, CGERV2D, $ CGESD2D, CGSUM2D, CLASET, INFOG2L, $ PB_TOPGET, PBCTRNV * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * LEFT = LSAME( SIDE, 'L' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) IROFFV = MOD( IV-1, DESCV( NB_ ) ) MPV = NUMROC( L+IROFFV, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ MPV = MPV - IROFFV ICOFFV = MOD( JV-1, DESCV( NB_ ) ) NQV = NUMROC( L+ICOFFV, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, NCV ) IOFFV = IIV+(JJV-1)*LDV NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC1, JJC1, ICROW1, ICCOL1 ) IROFFC1 = MOD( IC-1, DESCC( MB_ ) ) ICOFFC1 = MOD( JC-1, DESCC( NB_ ) ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NCC ) ) IOFFC1 = IIC1 + ( JJC1-1 ) * LDC * IF( LEFT ) THEN CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, DESCC( MB_ ) ) ICOFFC2 = MOD( JC-1, DESCC( NB_ ) ) NQC2 = NUMROC( N+ICOFFC2, DESCC( NB_ ), MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 ELSE CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC-1, DESCC( MB_ ) ) MPC2 = NUMROC( M+IROFFC2, DESCC( MB_ ), MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = MOD( JC+N-L-1, DESCC( NB_ ) ) END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NCC ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFFC1) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFFC1) ) * IF( LEFT ) THEN * IF( CRBLCK ) THEN RDEST = ICROW2 ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V (ICOFFV = IROFFC2) * IPW = MPV+1 CALL PBCTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL CGERC( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL2 ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * TAULOC = TAU( JJV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, V( IOFFV ), $ 1, ZERO, WORK, 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK, MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK, MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK, MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK, $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL CGERC( MPV, NQC2, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL2 * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL CCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL CGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL2 ) * ELSE IF( MYCOL.EQ.ICCOL2 ) THEN * IPW = MPV+1 CALL CGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), $ RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL CGERC( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V (ICOFFV=IROFFC2) * IPW = MPV+1 CALL PBCTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL CGERC( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL CCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL CGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( JJV ) * ELSE * IPW = MPV+1 CALL CGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL CGERC( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW2 ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW2 ) THEN * TAULOC = TAU( IIV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, V( IOFFV ), $ LDV, ZERO, WORK, 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK, MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK, 1 ) * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK, MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) * IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK, 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * IF( MPC2.GT.0 .AND. NQV.GT.0 ) $ CALL CGERC( MPC2, NQV, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC2 ), $ LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW2 * IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL CCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL CGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW2, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW2 ) THEN * IPW = NQV+1 CALL CGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), $ RDEST, ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V (IROFFV = ICOFFC2) * IPW = NQV+1 CALL PBCTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW2 ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = TAU( JJV ) * ELSE * CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL CCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL CGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( IIV ) * ELSE * IPW = NQV+1 CALL CGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No Transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V (ICOFFC2=IROFFV) * IPW = NQV+1 CALL PBCTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = TAU( JJV ) * ELSE * CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PCLARZ * END scalapack-2.0.2/SRC/pclarzb.f000644 000766 000024 00000056564 11750130340 016156 0ustar00juliestaff000000 000000 SUBROUTINE PCLARZB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, V, $ IV, JV, DESCV, T, C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER DIRECT, SIDE, STOREV, TRANS INTEGER IC, IV, JC, JV, K, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX C( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PCLARZB applies a complex block reflector Q or its conjugate * transpose Q**H to a complex M-by-N distributed matrix sub( C ) * denoting C(IC:IC+M-1,JC:JC+N-1), from the left or the right. * * Q is a product of k elementary reflectors as returned by PCTZRZF. * * Currently, only STOREV = 'R' and DIRECT = 'B' are supported. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * DIRECT (global input) CHARACTER * Indicates how H is formed from a product of elementary * reflectors * = 'F': H = H(1) H(2) . . . H(k) (Forward, not supported yet) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Indicates how the vectors which define the elementary * reflectors are stored: * = 'C': Columnwise (not supported yet) * = 'R': Rowwise * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The order of the matrix T (= the number of elementary * reflectors whose product defines the block reflector). * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_V, LOCc(JV+M-1)) if SIDE = 'L', * (LLD_V, LOCc(JV+N-1)) if SIDE = 'R'. It contains the local * pieces of the distributed vectors V representing the * Householder transformation as returned by PCTZRZF. * LLD_V >= LOCr(IV+K-1). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * T (local input) COMPLEX array, dimension MB_V by MB_V * The lower triangular matrix T in the representation of the * block reflector. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the M-by-N distributed matrix sub( C ). On exit, * sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) or * sub( C )*Q or sub( C )*Q'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * If STOREV = 'C', * if SIDE = 'L', * LWORK >= ( NqC0 + MpC0 ) * K * else if SIDE = 'R', * LWORK >= ( NqC0 + MAX( NpV0 + NUMROC( NUMROC( N+ICOFFC, * NB_V, 0, 0, NPCOL ), NB_V, 0, 0, LCMQ ), * MpC0 ) ) * K * end if * else if STOREV = 'R', * if SIDE = 'L', * LWORK >= ( MpC0 + MAX( MqV0 + NUMROC( NUMROC( M+IROFFC, * MB_V, 0, 0, NPROW ), MB_V, 0, 0, LCMP ), * NqC0 ) ) * K * else if SIDE = 'R', * LWORK >= ( MpC0 + NqC0 ) * K * end if * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFV = MOD( IV-1, MB_V ), ICOFFV = MOD( JV-1, NB_V ), * IVROW = INDXG2P( IV, MB_V, MYROW, RSRC_V, NPROW ), * IVCOL = INDXG2P( JV, NB_V, MYCOL, CSRC_V, NPCOL ), * MqV0 = NUMROC( M+ICOFFV, NB_V, MYCOL, IVCOL, NPCOL ), * NpV0 = NUMROC( N+IROFFV, MB_V, MYROW, IVROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NpC0 = NUMROC( N+ICOFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If STOREV = 'Columnwise' * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if STOREV = 'Rowwise' * If SIDE = 'Left', * ( NB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT CHARACTER COLBTOP, TRANST INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIBEG, IIC1, IIC2, $ IIEND, IINXT, IIV, ILEFT, INFO, IOFFC2, IOFFV, $ IPT, IPV, IPW, IROFFC1, IROFFC2, ITOP, IVCOL, $ IVROW, J, JJBEG, JJEND, JJNXT, JJC1, JJC2, JJV, $ LDC, LDV, LV, LW, MBC, MBV, MPC1, MPC2, MPC20, $ MQV, MQV0, MYCOL, MYDIST, MYROW, NBC, NBV, $ NPCOL, NPROW, NQC1, NQC2, NQCALL, NQV * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CGEBR2D, $ CGEBS2D, CGEMM, CGSUM2D, CLACGV, $ CLAMOV, CLASET, CTRBR2D, CTRBS2D, $ CTRMM, INFOG2L, PBCMATADD, PBCTRAN, $ PB_TOPGET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 .OR. K.LE.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check for currently supported options * INFO = 0 IF( .NOT.LSAME( DIRECT, 'B' ) ) THEN INFO = -3 ELSE IF( .NOT.LSAME( STOREV, 'R' ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCLARZB', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * LEFT = LSAME( SIDE, 'L' ) IF( LSAME( TRANS, 'N' ) ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) MBV = DESCV( MB_ ) NBV = DESCV( NB_ ) ICOFFV = MOD( JV-1, NBV ) NQV = NUMROC( L+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, MAX( 1, NUMROC( DESCV( N_ ), NBV, MYCOL, $ DESCV( CSRC_ ), NPCOL ) ) ) IOFFV = IIV + ( JJV-1 ) * LDV MBC = DESCC( MB_ ) NBC = DESCC( NB_ ) NQCALL = NUMROC( DESCC( N_ ), NBC, MYCOL, DESCC( CSRC_ ), NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC1, $ JJC1, ICROW1, ICCOL1 ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NQCALL ) ) * IF( LEFT ) THEN IROFFC1 = MOD( IC-1, MBC ) MPC1 = NUMROC( K+IROFFC1, MBC, MYROW, ICROW1, NPROW ) IF( MYROW.EQ.ICROW1 ) $ MPC1 = MPC1 - IROFFC1 ICOFFC1 = MOD( JC-1, NBC ) NQC1 = NUMROC( N+ICOFFC1, NBC, MYCOL, ICCOL1, NPCOL ) IF( MYCOL.EQ.ICCOL1 ) $ NQC1 = NQC1 - ICOFFC1 CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, MBC ) MPC2 = NUMROC( L+IROFFC2, MBC, MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = ICOFFC1 NQC2 = NQC1 ELSE IROFFC1 = MOD( IC-1, MBC ) MPC1 = NUMROC( M+IROFFC1, MBC, MYROW, ICROW1, NPROW ) IF( MYROW.EQ.ICROW1 ) $ MPC1 = MPC1 - IROFFC1 ICOFFC1 = MOD( JC-1, NBC ) NQC1 = NUMROC( K+ICOFFC1, NBC, MYCOL, ICCOL1, NPCOL ) IF( MYCOL.EQ.ICCOL1 ) $ NQC1 = NQC1 - ICOFFC1 CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = IROFFC1 MPC2 = MPC1 ICOFFC2 = MOD( JC+N-L-1, NBC ) NQC2 = NUMROC( L+ICOFFC2, NBC, MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NQCALL ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * IROFFC2 = ICOFFV is required by the current transposition * routine PBCTRAN * MQV0 = NUMROC( M+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN MQV = MQV0 - ICOFFV ELSE MQV = MQV0 END IF IF( MYROW.EQ.ICROW2 ) THEN MPC20 = MPC2 + IROFFC2 ELSE MPC20 = MPC2 END IF * * Locally V( IOFFV ) is K x MQV, C( IOFFC2 ) is MPC2 x NQC2 * WORK( IPV ) is MPC20 x K = [ . V( IOFFV ) ]' * WORK( IPW ) is K x MQV0 = [ . V( IOFFV ) ] * WORK( IPT ) is the workspace for PBCTRAN * IPV = 1 IPW = IPV + MPC20 * K IPT = IPW + K * MQV0 LV = MAX( 1, MPC20 ) LW = MAX( 1, K ) * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN CALL CLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW+ICOFFV*LW ), LW ) ELSE CALL CLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW ), LW ) END IF END IF * * WORK( IPV ) = WORK( IPW )' (replicated) is MPC20 x K * CALL PBCTRAN( ICTXT, 'Rowwise', 'Conjugate transpose', K, $ M+ICOFFV, DESCV( NB_ ), WORK( IPW ), LW, ZERO, $ WORK( IPV ), LV, IVROW, IVCOL, ICROW2, -1, $ WORK( IPT ) ) * * WORK( IPV ) = ( . V )' -> WORK( IPV ) = V' is MPC2 x K * IF( MYROW.EQ.ICROW2 ) $ IPV = IPV + IROFFC2 * * WORK( IPW ) becomes NQC2 x K = C( IOFFC2 )' * V' * WORK( IPW ) = C( IOFFC2 )' * V' (NQC2 x MPC2 x K) -> NQC2 x K * LW = MAX( 1, NQC2 ) * IF( MPC2.GT.0 ) THEN CALL CGEMM( 'Transpose', 'No transpose', NQC2, K, MPC2, $ ONE, C( IOFFC2 ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL CLASET( 'All', NQC2, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * * WORK( IPW ) = WORK( IPW ) + C1 ( NQC1 = NQC2 ) * IF( MPC1.GT.0 ) THEN MYDIST = MOD( MYROW-ICROW1+NPROW, NPROW ) ITOP = MAX( 0, MYDIST * MBC - IROFFC1 ) IIBEG = IIC1 IIEND = IIC1 + MPC1 - 1 IINXT = MIN( ICEIL( IIBEG, MBC ) * MBC, IIEND ) * 10 CONTINUE IF( IIBEG.LE.IINXT ) THEN CALL PBCMATADD( ICTXT, 'Transpose', NQC2, IINXT-IIBEG+1, $ ONE, C( IIBEG+(JJC1-1)*LDC ), LDC, ONE, $ WORK( IPW+ITOP ), LW ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBC - IROFFC1 IIBEG = IINXT +1 IINXT = MIN( IINXT+MBC, IIEND ) GO TO 10 END IF END IF * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN * * Broadcast the block reflector to the other columns. * CALL CTRBS2D( ICTXT, 'Rowwise', ' ', 'Lower', 'Non unit', $ K, K, T, MBV ) ELSE CALL CTRBR2D( ICTXT, 'Rowwise', ' ', 'Lower', 'Non unit', $ K, K, T, MBV, MYROW, IVCOL ) END IF CALL CTRMM( 'Right', 'Lower', TRANST, 'Non unit', NQC2, K, $ ONE, T, MBV, WORK( IPW ), LW ) * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', NQC2, K, $ WORK( IPW ), LW ) ELSE CALL CGEBR2D( ICTXT, 'Columnwise', ' ', NQC2, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C1 = C1 - WORK( IPW ) * IF( MPC1.GT.0 ) THEN MYDIST = MOD( MYROW-ICROW1+NPROW, NPROW ) ITOP = MAX( 0, MYDIST * MBC - IROFFC1 ) IIBEG = IIC1 IIEND = IIC1 + MPC1 - 1 IINXT = MIN( ICEIL( IIBEG, MBC ) * MBC, IIEND ) * 20 CONTINUE IF( IIBEG.LE.IINXT ) THEN CALL PBCMATADD( ICTXT, 'Transpose', IINXT-IIBEG+1, NQC2, $ -ONE, WORK( IPW+ITOP ), LW, ONE, $ C( IIBEG+(JJC1-1)*LDC ), LDC ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBC - IROFFC1 IIBEG = IINXT +1 IINXT = MIN( IINXT+MBC, IIEND ) GO TO 20 END IF END IF * * C2 C2 - V' * W' * C( IOFFC2 ) = C( IOFFC2 ) - WORK( IPV ) * WORK( IPW )' * MPC2 x NQC2 MPC2 x K K x NQC2 * DO 30 J = 1, K CALL CLACGV( MPC2, WORK( IPV+(J-1)*LV ), 1 ) 30 CONTINUE CALL CGEMM( 'No transpose', 'Transpose', MPC2, NQC2, K, -ONE, $ WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC2 ), LDC ) * ELSE * * Form sub( C ) * Q or sub( C ) * Q' * * Locally V( IOFFV ) is K x NQV, C( IOFFC2 ) is MPC2 x NQC2 * WORK( IPV ) is K x NQV = V( IOFFV ), NQV = NQC2 * WORK( IPW ) is MPC2 x K = C( IOFFC2 ) * V( IOFFV )' * IPV = 1 IPW = IPV + K * NQC2 LV = MAX( 1, K ) LW = MAX( 1, MPC2 ) * * Broadcast V to the other process rows. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) IF( MYROW.EQ.IVROW ) THEN CALL CGEBS2D( ICTXT, 'Columnwise', COLBTOP, K, NQC2, $ V( IOFFV ), LDV ) IF( MYCOL.EQ.IVCOL ) $ CALL CTRBS2D( ICTXT, 'Columnwise', COLBTOP, 'Lower', $ 'Non unit', K, K, T, MBV ) CALL CLAMOV( 'All', K, NQC2, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL CGEBR2D( ICTXT, 'Columnwise', COLBTOP, K, NQC2, $ WORK( IPV ), LV, IVROW, MYCOL ) IF( MYCOL.EQ.IVCOL ) $ CALL CTRBR2D( ICTXT, 'Columnwise', COLBTOP, 'Lower', $ 'Non unit', K, K, T, MBV, IVROW, MYCOL ) END IF * * WORK( IPV ) is K x NQC2 = V = V( IOFFV ) * WORK( IPW ) = C( IOFFC2 ) * V' (MPC2 x NQC2 x K) -> MPC2 x K * IF( NQC2.GT.0 ) THEN CALL CGEMM( 'No Transpose', 'Transpose', MPC2, K, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL CLASET( 'All', MPC2, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * * WORK( IPW ) = WORK( IPW ) + C1 ( MPC1 = MPC2 ) * IF( NQC1.GT.0 ) THEN MYDIST = MOD( MYCOL-ICCOL1+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBC - ICOFFC1 ) JJBEG = JJC1 JJEND = JJC1 + NQC1 - 1 JJNXT = MIN( ICEIL( JJBEG, NBC ) * NBC, JJEND ) * 40 CONTINUE IF( JJBEG.LE.JJNXT ) THEN CALL PBCMATADD( ICTXT, 'No transpose', MPC2, $ JJNXT-JJBEG+1, ONE, $ C( IIC1+(JJBEG-1)*LDC ), LDC, ONE, $ WORK( IPW+ILEFT*LW ), LW ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBC - ICOFFC1 JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBC, JJEND ) GO TO 40 END IF END IF * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN DO 50 J = 1, K CALL CLACGV( K-J+1, T( J+(J-1)*MBV ), 1 ) 50 CONTINUE CALL CTRMM( 'Right', 'Lower', TRANS, 'Non unit', MPC2, K, $ ONE, T, MBV, WORK( IPW ), LW ) CALL CGEBS2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW ) DO 60 J = 1, K CALL CLACGV( K-J+1, T( J+(J-1)*MBV ), 1 ) 60 CONTINUE ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C1 = C1 - WORK( IPW ) * IF( NQC1.GT.0 ) THEN MYDIST = MOD( MYCOL-ICCOL1+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBC - ICOFFC1 ) JJBEG = JJC1 JJEND = JJC1 + NQC1 - 1 JJNXT = MIN( ICEIL( JJBEG, NBC ) * NBC, JJEND ) * 70 CONTINUE IF( JJBEG.LE.JJNXT ) THEN CALL PBCMATADD( ICTXT, 'No transpose', MPC2, $ JJNXT-JJBEG+1, -ONE, $ WORK( IPW+ILEFT*LW ), LW, ONE, $ C( IIC1+(JJBEG-1)*LDC ), LDC ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBC - ICOFFC1 JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBC, JJEND ) GO TO 70 END IF END IF * * C2 C2 - W * conjg( V ) * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * conjg( WORK( IPV ) ) * MPC2 x NQC2 MPC2 x K K x NQC2 * DO 80 J = 1, NQC2 CALL CLACGV( K, WORK( IPV+(J-1)*LV ), 1 ) 80 CONTINUE IF( IOFFC2.GT.0 ) $ CALL CGEMM( 'No transpose', 'No transpose', MPC2, NQC2, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC2 ), LDC ) * END IF * RETURN * * End of PCLARZB * END scalapack-2.0.2/SRC/pclarzc.f000644 000766 000024 00000102421 10363532303 016143 0ustar00juliestaff000000 000000 SUBROUTINE PCLARZC( SIDE, M, N, L, V, IV, JV, DESCV, INCV, TAU, C, $ IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PCLARZC applies a complex elementary reflector Q**H to a * complex M-by-N distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), * from either the left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a complex scalar and v is a complex vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Q is a product of k elementary reflectors as returned by PCTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also own C(IC+M-L,JC:JC+N-1). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC+N-L-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also own C(IC:IC+M-1,JC+N-L) and * MOD(JV-1,NB_V) must be equal to MOD(JC+N-L-1,NB_C), if INCV = 1 only * the last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q**H * sub( C ), * = 'R': form sub( C ) * Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+L-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+L-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) COMPLEX, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q**H * sub( C ) if SIDE = 'L', or * sub( C ) * Q**H if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK, LEFT CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIC1, IIC2, IIV, IOFFC1, $ IOFFC2, IOFFV, IPW, IROFFC1, IROFFC2, IROFFV, $ IVCOL, IVROW, JJC1, JJC2, JJV, LDC, LDV, MPC2, $ MPV, MYCOL, MYROW, NCC, NCV, NPCOL, NPROW, $ NQC2, NQV, RDEST COMPLEX TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CAXPY, CCOPY, CGEBR2D, $ CGEBS2D, CGEMV, CGERC, CGERV2D, $ CGESD2D, CGSUM2D, CLASET, INFOG2L, $ PB_TOPGET, PBCTRNV * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * LEFT = LSAME( SIDE, 'L' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) IROFFV = MOD( IV-1, DESCV( NB_ ) ) MPV = NUMROC( L+IROFFV, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ MPV = MPV - IROFFV ICOFFV = MOD( JV-1, DESCV( NB_ ) ) NQV = NUMROC( L+ICOFFV, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, NCV ) IOFFV = IIV+(JJV-1)*LDV NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC1, JJC1, ICROW1, ICCOL1 ) IROFFC1 = MOD( IC-1, DESCC( MB_ ) ) ICOFFC1 = MOD( JC-1, DESCC( NB_ ) ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NCC ) ) IOFFC1 = IIC1 + ( JJC1-1 ) * LDC * IF( LEFT ) THEN CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, DESCC( MB_ ) ) ICOFFC2 = MOD( JC-1, DESCC( NB_ ) ) NQC2 = NUMROC( N+ICOFFC2, DESCC( NB_ ), MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 ELSE CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC-1, DESCC( MB_ ) ) MPC2 = NUMROC( M+IROFFC2, DESCC( MB_ ), MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = MOD( JC+N-L-1, DESCC( NB_ ) ) END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NCC ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFFC1) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFFC1) ) * IF( LEFT ) THEN * IF( CRBLCK ) THEN RDEST = ICROW2 ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V (ICOFFV = IROFFC2) * IPW = MPV+1 CALL PBCTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = CONJG( TAU( IIV ) ) * ELSE * CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) TAULOC = CONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL CGERC( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL2 ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * TAULOC = CONJG( TAU( JJV ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, V( IOFFV ), $ 1, ZERO, WORK, 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK, MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK, MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK, MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK, $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL CGERC( MPV, NQC2, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL2 * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL CCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL CGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL2 ) * ELSE IF( MYCOL.EQ.ICCOL2 ) THEN * IPW = MPV+1 CALL CGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = CONJG( WORK( IPW ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), $ RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL CGERC( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V (ICOFFV=IROFFC2) * IPW = MPV+1 CALL PBCTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL CGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = CONJG( TAU( IIV ) ) * ELSE * CALL CGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) TAULOC = CONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL CGERC( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL CCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL CGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = CONJG( TAU( JJV ) ) * ELSE * IPW = MPV+1 CALL CGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = CONJG( WORK( IPW ) ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL CGEMV( 'Conjugate transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL CGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL CAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL CGERC( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW2 ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW2 ) THEN * TAULOC = CONJG( TAU( IIV ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, V( IOFFV ), $ LDV, ZERO, WORK, 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK, MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK, 1 ) * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK, MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) * IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK, 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MPC2, NQV, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW2 * IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL CCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL CGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW2, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW2 ) THEN * IPW = NQV+1 CALL CGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = CONJG( WORK( IPW ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), $ RDEST, ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V (IROFFV = ICOFFC2) * IPW = NQV+1 CALL PBCTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW2 ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = CONJG( TAU( JJV ) ) * ELSE * CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) TAULOC = CONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL CCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL CGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = CONJG( TAU( IIV ) ) * ELSE * IPW = NQV+1 CALL CGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = CONJG( WORK( IPW ) ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No Transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V (ICOFFC2=IROFFV) * IPW = NQV+1 CALL PBCTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = CONJG( TAU( JJV ) ) * ELSE * CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) TAULOC = CONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL CGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL CLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL CGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL CAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL CGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PCLARZC * END scalapack-2.0.2/SRC/pclarzt.f000644 000766 000024 00000026071 10363532303 016172 0ustar00juliestaff000000 000000 SUBROUTINE PCLARZT( DIRECT, STOREV, N, K, V, IV, JV, DESCV, TAU, $ T, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIRECT, STOREV INTEGER IV, JV, K, N * .. * .. Array Arguments .. INTEGER DESCV( * ) COMPLEX TAU( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PCLARZT forms the triangular factor T of a complex block reflector * H of order > n, which is defined as a product of k elementary * reflectors as returned by PCTZRZF. * * If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; * * If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. * * If STOREV = 'C', the vector which defines the elementary reflector * H(i) is stored in the i-th column of the array V, and * * H = I - V * T * V' * * If STOREV = 'R', the vector which defines the elementary reflector * H(i) is stored in the i-th row of the array V, and * * H = I - V' * T * V * * Currently, only STOREV = 'R' and DIRECT = 'B' are supported. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIRECT (global input) CHARACTER * Specifies the order in which the elementary reflectors are * multiplied to form the block reflector: * = 'F': H = H(1) H(2) . . . H(k) (Forward, not supported yet) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Specifies how the vectors which define the elementary * reflectors are stored (see also Further Details): * = 'C': columnwise (not supported yet) * = 'R': rowwise * * N (global input) INTEGER * The number of meaningful entries of the block reflector H. * N >= 0. * * K (global input) INTEGER * The order of the triangular factor T (= the number of * elementary reflectors). 1 <= K <= MB_V (= NB_V). * * V (input/output) COMPLEX pointer into the local memory * to an array of local dimension (LOCr(IV+K-1),LOCc(JV+N-1)). * The distributed matrix V contains the Householder vectors. * See further details. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * TAU (local input) COMPLEX, array, dimension LOCr(IV+K-1) * if INCV = M_V, and LOCc(JV+K-1) otherwise. This array * contains the Householder scalars related to the Householder * vectors. TAU is tied to the distributed matrix V. * * T (local output) COMPLEX array, dimension (MB_V,MB_V) * It contains the k-by-k triangular factor of the block * reflector associated with V. T is lower triangular. * * WORK (local workspace) COMPLEX array, * dimension (K*(K-1)/2) * * Further Details * =============== * * The shape of the matrix V and the storage of the vectors which define * the H(i) is best illustrated by the following example with n = 5 and * k = 3. The elements equal to 1 are not stored; the corresponding * array elements are modified but restored on exit. The rest of the * array is not used. * * DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': * * ______V_____ * ( v1 v2 v3 ) / \ * ( v1 v2 v3 ) ( v1 v1 v1 v1 v1 . . . . 1 ) * V = ( v1 v2 v3 ) ( v2 v2 v2 v2 v2 . . . 1 ) * ( v1 v2 v3 ) ( v3 v3 v3 v3 v3 . . 1 ) * ( v1 v2 v3 ) * . . . * . . . * 1 . . * 1 . * 1 * * DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': * * ______V_____ * 1 / \ * . 1 ( 1 . . . . v1 v1 v1 v1 v1 ) * . . 1 ( . 1 . . . v2 v2 v2 v2 v2 ) * . . . ( . . 1 . . v3 v3 v3 v3 v3 ) * . . . * ( v1 v2 v3 ) * ( v1 v2 v3 ) * V = ( v1 v2 v3 ) * ( v1 v2 v3 ) * ( v1 v2 v3 ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER ICOFF, ICTXT, II, IIV, INFO, IVCOL, IVROW, $ ITMP0, ITMP1, IW, JJV, LDV, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CCOPY, CGEMV, $ CGSUM2D, CLACGV, CLASET, CTRMV, $ INFOG2L, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCV( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check for currently supported options * INFO = 0 IF( .NOT.LSAME( DIRECT, 'B' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( STOREV, 'R' ) ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCLARZT', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, $ IIV, JJV, IVROW, IVCOL ) * IF( MYROW.EQ.IVROW ) THEN IW = 1 ITMP0 = 0 LDV = DESCV( LLD_ ) ICOFF = MOD( JV-1, DESCV( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQ = NQ - ICOFF * DO 10 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = -tau( iv+i-1 ) * * V(iv+i:iv+k-1,jv:jv+n-1) * V(iv+i-1,jv:jv+n-1)' * ITMP0 = ITMP0 + 1 IF( NQ.GT.0 ) THEN CALL CLACGV( NQ, V( II+(JJV-1)*LDV ), LDV ) CALL CGEMV( 'No transpose', ITMP0, NQ, -TAU( II ), $ V( II+1+(JJV-1)*LDV ), LDV, $ V( II+(JJV-1)*LDV ), LDV, ZERO, WORK( IW ), $ 1 ) CALL CLACGV( NQ, V( II+(JJV-1)*LDV ), LDV ) ELSE CALL CLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF IW = IW + ITMP0 * 10 CONTINUE * CALL CGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( MB_ ) * T( ITMP1-1 ) = TAU( IIV+K-1 ) * DO 20 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = T(i+1:k,i+1:k) * T(i+1:k,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( MB_ ) - 1 CALL CCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL CTRMV( 'Lower', 'No transpose', 'Non-unit', ITMP0, $ T( ITMP1+DESCV( MB_ ) ), DESCV( MB_ ), $ T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( II ) * 20 CONTINUE * END IF * END IF * RETURN * * End of PCLARZT * END scalapack-2.0.2/SRC/pclascl.f000644 000766 000024 00000043010 11556766441 016145 0ustar00juliestaff000000 000000 SUBROUTINE PCLASCL( TYPE, CFROM, CTO, M, N, A, IA, JA, DESCA, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TYPE INTEGER IA, INFO, JA, M, N REAL CFROM, CTO * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLASCL multiplies the M-by-N complex distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1) by the real scalar CTO/CFROM. This * is done without over/underflow as long as the final result * CTO * A(I,J) / CFROM does not over/underflow. TYPE specifies that * sub( A ) may be full, upper triangular, lower triangular or upper * Hessenberg. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TYPE (global input) CHARACTER * TYPE indices the storage type of the input distributed * matrix. * = 'G': sub( A ) is a full matrix, * = 'L': sub( A ) is a lower triangular matrix, * = 'U': sub( A ) is an upper triangular matrix, * = 'H': sub( A ) is an upper Hessenberg matrix. * * CFROM (global input) REAL * CTO (global input) REAL * The distributed matrix sub( A ) is multiplied by CTO/CFROM. * A(I,J) is computed without over/underflow if the final * result CTO * A(I,J) / CFROM can be represented without * over/underflow. CFROM must be nonzero. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * matrix sub( A ). On exit, this array contains the local * pieces of the distributed matrix multiplied by CTO/CFROM. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) * .. * .. Local Scalars .. LOGICAL DONE INTEGER IACOL, IAROW, ICOFFA, ICTXT, ICURCOL, ICURROW, $ IIA, II, INXTROW, IOFFA, IROFFA, ITYPE, J, JB, $ JJA, JJ, JN, KK, LDA, LL, MYCOL, MYROW, MP, $ NPCOL, NPROW, NQ REAL BIGNUM, CFROM1, CFROMC, CTO1, CTOC, MUL, SMLNUM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, INFOG2L, PXERBLA * .. * .. External Functions .. LOGICAL LSAME, SISNAN INTEGER ICEIL, NUMROC REAL PSLAMCH EXTERNAL SISNAN, ICEIL, LSAME, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * IF( NPROW.EQ.-1 ) THEN INFO = -907 ELSE INFO = 0 CALL CHK1MAT( M, 4, N, 6, IA, JA, DESCA, 9, INFO ) IF( INFO.EQ.0 ) THEN IF( LSAME( TYPE, 'G' ) ) THEN ITYPE = 0 ELSE IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 ELSE ITYPE = -1 END IF IF( ITYPE.EQ.-1 ) THEN INFO = -1 ELSE IF( CFROM.EQ.ZERO .OR. SISNAN(CFROM) ) THEN INFO = -4 ELSE IF( SISNAN(CTO) ) THEN INFO = -5 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCLASCL', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) $ RETURN * * Get machine parameters * SMLNUM = PSLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM * CFROMC = CFROM CTOC = CTO * * Compute local indexes * LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * 10 CONTINUE CFROM1 = CFROMC*SMLNUM IF( CFROM1.EQ.CFROMC ) THEN ! CFROMC is an inf. Multiply by a correctly signed zero for ! finite CTOC, or a NaN if CTOC is infinite. MUL = CTOC / CFROMC DONE = .TRUE. CTO1 = CTOC ELSE CTO1 = CTOC / BIGNUM IF( CTO1.EQ.CTOC ) THEN ! CTOC is either 0 or an inf. In both cases, CTOC itself ! serves as the correct multiplication factor. MUL = CTOC DONE = .TRUE. CFROMC = ONE ELSE IF( ABS( CFROM1 ).GT.ABS( CTOC ) .AND. CTOC.NE.ZERO ) THEN MUL = SMLNUM DONE = .FALSE. CFROMC = CFROM1 ELSE IF( ABS( CTO1 ).GT.ABS( CFROMC ) ) THEN MUL = BIGNUM DONE = .FALSE. CTOC = CTO1 ELSE MUL = CTOC / CFROMC DONE = .TRUE. END IF END IF * IOFFA = ( JJA - 1 ) * LDA ICURROW = IAROW ICURCOL = IACOL * IF( ITYPE.EQ.0 ) THEN * * Full matrix * DO 30 JJ = JJA, JJA+NQ-1 DO 20 II = IIA, IIA+MP-1 A( IOFFA+II ) = A( IOFFA+II ) * MUL 20 CONTINUE IOFFA = IOFFA + LDA 30 CONTINUE * ELSE IF( ITYPE.EQ.1 ) THEN * * Lower triangular matrix * II = IIA JJ = JJA JB = JN-JA+1 * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 50 LL = JJ, JJ + JB -1 DO 40 KK = II+LL-JJ, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 40 CONTINUE IOFFA = IOFFA + LDA 50 CONTINUE ELSE DO 70 LL = JJ, JJ + JB -1 DO 60 KK = II, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 60 CONTINUE IOFFA = IOFFA + LDA 70 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 120 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 90 LL = JJ, JJ + JB -1 DO 80 KK = II+LL-JJ, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE ELSE DO 110 LL = JJ, JJ + JB -1 DO 100 KK = II, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 120 CONTINUE * ELSE IF( ITYPE.EQ.2 ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JB = JN-JA+1 * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 140 LL = JJ, JJ + JB -1 DO 130 KK = IIA, MIN(II+LL-JJ,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 130 CONTINUE IOFFA = IOFFA + LDA 140 CONTINUE ELSE DO 160 LL = JJ, JJ + JB -1 DO 150 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 150 CONTINUE IOFFA = IOFFA + LDA 160 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 210 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 180 LL = JJ, JJ + JB -1 DO 170 KK = IIA, MIN(II+LL-JJ,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 170 CONTINUE IOFFA = IOFFA + LDA 180 CONTINUE ELSE DO 200 LL = JJ, JJ + JB -1 DO 190 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 190 CONTINUE IOFFA = IOFFA + LDA 200 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 210 CONTINUE * ELSE IF( ITYPE.EQ.3 ) THEN * * Upper Hessenberg matrix * II = IIA JJ = JJA JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.ICURCOL ) THEN DO 230 LL = JJ, JJ+JB-1 DO 220 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 220 CONTINUE IOFFA = IOFFA + LDA 230 CONTINUE JJ = JJ + JB END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 260 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 250 LL = JJ, JJ+JB-1 DO 240 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 240 CONTINUE IOFFA = IOFFA + LDA 250 CONTINUE JJ = JJ + JB END IF * II = II + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) * 260 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( ICURROW+1, NPROW ) IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 280 LL = JJ, JJ + JB -1 DO 270 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 270 CONTINUE IOFFA = IOFFA + LDA 280 CONTINUE ELSE DO 300 LL = JJ, JJ + JB -1 DO 290 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 290 CONTINUE IOFFA = IOFFA + LDA 300 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+MP-1 ) $ A( II+(JJ+JB-2)*LDA ) = A( II+(JJ+JB-2)*LDA ) * MUL END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = INXTROW ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 350 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 320 LL = JJ, JJ + JB -1 DO 310 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 310 CONTINUE IOFFA = IOFFA + LDA 320 CONTINUE ELSE DO 340 LL = JJ, JJ + JB -1 DO 330 KK = IIA, MIN( II-1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 330 CONTINUE IOFFA = IOFFA + LDA 340 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+MP-1 ) $ A( II+(JJ+JB-2)*LDA ) = A( II+(JJ+JB-2)*LDA ) * $ MUL END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = INXTROW ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 350 CONTINUE * END IF * END IF * IF( .NOT.DONE ) $ GO TO 10 * RETURN * * End of PCLASCL * END scalapack-2.0.2/SRC/pclase2.f000644 000766 000024 00000037363 10363532303 016052 0ustar00juliestaff000000 000000 SUBROUTINE PCLASE2( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLASE2 initializes an M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1) to BETA on the diagonal and ALPHA on the * offdiagonals. PCLASE2 requires that only dimension of the matrix * operand is distributed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * set: * = 'U': Upper triangular part is set; the strictly lower * triangular part of sub( A ) is not changed; * = 'L': Lower triangular part is set; the strictly upper * triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * ALPHA (global input) COMPLEX * The constant to which the offdiagonal elements are to be * set. * * BETA (global input) COMPLEX * The constant to which the diagonal elements are to be set. * * A (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ) * to be set. On exit, the leading M-by-N submatrix sub( A ) * is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER HEIGHT, IACOL, IAROW, IBASE, ICOFFA, II, IIA, $ IIBEG, IIEND, IINXT, ILEFT, IRIGHT, IROFFA, $ ITOP, JJ, JJA, JJBEG, JJEND, JJNXT, LDA, MBA, $ MP, MPA, MYCOL, MYDIST, MYROW, NBA, NPCOL, $ NPROW, NQ, NQA, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CLASET, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, MBA ) ICOFFA = MOD( JA-1, NBA ) * IF( N.LE.( NBA-ICOFFA ) ) THEN * * It is assumed that the local columns JJA:JJA+N-1 of the matrix * A are in the same process column (IACOL). * * N * JJA JJA+N-1 * / --------------------- \ * IROFFA| | | | * \ |...................| | ( IAROW ) * IIA |x | | MB_A * | x | | * |--x----------------| / * | x | * | x | ITOP * | x | | * | x | /-------\ * |-------x-----------| |-------x-----------| * | x | | x | * | x | | x | * | x | | x | * | x | | x | * |------------x------| |------------x------| * | x | \____________/ * | x | | * | x | IBASE * | x | * |-----------------x-| Local picture * | x| * | | * | | * | | * |-------------------| * | | * . . * . . * . (IACOL) . * IF( MYCOL.EQ.IACOL ) THEN * MPA = NUMROC( M+IROFFA, MBA, MYROW, IAROW, NPROW ) IF( MPA.LE.0 ) $ RETURN IF( MYROW.EQ.IAROW ) $ MPA = MPA - IROFFA MYDIST = MOD( MYROW-IAROW+NPROW, NPROW ) ITOP = MYDIST * MBA - IROFFA * IF( LSAME( UPLO, 'U' ) ) THEN * ITOP = MAX( 0, ITOP ) IIBEG = IIA IIEND = IIA + MPA - 1 IINXT = MIN( ICEIL( IIBEG, MBA ) * MBA, IIEND ) * 10 CONTINUE IF( ( N-ITOP ).GT.0 ) THEN CALL CLASET( UPLO, IINXT-IIBEG+1, N-ITOP, ALPHA, BETA, $ A( IIBEG+(JJA+ITOP-1)*LDA ), LDA ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IIBEG = IINXT +1 IINXT = MIN( IINXT+MBA, IIEND ) GO TO 10 END IF * ELSE IF( LSAME( UPLO, 'L' ) ) THEN * II = IIA JJ = JJA MP = MPA IBASE = MIN( ITOP+MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 20 CONTINUE IF( JJ.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL CLASET( 'All', MP, ITOP-JJ+JJA, ALPHA, ALPHA, $ A( II+(JJ-1)*LDA ), LDA ) CALL CLASET( UPLO, MP, HEIGHT, ALPHA, BETA, $ A( II+(JJA+ITOP-1)*LDA ), LDA ) MP = MAX( 0, MP - HEIGHT ) II = II + HEIGHT JJ = JJA + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 20 END IF * ELSE * II = IIA JJ = JJA MP = MPA IBASE = MIN( ITOP+MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 30 CONTINUE IF( JJ.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL CLASET( 'All', MPA, ITOP-JJ+JJA, ALPHA, ALPHA, $ A( IIA+(JJ-1)*LDA ), LDA ) CALL CLASET( 'All', MPA-MP, HEIGHT, ALPHA, ALPHA, $ A( IIA+(JJA+ITOP-1)*LDA ), LDA ) CALL CLASET( 'All', MP, HEIGHT, ALPHA, BETA, $ A( II+(JJA+ITOP-1)*LDA ), LDA ) MP = MAX( 0, MP - HEIGHT ) II = II + HEIGHT JJ = JJA + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 30 END IF * END IF * END IF * ELSE IF( M.LE.( MBA-IROFFA ) ) THEN * * It is assumed that the local rows IIA:IIA+M-1 of the matrix A * are in the same process row (IAROW). * * ICOFFA * / \JJA * IIA ------------------ .... -------- * | .x | | | / | | \ * | . x | | | ILEFT| | | | * | . x | | | | | | * | . x | | \ x | | * | . |x | | |x | | IRIGHT * | . | x | | | x | | * (IAROW) | . | x | | | x | | * | . | x| | | x| | * | . | x | | x / * | . | |x | | | * | . | | x | | | * | . | | x | | | * | . | | x| | | * IIA+M-1 ------------------ .... ------- * NB_A * (IACOL) Local picture * IF( MYROW.EQ.IAROW ) THEN * NQA = NUMROC( N+ICOFFA, NBA, MYCOL, IACOL, NPCOL ) IF( NQA.LE.0 ) $ RETURN IF( MYCOL.EQ.IACOL ) $ NQA = NQA - ICOFFA MYDIST = MOD( MYCOL-IACOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBA - ICOFFA * IF( LSAME( UPLO, 'L' ) ) THEN * ILEFT = MAX( 0, ILEFT ) JJBEG = JJA JJEND = JJA + NQA - 1 JJNXT = MIN( ICEIL( JJBEG, NBA ) * NBA, JJEND ) * 40 CONTINUE IF( ( M-ILEFT ).GT.0 ) THEN CALL CLASET( UPLO, M-ILEFT, JJNXT-JJBEG+1, ALPHA, $ BETA, A( IIA+ILEFT+(JJBEG-1)*LDA ), LDA ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBA, JJEND ) GO TO 40 END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * II = IIA JJ = JJA NQ = NQA IRIGHT = MIN( ILEFT+NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 50 CONTINUE IF( II.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL CLASET( 'All', ILEFT-II+IIA, NQ, ALPHA, ALPHA, $ A( II+(JJ-1)*LDA ), LDA ) CALL CLASET( UPLO, WIDE, NQ, ALPHA, BETA, $ A( IIA+ILEFT+(JJ-1)*LDA ), LDA ) NQ = MAX( 0, NQ - WIDE ) II = IIA + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 50 END IF * ELSE * II = IIA JJ = JJA NQ = NQA IRIGHT = MIN( ILEFT+NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 60 CONTINUE IF( II.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL CLASET( 'All', ILEFT-II+IIA, NQA, ALPHA, ALPHA, $ A( II+(JJA-1)*LDA ), LDA ) CALL CLASET( 'All', WIDE, NQA-NQ, ALPHA, ALPHA, $ A( IIA+ILEFT+(JJA-1)*LDA ), LDA ) CALL CLASET( 'All', WIDE, NQ, ALPHA, BETA, $ A( IIA+ILEFT+(JJ-1)*LDA ), LDA ) NQ = MAX( 0, NQ - WIDE ) II = IIA + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 60 END IF * END IF * END IF * END IF * RETURN * * End of PCLASE2 * END scalapack-2.0.2/SRC/pclaset.f000644 000766 000024 00000021700 10363532303 016140 0ustar00juliestaff000000 000000 SUBROUTINE PCLASET( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLASET initializes an M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1) to BETA on the diagonal and ALPHA on the * offdiagonals. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * set: * = 'U': Upper triangular part is set; the strictly lower * triangular part of sub( A ) is not changed; * = 'L': Lower triangular part is set; the strictly upper * triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * ALPHA (global input) COMPLEX * The constant to which the offdiagonal elements are to be * set. * * BETA (global input) COMPLEX * The constant to which the diagonal elements are to be set. * * A (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ) * to be set. On exit, the leading M-by-N submatrix sub( A ) * is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, IAA, IBLK, IN, ITMP, J, JAA, JBLK, JN, JTMP * .. * .. External Subroutines .. EXTERNAL PCLASE2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( M.LE.( DESCA( MB_ ) - MOD( IA-1, DESCA( MB_ ) ) ) .OR. $ N.LE.( DESCA( NB_ ) - MOD( JA-1, DESCA( NB_ ) ) ) ) THEN CALL PCLASE2( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) ELSE * IF( LSAME( UPLO, 'U' ) ) THEN IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) CALL PCLASE2( UPLO, IN-IA+1, N, ALPHA, BETA, A, IA, JA, $ DESCA ) DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) JAA = JA + ITMP CALL PCLASE2( UPLO, IBLK, N-ITMP, ALPHA, BETA, $ A, I, JAA, DESCA ) 10 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) CALL PCLASE2( UPLO, M, JN-JA+1, ALPHA, BETA, A, IA, JA, $ DESCA ) DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) IAA = IA + JTMP CALL PCLASE2( UPLO, M-JTMP, JBLK, ALPHA, BETA, A, IAA, $ J, DESCA ) 20 CONTINUE ELSE IF( M.LE.N ) THEN IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), $ IA+M-1 ) CALL PCLASE2( UPLO, IN-IA+1, N, ALPHA, BETA, A, IA, $ JA, DESCA ) DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) CALL PCLASE2( UPLO, IBLK, I-IA, ALPHA, ALPHA, A, I, $ JA, DESCA ) CALL PCLASE2( UPLO, IBLK, N-I+IA, ALPHA, BETA, A, I, $ JA+I-IA, DESCA ) 30 CONTINUE ELSE JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), $ JA+N-1 ) CALL PCLASE2( UPLO, M, JN-JA+1, ALPHA, BETA, A, IA, $ JA, DESCA ) DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) CALL PCLASE2( UPLO, J-JA, JBLK, ALPHA, ALPHA, A, IA, $ J, DESCA ) CALL PCLASE2( UPLO, M-J+JA, JBLK, ALPHA, BETA, A, $ IA+J-JA, J, DESCA ) 40 CONTINUE END IF END IF * END IF * RETURN * * End of PCLASET * END scalapack-2.0.2/SRC/pclasmsub.f000644 000766 000024 00000031615 10602576752 016520 0ustar00juliestaff000000 000000 SUBROUTINE PCLASMSUB( A, DESCA, I, L, K, SMLNUM, BUF, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. INTEGER I, K, L, LWORK REAL SMLNUM * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), BUF( * ) * .. * * Purpose * ======= * * PCLASMSUB looks for a small subdiagonal element from the bottom * of the matrix that it can safely set to zero. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) COMPLEX array, dimension (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * K (global output) INTEGER * On exit, this yields the bottom portion of the unreduced * submatrix. This will satisfy: L <= M <= I-1. * * SMLNUM (global input) REAL * On entry, a "small number" for the given matrix. * Unchanged on exit. * * BUF (local output) COMPLEX array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 2*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * This code is basically a parallelization of the following snip * of LAPACK code from CLAHQR: * * Look for a single small subdiagonal element. * * DO 20 K = I, L + 1, -1 * TST1 = CABS1( H( K-1, K-1 ) ) + CABS1( H( K, K ) ) * IF( TST1.EQ.ZERO ) * $ TST1 = CLANHS( '1', I-L+1, H( L, L ), LDH, WORK ) * IF( CABS1( H( K, K-1 ) ).LE.MAX( ULP*TST1, SMLNUM ) ) * $ GO TO 30 * 20 CONTINUE * 30 CONTINUE * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IBUF1, IBUF2, ICOL1, ICOL2, $ II, III, IRCV1, IRCV2, IROW1, IROW2, ISRC, $ ISTR1, ISTR2, ITMP1, ITMP2, JJ, JJJ, JSRC, LDA, $ LEFT, MODKM1, MYCOL, MYROW, NPCOL, NPROW, NUM, $ RIGHT, UP REAL TST1, ULP COMPLEX CDUM, H10, H11, H22 * .. * .. External Functions .. INTEGER ILCM, NUMROC REAL PSLAMCH EXTERNAL ILCM, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, INFOG1L, INFOG2L, $ CGERV2D, CGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, AIMAG, MAX, MOD * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) ULP = PSLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 STARTS AT BUF(ISTR1+1) AND WILL CONTAINS IBUF1 ELEMENTS * BUFFER2 STARTS AT BUF(ISTR2+1) AND WILL CONTAINS IBUF2 ELEMENTS * ISTR1 = 0 ISTR2 = ( ( I-L ) / HBL ) IF( ISTR2*HBL.LT.( I-L ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.2*ISTR2 ) THEN * * Error! * RETURN END IF CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * COPY OUR RELEVANT PIECES OF TRIADIAGONAL THAT WE OWE INTO * 2 BUFFERS TO SEND TO WHOMEVER OWNS H(K,K) AS K MOVES DIAGONALLY * UP THE TRIDIAGONAL * IBUF1 = 0 IBUF2 = 0 IRCV1 = 0 IRCV2 = 0 DO 10 K = I, L + 1, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K-1,K-1) AND SEND IT DIAGONAL DOWN * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K ,K-1) AND SEND IT RIGHT * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( K, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * ADD UP THE RECEIVES * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT. $ 1 ) ) ) THEN * * WE MUST RECEIVE H(K-1,K-1) FROM DIAGONAL UP * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) ) THEN * * WE MUST RECEIVE H(K ,K-1) FROM LEFT * IRCV2 = IRCV2 + 1 END IF END IF * * POSSIBLY CHANGE OWNERS (OCCURS ONLY WHEN MOD(K-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * SEND DATA ON TO THE APPROPRIATE NODE IF THERE IS ANY DATA TO SEND * IF( IBUF1.GT.0 ) THEN CALL CGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL CGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, MYROW, $ RIGHT ) END IF * * RECEIVE APPROPRIATE DATA IF THERE IS ANY * IF( IRCV1.GT.0 ) THEN CALL CGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL CGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, MYROW, $ LEFT ) END IF * * START MAIN LOOP * IBUF1 = 0 IBUF2 = 0 CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * LOOK FOR A SINGLE SMALL SUBDIAGONAL ELEMENT. * * Start loop for subdiagonal search * DO 40 K = I, L + 1, -1 IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN * * Grab information from WORK array * IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H11 = BUF( ISTR1+IBUF1 ) ELSE H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF2 = IBUF2 + 1 H10 = BUF( ISTR2+IBUF2 ) ELSE H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF ELSE * * Information is local * H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF H22 = A( ( ICOL1-1 )*LDA+IROW1 ) TST1 = CABS1( H11 ) + CABS1( H22 ) IF( TST1.EQ.ZERO ) THEN * * FIND SOME NORM OF THE LOCAL H(L:I,L:I) * CALL INFOG1L( L, HBL, NPROW, MYROW, 0, ITMP1, III ) IROW2 = NUMROC( I, HBL, MYROW, 0, NPROW ) CALL INFOG1L( L, HBL, NPCOL, MYCOL, 0, ITMP2, III ) ICOL2 = NUMROC( I, HBL, MYCOL, 0, NPCOL ) DO 30 III = ITMP1, IROW2 DO 20 JJJ = ITMP2, ICOL2 TST1 = TST1 + CABS1( A( ( JJJ-1 )*LDA+III ) ) 20 CONTINUE 30 CONTINUE END IF IF( CABS1( H10 ).LE.MAX( ULP*TST1, SMLNUM ) ) $ GO TO 50 IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 IF( ( MODKM1.EQ.HBL-1 ) .AND. ( K.GT.2 ) ) THEN II = MOD( II+NPROW-1, NPROW ) JJ = MOD( JJ+NPCOL-1, NPCOL ) CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ITMP1, ITMP2 ) END IF 40 CONTINUE 50 CONTINUE CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, K, 1, ITMP1, ITMP2, -1, $ -1, -1 ) RETURN * * End of PCLASMSUB * END scalapack-2.0.2/SRC/pclassq.f000644 000766 000024 00000024311 10363532303 016154 0ustar00juliestaff000000 000000 SUBROUTINE PCLASSQ( N, X, IX, JX, DESCX, INCX, SCALE, SUMSQ ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N REAL SCALE, SUMSQ * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX X( * ) * .. * * Purpose * ======= * * PCLASSQ returns the values scl and smsq such that * * ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, * * where x( i ) = sub( X ) = abs( X( IX+(JX-1)*DESCX(M_)+(i-1)*INCX ) ). * The value of sumsq is assumed to be at least unity and the value of * ssq will then satisfy * * 1.0 .le. ssq .le. ( sumsq + 2*n ). * * scale is assumed to be non-negative and scl returns the value * * scl = max( scale, abs( real( x( i ) ) ), abs( aimag( x( i ) ) ) ), * i * * scale and sumsq must be supplied in SCALE and SUMSQ respectively. * SCALE and SUMSQ are overwritten by scl and ssq respectively. * * The routine makes only one pass through the vector sub( X ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * The result are only available in the scope of sub( X ), i.e if * sub( X ) is distributed along a process row, the correct results are * only available in this process row of the grid. Similarly if sub( X ) * is distributed along a process column, the correct results are only * available in this process column of the grid. * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vector sub( X ). * * X (input) COMPLEX * The vector for which a scaled sum of squares is computed. * x( i ) = X(IX+(JX-1)*M_X +(i-1)*INCX ), 1 <= i <= n. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * SCALE (local input/local output) REAL * On entry, the value scale in the equation above. * On exit, SCALE is overwritten with scl , the scaling factor * for the sum of squares. * * SUMSQ (local input/local output) REAL * On entry, the value sumsq in the equation above. * On exit, SUMSQ is overwritten with smsq , the basic sum of * squares from which scl has been factored out. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, ICOFF, ICTXT, IIX, IOFF, IROFF, IXCOL, $ IXROW, JJX, LDX, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ REAL TEMP1 * .. * .. Local Arrays .. REAL WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSTREECOMB, SCOMBSSQ * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) * LDX = DESCX( LLD_ ) IF( INCX.EQ.DESCX( M_ ) ) THEN * * X is rowwise distributed. * IF( MYROW.NE.IXROW ) $ RETURN ICOFF = MOD( JX, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ - ICOFF * * Code direct from LAPACK's CLASSQ, (save subroutine call) * IF( NQ.GT.0 ) THEN IOFF = IIX + ( JJX - 1 ) * LDX DO 10 I = 1, NQ IF( REAL( X( IOFF ) ).NE.ZERO ) THEN TEMP1 = ABS( REAL( X( IOFF ) ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ * ( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IF( AIMAG( X( IOFF ) ).NE.ZERO ) THEN TEMP1 = ABS( AIMAG( X( IOFF ) ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ*( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IOFF = IOFF + LDX 10 CONTINUE END IF * * Take local result and find global * WORK( 1 ) = SCALE WORK( 2 ) = SUMSQ * CALL PSTREECOMB( ICTXT, 'Rowwise', 2, WORK, -1, IXCOL, $ SCOMBSSQ ) * SCALE = WORK( 1 ) SUMSQ = WORK( 2 ) * ELSE IF( INCX.EQ.1 ) THEN * * X is columnwise distributed. * IF( MYCOL.NE.IXCOL ) $ RETURN IROFF = MOD( IX, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFF * * Code direct from LAPACK's CLASSQ, (save subroutine call) * IF( NP.GT.0 ) THEN IOFF = IIX + ( JJX - 1 ) * LDX DO 20 I = 1, NP IF( REAL( X( IOFF ) ).NE.ZERO ) THEN TEMP1 = ABS( REAL( X( IOFF ) ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ*( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IF( AIMAG( X( IOFF ) ).NE.ZERO ) THEN TEMP1 = ABS( AIMAG( X( IOFF ) ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ*( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IOFF = IOFF + 1 20 CONTINUE END IF * * Take local result and find global * WORK( 1 ) = SCALE WORK( 2 ) = SUMSQ * CALL PSTREECOMB( ICTXT, 'Columnwise', 2, WORK, -1, IXCOL, $ SCOMBSSQ ) * SCALE = WORK( 1 ) SUMSQ = WORK( 2 ) * END IF * RETURN * * End of PCLASSQ * END scalapack-2.0.2/SRC/pclaswp.f000644 000766 000024 00000020337 10363532303 016163 0ustar00juliestaff000000 000000 SUBROUTINE PCLASWP( DIREC, ROWCOL, N, A, IA, JA, DESCA, K1, K2, $ IPIV ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIREC, ROWCOL INTEGER IA, JA, K1, K2, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX A( * ) * .. * * Purpose: * ======== * * PCLASWP performs a series of row or column interchanges on * the distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1). One * interchange is initiated for each of rows or columns K1 trough K2 of * sub( A ). This routine assumes that the pivoting information has * already been broadcast along the process row or column. * Also note that this routine will only work for K1-K2 being in the * same MB (or NB) block. If you want to pivot a full matrix, use * PCLAPIV. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIREC (global input) CHARACTER * Specifies in which order the permutation is applied: * = 'F' (Forward) * = 'B' (Backward) * * ROWCOL (global input) CHARACTER * Specifies if the rows or columns are permuted: * = 'R' (Rows) * = 'C' (Columns) * * N (global input) INTEGER * If ROWCOL = 'R', the length of the rows of the distributed * matrix A(*,JA:JA+N-1) to be permuted; * If ROWCOL = 'C', the length of the columns of the distributed * matrix A(IA:IA+N-1,*) to be permuted. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, * ). * On entry, this array contains the local pieces of the distri- * buted matrix to which the row/columns interchanges will be * applied. On exit the permuted distributed matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * K1 (global input) INTEGER * The first element of IPIV for which a row or column inter- * change will be done. * * K2 (global input) INTEGER * The last element of IPIV for which a row or column inter- * change will be done. * * IPIV (local input) INTEGER array, dimension LOCr(M_A)+MB_A for * row pivoting and LOCc(N_A)+NB_A for column pivoting. This * array is tied to the matrix A, IPIV(K) = L implies rows * (or columns) K and L are to be interchanged. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICURCOL, ICURROW, IIA, IP, J, JJA, JP, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PCSWAP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( ROWCOL, 'R' ) ) THEN IF( LSAME( DIREC, 'F' ) ) THEN CALL INFOG2L( K1, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 10 I = K1, K2 IP = IPIV( IIA+I-K1 ) IF( IP.NE.I ) $ CALL PCSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP, $ JA, DESCA, DESCA( M_ ) ) 10 CONTINUE ELSE CALL INFOG2L( K2, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 20 I = K2, K1, -1 IP = IPIV( IIA+I-K1 ) IF( IP.NE.I ) $ CALL PCSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP, $ JA, DESCA, DESCA( M_ ) ) 20 CONTINUE END IF ELSE IF( LSAME( DIREC, 'F' ) ) THEN CALL INFOG2L( IA, K1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 30 J = K1, K2 JP = IPIV( JJA+J-K1 ) IF( JP.NE.J ) $ CALL PCSWAP( N, A, IA, J, DESCA, 1, A, IA, JP, $ DESCA, 1 ) 30 CONTINUE ELSE CALL INFOG2L( IA, K2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 40 J = K2, K1, -1 JP = IPIV( JJA+J-K1 ) IF( JP.NE.J ) $ CALL PCSWAP( N, A, IA, J, DESCA, 1, A, IA, JP, $ DESCA, 1 ) 40 CONTINUE END IF END IF * RETURN * * End PCLASWP * END scalapack-2.0.2/SRC/pclatra.f000644 000766 000024 00000015445 10363532303 016144 0ustar00juliestaff000000 000000 COMPLEX FUNCTION PCLATRA( N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLATRA computes the trace of an N-by-N distributed matrix sub( A ) * denoting A( IA:IA+N-1, JA:JA+N-1 ). The result is left on every * process of the grid. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix the trace * is to be computed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ==================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER ICURCOL, ICURROW, II, IOFFA, J, JB, JJ, JN, $ LDA, LL, MYCOL, MYROW, NPCOL, NPROW COMPLEX TRACE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGSUM2D, INFOG2L * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * TRACE = ZERO IF( N.EQ.0 ) THEN PCLATRA = TRACE RETURN END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ ICURROW, ICURCOL ) * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 LDA = DESCA( LLD_ ) IOFFA = II + ( JJ - 1 ) * LDA * * Handle first diagonal block separately * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 10 LL = IOFFA, IOFFA + (JB-1)*(LDA+1), LDA+1 TRACE = TRACE + A( LL ) 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JB IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JB*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over the remaining block of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 20 LL = IOFFA, IOFFA + (JB-1)*(LDA+1), LDA+1 TRACE = TRACE + A( LL ) 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JB IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JB*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE * CALL CGSUM2D( DESCA( CTXT_ ), 'All', ' ', 1, 1, TRACE, 1, -1, $ MYCOL ) * PCLATRA = TRACE * RETURN * * End of PCLATRA * END scalapack-2.0.2/SRC/pclatrd.f000644 000766 000024 00000044222 10363532303 016142 0ustar00juliestaff000000 000000 SUBROUTINE PCLATRD( UPLO, N, NB, A, IA, JA, DESCA, D, E, TAU, W, $ IW, JW, DESCW, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IW, JA, JW, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCW( * ) REAL D( * ), E( * ) COMPLEX A( * ), TAU( * ), W( * ), WORK( * ) * .. * * Purpose * ======= * * PCLATRD reduces NB rows and columns of a complex Hermitian * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) to complex * tridiagonal form by an unitary similarity transformation * Q' * sub( A ) * Q, and returns the matrices V and W which are * needed to apply the transformation to the unreduced part of sub( A ). * * If UPLO = 'U', PCLATRD reduces the last NB rows and columns of a * matrix, of which the upper triangle is supplied; * if UPLO = 'L', PCLATRD reduces the first NB rows and columns of a * matrix, of which the lower triangle is supplied. * * This is an auxiliary routine called by PCHETRD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NB (global input) INTEGER * The number of rows and columns to be reduced. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. * On exit, if UPLO = 'U', the last NB columns have been reduced * to tridiagonal form, with the diagonal elements overwriting * the diagonal elements of sub( A ); the elements above the * diagonal with the array TAU, represent the unitary matrix Q * as a product of elementary reflectors. If UPLO = 'L', the * first NB columns have been reduced to tridiagonal form, with * the diagonal elements overwriting the diagonal elements of * sub( A ); the elements below the diagonal with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors; See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * W (local output) COMPLEX pointer into the local memory * to an array of dimension (LLD_W,NB_W), This array contains * the local pieces of the N-by-NB_W matrix W required to * update the unreduced part of sub( A ). * * IW (global input) INTEGER * The row index in the global array W indicating the first * row of sub( W ). * * JW (global input) INTEGER * The column index in the global array W indicating the * first column of sub( W ). * * DESCW (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix W. * * WORK (local workspace) COMPLEX array, dimension (NB_A) * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n) H(n-1) . . . H(n-nb+1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(nb). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The elements of the vectors v together form the N-by-NB matrix V * which is needed, with W, to apply the transformation to the unreduced * part of the matrix, using a Hermitian rank-2k update of the form: * sub( A ) := sub( A ) - V*W' - W*V'. * * The contents of A on exit are illustrated by the following examples * with n = 5 and nb = 2: * * if UPLO = 'U': if UPLO = 'L': * * ( a a a v4 v5 ) ( d ) * ( a a v4 v5 ) ( 1 d ) * ( a 1 v5 ) ( v1 1 a ) * ( d 1 ) ( v1 v2 a a ) * ( d ) ( v1 v2 a a a ) * * where d denotes a diagonal element of the reduced matrix, a denotes * an element of the original matrix that is unchanged, and vi denotes * an element of the vector defining H(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX HALF, ONE, ZERO PARAMETER ( HALF = ( 0.5E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, J, JJ, JP, JWK, K, $ KW, MYCOL, MYROW, NPCOL, NPROW, NQ COMPLEX AII, ALPHA, BETA * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCWK( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PCAXPY, $ PCDOTC, PCELGET, PCELSET, PCGEMV, $ PCHEMV, PCLACGV, PCLARFG, PCSCAL, $ PSELSET, SGEBR2D, SGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MIN, REAL * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NQ = MAX( 1, NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) CALL DESCSET( DESCD, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) AII = ZERO BETA = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * CALL INFOG2L( N+IA-NB, N+JA-NB, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, IAROW, IACOL ) CALL DESCSET( DESCWK, 1, DESCW( NB_ ), 1, DESCW( NB_ ), IAROW, $ IACOL, ICTXT, 1 ) CALL DESCSET( DESCE, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * * Reduce last NB columns of upper triangle * DO 10 J = JA+N-1, JA+N-NB, -1 I = IA + J - JA K = J - JA + 1 KW = MOD( K-1, DESCA( MB_ ) ) + 1 * * Update A(IA:I,I) * CALL PCELGET( 'E', ' ', AII, A, I, J, DESCA ) CALL PCELSET( A, I, J, DESCA, CMPLX( REAL( AII ) ) ) CALL PCLACGV( N-K, W, IW+K-1, JW+KW, DESCW, DESCW( M_ ) ) CALL PCGEMV( 'No transpose', K, N-K, -ONE, A, IA, J+1, $ DESCA, W, IW+K-1, JW+KW, DESCW, DESCW( M_ ), $ ONE, A, IA, J, DESCA, 1 ) CALL PCLACGV( N-K, W, IW+K-1, JW+KW, DESCW, DESCW( M_ ) ) CALL PCLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PCGEMV( 'No transpose', K, N-K, -ONE, W, IW, JW+KW, $ DESCW, A, I, J+1, DESCA, DESCA( M_ ), ONE, A, $ IA, J, DESCA, 1 ) CALL PCLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PCELGET( 'E', ' ', AII, A, I, J, DESCA ) CALL PCELSET( A, I, J, DESCA, CMPLX( REAL( AII ) ) ) IF( N-K.GT.0 ) $ CALL PCELSET( A, I, J+1, DESCA, CMPLX( E( JP ) ) ) * * Generate elementary reflector H(i) to annihilate * A(IA:I-2,I) * JP = MIN( JJ+KW-1, NQ ) CALL PCLARFG( K-1, BETA, I-1, J, A, IA, J, DESCA, 1, $ TAU ) CALL PSELSET( E, 1, J, DESCE, REAL( BETA ) ) CALL PCELSET( A, I-1, J, DESCA, ONE ) * * Compute W(IW:IW+K-2,JW+KW-1) * CALL PCHEMV( 'Upper', K-1, ONE, A, IA, JA, DESCA, A, IA, J, $ DESCA, 1, ZERO, W, IW, JW+KW-1, DESCW, 1 ) * JWK = MOD( K-1, DESCWK( NB_ ) ) + 2 CALL PCGEMV( 'Conjugate transpose', K-1, N-K, ONE, W, IW, $ JW+KW, DESCW, A, IA, J, DESCA, 1, ZERO, WORK, $ 1, JWK, DESCWK, DESCWK( M_ ) ) CALL PCGEMV( 'No transpose', K-1, N-K, -ONE, A, IA, J+1, $ DESCA, WORK, 1, JWK, DESCWK, DESCWK( M_ ), ONE, $ W, IW, JW+KW-1, DESCW, 1 ) CALL PCGEMV( 'Conjugate transpose', K-1, N-K, ONE, A, IA, $ J+1, DESCA, A, IA, J, DESCA, 1, ZERO, WORK, 1, $ JWK, DESCWK, DESCWK( M_ ) ) CALL PCGEMV( 'No transpose', K-1, N-K, -ONE, W, IW, JW+KW, $ DESCW, WORK, 1, JWK, DESCWK, DESCWK( M_ ), ONE, $ W, IW, JW+KW-1, DESCW, 1 ) CALL PCSCAL( K-1, TAU( JP ), W, IW, JW+KW-1, DESCW, 1 ) * CALL PCDOTC( K-1, ALPHA, W, IW, JW+KW-1, DESCW, 1, A, IA, J, $ DESCA, 1 ) IF( MYCOL.EQ.IACOL ) $ ALPHA = -HALF*TAU( JP )*ALPHA CALL PCAXPY( K-1, ALPHA, A, IA, J, DESCA, 1, W, IW, JW+KW-1, $ DESCW, 1 ) CALL PCELGET( 'E', ' ', BETA, A, I, J, DESCA ) CALL PSELSET( D, 1, J, DESCD, REAL( BETA ) ) * 10 CONTINUE * ELSE * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL DESCSET( DESCWK, 1, DESCW( NB_ ), 1, DESCW( NB_ ), IAROW, $ IACOL, ICTXT, 1 ) CALL DESCSET( DESCE, 1, JA+N-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * * Reduce first NB columns of lower triangle * DO 20 J = JA, JA+NB-1 I = IA + J - JA K = J - JA + 1 * * Update A(J:JA+N-1,J) * CALL PCELGET( 'E', ' ', AII, A, I, J, DESCA ) CALL PCELSET( A, I, J, DESCA, CMPLX( REAL( AII ) ) ) CALL PCLACGV( K-1, W, IW+K-1, JW, DESCW, DESCW( M_ ) ) CALL PCGEMV( 'No transpose', N-K+1, K-1, -ONE, A, I, JA, $ DESCA, W, IW+K-1, JW, DESCW, DESCW( M_ ), ONE, $ A, I, J, DESCA, 1 ) CALL PCLACGV( K-1, W, IW+K-1, JW, DESCW, DESCW( M_ ) ) CALL PCLACGV( K-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCGEMV( 'No transpose', N-K+1, K-1, -ONE, W, IW+K-1, $ JW, DESCW, A, I, JA, DESCA, DESCA( M_ ), ONE, $ A, I, J, DESCA, 1 ) CALL PCLACGV( K-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCELGET( 'E', ' ', AII, A, I, J, DESCA ) CALL PCELSET( A, I, J, DESCA, CMPLX( REAL( AII ) ) ) IF( K.GT.1 ) $ CALL PCELSET( A, I, J-1, DESCA, CMPLX( E( JP ) ) ) * * * Generate elementary reflector H(i) to annihilate * A(I+2:IA+N-1,I) * JP = MIN( JJ+K-1, NQ ) CALL PCLARFG( N-K, BETA, I+1, J, A, I+2, J, DESCA, 1, $ TAU ) CALL PSELSET( E, 1, J, DESCE, REAL( BETA ) ) CALL PCELSET( A, I+1, J, DESCA, ONE ) * * Compute W(IW+K:IW+N-1,JW+K-1) * CALL PCHEMV( 'Lower', N-K, ONE, A, I+1, J+1, DESCA, A, I+1, $ J, DESCA, 1, ZERO, W, IW+K, JW+K-1, DESCW, 1 ) * CALL PCGEMV( 'Conjugate Transpose', N-K, K-1, ONE, W, IW+K, $ JW, DESCW, A, I+1, J, DESCA, 1, ZERO, WORK, 1, $ 1, DESCWK, DESCWK( M_ ) ) CALL PCGEMV( 'No transpose', N-K, K-1, -ONE, A, I+1, JA, $ DESCA, WORK, 1, 1, DESCWK, DESCWK( M_ ), ONE, W, $ IW+K, JW+K-1, DESCW, 1 ) CALL PCGEMV( 'Conjugate transpose', N-K, K-1, ONE, A, I+1, $ JA, DESCA, A, I+1, J, DESCA, 1, ZERO, WORK, 1, $ 1, DESCWK, DESCWK( M_ ) ) CALL PCGEMV( 'No transpose', N-K, K-1, -ONE, W, IW+K, JW, $ DESCW, WORK, 1, 1, DESCWK, DESCWK( M_ ), ONE, W, $ IW+K, JW+K-1, DESCW, 1 ) CALL PCSCAL( N-K, TAU( JP ), W, IW+K, JW+K-1, DESCW, 1 ) CALL PCDOTC( N-K, ALPHA, W, IW+K, JW+K-1, DESCW, 1, A, I+1, $ J, DESCA, 1 ) IF( MYCOL.EQ.IACOL ) $ ALPHA = -HALF*TAU( JP )*ALPHA CALL PCAXPY( N-K, ALPHA, A, I+1, J, DESCA, 1, W, IW+K, $ JW+K-1, DESCW, 1 ) CALL PCELGET( 'E', ' ', BETA, A, I, J, DESCA ) CALL PSELSET( D, 1, J, DESCD, REAL( BETA ) ) * 20 CONTINUE * END IF * * Broadcast columnwise the diagonal elements into D. * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, NB, D( JJ ), 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, NB, D( JJ ), 1, $ IAROW, MYCOL ) END IF END IF * RETURN * * End of PCLATRD * END scalapack-2.0.2/SRC/pclatrs.f000644 000766 000024 00000005313 10363532303 016157 0ustar00juliestaff000000 000000 SUBROUTINE PCLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, IA, $ JA, DESCA, X, IX, JX, DESCX, SCALE, CNORM, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, NORMIN, TRANS, UPLO INTEGER IA, IX, JA, JX, N REAL SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) REAL CNORM( * ) COMPLEX A( * ), X( * ), WORK( * ) * .. * * Purpose * ======= * * PCLATRS solves a triangular system. This routine in unfinished * at this time, but will be part of the next release. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * * .. Local Scalars .. INTEGER ICTXT, IIX, IROFF, JJX, MYCOL, MYROW, NP, $ NPCOL, NPROW, LDX, IXCOL, IXROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, INFOG2L, $ PCTRSV * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * ***** NO SCALING ***** Call PCTRSV for all cases ***** * SCALE = ONE CALL PCTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, $ DESCX, 1 ) * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) LDX = DESCX( LLD_ ) IROFF = MOD( IX-1, DESCX(MB_) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IXCOL ) THEN CALL CGEBS2D( ICTXT, 'R', ' ', NP, 1, X( IIX+(JJX-1)*LDX ), $ LDX ) ELSE CALL CGEBR2D( ICTXT, 'R', ' ', NP, 1, X( IIX+(JJX-1)*LDX ), $ LDX, MYROW, IXCOL ) END IF * RETURN * * End of PCLATRS * END scalapack-2.0.2/SRC/pclatrz.f000644 000766 000024 00000022714 10363532303 016172 0ustar00juliestaff000000 000000 SUBROUTINE PCLATRZ( M, N, L, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER IA, JA, L, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCLATRZ reduces the M-by-N ( M<=N ) complex upper trapezoidal * matrix sub( A ) = [A(IA:IA+M-1,JA:JA+M-1) A(IA:IA+M-1,JA+N-L:JA+N-1)] * to upper triangular form by means of unitary transformations. * * The upper trapezoidal matrix sub( A ) is factored as * * sub( A ) = ( R 0 ) * Z, * * where Z is an N-by-N unitary matrix and R is an M-by-M upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. L > 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the leading M-by-M * upper triangular part of sub( A ) contains the upper trian- * gular matrix R, and elements N-L+1 to N of the first M rows * of sub( A ), with the array TAU, represent the unitary matrix * Z as a product of M elementary reflectors. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX array, dimension (LWORK) * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Further Details * =============== * * The factorization is obtained by Householder's method. The kth * transformation matrix, Z( k ), whose conjugate transpose is used to * introduce zeros into the (m - k + 1)th row of sub( A ), is given in * the form * * Z( k ) = ( I 0 ), * ( 0 T( k ) ) * * where * * T( k ) = I - tau*u( k )*u( k )', u( k ) = ( 1 ), * ( 0 ) * ( z( k ) ) * * tau is a scalar and z( k ) is an ( n - m ) element vector. * tau and z( k ) are chosen to annihilate the elements of the kth row * of sub( A ). * * The scalar tau is returned in the kth element of TAU and the vector * u( k ) in the kth row of sub( A ), such that the elements of z( k ) * are in a( k, m + 1 ), ..., a( k, n ). The elements of R are returned * in the upper triangular part of sub( A ). * * Z is given by * * Z = Z( 1 ) * Z( 2 ) * ... * Z( m ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IAROW, ICTXT, II, J, J1, MP, MYCOL, MYROW, $ NPCOL, NPROW COMPLEX AII * .. * .. Local Arrays .. INTEGER DESCTAU( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL DESCSET, INFOG1L, PCELSET, PCLACGV, $ PCLARFG, PCLARZ * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * MP = NUMROC( IA+M-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) * CALL DESCSET( DESCTAU, DESCA( M_ ), 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, ICTXT, MAX( 1, MP ) ) * IF( M.EQ.N ) THEN * CALL INFOG1L( IA, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ II, IAROW ) DO 10 I = II, MP TAU( I ) = ZERO 10 CONTINUE * ELSE * AII = ZERO * J1 = JA + N - L DO 20 I = IA+M-1, IA, -1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * [ A(i, j) A(i,j1:ja+n-1) ] * CALL PCLACGV( 1, A, I, J, DESCA, DESCA( M_ ) ) CALL PCLACGV( L, A, I, J1, DESCA, DESCA( M_ ) ) CALL PCLARFG( L+1, AII, I, J, A, I, J1, DESCA, DESCA( M_ ), $ TAU ) * * Apply H(i) to A(ia:i-1,j:ja+n-1) from the right * CALL PCLARZ( 'Right', I-IA, JA+N-J, L, A, I, J1, DESCA, $ DESCA( M_ ), TAU, A, IA, J, DESCA, WORK ) CALL PCELSET( A, I, J, DESCA, CONJG( AII ) ) * 20 CONTINUE * CALL PCLACGV( M, TAU, IA, 1, DESCTAU, 1 ) * END IF * RETURN * * End of PCLATRZ * END scalapack-2.0.2/SRC/pclattrs.f000644 000766 000024 00000130245 10363532303 016346 0ustar00juliestaff000000 000000 SUBROUTINE PCLATTRS( UPLO, TRANS, DIAG, NORMIN, N, A, IA, JA, $ DESCA, X, IX, JX, DESCX, SCALE, CNORM, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. CHARACTER DIAG, NORMIN, TRANS, UPLO INTEGER IA, INFO, IX, JA, JX, N REAL SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) REAL CNORM( * ) COMPLEX A( * ), X( * ) * .. * * Purpose * ======= * * PCLATTRS solves one of the triangular systems * * A * x = s*b, A**T * x = s*b, or A**H * x = s*b, * * with scaling to prevent overflow. Here A is an upper or lower * triangular matrix, A**T denotes the transpose of A, A**H denotes the * conjugate transpose of A, x and b are n-element vectors, and s is a * scaling factor, usually less than or equal to 1, chosen so that the * components of x will be less than the overflow threshold. If the * unscaled problem will not cause overflow, the Level 2 PBLAS routine * PCTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j) * then s is set to 0 and a non-trivial solution to A*x = 0 is returned. * * This is very slow relative to PCTRSV. This should only be used * when scaling is necessary to control overflow, or when it is modified * to scale better. * Notes * * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the matrix A is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * TRANS (global input) CHARACTER*1 * Specifies the operation applied to A. * = 'N': Solve A * x = s*b (No transpose) * = 'T': Solve A**T * x = s*b (Transpose) * = 'C': Solve A**H * x = s*b (Conjugate transpose) * * DIAG (global input) CHARACTER*1 * Specifies whether or not the matrix A is unit triangular. * = 'N': Non-unit triangular * = 'U': Unit triangular * * NORMIN (global input) CHARACTER*1 * Specifies whether CNORM has been set or not. * = 'Y': CNORM contains the column norms on entry * = 'N': CNORM is not set on entry. On exit, the norms will * be computed and stored in CNORM. * * N (global input) INTEGER * The order of the matrix A. N >= 0. * * A (local input) COMPLEX array, dimension (DESCA(LLD_),*) * The triangular matrix A. If UPLO = 'U', the leading n by n * upper triangular part of the array A contains the upper * triangular matrix, and the strictly lower triangular part of * A is not referenced. If UPLO = 'L', the leading n by n lower * triangular part of the array A contains the lower triangular * matrix, and the strictly upper triangular part of A is not * referenced. If DIAG = 'U', the diagonal elements of A are * also not referenced and are assumed to be 1. * * IA (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix A to operate on. * * JA (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix A to operate on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input/output) COMPLEX array, * dimension (DESCX(LLD_),*) * On entry, the right hand side b of the triangular system. * On exit, X is overwritten by the solution vector x. * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * SCALE (global output) REAL * The scaling factor s for the triangular system * A * x = s*b, A**T * x = s*b, or A**H * x = s*b. * If SCALE = 0, the matrix A is singular or badly scaled, and * the vector x is an exact or approximate solution to A*x = 0. * * CNORM (global input or global output) REAL array, * dimension (N) * If NORMIN = 'Y', CNORM is an input argument and CNORM(j) * contains the norm of the off-diagonal part of the j-th column * of A. If TRANS = 'N', CNORM(j) must be greater than or equal * to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) * must be greater than or equal to the 1-norm. * * If NORMIN = 'N', CNORM is an output argument and CNORM(j) * returns the 1-norm of the offdiagonal part of the j-th column * of A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * Further Details * ======= ======= * * A rough bound on x is computed; if that is less than overflow, PCTRSV * is called, otherwise, specific code is used which checks for possible * overflow or divide-by-zero at every operation. * * A columnwise scheme is used for solving A*x = b. The basic algorithm * if A is lower triangular is * * x[1:n] := b[1:n] * for j = 1, ..., n * x(j) := x(j) / A(j,j) * x[j+1:n] := x[j+1:n] - x(j) * A[j+1:n,j] * end * * Define bounds on the components of x after j iterations of the loop: * M(j) = bound on x[1:j] * G(j) = bound on x[j+1:n] * Initially, let M(0) = 0 and G(0) = max{x(i), i=1,...,n}. * * Then for iteration j+1 we have * M(j+1) <= G(j) / | A(j+1,j+1) | * G(j+1) <= G(j) + M(j+1) * | A[j+2:n,j+1] | * <= G(j) ( 1 + CNORM(j+1) / | A(j+1,j+1) | ) * * where CNORM(j+1) is greater than or equal to the infinity-norm of * column j+1 of A, not counting the diagonal. Hence * * G(j) <= G(0) product ( 1 + CNORM(i) / | A(i,i) | ) * 1<=i<=j * and * * |x(j)| <= ( G(0) / |A(j,j)| ) product ( 1 + CNORM(i) / |A(i,i)| ) * 1<=i< j * * Since |x(j)| <= M(j), we use the Level 2 PBLAS routine PCTRSV if the * reciprocal of the largest M(j), j=1,..,n, is larger than * max(underflow, 1/overflow). * * The bound on x(j) is also used to determine when a step in the * columnwise method can be performed without fear of overflow. If * the computed bound is greater than a large constant, x is scaled to * prevent overflow, but if the bound overflows, x is set to 0, x(j) to * 1, and scale to 0, and a non-trivial solution to A*x = 0 is found. * * Similarly, a row-wise scheme is used to solve A**T *x = b or * A**H *x = b. The basic algorithm for A upper triangular is * * for j = 1, ..., n * x(j) := ( b(j) - A[1:j-1,j]' * x[1:j-1] ) / A(j,j) * end * * We simultaneously compute two bounds * G(j) = bound on ( b(i) - A[1:i-1,i]' * x[1:i-1] ), 1<=i<=j * M(j) = bound on x(i), 1<=i<=j * * The initial values are G(0) = 0, M(0) = max{b(i), i=1,..,n}, and we * add the constraint G(j) >= G(j-1) and M(j) >= M(j-1) for j >= 1. * Then the bound on x(j) is * * M(j) <= M(j-1) * ( 1 + CNORM(j) ) / | A(j,j) | * * <= M(0) * product ( ( 1 + CNORM(i) ) / |A(i,i)| ) * 1<=i<=j * * and we can safely call PCTRSV if 1/M(n) and 1/G(n) are both greater * than max(underflow, 1/overflow). * * Last modified by: Mark R. Fahey, August 2000 * * ===================================================================== * * .. Parameters .. REAL ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, $ TWO = 2.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL NOTRAN, NOUNIT, UPPER INTEGER CONTXT, CSRC, I, ICOL, ICOLX, IMAX, IROW, $ IROWX, ITMP1, ITMP1X, ITMP2, ITMP2X, J, JFIRST, $ JINC, JLAST, LDA, LDX, MB, MYCOL, MYROW, NB, $ NPCOL, NPROW, RSRC REAL BIGNUM, GROW, REC, SMLNUM, TJJ, TMAX, TSCAL, $ XBND, XJ, XMAX COMPLEX CSUMJ, TJJS, USCAL, XJTMP, ZDUM * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX REAL PSLAMCH COMPLEX CLADIV EXTERNAL LSAME, ISAMAX, PSLAMCH, CLADIV * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGSUM2D, SSCAL, INFOG2L, $ PSCASUM, PSLABAD, PXERBLA, PCAMAX, PCAXPY, $ PCDOTC, PCDOTU, PCSSCAL, PCLASET, PCSCAL, $ PCTRSV, CGEBR2D, CGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, CMPLX, CONJG, AIMAG, MAX, MIN * .. * .. Statement Functions .. REAL CABS1, CABS2 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) CABS2( ZDUM ) = ABS( REAL( ZDUM ) / 2.E0 ) + $ ABS( AIMAG( ZDUM ) / 2.E0 ) * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) NOUNIT = LSAME( DIAG, 'N' ) * CONTXT = DESCA( CTXT_ ) RSRC = DESCA( RSRC_ ) CSRC = DESCA( CSRC_ ) MB = DESCA( MB_ ) NB = DESCA( NB_ ) LDA = DESCA( LLD_ ) LDX = DESCX( LLD_ ) * * Test the input parameters. * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. $ LSAME( NORMIN, 'N' ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -5 END IF * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( INFO.NE.0 ) THEN CALL PXERBLA( CONTXT, 'PCLATTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * SMLNUM = PSLAMCH( CONTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM CALL PSLABAD( CONTXT, SMLNUM, BIGNUM ) SMLNUM = SMLNUM / PSLAMCH( CONTXT, 'Precision' ) BIGNUM = ONE / SMLNUM SCALE = ONE * * IF( LSAME( NORMIN, 'N' ) ) THEN * * Compute the 1-norm of each column, not including the diagonal. * IF( UPPER ) THEN * * A is upper triangular. * CNORM( 1 ) = ZERO DO 10 J = 2, N CALL PSCASUM( J-1, CNORM( J ), A, IA, JA+J-1, DESCA, 1 ) 10 CONTINUE ELSE * * A is lower triangular. * DO 20 J = 1, N - 1 CALL PSCASUM( N-J, CNORM( J ), A, IA+J, JA+J-1, DESCA, $ 1 ) 20 CONTINUE CNORM( N ) = ZERO END IF CALL SGSUM2D( CONTXT, 'Row', ' ', N, 1, CNORM, 1, -1, -1 ) END IF * * Scale the column norms by TSCAL if the maximum element in CNORM is * greater than BIGNUM/2. * IMAX = ISAMAX( N, CNORM, 1 ) TMAX = CNORM( IMAX ) IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE TSCAL = HALF / ( SMLNUM*TMAX ) CALL SSCAL( N, TSCAL, CNORM, 1 ) END IF * * Compute a bound on the computed solution vector to see if the * Level 2 PBLAS routine PCTRSV can be used. * XMAX = ZERO CALL PCAMAX( N, ZDUM, IMAX, X, IX, JX, DESCX, 1 ) XMAX = CABS2( ZDUM ) CALL SGSUM2D( CONTXT, 'Row', ' ', 1, 1, XMAX, 1, -1, -1 ) XBND = XMAX * IF( NOTRAN ) THEN * * Compute the growth in A * x = b. * IF( UPPER ) THEN JFIRST = N JLAST = 1 JINC = -1 ELSE JFIRST = 1 JLAST = N JINC = 1 END IF * IF( TSCAL.NE.ONE ) THEN GROW = ZERO GO TO 50 END IF * IF( NOUNIT ) THEN * * A is non-unit triangular. * * Compute GROW = 1/G(j) and XBND = 1/M(j). * Initially, G(0) = max{x(i), i=1,...,n}. * GROW = HALF / MAX( XBND, SMLNUM ) XBND = GROW DO 30 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 50 * * TJJS = A( J, J ) CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN TJJS = A( ( ICOL-1 )*LDA+IROW ) CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF TJJ = CABS1( TJJS ) * IF( TJJ.GE.SMLNUM ) THEN * * M(j) = G(j-1) / abs(A(j,j)) * XBND = MIN( XBND, MIN( ONE, TJJ )*GROW ) ELSE * * M(j) could overflow, set XBND to 0. * XBND = ZERO END IF * IF( TJJ+CNORM( J ).GE.SMLNUM ) THEN * * G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) * GROW = GROW*( TJJ / ( TJJ+CNORM( J ) ) ) ELSE * * G(j) could overflow, set GROW to 0. * GROW = ZERO END IF 30 CONTINUE GROW = XBND ELSE * * A is unit triangular. * * Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. * GROW = MIN( ONE, HALF / MAX( XBND, SMLNUM ) ) DO 40 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 50 * * G(j) = G(j-1)*( 1 + CNORM(j) ) * GROW = GROW*( ONE / ( ONE+CNORM( J ) ) ) 40 CONTINUE END IF 50 CONTINUE * ELSE * * Compute the growth in A**T * x = b or A**H * x = b. * IF( UPPER ) THEN JFIRST = 1 JLAST = N JINC = 1 ELSE JFIRST = N JLAST = 1 JINC = -1 END IF * IF( TSCAL.NE.ONE ) THEN GROW = ZERO GO TO 80 END IF * IF( NOUNIT ) THEN * * A is non-unit triangular. * * Compute GROW = 1/G(j) and XBND = 1/M(j). * Initially, M(0) = max{x(i), i=1,...,n}. * GROW = HALF / MAX( XBND, SMLNUM ) XBND = GROW DO 60 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 80 * * G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) * XJ = ONE + CNORM( J ) GROW = MIN( GROW, XBND / XJ ) * * TJJS = A( J, J ) CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN TJJS = A( ( ICOL-1 )*LDA+IROW ) CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF TJJ = CABS1( TJJS ) * IF( TJJ.GE.SMLNUM ) THEN * * M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) * IF( XJ.GT.TJJ ) $ XBND = XBND*( TJJ / XJ ) ELSE * * M(j) could overflow, set XBND to 0. * XBND = ZERO END IF 60 CONTINUE GROW = MIN( GROW, XBND ) ELSE * * A is unit triangular. * * Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. * GROW = MIN( ONE, HALF / MAX( XBND, SMLNUM ) ) DO 70 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 80 * * G(j) = ( 1 + CNORM(j) )*G(j-1) * XJ = ONE + CNORM( J ) GROW = GROW / XJ 70 CONTINUE END IF 80 CONTINUE END IF * IF( ( GROW*TSCAL ).GT.SMLNUM ) THEN * * Use the Level 2 PBLAS solve if the reciprocal of the bound on * elements of X is not too small. * CALL PCTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, $ DESCX, 1 ) ELSE * * Use a Level 1 PBLAS solve, scaling intermediate results. * IF( XMAX.GT.BIGNUM*HALF ) THEN * * Scale X so that its components are less than or equal to * BIGNUM in absolute value. * SCALE = ( BIGNUM*HALF ) / XMAX CALL PCSSCAL( N, SCALE, X, IX, JX, DESCX, 1 ) XMAX = BIGNUM ELSE XMAX = XMAX*TWO END IF * IF( NOTRAN ) THEN * * Solve A * x = b * DO 100 J = JFIRST, JLAST, JINC * * Compute x(j) = b(j) / A(j,j), scaling x if necessary. * * XJ = CABS1( X( J ) ) CALL INFOG2L( IX+J-1, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, IROWX, ICOLX, ITMP1X, ITMP2X ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) THEN XJTMP = X( IROWX ) CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1, $ ITMP1X, ITMP2X ) END IF XJ = CABS1( XJTMP ) IF( NOUNIT ) THEN * TJJS = A( J, J )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN TJJS = A( ( ICOL-1 )*LDA+IROW )*TSCAL CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL IF( TSCAL.EQ.ONE ) $ GO TO 90 END IF TJJ = CABS1( TJJS ) IF( TJJ.GT.SMLNUM ) THEN * * abs(A(j,j)) > SMLNUM: * IF( TJJ.LT.ONE ) THEN IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by 1/b(j). * REC = ONE / XJ CALL PCSSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * X( J ) = CLADIV( X( J ), TJJS ) * XJ = CABS1( X( J ) ) XJTMP = CLADIV( XJTMP, TJJS ) XJ = CABS1( XJTMP ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF ELSE IF( TJJ.GT.ZERO ) THEN * * 0 < abs(A(j,j)) <= SMLNUM: * IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM * to avoid overflow when dividing by A(j,j). * REC = ( TJJ*BIGNUM ) / XJ IF( CNORM( J ).GT.ONE ) THEN * * Scale by 1/CNORM(j) to avoid overflow when * multiplying x(j) times column j. * REC = REC / CNORM( J ) END IF CALL PCSSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF * X( J ) = CLADIV( X( J ), TJJS ) * XJ = CABS1( X( J ) ) XJTMP = CLADIV( XJTMP, TJJS ) XJ = CABS1( XJTMP ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF ELSE * * A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and * scale = 0, and compute a solution to A*x = 0. * CALL PCLASET( ' ', N, 1, CZERO, CZERO, X, IX, JX, $ DESCX ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = CONE END IF XJTMP = CONE XJ = ONE SCALE = ZERO XMAX = ZERO END IF 90 CONTINUE * * Scale x if necessary to avoid overflow when adding a * multiple of column j of A. * IF( XJ.GT.ONE ) THEN REC = ONE / XJ IF( CNORM( J ).GT.( BIGNUM-XMAX )*REC ) THEN * * Scale x by 1/(2*abs(x(j))). * REC = REC*HALF CALL PCSSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC END IF ELSE IF( XJ*CNORM( J ).GT.( BIGNUM-XMAX ) ) THEN * * Scale x by 1/2. * CALL PCSSCAL( N, HALF, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*HALF SCALE = SCALE*HALF END IF * IF( UPPER ) THEN IF( J.GT.1 ) THEN * * Compute the update * x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) * ZDUM = -XJTMP*TSCAL CALL PCAXPY( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1, X, $ IX, JX, DESCX, 1 ) CALL PCAMAX( J-1, ZDUM, IMAX, X, IX, JX, DESCX, 1 ) XMAX = CABS1( ZDUM ) CALL SGSUM2D( CONTXT, 'Row', ' ', 1, 1, XMAX, 1, $ -1, -1 ) END IF ELSE IF( J.LT.N ) THEN * * Compute the update * x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) * ZDUM = -XJTMP*TSCAL CALL PCAXPY( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) CALL PCAMAX( N-J, ZDUM, I, X, IX+J, JX, DESCX, 1 ) XMAX = CABS1( ZDUM ) CALL SGSUM2D( CONTXT, 'Row', ' ', 1, 1, XMAX, 1, $ -1, -1 ) END IF END IF 100 CONTINUE * ELSE IF( LSAME( TRANS, 'T' ) ) THEN * * Solve A**T * x = b * DO 120 J = JFIRST, JLAST, JINC * * Compute x(j) = b(j) - sum A(k,j)*x(k). * k<>j * * XJ = CABS1( X( J ) ) CALL INFOG2L( IX+J-1, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, IROWX, ICOLX, ITMP1X, ITMP2X ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) THEN XJTMP = X( IROWX ) CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1, $ ITMP1X, ITMP2X ) END IF XJ = CABS1( XJTMP ) USCAL = CMPLX( TSCAL ) REC = ONE / MAX( XMAX, ONE ) IF( CNORM( J ).GT.( BIGNUM-XJ )*REC ) THEN * * If x(j) could overflow, scale x by 1/(2*XMAX). * REC = REC*HALF IF( NOUNIT ) THEN * TJJS = A( J, J )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, $ ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = A( ( ICOL-1 )*LDA+IROW )*TSCAL CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, $ 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL END IF TJJ = CABS1( TJJS ) IF( TJJ.GT.ONE ) THEN * * Divide by A(j,j) when scaling x if A(j,j) > 1. * REC = MIN( ONE, REC*TJJ ) USCAL = CLADIV( USCAL, TJJS ) END IF IF( REC.LT.ONE ) THEN CALL PCSSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * CSUMJ = CZERO IF( USCAL.EQ.CONE ) THEN * * If the scaling needed for A in the dot product is 1, * call PCDOTU to perform the dot product. * IF( UPPER ) THEN CALL PCDOTU( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ELSE IF( J.LT.N ) THEN CALL PCDOTU( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL CGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL CGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF ELSE * * Otherwise, scale column of A by USCAL before dot * product. Below is not the best way to do it. * IF( UPPER ) THEN * DO 130 I = 1, J - 1 * CSUMJ = CSUMJ + ( A( I, J )*USCAL )*X( I ) * 130 CONTINUE ZDUM = CONJG( USCAL ) CALL PCSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) CALL PCDOTU( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ZDUM = CLADIV( ZDUM, USCAL ) CALL PCSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) ELSE IF( J.LT.N ) THEN * DO 140 I = J + 1, N * CSUMJ = CSUMJ + ( A( I, J )*USCAL )*X( I ) * 140 CONTINUE ZDUM = CONJG( USCAL ) CALL PCSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) CALL PCDOTU( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) ZDUM = CLADIV( ZDUM, USCAL ) CALL PCSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL CGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL CGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF END IF * IF( USCAL.EQ.CMPLX( TSCAL ) ) THEN * * Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) * was not used to scale the dotproduct. * * X( J ) = X( J ) - CSUMJ * XJ = CABS1( X( J ) ) XJTMP = XJTMP - CSUMJ XJ = CABS1( XJTMP ) * IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) * $ X( IROWX ) = XJTMP IF( NOUNIT ) THEN * TJJS = A( J, J )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, $ ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = A( ( ICOL-1 )*LDA+IROW )*TSCAL CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, $ 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL IF( TSCAL.EQ.ONE ) $ GO TO 110 END IF * * Compute x(j) = x(j) / A(j,j), scaling if necessary. * TJJ = CABS1( TJJS ) IF( TJJ.GT.SMLNUM ) THEN * * abs(A(j,j)) > SMLNUM: * IF( TJJ.LT.ONE ) THEN IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale X by 1/abs(x(j)). * REC = ONE / XJ CALL PCSSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * X( J ) = CLADIV( X( J ), TJJS ) XJTMP = CLADIV( XJTMP, TJJS ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF ELSE IF( TJJ.GT.ZERO ) THEN * * 0 < abs(A(j,j)) <= SMLNUM: * IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. * REC = ( TJJ*BIGNUM ) / XJ CALL PCSSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF * X( J ) = CLADIV( X( J ), TJJS ) XJTMP = CLADIV( XJTMP, TJJS ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF ELSE * * A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and * scale = 0 and compute a solution to A**T *x = 0. * CALL PCLASET( ' ', N, 1, CZERO, CZERO, X, IX, JX, $ DESCX ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = CONE END IF XJTMP = CONE SCALE = ZERO XMAX = ZERO END IF 110 CONTINUE ELSE * * Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot * product has already been divided by 1/A(j,j). * * X( J ) = CLADIV( X( J ), TJJS ) - CSUMJ XJTMP = CLADIV( XJTMP, TJJS ) - CSUMJ IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF END IF XMAX = MAX( XMAX, CABS1( XJTMP ) ) 120 CONTINUE * ELSE * * Solve A**H * x = b * DO 140 J = JFIRST, JLAST, JINC * * Compute x(j) = b(j) - sum A(k,j)*x(k). * k<>j * CALL INFOG2L( IX+J-1, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, IROWX, ICOLX, ITMP1X, ITMP2X ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) THEN XJTMP = X( IROWX ) CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1, $ ITMP1X, ITMP2X ) END IF XJ = CABS1( XJTMP ) USCAL = TSCAL REC = ONE / MAX( XMAX, ONE ) IF( CNORM( J ).GT.( BIGNUM-XJ )*REC ) THEN * * If x(j) could overflow, scale x by 1/(2*XMAX). * REC = REC*HALF IF( NOUNIT ) THEN * TJJS = CONJG( A( J, J ) )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, $ ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = CONJG( A( ( ICOL-1 )*LDA+IROW ) )*TSCAL CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, $ 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL END IF TJJ = CABS1( TJJS ) IF( TJJ.GT.ONE ) THEN * * Divide by A(j,j) when scaling x if A(j,j) > 1. * REC = MIN( ONE, REC*TJJ ) USCAL = CLADIV( USCAL, TJJS ) END IF IF( REC.LT.ONE ) THEN CALL PCSSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * CSUMJ = CZERO IF( USCAL.EQ.CONE ) THEN * * If the scaling needed for A in the dot product is 1, * call PCDOTC to perform the dot product. * IF( UPPER ) THEN CALL PCDOTC( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ELSE IF( J.LT.N ) THEN CALL PCDOTC( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL CGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL CGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF ELSE * * Otherwise, scale column of A by USCAL before dot * product. Below is not the best way to do it. * IF( UPPER ) THEN * DO 180 I = 1, J - 1 * CSUMJ = CSUMJ + ( CONJG( A( I, J ) )*USCAL )* * $ X( I ) * 180 CONTINUE ZDUM = CONJG( USCAL ) CALL PCSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) CALL PCDOTC( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ZDUM = CLADIV( CONE, ZDUM ) CALL PCSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) ELSE IF( J.LT.N ) THEN * DO 190 I = J + 1, N * CSUMJ = CSUMJ + ( CONJG( A( I, J ) )*USCAL )* * $ X( I ) * 190 CONTINUE ZDUM = CONJG( USCAL ) CALL PCSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) CALL PCDOTC( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) ZDUM = CLADIV( CONE, ZDUM ) CALL PCSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL CGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL CGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF END IF * IF( USCAL.EQ.CMPLX( TSCAL ) ) THEN * * Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) * was not used to scale the dotproduct. * * X( J ) = X( J ) - CSUMJ * XJ = CABS1( X( J ) ) XJTMP = XJTMP - CSUMJ XJ = CABS1( XJTMP ) * IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) * $ X( IROWX ) = XJTMP IF( NOUNIT ) THEN * TJJS = CONJG( A( J, J ) )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, $ ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = CONJG( A( ( ICOL-1 )*LDA+IROW ) )*TSCAL CALL CGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, $ 1 ) ELSE CALL CGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL IF( TSCAL.EQ.ONE ) $ GO TO 130 END IF * * Compute x(j) = x(j) / A(j,j), scaling if necessary. * TJJ = CABS1( TJJS ) IF( TJJ.GT.SMLNUM ) THEN * * abs(A(j,j)) > SMLNUM: * IF( TJJ.LT.ONE ) THEN IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale X by 1/abs(x(j)). * REC = ONE / XJ CALL PCSSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * X( J ) = CLADIV( X( J ), TJJS ) XJTMP = CLADIV( XJTMP, TJJS ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = XJTMP ELSE IF( TJJ.GT.ZERO ) THEN * * 0 < abs(A(j,j)) <= SMLNUM: * IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. * REC = ( TJJ*BIGNUM ) / XJ CALL PCSSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF * X( J ) = CLADIV( X( J ), TJJS ) XJTMP = CLADIV( XJTMP, TJJS ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = XJTMP ELSE * * A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and * scale = 0 and compute a solution to A**H *x = 0. * CALL PCLASET( ' ', N, 1, CZERO, CZERO, X, IX, JX, $ DESCX ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = CONE XJTMP = CONE SCALE = ZERO XMAX = ZERO END IF 130 CONTINUE ELSE * * Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot * product has already been divided by 1/A(j,j). * * X( J ) = CLADIV( X( J ), TJJS ) - CSUMJ XJTMP = CLADIV( XJTMP, TJJS ) - CSUMJ IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = XJTMP END IF XMAX = MAX( XMAX, CABS1( XJTMP ) ) 140 CONTINUE END IF SCALE = SCALE / TSCAL END IF * * Scale the column norms by 1/TSCAL for return. * IF( TSCAL.NE.ONE ) THEN CALL SSCAL( N, ONE / TSCAL, CNORM, 1 ) END IF * RETURN * * End of PCLATTRS * END scalapack-2.0.2/SRC/pclauu2.f000644 000766 000024 00000020413 10363532303 016060 0ustar00juliestaff000000 000000 SUBROUTINE PCLAUU2( UPLO, N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLAUU2 computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in sub( A ). * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in sub( A ). * * This is the unblocked form of the algorithm, calling Level 2 BLAS. * No communication is performed by this routine, the matrix to operate * on should be strictly local to one process. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the triangular factor stored in the matrix * sub( A ) is upper or lower triangular: * = 'U': Upper triangular, * = 'L': Lower triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the order of the triangular factor U or L. N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor L or U. * On exit, if UPLO = 'U', the upper triangle of the distributed * matrix sub( A ) is overwritten with the upper triangle of the * product U * U'; if UPLO = 'L', the lower triangle of sub( A ) * is overwritten with the lower triangle of the product L' * L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICURR, IDIAG, IIA, IOFFA, JJA, $ LDA, MYCOL, MYROW, NA, NPCOL, NPROW REAL AII * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEMV, CLACGV, $ CSSCAL, INFOG2L * .. * .. External Functions .. LOGICAL LSAME COMPLEX CDOTC EXTERNAL CDOTC, LSAME * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, REAL * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Get grid parameters and compute local indexes * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * IF( LSAME( UPLO, 'U' ) ) THEN * * Compute the product U * U'. * DO 10 NA = N-1, 1, -1 AII = A( IDIAG ) ICURR = IDIAG + LDA A( IDIAG ) = AII*AII + REAL( CDOTC( NA, A( ICURR ), LDA, $ A( ICURR ), LDA ) ) CALL CLACGV( NA, A( ICURR ), LDA ) CALL CGEMV( 'No transpose', N-NA-1, NA, ONE, $ A( IOFFA+LDA ), LDA, A( ICURR ), LDA, $ CMPLX( AII ), A( IOFFA ), 1 ) CALL CLACGV( NA, A( ICURR ), LDA ) IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + LDA 10 CONTINUE AII = A( IDIAG ) CALL CSSCAL( N, AII, A( IOFFA ), 1 ) * ELSE * * Compute the product L' * L. * DO 20 NA = 1, N-1 AII = A( IDIAG ) ICURR = IDIAG + 1 A(IDIAG) = AII*AII + REAL( CDOTC( N-NA, A( ICURR ), 1, $ A( ICURR ), 1 ) ) CALL CLACGV( NA-1, A( IOFFA ), LDA ) CALL CGEMV( 'Conjugate transpose', N-NA, NA-1, ONE, $ A( IOFFA+1 ), LDA, A( ICURR ), 1, $ CMPLX( AII ), A( IOFFA ), LDA ) CALL CLACGV( NA-1, A( IOFFA ), LDA ) IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + 1 20 CONTINUE AII = A( IDIAG ) CALL CSSCAL( N, AII, A( IOFFA ), LDA ) * END IF * END IF * RETURN * * End of PCLAUU2 * END scalapack-2.0.2/SRC/pclauum.f000644 000766 000024 00000021172 10363532303 016156 0ustar00juliestaff000000 000000 SUBROUTINE PCLAUUM( UPLO, N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLAUUM computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in sub( A ). * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in sub( A ). * * This is the blocked form of the algorithm, calling Level 3 PBLAS. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the triangular factor stored in the * distributed matrix sub( A ) is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the triangular factor U or L. N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor L or U. * On exit, if UPLO = 'U', the upper triangle of the distributed * matrix sub( A ) is overwritten with the upper triangle of the * product U * U'; if UPLO = 'L', the lower triangle of sub( A ) * is overwritten with the lower triangle of the product L' * L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) COMPLEX CONE PARAMETER ( CONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J, JB, JN * .. * .. External Subroutines .. EXTERNAL PCGEMM, PCHERK, PCLAUU2, PCTRMM * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) IF( LSAME( UPLO, 'U' ) ) THEN * * Compute the product U * U'. * * Handle first block separately * JB = JN-JA+1 CALL PCLAUU2( 'Upper', JB, A, IA, JA, DESCA ) IF( JB.LE.N-1 ) THEN CALL PCHERK( 'Upper', 'No transpose', JB, N-JB, ONE, A, IA, $ JA+JB, DESCA, ONE, A, IA, JA, DESCA ) END IF * * Loop over remaining block of columns * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA CALL PCTRMM( 'Right', 'Upper', 'Conjugate transpose', $ 'Non-unit', J-JA, JB, CONE, A, I, J, DESCA, $ A, IA, J, DESCA ) CALL PCLAUU2( 'Upper', JB, A, I, J, DESCA ) IF( J+JB.LE.JA+N-1 ) THEN CALL PCGEMM( 'No transpose', 'Conjugate transpose', $ J-JA, JB, N-J-JB+JA, CONE, A, IA, J+JB, $ DESCA, A, I, J+JB, DESCA, CONE, A, IA, $ J, DESCA ) CALL PCHERK( 'Upper', 'No transpose', JB, N-J-JB+JA, ONE, $ A, I, J+JB, DESCA, ONE, A, I, J, DESCA ) END IF 10 CONTINUE ELSE * * Compute the product L' * L. * * Handle first block separately * JB = JN-JA+1 CALL PCLAUU2( 'Lower', JB, A, IA, JA, DESCA ) IF( JB.LE.N-1 ) THEN CALL PCHERK( 'Lower', 'Conjugate transpose', JB, N-JB, ONE, $ A, IA+JB, JA, DESCA, ONE, A, IA, JA, DESCA ) END IF * * Loop over remaining block of columns * DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA CALL PCTRMM( 'Left', 'Lower', 'Conjugate Transpose', $ 'Non-unit', JB, J-JA, CONE, A, I, J, DESCA, A, $ I, JA, DESCA ) CALL PCLAUU2( 'Lower', JB, A, I, J, DESCA ) IF( J+JB.LE.JA+N-1 ) THEN CALL PCGEMM( 'Conjugate transpose', 'No transpose', JB, $ J-JA, N-J-JB+JA, CONE, A, I+JB, J, DESCA, $ A, I+JB, JA, DESCA, CONE, A, I, JA, DESCA ) CALL PCHERK( 'Lower', 'Conjugate transpose', JB, $ N-J-JB+JA, ONE, A, I+JB, J, DESCA, ONE, $ A, I, J, DESCA ) END IF 20 CONTINUE END IF * RETURN * * End of PCLAUUM * END scalapack-2.0.2/SRC/pclawil.f000644 000766 000024 00000024172 10363532303 016146 0ustar00juliestaff000000 000000 SUBROUTINE PCLAWIL( II, JJ, M, A, DESCA, H44, H33, H43H34, V ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. INTEGER II, JJ, M COMPLEX H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), V( * ) * .. * * Purpose * ======= * * PCLAWIL gets the transform given by H44,H33, & H43H34 into V * starting at row M. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * II (global input) INTEGER * Row owner of H(M+2,M+2) * * JJ (global input) INTEGER * Column owner of H(M+2,M+2) * * M (global input) INTEGER * On entry, this is where the transform starts (row M.) * Unchanged on exit. * * A (global input) COMPLEX array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * Unchanged on exit. * * H44 * H33 * H43H34 (global input) COMPLEX * These three values are for the double shift QR iteration. * Unchanged on exit. * * V (global output) COMPLEX array of size 3. * Contains the transform on ouput. * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, ICOL, IROW, JSRC, LDA, LEFT, $ MODKM1, MYCOL, MYROW, NPCOL, NPROW, NUM, RIGHT, $ RSRC, UP REAL S COMPLEX CDUM, H11, H12, H21, H22, H33S, H44S, V1, V2, $ V3 * .. * .. Local Arrays .. COMPLEX BUF( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, CGERV2D, CGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, AIMAG, MOD * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * On node (II,JJ) collect all DIA,SUP,SUB info from M, M+1 * MODKM1 = MOD( M+1, HBL ) IF( MODKM1.EQ.0 ) THEN IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) CALL CGESD2D( CONTXT, 1, 1, BUF, 1, II, JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) BUF( 2 ) = A( ( ICOL-1 )*LDA+IROW+1 ) BUF( 3 ) = A( ICOL*LDA+IROW ) BUF( 4 ) = A( ICOL*LDA+IROW+1 ) CALL CGESD2D( CONTXT, 4, 1, BUF, 4, II, JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NPCOL.GT.1 ) THEN CALL CGERV2D( CONTXT, 1, 1, V3, 1, MYROW, LEFT ) ELSE V3 = A( ( ICOL-2 )*LDA+IROW ) END IF IF( NUM.GT.1 ) THEN CALL CGERV2D( CONTXT, 4, 1, BUF, 4, UP, LEFT ) H11 = BUF( 1 ) H21 = BUF( 2 ) H12 = BUF( 3 ) H22 = BUF( 4 ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) END IF END IF END IF IF( MODKM1.EQ.1 ) THEN IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) CALL CGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. ( NPROW.GT.1 ) ) $ THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL CGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL CGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NUM.GT.1 ) THEN CALL CGERV2D( CONTXT, 1, 1, H11, 1, UP, LEFT ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) END IF IF( NPROW.GT.1 ) THEN CALL CGERV2D( CONTXT, 1, 1, H12, 1, UP, MYCOL ) ELSE H12 = A( ( ICOL-2 )*LDA+IROW-2 ) END IF IF( NPCOL.GT.1 ) THEN CALL CGERV2D( CONTXT, 1, 1, H21, 1, MYROW, LEFT ) ELSE H21 = A( ( ICOL-3 )*LDA+IROW-1 ) END IF H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF END IF IF( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) $ RETURN * IF( MODKM1.GT.1 ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF * H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 * RETURN * * End of PCLAWIL * END scalapack-2.0.2/SRC/pcmax1.f000644 000766 000024 00000033307 10363532303 015704 0ustar00juliestaff000000 000000 SUBROUTINE PCMAX1( N, AMAX, INDX, X, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER INDX, INCX, IX, JX, N COMPLEX AMAX * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX X( * ) * .. * * Purpose * ======= * * PCMAX1 computes the global index of the maximum element in absolute * value of a distributed vector sub( X ). The global index is returned * in INDX and the value is returned in AMAX, * * where sub( X ) denotes X(IX:IX+N-1,JX) if INCX = 1, * X(IX,JX:JX+N-1) if INCX = M_X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * When the result of a vector-oriented PBLAS call is a scalar, it will * be made available only within the scope which owns the vector(s) * being operated on. Let X be a generic term for the input vector(s). * Then, the processes which receive the answer will be (note that if * an operation involves more than one vector, the processes which re- * ceive the result will be the union of the following calculation for * each vector): * * If N = 1, M_X = 1 and INCX = 1, then one can't determine if a process * row or process column owns the vector operand, therefore only the * process of coordinate {RSRC_X, CSRC_X} receives the result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process part of this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process part of this column receives the result; * * Based on PCAMAX from Level 1 PBLAS. The change is to use the * 'genuine' absolute value. * * The serial version was contributed to LAPACK by Nick Higham for use * with CLACON. * * Arguments * ========= * * N (global input) pointer to INTEGER * The number of components of the distributed vector sub( X ). * N >= 0. * * AMAX (global output) pointer to REAL * The absolute value of the largest entry of the distributed * vector sub( X ) only in the scope of sub( X ). * * INDX (global output) pointer to INTEGER * The global index of the element of the distributed vector * sub( X ) whose real part has maximum absolute value. * * X (local input) COMPLEX array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. CHARACTER CBTOP, CCTOP, RBTOP, RCTOP INTEGER ICOFF, ICTXT, IDUMM, IIX, IROFF, IXCOL, IXROW, $ JJX, LCINDX, LDX, MAXPOS, MYCOL, MYROW, NP, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. COMPLEX WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CCOMBAMAX1, CGAMX2D, $ IGEBR2D, IGEBS2D, INFOG2L, PCTREECOMB, $ PB_TOPGET * .. * .. External Functions .. LOGICAL LSAME INTEGER ICMAX1, INDXL2G, NUMROC EXTERNAL ICMAX1, INDXL2G, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, MOD, NINT, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible. * INDX = 0 AMAX = ZERO IF( N.LE.0 ) $ RETURN * * Retrieve local information for vector X. * LDX = DESCX( LLD_ ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) * IF( INCX.EQ.1 .AND. DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN INDX = JX AMAX = X( IIX+(JJX-1)*LDX ) RETURN END IF * * Find the maximum value and its index * IF( INCX.EQ.DESCX( M_ ) ) THEN * IF( MYROW.EQ.IXROW ) THEN * ICOFF = MOD( JX-1, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ-ICOFF * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', RBTOP ) * IF( LSAME( RBTOP, ' ' ) ) THEN * IF( NQ.GT.0 ) THEN LCINDX = JJX-1+ICMAX1( NQ, X( IIX+(JJX-1)*LDX ), LDX ) WORK( 1 ) = X( IIX+(LCINDX-1)*LDX ) WORK( 2 ) = CMPLX( REAL( INDXL2G( LCINDX, $ DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), NPCOL ) ) ) ELSE WORK( 1 ) = ZERO WORK( 2 ) = ZERO END IF * CALL PCTREECOMB( ICTXT, 'Row', 2, WORK, -1, MYCOL, $ CCOMBAMAX1 ) * AMAX = WORK( 1 ) IF( AMAX.EQ.ZERO ) THEN INDX = JX ELSE INDX = NINT( REAL( WORK( 2 ) ) ) END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', RCTOP ) * IF( NQ.GT.0 ) THEN LCINDX = JJX-1+ICMAX1( NQ, X( IIX+(JJX-1)*LDX ), LDX ) AMAX = X( IIX + (LCINDX-1)*LDX ) ELSE AMAX = ZERO END IF * * Find the maximum value * CALL CGAMX2D( ICTXT, 'Rowwise', RCTOP, 1, 1, AMAX, 1, $ IDUMM, MAXPOS, 1, -1, MYROW ) * IF( AMAX.NE.ZERO ) THEN * * Broadcast corresponding global index * IF( MYCOL.EQ.MAXPOS ) THEN INDX = INDXL2G( LCINDX, DESCX( NB_ ), MYCOL, $ DESCX( CSRC_ ), NPCOL ) CALL IGEBS2D( ICTXT, 'Rowwise', RBTOP, 1, 1, INDX, $ 1 ) ELSE CALL IGEBR2D( ICTXT, 'Rowwise', RBTOP, 1, 1, INDX, $ 1, MYROW, MAXPOS ) END IF * ELSE * INDX = JX * END IF * END IF * END IF * ELSE * IF( MYCOL.EQ.IXCOL ) THEN * IROFF = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP-IROFF * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) * IF( LSAME( CBTOP, ' ' ) ) THEN * IF( NP.GT.0 ) THEN LCINDX = IIX-1+ICMAX1( NP, X( IIX+(JJX-1)*LDX ), 1 ) WORK( 1 ) = X( LCINDX + (JJX-1)*LDX ) WORK( 2 ) = CMPLX( REAL( INDXL2G( LCINDX, $ DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) ) ) ELSE WORK( 1 ) = ZERO WORK( 2 ) = ZERO END IF * CALL PCTREECOMB( ICTXT, 'Column', 2, WORK, -1, MYCOL, $ CCOMBAMAX1 ) * AMAX = WORK( 1 ) IF( AMAX.EQ.ZERO ) THEN INDX = IX ELSE INDX = NINT( REAL( WORK( 2 ) ) ) END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', CCTOP ) * IF( NP.GT.0 ) THEN LCINDX = IIX-1+ICMAX1( NP, X( IIX+(JJX-1)*LDX ), 1 ) AMAX = X( LCINDX + (JJX-1)*LDX ) ELSE AMAX = ZERO END IF * * Find the maximum value * CALL CGAMX2D( ICTXT, 'Columnwise', CCTOP, 1, 1, AMAX, 1, $ MAXPOS, IDUMM, 1, -1, MYCOL ) * IF( AMAX.NE.ZERO ) THEN * * Broadcast corresponding global index * IF( MYROW.EQ.MAXPOS ) THEN INDX = INDXL2G( LCINDX, DESCX( MB_ ), MYROW, $ DESCX( RSRC_ ), NPROW ) CALL IGEBS2D( ICTXT, 'Columnwise', CBTOP, 1, 1, $ INDX, 1 ) ELSE CALL IGEBR2D( ICTXT, 'Columnwise', CBTOP, 1, 1, $ INDX, 1, MAXPOS, MYCOL ) END IF * ELSE * INDX = IX * END IF * END IF * END IF * END IF * RETURN * * End of PCMAX1 * END * SUBROUTINE CCOMBAMAX1 ( V1, V2 ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. COMPLEX V1( 2 ), V2( 2 ) * .. * * Purpose * ======= * * CCOMBAMAX1 finds the element having maximum real part absolute * value as well as its corresponding globl index. * * Arguments * ========= * * V1 (local input/local output) COMPLEX array of * dimension 2. The first maximum absolute value element and * its global index. V1(1) = AMAX, V1(2) = INDX. * * V2 (local input) COMPLEX array of dimension 2. * The second maximum absolute value element and its global * index. V2(1) = AMAX, V2(2) = INDX. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, REAL * .. * .. Executable Statements .. * IF( ABS( REAL( V1( 1 ) ) ).LT.ABS( REAL( V2( 1 ) ) ) ) THEN V1( 1 ) = V2( 1 ) V1( 2 ) = V2( 2 ) END IF * RETURN * * End of CCOMBAMAX1 * END scalapack-2.0.2/SRC/pcpbsv.f000644 000766 000024 00000045274 10363532303 016016 0ustar00juliestaff000000 000000 SUBROUTINE PCPBSV( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PCPBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * banded symmetric positive definite distributed * matrix with bandwidth BW. * * Cholesky factorization is used to factor a reordering of * the matrix into L L'. * * See PCPBTRF and PCPBTRS for details. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (NB+2*bw)*bw * +max((bw*NRHS), bw*bw) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PCPBTRF, PCPBTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PCPBTRF and PCPBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PCPBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (NB+2*BW)*BW * * Factor the matrix * CALL PCPBTRF( UPLO, N, BW, A, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PCPBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PCPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPBSV', -INFO ) RETURN END IF * RETURN * * End of PCPBSV * END scalapack-2.0.2/SRC/pcpbtrf.f000644 000766 000024 00000141637 11750130340 016155 0ustar00juliestaff000000 000000 SUBROUTINE PCPBTRF( UPLO, N, BW, A, JA, DESCA, AF, LAF, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), AF( * ), WORK( * ) * .. * * * Purpose * ======= * * PCPBTRF computes a Cholesky factorization * of an N-by-N complex banded * symmetric positive definite distributed matrix * with bandwidth BW: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PCPBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = U' U , if UPLO = 'U', or * * P A(1:N, JA:JA+N-1) P^T = L L', if UPLO = 'L' * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCPBTRF and this is stored in AF. If a linear system * is to be solved using PCPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * bw*bw * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM1, IDUM3, JA_NEW, $ LAF_MIN, LEVEL_DIST, LLDA, MBW2, MYCOL, MYROW, $ MY_NUM_COLS, NB, NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, OFST, PART_OFFSET, PART_SIZE, $ PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE, $ STORE_N_A, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ CAXPY, CGEMM, CGERV2D, CGESD2D, CLAMOV, $ CLATCPY, CPBTRF, CPOTRF, CSYRK, CTBTRS, CTRMM, $ CTRRV2D, CTRSD2D, CTRSM, CTRTRS, DESC_CONVERT, $ GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * * Pre-calculate bw^2 * MBW2 = BW * BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -10 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 6*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PCPBTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PCPBTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = (NB+2*BW)*BW * IF( LAF .LT. LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PCPBTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = BW*BW * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, $ 'PCPBTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 10 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCPBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW ENDIF * * * Zero out space for fillin * DO 10 I=1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * * Zero out space for work * DO 20 I=1, WORK_SIZE_MIN WORK( I ) = CZERO 20 CONTINUE * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN PREV_TRI_SIZE_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) PREV_TRI_SIZE_N=MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN NEXT_TRI_SIZE_M=MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) NEXT_TRI_SIZE_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF ( MYCOL .LT. NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL CTRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, A( OFST+ODD_SIZE*LLDA+(BW+1) ), $ LLDA-1, 0, MYCOL+1 ) * ENDIF * * * Factor main partition A_i = L_i {L_i}^C in each processor * CALL CPBTRF( UPLO, ODD_SIZE, BW, A( OFST + 1), $ LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1500 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * conjugate transpose the connection block in preparation. * CALL CLATCPY( 'U', BW, BW, $ A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW ) * * Perform the triangular system solve {L_i}{{B'}_i}^C = {B_i}^C * CALL CTRTRS( 'L', 'N', 'N', BW, BW, $ A( OFST+1+(ODD_SIZE-BW)*LLDA ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO ) * * * conjugate transpose resulting block to its location * in main storage. * CALL CLATCPY( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), $ BW, A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), $ LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^C * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL CHERK( UPLO, 'C', BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE, $ A( OFST+1+ODD_SIZE*LLDA ), LLDA-1 ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1500 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bw)*bw+1 ) and use for * off_diagonal block of reduced system. * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL CTRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M, $ PREV_TRI_SIZE_N, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF (INFO.EQ.0) THEN * * Calculate the "spike" fillin, ${L_i} {{G}_i}^C = {D_i}$ . * CALL CTBTRS( 'L', 'N', 'N', ODD_SIZE, BW, BW, A( OFST + 1 ), $ LLDA, AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = G_i{G_i}^C * CALL CHERK( 'L', 'C', BW, ODD_SIZE, $ -ONE, AF( 1 ), ODD_SIZE, ZERO, $ AF( 1 + (ODD_SIZE+2*BW)*BW), BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL CGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ 0, MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * {F_i}^C = {H_i}{{B'}_i}^C * * Copy matrix H_i (the last bw cols of G_i) to AF storage * as per requirements of BLAS routine CTRMM. * Since we have G_i^C stored, conjugate transpose * H_i^C to H_i. * CALL CLATCPY( 'N', BW, BW, $ AF( ODD_SIZE-BW+1 ), ODD_SIZE, $ AF( (ODD_SIZE)*BW+1), BW ) * CALL CTRMM( 'R', 'U', 'C', 'N', BW, BW, -CONE, $ A( ( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA ) ), LLDA-1, $ AF( (ODD_SIZE)*BW+1 ), BW ) * * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL CGESD2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+1 ), $ BW, 0, MYCOL-1 ) * ENDIF * * Copy last diagonal block into AF storage for subsequent * operations. * CALL CLAMOV( 'N', BW, BW, $ A( OFST+ODD_SIZE*LLDA+1 ), $ LLDA-1, AF( ODD_SIZE*BW+MBW2+1 ), $ BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * CALL CAXPY( MBW2, CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ 1, AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL CGERV2D( ICTXT, BW, BW, WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * CALL CAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL CGERV2D( ICTXT, BW, BW, WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * CALL CAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL CPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL CLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL CGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+1 ), $ BW, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL CTRSM( 'L', 'L', 'N', 'N', BW, BW, CONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL CHERK( 'L', 'C', BW, BW, -ONE, $ AF( (ODD_SIZE)*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL CGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * CALL CTRSM( 'R', 'L', 'C', 'N', BW, BW, CONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL CHERK( 'L', 'N', BW, BW, -ONE, $ AF( (ODD_SIZE+2*BW)*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL CGEMM( 'N', 'N', BW, BW, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW, CZERO, WORK( 1 ), $ BW ) * * Send contribution to offdiagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * * CASE UPLO = 'U' * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN PREV_TRI_SIZE_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) PREV_TRI_SIZE_N=MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN NEXT_TRI_SIZE_M=MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) NEXT_TRI_SIZE_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * * * Factor main partition A_i^C = U_i {U_i}^C in each processor * CALL CPBTRF( UPLO, ODD_SIZE, BW, A( OFST + 1), $ LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1600 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * Move the connection block in preparation. * CALL CLAMOV( 'L', BW, BW, A( ( OFST+1+ODD_SIZE*LLDA ) ), $ LLDA-1, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW ) * * * Perform the triangular solve {L_i}{{B'}_i}^C = {B_i}^C * CALL CTRTRS( 'U', 'C', 'N', BW, BW, $ A( OFST+BW+1+(ODD_SIZE-BW)*LLDA ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO ) * * Move the resulting block back to its location in main storage. * CALL CLAMOV( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), $ BW, A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i^C = {C_i}^C-{{B'}_i}^C{{B'}_i} * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL CHERK( UPLO, 'C', BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE, $ A( OFST+BW+1+ODD_SIZE*LLDA ), LLDA-1 ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1600 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bw)*bw+1 ) and use for * off_diagonal block of reduced system. * * Calculate the "spike" fillin, ${L_i} {{G}_i}^C = {D_i}$ . * * * Copy D block into AF storage for solve. * CALL CLATCPY( 'L', PREV_TRI_SIZE_N, PREV_TRI_SIZE_M, $ A( OFST+1 ), LLDA-1, AF( 1 ), ODD_SIZE ) * IF ( INFO.EQ.0 ) THEN * CALL CTBTRS( 'U', 'C', 'N', ODD_SIZE, BW, BW, $ A( OFST + 1 ), LLDA, $ AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = G_i{G_i}^C * CALL CHERK( 'L', 'C', BW, ODD_SIZE, $ -ONE, AF( 1 ), ODD_SIZE, ZERO, $ AF( 1 + (ODD_SIZE+2*BW)*BW), BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL CGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ 0, MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * {F_i}^C = {H_i}{{B'}_i}^C * * Copy matrix H_i (the last bw cols of G_i) to AF storage * as per requirements of BLAS routine CTRMM. * Since we have G_i^C stored, conjugate transpose * H_i^C to H_i. * CALL CLATCPY( 'N', BW, BW, $ AF( ODD_SIZE-BW+1 ), ODD_SIZE, $ AF( (ODD_SIZE)*BW+1), BW ) * CALL CTRMM( 'R', 'L', 'N', 'N', BW, BW, -CONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ AF( (ODD_SIZE)*BW+1 ), BW ) * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL CGESD2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+1 ), $ BW, 0, MYCOL-1 ) * ENDIF * * Transpose last diagonal block into AF storage for subsequent * operations. * CALL CLATCPY( 'U', BW, BW, $ A( OFST+ ODD_SIZE*LLDA+1+BW ), $ LLDA-1, AF( ODD_SIZE*BW+MBW2+1 ), $ BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * CALL CAXPY( MBW2, CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ 1, AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 22 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 21 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL CGERV2D( ICTXT, BW, BW, WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * CALL CAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL CGERV2D( ICTXT, BW, BW, WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * CALL CAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 22 21 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL CPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL CLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL CGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+1 ), $ BW, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL CTRSM( 'L', 'L', 'N', 'N', BW, BW, CONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL CHERK( 'L', 'C', BW, BW, -ONE, $ AF( (ODD_SIZE)*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL CGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * CALL CTRSM( 'R', 'L', 'C', 'N', BW, BW, CONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL CHERK( 'L', 'N', BW, BW, -ONE, $ AF( (ODD_SIZE+2*BW)*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL CGEMM( 'N', 'N', BW, BW, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW, CZERO, WORK( 1 ), $ BW ) * * Send contribution to offdiagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 24 CONTINUE * ENDIF * 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PCPBTRF * END scalapack-2.0.2/SRC/pcpbtrs.f000644 000766 000024 00000064061 10363532303 016171 0ustar00juliestaff000000 000000 SUBROUTINE PCPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, $ AF, LAF, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 7, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PCPBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PCPBTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * banded symmetric positive definite distributed * matrix with bandwidth BW. * Depending on the value of UPLO, A stores either U or L in the equn * A(1:N, JA:JA+N-1) = U'*U or L*L' as computed by PCPBTRF. * * Routine PCPBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCPBTRF and this is stored in AF. If a linear system * is to be solved using PCPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (bw*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NP_SAVE, PART_OFFSET, $ RETURN_CODE, STORE_M_B, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, $ PCPBTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 7*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 10*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 10*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 10*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 10*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -14 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 7*100 + 4 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 10*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 10*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -3 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -6 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 7*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PCPBTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, $ 'PCPBTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ (BW*NRHS) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, $ 'PCPBTRS: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1005 PARAM_CHECK( 15, 2 ) = 1004 PARAM_CHECK( 14, 2 ) = 1003 PARAM_CHECK( 13, 2 ) = 1002 PARAM_CHECK( 12, 2 ) = 1001 PARAM_CHECK( 11, 2 ) = 9 PARAM_CHECK( 10, 2 ) = 705 PARAM_CHECK( 9, 2 ) = 704 PARAM_CHECK( 8, 2 ) = 703 PARAM_CHECK( 7, 2 ) = 701 PARAM_CHECK( 6, 2 ) = 6 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCPBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PCPBTRSV( 'L', 'N', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PCPBTRSV( 'U', 'C', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ENDIF * * Call backsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PCPBTRSV( 'L', 'C', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PCPBTRSV( 'U', 'N', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ENDIF 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PCPBTRS * END scalapack-2.0.2/SRC/pcpbtrsv.f000644 000766 000024 00000141644 11750130340 016356 0ustar00juliestaff000000 000000 SUBROUTINE PCPBTRSV( UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B, $ IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PCPBTRSV solves a banded triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a banded * triangular matrix factor produced by the * Cholesky factorization code PCPBTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H * is dictated by the user by the parameter TRANS. * * Routine PCPBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCPBTRF and this is stored in AF. If a linear system * is to be solved using PCPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (bw*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MBW2, MYCOL, MYROW, MY_NUM_COLS, NB, NP, $ NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ CGEMM, CGERV2D, CGESD2D, CLAMOV, CMATADD, $ CTBTRS, CTRMM, CTRTRS, DESC_CONVERT, GLOBCHK, $ PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * * Pre-calculate bw^2 * MBW2 = BW * BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -2 END IF * IF( LWORK .LT. -1) THEN INFO = -14 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -3 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -5 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PCPBTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PCPBTRSV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ BW*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, $ 'PCPBTRSV: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BW PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 14 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCPBTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW ENDIF * * * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * IF ( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL CTBTRS( UPLO, 'N', 'N', ODD_SIZE, $ BW, NRHS, $ A( OFST+1 ), LLDA, $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL CLAMOV( 'N', BW, NRHS, $ B( PART_OFFSET+ODD_SIZE-BW+1), LLDB, $ WORK( 1 ), BW ) * CALL CTRMM( 'L', 'U', 'N', 'N', BW, NRHS, -CONE, $ A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), LLDA-1, $ WORK( 1 ), BW ) * CALL CMATADD( BW, NRHS, CONE, WORK( 1 ), BW, $ CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL CGEMM( 'C', 'N', BW, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+BW-BW ), BW ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL CMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * CALL CMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * CALL CMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL CTRTRS( 'L', 'N', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'C', 'N', BW, NRHS, BW, -CONE, $ AF( (ODD_SIZE)*BW+1 ), $ BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL CGEMM( 'N', 'N', BW, NRHS, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 27 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 26 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 27 26 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL CGEMM( 'C', 'N', BW, NRHS, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, $ WORK( 1 ), $ BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'N', 'N', BW, NRHS, BW, -CONE, $ AF( (ODD_SIZE)*BW+1 ), $ BW, $ WORK( 1 ), $ BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL CTRTRS( 'L', 'C', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 22 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 21 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 22 21 CONTINUE * [End of GOTO Loop] * 24 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL CGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+BW-BW ), BW, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL CLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB, $ WORK( 1+BW-BW ), BW ) * CALL CTRMM( 'L', 'U', 'C', 'N', BW, NRHS, -CONE, $ A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), LLDA-1, $ WORK( 1+BW-BW ), BW ) * CALL CMATADD( BW, NRHS, CONE, WORK( 1+BW-BW ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL CTBTRS( UPLO, 'C', 'N', ODD_SIZE, $ BW, NRHS, $ A( OFST+1 ), $ LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF ( LSAME( TRANS, 'C' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL CTBTRS( UPLO, 'C', 'N', ODD_SIZE, $ BW, NRHS, $ A( OFST+1 ), LLDA, $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL CLAMOV( 'N', BW, NRHS, $ B( PART_OFFSET+ODD_SIZE-BW+1), LLDB, $ WORK( 1 ), BW ) * CALL CTRMM( 'L', 'L', 'C', 'N', BW, NRHS, -CONE, $ A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1, WORK( 1 ), $ BW ) * CALL CMATADD( BW, NRHS, CONE, WORK( 1 ), BW, $ CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL CGEMM( 'C', 'N', BW, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+BW-BW ), BW ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL CMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 44 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 42 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 41 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * CALL CMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * CALL CMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 42 41 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL CTRTRS( 'L', 'N', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'C', 'N', BW, NRHS, BW, -CONE, $ AF( (ODD_SIZE)*BW+1 ), $ BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL CGEMM( 'N', 'N', BW, NRHS, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ BW ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 44 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 54 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 57 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 56 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 57 56 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL CGEMM( 'C', 'N', BW, NRHS, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, $ WORK( 1 ), $ BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'N', 'N', BW, NRHS, BW, -CONE, $ AF( (ODD_SIZE)*BW+1 ), $ BW, $ WORK( 1 ), $ BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL CTRTRS( 'L', 'C', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 52 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 51 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 52 51 CONTINUE * [End of GOTO Loop] * 54 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL CGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+BW-BW ), BW, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL CLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB, $ WORK( 1+BW-BW ), BW ) * CALL CTRMM( 'L', 'L', 'N', 'N', BW, NRHS, -CONE, $ A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1, $ WORK( 1+BW-BW ), BW ) * CALL CMATADD( BW, NRHS, CONE, WORK( 1+BW-BW ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL CTBTRS( UPLO, 'N', 'N', ODD_SIZE, $ BW, NRHS, $ A( OFST+1 ), $ LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ENDIF * End of "IF( LSAME( UPLO, 'L' ) )"... 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PCPBTRSV * END scalapack-2.0.2/SRC/pcpocon.f000644 000766 000024 00000036646 10363532303 016165 0ustar00juliestaff000000 000000 SUBROUTINE PCPOCON( UPLO, N, A, IA, JA, DESCA, ANORM, RCOND, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LRWORK, LWORK, N REAL ANORM, RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL RWORK( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCPOCON estimates the reciprocal of the condition number (in the * 1-norm) of a complex Hermitian positive definite distributed matrix * using the Cholesky factorization A = U**H*U or A = L*L**H computed by * PCPOTRF. * * An estimate is obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), and * the reciprocal of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the factor stored in * A(IA:IA+N-1,JA:JA+N-1) is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) COMPLEX pointer into the local memory to * an array of dimension ( LLD_A, LOCc(JA+N-1) ). On entry, this * array contains the local pieces of the factors L or U from * the Cholesky factorization A(IA:IA+N-1,JA:JA+N-1) = U'*U or * L*L', as computed by PCPOTRF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ANORM (global input) REAL * The 1-norm (or infinity-norm) of the hermitian distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * * RCOND (global output) REAL * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + * MAX( 2, MAX(NB_A*MAX(1,CEIL(P-1,Q)),LOCc(N+MOD(JA-1,NB_A)) + * NB_A*MAX(1,CEIL(Q-1,P))) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) REAL array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= 2*LOCc(N+MOD(JA-1,NB_A)). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPNL, IPNU, $ IPV, IPW, IPX, IROFF, IV, IX, IXX, JJA, JV, $ JX, KASE, LRWMIN, LWMIN, MYCOL, MYROW, NP, $ NPCOL, NPROW, NPMOD, NQ, NQMOD REAL AINVNM, SCALE, SL, SU, SMLNUM COMPLEX WMAX, ZDUM * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, CHK1MAT, $ DESCSET, INFOG2L, PCAMAX, PCHK1MAT, PCLATRS, $ PCLACON, PCSRSCL, PB_TOPGET, PB_TOPSET, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, ICHAR, MAX, MOD, REAL * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = REAL( LWMIN ) LRWMIN = 2*NQMOD RWORK( 1 ) = REAL( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( ANORM.LT.ZERO ) THEN INFO = -7 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 IF( LRWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 12 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPOCON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * RCOND = ZERO IF( N.EQ.0 ) THEN RCOND = ONE RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN ELSE IF( N.EQ.1 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPW = IPV + NP IPNL = 1 IPNU = IPNL + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Estimate the 1-norm (or I-norm) of inv(A). * AINVNM = ZERO KASE = 0 NORMIN = 'N' * 10 CONTINUE CALL PCLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), IX, JX, $ DESCX, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( UPPER ) THEN * * Multiply by inv(U'). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( 'Upper', 'Conjugate transpose', 'Non-unit', $ NORMIN, N, A, IA, JA, DESCA, WORK( IPX ), $ IX, JX, DESCX, SL, RWORK( IPNL ), $ WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL NORMIN = 'Y' * * Multiply by inv(U). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( 'Upper', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, RWORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(L). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( 'Lower', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, RWORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL NORMIN = 'Y' * * Multiply by inv(L'). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( 'Lower', 'Conjugate transpose', 'Non-unit', $ NORMIN, N, A, IA, JA, DESCA, WORK( IPX ), $ IX, JX, DESCX, SU, RWORK( IPNU ), $ WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF * * Multiply by 1/SCALE if doing so will not cause overflow. * SCALE = SL*SU IF( SCALE.NE.ONE ) THEN CALL PCAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL CGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1 ) ELSE CALL CGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1, $ IAROW, MYCOL ) END IF END IF IF( SCALE.LT.CABS1( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PCSRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / AINVNM ) / ANORM * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PCPOCON * END scalapack-2.0.2/SRC/pcpoequ.f000644 000766 000024 00000031250 10363532303 016162 0ustar00juliestaff000000 000000 SUBROUTINE PCPOEQU( N, A, IA, JA, DESCA, SR, SC, SCOND, AMAX, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, N REAL AMAX, SCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL SC( * ), SR( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCPOEQU computes row and column scalings intended to * equilibrate a distributed Hermitian positive definite matrix * sub( A ) = A(IA:IA+N-1,JA:JA+N-1) and reduce its condition number * (with respect to the two-norm). SR and SC contain the scale * factors, S(i) = 1/sqrt(A(i,i)), chosen so that the scaled distri- * buted matrix B with elements B(i,j) = S(i)*A(i,j)*S(j) has ones on * the diagonal. This choice of SR and SC puts the condition number * of B within a factor N of the smallest possible condition number * over all possible diagonal scalings. * * The scaling factor are stored along process rows in SR and along * process columns in SC. The duplication of information simplifies * greatly the application of the factors. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX pointer into the local memory to an * array of local dimension ( LLD_A, LOCc(JA+N-1) ), the * N-by-N Hermitian positive definite distributed matrix * sub( A ) whose scaling factors are to be computed. Only the * diagonal elements of sub( A ) are referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * SR (local output) REAL array, dimension LOCr(M_A) * If INFO = 0, SR(IA:IA+N-1) contains the row scale factors * for sub( A ). SR is aligned with the distributed matrix A, * and replicated across every process column. SR is tied to the * distributed matrix A. * * SC (local output) REAL array, dimension LOCc(N_A) * If INFO = 0, SC(JA:JA+N-1) contains the column scale factors * for A(IA:IA+M-1,JA:JA+N-1). SC is aligned with the distribu- * ted matrix A, and replicated down every process row. SC is * tied to the distributed matrix A. * * SCOND (global output) REAL * If INFO = 0, SCOND contains the ratio of the smallest SR(i) * (or SC(j)) to the largest SR(i) (or SC(j)), with * IA <= i <= IA+N-1 and JA <= j <= JA+N-1. If SCOND >= 0.1 * and AMAX is neither too large nor too small, it is not worth * scaling by SR (or SC). * * AMAX (global output) REAL * Absolute value of largest matrix element. If AMAX is very * close to overflow or very close to underflow, the matrix * should be scaled. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the K-th diagonal entry of sub( A ) is * nonpositive. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. CHARACTER ALLCTOP, COLCTOP, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURCOL, ICURROW, $ IDUMM, II, IIA, IOFFA, IOFFD, IROFF, J, JB, JJ, $ JJA, JN, LDA, LL, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ REAL AII, SMIN * .. * .. Local Arrays .. INTEGER DESCSC( DLEN_ ), DESCSR( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, IGAMN2D, $ INFOG2L, PCHK1MAT, PB_TOPGET, PXERBLA, $ SGAMN2D, SGAMX2D, SGSUM2D * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PSLAMCH EXTERNAL ICEIL, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(500+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, INFO ) CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, 0, IDUMM, IDUMM, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPOEQU', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN SCOND = ONE AMAX = ZERO RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'All', ALLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * * Compute some local indexes * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) * * Assign descriptors for SR and SC arrays * CALL DESCSET( DESCSR, N, 1, DESCA( MB_ ), 1, 0, 0, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCSC, 1, N, 1, DESCA( NB_ ), 0, 0, ICTXT, 1 ) * * Initialize the scaling factors to zero. * DO 10 II = IIA, IIA+NP-1 SR( II ) = ZERO 10 CONTINUE * DO 20 JJ = JJA, JJA+NQ-1 SC( JJ ) = ZERO 20 CONTINUE * * Find the minimum and maximum diagonal elements. * Handle first block separately. * II = IIA JJ = JJA JB = JN-JA+1 SMIN = ONE / PSLAMCH( ICTXT, 'S' ) AMAX = ZERO * IOFFA = II+(JJ-1)*LDA IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN IOFFD = IOFFA DO 30 LL = 0, JB-1 AII = REAL( A( IOFFD ) ) SR( II+LL ) = AII SC( JJ+LL ) = AII SMIN = MIN( SMIN, AII ) AMAX = MAX( AMAX, AII ) IF( AII.LE.ZERO .AND. INFO.EQ.0 ) $ INFO = LL + 1 IOFFD = IOFFD + LDA + 1 30 CONTINUE END IF * IF( MYROW.EQ.IAROW ) THEN II = II + JB IOFFA = IOFFA + JB END IF IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + JB IOFFA = IOFFA + JB*LDA END IF ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 50 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFD = IOFFA DO 40 LL = 0, JB-1 AII = REAL( A( IOFFD ) ) SR( II+LL ) = AII SC( JJ+LL ) = AII SMIN = MIN( SMIN, AII ) AMAX = MAX( AMAX, AII ) IF( AII.LE.ZERO .AND. INFO.EQ.0 ) $ INFO = J + LL - JA + 1 IOFFD = IOFFD + LDA + 1 40 CONTINUE END IF * IF( MYROW.EQ.ICURROW ) THEN II = II + JB IOFFA = IOFFA + JB END IF IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + JB IOFFA = IOFFA + JB*LDA END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 50 CONTINUE * * Compute scaling factors * CALL SGSUM2D( ICTXT, 'Columnwise', COLCTOP, 1, NQ, SC( JJA ), $ 1, -1, MYCOL ) CALL SGSUM2D( ICTXT, 'Rowwise', ROWCTOP, NP, 1, SR( IIA ), $ MAX( 1, NP ), -1, MYCOL ) * CALL SGAMX2D( ICTXT, 'All', ALLCTOP, 1, 1, AMAX, 1, IDUMM, IDUMM, $ -1, -1, MYCOL ) CALL SGAMN2D( ICTXT, 'All', ALLCTOP, 1, 1, SMIN, 1, IDUMM, IDUMM, $ -1, -1, MYCOL ) * IF( SMIN.LE.ZERO ) THEN * * Find the first non-positive diagonal element and return. * CALL IGAMN2D( ICTXT, 'All', ALLCTOP, 1, 1, INFO, 1, II, JJ, -1, $ -1, MYCOL ) RETURN * ELSE * * Set the scale factors to the reciprocals * of the diagonal elements. * DO 60 II = IIA, IIA+NP-1 SR( II ) = ONE / SQRT( SR( II ) ) 60 CONTINUE * DO 70 JJ = JJA, JJA+NQ-1 SC( JJ ) = ONE / SQRT( SC( JJ ) ) 70 CONTINUE * * Compute SCOND = min(S(I)) / max(S(I)) * SCOND = SQRT( SMIN ) / SQRT( AMAX ) * END IF * RETURN * * End of PCPOEQU * END scalapack-2.0.2/SRC/pcporfs.f000644 000766 000024 00000101326 10363532303 016164 0ustar00juliestaff000000 000000 SUBROUTINE PCPORFS( UPLO, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, B, IB, JB, DESCB, X, IX, JX, DESCX, $ FERR, BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, $ LRWORK, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ) COMPLEX A( * ), AF( * ), B( * ), WORK( * ), X( * ) REAL BERR( * ), FERR( * ), RWORK( * ) * .. * * Purpose * ======= * * PCPORFS improves the computed solution to a system of linear * equations when the coefficient matrix is Hermitian positive definite * and provides error bounds and backward error estimates for the * solutions. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) COMPLEX pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1) ). * This array contains the local pieces of the N-by-N Hermitian * distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input) COMPLEX pointer into the local memory * to an array of local dimension (LLD_AF,LOCc(JA+N-1)). * On entry, this array contains the factors L or U from the * Cholesky factorization sub( A ) = L*L**H or U**H*U, as * computed by PCPOTRF. * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * B (local input) COMPLEX pointer into the local memory * to an array of local dimension (LLD_B, LOCc(JB+NRHS-1) ). * On entry, this array contains the the local pieces of the * right hand sides sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input) COMPLEX pointer into the local memory * to an array of local dimension (LLD_X, LOCc(JX+NRHS-1) ). * On entry, this array contains the the local pieces of the * solution vectors sub( X ). On exit, it contains the * improved solution vectors. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). * The estimated forward error bound for each solution vector * of sub( X ). If XTRUE is the true solution corresponding * to sub( X ), FERR is an estimated upper bound for the * magnitude of the largest element in (sub( X ) - XTRUE) * divided by the magnitude of the largest element in sub( X ). * The estimate is as reliable as the estimate for RCOND, and * is almost always a slight overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr( N + MOD( IA-1, MB_A ) ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) REAL array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCr( N + MOD( IB-1, MB_B ) ). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices op( A ) and op( AF ) (respectively * sub( X ) and sub( B ) ) should be distributed the same way on the * same processes. These conditions ensure that sub( A ) and sub( AF ) * (resp. sub( X ) and sub( B ) ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( AF ), sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IAF, DESCAF( MB_ ) ) = f( JAF, DESCAF( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) REAL ZERO, RONE, TWO, THREE PARAMETER ( ZERO = 0.0E+0, RONE = 1.0E+0, TWO = 2.0E+0, $ THREE = 3.0E+0 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER COUNT, IACOL, IAFCOL, IAFROW, IAROW, IXBCOL, $ IXBROW, IXCOL, IXROW, ICOFFA, ICOFFAF, ICOFFB, $ ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, IIW, $ IOFFXB, IPB, IPR, IPV, IROFFA, IROFFAF, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LRWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ REAL EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN COMPLEX ZDUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, CHK1MAT, $ DESCSET, INFOG2L, PCAHEMV, PCAXPY, PCHK2MAT, $ PCCOPY, PCHEMV, PCPOTRS, PCLACON, $ PXERBLA, SGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, N, 2, IAF, JAF, DESCAF, 11, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 15, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IX, JX, DESCX, 19, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFAF = MOD( JAF-1, DESCAF( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFCOL = INDXG2P( JAF, DESCAF( NB_ ), MYCOL, $ DESCAF( CSRC_ ), NPCOL ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 2 * NPMOD LRWMIN = NPMOD WORK( 1 ) = CMPLX( REAL( LWMIN ) ) RWORK( 1 ) = REAL( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700 + NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCAF( MB_ ) ) THEN INFO = -( 1100 + MB_ ) ELSE IF( IROFFAF.NE.0 .OR. IAROW.NE.IAFROW ) THEN INFO = -9 ELSE IF( DESCA( NB_ ).NE.DESCAF( NB_ ) ) THEN INFO = -( 1100 + NB_ ) ELSE IF( ICOFFAF.NE.0 .OR. IACOL.NE.IAFCOL ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -( 1100 + CTXT_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -13 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1500 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1500 + CTXT_ ) ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 1900 + MB_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -17 ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 1900 + NB_ ) ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -18 ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 1900 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -25 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IDUM1( 2 ) = N IDUM2( 2 ) = 2 IDUM1( 3 ) = NRHS IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 23 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 25 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, N, 2, IAF, $ JAF, DESCAF, 11, 0, IDUM1, IDUM2, INFO ) CALL PCHK2MAT( N, 2, NRHS, 3, IB, JB, DESCB, 15, N, 2, NRHS, 3, $ IX, JX, DESCX, 19, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPORFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = 1 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = 1 + maximum number of nonzero entries in each row of sub( A ) * NZ = N + 1 EPS = PSLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) ) * DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 100 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 20 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X) * CALL PCCOPY( N, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PCHEMV( UPLO, N, -ONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, ONE, WORK( IPR ), IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i))/(abs(sub(A))*abs(sub(X))+abs(sub(B)) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to * the i-th components of the numerator and denominator * before dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIXB, IIXB + NP - 1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 30 CONTINUE END IF END IF * CALL PCAHEMV( UPLO, N, RONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, RONE, RWORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 40 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J) is larger than machine epsilon, and * 2) BERR(J) decreased by at least a factor of 2 during the * last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PCPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PCAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, IX, $ JX+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 20 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(sub(A)))* * ( abs(R) + * NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B)) ))) / norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(sub(A)) is the inverse of sub(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(sub(A))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(sub(A))*abs(sub(X)) + abs(sub(B)) is less than SAFE2. * * Use PCLACON to estimate the infinity-norm of the matrix * inv(sub(A)) * diag(W), where * W = abs(R) + NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B))))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 50 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 50 CONTINUE END IF END IF * KASE = 0 60 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL CGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PCLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(sub(A)'). * CALL PCPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF ELSE * * Multiply by inv(sub(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 80 CONTINUE END IF END IF * CALL PCPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 60 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 90 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 90 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 100 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 200 J = JN+1, JB+NRHS-1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 190 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 110 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub( B ) - sub( A )*sub( X ). * CALL PCCOPY( N, B, IB, J+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PCHEMV( UPLO, N, -ONE, A, IA, JA, DESCA, X, IX, J+K, $ DESCX, 1, ONE, WORK( IPR ), IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / * ( abs(sub(A))*abs(sub(X)) + abs(sub(B)) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIXB, IIXB+NP-1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 120 CONTINUE END IF END IF * CALL PCAHEMV( UPLO, N, RONE, A, IA, JA, DESCA, X, IX, J+K, $ DESCX, 1, RONE, RWORK( IPB ), IW, JW, DESCW, $ 1 ) * S = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 )THEN DO 130 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 130 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.ICURCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during * the last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. $ COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PCPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PCAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, $ IX, J+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 110 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(sub(A)))* * ( abs(R) + NZ*EPS*( * abs(sub(A))*abs(sub(X))+abs(sub(B)) )))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(sub(A)) is the inverse of sub(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of abs(R)+NZ*EPS*(abs(sub(A))*abs(sub(X)) * +abs(sub(B))) is incremented by SAFE1 if the i-th component * of abs(sub(A))*abs(sub(X)) + abs(sub(B)) is less than SAFE2. * * Use PCLACON to estimate the infinity-norm of the matrix * inv(sub(A)) * diag(W), where * W = abs(R) + NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B))))) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 140 CONTINUE END IF END IF * KASE = 0 150 CONTINUE IF( MYCOL.EQ.ICURCOL ) THEN CALL CGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, ICURCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PCLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = ICURCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(sub(A)'). * CALL PCPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 160 CONTINUE END IF END IF ELSE * * Multiply by inv(sub(A))*diag(W). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 170 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 170 CONTINUE END IF END IF * CALL PCPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 150 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 180 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 180 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, $ IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 190 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 200 CONTINUE * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) RWORK( 1 ) = REAL( LRWMIN ) * RETURN * * End of PCPORFS * END scalapack-2.0.2/SRC/pcposv.f000644 000766 000024 00000024322 10363532303 016022 0ustar00juliestaff000000 000000 SUBROUTINE PCPOSV( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ) * .. * * Purpose * ======= * * PCPOSV computes the solution to a complex system of linear equations * * sub( A ) * X = sub( B ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is an N-by-N * hermitian distributed positive definite matrix and X and sub( B ) * denoting B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS distributed * matrices. * * The Cholesky decomposition is used to factor sub( A ) as * * sub( A ) = U**H * U, if UPLO = 'U', or * * sub( A ) = L * L**H, if UPLO = 'L', * * where U is an upper triangular matrix and L is a lower triangular * matrix. The factored form of sub( A ) is then used to solve the * system of equations. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if INFO = 0, this array contains the * local pieces of the factor U or L from the Cholesky factori- * zation sub( A ) = U**H*U or L*L**H. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_B,LOC(JB+NRHS-1)). * On entry, the local pieces of the right hand sides distribu- * ted matrix sub( B ). On exit, if INFO = 0, sub( B ) is over- * written with the solution distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed, and the * solution has not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCPOTRF, $ PCPOTRS, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1000+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, $ 3, IB, JB, DESCB, 11, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPOSV', -INFO ) RETURN END IF * * Compute the Cholesky factorization sub( A ) = U'*U or L*L'. * CALL PCPOTRF( UPLO, N, A, IA, JA, DESCA, INFO ) * IF( INFO.EQ.0 ) THEN * * Solve the system sub( A ) * X = sub( B ) overwriting sub( B ) * with X. * CALL PCPOTRS( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * END IF * RETURN * * End of PCPOSV * END scalapack-2.0.2/SRC/pcposvx.f000644 000766 000024 00000065607 10363532303 016225 0ustar00juliestaff000000 000000 SUBROUTINE PCPOSVX( FACT, UPLO, N, NRHS, A, IA, JA, DESCA, AF, $ IAF, JAF, DESCAF, EQUED, SR, SC, B, IB, JB, $ DESCB, X, IX, JX, DESCX, RCOND, FERR, BERR, $ WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER EQUED, FACT, UPLO INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, LRWORK, $ LWORK, N, NRHS REAL RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), DESCX( * ) REAL BERR( * ), FERR( * ), SC( * ), $ SR( * ), RWORK( * ) COMPLEX A( * ), AF( * ), $ B( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PCPOSVX uses the Cholesky factorization A = U**H*U or A = L*L**H to * compute the solution to a complex system of linear equations * * A(IA:IA+N-1,JA:JA+N-1) * X = B(IB:IB+N-1,JB:JB+NRHS-1), * * where A(IA:IA+N-1,JA:JA+N-1) is an N-by-N matrix and X and * B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS matrices. * * Error bounds on the solution and a condition estimate are also * provided. In the following comments Y denotes Y(IY:IY+M-1,JY:JY+K-1) * a M-by-K matrix where Y can be A, AF, B and X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Description * =========== * * The following steps are performed: * * 1. If FACT = 'E', real scaling factors are computed to equilibrate * the system: * diag(SR) * A * diag(SC) * inv(diag(SC)) * X = diag(SR) * B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(SR)*A*diag(SC) and B by diag(SR)*B. * * 2. If FACT = 'N' or 'E', the Cholesky decomposition is used to * factor the matrix A (after equilibration if FACT = 'E') as * A = U**T* U, if UPLO = 'U', or * A = L * L**T, if UPLO = 'L', * where U is an upper triangular matrix and L is a lower triangular * matrix. * * 3. The factored form of A is used to estimate the condition number * of the matrix A. If the reciprocal of the condition number is * less than machine precision, steps 4-6 are skipped. * * 4. The system of equations is solved for X using the factored form * of A. * * 5. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 6. If equilibration was used, the matrix X is premultiplied by * diag(SR) so that it solves the original system before * equilibration. * * Arguments * ========= * * FACT (global input) CHARACTER * Specifies whether or not the factored form of the matrix A is * supplied on entry, and if not, whether the matrix A should be * equilibrated before it is factored. * = 'F': On entry, AF contains the factored form of A. * If EQUED = 'Y', the matrix A has been equilibrated * with scaling factors given by S. A and AF will not * be modified. * = 'N': The matrix A will be copied to AF and factored. * = 'E': The matrix A will be equilibrated if necessary, then * copied to AF and factored. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrices B and X. NRHS >= 0. * * A (local input/local output) COMPLEX pointer into * the local memory to an array of local dimension * ( LLD_A, LOCc(JA+N-1) ). * On entry, the Hermitian matrix A, except if FACT = 'F' and * EQUED = 'Y', then A must contain the equilibrated matrix * diag(SR)*A*diag(SC). If UPLO = 'U', the leading * N-by-N upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. A is not modified if * FACT = 'F' or 'N', or if FACT = 'E' and EQUED = 'N' on exit. * * On exit, if FACT = 'E' and EQUED = 'Y', A is overwritten by * diag(SR)*A*diag(SC). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input or local output) COMPLEX pointer * into the local memory to an array of local dimension * ( LLD_AF, LOCc(JA+N-1)). * If FACT = 'F', then AF is an input argument and on entry * contains the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T, in the same storage * format as A. If EQUED .ne. 'N', then AF is the factored form * of the equilibrated matrix diag(SR)*A*diag(SC). * * If FACT = 'N', then AF is an output argument and on exit * returns the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T of the original * matrix A. * * If FACT = 'E', then AF is an output argument and on exit * returns the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T of the equilibrated * matrix A (see the description of A for the form of the * equilibrated matrix). * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * EQUED (global input/global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration (always true if FACT = 'N'). * = 'Y': Equilibration was done, i.e., A has been replaced by * diag(SR) * A * diag(SC). * EQUED is an input variable if FACT = 'F'; otherwise, it is an * output variable. * * SR (local input/local output) COMPLEX array, * dimension (LLD_A) * The scale factors for A distributed across process rows; * not accessed if EQUED = 'N'. SR is an input variable if * FACT = 'F'; otherwise, SR is an output variable. * If FACT = 'F' and EQUED = 'Y', each element of SR must be * positive. * * SC (local input/local output) COMPLEX array, * dimension (LOC(N_A)) * The scale factors for A distributed across * process columns; not accessed if EQUED = 'N'. SC is an input * variable if FACT = 'F'; otherwise, SC is an output variable. * If FACT = 'F' and EQUED = 'Y', each element of SC must be * positive. * * B (local input/local output) COMPLEX pointer into * the local memory to an array of local dimension * ( LLD_B, LOCc(JB+NRHS-1) ). * On entry, the N-by-NRHS right-hand side matrix B. * On exit, if EQUED = 'N', B is not modified; if TRANS = 'N' * and EQUED = 'R' or 'B', B is overwritten by diag(R)*B; if * TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is overwritten * by diag(C)*B. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input/local output) COMPLEX pointer into * the local memory to an array of local dimension * ( LLD_X, LOCc(JX+NRHS-1) ). * If INFO = 0, the N-by-NRHS solution matrix X to the original * system of equations. Note that A and B are modified on exit * if EQUED .ne. 'N', and the solution to the equilibrated * system is inv(diag(SC))*X if TRANS = 'N' and EQUED = 'C' or * 'B', or inv(diag(SR))*X if TRANS = 'T' or 'C' and EQUED = 'R' * or 'B'. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * RCOND (global output) REAL * The estimate of the reciprocal condition number of the matrix * A after equilibration (if done). If RCOND is less than the * machine precision (in particular, if RCOND = 0), the matrix * is singular to working precision. This condition is * indicated by a return code of INFO > 0, and the solution and * error bounds are not computed. * * FERR (local output) REAL array, dimension (LOC(N_B)) * The estimated forward error bounds for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution, FERR(j) bounds the magnitude * of the largest entry in (X(j) - XTRUE) divided by * the magnitude of the largest entry in X(j). The quality of * the error bound depends on the quality of the estimate of * norm(inv(A)) computed in the code; if the estimate of * norm(inv(A)) is accurate, the error bound is guaranteed. * * BERR (local output) REAL array, dimension (LOC(N_B)) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any entry of A or B that makes X(j) an exact solution). * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = MAX( PCPOCON( LWORK ), PCPORFS( LWORK ) ) * + LOCr( N_A ). * LWORK = 3*DESCA( LLD_ ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) REAL array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK = 2*LOCc(N_A). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, and i is * <= N: if INFO = i, the leading minor of order i of A * is not positive definite, so the factorization * could not be completed, and the solution and error * bounds could not be computed. * = N+1: RCOND is less than machine precision. The * factorization has been completed, but the matrix * is singular to working precision, and the solution * and error bounds have not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL EQUIL, LQUERY, NOFACT, RCEQU INTEGER I, IACOL, IAROW, IAFROW, IBROW, IBCOL, ICOFF, $ ICOFFA, ICTXT, IDUMM, IIA, IIB, IIX, INFEQU, $ IROFF, IROFFA, IROFFAF, IROFFB, IROFFX, IXCOL, $ IXROW, J, JJA, JJB, JJX, LDB, LDX, LRWMIN, $ LWMIN, MYCOL, MYROW, NP, NPCOL, NPROW, NRHSQ, $ NQ REAL AMAX, ANORM, BIGNUM, SCOND, SMAX, SMIN, SMLNUM * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, INFOG2L, $ PCPOCON, PCPOEQU, $ PCPORFS, PCPOTRF, PCPOTRS, $ PCLACPY, PCLAQSY, PXERBLA, $ SGAMN2D, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC REAL PCLANHE, PSLAMCH EXTERNAL INDXG2P, LSAME, NUMROC, PCLANHE, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(800+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 8, INFO ) IF( LSAME( FACT, 'F' ) ) $ CALL CHK1MAT( N, 3, N, 3, IAF, JAF, DESCAF, 12, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 20, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IIA, JJA, IAROW, IACOL ) NP = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFFA LWMIN = 3*DESCA( LLD_ ) LRWMIN = MAX( 2*NQ, NP ) NOFACT = LSAME( FACT, 'N' ) EQUIL = LSAME( FACT, 'E' ) IF( NOFACT .OR. EQUIL ) THEN EQUED = 'N' RCEQU = .FALSE. ELSE RCEQU = LSAME( EQUED, 'Y' ) SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM END IF IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. $ .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( ICOFFA.NE.0 .OR. IROFFA.NE.ICOFFA ) THEN INFO = -7 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(800+NB_) ELSE IF( IAFROW.NE.IAROW ) THEN INFO = -10 ELSE IF( IROFFAF.NE.0 ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -(1200+CTXT_) ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. $ ( RCEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN INFO = -13 ELSE IF( RCEQU ) THEN * SMIN = BIGNUM SMAX = ZERO DO 10 J = IIA, IIA + NP - 1 SMIN = MIN( SMIN, SR( J ) ) SMAX = MAX( SMAX, SR( J ) ) 10 CONTINUE CALL SGAMN2D( ICTXT, 'Columnwise', ' ', 1, 1, SMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, SMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( SMIN.LE.ZERO ) THEN INFO = -14 ELSE IF( N.GT.0 ) THEN SCOND = MAX( SMIN, SMLNUM ) / MIN( SMAX, BIGNUM ) ELSE SCOND = ONE END IF END IF END IF END IF * WORK( 1 ) = REAL( LWMIN ) RWORK( 1 ) = REAL( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IBROW.NE.IAROW ) THEN INFO = -18 ELSE IF( IXROW.NE.IBROW ) THEN INFO = -22 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2000+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(2000+CTXT_) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -(2400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -28 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -30 END IF IDUM1( 1 ) = ICHAR( FACT ) IDUM2( 1 ) = 1 IDUM1( 2 ) = ICHAR( UPLO ) IDUM2( 2 ) = 2 IF( LSAME( FACT, 'F' ) ) THEN IDUM1( 3 ) = ICHAR( EQUED ) IDUM2( 3 ) = 13 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 28 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 30 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, $ 4, IB, JB, DESCB, 19, 5, IDUM1, IDUM2, $ INFO ) ELSE IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 28 IF( LRWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 30 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, $ 4, IB, JB, DESCB, 19, 4, IDUM1, IDUM2, $ INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPOSVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * IF( EQUIL ) THEN * * Compute row and column scalings to equilibrate the matrix A. * CALL PCPOEQU( N, A, IA, JA, DESCA, SR, SC, SCOND, AMAX, $ INFEQU ) * IF( INFEQU.EQ.0 ) THEN * * Equilibrate the matrix. * CALL PCLAQSY( UPLO, N, A, IA, JA, DESCA, SR, SC, SCOND, $ AMAX, EQUED ) RCEQU = LSAME( EQUED, 'Y' ) END IF END IF * * Scale the right-hand side. * CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, $ JJB, IBROW, IBCOL ) LDB = DESCB( LLD_ ) IROFF = MOD( IB-1, DESCB( MB_ ) ) ICOFF = MOD( JB-1, DESCB( NB_ ) ) NP = NUMROC( N+IROFF, DESCB( MB_ ), MYROW, IBROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFF, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) IF( MYROW.EQ.IBROW ) NP = NP-IROFF IF( MYCOL.EQ.IBCOL ) NRHSQ = NRHSQ-ICOFF * IF( RCEQU ) THEN DO 30 J = JJB, JJB+NRHSQ-1 DO 20 I = IIB, IIB+NP-1 B( I + ( J-1 )*LDB ) = SR( I )*B( I + ( J-1 )*LDB ) 20 CONTINUE 30 CONTINUE END IF * IF( NOFACT .OR. EQUIL ) THEN * * Compute the Cholesky factorization A = U'*U or A = L*L'. * CALL PCLACPY( 'Full', N, N, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF ) CALL PCPOTRF( UPLO, N, AF, IAF, JAF, DESCAF, INFO ) * * Return if INFO is non-zero. * IF( INFO.NE.0 ) THEN IF( INFO.GT.0 ) $ RCOND = ZERO RETURN END IF END IF * * Compute the norm of the matrix A. * ANORM = PCLANHE( '1', UPLO, N, A, IA, JA, DESCA, RWORK ) * * Compute the reciprocal of the condition number of A. * CALL PCPOCON( UPLO, N, AF, IAF, JAF, DESCAF, ANORM, RCOND, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * Return if the matrix is singular to working precision. * IF( RCOND.LT.PSLAMCH( ICTXT, 'Epsilon' ) ) THEN INFO = IA + N RETURN END IF * * Compute the solution matrix X. * CALL PCLACPY( 'Full', N, NRHS, B, IB, JB, DESCB, X, IX, JX, $ DESCX ) CALL PCPOTRS( UPLO, N, NRHS, AF, IAF, JAF, DESCAF, X, IX, JX, $ DESCX, INFO ) * * Use iterative refinement to improve the computed solution and * compute error bounds and backward error estimates for it. * CALL PCPORFS( UPLO, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, B, IB, JB, DESCB, X, IX, JX, DESCX, FERR, $ BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * Transform the solution matrix X to a solution of the original * system. * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) LDX = DESCX( LLD_ ) IROFF = MOD( IX-1, DESCX( MB_ ) ) ICOFF = MOD( JX-1, DESCX( NB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYROW.EQ.IBROW ) NP = NP-IROFF IF( MYCOL.EQ.IBCOL ) NRHSQ = NRHSQ-ICOFF * IF( RCEQU ) THEN DO 50 J = JJX, JJX+NRHSQ-1 DO 40 I = IIX, IIX+NP-1 X( I + ( J-1 )*LDX ) = SR( I )*X( I + ( J-1 )*LDX ) 40 CONTINUE 50 CONTINUE DO 60 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / SCOND 60 CONTINUE END IF * WORK( 1 ) = REAL( LWMIN ) RWORK( 1 ) = REAL( LRWMIN ) RETURN * * End of PCPOSVX * END scalapack-2.0.2/SRC/pcpotf2.f000644 000766 000024 00000031334 10363532303 016066 0ustar00juliestaff000000 000000 SUBROUTINE PCPOTF2( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCPOTF2 computes the Cholesky factorization of a complex hermitian * positive definite distributed matrix sub( A )=A(IA:IA+N-1,JA:JA+N-1). * * The factorization has the form * * sub( A ) = U' * U , if UPLO = 'U', or * * sub( A ) = L * L', if UPLO = 'L', * * where U is an upper triangular matrix and L is lower triangular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires N <= NB_A-MOD(JA-1, NB_A) and square block * decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if UPLO = 'U', the upper triangular * part of the distributed matrix contains the Cholesky factor * U, if UPLO = 'L', the lower triangular part of the distribu- * ted matrix contains the Cholesky factor L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) COMPLEX CONE PARAMETER ( CONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURR, IDIAG, IIA, $ IOFFA, IROFF, J, JJA, LDA, MYCOL, MYROW, $ NPCOL, NPROW REAL AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, CGEMV, $ CLACGV, CSSCAL, IGEBR2D, IGEBS2D, $ INFOG2L, PB_TOPGET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MOD, REAL, SQRT * .. * .. External Functions .. LOGICAL LSAME COMPLEX CDOTC EXTERNAL LSAME, CDOTC * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N+ICOFF.GT.DESCA( NB_ ) ) THEN INFO = -2 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPOTF2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Compute local information * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF ( UPPER ) THEN * * Process (IAROW, IACOL) owns block to be factorized * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN * * Compute the Cholesky factorization A = U'*U. * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * DO 10 J = JA, JA+N-1 * * Compute U(J,J) and test for non-positive-definiteness. * AJJ = REAL( A( IDIAG ) ) - $ CDOTC( J-JA, A( IOFFA ), 1, A( IOFFA ), 1 ) IF( AJJ.LE.ZERO ) THEN A( IDIAG ) = AJJ INFO = J - JA + 1 GO TO 20 END IF AJJ = SQRT( AJJ ) A( IDIAG ) = AJJ * * Compute elements J+1:JA+N-1 of row J. * IF( J.LT.JA+N-1 ) THEN ICURR = IDIAG + LDA CALL CLACGV( J-JA, A( IOFFA ), 1 ) CALL CGEMV( 'Transpose', J-JA, JA+N-J-1, -CONE, $ A( IOFFA+LDA ), LDA, A( IOFFA ), 1, $ CONE, A( ICURR ), LDA ) CALL CLACGV( J-JA, A( IOFFA ), 1 ) CALL CSSCAL( JA+N-J-1, ONE / AJJ, A( ICURR ), $ LDA ) END IF IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + LDA 10 CONTINUE * 20 CONTINUE * * Broadcast INFO to all processes in my IAROW. * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1, $ MYROW, IACOL ) END IF * * IAROW bcasts along columns so that everyone has INFO * CALL IGEBS2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, 1, $ IAROW, MYCOL ) * END IF * ELSE * * Process (IAROW, IACOL) owns block to be factorized * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Compute the Cholesky factorization A = L*L'. * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * DO 30 J = JA, JA+N-1 * * Compute L(J,J) and test for non-positive-definiteness. * AJJ = REAL( A( IDIAG ) ) - $ CDOTC( J-JA, A( IOFFA ), LDA, A( IOFFA ), LDA ) IF ( AJJ.LE.ZERO ) THEN A( IDIAG ) = AJJ INFO = J - JA + 1 GO TO 40 END IF AJJ = SQRT( AJJ ) A( IDIAG ) = AJJ * * Compute elements J+1:JA+N-1 of column J. * IF( J.LT.JA+N-1 ) THEN ICURR = IDIAG + 1 CALL CLACGV( J-JA, A( IOFFA ), LDA ) CALL CGEMV( 'No transpose', JA+N-J-1, J-JA, -CONE, $ A( IOFFA+1 ), LDA, A( IOFFA ), LDA, $ CONE, A( ICURR ), 1 ) CALL CLACGV( J-JA, A( IOFFA ), LDA ) CALL CSSCAL( JA+N-J-1, ONE / AJJ, A( ICURR ), 1 ) END IF IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + 1 30 CONTINUE * 40 CONTINUE * * Broadcast INFO to everyone in IACOL * CALL IGEBS2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, $ 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, $ 1, IAROW, MYCOL ) * END IF * * IACOL bcasts INFO along rows so that everyone has it * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1, $ MYROW, IACOL ) * END IF * END IF * RETURN * * End of PCPOTF2 * END scalapack-2.0.2/SRC/pcpotrf.f000644 000766 000024 00000031645 10363532303 016173 0ustar00juliestaff000000 000000 SUBROUTINE PCPOTRF( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCPOTRF computes the Cholesky factorization of an N-by-N complex * hermitian positive definite distributed matrix sub( A ) denoting * A(IA:IA+N-1, JA:JA+N-1). * * The factorization has the form * * sub( A ) = U' * U , if UPLO = 'U', or * * sub( A ) = L * L', if UPLO = 'L', * * where U is an upper triangular matrix and L is lower triangular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if UPLO = 'U', the upper triangular * part of the distributed matrix contains the Cholesky factor * U, if UPLO = 'L', the lower triangular part of the distribu- * ted matrix contains the Cholesky factor L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER I, ICOFF, ICTXT, IROFF, J, JB, JN, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PCPOTF2, PCHERK, PCTRSM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPOTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( UPPER ) THEN * * Split-ring topology for the communication along process * columns, 1-tree topology along process rows. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'S-ring' ) * * A is upper triangular, compute Cholesky factorization A = U'*U. * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA(NB_), JA+N-1 ) JB = JN - JA + 1 * * Perform unblocked Cholesky factorization on JB block * CALL PCPOTF2( UPLO, JB, A, IA, JA, DESCA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 * IF( JB+1.LE.N ) THEN * * Form the row panel of U using the triangular solver * CALL PCTRSM( 'Left', UPLO, 'Conjugate transpose', $ 'Non-Unit', JB, N-JB, CONE, A, IA, JA, DESCA, $ A, IA, JA+JB, DESCA ) * * Update the trailing matrix, A = A - U'*U * CALL PCHERK( UPLO, 'Conjugate transpose', N-JB, JB, -ONE, A, $ IA, JA+JB, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) END IF * * Loop over remaining block of columns * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Perform unblocked Cholesky factorization on JB block * CALL PCPOTF2( UPLO, JB, A, I, J, DESCA, INFO ) IF( INFO.NE.0 ) THEN INFO = INFO + J - JA GO TO 30 END IF * IF( J-JA+JB+1.LE.N ) THEN * * Form the row panel of U using the triangular solver * CALL PCTRSM( 'Left', UPLO, 'Conjugate transpose', $ 'Non-Unit', JB, N-J-JB+JA, CONE, A, I, J, $ DESCA, A, I, J+JB, DESCA ) * * Update the trailing matrix, A = A - U'*U * CALL PCHERK( UPLO, 'Conjugate transpose', N-J-JB+JA, JB, $ -ONE, A, I, J+JB, DESCA, ONE, A, I+JB, $ J+JB, DESCA ) END IF 10 CONTINUE * ELSE * * 1-tree topology for the communication along process columns, * Split-ring topology along process rows. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * A is lower triangular, compute Cholesky factorization A = L*L' * (right-looking) * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) JB = JN - JA + 1 * * Perform unblocked Cholesky factorization on JB block * CALL PCPOTF2( UPLO, JB, A, IA, JA, DESCA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 * IF( JB+1.LE.N ) THEN * * Form the column panel of L using the triangular solver * CALL PCTRSM( 'Right', UPLO, 'Conjugate transpose', $ 'Non-Unit', N-JB, JB, CONE, A, IA, JA, DESCA, $ A, IA+JB, JA, DESCA ) * * Update the trailing matrix, A = A - L*L' * CALL PCHERK( UPLO, 'No Transpose', N-JB, JB, -ONE, A, IA+JB, $ JA, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * END IF * DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Perform unblocked Cholesky factorization on JB block * CALL PCPOTF2( UPLO, JB, A, I, J, DESCA, INFO ) IF( INFO.NE.0 ) THEN INFO = INFO + J - JA GO TO 30 END IF * IF( J-JA+JB+1.LE.N ) THEN * * Form the column panel of L using the triangular solver * CALL PCTRSM( 'Right', UPLO, 'Conjugate transpose', $ 'Non-Unit', N-J-JB+JA, JB, CONE, A, I, J, $ DESCA, A, I+JB, J, DESCA ) * * Update the trailing matrix, A = A - L*L' * CALL PCHERK( UPLO, 'No Transpose', N-J-JB+JA, JB, -ONE, $ A, I+JB, J, DESCA, ONE, A, I+JB, J+JB, $ DESCA ) * END IF 20 CONTINUE * END IF * 30 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PCPOTRF * END scalapack-2.0.2/SRC/pcpotri.f000644 000766 000024 00000017153 10363532303 016174 0ustar00juliestaff000000 000000 SUBROUTINE PCPOTRI( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCPOTRI computes the inverse of a complex Hermitian positive definite * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the * Cholesky factorization sub( A ) = U**H*U or L*L**H computed by * PCPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor U or L * from the Cholesky factorization of the distributed matrix * sub( A ) = U**H*U or L*L**H, as computed by PCPOTRF. * On exit, the local pieces of the upper or lower triangle of * the (Hermitian) inverse of sub( A ), overwriting the input * factor U or L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, the (i,i) element of the factor U or L is * zero, and the inverse could not be computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER ICOFF, ICTXT, IROFF, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCLAUUM, $ PCTRTRI, PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.NE.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPOTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Invert the triangular Cholesky factor U or L. * CALL PCTRTRI( UPLO, 'Non-unit', N, A, IA, JA, DESCA, INFO ) * IF( INFO.GT.0 ) $ RETURN * * Form inv(U)*inv(U)' or inv(L)'*inv(L). * CALL PCLAUUM( UPLO, N, A, IA, JA, DESCA ) * RETURN * * End of PCPOTRI * END scalapack-2.0.2/SRC/pcpotrs.f000644 000766 000024 00000024017 10363532303 016203 0ustar00juliestaff000000 000000 SUBROUTINE PCPOTRS( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ) * .. * * Purpose * ======= * * PCPOTRS solves a system of linear equations * * sub( A ) * X = sub( B ) * A(IA:IA+N-1,JA:JA+N-1)*X = B(IB:IB+N-1,JB:JB+NRHS-1) * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is a N-by-N * hermitian positive definite distributed matrix using the Cholesky * factorization sub( A ) = U**H*U or L*L**H computed by PCPOTRF. * sub( B ) denotes the distributed matrix B(IB:IB+N-1,JB:JB+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) COMPLEX pointer into local memory to * an array of dimension (LLD_A, LOCc(JA+N-1)). On entry, this * array contains the factors L or U from the Cholesky facto- * rization sub( A ) = L*L**H or U**H*U, as computed by PCPOTRF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX pointer into the * local memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * the local pieces of the right hand sides sub( B ). * On exit, this array contains the local pieces of the solution * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IAROW, IBROW, ICTXT, IROFFA, IROFFB, ICOFFA, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCTRSM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 11, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1100+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, $ 3, IB, JB, DESCB, 11, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPOTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Solve sub( A ) * X = sub( B ) where sub( A ) = U'*U. * * Solve U'*X = sub( B ), overwriting sub( B ) with X. * CALL PCTRSM( 'Left', 'Upper', 'Conjugate transpose', $ 'Non-unit', N, NRHS, ONE, A, IA, JA, DESCA, B, IB, $ JB, DESCB ) * * Solve U*X = sub( B ), overwriting sub( B ) with X. * CALL PCTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) ELSE * * Solve sub( A ) *X = sub( B ) where sub( A ) = L*L'. * * Solve L*X = sub( B ), overwriting sub( B ) with X. * CALL PCTRSM( 'Left', 'Lower', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve L'*X = sub( B ), overwriting sub( B ) with X. * CALL PCTRSM( 'Left', 'Lower', 'Conjugate transpose', $ 'Non-unit', N, NRHS, ONE, A, IA, JA, DESCA, B, IB, $ JB, DESCB ) END IF * RETURN * * End of PCPOTRS * END scalapack-2.0.2/SRC/pcptsv.f000644 000766 000024 00000046041 10363532303 016031 0ustar00juliestaff000000 000000 SUBROUTINE PCPTSV( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX B( * ), E( * ), WORK( * ) REAL D( * ) * .. * * * Purpose * ======= * * PCPTSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * tridiagonal symmetric positive definite distributed * matrix. * * Cholesky factorization is used to factor a reordering of * the matrix into L L'. * * See PCPTTRF and PCPTTRS for details. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) COMPLEX pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) COMPLEX pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (12*NPCOL + 3*NB) * +max((10+2*min(100,NRHS))*NPCOL+4*NRHS, 8*NPCOL) * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PCPTTRF, PCPTTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PCPTTRF and PCPTTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSEIF( DESCA( DTYPE_ ) .EQ. 502 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 5*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PCPTSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (12*NPCOL + 3*NB) * * Factor the matrix * CALL PCPTTRF( N, D, E, JA, DESCA, WORK, MIN( LWORK, WS_FACTOR ), $ WORK( 1+WS_FACTOR ), LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PCPTSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PCPTTRS( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCPTSV', -INFO ) RETURN END IF * RETURN * * End of PCPTSV * END scalapack-2.0.2/SRC/pcpttrf.f000644 000766 000024 00000105135 11745552113 016202 0ustar00juliestaff000000 000000 SUBROUTINE PCPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK, $ INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX AF( * ), E( * ), WORK( * ) REAL D( * ) * .. * * * Purpose * ======= * * PCPTTRF computes a Cholesky factorization * of an N-by-N complex tridiagonal * symmetric positive definite distributed matrix * A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PCPTTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = U' D U or * * P A(1:N, JA:JA+N-1) P^T = L D L', * * where U is a tridiagonal upper triangular matrix and L is tridiagonal * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * D (local input/local output) COMPLEX pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) COMPLEX pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCPTTRF and this is stored in AF. If a linear system * is to be solved using PCPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 8*NPCOL * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, INT_TEMP, JA_NEW, $ LAF_MIN, LEVEL_DIST, LLDA, MYCOL, MYROW, $ MY_NUM_COLS, NB, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, PART_OFFSET, PART_SIZE, RETURN_CODE, $ STORE_N_A, TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ CAXPY, CGEMM, CGERV2D, CGESD2D, CLAMOV, $ CLATCPY, CPBTRF, CPOTRF, CSYRK, CTBTRS, CTRMM, $ CTRRV2D, CTRSD2D, CTRSM, CTRTRS, DESC_CONVERT, $ GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 5*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LWORK .LT. -1) THEN INFO = -9 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -1 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 5*100 + 6 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 5*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, $ 'PCPTTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 5*100+4 ) CALL PXERBLA( ICTXT, $ 'PCPTTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = (12*NPCOL + 3*NB) * IF( LAF .LT. LAF_MIN ) THEN INFO = -7 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PCPTTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = 8*NPCOL * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -9 CALL PXERBLA( ICTXT, $ 'PCPTTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 7, 1 ) = DESCA(5) PARAM_CHECK( 6, 1 ) = DESCA(4) PARAM_CHECK( 5, 1 ) = DESCA(3) PARAM_CHECK( 4, 1 ) = DESCA(1) PARAM_CHECK( 3, 1 ) = JA PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 7, 2 ) = 505 PARAM_CHECK( 6, 2 ) = 504 PARAM_CHECK( 5, 2 ) = 503 PARAM_CHECK( 4, 2 ) = 501 PARAM_CHECK( 3, 2 ) = 4 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 9 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 7, PARAM_CHECK, 7, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCPTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * * Zero out space for fillin * DO 10 I=1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * IF ( MYCOL .LT. NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL CTRSD2D( ICTXT, 'U', 'N', 1, 1, $ E( PART_OFFSET+ODD_SIZE+1 ), LLDA-1, 0, $ MYCOL+1 ) * ENDIF * * * Factor main partition A_i = L_i {L_i}^C in each processor * Or A_i = {U_i}^C {U_i} if E is the upper superdiagonal * CALL CPTTRF( ODD_SIZE, D( PART_OFFSET+1 ), E( PART_OFFSET+1 ), $ INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1500 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * * Perform the triangular system solve {L_i}{{B'}_i}^C = {B_i}^C * by dividing B_i by diagonal element * E( PART_OFFSET+ODD_SIZE ) = E( PART_OFFSET+ODD_SIZE )/ $ D( PART_OFFSET+ODD_SIZE ) * * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^C * D( PART_OFFSET+ODD_SIZE+1 ) = D( PART_OFFSET+ODD_SIZE+1 )- $ D( PART_OFFSET+ODD_SIZE )*REAL( E( PART_OFFSET+ODD_SIZE )* $ CONJG( E( PART_OFFSET+ODD_SIZE ) ) ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1500 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL CTRRV2D( ICTXT, 'U', 'N', 1, 1, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF (INFO.EQ.0) THEN * * Calculate the "spike" fillin, ${L_i} {{G}_i}^C = {D_i}$ . * CALL CPTTRSV( 'L', 'N', ODD_SIZE, INT_ONE, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), AF( 1 ), ODD_SIZE, INFO ) * * Divide by D * DO 20 I=1, ODD_SIZE AF( I ) = AF( I )/D( PART_OFFSET+I ) 20 CONTINUE * * * Calculate the update block for previous proc, E_i = G_i{G_i}^C * * * Since there is no element-by-element vector multiplication in * the BLAS, this loop must be hardwired in without a BLAS call * INT_TEMP = ODD_SIZE*INT_ONE+2+1 AF( INT_TEMP ) = 0 * DO 30 I=1, ODD_SIZE AF( INT_TEMP ) = AF( INT_TEMP )-D( PART_OFFSET+I )* $ ( AF( I )*CONJG( AF( I ) ) ) 30 CONTINUE * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+3 ), $ INT_ONE, 0, MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * {F_i}^C = {H_i}{{B'}_i}^C * AF( ODD_SIZE+1 ) = $ - D( PART_OFFSET+ODD_SIZE ) $ * CONJG( E( PART_OFFSET+ODD_SIZE ) $ * AF( ODD_SIZE ) ) * * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * ENDIF * * Copy last diagonal block into AF storage for subsequent * operations. * AF( ODD_SIZE+2 ) = $ CMPLX( D( PART_OFFSET+ODD_SIZE+1 ) ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+AF( ODD_SIZE+3 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+WORK( 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+WORK( 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... IF( AF( ODD_SIZE+2 ) .EQ. CZERO ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * AF( ODD_SIZE+3 ) = AF( ODD_SIZE+1 ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * AF( ODD_SIZE+1 ) = AF( ODD_SIZE+1 )/AF( ODD_SIZE+2 ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * WORK( 1 ) = -ONE*AF( ODD_SIZE+1 )*AF( ODD_SIZE+2 ) $ *CONJG( AF( ODD_SIZE+1 ) ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL CGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * AF( ODD_SIZE+3 ) = ( AF( ODD_SIZE+3 ) ) $ /AF( ODD_SIZE+2 ) * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*AF( ODD_SIZE+2 ) $ *CONJG( AF( ODD_SIZE+3 ) ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 ) $ * AF( ODD_SIZE+2 ) $ * AF( ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * * 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PCPTTRF * END scalapack-2.0.2/SRC/pcpttrs.f000644 000766 000024 00000067223 10363532303 016216 0ustar00juliestaff000000 000000 SUBROUTINE PCPTTRS( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, $ AF, LAF, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 7, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX AF( * ), B( * ), E( * ), WORK( * ) REAL D( * ) * .. * * * Purpose * ======= * * PCPTTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PCPTTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * tridiagonal symmetric positive definite distributed * matrix. * Depending on the value of UPLO, A stores either U or L in the equn * A(1:N, JA:JA+N-1) = U'D *U or L*D L' as computed by PCPTTRF. * * Routine PCPTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) COMPLEX pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) COMPLEX pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * IMPORTANT NOTE: The current version of this code supports * only IB=JA * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCPTTRF and this is stored in AF. If a linear system * is to be solved using PCPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (10+2*min(100,NRHS))*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, I, ICTXT, ICTXT_NEW, $ ICTXT_SAVE, IDUM1, IDUM3, JA_NEW, LLDA, LLDB, $ MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 15, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, $ PCPTTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 9*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 9*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 9*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 9*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -13 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 9*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 9*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -3 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -5 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PCPTTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PCPTTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ (10+2*MIN(100,NRHS))*NPCOL+4*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -13 CALL PXERBLA( ICTXT, $ 'PCPTTRS: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 15, 1 ) = DESCB(5) PARAM_CHECK( 14, 1 ) = DESCB(4) PARAM_CHECK( 13, 1 ) = DESCB(3) PARAM_CHECK( 12, 1 ) = DESCB(2) PARAM_CHECK( 11, 1 ) = DESCB(1) PARAM_CHECK( 10, 1 ) = IB PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = NRHS PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 15, 2 ) = 905 PARAM_CHECK( 14, 2 ) = 904 PARAM_CHECK( 13, 2 ) = 903 PARAM_CHECK( 12, 2 ) = 902 PARAM_CHECK( 11, 2 ) = 901 PARAM_CHECK( 10, 2 ) = 8 PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 13 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 15, PARAM_CHECK, 15, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCPTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PCPTTRSV( 'L', 'N', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * ELSE * CALL PCPTTRSV( 'U', 'C', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * ENDIF * * Divide by the main diagonal: B <- D^{-1} B * * The main partition is first * DO 10 I=PART_OFFSET+1, PART_OFFSET+ODD_SIZE CALL CSCAL( NRHS, CMPLX( CONE/D( I ) ), B( I ), LLDB ) 10 CONTINUE * * Reduced system is next * IF( MYCOL .LT. NPCOL-1 ) THEN I=PART_OFFSET+ODD_SIZE+1 CALL CSCAL( NRHS, CONE/AF( ODD_SIZE+2 ), B( I ), LLDB ) ENDIF * * Call backsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PCPTTRSV( 'L', 'C', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * ELSE * CALL PCPTTRSV( 'U', 'N', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * ENDIF 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PCPTTRS * END scalapack-2.0.2/SRC/pcpttrsv.f000644 000766 000024 00000140170 11745552113 016403 0ustar00juliestaff000000 000000 SUBROUTINE PCPTTRSV( UPLO, TRANS, N, NRHS, D, E, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX AF( * ), B( * ), E( * ), WORK( * ) REAL D( * ) * .. * * * Purpose * ======= * * PCPTTRSV solves a tridiagonal triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a tridiagonal * triangular matrix factor produced by the * Cholesky factorization code PCPTTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H * is dictated by the user by the parameter TRANS. * * Routine PCPTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) COMPLEX pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) COMPLEX pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PCPTTRF and this is stored in AF. If a linear system * is to be solved using PCPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (10+2*min(100,NRHS))*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) COMPLEX CONE, CZERO PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_M_B, STORE_N_A, $ TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ CGEMM, CGERV2D, CGESD2D, CLAMOV, CMATADD, $ CTBTRS, CTRMM, CTRTRS, DESC_CONVERT, GLOBCHK, $ PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -2 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -3 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -4 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PCPTTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PCPTTRSV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ INT_ONE*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, $ 'PCPTTRSV: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1105 PARAM_CHECK( 15, 2 ) = 1104 PARAM_CHECK( 14, 2 ) = 1103 PARAM_CHECK( 13, 2 ) = 1102 PARAM_CHECK( 12, 2 ) = 1101 PARAM_CHECK( 11, 2 ) = 10 PARAM_CHECK( 10, 2 ) = 805 PARAM_CHECK( 9, 2 ) = 804 PARAM_CHECK( 8, 2 ) = 803 PARAM_CHECK( 7, 2 ) = 801 PARAM_CHECK( 6, 2 ) = 7 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 14 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PCPTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * IF ( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL CPTTRSV( UPLO, 'N', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL CAXPY( NRHS, -E( PART_OFFSET+ODD_SIZE ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL CGEMM( 'C', 'N', 1, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+INT_ONE-1 ), INT_ONE ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL CTRTRS( 'L', 'N', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*1+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL CGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*1+2+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 27 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 26 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 27 26 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*1+2+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*1+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL CTRTRS( 'L', 'C', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 22 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 21 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 22 21 CONTINUE * [End of GOTO Loop] * 24 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL CGEMM( 'N', 'N', ODD_SIZE, NRHS, 1, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+INT_ONE-1 ), INT_ONE, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL CAXPY( NRHS, -CONJG( E( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL CPTTRSV( UPLO, 'C', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF ( LSAME( TRANS, 'C' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL CPTTRSV( UPLO, 'C', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL CAXPY( NRHS, -CONJG( E( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL CGEMM( 'T', 'N', 1, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+INT_ONE-1 ), INT_ONE ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 44 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 42 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 41 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * CALL CMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 42 41 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL CTRTRS( 'L', 'N', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*1+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*1+2+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 44 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 54 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 57 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 56 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 57 56 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL CGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*1+2+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL CGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*1+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL CTRTRS( 'L', 'C', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 52 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 51 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 52 51 CONTINUE * [End of GOTO Loop] * 54 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL CGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL CGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL CGEMM( 'C', 'N', ODD_SIZE, NRHS, 1, -CONE, AF( 1 ), $ INT_ONE, WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL CAXPY( NRHS, -E( PART_OFFSET+ODD_SIZE ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL CPTTRSV( UPLO, 'N', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ENDIF * End of "IF( LSAME( UPLO, 'L' ) )"... 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PCPTTRSV * END scalapack-2.0.2/SRC/pcrot.c000644 000766 000024 00000042253 10602576752 015653 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- Mark R. Fahey * June 28, 2000 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pcrot_( n, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy, c, s ) /* * Mark Fahey * June 22, 2000 */ /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; float * c; complex * s; /* * .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex X[], Y[]; { /* * Purpose * ======= * * PCROT applies a plane rotation, where the cos (C) is real and the * sin (S) is complex, and the vectors CX and CY are complex, i.e., * * [ sub( X ) ] := [ C S ] [ sub( X ) ] * [ sub( Y ) ] := [ -conjg(S) C ] [ sub( Y ) ] * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y, * * and where C*C + S*CONJG(S) = 1.0. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * Arguments * ========= * * N (input) INTEGER * The number of elements in the vectors CX and CY. * * X (local input) COMPLEX array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * On output, CX is overwritten with C*X + S*Y. * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input) COMPLEX array containing the local * pieces of a distributed matrix of dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * On output, CY is overwritten with -CONJG(S)*X + C*Y. * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * C (input) pointer to FLOAT * S (input) pointer COMPLEX * C and S define a rotation * [ C S ] * [ -conjg(S) C ] * where C*C + S*CONJG(S) = 1.0. * * ===================================================================== * * .. Local Scalars .. */ int ictxt, iix, iiy, info, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, mycol, myrow, nn, np, np0, nprow, npcol, nq, nz, ione=1, tmp1, wksz; complex xwork[1], ywork[1], zero; /* .. * .. PBLAS Buffer .. */ complex * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void cgerv2d_(); void cgesd2d_(); void pbchkvect(); void PB_Cabort(); char * getpbbuf(); F_INTG_FCT pbctrnv_(); F_INTG_FCT crot_(); F_INTG_FCT ilcm_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(500+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 5, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 10, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -9; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1000+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -8; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1000+MB_+1); } else { info = -11; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -9; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1000+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -8; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1000+MB_+1); } else { info = -11; } } else { info = -6; } } if( ictxt != desc_Y[CTXT_] ) info = -(1000+CTXT_+1); } } if( info ) { PB_Cabort( ictxt, "PCROT", info ); return; } /* if( info ) { pberror_( &ictxt, "PCROT", &info ); return; } */ /* * Quick return if possible. */ zero.re = ZERO; zero.im = ZERO; if( *n == 0 ) return; /* * rotation */ if( *n == 1 ) { if( ( myrow == ixrow ) && ( mycol == ixcol ) ) { buff = &X[iix-1+(jjx-1)*desc_X[LLD_]]; if( ( myrow != iyrow ) || ( mycol != iycol ) ) { cgesd2d_( &ictxt, n, n, buff, n, &iyrow, &iycol ); cgerv2d_( &ictxt, n, n, ywork, n, &iyrow, &iycol ); } else *ywork = Y[iiy-1+(jjy-1)*desc_Y[LLD_]]; crot_( n, buff, n, ywork, n, c, s ); X[iix-1+(jjx-1)*desc_X[LLD_]] = *buff; if( ( myrow == iyrow ) && ( mycol == iycol ) ) Y[iiy-1+(jjy-1)*desc_Y[LLD_]] = *ywork; } else if( ( myrow == iyrow ) && ( mycol == iycol ) ) { cgesd2d_( &ictxt, n, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, &ixrow, &ixcol ); cgerv2d_( &ictxt, n, n, xwork, n, &ixrow, &ixcol ); crot_( n, xwork, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, c, s ); } return; } if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) { crot_( &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], c, s ); } } else { if( myrow == ixrow ) { cgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); buff = (complex *)getpbbuf( "PCROT", nq*sizeof(complex) ); cgerv2d_( &ictxt, &nq, &ione, buff, &nq, &iyrow, &mycol ); crot_( &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], buff, &ione, c, s ); } else if( myrow == iyrow ) { cgesd2d_( &ictxt, &ione, &nq, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &mycol ); buff = (complex *)getpbbuf( "PCROT", nq*sizeof(complex) ); cgerv2d_( &ictxt, &nq, &ione, buff, &nq, &ixrow, &mycol ); crot_( &nq, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], c, s ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) { crot_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy, c, s ); } } else { if( mycol == ixcol ) { cgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); buff = (complex *)getpbbuf( "PCROT", np*sizeof(complex) ); cgerv2d_( &ictxt, &np, &ione, buff, &np, &myrow, &iycol ); crot_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione, c, s ); } else if( mycol == iycol ) { cgesd2d_( &ictxt, &np, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &myrow, &ixcol ); buff = (complex *)getpbbuf( "PCROT", np*sizeof(complex) ); cgerv2d_( &ictxt, &np, &ione, buff, &np, &myrow, &ixcol ); crot_( &np, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy, c, s ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmp = lcm / nprow; nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_X[MB_], nprow ); tmp1 = np0 / desc_X[MB_]; wksz = MYROC0( tmp1, np0, desc_X[MB_], lcmp ); wksz = np + wksz; buff = (complex *)getpbbuf( "PCROT", wksz*sizeof(complex) ); if( mycol == iycol ) jjy -= nz; if( myrow == ixrow ) np -= nz; pbctrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &zero, buff, &ione, &iyrow, &iycol, &ixrow, &ixcol, buff+np ); if( mycol == ixcol ) { crot_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione, c, s ); } pbctrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, buff, &ione, &zero, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &ixcol, &iyrow, &iycol, buff+np ); } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*jx-1) % desc_X[NB_]; nn = *n + nz; tmp1 = nn / desc_X[MB_]; np = numroc_( &nn, desc_Y+MB_, &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex *)getpbbuf( "PCROT", wksz*sizeof(complex) ); if( myrow == iyrow ) np -= nz; pbctrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { crot_( &np, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy, c, s ); } pbctrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, buff, &ione, &zero, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &iycol, &ixrow, &ixcol, buff+np ); } } } scalapack-2.0.2/SRC/pcsrscl.f000644 000766 000024 00000016265 10363532303 016170 0ustar00juliestaff000000 000000 SUBROUTINE PCSRSCL( N, SA, SX, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N REAL SA * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX SX( * ) * .. * * Purpose * ======= * * PCSRSCL multiplies an N-element complex distributed vector * sub( X ) by the real scalar 1/a. This is done without overflow or * underflow as long as the final sub( X )/a does not overflow or * underflow. * * where sub( X ) denotes X(IX:IX+N-1,JX:JX), if INCX = 1, * X(IX:IX,JX:JX+N-1), if INCX = M_X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) pointer to INTEGER * The number of components of the distributed vector sub( X ). * N >= 0. * * SA (global input) REAL * The scalar a which is used to divide each component of * sub( X ). SA must be >= 0, or the subroutine will divide by * zero. * * SX (local input/local output) COMPLEX array * containing the local pieces of a distributed matrix of * dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL DONE INTEGER ICTXT, MYCOL, MYROW, NPCOL, NPROW REAL BIGNUM, CDEN, CDEN1, CNUM, CNUM1, MUL, SMLNUM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PCSSCAL, PSLABAD * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Get machine parameters * SMLNUM = PSLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM CALL PSLABAD( ICTXT, SMLNUM, BIGNUM ) * * Initialize the denominator to SA and the numerator to 1. * CDEN = SA CNUM = ONE * 10 CONTINUE CDEN1 = CDEN*SMLNUM CNUM1 = CNUM / BIGNUM IF( ABS( CDEN1 ).GT.ABS( CNUM ) .AND. CNUM.NE.ZERO ) THEN * * Pre-multiply sub( X ) by SMLNUM if CDEN is large compared to * CNUM. * MUL = SMLNUM DONE = .FALSE. CDEN = CDEN1 ELSE IF( ABS( CNUM1 ).GT.ABS( CDEN ) ) THEN * * Pre-multiply sub( X ) by BIGNUM if CDEN is small compared to * CNUM. * MUL = BIGNUM DONE = .FALSE. CNUM = CNUM1 ELSE * * Multiply sub( X ) by CNUM / CDEN and return. * MUL = CNUM / CDEN DONE = .TRUE. END IF * * Scale the vector sub( X ) by MUL * CALL PCSSCAL( N, MUL, SX, IX, JX, DESCX, INCX ) * IF( .NOT.DONE ) $ GO TO 10 * RETURN * * End of PCSRSCL * END scalapack-2.0.2/SRC/pcstein.f000644 000766 000024 00000060310 10602576752 016166 0ustar00juliestaff000000 000000 SUBROUTINE PCSTEIN( N, D, E, M, W, IBLOCK, ISPLIT, ORFAC, Z, IZ, $ JZ, DESCZ, WORK, LWORK, IWORK, LIWORK, IFAIL, $ ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER INFO, IZ, JZ, LIWORK, LWORK, M, N REAL ORFAC * .. * .. Array Arguments .. INTEGER DESCZ( * ), IBLOCK( * ), ICLUSTR( * ), $ IFAIL( * ), ISPLIT( * ), IWORK( * ) REAL D( * ), E( * ), GAP( * ), W( * ), WORK( * ) COMPLEX Z( * ) * .. * * Purpose * ======= * * PCSTEIN computes the eigenvectors of a symmetric tridiagonal matrix * in parallel, using inverse iteration. The eigenvectors found * correspond to user specified eigenvalues. PCSTEIN does not * orthogonalize vectors that are on different processes. The extent * of orthogonalization is controlled by the input parameter LWORK. * Eigenvectors that are to be orthogonalized are computed by the same * process. PCSTEIN decides on the allocation of work among the * processes and then calls SSTEIN2 (modified LAPACK routine) on each * individual process. If insufficient workspace is allocated, the * expected orthogonalization may not be done. * * Note : If the eigenvectors obtained are not orthogonal, increase * LWORK and run the code again. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * P = NPROW * NPCOL is the total number of processes * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * D (global input) REAL array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. * * E (global input) REAL array, dimension (N-1) * The (n-1) off-diagonal elements of the tridiagonal matrix T. * * M (global input) INTEGER * The total number of eigenvectors to be found. 0 <= M <= N. * * W (global input/global output) REAL array, dim (M) * On input, the first M elements of W contain all the * eigenvalues for which eigenvectors are to be computed. The * eigenvalues should be grouped by split-off block and ordered * from smallest to largest within the block (The output array * W from PSSTEBZ with ORDER='b' is expected here). This * array should be replicated on all processes. * On output, the first M elements contain the input * eigenvalues in ascending order. * * Note : To obtain orthogonal vectors, it is best if * eigenvalues are computed to highest accuracy ( this can be * done by setting ABSTOL to the underflow threshold = * SLAMCH('U') --- ABSTOL is an input parameter * to PSSTEBZ ) * * IBLOCK (global input) INTEGER array, dimension (N) * The submatrix indices associated with the corresponding * eigenvalues in W -- 1 for eigenvalues belonging to the * first submatrix from the top, 2 for those belonging to * the second submatrix, etc. (The output array IBLOCK * from PSSTEBZ is expected here). * * ISPLIT (global input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N (The output array * ISPLIT from PSSTEBZ is expected here.) * * ORFAC (global input) REAL * ORFAC specifies which eigenvectors should be orthogonalized. * Eigenvectors that correspond to eigenvalues which are within * ORFAC*||T|| of each other are to be orthogonalized. * However, if the workspace is insufficient (see LWORK), this * tolerance may be decreased until all eigenvectors to be * orthogonalized can be stored in one process. * No orthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) COMPLEX array, * dimension (DESCZ(DLEN_), N/npcol + NB) * Z contains the computed eigenvectors associated with the * specified eigenvalues. Any vector which fails to converge is * set to its current iterate after MAXITS iterations ( See * SSTEIN2 ). * On output, Z is distributed across the P processes in block * cyclic format. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace/global output) REAL array, * dimension ( LWORK ) * On output, WORK(1) gives a lower bound on the * workspace ( LWORK ) that guarantees the user desired * orthogonalization (see ORFAC). * Note that this may overestimate the minimum workspace needed. * * LWORK (local input) integer * LWORK controls the extent of orthogonalization which can be * done. The number of eigenvectors for which storage is * allocated on each process is * NVEC = floor(( LWORK- max(5*N,NP00*MQ00) )/N). * Eigenvectors corresponding to eigenvalue clusters of size * NVEC - ceil(M/P) + 1 are guaranteed to be orthogonal ( the * orthogonality is similar to that obtained from CSTEIN2). * Note : LWORK must be no smaller than: * max(5*N,NP00*MQ00) + ceil(M/P)*N, * and should have the same input value on all processes. * It is the minimum value of LWORK input on different processes * that is significant. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/global output) INTEGER array, * dimension ( 3*N+P+1 ) * On return, IWORK(1) contains the amount of integer workspace * required. * On return, the IWORK(2) through IWORK(P+2) indicate * the eigenvectors computed by each process. Process I computes * eigenvectors indexed IWORK(I+2)+1 thru' IWORK(I+3). * * LIWORK (local input) INTEGER * Size of array IWORK. Must be >= 3*N + P + 1 * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (global output) integer array, dimension (M) * On normal exit, all elements of IFAIL are zero. * If one or more eigenvectors fail to converge after MAXITS * iterations (as in CSTEIN), then INFO > 0 is returned. * If mod(INFO,M+1)>0, then * for I=1 to mod(INFO,M+1), the eigenvector * corresponding to the eigenvalue W(IFAIL(I)) failed to * converge ( W refers to the array of eigenvalues on output ). * * ICLUSTR (global output) integer array, dimension (2*P) * This output array contains indices of eigenvectors * corresponding to a cluster of eigenvalues that could not be * orthogonalized due to insufficient workspace (see LWORK, * ORFAC and INFO). Eigenvectors corresponding to clusters of * eigenvalues indexed ICLUSTR(2*I-1) to ICLUSTR(2*I), I = 1 to * INFO/(M+1), could not be orthogonalized due to lack of * workspace. Hence the eigenvectors corresponding to these * clusters may not be orthogonal. ICLUSTR is a zero terminated * array --- ( ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0 ) * if and only if K is the number of clusters. * * GAP (global output) REAL array, dimension (P) * This output array contains the gap between eigenvalues whose * eigenvectors could not be orthogonalized. The INFO/M output * values in this array correspond to the INFO/(M+1) clusters * indicated by the array ICLUSTR. As a result, the dot product * between eigenvectors corresponding to the I^th cluster may be * as high as ( O(n)*macheps ) / GAP(I). * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * < 0 : if INFO = -I, the I-th argument had an illegal value * > 0 : if mod(INFO,M+1) = I, then I eigenvectors failed to * converge in MAXITS iterations. Their indices are * stored in the array IFAIL. * if INFO/(M+1) = I, then eigenvectors corresponding to * I clusters of eigenvalues could not be orthogonalized * due to insufficient workspace. The indices of the * clusters are stored in the array ICLUSTR. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMN2D, IGEBR2D, $ IGEBS2D, PCHK1MAT, PCLAEVSWP, PXERBLA, SGEBR2D, $ SGEBS2D, SLASRT2, SSTEIN2 * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, NEGONE, ODM1, FIVE, ODM3, ODM18 PARAMETER ( ZERO = 0.0E+0, NEGONE = -1.0E+0, $ ODM1 = 1.0E-1, FIVE = 5.0E+0, ODM3 = 1.0E-3, $ ODM18 = 1.0E-18 ) * .. * .. Local Scalars .. LOGICAL LQUERY, SORTED INTEGER B1, BN, BNDRY, CLSIZ, COL, I, IFIRST, IINFO, $ ILAST, IM, INDRW, ITMP, J, K, LGCLSIZ, LLWORK, $ LOAD, LOCINFO, MAXVEC, MQ00, MYCOL, MYROW, $ NBLK, NERR, NEXT, NP00, NPCOL, NPROW, NVS, $ OLNBLK, P, ROW, SELF, TILL, TOTERR REAL DIFF, MINGAP, ONENRM, ORGFAC, ORTOL, TMPFAC * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) SELF = MYROW*NPCOL + MYCOL * * Make sure that we belong to this context (before calling PCHK1MAT) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1200+CTXT_ ) ELSE * * Make sure that NPROW>0 and NPCOL>0 before calling NUMROC * CALL CHK1MAT( N, 1, N, 1, IZ, JZ, DESCZ, 12, INFO ) IF( INFO.EQ.0 ) THEN * * Now we know that our context is good enough to * perform the rest of the checks * NP00 = NUMROC( N, DESCZ( MB_ ), 0, 0, NPROW ) MQ00 = NUMROC( M, DESCZ( NB_ ), 0, 0, NPCOL ) P = NPROW*NPCOL * * Compute the maximum number of vectors per process * LLWORK = LWORK CALL IGAMN2D( DESCZ( CTXT_ ), 'A', ' ', 1, 1, LLWORK, 1, 1, $ 1, -1, -1, -1 ) INDRW = MAX( 5*N, NP00*MQ00 ) IF( N.NE.0 ) $ MAXVEC = ( LLWORK-INDRW ) / N LOAD = ICEIL( M, P ) IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN TMPFAC = ORFAC CALL SGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, TMPFAC, $ 1 ) ELSE CALL SGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, TMPFAC, $ 1, 0, 0 ) END IF * LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( M.LT.0 .OR. M.GT.N ) THEN INFO = -4 ELSE IF( MAXVEC.LT.LOAD .AND. .NOT.LQUERY ) THEN INFO = -14 ELSE IF( LIWORK.LT.3*N+P+1 .AND. .NOT.LQUERY ) THEN INFO = -16 ELSE DO 10 I = 2, M IF( IBLOCK( I ).LT.IBLOCK( I-1 ) ) THEN INFO = -6 GO TO 20 END IF IF( IBLOCK( I ).EQ.IBLOCK( I-1 ) .AND. W( I ).LT. $ W( I-1 ) ) THEN INFO = -5 GO TO 20 END IF 10 CONTINUE 20 CONTINUE IF( INFO.EQ.0 ) THEN IF( ABS( TMPFAC-ORFAC ).GT.FIVE*ABS( TMPFAC ) ) $ INFO = -8 END IF END IF * END IF IDUM1( 1 ) = M IDUM2( 1 ) = 4 CALL PCHK1MAT( N, 1, N, 1, IZ, JZ, DESCZ, 12, 1, IDUM1, IDUM2, $ INFO ) WORK( 1 ) = REAL( MAX( 5*N, NP00*MQ00 )+ICEIL( M, P )*N ) IWORK( 1 ) = 3*N + P + 1 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCZ( CTXT_ ), 'PCSTEIN', -INFO ) RETURN ELSE IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) THEN RETURN END IF * DO 30 I = 1, M IFAIL( I ) = 0 30 CONTINUE DO 40 I = 1, P + 1 IWORK( I ) = 0 40 CONTINUE DO 50 I = 1, P GAP( I ) = NEGONE ICLUSTR( 2*I-1 ) = 0 ICLUSTR( 2*I ) = 0 50 CONTINUE * * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) $ RETURN * IF( ORFAC.GE.ZERO ) THEN TMPFAC = ORFAC ELSE TMPFAC = ODM3 END IF ORGFAC = TMPFAC * * Allocate the work among the processes * ILAST = M / LOAD IF( MOD( M, LOAD ).EQ.0 ) $ ILAST = ILAST - 1 OLNBLK = -1 NVS = 0 NEXT = 1 IM = 0 ONENRM = ZERO DO 100 I = 0, ILAST - 1 NEXT = NEXT + LOAD J = NEXT - 1 IF( J.GT.NVS ) THEN NBLK = IBLOCK( NEXT ) IF( NBLK.EQ.IBLOCK( NEXT-1 ) .AND. NBLK.NE.OLNBLK ) THEN * * Compute orthogonalization criterion * IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 60 J = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( J ) )+ABS( E( J-1 ) )+ $ ABS( E( J ) ) ) 60 CONTINUE OLNBLK = NBLK END IF TILL = NVS + MAXVEC 70 CONTINUE J = NEXT - 1 IF( TMPFAC.GT.ODM18 ) THEN ORTOL = TMPFAC*ONENRM DO 80 J = NEXT - 1, MIN( TILL, M-1 ) IF( IBLOCK( J+1 ).NE.IBLOCK( J ) .OR. W( J+1 )- $ W( J ).GE.ORTOL ) THEN GO TO 90 END IF 80 CONTINUE IF( J.EQ.M .AND. TILL.GE.M ) $ GO TO 90 TMPFAC = TMPFAC*ODM1 GO TO 70 END IF 90 CONTINUE J = MIN( J, TILL ) END IF IF( SELF.EQ.I ) $ IM = MAX( 0, J-NVS ) * IWORK( I+1 ) = NVS NVS = MAX( J, NVS ) 100 CONTINUE IF( SELF.EQ.ILAST ) $ IM = M - NVS IWORK( ILAST+1 ) = NVS DO 110 I = ILAST + 2, P + 1 IWORK( I ) = M 110 CONTINUE * CLSIZ = 1 LGCLSIZ = 1 ILAST = 0 NBLK = 0 BNDRY = 2 K = 1 DO 140 I = 1, M IF( IBLOCK( I ).NE.NBLK ) THEN NBLK = IBLOCK( I ) IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 120 J = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( J ) )+ABS( E( J-1 ) )+ $ ABS( E( J ) ) ) 120 CONTINUE * END IF IF( I.GT.1 ) THEN DIFF = W( I ) - W( I-1 ) IF( IBLOCK( I ).NE.IBLOCK( I-1 ) .OR. I.EQ.M .OR. DIFF.GT. $ ORGFAC*ONENRM ) THEN IFIRST = ILAST IF( I.EQ.M ) THEN IF( IBLOCK( M ).NE.IBLOCK( M-1 ) .OR. DIFF.GT.ORGFAC* $ ONENRM ) THEN ILAST = M - 1 ELSE ILAST = M END IF ELSE ILAST = I - 1 END IF CLSIZ = ILAST - IFIRST IF( CLSIZ.GT.1 ) THEN IF( LGCLSIZ.LT.CLSIZ ) $ LGCLSIZ = CLSIZ MINGAP = ONENRM 130 CONTINUE IF( BNDRY.GT.P+1 ) $ GO TO 150 IF( IWORK( BNDRY ).GT.IFIRST .AND. IWORK( BNDRY ).LT. $ ILAST ) THEN MINGAP = MIN( W( IWORK( BNDRY )+1 )- $ W( IWORK( BNDRY ) ), MINGAP ) ELSE IF( IWORK( BNDRY ).GE.ILAST ) THEN IF( MINGAP.LT.ONENRM ) THEN ICLUSTR( 2*K-1 ) = IFIRST + 1 ICLUSTR( 2*K ) = ILAST GAP( K ) = MINGAP / ONENRM K = K + 1 END IF GO TO 140 END IF BNDRY = BNDRY + 1 GO TO 130 END IF END IF END IF 140 CONTINUE 150 CONTINUE INFO = ( K-1 )*( M+1 ) * * Call SSTEIN2 to find the eigenvectors * CALL SSTEIN2( N, D, E, IM, W( IWORK( SELF+1 )+1 ), $ IBLOCK( IWORK( SELF+1 )+1 ), ISPLIT, ORGFAC, $ WORK( INDRW+1 ), N, WORK, IWORK( P+2 ), $ IFAIL( IWORK( SELF+1 )+1 ), LOCINFO ) * * Redistribute the eigenvector matrix to conform with the block * cyclic distribution of the input matrix * * DO 160 I = 1, M IWORK( P+1+I ) = I 160 CONTINUE * CALL SLASRT2( 'I', M, W, IWORK( P+2 ), IINFO ) * DO 170 I = 1, M IWORK( M+P+1+IWORK( P+1+I ) ) = I 170 CONTINUE * * DO 180 I = 1, LOCINFO ITMP = IWORK( SELF+1 ) + I IFAIL( ITMP ) = IFAIL( ITMP ) + ITMP - I IFAIL( ITMP ) = IWORK( M+P+1+IFAIL( ITMP ) ) 180 CONTINUE * DO 190 I = 1, K - 1 ICLUSTR( 2*I-1 ) = IWORK( M+P+1+ICLUSTR( 2*I-1 ) ) ICLUSTR( 2*I ) = IWORK( M+P+1+ICLUSTR( 2*I ) ) 190 CONTINUE * * * Still need to apply the above permutation to IFAIL * * TOTERR = 0 DO 210 I = 1, P IF( SELF.EQ.I-1 ) THEN CALL IGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, LOCINFO, 1 ) IF( LOCINFO.NE.0 ) THEN CALL IGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', LOCINFO, 1, $ IFAIL( IWORK( I )+1 ), LOCINFO ) DO 200 J = 1, LOCINFO IFAIL( TOTERR+J ) = IFAIL( IWORK( I )+J ) 200 CONTINUE TOTERR = TOTERR + LOCINFO END IF ELSE * ROW = ( I-1 ) / NPCOL COL = MOD( I-1, NPCOL ) * CALL IGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, NERR, 1, $ ROW, COL ) IF( NERR.NE.0 ) THEN CALL IGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', NERR, 1, $ IFAIL( TOTERR+1 ), NERR, ROW, COL ) TOTERR = TOTERR + NERR END IF END IF 210 CONTINUE INFO = INFO + TOTERR * * CALL PCLAEVSWP( N, WORK( INDRW+1 ), N, Z, IZ, JZ, DESCZ, IWORK, $ IWORK( M+P+2 ), WORK, INDRW ) * DO 220 I = 2, P IWORK( I ) = IWORK( M+P+1+IWORK( I ) ) 220 CONTINUE * * * Sort the IWORK array * * 230 CONTINUE SORTED = .TRUE. DO 240 I = 2, P - 1 IF( IWORK( I ).GT.IWORK( I+1 ) ) THEN ITMP = IWORK( I+1 ) IWORK( I+1 ) = IWORK( I ) IWORK( I ) = ITMP SORTED = .FALSE. END IF 240 CONTINUE IF( .NOT.SORTED ) $ GO TO 230 * DO 250 I = P + 1, 1, -1 IWORK( I+1 ) = IWORK( I ) 250 CONTINUE * WORK( 1 ) = ( LGCLSIZ+LOAD-1 )*N + INDRW IWORK( 1 ) = 3*N + P + 1 * * End of PCSTEIN * END scalapack-2.0.2/SRC/pctrcon.f000644 000766 000024 00000040410 10363532303 016154 0ustar00juliestaff000000 000000 SUBROUTINE PCTRCON( NORM, UPLO, DIAG, N, A, IA, JA, DESCA, RCOND, $ WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER IA, JA, INFO, LRWORK, LWORK, N REAL RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL RWORK( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PCTRCON estimates the reciprocal of the condition number of a * triangular distributed matrix A(IA:IA+N-1,JA:JA+N-1), in either the * 1-norm or the infinity-norm. * * The norm of A(IA:IA+N-1,JA:JA+N-1) is computed and an estimate is * obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), then the reciprocal * of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies whether the 1-norm condition number or the * infinity-norm condition number is required: * = '1' or 'O': 1-norm; * = 'I': Infinity-norm. * * UPLO (global input) CHARACTER * = 'U': A(IA:IA+N-1,JA:JA+N-1) is upper triangular; * = 'L': A(IA:IA+N-1,JA:JA+N-1) is lower triangular. * * DIAG (global input) CHARACTER * = 'N': A(IA:IA+N-1,JA:JA+N-1) is non-unit triangular; * = 'U': A(IA:IA+N-1,JA:JA+N-1) is unit triangular. * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the triangular distributed * matrix A(IA:IA+N-1,JA:JA+N-1). If UPLO = 'U', the leading * N-by-N upper triangular part of this distributed matrix con- * tains the upper triangular matrix, and its strictly lower * triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of this ditributed * matrix contains the lower triangular matrix, and the strictly * upper triangular part is not referenced. If DIAG = 'U', the * diagonal elements of A(IA:IA+N-1,JA:JA+N-1) are also not * referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * RCOND (global output) REAL * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + * MAX( 2, MAX(NB_A*CEIL(P-1,Q),LOCc(N+MOD(JA-1,NB_A)) + * NB_A*CEIL(Q-1,P)) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) REAL array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCc(N+MOD(JA-1,NB_A)). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOUNIT, ONENRM, UPPER CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPN, IPV, IPW, $ IPX, IROFF, IV, IX, IXX, JJA, JV, JX, KASE, $ KASE1, LRWMIN, LWMIN, MYCOL, MYROW, NP, NPCOL, $ NPMOD, NPROW, NQMOD REAL AINVNM, ANORM, SCALE, SMLNUM COMPLEX WMAX, ZDUM * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 5 ), $ IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, CHK1MAT, $ DESCSET, INFOG2L, PCAMAX, PCHK1MAT, PCLATRS, $ PCLACON, PCSRSCL, PB_TOPGET, PB_TOPSET, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PCLANTR, PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PCLANTR, $ PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, ICHAR, MAX, MOD, REAL * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 800 + CTXT_ ) ELSE CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) ONENRM = NORM.EQ.'1' .OR. LSAME( NORM, 'O' ) NOUNIT = LSAME( DIAG, 'N' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = REAL( LWMIN ) LRWMIN = NQMOD RWORK( 1 ) = REAL( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.ONENRM .AND. .NOT.LSAME( NORM, 'I' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF * IF( ONENRM ) THEN IDUM1( 1 ) = ICHAR( '1' ) ELSE IDUM1( 1 ) = ICHAR( 'I' ) END IF IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'U' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 11 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 13 CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 5, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCTRCON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * RCOND = ZERO SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' )*REAL( MAX( 1, N ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPW = IPV + NP IPN = 1 * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Compute the norm of the triangular matrix A. * ANORM = PCLANTR( NORM, UPLO, DIAG, N, N, A, IA, JA, DESCA, RWORK ) * * Continue only if ANORM > 0. * IF( ANORM.GT.ZERO ) THEN * * Estimate the norm of the inverse of A. * AINVNM = ZERO NORMIN = 'N' IF( ONENRM ) THEN KASE1 = 1 ELSE KASE1 = 2 END IF KASE = 0 10 CONTINUE CALL PCLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), $ IX, JX, DESCX, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( KASE.EQ.KASE1 ) THEN * * Multiply by inv(A). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( UPLO, 'No transpose', DIAG, NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SCALE, RWORK( IPN ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(A'). * DESCX( CSRC_ ) = IACOL CALL PCLATRS( UPLO, 'Conjugate transpose', DIAG, NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SCALE, RWORK( IPN ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF NORMIN = 'Y' * * Multiply by 1/SCALE if doing so will not cause overflow. * IF( SCALE.NE.ONE ) THEN CALL PCAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, $ DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL CGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, $ 1 ) ELSE CALL CGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, $ 1, IAROW, MYCOL ) END IF END IF IF( SCALE.LT.CABS1( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PCSRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / ANORM ) / AINVNM END IF * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PCTRCON * END scalapack-2.0.2/SRC/pctrevc.f000644 000766 000024 00000052612 10602576752 016175 0ustar00juliestaff000000 000000 SUBROUTINE PCTREVC( SIDE, HOWMNY, SELECT, N, T, DESCT, VL, DESCVL, $ VR, DESCVR, MM, M, WORK, RWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. CHARACTER HOWMNY, SIDE INTEGER INFO, M, MM, N * .. * .. Array Arguments .. LOGICAL SELECT( * ) INTEGER DESCT( * ), DESCVL( * ), DESCVR( * ) REAL RWORK( * ) COMPLEX T( * ), VL( * ), VR( * ), WORK( * ) * .. * * Purpose * ======= * * PCTREVC computes some or all of the right and/or left eigenvectors of * a complex upper triangular matrix T in parallel. * * The right eigenvector x and the left eigenvector y of T corresponding * to an eigenvalue w are defined by: * * T*x = w*x, y'*T = w*y' * * where y' denotes the conjugate transpose of the vector y. * * If all eigenvectors are requested, the routine may either return the * matrices X and/or Y of right or left eigenvectors of T, or the * products Q*X and/or Q*Y, where Q is an input unitary * matrix. If T was obtained from the Schur factorization of an * original matrix A = Q*T*Q', then Q*X and Q*Y are the matrices of * right or left eigenvectors of A. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * = 'R': compute right eigenvectors only; * = 'L': compute left eigenvectors only; * = 'B': compute both right and left eigenvectors. * * HOWMNY (global input) CHARACTER*1 * = 'A': compute all right and/or left eigenvectors; * = 'B': compute all right and/or left eigenvectors, * and backtransform them using the input matrices * supplied in VR and/or VL; * = 'S': compute selected right and/or left eigenvectors, * specified by the logical array SELECT. * * SELECT (global input) LOGICAL array, dimension (N) * If HOWMNY = 'S', SELECT specifies the eigenvectors to be * computed. * If HOWMNY = 'A' or 'B', SELECT is not referenced. * To select the eigenvector corresponding to the j-th * eigenvalue, SELECT(j) must be set to .TRUE.. * * N (global input) INTEGER * The order of the matrix T. N >= 0. * * T (global input/output) COMPLEX array, dimension * (DESCT(LLD_),*) * The upper triangular matrix T. T is modified, but restored * on exit. * * DESCT (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix T. * * VL (global input/output) COMPLEX array, dimension * (DESCVL(LLD_),MM) * On entry, if SIDE = 'L' or 'B' and HOWMNY = 'B', VL must * contain an N-by-N matrix Q (usually the unitary matrix Q of * Schur vectors returned by CHSEQR). * On exit, if SIDE = 'L' or 'B', VL contains: * if HOWMNY = 'A', the matrix Y of left eigenvectors of T; * if HOWMNY = 'B', the matrix Q*Y; * if HOWMNY = 'S', the left eigenvectors of T specified by * SELECT, stored consecutively in the columns * of VL, in the same order as their * eigenvalues. * If SIDE = 'R', VL is not referenced. * * DESCVL (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix VL. * * VR (global input/output) COMPLEX array, dimension * (DESCVR(LLD_),MM) * On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must * contain an N-by-N matrix Q (usually the unitary matrix Q of * Schur vectors returned by CHSEQR). * On exit, if SIDE = 'R' or 'B', VR contains: * if HOWMNY = 'A', the matrix X of right eigenvectors of T; * if HOWMNY = 'B', the matrix Q*X; * if HOWMNY = 'S', the right eigenvectors of T specified by * SELECT, stored consecutively in the columns * of VR, in the same order as their * eigenvalues. * If SIDE = 'L', VR is not referenced. * * DESCVR (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix VR. * * MM (global input) INTEGER * The number of columns in the arrays VL and/or VR. MM >= M. * * M (global output) INTEGER * The number of columns in the arrays VL and/or VR actually * used to store the eigenvectors. If HOWMNY = 'A' or 'B', M * is set to N. Each selected eigenvector occupies one * column. * * WORK (local workspace) COMPLEX array, * dimension ( 2*DESCT(LLD_) ) * Additional workspace may be required if PCLATTRS is updated * to use WORK. * * RWORK (local workspace) REAL array, * dimension ( DESCT(LLD_) ) * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Further Details * =============== * * The algorithm used in this program is basically backward (forward) * substitution. It is the hope that scaling would be used to make the * the code robust against possible overflow. But scaling has not yet * been implemented in PCLATTRS which is called by this routine to solve * the triangular systems. PCLATTRS just calls PCTRSV. * * Each eigenvector is normalized so that the element of largest * magnitude has magnitude 1; here the magnitude of a complex number * (x,y) is taken to be |x| + |y|. * * Further Details * =============== * * Implemented by Mark R. Fahey, June, 2000 * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL ALLV, BOTHV, LEFTV, OVER, RIGHTV, SOMEV INTEGER CONTXT, CSRC, I, ICOL, II, IROW, IS, ITMP1, $ ITMP2, J, K, KI, LDT, LDVL, LDVR, LDW, MB, $ MYCOL, MYROW, NB, NPCOL, NPROW, RSRC REAL SELF REAL OVFL, REMAXD, SCALE, SMIN, SMLNUM, ULP, UNFL COMPLEX CDUM, REMAXC, SHIFT * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, SGSUM2D, IGAMN2D, $ INFOG2L, PSLABAD, PSCASUM, PXERBLA, PCAMAX, $ PCCOPY, PCSSCAL, PCGEMV, PCLASET, PCLATTRS, $ CGSUM2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, CMPLX, CONJG, AIMAG, MAX * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. * * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CONTXT = DESCT( CTXT_ ) RSRC = DESCT( RSRC_ ) CSRC = DESCT( CSRC_ ) MB = DESCT( MB_ ) NB = DESCT( NB_ ) LDT = DESCT( LLD_ ) LDW = LDT LDVR = DESCVR( LLD_ ) LDVL = DESCVL( LLD_ ) * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) SELF = MYROW*NPCOL + MYCOL * * Decode and test the input parameters * BOTHV = LSAME( SIDE, 'B' ) RIGHTV = LSAME( SIDE, 'R' ) .OR. BOTHV LEFTV = LSAME( SIDE, 'L' ) .OR. BOTHV * ALLV = LSAME( HOWMNY, 'A' ) OVER = LSAME( HOWMNY, 'B' ) .OR. LSAME( HOWMNY, 'O' ) SOMEV = LSAME( HOWMNY, 'S' ) * * Set M to the number of columns required to store the selected * eigenvectors. * IF( SOMEV ) THEN M = 0 DO 10 J = 1, N IF( SELECT( J ) ) $ M = M + 1 10 CONTINUE ELSE M = N END IF * INFO = 0 IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 ELSE IF( .NOT.ALLV .AND. .NOT.OVER .AND. .NOT.SOMEV ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -4 ELSE IF( MM.LT.M ) THEN INFO = -11 END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PCTREVC', -INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * Set the constants to control overflow. * UNFL = PSLAMCH( CONTXT, 'Safe minimum' ) OVFL = ONE / UNFL CALL PSLABAD( CONTXT, UNFL, OVFL ) ULP = PSLAMCH( CONTXT, 'Precision' ) SMLNUM = UNFL*( N / ULP ) * * Store the diagonal elements of T in working array WORK( LDW+1 ). * DO 20 I = 1, N CALL INFOG2L( I, I, DESCT, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN WORK( LDW+IROW ) = T( ( ICOL-1 )*LDT+IROW ) END IF 20 CONTINUE * * Compute 1-norm of each column of strictly upper triangular * part of T to control overflow in triangular solver. Computed, * but not used. For use in PCLATTRS. * RWORK( 1 ) = ZERO DO 30 J = 2, N CALL PSCASUM( J-1, RWORK( J ), T, 1, J, DESCT, 1 ) 30 CONTINUE * I replicate the norms in RWORK. Should they be distributed * over the process rows? CALL SGSUM2D( CONTXT, 'Row', ' ', N, 1, RWORK, N, -1, -1 ) * IF( RIGHTV ) THEN * * Compute right eigenvectors. * * Need to set the distribution pattern of WORK * CALL DESCINIT( DESCW, N, 1, NB, 1, RSRC, CSRC, CONTXT, LDW, $ INFO ) * IS = M DO 70 KI = N, 1, -1 * IF( SOMEV ) THEN IF( .NOT.SELECT( KI ) ) $ GO TO 70 END IF * SMIN = ZERO SHIFT = CZERO CALL INFOG2L( KI, KI, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN SHIFT = T( ( ICOL-1 )*LDT+IROW ) SMIN = MAX( ULP*( CABS1( SHIFT ) ), SMLNUM ) END IF CALL SGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SMIN, 1, -1, -1 ) CALL CGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SHIFT, 1, -1, -1 ) * CALL INFOG2L( 1, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN WORK( 1 ) = CONE END IF * * Form right-hand side. Distribute rhs onto first column * of processor grid. * IF( KI.GT.1 ) THEN CALL PCCOPY( KI-1, T, 1, KI, DESCT, 1, WORK, 1, 1, DESCW, $ 1 ) END IF DO 40 K = 1, KI - 1 CALL INFOG2L( K, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = -WORK( IROW ) END IF 40 CONTINUE * * Solve the triangular system: * (T(1:KI-1,1:KI-1) - T(KI,KI))*X = SCALE*WORK. * DO 50 K = 1, KI - 1 CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( ( ICOL-1 )*LDT+IROW ) = T( ( ICOL-1 )*LDT+IROW ) - $ SHIFT IF( CABS1( T( ( ICOL-1 )*LDT+IROW ) ).LT.SMIN ) THEN T( ( ICOL-1 )*LDT+IROW ) = CMPLX( SMIN ) END IF END IF 50 CONTINUE * IF( KI.GT.1 ) THEN CALL PCLATTRS( 'Upper', 'No transpose', 'Non-unit', 'Y', $ KI-1, T, 1, 1, DESCT, WORK, 1, 1, DESCW, $ SCALE, RWORK, INFO ) CALL INFOG2L( KI, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = CMPLX( SCALE ) END IF END IF * * Copy the vector x or Q*x to VR and normalize. * IF( .NOT.OVER ) THEN CALL PCCOPY( KI, WORK, 1, 1, DESCW, 1, VR, 1, IS, DESCVR, $ 1 ) * CALL PCAMAX( KI, REMAXC, II, VR, 1, IS, DESCVR, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PCSSCAL( KI, REMAXD, VR, 1, IS, DESCVR, 1 ) * CALL PCLASET( ' ', N-KI, 1, CZERO, CZERO, VR, KI+1, IS, $ DESCVR ) ELSE IF( KI.GT.1 ) $ CALL PCGEMV( 'N', N, KI-1, CONE, VR, 1, 1, DESCVR, $ WORK, 1, 1, DESCW, 1, CMPLX( SCALE ), $ VR, 1, KI, DESCVR, 1 ) * CALL PCAMAX( N, REMAXC, II, VR, 1, KI, DESCVR, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PCSSCAL( N, REMAXD, VR, 1, KI, DESCVR, 1 ) END IF * * Set back the original diagonal elements of T. * DO 60 K = 1, KI - 1 CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( ( ICOL-1 )*LDT+IROW ) = WORK( LDW+IROW ) END IF 60 CONTINUE * IS = IS - 1 70 CONTINUE END IF * IF( LEFTV ) THEN * * Compute left eigenvectors. * * Need to set the distribution pattern of WORK * CALL DESCINIT( DESCW, N, 1, MB, 1, RSRC, CSRC, CONTXT, LDW, $ INFO ) * IS = 1 DO 110 KI = 1, N * IF( SOMEV ) THEN IF( .NOT.SELECT( KI ) ) $ GO TO 110 END IF * SMIN = ZERO SHIFT = CZERO CALL INFOG2L( KI, KI, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN SHIFT = T( ( ICOL-1 )*LDT+IROW ) SMIN = MAX( ULP*( CABS1( SHIFT ) ), SMLNUM ) END IF CALL SGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SMIN, 1, -1, -1 ) CALL CGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SHIFT, 1, -1, -1 ) * CALL INFOG2L( N, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN WORK( IROW ) = CONE END IF * * Form right-hand side. * IF( KI.LT.N ) THEN CALL PCCOPY( N-KI, T, KI, KI+1, DESCT, N, WORK, KI+1, 1, $ DESCW, 1 ) END IF DO 80 K = KI + 1, N CALL INFOG2L( K, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = -CONJG( WORK( IROW ) ) END IF 80 CONTINUE * * Solve the triangular system: * (T(KI+1:N,KI+1:N) - T(KI,KI))'*X = SCALE*WORK. * DO 90 K = KI + 1, N CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( ( ICOL-1 )*LDT+IROW ) = T( ( ICOL-1 )*LDT+IROW ) - $ SHIFT IF( CABS1( T( ( ICOL-1 )*LDT+IROW ) ).LT.SMIN ) $ T( ( ICOL-1 )*LDT+IROW ) = CMPLX( SMIN ) END IF 90 CONTINUE * IF( KI.LT.N ) THEN CALL PCLATTRS( 'Upper', 'Conjugate transpose', 'Nonunit', $ 'Y', N-KI, T, KI+1, KI+1, DESCT, WORK, $ KI+1, 1, DESCW, SCALE, RWORK, INFO ) CALL INFOG2L( KI, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = CMPLX( SCALE ) END IF END IF * * Copy the vector x or Q*x to VL and normalize. * IF( .NOT.OVER ) THEN CALL PCCOPY( N-KI+1, WORK, KI, 1, DESCW, 1, VL, KI, IS, $ DESCVL, 1 ) * CALL PCAMAX( N-KI+1, REMAXC, II, VL, KI, IS, DESCVL, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PCSSCAL( N-KI+1, REMAXD, VL, KI, IS, DESCVL, 1 ) * CALL PCLASET( ' ', KI-1, 1, CZERO, CZERO, VL, 1, IS, $ DESCVL ) ELSE IF( KI.LT.N ) $ CALL PCGEMV( 'N', N, N-KI, CONE, VL, 1, KI+1, DESCVL, $ WORK, KI+1, 1, DESCW, 1, CMPLX( SCALE ), $ VL, 1, KI, DESCVL, 1 ) * CALL PCAMAX( N, REMAXC, II, VL, 1, KI, DESCVL, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PCSSCAL( N, REMAXD, VL, 1, KI, DESCVL, 1 ) END IF * * Set back the original diagonal elements of T. * DO 100 K = KI + 1, N CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( ( ICOL-1 )*LDT+IROW ) = WORK( LDW+IROW ) END IF 100 CONTINUE * IS = IS + 1 110 CONTINUE END IF * RETURN * * End of PCTREVC * END scalapack-2.0.2/SRC/pctrrfs.f000644 000766 000024 00000074401 10363532303 016176 0ustar00juliestaff000000 000000 SUBROUTINE PCTRRFS( UPLO, TRANS, DIAG, N, NRHS, A, IA, JA, DESCA, $ B, IB, JB, DESCB, X, IX, JX, DESCX, FERR, $ BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, TRANS, UPLO INTEGER INFO, IA, IB, IX, JA, JB, JX, LRWORK, LWORK, $ N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) REAL BERR( * ), FERR( * ), RWORK( * ) COMPLEX A( * ), B( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PCTRRFS provides error bounds and backward error estimates for the * solution to a system of linear equations with a triangular * coefficient matrix. * * The solution matrix X must be computed by PCTRTRS or some other * means before entering this routine. PCTRRFS does not do iterative * refinement because doing so cannot improve the backward error. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations. * = 'N': sub( A ) * sub( X ) = sub( B ) (No transpose) * = 'T': sub( A )**T * sub( X ) = sub( B ) (Transpose) * = 'C': sub( A )**H * sub( X ) = sub( B ) * (Conjugate transpose) * * DIAG (global input) CHARACTER*1 * = 'N': sub( A ) is non-unit triangular; * = 'U': sub( A ) is unit triangular. * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of local dimension (LLD_A,LOCc(JA+N-1) ). This * array contains the local pieces of the original triangular * distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. * If DIAG = 'U', the diagonal elements of sub( A ) are also * not referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) COMPLEX pointer into the local memory * to an array of local dimension (LLD_B, LOCc(JB+NRHS-1) ). * On entry, this array contains the the local pieces of the * right hand sides sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input) COMPLEX pointer into the local memory * to an array of local dimension (LLD_X, LOCc(JX+NRHS-1) ). * On entry, this array contains the the local pieces of the * solution vectors sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). The estimated forward error bounds for * each solution vector of sub( X ). If XTRUE is the true * solution, FERR bounds the magnitude of the largest entry * in (sub( X ) - XTRUE) divided by the magnitude of the * largest entry in sub( X ). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr( N + MOD( IA-1, MB_A ) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) REAL array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCr( N + MOD( IB-1, MB_B ) ). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices sub( X ) and sub( B ) should be * distributed the same way on the same processes. These conditions * ensure that sub( X ) and sub( B ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, RONE PARAMETER ( ZERO = 0.0E+0, RONE = 1.0E+0 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER CHARACTER TRANSN, TRANST INTEGER IAROW, IXBCOL, IXBROW, IXCOL, IXROW, ICOFFA, $ ICOFFB, ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, $ IIW, IOFFXB, IPB, IPR, IPV, IROFFA, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LRWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ REAL EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN COMPLEX ZDUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, CHK1MAT, $ DESCSET, INFOG2L, PCATRMV, PCAXPY, PCHK1MAT, $ PCHK2MAT, PCCOPY, PCLACON, PCTRMV, $ PCTRSV, PXERBLA, SGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Statement Functions .. REAL CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 900+CTXT_ ) ELSE CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IX, JX, DESCX, 17, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) NOUNIT = LSAME( DIAG, 'N' ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 2*NPMOD WORK( 1 ) = REAL( LWMIN ) LRWMIN = NPMOD RWORK( 1 ) = REAL( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 ELSE IF( NRHS.LT.0 ) THEN INFO = -5 ELSE IF( IROFFA.NE.0 ) THEN INFO = -7 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -8 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 900+NB_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -11 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1300+MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -15 ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -16 ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 1700+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 1700+NB_ ) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 1700+CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -21 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 2 ) = ICHAR( 'T' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'U' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 21 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 23 CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, 0, IDUM1, IDUM2, $ INFO ) CALL PCHK2MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, N, 4, NRHS, 5, $ IX, JX, DESCX, 17, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCTRRFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * IF( NOTRAN ) THEN TRANSN = 'N' TRANST = 'C' ELSE TRANSN = 'C' TRANST = 'N' END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = 1 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = maximum number of nonzero entries in each row of A, plus 1 * NZ = N + 1 EPS = PSLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) )*DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 90 K = 0, JBRHS - 1 * * Compute residual R = B - op(A) * X, * where op(A) = A or A', depending on TRANS. * CALL PCCOPY( N, X, IX, JX+K, DESCX, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PCTRMV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) CALL PCAXPY( N, -ONE, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, $ JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the matrix * or vector Z. If the i-th component of the denominator is less * than SAFE2, then SAFE1 is added to the i-th components of the * numerator and denominator before dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 20 II = IIXB, IIXB + NP - 1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 20 CONTINUE END IF END IF * CALL PCATRMV( UPLO, TRANS, DIAG, N, RONE, A, IA, JA, DESCA, X, $ IX, JX+K, DESCX, 1, RONE, RWORK( IPB ), IW, JW, $ DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIW - 1, IIW + NP - 2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 30 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Bound error from formula * * norm(X - XTRUE) / norm(X) .le. FERR = * norm( abs(inv(op(A)))* * ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(A)) is the inverse of op(A) * abs(Z) is the componentwise absolute value of the matrix or * vector Z * NZ is the maximum number of nonzeros in any row of A, plus 1 * EPS is machine epsilon * * The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) * is incremented by SAFE1 if the i-th component of * abs(op(A))*abs(X) + abs(B) is less than SAFE2. * * Use PCLACON to estimate the infinity-norm of the matrix * inv(op(A)) * diag(W), * where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW - 1, IIW + NP - 2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 40 CONTINUE END IF END IF * KASE = 0 50 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL CGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PCLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(A)'). * CALL PCTRSV( UPLO, TRANST, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 60 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 60 CONTINUE END IF END IF ELSE * * Multiply by inv(op(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF CALL PCTRSV( UPLO, TRANSN, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) END IF GO TO 50 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIXB, IIXB + NP - 1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 80 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 90 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 180 J = JN + 1, JB + NRHS - 1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 170 K = 0, JBRHS - 1 * * Compute residual R = B - op(A) * X, * where op(A) = A or A', depending on TRANS. * CALL PCCOPY( N, X, IX, J+K, DESCX, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PCTRMV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) CALL PCAXPY( N, -ONE, B, IB, J+K, DESCB, 1, WORK( IPR ), $ IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 100 II = IIXB, IIXB + NP - 1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 100 CONTINUE END IF END IF * CALL PCATRMV( UPLO, TRANS, DIAG, N, RONE, A, IA, JA, DESCA, $ X, IX, J+K, DESCX, 1, RONE, RWORK( IPB ), IW, $ JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 110 II = IIW - 1, IIW + NP - 2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 110 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Bound error from formula * * norm(X - XTRUE) / norm(X) .le. FERR = * norm( abs(inv(op(A)))* * ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) )))/norm(X) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(A)) is the inverse of op(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of A, * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) * is incremented by SAFE1 if the i-th component of * abs(op(A))*abs(X) + abs(B) is less than SAFE2. * * Use PCLACON to estimate the infinity-norm of the matrix * inv(op(A)) * diag(W), * where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIW - 1, IIW + NP - 2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 120 CONTINUE END IF END IF * KASE = 0 130 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL CGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PCLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(A)'). * CALL PCTRSV( UPLO, TRANST, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 140 CONTINUE END IF END IF ELSE * * Multiply by inv(op(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 150 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 150 CONTINUE END IF END IF CALL PCTRSV( UPLO, TRANSN, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) END IF GO TO 130 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIXB, IIXB + NP - 1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 160 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, $ IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 170 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) RWORK( 1 ) = REAL( LRWMIN ) * RETURN * * End of PCTRRFS * END scalapack-2.0.2/SRC/pctrti2.f000644 000766 000024 00000023656 10363532303 016110 0ustar00juliestaff000000 000000 SUBROUTINE PCTRTI2( UPLO, DIAG, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCTRTI2 computes the inverse of a complex upper or lower triangular * block matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). This matrix should be * contained in one and only one process memory space (local operation). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * DIAG (global input) CHARACTER*1 * = 'N': sub( A ) is non-unit triangular * = 'U': sub( A ) is unit triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)), * this array contains the local pieces of the triangular matrix * sub( A ). If UPLO = 'U', the leading N-by-N upper triangular * part of the matrix sub( A ) contains the upper triangular * matrix, and the strictly lower triangular part of sub( A ) * is not referenced. If UPLO = 'L', the leading N-by-N lower * triangular part of the matrix sub( A ) contains the lower * triangular matrix, and the strictly upper triangular part * of sub( A ) is not referenced. If DIAG = 'U', the diagonal * elements of sub( A ) are also not referenced and are assumed * to be 1. On exit, the (triangular) inverse of the original * matrix, in the same storage format. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER IACOL, IAROW, ICTXT, ICURR, IDIAG, IIA, IOFFA, $ JJA, LDA, MYCOL, MYROW, NA, NPCOL, NPROW COMPLEX AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, CSCAL, $ CTRMV, INFOG2L, PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCTRTI2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Compute local indexes * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN * LDA = DESCA( LLD_ ) * IF( UPPER ) THEN * IOFFA = IIA + ( JJA - 1 ) * LDA ICURR = IOFFA + LDA * IF( NOUNIT ) THEN * * Compute inverse of upper non-unit triangular matrix. * A( IOFFA ) = ONE / A( IOFFA ) IDIAG = ICURR + 1 DO 10 NA = 1, N-1 A( IDIAG ) = ONE / A( IDIAG ) AJJ = -A( IDIAG ) * * Compute elements 1:j-1 of j-th column. * CALL CTRMV( 'Upper', 'No transpose', DIAG, NA, $ A( IOFFA ), LDA, A( ICURR ), 1 ) CALL CSCAL( NA, AJJ, A( ICURR ), 1 ) IDIAG = IDIAG + LDA + 1 ICURR = ICURR + LDA 10 CONTINUE * ELSE * * Compute inverse of upper unit triangular matrix. * DO 20 NA = 1, N-1 * * Compute elements 1:j-1 of j-th column. * CALL CTRMV( 'Upper', 'No transpose', DIAG, NA, $ A( IOFFA ), LDA, A( ICURR ), 1 ) CALL CSCAL( NA, -ONE, A( ICURR ), 1 ) ICURR = ICURR + LDA 20 CONTINUE * END IF * ELSE * ICURR = IIA + N - 1 + ( JJA + N - 2 ) * LDA IOFFA = ICURR - LDA * IF( NOUNIT ) THEN * * Compute inverse of lower non-unit triangular matrix. * A( ICURR ) = ONE / A( ICURR ) IDIAG = IOFFA - 1 DO 30 NA = 1, N-1 A( IDIAG ) = ONE / A( IDIAG ) AJJ = -A( IDIAG ) * * Compute elements j+1:n of j-th column. * CALL CTRMV( 'Lower', 'No transpose', DIAG, NA, $ A( ICURR ), LDA, A( IOFFA ), 1 ) CALL CSCAL( NA, AJJ, A( IOFFA ), 1 ) ICURR = IDIAG IDIAG = IDIAG - LDA - 1 IOFFA = IDIAG + 1 30 CONTINUE * ELSE * * Compute inverse of lower unit triangular matrix. * DO 40 NA = 1, N-1 * * Compute elements j+1:n of j-th column. * CALL CTRMV( 'Lower', 'No transpose', DIAG, NA, $ A( ICURR ), LDA, A( IOFFA ), 1 ) CALL CSCAL( NA, -ONE, A( IOFFA ), 1 ) ICURR = ICURR - LDA - 1 IOFFA = ICURR - LDA 40 CONTINUE * END IF * END IF * END IF * * End of PCTRTI2 * END scalapack-2.0.2/SRC/pctrtri.f000644 000766 000024 00000030701 10363532303 016175 0ustar00juliestaff000000 000000 SUBROUTINE PCTRTRI( UPLO, DIAG, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCTRTRI computes the inverse of a upper or lower triangular * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the distributed matrix sub( A ) is upper * or lower triangular: * = 'U': Upper triangular, * = 'L': Lower triangular. * * DIAG (global input) CHARACTER * Specifies whether or not the distributed matrix sub( A ) * is unit triangular: * = 'N': Non-unit triangular, * = 'U': Unit triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * triangular matrix sub( A ). If UPLO = 'U', the leading * N-by-N upper triangular part of the matrix sub( A ) contains * the upper triangular matrix to be inverted, and the strictly * lower triangular part of sub( A ) is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * the matrix sub( A ) contains the lower triangular matrix, * and the strictly upper triangular part of sub( A ) is not * referenced. * On exit, the (triangular) inverse of the original matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, A(IA+K-1,JA+K-1) is exactly zero. The * triangular matrix sub( A ) is singular and its * inverse can not be computed. * * ==================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER I, ICOFF, ICTXT, IROFF, ICURCOL, ICURROW, $ IDUMMY, II, IOFFA, J, JB, JJ, JN, LDA, MYCOL, $ MYROW, NN, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMX2D, INFOG2L, $ PCHK1MAT, PCTRTI2, PCTRMM, PCTRSM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) * CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOUNIT ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 * CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCTRTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) IF( NOUNIT ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, ICURROW, ICURCOL ) * * Handle first block separately * JB = JN-JA+1 LDA = DESCA( LLD_ ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*LDA DO 10 I = 0, JB-1 IF( A( IOFFA ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I + 1 IOFFA = IOFFA + LDA + 1 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*LDA DO 20 I = 0, JB-1 IF( A( IOFFA ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = J + I - JA + 1 IOFFA = IOFFA + LDA + 1 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUMMY, $ IDUMMY, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN END IF * * Use blocked code * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix * JB = JN-JA+1 * * Handle first block of column separately * CALL PCTRTI2( UPLO, DIAG, JB, A, IA, JA, DESCA, INFO ) * * Loop over remaining block of columns * DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * * Compute rows 1:j-1 of current block column * CALL PCTRMM( 'Left', UPLO, 'No transpose', DIAG, J-JA, JB, $ ONE, A, IA, JA, DESCA, A, IA, J, DESCA ) CALL PCTRSM( 'Right', UPLO, 'No transpose', DIAG, J-JA, $ JB, -ONE, A, I, J, DESCA, A, IA, J, DESCA ) * * Compute inverse of current diagonal block * CALL PCTRTI2( UPLO, DIAG, JB, A, I, J, DESCA, INFO ) * 40 CONTINUE * ELSE * * Compute inverse of lower triangular matrix * NN = ( ( JA+N-2 ) / DESCA( NB_ ) )*DESCA( NB_ ) + 1 DO 50 J = NN, JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA IF( J+JB.LE.JA+N-1 ) THEN * * Compute rows j+jb:ja+n-1 of current block column * CALL PCTRMM( 'Left', UPLO, 'No transpose', DIAG, $ JA+N-J-JB, JB, ONE, A, I+JB, J+JB, DESCA, $ A, I+JB, J, DESCA ) CALL PCTRSM( 'Right', UPLO, 'No transpose', DIAG, $ JA+N-J-JB, JB, -ONE, A, I, J, DESCA, $ A, I+JB, J, DESCA ) END IF * * Compute inverse of current diagonal block * CALL PCTRTI2( UPLO, DIAG, JB, A, I, J, DESCA, INFO ) * 50 CONTINUE * * Handle the last block of columns separately * JB = JN-JA+1 IF( JA+JB.LE.JA+N-1 ) THEN * * Compute rows ja+jb:ja+n-1 of current block column * CALL PCTRMM( 'Left', UPLO, 'No transpose', DIAG, N-JB, JB, $ ONE, A, IA+JB, JA+JB, DESCA, A, IA+JB, JA, $ DESCA ) CALL PCTRSM( 'Right', UPLO, 'No transpose', DIAG, N-JB, JB, $ -ONE, A, IA, JA, DESCA, A, IA+JB, JA, DESCA ) END IF * * Compute inverse of current diagonal block * CALL PCTRTI2( UPLO, DIAG, JB, A, IA, JA, DESCA, INFO ) * END IF * RETURN * * End PCTRTRI * END scalapack-2.0.2/SRC/pctrtrs.f000644 000766 000024 00000031433 10363532303 016212 0ustar00juliestaff000000 000000 SUBROUTINE PCTRTRS( UPLO, TRANS, DIAG, N, NRHS, A, IA, JA, DESCA, $ B, IB, JB, DESCB, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, TRANS, UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX A( * ), B( * ) * .. * * Purpose * ======= * * PCTRTRS solves a triangular system of the form * * sub( A ) * X = sub( B ) or sub( A )**T * X = sub( B ) or * * sub( A )**H * X = sub( B ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is a triangular * distributed matrix of order N, and B(IB:IB+N-1,JB:JB+NRHS-1) is an * N-by-NRHS distributed matrix denoted by sub( B ). A check is made * to verify that sub( A ) is nonsingular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': Solve sub( A ) * X = sub( B ) (No transpose) * = 'T': Solve sub( A )**T * X = sub( B ) (Transpose) * = 'C': Solve sub( A )**H * X = sub( B ) (Conjugate transpose) * * DIAG (global input) CHARACTER * = 'N': sub( A ) is non-unit triangular; * = 'U': sub( A ) is unit triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed matrix sub( B ). NRHS >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1) ). This array * contains the local pieces of the distributed triangular * matrix sub( A ). If UPLO = 'U', the leading N-by-N upper * triangular part of sub( A ) contains the upper triangular * matrix, and the strictly lower triangular part of sub( A ) * is not referenced. If UPLO = 'L', the leading N-by-N lower * triangular part of sub( A ) contains the lower triangular * matrix, and the strictly upper triangular part of sub( A ) * is not referenced. If DIAG = 'U', the diagonal elements of * sub( A ) are also not referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the right hand side distributed matrix * sub( B ). On exit, if INFO = 0, sub( B ) is overwritten by * the solution matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, the i-th diagonal element of sub( A ) is * zero, indicating that the submatrix is singular and the * solutions X have not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN, NOUNIT, UPPER INTEGER I, IAROW, IBROW, ICOFFA, ICTXT, ICURCOL, $ ICURROW, IROFFA, IROFFB, IDUM, II, IOFFA, J, $ JBLK, JJ, JN, LDA, LL, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMX2D, INFOG2L, $ PCHK2MAT, PCTRSM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL ICEIL, INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -907 ELSE UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) NOTRAN = LSAME( TRANS, 'N' ) * CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA .OR. IROFFA.NE.0 ) THEN INFO = -8 ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IBROW ) THEN INFO = -11 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -904 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -1304 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 2 ) = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'D' ) END IF IDUM2( 3 ) = 3 CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 9, N, 4, NRHS, 5, $ IB, JB, DESCB, 13, 3, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCTRTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * IF( NOUNIT ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, ICURROW, ICURCOL ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) IOFFA = II + ( JJ - 1 ) * LDA * * Handle first block separately * JBLK = JN-JA+1 IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN LL = IOFFA DO 10 I = 0, JBLK-1 IF( A( LL ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I + 1 LL = IOFFA + LDA + 1 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JBLK IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JBLK*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JBLK = MIN( JA+N-J, DESCA( NB_ ) ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN LL = IOFFA DO 20 I = 0, JBLK-1 IF( A( LL ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = J + I - JA + 1 LL = IOFFA + LDA + 1 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JBLK IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JBLK*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUM, IDUM, $ -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN END IF * * Solve A * x = b, A**T * x = b, or A**H * x = b. * CALL PCTRSM( 'Left', UPLO, TRANS, DIAG, N, NRHS, ONE, A, IA, JA, $ DESCA, B, IB, JB, DESCB ) * RETURN * * End of PCTRTRS * END scalapack-2.0.2/SRC/pctzrzf.f000644 000766 000024 00000031076 10363532303 016216 0ustar00juliestaff000000 000000 SUBROUTINE PCTZRZF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCTZRZF reduces the M-by-N ( M<=N ) complex upper trapezoidal matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper triangular form by means * of unitary transformations. * * The upper trapezoidal matrix sub( A ) is factored as * * sub( A ) = ( R 0 ) * Z, * * where Z is an N-by-N unitary matrix and R is an M-by-M upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the leading M-by-M * upper triangular part of sub( A ) contains the upper trian- * gular matrix R, and elements M+1 to N of the first M rows of * sub( A ), with the array TAU, represent the unitary matrix Z * as a product of M elementary reflectors. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The factorization is obtained by Householder's method. The kth * transformation matrix, Z( k ), whose conjugate transpose is used to * introduce zeros into the (m - k + 1)th row of sub( A ), is given in * the form * * Z( k ) = ( I 0 ), * ( 0 T( k ) ) * * where * * T( k ) = I - tau*u( k )*u( k )', u( k ) = ( 1 ), * ( 0 ) * ( z( k ) ) * * tau is a scalar and z( k ) is an ( n - m ) element vector. * tau and z( k ) are chosen to annihilate the elements of the kth row * of sub( A ). * * The scalar tau is returned in the kth element of TAU and the vector * u( k ) in the kth row of sub( A ), such that the elements of z( k ) * are in a( k, m + 1 ), ..., a( k, n ). The elements of R are returned * in the upper triangular part of sub( A ). * * Z is given by * * Z = Z( 1 ) * Z( 2 ) * ... * Z( m ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IIA, IL, IN, IPW, $ IROFFA, J, JM1, L, LWMIN, MP0, MYCOL, MYROW, $ NPCOL, NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, INFOG1L, PCHK1MAT, $ PCLATRZ, PCLARZB, PCLARZT, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -9 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCTZRZF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( M.EQ.N ) THEN * CALL INFOG1L( IA, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ IIA, IAROW ) IF( MYROW.EQ.IAROW ) $ MP0 = MP0 - IROFFA DO 10 I = IIA, IIA+MP0-1 TAU( I ) = ZERO 10 CONTINUE * ELSE * L = N-M JM1 = JA + MIN( M+1, N ) - 1 IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IL = MAX( ( (IA+M-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * * Use blocked code initially * DO 20 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) J = JA + I - IA * * Compute the complete orthogonal factorization of the current * block A(i:i+ib-1,j:ja+n-1) * CALL PCLATRZ( IB, JA+N-J, L, A, I, J, DESCA, TAU, WORK ) * IF( I.GT.IA ) THEN * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PCLARZT( 'Backward', 'Rowwise', L, IB, A, I, JM1, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:i-1,j:ja+n-1) from the right * CALL PCLARZB( 'Right', 'No transpose', 'Backward', $ 'Rowwise', I-IA, JA+N-J, IB, L, A, I, JM1, $ DESCA, WORK, A, IA, J, DESCA, WORK( IPW ) ) END IF * 20 CONTINUE * * Use unblocked code to factor the last or only block * CALL PCLATRZ( IN-IA+1, N, N-M, A, IA, JA, DESCA, TAU, WORK ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCTZRZF * END scalapack-2.0.2/SRC/pcung2l.f000644 000766 000024 00000025716 10363532303 016072 0ustar00juliestaff000000 000000 SUBROUTINE PCUNG2L( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNG2L generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the last N columns of a product of K elementary reflectors of order M * * Q = H(k) . . . H(2) H(1) * * as returned by PCGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA+N-K <= j <= JA+N-1, * as returned by PCGEQLF in the K columns of its distributed * matrix argument A(IA:*,JA+N-K:JA+N-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PCGEQLF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MpA0 + MAX( 1, NqA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, J, JJ, LWMIN, MPA0, MYCOL, $ MYROW, NPCOL, NPROW, NQA0 COMPLEX TAUJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PCELSET, $ PCLARF, PCLASET, PCSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MPA0 + MAX( 1, NQA0 ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNG2L', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Initialise columns ja:ja+n-k-1 to columns of the unit matrix * CALL PCLASET( 'All', M-N, N-K, ZERO, ZERO, A, IA, JA, DESCA ) CALL PCLASET( 'All', N, N-K, ZERO, ONE, A, IA+M-N, JA, DESCA ) * TAUJ = ZERO NQA0 = MAX( 1, NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) ) DO 10 J = JA+N-K, JA+N-1 * * Apply H(j) to A(ia:ia+m-n+j-ja,ja:j) from the left * CALL PCELSET( A, IA+M-N+J-JA, J, DESCA, ONE ) CALL PCLARF( 'Left', M-N+J-JA+1, J-JA, A, IA, J, DESCA, 1, TAU, $ A, IA, JA, DESCA, WORK ) * JJ = INDXG2L( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) IACOL = INDXG2P( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ TAUJ = TAU( MIN( JJ, NQA0 ) ) CALL PCSCAL( M-N+J-JA, -TAUJ, A, IA, J, DESCA, 1 ) CALL PCELSET( A, IA+M-N+J-JA, J, DESCA, ONE-TAUJ ) * * Set A(ia+m-n+j-ja+1:ia+m-1,j) to zero * CALL PCLASET( 'All', JA+N-1-J, 1, ZERO, ZERO, A, IA+M-N+J-JA+1, $ J, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNG2L * END scalapack-2.0.2/SRC/pcung2r.f000644 000766 000024 00000025765 10363532303 016104 0ustar00juliestaff000000 000000 SUBROUTINE PCUNG2R( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNG2R generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the first N columns of a product of K elementary reflectors of order * M * * Q = H(1) H(2) . . . H(k) * * as returned by PCGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA <= j <= JA+K-1, as * returned by PCGEQRF in the K columns of its array * argument A(IA:*,JA:JA+K-1). On exit, this array contains * the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PCGEQRF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MpA0 + MAX( 1, NqA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, J, JJ, KQ, LWMIN, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 COMPLEX TAUJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PCELSET, $ PCLARF, PCLASET, PCSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MPA0 + MAX( 1, NQA0 ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNG2R', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Initialise columns ja+k:ja+n-1 to columns of the unit matrix * CALL PCLASET( 'All', K, N-K, ZERO, ZERO, A, IA, JA+K, DESCA ) CALL PCLASET( 'All', M-K, N-K, ZERO, ONE, A, IA+K, JA+K, DESCA ) * TAUJ = ZERO KQ = MAX( 1, NUMROC( JA+K-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) DO 10 J = JA+K-1, JA, -1 * * Apply H(j) to A(ia+j-ja:ia+m-1,j:ja+n-1) from the left * IF( J.LT.JA+N-1 ) THEN CALL PCELSET( A, IA+J-JA, J, DESCA, ONE ) CALL PCLARF( 'Left', M-J+JA, JA+N-J-1, A, IA+J-JA, J, DESCA, $ 1, TAU, A, IA+J-JA, J+1, DESCA, WORK ) END IF * JJ = INDXG2L( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) IACOL = INDXG2P( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ TAUJ = TAU( MIN( JJ, KQ ) ) IF( J-JA.LT.M-1 ) $ CALL PCSCAL( M-J+JA-1, -TAUJ, A, IA+J-JA+1, J, DESCA, 1 ) CALL PCELSET( A, IA+J-JA, J, DESCA, ONE-TAUJ ) * * Set A(ia:ia+j-ja-1,j) to zero * CALL PCLASET( 'All', J-JA, 1, ZERO, ZERO, A, IA, J, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNG2R * END scalapack-2.0.2/SRC/pcungl2.f000644 000766 000024 00000026414 10363532303 016066 0ustar00juliestaff000000 000000 SUBROUTINE PCUNGL2( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNGL2 generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as * the first M rows of a product of K elementary reflectors of order N * * Q = H(k)' . . . H(2)' H(1)' * * as returned by PCGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA <= i <= IA+K-1, as * returned by PCGELQF in the K rows of its distributed matrix * argument A(IA:IA+K-1,JA:*). On exit, this array contains the * local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCr(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCGELQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NqA0 + MAX( 1, MpA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, II, J, KP, LWMIN, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 COMPLEX TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PCELSET, $ PCLACGV, PCLARFC, PCLASET, PCSCAL, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQA0 + MAX( 1, MPA0 ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNGL2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * IF( K.LT.M ) THEN * * Initialise rows ia+k:ia+m-1 to rows of the unit matrix * CALL PCLASET( 'All', M-K, K, ZERO, ZERO, A, IA+K, JA, DESCA ) CALL PCLASET( 'All', M-K, N-K, ZERO, ONE, A, IA+K, JA+K, $ DESCA ) * END IF * TAUI = ZERO KP = NUMROC( IA+K-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) * DO 10 I = IA+K-1, IA, -1 * * Apply H(i)' to A(i:ia+m-1,ja+i-ia:ja+n-1) from the right * J = JA + I - IA II = INDXG2L( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IAROW = INDXG2P( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IF( MYROW.EQ.IAROW ) $ TAUI = TAU( MIN( II, KP ) ) IF( J.LT.JA+N-1 ) THEN CALL PCLACGV( N-J+JA-1, A, I, J+1, DESCA, DESCA( M_ ) ) IF( I.LT.IA+M-1 ) THEN CALL PCELSET( A, I, J, DESCA, ONE ) CALL PCLARFC( 'Right', M-I+IA-1, N-J+JA, A, I, J, DESCA, $ DESCA( M_ ), TAU, A, I+1, J, DESCA, WORK ) END IF CALL PCSCAL( N-J+JA-1, -TAUI, A, I, J+1, DESCA, $ DESCA( M_ ) ) CALL PCLACGV( N-J+JA-1, A, I, J+1, DESCA, DESCA( M_ ) ) END IF CALL PCELSET( A, I, J, DESCA, ONE-CONJG( TAUI ) ) * * Set A(i,ja:j-1) to zero * CALL PCLASET( 'All', 1, J-JA, ZERO, ZERO, A, I, JA, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNGL2 * END scalapack-2.0.2/SRC/pcunglq.f000644 000766 000024 00000030767 10363532303 016173 0ustar00juliestaff000000 000000 SUBROUTINE PCUNGLQ( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNGLQ generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as * the first M rows of a product of K elementary reflectors of order N * * Q = H(k)' . . . H(2)' H(1)' * * as returned by PCGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA <= i <= IA+K-1, as * returned by PCGELQF in the K rows of its distributed matrix * argument A(IA:IA+K-1,JA:*). On exit, this array contains the * local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCr(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCGELQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( MpA0 + NqA0 + MB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IL, IN, IPW, $ J, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCLARFB, $ PCLARFT, PCLASET, PCUNGL2, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNGLQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IL = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * CALL PCLASET( 'All', IA+M-IL, IL-IA, ZERO, ZERO, A, IL, JA, $ DESCA ) * * Use unblocked code for the last or only block. * CALL PCUNGL2( IA+M-IL, N-IL+IA, IA+K-IL, A, IL, JA+IL-IA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Is there at least one block of rows to loop over ? * IF( IL.GT.IN+1 ) THEN * * Use blocked code * DO 10 I = IL-DESCA( MB_ ), IN+1, -DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) J = JA + I - IA * IF( I+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PCLARFT( 'Forward', 'Rowwise', N-I+IA, IB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(i+ib:ia+m-1,j:ja+n-1) from the right * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-I-IB+IA, N-I+IA, IB, A, I, J, $ DESCA, WORK, A, I+IB, J, DESCA, $ WORK( IPW ) ) END IF * * Apply H' to columns j:ja+n-1 of current block * CALL PCUNGL2( IB, N-I+IA, IB, A, I, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set columns ia:i-1 of current block to zero * CALL PCLASET( 'All', IB, I-IA, ZERO, ZERO, A, I, JA, DESCA ) 10 CONTINUE * END IF * * Handle first block separately * IF( IL.GT.IA ) THEN * IB = IN - IA + 1 * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PCLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia+ib:ia+m-1,ja:ja+n-1) from the right * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-IB, N, IB, A, IA, JA, DESCA, WORK, $ A, IA+IB, JA, DESCA, WORK( IPW ) ) * * Apply H' to columns ja:ja+n-1 of current block * CALL PCUNGL2( IB, N, IB, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNGLQ * END scalapack-2.0.2/SRC/pcungql.f000644 000766 000024 00000027026 10363532303 016165 0ustar00juliestaff000000 000000 SUBROUTINE PCUNGQL( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNGQL generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the last N columns of a product of K elementary reflectors of order M * * Q = H(k) . . . H(2) H(1) * * as returned by PCGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA+N-K <= j <= JA+N-1, * as returned by PCGEQLF in the K columns of its distributed * matrix argument A(IA:*,JA+N-K:JA+N-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PCGEQLF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( NqA0 + MpA0 + NB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IINFO, IPW, J, JB, JN, $ LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCLARFB, $ PCLARFT, PCLASET, PCUNG2L, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNGQL', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * IPW = DESCA( NB_ )*DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Set A(ia+m-n+jn-ja+1:ia-m+1,ja:jn) to zero. * CALL PCLASET( 'All', N-JN+JA-1, JN-JA+1, ZERO, ZERO, A, $ IA+M-N+JN-JA+1, JA, DESCA ) * * Use unblocked code for the first or only block. * CALL PCUNG2L( M-N+JN-JA+1, JN-JA+1, JN-JA-N+K+1, A, IA, JA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Use blocked code * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PCLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:ia+m-n+j+jb-ja-1,ja:j-1) from the left * CALL PCLARFB( 'Left', 'No transpose', 'Backward', $ 'Columnwise', M-N+J+JB-JA, J-JA, JB, A, IA, $ J, DESCA, WORK, A, IA, JA, DESCA, WORK( IPW ) ) * * Apply H to rows ia:m-k+i+ib-1 of current block * CALL PCUNG2L( M-N+J+JB-JA, JB, JB, A, IA, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows ia+m-n+j+jb-ja:ia+m-1,j:j+jb-1 of current block to * zero * CALL PCLASET( 'All', N-J-JB+JA, JB, ZERO, ZERO, A, $ IA+M-N+J+JB-JA, J, DESCA ) * 10 CONTINUE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNGQL * END scalapack-2.0.2/SRC/pcungqr.f000644 000766 000024 00000030757 10363532303 016200 0ustar00juliestaff000000 000000 SUBROUTINE PCUNGQR( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNGQR generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the first N columns of a product of K elementary reflectors of order * M * * Q = H(1) H(2) . . . H(k) * * as returned by PCGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA <= j <= JA+K-1, as * returned by PCGEQRF in the K columns of its distributed * matrix argument A(IA:*,JA:JA+K-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(JA+K-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PCGEQRF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( NqA0 + MpA0 + NB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICTXT, IINFO, IPW, J, JB, JL, $ JN, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCLARFB, $ PCLARFT, PCLASET, PCUNG2R, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNGQR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * IPW = DESCA( NB_ )*DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JL = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * CALL PCLASET( 'All', JL-JA, JA+N-JL, ZERO, ZERO, A, IA, JL, $ DESCA ) * * Use unblocked code for the last or only block. * CALL PCUNG2R( M-JL+JA, JA+N-JL, JA+K-JL, A, IA+JL-JA, JL, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Is there at least one block of columns to loop over ? * IF( JL.GT.JN+1 ) THEN * * Use blocked code * DO 10 J = JL-DESCA( NB_ ), JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * IF( J+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PCLARFT( 'Forward', 'Columnwise', M-I+IA, JB, A, I, $ J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(i:ia+m-1,j+jb:ja+n-1) from the left * CALL PCLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-I+IA, N-J-JB+JA, JB, A, I, $ J, DESCA, WORK, A, I, J+JB, DESCA, $ WORK( IPW ) ) END IF * * Apply H to rows i:ia+m-1 of current block * CALL PCUNG2R( M-I+IA, JB, JB, A, I, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows ia:i-1 of current block to zero * CALL PCLASET( 'All', I-IA, JB, ZERO, ZERO, A, IA, J, DESCA ) * 10 CONTINUE * END IF * * Handle first block separately * IF( JL.GT.JA ) THEN * JB = JN - JA + 1 * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PCLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:ia+m-1,ja+jb:ja+n-1) from the left * CALL PCLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M, N-JB, JB, A, IA, JA, DESCA, WORK, A, IA, $ JA+JB, DESCA, WORK( IPW ) ) * * Apply H to rows ia:ia+m-1 of current block * CALL PCUNG2R( M, JB, JB, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * END IF * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNGQR * END scalapack-2.0.2/SRC/pcungr2.f000644 000766 000024 00000026175 10363532303 016100 0ustar00juliestaff000000 000000 SUBROUTINE PCUNGR2( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNGR2 generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as the * last M rows of a product of K elementary reflectors of order N * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PCGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA+M-K <= i <= IA+M-1, as * returned by PCGERQF in the K rows of its distributed * matrix argument A(IA+M-K:IA+M-1,JA:*). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCr(IA+M-1) * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCGERQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NqA0 + MAX( 1, MpA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, II, LWMIN, MP, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 COMPLEX TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PCELSET, $ PCLACGV, PCLARFC, PCLASET, PCSCAL, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQA0 + MAX( 1, MPA0 ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNGR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * IF( K.LT.M ) THEN * * Initialise rows ia:ia+m-k-1 to rows of the unit matrix * CALL PCLASET( 'All', M-K, N-M, ZERO, ZERO, A, IA, JA, DESCA ) CALL PCLASET( 'All', M-K, M, ZERO, ONE, A, IA, JA+N-M, DESCA ) * END IF * TAUI = ZERO MP = NUMROC( IA+M-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) * DO 10 I = IA+M-K, IA+M-1 * * Apply H(i)' to A(ia:i,ja:ja+n-m+i-ia) from the right * CALL PCLACGV( I-IA+N-M, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCELSET( A, I, JA+N-M+I-IA, DESCA, ONE ) CALL PCLARFC( 'Right', I-IA, I-IA+N-M+1, A, I, JA, DESCA, $ DESCA( M_ ), TAU, A, IA, JA, DESCA, WORK ) II = INDXG2L( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IAROW = INDXG2P( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IF( MYROW.EQ.IAROW ) $ TAUI = TAU( MIN( II, MP ) ) CALL PCSCAL( I-IA+N-M, -TAUI, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCLACGV( I-IA+N-M, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCELSET( A, I, JA+N-M+I-IA, DESCA, ONE-CONJG( TAUI ) ) * * Set A(i,ja+n-m+i-ia+1:ja+n-1) to zero * CALL PCLASET( 'All', 1, IA+M-1-I, ZERO, ZERO, A, I, $ JA+N-M+I-IA+1, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNGR2 * END scalapack-2.0.2/SRC/pcungrq.f000644 000766 000024 00000027046 10363532303 016175 0ustar00juliestaff000000 000000 SUBROUTINE PCUNGRQ( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNGRQ generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as the * last M rows of a product of K elementary reflectors of order N * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PCGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. * N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA+M-K <= i <= IA+M-1, as * returned by PCGERQF in the K rows of its distributed * matrix argument A(IA+M-K:IA+M-1,JA:*). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCr(IA+M-1) * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCGERQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( MpA0 + NqA0 + MB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IN, IPW, $ LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCLARFB, $ PCLARFT, PCLASET, PCUNGR2, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNGRQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) )*DESCA( MB_ ), IA+M-1 ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * * Set A(ia:in,ja+n-m+in-ia+1:ja-n+1) to zero. * CALL PCLASET( 'All', IN-IA+1, M-IN+IA-1, ZERO, ZERO, A, IA, $ JA+N-M+IN-IA+1, DESCA ) * * Use unblocked code for the first or only block. * CALL PCUNGR2( IN-IA+1, N-M+IN-IA+1, IN-IA-M+K+1, A, IA, JA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Use blocked code * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PCLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, I, JA, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:i-1,ja:ja+n-m+i+ib-ia-1) from the right * CALL PCLARFB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', I-IA, N-M+I+IB-IA, IB, A, I, JA, $ DESCA, WORK, A, IA, JA, DESCA, WORK( IPW ) ) * * Apply H' to columns ja:ja+n-m+i+ib-ia-1 of current block * CALL PCUNGR2( IB, N-M+I+IB-IA, IB, A, I, JA, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows i:i+ib-1,ja+n-m+i+ib-ia:ja+n-1 of current block to * zero * CALL PCLASET( 'All', IB, M-I-IB+IA, ZERO, ZERO, A, I, $ JA+N-M+I+IB-IA, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNGRQ * END scalapack-2.0.2/SRC/pcunm2l.f000644 000766 000024 00000041443 10363532303 016073 0ustar00juliestaff000000 000000 SUBROUTINE PCUNM2L( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNM2L overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PCGEQLF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PCGEQLF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ), * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PCGEQLF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( 1, NqC0 ); * if SIDE = 'R', LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( * NUMROC( N+ICOFFC,NB_A,0,0,NPCOL ),NB_A,0,0,LCMQ ) ); * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICCOL, ICOFFC, ICROW, ICTXT, ICC, $ II, IROFFA, IROFFC, J, J1, J2, J3, JCC, JJ, $ LCM, LCMQ, LWMIN, MI, MP, MPC0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 COMPLEX AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CGEBR2D, $ CGEBS2D, CGERV2D, CGESD2D, CHK1MAT, $ CSCAL, INFOG2L, PCELSET, PCELSET2, $ PCLARF, PCLARFC, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MPC0 + MAX( 1, NQC0 ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = NQC0 + MAX( MAX( 1, MPC0 ), NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ) ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNM2L', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, ICC, $ JCC, ICROW, ICCOL ) IF( LEFT ) THEN IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JC+N-1, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) THEN IF( NOTRAN ) THEN AJJ = ONE - TAU( JJ ) ELSE AJJ = ONE - CONJG( TAU( JJ ) ) END IF CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1 ) CALL CSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1, $ IAROW, IACOL ) CALL CSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN IF( NOTRAN ) THEN AJJ = ONE - TAU( JJ ) ELSE AJJ = ONE - CONJG( TAU( JJ ) ) END IF END IF * IF( IACOL.NE.ICCOL ) THEN IF( MYCOL.EQ.IACOL ) $ CALL CGESD2D( ICTXT, 1, 1, AJJ, 1, MYROW, ICCOL ) IF( MYCOL.EQ.ICCOL ) $ CALL CGERV2D( ICTXT, 1, 1, AJJ, 1, MYROW, IACOL ) END IF * IF( MYCOL.EQ.ICCOL ) THEN MP = NUMROC( IC+M-1, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) CALL CSCAL( MP-ICC+1, AJJ, C( ICC+(JCC-1)* $ DESCC( LLD_ ) ), 1 ) END IF * END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( LEFT .AND. NOTRAN .OR. .NOT.LEFT .AND. .NOT.NOTRAN ) THEN J1 = JA J2 = JA+K-1 J3 = 1 ELSE J1 = JA+K-1 J2 = JA J3 = -1 END IF * IF( LEFT ) THEN NI = N IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M END IF * DO 10 J = J1, J2, J3 * IF( LEFT ) THEN * * H(j) or H(j)' is applied to C(ic:ic+m-k+j-ja,jc:jc+n-1) * MI = M - K + J - JA + 1 ELSE * * H(j) or H(j)' is applied to C(ic:ic+m-1,jc:jc+n-k+j-ja) * NI = N - K + J - JA + 1 END IF * * Apply H(j) or H(j)' * CALL PCELSET2( AJJ, A, IA+NQ-K+J-JA, J, DESCA, ONE ) IF( NOTRAN ) THEN CALL PCLARF( SIDE, MI, NI, A, IA, J, DESCA, 1, TAU, C, $ IC, JC, DESCC, WORK ) ELSE CALL PCLARFC( SIDE, MI, NI, A, IA, J, DESCA, 1, TAU, C, $ IC, JC, DESCC, WORK ) END IF CALL PCELSET( A, IA+NQ-K+J-JA, J, DESCA, AJJ ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNM2L * END scalapack-2.0.2/SRC/pcunm2r.f000644 000766 000024 00000041610 10363532303 016075 0ustar00juliestaff000000 000000 SUBROUTINE PCUNM2R( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNM2R overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of k elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PCGEQRF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PCGEQRF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ); * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PCGEQRF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( 1, NqC0 ); * if SIDE = 'R', LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( * NUMROC( N+ICOFFC,NB_A,0,0,NPCOL ),NB_A,0,0,LCMQ ) ); * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICCOL, ICOFFC, ICROW, ICTXT, ICC, $ II, IROFFA, IROFFC, J, J1, J2, J3, JCC, JJ, $ LCM, LCMQ, LWMIN, MI, MP, MPC0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 COMPLEX AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CGEBR2D, $ CGEBS2D, CGERV2D, CGESD2D, CHK1MAT, $ CSCAL, INFOG2L, PCELSET, PCELSET2, $ PCLARF, PCLARFC, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MPC0 + MAX( 1, NQC0 ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = NQC0 + MAX( MAX( 1, MPC0 ), NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ) ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNM2R', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, ICC, $ JCC, ICROW, ICCOL ) IF( LEFT ) THEN IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JC+N-1, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) THEN IF( NOTRAN ) THEN AJJ = ONE - TAU( JJ ) ELSE AJJ = ONE - CONJG( TAU( JJ ) ) END IF CALL CGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1 ) CALL CSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) ELSE CALL CGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1, $ IAROW, IACOL ) CALL CSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN IF( NOTRAN ) THEN AJJ = ONE - TAU( JJ ) ELSE AJJ = ONE - CONJG( TAU( JJ ) ) END IF END IF * IF( IACOL.NE.ICCOL ) THEN IF( MYCOL.EQ.IACOL ) $ CALL CGESD2D( ICTXT, 1, 1, AJJ, 1, MYROW, ICCOL ) IF( MYCOL.EQ.ICCOL ) $ CALL CGERV2D( ICTXT, 1, 1, AJJ, 1, MYROW, IACOL ) END IF * IF( MYCOL.EQ.ICCOL ) THEN MP = NUMROC( IC+M-1, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) CALL CSCAL( MP-ICC+1, AJJ, C( ICC+(JCC-1)* $ DESCC( LLD_ ) ), 1 ) END IF * END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) THEN J1 = JA J2 = JA+K-1 J3 = 1 ELSE J1 = JA+K-1 J2 = JA J3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M ICC = IC END IF * DO 10 J = J1, J2, J3 IF( LEFT ) THEN * * H(j) or H(j)' is applied to C(ic+j-ja:ic+m-1,jc:jc+n-1) * MI = M - J + JA ICC = IC + J - JA ELSE * * H(j) or H(j)' is applied to C(ic:ic+m-1,jc+j-ja:jc+n-1) * NI = N - J + JA JCC = JC + J - JA END IF * * Apply H(j) or H(j)' * CALL PCELSET2( AJJ, A, IA+J-JA, J, DESCA, ONE ) IF( NOTRAN ) THEN CALL PCLARF( SIDE, MI, NI, A, IA+J-JA, J, DESCA, 1, TAU, $ C, ICC, JCC, DESCC, WORK ) ELSE CALL PCLARFC( SIDE, MI, NI, A, IA+J-JA, J, DESCA, 1, TAU, $ C, ICC, JCC, DESCC, WORK ) END IF CALL PCELSET( A, IA+J-JA, J, DESCA, AJJ ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNM2R * END scalapack-2.0.2/SRC/pcunmbr.f000644 000766 000024 00000054355 10363532303 016167 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMBR( VECT, SIDE, TRANS, M, N, K, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, VECT INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * If VECT = 'Q', PCUNMBR overwrites the general complex distributed * M-by-N matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * If VECT = 'P', PCUNMBR overwrites sub( C ) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': P * sub( C ) sub( C ) * P * TRANS = 'C': P**H * sub( C ) sub( C ) * P**H * * Here Q and P**H are the unitary distributed matrices determined by * PCGEBRD when reducing a complex distributed matrix A(IA:*,JA:*) to * bidiagonal form: A(IA:*,JA:*) = Q * B * P**H. Q and P**H are defined * as products of elementary reflectors H(i) and G(i) respectively. * * Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the * order of the unitary matrix Q or P**H that is applied. * * If VECT = 'Q', A(IA:*,JA:*) is assumed to have been an NQ-by-K * matrix: * if nq >= k, Q = H(1) H(2) . . . H(k); * if nq < k, Q = H(1) H(2) . . . H(nq-1). * * If VECT = 'P', A(IA:*,JA:*) is assumed to have been a K-by-NQ * matrix: * if k < nq, P = G(1) G(2) . . . G(k); * if k >= nq, P = G(1) G(2) . . . G(nq-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * VECT (global input) CHARACTER * = 'Q': apply Q or Q**H; * = 'P': apply P or P**H. * * SIDE (global input) CHARACTER * = 'L': apply Q, Q**H, P or P**H from the Left; * = 'R': apply Q, Q**H, P or P**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q or P; * = 'C': Conjugate transpose, apply Q**H or P**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * If VECT = 'Q', the number of columns in the original * distributed matrix reduced by PCGEBRD. * If VECT = 'P', the number of rows in the original * distributed matrix reduced by PCGEBRD. * K >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+MIN(NQ,K)-1)) if * VECT='Q', and (LLD_A,LOCc(JA+NQ-1)) if VECT = 'P'. NQ = M * if SIDE = 'L', and NQ = N otherwise. The vectors which * define the elementary reflectors H(i) and G(i), whose * products determine the matrices Q and P, as returned by * PCGEBRD. * If VECT = 'Q', LLD_A >= max(1,LOCr(IA+NQ-1)); * if VECT = 'P', LLD_A >= max(1,LOCr(IA+MIN(NQ,K)-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX array, dimension * LOCc(JA+MIN(NQ,K)-1) if VECT = 'Q', LOCr(IA+MIN(NQ,K)-1) if * VECT = 'P', TAU(i) must contain the scalar factor of the * elementary reflector H(i) or G(i), which determines Q or P, * as returned by PDGEBRD in its array argument TAUQ or TAUP. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, if VECT='Q', sub( C ) is overwritten by Q*sub( C ) * or Q'*sub( C ) or sub( C )*Q' or sub( C )*Q; if VECT='P, * sub( C ) is overwritten by P*sub( C ) or P'*sub( C ) or * sub( C )*P or sub( C )*P'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * NQ = M; * if( (VECT = 'Q' and NQ >= K) or (VECT <> 'Q' and NQ > K) ), * IAA=IA; JAA=JA; MI=M; NI=N; ICC=IC; JCC=JC; * else * IAA=IA+1; JAA=JA; MI=M-1; NI=N; ICC=IC+1; JCC=JC; * end if * else if SIDE = 'R', * NQ = N; * if( (VECT = 'Q' and NQ >= K) or (VECT <> 'Q' and NQ > K) ), * IAA=IA; JAA=JA; MI=M; NI=N; ICC=IC; JCC=JC; * else * IAA=IA; JAA=JA+1; MI=M; NI=N-1; ICC=IC; JCC=JC+1; * end if * end if * * If VECT = 'Q', * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * else if VECT <> 'Q', * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( MI+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * end if * * where LCMP = LCM / NPROW, LCMQ = LCM / NPCOL, with * LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JAA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( MI+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If VECT = 'Q', * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * else * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL APPLYQ, LEFT, LQUERY, NOTRAN CHARACTER TRANST INTEGER IAA, IACOL, IAROW, ICC, ICCOL, ICOFFA, ICOFFC, $ ICROW, ICTXT, IINFO, IROFFA, IROFFC, JAA, JCC, $ LCM, LCMP, LCMQ, LWMIN, MI, MPC0, MQA0, MYCOL, $ MYROW, NI, NPA0, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCUNMLQ, $ PCUNMQR, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(1000+CTXT_) ELSE APPLYQ = LSAME( VECT, 'Q' ) LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q or P * IF( LEFT ) THEN NQ = M IF( ( APPLYQ .AND. NQ.GE.K ) .OR. $ ( .NOT.APPLYQ .AND. NQ.GT.K ) ) THEN IAA = IA JAA = JA MI = M NI = N ICC = IC JCC = JC ELSE IAA = IA + 1 JAA = JA MI = M - 1 NI = N ICC = IC + 1 JCC = JC END IF * IF( APPLYQ ) THEN CALL CHK1MAT( M, 4, K, 6, IA, JA, DESCA, 10, INFO ) ELSE CALL CHK1MAT( K, 6, M, 4, IA, JA, DESCA, 10, INFO ) END IF ELSE NQ = N IF( ( APPLYQ .AND. NQ.GE.K ) .OR. $ ( .NOT.APPLYQ .AND. NQ.GT.K ) ) THEN IAA = IA JAA = JA MI = M NI = N ICC = IC JCC = JC ELSE IAA = IA JAA = JA + 1 MI = M NI = N - 1 ICC = IC JCC = JC + 1 END IF * IF( APPLYQ ) THEN CALL CHK1MAT( N, 5, K, 6, IA, JA, DESCA, 10, INFO ) ELSE CALL CHK1MAT( K, 6, N, 5, IA, JA, DESCA, 10, INFO ) END IF END IF CALL CHK1MAT( M, 4, N, 5, IC, JC, DESCC, 15, INFO ) * IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) ICOFFA = MOD( JAA-1, DESCA( NB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JAA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( APPLYQ ) THEN IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF ELSE * IF( LEFT ) THEN MQA0 = NUMROC( MI+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ MI+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.APPLYQ .AND. .NOT.LSAME( VECT, 'P' ) ) THEN INFO = -1 ELSE IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -2 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -3 ELSE IF( K.LT.0 ) THEN INFO = -6 ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( APPLYQ .AND. LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( APPLYQ .AND. LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -13 ELSE IF( .NOT.APPLYQ .AND. LEFT .AND. $ ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ IROFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( APPLYQ .AND. LEFT .AND. $ DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( .NOT.APPLYQ .AND. LEFT .AND. $ DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF * IF( APPLYQ ) THEN IDUM1( 1 ) = ICHAR( 'Q' ) ELSE IDUM1( 1 ) = ICHAR( 'P' ) END IF IDUM2( 1 ) = 1 IF( LEFT ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'R' ) END IF IDUM2( 2 ) = 2 IF( NOTRAN ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'C' ) END IF IDUM2( 3 ) = 3 IDUM1( 4 ) = K IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( APPLYQ ) THEN IF( LEFT ) THEN CALL PCHK2MAT( M, 4, K, 6, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( N, 5, K, 6, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) END IF ELSE IF( LEFT ) THEN CALL PCHK2MAT( K, 6, M, 4, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( K, 6, N, 5, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMBR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( APPLYQ ) THEN * * Apply Q * IF( NQ.GE.K ) THEN * * Q was determined by a call to PCGEBRD with nq >= k * CALL PCUNMQR( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) ELSE IF( NQ.GT.1 ) THEN * * Q was determined by a call to PCGEBRD with nq < k * CALL PCUNMQR( SIDE, TRANS, MI, NI, NQ-1, A, IA+1, JA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) END IF ELSE * * Apply P * IF( NOTRAN ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF IF( NQ.GT.K ) THEN * * P was determined by a call to PCGEBRD with nq > k * CALL PCUNMLQ( SIDE, TRANST, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) ELSE IF( NQ.GT.1 ) THEN * * P was determined by a call to PCGEBRD with nq <= k * CALL PCUNMLQ( SIDE, TRANST, MI, NI, NQ-1, A, IA, JA+1, $ DESCA, TAU, C, ICC, JCC, DESCC, WORK, LWORK, $ IINFO ) END IF END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMBR * END scalapack-2.0.2/SRC/pcunmhr.f000644 000766 000024 00000036451 10363532303 016172 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMHR( SIDE, TRANS, M, N, ILO, IHI, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, IHI, ILO, INFO, JA, JC, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNMHR overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix of order nq, with * nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the * product of IHI-ILO elementary reflectors, as returned by PCGEHRD: * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * ILO and IHI must have the same values as in the previous call * of PCGEHRD. Q is equal to the unit matrix except in the * distributed submatrix Q(ia+ilo:ia+ihi-1,ia+ilo:ja+ihi-1). * If SIDE = 'L', 1 <= ILO <= IHI <= max(1,M); * if SIDE = 'R', 1 <= ILO <= IHI <= max(1,N); * ILO and IHI are relative indexes. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE = 'R'. The vectors which * define the elementary reflectors, as returned by PCGEHRD. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(JA+M-2) * if SIDE = 'L', and LOCc(JA+N-2) if SIDE = 'R'. This array * contains the scalar factors TAU(j) of the elementary * reflectors H(j) as returned by PCGEHRD. TAU is tied to * the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * * IAA = IA + ILO; JAA = JA+ILO-1; * If SIDE = 'L', * MI = IHI-ILO; NI = N; ICC = IC + ILO; JCC = JC; * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * MI = M; NI = IHI-ILO; ICC = IC; JCC = JC + ILO; * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN INTEGER IAA, IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, $ IINFO, IROFFA, IROFFC, JAA, JCC, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NH, NI, NPA0, $ NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCUNMQR, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 NH = IHI - ILO IF( NPROW.EQ.-1 ) THEN INFO = -(1000+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) IAA = IA + ILO JAA = JA + ILO - 1 * * NQ is the order of Q * IF( LEFT ) THEN NQ = M MI = NH NI = N ICC = IC + ILO JCC = JC CALL CHK1MAT( M, 3, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N MI = M NI = NH ICC = IC JCC = JC + ILO CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, NQ ) ) THEN INFO = -5 ELSE IF( IHI.LT.MIN( ILO, NQ ) .OR. IHI.GT.NQ ) THEN INFO = -6 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = ILO IDUM2( 3 ) = 5 IDUM1( 4 ) = IHI IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, M, 3, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMHR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. NH.EQ.0 ) $ RETURN * CALL PCUNMQR( SIDE, TRANS, MI, NI, NH, A, IAA, JAA, DESCA, TAU, $ C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMHR * END scalapack-2.0.2/SRC/pcunml2.f000644 000766 000024 00000036227 10363532303 016077 0ustar00juliestaff000000 000000 SUBROUTINE PCUNML2( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNML2 overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(k)' . . . H(2)' H(1)' * * as returned by PCGELQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= max(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PCGELQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCGELQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IROFFC, JCC, LCM, LCMP, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPCOL, $ NPROW, NQ, NQC0 COMPLEX AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PCELSET, $ PCELSET2, PCLACGV, PCLARF, PCLARFC, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNML2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN .OR. .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K -1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC ELSE MI = M ICC = IC CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(i:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H(i) or H(i)' * IF( I-IA+1.LT.NQ ) $ CALL PCLACGV( NQ-I+IA-1, A, I, JA+I-IA+1, DESCA, $ DESCA( M_ ) ) CALL PCELSET2( AII, A, I, JA+I-IA, DESCA, ONE ) IF( NOTRAN ) THEN CALL PCLARFC( SIDE, MI, NI, A, I, JA+I-IA, DESCA, $ DESCA( M_ ), TAU, C, ICC, JCC, DESCC, WORK ) ELSE CALL PCLARF( SIDE, MI, NI, A, I, JA+I-IA, DESCA, $ DESCA( M_ ), TAU, C, ICC, JCC, DESCC, WORK ) END IF CALL PCELSET( A, I, JA+I-IA, DESCA, AII ) IF( I-IA+1.LT.NQ ) $ CALL PCLACGV( NQ-I+IA-1, A, I, JA+I-IA+1, DESCA, $ DESCA( M_ ) ) * 10 CONTINUE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNML2 * END scalapack-2.0.2/SRC/pcunmlq.f000644 000766 000024 00000042026 10363532303 016170 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMLQ( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNMLQ overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(k)' . . . H(2)' H(1)' * * as returned by PCGELQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= max(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PCGELQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCGELQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, JCC, $ LCM, LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCLARFB, $ PCLARFT, PCUNML2, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMLQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC ELSE MI = M ICC = IC CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. NOTRAN ) .OR. ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) $ CALL PCUNML2( SIDE, TRANS, M, N, I1-IA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PCLARFT( 'Forward', 'Rowwise', NQ-I+IA, IB, A, I, JA+I-IA, $ DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+i-ia:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H or H' * CALL PCLARFB( SIDE, TRANST, 'Forward', 'Rowwise', MI, NI, IB, $ A, I, JA+I-IA, DESCA, WORK, C, ICC, JCC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. ( .NOT.LEFT .AND. NOTRAN ) ) $ CALL PCUNML2( SIDE, TRANS, M, N, I2-IA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMLQ * END scalapack-2.0.2/SRC/pcunmql.f000644 000766 000024 00000042254 10363532303 016173 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMQL( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNMQL overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PCGEQLF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PCGEQLF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ), * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PCGEQLF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( N+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IAROW, ICCOL, ICOFFC, ICROW, ICTXT, IINFO, IPW, $ IROFFA, IROFFC, J, J1, J2, J3, JB, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPA0, NPCOL, $ NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCLARFB, $ PCLARFT, PCUNM2L, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMQL', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN J1 = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+K-1 ) + 1 J2 = JA+K-1 J3 = DESCA( NB_ ) ELSE J1 = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) J2 = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+K-1 ) + 1 J3 = -DESCA( NB_ ) END IF * IF( LEFT ) THEN NI = N IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M END IF * * Use unblocked code for the first block if necessary * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN JB = J1 - JA IF( LEFT ) THEN MI = M - K + JB ELSE NI = N - K + JB END IF CALL PCUNM2L( SIDE, TRANS, MI, NI, JB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 DO 10 J = J1, J2, J3 JB = MIN( DESCA( NB_ ), K-J+JA ) * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PCLARFT( 'Backward', 'Columnwise', NQ-K+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic:ic+m-k+j+jb-ja-1,jc:jc+n-1) * MI = M - K + J + JB - JA ELSE * * H or H' is applied to C(ic:ic+m-1,jc:jc+n-k+j+jb-ja-1) * NI = N - K + J + JB - JA END IF * * Apply H or H' * CALL PCLARFB( SIDE, TRANS, 'Backward', 'Columnwise', MI, NI, $ JB, A, IA, J, DESCA, WORK, C, IC, JC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN JB = J2 - JA IF( LEFT ) THEN MI = M - K + JB ELSE NI = N - K + JB END IF CALL PCUNM2L( SIDE, TRANS, MI, NI, JB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMQL * END scalapack-2.0.2/SRC/pcunmqr.f000644 000766 000024 00000042072 10363532303 016177 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMQR( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNMQR overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of k elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PCGEQRF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PCGEQRF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ); * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PCGEQRF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( N+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, IINFO, $ IPW, IROFFA, IROFFC, J, J1, J2, J3, JB, JCC, $ LCM, LCMQ, LWMIN, MI, MPC0, MYCOL, MYROW, NI, $ NPA0, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCLARFB, $ PCLARFT, PCUNM2R, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMQR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN J1 = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) $ + 1 J2 = JA+K-1 J3 = DESCA( NB_ ) ELSE J1 = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) J2 = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) $ + 1 J3 = -DESCA( NB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M ICC = IC END IF * * Use unblocked code for the first block if necessary * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. ( .NOT.LEFT .AND. NOTRAN ) ) $ CALL PCUNM2R( SIDE, TRANS, M, N, J1-JA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 DO 10 J = J1, J2, J3 JB = MIN( DESCA( NB_ ), K-J+JA ) * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PCLARFT( 'Forward', 'Columnwise', NQ-J+JA, JB, A, $ IA+J-JA, J, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+j-ja:ic+m-1,jc:jc+n-1) * MI = M - J + JA ICC = IC + J - JA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+j-ja:jc+n-1) * NI = N - J + JA JCC = JC + J - JA END IF * * Apply H or H' * CALL PCLARFB( SIDE, TRANS, 'Forward', 'Columnwise', MI, NI, $ JB, A, IA+J-JA, J, DESCA, WORK, C, ICC, JCC, $ DESCC, WORK( IPW ) ) 10 CONTINUE * * Use unblocked code for the last block if necessary * IF( ( LEFT .AND. NOTRAN ) .OR. ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) $ CALL PCUNM2R( SIDE, TRANS, M, N, J2-JA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMQR * END scalapack-2.0.2/SRC/pcunmr2.f000644 000766 000024 00000035322 10363532303 016100 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMR2( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNMR2 overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PCGERQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PCGERQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCGERQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICCOL, ICOFFA, ICOFFC, $ ICROW, ICTXT, IROFFC, LCM, LCMP, LWMIN, MI, $ MPC0, MYCOL, MYROW, NI, NPCOL, NPROW, NQ, NQC0 COMPLEX AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PCELSET, $ PCELSET2, PCLACGV, PCLARF, PCLARFC, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K - 1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N ELSE MI = M CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(ic:ic+m-k+i-ia,jc:jc+n-1) * MI = M - K + I - IA + 1 ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc:jc+n-k+i-ia+1) * NI = N - K + I - IA + 1 END IF * * Apply H(i) or H(i)' * CALL PCLACGV( NQ-K+I-IA, A, I, JA, DESCA, DESCA( M_ ) ) CALL PCELSET2( AII, A, I, JA+NQ-K+I-IA, DESCA, ONE ) IF( NOTRAN ) THEN CALL PCLARFC( SIDE, MI, NI, A, I, JA, DESCA, DESCA( M_ ), $ TAU, C, IC, JC, DESCC, WORK ) ELSE CALL PCLARF( SIDE, MI, NI, A, I, JA, DESCA, DESCA( M_ ), $ TAU, C, IC, JC, DESCC, WORK ) END IF CALL PCELSET( A, I, JA+NQ-K+I-IA, DESCA, AII ) CALL PCLACGV( NQ-K+I-IA, A, I, JA, DESCA, DESCA( M_ ) ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMR2 * END scalapack-2.0.2/SRC/pcunmr3.f000644 000766 000024 00000035447 10363532303 016111 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMR3( SIDE, TRANS, M, N, K, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, L, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNMR3 overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PCTZRZF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PCTZRZF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCTZRZF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IROFFC, JAA, JCC, LCM, $ LCMP, LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPCOL, $ NPROW, NQ, NQC0 * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PCLARZ, $ PCLARZC, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( L.LT.0 .OR. L.GT.NQ ) THEN INFO = -6 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMR3', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K - 1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC JAA = JA + M - L ELSE MI = M ICC = IC JAA = JA + N - L CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(ic+i-ia:icc+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H(i) or H(i)' * IF( NOTRAN ) THEN CALL PCLARZ( SIDE, MI, NI, L, A, I, JAA, DESCA, DESCA( M_ ), $ TAU, C, ICC, JCC, DESCC, WORK ) ELSE CALL PCLARZC( SIDE, MI, NI, L, A, I, JAA, DESCA, $ DESCA( M_ ), TAU, C, ICC, JCC, DESCC, WORK ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMR3 * END scalapack-2.0.2/SRC/pcunmrq.f000644 000766 000024 00000043044 11663037655 016215 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMRQ( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNMRQ overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PCGERQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PCGERQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCGERQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN, RIGHT, TRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, LCM, $ LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, MYROW, NI, $ NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCLARFB, $ PCLARFT, PCUNMR2, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE IF( LSAME( SIDE, 'L' ) ) THEN LEFT = .TRUE. RIGHT = .FALSE. ELSE LEFT = .FALSE. RIGHT = .TRUE. END IF IF( LSAME( TRANS, 'N' ) ) THEN NOTRAN = .TRUE. TRAN = .FALSE. ELSE NOTRAN = .FALSE. TRAN = .TRUE. END IF * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 9, M, 3, N, 4, $ IC, JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 9, M, 3, N, 4, $ IC, JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMRQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N ELSE MI = M CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I1 - IA IF( LEFT ) THEN MI = M - K + IB ELSE NI = N - K + IB END IF CALL PCUNMR2( SIDE, TRANS, MI, NI, IB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PCLARFT( 'Backward', 'Rowwise', NQ-K+I+IB-IA, IB, $ A, I, JA, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic:ic+m-k+i+ib-ia-1,jc:jc+n-1) * MI = M - K + I + IB - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc:jc+n-k+i+ib-ia-1) * NI = N - K + I + IB - IA END IF * * Apply H or H' * CALL PCLARFB( SIDE, TRANST, 'Backward', 'Rowwise', MI, NI, $ IB, A, I, JA, DESCA, WORK, C, IC, JC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( RIGHT .AND. TRAN ) .OR. $ ( LEFT .AND. NOTRAN ) ) THEN IB = I2 - IA IF( LEFT ) THEN MI = M - K + IB ELSE NI = N - K + IB END IF CALL PCUNMR2( SIDE, TRANS, MI, NI, IB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMRQ * END scalapack-2.0.2/SRC/pcunmrz.f000644 000766 000024 00000043336 10363532303 016214 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMRZ( SIDE, TRANS, M, N, K, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, L, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNMRZ overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PCTZRZF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PCTZRZF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PCTZRZF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, JAA, $ JCC, LCM, LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, $ MYROW, NI, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCLARZB, $ PCLARZT, PCUNMR3, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -6 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IDUM1( 4 ) = L IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMRZ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC JAA = JA + M - L ELSE MI = M ICC = IC JAA = JA + N - L CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I1 - IA IF( LEFT ) THEN MI = M ELSE NI = N END IF CALL PCUNMR3( SIDE, TRANS, MI, NI, IB, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PCLARZT( 'Backward', 'Rowwise', L, IB, A, I, JAA, DESCA, $ TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+i-ia:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H or H' * CALL PCLARZB( SIDE, TRANST, 'Backward', 'Rowwise', MI, NI, IB, $ L, A, I, JAA, DESCA, WORK, C, ICC, JCC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I2 - IA IF( LEFT ) THEN MI = M ELSE NI = N END IF CALL PCUNMR3( SIDE, TRANS, MI, NI, IB, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMRZ * END scalapack-2.0.2/SRC/pcunmtr.f000644 000766 000024 00000040444 10363532303 016203 0ustar00juliestaff000000 000000 SUBROUTINE PCUNMTR( SIDE, UPLO, TRANS, M, N, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, UPLO INTEGER IA, IC, INFO, JA, JC, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PCUNMTR overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix of order nq, with * nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the * product of nq-1 elementary reflectors, as returned by PCHETRD: * * if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1); * * if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(IA:*,JA:*) contains elementary * reflectors from PCHETRD; * = 'L': Lower triangle of A(IA:*,JA:*) contains elementary * reflectors from PCHETRD. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * A (local input) COMPLEX pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * or (LLD_A,LOCc(JA+N-1)) if SIDE = 'R'. The vectors which * define the elementary reflectors, as returned by PCHETRD. * If SIDE = 'L', LLD_A >= max(1,LOCr(IA+M-1)); * if SIDE = 'R', LLD_A >= max(1,LOCr(IA+N-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX array, dimension LTAU, where * if SIDE = 'L' and UPLO = 'U', LTAU = LOCc(M_A), * if SIDE = 'L' and UPLO = 'L', LTAU = LOCc(JA+M-2), * if SIDE = 'R' and UPLO = 'U', LTAU = LOCc(N_A), * if SIDE = 'R' and UPLO = 'L', LTAU = LOCc(JA+N-2). * TAU(i) must contain the scalar factor of the elementary * reflector H(i), as returned by PCHETRD. TAU is tied to the * distributed matrix A. * * C (local input/local output) COMPLEX pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * * If UPLO = 'U', * IAA = IA, JAA = JA+1, ICC = IC, JCC = JC; * else UPLO = 'L', * IAA = IA+1, JAA = JA; * if SIDE = 'L', * ICC = IC+1; JCC = JC; * else * ICC = IC; JCC = JC+1; * end if * end if * * If SIDE = 'L', * MI = M-1; NI = N; * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * MI = M; MI = N-1; * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN, UPPER INTEGER IAA, IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, $ IINFO, IROFFA, IROFFC, JAA, JCC, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPA0, NPCOL, $ NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PCUNMQL, $ PCUNMQR, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, ICHAR, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) UPPER = LSAME( UPLO, 'U' ) * IF( UPPER ) THEN IAA = IA JAA = JA+1 ICC = IC JCC = JC ELSE IAA = IA+1 JAA = JA IF( LEFT ) THEN ICC = IC + 1 JCC = JC ELSE ICC = IC JCC = JC + 1 END IF END IF * * NQ is the order of Q * IF( LEFT ) THEN NQ = M MI = M - 1 NI = N CALL CHK1MAT( MI, 4, NQ-1, 4, IAA, JAA, DESCA, 9, INFO ) ELSE NQ = N MI = M NI = N - 1 CALL CHK1MAT( NI, 5, NQ-1, 5, IAA, JAA, DESCA, 9, INFO ) END IF CALL CHK1MAT( MI, 4, NI, 5, ICC, JCC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -3 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 IF( NOTRAN ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'C' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( MI, 4, NQ-1, 4, IAA, JAA, DESCA, 9, MI, 4, $ NI, 5, ICC, JCC, DESCC, 14, 4, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( NI, 5, NQ-1, 5, IAA, JAA, DESCA, 9, MI, 4, $ NI, 5, ICC, JCC, DESCC, 14, 4, IDUM1, IDUM2, $ INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCUNMTR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. NQ.EQ.1 ) $ RETURN * IF( UPPER ) THEN * * Q was determined by a call to PCHETRD with UPLO = 'U' * CALL PCUNMQL( SIDE, TRANS, MI, NI, NQ-1, A, IA, JAA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * ELSE * * Q was determined by a call to PCHETRD with UPLO = 'L' * CALL PCUNMQR( SIDE, TRANS, MI, NI, NQ-1, A, IAA, JAA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * END IF * WORK( 1 ) = CMPLX( REAL( LWMIN ) ) * RETURN * * End of PCUNMTR * END scalapack-2.0.2/SRC/pddbsv.f000644 000766 000024 00000045157 10363532303 016003 0ustar00juliestaff000000 000000 SUBROUTINE PDDBSV( N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER BWL, BWU, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PDDBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * banded diagonally dominant-like distributed * matrix with bandwidth BWL, BWU. * * Gaussian elimination without pivoting * is used to factor a reordering * of the matrix into L U. * * See PDDBTRF and PDDBTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * +max((max(bwl,bwu)*NRHS), max(bwl,bwu)*max(bwl,bwu)) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PDDBTRF, PDDBTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PDDBTRF and PDDBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PDDBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * * Factor the matrix * CALL PDDBTRF( N, BWL, BWU, A, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PDDBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PDDBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDDBSV', -INFO ) RETURN END IF * RETURN * * End of PDDBSV * END scalapack-2.0.2/SRC/pddbtrf.f000644 000766 000024 00000126766 11750130340 016150 0ustar00juliestaff000000 000000 SUBROUTINE PDDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), AF( * ), WORK( * ) * .. * * Purpose * ======= * * PDDBTRF computes a LU factorization * of an N-by-N real banded * diagonally dominant-like distributed matrix * with bandwidth BWL, BWU: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PDDBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = L U * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDDBTRF and this is stored in AF. If a linear system * is to be solved using PDDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * max(bwl,bwu)*max(bwl,bwu) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * Last modified by: Peter Arbenz, Institute of Scientific Computing, * ETH, Zurich. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, I1, I2, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN, $ LEVEL_DIST, LLDA, MAX_BW, MBW2, MYCOL, MYROW, $ MY_NUM_COLS, NB, NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, OFST, PART_OFFSET, PART_SIZE, $ PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE, $ STORE_N_A, UP_PREV_TRI_SIZE_M, $ UP_PREV_TRI_SIZE_N, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DAXPY, DDBTRF, $ DESC_CONVERT, DGEMM, DGEMV, DGERV2D, DGESD2D, $ DLAMOV, DLATCPY, DTBTRS, DTRMM, DTRRV2D, $ DTRSD2D, GLOBCHK, IGAMX2D, IGEBR2D, IGEBS2D, $ PXERBLA, RESHAPE * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 6*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX( BWL, BWU ) MBW2 = MAX_BW*MAX_BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -10 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 6*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -2 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -3 END IF * IF( LLDA.LT.( BWL+BWU+1 ) ) THEN INFO = -( 6*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 6*100+4 ) END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 6*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PDDBTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*MAX( BWL, BWU ) ) ) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, 'PDDBTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = NB*( BWL+BWU ) + 6*MAX( BWL, BWU )*MAX( BWL, BWU ) * IF( LAF.LT.LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PDDBTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = MAX( BWL, BWU )*MAX( BWL, BWU ) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, 'PDDBTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BWU PARAM_CHECK( 3, 1 ) = BWL PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 10 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDDBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 140 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW END IF * * Offset to workspace for Upper triangular factor * WORK_U = BWU*ODD_SIZE + 3*MBW2 * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * * Zero out space for work * DO 20 I = 1, WORK_SIZE_MIN WORK( I ) = ZERO 20 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL.GT.0 ) THEN PREV_TRI_SIZE_M = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) PREV_TRI_SIZE_N = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL-1, 0, $ NPCOL ) ) END IF * IF( MYCOL.GT.0 ) THEN UP_PREV_TRI_SIZE_M = MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) UP_PREV_TRI_SIZE_N = MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NPCOL-1 ) THEN NEXT_TRI_SIZE_M = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL+1, 0, $ NPCOL ) ) NEXT_TRI_SIZE_N = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL DTRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, A( OFST+( MY_NUM_COLS-BWL )* $ LLDA+( BWL+BWU+1 ) ), LLDA-1, 0, MYCOL+1 ) * END IF * * * Factor main partition A_i = L_i {U_i} in each processor * CALL DDBTRF( ODD_SIZE, ODD_SIZE, BWL, BWU, A( OFST+1 ), LLDA, $ INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 30 END IF * * IF( MYCOL.LT.NP-1 ) THEN * * Apply factorization to lower connection block BL_i * transpose the connection block in preparation. * Apply factorization to upper connection block BU_i * Move the connection block in preparation. * CALL DLATCPY( 'U', BWL, BWL, A( ( OFST+( BWL+BWU+1 )+ $ ( ODD_SIZE-BWL )*LLDA ) ), LLDA-1, $ AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW ) CALL DLAMOV( 'L', BWU, BWU, A( ( OFST+1+ODD_SIZE*LLDA ) ), $ LLDA-1, AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW- $ BWU ), MAX_BW ) * * Perform the triangular system solve {L_i}{{BU'}_i} = {B_i} * CALL DTBTRS( 'L', 'N', 'U', BWU, BWL, BWU, $ A( OFST+BWU+1+( ODD_SIZE-BWU )*LLDA ), LLDA, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ), $ MAX_BW, INFO ) * * Perform the triangular solve {U_i}^T{BL'}_i^T = {BL_i}^T * CALL DTBTRS( 'U', 'T', 'N', BWL, BWU, BWL, $ A( OFST+1+( ODD_SIZE-BWL )*LLDA ), LLDA, $ AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW, $ INFO ) * * transpose resulting block to its location * in main storage. * CALL DLATCPY( 'L', BWL, BWL, AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW- $ BWL ), MAX_BW, A( ( OFST+( BWL+BWU+1 )+ $ ( ODD_SIZE-BWL )*LLDA ) ), LLDA-1 ) * * Move the resulting block back to its location in main storage. * CALL DLAMOV( 'L', BWU, BWU, AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+ $ MAX_BW-BWU ), MAX_BW, A( ( OFST+1+ODD_SIZE* $ LLDA ) ), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i} * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL DGEMM( 'T', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, ONE, $ A( OFST+ODD_SIZE*LLDA+1+BWU ), LLDA-1 ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 30 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for * off_diagonal block of reduced system. * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL DTRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M, $ PREV_TRI_SIZE_N, AF( WORK_U+1 ), BWL, 0, $ MYCOL-1 ) * IF( INFO.EQ.0 ) THEN * * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ . * * Transpose transmitted triangular matrix $DL_i$ * DO 50 I1 = 1, BWL DO 40 I2 = I1 + 1, BWL AF( WORK_U+I2+( I1-1 )*BWL ) = AF( WORK_U+I1+( I2-1 )* $ BWL ) AF( WORK_U+I1+( I2-1 )*BWL ) = ZERO 40 CONTINUE 50 CONTINUE * DO 60 I1 = 2, ODD_SIZE I2 = MIN( I1-1, BWL ) CALL DGEMV( 'N', BWL, I2, -ONE, $ AF( WORK_U+1+( I1-1-I2 )*BWL ), BWL, $ A( OFST+BWU+1+I2+( I1-1-I2 )*LLDA ), LLDA-1, $ ONE, AF( WORK_U+1+( I1-1 )*BWL ), 1 ) 60 CONTINUE * * * Calculate the "spike" fillin, ${U_i}^T {{GL}_i}^T = {DU_i}^T$ * * * Copy D block into AF storage for solve. * CALL DLAMOV( 'L', UP_PREV_TRI_SIZE_N, UP_PREV_TRI_SIZE_M, $ A( OFST+1 ), LLDA-1, AF( 1 ), BWU ) * DO 80 I1 = 1, ODD_SIZE I2 = MIN( BWU, I1-1 ) CALL DGEMV( 'N', BWU, I2, -ONE, AF( ( I1-1-I2 )*BWU+1 ), $ BWU, A( OFST+BWU+1-I2+( I1-1 )*LLDA ), 1, $ ONE, AF( ( I1-1 )*BWU+1 ), 1 ) * DO 70 I = 1, BWU AF( ( I1-1 )*BWU+I ) = AF( ( I1-1 )*BWU+I ) / $ A( ( I1-1 )*LLDA+BWU+1 ) 70 CONTINUE 80 CONTINUE * * Calculate the update block for previous proc, E_i = GL_i{GU_i} * * * Zero out space in case result is smaller than storage block * DO 90 I = 1, MBW2 AF( ODD_SIZE*BWU+2*MBW2+I ) = ZERO 90 CONTINUE * CALL DGEMM( 'N', 'T', BWU, BWL, ODD_SIZE, -ONE, AF( 1 ), $ BWU, AF( WORK_U+1 ), BWL, ZERO, $ AF( 1+MAX( 0, BWL-BWU )+ODD_SIZE*BWU+( 2*MAX_BW+ $ MAX( 0, BWU-BWL ) )*MAX_BW ), MAX_BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL DGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0, $ MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * * Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage * as per requirements of BLAS routine DTRMM. * Since we have GU_i stored, * transpose HU_i to HU_i^T. * CALL DLAMOV( 'N', BWL, BWL, $ AF( WORK_U+( ODD_SIZE-BWL )*BWL+1 ), BWL, $ AF( ( ODD_SIZE )*BWU+1+( MAX_BW-BWL ) ), $ MAX_BW ) * CALL DTRMM( 'R', 'U', 'T', 'N', BWL, BWL, -ONE, $ A( ( OFST+( BWL+BWU+1 )+( ODD_SIZE-BWL )* $ LLDA ) ), LLDA-1, AF( ( ODD_SIZE )*BWU+1+ $ ( MAX_BW-BWL ) ), MAX_BW ) * * * Copy matrix HL_i (the last bwu rows of GL_i^T) to AFU store * as per requirements of BLAS routine DTRMM. * Since we have GL_i^T stored, * transpose HL_i^T to HL_i. * CALL DLAMOV( 'N', BWU, BWU, AF( ( ODD_SIZE-BWU )*BWU+1 ), $ BWU, AF( WORK_U+( ODD_SIZE )*BWL+1+MAX_BW- $ BWU ), MAX_BW ) * CALL DTRMM( 'R', 'L', 'N', 'N', BWU, BWU, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ AF( WORK_U+( ODD_SIZE )*BWL+1+MAX_BW-BWU ), $ MAX_BW ) * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 130 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 120 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL DGESD2D( ICTXT, MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ), $ MAX_BW, 0, MYCOL-1 ) * CALL DGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, 0, MYCOL-1 ) * END IF * * Copy last diagonal block into AF storage for subsequent * operations. * CALL DLAMOV( 'N', MAX_BW, MAX_BW, A( OFST+ODD_SIZE*LLDA+BWU+1 ), $ LLDA-1, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0, MYCOL+1 ) * * Add contribution to diagonal block * CALL DAXPY( MBW2, ONE, AF( ODD_SIZE*BWU+2*MBW2+1 ), 1, $ AF( ODD_SIZE*BWU+MBW2+1 ), 1 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 100 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 110 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL DGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * CALL DAXPY( MBW2, ONE, WORK( 1 ), 1, AF( ODD_SIZE*BWU+MBW2+1 ), $ 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL DGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * CALL DAXPY( MBW2, ONE, WORK( 1 ), 1, AF( ODD_SIZE*BWU+MBW2+1 ), $ 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 100 110 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL DDBTRF( MAX_BW, MAX_BW, MIN( MAX_BW-1, BWL ), $ MIN( MAX_BW-1, BWU ), AF( ODD_SIZE*BWU+MBW2+1- $ ( MIN( MAX_BW-1, BWU ) ) ), MAX_BW+1, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL DLAMOV( 'N', MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW ) * CALL DLAMOV( 'N', MAX_BW, MAX_BW, AF( WORK_U+ODD_SIZE*BWL+1 ), $ MAX_BW, AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ), $ MAX_BW, 0, COMM_PROC ) * CALL DGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, 0, $ COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL DTBTRS( 'L', 'N', 'U', BWU, MIN( BWL, BWU-1 ), BWU, $ AF( ODD_SIZE*BWU+MBW2+1+( MAX_BW+1 )*( MAX_BW- $ BWU ) ), MAX_BW+1, AF( WORK_U+ODD_SIZE*BWL+1+ $ MAX_BW-BWU ), MAX_BW, INFO ) * * Modify lower off_diagonal block with diagonal block * * CALL DTBTRS( 'U', 'T', 'N', BWL, MIN( BWU, BWL-1 ), BWL, $ AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, $ BWL-1 )+( MAX_BW+1 )*( MAX_BW-BWL ) ), $ MAX_BW+1, AF( ODD_SIZE*BWU+1+MAX_BW-BWL ), $ MAX_BW, INFO ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'T', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( ( ODD_SIZE )*BWU+1 ), MAX_BW, $ AF( WORK_U+( ODD_SIZE )*BWL+1 ), MAX_BW, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL DGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST / 2 ) * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL DGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * * * Since DTBTRS has no "left-right" option, we must transpose * CALL DLATCPY( 'N', MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ WORK( 1 ), MAX_BW ) * CALL DTBTRS( 'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), $ BWL, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ WORK( 1+MAX_BW*( MAX_BW-BWL ) ), MAX_BW, INFO ) * * Transpose back * CALL DLATCPY( 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW ) * * * * Since DTBTRS has no "left-right" option, we must transpose * CALL DLATCPY( 'N', MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ WORK( 1 ), MAX_BW ) * CALL DTBTRS( 'U', 'T', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), $ BWU, AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, $ MAX_BW-1 ) ), MAX_BW+1, $ WORK( 1+MAX_BW*( MAX_BW-BWU ) ), MAX_BW, INFO ) * * Transpose back * CALL DLATCPY( 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW ) * * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL DGEMM( 'N', 'T', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( ( ODD_SIZE )*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+( ODD_SIZE )*BWL+2*MBW2+1 ), MAX_BW, $ ZERO, WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL DGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ AF( ODD_SIZE*BWU+1 ), MAX_BW, ZERO, WORK( 1 ), $ MAX_BW ) * * Send contribution to offdiagonal block's owning processor. * CALL DGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ COMM_PROC ) * CALL DGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to offdiagonal block's owning processor. * CALL DGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 120 CONTINUE * * 130 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 140 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PDDBTRF * END scalapack-2.0.2/SRC/pddbtrs.f000644 000766 000024 00000064102 10363532303 016152 0ustar00juliestaff000000 000000 SUBROUTINE PDDBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PDDBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PDDBTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * banded diagonally dominant-like distributed * matrix with bandwidth BWL, BWU. * * Routine PDDBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDDBTRF and this is stored in AF. If a linear system * is to be solved using PDDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (max(bwl,bwu)*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NP_SAVE, PART_OFFSET, $ RETURN_CODE, STORE_M_B, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PDDBTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 11*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 11*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 11*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 11*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -15 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 8*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -3 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -4 END IF * IF( LLDA.LT.( BWL+BWU+1 ) ) THEN INFO = -( 8*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 8*100+4 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 11*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 11*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -5 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -7 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 8*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PDDBTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*MAX( BWL, BWU ) ) ) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, 'PDDBTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * WORK_SIZE_MIN = ( MAX( BWL, BWU )*NRHS ) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, 'PDDBTRS: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB( 5 ) PARAM_CHECK( 16, 1 ) = DESCB( 4 ) PARAM_CHECK( 15, 1 ) = DESCB( 3 ) PARAM_CHECK( 14, 1 ) = DESCB( 2 ) PARAM_CHECK( 13, 1 ) = DESCB( 1 ) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA( 5 ) PARAM_CHECK( 10, 1 ) = DESCA( 4 ) PARAM_CHECK( 9, 1 ) = DESCA( 3 ) PARAM_CHECK( 8, 1 ) = DESCA( 1 ) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDDBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 20 END IF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( TRANS, 'N' ) ) THEN * CALL PDDBTRSV( 'L', 'N', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PDDBTRSV( 'U', 'T', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * END IF * * Call backsolve routine * IF( ( LSAME( TRANS, 'C' ) ) .OR. ( LSAME( TRANS, 'T' ) ) ) THEN * CALL PDDBTRSV( 'L', 'T', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PDDBTRSV( 'U', 'N', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * END IF 10 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 20 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PDDBTRS * END scalapack-2.0.2/SRC/pddbtrsv.f000644 000766 000024 00000144531 11750130340 016341 0ustar00juliestaff000000 000000 SUBROUTINE PDDBTRSV( UPLO, TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PDDBTRSV solves a banded triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a banded * triangular matrix factor produced by the * Gaussian elimination code PD@(dom_pre)BTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T * is dictated by the user by the parameter TRANS. * * Routine PDDBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDDBTRF and this is stored in AF. If a linear system * is to be solved using PDDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (max(bwl,bwu)*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * Last modified by: Peter Arbenz, Institute of Scientific Computing, * ETH, Zurich. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MAX_BW, MBW2, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 18, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ DGEMM, DGERV2D, DGESD2D, DLAMOV, DMATADD, $ DTBTRS, DTRMM, GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 9*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 12*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 12*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 12*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 12*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX( BWL, BWU ) MBW2 = MAX_BW*MAX_BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -2 END IF * IF( LWORK.LT.-1 ) THEN INFO = -16 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -3 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 9*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -4 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -5 END IF * IF( LLDA.LT.( BWL+BWU+1 ) ) THEN INFO = -( 9*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 9*100+4 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 12*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 12*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -6 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -8 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 9*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PDDBTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*MAX( BWL, BWU ) ) ) THEN INFO = -( 9*100+4 ) CALL PXERBLA( ICTXT, 'PDDBTRSV, D&C alg.: NB too small', $ -INFO ) RETURN END IF * * WORK_SIZE_MIN = MAX( BWL, BWU )*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, 'PDDBTRSV: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 18, 1 ) = DESCB( 5 ) PARAM_CHECK( 17, 1 ) = DESCB( 4 ) PARAM_CHECK( 16, 1 ) = DESCB( 3 ) PARAM_CHECK( 15, 1 ) = DESCB( 2 ) PARAM_CHECK( 14, 1 ) = DESCB( 1 ) PARAM_CHECK( 13, 1 ) = IB PARAM_CHECK( 12, 1 ) = DESCA( 5 ) PARAM_CHECK( 11, 1 ) = DESCA( 4 ) PARAM_CHECK( 10, 1 ) = DESCA( 3 ) PARAM_CHECK( 9, 1 ) = DESCA( 1 ) PARAM_CHECK( 8, 1 ) = JA PARAM_CHECK( 7, 1 ) = NRHS PARAM_CHECK( 6, 1 ) = BWU PARAM_CHECK( 5, 1 ) = BWL PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 18, 2 ) = 1205 PARAM_CHECK( 17, 2 ) = 1204 PARAM_CHECK( 16, 2 ) = 1203 PARAM_CHECK( 15, 2 ) = 1202 PARAM_CHECK( 14, 2 ) = 1201 PARAM_CHECK( 13, 2 ) = 11 PARAM_CHECK( 12, 2 ) = 905 PARAM_CHECK( 11, 2 ) = 904 PARAM_CHECK( 10, 2 ) = 903 PARAM_CHECK( 9, 2 ) = 901 PARAM_CHECK( 8, 2 ) = 8 PARAM_CHECK( 7, 2 ) = 6 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 16 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 18, PARAM_CHECK, 18, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDDBTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 200 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW END IF * * Offset to workspace for Upper triangular factor * WORK_U = BWU*ODD_SIZE + 3*MBW2 * * * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * IF( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL DTBTRS( UPLO, 'N', 'U', ODD_SIZE, BWL, NRHS, $ A( OFST+1+BWU ), LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL DLAMOV( 'N', BWL, NRHS, $ B( PART_OFFSET+ODD_SIZE-BWL+1 ), LLDB, $ WORK( 1 ), MAX_BW ) * CALL DTRMM( 'L', 'U', 'N', 'N', BWL, NRHS, -ONE, $ A( ( OFST+( BWL+BWU+1 )+( ODD_SIZE-BWL )* $ LLDA ) ), LLDA-1, WORK( 1 ), MAX_BW ) * CALL DMATADD( BWL, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Clear garbage out of workspace block * DO 10 IDUM1 = 1, WORK_SIZE_MIN WORK( IDUM1 ) = 0.0 10 CONTINUE * * IF( MYCOL.NE.0 ) THEN * * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL DGEMM( 'N', 'N', BWU, NRHS, ODD_SIZE, -ONE, AF( 1 ), $ BWU, B( PART_OFFSET+1 ), LLDB, ZERO, $ WORK( 1+MAX_BW-BWU ), MAX_BW ) * END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL DMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 40 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 20 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 30 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * CALL DMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * CALL DMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 20 30 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL DTBTRS( 'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), $ NRHS, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 190 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'T', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( ( ODD_SIZE )*BWU+1 ), MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL DGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 40 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 90 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 50 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 60 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 50 60 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL DGEMM( 'T', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( ( ODD_SIZE )*BWU+1 ), MAX_BW, WORK( 1 ), $ MAX_BW, ONE, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL DTBTRS( 'L', 'T', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), $ NRHS, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 190 END IF * * * ***Modification Loop ******* * 70 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 80 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 70 80 CONTINUE * [End of GOTO Loop] * 90 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL DGEMM( 'T', 'N', ODD_SIZE, NRHS, BWU, -ONE, AF( 1 ), $ BWU, WORK( 1+MAX_BW-BWU ), MAX_BW, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL DLAMOV( 'N', BWL, NRHS, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, WORK( 1+MAX_BW-BWL ), MAX_BW ) * CALL DTRMM( 'L', 'U', 'T', 'N', BWL, NRHS, -ONE, $ A( ( OFST+( BWL+BWU+1 )+( ODD_SIZE-BWL )* $ LLDA ) ), LLDA-1, WORK( 1+MAX_BW-BWL ), $ MAX_BW ) * CALL DMATADD( BWL, NRHS, ONE, WORK( 1+MAX_BW-BWL ), $ MAX_BW, ONE, B( PART_OFFSET+ODD_SIZE-BWL+ $ 1 ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL DTBTRS( UPLO, 'T', 'U', ODD_SIZE, BWL, NRHS, $ A( OFST+1+BWU ), LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF( LSAME( TRANS, 'T' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL DTBTRS( UPLO, 'T', 'N', ODD_SIZE, BWU, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL DLAMOV( 'N', BWU, NRHS, $ B( PART_OFFSET+ODD_SIZE-BWU+1 ), LLDB, $ WORK( 1 ), MAX_BW ) * CALL DTRMM( 'L', 'L', 'T', 'N', BWU, NRHS, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ WORK( 1 ), MAX_BW ) * CALL DMATADD( BWU, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Clear garbage out of workspace block * DO 100 IDUM1 = 1, WORK_SIZE_MIN WORK( IDUM1 ) = 0.0 100 CONTINUE * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL DGEMM( 'N', 'N', BWL, NRHS, ODD_SIZE, -ONE, $ AF( WORK_U+1 ), BWL, B( PART_OFFSET+1 ), $ LLDB, ZERO, WORK( 1+MAX_BW-BWL ), MAX_BW ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL DMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 130 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 110 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 120 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * CALL DMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * CALL DMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 110 120 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL DTBTRS( 'U', 'T', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), $ NRHS, AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, $ MAX_BW-1 ) ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 190 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'T', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( WORK_U+( ODD_SIZE )*BWL+1 ), MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL DGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 130 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 180 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 140 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 150 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 140 150 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL DGEMM( 'T', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( WORK_U+( ODD_SIZE )*BWL+1 ), MAX_BW, $ WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL DTBTRS( 'U', 'N', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), $ NRHS, AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, $ MAX_BW-1 ) ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 190 END IF * * * ***Modification Loop ******* * 160 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 170 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 160 170 CONTINUE * [End of GOTO Loop] * 180 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL DGEMM( 'T', 'N', ODD_SIZE, NRHS, BWL, -ONE, $ AF( WORK_U+1 ), BWL, WORK( 1+MAX_BW-BWL ), $ MAX_BW, ONE, B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL DLAMOV( 'N', BWU, NRHS, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, WORK( 1+MAX_BW-BWU ), MAX_BW+BWL ) * CALL DTRMM( 'L', 'L', 'N', 'N', BWU, NRHS, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ WORK( 1+MAX_BW-BWU ), MAX_BW+BWL ) * CALL DMATADD( BWU, NRHS, ONE, WORK( 1+MAX_BW-BWU ), $ MAX_BW+BWL, ONE, B( PART_OFFSET+ODD_SIZE- $ BWU+1 ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL DTBTRS( UPLO, 'N', 'N', ODD_SIZE, BWU, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * END IF * End of "IF( LSAME( UPLO, 'L' ) )"... 190 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 200 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PDDBTRSV * END scalapack-2.0.2/SRC/pddtsv.f000644 000766 000024 00000046206 10363532303 016021 0ustar00juliestaff000000 000000 SUBROUTINE PDDTSV( N, NRHS, DL, D, DU, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION B( * ), D( * ), DL( * ), DU( * ), WORK( * ) * .. * * * Purpose * ======= * * PDDTSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * tridiagonal diagonally dominant-like distributed * matrix. * * Gaussian elimination without pivoting * is used to factor a reordering * of the matrix into L U. * * See PDDTTRF and PDDTTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (12*NPCOL+3*NB) * +max(10*NPCOL+4*NRHS, 8*NPCOL) * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PDDTTRF, PDDTTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PDDTTRF and PDDTTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSEIF( DESCA( DTYPE_ ) .EQ. 502 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PDDTSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (12*NPCOL+3*NB) * * Factor the matrix * CALL PDDTTRF( N, DL, D, DU, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PDDTSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PDDTTRS( 'N', N, NRHS, DL, D, DU, JA, DESCA, B, IB, DESCB, $ WORK, MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDDTSV', -INFO ) RETURN END IF * RETURN * * End of PDDTSV * END scalapack-2.0.2/SRC/pddttrf.f000644 000766 000024 00000106053 10363532303 016161 0ustar00juliestaff000000 000000 SUBROUTINE PDDTTRF( N, DL, D, DU, JA, DESCA, AF, LAF, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. INTEGER INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION AF( * ), D( * ), DL( * ), DU( * ), WORK( * ) * .. * * * Purpose * ======= * * PDDTTRF computes a LU factorization * of an N-by-N real tridiagonal * diagonally dominant-like distributed matrix * A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PDDTTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = L U * * where U is a tridiagonal upper triangular matrix and L is tridiagonal * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * DL (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDDTTRF and this is stored in AF. If a linear system * is to be solved using PDDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 8*NPCOL * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN, $ LEVEL_DIST, LLDA, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_N_A, $ TEMP, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DDTTRF, $ DDTTRSV, DESC_CONVERT, DGERV2D, DGESD2D, $ DTRRV2D, DTRSD2D, GLOBCHK, IGAMX2D, IGEBR2D, $ IGEBS2D, PXERBLA, RESHAPE * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION DDOT EXTERNAL NUMROC, DDOT * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 6*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -10 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 6*100+6 ) END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 6*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PDDTTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, 'PDDTTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = ( 12*NPCOL+3*NB ) * IF( LAF.LT.LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PDDTTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = 8*NPCOL * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, 'PDDTTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 7, 1 ) = DESCA( 5 ) PARAM_CHECK( 6, 1 ) = DESCA( 4 ) PARAM_CHECK( 5, 1 ) = DESCA( 3 ) PARAM_CHECK( 4, 1 ) = DESCA( 1 ) PARAM_CHECK( 3, 1 ) = JA PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 7, 2 ) = 605 PARAM_CHECK( 6, 2 ) = 604 PARAM_CHECK( 5, 2 ) = 603 PARAM_CHECK( 4, 2 ) = 601 PARAM_CHECK( 3, 2 ) = 5 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 10 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 7, PARAM_CHECK, 7, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDDTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 70 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * Offset to workspace for Upper triangular factor * WORK_U = INT_ONE*ODD_SIZE + 3 * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * IF( MYCOL.LT.NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL DTRSD2D( ICTXT, 'U', 'N', 1, 1, $ DU( PART_OFFSET+ODD_SIZE+1 ), LLDA-1, 0, $ MYCOL+1 ) * END IF * * * Factor main partition A_i = L_i {U_i} in each processor * CALL DDTTRF( ODD_SIZE, DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 20 END IF * * IF( MYCOL.LT.NP-1 ) THEN * * Apply factorization to lower connection block BL_i * Apply factorization to upper connection block BU_i * * * Perform the triangular solve {U_i}^T{BL'}_i^T = {BL_i}^T * * DL( PART_OFFSET+ODD_SIZE+1 ) = ( DL( PART_OFFSET+ODD_SIZE+1 ) ) $ / ( D( PART_OFFSET+ODD_SIZE ) ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i} * * D( PART_OFFSET+ODD_SIZE+1 ) = D( PART_OFFSET+ODD_SIZE+1 ) - $ DL( PART_OFFSET+ODD_SIZE+1 )* $ DU( PART_OFFSET+ODD_SIZE ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 20 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * * Move entry that causes spike to auxiliary storage * AF( WORK_U+1 ) = ( DL( PART_OFFSET+1 ) ) * IF( INFO.EQ.0 ) THEN * * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ . * CALL DDTTRSV( 'L', 'N', ODD_SIZE, INT_ONE, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), AF( WORK_U+1 ), ODD_SIZE, $ INFO ) * * * Calculate the "spike" fillin, ${U_i}^T {{GL}_i}^T = {DU_i}^T$ * CALL DTRRV2D( ICTXT, 'U', 'N', 1, 1, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * CALL DDTTRSV( 'U', 'T', ODD_SIZE, INT_ONE, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = GL_i{GU_i} * AF( ODD_SIZE+3 ) = -ONE*DDOT( ODD_SIZE, AF( 1 ), 1, $ AF( WORK_U+1 ), 1 ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+3 ), $ INT_ONE, 0, MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * AF( ODD_SIZE+1 ) = -ONE*( DL( PART_OFFSET+ODD_SIZE+1 )* $ AF( WORK_U+ODD_SIZE ) ) * * AF( WORK_U+( ODD_SIZE )+1 ) = -ONE* $ DU( PART_OFFSET+ODD_SIZE )*( AF( ODD_SIZE ) ) * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 60 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 50 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, AF( WORK_U+ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * END IF * * Copy last diagonal block into AF storage for subsequent * operations. * AF( ODD_SIZE+2 ) = DBLE( D( PART_OFFSET+ODD_SIZE+1 ) ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL+1 ) * * Add contribution to diagonal block * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + AF( ODD_SIZE+3 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 30 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 40 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 30 40 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... IF( AF( ODD_SIZE+2 ).EQ.ZERO ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * AF( WORK_U+ODD_SIZE+3 ) = AF( ODD_SIZE+1 ) * AF( ODD_SIZE+3 ) = AF( WORK_U+ODD_SIZE+1 ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, AF( WORK_U+ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify lower off_diagonal block with diagonal block * * AF( ODD_SIZE+1 ) = AF( ODD_SIZE+1 ) / ( AF( ODD_SIZE+2 ) ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * WORK( 1 ) = -ONE*( AF( ODD_SIZE+1 ) )* $ AF( WORK_U+( ODD_SIZE )+1 ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( WORK_U+ODD_SIZE+2+1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST / 2 ) * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * AF( ODD_SIZE+3 ) = AF( ODD_SIZE+3 ) / ( AF( ODD_SIZE+2 ) ) * * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*( AF( WORK_U+ODD_SIZE+3 ) ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * WORK( 1 ) = -ONE*AF( WORK_U+ODD_SIZE+3 )*AF( ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*AF( WORK_U+ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 50 CONTINUE * * 60 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 70 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PDDTTRF * END scalapack-2.0.2/SRC/pddttrs.f000644 000766 000024 00000066621 10363532303 016204 0ustar00juliestaff000000 000000 SUBROUTINE PDDTTRS( TRANS, N, NRHS, DL, D, DU, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION AF( * ), B( * ), D( * ), DL( * ), DU( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PDDTTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PDDTTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * tridiagonal diagonally dominant-like distributed * matrix. * * Routine PDDTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDDTTRF and this is stored in AF. If a linear system * is to be solved using PDDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 10*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ MY_NUM_COLS, NB, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, PART_OFFSET, PART_SIZE, RETURN_CODE, $ STORE_M_B, STORE_N_A, TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 15, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PDDTTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 11*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 11*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 11*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 11*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -15 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 8*100+6 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 11*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 11*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -3 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -7 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 8*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PDDTTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, 'PDDTTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * WORK_SIZE_MIN = 10*NPCOL + 4*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, 'PDDTTRS: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 15, 1 ) = DESCB( 5 ) PARAM_CHECK( 14, 1 ) = DESCB( 4 ) PARAM_CHECK( 13, 1 ) = DESCB( 3 ) PARAM_CHECK( 12, 1 ) = DESCB( 2 ) PARAM_CHECK( 11, 1 ) = DESCB( 1 ) PARAM_CHECK( 10, 1 ) = IB PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = NRHS PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 15, 2 ) = 1105 PARAM_CHECK( 14, 2 ) = 1104 PARAM_CHECK( 13, 2 ) = 1103 PARAM_CHECK( 12, 2 ) = 1102 PARAM_CHECK( 11, 2 ) = 1101 PARAM_CHECK( 10, 2 ) = 10 PARAM_CHECK( 9, 2 ) = 805 PARAM_CHECK( 8, 2 ) = 804 PARAM_CHECK( 7, 2 ) = 803 PARAM_CHECK( 6, 2 ) = 801 PARAM_CHECK( 5, 2 ) = 7 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 15, PARAM_CHECK, 15, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDDTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 20 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( TRANS, 'N' ) ) THEN * CALL PDDTTRSV( 'L', 'N', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ELSE * CALL PDDTTRSV( 'U', 'T', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * END IF * * Call backsolve routine * IF( ( LSAME( TRANS, 'C' ) ) .OR. ( LSAME( TRANS, 'T' ) ) ) THEN * CALL PDDTTRSV( 'L', 'T', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ELSE * CALL PDDTTRSV( 'U', 'N', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * END IF 10 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 20 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PDDTTRS * END scalapack-2.0.2/SRC/pddttrsv.f000644 000766 000024 00000142016 10363532303 016363 0ustar00juliestaff000000 000000 SUBROUTINE PDDTTRSV( UPLO, TRANS, N, NRHS, DL, D, DU, JA, DESCA, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION AF( * ), B( * ), D( * ), DL( * ), DU( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PDDTTRSV solves a tridiagonal triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a tridiagonal * triangular matrix factor produced by the * Gaussian elimination code PD@(dom_pre)TTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T * is dictated by the user by the parameter TRANS. * * Routine PDDTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDDTTRF and this is stored in AF. If a linear system * is to be solved using PDDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 10*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_M_B, STORE_N_A, $ TEMP, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DAXPY, DDTTRSV, $ DESC_CONVERT, DGEMM, DGERV2D, DGESD2D, DMATADD, $ DTBTRS, GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 9*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 12*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 12*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 12*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 12*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -2 END IF * IF( LWORK.LT.-1 ) THEN INFO = -16 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -3 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 9*100+6 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 12*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 12*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -4 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -8 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 9*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PDDTTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 9*100+4 ) CALL PXERBLA( ICTXT, 'PDDTTRSV, D&C alg.: NB too small', $ -INFO ) RETURN END IF * * WORK_SIZE_MIN = INT_ONE*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, 'PDDTTRSV: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB( 5 ) PARAM_CHECK( 15, 1 ) = DESCB( 4 ) PARAM_CHECK( 14, 1 ) = DESCB( 3 ) PARAM_CHECK( 13, 1 ) = DESCB( 2 ) PARAM_CHECK( 12, 1 ) = DESCB( 1 ) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA( 5 ) PARAM_CHECK( 9, 1 ) = DESCA( 4 ) PARAM_CHECK( 8, 1 ) = DESCA( 3 ) PARAM_CHECK( 7, 1 ) = DESCA( 1 ) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1205 PARAM_CHECK( 15, 2 ) = 1204 PARAM_CHECK( 14, 2 ) = 1203 PARAM_CHECK( 13, 2 ) = 1202 PARAM_CHECK( 12, 2 ) = 1201 PARAM_CHECK( 11, 2 ) = 11 PARAM_CHECK( 10, 2 ) = 905 PARAM_CHECK( 9, 2 ) = 904 PARAM_CHECK( 8, 2 ) = 903 PARAM_CHECK( 7, 2 ) = 901 PARAM_CHECK( 6, 2 ) = 8 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 16 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDDTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 180 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * Offset to workspace for Upper triangular factor * WORK_U = INT_ONE*ODD_SIZE + 3 * * * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * IF( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL DDTTRSV( UPLO, 'N', ODD_SIZE, NRHS, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL DAXPY( NRHS, -DL( PART_OFFSET+ODD_SIZE+1 ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL DGEMM( 'T', 'N', INT_ONE, NRHS, ODD_SIZE, -ONE, $ AF( 1 ), ODD_SIZE, B( PART_OFFSET+1 ), LLDB, $ ZERO, WORK( 1+INT_ONE-INT_ONE ), INT_ONE ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL DMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 30 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 10 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 20 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * CALL DMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * CALL DMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 10 20 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL DTBTRS( 'L', 'N', 'U', INT_ONE, $ MIN( INT_ONE, INT_ONE-1 ), NRHS, $ AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ( ODD_SIZE )*INT_ONE+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL DGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ODD_SIZE*INT_ONE+2+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 30 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 80 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 40 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 50 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 40 50 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL DGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ODD_SIZE*INT_ONE+2+1 ), INT_ONE, $ WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ( ODD_SIZE )*INT_ONE+1 ), INT_ONE, $ WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL DTBTRS( 'L', 'T', 'U', INT_ONE, $ MIN( INT_ONE, INT_ONE-1 ), NRHS, $ AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * ***Modification Loop ******* * 60 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 70 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 60 70 CONTINUE * [End of GOTO Loop] * 80 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL DGEMM( 'N', 'N', ODD_SIZE, NRHS, INT_ONE, -ONE, $ AF( 1 ), ODD_SIZE, WORK( 1+INT_ONE-INT_ONE ), $ INT_ONE, ONE, B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL DAXPY( NRHS, -( DL( PART_OFFSET+ODD_SIZE+1 ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL DDTTRSV( UPLO, 'T', ODD_SIZE, NRHS, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF( LSAME( TRANS, 'T' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL DDTTRSV( UPLO, 'T', ODD_SIZE, NRHS, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL DAXPY( NRHS, -( DU( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL DGEMM( 'T', 'N', INT_ONE, NRHS, ODD_SIZE, -ONE, $ AF( WORK_U+1 ), ODD_SIZE, B( PART_OFFSET+1 ), $ LLDB, ZERO, WORK( 1+INT_ONE-INT_ONE ), $ INT_ONE ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL DMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 110 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 90 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 100 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * CALL DMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * CALL DMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 90 100 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL DTBTRS( 'U', 'T', 'N', INT_ONE, $ MIN( INT_ONE, INT_ONE-1 ), NRHS, $ AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+( ODD_SIZE )*INT_ONE+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL DGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+ODD_SIZE*INT_ONE+2+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 110 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 160 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 120 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 130 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 120 130 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL DGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+ODD_SIZE*INT_ONE+2+1 ), INT_ONE, $ WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+( ODD_SIZE )*INT_ONE+1 ), INT_ONE, $ WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL DTBTRS( 'U', 'N', 'N', INT_ONE, $ MIN( INT_ONE, INT_ONE-1 ), NRHS, $ AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * ***Modification Loop ******* * 140 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 150 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 140 150 CONTINUE * [End of GOTO Loop] * 160 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL DGEMM( 'N', 'N', ODD_SIZE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+1 ), ODD_SIZE, $ WORK( 1+INT_ONE-INT_ONE ), INT_ONE, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL DAXPY( NRHS, -( DU( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL DDTTRSV( UPLO, 'N', ODD_SIZE, NRHS, $ DU( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * END IF * End of "IF( LSAME( UPLO, 'L' ) )"... 170 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 180 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PDDTTRSV * END scalapack-2.0.2/SRC/pdgbsv.f000644 000766 000024 00000045423 10363532303 016002 0ustar00juliestaff000000 000000 SUBROUTINE PDGBSV( N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, B, IB, $ DESCB, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER BWL, BWU, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) DOUBLE PRECISION A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PDGBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * banded distributed * matrix with bandwidth BWL, BWU. * * Gaussian elimination with pivoting * is used to factor a reordering * of the matrix into P L U. * * See PDGBTRF and PDGBTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * +max(NRHS*(NB+2*bwl+4*bwu), 1) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * nonsingular, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * nonsingular, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PDGBTRF, PDGBTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PDGBTRF and PDGBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PDGBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * * Factor the matrix * CALL PDGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PDGBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PDGBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, B, IB, $ DESCB, WORK, MIN( LWORK, WS_FACTOR ), $ WORK( 1+WS_FACTOR), LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGBSV', -INFO ) RETURN END IF * RETURN * * End of PDGBSV * END scalapack-2.0.2/SRC/pdgbtrf.f000644 000766 000024 00000110227 11750130340 016134 0ustar00juliestaff000000 000000 SUBROUTINE PDGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, AF, LAF, $ WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) DOUBLE PRECISION A( * ), AF( * ), WORK( * ) * .. * * Purpose * ======= * * PDGBTRF computes a LU factorization * of an N-by-N real banded * distributed matrix * with bandwidth BWL, BWU: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PDGBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) Q = L U * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P and Q are permutation matrices. * The matrix Q represents reordering of columns * for parallelism's sake, while P represents * reordering of rows for numerical stability using * classic partial pivoting. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDGBTRF and this is stored in AF. If a linear system * is to be solved using PDGBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 1 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * nonsingular, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * nonsingular, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * ===================================================================== * * Implemented for ScaLAPACK by: * Andrew J. Cleary, Livermore National Lab and University of Tenn., * and Markus Hegland, Australian National University. Feb., 1997. * Based on code written by : Peter Arbenz, ETH Zurich, 1996. * Last modified by: Peter Arbenz, Institute of Scientific Computing, * ETH, Zurich. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER APTR, BBPTR, BIPTR, BM, BM1, BM2, BMN, BN, BW, $ CSRC, DBPTR, FIRST_PROC, I, I1, I2, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, J, JA_NEW, JPTR, $ L, LAF_MIN, LBWL, LBWU, LDB, LDBB, LLDA, LM, $ LMJ, LN, LNJ, LPTR, MYCOL, MYROW, MY_NUM_COLS, $ NB, NEICOL, NP, NPACT, NPCOL, NPROW, NPSTR, $ NP_SAVE, NRHS, ODD_N, ODD_SIZE, ODPTR, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ DGBTRF, DGEMM, DGER, DGERV2D, DGESD2D, DGETRF, $ DLAMOV, DLASWP, DLATCPY, DSWAP, DTRRV2D, $ DTRSD2D, DTRSM, GLOBCHK, IGAMX2D, IGEBR2D, $ IGEBS2D, PXERBLA, RESHAPE * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 6*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -11 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 6*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -2 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -3 END IF * IF( LLDA.LT.( 2*BWL+2*BWU+1 ) ) THEN INFO = -( 6*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 6*100+4 ) END IF * BW = BWU + BWL * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 6*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PDGBTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.( BWL+BWU+1 ) ) ) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, 'PDGBTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = ( NB+BWU )*( BWL+BWU ) + 6*( BWL+BWU )*( BWL+2*BWU ) * IF( LAF.LT.LAF_MIN ) THEN INFO = -9 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PDGBTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = 1 * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -11 * put minimum value of work into work( 1 ) WORK( 1 ) = WORK_SIZE_MIN CALL PXERBLA( ICTXT, 'PDGBTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BWU PARAM_CHECK( 3, 1 ) = BWL PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 11 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDGBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 210 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * DO 30 J = 1, ODD_SIZE DO 20 I = 1, BW A( I+( J-1 )*LLDA ) = ZERO 20 CONTINUE 30 CONTINUE * * Begin main code * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Transfer triangle B_i of local matrix to next processor * for fillin. Overlap the send with the factorization of A_i. * IF( MYCOL.LE.NPCOL-2 ) THEN * * The last processor does not need to send anything. * BIPTR = location of triangle B_i in memory BIPTR = ( NB-BW )*LLDA + 2*BW + 1 * CALL DTRSD2D( ICTXT, 'U', 'N', $ MIN( BW, BWU+NUMROC( N, NB, MYCOL+1, 0, $ NPCOL ) ), BW, A( BIPTR ), LLDA-1, 0, MYCOL+1 ) * END IF * * Factor main partition P_i A_i = L_i U_i on each processor * * LBWL, LBWU: lower and upper bandwidth of local solver * Note that for MYCOL > 0 one has lower triangular blocks! * LM is the number of rows which is usually NB except for * MYCOL = 0 where it is BWU less and MYCOL=NPCOL-1 where it * is NR+BWU where NR is the number of columns on the last processor * Finally APTR is the pointer to the first element of A. As LAPACK * has a slightly different matrix format than Scalapack the pointer * has to be adjusted on processor MYCOL=0. * IF( MYCOL.NE.0 ) THEN LBWL = BW LBWU = 0 APTR = 1 ELSE LBWL = BWL LBWU = BWU APTR = 1 + BWU END IF * IF( MYCOL.NE.NPCOL-1 ) THEN LM = NB - LBWU LN = NB - BW ELSE IF( MYCOL.NE.0 ) THEN LM = ODD_SIZE + BWU LN = MAX( ODD_SIZE-BW, 0 ) ELSE LM = N LN = MAX( N-BW, 0 ) END IF * IF( LN.GT.0 ) THEN * CALL DGBTRF( LM, LN, LBWL, LBWU, A( APTR ), LLDA, IPIV, INFO ) * IF( INFO.NE.0 ) THEN INFO = INFO + NB*MYCOL GO TO 80 END IF * NRHS = BW LDB = LLDA - 1 * * Update the last BW columns of A_i (code modified from DGBTRS) * * Only the eliminations of unknowns > LN-BW have an effect on * the last BW columns. Loop over them... * DO 40 J = MAX( LN-BW+1, 1 ), LN * LMJ = MIN( LBWL, LM-J ) LNJ = MIN( BW, J+BW-LN+APTR-1 ) * L = IPIV( J ) * JPTR = J - ( LN+1 ) + 2*BW + 1 - LBWL + LN*LLDA * IF( L.NE.J ) THEN * * Element (L,LN+1) is swapped with element (J,LN+1) etc * Furthermore, the elements in the same row are LDB=LLDA-1 apart * The complicated formulas are to cope with the banded * data format: * LPTR = L - ( LN+1 ) + 2*BW + 1 - LBWL + LN*LLDA * CALL DSWAP( LNJ, A( LPTR ), LDB, A( JPTR ), LDB ) * END IF * * LPTR is the pointer to the beginning of the * coefficients of L * LPTR = BW + 1 + APTR + ( J-1 )*LLDA * CALL DGER( LMJ, LNJ, -ONE, A( LPTR ), 1, A( JPTR ), LDB, $ A( JPTR+1 ), LDB ) 40 CONTINUE * END IF * * Compute spike fill-in, L_i F_i = P_i B_{i-1} * * Receive triangle B_{i-1} from previous processor * IF( MYCOL.GT.0 ) THEN CALL DTRRV2D( ICTXT, 'U', 'N', MIN( BW, LM ), BW, AF( 1 ), BW, $ 0, MYCOL-1 ) * * Transpose transmitted upper triangular (trapezoidal) matrix * DO 60 I2 = 1, MIN( BW, LM ) DO 50 I1 = I2 + 1, BW AF( I1+( I2-1 )*BW ) = AF( I2+( I1-1 )*BW ) AF( I2+( I1-1 )*BW ) = ZERO 50 CONTINUE 60 CONTINUE * * Permutation and forward elimination (triang. solve) * DO 70 J = 1, LN * LMJ = MIN( LBWL, LM-J ) L = IPIV( J ) * IF( L.NE.J ) THEN CALL DSWAP( BW, AF( ( L-1 )*BW+1 ), 1, $ AF( ( J-1 )*BW+1 ), 1 ) END IF * LPTR = BW + 1 + APTR + ( J-1 )*LLDA * CALL DGER( NRHS, LMJ, -ONE, AF( ( J-1 )*BW+1 ), 1, $ A( LPTR ), 1, AF( J*BW+1 ), BW ) * 70 CONTINUE * END IF * 80 CONTINUE * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Define the initial dimensions of the diagonal blocks * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW * IF( MYCOL.NE.NPCOL-1 ) THEN BM = BW - LBWU BN = BW ELSE BM = MIN( BW, ODD_SIZE ) + BWU BN = MIN( BW, ODD_SIZE ) END IF * * Pointer to first element of block bidiagonal matrix in AF * Leading dimension of block bidiagonal system * BBPTR = ( NB+BWU )*BW + 1 LDBB = 2*BW + BWU * * Copy from A and AF into block bidiagonal matrix (tail of AF) * * DBPTR = Pointer to diagonal blocks in A DBPTR = BW + 1 + LBWU + LN*LLDA * CALL DLAMOV( 'G', BM, BN, A( DBPTR ), LLDA-1, AF( BBPTR+BW*LDBB ), $ LDBB ) * * Zero out any junk entries that were copied * DO 100 J = 1, BM DO 90 I = J + LBWL, BM - 1 AF( BBPTR+BW*LDBB+( J-1 )*LDBB+I ) = ZERO 90 CONTINUE 100 CONTINUE * IF( MYCOL.NE.0 ) THEN * * ODPTR = Pointer to offdiagonal blocks in A * ODPTR = ( LM-BM )*BW + 1 CALL DLATCPY( 'G', BW, BM, AF( ODPTR ), BW, $ AF( BBPTR+2*BW*LDBB ), LDBB ) END IF * IF( NPCOL.EQ.1 ) THEN * * In this case the loop over the levels will not be * performed. CALL DGETRF( N-LN, N-LN, AF( BBPTR+BW*LDBB ), LDBB, $ IPIV( LN+1 ), INFO ) * END IF * * Loop over levels ... only occurs if npcol > 1 * * The two integers NPACT (nu. of active processors) and NPSTR * (stride between active processors) are used to control the * loop. * NPACT = NPCOL NPSTR = 1 * * Begin loop over levels * 110 CONTINUE IF( NPACT.LE.1 ) $ GO TO 190 * * Test if processor is active * IF( MOD( MYCOL, NPSTR ).EQ.0 ) THEN * * Send/Receive blocks * * IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN * * This node will potentially do more work later * NEICOL = MYCOL + NPSTR * IF( NEICOL / NPSTR.LT.NPACT-1 ) THEN BMN = BW ELSE IF( NEICOL / NPSTR.EQ.NPACT-1 ) THEN ODD_N = NUMROC( N, NB, NPCOL-1, 0, NPCOL ) BMN = MIN( BW, ODD_N ) + BWU ELSE * * Last processor skips to next level GO TO 180 END IF * * BM1 = M for 1st block on proc pair, BM2 2nd block * BM1 = BM BM2 = BMN * IF( NEICOL / NPSTR.LE.NPACT-1 ) THEN * CALL DGESD2D( ICTXT, BM, 2*BW, AF( BBPTR+BW*LDBB ), LDBB, $ 0, NEICOL ) * CALL DGERV2D( ICTXT, BMN, 2*BW, AF( BBPTR+BM ), LDBB, 0, $ NEICOL ) * IF( NPACT.EQ.2 ) THEN * * Copy diagonal block to align whole system * CALL DLAMOV( 'G', BMN, BW, AF( BBPTR+BM ), LDBB, $ AF( BBPTR+2*BW*LDBB+BM ), LDBB ) END IF * END IF * ELSE * * This node stops work after this stage -- an extra copy * is required to make the odd and even frontal matrices * look identical * NEICOL = MYCOL - NPSTR * IF( NEICOL.EQ.0 ) THEN BMN = BW - BWU ELSE BMN = BW END IF * BM1 = BMN BM2 = BM * CALL DGESD2D( ICTXT, BM, 2*BW, AF( BBPTR+BW*LDBB ), LDBB, 0, $ NEICOL ) * CALL DLAMOV( 'G', BM, 2*BW, AF( BBPTR+BW*LDBB ), LDBB, $ AF( BBPTR+BMN ), LDBB ) * DO 130 J = BBPTR + 2*BW*LDBB, BBPTR + 3*BW*LDBB - 1, LDBB DO 120 I = 0, LDBB - 1 AF( I+J ) = ZERO 120 CONTINUE 130 CONTINUE * CALL DGERV2D( ICTXT, BMN, 2*BW, AF( BBPTR+BW*LDBB ), LDBB, $ 0, NEICOL ) * IF( NPACT.EQ.2 ) THEN * * Copy diagonal block to align whole system * CALL DLAMOV( 'G', BM, BW, AF( BBPTR+BMN ), LDBB, $ AF( BBPTR+2*BW*LDBB+BMN ), LDBB ) END IF * END IF * * LU factorization with partial pivoting * IF( NPACT.NE.2 ) THEN * CALL DGETRF( BM+BMN, BW, AF( BBPTR+BW*LDBB ), LDBB, $ IPIV( LN+1 ), INFO ) * * Backsolve left side * DO 150 J = BBPTR, BBPTR + BW*LDBB - 1, LDBB DO 140 I = 0, BM1 - 1 AF( I+J ) = ZERO 140 CONTINUE 150 CONTINUE * CALL DLASWP( BW, AF( BBPTR ), LDBB, 1, BW, IPIV( LN+1 ), 1 ) * CALL DTRSM( 'L', 'L', 'N', 'U', BW, BW, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, AF( BBPTR ), LDBB ) * * Use partial factors to update remainder * CALL DGEMM( 'N', 'N', BM+BMN-BW, BW, BW, -ONE, $ AF( BBPTR+BW*LDBB+BW ), LDBB, AF( BBPTR ), LDBB, $ ONE, AF( BBPTR+BW ), LDBB ) * * Backsolve right side * NRHS = BW * CALL DLASWP( NRHS, AF( BBPTR+2*BW*LDBB ), LDBB, 1, BW, $ IPIV( LN+1 ), 1 ) * CALL DTRSM( 'L', 'L', 'N', 'U', BW, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, $ AF( BBPTR+2*BW*LDBB ), LDBB ) * * Use partial factors to update remainder * CALL DGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, -ONE, $ AF( BBPTR+BW*LDBB+BW ), LDBB, $ AF( BBPTR+2*BW*LDBB ), LDBB, ONE, $ AF( BBPTR+2*BW*LDBB+BW ), LDBB ) * * * Test if processor is active in next round * IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN * * Reset BM * BM = BM1 + BM2 - BW * * Local copying in the block bidiagonal area * * CALL DLAMOV( 'G', BM, BW, AF( BBPTR+BW ), LDBB, $ AF( BBPTR+BW*LDBB ), LDBB ) CALL DLAMOV( 'G', BM, BW, AF( BBPTR+2*BW*LDBB+BW ), LDBB, $ AF( BBPTR+2*BW*LDBB ), LDBB ) * * Zero out space that held original copy * DO 170 J = 0, BW - 1 DO 160 I = 0, BM - 1 AF( BBPTR+2*BW*LDBB+BW+J*LDBB+I ) = ZERO 160 CONTINUE 170 CONTINUE * END IF * ELSE * * Factor the final 2 by 2 block matrix * CALL DGETRF( BM+BMN, BM+BMN, AF( BBPTR+BW*LDBB ), LDBB, $ IPIV( LN+1 ), INFO ) END IF * END IF * * Last processor in an odd-sized NPACT skips to here * 180 CONTINUE * NPACT = ( NPACT+1 ) / 2 NPSTR = NPSTR*2 GO TO 110 * 190 CONTINUE * End loop over levels * 200 CONTINUE * If error was found in Phase 1, processors jump here. * * Free BLACS space used to hold standard-form grid. * ICTXT = ICTXT_SAVE IF( ICTXT.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 210 CONTINUE * If this processor did not hold part of the grid it * jumps here. * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PDGBTRF * END scalapack-2.0.2/SRC/pdgbtrs.f000644 000766 000024 00000113150 11750130340 016147 0ustar00juliestaff000000 000000 SUBROUTINE PDGBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) DOUBLE PRECISION A( * ), AF( * ), B( * ), WORK( * ) * .. * * Purpose * ======= * * PDGBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PDGBTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * banded distributed * matrix with bandwidth BWL, BWU. * * Routine PDGBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDGBTRF and this is stored in AF. If a linear system * is to be solved using PDGBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * NRHS*(NB+2*bwl+4*bwu) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * ===================================================================== * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * ===================================================================== * * Implemented for ScaLAPACK by: * Andrew J. Cleary, Livermore National Lab and University of Tenn., * and Markus Hegland, Australian National University. Feb., 1997. * Based on code written by : Peter Arbenz, ETH Zurich, 1996. * Last modified by: Peter Arbenz, Institute of Scientific Computing, * ETH, Zurich. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC, $ FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, J, JA_NEW, L, LBWL, LBWU, LDBB, $ LDW, LLDA, LLDB, LM, LMJ, LN, LPTR, MYCOL, $ MYROW, NB, NEICOL, NP, NPACT, NPCOL, NPROW, $ NPSTR, NP_SAVE, ODD_SIZE, PART_OFFSET, $ RECOVERY_VAL, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN, WPTR * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DCOPY, $ DESC_CONVERT, DGEMM, DGEMV, DGER, DGERV2D, $ DGESD2D, DGETRS, DLAMOV, DLASWP, DSCAL, DSWAP, $ DTRSM, GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 11*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 11*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 11*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 11*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -16 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 8*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -3 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -4 END IF * IF( LLDA.LT.( 2*BWL+2*BWU+1 ) ) THEN INFO = -( 8*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 8*100+4 ) END IF * BW = BWU + BWL * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 11*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 11*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -5 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -7 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 8*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PDGBTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.( BWL+BWU+1 ) ) ) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, 'PDGBTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check worksize * WORK_SIZE_MIN = NRHS*( NB+2*BWL+4*BWU ) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, 'PDGBTRS: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB( 5 ) PARAM_CHECK( 16, 1 ) = DESCB( 4 ) PARAM_CHECK( 15, 1 ) = DESCB( 3 ) PARAM_CHECK( 14, 1 ) = DESCB( 2 ) PARAM_CHECK( 13, 1 ) = DESCB( 1 ) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA( 5 ) PARAM_CHECK( 10, 1 ) = DESCA( 4 ) PARAM_CHECK( 9, 1 ) = DESCA( 3 ) PARAM_CHECK( 8, 1 ) = DESCA( 1 ) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 16 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDGBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 100 END IF * * * * Begin main code * * Move data into workspace - communicate/copy (overlap) * IF( MYCOL.LT.NPCOL-1 ) THEN CALL DGESD2D( ICTXT, BWU, NRHS, B( NB-BWU+1 ), LLDB, 0, $ MYCOL+1 ) END IF * IF( MYCOL.LT.NPCOL-1 ) THEN LM = NB - BWU ELSE LM = NB END IF * IF( MYCOL.GT.0 ) THEN WPTR = BWU + 1 ELSE WPTR = 1 END IF * LDW = NB + BWU + 2*BW + BWU * CALL DLAMOV( 'G', LM, NRHS, B( 1 ), LLDB, WORK( WPTR ), LDW ) * * Zero out rest of work * DO 20 J = 1, NRHS DO 10 L = WPTR + LM, LDW WORK( ( J-1 )*LDW+L ) = ZERO 10 CONTINUE 20 CONTINUE * IF( MYCOL.GT.0 ) THEN CALL DGERV2D( ICTXT, BWU, NRHS, WORK( 1 ), LDW, 0, MYCOL-1 ) END IF * ******************************************************************** * PHASE 1: Local computation phase -- Solve L*X = B ******************************************************************** * * Size of main (or odd) partition in each processor * ODD_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( MYCOL.NE.0 ) THEN LBWL = BW LBWU = 0 APTR = 1 ELSE LBWL = BWL LBWU = BWU APTR = 1 + BWU END IF * IF( MYCOL.NE.NPCOL-1 ) THEN LM = NB - LBWU LN = NB - BW ELSE IF( MYCOL.NE.0 ) THEN LM = ODD_SIZE + BWU LN = MAX( ODD_SIZE-BW, 0 ) ELSE LM = N LN = MAX( N-BW, 0 ) END IF * DO 30 J = 1, LN * LMJ = MIN( LBWL, LM-J ) L = IPIV( J ) * IF( L.NE.J ) THEN CALL DSWAP( NRHS, WORK( L ), LDW, WORK( J ), LDW ) END IF * LPTR = BW + 1 + ( J-1 )*LLDA + APTR * CALL DGER( LMJ, NRHS, -ONE, A( LPTR ), 1, WORK( J ), LDW, $ WORK( J+1 ), LDW ) * 30 CONTINUE * ******************************************************************** * PHASE 2: Global computation phase -- Solve L*X = B ******************************************************************** * * Define the initial dimensions of the diagonal blocks * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW * IF( MYCOL.NE.NPCOL-1 ) THEN BM = BW - LBWU BN = BW ELSE BM = MIN( BW, ODD_SIZE ) + BWU BN = MIN( BW, ODD_SIZE ) END IF * * Pointer to first element of block bidiagonal matrix in AF * Leading dimension of block bidiagonal system * BBPTR = ( NB+BWU )*BW + 1 LDBB = 2*BW + BWU * IF( NPCOL.EQ.1 ) THEN * * In this case the loop over the levels will not be * performed. CALL DGETRS( 'N', N-LN, NRHS, AF( BBPTR+BW*LDBB ), LDBB, $ IPIV( LN+1 ), WORK( LN+1 ), LDW, INFO ) * END IF * * Loop over levels ... * * The two integers NPACT (nu. of active processors) and NPSTR * (stride between active processors) is used to control the * loop. * NPACT = NPCOL NPSTR = 1 * * Begin loop over levels 40 CONTINUE IF( NPACT.LE.1 ) $ GO TO 50 * * Test if processor is active IF( MOD( MYCOL, NPSTR ).EQ.0 ) THEN * * Send/Receive blocks * IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN * NEICOL = MYCOL + NPSTR * IF( NEICOL / NPSTR.LE.NPACT-1 ) THEN * IF( NEICOL / NPSTR.LT.NPACT-1 ) THEN BMN = BW ELSE BMN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) + $ BWU END IF * CALL DGESD2D( ICTXT, BM, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * IF( NPACT.NE.2 ) THEN * * Receive answers back from partner processor * CALL DGERV2D( ICTXT, BM+BMN-BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * BM = BM + BMN - BW * END IF * END IF * ELSE * NEICOL = MYCOL - NPSTR * IF( NEICOL.EQ.0 ) THEN BMN = BW - BWU ELSE BMN = BW END IF * CALL DLAMOV( 'G', BM, NRHS, WORK( LN+1 ), LDW, $ WORK( NB+BWU+BMN+1 ), LDW ) * CALL DGERV2D( ICTXT, BMN, NRHS, WORK( NB+BWU+1 ), LDW, 0, $ NEICOL ) * * and do the permutations and eliminations * IF( NPACT.NE.2 ) THEN * * Solve locally for BW variables * CALL DLASWP( NRHS, WORK( NB+BWU+1 ), LDW, 1, BW, $ IPIV( LN+1 ), 1 ) * CALL DTRSM( 'L', 'L', 'N', 'U', BW, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+1 ), $ LDW ) * * Use soln just calculated to update RHS * CALL DGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, -ONE, $ AF( BBPTR+BW*LDBB+BW ), LDBB, $ WORK( NB+BWU+1 ), LDW, ONE, $ WORK( NB+BWU+1+BW ), LDW ) * * Give answers back to partner processor * CALL DGESD2D( ICTXT, BM+BMN-BW, NRHS, $ WORK( NB+BWU+1+BW ), LDW, 0, NEICOL ) * ELSE * * Finish up calculations for final level * CALL DLASWP( NRHS, WORK( NB+BWU+1 ), LDW, 1, BM+BMN, $ IPIV( LN+1 ), 1 ) * CALL DTRSM( 'L', 'L', 'N', 'U', BM+BMN, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+1 ), $ LDW ) END IF * END IF * NPACT = ( NPACT+1 ) / 2 NPSTR = NPSTR*2 GO TO 40 * END IF * 50 CONTINUE * * ************************************** * BACKSOLVE ******************************************************************** * PHASE 2: Global computation phase -- Solve U*Y = X ******************************************************************** * IF( NPCOL.EQ.1 ) THEN * * In this case the loop over the levels will not be * performed. * In fact, the backsolve portion was done in the call to * DGETRS in the frontsolve. * END IF * * Compute variable needed to reverse loop structure in * reduced system. * RECOVERY_VAL = NPACT*NPSTR - NPCOL * * Loop over levels * Terminal values of NPACT and NPSTR from frontsolve are used * 60 CONTINUE IF( NPACT.GE.NPCOL ) $ GO TO 80 * NPSTR = NPSTR / 2 * NPACT = NPACT*2 * * Have to adjust npact for non-power-of-2 * NPACT = NPACT - MOD( ( RECOVERY_VAL / NPSTR ), 2 ) * * Find size of submatrix in this proc at this level * IF( MYCOL / NPSTR.LT.NPACT-1 ) THEN BN = BW ELSE BN = MIN( BW, NUMROC( N, NB, NPCOL-1, 0, NPCOL ) ) END IF * * If this processor is even in this level... * IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN * NEICOL = MYCOL + NPSTR * IF( NEICOL / NPSTR.LE.NPACT-1 ) THEN * IF( NEICOL / NPSTR.LT.NPACT-1 ) THEN BMN = BW BNN = BW ELSE BMN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) + BWU BNN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) END IF * IF( NPACT.GT.2 ) THEN * CALL DGESD2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * CALL DGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * ELSE * CALL DGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * END IF * END IF * ELSE * This processor is odd on this level * NEICOL = MYCOL - NPSTR * IF( NEICOL.EQ.0 ) THEN BMN = BW - BWU ELSE BMN = BW END IF * IF( NEICOL.LT.NPCOL-1 ) THEN BNN = BW ELSE BNN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) END IF * IF( NPACT.GT.2 ) THEN * * Move RHS to make room for received solutions * CALL DLAMOV( 'G', BW, NRHS, WORK( NB+BWU+1 ), LDW, $ WORK( NB+BWU+BW+1 ), LDW ) * CALL DGERV2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * CALL DGEMM( 'N', 'N', BW, NRHS, BN, -ONE, AF( BBPTR ), LDBB, $ WORK( LN+1 ), LDW, ONE, WORK( NB+BWU+BW+1 ), $ LDW ) * * IF( MYCOL.GT.NPSTR ) THEN * CALL DGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( BBPTR+2*BW*LDBB ), LDBB, WORK( LN+BW+1 ), $ LDW, ONE, WORK( NB+BWU+BW+1 ), LDW ) * END IF * CALL DTRSM( 'L', 'U', 'N', 'N', BW, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+BW+1 ), $ LDW ) * * Send new solution to neighbor * CALL DGESD2D( ICTXT, BW, NRHS, WORK( NB+BWU+BW+1 ), LDW, 0, $ NEICOL ) * * Copy new solution into expected place * CALL DLAMOV( 'G', BW, NRHS, WORK( NB+BWU+1+BW ), LDW, $ WORK( LN+BW+1 ), LDW ) * ELSE * * Solve with local diagonal block * CALL DTRSM( 'L', 'U', 'N', 'N', BN+BNN, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+1 ), $ LDW ) * * Send new solution to neighbor * CALL DGESD2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ), LDW, 0, $ NEICOL ) * * Shift solutions into expected positions * CALL DLAMOV( 'G', BNN+BN-BW, NRHS, WORK( NB+BWU+1+BW ), LDW, $ WORK( LN+1 ), LDW ) * * IF( ( NB+BWU+1 ).NE.( LN+1+BW ) ) THEN * * Copy one row at a time since spaces may overlap * DO 70 J = 1, BW CALL DCOPY( NRHS, WORK( NB+BWU+J ), LDW, $ WORK( LN+BW+J ), LDW ) 70 CONTINUE * END IF * END IF * END IF * GO TO 60 * 80 CONTINUE * End of loop over levels * ******************************************************************** * PHASE 1: (Almost) Local computation phase -- Solve U*Y = X ******************************************************************** * * Reset BM to value it had before reduced system frontsolve... * IF( MYCOL.NE.NPCOL-1 ) THEN BM = BW - LBWU ELSE BM = MIN( BW, ODD_SIZE ) + BWU END IF * * First metastep is to account for the fillin blocks AF * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, BW, NRHS, WORK( NB-BW+1 ), LDW, 0, $ MYCOL+1 ) * END IF * IF( MYCOL.GT.0 ) THEN * CALL DGERV2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ), LDW, 0, $ MYCOL-1 ) * * Modify local right hand sides with received rhs's * CALL DGEMM( 'T', 'N', LM-BM, NRHS, BW, -ONE, AF( 1 ), BW, $ WORK( NB+BWU+1 ), LDW, ONE, WORK( 1 ), LDW ) * END IF * DO 90 J = LN, 1, -1 * LMJ = MIN( BW, ODD_SIZE-1 ) * LPTR = BW - 1 + J*LLDA + APTR * * In the following, the TRANS=T option is used to reverse * the order of multiplication, not as a true transpose * CALL DGEMV( 'T', LMJ, NRHS, -ONE, WORK( J+1 ), LDW, A( LPTR ), $ LLDA-1, ONE, WORK( J ), LDW ) * * Divide by diagonal element * CALL DSCAL( NRHS, ONE / A( LPTR-LLDA+1 ), WORK( J ), LDW ) 90 CONTINUE * * * CALL DLAMOV( 'G', ODD_SIZE, NRHS, WORK( 1 ), LDW, B( 1 ), LLDB ) * * Free BLACS space used to hold standard-form grid. * ICTXT = ICTXT_SAVE IF( ICTXT.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 100 CONTINUE * * Restore saved input parameters * NP = NP_SAVE * * Output worksize * WORK( 1 ) = WORK_SIZE_MIN * RETURN * * End of PDGBTRS * END scalapack-2.0.2/SRC/pdgebal.f000644 000766 000024 00000033672 11705175572 016132 0ustar00juliestaff000000 000000 SUBROUTINE PDGEBAL( JOB, N, A, DESCA, ILO, IHI, SCALE, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK computational routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER JOB INTEGER IHI, ILO, INFO, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), SCALE( * ) * .. * * Purpose * ======= * * PDGEBAL balances a general real matrix A. This involves, first, * permuting A by a similarity transformation to isolate eigenvalues * in the first 1 to ILO-1 and last IHI+1 to N elements on the * diagonal; and second, applying a diagonal similarity transformation * to rows and columns ILO to IHI to make the rows and columns as * close in norm as possible. Both steps are optional. * * Balancing may reduce the 1-norm of the matrix, and improve the * accuracy of the computed eigenvalues and/or eigenvectors. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * JOB (global input) CHARACTER*1 * Specifies the operations to be performed on A: * = 'N': none: simply set ILO = 1, IHI = N, SCALE(I) = 1.0 * for i = 1,...,N; * = 'P': permute only; * = 'S': scale only; * = 'B': both permute and scale. * * N (global input) INTEGER * The order of the matrix A. N >= 0. * * A (local input/output) DOUBLE PRECISION array, dimension * (DESCA(LLD_,LOCc(N)) * On entry, the input matrix A. * On exit, A is overwritten by the balanced matrix. * If JOB = 'N', A is not referenced. * See Further Details. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ILO (global output) INTEGER * IHI (global output) INTEGER * ILO and IHI are set to integers such that on exit * A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N. * If JOB = 'N' or 'S', ILO = 1 and IHI = N. * * SCALE (global output) DOUBLE PRECISION array, dimension (N) * Details of the permutations and scaling factors applied to * A. If P(j) is the index of the row and column interchanged * with row and column j and D(j) is the scaling factor * applied to row and column j, then * SCALE(j) = P(j) for j = 1,...,ILO-1 * = D(j) for j = ILO,...,IHI * = P(j) for j = IHI+1,...,N. * The order in which the interchanges are made is N to IHI+1, * then 1 to ILO-1. * * INFO (global output) INTEGER * = 0: successful exit. * < 0: if INFO = -i, the i-th argument had an illegal value. * * Further Details * =============== * * The permutations consist of row and column interchanges which put * the matrix in the form * * ( T1 X Y ) * P A P = ( 0 B Z ) * ( 0 0 T2 ) * * where T1 and T2 are upper triangular matrices whose eigenvalues lie * along the diagonal. The column indices ILO and IHI mark the starting * and ending columns of the submatrix B. Balancing consists of applying * a diagonal similarity transformation inv(D) * B * D to make the * 1-norms of each row of B and its corresponding column nearly equal. * The output matrix is * * ( T1 X*D Y ) * ( 0 inv(D)*B*D inv(D)*Z ). * ( 0 0 T2 ) * * Information about the permutations P and the diagonal matrix D is * returned in the vector SCALE. * * This subroutine is based on the EISPACK routine BALANC. In principle, * the parallelism is extracted by using PBLAS and BLACS routines for * the permutation and balancing. * * Modified by Tzu-Yi Chen, Computer Science Division, University of * California at Berkeley, USA * * Parallel version by Robert Granat and Meiyue Shao, Department of * Computing Science and HPC2N, Umea University, Sweden * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) DOUBLE PRECISION SCLFAC PARAMETER ( SCLFAC = 2.0D+0 ) DOUBLE PRECISION FACTOR PARAMETER ( FACTOR = 0.95D+0 ) * .. * .. Local Scalars .. LOGICAL NOCONV INTEGER I, ICA, IEXC, IRA, J, K, L, M, LLDA, $ ICTXT, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ ARSRC, ACSRC DOUBLE PRECISION C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2, ELEM * .. * .. Local Arrays .. DOUBLE PRECISION CR( 2 ) * .. * .. External Functions .. LOGICAL DISNAN, LSAME INTEGER IDAMAX DOUBLE PRECISION DLAMCH EXTERNAL DISNAN, LSAME, DLAMCH * .. * .. External Subroutines .. EXTERNAL PDSCAL, PDSWAP, PDAMAX, PXERBLA, $ BLACS_GRIDINFO, CHK1MAT, DGSUM2D, $ INFOG2L, PDELGET * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. INFO = 0 ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * IF( .NOT.LSAME( JOB, 'N' ) .AND. .NOT.LSAME( JOB, 'P' ) .AND. $ .NOT.LSAME( JOB, 'S' ) .AND. .NOT.LSAME( JOB, 'B' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE CALL CHK1MAT( N, 2, N, 2, 1, 1, DESCA, 4, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( 'PDGEBAL', -INFO ) RETURN END IF * * Extract local leading dimension of A. * LLDA = DESCA( LLD_ ) * K = 1 L = N * IF( N.EQ.0 ) $ GO TO 210 * IF( LSAME( JOB, 'N' ) ) THEN DO 10 I = 1, N SCALE( I ) = ONE 10 CONTINUE GO TO 210 END IF * IF( LSAME( JOB, 'S' ) ) $ GO TO 120 * * Permutation to isolate eigenvalues if possible. * GO TO 50 * * Row and column exchange. * 20 CONTINUE SCALE( M ) = J IF( J.EQ.M ) $ GO TO 30 * CALL PDSWAP( L, A, 1, J, DESCA, 1, A, 1, M, DESCA, 1 ) CALL PDSWAP( N-K+1, A, J, K, DESCA, DESCA(M_), A, M, K, DESCA, $ DESCA(M_) ) * 30 CONTINUE GO TO ( 40, 80 )IEXC * * Search for rows isolating an eigenvalue and push them down. * 40 CONTINUE IF( L.EQ.1 ) $ GO TO 210 L = L - 1 * 50 CONTINUE DO 70 J = L, 1, -1 * DO 60 I = 1, L IF( I.EQ.J ) $ GO TO 60 * * All processors need the information to make correct decisions. * CALL PDELGET( 'All', '1-Tree', ELEM, A, J, I, DESCA ) IF( ELEM.NE.ZERO ) $ GO TO 70 60 CONTINUE * M = L IEXC = 1 GO TO 20 70 CONTINUE * GO TO 90 * * Search for columns isolating an eigenvalue and push them left. * 80 CONTINUE K = K + 1 * 90 CONTINUE DO 110 J = K, L * DO 100 I = K, L IF( I.EQ.J ) $ GO TO 100 * * All processors need the information to make correct decisions. * CALL PDELGET( 'All', '1-Tree', ELEM, A, I, J, DESCA ) IF( ELEM.NE.ZERO ) $ GO TO 110 100 CONTINUE * M = K IEXC = 2 GO TO 20 110 CONTINUE * 120 CONTINUE DO 130 I = K, L SCALE( I ) = ONE 130 CONTINUE * IF( LSAME( JOB, 'P' ) ) $ GO TO 210 * * Balance the submatrix in rows K to L. * * Iterative loop for norm reduction. * SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 140 CONTINUE NOCONV = .FALSE. * DO 200 I = K, L C = ZERO R = ZERO * * Compute local partial values of R and C in parallel and combine * with a call to the BLACS global summation routine distributing * information to all processors. * DO 150 J = K, L IF( J.EQ.I ) $ GO TO 150 CALL INFOG2L( J, I, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, ARSRC, ACSRC ) IF( MYROW.EQ.ARSRC .AND. MYCOL.EQ.ACSRC ) THEN C = C + ABS( A( II + (JJ-1)*LLDA ) ) END IF CALL INFOG2L( I, J, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, ARSRC, ACSRC ) IF( MYROW.EQ.ARSRC .AND. MYCOL.EQ.ACSRC ) THEN R = R + ABS( A( II + (JJ-1)*LLDA ) ) END IF 150 CONTINUE CR( 1 ) = C CR( 2 ) = R CALL DGSUM2D( ICTXT, 'All', '1-Tree', 2, 1, CR, 2, -1, -1 ) C = CR( 1 ) R = CR( 2 ) * * Find global maximum absolute values and indices in parallel. * CALL PDAMAX( L, CA, ICA, A, 1, I, DESCA, 1 ) CALL PDAMAX( N-K+1, RA, IRA, A, I, K, DESCA, DESCA(M_) ) * * Guard against zero C or R due to underflow. * IF( C.EQ.ZERO .OR. R.EQ.ZERO ) $ GO TO 200 G = R / SCLFAC F = ONE S = C + R 160 CONTINUE IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 IF( DISNAN( C+F+CA+R+G+RA ) ) THEN * * Exit if NaN to avoid infinite loop * INFO = -3 CALL PXERBLA( 'PDGEBAL', -INFO ) RETURN END IF F = F*SCLFAC C = C*SCLFAC CA = CA*SCLFAC R = R / SCLFAC G = G / SCLFAC RA = RA / SCLFAC GO TO 160 * 170 CONTINUE G = C / SCLFAC 180 CONTINUE IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 F = F / SCLFAC C = C / SCLFAC G = G / SCLFAC CA = CA / SCLFAC R = R*SCLFAC RA = RA*SCLFAC GO TO 180 * * Now balance. * 190 CONTINUE IF( ( C+R ).GE.FACTOR*S ) $ GO TO 200 IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN IF( F*SCALE( I ).LE.SFMIN1 ) $ GO TO 200 END IF IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN IF( SCALE( I ).GE.SFMAX1 / F ) $ GO TO 200 END IF G = ONE / F SCALE( I ) = SCALE( I )*F NOCONV = .TRUE. * CALL PDSCAL( N-K+1, G, A, I, K, DESCA, DESCA(M_) ) CALL PDSCAL( L, F, A, 1, I, DESCA, 1 ) * 200 CONTINUE * IF( NOCONV ) $ GO TO 140 * 210 CONTINUE ILO = K IHI = L * RETURN * * End of PDGEBAL * END scalapack-2.0.2/SRC/pdgebd2.f000644 000766 000024 00000042576 10363532303 016032 0ustar00juliestaff000000 000000 SUBROUTINE PDGEBD2( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAUP( * ), TAUQ( * ), $ WORK( * ) * .. * * Purpose * ======= * * PDGEBD2 reduces a real general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper or lower bidiagonal * form B by an orthogonal transformation: Q' * sub( A ) * P = B. * * If M >= N, B is upper bidiagonal; if M < N, B is lower bidiagonal. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ). On exit, if M >= N, * the diagonal and the first superdiagonal of sub( A ) are * overwritten with the upper bidiagonal matrix B; the elements * below the diagonal, with the array TAUQ, represent the * orthogonal matrix Q as a product of elementary reflectors, * and the elements above the first superdiagonal, with the * array TAUP, represent the orthogonal matrix P as a product * of elementary reflectors. If M < N, the diagonal and the * first subdiagonal are overwritten with the lower bidiagonal * matrix B; the elements below the first subdiagonal, with the * array TAUQ, represent the orthogonal matrix Q as a product of * elementary reflectors, and the elements above the diagonal, * with the array TAUP, represent the orthogonal matrix P as a * product of elementary reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) DOUBLE PRECISION array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix Q. TAUQ * is tied to the distributed matrix A. See Further Details. * * TAUP (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix P. TAUP * is tied to the distributed matrix A. See Further Details. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( MpA0, NqA0 ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ) * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+IROFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * If m >= n, * * Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors; * v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1); * u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in * A(ia+i-1,ja+i+1:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, * * Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors; * v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); * u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in * A(ia+i-1,ja+i:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) * ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) * ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) * ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) * ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) * ( v1 v2 v3 v4 v5 ) * * where d and e denote diagonal and off-diagonal elements of B, vi * denotes an element of the vector defining H(i), and ui an element of * the vector defining G(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, II, IROFFA, J, $ JJ, K, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 DOUBLE PRECISION ALPHA * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DESCSET, $ DGEBR2D, DGEBS2D, DLARFG, INFOG2L, $ PDLARF, PDLARFG, PDELSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) LWMIN = MAX( MPA0, NQA0 ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( IROFFA.NE.ICOFFA ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEBD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) * IF( M.EQ.1 .AND. N.EQ.1 ) THEN IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN I = II+(JJ-1)*DESCA( LLD_ ) CALL DLARFG( 1, A( I ), A( I ), 1, TAUQ( JJ ) ) D( JJ ) = A( I ) CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, D( JJ ), $ 1 ) CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TAUQ( JJ ), $ 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, D( JJ ), $ 1, IAROW, IACOL ) CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAUQ( JJ ), $ 1, IAROW, IACOL ) END IF END IF IF( MYROW.EQ.IAROW ) $ TAUP( II ) = ZERO RETURN END IF * ALPHA = ZERO * IF( M.GE.N ) THEN * * Reduce to upper bidiagonal form * CALL DESCSET( DESCD, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) DO 10 K = 1, N I = IA + K - 1 J = JA + K - 1 * * Generate elementary reflector H(j) to annihilate * A(ia+i:ia+m-1,j) * CALL PDLARFG( M-K+1, ALPHA, I, J, A, MIN( I+1, M+IA-1 ), $ J, DESCA, 1, TAUQ ) CALL PDELSET( D, 1, J, DESCD, ALPHA ) CALL PDELSET( A, I, J, DESCA, ONE ) * * Apply H(i) to A(i:ia+m-1,i+1:ja+n-1) from the left * CALL PDLARF( 'Left', M-K+1, N-K, A, I, J, DESCA, 1, TAUQ, A, $ I, J+1, DESCA, WORK ) CALL PDELSET( A, I, J, DESCA, ALPHA ) * IF( K.LT.N ) THEN * * Generate elementary reflector G(i) to annihilate * A(i,ja+j+1:ja+n-1) * CALL PDLARFG( N-K, ALPHA, I, J+1, A, I, $ MIN( J+2, JA+N-1 ), DESCA, DESCA( M_ ), $ TAUP ) CALL PDELSET( E, I, 1, DESCE, ALPHA ) CALL PDELSET( A, I, J+1, DESCA, ONE ) * * Apply G(i) to A(i+1:ia+m-1,i+1:ja+n-1) from the right * CALL PDLARF( 'Right', M-K, N-K, A, I, J+1, DESCA, $ DESCA( M_ ), TAUP, A, I+1, J+1, DESCA, $ WORK ) CALL PDELSET( A, I, J+1, DESCA, ALPHA ) ELSE CALL PDELSET( TAUP, I, 1, DESCE, ZERO ) END IF 10 CONTINUE * ELSE * * Reduce to lower bidiagonal form * CALL DESCSET( DESCD, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) DO 20 K = 1, M I = IA + K - 1 J = JA + K - 1 * * Generate elementary reflector G(i) to annihilate * A(i,ja+j:ja+n-1) * CALL PDLARFG( N-K+1, ALPHA, I, J, A, I, $ MIN( J+1, JA+N-1 ), DESCA, DESCA( M_ ), TAUP ) CALL PDELSET( D, I, 1, DESCD, ALPHA ) CALL PDELSET( A, I, J, DESCA, ONE ) * * Apply G(i) to A(i:ia+m-1,j:ja+n-1) from the right * CALL PDLARF( 'Right', M-K, N-K+1, A, I, J, DESCA, $ DESCA( M_ ), TAUP, A, MIN( I+1, IA+M-1 ), J, $ DESCA, WORK ) CALL PDELSET( A, I, J, DESCA, ALPHA ) * IF( K.LT.M ) THEN * * Generate elementary reflector H(i) to annihilate * A(i+2:ia+m-1,j) * CALL PDLARFG( M-K, ALPHA, I+1, J, A, $ MIN( I+2, IA+M-1 ), J, DESCA, 1, TAUQ ) CALL PDELSET( E, 1, J, DESCE, ALPHA ) CALL PDELSET( A, I+1, J, DESCA, ONE ) * * Apply H(i) to A(i+1:ia+m-1,j+1:ja+n-1) from the left * CALL PDLARF( 'Left', M-K, N-K, A, I+1, J, DESCA, 1, TAUQ, $ A, I+1, J+1, DESCA, WORK ) CALL PDELSET( A, I+1, J, DESCA, ALPHA ) ELSE CALL PDELSET( TAUQ, 1, J, DESCE, ZERO ) END IF 20 CONTINUE END IF * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGEBD2 * END scalapack-2.0.2/SRC/pdgebrd.f000644 000766 000024 00000040204 10363532303 016114 0ustar00juliestaff000000 000000 SUBROUTINE PDGEBRD( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAUP( * ), TAUQ( * ), $ WORK( * ) * .. * * Purpose * ======= * * PDGEBRD reduces a real general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper or lower bidiagonal * form B by an orthogonal transformation: Q' * sub( A ) * P = B. * * If M >= N, B is upper bidiagonal; if M < N, B is lower bidiagonal. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ). On exit, if M >= N, * the diagonal and the first superdiagonal of sub( A ) are * overwritten with the upper bidiagonal matrix B; the elements * below the diagonal, with the array TAUQ, represent the * orthogonal matrix Q as a product of elementary reflectors, * and the elements above the first superdiagonal, with the * array TAUP, represent the orthogonal matrix P as a product * of elementary reflectors. If M < N, the diagonal and the * first subdiagonal are overwritten with the lower bidiagonal * matrix B; the elements below the first subdiagonal, with the * array TAUQ, represent the orthogonal matrix Q as a product of * elementary reflectors, and the elements above the diagonal, * with the array TAUP, represent the orthogonal matrix P as a * product of elementary reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) DOUBLE PRECISION array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix Q. TAUQ * is tied to the distributed matrix A. See Further Details. * * TAUP (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix P. TAUP * is tied to the distributed matrix A. See Further Details. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB*( MpA0 + NqA0 + 1 ) + NqA0 * * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * If m >= n, * * Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors; * v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1); * u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in * A(ia+i-1,ja+i+1:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, * * Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors; * v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); * u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in * A(ia+i-1,ja+i:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) * ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) * ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) * ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) * ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) * ( v1 v2 v3 v4 v5 ) * * where d and e denote diagonal and off-diagonal elements of B, vi * denotes an element of the vector defining H(i), and ui an element of * the vector defining G(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICTXT, IINFO, IOFF, IPW, IPY, $ IW, J, JB, JS, JW, K, L, LWMIN, MN, MP, MYCOL, $ MYROW, NB, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCWX( DLEN_ ), DESCWY( DLEN_ ), IDUM1( 1 ), $ IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PDELSET, PDGEBD2, PDGEMM, PDLABRD, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCA( MB_ ) IOFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) MP = NUMROC( M+IOFF, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+IOFF, NB, MYCOL, IACOL, NPCOL ) LWMIN = NB*( MP+NQ+1 ) + NQ * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( IOFF.NE.MOD( JA-1, DESCA( NB_ ) ) ) THEN INFO = -5 ELSE IF( NB.NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 12 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEBRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * MN = MIN( M, N ) IF( MN.EQ.0 ) $ RETURN * * Initialize parameters. * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPY = MP * NB + 1 IPW = NQ * NB + IPY * CALL DESCSET( DESCWX, M+IOFF, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCWY, NB, N+IOFF, NB, NB, IAROW, IACOL, ICTXT, $ NB ) * MP = NUMROC( M+IA-1, NB, MYROW, DESCA( RSRC_ ), NPROW ) NQ = NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) K = 1 JB = NB - IOFF IW = IOFF + 1 JW = IOFF + 1 * DO 10 L = 1, MN+IOFF-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce rows and columns i:i+nb-1 to bidiagonal form and return * the matrices X and Y which are needed to update the unreduced * part of the matrix. * CALL PDLABRD( M-K+1, N-K+1, JB, A, I, J, DESCA, D, E, TAUQ, $ TAUP, WORK, IW, JW, DESCWX, WORK( IPY ), IW, $ JW, DESCWY, WORK( IPW ) ) * * Update the trailing submatrix A(i+nb:ia+m-1,j+nb:ja+n-1), using * an update of the form A := A - V*Y' - X*U'. * CALL PDGEMM( 'No transpose', 'No transpose', M-K-JB+1, $ N-K-JB+1, JB, -ONE, A, I+JB, J, DESCA, $ WORK( IPY ), IW, JW+JB, DESCWY, ONE, A, I+JB, $ J+JB, DESCA ) CALL PDGEMM( 'No transpose', 'No transpose', M-K-JB+1, $ N-K-JB+1, JB, -ONE, WORK, IW+JB, JW, DESCWX, A, I, $ J+JB, DESCA, ONE, A, I+JB, J+JB, DESCA ) * * Copy last off-diagonal elements of B back into sub( A ). * IF( M.GE.N ) THEN JS = MIN( INDXG2L( I+JB-1, NB, 0, DESCA( RSRC_ ), NPROW ), $ MP ) IF( JS.GT.0 ) $ CALL PDELSET( A, I+JB-1, J+JB, DESCA, E( JS ) ) ELSE JS = MIN( INDXG2L( J+JB-1, NB, 0, DESCA( CSRC_ ), NPCOL ), $ NQ ) IF( JS.GT.0 ) $ CALL PDELSET( A, I+JB, J+JB-1, DESCA, E( JS ) ) END IF * K = K + JB JB = NB IW = 1 JW = 1 DESCWX( M_ ) = DESCWX( M_ ) - JB DESCWX( RSRC_ ) = MOD( DESCWX( RSRC_ ) + 1, NPROW ) DESCWX( CSRC_ ) = MOD( DESCWX( CSRC_ ) + 1, NPCOL ) DESCWY( N_ ) = DESCWY( N_ ) - JB DESCWY( RSRC_ ) = MOD( DESCWY( RSRC_ ) + 1, NPROW ) DESCWY( CSRC_ ) = MOD( DESCWY( CSRC_ ) + 1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the remainder of the matrix. * CALL PDGEBD2( M-K+1, N-K+1, A, IA+K-1, JA+K-1, DESCA, D, E, TAUQ, $ TAUP, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGEBRD * END scalapack-2.0.2/SRC/pdgecon.f000644 000766 000024 00000037217 11252745702 016145 0ustar00juliestaff000000 000000 SUBROUTINE PDGECON( NORM, N, A, IA, JA, DESCA, ANORM, RCOND, WORK, $ LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, INFO, JA, LIWORK, LWORK, N DOUBLE PRECISION ANORM, RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDGECON estimates the reciprocal of the condition number of a general * distributed real matrix A(IA:IA+N-1,JA:JA+N-1), in either the 1-norm * or the infinity-norm, using the LU factorization computed by PDGETRF. * * An estimate is obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), and * the reciprocal of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies whether the 1-norm condition number or the * infinity-norm condition number is required: * = '1' or 'O': 1-norm * = 'I': Infinity-norm * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). On entry, * this array contains the local pieces of the factors L and U * from the factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U; the * unit diagonal elements of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ANORM (global input) DOUBLE PRECISION * If NORM = '1' or 'O', the 1-norm of the original distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * If NORM = 'I', the infinity-norm of the original distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * * RCOND (global output) DOUBLE PRECISION * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + 2*LOCc(N+MOD(JA-1,NB_A)) * + MAX( 2, MAX( NB_A*MAX( 1, CEIL(NPROW-1,NPCOL) ), * LOCc(N+MOD(JA-1,NB_A)) + * NB_A*MAX( 1, CEIL(NPCOL-1,NPROW) ) ). * * LOCr and LOCc values can be computed using the ScaLAPACK * tool function NUMROC; NPROW and NPCOL can be determined by * calling the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= MAX( 1, LOCr(N+MOD(IA-1,MB_A)) ). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, ONENRM CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPNL, IPNU, $ IPV, IPW, IPX, IROFF, IV, IX, IXX, JJA, JV, JX, $ KASE, KASE1, LIWMIN, LWMIN, MYCOL, MYROW, NP, $ NPCOL, NPMOD, NPROW, NQ, NQMOD DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU, WMAX * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGEBR2D, $ DGEBS2D, INFOG2L, PCHK1MAT, PDAMAX, $ PDLATRS, PDLACON, PDRSCL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600 + CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN ONENRM = NORM.EQ.'1' .OR. LSAME( NORM, 'O' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + 2*NQMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = DBLE( LWMIN ) LIWMIN = MAX( 1, NPMOD ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.ONENRM .AND. .NOT.LSAME( NORM, 'I' ) ) THEN INFO = -1 ELSE IF( ANORM.LT.ZERO ) THEN INFO = -7 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF * IF( ONENRM ) THEN IDUM1( 1 ) = ICHAR( '1' ) ELSE IDUM1( 1 ) = ICHAR( 'I' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 IF( LIWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 12 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGECON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * RCOND = ZERO IF( N.EQ.0 ) THEN RCOND = ONE RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN ELSE IF( N.EQ.1 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPNL = IPV + NP IPNU = IPNL + NQ IPW = IPNU + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Estimate the norm of inv(A). * AINVNM = ZERO NORMIN = 'N' IF( ONENRM ) THEN KASE1 = 1 ELSE KASE1 = 2 END IF KASE = 0 * 10 CONTINUE CALL PDLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), IX, JX, $ DESCX, IWORK, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( KASE.EQ.KASE1 ) THEN * * Multiply by inv(L). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( 'Lower', 'No transpose', 'Unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, WORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL * * Multiply by inv(U). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( 'Upper', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, WORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(U'). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( 'Upper', 'Transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, WORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL * * Multiply by inv(L'). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( 'Lower', 'Transpose', 'Unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, WORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF * * Divide X by 1/(SL*SU) if doing so will not cause overflow. * SCALE = SL*SU NORMIN = 'Y' IF( SCALE.NE.ONE ) THEN CALL PDAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL DGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1, $ IAROW, MYCOL ) END IF END IF IF( SCALE.LT.ABS( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PDRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / AINVNM ) / ANORM * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PDGECON * END scalapack-2.0.2/SRC/pdgeequ.f000644 000766 000024 00000032522 10363532303 016143 0ustar00juliestaff000000 000000 SUBROUTINE PDGEEQU( M, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N DOUBLE PRECISION AMAX, COLCND, ROWCND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), C( * ), R( * ) * .. * * Purpose * ======= * * PDGEEQU computes row and column scalings intended to equilibrate an * M-by-N distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA:JA+N-1) and * reduce its condition number. R returns the row scale factors and C * the column scale factors, chosen to try to make the largest entry in * each row and column of the distributed matrix B with elements * B(i,j) = R(i) * A(i,j) * C(j) have absolute value 1. * * R(i) and C(j) are restricted to be between SMLNUM = smallest safe * number and BIGNUM = largest safe number. Use of these scaling * factors is not guaranteed to reduce the condition number of * sub( A ) but works well in practice. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ), the * local pieces of the M-by-N distributed matrix whose * equilibration factors are to be computed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * R (local output) DOUBLE PRECISION array, dimension LOCr(M_A) * If INFO = 0 or INFO > IA+M-1, R(IA:IA+M-1) contains the row * scale factors for sub( A ). R is aligned with the distributed * matrix A, and replicated across every process column. R is * tied to the distributed matrix A. * * C (local output) DOUBLE PRECISION array, dimension LOCc(N_A) * If INFO = 0, C(JA:JA+N-1) contains the column scale factors * for sub( A ). C is aligned with the distributed matrix A, and * replicated down every process row. C is tied to the distri- * buted matrix A. * * ROWCND (global output) DOUBLE PRECISION * If INFO = 0 or INFO > IA+M-1, ROWCND contains the ratio of * the smallest R(i) to the largest R(i) (IA <= i <= IA+M-1). * If ROWCND >= 0.1 and AMAX is neither too large nor too small, * it is not worth scaling by R(IA:IA+M-1). * * COLCND (global output) DOUBLE PRECISION * If INFO = 0, COLCND contains the ratio of the smallest C(j) * to the largest C(j) (JA <= j <= JA+N-1). If COLCND >= 0.1, it * is not worth scaling by C(JA:JA+N-1). * * AMAX (global output) DOUBLE PRECISION * Absolute value of largest distributed matrix element. If * AMAX is very close to overflow or very close to underflow, * the matrix should be scaled. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, and i is * <= M: the i-th row of the distributed matrix sub( A ) * is exactly zero, * > M: the (i-M)-th column of the distributed * matrix sub( A ) is exactly zero. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IDUMM, IIA, $ IOFFA, IROFF, J, JJA, LDA, MP, MYCOL, MYROW, $ NPCOL, NPROW, NQ DOUBLE PRECISION BIGNUM, RCMAX, RCMIN, SMLNUM * .. * .. Local Arrays .. INTEGER DESCC( DLEN_ ), DESCR( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMN2D, $ DGAMX2D, IGAMX2D, INFOG2L, PCHK1MAT, PB_TOPGET, $ PXERBLA * .. * .. External Functions .. INTEGER INDXL2G, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL INDXL2G, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 0, IDUMM, IDUMM, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEEQU', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) THEN ROWCND = ONE COLCND = ONE AMAX = ZERO RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * * Get machine constants and local indexes. * SMLNUM = PDLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * * Assign descriptors for R and C arrays * CALL DESCSET( DESCR, M, 1, DESCA( MB_ ), 1, 0, 0, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCC, 1, N, 1, DESCA( NB_ ), 0, 0, ICTXT, 1 ) * * Compute row scale factors. * DO 10 I = IIA, IIA+MP-1 R( I ) = ZERO 10 CONTINUE * * Find the maximum element in each row. * IOFFA = (JJA-1)*LDA DO 30 J = JJA, JJA+NQ-1 DO 20 I = IIA, IIA+MP-1 R( I ) = MAX( R( I ), ABS( A( IOFFA + I ) ) ) 20 CONTINUE IOFFA = IOFFA + LDA 30 CONTINUE CALL DGAMX2D( ICTXT, 'Rowwise', ROWCTOP, MP, 1, R( IIA ), $ MAX( 1, MP ), IDUMM, IDUMM, -1, -1, MYCOL ) * * Find the maximum and minimum scale factors. * RCMIN = BIGNUM RCMAX = ZERO DO 40 I = IIA, IIA+MP-1 RCMAX = MAX( RCMAX, R( I ) ) RCMIN = MIN( RCMIN, R( I ) ) 40 CONTINUE CALL DGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMAX, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) CALL DGAMN2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMIN, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) AMAX = RCMAX * IF( RCMIN.EQ.ZERO ) THEN * * Find the first zero scale factor and return an error code. * DO 50 I = IIA, IIA+MP-1 IF( R( I ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = INDXL2G( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) - IA + 1 50 CONTINUE CALL IGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, INFO, 1, $ IDUMM, IDUMM, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN ELSE * * Invert the scale factors. * DO 60 I = IIA, IIA+MP-1 R( I ) = ONE / MIN( MAX( R( I ), SMLNUM ), BIGNUM ) 60 CONTINUE * * Compute ROWCND = min(R(I)) / max(R(I)) * ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) * END IF * * Compute column scale factors * DO 70 J = JJA, JJA+NQ-1 C( J ) = ZERO 70 CONTINUE * * Find the maximum element in each column, * assuming the row scaling computed above. * IOFFA = (JJA-1)*LDA DO 90 J = JJA, JJA+NQ-1 DO 80 I = IIA, IIA+MP-1 C( J ) = MAX( C( J ), ABS( A( IOFFA + I ) )*R( I ) ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE CALL DGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, NQ, C( JJA ), $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) * * Find the maximum and minimum scale factors. * RCMIN = BIGNUM RCMAX = ZERO DO 100 J = JJA, JJA+NQ-1 RCMIN = MIN( RCMIN, C( J ) ) RCMAX = MAX( RCMAX, C( J ) ) 100 CONTINUE CALL DGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMAX, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) CALL DGAMN2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMIN, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) * IF( RCMIN.EQ.ZERO ) THEN * * Find the first zero scale factor and return an error code. * DO 110 J = JJA, JJA+NQ-1 IF( C( J ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = M + INDXL2G( J, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) - JA + 1 110 CONTINUE CALL IGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, INFO, 1, $ IDUMM, IDUMM, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN ELSE * * Invert the scale factors. * DO 120 J = JJA, JJA+NQ-1 C( J ) = ONE / MIN( MAX( C( J ), SMLNUM ), BIGNUM ) 120 CONTINUE * * Compute COLCND = min(C(J)) / max(C(J)) * COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) * END IF * RETURN * * End of PDGEEQU * END scalapack-2.0.2/SRC/pdgehd2.f000644 000766 000024 00000026746 10363532303 016041 0ustar00juliestaff000000 000000 SUBROUTINE PDGEHD2( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEHD2 reduces a real general distributed matrix sub( A ) * to upper Hessenberg form H by an orthogonal similarity transforma- * tion: Q' * sub( A ) * Q = H, where * sub( A ) = A(IA+N-1:IA+N-1,JA+N-1:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows IA:IA+ILO-2 and IA+IHI:IA+N-1 and columns JA:JA+JLO-2 * and JA+JHI:JA+N-1. See Further Details. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) to be reduced. On exit, * the upper triangle and the first subdiagonal of sub( A ) are * overwritten with the upper Hessenberg matrix H, and the ele- * ments below the first subdiagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). Elements JA:JA+ILO-2 and JA+IHI:JA+N-2 of TAU are * set to zero. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB + MAX( NpA0, NB ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of (ihi-ilo) elementary * reflectors * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on * exit in A(ia+ilo+i:ia+ihi-1,ja+ilo+i-2), and tau in TAU(ja+ilo+i-2). * * The contents of A(IA:IA+N-1,JA:JA+N-1) are illustrated by the follo- * wing example, with n = 7, ilo = 2 and ihi = 6: * * on entry on exit * * ( a a a a a a a ) ( a a h h h h a ) * ( a a a a a a ) ( a h h h h a ) * ( a a a a a a ) ( h h h h h h ) * ( a a a a a a ) ( v2 h h h h h ) * ( a a a a a a ) ( v2 v3 h h h h ) * ( a a a a a a ) ( v2 v3 v4 h h h ) * ( a ) ( a ) * * where a denotes an element of the original matrix sub( A ), h denotes * a modified element of the upper Hessenberg matrix H, and vi denotes * an element of the vector defining H(ja+ilo+i-2). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IAROW, ICOFFA, ICTXT, IROFFA, J, K, LWMIN, $ MYCOL, MYROW, NPA0, NPCOL, NPROW DOUBLE PRECISION AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PDELSET, $ PDLARF, PDLARFG, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) NPA0 = NUMROC( IHI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = DESCA( NB_ ) + MAX( NPA0, DESCA( NB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -2 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEHD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * DO 10 K = ILO, IHI-1 I = IA + K - 1 J = JA + K - 1 * * Compute elementary reflector H(j) to annihilate * A(i+2:ihi+ia-1,j) * CALL PDLARFG( IHI-K, AII, I+1, J, A, MIN( I+2, N+IA-1 ), J, $ DESCA, 1, TAU ) CALL PDELSET( A, I+1, J, DESCA, ONE ) * * Apply H(k) to A(ia:ihi+ia-1,j+1:ihi+ja-1) from the right * CALL PDLARF( 'Right', IHI, IHI-K, A, I+1, J, DESCA, 1, TAU, A, $ IA, J+1, DESCA, WORK ) * * Apply H(j) to A(i+1:ia+ihi-1,j+1:ja+n-1) from the left * CALL PDLARF( 'Left', IHI-K, N-K, A, I+1, J, DESCA, 1, TAU, A, $ I+1, J+1, DESCA, WORK ) * CALL PDELSET( A, I+1, J, DESCA, AII ) 10 CONTINUE * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGEHD2 * END scalapack-2.0.2/SRC/pdgehrd.f000644 000766 000024 00000035510 11642700517 016133 0ustar00juliestaff000000 000000 SUBROUTINE PDGEHRD( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEHRD reduces a real general distributed matrix sub( A ) * to upper Hessenberg form H by an orthogonal similarity transforma- * tion: Q' * sub( A ) * Q = H, where * sub( A ) = A(IA+N-1:IA+N-1,JA+N-1:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows IA:IA+ILO-2 and IA+IHI:IA+N-1 and columns JA:JA+ILO-2 * and JA+IHI:JA+N-1. See Further Details. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) to be reduced. On exit, * the upper triangle and the first subdiagonal of sub( A ) are * overwritten with the upper Hessenberg matrix H, and the ele- * ments below the first subdiagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). Elements JA:JA+ILO-2 and JA+IHI:JA+N-2 of TAU are * set to zero. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB*NB + NB*MAX( IHIP+1, IHLP+INLQ ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ), * ICOFFA = MOD( JA-1, NB ), IOFF = MOD( IA+ILO-2, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * ILROW = INDXG2P( IA+ILO-1, NB, MYROW, RSRC_A, NPROW ), * IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, ILROW, NPROW ), * ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, CSRC_A, NPCOL ), * INLQ = NUMROC( N-ILO+IOFF+1, NB, MYCOL, ILCOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of (ihi-ilo) elementary * reflectors * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:I) = 0, v(I+1) = 1 and v(IHI+1:N) = 0; v(I+2:IHI) is stored on * exit in A(IA+ILO+I:IA+IHI-1,JA+ILO+I-2), and tau in TAU(JA+ILO+I-2). * * The contents of A(IA:IA+N-1,JA:JA+N-1) are illustrated by the follow- * ing example, with N = 7, ILO = 2 and IHI = 6: * * on entry on exit * * ( a a a a a a a ) ( a a h h h h a ) * ( a a a a a a ) ( a h h h h a ) * ( a a a a a a ) ( h h h h h h ) * ( a a a a a a ) ( v2 h h h h h ) * ( a a a a a a ) ( v2 v3 h h h h ) * ( a a a a a a ) ( v2 v3 v4 h h h ) * ( a ) ( a ) * * where a denotes an element of the original matrix sub( A ), H denotes * a modified element of the upper Hessenberg matrix H, and vi denotes * an element of the vector defining H(JA+ILO+I-2). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, IB, ICOFFA, ICTXT, IHIP, $ IHLP, IIA, IINFO, ILCOL, ILROW, IMCOL, INLQ, $ IOFF, IPT, IPW, IPY, IROFFA, J, JJ, JJA, JY, $ K, L, LWMIN, MYCOL, MYROW, NB, NPCOL, NPROW, $ NQ DOUBLE PRECISION EI * .. * .. Local Arrays .. INTEGER DESCY( DLEN_ ), IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG1L, $ INFOG2L, PCHK1MAT, PDGEMM, PDGEHD2, $ PDLAHRD, PDLARFB, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, NB ) ICOFFA = MOD( JA-1, NB ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ) IOFF = MOD( IA+ILO-2, NB ) ILROW = INDXG2P( IA+ILO-1, NB, MYROW, DESCA( RSRC_ ), $ NPROW ) IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, ILROW, NPROW ) ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) INLQ = NUMROC( N-ILO+IOFF+1, NB, MYCOL, ILCOL, NPCOL ) LWMIN = NB*( NB + MAX( IHIP+1, IHLP+INLQ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -2 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -3 C ELSE IF( IROFFA.NE.ICOFFA .OR. IROFFA.NE.0 ) THEN ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = ILO IDUM2( 1 ) = 2 IDUM1( 2 ) = IHI IDUM2( 2 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 10 CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEHRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Set elements JA:JA+ILO-2 and JA+JHI-1:JA+N-2 of TAU to zero. * NQ = NUMROC( JA+N-2, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL INFOG1L( JA+ILO-2, NB, NPCOL, MYCOL, DESCA( CSRC_ ), JJ, $ IMCOL ) DO 10 J = JJA, MIN( JJ, NQ ) TAU( J ) = ZERO 10 CONTINUE * CALL INFOG1L( JA+IHI-1, NB, NPCOL, MYCOL, DESCA( CSRC_ ), JJ, $ IMCOL ) DO 20 J = JJ, NQ TAU( J ) = ZERO 20 CONTINUE * * Quick return if possible * IF( IHI-ILO.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPT = 1 IPY = IPT + NB * NB IPW = IPY + IHIP * NB CALL DESCSET( DESCY, IHI+IROFFA, NB, NB, NB, IAROW, ILCOL, ICTXT, $ MAX( 1, IHIP ) ) * K = ILO IB = NB - IOFF JY = IOFF + 1 * * Loop over remaining block of columns * DO 30 L = 1, IHI-ILO+IOFF-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns j:j+ib-1 to Hessenberg form, returning the * matrices V and T of the block reflector H = I - V*T*V' * which performs the reduction, and also the matrix Y = A*V*T * CALL PDLAHRD( IHI, K, IB, A, IA, J, DESCA, TAU, WORK( IPT ), $ WORK( IPY ), 1, JY, DESCY, WORK( IPW ) ) * * Apply the block reflector H to A(ia:ia+ihi-1,j+ib:ja+ihi-1) * from the right, computing A := A - Y * V'. * V(i+ib,ib-1) must be set to 1. * CALL PDELSET2( EI, A, I+IB, J+IB-1, DESCA, ONE ) CALL PDGEMM( 'No transpose', 'Transpose', IHI, IHI-K-IB+1, IB, $ -ONE, WORK( IPY ), 1, JY, DESCY, A, I+IB, J, $ DESCA, ONE, A, IA, J+IB, DESCA ) CALL PDELSET( A, I+IB, J+IB-1, DESCA, EI ) * * Apply the block reflector H to A(i+1:ia+ihi-1,j+ib:ja+n-1) from * the left * CALL PDLARFB( 'Left', 'Transpose', 'Forward', 'Columnwise', $ IHI-K, N-K-IB+1, IB, A, I+1, J, DESCA, $ WORK( IPT ), A, I+1, J+IB, DESCA, WORK( IPY ) ) * K = K + IB IB = NB JY = 1 DESCY( CSRC_ ) = MOD( DESCY( CSRC_ ) + 1, NPCOL ) * 30 CONTINUE * * Use unblocked code to reduce the rest of the matrix * CALL PDGEHD2( N, K, IHI, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGEHRD * END scalapack-2.0.2/SRC/pdgelq2.f000644 000766 000024 00000024600 10363532303 016045 0ustar00juliestaff000000 000000 SUBROUTINE PDGELQ2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGELQ2 computes a LQ factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = L * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and below the diagonal of sub( A ) contain the M by min(M,N) * lower trapezoidal matrix L (L is lower triangular if M <= N); * the elements above the diagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * of the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia+k-1) H(ia+k-2) . . . H(ia), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1)=0 * and v(i) = 1; v(i+1:n) is stored on exit in A(ia+i-1,ja+i:ja+n-1), * and tau in TAU(ia+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, J, K, LWMIN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ DOUBLE PRECISION AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PDELSET, $ PDLARF, PDLARFG, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQ + MAX( 1, MP ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGELQ2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * K = MIN( M, N ) DO 10 I = IA, IA+K-1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * A(i,j+1:ja+n-1) * CALL PDLARFG( N-J+JA, AII, I, J, A, I, MIN( J+1, JA+N-1 ), $ DESCA, DESCA( M_ ), TAU ) * IF( I.LT.IA+M-1 ) THEN * * Apply H(i) to A(i+1:ia+m-1,j:ja+n-1) from the right * CALL PDELSET( A, I, J, DESCA, ONE ) CALL PDLARF( 'Right', M-I+IA-1, N-J+JA, A, I, J, DESCA, $ DESCA( M_ ), TAU, A, I+1, J, DESCA, WORK ) END IF CALL PDELSET( A, I, J, DESCA, AII ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGELQ2 * END scalapack-2.0.2/SRC/pdgelqf.f000644 000766 000024 00000027453 10363532303 016142 0ustar00juliestaff000000 000000 SUBROUTINE PDGELQF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGELQF computes a LQ factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = L * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and below the diagonal of sub( A ) contain the M by min(M,N) * lower trapezoidal matrix L (L is lower triangular if M <= N); * the elements above the diagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * of the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia+k-1) H(ia+k-2) . . . H(ia), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1)=0 * and v(i) = 1; v(i+1:n) is stored on exit in A(ia+i-1,ja+i:ja+n-1), * and tau in TAU(ia+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IN, IPW, $ IROFF, J, K, LWMIN, MP0, MYCOL, MYROW, NPCOL, $ NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PDGELQ2, $ PDLARFB, PDLARFT, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * * Handle the first block of rows separately * IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IB = IN - IA + 1 * * Compute the LQ factorization of the first block A(ia:in:ja:ja+n-1) * CALL PDGELQ2( IB, N, A, IA, JA, DESCA, TAU, WORK, LWORK, IINFO ) * IF( IA+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(ia) H(ia+1) . . . H(in) * CALL PDLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia+ib:ia+m-1,ja:ja+n-1) from the right * CALL PDLARFB( 'Right', 'No transpose', 'Forward', 'Rowwise', $ M-IB, N, IB, A, IA, JA, DESCA, WORK, A, IA+IB, $ JA, DESCA, WORK( IPW ) ) END IF * * Loop over the remaining blocks of rows * DO 10 I = IN+1, IA+K-1, DESCA( MB_ ) IB = MIN( K-I+IA, DESCA( MB_ ) ) J = JA + I - IA * * Compute the LQ factorization of the current block * A(i:i+ib-1:j:ja+n-1) * CALL PDGELQ2( IB, N-I+IA, A, I, J, DESCA, TAU, WORK, LWORK, $ IINFO ) * IF( I+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PDLARFT( 'Forward', 'Rowwise', N-I+IA, IB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(i+ib:ia+m-1,j:ja+n-1) from the right * CALL PDLARFB( 'Right', 'No transpose', 'Forward', 'Rowwise', $ M-I-IB+IA, N-J+JA, IB, A, I, J, DESCA, WORK, $ A, I+IB, J, DESCA, WORK( IPW ) ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGELQF * END scalapack-2.0.2/SRC/pdgels.f000644 000766 000024 00000054442 11312467374 016006 0ustar00juliestaff000000 000000 SUBROUTINE PDGELS( TRANS, M, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), WORK( * ) * .. * * Purpose * ======= * * PDGELS solves overdetermined or underdetermined real linear * systems involving an M-by-N matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1), * or its transpose, using a QR or LQ factorization of sub( A ). It is * assumed that sub( A ) has full rank. * * The following options are provided: * * 1. If TRANS = 'N' and m >= n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || sub( B ) - sub( A )*X ||. * * 2. If TRANS = 'N' and m < n: find the minimum norm solution of * an underdetermined system sub( A ) * X = sub( B ). * * 3. If TRANS = 'T' and m >= n: find the minimum norm solution of * an undetermined system sub( A )**T * X = sub( B ). * * 4. If TRANS = 'T' and m < n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || sub( B ) - sub( A )**T * X ||. * * where sub( B ) denotes B( IB:IB+M-1, JB:JB+NRHS-1 ) when TRANS = 'N' * and B( IB:IB+N-1, JB:JB+NRHS-1 ) otherwise. Several right hand side * vectors b and solution vectors x can be handled in a single call; * When TRANS = 'N', the solution vectors are stored as the columns of * the N-by-NRHS right hand side matrix sub( B ) and the M-by-NRHS * right hand side matrix sub( B ) otherwise. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER * = 'N': the linear system involves sub( A ); * = 'T': the linear system involves sub( A )**T. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of * rows of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e. the number of columns * of the distributed submatrices sub( B ) and X. NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of local dimension * ( LLD_A, LOCc(JA+N-1) ). On entry, the M-by-N matrix A. * if M >= N, sub( A ) is overwritten by details of its QR * factorization as returned by PDGEQRF; * if M < N, sub( A ) is overwritten by details of its LQ * factorization as returned by PDGELQF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of local dimension * (LLD_B, LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the distributed matrix B of right hand side * vectors, stored columnwise; * sub( B ) is M-by-NRHS if TRANS='N', and N-by-NRHS otherwise. * On exit, sub( B ) is overwritten by the solution vectors, * stored columnwise: if TRANS = 'N' and M >= N, rows 1 to N * of sub( B ) contain the least squares solution vectors; the * residual sum of squares for the solution in each column is * given by the sum of squares of elements N+1 to M in that * column; if TRANS = 'N' and M < N, rows 1 to N of sub( B ) * contain the minimum norm solution vectors; if TRANS = 'T' * and M >= N, rows 1 to M of sub( B ) contain the minimum norm * solution vectors; if TRANS = 'T' and M < N, rows 1 to M of * sub( B ) contain the least squares solution vectors; the * residual sum of squares for the solution in each column is * given by the sum of squares of elements M+1 to N in that * column. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= LTAU + MAX( LWF, LWS ) where * If M >= N, then * LTAU = NUMROC( JA+MIN(M,N)-1, NB_A, MYCOL, CSRC_A, NPCOL ), * LWF = NB_A * ( MpA0 + NqA0 + NB_A ) * LWS = MAX( (NB_A*(NB_A-1))/2, (NRHSqB0 + MpB0)*NB_A ) + * NB_A * NB_A * Else * LTAU = NUMROC( IA+MIN(M,N)-1, MB_A, MYROW, RSRC_A, NPROW ), * LWF = MB_A * ( MpA0 + NqA0 + MB_A ) * LWS = MAX( (MB_A*(MB_A-1))/2, ( NpB0 + MAX( NqA0 + * NUMROC( NUMROC( N+IROFFB, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NRHSqB0 ) )*MB_A ) + * MB_A * MB_A * End if * * where LCMP = LCM / NPROW with LCM = ILCM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * MpB0 = NUMROC( M+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NpB0 = NUMROC( N+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NRHSqB0 = NUMROC( NRHS+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, TPSD INTEGER BROW, IACOL, IAROW, IASCL, IBCOL, IBROW, IBSCL, $ ICOFFA, ICOFFB, ICTXT, IPW, IROFFA, IROFFB, $ LCM, LCMP, LTAU, LWF, LWMIN, LWS, MPA0, MPB0, $ MYCOL, MYROW, NPB0, NPCOL, NPROW, NQA0, $ NRHSQB0, SCLLEN DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) DOUBLE PRECISION RWORK( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL ILCM, INDXG2P, LSAME, NUMROC, PDLAMCH, $ PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDGELQF, $ PDGEQRF, PDLABAD, PDLASCL, PDLASET, $ PDORMLQ, PDORMQR, PDTRSM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 800 + CTXT_ ) ELSE CALL CHK1MAT( M, 2, N, 3, IA, JA, DESCA, 8, INFO ) IF ( M .GE. N ) THEN CALL CHK1MAT( M, 2, NRHS, 4, IB, JB, DESCB, 12, INFO ) ELSE CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 12, INFO ) ENDIF IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( IA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( IB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) NRHSQB0 = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, $ NPCOL ) IF( M.GE.N ) THEN MPB0 = NUMROC( M+IROFFB, DESCB( MB_ ), MYROW, IBROW, $ NPROW ) LTAU = NUMROC( JA+MIN(M,N)-1, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) LWF = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) LWS = MAX( ( DESCA( NB_ )*( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPB0 + NRHSQB0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ )*DESCA( NB_ ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW NPB0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, $ NPROW ) LTAU = NUMROC( IA+MIN(M,N)-1, DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) LWF = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) LWS = MAX( ( DESCA( MB_ )*( DESCA( MB_ ) - 1 ) ) / 2, $ ( NPB0 + MAX( NQA0 + NUMROC( NUMROC( N+IROFFB, $ DESCA( MB_ ), 0, 0, NPROW ), DESCA( MB_ ), 0, 0, $ LCMP ), NRHSQB0 ) )*DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF LWMIN = LTAU + MAX( LWF, LWS ) WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) * TPSD = .TRUE. IF( LSAME( TRANS, 'N' ) ) $ TPSD = .FALSE. * IF( .NOT.( LSAME( TRANS, 'N' ) .OR. $ LSAME( TRANS, 'T' ) ) ) THEN INFO = -1 ELSE IF( M.LT.0 ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( M.GE.N .AND. IROFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( M.GE.N .AND. IAROW.NE.IBROW ) THEN INFO = -10 ELSE IF( M.LT.N .AND. ICOFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( M.GE.N .AND. DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1200 + MB_ ) ELSE IF( M.LT.N .AND. DESCA( NB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1200 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1200 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF END IF * IF( .NOT.TPSD ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IDUM1( 1 ) = ICHAR( 'T' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 14 CALL PCHK2MAT( M, 2, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, 4, $ IB, JB, DESCB, 12, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGELS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( MIN( M, N, NRHS ).EQ.0 ) THEN CALL PDLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, $ IB, JB, DESCB ) RETURN END IF * * Get machine parameters * SMLNUM = PDLAMCH( ICTXT, 'S' ) SMLNUM = SMLNUM / PDLAMCH( ICTXT, 'P' ) BIGNUM = ONE / SMLNUM CALL PDLABAD( ICTXT, SMLNUM, BIGNUM ) * * Scale A, B if max entry outside range [SMLNUM,BIGNUM] * ANRM = PDLANGE( 'M', M, N, A, IA, JA, DESCA, RWORK ) IASCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN * * Scale matrix norm up to SMLNUM * CALL PDLASCL( 'G', ANRM, SMLNUM, M, N, A, IA, JA, DESCA, $ INFO ) IASCL = 1 ELSE IF( ANRM.GT.BIGNUM ) THEN * * Scale matrix norm down to BIGNUM * CALL PDLASCL( 'G', ANRM, BIGNUM, M, N, A, IA, JA, DESCA, $ INFO ) IASCL = 2 ELSE IF( ANRM.EQ.ZERO ) THEN * * Matrix all zero. Return zero solution. * CALL PDLASET( 'F', MAX( M, N ), NRHS, ZERO, ZERO, B, IB, JB, $ DESCB ) GO TO 10 END IF * BROW = M IF( TPSD ) $ BROW = N * BNRM = PDLANGE( 'M', BROW, NRHS, B, IB, JB, DESCB, RWORK ) * IBSCL = 0 IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN * * Scale matrix norm up to SMLNUM * CALL PDLASCL( 'G', BNRM, SMLNUM, BROW, NRHS, B, IB, JB, $ DESCB, INFO ) IBSCL = 1 ELSE IF( BNRM.GT.BIGNUM ) THEN * * Scale matrix norm down to BIGNUM * CALL PDLASCL( 'G', BNRM, BIGNUM, BROW, NRHS, B, IB, JB, $ DESCB, INFO ) IBSCL = 2 END IF * IPW = LTAU + 1 * IF( M.GE.N ) THEN * * compute QR factorization of A * CALL PDGEQRF( M, N, A, IA, JA, DESCA, WORK, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least N, optimally N*NB * IF( .NOT.TPSD ) THEN * * Least-Squares Problem min || A * X - B || * * B(IB:IB+M-1,JB:JB+NRHS-1) := Q' * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PDORMQR( 'Left', 'Transpose', M, NRHS, N, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * * B(IB:IB+N-1,JB:JB+NRHS-1) := inv(R) * * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PDTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * SCLLEN = N * ELSE * * Overdetermined system of equations sub( A )' * X = sub( B ) * * sub( B ) := inv(R') * sub( B ) * CALL PDTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * B(IB+N:IB+M-1,JB:JB+NRHS-1) = ZERO * CALL PDLASET( 'All', M-N, NRHS, ZERO, ZERO, B, IB+N, JB, $ DESCB ) * * B(IB:IB+M-1,JB:JB+NRHS-1) := Q(1:N,:) * * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PDORMQR( 'Left', 'No transpose', M, NRHS, N, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * SCLLEN = M * END IF * ELSE * * Compute LQ factorization of sub( A ) * CALL PDGELQF( M, N, A, IA, JA, DESCA, WORK, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least M, optimally M*NB. * IF( .NOT.TPSD ) THEN * * underdetermined system of equations sub( A ) * X = sub( B ) * * B(IB:IB+M-1,JB:JB+NRHS-1) := inv(L) * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PDTRSM( 'Left', 'Lower', 'No transpose', 'Non-unit', M, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * B(IB+M:IB+N-1,JB:JB+NRHS-1) = 0 * CALL PDLASET( 'All', N-M, NRHS, ZERO, ZERO, B, IB+M, JB, $ DESCB ) * * B(IB:IB+N-1,JB:JB+NRHS-1) := Q(1:N,:)' * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PDORMLQ( 'Left', 'Transpose', N, NRHS, M, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * SCLLEN = N * ELSE * * overdetermined system min || A' * X - B || * * B(IB:IB+N-1,JB:JB+NRHS-1) := Q * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PDORMLQ( 'Left', 'No transpose', N, NRHS, M, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * * B(IB:IB+M-1,JB:JB+NRHS-1) := inv(L') * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PDTRSM( 'Left', 'Lower', 'Transpose', 'Non-unit', M, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * SCLLEN = M * END IF * END IF * * Undo scaling * IF( IASCL.EQ.1 ) THEN CALL PDLASCL( 'G', ANRM, SMLNUM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) ELSE IF( IASCL.EQ.2 ) THEN CALL PDLASCL( 'G', ANRM, BIGNUM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) END IF IF( IBSCL.EQ.1 ) THEN CALL PDLASCL( 'G', SMLNUM, BNRM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) ELSE IF( IBSCL.EQ.2 ) THEN CALL PDLASCL( 'G', BIGNUM, BNRM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) END IF * 10 CONTINUE * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGELS * END scalapack-2.0.2/SRC/pdgeql2.f000644 000766 000024 00000030070 10363532303 016043 0ustar00juliestaff000000 000000 SUBROUTINE PDGEQL2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEQL2 computes a QL factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * L. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M >= N, the * lower triangle of the distributed submatrix * A( IA+M-N:IA+M-1, JA:JA+N-1 ) contains the N-by-N lower * triangular matrix L; if M <= N, the elements on and below * the (N-M)-th superdiagonal contain the M by N lower * trapezoidal matrix L; the remaining elements, with the * array TAU, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Mp0 + MAX( 1, Nq0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja+k-1) . . . H(ja+1) H(ja), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in * A(ia:ia+m-k+i-2,ja+n-k+i-1), and tau in TAU(ja+n-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICTXT, II, J, JJ, K, LWMIN, $ MP, MYCOL, MYROW, NPCOL, NPROW, NQ DOUBLE PRECISION AJJ, ALPHA * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DGEBR2D, $ DGEBS2D, DLARFG, DSCAL, INFOG2L, $ PDELSET, PDLARF, PDLARFG, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MP + MAX( 1, NQ ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEQL2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( DESCA( M_ ).EQ.1 ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) IACOL = INDXG2P( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN I = II+(JJ+NQ-2)*DESCA( LLD_ ) AJJ = A( I ) CALL DLARFG( 1, AJJ, A( I ), 1, TAU( JJ+NQ-1 ) ) IF( N.GT.1 ) THEN ALPHA = ONE - TAU( JJ+NQ-1 ) CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1 ) CALL DSCAL( NQ-1, ALPHA, A( II+(JJ-1)*DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ+NQ-1 ), 1 ) A( I ) = AJJ ELSE IF( N.GT.1 ) THEN CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, IAROW, IACOL ) CALL DSCAL( NQ, ALPHA, A( II+(JJ-1)*DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ+NQ-1 ), 1, IAROW, IACOL ) END IF * ELSE * K = MIN( M, N ) DO 10 J = JA+K-1, JA, -1 I = IA + J - JA * * Generate elementary reflector H(j) to annihilate * A(ia:i+m-k-1,j+n-k) * CALL PDLARFG( M-K+I-IA+1, AJJ, M-K+I, N-K+J, A, IA, $ N-K+J, DESCA, 1, TAU ) * * Apply H(j) to A(ia:i+m-k,ja:j+n-k-1) from the left * CALL PDELSET( A, I+M-K, J+N-K, DESCA, ONE ) CALL PDLARF( 'Left', M-K+I-IA+1, N-K+J-JA, A, IA, N-K+J, $ DESCA, 1, TAU, A, IA, JA, DESCA, WORK ) CALL PDELSET( A, I+M-K, J+N-K, DESCA, AJJ ) * 10 CONTINUE * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGEQL2 * END scalapack-2.0.2/SRC/pdgeqlf.f000644 000766 000024 00000027314 10363532303 016136 0ustar00juliestaff000000 000000 SUBROUTINE PDGEQLF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEQLF computes a QL factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * L. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M >= N, the * lower triangle of the distributed submatrix * A( IA+M-N:IA+M-1, JA:JA+N-1 ) contains the N-by-N lower * triangular matrix L; if M <= N, the elements on and below * the (N-M)-th superdiagonal contain the M by N lower * trapezoidal matrix L; the remaining elements, with the * array TAU, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( Mp0 + Nq0 + NB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja+k-1) . . . H(ja+1) H(ja), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in * A(ia:ia+m-k+i-2,ja+n-k+i-1), and tau in TAU(ja+n-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, IINFO, ICTXT, IPW, J, JB, JL, JN, $ K, LWMIN, MP0, MU, MYCOL, MYROW, NPCOL, NPROW, $ NQ0, NU * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PDGEQL2, $ PDLARFB, PDLARFT, PB_TOPGET, PB_TOPSET, $ PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MP0 + NQ0 + DESCA( NB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEQLF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JL = MAX( ( (JA+N-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( JL.GE.JN+1 ) THEN * * Use blocked code initially * DO 10 J = JL, JN+1, -DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Compute the QL factorization of the current block * A(ia:ia+m-n+j+jb-ja-1,j:j+jb-1) * CALL PDGEQL2( M-N+J+JB-JA, JB, A, IA, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * IF( J.GT.JA ) THEN * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PDLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:ia+m-n+j+jb-ja-1,ja:j-1) from the * left * CALL PDLARFB( 'Left', 'Transpose', 'Backward', $ 'Columnwise', M-N+J+JB-JA, J-JA, JB, A, IA, $ J, DESCA, WORK, A, IA, JA, DESCA, $ WORK( IPW ) ) END IF * 10 CONTINUE * MU = M - N + JN - JA + 1 NU = JN - JA + 1 * ELSE * MU = M NU = N * END IF * * Use unblocked code to factor the last or only block * IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL PDGEQL2( MU, NU, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGEQLF * END scalapack-2.0.2/SRC/pdgeqpf.f000644 000766 000024 00000050616 10363532303 016143 0ustar00juliestaff000000 000000 SUBROUTINE PDGEQPF( M, N, A, IA, JA, DESCA, IPIV, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 14, 2000 * * .. Scalar Arguments .. INTEGER IA, JA, INFO, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEQPF computes a QR factorization with column pivoting of a * M-by-N distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1): * * sub( A ) * P = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension LOCc(JA+N-1). * On exit, if IPIV(I) = K, the local i-th column of sub( A )*P * was the global K-th column of sub( A ). IPIV is tied to the * distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX(3,Mp0 + Nq0) + LOCc(JA+N-1)+Nq0. * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * LOCc(JA+N-1) = NUMROC( JA+N-1, NB_A, MYCOL, CSRC_A, NPCOL ) * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(1) H(2) . . . H(n) * * Each H(i) has the form * * H = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 * and v(i) = 1; v(i+1:m) is stored on exit in A(ia+i-1:ia+m-1,ja+i-1). * * The matrix P is represented in jpvt as follows: If * jpvt(j) = i * then the jth column of P is the ith canonical unit vector. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFF, ICTXT, ICURROW, $ ICURCOL, II, IIA, IOFFA, IPN, IPCOL, IPW, $ IROFF, ITEMP, J, JB, JJ, JJA, JJPVT, JN, KB, $ K, KK, KSTART, KSTEP, LDA, LL, LWMIN, MN, MP, $ MYCOL, MYROW, NPCOL, NPROW, NQ, NQ0, PVT DOUBLE PRECISION AJJ, ALPHA, TEMP, TEMP2 * .. * .. Local Arrays .. INTEGER DESCN( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DCOPY, DESCSET, $ DGEBR2D, DGEBS2D, DGERV2D, $ DGESD2D, DLARFG, DSWAP, IGERV2D, $ IGESD2D, INFOG1L, INFOG2L, PCHK1MAT, PDAMAX, $ PDELSET, PDLARF, PDLARFG, PDNRM2, $ PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, IDINT, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) NQ0 = NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) LWMIN = MAX( 3, MP + NQ ) + NQ0 + NQ * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -10 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEQPF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF MN = MIN( M, N ) * * Initialize the array of pivots * LDA = DESCA( LLD_ ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) KSTEP = NPCOL * DESCA( NB_ ) * IF( MYCOL.EQ.IACOL ) THEN * * Handle first block separately * JB = JN - JA + 1 DO 10 LL = JJA, JJA+JB-1 IPIV( LL ) = JA + LL - JJA 10 CONTINUE KSTART = JN + KSTEP - DESCA( NB_ ) * * Loop over remaining block of columns * DO 30 KK = JJA+JB, JJA+NQ-1, DESCA( NB_ ) KB = MIN( JJA+NQ-KK, DESCA( NB_ ) ) DO 20 LL = KK, KK+KB-1 IPIV( LL ) = KSTART+LL-KK+1 20 CONTINUE KSTART = KSTART + KSTEP 30 CONTINUE ELSE KSTART = JN + ( MOD( MYCOL-IACOL+NPCOL, NPCOL )-1 )* $ DESCA( NB_ ) DO 50 KK = JJA, JJA+NQ-1, DESCA( NB_ ) KB = MIN( JJA+NQ-KK, DESCA( NB_ ) ) DO 40 LL = KK, KK+KB-1 IPIV( LL ) = KSTART+LL-KK+1 40 CONTINUE KSTART = KSTART + KSTEP 50 CONTINUE END IF * * Initialize partial column norms, handle first block separately * CALL DESCSET( DESCN, 1, DESCA( N_ ), 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), ICTXT, 1 ) * IPN = 1 IPW = IPN + NQ0 + NQ JJ = IPN + JJA - 1 IF( MYCOL.EQ.IACOL ) THEN DO 60 KK = 0, JB-1 CALL PDNRM2( M, WORK( JJ+KK ), A, IA, JA+KK, DESCA, 1 ) WORK( NQ+JJ+KK ) = WORK( JJ+KK ) 60 CONTINUE JJ = JJ + JB END IF ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining blocks of columns * DO 80 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 70 KK = 0, JB-1 CALL PDNRM2( M, WORK( JJ+KK ), A, IA, J+KK, DESCA, 1 ) WORK( NQ+JJ+KK ) = WORK( JJ+KK ) 70 CONTINUE JJ = JJ + JB END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) 80 CONTINUE * * Compute factorization * DO 120 J = JA, JA+MN-1 I = IA + J - JA * CALL INFOG1L( J, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), $ JJ, ICURCOL ) K = JA + N - J IF( K.GT.1 ) THEN CALL PDAMAX( K, TEMP, PVT, WORK( IPN ), 1, J, DESCN, $ DESCN( M_ ) ) ELSE PVT = J END IF IF( J.NE.PVT ) THEN CALL INFOG1L( PVT, DESCA( NB_ ), NPCOL, MYCOL, $ DESCA( CSRC_ ), JJPVT, IPCOL ) IF( ICURCOL.EQ.IPCOL ) THEN IF( MYCOL.EQ.ICURCOL ) THEN CALL DSWAP( MP, A( IIA+(JJ-1)*LDA ), 1, $ A( IIA+(JJPVT-1)*LDA ), 1 ) ITEMP = IPIV( JJPVT ) IPIV( JJPVT ) = IPIV( JJ ) IPIV( JJ ) = ITEMP WORK( IPN+JJPVT-1 ) = WORK( IPN+JJ-1 ) WORK( IPN+NQ+JJPVT-1 ) = WORK( IPN+NQ+JJ-1 ) END IF ELSE IF( MYCOL.EQ.ICURCOL ) THEN * CALL DGESD2D( ICTXT, MP, 1, A( IIA+(JJ-1)*LDA ), LDA, $ MYROW, IPCOL ) WORK( IPW ) = DBLE( IPIV( JJ ) ) WORK( IPW+1 ) = WORK( IPN + JJ - 1 ) WORK( IPW+2 ) = WORK( IPN + NQ + JJ - 1 ) CALL DGESD2D( ICTXT, 3, 1, WORK( IPW ), 3, MYROW, $ IPCOL ) * CALL DGERV2D( ICTXT, MP, 1, A( IIA+(JJ-1)*LDA ), LDA, $ MYROW, IPCOL ) CALL IGERV2D( ICTXT, 1, 1, IPIV( JJ ), 1, MYROW, $ IPCOL ) * ELSE IF( MYCOL.EQ.IPCOL ) THEN * CALL DGESD2D( ICTXT, MP, 1, A( IIA+(JJPVT-1)*LDA ), $ LDA, MYROW, ICURCOL ) CALL IGESD2D( ICTXT, 1, 1, IPIV( JJPVT ), 1, MYROW, $ ICURCOL ) * CALL DGERV2D( ICTXT, MP, 1, A( IIA+(JJPVT-1)*LDA ), $ LDA, MYROW, ICURCOL ) CALL DGERV2D( ICTXT, 3, 1, WORK( IPW ), 3, MYROW, $ ICURCOL ) IPIV( JJPVT ) = IDINT( WORK( IPW ) ) WORK( IPN+JJPVT-1 ) = WORK( IPW+1 ) WORK( IPN+NQ+JJPVT-1 ) = WORK( IPW+2 ) * END IF * END IF * END IF * * Generate elementary reflector H(i) * CALL INFOG1L( I, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ II, ICURROW ) IF( DESCA( M_ ).EQ.1 ) THEN IF( MYROW.EQ.ICURROW ) THEN IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*DESCA( LLD_ ) AJJ = A( IOFFA ) CALL DLARFG( 1, AJJ, A( IOFFA ), 1, TAU( JJ ) ) IF( N.GT.1 ) THEN ALPHA = ONE - TAU( JJ ) CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1 ) CALL DSCAL( NQ-JJ, ALPHA, A( IOFFA+DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ ), 1 ) A( IOFFA ) = AJJ ELSE IF( N.GT.1 ) THEN CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, ICURROW, ICURCOL ) CALL DSCAL( NQ-JJ+1, ALPHA, A( I ), DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.ICURCOL ) THEN CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), $ 1, ICURROW, ICURCOL ) END IF * ELSE * CALL PDLARFG( M-J+JA, AJJ, I, J, A, MIN( I+1, IA+M-1 ), J, $ DESCA, 1, TAU ) IF( J.LT.JA+N-1 ) THEN * * Apply H(i) to A(ia+j-ja:ia+m-1,j+1:ja+n-1) from the left * CALL PDELSET( A, I, J, DESCA, ONE ) CALL PDLARF( 'Left', M-J+JA, JA+N-1-J, A, I, J, DESCA, $ 1, TAU, A, I, J+1, DESCA, WORK( IPW ) ) END IF CALL PDELSET( A, I, J, DESCA, AJJ ) * END IF * * Update partial columns norms * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 IF( MOD( J, DESCA( NB_ ) ).EQ.0 ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) IF( (JJA+NQ-JJ).GT.0 ) THEN IF( MYROW.EQ.ICURROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, JJA+NQ-JJ, $ A( II+( MIN( JJA+NQ-1, JJ )-1 )*LDA ), $ LDA ) CALL DCOPY( JJA+NQ-JJ, A( II+( MIN( JJA+NQ-1, JJ ) $ -1)*LDA ), LDA, WORK( IPW+MIN( JJA+NQ-1, $ JJ )-1 ), 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', JJA+NQ-JJ, 1, $ WORK( IPW+MIN( JJA+NQ-1, JJ )-1 ), $ MAX( 1, NQ ), ICURROW, MYCOL ) END IF END IF * JN = MIN( ICEIL( J+1, DESCA( NB_ ) ) * DESCA( NB_ ), $ JA + N - 1 ) IF( MYCOL.EQ.ICURCOL ) THEN DO 90 LL = JJ-1, JJ + JN - J - 2 IF( WORK( IPN+LL ).NE.ZERO ) THEN TEMP = ONE-( ABS( WORK( IPW+LL ) ) / $ WORK( IPN+LL ) )**2 TEMP = MAX( TEMP, ZERO ) TEMP2 = ONE + 0.05D+0*TEMP* $ ( WORK( IPN+LL ) / WORK( IPN+NQ+LL ) )**2 IF( TEMP2.EQ.ONE ) THEN IF( IA+M-1.GT.I ) THEN CALL PDNRM2( IA+M-I-1, WORK( IPN+LL ), A, I+1, $ J+LL-JJ+2, DESCA, 1 ) WORK( IPN+NQ+LL ) = WORK( IPN+LL ) ELSE WORK( IPN+LL ) = ZERO WORK( IPN+NQ+LL ) = ZERO END IF ELSE WORK( IPN+LL ) = WORK( IPN+LL ) * SQRT( TEMP ) END IF END IF 90 CONTINUE JJ = JJ + JN - J END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 110 K = JN+1, JA+N-1, DESCA( NB_ ) KB = MIN( JA+N-K, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 100 LL = JJ-1, JJ+KB-2 IF( WORK( IPN+LL ).NE.ZERO ) THEN TEMP = ONE-( ABS( WORK( IPW+LL ) ) / $ WORK( IPN+LL ) )**2 TEMP = MAX( TEMP, ZERO ) TEMP2 = ONE + 0.05D+0*TEMP* $ ( WORK( IPN+LL ) / WORK( IPN+NQ+LL ) )**2 IF( TEMP2.EQ.ONE ) THEN IF( IA+M-1.GT.I ) THEN CALL PDNRM2( IA+M-I-1, WORK( IPN+LL ), A, $ I+1, K+LL-JJ+1, DESCA, 1 ) WORK( IPN+NQ+LL ) = WORK( IPN+LL ) ELSE WORK( IPN+LL ) = ZERO WORK( IPN+NQ+LL ) = ZERO END IF ELSE WORK( IPN+LL ) = WORK( IPN+LL ) * SQRT( TEMP ) END IF END IF 100 CONTINUE JJ = JJ + KB END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * 110 CONTINUE * 120 CONTINUE * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGEQPF * END scalapack-2.0.2/SRC/pdgeqr2.f000644 000766 000024 00000027502 10363532303 016057 0ustar00juliestaff000000 000000 SUBROUTINE PDGEQR2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEQR2 computes a QR factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Mp0 + MAX( 1, Nq0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(m,n). * * Each H(i) has the form * * H(j) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 * and v(i) = 1; v(i+1:m) is stored on exit in A(ia+i:ia+m-1,ja+i-1), * and tau in TAU(ja+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, II, IACOL, IAROW, ICTXT, J, JJ, K, LWMIN, $ MP, MYCOL, MYROW, NPCOL, NPROW, NQ DOUBLE PRECISION AJJ, ALPHA * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DGEBR2D, $ DGEBS2D, DLARFG, DSCAL, INFOG2L, $ PDELSET, PDLARF, PDLARFG, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MP + MAX( 1, NQ ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEQR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) I = II+(JJ-1)*DESCA( LLD_ ) IF( MYCOL.EQ.IACOL ) THEN AJJ = A( I ) CALL DLARFG( 1, AJJ, A( I ), 1, TAU( JJ ) ) IF( N.GT.1 ) THEN ALPHA = ONE - TAU( JJ ) CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1 ) CALL DSCAL( NQ-JJ, ALPHA, A( I+DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), $ 1 ) A( I ) = AJJ ELSE IF( N.GT.1 ) THEN CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, IAROW, IACOL ) CALL DSCAL( NQ-JJ+1, ALPHA, A( I ), DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), 1, $ IAROW, IACOL ) END IF * ELSE * K = MIN( M, N ) DO 10 J = JA, JA+K-1 I = IA + J - JA * * Generate elementary reflector H(j) to annihilate * A(i+1:ia+m-1,j) * CALL PDLARFG( M-J+JA, AJJ, I, J, A, MIN( I+1, IA+M-1 ), J, $ DESCA, 1, TAU ) IF( J.LT.JA+N-1 ) THEN * * Apply H(j)' to A(i:ia+m-1,j+1:ja+n-1) from the left * CALL PDELSET( A, I, J, DESCA, ONE ) * CALL PDLARF( 'Left', M-J+JA, N-J+JA-1, A, I, J, DESCA, 1, $ TAU, A, I, J+1, DESCA, WORK ) END IF CALL PDELSET( A, I, J, DESCA, AJJ ) * 10 CONTINUE * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGEQR2 * END scalapack-2.0.2/SRC/pdgeqrf.f000644 000766 000024 00000027466 10363532303 016154 0ustar00juliestaff000000 000000 SUBROUTINE PDGEQRF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGEQRF computes a QR factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( Mp0 + Nq0 + NB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(m,n). * * Each H(i) has the form * * H(j) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 * and v(i) = 1; v(i+1:m) is stored on exit in A(ia+i:ia+m-1,ja+i-1), * and tau in TAU(ja+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IINFO, IPW, J, $ JB, JN, K, LWMIN, MP0, MYCOL, MYROW, NPCOL, $ NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PDGEQR2, $ PDLARFB, PDLARFT, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MP0 + NQ0 + DESCA( NB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGEQRF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JB = JN - JA + 1 * * Compute the QR factorization of the first block A(ia:ia+m-1,ja:jn) * CALL PDGEQR2( M, JB, A, IA, JA, DESCA, TAU, WORK, LWORK, IINFO ) * IF( JA+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(ja) H(ja+1) . . . H(jn) * CALL PDLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:ia+m-1,ja+jb:ja+n-1) from the left * CALL PDLARFB( 'Left', 'Transpose', 'Forward', 'Columnwise', M, $ N-JB, JB, A, IA, JA, DESCA, WORK, A, IA, JA+JB, $ DESCA, WORK( IPW ) ) END IF * * Loop over the remaining blocks of columns * DO 10 J = JN+1, JA+K-1, DESCA( NB_ ) JB = MIN( K-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Compute the QR factorization of the current block * A(i:ia+m-1,j:j+jb-1) * CALL PDGEQR2( M-J+JA, JB, A, I, J, DESCA, TAU, WORK, LWORK, $ IINFO ) * IF( J+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PDLARFT( 'Forward', 'Columnwise', M-J+JA, JB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(i:ia+m-1,j+jb:ja+n-1) from the left * CALL PDLARFB( 'Left', 'Transpose', 'Forward', 'Columnwise', $ M-J+JA, N-J-JB+JA, JB, A, I, J, DESCA, WORK, $ A, I, J+JB, DESCA, WORK( IPW ) ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGEQRF * END scalapack-2.0.2/SRC/pdgerfs.f000644 000766 000024 00000102120 10363532303 016133 0ustar00juliestaff000000 000000 SUBROUTINE PDGERFS( TRANS, N, NRHS, A, IA, JA, DESCA, AF, IAF, $ JAF, DESCAF, IPIV, B, IB, JB, DESCB, X, IX, $ JX, DESCX, FERR, BERR, WORK, LWORK, IWORK, $ LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IAF, IB, IX, INFO, JA, JAF, JB, JX, $ LIWORK, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ),IPIV( * ), IWORK( * ) DOUBLE PRECISION A( * ), AF( * ), B( * ), BERR( * ), FERR( * ), $ WORK( * ), X( * ) * .. * * Purpose * ======= * * PDGERFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solutions. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations. * = 'N': sub( A ) * sub( X ) = sub( B ) (No transpose) * = 'T': sub( A )**T * sub( X ) = sub( B ) (Transpose) * = 'C': sub( A )**T * sub( X ) = sub( B ) * (Conjugate transpose = Transpose) * * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) DOUBLE PRECISION pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input) DOUBLE PRECISION pointer into the local * memory to an array of local dimension (LLD_AF,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * factors of the matrix sub( A ) = P * L * U as computed by * PDGETRF. * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * IPIV (local input) INTEGER array of dimension LOCr(M_AF)+MB_AF. * This array contains the pivoting information as computed * by PDGETRF. IPIV(i) -> The global row local row i * was swapped with. This array is tied to the distributed * matrix A. * * B (local input) DOUBLE PRECISION pointer into the local * memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1)). This array contains the local * pieces of the distributed matrix of right hand sides * sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input and output) DOUBLE PRECISION pointer into the * local memory to an array of local dimension * (LLD_X,LOCc(JX+NRHS-1)). On entry, this array contains * the local pieces of the distributed matrix solution * sub( X ). On exit, the improved solution vectors. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). * The estimated forward error bound for each solution vector * of sub( X ). If XTRUE is the true solution corresponding * to sub( X ), FERR is an estimated upper bound for the * magnitude of the largest element in (sub( X ) - XTRUE) * divided by the magnitude of the largest element in sub( X ). * The estimate is as reliable as the estimate for RCOND, and * is almost always a slight overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*LOCr( N + MOD(IA-1,MB_A) ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr( N + MOD(IB-1,MB_B) ). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices op( A ) and op( AF ) (respectively * sub( X ) and sub( B ) ) should be distributed the same way on the * same processes. These conditions ensure that sub( A ) and sub( AF ) * (resp. sub( X ) and sub( B ) ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( AF ), sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IAF, DESCAF( MB_ ) ) = f( JAF, DESCAF( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) DOUBLE PRECISION TWO, THREE PARAMETER ( TWO = 2.0D+0, THREE = 3.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOTRAN CHARACTER TRANST INTEGER COUNT, IACOL, IAFCOL, IAFROW, IAROW, IXBCOL, $ IXBROW, IXCOL, IXROW, ICOFFA, ICOFFAF, ICOFFB, $ ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, IIW, $ IOFFXB, IPB, IPR, IPV, IROFFA, IROFFAF, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LIWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ DOUBLE PRECISION EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMX2D, $ DGEBR2D, DGEBS2D, INFOG2L, PCHK2MAT, $ PDAGEMV, PDAXPY, PDCOPY, PDGEMV, $ PDGETRS, PDLACON, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * NOTRAN = LSAME( TRANS, 'N' ) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, N, 2, IAF, JAF, DESCAF, 11, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 16, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IX, JX, DESCX, 20, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFAF = MOD( JAF-1, DESCAF( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFCOL = INDXG2P( JAF, DESCAF( NB_ ), MYCOL, $ DESCAF( CSRC_ ), NPCOL ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 3 * NPMOD LIWMIN = NPMOD WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( ( .NOT.NOTRAN ) .AND. ( .NOT.LSAME( TRANS, 'T' ) ) .AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700 + NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCAF( MB_ ) ) THEN INFO = -( 1100 + MB_ ) ELSE IF( IROFFAF.NE.0 .OR. IAROW.NE.IAFROW ) THEN INFO = -9 ELSE IF( DESCA( NB_ ).NE.DESCAF( NB_ ) ) THEN INFO = -( 1100 + NB_ ) ELSE IF( ICOFFAF.NE.0 .OR. IACOL.NE.IAFCOL ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -( 1100 + CTXT_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -14 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1600 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1600 + CTXT_ ) ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 2000 + MB_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -18 ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 2000 + NB_ ) ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -19 ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 2000 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -24 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -26 END IF END IF * IF( NOTRAN ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 1 ) = ICHAR( 'T' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 IDUM1( 2 ) = N IDUM2( 2 ) = 2 IDUM1( 3 ) = NRHS IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 24 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 26 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, N, 2, IAF, $ JAF, DESCAF, 11, 5, IDUM1, IDUM2, INFO ) CALL PCHK2MAT( N, 2, NRHS, 3, IB, JB, DESCB, 16, N, 2, NRHS, 3, $ IX, JX, DESCX, 20, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGERFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = IPB + NP0 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = 1 + maximum number of nonzero entries in each row of sub( A ) * NZ = N + 1 EPS = PDLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) ) * DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 100 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 20 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X), * where op(sub(A)) = sub(A), or sub(A)' (A**T or A**H), * depending on TRANS. * CALL PDCOPY( N, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PDGEMV( TRANS, N, N, -ONE, A, IA, JA, DESCA, X, IX, $ JX+K, DESCX, 1, ONE, WORK( IPR ), IW, JW, $ DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIXB, IIXB + NP - 1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 30 CONTINUE END IF END IF * CALL PDAGEMV( TRANS, N, N, ONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, ONE, WORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 40 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during the * last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PDGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PDAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, IX, $ JX+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 20 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(op(sub(A))))* * ( abs(R) + NZ*EPS*( * abs(op(sub(A)))*abs(sub(X))+abs(sub(B)))))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(sub(A))) is the inverse of op(sub(A)) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(op(sub(A)))*abs(sub(X)) + abs(sub(B)) is less than * SAFE2. * * Use PDLACON to estimate the infinity-norm of the matrix * inv(op(sub(A))) * diag(W), where * W = abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 50 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 50 CONTINUE END IF END IF * KASE = 0 60 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PDLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(sub(A))'). * CALL PDGETRS( TRANST, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF ELSE * * Multiply by inv(op(sub(A)))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 80 CONTINUE END IF END IF * CALL PDGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 60 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 90 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 90 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 100 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 200 J = JN+1, JB+NRHS-1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 190 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 110 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X), * where op(sub(A)) = sub(A), or sub(A)' (A**T or A**H), * depending on TRANS. * CALL PDCOPY( N, B, IB, J+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PDGEMV( TRANS, N, N, -ONE, A, IA, JA, DESCA, X, $ IX, J+K, DESCX, 1, ONE, WORK( IPR ), IW, JW, $ DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) (abs(R(i))/(abs(op(sub(A)))*abs(sub(X)) + * abs(sub(B)))(i)) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIXB, IIXB+NP-1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 120 CONTINUE END IF END IF * CALL PDAGEMV( TRANS, N, N, ONE, A, IA, JA, DESCA, X, IX, $ J+K, DESCX, 1, ONE, WORK( IPB ), IW, JW, $ DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 )THEN DO 130 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 130 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.ICURCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during * the last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. $ COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PDGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PDAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, $ IX, J+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 110 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(op(sub(A))))* * ( abs(R) + NZ*EPS*( * abs(op(sub(A)))*abs(sub(X))+abs(sub(B)))))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(sub(A))) is the inverse of op(sub(A)) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(op(sub(A)))*abs(sub(X)) + abs(sub(B)) is less than * SAFE2. * * Use PDLACON to estimate the infinity-norm of the matrix * inv(op(sub(A))) * diag(W), where * W = abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 140 CONTINUE END IF END IF * KASE = 0 150 CONTINUE IF( MYCOL.EQ.ICURCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, ICURCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PDLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = ICURCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(sub(A))'). * CALL PDGETRS( TRANST, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 160 CONTINUE END IF END IF ELSE * * Multiply by inv(op(sub(A)))*diag(W). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 170 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 170 CONTINUE END IF END IF * CALL PDGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, $ INFO ) END IF GO TO 150 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 180 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 180 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, $ 1, IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 190 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 200 CONTINUE * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PDGERFS * END scalapack-2.0.2/SRC/pdgerq2.f000644 000766 000024 00000024714 10363532303 016061 0ustar00juliestaff000000 000000 SUBROUTINE PDGERQ2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGERQ2 computes a RQ factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = R * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAU, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia) H(ia+1) . . . H(ia+k-1), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; v(1:n-k+i-1) is stored on exit in * A(ia+m-k+i-1,ja:ja+n-k+i-2), and tau in TAU(ia+m-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, J, K, LWMIN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ DOUBLE PRECISION AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PDELSET, PDLARF, PDLARFG, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQ + MAX( 1, MP ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGERQ2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * K = MIN( M, N ) DO 10 I = IA+K-1, IA, -1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * A(i+m-k,ja:j+n-k-1) * CALL PDLARFG( N-K+J-JA+1, AII, I+M-K, J+N-K, A, I+M-K, JA, $ DESCA, DESCA( M_ ), TAU ) * * Apply H(i) to A(ia:i+m-k-1,ja:j+n-k) from the right * CALL PDELSET( A, I+M-K, J+N-K, DESCA, ONE ) CALL PDLARF( 'Right', M-K+I-IA, N-K+J-JA+1, A, M-K+I, JA, $ DESCA, DESCA( M_ ), TAU, A, IA, JA, DESCA, WORK ) CALL PDELSET( A, I+M-K, J+N-K, DESCA, AII ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGERQ2 * END scalapack-2.0.2/SRC/pdgerqf.f000644 000766 000024 00000027212 10363532303 016141 0ustar00juliestaff000000 000000 SUBROUTINE PDGERQF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDGERQF computes a RQ factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = R * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAU, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia) H(ia+1) . . . H(ia+k-1), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; v(1:n-k+i-1) is stored on exit in * A(ia+m-k+i-1,ja:ja+n-k+i-2), and tau in TAU(ia+m-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IL, IN, IPW, $ K, LWMIN, MP0, MU, MYCOL, MYROW, NPCOL, NPROW, $ NQ0, NU * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PDGERQ2, $ PDLARFB, PDLARFT, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGERQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IL = MAX( ( (IA+M-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * IF( IL.GE.IN+1 ) THEN * * Use blocked code initially * DO 10 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Compute the RQ factorization of the current block * A(i:i+ib-1,ja:ja+n-m+i+ib-ia-1) * CALL PDGERQ2( IB, N-M+I+IB-IA, A, I, JA, DESCA, TAU, WORK, $ LWORK, IINFO ) * IF( I.GT.IA ) THEN * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PDLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, $ I, JA, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:i-1,ja:ja+n-m+i+ib-ia-1) from the * right * CALL PDLARFB( 'Right', 'No transpose', 'Backward', $ 'Rowwise', I-IA, N-M+I+IB-IA, IB, A, I, JA, $ DESCA, WORK, A, IA, JA, DESCA, $ WORK( IPW ) ) END IF * 10 CONTINUE * MU = IN - IA + 1 NU = N - M + IN - IA + 1 * ELSE * MU = M NU = N * END IF * * Use unblocked code to factor the last or only block * IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL PDGERQ2( MU, NU, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDGERQF * END scalapack-2.0.2/SRC/pdgesv.f000644 000766 000024 00000023127 10367447133 016014 0ustar00juliestaff000000 000000 SUBROUTINE PDGESV( N, NRHS, A, IA, JA, DESCA, IPIV, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * Jan 30, 2006 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Purpose * ======= * * PDGESV computes the solution to a real system of linear equations * * sub( A ) * X = sub( B ), * * where sub( A ) = A(IA:IA+N-1,JA:JA+N-1) is an N-by-N distributed * matrix and X and sub( B ) = B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS * distributed matrices. * * The LU decomposition with partial pivoting and row interchanges is * used to factor sub( A ) as sub( A ) = P * L * U, where P is a permu- * tation matrix, L is unit lower triangular, and U is upper triangular. * L and U are stored in sub( A ). The factored form of sub( A ) is then * used to solve the system of equations sub( A ) * X = sub( B ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the N-by-N distributed matrix * sub( A ) to be factored. On exit, this array contains the * local pieces of the factors L and U from the factorization * sub( A ) = P*L*U; the unit diagonal elements of L are not * stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * B (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, the right hand side * distributed matrix sub( B ). On exit, if INFO = 0, sub( B ) * is overwritten by the solution distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, so the solution could not be * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDGETRF, $ PDGETRS, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P EXTERNAL INDXG2P * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( N, 1, NRHS, 2, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IF( IROFFA.NE.0 ) THEN INFO = -4 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( IBROW.NE.IAROW .OR. ICOFFA.NE.IROFFB ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1100+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(1100+CTXT_) END IF END IF CALL PCHK2MAT( N, 1, N, 1, IA, JA, DESCA, 6, N, 1, NRHS, 2, $ IB, JB, DESCB, 11, 0, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGESV', -INFO ) RETURN END IF * * Compute the LU factorization of sub( A ). * CALL PDGETRF( N, N, A, IA, JA, DESCA, IPIV, INFO ) * IF( INFO.EQ.0 ) THEN * * Solve the system sub( A ) * X = sub( B ), overwriting sub( B ) * with X. * CALL PDGETRS( 'No transpose', N, NRHS, A, IA, JA, DESCA, IPIV, $ B, IB, JB, DESCB, INFO ) * END IF * RETURN * * End of PDGESV * END scalapack-2.0.2/SRC/pdgesvd.f000644 000766 000024 00000055300 10377355407 016161 0ustar00juliestaff000000 000000 SUBROUTINE PDGESVD(JOBU,JOBVT,M,N,A,IA,JA,DESCA,S,U,IU,JU,DESCU, + VT,IVT,JVT,DESCVT,WORK,LWORK,INFO) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Oak Ridge National Laboratory * and Univ. of California Berkeley. * Jan 2006 * * .. Scalar Arguments .. CHARACTER JOBU,JOBVT INTEGER IA,INFO,IU,IVT,JA,JU,JVT,LWORK,M,N * .. * .. Array Arguments .. INTEGER DESCA(*),DESCU(*),DESCVT(*) DOUBLE PRECISION A(*),U(*),VT(*),WORK(*) DOUBLE PRECISION S(*) * .. * * Purpose * ======= * * PDGESVD computes the singular value decomposition (SVD) of an * M-by-N matrix A, optionally computing the left and/or right * singular vectors. The SVD is written as * * A = U * SIGMA * transpose(V) * * where SIGMA is an M-by-N matrix which is zero except for its * min(M,N) diagonal elements, U is an M-by-M orthogonal matrix, and * V is an N-by-N orthogonal matrix. The diagonal elements of SIGMA * are the singular values of A and the columns of U and V are the * corresponding right and left singular vectors, respectively. The * singular values are returned in array S in decreasing order and * only the first min(M,N) columns of U and rows of VT = V**T are * computed. * * Notes * ===== * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, and * assume that its process grid has dimension r x c. LOCr( K ) denotes * the number of elements of K that a process would receive if K were * distributed over the r processes of its process column. Similarly, * LOCc( K ) denotes the number of elements of K that a process would * receive if K were distributed over the c processes of its process * row. The values of LOCr() and LOCc() may be determined via a call * to the ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MP = number of local rows in A and U * NQ = number of local columns in A and VT * SIZE = min( M, N ) * SIZEQ = number of local columns in U * SIZEP = number of local rows in VT * * JOBU (global input) CHARACTER*1 * Specifies options for computing U: * = 'V': the first SIZE columns of U (the left singular * vectors) are returned in the array U; * = 'N': no columns of U (no left singular vectors) are * computed. * * JOBVT (global input) CHARACTER*1 * Specifies options for computing V**T: * = 'V': the first SIZE rows of V**T (the right singular * vectors) are returned in the array VT; * = 'N': no rows of V**T (no right singular vectors) are * computed. * * M (global input) INTEGER * The number of rows of the input matrix A. M >= 0. * * N (global input) INTEGER * The number of columns of the input matrix A. N >= 0. * * A (local input/workspace) block cyclic DOUBLE PRECISION * array, * global dimension (M, N), local dimension (MP, NQ) * On exit, the contents of A are destroyed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix A. * * S (global output) DOUBLE PRECISION array, dimension SIZE * The singular values of A, sorted so that S(i) >= S(i+1). * * U (local output) DOUBLE PRECISION array, local dimension * (MP, SIZEQ), global dimension (M, SIZE) * if JOBU = 'V', U contains the first min(m,n) columns of U * if JOBU = 'N', U is not referenced. * * IU (global input) INTEGER * The row index in the global array U indicating the first * row of sub( U ). * * JU (global input) INTEGER * The column index in the global array U indicating the * first column of sub( U ). * * DESCU (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix U. * * VT (local output) DOUBLE PRECISION array, local dimension * (SIZEP, NQ), global dimension (SIZE, N). * If JOBVT = 'V', VT contains the first SIZE rows of * V**T. If JOBVT = 'N', VT is not referenced. * * IVT (global input) INTEGER * The row index in the global array VT indicating the first * row of sub( VT ). * * JVT (global input) INTEGER * The column index in the global array VT indicating the * first column of sub( VT ). * * DESCVT (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix VT. * * WORK (local workspace/output) DOUBLE PRECISION array, dimension * (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * LWORK >= 1 + 6*SIZEB + MAX(WATOBD, WBDTOSVD), * * where SIZEB = MAX(M,N), and WATOBD and WBDTOSVD refer, * respectively, to the workspace required to bidiagonalize * the matrix A and to go from the bidiagonal matrix to the * singular value decomposition U*S*VT. * * For WATOBD, the following holds: * * WATOBD = MAX(MAX(WPDLANGE,WPDGEBRD), * MAX(WPDLARED2D,WP(pre)LARED1D)), * * where WPDLANGE, WPDLARED1D, WPDLARED2D, WPDGEBRD are the * workspaces required respectively for the subprograms * PDLANGE, PDLARED1D, PDLARED2D, PDGEBRD. Using the * standard notation * * MP = NUMROC( M, MB, MYROW, DESCA( CTXT_ ), NPROW), * NQ = NUMROC( N, NB, MYCOL, DESCA( LLD_ ), NPCOL), * * the workspaces required for the above subprograms are * * WPDLANGE = MP, * WPDLARED1D = NQ0, * WPDLARED2D = MP0, * WPDGEBRD = NB*(MP + NQ + 1) + NQ, * * where NQ0 and MP0 refer, respectively, to the values obtained * at MYCOL = 0 and MYROW = 0. In general, the upper limit for * the workspace is given by a workspace required on * processor (0,0): * * WATOBD <= NB*(MP0 + NQ0 + 1) + NQ0. * * In case of a homogeneous process grid this upper limit can * be used as an estimate of the minimum workspace for every * processor. * * For WBDTOSVD, the following holds: * * WBDTOSVD = SIZE*(WANTU*NRU + WANTVT*NCVT) + * MAX(WDBDSQR, * MAX(WANTU*WPDORMBRQLN, WANTVT*WPDORMBRPRT)), * * where * * 1, if left(right) singular vectors are wanted * WANTU(WANTVT) = * 0, otherwise * * and WDBDSQR, WPDORMBRQLN and WPDORMBRPRT refer respectively * to the workspace required for the subprograms DBDSQR, * PDORMBR(QLN), and PDORMBR(PRT), where QLN and PRT are the * values of the arguments VECT, SIDE, and TRANS in the call * to PDORMBR. NRU is equal to the local number of rows of * the matrix U when distributed 1-dimensional "column" of * processes. Analogously, NCVT is equal to the local number * of columns of the matrix VT when distributed across * 1-dimensional "row" of processes. Calling the LAPACK * procedure DBDSQR requires * * WDBDSQR = MAX(1, 4*SIZE ) * * on every processor. Finally, * * WPDORMBRQLN = MAX( (NB*(NB-1))/2, (SIZEQ+MP)*NB)+NB*NB, * WPDORMBRPRT = MAX( (MB*(MB-1))/2, (SIZEP+NQ)*MB )+MB*MB, * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the work array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if DBDSQR did not converge * If INFO = MIN(M,N) + 1, then PDGESVD has detected * heterogeneity by finding that eigenvalues were not * identical across the process grid. In this case, the * accuracy of the results from PDGESVD cannot be * guaranteed. * * ===================================================================== * * The results of PDGEBRD, and therefore PDGESVD, may vary slightly * from run to run with the same input data. If repeatability is an * issue, call BLACS_SET with the appropriate option after defining * the process grid. * * Alignment requirements * ====================== * * The routine PDGESVD inherits the same alignement requirement as * the routine PDGEBRD, namely: * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * * ===================================================================== * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D,DLEN_,DTYPE_,CTXT_,M_,N_,MB_,NB_,RSRC_, + CSRC_,LLD_,ITHVAL PARAMETER (BLOCK_CYCLIC_2D=1,DLEN_=9,DTYPE_=1,CTXT_=2,M_=3,N_=4, + MB_=5,NB_=6,RSRC_=7,CSRC_=8,LLD_=9,ITHVAL=10) DOUBLE PRECISION ZERO,ONE PARAMETER (ZERO= (0.0D+0),ONE= (1.0D+0)) * .. * .. Local Scalars .. CHARACTER UPLO INTEGER CONTEXTC,CONTEXTR,I,INDD,INDD2,INDE,INDE2,INDTAUP,INDTAUQ, + INDU,INDV,INDWORK,IOFFD,IOFFE,ISCALE,J,K,LDU,LDVT,LLWORK, + LWMIN,MAXIM,MB,MP,MYPCOL,MYPCOLC,MYPCOLR,MYPROW,MYPROWC, + MYPROWR,NB,NCVT,NPCOL,NPCOLC,NPCOLR,NPROCS,NPROW,NPROWC, + NPROWR,NQ,NRU,SIZE,SIZEB,SIZEP,SIZEPOS,SIZEQ,WANTU,WANTVT, + WATOBD,WBDTOSVD,WDBDSQR,WPDGEBRD,WPDLANGE,WPDORMBRPRT, + WPDORMBRQLN DOUBLE PRECISION ANRM,BIGNUM,EPS,RMAX,RMIN,SAFMIN,SIGMA,SMLNUM * .. * .. Local Arrays .. INTEGER DESCTU(DLEN_),DESCTVT(DLEN_),IDUM1(3),IDUM2(3) DOUBLE PRECISION C(1,1) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH,PDLANGE EXTERNAL LSAME,NUMROC,PDLAMCH,PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GET,BLACS_GRIDEXIT,BLACS_GRIDINFO,BLACS_GRIDINIT, + CHK1MAT,DBDSQR,DESCINIT,DGAMN2D,DGAMX2D,DSCAL,IGAMX2D, + IGEBR2D,IGEBS2D,PCHK1MAT,PDGEBRD,PDGEMR2D,PDLARED1D, + PDLARED2D,PDLASCL,PDLASET,PDORMBR,PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX,MIN,SQRT,DBLE * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF (BLOCK_CYCLIC_2D*DTYPE_*LLD_*MB_*M_*NB_*N_.LT.0) RETURN * CALL BLACS_GRIDINFO(DESCA(CTXT_),NPROW,NPCOL,MYPROW,MYPCOL) ISCALE = 0 INFO = 0 * IF (NPROW.EQ.-1) THEN INFO = - (800+CTXT_) ELSE * SIZE = MIN(M,N) SIZEB = MAX(M,N) NPROCS = NPROW*NPCOL IF (M.GE.N) THEN IOFFD = JA - 1 IOFFE = IA - 1 SIZEPOS = 1 ELSE IOFFD = IA - 1 IOFFE = JA - 1 SIZEPOS = 3 END IF * IF (LSAME(JOBU,'V')) THEN WANTU = 1 ELSE WANTU = 0 END IF IF (LSAME(JOBVT,'V')) THEN WANTVT = 1 ELSE WANTVT = 0 END IF * CALL CHK1MAT(M,3,N,4,IA,JA,DESCA,8,INFO) IF (WANTU.EQ.1) THEN CALL CHK1MAT(M,3,SIZE,SIZEPOS,IU,JU,DESCU,13,INFO) END IF IF (WANTVT.EQ.1) THEN CALL CHK1MAT(SIZE,SIZEPOS,N,4,IVT,JVT,DESCVT,17,INFO) END IF CALL IGAMX2D(DESCA(CTXT_),'A',' ',1,1,INFO,1,1,1,-1,-1,0) * IF (INFO.EQ.0) THEN * * Set up pointers into the WORK array. * INDD = 2 INDE = INDD + SIZEB + IOFFD INDD2 = INDE + SIZEB + IOFFE INDE2 = INDD2 + SIZEB + IOFFD * INDTAUQ = INDE2 + SIZEB + IOFFE INDTAUP = INDTAUQ + SIZEB + JA - 1 INDWORK = INDTAUP + SIZEB + IA - 1 LLWORK = LWORK - INDWORK + 1 * * Initialize contexts for "column" and "row" process matrices. * CALL BLACS_GET(DESCA(CTXT_),10,CONTEXTC) CALL BLACS_GRIDINIT(CONTEXTC,'R',NPROCS,1) CALL BLACS_GRIDINFO(CONTEXTC,NPROWC,NPCOLC,MYPROWC, + MYPCOLC) CALL BLACS_GET(DESCA(CTXT_),10,CONTEXTR) CALL BLACS_GRIDINIT(CONTEXTR,'R',1,NPROCS) CALL BLACS_GRIDINFO(CONTEXTR,NPROWR,NPCOLR,MYPROWR, + MYPCOLR) * * Set local dimensions of matrices (this is for MB=NB=1). * NRU = NUMROC(M,1,MYPROWC,0,NPROCS) NCVT = NUMROC(N,1,MYPCOLR,0,NPROCS) NB = DESCA(NB_) MB = DESCA(MB_) MP = NUMROC(M,MB,MYPROW,DESCA(RSRC_),NPROW) NQ = NUMROC(N,NB,MYPCOL,DESCA(CSRC_),NPCOL) IF (WANTVT.EQ.1) THEN SIZEP = NUMROC(SIZE,DESCVT(MB_),MYPROW,DESCVT(RSRC_), + NPROW) ELSE SIZEP = 0 END IF IF (WANTU.EQ.1) THEN SIZEQ = NUMROC(SIZE,DESCU(NB_),MYPCOL,DESCU(CSRC_), + NPCOL) ELSE SIZEQ = 0 END IF * * Transmit MAX(NQ0, MP0). * IF (MYPROW.EQ.0 .AND. MYPCOL.EQ.0) THEN MAXIM = MAX(NQ,MP) CALL IGEBS2D(DESCA(CTXT_),'All',' ',1,1,MAXIM,1) ELSE CALL IGEBR2D(DESCA(CTXT_),'All',' ',1,1,MAXIM,1,0,0) END IF * WPDLANGE = MP WPDGEBRD = NB* (MP+NQ+1) + NQ WATOBD = MAX(MAX(WPDLANGE,WPDGEBRD),MAXIM) * WDBDSQR = MAX(1,4*SIZE) WPDORMBRQLN = MAX((NB* (NB-1))/2, (SIZEQ+MP)*NB) + NB*NB WPDORMBRPRT = MAX((MB* (MB-1))/2, (SIZEP+NQ)*MB) + MB*MB WBDTOSVD = SIZE* (WANTU*NRU+WANTVT*NCVT) + + MAX(WDBDSQR,MAX(WANTU*WPDORMBRQLN, + WANTVT*WPDORMBRPRT)) * * Finally, calculate required workspace. * LWMIN = 1 + 6*SIZEB + MAX(WATOBD,WBDTOSVD) WORK(1) = DBLE(LWMIN) * IF (WANTU.NE.1 .AND. .NOT. (LSAME(JOBU,'N'))) THEN INFO = -1 ELSE IF (WANTVT.NE.1 .AND. .NOT. (LSAME(JOBVT,'N'))) THEN INFO = -2 ELSE IF (LWORK.LT.LWMIN .AND. LWORK.NE.-1) THEN INFO = -19 END IF * END IF * IDUM1(1) = WANTU IDUM1(2) = WANTVT IF (LWORK.EQ.-1) THEN IDUM1(3) = -1 ELSE IDUM1(3) = 1 END IF IDUM2(1) = 1 IDUM2(2) = 2 IDUM2(3) = 19 CALL PCHK1MAT(M,3,N,4,IA,JA,DESCA,8,3,IDUM1,IDUM2,INFO) IF (INFO.EQ.0) THEN IF (WANTU.EQ.1) THEN CALL PCHK1MAT(M,3,SIZE,4,IU,JU,DESCU,13,0,IDUM1,IDUM2, + INFO) END IF IF (WANTVT.EQ.1) THEN CALL PCHK1MAT(SIZE,3,N,4,IVT,JVT,DESCVT,17,0,IDUM1, + IDUM2,INFO) END IF END IF * END IF * IF (INFO.NE.0) THEN CALL PXERBLA(DESCA(CTXT_),'PDGESVD',-INFO) RETURN ELSE IF (LWORK.EQ.-1) THEN GO TO 40 END IF * * Quick return if possible. * IF (M.LE.0 .OR. N.LE.0) GO TO 40 * * Get machine constants. * SAFMIN = PDLAMCH(DESCA(CTXT_),'Safe minimum') EPS = PDLAMCH(DESCA(CTXT_),'Precision') SMLNUM = SAFMIN/EPS BIGNUM = ONE/SMLNUM RMIN = SQRT(SMLNUM) RMAX = MIN(SQRT(BIGNUM),ONE/SQRT(SQRT(SAFMIN))) * * Scale matrix to allowable range, if necessary. * ANRM = PDLANGE('1',M,N,A,IA,JA,DESCA,WORK(INDWORK)) IF (ANRM.GT.ZERO .AND. ANRM.LT.RMIN) THEN ISCALE = 1 SIGMA = RMIN/ANRM ELSE IF (ANRM.GT.RMAX) THEN ISCALE = 1 SIGMA = RMAX/ANRM END IF * IF (ISCALE.EQ.1) THEN CALL PDLASCL('G',ONE,SIGMA,M,N,A,IA,JA,DESCA,INFO) END IF * CALL PDGEBRD(M,N,A,IA,JA,DESCA,WORK(INDD),WORK(INDE), + WORK(INDTAUQ),WORK(INDTAUP),WORK(INDWORK),LLWORK, + INFO) * * Copy D and E to all processes. * Array D is in local array of dimension: * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * Array E is in local array of dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * IF (M.GE.N) THEN * Distribute D CALL PDLARED1D(N+IOFFD,IA,JA,DESCA,WORK(INDD),WORK(INDD2), + WORK(INDWORK),LLWORK) * Distribute E CALL PDLARED2D(M+IOFFE,IA,JA,DESCA,WORK(INDE),WORK(INDE2), + WORK(INDWORK),LLWORK) ELSE * Distribute D CALL PDLARED2D(M+IOFFD,IA,JA,DESCA,WORK(INDD),WORK(INDD2), + WORK(INDWORK),LLWORK) * Distribute E CALL PDLARED1D(N+IOFFE,IA,JA,DESCA,WORK(INDE),WORK(INDE2), + WORK(INDWORK),LLWORK) END IF * * Prepare for calling PDBDSQR. * IF (M.GE.N) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * INDU = INDWORK INDV = INDU + SIZE*NRU*WANTU INDWORK = INDV + SIZE*NCVT*WANTVT * LDU = MAX(1,NRU) LDVT = MAX(1,SIZE) * CALL DESCINIT(DESCTU,M,SIZE,1,1,0,0,CONTEXTC,LDU,INFO) CALL DESCINIT(DESCTVT,SIZE,N,1,1,0,0,CONTEXTR,LDVT,INFO) * IF (WANTU.EQ.1) THEN CALL PDLASET('Full',M,SIZE,ZERO,ONE,WORK(INDU),1,1,DESCTU) ELSE NRU = 0 END IF * IF (WANTVT.EQ.1) THEN CALL PDLASET('Full',SIZE,N,ZERO,ONE,WORK(INDV),1,1,DESCTVT) ELSE NCVT = 0 END IF * CALL DBDSQR(UPLO,SIZE,NCVT,NRU,0,WORK(INDD2+IOFFD), + WORK(INDE2+IOFFE),WORK(INDV),SIZE,WORK(INDU),LDU,C,1, + WORK(INDWORK),INFO) * * Redistribute elements of U and VT in the block-cyclic fashion. * IF (WANTU.EQ.1) CALL PDGEMR2D(M,SIZE,WORK(INDU),1,1,DESCTU,U,IU, + JU,DESCU,DESCU(CTXT_)) * IF (WANTVT.EQ.1) CALL PDGEMR2D(SIZE,N,WORK(INDV),1,1,DESCTVT,VT, + IVT,JVT,DESCVT,DESCVT(CTXT_)) * * Set to ZERO "non-square" elements of the larger matrices U, VT. * IF (M.GT.N .AND. WANTU.EQ.1) THEN CALL PDLASET('Full',M-SIZE,SIZE,ZERO,ZERO,U,IA+SIZE,JU,DESCU) ELSE IF (N.GT.M .AND. WANTVT.EQ.1) THEN CALL PDLASET('Full',SIZE,N-SIZE,ZERO,ZERO,VT,IVT,JVT+SIZE, + DESCVT) END IF * * Multiply Householder rotations from bidiagonalized matrix. * IF (WANTU.EQ.1) CALL PDORMBR('Q','L','N',M,SIZE,N,A,IA,JA,DESCA, + WORK(INDTAUQ),U,IU,JU,DESCU, + WORK(INDWORK),LLWORK,INFO) * IF (WANTVT.EQ.1) CALL PDORMBR('P','R','T',SIZE,N,M,A,IA,JA,DESCA, + WORK(INDTAUP),VT,IVT,JVT,DESCVT, + WORK(INDWORK),LLWORK,INFO) * * Copy singular values into output array S. * DO 10 I = 1,SIZE S(I) = WORK(INDD2+IOFFD+I-1) 10 CONTINUE * * If matrix was scaled, then rescale singular values appropriately. * IF (ISCALE.EQ.1) THEN CALL DSCAL(SIZE,ONE/SIGMA,S,1) END IF * * Compare every ith eigenvalue, or all if there are only a few, * across the process grid to check for heterogeneity. * IF (SIZE.LE.ITHVAL) THEN J = SIZE K = 1 ELSE J = SIZE/ITHVAL K = ITHVAL END IF * DO 20 I = 1,J WORK(I+INDE) = S((I-1)*K+1) WORK(I+INDD2) = S((I-1)*K+1) 20 CONTINUE * CALL DGAMN2D(DESCA(CTXT_),'a',' ',J,1,WORK(1+INDE),J,1,1,-1,-1,0) CALL DGAMX2D(DESCA(CTXT_),'a',' ',J,1,WORK(1+INDD2),J,1,1,-1,-1,0) * DO 30 I = 1,J IF ((WORK(I+INDE)-WORK(I+INDD2)).NE.ZERO) THEN INFO = SIZE + 1 END IF 30 CONTINUE * 40 CONTINUE * CALL BLACS_GRIDEXIT(CONTEXTC) CALL BLACS_GRIDEXIT(CONTEXTR) * * End of PDGESVD * RETURN END scalapack-2.0.2/SRC/pdgesvx.f000644 000766 000024 00000104714 10363532303 016174 0ustar00juliestaff000000 000000 SUBROUTINE PDGESVX( FACT, TRANS, N, NRHS, A, IA, JA, DESCA, AF, $ IAF, JAF, DESCAF, IPIV, EQUED, R, C, B, IB, $ JB, DESCB, X, IX, JX, DESCX, RCOND, FERR, $ BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER EQUED, FACT, TRANS INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, LIWORK, $ LWORK, N, NRHS DOUBLE PRECISION RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IPIV( * ), IWORK( * ) DOUBLE PRECISION A( * ), AF( * ), B( * ), BERR( * ), C( * ), $ FERR( * ), R( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PDGESVX uses the LU factorization to compute the solution to a real * system of linear equations * * A(IA:IA+N-1,JA:JA+N-1) * X = B(IB:IB+N-1,JB:JB+NRHS-1), * * where A(IA:IA+N-1,JA:JA+N-1) is an N-by-N matrix and X and * B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS matrices. * * Error bounds on the solution and a condition estimate are also * provided. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Description * =========== * * In the following description, A denotes A(IA:IA+N-1,JA:JA+N-1), * B denotes B(IB:IB+N-1,JB:JB+NRHS-1) and X denotes * X(IX:IX+N-1,JX:JX+NRHS-1). * * The following steps are performed: * * 1. If FACT = 'E', real scaling factors are computed to equilibrate * the system: * TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B * TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B * TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') * or diag(C)*B (if TRANS = 'T' or 'C'). * * 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the * matrix A (after equilibration if FACT = 'E') as * A = P * L * U, * where P is a permutation matrix, L is a unit lower triangular * matrix, and U is upper triangular. * * 3. The factored form of A is used to estimate the condition number * of the matrix A. If the reciprocal of the condition number is * less than machine precision, steps 4-6 are skipped. * * 4. The system of equations is solved for X using the factored form * of A. * * 5. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 6. If FACT = 'E' and equilibration was used, the matrix X is * premultiplied by diag(C) (if TRANS = 'N') or diag(R) (if * TRANS = 'T' or 'C') so that it solves the original system * before equilibration. * * Arguments * ========= * * FACT (global input) CHARACTER * Specifies whether or not the factored form of the matrix * A(IA:IA+N-1,JA:JA+N-1) is supplied on entry, and if not, * whether the matrix A(IA:IA+N-1,JA:JA+N-1) should be * equilibrated before it is factored. * = 'F': On entry, AF(IAF:IAF+N-1,JAF:JAF+N-1) and IPIV con- * tain the factored form of A(IA:IA+N-1,JA:JA+N-1). * If EQUED is not 'N', the matrix * A(IA:IA+N-1,JA:JA+N-1) has been equilibrated with * scaling factors given by R and C. * A(IA:IA+N-1,JA:JA+N-1), AF(IAF:IAF+N-1,JAF:JAF+N-1), * and IPIV are not modified. * = 'N': The matrix A(IA:IA+N-1,JA:JA+N-1) will be copied to * AF(IAF:IAF+N-1,JAF:JAF+N-1) and factored. * = 'E': The matrix A(IA:IA+N-1,JA:JA+N-1) will be equili- * brated if necessary, then copied to * AF(IAF:IAF+N-1,JAF:JAF+N-1) and factored. * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': A(IA:IA+N-1,JA:JA+N-1) * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (No transpose) * = 'T': A(IA:IA+N-1,JA:JA+N-1)**T * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (Transpose) * = 'C': A(IA:IA+N-1,JA:JA+N-1)**H * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (Transpose) * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * NRHS (global input) INTEGER * The number of right-hand sides, i.e., the number of columns * of the distributed submatrices B(IB:IB+N-1,JB:JB+NRHS-1) and * X(IX:IX+N-1,JX:JX+NRHS-1). NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * the local memory to an array of local dimension * (LLD_A,LOCc(JA+N-1)). On entry, the N-by-N matrix * A(IA:IA+N-1,JA:JA+N-1). If FACT = 'F' and EQUED is not 'N', * then A(IA:IA+N-1,JA:JA+N-1) must have been equilibrated by * the scaling factors in R and/or C. A(IA:IA+N-1,JA:JA+N-1) is * not modified if FACT = 'F' or 'N', or if FACT = 'E' and * EQUED = 'N' on exit. * * On exit, if EQUED .ne. 'N', A(IA:IA+N-1,JA:JA+N-1) is scaled * as follows: * EQUED = 'R': A(IA:IA+N-1,JA:JA+N-1) := * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * EQUED = 'C': A(IA:IA+N-1,JA:JA+N-1) := * A(IA:IA+N-1,JA:JA+N-1) * diag(C) * EQUED = 'B': A(IA:IA+N-1,JA:JA+N-1) := * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * diag(C). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input or local output) DOUBLE PRECISION pointer * into the local memory to an array of local dimension * (LLD_AF,LOCc(JA+N-1)). If FACT = 'F', then * AF(IAF:IAF+N-1,JAF:JAF+N-1) is an input argument and on * entry contains the factors L and U from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U as computed by PDGETRF. * If EQUED .ne. 'N', then AF is the factored form of the * equilibrated matrix A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'N', then AF(IAF:IAF+N-1,JAF:JAF+N-1) is an output * argument and on exit returns the factors L and U from the * factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the original * matrix A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'E', then AF(IAF:IAF+N-1,JAF:JAF+N-1) is an output * argument and on exit returns the factors L and U from the * factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the equili- * brated matrix A(IA:IA+N-1,JA:JA+N-1) (see the description of * A(IA:IA+N-1,JA:JA+N-1) for the form of the equilibrated * matrix). * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * IPIV (local input or local output) INTEGER array, dimension * LOCr(M_A)+MB_A. If FACT = 'F', then IPIV is an input argu- * ment and on entry contains the pivot indices from the fac- * torization A(IA:IA+N-1,JA:JA+N-1) = P*L*U as computed by * PDGETRF; IPIV(i) -> The global row local row i was * swapped with. This array must be aligned with * A( IA:IA+N-1, * ). * * If FACT = 'N', then IPIV is an output argument and on exit * contains the pivot indices from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the original matrix * A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'E', then IPIV is an output argument and on exit * contains the pivot indices from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the equilibrated matrix * A(IA:IA+N-1,JA:JA+N-1). * * EQUED (global input or global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration (always true if FACT = 'N'). * = 'R': Row equilibration, i.e., A(IA:IA+N-1,JA:JA+N-1) has * been premultiplied by diag(R). * = 'C': Column equilibration, i.e., A(IA:IA+N-1,JA:JA+N-1) * has been postmultiplied by diag(C). * = 'B': Both row and column equilibration, i.e., * A(IA:IA+N-1,JA:JA+N-1) has been replaced by * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * diag(C). * EQUED is an input variable if FACT = 'F'; otherwise, it is an * output variable. * * R (local input or local output) DOUBLE PRECISION array, * dimension LOCr(M_A). * The row scale factors for A(IA:IA+N-1,JA:JA+N-1). * If EQUED = 'R' or 'B', A(IA:IA+N-1,JA:JA+N-1) is multiplied * on the left by diag(R); if EQUED='N' or 'C', R is not acces- * sed. R is an input variable if FACT = 'F'; otherwise, R is * an output variable. * If FACT = 'F' and EQUED = 'R' or 'B', each element of R must * be positive. * R is replicated in every process column, and is aligned * with the distributed matrix A. * * C (local input or local output) DOUBLE PRECISION array, * dimension LOCc(N_A). * The column scale factors for A(IA:IA+N-1,JA:JA+N-1). * If EQUED = 'C' or 'B', A(IA:IA+N-1,JA:JA+N-1) is multiplied * on the right by diag(C); if EQUED = 'N' or 'R', C is not * accessed. C is an input variable if FACT = 'F'; otherwise, * C is an output variable. If FACT = 'F' and EQUED = 'C' or * 'B', each element of C must be positive. * C is replicated in every process row, and is aligned with * the distributed matrix A. * * B (local input/local output) DOUBLE PRECISION pointer * into the local memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1) ). On entry, the N-by-NRHS right-hand * side matrix B(IB:IB+N-1,JB:JB+NRHS-1). On exit, if * EQUED = 'N', B(IB:IB+N-1,JB:JB+NRHS-1) is not modified; if * TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by * diag(R)*B(IB:IB+N-1,JB:JB+NRHS-1); if TRANS = 'T' or 'C' * and EQUED = 'C' or 'B', B(IB:IB+N-1,JB:JB+NRHS-1) is over- * written by diag(C)*B(IB:IB+N-1,JB:JB+NRHS-1). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input/local output) DOUBLE PRECISION pointer * into the local memory to an array of local dimension * (LLD_X, LOCc(JX+NRHS-1)). If INFO = 0, the N-by-NRHS * solution matrix X(IX:IX+N-1,JX:JX+NRHS-1) to the original * system of equations. Note that A(IA:IA+N-1,JA:JA+N-1) and * B(IB:IB+N-1,JB:JB+NRHS-1) are modified on exit if * EQUED .ne. 'N', and the solution to the equilibrated system * is inv(diag(C))*X(IX:IX+N-1,JX:JX+NRHS-1) if TRANS = 'N' * and EQUED = 'C' or 'B', or * inv(diag(R))*X(IX:IX+N-1,JX:JX+NRHS-1) if TRANS = 'T' or 'C' * and EQUED = 'R' or 'B'. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * RCOND (global output) DOUBLE PRECISION * The estimate of the reciprocal condition number of the matrix * A(IA:IA+N-1,JA:JA+N-1) after equilibration (if done). If * RCOND is less than the machine precision (in particular, if * RCOND = 0), the matrix is singular to working precision. * This condition is indicated by a return code of INFO > 0. * * FERR (local output) DOUBLE PRECISION array, dimension LOCc(N_B) * The estimated forward error bounds for each solution vector * X(j) (the j-th column of the solution matrix * X(IX:IX+N-1,JX:JX+NRHS-1). If XTRUE is the true solution, * FERR(j) bounds the magnitude of the largest entry in * (X(j) - XTRUE) divided by the magnitude of the largest entry * in X(j). The estimate is as reliable as the estimate for * RCOND, and is almost always a slight overestimate of the * true error. FERR is replicated in every process row, and is * aligned with the matrices B and X. * * BERR (local output) DOUBLE PRECISION array, dimension LOCc(N_B). * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any entry of A(IA:IA+N-1,JA:JA+N-1) or * B(IB:IB+N-1,JB:JB+NRHS-1) that makes X(j) an exact solution). * BERR is replicated in every process row, and is aligned * with the matrices B and X. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = MAX( PDGECON( LWORK ), PDGERFS( LWORK ) ) * + LOCr( N_A ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK = LOCr(N_A). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, and i is * <= N: U(IA+I-1,IA+I-1) is exactly zero. The * factorization has been completed, but the * factor U is exactly singular, so the solution * and error bounds could not be computed. * = N+1: RCOND is less than machine precision. The * factorization has been completed, but the * matrix is singular to working precision, and * the solution and error bounds have not been * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLEQU, EQUIL, LQUERY, NOFACT, NOTRAN, ROWEQU CHARACTER NORM INTEGER CONWRK, I, IACOL, IAROW, IAFROW, IBROW, IBCOL, $ ICOFFA, ICOFFB, ICOFFX, ICTXT, IDUMM, $ IIA, IIB, IIX, $ INFEQU, IROFFA, IROFFAF, IROFFB, $ IROFFX, IXCOL, IXROW, J, JJA, JJB, JJX, $ LCM, LCMQ, $ LIWMIN, LWMIN, MYCOL, MYROW, NP, NPCOL, NPROW, $ NQ, NQB, NRHSQ, RFSWRK DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, $ ROWCND, SMLNUM * .. * .. Local Arrays .. INTEGER CDESC( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ DGEBR2D, DGEBS2D, DGAMN2D, $ DGAMX2D, INFOG2L, PDCOPY, PDGECON, $ PDGEEQU, PDGERFS, PDGETRF, PDGETRS, $ PDLACPY, PDLAQGE, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC, PDLANGE, $ PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(800+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 8, INFO ) IF( LSAME( FACT, 'F' ) ) $ CALL CHK1MAT( N, 3, N, 3, IAF, JAF, DESCAF, 12, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 20, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IX, JX, DESCX, 24, INFO ) NOFACT = LSAME( FACT, 'N' ) EQUIL = LSAME( FACT, 'E' ) NOTRAN = LSAME( TRANS, 'N' ) IF( NOFACT .OR. EQUIL ) THEN EQUED = 'N' ROWEQU = .FALSE. COLEQU = .FALSE. ELSE ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM END IF IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IIA, JJA, IAROW, IACOL ) NP = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFFA NQB = ICEIL( N+IROFFA, DESCA( NB_ )*NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL CONWRK = 2*NP + 2*NQ + MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQ + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) RFSWRK = 3*NP IF( LSAME( TRANS, 'N' ) ) THEN RFSWRK = RFSWRK + NP + NQ + $ ICEIL( NQB, LCMQ )*DESCA( NB_ ) ELSE IF( LSAME( TRANS, 'T' ).OR.LSAME( TRANS, 'C' ) ) THEN RFSWRK = RFSWRK + NP + NQ END IF LWMIN = MAX( CONWRK, RFSWRK ) WORK( 1 ) = DBLE( LWMIN ) LIWMIN = NP IWORK( 1 ) = LIWMIN IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. $ .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT. LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( ICOFFA.NE.0 .OR. IROFFA.NE.ICOFFA ) THEN INFO = -7 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(800+NB_) ELSE IF( IAFROW.NE.IAROW ) THEN INFO = -10 ELSE IF( IROFFAF.NE.0 ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -(1200+CTXT_) ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN INFO = -13 ELSE IF( ROWEQU ) THEN RCMIN = BIGNUM RCMAX = ZERO DO 10 J = IIA, IIA + NP - 1 RCMIN = MIN( RCMIN, R( J ) ) RCMAX = MAX( RCMAX, R( J ) ) 10 CONTINUE CALL DGAMN2D( ICTXT, 'Columnwise', ' ', 1, 1, RCMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, RCMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( RCMIN.LE.ZERO ) THEN INFO = -14 ELSE IF( N.GT.0 ) THEN ROWCND = MAX( RCMIN, SMLNUM ) / $ MIN( RCMAX, BIGNUM ) ELSE ROWCND = ONE END IF END IF IF( COLEQU .AND. INFO.EQ.0 ) THEN RCMIN = BIGNUM RCMAX = ZERO DO 20 J = JJA, JJA+NQ-1 RCMIN = MIN( RCMIN, C( J ) ) RCMAX = MAX( RCMAX, C( J ) ) 20 CONTINUE CALL DGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, RCMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, RCMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( RCMIN.LE.ZERO ) THEN INFO = -15 ELSE IF( N.GT.0 ) THEN COLCND = MAX( RCMIN, SMLNUM ) / $ MIN( RCMAX, BIGNUM ) ELSE COLCND = ONE END IF END IF END IF END IF * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IBROW.NE.IAROW ) THEN INFO = -18 ELSE IF( IXROW.NE.IBROW ) THEN INFO = -22 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2000+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(2000+CTXT_) ELSE IF( DESCX( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2400+NB_) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -(2400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -29 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -31 END IF IDUM1( 1 ) = ICHAR( FACT ) IDUM2( 1 ) = 1 IDUM1( 2 ) = ICHAR( TRANS ) IDUM2( 2 ) = 2 IF( LSAME( FACT, 'F' ) ) THEN IDUM1( 3 ) = ICHAR( EQUED ) IDUM2( 3 ) = 14 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 29 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 31 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, $ NRHS, 4, IB, JB, DESCB, 20, 5, IDUM1, $ IDUM2, INFO ) ELSE IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 29 IF( LIWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 31 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, $ NRHS, 4, IB, JB, DESCB, 20, 4, IDUM1, $ IDUM2, INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGESVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * IF( EQUIL ) THEN * * Compute row and column scalings to equilibrate the matrix A. * CALL PDGEEQU( N, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, INFEQU ) IF( INFEQU.EQ.0 ) THEN * * Equilibrate the matrix. * CALL PDLAQGE( N, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, EQUED ) ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) END IF END IF * * Scale the right-hand side. * CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, $ JJB, IBROW, IBCOL ) NP = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) IF( MYROW.EQ.IBROW ) $ NP = NP-IROFFB IF( MYCOL.EQ.IBCOL ) $ NRHSQ = NRHSQ-ICOFFB * IF( NOTRAN ) THEN IF( ROWEQU ) THEN DO 40 J = JJB, JJB+NRHSQ-1 DO 30 I = IIB, IIB+NP-1 B( I+( J-1 )*DESCB( LLD_ ) ) = R( I )* $ B( I+( J-1 )*DESCB( LLD_ ) ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( COLEQU ) THEN * * Transpose the Column scale factors * CALL DESCSET( CDESC, 1, N+ICOFFA, 1, DESCA( NB_ ), MYROW, $ IACOL, ICTXT, 1 ) CALL PDCOPY( N, C, 1, JA, CDESC, CDESC( LLD_ ), WORK, IB, JB, $ DESCB, 1 ) IF( MYCOL.EQ.IBCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IIB ), $ DESCB( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IIB ), $ DESCB( LLD_ ), MYROW, IBCOL ) END IF DO 60 J = JJB, JJB+NRHSQ-1 DO 50 I = IIB, IIB+NP-1 B( I+( J-1 )*DESCB( LLD_ ) ) = WORK( I )* $ B( I+( J-1 )*DESCB( LLD_ ) ) 50 CONTINUE 60 CONTINUE END IF * IF( NOFACT.OR.EQUIL ) THEN * * Compute the LU factorization of A. * CALL PDLACPY( 'Full', N, N, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF ) CALL PDGETRF( N, N, AF, IAF, JAF, DESCAF, IPIV, INFO ) * * Return if INFO is non-zero. * IF( INFO.NE.0 ) THEN IF( INFO.GT.0 ) $ RCOND = ZERO RETURN END IF END IF * * Compute the norm of the matrix A. * IF( NOTRAN ) THEN NORM = '1' ELSE NORM = 'I' END IF ANORM = PDLANGE( NORM, N, N, A, IA, JA, DESCA, WORK ) * * Compute the reciprocal of the condition number of A. * CALL PDGECON( NORM, N, AF, IAF, JAF, DESCAF, ANORM, RCOND, WORK, $ LWORK, IWORK, LIWORK, INFO ) * * Return if the matrix is singular to working precision. * IF( RCOND.LT.PDLAMCH( ICTXT, 'Epsilon' ) ) THEN INFO = IA + N RETURN END IF * * Compute the solution matrix X. * CALL PDLACPY( 'Full', N, NRHS, B, IB, JB, DESCB, X, IX, JX, $ DESCX ) CALL PDGETRS( TRANS, N, NRHS, AF, IAF, JAF, DESCAF, IPIV, X, IX, $ JX, DESCX, INFO ) * * Use iterative refinement to improve the computed solution and * compute error bounds and backward error estimates for it. * CALL PDGERFS( TRANS, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, IPIV, B, IB, JB, DESCB, X, IX, JX, DESCX, $ FERR, BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * Transform the solution matrix X to a solution of the original * system. * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) NP = NUMROC( N+IROFFX, DESCX( MB_ ), MYROW, IXROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFFX, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYROW.EQ.IBROW ) $ NP = NP-IROFFX IF( MYCOL.EQ.IBCOL ) $ NRHSQ = NRHSQ-ICOFFX * IF( NOTRAN ) THEN IF( COLEQU ) THEN * * Transpose the column scaling factors * CALL DESCSET( CDESC, 1, N+ICOFFA, 1, DESCA( NB_ ), MYROW, $ IACOL, ICTXT, 1 ) CALL PDCOPY( N, C, 1, JA, CDESC, CDESC( LLD_ ), WORK, IX, $ JX, DESCX, 1 ) IF( MYCOL.EQ.IBCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, $ WORK( IIX ), DESCX( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, $ WORK( IIX ), DESCX( LLD_ ), MYROW, IBCOL ) END IF * DO 80 J = JJX, JJX+NRHSQ-1 DO 70 I = IIX, IIX+NP-1 X( I+( J-1 )*DESCX( LLD_ ) ) = WORK( I )* $ X( I+( J-1 )*DESCX( LLD_ ) ) 70 CONTINUE 80 CONTINUE DO 90 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / COLCND 90 CONTINUE END IF ELSE IF( ROWEQU ) THEN DO 110 J = JJX, JJX+NRHSQ-1 DO 100 I = IIX, IIX+NP-1 X( I+( J-1 )*DESCX( LLD_ ) ) = R( I )* $ X( I+( J-1 )*DESCX( LLD_ ) ) 100 CONTINUE 110 CONTINUE DO 120 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / ROWCND 120 CONTINUE END IF * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PDGESVX * END scalapack-2.0.2/SRC/pdgetf2.f000644 000766 000024 00000022646 10363532303 016052 0ustar00juliestaff000000 000000 SUBROUTINE PDGETF2( M, N, A, IA, JA, DESCA, IPIV, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDGETF2 computes an LU factorization of a general M-by-N * distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) using * partial pivoting with row interchanges. * * The factorization has the form sub( A ) = P * L * U, where P is a * permutation matrix, L is lower triangular with unit diagonal * elements (lower trapezoidal if m > n), and U is upper triangular * (upper trapezoidal if m < n). * * This is the right-looking Parallel Level 2 BLAS version of the * algorithm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires N <= NB_A-MOD(JA-1, NB_A) and square block * decomposition ( MB_A = NB_A ). * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). * NB_A-MOD(JA-1, NB_A) >= N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ). On exit, this array contains * the local pieces of the factors L and U from the factoriza- * tion sub( A ) = P*L*U; the unit diagonal elements of L are * not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, and division by zero will occur if * it is used to solve a system of equations. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER ROWBTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IIA, IROFF, J, $ JJA, MN, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION GMAX * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, IGEBR2D, $ IGEBS2D, INFOG2L, PDAMAX, PDGER, $ PDSCAL, PDSWAP, PB_TOPGET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( N+ICOFF.GT.DESCA( NB_ ) ) THEN INFO = -2 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGETF2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * MN = MIN( M, N ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) * IF( MYCOL.EQ.IACOL ) THEN DO 10 J = JA, JA+MN-1 I = IA + J - JA * * Find pivot and test for singularity. * CALL PDAMAX( M-J+JA, GMAX, IPIV( IIA+J-JA ), A, I, J, $ DESCA, 1 ) IF( GMAX.NE.ZERO ) THEN * * Apply the row interchanges to columns JA:JA+N-1 * CALL PDSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, $ IPIV( IIA+J-JA ), JA, DESCA, DESCA( M_ ) ) * * Compute elements I+1:IA+M-1 of J-th column. * IF( J-JA+1.LT.M ) $ CALL PDSCAL( M-J+JA-1, ONE / GMAX, A, I+1, J, $ DESCA, 1 ) ELSE IF( INFO.EQ.0 ) THEN INFO = J - JA + 1 END IF * * Update trailing submatrix * IF( J-JA+1.LT.MN ) THEN CALL PDGER( M-J+JA-1, N-J+JA-1, -ONE, A, I+1, J, DESCA, $ 1, A, I, J+1, DESCA, DESCA( M_ ), A, I+1, $ J+1, DESCA ) END IF 10 CONTINUE * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, MN, 1, IPIV( IIA ), $ MN ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, MN, 1, IPIV( IIA ), $ MN, MYROW, IACOL ) * END IF * RETURN * * End of PDGETF2 * END scalapack-2.0.2/SRC/pdgetrf.f000644 000766 000024 00000026457 10363532303 016156 0ustar00juliestaff000000 000000 SUBROUTINE PDGETRF( M, N, A, IA, JA, DESCA, IPIV, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDGETRF computes an LU factorization of a general M-by-N distributed * matrix sub( A ) = (IA:IA+M-1,JA:JA+N-1) using partial pivoting with * row interchanges. * * The factorization has the form sub( A ) = P * L * U, where P is a * permutation matrix, L is lower triangular with unit diagonal ele- * ments (lower trapezoidal if m > n), and U is upper triangular * (upper trapezoidal if m < n). L and U are stored in sub( A ). * * This is the right-looking Parallel Level 3 BLAS version of the * algorithm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be factored. On exit, this * array contains the local pieces of the factors L and U from * the factorization sub( A ) = P*L*U; the unit diagonal ele- * ments of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, and division by zero will occur if * it is used to solve a system of equations. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, COLCTOP, ROWBTOP INTEGER I, ICOFF, ICTXT, IINFO, IN, IROFF, J, JB, JN, $ MN, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMN2D, PCHK1MAT, $ PB_TOPGET, PB_TOPSET, PDGEMM, PDGETF2, $ PDLASWP, PDTRSM, PXERBLA * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 0, IDUM1, $ IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGETRF', -INFO ) RETURN END IF * * Quick return if possible * IF( DESCA( M_ ).EQ.1 ) THEN IPIV( 1 ) = 1 RETURN ELSE IF( M.EQ.0 .OR. N.EQ.0 ) THEN RETURN END IF * * Split-ring topology for the communication along process rows * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', ' ' ) * * Handle the first block of columns separately * MN = MIN( M, N ) IN = MIN( ICEIL( IA, DESCA( MB_ ) )*DESCA( MB_ ), IA+M-1 ) JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+MN-1 ) JB = JN - JA + 1 * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL PDGETF2( M, JB, A, IA, JA, DESCA, IPIV, INFO ) * IF( JB+1.LE.N ) THEN * * Apply interchanges to columns JN+1:JA+N-1. * CALL PDLASWP( 'Forward', 'Rows', N-JB, A, IA, JN+1, DESCA, $ IA, IN, IPIV ) * * Compute block row of U. * CALL PDTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-JB, ONE, A, IA, JA, DESCA, A, IA, JN+1, DESCA ) * IF( JB+1.LE.M ) THEN * * Update trailing submatrix. * CALL PDGEMM( 'No transpose', 'No transpose', M-JB, N-JB, JB, $ -ONE, A, IN+1, JA, DESCA, A, IA, JN+1, DESCA, $ ONE, A, IN+1, JN+1, DESCA ) * END IF END IF * * Loop over the remaining blocks of columns. * DO 10 J = JN+1, JA+MN-1, DESCA( NB_ ) JB = MIN( MN-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL PDGETF2( M-J+JA, JB, A, I, J, DESCA, IPIV, IINFO ) * IF( INFO.EQ.0 .AND. IINFO.GT.0 ) $ INFO = IINFO + J - JA * * Apply interchanges to columns JA:J-JA. * CALL PDLASWP( 'Forward', 'Rowwise', J-JA, A, IA, JA, DESCA, $ I, I+JB-1, IPIV ) * IF( J-JA+JB+1.LE.N ) THEN * * Apply interchanges to columns J+JB:JA+N-1. * CALL PDLASWP( 'Forward', 'Rowwise', N-J-JB+JA, A, IA, J+JB, $ DESCA, I, I+JB-1, IPIV ) * * Compute block row of U. * CALL PDTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-J-JB+JA, ONE, A, I, J, DESCA, A, I, J+JB, $ DESCA ) * IF( J-JA+JB+1.LE.M ) THEN * * Update trailing submatrix. * CALL PDGEMM( 'No transpose', 'No transpose', M-J-JB+JA, $ N-J-JB+JA, JB, -ONE, A, I+JB, J, DESCA, A, $ I, J+JB, DESCA, ONE, A, I+JB, J+JB, DESCA ) * END IF END IF * 10 CONTINUE * IF( INFO.EQ.0 ) $ INFO = MN + 1 CALL IGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, INFO, 1, IDUM1, IDUM2, $ -1, -1, MYCOL ) IF( INFO.EQ.MN+1 ) $ INFO = 0 * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * RETURN * * End of PDGETRF * END scalapack-2.0.2/SRC/pdgetri.f000644 000766 000024 00000035424 10430435051 016150 0ustar00juliestaff000000 000000 SUBROUTINE PDGETRI( N, A, IA, JA, DESCA, IPIV, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7.4) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * v1.7.4: May 10, 2006 * v1.7: May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ), IWORK( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDGETRI computes the inverse of a distributed matrix using the LU * factorization computed by PDGETRF. This method inverts U and then * computes the inverse of sub( A ) = A(IA:IA+N-1,JA:JA+N-1) denoted * InvA by solving the system InvA*L = inv(U) for InvA. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the L and U obtained by the * factorization sub( A ) = P*L*U computed by PDGETRF. On * exit, if INFO = 0, sub( A ) contains the inverse of the * original distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension LOCr(M_A)+MB_A * keeps track of the pivoting information. IPIV(i) is the * global row index the local row i was swapped with. This * array is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = LOCr(N+MOD(IA-1,MB_A))*NB_A. WORK is used to keep a * copy of at most an entire column block of sub( A ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK used as workspace for * physically transposing the pivots. * LIWORK is local input and must be at least * if NPROW == NPCOL then * LIWORK = LOCc( N_A + MOD(JA-1, NB_A) ) + NB_A, * else * LIWORK = LOCc( N_A + MOD(JA-1, NB_A) ) + * MAX( CEIL(CEIL(LOCr(M_A)/MB_A)/(LCM/NPROW)), * NB_A ) * where LCM is the least common multiple of process * rows and columns (NPROW and NPCOL). * end if * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,IA+K-1) is exactly zero; the * matrix is singular and its inverse could not be * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IROFF, IW, J, $ JB, JN, LCM, LIWMIN, LWMIN, MP, MYCOL, MYROW, $ NN, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PDGEMM, PDLACPY, PDLASET, PDLAPIV, $ PDTRSM, PDTRTRI, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(500+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) LWMIN = NP * DESCA( NB_ ) * MP = NUMROC( DESCA( M_ ), DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) NQ = NUMROC( DESCA( N_ ), DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) IF( NPROW.EQ.NPCOL ) THEN LIWMIN = NQ + DESCA( NB_ ) ELSE * * Use the formula for the workspace given in PxLAPIV * to compute the minimum size LIWORK for IWORK * * The formula in PxLAPIV is * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + * MB_P * CEIL( CEIL(LOCr(M_P)/MB_P) / (LCM/NPROW) ) * * where * M_P is the global length of the pivot vector * MP = DESCA( M_ ) + DESCA( MB_ ) * NPROW * I_P is IA * I_P = IA * MB_P is the block size use for the block cyclic distribution of the * pivot vector * MB_P = DESCA (MB_ ) * LOCc ( . ) * NUMROC ( . , DESCA ( NB_ ), MYCOL, DESCA ( CSRC_ ), NPCOL ) * LOCr ( . ) * NUMROC ( . , DESCA ( MB_ ), MYROW, DESCA ( RSRC_ ), NPROW ) * CEIL ( X / Y ) * ICEIL( X, Y ) * LCM * LCM = ILCM( NPROW, NPCOL ) * LCM = ILCM( NPROW, NPCOL ) LIWMIN = NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW $ + MOD ( IA - 1, DESCA( MB_ ) ), DESCA ( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ) + $ MAX ( DESCA( MB_ ) * ICEIL ( ICEIL( $ NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW, $ DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ), $ DESCA( MB_ ) ), LCM / NPROW ), DESCA( NB_ ) ) * END IF * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -4 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(500+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -8 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 8 IF( LIWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGETRI', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Form inv(U). If INFO > 0 from PDTRTRI, then U is singular, * and the inverse is not computed. * CALL PDTRTRI( 'Upper', 'Non-unit', N, A, IA, JA, DESCA, INFO ) IF( INFO.GT.0 ) $ RETURN * * Define array descriptor for working array WORK * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) NN = ( ( JA+N-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1 IACOL = INDXG2P( NN, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCW, N+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, NP ) ) IW = IROFF + 1 * * Solve the equation inv(A)*L=inv(U) for inv(A) using blocked code. * DO 10 J = NN, JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * * Copy current block column of L to WORK and replace with zeros. * CALL PDLACPY( 'Lower', JA+N-1-J, JB, A, I+1, J, DESCA, $ WORK, IW+J-JA+1, 1, DESCW ) CALL PDLASET( 'Lower', JA+N-1-J, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Compute current block column of inv(A). * IF( J+JB.LE.JA+N-1 ) $ CALL PDGEMM( 'No transpose', 'No transpose', N, JB, $ JA+N-J-JB, -ONE, A, IA, J+JB, DESCA, WORK, $ IW+J+JB-JA, 1, DESCW, ONE, A, IA, J, DESCA ) CALL PDTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, $ ONE, WORK, IW+J-JA, 1, DESCW, A, IA, J, DESCA ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle the last block of columns separately * JB = JN-JA+1 * * Copy current block column of L to WORK and replace with zeros. * CALL PDLACPY( 'Lower', N-1, JB, A, IA+1, JA, DESCA, WORK, IW+1, $ 1, DESCW ) CALL PDLASET( 'Lower', N-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Compute current block column of inv(A). * IF( JA+JB.LE.JA+N-1 ) $ CALL PDGEMM( 'No transpose', 'No transpose', N, JB, $ N-JB, -ONE, A, IA, JA+JB, DESCA, WORK, IW+JB, 1, $ DESCW, ONE, A, IA, JA, DESCA ) CALL PDTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, $ ONE, WORK, IW, 1, DESCW, A, IA, JA, DESCA ) * * Use the row pivots and apply them to the columns of the global * matrix. * * * JL: I do not get why the size of the PIVOT vector is DESCA( M_ ) + DESCA( MB_ )*NPROW * should be DESCA( M_ ) + DESCA( MB_ ) no? * CALL DESCSET( DESCW, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ MP+DESCA( MB_ ) ) CALL PDLAPIV( 'Backward', 'Columns', 'Column', N, N, A, IA, $ JA, DESCA, IPIV, IA, 1, DESCW, IWORK ) * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PDGETRI * END scalapack-2.0.2/SRC/pdgetrs.f000644 000766 000024 00000026120 10363532303 016156 0ustar00juliestaff000000 000000 SUBROUTINE PDGETRS( TRANS, N, NRHS, A, IA, JA, DESCA, IPIV, B, $ IB, JB, DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Purpose * ======= * * PDGETRS solves a system of distributed linear equations * * op( sub( A ) ) * X = sub( B ) * * with a general N-by-N distributed matrix sub( A ) using the LU * factorization computed by PDGETRF. * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), op( A ) = A or A**T and * sub( B ) denotes B(IB:IB+N-1,JB:JB+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block data decomposition ( MB_A=NB_A ). * * Arguments * ========= * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': sub( A ) * X = sub( B ) (No transpose) * = 'T': sub( A )**T * X = sub( B ) (Transpose) * = 'C': sub( A )**T * X = sub( B ) (Transpose) * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) DOUBLE PRECISION pointer into the local * memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the factors * L and U from the factorization sub( A ) = P*L*U; the unit * diagonal elements of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * B (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, the right hand sides * sub( B ). On exit, sub( B ) is overwritten by the solution * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCIP( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ PDLAPIV, PDTRSM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE NOTRAN = LSAME( TRANS, 'N' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1200+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(1200+CTXT_) END IF END IF IF( NOTRAN ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 1 ) = ICHAR( 'T' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, 3, $ IB, JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGETRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * CALL DESCSET( DESCIP, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ DESCA( MB_ ) + NUMROC( DESCA( M_ ), DESCA( MB_ ), $ MYROW, DESCA( RSRC_ ), NPROW ) ) * IF( NOTRAN ) THEN * * Solve sub( A ) * X = sub( B ). * * Apply row interchanges to the right hand sides. * CALL PDLAPIV( 'Forward', 'Row', 'Col', N, NRHS, B, IB, JB, $ DESCB, IPIV, IA, 1, DESCIP, IDUM1 ) * * Solve L*X = sub( B ), overwriting sub( B ) with X. * CALL PDTRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve U*X = sub( B ), overwriting sub( B ) with X. * CALL PDTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) ELSE * * Solve sub( A )' * X = sub( B ). * * Solve U'*X = sub( B ), overwriting sub( B ) with X. * CALL PDTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve L'*X = sub( B ), overwriting sub( B ) with X. * CALL PDTRSM( 'Left', 'Lower', 'Transpose', 'Unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Apply row interchanges to the solution vectors. * CALL PDLAPIV( 'Backward', 'Row', 'Col', N, NRHS, B, IB, JB, $ DESCB, IPIV, IA, 1, DESCIP, IDUM1 ) * END IF * RETURN * * End of PDGETRS * END scalapack-2.0.2/SRC/pdggqrf.f000644 000766 000024 00000036277 10363532303 016156 0ustar00juliestaff000000 000000 SUBROUTINE PDGGQRF( N, M, P, A, IA, JA, DESCA, TAUA, B, IB, JB, $ DESCB, TAUB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, P * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), TAUA( * ), TAUB( * ), WORK( * ) * .. * * Purpose * ======= * * PDGGQRF computes a generalized QR factorization of * an N-by-M matrix sub( A ) = A(IA:IA+N-1,JA:JA+M-1) and * an N-by-P matrix sub( B ) = B(IB:IB+N-1,JB:JB+P-1): * * sub( A ) = Q*R, sub( B ) = Q*T*Z, * * where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal * matrix, and R and T assume one of the forms: * * if N >= M, R = ( R11 ) M , or if N < M, R = ( R11 R12 ) N, * ( 0 ) N-M N M-N * M * * where R11 is upper triangular, and * * if N <= P, T = ( 0 T12 ) N, or if N > P, T = ( T11 ) N-P, * P-N N ( T21 ) P * P * * where T12 or T21 is upper triangular. * * In particular, if sub( B ) is square and nonsingular, the GQR * factorization of sub( A ) and sub( B ) implicitly gives the QR * factorization of inv( sub( B ) )* sub( A ): * * inv( sub( B ) )*sub( A )= Z'*(inv(T)*R) * * where inv( sub( B ) ) denotes the inverse of the matrix sub( B ), * and Z' denotes the transpose of matrix Z. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrices sub( A ) and sub( B ). N >= 0. * * M (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). M >= 0. * * P (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( B ). P >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+M-1)). * On entry, the local pieces of the N-by-M distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(N,M) by M * upper trapezoidal matrix R (R is upper triangular if N >= M); * the elements below the diagonal, with the array TAUA, * represent the orthogonal matrix Q as a product of min(N,M) * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAUA (local output) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(N,M)-1). This array contains the scalar factors * TAUA of the elementary reflectors which represent the * orthogonal matrix Q. TAUA is tied to the distributed matrix * A. (see Further Details). * * B (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+P-1)). * On entry, the local pieces of the N-by-P distributed matrix * sub( B ) which is to be factored. On exit, if N <= P, the * upper triangle of B(IB:IB+N-1,JB+P-N:JB+P-1) contains the * N by N upper triangular matrix T; if N > P, the elements on * and above the (N-P)-th subdiagonal contain the N by P upper * trapezoidal matrix T; the remaining elements, with the array * TAUB, represent the orthogonal matrix Z as a product of * elementary reflectors (see Further Details). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * TAUB (local output) DOUBLE PRECISION array, dimension LOCr(IB+N-1) * This array contains the scalar factors of the elementary * reflectors which represent the orthogonal unitary matrix Z. * TAUB is tied to the distributed matrix B (see Further * Details). * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB_A * ( NpA0 + MqA0 + NB_A ), * MAX( (NB_A*(NB_A-1))/2, (PqB0 + NpB0)*NB_A ) + * NB_A * NB_A, * MB_B * ( NpB0 + PqB0 + MB_B ) ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * NpB0 = NUMROC( N+IROFFB, MB_B, MYROW, IBROW, NPROW ), * PqB0 = NUMROC( P+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(n,m). * * Each H(i) has the form * * H(i) = I - taua * v * v' * * where taua is a real scalar, and v is a real vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in * A(ia+i:ia+n-1,ja+i-1), and taua in TAUA(ja+i-1). * To form Q explicitly, use ScaLAPACK subroutine PDORGQR. * To use Q to update another matrix, use ScaLAPACK subroutine PDORMQR. * * The matrix Z is represented as a product of elementary reflectors * * Z = H(ib) H(ib+1) . . . H(ib+k-1), where k = min(n,p). * * Each H(i) has the form * * H(i) = I - taub * v * v' * * where taub is a real scalar, and v is a real vector with * v(p-k+i+1:p) = 0 and v(p-k+i) = 1; v(1:p-k+i-1) is stored on exit in * B(ib+n-k+i-1,jb:jb+p-k+i-2), and taub in TAUB(ib+n-k+i-1). * To form Z explicitly, use ScaLAPACK subroutine PDORGRQ. * To use Z to update another matrix, use ScaLAPACK subroutine PDORMRQ. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ) and sub( B ) must verify some * alignment properties, namely the following expression should be true: * * ( MB_A.EQ.MB_B .AND. IROFFA.EQ.IROFFB .AND. IAROW.EQ.IBROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, LWMIN, MQA0, MYCOL, $ MYROW, NPA0, NPB0, NPCOL, NPROW, PQB0 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDGEQRF, $ PDGERQF, PDORMQR, PXERBLA * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -707 ELSE CALL CHK1MAT( N, 1, M, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 1, P, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) NPB0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) PQB0 = NUMROC( P+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) LWMIN = MAX( DESCA( NB_ ) * ( NPA0 + MQA0 + DESCA( NB_ ) ), $ MAX( MAX( ( DESCA( NB_ )*( DESCA( NB_ ) - 1 ) ) / 2, $ ( PQB0 + NPB0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ), $ DESCB( MB_ ) * ( NPB0 + PQB0 + DESCB( MB_ ) ) ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( IAROW.NE.IBROW .OR. IROFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -1203 ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -1207 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 15 CALL PCHK2MAT( N, 1, M, 2, IA, JA, DESCA, 7, N, 1, P, 3, IB, $ JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGGQRF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * QR factorization of N-by-M matrix sub( A ): sub( A ) = Q*R * CALL PDGEQRF( N, M, A, IA, JA, DESCA, TAUA, WORK, LWORK, INFO ) LWMIN = INT( WORK( 1 ) ) * * Update sub( B ) := Q'*sub( B ). * CALL PDORMQR( 'Left', 'Transpose', N, P, MIN( N, M ), A, IA, JA, $ DESCA, TAUA, B, IB, JB, DESCB, WORK, LWORK, INFO ) LWMIN = MIN( LWMIN, INT( WORK( 1 ) ) ) * * RQ factorization of N-by-P matrix sub( B ): sub( B ) = T*Z. * CALL PDGERQF( N, P, B, IB, JB, DESCB, TAUB, WORK, LWORK, INFO ) WORK( 1 ) = DBLE( MAX( LWMIN, INT( WORK( 1 ) ) ) ) * RETURN * * End of PDGGQRF * END scalapack-2.0.2/SRC/pdggrqf.f000644 000766 000024 00000036331 10363532303 016145 0ustar00juliestaff000000 000000 SUBROUTINE PDGGRQF( M, P, N, A, IA, JA, DESCA, TAUA, B, IB, JB, $ DESCB, TAUB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, P * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), TAUA( * ), TAUB( * ), WORK( * ) * .. * * Purpose * ======= * * PDGGRQF computes a generalized RQ factorization of * an M-by-N matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) * and a P-by-N matrix sub( B ) = B(IB:IB+P-1,JB:JB+N-1): * * sub( A ) = R*Q, sub( B ) = Z*T*Q, * * where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal * matrix, and R and T assume one of the forms: * * if M <= N, R = ( 0 R12 ) M, or if M > N, R = ( R11 ) M-N, * N-M M ( R21 ) N * N * * where R12 or R21 is upper triangular, and * * if P >= N, T = ( T11 ) N , or if P < N, T = ( T11 T12 ) P, * ( 0 ) P-N P N-P * N * * where T11 is upper triangular. * * In particular, if sub( B ) is square and nonsingular, the GRQ * factorization of sub( A ) and sub( B ) implicitly gives the RQ * factorization of sub( A )*inv( sub( B ) ): * * sub( A )*inv( sub( B ) ) = (R*inv(T))*Z' * * where inv( sub( B ) ) denotes the inverse of the matrix sub( B ), * and Z' denotes the transpose of matrix Z. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of * rows of the distributed submatrix sub( A ). M >= 0. * * P (global input) INTEGER * The number of rows to be operated on i.e the number of * rows of the distributed submatrix sub( B ). P >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrices sub( A ) and sub( B ). * N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAUA, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAUA (local output) DOUBLE PRECISION array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors which represent the orthogonal unitary matrix Q. * TAUA is tied to the distributed matrix A (see Further * Details). * * B (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+N-1)). * On entry, the local pieces of the P-by-N distributed matrix * sub( B ) which is to be factored. On exit, the elements on * and above the diagonal of sub( B ) contain the min(P,N) by N * upper trapezoidal matrix T (T is upper triangular if P >= N); * the elements below the diagonal, with the array TAUB, * represent the orthogonal matrix Z as a product of elementary * reflectors (see Further Details). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * TAUB (local output) DOUBLE PRECISION array, dimension * LOCc(JB+MIN(P,N)-1). This array contains the scalar factors * TAUB of the elementary reflectors which represent the * orthogonal matrix Z. TAUB is tied to the distributed matrix * B (see Further Details). * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( MB_A * ( MpA0 + NqA0 + MB_A ), * MAX( (MB_A*(MB_A-1))/2, (PpB0 + NqB0)*MB_A ) + * MB_A * MB_A, * NB_B * ( PpB0 + NqB0 + NB_B ) ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * PpB0 = NUMROC( P+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NqB0 = NUMROC( N+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia) H(ia+1) . . . H(ia+k-1), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - taua * v * v' * * where taua is a real scalar, and v is a real vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; v(1:n-k+i-1) is stored on exit in * A(ia+m-k+i-1,ja:ja+n-k+i-2), and taua in TAUA(ia+m-k+i-1). * To form Q explicitly, use ScaLAPACK subroutine PDORGRQ. * To use Q to update another matrix, use ScaLAPACK subroutine PDORMRQ. * * The matrix Z is represented as a product of elementary reflectors * * Z = H(jb) H(jb+1) . . . H(jb+k-1), where k = min(p,n). * * Each H(i) has the form * * H(i) = I - taub * v * v' * * where taub is a real scalar, and v is a real vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:p) is stored on exit in * B(ib+i:ib+p-1,jb+i-1), and taub in TAUB(jb+i-1). * To form Z explicitly, use ScaLAPACK subroutine PDORGQR. * To use Z to update another matrix, use ScaLAPACK subroutine PDORMQR. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ) and sub( B ) must verify some * alignment properties, namely the following expression should be true: * * ( NB_A.EQ.NB_B .AND. ICOFFA.EQ.ICOFFB .AND. IACOL.EQ.IBCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. Local Scalars .. LOGICAL LQUERY INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, LWMIN, MPA0, MYCOL, $ MYROW, NPCOL, NPROW, NQA0, NQB0, PPB0 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDGEQRF, $ PDGERQF, PDORMRQ, PXERBLA * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, INT, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -707 ELSE CALL CHK1MAT( M, 1, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( P, 2, N, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) PPB0 = NUMROC( P+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) NQB0 = NUMROC( N+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) LWMIN = MAX( DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ), $ MAX( MAX( ( DESCA( MB_ )*( DESCA( MB_ ) - 1 ) ) / 2, $ ( PPB0 + NQB0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ), $ DESCB( NB_ ) * ( PPB0 + NQB0 + DESCB( NB_ ) ) ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( IACOL.NE.IBCOL .OR. ICOFFA.NE.ICOFFB ) THEN INFO = -11 ELSE IF( DESCA( NB_ ).NE.DESCB( NB_ ) ) THEN INFO = -1204 ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -1207 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 15 CALL PCHK2MAT( M, 1, N, 3, IA, JA, DESCA, 7, P, 2, N, 3, IB, $ JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDGGRQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * RQ factorization of M-by-N matrix sub( A ): sub( A ) = R*Q * CALL PDGERQF( M, N, A, IA, JA, DESCA, TAUA, WORK, LWORK, INFO ) LWMIN = INT( WORK( 1 ) ) * * Update sub( B ) := sub( B )*Q' * CALL PDORMRQ( 'Right', 'Transpose', P, N, MIN( M, N ), A, $ MAX( IA, IA+M-N ), JA, DESCA, TAUA, B, IB, JB, $ DESCB, WORK, LWORK, INFO ) LWMIN = MAX( LWMIN, INT( WORK( 1 ) ) ) * * QR factorization of P-by-N matrix sub( B ): sub( B ) = Z*T * CALL PDGEQRF( P, N, B, IB, JB, DESCB, TAUB, WORK, LWORK, INFO ) WORK( 1 ) = DBLE( MAX( LWMIN, INT( WORK( 1 ) ) ) ) * RETURN * * End of PDGGRQF * END scalapack-2.0.2/SRC/pdhseqr.f000644 000766 000024 00000066737 11705175572 016212 0ustar00juliestaff000000 000000 SUBROUTINE PDHSEQR( JOB, COMPZ, N, ILO, IHI, H, DESCH, WR, WI, Z, $ DESCZ, WORK, LWORK, IWORK, LIWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK driver routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IHI, ILO, INFO, LWORK, LIWORK, N CHARACTER COMPZ, JOB * .. * .. Array Arguments .. INTEGER DESCH( * ) , DESCZ( * ), IWORK( * ) DOUBLE PRECISION H( * ), WI( N ), WORK( * ), WR( N ), Z( * ) * .. * Purpose * ======= * * PDHSEQR computes the eigenvalues of an upper Hessenberg matrix H * and, optionally, the matrices T and Z from the Schur decomposition * H = Z*T*Z**T, where T is an upper quasi-triangular matrix (the * Schur form), and Z is the orthogonal matrix of Schur vectors. * * Optionally Z may be postmultiplied into an input orthogonal * matrix Q so that this routine can give the Schur factorization * of a matrix A which has been reduced to the Hessenberg form H * by the orthogonal matrix Q: A = Q*H*Q**T = (QZ)*T*(QZ)**T. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * JOB (global input) CHARACTER*1 * = 'E': compute eigenvalues only; * = 'S': compute eigenvalues and the Schur form T. * * COMPZ (global input) CHARACTER*1 * = 'N': no Schur vectors are computed; * = 'I': Z is initialized to the unit matrix and the matrix Z * of Schur vectors of H is returned; * = 'V': Z must contain an orthogonal matrix Q on entry, and * the product Q*Z is returned. * * N (global input) INTEGER * The order of the Hessenberg matrix H (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that H is already upper triangular in rows * and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally * set by a previous call to PDGEBAL, and then passed to PDGEHRD * when the matrix output by PDGEBAL is reduced to Hessenberg * form. Otherwise ILO and IHI should be set to 1 and N * respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. * If N = 0, then ILO = 1 and IHI = 0. * * H (global input/output) DOUBLE PRECISION array, dimension * (DESCH(LLD_),*) * On entry, the upper Hessenberg matrix H. * On exit, if JOB = 'S', H is upper quasi-triangular in * rows and columns ILO:IHI, with 1-by-1 and 2-by-2 blocks on * the main diagonal. The 2-by-2 diagonal blocks (corresponding * to complex conjugate pairs of eigenvalues) are returned in * standard form, with H(i,i) = H(i+1,i+1) and * H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the * contents of H are unspecified on exit. * * DESCH (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix H. * * WR (global output) DOUBLE PRECISION array, dimension (N) * WI (global output) DOUBLE PRECISION array, dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If JOB = 'S', the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in H. * * Z (global input/output) DOUBLE PRECISION array. * If COMPZ = 'V', on entry Z must contain the current * matrix Z of accumulated transformations from, e.g., PDGEHRD, * and on exit Z has been updated; transformations are applied * only to the submatrix Z(ILO:IHI,ILO:IHI). * If COMPZ = 'N', Z is not referenced. * If COMPZ = 'I', on entry Z need not be set and on exit, * if INFO = 0, Z contains the orthogonal matrix Z of the Schur * vectors of H. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace) DOUBLE PRECISION array, dimension(LWORK) * * LWORK (local input) INTEGER * The length of the workspace array WORK. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the workspace array IWORK. * * INFO (output) INTEGER * = 0: successful exit * .LT. 0: if INFO = -i, the i-th argument had an illegal * value (see also below for -7777 and -8888). * .GT. 0: if INFO = i, PDHSEQR failed to compute all of * the eigenvalues. Elements 1:ilo-1 and i+1:n of WR * and WI contain those eigenvalues which have been * successfully computed. (Failures are rare.) * * If INFO .GT. 0 and JOB = 'E', then on exit, the * remaining unconverged eigenvalues are the eigen- * values of the upper Hessenberg matrix rows and * columns ILO through INFO of the final, output * value of H. * * If INFO .GT. 0 and JOB = 'S', then on exit * * (*) (initial value of H)*U = U*(final value of H) * * where U is an orthogonal matrix. The final * value of H is upper Hessenberg and quasi-triangular * in rows and columns INFO+1 through IHI. * * If INFO .GT. 0 and COMPZ = 'V', then on exit * * (final value of Z) = (initial value of Z)*U * * where U is the orthogonal matrix in (*) (regard- * less of the value of JOB.) * * If INFO .GT. 0 and COMPZ = 'I', then on exit * (final value of Z) = U * where U is the orthogonal matrix in (*) (regard- * less of the value of JOB.) * * If INFO .GT. 0 and COMPZ = 'N', then Z is not * accessed. * * = -7777: PDLAQR0 failed to converge and PDLAQR1 was called * instead. This could happen. Mostly due to a bug. * Please, send a bug report to the authors. * = -8888: PDLAQR1 failed to converge and PDLAQR0 was called * instead. This should not happen. * * ================================================================ * Based on contributions by * Robert Granat, Department of Computing Science and HPC2N, * Umea University, Sweden. * ================================================================ * * Restrictions: The block size in H and Z must be square and larger * than or equal to six (6) due to restrictions in PDLAQR1, PDLAQR5 * and DLAQR6. Moreover, H and Z need to be distributed identically * with the same context. * * ================================================================ * References: * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part I: Maintaining Well Focused * Shifts, and Level 3 Performance. * SIAM J. Matrix Anal. Appl., 23(4):929--947, 2002. * * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part II: Aggressive Early * Deflation. * SIAM J. Matrix Anal. Appl., 23(4):948--973, 2002. * * R. Granat, B. Kagstrom, and D. Kressner, * A Novel Parallel QR Algorithm for Hybrid Distributed Momory HPC * Systems. * SIAM J. Sci. Comput., 32(4):2345--2378, 2010. * * ================================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ LOGICAL CRSOVER PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, $ CRSOVER = .TRUE. ) INTEGER NTINY PARAMETER ( NTINY = 11 ) INTEGER NL PARAMETER ( NL = 49 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. INTEGER I, KBOT, NMIN, LLDH, LLDZ, ICTXT, NPROW, NPCOL, $ MYROW, MYCOL, HROWS, HCOLS, IPW, NH, NB, $ II, JJ, HRSRC, HCSRC, NPROCS, ILOC1, JLOC1, $ HRSRC1, HCSRC1, K, ILOC2, JLOC2, ILOC3, JLOC3, $ ILOC4, JLOC4, HRSRC2, HCSRC2, HRSRC3, HCSRC3, $ HRSRC4, HCSRC4, LIWKOPT LOGICAL INITZ, LQUERY, WANTT, WANTZ, PAIR, BORDER DOUBLE PRECISION TMP1, TMP2, TMP3, TMP4, DUM1, DUM2, DUM3, $ DUM4, ELEM1, ELEM2, ELEM3, ELEM4, $ CS, SN, ELEM5, TMP, LWKOPT * .. * .. Local Arrays .. INTEGER DESCH2( DLEN_ ) * .. * .. External Functions .. INTEGER PILAENVX, NUMROC, ICEIL LOGICAL LSAME EXTERNAL PILAENVX, LSAME, NUMROC, ICEIL * .. * .. External Subroutines .. EXTERNAL PDLACPY, PDLAQR1, PDLAQR0, PDLASET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Executable Statements .. * * Decode and check the input parameters. * INFO = 0 ICTXT = DESCH( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL IF( NPROW.EQ.-1 ) INFO = -(600+CTXT_) IF( INFO.EQ.0 ) THEN WANTT = LSAME( JOB, 'S' ) INITZ = LSAME( COMPZ, 'I' ) WANTZ = INITZ .OR. LSAME( COMPZ, 'V' ) LLDH = DESCH( LLD_ ) LLDZ = DESCZ( LLD_ ) NB = DESCH( MB_ ) LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.LSAME( JOB, 'E' ) .AND. .NOT.WANTT ) THEN INFO = -1 ELSE IF( .NOT.LSAME( COMPZ, 'N' ) .AND. .NOT.WANTZ ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -4 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -5 ELSEIF( DESCZ( CTXT_ ).NE.DESCH( CTXT_ ) ) THEN INFO = -( 1000+CTXT_ ) ELSEIF( DESCH( MB_ ).NE.DESCH( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSEIF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1000+NB_ ) ELSEIF( DESCH( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1000+MB_ ) ELSEIF( DESCH( MB_ ).LT.6 ) THEN INFO = -( 700+NB_ ) ELSEIF( DESCZ( MB_ ).LT.6 ) THEN INFO = -( 1000+MB_ ) ELSE CALL CHK1MAT( N, 3, N, 3, 1, 1, DESCH, 7, INFO ) IF( INFO.EQ.0 ) $ CALL CHK1MAT( N, 3, N, 3, 1, 1, DESCZ, 11, INFO ) IF( INFO.EQ.0 ) $ CALL PCHK2MAT( N, 3, N, 3, 1, 1, DESCH, 7, N, 3, N, 3, $ 1, 1, DESCZ, 11, 0, IWORK, IWORK, INFO ) END IF END IF * * Compute required workspace. * CALL PDLAQR1( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILO, IHI, Z, DESCZ, WORK, -1, IWORK, -1, INFO ) LWKOPT = WORK(1) LIWKOPT = IWORK(1) CALL PDLAQR0( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILO, IHI, Z, DESCZ, WORK, -1, IWORK, -1, INFO, 0 ) IF( N.LT.NL ) THEN HROWS = NUMROC( NL, NB, MYROW, DESCH(RSRC_), NPROW ) HCOLS = NUMROC( NL, NB, MYCOL, DESCH(CSRC_), NPCOL ) WORK(1) = WORK(1) + DBLE(2*HROWS*HCOLS) END IF LWKOPT = MAX( LWKOPT, WORK(1) ) LIWKOPT = MAX( LIWKOPT, IWORK(1) ) WORK(1) = LWKOPT IWORK(1) = LIWKOPT * IF( .NOT.LQUERY .AND. LWORK.LT.INT(LWKOPT) ) THEN INFO = -13 ELSEIF( .NOT.LQUERY .AND. LIWORK.LT.LIWKOPT ) THEN INFO = -15 END IF * IF( INFO.NE.0 ) THEN * * Quick return in case of invalid argument. * CALL PXERBLA( 'PDHSEQR', -INFO ) RETURN * ELSE IF( N.EQ.0 ) THEN * * Quick return in case N = 0; nothing to do. * RETURN * ELSE IF( LQUERY ) THEN * * Quick return in case of a workspace query. * RETURN * ELSE * * Copy eigenvalues isolated by PDGEBAL. * DO 10 I = 1, ILO - 1 CALL INFOG2L( I, I, DESCH, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, HRSRC, HCSRC ) IF( MYROW.EQ.HRSRC .AND. MYCOL.EQ.HCSRC ) THEN WR( I ) = H( (JJ-1)*LLDH + II ) ELSE WR( I ) = ZERO END IF WI( I ) = ZERO 10 CONTINUE IF( ILO.GT.1 ) $ CALL DGSUM2D( ICTXT, 'All', '1-Tree', ILO-1, 1, WR, N, -1, $ -1 ) DO 20 I = IHI + 1, N CALL INFOG2L( I, I, DESCH, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, HRSRC, HCSRC ) IF( MYROW.EQ.HRSRC .AND. MYCOL.EQ.HCSRC ) THEN WR( I ) = H( (JJ-1)*LLDH + II ) ELSE WR( I ) = ZERO END IF WI( I ) = ZERO 20 CONTINUE IF( IHI.LT.N ) $ CALL DGSUM2D( ICTXT, 'All', '1-Tree', N-IHI, 1, WR(IHI+1), $ N, -1, -1 ) * * Initialize Z, if requested. * IF( INITZ ) $ CALL PDLASET( 'A', N, N, ZERO, ONE, Z, 1, 1, DESCZ ) * * Quick return if possible. * NPROCS = NPROW*NPCOL IF( ILO.EQ.IHI ) THEN CALL INFOG2L( ILO, ILO, DESCH, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, HRSRC, HCSRC ) IF( MYROW.EQ.HRSRC .AND. MYCOL.EQ.HCSRC ) THEN WR( ILO ) = H( (JJ-1)*LLDH + II ) IF( NPROCS.GT.1 ) $ CALL DGEBS2D( ICTXT, 'All', '1-Tree', 1, 1, WR(ILO), $ 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', '1-Tree', 1, 1, WR(ILO), $ 1, HRSRC, HCSRC ) END IF WI( ILO ) = ZERO RETURN END IF * * PDLAQR1/PDLAQR0 crossover point. * NH = IHI-ILO+1 NMIN = PILAENVX( ICTXT, 12, 'PDHSEQR', $ JOB( : 1 ) // COMPZ( : 1 ), N, ILO, IHI, LWORK ) NMIN = MAX( NTINY, NMIN ) * * PDLAQR0 for big matrices; PDLAQR1 for small ones. * IF( (.NOT. CRSOVER .AND. NH.GT.NTINY) .OR. NH.GT.NMIN .OR. $ DESCH(RSRC_).NE.0 .OR. DESCH(CSRC_).NE.0 ) THEN CALL PDLAQR0( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILO, IHI, Z, DESCZ, WORK, LWORK, IWORK, LIWORK, INFO, $ 0 ) IF( INFO.GT.0 .AND. ( DESCH(RSRC_).NE.0 .OR. $ DESCH(CSRC_).NE.0 ) ) THEN * * A rare PDLAQR0 failure! PDLAQR1 sometimes succeeds * when PDLAQR0 fails. * KBOT = INFO CALL PDLAQR1( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, $ WI, ILO, IHI, Z, DESCZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) INFO = -7777 END IF ELSE * * Small matrix. * CALL PDLAQR1( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILO, IHI, Z, DESCZ, WORK, LWORK, IWORK, LIWORK, INFO ) * IF( INFO.GT.0 ) THEN * * A rare PDLAQR1 failure! PDLAQR0 sometimes succeeds * when PDLAQR1 fails. * KBOT = INFO * IF( N.GE.NL ) THEN * * Larger matrices have enough subdiagonal scratch * space to call PDLAQR0 directly. * CALL PDLAQR0( WANTT, WANTZ, N, ILO, KBOT, H, DESCH, $ WR, WI, ILO, IHI, Z, DESCZ, WORK, LWORK, $ IWORK, LIWORK, INFO, 0 ) ELSE * * Tiny matrices don't have enough subdiagonal * scratch space to benefit from PDLAQR0. Hence, * tiny matrices must be copied into a larger * array before calling PDLAQR0. * HROWS = NUMROC( NL, NB, MYROW, DESCH(RSRC_), NPROW ) HCOLS = NUMROC( NL, NB, MYCOL, DESCH(CSRC_), NPCOL ) CALL DESCINIT( DESCH2, NL, NL, NB, NB, DESCH(RSRC_), $ DESCH(CSRC_), ICTXT, MAX(1, HROWS), INFO ) CALL PDLACPY( 'All', N, N, H, 1, 1, DESCH, WORK, 1, $ 1, DESCH2 ) CALL PDELSET( WORK, N+1, N, DESCH2, ZERO ) CALL PDLASET( 'All', NL, NL-N, ZERO, ZERO, WORK, 1, $ N+1, DESCH2 ) IPW = 1 + DESCH2(LLD_)*HCOLS CALL PDLAQR0( WANTT, WANTZ, NL, ILO, KBOT, WORK, $ DESCH2, WR, WI, ILO, IHI, Z, DESCZ, $ WORK(IPW), LWORK-IPW+1, IWORK, $ LIWORK, INFO, 0 ) IF( WANTT .OR. INFO.NE.0 ) $ CALL PDLACPY( 'All', N, N, WORK, 1, 1, DESCH2, $ H, 1, 1, DESCH ) END IF INFO = -8888 END IF END IF * * Clear out the trash, if necessary. * IF( ( WANTT .OR. INFO.NE.0 ) .AND. N.GT.2 ) $ CALL PDLASET( 'L', N-2, N-2, ZERO, ZERO, H, 3, 1, DESCH ) * * Force any 2-by-2 blocks to be complex conjugate pairs of * eigenvalues by removing false such blocks. * DO 30 I = ILO, IHI-1 CALL PDELGET( 'All', ' ', TMP3, H, I+1, I, DESCH ) IF( TMP3.NE.0.0D+00 ) THEN CALL PDELGET( 'All', ' ', TMP1, H, I, I, DESCH ) CALL PDELGET( 'All', ' ', TMP2, H, I, I+1, DESCH ) CALL PDELGET( 'All', ' ', TMP4, H, I+1, I+1, DESCH ) CALL DLANV2( TMP1, TMP2, TMP3, TMP4, DUM1, DUM2, DUM3, $ DUM4, CS, SN ) IF( TMP3.EQ.0.0D+00 ) THEN IF( WANTT ) THEN IF( I+2.LE.N ) $ CALL PDROT( N-I-1, H, I, I+2, DESCH, $ DESCH(M_), H, I+1, I+2, DESCH, DESCH(M_), $ CS, SN, WORK, LWORK, INFO ) CALL PDROT( I-1, H, 1, I, DESCH, 1, H, 1, I+1, $ DESCH, 1, CS, SN, WORK, LWORK, INFO ) END IF IF( WANTZ ) THEN CALL PDROT( N, Z, 1, I, DESCZ, 1, Z, 1, I+1, DESCZ, $ 1, CS, SN, WORK, LWORK, INFO ) END IF CALL PDELSET( H, I, I, DESCH, TMP1 ) CALL PDELSET( H, I, I+1, DESCH, TMP2 ) CALL PDELSET( H, I+1, I, DESCH, TMP3 ) CALL PDELSET( H, I+1, I+1, DESCH, TMP4 ) END IF END IF 30 CONTINUE * * Read out eigenvalues: first let all the processes compute the * eigenvalue inside their diagonal blocks in parallel, except for * the eigenvalue located next to a block border. After that, * compute all eigenvalues located next to the block borders. * Finally, do a global summation over WR and WI so that all * processors receive the result. * DO 40 K = ILO, IHI WR( K ) = ZERO WI( K ) = ZERO 40 CONTINUE NB = DESCH( MB_ ) * * Loop 50: extract eigenvalues from the blocks which are not laid * out across a border of the processor mesh, except for those 1x1 * blocks on the border. * PAIR = .FALSE. DO 50 K = ILO, IHI IF( .NOT. PAIR ) THEN BORDER = MOD( K, NB ).EQ.0 .OR. ( K.NE.1 .AND. $ MOD( K, NB ).EQ.1 ) IF( .NOT. BORDER ) THEN CALL INFOG2L( K, K, DESCH, NPROW, NPCOL, MYROW, $ MYCOL, ILOC1, JLOC1, HRSRC1, HCSRC1 ) IF( MYROW.EQ.HRSRC1 .AND. MYCOL.EQ.HCSRC1 ) THEN ELEM1 = H((JLOC1-1)*LLDH+ILOC1) IF( K.LT.N ) THEN ELEM3 = H((JLOC1-1)*LLDH+ILOC1+1) ELSE ELEM3 = ZERO END IF IF( ELEM3.NE.ZERO ) THEN ELEM2 = H((JLOC1)*LLDH+ILOC1) ELEM4 = H((JLOC1)*LLDH+ILOC1+1) CALL DLANV2( ELEM1, ELEM2, ELEM3, ELEM4, $ WR( K ), WI( K ), WR( K+1 ), WI( K+1 ), $ SN, CS ) PAIR = .TRUE. ELSE IF( K.GT.1 ) THEN TMP = H((JLOC1-2)*LLDH+ILOC1) IF( TMP.NE.ZERO ) THEN ELEM1 = H((JLOC1-2)*LLDH+ILOC1-1) ELEM2 = H((JLOC1-1)*LLDH+ILOC1-1) ELEM3 = H((JLOC1-2)*LLDH+ILOC1) ELEM4 = H((JLOC1-1)*LLDH+ILOC1) CALL DLANV2( ELEM1, ELEM2, ELEM3, $ ELEM4, WR( K-1 ), WI( K-1 ), $ WR( K ), WI( K ), SN, CS ) ELSE WR( K ) = ELEM1 END IF ELSE WR( K ) = ELEM1 END IF END IF END IF END IF ELSE PAIR = .FALSE. END IF 50 CONTINUE * * Loop 60: extract eigenvalues from the blocks which are laid * out across a border of the processor mesh. The processors are * numbered as below: * * 1 | 2 * --+-- * 3 | 4 * DO 60 K = ICEIL(ILO,NB)*NB, IHI-1, NB CALL INFOG2L( K, K, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC1, HRSRC1, HCSRC1 ) CALL INFOG2L( K, K+1, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC2, JLOC2, HRSRC2, HCSRC2 ) CALL INFOG2L( K+1, K, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC3, JLOC3, HRSRC3, HCSRC3 ) CALL INFOG2L( K+1, K+1, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC4, HRSRC4, HCSRC4 ) IF( MYROW.EQ.HRSRC2 .AND. MYCOL.EQ.HCSRC2 ) THEN ELEM2 = H((JLOC2-1)*LLDH+ILOC2) IF( HRSRC1.NE.HRSRC2 .OR. HCSRC1.NE.HCSRC2 ) $ CALL DGESD2D( ICTXT, 1, 1, ELEM2, 1, HRSRC1, HCSRC1) END IF IF( MYROW.EQ.HRSRC3 .AND. MYCOL.EQ.HCSRC3 ) THEN ELEM3 = H((JLOC3-1)*LLDH+ILOC3) IF( HRSRC1.NE.HRSRC3 .OR. HCSRC1.NE.HCSRC3 ) $ CALL DGESD2D( ICTXT, 1, 1, ELEM3, 1, HRSRC1, HCSRC1) END IF IF( MYROW.EQ.HRSRC4 .AND. MYCOL.EQ.HCSRC4 ) THEN WORK(1) = H((JLOC4-1)*LLDH+ILOC4) IF( K+1.LT.N ) THEN WORK(2) = H((JLOC4-1)*LLDH+ILOC4+1) ELSE WORK(2) = ZERO END IF IF( HRSRC1.NE.HRSRC4 .OR. HCSRC1.NE.HCSRC4 ) $ CALL DGESD2D( ICTXT, 2, 1, WORK, 2, HRSRC1, HCSRC1 ) END IF IF( MYROW.EQ.HRSRC1 .AND. MYCOL.EQ.HCSRC1 ) THEN ELEM1 = H((JLOC1-1)*LLDH+ILOC1) IF( HRSRC1.NE.HRSRC2 .OR. HCSRC1.NE.HCSRC2 ) $ CALL DGERV2D( ICTXT, 1, 1, ELEM2, 1, HRSRC2, HCSRC2) IF( HRSRC1.NE.HRSRC3 .OR. HCSRC1.NE.HCSRC3 ) $ CALL DGERV2D( ICTXT, 1, 1, ELEM3, 1, HRSRC3, HCSRC3) IF( HRSRC1.NE.HRSRC4 .OR. HCSRC1.NE.HCSRC4 ) $ CALL DGERV2D( ICTXT, 2, 1, WORK, 2, HRSRC4, HCSRC4 ) ELEM4 = WORK(1) ELEM5 = WORK(2) IF( ELEM5.EQ.ZERO ) THEN IF( WR( K ).EQ.ZERO .AND. WI( K ).EQ.ZERO ) THEN CALL DLANV2( ELEM1, ELEM2, ELEM3, ELEM4, WR( K ), $ WI( K ), WR( K+1 ), WI( K+1 ), SN, CS ) ELSEIF( WR( K+1 ).EQ.ZERO .AND. WI( K+1 ).EQ.ZERO ) $ THEN WR( K+1 ) = ELEM4 END IF ELSEIF( WR( K ).EQ.ZERO .AND. WI( K ).EQ.ZERO ) $ THEN WR( K ) = ELEM1 END IF END IF 60 CONTINUE * IF( NPROCS.GT.1 ) THEN CALL DGSUM2D( ICTXT, 'All', ' ', IHI-ILO+1, 1, WR(ILO), N, $ -1, -1 ) CALL DGSUM2D( ICTXT, 'All', ' ', IHI-ILO+1, 1, WI(ILO), N, $ -1, -1 ) END IF * END IF * WORK(1) = LWKOPT IWORK(1) = LIWKOPT RETURN * * End of PDHSEQR * END scalapack-2.0.2/SRC/pdlabad.f000644 000766 000024 00000005013 11622500733 016074 0ustar00juliestaff000000 000000 SUBROUTINE PDLABAD( ICTXT, SMALL, LARGE ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT DOUBLE PRECISION LARGE, SMALL * .. * * Purpose * ======= * * PDLABAD takes as input the values computed by PDLAMCH for underflow * and overflow, and returns the square root of each of these values if * the log of LARGE is sufficiently large. This subroutine is intended * to identify machines with a large exponent range, such as the Crays, * and redefine the underflow and overflow limits to be the square roots * of the values computed by PDLAMCH. This subroutine is needed because * PDLAMCH does not compensate for poor arithmetic in the upper half of * the exponent range, as is found on a Cray. * * In addition, this routine performs a global minimization and maximi- * zation on these values, to support heterogeneous computing networks. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle in which the computation takes * place. * * SMALL (local input/local output) DOUBLE PRECISION * On entry, the underflow threshold as computed by PDLAMCH. * On exit, if LOG10(LARGE) is sufficiently large, the square * root of SMALL, otherwise unchanged. * * LARGE (local input/local output) DOUBLE PRECISION * On entry, the overflow threshold as computed by PDLAMCH. * On exit, if LOG10(LARGE) is sufficiently large, the square * root of LARGE, otherwise unchanged. * * ===================================================================== * * .. Local Scalars .. INTEGER IDUMM * .. * .. External Subroutines .. EXTERNAL DGAMN2D, DGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC LOG10, SQRT * .. * .. Executable Statements .. * * If it looks like we're on a Cray, take the square root of * SMALL and LARGE to avoid overflow and underflow problems. * IF( LOG10( LARGE ).GT.2000.D0 ) THEN SMALL = SQRT( SMALL ) LARGE = SQRT( LARGE ) END IF IDUMM = 0 * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, SMALL, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) CALL DGAMN2D( ICTXT, 'All', ' ', 1, 1, LARGE, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) * RETURN * * End of PDLABAD * END scalapack-2.0.2/SRC/pdlabrd.f000644 000766 000024 00000053001 10363532303 016114 0ustar00juliestaff000000 000000 SUBROUTINE PDLABRD( M, N, NB, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ X, IX, JX, DESCX, Y, IY, JY, DESCY, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IX, IY, JA, JX, JY, M, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAUP( * ), $ TAUQ( * ), X( * ), Y( * ), WORK( * ) * .. * * Purpose * ======= * * PDLABRD reduces the first NB rows and columns of a real general * M-by-N distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper * or lower bidiagonal form by an orthogonal transformation Q' * A * P, * and returns the matrices X and Y which are needed to apply the * transformation to the unreduced part of sub( A ). * * If M >= N, sub( A ) is reduced to upper bidiagonal form; if M < N, to * lower bidiagonal form. * * This is an auxiliary routine called by PDGEBRD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NB (global input) INTEGER * The number of leading rows and columns of sub( A ) to be * reduced. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ) to be reduced. On exit, * the first NB rows and columns of the matrix are overwritten; * the rest of the distributed matrix sub( A ) is unchanged. * If m >= n, elements on and below the diagonal in the first NB * columns, with the array TAUQ, represent the orthogonal * matrix Q as a product of elementary reflectors; and * elements above the diagonal in the first NB rows, with the * array TAUP, represent the orthogonal matrix P as a product * of elementary reflectors. * If m < n, elements below the diagonal in the first NB * columns, with the array TAUQ, represent the orthogonal * matrix Q as a product of elementary reflectors, and * elements on and above the diagonal in the first NB rows, * with the array TAUP, represent the orthogonal matrix P as * a product of elementary reflectors. * See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(ia+i-1,ja+i-1). D is tied to the distributed * matrix A. * * E (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(ia+i-1,ja+i) for i = 1,2,...,n-1; * if m < n, E(i) = A(ia+i,ja+i-1) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) DOUBLE PRECISION array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix Q. TAUQ * is tied to the distributed matrix A. See Further Details. * * TAUP (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix P. TAUP * is tied to the distributed matrix A. See Further Details. * * X (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_X,NB). On exit, the local * pieces of the distributed M-by-NB matrix * X(IX:IX+M-1,JX:JX+NB-1) required to update the unreduced * part of sub( A ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * Y (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_Y,NB). On exit, the local * pieces of the distributed N-by-NB matrix * Y(IY:IY+N-1,JY:JY+NB-1) required to update the unreduced * part of sub( A ). * * IY (global input) INTEGER * The row index in the global array Y indicating the first * row of sub( Y ). * * JY (global input) INTEGER * The column index in the global array Y indicating the * first column of sub( Y ). * * DESCY (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Y. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK >= NB_A + NQ, with * * NQ = NUMROC( N+MOD( IA-1, NB_Y ), NB_Y, MYCOL, IACOL, NPCOL ) * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors. * * If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in * A(ia+i-1:ia+m-1,ja+i-1); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is * stored on exit in A(ia+i-1,ja+i:ja+n-1); tauq is stored in * TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); u(1:i-1) = 0, u(i) = 1, and u(i:n) is * stored on exit in A(ia+i-1,ja+i:ja+n-1); tauq is stored in * TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The elements of the vectors v and u together form the m-by-nb matrix * V and the nb-by-n matrix U' which are needed, with X and Y, to apply * the transformation to the unreduced part of the matrix, using a block * update of the form: sub( A ) := sub( A ) - V*Y' - X*U'. * * The contents of sub( A ) on exit are illustrated by the following * examples with nb = 2: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) * ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) * ( v1 v2 a a a ) ( v1 1 a a a a ) * ( v1 v2 a a a ) ( v1 v2 a a a a ) * ( v1 v2 a a a ) ( v1 v2 a a a a ) * ( v1 v2 a a a ) * * where a denotes an element of the original matrix which is unchanged, * vi denotes an element of the vector defining H(i), and ui an element * of the vector defining G(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, IPY, IW, J, JJ, $ JWY, K, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ALPHA, TAU INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), $ DESCTP( DLEN_ ), DESCTQ( DLEN_ ), $ DESCW( DLEN_ ), DESCWY( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PDCOPY, $ PDELGET, PDELSET, PDGEMV, PDLARFG, $ PDSCAL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) IPY = DESCA( MB_ ) + 1 IW = MOD( IA-1, DESCA( NB_ ) ) + 1 ALPHA = ZERO * CALL DESCSET( DESCWY, 1, N+MOD( IA-1, DESCY( NB_ ) ), 1, $ DESCA( NB_ ), IAROW, IACOL, ICTXT, 1 ) CALL DESCSET( DESCW, DESCA( MB_ ), 1, DESCA( MB_ ), 1, IAROW, $ IACOL, ICTXT, DESCA( MB_ ) ) CALL DESCSET( DESCTQ, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), IAROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCTP, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), IACOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) * IF( M.GE.N ) THEN * * Reduce to upper bidiagonal form * CALL DESCSET( DESCD, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) DO 10 K = 1, NB I = IA + K - 1 J = JA + K - 1 JWY = IW + K * * Update A(i:ia+m-1,j) * IF( K.GT.1 ) THEN CALL PDGEMV( 'No transpose', M-K+1, K-1, -ONE, A, I, JA, $ DESCA, Y, IY, JY+K-1, DESCY, 1, ONE, A, I, $ J, DESCA, 1 ) CALL PDGEMV( 'No transpose', M-K+1, K-1, -ONE, X, IX+K-1, $ JX, DESCX, A, IA, J, DESCA, 1, ONE, A, I, J, $ DESCA, 1 ) CALL PDELSET( A, I-1, J, DESCA, ALPHA ) END IF * * Generate reflection Q(i) to annihilate A(i+1:ia+m-1,j) * CALL PDLARFG( M-K+1, ALPHA, I, J, A, I+1, J, DESCA, 1, $ TAUQ ) CALL PDELSET( D, 1, J, DESCD, ALPHA ) CALL PDELSET( A, I, J, DESCA, ONE ) * * Compute Y(IA+I:IA+N-1,J) * CALL PDGEMV( 'Transpose', M-K+1, N-K, ONE, A, I, J+1, DESCA, $ A, I, J, DESCA, 1, ZERO, WORK( IPY ), 1, JWY, $ DESCWY, DESCWY( M_ ) ) CALL PDGEMV( 'Transpose', M-K+1, K-1, ONE, A, I, JA, DESCA, $ A, I, J, DESCA, 1, ZERO, WORK, IW, 1, DESCW, $ 1 ) CALL PDGEMV( 'Transpose', K-1, N-K, -ONE, Y, IY, JY+K, $ DESCY, WORK, IW, 1, DESCW, 1, ONE, WORK( IPY ), $ 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PDGEMV( 'Transpose', M-K+1, K-1, ONE, X, IX+K-1, JX, $ DESCX, A, I, J, DESCA, 1, ZERO, WORK, IW, 1, $ DESCW, 1 ) CALL PDGEMV( 'Transpose', K-1, N-K, -ONE, A, IA, J+1, DESCA, $ WORK, IW, 1, DESCW, 1, ONE, WORK( IPY ), 1, $ JWY, DESCWY, DESCWY( M_ ) ) * CALL PDELGET( 'Rowwise', ' ', TAU, TAUQ, 1, J, DESCTQ ) CALL PDSCAL( N-K, TAU, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PDCOPY( N-K, WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ), $ Y, IY+K-1, JY+K, DESCY, DESCY( M_ ) ) * * Update A(i,j+1:ja+n-1) * CALL PDGEMV( 'Transpose', K, N-K, -ONE, Y, IY, JY+K, DESCY, $ A, I, JA, DESCA, DESCA( M_ ), ONE, A, I, J+1, $ DESCA, DESCA( M_ ) ) CALL PDGEMV( 'Transpose', K-1, N-K, -ONE, A, IA, J+1, DESCA, $ X, IX+K-1, JX, DESCX, DESCX( M_ ), ONE, A, I, $ J+1, DESCA, DESCA( M_ ) ) CALL PDELSET( A, I, J, DESCA, ALPHA ) * * Generate reflection P(i) to annihilate A(i,j+2:ja+n-1) * CALL PDLARFG( N-K, ALPHA, I, J+1, A, I, $ MIN( J+2, N+JA-1 ), DESCA, DESCA( M_ ), TAUP ) CALL PDELSET( E, I, 1, DESCE, ALPHA ) CALL PDELSET( A, I, J+1, DESCA, ONE ) * * Compute X(I+1:IA+M-1,J) * CALL PDGEMV( 'No transpose', M-K, N-K, ONE, A, I+1, J+1, $ DESCA, A, I, J+1, DESCA, DESCA( M_ ), ZERO, X, $ IX+K, JX+K-1, DESCX, 1 ) CALL PDGEMV( 'No transpose', K, N-K, ONE, Y, IY, JY+K, $ DESCY, A, I, J+1, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PDGEMV( 'No transpose', M-K, K, -ONE, A, I+1, JA, $ DESCA, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) CALL PDGEMV( 'No transpose', K-1, N-K, ONE, A, IA, J+1, $ DESCA, A, I, J+1, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PDGEMV( 'No transpose', M-K, K-1, -ONE, X, IX+K, JX, $ DESCX, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) * CALL PDELGET( 'Columnwise', ' ', TAU, TAUP, I, 1, DESCTP ) CALL PDSCAL( M-K, TAU, X, IX+K, JX+K-1, DESCX, 1 ) 10 CONTINUE * ELSE * * Reduce to lower bidiagonal form * CALL DESCSET( DESCD, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) DO 20 K = 1, NB I = IA + K - 1 J = JA + K - 1 JWY = IW + K * * Update A(i,j:ja+n-1) * IF( K.GT.1 ) THEN CALL PDGEMV( 'Transpose', K-1, N-K+1, -ONE, Y, IY, $ JY+K-1, DESCY, A, I, JA, DESCA, DESCA( M_ ), $ ONE, A, I, J, DESCA, DESCA( M_ ) ) CALL PDGEMV( 'Transpose', K-1, N-K+1, -ONE, A, IA, J, $ DESCA, X, IX+K-1, JX, DESCX, DESCX( M_ ), $ ONE, A, I, J, DESCA, DESCA( M_ ) ) CALL PDELSET( A, I, J-1, DESCA, ALPHA ) END IF * * Generate reflection P(i) to annihilate A(i,j+1:ja+n-1) * CALL PDLARFG( N-K+1, ALPHA, I, J, A, I, J+1, DESCA, $ DESCA( M_ ), TAUP ) CALL PDELSET( D, I, 1, DESCD, ALPHA ) CALL PDELSET( A, I, J, DESCA, ONE ) * * Compute X(i+1:ia+m-1,j) * CALL PDGEMV( 'No transpose', M-K, N-K+1, ONE, A, I+1, J, $ DESCA, A, I, J, DESCA, DESCA( M_ ), ZERO, X, $ IX+K, JX+K-1, DESCX, 1 ) CALL PDGEMV( 'No transpose', K-1, N-K+1, ONE, Y, IY, JY+K-1, $ DESCY, A, I, J, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PDGEMV( 'No transpose', M-K, K-1, -ONE, A, I+1, JA, $ DESCA, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) CALL PDGEMV( 'No transpose', K-1, N-K+1, ONE, A, IA, J, $ DESCA, A, I, J, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PDGEMV( 'No transpose', M-K, K-1, -ONE, X, IX+K, JX, $ DESCX, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) * CALL PDELGET( 'Columnwise', ' ', TAU, TAUP, I, 1, DESCTP ) CALL PDSCAL( M-K, TAU, X, IX+K, JX+K-1, DESCX, 1 ) * * Update A(i+1:ia+m-1,j) * CALL PDGEMV( 'No transpose', M-K, K-1, -ONE, A, I+1, JA, $ DESCA, Y, IY, JY+K-1, DESCY, 1, ONE, A, I+1, J, $ DESCA, 1 ) CALL PDGEMV( 'No transpose', M-K, K, -ONE, X, IX+K, JX, $ DESCX, A, IA, J, DESCA, 1, ONE, A, I+1, J, $ DESCA, 1 ) CALL PDELSET( A, I, J, DESCA, ALPHA ) * * Generate reflection Q(i) to annihilate A(i+2:ia+m-1,j) * CALL PDLARFG( M-K, ALPHA, I+1, J, A, MIN( I+2, M+IA-1 ), $ J, DESCA, 1, TAUQ ) CALL PDELSET( E, 1, J, DESCE, ALPHA ) CALL PDELSET( A, I+1, J, DESCA, ONE ) * * Compute Y(ia+i:ia+n-1,j) * CALL PDGEMV( 'Transpose', M-K, N-K, ONE, A, I+1, J+1, DESCA, $ A, I+1, J, DESCA, 1, ZERO, WORK( IPY ), 1, $ JWY, DESCWY, DESCWY( M_ ) ) CALL PDGEMV( 'Transpose', M-K, K-1, ONE, A, I+1, JA, DESCA, $ A, I+1, J, DESCA, 1, ZERO, WORK, IW, 1, DESCW, $ 1 ) CALL PDGEMV( 'Transpose', K-1, N-K, -ONE, Y, IY, JY+K, $ DESCY, WORK, IW, 1, DESCW, 1, ONE, WORK( IPY ), $ 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PDGEMV( 'Transpose', M-K, K, ONE, X, IX+K, JX, DESCX, $ A, I+1, J, DESCA, 1, ZERO, WORK, IW, 1, DESCW, $ 1 ) CALL PDGEMV( 'Transpose', K, N-K, -ONE, A, IA, J+1, DESCA, $ WORK, IW, 1, DESCW, 1, ONE, WORK( IPY ), 1, $ JWY, DESCWY, DESCWY( M_ ) ) * CALL PDELGET( 'Rowwise', ' ', TAU, TAUQ, 1, J, DESCTQ ) CALL PDSCAL( N-K, TAU, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PDCOPY( N-K, WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ), $ Y, IY+K-1, JY+K, DESCY, DESCY( M_ ) ) 20 CONTINUE END IF * RETURN * * End of PDLABRD * END scalapack-2.0.2/SRC/pdlacon.f000644 000766 000024 00000032331 10363532303 016127 0ustar00juliestaff000000 000000 SUBROUTINE PDLACON( N, V, IV, JV, DESCV, X, IX, JX, DESCX, ISGN, $ EST, KASE ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IV, IX, JV, JX, KASE, N DOUBLE PRECISION EST * .. * .. Array Arguments .. INTEGER DESCV( * ), DESCX( * ), ISGN( * ) DOUBLE PRECISION V( * ), X( * ) * .. * * Purpose * ======= * * PDLACON estimates the 1-norm of a square, real distributed matrix A. * Reverse communication is used for evaluating matrix-vector products. * X and V are aligned with the distributed matrix A, this information * is implicitly contained within IV, IX, DESCV, and DESCX. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vectors V and X. N >= 0. * * V (local workspace) DOUBLE PRECISION pointer into the local * memory to an array of dimension LOCr(N+MOD(IV-1,MB_V)). On * the final return, V = A*W, where EST = norm(V)/norm(W) * (W is not returned). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * X (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension * LOCr(N+MOD(IX-1,MB_X)). On an intermediate return, X * should be overwritten by * A * X, if KASE=1, * A' * X, if KASE=2, * PDLACON must be re-called with all the other parameters * unchanged. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * ISGN (local workspace) INTEGER array, dimension * LOCr(N+MOD(IX-1,MB_X)). ISGN is aligned with X and V. * * * EST (global output) DOUBLE PRECISION * An estimate (a lower bound) for norm(A). * * KASE (local input/local output) INTEGER * On the initial call to PDLACON, KASE should be 0. * On an intermediate return, KASE will be 1 or 2, indicating * whether X should be overwritten by A * X or A' * X. * On the final return from PDLACON, KASE will again be 0. * * Further Details * =============== * * The serial version DLACON has been contributed by Nick Higham, * University of Manchester. It was originally named SONEST, dated * March 16, 1988. * * Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of * a real or complex matrix, with applications to condition estimation", * ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) DOUBLE PRECISION ZERO, ONE, TWO PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, IFLAG, IIVX, IMAXROW, IOFFVX, IROFF, $ ITER, IVXCOL, IVXROW, J, JLAST, JJVX, JUMP, $ K, MYCOL, MYROW, NP, NPCOL, NPROW DOUBLE PRECISION ALTSGN, ESTOLD, JLMAX, TEMP, XMAX * .. * .. Local Arrays .. DOUBLE PRECISION WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DGEBR2D, DGEBS2D, $ IGSUM2D, INFOG2L, PDAMAX, PDASUM, $ PDELGET * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, INDXL2G, NUMROC EXTERNAL INDXG2L, INDXG2P, INDXL2G, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MOD, NINT, SIGN * .. * .. Save statement .. SAVE * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIVX, JJVX, IVXROW, IVXCOL ) IF( MYCOL.NE.IVXCOL ) $ RETURN IROFF = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IVXROW, NPROW ) IF( MYROW.EQ.IVXROW ) $ NP = NP - IROFF IOFFVX = IIVX + (JJVX-1)*DESCX( LLD_ ) * IF( KASE.EQ.0 ) THEN DO 10 I = IOFFVX, IOFFVX+NP-1 X( I ) = ONE / DBLE( N ) 10 CONTINUE KASE = 1 JUMP = 1 RETURN END IF * GO TO ( 20, 40, 70, 110, 140 )JUMP * * ................ ENTRY (JUMP = 1) * FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X * 20 CONTINUE IF( N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN V( IOFFVX ) = X( IOFFVX ) EST = ABS( V( IOFFVX ) ) CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF * ... QUIT GO TO 150 END IF CALL PDASUM( N, EST, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF END IF * DO 30 I = IOFFVX, IOFFVX+NP-1 X( I ) = SIGN( ONE, X( I ) ) ISGN( I ) = NINT( X( I ) ) 30 CONTINUE KASE = 2 JUMP = 2 RETURN * * ................ ENTRY (JUMP = 2) * FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X * 40 CONTINUE CALL PDAMAX( N, XMAX, J, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN WORK( 1 ) = XMAX WORK( 2 ) = DBLE( J ) CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2, $ IVXROW, MYCOL ) XMAX = WORK( 1 ) J = NINT( WORK( 2 ) ) END IF END IF ITER = 2 * * MAIN LOOP - ITERATIONS 2, 3,...,ITMAX * 50 CONTINUE DO 60 I = IOFFVX, IOFFVX+NP-1 X( I ) = ZERO 60 CONTINUE IMAXROW = INDXG2P( J, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) IF( MYROW.EQ.IMAXROW ) THEN I = INDXG2L( J, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) X( I ) = ONE END IF KASE = 1 JUMP = 3 RETURN * * ................ ENTRY (JUMP = 3) * X HAS BEEN OVERWRITTEN BY A*X * 70 CONTINUE CALL DCOPY( NP, X( IOFFVX ), 1, V( IOFFVX ), 1 ) ESTOLD = EST CALL PDASUM( N, EST, V, IV, JV, DESCV, 1 ) IF( DESCV( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF END IF IFLAG = 0 DO 80 I = IOFFVX, IOFFVX+NP-1 IF( NINT( SIGN( ONE, X( I ) ) ).NE.ISGN( I ) ) THEN IFLAG = 1 GO TO 90 END IF 80 CONTINUE * 90 CONTINUE CALL IGSUM2D( ICTXT, 'C', ' ', 1, 1, IFLAG, 1, -1, MYCOL ) * * REPEATED SIGN VECTOR DETECTED, HENCE ALGORITHM HAS CONVERGED. * ALONG WITH IT, TEST FOR CYCLING. * IF( IFLAG.EQ.0 .OR. EST.LE.ESTOLD ) $ GO TO 120 * DO 100 I = IOFFVX, IOFFVX+NP-1 X( I ) = SIGN( ONE, X( I ) ) ISGN( I ) = NINT( X( I ) ) 100 CONTINUE KASE = 2 JUMP = 4 RETURN * * ................ ENTRY (JUMP = 4) * X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X * 110 CONTINUE JLAST = J CALL PDAMAX( N, XMAX, J, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN WORK( 1 ) = XMAX WORK( 2 ) = DBLE( J ) CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2, $ IVXROW, MYCOL ) XMAX = WORK( 1 ) J = NINT( WORK( 2 ) ) END IF END IF CALL PDELGET( 'Columnwise', ' ', JLMAX, X, JLAST, JX, DESCX ) IF( ( JLMAX.NE.ABS( XMAX ) ).AND.( ITER.LT.ITMAX ) ) THEN ITER = ITER + 1 GO TO 50 END IF * * ITERATION COMPLETE. FINAL STAGE. * 120 CONTINUE DO 130 I = IOFFVX, IOFFVX+NP-1 K = INDXL2G( I-IOFFVX+IIVX, DESCX( MB_ ), MYROW, $ DESCX( RSRC_ ), NPROW )-IX+1 IF( MOD( K, 2 ).EQ.0 ) THEN ALTSGN = -ONE ELSE ALTSGN = ONE END IF X( I ) = ALTSGN*( ONE+DBLE( K-1 ) / DBLE( N-1 ) ) 130 CONTINUE KASE = 1 JUMP = 5 RETURN * * ................ ENTRY (JUMP = 5) * X HAS BEEN OVERWRITTEN BY A*X * 140 CONTINUE CALL PDASUM( N, TEMP, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TEMP, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TEMP, 1, $ IVXROW, MYCOL ) END IF END IF TEMP = TWO*( TEMP / DBLE( 3*N ) ) IF( TEMP.GT.EST ) THEN CALL DCOPY( NP, X( IOFFVX ), 1, V( IOFFVX ), 1 ) EST = TEMP END IF * 150 CONTINUE KASE = 0 * RETURN * * End of PDLACON * END scalapack-2.0.2/SRC/pdlaconsb.f000644 000766 000024 00000050470 10363532303 016460 0ustar00juliestaff000000 000000 SUBROUTINE PDLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, BUF, $ LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER I, L, LWORK, M DOUBLE PRECISION H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), BUF( * ) * .. * * Purpose * ======= * * PDLACONSB looks for two consecutive small subdiagonal elements by * seeing the effect of starting a double shift QR iteration * given by H44, H33, & H43H34 and see if this would make a * subdiagonal negligible. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) DOUBLE PRECISION array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * M (global output) INTEGER * On exit, this yields the starting location of the QR double * shift. This will satisfy: L <= M <= I-2. * * H44 * H33 * H43H34 (global input) DOUBLE PRECISION * These three values are for the double shift QR iteration. * * BUF (local output) DOUBLE PRECISION array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 7*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Logic: * ====== * * Two consecutive small subdiagonal elements will stall * convergence of a double shift if their product is small * relatively even if each is not very small. Thus it is * necessary to scan the "tridiagonal portion of the matrix." In * the LAPACK algorithm DLAHQR, a loop of M goes from I-2 down to * L and examines * H(m,m),H(m+1,m+1),H(m+1,m),H(m,m+1),H(m-1,m-1),H(m,m-1), and * H(m+2,m-1). Since these elements may be on separate * processors, the first major loop (10) goes over the tridiagonal * and has each node store whatever values of the 7 it has that * the node owning H(m,m) does not. This will occur on a border * and can happen in no more than 3 locations per block assuming * square blocks. There are 5 buffers that each node stores these * values: a buffer to send diagonally down and right, a buffer * to send up, a buffer to send left, a buffer to send diagonally * up and left and a buffer to send right. Each of these buffers * is actually stored in one buffer BUF where BUF(ISTR1+1) starts * the first buffer, BUF(ISTR2+1) starts the second, etc.. After * the values are stored, if there are any values that a node * needs, they will be sent and received. Then the next major * loop passes over the data and searches for two consecutive * small subdiagonals. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IBUF1, IBUF2, IBUF3, IBUF4, $ IBUF5, ICOL1, II, IRCV1, IRCV2, IRCV3, IRCV4, $ IRCV5, IROW1, ISRC, ISTR1, ISTR2, ISTR3, ISTR4, $ ISTR5, JJ, JSRC, LDA, LEFT, MODKM1, MYCOL, $ MYROW, NPCOL, NPROW, NUM, RIGHT, UP DOUBLE PRECISION H00, H10, H11, H12, H21, H22, H33S, H44S, S, $ TST1, ULP, V1, V2, V3 * .. * .. External Functions .. INTEGER ILCM DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGERV2D, DGESD2D, IGAMX2D, $ INFOG2L, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 starts at BUF(ISTR1+1) and will contain IBUF1 elements * BUFFER2 starts at BUF(ISTR2+1) and will contain IBUF2 elements * BUFFER3 starts at BUF(ISTR3+1) and will contain IBUF3 elements * BUFFER4 starts at BUF(ISTR4+1) and will contain IBUF4 elements * BUFFER5 starts at BUF(ISTR5+1) and will contain IBUF5 elements * ISTR1 = 0 ISTR2 = ( ( I-L-1 ) / HBL ) IF( ISTR2*HBL.LT.( I-L-1 ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.7*ISTR2 ) THEN CALL PXERBLA( CONTXT, 'PDLACONSB', 10 ) RETURN END IF ISTR3 = 3*ISTR2 ISTR4 = ISTR3 + ISTR2 ISTR5 = ISTR3 + ISTR3 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) * * Copy our relevant pieces of triadiagonal that we owe into * 5 buffers to send to whomever owns H(M,M) as M moves diagonally * up the tridiagonal * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 IRCV1 = 0 IRCV2 = 0 IRCV3 = 0 IRCV4 = 0 IRCV5 = 0 DO 10 M = I - 2, L, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M-1,M-1) and send it diagonal down * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( M-1, M-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M ,M-1) and send it right * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF5 = IBUF5 + 1 BUF( ISTR5+IBUF5 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+1,M) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( MYROW.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M ,M+1) and send it left * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF3 = IBUF3 + 1 BUF( ISTR3+IBUF3 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M+1,M+1) & H(M+2,M+1) and send it * diagonally up * IF( ( UP.NE.MYROW ) .OR. ( LEFT.NE.MYCOL ) ) THEN CALL INFOG2L( M+1, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF4 = IBUF4 + 2 BUF( ISTR4+IBUF4-1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) BUF( ISTR4+IBUF4 ) = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+2,M+1) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * Add up the receives * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( M.GT.L ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M-1,M-1) from diagonal up * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) .AND. ( M.GT.L ) ) $ THEN * * We must receive H(M ,M-1) from left * IRCV5 = IRCV5 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+1,M ) from down * IRCV2 = IRCV2 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPCOL.GT.1 ) ) THEN * * We must receive H(M ,M+1) from right * IRCV3 = IRCV3 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M+1:M+2,M+1) from diagonal down * IRCV4 = IRCV4 + 2 END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+2,M+1) from down * IRCV2 = IRCV2 + 1 END IF END IF * * Possibly change owners (occurs only when MOD(M-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * * Send data on to the appropriate node if there is any data to send * IF( IBUF1.GT.0 ) THEN CALL DGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL DGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, UP, $ MYCOL ) END IF IF( IBUF3.GT.0 ) THEN CALL DGESD2D( CONTXT, IBUF3, 1, BUF( ISTR3+1 ), IBUF3, MYROW, $ LEFT ) END IF IF( IBUF4.GT.0 ) THEN CALL DGESD2D( CONTXT, IBUF4, 1, BUF( ISTR4+1 ), IBUF4, UP, $ LEFT ) END IF IF( IBUF5.GT.0 ) THEN CALL DGESD2D( CONTXT, IBUF5, 1, BUF( ISTR5+1 ), IBUF5, MYROW, $ RIGHT ) END IF * * Receive appropriate data if there is any * IF( IRCV1.GT.0 ) THEN CALL DGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL DGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, DOWN, $ MYCOL ) END IF IF( IRCV3.GT.0 ) THEN CALL DGERV2D( CONTXT, IRCV3, 1, BUF( ISTR3+1 ), IRCV3, MYROW, $ RIGHT ) END IF IF( IRCV4.GT.0 ) THEN CALL DGERV2D( CONTXT, IRCV4, 1, BUF( ISTR4+1 ), IRCV4, DOWN, $ RIGHT ) END IF IF( IRCV5.GT.0 ) THEN CALL DGERV2D( CONTXT, IRCV5, 1, BUF( ISTR5+1 ), IRCV5, MYROW, $ LEFT ) END IF * * Start main loop * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. $ ( MODKM1.NE.HBL-1 ) ) THEN CALL INFOG2L( I-2, I-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) END IF * * Look for two consecutive small subdiagonal elements. * DO 20 M = I - 2, L, -1 * * Determine the effect of starting the double-shift QR * iteration at row M, and see if this would make H(M,M-1) * negligible. * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H00 = BUF( ISTR1+IBUF1 ) ELSE H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF5 = IBUF5 + 1 H10 = BUF( ISTR5+IBUF5 ) ELSE H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF END IF IF( MODKM1.EQ.HBL-1 ) THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) H11 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( NUM.GT.1 ) THEN IBUF4 = IBUF4 + 2 H22 = BUF( ISTR4+IBUF4-1 ) V3 = BUF( ISTR4+IBUF4 ) ELSE H22 = A( ICOL1*LDA+IROW1+1 ) V3 = A( ( ICOL1+1 )*LDA+IROW1+1 ) END IF IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 H21 = BUF( ISTR2+IBUF2 ) ELSE H21 = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF3 = IBUF3 + 1 H12 = BUF( ISTR3+IBUF3 ) ELSE H12 = A( ICOL1*LDA+IROW1 ) END IF IF( M.GT.L ) THEN H00 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF * * Adjust ICOL1 for next iteration where MODKM1=HBL-2 * ICOL1 = ICOL1 + 1 END IF IF( MODKM1.EQ.HBL-2 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 V3 = BUF( ISTR2+IBUF2 ) ELSE V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) END IF H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.LT.HBL-2 ) .AND. ( MODKM1.GT.0 ) ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S IF( M.EQ.L ) $ GO TO 30 TST1 = ABS( V1 )*( ABS( H00 )+ABS( H11 )+ABS( H22 ) ) IF( ABS( H10 )*( ABS( V2 )+ABS( V3 ) ).LE.ULP*TST1 ) $ GO TO 30 * * Slide indices diagonally up one for next iteration * IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF IF( M.EQ.L ) THEN * * Stop regardless of which node we are * GO TO 30 END IF * * Possibly change owners if on border * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 20 CONTINUE 30 CONTINUE * CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, M, 1, L, L, -1, -1, -1 ) * RETURN * * End of PDLACONSB * END scalapack-2.0.2/SRC/pdlacp2.f000644 000766 000024 00000037412 11750130340 016035 0ustar00juliestaff000000 000000 SUBROUTINE PDLACP2( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Purpose * ======= * * PDLACP2 copies all or part of a distributed matrix A to another * distributed matrix B. No communication is performed, PDLACP2 * performs a local copy sub( A ) := sub( B ), where sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1) and sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * PDLACP2 requires that only dimension of the matrix operands is * distributed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ) set as follows: * * if UPLO = 'U', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=j, 1<=j<=N; * if UPLO = 'L', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * j<=i<=M, 1<=j<=N; * otherwise, B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=M, 1<=j<=N. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER HEIGHT, IACOL, IAROW, IBASE, IBCOL, IBROW, $ ICOFFA, IIA, IIAA, IIB, IIBB, IIBEGA, IIBEGB, $ IIENDA, IINXTA, IINXTB, ILEFT, IRIGHT, IROFFA, $ ITOP, JJA, JJAA, JJB, JJBB, JJBEGA, JJBEGB, $ JJENDA, JJNXTA, JJNXTB, LDA, LDB, MBA, MP, $ MPAA, MYCOL, MYDIST, MYROW, NBA, NPCOL, NPROW, $ NQ, NQAA, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DLAMOV, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, JJB, $ IBROW, IBCOL ) * MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, MBA ) ICOFFA = MOD( JA-1, NBA ) LDB = DESCB( LLD_ ) * IF( N.LE.( NBA-ICOFFA ) ) THEN * * It is assumed that the local columns JJA:JJA+N-1 of the matrix * A are in the same process column (IACOL). * * N * JJA JJA+N-1 * / --------------------- \ * IROFFA| | | | * \ |...................| | ( IAROW ) * IIA |x | | MBA = DESCA( MB_ ) * | x | | * |--x----------------| / * | x | * | x | ITOP * | x | | * | x | /-------\ * |-------x-----------| |-------x-----------| * | x | | x | * | x | | x | * | x | | x | * | x | | x | * |------------x------| |------------x------| * | x | \____________/ * | x | | * | x | IBASE * | x | * |-----------------x-| Local picture * | x| * | | * | | * | | * |-------------------| * | | * . . * . . * . (IACOL) . * IF( MYCOL.EQ.IACOL ) THEN * MP = NUMROC( M+IROFFA, MBA, MYROW, IAROW, NPROW ) IF( MP.LE.0 ) $ RETURN IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA MYDIST = MOD( MYROW-IAROW+NPROW, NPROW ) ITOP = MYDIST * MBA - IROFFA * IF( LSAME( UPLO, 'U' ) ) THEN * ITOP = MAX( 0, ITOP ) IIBEGA = IIA IIENDA = IIA + MP - 1 IINXTA = MIN( ICEIL( IIBEGA, MBA ) * MBA, IIENDA ) IIBEGB = IIB IINXTB = IIBEGB + IINXTA - IIBEGA * 10 CONTINUE IF( ( N-ITOP ).GT.0 ) THEN CALL DLAMOV( UPLO, IINXTA-IIBEGA+1, N-ITOP, $ A( IIBEGA+(JJA+ITOP-1)*LDA ), LDA, $ B( IIBEGB+(JJB+ITOP-1)*LDB ), LDB ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IIBEGA = IINXTA + 1 IINXTA = MIN( IINXTA+MBA, IIENDA ) IIBEGB = IINXTB + 1 IINXTB = IIBEGB + IINXTA - IIBEGA GO TO 10 END IF * ELSE IF( LSAME( UPLO, 'L' ) ) THEN * MPAA = MP IIAA = IIA JJAA = JJA IIBB = IIB JJBB = JJB IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 20 CONTINUE IF( JJAA.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL DLAMOV( 'All', MPAA, ITOP-JJAA+JJA, $ A( IIAA+(JJAA-1)*LDA ), LDA, $ B( IIBB+(JJBB-1)*LDB ), LDB ) CALL DLAMOV( UPLO, MPAA, HEIGHT, $ A( IIAA+(JJA+ITOP-1)*LDA ), LDA, $ B( IIBB+(JJB+ITOP-1)*LDB ), LDB ) MPAA = MAX( 0, MPAA - HEIGHT ) IIAA = IIAA + HEIGHT JJAA = JJA + IBASE IIBB = IIBB + HEIGHT JJBB = JJB + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 20 END IF * ELSE * CALL DLAMOV( 'All', MP, N, A( IIA+(JJA-1)*LDA ), $ LDA, B( IIB+(JJB-1)*LDB ), LDB ) * END IF * END IF * ELSE IF( M.LE.( MBA-IROFFA ) ) THEN * * It is assumed that the local rows IIA:IIA+M-1 of the matrix A * are in the same process row (IAROW). * * ICOFFA * / \JJA * IIA ------------------ .... -------- * | .x | | | / | | \ * | . x | | | ILEFT| | | | * | . x | | | | | | * | . x | | \ x | | * | . |x | | |x | | IRIGHT * | . | x | | | x | | * (IAROW) | . | x | | | x | | * | . | x| | | x| | * | . | x | | x / * | . | |x | | | * | . | | x | | | * | . | | x | | | * | . | | x| | | * IIA+M-1 ------------------ .... ------- * NB_A * (IACOL) Local picture * IF( MYROW.EQ.IAROW ) THEN * NQ = NUMROC( N+ICOFFA, NBA, MYCOL, IACOL, NPCOL ) IF( NQ.LE.0 ) $ RETURN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA MYDIST = MOD( MYCOL-IACOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBA - ICOFFA * IF( LSAME( UPLO, 'L' ) ) THEN * ILEFT = MAX( 0, ILEFT ) JJBEGA = JJA JJENDA = JJA + NQ - 1 JJNXTA = MIN( ICEIL( JJBEGA, NBA ) * NBA, JJENDA ) JJBEGB = JJB JJNXTB = JJBEGB + JJNXTA - JJBEGA * 30 CONTINUE IF( ( M-ILEFT ).GT.0 ) THEN CALL DLAMOV( UPLO, M-ILEFT, JJNXTA-JJBEGA+1, $ A( IIA+ILEFT+(JJBEGA-1)*LDA ), LDA, $ B( IIB+ILEFT+(JJBEGB-1)*LDB ), LDB ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA JJBEGA = JJNXTA +1 JJNXTA = MIN( JJNXTA+NBA, JJENDA ) JJBEGB = JJNXTB +1 JJNXTB = JJBEGB + JJNXTA - JJBEGA GO TO 30 END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * NQAA = NQ IIAA = IIA JJAA = JJA IIBB = IIB JJBB = JJB IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 40 CONTINUE IF( IIAA.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL DLAMOV( 'All', ILEFT-IIAA+IIA, NQAA, $ A( IIAA+(JJAA-1)*LDA ), LDA, $ B( IIBB+(JJBB-1)*LDB ), LDB ) CALL DLAMOV( UPLO, WIDE, NQAA, $ A( IIA+ILEFT+(JJAA-1)*LDA ), LDA, $ B( IIB+ILEFT+(JJBB-1)*LDB ), LDB ) NQAA = MAX( 0, NQAA - WIDE ) IIAA = IIA + IRIGHT JJAA = JJAA + WIDE IIBB = IIB + IRIGHT JJBB = JJBB + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 40 END IF * ELSE * CALL DLAMOV( 'All', M, NQ, A( IIA+(JJA-1)*LDA ), $ LDA, B( IIB+(JJB-1)*LDB ), LDB ) * END IF * END IF * END IF * RETURN * * End of PDLACP2 * END scalapack-2.0.2/SRC/pdlacp3.f000644 000766 000024 00000030443 11642700517 016044 0ustar00juliestaff000000 000000 SUBROUTINE PDLACP3( M, I, A, DESCA, B, LDB, II, JJ, REV ) IMPLICIT NONE * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER I, II, JJ, LDB, M, REV * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), B( LDB, * ) * .. * * Purpose * ======= * * PDLACP3 is an auxiliary routine that copies from a global parallel * array into a local replicated array or vise versa. Notice that * the entire submatrix that is copied gets placed on one node or * more. The receiving node can be specified precisely, or all nodes * can receive, or just one row or column of nodes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * M is the order of the square submatrix that is copied. * M >= 0. * Unchanged on exit * * I (global input) INTEGER * A(I,I) is the global location that the copying starts from. * Unchanged on exit. * * A (global input/output) DOUBLE PRECISION array, dimension * (DESCA(LLD_),*) * On entry, the parallel matrix to be copied into or from. * On exit, if REV=1, the copied data. * Unchanged on exit if REV=0. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/output) DOUBLE PRECISION array of size (LDB,M) * If REV=0, this is the global portion of the array * A(I:I+M-1,I:I+M-1). * If REV=1, this is the unchanged on exit. * * LDB (local input) INTEGER * The leading dimension of B. * * II (global input) INTEGER * By using REV 0 & 1, data can be sent out and returned again. * If REV=0, then II is destination row index for the node(s) * receiving the replicated B. * If II>=0,JJ>=0, then node (II,JJ) receives the data * If II=-1,JJ>=0, then all rows in column JJ receive the * data * If II>=0,JJ=-1, then all cols in row II receive the data * If II=-1,JJ=-1, then all nodes receive the data * If REV<>0, then II is the source row index for the node(s) * sending the replicated B. * * JJ (global input) INTEGER * Similar description as II above * * REV (global input) INTEGER * Use REV = 0 to send global A into locally replicated B * (on node (II,JJ)). * Use REV <> 0 to send locally replicated B from node (II,JJ) * to its owner (which changes depending on its location in * A) into the global A. * * Implemented by: G. Henry, May 1, 1997 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER COL, CONTXT, HBL, IAFIRST, ICOL1, ICOL2, IDI, $ IDJ, IFIN, III, IROW1, IROW2, ISTOP, ISTOPI, $ ISTOPJ, ITMP, JAFIRST, JJJ, LDA, MYCOL, MYROW, $ NPCOL, NPROW, ROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, DGERV2D, $ DGESD2D, INFOG1L * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.LE.0 ) $ RETURN * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( REV.EQ.0 ) THEN DO 20 IDI = 1, M DO 10 IDJ = 1, M B( IDI, IDJ ) = ZERO 10 CONTINUE 20 CONTINUE END IF * IFIN = I + M - 1 * IF( MOD( I+HBL, HBL ).NE.0 ) THEN ISTOP = MIN( I+HBL-MOD( I+HBL, HBL ), IFIN ) ELSE ISTOP = I END IF IDJ = I ISTOPJ = ISTOP IF( IDJ.LE.IFIN ) THEN 30 CONTINUE IDI = I ISTOPI = ISTOP IF( IDI.LE.IFIN ) THEN 40 CONTINUE ROW = MOD( ( IDI-1 ) / HBL + IAFIRST, NPROW ) COL = MOD( ( IDJ-1 ) / HBL + JAFIRST, NPCOL ) CALL INFOG1L( IDI, HBL, NPROW, ROW, IAFIRST, IROW1, ITMP ) IROW2 = NUMROC( ISTOPI, HBL, ROW, IAFIRST, NPROW ) CALL INFOG1L( IDJ, HBL, NPCOL, COL, JAFIRST, ICOL1, ITMP ) ICOL2 = NUMROC( ISTOPJ, HBL, COL, JAFIRST, NPCOL ) IF( ( MYROW.EQ.ROW ) .AND. ( MYCOL.EQ.COL ) ) THEN IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to everyone * IF( REV.EQ.0 ) THEN CALL DGEBS2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.NE.-1 ) ) THEN * * Send the message to Column MYCOL which better be JJ * IF( REV.EQ.0 ) THEN CALL DGEBS2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to Row MYROW which better be II * IF( REV.EQ.0 ) THEN CALL DGEBS2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.NE.-1 ) .AND. $ ( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) ) THEN * * Recv/Send the message to (II,JJ) * IF( REV.EQ.0 ) THEN CALL DGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ A( ( ICOL1-1 )*LDA+IROW1 ), LDA, II, $ JJ ) ELSE CALL DGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, II, JJ ) END IF END IF IF( REV.EQ.0 ) THEN DO 60 JJJ = ICOL1, ICOL2 DO 50 III = IROW1, IROW2 B( IDI+III-IROW1+1-I, IDJ+JJJ-ICOL1+1-I ) $ = A( ( JJJ-1 )*LDA+III ) 50 CONTINUE 60 CONTINUE ELSE DO 80 JJJ = ICOL1, ICOL2 DO 70 III = IROW1, IROW2 A( ( JJJ-1 )*LDA+III ) = B( IDI+III-IROW1+1-I, $ IDJ+JJJ-ICOL1+1-I ) 70 CONTINUE 80 CONTINUE END IF ELSE IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL DGEBR2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL DGEBR2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL DGEBR2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL DGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) ELSE CALL DGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) * CALL DGESD2D(CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, * $ A((ICOL1-1)*LDA+IROW1),LDA, ROW, COL) END IF END IF END IF IDI = ISTOPI + 1 ISTOPI = MIN( ISTOPI+HBL, IFIN ) IF( IDI.LE.IFIN ) $ GO TO 40 END IF IDJ = ISTOPJ + 1 ISTOPJ = MIN( ISTOPJ+HBL, IFIN ) IF( IDJ.LE.IFIN ) $ GO TO 30 END IF RETURN * * End of PDLACP3 * END scalapack-2.0.2/SRC/pdlacpy.f000644 000766 000024 00000022404 10363532303 016143 0ustar00juliestaff000000 000000 SUBROUTINE PDLACPY( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Purpose * ======= * * PDLACPY copies all or part of a distributed matrix A to another * distributed matrix B. No communication is performed, PDLACPY * performs a local copy sub( A ) := sub( B ), where sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1) and sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ) set as follows: * * if UPLO = 'U', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=j, 1<=j<=N; * if UPLO = 'L', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * j<=i<=M, 1<=j<=N; * otherwise, B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=M, 1<=j<=N. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, IAA, IBB, IBLK, IN, ITMP, J, JAA, JBB, $ JBLK, JN, JTMP * .. * .. External Subroutines .. EXTERNAL PDLACP2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) * IF( M.LE.( DESCA( MB_ ) - MOD( IA-1, DESCA( MB_ ) ) ) .OR. $ N.LE.( DESCA( NB_ ) - MOD( JA-1, DESCA( NB_ ) ) ) ) THEN CALL PDLACP2( UPLO, M, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) ELSE * IF( LSAME( UPLO, 'U' ) ) THEN CALL PDLACP2( UPLO, IN-IA+1, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) IBB = IB + ITMP JBB = JB + ITMP JAA = JA + ITMP CALL PDLACP2( UPLO, IBLK, N-ITMP, A, I, JAA, DESCA, $ B, IBB, JBB, DESCB ) 10 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN CALL PDLACP2( UPLO, M, JN-JA+1, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) IBB = IB + JTMP JBB = JB + JTMP IAA = IA + JTMP CALL PDLACP2( UPLO, M-JTMP, JBLK, A, IAA, J, DESCA, $ B, IBB, JBB, DESCB ) 20 CONTINUE ELSE IF( M.LE.N ) THEN CALL PDLACP2( UPLO, IN-IA+1, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) IBB = IB+ITMP CALL PDLACP2( UPLO, IBLK, N, A, I, JA, DESCA, $ B, IBB, JB, DESCB ) 30 CONTINUE ELSE CALL PDLACP2( UPLO, M, JN-JA+1, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) JBB = JB+JTMP CALL PDLACP2( UPLO, M, JBLK, A, IA, J, DESCA, $ B, IB, JBB, DESCB ) 40 CONTINUE END IF END IF * END IF * RETURN * * End of PDLACPY * END scalapack-2.0.2/SRC/pdlaed0.f000644 000766 000024 00000017562 10363532303 016031 0ustar00juliestaff000000 000000 SUBROUTINE PDLAED0( N, D, E, Q, IQ, JQ, DESCQ, WORK, IWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER INFO, IQ, JQ, N * .. * .. Array Arguments .. INTEGER DESCQ( * ), IWORK( * ) DOUBLE PRECISION D( * ), E( * ), Q( * ), WORK( * ) * .. * * Purpose * ======= * * PDLAED0 computes all eigenvalues and corresponding eigenvectors of a * symmetric tridiagonal matrix using the divide and conquer method. * * * Arguments * ========= * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * D (global input/output) DOUBLE PRECISION array, dimension (N) * On entry, the diagonal elements of the tridiagonal matrix. * On exit, if INFO = 0, the eigenvalues in descending order. * * E (global input/output) DOUBLE PRECISION array, dimension (N-1) * On entry, the subdiagonal elements of the tridiagonal matrix. * On exit, E has been destroyed. * * Q (local output) DOUBLE PRECISION array, * global dimension (N, N), * local dimension ( LLD_Q, LOCc(JQ+N-1)) * Q contains the orthonormal eigenvectors of the symmetric * tridiagonal matrix. * On output, Q is distributed across the P processes in block * cyclic format. * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * * WORK (local workspace ) DOUBLE PRECISION array, dimension (LWORK) * LWORK = 6*N + 2*NP*NQ, with * NP = NUMROC( N, MB_Q, MYROW, IQROW, NPROW ) * NQ = NUMROC( N, NB_Q, MYCOL, IQCOL, NPCOL ) * IQROW = INDXG2P( IQ, NB_Q, MYROW, RSRC_Q, NPROW ) * IQCOL = INDXG2P( JQ, MB_Q, MYCOL, CSRC_Q, NPCOL ) * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * LIWORK = 2 + 7*N + 8*NPCOL * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: The algorithm failed to compute the INFO/(N+1) th * eigenvalue while working on the submatrix lying in * global rows and columns mod(INFO,N+1). * * ===================================================================== * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ID, IDCOL, IDROW, IID, IINFO, IIQ, IM1, IM2, $ IPQ, IQCOL, IQROW, J, JJD, JJQ, LDQ, MATSIZ, $ MYCOL, MYROW, N1, NB, NBL, NBL1, NPCOL, NPROW, $ SUBPBS, TSUBPBS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, DGERV2D, $ DGESD2D, DSTEQR, INFOG2L, PDLAED1, PXERBLA * .. * .. External Functions .. * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Test the input parameters. * CALL BLACS_GRIDINFO( DESCQ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 IF( DESCQ( NB_ ).GT.N .OR. N.LT.2 ) $ INFO = -1 IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCQ( CTXT_ ), 'PDLAED0', -INFO ) RETURN END IF * NB = DESCQ( NB_ ) LDQ = DESCQ( LLD_ ) CALL INFOG2L( IQ, JQ, DESCQ, NPROW, NPCOL, MYROW, MYCOL, IIQ, JJQ, $ IQROW, IQCOL ) * * Determine the size and placement of the submatrices, and save in * the leading elements of IWORK. * TSUBPBS = ( N-1 ) / NB + 1 IWORK( 1 ) = TSUBPBS SUBPBS = 1 10 CONTINUE IF( IWORK( SUBPBS ).GT.1 ) THEN DO 20 J = SUBPBS, 1, -1 IWORK( 2*J ) = ( IWORK( J )+1 ) / 2 IWORK( 2*J-1 ) = IWORK( J ) / 2 20 CONTINUE SUBPBS = 2*SUBPBS GO TO 10 END IF DO 30 J = 2, SUBPBS IWORK( J ) = IWORK( J ) + IWORK( J-1 ) 30 CONTINUE * * Divide the matrix into TSUBPBS submatrices of size at most NB * using rank-1 modifications (cuts). * DO 40 I = NB + 1, N, NB IM1 = I - 1 D( IM1 ) = D( IM1 ) - ABS( E( IM1 ) ) D( I ) = D( I ) - ABS( E( IM1 ) ) 40 CONTINUE * * Solve each submatrix eigenproblem at the bottom of the divide and * conquer tree. D is the same on each process. * DO 50 ID = 1, N, NB CALL INFOG2L( IQ-1+ID, JQ-1+ID, DESCQ, NPROW, NPCOL, MYROW, $ MYCOL, IID, JJD, IDROW, IDCOL ) MATSIZ = MIN( NB, N-ID+1 ) IF( MYROW.EQ.IDROW .AND. MYCOL.EQ.IDCOL ) THEN IPQ = IID + ( JJD-1 )*LDQ CALL DSTEQR( 'I', MATSIZ, D( ID ), E( ID ), Q( IPQ ), LDQ, $ WORK, INFO ) IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCQ( CTXT_ ), 'DSTEQR', -INFO ) RETURN END IF IF( MYROW.NE.IQROW .OR. MYCOL.NE.IQCOL ) THEN CALL DGESD2D( DESCQ( CTXT_ ), MATSIZ, 1, D( ID ), MATSIZ, $ IQROW, IQCOL ) END IF ELSE IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN CALL DGERV2D( DESCQ( CTXT_ ), MATSIZ, 1, D( ID ), MATSIZ, $ IDROW, IDCOL ) END IF 50 CONTINUE * IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN CALL DGEBS2D( DESCQ( CTXT_ ), 'A', ' ', N, 1, D, N ) ELSE CALL DGEBR2D( DESCQ( CTXT_ ), 'A', ' ', N, 1, D, N, IQROW, $ IQCOL ) END IF * * Successively merge eigensystems of adjacent submatrices * into eigensystem for the corresponding larger matrix. * * while ( SUBPBS > 1 ) * 60 CONTINUE IF( SUBPBS.GT.1 ) THEN IM2 = SUBPBS - 2 DO 80 I = 0, IM2, 2 IF( I.EQ.0 ) THEN NBL = IWORK( 2 ) NBL1 = IWORK( 1 ) IF( NBL1.EQ.0 ) $ GO TO 70 ID = 1 MATSIZ = MIN( N, NBL*NB ) N1 = NBL1*NB ELSE NBL = IWORK( I+2 ) - IWORK( I ) NBL1 = NBL / 2 IF( NBL1.EQ.0 ) $ GO TO 70 ID = IWORK( I )*NB + 1 MATSIZ = MIN( NB*NBL, N-ID+1 ) N1 = NBL1*NB END IF * * Merge lower order eigensystems (of size N1 and MATSIZ - N1) * into an eigensystem of size MATSIZ. * CALL PDLAED1( MATSIZ, N1, D( ID ), ID, Q, IQ, JQ, DESCQ, $ E( ID+N1-1 ), WORK, IWORK( SUBPBS+1 ), IINFO ) IF( IINFO.NE.0 ) THEN INFO = IINFO*( N+1 ) + ID END IF * 70 CONTINUE IWORK( I / 2+1 ) = IWORK( I+2 ) 80 CONTINUE SUBPBS = SUBPBS / 2 * GO TO 60 END IF * * end while * 90 CONTINUE RETURN * * End of PDLAED0 * END scalapack-2.0.2/SRC/pdlaed1.f000644 000766 000024 00000022534 10363532303 016025 0ustar00juliestaff000000 000000 SUBROUTINE PDLAED1( N, N1, D, ID, Q, IQ, JQ, DESCQ, RHO, WORK, $ IWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER ID, INFO, IQ, JQ, N, N1 DOUBLE PRECISION RHO * .. * .. Array Arguments .. INTEGER DESCQ( * ), IWORK( * ) DOUBLE PRECISION D( * ), Q( * ), WORK( * ) * .. * * Purpose * ======= * * PDLAED1 computes the updated eigensystem of a diagonal * matrix after modification by a rank-one symmetric matrix, * in parallel. * * T = Q(in) ( D(in) + RHO * Z*Z' ) Q'(in) = Q(out) * D(out) * Q'(out) * * where Z = Q'u, u is a vector of length N with ones in the * N1 and N1 + 1 th elements and zeros elsewhere. * * The eigenvectors of the original matrix are stored in Q, and the * eigenvalues are in D. The algorithm consists of three stages: * * The first stage consists of deflating the size of the problem * when there are multiple eigenvalues or if there is a zero in * the Z vector. For each such occurence the dimension of the * secular equation problem is reduced by one. This stage is * performed by the routine PDLAED2. * * The second stage consists of calculating the updated * eigenvalues. This is done by finding the roots of the secular * equation via the routine SLAED4 (as called by PDLAED3). * This routine also calculates the eigenvectors of the current * problem. * * The final stage consists of computing the updated eigenvectors * directly using the updated eigenvalues. The eigenvectors for * the current problem are multiplied with the eigenvectors from * the overall problem. * * Arguments * ========= * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * * N1 (input) INTEGER * The location of the last eigenvalue in the leading * sub-matrix. * min(1,N) <= N1 <= N. * * D (global input/output) DOUBLE PRECISION array, dimension (N) * On entry,the eigenvalues of the rank-1-perturbed matrix. * On exit, the eigenvalues of the repaired matrix. * * ID (global input) INTEGER * Q's global row/col index, which points to the beginning * of the submatrix which is to be operated on. * * Q (local output) DOUBLE PRECISION array, * global dimension (N, N), * local dimension ( LLD_Q, LOCc(JQ+N-1)) * Q contains the orthonormal eigenvectors of the symmetric * tridiagonal matrix. * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * RHO (input) DOUBLE PRECISION * The subdiagonal entry used to create the rank-1 modification. * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension 6*N + 2*NP*NQ * * IWORK (local workspace/output) INTEGER array, * dimension 7*N + 8*NPCOL + 2 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: The algorithm failed to compute the ith eigenvalue. * * ===================================================================== * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER COL, COLTYP, IBUF, ICTOT, ICTXT, IDLMDA, IIQ, $ INDCOL, INDROW, INDX, INDXC, INDXP, INDXR, INQ, $ IPQ, IPQ2, IPSM, IPU, IPWORK, IQ1, IQ2, IQCOL, $ IQQ, IQROW, IW, IZ, J, JC, JJ2C, JJC, JJQ, JNQ, $ K, LDQ, LDQ2, LDU, MYCOL, MYROW, NB, NN, NN1, $ NN2, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCQ2( DLEN_ ), DESCU( DLEN_ ) * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DESCINIT, INFOG1L, $ INFOG2L, PDGEMM, PDLAED2, PDLAED3, PDLAEDZ, $ PDLASET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * * Test the input parameters. * CALL BLACS_GRIDINFO( DESCQ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE IF( N.LT.0 ) THEN INFO = -1 ELSE IF( ID.GT.DESCQ( N_ ) ) THEN INFO = -4 ELSE IF( N1.GE.N ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCQ( CTXT_ ), 'PDLAED1', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * The following values are integer pointers which indicate * the portion of the workspace used by a particular array * in PDLAED2 and PDLAED3. * ICTXT = DESCQ( CTXT_ ) NB = DESCQ( NB_ ) LDQ = DESCQ( LLD_ ) * CALL INFOG2L( IQ-1+ID, JQ-1+ID, DESCQ, NPROW, NPCOL, MYROW, MYCOL, $ IIQ, JJQ, IQROW, IQCOL ) * NP = NUMROC( N, DESCQ( MB_ ), MYROW, IQROW, NPROW ) NQ = NUMROC( N, DESCQ( NB_ ), MYCOL, IQCOL, NPCOL ) * LDQ2 = MAX( NP, 1 ) LDU = LDQ2 * IZ = 1 IDLMDA = IZ + N IW = IDLMDA + N IPQ2 = IW + N IPU = IPQ2 + LDQ2*NQ IBUF = IPU + LDU*NQ * (IBUF est de taille 3*N au maximum) * ICTOT = 1 IPSM = ICTOT + NPCOL*4 INDX = IPSM + NPCOL*4 INDXC = INDX + N INDXP = INDXC + N INDCOL = INDXP + N COLTYP = INDCOL + N INDROW = COLTYP + N INDXR = INDROW + N * CALL DESCINIT( DESCQ2, N, N, NB, NB, IQROW, IQCOL, ICTXT, LDQ2, $ INFO ) CALL DESCINIT( DESCU, N, N, NB, NB, IQROW, IQCOL, ICTXT, LDU, $ INFO ) * * Form the z-vector which consists of the last row of Q_1 and the * first row of Q_2. * IPWORK = IDLMDA CALL PDLAEDZ( N, N1, ID, Q, IQ, JQ, LDQ, DESCQ, WORK( IZ ), $ WORK( IPWORK ) ) * * Deflate eigenvalues. * IPQ = IIQ + ( JJQ-1 )*LDQ CALL PDLAED2( ICTXT, K, N, N1, NB, D, IQROW, IQCOL, Q( IPQ ), LDQ, $ RHO, WORK( IZ ), WORK( IW ), WORK( IDLMDA ), $ WORK( IPQ2 ), LDQ2, WORK( IBUF ), IWORK( ICTOT ), $ IWORK( IPSM ), NPCOL, IWORK( INDX ), IWORK( INDXC ), $ IWORK( INDXP ), IWORK( INDCOL ), IWORK( COLTYP ), $ NN, NN1, NN2, IQ1, IQ2 ) * * * Solve Secular Equation. * IF( K.NE.0 ) THEN CALL PDLASET( 'A', N, N, ZERO, ONE, WORK( IPU ), 1, 1, DESCU ) CALL PDLAED3( ICTXT, K, N, NB, D, IQROW, IQCOL, RHO, $ WORK( IDLMDA ), WORK( IW ), WORK( IZ ), $ WORK( IPU ), LDQ2, WORK( IBUF ), IWORK( INDX ), $ IWORK( INDCOL ), IWORK( INDROW ), IWORK( INDXR ), $ IWORK( INDXC ), IWORK( ICTOT ), NPCOL, INFO ) * * Compute the updated eigenvectors. * IQQ = MIN( IQ1, IQ2 ) IF( NN1.GT.0 ) THEN INQ = IQ - 1 + ID JNQ = JQ - 1 + ID + IQQ - 1 CALL PDGEMM( 'N', 'N', N1, NN, NN1, ONE, WORK( IPQ2 ), 1, $ IQ1, DESCQ2, WORK( IPU ), IQ1, IQQ, DESCU, $ ZERO, Q, INQ, JNQ, DESCQ ) END IF IF( NN2.GT.0 ) THEN INQ = IQ - 1 + ID + N1 JNQ = JQ - 1 + ID + IQQ - 1 CALL PDGEMM( 'N', 'N', N-N1, NN, NN2, ONE, WORK( IPQ2 ), $ N1+1, IQ2, DESCQ2, WORK( IPU ), IQ2, IQQ, $ DESCU, ZERO, Q, INQ, JNQ, DESCQ ) END IF * DO 10 J = K + 1, N JC = IWORK( INDX+J-1 ) CALL INFOG1L( JQ-1+JC, NB, NPCOL, MYCOL, IQCOL, JJC, COL ) CALL INFOG1L( JC, NB, NPCOL, MYCOL, IQCOL, JJ2C, COL ) IF( MYCOL.EQ.COL ) THEN IQ2 = IPQ2 + ( JJ2C-1 )*LDQ2 INQ = IPQ + ( JJC-1 )*LDQ CALL DCOPY( NP, WORK( IQ2 ), 1, Q( INQ ), 1 ) END IF 10 CONTINUE END IF * 20 CONTINUE RETURN * * End of PDLAED1 * END scalapack-2.0.2/SRC/pdlaed2.f000644 000766 000024 00000035135 10363532303 016027 0ustar00juliestaff000000 000000 SUBROUTINE PDLAED2( ICTXT, K, N, N1, NB, D, DROW, DCOL, Q, LDQ, $ RHO, Z, W, DLAMDA, Q2, LDQ2, QBUF, CTOT, PSM, $ NPCOL, INDX, INDXC, INDXP, INDCOL, COLTYP, NN, $ NN1, NN2, IB1, IB2 ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER DCOL, DROW, IB1, IB2, ICTXT, K, LDQ, LDQ2, N, $ N1, NB, NN, NN1, NN2, NPCOL DOUBLE PRECISION RHO * .. * .. Array Arguments .. INTEGER COLTYP( * ), CTOT( 0: NPCOL-1, 4 ), $ INDCOL( N ), INDX( * ), INDXC( * ), INDXP( * ), $ PSM( 0: NPCOL-1, 4 ) DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), $ Q2( LDQ2, * ), QBUF( * ), W( * ), Z( * ) * .. * * Purpose * ======= * * PDLAED2 sorts the two sets of eigenvalues together into a single * sorted set. Then it tries to deflate the size of the problem. * There are two ways in which deflation can occur: when two or more * eigenvalues are close together or if there is a tiny entry in the * Z vector. For each such occurrence the order of the related secular * equation problem is reduced by one. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation on the matrix. The context itself is global. * * K (output) INTEGER * The number of non-deflated eigenvalues, and the order of the * related secular equation. 0 <= K <=N. * * N (input) INTEGER * The dimension of the symmetric tridiagonal matrix. N >= 0. * * N1 (input) INTEGER * The location of the last eigenvalue in the leading sub-matrix. * min(1,N) < N1 < N. * * NB (global input) INTEGER * The blocking factor used to distribute the columns of the * matrix. NB >= 1. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, D contains the eigenvalues of the two submatrices to * be combined. * On exit, D contains the trailing (N-K) updated eigenvalues * (those which were deflated) sorted into increasing order. * * DROW (global input) INTEGER * The process row over which the first row of the matrix D is * distributed. 0 <= DROW < NPROW. * * DCOL (global input) INTEGER * The process column over which the first column of the * matrix D is distributed. 0 <= DCOL < NPCOL. * * Q (input/output) DOUBLE PRECISION array, dimension (LDQ, N) * On entry, Q contains the eigenvectors of two submatrices in * the two square blocks with corners at (1,1), (N1,N1) * and (N1+1, N1+1), (N,N). * On exit, Q contains the trailing (N-K) updated eigenvectors * (those which were deflated) in its last N-K columns. * * LDQ (input) INTEGER * The leading dimension of the array Q. LDQ >= max(1,NQ). * * RHO (global input/output) DOUBLE PRECISION * On entry, the off-diagonal element associated with the rank-1 * cut which originally split the two submatrices which are now * being recombined. * On exit, RHO has been modified to the value required by * PDLAED3. * * Z (global input) DOUBLE PRECISION array, dimension (N) * On entry, Z contains the updating vector (the last * row of the first sub-eigenvector matrix and the first row of * the second sub-eigenvector matrix). * On exit, the contents of Z have been destroyed by the updating * process. * * DLAMDA (global output) DOUBLE PRECISION array, dimension (N) * A copy of the first K eigenvalues which will be used by * SLAED3 to form the secular equation. * * W (global output) DOUBLE PRECISION array, dimension (N) * The first k values of the final deflation-altered z-vector * which will be passed to SLAED3. * * Q2 (output) DOUBLE PRECISION array, dimension (LDQ2, NQ) * A copy of the first K eigenvectors which will be used by * * LDQ2 (input) INTEGER * The leading dimension of the array Q2. * * QBUF (workspace) DOUBLE PRECISION array, dimension 3*N * * CTOT (workspace) INTEGER array, dimension( NPCOL, 4) * * PSM (workspace) INTEGER array, dimension( NPCOL, 4) * * NPCOL (global input) INTEGER * The total number of columns over which the distributed * submatrix is distributed. * * INDX (workspace) INTEGER array, dimension (N) * The permutation used to sort the contents of DLAMDA into * ascending order. * * INDXC (output) INTEGER array, dimension (N) * The permutation used to arrange the columns of the deflated * Q matrix into three groups: the first group contains non-zero * elements only at and above N1, the second contains * non-zero elements only below N1, and the third is dense. * * INDXP (workspace) INTEGER array, dimension (N) * The permutation used to place deflated values of D at the end * of the array. INDXP(1:K) points to the nondeflated D-values * and INDXP(K+1:N) points to the deflated eigenvalues. * * INDCOL (workspace) INTEGER array, dimension (N) * * COLTYP (workspace/output) INTEGER array, dimension (N) * During execution, a label which will indicate which of the * following types a column in the Q2 matrix is: * 1 : non-zero in the upper half only; * 2 : dense; * 3 : non-zero in the lower half only; * 4 : deflated. * * NN (global output) INTEGER, the order of matrix U, (PDLAED1). * NN1 (global output) INTEGER, the order of matrix Q1, (PDLAED1). * NN2 (global output) INTEGER, the order of matrix Q2, (PDLAED1). * IB1 (global output) INTEGER, pointeur on Q1, (PDLAED1). * IB2 (global output) INTEGER, pointeur on Q2, (PDLAED1). * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION MONE, ZERO, ONE, TWO, EIGHT PARAMETER ( MONE = -1.0D0, ZERO = 0.0D0, ONE = 1.0D0, $ TWO = 2.0D0, EIGHT = 8.0D0 ) * .. * .. Local Scalars .. INTEGER COL, CT, I, IAM, IE1, IE2, IMAX, INFO, J, JJQ2, $ JJS, JMAX, JS, K2, MYCOL, MYROW, N1P1, N2, NJ, $ NJCOL, NJJ, NP, NPROCS, NPROW, PJ, PJCOL, PJJ DOUBLE PRECISION C, EPS, S, T, TAU, TOL * .. * .. External Functions .. INTEGER IDAMAX, INDXG2L, INDXL2G, NUMROC DOUBLE PRECISION DLAPY2, PDLAMCH EXTERNAL IDAMAX, INDXG2L, INDXL2G, NUMROC, PDLAMCH, $ DLAPY2 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, DCOPY, DGERV2D, $ DGESD2D, DLAPST, DROT, DSCAL, INFOG1L * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. External Functions .. * .. * .. Local Arrays .. INTEGER PTT( 4 ) * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL BLACS_PINFO( IAM, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NUMROC( N, NB, MYROW, DROW, NPROW ) * N2 = N - N1 N1P1 = N1 + 1 * IF( RHO.LT.ZERO ) THEN CALL DSCAL( N2, MONE, Z( N1P1 ), 1 ) END IF * * Normalize z so that norm(z) = 1. Since z is the concatenation of * two normalized vectors, norm2(z) = sqrt(2). * T = ONE / SQRT( TWO ) CALL DSCAL( N, T, Z, 1 ) * * RHO = ABS( norm(z)**2 * RHO ) * RHO = ABS( TWO*RHO ) * * Calculate the allowable deflation tolerance * IMAX = IDAMAX( N, Z, 1 ) JMAX = IDAMAX( N, D, 1 ) EPS = PDLAMCH( ICTXT, 'Epsilon' ) TOL = EIGHT*EPS*MAX( ABS( D( JMAX ) ), ABS( Z( IMAX ) ) ) * * If the rank-1 modifier is small enough, no more needs to be done * except to reorganize Q so that its columns correspond with the * elements in D. * IF( RHO*ABS( Z( IMAX ) ).LE.TOL ) THEN K = 0 GO TO 220 END IF * * If there are multiple eigenvalues then the problem deflates. Here * the number of equal eigenvalues are found. As each equal * eigenvalue is found, an elementary reflector is computed to rotate * the corresponding eigensubspace so that the corresponding * components of Z are zero in this new basis. * * CALL DLAPST( 'I', N, D, INDX, INFO ) * DO 10 I = 1, N1 COLTYP( I ) = 1 10 CONTINUE DO 20 I = N1P1, N COLTYP( I ) = 3 20 CONTINUE COL = DCOL DO 40 I = 1, N, NB DO 30 J = 0, NB - 1 IF( I+J.LE.N ) $ INDCOL( I+J ) = COL 30 CONTINUE COL = MOD( COL+1, NPCOL ) 40 CONTINUE * K = 0 K2 = N + 1 DO 50 J = 1, N NJ = INDX( J ) IF( RHO*ABS( Z( NJ ) ).LE.TOL ) THEN * * Deflate due to small z component. * K2 = K2 - 1 COLTYP( NJ ) = 4 INDXP( K2 ) = NJ IF( J.EQ.N ) $ GO TO 80 ELSE PJ = NJ GO TO 60 END IF 50 CONTINUE 60 CONTINUE J = J + 1 NJ = INDX( J ) IF( J.GT.N ) $ GO TO 80 IF( RHO*ABS( Z( NJ ) ).LE.TOL ) THEN * * Deflate due to small z component. * K2 = K2 - 1 COLTYP( NJ ) = 4 INDXP( K2 ) = NJ ELSE * * Check if eigenvalues are close enough to allow deflation. * S = Z( PJ ) C = Z( NJ ) * * Find sqrt(a**2+b**2) without overflow or * destructive underflow. * TAU = DLAPY2( C, S ) T = D( NJ ) - D( PJ ) C = C / TAU S = -S / TAU IF( ABS( T*C*S ).LE.TOL ) THEN * * Deflation is possible. * Z( NJ ) = TAU Z( PJ ) = ZERO IF( COLTYP( NJ ).NE.COLTYP( PJ ) ) $ COLTYP( NJ ) = 2 COLTYP( PJ ) = 4 CALL INFOG1L( NJ, NB, NPCOL, MYCOL, DCOL, NJJ, NJCOL ) CALL INFOG1L( PJ, NB, NPCOL, MYCOL, DCOL, PJJ, PJCOL ) IF( INDCOL( PJ ).EQ.INDCOL( NJ ) .AND. MYCOL.EQ.NJCOL ) THEN CALL DROT( NP, Q( 1, PJJ ), 1, Q( 1, NJJ ), 1, C, S ) ELSE IF( MYCOL.EQ.PJCOL ) THEN CALL DGESD2D( ICTXT, NP, 1, Q( 1, PJJ ), NP, MYROW, $ NJCOL ) CALL DGERV2D( ICTXT, NP, 1, QBUF, NP, MYROW, NJCOL ) CALL DROT( NP, Q( 1, PJJ ), 1, QBUF, 1, C, S ) ELSE IF( MYCOL.EQ.NJCOL ) THEN CALL DGESD2D( ICTXT, NP, 1, Q( 1, NJJ ), NP, MYROW, $ PJCOL ) CALL DGERV2D( ICTXT, NP, 1, QBUF, NP, MYROW, PJCOL ) CALL DROT( NP, QBUF, 1, Q( 1, NJJ ), 1, C, S ) END IF T = D( PJ )*C**2 + D( NJ )*S**2 D( NJ ) = D( PJ )*S**2 + D( NJ )*C**2 D( PJ ) = T K2 = K2 - 1 I = 1 70 CONTINUE IF( K2+I.LE.N ) THEN IF( D( PJ ).LT.D( INDXP( K2+I ) ) ) THEN INDXP( K2+I-1 ) = INDXP( K2+I ) INDXP( K2+I ) = PJ I = I + 1 GO TO 70 ELSE INDXP( K2+I-1 ) = PJ END IF ELSE INDXP( K2+I-1 ) = PJ END IF PJ = NJ ELSE K = K + 1 DLAMDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ PJ = NJ END IF END IF GO TO 60 80 CONTINUE * * Record the last eigenvalue. * K = K + 1 DLAMDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ * * Count up the total number of the various types of columns, then * form a permutation which positions the four column types into * four uniform groups (although one or more of these groups may be * empty). * DO 100 J = 1, 4 DO 90 I = 0, NPCOL - 1 CTOT( I, J ) = 0 90 CONTINUE PTT( J ) = 0 100 CONTINUE DO 110 J = 1, N CT = COLTYP( J ) COL = INDCOL( J ) CTOT( COL, CT ) = CTOT( COL, CT ) + 1 110 CONTINUE * * PSM(*) = Position in SubMatrix (of types 1 through 4) * DO 120 COL = 0, NPCOL - 1 PSM( COL, 1 ) = 1 PSM( COL, 2 ) = 1 + CTOT( COL, 1 ) PSM( COL, 3 ) = PSM( COL, 2 ) + CTOT( COL, 2 ) PSM( COL, 4 ) = PSM( COL, 3 ) + CTOT( COL, 3 ) 120 CONTINUE PTT( 1 ) = 1 DO 140 I = 2, 4 CT = 0 DO 130 J = 0, NPCOL - 1 CT = CT + CTOT( J, I-1 ) 130 CONTINUE PTT( I ) = PTT( I-1 ) + CT 140 CONTINUE * * Fill out the INDXC array so that the permutation which it induces * will place all type-1 columns first, all type-2 columns next, * then all type-3's, and finally all type-4's. * DO 150 J = 1, N JS = INDXP( J ) COL = INDCOL( JS ) CT = COLTYP( JS ) I = INDXL2G( PSM( COL, CT ), NB, COL, DCOL, NPCOL ) INDX( J ) = I INDXC( PTT( CT ) ) = I PSM( COL, CT ) = PSM( COL, CT ) + 1 PTT( CT ) = PTT( CT ) + 1 150 CONTINUE * * DO 160 J = 1, N JS = INDXP( J ) JJS = INDXG2L( JS, NB, J, J, NPCOL ) COL = INDCOL( JS ) IF( COL.EQ.MYCOL ) THEN I = INDX( J ) JJQ2 = INDXG2L( I, NB, J, J, NPCOL ) CALL DCOPY( NP, Q( 1, JJS ), 1, Q2( 1, JJQ2 ), 1 ) END IF 160 CONTINUE * * * The deflated eigenvalues and their corresponding vectors go back * into the last N - K slots of D and Q respectively. * CALL DCOPY( N, D, 1, Z, 1 ) DO 170 J = K + 1, N JS = INDXP( J ) I = INDX( J ) D( I ) = Z( JS ) 170 CONTINUE * PTT( 1 ) = 1 DO 190 I = 2, 4 CT = 0 DO 180 J = 0, NPCOL - 1 CT = CT + CTOT( J, I-1 ) 180 CONTINUE PTT( I ) = PTT( I-1 ) + CT 190 CONTINUE * * IB1 = INDXC( 1 ) IE1 = IB1 IB2 = INDXC( PTT( 2 ) ) IE2 = IB2 DO 200 I = 2, PTT( 3 ) - 1 IB1 = MIN( IB1, INDXC( I ) ) IE1 = MAX( IE1, INDXC( I ) ) 200 CONTINUE DO 210 I = PTT( 2 ), PTT( 4 ) - 1 IB2 = MIN( IB2, INDXC( I ) ) IE2 = MAX( IE2, INDXC( I ) ) 210 CONTINUE NN1 = IE1 - IB1 + 1 NN2 = IE2 - IB2 + 1 NN = MAX( IE1, IE2 ) - MIN( IB1, IB2 ) + 1 220 CONTINUE RETURN * * End of PDLAED2 * END scalapack-2.0.2/SRC/pdlaed3.f000644 000766 000024 00000026623 10363532303 016032 0ustar00juliestaff000000 000000 SUBROUTINE PDLAED3( ICTXT, K, N, NB, D, DROW, DCOL, RHO, DLAMDA, $ W, Z, U, LDU, BUF, INDX, INDCOL, INDROW, $ INDXR, INDXC, CTOT, NPCOL, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER DCOL, DROW, ICTXT, INFO, K, LDU, N, NB, NPCOL DOUBLE PRECISION RHO * .. * .. Array Arguments .. INTEGER CTOT( 0: NPCOL-1, 4 ), INDCOL( * ), $ INDROW( * ), INDX( * ), INDXC( * ), INDXR( * ) DOUBLE PRECISION BUF( * ), D( * ), DLAMDA( * ), U( LDU, * ), $ W( * ), Z( * ) * .. * * Purpose * ======= * * PDLAED3 finds the roots of the secular equation, as defined by the * values in D, W, and RHO, between 1 and K. It makes the * appropriate calls to SLAED4 * * This code makes very mild assumptions about floating point * arithmetic. It will work on machines with a guard digit in * add/subtract, or on those binary machines without guard digits * which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. * It could conceivably fail on hexadecimal or decimal machines * without guard digits, but we know of none. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation on the matrix. The context itself is global. * * K (output) INTEGER * The number of non-deflated eigenvalues, and the order of the * related secular equation. 0 <= K <=N. * * N (input) INTEGER * The dimension of the symmetric tridiagonal matrix. N >= 0. * * NB (global input) INTEGER * The blocking factor used to distribute the columns of the * matrix. NB >= 1. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, D contains the eigenvalues of the two submatrices to * be combined. * On exit, D contains the trailing (N-K) updated eigenvalues * (those which were deflated) sorted into increasing order. * * DROW (global input) INTEGER * The process row over which the first row of the matrix D is * distributed. 0 <= DROW < NPROW. * * DCOL (global input) INTEGER * The process column over which the first column of the * matrix D is distributed. 0 <= DCOL < NPCOL. * * Q (input/output) DOUBLE PRECISION array, dimension (LDQ, N) * On entry, Q contains the eigenvectors of two submatrices in * the two square blocks with corners at (1,1), (N1,N1) * and (N1+1, N1+1), (N,N). * On exit, Q contains the trailing (N-K) updated eigenvectors * (those which were deflated) in its last N-K columns. * * LDQ (input) INTEGER * The leading dimension of the array Q. LDQ >= max(1,NQ). * * RHO (global input/output) DOUBLE PRECISION * On entry, the off-diagonal element associated with the rank-1 * cut which originally split the two submatrices which are now * being recombined. * On exit, RHO has been modified to the value required by * PDLAED3. * * DLAMDA (global output) DOUBLE PRECISION array, dimension (N) * A copy of the first K eigenvalues which will be used by * SLAED3 to form the secular equation. * * W (global output) DOUBLE PRECISION array, dimension (N) * The first k values of the final deflation-altered z-vector * which will be passed to SLAED3. * * Z (global input) DOUBLE PRECISION array, dimension (N) * On entry, Z contains the updating vector (the last * row of the first sub-eigenvector matrix and the first row of * the second sub-eigenvector matrix). * On exit, the contents of Z have been destroyed by the updating * process. * * U (global output) DOUBLE PRECISION array * global dimension (N, N), local dimension (LDU, NQ). * Q contains the orthonormal eigenvectors of the symmetric * tridiagonal matrix. * * LDU (input) INTEGER * The leading dimension of the array U. * * QBUF (workspace) DOUBLE PRECISION array, dimension 3*N * * * INDX (workspace) INTEGER array, dimension (N) * The permutation used to sort the contents of DLAMDA into * ascending order. * * INDCOL (workspace) INTEGER array, dimension (N) * * * INDROW (workspace) INTEGER array, dimension (N) * * * INDXR (workspace) INTEGER array, dimension (N) * * * INDXC (workspace) INTEGER array, dimension (N) * * CTOT (workspace) INTEGER array, dimension( NPCOL, 4) * * NPCOL (global input) INTEGER * The total number of columns over which the distributed * submatrix is distributed. * * INFO (output) INTEGER * = 0: successful exit. * < 0: if INFO = -i, the i-th argument had an illegal value. * > 0: The algorithm failed to compute the ith eigenvalue. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER COL, GI, I, IINFO, IIU, IPD, IU, J, JJU, JU, $ KK, KL, KLC, KLR, MYCOL, MYKL, MYKLR, MYROW, $ NPROW, PDC, PDR, ROW DOUBLE PRECISION AUX, TEMP * .. * .. External Functions .. INTEGER INDXG2L DOUBLE PRECISION DLAMC3, DNRM2 EXTERNAL INDXG2L, DLAMC3, DNRM2 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D, DLAED4 * .. * .. Intrinsic Functions .. INTRINSIC MOD, SIGN, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 * * Quick return if possible * IF( K.EQ.0 ) $ RETURN * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * ROW = DROW COL = DCOL DO 20 I = 1, N, NB DO 10 J = 0, NB - 1 IF( I+J.LE.N ) THEN INDROW( I+J ) = ROW INDCOL( I+J ) = COL END IF 10 CONTINUE ROW = MOD( ROW+1, NPROW ) COL = MOD( COL+1, NPCOL ) 20 CONTINUE * MYKL = CTOT( MYCOL, 1 ) + CTOT( MYCOL, 2 ) + CTOT( MYCOL, 3 ) KLR = MYKL / NPROW IF( MYROW.EQ.DROW ) THEN MYKLR = KLR + MOD( MYKL, NPROW ) ELSE MYKLR = KLR END IF PDC = 1 COL = DCOL 30 CONTINUE IF( MYCOL.NE.COL ) THEN PDC = PDC + CTOT( COL, 1 ) + CTOT( COL, 2 ) + CTOT( COL, 3 ) COL = MOD( COL+1, NPCOL ) GO TO 30 END IF PDR = PDC KL = KLR + MOD( MYKL, NPROW ) ROW = DROW 40 CONTINUE IF( MYROW.NE.ROW ) THEN PDR = PDR + KL KL = KLR ROW = MOD( ROW+1, NPROW ) GO TO 40 END IF * DO 50 I = 1, K DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) Z( I ) = ONE 50 CONTINUE IF( MYKLR.GT.0 ) THEN KK = PDR DO 80 I = 1, MYKLR CALL DLAED4( K, KK, DLAMDA, W, BUF, RHO, BUF( K+I ), IINFO ) IF( IINFO.NE.0 ) THEN INFO = KK END IF * * ..Compute part of z * DO 60 J = 1, KK - 1 Z( J ) = Z( J )*( BUF( J ) / $ ( DLAMDA( J )-DLAMDA( KK ) ) ) 60 CONTINUE Z( KK ) = Z( KK )*BUF( KK ) DO 70 J = KK + 1, K Z( J ) = Z( J )*( BUF( J ) / $ ( DLAMDA( J )-DLAMDA( KK ) ) ) 70 CONTINUE KK = KK + 1 80 CONTINUE * IF( MYROW.NE.DROW ) THEN CALL DCOPY( K, Z, 1, BUF, 1 ) CALL DGESD2D( ICTXT, K+MYKLR, 1, BUF, K+MYKLR, DROW, MYCOL ) ELSE IPD = 2*K + 1 CALL DCOPY( MYKLR, BUF( K+1 ), 1, BUF( IPD ), 1 ) IF( KLR.GT.0 ) THEN IPD = MYKLR + IPD ROW = MOD( DROW+1, NPROW ) DO 100 I = 1, NPROW - 1 CALL DGERV2D( ICTXT, K+KLR, 1, BUF, K+KLR, ROW, $ MYCOL ) CALL DCOPY( KLR, BUF( K+1 ), 1, BUF( IPD ), 1 ) DO 90 J = 1, K Z( J ) = Z( J )*BUF( J ) 90 CONTINUE IPD = IPD + KLR ROW = MOD( ROW+1, NPROW ) 100 CONTINUE END IF END IF END IF * IF( MYROW.EQ.DROW ) THEN IF( MYCOL.NE.DCOL .AND. MYKL.NE.0 ) THEN CALL DCOPY( K, Z, 1, BUF, 1 ) CALL DCOPY( MYKL, BUF( 2*K+1 ), 1, BUF( K+1 ), 1 ) CALL DGESD2D( ICTXT, K+MYKL, 1, BUF, K+MYKL, MYROW, DCOL ) ELSE IF( MYCOL.EQ.DCOL ) THEN IPD = 2*K + 1 COL = DCOL KL = MYKL DO 120 I = 1, NPCOL - 1 IPD = IPD + KL COL = MOD( COL+1, NPCOL ) KL = CTOT( COL, 1 ) + CTOT( COL, 2 ) + CTOT( COL, 3 ) IF( KL.NE.0 ) THEN CALL DGERV2D( ICTXT, K+KL, 1, BUF, K+KL, MYROW, COL ) CALL DCOPY( KL, BUF( K+1 ), 1, BUF( IPD ), 1 ) DO 110 J = 1, K Z( J ) = Z( J )*BUF( J ) 110 CONTINUE END IF 120 CONTINUE DO 130 I = 1, K Z( I ) = SIGN( SQRT( -Z( I ) ), W( I ) ) 130 CONTINUE * END IF END IF * * Diffusion * IF( MYROW.EQ.DROW .AND. MYCOL.EQ.DCOL ) THEN CALL DCOPY( K, Z, 1, BUF, 1 ) CALL DCOPY( K, BUF( 2*K+1 ), 1, BUF( K+1 ), 1 ) CALL DGEBS2D( ICTXT, 'All', ' ', 2*K, 1, BUF, 2*K ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 2*K, 1, BUF, 2*K, DROW, DCOL ) CALL DCOPY( K, BUF, 1, Z, 1 ) END IF * * Copy of D at the good place * KLC = 0 KLR = 0 DO 140 I = 1, K GI = INDX( I ) D( GI ) = BUF( K+I ) COL = INDCOL( GI ) ROW = INDROW( GI ) IF( COL.EQ.MYCOL ) THEN KLC = KLC + 1 INDXC( KLC ) = I END IF IF( ROW.EQ.MYROW ) THEN KLR = KLR + 1 INDXR( KLR ) = I END IF 140 CONTINUE * * Compute eigenvectors of the modified rank-1 modification. * IF( MYKL.NE.0 ) THEN DO 180 J = 1, MYKL KK = INDXC( J ) JU = INDX( KK ) JJU = INDXG2L( JU, NB, J, J, NPCOL ) CALL DLAED4( K, KK, DLAMDA, W, BUF, RHO, AUX, IINFO ) IF( IINFO.NE.0 ) THEN INFO = KK END IF IF( K.EQ.1 .OR. K.EQ.2 ) THEN DO 150 I = 1, KLR KK = INDXR( I ) IU = INDX( KK ) IIU = INDXG2L( IU, NB, J, J, NPROW ) U( IIU, JJU ) = BUF( KK ) 150 CONTINUE GO TO 180 END IF * DO 160 I = 1, K BUF( I ) = Z( I ) / BUF( I ) 160 CONTINUE TEMP = DNRM2( K, BUF, 1 ) DO 170 I = 1, KLR KK = INDXR( I ) IU = INDX( KK ) IIU = INDXG2L( IU, NB, J, J, NPROW ) U( IIU, JJU ) = BUF( KK ) / TEMP 170 CONTINUE * 180 CONTINUE END IF * 190 CONTINUE * RETURN * * End of PDLAED3 * END scalapack-2.0.2/SRC/pdlaedz.f000644 000766 000024 00000012250 10363532303 016130 0ustar00juliestaff000000 000000 SUBROUTINE PDLAEDZ( N, N1, ID, Q, IQ, JQ, LDQ, DESCQ, Z, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER ID, IQ, JQ, LDQ, N, N1 * .. * .. Array Arguments .. INTEGER DESCQ( * ) DOUBLE PRECISION Q( LDQ, * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDLAEDZ Form the z-vector which consists of the last row of Q_1 * and the first row of Q_2. * ===================================================================== * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. * INTEGER COL, I, IBUF, ICTXT, IIQ, IIZ1, IIZ2, IQCOL, $ IQROW, IZ, IZ1, IZ1COL, IZ1ROW, IZ2, IZ2COL, $ IZ2ROW, J, JJQ, JJZ1, JJZ2, MYCOL, MYROW, N2, $ NB, NBLOC, NPCOL, NPROW, NQ1, NQ2, ZSIZ * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D, INFOG2L * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * ICTXT = DESCQ( CTXT_ ) NB = DESCQ( NB_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( ID, ID, DESCQ, NPROW, NPCOL, MYROW, MYCOL, IIQ, JJQ, $ IQROW, IQCOL ) N2 = N - N1 * * Form z1 which consist of the last row of Q1 * CALL INFOG2L( IQ-1+( ID+N1-1 ), JQ-1+ID, DESCQ, NPROW, NPCOL, $ MYROW, MYCOL, IIZ1, JJZ1, IZ1ROW, IZ1COL ) NQ1 = NUMROC( N1, NB, MYCOL, IZ1COL, NPCOL ) IF( ( MYROW.EQ.IZ1ROW ) .AND. ( NQ1.NE.0 ) ) THEN CALL DCOPY( NQ1, Q( IIZ1, JJZ1 ), LDQ, WORK, 1 ) IF( MYROW.NE.IQROW .OR. MYCOL.NE.IQCOL ) $ CALL DGESD2D( ICTXT, NQ1, 1, WORK, NQ1, IQROW, IQCOL ) END IF * * Proc (IQROW, IQCOL) receive the parts of z1 * IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN COL = IZ1COL DO 20 I = 0, NPCOL - 1 NQ1 = NUMROC( N1, NB, COL, IZ1COL, NPCOL ) IF( NQ1.GT.0 ) THEN IF( IZ1ROW.NE.IQROW .OR. COL.NE.IQCOL ) THEN IBUF = N1 + 1 CALL DGERV2D( ICTXT, NQ1, 1, WORK( IBUF ), NQ1, $ IZ1ROW, COL ) ELSE IBUF = 1 END IF IZ1 = 0 IZ = I*NB + 1 NBLOC = ( NQ1-1 ) / NB + 1 DO 10 J = 1, NBLOC ZSIZ = MIN( NB, NQ1-IZ1 ) CALL DCOPY( ZSIZ, WORK( IBUF+IZ1 ), 1, Z( IZ ), 1 ) IZ1 = IZ1 + NB IZ = IZ + NB*NPCOL 10 CONTINUE END IF COL = MOD( COL+1, NPCOL ) 20 CONTINUE END IF * * Form z2 which consist of the first row of Q2 * CALL INFOG2L( IQ-1+( ID+N1 ), JQ-1+( ID+N1 ), DESCQ, NPROW, NPCOL, $ MYROW, MYCOL, IIZ2, JJZ2, IZ2ROW, IZ2COL ) NQ2 = NUMROC( N2, NB, MYCOL, IZ2COL, NPCOL ) IF( ( MYROW.EQ.IZ2ROW ) .AND. ( NQ2.NE.0 ) ) THEN CALL DCOPY( NQ2, Q( IIZ2, JJZ2 ), LDQ, WORK, 1 ) IF( MYROW.NE.IQROW .OR. MYCOL.NE.IQCOL ) $ CALL DGESD2D( ICTXT, NQ2, 1, WORK, NQ2, IQROW, IQCOL ) END IF * * Proc (IQROW, IQCOL) receive the parts of z2 * IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN COL = IZ2COL DO 40 I = 0, NPCOL - 1 NQ2 = NUMROC( N2, NB, COL, IZ2COL, NPCOL ) IF( NQ2.GT.0 ) THEN IF( IQROW.NE.IZ2ROW .OR. IQCOL.NE.COL ) THEN IBUF = 1 + N2 CALL DGERV2D( ICTXT, NQ2, 1, WORK( IBUF ), NQ2, $ IZ2ROW, COL ) ELSE IBUF = 1 END IF IZ2 = 0 IZ = NB*I + N1 + 1 NBLOC = ( NQ2-1 ) / NB + 1 DO 30 J = 1, NBLOC ZSIZ = MIN( NB, NQ2-IZ2 ) CALL DCOPY( ZSIZ, WORK( IBUF+IZ2 ), 1, Z( IZ ), 1 ) IZ2 = IZ2 + NB IZ = IZ + NB*NPCOL 30 CONTINUE END IF COL = MOD( COL+1, NPCOL ) 40 CONTINUE END IF * * proc(IQROW,IQCOL) broadcast Z=(Z1,Z2) * IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', N, 1, Z, N ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', N, 1, Z, N, IQROW, IQCOL ) END IF * RETURN * * End of PDLAEDZ * END scalapack-2.0.2/SRC/pdlaevswp.f000644 000766 000024 00000025706 10363532303 016524 0ustar00juliestaff000000 000000 * * SUBROUTINE PDLAEVSWP( N, ZIN, LDZI, Z, IZ, JZ, DESCZ, NVS, KEY, $ WORK, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 15, 1997 * * .. Scalar Arguments .. INTEGER IZ, JZ, LDZI, LWORK, N * .. * .. Array Arguments .. INTEGER DESCZ( * ), KEY( * ), NVS( * ) DOUBLE PRECISION WORK( * ), Z( * ), ZIN( LDZI, * ) * .. * * Purpose * ======= * * PDLAEVSWP moves the eigenvectors (potentially unsorted) from * where they are computed, to a ScaLAPACK standard block cyclic * array, sorted so that the corresponding eigenvalues are sorted. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * N (global input) INTEGER * The order of the matrix A. N >= 0. * * ZIN (local input) DOUBLE PRECISION array, * dimension ( LDZI, NVS(iam) ) * The eigenvectors on input. Each eigenvector resides entirely * in one process. Each process holds a contiguous set of * NVS(iam) eigenvectors. The first eigenvector which the * process holds is: sum for i=[0,iam-1) of NVS(i) * * LDZI (locl input) INTEGER * leading dimension of the ZIN array * * Z (local output) DOUBLE PRECISION array * global dimension (N, N), local dimension (DESCZ(DLEN_), NQ) * The eigenvectors on output. The eigenvectors are distributed * in a block cyclic manner in both dimensions, with a * block size of NB. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * NVS (global input) INTEGER array, dimension( nprocs+1 ) * nvs(i) = number of processes * number of eigenvectors held by processes [0,i-1) * nvs(1) = number of eigen vectors held by [0,1-1) == 0 * nvs(nprocs+1) = number of eigen vectors held by [0,nprocs) == * total number of eigenvectors * * KEY (global input) INTEGER array, dimension( N ) * Indicates the actual index (after sorting) for each of the * eigenvectors. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * * LWORK (local input) INTEGER dimension of WORK * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CYCLIC_I, CYCLIC_J, DIST, I, IAM, II, INCII, J, $ MAXI, MAXII, MINI, MINII, MYCOL, MYROW, NB, $ NBUFSIZE, NPCOL, NPROCS, NPROW, PCOL, RECVCOL, $ RECVFROM, RECVROW, SENDCOL, SENDROW, SENDTO * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P EXTERNAL INDXG2L, INDXG2P * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGERV2D, DGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW + MYCOL*NPROW IAM = MYROW*NPCOL + MYCOL * NB = DESCZ( MB_ ) * NPROCS = NPROW*NPCOL * * If PxSTEIN operates on a sub-matrix of a global matrix, the * key [] that contains the indicies of the eigenvectors is refe- * renced to the dimensions of the sub-matrix and not the global * distrubited matrix. Because of this, PxLAEVSWP will incorrectly * map the eigenvectors to the global eigenvector matrix, Z, unless * the key[] elements are shifted as below. * DO 10 J = DESCZ( N_ ), 1, -1 KEY( J ) = KEY( J-JZ+1 ) + ( JZ-1 ) 10 CONTINUE * DO 110 DIST = 0, NPROCS - 1 * SENDTO = MOD( IAM+DIST, NPROCS ) RECVFROM = MOD( NPROCS+IAM-DIST, NPROCS ) * SENDROW = MOD( SENDTO, NPROW ) SENDCOL = SENDTO / NPROW RECVROW = MOD( RECVFROM, NPROW ) RECVCOL = RECVFROM / NPROW * SENDROW = SENDTO / NPCOL SENDCOL = MOD( SENDTO, NPCOL ) RECVROW = RECVFROM / NPCOL RECVCOL = MOD( RECVFROM, NPCOL ) * * Figure out what I have that process "sendto" wants * NBUFSIZE = 0 * * We are looping through the eigenvectors that I presently own. * DO 40 J = NVS( 1+IAM ) + JZ, NVS( 1+IAM+1 ) + JZ - 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( SENDCOL.EQ.PCOL ) THEN MINII = MOD( SENDROW+DESCZ( RSRC_ ), NPROW )* $ DESCZ( MB_ ) + 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 30 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) MAXI = MIN( II+DESCZ( MB_ )-1, N+IZ-1 ) DO 20 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 WORK( NBUFSIZE ) = ZIN( I+1-IZ, $ J-NVS( 1+IAM )+1-JZ ) 20 CONTINUE 30 CONTINUE END IF 40 CONTINUE * * IF( MYROW.NE.SENDROW .OR. MYCOL.NE.SENDCOL ) $ CALL DGESD2D( DESCZ( CTXT_ ), NBUFSIZE, 1, WORK, NBUFSIZE, $ SENDROW, SENDCOL ) * * * Figure out what process "recvfrom" has that I want * NBUFSIZE = 0 DO 70 J = NVS( 1+RECVFROM ) + JZ, $ NVS( 1+RECVFROM+1 ) + JZ - 1, 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.PCOL ) THEN MINII = MOD( MYROW+DESCZ( RSRC_ ), NPROW )*DESCZ( MB_ ) + $ 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 60 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) MAXI = MIN( II+NB-1, N+IZ-1 ) DO 50 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 50 CONTINUE 60 CONTINUE END IF 70 CONTINUE * * * IF( MYROW.NE.RECVROW .OR. MYCOL.NE.RECVCOL ) $ CALL DGERV2D( DESCZ( CTXT_ ), 1, NBUFSIZE, WORK, 1, RECVROW, $ RECVCOL ) * NBUFSIZE = 0 DO 100 J = NVS( 1+RECVFROM ) + JZ, $ NVS( 1+RECVFROM+1 ) + JZ - 1, 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.PCOL ) THEN CYCLIC_J = INDXG2L( KEY( J ), DESCZ( MB_ ), -1, -1, $ NPCOL ) CYCLIC_I = 1 MINII = MOD( MYROW+DESCZ( RSRC_ ), NPROW )*DESCZ( MB_ ) + $ 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 90 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) CYCLIC_I = INDXG2L( MINI, DESCZ( MB_ ), -1, -1, $ NPROW ) MAXI = MIN( II+NB-1, N+IZ-1 ) DO 80 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 Z( CYCLIC_I+( CYCLIC_J-1 )*DESCZ( LLD_ ) ) $ = WORK( NBUFSIZE ) CYCLIC_I = CYCLIC_I + 1 80 CONTINUE 90 CONTINUE END IF 100 CONTINUE * 110 CONTINUE RETURN * * End of PDLAEVSWP * END scalapack-2.0.2/SRC/pdlahqr.f000644 000766 000024 00000262075 11750130340 016150 0ustar00juliestaff000000 000000 SUBROUTINE PDLAHQR( WANTT, WANTZ, N, ILO, IHI, A, DESCA, WR, WI, $ ILOZ, IHIZ, Z, DESCZ, WORK, LWORK, IWORK, $ ILWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, ILWORK, INFO, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) DOUBLE PRECISION A( * ), WI( * ), WORK( * ), WR( * ), Z( * ) * .. * * Purpose * ======= * * PDLAHQR is an auxiliary routine used to find the Schur decomposition * and or eigenvalues of a matrix already in Hessenberg form from * cols ILO to IHI. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix A (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that A is already upper quasi-triangular in * rows and columns IHI+1:N, and that A(ILO,ILO-1) = 0 (unless * ILO = 1). PDLAHQR works primarily with the Hessenberg * submatrix in rows and columns ILO to IHI, but applies * transformations to all of H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * A (global input/output) DOUBLE PRECISION array, dimension * (DESCA(LLD_),*) * On entry, the upper Hessenberg matrix A. * On exit, if WANTT is .TRUE., A is upper quasi-triangular in * rows and columns ILO:IHI, with any 2-by-2 or larger diagonal * blocks not yet in standard form. If WANTT is .FALSE., the * contents of A are unspecified on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WR (global replicated output) DOUBLE PRECISION array, * dimension (N) * WI (global replicated output) DOUBLE PRECISION array, * dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in A. A may be returned with * larger diagonal blocks until the next release. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (global input/output) DOUBLE PRECISION array. * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations accumulated by PDHSEQR, and on * exit Z has been updated; transformations are applied only to * the submatrix Z(ILOZ:IHIZ,ILO:IHI). * If WANTZ is .FALSE., Z is not referenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local output) DOUBLE PRECISION array of size LWORK * * LWORK (local input) INTEGER * WORK(LWORK) is a local array and LWORK is assumed big enough * so that LWORK >= 3*N + * MAX( 2*MAX(DESCZ(LLD_),DESCA(LLD_)) + 2*LOCc(N), * 7*Ceil(N/HBL)/LCM(NPROW,NPCOL)) ) * * IWORK (global and local input) INTEGER array of size ILWORK * * ILWORK (local input) INTEGER * This holds the some of the IBLK integer arrays. This is held * as a place holder for the next release. * * INFO (global output) INTEGER * < 0: parameter number -INFO incorrect or inconsistent * = 0: successful exit * > 0: PDLAHQR failed to compute all the eigenvalues ILO to IHI * in a total of 30*(IHI-ILO+1) iterations; if INFO = i, * elements i+1:ihi of WR and WI contain those eigenvalues * which have been successfully computed. * * Logic: * This algorithm is very similar to _LAHQR. Unlike _LAHQR, * instead of sending one double shift through the largest * unreduced submatrix, this algorithm sends multiple double shifts * and spaces them apart so that there can be parallelism across * several processor row/columns. Another critical difference is * that this algorithm aggregrates multiple transforms together in * order to apply them in a block fashion. * * Important Local Variables: * IBLK = The maximum number of bulges that can be computed. * Currently fixed. Future releases this won't be fixed. * HBL = The square block size (HBL=DESCA(MB_)=DESCA(NB_)) * ROTN = The number of transforms to block together * NBULGE = The number of bulges that will be attempted on the * current submatrix. * IBULGE = The current number of bulges started. * K1(*),K2(*) = The current bulge loops from K1(*) to K2(*). * * Subroutines: * This routine calls: * PDLACONSB -> To determine where to start each iteration * PDLAWIL -> Given the shift, get the transformation * DLASORTE -> Pair up eigenvalues so that reals are paired. * PDLACP3 -> Parallel array to local replicated array copy & * back. * DLAREF -> Row/column reflector applier. Core routine * here. * PDLASMSUB -> Finds negligible subdiagonal elements. * * Current Notes and/or Restrictions: * 1.) This code requires the distributed block size to be square * and at least six (6); unlike simpler codes like LU, this * algorithm is extremely sensitive to block size. Unwise * choices of too small a block size can lead to bad * performance. * 2.) This code requires A and Z to be distributed identically * and have identical contxts. * 3.) This release currently does not have a routine for * resolving the Schur blocks into regular 2x2 form after * this code is completed. Because of this, a significant * performance impact is required while the deflation is done * by sometimes a single column of processors. * 4.) This code does not currently block the initial transforms * so that none of the rows or columns for any bulge are * completed until all are started. To offset pipeline * start-up it is recommended that at least 2*LCM(NPROW,NPCOL) * bulges are used (if possible) * 5.) The maximum number of bulges currently supported is fixed at * 32. In future versions this will be limited only by the * incoming WORK array. * 6.) The matrix A must be in upper Hessenberg form. If elements * below the subdiagonal are nonzero, the resulting transforms * may be nonsimilar. This is also true with the LAPACK * routine. * 7.) For this release, it is assumed RSRC_=CSRC_=0 * 8.) Currently, all the eigenvalues are distributed to all the * nodes. Future releases will probably distribute the * eigenvalues by the column partitioning. * 9.) The internals of this routine are subject to change. * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE, HALF PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, HALF = 0.5D+0 ) DOUBLE PRECISION CONST PARAMETER ( CONST = 1.50D+0 ) INTEGER IBLK PARAMETER ( IBLK = 32 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, I, I1, I2, IAFIRST, IBULGE, $ ICBUF, ICOL, ICOL1, ICOL2, IDIA, IERR, II, $ IRBUF, IROW, IROW1, IROW2, ISPEC, ISTART, $ ISTARTCOL, ISTARTROW, ISTOP, ISUB, ISUP, $ ITERMAX, ITMP1, ITMP2, ITN, ITS, J, JAFIRST, $ JBLK, JJ, K, KI, L, LCMRC, LDA, LDZ, LEFT, $ LIHIH, LIHIZ, LILOH, LILOZ, LOCALI1, LOCALI2, $ LOCALK, LOCALM, M, MODKM1, MYCOL, MYROW, $ NBULGE, NH, NODE, NPCOL, NPROW, NR, NUM, NZ, $ RIGHT, ROTN, UP, VECSIDX DOUBLE PRECISION AVE, DISC, H00, H10, H11, H12, H21, H22, H33, $ H43H34, H44, OVFL, S, SMLNUM, SUM, T1, T1COPY, $ T2, T3, ULP, UNFL, V1SAVE, V2, V2SAVE, V3, $ V3SAVE, CS, SN * .. * .. Local Arrays .. INTEGER ICURCOL( IBLK ), ICURROW( IBLK ), K1( IBLK ), $ K2( IBLK ), KCOL( IBLK ), KP2COL( IBLK ), $ KP2ROW( IBLK ), KROW( IBLK ), LOCALK2( IBLK ) DOUBLE PRECISION S1( 2*IBLK, 2*IBLK ), SMALLA( 6, 6, IBLK ), $ VCOPY( 3 ) * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D, DGSUM2D, DLAHQR, DLAREF, $ DLARFG, DLASORTE, IGAMN2D, INFOG1L, INFOG2L, $ PDLABAD, PDLACONSB, PDLACP3, PDLASMSUB, $ PDLAWIL, PXERBLA, DLANV2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SIGN, SQRT * .. * .. Executable Statements .. * INFO = 0 * ITERMAX = 30*( IHI-ILO+1 ) * ITERMAX = 0 IF( N.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL NUM = NPROW*NPCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) LCMRC = ILCM( NPROW, NPCOL ) * * Determine the number of columns we have so we can check workspace * LOCALK = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) JJ = N / HBL IF( JJ*HBL.LT.N ) $ JJ = JJ + 1 JJ = 7*JJ / LCMRC IF( LWORK.LT.3*N+MAX( 2*MAX( LDA, LDZ )+2*LOCALK, JJ ) ) THEN INFO = -15 END IF IF( DESCZ( CTXT_ ).NE.DESCA( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) END IF IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1300+NB_ ) END IF IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1300+MB_ ) END IF IF( ( DESCA( RSRC_ ).NE.0 ) .OR. ( DESCA( CSRC_ ).NE.0 ) ) THEN INFO = -( 700+RSRC_ ) END IF IF( ( DESCZ( RSRC_ ).NE.0 ) .OR. ( DESCZ( CSRC_ ).NE.0 ) ) THEN INFO = -( 1300+RSRC_ ) END IF IF( ( ILO.GT.N ) .OR. ( ILO.LT.1 ) ) THEN INFO = -4 END IF IF( ( IHI.GT.N ) .OR. ( IHI.LT.1 ) ) THEN INFO = -5 END IF IF( HBL.LT.5 ) THEN INFO = -( 700+MB_ ) END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PDLAHQR', -INFO ) RETURN END IF * * Set work array indices * VECSIDX = 0 IDIA = 3*N ISUB = 3*N ISUP = 3*N IRBUF = 3*N ICBUF = 3*N * * Find a value for ROTN * ROTN = HBL / 3 ROTN = MAX( ROTN, HBL-2 ) ROTN = MIN( ROTN, 1 ) * IF( ILO.EQ.IHI ) THEN CALL INFOG2L( ILO, ILO, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN WR( ILO ) = A( ( ICOL-1 )*LDA+IROW ) ELSE WR( ILO ) = ZERO END IF WI( ILO ) = ZERO RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * CALL INFOG1L( ILOZ, HBL, NPROW, MYROW, 0, LILOZ, LIHIZ ) LIHIZ = NUMROC( IHIZ, HBL, MYROW, 0, NPROW ) * * Set machine-dependent constants for the stopping criterion. * If NORM(H) <= SQRT(OVFL), overflow should not occur. * UNFL = PDLAMCH( CONTXT, 'SAFE MINIMUM' ) OVFL = ONE / UNFL CALL PDLABAD( CONTXT, UNFL, OVFL ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = ITERMAX * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of our schur block size (<=2*IBLK). Each * iteration of the loop works with the active submatrix in rows * and columns L to I. Eigenvalues I+1 to IHI have already * converged. Either L = ILO or the global A(L,L-1) is negligible * so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 450 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 420 ITS = 0, ITN * * Look for a single small subdiagonal element. * CALL PDLASMSUB( A, DESCA, I, L, K, SMLNUM, WORK( IRBUF+1 ), $ LWORK-IRBUF ) L = K * IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * CALL INFOG2L( L, L-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN A( ( ICOL-1 )*LDA+IROW ) = ZERO END IF WORK( ISUB+L-1 ) = ZERO END IF * * Exit from loop if a submatrix of order 1 or 2 has split off. * M = L - 10 * IF ( L .GE. I - (2*IBLK-1) ) * IF ( L .GE. I - MAX(2*IBLK-1,HBL) ) IF( L.GE.I-1 ) $ GO TO 430 * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * * Copy submatrix of size 2*JBLK and prepare to do generalized * Wilkinson shift or an exceptional shift * JBLK = MIN( IBLK, ( ( I-L+1 ) / 2 )-1 ) IF( JBLK.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * JBLK = JBLK - MOD( JBLK, LCMRC ) END IF JBLK = MIN( JBLK, 2*LCMRC ) JBLK = MAX( JBLK, 1 ) * CALL PDLACP3( 2*JBLK, I-2*JBLK+1, A, DESCA, S1, 2*IBLK, -1, -1, $ 0 ) IF( ITS.EQ.20 .OR. ITS.EQ.40 ) THEN * * Exceptional shift. * DO 20 II = 2*JBLK, 2, -1 S1( II, II ) = CONST*( ABS( S1( II, II ) )+ $ ABS( S1( II, II-1 ) ) ) S1( II, II-1 ) = ZERO S1( II-1, II ) = ZERO 20 CONTINUE S1( 1, 1 ) = CONST*ABS( S1( 1, 1 ) ) ELSE CALL DLAHQR( .FALSE., .FALSE., 2*JBLK, 1, 2*JBLK, S1, $ 2*IBLK, WORK( IRBUF+1 ), WORK( ICBUF+1 ), 1, $ 2*JBLK, Z, LDZ, IERR ) * * Prepare to use Wilkinson's double shift * H44 = S1( 2*JBLK, 2*JBLK ) H33 = S1( 2*JBLK-1, 2*JBLK-1 ) H43H34 = S1( 2*JBLK-1, 2*JBLK )*S1( 2*JBLK, 2*JBLK-1 ) IF( ( JBLK.GT.1 ) .AND. ( ITS.GT.30 ) ) THEN S = S1( 2*JBLK-1, 2*JBLK-2 ) DISC = ( H33-H44 )*HALF DISC = DISC*DISC + H43H34 IF( DISC.GT.ZERO ) THEN * * Real roots: Use Wilkinson's shift twice * DISC = SQRT( DISC ) AVE = HALF*( H33+H44 ) IF( ABS( H33 )-ABS( H44 ).GT.ZERO ) THEN H33 = H33*H44 - H43H34 H44 = H33 / ( SIGN( DISC, AVE )+AVE ) ELSE H44 = SIGN( DISC, AVE ) + AVE END IF H33 = H44 H43H34 = ZERO END IF END IF END IF * * Look for two consecutive small subdiagonal elements: * PDLACONSB is the routine that does this. * c CALL PDLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, c $ WORK( IRBUF+1 ), LWORK-IRBUF ) * * Skip small submatrices * * IF ( M .GE. I - 5 ) * $ GO TO 80 * * In principle PDLACONSB needs to check all shifts to decide * whether two consecutive small subdiagonal entries are suitable * as the starting position of the bulge chasing phase. It can be * dangerous to check the first pair of shifts only. Moreover it * is quite rare to obtain an M which is much larger than L. This * process is a bit expensive compared with the benefit. * Therefore it is sensible to abandon this routine. Total amount * of communications is saved in average. * M = L * Double-shift QR step * * NBULGE is the number of bulges that will be attempted * ISTOP = MIN( M+ROTN-MOD( M, ROTN ), I-2 ) ISTOP = MIN( ISTOP, M+HBL-3-MOD( M-1, HBL ) ) ISTOP = MIN( ISTOP, I2-2 ) ISTOP = MAX( ISTOP, M ) NBULGE = ( I-1-ISTOP ) / HBL * * Do not exceed maximum determined. * NBULGE = MIN( NBULGE, JBLK ) IF( NBULGE.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * NBULGE = NBULGE - MOD( NBULGE, LCMRC ) END IF NBULGE = MAX( NBULGE, 1 ) * IF( ( ITS.NE.20 ) .AND. ( ITS.NE.40 ) .AND. ( NBULGE.GT.1 ) ) $ THEN * * sort the eigenpairs so that they are in twos for double * shifts. only call if several need sorting * CALL DLASORTE( S1( 2*( JBLK-NBULGE )+1, $ 2*( JBLK-NBULGE )+1 ), 2*IBLK, 2*NBULGE, $ WORK( IRBUF+1 ), IERR ) END IF * * IBULGE is the number of bulges going so far * IBULGE = 1 * * "A" row defs : main row transforms from LOCALK to LOCALI2 * CALL INFOG1L( M, HBL, NPCOL, MYCOL, 0, ITMP1, LOCALK ) LOCALK = NUMROC( N, HBL, MYCOL, 0, NPCOL ) CALL INFOG1L( 1, HBL, NPCOL, MYCOL, 0, ICOL1, LOCALI2 ) LOCALI2 = NUMROC( I2, HBL, MYCOL, 0, NPCOL ) * * "A" col defs : main col transforms from LOCALI1 to LOCALM * CALL INFOG1L( I1, HBL, NPROW, MYROW, 0, LOCALI1, ICOL1 ) ICOL1 = NUMROC( N, HBL, MYROW, 0, NPROW ) CALL INFOG1L( 1, HBL, NPROW, MYROW, 0, LOCALM, ICOL1 ) ICOL1 = NUMROC( MIN( M+3, I ), HBL, MYROW, 0, NPROW ) * * Which row & column will start the bulges * ISTARTROW = MOD( ( M+1 ) / HBL, NPROW ) + IAFIRST ISTARTCOL = MOD( ( M+1 ) / HBL, NPCOL ) + JAFIRST * CALL INFOG1L( M, HBL, NPROW, MYROW, 0, II, ITMP2 ) ITMP2 = NUMROC( N, HBL, MYROW, 0, NPROW ) CALL INFOG1L( M, HBL, NPCOL, MYCOL, 0, JJ, ITMP2 ) ITMP2 = NUMROC( N, HBL, MYCOL, 0, NPCOL ) CALL INFOG1L( 1, HBL, NPROW, MYROW, 0, ISTOP, KP2ROW( 1 ) ) KP2ROW( 1 ) = NUMROC( M+2, HBL, MYROW, 0, NPROW ) CALL INFOG1L( 1, HBL, NPCOL, MYCOL, 0, ISTOP, KP2COL( 1 ) ) KP2COL( 1 ) = NUMROC( M+2, HBL, MYCOL, 0, NPCOL ) * * Set all values for bulges. All bulges are stored in * intermediate steps as loops over KI. Their current "task" * over the global M to I-1 values is always K1(KI) to K2(KI). * However, because there are many bulges, K1(KI) & K2(KI) might * go past that range while later bulges (KI+1,KI+2,etc..) are * finishing up. * * Rules: * If MOD(K1(KI)-1,HBL) < HBL-2 then MOD(K2(KI)-1,HBL)= 0. * * K (global input) INTEGER * The offset for the reduction. Elements below the k-th * subdiagonal in the first NB columns are reduced to zero. * * NB (global input) INTEGER * The number of columns to be reduced. * * A (local input/local output) DOUBLE PRECISION pointer into * the local memory to an array of dimension (LLD_A, * LOCc(JA+N-K)). On entry, this array contains the the local * pieces of the N-by-(N-K+1) general distributed matrix * A(IA:IA+N-1,JA:JA+N-K). On exit, the elements on and above * the k-th subdiagonal in the first NB columns are overwritten * with the corresponding elements of the reduced distributed * matrix; the elements below the k-th subdiagonal, with the * array TAU, represent the matrix Q as a product of elementary * reflectors. The other columns of A(IA:IA+N-1,JA:JA+N-K) are * unchanged. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). TAU is tied to the distributed matrix A. * * T (local output) DOUBLE PRECISION array, dimension (NB_A,NB_A) * The upper triangular matrix T. * * Y (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_Y,NB_A). On exit, this array * contains the local pieces of the N-by-NB distributed * matrix Y. LLD_Y >= LOCr(IA+N-1). * * IY (global input) INTEGER * The row index in the global array Y indicating the first * row of sub( Y ). * * JY (global input) INTEGER * The column index in the global array Y indicating the * first column of sub( Y ). * * DESCY (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Y. * * WORK (local workspace) DOUBLE PRECISION array, dimension (NB) * * Further Details * =============== * * The matrix Q is represented as a product of nb elementary reflectors * * Q = H(1) H(2) . . . H(nb). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in * A(ia+i+k:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The elements of the vectors v together form the (n-k+1)-by-nb matrix * V which is needed, with T and Y, to apply the transformation to the * unreduced part of the matrix, using an update of the form: * A(ia:ia+n-1,ja:ja+n-k) := (I-V*T*V')*(A(ia:ia+n-1,ja:ja+n-k)-Y*V'). * * The contents of A(ia:ia+n-1,ja:ja+n-k) on exit are illustrated by the * following example with n = 7, k = 3 and nb = 2: * * ( a h a a a ) * ( a h a a a ) * ( a h a a a ) * ( h h a a a ) * ( v1 h a a a ) * ( v1 v2 a a a ) * ( v1 v2 a a a ) * * where a denotes an element of the original matrix * A(ia:ia+n-1,ja:ja+n-k), h denotes a modified element of the upper * Hessenberg matrix H, and vi denotes an element of the vector * defining H(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL IPROC INTEGER I, IACOL, IAROW, ICTXT, IOFF, II, J, JJ, JL, $ JT, JW, L, MYROW, MYCOL, NPCOL, NPROW, NQ DOUBLE PRECISION EI * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DAXPY, DESCSET, DCOPY, $ DSCAL, DTRMV, INFOG2L, PDELSET, $ PDGEMV, PDLARFG, PDSCAL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.1 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA+K, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) * IPROC = ( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) NQ = NUMROC( N+JA-1, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - IOFF * EI = ZERO JW = IOFF + 1 CALL DESCSET( DESCW, 1, DESCA( MB_ ), 1, DESCA( MB_ ), IAROW, $ IACOL, ICTXT, 1 ) * DO 10 L = 1, NB I = IA + K + L - 2 J = JA + L - 1 * IF( L.GT.1 ) THEN * * Update A(ia:ia+n-1,j) * * Compute i-th column of A - Y * V' * CALL PDGEMV( 'No transpose', N, L-1, -ONE, Y, IY, JY, DESCY, $ A, I, JA, DESCA, DESCA( M_ ), ONE, A, IA, J, $ DESCA, 1 ) * * Apply I - V * T' * V' to this column (call it b) from the * left, using the last column of T as workspace * * Let V = ( V1 ) and b = ( b1 ) (first I-1 rows) * ( V2 ) ( b2 ) * * where V1 is unit lower triangular * * w := V1' * b1 * IF( IPROC ) THEN CALL DCOPY( L-1, A( (JJ+L-2)*DESCA( LLD_ )+II ), 1, $ WORK( JW ), 1 ) CALL DTRMV( 'Lower', 'Transpose', 'Unit', L-1, $ A( (JJ-1)*DESCA( LLD_ )+II ), DESCA( LLD_ ), $ WORK( JW ), 1 ) END IF * * w := w + V2'*b2 * CALL PDGEMV( 'Transpose', N-K-L+1, L-1, ONE, A, I+1, JA, $ DESCA, A, I+1, J, DESCA, 1, ONE, WORK, 1, JW, $ DESCW, DESCW( M_ ) ) * * w := T'*w * IF( IPROC ) $ CALL DTRMV( 'Upper', 'Transpose', 'Non-unit', L-1, T, $ DESCA( NB_ ), WORK( JW ), 1 ) * * b2 := b2 - V2*w * CALL PDGEMV( 'No transpose', N-K-L+1, L-1, -ONE, A, I+1, JA, $ DESCA, WORK, 1, JW, DESCW, DESCW( M_ ), ONE, $ A, I+1, J, DESCA, 1 ) * * b1 := b1 - V1*w * IF( IPROC ) THEN CALL DTRMV( 'Lower', 'No transpose', 'Unit', L-1, $ A( (JJ-1)*DESCA( LLD_ )+II ), DESCA( LLD_ ), $ WORK( JW ), 1 ) CALL DAXPY( L-1, -ONE, WORK( JW ), 1, $ A( ( JJ+L-2 )*DESCA( LLD_ )+II ), 1 ) END IF CALL PDELSET( A, I, J-1, DESCA, EI ) END IF * * Generate the elementary reflector H(i) to annihilate * A(ia+k+i:ia+n-1,j) * CALL PDLARFG( N-K-L+1, EI, I+1, J, A, MIN( I+2, N+IA-1 ), J, $ DESCA, 1, TAU ) CALL PDELSET( A, I+1, J, DESCA, ONE ) * * Compute Y(iy:y+n-1,jy+l-1) * CALL PDGEMV( 'No transpose', N, N-K-L+1, ONE, A, IA, J+1, $ DESCA, A, I+1, J, DESCA, 1, ZERO, Y, IY, JY+L-1, $ DESCY, 1 ) CALL PDGEMV( 'Transpose', N-K-L+1, L-1, ONE, A, I+1, JA, DESCA, $ A, I+1, J, DESCA, 1, ZERO, WORK, 1, JW, DESCW, $ DESCW( M_ ) ) CALL PDGEMV( 'No transpose', N, L-1, -ONE, Y, IY, JY, DESCY, $ WORK, 1, JW, DESCW, DESCW( M_ ), ONE, Y, IY, $ JY+L-1, DESCY, 1 ) JL = MIN( JJ+L-1, JA+NQ-1 ) CALL PDSCAL( N, TAU( JL ), Y, IY, JY+L-1, DESCY, 1 ) * * Compute T(1:i,i) * IF( IPROC ) THEN JT = ( L-1 ) * DESCA( NB_ ) CALL DSCAL( L-1, -TAU( JL ), WORK( JW ), 1 ) CALL DCOPY( L-1, WORK( JW ), 1, T( JT+1 ), 1 ) CALL DTRMV( 'Upper', 'No transpose', 'Non-unit', L-1, T, $ DESCA( NB_ ), T( JT+1 ), 1 ) T( JT+L ) = TAU( JL ) END IF 10 CONTINUE * CALL PDELSET( A, K+NB+IA-1, J, DESCA, EI ) * RETURN * * End of PDLAHRD * END scalapack-2.0.2/SRC/pdlaiect.c000644 000766 000024 00000021537 11735440737 016314 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- ScaLAPACK routine (version 1.5) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * --------------------------------------------------------------------- */ /* * Include Files */ #include "pxsyevx.h" #include #include #define proto(x) () void pdlasnbt_( int *ieflag ) { /* * * Purpose * ======= * * pdalsnbt finds the position of the signbit of a double * double precision floating point number. This routine assumes IEEE * arithmetic, and hence, tests only the 32nd and 64th bits as * possibilities for the sign bit. * * Note : For this release, we assume that sizeof(int) is 4 bytes. * * Note : If a compile time flag (NO_IEEE) indicates that the * machine does not have IEEE arithmetic, IEFLAG = 0 is returned. * * Arguments * ========= * * IEFLAG (output) INTEGER * This indicates the position of the signbit of any double * precision floating point number. * IEFLAG = 0 if the compile time flag, NO_IEEE, indicates * that the machine does not have IEEE arithmetic, or if * sizeof(int) is different from 4 bytes. * IEFLAG = 1 indicates that the sign bit is the 32nd * bit ( Big Endian ). * IEFLAG = 2 indicates that the sign bit is the 64th * bit ( Little Endian ). * * ===================================================================== * * .. Local Scalars .. */ double x; int negone=-1, errornum; unsigned int *ix; /* .. * .. Executable Statements .. */ #ifdef NO_IEEE *ieflag = 0; #else if(sizeof(int) != 4){ *ieflag = 0; return; } x = (double) -1.0; ix = (unsigned int *) &x; if(( *ix == 0xbff00000) && ( *(ix+1) == 0x0) ) { *ieflag = 1; } else if(( *(ix+1) == 0xbff00000) && ( *ix == 0x0) ) { *ieflag = 2; } else { *ieflag = 0; } #endif } void pdlaiectb_( double *sigma, int *n, double *d, int *count ) { /* * * Purpose * ======= * * pdlaiectb computes the number of negative eigenvalues of (A- SIGMA I). * This implementation of the Sturm Sequence loop exploits IEEE Arithmetic * and has no conditionals in the innermost loop. To extract the signbit, * this routine assumes that the double precision word is stored in * "Big Endian" word order, i.e, the signbit is assumed to be bit 32. * * Note that all arguments are call-by-reference so that this routine * can be directly called from Fortran code. * * This is a ScaLAPACK internal subroutine and arguments are not * checked for unreasonable values. * * Arguments * ========= * * SIGMA (input) DOUBLE PRECISION * The shift. pdlaiectb finds the number of eigenvalues * less than equal to SIGMA. * * N (input) INTEGER * The order of the tridiagonal matrix T. N >= 1. * * D (input) DOUBLE PRECISION array, dimension (2*N - 1) * Contains the diagonals and the squares of the off-diagonal * elements of the tridiagonal matrix T. These elements are * assumed to be interleaved in memory for better cache * performance. The diagonal entries of T are in the entries * D(1),D(3),...,D(2*N-1), while the squares of the off-diagonal * entries are D(2),D(4),...,D(2*N-2). To avoid overflow, the * matrix must be scaled so that its largest entry is no greater * than overflow**(1/2) * underflow**(1/4) in absolute value, * and for greatest accuracy, it should not be much smaller * than that. * * COUNT (output) INTEGER * The count of the number of eigenvalues of T less than or * equal to SIGMA. * * ===================================================================== * * .. Local Scalars .. */ double lsigma, tmp, *pd, *pe2; int i; /* .. * .. Executable Statements .. */ lsigma = *sigma; pd = d; pe2 = d+1; tmp = *pd - lsigma; pd += 2; *count = (*((int *)&tmp) >> 31) & 1; for(i = 1;i < *n;i++){ tmp = *pd - *pe2/tmp - lsigma; pd += 2; pe2 += 2; *count += ((*((int *)&tmp)) >> 31) & 1; } } void pdlaiectl_( double *sigma, int *n, double *d, int *count ) { /* * * Purpose * ======= * * pdlaiectl computes the number of negative eigenvalues of (A- SIGMA I). * This implementation of the Sturm Sequence loop exploits IEEE Arithmetic * and has no conditionals in the innermost loop. To extract the signbit, * this routine assumes that the double precision word is stored in * "Little Endian" word order, i.e, the signbit is assumed to be bit 64. * * Note that all arguments are call-by-reference so that this routine * can be directly called from Fortran code. * * This is a ScaLAPACK internal subroutine and arguments are not * checked for unreasonable values. * * Arguments * ========= * * SIGMA (input) DOUBLE PRECISION * The shift. pdlaiectl finds the number of eigenvalues * less than equal to SIGMA. * * N (input) INTEGER * The order of the tridiagonal matrix T. N >= 1. * * D (input) DOUBLE PRECISION array, dimension (2*N - 1) * Contains the diagonals and the squares of the off-diagonal * elements of the tridiagonal matrix T. These elements are * assumed to be interleaved in memory for better cache * performance. The diagonal entries of T are in the entries * D(1),D(3),...,D(2*N-1), while the squares of the off-diagonal * entries are D(2),D(4),...,D(2*N-2). To avoid overflow, the * matrix must be scaled so that its largest entry is no greater * than overflow**(1/2) * underflow**(1/4) in absolute value, * and for greatest accuracy, it should not be much smaller * than that. * * COUNT (output) INTEGER * The count of the number of eigenvalues of T less than or * equal to SIGMA. * * ===================================================================== * * .. Local Scalars .. */ double lsigma, tmp, *pd, *pe2; int i; /* .. * .. Executable Statements .. */ lsigma = *sigma; pd = d; pe2 = d+1; tmp = *pd - lsigma; pd += 2; *count = (*(((int *)&tmp)+1) >> 31) & 1; for(i = 1;i < *n;i++){ tmp = *pd - *pe2/tmp - lsigma; pd += 2; pe2 += 2; *count += (*(((int *)&tmp)+1) >> 31) & 1; } } void pdlachkieee_( int *isieee, double *rmax, double *rmin ) { /* * * Purpose * ======= * * pdlachkieee performs a simple check to make sure that the features * of the IEEE standard that we rely on are implemented. In some * implementations, pdlachkieee may not return. * * Note that all arguments are call-by-reference so that this routine * can be directly called from Fortran code. * * This is a ScaLAPACK internal subroutine and arguments are not * checked for unreasonable values. * * Arguments * ========= * * ISIEEE (local output) INTEGER * On exit, ISIEEE = 1 implies that all the features of the * IEEE standard that we rely on are implemented. * On exit, ISIEEE = 0 implies that some the features of the * IEEE standard that we rely on are missing. * * RMAX (local input) DOUBLE PRECISION * The overflow threshold ( = DLAMCH('O') ). * * RMIN (local input) DOUBLE PRECISION * The underflow threshold ( = DLAMCH('U') ). * * ===================================================================== * * .. Local Scalars .. */ double x, pinf, pzero, ninf, nzero; int ieflag, *ix, sbit1, sbit2, negone=-1, errornum; /* .. * .. Executable Statements .. */ pdlasnbt_( &ieflag ); pinf = *rmax / *rmin; pzero = 1.0 / pinf; pinf = 1.0 / pzero; if( pzero != 0.0 ){ printf("pzero = %g should be zero\n",pzero); *isieee = 0; return ; } if( ieflag == 1 ){ sbit1 = (*((int *)&pzero) >> 31) & 1; sbit2 = (*((int *)&pinf) >> 31) & 1; }else if(ieflag == 2){ sbit1 = (*(((int *)&pzero)+1) >> 31) & 1; sbit2 = (*(((int *)&pinf)+1) >> 31) & 1; } if( sbit1 == 1 ){ printf("Sign of positive infinity is incorrect\n"); *isieee = 0; } if( sbit2 == 1 ){ printf("Sign of positive zero is incorrect\n"); *isieee = 0; } ninf = -pinf; nzero = 1.0 / ninf; ninf = 1.0 / nzero; if( nzero != 0.0 ){ printf("nzero = %g should be zero\n",nzero); *isieee = 0; } if( ieflag == 1 ){ sbit1 = (*((int *)&nzero) >> 31) & 1; sbit2 = (*((int *)&ninf) >> 31) & 1; }else if(ieflag == 2){ sbit1 = (*(((int *)&nzero)+1) >> 31) & 1; sbit2 = (*(((int *)&ninf)+1) >> 31) & 1; } if( sbit1 == 0 ){ printf("Sign of negative infinity is incorrect\n"); *isieee = 0; } if( sbit2 == 0 ){ printf("Sign of negative zero is incorrect\n"); *isieee = 0; } } scalapack-2.0.2/SRC/pdlamch.f000644 000766 000024 00000005111 11622500733 016114 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PDLAMCH( ICTXT, CMACH ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER CMACH INTEGER ICTXT * .. * * Purpose * ======= * * PDLAMCH determines double precision machine parameters. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle in which the computation takes * place. * * CMACH (global input) CHARACTER*1 * Specifies the value to be returned by PDLAMCH: * = 'E' or 'e', PDLAMCH := eps * = 'S' or 's , PDLAMCH := sfmin * = 'B' or 'b', PDLAMCH := base * = 'P' or 'p', PDLAMCH := eps*base * = 'N' or 'n', PDLAMCH := t * = 'R' or 'r', PDLAMCH := rnd * = 'M' or 'm', PDLAMCH := emin * = 'U' or 'u', PDLAMCH := rmin * = 'L' or 'l', PDLAMCH := emax * = 'O' or 'o', PDLAMCH := rmax * * where * * eps = relative machine precision * sfmin = safe minimum, such that 1/sfmin does not overflow * base = base of the machine * prec = eps*base * t = number of (base) digits in the mantissa * rnd = 1.0 when rounding occurs in addition, 0.0 otherwise * emin = minimum exponent before (gradual) underflow * rmin = underflow threshold - base**(emin-1) * emax = largest exponent before overflow * rmax = overflow threshold - (base**emax)*(1-eps) * * ===================================================================== * * .. Local Scalars .. INTEGER IDUMM DOUBLE PRECISION TEMP * .. * .. External Subroutines .. EXTERNAL DGAMN2D, DGAMX2D * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH, LSAME * .. * .. Executable Statements .. * TEMP = DLAMCH( CMACH ) IDUMM = 0 * IF( LSAME( CMACH, 'E' ).OR.LSAME( CMACH, 'S' ).OR. $ LSAME( CMACH, 'M' ).OR.LSAME( CMACH, 'U' ) ) THEN CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) ELSE IF( LSAME( CMACH, 'L' ).OR.LSAME( CMACH, 'O' ) ) THEN CALL DGAMN2D( ICTXT, 'All', ' ', 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) END IF * PDLAMCH = TEMP * * End of PDLAMCH * END scalapack-2.0.2/SRC/pdlamr1d.f000644 000766 000024 00000010671 10363532303 016216 0ustar00juliestaff000000 000000 SUBROUTINE PDLAMR1D( N, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IA, IB, JA, JB, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Bugs * ==== * * I am not sure that this works correctly when IB and JB are not equal * to 1. Indeed, I suspect that IB should always be set to 1 or ignored * with 1 used in its place. * * PDLAMR1D has not been tested except withint the contect of * PDSYPTRD, the prototype reduction to tridiagonal form code. * * Purpose * * ======= * * PDLAMR1D redistributes a one-dimensional row vector from one data * decomposition to another. * * This is an auxiliary routine called by PDSYTRD to redistribute D, E * and TAU. * * Notes * ===== * * Although all processes call PDGEMR2D, only the processes that own * the first column of A send data and only processes that own the * first column of B receive data. The calls to DGEBS2D/DGEBR2D * spread the data down. * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix to be transposed. * * A (local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LOCc(JA+N-1)). * On output, A is replicated across all processes in * this processor column. * * IA (global input) INTEGER * A's global row index, which points to the beginning of * the submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LOCc(JB+N-1)). * * IB (global input) INTEGER * B's global row index, NOT USED * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace) COMPLEX*16 array, dimension ( LWORK ) * * LWORK (local input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCAA( DLEN_ ), DESCBB( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, PDGEMR2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return if possible * IF( N.LE.0 ) $ RETURN * DO 10 I = 1, DLEN_ DESCAA( I ) = DESCA( I ) DESCBB( I ) = DESCB( I ) 10 CONTINUE * DESCAA( M_ ) = 1 DESCBB( M_ ) = 1 DESCAA( LLD_ ) = 1 DESCBB( LLD_ ) = 1 * ICTXT = DESCB( CTXT_ ) CALL PDGEMR2D( 1, N, A, IA, JA, DESCAA, B, IB, JB, DESCBB, ICTXT ) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NQ = NUMROC( N, DESCB( NB_ ), MYCOL, 0, NPCOL ) * IF( MYROW.EQ.0 ) THEN CALL DGEBS2D( ICTXT, 'C', ' ', NQ, 1, B, NQ ) ELSE CALL DGEBR2D( ICTXT, 'C', ' ', NQ, 1, B, NQ, 0, MYCOL ) END IF * RETURN * * End of PDLAMR1D * END scalapack-2.0.2/SRC/pdlamve.f000644 000766 000024 00000020050 11750130340 016126 0ustar00juliestaff000000 000000 SUBROUTINE PDLAMVE( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, DWORK ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), DWORK( * ) * .. * * Purpose * ======= * * PDLAMVE copies all or part of a distributed matrix A to another * distributed matrix B. There is no alignment assumptions at all * except that A and B are of the same size. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * DWORK (local workspace) DOUBLE PRECISION array * If UPLO = 'U' or UPLO = 'L' and number of processors > 1, * the length of DWORK is at least as large as the length of B. * Otherwise, DWORK is not referenced. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER, LOWER, FULL INTEGER ICTXT, NPROW, NPCOL, MYROW, MYCOL, MYPROC, $ NPROCS, AROWS, ACOLS, K, SPROC, SRSRC, SCSRC, $ RPROC, RRSRC, RCSRC, COUNT, J, I, IIA, JJA, $ IIB, JJB, BRSRC, BCSRC, RAROWS, RACOLS, $ INDEX, IDUM, NUMREC, NUMSND * .. * .. External Subroutines .. EXTERNAL DLAMOV, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC, INDXL2G EXTERNAL ICEIL, LSAME, NUMROC, INDXL2G * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Find underlying mesh properties. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Decode input parameters. * UPPER = LSAME( UPLO, 'U' ) IF( .NOT. UPPER ) LOWER = LSAME( UPLO, 'L' ) FULL = (.NOT. UPPER) .AND. (.NOT. LOWER) * * Assign indiviual numbers based on column major ordering. * NPROCS = NPROW*NPCOL * * Do redistribution operation. * IF( NPROCS.EQ.1 ) THEN CALL DLAMOV( UPLO, M, N, A((JA-1)*DESCA(LLD_)+IA), $ DESCA(LLD_), B((JB-1)*DESCB(LLD_)+IB), $ DESCB(LLD_) ) ELSEIF( FULL ) THEN CALL PDGEMR2D( M, N, A, IA, JA, DESCA, B, IB, JB, DESCB, $ ICTXT ) ELSE CALL PDGEMR2D( M, N, A, IA, JA, DESCA, DWORK, IB, JB, DESCB, $ ICTXT ) CALL PDLACPY( UPLO, M, N, DWORK, IB, JB, DESCB, B, IB, JB, $ DESCB ) END IF * RETURN * * End of PDLAMVE * END scalapack-2.0.2/SRC/pdlange.f000644 000766 000024 00000026644 10363532303 016133 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PDLANGE( NORM, M, N, A, IA, JA, DESCA, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDLANGE returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * distributed matrix sub( A ) = A(IA:IA+M-1, JA:JA+N-1). * * PDLANGE returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+M-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PDLANGE as described * above. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). When M = 0, PDLANGE * is set to zero. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). When N = 0, * PDLANGE is set to zero. N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, ICOFF, IOFFA, $ IROFF, J, JJ, LDA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ DOUBLE PRECISION SCALE, SUM, VALUE * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOMBSSQ, DGEBR2D, $ DGEBS2D, DGAMX2D, DGSUM2D, DLASSQ, $ INFOG2L, PDTREECOMB * .. * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX, NUMROC EXTERNAL LSAME, IDAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * IF( MIN( M, N ).EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO IF( NQ.GT.0 .AND. MP.GT.0 ) THEN IOFFA = (JJ-1)*LDA DO 20 J = JJ, JJ+NQ-1 DO 10 I = II, MP+II-1 VALUE = MAX( VALUE, ABS( A( IOFFA+I ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE END IF CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, J, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * * Find norm1( sub( A ) ). * IF( NQ.GT.0 ) THEN IOFFA = ( JJ - 1 ) * LDA DO 40 J = JJ, JJ+NQ-1 SUM = ZERO IF( MP.GT.0 ) THEN DO 30 I = II, MP+II-1 SUM = SUM + ABS( A( IOFFA+I ) ) 30 CONTINUE END IF IOFFA = IOFFA + LDA WORK( J-JJ+1 ) = SUM 40 CONTINUE END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( IDAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, J, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * * Find normI( sub( A ) ). * IF( MP.GT.0 ) THEN IOFFA = II + ( JJ - 1 ) * LDA DO 60 I = II, II+MP-1 SUM = ZERO IF( NQ.GT.0 ) THEN DO 50 J = IOFFA, IOFFA + NQ*LDA - 1, LDA SUM = SUM + ABS( A( J ) ) 50 CONTINUE END IF WORK( I-II+1 ) = SUM IOFFA = IOFFA + 1 60 CONTINUE END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, WORK, MAX( 1, MP ), $ MYROW, 0 ) * * Find maximum sum of rows for supnorm * IF( MYCOL.EQ.0 ) THEN IF( MP.GT.0 ) THEN VALUE = WORK( IDAMAX( MP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, I, $ J, -1, 0, 0 ) END IF * ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE IOFFA = II + ( JJ - 1 ) * LDA IF( NQ.GT.0 ) THEN DO 70 J = IOFFA, IOFFA + NQ*LDA - 1, LDA CALL DLASSQ( MP, A( J ), 1, SCALE, SUM ) 70 CONTINUE END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PDTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, DCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PDLANGE = VALUE * RETURN * * End of PDLANGE * END scalapack-2.0.2/SRC/pdlanhs.f000644 000766 000024 00000062503 10363532303 016144 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PDLANHS( NORM, N, A, IA, JA, DESCA, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDLANHS returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * Hessenberg distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * PDLANHS returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PDLANHS as described * above. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PDLANHS is set to zero. N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, II, IIA, ICOFF, INXTROW, $ IOFFA, IROFF, J, JB, JJ, JJA, JN, KK, LDA, LL, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION SCALE, SUM, VALUE * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOMBSSQ, DGEBR2D, $ DGEBS2D, DGAMX2D, DGSUM2D, DLASSQ, $ INFOG2L, PDTREECOMB * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, IDAMAX, NUMROC EXTERNAL LSAME, ICEIL, IDAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) IOFFA = ( JJA - 1 ) * LDA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * VALUE = ZERO * * Find max(abs(A(i,j))). * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 20 LL = JJ, JJ+JB-1 DO 10 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 50 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 40 LL = JJ, JJ+JB-1 DO 30 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 50 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 70 LL = JJ, JJ + JB -1 DO 60 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 60 CONTINUE IOFFA = IOFFA + LDA 70 CONTINUE ELSE DO 90 LL = JJ, JJ+JB-1 DO 80 KK = IIA, MIN( II-1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ VALUE = MAX( VALUE, ABS( A( II+(JJ+JB-2)*LDA ) ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 140 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 110 LL = JJ, JJ + JB -1 DO 100 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE ELSE DO 130 LL = JJ, JJ + JB -1 DO 120 KK = IIA, MIN( II-1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 120 CONTINUE IOFFA = IOFFA + LDA 130 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ VALUE = MAX( VALUE, $ ABS( A( II+(JJ+JB-2)*LDA ) ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 140 CONTINUE * END IF * * Gather the intermediate results to process (0,0). * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, KK, LL, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * VALUE = ZERO II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 160 LL = JJ, JJ+JB-1 SUM = ZERO DO 150 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 150 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 160 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 190 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 180 LL = JJ, JJ+JB-1 SUM = ZERO DO 170 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 170 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 180 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 190 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 210 LL = JJ, JJ + JB -1 SUM = ZERO DO 200 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 200 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 210 CONTINUE ELSE DO 230 LL = JJ, JJ + JB -1 SUM = ZERO DO 220 KK = IIA, MIN( II-1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 220 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 230 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( JJ+JB-JJA ) = WORK( JJ+JB-JJA ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 280 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 250 LL = JJ, JJ + JB -1 SUM = ZERO DO 240 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 240 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 250 CONTINUE ELSE DO 270 LL = JJ, JJ + JB -1 SUM = ZERO DO 260 KK = IIA, MIN( II-1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 260 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 270 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( JJ+JB-JJA ) = WORK( JJ+JB-JJA ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 280 CONTINUE * END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( IDAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, KK, LL, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * DO 290 KK = IIA, IIA+NP-1 WORK( KK ) = ZERO 290 CONTINUE * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 310 LL = JJ, JJ+JB-1 DO 300 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 300 CONTINUE IOFFA = IOFFA + LDA 310 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 340 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 330 LL = JJ, JJ+JB-1 DO 320 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 320 CONTINUE IOFFA = IOFFA + LDA 330 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 340 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 360 LL = JJ, JJ + JB -1 DO 350 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 350 CONTINUE IOFFA = IOFFA + LDA 360 CONTINUE ELSE DO 380 LL = JJ, JJ + JB -1 DO 370 KK = IIA, MIN( II-1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 370 CONTINUE IOFFA = IOFFA + LDA 380 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( II-IIA+1 ) = WORK( II-IIA+1 ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 430 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 400 LL = JJ, JJ + JB -1 DO 390 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 390 CONTINUE IOFFA = IOFFA + LDA 400 CONTINUE ELSE DO 420 LL = JJ, JJ + JB -1 DO 410 KK = IIA, MIN( II-1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS(A(IOFFA+KK)) 410 CONTINUE IOFFA = IOFFA + LDA 420 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( II-IIA+1 ) = WORK( II-IIA+1 ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 430 CONTINUE * END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK, MAX( 1, NP ), $ MYROW, 0 ) * * Find maximum sum of rows for Infinity-norm * IF( MYCOL.EQ.0 ) THEN IF( NP.GT.0 ) THEN VALUE = WORK( IDAMAX( NP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, KK, $ LL, -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * SCALE = ZERO SUM = ONE II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 440 LL = JJ, JJ+JB-1 CALL DLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 440 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 460 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 450 LL = JJ, JJ+JB-1 CALL DLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 450 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 460 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 470 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 470 CONTINUE ELSE DO 480 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II-1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 480 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ CALL DLASSQ( 1, A( II+(JJ+JB-2)*LDA ), 1, $ SCALE, SUM ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 510 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 490 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 490 CONTINUE ELSE DO 500 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II-1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 500 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ CALL DLASSQ( 1, A( II+(JJ+JB-2)*LDA ), 1, $ SCALE, SUM ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 510 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PDTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, DCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PDLANHS = VALUE * RETURN * * End of PDLANHS * END scalapack-2.0.2/SRC/pdlansy.f000644 000766 000024 00000070305 10363532303 016164 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PDLANSY( NORM, UPLO, N, A, IA, JA, $ DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDLANSY returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * real symmetric distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * PDLANSY returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PDLANSY as described * above. * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is to be referenced. * = 'U': Upper triangular part of sub( A ) is referenced, * = 'L': Lower triangular part of sub( A ) is referenced. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PDLANSY is set to zero. N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the symmetric distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular matrix which norm is * to be computed, and the strictly lower triangular part of * this matrix is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular matrix which norm is to be computed, and the * strictly upper triangular part of sub( A ) is not referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * 2*Nq0+Np0+LDW if NORM = '1', 'O', 'o', 'I' or 'i', * where LDW is given by: * IF( NPROW.NE.NPCOL ) THEN * LDW = MB_A*CEIL(CEIL(Np0/MB_A)/(LCM/NPROW)) * ELSE * LDW = 0 * END IF * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * * where LCM is the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) and CEIL denotes the ceiling * operation (ICEIL). * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * ICEIL, ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IAROW, IACOL, IB, ICOFF, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, IROFF, ICSR, ICSR0, $ IOFFA, IRSC, IRSC0, IRSR, IRSR0, JJ, JJA, K, $ LDA, LL, MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION SCALE, SUM, VALUE * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DAXPY, DCOMBSSQ, $ DGAMX2D, DGSUM2D, DGEBR2D, $ DGEBS2D, DLASSQ, PDCOL2ROW, $ PDTREECOMB * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, IDAMAX, NUMROC EXTERNAL ICEIL, IDAMAX, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters and local indexes. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) ICSR = 1 IRSR = ICSR + NQ IRSC = IRSR + NQ IF( MYROW.EQ.IAROW ) THEN IRSC0 = IRSC + IROFF NP = NP - IROFF ELSE IRSC0 = IRSC END IF IF( MYCOL.EQ.IACOL ) THEN ICSR0 = ICSR + ICOFF IRSR0 = IRSR + ICOFF NQ = NQ - ICOFF ELSE ICSR0 = ICSR IRSR0 = IRSR END IF IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+N-1 ) LDA = DESCA( LLD_ ) * * If the matrix is symmetric, we address only a triangular portion * of the matrix. A sum of row (column) i of the complete matrix * can be obtained by adding along row i and column i of the the * triangular matrix, stopping/starting at the diagonal, which is * the point of reflection. The pictures below demonstrate this. * In the following code, the row sums created by --- rows below are * refered to as ROWSUMS, and the column sums shown by | are refered * to as COLSUMS. Infinity-norm = 1-norm = ROWSUMS+COLSUMS. * * UPLO = 'U' UPLO = 'L' * ____i______ ___________ * |\ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \|------| i i|---\ | * | \ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * |__________\| |___|______\| * i * * II, JJ : local indices into array A * ICURROW : process row containing diagonal block * ICURCOL : process column containing diagonal block * IRSC0 : pointer to part of work used to store the ROWSUMS while * they are stored along a process column * IRSR0 : pointer to part of work used to store the ROWSUMS after * they have been transposed to be along a process row * II = IIA JJ = JJA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 20 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 10 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 10 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 20 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 40 K = II, II+IB-1 IF( JJ.LE.JJA+NQ-1 ) THEN DO 30 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 30 CONTINUE END IF IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 40 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining rows/columns of the matrix. * DO 90 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 60 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 50 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 50 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 60 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 80 K = II, II+IB-1 IF( JJ.LE.JJA+NQ-1 ) THEN DO 70 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 70 CONTINUE END IF IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 80 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 90 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 110 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.LE.IIA+NP-1 ) THEN DO 100 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 100 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 110 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 130 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 120 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 120 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 130 CONTINUE ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 180 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 150 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.LE.IIA+NP-1 ) THEN DO 140 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 140 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 150 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 170 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 160 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 160 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 170 CONTINUE ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * END IF * * Gather the result on process (IAROW,IACOL). * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, K, -1, $ IAROW, IACOL ) * ELSE IF( LSAME( NORM, 'I' ) .OR. LSAME( NORM, 'O' ) .OR. $ NORM.EQ.'1' ) THEN * * Find normI( sub( A ) ) ( = norm1( sub( A ) ), since sub( A ) is * symmetric). * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 200 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 190 LL = IIA, II-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 190 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 200 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 220 K = II, II+IB-1 SUM = ZERO IF( JJA+NQ.GT.JJ ) THEN DO 210 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 210 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 220 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining rows/columns of global matrix. * DO 270 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 240 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 230 LL = IIA, II-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 230 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 240 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 260 K = II, II+IB-1 SUM = ZERO IF( JJA+NQ.GT.JJ ) THEN DO 250 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 250 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 260 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 270 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = (JJ-1)*LDA DO 290 K = 0, IB-1 SUM = ZERO IF( IIA+NP.GT.II ) THEN DO 280 LL = II, IIA+NP-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 280 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 290 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 310 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 300 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 300 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 310 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 360 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 330 K = 0, IB-1 SUM = ZERO IF( IIA+NP.GT.II ) THEN DO 320 LL = II, IIA+NP-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 320 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 330 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 350 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 340 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 340 CONTINUE END IF WORK(K-IIA+IRSC0) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 350 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 360 CONTINUE END IF * * After calls to DGSUM2D, process row 0 will have global * COLSUMS and process column 0 will have global ROWSUMS. * Transpose ROWSUMS and add to COLSUMS to get global row/column * sum, the max of which is the infinity or 1 norm. * IF( MYCOL.EQ.IACOL ) $ NQ = NQ + ICOFF CALL DGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK( ICSR ), 1, $ IAROW, MYCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP + IROFF CALL DGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IRSC ), $ MAX( 1, NP ), MYROW, IACOL ) * CALL PDCOL2ROW( ICTXT, N, 1, DESCA( MB_ ), WORK( IRSC ), $ MAX( 1, NP ), WORK( IRSR ), MAX( 1, NQ ), $ IAROW, IACOL, IAROW, IACOL, WORK( IRSC+NP ) ) * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF CALL DAXPY( NQ, ONE, WORK( IRSR0 ), 1, WORK( ICSR0 ), 1 ) IF( NQ.LT.1 ) THEN VALUE = ZERO ELSE VALUE = WORK( IDAMAX( NQ, WORK( ICSR0 ), 1 ) ) END IF CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, K, $ -1, IAROW, IACOL ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE * * Add off-diagonal entries, first * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 370 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL DLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) $ II = II + 1 CALL DLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) 370 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 390 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 380 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL DLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) $ II = II + 1 CALL DLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) 380 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 400 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL DLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) $ II = II + 1 CALL DLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 400 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 420 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 410 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL DLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) $ II = II + 1 CALL DLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 410 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 420 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM * CALL PDTREECOMB( ICTXT, 'All', 2, RWORK, IAROW, IACOL, $ DCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to the other processes * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, IAROW, $ IACOL ) END IF * PDLANSY = VALUE * RETURN * * End of PDLANSY * END scalapack-2.0.2/SRC/pdlantr.f000644 000766 000024 00000110624 10363532303 016155 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PDLANTR( NORM, UPLO, DIAG, M, N, A, $ IA, JA, DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDLANTR returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * trapezoidal or triangular distributed matrix sub( A ) denoting * A(IA:IA+M-1, JA:JA+N-1). * * PDLANTR returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with ia <= i <= ia+m-1, * ( and ja <= j <= ja+n-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PDLANTR as described * above. * * UPLO (global input) CHARACTER * Specifies whether the matrix sub( A ) is upper or lower * trapezoidal. * = 'U': Upper trapezoidal * = 'L': Lower trapezoidal * Note that sub( A ) is triangular instead of trapezoidal * if M = N. * * DIAG (global input) CHARACTER * Specifies whether or not the distributed matrix sub( A ) has * unit diagonal. * = 'N': Non-unit diagonal * = 'U': Unit diagonal * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). When M = 0, PDLANTR is * set to zero. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). When N = 0, * PDLANTR is set to zero. N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL UDIAG INTEGER IACOL, IAROW, ICTXT, II, IIA, ICOFF, IOFFA, $ IROFF, J, JB, JJ, JJA, JN, KK, LDA, LL, MP, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION SCALE, SUM, VALUE * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOMBSSQ, DGEBR2D, $ DGEBS2D, DGAMX2D, DGSUM2D, DLASSQ, $ INFOG2L, PDTREECOMB * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, IDAMAX, NUMROC EXTERNAL LSAME, ICEIL, IDAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * UDIAG = LSAME( DIAG, 'U' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) IOFFA = ( JJA - 1 ) * LDA * IF( MIN( M, N ).EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * IF( UDIAG ) THEN VALUE = ONE ELSE VALUE = ZERO END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 20 LL = JJ, JJ + JB -1 DO 10 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE ELSE DO 40 LL = JJ, JJ + JB -1 DO 30 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE END IF ELSE DO 60 LL = JJ, JJ + JB -1 DO 50 KK = IIA, MIN( II-1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 80 LL = JJ, JJ + JB -1 DO 70 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 70 CONTINUE IOFFA = IOFFA + LDA 80 CONTINUE ELSE DO 100 LL = JJ, JJ + JB -1 DO 90 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 90 CONTINUE IOFFA = IOFFA + LDA 100 CONTINUE END IF ELSE DO 120 LL = JJ, JJ + JB -1 DO 110 KK = IIA, MIN( II-1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 110 CONTINUE IOFFA = IOFFA + LDA 120 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 130 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 150 LL = JJ, JJ + JB -1 DO 140 KK = II+LL-JJ+1, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 140 CONTINUE IOFFA = IOFFA + LDA 150 CONTINUE ELSE DO 170 LL = JJ, JJ + JB -1 DO 160 KK = II+LL-JJ, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 160 CONTINUE IOFFA = IOFFA + LDA 170 CONTINUE END IF ELSE DO 190 LL = JJ, JJ + JB -1 DO 180 KK = II, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 180 CONTINUE IOFFA = IOFFA + LDA 190 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 260 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 210 LL = JJ, JJ + JB -1 DO 200 KK = II+LL-JJ+1, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 200 CONTINUE IOFFA = IOFFA + LDA 210 CONTINUE ELSE DO 230 LL = JJ, JJ + JB -1 DO 220 KK = II+LL-JJ, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 220 CONTINUE IOFFA = IOFFA + LDA 230 CONTINUE END IF ELSE DO 250 LL = JJ, JJ + JB -1 DO 240 KK = II, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 240 CONTINUE IOFFA = IOFFA + LDA 250 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 260 CONTINUE * END IF * * Gather the intermediate results to process (0,0). * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, KK, LL, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 280 LL = JJ, JJ + JB -1 SUM = ONE DO 270 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 270 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 280 CONTINUE ELSE DO 300 LL = JJ, JJ + JB -1 SUM = ZERO DO 290 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 290 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 300 CONTINUE END IF ELSE DO 320 LL = JJ, JJ + JB -1 SUM = ZERO DO 310 KK = IIA, MIN( II-1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 310 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 320 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 390 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 340 LL = JJ, JJ + JB -1 SUM = ONE DO 330 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 330 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 340 CONTINUE ELSE DO 360 LL = JJ, JJ + JB -1 SUM = ZERO DO 350 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 350 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 360 CONTINUE END IF ELSE DO 380 LL = JJ, JJ + JB -1 SUM = ZERO DO 370 KK = IIA, MIN( II-1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 370 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 380 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 410 LL = JJ, JJ + JB -1 SUM = ONE DO 400 KK = II+LL-JJ+1, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 400 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 410 CONTINUE ELSE DO 430 LL = JJ, JJ + JB -1 SUM = ZERO DO 420 KK = II+LL-JJ, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 420 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 430 CONTINUE END IF ELSE DO 450 LL = JJ, JJ + JB -1 SUM = ZERO DO 440 KK = II, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 440 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 450 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 520 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 470 LL = JJ, JJ + JB -1 SUM = ONE DO 460 KK = II+LL-JJ+1, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 460 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 470 CONTINUE ELSE DO 490 LL = JJ, JJ + JB -1 SUM = ZERO DO 480 KK = II+LL-JJ, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 480 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 490 CONTINUE END IF ELSE DO 510 LL = JJ, JJ + JB -1 SUM = ZERO DO 500 KK = II, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 500 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 510 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 520 CONTINUE * END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( IDAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, KK, LL, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * IF( LSAME( UPLO, 'U' ) ) THEN IF( UDIAG ) THEN DO 530 KK = IIA, IIA+MP-1 WORK( KK ) = ONE 530 CONTINUE ELSE DO 540 KK = IIA, IIA+MP-1 WORK( KK ) = ZERO 540 CONTINUE END IF ELSE IF( UDIAG ) THEN NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF DO 550 KK = IIA, IIA+NP-1 WORK( KK ) = ONE 550 CONTINUE DO 560 KK = IIA+NP, IIA+MP-1 WORK( KK ) = ZERO 560 CONTINUE ELSE DO 570 KK = IIA, IIA+MP-1 WORK( KK ) = ZERO 570 CONTINUE END IF END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 590 LL = JJ, JJ + JB -1 DO 580 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 580 CONTINUE IOFFA = IOFFA + LDA 590 CONTINUE ELSE DO 610 LL = JJ, JJ + JB -1 DO 600 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 600 CONTINUE IOFFA = IOFFA + LDA 610 CONTINUE END IF ELSE DO 630 LL = JJ, JJ + JB -1 DO 620 KK = IIA, MIN( II-1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 620 CONTINUE IOFFA = IOFFA + LDA 630 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 700 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 650 LL = JJ, JJ + JB -1 DO 640 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 640 CONTINUE IOFFA = IOFFA + LDA 650 CONTINUE ELSE DO 670 LL = JJ, JJ + JB -1 DO 660 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 660 CONTINUE IOFFA = IOFFA + LDA 670 CONTINUE END IF ELSE DO 690 LL = JJ, JJ + JB -1 DO 680 KK = IIA, MIN( II-1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 680 CONTINUE IOFFA = IOFFA + LDA 690 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 700 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 720 LL = JJ, JJ + JB -1 DO 710 KK = II+LL-JJ+1, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 710 CONTINUE IOFFA = IOFFA + LDA 720 CONTINUE ELSE DO 740 LL = JJ, JJ + JB -1 DO 730 KK = II+LL-JJ, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 730 CONTINUE IOFFA = IOFFA + LDA 740 CONTINUE END IF ELSE DO 760 LL = JJ, JJ + JB -1 DO 750 KK = II, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 750 CONTINUE IOFFA = IOFFA + LDA 760 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 830 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 780 LL = JJ, JJ + JB -1 DO 770 KK = II+LL-JJ+1, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 770 CONTINUE IOFFA = IOFFA + LDA 780 CONTINUE ELSE DO 800 LL = JJ, JJ + JB -1 DO 790 KK = II+LL-JJ, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 790 CONTINUE IOFFA = IOFFA + LDA 800 CONTINUE END IF ELSE DO 820 LL = JJ, JJ + JB -1 DO 810 KK = II, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 810 CONTINUE IOFFA = IOFFA + LDA 820 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 830 CONTINUE * END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, WORK, MAX( 1, MP ), $ MYROW, 0 ) * * Find maximum sum of rows for Infinity-norm * IF( MYCOL.EQ.0 ) THEN IF( MP.GT.0 ) THEN VALUE = WORK( IDAMAX( MP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, KK, $ LL, -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * IF( UDIAG ) THEN SCALE = ONE SUM = DBLE( MIN( M, N ) ) / DBLE( NPROW*NPCOL ) ELSE SCALE = ZERO SUM = ONE END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 840 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II+LL-JJ, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 840 CONTINUE ELSE DO 850 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II+LL-JJ+1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 850 CONTINUE END IF ELSE DO 860 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II-1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 860 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 900 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 870 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II+LL-JJ+1, IIA+MP-1 )- $ IIA+1, A( IIA+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 870 CONTINUE ELSE DO 880 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II+LL-JJ, IIA+MP-1 )- $ IIA+1, A( IIA+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 880 CONTINUE END IF ELSE DO 890 LL = JJ, JJ + JB -1 CALL DLASSQ( MIN( II-1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 890 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 900 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 910 LL = JJ, JJ + JB -1 CALL DLASSQ( IIA+MP-(II+LL-JJ+1), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 910 CONTINUE ELSE DO 920 LL = JJ, JJ + JB -1 CALL DLASSQ( IIA+MP-(II+LL-JJ), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 920 CONTINUE END IF ELSE DO 930 LL = JJ, JJ + JB -1 CALL DLASSQ( IIA+MP-II, A( II+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 930 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 970 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 940 LL = JJ, JJ + JB -1 CALL DLASSQ( IIA+MP-(II+LL-JJ+1), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 940 CONTINUE ELSE DO 950 LL = JJ, JJ + JB -1 CALL DLASSQ( IIA+MP-(II+LL-JJ), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 950 CONTINUE END IF ELSE DO 960 LL = JJ, JJ + JB -1 CALL DLASSQ( IIA+MP-II, A( II+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 960 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 970 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PDTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, DCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to every process in the grid. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PDLANTR = VALUE * RETURN * * End of PDLANTR * END scalapack-2.0.2/SRC/pdlapiv.f000644 000766 000024 00000033655 10363532303 016160 0ustar00juliestaff000000 000000 SUBROUTINE PDLAPIV( DIREC, ROWCOL, PIVROC, M, N, A, IA, JA, $ DESCA, IPIV, IP, JP, DESCIP, IWORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER*1 DIREC, PIVROC, ROWCOL INTEGER IA, IP, JA, JP, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCIP( * ), IPIV( * ), IWORK( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLAPIV applies either P (permutation matrix indicated by IPIV) * or inv( P ) to a general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1), resulting in row or column * pivoting. The pivot vector may be distributed across a process row * or a column. The pivot vector should be aligned with the distributed * matrix A. This routine will transpose the pivot vector if necessary. * For example if the row pivots should be applied to the columns of * sub( A ), pass ROWCOL='C' and PIVROC='C'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Restrictions * ============ * * IPIV must always be a distributed vector (not a matrix). Thus: * IF( ROWPIV .EQ. 'C' ) THEN * JP must be 1 * ELSE * IP must be 1 * END IF * * The following restrictions apply when IPIV must be transposed: * IF( ROWPIV.EQ.'C' .AND. PIVROC.EQ.'C') THEN * DESCIP(MB_) must equal DESCA(NB_) * ELSE IF( ROWPIV.EQ.'R" .AND. PIVROC.EQ.'R') THEN * DESCIP(NB_) must equal DESCA(MB_) * END IF * * Arguments * ========= * * DIREC (global input) CHARACTER*1 * Specifies in which order the permutation is applied: * = 'F' (Forward) Applies pivots Forward from top of matrix. * Computes P*sub( A ). * = 'B' (Backward) Applies pivots Backward from bottom of * matrix. Computes inv( P )*sub( A ). * * ROWCOL (global input) CHARACTER*1 * Specifies if the rows or columns are to be permuted: * = 'R' Rows will be permuted, * = 'C' Columns will be permuted. * * PIVROC (global input) CHARACTER*1 * Specifies whether IPIV is distributed over a process row * or column: * = 'R' IPIV distributed over a process row * = 'C' IPIV distributed over a process column * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of * rows of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * distributed submatrix sub( A ) to which the row or column * interchanges will be applied. On exit, the local pieces * of the permuted distributed submatrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension (LIPIV) where LIPIV is * when ROWCOL='R' or 'r': * >= LOCr( IA+M-1 ) + MB_A if PIVROC='C' or 'c', * >= LOCc( M + MOD(JP-1,NB_P) ) if PIVROC='R' or 'r', and, * when ROWCOL='C' or 'c': * >= LOCr( N + MOD(IP-1,MB_P) ) if PIVROC='C' or 'c', * >= LOCc( JA+N-1 ) + NB_A if PIVROC='R' or 'r'. * This array contains the pivoting information. IPIV(i) is the * global row (column), local row (column) i was swapped with. * When ROWCOL='R' or 'r' and PIVROC='C' or 'c', or ROWCOL='C' * or 'c' and PIVROC='R' or 'r', the last piece of this array of * size MB_A (resp. NB_A) is used as workspace. In those cases, * this array is tied to the distributed matrix A. * * IP (global input) INTEGER * The row index in the global array P indicating the first * row of sub( P ). * * JP (global input) INTEGER * The column index in the global array P indicating the * first column of sub( P ). * * DESCIP (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed vector IPIV. * * IWORK (local workspace) INTEGER array, dimension (LDW) * where LDW is equal to the workspace necessary for * transposition, and the storage of the tranposed IPIV: * * Let LCM be the least common multiple of NPROW and NPCOL. * IF( ROWCOL.EQ.'R' .AND. PIVROC.EQ.'R' ) THEN * IF( NPROW.EQ.NPCOL ) THEN * LDW = LOCr( N_P + MOD(JP-1, NB_P) ) + NB_P * ELSE * LDW = LOCr( N_P + MOD(JP-1, NB_P) ) + * NB_P * CEIL( CEIL(LOCc(N_P)/NB_P) / (LCM/NPCOL) ) * END IF * ELSE IF( ROWCOL.EQ.'C' .AND. PIVROC.EQ.'C' ) THEN * IF( NPROW.EQ.NPCOL ) THEN * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + MB_P * ELSE * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + * MB_P * CEIL( CEIL(LOCr(M_P)/MB_P) / (LCM/NPROW) ) * END IF * ELSE * IWORK is not referenced. * END IF * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL ROWPVT INTEGER I, ICTXT, ICURCOL, ICURROW, IIP, ITMP, IPT, $ JJP, JPT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCPT( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, $ INFOG2L, PDLAPV2, PICOL2ROW, PIROW2COL * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC, INDXG2P EXTERNAL LSAME, NUMROC, INDXG2P * .. * .. Intrinsic Functions .. INTRINSIC MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) ROWPVT = LSAME( ROWCOL, 'R' ) * * If we're pivoting the rows of sub( A ) * IF( ROWPVT ) THEN IF( M.LE.1 .OR. N.LT.1 ) $ RETURN * * If the pivot vector is already distributed correctly * IF( LSAME( PIVROC, 'C' ) ) THEN CALL PDLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * Otherwise, we must redistribute IPIV to match PDLAPV2 * ELSE * * Take IPIV distributed over row 0, and store it in * iwork, distributed over column 0 * IPT = MOD( JP-1, DESCA(MB_) ) DESCPT(M_) = M + IPT + NPROW*DESCA(MB_) DESCPT(N_) = 1 DESCPT(MB_) = DESCA(MB_) DESCPT(NB_) = 1 DESCPT(RSRC_) = INDXG2P( IA, DESCA(MB_), IA, DESCA(RSRC_), $ NPROW ) DESCPT(CSRC_) = MYCOL DESCPT(CTXT_) = ICTXT DESCPT(LLD_) = NUMROC( DESCPT(M_), DESCPT(MB_), MYROW, $ DESCPT(RSRC_), NPROW ) ITMP = NUMROC( DESCIP(N_), DESCIP(NB_), MYCOL, $ DESCIP(CSRC_), NPCOL ) CALL INFOG2L( IP, JP-IPT, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) CALL PIROW2COL( ICTXT, M+IPT, 1, DESCIP(NB_), IPIV(JJP), $ ITMP, IWORK, DESCPT(LLD_), 0, ICURCOL, $ DESCPT(RSRC_), $ MYCOL, IWORK(DESCPT(LLD_)-DESCPT(MB_)+1) ) * * Send column-distributed pivots to all columns * ITMP = DESCPT(LLD_) - DESCPT(MB_) IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'Row', ' ', ITMP, 1, IWORK, ITMP ) ELSE CALL IGEBR2D( ICTXT, 'Row', ' ', ITMP, 1, IWORK, ITMP, $ MYROW, 0 ) END IF * * Adjust pivots so they are relative to the start of IWORK, * not IPIV * IPT = IPT + 1 DO 10 I = 1, ITMP IWORK(I) = IWORK(I) - JP + IPT 10 CONTINUE CALL PDLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IWORK, $ IPT, 1, DESCPT ) END IF * * Otherwise, we're pivoting the columns of sub( A ) * ELSE IF( M.LT.1 .OR. N.LE.1 ) $ RETURN * * If the pivot vector is already distributed correctly * IF( LSAME( PIVROC, 'R' ) ) THEN CALL PDLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * Otherwise, we must redistribute IPIV to match PDLAPV2 * ELSE * * Take IPIV distributed over column 0, and store it in * iwork, distributed over row 0 * JPT = MOD( IP-1, DESCA(NB_) ) DESCPT(M_) = 1 DESCPT(N_) = N + JPT + NPCOL*DESCA(NB_) DESCPT(MB_) = 1 DESCPT(NB_) = DESCA(NB_) DESCPT(RSRC_) = MYROW DESCPT(CSRC_) = INDXG2P( JA, DESCA(NB_), JA, DESCA(CSRC_), $ NPCOL ) DESCPT(CTXT_) = ICTXT DESCPT(LLD_) = 1 CALL INFOG2L( IP-JPT, JP, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) ITMP = NUMROC( N+JPT, DESCPT(NB_), MYCOL, DESCPT(CSRC_), $ NPCOL ) CALL PICOL2ROW( ICTXT, N+JPT, 1, DESCIP(MB_), IPIV(IIP), $ DESCIP(LLD_), IWORK, MAX(1, ITMP), ICURROW, $ 0, 0, DESCPT(CSRC_), IWORK(ITMP+1) ) * * Send row-distributed pivots to all rows * IF( MYROW.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'Column', ' ', ITMP, 1, IWORK, $ ITMP ) ELSE CALL IGEBR2D( ICTXT, 'Column', ' ', ITMP, 1, IWORK, $ ITMP, 0, MYCOL ) END IF * * Adjust pivots so they are relative to the start of IWORK, * not IPIV * JPT = JPT + 1 DO 20 I = 1, ITMP IWORK(I) = IWORK(I) - IP + JPT 20 CONTINUE CALL PDLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IWORK, $ 1, JPT, DESCPT ) END IF END IF * RETURN * * End of PDLAPIV * END scalapack-2.0.2/SRC/pdlapv2.f000644 000766 000024 00000036741 10363532303 016070 0ustar00juliestaff000000 000000 SUBROUTINE PDLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIREC, ROWCOL INTEGER IA, IP, JA, JP, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCIP( * ), IPIV( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLAPV2 applies either P (permutation matrix indicated by IPIV) * or inv( P ) to a M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1), resulting in row or column pivoting. The * pivot vector should be aligned with the distributed matrix A. For * pivoting the rows of sub( A ), IPIV should be distributed along a * process column and replicated over all process rows. Similarly, * IPIV should be distributed along a process row and replicated over * all process columns for column pivoting. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIREC (global input) CHARACTER * Specifies in which order the permutation is applied: * = 'F' (Forward) Applies pivots Forward from top of matrix. * Computes P * sub( A ); * = 'B' (Backward) Applies pivots Backward from bottom of * matrix. Computes inv( P ) * sub( A ). * * ROWCOL (global input) CHARACTER * Specifies if the rows or columns are to be permuted: * = 'R' Rows will be permuted, * = 'C' Columns will be permuted. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this local array contains the local pieces of the * distributed matrix sub( A ) to which the row or columns * interchanges will be applied. On exit, this array contains * the local pieces of the permuted distributed matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (input) INTEGER array, dimension >= LOCr(M_A)+MB_A if * ROWCOL = 'R', LOCc(N_A)+NB_A otherwise. It contains * the pivoting information. IPIV(i) is the global row (column), * local row (column) i was swapped with. The last piece of the * array of size MB_A (resp. NB_A) is used as workspace. IPIV is * tied to the distributed matrix A. * * IP (global input) INTEGER * IPIV's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JP (global input) INTEGER * IPIV's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCIP (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix IPIV. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL FORWRD, ROWPVT INTEGER I, IB, ICTXT, ICURCOL, ICURROW, IIP, IP1, ITMP, $ IPVWRK, J, JB, JJP, JP1, K, MA, MBA, MYCOL, $ MYROW, NBA, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBS2D, IGEBR2D, INFOG2L, $ PDSWAP * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * ROWPVT = LSAME( ROWCOL, 'R' ) IF( ROWPVT ) THEN IF( M.LE.1 .OR. N.LT.1 ) $ RETURN ELSE IF( M.LT.1 .OR. N.LE.1 ) $ RETURN END IF FORWRD = LSAME( DIREC, 'F' ) * * * Get grid and matrix parameters * MA = DESCA( M_ ) MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If I'm applying pivots from beginning to end (e.g., repeating * pivoting done earlier). Thus this section computes P * sub( A ). * IF( FORWRD ) THEN CALL INFOG2L( IP, JP, DESCIP, NPROW, NPCOL, MYROW, MYCOL, $ IIP, JJP, ICURROW, ICURCOL ) * * If I'm pivoting the rows of sub( A ) * IF( ROWPVT ) THEN IPVWRK = NUMROC( DESCIP( M_ ), DESCIP( MB_ ), MYROW, $ DESCIP( RSRC_ ), NPROW ) + 1 - $ DESCIP( MB_ ) * * Loop over rows of sub( A ) * I = IA IB = MIN( M, ICEIL( IA, MBA ) * MBA - IA + 1 ) 10 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process column * IF( MYROW.EQ.ICURROW ) THEN CALL IGEBS2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IIP ), IB ) ITMP = IIP IIP = IIP + IB ELSE ITMP = IPVWRK CALL IGEBR2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( ITMP ), IB, ICURROW, MYCOL ) END IF * * Pivot the block of rows * DO 20 K = I, I+IB-1 IP1 = IPIV( ITMP ) - IP + IA IF( IP1.NE.K ) $ CALL PDSWAP( N, A, K, JA, DESCA, MA, A, IP1, JA, $ DESCA, MA ) ITMP = ITMP + 1 20 CONTINUE * * Go on to next row of processes, increment row counter, * and figure number of rows to pivot next * ICURROW = MOD( ICURROW+1, NPROW ) I = I + IB IB = MIN( MBA, M-I+IA ) IF( IB .GT. 0 ) GOTO 10 * * If I am pivoting the columns of sub( A ) * ELSE IPVWRK = NUMROC( DESCIP( N_ ), DESCIP( NB_ ), MYCOL, $ DESCIP( CSRC_ ), NPCOL ) + 1 - $ DESCIP( NB_ ) * * Loop over columns of sub( A ) * J = JA JB = MIN( N, ICEIL( JA, NBA ) * NBA - JA + 1 ) 30 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process row * IF( MYCOL.EQ.ICURCOL ) THEN CALL IGEBS2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( JJP ), JB ) ITMP = JJP JJP = JJP + JB ELSE ITMP = IPVWRK CALL IGEBR2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( ITMP ), JB, MYROW, ICURCOL ) END IF * * Pivot the block of columns * DO 40 K = J, J+JB-1 JP1 = IPIV( ITMP ) - JP + JA IF( JP1.NE.K ) $ CALL PDSWAP( M, A, IA, K, DESCA, 1, A, IA, JP1, $ DESCA, 1 ) ITMP = ITMP + 1 40 CONTINUE * * Go on to next column of processes, increment column * counter, and figure number of columns to pivot next * ICURCOL = MOD( ICURCOL+1, NPCOL ) J = J + JB JB = MIN( NBA, N-J+JA ) IF( JB .GT. 0 ) GOTO 30 END IF * * If I want to apply pivots in reverse order, i.e. reversing * pivoting done earlier. Thus this section computes * inv( P ) * sub( A ). * ELSE * * If I'm pivoting the rows of sub( A ) * IF( ROWPVT ) THEN CALL INFOG2L( IP+M-1, JP, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) * IPVWRK = NUMROC( DESCIP( M_ ), DESCIP( MB_ ), MYROW, $ DESCIP( RSRC_ ), NPROW ) + 1 - $ DESCIP( MB_ ) * * If I'm not in the current process row, my IIP points out * past end of pivot vector (since I don't own a piece of the * last row). Adjust IIP so it points at last pivot entry. * IF( MYROW.NE.ICURROW ) IIP = IIP - 1 * * Loop over rows in reverse order, starting at last row * I = IA + M - 1 IB = MOD( I, MBA ) IF( IB .EQ. 0 ) IB = MBA IB = MIN( IB, M ) 50 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process column * IF( MYROW.EQ.ICURROW ) THEN ITMP = IIP IIP = IIP - IB CALL IGEBS2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IIP+1 ), IB ) ELSE CALL IGEBR2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IPVWRK ), IB, ICURROW, MYCOL ) ITMP = IPVWRK + IB - 1 END IF * * Pivot the block of rows * DO 60 K = I, I-IB+1, -1 IP1 = IPIV( ITMP ) - IP + IA IF( IP1.NE.K ) $ CALL PDSWAP( N, A, K, JA, DESCA, MA, A, IP1, JA, $ DESCA, MA ) ITMP = ITMP - 1 60 CONTINUE * * Go to previous row of processes, decrement row counter, * and figure number of rows to be pivoted next * ICURROW = MOD( NPROW+ICURROW-1, NPROW ) I = I - IB IB = MIN( MBA, I-IA+1 ) IF( IB .GT. 0 ) GOTO 50 * * Otherwise, I'm pivoting the columns of sub( A ) * ELSE CALL INFOG2L( IP, JP+N-1, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) IPVWRK = NUMROC( DESCIP( N_ ), DESCIP( NB_ ), MYCOL, $ DESCIP( CSRC_ ), NPCOL ) + 1 - $ DESCIP( NB_ ) * * If I'm not in the current process column, my JJP points out * past end of pivot vector (since I don't own a piece of the * last column). Adjust JJP so it points at last pivot entry. * IF( MYCOL.NE.ICURCOL ) JJP = JJP - 1 * * Loop over columns in reverse order starting at last column * J = JA + N - 1 JB = MOD( J, NBA ) IF( JB .EQ. 0 ) JB = NBA JB = MIN( JB, N ) 70 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process row * IF( MYCOL.EQ.ICURCOL ) THEN ITMP = JJP JJP = JJP - JB CALL IGEBS2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( JJP+1 ), JB ) ELSE CALL IGEBR2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( IPVWRK ), JB, MYROW, ICURCOL ) ITMP = IPVWRK + JB - 1 END IF * * Pivot a block of columns * DO 80 K = J, J-JB+1, -1 JP1 = IPIV( ITMP ) - JP + JA IF( JP1.NE.K ) $ CALL PDSWAP( M, A, IA, K, DESCA, 1, A, IA, JP1, $ DESCA, 1 ) ITMP = ITMP - 1 80 CONTINUE * * Go to previous row of processes, decrement row counter, * and figure number of rows to be pivoted next * ICURCOL = MOD( NPCOL+ICURCOL-1, NPCOL ) J = J - JB JB = MIN( NBA, J-JA+1 ) IF( JB .GT. 0 ) GOTO 70 END IF * END IF * RETURN * * End PDLAPV2 * END scalapack-2.0.2/SRC/pdlaqge.f000644 000766 000024 00000023366 10363532303 016134 0ustar00juliestaff000000 000000 SUBROUTINE PDLAQGE( M, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, EQUED ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER EQUED INTEGER IA, JA, M, N DOUBLE PRECISION AMAX, COLCND, ROWCND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), C( * ), R( * ) * .. * * Purpose * ======= * * PDLAQGE equilibrates a general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) using the row and scaling * factors in the vectors R and C. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)) * containing on entry the M-by-N matrix sub( A ). On exit, * the equilibrated distributed matrix. See EQUED for the * form of the equilibrated distributed submatrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * R (local input) DOUBLE PRECISION array, dimension LOCr(M_A) * The row scale factors for sub( A ). R is aligned with the * distributed matrix A, and replicated across every process * column. R is tied to the distributed matrix A. * * C (local input) DOUBLE PRECISION array, dimension LOCc(N_A) * The column scale factors of sub( A ). C is aligned with the * distributed matrix A, and replicated down every process * row. C is tied to the distributed matrix A. * * ROWCND (global input) DOUBLE PRECISION * The global ratio of the smallest R(i) to the largest R(i), * IA <= i <= IA+M-1. * * COLCND (global input) DOUBLE PRECISION * The global ratio of the smallest C(i) to the largest C(i), * JA <= j <= JA+N-1. * * AMAX (global input) DOUBLE PRECISION * Absolute value of largest distributed submatrix entry. * * EQUED (global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration * = 'R': Row equilibration, i.e., sub( A ) has been pre- * multiplied by diag(R(IA:IA+M-1)), * = 'C': Column equilibration, i.e., sub( A ) has been post- * multiplied by diag(C(JA:JA+N-1)), * = 'B': Both row and column equilibration, i.e., sub( A ) * has been replaced by * diag(R(IA:IA+M-1)) * sub( A ) * diag(C(JA:JA+N-1)). * * Internal Parameters * =================== * * THRESH is a threshold value used to decide if row or column scaling * should be done based on the ratio of the row or column scaling * factors. If ROWCND < THRESH, row scaling is done, and if * COLCND < THRESH, column scaling is done. * * LARGE and SMALL are threshold values used to decide if row scaling * should be done based on the absolute size of the largest matrix * element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, THRESH PARAMETER ( ONE = 1.0D+0, THRESH = 0.1D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IIA, IOFFA, $ IROFF, J, JJA, LDA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ DOUBLE PRECISION CJ, LARGE, SMALL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) THEN EQUED = 'N' RETURN END IF * * Get grid parameters and compute local indexes * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * * Initialize LARGE and SMALL. * SMALL = PDLAMCH( ICTXT, 'Safe minimum' ) / $ PDLAMCH( ICTXT, 'Precision' ) LARGE = ONE / SMALL * IF( ROWCND.GE.THRESH .AND. AMAX.GE.SMALL .AND. AMAX.LE.LARGE ) $ THEN * * No row scaling * IF( COLCND.GE.THRESH ) THEN * * No column scaling * EQUED = 'N' * ELSE * * Column scaling * IOFFA = (JJA-1)*LDA DO 20 J = JJA, JJA+NQ-1 CJ = C( J ) DO 10 I = IIA, IIA+MP-1 A( IOFFA + I ) = CJ*A( IOFFA + I ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE EQUED = 'C' END IF * ELSE IF( COLCND.GE.THRESH ) THEN * * Row scaling, no column scaling * IOFFA = (JJA-1)*LDA DO 40 J = JJA, JJA+NQ-1 DO 30 I = IIA, IIA+MP-1 A( IOFFA + I ) = R( I )*A( IOFFA + I ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE EQUED = 'R' * ELSE * * Row and column scaling * IOFFA = (JJA-1)*LDA DO 60 J = JJA, JJA+NQ-1 CJ = C( J ) DO 50 I = IIA, IIA+MP-1 A( IOFFA + I ) = CJ*R( I )*A( IOFFA + I ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE EQUED = 'B' * END IF * RETURN * * End of PDLAQGE * END scalapack-2.0.2/SRC/pdlaqr0.f000644 000766 000024 00000107735 11705175572 016101 0ustar00juliestaff000000 000000 RECURSIVE SUBROUTINE PDLAQR0( WANTT, WANTZ, N, ILO, IHI, H, $ DESCH, WR, WI, ILOZ, IHIZ, Z, DESCZ, WORK, LWORK, $ IWORK, LIWORK, INFO, RECLEVEL ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IHI, IHIZ, ILO, ILOZ, INFO, LIWORK, LWORK, N, $ RECLEVEL LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. INTEGER DESCH( * ), DESCZ( * ), IWORK( * ) DOUBLE PRECISION H( * ), WI( N ), WORK( * ), WR( N ), $ Z( * ) * .. * * Purpose * ======= * * PDLAQR0 computes the eigenvalues of a Hessenberg matrix H * and, optionally, the matrices T and Z from the Schur decomposition * H = Z*T*Z**T, where T is an upper quasi-triangular matrix (the * Schur form), and Z is the orthogonal matrix of Schur vectors. * * Optionally Z may be postmultiplied into an input orthogonal * matrix Q so that this routine can give the Schur factorization * of a matrix A which has been reduced to the Hessenberg form H * by the orthogonal matrix Q: * A = Q * H * Q**T = (QZ) * T * (QZ)**T. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix H (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that H is already upper triangular in rows * and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally * set by a previous call to PDGEBAL, and then passed to PDGEHRD * when the matrix output by PDGEBAL is reduced to Hessenberg * form. Otherwise ILO and IHI should be set to 1 and N * respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. * If N = 0, then ILO = 1 and IHI = 0. * * H (global input/output) DOUBLE PRECISION array, dimension * (DESCH(LLD_),*) * On entry, the upper Hessenberg matrix H. * On exit, if JOB = 'S', H is upper quasi-triangular in * rows and columns ILO:IHI, with 1-by-1 and 2-by-2 blocks on * the main diagonal. The 2-by-2 diagonal blocks (corresponding * to complex conjugate pairs of eigenvalues) are returned in * standard form, with H(i,i) = H(i+1,i+1) and * H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the * contents of H are unspecified on exit. * * DESCH (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix H. * * WR (global output) DOUBLE PRECISION array, dimension (N) * WI (global output) DOUBLE PRECISION array, dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If JOB = 'S', the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in H. * * Z (global input/output) DOUBLE PRECISION array. * If COMPZ = 'V', on entry Z must contain the current * matrix Z of accumulated transformations from, e.g., PDGEHRD, * and on exit Z has been updated; transformations are applied * only to the submatrix Z(ILO:IHI,ILO:IHI). * If COMPZ = 'N', Z is not referenced. * If COMPZ = 'I', on entry Z need not be set and on exit, * if INFO = 0, Z contains the orthogonal matrix Z of the Schur * vectors of H. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace) DOUBLE PRECISION array, dimension(DWORK) * * LWORK (local input) INTEGER * The length of the workspace array WORK. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the workspace array IWORK. * * INFO (output) INTEGER * = 0: successful exit * .LT. 0: if INFO = -i, the i-th argument had an illegal * value * .GT. 0: if INFO = i, PDLAQR0 failed to compute all of * the eigenvalues. Elements 1:ilo-1 and i+1:n of WR * and WI contain those eigenvalues which have been * successfully computed. (Failures are rare.) * * If INFO .GT. 0 and JOB = 'E', then on exit, the * remaining unconverged eigenvalues are the eigen- * values of the upper Hessenberg matrix rows and * columns ILO through INFO of the final, output * value of H. * * If INFO .GT. 0 and JOB = 'S', then on exit * * (*) (initial value of H)*U = U*(final value of H) * * where U is an orthogonal matrix. The final * value of H is upper Hessenberg and quasi-triangular * in rows and columns INFO+1 through IHI. * * If INFO .GT. 0 and COMPZ = 'V', then on exit * * (final value of Z) = (initial value of Z)*U * * where U is the orthogonal matrix in (*) (regard- * less of the value of JOB.) * * If INFO .GT. 0 and COMPZ = 'I', then on exit * (final value of Z) = U * where U is the orthogonal matrix in (*) (regard- * less of the value of JOB.) * * If INFO .GT. 0 and COMPZ = 'N', then Z is not * accessed. * * ================================================================ * Based on contributions by * Robert Granat, Department of Computing Science and HPC2N, * Umea University, Sweden. * ================================================================ * * Restrictions: The block size in H and Z must be square and larger * than or equal to six (6) due to restrictions in PDLAQR1, PDLAQR5 * and DLAQR6. Moreover, H and Z need to be distributed identically * with the same context. * * ================================================================ * References: * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part I: Maintaining Well Focused * Shifts, and Level 3 Performance. * SIAM J. Matrix Anal. Appl., 23(4):929--947, 2002. * * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part II: Aggressive Early * Deflation. * SIAM J. Matrix Anal. Appl., 23(4):948--973, 2002. * * R. Granat, B. Kagstrom, and D. Kressner, * A Novel Parallel QR Algorithm for Hybrid Distributed Momory HPC * Systems. * SIAM J. Sci. Comput., 32(4):2345--2378, 2010. * * ================================================================ * * .. Parameters .. * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by increasing the size of the * . deflation window after KEXNW iterations. ===== * * ==== Exceptional shifts: try to cure rare slow convergence * . with ad-hoc exceptional shifts every KEXSH iterations. * . The constants WILK1 and WILK2 are used to form the * . exceptional shifts. ==== * INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ INTEGER RECMAX PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, RECMAX = 3 ) INTEGER NTINY PARAMETER ( NTINY = 11 ) INTEGER KEXNW, KEXSH PARAMETER ( KEXNW = 5, KEXSH = 6 ) DOUBLE PRECISION WILK1, WILK2 PARAMETER ( WILK1 = 0.75D0, WILK2 = -0.4375D0 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. DOUBLE PRECISION AA, BB, CC, CS, DD, SN, SS, SWAP, ELEM, T0, $ ELEM1, ELEM2, ELEM3, ALPHA, SDSUM, STAMP INTEGER I, J, INF, IT, ITMAX, K, KACC22, KBOT, KDU, KS, $ KT, KTOP, KU, KV, KWH, KWTOP, KWV, LD, LS, $ LWKOPT, NDFL, NH, NHO, NIBBLE, NMIN, NS, NSMAX, $ NSR, NVE, NW, NWMAX, NWR, LLDH, LLDZ, II, JJ, $ ICTXT, NPROW, NPCOL, MYROW, MYCOL, IPV, IPT, $ IPW, IPWRK, VROWS, VCOLS, TROWS, TCOLS, WROWS, $ WCOLS, HRSRC, HCSRC, NB, IS, IE, NPROCS, KK, $ IROFFH, ICOFFH, HRSRC3, HCSRC3, NWIN, TOTIT, $ SWEEP, JW, TOTNS, LIWKOPT, NPMIN, ICTXT_NEW, $ MYROW_NEW, MYCOL_NEW LOGICAL NWINC, SORTED, LQUERY, RECURSION CHARACTER JBCMPZ*2 * .. * .. External Functions .. INTEGER PILAENVX, NUMROC, INDXG2P, ICEIL, BLACS_PNUM EXTERNAL PILAENVX, NUMROC, INDXG2P, ICEIL, BLACS_PNUM * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCT( DLEN_ ), DESCW( DLEN_ ), $ PMAP( 64*64 ) DOUBLE PRECISION ZDUM( 1, 1 ) * .. * .. External Subroutines .. EXTERNAL PDLACPY, PDLAQR1, DLANV2, PDLAQR3, PDLAQR5, $ PDELGET, DLAQR0, DLASET, PDGEMR2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX, MIN, MOD * .. * .. Executable Statements .. INFO = 0 ICTXT = DESCH( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL RECURSION = RECLEVEL .LT. RECMAX * * Quick return for N = 0: nothing to do. * IF( N.EQ.0 ) THEN WORK( 1 ) = ONE IWORK( 1 ) = 1 RETURN END IF * * Set up job flags for PILAENV. * IF( WANTT ) THEN JBCMPZ( 1: 1 ) = 'S' ELSE JBCMPZ( 1: 1 ) = 'E' END IF IF( WANTZ ) THEN JBCMPZ( 2: 2 ) = 'V' ELSE JBCMPZ( 2: 2 ) = 'N' END IF * * Check if workspace query * LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * Extract local leading dimensions and block factors of matrices * H and Z * LLDH = DESCH( LLD_ ) LLDZ = DESCZ( LLD_ ) NB = DESCH( MB_ ) * * Tiny (sub-) matrices must use PDLAQR1. (Stops recursion) * IF( N.LE.NTINY ) THEN * * Estimate optimal workspace. * CALL PDLAQR1( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILOZ, IHIZ, Z, DESCZ, WORK, LWORK, IWORK, LIWORK, INFO ) LWKOPT = INT( WORK(1) ) LIWKOPT = IWORK(1) * * Completely local matrices uses LAPACK. (Stops recursion) * ELSEIF( N.LE.NB ) THEN IF( MYROW.EQ.DESCH(RSRC_) .AND. MYCOL.EQ.DESCH(CSRC_) ) THEN CALL DLAQR0( WANTT, WANTZ, N, ILO, IHI, H, DESCH(LLD_), $ WR, WI, ILOZ, IHIZ, Z, DESCZ(LLD_), WORK, LWORK, INFO ) IF( N.GT.2 ) $ CALL DLASET( 'L', N-2, N-2, ZERO, ZERO, H(3), $ DESCH(LLD_) ) LWKOPT = INT( WORK(1) ) LIWKOPT = 1 ELSE LWKOPT = 1 LIWKOPT = 1 END IF * * Do one more step of recursion * ELSE * * Zero out iteration and sweep counters for debugging purposes * TOTIT = 0 SWEEP = 0 TOTNS = 0 * * Use small bulge multi-shift QR with aggressive early * deflation on larger-than-tiny matrices. * * Hope for the best. * INFO = 0 * * NWR = recommended deflation window size. At this * point, N .GT. NTINY = 11, so there is enough * subdiagonal workspace for NWR.GE.2 as required. * (In fact, there is enough subdiagonal space for * NWR.GE.3.) * NWR = PILAENVX( ICTXT, 13, 'PDLAQR0', JBCMPZ, N, ILO, IHI, $ LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, NWR ) NW = NWR * * NSR = recommended number of simultaneous shifts. * At this point N .GT. NTINY = 11, so there is at * enough subdiagonal workspace for NSR to be even * and greater than or equal to two as required. * NWIN = PILAENVX( ICTXT, 19, 'PDLAQR0', JBCMPZ, N, NB, NB, NB ) NSR = PILAENVX( ICTXT, 15, 'PDLAQR0', JBCMPZ, N, ILO, IHI, $ MAX(NWIN,NB) ) NSR = MIN( NSR, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * Estimate optimal workspace * LWKOPT = 3*ICEIL(NWR,NPROW)*ICEIL(NWR,NPCOL) * * Workspace query call to PDLAQR3 * CALL PDLAQR3( WANTT, WANTZ, N, ILO, IHI, NWR+1, H, $ DESCH, ILOZ, IHIZ, Z, DESCZ, LS, LD, WR, WI, H, $ DESCH, N, H, DESCH, N, H, DESCH, WORK, -1, IWORK, $ LIWORK, RECLEVEL ) LWKOPT = LWKOPT + INT( WORK( 1 ) ) LIWKOPT = IWORK( 1 ) * * Workspace query call to PDLAQR5 * CALL PDLAQR5( WANTT, WANTZ, 2, N, 1, N, N, WR, WI, H, $ DESCH, ILOZ, IHIZ, Z, DESCZ, WORK, -1, IWORK, $ LIWORK ) * * Optimal workspace = MAX(PDLAQR3, PDLAQR5) * LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) LIWKOPT = MAX( LIWKOPT, IWORK( 1 ) ) * * Quick return in case of workspace query. * IF( LQUERY ) THEN WORK( 1 ) = DBLE( LWKOPT ) IWORK( 1 ) = LIWKOPT RETURN END IF * * PDLAQR1/PDLAQR0 crossover point. * NMIN = PILAENVX( ICTXT, 12, 'PDLAQR0', JBCMPZ, N, ILO, IHI, $ LWORK ) NMIN = MAX( NTINY, NMIN ) * * Nibble crossover point. * NIBBLE = PILAENVX( ICTXT, 14, 'PDLAQR0', JBCMPZ, N, ILO, IHI, $ LWORK ) NIBBLE = MAX( 0, NIBBLE ) * * Accumulate reflections during ttswp? Use block * 2-by-2 structure during matrix-matrix multiply? * KACC22 = PILAENVX( ICTXT, 16, 'PDLAQR0', JBCMPZ, N, ILO, IHI, $ LWORK ) KACC22 = MAX( 1, KACC22 ) KACC22 = MIN( 2, KACC22 ) * * NWMAX = the largest possible deflation window for * which there is sufficient workspace. * NWMAX = MIN( ( N-1 ) / 3, LWORK / 2 ) * * NSMAX = the Largest number of simultaneous shifts * for which there is sufficient workspace. * NSMAX = MIN( ( N+6 ) / 9, LWORK - LWORK/3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * NDFL: an iteration count restarted at deflation. * NDFL = 1 * * ITMAX = iteration limit * ITMAX = MAX( 30, 2*KEXSH )*MAX( 10, ( IHI-ILO+1 ) ) * * Last row and column in the active block. * KBOT = IHI * * Main Loop. * DO 110 IT = 1, ITMAX TOTIT = TOTIT + 1 * * Done when KBOT falls below ILO. * IF( KBOT.LT.ILO ) $ GO TO 120 * * Locate active block. * DO 10 K = KBOT, ILO + 1, -1 CALL INFOG2L( K, K-1, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, HRSRC, HCSRC ) IF( MYROW.EQ.HRSRC .AND. MYCOL.EQ.HCSRC ) THEN IF( H( II + (JJ-1)*LLDH ).EQ.ZERO ) $ GO TO 20 END IF 10 CONTINUE K = ILO 20 CONTINUE KTOP = K IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', '1-Tree', 1, 1, KTOP, 1, $ -1, -1, -1, -1, -1 ) * * Select deflation window size. * NH = KBOT - KTOP + 1 IF( NH.LE.NTINY ) THEN NW = NH ELSEIF( NDFL.LT.KEXNW .OR. NH.LT.NW ) THEN * * Typical deflation window. If possible and * advisable, nibble the entire active block. * If not, use size NWR or NWR+1 depending upon * which has the smaller corresponding subdiagonal * entry (a heuristic). * NWINC = .TRUE. IF( NH.LE.MIN( NMIN, NWMAX ) ) THEN NW = NH ELSE NW = MIN( NWR, NH, NWMAX ) IF( NW.LT.NWMAX ) THEN IF( NW.GE.NH-1 ) THEN NW = NH ELSE KWTOP = KBOT - NW + 1 CALL PDELGET( 'All', '1-Tree', ELEM1, H, KWTOP, $ KWTOP-1, DESCH ) CALL PDELGET( 'All', '1-Tree', ELEM2, H, $ KWTOP-1, KWTOP-2, DESCH ) IF( ABS( ELEM1 ).GT.ABS( ELEM2 ) ) NW = NW + 1 END IF END IF END IF ELSE * * Exceptional deflation window. If there have * been no deflations in KEXNW or more iterations, * then vary the deflation window size. At first, * because, larger windows are, in general, more * powerful than smaller ones, rapidly increase the * window up to the maximum reasonable and possible. * Then maybe try a slightly smaller window. * IF( NWINC .AND. NW.LT.MIN( NWMAX, NH ) ) THEN NW = MIN( NWMAX, NH, 2*NW ) ELSE NWINC = .FALSE. IF( NW.EQ.NH .AND. NH.GT.2 ) $ NW = NH - 1 END IF END IF * * Aggressive early deflation: * split workspace into * - an NW-by-NW work array V for orthogonal matrix * - an NW-by-at-least-NW-but-more-is-better * (NW-by-NHO) horizontal work array for Schur factor * - an at-least-NW-but-more-is-better (NVE-by-NW) * vertical work array for matrix multiplications * - align T, V and W with the deflation window * KV = N - NW + 1 KT = NW + 1 NHO = ( N-NW-1 ) - KT + 1 KWV = NW + 2 NVE = ( N-NW ) - KWV + 1 * JW = MIN( NW, KBOT-KTOP+1 ) KWTOP = KBOT - JW + 1 IROFFH = MOD( KWTOP - 1, NB ) ICOFFH = IROFFH HRSRC = INDXG2P( KWTOP, NB, MYROW, DESCH(RSRC_), NPROW ) HCSRC = INDXG2P( KWTOP, NB, MYCOL, DESCH(CSRC_), NPCOL ) VROWS = NUMROC( JW+IROFFH, NB, MYROW, HRSRC, NPROW ) VCOLS = NUMROC( JW+ICOFFH, NB, MYCOL, HCSRC, NPCOL ) CALL DESCINIT( DESCV, JW+IROFFH, JW+ICOFFH, NB, NB, $ HRSRC, HCSRC, ICTXT, MAX(1, VROWS), INFO ) * TROWS = NUMROC( JW+IROFFH, NB, MYROW, HRSRC, NPROW ) TCOLS = NUMROC( JW+ICOFFH, NB, MYCOL, HCSRC, NPCOL ) CALL DESCINIT( DESCT, JW+IROFFH, JW+ICOFFH, NB, NB, $ HRSRC, HCSRC, ICTXT, MAX(1, TROWS), INFO ) WROWS = NUMROC( JW+IROFFH, NB, MYROW, HRSRC, NPROW ) WCOLS = NUMROC( JW+ICOFFH, NB, MYCOL, HCSRC, NPCOL ) CALL DESCINIT( DESCW, JW+IROFFH, JW+ICOFFH, NB, NB, $ HRSRC, HCSRC, ICTXT, MAX(1, WROWS), INFO ) * IPV = 1 IPT = IPV + DESCV( LLD_ ) * VCOLS IPW = IPT + DESCT( LLD_ ) * TCOLS IPWRK = IPW + DESCW( LLD_ ) * WCOLS * * Aggressive early deflation * IWORK(1) = IT CALL PDLAQR3( WANTT, WANTZ, N, KTOP, KBOT, NW, H, $ DESCH, ILOZ, IHIZ, Z, DESCZ, LS, LD, WR, WI, $ WORK(IPV), DESCV, NHO, WORK(IPT), DESCT, NVE, $ WORK(IPW), DESCW, WORK(IPWRK), LWORK-IPWRK+1, $ IWORK, LIWORK, RECLEVEL ) * * Adjust KBOT accounting for new deflations. * KBOT = KBOT - LD * * KS points to the shifts. * KS = KBOT - LS + 1 * * Skip an expensive QR sweep if there is a (partly * heuristic) reason to expect that many eigenvalues * will deflate without it. Here, the QR sweep is * skipped if many eigenvalues have just been deflated * or if the remaining active block is small. * IF( ( LD.EQ.0 ) .OR. ( ( 100*LD.LE.NW*NIBBLE ) .AND. ( KBOT- $ KTOP+1.GT.MIN( NMIN, NWMAX ) ) ) ) THEN * * NS = nominal number of simultaneous shifts. * This may be lowered (slightly) if PDLAQR3 * did not provide that many shifts. * NS = MIN( NSMAX, NSR, MAX( 2, KBOT-KTOP ) ) NS = NS - MOD( NS, 2 ) * * If there have been no deflations * in a multiple of KEXSH iterations, * then try exceptional shifts. * Otherwise use shifts provided by * PDLAQR3 above or from the eigenvalues * of a trailing principal submatrix. * IF( MOD( NDFL, KEXSH ).EQ.0 ) THEN KS = KBOT - NS + 1 DO 30 I = KBOT, MAX( KS+1, KTOP+2 ), -2 CALL PDELGET( 'All', '1-Tree', ELEM1, H, I, I-1, $ DESCH ) CALL PDELGET( 'All', '1-Tree', ELEM2, H, I-1, I-2, $ DESCH ) CALL PDELGET( 'All', '1-Tree', ELEM3, H, I, I, $ DESCH ) SS = ABS( ELEM1 ) + ABS( ELEM2 ) AA = WILK1*SS + ELEM3 BB = SS CC = WILK2*SS DD = AA CALL DLANV2( AA, BB, CC, DD, WR( I-1 ), WI( I-1 ), $ WR( I ), WI( I ), CS, SN ) 30 CONTINUE IF( KS.EQ.KTOP ) THEN CALL PDELGET( 'All', '1-Tree', ELEM1, H, KS+1, $ KS+1, DESCH ) WR( KS+1 ) = ELEM1 WI( KS+1 ) = ZERO WR( KS ) = WR( KS+1 ) WI( KS ) = WI( KS+1 ) END IF ELSE * * Got NS/2 or fewer shifts? Use PDLAQR0 or * PDLAQR1 on a trailing principal submatrix to * get more. * IF( KBOT-KS+1.LE.NS / 2 ) THEN KS = KBOT - NS + 1 KT = N - NS + 1 NPMIN = PILAENVX( ICTXT, 23, 'PDLAQR0', 'EN', NS, $ NB, NPROW, NPCOL ) c c Temporarily force NPMIN <= 8 since only PDLAQR1 is used. c NPMIN = MIN(NPMIN, 8) IF( MIN(NPROW, NPCOL).LE.NPMIN+1 .OR. $ RECLEVEL.GE.1 ) THEN * * The window is large enough. Compute the Schur * decomposition with all processors. * IROFFH = MOD( KS - 1, NB ) ICOFFH = IROFFH IF( NS.GT.NMIN ) THEN HRSRC = INDXG2P( KS, NB, MYROW, DESCH(RSRC_), $ NPROW ) HCSRC = INDXG2P( KS, NB, MYROW, DESCH(CSRC_), $ NPCOL ) ELSE HRSRC = 0 HCSRC = 0 END IF TROWS = NUMROC( NS+IROFFH, NB, MYROW, HRSRC, $ NPROW ) TCOLS = NUMROC( NS+ICOFFH, NB, MYCOL, HCSRC, $ NPCOL ) CALL DESCINIT( DESCT, NS+IROFFH, NS+ICOFFH, NB, $ NB, HRSRC, HCSRC, ICTXT, MAX(1, TROWS), $ INFO ) IPT = 1 IPWRK = IPT + DESCT(LLD_) * TCOLS * IF( NS.GT.NMIN .AND. RECURSION ) THEN CALL PDLACPY( 'All', NS, NS, H, KS, KS, $ DESCH, WORK(IPT), 1+IROFFH, 1+ICOFFH, $ DESCT ) CALL PDLAQR0( .FALSE., .FALSE., IROFFH+NS, $ 1+IROFFH, IROFFH+NS, WORK(IPT), $ DESCT, WR( KS-IROFFH ), $ WI( KS-IROFFH ), 1, 1, ZDUM, $ DESCZ, WORK( IPWRK ), $ LWORK-IPWRK+1, IWORK, LIWORK, $ INF, RECLEVEL+1 ) ELSE CALL PDLAMVE( 'All', NS, NS, H, KS, KS, $ DESCH, WORK(IPT), 1+IROFFH, 1+ICOFFH, $ DESCT, WORK(IPWRK) ) CALL PDLAQR1( .FALSE., .FALSE., IROFFH+NS, $ 1+IROFFH, IROFFH+NS, WORK(IPT), $ DESCT, WR( KS-IROFFH ), $ WI( KS-IROFFH ), 1+IROFFH, IROFFH+NS, $ ZDUM, DESCZ, WORK( IPWRK ), $ LWORK-IPWRK+1, IWORK, LIWORK, INF ) END IF ELSE * * The window is too small. Redistribute the AED * window to a subgrid and do the computation on * the subgrid. * ICTXT_NEW = ICTXT DO 50 I = 0, NPMIN-1 DO 40 J = 0, NPMIN-1 PMAP( J+1+I*NPMIN ) = $ BLACS_PNUM( ICTXT, I, J ) 40 CONTINUE 50 CONTINUE CALL BLACS_GRIDMAP( ICTXT_NEW, PMAP, NPMIN, $ NPMIN, NPMIN ) CALL BLACS_GRIDINFO( ICTXT_NEW, NPMIN, NPMIN, $ MYROW_NEW, MYCOL_NEW ) IF( MYROW.GE.NPMIN .OR. MYCOL.GE.NPMIN ) $ ICTXT_NEW = -1 * IF( ICTXT_NEW.GE.0 ) THEN TROWS = NUMROC( NS, NB, MYROW_NEW, 0, NPMIN ) TCOLS = NUMROC( NS, NB, MYCOL_NEW, 0, NPMIN ) CALL DESCINIT( DESCT, NS, NS, NB, NB, 0, 0, $ ICTXT_NEW, MAX(1,TROWS), INFO ) IPT = 1 IPWRK = IPT + DESCT(LLD_) * TCOLS ELSE IPT = 1 IPWRK = 2 DESCT( CTXT_ ) = -1 INF = 0 END IF CALL PDGEMR2D( NS, NS, H, KS, KS, DESCH, $ WORK(IPT), 1, 1, DESCT, ICTXT ) * c c This part is still not perfect. c Either PDLAQR0 or PDLAQR1 can work, but not both. c c NMIN = PILAENVX( ICTXT_NEW, 12, 'PDLAQR0', c $ 'EN', NS, 1, NS, LWORK ) IF( ICTXT_NEW.GE.0 ) THEN c IF( NS.GT.NMIN .AND. RECLEVEL.LT.1 ) THEN c CALL PDLAQR0( .FALSE., .FALSE., NS, 1, c $ NS, WORK(IPT), DESCT, WR( KS ), c $ WI( KS ), 1, 1, ZDUM, DESCT, c $ WORK( IPWRK ), LWORK-IPWRK+1, IWORK, c $ LIWORK, INF, RECLEVEL+1 ) c ELSE CALL PDLAQR1( .FALSE., .FALSE., NS, 1, $ NS, WORK(IPT), DESCT, WR( KS ), $ WI( KS ), 1, NS, ZDUM, DESCT, $ WORK( IPWRK ), LWORK-IPWRK+1, IWORK, $ LIWORK, INF ) c END IF CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF IF( MYROW+MYCOL.GT.0 ) THEN DO 60 J = 0, NS-1 WR( KS+J ) = ZERO WI( KS+J ) = ZERO 60 CONTINUE END IF CALL IGAMN2D( ICTXT, 'All', '1-Tree', 1, 1, INF, $ 1, -1, -1, -1, -1, -1 ) CALL DGSUM2D( ICTXT, 'All', ' ', NS, 1, WR(KS), $ NS, -1, -1 ) CALL DGSUM2D( ICTXT, 'All', ' ', NS, 1, WI(KS), $ NS, -1, -1 ) END IF KS = KS + INF * * In case of a rare QR failure use * eigenvalues of the trailing 2-by-2 * principal submatrix. * IF( KS.GE.KBOT ) THEN CALL PDELGET( 'All', '1-Tree', AA, H, KBOT-1, $ KBOT-1, DESCH ) CALL PDELGET( 'All', '1-Tree', CC, H, KBOT, $ KBOT-1, DESCH ) CALL PDELGET( 'All', '1-Tree', BB, H, KBOT-1, $ KBOT, DESCH ) CALL PDELGET( 'All', '1-Tree', DD, H, KBOT, $ KBOT, DESCH ) CALL DLANV2( AA, BB, CC, DD, WR( KBOT-1 ), $ WI( KBOT-1 ), WR( KBOT ), $ WI( KBOT ), CS, SN ) KS = KBOT - 1 END IF END IF * IF( KBOT-KS+1.GT.NS ) THEN * * Sort the shifts (helps a little) * Bubble sort keeps complex conjugate * pairs together. * SORTED = .FALSE. DO 80 K = KBOT, KS + 1, -1 IF( SORTED ) $ GO TO 90 SORTED = .TRUE. DO 70 I = KS, K - 1 IF( ABS( WR( I ) )+ABS( WI( I ) ).LT. $ ABS( WR( I+1 ) )+ABS( WI( I+1 ) ) ) THEN SORTED = .FALSE. * SWAP = WR( I ) WR( I ) = WR( I+1 ) WR( I+1 ) = SWAP * SWAP = WI( I ) WI( I ) = WI( I+1 ) WI( I+1 ) = SWAP END IF 70 CONTINUE 80 CONTINUE 90 CONTINUE END IF * * Shuffle shifts into pairs of real shifts * and pairs of complex conjugate shifts * assuming complex conjugate shifts are * already adjacent to one another. (Yes, * they are.) * DO 100 I = KBOT, KS + 2, -2 IF( WI( I ).NE.-WI( I-1 ) ) THEN * SWAP = WR( I ) WR( I ) = WR( I-1 ) WR( I-1 ) = WR( I-2 ) WR( I-2 ) = SWAP * SWAP = WI( I ) WI( I ) = WI( I-1 ) WI( I-1 ) = WI( I-2 ) WI( I-2 ) = SWAP END IF 100 CONTINUE END IF * * If there are only two shifts and both are * real, then use only one. * IF( KBOT-KS+1.EQ.2 ) THEN IF( WI( KBOT ).EQ.ZERO ) THEN CALL PDELGET( 'All', '1-Tree', ELEM, H, KBOT, $ KBOT, DESCH ) IF( ABS( WR( KBOT )-ELEM ).LT. $ ABS( WR( KBOT-1 )-ELEM ) ) THEN WR( KBOT-1 ) = WR( KBOT ) ELSE WR( KBOT ) = WR( KBOT-1 ) END IF END IF END IF * * Use up to NS of the the smallest magnatiude * shifts. If there aren't NS shifts available, * then use them all, possibly dropping one to * make the number of shifts even. * NS = MIN( NS, KBOT-KS+1 ) NS = NS - MOD( NS, 2 ) KS = KBOT - NS + 1 * * Small-bulge multi-shift QR sweep. * TOTNS = TOTNS + NS SWEEP = SWEEP + 1 CALL PDLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, $ NS, WR( KS ), WI( KS ), H, DESCH, ILOZ, IHIZ, Z, $ DESCZ, WORK, LWORK, IWORK, LIWORK ) END IF * * Note progress (or the lack of it). * IF( LD.GT.0 ) THEN NDFL = 1 ELSE NDFL = NDFL + 1 END IF * * End of main loop. 110 CONTINUE * * Iteration limit exceeded. Set INFO to show where * the problem occurred and exit. * INFO = KBOT 120 CONTINUE END IF * * Return the optimal value of LWORK. * WORK( 1 ) = DBLE( LWKOPT ) IWORK( 1 ) = LIWKOPT IF( .NOT. LQUERY ) THEN IWORK( 1 ) = TOTIT IWORK( 2 ) = SWEEP IWORK( 3 ) = TOTNS END IF RETURN * * End of PDLAQR0 * END scalapack-2.0.2/SRC/pdlaqr1.f000644 000766 000024 00000300245 11705175572 016071 0ustar00juliestaff000000 000000 RECURSIVE SUBROUTINE PDLAQR1( WANTT, WANTZ, N, ILO, IHI, A, $ DESCA, WR, WI, ILOZ, IHIZ, Z, $ DESCZ, WORK, LWORK, IWORK, $ ILWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, ILWORK, INFO, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) DOUBLE PRECISION A( * ), WI( * ), WORK( * ), WR( * ), Z( * ) * .. * * Purpose * ======= * * PDLAQR1 is an auxiliary routine used to find the Schur decomposition * and or eigenvalues of a matrix already in Hessenberg form from * cols ILO to IHI. * * This is a modified version of PDLAHQR from ScaLAPACK version 1.7.3. * The following modifications were made: * o Recently removed workspace query functionality was added. * o Aggressive early deflation is implemented. * o Aggressive deflation (looking for two consecutive small * subdiagonal elements by PDLACONSB) is abandoned. * o The returned Schur form is now in canonical form, i.e., the * returned 2-by-2 blocks really correspond to complex conjugate * pairs of eigenvalues. * o For some reason, the original version of PDLAHQR sometimes did * not read out the converged eigenvalues correclty. This is now * fixed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix A (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that A is already upper quasi-triangular in * rows and columns IHI+1:N, and that A(ILO,ILO-1) = 0 (unless * ILO = 1). PDLAQR1 works primarily with the Hessenberg * submatrix in rows and columns ILO to IHI, but applies * transformations to all of H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * A (global input/output) DOUBLE PRECISION array, dimension * (DESCA(LLD_),*) * On entry, the upper Hessenberg matrix A. * On exit, if WANTT is .TRUE., A is upper quasi-triangular in * rows and columns ILO:IHI, with any 2-by-2 or larger diagonal * blocks not yet in standard form. If WANTT is .FALSE., the * contents of A are unspecified on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WR (global replicated output) DOUBLE PRECISION array, * dimension (N) * WI (global replicated output) DOUBLE PRECISION array, * dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in A. A may be returned with * larger diagonal blocks until the next release. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (global input/output) DOUBLE PRECISION array. * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations accumulated by PDHSEQR, and on * exit Z has been updated; transformations are applied only to * the submatrix Z(ILOZ:IHIZ,ILO:IHI). * If WANTZ is .FALSE., Z is not referenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local output) DOUBLE PRECISION array of size LWORK * * LWORK (local input) INTEGER * WORK(LWORK) is a local array and LWORK is assumed big enough * so that LWORK >= 6*N + 6*385*385 + * MAX( 2*MAX(DESCZ(LLD_),DESCA(LLD_)) + 2*LOCc(N), * 7*Ceil(N/HBL)/LCM(NPROW,NPCOL)) ) * * IWORK (global and local input) INTEGER array of size ILWORK * * ILWORK (local input) INTEGER * This holds the some of the IBLK integer arrays. This is held * as a place holder for the next release. * * INFO (global output) INTEGER * < 0: parameter number -INFO incorrect or inconsistent * = 0: successful exit * > 0: PDLAQR1 failed to compute all the eigenvalues ILO to IHI * in a total of 30*(IHI-ILO+1) iterations; if INFO = i, * elements i+1:ihi of WR and WI contain those eigenvalues * which have been successfully computed. * * Logic: * This algorithm is very similar to _LAHQR. Unlike _LAHQR, * instead of sending one double shift through the largest * unreduced submatrix, this algorithm sends multiple double shifts * and spaces them apart so that there can be parallelism across * several processor row/columns. Another critical difference is * that this algorithm aggregrates multiple transforms together in * order to apply them in a block fashion. * * Important Local Variables: * IBLK = The maximum number of bulges that can be computed. * Currently fixed. Future releases this won't be fixed. * HBL = The square block size (HBL=DESCA(MB_)=DESCA(NB_)) * ROTN = The number of transforms to block together * NBULGE = The number of bulges that will be attempted on the * current submatrix. * IBULGE = The current number of bulges started. * K1(*),K2(*) = The current bulge loops from K1(*) to K2(*). * * Subroutines: * This routine calls: * PDLAWIL -> Given the shift, get the transformation * DLASORTE -> Pair up eigenvalues so that reals are paired. * PDLACP3 -> Parallel array to local replicated array copy & * back. * DLAREF -> Row/column reflector applier. Core routine here. * PDLASMSUB -> Finds negligible subdiagonal elements. * * Current Notes and/or Restrictions: * 1.) This code requires the distributed block size to be square * and at least six (6); unlike simpler codes like LU, this * algorithm is extremely sensitive to block size. Unwise * choices of too small a block size can lead to bad * performance. * 2.) This code requires A and Z to be distributed identically * and have identical contxts. * 3.) This release currently does not have a routine for * resolving the Schur blocks into regular 2x2 form after * this code is completed. Because of this, a significant * performance impact is required while the deflation is done * by sometimes a single column of processors. * 4.) This code does not currently block the initial transforms * so that none of the rows or columns for any bulge are * completed until all are started. To offset pipeline * start-up it is recommended that at least 2*LCM(NPROW,NPCOL) * bulges are used (if possible) * 5.) The maximum number of bulges currently supported is fixed at * 32. In future versions this will be limited only by the * incoming WORK array. * 6.) The matrix A must be in upper Hessenberg form. If elements * below the subdiagonal are nonzero, the resulting transforms * may be nonsimilar. This is also true with the LAPACK * routine. * 7.) For this release, it is assumed RSRC_=CSRC_=0 * 8.) Currently, all the eigenvalues are distributed to all the * nodes. Future releases will probably distribute the * eigenvalues by the column partitioning. * 9.) The internals of this routine are subject to change. * * Implemented by: G. Henry, November 17, 1996 * * Modified by Robert Granat and Meiyue Shao, Department of Computing * Science and HPC2N, Umea University, Sweden * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE, HALF PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, HALF = 0.5D+0 ) DOUBLE PRECISION CONST PARAMETER ( CONST = 1.50D+0 ) INTEGER IBLK, LDS PARAMETER ( IBLK = 32, LDS = 12*IBLK+1 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, I, I1, I2, IAFIRST, IBULGE, $ ICBUF, ICOL, ICOL1, ICOL2, IERR, II, $ IRBUF, IROW, IROW1, IROW2, ISPEC, ISTART, $ ISTARTCOL, ISTARTROW, ISTOP, ISUB, $ ITERMAX, ITMP1, ITMP2, ITN, ITS, J, JAFIRST, $ JBLK, JJ, K, KI, L, LCMRC, LDA, LDZ, LEFT, $ LIHIH, LIHIZ, LILOH, LILOZ, LOCALI1, LOCALI2, $ LOCALK, LOCALM, M, MODKM1, MYCOL, MYROW, $ NBULGE, NH, NODE, NPCOL, NPROW, NR, NUM, NZ, $ RIGHT, ROTN, UP, VECSIDX, TOTIT, TOTNS, TOTSW, $ DBLK, NIBBLE, ND, NS, LTOP, LWKOPT, S1, S2, S3 DOUBLE PRECISION AVE, DISC, H00, H10, H11, H12, H21, H22, H33, $ H43H34, H44, OVFL, S, SMLNUM, SUM, T1, T1COPY, $ T2, T3, ULP, UNFL, V1SAVE, V2, V2SAVE, V3, $ V3SAVE, SN, CS, SWAP LOGICAL AED * .. * .. Local Arrays .. INTEGER ICURCOL( IBLK ), ICURROW( IBLK ), K1( IBLK ), $ K2( IBLK ), KCOL( IBLK ), KP2COL( IBLK ), $ KP2ROW( IBLK ), KROW( IBLK ), LOCALK2( IBLK ) DOUBLE PRECISION SMALLA( 6, 6, IBLK ), VCOPY( 3 ) * .. * .. External Functions .. INTEGER ILCM, NUMROC, ILAENV DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, NUMROC, ILAENV, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DGEBR2D, DGEBS2D, $ DGERV2D, DGESD2D, DGSUM2D, DLAHQR, DLAREF, $ DLARFG, DLASORTE, IGAMN2D, INFOG1L, INFOG2L, $ PDLABAD, PDLACP3, PDLASMSUB, $ PDLAWIL, PXERBLA, DLANV2, PDLAQR2, PDLAQR4 * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD, SIGN, SQRT * .. * .. Executable Statements .. * INFO = 0 * ITERMAX = 30*( IHI-ILO+1 ) IF( N.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL NUM = NPROW*NPCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) LCMRC = ILCM( NPROW, NPCOL ) TOTIT = 0 TOTNS = 0 TOTSW = 0 * * Determine the number of columns we have so we can check workspace * LOCALK = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) JJ = N / HBL IF( JJ*HBL.LT.N ) $ JJ = JJ + 1 JJ = 7*JJ / LCMRC LWKOPT = INT( 6*N+MAX( 3*MAX( LDA, LDZ )+2*LOCALK, JJ ) $ +6*LDS*LDS ) IF( LWORK.EQ.-1 .OR. ILWORK.EQ.-1 ) THEN WORK( 1 ) = DBLE( LWKOPT ) RETURN ELSEIF( LWORK.LT.LWKOPT ) THEN INFO = -15 END IF IF( DESCZ( CTXT_ ).NE.DESCA( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) END IF IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1300+NB_ ) END IF IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1300+MB_ ) END IF IF( ( ILO.GT.N ) .OR. ( ILO.LT.1 ) ) THEN INFO = -4 END IF IF( ( IHI.GT.N ) .OR. ( IHI.LT.1 ) ) THEN INFO = -5 END IF IF( HBL.LT.5 ) THEN INFO = -( 700+MB_ ) END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PDLAQR1', -INFO ) WORK( 1 ) = DBLE( LWKOPT ) RETURN END IF * * Set work array indices * S1 = 0 S2 = S1+LDS*LDS S3 = S2+LDS*LDS VECSIDX = S3+4*LDS*LDS ISUB = VECSIDX+3*N IRBUF = ISUB+N ICBUF = IRBUF+N * * Find a value for ROTN * ROTN = HBL / 3 ROTN = MAX( ROTN, HBL-2 ) ROTN = MIN( ROTN, 1 ) * IF( ILO.EQ.IHI ) THEN CALL INFOG2L( ILO, ILO, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN WR( ILO ) = A( ( ICOL-1 )*LDA+IROW ) ELSE WR( ILO ) = ZERO END IF WI( ILO ) = ZERO WORK( 1 ) = DBLE( LWKOPT ) RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * * If the diagonal block is small enough, copy it to local memory and * call DLAHQR directly. * IF( NH .LE. LDS ) THEN CALL PDLAQR4( WANTT, WANTZ, N, ILO, IHI, A, DESCA, WR, WI, $ ILOZ, IHIZ, Z, DESCZ, WORK( S1+1 ), NH, $ WORK( S2+1 ), NH, WORK( S3+1 ), 4*LDS*LDS, $ INFO ) WORK( 1 ) = DBLE( LWKOPT ) RETURN END IF * CALL INFOG1L( ILOZ, HBL, NPROW, MYROW, DESCZ(RSRC_), LILOZ, LIHIZ) LIHIZ = NUMROC( IHIZ, HBL, MYROW, DESCZ(RSRC_), NPROW ) * * Set machine-dependent constants for the stopping criterion. * If NORM(H) <= SQRT(OVFL), overflow should not occur. * UNFL = PDLAMCH( CONTXT, 'SAFE MINIMUM' ) OVFL = ONE / UNFL CALL PDLABAD( CONTXT, UNFL, OVFL ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = ITERMAX * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of our schur block size (<=2*IBLK). Each * iteration of the loop works with the active submatrix in rows * and columns L to I. Eigenvalues I+1 to IHI have already * converged. Either L = ILO or the global A(L,L-1) is negligible * so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 450 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 420 ITS = 0, ITN TOTIT = TOTIT + 1 * * Look for a single small subdiagonal element. * CALL PDLASMSUB( A, DESCA, I, L, K, SMLNUM, WORK( IRBUF+1 ), $ LWORK-IRBUF ) L = K * IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * CALL INFOG2L( L, L-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN A( ( ICOL-1 )*LDA+IROW ) = ZERO END IF WORK( ISUB+L-1 ) = ZERO END IF * * Exit from loop if a small submatrix has split off. * M = L - 10 IF ( L .GT. I - LDS ) $ GO TO 430 * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * * Copy submatrix of size 2*JBLK and prepare to do generalized * Wilkinson shift or an exceptional shift * NH = I-L+1 AED = .TRUE. JBLK = MIN( IBLK, ( NH / 2 )-1 ) IF( JBLK.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * JBLK = JBLK - MOD( JBLK, LCMRC ) END IF JBLK = MIN( JBLK, 2*LCMRC ) JBLK = MAX( JBLK, 1 ) * IF( ITS.EQ.20 .OR. ITS.EQ.40 ) THEN * * Exceptional shift. * CALL PDLACP3( 2*JBLK, I-2*JBLK+1, A, DESCA, WORK( S1+1 ), $ LDS, -1, -1, 0 ) DO 20 II = 2*JBLK, 2, -1 WORK( S1+II+(II-1)*LDS ) = CONST*( $ ABS( WORK( S1+II+(II-1)*LDS ) )+ $ ABS( WORK( S1+II+(II-2)*LDS ) ) ) WORK( S1+II+(II-2)*LDS ) = ZERO WORK( S1+II-1+(II-1)*LDS ) = ZERO 20 CONTINUE WORK( S1+1 ) = CONST*ABS( WORK( S1+1 ) ) ELSE * * Aggressive early deflation. * IF( AED ) THEN DBLK = ILAENV( 13, 'DLAQR0', 'SV', N, L, I, 4*LDS*LDS ) DBLK = MAX( 2*JBLK, DBLK ) + 1 DBLK = MIN( NH, LDS, DBLK ) CALL PDLAQR2( WANTT, WANTZ, N, L, I, DBLK, A, DESCA, $ ILOZ, IHIZ, Z, DESCZ, NS, ND, WR, WI, $ WORK( S1+1 ), LDS, WORK( S2+1 ), DBLK, $ WORK( IRBUF+1 ), WORK( ICBUF+1 ), $ WORK( S3+1 ), 4*LDS*LDS ) * * Skip a QR sweep if enough eigenvalues are deflated. * NIBBLE = ILAENV( 14, 'DLAQR0', 'SV', N, L, I, 4*LDS*LDS ) NIBBLE = MAX( 0, NIBBLE ) I = I - ND DBLK = DBLK - ND IF( 100*ND .GT. NIBBLE*NH .OR. DBLK .LT. 2*JBLK ) GOTO 10 * * Use unconverged eigenvalues as shifts for the QR sweep. * (This option is turned off because of the quality of * these shifts are not so good.) * * IF( ND.GE.0 .AND. ND+DBLK.GE.64 ) THEN IF( .FALSE. ) THEN CALL DLASET( 'L', DBLK-1, DBLK-1, ZERO, ZERO, $ WORK( S1+2 ), LDS ) WORK( IRBUF+1 ) = WORK( S1+1 ) WORK( ICBUF+1 ) = ZERO * * Shuffle shifts into pairs of real shifts and pairs of * complex conjugate shifts assuming complex conjugate * shifts are already adjacent to one another. * DO 21 II = DBLK, 3, -2 IF( WORK( ICBUF+II ).NE.-WORK( ICBUF+II-1 ) ) THEN SWAP = WORK( IRBUF+II ) WORK( IRBUF+II ) = WORK( IRBUF+II-1 ) WORK( IRBUF+II-1 ) = WORK( IRBUF+II-2 ) WORK( IRBUF+II-2 ) = SWAP SWAP = WORK( ICBUF+II ) WORK( ICBUF+II ) = WORK( ICBUF+II-1 ) WORK( ICBUF+II-1 ) = WORK( ICBUF+II-2 ) WORK( ICBUF+II-2 ) = SWAP END IF 21 CONTINUE * * Copy undeflatable eigenvalues to the diagonal of S1. * II = 2 22 CONTINUE IF( WORK( ICBUF+II ) .EQ. ZERO ) THEN WORK( S1+II+(II-1)*LDS ) = WORK( IRBUF+II ) WORK( S1+II+(II-2)*LDS ) = ZERO II = II + 1 ELSE WORK( S1+II+(II-1)*LDS ) = WORK( IRBUF+II ) WORK( S1+II+1+II*LDS ) = WORK( IRBUF+II ) WORK( S1+II+1+(II-1)*LDS ) = WORK( ICBUF+II ) WORK( S1+II+II*LDS ) = -WORK( ICBUF+II ) II = II + 2 END IF IF( II .LE. DBLK ) GOTO 22 ELSE CALL DLAHQR( .FALSE., .FALSE., DBLK, 1, DBLK, $ WORK( S1+1 ), LDS, WORK( IRBUF+1 ), $ WORK( ICBUF+1 ), 1, DBLK, Z, LDZ, IERR ) END IF ELSE DBLK = 2*JBLK CALL PDLACP3( DBLK, I-DBLK+1, A, DESCA, WORK( S1+1 ), $ LDS, -1, -1, 0 ) CALL DLAHQR( .FALSE., .FALSE., DBLK, 1, DBLK, $ WORK( S1+1 ), LDS, WORK( IRBUF+1 ), $ WORK( ICBUF+1 ), 1, DBLK, Z, LDZ, IERR ) END IF TOTSW = TOTSW + 1 * * Prepare to use Wilkinson's double shift * H44 = WORK( S1+DBLK+(DBLK-1)*LDS ) H33 = WORK( S1+DBLK-1+(DBLK-2)*LDS ) H43H34 = WORK( S1+DBLK-1+(DBLK-1)*LDS )* $ WORK( S1+DBLK+(DBLK-2)*LDS ) IF( ( JBLK.GT.1 ) .AND. ( ITS.GT.30 ) ) THEN S = WORK( S1+DBLK-1+(DBLK-3)*LDS ) DISC = ( H33-H44 )*HALF DISC = DISC*DISC + H43H34 IF( DISC.GT.ZERO ) THEN * * Real roots: Use Wilkinson's shift twice * DISC = SQRT( DISC ) AVE = HALF*( H33+H44 ) IF( ABS( H33 )-ABS( H44 ).GT.ZERO ) THEN H33 = H33*H44 - H43H34 H44 = H33 / ( SIGN( DISC, AVE )+AVE ) ELSE H44 = SIGN( DISC, AVE ) + AVE END IF H33 = H44 H43H34 = ZERO END IF END IF END IF * * Look for two consecutive small subdiagonal elements: * PDLACONSB is the routine that does this. * * CALL PDLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, * $ WORK( IRBUF+1 ), LWORK-IRBUF ) * * Skip small submatrices * * IF ( M .GE. I - 5 ) * $ GO TO 80 * * In principle PDLACONSB needs to check all shifts to decide * whether two consecutive small subdiagonal entries are suitable * as the starting position of the bulge chasing phase. It can be * dangerous to check the first pair of shifts only. Moreover it * is quite rare to obtain an M which is much larger than L. This * process is a bit expensive compared with the benefit. * Therefore it is sensible to abandon this routine. Total amount * of communications is saved in average. * M = L * * Double-shift QR step * * NBULGE is the number of bulges that will be attempted * ISTOP = MIN( M+ROTN-MOD( M, ROTN ), I-2 ) ISTOP = MIN( ISTOP, M+HBL-3-MOD( M-1, HBL ) ) ISTOP = MIN( ISTOP, I2-2 ) ISTOP = MAX( ISTOP, M ) NBULGE = ( I-1-ISTOP ) / HBL * * Do not exceed maximum determined. * NBULGE = MIN( NBULGE, JBLK ) IF( NBULGE.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * NBULGE = NBULGE - MOD( NBULGE, LCMRC ) END IF NBULGE = MAX( NBULGE, 1 ) * TOTNS = TOTNS + NBULGE*2 * IF( ( ITS.NE.20 ) .AND. ( ITS.NE.40 ) .AND. ( NBULGE.GT.1 ) ) $ THEN * * sort the eigenpairs so that they are in twos for double * shifts. only call if several need sorting * * CALL DLASORTE( S1( 2*( JBLK-NBULGE )+1, * $ 2*( JBLK-NBULGE )+1 ), 3*IBLK, 2*NBULGE, * $ WORK( IRBUF+1 ), IERR ) CALL DLASORTE( WORK(S1+DBLK-2*NBULGE+1+(DBLK-2*NBULGE)*LDS), $ LDS, 2*NBULGE, WORK( IRBUF+1 ), IERR ) END IF * * IBULGE is the number of bulges going so far * IBULGE = 1 * * "A" row defs : main row transforms from LOCALK to LOCALI2 * CALL INFOG1L( M, HBL, NPCOL, MYCOL, DESCA(CSRC_),ITMP1,LOCALK ) LOCALK = NUMROC( N, HBL, MYCOL, DESCA(CSRC_), NPCOL ) CALL INFOG1L( 1, HBL, NPCOL, MYCOL,DESCA(CSRC_),ICOL1,LOCALI2 ) LOCALI2 = NUMROC( I2, HBL, MYCOL, DESCA(CSRC_), NPCOL ) * * "A" col defs : main col transforms from LOCALI1 to LOCALM * CALL INFOG1L( I1, HBL, NPROW,MYROW,DESCA(RSRC_),LOCALI1,ICOL1 ) ICOL1 = NUMROC( N, HBL, MYROW, DESCA(RSRC_), NPROW ) CALL INFOG1L( 1, HBL, NPROW, MYROW, DESCA(RSRC_),LOCALM,ICOL1 ) ICOL1 = NUMROC( MIN( M+3, I ), HBL, MYROW, DESCA(RSRC_),NPROW ) * * Which row & column will start the bulges * ISTARTROW = MOD( ( M+1 ) / HBL + IAFIRST, NPROW ) ISTARTCOL = MOD( ( M+1 ) / HBL + JAFIRST, NPCOL ) * CALL INFOG1L( M, HBL, NPROW, MYROW, DESCA(RSRC_), II, ITMP2 ) ITMP2 = NUMROC( N, HBL, MYROW, DESCA(RSRC_), NPROW ) CALL INFOG1L( M, HBL, NPCOL, MYCOL, DESCA(CSRC_), JJ, ITMP2 ) ITMP2 = NUMROC( N, HBL, MYCOL, DESCA(CSRC_), NPCOL ) CALL INFOG1L(1,HBL,NPROW,MYROW,DESCA(RSRC_),ISTOP,KP2ROW( 1 ) ) KP2ROW( 1 ) = NUMROC( M+2, HBL, MYROW, DESCA(RSRC_), NPROW ) CALL INFOG1L(1,HBL,NPCOL,MYCOL,DESCA(CSRC_),ISTOP,KP2COL( 1 ) ) KP2COL( 1 ) = NUMROC( M+2, HBL, MYCOL, DESCA(CSRC_), NPCOL ) * * Set all values for bulges. All bulges are stored in * intermediate steps as loops over KI. Their current "task" * over the global M to I-1 values is always K1(KI) to K2(KI). * However, because there are many bulges, K1(KI) & K2(KI) might * go past that range while later bulges (KI+1,KI+2,etc..) are * finishing up. * * Rules: * If MOD(K1(KI)-1,HBL) < HBL-2 then MOD(K2(KI)-1,HBL)= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * If .TRUE., then the Hessenberg matrix H is fully updated * so that the quasi-triangular Schur factor may be * computed (in cooperation with the calling subroutine). * If .FALSE., then only enough of H is updated to preserve * the eigenvalues. * * WANTZ (global input) LOGICAL * If .TRUE., then the orthogonal matrix Z is updated so * so that the orthogonal Schur factor may be computed * (in cooperation with the calling subroutine). * If .FALSE., then Z is not referenced. * * N (global input) INTEGER * The order of the matrix H and (if WANTZ is .TRUE.) the * order of the orthogonal matrix Z. * * KTOP (global input) INTEGER * KBOT (global input) INTEGER * It is assumed without a check that either * KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together * determine an isolated block along the diagonal of the * Hessenberg matrix. However, H(KTOP,KTOP-1)=0 is not * essentially necessary if WANTT is .TRUE. . * * NW (global input) INTEGER * Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). * Normally NW .GE. 3 if PDLAQR2 is called by PDLAQR1. * * A (local input/output) DOUBLE PRECISION array, dimension * (DESCH(LLD_),*) * On input the initial N-by-N section of A stores the * Hessenberg matrix undergoing aggressive early deflation. * On output A has been transformed by an orthogonal * similarity transformation, perturbed, and the returned * to Hessenberg form that (it is to be hoped) has some * zero subdiagonal entries. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. * * Z (input/output) DOUBLE PRECISION array, dimension * (DESCH(LLD_),*) * IF WANTZ is .TRUE., then on output, the orthogonal * similarity transformation mentioned above has been * accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right. * If WANTZ is .FALSE., then Z is unreferenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * NS (global output) INTEGER * The number of unconverged (ie approximate) eigenvalues * returned in SR and SI that may be used as shifts by the * calling subroutine. * * ND (global output) INTEGER * The number of converged eigenvalues uncovered by this * subroutine. * * SR (global output) DOUBLE PRECISION array, dimension KBOT * SI (global output) DOUBLE PRECISION array, dimension KBOT * On output, the real and imaginary parts of approximate * eigenvalues that may be used for shifts are stored in * SR(KBOT-ND-NS+1) through SR(KBOT-ND) and * SI(KBOT-ND-NS+1) through SI(KBOT-ND), respectively. * On proc #0, the real and imaginary parts of converged * eigenvalues are stored in SR(KBOT-ND+1) through SR(KBOT) and * SI(KBOT-ND+1) through SI(KBOT), respectively. On other * processors, these entries are set to zero. * * T (local workspace) DOUBLE PRECISION array, dimension LDT*NW. * * LDT (local input) INTEGER * The leading dimension of the array T. * LDT >= NW. * * V (local workspace) DOUBLE PRECISION array, dimension LDV*NW. * * LDV (local input) INTEGER * The leading dimension of the array V. * LDV >= NW. * * WR (local workspace) DOUBLE PRECISION array, dimension KBOT. * WI (local workspace) DOUBLE PRECISION array, dimension KBOT. * * WORK (local workspace) DOUBLE PRECISION array, dimension LWORK. * * LWORK (local input) INTEGER * WORK(LWORK) is a local array and LWORK is assumed big enough * so that LWORK >= NW*NW. * * ================================================================ * Implemented by * Meiyue Shao, Department of Computing Science and HPC2N, * Umea University, Sweden * * ================================================================ * References: * B. Kagstrom, D. Kressner, and M. Shao, * On Aggressive Early Deflation in Parallel Variants of the QR * Algorithm. * Para 2010, to appear. * * ================================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER CONTXT, HBL, I, I1, I2, IAFIRST, ICOL, ICOL1, $ ICOL2, INFO, II, IROW, IROW1, IROW2, ITMP1, $ ITMP2, J, JAFIRST, JJ, K, L, LDA, LDZ, LLDTMP, $ MYCOL, MYROW, NODE, NPCOL, NPROW, DBLK, $ HSTEP, VSTEP, KKROW, KKCOL, KLN, LTOP, LEFT, $ RIGHT, UP, DOWN, D1, D2 * .. * .. Local Arrays .. INTEGER DESCT( 9 ), DESCV( 9 ), DESCWH( 9 ), $ DESCWV( 9 ) * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, DLASET, $ DLAQR3, DESCINIT, PDGEMM, PDGEMR2D, DGEMM, $ DLAMOV, DGESD2D, DGERV2D, DGEBS2D, DGEBR2D, $ IGEBS2D, IGEBR2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 * IF( N.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) * * I1 and I2 are the indices of the first row and last column of A * to which transformations must be applied. * I = KBOT L = KTOP IF( WANTT ) THEN I1 = 1 I2 = N LTOP = 1 ELSE I1 = L I2 = I LTOP = L END IF * * Begin Aggressive Early Deflation. * DBLK = NW CALL INFOG2L( I-DBLK+1, I-DBLK+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF ( MYROW .EQ. II ) THEN CALL DESCINIT( DESCT, DBLK, DBLK, DBLK, DBLK, II, JJ, CONTXT, $ LDT, INFO ) CALL DESCINIT( DESCV, DBLK, DBLK, DBLK, DBLK, II, JJ, CONTXT, $ LDV, INFO ) ELSE CALL DESCINIT( DESCT, DBLK, DBLK, DBLK, DBLK, II, JJ, CONTXT, $ 1, INFO ) CALL DESCINIT( DESCV, DBLK, DBLK, DBLK, DBLK, II, JJ, CONTXT, $ 1, INFO ) END IF CALL PDGEMR2D( DBLK, DBLK, A, I-DBLK+1, I-DBLK+1, DESCA, T, 1, 1, $ DESCT, CONTXT ) IF ( MYROW .EQ. II .AND. MYCOL .EQ. JJ ) THEN CALL DLASET( 'All', DBLK, DBLK, ZERO, ONE, V, LDV ) CALL DLAQR3( .TRUE., .TRUE., DBLK, 1, DBLK, DBLK-1, T, LDT, 1, $ DBLK, V, LDV, NS, ND, WR, WI, WORK, DBLK, DBLK, $ WORK( DBLK*DBLK+1 ), DBLK, DBLK, WORK( 2*DBLK*DBLK+1 ), $ DBLK, WORK( 3*DBLK*DBLK+1 ), LWORK-3*DBLK*DBLK ) CALL DGEBS2D( CONTXT, 'All', ' ', DBLK, DBLK, V, LDV ) CALL IGEBS2D( CONTXT, 'All', ' ', 1, 1, ND, 1 ) ELSE CALL DGEBR2D( CONTXT, 'All', ' ', DBLK, DBLK, V, LDV, II, JJ ) CALL IGEBR2D( CONTXT, 'All', ' ', 1, 1, ND, 1, II, JJ ) END IF * IF( ND .GT. 0 ) THEN * * Copy the local matrix back to the diagonal block. * CALL PDGEMR2D( DBLK, DBLK, T, 1, 1, DESCT, A, I-DBLK+1, $ I-DBLK+1, DESCA, CONTXT ) * * Update T and Z. * IF( MOD( I-DBLK, HBL )+DBLK .LE. HBL ) THEN * * Simplest case: the deflation window is located on one * processor. * Call DGEMM directly to perform the update. * HSTEP = LWORK / DBLK VSTEP = HSTEP * * Update horizontal slab in A. * IF( WANTT ) THEN CALL INFOG2L( I-DBLK+1, I+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 10 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL DGEMM( 'T', 'N', DBLK, KLN, DBLK, ONE, V, $ LDV, A( IROW+(KKCOL-1)*LDA ), LDA, ZERO, WORK, $ DBLK ) CALL DLAMOV( 'A', DBLK, KLN, WORK, DBLK, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 10 CONTINUE END IF END IF * * Update vertical slab in A. * CALL INFOG2L( LTOP, I-DBLK+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( I-DBLK, I-DBLK+1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 20 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ZERO, WORK, $ KLN ) CALL DLAMOV( 'A', KLN, DBLK, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 20 CONTINUE END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN CALL INFOG2L( ILOZ, I-DBLK+1, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, I-DBLK+1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 30 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ZERO, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, DBLK, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 30 CONTINUE END IF END IF * ELSE IF( MOD( I-DBLK, HBL )+DBLK .LE. 2*HBL ) THEN * * More complicated case: the deflation window lay on a 2x2 * processor mesh. * Call DGEMM locally and communicate by pair. * D1 = HBL - MOD( I-DBLK, HBL ) D2 = DBLK - D1 HSTEP = LWORK / DBLK VSTEP = HSTEP * * Update horizontal slab in A. * IF( WANTT ) THEN CALL INFOG2L( I-DBLK+1, I+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW .EQ. UP ) THEN IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 40 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL DGEMM( 'T', 'N', DBLK, KLN, DBLK, ONE, V, $ DBLK, A( IROW+(KKCOL-1)*LDA ), LDA, ZERO, $ WORK, DBLK ) CALL DLAMOV( 'A', DBLK, KLN, WORK, DBLK, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 40 CONTINUE END IF ELSE IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 50 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL DGEMM( 'T', 'N', D2, KLN, D1, ONE, $ V( 1, D1+1 ), LDV, A( IROW+(KKCOL-1)*LDA ), $ LDA, ZERO, WORK( D1+1 ), DBLK ) CALL DGESD2D( CONTXT, D2, KLN, WORK( D1+1 ), $ DBLK, DOWN, MYCOL ) CALL DGERV2D( CONTXT, D1, KLN, WORK, DBLK, DOWN, $ MYCOL ) CALL DGEMM( 'T', 'N', D1, KLN, D1, ONE, $ V, LDV, A( IROW+(KKCOL-1)*LDA ), LDA, ONE, $ WORK, DBLK ) CALL DLAMOV( 'A', D1, KLN, WORK, DBLK, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 50 CONTINUE ELSE IF( UP .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 60 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL DGEMM( 'T', 'N', D1, KLN, D2, ONE, $ V( D1+1, 1 ), LDV, A( IROW+(KKCOL-1)*LDA ), $ LDA, ZERO, WORK, DBLK ) CALL DGESD2D( CONTXT, D1, KLN, WORK, DBLK, UP, $ MYCOL ) CALL DGERV2D( CONTXT, D2, KLN, WORK( D1+1 ), $ DBLK, UP, MYCOL ) CALL DGEMM( 'T', 'N', D2, KLN, D2, ONE, $ V( D1+1, D1+1 ), LDV, $ A( IROW+(KKCOL-1)*LDA ), LDA, ONE, $ WORK( D1+1 ), DBLK ) CALL DLAMOV( 'A', D2, KLN, WORK( D1+1 ), DBLK, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 60 CONTINUE END IF END IF END IF * * Update vertical slab in A. * CALL INFOG2L( LTOP, I-DBLK+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. LEFT ) THEN IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( I-DBLK, I-DBLK+1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 70 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ZERO, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, DBLK, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 70 CONTINUE END IF ELSE IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( I-DBLK, I-DBLK+1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 80 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, D2, D1, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, $ V( 1, D1+1 ), LDV, ZERO, WORK( 1+D1*KLN ), $ KLN ) CALL DGESD2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, RIGHT ) CALL DGERV2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ RIGHT ) CALL DGEMM( 'N', 'N', KLN, D1, D1, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ONE, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, D1, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 80 CONTINUE ELSE IF ( LEFT .EQ. JJ ) THEN CALL INFOG2L( I-DBLK, I-DBLK+1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 90 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, D1, D2, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( D1+1, 1 ), $ LDV, ZERO, WORK, KLN ) CALL DGESD2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ LEFT ) CALL DGERV2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, LEFT ) CALL DGEMM( 'N', 'N', KLN, D2, D2, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( D1+1, D1+1 ), $ LDV, ONE, WORK( 1+D1*KLN ), KLN ) CALL DLAMOV( 'A', KLN, D2, WORK( 1+D1*KLN ), KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 90 CONTINUE END IF END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN CALL INFOG2L( ILOZ, I-DBLK+1, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. LEFT ) THEN IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, I-DBLK+1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 100 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ZERO, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, DBLK, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 100 CONTINUE END IF ELSE IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, I-DBLK+1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 110 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, D2, D1, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, $ V( 1, D1+1 ), LDV, ZERO, WORK( 1+D1*KLN ), $ KLN ) CALL DGESD2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, RIGHT ) CALL DGERV2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ RIGHT ) CALL DGEMM( 'N', 'N', KLN, D1, D1, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ONE, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, D1, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 110 CONTINUE ELSE IF( LEFT .EQ. JJ ) THEN CALL INFOG2L( IHIZ, I-DBLK+1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 120 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, D1, D2, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, $ V( D1+1, 1 ), LDV, ZERO, WORK, KLN ) CALL DGESD2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ LEFT ) CALL DGERV2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, LEFT ) CALL DGEMM( 'N', 'N', KLN, D2, D2, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, $ V( D1+1, D1+1 ), LDV, ONE, $ WORK( 1+D1*KLN ), KLN ) CALL DLAMOV( 'A', KLN, D2, WORK( 1+D1*KLN ), $ KLN, Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 120 CONTINUE END IF END IF END IF * ELSE * * Most complicated case: the deflation window lay across the * border of the processor mesh. * Treat V as a distributed matrix and call PDGEMM. * HSTEP = LWORK / DBLK * NPCOL VSTEP = LWORK / DBLK * NPROW LLDTMP = NUMROC( DBLK, DBLK, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCV, DBLK, DBLK, DBLK, DBLK, 0, 0, CONTXT, $ LLDTMP, INFO ) CALL DESCINIT( DESCWH, DBLK, HSTEP, DBLK, LWORK / DBLK, 0, $ 0, CONTXT, LLDTMP, INFO ) * * Update horizontal slab in A. * IF( WANTT ) THEN DO 130 KKCOL = I+1, N, HSTEP KLN = MIN( HSTEP, N-KKCOL+1 ) CALL PDGEMM( 'T', 'N', DBLK, KLN, DBLK, ONE, V, 1, 1, $ DESCV, A, I-DBLK+1, KKCOL, DESCA, ZERO, WORK, 1, $ 1, DESCWH ) CALL PDGEMR2D( DBLK, KLN, WORK, 1, 1, DESCWH, A, $ I-DBLK+1, KKCOL, DESCA, CONTXT ) 130 CONTINUE END IF * * Update vertical slab in A. * DO 140 KKROW = LTOP, I-DBLK, VSTEP KLN = MIN( VSTEP, I-DBLK-KKROW+1 ) LLDTMP = NUMROC( KLN, LWORK / DBLK, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCWV, KLN, DBLK, LWORK / DBLK, DBLK, 0, $ 0, CONTXT, LLDTMP, INFO ) CALL PDGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, A, KKROW, $ I-DBLK+1, DESCA, V, 1, 1, DESCV, ZERO, WORK, 1, 1, $ DESCWV ) CALL PDGEMR2D( KLN, DBLK, WORK, 1, 1, DESCWV, A, KKROW, $ I-DBLK+1, DESCA, CONTXT ) 140 CONTINUE * * Update vertical slab in Z. * IF( WANTZ ) THEN DO 150 KKROW = ILOZ, IHIZ, VSTEP KLN = MIN( VSTEP, IHIZ-KKROW+1 ) LLDTMP = NUMROC( KLN, LWORK / DBLK, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCWV, KLN, DBLK, LWORK / DBLK, DBLK, $ 0, 0, CONTXT, LLDTMP, INFO ) CALL PDGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, Z, KKROW, $ I-DBLK+1, DESCZ, V, 1, 1, DESCV, ZERO, WORK, 1, $ 1, DESCWV ) CALL PDGEMR2D( KLN, DBLK, WORK, 1, 1, DESCWV, Z, $ KKROW, I-DBLK+1, DESCZ, CONTXT ) 150 CONTINUE END IF END IF * * Extract converged eigenvalues. * II = 0 160 CONTINUE IF( II .EQ. ND-1 .OR. WI( DBLK-II ) .EQ. ZERO ) THEN IF( NODE .EQ. 0 ) THEN SR( I-II ) = WR( DBLK-II ) ELSE SR( I-II ) = ZERO END IF SI( I-II ) = ZERO II = II + 1 ELSE IF( NODE .EQ. 0 ) THEN SR( I-II-1 ) = WR( DBLK-II-1 ) SR( I-II ) = WR( DBLK-II ) SI( I-II-1 ) = WI( DBLK-II-1 ) SI( I-II ) = WI( DBLK-II ) ELSE SR( I-II-1 ) = ZERO SR( I-II ) = ZERO SI( I-II-1 ) = ZERO SI( I-II ) = ZERO END IF II = II + 2 END IF IF( II .LT. ND ) GOTO 160 END IF * * END OF PDLAQR2 * END scalapack-2.0.2/SRC/pdlaqr3.f000644 000766 000024 00000126610 11705457544 016077 0ustar00juliestaff000000 000000 RECURSIVE SUBROUTINE PDLAQR3( WANTT, WANTZ, N, KTOP, KBOT, NW, H, $ DESCH, ILOZ, IHIZ, Z, DESCZ, NS, ND, $ SR, SI, V, DESCV, NH, T, DESCT, NV, $ WV, DESCW, WORK, LWORK, IWORK, $ LIWORK, RECLEVEL ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IHIZ, ILOZ, KBOT, KTOP, LWORK, N, ND, NH, NS, $ NV, NW, LIWORK, RECLEVEL LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. INTEGER DESCH( * ), DESCZ( * ), DESCT( * ), DESCV( * ), $ DESCW( * ), IWORK( * ) DOUBLE PRECISION H( * ), SI( KBOT ), SR( KBOT ), T( * ), $ V( * ), WORK( * ), WV( * ), $ Z( * ) * .. * * Purpose * ======= * * Aggressive early deflation: * * This subroutine accepts as input an upper Hessenberg matrix H and * performs an orthogonal similarity transformation designed to detect * and deflate fully converged eigenvalues from a trailing principal * submatrix. On output H has been overwritten by a new Hessenberg * matrix that is a perturbation of an orthogonal similarity * transformation of H. It is to be hoped that the final version of H * has many zero subdiagonal entries. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * If .TRUE., then the Hessenberg matrix H is fully updated * so that the quasi-triangular Schur factor may be * computed (in cooperation with the calling subroutine). * If .FALSE., then only enough of H is updated to preserve * the eigenvalues. * * WANTZ (global input) LOGICAL * If .TRUE., then the orthogonal matrix Z is updated so * so that the orthogonal Schur factor may be computed * (in cooperation with the calling subroutine). * If .FALSE., then Z is not referenced. * * N (global input) INTEGER * The order of the matrix H and (if WANTZ is .TRUE.) the * order of the orthogonal matrix Z. * * KTOP (global input) INTEGER * It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0. * KBOT and KTOP together determine an isolated block * along the diagonal of the Hessenberg matrix. * * KBOT (global input) INTEGER * It is assumed without a check that either * KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together * determine an isolated block along the diagonal of the * Hessenberg matrix. * * NW (global input) INTEGER * Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). * * H (local input/output) DOUBLE PRECISION array, dimension * (DESCH(LLD_),*) * On input the initial N-by-N section of H stores the * Hessenberg matrix undergoing aggressive early deflation. * On output H has been transformed by an orthogonal * similarity transformation, perturbed, and the returned * to Hessenberg form that (it is to be hoped) has some * zero subdiagonal entries. * * DESCH (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix H. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. * * Z (input/output) DOUBLE PRECISION array, dimension * (DESCH(LLD_),*) * IF WANTZ is .TRUE., then on output, the orthogonal * similarity transformation mentioned above has been * accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right. * If WANTZ is .FALSE., then Z is unreferenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * NS (global output) INTEGER * The number of unconverged (ie approximate) eigenvalues * returned in SR and SI that may be used as shifts by the * calling subroutine. * * ND (global output) INTEGER * The number of converged eigenvalues uncovered by this * subroutine. * * SR (global output) DOUBLE PRECISION array, dimension KBOT * SI (global output) DOUBLE PRECISION array, dimension KBOT * On output, the real and imaginary parts of approximate * eigenvalues that may be used for shifts are stored in * SR(KBOT-ND-NS+1) through SR(KBOT-ND) and * SI(KBOT-ND-NS+1) through SI(KBOT-ND), respectively. * The real and imaginary parts of converged eigenvalues * are stored in SR(KBOT-ND+1) through SR(KBOT) and * SI(KBOT-ND+1) through SI(KBOT), respectively. * * V (global workspace) DOUBLE PRECISION array, dimension * (DESCV(LLD_),*) * An NW-by-NW distributed work array. * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * NH (input) INTEGER scalar * The number of columns of T. NH.GE.NW. * * T (global workspace) DOUBLE PRECISION array, dimension * (DESCV(LLD_),*) * * DESCT (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix T. * * NV (global input) INTEGER * The number of rows of work array WV available for * workspace. NV.GE.NW. * * WV (global workspace) DOUBLE PRECISION array, dimension * (DESCW(LLD_),*) * * DESCW (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix WV. * * WORK (local workspace) DOUBLE PRECISION array, dimension LWORK. * On exit, WORK(1) is set to an estimate of the optimal value * of LWORK for the given values of N, NW, KTOP and KBOT. * * LWORK (local input) INTEGER * The dimension of the work array WORK. LWORK = 2*NW * suffices, but greater efficiency may result from larger * values of LWORK. * * If LWORK = -1, then a workspace query is assumed; PDLAQR3 * only estimates the optimal workspace size for the given * values of N, NW, KTOP and KBOT. The estimate is returned * in WORK(1). No error message related to LWORK is issued * by XERBLA. Neither H nor Z are accessed. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the workspace array IWORK * * ================================================================ * Based on contributions by * Robert Granat and Meiyue Shao, * Department of Computing Science and HPC2N, * Umea University, Sweden * * ================================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ INTEGER RECMAX LOGICAL SORTGRAD PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, RECMAX = 3, $ SORTGRAD = .FALSE. ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. DOUBLE PRECISION AA, BB, BETA, CC, CS, DD, EVI, EVK, FOO, S, $ SAFMAX, SAFMIN, SMLNUM, SN, TAU, ULP, $ ELEM, ELEM1, ELEM2, ELEM3, R1, ANORM, RNORM, $ RESAED INTEGER I, IFST, ILST, INFO, INFQR, J, JW, K, KCOL, $ KEND, KLN, KROW, KWTOP, LTOP, LWK1, LWK2, LWK3, $ LWKOPT, NMIN, LLDH, LLDZ, LLDT, LLDV, LLDWV, $ ICTXT, NPROW, NMAX, NPCOL, MYROW, MYCOL, NB, $ IROFFH, M, RCOLS, TAUROWS, RROWS, TAUCOLS, $ ITAU, IR, IPW, NPROCS, MLOC, IROFFHH, $ ICOFFHH, HHRSRC, HHCSRC, HHROWS, HHCOLS, $ IROFFZZ, ICOFFZZ, ZZRSRC, ZZCSRC, ZZROWS, $ ZZCOLS, IERR, TZROWS0, TZCOLS0, IERR0, IPT0, $ IPZ0, IPW0, NB2, ROUND, LILST, KK, LILST0, $ IWRK1, RSRC, CSRC, LWK4, LWK5, IWRK2, LWK6, $ LWK7, LWK8, ILWKOPT, TZROWS, TZCOLS, NSEL, $ NPMIN, ICTXT_NEW, MYROW_NEW, MYCOL_NEW LOGICAL BULGE, SORTED, LQUERY * .. * .. Local Arrays .. INTEGER PAR( 6 ), DESCR( DLEN_ ), $ DESCTAU( DLEN_ ), DESCHH( DLEN_ ), $ DESCZZ( DLEN_ ), DESCTZ0( DLEN_ ), $ PMAP( 64*64 ) DOUBLE PRECISION DDUM( 1 ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, PDLANGE INTEGER PILAENVX, NUMROC, INDXG2P, ICEIL, BLACS_PNUM EXTERNAL DLAMCH, PILAENVX, NUMROC, INDXG2P, PDLANGE, $ MPI_WTIME, ICEIL, BLACS_PNUM * .. * .. External Subroutines .. EXTERNAL PDCOPY, PDGEHRD, PDGEMM, DLABAD, PDLACPY, $ PDLAQR1, DLANV2, PDLAQR0, PDLARF, PDLARFG, $ PDLASET, PDTRORD, PDELGET, PDELSET, $ PDLAMVE, BLACS_GRIDINFO, BLACS_GRIDMAP, $ BLACS_GRIDEXIT, PDGEMR2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, MAX, MIN, SQRT * .. * .. Executable Statements .. ICTXT = DESCH( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL * * Extract local leading dimensions, blockfactors, offset for * keeping the alignment requirements and size of deflation window. * LLDH = DESCH( LLD_ ) LLDZ = DESCZ( LLD_ ) LLDT = DESCT( LLD_ ) LLDV = DESCV( LLD_ ) LLDWV = DESCW( LLD_ ) NB = DESCH( MB_ ) IROFFH = MOD( KTOP - 1, NB ) JW = MIN( NW, KBOT-KTOP+1 ) NSEL = NB+JW * * Extract environment variables for parallel eigenvalue reordering. * PAR(1) = PILAENVX(ICTXT, 17, 'PDLAQR3', 'SV', JW, NB, -1, -1) PAR(2) = PILAENVX(ICTXT, 18, 'PDLAQR3', 'SV', JW, NB, -1, -1) PAR(3) = PILAENVX(ICTXT, 19, 'PDLAQR3', 'SV', JW, NB, -1, -1) PAR(4) = PILAENVX(ICTXT, 20, 'PDLAQR3', 'SV', JW, NB, -1, -1) PAR(5) = PILAENVX(ICTXT, 21, 'PDLAQR3', 'SV', JW, NB, -1, -1) PAR(6) = PILAENVX(ICTXT, 22, 'PDLAQR3', 'SV', JW, NB, -1, -1) * * Check if workspace query. * LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * Estimate optimal workspace. * IF( JW.LE.2 ) THEN LWKOPT = 1 ELSE * * Workspace query calls to PDGEHRD and PDORMHR. * TAUROWS = NUMROC( 1, 1, MYCOL, DESCV(RSRC_), NPROW ) TAUCOLS = NUMROC( JW+IROFFH, NB, MYCOL, DESCV(CSRC_), $ NPCOL ) CALL PDGEHRD( JW, 1, JW, T, 1, 1, DESCT, WORK, WORK, -1, $ INFO ) LWK1 = INT( WORK( 1 ) ) + TAUROWS*TAUCOLS * * Workspace query call to PDORMHR. * CALL PDORMHR( 'Right', 'No', JW, JW, 1, JW, T, 1, 1, DESCT, $ WORK, V, 1, 1, DESCV, WORK, -1, INFO ) LWK2 = INT( WORK( 1 ) ) * * Workspace query call to PDLAQR0. * NMIN = PILAENVX( ICTXT, 12, 'PDLAQR3', 'SV', JW, 1, JW, LWORK ) NMAX = ( N-1 ) / 3 IF( JW+IROFFH.GT.NMIN .AND. JW+IROFFH.LE.NMAX $ .AND. RECLEVEL.LT.RECMAX ) THEN CALL PDLAQR0( .TRUE., .TRUE., JW+IROFFH, 1+IROFFH, $ JW+IROFFH, T, DESCT, SR, SI, 1, JW, V, DESCV, $ WORK, -1, IWORK, LIWORK-NSEL, INFQR, $ RECLEVEL+1 ) LWK3 = INT( WORK( 1 ) ) IWRK1 = IWORK( 1 ) ELSE RSRC = DESCT( RSRC_ ) CSRC = DESCT( CSRC_ ) DESCT( RSRC_ ) = 0 DESCT( CSRC_ ) = 0 CALL PDLAQR1( .TRUE., .TRUE., JW+IROFFH, 1, JW+IROFFH, T, $ DESCT, SR, SI, 1, JW+IROFFH, V, DESCV, WORK, -1, $ IWORK, LIWORK-NSEL, INFQR ) DESCT( RSRC_ ) = RSRC DESCT( CSRC_ ) = CSRC LWK3 = INT( WORK( 1 ) ) IWRK1 = IWORK( 1 ) END IF * * Workspace in case of alignment problems. * TZROWS0 = NUMROC( JW+IROFFH, NB, MYROW, 0, NPROW ) TZCOLS0 = NUMROC( JW+IROFFH, NB, MYCOL, 0, NPCOL ) LWK4 = 2 * TZROWS0*TZCOLS0 * * Workspace check for reordering. * CALL PDTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, 1, $ DESCT, V, 1, 1, DESCV, DDUM, DDUM, MLOC, WORK, -1, $ IWORK, LIWORK-NSEL, INFO ) LWK5 = INT( WORK( 1 ) ) IWRK2 = IWORK( 1 ) * * Extra workspace for reflecting back spike * (workspace for PDLARF approximated for simplicity). * RROWS = NUMROC( N+IROFFH, NB, MYROW, DESCV(RSRC_), NPROW ) RCOLS = NUMROC( 1, 1, MYCOL, DESCV(CSRC_), NPCOL ) LWK6 = RROWS*RCOLS + TAUROWS*TAUCOLS + $ 2*ICEIL(ICEIL(JW+IROFFH,NB),NPROW)*NB $ *ICEIL(ICEIL(JW+IROFFH,NB),NPCOL)*NB * * Extra workspace needed by PBLAS update calls * (also estimated for simplicity). * LWK7 = MAX( ICEIL(ICEIL(JW,NB),NPROW)*NB * $ ICEIL(ICEIL(N-KBOT,NB),NPCOL)*NB, $ ICEIL(ICEIL(IHIZ-ILOZ+1,NB),NPROW)*NB * $ ICEIL(ICEIL(JW,NB),NPCOL)*NB, $ ICEIL(ICEIL(KBOT-JW,NB),NPROW)*NB * $ ICEIL(ICEIL(JW,NB),NPCOL)*NB ) * * Residual check workspace. * LWK8 = 0 * * Optimal workspace. * LWKOPT = MAX( LWK1, LWK2, LWK3+LWK4, LWK5, LWK6, LWK7, LWK8 ) ILWKOPT = MAX( IWRK1, IWRK2 ) END IF * * Quick return in case of workspace query. * WORK( 1 ) = DBLE( LWKOPT ) * * IWORK(1:NSEL) is used as the array SELECT for PDTRORD. * IWORK( 1 ) = ILWKOPT + NSEL IF( LQUERY ) $ RETURN * * Nothing to do for an empty active block ... NS = 0 ND = 0 IF( KTOP.GT.KBOT ) $ RETURN * ... nor for an empty deflation window. * IF( NW.LT.1 ) $ RETURN * * Machine constants. * SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N ) / ULP ) * * Setup deflation window. * JW = MIN( NW, KBOT-KTOP+1 ) KWTOP = KBOT - JW + 1 IF( KWTOP.EQ.KTOP ) THEN S = ZERO ELSE CALL PDELGET( 'All', '1-Tree', S, H, KWTOP, KWTOP-1, DESCH ) END IF * IF( KBOT.EQ.KWTOP ) THEN * * 1-by-1 deflation window: not much to do. * CALL PDELGET( 'All', '1-Tree', SR( KWTOP ), H, KWTOP, KWTOP, $ DESCH ) SI( KWTOP ) = ZERO NS = 1 ND = 0 IF( ABS( S ).LE.MAX( SMLNUM, ULP*ABS( SR( KWTOP ) ) ) ) $ THEN NS = 0 ND = 1 IF( KWTOP.GT.KTOP ) $ CALL PDELSET( H, KWTOP, KWTOP-1 , DESCH, ZERO ) END IF RETURN END IF * IF( KWTOP.EQ.KTOP .AND. KBOT-KWTOP.EQ.1 ) THEN * * 2-by-2 deflation window: a little more to do. * CALL PDELGET( 'All', '1-Tree', AA, H, KWTOP, KWTOP, DESCH ) CALL PDELGET( 'All', '1-Tree', BB, H, KWTOP, KWTOP+1, DESCH ) CALL PDELGET( 'All', '1-Tree', CC, H, KWTOP+1, KWTOP, DESCH ) CALL PDELGET( 'All', '1-Tree', DD, H, KWTOP+1, KWTOP+1, DESCH ) CALL DLANV2( AA, BB, CC, DD, SR(KWTOP), SI(KWTOP), $ SR(KWTOP+1), SI(KWTOP+1), CS, SN ) NS = 0 ND = 2 IF( CC.EQ.ZERO ) THEN I = KWTOP IF( I+2.LE.N .AND. WANTT ) $ CALL PDROT( N-I-1, H, I, I+2, DESCH, DESCH(M_), H, I+1, $ I+2, DESCH, DESCH(M_), CS, SN, WORK, LWORK, INFO ) IF( I.GT.1 ) $ CALL PDROT( I-1, H, 1, I, DESCH, 1, H, 1, I+1, DESCH, 1, $ CS, SN, WORK, LWORK, INFO ) IF( WANTZ ) $ CALL PDROT( IHIZ-ILOZ+1, Z, ILOZ, I, DESCZ, 1, Z, ILOZ, $ I+1, DESCZ, 1, CS, SN, WORK, LWORK, INFO ) CALL PDELSET( H, I, I, DESCH, AA ) CALL PDELSET( H, I, I+1, DESCH, BB ) CALL PDELSET( H, I+1, I, DESCH, CC ) CALL PDELSET( H, I+1, I+1, DESCH, DD ) END IF WORK( 1 ) = DBLE( LWKOPT ) RETURN END IF * * Calculate new value for IROFFH in case deflation window * was adjusted. * IROFFH = MOD( KWTOP - 1, NB ) * * Adjust number of rows and columns of T matrix descriptor * to prepare for call to PDBTRORD. * DESCT( M_ ) = JW+IROFFH DESCT( N_ ) = JW+IROFFH * * Convert to spike-triangular form. (In case of a rare QR failure, * this routine continues to do aggressive early deflation using that * part of the deflation window that converged using INFQR here and * there to keep track.) * * Copy the trailing submatrix to the working space. * CALL PDLASET( 'All', IROFFH, JW+IROFFH, ZERO, ONE, T, 1, 1, $ DESCT ) CALL PDLASET( 'All', JW, IROFFH, ZERO, ZERO, T, 1+IROFFH, 1, $ DESCT ) CALL PDLACPY( 'All', 1, JW, H, KWTOP, KWTOP, DESCH, T, 1+IROFFH, $ 1+IROFFH, DESCT ) CALL PDLACPY( 'Upper', JW-1, JW-1, H, KWTOP+1, KWTOP, DESCH, T, $ 1+IROFFH+1, 1+IROFFH, DESCT ) IF( JW.GT.2 ) $ CALL PDLASET( 'Lower', JW-2, JW-2, ZERO, ZERO, T, 1+IROFFH+2, $ 1+IROFFH, DESCT ) CALL PDLACPY( 'All', JW-1, 1, H, KWTOP+1, KWTOP+JW-1, DESCH, T, $ 1+IROFFH+1, 1+IROFFH+JW-1, DESCT ) * * Initialize the working orthogonal matrix. * CALL PDLASET( 'All', JW+IROFFH, JW+IROFFH, ZERO, ONE, V, 1, 1, $ DESCV ) * * Compute the Schur form of T. * NPMIN = PILAENVX( ICTXT, 23, 'PDLAQR3', 'SV', JW, NB, NPROW, $ NPCOL ) NMIN = PILAENVX( ICTXT, 12, 'PDLAQR3', 'SV', JW, 1, JW, LWORK ) NMAX = ( N-1 ) / 3 IF( MIN(NPROW, NPCOL).LE.NPMIN+1 .OR. RECLEVEL.GE.1 ) THEN * * The AED window is large enough. * Compute the Schur decomposition with all processors. * IF( JW+IROFFH.GT.NMIN .AND. JW+IROFFH.LE.NMAX $ .AND. RECLEVEL.LT.RECMAX ) THEN CALL PDLAQR0( .TRUE., .TRUE., JW+IROFFH, 1+IROFFH, $ JW+IROFFH, T, DESCT, SR( KWTOP-IROFFH ), $ SI( KWTOP-IROFFH ), 1+IROFFH, JW+IROFFH, V, DESCV, $ WORK, LWORK, IWORK(NSEL+1), LIWORK-NSEL, INFQR, $ RECLEVEL+1 ) ELSE IF( DESCT(RSRC_).EQ.0 .AND. DESCT(CSRC_).EQ.0 ) THEN IF( JW+IROFFH.GT.DESCT( MB_ ) ) THEN CALL PDLAQR1( .TRUE., .TRUE., JW+IROFFH, 1, $ JW+IROFFH, T, DESCT, SR( KWTOP-IROFFH ), $ SI( KWTOP-IROFFH ), 1, JW+IROFFH, V, $ DESCV, WORK, LWORK, IWORK(NSEL+1), LIWORK-NSEL, $ INFQR ) ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL DLAHQR( .TRUE., .TRUE., JW+IROFFH, 1+IROFFH, $ JW+IROFFH, T, DESCT(LLD_), $ SR( KWTOP-IROFFH ), SI( KWTOP-IROFFH ), $ 1+IROFFH, JW+IROFFH, V, DESCV(LLD_), INFQR ) ELSE INFQR = 0 END IF IF( NPROCS.GT.1 ) $ CALL IGAMN2D( ICTXT, 'All', '1-Tree', 1, 1, INFQR, $ 1, -1, -1, -1, -1, -1 ) END IF ELSEIF( JW+IROFFH.LE.DESCT( MB_ ) ) THEN IF( MYROW.EQ.DESCT(RSRC_) .AND. MYCOL.EQ.DESCT(CSRC_) ) $ THEN CALL DLAHQR( .TRUE., .TRUE., JW+IROFFH, 1+IROFFH, $ JW+IROFFH, T, DESCT(LLD_), $ SR( KWTOP-IROFFH ), SI( KWTOP-IROFFH ), $ 1+IROFFH, JW+IROFFH, V, DESCV(LLD_), INFQR ) ELSE INFQR = 0 END IF IF( NPROCS.GT.1 ) $ CALL IGAMN2D( ICTXT, 'All', '1-Tree', 1, 1, INFQR, $ 1, -1, -1, -1, -1, -1 ) ELSE TZROWS0 = NUMROC( JW+IROFFH, NB, MYROW, 0, NPROW ) TZCOLS0 = NUMROC( JW+IROFFH, NB, MYCOL, 0, NPCOL ) CALL DESCINIT( DESCTZ0, JW+IROFFH, JW+IROFFH, NB, NB, 0, $ 0, ICTXT, MAX(1,TZROWS0), IERR0 ) IPT0 = 1 IPZ0 = IPT0 + MAX(1,TZROWS0)*TZCOLS0 IPW0 = IPZ0 + MAX(1,TZROWS0)*TZCOLS0 CALL PDLAMVE( 'All', JW+IROFFH, JW+IROFFH, T, 1, 1, $ DESCT, WORK(IPT0), 1, 1, DESCTZ0, WORK(IPW0) ) CALL PDLASET( 'All', JW+IROFFH, JW+IROFFH, ZERO, ONE, $ WORK(IPZ0), 1, 1, DESCTZ0 ) CALL PDLAQR1( .TRUE., .TRUE., JW+IROFFH, 1, $ JW+IROFFH, WORK(IPT0), DESCTZ0, $ SR( KWTOP-IROFFH ), SI( KWTOP-IROFFH ), $ 1, JW+IROFFH, WORK(IPZ0), $ DESCTZ0, WORK(IPW0), LWORK-IPW0+1, IWORK(NSEL+1), $ LIWORK-NSEL, INFQR ) CALL PDLAMVE( 'All', JW+IROFFH, JW+IROFFH, WORK(IPT0), 1, $ 1, DESCTZ0, T, 1, 1, DESCT, WORK(IPW0) ) CALL PDLAMVE( 'All', JW+IROFFH, JW+IROFFH, WORK(IPZ0), 1, $ 1, DESCTZ0, V, 1, 1, DESCV, WORK(IPW0) ) END IF END IF ELSE * * The AED window is too small. * Redistribute the AED window to a subgrid * and do the computation on the subgrid. * ICTXT_NEW = ICTXT DO 20 I = 0, NPMIN-1 DO 10 J = 0, NPMIN-1 PMAP( J+1+I*NPMIN ) = BLACS_PNUM( ICTXT, I, J ) 10 CONTINUE 20 CONTINUE CALL BLACS_GRIDMAP( ICTXT_NEW, PMAP, NPMIN, NPMIN, NPMIN ) CALL BLACS_GRIDINFO( ICTXT_NEW, NPMIN, NPMIN, MYROW_NEW, $ MYCOL_NEW ) IF( MYROW.GE.NPMIN .OR. MYCOL.GE.NPMIN ) ICTXT_NEW = -1 IF( ICTXT_NEW.GE.0 ) THEN TZROWS0 = NUMROC( JW, NB, MYROW_NEW, 0, NPMIN ) TZCOLS0 = NUMROC( JW, NB, MYCOL_NEW, 0, NPMIN ) CALL DESCINIT( DESCTZ0, JW, JW, NB, NB, 0, $ 0, ICTXT_NEW, MAX(1,TZROWS0), IERR0 ) IPT0 = 1 IPZ0 = IPT0 + MAX(1,TZROWS0)*MAX(1,TZCOLS0) IPW0 = IPZ0 + MAX(1,TZROWS0)*MAX(1,TZCOLS0) ELSE IPT0 = 1 IPZ0 = 2 IPW0 = 3 DESCTZ0( CTXT_ ) = -1 INFQR = 0 END IF CALL PDGEMR2D( JW, JW, T, 1+IROFFH, 1+IROFFH, DESCT, $ WORK(IPT0), 1, 1, DESCTZ0, ICTXT ) IF( ICTXT_NEW.GE.0 ) THEN CALL PDLASET( 'All', JW, JW, ZERO, ONE, WORK(IPZ0), 1, 1, $ DESCTZ0 ) NMIN = PILAENVX( ICTXT_NEW, 12, 'PDLAQR3', 'SV', JW, 1, JW, $ LWORK ) IF( JW.GT.NMIN .AND. JW.LE.NMAX .AND. RECLEVEL.LT.1 ) THEN CALL PDLAQR0( .TRUE., .TRUE., JW, 1, JW, WORK(IPT0), $ DESCTZ0, SR( KWTOP ), SI( KWTOP ), 1, JW, $ WORK(IPZ0), DESCTZ0, WORK(IPW0), LWORK-IPW0+1, $ IWORK(NSEL+1), LIWORK-NSEL, INFQR, $ RECLEVEL+1 ) ELSE CALL PDLAQR1( .TRUE., .TRUE., JW, 1, JW, WORK(IPT0), $ DESCTZ0, SR( KWTOP ), SI( KWTOP ), 1, JW, $ WORK(IPZ0), DESCTZ0, WORK(IPW0), LWORK-IPW0+1, $ IWORK(NSEL+1), LIWORK-NSEL, INFQR ) END IF END IF CALL PDGEMR2D( JW, JW, WORK(IPT0), 1, 1, DESCTZ0, T, 1+IROFFH, $ 1+IROFFH, DESCT, ICTXT ) CALL PDGEMR2D( JW, JW, WORK(IPZ0), 1, 1, DESCTZ0, V, 1+IROFFH, $ 1+IROFFH, DESCV, ICTXT ) IF( ICTXT_NEW.GE.0 ) $ CALL BLACS_GRIDEXIT( ICTXT_NEW ) IF( MYROW+MYCOL.GT.0 ) THEN DO 40 J = 0, JW-1 SR( KWTOP+J ) = ZERO SI( KWTOP+J ) = ZERO 40 CONTINUE END IF CALL IGAMN2D( ICTXT, 'All', '1-Tree', 1, 1, INFQR, 1, -1, -1, $ -1, -1, -1 ) CALL DGSUM2D( ICTXT, 'All', ' ', JW, 1, SR(KWTOP), JW, -1, -1 ) CALL DGSUM2D( ICTXT, 'All', ' ', JW, 1, SI(KWTOP), JW, -1, -1 ) END IF * * Adjust INFQR for offset from block border in submatrices. * IF( INFQR.NE.0 ) $ INFQR = INFQR - IROFFH * * PDTRORD needs a clean margin near the diagonal. * DO 50 J = 1, JW - 3 CALL PDELSET( T, J+2, J, DESCT, ZERO ) CALL PDELSET( T, J+3, J, DESCT, ZERO ) 50 CONTINUE IF( JW.GT.2 ) $ CALL PDELSET( T, JW, JW-2, DESCT, ZERO ) * * Check local residual for AED Schur decomposition. * RESAED = 0.0D+00 * * Clean up the array SELECT for PDTRORD. * DO 60 J = 1, NSEL IWORK( J ) = 0 60 CONTINUE * * Set local M counter to zero. * MLOC = 0 * * Outer deflation detection loop (label 80). * In this loop a bunch of undeflatable eigenvalues * are moved simultaneously. * DO 70 J = 1, IROFFH + INFQR IWORK( J ) = 1 70 CONTINUE * NS = JW ILST = INFQR + 1 + IROFFH IF( ILST.GT.1 ) THEN CALL PDELGET( 'All', '1-Tree', ELEM, T, ILST, ILST-1, DESCT ) BULGE = ELEM.NE.ZERO IF( BULGE ) ILST = ILST+1 END IF * 80 CONTINUE IF( ILST.LE.NS+IROFFH ) THEN * * Find the top-left corner of the local window. * LILST = MAX(ILST,NS+IROFFH-NB+1) IF( LILST.GT.1 ) THEN CALL PDELGET( 'All', '1-Tree', ELEM, T, LILST, LILST-1, $ DESCT ) BULGE = ELEM.NE.ZERO IF( BULGE ) LILST = LILST+1 END IF * * Lock all eigenvalues outside the local window. * DO 90 J = IROFFH+1, LILST-1 IWORK( J ) = 1 90 CONTINUE LILST0 = LILST * * Inner deflation detection loop (label 100). * In this loop, the undeflatable eigenvalues are moved to the * top-left corner of the local window. * 100 CONTINUE IF( LILST.LE.NS+IROFFH ) THEN IF( NS.EQ.1 ) THEN BULGE = .FALSE. ELSE CALL PDELGET( 'All', '1-Tree', ELEM, T, NS+IROFFH, $ NS+IROFFH-1, DESCT ) BULGE = ELEM.NE.ZERO END IF * * Small spike tip test for deflation. * IF( .NOT.BULGE ) THEN * * Real eigenvalue. * CALL PDELGET( 'All', '1-Tree', ELEM, T, NS+IROFFH, $ NS+IROFFH, DESCT ) FOO = ABS( ELEM ) IF( FOO.EQ.ZERO ) $ FOO = ABS( S ) CALL PDELGET( 'All', '1-Tree', ELEM, V, 1+IROFFH, $ NS+IROFFH, DESCV ) IF( ABS( S*ELEM ).LE.MAX( SMLNUM, ULP*FOO ) ) THEN * * Deflatable. * NS = NS - 1 ELSE * * Undeflatable: move it up out of the way. * IFST = NS DO 110 J = LILST, JW+IROFFH IWORK( J ) = 0 110 CONTINUE IWORK( IFST+IROFFH ) = 1 CALL PDTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, $ 1, DESCT, V, 1, 1, DESCV, WORK, $ WORK(JW+IROFFH+1), MLOC, $ WORK(2*(JW+IROFFH)+1), LWORK-2*(JW+IROFFH), $ IWORK(NSEL+1), LIWORK-NSEL, INFO ) * * Adjust the array SELECT explicitly so that it does not * rely on the output of PDTRORD. * IWORK( IFST+IROFFH ) = 0 IWORK( LILST ) = 1 LILST = LILST + 1 * * In case of a rare exchange failure, adjust the * pointers ILST and LILST to the current place to avoid * unexpected behaviors. * IF( INFO.NE.0 ) THEN LILST = MAX(INFO, LILST) ILST = MAX(INFO, ILST) END IF END IF ELSE * * Complex conjugate pair. * CALL PDELGET( 'All', '1-Tree', ELEM1, T, NS+IROFFH, $ NS+IROFFH, DESCT ) CALL PDELGET( 'All', '1-Tree', ELEM2, T, NS+IROFFH, $ NS+IROFFH-1, DESCT ) CALL PDELGET( 'All', '1-Tree', ELEM3, T, NS+IROFFH-1, $ NS+IROFFH, DESCT ) FOO = ABS( ELEM1 ) + SQRT( ABS( ELEM2 ) )* $ SQRT( ABS( ELEM3 ) ) IF( FOO.EQ.ZERO ) $ FOO = ABS( S ) CALL PDELGET( 'All', '1-Tree', ELEM1, V, 1+IROFFH, $ NS+IROFFH, DESCV ) CALL PDELGET( 'All', '1-Tree', ELEM2, V, 1+IROFFH, $ NS+IROFFH-1, DESCV ) IF( MAX( ABS( S*ELEM1 ), ABS( S*ELEM2 ) ).LE. $ MAX( SMLNUM, ULP*FOO ) ) THEN * * Deflatable. * NS = NS - 2 ELSE * * Undeflatable: move them up out of the way. * IFST = NS DO 120 J = LILST, JW+IROFFH IWORK( J ) = 0 120 CONTINUE IWORK( IFST+IROFFH ) = 1 IWORK( IFST+IROFFH-1 ) = 1 CALL PDTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, $ 1, DESCT, V, 1, 1, DESCV, WORK, $ WORK(JW+IROFFH+1), MLOC, $ WORK(2*(JW+IROFFH)+1), LWORK-2*(JW+IROFFH), $ IWORK(NSEL+1), LIWORK-NSEL, INFO ) * * Adjust the array SELECT explicitly so that it does not * rely on the output of PDTRORD. * IWORK( IFST+IROFFH ) = 0 IWORK( IFST+IROFFH-1 ) = 0 IWORK( LILST ) = 1 IWORK( LILST+1 ) = 1 LILST = LILST + 2 * * In case of a rare exchange failure, adjust the * pointers ILST and LILST to the current place to avoid * unexpected behaviors. * IF( INFO.NE.0 ) THEN LILST = MAX(INFO, LILST) ILST = MAX(INFO, ILST) END IF END IF END IF * * End of inner deflation detection loop. * GO TO 100 END IF * * Unlock the eigenvalues outside the local window. * Then undeflatable eigenvalues are moved to the proper position. * DO 130 J = ILST, LILST0-1 IWORK( J ) = 0 130 CONTINUE CALL PDTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, 1, $ DESCT, V, 1, 1, DESCV, WORK, WORK(JW+IROFFH+1), $ M, WORK(2*(JW+IROFFH)+1), LWORK-2*(JW+IROFFH), $ IWORK(NSEL+1), LIWORK-NSEL, INFO ) ILST = M + 1 * * In case of a rare exchange failure, adjust the pointer ILST to * the current place to avoid unexpected behaviors. * IF( INFO.NE.0 ) $ ILST = MAX(INFO, ILST) * * End of outer deflation detection loop. * GO TO 80 END IF * * Post-reordering step: copy output eigenvalues to output. * CALL DCOPY( JW, WORK(1+IROFFH), 1, SR( KWTOP ), 1 ) CALL DCOPY( JW, WORK(JW+2*IROFFH+1), 1, SI( KWTOP ), 1 ) * * Check local residual for reordered AED Schur decomposition. * RESAED = 0.0D+00 * * Return to Hessenberg form. * IF( NS.EQ.0 ) $ S = ZERO * IF( NS.LT.JW .AND. SORTGRAD ) THEN * * Sorting diagonal blocks of T improves accuracy for * graded matrices. Bubble sort deals well with exchange * failures. Eigenvalues/shifts from T are also restored. * ROUND = 0 SORTED = .FALSE. I = NS + 1 + IROFFH 140 CONTINUE IF( SORTED ) $ GO TO 180 SORTED = .TRUE. ROUND = ROUND + 1 * KEND = I - 1 I = INFQR + 1 + IROFFH IF( I.EQ.NS+IROFFH ) THEN K = I + 1 ELSE IF( SI( KWTOP-IROFFH + I-1 ).EQ.ZERO ) THEN K = I + 1 ELSE K = I + 2 END IF 150 CONTINUE IF( K.LE.KEND ) THEN IF( K.EQ.I+1 ) THEN EVI = ABS( SR( KWTOP-IROFFH+I-1 ) ) ELSE EVI = ABS( SR( KWTOP-IROFFH+I-1 ) ) + $ ABS( SI( KWTOP-IROFFH+I-1 ) ) END IF * IF( K.EQ.KEND ) THEN EVK = ABS( SR( KWTOP-IROFFH+K-1 ) ) ELSEIF( SI( KWTOP-IROFFH+K-1 ).EQ.ZERO ) THEN EVK = ABS( SR( KWTOP-IROFFH+K-1 ) ) ELSE EVK = ABS( SR( KWTOP-IROFFH+K-1 ) ) + $ ABS( SI( KWTOP-IROFFH+K-1 ) ) END IF * IF( EVI.GE.EVK ) THEN I = K ELSE MLOC = 0 SORTED = .FALSE. IFST = I ILST = K DO 160 J = 1, I-1 IWORK( J ) = 1 MLOC = MLOC + 1 160 CONTINUE IF( K.EQ.I+2 ) THEN IWORK( I ) = 0 IWORK(I+1) = 0 ELSE IWORK( I ) = 0 END IF IF( K.NE.KEND .AND. SI( KWTOP-IROFFH+K-1 ).NE.ZERO ) THEN IWORK( K ) = 1 IWORK(K+1) = 1 MLOC = MLOC + 2 ELSE IWORK( K ) = 1 IF( K.LT.KEND ) IWORK(K+1) = 0 MLOC = MLOC + 1 END IF DO 170 J = K+2, JW+IROFFH IWORK( J ) = 0 170 CONTINUE CALL PDTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, 1, $ DESCT, V, 1, 1, DESCV, WORK, WORK(JW+IROFFH+1), M, $ WORK(2*(JW+IROFFH)+1), LWORK-2*(JW+IROFFH), $ IWORK(NSEL+1), LIWORK-NSEL, IERR ) CALL DCOPY( JW, WORK(1+IROFFH), 1, SR( KWTOP ), 1 ) CALL DCOPY( JW, WORK(JW+2*IROFFH+1), 1, SI( KWTOP ), 1 ) IF( IERR.EQ.0 ) THEN I = ILST ELSE I = K END IF END IF IF( I.EQ.KEND ) THEN K = I + 1 ELSE IF( SI( KWTOP-IROFFH+I-1 ).EQ.ZERO ) THEN K = I + 1 ELSE K = I + 2 END IF GO TO 150 END IF GO TO 140 180 CONTINUE END IF * * Restore number of rows and columns of T matrix descriptor. * DESCT( M_ ) = NW+IROFFH DESCT( N_ ) = NH+IROFFH * IF( NS.LT.JW .OR. S.EQ.ZERO ) THEN IF( NS.GT.1 .AND. S.NE.ZERO ) THEN * * Reflect spike back into lower triangle. * RROWS = NUMROC( NS+IROFFH, NB, MYROW, DESCV(RSRC_), NPROW ) RCOLS = NUMROC( 1, 1, MYCOL, DESCV(CSRC_), NPCOL ) CALL DESCINIT( DESCR, NS+IROFFH, 1, NB, 1, DESCV(RSRC_), $ DESCV(CSRC_), ICTXT, MAX(1, RROWS), INFO ) TAUROWS = NUMROC( 1, 1, MYCOL, DESCV(RSRC_), NPROW ) TAUCOLS = NUMROC( JW+IROFFH, NB, MYCOL, DESCV(CSRC_), $ NPCOL ) CALL DESCINIT( DESCTAU, 1, JW+IROFFH, 1, NB, DESCV(RSRC_), $ DESCV(CSRC_), ICTXT, MAX(1, TAUROWS), INFO ) * IR = 1 ITAU = IR + DESCR( LLD_ ) * RCOLS IPW = ITAU + DESCTAU( LLD_ ) * TAUCOLS * CALL PDLASET( 'All', NS+IROFFH, 1, ZERO, ZERO, WORK(ITAU), $ 1, 1, DESCTAU ) * CALL PDCOPY( NS, V, 1+IROFFH, 1+IROFFH, DESCV, DESCV(M_), $ WORK(IR), 1+IROFFH, 1, DESCR, 1 ) CALL PDLARFG( NS, BETA, 1+IROFFH, 1, WORK(IR), 2+IROFFH, 1, $ DESCR, 1, WORK(ITAU+IROFFH) ) CALL PDELSET( WORK(IR), 1+IROFFH, 1, DESCR, ONE ) * CALL PDLASET( 'Lower', JW-2, JW-2, ZERO, ZERO, T, 3+IROFFH, $ 1+IROFFH, DESCT ) * CALL PDLARF( 'Left', NS, JW, WORK(IR), 1+IROFFH, 1, DESCR, $ 1, WORK(ITAU+IROFFH), T, 1+IROFFH, 1+IROFFH, $ DESCT, WORK( IPW ) ) CALL PDLARF( 'Right', NS, NS, WORK(IR), 1+IROFFH, 1, DESCR, $ 1, WORK(ITAU+IROFFH), T, 1+IROFFH, 1+IROFFH, $ DESCT, WORK( IPW ) ) CALL PDLARF( 'Right', JW, NS, WORK(IR), 1+IROFFH, 1, DESCR, $ 1, WORK(ITAU+IROFFH), V, 1+IROFFH, 1+IROFFH, $ DESCV, WORK( IPW ) ) * ITAU = 1 IPW = ITAU + DESCTAU( LLD_ ) * TAUCOLS CALL PDGEHRD( JW+IROFFH, 1+IROFFH, NS+IROFFH, T, 1, 1, $ DESCT, WORK(ITAU), WORK( IPW ), LWORK-IPW+1, INFO ) END IF * * Copy updated reduced window into place. * IF( KWTOP.GT.1 ) THEN CALL PDELGET( 'All', '1-Tree', ELEM, V, 1+IROFFH, $ 1+IROFFH, DESCV ) CALL PDELSET( H, KWTOP, KWTOP-1, DESCH, S*ELEM ) END IF CALL PDLACPY( 'Upper', JW-1, JW-1, T, 1+IROFFH+1, 1+IROFFH, $ DESCT, H, KWTOP+1, KWTOP, DESCH ) CALL PDLACPY( 'All', 1, JW, T, 1+IROFFH, 1+IROFFH, DESCT, H, $ KWTOP, KWTOP, DESCH ) CALL PDLACPY( 'All', JW-1, 1, T, 1+IROFFH+1, 1+IROFFH+JW-1, $ DESCT, H, KWTOP+1, KWTOP+JW-1, DESCH ) * * Accumulate orthogonal matrix in order to update * H and Z, if requested. * IF( NS.GT.1 .AND. S.NE.ZERO ) THEN CALL PDORMHR( 'Right', 'No', JW+IROFFH, NS+IROFFH, 1+IROFFH, $ NS+IROFFH, T, 1, 1, DESCT, WORK(ITAU), V, 1, $ 1, DESCV, WORK( IPW ), LWORK-IPW+1, INFO ) END IF * * Update vertical slab in H. * IF( WANTT ) THEN LTOP = 1 ELSE LTOP = KTOP END IF KLN = MAX( 0, KWTOP-LTOP ) IROFFHH = MOD( LTOP-1, NB ) ICOFFHH = MOD( KWTOP-1, NB ) HHRSRC = INDXG2P( LTOP, NB, MYROW, DESCH(RSRC_), NPROW ) HHCSRC = INDXG2P( KWTOP, NB, MYCOL, DESCH(CSRC_), NPCOL ) HHROWS = NUMROC( KLN+IROFFHH, NB, MYROW, HHRSRC, NPROW ) HHCOLS = NUMROC( JW+ICOFFHH, NB, MYCOL, HHCSRC, NPCOL ) CALL DESCINIT( DESCHH, KLN+IROFFHH, JW+ICOFFHH, NB, NB, $ HHRSRC, HHCSRC, ICTXT, MAX(1, HHROWS), IERR ) CALL PDGEMM( 'No', 'No', KLN, JW, JW, ONE, H, LTOP, $ KWTOP, DESCH, V, 1+IROFFH, 1+IROFFH, DESCV, ZERO, $ WORK, 1+IROFFHH, 1+ICOFFHH, DESCHH ) CALL PDLACPY( 'All', KLN, JW, WORK, 1+IROFFHH, 1+ICOFFHH, $ DESCHH, H, LTOP, KWTOP, DESCH ) * * Update horizontal slab in H. * IF( WANTT ) THEN KLN = N-KBOT IROFFHH = MOD( KWTOP-1, NB ) ICOFFHH = MOD( KBOT, NB ) HHRSRC = INDXG2P( KWTOP, NB, MYROW, DESCH(RSRC_), NPROW ) HHCSRC = INDXG2P( KBOT+1, NB, MYCOL, DESCH(CSRC_), NPCOL ) HHROWS = NUMROC( JW+IROFFHH, NB, MYROW, HHRSRC, NPROW ) HHCOLS = NUMROC( KLN+ICOFFHH, NB, MYCOL, HHCSRC, NPCOL ) CALL DESCINIT( DESCHH, JW+IROFFHH, KLN+ICOFFHH, NB, NB, $ HHRSRC, HHCSRC, ICTXT, MAX(1, HHROWS), IERR ) CALL PDGEMM( 'Tr', 'No', JW, KLN, JW, ONE, V, $ 1+IROFFH, 1+IROFFH, DESCV, H, KWTOP, KBOT+1, $ DESCH, ZERO, WORK, 1+IROFFHH, 1+ICOFFHH, DESCHH ) CALL PDLACPY( 'All', JW, KLN, WORK, 1+IROFFHH, 1+ICOFFHH, $ DESCHH, H, KWTOP, KBOT+1, DESCH ) END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN KLN = IHIZ-ILOZ+1 IROFFZZ = MOD( ILOZ-1, NB ) ICOFFZZ = MOD( KWTOP-1, NB ) ZZRSRC = INDXG2P( ILOZ, NB, MYROW, DESCZ(RSRC_), NPROW ) ZZCSRC = INDXG2P( KWTOP, NB, MYCOL, DESCZ(CSRC_), NPCOL ) ZZROWS = NUMROC( KLN+IROFFZZ, NB, MYROW, ZZRSRC, NPROW ) ZZCOLS = NUMROC( JW+ICOFFZZ, NB, MYCOL, ZZCSRC, NPCOL ) CALL DESCINIT( DESCZZ, KLN+IROFFZZ, JW+ICOFFZZ, NB, NB, $ ZZRSRC, ZZCSRC, ICTXT, MAX(1, ZZROWS), IERR ) CALL PDGEMM( 'No', 'No', KLN, JW, JW, ONE, Z, ILOZ, $ KWTOP, DESCZ, V, 1+IROFFH, 1+IROFFH, DESCV, $ ZERO, WORK, 1+IROFFZZ, 1+ICOFFZZ, DESCZZ ) CALL PDLACPY( 'All', KLN, JW, WORK, 1+IROFFZZ, 1+ICOFFZZ, $ DESCZZ, Z, ILOZ, KWTOP, DESCZ ) END IF END IF * * Return the number of deflations (ND) and the number of shifts (NS). * (Subtracting INFQR from the spike length takes care of the case of * a rare QR failure while calculating eigenvalues of the deflation * window.) * ND = JW - NS NS = NS - INFQR * * Return optimal workspace. * WORK( 1 ) = DBLE( LWKOPT ) IWORK( 1 ) = ILWKOPT + NSEL * * End of PDLAQR3 * END scalapack-2.0.2/SRC/pdlaqr4.f000644 000766 000024 00000064661 11750130340 016065 0ustar00juliestaff000000 000000 SUBROUTINE PDLAQR4( WANTT, WANTZ, N, ILO, IHI, A, DESCA, WR, WI, $ ILOZ, IHIZ, Z, DESCZ, T, LDT, V, LDV, WORK, $ LWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, INFO, LDT, LDV, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) DOUBLE PRECISION A( * ), T( LDT, * ), V( LDV, * ), WI( * ), $ WORK( * ), WR( * ), Z( * ) * .. * * Purpose * ======= * * PDLAQR4 is an auxiliary routine used to find the Schur decomposition * and or eigenvalues of a matrix already in Hessenberg form from cols * ILO to IHI. This routine requires that the active block is small * enough, i.e. IHI-ILO+1 .LE. LDT, so that it can be solved by LAPACK. * Normally, it is called by PDLAQR1. All the inputs are assumed to be * valid without checking. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix A (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that A is already upper quasi-triangular in * rows and columns IHI+1:N, and that A(ILO,ILO-1) = 0 (unless * ILO = 1). PDLAQR4 works primarily with the Hessenberg * submatrix in rows and columns ILO to IHI, but applies * transformations to all of H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * A (global input/output) DOUBLE PRECISION array, dimension * (DESCA(LLD_),*) * On entry, the upper Hessenberg matrix A. * On exit, if WANTT is .TRUE., A is upper quasi-triangular in * rows and columns ILO:IHI, with any 2-by-2 or larger diagonal * blocks not yet in standard form. If WANTT is .FALSE., the * contents of A are unspecified on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WR (global replicated output) DOUBLE PRECISION array, * dimension (N) * WI (global replicated output) DOUBLE PRECISION array, * dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in A. A may be returned with * larger diagonal blocks until the next release. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (global input/output) DOUBLE PRECISION array. * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations accumulated by PDHSEQR, and on * exit Z has been updated; transformations are applied only to * the submatrix Z(ILOZ:IHIZ,ILO:IHI). * If WANTZ is .FALSE., Z is not referenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * T (local workspace) DOUBLE PRECISION array, dimension LDT*NW. * * LDT (local input) INTEGER * The leading dimension of the array T. * LDT >= IHI-ILO+1. * * V (local workspace) DOUBLE PRECISION array, dimension LDV*NW. * * LDV (local input) INTEGER * The leading dimension of the array V. * LDV >= IHI-ILO+1. * * WORK (local workspace) DOUBLE PRECISION array, dimension LWORK. * * LWORK (local input) INTEGER * The dimension of the work array WORK. * LWORK >= IHI-ILO+1. * WORK(LWORK) is a local array and LWORK is assumed big enough. * Typically LWORK >= 4*LDS*LDS if this routine is called by * PDLAQR1. (LDS = 385, see PDLAQR1) * * INFO (global output) INTEGER * < 0: parameter number -INFO incorrect or inconsistent; * = 0: successful exit; * > 0: PDLAQR4 failed to compute all the eigenvalues ILO to IHI * in a total of 30*(IHI-ILO+1) iterations; if INFO = i, * elements i+1:ihi of WR and WI contain those eigenvalues * which have been successfully computed. * * ================================================================ * Implemented by * Meiyue Shao, Department of Computing Science and HPC2N, * Umea University, Sweden * * ================================================================ * References: * B. Kagstrom, D. Kressner, and M. Shao, * On Aggressive Early Deflation in Parallel Variants of the QR * Algorithm. * Para 2010, to appear. * * ================================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER CONTXT, HBL, I, I1, I2, IAFIRST, ICOL, ICOL1, $ ICOL2, II, IROW, IROW1, IROW2, ITMP1, ITMP2, $ IERR, J, JAFIRST, JJ, K, L, LDA, LDZ, LLDTMP, $ MYCOL, MYROW, NODE, NPCOL, NPROW, NH, NMIN, NZ, $ HSTEP, VSTEP, KKROW, KKCOL, KLN, LTOP, LEFT, $ RIGHT, UP, DOWN, D1, D2 * .. * .. Local Arrays .. INTEGER DESCT( 9 ), DESCV( 9 ), DESCWH( 9 ), $ DESCWV( 9 ) * .. * .. External Functions .. INTEGER NUMROC, ILAENV EXTERNAL NUMROC, ILAENV * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, DLASET, $ DLAHQR, DLAQR4, DESCINIT, PDGEMM, PDGEMR2D, $ DGEMM, DLAMOV, DGESD2D, DGERV2D, $ DGEBS2D, DGEBR2D, IGEBS2D, IGEBR2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 IF( N.EQ.0 .OR. NH.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) * * I1 and I2 are the indices of the first row and last column of A * to which transformations must be applied. * I = IHI L = ILO IF( WANTT ) THEN I1 = 1 I2 = N LTOP = 1 ELSE I1 = L I2 = I LTOP = L END IF * * Copy the diagonal block to local and call LAPACK. * CALL INFOG2L( ILO, ILO, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF ( MYROW .EQ. II ) THEN CALL DESCINIT( DESCT, NH, NH, NH, NH, II, JJ, CONTXT, $ LDT, IERR ) CALL DESCINIT( DESCV, NH, NH, NH, NH, II, JJ, CONTXT, $ LDV, IERR ) ELSE CALL DESCINIT( DESCT, NH, NH, NH, NH, II, JJ, CONTXT, $ 1, IERR ) CALL DESCINIT( DESCV, NH, NH, NH, NH, II, JJ, CONTXT, $ 1, IERR ) END IF CALL PDGEMR2D( NH, NH, A, ILO, ILO, DESCA, T, 1, 1, DESCT, $ CONTXT ) IF ( MYROW .EQ. II .AND. MYCOL .EQ. JJ ) THEN CALL DLASET( 'All', NH, NH, ZERO, ONE, V, LDV ) NMIN = ILAENV( 12, 'DLAQR3', 'SV', NH, 1, NH, LWORK ) IF( NH .GT. NMIN ) THEN CALL DLAQR4( .TRUE., .TRUE., NH, 1, NH, T, LDT, WR( ILO ), $ WI( ILO ), 1, NH, V, LDV, WORK, LWORK, INFO ) * Clean up the scratch used by DLAQR4. CALL DLASET( 'L', NH-2, NH-2, ZERO, ZERO, T( 3, 1 ), LDT ) ELSE CALL DLAHQR( .TRUE., .TRUE., NH, 1, NH, T, LDT, WR( ILO ), $ WI( ILO ), 1, NH, V, LDV, INFO ) END IF CALL DGEBS2D( CONTXT, 'All', ' ', NH, NH, V, LDV ) CALL IGEBS2D( CONTXT, 'All', ' ', 1, 1, INFO, 1 ) ELSE CALL DGEBR2D( CONTXT, 'All', ' ', NH, NH, V, LDV, II, JJ ) CALL IGEBR2D( CONTXT, 'All', ' ', 1, 1, INFO, 1, II, JJ ) END IF IF( INFO .NE. 0 ) INFO = INFO+ILO-1 * * Copy the local matrix back to the diagonal block. * CALL PDGEMR2D( NH, NH, T, 1, 1, DESCT, A, ILO, ILO, DESCA, $ CONTXT ) * * Update T and Z. * IF( MOD( ILO-1, HBL )+NH .LE. HBL ) THEN * * Simplest case: the diagonal block is located on one processor. * Call DGEMM directly to perform the update. * HSTEP = LWORK / NH VSTEP = HSTEP * IF( WANTT ) THEN * * Update horizontal slab in A. * CALL INFOG2L( ILO, I+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 10 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL DGEMM( 'T', 'N', NH, KLN, NH, ONE, V, $ LDV, A( IROW+(KKCOL-1)*LDA ), LDA, ZERO, WORK, $ NH ) CALL DLAMOV( 'A', NH, KLN, WORK, NH, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 10 CONTINUE END IF * * Update vertical slab in A. * CALL INFOG2L( LTOP, ILO, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( ILO-1, ILO, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 20 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, NH, NH, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ZERO, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, NH, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 20 CONTINUE END IF END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN CALL INFOG2L( ILOZ, ILO, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, ILO, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 30 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, NH, NH, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ZERO, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, NH, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 30 CONTINUE END IF END IF * ELSE IF( MOD( ILO-1, HBL )+NH .LE. 2*HBL ) THEN * * More complicated case: the diagonal block lay on a 2x2 * processor mesh. * Call DGEMM locally and communicate by pair. * D1 = HBL - MOD( ILO-1, HBL ) D2 = NH - D1 HSTEP = LWORK / NH VSTEP = HSTEP * IF( WANTT ) THEN * * Update horizontal slab in A. * CALL INFOG2L( ILO, I+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW .EQ. UP ) THEN IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 40 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL DGEMM( 'T', 'N', NH, KLN, NH, ONE, V, $ NH, A( IROW+(KKCOL-1)*LDA ), LDA, ZERO, $ WORK, NH ) CALL DLAMOV( 'A', NH, KLN, WORK, NH, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 40 CONTINUE END IF ELSE IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 50 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL DGEMM( 'T', 'N', D2, KLN, D1, ONE, $ V( 1, D1+1 ), LDV, A( IROW+(KKCOL-1)*LDA ), $ LDA, ZERO, WORK( D1+1 ), NH ) CALL DGESD2D( CONTXT, D2, KLN, WORK( D1+1 ), $ NH, DOWN, MYCOL ) CALL DGERV2D( CONTXT, D1, KLN, WORK, NH, DOWN, $ MYCOL ) CALL DGEMM( 'T', 'N', D1, KLN, D1, ONE, $ V, LDV, A( IROW+(KKCOL-1)*LDA ), LDA, ONE, $ WORK, NH ) CALL DLAMOV( 'A', D1, KLN, WORK, NH, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 50 CONTINUE ELSE IF( UP .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 60 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL DGEMM( 'T', 'N', D1, KLN, D2, ONE, $ V( D1+1, 1 ), LDV, A( IROW+(KKCOL-1)*LDA ), $ LDA, ZERO, WORK, NH ) CALL DGESD2D( CONTXT, D1, KLN, WORK, NH, UP, $ MYCOL ) CALL DGERV2D( CONTXT, D2, KLN, WORK( D1+1 ), $ NH, UP, MYCOL ) CALL DGEMM( 'T', 'N', D2, KLN, D2, ONE, $ V( D1+1, D1+1 ), LDV, $ A( IROW+(KKCOL-1)*LDA ), LDA, ONE, $ WORK( D1+1 ), NH ) CALL DLAMOV( 'A', D2, KLN, WORK( D1+1 ), NH, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 60 CONTINUE END IF END IF * * Update vertical slab in A. * CALL INFOG2L( LTOP, ILO, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. LEFT ) THEN IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( ILO-1, ILO, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 70 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, NH, NH, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, $ ZERO, WORK, KLN ) CALL DLAMOV( 'A', KLN, NH, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 70 CONTINUE END IF ELSE IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( ILO-1, ILO, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 80 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, D2, D1, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( 1, D1+1 ), $ LDV, ZERO, WORK( 1+D1*KLN ), KLN ) CALL DGESD2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, RIGHT ) CALL DGERV2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ RIGHT ) CALL DGEMM( 'N', 'N', KLN, D1, D1, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ONE, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, D1, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 80 CONTINUE ELSE IF ( LEFT .EQ. JJ ) THEN CALL INFOG2L( ILO-1, ILO, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 90 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, D1, D2, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( D1+1, 1 ), $ LDV, ZERO, WORK, KLN ) CALL DGESD2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ LEFT ) CALL DGERV2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, LEFT ) CALL DGEMM( 'N', 'N', KLN, D2, D2, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( D1+1, D1+1 ), $ LDV, ONE, WORK( 1+D1*KLN ), KLN ) CALL DLAMOV( 'A', KLN, D2, WORK( 1+D1*KLN ), KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 90 CONTINUE END IF END IF END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN CALL INFOG2L( ILOZ, ILO, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. LEFT ) THEN IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, ILO, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 100 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, NH, NH, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ZERO, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, NH, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 100 CONTINUE END IF ELSE IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, ILO, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 110 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, D2, D1, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V( 1, D1+1 ), $ LDV, ZERO, WORK( 1+D1*KLN ), KLN ) CALL DGESD2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, RIGHT ) CALL DGERV2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ RIGHT ) CALL DGEMM( 'N', 'N', KLN, D1, D1, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ONE, $ WORK, KLN ) CALL DLAMOV( 'A', KLN, D1, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 110 CONTINUE ELSE IF( LEFT .EQ. JJ ) THEN CALL INFOG2L( IHIZ, ILO, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 120 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL DGEMM( 'N', 'N', KLN, D1, D2, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V( D1+1, 1 ), $ LDV, ZERO, WORK, KLN ) CALL DGESD2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ LEFT ) CALL DGERV2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, LEFT ) CALL DGEMM( 'N', 'N', KLN, D2, D2, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, $ V( D1+1, D1+1 ), LDV, ONE, WORK( 1+D1*KLN ), $ KLN ) CALL DLAMOV( 'A', KLN, D2, WORK( 1+D1*KLN ), $ KLN, Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 120 CONTINUE END IF END IF END IF * ELSE * * Most complicated case: the diagonal block lay across the border * of the processor mesh. * Treat V as a distributed matrix and call PDGEMM. * HSTEP = LWORK / NH * NPCOL VSTEP = LWORK / NH * NPROW LLDTMP = NUMROC( NH, NH, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCV, NH, NH, NH, NH, 0, 0, CONTXT, $ LLDTMP, IERR ) CALL DESCINIT( DESCWH, NH, HSTEP, NH, LWORK / NH, 0, 0, $ CONTXT, LLDTMP, IERR ) * IF( WANTT ) THEN * * Update horizontal slab in A. * DO 130 KKCOL = I+1, N, HSTEP KLN = MIN( HSTEP, N-KKCOL+1 ) CALL PDGEMM( 'T', 'N', NH, KLN, NH, ONE, V, 1, 1, $ DESCV, A, ILO, KKCOL, DESCA, ZERO, WORK, 1, 1, $ DESCWH ) CALL PDGEMR2D( NH, KLN, WORK, 1, 1, DESCWH, A, $ ILO, KKCOL, DESCA, CONTXT ) 130 CONTINUE * * Update vertical slab in A. * DO 140 KKROW = LTOP, ILO-1, VSTEP KLN = MIN( VSTEP, ILO-KKROW ) LLDTMP = NUMROC( KLN, LWORK / NH, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCWV, KLN, NH, LWORK / NH, NH, 0, 0, $ CONTXT, LLDTMP, IERR ) CALL PDGEMM( 'N', 'N', KLN, NH, NH, ONE, A, KKROW, $ ILO, DESCA, V, 1, 1, DESCV, ZERO, WORK, 1, 1, $ DESCWV ) CALL PDGEMR2D( KLN, NH, WORK, 1, 1, DESCWV, A, KKROW, $ ILO, DESCA, CONTXT ) 140 CONTINUE END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN DO 150 KKROW = ILOZ, IHIZ, VSTEP KLN = MIN( VSTEP, IHIZ-KKROW+1 ) LLDTMP = NUMROC( KLN, LWORK / NH, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCWV, KLN, NH, LWORK / NH, NH, 0, 0, $ CONTXT, LLDTMP, IERR ) CALL PDGEMM( 'N', 'N', KLN, NH, NH, ONE, Z, KKROW, $ ILO, DESCZ, V, 1, 1, DESCV, ZERO, WORK, 1, 1, $ DESCWV ) CALL PDGEMR2D( KLN, NH, WORK, 1, 1, DESCWV, Z, $ KKROW, ILO, DESCZ, CONTXT ) 150 CONTINUE END IF END IF * * END OF PDLAQR4 * END scalapack-2.0.2/SRC/pdlaqr5.f000644 000766 000024 00000312123 11750130340 016053 0ustar00juliestaff000000 000000 SUBROUTINE PDLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, DESCH, ILOZ, IHIZ, Z, DESCZ, WORK, $ LWORK, IWORK, LIWORK ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, N, NSHFTS, $ LWORK, LIWORK LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. INTEGER DESCH( * ), DESCZ( * ), IWORK( * ) DOUBLE PRECISION H( * ), SI( * ), SR( * ), Z( * ), WORK( * ) * .. * * Purpose * ======= * * This auxiliary subroutine called by PDLAQR0 performs a * single small-bulge multi-shift QR sweep by chasing separated * groups of bulges along the main block diagonal of H. * * WANTT (global input) logical scalar * WANTT = .TRUE. if the quasi-triangular Schur factor * is being computed. WANTT is set to .FALSE. otherwise. * * WANTZ (global input) logical scalar * WANTZ = .TRUE. if the orthogonal Schur factor is being * computed. WANTZ is set to .FALSE. otherwise. * * KACC22 (global input) integer with value 0, 1, or 2. * Specifies the computation mode of far-from-diagonal * orthogonal updates. * = 1: PDLAQR5 accumulates reflections and uses matrix-matrix * multiply to update the far-from-diagonal matrix entries. * = 2: PDLAQR5 accumulates reflections, uses matrix-matrix * multiply to update the far-from-diagonal matrix entries, * and takes advantage of 2-by-2 block structure during * matrix multiplies. * * N (global input) integer scalar * N is the order of the Hessenberg matrix H upon which this * subroutine operates. * * KTOP (global input) integer scalar * KBOT (global input) integer scalar * These are the first and last rows and columns of an * isolated diagonal block upon which the QR sweep is to be * applied. It is assumed without a check that * either KTOP = 1 or H(KTOP,KTOP-1) = 0 * and * either KBOT = N or H(KBOT+1,KBOT) = 0. * * NSHFTS (global input) integer scalar * NSHFTS gives the number of simultaneous shifts. NSHFTS * must be positive and even. * * SR (global input) DOUBLE PRECISION array of size (NSHFTS) * SI (global input) DOUBLE PRECISION array of size (NSHFTS) * SR contains the real parts and SI contains the imaginary * parts of the NSHFTS shifts of origin that define the * multi-shift QR sweep. * * H (local input/output) DOUBLE PRECISION array of size * (DESCH(LLD_),*) * On input H contains a Hessenberg matrix. On output a * multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied * to the isolated diagonal block in rows and columns KTOP * through KBOT. * * DESCH (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix H. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N * * Z (local input/output) DOUBLE PRECISION array of size * (DESCZ(LLD_),*) * If WANTZ = .TRUE., then the QR Sweep orthogonal * similarity transformation is accumulated into * Z(ILOZ:IHIZ,ILO:IHI) from the right. * If WANTZ = .FALSE., then Z is unreferenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace) DOUBLE PRECISION array, dimension(DWORK) * * LWORK (local input) INTEGER * The length of the workspace array WORK. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the workspace array IWORK. * * ================================================================ * Based on contributions by * Robert Granat, Department of Computing Science and HPC2N, * University of Umea, Sweden. * * ============================================================ * References: * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part I: Maintaining Well Focused * Shifts, and Level 3 Performance. * SIAM J. Matrix Anal. Appl., 23(4):929--947, 2002. * * R. Granat, B. Kagstrom, and D. Kressner, * A Novel Parallel QR Algorithm for Hybrid Distributed Momory HPC * Systems. * SIAM J. Sci. Comput., 32(4):2345--2378, 2010. * * ============================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0d0, ONE = 1.0d0 ) INTEGER NTINY PARAMETER ( NTINY = 11 ) * .. * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP, TAU, ELEM, STAMP, DDUM, ORTH INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, $ NS, NU, LLDH, LLDZ, LLDU, LLDV, LLDW, LLDWH, $ INFO, ICTXT, NPROW, NPCOL, NB, IROFFH, ITOP, $ NWIN, MYROW, MYCOL, LNS, NUMWIN, LKACC22, $ LCHAIN, WIN, IDONEJOB, IPNEXT, ANMWIN, LENRBUF, $ LENCBUF, ICHOFF, LRSRC, LCSRC, LKTOP, LKBOT, $ II, JJ, SWIN, EWIN, LNWIN, DIM, LLKTOP, LLKBOT, $ IPV, IPU, IPH, IPW, KU, KWH, KWV, NVE, LKS, $ IDUM, NHO, DIR, WINID, INDX, ILOC, JLOC, RSRC1, $ CSRC1, RSRC2, CSRC2, RSRC3, CSRC3, RSRC4, IPUU, $ CSRC4, LROWS, LCOLS, INDXS, KS, JLOC1, ILOC1, $ LKTOP1, LKTOP2, WCHUNK, NUMCHUNK, ODDEVEN, $ CHUNKNUM, DIM1, DIM4, IPW3, HROWS, ZROWS, $ HCOLS, IPW1, IPW2, RSRC, EAST, JLOC4, ILOC4, $ WEST, CSRC, SOUTH, NORHT, INDXE, NORTH, $ IHH, IPIW, LKBOT1, NPROCS, LIROFFH, $ WINFIN, RWS3, CLS3, INDX2, HROWS2, $ ZROWS2, HCOLS2, MNRBUF, $ MXRBUF, MNCBUF, MXCBUF, LWKOPT LOGICAL BLK22, BMP22, INTRO, DONEJOB, ODDNPROW, $ ODDNPCOL, LQUERY, BCDONE CHARACTER JBCMPZ*2, JOB * .. * .. External Functions .. LOGICAL LSAME INTEGER PILAENVX, ICEIL, INDXG2P, INDXG2L, NUMROC DOUBLE PRECISION DLAMCH, DLANGE EXTERNAL DLAMCH, PILAENVX, ICEIL, INDXG2P, INDXG2L, $ NUMROC, LSAME, DLANGE * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Local Arrays .. DOUBLE PRECISION VT( 3 ) * .. * .. External Subroutines .. EXTERNAL DGEMM, DLABAD, DLAMOV, DLAQR1, DLARFG, DLASET, $ DTRMM, DLAQR6 * .. * .. Executable Statements .. * INFO = 0 ICTXT = DESCH( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL LLDH = DESCH( LLD_ ) LLDZ = DESCZ( LLD_ ) NB = DESCH( MB_ ) IROFFH = MOD( KTOP - 1, NB ) LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * If there are no shifts, then there is nothing to do. * IF( .NOT. LQUERY .AND. NSHFTS.LT.2 ) $ RETURN * * If the active block is empty or 1-by-1, then there * is nothing to do. * IF( .NOT. LQUERY .AND. KTOP.GE.KBOT ) $ RETURN * * Shuffle shifts into pairs of real shifts and pairs of * complex conjugate shifts assuming complex conjugate * shifts are already adjacent to one another. * IF( .NOT. LQUERY ) THEN DO 10 I = 1, NSHFTS - 2, 2 IF( SI( I ).NE.-SI( I+1 ) ) THEN * SWAP = SR( I ) SR( I ) = SR( I+1 ) SR( I+1 ) = SR( I+2 ) SR( I+2 ) = SWAP * SWAP = SI( I ) SI( I ) = SI( I+1 ) SI( I+1 ) = SI( I+2 ) SI( I+2 ) = SWAP END IF 10 CONTINUE END IF * * NSHFTS is supposed to be even, but if is odd, * then simply reduce it by one. The shuffle above * ensures that the dropped shift is real and that * the remaining shifts are paired. * NS = NSHFTS - MOD( NSHFTS, 2 ) * * Extract the size of the computational window. * NWIN = PILAENVX( ICTXT, 19, 'PDLAQR5', JBCMPZ, N, NB, NB, NB ) NWIN = MIN( NWIN, KBOT-KTOP+1 ) * * Adjust number of simultaneous shifts if it exceeds the limit * set by the number of diagonal blocks in the active submatrix * H(KTOP:KBOT,KTOP:KBOT). * NS = MAX( 2, MIN( NS, ICEIL( KBOT-KTOP+1, NB )*NWIN/3 ) ) NS = NS - MOD( NS, 2 ) * * Decide the number of simultaneous computational windows * from the number of shifts - each window should contain up to * (NWIN / 3) shifts. Also compute the number of shifts per * window and make sure that number is even. * LNS = MIN( MAX( 2, NWIN / 3 ), MAX( 2, NS / MIN(NPROW,NPCOL) ) ) LNS = LNS - MOD( LNS, 2 ) NUMWIN = MAX( 1, MIN( ICEIL( NS, LNS ), $ ICEIL( KBOT-KTOP+1, NB ) - 1 ) ) IF( NPROW.NE.NPCOL ) THEN NUMWIN = MIN( NUMWIN, MIN(NPROW,NPCOL) ) LNS = MIN( LNS, MAX( 2, NS / MIN(NPROW,NPCOL) ) ) LNS = LNS - MOD( LNS, 2 ) END IF * * Machine constants for deflation. * SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N ) / ULP ) * * Use accumulated reflections to update far-from-diagonal * entries on a local level? * IF( LNS.LT.14 ) THEN LKACC22 = 1 ELSE LKACC22 = 2 END IF * * If so, exploit the 2-by-2 block structure? * ( Usually it is not efficient to exploit the 2-by-2 structure * because the block size is too small. ) * BLK22 = ( LNS.GT.2 ) .AND. ( KACC22.EQ.2 ) * * Clear trash. * IF( .NOT. LQUERY .AND. KTOP+2.LE.KBOT ) $ CALL PDELSET( H, KTOP+2, KTOP, DESCH, ZERO ) * * NBMPS = number of 2-shift bulges in each chain * NBMPS = LNS / 2 * * KDU = width of slab * KDU = 6*NBMPS - 3 * * LCHAIN = length of each chain * LCHAIN = 3 * NBMPS + 1 * * Check if workspace query. * IF( LQUERY ) THEN HROWS = NUMROC( N, NB, MYROW, DESCH(RSRC_), NPROW ) HCOLS = NUMROC( N, NB, MYCOL, DESCH(CSRC_), NPCOL ) LWKOPT = (5+2*NUMWIN)*NB**2 + 2*HROWS*NB + HCOLS*NB + $ MAX( HROWS*NB, HCOLS*NB ) WORK(1) = DBLE(LWKOPT) IWORK(1) = 5*NUMWIN RETURN END IF * * Check if KTOP and KBOT are valid. * IF( KTOP.LT.1 .OR. KBOT.GT.N ) STOP * * Create and chase NUMWIN chains of NBMPS bulges. * * Set up window introduction. * ANMWIN = 0 INTRO = .TRUE. IPIW = 1 * * Main loop: * While-loop over the computational windows which is * terminated when all windows have been introduced, * chased down to the bottom of the considered submatrix * and chased off. * 20 CONTINUE * * Set up next window as long as we have less than the prescribed * number of windows. Each window is described an integer quadruple: * 1. Local value of KTOP (below denoted by LKTOP) * 2. Local value of KBOT (below denoted by LKBOT) * 3-4. Processor indices (LRSRC,LCSRC) associated with the window. * (5. Mark that decides if a window is fully processed or not) * * Notice - the next window is only introduced if the first block * in the active submatrix does not contain any other windows. * IF( ANMWIN.GT.0 ) THEN LKTOP = IWORK( 1+(ANMWIN-1)*5 ) ELSE LKTOP = KTOP END IF IF( INTRO .AND. (ANMWIN.EQ.0 .OR. LKTOP.GT.ICEIL(KTOP,NB)*NB) ) $ THEN ANMWIN = ANMWIN + 1 * * Structure of IWORK: * IWORK( 1+(WIN-1)*5 ): start position * IWORK( 2+(WIN-1)*5 ): stop position * IWORK( 3+(WIN-1)*5 ): processor row id * IWORK( 4+(WIN-1)*5 ): processor col id * IWORK( 5+(WIN-1)*5 ): window status (0, 1, or 2) * IWORK( 1+(ANMWIN-1)*5 ) = KTOP IWORK( 2+(ANMWIN-1)*5 ) = KTOP + $ MIN( NWIN,NB-IROFFH,KBOT-KTOP+1 ) - 1 IWORK( 3+(ANMWIN-1)*5 ) = INDXG2P( IWORK(1+(ANMWIN-1)*5), NB, $ MYROW, DESCH(RSRC_), NPROW ) IWORK( 4+(ANMWIN-1)*5 ) = INDXG2P( IWORK(2+(ANMWIN-1)*5), NB, $ MYCOL, DESCH(CSRC_), NPCOL ) IWORK( 5+(ANMWIN-1)*5 ) = 0 IPIW = 6+(ANMWIN-1)*5 IF( ANMWIN.EQ.NUMWIN ) INTRO = .FALSE. END IF * * Do-loop over the number of windows. * IPNEXT = 1 DONEJOB = .FALSE. IDONEJOB = 0 LENRBUF = 0 LENCBUF = 0 ICHOFF = 0 DO 40 WIN = 1, ANMWIN * * Extract window information to simplify the rest. * LRSRC = IWORK( 3+(WIN-1)*5 ) LCSRC = IWORK( 4+(WIN-1)*5 ) LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = LKBOT - LKTOP + 1 * * Check if anything to do for current window, i.e., if the local * chain of bulges has reached the next block border etc. * IF( IWORK(5+(WIN-1)*5).LT.2 .AND. LNWIN.GT.1 .AND. $ (LNWIN.GT.LCHAIN .OR. LKBOT.EQ.KBOT ) ) THEN LIROFFH = MOD(LKTOP-1,NB) SWIN = LKTOP-LIROFFH EWIN = MIN(KBOT,LKTOP-LIROFFH+NB-1) DIM = EWIN-SWIN+1 IF( DIM.LE.NTINY .AND. .NOT.LKBOT.EQ.KBOT ) THEN IWORK( 5+(WIN-1)*5 ) = 2 GO TO 45 END IF IDONEJOB = 1 IF( IWORK(5+(WIN-1)*5).EQ.0 ) THEN IWORK(5+(WIN-1)*5) = 1 END IF * * Let the process that owns the corresponding window do the * local bulge chase. * IF( MYROW.EQ.LRSRC .AND. MYCOL.EQ.LCSRC ) THEN * * Set the kind of job to do in DLAQR6: * 1. JOB = 'I': Introduce and chase bulges in window WIN * 2. JOB = 'C': Chase bulges from top to bottom of window WIN * 3. JOB = 'O': Chase bulges off window WIN * 4. JOB = 'A': All of 1-3 above is done - this will for * example happen for very small active * submatrices (like 2-by-2) * LLKBOT = LLKTOP + LNWIN - 1 IF( LKTOP.EQ.KTOP .AND. LKBOT.EQ.KBOT ) THEN JOB = 'All steps' ICHOFF = 1 ELSEIF( LKTOP.EQ.KTOP ) THEN JOB = 'Introduce and chase' ELSEIF( LKBOT.EQ.KBOT ) THEN JOB = 'Off-chase bulges' ICHOFF = 1 ELSE JOB = 'Chase bulges' END IF * * Copy submatrix of H corresponding to window WIN into * workspace and set out additional workspace for storing * orthogonal transformations. This submatrix must be at * least (NTINY+1)-by-(NTINY+1) to fit into DLAQR6 - if not, * abort and go for cross border bulge chasing with this * particular window. * II = INDXG2L( SWIN, NB, MYROW, DESCH(RSRC_), NPROW ) JJ = INDXG2L( SWIN, NB, MYCOL, DESCH(CSRC_), NPCOL ) LLKTOP = 1 + LIROFFH LLKBOT = LLKTOP + LNWIN - 1 * IPU = IPNEXT IPH = IPU + LNWIN**2 IPUU = IPH + MAX(NTINY+1,DIM)**2 IPV = IPUU + MAX(NTINY+1,DIM)**2 IPNEXT = IPH * IF( LSAME( JOB, 'A' ) .OR. LSAME( JOB, 'O' ) .AND. $ DIM.LT.NTINY+1 ) THEN CALL DLASET( 'All', NTINY+1, NTINY+1, ZERO, ONE, $ WORK(IPH), NTINY+1 ) END IF CALL DLAMOV( 'Upper', DIM, DIM, H(II+(JJ-1)*LLDH), LLDH, $ WORK(IPH), MAX(NTINY+1,DIM) ) CALL DCOPY( DIM-1, H(II+(JJ-1)*LLDH+1), LLDH+1, $ WORK(IPH+1), MAX(NTINY+1,DIM)+1 ) IF( LSAME( JOB, 'C' ) .OR. LSAME( JOB, 'O') ) THEN CALL DCOPY( DIM-2, H(II+(JJ-1)*LLDH+2), LLDH+1, $ WORK(IPH+2), MAX(NTINY+1,DIM)+1 ) CALL DCOPY( DIM-3, H(II+(JJ-1)*LLDH+3), LLDH+1, $ WORK(IPH+3), MAX(NTINY+1,DIM)+1 ) CALL DLASET( 'Lower', DIM-4, DIM-4, ZERO, $ ZERO, WORK(IPH+4), MAX(NTINY+1,DIM) ) ELSE CALL DLASET( 'Lower', DIM-2, DIM-2, ZERO, $ ZERO, WORK(IPH+2), MAX(NTINY+1,DIM) ) END IF * KU = MAX(NTINY+1,DIM) - KDU + 1 KWH = KDU + 1 NHO = ( MAX(NTINY+1,DIM)-KDU+1-4 ) - ( KDU+1 ) + 1 KWV = KDU + 4 NVE = MAX(NTINY+1,DIM) - KDU - KWV + 1 CALL DLASET( 'All', MAX(NTINY+1,DIM), $ MAX(NTINY+1,DIM), ZERO, ONE, WORK(IPUU), $ MAX(NTINY+1,DIM) ) * * Small-bulge multi-shift QR sweep. * LKS = MAX( 1, NS - WIN*LNS + 1 ) CALL DLAQR6( JOB, WANTT, .TRUE., LKACC22, $ MAX(NTINY+1,DIM), LLKTOP, LLKBOT, LNS, SR( LKS ), $ SI( LKS ), WORK(IPH), MAX(NTINY+1,DIM), LLKTOP, $ LLKBOT, WORK(IPUU), MAX(NTINY+1,DIM), WORK(IPU), $ 3, WORK( IPH+KU-1 ), $ MAX(NTINY+1,DIM), NVE, WORK( IPH+KWV-1 ), $ MAX(NTINY+1,DIM), NHO, WORK( IPH-1+KU+(KWH-1)* $ MAX(NTINY+1,DIM) ), MAX(NTINY+1,DIM) ) * * Copy submatrix of H back. * CALL DLAMOV( 'Upper', DIM, DIM, WORK(IPH), $ MAX(NTINY+1,DIM), H(II+(JJ-1)*LLDH), LLDH ) CALL DCOPY( DIM-1, WORK(IPH+1), MAX(NTINY+1,DIM)+1, $ H(II+(JJ-1)*LLDH+1), LLDH+1 ) IF( LSAME( JOB, 'I' ) .OR. LSAME( JOB, 'C' ) ) THEN CALL DCOPY( DIM-2, WORK(IPH+2), DIM+1, $ H(II+(JJ-1)*LLDH+2), LLDH+1 ) CALL DCOPY( DIM-3, WORK(IPH+3), DIM+1, $ H(II+(JJ-1)*LLDH+3), LLDH+1 ) ELSE CALL DLASET( 'Lower', DIM-2, DIM-2, ZERO, $ ZERO, H(II+(JJ-1)*LLDH+2), LLDH ) END IF * * Copy actual submatrix of U to the correct place * of the buffer. * CALL DLAMOV( 'All', LNWIN, LNWIN, $ WORK(IPUU+(MAX(NTINY+1,DIM)*LIROFFH)+LIROFFH), $ MAX(NTINY+1,DIM), WORK(IPU), LNWIN ) END IF * * In case the local submatrix was smaller than * (NTINY+1)-by-(NTINY+1) we go here and proceed. * 45 CONTINUE ELSE IWORK( 5+(WIN-1)*5 ) = 2 END IF * * Increment counter for buffers of orthogonal transformations. * IF( MYROW.EQ.LRSRC .OR. MYCOL.EQ.LCSRC ) THEN IF( IDONEJOB.EQ.1 .AND. IWORK(5+(WIN-1)*5).LT.2 ) THEN IF( MYROW.EQ.LRSRC ) LENRBUF = LENRBUF + LNWIN*LNWIN IF( MYCOL.EQ.LCSRC ) LENCBUF = LENCBUF + LNWIN*LNWIN END IF END IF 40 CONTINUE * * Did some work in the above do-loop? * CALL IGSUM2D( ICTXT, 'All', '1-Tree', 1, 1, IDONEJOB, 1, -1, -1 ) DONEJOB = IDONEJOB.GT.0 * * Chased off bulges from first window? * IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', '1-Tree', 1, 1, ICHOFF, 1, -1, $ -1, -1, -1, -1 ) * * If work was done in the do-loop over local windows, perform * updates, otherwise go for cross border bulge chasing and updates. * IF( DONEJOB ) THEN * * Broadcast orthogonal transformations. * 49 CONTINUE IF( LENRBUF.GT.0 .OR. LENCBUF.GT.0 ) THEN DO 50 DIR = 1, 2 BCDONE = .FALSE. DO 60 WIN = 1, ANMWIN IF( ( LENRBUF.EQ.0 .AND. LENCBUF.EQ.0 ) .OR. $ BCDONE ) GO TO 62 LRSRC = IWORK( 3+(WIN-1)*5 ) LCSRC = IWORK( 4+(WIN-1)*5 ) IF( MYROW.EQ.LRSRC .AND. MYCOL.EQ.LCSRC ) THEN IF( DIR.EQ.1 .AND. LENRBUF.GT.0 .AND. $ NPCOL.GT.1 ) THEN CALL DGEBS2D( ICTXT, 'Row', '1-Tree', LENRBUF, $ 1, WORK, LENRBUF ) ELSEIF( DIR.EQ.2 .AND. LENCBUF.GT.0 .AND. $ NPROW.GT.1 ) THEN CALL DGEBS2D( ICTXT, 'Col', '1-Tree', LENCBUF, $ 1, WORK, LENCBUF ) END IF IF( LENRBUF.GT.0 ) $ CALL DLAMOV( 'All', LENRBUF, 1, WORK, LENRBUF, $ WORK(1+LENRBUF), LENCBUF ) BCDONE = .TRUE. ELSEIF( MYROW.EQ.LRSRC .AND. DIR.EQ.1 ) THEN IF( LENRBUF.GT.0 .AND. NPCOL.GT.1 ) THEN CALL DGEBR2D( ICTXT, 'Row', '1-Tree', LENRBUF, $ 1, WORK, LENRBUF, LRSRC, LCSRC ) BCDONE = .TRUE. END IF ELSEIF( MYCOL.EQ.LCSRC .AND. DIR.EQ.2 ) THEN IF( LENCBUF.GT.0 .AND. NPROW.GT.1 ) THEN CALL DGEBR2D( ICTXT, 'Col', '1-Tree', LENCBUF, $ 1, WORK(1+LENRBUF), LENCBUF, LRSRC, LCSRC ) BCDONE = .TRUE. END IF END IF 62 CONTINUE 60 CONTINUE 50 CONTINUE END IF * * Compute updates - make sure to skip windows that was skipped * regarding local bulge chasing. * DO 65 DIR = 1, 2 WINID = 0 IF( DIR.EQ.1 ) THEN IPNEXT = 1 ELSE IPNEXT = 1 + LENRBUF END IF DO 70 WIN = 1, ANMWIN IF( IWORK( 5+(WIN-1)*5 ).EQ.2 ) GO TO 75 LRSRC = IWORK( 3+(WIN-1)*5 ) LCSRC = IWORK( 4+(WIN-1)*5 ) LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = LKBOT - LKTOP + 1 IF( (MYROW.EQ.LRSRC.AND.LENRBUF.GT.0.AND.DIR.EQ.1) .OR. $ (MYCOL.EQ.LCSRC.AND.LENCBUF.GT.0.AND.DIR.EQ.2 ) ) $ THEN * * Set up workspaces. * IPU = IPNEXT IPNEXT = IPU + LNWIN*LNWIN IPW = 1 + LENRBUF + LENCBUF LIROFFH = MOD(LKTOP-1,NB) WINID = WINID + 1 * * Recompute JOB to see if block structure of U could * possibly be exploited or not. * IF( LKTOP.EQ.KTOP .AND. LKBOT.EQ.KBOT ) THEN JOB = 'All steps' ELSEIF( LKTOP.EQ.KTOP ) THEN JOB = 'Introduce and chase' ELSEIF( LKBOT.EQ.KBOT ) THEN JOB = 'Off-chase bulges' ELSE JOB = 'Chase bulges' END IF END IF * * Use U to update far-from-diagonal entries in H. * If required, use U to update Z as well. * IF( .NOT. BLK22 .OR. .NOT. LSAME(JOB,'C') $ .OR. LNS.LE.2 ) THEN * IF( DIR.EQ.2 .AND. LENCBUF.GT.0 .AND. $ MYCOL.EQ.LCSRC ) THEN IF( WANTT ) THEN DO 80 INDX = 1, LKTOP-LIROFFH-1, NB CALL INFOG2L( INDX, LKTOP, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, RSRC1, $ CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN LROWS = MIN( NB, LKTOP-INDX ) CALL DGEMM('No transpose', 'No transpose', $ LROWS, LNWIN, LNWIN, ONE, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK( IPU ), LNWIN, ZERO, $ WORK(IPW), $ LROWS ) CALL DLAMOV( 'All', LROWS, LNWIN, $ WORK(IPW), LROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF 80 CONTINUE END IF IF( WANTZ ) THEN DO 90 INDX = 1, N, NB CALL INFOG2L( INDX, LKTOP, DESCZ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, RSRC1, $ CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN LROWS = MIN(NB,N-INDX+1) CALL DGEMM( 'No transpose', $ 'No transpose', LROWS, LNWIN, LNWIN, $ ONE, Z((JLOC-1)*LLDZ+ILOC), LLDZ, $ WORK( IPU ), LNWIN, ZERO, $ WORK(IPW), LROWS ) CALL DLAMOV( 'All', LROWS, LNWIN, $ WORK(IPW), LROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF 90 CONTINUE END IF END IF * * Update the rows of H affected by the bulge-chase. * IF( DIR.EQ.1 .AND. LENRBUF.GT.0 .AND. $ MYROW.EQ.LRSRC ) THEN IF( WANTT ) THEN IF( ICEIL(LKBOT,NB).EQ.ICEIL(KBOT,NB) ) THEN LCOLS = MIN(ICEIL(KBOT,NB)*NB,N) - KBOT ELSE LCOLS = 0 END IF IF( LCOLS.GT.0 ) THEN INDX = KBOT + 1 CALL INFOG2L( LKTOP, INDX, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN CALL DGEMM( 'Transpose', 'No Transpose', $ LNWIN, LCOLS, LNWIN, ONE, WORK(IPU), $ LNWIN, H((JLOC-1)*LLDH+ILOC), LLDH, $ ZERO, WORK(IPW), LNWIN ) CALL DLAMOV( 'All', LNWIN, LCOLS, $ WORK(IPW), LNWIN, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 93 CONTINUE INDXS = ICEIL(LKBOT,NB)*NB + 1 DO 95 INDX = INDXS, N, NB CALL INFOG2L( LKTOP, INDX, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN LCOLS = MIN( NB, N-INDX+1 ) CALL DGEMM( 'Transpose', 'No Transpose', $ LNWIN, LCOLS, LNWIN, ONE, WORK(IPU), $ LNWIN, H((JLOC-1)*LLDH+ILOC), LLDH, $ ZERO, WORK(IPW), $ LNWIN ) CALL DLAMOV( 'All', LNWIN, LCOLS, $ WORK(IPW), LNWIN, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF 95 CONTINUE END IF END IF ELSE KS = LNWIN-LNS/2*3 * * The LNWIN-by-LNWIN matrix U containing the accumulated * orthogonal transformations has the following structure: * * [ U11 U12 ] * U = [ ], * [ U21 U22 ] * * where U21 is KS-by-KS upper triangular and U12 is * (LNWIN-KS)-by-(LNWIN-KS) lower triangular. * Here, KS = LNS. * * Update the columns of H and Z affected by the bulge * chasing. * * Compute H2*U21 + H1*U11 in workspace. * IF( DIR.EQ.2 .AND. LENCBUF.GT.0 .AND. $ MYCOL.EQ.LCSRC ) THEN IF( WANTT ) THEN DO 100 INDX = 1, LKTOP-LIROFFH-1, NB CALL INFOG2L( INDX, LKTOP, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, RSRC1, $ CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN JLOC1 = INDXG2L( LKTOP+LNWIN-KS, NB, $ MYCOL, DESCH( CSRC_ ), NPCOL ) LROWS = MIN( NB, LKTOP-INDX ) CALL DLAMOV( 'All', LROWS, KS, $ H((JLOC1-1)*LLDH+ILOC ), LLDH, $ WORK(IPW), LROWS ) CALL DTRMM( 'Right', 'Upper', $ 'No transpose','Non-unit', LROWS, $ KS, ONE, WORK( IPU+LNWIN-KS ), LNWIN, $ WORK(IPW), LROWS ) CALL DGEMM('No transpose', 'No transpose', $ LROWS, KS, LNWIN-KS, ONE, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK( IPU ), LNWIN, ONE, WORK(IPW), $ LROWS ) * * Compute H1*U12 + H2*U22 in workspace. * CALL DLAMOV( 'All', LROWS, LNWIN-KS, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK( IPW+KS*LROWS ), LROWS ) CALL DTRMM( 'Right', 'Lower', $ 'No transpose', 'Non-Unit', $ LROWS, LNWIN-KS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW+KS*LROWS ), LROWS ) CALL DGEMM('No transpose', 'No transpose', $ LROWS, LNWIN-KS, KS, ONE, $ H((JLOC1-1)*LLDH+ILOC), LLDH, $ WORK( IPU+LNWIN*KS+LNWIN-KS ), LNWIN, $ ONE, WORK( IPW+KS*LROWS ), LROWS ) * * Copy workspace to H. * CALL DLAMOV( 'All', LROWS, LNWIN, $ WORK(IPW), LROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF 100 CONTINUE END IF * IF( WANTZ ) THEN * * Compute Z2*U21 + Z1*U11 in workspace. * DO 110 INDX = 1, N, NB CALL INFOG2L( INDX, LKTOP, DESCZ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, RSRC1, $ CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN JLOC1 = INDXG2L( LKTOP+LNWIN-KS, NB, $ MYCOL, DESCZ( CSRC_ ), NPCOL ) LROWS = MIN(NB,N-INDX+1) CALL DLAMOV( 'All', LROWS, KS, $ Z((JLOC1-1)*LLDZ+ILOC ), LLDZ, $ WORK(IPW), LROWS ) CALL DTRMM( 'Right', 'Upper', $ 'No transpose', 'Non-unit', $ LROWS, KS, ONE, WORK( IPU+LNWIN-KS ), $ LNWIN, WORK(IPW), LROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', LROWS, KS, LNWIN-KS, $ ONE, Z((JLOC-1)*LLDZ+ILOC), LLDZ, $ WORK( IPU ), LNWIN, ONE, WORK(IPW), $ LROWS ) * * Compute Z1*U12 + Z2*U22 in workspace. * CALL DLAMOV( 'All', LROWS, LNWIN-KS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ, $ WORK( IPW+KS*LROWS ), LROWS) CALL DTRMM( 'Right', 'Lower', $ 'No transpose', 'Non-unit', $ LROWS, LNWIN-KS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW+KS*LROWS ), LROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', LROWS, LNWIN-KS, KS, $ ONE, Z((JLOC1-1)*LLDZ+ILOC), LLDZ, $ WORK( IPU+LNWIN*KS+LNWIN-KS ), LNWIN, $ ONE, WORK( IPW+KS*LROWS ), $ LROWS ) * * Copy workspace to Z. * CALL DLAMOV( 'All', LROWS, LNWIN, $ WORK(IPW), LROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF 110 CONTINUE END IF END IF * IF( DIR.EQ.1 .AND. LENRBUF.GT.0 .AND. $ MYROW.EQ.LRSRC ) THEN IF( WANTT ) THEN INDXS = ICEIL(LKBOT,NB)*NB + 1 DO 120 INDX = INDXS, N, NB CALL INFOG2L( LKTOP, INDX, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN * * Compute U21**T*H2 + U11**T*H1 in workspace. * ILOC1 = INDXG2L( LKTOP+LNWIN-KS, NB, $ MYROW, DESCH( RSRC_ ), NPROW ) LCOLS = MIN( NB, N-INDX+1 ) CALL DLAMOV( 'All', KS, LCOLS, $ H((JLOC-1)*LLDH+ILOC1), LLDH, $ WORK(IPW), LNWIN ) CALL DTRMM( 'Left', 'Upper', 'Transpose', $ 'Non-unit', KS, LCOLS, ONE, $ WORK( IPU+LNWIN-KS ), LNWIN, $ WORK(IPW), LNWIN ) CALL DGEMM( 'Transpose', 'No transpose', $ KS, LCOLS, LNWIN-KS, ONE, WORK(IPU), $ LNWIN, H((JLOC-1)*LLDH+ILOC), LLDH, $ ONE, WORK(IPW), LNWIN ) * * Compute U12**T*H1 + U22**T*H2 in workspace. * CALL DLAMOV( 'All', LNWIN-KS, LCOLS, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK( IPW+KS ), LNWIN ) CALL DTRMM( 'Left', 'Lower', 'Transpose', $ 'Non-unit', LNWIN-KS, LCOLS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW+KS ), LNWIN ) CALL DGEMM( 'Transpose', 'No Transpose', $ LNWIN-KS, LCOLS, KS, ONE, $ WORK( IPU+LNWIN*KS+LNWIN-KS ), LNWIN, $ H((JLOC-1)*LLDH+ILOC1), LLDH, $ ONE, WORK( IPW+KS ), LNWIN ) * * Copy workspace to H. * CALL DLAMOV( 'All', LNWIN, LCOLS, $ WORK(IPW), LNWIN, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF 120 CONTINUE END IF END IF END IF * * Update position information about current window. * IF( DIR.EQ.2 ) THEN IF( LKBOT.EQ.KBOT ) THEN LKTOP = KBOT+1 LKBOT = KBOT+1 IWORK( 1+(WIN-1)*5 ) = LKTOP IWORK( 2+(WIN-1)*5 ) = LKBOT IWORK( 5+(WIN-1)*5 ) = 2 ELSE LKTOP = MIN( LKTOP + LNWIN - LCHAIN, $ ICEIL( LKTOP, NB )*NB - LCHAIN + 1, $ KBOT ) IWORK( 1+(WIN-1)*5 ) = LKTOP LKBOT = MIN( LKBOT + LNWIN - LCHAIN, $ ICEIL( LKBOT, NB )*NB, KBOT ) IWORK( 2+(WIN-1)*5 ) = LKBOT LNWIN = LKBOT-LKTOP+1 IF( LNWIN.EQ.LCHAIN ) IWORK(5+(WIN-1)*5) = 2 END IF END IF 75 CONTINUE 70 CONTINUE 65 CONTINUE * * If bulges were chasen off from first window, the window is * removed. * IF( ICHOFF.GT.0 ) THEN DO 128 WIN = 2, ANMWIN IWORK( 1+(WIN-2)*5 ) = IWORK( 1+(WIN-1)*5 ) IWORK( 2+(WIN-2)*5 ) = IWORK( 2+(WIN-1)*5 ) IWORK( 3+(WIN-2)*5 ) = IWORK( 3+(WIN-1)*5 ) IWORK( 4+(WIN-2)*5 ) = IWORK( 4+(WIN-1)*5 ) IWORK( 5+(WIN-2)*5 ) = IWORK( 5+(WIN-1)*5 ) 128 CONTINUE ANMWIN = ANMWIN - 1 IPIW = 6+(ANMWIN-1)*5 END IF * * If we have no more windows, return. * IF( ANMWIN.LT.1 ) RETURN * ELSE * * Set up windows such that as many bulges as possible can be * moved over the border to the next block. Make sure that the * cross border window is at least (NTINY+1)-by-(NTINY+1), unless * we are chasing off the bulges from the last window. This is * accomplished by setting the bottom index LKBOT such that the * local window has the correct size. * * If LKBOT then becomes larger than KBOT, the endpoint of the whole * global submatrix, or LKTOP from a window located already residing * at the other side of the border, this is taken care of by some * dirty tricks. * DO 130 WIN = 1, ANMWIN LKTOP1 = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = MAX( 6, MIN( LKBOT - LKTOP1 + 1, LCHAIN ) ) LKBOT1 = MAX( MIN( KBOT, ICEIL(LKTOP1,NB)*NB+LCHAIN), $ MIN( KBOT, MIN( LKTOP1+2*LNWIN-1, $ (ICEIL(LKTOP1,NB)+1)*NB ) ) ) IWORK( 2+(WIN-1)*5 ) = LKBOT1 130 CONTINUE ICHOFF = 0 * * Keep a record over what windows that were moved over the borders * such that we can delay some windows due to lack of space on the * other side of the border; we do not want to leave any of the * bulges behind... * * IWORK( 5+(WIN-1)*5 ) = 0: window WIN has not been processed * IWORK( 5+(WIN-1)*5 ) = 1: window WIN is being processed (need to * know for updates) * IWORK( 5+(WIN-1)*5 ) = 2: window WIN has been fully processed * * So, start by marking all windows as not processed. * DO 135 WIN = 1, ANMWIN IWORK( 5+(WIN-1)*5 ) = 0 135 CONTINUE * * Do the cross border bulge-chase as follows: Start from the * first window (the one that is closest to be chased off the * diagonal of H) and take the odd windows first followed by the * even ones. To not get into hang-problems on processor meshes * with at least one odd dimension, the windows will in such a case * be processed in chunks of {the minimum odd process dimension}-1 * windows to avoid overlapping processor scopes in forming the * cross border computational windows and the cross border update * regions. * WCHUNK = MAX( 1, MIN( ANMWIN, NPROW-1, NPCOL-1 ) ) NUMCHUNK = ICEIL( ANMWIN, WCHUNK ) * * Based on the computed chunk of windows, start working with * crossborder bulge-chasing. Repeat this as long as there is * still work left to do (137 is a kind of do-while statement). * 137 CONTINUE * * Zero out LENRBUF and LENCBUF each time we restart this loop. * LENRBUF = 0 LENCBUF = 0 * DO 140 ODDEVEN = 1, MIN( 2, ANMWIN ) DO 150 CHUNKNUM = 1, NUMCHUNK IPNEXT = 1 DO 160 WIN = ODDEVEN+(CHUNKNUM-1)*WCHUNK, $ MIN(ANMWIN,MAX(1,ODDEVEN+(CHUNKNUM)*WCHUNK-1)), 2 * * Get position and size of the WIN:th active window and * make sure that we skip the cross border bulge for this * window if the window is not shared between several data * layout blocks (and processors). * * Also, delay windows that do not have sufficient size of * the other side of the border. Moreover, make sure to skip * windows that was already processed in the last round of * the do-while loop (137). * IF( IWORK( 5+(WIN-1)*5 ).EQ.2 ) GO TO 165 LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) IF( WIN.GT.1 ) THEN LKTOP2 = IWORK( 1+(WIN-2)*5 ) ELSE LKTOP2 = KBOT+1 END IF IF( ICEIL(LKTOP,NB).EQ.ICEIL(LKBOT,NB) .OR. $ LKBOT.GE.LKTOP2 ) GO TO 165 LNWIN = LKBOT - LKTOP + 1 IF( LNWIN.LE.NTINY .AND. LKBOT.NE.KBOT .AND. $ .NOT. MOD(LKBOT,NB).EQ.0 ) GO TO 165 * * If window is going to be processed, mark it as processed. * IWORK( 5+(WIN-1)*5 ) = 1 * * Extract processors for current cross border window, * as below: * * 1 | 2 * --+-- * 3 | 4 * RSRC1 = IWORK( 3+(WIN-1)*5 ) CSRC1 = IWORK( 4+(WIN-1)*5 ) RSRC2 = RSRC1 CSRC2 = MOD( CSRC1+1, NPCOL ) RSRC3 = MOD( RSRC1+1, NPROW ) CSRC3 = CSRC1 RSRC4 = MOD( RSRC1+1, NPROW ) CSRC4 = MOD( CSRC1+1, NPCOL ) * * Form group of four processors for cross border window. * IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) .OR. $ ( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN * * Compute the upper and lower parts of the active * window. * DIM1 = NB - MOD(LKTOP-1,NB) DIM4 = LNWIN - DIM1 * * Temporarily compute a new value of the size of the * computational window that is larger than or equal to * NTINY+1; call the *real* value DIM. * DIM = LNWIN LNWIN = MAX(NTINY+1,LNWIN) * * Divide workspace. * IPU = IPNEXT IPH = IPU + DIM**2 IPUU = IPH + LNWIN**2 IPV = IPUU + LNWIN**2 IPNEXT = IPH IF( DIM.LT.LNWIN ) THEN CALL DLASET( 'All', LNWIN, LNWIN, ZERO, $ ONE, WORK( IPH ), LNWIN ) ELSE CALL DLASET( 'All', DIM, DIM, ZERO, $ ZERO, WORK( IPH ), LNWIN ) END IF * * Form the active window. * IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN ILOC = INDXG2L( LKTOP, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL DLAMOV( 'All', DIM1, DIM1, $ H((JLOC-1)*LLDH+ILOC), LLDH, WORK(IPH), $ LNWIN ) IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) THEN * Proc#1 <==> Proc#4 CALL DGESD2D( ICTXT, DIM1, DIM1, $ WORK(IPH), LNWIN, RSRC4, CSRC4 ) CALL DGERV2D( ICTXT, DIM4, DIM4, $ WORK(IPH+DIM1*LNWIN+DIM1), $ LNWIN, RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN ILOC = INDXG2L( LKTOP+DIM1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL DLAMOV( 'All', DIM4, DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK(IPH+DIM1*LNWIN+DIM1), $ LNWIN ) IF( RSRC4.NE.RSRC1 .OR. CSRC4.NE.CSRC1 ) THEN * Proc#4 <==> Proc#1 CALL DGESD2D( ICTXT, DIM4, DIM4, $ WORK(IPH+DIM1*LNWIN+DIM1), $ LNWIN, RSRC1, CSRC1 ) CALL DGERV2D( ICTXT, DIM1, DIM1, $ WORK(IPH), LNWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN ILOC = INDXG2L( LKTOP, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL DLAMOV( 'All', DIM1, DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK(IPH+DIM1*LNWIN), LNWIN ) IF( RSRC2.NE.RSRC1 .OR. CSRC2.NE.CSRC1 ) THEN * Proc#2 ==> Proc#1 CALL DGESD2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN IF( RSRC2.NE.RSRC4 .OR. CSRC2.NE.CSRC4 ) THEN * Proc#2 ==> Proc#4 CALL DGESD2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN ILOC = INDXG2L( LKTOP+DIM1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1-1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL DLAMOV( 'All', 1, 1, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN ) IF( RSRC3.NE.RSRC1 .OR. CSRC3.NE.CSRC1 ) THEN * Proc#3 ==> Proc#1 CALL DGESD2D( ICTXT, 1, 1, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN IF( RSRC3.NE.RSRC4 .OR. CSRC3.NE.CSRC4 ) THEN * Proc#3 ==> Proc#4 CALL DGESD2D( ICTXT, 1, 1, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN, RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC1.NE.RSRC2 .OR. CSRC1.NE.CSRC2 ) THEN * Proc#1 <== Proc#2 CALL DGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC2, CSRC2 ) END IF IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) THEN * Proc#1 <== Proc#3 CALL DGERV2D( ICTXT, 1, 1, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN, RSRC3, CSRC3 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) THEN * Proc#4 <== Proc#2 CALL DGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC2, CSRC2 ) END IF IF( RSRC4.NE.RSRC3 .OR. CSRC4.NE.CSRC3 ) THEN * Proc#4 <== Proc#3 CALL DGERV2D( ICTXT, 1, 1, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN, RSRC3, CSRC3 ) END IF END IF * * Prepare for call to DLAQR6 - it could happen that no * bulges where introduced in the pre-cross border step * since the chain was too long to fit in the top-left * part of the cross border window. In such a case, the * bulges are introduced here instead. It could also * happen that the bottom-right part is too small to hold * the whole chain -- in such a case, the bulges are * chasen off immediately, as well. * IF( (MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1) .OR. $ (MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4) ) THEN IF( LKTOP.EQ.KTOP .AND. LKBOT.EQ.KBOT .AND. $ (DIM1.LE.LCHAIN .OR. DIM1.LE.NTINY ) ) THEN JOB = 'All steps' ICHOFF = 1 ELSEIF( LKTOP.EQ.KTOP .AND. $ ( DIM1.LE.LCHAIN .OR. DIM1.LE.NTINY ) ) THEN JOB = 'Introduce and chase' ELSEIF( LKBOT.EQ.KBOT ) THEN JOB = 'Off-chase bulges' ICHOFF = 1 ELSE JOB = 'Chase bulges' END IF KU = LNWIN - KDU + 1 KWH = KDU + 1 NHO = ( LNWIN-KDU+1-4 ) - ( KDU+1 ) + 1 KWV = KDU + 4 NVE = LNWIN - KDU - KWV + 1 CALL DLASET( 'All', LNWIN, LNWIN, $ ZERO, ONE, WORK(IPUU), LNWIN ) * * Small-bulge multi-shift QR sweep. * LKS = MAX(1, NS - WIN*LNS + 1) CALL DLAQR6( JOB, WANTT, .TRUE., LKACC22, LNWIN, $ 1, DIM, LNS, SR( LKS ), SI( LKS ), $ WORK(IPH), LNWIN, 1, DIM, $ WORK(IPUU), LNWIN, WORK(IPU), 3, $ WORK( IPH+KU-1 ), LNWIN, NVE, $ WORK( IPH+KWV-1 ), LNWIN, NHO, $ WORK( IPH-1+KU+(KWH-1)*LNWIN ), LNWIN ) * * Copy local submatrices of H back to global matrix. * IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN ILOC = INDXG2L( LKTOP, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL DLAMOV( 'All', DIM1, DIM1, WORK(IPH), $ LNWIN, H((JLOC-1)*LLDH+ILOC), $ LLDH ) END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN ILOC = INDXG2L( LKTOP+DIM1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL DLAMOV( 'All', DIM4, DIM4, $ WORK(IPH+DIM1*LNWIN+DIM1), $ LNWIN, H((JLOC-1)*LLDH+ILOC), LLDH ) END IF * * Copy actual submatrix of U to the correct place of * the buffer. * CALL DLAMOV( 'All', DIM, DIM, $ WORK(IPUU), LNWIN, WORK(IPU), DIM ) END IF * * Return data to process 2 and 3. * RWS3 = MIN(3,DIM4) CLS3 = MIN(3,DIM1) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) THEN * Proc#1 ==> Proc#3 CALL DGESD2D( ICTXT, RWS3, CLS3, $ WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1 ), $ LNWIN, RSRC3, CSRC3 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) THEN * Proc#4 ==> Proc#2 CALL DGESD2D( ICTXT, DIM1, DIM4, $ WORK( IPH+DIM1*LNWIN), $ LNWIN, RSRC2, CSRC2 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN ILOC = INDXG2L( LKTOP, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) IF( RSRC2.NE.RSRC4 .OR. CSRC2.NE.CSRC4 ) THEN * Proc#2 <== Proc#4 CALL DGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC4, CSRC4 ) END IF CALL DLAMOV( 'All', DIM1, DIM4, $ WORK( IPH+DIM1*LNWIN ), LNWIN, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN ILOC = INDXG2L( LKTOP+DIM1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1-CLS3, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) IF( RSRC3.NE.RSRC1 .OR. CSRC3.NE.CSRC1 ) THEN * Proc#3 <== Proc#1 CALL DGERV2D( ICTXT, RWS3, CLS3, $ WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1 ), $ LNWIN, RSRC1, CSRC1 ) END IF CALL DLAMOV( 'Upper', RWS3, CLS3, $ WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1 ), $ LNWIN, H((JLOC-1)*LLDH+ILOC), $ LLDH ) IF( RWS3.GT.1 .AND. CLS3.GT.1 ) THEN ELEM = WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1+1 ) IF( ELEM.NE.ZERO ) THEN CALL DLAMOV( 'Lower', RWS3-1, CLS3-1, $ WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1+1 ), $ LNWIN, H((JLOC-1)*LLDH+ILOC+1), LLDH ) END IF END IF END IF * * Restore correct value of LNWIN. * LNWIN = DIM * END IF * * Increment counter for buffers of orthogonal * transformations. * IF( MYROW.EQ.RSRC1 .OR. MYCOL.EQ.CSRC1 .OR. $ MYROW.EQ.RSRC4 .OR. MYCOL.EQ.CSRC4 ) THEN IF( MYROW.EQ.RSRC1 .OR. MYROW.EQ.RSRC4 ) $ LENRBUF = LENRBUF + LNWIN*LNWIN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) $ LENCBUF = LENCBUF + LNWIN*LNWIN END IF * * If no cross border bulge chasing was performed for the * current WIN:th window, the processor jump to this point * and consider the next one. * 165 CONTINUE * 160 CONTINUE * * Broadcast orthogonal transformations -- this will only happen * if the buffer associated with the orthogonal transformations * is not empty (controlled by LENRBUF, for row-wise * broadcasts, and LENCBUF, for column-wise broadcasts). * DO 170 DIR = 1, 2 BCDONE = .FALSE. DO 180 WIN = ODDEVEN+(CHUNKNUM-1)*WCHUNK, $ MIN(ANMWIN,MAX(1,ODDEVEN+(CHUNKNUM)*WCHUNK-1)), 2 IF( ( LENRBUF.EQ.0 .AND. LENCBUF.EQ.0 ) .OR. $ BCDONE ) GO TO 185 RSRC1 = IWORK( 3+(WIN-1)*5 ) CSRC1 = IWORK( 4+(WIN-1)*5 ) RSRC4 = MOD( RSRC1+1, NPROW ) CSRC4 = MOD( CSRC1+1, NPCOL ) IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN IF( DIR.EQ.1 .AND. LENRBUF.GT.0 .AND. $ NPCOL.GT.1 .AND. NPROCS.GT.2 ) THEN IF( MYROW.EQ.RSRC1 .OR. ( MYROW.EQ.RSRC4 $ .AND. RSRC4.NE.RSRC1 ) ) THEN CALL DGEBS2D( ICTXT, 'Row', '1-Tree', $ LENRBUF, 1, WORK, LENRBUF ) ELSE CALL DGEBR2D( ICTXT, 'Row', '1-Tree', $ LENRBUF, 1, WORK, LENRBUF, RSRC1, $ CSRC1 ) END IF ELSEIF( DIR.EQ.2 .AND. LENCBUF.GT.0 .AND. $ NPROW.GT.1 .AND. NPROCS.GT.2 ) THEN IF( MYCOL.EQ.CSRC1 .OR. ( MYCOL.EQ.CSRC4 $ .AND. CSRC4.NE.CSRC1 ) ) THEN CALL DGEBS2D( ICTXT, 'Col', '1-Tree', $ LENCBUF, 1, WORK, LENCBUF ) ELSE CALL DGEBR2D( ICTXT, 'Col', '1-Tree', $ LENCBUF, 1, WORK(1+LENRBUF), LENCBUF, $ RSRC1, CSRC1 ) END IF END IF IF( LENRBUF.GT.0 .AND. ( MYCOL.EQ.CSRC1 .OR. $ ( MYCOL.EQ.CSRC4 .AND. CSRC4.NE.CSRC1 ) ) ) $ CALL DLAMOV( 'All', LENRBUF, 1, WORK, LENRBUF, $ WORK(1+LENRBUF), LENCBUF ) BCDONE = .TRUE. ELSEIF( MYROW.EQ.RSRC1 .AND. DIR.EQ.1 ) THEN IF( LENRBUF.GT.0 .AND. NPCOL.GT.1 ) $ CALL DGEBR2D( ICTXT, 'Row', '1-Tree', LENRBUF, $ 1, WORK, LENRBUF, RSRC1, CSRC1 ) BCDONE = .TRUE. ELSEIF( MYCOL.EQ.CSRC1 .AND. DIR.EQ.2 ) THEN IF( LENCBUF.GT.0 .AND. NPROW.GT.1 ) $ CALL DGEBR2D( ICTXT, 'Col', '1-Tree', LENCBUF, $ 1, WORK(1+LENRBUF), LENCBUF, RSRC1, CSRC1 ) BCDONE = .TRUE. ELSEIF( MYROW.EQ.RSRC4 .AND. DIR.EQ.1 ) THEN IF( LENRBUF.GT.0 .AND. NPCOL.GT.1 ) $ CALL DGEBR2D( ICTXT, 'Row', '1-Tree', LENRBUF, $ 1, WORK, LENRBUF, RSRC4, CSRC4 ) BCDONE = .TRUE. ELSEIF( MYCOL.EQ.CSRC4 .AND. DIR.EQ.2 ) THEN IF( LENCBUF.GT.0 .AND. NPROW.GT.1 ) $ CALL DGEBR2D( ICTXT, 'Col', '1-Tree', LENCBUF, $ 1, WORK(1+LENRBUF), LENCBUF, RSRC4, CSRC4 ) BCDONE = .TRUE. END IF 185 CONTINUE 180 CONTINUE 170 CONTINUE * * Prepare for computing cross border updates by exchanging * data in cross border update regions in H and Z. * DO 190 DIR = 1, 2 WINID = 0 IPW3 = 1 DO 200 WIN = ODDEVEN+(CHUNKNUM-1)*WCHUNK, $ MIN(ANMWIN,MAX(1,ODDEVEN+(CHUNKNUM)*WCHUNK-1)), 2 IF( IWORK( 5+(WIN-1)*5 ).NE.1 ) GO TO 205 * * Make sure this part of the code is only executed when * there has been some work performed on the WIN:th * window. * LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) * * Extract processor indices associated with * the current window. * RSRC1 = IWORK( 3+(WIN-1)*5 ) CSRC1 = IWORK( 4+(WIN-1)*5 ) RSRC4 = MOD( RSRC1+1, NPROW ) CSRC4 = MOD( CSRC1+1, NPCOL ) * * Compute local number of rows and columns * of H and Z to exchange. * IF(((MYCOL.EQ.CSRC1.OR.MYCOL.EQ.CSRC4).AND.DIR.EQ.2) $ .OR.((MYROW.EQ.RSRC1.OR.MYROW.EQ.RSRC4).AND. $ DIR.EQ.1)) THEN WINID = WINID + 1 LNWIN = LKBOT - LKTOP + 1 IPU = IPNEXT DIM1 = NB - MOD(LKTOP-1,NB) DIM4 = LNWIN - DIM1 IPNEXT = IPU + LNWIN*LNWIN IF( DIR.EQ.2 ) THEN IF( WANTZ ) THEN ZROWS = NUMROC( N, NB, MYROW, DESCZ( RSRC_ ), $ NPROW ) ELSE ZROWS = 0 END IF IF( WANTT ) THEN HROWS = NUMROC( LKTOP-1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) ELSE HROWS = 0 END IF ELSE ZROWS = 0 HROWS = 0 END IF IF( DIR.EQ.1 ) THEN IF( WANTT ) THEN HCOLS = NUMROC( N - (LKTOP+DIM1-1), NB, $ MYCOL, CSRC4, NPCOL ) IF( MYCOL.EQ.CSRC4 ) HCOLS = HCOLS - DIM4 ELSE HCOLS = 0 END IF ELSE HCOLS = 0 END IF IPW = MAX( 1 + LENRBUF + LENCBUF, IPW3 ) IPW1 = IPW + HROWS * LNWIN IF( WANTZ ) THEN IPW2 = IPW1 + LNWIN * HCOLS IPW3 = IPW2 + ZROWS * LNWIN ELSE IPW3 = IPW1 + LNWIN * HCOLS END IF END IF * * Let each process row and column involved in the updates * exchange data in H and Z with their neighbours. * IF( DIR.EQ.2 .AND. WANTT .AND. LENCBUF.GT.0 ) THEN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) THEN DO 210 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, LKTOP, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC1, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', HROWS, DIM1, $ H((JLOC1-1)*LLDH+ILOC), LLDH, $ WORK(IPW), HROWS ) IF( NPCOL.GT.1 ) THEN EAST = MOD( MYCOL + 1, NPCOL ) CALL DGESD2D( ICTXT, HROWS, DIM1, $ WORK(IPW), HROWS, RSRC, EAST ) CALL DGERV2D( ICTXT, HROWS, DIM4, $ WORK(IPW+HROWS*DIM1), HROWS, $ RSRC, EAST ) END IF END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, LKTOP+DIM1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC4, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', HROWS, DIM4, $ H((JLOC4-1)*LLDH+ILOC), LLDH, $ WORK(IPW+HROWS*DIM1), HROWS ) IF( NPCOL.GT.1 ) THEN WEST = MOD( MYCOL - 1 + NPCOL, $ NPCOL ) CALL DGESD2D( ICTXT, HROWS, DIM4, $ WORK(IPW+HROWS*DIM1), HROWS, $ RSRC, WEST ) CALL DGERV2D( ICTXT, HROWS, DIM1, $ WORK(IPW), HROWS, RSRC, WEST ) END IF END IF END IF 210 CONTINUE END IF END IF * IF( DIR.EQ.1 .AND. WANTT .AND. LENRBUF.GT.0 ) THEN IF( MYROW.EQ.RSRC1 .OR. MYROW.EQ.RSRC4 ) THEN DO 220 INDX = 1, NPCOL IF( MYROW.EQ.RSRC1 ) THEN IF( INDX.EQ.1 ) THEN IF( LKBOT.LT.N ) THEN CALL INFOG2L( LKTOP, LKBOT+1, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC, RSRC1, CSRC ) ELSE CSRC = -1 END IF ELSEIF( MOD(LKBOT,NB).NE.0 ) THEN CALL INFOG2L( LKTOP, $ (ICEIL(LKBOT,NB)+(INDX-2))*NB+1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC, RSRC1, CSRC ) ELSE CALL INFOG2L( LKTOP, $ (ICEIL(LKBOT,NB)+(INDX-1))*NB+1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC, RSRC1, CSRC ) END IF IF( MYCOL.EQ.CSRC ) THEN CALL DLAMOV( 'All', DIM1, HCOLS, $ H((JLOC-1)*LLDH+ILOC1), LLDH, $ WORK(IPW1), LNWIN ) IF( NPROW.GT.1 ) THEN SOUTH = MOD( MYROW + 1, NPROW ) CALL DGESD2D( ICTXT, DIM1, HCOLS, $ WORK(IPW1), LNWIN, SOUTH, $ CSRC ) CALL DGERV2D( ICTXT, DIM4, HCOLS, $ WORK(IPW1+DIM1), LNWIN, SOUTH, $ CSRC ) END IF END IF END IF IF( MYROW.EQ.RSRC4 ) THEN IF( INDX.EQ.1 ) THEN IF( LKBOT.LT.N ) THEN CALL INFOG2L( LKTOP+DIM1, LKBOT+1, $ DESCH, NPROW, NPCOL, MYROW, $ MYCOL, ILOC4, JLOC, RSRC4, $ CSRC ) ELSE CSRC = -1 END IF ELSEIF( MOD(LKBOT,NB).NE.0 ) THEN CALL INFOG2L( LKTOP+DIM1, $ (ICEIL(LKBOT,NB)+(INDX-2))*NB+1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC, RSRC4, CSRC ) ELSE CALL INFOG2L( LKTOP+DIM1, $ (ICEIL(LKBOT,NB)+(INDX-1))*NB+1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC, RSRC4, CSRC ) END IF IF( MYCOL.EQ.CSRC ) THEN CALL DLAMOV( 'All', DIM4, HCOLS, $ H((JLOC-1)*LLDH+ILOC4), LLDH, $ WORK(IPW1+DIM1), LNWIN ) IF( NPROW.GT.1 ) THEN NORTH = MOD( MYROW - 1 + NPROW, $ NPROW ) CALL DGESD2D( ICTXT, DIM4, HCOLS, $ WORK(IPW1+DIM1), LNWIN, NORTH, $ CSRC ) CALL DGERV2D( ICTXT, DIM1, HCOLS, $ WORK(IPW1), LNWIN, NORTH, $ CSRC ) END IF END IF END IF 220 CONTINUE END IF END IF * IF( DIR.EQ.2 .AND. WANTZ .AND. LENCBUF.GT.0) THEN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) THEN DO 230 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, LKTOP, $ DESCZ, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC1, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', ZROWS, DIM1, $ Z((JLOC1-1)*LLDZ+ILOC), LLDZ, $ WORK(IPW2), ZROWS ) IF( NPCOL.GT.1 ) THEN EAST = MOD( MYCOL + 1, NPCOL ) CALL DGESD2D( ICTXT, ZROWS, DIM1, $ WORK(IPW2), ZROWS, RSRC, $ EAST ) CALL DGERV2D( ICTXT, ZROWS, DIM4, $ WORK(IPW2+ZROWS*DIM1), $ ZROWS, RSRC, EAST ) END IF END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, $ LKTOP+DIM1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC4, RSRC, $ CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', ZROWS, DIM4, $ Z((JLOC4-1)*LLDZ+ILOC), LLDZ, $ WORK(IPW2+ZROWS*DIM1), ZROWS ) IF( NPCOL.GT.1 ) THEN WEST = MOD( MYCOL - 1 + NPCOL, $ NPCOL ) CALL DGESD2D( ICTXT, ZROWS, DIM4, $ WORK(IPW2+ZROWS*DIM1), $ ZROWS, RSRC, WEST ) CALL DGERV2D( ICTXT, ZROWS, DIM1, $ WORK(IPW2), ZROWS, RSRC, $ WEST ) END IF END IF END IF 230 CONTINUE END IF END IF * * If no exchanges was performed for the current window, * all processors jump to this point and try the next * one. * 205 CONTINUE * 200 CONTINUE * * Compute crossborder bulge-chase updates. * WINID = 0 IF( DIR.EQ.1 ) THEN IPNEXT = 1 ELSE IPNEXT = 1 + LENRBUF END IF IPW3 = 1 DO 240 WIN = ODDEVEN+(CHUNKNUM-1)*WCHUNK, $ MIN(ANMWIN,MAX(1,ODDEVEN+(CHUNKNUM)*WCHUNK-1)), 2 IF( IWORK( 5+(WIN-1)*5 ).NE.1 ) GO TO 245 * * Only perform this part of the code if there was really * some work performed on the WIN:th window. * LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = LKBOT - LKTOP + 1 * * Extract the processor indices associated with * the current window. * RSRC1 = IWORK( 3+(WIN-1)*5 ) CSRC1 = IWORK( 4+(WIN-1)*5 ) RSRC4 = MOD( RSRC1+1, NPROW ) CSRC4 = MOD( CSRC1+1, NPCOL ) * IF(((MYCOL.EQ.CSRC1.OR.MYCOL.EQ.CSRC4).AND.DIR.EQ.2) $ .OR.((MYROW.EQ.RSRC1.OR.MYROW.EQ.RSRC4).AND. $ DIR.EQ.1)) THEN * * Set up workspaces. * WINID = WINID + 1 LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = LKBOT - LKTOP + 1 DIM1 = NB - MOD(LKTOP-1,NB) DIM4 = LNWIN - DIM1 IPU = IPNEXT + (WINID-1)*LNWIN*LNWIN IF( DIR.EQ.2 ) THEN IF( WANTZ ) THEN ZROWS = NUMROC( N, NB, MYROW, DESCZ( RSRC_ ), $ NPROW ) ELSE ZROWS = 0 END IF IF( WANTT ) THEN HROWS = NUMROC( LKTOP-1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) ELSE HROWS = 0 END IF ELSE ZROWS = 0 HROWS = 0 END IF IF( DIR.EQ.1 ) THEN IF( WANTT ) THEN HCOLS = NUMROC( N - (LKTOP+DIM1-1), NB, $ MYCOL, CSRC4, NPCOL ) IF( MYCOL.EQ.CSRC4 ) HCOLS = HCOLS - DIM4 ELSE HCOLS = 0 END IF ELSE HCOLS = 0 END IF * * IPW = local copy of overlapping column block of H * IPW1 = local copy of overlapping row block of H * IPW2 = local copy of overlapping column block of Z * IPW3 = workspace for right hand side of matrix * multiplication * IPW = MAX( 1 + LENRBUF + LENCBUF, IPW3 ) IPW1 = IPW + HROWS * LNWIN IF( WANTZ ) THEN IPW2 = IPW1 + LNWIN * HCOLS IPW3 = IPW2 + ZROWS * LNWIN ELSE IPW3 = IPW1 + LNWIN * HCOLS END IF * * Recompute job to see if special structure of U * could possibly be exploited. * IF( LKTOP.EQ.KTOP .AND. LKBOT.EQ.KBOT ) THEN JOB = 'All steps' ELSEIF( LKTOP.EQ.KTOP .AND. $ ( DIM1.LT.LCHAIN+1 .OR. DIM1.LE.NTINY ) ) $ THEN JOB = 'Introduce and chase' ELSEIF( LKBOT.EQ.KBOT ) THEN JOB = 'Off-chase bulges' ELSE JOB = 'Chase bulges' END IF END IF * * Test if to exploit sparsity structure of * orthogonal matrix U. * KS = DIM1+DIM4-LNS/2*3 IF( .NOT. BLK22 .OR. DIM1.NE.KS .OR. $ DIM4.NE.KS .OR. LSAME(JOB,'I') .OR. $ LSAME(JOB,'O') .OR. LNS.LE.2 ) THEN * * Update the columns of H and Z. * IF( DIR.EQ.2 .AND. WANTT .AND. LENCBUF.GT.0 ) THEN DO 250 INDX = 1, MIN(LKTOP-1,1+(NPROW-1)*NB), NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, LKTOP, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DGEMM( 'No transpose', $ 'No transpose', HROWS, DIM1, $ LNWIN, ONE, WORK( IPW ), HROWS, $ WORK( IPU ), LNWIN, ZERO, $ WORK(IPW3), HROWS ) CALL DLAMOV( 'All', HROWS, DIM1, $ WORK(IPW3), HROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, LKTOP+DIM1, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DGEMM( 'No transpose', $ 'No transpose', HROWS, DIM4, $ LNWIN, ONE, WORK( IPW ), HROWS, $ WORK( IPU+LNWIN*DIM1 ), LNWIN, $ ZERO, WORK(IPW3), HROWS ) CALL DLAMOV( 'All', HROWS, DIM4, $ WORK(IPW3), HROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 250 CONTINUE END IF * IF( DIR.EQ.2 .AND. WANTZ .AND. LENCBUF.GT.0 ) THEN DO 260 INDX = 1, MIN(N,1+(NPROW-1)*NB), NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, LKTOP, DESCZ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DGEMM( 'No transpose', $ 'No transpose', ZROWS, DIM1, $ LNWIN, ONE, WORK( IPW2 ), $ ZROWS, WORK( IPU ), LNWIN, $ ZERO, WORK(IPW3), ZROWS ) CALL DLAMOV( 'All', ZROWS, DIM1, $ WORK(IPW3), ZROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, LKTOP+DIM1, DESCZ, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DGEMM( 'No transpose', $ 'No transpose', ZROWS, DIM4, $ LNWIN, ONE, WORK( IPW2 ), $ ZROWS, $ WORK( IPU+LNWIN*DIM1 ), LNWIN, $ ZERO, WORK(IPW3), ZROWS ) CALL DLAMOV( 'All', ZROWS, DIM4, $ WORK(IPW3), ZROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF END IF 260 CONTINUE END IF * * Update the rows of H. * IF( DIR.EQ.1 .AND. WANTT .AND. LENRBUF.GT.0 ) THEN IF( LKBOT.LT.N ) THEN IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC4 .AND. $ MOD(LKBOT,NB).NE.0 ) THEN INDX = LKBOT + 1 CALL INFOG2L( LKTOP, INDX, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC4 ) CALL DGEMM( 'Transpose', 'No Transpose', $ DIM1, HCOLS, LNWIN, ONE, WORK(IPU), $ LNWIN, WORK( IPW1 ), LNWIN, ZERO, $ WORK(IPW3), DIM1 ) CALL DLAMOV( 'All', DIM1, HCOLS, $ WORK(IPW3), DIM1, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF IF( MYROW.EQ.RSRC4.AND.MYCOL.EQ.CSRC4 .AND. $ MOD(LKBOT,NB).NE.0 ) THEN INDX = LKBOT + 1 CALL INFOG2L( LKTOP+DIM1, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC4, CSRC4 ) CALL DGEMM( 'Transpose', 'No Transpose', $ DIM4, HCOLS, LNWIN, ONE, $ WORK( IPU+DIM1*LNWIN ), LNWIN, $ WORK( IPW1), LNWIN, ZERO, $ WORK(IPW3), DIM4 ) CALL DLAMOV( 'All', DIM4, HCOLS, $ WORK(IPW3), DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF INDXS = ICEIL(LKBOT,NB)*NB + 1 IF( MOD(LKBOT,NB).NE.0 ) THEN INDXE = MIN(N,INDXS+(NPCOL-2)*NB) ELSE INDXE = MIN(N,INDXS+(NPCOL-1)*NB) END IF DO 270 INDX = INDXS, INDXE, NB IF( MYROW.EQ.RSRC1 ) THEN CALL INFOG2L( LKTOP, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL DGEMM( 'Transpose', $ 'No Transpose', DIM1, HCOLS, $ LNWIN, ONE, WORK( IPU ), LNWIN, $ WORK( IPW1 ), LNWIN, ZERO, $ WORK(IPW3), DIM1 ) CALL DLAMOV( 'All', DIM1, HCOLS, $ WORK(IPW3), DIM1, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF IF( MYROW.EQ.RSRC4 ) THEN CALL INFOG2L( LKTOP+DIM1, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC4, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL DGEMM( 'Transpose', $ 'No Transpose', DIM4, HCOLS, $ LNWIN, ONE, $ WORK( IPU+LNWIN*DIM1 ), LNWIN, $ WORK( IPW1 ), LNWIN, $ ZERO, WORK(IPW3), DIM4 ) CALL DLAMOV( 'All', DIM4, HCOLS, $ WORK(IPW3), DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 270 CONTINUE END IF END IF ELSE * * Update the columns of H and Z. * * Compute H2*U21 + H1*U11 on the left side of the border. * IF( DIR.EQ.2 .AND. WANTT .AND. LENCBUF.GT.0 ) THEN INDXE = MIN(LKTOP-1,1+(NPROW-1)*NB) DO 280 INDX = 1, INDXE, NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, LKTOP, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', HROWS, KS, $ WORK( IPW+HROWS*DIM4), HROWS, $ WORK(IPW3), HROWS ) CALL DTRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', HROWS, KS, ONE, $ WORK( IPU+DIM4 ), LNWIN, $ WORK(IPW3), HROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', HROWS, KS, DIM4, $ ONE, WORK( IPW ), HROWS, $ WORK( IPU ), LNWIN, ONE, $ WORK(IPW3), HROWS ) CALL DLAMOV( 'All', HROWS, KS, $ WORK(IPW3), HROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF * * Compute H1*U12 + H2*U22 on the right side of * the border. * IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, LKTOP+DIM1, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', HROWS, DIM4, $ WORK(IPW), HROWS, WORK( IPW3 ), $ HROWS ) CALL DTRMM( 'Right', 'Lower', $ 'No transpose', $ 'Non-unit', HROWS, DIM4, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW3 ), HROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', HROWS, DIM4, KS, $ ONE, WORK( IPW+HROWS*DIM4), $ HROWS, $ WORK( IPU+LNWIN*KS+DIM4 ), LNWIN, $ ONE, WORK( IPW3 ), HROWS ) CALL DLAMOV( 'All', HROWS, DIM4, $ WORK(IPW3), HROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 280 CONTINUE END IF * IF( DIR.EQ.2 .AND. WANTZ .AND. LENCBUF.GT.0 ) THEN * * Compute Z2*U21 + Z1*U11 on the left side * of border. * INDXE = MIN(N,1+(NPROW-1)*NB) DO 290 INDX = 1, INDXE, NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCZ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', ZROWS, KS, $ WORK( IPW2+ZROWS*DIM4), $ ZROWS, WORK(IPW3), ZROWS ) CALL DTRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', ZROWS, KS, ONE, $ WORK( IPU+DIM4 ), LNWIN, $ WORK(IPW3), ZROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', ZROWS, KS, $ DIM4, ONE, WORK( IPW2 ), $ ZROWS, WORK( IPU ), LNWIN, $ ONE, WORK(IPW3), ZROWS ) CALL DLAMOV( 'All', ZROWS, KS, $ WORK(IPW3), ZROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF END IF * * Compute Z1*U12 + Z2*U22 on the right side * of border. * IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, DESCZ, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', ZROWS, DIM4, $ WORK(IPW2), ZROWS, $ WORK( IPW3 ), ZROWS ) CALL DTRMM( 'Right', 'Lower', $ 'No transpose', $ 'Non-unit', ZROWS, DIM4, $ ONE, WORK( IPU+LNWIN*KS ), $ LNWIN, WORK( IPW3 ), ZROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', ZROWS, DIM4, $ KS, ONE, $ WORK( IPW2+ZROWS*(DIM4)), $ ZROWS, $ WORK( IPU+LNWIN*KS+DIM4 ), $ LNWIN, ONE, WORK( IPW3 ), $ ZROWS ) CALL DLAMOV( 'All', ZROWS, DIM4, $ WORK(IPW3), ZROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF END IF 290 CONTINUE END IF * IF( DIR.EQ.1 .AND. WANTT .AND. LENRBUF.GT.0) THEN IF ( LKBOT.LT.N ) THEN * * Compute U21**T*H2 + U11**T*H1 on the upper * side of the border. * IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC4.AND. $ MOD(LKBOT,NB).NE.0 ) THEN INDX = LKBOT + 1 CALL INFOG2L( LKTOP, INDX, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC4 ) CALL DLAMOV( 'All', KS, HCOLS, $ WORK( IPW1+DIM4 ), LNWIN, $ WORK(IPW3), KS ) CALL DTRMM( 'Left', 'Upper', 'Transpose', $ 'Non-unit', KS, HCOLS, ONE, $ WORK( IPU+DIM4 ), LNWIN, $ WORK(IPW3), KS ) CALL DGEMM( 'Transpose', 'No transpose', $ KS, HCOLS, DIM4, ONE, WORK(IPU), $ LNWIN, WORK(IPW1), LNWIN, $ ONE, WORK(IPW3), KS ) CALL DLAMOV( 'All', KS, HCOLS, $ WORK(IPW3), KS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF * * Compute U12**T*H1 + U22**T*H2 one the lower * side of the border. * IF( MYROW.EQ.RSRC4.AND.MYCOL.EQ.CSRC4.AND. $ MOD(LKBOT,NB).NE.0 ) THEN INDX = LKBOT + 1 CALL INFOG2L( LKTOP+DIM1, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC4, CSRC4 ) CALL DLAMOV( 'All', DIM4, HCOLS, $ WORK( IPW1 ), LNWIN, $ WORK( IPW3 ), DIM4 ) CALL DTRMM( 'Left', 'Lower', 'Transpose', $ 'Non-unit', DIM4, HCOLS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW3 ), DIM4 ) CALL DGEMM( 'Transpose', 'No Transpose', $ DIM4, HCOLS, KS, ONE, $ WORK( IPU+LNWIN*KS+DIM4 ), LNWIN, $ WORK( IPW1+DIM1 ), LNWIN, $ ONE, WORK( IPW3), DIM4 ) CALL DLAMOV( 'All', DIM4, HCOLS, $ WORK(IPW3), DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF * * Compute U21**T*H2 + U11**T*H1 on upper side * on border. * INDXS = ICEIL(LKBOT,NB)*NB+1 IF( MOD(LKBOT,NB).NE.0 ) THEN INDXE = MIN(N,INDXS+(NPCOL-2)*NB) ELSE INDXE = MIN(N,INDXS+(NPCOL-1)*NB) END IF DO 300 INDX = INDXS, INDXE, NB IF( MYROW.EQ.RSRC1 ) THEN CALL INFOG2L( LKTOP, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL DLAMOV( 'All', KS, HCOLS, $ WORK( IPW1+DIM4 ), LNWIN, $ WORK(IPW3), KS ) CALL DTRMM( 'Left', 'Upper', $ 'Transpose', 'Non-unit', $ KS, HCOLS, ONE, $ WORK( IPU+DIM4 ), LNWIN, $ WORK(IPW3), KS ) CALL DGEMM( 'Transpose', $ 'No transpose', KS, HCOLS, $ DIM4, ONE, WORK(IPU), LNWIN, $ WORK(IPW1), LNWIN, ONE, $ WORK(IPW3), KS ) CALL DLAMOV( 'All', KS, HCOLS, $ WORK(IPW3), KS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF * * Compute U12**T*H1 + U22**T*H2 on lower * side of border. * IF( MYROW.EQ.RSRC4 ) THEN CALL INFOG2L( LKTOP+DIM1, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC4, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL DLAMOV( 'All', DIM4, HCOLS, $ WORK( IPW1 ), LNWIN, $ WORK( IPW3 ), DIM4 ) CALL DTRMM( 'Left', 'Lower', $ 'Transpose','Non-unit', $ DIM4, HCOLS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW3 ), DIM4 ) CALL DGEMM( 'Transpose', $ 'No Transpose', DIM4, HCOLS, $ KS, ONE, $ WORK( IPU+LNWIN*KS+DIM4 ), $ LNWIN, WORK( IPW1+DIM1 ), $ LNWIN, ONE, WORK( IPW3), $ DIM4 ) CALL DLAMOV( 'All', DIM4, HCOLS, $ WORK(IPW3), DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 300 CONTINUE END IF END IF END IF * * Update window information - mark processed windows are * completed. * IF( DIR.EQ.2 ) THEN IF( LKBOT.EQ.KBOT ) THEN LKTOP = KBOT+1 LKBOT = KBOT+1 IWORK( 1+(WIN-1)*5 ) = LKTOP IWORK( 2+(WIN-1)*5 ) = LKBOT ELSE LKTOP = MIN( LKTOP + LNWIN - LCHAIN, $ MIN( KBOT, ICEIL( LKBOT, NB )*NB ) - $ LCHAIN + 1 ) IWORK( 1+(WIN-1)*5 ) = LKTOP LKBOT = MIN( MAX( LKBOT + LNWIN - LCHAIN, $ LKTOP + NWIN - 1), MIN( KBOT, $ ICEIL( LKBOT, NB )*NB ) ) IWORK( 2+(WIN-1)*5 ) = LKBOT END IF IF( IWORK( 5+(WIN-1)*5 ).EQ.1 ) $ IWORK( 5+(WIN-1)*5 ) = 2 IWORK( 3+(WIN-1)*5 ) = RSRC4 IWORK( 4+(WIN-1)*5 ) = CSRC4 END IF * * If nothing was done for the WIN:th window, all * processors come here and consider the next one * instead. * 245 CONTINUE 240 CONTINUE 190 CONTINUE 150 CONTINUE 140 CONTINUE * * Chased off bulges from first window? * IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', '1-Tree', 1, 1, ICHOFF, 1, $ -1, -1, -1, -1, -1 ) * * If the bulge was chasen off from first window it is removed. * IF( ICHOFF.GT.0 ) THEN DO 198 WIN = 2, ANMWIN IWORK( 1+(WIN-2)*5 ) = IWORK( 1+(WIN-1)*5 ) IWORK( 2+(WIN-2)*5 ) = IWORK( 2+(WIN-1)*5 ) IWORK( 3+(WIN-2)*5 ) = IWORK( 3+(WIN-1)*5 ) IWORK( 4+(WIN-2)*5 ) = IWORK( 4+(WIN-1)*5 ) 198 CONTINUE ANMWIN = ANMWIN - 1 IPIW = 6+(ANMWIN-1)*5 END IF * * If we have no more windows, return. * IF( ANMWIN.LT.1 ) RETURN * * Check for any more windows to bring over the border. * WINFIN = 0 DO 199 WIN = 1, ANMWIN WINFIN = WINFIN+IWORK( 5+(WIN-1)*5 ) 199 CONTINUE IF( WINFIN.LT.2*ANMWIN ) GO TO 137 * * Zero out process mark for each window - this is legal now when * the process starts over with local bulge-chasing etc. * DO 201 WIN = 1, ANMWIN IWORK( 5+(WIN-1)*5 ) = 0 201 CONTINUE * END IF * * Go back to local bulge-chase and see if there is more work to do. * GO TO 20 * * End of PDLAQR5 * END scalapack-2.0.2/SRC/pdlaqsy.f000644 000766 000024 00000032077 10363532303 016173 0ustar00juliestaff000000 000000 SUBROUTINE PDLAQSY( UPLO, N, A, IA, JA, DESCA, SR, SC, SCOND, $ AMAX, EQUED ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER EQUED, UPLO INTEGER IA, JA, N DOUBLE PRECISION AMAX, SCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), SC( * ), SR( * ) * .. * * Purpose * ======= * * PDLAQSY equilibrates a symmetric distributed matrix * sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the scaling factors in the * vectors SR and SC. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric distributed matrix sub( A ) is to be referenced: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (input/output) DOUBLE PRECISION pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the distributed symmetric * matrix sub( A ). If UPLO = 'U', the leading N-by-N upper * triangular part of sub( A ) contains the upper triangular * part of the matrix, and the strictly lower triangular part * of sub( A ) is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular part of the matrix, and the strictly upper trian- * gular part of sub( A ) is not referenced. * On exit, if EQUED = 'Y', the equilibrated matrix: * diag(SR(IA:IA+N-1)) * sub( A ) * diag(SC(JA:JA+N-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * SR (local input) DOUBLE PRECISION array, dimension LOCr(M_A) * The scale factors for A(IA:IA+M-1,JA:JA+N-1). SR is aligned * with the distributed matrix A, and replicated across every * process column. SR is tied to the distributed matrix A. * * SC (local input) DOUBLE PRECISION array, dimension LOCc(N_A) * The scale factors for sub( A ). SC is aligned with the dis- * tributed matrix A, and replicated down every process row. * SC is tied to the distributed matrix A. * * SCOND (global input) DOUBLE PRECISION * Ratio of the smallest SR(i) (respectively SC(j)) to the * largest SR(i) (respectively SC(j)), with IA <= i <= IA+N-1 * and JA <= j <= JA+N-1. * * AMAX (global input) DOUBLE PRECISION * Absolute value of the largest distributed submatrix entry. * * EQUED (output) CHARACTER*1 * Specifies whether or not equilibration was done. * = 'N': No equilibration. * = 'Y': Equilibration was done, i.e., sub( A ) has been re- * placed by: * diag(SR(IA:IA+N-1)) * sub( A ) * diag(SC(JA:JA+N-1)). * * Internal Parameters * =================== * * THRESH is a threshold value used to decide if scaling should be done * based on the ratio of the scaling factors. If SCOND < THRESH, * scaling is done. * * LARGE and SMALL are threshold values used to decide if scaling should * be done based on the absolute size of the largest matrix element. * If AMAX > LARGE or AMAX < SMALL, scaling is done. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, THRESH PARAMETER ( ONE = 1.0D+0, THRESH = 0.1D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, II, IIA, IOFFA, IROFF, J, $ JB, JJ, JJA, JN, KK, LDA, LL, MYCOL, MYROW, NP, $ NPCOL, NPROW DOUBLE PRECISION CJ, LARGE, SMALL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, LSAME, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) THEN EQUED = 'N' RETURN END IF * * Get grid parameters and compute local indexes * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) LDA = DESCA( LLD_ ) * * Initialize LARGE and SMALL. * SMALL = PDLAMCH( ICTXT, 'Safe minimum' ) / $ PDLAMCH( ICTXT, 'Precision' ) LARGE = ONE / SMALL * IF( SCOND.GE.THRESH .AND. AMAX.GE.SMALL .AND. AMAX.LE.LARGE ) THEN * * No equilibration * EQUED = 'N' * ELSE * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Replace A by diag(S) * A * diag(S). * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangle of A(IA:IA+N-1,JA:JA+N-1) is stored. * Handle first block separately * IOFFA = (JJ-1)*LDA IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 20 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 10 KK = IIA, II+LL-JJ+1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE ELSE IOFFA = IOFFA + JB*LDA END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 70 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 40 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 30 KK = IIA, II+LL-JJ+1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE ELSE DO 60 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 50 KK = IIA, II-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 70 CONTINUE * ELSE * * Lower triangle of A(IA:IA+N-1,JA:JA+N-1) is stored. * Handle first block separately * IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFF * IOFFA = (JJ-1)*LDA IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 90 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 80 KK = II+LL-JJ, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE ELSE DO 110 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 100 KK = II, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 160 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 130 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 120 KK = II+LL-JJ, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 120 CONTINUE IOFFA = IOFFA + LDA 130 CONTINUE ELSE DO 150 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 140 KK = II, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 140 CONTINUE IOFFA = IOFFA + LDA 150 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 160 CONTINUE * END IF * EQUED = 'Y' * END IF * RETURN * * End of PDLAQSY * END scalapack-2.0.2/SRC/pdlared1d.f000644 000766 000024 00000015075 10367447133 016367 0ustar00juliestaff000000 000000 SUBROUTINE PDLARED1D( N, IA, JA, DESC, BYCOL, BYALL, WORK, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 12, 2005 * * .. Scalar Arguments .. INTEGER IA, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESC( * ) DOUBLE PRECISION BYALL( * ), BYCOL( * ), WORK( LWORK ) * .. * * Purpose * ======= * * PDLARED1D redistributes a 1D array * * It assumes that the input array, BYCOL, is distributed across * rows and that all process columns contain the same copy of * BYCOL. The output array, BYALL, will be identical on all processes * and will contain the entire array. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NP = Number of local rows in BYCOL() * * N (global input) INTEGER * The number of elements to be redistributed. N >= 0. * * IA (global input) INTEGER * IA must be equal to 1 * * JA (global input) INTEGER * JA must be equal to 1 * * DESC (global/local input) INTEGER Array of dimension DLEN_ * A 2D array descriptor, which describes BYCOL * * BYCOL (local input) distributed block cyclic DOUBLE PRECISION array * global dimension (N), local dimension (NP) * BYCOL is distributed across the process rows * All process columns are assumed to contain the same value * * BYALL (global output) DOUBLE PRECISION global dimension( N ) * local dimension (N) * BYALL is exactly duplicated on all processes * It contains the same values as BYCOL, but it is replicated * across all processes rather than being distributed * * BYALL(i) = BYCOL( NUMROC(i,DESC( NB_ ),MYROW,0,NPROW ) on the procs * whose MYROW == mod((i-1)/DESC( NB_ ),NPROW) * * WORK (local workspace) DOUBLE PRECISION dimension (LWORK) * Used to hold the buffers sent from one process to another * * LWORK (local input) INTEGER size of WORK array * LWORK >= NUMROC(N, DESC( NB_ ), 0, 0, NPCOL) * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ALLI, BUFLEN, I, II, MYCOL, MYROW, NB, NPCOL, $ NPROW, PCOL * .. * .. External Functions .. * INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. * EXTERNAL BLACS_GRIDINFO, DCOPY, DGEBR2D, DGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESC( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) NB = DESC( MB_ ) * * DO 30 PCOL = 0, NPCOL - 1 BUFLEN = NUMROC( N, NB, PCOL, 0, NPCOL ) IF( MYCOL.EQ.PCOL ) THEN CALL DCOPY( BUFLEN, BYCOL, 1, WORK, 1 ) CALL DGEBS2D( DESC( CTXT_ ), 'R', ' ', 1, BUFLEN, WORK, 1 ) ELSE CALL DGEBR2D( DESC( CTXT_ ), 'R', ' ', 1, BUFLEN, WORK, 1, $ MYROW, PCOL ) END IF * ALLI = PCOL*NB DO 20 II = 1, BUFLEN, NB DO 10 I = 1, MIN( NB, BUFLEN-II+1 ) BYALL( ALLI+I ) = WORK( II-1+I ) 10 CONTINUE ALLI = ALLI + NB*NPCOL 20 CONTINUE 30 CONTINUE * RETURN * * End of PDLARED1D * END scalapack-2.0.2/SRC/pdlared2d.f000644 000766 000024 00000015131 10367447133 016361 0ustar00juliestaff000000 000000 SUBROUTINE PDLARED2D( N, IA, JA, DESC, BYROW, BYALL, WORK, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 12, 2005 * * .. Scalar Arguments .. INTEGER IA, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESC( * ) DOUBLE PRECISION BYALL( * ), BYROW( * ), WORK( LWORK ) * .. * * Purpose * ======= * * PDLARED2D redistributes a 1D array * * It assumes that the input array, BYROW, is distributed across * columns and that all process rows contain the same copy of * BYROW. The output array, BYALL, will be identical on all processes * and will contain the entire array. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NP = Number of local rows in BYROW() * * N (global input) INTEGER * The number of elements to be redistributed. N >= 0. * * IA (global input) INTEGER * IA must be equal to 1 * * JA (global input) INTEGER * JA must be equal to 1 * * DESC (global/local input) INTEGER Array of dimension DLEN_ * A 2D array descriptor, which describes BYROW * * BYROW (local input) distributed block cyclic DOUBLE PRECISION array * global dimension (N), local dimension (NP) * BYROW is distributed across the process columns * All process rows are assumed to contain the same value * * BYALL (global output) DOUBLE PRECISION global dimension( N ) * local dimension (N) * BYALL is exactly duplicated on all processes * It contains the same values as BYROW, but it is replicated * across all processes rather than being distributed * * BYALL(i) = BYROW( NUMROC(i,DESC( MB_ ),MYCOL,0,NPCOL ) on the procs * whose MYCOL == mod((i-1)/DESC( MB_ ),NPCOL) * * WORK (local workspace) DOUBLE PRECISION dimension (LWORK) * Used to hold the buffers sent from one process to another * * LWORK (local input) INTEGER size of WORK array * LWORK >= NUMROC(N, DESC( MB_ ), 0, 0, NPROW) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ALLI, BUFLEN, I, II, MB, MYCOL, MYROW, NPCOL, $ NPROW, PROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DGEBR2D, DGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESC( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) MB = DESC( MB_ ) * DO 30 PROW = 0, NPROW - 1 BUFLEN = NUMROC( N, MB, PROW, 0, NPROW ) IF( MYROW.EQ.PROW ) THEN CALL DCOPY( BUFLEN, BYROW, 1, WORK, 1 ) CALL DGEBS2D( DESC( CTXT_ ), 'C', ' ', BUFLEN, 1, WORK, $ BUFLEN ) ELSE CALL DGEBR2D( DESC( CTXT_ ), 'C', ' ', BUFLEN, 1, WORK, $ BUFLEN, PROW, MYCOL ) END IF * ALLI = PROW*MB DO 20 II = 1, BUFLEN, MB DO 10 I = 1, MIN( MB, BUFLEN-II+1 ) BYALL( ALLI+I ) = WORK( II-1+I ) 10 CONTINUE ALLI = ALLI + MB*NPROW 20 CONTINUE 30 CONTINUE * RETURN * * End of PSLARED2D * END scalapack-2.0.2/SRC/pdlarf.f000644 000766 000024 00000067756 10363532303 016002 0ustar00juliestaff000000 000000 SUBROUTINE PDLARF( SIDE, M, N, V, IV, JV, DESCV, INCV, TAU, $ C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) DOUBLE PRECISION C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PDLARF applies a real elementary reflector Q (or Q**T) to a real * M-by-N distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), from * either the left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a real scalar and v is a real vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also have the first row of sub( C ). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also have the first column of sub( C ) and * MOD(JV-1,NB_V) must be equal to MOD(JC-1,NB_C), if INCV = 1 only the * last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q * sub( C ), * = 'R': form sub( C ) * Q, Q = Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * V (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+M-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+M-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+N-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+N-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q * sub( C ) if SIDE = 'L', or * sub( C ) * Q if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL, ICOFF, ICROW, ICTXT, IIC, IIV, IOFFC, $ IOFFV, IPW, IROFF, IVCOL, IVROW, JJC, JJV, LDC, $ LDV, MYCOL, MYROW, MP, NCC, NCV, NPCOL, NPROW, $ NQ, RDEST DOUBLE PRECISION TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DGEBR2D, DGEBS2D, $ DGEMV, DGER, DGERV2D, DGESD2D, $ DGSUM2D, DLASET, INFOG2L, PB_TOPGET, $ PBDTRNV * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) JJC = MIN( JJC, NCC ) JJV = MIN( JJV, NCV ) IOFFC = IIC+(JJC-1)*LDC IOFFV = IIV+(JJV-1)*LDV * IROFF = MOD( IC-1, DESCC( MB_ ) ) ICOFF = MOD( JC-1, DESCC( NB_ ) ) MP = NUMROC( M+IROFF, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYROW.EQ.ICROW ) $ MP = MP - IROFF IF( MYCOL.EQ.ICCOL ) $ NQ = NQ - ICOFF * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFF) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFF) ) * IF( LSAME( SIDE, 'L' ) ) THEN * IF( CRBLCK ) THEN RDEST = ICROW ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V * IPW = MP+1 CALL PBDTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL DGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL DGER( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * TAULOC = TAU( JJV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL DGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, V( IOFFV ), 1, $ ZERO, WORK, 1 ) ELSE CALL DLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK, MAX( 1, NQ ) ) END IF CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK, MAX( 1, NQ ), RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL DGER( MP, NQ, -TAULOC, V( IOFFV ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL DCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL DGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL ) * ELSE IF( MYCOL.EQ.ICCOL ) THEN * IPW = MP+1 CALL DGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL DGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL DGER( MP, NQ, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V * IPW = MP+1 CALL PBDTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN IF( IOFFC.GT.0 ) $ CALL DGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( IOFFC.GT.0 ) $ CALL DGER( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL DCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK(IPW) = TAU( JJV ) CALL DGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( JJV ) * ELSE * IPW = MP+1 CALL DGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN IF( IOFFC.GT.0 ) $ CALL DGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( IOFFC.GT.0 ) $ CALL DGER( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW ) THEN * TAULOC = TAU( IIV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL DGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, V( IOFFV ), LDV, $ ZERO, WORK, 1 ) ELSE CALL DLASET( 'All', MP, 1, ZERO, ZERO, $ WORK, MAX( 1, MP ) ) END IF CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK, MAX( 1, MP ), RDEST, ICCOL ) * * sub( C ) := sub( C ) - w * v' * IF( IOFFV.GT.0 .AND. IOFFC.GT.0 ) $ CALL DGER( MP, NQ, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW * IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 CALL DCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL DGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW ) THEN * IPW = NQ+1 CALL DGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL DGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL DGER( MP, NQ, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V * IPW = NQ+1 CALL PBDTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = TAU( JJV ) * ELSE * CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL DGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL DGER( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 IF( IOFFV.GT.0 ) $ CALL DCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL DGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( IIV ) * ELSE * IPW = NQ+1 CALL DGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL DGEMV( 'No Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * IF( IOFFC.GT.0 ) $ CALL DGER( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V * IPW = NQ+1 CALL PBDTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = TAU( JJV ) * ELSE * CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL DGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', MP, 1, ZERO, ZERO, WORK( IPW ), $ MAX( 1, MP ) ) END IF CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL DGER( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, 1, $ C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PDLARF * END scalapack-2.0.2/SRC/pdlarfb.f000644 000766 000024 00000103647 11750130340 016126 0ustar00juliestaff000000 000000 SUBROUTINE PDLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, IV, $ JV, DESCV, T, C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, DIRECT, STOREV INTEGER IC, IV, JC, JV, K, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) DOUBLE PRECISION C( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PDLARFB applies a real block reflector Q or its transpose Q**T to a * real distributed M-by-N matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1) * from the left or the right. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * DIRECT (global input) CHARACTER * Indicates how Q is formed from a product of elementary * reflectors * = 'F': Q = H(1) H(2) . . . H(k) (Forward) * = 'B': Q = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Indicates how the vectors which define the elementary * reflectors are stored: * = 'C': Columnwise * = 'R': Rowwise * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The order of the matrix T (= the number of elementary * reflectors whose product defines the block reflector). * * V (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension ( LLD_V, LOCc(JV+K-1) ) if * STOREV = 'C', ( LLD_V, LOCc(JV+M-1)) if STOREV = 'R' and * SIDE = 'L', ( LLD_V, LOCc(JV+N-1) ) if STOREV = 'R' and * SIDE = 'R'. It contains the local pieces of the distributed * vectors V representing the Householder transformation. * See further details. * If STOREV = 'C' and SIDE = 'L', LLD_V >= MAX(1,LOCr(IV+M-1)); * if STOREV = 'C' and SIDE = 'R', LLD_V >= MAX(1,LOCr(IV+N-1)); * if STOREV = 'R', LLD_V >= LOCr(IV+K-1). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * T (local input) DOUBLE PRECISION array, dimension MB_V by MB_V * if STOREV = 'R' and NB_V by NB_V if STOREV = 'C'. The trian- * gular matrix T in the representation of the block reflector. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the M-by-N distributed matrix sub( C ). On exit, * sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) or * sub( C )*Q or sub( C )*Q'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * If STOREV = 'C', * if SIDE = 'L', * LWORK >= ( NqC0 + MpC0 ) * K * else if SIDE = 'R', * LWORK >= ( NqC0 + MAX( NpV0 + NUMROC( NUMROC( N+ICOFFC, * NB_V, 0, 0, NPCOL ), NB_V, 0, 0, LCMQ ), * MpC0 ) ) * K * end if * else if STOREV = 'R', * if SIDE = 'L', * LWORK >= ( MpC0 + MAX( MqV0 + NUMROC( NUMROC( M+IROFFC, * MB_V, 0, 0, NPROW ), MB_V, 0, 0, LCMP ), * NqC0 ) ) * K * else if SIDE = 'R', * LWORK >= ( MpC0 + NqC0 ) * K * end if * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFV = MOD( IV-1, MB_V ), ICOFFV = MOD( JV-1, NB_V ), * IVROW = INDXG2P( IV, MB_V, MYROW, RSRC_V, NPROW ), * IVCOL = INDXG2P( JV, NB_V, MYCOL, CSRC_V, NPCOL ), * MqV0 = NUMROC( M+ICOFFV, NB_V, MYCOL, IVCOL, NPCOL ), * NpV0 = NUMROC( N+IROFFV, MB_V, MYROW, IVROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NpC0 = NUMROC( N+ICOFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If STOREV = 'Columnwise' * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if STOREV = 'Rowwise' * If SIDE = 'Left', * ( NB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL FORWARD CHARACTER COLBTOP, ROWBTOP, TRANST, UPLO INTEGER HEIGHT, IBASE, ICCOL, ICOFFC, ICOFFV, ICROW, $ ICTXT, II, IIBEG, IIC, IIEND, IINXT, IIV, $ ILASTCOL, ILASTROW, ILEFT, IOFF, IOFFC, IOFFV, $ IPT, IPV, IPW, IPW1, IRIGHT, IROFFC, IROFFV, $ ITOP, IVCOL, IVROW, JJ, JJBEG, JJC, JJEND, $ JJNXT, JJV, KP, KQ, LDC, LDV, LV, LW, MBV, MPC, $ MPC0, MQV, MQV0, MYCOL, MYDIST, MYROW, NBV, $ NPV, NPV0, NPCOL, NPROW, NQC, NQC0, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D,DGEMM, $ DGSUM2D, DLAMOV, DLASET, DTRBR2D, $ DTRBS2D, DTRMM, INFOG1L, INFOG2L, PB_TOPGET, $ PBDTRAN * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 .OR. K.LE.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( TRANS, 'N' ) ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF FORWARD = LSAME( DIRECT, 'F' ) IF( FORWARD ) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) MBV = DESCV( MB_ ) NBV = DESCV( NB_ ) IROFFV = MOD( IV-1, MBV ) ICOFFV = MOD( JV-1, NBV ) MPC = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYCOL.EQ.ICCOL ) $ NQC = NQC - ICOFFC IF( MYROW.EQ.ICROW ) $ MPC = MPC - IROFFC JJC = MIN( JJC, MAX( 1, JJC+NQC-1 ) ) JJV = MIN( JJV, MAX( 1, NUMROC( DESCV( N_ ), NBV, MYCOL, $ DESCV( CSRC_ ), NPCOL ) ) ) IOFFC = IIC + ( JJC-1 ) * LDC IOFFV = IIV + ( JJV-1 ) * LDV * IF( LSAME( STOREV, 'C' ) ) THEN * * V is stored columnwise * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * Locally V( IOFFV ) is MPV x K, C( IOFFC ) is MPC x NQC * WORK( IPV ) is MPC x K = V( IOFFV ), MPC = MPV * WORK( IPW ) is NQC x K = C( IOFFC )' * V( IOFFV ) * IPV = 1 IPW = IPV + MPC * K LV = MAX( 1, MPC ) LW = MAX( 1, NQC ) * * Broadcast V to the other process columns. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ROWBTOP, MPC, K, $ V( IOFFV ), LDV ) IF( MYROW.EQ.IVROW ) $ CALL DTRBS2D( ICTXT, 'Rowwise', ROWBTOP, UPLO, $ 'Non unit', K, K, T, NBV ) CALL DLAMOV( 'All', MPC, K, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ROWBTOP, MPC, K, $ WORK( IPV ), LV, MYROW, IVCOL ) IF( MYROW.EQ.IVROW ) $ CALL DTRBR2D( ICTXT, 'Rowwise', ROWBTOP, UPLO, $ 'Non unit', K, K, T, NBV, MYROW, IVCOL ) END IF * IF( FORWARD ) THEN * * WORK(IPV) = ( V1 ) where V1 is unit lower triangular, * ( V2 ) zeroes upper triangular part of V1 * MYDIST = MOD( MYROW-IVROW+NPROW, NPROW ) ITOP = MAX( 0, MYDIST*MBV - IROFFV ) IIBEG = IIV IIEND = IIBEG + MPC - 1 IINXT = MIN( ICEIL( IIBEG, MBV )*MBV, IIEND ) * 10 CONTINUE IF( K-ITOP .GT.0 ) THEN CALL DLASET( 'Upper', IINXT-IIBEG+1, K-ITOP, ZERO, $ ONE, WORK( IPV+IIBEG-IIV+ITOP*LV ), LV ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IROFFV IIBEG = IINXT + 1 IINXT = MIN( IINXT+MBV, IIEND ) GO TO 10 END IF * ELSE * * WORK(IPV) = ( V1 ) where V2 is unit upper triangular, * ( V2 ) zeroes lower triangular part of V2 * JJ = JJV IOFF = MOD( IV+M-K-1, MBV ) CALL INFOG1L( IV+M-K, MBV, NPROW, MYROW, DESCV( RSRC_ ), $ II, ILASTROW ) KP = NUMROC( K+IOFF, MBV, MYROW, ILASTROW, NPROW ) IF( MYROW.EQ.ILASTROW ) $ KP = KP - IOFF MYDIST = MOD( MYROW-ILASTROW+NPROW, NPROW ) ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP+MBV, K ) ITOP = MIN( MAX( 0, ITOP ), K ) * 20 CONTINUE IF( JJ.LE.( JJV+K-1 ) ) THEN HEIGHT = IBASE - ITOP CALL DLASET( 'All', KP, ITOP-JJ+JJV, ZERO, ZERO, $ WORK( IPV+II-IIV+(JJ-JJV)*LV ), LV ) CALL DLASET( 'Lower', KP, HEIGHT, ZERO, ONE, $ WORK( IPV+II-IIV+ITOP*LV ), LV ) KP = MAX( 0, KP - HEIGHT ) II = II + HEIGHT JJ = JJV + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP + MBV, K ) ITOP = MIN( ITOP, K ) GO TO 20 END IF * END IF * * WORK( IPW ) = C( IOFFC )' * V (NQC x MPC x K) -> NQC x K * IF( MPC.GT.0 ) THEN CALL DGEMM( 'Transpose', 'No transpose', NQC, K, MPC, $ ONE, C( IOFFC ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL DLASET( 'All', NQC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQC, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * CALL DTRMM( 'Right', UPLO, TRANST, 'Non unit', NQC, K, $ ONE, T, NBV, WORK( IPW ), LW ) CALL DGEBS2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C C - V * W' * C( IOFFC ) = C( IOFFC ) - WORK( IPV ) * WORK( IPW )' * MPC x NQC MPC x K K x NQC * CALL DGEMM( 'No transpose', 'Transpose', MPC, NQC, K, -ONE, $ WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC ), LDC ) * ELSE * * Form sub( C )*Q or sub( C )*Q' * * ICOFFC = IROFFV is required by the current transposition * routine PBDTRAN * NPV0 = NUMROC( N+IROFFV, MBV, MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) THEN NPV = NPV0 - IROFFV ELSE NPV = NPV0 END IF IF( MYCOL.EQ.ICCOL ) THEN NQC0 = NQC + ICOFFC ELSE NQC0 = NQC END IF * * Locally V( IOFFV ) is NPV x K C( IOFFC ) is MPC x NQC * WORK( IPV ) is K x NQC0 = [ . V( IOFFV ) ]' * WORK( IPW ) is NPV0 x K = [ . V( IOFFV )' ]' * WORK( IPT ) is the workspace for PBDTRAN * IPV = 1 IPW = IPV + K * NQC0 IPT = IPW + NPV0 * K LV = MAX( 1, K ) LW = MAX( 1, NPV0 ) * IF( MYCOL.EQ.IVCOL ) THEN IF( MYROW.EQ.IVROW ) THEN CALL DLASET( 'All', IROFFV, K, ZERO, ZERO, $ WORK( IPW ), LW ) IPW1 = IPW + IROFFV CALL DLAMOV( 'All', NPV, K, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) ELSE IPW1 = IPW CALL DLAMOV( 'All', NPV, K, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) END IF * IF( FORWARD ) THEN * * WORK(IPW) = ( . V1' V2' )' where V1 is unit lower * triangular, zeroes upper triangular part of V1 * MYDIST = MOD( MYROW-IVROW+NPROW, NPROW ) ITOP = MAX( 0, MYDIST*MBV - IROFFV ) IIBEG = IIV IIEND = IIBEG + NPV - 1 IINXT = MIN( ICEIL( IIBEG, MBV )*MBV, IIEND ) * 30 CONTINUE IF( ( K-ITOP ).GT.0 ) THEN CALL DLASET( 'Upper', IINXT-IIBEG+1, K-ITOP, ZERO, $ ONE, WORK( IPW1+IIBEG-IIV+ITOP*LW ), $ LW ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IROFFV IIBEG = IINXT + 1 IINXT = MIN( IINXT+MBV, IIEND ) GO TO 30 END IF * ELSE * * WORK( IPW ) = ( . V1' V2' )' where V2 is unit upper * triangular, zeroes lower triangular part of V2. * JJ = JJV CALL INFOG1L( IV+N-K, MBV, NPROW, MYROW, $ DESCV( RSRC_ ), II, ILASTROW ) IOFF = MOD( IV+N-K-1, MBV ) KP = NUMROC( K+IOFF, MBV, MYROW, ILASTROW, NPROW ) IF( MYROW.EQ.ILASTROW ) $ KP = KP - IOFF MYDIST = MOD( MYROW-ILASTROW+NPROW, NPROW ) ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP+MBV, K ) ITOP = MIN( MAX( 0, ITOP ), K ) * 40 CONTINUE IF( JJ.LE.( JJV+K-1 ) ) THEN HEIGHT = IBASE - ITOP CALL DLASET( 'All', KP, ITOP-JJ+JJV, ZERO, ZERO, $ WORK( IPW1+II-IIV+(JJ-JJV)*LW ), LW ) CALL DLASET( 'Lower', KP, HEIGHT, ZERO, ONE, $ WORK( IPW1+II-IIV+ITOP*LW ), LW ) KP = MAX( 0, KP - HEIGHT ) II = II + HEIGHT JJ = JJV + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP + MBV, K ) ITOP = MIN( ITOP, K ) GO TO 40 END IF END IF END IF * CALL PBDTRAN( ICTXT, 'Columnwise', 'Transpose', N+IROFFV, K, $ MBV, WORK( IPW ), LW, ZERO, WORK( IPV ), LV, $ IVROW, IVCOL, -1, ICCOL, WORK( IPT ) ) * * WORK( IPV ) = ( . V' ) -> WORK( IPV ) = V' is K x NQC * IF( MYCOL.EQ.ICCOL ) $ IPV = IPV + ICOFFC * LV * * WORK( IPW ) becomes MPC x K = C( IOFFC ) * V * WORK( IPW ) = C( IOFFC ) * V (MPC x NQC x K) -> MPC x K * LW = MAX( 1, MPC ) * IF( NQC.GT.0 ) THEN CALL DGEMM( 'No transpose', 'Transpose', MPC, K, NQC, $ ONE, C( IOFFC ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL DLASET( 'All', MPC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN IF( MYROW.EQ.IVROW ) THEN * * Broadcast the block reflector to the other rows. * CALL DTRBS2D( ICTXT, 'Columnwise', ' ', UPLO, $ 'Non unit', K, K, T, NBV ) ELSE CALL DTRBR2D( ICTXT, 'Columnwise', ' ', UPLO, $ 'Non unit', K, K, T, NBV, IVROW, MYCOL ) END IF CALL DTRMM( 'Right', UPLO, TRANS, 'Non unit', MPC, K, $ ONE, T, NBV, WORK( IPW ), LW ) * CALL DGEBS2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C C - W * V' * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC x NQC MPC x K K x NQC * CALL DGEMM( 'No transpose', 'No transpose', MPC, NQC, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC ), LDC ) END IF * ELSE * * V is stored rowwise * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * IROFFC = ICOFFV is required by the current transposition * routine PBDTRAN * MQV0 = NUMROC( M+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN MQV = MQV0 - ICOFFV ELSE MQV = MQV0 END IF IF( MYROW.EQ.ICROW ) THEN MPC0 = MPC + IROFFC ELSE MPC0 = MPC END IF * * Locally V( IOFFV ) is K x MQV, C( IOFFC ) is MPC x NQC * WORK( IPV ) is MPC0 x K = [ . V( IOFFV ) ]' * WORK( IPW ) is K x MQV0 = [ . V( IOFFV ) ] * WORK( IPT ) is the workspace for PBDTRAN * IPV = 1 IPW = IPV + MPC0 * K IPT = IPW + K * MQV0 LV = MAX( 1, MPC0 ) LW = MAX( 1, K ) * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN CALL DLASET( 'All', K, ICOFFV, ZERO, ZERO, $ WORK( IPW ), LW ) IPW1 = IPW + ICOFFV * LW CALL DLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) ELSE IPW1 = IPW CALL DLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) END IF * IF( FORWARD ) THEN * * WORK( IPW ) = ( . V1 V2 ) where V1 is unit upper * triangular, zeroes lower triangular part of V1 * MYDIST = MOD( MYCOL-IVCOL+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBV - ICOFFV ) JJBEG = JJV JJEND = JJV + MQV - 1 JJNXT = MIN( ICEIL( JJBEG, NBV ) * NBV, JJEND ) * 50 CONTINUE IF( ( K-ILEFT ).GT.0 ) THEN CALL DLASET( 'Lower', K-ILEFT, JJNXT-JJBEG+1, ZERO, $ ONE, $ WORK( IPW1+ILEFT+(JJBEG-JJV)*LW ), $ LW ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - ICOFFV JJBEG = JJNXT + 1 JJNXT = MIN( JJNXT+NBV, JJEND ) GO TO 50 END IF * ELSE * * WORK( IPW ) = ( . V1 V2 ) where V2 is unit lower * triangular, zeroes upper triangular part of V2. * II = IIV CALL INFOG1L( JV+M-K, NBV, NPCOL, MYCOL, $ DESCV( CSRC_ ), JJ, ILASTCOL ) IOFF = MOD( JV+M-K-1, NBV ) KQ = NUMROC( K+IOFF, NBV, MYCOL, ILASTCOL, NPCOL ) IF( MYCOL.EQ.ILASTCOL ) $ KQ = KQ - IOFF MYDIST = MOD( MYCOL-ILASTCOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT+NBV, K ) ILEFT = MIN( MAX( 0, ILEFT ), K ) * 60 CONTINUE IF( II.LE.( IIV+K-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL DLASET( 'All', ILEFT-II+IIV, KQ, ZERO, ZERO, $ WORK( IPW1+II-IIV+(JJ-JJV)*LW ), LW ) CALL DLASET( 'Upper', WIDE, KQ, ZERO, ONE, $ WORK( IPW1+ILEFT+(JJ-JJV)*LW ), LW ) KQ = MAX( 0, KQ - WIDE ) II = IIV + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT + NBV, K ) ILEFT = MIN( ILEFT, K ) GO TO 60 END IF END IF END IF * * WORK( IPV ) = WORK( IPW )' (replicated) is MPC0 x K * CALL PBDTRAN( ICTXT, 'Rowwise', 'Transpose', K, M+ICOFFV, $ NBV, WORK( IPW ), LW, ZERO, WORK( IPV ), LV, $ IVROW, IVCOL, ICROW, -1, WORK( IPT ) ) * * WORK( IPV ) = ( . V )' -> WORK( IPV ) = V' is MPC x K * IF( MYROW.EQ.ICROW ) $ IPV = IPV + IROFFC * * WORK( IPW ) becomes NQC x K = C( IOFFC )' * V' * WORK( IPW ) = C( IOFFC )' * V' (NQC x MPC x K) -> NQC x K * LW = MAX( 1, NQC ) * IF( MPC.GT.0 ) THEN CALL DGEMM( 'Transpose', 'No transpose', NQC, K, MPC, $ ONE, C( IOFFC ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL DLASET( 'All', NQC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQC, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN * * Broadcast the block reflector to the other columns. * CALL DTRBS2D( ICTXT, 'Rowwise', ' ', UPLO, 'Non unit', $ K, K, T, MBV ) ELSE CALL DTRBR2D( ICTXT, 'Rowwise', ' ', UPLO, 'Non unit', $ K, K, T, MBV, MYROW, IVCOL ) END IF CALL DTRMM( 'Right', UPLO, TRANST, 'Non unit', NQC, K, $ ONE, T, MBV, WORK( IPW ), LW ) * CALL DGEBS2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C C - V' * W' * C( IOFFC ) = C( IOFFC ) - WORK( IPV ) * WORK( IPW )' * MPC x NQC MPC x K K x NQC * CALL DGEMM( 'No transpose', 'Transpose', MPC, NQC, K, -ONE, $ WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC ), LDC ) * ELSE * * Form Q*sub( C ) or Q'*sub( C ) * * Locally V( IOFFV ) is K x NQV, C( IOFFC ) is MPC x NQC * WORK( IPV ) is K x NQV = V( IOFFV ), NQV = NQC * WORK( IPW ) is MPC x K = C( IOFFC ) * V( IOFFV )' * IPV = 1 IPW = IPV + K * NQC LV = MAX( 1, K ) LW = MAX( 1, MPC ) * * Broadcast V to the other process rows. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) IF( MYROW.EQ.IVROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', COLBTOP, K, NQC, $ V( IOFFV ), LDV ) IF( MYCOL.EQ.IVCOL ) $ CALL DTRBS2D( ICTXT, 'Columnwise', COLBTOP, UPLO, $ 'Non unit', K, K, T, MBV ) CALL DLAMOV( 'All', K, NQC, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', COLBTOP, K, NQC, $ WORK( IPV ), LV, IVROW, MYCOL ) IF( MYCOL.EQ.IVCOL ) $ CALL DTRBR2D( ICTXT, 'Columnwise', COLBTOP, UPLO, $ 'Non unit', K, K, T, MBV, IVROW, MYCOL ) END IF * IF( FORWARD ) THEN * * WORK(IPW) = ( V1 V2 ) where V1 is unit upper * triangular, zeroes lower triangular part of V1 * MYDIST = MOD( MYCOL-IVCOL+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBV - ICOFFV ) JJBEG = JJV JJEND = JJV + NQC - 1 JJNXT = MIN( ICEIL( JJBEG, NBV ) * NBV, JJEND ) * 70 CONTINUE IF( ( K-ILEFT ).GT.0 ) THEN CALL DLASET( 'Lower', K-ILEFT, JJNXT-JJBEG+1, ZERO, $ ONE, WORK( IPV+ILEFT+(JJBEG-JJV)*LV ), $ LV ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - ICOFFV JJBEG = JJNXT + 1 JJNXT = MIN( JJNXT+NBV, JJEND ) GO TO 70 END IF * ELSE * * WORK( IPW ) = ( . V1 V2 ) where V2 is unit lower * triangular, zeroes upper triangular part of V2. * II = IIV CALL INFOG1L( JV+N-K, NBV, NPCOL, MYCOL, DESCV( CSRC_ ), $ JJ, ILASTCOL ) IOFF = MOD( JV+N-K-1, NBV ) KQ = NUMROC( K+IOFF, NBV, MYCOL, ILASTCOL, NPCOL ) IF( MYCOL.EQ.ILASTCOL ) $ KQ = KQ - IOFF MYDIST = MOD( MYCOL-ILASTCOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT+NBV, K ) ILEFT = MIN( MAX( 0, ILEFT ), K ) * 80 CONTINUE IF( II.LE.( IIV+K-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL DLASET( 'All', ILEFT-II+IIV, KQ, ZERO, ZERO, $ WORK( IPV+II-IIV+(JJ-JJV)*LV ), LV ) CALL DLASET( 'Upper', WIDE, KQ, ZERO, ONE, $ WORK( IPV+ILEFT+(JJ-JJV)*LV ), LV ) KQ = MAX( 0, KQ - WIDE ) II = IIV + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT + NBV, K ) ILEFT = MIN( ILEFT, K ) GO TO 80 END IF * END IF * * WORK( IPV ) is K x NQC = V = V( IOFFV ) * WORK( IPW ) = C( IOFFC ) * V' (MPC x NQC x K) -> MPC x K * IF( NQC.GT.0 ) THEN CALL DGEMM( 'No Transpose', 'Transpose', MPC, K, NQC, $ ONE, C( IOFFC ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL DLASET( 'All', MPC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN CALL DTRMM( 'Right', UPLO, TRANS, 'Non unit', MPC, K, $ ONE, T, MBV, WORK( IPW ), LW ) CALL DGEBS2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C C - W * V * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC x NQC MPC x K K x NQC * CALL DGEMM( 'No transpose', 'No transpose', MPC, NQC, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC ), LDC ) * END IF * END IF * RETURN * * End of PDLARFB * END scalapack-2.0.2/SRC/pdlarfg.f000644 000766 000024 00000023302 10363532303 016124 0ustar00juliestaff000000 000000 SUBROUTINE PDLARFG( N, ALPHA, IAX, JAX, X, IX, JX, DESCX, INCX, $ TAU ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAX, INCX, IX, JAX, JX, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. INTEGER DESCX( * ) DOUBLE PRECISION TAU( * ), X( * ) * .. * * Purpose * ======= * * PDLARFG generates a real elementary reflector H of order n, such * that * * H * sub( X ) = H * ( x(iax,jax) ) = ( alpha ), H' * H = I. * ( x ) ( 0 ) * * where alpha is a scalar, and sub( X ) is an (N-1)-element real * distributed vector X(IX:IX+N-2,JX) if INCX = 1 and X(IX,JX:JX+N-2) if * INCX = DESCX(M_). H is represented in the form * * H = I - tau * ( 1 ) * ( 1 v' ) , * ( v ) * * where tau is a real scalar and v is a real (N-1)-element * vector. * * If the elements of sub( X ) are all zero, then tau = 0 and H is * taken to be the unit matrix. * * Otherwise 1 <= tau <= 2. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) INTEGER * The global order of the elementary reflector. N >= 0. * * ALPHA (local output) DOUBLE PRECISION * On exit, alpha is computed in the process scope having the * vector sub( X ). * * IAX (global input) INTEGER * The global row index in X of X(IAX,JAX). * * JAX (global input) INTEGER * The global column index in X of X(IAX,JAX). * * X (local input/local output) DOUBLE PRECISION, pointer into the * local memory to an array of dimension (LLD_X,*). This array * contains the local pieces of the distributed vector sub( X ). * Before entry, the incremented array sub( X ) must contain * the vector x. On exit, it is overwritten with the vector v. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * TAU (local output) DOUBLE PRECISION array, dimension LOCc(JX) * if INCX = 1, and LOCr(IX) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER ICTXT, IIAX, INDXTAU, IXCOL, IXROW, J, JJAX, $ KNT, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION BETA, RSAFMN, SAFMIN, XNORM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, PDSCAL, $ INFOG2L, PDNRM2 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLAPY2 EXTERNAL DLAMCH, DLAPY2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, SIGN * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is distributed across a process row. * CALL INFOG2L( IX, JAX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIAX, JJAX, IXROW, IXCOL ) * IF( MYROW.NE.IXROW ) $ RETURN * * Broadcast X(IAX,JAX) across the process row. * IF( MYCOL.EQ.IXCOL ) THEN J = IIAX+(JJAX-1)*DESCX( LLD_ ) CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, X( J ), 1 ) ALPHA = X( J ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1, $ MYROW, IXCOL ) END IF * INDXTAU = IIAX * ELSE * * sub( X ) is distributed across a process column. * CALL INFOG2L( IAX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIAX, JJAX, IXROW, IXCOL ) * IF( MYCOL.NE.IXCOL ) $ RETURN * * Broadcast X(IAX,JAX) across the process column. * IF( MYROW.EQ.IXROW ) THEN J = IIAX+(JJAX-1)*DESCX( LLD_ ) CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, X( J ), 1 ) ALPHA = X( J ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, ALPHA, 1, $ IXROW, MYCOL ) END IF * INDXTAU = JJAX * END IF * IF( N.LE.0 ) THEN TAU( INDXTAU ) = ZERO RETURN END IF * CALL PDNRM2( N-1, XNORM, X, IX, JX, DESCX, INCX ) * IF( XNORM.EQ.ZERO ) THEN * * H = I * TAU( INDXTAU ) = ZERO * ELSE * * General case * BETA = -SIGN( DLAPY2( ALPHA, XNORM ), ALPHA ) SAFMIN = DLAMCH( 'S' ) RSAFMN = ONE / SAFMIN IF( ABS( BETA ).LT.SAFMIN ) THEN * * XNORM, BETA may be inaccurate; scale X and recompute them * KNT = 0 10 CONTINUE KNT = KNT + 1 CALL PDSCAL( N-1, RSAFMN, X, IX, JX, DESCX, INCX ) BETA = BETA*RSAFMN ALPHA = ALPHA*RSAFMN IF( ABS( BETA ).LT.SAFMIN ) $ GO TO 10 * * New BETA is at most 1, at least SAFMIN * CALL PDNRM2( N-1, XNORM, X, IX, JX, DESCX, INCX ) BETA = -SIGN( DLAPY2( ALPHA, XNORM ), ALPHA ) TAU( INDXTAU ) = ( BETA-ALPHA ) / BETA CALL PDSCAL( N-1, ONE/(ALPHA-BETA), X, IX, JX, DESCX, INCX ) * * If ALPHA is subnormal, it may lose relative accuracy * ALPHA = BETA DO 20 J = 1, KNT ALPHA = ALPHA*SAFMIN 20 CONTINUE ELSE TAU( INDXTAU ) = ( BETA-ALPHA ) / BETA CALL PDSCAL( N-1, ONE/(ALPHA-BETA), X, IX, JX, DESCX, INCX ) ALPHA = BETA END IF END IF * RETURN * * End of PDLARFG * END scalapack-2.0.2/SRC/pdlarft.f000644 000766 000024 00000044357 10363532303 016156 0ustar00juliestaff000000 000000 SUBROUTINE PDLARFT( DIRECT, STOREV, N, K, V, IV, JV, DESCV, TAU, $ T, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIRECT, STOREV INTEGER IV, JV, K, N * .. * .. Array Arguments .. INTEGER DESCV( * ) DOUBLE PRECISION TAU( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PDLARFT forms the triangular factor T of a real block reflector H * of order n, which is defined as a product of k elementary reflectors. * * If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; * * If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. * * If STOREV = 'C', the vector which defines the elementary reflector * H(i) is stored in the i-th column of the distributed matrix V, and * * H = I - V * T * V' * * If STOREV = 'R', the vector which defines the elementary reflector * H(i) is stored in the i-th row of the distributed matrix V, and * * H = I - V' * T * V * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIRECT (global input) CHARACTER*1 * Specifies the order in which the elementary reflectors are * multiplied to form the block reflector: * = 'F': H = H(1) H(2) . . . H(k) (Forward) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER*1 * Specifies how the vectors which define the elementary * reflectors are stored (see also Further Details): * = 'C': columnwise * = 'R': rowwise * * N (global input) INTEGER * The order of the block reflector H. N >= 0. * * K (global input) INTEGER * The order of the triangular factor T (= the number of * elementary reflectors). 1 <= K <= MB_V (= NB_V). * * V (input/output) DOUBLE PRECISION pointer into the local memory * to an array of local dimension (LOCr(IV+N-1),LOCc(JV+K-1)) * if STOREV = 'C', and (LOCr(IV+K-1),LOCc(JV+N-1)) if * STOREV = 'R'. The distributed matrix V contains the * Householder vectors. See further details. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * TAU (local input) DOUBLE PRECISION array, dimension LOCr(IV+K-1) * if INCV = M_V, and LOCc(JV+K-1) otherwise. This array * contains the Householder scalars related to the Householder * vectors. TAU is tied to the distributed matrix V. * * T (local output) DOUBLE PRECISION array, dimension (NB_V,NB_V) * if STOREV = 'Col', and (MB_V,MB_V) otherwise. It contains * the k-by-k triangular factor of the block reflector asso- * ciated with V. If DIRECT = 'F', T is upper triangular; * if DIRECT = 'B', T is lower triangular. * * WORK (local workspace) DOUBLE PRECISION array, * dimension (K*(K-1)/2) * * Further Details * =============== * * The shape of the matrix V and the storage of the vectors which define * the H(i) is best illustrated by the following example with n = 5 and * k = 3. The elements equal to 1 are not stored; the corresponding * array elements are modified but restored on exit. The rest of the * array is not used. * * DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': * * V( IV:IV+N-1, ( 1 ) V( IV:IV+K-1, ( 1 v1 v1 v1 v1 ) * JV:JV+K-1 ) = ( v1 1 ) JV:JV+N-1 ) = ( 1 v2 v2 v2 ) * ( v1 v2 1 ) ( 1 v3 v3 ) * ( v1 v2 v3 ) * ( v1 v2 v3 ) * * DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': * * V( IV:IV+N-1, ( v1 v2 v3 ) V( IV:IV+K-1, ( v1 v1 1 ) * JV:JV+K-1 ) = ( v1 v2 v3 ) JV:JV+N-1 ) = ( v2 v2 v2 1 ) * ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) * ( 1 v3 ) * ( 1 ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL FORWARD INTEGER ICOFF, ICTXT, II, IIV, IROFF, IVCOL, IVROW, $ ITMP0, ITMP1, IW, JJ, JJV, LDV, MICOL, MIROW, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION VII * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DGEMV, DGSUM2D, $ DLASET, DTRMV, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 .OR. K.LE.0 ) $ RETURN * ICTXT = DESCV( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * FORWARD = LSAME( DIRECT, 'F' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, $ IIV, JJV, IVROW, IVCOL ) * IF( LSAME( STOREV, 'C' ) .AND. MYCOL.EQ.IVCOL ) THEN * IW = 1 LDV = DESCV( LLD_ ) IROFF = MOD( IV-1, DESCV( MB_ ) ) * IF( FORWARD ) THEN * * DIRECT = 'Forward', STOREV = 'Columnwise' * NP = NUMROC( N+IROFF, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) THEN NP = NP - IROFF II = IIV + 1 ELSE II = IIV END IF IF( IROFF+1.EQ.DESCV( MB_ ) ) THEN MIROW = MOD( IVROW+1, NPROW ) ELSE MIROW = IVROW END IF ITMP0 = 0 * DO 10 JJ = JJV+1, JJV+K-1 * IF( MYROW.EQ.MIROW ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( jv+i-1 ) * * V(iv+i-1:iv+n-1,jv:jv+i-2)' * V(iv+i-1:iv+n-1,jv+i-1) * ITMP0 = ITMP0 + 1 IF( NP-II+IIV.GT.0 ) THEN CALL DGEMV( 'Transpose', NP-II+IIV, ITMP0, $ -TAU( JJ ), V( II+(JJV-1)*LDV ), LDV, $ V( II+(JJ-1)*LDV ), 1, ZERO, $ WORK( IW ), 1 ) ELSE CALL DLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF * IW = IW + ITMP0 IF( MYROW.EQ.MIROW ) THEN V( II+(JJ-1)*LDV ) = VII II = II + 1 END IF * IF( MOD( IV+ITMP0, DESCV( MB_ ) ).EQ.0 ) $ MIROW = MOD( MIROW+1, NPROW ) * 10 CONTINUE * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', IW-1, 1, WORK, IW-1, $ IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * IW = 1 ITMP0 = 0 ITMP1 = 1 * T( ITMP1 ) = TAU( JJV ) * DO 20 JJ = JJV+1, JJV+K-1 * * T(1:j-1,j) = T(1:j-1,1:j-1) * T(1:j-1,j) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 + DESCV( NB_ ) CALL DCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL DTRMV( 'Upper', 'No transpose', 'Non-unit', $ ITMP0, T, DESCV( NB_ ), T( ITMP1 ), 1 ) T(ITMP1+ITMP0) = TAU( JJ ) * 20 CONTINUE * END IF * ELSE * * DIRECT = 'Backward', STOREV = 'Columnwise' * NP = NUMROC( N+IROFF-1, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ NP = NP - IROFF MIROW = INDXG2P( IV+N-2, DESCV( MB_ ), MYROW, $ DESCV( RSRC_ ), NPROW ) II = IIV + NP - 1 ITMP0 = 0 * DO 30 JJ = JJV+K-2, JJV, -1 * IF( MYROW.EQ.MIROW ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( jv+i-1 ) * * V(iv:iv+n-k+i-1,jv+i:jv+k-1)' * V(iv:iv+n-k+i-1,jv+i-1) * ITMP0 = ITMP0 + 1 IF( II-IIV+1.GT.0 ) THEN CALL DGEMV( 'Transpose', II-IIV+1, ITMP0, -TAU( JJ ), $ V( IIV+JJ*LDV ), LDV, $ V( IIV+(JJ-1)*LDV ), 1, ZERO, $ WORK( IW ), 1 ) ELSE CALL DLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF * IW = IW + ITMP0 IF( MYROW.EQ.MIROW ) THEN V( II+(JJ-1)*LDV ) = VII II = II - 1 END IF * IF( MOD( IV+N-ITMP0-2, DESCV(MB_) ).EQ.0 ) $ MIROW = MOD( MIROW+NPROW-1, NPROW ) * 30 CONTINUE * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', IW-1, 1, WORK, IW-1, $ IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( NB_ ) * T( ITMP1-1 ) = TAU( JJV+K-1 ) * DO 40 JJ = JJV+K-2, JJV, -1 * * T(j+1:k,j) = T(j+1:k,j+1:k) * T(j+1:k,j) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( NB_ ) - 1 CALL DCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL DTRMV( 'Lower', 'No transpose', 'Non-unit', $ ITMP0, T( ITMP1+DESCV( NB_ ) ), $ DESCV( NB_ ), T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( JJ ) * 40 CONTINUE * END IF * END IF * ELSE IF( LSAME( STOREV, 'R' ) .AND. MYROW.EQ.IVROW ) THEN * IW = 1 LDV = DESCV( LLD_ ) ICOFF = MOD( JV-1, DESCV( NB_ ) ) * IF( FORWARD ) THEN * * DIRECT = 'Forward', STOREV = 'Rowwise' * NQ = NUMROC( N+ICOFF, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN NQ = NQ - ICOFF JJ = JJV + 1 ELSE JJ = JJV END IF IF( ICOFF+1.EQ.DESCV( NB_ ) ) THEN MICOL = MOD( IVCOL+1, NPCOL ) ELSE MICOL = IVCOL END IF ITMP0 = 0 * DO 50 II = IIV+1, IIV+K-1 * IF( MYCOL.EQ.MICOL ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( iv+i-1 ) * * V(iv+i-1,jv+i-1:jv+n-1) * V(iv:iv+i-2,jv+i-1:jv+n-1)' * ITMP0 = ITMP0 + 1 IF( NQ-JJ+JJV.GT.0 ) THEN CALL DGEMV( 'No transpose', ITMP0, NQ-JJ+JJV, $ -TAU(II), V( IIV+(JJ-1)*LDV ), LDV, $ V( II+(JJ-1)*LDV ), LDV, ZERO, $ WORK( IW ), 1 ) ELSE CALL DLASET( 'All', ITMP0, 1, ZERO, ZERO, $ WORK( IW ), ITMP0 ) END IF * IW = IW + ITMP0 IF( MYCOL.EQ.MICOL ) THEN V( II+(JJ-1)*LDV ) = VII JJ = JJ + 1 END IF * IF( MOD( JV+ITMP0, DESCV( NB_ ) ).EQ.0 ) $ MICOL = MOD( MICOL+1, NPCOL ) * 50 CONTINUE * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = 1 * T( ITMP1 ) = TAU( IIV ) * DO 60 II = IIV+1, IIV+K-1 * * T(1:i-1,i) = T(1:i-1,1:i-1) * T(1:i-1,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 + DESCV( MB_ ) CALL DCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL DTRMV( 'Upper', 'No transpose', 'Non-unit', $ ITMP0, T, DESCV( MB_ ), T( ITMP1 ), 1 ) T( ITMP1+ITMP0 ) = TAU( II ) * 60 CONTINUE * END IF * ELSE * * DIRECT = 'Backward', STOREV = 'Rowwise' * NQ = NUMROC( N+ICOFF-1, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQ = NQ - ICOFF MICOL = INDXG2P( JV+N-2, DESCV( NB_ ), MYCOL, $ DESCV( CSRC_ ), NPCOL ) JJ = JJV + NQ - 1 ITMP0 = 0 * DO 70 II = IIV+K-2, IIV, -1 * IF( MYCOL.EQ.MICOL ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(i+1:k,i) = -tau( iv+i-1 ) * * V(iv+i:iv+k-1,jv:jv+n-k+i-1)' * V(iv+i-1,jv:jv+n-k+i-1)' * ITMP0 = ITMP0 + 1 IF( JJ-JJV+1.GT.0 ) THEN CALL DGEMV( 'No transpose', ITMP0, JJ-JJV+1, $ -TAU( II ), V( II+1+(JJV-1)*LDV ), LDV, $ V( II+(JJV-1)*LDV ), LDV, ZERO, $ WORK( IW ), 1 ) ELSE CALL DLASET( 'All', ITMP0, 1, ZERO, ZERO, $ WORK( IW ), ITMP0 ) END IF * IW = IW + ITMP0 IF( MYCOL.EQ.MICOL ) THEN V( II+(JJ-1)*LDV ) = VII JJ = JJ - 1 END IF * IF( MOD( JV+N-ITMP0-2, DESCV( NB_ ) ).EQ.0 ) $ MICOL = MOD( MICOL+NPCOL-1, NPCOL ) * 70 CONTINUE * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( MB_ ) * T( ITMP1-1 ) = TAU( IIV+K-1 ) * DO 80 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = T(i+1:k,i+1:k) * T(i+1:k,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( MB_ ) - 1 CALL DCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL DTRMV( 'Lower', 'No transpose', 'Non-unit', $ ITMP0, T( ITMP1+DESCV( MB_ ) ), $ DESCV( MB_ ), T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( II ) * 80 CONTINUE * END IF * END IF * END IF * RETURN * * End of PDLARFT * END scalapack-2.0.2/SRC/pdlarz.f000644 000766 000024 00000101772 10363532303 016011 0ustar00juliestaff000000 000000 SUBROUTINE PDLARZ( SIDE, M, N, L, V, IV, JV, DESCV, INCV, TAU, C, $ IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) DOUBLE PRECISION C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PDLARZ applies a real elementary reflector Q (or Q**T) to a real * M-by-N distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), from * either the left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a real scalar and v is a real vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Q is a product of k elementary reflectors as returned by PDTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also own C(IC+M-L,JC:JC+N-1). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC+N-L-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also own C(IC:IC+M-1,JC+N-L) and * MOD(JV-1,NB_V) must be equal to MOD(JC+N-L-1,NB_C), if INCV = 1 only * the last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q * sub( C ), * = 'R': form sub( C ) * Q, Q = Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+L-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+L-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q * sub( C ) if SIDE = 'L', or * sub( C ) * Q if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK, LEFT CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIC1, IIC2, IIV, IOFFC1, $ IOFFC2, IOFFV, IPW, IROFFC1, IROFFC2, IROFFV, $ IVCOL, IVROW, JJC1, JJC2, JJV, LDC, LDV, MPC2, $ MPV, MYCOL, MYROW, NCC, NCV, NPCOL, NPROW, $ NQC2, NQV, RDEST DOUBLE PRECISION TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DAXPY, DCOPY, DGEBR2D, $ DGEBS2D, DGEMV, DGER, DGERV2D, $ DGESD2D, DGSUM2D, DLASET, INFOG2L, $ PB_TOPGET, PBDTRNV * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * LEFT = LSAME( SIDE, 'L' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) IROFFV = MOD( IV-1, DESCV( NB_ ) ) MPV = NUMROC( L+IROFFV, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ MPV = MPV - IROFFV ICOFFV = MOD( JV-1, DESCV( NB_ ) ) NQV = NUMROC( L+ICOFFV, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, NCV ) IOFFV = IIV+(JJV-1)*LDV NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC1, JJC1, ICROW1, ICCOL1 ) IROFFC1 = MOD( IC-1, DESCC( MB_ ) ) ICOFFC1 = MOD( JC-1, DESCC( NB_ ) ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NCC ) ) IOFFC1 = IIC1 + ( JJC1-1 ) * LDC * IF( LEFT ) THEN CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, DESCC( MB_ ) ) ICOFFC2 = MOD( JC-1, DESCC( NB_ ) ) NQC2 = NUMROC( N+ICOFFC2, DESCC( NB_ ), MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 ELSE CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC-1, DESCC( MB_ ) ) MPC2 = NUMROC( M+IROFFC2, DESCC( MB_ ), MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = MOD( JC+N-L-1, DESCC( NB_ ) ) END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NCC ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFFC1) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFFC1) ) * IF( LEFT ) THEN * IF( CRBLCK ) THEN RDEST = ICROW2 ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V (ICOFFV = IROFFC2) * IPW = MPV+1 CALL PBDTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL DGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL DGER( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL2 ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * TAULOC = TAU( JJV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL DGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, V( IOFFV ), 1, $ ZERO, WORK, 1 ) ELSE CALL DLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK, MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK, MAX( 1, NQC2 ) ) * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK, MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, -TAULOC, WORK, $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL DGER( MPV, NQC2, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL2 * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL DCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL DGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL2 ) * ELSE IF( MYCOL.EQ.ICCOL2 ) THEN * IPW = MPV+1 CALL DGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL DGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), $ RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL DGER( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V (ICOFFV=IROFFC2) * IPW = MPV+1 CALL PBDTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL DGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL DGER( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL DCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL DGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( JJV ) * ELSE * IPW = MPV+1 CALL DGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL DGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL DAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL DGER( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW2 ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW2 ) THEN * TAULOC = TAU( IIV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL DGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, V( IOFFV ), $ LDV, ZERO, WORK, 1 ) ELSE CALL DLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK, MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK, 1 ) * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK, MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) * IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, -TAULOC, WORK, 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * IF( MPC2.GT.0 .AND. NQV.GT.0 ) $ CALL DGER( MPC2, NQV, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC2 ), $ LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW2 * IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL DCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL DGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW2, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW2 ) THEN * IPW = NQV+1 CALL DGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL DGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), $ RDEST, ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL DGER( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V (IROFFV = ICOFFC2) * IPW = NQV+1 CALL PBDTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW2 ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = TAU( JJV ) * ELSE * CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL DGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL DGER( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL DCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL DGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( IIV ) * ELSE * IPW = NQV+1 CALL DGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL DGEMV( 'No Transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL DGER( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V (ICOFFC2=IROFFV) * IPW = NQV+1 CALL PBDTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = TAU( JJV ) * ELSE * CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL DGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL DLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL DAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL DGER( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PDLARZ * END scalapack-2.0.2/SRC/pdlarzb.f000644 000766 000024 00000055554 11750130340 016155 0ustar00juliestaff000000 000000 SUBROUTINE PDLARZB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, V, $ IV, JV, DESCV, T, C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER DIRECT, SIDE, STOREV, TRANS INTEGER IC, IV, JC, JV, K, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) DOUBLE PRECISION C( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PDLARZB applies a real block reflector Q or its transpose Q**T to * a real distributed M-by-N matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1) * from the left or the right. * * Q is a product of k elementary reflectors as returned by PDTZRZF. * * Currently, only STOREV = 'R' and DIRECT = 'B' are supported. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * DIRECT (global input) CHARACTER * Indicates how H is formed from a product of elementary * reflectors * = 'F': H = H(1) H(2) . . . H(k) (Forward, not supported yet) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Indicates how the vectors which define the elementary * reflectors are stored: * = 'C': Columnwise (not supported yet) * = 'R': Rowwise * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The order of the matrix T (= the number of elementary * reflectors whose product defines the block reflector). * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_V, LOCc(JV+M-1)) if SIDE = 'L', * (LLD_V, LOCc(JV+N-1)) if SIDE = 'R'. It contains the local * pieces of the distributed vectors V representing the * Householder transformation as returned by PDTZRZF. * LLD_V >= LOCr(IV+K-1). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * T (local input) DOUBLE PRECISION array, dimension MB_V by MB_V * The lower triangular matrix T in the representation of the * block reflector. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the M-by-N distributed matrix sub( C ). On exit, * sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) or * sub( C )*Q or sub( C )*Q'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * If STOREV = 'C', * if SIDE = 'L', * LWORK >= ( NqC0 + MpC0 ) * K * else if SIDE = 'R', * LWORK >= ( NqC0 + MAX( NpV0 + NUMROC( NUMROC( N+ICOFFC, * NB_V, 0, 0, NPCOL ), NB_V, 0, 0, LCMQ ), * MpC0 ) ) * K * end if * else if STOREV = 'R', * if SIDE = 'L', * LWORK >= ( MpC0 + MAX( MqV0 + NUMROC( NUMROC( M+IROFFC, * MB_V, 0, 0, NPROW ), MB_V, 0, 0, LCMP ), * NqC0 ) ) * K * else if SIDE = 'R', * LWORK >= ( MpC0 + NqC0 ) * K * end if * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFV = MOD( IV-1, MB_V ), ICOFFV = MOD( JV-1, NB_V ), * IVROW = INDXG2P( IV, MB_V, MYROW, RSRC_V, NPROW ), * IVCOL = INDXG2P( JV, NB_V, MYCOL, CSRC_V, NPCOL ), * MqV0 = NUMROC( M+ICOFFV, NB_V, MYCOL, IVCOL, NPCOL ), * NpV0 = NUMROC( N+IROFFV, MB_V, MYROW, IVROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NpC0 = NUMROC( N+ICOFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If STOREV = 'Columnwise' * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if STOREV = 'Rowwise' * If SIDE = 'Left', * ( NB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LEFT CHARACTER COLBTOP, TRANST INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIBEG, IIC1, IIC2, $ IIEND, IINXT, IIV, ILEFT, INFO, IOFFC2, IOFFV, $ IPT, IPV, IPW, IROFFC1, IROFFC2, ITOP, IVCOL, $ IVROW, JJBEG, JJEND, JJNXT, JJC1, JJC2, JJV, $ LDC, LDV, LV, LW, MBC, MBV, MPC1, MPC2, MPC20, $ MQV, MQV0, MYCOL, MYDIST, MYROW, NBC, NBV, $ NPCOL, NPROW, NQC1, NQC2, NQCALL, NQV * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, DGEBR2D, $ DGEBS2D,DGEMM, DGSUM2D, DLAMOV, $ DLASET, DTRBR2D, DTRBS2D, DTRMM, $ INFOG2L, PBDMATADD, PBDTRAN, PB_TOPGET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 .OR. K.LE.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check for currently supported options * INFO = 0 IF( .NOT.LSAME( DIRECT, 'B' ) ) THEN INFO = -3 ELSE IF( .NOT.LSAME( STOREV, 'R' ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDLARZB', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * LEFT = LSAME( SIDE, 'L' ) IF( LSAME( TRANS, 'N' ) ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) MBV = DESCV( MB_ ) NBV = DESCV( NB_ ) ICOFFV = MOD( JV-1, NBV ) NQV = NUMROC( L+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, MAX( 1, NUMROC( DESCV( N_ ), NBV, MYCOL, $ DESCV( CSRC_ ), NPCOL ) ) ) IOFFV = IIV + ( JJV-1 ) * LDV MBC = DESCC( MB_ ) NBC = DESCC( NB_ ) NQCALL = NUMROC( DESCC( N_ ), NBC, MYCOL, DESCC( CSRC_ ), NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC1, $ JJC1, ICROW1, ICCOL1 ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NQCALL ) ) * IF( LEFT ) THEN IROFFC1 = MOD( IC-1, MBC ) MPC1 = NUMROC( K+IROFFC1, MBC, MYROW, ICROW1, NPROW ) IF( MYROW.EQ.ICROW1 ) $ MPC1 = MPC1 - IROFFC1 ICOFFC1 = MOD( JC-1, NBC ) NQC1 = NUMROC( N+ICOFFC1, NBC, MYCOL, ICCOL1, NPCOL ) IF( MYCOL.EQ.ICCOL1 ) $ NQC1 = NQC1 - ICOFFC1 CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, MBC ) MPC2 = NUMROC( L+IROFFC2, MBC, MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = ICOFFC1 NQC2 = NQC1 ELSE IROFFC1 = MOD( IC-1, MBC ) MPC1 = NUMROC( M+IROFFC1, MBC, MYROW, ICROW1, NPROW ) IF( MYROW.EQ.ICROW1 ) $ MPC1 = MPC1 - IROFFC1 ICOFFC1 = MOD( JC-1, NBC ) NQC1 = NUMROC( K+ICOFFC1, NBC, MYCOL, ICCOL1, NPCOL ) IF( MYCOL.EQ.ICCOL1 ) $ NQC1 = NQC1 - ICOFFC1 CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = IROFFC1 MPC2 = MPC1 ICOFFC2 = MOD( JC+N-L-1, NBC ) NQC2 = NUMROC( L+ICOFFC2, NBC, MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NQCALL ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * IROFFC2 = ICOFFV is required by the current transposition * routine PBDTRAN * MQV0 = NUMROC( M+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN MQV = MQV0 - ICOFFV ELSE MQV = MQV0 END IF IF( MYROW.EQ.ICROW2 ) THEN MPC20 = MPC2 + IROFFC2 ELSE MPC20 = MPC2 END IF * * Locally V( IOFFV ) is K x MQV, C( IOFFC2 ) is MPC2 x NQC2 * WORK( IPV ) is MPC20 x K = [ . V( IOFFV ) ]' * WORK( IPW ) is K x MQV0 = [ . V( IOFFV ) ] * WORK( IPT ) is the workspace for PBDTRAN * IPV = 1 IPW = IPV + MPC20 * K IPT = IPW + K * MQV0 LV = MAX( 1, MPC20 ) LW = MAX( 1, K ) * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN CALL DLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW+ICOFFV*LW ), LW ) ELSE CALL DLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW ), LW ) END IF END IF * * WORK( IPV ) = WORK( IPW )' (replicated) is MPC20 x K * CALL PBDTRAN( ICTXT, 'Rowwise', 'Transpose', K, M+ICOFFV, $ DESCV( NB_ ), WORK( IPW ), LW, ZERO, $ WORK( IPV ), LV, IVROW, IVCOL, ICROW2, -1, $ WORK( IPT ) ) * * WORK( IPV ) = ( . V )' -> WORK( IPV ) = V' is MPC2 x K * IF( MYROW.EQ.ICROW2 ) $ IPV = IPV + IROFFC2 * * WORK( IPW ) becomes NQC2 x K = C( IOFFC2 )' * V' * WORK( IPW ) = C( IOFFC2 )' * V' (NQC2 x MPC2 x K) -> NQC2 x K * LW = MAX( 1, NQC2 ) * IF( MPC2.GT.0 ) THEN CALL DGEMM( 'Transpose', 'No transpose', NQC2, K, MPC2, $ ONE, C( IOFFC2 ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL DLASET( 'All', NQC2, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * * WORK( IPW ) = WORK( IPW ) + C1 ( NQC1 = NQC2 ) * IF( MPC1.GT.0 ) THEN MYDIST = MOD( MYROW-ICROW1+NPROW, NPROW ) ITOP = MAX( 0, MYDIST * MBC - IROFFC1 ) IIBEG = IIC1 IIEND = IIC1 + MPC1 - 1 IINXT = MIN( ICEIL( IIBEG, MBC ) * MBC, IIEND ) * 10 CONTINUE IF( IIBEG.LE.IINXT ) THEN CALL PBDMATADD( ICTXT, 'Transpose', NQC2, IINXT-IIBEG+1, $ ONE, C( IIBEG+(JJC1-1)*LDC ), LDC, ONE, $ WORK( IPW+ITOP ), LW ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBC - IROFFC1 IIBEG = IINXT +1 IINXT = MIN( IINXT+MBC, IIEND ) GO TO 10 END IF END IF * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN * * Broadcast the block reflector to the other columns. * CALL DTRBS2D( ICTXT, 'Rowwise', ' ', 'Lower', 'Non unit', $ K, K, T, MBV ) ELSE CALL DTRBR2D( ICTXT, 'Rowwise', ' ', 'Lower', 'Non unit', $ K, K, T, MBV, MYROW, IVCOL ) END IF CALL DTRMM( 'Right', 'Lower', TRANST, 'Non unit', NQC2, K, $ ONE, T, MBV, WORK( IPW ), LW ) * CALL DGEBS2D( ICTXT, 'Columnwise', ' ', NQC2, K, $ WORK( IPW ), LW ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', NQC2, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C1 = C1 - WORK( IPW ) * IF( MPC1.GT.0 ) THEN MYDIST = MOD( MYROW-ICROW1+NPROW, NPROW ) ITOP = MAX( 0, MYDIST * MBC - IROFFC1 ) IIBEG = IIC1 IIEND = IIC1 + MPC1 - 1 IINXT = MIN( ICEIL( IIBEG, MBC ) * MBC, IIEND ) * 20 CONTINUE IF( IIBEG.LE.IINXT ) THEN CALL PBDMATADD( ICTXT, 'Transpose', IINXT-IIBEG+1, NQC2, $ -ONE, WORK( IPW+ITOP ), LW, ONE, $ C( IIBEG+(JJC1-1)*LDC ), LDC ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBC - IROFFC1 IIBEG = IINXT +1 IINXT = MIN( IINXT+MBC, IIEND ) GO TO 20 END IF END IF * * C2 C2 - V' * W' * C( IOFFC2 ) = C( IOFFC2 ) - WORK( IPV ) * WORK( IPW )' * MPC2 x NQC2 MPC2 x K K x NQC2 * CALL DGEMM( 'No transpose', 'Transpose', MPC2, NQC2, K, -ONE, $ WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC2 ), LDC ) * ELSE * * Form sub( C ) * Q or sub( C ) * Q' * * Locally V( IOFFV ) is K x NQV, C( IOFFC2 ) is MPC2 x NQC2 * WORK( IPV ) is K x NQV = V( IOFFV ), NQV = NQC2 * WORK( IPW ) is MPC2 x K = C( IOFFC2 ) * V( IOFFV )' * IPV = 1 IPW = IPV + K * NQC2 LV = MAX( 1, K ) LW = MAX( 1, MPC2 ) * * Broadcast V to the other process rows. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) IF( MYROW.EQ.IVROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', COLBTOP, K, NQC2, $ V( IOFFV ), LDV ) IF( MYCOL.EQ.IVCOL ) $ CALL DTRBS2D( ICTXT, 'Columnwise', COLBTOP, 'Lower', $ 'Non unit', K, K, T, MBV ) CALL DLAMOV( 'All', K, NQC2, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', COLBTOP, K, NQC2, $ WORK( IPV ), LV, IVROW, MYCOL ) IF( MYCOL.EQ.IVCOL ) $ CALL DTRBR2D( ICTXT, 'Columnwise', COLBTOP, 'Lower', $ 'Non unit', K, K, T, MBV, IVROW, MYCOL ) END IF * * WORK( IPV ) is K x NQC2 = V = V( IOFFV ) * WORK( IPW ) = C( IOFFC2 ) * V' (MPC2 x NQC2 x K) -> MPC2 x K * IF( NQC2.GT.0 ) THEN CALL DGEMM( 'No Transpose', 'Transpose', MPC2, K, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL DLASET( 'All', MPC2, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * * WORK( IPW ) = WORK( IPW ) + C1 ( MPC1 = MPC2 ) * IF( NQC1.GT.0 ) THEN MYDIST = MOD( MYCOL-ICCOL1+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBC - ICOFFC1 ) JJBEG = JJC1 JJEND = JJC1 + NQC1 - 1 JJNXT = MIN( ICEIL( JJBEG, NBC ) * NBC, JJEND ) * 30 CONTINUE IF( JJBEG.LE.JJNXT ) THEN CALL PBDMATADD( ICTXT, 'No transpose', MPC2, $ JJNXT-JJBEG+1, ONE, $ C( IIC1+(JJBEG-1)*LDC ), LDC, ONE, $ WORK( IPW+ILEFT*LW ), LW ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBC - ICOFFC1 JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBC, JJEND ) GO TO 30 END IF END IF * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN CALL DTRMM( 'Right', 'Lower', TRANS, 'Non unit', MPC2, K, $ ONE, T, MBV, WORK( IPW ), LW ) CALL DGEBS2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C1 = C1 - WORK( IPW ) * IF( NQC1.GT.0 ) THEN MYDIST = MOD( MYCOL-ICCOL1+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBC - ICOFFC1 ) JJBEG = JJC1 JJEND = JJC1 + NQC1 - 1 JJNXT = MIN( ICEIL( JJBEG, NBC ) * NBC, JJEND ) * 40 CONTINUE IF( JJBEG.LE.JJNXT ) THEN CALL PBDMATADD( ICTXT, 'No transpose', MPC2, $ JJNXT-JJBEG+1, -ONE, $ WORK( IPW+ILEFT*LW ), LW, ONE, $ C( IIC1+(JJBEG-1)*LDC ), LDC ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBC - ICOFFC1 JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBC, JJEND ) GO TO 40 END IF END IF * * C2 C2 - W * V * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC2 x NQC2 MPC2 x K K x NQC2 * IF( IOFFC2.GT.0 ) $ CALL DGEMM( 'No transpose', 'No transpose', MPC2, NQC2, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC2 ), LDC ) * END IF * RETURN * * End of PDLARZB * END scalapack-2.0.2/SRC/pdlarzt.f000644 000766 000024 00000025717 10363532303 016201 0ustar00juliestaff000000 000000 SUBROUTINE PDLARZT( DIRECT, STOREV, N, K, V, IV, JV, DESCV, TAU, $ T, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIRECT, STOREV INTEGER IV, JV, K, N * .. * .. Array Arguments .. INTEGER DESCV( * ) DOUBLE PRECISION TAU( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PDLARZT forms the triangular factor T of a real block reflector * H of order > n, which is defined as a product of k elementary * reflectors as returned by PDTZRZF. * * If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; * * If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. * * If STOREV = 'C', the vector which defines the elementary reflector * H(i) is stored in the i-th column of the array V, and * * H = I - V * T * V' * * If STOREV = 'R', the vector which defines the elementary reflector * H(i) is stored in the i-th row of the array V, and * * H = I - V' * T * V * * Currently, only STOREV = 'R' and DIRECT = 'B' are supported. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIRECT (global input) CHARACTER * Specifies the order in which the elementary reflectors are * multiplied to form the block reflector: * = 'F': H = H(1) H(2) . . . H(k) (Forward, not supported yet) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Specifies how the vectors which define the elementary * reflectors are stored (see also Further Details): * = 'C': columnwise (not supported yet) * = 'R': rowwise * * N (global input) INTEGER * The number of meaningful entries of the block reflector H. * N >= 0. * * K (global input) INTEGER * The order of the triangular factor T (= the number of * elementary reflectors). 1 <= K <= MB_V (= NB_V). * * V (input/output) DOUBLE PRECISION pointer into the local memory * to an array of local dimension (LOCr(IV+K-1),LOCc(JV+N-1)). * The distributed matrix V contains the Householder vectors. * See further details. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * TAU (local input) DOUBLE PRECISION array, dimension LOCr(IV+K-1) * if INCV = M_V, and LOCc(JV+K-1) otherwise. This array * contains the Householder scalars related to the Householder * vectors. TAU is tied to the distributed matrix V. * * T (local output) DOUBLE PRECISION array, dimension (MB_V,MB_V) * It contains the k-by-k triangular factor of the block * reflector associated with V. T is lower triangular. * * WORK (local workspace) DOUBLE PRECISION array, * dimension (K*(K-1)/2) * * Further Details * =============== * * The shape of the matrix V and the storage of the vectors which define * the H(i) is best illustrated by the following example with n = 5 and * k = 3. The elements equal to 1 are not stored; the corresponding * array elements are modified but restored on exit. The rest of the * array is not used. * * DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': * * ______V_____ * ( v1 v2 v3 ) / \ * ( v1 v2 v3 ) ( v1 v1 v1 v1 v1 . . . . 1 ) * V = ( v1 v2 v3 ) ( v2 v2 v2 v2 v2 . . . 1 ) * ( v1 v2 v3 ) ( v3 v3 v3 v3 v3 . . 1 ) * ( v1 v2 v3 ) * . . . * . . . * 1 . . * 1 . * 1 * * DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': * * ______V_____ * 1 / \ * . 1 ( 1 . . . . v1 v1 v1 v1 v1 ) * . . 1 ( . 1 . . . v2 v2 v2 v2 v2 ) * . . . ( . . 1 . . v3 v3 v3 v3 v3 ) * . . . * ( v1 v2 v3 ) * ( v1 v2 v3 ) * V = ( v1 v2 v3 ) * ( v1 v2 v3 ) * ( v1 v2 v3 ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER ICOFF, ICTXT, II, IIV, INFO, IVCOL, IVROW, $ ITMP0, ITMP1, IW, JJV, LDV, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, DCOPY, DGEMV, $ DGSUM2D, DLASET, DTRMV, INFOG2L, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCV( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check for currently supported options * INFO = 0 IF( .NOT.LSAME( DIRECT, 'B' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( STOREV, 'R' ) ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDLARZT', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, $ IIV, JJV, IVROW, IVCOL ) * IF( MYROW.EQ.IVROW ) THEN IW = 1 ITMP0 = 0 LDV = DESCV( LLD_ ) ICOFF = MOD( JV-1, DESCV( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQ = NQ - ICOFF * DO 10 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = -tau( iv+i-1 ) * * V(iv+i:iv+k-1,jv:jv+n-1) * V(iv+i-1,jv:jv+n-1)' * ITMP0 = ITMP0 + 1 IF( NQ.GT.0 ) THEN CALL DGEMV( 'No transpose', ITMP0, NQ, -TAU( II ), $ V( II+1+(JJV-1)*LDV ), LDV, $ V( II+(JJV-1)*LDV ), LDV, ZERO, WORK( IW ), $ 1 ) ELSE CALL DLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF IW = IW + ITMP0 * 10 CONTINUE * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( MB_ ) * T( ITMP1-1 ) = TAU( IIV+K-1 ) * DO 20 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = T(i+1:k,i+1:k) * T(i+1:k,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( MB_ ) - 1 CALL DCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL DTRMV( 'Lower', 'No transpose', 'Non-unit', ITMP0, $ T( ITMP1+DESCV( MB_ ) ), DESCV( MB_ ), $ T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( II ) * 20 CONTINUE * END IF * END IF * RETURN * * End of PDLARZT * END scalapack-2.0.2/SRC/pdlascl.f000644 000766 000024 00000043046 11552067542 016147 0ustar00juliestaff000000 000000 SUBROUTINE PDLASCL( TYPE, CFROM, CTO, M, N, A, IA, JA, DESCA, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TYPE INTEGER IA, INFO, JA, M, N DOUBLE PRECISION CFROM, CTO * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLASCL multiplies the M-by-N real distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1) by the real scalar CTO/CFROM. This * is done without over/underflow as long as the final result * CTO * A(I,J) / CFROM does not over/underflow. TYPE specifies that * sub( A ) may be full, upper triangular, lower triangular or upper * Hessenberg. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TYPE (global input) CHARACTER * TYPE indices the storage type of the input distributed * matrix. * = 'G': sub( A ) is a full matrix, * = 'L': sub( A ) is a lower triangular matrix, * = 'U': sub( A ) is an upper triangular matrix, * = 'H': sub( A ) is an upper Hessenberg matrix. * * CFROM (global input) DOUBLE PRECISION * CTO (global input) DOUBLE PRECISION * The distributed matrix sub( A ) is multiplied by CTO/CFROM. * A(I,J) is computed without over/underflow if the final * result CTO * A(I,J) / CFROM can be represented without * over/underflow. CFROM must be nonzero. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * matrix sub( A ). On exit, this array contains the local * pieces of the distributed matrix multiplied by CTO/CFROM. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. LOGICAL DONE INTEGER IACOL, IAROW, ICOFFA, ICTXT, ICURCOL, ICURROW, $ IIA, II, INXTROW, IOFFA, IROFFA, ITYPE, J, JB, $ JJA, JJ, JN, KK, LDA, LL, MYCOL, MYROW, MP, $ NPCOL, NPROW, NQ DOUBLE PRECISION BIGNUM, CFROM1, CFROMC, CTO1, CTOC, MUL, SMLNUM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, INFOG2L, PXERBLA * .. * .. External Functions .. LOGICAL LSAME, DISNAN INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL DISNAN, ICEIL, LSAME, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * IF( NPROW.EQ.-1 ) THEN INFO = -907 ELSE INFO = 0 CALL CHK1MAT( M, 4, N, 6, IA, JA, DESCA, 9, INFO ) IF( INFO.EQ.0 ) THEN IF( LSAME( TYPE, 'G' ) ) THEN ITYPE = 0 ELSE IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 ELSE ITYPE = -1 END IF IF( ITYPE.EQ.-1 ) THEN INFO = -1 ELSE IF( CFROM.EQ.ZERO .OR. DISNAN(CFROM) ) THEN INFO = -4 ELSE IF( DISNAN(CTO) ) THEN INFO = -5 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDLASCL', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) $ RETURN * * Get machine parameters * SMLNUM = PDLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM * CFROMC = CFROM CTOC = CTO * * Compute local indexes * LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * 10 CONTINUE CFROM1 = CFROMC*SMLNUM IF( CFROM1.EQ.CFROMC ) THEN ! CFROMC is an inf. Multiply by a correctly signed zero for ! finite CTOC, or a NaN if CTOC is infinite. MUL = CTOC / CFROMC DONE = .TRUE. CTO1 = CTOC ELSE CTO1 = CTOC / BIGNUM IF( CTO1.EQ.CTOC ) THEN ! CTOC is either 0 or an inf. In both cases, CTOC itself ! serves as the correct multiplication factor. MUL = CTOC DONE = .TRUE. CFROMC = ONE ELSE IF( ABS( CFROM1 ).GT.ABS( CTOC ) .AND. CTOC.NE.ZERO ) THEN MUL = SMLNUM DONE = .FALSE. CFROMC = CFROM1 ELSE IF( ABS( CTO1 ).GT.ABS( CFROMC ) ) THEN MUL = BIGNUM DONE = .FALSE. CTOC = CTO1 ELSE MUL = CTOC / CFROMC DONE = .TRUE. END IF END IF * IOFFA = ( JJA - 1 ) * LDA ICURROW = IAROW ICURCOL = IACOL * IF( ITYPE.EQ.0 ) THEN * * Full matrix * DO 30 JJ = JJA, JJA+NQ-1 DO 20 II = IIA, IIA+MP-1 A( IOFFA+II ) = A( IOFFA+II ) * MUL 20 CONTINUE IOFFA = IOFFA + LDA 30 CONTINUE * ELSE IF( ITYPE.EQ.1 ) THEN * * Lower triangular matrix * II = IIA JJ = JJA JB = JN-JA+1 * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 50 LL = JJ, JJ + JB -1 DO 40 KK = II+LL-JJ, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 40 CONTINUE IOFFA = IOFFA + LDA 50 CONTINUE ELSE DO 70 LL = JJ, JJ + JB -1 DO 60 KK = II, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 60 CONTINUE IOFFA = IOFFA + LDA 70 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 120 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 90 LL = JJ, JJ + JB -1 DO 80 KK = II+LL-JJ, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE ELSE DO 110 LL = JJ, JJ + JB -1 DO 100 KK = II, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 120 CONTINUE * ELSE IF( ITYPE.EQ.2 ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JB = JN-JA+1 * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 140 LL = JJ, JJ + JB -1 DO 130 KK = IIA, MIN(II+LL-JJ,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 130 CONTINUE IOFFA = IOFFA + LDA 140 CONTINUE ELSE DO 160 LL = JJ, JJ + JB -1 DO 150 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 150 CONTINUE IOFFA = IOFFA + LDA 160 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 210 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 180 LL = JJ, JJ + JB -1 DO 170 KK = IIA, MIN(II+LL-JJ,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 170 CONTINUE IOFFA = IOFFA + LDA 180 CONTINUE ELSE DO 200 LL = JJ, JJ + JB -1 DO 190 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 190 CONTINUE IOFFA = IOFFA + LDA 200 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 210 CONTINUE * ELSE IF( ITYPE.EQ.3 ) THEN * * Upper Hessenberg matrix * II = IIA JJ = JJA JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.ICURCOL ) THEN DO 230 LL = JJ, JJ+JB-1 DO 220 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 220 CONTINUE IOFFA = IOFFA + LDA 230 CONTINUE JJ = JJ + JB END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 260 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 250 LL = JJ, JJ+JB-1 DO 240 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 240 CONTINUE IOFFA = IOFFA + LDA 250 CONTINUE JJ = JJ + JB END IF * II = II + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) * 260 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( ICURROW+1, NPROW ) IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 280 LL = JJ, JJ + JB -1 DO 270 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 270 CONTINUE IOFFA = IOFFA + LDA 280 CONTINUE ELSE DO 300 LL = JJ, JJ + JB -1 DO 290 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 290 CONTINUE IOFFA = IOFFA + LDA 300 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+MP-1 ) $ A( II+(JJ+JB-2)*LDA ) = A( II+(JJ+JB-2)*LDA ) * MUL END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = INXTROW ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 350 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 320 LL = JJ, JJ + JB -1 DO 310 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 310 CONTINUE IOFFA = IOFFA + LDA 320 CONTINUE ELSE DO 340 LL = JJ, JJ + JB -1 DO 330 KK = IIA, MIN( II-1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 330 CONTINUE IOFFA = IOFFA + LDA 340 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+MP-1 ) $ A( II+(JJ+JB-2)*LDA ) = A( II+(JJ+JB-2)*LDA ) * $ MUL END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = INXTROW ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 350 CONTINUE * END IF * END IF * IF( .NOT.DONE ) $ GO TO 10 * RETURN * * End of PDLASCL * END scalapack-2.0.2/SRC/pdlase2.f000644 000766 000024 00000037416 10363532303 016052 0ustar00juliestaff000000 000000 SUBROUTINE PDLASE2( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLASE2 initializes an M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1) to BETA on the diagonal and ALPHA on the * offdiagonals. PDLASE2 requires that only dimension of the matrix * operand is distributed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * set: * = 'U': Upper triangular part is set; the strictly lower * triangular part of sub( A ) is not changed; * = 'L': Lower triangular part is set; the strictly upper * triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * ALPHA (global input) DOUBLE PRECISION * The constant to which the offdiagonal elements are to be * set. * * BETA (global input) DOUBLE PRECISION * The constant to which the diagonal elements are to be set. * * A (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ) * to be set. On exit, the leading M-by-N submatrix sub( A ) * is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER HEIGHT, IACOL, IAROW, IBASE, ICOFFA, II, IIA, $ IIBEG, IIEND, IINXT, ILEFT, IRIGHT, IROFFA, $ ITOP, JJ, JJA, JJBEG, JJEND, JJNXT, LDA, MBA, $ MP, MPA, MYCOL, MYDIST, MYROW, NBA, NPCOL, $ NPROW, NQ, NQA, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DLASET, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, MBA ) ICOFFA = MOD( JA-1, NBA ) * IF( N.LE.( NBA-ICOFFA ) ) THEN * * It is assumed that the local columns JJA:JJA+N-1 of the matrix * A are in the same process column (IACOL). * * N * JJA JJA+N-1 * / --------------------- \ * IROFFA| | | | * \ |...................| | ( IAROW ) * IIA |x | | MB_A * | x | | * |--x----------------| / * | x | * | x | ITOP * | x | | * | x | /-------\ * |-------x-----------| |-------x-----------| * | x | | x | * | x | | x | * | x | | x | * | x | | x | * |------------x------| |------------x------| * | x | \____________/ * | x | | * | x | IBASE * | x | * |-----------------x-| Local picture * | x| * | | * | | * | | * |-------------------| * | | * . . * . . * . (IACOL) . * IF( MYCOL.EQ.IACOL ) THEN * MPA = NUMROC( M+IROFFA, MBA, MYROW, IAROW, NPROW ) IF( MPA.LE.0 ) $ RETURN IF( MYROW.EQ.IAROW ) $ MPA = MPA - IROFFA MYDIST = MOD( MYROW-IAROW+NPROW, NPROW ) ITOP = MYDIST * MBA - IROFFA * IF( LSAME( UPLO, 'U' ) ) THEN * ITOP = MAX( 0, ITOP ) IIBEG = IIA IIEND = IIA + MPA - 1 IINXT = MIN( ICEIL( IIBEG, MBA ) * MBA, IIEND ) * 10 CONTINUE IF( ( N-ITOP ).GT.0 ) THEN CALL DLASET( UPLO, IINXT-IIBEG+1, N-ITOP, ALPHA, BETA, $ A( IIBEG+(JJA+ITOP-1)*LDA ), LDA ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IIBEG = IINXT +1 IINXT = MIN( IINXT+MBA, IIEND ) GO TO 10 END IF * ELSE IF( LSAME( UPLO, 'L' ) ) THEN * II = IIA JJ = JJA MP = MPA IBASE = MIN( ITOP+MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 20 CONTINUE IF( JJ.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL DLASET( 'All', MP, ITOP-JJ+JJA, ALPHA, ALPHA, $ A( II+(JJ-1)*LDA ), LDA ) CALL DLASET( UPLO, MP, HEIGHT, ALPHA, BETA, $ A( II+(JJA+ITOP-1)*LDA ), LDA ) MP = MAX( 0, MP - HEIGHT ) II = II + HEIGHT JJ = JJA + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 20 END IF * ELSE * II = IIA JJ = JJA MP = MPA IBASE = MIN( ITOP+MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 30 CONTINUE IF( JJ.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL DLASET( 'All', MPA, ITOP-JJ+JJA, ALPHA, ALPHA, $ A( IIA+(JJ-1)*LDA ), LDA ) CALL DLASET( 'All', MPA-MP, HEIGHT, ALPHA, ALPHA, $ A( IIA+(JJA+ITOP-1)*LDA ), LDA ) CALL DLASET( 'All', MP, HEIGHT, ALPHA, BETA, $ A( II+(JJA+ITOP-1)*LDA ), LDA ) MP = MAX( 0, MP - HEIGHT ) II = II + HEIGHT JJ = JJA + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 30 END IF * END IF * END IF * ELSE IF( M.LE.( MBA-IROFFA ) ) THEN * * It is assumed that the local rows IIA:IIA+M-1 of the matrix A * are in the same process row (IAROW). * * ICOFFA * / \JJA * IIA ------------------ .... -------- * | .x | | | / | | \ * | . x | | | ILEFT| | | | * | . x | | | | | | * | . x | | \ x | | * | . |x | | |x | | IRIGHT * | . | x | | | x | | * (IAROW) | . | x | | | x | | * | . | x| | | x| | * | . | x | | x / * | . | |x | | | * | . | | x | | | * | . | | x | | | * | . | | x| | | * IIA+M-1 ------------------ .... ------- * NB_A * (IACOL) Local picture * IF( MYROW.EQ.IAROW ) THEN * NQA = NUMROC( N+ICOFFA, NBA, MYCOL, IACOL, NPCOL ) IF( NQA.LE.0 ) $ RETURN IF( MYCOL.EQ.IACOL ) $ NQA = NQA - ICOFFA MYDIST = MOD( MYCOL-IACOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBA - ICOFFA * IF( LSAME( UPLO, 'L' ) ) THEN * ILEFT = MAX( 0, ILEFT ) JJBEG = JJA JJEND = JJA + NQA - 1 JJNXT = MIN( ICEIL( JJBEG, NBA ) * NBA, JJEND ) * 40 CONTINUE IF( ( M-ILEFT ).GT.0 ) THEN CALL DLASET( UPLO, M-ILEFT, JJNXT-JJBEG+1, ALPHA, $ BETA, A( IIA+ILEFT+(JJBEG-1)*LDA ), LDA ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBA, JJEND ) GO TO 40 END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * II = IIA JJ = JJA NQ = NQA IRIGHT = MIN( ILEFT+NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 50 CONTINUE IF( II.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL DLASET( 'All', ILEFT-II+IIA, NQ, ALPHA, ALPHA, $ A( II+(JJ-1)*LDA ), LDA ) CALL DLASET( UPLO, WIDE, NQ, ALPHA, BETA, $ A( IIA+ILEFT+(JJ-1)*LDA ), LDA ) NQ = MAX( 0, NQ - WIDE ) II = IIA + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 50 END IF * ELSE * II = IIA JJ = JJA NQ = NQA IRIGHT = MIN( ILEFT+NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 60 CONTINUE IF( II.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL DLASET( 'All', ILEFT-II+IIA, NQA, ALPHA, ALPHA, $ A( II+(JJA-1)*LDA ), LDA ) CALL DLASET( 'All', WIDE, NQA-NQ, ALPHA, ALPHA, $ A( IIA+ILEFT+(JJA-1)*LDA ), LDA ) CALL DLASET( 'All', WIDE, NQ, ALPHA, BETA, $ A( IIA+ILEFT+(JJ-1)*LDA ), LDA ) NQ = MAX( 0, NQ - WIDE ) II = IIA + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 60 END IF * END IF * END IF * END IF * RETURN * * End of PDLASE2 * END scalapack-2.0.2/SRC/pdlaset.f000644 000766 000024 00000021733 10363532303 016147 0ustar00juliestaff000000 000000 SUBROUTINE PDLASET( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLASET initializes an M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1) to BETA on the diagonal and ALPHA on the * offdiagonals. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * set: * = 'U': Upper triangular part is set; the strictly lower * triangular part of sub( A ) is not changed; * = 'L': Lower triangular part is set; the strictly upper * triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * ALPHA (global input) DOUBLE PRECISION * The constant to which the offdiagonal elements are to be * set. * * BETA (global input) DOUBLE PRECISION * The constant to which the diagonal elements are to be set. * * A (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ) * to be set. On exit, the leading M-by-N submatrix sub( A ) * is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, IAA, IBLK, IN, ITMP, J, JAA, JBLK, JN, JTMP * .. * .. External Subroutines .. EXTERNAL PDLASE2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( M.LE.( DESCA( MB_ ) - MOD( IA-1, DESCA( MB_ ) ) ) .OR. $ N.LE.( DESCA( NB_ ) - MOD( JA-1, DESCA( NB_ ) ) ) ) THEN CALL PDLASE2( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) ELSE * IF( LSAME( UPLO, 'U' ) ) THEN IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) CALL PDLASE2( UPLO, IN-IA+1, N, ALPHA, BETA, A, IA, JA, $ DESCA ) DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) JAA = JA + ITMP CALL PDLASE2( UPLO, IBLK, N-ITMP, ALPHA, BETA, $ A, I, JAA, DESCA ) 10 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) CALL PDLASE2( UPLO, M, JN-JA+1, ALPHA, BETA, A, IA, JA, $ DESCA ) DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) IAA = IA + JTMP CALL PDLASE2( UPLO, M-JTMP, JBLK, ALPHA, BETA, A, IAA, $ J, DESCA ) 20 CONTINUE ELSE IF( M.LE.N ) THEN IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), $ IA+M-1 ) CALL PDLASE2( UPLO, IN-IA+1, N, ALPHA, BETA, A, IA, $ JA, DESCA ) DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) CALL PDLASE2( UPLO, IBLK, I-IA, ALPHA, ALPHA, A, I, $ JA, DESCA ) CALL PDLASE2( UPLO, IBLK, N-I+IA, ALPHA, BETA, A, I, $ JA+I-IA, DESCA ) 30 CONTINUE ELSE JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), $ JA+N-1 ) CALL PDLASE2( UPLO, M, JN-JA+1, ALPHA, BETA, A, IA, $ JA, DESCA ) DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) CALL PDLASE2( UPLO, J-JA, JBLK, ALPHA, ALPHA, A, IA, $ J, DESCA ) CALL PDLASE2( UPLO, M-J+JA, JBLK, ALPHA, BETA, A, $ IA+J-JA, J, DESCA ) 40 CONTINUE END IF END IF * END IF * RETURN * * End of PDLASET * END scalapack-2.0.2/SRC/pdlasmsub.f000644 000766 000024 00000031403 11642700517 016505 0ustar00juliestaff000000 000000 SUBROUTINE PDLASMSUB( A, DESCA, I, L, K, SMLNUM, BUF, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER I, K, L, LWORK DOUBLE PRECISION SMLNUM * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), BUF( * ) * .. * * Purpose * ======= * * PDLASMSUB looks for a small subdiagonal element from the bottom * of the matrix that it can safely set to zero. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) DOUBLE PRECISION array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * K (global output) INTEGER * On exit, this yields the bottom portion of the unreduced * submatrix. This will satisfy: L <= M <= I-1. * * SMLNUM (global input) DOUBLE PRECISION * On entry, a "small number" for the given matrix. * Unchanged on exit. * * BUF (local output) DOUBLE PRECISION array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 2*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * This code is basically a parallelization of the following snip * of LAPACK code from DLAHQR: * * Look for a single small subdiagonal element. * * DO 20 K = I, L + 1, -1 * TST1 = ABS( H( K-1, K-1 ) ) + ABS( H( K, K ) ) * IF( TST1.EQ.ZERO ) * $ TST1 = DLANHS( '1', I-L+1, H( L, L ), LDH, WORK ) * IF( ABS( H( K, K-1 ) ).LE.MAX( ULP*TST1, SMLNUM ) ) * $ GO TO 30 * 20 CONTINUE * 30 CONTINUE * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IAFIRST, IBUF1, IBUF2, $ ICOL1, ICOL2, II, III, IRCV1, IRCV2, IROW1, $ IROW2, ISRC, ISTR1, ISTR2, ITMP1, ITMP2, $ JAFIRST, JJ, JJJ, JSRC, LDA, LEFT, MODKM1, $ MYCOL, MYROW, NPCOL, NPROW, NUM, RIGHT, UP DOUBLE PRECISION H10, H11, H22, TST1, ULP * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGERV2D, DGESD2D, IGAMX2D, $ INFOG1L, INFOG2L * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 STARTS AT BUF(ISTR1+1) AND WILL CONTAINS IBUF1 ELEMENTS * BUFFER2 STARTS AT BUF(ISTR2+1) AND WILL CONTAINS IBUF2 ELEMENTS * ISTR1 = 0 ISTR2 = ( ( I-L ) / HBL ) IF( ISTR2*HBL.LT.( I-L ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.2*ISTR2 ) THEN * * Error! * RETURN END IF CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * COPY OUR RELEVANT PIECES OF TRIADIAGONAL THAT WE OWE INTO * 2 BUFFERS TO SEND TO WHOMEVER OWNS H(K,K) AS K MOVES DIAGONALLY * UP THE TRIDIAGONAL * IBUF1 = 0 IBUF2 = 0 IRCV1 = 0 IRCV2 = 0 DO 10 K = I, L + 1, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K-1,K-1) AND SEND IT DIAGONAL DOWN * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K ,K-1) AND SEND IT RIGHT * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( K, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * ADD UP THE RECEIVES * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT. $ 1 ) ) ) THEN * * WE MUST RECEIVE H(K-1,K-1) FROM DIAGONAL UP * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) ) THEN * * WE MUST RECEIVE H(K ,K-1) FROM LEFT * IRCV2 = IRCV2 + 1 END IF END IF * * POSSIBLY CHANGE OWNERS (OCCURS ONLY WHEN MOD(K-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * SEND DATA ON TO THE APPROPRIATE NODE IF THERE IS ANY DATA TO SEND * IF( IBUF1.GT.0 ) THEN CALL DGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL DGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, MYROW, $ RIGHT ) END IF * * RECEIVE APPROPRIATE DATA IF THERE IS ANY * IF( IRCV1.GT.0 ) THEN CALL DGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL DGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, MYROW, $ LEFT ) END IF * * START MAIN LOOP * IBUF1 = 0 IBUF2 = 0 CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * LOOK FOR A SINGLE SMALL SUBDIAGONAL ELEMENT. * * Start loop for subdiagonal search * DO 40 K = I, L + 1, -1 IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN * * Grab information from WORK array * IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H11 = BUF( ISTR1+IBUF1 ) ELSE H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF2 = IBUF2 + 1 H10 = BUF( ISTR2+IBUF2 ) ELSE H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF ELSE * * Information is local * H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF H22 = A( ( ICOL1-1 )*LDA+IROW1 ) TST1 = ABS( H11 ) + ABS( H22 ) IF( TST1.EQ.ZERO ) THEN * * FIND SOME NORM OF THE LOCAL H(L:I,L:I) * CALL INFOG1L( L, HBL, NPROW, MYROW, IAFIRST, ITMP1, III ) IROW2 = NUMROC( I, HBL, MYROW, IAFIRST, NPROW ) CALL INFOG1L( L, HBL, NPCOL, MYCOL, JAFIRST, ITMP2, III ) ICOL2 = NUMROC( I, HBL, MYCOL, JAFIRST, NPCOL ) DO 30 III = ITMP1, IROW2 DO 20 JJJ = ITMP2, ICOL2 TST1 = TST1 + ABS( A( ( JJJ-1 )*LDA+III ) ) 20 CONTINUE 30 CONTINUE END IF IF( ABS( H10 ).LE.MAX( ULP*TST1, SMLNUM ) ) $ GO TO 50 IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 IF( ( MODKM1.EQ.HBL-1 ) .AND. ( K.GT.2 ) ) THEN II = MOD( II+NPROW-1, NPROW ) JJ = MOD( JJ+NPCOL-1, NPCOL ) CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ITMP1, ITMP2 ) END IF 40 CONTINUE 50 CONTINUE CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, K, 1, ITMP1, ITMP2, -1, $ -1, -1 ) RETURN * * End of PDLASMSUB * END scalapack-2.0.2/SRC/pdlasrt.f000644 000766 000024 00000020653 11750130340 016160 0ustar00juliestaff000000 000000 SUBROUTINE PDLASRT( ID, N, D, Q, IQ, JQ, DESCQ, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER ID INTEGER INFO, IQ, JQ, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCQ( * ), IWORK( * ) DOUBLE PRECISION D( * ), Q( * ), WORK( * ) * .. * * Purpose * ======= * * PDLASRT Sort the numbers in D in increasing order and the * corresponding vectors in Q. * * Arguments * ========= * * ID (global input) CHARACTER*1 * = 'I': sort D in increasing order; * = 'D': sort D in decreasing order. (NOT IMPLEMENTED YET) * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( Q ). N >= 0. * * D (global input/output) DOUBLE PRECISION array, dimmension (N) * On exit, the number in D are sorted in increasing order. * * Q (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_Q, LOCc(JQ+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IQ (global input) INTEGER * The row index in the global array A indicating the first * row of sub( Q ). * * JQ (global input) INTEGER * The column index in the global array A indicating the * first column of sub( Q ). * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK = MAX( N, NP * ( NB + NQ )) * where * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * NQ = NUMROC( N, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK = N + 2*NB + 2*NPCOL * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CL, COL, DUMMY, I, ICTXT, IID, IIQ, INDCOL, $ INDX, INDXC, INDXG, IPQ, IPQ2, IPW, IPWORK, J, $ JJQ, K, L, LDQ, LEND, LIWMIN, LWMIN, MYCOL, $ MYROW, NB, ND, NP, NPCOL, NPROW, NQ, PSQ, QCOL, $ QTOT, SBUF * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PXERBLA, DCOPY, $ DGERV2D, DGESD2D, DLAMOV, DLAPST * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * IF( N.EQ.0 ) $ RETURN * CALL BLACS_GRIDINFO( DESCQ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 1, N, 1, IQ, JQ, DESCQ, 6, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCQ( NB_ ) LDQ = DESCQ( LLD_ ) NP = NUMROC( N, NB, MYROW, DESCQ( RSRC_ ), NPROW ) NQ = NUMROC( N, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) LWMIN = MAX( N, NP*( NB+NQ ) ) LIWMIN = N + 2*( NB+NPCOL ) IF( .NOT.LSAME( ID, 'I' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -9 ELSE IF( LIWORK.LT.LIWMIN ) THEN INFO = -11 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDLASRT', -INFO ) RETURN END IF * * Set Pointers * INDXC = 1 INDX = INDXC + N INDXG = INDX INDCOL = INDXG + NB QTOT = INDCOL + NB PSQ = QTOT + NPCOL * IID = 1 IPQ2 = 1 IPW = IPQ2 + NP*NQ * DUMMY = 0 IIQ = INDXG2L( IQ, NB, DUMMY, DUMMY, NPROW ) * * Sort the eigenvalues in D * CALL DLAPST( 'I', N, D, IWORK( INDX ), INFO ) * DO 10 L = 0, N - 1 WORK( IID+L ) = D( IWORK( INDX+L ) ) IWORK( INDXC-1+IWORK( INDX+L ) ) = IID + L 10 CONTINUE CALL DCOPY( N, WORK, 1, D, 1 ) * ND = 0 20 CONTINUE IF( ND.LT.N ) THEN LEND = MIN( NB, N-ND ) J = JQ + ND QCOL = INDXG2P( J, NB, DUMMY, DESCQ( CSRC_ ), NPCOL ) K = 0 DO 30 L = 0, LEND - 1 I = JQ - 1 + IWORK( INDXC+ND+L ) CL = INDXG2P( I, NB, DUMMY, DESCQ( CSRC_ ), NPCOL ) IWORK( INDCOL+L ) = CL IF( MYCOL.EQ.CL ) THEN IWORK( INDXG+K ) = IWORK( INDXC+ND+L ) K = K + 1 END IF 30 CONTINUE * IF( MYCOL.EQ.QCOL ) THEN DO 40 CL = 0, NPCOL - 1 IWORK( QTOT+CL ) = 0 40 CONTINUE DO 50 L = 0, LEND - 1 IWORK( QTOT+IWORK( INDCOL+L ) ) = IWORK( QTOT+ $ IWORK( INDCOL+L ) ) + 1 50 CONTINUE IWORK( PSQ ) = 1 DO 60 CL = 1, NPCOL - 1 IWORK( PSQ+CL ) = IWORK( PSQ+CL-1 ) + IWORK( QTOT+CL-1 ) 60 CONTINUE DO 70 L = 0, LEND - 1 CL = IWORK( INDCOL+L ) I = JQ + ND + L JJQ = INDXG2L( I, NB, DUMMY, DUMMY, NPCOL ) IPQ = IIQ + ( JJQ-1 )*LDQ IPWORK = IPW + ( IWORK( PSQ+CL )-1 )*NP CALL DCOPY( NP, Q( IPQ ), 1, WORK( IPWORK ), 1 ) IWORK( PSQ+CL ) = IWORK( PSQ+CL ) + 1 70 CONTINUE IWORK( PSQ ) = 1 DO 80 CL = 1, NPCOL - 1 IWORK( PSQ+CL ) = IWORK( PSQ+CL-1 ) + IWORK( QTOT+CL-1 ) 80 CONTINUE DO 90 L = 0, K - 1 I = IWORK( INDXG+L ) JJQ = INDXG2L( I, NB, DUMMY, DUMMY, NPCOL ) IPQ = IPQ2 + ( JJQ-1 )*NP IPWORK = IPW + ( IWORK( PSQ+MYCOL )-1 )*NP CALL DCOPY( NP, WORK( IPWORK ), 1, WORK( IPQ ), 1 ) IWORK( PSQ+MYCOL ) = IWORK( PSQ+MYCOL ) + 1 90 CONTINUE DO 100 CL = 1, NPCOL - 1 COL = MOD( MYCOL+CL, NPCOL ) SBUF = IWORK( QTOT+COL ) IF( SBUF.NE.0 ) THEN IPWORK = IPW + ( IWORK( PSQ+COL )-1 )*NP CALL DGESD2D( DESCQ( CTXT_ ), NP, SBUF, $ WORK( IPWORK ), NP, MYROW, COL ) END IF 100 CONTINUE * ELSE * IF( K.NE.0 ) THEN CALL DGERV2D( DESCQ( CTXT_ ), NP, K, WORK( IPW ), NP, $ MYROW, QCOL ) DO 110 L = 0, K - 1 I = JQ - 1 + IWORK( INDXG+L ) JJQ = INDXG2L( I, NB, DUMMY, DUMMY, NPCOL ) IPQ = 1 + ( JJQ-1 )*NP IPWORK = IPW + L*NP CALL DCOPY( NP, WORK( IPWORK ), 1, WORK( IPQ ), 1 ) 110 CONTINUE END IF END IF ND = ND + NB GO TO 20 END IF CALL DLAMOV( 'Full', NP, NQ, WORK, NP, Q( IIQ ), LDQ ) * * End of PDLASRT * END scalapack-2.0.2/SRC/pdlassq.f000644 000766 000024 00000022443 10363532303 016161 0ustar00juliestaff000000 000000 SUBROUTINE PDLASSQ( N, X, IX, JX, DESCX, INCX, SCALE, SUMSQ ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N DOUBLE PRECISION SCALE, SUMSQ * .. * .. Array Arguments .. INTEGER DESCX( * ) DOUBLE PRECISION X( * ) * .. * * Purpose * ======= * * PDLASSQ returns the values scl and smsq such that * * ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, * * where x( i ) = sub( X ) = X( IX+(JX-1)*DESCX(M_)+(i-1)*INCX ). * The value of sumsq is assumed to be non-negative and scl returns the * value * * scl = max( scale, abs( x( i ) ) ). * * scale and sumsq must be supplied in SCALE and SUMSQ respectively. * SCALE and SUMSQ are overwritten by scl and ssq respectively. * * The routine makes only one pass through the vector sub( X ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * The result are only available in the scope of sub( X ), i.e if * sub( X ) is distributed along a process row, the correct results are * only available in this process row of the grid. Similarly if sub( X ) * is distributed along a process column, the correct results are only * available in this process column of the grid. * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vector sub( X ). * * X (input) DOUBLE PRECISION * The vector for which a scaled sum of squares is computed. * x( i ) = X(IX+(JX-1)*M_X +(i-1)*INCX ), 1 <= i <= n. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * SCALE (local input/local output) DOUBLE PRECISION * On entry, the value scale in the equation above. * On exit, SCALE is overwritten with scl , the scaling factor * for the sum of squares. * * SUMSQ (local input/local output) DOUBLE PRECISION * On entry, the value sumsq in the equation above. * On exit, SUMSQ is overwritten with smsq , the basic sum of * squares from which scl has been factored out. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, ICOFF, ICTXT, IIX, IOFF, IROFF, IXCOL, $ IXROW, JJX, LDX, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ DOUBLE PRECISION TEMP1 * .. * .. Local Arrays .. DOUBLE PRECISION WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOMBSSQ, INFOG2L, PDTREECOMB * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) * LDX = DESCX( LLD_ ) IF( INCX.EQ.DESCX( M_ ) ) THEN * * X is rowwise distributed. * IF( MYROW.NE.IXROW ) $ RETURN ICOFF = MOD( JX, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ - ICOFF * * Code direct from LAPACK's DLASSQ, (save subroutine call) * IF( NQ.GT.0 ) THEN IOFF = IIX + ( JJX - 1 ) * LDX DO 10 I = 1, NQ IF( X( IOFF ).NE.ZERO ) THEN TEMP1 = ABS( X( IOFF ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ * ( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IOFF = IOFF + LDX 10 CONTINUE END IF * * Take local result and find global * WORK( 1 ) = SCALE WORK( 2 ) = SUMSQ * CALL PDTREECOMB( ICTXT, 'Rowwise', 2, WORK, -1, IXCOL, $ DCOMBSSQ ) * SCALE = WORK( 1 ) SUMSQ = WORK( 2 ) * ELSE IF( INCX.EQ.1 ) THEN * * X is columnwise distributed. * IF( MYCOL.NE.IXCOL ) $ RETURN IROFF = MOD( IX, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFF * * Code direct from LAPACK's DLASSQ, (save subroutine call) * IF( NP.GT.0 ) THEN IOFF = IIX + ( JJX - 1 ) * LDX DO 20 I = 1, NP IF( X( IOFF ).NE.ZERO ) THEN TEMP1 = ABS( X( IOFF ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ*( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IOFF = IOFF + 1 20 CONTINUE END IF * * Take local result and find global * WORK( 1 ) = SCALE WORK( 2 ) = SUMSQ * CALL PDTREECOMB( ICTXT, 'Columnwise', 2, WORK, -1, IXCOL, $ DCOMBSSQ ) * SCALE = WORK( 1 ) SUMSQ = WORK( 2 ) * END IF * RETURN * * End of PDLASSQ * END scalapack-2.0.2/SRC/pdlaswp.f000644 000766 000024 00000020350 10363532303 016157 0ustar00juliestaff000000 000000 SUBROUTINE PDLASWP( DIREC, ROWCOL, N, A, IA, JA, DESCA, K1, K2, $ IPIV ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIREC, ROWCOL INTEGER IA, JA, K1, K2, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) DOUBLE PRECISION A( * ) * .. * * Purpose: * ======== * * PDLASWP performs a series of row or column interchanges on * the distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1). One * interchange is initiated for each of rows or columns K1 trough K2 of * sub( A ). This routine assumes that the pivoting information has * already been broadcast along the process row or column. * Also note that this routine will only work for K1-K2 being in the * same MB (or NB) block. If you want to pivot a full matrix, use * PDLAPIV. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIREC (global input) CHARACTER * Specifies in which order the permutation is applied: * = 'F' (Forward) * = 'B' (Backward) * * ROWCOL (global input) CHARACTER * Specifies if the rows or columns are permuted: * = 'R' (Rows) * = 'C' (Columns) * * N (global input) INTEGER * If ROWCOL = 'R', the length of the rows of the distributed * matrix A(*,JA:JA+N-1) to be permuted; * If ROWCOL = 'C', the length of the columns of the distributed * matrix A(IA:IA+N-1,*) to be permuted. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, * ). * On entry, this array contains the local pieces of the distri- * buted matrix to which the row/columns interchanges will be * applied. On exit the permuted distributed matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * K1 (global input) INTEGER * The first element of IPIV for which a row or column inter- * change will be done. * * K2 (global input) INTEGER * The last element of IPIV for which a row or column inter- * change will be done. * * IPIV (local input) INTEGER array, dimension LOCr(M_A)+MB_A for * row pivoting and LOCc(N_A)+NB_A for column pivoting. This * array is tied to the matrix A, IPIV(K) = L implies rows * (or columns) K and L are to be interchanged. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICURCOL, ICURROW, IIA, IP, J, JJA, JP, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PDSWAP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( ROWCOL, 'R' ) ) THEN IF( LSAME( DIREC, 'F' ) ) THEN CALL INFOG2L( K1, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 10 I = K1, K2 IP = IPIV( IIA+I-K1 ) IF( IP.NE.I ) $ CALL PDSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP, $ JA, DESCA, DESCA( M_ ) ) 10 CONTINUE ELSE CALL INFOG2L( K2, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 20 I = K2, K1, -1 IP = IPIV( IIA+I-K1 ) IF( IP.NE.I ) $ CALL PDSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP, $ JA, DESCA, DESCA( M_ ) ) 20 CONTINUE END IF ELSE IF( LSAME( DIREC, 'F' ) ) THEN CALL INFOG2L( IA, K1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 30 J = K1, K2 JP = IPIV( JJA+J-K1 ) IF( JP.NE.J ) $ CALL PDSWAP( N, A, IA, J, DESCA, 1, A, IA, JP, $ DESCA, 1 ) 30 CONTINUE ELSE CALL INFOG2L( IA, K2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 40 J = K2, K1, -1 JP = IPIV( JJA+J-K1 ) IF( JP.NE.J ) $ CALL PDSWAP( N, A, IA, J, DESCA, 1, A, IA, JP, $ DESCA, 1 ) 40 CONTINUE END IF END IF * RETURN * * End PDLASWP * END scalapack-2.0.2/SRC/pdlatra.f000644 000766 000024 00000015456 10363532303 016147 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PDLATRA( N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLATRA computes the trace of an N-by-N distributed matrix sub( A ) * denoting A( IA:IA+N-1, JA:JA+N-1 ). The result is left on every * process of the grid. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix the trace * is to be computed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ==================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER ICURCOL, ICURROW, II, IOFFA, J, JB, JJ, JN, $ LDA, LL, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION TRACE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGSUM2D, INFOG2L * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * TRACE = ZERO IF( N.EQ.0 ) THEN PDLATRA = TRACE RETURN END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ ICURROW, ICURCOL ) * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 LDA = DESCA( LLD_ ) IOFFA = II + ( JJ - 1 ) * LDA * * Handle first diagonal block separately * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 10 LL = IOFFA, IOFFA + (JB-1)*(LDA+1), LDA+1 TRACE = TRACE + A( LL ) 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JB IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JB*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over the remaining block of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 20 LL = IOFFA, IOFFA + (JB-1)*(LDA+1), LDA+1 TRACE = TRACE + A( LL ) 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JB IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JB*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE * CALL DGSUM2D( DESCA( CTXT_ ), 'All', ' ', 1, 1, TRACE, 1, -1, $ MYCOL ) * PDLATRA = TRACE * RETURN * * End of PDLATRA * END scalapack-2.0.2/SRC/pdlatrd.f000644 000766 000024 00000041620 10363532303 016142 0ustar00juliestaff000000 000000 SUBROUTINE PDLATRD( UPLO, N, NB, A, IA, JA, DESCA, D, E, TAU, W, $ IW, JW, DESCW, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IW, JA, JW, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCW( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAU( * ), W( * ), $ WORK( * ) * .. * * Purpose * ======= * * PDLATRD reduces NB rows and columns of a real symmetric distributed * matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) to symmetric tridiagonal * form by an orthogonal similarity transformation Q' * sub( A ) * Q, * and returns the matrices V and W which are needed to apply the * transformation to the unreduced part of sub( A ). * * If UPLO = 'U', PDLATRD reduces the last NB rows and columns of a * matrix, of which the upper triangle is supplied; * if UPLO = 'L', PDLATRD reduces the first NB rows and columns of a * matrix, of which the lower triangle is supplied. * * This is an auxiliary routine called by PDSYTRD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NB (global input) INTEGER * The number of rows and columns to be reduced. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * symmetric distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. * On exit, if UPLO = 'U', the last NB columns have been reduced * to tridiagonal form, with the diagonal elements overwriting * the diagonal elements of sub( A ); the elements above the * diagonal with the array TAU, represent the orthogonal matrix * Q as a product of elementary reflectors. If UPLO = 'L', the * first NB columns have been reduced to tridiagonal form, with * the diagonal elements overwriting the diagonal elements of * sub( A ); the elements below the diagonal with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors; See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * W (local output) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_W,NB_W), This array contains * the local pieces of the N-by-NB_W matrix W required to * update the unreduced part of sub( A ). * * IW (global input) INTEGER * The row index in the global array W indicating the first * row of sub( W ). * * JW (global input) INTEGER * The column index in the global array W indicating the * first column of sub( W ). * * DESCW (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix W. * * WORK (local workspace) DOUBLE PRECISION array, dimension (NB_A) * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n) H(n-1) . . . H(n-nb+1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(nb). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The elements of the vectors v together form the N-by-NB matrix V * which is needed, with W, to apply the transformation to the unreduced * part of the matrix, using a symmetric rank-2k update of the form: * sub( A ) := sub( A ) - V*W' - W*V'. * * The contents of A on exit are illustrated by the following examples * with n = 5 and nb = 2: * * if UPLO = 'U': if UPLO = 'L': * * ( a a a v4 v5 ) ( d ) * ( a a v4 v5 ) ( 1 d ) * ( a 1 v5 ) ( v1 1 a ) * ( d 1 ) ( v1 v2 a a ) * ( d ) ( v1 v2 a a a ) * * where d denotes a diagonal element of the reduced matrix, a denotes * an element of the original matrix that is unchanged, and vi denotes * an element of the vector defining H(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION HALF, ONE, ZERO PARAMETER ( HALF = 0.5D+0, ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, J, JJ, JP, JWK, K, $ KW, MYCOL, MYROW, NPCOL, NPROW, NQ DOUBLE PRECISION ALPHA * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCWK( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, DGEBR2D, DGEBS2D, $ INFOG2L, PDAXPY, PDDOT, PDELGET, $ PDELSET, PDGEMV, PDLARFG, PDSCAL, $ PDSYMV * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NQ = MAX( 1, NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) CALL DESCSET( DESCD, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * IF( LSAME( UPLO, 'U' ) ) THEN * CALL INFOG2L( N+IA-NB, N+JA-NB, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, IAROW, IACOL ) CALL DESCSET( DESCWK, 1, DESCW( NB_ ), 1, DESCW( NB_ ), IAROW, $ IACOL, ICTXT, 1 ) CALL DESCSET( DESCE, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * * Reduce last NB columns of upper triangle * DO 10 J = JA+N-1, JA+N-NB, -1 I = IA + J - JA K = J - JA + 1 KW = MOD( K-1, DESCA( MB_ ) ) + 1 * * Update A(IA:I,I) * CALL PDGEMV( 'No transpose', K, N-K, -ONE, A, IA, J+1, $ DESCA, W, IW+K-1, JW+KW, DESCW, DESCW( M_ ), $ ONE, A, IA, J, DESCA, 1 ) CALL PDGEMV( 'No transpose', K, N-K, -ONE, W, IW, JW+KW, $ DESCW, A, I, J+1, DESCA, DESCA( M_ ), ONE, A, $ IA, J, DESCA, 1 ) IF( N-K.GT.0 ) $ CALL PDELSET( A, I, J+1, DESCA, E( JP ) ) * * Generate elementary reflector H(i) to annihilate * A(IA:I-2,I) * JP = MIN( JJ+KW-1, NQ ) CALL PDLARFG( K-1, E( JP ), I-1, J, A, IA, J, DESCA, 1, $ TAU ) CALL PDELSET( A, I-1, J, DESCA, ONE ) * * Compute W(IW:IW+K-2,JW+KW-1) * CALL PDSYMV( 'Upper', K-1, ONE, A, IA, JA, DESCA, A, IA, J, $ DESCA, 1, ZERO, W, IW, JW+KW-1, DESCW, 1 ) * JWK = MOD( K-1, DESCWK( NB_ ) ) + 2 CALL PDGEMV( 'Transpose', K-1, N-K, ONE, W, IW, JW+KW, $ DESCW, A, IA, J, DESCA, 1, ZERO, WORK, 1, JWK, $ DESCWK, DESCWK( M_ ) ) CALL PDGEMV( 'No transpose', K-1, N-K, -ONE, A, IA, J+1, $ DESCA, WORK, 1, JWK, DESCWK, DESCWK( M_ ), ONE, $ W, IW, JW+KW-1, DESCW, 1 ) CALL PDGEMV( 'Transpose', K-1, N-K, ONE, A, IA, J+1, DESCA, $ A, IA, J, DESCA, 1, ZERO, WORK, 1, JWK, DESCWK, $ DESCWK( M_ ) ) CALL PDGEMV( 'No transpose', K-1, N-K, -ONE, W, IW, JW+KW, $ DESCW, WORK, 1, JWK, DESCWK, DESCWK( M_ ), ONE, $ W, IW, JW+KW-1, DESCW, 1 ) CALL PDSCAL( K-1, TAU( JP ), W, IW, JW+KW-1, DESCW, 1 ) * CALL PDDOT( K-1, ALPHA, W, IW, JW+KW-1, DESCW, 1, A, IA, J, $ DESCA, 1 ) IF( MYCOL.EQ.IACOL ) $ ALPHA = -HALF*TAU( JP )*ALPHA CALL PDAXPY( K-1, ALPHA, A, IA, J, DESCA, 1, W, IW, JW+KW-1, $ DESCW, 1 ) IF( MYCOL.EQ.IACOL ) THEN CALL PDELGET( 'E', ' ', D( JP ), A, I, J, DESCA ) END IF * 10 CONTINUE * ELSE * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL DESCSET( DESCWK, 1, DESCW( NB_ ), 1, DESCW( NB_ ), IAROW, $ IACOL, ICTXT, 1 ) CALL DESCSET( DESCE, 1, JA+N-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * * Reduce first NB columns of lower triangle * DO 20 J = JA, JA+NB-1 I = IA + J - JA K = J - JA + 1 * * Update A(J:JA+N-1,J) * CALL PDGEMV( 'No transpose', N-K+1, K-1, -ONE, A, I, JA, $ DESCA, W, IW+K-1, JW, DESCW, DESCW( M_ ), ONE, $ A, I, J, DESCA, 1 ) CALL PDGEMV( 'No transpose', N-K+1, K-1, -ONE, W, IW+K-1, $ JW, DESCW, A, I, JA, DESCA, DESCA( M_ ), ONE, $ A, I, J, DESCA, 1 ) IF( K.GT.1 ) $ CALL PDELSET( A, I, J-1, DESCA, E( JP ) ) * * * Generate elementary reflector H(i) to annihilate * A(I+2:IA+N-1,I) * JP = MIN( JJ+K-1, NQ ) CALL PDLARFG( N-K, E( JP ), I+1, J, A, I+2, J, DESCA, 1, $ TAU ) CALL PDELSET( A, I+1, J, DESCA, ONE ) * * Compute W(IW+K:IW+N-1,JW+K-1) * CALL PDSYMV( 'Lower', N-K, ONE, A, I+1, J+1, DESCA, A, I+1, $ J, DESCA, 1, ZERO, W, IW+K, JW+K-1, DESCW, 1 ) * CALL PDGEMV( 'Transpose', N-K, K-1, ONE, W, IW+K, JW, DESCW, $ A, I+1, J, DESCA, 1, ZERO, WORK, 1, 1, DESCWK, $ DESCWK( M_ ) ) CALL PDGEMV( 'No transpose', N-K, K-1, -ONE, A, I+1, JA, $ DESCA, WORK, 1, 1, DESCWK, DESCWK( M_ ), ONE, W, $ IW+K, JW+K-1, DESCW, 1 ) CALL PDGEMV( 'Transpose', N-K, K-1, ONE, A, I+1, JA, DESCA, $ A, I+1, J, DESCA, 1, ZERO, WORK, 1, 1, DESCWK, $ DESCWK( M_ ) ) CALL PDGEMV( 'No transpose', N-K, K-1, -ONE, W, IW+K, JW, $ DESCW, WORK, 1, 1, DESCWK, DESCWK( M_ ), ONE, W, $ IW+K, JW+K-1, DESCW, 1 ) CALL PDSCAL( N-K, TAU( JP ), W, IW+K, JW+K-1, DESCW, 1 ) CALL PDDOT( N-K, ALPHA, W, IW+K, JW+K-1, DESCW, 1, A, I+1, $ J, DESCA, 1 ) IF( MYCOL.EQ.IACOL ) $ ALPHA = -HALF*TAU( JP )*ALPHA CALL PDAXPY( N-K, ALPHA, A, I+1, J, DESCA, 1, W, IW+K, $ JW+K-1, DESCW, 1 ) IF( MYCOL.EQ.IACOL ) THEN CALL PDELGET( 'E', ' ', D( JP ), A, I, J, DESCA ) END IF * 20 CONTINUE * END IF * * Broadcast columnwise the diagonal elements into D. * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, NB, D( JJ ), 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, NB, D( JJ ), 1, $ IAROW, MYCOL ) END IF END IF * RETURN * * End of PDLATRD * END scalapack-2.0.2/SRC/pdlatrs.f000644 000766 000024 00000005314 10363532303 016161 0ustar00juliestaff000000 000000 SUBROUTINE PDLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, IA, $ JA, DESCA, X, IX, JX, DESCX, SCALE, CNORM, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, NORMIN, TRANS, UPLO INTEGER IA, IX, JA, JX, N DOUBLE PRECISION SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION A( * ), CNORM( * ), $ X( * ), WORK( * ) * .. * * Purpose * ======= * * PDLATRS solves a triangular system. This routine in unfinished * at this time, but will be part of the next release. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * * .. Local Scalars .. INTEGER ICTXT, IIX, IROFF, JJX, MYCOL, MYROW, NP, $ NPCOL, NPROW, LDX, IXCOL, IXROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, INFOG2L, $ PDTRSV * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * ***** NO SCALING ***** Call PDTRSV for all cases ***** * SCALE = ONE CALL PDTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, $ DESCX, 1 ) * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) LDX = DESCX( LLD_ ) IROFF = MOD( IX-1, DESCX(MB_) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IXCOL ) THEN CALL DGEBS2D( ICTXT, 'R', ' ', NP, 1, X( IIX+(JJX-1)*LDX ), $ LDX ) ELSE CALL DGEBR2D( ICTXT, 'R', ' ', NP, 1, X( IIX+(JJX-1)*LDX ), $ LDX, MYROW, IXCOL ) END IF * RETURN * * End of PDLATRS * END scalapack-2.0.2/SRC/pdlatrz.f000644 000766 000024 00000021623 10363532303 016171 0ustar00juliestaff000000 000000 SUBROUTINE PDLATRZ( M, N, L, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, L, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDLATRZ reduces the M-by-N ( M<=N ) real upper trapezoidal matrix * sub( A ) = [ A(IA:IA+M-1,JA:JA+M-1) A(IA:IA+M-1,JA+N-L:JA+N-1) ] to * upper triangular form by means of orthogonal transformations. * * The upper trapezoidal matrix sub( A ) is factored as * * sub( A ) = ( R 0 ) * Z, * * where Z is an N-by-N orthogonal matrix and R is an M-by-M upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. L > 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the leading M-by-M * upper triangular part of sub( A ) contains the upper trian- * gular matrix R, and elements N-L+1 to N of the first M rows * of sub( A ), with the array TAU, represent the orthogonal * matrix Z as a product of M elementary reflectors. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Further Details * =============== * * The factorization is obtained by Householder's method. The kth * transformation matrix, Z( k ), which is used to introduce zeros into * the (m - k + 1)th row of sub( A ), is given in the form * * Z( k ) = ( I 0 ), * ( 0 T( k ) ) * * where * * T( k ) = I - tau*u( k )*u( k )', u( k ) = ( 1 ), * ( 0 ) * ( z( k ) ) * * tau is a scalar and z( k ) is an ( n - m ) element vector. * tau and z( k ) are chosen to annihilate the elements of the kth row * of sub( A ). * * The scalar tau is returned in the kth element of TAU and the vector * u( k ) in the kth row of sub( A ), such that the elements of z( k ) * are in a( k, m + 1 ), ..., a( k, n ). The elements of R are returned * in the upper triangular part of sub( A ). * * Z is given by * * Z = Z( 1 ) * Z( 2 ) * ... * Z( m ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IAROW, ICTXT, II, J, J1, MP, MYCOL, MYROW, $ NPCOL, NPROW DOUBLE PRECISION AII * .. * .. External Subroutines .. EXTERNAL INFOG1L, PDELSET, PDLARFG, PDLARZ * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * MP = NUMROC( IA+M-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) * IF( M.EQ.N ) THEN * CALL INFOG1L( IA, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ II, IAROW ) DO 10 I = II, MP TAU( I ) = ZERO 10 CONTINUE * ELSE * J1 = JA + N - L DO 20 I = IA+M-1, IA, -1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * [ A(i, j) A(i,j1:ja+n-1) ] * CALL PDLARFG( L+1, AII, I, J, A, I, J1, DESCA, DESCA( M_ ), $ TAU ) * * Apply H(i) to A(ia:i-1,j:ja+n-1) from the right * CALL PDLARZ( 'Right', I-IA, JA+N-J, L, A, I, J1, DESCA, $ DESCA( M_ ), TAU, A, IA, J, DESCA, WORK ) CALL PDELSET( A, I, J, DESCA, AII ) * 20 CONTINUE * END IF * RETURN * * End of PDLATRZ * END scalapack-2.0.2/SRC/pdlauu2.f000644 000766 000024 00000017561 10363532303 016073 0ustar00juliestaff000000 000000 SUBROUTINE PDLAUU2( UPLO, N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLAUU2 computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in sub( A ). * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in sub( A ). * * This is the unblocked form of the algorithm, calling Level 2 BLAS. * No communication is performed by this routine, the matrix to operate * on should be strictly local to one process. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the triangular factor stored in the matrix * sub( A ) is upper or lower triangular: * = 'U': Upper triangular, * = 'L': Lower triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the order of the triangular factor U or L. N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor L or U. * On exit, if UPLO = 'U', the upper triangle of the distributed * matrix sub( A ) is overwritten with the upper triangle of the * product U * U'; if UPLO = 'L', the lower triangle of sub( A ) * is overwritten with the lower triangle of the product L' * L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICURR, IDIAG, IIA, IOFFA, JJA, $ LDA, MYCOL, MYROW, NA, NPCOL, NPROW DOUBLE PRECISION AII * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, DGEMV, DSCAL * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DDOT EXTERNAL DDOT, LSAME * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Get grid parameters and compute local indexes * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * IF( LSAME( UPLO, 'U' ) ) THEN * * Compute the product U * U'. * DO 10 NA = N-1, 1, -1 AII = A( IDIAG ) ICURR = IDIAG + LDA A( IDIAG ) = AII*AII + DDOT( NA, A( ICURR ), LDA, $ A( ICURR ), LDA ) CALL DGEMV( 'No transpose', N-NA-1, NA, ONE, $ A( IOFFA+LDA ), LDA, A( ICURR ), LDA, AII, $ A( IOFFA ), 1 ) IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + LDA 10 CONTINUE AII = A( IDIAG ) CALL DSCAL( N, AII, A( IOFFA ), 1 ) * ELSE * * Compute the product L' * L. * DO 20 NA = 1, N-1 AII = A( IDIAG ) ICURR = IDIAG + 1 A(IDIAG) = AII*AII + DDOT( N-NA, A( ICURR ), 1, $ A( ICURR ), 1 ) CALL DGEMV( 'Transpose', N-NA, NA-1, ONE, A( IOFFA+1 ), $ LDA, A( ICURR ), 1, AII, A( IOFFA ), LDA ) IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + 1 20 CONTINUE AII = A( IDIAG ) CALL DSCAL( N, AII, A( IOFFA ), LDA ) * END IF * END IF * RETURN * * End of PDLAUU2 * END scalapack-2.0.2/SRC/pdlauum.f000644 000766 000024 00000020650 10363532303 016157 0ustar00juliestaff000000 000000 SUBROUTINE PDLAUUM( UPLO, N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLAUUM computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in sub( A ). * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in sub( A ). * * This is the blocked form of the algorithm, calling Level 3 PBLAS. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the triangular factor stored in the * distributed matrix sub( A ) is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the triangular factor U or L. N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor L or U. * On exit, if UPLO = 'U', the upper triangle of the distributed * matrix sub( A ) is overwritten with the upper triangle of the * product U * U'; if UPLO = 'L', the lower triangle of sub( A ) * is overwritten with the lower triangle of the product L' * L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J, JB, JN * .. * .. External Subroutines .. EXTERNAL PDGEMM, PDLAUU2, PDTRMM, PDSYRK * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) IF( LSAME( UPLO, 'U' ) ) THEN * * Compute the product U * U'. * * Handle first block separately * JB = JN-JA+1 CALL PDLAUU2( 'Upper', JB, A, IA, JA, DESCA ) IF( JB.LE.N-1 ) THEN CALL PDSYRK( 'Upper', 'No transpose', JB, N-JB, ONE, A, IA, $ JA+JB, DESCA, ONE, A, IA, JA, DESCA ) END IF * * Loop over remaining block of columns * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA CALL PDTRMM( 'Right', 'Upper', 'Transpose', 'Non-unit', $ J-JA, JB, ONE, A, I, J, DESCA, A, IA, J, $ DESCA ) CALL PDLAUU2( 'Upper', JB, A, I, J, DESCA ) IF( J+JB.LE.JA+N-1 ) THEN CALL PDGEMM( 'No transpose', 'Transpose', J-JA, JB, $ N-J-JB+JA, ONE, A, IA, J+JB, DESCA, A, I, $ J+JB, DESCA, ONE, A, IA, J, DESCA ) CALL PDSYRK( 'Upper', 'No transpose', JB, N-J-JB+JA, ONE, $ A, I, J+JB, DESCA, ONE, A, I, J, DESCA ) END IF 10 CONTINUE ELSE * * Compute the product L' * L. * * Handle first block separately * JB = JN-JA+1 CALL PDLAUU2( 'Lower', JB, A, IA, JA, DESCA ) IF( JB.LE.N-1 ) THEN CALL PDSYRK( 'Lower', 'Transpose', JB, N-JB, ONE, A, IA+JB, $ JA, DESCA, ONE, A, IA, JA, DESCA ) END IF * * Loop over remaining block of columns * DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA CALL PDTRMM( 'Left', 'Lower', 'Transpose', 'Non-unit', JB, $ J-JA, ONE, A, I, J, DESCA, A, I, JA, DESCA ) CALL PDLAUU2( 'Lower', JB, A, I, J, DESCA ) IF( J+JB.LE.JA+N-1 ) THEN CALL PDGEMM( 'Transpose', 'No transpose', JB, J-JA, $ N-J-JB+JA, ONE, A, I+JB, J, DESCA, A, I+JB, $ JA, DESCA, ONE, A, I, JA, DESCA ) CALL PDSYRK( 'Lower', 'Transpose', JB, N-J-JB+JA, ONE, $ A, I+JB, J, DESCA, ONE, A, I, J, DESCA ) END IF 20 CONTINUE END IF * RETURN * * End of PDLAUUM * END scalapack-2.0.2/SRC/pdlawil.f000644 000766 000024 00000023551 10363532303 016147 0ustar00juliestaff000000 000000 SUBROUTINE PDLAWIL( II, JJ, M, A, DESCA, H44, H33, H43H34, V ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER II, JJ, M DOUBLE PRECISION H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), V( * ) * .. * * Purpose * ======= * * PDLAWIL gets the transform given by H44,H33, & H43H34 into V * starting at row M. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * II (global input) INTEGER * Row owner of H(M+2,M+2) * * JJ (global input) INTEGER * Column owner of H(M+2,M+2) * * M (global input) INTEGER * On entry, this is where the transform starts (row M.) * Unchanged on exit. * * A (global input) DOUBLE PRECISION array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * Unchanged on exit. * * H44 * H33 * H43H34 (global input) DOUBLE PRECISION * These three values are for the double shift QR iteration. * Unchanged on exit. * * V (global output) DOUBLE PRECISION array of size 3. * Contains the transform on ouput. * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, ICOL, IROW, JSRC, LDA, LEFT, $ MODKM1, MYCOL, MYROW, NPCOL, NPROW, NUM, RIGHT, $ RSRC, UP DOUBLE PRECISION H11, H12, H21, H22, H33S, H44S, S, V1, V2, V3 * .. * .. Local Arrays .. DOUBLE PRECISION BUF( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGERV2D, DGESD2D, INFOG2L * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * On node (II,JJ) collect all DIA,SUP,SUB info from M, M+1 * MODKM1 = MOD( M+1, HBL ) IF( MODKM1.EQ.0 ) THEN IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) CALL DGESD2D( CONTXT, 1, 1, BUF, 1, II, JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) BUF( 2 ) = A( ( ICOL-1 )*LDA+IROW+1 ) BUF( 3 ) = A( ICOL*LDA+IROW ) BUF( 4 ) = A( ICOL*LDA+IROW+1 ) CALL DGESD2D( CONTXT, 4, 1, BUF, 4, II, JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NPCOL.GT.1 ) THEN CALL DGERV2D( CONTXT, 1, 1, V3, 1, MYROW, LEFT ) ELSE V3 = A( ( ICOL-2 )*LDA+IROW ) END IF IF( NUM.GT.1 ) THEN CALL DGERV2D( CONTXT, 4, 1, BUF, 4, UP, LEFT ) H11 = BUF( 1 ) H21 = BUF( 2 ) H12 = BUF( 3 ) H22 = BUF( 4 ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) END IF END IF END IF IF( MODKM1.EQ.1 ) THEN IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) CALL DGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. ( NPROW.GT.1 ) ) $ THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL DGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL DGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NUM.GT.1 ) THEN CALL DGERV2D( CONTXT, 1, 1, H11, 1, UP, LEFT ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) END IF IF( NPROW.GT.1 ) THEN CALL DGERV2D( CONTXT, 1, 1, H12, 1, UP, MYCOL ) ELSE H12 = A( ( ICOL-2 )*LDA+IROW-2 ) END IF IF( NPCOL.GT.1 ) THEN CALL DGERV2D( CONTXT, 1, 1, H21, 1, MYROW, LEFT ) ELSE H21 = A( ( ICOL-3 )*LDA+IROW-1 ) END IF H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF END IF IF( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) $ RETURN * IF( MODKM1.GT.1 ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF * H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 * RETURN * * End of PDLAWIL * END scalapack-2.0.2/SRC/pdorg2l.f000644 000766 000024 00000025630 10363532303 016064 0ustar00juliestaff000000 000000 SUBROUTINE PDORG2L( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORG2L generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the last N columns of a product of K elementary reflectors of order M * * Q = H(k) . . . H(2) H(1) * * as returned by PDGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA+N-K <= j <= JA+N-1, * as returned by PDGEQLF in the K columns of its distributed * matrix argument A(IA:*,JA+N-K:JA+N-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PDGEQLF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MpA0 + MAX( 1, NqA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, J, JJ, LWMIN, MPA0, MYCOL, $ MYROW, NPCOL, NPROW, NQA0 DOUBLE PRECISION TAUJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PDELSET, $ PDLARF, PDLASET, PDSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MPA0 + MAX( 1, NQA0 ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORG2L', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Initialise columns ja:ja+n-k-1 to columns of the unit matrix * CALL PDLASET( 'All', M-N, N-K, ZERO, ZERO, A, IA, JA, DESCA ) CALL PDLASET( 'All', N, N-K, ZERO, ONE, A, IA+M-N, JA, DESCA ) * TAUJ = ZERO NQA0 = MAX( 1, NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) ) DO 10 J = JA+N-K, JA+N-1 * * Apply H(j) to A(ia:ia+m-n+j-ja,ja:j) from the left * CALL PDELSET( A, IA+M-N+J-JA, J, DESCA, ONE ) CALL PDLARF( 'Left', M-N+J-JA+1, J-JA, A, IA, J, DESCA, 1, TAU, $ A, IA, JA, DESCA, WORK ) * JJ = INDXG2L( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) IACOL = INDXG2P( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ TAUJ = TAU( MIN( JJ, NQA0 ) ) CALL PDSCAL( M-N+J-JA, -TAUJ, A, IA, J, DESCA, 1 ) CALL PDELSET( A, IA+M-N+J-JA, J, DESCA, ONE-TAUJ ) * * Set A(ia+m-n+j-ja+1:ia+m-1,j) to zero * CALL PDLASET( 'All', JA+N-1-J, 1, ZERO, ZERO, A, IA+M-N+J-JA+1, $ J, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORG2L * END scalapack-2.0.2/SRC/pdorg2r.f000644 000766 000024 00000025677 10363532303 016105 0ustar00juliestaff000000 000000 SUBROUTINE PDORG2R( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORG2R generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the first N columns of a product of K elementary reflectors of order * M * * Q = H(1) H(2) . . . H(k) * * as returned by PDGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA <= j <= JA+K-1, as * returned by PDGEQRF in the K columns of its array * argument A(IA:*,JA:JA+K-1). On exit, this array contains * the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PDGEQRF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MpA0 + MAX( 1, NqA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, J, JJ, KQ, LWMIN, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 DOUBLE PRECISION TAUJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PDELSET, $ PDLARF, PDLASET, PDSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MPA0 + MAX( 1, NQA0 ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORG2R', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Initialise columns ja+k:ja+n-1 to columns of the unit matrix * CALL PDLASET( 'All', K, N-K, ZERO, ZERO, A, IA, JA+K, DESCA ) CALL PDLASET( 'All', M-K, N-K, ZERO, ONE, A, IA+K, JA+K, DESCA ) * TAUJ = ZERO KQ = MAX( 1, NUMROC( JA+K-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) DO 10 J = JA+K-1, JA, -1 * * Apply H(j) to A(ia+j-ja:ia+m-1,j:ja+n-1) from the left * IF( J.LT.JA+N-1 ) THEN CALL PDELSET( A, IA+J-JA, J, DESCA, ONE ) CALL PDLARF( 'Left', M-J+JA, JA+N-J-1, A, IA+J-JA, J, DESCA, $ 1, TAU, A, IA+J-JA, J+1, DESCA, WORK ) END IF * JJ = INDXG2L( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) IACOL = INDXG2P( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ TAUJ = TAU( MIN( JJ, KQ ) ) IF( J-JA.LT.M-1 ) $ CALL PDSCAL( M-J+JA-1, -TAUJ, A, IA+J-JA+1, J, DESCA, 1 ) CALL PDELSET( A, IA+J-JA, J, DESCA, ONE-TAUJ ) * * Set A(ia:ia+j-ja-1,j) to zero * CALL PDLASET( 'All', J-JA, 1, ZERO, ZERO, A, IA, J, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORG2R * END scalapack-2.0.2/SRC/pdorgl2.f000644 000766 000024 00000026056 10363532303 016067 0ustar00juliestaff000000 000000 SUBROUTINE PDORGL2( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORGL2 generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as * the first M rows of a product of K elementary reflectors of order N * * Q = H(k) . . . H(2) H(1) * * as returned by PDGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA <= i <= IA+K-1, as * returned by PDGELQF in the K rows of its distributed matrix * argument A(IA:IA+K-1,JA:*). On exit, this array contains the * local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCr(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDGELQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NqA0 + MAX( 1, MpA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, II, J, KP, LWMIN, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 DOUBLE PRECISION TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PDELSET, $ PDLARF, PDLASET, PDSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQA0 + MAX( 1, MPA0 ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORGL2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * IF( K.LT.M ) THEN * * Initialise rows ia+k:ia+m-1 to rows of the unit matrix * CALL PDLASET( 'All', M-K, K, ZERO, ZERO, A, IA+K, JA, DESCA ) CALL PDLASET( 'All', M-K, N-K, ZERO, ONE, A, IA+K, JA+K, $ DESCA ) * END IF * TAUI = ZERO KP = NUMROC( IA+K-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) * DO 10 I = IA+K-1, IA, -1 * * Apply H(i) to A(i:ia+m-1,ja+i-ia:ja+n-1) from the right * J = JA + I - IA II = INDXG2L( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IAROW = INDXG2P( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IF( MYROW.EQ.IAROW ) $ TAUI = TAU( MIN( II, KP ) ) IF( J.LT.JA+N-1 ) THEN IF( I.LT.IA+M-1 ) THEN CALL PDELSET( A, I, J, DESCA, ONE ) CALL PDLARF( 'Right', M-I+IA-1, N-J+JA, A, I, J, DESCA, $ DESCA( M_ ), TAU, A, I+1, J, DESCA, WORK ) END IF CALL PDSCAL( N-J+JA-1, -TAUI, A, I, J+1, DESCA, $ DESCA( M_ ) ) END IF CALL PDELSET( A, I, J, DESCA, ONE-TAUI ) * * Set A(i,ja:j-1) to zero * CALL PDLASET( 'All', 1, J-JA, ZERO, ZERO, A, I, JA, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORGL2 * END scalapack-2.0.2/SRC/pdorglq.f000644 000766 000024 00000030665 10363532303 016167 0ustar00juliestaff000000 000000 SUBROUTINE PDORGLQ( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORGLQ generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as * the first M rows of a product of K elementary reflectors of order N * * Q = H(k) . . . H(2) H(1) * * as returned by PDGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA <= i <= IA+K-1, as * returned by PDGELQF in the K rows of its distributed matrix * argument A(IA:IA+K-1,JA:*). On exit, this array contains the * local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCr(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDGELQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( MpA0 + NqA0 + MB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IL, IN, IPW, $ J, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PDLARFB, $ PDLARFT, PDLASET, PDORGL2, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORGLQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IL = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * CALL PDLASET( 'All', IA+M-IL, IL-IA, ZERO, ZERO, A, IL, JA, $ DESCA ) * * Use unblocked code for the last or only block. * CALL PDORGL2( IA+M-IL, N-IL+IA, IA+K-IL, A, IL, JA+IL-IA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Is there at least one block of rows to loop over ? * IF( IL.GT.IN+1 ) THEN * * Use blocked code * DO 10 I = IL-DESCA( MB_ ), IN+1, -DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) J = JA + I - IA * IF( I+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PDLARFT( 'Forward', 'Rowwise', N-I+IA, IB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(i+ib:ia+m-1,j:ja+n-1) from the right * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', $ M-I-IB+IA, N-I+IA, IB, A, I, J, DESCA, $ WORK, A, I+IB, J, DESCA, WORK( IPW ) ) END IF * * Apply H' to columns j:ja+n-1 of current block * CALL PDORGL2( IB, N-I+IA, IB, A, I, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set columns ia:i-1 of current block to zero * CALL PDLASET( 'All', IB, I-IA, ZERO, ZERO, A, I, JA, DESCA ) 10 CONTINUE * END IF * * Handle first block separately * IF( IL.GT.IA ) THEN * IB = IN - IA + 1 * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PDLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia+ib:ia+m-1,ja:ja+n-1) from the right * CALL PDLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', M-IB, $ N, IB, A, IA, JA, DESCA, WORK, A, IA+IB, JA, $ DESCA, WORK( IPW ) ) * * Apply H' to columns ja:ja+n-1 of current block * CALL PDORGL2( IB, N, IB, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORGLQ * END scalapack-2.0.2/SRC/pdorgql.f000644 000766 000024 00000027010 10363532303 016155 0ustar00juliestaff000000 000000 SUBROUTINE PDORGQL( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORGQL generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the last N columns of a product of K elementary reflectors of order M * * Q = H(k) . . . H(2) H(1) * * as returned by PDGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA+N-K <= j <= JA+N-1, * as returned by PDGEQLF in the K columns of its distributed * matrix argument A(IA:*,JA+N-K:JA+N-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PDGEQLF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( NqA0 + MpA0 + NB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IINFO, IPW, J, JB, JN, $ LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PDLARFB, $ PDLARFT, PDLASET, PDORG2L, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORGQL', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * IPW = DESCA( NB_ )*DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Set A(ia+m-n+jn-ja+1:ia-m+1,ja:jn) to zero. * CALL PDLASET( 'All', N-JN+JA-1, JN-JA+1, ZERO, ZERO, A, $ IA+M-N+JN-JA+1, JA, DESCA ) * * Use unblocked code for the first or only block. * CALL PDORG2L( M-N+JN-JA+1, JN-JA+1, JN-JA-N+K+1, A, IA, JA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Use blocked code * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PDLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:ia+m-n+j+jb-ja-1,ja:j-1) from the left * CALL PDLARFB( 'Left', 'No transpose', 'Backward', $ 'Columnwise', M-N+J+JB-JA, J-JA, JB, A, IA, $ J, DESCA, WORK, A, IA, JA, DESCA, WORK( IPW ) ) * * Apply H to rows ia:m-k+i+ib-1 of current block * CALL PDORG2L( M-N+J+JB-JA, JB, JB, A, IA, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows ia+m-n+j+jb-ja:ia+m-1,j:j+jb-1 of current block to * zero * CALL PDLASET( 'All', N-J-JB+JA, JB, ZERO, ZERO, A, $ IA+M-N+J+JB-JA, J, DESCA ) * 10 CONTINUE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORGQL * END scalapack-2.0.2/SRC/pdorgqr.f000644 000766 000024 00000030741 10363532303 016170 0ustar00juliestaff000000 000000 SUBROUTINE PDORGQR( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORGQR generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the first N columns of a product of K elementary reflectors of order * M * * Q = H(1) H(2) . . . H(k) * * as returned by PDGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA <= j <= JA+K-1, as * returned by PDGEQRF in the K columns of its distributed * matrix argument A(IA:*,JA:JA+K-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+K-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PDGEQRF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( NqA0 + MpA0 + NB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICTXT, IINFO, IPW, J, JB, JL, $ JN, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PDLARFB, $ PDLARFT, PDLASET, PDORG2R, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORGQR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * IPW = DESCA( NB_ )*DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JL = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * CALL PDLASET( 'All', JL-JA, JA+N-JL, ZERO, ZERO, A, IA, JL, $ DESCA ) * * Use unblocked code for the last or only block. * CALL PDORG2R( M-JL+JA, JA+N-JL, JA+K-JL, A, IA+JL-JA, JL, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Is there at least one block of columns to loop over ? * IF( JL.GT.JN+1 ) THEN * * Use blocked code * DO 10 J = JL-DESCA( NB_ ), JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * IF( J+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PDLARFT( 'Forward', 'Columnwise', M-I+IA, JB, A, I, $ J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(i:ia+m-1,j+jb:ja+n-1) from the left * CALL PDLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-I+IA, N-J-JB+JA, JB, A, I, $ J, DESCA, WORK, A, I, J+JB, DESCA, $ WORK( IPW ) ) END IF * * Apply H to rows i:ia+m-1 of current block * CALL PDORG2R( M-I+IA, JB, JB, A, I, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows ia:i-1 of current block to zero * CALL PDLASET( 'All', I-IA, JB, ZERO, ZERO, A, IA, J, DESCA ) * 10 CONTINUE * END IF * * Handle first block separately * IF( JL.GT.JA ) THEN * JB = JN - JA + 1 * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PDLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:ia+m-1,ja+jb:ja+n-1) from the left * CALL PDLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M, N-JB, JB, A, IA, JA, DESCA, WORK, A, IA, $ JA+JB, DESCA, WORK( IPW ) ) * * Apply H to rows ia:ia+m-1 of current block * CALL PDORG2R( M, JB, JB, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * END IF * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORGQR * END scalapack-2.0.2/SRC/pdorgr2.f000644 000766 000024 00000025646 10363532303 016101 0ustar00juliestaff000000 000000 SUBROUTINE PDORGR2( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORGR2 generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as the * last M rows of a product of K elementary reflectors of order N * * Q = H(1) H(2) . . . H(k) * * as returned by PDGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA+M-K <= i <= IA+M-1, as * returned by PDGERQF in the K rows of its distributed * matrix argument A(IA+M-K:IA+M-1,JA:*). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCr(IA+M-1) * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDGERQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NqA0 + MAX( 1, MpA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, II, LWMIN, MP, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 DOUBLE PRECISION TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PDELSET, $ PDLARF, PDLASET, PDSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQA0 + MAX( 1, MPA0 ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORGR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * IF( K.LT.M ) THEN * * Initialise rows ia:ia+m-k-1 to rows of the unit matrix * CALL PDLASET( 'All', M-K, N-M, ZERO, ZERO, A, IA, JA, DESCA ) CALL PDLASET( 'All', M-K, M, ZERO, ONE, A, IA, JA+N-M, DESCA ) * END IF * TAUI = ZERO MP = NUMROC( IA+M-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) * DO 10 I = IA+M-K, IA+M-1 * * Apply H(i) to A(ia:i,ja:ja+n-k+i-1) from the right * CALL PDELSET( A, I, JA+N-M+I-IA, DESCA, ONE ) CALL PDLARF( 'Right', I-IA, I-IA+N-M+1, A, I, JA, DESCA, $ DESCA( M_ ), TAU, A, IA, JA, DESCA, WORK ) II = INDXG2L( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IAROW = INDXG2P( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IF( MYROW.EQ.IAROW ) $ TAUI = TAU( MIN( II, MP ) ) CALL PDSCAL( I-IA+N-M, -TAUI, A, I, JA, DESCA, DESCA( M_ ) ) CALL PDELSET( A, I, JA+N-M+I-IA, DESCA, ONE-TAUI ) * * Set A(i,ja+n-m+i-ia+1:ja+n-1) to zero * CALL PDLASET( 'All', 1, IA+M-1-I, ZERO, ZERO, A, I, $ JA+N-M+I-IA+1, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORGR2 * END scalapack-2.0.2/SRC/pdorgrq.f000644 000766 000024 00000027013 10363532303 016166 0ustar00juliestaff000000 000000 SUBROUTINE PDORGRQ( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORGRQ generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as the * last M rows of a product of K elementary reflectors of order N * * Q = H(1) H(2) . . . H(k) * * as returned by PDGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. * N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA+M-K <= i <= IA+M-1, as * returned by PDGERQF in the K rows of its distributed * matrix argument A(IA+M-K:IA+M-1,JA:*). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCr(IA+M-1) * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDGERQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( MpA0 + NqA0 + MB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IN, IPW, $ LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PDLARFB, $ PDLARFT, PDLASET, PDORGR2, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORGRQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) )*DESCA( MB_ ), IA+M-1 ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * * Set A(ia:in,ja+n-m+in-ia+1:ja-n+1) to zero. * CALL PDLASET( 'All', IN-IA+1, M-IN+IA-1, ZERO, ZERO, A, IA, $ JA+N-M+IN-IA+1, DESCA ) * * Use unblocked code for the first or only block. * CALL PDORGR2( IN-IA+1, N-M+IN-IA+1, IN-IA-M+K+1, A, IA, JA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Use blocked code * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PDLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, I, JA, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:i-1,ja:ja+n-m+i+ib-ia-1) from the right * CALL PDLARFB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ I-IA, N-M+I+IB-IA, IB, A, I, JA, DESCA, WORK, A, $ IA, JA, DESCA, WORK( IPW ) ) * * Apply H' to columns ja:ja+n-m+i+ib-ia-1 of current block * CALL PDORGR2( IB, N-M+I+IB-IA, IB, A, I, JA, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows i:i+ib-1,ja+n-m+i+ib-ia:ja+n-1 of current block to * zero * CALL PDLASET( 'All', IB, M-I-IB+IA, ZERO, ZERO, A, I, $ JA+N-M+I+IB-IA, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORGRQ * END scalapack-2.0.2/SRC/pdorm2l.f000644 000766 000024 00000040472 10363532303 016073 0ustar00juliestaff000000 000000 SUBROUTINE PDORM2L( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORM2L overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PDGEQLF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PDGEQLF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ), * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PDGEQLF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( 1, NqC0 ); * if SIDE = 'R', LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( * NUMROC( N+ICOFFC,NB_A,0,0,NPCOL ),NB_A,0,0,LCMQ ) ); * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICCOL, ICOFFC, ICROW, ICTXT, ICC, $ II, IROFFA, IROFFC, J, J1, J2, J3, JCC, JJ, $ LCM, LCMQ, LWMIN, MI, MP, MPC0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 DOUBLE PRECISION AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DGEBR2D, $ DGEBS2D, DGERV2D, DGESD2D, DSCAL, $ INFOG2L, PDELSET, PDELSET2, PDLARF, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MPC0 + MAX( 1, NQC0 ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = NQC0 + MAX( MAX( 1, MPC0 ), NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ) ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORM2L', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, ICC, $ JCC, ICROW, ICCOL ) IF( LEFT ) THEN IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JC+N-1, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) THEN AJJ = ONE - TAU( JJ ) CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1 ) CALL DSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1, $ IAROW, IACOL ) CALL DSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN AJJ = ONE - TAU( JJ ) END IF * IF( IACOL.NE.ICCOL ) THEN IF( MYCOL.EQ.IACOL ) $ CALL DGESD2D( ICTXT, 1, 1, AJJ, 1, MYROW, ICCOL ) IF( MYCOL.EQ.ICCOL ) $ CALL DGERV2D( ICTXT, 1, 1, AJJ, 1, MYROW, IACOL ) END IF * IF( MYCOL.EQ.ICCOL ) THEN MP = NUMROC( IC+M-1, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) CALL DSCAL( MP-ICC+1, AJJ, C( ICC+(JCC-1)* $ DESCC( LLD_ ) ), 1 ) END IF * END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( LEFT .AND. NOTRAN .OR. .NOT.LEFT .AND. .NOT.NOTRAN ) THEN J1 = JA J2 = JA+K-1 J3 = 1 ELSE J1 = JA+K-1 J2 = JA J3 = -1 END IF * IF( LEFT ) THEN NI = N IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M END IF * DO 10 J = J1, J2, J3 * IF( LEFT ) THEN * * H(j) or H(j)' is applied to C(ic:ic+m-k+j-ja,jc:jc+n-1) * MI = M - K + J - JA + 1 ELSE * * H(j) or H(j)' is applied to C(ic:ic+m-1,jc:jc+n-k+j-ja) * NI = N - K + J - JA + 1 END IF * * Apply H(j) or H(j)' * CALL PDELSET2( AJJ, A, IA+NQ-K+J-JA, J, DESCA, ONE ) CALL PDLARF( SIDE, MI, NI, A, IA, J, DESCA, 1, TAU, C, IC, $ JC, DESCC, WORK ) CALL PDELSET( A, IA+NQ-K+J-JA, J, DESCA, AJJ ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORM2L * END scalapack-2.0.2/SRC/pdorm2r.f000644 000766 000024 00000040630 10363532303 016075 0ustar00juliestaff000000 000000 SUBROUTINE PDORM2R( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORM2R overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of k elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PDGEQRF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PDGEQRF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ); * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PDGEQRF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( 1, NqC0 ); * if SIDE = 'R', LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( * NUMROC( N+ICOFFC,NB_A,0,0,NPCOL ),NB_A,0,0,LCMQ ) ); * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICCOL, ICOFFC, ICROW, ICTXT, ICC, $ II, IROFFA, IROFFC, J, J1, J2, J3, JCC, JJ, $ LCM, LCMQ, LWMIN, MI, MP, MPC0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 DOUBLE PRECISION AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DGEBR2D, $ DGEBS2D, DGERV2D, DGESD2D, DSCAL, $ INFOG2L, PDELSET, PDELSET2, PDLARF, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MPC0 + MAX( 1, NQC0 ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = NQC0 + MAX( MAX( 1, MPC0 ), NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ) ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORM2R', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, ICC, $ JCC, ICROW, ICCOL ) IF( LEFT ) THEN IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JC+N-1, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) THEN AJJ = ONE - TAU( JJ ) CALL DGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1 ) CALL DSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1, $ IAROW, IACOL ) CALL DSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN AJJ = ONE - TAU( JJ ) END IF * IF( IACOL.NE.ICCOL ) THEN IF( MYCOL.EQ.IACOL ) $ CALL DGESD2D( ICTXT, 1, 1, AJJ, 1, MYROW, ICCOL ) IF( MYCOL.EQ.ICCOL ) $ CALL DGERV2D( ICTXT, 1, 1, AJJ, 1, MYROW, IACOL ) END IF * IF( MYCOL.EQ.ICCOL ) THEN MP = NUMROC( IC+M-1, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) CALL DSCAL( MP-ICC+1, AJJ, C( ICC+(JCC-1)* $ DESCC( LLD_ ) ), 1 ) END IF * END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) THEN J1 = JA J2 = JA+K-1 J3 = 1 ELSE J1 = JA+K-1 J2 = JA J3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M ICC = IC END IF * DO 10 J = J1, J2, J3 IF( LEFT ) THEN * * H(j) or H(j)' is applied to C(ic+j-ja:ic+m-1,jc:jc+n-1) * MI = M - J + JA ICC = IC + J - JA ELSE * * H(j) or H(j)' is applied to C(ic:ic+m-1,jc+j-ja:jc+n-1) * NI = N - J + JA JCC = JC + J - JA END IF * * Apply H(j) or H(j)' * CALL PDELSET2( AJJ, A, IA+J-JA, J, DESCA, ONE ) CALL PDLARF( SIDE, MI, NI, A, IA+J-JA, J, DESCA, 1, TAU, C, $ ICC, JCC, DESCC, WORK ) CALL PDELSET( A, IA+J-JA, J, DESCA, AJJ ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORM2R * END scalapack-2.0.2/SRC/pdormbr.f000644 000766 000024 00000054356 10363532303 016167 0ustar00juliestaff000000 000000 SUBROUTINE PDORMBR( VECT, SIDE, TRANS, M, N, K, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, VECT INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * If VECT = 'Q', PDORMBR overwrites the general real distributed M-by-N * matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * If VECT = 'P', PDORMBR overwrites sub( C ) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': P * sub( C ) sub( C ) * P * TRANS = 'T': P**T * sub( C ) sub( C ) * P**T * * Here Q and P**T are the orthogonal distributed matrices determined by * PDGEBRD when reducing a real distributed matrix A(IA:*,JA:*) to * bidiagonal form: A(IA:*,JA:*) = Q * B * P**T. Q and P**T are defined * as products of elementary reflectors H(i) and G(i) respectively. * * Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the * order of the orthogonal matrix Q or P**T that is applied. * * If VECT = 'Q', A(IA:*,JA:*) is assumed to have been an NQ-by-K * matrix: * if nq >= k, Q = H(1) H(2) . . . H(k); * if nq < k, Q = H(1) H(2) . . . H(nq-1). * * If VECT = 'P', A(IA:*,JA:*) is assumed to have been a K-by-NQ * matrix: * if k < nq, P = G(1) G(2) . . . G(k); * if k >= nq, P = G(1) G(2) . . . G(nq-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * VECT (global input) CHARACTER * = 'Q': apply Q or Q**T; * = 'P': apply P or P**T. * * SIDE (global input) CHARACTER * = 'L': apply Q, Q**T, P or P**T from the Left; * = 'R': apply Q, Q**T, P or P**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q or P; * = 'T': Transpose, apply Q**T or P**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * If VECT = 'Q', the number of columns in the original * distributed matrix reduced by PDGEBRD. * If VECT = 'P', the number of rows in the original * distributed matrix reduced by PDGEBRD. * K >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+MIN(NQ,K)-1)) if * VECT='Q', and (LLD_A,LOCc(JA+NQ-1)) if VECT = 'P'. NQ = M * if SIDE = 'L', and NQ = N otherwise. The vectors which * define the elementary reflectors H(i) and G(i), whose * products determine the matrices Q and P, as returned by * PDGEBRD. * If VECT = 'Q', LLD_A >= max(1,LOCr(IA+NQ-1)); * if VECT = 'P', LLD_A >= max(1,LOCr(IA+MIN(NQ,K)-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(NQ,K)-1) if VECT = 'Q', LOCr(IA+MIN(NQ,K)-1) if * VECT = 'P', TAU(i) must contain the scalar factor of the * elementary reflector H(i) or G(i), which determines Q or P, * as returned by PDGEBRD in its array argument TAUQ or TAUP. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, if VECT='Q', sub( C ) is overwritten by Q*sub( C ) * or Q'*sub( C ) or sub( C )*Q' or sub( C )*Q; if VECT='P, * sub( C ) is overwritten by P*sub( C ) or P'*sub( C ) or * sub( C )*P or sub( C )*P'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * NQ = M; * if( (VECT = 'Q' and NQ >= K) or (VECT <> 'Q' and NQ > K) ), * IAA=IA; JAA=JA; MI=M; NI=N; ICC=IC; JCC=JC; * else * IAA=IA+1; JAA=JA; MI=M-1; NI=N; ICC=IC+1; JCC=JC; * end if * else if SIDE = 'R', * NQ = N; * if( (VECT = 'Q' and NQ >= K) or (VECT <> 'Q' and NQ > K) ), * IAA=IA; JAA=JA; MI=M; NI=N; ICC=IC; JCC=JC; * else * IAA=IA; JAA=JA+1; MI=M; NI=N-1; ICC=IC; JCC=JC+1; * end if * end if * * If VECT = 'Q', * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * else if VECT <> 'Q', * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( MI+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * end if * * where LCMP = LCM / NPROW, LCMQ = LCM / NPCOL, with * LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JAA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( MI+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If VECT = 'Q', * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * else * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL APPLYQ, LEFT, LQUERY, NOTRAN CHARACTER TRANST INTEGER IAA, IACOL, IAROW, ICC, ICCOL, ICOFFA, ICOFFC, $ ICROW, ICTXT, IINFO, IROFFA, IROFFC, JAA, JCC, $ LCM, LCMP, LCMQ, LWMIN, MI, MPC0, MQA0, MYCOL, $ MYROW, NI, NPA0, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDORMLQ, $ PDORMQR, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(1000+CTXT_) ELSE APPLYQ = LSAME( VECT, 'Q' ) LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q or P * IF( LEFT ) THEN NQ = M IF( ( APPLYQ .AND. NQ.GE.K ) .OR. $ ( .NOT.APPLYQ .AND. NQ.GT.K ) ) THEN IAA = IA JAA = JA MI = M NI = N ICC = IC JCC = JC ELSE IAA = IA + 1 JAA = JA MI = M - 1 NI = N ICC = IC + 1 JCC = JC END IF * IF( APPLYQ ) THEN CALL CHK1MAT( M, 4, K, 6, IA, JA, DESCA, 10, INFO ) ELSE CALL CHK1MAT( K, 6, M, 4, IA, JA, DESCA, 10, INFO ) END IF ELSE NQ = N IF( ( APPLYQ .AND. NQ.GE.K ) .OR. $ ( .NOT.APPLYQ .AND. NQ.GT.K ) ) THEN IAA = IA JAA = JA MI = M NI = N ICC = IC JCC = JC ELSE IAA = IA JAA = JA + 1 MI = M NI = N - 1 ICC = IC JCC = JC + 1 END IF * IF( APPLYQ ) THEN CALL CHK1MAT( N, 5, K, 6, IA, JA, DESCA, 10, INFO ) ELSE CALL CHK1MAT( K, 6, N, 5, IA, JA, DESCA, 10, INFO ) END IF END IF CALL CHK1MAT( M, 4, N, 5, IC, JC, DESCC, 15, INFO ) * IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) ICOFFA = MOD( JAA-1, DESCA( NB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JAA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( APPLYQ ) THEN IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF ELSE * IF( LEFT ) THEN MQA0 = NUMROC( MI+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ MI+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.APPLYQ .AND. .NOT.LSAME( VECT, 'P' ) ) THEN INFO = -1 ELSE IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -2 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -3 ELSE IF( K.LT.0 ) THEN INFO = -6 ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( APPLYQ .AND. LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( APPLYQ .AND. LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -13 ELSE IF( .NOT.APPLYQ .AND. LEFT .AND. $ ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ IROFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( APPLYQ .AND. LEFT .AND. $ DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( .NOT.APPLYQ .AND. LEFT .AND. $ DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF * IF( APPLYQ ) THEN IDUM1( 1 ) = ICHAR( 'Q' ) ELSE IDUM1( 1 ) = ICHAR( 'P' ) END IF IDUM2( 1 ) = 1 IF( LEFT ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'R' ) END IF IDUM2( 2 ) = 2 IF( NOTRAN ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'T' ) END IF IDUM2( 3 ) = 3 IDUM1( 4 ) = K IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( APPLYQ ) THEN IF( LEFT ) THEN CALL PCHK2MAT( M, 4, K, 6, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( N, 5, K, 6, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) END IF ELSE IF( LEFT ) THEN CALL PCHK2MAT( K, 6, M, 4, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( K, 6, N, 5, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMBR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( APPLYQ ) THEN * * Apply Q * IF( NQ.GE.K ) THEN * * Q was determined by a call to PDGEBRD with nq >= k * CALL PDORMQR( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) ELSE IF( NQ.GT.1 ) THEN * * Q was determined by a call to PDGEBRD with nq < k * CALL PDORMQR( SIDE, TRANS, MI, NI, NQ-1, A, IA+1, JA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) END IF ELSE * * Apply P * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF IF( NQ.GT.K ) THEN * * P was determined by a call to PDGEBRD with nq > k * CALL PDORMLQ( SIDE, TRANST, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) ELSE IF( NQ.GT.1 ) THEN * * P was determined by a call to PDGEBRD with nq <= k * CALL PDORMLQ( SIDE, TRANST, MI, NI, NQ-1, A, IA, JA+1, $ DESCA, TAU, C, ICC, JCC, DESCC, WORK, LWORK, $ IINFO ) END IF END IF * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMBR * END scalapack-2.0.2/SRC/pdormhr.f000644 000766 000024 00000036502 10363532303 016166 0ustar00juliestaff000000 000000 SUBROUTINE PDORMHR( SIDE, TRANS, M, N, ILO, IHI, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, IHI, ILO, INFO, JA, JC, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORMHR overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix of order nq, with * nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the * product of IHI-ILO elementary reflectors, as returned by PDGEHRD: * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * ILO and IHI must have the same values as in the previous call * of PDGEHRD. Q is equal to the unit matrix except in the * distributed submatrix Q(ia+ilo:ia+ihi-1,ia+ilo:ja+ihi-1). * If SIDE = 'L', 1 <= ILO <= IHI <= max(1,M); * if SIDE = 'R', 1 <= ILO <= IHI <= max(1,N); * ILO and IHI are relative indexes. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE = 'R'. The vectors which * define the elementary reflectors, as returned by PDGEHRD. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+M-2) * if SIDE = 'L', and LOCc(JA+N-2) if SIDE = 'R'. This array * contains the scalar factors TAU(j) of the elementary * reflectors H(j) as returned by PDGEHRD. TAU is tied to * the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * * IAA = IA + ILO; JAA = JA+ILO-1; * If SIDE = 'L', * MI = IHI-ILO; NI = N; ICC = IC + ILO; JCC = JC; * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * MI = M; NI = IHI-ILO; ICC = IC; JCC = JC + ILO; * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN INTEGER IAA, IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, $ IINFO, IROFFA, IROFFC, JAA, JCC, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NH, NI, NPA0, $ NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDORMQR, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 NH = IHI - ILO IF( NPROW.EQ.-1 ) THEN INFO = -(1000+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) IAA = IA + ILO JAA = JA + ILO - 1 * * NQ is the order of Q * IF( LEFT ) THEN NQ = M MI = NH NI = N ICC = IC + ILO JCC = JC CALL CHK1MAT( M, 3, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N MI = M NI = NH ICC = IC JCC = JC + ILO CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, NQ ) ) THEN INFO = -5 ELSE IF( IHI.LT.MIN( ILO, NQ ) .OR. IHI.GT.NQ ) THEN INFO = -6 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = ILO IDUM2( 3 ) = 5 IDUM1( 4 ) = IHI IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, M, 3, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMHR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. NH.EQ.0 ) $ RETURN * CALL PDORMQR( SIDE, TRANS, MI, NI, NH, A, IAA, JAA, DESCA, TAU, $ C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMHR * END scalapack-2.0.2/SRC/pdorml2.f000644 000766 000024 00000035223 10363532303 016071 0ustar00juliestaff000000 000000 SUBROUTINE PDORML2( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORML2 overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PDGELQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= max(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PDGELQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDGELQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IROFFC, JCC, LCM, LCMP, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPCOL, $ NPROW, NQ, NQC0 DOUBLE PRECISION AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PDELSET, $ PDELSET2, PDLARF, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORML2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN .OR. .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K -1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC ELSE MI = M ICC = IC CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(i:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H(i) or H(i)' * CALL PDELSET2( AII, A, I, JA+I-IA, DESCA, ONE ) CALL PDLARF( SIDE, MI, NI, A, I, JA+I-IA, DESCA, DESCA( M_ ), $ TAU, C, ICC, JCC, DESCC, WORK ) CALL PDELSET( A, I, JA+I-IA, DESCA, AII ) * 10 CONTINUE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORML2 * END scalapack-2.0.2/SRC/pdormlq.f000644 000766 000024 00000042020 10363532303 016161 0ustar00juliestaff000000 000000 SUBROUTINE PDORMLQ( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORMLQ overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PDGELQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= max(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PDGELQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDGELQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, JCC, $ LCM, LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDLARFB, $ PDLARFT, PDORML2, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMLQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC ELSE MI = M ICC = IC CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. NOTRAN ) .OR. ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) $ CALL PDORML2( SIDE, TRANS, M, N, I1-IA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PDLARFT( 'Forward', 'Rowwise', NQ-I+IA, IB, A, I, JA+I-IA, $ DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+i-ia:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H or H' * CALL PDLARFB( SIDE, TRANST, 'Forward', 'Rowwise', MI, NI, IB, $ A, I, JA+I-IA, DESCA, WORK, C, ICC, JCC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. ( .NOT.LEFT .AND. NOTRAN ) ) $ CALL PDORML2( SIDE, TRANS, M, N, I2-IA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMLQ * END scalapack-2.0.2/SRC/pdormql.f000644 000766 000024 00000042254 10363532303 016172 0ustar00juliestaff000000 000000 SUBROUTINE PDORMQL( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORMQL overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PDGEQLF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PDGEQLF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ), * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PDGEQLF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( N+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IAROW, ICCOL, ICOFFC, ICROW, ICTXT, IINFO, IPW, $ IROFFA, IROFFC, J, J1, J2, J3, JB, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPA0, NPCOL, $ NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDLARFB, $ PDLARFT, PDORM2L, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMQL', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN J1 = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+K-1 ) + 1 J2 = JA+K-1 J3 = DESCA( NB_ ) ELSE J1 = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) J2 = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+K-1 ) + 1 J3 = -DESCA( NB_ ) END IF * IF( LEFT ) THEN NI = N IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M END IF * * Use unblocked code for the first block if necessary * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN JB = J1 - JA IF( LEFT ) THEN MI = M - K + JB ELSE NI = N - K + JB END IF CALL PDORM2L( SIDE, TRANS, MI, NI, JB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 DO 10 J = J1, J2, J3 JB = MIN( DESCA( NB_ ), K-J+JA ) * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PDLARFT( 'Backward', 'Columnwise', NQ-K+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic:ic+m-k+j+jb-ja-1,jc:jc+n-1) * MI = M - K + J + JB - JA ELSE * * H or H' is applied to C(ic:ic+m-1,jc:jc+n-k+j+jb-ja-1) * NI = N - K + J + JB - JA END IF * * Apply H or H' * CALL PDLARFB( SIDE, TRANS, 'Backward', 'Columnwise', MI, NI, $ JB, A, IA, J, DESCA, WORK, C, IC, JC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN JB = J2 - JA IF( LEFT ) THEN MI = M - K + JB ELSE NI = N - K + JB END IF CALL PDORM2L( SIDE, TRANS, MI, NI, JB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMQL * END scalapack-2.0.2/SRC/pdormqr.f000644 000766 000024 00000042067 10363532303 016202 0ustar00juliestaff000000 000000 SUBROUTINE PDORMQR( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORMQR overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of k elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PDGEQRF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PDGEQRF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ); * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PDGEQRF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( N+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, IINFO, $ IPW, IROFFA, IROFFC, J, J1, J2, J3, JB, JCC, $ LCM, LCMQ, LWMIN, MI, MPC0, MYCOL, MYROW, NI, $ NPA0, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDLARFB, $ PDLARFT, PDORM2R, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMQR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN J1 = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) $ + 1 J2 = JA+K-1 J3 = DESCA( NB_ ) ELSE J1 = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) J2 = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) $ + 1 J3 = -DESCA( NB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M ICC = IC END IF * * Use unblocked code for the first block if necessary * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. ( .NOT.LEFT .AND. NOTRAN ) ) $ CALL PDORM2R( SIDE, TRANS, M, N, J1-JA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 DO 10 J = J1, J2, J3 JB = MIN( DESCA( NB_ ), K-J+JA ) * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PDLARFT( 'Forward', 'Columnwise', NQ-J+JA, JB, A, $ IA+J-JA, J, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+j-ja:ic+m-1,jc:jc+n-1) * MI = M - J + JA ICC = IC + J - JA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+j-ja:jc+n-1) * NI = N - J + JA JCC = JC + J - JA END IF * * Apply H or H' * CALL PDLARFB( SIDE, TRANS, 'Forward', 'Columnwise', MI, NI, $ JB, A, IA+J-JA, J, DESCA, WORK, C, ICC, JCC, $ DESCC, WORK( IPW ) ) 10 CONTINUE * * Use unblocked code for the last block if necessary * IF( ( LEFT .AND. NOTRAN ) .OR. ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) $ CALL PDORM2R( SIDE, TRANS, M, N, J2-JA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMQR * END scalapack-2.0.2/SRC/pdormr2.f000644 000766 000024 00000034526 10363532303 016104 0ustar00juliestaff000000 000000 SUBROUTINE PDORMR2( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORMR2 overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PDGERQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PDGERQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDGERQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICCOL, ICOFFA, ICOFFC, $ ICROW, ICTXT, IROFFC, LCM, LCMP, LWMIN, MI, $ MPC0, MYCOL, MYROW, NI, NPCOL, NPROW, NQ, NQC0 DOUBLE PRECISION AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PDELSET, $ PDELSET2, PDLARF, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K - 1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N ELSE MI = M CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(ic:ic+m-k+i-ia,jc:jc+n-1) * MI = M - K + I - IA + 1 ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc:jc+n-k+i-ia+1) * NI = N - K + I - IA + 1 END IF * * Apply H(i) or H(i)' * CALL PDELSET2( AII, A, I, JA+NQ-K+I-IA, DESCA, ONE ) CALL PDLARF( SIDE, MI, NI, A, I, JA, DESCA, DESCA( M_ ), $ TAU, C, IC, JC, DESCC, WORK ) CALL PDELSET( A, I, JA+NQ-K+I-IA, DESCA, AII ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMR2 * END scalapack-2.0.2/SRC/pdormr3.f000644 000766 000024 00000035126 10363532303 016102 0ustar00juliestaff000000 000000 SUBROUTINE PDORMR3( SIDE, TRANS, M, N, K, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, L, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORMR3 overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PDTZRZF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PDTZRZF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDTZRZF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IROFFC, JAA, JCC, LCM, $ LCMP, LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPCOL, $ NPROW, NQ, NQC0 * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PDLARZ, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( L.LT.0 .OR. L.GT.NQ ) THEN INFO = -6 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMR3', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K - 1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC JAA = JA + M - L ELSE MI = M ICC = IC JAA = JA + N - L CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(ic+i-ia:icc+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H(i) or H(i)' * CALL PDLARZ( SIDE, MI, NI, L, A, I, JAA, DESCA, DESCA( M_ ), $ TAU, C, ICC, JCC, DESCC, WORK ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMR3 * END scalapack-2.0.2/SRC/pdormrq.f000644 000766 000024 00000043036 11663037655 016215 0ustar00juliestaff000000 000000 SUBROUTINE PDORMRQ( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORMRQ overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PDGERQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PDGERQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDGERQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN, RIGHT, TRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, LCM, $ LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, MYROW, NI, $ NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDLARFB, $ PDLARFT, PDORMR2, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE IF( LSAME( SIDE, 'L' ) ) THEN LEFT = .TRUE. RIGHT = .FALSE. ELSE LEFT = .FALSE. RIGHT = .TRUE. END IF IF( LSAME( TRANS, 'N' ) ) THEN NOTRAN = .TRUE. TRAN = .FALSE. ELSE NOTRAN = .FALSE. TRAN = .TRUE. END IF * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 9, M, 3, N, 4, $ IC, JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 9, M, 3, N, 4, $ IC, JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMRQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N ELSE MI = M CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I1 - IA IF( LEFT ) THEN MI = M - K + IB ELSE NI = N - K + IB END IF CALL PDORMR2( SIDE, TRANS, MI, NI, IB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PDLARFT( 'Backward', 'Rowwise', NQ-K+I+IB-IA, IB, $ A, I, JA, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic:ic+m-k+i+ib-ia-1,jc:jc+n-1) * MI = M - K + I + IB - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc:jc+n-k+i+ib-ia-1) * NI = N - K + I + IB - IA END IF * * Apply H or H' * CALL PDLARFB( SIDE, TRANST, 'Backward', 'Rowwise', MI, NI, $ IB, A, I, JA, DESCA, WORK, C, IC, JC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( RIGHT .AND. TRAN ) .OR. $ ( LEFT .AND. NOTRAN ) ) THEN IB = I2 - IA IF( LEFT ) THEN MI = M - K + IB ELSE NI = N - K + IB END IF CALL PDORMR2( SIDE, TRANS, MI, NI, IB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMRQ * END scalapack-2.0.2/SRC/pdormrz.f000644 000766 000024 00000043330 10363532303 016205 0ustar00juliestaff000000 000000 SUBROUTINE PDORMRZ( SIDE, TRANS, M, N, K, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, L, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORMRZ overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PDTZRZF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PDTZRZF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PDTZRZF. * TAU is tied to the distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, JAA, $ JCC, LCM, LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, $ MYROW, NI, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDLARZB, $ PDLARZT, PDORMR3, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -6 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IDUM1( 4 ) = L IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMRZ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC JAA = JA + M - L ELSE MI = M ICC = IC JAA = JA + N - L CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I1 - IA IF( LEFT ) THEN MI = M ELSE NI = N END IF CALL PDORMR3( SIDE, TRANS, MI, NI, IB, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PDLARZT( 'Backward', 'Rowwise', L, IB, A, I, JAA, DESCA, $ TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+i-ia:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H or H' * CALL PDLARZB( SIDE, TRANST, 'Backward', 'Rowwise', MI, NI, IB, $ L, A, I, JAA, DESCA, WORK, C, ICC, JCC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I2 - IA IF( LEFT ) THEN MI = M ELSE NI = N END IF CALL PDORMR3( SIDE, TRANS, MI, NI, IB, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMRZ * END scalapack-2.0.2/SRC/pdormtr.f000644 000766 000024 00000040443 10363532303 016201 0ustar00juliestaff000000 000000 SUBROUTINE PDORMTR( SIDE, UPLO, TRANS, M, N, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, UPLO INTEGER IA, IC, INFO, JA, JC, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDORMTR overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix of order nq, with * nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the * product of nq-1 elementary reflectors, as returned by PDSYTRD: * * if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1); * * if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(IA:*,JA:*) contains elementary * reflectors from PDSYTRD; * = 'L': Lower triangle of A(IA:*,JA:*) contains elementary * reflectors from PDSYTRD. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * or (LLD_A,LOCc(JA+N-1)) if SIDE = 'R'. The vectors which * define the elementary reflectors, as returned by PDSYTRD. * If SIDE = 'L', LLD_A >= max(1,LOCr(IA+M-1)); * if SIDE = 'R', LLD_A >= max(1,LOCr(IA+N-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) DOUBLE PRECISION array, dimension LTAU, where * if SIDE = 'L' and UPLO = 'U', LTAU = LOCc(M_A), * if SIDE = 'L' and UPLO = 'L', LTAU = LOCc(JA+M-2), * if SIDE = 'R' and UPLO = 'U', LTAU = LOCc(N_A), * if SIDE = 'R' and UPLO = 'L', LTAU = LOCc(JA+N-2). * TAU(i) must contain the scalar factor of the elementary * reflector H(i), as returned by PDSYTRD. TAU is tied to the * distributed matrix A. * * C (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * * If UPLO = 'U', * IAA = IA, JAA = JA+1, ICC = IC, JCC = JC; * else UPLO = 'L', * IAA = IA+1, JAA = JA; * if SIDE = 'L', * ICC = IC+1; JCC = JC; * else * ICC = IC; JCC = JC+1; * end if * end if * * If SIDE = 'L', * MI = M-1; NI = N; * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * MI = M; MI = N-1; * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN, UPPER INTEGER IAA, IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, $ IINFO, IROFFA, IROFFC, JAA, JCC, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPA0, NPCOL, $ NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDORMQL, $ PDORMQR, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) UPPER = LSAME( UPLO, 'U' ) * IF( UPPER ) THEN IAA = IA JAA = JA+1 ICC = IC JCC = JC ELSE IAA = IA+1 JAA = JA IF( LEFT ) THEN ICC = IC + 1 JCC = JC ELSE ICC = IC JCC = JC + 1 END IF END IF * * NQ is the order of Q * IF( LEFT ) THEN NQ = M MI = M - 1 NI = N CALL CHK1MAT( MI, 4, NQ-1, 4, IAA, JAA, DESCA, 9, INFO ) ELSE NQ = N MI = M NI = N - 1 CALL CHK1MAT( NI, 5, NQ-1, 5, IAA, JAA, DESCA, 9, INFO ) END IF CALL CHK1MAT( MI, 4, NI, 5, ICC, JCC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -3 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 IF( NOTRAN ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'T' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( MI, 4, NQ-1, 4, IAA, JAA, DESCA, 9, MI, 4, $ NI, 5, ICC, JCC, DESCC, 14, 4, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( NI, 5, NQ-1, 5, IAA, JAA, DESCA, 9, MI, 4, $ NI, 5, ICC, JCC, DESCC, 14, 4, IDUM1, IDUM2, $ INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDORMTR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. NQ.EQ.1 ) $ RETURN * IF( UPPER ) THEN * * Q was determined by a call to PDSYTRD with UPLO = 'U' * CALL PDORMQL( SIDE, TRANS, MI, NI, NQ-1, A, IAA, JAA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * ELSE * * Q was determined by a call to PDSYTRD with UPLO = 'L' * CALL PDORMQR( SIDE, TRANS, MI, NI, NQ-1, A, IAA, JAA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * END IF * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDORMTR * END scalapack-2.0.2/SRC/pdpbsv.f000644 000766 000024 00000045100 10363532303 016003 0ustar00juliestaff000000 000000 SUBROUTINE PDPBSV( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PDPBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * banded symmetric positive definite distributed * matrix with bandwidth BW. * * Cholesky factorization is used to factor a reordering of * the matrix into L L'. * * See PDPBTRF and PDPBTRS for details. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (NB+2*bw)*bw * +max((bw*NRHS), bw*bw) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PDPBTRF, PDPBTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PDPBTRF and PDPBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PDPBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (NB+2*BW)*BW * * Factor the matrix * CALL PDPBTRF( UPLO, N, BW, A, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PDPBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PDPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPBSV', -INFO ) RETURN END IF * RETURN * * End of PDPBSV * END scalapack-2.0.2/SRC/pdpbtrf.f000644 000766 000024 00000141475 11750130340 016156 0ustar00juliestaff000000 000000 SUBROUTINE PDPBTRF( UPLO, N, BW, A, JA, DESCA, AF, LAF, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), AF( * ), WORK( * ) * .. * * * Purpose * ======= * * PDPBTRF computes a Cholesky factorization * of an N-by-N real banded * symmetric positive definite distributed matrix * with bandwidth BW: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PDPBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = U' U , if UPLO = 'U', or * * P A(1:N, JA:JA+N-1) P^T = L L', if UPLO = 'L' * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDPBTRF and this is stored in AF. If a linear system * is to be solved using PDPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * bw*bw * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM1, IDUM3, JA_NEW, $ LAF_MIN, LEVEL_DIST, LLDA, MBW2, MYCOL, MYROW, $ MY_NUM_COLS, NB, NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, OFST, PART_OFFSET, PART_SIZE, $ PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE, $ STORE_N_A, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DAXPY, $ DESC_CONVERT, DGEMM, DGERV2D, DGESD2D, DLAMOV, $ DLATCPY, DPBTRF, DPOTRF, DSYRK, DTBTRS, DTRMM, $ DTRRV2D, DTRSD2D, DTRSM, DTRTRS, GLOBCHK, $ IGAMX2D, IGEBR2D, IGEBS2D, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 6*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * * Pre-calculate bw^2 * MBW2 = BW*BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -10 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 6*100+6 ) END IF * IF( ( BW.GT.N-1 ) .OR. ( BW.LT.0 ) ) THEN INFO = -3 END IF * IF( LLDA.LT.( BW+1 ) ) THEN INFO = -( 6*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 6*100+4 ) END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 6*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PDPBTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*BW ) ) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, 'PDPBTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = ( NB+2*BW )*BW * IF( LAF.LT.LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PDPBTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = BW*BW * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, 'PDPBTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 10 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDPBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 120 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW END IF * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * * Zero out space for work * DO 20 I = 1, WORK_SIZE_MIN WORK( I ) = ZERO 20 CONTINUE * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL.GT.0 ) THEN PREV_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) PREV_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL-1, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NPCOL-1 ) THEN NEXT_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL+1, 0, $ NPCOL ) ) NEXT_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL DTRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, A( OFST+ODD_SIZE*LLDA+( BW+ $ 1 ) ), LLDA-1, 0, MYCOL+1 ) * END IF * * * Factor main partition A_i = L_i {L_i}^T in each processor * CALL DPBTRF( UPLO, ODD_SIZE, BW, A( OFST+1 ), LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 30 END IF * * IF( MYCOL.LT.NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * transpose the connection block in preparation. * CALL DLATCPY( 'U', BW, BW, A( ( OFST+( BW+1 )+( ODD_SIZE- $ BW )*LLDA ) ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW ) * * Perform the triangular system solve {L_i}{{B'}_i}^T = {B_i}^T * CALL DTRTRS( 'L', 'N', 'N', BW, BW, $ A( OFST+1+( ODD_SIZE-BW )*LLDA ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO ) * * * transpose resulting block to its location * in main storage. * CALL DLATCPY( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), $ BW, A( ( OFST+( BW+1 )+( ODD_SIZE-BW )* $ LLDA ) ), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^T * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL DSYRK( UPLO, 'T', BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE, $ A( OFST+1+ODD_SIZE*LLDA ), LLDA-1 ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 30 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bw)*bw+1 ) and use for * off_diagonal block of reduced system. * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL DTRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M, $ PREV_TRI_SIZE_N, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF( INFO.EQ.0 ) THEN * * Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ . * CALL DTBTRS( 'L', 'N', 'N', ODD_SIZE, BW, BW, $ A( OFST+1 ), LLDA, AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = G_i{G_i}^T * CALL DSYRK( 'L', 'T', BW, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, ZERO, AF( 1+( ODD_SIZE+2*BW )*BW ), $ BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL DGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * {F_i}^T = {H_i}{{B'}_i}^T * * Copy matrix H_i (the last bw cols of G_i) to AF storage * as per requirements of BLAS routine DTRMM. * Since we have G_i^T stored, transpose * H_i^T to H_i. * CALL DLATCPY( 'N', BW, BW, AF( ODD_SIZE-BW+1 ), $ ODD_SIZE, AF( ( ODD_SIZE )*BW+1 ), BW ) * CALL DTRMM( 'R', 'U', 'T', 'N', BW, BW, -ONE, $ A( ( OFST+( BW+1 )+( ODD_SIZE-BW )* $ LLDA ) ), LLDA-1, AF( ( ODD_SIZE )*BW+1 ), $ BW ) * * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, $ 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 110 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 60 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL DGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0, $ MYCOL-1 ) * END IF * * Copy last diagonal block into AF storage for subsequent * operations. * CALL DLAMOV( 'N', BW, BW, A( OFST+ODD_SIZE*LLDA+1 ), LLDA-1, $ AF( ODD_SIZE*BW+MBW2+1 ), BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ 0, MYCOL+1 ) * * Add contribution to diagonal block * CALL DAXPY( MBW2, ONE, AF( ODD_SIZE*BW+2*MBW2+1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 40 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 50 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL DGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * CALL DAXPY( MBW2, ONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL DGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * CALL DAXPY( MBW2, ONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 40 50 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL DPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), BW, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL DLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL DGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0, $ COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL DTRSM( 'L', 'L', 'N', 'N', BW, BW, ONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL DSYRK( 'L', 'T', BW, BW, -ONE, AF( ( ODD_SIZE )*BW+1 ), $ BW, ZERO, WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL DGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * CALL DTRSM( 'R', 'L', 'T', 'N', BW, BW, ONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL DSYRK( 'L', 'N', BW, BW, -ONE, $ AF( ( ODD_SIZE+2*BW )*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL DGEMM( 'N', 'N', BW, BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW, ZERO, WORK( 1 ), $ BW ) * * Send contribution to offdiagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 60 CONTINUE * ELSE * * CASE UPLO = 'U' * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL.GT.0 ) THEN PREV_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) PREV_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL-1, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NPCOL-1 ) THEN NEXT_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL+1, 0, $ NPCOL ) ) NEXT_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) END IF * * * * Factor main partition A_i^T = U_i {U_i}^T in each processor * CALL DPBTRF( UPLO, ODD_SIZE, BW, A( OFST+1 ), LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 70 END IF * * IF( MYCOL.LT.NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * Move the connection block in preparation. * CALL DLAMOV( 'L', BW, BW, A( ( OFST+1+ODD_SIZE*LLDA ) ), $ LLDA-1, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW ) * * * Perform the triangular solve {L_i}{{B'}_i}^T = {B_i}^T * CALL DTRTRS( 'U', 'T', 'N', BW, BW, $ A( OFST+BW+1+( ODD_SIZE-BW )*LLDA ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO ) * * Move the resulting block back to its location in main storage. * CALL DLAMOV( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), $ BW, A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i^T = {C_i}^T-{{B'}_i}^T{{B'}_i} * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL DSYRK( UPLO, 'T', BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE, $ A( OFST+BW+1+ODD_SIZE*LLDA ), LLDA-1 ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 70 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bw)*bw+1 ) and use for * off_diagonal block of reduced system. * * Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ . * * * Copy D block into AF storage for solve. * CALL DLATCPY( 'L', PREV_TRI_SIZE_N, PREV_TRI_SIZE_M, $ A( OFST+1 ), LLDA-1, AF( 1 ), ODD_SIZE ) * IF( INFO.EQ.0 ) THEN * CALL DTBTRS( 'U', 'T', 'N', ODD_SIZE, BW, BW, $ A( OFST+1 ), LLDA, AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = G_i{G_i}^T * CALL DSYRK( 'L', 'T', BW, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, ZERO, AF( 1+( ODD_SIZE+2*BW )*BW ), $ BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL DGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * {F_i}^T = {H_i}{{B'}_i}^T * * Copy matrix H_i (the last bw cols of G_i) to AF storage * as per requirements of BLAS routine DTRMM. * Since we have G_i^T stored, transpose * H_i^T to H_i. * CALL DLATCPY( 'N', BW, BW, AF( ODD_SIZE-BW+1 ), $ ODD_SIZE, AF( ( ODD_SIZE )*BW+1 ), BW ) * CALL DTRMM( 'R', 'L', 'N', 'N', BW, BW, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ AF( ( ODD_SIZE )*BW+1 ), BW ) * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, $ 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 110 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 100 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL DGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0, $ MYCOL-1 ) * END IF * * Transpose last diagonal block into AF storage for subsequent * operations. * CALL DLATCPY( 'U', BW, BW, A( OFST+ODD_SIZE*LLDA+1+BW ), $ LLDA-1, AF( ODD_SIZE*BW+MBW2+1 ), BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ 0, MYCOL+1 ) * * Add contribution to diagonal block * CALL DAXPY( MBW2, ONE, AF( ODD_SIZE*BW+2*MBW2+1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 80 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 90 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL DGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * CALL DAXPY( MBW2, ONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL DGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * CALL DAXPY( MBW2, ONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 80 90 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL DPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), BW, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL DLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL DGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0, $ COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL DTRSM( 'L', 'L', 'N', 'N', BW, BW, ONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL DSYRK( 'L', 'T', BW, BW, -ONE, AF( ( ODD_SIZE )*BW+1 ), $ BW, ZERO, WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL DGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * CALL DTRSM( 'R', 'L', 'T', 'N', BW, BW, ONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL DSYRK( 'L', 'N', BW, BW, -ONE, $ AF( ( ODD_SIZE+2*BW )*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL DGEMM( 'N', 'N', BW, BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW, ZERO, WORK( 1 ), $ BW ) * * Send contribution to offdiagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 100 CONTINUE * END IF * 110 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 120 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PDPBTRF * END scalapack-2.0.2/SRC/pdpbtrs.f000644 000766 000024 00000063225 10363532303 016173 0ustar00juliestaff000000 000000 SUBROUTINE PDPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, $ AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PDPBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PDPBTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * banded symmetric positive definite distributed * matrix with bandwidth BW. * Depending on the value of UPLO, A stores either U or L in the equn * A(1:N, JA:JA+N-1) = U'*U or L*L' as computed by PDPBTRF. * * Routine PDPBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDPBTRF and this is stored in AF. If a linear system * is to be solved using PDPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (bw*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NP_SAVE, PART_OFFSET, $ RETURN_CODE, STORE_M_B, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PDPBTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 7*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 10*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 10*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 10*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 10*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -14 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 7*100+6 ) END IF * IF( ( BW.GT.N-1 ) .OR. ( BW.LT.0 ) ) THEN INFO = -3 END IF * IF( LLDA.LT.( BW+1 ) ) THEN INFO = -( 7*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 7*100+4 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 10*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 10*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -3 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -6 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 7*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PDPBTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*BW ) ) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, 'PDPBTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * WORK_SIZE_MIN = ( BW*NRHS ) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, 'PDPBTRS: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB( 5 ) PARAM_CHECK( 15, 1 ) = DESCB( 4 ) PARAM_CHECK( 14, 1 ) = DESCB( 3 ) PARAM_CHECK( 13, 1 ) = DESCB( 2 ) PARAM_CHECK( 12, 1 ) = DESCB( 1 ) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA( 5 ) PARAM_CHECK( 9, 1 ) = DESCA( 4 ) PARAM_CHECK( 8, 1 ) = DESCA( 3 ) PARAM_CHECK( 7, 1 ) = DESCA( 1 ) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1005 PARAM_CHECK( 15, 2 ) = 1004 PARAM_CHECK( 14, 2 ) = 1003 PARAM_CHECK( 13, 2 ) = 1002 PARAM_CHECK( 12, 2 ) = 1001 PARAM_CHECK( 11, 2 ) = 9 PARAM_CHECK( 10, 2 ) = 705 PARAM_CHECK( 9, 2 ) = 704 PARAM_CHECK( 8, 2 ) = 703 PARAM_CHECK( 7, 2 ) = 701 PARAM_CHECK( 6, 2 ) = 6 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDPBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 20 END IF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PDPBTRSV( 'L', 'N', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PDPBTRSV( 'U', 'T', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * END IF * * Call backsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PDPBTRSV( 'L', 'T', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PDPBTRSV( 'U', 'N', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * END IF 10 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 20 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PDPBTRS * END scalapack-2.0.2/SRC/pdpbtrsv.f000644 000766 000024 00000141021 11750130340 016344 0ustar00juliestaff000000 000000 SUBROUTINE PDPBTRSV( UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B, $ IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PDPBTRSV solves a banded triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a banded * triangular matrix factor produced by the * Cholesky factorization code PDPBTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T * is dictated by the user by the parameter TRANS. * * Routine PDPBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDPBTRF and this is stored in AF. If a linear system * is to be solved using PDPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (bw*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MBW2, MYCOL, MYROW, MY_NUM_COLS, NB, NP, $ NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ DGEMM, DGERV2D, DGESD2D, DLAMOV, DMATADD, $ DTBTRS, DTRMM, DTRTRS, GLOBCHK, PXERBLA, $ RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 11*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 11*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 11*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 11*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * * Pre-calculate bw^2 * MBW2 = BW*BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -2 END IF * IF( LWORK.LT.-1 ) THEN INFO = -14 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -3 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 8*100+6 ) END IF * IF( ( BW.GT.N-1 ) .OR. ( BW.LT.0 ) ) THEN INFO = -4 END IF * IF( LLDA.LT.( BW+1 ) ) THEN INFO = -( 8*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 8*100+4 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 11*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 11*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -5 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -7 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 8*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PDPBTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*BW ) ) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, 'PDPBTRSV, D&C alg.: NB too small', $ -INFO ) RETURN END IF * * WORK_SIZE_MIN = BW*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, 'PDPBTRSV: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB( 5 ) PARAM_CHECK( 16, 1 ) = DESCB( 4 ) PARAM_CHECK( 15, 1 ) = DESCB( 3 ) PARAM_CHECK( 14, 1 ) = DESCB( 2 ) PARAM_CHECK( 13, 1 ) = DESCB( 1 ) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA( 5 ) PARAM_CHECK( 10, 1 ) = DESCA( 4 ) PARAM_CHECK( 9, 1 ) = DESCA( 3 ) PARAM_CHECK( 8, 1 ) = DESCA( 1 ) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BW PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 14 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDPBTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 180 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW END IF * * * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * IF( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL DTBTRS( UPLO, 'N', 'N', ODD_SIZE, BW, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL DLAMOV( 'N', BW, NRHS, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB, $ WORK( 1 ), BW ) * CALL DTRMM( 'L', 'U', 'N', 'N', BW, NRHS, -ONE, $ A( ( OFST+( BW+1 )+( ODD_SIZE-BW )*LLDA ) ), $ LLDA-1, WORK( 1 ), BW ) * CALL DMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL DGEMM( 'T', 'N', BW, NRHS, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, ZERO, $ WORK( 1+BW-BW ), BW ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL DMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 30 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 10 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 20 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * CALL DMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * CALL DMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 10 20 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL DTRTRS( 'L', 'N', 'N', BW, NRHS, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'T', 'N', BW, NRHS, BW, -ONE, $ AF( ( ODD_SIZE )*BW+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL DGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 30 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 80 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 40 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 50 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 40 50 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL DGEMM( 'T', 'N', BW, NRHS, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, WORK( 1 ), $ BW, ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( ( ODD_SIZE )*BW+1 ), BW, WORK( 1 ), BW, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL DTRTRS( 'L', 'T', 'N', BW, NRHS, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * ***Modification Loop ******* * 60 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 70 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 60 70 CONTINUE * [End of GOTO Loop] * 80 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL DGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -ONE, AF( 1 ), $ ODD_SIZE, WORK( 1+BW-BW ), BW, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL DLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, WORK( 1+BW-BW ), BW ) * CALL DTRMM( 'L', 'U', 'T', 'N', BW, NRHS, -ONE, $ A( ( OFST+( BW+1 )+( ODD_SIZE-BW )*LLDA ) ), $ LLDA-1, WORK( 1+BW-BW ), BW ) * CALL DMATADD( BW, NRHS, ONE, WORK( 1+BW-BW ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL DTBTRS( UPLO, 'T', 'N', ODD_SIZE, BW, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF( LSAME( TRANS, 'T' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL DTBTRS( UPLO, 'T', 'N', ODD_SIZE, BW, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL DLAMOV( 'N', BW, NRHS, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB, $ WORK( 1 ), BW ) * CALL DTRMM( 'L', 'L', 'T', 'N', BW, NRHS, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ WORK( 1 ), BW ) * CALL DMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL DGEMM( 'T', 'N', BW, NRHS, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, ZERO, $ WORK( 1+BW-BW ), BW ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL DMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 110 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 90 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 100 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * CALL DMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * CALL DMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 90 100 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL DTRTRS( 'L', 'N', 'N', BW, NRHS, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'T', 'N', BW, NRHS, BW, -ONE, $ AF( ( ODD_SIZE )*BW+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL DGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 110 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 160 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 120 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 130 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 120 130 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL DGEMM( 'T', 'N', BW, NRHS, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, WORK( 1 ), $ BW, ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( ( ODD_SIZE )*BW+1 ), BW, WORK( 1 ), BW, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL DTRTRS( 'L', 'T', 'N', BW, NRHS, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * ***Modification Loop ******* * 140 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 150 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 140 150 CONTINUE * [End of GOTO Loop] * 160 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL DGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -ONE, AF( 1 ), $ ODD_SIZE, WORK( 1+BW-BW ), BW, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL DLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, WORK( 1+BW-BW ), BW ) * CALL DTRMM( 'L', 'L', 'N', 'N', BW, NRHS, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ WORK( 1+BW-BW ), BW ) * CALL DMATADD( BW, NRHS, ONE, WORK( 1+BW-BW ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL DTBTRS( UPLO, 'N', 'N', ODD_SIZE, BW, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * END IF * End of "IF( LSAME( UPLO, 'L' ) )"... 170 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 180 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PDPBTRSV * END scalapack-2.0.2/SRC/pdpocon.f000644 000766 000024 00000036354 10363532303 016162 0ustar00juliestaff000000 000000 SUBROUTINE PDPOCON( UPLO, N, A, IA, JA, DESCA, ANORM, RCOND, WORK, $ LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LIWORK, LWORK, N DOUBLE PRECISION ANORM, RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDPOCON estimates the reciprocal of the condition number (in the * 1-norm) of a real symmetric positive definite distributed matrix * using the Cholesky factorization A = U**T*U or A = L*L**T computed by * PDPOTRF. * * An estimate is obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), and * the reciprocal of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the factor stored in * A(IA:IA+N-1,JA:JA+N-1) is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). On entry, * this array contains the local pieces of the factors L or U * from the Cholesky factorization A(IA:IA+N-1,JA:JA+N-1) = U'*U * or L*L', as computed by PDPOTRF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ANORM (global input) DOUBLE PRECISION * The 1-norm (or infinity-norm) of the symmetric distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * * RCOND (global output) DOUBLE PRECISION * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + 2*LOCc(N+MOD(JA-1,NB_A))+ * MAX( 2, MAX(NB_A*CEIL(NPROW-1,NPCOL),LOCc(N+MOD(JA-1,NB_A)) + * NB_A*CEIL(NPCOL-1,NPROW)) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr(N+MOD(IA-1,MB_A)). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPNL, IPNU, $ IPV, IPW, IPX, IROFF, IV, IX, IXX, JJA, JV, $ JX, KASE, LIWMIN, LWMIN, MYCOL, MYROW, NP, $ NPCOL, NPROW, NPMOD, NQ, NQMOD DOUBLE PRECISION AINVNM, SCALE, SL, SU, SMLNUM DOUBLE PRECISION WMAX * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGEBR2D, $ DGEBS2D, INFOG2L, PCHK1MAT, PDAMAX, $ PDLATRS, PDLACON, PDRSCL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + 2*NQMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = DBLE( LWMIN ) LIWMIN = NPMOD IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( ANORM.LT.ZERO ) THEN INFO = -7 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN IWORK( 1 ) = LIWMIN INFO = -12 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 IF( LIWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 12 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPOCON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * RCOND = ZERO IF( N.EQ.0 ) THEN RCOND = ONE RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN ELSE IF( N.EQ.1 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPNL = IPV + NP IPNU = IPNL + NQ IPW = IPNU + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Estimate the 1-norm (or I-norm) of inv(A). * AINVNM = ZERO KASE = 0 NORMIN = 'N' * 10 CONTINUE CALL PDLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), IX, JX, $ DESCX, IWORK, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( UPPER ) THEN * * Multiply by inv(U'). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( 'Upper', 'Transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, WORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL NORMIN = 'Y' * * Multiply by inv(U). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( 'Upper', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, WORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(L). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( 'Lower', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, WORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL NORMIN = 'Y' * * Multiply by inv(L'). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( 'Lower', 'Transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, WORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF * * Multiply by 1/SCALE if doing so will not cause overflow. * SCALE = SL*SU IF( SCALE.NE.ONE ) THEN CALL PDAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL DGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1, $ IAROW, MYCOL ) END IF END IF IF( SCALE.LT.ABS( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PDRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / AINVNM ) / ANORM * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PDPOCON * END scalapack-2.0.2/SRC/pdpoequ.f000644 000766 000024 00000031265 10363532303 016171 0ustar00juliestaff000000 000000 SUBROUTINE PDPOEQU( N, A, IA, JA, DESCA, SR, SC, SCOND, AMAX, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, N DOUBLE PRECISION AMAX, SCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), SC( * ), SR( * ) * .. * * Purpose * ======= * * PDPOEQU computes row and column scalings intended to * equilibrate a distributed symmetric positive definite matrix * sub( A ) = A(IA:IA+N-1,JA:JA+N-1) and reduce its condition number * (with respect to the two-norm). SR and SC contain the scale * factors, S(i) = 1/sqrt(A(i,i)), chosen so that the scaled distri- * buted matrix B with elements B(i,j) = S(i)*A(i,j)*S(j) has ones on * the diagonal. This choice of SR and SC puts the condition number * of B within a factor N of the smallest possible condition number * over all possible diagonal scalings. * * The scaling factor are stored along process rows in SR and along * process columns in SC. The duplication of information simplifies * greatly the application of the factors. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of local dimension ( LLD_A, LOCc(JA+N-1) ), the * N-by-N symmetric positive definite distributed matrix * sub( A ) whose scaling factors are to be computed. Only the * diagonal elements of sub( A ) are referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * SR (local output) DOUBLE PRECISION array, dimension LOCr(M_A) * If INFO = 0, SR(IA:IA+N-1) contains the row scale factors * for sub( A ). SR is aligned with the distributed matrix A, * and replicated across every process column. SR is tied to the * distributed matrix A. * * SC (local output) DOUBLE PRECISION array, dimension LOCc(N_A) * If INFO = 0, SC(JA:JA+N-1) contains the column scale factors * for A(IA:IA+M-1,JA:JA+N-1). SC is aligned with the distribu- * ted matrix A, and replicated down every process row. SC is * tied to the distributed matrix A. * * SCOND (global output) DOUBLE PRECISION * If INFO = 0, SCOND contains the ratio of the smallest SR(i) * (or SC(j)) to the largest SR(i) (or SC(j)), with * IA <= i <= IA+N-1 and JA <= j <= JA+N-1. If SCOND >= 0.1 * and AMAX is neither too large nor too small, it is not worth * scaling by SR (or SC). * * AMAX (global output) DOUBLE PRECISION * Absolute value of largest matrix element. If AMAX is very * close to overflow or very close to underflow, the matrix * should be scaled. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the K-th diagonal entry of sub( A ) is * nonpositive. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. CHARACTER ALLCTOP, COLCTOP, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURCOL, ICURROW, $ IDUMM, II, IIA, IOFFA, IOFFD, IROFF, J, JB, JJ, $ JJA, JN, LDA, LL, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ DOUBLE PRECISION AII, SMIN * .. * .. Local Arrays .. INTEGER DESCSC( DLEN_ ), DESCSR( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMN2D, $ DGAMX2D, DGSUM2D, IGAMN2D, INFOG2L, $ PCHK1MAT, PB_TOPGET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(500+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, INFO ) CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, 0, IDUMM, IDUMM, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPOEQU', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN SCOND = ONE AMAX = ZERO RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'All', ALLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * * Compute some local indexes * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) * * Assign descriptors for SR and SC arrays * CALL DESCSET( DESCSR, N, 1, DESCA( MB_ ), 1, 0, 0, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCSC, 1, N, 1, DESCA( NB_ ), 0, 0, ICTXT, 1 ) * * Initialize the scaling factors to zero. * DO 10 II = IIA, IIA+NP-1 SR( II ) = ZERO 10 CONTINUE * DO 20 JJ = JJA, JJA+NQ-1 SC( JJ ) = ZERO 20 CONTINUE * * Find the minimum and maximum diagonal elements. * Handle first block separately. * II = IIA JJ = JJA JB = JN-JA+1 SMIN = ONE / PDLAMCH( ICTXT, 'S' ) AMAX = ZERO * IOFFA = II+(JJ-1)*LDA IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN IOFFD = IOFFA DO 30 LL = 0, JB-1 AII = A( IOFFD ) SR( II+LL ) = AII SC( JJ+LL ) = AII SMIN = MIN( SMIN, AII ) AMAX = MAX( AMAX, AII ) IF( AII.LE.ZERO .AND. INFO.EQ.0 ) $ INFO = LL + 1 IOFFD = IOFFD + LDA + 1 30 CONTINUE END IF * IF( MYROW.EQ.IAROW ) THEN II = II + JB IOFFA = IOFFA + JB END IF IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + JB IOFFA = IOFFA + JB*LDA END IF ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 50 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFD = IOFFA DO 40 LL = 0, JB-1 AII = A( IOFFD ) SR( II+LL ) = AII SC( JJ+LL ) = AII SMIN = MIN( SMIN, AII ) AMAX = MAX( AMAX, AII ) IF( AII.LE.ZERO .AND. INFO.EQ.0 ) $ INFO = J + LL - JA + 1 IOFFD = IOFFD + LDA + 1 40 CONTINUE END IF * IF( MYROW.EQ.ICURROW ) THEN II = II + JB IOFFA = IOFFA + JB END IF IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + JB IOFFA = IOFFA + JB*LDA END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 50 CONTINUE * * Compute scaling factors * CALL DGSUM2D( ICTXT, 'Columnwise', COLCTOP, 1, NQ, SC( JJA ), $ 1, -1, MYCOL ) CALL DGSUM2D( ICTXT, 'Rowwise', ROWCTOP, NP, 1, SR( IIA ), $ MAX( 1, NP ), -1, MYCOL ) * CALL DGAMX2D( ICTXT, 'All', ALLCTOP, 1, 1, AMAX, 1, IDUMM, IDUMM, $ -1, -1, MYCOL ) CALL DGAMN2D( ICTXT, 'All', ALLCTOP, 1, 1, SMIN, 1, IDUMM, IDUMM, $ -1, -1, MYCOL ) * IF( SMIN.LE.ZERO ) THEN * * Find the first non-positive diagonal element and return. * CALL IGAMN2D( ICTXT, 'All', ALLCTOP, 1, 1, INFO, 1, II, JJ, -1, $ -1, MYCOL ) RETURN * ELSE * * Set the scale factors to the reciprocals * of the diagonal elements. * DO 60 II = IIA, IIA+NP-1 SR( II ) = ONE / SQRT( SR( II ) ) 60 CONTINUE * DO 70 JJ = JJA, JJA+NQ-1 SC( JJ ) = ONE / SQRT( SC( JJ ) ) 70 CONTINUE * * Compute SCOND = min(S(I)) / max(S(I)) * SCOND = SQRT( SMIN ) / SQRT( AMAX ) * END IF * RETURN * * End of PDPOEQU * END scalapack-2.0.2/SRC/pdporfs.f000644 000766 000024 00000100627 10363532303 016170 0ustar00juliestaff000000 000000 SUBROUTINE PDPORFS( UPLO, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, B, IB, JB, DESCB, X, IX, JX, DESCX, $ FERR, BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, $ LIWORK, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IWORK( * ) DOUBLE PRECISION A( * ), AF( * ), B( * ), $ BERR( * ), FERR( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PDPORFS improves the computed solution to a system of linear * equations when the coefficient matrix is symmetric positive definite * and provides error bounds and backward error estimates for the * solutions. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) DOUBLE PRECISION pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1) ). * This array contains the local pieces of the N-by-N symmetric * distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input) DOUBLE PRECISION pointer into the local memory * to an array of local dimension (LLD_AF,LOCc(JA+N-1)). * On entry, this array contains the factors L or U from the * Cholesky factorization sub( A ) = L*L**T or U**T*U, as * computed by PDPOTRF. * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * B (local input) DOUBLE PRECISION pointer into the local memory * to an array of local dimension (LLD_B, LOCc(JB+NRHS-1) ). * On entry, this array contains the the local pieces of the * right hand sides sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input) DOUBLE PRECISION pointer into the local memory * to an array of local dimension (LLD_X, LOCc(JX+NRHS-1) ). * On entry, this array contains the the local pieces of the * solution vectors sub( X ). On exit, it contains the * improved solution vectors. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). * The estimated forward error bound for each solution vector * of sub( X ). If XTRUE is the true solution corresponding * to sub( X ), FERR is an estimated upper bound for the * magnitude of the largest element in (sub( X ) - XTRUE) * divided by the magnitude of the largest element in sub( X ). * The estimate is as reliable as the estimate for RCOND, and * is almost always a slight overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*LOCr( N + MOD( IA-1, MB_A ) ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr( N + MOD( IB-1, MB_B ) ). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices op( A ) and op( AF ) (respectively * sub( X ) and sub( B ) ) should be distributed the same way on the * same processes. These conditions ensure that sub( A ) and sub( AF ) * (resp. sub( X ) and sub( B ) ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( AF ), sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IAF, DESCAF( MB_ ) ) = f( JAF, DESCAF( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) DOUBLE PRECISION TWO, THREE PARAMETER ( TWO = 2.0D+0, THREE = 3.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER COUNT, IACOL, IAFCOL, IAFROW, IAROW, IXBCOL, $ IXBROW, IXCOL, IXROW, ICOFFA, ICOFFAF, ICOFFB, $ ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, IIW, $ IOFFXB, IPB, IPR, IPV, IROFFA, IROFFAF, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LIWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ DOUBLE PRECISION EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMX2D, $ DGEBR2D, DGEBS2D, INFOG2L, PCHK2MAT, $ PDASYMV, PDAXPY, PDCOPY, PDSYMV, $ PDPOTRS, PDLACON, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, N, 2, IAF, JAF, DESCAF, 11, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 15, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IX, JX, DESCX, 19, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFAF = MOD( JAF-1, DESCAF( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFCOL = INDXG2P( JAF, DESCAF( NB_ ), MYCOL, $ DESCAF( CSRC_ ), NPCOL ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 3 * NPMOD LIWMIN = NPMOD WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700 + NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCAF( MB_ ) ) THEN INFO = -( 1100 + MB_ ) ELSE IF( IROFFAF.NE.0 .OR. IAROW.NE.IAFROW ) THEN INFO = -9 ELSE IF( DESCA( NB_ ).NE.DESCAF( NB_ ) ) THEN INFO = -( 1100 + NB_ ) ELSE IF( ICOFFAF.NE.0 .OR. IACOL.NE.IAFCOL ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -( 1100 + CTXT_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -13 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1500 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1500 + CTXT_ ) ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 1900 + MB_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -17 ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 1900 + NB_ ) ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -18 ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 1900 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -25 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IDUM1( 2 ) = N IDUM2( 2 ) = 2 IDUM1( 3 ) = NRHS IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 23 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 25 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, N, 2, IAF, $ JAF, DESCAF, 11, 0, IDUM1, IDUM2, INFO ) CALL PCHK2MAT( N, 2, NRHS, 3, IB, JB, DESCB, 15, N, 2, NRHS, 3, $ IX, JX, DESCX, 19, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPORFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = IPB + NP0 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = 1 + maximum number of nonzero entries in each row of sub( A ) * NZ = N + 1 EPS = PDLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) ) * DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 100 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 20 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X) * CALL PDCOPY( N, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PDSYMV( UPLO, N, -ONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, ONE, WORK( IPR ), IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i))/(abs(sub(A))*abs(sub(X))+abs(sub(B)) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to * the i-th components of the numerator and denominator * before dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIXB, IIXB + NP - 1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 30 CONTINUE END IF END IF * CALL PDASYMV( UPLO, N, ONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, ONE, WORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 40 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J) is larger than machine epsilon, and * 2) BERR(J) decreased by at least a factor of 2 during the * last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PDPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PDAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, IX, $ JX+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 20 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(sub(A)))* * ( abs(R) + * NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B)) ))) / norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(sub(A)) is the inverse of sub(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(sub(A))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(sub(A))*abs(sub(X)) + abs(sub(B)) is less than SAFE2. * * Use PDLACON to estimate the infinity-norm of the matrix * inv(sub(A)) * diag(W), where * W = abs(R) + NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B))))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 50 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 50 CONTINUE END IF END IF * KASE = 0 60 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PDLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(sub(A)'). * CALL PDPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF ELSE * * Multiply by inv(sub(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 80 CONTINUE END IF END IF * CALL PDPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 60 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 90 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 90 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 100 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 200 J = JN+1, JB+NRHS-1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 190 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 110 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub( B ) - sub( A )*sub( X ). * CALL PDCOPY( N, B, IB, J+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PDSYMV( UPLO, N, -ONE, A, IA, JA, DESCA, X, IX, J+K, $ DESCX, 1, ONE, WORK( IPR ), IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / * ( abs(sub(A))*abs(sub(X)) + abs(sub(B)) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIXB, IIXB+NP-1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 120 CONTINUE END IF END IF * CALL PDASYMV( UPLO, N, ONE, A, IA, JA, DESCA, X, IX, J+K, $ DESCX, 1, ONE, WORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 )THEN DO 130 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 130 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.ICURCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during * the last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. $ COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PDPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PDAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, $ IX, J+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 110 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(sub(A)))* * ( abs(R) + NZ*EPS*( * abs(sub(A))*abs(sub(X))+abs(sub(B)) )))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(sub(A)) is the inverse of sub(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of abs(R)+NZ*EPS*(abs(sub(A))*abs(sub(X)) * +abs(sub(B))) is incremented by SAFE1 if the i-th component * of abs(sub(A))*abs(sub(X)) + abs(sub(B)) is less than SAFE2. * * Use PDLACON to estimate the infinity-norm of the matrix * inv(sub(A)) * diag(W), where * W = abs(R) + NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B))))) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 140 CONTINUE END IF END IF * KASE = 0 150 CONTINUE IF( MYCOL.EQ.ICURCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, ICURCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PDLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = ICURCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(sub(A)'). * CALL PDPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 160 CONTINUE END IF END IF ELSE * * Multiply by inv(sub(A))*diag(W). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 170 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 170 CONTINUE END IF END IF * CALL PDPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 150 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 180 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 180 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, $ IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 190 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 200 CONTINUE * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PDPORFS * END scalapack-2.0.2/SRC/pdposv.f000644 000766 000024 00000024341 10363532303 016024 0ustar00juliestaff000000 000000 SUBROUTINE PDPOSV( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Purpose * ======= * * PDPOSV computes the solution to a real system of linear equations * * sub( A ) * X = sub( B ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is an N-by-N * symmetric distributed positive definite matrix and X and sub( B ) * denoting B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS distributed * matrices. * * The Cholesky decomposition is used to factor sub( A ) as * * sub( A ) = U**T * U, if UPLO = 'U', or * * sub( A ) = L * L**T, if UPLO = 'L', * * where U is an upper triangular matrix and L is a lower triangular * matrix. The factored form of sub( A ) is then used to solve the * system of equations. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if INFO = 0, this array contains the * local pieces of the factor U or L from the Cholesky factori- * zation sub( A ) = U**T*U or L*L**T. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_B,LOC(JB+NRHS-1)). * On entry, the local pieces of the right hand sides distribu- * ted matrix sub( B ). On exit, if INFO = 0, sub( B ) is over- * written with the solution distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed, and the * solution has not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDPOTRF, $ PDPOTRS, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1000+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, $ 3, IB, JB, DESCB, 11, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPOSV', -INFO ) RETURN END IF * * Compute the Cholesky factorization sub( A ) = U'*U or L*L'. * CALL PDPOTRF( UPLO, N, A, IA, JA, DESCA, INFO ) * IF( INFO.EQ.0 ) THEN * * Solve the system sub( A ) * X = sub( B ) overwriting sub( B ) * with X. * CALL PDPOTRS( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * END IF * RETURN * * End of PDPOSV * END scalapack-2.0.2/SRC/pdposvx.f000644 000766 000024 00000065757 10363532303 016234 0ustar00juliestaff000000 000000 SUBROUTINE PDPOSVX( FACT, UPLO, N, NRHS, A, IA, JA, DESCA, AF, $ IAF, JAF, DESCAF, EQUED, SR, SC, B, IB, JB, $ DESCB, X, IX, JX, DESCX, RCOND, FERR, BERR, $ WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER EQUED, FACT, UPLO INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, LIWORK, $ LWORK, N, NRHS DOUBLE PRECISION RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IWORK( * ) DOUBLE PRECISION A( * ), AF( * ), $ B( * ), BERR( * ), FERR( * ), $ SC( * ), SR( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PDPOSVX uses the Cholesky factorization A = U**T*U or A = L*L**T to * compute the solution to a real system of linear equations * * A(IA:IA+N-1,JA:JA+N-1) * X = B(IB:IB+N-1,JB:JB+NRHS-1), * * where A(IA:IA+N-1,JA:JA+N-1) is an N-by-N matrix and X and * B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS matrices. * * Error bounds on the solution and a condition estimate are also * provided. In the following comments Y denotes Y(IY:IY+M-1,JY:JY+K-1) * a M-by-K matrix where Y can be A, AF, B and X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Description * =========== * * The following steps are performed: * * 1. If FACT = 'E', real scaling factors are computed to equilibrate * the system: * diag(SR) * A * diag(SC) * inv(diag(SC)) * X = diag(SR) * B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(SR)*A*diag(SC) and B by diag(SR)*B. * * 2. If FACT = 'N' or 'E', the Cholesky decomposition is used to * factor the matrix A (after equilibration if FACT = 'E') as * A = U**T* U, if UPLO = 'U', or * A = L * L**T, if UPLO = 'L', * where U is an upper triangular matrix and L is a lower triangular * matrix. * * 3. The factored form of A is used to estimate the condition number * of the matrix A. If the reciprocal of the condition number is * less than machine precision, steps 4-6 are skipped. * * 4. The system of equations is solved for X using the factored form * of A. * * 5. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 6. If equilibration was used, the matrix X is premultiplied by * diag(SR) so that it solves the original system before * equilibration. * * Arguments * ========= * * FACT (global input) CHARACTER * Specifies whether or not the factored form of the matrix A is * supplied on entry, and if not, whether the matrix A should be * equilibrated before it is factored. * = 'F': On entry, AF contains the factored form of A. * If EQUED = 'Y', the matrix A has been equilibrated * with scaling factors given by S. A and AF will not * be modified. * = 'N': The matrix A will be copied to AF and factored. * = 'E': The matrix A will be equilibrated if necessary, then * copied to AF and factored. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrices B and X. NRHS >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into * the local memory to an array of local dimension * ( LLD_A, LOCc(JA+N-1) ). * On entry, the symmetric matrix A, except if FACT = 'F' and * EQUED = 'Y', then A must contain the equilibrated matrix * diag(SR)*A*diag(SC). If UPLO = 'U', the leading * N-by-N upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. A is not modified if * FACT = 'F' or 'N', or if FACT = 'E' and EQUED = 'N' on exit. * * On exit, if FACT = 'E' and EQUED = 'Y', A is overwritten by * diag(SR)*A*diag(SC). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input or local output) DOUBLE PRECISION pointer * into the local memory to an array of local dimension * ( LLD_AF, LOCc(JA+N-1)). * If FACT = 'F', then AF is an input argument and on entry * contains the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T, in the same storage * format as A. If EQUED .ne. 'N', then AF is the factored form * of the equilibrated matrix diag(SR)*A*diag(SC). * * If FACT = 'N', then AF is an output argument and on exit * returns the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T of the original * matrix A. * * If FACT = 'E', then AF is an output argument and on exit * returns the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T of the equilibrated * matrix A (see the description of A for the form of the * equilibrated matrix). * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * EQUED (global input/global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration (always true if FACT = 'N'). * = 'Y': Equilibration was done, i.e., A has been replaced by * diag(SR) * A * diag(SC). * EQUED is an input variable if FACT = 'F'; otherwise, it is an * output variable. * * SR (local input/local output) DOUBLE PRECISION array, * dimension (LLD_A) * The scale factors for A distributed across process rows; * not accessed if EQUED = 'N'. SR is an input variable if * FACT = 'F'; otherwise, SR is an output variable. * If FACT = 'F' and EQUED = 'Y', each element of SR must be * positive. * * SC (local input/local output) DOUBLE PRECISION array, * dimension (LOC(N_A)) * The scale factors for A distributed across * process columns; not accessed if EQUED = 'N'. SC is an input * variable if FACT = 'F'; otherwise, SC is an output variable. * If FACT = 'F' and EQUED = 'Y', each element of SC must be * positive. * * B (local input/local output) DOUBLE PRECISION pointer into * the local memory to an array of local dimension * ( LLD_B, LOCc(JB+NRHS-1) ). * On entry, the N-by-NRHS right-hand side matrix B. * On exit, if EQUED = 'N', B is not modified; if TRANS = 'N' * and EQUED = 'R' or 'B', B is overwritten by diag(R)*B; if * TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is overwritten * by diag(C)*B. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input/local output) DOUBLE PRECISION pointer into * the local memory to an array of local dimension * ( LLD_X, LOCc(JX+NRHS-1) ). * If INFO = 0, the N-by-NRHS solution matrix X to the original * system of equations. Note that A and B are modified on exit * if EQUED .ne. 'N', and the solution to the equilibrated * system is inv(diag(SC))*X if TRANS = 'N' and EQUED = 'C' or * 'B', or inv(diag(SR))*X if TRANS = 'T' or 'C' and EQUED = 'R' * or 'B'. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * RCOND (global output) DOUBLE PRECISION * The estimate of the reciprocal condition number of the matrix * A after equilibration (if done). If RCOND is less than the * machine precision (in particular, if RCOND = 0), the matrix * is singular to working precision. This condition is * indicated by a return code of INFO > 0, and the solution and * error bounds are not computed. * * FERR (local output) DOUBLE PRECISION array, dimension (LOC(N_B)) * The estimated forward error bounds for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution, FERR(j) bounds the magnitude * of the largest entry in (X(j) - XTRUE) divided by * the magnitude of the largest entry in X(j). The quality of * the error bound depends on the quality of the estimate of * norm(inv(A)) computed in the code; if the estimate of * norm(inv(A)) is accurate, the error bound is guaranteed. * * BERR (local output) DOUBLE PRECISION array, dimension (LOC(N_B)) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any entry of A or B that makes X(j) an exact solution). * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = MAX( PDPOCON( LWORK ), PDPORFS( LWORK ) ) * + LOCr( N_A ). * LWORK = 3*DESCA( LLD_ ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK = DESCA( LLD_ ) * LIWORK = LOCr(N_A). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, and i is * <= N: if INFO = i, the leading minor of order i of A * is not positive definite, so the factorization * could not be completed, and the solution and error * bounds could not be computed. * = N+1: RCOND is less than machine precision. The * factorization has been completed, but the matrix * is singular to working precision, and the solution * and error bounds have not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL EQUIL, LQUERY, NOFACT, RCEQU INTEGER I, IACOL, IAROW, IAFROW, IBROW, IBCOL, ICOFF, $ ICOFFA, ICTXT, IDUMM, IIA, IIB, IIX, INFEQU, $ IROFF, IROFFA, IROFFAF, IROFFB, IROFFX, IXCOL, $ IXROW, J, JJA, JJB, JJX, LDB, LDX, LIWMIN, $ LWMIN, MYCOL, MYROW, NP, NPCOL, NPROW, NRHSQ, $ NQ DOUBLE PRECISION AMAX, ANORM, BIGNUM, SCOND, SMAX, SMIN, SMLNUM * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, $ DGAMN2D, DGAMX2D, INFOG2L, $ PDPOCON, PDPOEQU, PDPORFS, $ PDPOTRF, $ PDPOTRS, PDLACPY, PDLAQSY, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PDLANSY EXTERNAL PDLAMCH, INDXG2P, LSAME, NUMROC, PDLANSY * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(800+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 8, INFO ) IF( LSAME( FACT, 'F' ) ) $ CALL CHK1MAT( N, 3, N, 3, IAF, JAF, DESCAF, 12, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 20, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IIA, JJA, IAROW, IACOL ) NP = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFFA LWMIN = 3*DESCA( LLD_ ) LIWMIN = NP NOFACT = LSAME( FACT, 'N' ) EQUIL = LSAME( FACT, 'E' ) IF( NOFACT .OR. EQUIL ) THEN EQUED = 'N' RCEQU = .FALSE. ELSE RCEQU = LSAME( EQUED, 'Y' ) SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM END IF IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. $ .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( ICOFFA.NE.0 .OR. IROFFA.NE.ICOFFA ) THEN INFO = -7 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(800+NB_) ELSE IF( IAFROW.NE.IAROW ) THEN INFO = -10 ELSE IF( IROFFAF.NE.0 ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -(1200+CTXT_) ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. $ ( RCEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN INFO = -13 ELSE IF( RCEQU ) THEN * SMIN = BIGNUM SMAX = ZERO DO 10 J = IIA, IIA + NP - 1 SMIN = MIN( SMIN, SR( J ) ) SMAX = MAX( SMAX, SR( J ) ) 10 CONTINUE CALL DGAMN2D( ICTXT, 'Columnwise', ' ', 1, 1, SMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, SMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( SMIN.LE.ZERO ) THEN INFO = -14 ELSE IF( N.GT.0 ) THEN SCOND = MAX( SMIN, SMLNUM ) / MIN( SMAX, BIGNUM ) ELSE SCOND = ONE END IF END IF END IF END IF * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IBROW.NE.IAROW ) THEN INFO = -18 ELSE IF( IXROW.NE.IBROW ) THEN INFO = -22 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2000+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(2000+CTXT_) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -(2400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -28 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -30 END IF IDUM1( 1 ) = ICHAR( FACT ) IDUM2( 1 ) = 1 IDUM1( 2 ) = ICHAR( UPLO ) IDUM2( 2 ) = 2 IF( LSAME( FACT, 'F' ) ) THEN IDUM1( 3 ) = ICHAR( EQUED ) IDUM2( 3 ) = 13 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 28 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 30 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, $ 4, IB, JB, DESCB, 19, 5, IDUM1, IDUM2, $ INFO ) ELSE IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 28 IF( LIWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 30 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, $ 4, IB, JB, DESCB, 19, 4, IDUM1, IDUM2, $ INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPOSVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * IF( EQUIL ) THEN * * Compute row and column scalings to equilibrate the matrix A. * CALL PDPOEQU( N, A, IA, JA, DESCA, SR, SC, SCOND, AMAX, $ INFEQU ) * IF( INFEQU.EQ.0 ) THEN * * Equilibrate the matrix. * CALL PDLAQSY( UPLO, N, A, IA, JA, DESCA, SR, SC, SCOND, $ AMAX, EQUED ) RCEQU = LSAME( EQUED, 'Y' ) END IF END IF * * Scale the right-hand side. * CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, $ JJB, IBROW, IBCOL ) LDB = DESCB( LLD_ ) IROFF = MOD( IB-1, DESCB( MB_ ) ) ICOFF = MOD( JB-1, DESCB( NB_ ) ) NP = NUMROC( N+IROFF, DESCB( MB_ ), MYROW, IBROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFF, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) IF( MYROW.EQ.IBROW ) NP = NP-IROFF IF( MYCOL.EQ.IBCOL ) NRHSQ = NRHSQ-ICOFF * IF( RCEQU ) THEN DO 30 J = JJB, JJB+NRHSQ-1 DO 20 I = IIB, IIB+NP-1 B( I + ( J-1 )*LDB ) = SR( I )*B( I + ( J-1 )*LDB ) 20 CONTINUE 30 CONTINUE END IF * IF( NOFACT .OR. EQUIL ) THEN * * Compute the Cholesky factorization A = U'*U or A = L*L'. * CALL PDLACPY( 'Full', N, N, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF ) CALL PDPOTRF( UPLO, N, AF, IAF, JAF, DESCAF, INFO ) * * Return if INFO is non-zero. * IF( INFO.NE.0 ) THEN IF( INFO.GT.0 ) $ RCOND = ZERO RETURN END IF END IF * * Compute the norm of the matrix A. * ANORM = PDLANSY( '1', UPLO, N, A, IA, JA, DESCA, WORK ) * * Compute the reciprocal of the condition number of A. * CALL PDPOCON( UPLO, N, AF, IAF, JAF, DESCAF, ANORM, RCOND, WORK, $ LWORK, IWORK, LIWORK, INFO ) * * Return if the matrix is singular to working precision. * IF( RCOND.LT.PDLAMCH( ICTXT, 'Epsilon' ) ) THEN INFO = IA + N RETURN END IF * * Compute the solution matrix X. * CALL PDLACPY( 'Full', N, NRHS, B, IB, JB, DESCB, X, IX, JX, $ DESCX ) CALL PDPOTRS( UPLO, N, NRHS, AF, IAF, JAF, DESCAF, X, IX, JX, $ DESCX, INFO ) * * Use iterative refinement to improve the computed solution and * compute error bounds and backward error estimates for it. * CALL PDPORFS( UPLO, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, B, IB, JB, DESCB, X, IX, JX, DESCX, FERR, $ BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * Transform the solution matrix X to a solution of the original * system. * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) LDX = DESCX( LLD_ ) IROFF = MOD( IX-1, DESCX( MB_ ) ) ICOFF = MOD( JX-1, DESCX( NB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYROW.EQ.IBROW ) NP = NP-IROFF IF( MYCOL.EQ.IBCOL ) NRHSQ = NRHSQ-ICOFF * IF( RCEQU ) THEN DO 50 J = JJX, JJX+NRHSQ-1 DO 40 I = IIX, IIX+NP-1 X( I + ( J-1 )*LDX ) = SR( I )*X( I + ( J-1 )*LDX ) 40 CONTINUE 50 CONTINUE DO 60 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / SCOND 60 CONTINUE END IF * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN RETURN * * End of PDPOSVX * END scalapack-2.0.2/SRC/pdpotf2.f000644 000766 000024 00000030552 10363532303 016070 0ustar00juliestaff000000 000000 SUBROUTINE PDPOTF2( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDPOTF2 computes the Cholesky factorization of a real symmetric * positive definite distributed matrix sub( A )=A(IA:IA+N-1,JA:JA+N-1). * * The factorization has the form * * sub( A ) = U' * U , if UPLO = 'U', or * * sub( A ) = L * L', if UPLO = 'L', * * where U is an upper triangular matrix and L is lower triangular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires N <= NB_A-MOD(JA-1, NB_A) and square block * decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if UPLO = 'U', the upper triangular * part of the distributed matrix contains the Cholesky factor * U, if UPLO = 'L', the lower triangular part of the distribu- * ted matrix contains the Cholesky factor L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURR, IDIAG, IIA, $ IOFFA, IROFF, J, JJA, LDA, MYCOL, MYROW, $ NPCOL, NPROW DOUBLE PRECISION AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DGEMV, $ DSCAL, IGEBR2D, IGEBS2D, INFOG2L, PB_TOPGET, $ PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MOD, SQRT * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DDOT EXTERNAL LSAME, DDOT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N+ICOFF.GT.DESCA( NB_ ) ) THEN INFO = -2 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPOTF2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Compute local information * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF ( UPPER ) THEN * * Process (IAROW, IACOL) owns block to be factorized * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN * * Compute the Cholesky factorization A = U'*U. * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * DO 10 J = JA, JA+N-1 * * Compute U(J,J) and test for non-positive-definiteness. * AJJ = A( IDIAG ) - $ DDOT( J-JA, A( IOFFA ), 1, A( IOFFA ), 1 ) IF( AJJ.LE.ZERO ) THEN A( IDIAG ) = AJJ INFO = J - JA + 1 GO TO 20 END IF AJJ = SQRT( AJJ ) A( IDIAG ) = AJJ * * Compute elements J+1:JA+N-1 of row J. * IF( J.LT.JA+N-1 ) THEN ICURR = IDIAG + LDA CALL DGEMV( 'Transpose', J-JA, JA+N-J-1, -ONE, $ A( IOFFA+LDA ), LDA, A( IOFFA ), 1, $ ONE, A( ICURR ), LDA ) CALL DSCAL( N-J+JA-1, ONE / AJJ, A( ICURR ), LDA ) END IF IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + LDA 10 CONTINUE * 20 CONTINUE * * Broadcast INFO to all processes in my IAROW. * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1, $ MYROW, IACOL ) END IF * * IAROW bcasts along columns so that everyone has INFO * CALL IGEBS2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, 1, $ IAROW, MYCOL ) * END IF * ELSE * * Process (IAROW, IACOL) owns block to be factorized * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Compute the Cholesky factorization A = L*L'. * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * DO 30 J = JA, JA+N-1 * * Compute L(J,J) and test for non-positive-definiteness. * AJJ = A( IDIAG ) - $ DDOT( J-JA, A( IOFFA ), LDA, A( IOFFA ), LDA ) IF ( AJJ.LE.ZERO ) THEN A( IDIAG ) = AJJ INFO = J - JA + 1 GO TO 40 END IF AJJ = SQRT( AJJ ) A( IDIAG ) = AJJ * * Compute elements J+1:JA+N-1 of column J. * IF( J.LT.JA+N-1 ) THEN ICURR = IDIAG + 1 CALL DGEMV( 'No transpose', JA+N-J-1, J-JA, -ONE, $ A( IOFFA+1 ), LDA, A( IOFFA ), LDA, $ ONE, A( ICURR ), 1 ) CALL DSCAL( JA+N-J-1, ONE / AJJ, A( ICURR ), 1 ) END IF IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + 1 30 CONTINUE * 40 CONTINUE * * Broadcast INFO to everyone in IACOL * CALL IGEBS2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, $ 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, $ 1, IAROW, MYCOL ) * END IF * * IACOL bcasts INFO along rows so that everyone has it * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1, $ MYROW, IACOL ) * END IF * END IF * RETURN * * End of PDPOTF2 * END scalapack-2.0.2/SRC/pdpotrf.f000644 000766 000024 00000031426 10363532303 016171 0ustar00juliestaff000000 000000 SUBROUTINE PDPOTRF( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDPOTRF computes the Cholesky factorization of an N-by-N real * symmetric positive definite distributed matrix sub( A ) denoting * A(IA:IA+N-1, JA:JA+N-1). * * The factorization has the form * * sub( A ) = U' * U , if UPLO = 'U', or * * sub( A ) = L * L', if UPLO = 'L', * * where U is an upper triangular matrix and L is lower triangular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if UPLO = 'U', the upper triangular * part of the distributed matrix contains the Cholesky factor * U, if UPLO = 'L', the lower triangular part of the distribu- * ted matrix contains the Cholesky factor L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER I, ICOFF, ICTXT, IROFF, J, JB, JN, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PDPOTF2, PDSYRK, PDTRSM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPOTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( UPPER ) THEN * * Split-ring topology for the communication along process * columns, 1-tree topology along process rows. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'S-ring' ) * * A is upper triangular, compute Cholesky factorization A = U'*U. * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA(NB_), JA+N-1 ) JB = JN - JA + 1 * * Perform unblocked Cholesky factorization on JB block * CALL PDPOTF2( UPLO, JB, A, IA, JA, DESCA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 * IF( JB+1.LE.N ) THEN * * Form the row panel of U using the triangular solver * CALL PDTRSM( 'Left', UPLO, 'Transpose', 'Non-Unit', $ JB, N-JB, ONE, A, IA, JA, DESCA, A, IA, JA+JB, $ DESCA ) * * Update the trailing matrix, A = A - U'*U * CALL PDSYRK( UPLO, 'Transpose', N-JB, JB, -ONE, A, IA, $ JA+JB, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) END IF * * Loop over remaining block of columns * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Perform unblocked Cholesky factorization on JB block * CALL PDPOTF2( UPLO, JB, A, I, J, DESCA, INFO ) IF( INFO.NE.0 ) THEN INFO = INFO + J - JA GO TO 30 END IF * IF( J-JA+JB+1.LE.N ) THEN * * Form the row panel of U using the triangular solver * CALL PDTRSM( 'Left', UPLO, 'Transpose', 'Non-Unit', $ JB, N-J-JB+JA, ONE, A, I, J, DESCA, A, $ I, J+JB, DESCA ) * * Update the trailing matrix, A = A - U'*U * CALL PDSYRK( UPLO, 'Transpose', N-J-JB+JA, JB, $ -ONE, A, I, J+JB, DESCA, ONE, A, I+JB, $ J+JB, DESCA ) END IF 10 CONTINUE * ELSE * * 1-tree topology for the communication along process columns, * Split-ring topology along process rows. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * A is lower triangular, compute Cholesky factorization A = L*L' * (right-looking) * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) JB = JN - JA + 1 * * Perform unblocked Cholesky factorization on JB block * CALL PDPOTF2( UPLO, JB, A, IA, JA, DESCA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 * IF( JB+1.LE.N ) THEN * * Form the column panel of L using the triangular solver * CALL PDTRSM( 'Right', UPLO, 'Transpose', 'Non-Unit', $ N-JB, JB, ONE, A, IA, JA, DESCA, A, IA+JB, JA, $ DESCA ) * * Update the trailing matrix, A = A - L*L' * CALL PDSYRK( UPLO, 'No Transpose', N-JB, JB, -ONE, A, IA+JB, $ JA, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * END IF * DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Perform unblocked Cholesky factorization on JB block * CALL PDPOTF2( UPLO, JB, A, I, J, DESCA, INFO ) IF( INFO.NE.0 ) THEN INFO = INFO + J - JA GO TO 30 END IF * IF( J-JA+JB+1.LE.N ) THEN * * Form the column panel of L using the triangular solver * CALL PDTRSM( 'Right', UPLO, 'Transpose', 'Non-Unit', $ N-J-JB+JA, JB, ONE, A, I, J, DESCA, A, I+JB, $ J, DESCA ) * * Update the trailing matrix, A = A - L*L' * CALL PDSYRK( UPLO, 'No Transpose', N-J-JB+JA, JB, -ONE, $ A, I+JB, J, DESCA, ONE, A, I+JB, J+JB, $ DESCA ) * END IF 20 CONTINUE * END IF * 30 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PDPOTRF * END scalapack-2.0.2/SRC/pdpotri.f000644 000766 000024 00000017161 10363532303 016174 0ustar00juliestaff000000 000000 SUBROUTINE PDPOTRI( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDPOTRI computes the inverse of a real symmetric positive definite * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the * Cholesky factorization sub( A ) = U**T*U or L*L**T computed by * PDPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor U or L * from the Cholesky factorization of the distributed matrix * sub( A ) = U**T*U or L*L**T, as computed by PDPOTRF. * On exit, the local pieces of the upper or lower triangle of * the (symmetric) inverse of sub( A ), overwriting the input * factor U or L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, the (i,i) element of the factor U or L is * zero, and the inverse could not be computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER ICOFF, ICTXT, IROFF, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PDLAUUM, $ PDTRTRI, PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.NE.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPOTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Invert the triangular Cholesky factor U or L. * CALL PDTRTRI( UPLO, 'Non-unit', N, A, IA, JA, DESCA, INFO ) * IF( INFO.GT.0 ) $ RETURN * * Form inv(U)*inv(U)' or inv(L)'*inv(L). * CALL PDLAUUM( UPLO, N, A, IA, JA, DESCA ) * RETURN * * End of PDPOTRI * END scalapack-2.0.2/SRC/pdpotrs.f000644 000766 000024 00000023745 10363532303 016213 0ustar00juliestaff000000 000000 SUBROUTINE PDPOTRS( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Purpose * ======= * * PDPOTRS solves a system of linear equations * * sub( A ) * X = sub( B ) * A(IA:IA+N-1,JA:JA+N-1)*X = B(IB:IB+N-1,JB:JB+NRHS-1) * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is a N-by-N * symmetric positive definite distributed matrix using the Cholesky * factorization sub( A ) = U**T*U or L*L**T computed by PDPOTRF. * sub( B ) denotes the distributed matrix B(IB:IB+N-1,JB:JB+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) DOUBLE PRECISION pointer into local memory to * an array of dimension (LLD_A, LOCc(JA+N-1)). On entry, this * array contains the factors L or U from the Cholesky facto- * rization sub( A ) = L*L**T or U**T*U, as computed by PDPOTRF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * the local pieces of the right hand sides sub( B ). * On exit, this array contains the local pieces of the solution * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IAROW, IBROW, ICTXT, IROFFA, IROFFB, ICOFFA, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDTRSM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 11, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1100+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, $ 3, IB, JB, DESCB, 11, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPOTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Solve sub( A ) * X = sub( B ) where sub( A ) = U'*U. * * Solve U'*X = sub( B ), overwriting sub( B ) with X. * CALL PDTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve U*X = sub( B ), overwriting sub( B ) with X. * CALL PDTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) ELSE * * Solve sub( A ) *X = sub( B ) where sub( A ) = L*L'. * * Solve L*X = sub( B ), overwriting sub( B ) with X. * CALL PDTRSM( 'Left', 'Lower', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve L'*X = sub( B ), overwriting sub( B ) with X. * CALL PDTRSM( 'Left', 'Lower', 'Transpose', 'Non-unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) END IF * RETURN * * End of PDPOTRS * END scalapack-2.0.2/SRC/pdptsv.f000644 000766 000024 00000045304 10363532303 016033 0ustar00juliestaff000000 000000 SUBROUTINE PDPTSV( N, NRHS, D, E, JA, DESCA, B, IB, DESCB, WORK, $ LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION B( * ), D( * ), E( * ), WORK( * ) * .. * * * Purpose * ======= * * PDPTSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * tridiagonal symmetric positive definite distributed * matrix. * * Cholesky factorization is used to factor a reordering of * the matrix into L L'. * * See PDPTTRF and PDPTTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (12*NPCOL + 3*NB) * +max((10+2*min(100,NRHS))*NPCOL+4*NRHS, 8*NPCOL) * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PDPTTRF, PDPTTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PDPTTRF and PDPTTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSEIF( DESCA( DTYPE_ ) .EQ. 502 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 5*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PDPTSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (12*NPCOL + 3*NB) * * Factor the matrix * CALL PDPTTRF( N, D, E, JA, DESCA, WORK, MIN( LWORK, WS_FACTOR ), $ WORK( 1+WS_FACTOR ), LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PDPTSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PDPTTRS( N, NRHS, D, E, JA, DESCA, B, IB, DESCB, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDPTSV', -INFO ) RETURN END IF * RETURN * * End of PDPTSV * END scalapack-2.0.2/SRC/pdpttrf.f000644 000766 000024 00000104011 10363532303 016165 0ustar00juliestaff000000 000000 SUBROUTINE PDPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. INTEGER INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION AF( * ), D( * ), E( * ), WORK( * ) * .. * * * Purpose * ======= * * PDPTTRF computes a Cholesky factorization * of an N-by-N real tridiagonal * symmetric positive definite distributed matrix * A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PDPTTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = U' D U or * * P A(1:N, JA:JA+N-1) P^T = L D L', * * where U is a tridiagonal upper triangular matrix and L is tridiagonal * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * D (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDPTTRF and this is stored in AF. If a linear system * is to be solved using PDPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 8*NPCOL * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, INT_TEMP, JA_NEW, $ LAF_MIN, LEVEL_DIST, LLDA, MYCOL, MYROW, $ MY_NUM_COLS, NB, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, PART_OFFSET, PART_SIZE, RETURN_CODE, $ STORE_N_A, TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ DGERV2D, DGESD2D, DPTTRF, DPTTRSV, DTRRV2D, $ DTRSD2D, GLOBCHK, IGAMX2D, IGEBR2D, IGEBS2D, $ PXERBLA, RESHAPE * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 5*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -9 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 5*100+6 ) END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 5*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PDPTTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 5*100+4 ) CALL PXERBLA( ICTXT, 'PDPTTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = ( 12*NPCOL+3*NB ) * IF( LAF.LT.LAF_MIN ) THEN INFO = -7 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PDPTTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = 8*NPCOL * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -9 CALL PXERBLA( ICTXT, 'PDPTTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 7, 1 ) = DESCA( 5 ) PARAM_CHECK( 6, 1 ) = DESCA( 4 ) PARAM_CHECK( 5, 1 ) = DESCA( 3 ) PARAM_CHECK( 4, 1 ) = DESCA( 1 ) PARAM_CHECK( 3, 1 ) = JA PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 7, 2 ) = 505 PARAM_CHECK( 6, 2 ) = 504 PARAM_CHECK( 5, 2 ) = 503 PARAM_CHECK( 4, 2 ) = 501 PARAM_CHECK( 3, 2 ) = 4 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 9 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 7, PARAM_CHECK, 7, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDPTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 90 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * IF( MYCOL.LT.NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL DTRSD2D( ICTXT, 'U', 'N', 1, 1, $ E( PART_OFFSET+ODD_SIZE+1 ), LLDA-1, 0, MYCOL+1 ) * END IF * * * Factor main partition A_i = L_i {L_i}^T in each processor * Or A_i = {U_i}^T {U_i} if E is the upper superdiagonal * CALL DPTTRF( ODD_SIZE, D( PART_OFFSET+1 ), E( PART_OFFSET+1 ), $ INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 20 END IF * * IF( MYCOL.LT.NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * * Perform the triangular system solve {L_i}{{B'}_i}^T = {B_i}^T * by dividing B_i by diagonal element * E( PART_OFFSET+ODD_SIZE ) = E( PART_OFFSET+ODD_SIZE ) / $ D( PART_OFFSET+ODD_SIZE ) * * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^T * D( PART_OFFSET+ODD_SIZE+1 ) = D( PART_OFFSET+ODD_SIZE+1 ) - $ D( PART_OFFSET+ODD_SIZE )* $ ( E( PART_OFFSET+ODD_SIZE )* $ ( E( PART_OFFSET+ODD_SIZE ) ) ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 20 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL DTRRV2D( ICTXT, 'U', 'N', 1, 1, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF( INFO.EQ.0 ) THEN * * Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ . * CALL DPTTRSV( 'N', ODD_SIZE, INT_ONE, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), AF( 1 ), ODD_SIZE, INFO ) * * Divide by D * DO 30 I = 1, ODD_SIZE AF( I ) = AF( I ) / D( PART_OFFSET+I ) 30 CONTINUE * * * Calculate the update block for previous proc, E_i = G_i{G_i}^T * * * Since there is no element-by-element vector multiplication in * the BLAS, this loop must be hardwired in without a BLAS call * INT_TEMP = ODD_SIZE*INT_ONE + 2 + 1 AF( INT_TEMP ) = 0 * DO 40 I = 1, ODD_SIZE AF( INT_TEMP ) = AF( INT_TEMP ) - $ D( PART_OFFSET+I )*( AF( I )* $ ( AF( I ) ) ) 40 CONTINUE * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+3 ), $ INT_ONE, 0, MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * {F_i}^T = {H_i}{{B'}_i}^T * AF( ODD_SIZE+1 ) = -D( PART_OFFSET+ODD_SIZE )* $ ( E( PART_OFFSET+ODD_SIZE )* $ AF( ODD_SIZE ) ) * * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 80 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 70 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * END IF * * Copy last diagonal block into AF storage for subsequent * operations. * AF( ODD_SIZE+2 ) = DBLE( D( PART_OFFSET+ODD_SIZE+1 ) ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL+1 ) * * Add contribution to diagonal block * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + AF( ODD_SIZE+3 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 50 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 60 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 50 60 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... IF( AF( ODD_SIZE+2 ).EQ.ZERO ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * AF( ODD_SIZE+3 ) = AF( ODD_SIZE+1 ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * AF( ODD_SIZE+1 ) = AF( ODD_SIZE+1 ) / AF( ODD_SIZE+2 ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * WORK( 1 ) = -ONE*AF( ODD_SIZE+1 )*AF( ODD_SIZE+2 )* $ ( AF( ODD_SIZE+1 ) ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL DGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * AF( ODD_SIZE+3 ) = ( AF( ODD_SIZE+3 ) ) / AF( ODD_SIZE+2 ) * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*AF( ODD_SIZE+2 )* $ ( AF( ODD_SIZE+3 ) ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*AF( ODD_SIZE+2 )* $ AF( ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 70 CONTINUE * * 80 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 90 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PDPTTRF * END scalapack-2.0.2/SRC/pdpttrs.f000644 000766 000024 00000064333 10363532303 016216 0ustar00juliestaff000000 000000 SUBROUTINE PDPTTRS( N, NRHS, D, E, JA, DESCA, B, IB, DESCB, AF, $ LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION AF( * ), B( * ), D( * ), E( * ), WORK( * ) * .. * * * Purpose * ======= * * PDPTTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PDPTTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * tridiagonal symmetric positive definite distributed * matrix. * * Routine PDPTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * IMPORTANT NOTE: The current version of this code supports * only IB=JA * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDPTTRF and this is stored in AF. If a linear system * is to be solved using PDPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (10+2*min(100,NRHS))*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, I, ICTXT, ICTXT_NEW, $ ICTXT_SAVE, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, $ MYROW, MY_NUM_COLS, NB, NP, NPCOL, NPROW, $ NP_SAVE, ODD_SIZE, PART_OFFSET, PART_SIZE, $ RETURN_CODE, STORE_M_B, STORE_N_A, TEMP, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 14, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ DSCAL, GLOBCHK, PDPTTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 5*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 8*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 8*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 8*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -12 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 5*100+6 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 8*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 8*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -2 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -4 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 5*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PDPTTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 5*100+4 ) CALL PXERBLA( ICTXT, 'PDPTTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * WORK_SIZE_MIN = ( 10+2*MIN( 100, NRHS ) )*NPCOL + 4*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -12 CALL PXERBLA( ICTXT, 'PDPTTRS: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 14, 1 ) = DESCB( 5 ) PARAM_CHECK( 13, 1 ) = DESCB( 4 ) PARAM_CHECK( 12, 1 ) = DESCB( 3 ) PARAM_CHECK( 11, 1 ) = DESCB( 2 ) PARAM_CHECK( 10, 1 ) = DESCB( 1 ) PARAM_CHECK( 9, 1 ) = IB PARAM_CHECK( 8, 1 ) = DESCA( 5 ) PARAM_CHECK( 7, 1 ) = DESCA( 4 ) PARAM_CHECK( 6, 1 ) = DESCA( 3 ) PARAM_CHECK( 5, 1 ) = DESCA( 1 ) PARAM_CHECK( 4, 1 ) = JA PARAM_CHECK( 3, 1 ) = NRHS PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 14, 2 ) = 905 PARAM_CHECK( 13, 2 ) = 904 PARAM_CHECK( 12, 2 ) = 903 PARAM_CHECK( 11, 2 ) = 902 PARAM_CHECK( 10, 2 ) = 901 PARAM_CHECK( 9, 2 ) = 8 PARAM_CHECK( 8, 2 ) = 505 PARAM_CHECK( 7, 2 ) = 504 PARAM_CHECK( 6, 2 ) = 503 PARAM_CHECK( 5, 2 ) = 501 PARAM_CHECK( 4, 2 ) = 4 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 12 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 14, PARAM_CHECK, 14, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDPTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 30 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * * CALL PDPTTRSV( 'L', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * * Divide by the main diagonal: B <- D^{-1} B * * The main partition is first * DO 10 I = PART_OFFSET + 1, PART_OFFSET + ODD_SIZE CALL DSCAL( NRHS, DBLE( ONE / D( I ) ), B( I ), LLDB ) 10 CONTINUE * * Reduced system is next * IF( MYCOL.LT.NPCOL-1 ) THEN I = PART_OFFSET + ODD_SIZE + 1 CALL DSCAL( NRHS, ONE / AF( ODD_SIZE+2 ), B( I ), LLDB ) END IF * * Call backsolve routine * * CALL PDPTTRSV( 'U', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) 20 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 30 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PDPTTRS * END scalapack-2.0.2/SRC/pdpttrsv.f000644 000766 000024 00000110264 10363532303 016377 0ustar00juliestaff000000 000000 SUBROUTINE PDPTTRSV( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, $ AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION AF( * ), B( * ), D( * ), E( * ), WORK( * ) * .. * * * Purpose * ======= * * PDPTTRSV solves a tridiagonal triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a tridiagonal * triangular matrix factor produced by the * Cholesky factorization code PDPTTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T * is dictated by the user by the parameter TRANS. * * Routine PDPTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) DOUBLE PRECISION pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) DOUBLE PRECISION pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) DOUBLE PRECISION array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PDPTTRF and this is stored in AF. If a linear system * is to be solved using PDPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * DOUBLE PRECISION temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (10+2*min(100,NRHS))*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, JA_NEW, LEVEL_DIST, LLDA, LLDB, $ MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_M_B, STORE_N_A, $ TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 15, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DAXPY, $ DESC_CONVERT, DGEMM, DGERV2D, DGESD2D, DMATADD, $ DPTTRSV, DTRTRS, GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 7*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 10*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 10*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 10*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 10*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -14 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 7*100+6 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 10*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 10*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -3 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -6 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 7*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PDPTTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, 'PDPTTRSV, D&C alg.: NB too small', $ -INFO ) RETURN END IF * * WORK_SIZE_MIN = INT_ONE*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, 'PDPTTRSV: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 15, 1 ) = DESCB( 5 ) PARAM_CHECK( 14, 1 ) = DESCB( 4 ) PARAM_CHECK( 13, 1 ) = DESCB( 3 ) PARAM_CHECK( 12, 1 ) = DESCB( 2 ) PARAM_CHECK( 11, 1 ) = DESCB( 1 ) PARAM_CHECK( 10, 1 ) = IB PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = NRHS PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 15, 2 ) = 1005 PARAM_CHECK( 14, 2 ) = 1004 PARAM_CHECK( 13, 2 ) = 1003 PARAM_CHECK( 12, 2 ) = 1002 PARAM_CHECK( 11, 2 ) = 1001 PARAM_CHECK( 10, 2 ) = 9 PARAM_CHECK( 9, 2 ) = 705 PARAM_CHECK( 8, 2 ) = 704 PARAM_CHECK( 7, 2 ) = 703 PARAM_CHECK( 6, 2 ) = 701 PARAM_CHECK( 5, 2 ) = 6 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 15, PARAM_CHECK, 15, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PDPTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 100 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL DPTTRSV( 'N', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL DAXPY( NRHS, -E( PART_OFFSET+ODD_SIZE ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL DGEMM( 'T', 'N', 1, NRHS, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, ZERO, $ WORK( 1+INT_ONE-1 ), INT_ONE ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL DMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 30 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 10 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 20 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * CALL DMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * CALL DMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 10 20 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL DTRTRS( 'L', 'N', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 90 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ( ODD_SIZE )*1+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL DGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ODD_SIZE*1+2+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL DGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 30 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 80 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 40 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 50 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 40 50 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL DGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ODD_SIZE*1+2+1 ), INT_ONE, WORK( 1 ), $ INT_ONE, ONE, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL DGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ( ODD_SIZE )*1+1 ), INT_ONE, WORK( 1 ), $ INT_ONE, ONE, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL DTRTRS( 'L', 'T', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 90 END IF * * * ***Modification Loop ******* * 60 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 70 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 60 70 CONTINUE * [End of GOTO Loop] * 80 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL DGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL DGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL DGEMM( 'N', 'N', ODD_SIZE, NRHS, 1, -ONE, AF( 1 ), $ ODD_SIZE, WORK( 1+INT_ONE-1 ), INT_ONE, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL DAXPY( NRHS, -( E( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL DPTTRSV( 'T', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * END IF * End of "IF( LSAME( UPLO, 'L' ) )"... 90 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 100 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PDPTTRSV * END scalapack-2.0.2/SRC/pdrot.f000644 000766 000024 00000041504 11705175572 015655 0ustar00juliestaff000000 000000 SUBROUTINE PDROT( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, $ INCY, CS, SN, WORK, LWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER N, IX, JX, INCX, IY, JY, INCY, LWORK, INFO DOUBLE PRECISION CS, SN * .. * .. Array Arguments .. INTEGER DESCX( * ), DESCY( * ) DOUBLE PRECISION X( * ), Y( * ), WORK( * ) * .. * * Purpose * ======= * PDROT applies a planar rotation defined by CS and SN to the * two distributed vectors sub(X) and sub(Y). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of elements to operate on when applying the planar * rotation to X and Y. N>=0. * * X (local input/local output) DOUBLE PRECSION array of dimension * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. If INCX = 1, then it is required * that IX = IY. 1 <= IX <= M_X. * * JX (global input) INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. If INCX = M_X, then it is required * that JX = JY. 1 <= IX <= N_X. * * DESCX (global and local input) INTEGER array of dimension 9 * The array descriptor of the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * Moreover, it must hold that INCX = M_X if INCY = M_Y and * that INCX = 1 if INCY = 1. * * Y (local input/local output) DOUBLE PRECSION array of dimension * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * * IY (global input) INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. If INCY = 1, then it is required * that IY = IX. 1 <= IY <= M_Y. * * JY (global input) INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. If INCY = M_X, then it is required * that JY = JX. 1 <= JY <= N_Y. * * DESCY (global and local input) INTEGER array of dimension 9 * The array descriptor of the distributed matrix Y. * * INCY (global input) INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * Moreover, it must hold that INCY = M_Y if INCX = M_X and * that INCY = 1 if INCX = 1. * * CS (global input) DOUBLE PRECISION * SN (global input) DOUBLE PRECISION * The parameters defining the properties of the planar * rotation. It must hold that 0 <= CS,SN <= 1 and that * SN**2 + CS**2 = 1. The latter is hardly checked in * finite precision arithmetics. * * WORK (local input) DOUBLE PRECISION array of dimension LWORK * Local workspace area. * * LWORK (local input) INTEGER * The length of the workspace array WORK. * If INCX = 1 and INCY = 1, then LWORK = 2*MB_X * * If LWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the WORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value. * If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then INFO = -i. * * Additional requirements * ======================= * * The following alignment requirements must hold: * (a) DESCX( MB_ ) = DESCY( MB_ ) and DESCX( NB_ ) = DESCY( NB_ ) * (b) DESCX( RSRC_ ) = DESCY( RSRC_ ) * (c) DESCX( CSRC_ ) = DESCY( CSRC_ ) * * ===================================================================== * * Written by Robert Granat, May 15, 2007. * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY, LEFT, RIGHT INTEGER ICTXT, NPROW, NPCOL, MYROW, MYCOL, NPROCS, $ MB, NB, XYROWS, XYCOLS, RSRC1, RSRC2, CSRC1, $ CSRC2, ICOFFXY, IROFFXY, MNWRK, LLDX, LLDY, $ INDX, JXX, XLOC1, XLOC2, RSRC, CSRC, YLOC1, $ YLOC2, JYY, IXX, IYY * .. * .. External Functions .. INTEGER NUMROC, INDXG2P, INDXG2L EXTERNAL NUMROC, INDXG2P, INDXG2L * .. * .. External Subroutines .. EXTERNAL DROT * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Local Functions .. INTEGER ICEIL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL * * Test and decode parameters * LQUERY = LWORK.EQ.-1 INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSEIF( IX.LT.1 .OR. IX.GT.DESCX(M_) ) THEN INFO = -3 ELSEIF( JX.LT.1 .OR. JX.GT.DESCX(N_) ) THEN INFO = -4 ELSEIF( INCX.NE.1 .AND. INCX.NE.DESCX(M_) ) THEN INFO = -6 ELSEIF( IY.LT.1 .OR. IY.GT.DESCY(M_) ) THEN INFO = -8 ELSEIF( JY.LT.1 .OR. JY.GT.DESCY(N_) ) THEN INFO = -9 ELSEIF( INCY.NE.1 .AND. INCY.NE.DESCY(M_) ) THEN INFO = -11 ELSEIF( (INCX.EQ.DESCX(M_) .AND. INCY.NE.DESCY(M_)) .OR. $ (INCX.EQ.1 .AND. INCY.NE.1 ) ) THEN INFO = -11 ELSEIF( (INCX.EQ.1 .AND. INCY.EQ.1) .AND. $ IX.NE.IY ) THEN INFO = -8 ELSEIF( (INCX.EQ.DESCX(M_) .AND. INCY.EQ.DESCY(M_)) .AND. $ JX.NE.JY ) THEN INFO = -9 END IF * * Compute the direction of the planar rotation * LEFT = INCX.EQ.DESCX(M_) .AND. INCY.EQ.DESCY(M_) RIGHT = INCX.EQ.1 .AND. INCY.EQ.1 * * Check blocking factors and root processor * IF( INFO.EQ.0 ) THEN IF( LEFT .AND. DESCX(NB_).NE.DESCY(NB_) ) THEN INFO = -(100*5 + NB_) END IF IF( RIGHT .AND. DESCX(MB_).NE.DESCY(NB_) ) THEN INFO = -(100*10 + MB_) END IF END IF IF( INFO.EQ.0 ) THEN IF( LEFT .AND. DESCX(CSRC_).NE.DESCY(CSRC_) ) THEN INFO = -(100*5 + CSRC_) END IF IF( RIGHT .AND. DESCX(RSRC_).NE.DESCY(RSRC_) ) THEN INFO = -(100*10 + RSRC_) END IF END IF * * Compute workspace * MB = DESCX( MB_ ) NB = DESCX( NB_ ) IF( LEFT ) THEN RSRC1 = INDXG2P( IX, MB, MYROW, DESCX(RSRC_), NPROW ) RSRC2 = INDXG2P( IY, MB, MYROW, DESCY(RSRC_), NPROW ) CSRC = INDXG2P( JX, NB, MYCOL, DESCX(CSRC_), NPCOL ) ICOFFXY = MOD( JX - 1, NB ) XYCOLS = NUMROC( N+ICOFFXY, NB, MYCOL, CSRC, NPCOL ) IF( ( MYROW.EQ.RSRC1 .OR. MYROW.EQ.RSRC2 ) .AND. $ MYCOL.EQ.CSRC ) XYCOLS = XYCOLS - ICOFFXY IF( RSRC1.NE.RSRC2 ) THEN MNWRK = XYCOLS ELSE MNWRK = 0 END IF ELSEIF( RIGHT ) THEN CSRC1 = INDXG2P( JX, NB, MYCOL, DESCX(CSRC_), NPCOL ) CSRC2 = INDXG2P( JY, NB, MYCOL, DESCY(CSRC_), NPCOL ) RSRC = INDXG2P( IX, MB, MYROW, DESCX(RSRC_), NPROW ) IROFFXY = MOD( IX - 1, MB ) XYROWS = NUMROC( N+IROFFXY, MB, MYROW, RSRC, NPROW ) IF( ( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC2 ) .AND. $ MYROW.EQ.RSRC ) XYROWS = XYROWS - IROFFXY IF( CSRC1.NE.CSRC2 ) THEN MNWRK = XYROWS ELSE MNWRK = 0 END IF END IF IF( INFO.EQ.0 ) THEN IF( .NOT.LQUERY . AND. LWORK.LT.MNWRK ) INFO = -15 END IF * * Return if some argument is incorrect * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDROT', -INFO ) RETURN ELSEIF( LQUERY ) THEN WORK( 1 ) = DBLE(MNWRK) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN RETURN END IF * * Extract local leading dimensions * LLDX = DESCX( LLD_ ) LLDY = DESCY( LLD_ ) * * If we have only one process, use the corresponding LAPACK * routine and return * IF( NPROCS.EQ.1 ) THEN IF( LEFT ) THEN CALL DROT( N, X((JX-1)*LLDX+IX), LLDX, Y((JY-1)*LLDY+IY), $ LLDY, CS, SN ) ELSEIF( RIGHT ) THEN CALL DROT( N, X((JX-1)*LLDX+IX), 1, Y((JY-1)*LLDY+IY), $ 1, CS, SN ) END IF RETURN END IF * * Exchange data between processors if necessary and perform planar * rotation * IF( LEFT ) THEN DO 10 INDX = 1, NPCOL IF( MYROW.EQ.RSRC1 .AND. XYCOLS.GT.0 ) THEN IF( INDX.EQ.1 ) THEN JXX = JX ELSE JXX = JX-ICOFFXY + (INDX-1)*NB END IF CALL INFOG2L( IX, JXX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, XLOC1, XLOC2, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN IF( RSRC1.NE.RSRC2 ) THEN CALL DGESD2D( ICTXT, 1, XYCOLS, $ X((XLOC2-1)*LLDX+XLOC1), LLDX, $ RSRC2, CSRC ) CALL DGERV2D( ICTXT, 1, XYCOLS, WORK, 1, $ RSRC2, CSRC ) CALL DROT( XYCOLS, X((XLOC2-1)*LLDX+XLOC1), $ LLDX, WORK, 1, CS, SN ) ELSE CALL INFOG2L( IY, JXX, DESCY, NPROW, NPCOL, $ MYROW, MYCOL, YLOC1, YLOC2, RSRC, $ CSRC ) CALL DROT( XYCOLS, X((XLOC2-1)*LLDX+XLOC1), $ LLDX, Y((YLOC2-1)*LLDY+YLOC1), LLDY, CS, $ SN ) END IF END IF END IF IF( MYROW.EQ.RSRC2 .AND. RSRC1.NE.RSRC2 ) THEN IF( INDX.EQ.1 ) THEN JYY = JY ELSE JYY = JY-ICOFFXY + (INDX-1)*NB END IF CALL INFOG2L( IY, JYY, DESCY, NPROW, NPCOL, MYROW, $ MYCOL, YLOC1, YLOC2, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN CALL DGESD2D( ICTXT, 1, XYCOLS, $ Y((YLOC2-1)*LLDY+YLOC1), LLDY, $ RSRC1, CSRC ) CALL DGERV2D( ICTXT, 1, XYCOLS, WORK, 1, $ RSRC1, CSRC ) CALL DROT( XYCOLS, WORK, 1, Y((YLOC2-1)*LLDY+YLOC1), $ LLDY, CS, SN ) END IF END IF 10 CONTINUE ELSEIF( RIGHT ) THEN DO 20 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 .AND. XYROWS.GT.0 ) THEN IF( INDX.EQ.1 ) THEN IXX = IX ELSE IXX = IX-IROFFXY + (INDX-1)*MB END IF CALL INFOG2L( IXX, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, XLOC1, XLOC2, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN IF( CSRC1.NE.CSRC2 ) THEN CALL DGESD2D( ICTXT, XYROWS, 1, $ X((XLOC2-1)*LLDX+XLOC1), LLDX, $ RSRC, CSRC2 ) CALL DGERV2D( ICTXT, XYROWS, 1, WORK, XYROWS, $ RSRC, CSRC2 ) CALL DROT( XYROWS, X((XLOC2-1)*LLDX+XLOC1), $ 1, WORK, 1, CS, SN ) ELSE CALL INFOG2L( IXX, JY, DESCY, NPROW, NPCOL, $ MYROW, MYCOL, YLOC1, YLOC2, RSRC, $ CSRC ) CALL DROT( XYROWS, X((XLOC2-1)*LLDX+XLOC1), $ 1, Y((YLOC2-1)*LLDY+YLOC1), 1, CS, $ SN ) END IF END IF END IF IF( MYCOL.EQ.CSRC2 .AND. CSRC1.NE.CSRC2 ) THEN IF( INDX.EQ.1 ) THEN IYY = IY ELSE IYY = IY-IROFFXY + (INDX-1)*MB END IF CALL INFOG2L( IYY, JY, DESCY, NPROW, NPCOL, MYROW, $ MYCOL, YLOC1, YLOC2, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN CALL DGESD2D( ICTXT, XYROWS, 1, $ Y((YLOC2-1)*LLDY+YLOC1), LLDY, $ RSRC, CSRC1 ) CALL DGERV2D( ICTXT, XYROWS, 1, WORK, XYROWS, $ RSRC, CSRC1 ) CALL DROT( XYROWS, WORK, 1, Y((YLOC2-1)*LLDY+YLOC1), $ 1, CS, SN ) END IF END IF 20 CONTINUE END IF * * Store minimum workspace requirements in WORK-array and return * WORK( 1 ) = DBLE(MNWRK) RETURN * * End of PDROT * END scalapack-2.0.2/SRC/pdrscl.f000644 000766 000024 00000016306 10363532303 016002 0ustar00juliestaff000000 000000 SUBROUTINE PDRSCL( N, SA, SX, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N DOUBLE PRECISION SA * .. * .. Array Arguments .. INTEGER DESCX( * ) DOUBLE PRECISION SX( * ) * .. * * Purpose * ======= * * PDRSCL multiplies an N-element real distributed vector sub( X ) by * the real scalar 1/a. This is done without overflow or underflow as * long as the final result sub( X )/a does not overflow or underflow. * * where sub( X ) denotes X(IX:IX+N-1,JX:JX), if INCX = 1, * X(IX:IX,JX:JX+N-1), if INCX = M_X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) pointer to INTEGER * The number of components of the distributed vector sub( X ). * N >= 0. * * SA (global input) DOUBLE PRECISION * The scalar a which is used to divide each component of * sub( X ). SA must be >= 0, or the subroutine will divide by * zero. * * SX (local input/local output) DOUBLE PRECISION array * containing the local pieces of a distributed matrix of * dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL DONE INTEGER ICTXT, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION BIGNUM, CDEN, CDEN1, CNUM, CNUM1, MUL, SMLNUM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PDLABAD, PDSCAL * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Get machine parameters * SMLNUM = PDLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM CALL PDLABAD( ICTXT, SMLNUM, BIGNUM ) * * Initialize the denominator to SA and the numerator to 1. * CDEN = SA CNUM = ONE * 10 CONTINUE CDEN1 = CDEN*SMLNUM CNUM1 = CNUM / BIGNUM IF( ABS( CDEN1 ).GT.ABS( CNUM ) .AND. CNUM.NE.ZERO ) THEN * * Pre-multiply sub( X ) by SMLNUM if CDEN is large compared to * CNUM. * MUL = SMLNUM DONE = .FALSE. CDEN = CDEN1 ELSE IF( ABS( CNUM1 ).GT.ABS( CDEN ) ) THEN * * Pre-multiply sub( X ) by BIGNUM if CDEN is small compared to * CNUM. * MUL = BIGNUM DONE = .FALSE. CNUM = CNUM1 ELSE * * Multiply sub( X ) by CNUM / CDEN and return. * MUL = CNUM / CDEN DONE = .TRUE. END IF * * Scale the vector sub( X ) by MUL * CALL PDSCAL( N, MUL, SX, IX, JX, DESCX, INCX ) * IF( .NOT.DONE ) $ GO TO 10 * RETURN * * End of PDRSCL * END scalapack-2.0.2/SRC/pdstebz.f000644 000766 000024 00000146002 10363532303 016163 0ustar00juliestaff000000 000000 SUBROUTINE PDSTEBZ( ICTXT, RANGE, ORDER, N, VL, VU, IL, IU, $ ABSTOL, D, E, M, NSPLIT, W, IBLOCK, ISPLIT, $ WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER ORDER, RANGE INTEGER ICTXT, IL, INFO, IU, LIWORK, LWORK, M, N, $ NSPLIT DOUBLE PRECISION ABSTOL, VL, VU * .. * .. Array Arguments .. INTEGER IBLOCK( * ), ISPLIT( * ), IWORK( * ) DOUBLE PRECISION D( * ), E( * ), W( * ), WORK( * ) * .. * * Purpose * ======= * * PDSTEBZ computes the eigenvalues of a symmetric tridiagonal matrix in * parallel. The user may ask for all eigenvalues, all eigenvalues in * the interval [VL, VU], or the eigenvalues indexed IL through IU. A * static partitioning of work is done at the beginning of PDSTEBZ which * results in all processes finding an (almost) equal number of * eigenvalues. * * NOTE : It is assumed that the user is on an IEEE machine. If the user * is not on an IEEE mchine, set the compile time flag NO_IEEE * to 1 (in SLmake.inc). The features of IEEE arithmetic that * are needed for the "fast" Sturm Count are : (a) infinity * arithmetic (b) the sign bit of a single precision floating * point number is assumed be in the 32nd bit position * (c) the sign of negative zero. * * See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal * Matrix", Report CS41, Computer Science Dept., Stanford * University, July 21, 1966. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle. * * RANGE (global input) CHARACTER * Specifies which eigenvalues are to be found. * = 'A': ("All") all eigenvalues will be found. * = 'V': ("Value") all eigenvalues in the interval * [VL, VU] will be found. * = 'I': ("Index") the IL-th through IU-th eigenvalues (of the * entire matrix) will be found. * * ORDER (global input) CHARACTER * Specifies the order in which the eigenvalues and their block * numbers are stored in W and IBLOCK. * = 'B': ("By Block") the eigenvalues will be grouped by * split-off block (see IBLOCK, ISPLIT) and * ordered from smallest to largest within * the block. * = 'E': ("Entire matrix") * the eigenvalues for the entire matrix * will be ordered from smallest to largest. * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Eigenvalues less than VL will not be * returned. Not referenced if RANGE='A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Eigenvalues greater than VU will not be * returned. VU must be greater than VL. Not referenced if * RANGE='A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL must be at least 1. * Not referenced if RANGE='A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. IU must be at least IL * and no greater than N. Not referenced if RANGE='A' or 'V'. * * ABSTOL (global input) DOUBLE PRECISION * The absolute tolerance for the eigenvalues. An eigenvalue * (or cluster) is considered to be located if it has been * determined to lie in an interval whose width is ABSTOL or * less. If ABSTOL is less than or equal to zero, then ULP*|T| * will be used, where |T| means the 1-norm of T. * Eigenvalues will be computed most accurately when ABSTOL is * set to the underflow threshold DLAMCH('U'), not zero. * Note : If eigenvectors are desired later by inverse iteration * ( PDSTEIN ), ABSTOL should be set to 2*PDLAMCH('S'). * * D (global input) DOUBLE PRECISION array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. To * avoid overflow, the matrix must be scaled so that its largest * entry is no greater than overflow**(1/2) * underflow**(1/4) * in absolute value, and for greatest accuracy, it should not * be much smaller than that. * * E (global input) DOUBLE PRECISION array, dimension (N-1) * The (n-1) off-diagonal elements of the tridiagonal matrix T. * To avoid overflow, the matrix must be scaled so that its * largest entry is no greater than overflow**(1/2) * * underflow**(1/4) in absolute value, and for greatest * accuracy, it should not be much smaller than that. * * M (global output) INTEGER * The actual number of eigenvalues found. 0 <= M <= N. * (See also the description of INFO=2) * * NSPLIT (global output) INTEGER * The number of diagonal blocks in the matrix T. * 1 <= NSPLIT <= N. * * W (global output) DOUBLE PRECISION array, dimension (N) * On exit, the first M elements of W contain the eigenvalues * on all processes. * * IBLOCK (global output) INTEGER array, dimension (N) * At each row/column j where E(j) is zero or small, the * matrix T is considered to split into a block diagonal * matrix. On exit IBLOCK(i) specifies which block (from 1 * to the number of blocks) the eigenvalue W(i) belongs to. * NOTE: in the (theoretically impossible) event that bisection * does not converge for some or all eigenvalues, INFO is set * to 1 and the ones for which it did not are identified by a * negative block number. * * ISPLIT (global output) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N. * (Only the first NSPLIT elements will actually be used, but * since the user cannot know a priori what value NSPLIT will * have, N words must be reserved for ISPLIT.) * * WORK (local workspace) DOUBLE PRECISION array, * dimension ( MAX( 5*N, 7 ) ) * * LWORK (local input) INTEGER * size of array WORK must be >= MAX( 5*N, 7 ) * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace) INTEGER array, dimension ( MAX( 4*N, 14 ) ) * * LIWORK (local input) INTEGER * size of array IWORK must be >= MAX( 4*N, 14, NPROCS ) * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0 : successful exit * < 0 : if INFO = -i, the i-th argument had an illegal value * > 0 : some or all of the eigenvalues failed to converge or * were not computed: * = 1 : Bisection failed to converge for some eigenvalues; * these eigenvalues are flagged by a negative block * number. The effect is that the eigenvalues may not * be as accurate as the absolute and relative * tolerances. This is generally caused by arithmetic * which is less accurate than PDLAMCH says. * = 2 : There is a mismatch between the number of * eigenvalues output and the number desired. * = 3 : RANGE='i', and the Gershgorin interval initially * used was incorrect. No eigenvalues were computed. * Probable cause: your machine has sloppy floating * point arithmetic. * Cure: Increase the PARAMETER "FUDGE", recompile, * and try again. * * Internal Parameters * =================== * * RELFAC DOUBLE PRECISION, default = 2.0 * The relative tolerance. An interval [a,b] lies within * "relative tolerance" if b-a < RELFAC*ulp*max(|a|,|b|), * where "ulp" is the machine precision (distance from 1 to * the next larger floating point number.) * * FUDGE DOUBLE PRECISION, default = 2.0 * A "fudge factor" to widen the Gershgorin intervals. Ideally, * a value of 1 should work, but on machines with sloppy * arithmetic, this needs to be larger. The default for * publicly released versions should be large enough to handle * the worst machine around. Note that this has no effect * on the accuracy of the solution. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, MAX, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER BLACS_PNUM DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, BLACS_PNUM, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_FREEBUFF, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDMAP, DGEBR2D, $ DGEBS2D, DGERV2D, DGESD2D, DLASRT2, GLOBCHK, $ IGEBR2D, IGEBS2D, IGERV2D, IGESD2D, IGSUM2D, $ PDLAEBZ, PDLAIECTB, PDLAIECTL, PDLAPDCT, $ PDLASNBT, PXERBLA * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER BIGNUM, DESCMULT PARAMETER ( BIGNUM = 10000, DESCMULT = 100 ) DOUBLE PRECISION ZERO, ONE, TWO, FIVE, HALF PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0, $ FIVE = 5.0D+0, HALF = 1.0D+0 / TWO ) DOUBLE PRECISION FUDGE, RELFAC PARAMETER ( FUDGE = 2.0D+0, RELFAC = 2.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER BLKNO, FOUND, I, IBEGIN, IEFLAG, IEND, IFRST, $ IINFO, ILAST, ILOAD, IM, IMYLOAD, IN, INDRIW1, $ INDRIW2, INDRW1, INDRW2, INXTLOAD, IOFF, $ IORDER, IOUT, IRANGE, IRECV, IREM, ITMP1, $ ITMP2, J, JB, K, LAST, LEXTRA, LREQ, MYCOL, $ MYROW, NALPHA, NBETA, NCMP, NEIGINT, NEXT, NGL, $ NGLOB, NGU, NINT, NPCOL, NPROW, OFFSET, $ ONEDCONTEXT, P, PREV, REXTRA, RREQ, SELF, $ TORECV DOUBLE PRECISION ALPHA, ATOLI, BETA, BNORM, DRECV, DSEND, GL, $ GU, INITVL, INITVU, LSAVE, MID, PIVMIN, RELTOL, $ SAFEMN, TMP1, TMP2, TNORM, ULP * .. * .. Local Arrays .. INTEGER IDUM( 5, 2 ) * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Set up process grid * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 M = 0 * * Decode RANGE * IF( LSAME( RANGE, 'A' ) ) THEN IRANGE = 1 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IRANGE = 2 ELSE IF( LSAME( RANGE, 'I' ) ) THEN IRANGE = 3 ELSE IRANGE = 0 END IF * * Decode ORDER * IF( LSAME( ORDER, 'B' ) ) THEN IORDER = 2 ELSE IF( LSAME( ORDER, 'E' ) .OR. LSAME( ORDER, 'A' ) ) THEN IORDER = 1 ELSE IORDER = 0 END IF * * Check for Errors * IF( NPROW.EQ.-1 ) THEN INFO = -1 ELSE * * Get machine constants * SAFEMN = PDLAMCH( ICTXT, 'S' ) ULP = PDLAMCH( ICTXT, 'P' ) RELTOL = ULP*RELFAC IDUM( 1, 1 ) = ICHAR( RANGE ) IDUM( 1, 2 ) = 2 IDUM( 2, 1 ) = ICHAR( ORDER ) IDUM( 2, 2 ) = 3 IDUM( 3, 1 ) = N IDUM( 3, 2 ) = 4 NGLOB = 5 IF( IRANGE.EQ.3 ) THEN IDUM( 4, 1 ) = IL IDUM( 4, 2 ) = 7 IDUM( 5, 1 ) = IU IDUM( 5, 2 ) = 8 ELSE IDUM( 4, 1 ) = 0 IDUM( 4, 2 ) = 0 IDUM( 5, 1 ) = 0 IDUM( 5, 2 ) = 0 END IF IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WORK( 1 ) = ABSTOL IF( IRANGE.EQ.2 ) THEN WORK( 2 ) = VL WORK( 3 ) = VU ELSE WORK( 2 ) = ZERO WORK( 3 ) = ZERO END IF CALL DGEBS2D( ICTXT, 'ALL', ' ', 3, 1, WORK, 3 ) ELSE CALL DGEBR2D( ICTXT, 'ALL', ' ', 3, 1, WORK, 3, 0, 0 ) END IF LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IRANGE.EQ.0 ) THEN INFO = -2 ELSE IF( IORDER.EQ.0 ) THEN INFO = -3 ELSE IF( IRANGE.EQ.2 .AND. VL.GE.VU ) THEN INFO = -5 ELSE IF( IRANGE.EQ.3 .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, $ N ) ) ) THEN INFO = -6 ELSE IF( IRANGE.EQ.3 .AND. ( IU.LT.MIN( N, $ IL ) .OR. IU.GT.N ) ) THEN INFO = -7 ELSE IF( LWORK.LT.MAX( 5*N, 7 ) .AND. .NOT.LQUERY ) THEN INFO = -18 ELSE IF( LIWORK.LT.MAX( 4*N, 14, NPROW*NPCOL ) .AND. .NOT. $ LQUERY ) THEN INFO = -20 ELSE IF( IRANGE.EQ.2 .AND. ( ABS( WORK( 2 )-VL ).GT.FIVE* $ ULP*ABS( VL ) ) ) THEN INFO = -5 ELSE IF( IRANGE.EQ.2 .AND. ( ABS( WORK( 3 )-VU ).GT.FIVE* $ ULP*ABS( VU ) ) ) THEN INFO = -6 ELSE IF( ABS( WORK( 1 )-ABSTOL ).GT.FIVE*ULP*ABS( ABSTOL ) ) $ THEN INFO = -9 END IF END IF IF( INFO.EQ.0 ) $ INFO = BIGNUM CALL GLOBCHK( ICTXT, NGLOB, IDUM, 5, IWORK, INFO ) IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF END IF WORK( 1 ) = DBLE( MAX( 5*N, 7 ) ) IWORK( 1 ) = MAX( 4*N, 14, NPROW*NPCOL ) * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSTEBZ', -INFO ) RETURN ELSE IF( LWORK.EQ.-1 .AND. LIWORK.EQ.-1 ) THEN RETURN END IF * * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * K = 1 DO 20 I = 0, NPROW - 1 DO 10 J = 0, NPCOL - 1 IWORK( K ) = BLACS_PNUM( ICTXT, I, J ) K = K + 1 10 CONTINUE 20 CONTINUE * P = NPROW*NPCOL NPROW = 1 NPCOL = P * CALL BLACS_GET( ICTXT, 10, ONEDCONTEXT ) CALL BLACS_GRIDMAP( ONEDCONTEXT, IWORK, NPROW, NPROW, NPCOL ) CALL BLACS_GRIDINFO( ONEDCONTEXT, I, J, K, SELF ) * * Simplifications: * IF( IRANGE.EQ.3 .AND. IL.EQ.1 .AND. IU.EQ.N ) $ IRANGE = 1 * NEXT = MOD( SELF+1, P ) PREV = MOD( P+SELF-1, P ) * * Compute squares of off-diagonals, splitting points and pivmin. * Interleave diagonals and off-diagonals. * INDRW1 = MAX( 2*N, 4 ) INDRW2 = INDRW1 + 2*N INDRIW1 = MAX( 2*N, 8 ) NSPLIT = 1 WORK( INDRW1+2*N ) = ZERO PIVMIN = ONE * DO 30 I = 1, N - 1 TMP1 = E( I )**2 J = 2*I WORK( INDRW1+J-1 ) = D( I ) IF( ABS( D( I+1 )*D( I ) )*ULP**2+SAFEMN.GT.TMP1 ) THEN ISPLIT( NSPLIT ) = I NSPLIT = NSPLIT + 1 WORK( INDRW1+J ) = ZERO ELSE WORK( INDRW1+J ) = TMP1 PIVMIN = MAX( PIVMIN, TMP1 ) END IF 30 CONTINUE WORK( INDRW1+2*N-1 ) = D( N ) ISPLIT( NSPLIT ) = N PIVMIN = PIVMIN*SAFEMN * * Compute Gershgorin interval [gl,gu] for entire matrix * GU = D( 1 ) GL = D( 1 ) TMP1 = ZERO * DO 40 I = 1, N - 1 TMP2 = ABS( E( I ) ) GU = MAX( GU, D( I )+TMP1+TMP2 ) GL = MIN( GL, D( I )-TMP1-TMP2 ) TMP1 = TMP2 40 CONTINUE GU = MAX( GU, D( N )+TMP1 ) GL = MIN( GL, D( N )-TMP1 ) TNORM = MAX( ABS( GL ), ABS( GU ) ) GL = GL - FUDGE*TNORM*ULP*N - FUDGE*TWO*PIVMIN GU = GU + FUDGE*TNORM*ULP*N + FUDGE*PIVMIN * IF( ABSTOL.LE.ZERO ) THEN ATOLI = ULP*TNORM ELSE ATOLI = ABSTOL END IF * * Find out if on an IEEE machine, the sign bit is the * 32nd bit (Big Endian) or the 64th bit (Little Endian) * IF( IRANGE.EQ.1 .OR. NSPLIT.EQ.1 ) THEN CALL PDLASNBT( IEFLAG ) ELSE IEFLAG = 0 END IF LEXTRA = 0 REXTRA = 0 * * Form Initial Interval containing desired eigenvalues * IF( IRANGE.EQ.1 ) THEN INITVL = GL INITVU = GU WORK( 1 ) = GL WORK( 2 ) = GU IWORK( 1 ) = 0 IWORK( 2 ) = N IFRST = 1 ILAST = N ELSE IF( IRANGE.EQ.2 ) THEN IF( VL.GT.GL ) THEN IF( IEFLAG.EQ.0 ) THEN CALL PDLAPDCT( VL, N, WORK( INDRW1+1 ), PIVMIN, IFRST ) ELSE IF( IEFLAG.EQ.1 ) THEN CALL PDLAIECTB( VL, N, WORK( INDRW1+1 ), IFRST ) ELSE CALL PDLAIECTL( VL, N, WORK( INDRW1+1 ), IFRST ) END IF IFRST = IFRST + 1 INITVL = VL ELSE INITVL = GL IFRST = 1 END IF IF( VU.LT.GU ) THEN IF( IEFLAG.EQ.0 ) THEN CALL PDLAPDCT( VU, N, WORK( INDRW1+1 ), PIVMIN, ILAST ) ELSE IF( IEFLAG.EQ.1 ) THEN CALL PDLAIECTB( VU, N, WORK( INDRW1+1 ), ILAST ) ELSE CALL PDLAIECTL( VU, N, WORK( INDRW1+1 ), ILAST ) END IF INITVU = VU ELSE INITVU = GU ILAST = N END IF WORK( 1 ) = INITVL WORK( 2 ) = INITVU IWORK( 1 ) = IFRST - 1 IWORK( 2 ) = ILAST ELSE IF( IRANGE.EQ.3 ) THEN WORK( 1 ) = GL WORK( 2 ) = GU IWORK( 1 ) = 0 IWORK( 2 ) = N IWORK( 5 ) = IL - 1 IWORK( 6 ) = IU CALL PDLAEBZ( 0, N, 2, 1, ATOLI, RELTOL, PIVMIN, $ WORK( INDRW1+1 ), IWORK( 5 ), WORK, IWORK, NINT, $ LSAVE, IEFLAG, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 3 GO TO 230 END IF IF( NINT.GT.1 ) THEN IF( IWORK( 5 ).EQ.IL-1 ) THEN WORK( 2 ) = WORK( 4 ) IWORK( 2 ) = IWORK( 4 ) ELSE WORK( 1 ) = WORK( 3 ) IWORK( 1 ) = IWORK( 3 ) END IF IF( IWORK( 1 ).LT.0 .OR. IWORK( 1 ).GT.IL-1 .OR. $ IWORK( 2 ).LE.MIN( IU-1, IWORK( 1 ) ) .OR. $ IWORK( 2 ).GT.N ) THEN INFO = 3 GO TO 230 END IF END IF LEXTRA = IL - 1 - IWORK( 1 ) REXTRA = IWORK( 2 ) - IU INITVL = WORK( 1 ) INITVU = WORK( 2 ) IFRST = IL ILAST = IU END IF * NVL = IFRST - 1 * NVU = ILAST GL = INITVL GU = INITVU NGL = IWORK( 1 ) NGU = IWORK( 2 ) IM = 0 FOUND = 0 INDRIW2 = INDRIW1 + NGU - NGL IEND = 0 IF( IFRST.GT.ILAST ) $ GO TO 100 IF( IFRST.EQ.1 .AND. ILAST.EQ.N ) $ IRANGE = 1 * * Find Eigenvalues -- Loop Over Blocks * DO 90 JB = 1, NSPLIT IOFF = IEND IBEGIN = IOFF + 1 IEND = ISPLIT( JB ) IN = IEND - IOFF IF( JB.NE.1 ) THEN IF( IRANGE.NE.1 ) THEN FOUND = IM * * Find total number of eigenvalues found thus far * CALL IGSUM2D( ONEDCONTEXT, 'All', ' ', 1, 1, FOUND, 1, $ -1, -1 ) ELSE FOUND = IOFF END IF END IF * IF( SELF.GE.P ) * $ GO TO 30 IF( IN.NE.N ) THEN * * Compute Gershgorin interval [gl,gu] for split matrix * GU = D( IBEGIN ) GL = D( IBEGIN ) TMP1 = ZERO * DO 50 J = IBEGIN, IEND - 1 TMP2 = ABS( E( J ) ) GU = MAX( GU, D( J )+TMP1+TMP2 ) GL = MIN( GL, D( J )-TMP1-TMP2 ) TMP1 = TMP2 50 CONTINUE * GU = MAX( GU, D( IEND )+TMP1 ) GL = MIN( GL, D( IEND )-TMP1 ) BNORM = MAX( ABS( GL ), ABS( GU ) ) GL = GL - FUDGE*BNORM*ULP*IN - FUDGE*PIVMIN GU = GU + FUDGE*BNORM*ULP*IN + FUDGE*PIVMIN * * Compute ATOLI for the current submatrix * IF( ABSTOL.LE.ZERO ) THEN ATOLI = ULP*BNORM ELSE ATOLI = ABSTOL END IF * IF( GL.LT.INITVL ) THEN GL = INITVL IF( IEFLAG.EQ.0 ) THEN CALL PDLAPDCT( GL, IN, WORK( INDRW1+2*IOFF+1 ), $ PIVMIN, NGL ) ELSE IF( IEFLAG.EQ.1 ) THEN CALL PDLAIECTB( GL, IN, WORK( INDRW1+2*IOFF+1 ), NGL ) ELSE CALL PDLAIECTL( GL, IN, WORK( INDRW1+2*IOFF+1 ), NGL ) END IF ELSE NGL = 0 END IF IF( GU.GT.INITVU ) THEN GU = INITVU IF( IEFLAG.EQ.0 ) THEN CALL PDLAPDCT( GU, IN, WORK( INDRW1+2*IOFF+1 ), $ PIVMIN, NGU ) ELSE IF( IEFLAG.EQ.1 ) THEN CALL PDLAIECTB( GU, IN, WORK( INDRW1+2*IOFF+1 ), NGU ) ELSE CALL PDLAIECTL( GU, IN, WORK( INDRW1+2*IOFF+1 ), NGU ) END IF ELSE NGU = IN END IF IF( NGL.GE.NGU ) $ GO TO 90 WORK( 1 ) = GL WORK( 2 ) = GU IWORK( 1 ) = NGL IWORK( 2 ) = NGU END IF OFFSET = FOUND - NGL BLKNO = JB * * Do a static partitioning of work so that each process * has to find an (almost) equal number of eigenvalues * NCMP = NGU - NGL ILOAD = NCMP / P IREM = NCMP - ILOAD*P ITMP1 = MOD( SELF-FOUND, P ) IF( ITMP1.LT.0 ) $ ITMP1 = ITMP1 + P IF( ITMP1.LT.IREM ) THEN IMYLOAD = ILOAD + 1 ELSE IMYLOAD = ILOAD END IF IF( IMYLOAD.EQ.0 ) THEN GO TO 90 ELSE IF( IN.EQ.1 ) THEN WORK( INDRW2+IM+1 ) = WORK( INDRW1+2*IOFF+1 ) IWORK( INDRIW1+IM+1 ) = BLKNO IWORK( INDRIW2+IM+1 ) = OFFSET + 1 IM = IM + 1 GO TO 90 ELSE INXTLOAD = ILOAD ITMP2 = MOD( SELF+1-FOUND, P ) IF( ITMP2.LT.0 ) $ ITMP2 = ITMP2 + P IF( ITMP2.LT.IREM ) $ INXTLOAD = INXTLOAD + 1 LREQ = NGL + ITMP1*ILOAD + MIN( IREM, ITMP1 ) RREQ = LREQ + IMYLOAD IWORK( 5 ) = LREQ IWORK( 6 ) = RREQ TMP1 = WORK( 1 ) ITMP1 = IWORK( 1 ) CALL PDLAEBZ( 1, IN, 1, 1, ATOLI, RELTOL, PIVMIN, $ WORK( INDRW1+2*IOFF+1 ), IWORK( 5 ), WORK, $ IWORK, NINT, LSAVE, IEFLAG, IINFO ) ALPHA = WORK( 1 ) BETA = WORK( 2 ) NALPHA = IWORK( 1 ) NBETA = IWORK( 2 ) DSEND = BETA IF( NBETA.GT.RREQ+INXTLOAD ) THEN NBETA = RREQ DSEND = ALPHA END IF LAST = MOD( FOUND+MIN( NGU-NGL, P )-1, P ) IF( LAST.LT.0 ) $ LAST = LAST + P IF( SELF.NE.LAST ) THEN CALL DGESD2D( ONEDCONTEXT, 1, 1, DSEND, 1, 0, NEXT ) CALL IGESD2D( ONEDCONTEXT, 1, 1, NBETA, 1, 0, NEXT ) END IF IF( SELF.NE.MOD( FOUND, P ) ) THEN CALL DGERV2D( ONEDCONTEXT, 1, 1, DRECV, 1, 0, PREV ) CALL IGERV2D( ONEDCONTEXT, 1, 1, IRECV, 1, 0, PREV ) ELSE DRECV = TMP1 IRECV = ITMP1 END IF WORK( 1 ) = MAX( LSAVE, DRECV ) IWORK( 1 ) = IRECV ALPHA = MAX( ALPHA, WORK( 1 ) ) NALPHA = MAX( NALPHA, IRECV ) IF( BETA-ALPHA.LE.MAX( ATOLI, RELTOL*MAX( ABS( ALPHA ), $ ABS( BETA ) ) ) ) THEN MID = HALF*( ALPHA+BETA ) DO 60 J = OFFSET + NALPHA + 1, OFFSET + NBETA WORK( INDRW2+IM+1 ) = MID IWORK( INDRIW1+IM+1 ) = BLKNO IWORK( INDRIW2+IM+1 ) = J IM = IM + 1 60 CONTINUE WORK( 2 ) = ALPHA IWORK( 2 ) = NALPHA END IF END IF NEIGINT = IWORK( 2 ) - IWORK( 1 ) IF( NEIGINT.LE.0 ) $ GO TO 90 * * Call the main computational routine * CALL PDLAEBZ( 2, IN, NEIGINT, 1, ATOLI, RELTOL, PIVMIN, $ WORK( INDRW1+2*IOFF+1 ), IWORK, WORK, IWORK, $ IOUT, LSAVE, IEFLAG, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 1 END IF DO 80 I = 1, IOUT MID = HALF*( WORK( 2*I-1 )+WORK( 2*I ) ) IF( I.GT.IOUT-IINFO ) $ BLKNO = -BLKNO DO 70 J = OFFSET + IWORK( 2*I-1 ) + 1, $ OFFSET + IWORK( 2*I ) WORK( INDRW2+IM+1 ) = MID IWORK( INDRIW1+IM+1 ) = BLKNO IWORK( INDRIW2+IM+1 ) = J IM = IM + 1 70 CONTINUE 80 CONTINUE 90 CONTINUE * * Find out total number of eigenvalues computed * 100 CONTINUE M = IM CALL IGSUM2D( ONEDCONTEXT, 'ALL', ' ', 1, 1, M, 1, -1, -1 ) * * Move the eigenvalues found to their final destinations * DO 130 I = 1, P IF( SELF.EQ.I-1 ) THEN CALL IGEBS2D( ONEDCONTEXT, 'ALL', ' ', 1, 1, IM, 1 ) IF( IM.NE.0 ) THEN CALL IGEBS2D( ONEDCONTEXT, 'ALL', ' ', IM, 1, $ IWORK( INDRIW2+1 ), IM ) CALL DGEBS2D( ONEDCONTEXT, 'ALL', ' ', IM, 1, $ WORK( INDRW2+1 ), IM ) CALL IGEBS2D( ONEDCONTEXT, 'ALL', ' ', IM, 1, $ IWORK( INDRIW1+1 ), IM ) DO 110 J = 1, IM W( IWORK( INDRIW2+J ) ) = WORK( INDRW2+J ) IBLOCK( IWORK( INDRIW2+J ) ) = IWORK( INDRIW1+J ) 110 CONTINUE END IF ELSE CALL IGEBR2D( ONEDCONTEXT, 'ALL', ' ', 1, 1, TORECV, 1, 0, $ I-1 ) IF( TORECV.NE.0 ) THEN CALL IGEBR2D( ONEDCONTEXT, 'ALL', ' ', TORECV, 1, IWORK, $ TORECV, 0, I-1 ) CALL DGEBR2D( ONEDCONTEXT, 'ALL', ' ', TORECV, 1, WORK, $ TORECV, 0, I-1 ) CALL IGEBR2D( ONEDCONTEXT, 'ALL', ' ', TORECV, 1, $ IWORK( N+1 ), TORECV, 0, I-1 ) DO 120 J = 1, TORECV W( IWORK( J ) ) = WORK( J ) IBLOCK( IWORK( J ) ) = IWORK( N+J ) 120 CONTINUE END IF END IF 130 CONTINUE IF( NSPLIT.GT.1 .AND. IORDER.EQ.1 ) THEN * * Sort the eigenvalues * * DO 140 I = 1, M IWORK( M+I ) = I 140 CONTINUE CALL DLASRT2( 'I', M, W, IWORK( M+1 ), IINFO ) DO 150 I = 1, M IWORK( I ) = IBLOCK( I ) 150 CONTINUE DO 160 I = 1, M IBLOCK( I ) = IWORK( IWORK( M+I ) ) 160 CONTINUE END IF IF( IRANGE.EQ.3 .AND. ( LEXTRA.GT.0 .OR. REXTRA.GT.0 ) ) THEN * * Discard unwanted eigenvalues (occurs only when RANGE = 'I', * and eigenvalues IL, and/or IU are in a cluster) * DO 170 I = 1, M WORK( I ) = W( I ) IWORK( I ) = I IWORK( M+I ) = I 170 CONTINUE DO 190 I = 1, LEXTRA ITMP1 = I DO 180 J = I + 1, M IF( WORK( J ).LT.WORK( ITMP1 ) ) THEN ITMP1 = J END IF 180 CONTINUE TMP1 = WORK( I ) WORK( I ) = WORK( ITMP1 ) WORK( ITMP1 ) = TMP1 IWORK( IWORK( M+ITMP1 ) ) = I IWORK( IWORK( M+I ) ) = ITMP1 ITMP2 = IWORK( M+I ) IWORK( M+I ) = IWORK( M+ITMP1 ) IWORK( M+ITMP1 ) = ITMP2 190 CONTINUE DO 210 I = 1, REXTRA ITMP1 = M - I + 1 DO 200 J = M - I, LEXTRA + 1, -1 IF( WORK( J ).GT.WORK( ITMP1 ) ) THEN ITMP1 = J END IF 200 CONTINUE TMP1 = WORK( M-I+1 ) WORK( M-I+1 ) = WORK( ITMP1 ) WORK( ITMP1 ) = TMP1 IWORK( IWORK( M+ITMP1 ) ) = M - I + 1 IWORK( IWORK( 2*M-I+1 ) ) = ITMP1 ITMP2 = IWORK( 2*M-I+1 ) IWORK( 2*M-I+1 ) = IWORK( M+ITMP1 ) IWORK( M+ITMP1 ) = ITMP2 * IWORK( ITMP1 ) = 1 210 CONTINUE J = 0 DO 220 I = 1, M IF( IWORK( I ).GT.LEXTRA .AND. IWORK( I ).LE.M-REXTRA ) THEN J = J + 1 W( J ) = WORK( IWORK( I ) ) IBLOCK( J ) = IBLOCK( I ) END IF 220 CONTINUE M = M - LEXTRA - REXTRA END IF IF( M.NE.ILAST-IFRST+1 ) THEN INFO = 2 END IF * 230 CONTINUE CALL BLACS_FREEBUFF( ONEDCONTEXT, 1 ) CALL BLACS_GRIDEXIT( ONEDCONTEXT ) RETURN * * End of PDSTEBZ * END * SUBROUTINE PDLAEBZ( IJOB, N, MMAX, MINP, ABSTOL, RELTOL, PIVMIN, $ D, NVAL, INTVL, INTVLCT, MOUT, LSAVE, IEFLAG, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * * .. Scalar Arguments .. INTEGER IEFLAG, IJOB, INFO, MINP, MMAX, MOUT, N DOUBLE PRECISION ABSTOL, LSAVE, PIVMIN, RELTOL * .. * .. Array Arguments .. INTEGER INTVLCT( * ), NVAL( * ) DOUBLE PRECISION D( * ), INTVL( * ) * .. * * Purpose * ======= * * PDLAEBZ contains the iteration loop which computes the eigenvalues * contained in the input intervals [ INTVL(2*j-1), INTVL(2*j) ] where * j = 1,...,MINP. It uses and computes the function N(w), which is * the count of eigenvalues of a symmetric tridiagonal matrix less than * or equal to its argument w. * * This is a ScaLAPACK internal subroutine and arguments are not * checked for unreasonable values. * * Arguments * ========= * * IJOB (input) INTEGER * Specifies the computation done by PDLAEBZ * = 0 : Find an interval with desired values of N(w) at the * endpoints of the interval. * = 1 : Find a floating point number contained in the initial * interval with a desired value of N(w). * = 2 : Perform bisection iteration to find eigenvalues of T. * * N (input) INTEGER * The order of the tridiagonal matrix T. N >= 1. * * MMAX (input) INTEGER * The maximum number of intervals that may be generated. If * more than MMAX intervals are generated, then PDLAEBZ will * quit with INFO = MMAX+1. * * MINP (input) INTEGER * The initial number of intervals. MINP <= MMAX. * * ABSTOL (input) DOUBLE PRECISION * The minimum (absolute) width of an interval. When an interval * is narrower than ABSTOL, or than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be sufficiently * small, i.e., converged. * This must be at least zero. * * RELTOL (input) DOUBLE PRECISION * The minimum relative width of an interval. When an interval * is narrower than ABSTOL, or than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be sufficiently * small, i.e., converged. * Note : This should be at least radix*machine epsilon. * * PIVMIN (input) DOUBLE PRECISION * The minimum absolute of a "pivot" in the "paranoid" * implementation of the Sturm sequence loop. This must be at * least max_j |e(j)^2| *safe_min, and at least safe_min, where * safe_min is at least the smallest number that can divide 1.0 * without overflow. * See PDLAPDCT for the "paranoid" implementation of the Sturm * sequence loop. * * D (input) DOUBLE PRECISION array, dimension (2*N - 1) * Contains the diagonals and the squares of the off-diagonal * elements of the tridiagonal matrix T. These elements are * assumed to be interleaved in memory for better cache * performance. The diagonal entries of T are in the entries * D(1),D(3),...,D(2*N-1), while the squares of the off-diagonal * entries are D(2),D(4),...,D(2*N-2). To avoid overflow, the * matrix must be scaled so that its largest entry is no greater * than overflow**(1/2) * underflow**(1/4) in absolute value, * and for greatest accuracy, it should not be much smaller * than that. * * NVAL (input/output) INTEGER array, dimension (4) * If IJOB = 0, the desired values of N(w) are in NVAL(1) and * NVAL(2). * If IJOB = 1, NVAL(2) is the desired value of N(w). * If IJOB = 2, not referenced. * This array will, in general, be reordered on output. * * INTVL (input/output) DOUBLE PRECISION array, dimension (2*MMAX) * The endpoints of the intervals. INTVL(2*j-1) is the left * endpoint of the j-th interval, and INTVL(2*j) is the right * endpoint of the j-th interval. The input intervals will, * in general, be modified, split and reordered by the * calculation. * On input, INTVL contains the MINP input intervals. * On output, INTVL contains the converged intervals. * * INTVLCT (input/output) INTEGER array, dimension (2*MMAX) * The counts at the endpoints of the intervals. INTVLCT(2*j-1) * is the count at the left endpoint of the j-th interval, i.e., * the function value N(INTVL(2*j-1)), and INTVLCT(2*j) is the * count at the right endpoint of the j-th interval. * On input, INTVLCT contains the counts at the endpoints of * the MINP input intervals. * On output, INTVLCT contains the counts at the endpoints of * the converged intervals. * * MOUT (output) INTEGER * The number of intervals output. * * LSAVE (output) DOUBLE PRECISION * If IJOB = 0 or 2, not referenced. * If IJOB = 1, this is the largest floating point number * encountered which has count N(w) = NVAL(1). * * IEFLAG (input) INTEGER * A flag which indicates whether N(w) should be speeded up by * exploiting IEEE Arithmetic. * * INFO (output) INTEGER * = 0 : All intervals converged. * = 1 - MMAX : The last INFO intervals did not converge. * = MMAX + 1 : More than MMAX intervals were generated. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN * .. * .. External Subroutines .. EXTERNAL PDLAECV, PDLAIECTB, PDLAIECTL, PDLAPDCT * .. * .. Parameters .. DOUBLE PRECISION ZERO, TWO, HALF PARAMETER ( ZERO = 0.0D+0, TWO = 2.0D+0, $ HALF = 1.0D+0 / TWO ) * .. * .. Local Scalars .. INTEGER I, ITMAX, J, K, KF, KL, KLNEW, L, LCNT, LREQ, $ NALPHA, NBETA, NMID, RCNT, RREQ DOUBLE PRECISION ALPHA, BETA, MID * .. * .. Executable Statements .. * KF = 1 KL = MINP + 1 INFO = 0 IF( INTVL( 2 )-INTVL( 1 ).LE.ZERO ) THEN INFO = MINP MOUT = KF RETURN END IF IF( IJOB.EQ.0 ) THEN * * Check if some input intervals have "converged" * CALL PDLAECV( 0, KF, KL, INTVL, INTVLCT, NVAL, $ MAX( ABSTOL, PIVMIN ), RELTOL ) IF( KF.GE.KL ) $ GO TO 60 * * Compute upper bound on number of iterations needed * ITMAX = INT( ( LOG( INTVL( 2 )-INTVL( 1 )+PIVMIN )- $ LOG( PIVMIN ) ) / LOG( TWO ) ) + 2 * * Iteration Loop * DO 20 I = 1, ITMAX KLNEW = KL DO 10 J = KF, KL - 1 K = 2*J * * Bisect the interval and find the count at that point * MID = HALF*( INTVL( K-1 )+INTVL( K ) ) IF( IEFLAG.EQ.0 ) THEN CALL PDLAPDCT( MID, N, D, PIVMIN, NMID ) ELSE IF( IEFLAG.EQ.1 ) THEN CALL PDLAIECTB( MID, N, D, NMID ) ELSE CALL PDLAIECTL( MID, N, D, NMID ) END IF LREQ = NVAL( K-1 ) RREQ = NVAL( K ) IF( KL.EQ.1 ) $ NMID = MIN( INTVLCT( K ), $ MAX( INTVLCT( K-1 ), NMID ) ) IF( NMID.LE.NVAL( K-1 ) ) THEN INTVL( K-1 ) = MID INTVLCT( K-1 ) = NMID END IF IF( NMID.GE.NVAL( K ) ) THEN INTVL( K ) = MID INTVLCT( K ) = NMID END IF IF( NMID.GT.LREQ .AND. NMID.LT.RREQ ) THEN L = 2*KLNEW INTVL( L-1 ) = MID INTVL( L ) = INTVL( K ) INTVLCT( L-1 ) = NVAL( K ) INTVLCT( L ) = INTVLCT( K ) INTVL( K ) = MID INTVLCT( K ) = NVAL( K-1 ) NVAL( L-1 ) = NVAL( K ) NVAL( L ) = NVAL( L-1 ) NVAL( K ) = NVAL( K-1 ) KLNEW = KLNEW + 1 END IF 10 CONTINUE KL = KLNEW CALL PDLAECV( 0, KF, KL, INTVL, INTVLCT, NVAL, $ MAX( ABSTOL, PIVMIN ), RELTOL ) IF( KF.GE.KL ) $ GO TO 60 20 CONTINUE ELSE IF( IJOB.EQ.1 ) THEN ALPHA = INTVL( 1 ) BETA = INTVL( 2 ) NALPHA = INTVLCT( 1 ) NBETA = INTVLCT( 2 ) LSAVE = ALPHA LREQ = NVAL( 1 ) RREQ = NVAL( 2 ) 30 CONTINUE IF( NBETA.NE.RREQ .AND. BETA-ALPHA.GT. $ MAX( ABSTOL, RELTOL*MAX( ABS( ALPHA ), ABS( BETA ) ) ) ) $ THEN * * Bisect the interval and find the count at that point * MID = HALF*( ALPHA+BETA ) IF( IEFLAG.EQ.0 ) THEN CALL PDLAPDCT( MID, N, D, PIVMIN, NMID ) ELSE IF( IEFLAG.EQ.1 ) THEN CALL PDLAIECTB( MID, N, D, NMID ) ELSE CALL PDLAIECTL( MID, N, D, NMID ) END IF NMID = MIN( NBETA, MAX( NALPHA, NMID ) ) IF( NMID.GE.RREQ ) THEN BETA = MID NBETA = NMID ELSE ALPHA = MID NALPHA = NMID IF( NMID.EQ.LREQ ) $ LSAVE = ALPHA END IF GO TO 30 END IF KL = KF INTVL( 1 ) = ALPHA INTVL( 2 ) = BETA INTVLCT( 1 ) = NALPHA INTVLCT( 2 ) = NBETA ELSE IF( IJOB.EQ.2 ) THEN * * Check if some input intervals have "converged" * CALL PDLAECV( 1, KF, KL, INTVL, INTVLCT, NVAL, $ MAX( ABSTOL, PIVMIN ), RELTOL ) IF( KF.GE.KL ) $ GO TO 60 * * Compute upper bound on number of iterations needed * ITMAX = INT( ( LOG( INTVL( 2 )-INTVL( 1 )+PIVMIN )- $ LOG( PIVMIN ) ) / LOG( TWO ) ) + 2 * * Iteration Loop * DO 50 I = 1, ITMAX KLNEW = KL DO 40 J = KF, KL - 1 K = 2*J MID = HALF*( INTVL( K-1 )+INTVL( K ) ) IF( IEFLAG.EQ.0 ) THEN CALL PDLAPDCT( MID, N, D, PIVMIN, NMID ) ELSE IF( IEFLAG.EQ.1 ) THEN CALL PDLAIECTB( MID, N, D, NMID ) ELSE CALL PDLAIECTL( MID, N, D, NMID ) END IF LCNT = INTVLCT( K-1 ) RCNT = INTVLCT( K ) NMID = MIN( RCNT, MAX( LCNT, NMID ) ) * * Form New Interval(s) * IF( NMID.EQ.LCNT ) THEN INTVL( K-1 ) = MID ELSE IF( NMID.EQ.RCNT ) THEN INTVL( K ) = MID ELSE IF( KLNEW.LT.MMAX+1 ) THEN L = 2*KLNEW INTVL( L-1 ) = MID INTVL( L ) = INTVL( K ) INTVLCT( L-1 ) = NMID INTVLCT( L ) = INTVLCT( K ) INTVL( K ) = MID INTVLCT( K ) = NMID KLNEW = KLNEW + 1 ELSE INFO = MMAX + 1 RETURN END IF 40 CONTINUE KL = KLNEW CALL PDLAECV( 1, KF, KL, INTVL, INTVLCT, NVAL, $ MAX( ABSTOL, PIVMIN ), RELTOL ) IF( KF.GE.KL ) $ GO TO 60 50 CONTINUE END IF 60 CONTINUE INFO = MAX( KL-KF, 0 ) MOUT = KL - 1 RETURN * * End of PDLAEBZ * END * * SUBROUTINE PDLAECV( IJOB, KF, KL, INTVL, INTVLCT, NVAL, ABSTOL, $ RELTOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * * .. Scalar Arguments .. INTEGER IJOB, KF, KL DOUBLE PRECISION ABSTOL, RELTOL * .. * .. Array Arguments .. INTEGER INTVLCT( * ), NVAL( * ) DOUBLE PRECISION INTVL( * ) * .. * * Purpose * ======= * * PDLAECV checks if the input intervals [ INTVL(2*i-1), INTVL(2*i) ], * i = KF, ... , KL-1, have "converged". * PDLAECV modifies KF to be the index of the last converged interval, * i.e., on output, all intervals [ INTVL(2*i-1), INTVL(2*i) ], i < KF, * have converged. Note that the input intervals may be reordered by * PDLAECV. * * This is a SCALAPACK internal procedure and arguments are not checked * for unreasonable values. * * Arguments * ========= * * IJOB (input) INTEGER * Specifies the criterion for "convergence" of an interval. * = 0 : When an interval is narrower than ABSTOL, or than * RELTOL times the larger (in magnitude) endpoint, then * it is considered to have "converged". * = 1 : When an interval is narrower than ABSTOL, or than * RELTOL times the larger (in magnitude) endpoint, or if * the counts at the endpoints are identical to the counts * specified by NVAL ( see NVAL ) then the interval is * considered to have "converged". * * KF (input/output) INTEGER * On input, the index of the first input interval is 2*KF-1. * On output, the index of the last converged interval * is 2*KF-3. * * KL (input) INTEGER * The index of the last input interval is 2*KL-3. * * INTVL (input/output) DOUBLE PRECISION array, dimension (2*(KL-KF)) * The endpoints of the intervals. INTVL(2*j-1) is the left * oendpoint f the j-th interval, and INTVL(2*j) is the right * endpoint of the j-th interval. The input intervals will, * in general, be reordered on output. * On input, INTVL contains the KL-KF input intervals. * On output, INTVL contains the converged intervals, 1 thru' * KF-1, and the unconverged intervals, KF thru' KL-1. * * INTVLCT (input/output) INTEGER array, dimension (2*(KL-KF)) * The counts at the endpoints of the intervals. INTVLCT(2*j-1) * is the count at the left endpoint of the j-th interval, i.e., * the function value N(INTVL(2*j-1)), and INTVLCT(2*j) is the * count at the right endpoint of the j-th interval. This array * will, in general, be reordered on output. * See the comments in PDLAEBZ for more on the function N(w). * * NVAL (input/output) INTEGER array, dimension (2*(KL-KF)) * The desired counts, N(w), at the endpoints of the * corresponding intervals. This array will, in general, * be reordered on output. * * ABSTOL (input) DOUBLE PRECISION * The minimum (absolute) width of an interval. When an interval * is narrower than ABSTOL, or than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be sufficiently * small, i.e., converged. * Note : This must be at least zero. * * RELTOL (input) DOUBLE PRECISION * The minimum relative width of an interval. When an interval * is narrower than ABSTOL, or than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be sufficiently * small, i.e., converged. * Note : This should be at least radix*machine epsilon. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Local Scalars .. LOGICAL CONDN INTEGER I, ITMP1, ITMP2, J, K, KFNEW DOUBLE PRECISION TMP1, TMP2, TMP3, TMP4 * .. * .. Executable Statements .. * KFNEW = KF DO 10 I = KF, KL - 1 K = 2*I TMP3 = INTVL( K-1 ) TMP4 = INTVL( K ) TMP1 = ABS( TMP4-TMP3 ) TMP2 = MAX( ABS( TMP3 ), ABS( TMP4 ) ) CONDN = TMP1.LT.MAX( ABSTOL, RELTOL*TMP2 ) IF( IJOB.EQ.0 ) $ CONDN = CONDN .OR. ( ( INTVLCT( K-1 ).EQ.NVAL( K-1 ) ) .AND. $ INTVLCT( K ).EQ.NVAL( K ) ) IF( CONDN ) THEN IF( I.GT.KFNEW ) THEN * * Reorder Intervals * J = 2*KFNEW TMP1 = INTVL( K-1 ) TMP2 = INTVL( K ) ITMP1 = INTVLCT( K-1 ) ITMP2 = INTVLCT( K ) INTVL( K-1 ) = INTVL( J-1 ) INTVL( K ) = INTVL( J ) INTVLCT( K-1 ) = INTVLCT( J-1 ) INTVLCT( K ) = INTVLCT( J ) INTVL( J-1 ) = TMP1 INTVL( J ) = TMP2 INTVLCT( J-1 ) = ITMP1 INTVLCT( J ) = ITMP2 IF( IJOB.EQ.0 ) THEN ITMP1 = NVAL( K-1 ) NVAL( K-1 ) = NVAL( J-1 ) NVAL( J-1 ) = ITMP1 ITMP1 = NVAL( K ) NVAL( K ) = NVAL( J ) NVAL( J ) = ITMP1 END IF END IF KFNEW = KFNEW + 1 END IF 10 CONTINUE KF = KFNEW RETURN * * End of PDLAECV * END * SUBROUTINE PDLAPDCT( SIGMA, N, D, PIVMIN, COUNT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * * .. Scalar Arguments .. INTEGER COUNT, N DOUBLE PRECISION PIVMIN, SIGMA * .. * .. Array Arguments .. DOUBLE PRECISION D( * ) * .. * * Purpose * ======= * * PDLAPDCT counts the number of negative eigenvalues of (T - SIGMA I). * This implementation of the Sturm Sequence loop has conditionals in * the innermost loop to avoid overflow and determine the sign of a * floating point number. PDLAPDCT will be referred to as the "paranoid" * implementation of the Sturm Sequence loop. * * This is a SCALAPACK internal procedure and arguments are not checked * for unreasonable values. * * Arguments * ========= * * SIGMA (input) DOUBLE PRECISION * The shift. PDLAPDCT finds the number of eigenvalues of T less * than or equal to SIGMA. * * N (input) INTEGER * The order of the tridiagonal matrix T. N >= 1. * * D (input) DOUBLE PRECISION array, dimension (2*N - 1) * Contains the diagonals and the squares of the off-diagonal * elements of the tridiagonal matrix T. These elements are * assumed to be interleaved in memory for better cache * performance. The diagonal entries of T are in the entries * D(1),D(3),...,D(2*N-1), while the squares of the off-diagonal * entries are D(2),D(4),...,D(2*N-2). To avoid overflow, the * matrix must be scaled so that its largest entry is no greater * than overflow**(1/2) * underflow**(1/4) in absolute value, * and for greatest accuracy, it should not be much smaller * than that. * * PIVMIN (input) DOUBLE PRECISION * The minimum absolute of a "pivot" in this "paranoid" * implementation of the Sturm sequence loop. This must be at * least max_j |e(j)^2| *safe_min, and at least safe_min, where * safe_min is at least the smallest number that can divide 1.0 * without overflow. * * COUNT (output) INTEGER * The count of the number of eigenvalues of T less than or * equal to SIGMA. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I DOUBLE PRECISION TMP * .. * .. Executable Statements .. * TMP = D( 1 ) - SIGMA IF( ABS( TMP ).LE.PIVMIN ) $ TMP = -PIVMIN COUNT = 0 IF( TMP.LE.ZERO ) $ COUNT = 1 DO 10 I = 3, 2*N - 1, 2 TMP = D( I ) - D( I-1 ) / TMP - SIGMA IF( ABS( TMP ).LE.PIVMIN ) $ TMP = -PIVMIN IF( TMP.LE.ZERO ) $ COUNT = COUNT + 1 10 CONTINUE * RETURN * * End of PDLAPDCT * END scalapack-2.0.2/SRC/pdstedc.f000644 000766 000024 00000023023 10363532303 016133 0ustar00juliestaff000000 000000 SUBROUTINE PDSTEDC( COMPZ, N, D, E, Q, IQ, JQ, DESCQ, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * .. Scalar Arguments .. CHARACTER COMPZ INTEGER INFO, IQ, JQ, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCQ( * ), IWORK( * ) DOUBLE PRECISION D( * ), E( * ), Q( * ), WORK( * ) * .. * * Purpose * ======= * PDSTEDC computes all eigenvalues and eigenvectors of a * symmetric tridiagonal matrix in parallel, using the divide and * conquer algorithm. * * This code makes very mild assumptions about floating point * arithmetic. It will work on machines with a guard digit in * add/subtract, or on those binary machines without guard digits * which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. * It could conceivably fail on hexadecimal or decimal machines * without guard digits, but we know of none. See DLAED3 for details. * * Arguments * ========= * * COMPZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only. (NOT IMPLEMENTED YET) * = 'I': Compute eigenvectors of tridiagonal matrix also. * = 'V': Compute eigenvectors of original dense symmetric * matrix also. On entry, Z contains the orthogonal * matrix used to reduce the original matrix to * tridiagonal form. (NOT IMPLEMENTED YET) * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * D (global input/output) DOUBLE PRECISION array, dimension (N) * On entry, the diagonal elements of the tridiagonal matrix. * On exit, if INFO = 0, the eigenvalues in descending order. * * E (global input/output) DOUBLE PRECISION array, dimension (N-1) * On entry, the subdiagonal elements of the tridiagonal matrix. * On exit, E has been destroyed. * * Q (local output) DOUBLE PRECISION array, * local dimension ( LLD_Q, LOCc(JQ+N-1)) * Q contains the orthonormal eigenvectors of the symmetric * tridiagonal matrix. * On output, Q is distributed across the P processes in block * cyclic format. * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension (LWORK) * On output, WORK(1) returns the workspace needed. * * LWORK (local input/output) INTEGER, * the dimension of the array WORK. * LWORK = 6*N + 2*NP*NQ * NP = NUMROC( N, NB, MYROW, DESCQ( RSRC_ ), NPROW ) * NQ = NUMROC( N, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) * * If LWORK = -1, the LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the WORK array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * On exit, if LIWORK > 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. * LIWORK = 2 + 7*N + 8*NPCOL * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: The algorithm failed to compute the INFO/(N+1) th * eigenvalue while working on the submatrix lying in * global rows and columns mod(INFO,N+1). * * Further Details * ======= ======= * * Contributed by Francoise Tisseur, University of Manchester. * * Reference: F. Tisseur and J. Dongarra, "A Parallel Divide and * Conquer Algorithm for the Symmetric Eigenvalue Problem * on Distributed Memory Architectures", * SIAM J. Sci. Comput., 6:20 (1999), pp. 2223--2236. * (see also LAPACK Working Note 132) * http://www.netlib.org/lapack/lawns/lawn132.ps * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER ICOFFQ, IIQ, IPQ, IQCOL, IQROW, IROFFQ, JJQ, $ LDQ, LIWMIN, LWMIN, MYCOL, MYROW, NB, NP, $ NPCOL, NPROW, NQ DOUBLE PRECISION ORGNRM * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC DOUBLE PRECISION DLANST EXTERNAL INDXG2P, LSAME, NUMROC, DLANST * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DLASCL, DSTEDC, $ INFOG2L, PDLAED0, PDLASRT, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MOD * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Test the input parameters. * CALL BLACS_GRIDINFO( DESCQ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) LDQ = DESCQ( LLD_ ) NB = DESCQ( NB_ ) NP = NUMROC( N, NB, MYROW, DESCQ( RSRC_ ), NPROW ) NQ = NUMROC( N, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IQ, JQ, DESCQ, 8, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCQ( NB_ ) IROFFQ = MOD( IQ-1, DESCQ( MB_ ) ) ICOFFQ = MOD( JQ-1, DESCQ( NB_ ) ) IQROW = INDXG2P( IQ, NB, MYROW, DESCQ( RSRC_ ), NPROW ) IQCOL = INDXG2P( JQ, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) LWMIN = 6*N + 2*NP*NQ LIWMIN = 2 + 7*N + 8*NPCOL * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( .NOT.LSAME( COMPZ, 'I' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( IROFFQ.NE.ICOFFQ .OR. ICOFFQ.NE.0 ) THEN INFO = -5 ELSE IF( DESCQ( MB_ ).NE.DESCQ( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCQ( CTXT_ ), 'PDSTEDC', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return * IF( N.EQ.0 ) $ GO TO 10 CALL INFOG2L( IQ, JQ, DESCQ, NPROW, NPCOL, MYROW, MYCOL, IIQ, JJQ, $ IQROW, IQCOL ) IF( N.EQ.1 ) THEN IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) $ Q( 1 ) = ONE GO TO 10 END IF * * If N is smaller than the minimum divide size NB, then * solve the problem with the serial divide and conquer * code locally. * IF( N.LE.NB ) THEN IF( ( MYROW.EQ.IQROW ) .AND. ( MYCOL.EQ.IQCOL ) ) THEN IPQ = IIQ + ( JJQ-1 )*LDQ CALL DSTEDC( 'I', N, D, E, Q( IPQ ), LDQ, WORK, LWORK, $ IWORK, LIWORK, INFO ) IF( INFO.NE.0 ) THEN INFO = ( N+1 ) + N GO TO 10 END IF END IF GO TO 10 END IF * * If P=NPROW*NPCOL=1, solve the problem with DSTEDC. * IF( NPCOL*NPROW.EQ.1 ) THEN IPQ = IIQ + ( JJQ-1 )*LDQ CALL DSTEDC( 'I', N, D, E, Q( IPQ ), LDQ, WORK, LWORK, IWORK, $ LIWORK, INFO ) GO TO 10 END IF * * Scale matrix to allowable range, if necessary. * ORGNRM = DLANST( 'M', N, D, E ) IF( ORGNRM.NE.ZERO ) THEN CALL DLASCL( 'G', 0, 0, ORGNRM, ONE, N, 1, D, N, INFO ) CALL DLASCL( 'G', 0, 0, ORGNRM, ONE, N-1, 1, E, N-1, INFO ) END IF * CALL PDLAED0( N, D, E, Q, IQ, JQ, DESCQ, WORK, IWORK, INFO ) * * Sort eigenvalues and corresponding eigenvectors * CALL PDLASRT( 'I', N, D, Q, IQ, JQ, DESCQ, WORK, LWORK, IWORK, $ LIWORK, INFO ) * * Scale back. * IF( ORGNRM.NE.ZERO ) $ CALL DLASCL( 'G', 0, 0, ONE, ORGNRM, N, 1, D, N, INFO ) * 10 CONTINUE * IF( LWORK.GT.0 ) $ WORK( 1 ) = DBLE( LWMIN ) IF( LIWORK.GT.0 ) $ IWORK( 1 ) = LIWMIN RETURN * * End of PDSTEDC * END scalapack-2.0.2/SRC/pdstein.f000644 000766 000024 00000060432 10602576752 016174 0ustar00juliestaff000000 000000 SUBROUTINE PDSTEIN( N, D, E, M, W, IBLOCK, ISPLIT, ORFAC, Z, IZ, $ JZ, DESCZ, WORK, LWORK, IWORK, LIWORK, IFAIL, $ ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER INFO, IZ, JZ, LIWORK, LWORK, M, N DOUBLE PRECISION ORFAC * .. * .. Array Arguments .. INTEGER DESCZ( * ), IBLOCK( * ), ICLUSTR( * ), $ IFAIL( * ), ISPLIT( * ), IWORK( * ) DOUBLE PRECISION D( * ), E( * ), GAP( * ), W( * ), WORK( * ), $ Z( * ) * .. * * Purpose * ======= * * PDSTEIN computes the eigenvectors of a symmetric tridiagonal matrix * in parallel, using inverse iteration. The eigenvectors found * correspond to user specified eigenvalues. PDSTEIN does not * orthogonalize vectors that are on different processes. The extent * of orthogonalization is controlled by the input parameter LWORK. * Eigenvectors that are to be orthogonalized are computed by the same * process. PDSTEIN decides on the allocation of work among the * processes and then calls DSTEIN2 (modified LAPACK routine) on each * individual process. If insufficient workspace is allocated, the * expected orthogonalization may not be done. * * Note : If the eigenvectors obtained are not orthogonal, increase * LWORK and run the code again. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * P = NPROW * NPCOL is the total number of processes * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * D (global input) DOUBLE PRECISION array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. * * E (global input) DOUBLE PRECISION array, dimension (N-1) * The (n-1) off-diagonal elements of the tridiagonal matrix T. * * M (global input) INTEGER * The total number of eigenvectors to be found. 0 <= M <= N. * * W (global input/global output) DOUBLE PRECISION array, dim (M) * On input, the first M elements of W contain all the * eigenvalues for which eigenvectors are to be computed. The * eigenvalues should be grouped by split-off block and ordered * from smallest to largest within the block (The output array * W from PDSTEBZ with ORDER='b' is expected here). This * array should be replicated on all processes. * On output, the first M elements contain the input * eigenvalues in ascending order. * * Note : To obtain orthogonal vectors, it is best if * eigenvalues are computed to highest accuracy ( this can be * done by setting ABSTOL to the underflow threshold = * DLAMCH('U') --- ABSTOL is an input parameter * to PDSTEBZ ) * * IBLOCK (global input) INTEGER array, dimension (N) * The submatrix indices associated with the corresponding * eigenvalues in W -- 1 for eigenvalues belonging to the * first submatrix from the top, 2 for those belonging to * the second submatrix, etc. (The output array IBLOCK * from PDSTEBZ is expected here). * * ISPLIT (global input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N (The output array * ISPLIT from PDSTEBZ is expected here.) * * ORFAC (global input) DOUBLE PRECISION * ORFAC specifies which eigenvectors should be orthogonalized. * Eigenvectors that correspond to eigenvalues which are within * ORFAC*||T|| of each other are to be orthogonalized. * However, if the workspace is insufficient (see LWORK), this * tolerance may be decreased until all eigenvectors to be * orthogonalized can be stored in one process. * No orthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) DOUBLE PRECISION array, * dimension (DESCZ(DLEN_), N/npcol + NB) * Z contains the computed eigenvectors associated with the * specified eigenvalues. Any vector which fails to converge is * set to its current iterate after MAXITS iterations ( See * DSTEIN2 ). * On output, Z is distributed across the P processes in block * cyclic format. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace/global output) DOUBLE PRECISION array, * dimension ( LWORK ) * On output, WORK(1) gives a lower bound on the * workspace ( LWORK ) that guarantees the user desired * orthogonalization (see ORFAC). * Note that this may overestimate the minimum workspace needed. * * LWORK (local input) integer * LWORK controls the extent of orthogonalization which can be * done. The number of eigenvectors for which storage is * allocated on each process is * NVEC = floor(( LWORK- max(5*N,NP00*MQ00) )/N). * Eigenvectors corresponding to eigenvalue clusters of size * NVEC - ceil(M/P) + 1 are guaranteed to be orthogonal ( the * orthogonality is similar to that obtained from DSTEIN2). * Note : LWORK must be no smaller than: * max(5*N,NP00*MQ00) + ceil(M/P)*N, * and should have the same input value on all processes. * It is the minimum value of LWORK input on different processes * that is significant. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/global output) INTEGER array, * dimension ( 3*N+P+1 ) * On return, IWORK(1) contains the amount of integer workspace * required. * On return, the IWORK(2) through IWORK(P+2) indicate * the eigenvectors computed by each process. Process I computes * eigenvectors indexed IWORK(I+2)+1 thru' IWORK(I+3). * * LIWORK (local input) INTEGER * Size of array IWORK. Must be >= 3*N + P + 1 * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (global output) integer array, dimension (M) * On normal exit, all elements of IFAIL are zero. * If one or more eigenvectors fail to converge after MAXITS * iterations (as in DSTEIN), then INFO > 0 is returned. * If mod(INFO,M+1)>0, then * for I=1 to mod(INFO,M+1), the eigenvector * corresponding to the eigenvalue W(IFAIL(I)) failed to * converge ( W refers to the array of eigenvalues on output ). * * ICLUSTR (global output) integer array, dimension (2*P) * This output array contains indices of eigenvectors * corresponding to a cluster of eigenvalues that could not be * orthogonalized due to insufficient workspace (see LWORK, * ORFAC and INFO). Eigenvectors corresponding to clusters of * eigenvalues indexed ICLUSTR(2*I-1) to ICLUSTR(2*I), I = 1 to * INFO/(M+1), could not be orthogonalized due to lack of * workspace. Hence the eigenvectors corresponding to these * clusters may not be orthogonal. ICLUSTR is a zero terminated * array --- ( ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0 ) * if and only if K is the number of clusters. * * GAP (global output) DOUBLE PRECISION array, dimension (P) * This output array contains the gap between eigenvalues whose * eigenvectors could not be orthogonalized. The INFO/M output * values in this array correspond to the INFO/(M+1) clusters * indicated by the array ICLUSTR. As a result, the dot product * between eigenvectors corresponding to the I^th cluster may be * as high as ( O(n)*macheps ) / GAP(I). * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * < 0 : if INFO = -I, the I-th argument had an illegal value * > 0 : if mod(INFO,M+1) = I, then I eigenvectors failed to * converge in MAXITS iterations. Their indices are * stored in the array IFAIL. * if INFO/(M+1) = I, then eigenvectors corresponding to * I clusters of eigenvalues could not be orthogonalized * due to insufficient workspace. The indices of the * clusters are stored in the array ICLUSTR. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DGEBR2D, DGEBS2D, $ DLASRT2, DSTEIN2, IGAMN2D, IGEBR2D, IGEBS2D, $ PCHK1MAT, PDLAEVSWP, PXERBLA * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, NEGONE, ODM1, FIVE, ODM3, ODM18 PARAMETER ( ZERO = 0.0D+0, NEGONE = -1.0D+0, $ ODM1 = 1.0D-1, FIVE = 5.0D+0, ODM3 = 1.0D-3, $ ODM18 = 1.0D-18 ) * .. * .. Local Scalars .. LOGICAL LQUERY, SORTED INTEGER B1, BN, BNDRY, CLSIZ, COL, I, IFIRST, IINFO, $ ILAST, IM, INDRW, ITMP, J, K, LGCLSIZ, LLWORK, $ LOAD, LOCINFO, MAXVEC, MQ00, MYCOL, MYROW, $ NBLK, NERR, NEXT, NP00, NPCOL, NPROW, NVS, $ OLNBLK, P, ROW, SELF, TILL, TOTERR DOUBLE PRECISION DIFF, MINGAP, ONENRM, ORGFAC, ORTOL, TMPFAC * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) SELF = MYROW*NPCOL + MYCOL * * Make sure that we belong to this context (before calling PCHK1MAT) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1200+CTXT_ ) ELSE * * Make sure that NPROW>0 and NPCOL>0 before calling NUMROC * CALL CHK1MAT( N, 1, N, 1, IZ, JZ, DESCZ, 12, INFO ) IF( INFO.EQ.0 ) THEN * * Now we know that our context is good enough to * perform the rest of the checks * NP00 = NUMROC( N, DESCZ( MB_ ), 0, 0, NPROW ) MQ00 = NUMROC( M, DESCZ( NB_ ), 0, 0, NPCOL ) P = NPROW*NPCOL * * Compute the maximum number of vectors per process * LLWORK = LWORK CALL IGAMN2D( DESCZ( CTXT_ ), 'A', ' ', 1, 1, LLWORK, 1, 1, $ 1, -1, -1, -1 ) INDRW = MAX( 5*N, NP00*MQ00 ) IF( N.NE.0 ) $ MAXVEC = ( LLWORK-INDRW ) / N LOAD = ICEIL( M, P ) IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN TMPFAC = ORFAC CALL DGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, TMPFAC, $ 1 ) ELSE CALL DGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, TMPFAC, $ 1, 0, 0 ) END IF * LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( M.LT.0 .OR. M.GT.N ) THEN INFO = -4 ELSE IF( MAXVEC.LT.LOAD .AND. .NOT.LQUERY ) THEN INFO = -14 ELSE IF( LIWORK.LT.3*N+P+1 .AND. .NOT.LQUERY ) THEN INFO = -16 ELSE DO 10 I = 2, M IF( IBLOCK( I ).LT.IBLOCK( I-1 ) ) THEN INFO = -6 GO TO 20 END IF IF( IBLOCK( I ).EQ.IBLOCK( I-1 ) .AND. W( I ).LT. $ W( I-1 ) ) THEN INFO = -5 GO TO 20 END IF 10 CONTINUE 20 CONTINUE IF( INFO.EQ.0 ) THEN IF( ABS( TMPFAC-ORFAC ).GT.FIVE*ABS( TMPFAC ) ) $ INFO = -8 END IF END IF * END IF IDUM1( 1 ) = M IDUM2( 1 ) = 4 CALL PCHK1MAT( N, 1, N, 1, IZ, JZ, DESCZ, 12, 1, IDUM1, IDUM2, $ INFO ) WORK( 1 ) = DBLE( MAX( 5*N, NP00*MQ00 )+ICEIL( M, P )*N ) IWORK( 1 ) = 3*N + P + 1 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCZ( CTXT_ ), 'PDSTEIN', -INFO ) RETURN ELSE IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) THEN RETURN END IF * DO 30 I = 1, M IFAIL( I ) = 0 30 CONTINUE DO 40 I = 1, P + 1 IWORK( I ) = 0 40 CONTINUE DO 50 I = 1, P GAP( I ) = NEGONE ICLUSTR( 2*I-1 ) = 0 ICLUSTR( 2*I ) = 0 50 CONTINUE * * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) $ RETURN * IF( ORFAC.GE.ZERO ) THEN TMPFAC = ORFAC ELSE TMPFAC = ODM3 END IF ORGFAC = TMPFAC * * Allocate the work among the processes * ILAST = M / LOAD IF( MOD( M, LOAD ).EQ.0 ) $ ILAST = ILAST - 1 OLNBLK = -1 NVS = 0 NEXT = 1 IM = 0 ONENRM = ZERO DO 100 I = 0, ILAST - 1 NEXT = NEXT + LOAD J = NEXT - 1 IF( J.GT.NVS ) THEN NBLK = IBLOCK( NEXT ) IF( NBLK.EQ.IBLOCK( NEXT-1 ) .AND. NBLK.NE.OLNBLK ) THEN * * Compute orthogonalization criterion * IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 60 J = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( J ) )+ABS( E( J-1 ) )+ $ ABS( E( J ) ) ) 60 CONTINUE OLNBLK = NBLK END IF TILL = NVS + MAXVEC 70 CONTINUE J = NEXT - 1 IF( TMPFAC.GT.ODM18 ) THEN ORTOL = TMPFAC*ONENRM DO 80 J = NEXT - 1, MIN( TILL, M-1 ) IF( IBLOCK( J+1 ).NE.IBLOCK( J ) .OR. W( J+1 )- $ W( J ).GE.ORTOL ) THEN GO TO 90 END IF 80 CONTINUE IF( J.EQ.M .AND. TILL.GE.M ) $ GO TO 90 TMPFAC = TMPFAC*ODM1 GO TO 70 END IF 90 CONTINUE J = MIN( J, TILL ) END IF IF( SELF.EQ.I ) $ IM = MAX( 0, J-NVS ) * IWORK( I+1 ) = NVS NVS = MAX( J, NVS ) 100 CONTINUE IF( SELF.EQ.ILAST ) $ IM = M - NVS IWORK( ILAST+1 ) = NVS DO 110 I = ILAST + 2, P + 1 IWORK( I ) = M 110 CONTINUE * CLSIZ = 1 LGCLSIZ = 1 ILAST = 0 NBLK = 0 BNDRY = 2 K = 1 DO 140 I = 1, M IF( IBLOCK( I ).NE.NBLK ) THEN NBLK = IBLOCK( I ) IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 120 J = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( J ) )+ABS( E( J-1 ) )+ $ ABS( E( J ) ) ) 120 CONTINUE * END IF IF( I.GT.1 ) THEN DIFF = W( I ) - W( I-1 ) IF( IBLOCK( I ).NE.IBLOCK( I-1 ) .OR. I.EQ.M .OR. DIFF.GT. $ ORGFAC*ONENRM ) THEN IFIRST = ILAST IF( I.EQ.M ) THEN IF( IBLOCK( M ).NE.IBLOCK( M-1 ) .OR. DIFF.GT.ORGFAC* $ ONENRM ) THEN ILAST = M - 1 ELSE ILAST = M END IF ELSE ILAST = I - 1 END IF CLSIZ = ILAST - IFIRST IF( CLSIZ.GT.1 ) THEN IF( LGCLSIZ.LT.CLSIZ ) $ LGCLSIZ = CLSIZ MINGAP = ONENRM 130 CONTINUE IF( BNDRY.GT.P+1 ) $ GO TO 150 IF( IWORK( BNDRY ).GT.IFIRST .AND. IWORK( BNDRY ).LT. $ ILAST ) THEN MINGAP = MIN( W( IWORK( BNDRY )+1 )- $ W( IWORK( BNDRY ) ), MINGAP ) ELSE IF( IWORK( BNDRY ).GE.ILAST ) THEN IF( MINGAP.LT.ONENRM ) THEN ICLUSTR( 2*K-1 ) = IFIRST + 1 ICLUSTR( 2*K ) = ILAST GAP( K ) = MINGAP / ONENRM K = K + 1 END IF GO TO 140 END IF BNDRY = BNDRY + 1 GO TO 130 END IF END IF END IF 140 CONTINUE 150 CONTINUE INFO = ( K-1 )*( M+1 ) * * Call DSTEIN2 to find the eigenvectors * CALL DSTEIN2( N, D, E, IM, W( IWORK( SELF+1 )+1 ), $ IBLOCK( IWORK( SELF+1 )+1 ), ISPLIT, ORGFAC, $ WORK( INDRW+1 ), N, WORK, IWORK( P+2 ), $ IFAIL( IWORK( SELF+1 )+1 ), LOCINFO ) * * Redistribute the eigenvector matrix to conform with the block * cyclic distribution of the input matrix * * DO 160 I = 1, M IWORK( P+1+I ) = I 160 CONTINUE * CALL DLASRT2( 'I', M, W, IWORK( P+2 ), IINFO ) * DO 170 I = 1, M IWORK( M+P+1+IWORK( P+1+I ) ) = I 170 CONTINUE * * DO 180 I = 1, LOCINFO ITMP = IWORK( SELF+1 ) + I IFAIL( ITMP ) = IFAIL( ITMP ) + ITMP - I IFAIL( ITMP ) = IWORK( M+P+1+IFAIL( ITMP ) ) 180 CONTINUE * DO 190 I = 1, K - 1 ICLUSTR( 2*I-1 ) = IWORK( M+P+1+ICLUSTR( 2*I-1 ) ) ICLUSTR( 2*I ) = IWORK( M+P+1+ICLUSTR( 2*I ) ) 190 CONTINUE * * * Still need to apply the above permutation to IFAIL * * TOTERR = 0 DO 210 I = 1, P IF( SELF.EQ.I-1 ) THEN CALL IGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, LOCINFO, 1 ) IF( LOCINFO.NE.0 ) THEN CALL IGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', LOCINFO, 1, $ IFAIL( IWORK( I )+1 ), LOCINFO ) DO 200 J = 1, LOCINFO IFAIL( TOTERR+J ) = IFAIL( IWORK( I )+J ) 200 CONTINUE TOTERR = TOTERR + LOCINFO END IF ELSE * ROW = ( I-1 ) / NPCOL COL = MOD( I-1, NPCOL ) * CALL IGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, NERR, 1, $ ROW, COL ) IF( NERR.NE.0 ) THEN CALL IGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', NERR, 1, $ IFAIL( TOTERR+1 ), NERR, ROW, COL ) TOTERR = TOTERR + NERR END IF END IF 210 CONTINUE INFO = INFO + TOTERR * * CALL PDLAEVSWP( N, WORK( INDRW+1 ), N, Z, IZ, JZ, DESCZ, IWORK, $ IWORK( M+P+2 ), WORK, INDRW ) * DO 220 I = 2, P IWORK( I ) = IWORK( M+P+1+IWORK( I ) ) 220 CONTINUE * * * Sort the IWORK array * * 230 CONTINUE SORTED = .TRUE. DO 240 I = 2, P - 1 IF( IWORK( I ).GT.IWORK( I+1 ) ) THEN ITMP = IWORK( I+1 ) IWORK( I+1 ) = IWORK( I ) IWORK( I ) = ITMP SORTED = .FALSE. END IF 240 CONTINUE IF( .NOT.SORTED ) $ GO TO 230 * DO 250 I = P + 1, 1, -1 IWORK( I+1 ) = IWORK( I ) 250 CONTINUE * WORK( 1 ) = ( LGCLSIZ+LOAD-1 )*N + INDRW IWORK( 1 ) = 3*N + P + 1 * * End of PDSTEIN * END scalapack-2.0.2/SRC/pdsyev.f000644 000766 000024 00000052623 11640165744 016041 0ustar00juliestaff000000 000000 SUBROUTINE PDSYEV( JOBZ, UPLO, N, A, IA, JA, DESCA, W, $ Z, IZ, JZ, DESCZ, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER JOBZ, UPLO INTEGER IA, INFO, IZ, JA, JZ, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) DOUBLE PRECISION A( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDSYEV computes all eigenvalues and, optionally, eigenvectors * of a real symmetric matrix A by calling the recommended sequence * of ScaLAPACK routines. * * In its present form, PDSYEV assumes a homogeneous system and makes * no checks for consistency of the eigenvalues or eigenvectors across * the different processes. Because of this, it is possible that a * heterogeneous system may return incorrect results without any error * messages. * * Notes * ===== * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distributed * matrix A. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of A. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of A. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of the * distributed matrix A. * LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic DOUBLE PRECISION array, * global dimension (N, N), local dimension ( LLD_A, * LOCc(JA+N-1) ) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PDSYEV cannot guarantee * correct error reporting. * * W (global output) DOUBLE PRECISION array, dimension (N) * If INFO=0, the eigenvalues in ascending order. * * Z (local output) DOUBLE PRECISION array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension (LWORK) * Version 1.0: on output, WORK(1) returns the workspace * needed to guarantee completion. * If the input parameters are incorrect, WORK(1) may also be * incorrect. * * If JOBZ='N' WORK(1) = minimal=optimal amount of workspace * If JOBZ='V' WORK(1) = minimal workspace required to * generate all the eigenvectors. * * * LWORK (local input) INTEGER * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= 5*N + SIZESYTRD + 1 * where * SIZESYTRD = The workspace requirement for PDSYTRD * and is MAX( NB * ( NP +1 ), 3 * NB ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * * QRMEM = 2*N-2 * LWMIN = 5*N + N*LDC + MAX( SIZEMQRLEFT, QRMEM ) + 1 * * Variable definitions: * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( RSRC_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP = NUMROC( NN, NB, 0, 0, NPROW ) * NQ = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) * NRC = NUMROC( N, NB, MYPROWC, 0, NPROCS) * LDC = MAX( 1, NRC ) * SIZEMQRLEFT = The workspace requirement for PDORMTR * when it's SIDE argument is 'L'. * * With MYPROWC defined when a new context is created as: * CALL BLACS_GET( DESCA( CTXT_ ), 0, CONTEXTC ) * CALL BLACS_GRIDINIT( CONTEXTC, 'R', NPROCS, 1 ) * CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, * MYPCOLC ) * * If LWORK = -1, the LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the WORK array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = 1 through N, the i(th) eigenvalue did not * converge in DSTEQR2 after a total of 30*N iterations. * If INFO = N+1, then PDSYEV has detected heterogeneity * by finding that eigenvalues were not identical across * the process grid. In this case, the accuracy of * the results from PDSYEV cannot be guaranteed. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and Z(IZ:IZ+M-1,JZ:JZ+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * ( MB_A.EQ.NB_A.EQ.MB_Z .AND. IROFFA.EQ.IROFFZ .AND. IROFFA.EQ.0 .AND. * IAROW.EQ.IZROW ) * where * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * Version 1.4 limitations: * DESCA(MB_) = DESCA(NB_) * DESCA(M_) = DESCZ(M_) * DESCA(N_) = DESCZ(N_) * DESCA(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION FIVE, ONE, TEN, ZERO PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, $ TEN = 10.0D+0, FIVE = 5.0D+0 ) INTEGER IERREIN, IERRCLS, IERRSPC, IERREBZ, ITHVAL PARAMETER ( IERREIN = 1, IERRCLS = 2, IERRSPC = 4, $ IERREBZ = 8, ITHVAL = 10 ) * .. * .. Local Scalars .. LOGICAL LOWER, WANTZ INTEGER CONTEXTC, CSRC_A, I, IACOL, IAROW, ICOFFA, $ IINFO, INDD, INDD2, INDE, INDE2, INDTAU, $ INDWORK, INDWORK2, IROFFA, IROFFZ, ISCALE, $ IZROW, J, K, LDC, LLWORK, LWMIN, MB_A, MB_Z, $ MYCOL, MYPCOLC, MYPROWC, MYROW, NB, NB_A, NB_Z, $ NP, NPCOL, NPCOLC, NPROCS, NPROW, NPROWC, NQ, $ NRC, QRMEM, RSRC_A, RSRC_Z, SIZEMQRLEFT, $ SIZESYTRD DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, $ SMLNUM * .. * .. Local Arrays .. INTEGER DESCQR( 9 ), IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC, SL_GRIDRESHAPE DOUBLE PRECISION PDLAMCH, PDLANSY EXTERNAL LSAME, NUMROC, PDLAMCH, PDLANSY, $ SL_GRIDRESHAPE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, CHK1MAT, DCOPY, $ DESCINIT, DSCAL, DSTEQR2, PCHK1MAT, PCHK2MAT, $ PDELGET, PDGEMR2D, PDLASCL, PDLASET, PDORMTR, $ PDSYTRD, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, MAX, MIN, MOD, SQRT, INT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return * IF( N.EQ.0 ) RETURN * * Test the input arguments. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 * WANTZ = LSAME( JOBZ, 'V' ) IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE IF( WANTZ ) THEN IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) END IF END IF IF( INFO .EQ. 0 ) THEN CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 3, N, 3, IZ, JZ, DESCZ, 12, INFO ) * IF( INFO.EQ.0 ) THEN * * Get machine constants. * SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe minimum' ) EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * NPROCS = NPROW*NPCOL NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A LOWER = LSAME( UPLO, 'L' ) * RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( 1, NB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, MB_A, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) IF( WANTZ ) THEN NB_Z = DESCZ( NB_ ) MB_Z = DESCZ( MB_ ) RSRC_Z = DESCZ( RSRC_ ) IROFFZ = MOD( IZ-1, MB_A ) IZROW = INDXG2P( 1, NB_A, MYROW, RSRC_Z, NPROW ) SIZEMQRLEFT = MAX( ( NB_A*( NB_A-1 ) ) / 2, ( NP+NQ )* $ NB_A ) + NB_A*NB_A ELSE SIZEMQRLEFT = 0 IROFFZ = 0 IZROW = 0 END IF SIZESYTRD = MAX( NB * ( NP +1 ), 3 * NB ) * * Initialize the context of the single column distributed * matrix required by DSTEQR2. This specific distribution * allows each process to do 1/pth of the work updating matrix * Q during DSTEQR2 and achieve some parallelization to an * otherwise serial subroutine. * LDC = 0 IF( WANTZ ) THEN CONTEXTC = SL_GRIDRESHAPE( DESCA( CTXT_ ), 0, 1, 1, $ NPROCS, 1 ) CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, $ MYPCOLC ) NRC = NUMROC( N, NB_A, MYPROWC, 0, NPROCS) LDC = MAX( 1, NRC ) CALL DESCINIT( DESCQR, N, N, NB, NB, 0, 0, CONTEXTC, $ LDC, INFO ) END IF * * Set up pointers into the WORK array * INDTAU = 1 INDE = INDTAU + N INDD = INDE + N INDD2 = INDD + N INDE2 = INDD2 + N INDWORK = INDE2 + N INDWORK2 = INDWORK + N*LDC LLWORK = LWORK - INDWORK + 1 * * Compute the total amount of space needed * QRMEM = 2*N-2 IF( WANTZ ) THEN LWMIN = 5*N + N*LDC + MAX( SIZEMQRLEFT, QRMEM ) + 1 ELSE LWMIN = 5*N + SIZESYTRD + 1 END IF * END IF IF( INFO.EQ.0 ) THEN IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -14 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( WANTZ ) THEN IF( IROFFA.NE.IROFFZ ) THEN INFO = -10 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -10 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 1200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 1200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 1200+RSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) ENDIF END IF END IF IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 3 IF( LSAME( JOBZ, 'V' ) ) THEN CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, $ IZ, JZ, DESCZ, 12, 3, IDUM1, IDUM2, INFO ) ELSE CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 3, IDUM1, $ IDUM2, INFO ) END IF * * Write the required workspace for lwork queries. * WORK( 1 ) = DBLE( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PDSYEV', -INFO ) IF( WANTZ ) CALL BLACS_GRIDEXIT( CONTEXTC ) RETURN ELSE IF( LWORK .EQ. -1 ) THEN IF( WANTZ ) CALL BLACS_GRIDEXIT( CONTEXTC ) RETURN END IF * * Scale matrix to allowable range, if necessary. * ISCALE = 0 * ANRM = PDLANSY( 'M', UPLO, N, A, IA, JA, DESCA, WORK( INDWORK ) ) * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM END IF * IF( ISCALE.EQ.1 ) THEN CALL PDLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) END IF * * Reduce symmetric matrix to tridiagonal form. * CALL PDSYTRD( UPLO, N, A, IA, JA, DESCA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * Copy the values of D, E to all processes. * DO 10 I=1,N CALL PDELGET( 'A', ' ', WORK(INDD2+I-1), A, $ I+IA-1, I+JA-1, DESCA ) 10 CONTINUE IF( LSAME( UPLO, 'U') ) THEN DO 20 I=1,N-1 CALL PDELGET( 'A', ' ', WORK(INDE2+I-1), A, $ I+IA-1, I+JA, DESCA ) 20 CONTINUE ELSE DO 30 I=1,N-1 CALL PDELGET( 'A', ' ', WORK(INDE2+I-1), A, $ I+IA, I+JA-1, DESCA ) 30 CONTINUE ENDIF * IF( WANTZ ) THEN * CALL PDLASET( 'Full', N, N, ZERO, ONE, WORK( INDWORK ), 1, 1, $ DESCQR ) * * DSTEQR2 is a modified version of LAPACK's DSTEQR. The * modifications allow each process to perform partial updates * to matrix Q. * CALL DSTEQR2( 'I', N, WORK( INDD2 ), WORK( INDE2 ), $ WORK( INDWORK ), LDC, NRC, WORK( INDWORK2 ), $ INFO ) * CALL PDGEMR2D( N, N, WORK( INDWORK ), 1, 1, DESCQR, Z, IA, JA, $ DESCZ, CONTEXTC ) * CALL PDORMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) * ELSE * CALL DSTEQR2( 'N', N, WORK( INDD2 ), WORK( INDE2 ), $ WORK( INDWORK ), 1, 1, WORK( INDWORK2 ), $ INFO ) ENDIF * * Copy eigenvalues from workspace to output array * CALL DCOPY( N, WORK( INDD2 ), 1, W, 1 ) * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE .EQ. 1 ) THEN CALL DSCAL( N, ONE / SIGMA, W, 1 ) END IF * * Free up resources * IF( WANTZ ) THEN CALL BLACS_GRIDEXIT( CONTEXTC ) END IF * * Compare every ith eigenvalue, or all if there are only a few, * across the process grid to check for heterogeneity. * IF( N.LE.ITHVAL ) THEN J = N K = 1 ELSE J = N/ITHVAL K = ITHVAL END IF * DO 40 I = 1, J WORK( I+INDTAU ) = W( (I-1)*K+1 ) WORK( I+INDE ) = W( (I-1)*K+1 ) 40 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'a', ' ', J, 1, WORK( 1+INDTAU ), $ J, 1, 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'a', ' ', J, 1, WORK( 1+INDE ), $ J, 1, 1, -1, -1, 0 ) * DO 50 I = 1, J IF( INFO.EQ.0 .AND. ( WORK( I+INDTAU )-WORK( I+INDE ) $ .NE. ZERO ) )THEN INFO = N+1 END IF 50 CONTINUE * RETURN * * End of PDSYEV * END scalapack-2.0.2/SRC/pdsyevd.f000644 000766 000024 00000031305 10363532303 016165 0ustar00juliestaff000000 000000 SUBROUTINE PDSYEVD( JOBZ, UPLO, N, A, IA, JA, DESCA, W, Z, IZ, JZ, $ DESCZ, WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 14, 2000 * * .. Scalar Arguments .. CHARACTER JOBZ, UPLO INTEGER IA, INFO, IZ, JA, JZ, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) DOUBLE PRECISION A( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDSYEVD computes all the eigenvalues and eigenvectors * of a real symmetric matrix A by calling the recommended sequence * of ScaLAPACK routines. * * In its present form, PDSYEVD assumes a homogeneous system and makes * no checks for consistency of the eigenvalues or eigenvectors across * the different processes. Because of this, it is possible that a * heterogeneous system may return incorrect results without any error * messages. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; (NOT IMPLEMENTED YET) * = 'V': Compute eigenvalues and eigenvectors. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/workspace) block cyclic DOUBLE PRECISION array, * global dimension (N, N), local dimension ( LLD_A, * LOCc(JA+N-1) ) * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * W (global output) DOUBLE PRECISION array, dimension (N) * If INFO=0, the eigenvalues in ascending order. * * Z (local output) DOUBLE PRECISION array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * Z contains the orthonormal eigenvectors * of the symmetric matrix A. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension (LWORK) * On output, WORK(1) returns the workspace required. * * LWORK (local input) INTEGER * LWORK >= MAX( 1+6*N+2*NP*NQ, TRILWMIN ) + 2*N * TRILWMIN = 3*N + MAX( NB*( NP+1 ), 3*NB ) * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) * NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * * If LWORK = -1, the LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the WORK array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * On exit, if LIWORK > 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. * LIWORK = 7*N + 8*NPCOL + 2 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: The algorithm failed to compute the INFO/(N+1) th * eigenvalue while working on the submatrix lying in * global rows and columns mod(INFO,N+1). * * Alignment requirements * ====================== * * The distributed submatrices sub( A ), sub( Z ) must verify * some alignment properties, namely the following expression * should be true: * ( MB_A.EQ.NB_A.EQ.MB_Z.EQ.NB_Z .AND. IROFFA.EQ.ICOFFA .AND. * IROFFA.EQ.0 .AND.IROFFA.EQ.IROFFZ. AND. IAROW.EQ.IZROW) * with IROFFA = MOD( IA-1, MB_A ) * and ICOFFA = MOD( JA-1, NB_A ). * * Further Details * ======= ======= * * Contributed by Francoise Tisseur, University of Manchester. * * Reference: F. Tisseur and J. Dongarra, "A Parallel Divide and * Conquer Algorithm for the Symmetric Eigenvalue Problem * on Distributed Memory Architectures", * SIAM J. Sci. Comput., 6:20 (1999), pp. 2223--2236. * (see also LAPACK Working Note 132) * http://www.netlib.org/lapack/lawns/lawn132.ps * * ===================================================================== * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER IACOL, IAROW, ICOFFA, ICOFFZ, ICTXT, IINFO, $ INDD, INDE, INDE2, INDTAU, INDWORK, INDWORK2, $ IROFFA, IROFFZ, ISCALE, LIWMIN, LLWORK, $ LLWORK2, LWMIN, MYCOL, MYROW, NB, NP, NPCOL, $ NPROW, NQ, OFFSET, TRILWMIN DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, $ SMLNUM * .. * .. Local Arrays .. * .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PDLANSY EXTERNAL LSAME, INDXG2P, NUMROC, PDLAMCH, PDLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DSCAL, PCHK1MAT, $ PDLARED1D, PDLASCL, PDLASET, PDORMTR, PDSTEDC, $ PDSYTRD, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return * IF( N.EQ.0 ) $ RETURN * * Test the input arguments. * CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IZ, JZ, DESCZ, 12, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFZ = MOD( IZ-1, DESCZ( MB_ ) ) ICOFFZ = MOD( JZ-1, DESCZ( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * LQUERY = ( LWORK.EQ.-1 ) TRILWMIN = 3*N + MAX( NB*( NP+1 ), 3*NB ) LWMIN = MAX( 1+6*N+2*NP*NQ, TRILWMIN ) + 2*N LIWMIN = 7*N + 8*NPCOL + 2 WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN IF( .NOT.LSAME( JOBZ, 'V' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( IROFFA.NE.IROFFZ .OR. ICOFFA.NE.ICOFFZ ) THEN INFO = -10 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 1200+M_ ) ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1200+NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1200+MB_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 1200+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 1200+CSRC_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -14 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 2 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 14 CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYEVD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Set up pointers into the WORK array * INDTAU = 1 INDE = INDTAU + N INDD = INDE + N INDE2 = INDD + N INDWORK = INDE2 + N LLWORK = LWORK - INDWORK + 1 INDWORK2 = INDD LLWORK2 = LWORK - INDWORK2 + 1 * * Scale matrix to allowable range, if necessary. * ISCALE = 0 SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe minimum' ) EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) ANRM = PDLANSY( 'M', UPLO, N, A, IA, JA, DESCA, WORK( INDWORK ) ) * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM END IF * IF( ISCALE.EQ.1 ) THEN CALL PDLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) END IF * * Reduce symmetric matrix to tridiagonal form. * * CALL PDSYTRD( UPLO, N, A, IA, JA, DESCA, WORK( INDD ), $ WORK( INDE2 ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * Copy the values of D, E to all processes. * CALL PDLARED1D( N, IA, JA, DESCA, WORK( INDD ), W, $ WORK( INDWORK ), LLWORK ) * CALL PDLARED1D( N, IA, JA, DESCA, WORK( INDE2 ), WORK( INDE ), $ WORK( INDWORK ), LLWORK ) * CALL PDLASET( 'Full', N, N, ZERO, ONE, Z, 1, 1, DESCZ ) * IF( UPPER ) THEN OFFSET = 1 ELSE OFFSET = 0 END IF CALL PDSTEDC( 'I', N, W, WORK( INDE+OFFSET ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK2 ), LLWORK2, IWORK, LIWORK, INFO ) * CALL PDORMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, WORK( INDWORK2 ), $ LLWORK2, IINFO ) * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL DSCAL( N, ONE / SIGMA, W, 1 ) END IF * RETURN * * End of PDSYEVD * END scalapack-2.0.2/SRC/pdsyevr.f000644 000766 000024 00000127766 11750130340 016220 0ustar00juliestaff000000 000000 SUBROUTINE PDSYEVR( JOBZ, RANGE, UPLO, N, A, IA, JA, $ DESCA, VL, VU, IL, IU, M, NZ, W, Z, IZ, $ JZ, DESCZ, WORK, LWORK, IWORK, LIWORK, $ INFO ) IMPLICIT NONE * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, INFO, IU, IZ, JA, JZ, LIWORK, LWORK, M, $ N, NZ DOUBLE PRECISION VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) DOUBLE PRECISION A( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDSYEVR computes selected eigenvalues and, optionally, eigenvectors * of a real symmetric matrix A distributed in 2D blockcyclic format * by calling the recommended sequence of ScaLAPACK routines. * * First, the matrix A is reduced to real symmetric tridiagonal form. * Then, the eigenproblem is solved using the parallel MRRR algorithm. * Last, if eigenvectors have been computed, a backtransformation is done. * * Upon successful completion, each processor stores a copy of all computed * eigenvalues in W. The eigenvector matrix Z is stored in * 2D blockcyclic format distributed over all processors. * * Note that subsets of eigenvalues/vectors can be selected by * specifying a range of values or a range of indices for the desired * eigenvalues. * * For constructive feedback and comments, please contact cvoemel@lbl.gov * C. Voemel * * Arguments * ========= * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0 * * A (local input/workspace) 2D block cyclic DOUBLE PRECISION array, * global dimension (N, N), * local dimension ( LLD_A, LOCc(JA+N-1) ), * (see Notes below for more detailed explanation of 2d arrays) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * DESCA (global and local input) INTEGER array of dimension DLEN=9. * The array descriptor for the distributed matrix A. * The descriptor stores details about the 2D block-cyclic * storage, see the notes below. * If DESCA is incorrect, PDSYEVR cannot guarantee * correct error reporting. * Also note the array alignment requirements specified below. * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A'. * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M * * W (global output) DOUBLE PRECISION array, dimension (N) * Upon successful exit, the first M entries contain the selected * eigenvalues in ascending order. * * Z (local output) DOUBLE PRECISION array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * (see Notes below for more detailed explanation of 2d arrays) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * The context DESCZ( CTXT_ ) must equal DESCA( CTXT_ ). * Also note the array alignment requirements specified below. * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension (LWORK) * On return, WORK(1) contains the optimal amount of * workspace required for efficient execution. * if JOBZ='N' WORK(1) = optimal amount of workspace * required to compute the eigenvalues. * if JOBZ='V' WORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors. * * LWORK (local input) INTEGER * Size of WORK, must be at least 3. * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= 2 + 5*N + MAX( 12 * NN, NB * ( NP0 + 1 ) ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required is: * LWORK >= 2 + 5*N + MAX( 18*NN, NP0 * MQ0 + 2 * NB * NB ) + * (2 + ICEIL( NEIG, NPROW*NPCOL))*NN * * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance for all work arrays. Each of * these values is returned in the first entry of the * corresponding work arrays, and no error message is issued by * PXERBLA. * Note that in a workspace query, for performance the optimal * workspace LWOPT is returned rather than the minimum necessary * WORKSPACE LWMIN. For very small matrices, LWOPT >> LWMIN. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * * Let NNP = MAX( N, NPROW*NPCOL + 1, 4 ). Then: * LIWORK >= 12*NNP + 2*N when the eigenvectors are desired * LIWORK >= 10*NNP + 2*N when only the eigenvalues have to be computed * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA, * or DESCZ for the descriptor of Z, etc. * The length of a ScaLAPACK descriptor is nine. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * PDSYEVR assumes IEEE 754 standard compliant arithmetic. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and Z(IZ:IZ+M-1,JZ:JZ+N-1) * must satisfy the following alignment properties: * * 1.Identical (quadratic) dimension: * DESCA(M_) = DESCZ(M_) = DESCA(N_) = DESCZ(N_) * 2.Quadratic conformal blocking: * DESCA(MB_) = DESCA(NB_) = DESCZ(MB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * 3.MOD( IA-1, MB_A ) = MOD( IZ-1, MB_Z ) = 0 * 4.IAROW = IZROW * * * .. Parameters .. INTEGER CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_ PARAMETER ( CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, COLBRT, DOBCST, FINISH, FIRST, INDEIG, $ LOWER, LQUERY, VALEIG, VSTART, WANTZ INTEGER ANB, DOL, DOU, DSTCOL, DSTROW, EIGCNT, FRSTCL, $ I, IAROW, ICTXT, IIL, IINDERR, IINDWLC, IINFO, $ IIU, IM, INDD, INDD2, INDE, INDE2, INDERR, $ INDILU, INDRW, INDTAU, INDWLC, INDWORK, IPIL, $ IPIU, IPROC, IZROW, LASTCL, LENGTHI, LENGTHI2, $ LIWMIN, LLWORK, LWMIN, LWOPT, MAXCLS, MQ00, $ MYCOL, MYIL, MYIU, MYPROC, MYROW, MZ, NB, $ NDEPTH, NEEDIL, NEEDIU, NNP, NP00, NPCOL, $ NPROCS, NPROW, NPS, NSPLIT, NSYTRD_LWOPT, $ OFFSET, PARITY, RLENGTHI, RLENGTHI2, RSTARTI, $ SIZE1, SIZE2, SQNPC, SRCCOL, SRCROW, STARTI, $ ZOFFSET DOUBLE PRECISION PIVMIN, SAFMIN, SCALE, VLL, VUU, WL, $ WU * * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH, $ PJLAENV * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DCOPY, DGEBR2D, $ DGEBS2D, DGERV2D, DGESD2D, DLARRC, DLASRT2, $ DSTEGR2A, DSTEGR2B, DSTEGR2, IGEBR2D, $ IGEBS2D, IGERV2D, IGESD2D, IGSUM2D, PCHK1MAT, $ PCHK2MAT, PDELGET, PDLAEVSWP, PDLARED1D, $ PDORMTR, PDSYNTRD, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, INT, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * INFO = 0 *********************************************************************** * * Decode character arguments to find out what the code should do * *********************************************************************** WANTZ = LSAME( JOBZ, 'V' ) LOWER = LSAME( UPLO, 'L' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) *********************************************************************** * * GET MACHINE PARAMETERS * *********************************************************************** ICTXT = DESCA( CTXT_ ) SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) *********************************************************************** * * Set up pointers into the WORK array * *********************************************************************** INDTAU = 1 INDD = INDTAU + N INDE = INDD + N + 1 INDD2 = INDE + N + 1 INDE2 = INDD2 + N INDWORK = INDE2 + N LLWORK = LWORK - INDWORK + 1 *********************************************************************** * * BLACS PROCESSOR GRID SETUP * *********************************************************************** CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW * NPCOL MYPROC = MYROW * NPCOL + MYCOL IF( NPROW.EQ.-1 ) THEN INFO = -( 800+CTXT_ ) ELSE IF( WANTZ ) THEN IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF *********************************************************************** * * COMPUTE REAL WORKSPACE * *********************************************************************** IF ( ALLEIG ) THEN MZ = N ELSE IF ( INDEIG ) THEN MZ = IU - IL + 1 ELSE * Take upper bound for VALEIG case MZ = N END IF * NB = DESCA( NB_ ) IF ( WANTZ ) THEN NP00 = NUMROC( N, NB, 0, 0, NPROW ) MQ00 = NUMROC( MZ, NB, 0, 0, NPCOL ) INDRW = INDWORK + MAX(18*N, NP00*MQ00 + 2*NB*NB) LWMIN = INDRW - 1 + (ICEIL(MZ, NPROCS) + 2)*N ELSE INDRW = INDWORK + 12*N LWMIN = INDRW - 1 END IF * The code that validates the input requires 3 workspace entries LWMIN = MAX(3, LWMIN) LWOPT = LWMIN ANB = PJLAENV( ICTXT, 3, 'PDSYTTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROCS ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NSYTRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS LWOPT = MAX( LWOPT, 5*N+NSYTRD_LWOPT ) * SIZE1 = INDRW - INDWORK *********************************************************************** * * COMPUTE INTEGER WORKSPACE * *********************************************************************** NNP = MAX( N, NPROCS+1, 4 ) IF ( WANTZ ) THEN LIWMIN = 12*NNP + 2*N ELSE LIWMIN = 10*NNP + 2*N END IF *********************************************************************** * * Set up pointers into the IWORK array * *********************************************************************** * Pointer to eigenpair distribution over processors INDILU = LIWMIN - 2*NPROCS + 1 SIZE2 = INDILU - 2*N *********************************************************************** * * Test the input arguments. * *********************************************************************** IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 21, INFO ) * IF( INFO.EQ.0 ) THEN IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -3 ELSE IF( MOD( IA-1, DESCA( MB_ ) ).NE.0 ) THEN INFO = -6 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -10 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -11 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -12 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -21 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 800+NB_ ) END IF IF( WANTZ ) THEN IAROW = INDXG2P( 1, DESCA( NB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) IZROW = INDXG2P( 1, DESCA( NB_ ), MYROW, $ DESCZ( RSRC_ ), NPROW ) IF( IAROW.NE.IZROW ) THEN INFO = -19 ELSE IF( MOD( IA-1, DESCA( MB_ ) ).NE. $ MOD( IZ-1, DESCZ( MB_ ) ) ) THEN INFO = -19 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2100+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2100+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2100+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2100+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2100+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2100+CSRC_ ) ELSE IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( ALLEIG ) THEN IDUM1( 3 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 3 ) = ICHAR( 'I' ) ELSE IDUM1( 3 ) = ICHAR( 'V' ) END IF IDUM2( 3 ) = 3 IF( LQUERY ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 4 IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 8, N, 4, N, 4, IZ, $ JZ, DESCZ, 21, 4, IDUM1, IDUM2, INFO ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 4, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = DBLE( LWOPT ) IWORK( 1 ) = LIWMIN END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYEVR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF *********************************************************************** * * Quick return if possible * *********************************************************************** IF( N.EQ.0 ) THEN IF( WANTZ ) THEN NZ = 0 END IF M = 0 WORK( 1 ) = DBLE( LWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF IF( VALEIG ) THEN VLL = VL VUU = VU ELSE VLL = ZERO VUU = ZERO END IF * * No scaling done here, leave this to MRRR kernel. * Scale tridiagonal rather than full matrix. * *********************************************************************** * * REDUCE SYMMETRIC MATRIX TO TRIDIAGONAL FORM. * *********************************************************************** CALL PDSYNTRD( UPLO, N, A, IA, JA, DESCA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'PDSYNTRD', -IINFO ) RETURN END IF *********************************************************************** * * DISTRIBUTE TRIDIAGONAL TO ALL PROCESSORS * *********************************************************************** OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. $ DESCA( RSRC_ ).EQ.0 .AND. DESCA( CSRC_ ).EQ.0 ) $ THEN CALL PDLARED1D( N, IA, JA, DESCA, WORK( INDD ), WORK( INDD2 ), $ WORK( INDWORK ), LLWORK ) * CALL PDLARED1D( N, IA, JA, DESCA, WORK( INDE ), WORK( INDE2 ), $ WORK( INDWORK ), LLWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PDELGET( 'A', ' ', WORK( INDD2+I-1 ), A, I+IA-1, $ I+JA-1, DESCA ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PDELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA-1, $ I+JA, DESCA ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PDELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA, $ I+JA-1, DESCA ) 30 CONTINUE END IF END IF *********************************************************************** * * SET IIL, IIU * *********************************************************************** IF ( ALLEIG ) THEN IIL = 1 IIU = N ELSE IF ( INDEIG ) THEN IIL = IL IIU = IU ELSE IF ( VALEIG ) THEN CALL DLARRC('T', N, VLL, VUU, WORK( INDD2 ), $ WORK( INDE2 + OFFSET ), SAFMIN, EIGCNT, IIL, IIU, INFO) * Refine upper bound N that was taken MZ = EIGCNT IIL = IIL + 1 ENDIF IF(MZ.EQ.0) THEN M = 0 IF( WANTZ ) THEN NZ = 0 END IF WORK( 1 ) = DBLE( LWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF MYIL = 0 MYIU = 0 M = 0 IM = 0 *********************************************************************** * * COMPUTE WORK ASSIGNMENTS * *********************************************************************** * * Each processor computes the work assignments for all processors * CALL PMPIM2( IIL, IIU, NPROCS, $ IWORK(INDILU), IWORK(INDILU+NPROCS) ) * * Find local work assignment * MYIL = IWORK(INDILU+MYPROC) MYIU = IWORK(INDILU+NPROCS+MYPROC) ZOFFSET = MAX(0, MYIL - IIL - 1) FIRST = ( MYIL .EQ. IIL ) *********************************************************************** * * CALLS TO MRRR KERNEL * *********************************************************************** IF(.NOT.WANTZ) THEN * * Compute eigenvalues only. * IINFO = 0 IF ( MYIL.GT.0 ) THEN DOL = 1 DOU = MYIU - MYIL + 1 CALL DSTEGR2( JOBZ, 'I', N, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), VLL, VUU, MYIL, MYIU, $ IM, W( 1 ), WORK( INDRW ), N, $ MYIU - MYIL + 1, $ IWORK( 1 ), WORK( INDWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, $ DOL, DOU, ZOFFSET, IINFO ) * DSTEGR2 zeroes out the entire W array, so we can't just give * it the part of W we need. So here we copy the W entries into * their correct location DO 49 I = 1, IM W( MYIL-IIL+I ) = W( I ) 49 CONTINUE * W( MYIL ) is at W( MYIL - IIL + 1 ) * W( X ) is at W(X - IIL + 1 ) END IF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'DSTEGR2', -IINFO ) RETURN END IF ELSEIF ( WANTZ .AND. NPROCS.EQ.1 ) THEN * * Compute eigenvalues and -vectors, but only on one processor * IINFO = 0 IF ( MYIL.GT.0 ) THEN DOL = MYIL - IIL + 1 DOU = MYIU - IIL + 1 CALL DSTEGR2( JOBZ, 'I', N, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), VLL, VUU, IIL, IIU, $ IM, W( 1 ), WORK( INDRW ), N, $ N, $ IWORK( 1 ), WORK( INDWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, DOU, $ ZOFFSET, IINFO ) ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'DSTEGR2', -IINFO ) RETURN END IF ELSEIF ( WANTZ ) THEN * * Compute representations in parallel. * Share eigenvalue computation for root between all processors * Then compute the eigenvectors. * IINFO = 0 * Part 1. compute root representations and root eigenvalues IF ( MYIL.GT.0 ) THEN DOL = MYIL - IIL + 1 DOU = MYIU - IIL + 1 CALL DSTEGR2A( JOBZ, 'I', N, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), VLL, VUU, IIL, IIU, $ IM, W( 1 ), WORK( INDRW ), N, $ N, WORK( INDWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, $ DOU, NEEDIL, NEEDIU, $ INDERR, NSPLIT, PIVMIN, SCALE, WL, WU, $ IINFO ) ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'DSTEGR2A', -IINFO ) RETURN END IF * * The second part of parallel MRRR, the representation tree * construction begins. Upon successful completion, the * eigenvectors have been computed. This is indicated by * the flag FINISH. * VSTART = .TRUE. FINISH = (MYIL.LE.0) C Part 2. Share eigenvalues and uncertainties between all processors IINDERR = INDWORK + INDERR - 1 * * * There are currently two ways to communicate eigenvalue information * using the BLACS. * 1.) BROADCAST * 2.) POINT2POINT between collaborators (those processors working * jointly on a cluster. * For efficiency, BROADCAST has been disabled. * At a later stage, other more efficient communication algorithms * might be implemented, e. g. group or tree-based communication. * DOBCST = .FALSE. IF(DOBCST) THEN * First gather everything on the first processor. * Then use BROADCAST-based communication DO 45 I = 2, NPROCS IF (MYPROC .EQ. (I - 1)) THEN DSTROW = 0 DSTCOL = 0 STARTI = DOL IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF (( STARTI.GE.1 ) .AND. ( LENGTHI.GE.1 )) THEN LENGTHI2 = 2*LENGTHI * Copy eigenvalues into communication buffer CALL DCOPY(LENGTHI,W( STARTI ),1, $ WORK( INDD ), 1) * Copy uncertainties into communication buffer CALL DCOPY(LENGTHI,WORK( IINDERR+STARTI-1 ),1, $ WORK( INDD+LENGTHI ), 1) * send buffer CALL DGESD2D( ICTXT, LENGTHI2, $ 1, WORK( INDD ), LENGTHI2, $ DSTROW, DSTCOL ) END IF ELSE IF (MYPROC .EQ. 0) THEN SRCROW = (I-1) / NPCOL SRCCOL = MOD(I-1, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) STARTI = IWORK(1) LENGTHI = IWORK(2) IF (( STARTI.GE.1 ) .AND. ( LENGTHI.GE.1 )) THEN LENGTHI2 = 2*LENGTHI * receive buffer CALL DGERV2D( ICTXT, LENGTHI2, 1, $ WORK(INDD), LENGTHI2, SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL DCOPY( LENGTHI, WORK(INDD), 1, $ W( STARTI ), 1) * copy uncertainties (errors) from communication buffer CALL DCOPY(LENGTHI,WORK(INDD+LENGTHI),1, $ WORK( IINDERR+STARTI-1 ), 1) END IF END IF 45 CONTINUE LENGTHI = IIU - IIL + 1 LENGTHI2 = LENGTHI * 2 IF (MYPROC .EQ. 0) THEN * Broadcast eigenvalues and errors to all processors CALL DCOPY(LENGTHI,W ,1, WORK( INDD ), 1) CALL DCOPY(LENGTHI,WORK( IINDERR ),1, $ WORK( INDD+LENGTHI ), 1) CALL DGEBS2D( ICTXT, 'A', ' ', LENGTHI2, 1, $ WORK(INDD), LENGTHI2 ) ELSE SRCROW = 0 SRCCOL = 0 CALL DGEBR2D( ICTXT, 'A', ' ', LENGTHI2, 1, $ WORK(INDD), LENGTHI2, SRCROW, SRCCOL ) CALL DCOPY( LENGTHI, WORK(INDD), 1, W, 1) CALL DCOPY(LENGTHI,WORK(INDD+LENGTHI),1, $ WORK( IINDERR ), 1) END IF ELSE * * Enable point2point communication between collaborators * * Find collaborators of MYPROC IF( (NPROCS.GT.1).AND.(MYIL.GT.0) ) THEN CALL PMPCOL( MYPROC, NPROCS, IIL, NEEDIL, NEEDIU, $ IWORK(INDILU), IWORK(INDILU+NPROCS), $ COLBRT, FRSTCL, LASTCL ) ELSE COLBRT = .FALSE. ENDIF IF(COLBRT) THEN * If the processor collaborates with others, * communicate information. DO 47 IPROC = FRSTCL, LASTCL IF (MYPROC .EQ. IPROC) THEN STARTI = DOL IWORK(1) = STARTI LENGTHI = MYIU - MYIL + 1 IWORK(2) = LENGTHI IF ((STARTI.GE.1) .AND. (LENGTHI.GE.1)) THEN * Copy eigenvalues into communication buffer CALL DCOPY(LENGTHI,W( STARTI ),1, $ WORK(INDD), 1) * Copy uncertainties into communication buffer CALL DCOPY(LENGTHI, $ WORK( IINDERR+STARTI-1 ),1, $ WORK(INDD+LENGTHI), 1) ENDIF DO 46 I = FRSTCL, LASTCL IF(I.EQ.MYPROC) GOTO 46 DSTROW = I/ NPCOL DSTCOL = MOD(I, NPCOL) CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1) .AND. (LENGTHI.GE.1)) THEN LENGTHI2 = 2*LENGTHI * send buffer CALL DGESD2D( ICTXT, LENGTHI2, $ 1, WORK(INDD), LENGTHI2, $ DSTROW, DSTCOL ) END IF 46 CONTINUE ELSE SRCROW = IPROC / NPCOL SRCCOL = MOD(IPROC, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) RSTARTI = IWORK(1) RLENGTHI = IWORK(2) IF ((RSTARTI.GE.1 ) .AND. (RLENGTHI.GE.1 )) THEN RLENGTHI2 = 2*RLENGTHI CALL DGERV2D( ICTXT, RLENGTHI2, 1, $ WORK(INDE), RLENGTHI2, $ SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL DCOPY( RLENGTHI, WORK(INDE), 1, $ W( RSTARTI ), 1) * copy uncertainties (errors) from communication buffer CALL DCOPY(RLENGTHI,WORK(INDE+RLENGTHI),1, $ WORK( IINDERR+RSTARTI-1 ), 1) END IF END IF 47 CONTINUE ENDIF ENDIF * * Part 3. Compute representation tree and eigenvectors. * What follows is a loop in which the tree * is constructed in parallel from top to bottom, * on level at a time, until all eigenvectors * have been computed. * 100 CONTINUE IF ( MYIL.GT.0 ) THEN CALL DSTEGR2B( JOBZ, N, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), $ IM, W( 1 ), WORK( INDRW ), N, N, $ IWORK( 1 ), WORK( INDWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, $ DOU, NEEDIL, NEEDIU, INDWLC, $ PIVMIN, SCALE, WL, WU, $ VSTART, FINISH, $ MAXCLS, NDEPTH, PARITY, ZOFFSET, IINFO ) IINDWLC = INDWORK + INDWLC - 1 IF(.NOT.FINISH) THEN IF((NEEDIL.LT.DOL).OR.(NEEDIU.GT.DOU)) THEN CALL PMPCOL( MYPROC, NPROCS, IIL, NEEDIL, NEEDIU, $ IWORK(INDILU), IWORK(INDILU+NPROCS), $ COLBRT, FRSTCL, LASTCL ) ELSE COLBRT = .FALSE. FRSTCL = MYPROC LASTCL = MYPROC ENDIF * * Check if this processor collaborates, i.e. * communication is needed. * IF(COLBRT) THEN DO 147 IPROC = FRSTCL, LASTCL IF (MYPROC .EQ. IPROC) THEN STARTI = DOL IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN * Copy eigenvalues into communication buffer CALL DCOPY(LENGTHI, $ WORK( IINDWLC+STARTI-1 ),1, $ WORK(INDD), 1) * Copy uncertainties into communication buffer CALL DCOPY(LENGTHI, $ WORK( IINDERR+STARTI-1 ),1, $ WORK(INDD+LENGTHI), 1) ENDIF DO 146 I = FRSTCL, LASTCL IF(I.EQ.MYPROC) GOTO 146 DSTROW = I/ NPCOL DSTCOL = MOD(I, NPCOL) CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN LENGTHI2 = 2*LENGTHI * send buffer CALL DGESD2D( ICTXT, LENGTHI2, $ 1, WORK(INDD), LENGTHI2, $ DSTROW, DSTCOL ) END IF 146 CONTINUE ELSE SRCROW = IPROC / NPCOL SRCCOL = MOD(IPROC, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) RSTARTI = IWORK(1) RLENGTHI = IWORK(2) IF ((RSTARTI.GE.1).AND.(RLENGTHI.GE.1)) THEN RLENGTHI2 = 2*RLENGTHI CALL DGERV2D( ICTXT,RLENGTHI2, 1, $ WORK(INDE),RLENGTHI2, $ SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL DCOPY(RLENGTHI, WORK(INDE), 1, $ WORK( IINDWLC+RSTARTI-1 ), 1) * copy uncertainties (errors) from communication buffer CALL DCOPY(RLENGTHI,WORK(INDE+RLENGTHI),1, $ WORK( IINDERR+RSTARTI-1 ), 1) END IF END IF 147 CONTINUE ENDIF GOTO 100 ENDIF ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'DSTEGR2B', -IINFO ) RETURN END IF * ENDIF * *********************************************************************** * * MAIN PART ENDS HERE * *********************************************************************** * *********************************************************************** * * ALLGATHER: EACH PROCESSOR SENDS ITS EIGENVALUES TO THE FIRST ONE, * THEN THE FIRST PROCESSOR BROADCASTS ALL EIGENVALUES * *********************************************************************** * DO 50 I = 2, NPROCS IF (MYPROC .EQ. (I - 1)) THEN DSTROW = 0 DSTCOL = 0 STARTI = MYIL - IIL + 1 IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN CALL DGESD2D( ICTXT, LENGTHI, $ 1, W( STARTI ), LENGTHI, $ DSTROW, DSTCOL ) ENDIF ELSE IF (MYPROC .EQ. 0) THEN SRCROW = (I-1) / NPCOL SRCCOL = MOD(I-1, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) STARTI = IWORK(1) LENGTHI = IWORK(2) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN CALL DGERV2D( ICTXT, LENGTHI, 1, $ W( STARTI ), LENGTHI, SRCROW, SRCCOL ) ENDIF ENDIF 50 CONTINUE * Accumulate M from all processors M = IM CALL IGSUM2D( ICTXT, 'A', ' ', 1, 1, M, 1, -1, -1 ) * Broadcast eigenvalues to all processors IF (MYPROC .EQ. 0) THEN * Send eigenvalues CALL DGEBS2D( ICTXT, 'A', ' ', M, 1, W, M ) ELSE SRCROW = 0 SRCCOL = 0 CALL DGEBR2D( ICTXT, 'A', ' ', M, 1, $ W, M, SRCROW, SRCCOL ) END IF * * Sort the eigenvalues and keep permutation in IWORK to * sort the eigenvectors accordingly * DO 160 I = 1, M IWORK( NPROCS+1+I ) = I 160 CONTINUE CALL DLASRT2( 'I', M, W, IWORK( NPROCS+2 ), IINFO ) IF (IINFO.NE.0) THEN CALL PXERBLA( ICTXT, 'DLASRT2', -IINFO ) RETURN END IF *********************************************************************** * * TRANSFORM Z FROM 1D WORKSPACE INTO 2D BLOCKCYCLIC STORAGE * *********************************************************************** IF ( WANTZ ) THEN DO 170 I = 1, M IWORK( M+NPROCS+1+IWORK( NPROCS+1+I ) ) = I 170 CONTINUE * Store NVS in IWORK(1:NPROCS+1) for PDLAEVSWP IWORK( 1 ) = 0 DO 180 I = 1, NPROCS * Find IL and IU for processor i-1 * Has already been computed by PMPIM2 and stored IPIL = IWORK(INDILU+I-1) IPIU = IWORK(INDILU+NPROCS+I-1) IF (IPIL .EQ. 0) THEN IWORK( I + 1 ) = IWORK( I ) ELSE IWORK( I + 1 ) = IWORK( I ) + IPIU - IPIL + 1 ENDIF 180 CONTINUE IF ( FIRST ) THEN CALL PDLAEVSWP(N, WORK( INDRW ), N, Z, IZ, JZ, $ DESCZ, IWORK( 1 ), IWORK( NPROCS+M+2 ), WORK( INDWORK ), $ INDRW - INDWORK ) ELSE CALL PDLAEVSWP(N, WORK( INDRW + N ), N, Z, IZ, JZ, $ DESCZ, IWORK( 1 ), IWORK( NPROCS+M+2 ), WORK( INDWORK ), $ INDRW - INDWORK ) END IF * NZ = M * *********************************************************************** * * Compute eigenvectors of A from eigenvectors of T * *********************************************************************** IF( NZ.GT.0 ) THEN CALL PDORMTR( 'L', UPLO, 'N', N, NZ, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), SIZE1, IINFO ) END IF IF (IINFO.NE.0) THEN CALL PXERBLA( ICTXT, 'PDORMTR', -IINFO ) RETURN END IF * END IF * WORK( 1 ) = DBLE( LWOPT ) IWORK( 1 ) = LIWMIN RETURN * * End of PDSYEVR * END scalapack-2.0.2/SRC/pdsyevx.f000644 000766 000024 00000115167 11605326344 016230 0ustar00juliestaff000000 000000 SUBROUTINE PDSYEVX( JOBZ, RANGE, UPLO, N, A, IA, JA, DESCA, VL, $ VU, IL, IU, ABSTOL, M, NZ, W, ORFAC, Z, IZ, $ JZ, DESCZ, WORK, LWORK, IWORK, LIWORK, IFAIL, $ ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, INFO, IU, IZ, JA, JZ, LIWORK, LWORK, M, $ N, NZ DOUBLE PRECISION ABSTOL, ORFAC, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), ICLUSTR( * ), $ IFAIL( * ), IWORK( * ) DOUBLE PRECISION A( * ), GAP( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PDSYEVX computes selected eigenvalues and, optionally, eigenvectors * of a real symmetric matrix A by calling the recommended sequence * of ScaLAPACK routines. Eigenvalues/vectors can be selected by * specifying a range of values or a range of indices for the desired * eigenvalues. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * PDSYEVX assumes IEEE 754 standard compliant arithmetic. To port * to a system which does not have IEEE 754 arithmetic, modify * the appropriate SLmake.inc file to include the compiler switch * -DNO_IEEE. This switch only affects the compilation of pdlaiect.c. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic DOUBLE PRECISION array, * global dimension (N, N), * local dimension ( LLD_A, LOCc(JA+N-1) ) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PDSYEVX cannot guarantee * correct error reporting. * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * ABSTOL (global input) DOUBLE PRECISION * If JOBZ='V', setting ABSTOL to PDLAMCH( CONTEXT, 'U') yields * the most orthogonal eigenvectors. * * The absolute error tolerance for the eigenvalues. * An approximate eigenvalue is accepted as converged * when it is determined to lie in an interval [a,b] * of width less than or equal to * * ABSTOL + EPS * max( |a|,|b| ) , * * where EPS is the machine precision. If ABSTOL is less than * or equal to zero, then EPS*norm(T) will be used in its place, * where norm(T) is the 1-norm of the tridiagonal matrix * obtained by reducing A to tridiagonal form. * * Eigenvalues will be computed most accurately when ABSTOL is * set to twice the underflow threshold 2*PDLAMCH('S') not zero. * If this routine returns with ((MOD(INFO,2).NE.0) .OR. * (MOD(INFO/8,2).NE.0)), indicating that some eigenvalues or * eigenvectors did not converge, try setting ABSTOL to * 2*PDLAMCH('S'). * * See "Computing Small Singular Values of Bidiagonal Matrices * with Guaranteed High Relative Accuracy," by Demmel and * Kahan, LAPACK Working Note #3. * * See "On the correctness of Parallel Bisection in Floating * Point" by Demmel, Dhillon and Ren, LAPACK Working Note #70 * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M unless the user supplies * insufficient space and PDSYEVX is not able to detect this * before beginning computation. To get all the eigenvectors * requested, the user must supply both sufficient * space to hold the eigenvectors in Z (M .LE. DESCZ(N_)) * and sufficient workspace to compute them. (See LWORK below.) * PDSYEVX is always able to detect insufficient space without * computation unless RANGE .EQ. 'V'. * * W (global output) DOUBLE PRECISION array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * ORFAC (global input) DOUBLE PRECISION * Specifies which eigenvectors should be reorthogonalized. * Eigenvectors that correspond to eigenvalues which are within * tol=ORFAC*norm(A) of each other are to be reorthogonalized. * However, if the workspace is insufficient (see LWORK), * tol may be decreased until all eigenvectors to be * reorthogonalized can be stored in one process. * No reorthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) DOUBLE PRECISION array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. If an eigenvector * fails to converge, then that column of Z contains the latest * approximation to the eigenvector, and the index of the * eigenvector is returned in IFAIL. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension max(3,LWORK) * On return, WORK(1) contains the optimal amount of * workspace required for efficient execution. * if JOBZ='N' WORK(1) = optimal amount of workspace * required to compute eigenvalues efficiently * if JOBZ='V' WORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors * efficiently with no guarantee on orthogonality. * If RANGE='V', it is assumed that all eigenvectors * may be required. * * LWORK (local input) INTEGER * Size of WORK * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= 5 * N + MAX( 5 * NN, NB * ( NP0 + 1 ) ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * LWORK >= 5*N + MAX( 5*NN, NP0 * MQ0 + 2 * NB * NB ) + * ICEIL( NEIG, NPROW*NPCOL)*NN * * The computed eigenvectors may not be orthogonal if the * minimal workspace is supplied and ORFAC is too small. * If you want to guarantee orthogonality (at the cost * of potentially poor performance) you should add * the following to LWORK: * (CLUSTERSIZE-1)*N * where CLUSTERSIZE is the number of eigenvalues in the * largest cluster, where a cluster is defined as a set of * close eigenvalues: { W(K),...,W(K+CLUSTERSIZE-1) | * W(J+1) <= W(J) + ORFAC*2*norm(A) } * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * When LWORK is too small: * If LWORK is too small to guarantee orthogonality, * PDSYEVX attempts to maintain orthogonality in * the clusters with the smallest * spacing between the eigenvalues. * If LWORK is too small to compute all the eigenvectors * requested, no computation is performed and INFO=-23 * is returned. Note that when RANGE='V', PDSYEVX does * not know how many eigenvectors are requested until * the eigenvalues are computed. Therefore, when RANGE='V' * and as long as LWORK is large enough to allow PDSYEVX to * compute the eigenvalues, PDSYEVX will compute the * eigenvalues and as many eigenvectors as it can. * * Relationship between workspace, orthogonality & performance: * Greater performance can be achieved if adequate workspace * is provided. On the other hand, in some situations, * performance can decrease as the workspace provided * increases above the workspace amount shown below: * * For optimal performance, greater workspace may be * needed, i.e. * LWORK >= MAX( LWORK, 5*N + NSYTRD_LWOPT ) * Where: * LWORK, as defined previously, depends upon the number * of eigenvectors requested, and * NSYTRD_LWOPT = N + 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 3 ) * NPS * * ANB = PJLAENV( DESCA( CTXT_), 3, 'PDSYTTRD', 'L', * 0, 0, 0, 0) * SQNPC = INT( SQRT( DBLE( NPROW * NPCOL ) ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by * calling the subroutine BLACS_GRIDINFO. * * For large N, no extra workspace is needed, however the * biggest boost in performance comes for small N, so it * is wise to provide the extra workspace (typically less * than a Megabyte per process). * * If CLUSTERSIZE >= N/SQRT(NPROW*NPCOL), then providing * enough space to compute all the eigenvectors * orthogonally will cause serious degradation in * performance. In the limit (i.e. CLUSTERSIZE = N-1) * PDSTEIN will perform no better than DSTEIN on 1 * processor. * For CLUSTERSIZE = N/SQRT(NPROW*NPCOL) reorthogonalizing * all eigenvectors will increase the total execution time * by a factor of 2 or more. * For CLUSTERSIZE > N/SQRT(NPROW*NPCOL) execution time will * grow as the square of the cluster size, all other factors * remaining equal and assuming enough workspace. Less * workspace means less reorthogonalization but faster * execution. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance for all work arrays. Each of * these values is returned in the first entry of the * corresponding work arrays, and no error message is issued by * PXERBLA. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * LIWORK >= 6 * NNP * Where: * NNP = MAX( N, NPROW*NPCOL + 1, 4 ) * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If (MOD(INFO,2).NE.0) on exit, then * IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * * ICLUSTR (global output) integer array, dimension (2*NPROW*NPCOL) * This array contains indices of eigenvectors corresponding to * a cluster of eigenvalues that could not be reorthogonalized * due to insufficient workspace (see LWORK, ORFAC and INFO). * Eigenvectors corresponding to clusters of eigenvalues indexed * ICLUSTR(2*I-1) to ICLUSTR(2*I), could not be * reorthogonalized due to lack of workspace. Hence the * eigenvectors corresponding to these clusters may not be * orthogonal. ICLUSTR() is a zero terminated array. * (ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0) if and only if * K is the number of clusters * ICLUSTR is not referenced if JOBZ = 'N' * * GAP (global output) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * This array contains the gap between eigenvalues whose * eigenvectors could not be reorthogonalized. The output * values in this array correspond to the clusters indicated * by the array ICLUSTR. As a result, the dot product between * eigenvectors correspoding to the I^th cluster may be as high * as ( C * n ) / GAP(I) where C is a small constant. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: if (MOD(INFO,2).NE.0), then one or more eigenvectors * failed to converge. Their indices are stored * in IFAIL. Ensure ABSTOL=2.0*PDLAMCH( 'U' ) * Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding * to one or more clusters of eigenvalues could not be * reorthogonalized because of insufficient workspace. * The indices of the clusters are stored in the array * ICLUSTR. * if (MOD(INFO/4,2).NE.0), then space limit prevented * PDSYEVX from computing all of the eigenvectors * between VL and VU. The number of eigenvectors * computed is returned in NZ. * if (MOD(INFO/8,2).NE.0), then PDSTEBZ failed to compute * eigenvalues. Ensure ABSTOL=2.0*PDLAMCH( 'U' ) * Send e-mail to scalapack@cs.utk.edu * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * ( MB_A.EQ.NB_A.EQ.MB_Z .AND. IROFFA.EQ.IROFFZ .AND. IROFFA.EQ.0 .AND. * IAROW.EQ.IZROW ) * where * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * Differences between PDSYEVX and DSYEVX * ====================================== * * A, LDA -> A, IA, JA, DESCA * Z, LDZ -> Z, IZ, JZ, DESCZ * WORKSPACE needs are larger for PDSYEVX. * LIWORK parameter added * * ORFAC, ICLUSTER() and GAP() parameters added * meaning of INFO is changed * * Functional differences: * PDSYEVX does not promise orthogonality for eigenvectors associated * with tighly clustered eigenvalues. * PDSYEVX does not reorthogonalize eigenvectors * that are on different processes. The extent of reorthogonalization * is controlled by the input parameter LWORK. * * Version 1.4 limitations: * DESCA(MB_) = DESCA(NB_) * DESCA(M_) = DESCZ(M_) * DESCA(N_) = DESCZ(N_) * DESCA(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE, TEN, FIVE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TEN = 10.0D+0, $ FIVE = 5.0D+0 ) INTEGER IERREIN, IERRCLS, IERRSPC, IERREBZ PARAMETER ( IERREIN = 1, IERRCLS = 2, IERRSPC = 4, $ IERREBZ = 8 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, QUICKRETURN, $ VALEIG, WANTZ CHARACTER ORDER INTEGER ANB, CSRC_A, I, IAROW, ICOFFA, ICTXT, IINFO, $ INDD, INDD2, INDE, INDE2, INDIBL, INDISP, $ INDTAU, INDWORK, IROFFA, IROFFZ, ISCALE, $ ISIZESTEBZ, ISIZESTEIN, IZROW, LALLWORK, $ LIWMIN, LLWORK, LWMIN, LWOPT, MAXEIGS, MB_A, $ MQ0, MYCOL, MYROW, NB, NB_A, NEIG, NN, NNP, $ NP0, NPCOL, NPROCS, NPROW, NPS, NSPLIT, $ NSYTRD_LWOPT, NZZ, OFFSET, RSRC_A, RSRC_Z, $ SIZEORMTR, SIZESTEIN, SIZESYEVX, SQNPC DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, $ SIGMA, SMLNUM, VLL, VUU * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV DOUBLE PRECISION PDLAMCH, PDLANSY EXTERNAL LSAME, ICEIL, INDXG2P, NUMROC, PJLAENV, $ PDLAMCH, PDLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DGEBR2D, DGEBS2D, $ DLASRT, DSCAL, IGAMN2D, PCHK1MAT, PCHK2MAT, $ PDELGET, PDLARED1D, PDLASCL, PDORMTR, PDSTEBZ, $ PDSTEIN, PDSYNTRD, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, INT, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * QUICKRETURN = ( N.EQ.0 ) * * Test the input arguments. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 * WANTZ = LSAME( JOBZ, 'V' ) IF( NPROW.EQ.-1 ) THEN INFO = -( 800+CTXT_ ) ELSE IF( WANTZ ) THEN IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 21, INFO ) * IF( INFO.EQ.0 ) THEN * * Get machine constants. * SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) EPS = PDLAMCH( ICTXT, 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * NPROCS = NPROW*NPCOL LOWER = LSAME( UPLO, 'L' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) * * Set up pointers into the WORK array * INDTAU = 1 INDE = INDTAU + N INDD = INDE + N INDD2 = INDD + N INDE2 = INDD2 + N INDWORK = INDE2 + N LLWORK = LWORK - INDWORK + 1 * * Set up pointers into the IWORK array * ISIZESTEIN = 3*N + NPROCS + 1 ISIZESTEBZ = MAX( 4*N, 14, NPROCS ) INDIBL = ( MAX( ISIZESTEIN, ISIZESTEBZ ) ) + 1 INDISP = INDIBL + N * * Compute the total amount of space needed * LQUERY = .FALSE. IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) $ LQUERY = .TRUE. * NNP = MAX( N, NPROCS+1, 4 ) LIWMIN = 6*NNP * NPROCS = NPROW*NPCOL NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A NN = MAX( N, NB, 2 ) * RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( 1, NB_A, MYROW, RSRC_A, NPROW ) NP0 = NUMROC( N+IROFFA, NB, 0, 0, NPROW ) MQ0 = NUMROC( N+ICOFFA, NB, 0, 0, NPCOL ) IF( WANTZ ) THEN RSRC_Z = DESCZ( RSRC_ ) IROFFZ = MOD( IZ-1, MB_A ) IZROW = INDXG2P( 1, NB_A, MYROW, RSRC_Z, NPROW ) ELSE IROFFZ = 0 IZROW = 0 END IF * IF( ( .NOT.WANTZ ) .OR. ( VALEIG .AND. ( .NOT.LQUERY ) ) ) $ THEN LWMIN = 5*N + MAX( 5*NN, NB*( NP0+1 ) ) IF( WANTZ ) THEN MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) LWOPT = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) ELSE LWOPT = LWMIN END IF NEIG = 0 ELSE IF( ALLEIG .OR. VALEIG ) THEN NEIG = N ELSE IF( INDEIG ) THEN NEIG = IU - IL + 1 END IF MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) LWMIN = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN LWOPT = LWMIN * END IF * * Compute how much workspace is needed to use the * new TRD code * ANB = PJLAENV( ICTXT, 3, 'PDSYTTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NSYTRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS LWOPT = MAX( LWOPT, 5*N+NSYTRD_LWOPT ) * END IF IF( INFO.EQ.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WORK( 1 ) = ABSTOL IF( VALEIG ) THEN WORK( 2 ) = VL WORK( 3 ) = VU ELSE WORK( 2 ) = ZERO WORK( 3 ) = ZERO END IF CALL DGEBS2D( ICTXT, 'ALL', ' ', 3, 1, WORK, 3 ) ELSE CALL DGEBR2D( ICTXT, 'ALL', ' ', 3, 1, WORK, 3, 0, 0 ) END IF IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -3 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -10 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -11 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -12 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -23 ELSE IF( LIWORK.LT.LIWMIN .AND. LIWORK.NE.-1 ) THEN INFO = -25 ELSE IF( VALEIG .AND. ( ABS( WORK( 2 )-VL ).GT.FIVE*EPS* $ ABS( VL ) ) ) THEN INFO = -9 ELSE IF( VALEIG .AND. ( ABS( WORK( 3 )-VU ).GT.FIVE*EPS* $ ABS( VU ) ) ) THEN INFO = -10 ELSE IF( ABS( WORK( 1 )-ABSTOL ).GT.FIVE*EPS*ABS( ABSTOL ) ) $ THEN INFO = -13 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 800+NB_ ) END IF IF( WANTZ ) THEN IF( IROFFA.NE.IROFFZ ) THEN INFO = -19 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -19 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2100+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2100+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2100+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2100+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2100+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2100+CSRC_ ) ELSE IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF END IF IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( ALLEIG ) THEN IDUM1( 3 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 3 ) = ICHAR( 'I' ) ELSE IDUM1( 3 ) = ICHAR( 'V' ) END IF IDUM2( 3 ) = 3 IF( LQUERY ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 4 IF( WANTZ ) THEN CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 8, N, 4, N, 4, IZ, $ JZ, DESCZ, 21, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 4, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = DBLE( LWOPT ) IWORK( 1 ) = LIWMIN END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYEVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( QUICKRETURN ) THEN IF( WANTZ ) THEN NZ = 0 ICLUSTR( 1 ) = 0 END IF M = 0 WORK( 1 ) = DBLE( LWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF * * Scale matrix to allowable range, if necessary. * ABSTLL = ABSTOL ISCALE = 0 IF( VALEIG ) THEN VLL = VL VUU = VU ELSE VLL = ZERO VUU = ZERO END IF * ANRM = PDLANSY( 'M', UPLO, N, A, IA, JA, DESCA, WORK( INDWORK ) ) * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ANRM = ANRM*SIGMA ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM ANRM = ANRM*SIGMA END IF * IF( ISCALE.EQ.1 ) THEN CALL PDLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) IF( ABSTOL.GT.0 ) $ ABSTLL = ABSTOL*SIGMA IF( VALEIG ) THEN VLL = VL*SIGMA VUU = VU*SIGMA IF( VUU.EQ.VLL ) THEN VUU = VUU + 2*MAX( ABS( VUU )*EPS, SAFMIN ) END IF END IF END IF * * Call PDSYNTRD to reduce symmetric matrix to tridiagonal form. * LALLWORK = LLWORK * CALL PDSYNTRD( UPLO, N, A, IA, JA, DESCA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * * Copy the values of D, E to all processes * * Here PxLARED1D is used to redistribute the tridiagonal matrix. * PxLARED1D, however, doesn't yet work with arbritary matrix * distributions so we have PxELGET as a backup. * OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. RSRC_A.EQ.0 .AND. CSRC_A.EQ.0 ) $ THEN CALL PDLARED1D( N, IA, JA, DESCA, WORK( INDD ), WORK( INDD2 ), $ WORK( INDWORK ), LLWORK ) * CALL PDLARED1D( N, IA, JA, DESCA, WORK( INDE ), WORK( INDE2 ), $ WORK( INDWORK ), LLWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PDELGET( 'A', ' ', WORK( INDD2+I-1 ), A, I+IA-1, $ I+JA-1, DESCA ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PDELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA-1, $ I+JA, DESCA ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PDELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA, $ I+JA-1, DESCA ) 30 CONTINUE END IF END IF * * Call PDSTEBZ and, if eigenvectors are desired, PDSTEIN. * IF( WANTZ ) THEN ORDER = 'B' ELSE ORDER = 'E' END IF * CALL PDSTEBZ( ICTXT, RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, $ WORK( INDD2 ), WORK( INDE2+OFFSET ), M, NSPLIT, W, $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWORK ), $ LLWORK, IWORK( 1 ), ISIZESTEBZ, IINFO ) * * * IF PDSTEBZ fails, the error propogates to INFO, but * we do not propogate the eigenvalue(s) which failed because: * 1) This should never happen if the user specifies * ABSTOL = 2 * PDLAMCH( 'U' ) * 2) PDSTEIN will confirm/deny whether the eigenvalues are * close enough. * IF( IINFO.NE.0 ) THEN INFO = INFO + IERREBZ DO 40 I = 1, M IWORK( INDIBL+I-1 ) = ABS( IWORK( INDIBL+I-1 ) ) 40 CONTINUE END IF IF( WANTZ ) THEN * IF( VALEIG ) THEN * * Compute the maximum number of eigenvalues that we can * compute in the * workspace that we have, and that we can store in Z. * * Loop through the possibilities looking for the largest * NZ that we can feed to PDSTEIN and PDORMTR * * Since all processes must end up with the same value * of NZ, we first compute the minimum of LALLWORK * CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LALLWORK, 1, 1, 1, -1, $ -1, -1 ) * MAXEIGS = DESCZ( N_ ) * DO 50 NZ = MIN( MAXEIGS, M ), 0, -1 MQ0 = NUMROC( NZ, NB, 0, 0, NPCOL ) SIZESTEIN = ICEIL( NZ, NPROCS )*N + MAX( 5*N, NP0*MQ0 ) SIZEORMTR = MAX( ( NB*( NB-1 ) ) / 2, ( MQ0+NP0 )*NB ) + $ NB*NB * SIZESYEVX = MAX( SIZESTEIN, SIZEORMTR ) IF( SIZESYEVX.LE.LALLWORK ) $ GO TO 60 50 CONTINUE 60 CONTINUE ELSE NZ = M END IF NZ = MAX( NZ, 0 ) IF( NZ.NE.M ) THEN INFO = INFO + IERRSPC * DO 70 I = 1, M IFAIL( I ) = 0 70 CONTINUE * * The following code handles a rare special case * - NZ .NE. M means that we don't have enough room to store * all the vectors. * - NSPLIT .GT. 1 means that the matrix split * In this case, we cannot simply take the first NZ eigenvalues * because PDSTEBZ sorts the eigenvalues by block when * a split occurs. So, we have to make another call to * PDSTEBZ with a new upper limit - VUU. * IF( NSPLIT.GT.1 ) THEN CALL DLASRT( 'I', M, W, IINFO ) NZZ = 0 IF( NZ.GT.0 ) THEN * VUU = W( NZ ) - TEN*( EPS*ANRM+SAFMIN ) IF( VLL.GE.VUU ) THEN NZZ = 0 ELSE CALL PDSTEBZ( ICTXT, RANGE, ORDER, N, VLL, VUU, IL, $ IU, ABSTLL, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), NZZ, NSPLIT, W, $ IWORK( INDIBL ), IWORK( INDISP ), $ WORK( INDWORK ), LLWORK, IWORK( 1 ), $ ISIZESTEBZ, IINFO ) END IF * IF( MOD( INFO / IERREBZ, 1 ).EQ.0 ) THEN IF( NZZ.GT.NZ .OR. IINFO.NE.0 ) THEN INFO = INFO + IERREBZ END IF END IF END IF NZ = MIN( NZ, NZZ ) * END IF END IF CALL PDSTEIN( N, WORK( INDD2 ), WORK( INDE2+OFFSET ), NZ, W, $ IWORK( INDIBL ), IWORK( INDISP ), ORFAC, Z, IZ, $ JZ, DESCZ, WORK( INDWORK ), LALLWORK, IWORK( 1 ), $ ISIZESTEIN, IFAIL, ICLUSTR, GAP, IINFO ) * IF( IINFO.GE.NZ+1 ) $ INFO = INFO + IERRCLS IF( MOD( IINFO, NZ+1 ).NE.0 ) $ INFO = INFO + IERREIN * * Z = Q * Z * * IF( NZ.GT.0 ) THEN CALL PDORMTR( 'L', UPLO, 'N', N, NZ, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) END IF * END IF * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL DSCAL( M, ONE / SIGMA, W, 1 ) END IF * WORK( 1 ) = DBLE( LWOPT ) IWORK( 1 ) = LIWMIN * RETURN * * End of PDSYEVX * END scalapack-2.0.2/SRC/pdsygs2.f000644 000766 000024 00000036345 10363532303 016113 0ustar00juliestaff000000 000000 * * SUBROUTINE PDSYGS2( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Purpose * ======= * * PDSYGS2 reduces a real symmetric-definite generalized eigenproblem * to standard form. * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**T)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**T) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**T or L**T*sub( A )*L. * * sub( B ) must have been previously factorized as U**T*U or L*L**T by * PDPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**T)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**T); * = 2 or 3: compute U*sub( A )*U**T or L**T*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**T*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**T. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PDPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, HALF PARAMETER ( ONE = 1.0D+0, HALF = 0.5D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IIA, IIB, IOFFA, IOFFB, IROFFA, IROFFB, $ JJA, JJB, K, LDA, LDB, MYCOL, MYROW, NPCOL, $ NPROW DOUBLE PRECISION AKK, BKK, CT * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GRIDINFO, CHK1MAT, DAXPY, $ DSCAL, DSYR2, DTRMV, DTRSV, INFOG2L, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL LSAME, INDXG2P * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( N+ICOFFA.GT.DESCA( NB_ ) ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYGS2', -INFO ) CALL BLACS_EXIT( ICTXT ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. ( MYROW.NE.IAROW .OR. MYCOL.NE.IACOL ) ) $ RETURN * * Compute local information * LDA = DESCA( LLD_ ) LDB = DESCB( LLD_ ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIB, JJB, $ IBROW, IBCOL ) * IF( IBTYPE.EQ.1 ) THEN * IF( UPPER ) THEN * IOFFA = IIA + JJA*LDA IOFFB = IIB + JJB*LDB * * Compute inv(U')*sub( A )*inv(U) * DO 10 K = 1, N * * Update the upper triangle of * A(ia+k-1:ia+n-a,ia+k-1:ia+n-1) * AKK = A( IOFFA-LDA ) BKK = B( IOFFB-LDB ) AKK = AKK / BKK**2 A( IOFFA-LDA ) = AKK IF( K.LT.N ) THEN CALL DSCAL( N-K, ONE / BKK, A( IOFFA ), LDA ) CT = -HALF*AKK CALL DAXPY( N-K, CT, B( IOFFB ), LDB, A( IOFFA ), $ LDA ) CALL DSYR2( UPLO, N-K, -ONE, A( IOFFA ), LDA, $ B( IOFFB ), LDB, A( IOFFA+1 ), LDA ) CALL DAXPY( N-K, CT, B( IOFFB ), LDB, A( IOFFA ), $ LDA ) CALL DTRSV( UPLO, 'Transpose', 'Non-unit', N-K, $ B( IOFFB+1 ), LDB, A( IOFFA ), LDA ) END IF * * A( IOFFA ) -> A( K, K+1 ) * B( IOFFB ) -> B( K, K+1 ) * IOFFA = IOFFA + LDA + 1 IOFFB = IOFFB + LDB + 1 * 10 CONTINUE * ELSE * IOFFA = IIA + 1 + ( JJA-1 )*LDA IOFFB = IIB + 1 + ( JJB-1 )*LDB * * Compute inv(L)*sub( A )*inv(L') * DO 20 K = 1, N * * Update the lower triangle of * A(ia+k-1:ia+n-a,ia+k-1:ia+n-1) * AKK = A( IOFFA-1 ) BKK = B( IOFFB-1 ) AKK = AKK / BKK**2 A( IOFFA-1 ) = AKK * IF( K.LT.N ) THEN CALL DSCAL( N-K, ONE / BKK, A( IOFFA ), 1 ) CT = -HALF*AKK CALL DAXPY( N-K, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL DSYR2( UPLO, N-K, -ONE, A( IOFFA ), 1, $ B( IOFFB ), 1, A( IOFFA+LDA ), LDA ) CALL DAXPY( N-K, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL DTRSV( UPLO, 'No transpose', 'Non-unit', N-K, $ B( IOFFB+LDB ), LDB, A( IOFFA ), 1 ) END IF * * A( IOFFA ) -> A( K+1, K ) * B( IOFFB ) -> B( K+1, K ) * IOFFA = IOFFA + LDA + 1 IOFFB = IOFFB + LDB + 1 * 20 CONTINUE * END IF * ELSE * IF( UPPER ) THEN * IOFFA = IIA + ( JJA-1 )*LDA IOFFB = IIB + ( JJB-1 )*LDB * * Compute U*sub( A )*U' * DO 30 K = 1, N * * Update the upper triangle of A(ia:ia+k-1,ja:ja+k-1) * AKK = A( IOFFA+K-1 ) BKK = B( IOFFB+K-1 ) CALL DTRMV( UPLO, 'No transpose', 'Non-unit', K-1, $ B( IIB+( JJB-1 )*LDB ), LDB, A( IOFFA ), 1 ) CT = HALF*AKK CALL DAXPY( K-1, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL DSYR2( UPLO, K-1, ONE, A( IOFFA ), 1, B( IOFFB ), 1, $ A( IIA+( JJA-1 )*LDA ), LDA ) CALL DAXPY( K-1, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL DSCAL( K-1, BKK, A( IOFFA ), 1 ) A( IOFFA+K-1 ) = AKK*BKK**2 * * A( IOFFA ) -> A( 1, K ) * B( IOFFB ) -> B( 1, K ) * IOFFA = IOFFA + LDA IOFFB = IOFFB + LDB * 30 CONTINUE * ELSE * IOFFA = IIA + ( JJA-1 )*LDA IOFFB = IIB + ( JJB-1 )*LDB * * Compute L'*sub( A )*L * DO 40 K = 1, N * * Update the lower triangle of A(ia:ia+k-1,ja:ja+k-1) * AKK = A( IOFFA+( K-1 )*LDA ) BKK = B( IOFFB+( K-1 )*LDB ) CALL DTRMV( UPLO, 'Transpose', 'Non-unit', K-1, $ B( IIB+( JJB-1 )*LDB ), LDB, A( IOFFA ), $ LDA ) CT = HALF*AKK CALL DAXPY( K-1, CT, B( IOFFB ), LDB, A( IOFFA ), LDA ) CALL DSYR2( UPLO, K-1, ONE, A( IOFFA ), LDA, B( IOFFB ), $ LDB, A( IIA+( JJA-1 )*LDA ), LDA ) CALL DAXPY( K-1, CT, B( IOFFB ), LDB, A( IOFFA ), LDA ) CALL DSCAL( K-1, BKK, A( IOFFA ), LDA ) A( IOFFA+( K-1 )*LDA ) = AKK*BKK**2 * * A( IOFFA ) -> A( K, 1 ) * B( IOFFB ) -> B( K, 1 ) * IOFFA = IOFFA + 1 IOFFB = IOFFB + 1 * 40 CONTINUE * END IF * END IF * RETURN * * End of PDSYGS2 * END scalapack-2.0.2/SRC/pdsygst.f000644 000766 000024 00000042040 10363532303 016202 0ustar00juliestaff000000 000000 * * SUBROUTINE PDSYGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, N DOUBLE PRECISION SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Purpose * ======= * * PDSYGST reduces a real symmetric-definite generalized eigenproblem * to standard form. * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**T)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**T) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**T or L**T*sub( A )*L. * * sub( B ) must have been previously factorized as U**T*U or L*L**T by * PDPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**T)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**T); * = 2 or 3: compute U*sub( A )*U**T or L**T*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**T*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**T. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PDPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * SCALE (global output) DOUBLE PRECISION * Amount by which the eigenvalues should be scaled to * compensate for the scaling performed in this routine. * At present, SCALE is always returned as 1.0, it is * returned here to allow for future enhancement. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, HALF PARAMETER ( ONE = 1.0D+0, HALF = 0.5D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, K, KB, MYCOL, MYROW, NB, $ NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PDSYGS2, $ PDSYMM, PDSYR2K, PDTRMM, PDTRSM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL LSAME, ICEIL, INDXG2P * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * SCALE = ONE * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IB, $ JB, DESCB, 11, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYGST', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( IBTYPE.EQ.1 ) THEN IF( UPPER ) THEN * * Compute inv(U')*sub( A )*inv(U) * K = 1 NB = DESCA( NB_ ) KB = MIN( ICEIL( JA, NB )*NB, JA+N-1 ) - JA + 1 * 10 CONTINUE * * Update the upper triangle of A(ia+k-1:ia+n-1,ja+k-1:ja+n-1) * CALL PDSYGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, IB+K-1, DESCB, INFO ) IF( K+KB.LE.N ) THEN CALL PDTRSM( 'Left', UPLO, 'Transpose', 'Non-unit', KB, $ N-K-KB+1, ONE, B, IB+K-1, JB+K-1, DESCB, A, $ IA+K-1, JA+K+KB-1, DESCA ) CALL PDSYMM( 'Left', UPLO, KB, N-K-KB+1, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K-1, JB+K+KB-1, $ DESCB, ONE, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PDSYR2K( UPLO, 'Transpose', N-K-KB+1, KB, -ONE, A, $ IA+K-1, JA+K+KB-1, DESCA, B, IB+K-1, $ JB+K+KB-1, DESCB, ONE, A, IA+K+KB-1, $ JA+K+KB-1, DESCA ) CALL PDSYMM( 'Left', UPLO, KB, N-K-KB+1, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K-1, JB+K+KB-1, $ DESCB, ONE, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PDTRSM( 'Right', UPLO, 'No transpose', 'Non-unit', $ KB, N-K-KB+1, ONE, B, IB+K+KB-1, JB+K+KB-1, $ DESCB, A, IA+K-1, JA+K+KB-1, DESCA ) END IF K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 10 * ELSE * * Compute inv(L)*sub( A )*inv(L') * K = 1 NB = DESCA( MB_ ) KB = MIN( ICEIL( IA, NB )*NB, IA+N-1 ) - IA + 1 * 20 CONTINUE * * Update the lower triangle of A(ia+k-1:ia+n-1,ja+k-1:ja+n-1) * CALL PDSYGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) IF( K+KB.LE.N ) THEN CALL PDTRSM( 'Right', UPLO, 'Transpose', 'Non-unit', $ N-K-KB+1, KB, ONE, B, IB+K-1, JB+K-1, DESCB, $ A, IA+K+KB-1, JA+K-1, DESCA ) CALL PDSYMM( 'Right', UPLO, N-K-KB+1, KB, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K+KB-1, JB+K-1, $ DESCB, ONE, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PDSYR2K( UPLO, 'No transpose', N-K-KB+1, KB, -ONE, $ A, IA+K+KB-1, JA+K-1, DESCA, B, IB+K+KB-1, $ JB+K-1, DESCB, ONE, A, IA+K+KB-1, $ JA+K+KB-1, DESCA ) CALL PDSYMM( 'Right', UPLO, N-K-KB+1, KB, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K+KB-1, JB+K-1, $ DESCB, ONE, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PDTRSM( 'Left', UPLO, 'No transpose', 'Non-unit', $ N-K-KB+1, KB, ONE, B, IB+K+KB-1, JB+K+KB-1, $ DESCB, A, IA+K+KB-1, JA+K-1, DESCA ) END IF K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 20 * END IF * ELSE * IF( UPPER ) THEN * * Compute U*sub( A )*U' * K = 1 NB = DESCA( NB_ ) KB = MIN( ICEIL( JA, NB )*NB, JA+N-1 ) - JA + 1 * 30 CONTINUE * * Update the upper triangle of A(ia:ia+k+kb-2,ja:ja+k+kb-2) * CALL PDTRMM( 'Left', UPLO, 'No transpose', 'Non-unit', K-1, $ KB, ONE, B, IB, JB, DESCB, A, IA, JA+K-1, $ DESCA ) CALL PDSYMM( 'Right', UPLO, K-1, KB, HALF, A, IA+K-1, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, ONE, A, $ IA, JA+K-1, DESCA ) CALL PDSYR2K( UPLO, 'No transpose', K-1, KB, ONE, A, IA, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, ONE, A, $ IA, JA, DESCA ) CALL PDSYMM( 'Right', UPLO, K-1, KB, HALF, A, IA+K-1, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, ONE, A, $ IA, JA+K-1, DESCA ) CALL PDTRMM( 'Right', UPLO, 'Transpose', 'Non-unit', K-1, $ KB, ONE, B, IB+K-1, JB+K-1, DESCB, A, IA, $ JA+K-1, DESCA ) CALL PDSYGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) * K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 30 * ELSE * * Compute L'*sub( A )*L * K = 1 NB = DESCA( MB_ ) KB = MIN( ICEIL( IA, NB )*NB, IA+N-1 ) - IA + 1 * 40 CONTINUE * * Update the lower triangle of A(ia:ia+k+kb-2,ja:ja+k+kb-2) * CALL PDTRMM( 'Right', UPLO, 'No transpose', 'Non-unit', KB, $ K-1, ONE, B, IB, JB, DESCB, A, IA+K-1, JA, $ DESCA ) CALL PDSYMM( 'Left', UPLO, KB, K-1, HALF, A, IA+K-1, JA+K-1, $ DESCA, B, IB+K-1, JB, DESCB, ONE, A, IA+K-1, $ JA, DESCA ) CALL PDSYR2K( UPLO, 'Transpose', K-1, KB, ONE, A, IA+K-1, $ JA, DESCA, B, IB+K-1, JB, DESCB, ONE, A, IA, $ JA, DESCA ) CALL PDSYMM( 'Left', UPLO, KB, K-1, HALF, A, IA+K-1, JA+K-1, $ DESCA, B, IB+K-1, JB, DESCB, ONE, A, IA+K-1, $ JA, DESCA ) CALL PDTRMM( 'Left', UPLO, 'Transpose', 'Non-unit', KB, K-1, $ ONE, B, IB+K-1, JB+K-1, DESCB, A, IA+K-1, JA, $ DESCA ) CALL PDSYGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) * K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 40 * END IF * END IF * RETURN * * End of PDSYGST * END scalapack-2.0.2/SRC/pdsygvx.f000644 000766 000024 00000104703 10377154001 016216 0ustar00juliestaff000000 000000 SUBROUTINE PDSYGVX( IBTYPE, JOBZ, RANGE, UPLO, N, A, IA, JA, $ DESCA, B, IB, JB, DESCB, VL, VU, IL, IU, $ ABSTOL, M, NZ, W, ORFAC, Z, IZ, JZ, DESCZ, $ WORK, LWORK, IWORK, LIWORK, IFAIL, ICLUSTR, $ GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IB, IBTYPE, IL, INFO, IU, IZ, JA, JB, JZ, $ LIWORK, LWORK, M, N, NZ DOUBLE PRECISION ABSTOL, ORFAC, VL, VU * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCB( * ), DESCZ( * ), $ ICLUSTR( * ), IFAIL( * ), IWORK( * ) DOUBLE PRECISION A( * ), B( * ), GAP( * ), W( * ), WORK( * ), $ Z( * ) * .. * * Purpose * * ======= * * PDSYGVX computes all the eigenvalues, and optionally, * the eigenvectors * of a real generalized SY-definite eigenproblem, of the form * sub( A )*x=(lambda)*sub( B )*x, sub( A )*sub( B )x=(lambda)*x, or * sub( B )*sub( A )*x=(lambda)*x. * Here sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ) is assumed to be * SY, and sub( B ) denoting B( IB:IB+N-1, JB:JB+N-1 ) is assumed * to be symmetric positive definite. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * JOBZ (global input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * = 'U': Upper triangles of sub( A ) and sub( B ) are stored; * = 'L': Lower triangles of sub( A ) and sub( B ) are stored. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix. * * On exit, if JOBZ = 'V', then if INFO = 0, sub( A ) contains * the distributed matrix Z of eigenvectors. The eigenvectors * are normalized as follows: * if IBTYPE = 1 or 2, Z**T*sub( B )*Z = I; * if IBTYPE = 3, Z**T*inv( sub( B ) )*Z = I. * If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') * or the lower triangle (if UPLO='L') of sub( A ), including * the diagonal, is destroyed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PDSYGVX cannot guarantee * correct error reporting. * * B (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( B ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( B ) contains * the upper triangular part of the matrix. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( B ) contains * the lower triangular part of the matrix. * * On exit, if INFO <= N, the part of sub( B ) containing the * matrix is overwritten by the triangular factor U or L from * the Cholesky factorization sub( B ) = U**T*U or * sub( B ) = L*L**T. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * DESCB( CTXT_ ) must equal DESCA( CTXT_ ) * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * ABSTOL (global input) DOUBLE PRECISION * If JOBZ='V', setting ABSTOL to PDLAMCH( CONTEXT, 'U') yields * the most orthogonal eigenvectors. * * The absolute error tolerance for the eigenvalues. * An approximate eigenvalue is accepted as converged * when it is determined to lie in an interval [a,b] * of width less than or equal to * * ABSTOL + EPS * max( |a|,|b| ) , * * where EPS is the machine precision. If ABSTOL is less than * or equal to zero, then EPS*norm(T) will be used in its place, * where norm(T) is the 1-norm of the tridiagonal matrix * obtained by reducing A to tridiagonal form. * * Eigenvalues will be computed most accurately when ABSTOL is * set to twice the underflow threshold 2*PDLAMCH('S') not zero. * If this routine returns with ((MOD(INFO,2).NE.0) .OR. * (MOD(INFO/8,2).NE.0)), indicating that some eigenvalues or * eigenvectors did not converge, try setting ABSTOL to * 2*PDLAMCH('S'). * * See "Computing Small Singular Values of Bidiagonal Matrices * with Guaranteed High Relative Accuracy," by Demmel and * Kahan, LAPACK Working Note #3. * * See "On the correctness of Parallel Bisection in Floating * Point" by Demmel, Dhillon and Ren, LAPACK Working Note #70 * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M unless the user supplies * insufficient space and PDSYGVX is not able to detect this * before beginning computation. To get all the eigenvectors * requested, the user must supply both sufficient * space to hold the eigenvectors in Z (M .LE. DESCZ(N_)) * and sufficient workspace to compute them. (See LWORK below.) * PDSYGVX is always able to detect insufficient space without * computation unless RANGE .EQ. 'V'. * * W (global output) DOUBLE PRECISION array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * ORFAC (global input) DOUBLE PRECISION * Specifies which eigenvectors should be reorthogonalized. * Eigenvectors that correspond to eigenvalues which are within * tol=ORFAC*norm(A) of each other are to be reorthogonalized. * However, if the workspace is insufficient (see LWORK), * tol may be decreased until all eigenvectors to be * reorthogonalized can be stored in one process. * No reorthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) DOUBLE PRECISION array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. If an eigenvector * fails to converge, then that column of Z contains the latest * approximation to the eigenvector, and the index of the * eigenvector is returned in IFAIL. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * The row index in the global array Z indicating the first * row of sub( Z ). * * JZ (global input) INTEGER * The column index in the global array Z indicating the * first column of sub( Z ). * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension max(3,LWORK) * if JOBZ='N' WORK(1) = optimal amount of workspace * required to compute eigenvalues efficiently * if JOBZ='V' WORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors * efficiently with no guarantee on orthogonality. * If RANGE='V', it is assumed that all eigenvectors * may be required. * * LWORK (local input) INTEGER * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= 5 * N + MAX( 5 * NN, NB * ( NP0 + 1 ) ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * LWORK >= 5 * N + MAX( 5*NN, NP0 * MQ0 + 2 * NB * NB ) + * ICEIL( NEIG, NPROW*NPCOL)*NN * * The computed eigenvectors may not be orthogonal if the * minimal workspace is supplied and ORFAC is too small. * If you want to guarantee orthogonality (at the cost * of potentially poor performance) you should add * the following to LWORK: * (CLUSTERSIZE-1)*N * where CLUSTERSIZE is the number of eigenvalues in the * largest cluster, where a cluster is defined as a set of * close eigenvalues: { W(K),...,W(K+CLUSTERSIZE-1) | * W(J+1) <= W(J) + ORFAC*2*norm(A) } * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = DESCZ( MB_ ) = * DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * When LWORK is too small: * If LWORK is too small to guarantee orthogonality, * PDSYGVX attempts to maintain orthogonality in * the clusters with the smallest * spacing between the eigenvalues. * If LWORK is too small to compute all the eigenvectors * requested, no computation is performed and INFO=-23 * is returned. Note that when RANGE='V', PDSYGVX does * not know how many eigenvectors are requested until * the eigenvalues are computed. Therefore, when RANGE='V' * and as long as LWORK is large enough to allow PDSYGVX to * compute the eigenvalues, PDSYGVX will compute the * eigenvalues and as many eigenvectors as it can. * * Relationship between workspace, orthogonality & performance: * Greater performance can be achieved if adequate workspace * is provided. On the other hand, in some situations, * performance can decrease as the workspace provided * increases above the workspace amount shown below: * * For optimal performance, greater workspace may be * needed, i.e. * LWORK >= MAX( LWORK, 5 * N + NSYTRD_LWOPT, * NSYGST_LWOPT ) * Where: * LWORK, as defined previously, depends upon the number * of eigenvectors requested, and * NSYTRD_LWOPT = N + 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 3 ) * NPS * NSYGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB * * ANB = PJLAENV( DESCA( CTXT_), 3, 'PDSYTTRD', 'L', * 0, 0, 0, 0) * SQNPC = INT( SQRT( DBLE( NPROW * NPCOL ) ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * NB = DESCA( MB_ ) * NP0 = NUMROC( N, NB, 0, 0, NPROW ) * NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by * calling the subroutine BLACS_GRIDINFO. * * For large N, no extra workspace is needed, however the * biggest boost in performance comes for small N, so it * is wise to provide the extra workspace (typically less * than a Megabyte per process). * * If CLUSTERSIZE >= N/SQRT(NPROW*NPCOL), then providing * enough space to compute all the eigenvectors * orthogonally will cause serious degradation in * performance. In the limit (i.e. CLUSTERSIZE = N-1) * PDSTEIN will perform no better than DSTEIN on 1 processor. * For CLUSTERSIZE = N/SQRT(NPROW*NPCOL) reorthogonalizing * all eigenvectors will increase the total execution time * by a factor of 2 or more. * For CLUSTERSIZE > N/SQRT(NPROW*NPCOL) execution time will * grow as the square of the cluster size, all other factors * remaining equal and assuming enough workspace. Less * workspace means less reorthogonalization but faster * execution. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance on all work arrays. * Each of these values is returned in the first entry of the * corresponding work array, and no error message is issued by * PXERBLA. * * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * LIWORK >= 6 * NNP * Where: * NNP = MAX( N, NPROW*NPCOL + 1, 4 ) * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (output) INTEGER array, dimension (N) * IFAIL provides additional information when INFO .NE. 0 * If (MOD(INFO/16,2).NE.0) then IFAIL(1) indicates the order of * the smallest minor which is not positive definite. * If (MOD(INFO,2).NE.0) on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * * If neither of the above error conditions hold and JOBZ = 'V', * then the first M elements of IFAIL are set to zero. * * ICLUSTR (global output) integer array, dimension (2*NPROW*NPCOL) * This array contains indices of eigenvectors corresponding to * a cluster of eigenvalues that could not be reorthogonalized * due to insufficient workspace (see LWORK, ORFAC and INFO). * Eigenvectors corresponding to clusters of eigenvalues indexed * ICLUSTR(2*I-1) to ICLUSTR(2*I), could not be * reorthogonalized due to lack of workspace. Hence the * eigenvectors corresponding to these clusters may not be * orthogonal. ICLUSTR() is a zero terminated array. * (ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0) if and only if * K is the number of clusters * ICLUSTR is not referenced if JOBZ = 'N' * * GAP (global output) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * This array contains the gap between eigenvalues whose * eigenvectors could not be reorthogonalized. The output * values in this array correspond to the clusters indicated * by the array ICLUSTR. As a result, the dot product between * eigenvectors correspoding to the I^th cluster may be as high * as ( C * n ) / GAP(I) where C is a small constant. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: if (MOD(INFO,2).NE.0), then one or more eigenvectors * failed to converge. Their indices are stored * in IFAIL. Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding * to one or more clusters of eigenvalues could not be * reorthogonalized because of insufficient workspace. * The indices of the clusters are stored in the array * ICLUSTR. * if (MOD(INFO/4,2).NE.0), then space limit prevented * PDSYGVX from computing all of the eigenvectors * between VL and VU. The number of eigenvectors * computed is returned in NZ. * if (MOD(INFO/8,2).NE.0), then PDSTEBZ failed to * compute eigenvalues. * Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/16,2).NE.0), then B was not positive * definite. IFAIL(1) indicates the order of * the smallest minor which is not positive definite. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*), C(IC:IC+M-1,JC:JC+N-1), * and B( IB:IB+N-1, JB:JB+N-1 ) must verify some alignment properties, * namely the following expressions should be true: * * DESCA(MB_) = DESCA(NB_) * IA = IB = IZ * JA = IB = JZ * DESCA(M_) = DESCB(M_) =DESCZ(M_) * DESCA(N_) = DESCB(N_)= DESCZ(N_) * DESCA(MB_) = DESCB(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCB(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCB(RSRC_) = DESCZ(RSRC_) * DESCA(CSRC_) = DESCB(CSRC_) = DESCZ(CSRC_) * MOD( IA-1, DESCA( MB_ ) ) = 0 * MOD( JA-1, DESCA( NB_ ) ) = 0 * MOD( IB-1, DESCB( MB_ ) ) = 0 * MOD( JB-1, DESCB( NB_ ) ) = 0 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION FIVE, ZERO PARAMETER ( FIVE = 5.0D+0, ZERO = 0.0D+0 ) INTEGER IERRNPD PARAMETER ( IERRNPD = 16 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LQUERY, UPPER, VALEIG, WANTZ CHARACTER TRANS INTEGER ANB, IACOL, IAROW, IBCOL, IBROW, ICOFFA, $ ICOFFB, ICTXT, IROFFA, IROFFB, LIWMIN, LWMIN, $ LWOPT, MQ0, MYCOL, MYROW, NB, NEIG, NN, NP0, $ NPCOL, NPROW, NPS, NQ0, NSYGST_LWOPT, $ NSYTRD_LWOPT, SQNPC DOUBLE PRECISION EPS, SCALE * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, ICEIL, INDXG2P, NUMROC, PJLAENV, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DGEBR2D, DGEBS2D, $ DSCAL, PCHK1MAT, PCHK2MAT, PDPOTRF, PDSYEVX, $ PDSYNGST, PDTRMM, PDTRSM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, INT, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 900+CTXT_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2600+CTXT_ ) ELSE * * Get machine constants. * EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) * WANTZ = LSAME( JOBZ, 'V' ) UPPER = LSAME( UPLO, 'U' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, N, 4, IB, JB, DESCB, 13, INFO ) CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 26, INFO ) IF( INFO.EQ.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WORK( 1 ) = ABSTOL IF( VALEIG ) THEN WORK( 2 ) = VL WORK( 3 ) = VU ELSE WORK( 2 ) = ZERO WORK( 3 ) = ZERO END IF CALL DGEBS2D( DESCA( CTXT_ ), 'ALL', ' ', 3, 1, WORK, 3 ) ELSE CALL DGEBR2D( DESCA( CTXT_ ), 'ALL', ' ', 3, 1, WORK, 3, $ 0, 0 ) END IF IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) * * Compute the total amount of space needed * LQUERY = .FALSE. IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) $ LQUERY = .TRUE. * LIWMIN = 6*MAX( N, ( NPROW*NPCOL )+1, 4 ) * NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * IF( ( .NOT.WANTZ ) .OR. ( VALEIG .AND. ( .NOT.LQUERY ) ) ) $ THEN LWMIN = 5*N + MAX( 5*NN, NB*( NP0+1 ) ) IF( WANTZ ) THEN MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) LWOPT = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) ELSE LWOPT = LWMIN END IF NEIG = 0 ELSE IF( ALLEIG .OR. VALEIG ) THEN NEIG = N ELSE IF( INDEIG ) THEN NEIG = IU - IL + 1 END IF MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) LWMIN = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN LWOPT = LWMIN * END IF * * Compute how much workspace is needed to use the * new TRD and GST algorithms * ANB = PJLAENV( ICTXT, 3, 'PDSYTTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NSYTRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS NB = DESCA( MB_ ) NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) NSYGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB LWOPT = MAX( LWOPT, N+NSYTRD_LWOPT, NSYGST_LWOPT ) * * Version 1.0 Limitations * IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -2 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -3 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -5 ELSE IF( IROFFA.NE.0 ) THEN INFO = -7 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -8 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 900+NB_ ) ELSE IF( DESCA( M_ ).NE.DESCB( M_ ) ) THEN INFO = -( 1300+M_ ) ELSE IF( DESCA( N_ ).NE.DESCB( N_ ) ) THEN INFO = -( 1300+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1300+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCB( NB_ ) ) THEN INFO = -( 1300+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCB( RSRC_ ) ) THEN INFO = -( 1300+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCB( CSRC_ ) ) THEN INFO = -( 1300+CSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2200+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2200+CSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2200+CTXT_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -11 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -12 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -15 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -16 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -17 ELSE IF( VALEIG .AND. ( ABS( WORK( 2 )-VL ).GT.FIVE*EPS* $ ABS( VL ) ) ) THEN INFO = -14 ELSE IF( VALEIG .AND. ( ABS( WORK( 3 )-VU ).GT.FIVE*EPS* $ ABS( VU ) ) ) THEN INFO = -15 ELSE IF( ABS( WORK( 1 )-ABSTOL ).GT.FIVE*EPS*ABS( ABSTOL ) ) $ THEN INFO = -18 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -28 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -30 END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( WANTZ ) THEN IDUM1( 2 ) = ICHAR( 'V' ) ELSE IDUM1( 2 ) = ICHAR( 'N' ) END IF IDUM2( 2 ) = 2 IF( UPPER ) THEN IDUM1( 3 ) = ICHAR( 'U' ) ELSE IDUM1( 3 ) = ICHAR( 'L' ) END IF IDUM2( 3 ) = 3 IF( ALLEIG ) THEN IDUM1( 4 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 4 ) = ICHAR( 'I' ) ELSE IDUM1( 4 ) = ICHAR( 'V' ) END IF IDUM2( 4 ) = 4 IF( LQUERY ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 5 CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 9, N, 4, N, 4, IB, $ JB, DESCB, 13, 5, IDUM1, IDUM2, INFO ) CALL PCHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 26, 0, IDUM1, IDUM2, $ INFO ) END IF * IWORK( 1 ) = LIWMIN WORK( 1 ) = DBLE( LWOPT ) * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYGVX ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Form a Cholesky factorization of sub( B ). * CALL PDPOTRF( UPLO, N, B, IB, JB, DESCB, INFO ) IF( INFO.NE.0 ) THEN IWORK( 1 ) = LIWMIN WORK( 1 ) = DBLE( LWOPT ) IFAIL( 1 ) = INFO INFO = IERRNPD RETURN END IF * * Transform problem to standard eigenvalue problem and solve. * CALL PDSYNGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, WORK, LWORK, INFO ) CALL PDSYEVX( JOBZ, RANGE, UPLO, N, A, IA, JA, DESCA, VL, VU, IL, $ IU, ABSTOL, M, NZ, W, ORFAC, Z, IZ, JZ, DESCZ, WORK, $ LWORK, IWORK, LIWORK, IFAIL, ICLUSTR, GAP, INFO ) * IF( WANTZ ) THEN * * Backtransform eigenvectors to the original problem. * NEIG = M IF( IBTYPE.EQ.1 .OR. IBTYPE.EQ.2 ) THEN * * For sub( A )*x=(lambda)*sub( B )*x and * sub( A )*sub( B )*x=(lambda)*x; backtransform eigenvectors: * x = inv(L)'*y or inv(U)*y * IF( UPPER ) THEN TRANS = 'N' ELSE TRANS = 'T' END IF * CALL PDTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, $ B, IB, JB, DESCB, Z, IZ, JZ, DESCZ ) * ELSE IF( IBTYPE.EQ.3 ) THEN * * For sub( B )*sub( A )*x=(lambda)*x; * backtransform eigenvectors: x = L*y or U'*y * IF( UPPER ) THEN TRANS = 'T' ELSE TRANS = 'N' END IF * CALL PDTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, $ B, IB, JB, DESCB, Z, IZ, JZ, DESCZ ) END IF END IF * IF( SCALE.NE.ONE ) THEN CALL DSCAL( N, SCALE, W, 1 ) END IF * IWORK( 1 ) = LIWMIN WORK( 1 ) = DBLE( LWOPT ) RETURN * * End of PDSYGVX * END scalapack-2.0.2/SRC/pdsyngst.f000644 000766 000024 00000041476 10363532303 016374 0ustar00juliestaff000000 000000 SUBROUTINE PDSYNGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, LWORK, N DOUBLE PRECISION SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ), WORK( * ) * .. * * Purpose * * ======= * * PDSYNGST reduces a complex Hermitian-definite generalized * eigenproblem to standard form. * * PDSYNGST performs the same function as PDHEGST, but is based on * rank 2K updates, which are faster and more scalable than * triangular solves (the basis of PDSYNGST). * * PDSYNGST calls PDHEGST when UPLO='U', hence PDHENGST provides * improved performance only when UPLO='L', IBTYPE=1. * * PDSYNGST also calls PDHEGST when insufficient workspace is * provided, hence PDSYNGST provides improved * performance only when LWORK >= 2 * NP0 * NB + NQ0 * NB + NB * NB * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**H or L**H*sub( A )*L. * * sub( B ) must have been previously factorized as U**H*U or L*L**H by * PDPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H); * = 2 or 3: compute U*sub( A )*U**H or L**H*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**H*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**H. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PDPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * SCALE (global output) DOUBLE PRECISION * Amount by which the eigenvalues should be scaled to * compensate for the scaling performed in this routine. * At present, SCALE is always returned as 1.0, it is * returned here to allow for future enhancement. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP0 +1 ), 3 * NB ) * * When IBTYPE = 1 and UPLO = 'L', PDSYNGST provides improved * performance when LWORK >= 2 * NP0 * NB + NQ0 * NB + NB * NB * * where NB = MB_A = NB_A, * NP0 = NUMROC( N, NB, 0, 0, NPROW ), * NQ0 = NUMROC( N, NB, 0, 0, NPROW ), * * NUMROC ia a ScaLAPACK tool functions * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the * optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * * .. Parameters .. DOUBLE PRECISION ONEHALF, ONE, MONE PARAMETER ( ONEHALF = 0.5D0, ONE = 1.0D0, MONE = -1.0D0 ) INTEGER DLEN_, CTXT_, MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( DLEN_ = 9, CTXT_ = 2, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER I, IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, INDAA, INDG, INDR, INDRT, IROFFA, $ IROFFB, J, K, KB, LWMIN, LWOPT, MYCOL, MYROW, $ NB, NP0, NPCOL, NPK, NPROW, NQ0, POSTK * .. * .. Local Arrays .. INTEGER DESCAA( DLEN_ ), DESCG( DLEN_ ), $ DESCR( DLEN_ ), DESCRT( DLEN_ ), IDUM1( 2 ), $ IDUM2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL LSAME, INDXG2P, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ PDGEMM, PDLACPY, PDSYGST, PDSYMM, PDSYR2K, $ PDTRSM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) SCALE = 1.0D0 * NB = DESCA( MB_ ) * * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) LWMIN = MAX( NB*( NP0+1 ), 3*NB ) IF( IBTYPE.EQ.1 .AND. .NOT.UPPER ) THEN LWOPT = 2*NP0*NB + NQ0*NB + NB*NB ELSE LWOPT = LWMIN END IF WORK( 1 ) = DBLE( LWOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IB, $ JB, DESCB, 11, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYNGST', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * IF( IBTYPE.NE.1 .OR. UPPER .OR. LWORK.LT.LWOPT ) THEN CALL PDSYGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, INFO ) RETURN END IF * CALL DESCSET( DESCG, N, NB, NB, NB, IAROW, IACOL, ICTXT, NP0 ) CALL DESCSET( DESCR, N, NB, NB, NB, IAROW, IACOL, ICTXT, NP0 ) CALL DESCSET( DESCRT, NB, N, NB, NB, IAROW, IACOL, ICTXT, NB ) CALL DESCSET( DESCAA, NB, NB, NB, NB, IAROW, IACOL, ICTXT, NB ) * INDG = 1 INDR = INDG + DESCG( LLD_ )*NB INDAA = INDR + DESCR( LLD_ )*NB INDRT = INDAA + DESCAA( LLD_ )*NB * DO 30 K = 1, N, NB * KB = MIN( N-K+1, NB ) POSTK = K + KB NPK = N - POSTK + 1 * * CALL PDLACPY( 'A', N-POSTK+1, KB, B, POSTK+IB-1, K+JB-1, DESCB, $ WORK( INDG ), POSTK, 1, DESCG ) CALL PDLACPY( 'A', N-POSTK+1, KB, A, POSTK+IA-1, K+JA-1, DESCA, $ WORK( INDR ), POSTK, 1, DESCR ) CALL PDLACPY( 'A', KB, K-1, A, K+IA-1, JA, DESCA, $ WORK( INDRT ), 1, 1, DESCRT ) * CALL PDLACPY( 'L', KB, KB, A, K+IA-1, K+JA-1, DESCA, $ WORK( INDR ), K, 1, DESCR ) CALL PDTRSM( 'Right', 'L', 'N', 'N', NPK, KB, MONE, B, K+IB-1, $ K+JB-1, DESCB, WORK( INDG ), POSTK, 1, DESCG ) * CALL PDSYMM( 'Right', 'L', NPK, KB, ONEHALF, A, K+IA-1, K+JA-1, $ DESCA, WORK( INDG ), POSTK, 1, DESCG, ONE, $ WORK( INDR ), POSTK, 1, DESCR ) * CALL PDSYR2K( 'Lower', 'No T', NPK, KB, ONE, WORK( INDG ), $ POSTK, 1, DESCG, WORK( INDR ), POSTK, 1, DESCR, $ ONE, A, POSTK+IA-1, POSTK+JA-1, DESCA ) * CALL PDGEMM( 'No T', 'No Conj', NPK, K-1, KB, ONE, $ WORK( INDG ), POSTK, 1, DESCG, WORK( INDRT ), 1, $ 1, DESCRT, ONE, A, POSTK+IA-1, JA, DESCA ) * CALL PDSYMM( 'Right', 'L', NPK, KB, ONE, WORK( INDR ), K, 1, $ DESCR, WORK( INDG ), POSTK, 1, DESCG, ONE, A, $ POSTK+IA-1, K+JA-1, DESCA ) * CALL PDTRSM( 'Left', 'Lower', 'No Conj', 'Non-unit', KB, K-1, $ ONE, B, K+IB-1, K+JB-1, DESCB, A, K+IA-1, JA, $ DESCA ) * CALL PDLACPY( 'L', KB, KB, A, K+IA-1, K+JA-1, DESCA, $ WORK( INDAA ), 1, 1, DESCAA ) * IF( MYROW.EQ.DESCAA( RSRC_ ) .AND. MYCOL.EQ.DESCAA( CSRC_ ) ) $ THEN DO 20 I = 1, KB DO 10 J = 1, I WORK( INDAA+J-1+( I-1 )*DESCAA( LLD_ ) ) $ = WORK( INDAA+I-1+( J-1 )*DESCAA( LLD_ ) ) 10 CONTINUE 20 CONTINUE END IF * CALL PDTRSM( 'Left', 'Lower', 'No Conj', 'Non-unit', KB, KB, $ ONE, B, K+IB-1, K+JB-1, DESCB, WORK( INDAA ), 1, $ 1, DESCAA ) * CALL PDTRSM( 'Right', 'Lower', 'Conj', 'Non-unit', KB, KB, ONE, $ B, K+IB-1, K+JB-1, DESCB, WORK( INDAA ), 1, 1, $ DESCAA ) * CALL PDLACPY( 'L', KB, KB, WORK( INDAA ), 1, 1, DESCAA, A, $ K+IA-1, K+JA-1, DESCA ) * CALL PDTRSM( 'Right', 'Lower', 'Conj', 'Non-unit', NPK, KB, $ ONE, B, K+IB-1, K+JB-1, DESCB, A, POSTK+IA-1, $ K+JA-1, DESCA ) * DESCR( CSRC_ ) = MOD( DESCR( CSRC_ )+1, NPCOL ) DESCG( CSRC_ ) = MOD( DESCG( CSRC_ )+1, NPCOL ) DESCRT( RSRC_ ) = MOD( DESCRT( RSRC_ )+1, NPROW ) DESCAA( RSRC_ ) = MOD( DESCAA( RSRC_ )+1, NPROW ) DESCAA( CSRC_ ) = MOD( DESCAA( CSRC_ )+1, NPCOL ) 30 CONTINUE * WORK( 1 ) = DBLE( LWOPT ) * RETURN END scalapack-2.0.2/SRC/pdsyntrd.f000644 000766 000024 00000050530 10363532303 016357 0ustar00juliestaff000000 000000 SUBROUTINE PDSYNTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * Bugs * ==== * * * Support for UPLO='U' is limited to calling the old, slow, PDSYTRD * code. * * * Purpose * * ======= * * PDSYNTRD is a prototype version of PDSYTRD which uses tailored * codes (either the serial, DSYTRD, or the parallel code, PDSYTTRD) * when the workspace provided by the user is adequate. * * * PDSYNTRD reduces a real symmetric matrix sub( A ) to symmetric * tridiagonal form T by an orthogonal similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Features * ======== * * PDSYNTRD is faster than PDSYTRD on almost all matrices, * particularly small ones (i.e. N < 500 * sqrt(P) ), provided that * enough workspace is available to use the tailored codes. * * The tailored codes provide performance that is essentially * independent of the input data layout. * * The tailored codes place no restrictions on IA, JA, MB or NB. * At present, IA, JA, MB and NB are restricted to those values allowed * by PDSYTRD to keep the interface simple. These restrictions are * documented below. (Search for "restrictions".) * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * symmetric distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the orthogonal matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK( 1 ) returns the optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP +1 ), 3 * NB ) * * For optimal performance, greater workspace is needed, i.e. * LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS + 4 ) * NPS * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PDSYTTRD', 'L', 0, 0, 0, 0 ) * SQNPC = INT( SQRT( DBLE( NPROW * NPCOL ) ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA .AND. IROFFA.EQ.0 ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER COLCTOP, ROWCTOP INTEGER ANB, CTXTB, I, IACOL, IAROW, ICOFFA, ICTXT, $ IINFO, INDB, INDD, INDE, INDTAU, INDW, IPW, $ IROFFA, J, JB, JX, K, KK, LLWORK, LWMIN, MINSZ, $ MYCOL, MYCOLB, MYROW, MYROWB, NB, NP, NPCOL, $ NPCOLB, NPROW, NPROWB, NPS, NQ, ONEPMIN, SQNPC, $ TTLWMIN * .. * .. Local Arrays .. INTEGER DESCB( DLEN_ ), DESCW( DLEN_ ), IDUM1( 2 ), $ IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, CHK1MAT, DESCSET, DSYTRD, $ IGAMN2D, PCHK1MAT, PDELSET, PDLAMR1D, PDLATRD, $ PDSYR2K, PDSYTD2, PDSYTTRD, PDTRMR2D, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC, PJLAENV EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC, PJLAENV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, INT, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = MAX( 1, NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) LWMIN = MAX( ( NP+1 )*NB, 3*NB ) ANB = PJLAENV( ICTXT, 3, 'PDSYTTRD', 'L', 0, 0, 0, 0 ) MINSZ = PJLAENV( ICTXT, 5, 'PDSYTTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) TTLWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS * WORK( 1 ) = DBLE( TTLWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 * * The following two restrictions are not necessary provided * that either of the tailored codes are used. * ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 600+NB_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 11 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYNTRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * ONEPMIN = N*N + 3*N + 1 LLWORK = LWORK CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LLWORK, 1, 1, -1, -1, -1, $ -1 ) * * * * Use the serial, LAPACK, code: DTRD on small matrices if we * we have enough space. * NPROWB = 0 IF( ( N.LT.MINSZ .OR. SQNPC.EQ.1 ) .AND. LLWORK.GE.ONEPMIN .AND. $ .NOT.UPPER ) THEN NPROWB = 1 NPS = N ELSE IF( LLWORK.GE.TTLWMIN .AND. .NOT.UPPER ) THEN NPROWB = SQNPC END IF END IF * IF( NPROWB.GE.1 ) THEN NPCOLB = NPROWB SQNPC = NPROWB INDB = 1 INDD = INDB + NPS*NPS INDE = INDD + NPS INDTAU = INDE + NPS INDW = INDTAU + NPS LLWORK = LLWORK - INDW + 1 * CALL BLACS_GET( ICTXT, 10, CTXTB ) CALL BLACS_GRIDINIT( CTXTB, 'Row major', SQNPC, SQNPC ) CALL BLACS_GRIDINFO( CTXTB, NPROWB, NPCOLB, MYROWB, MYCOLB ) CALL DESCSET( DESCB, N, N, 1, 1, 0, 0, CTXTB, NPS ) * CALL PDTRMR2D( UPLO, 'N', N, N, A, IA, JA, DESCA, WORK( INDB ), $ 1, 1, DESCB, ICTXT ) * * * Only those processors in context CTXTB are needed for a while * IF( NPROWB.GT.0 ) THEN * IF( NPROWB.EQ.1 ) THEN CALL DSYTRD( UPLO, N, WORK( INDB ), NPS, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDW ), $ LLWORK, INFO ) ELSE * CALL PDSYTTRD( 'L', N, WORK( INDB ), 1, 1, DESCB, $ WORK( INDD ), WORK( INDE ), $ WORK( INDTAU ), WORK( INDW ), LLWORK, $ INFO ) * END IF END IF * * All processors participate in moving the data back to the * way that PDSYNTRD expects it. * CALL PDLAMR1D( N-1, WORK( INDE ), 1, 1, DESCB, E, 1, JA, $ DESCA ) * CALL PDLAMR1D( N, WORK( INDD ), 1, 1, DESCB, D, 1, JA, DESCA ) * CALL PDLAMR1D( N, WORK( INDTAU ), 1, 1, DESCB, TAU, 1, JA, $ DESCA ) * CALL PDTRMR2D( UPLO, 'N', N, N, WORK( INDB ), 1, 1, DESCB, A, $ IA, JA, DESCA, ICTXT ) * IF( MYROWB.GE.0 ) $ CALL BLACS_GRIDEXIT( CTXTB ) * ELSE * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPW = NP*NB + 1 * IF( UPPER ) THEN * * Reduce the upper triangle of sub( A ). * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, $ INDXG2P( JA+N-KK, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ), ICTXT, MAX( 1, NP ) ) * DO 10 K = N - KK + 1, NB + 1, -NB JB = MIN( N-K+1, NB ) I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part of * the matrix * CALL PDLATRD( UPLO, K+JB-1, JB, A, IA, JA, DESCA, D, E, $ TAU, WORK, 1, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(IA:I-1,JA:J-1), using an * update of the form: * A(IA:I-1,JA:J-1) := A(IA:I-1,JA:J-1) - V*W' - W*V' * CALL PDSYR2K( UPLO, 'No transpose', K-1, JB, -ONE, A, IA, $ J, DESCA, WORK, 1, 1, DESCW, ONE, A, IA, $ JA, DESCA ) * * Copy last superdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J, NB, 0, IACOL, NPCOL ), NQ ) CALL PDELSET( A, I-1, J, DESCA, E( JX ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+NPCOL-1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PDSYTD2( UPLO, MIN( N, NB ), A, IA, JA, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) * ELSE * * Reduce the lower triangle of sub( A ) * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) * DO 20 K = 1, N - NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part * of the matrix * CALL PDLATRD( UPLO, N-K+1, NB, A, I, J, DESCA, D, E, TAU, $ WORK, K, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(I+NB:IA+N-1,I+NB:IA+N-1), * using an update of the form: A(I+NB:IA+N-1,I+NB:IA+N-1) := * A(I+NB:IA+N-1,I+NB:IA+N-1) - V*W' - W*V' * CALL PDSYR2K( UPLO, 'No transpose', N-K-NB+1, NB, -ONE, $ A, I+NB, J, DESCA, WORK, K+NB, 1, DESCW, $ ONE, A, I+NB, J+NB, DESCA ) * * Copy last subdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J+NB-1, NB, 0, IACOL, NPCOL ), NQ ) CALL PDELSET( A, I+NB, J+NB-1, DESCA, E( JX ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * 20 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PDSYTD2( UPLO, KK, A, IA+K-1, JA+K-1, DESCA, D, E, TAU, $ WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * END IF * WORK( 1 ) = DBLE( TTLWMIN ) * RETURN * * End of PDSYNTRD * END scalapack-2.0.2/SRC/pdsytd2.f000644 000766 000024 00000042231 10363532303 016100 0ustar00juliestaff000000 000000 SUBROUTINE PDSYTD2( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDSYTD2 reduces a real symmetric matrix sub( A ) to symmetric * tridiagonal form T by an orthogonal similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * symmetric distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the orthogonal matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*N. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION HALF, ONE, ZERO PARAMETER ( HALF = 0.5D+0, ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER IACOL, IAROW, ICOFFA, ICTXT, II, IK, IROFFA, J, $ JJ, JK, JN, LDA, LWMIN, MYCOL, MYROW, NPCOL, $ NPROW DOUBLE PRECISION ALPHA, TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DAXPY, $ DGEBR2D, DGEBS2D, DLARFG, $ DSYMV, DSYR2, INFOG2L, PXERBLA * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DDOT EXTERNAL LSAME, DDOT * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) LWMIN = 3 * N * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYTD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Compute local information * LDA = DESCA( LLD_ ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) * IF( UPPER ) THEN * * Process(IAROW, IACOL) owns block to be reduced * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Reduce the upper triangle of sub( A ) * DO 10 J = N-1, 1, -1 IK = II + J - 1 JK = JJ + J - 1 * * Generate elementary reflector H(i) = I - tau * v * v' * to annihilate A(IA:IA+J-1,JA:JA+J-1) * CALL DLARFG( J, A( IK+JK*LDA ), A( II+JK*LDA ), 1, $ TAUI ) E( JK+1 ) = A( IK+JK*LDA ) * IF( TAUI.NE.ZERO ) THEN * * Apply H(i) from both sides to * A(IA:IA+J-1,JA:JA+J-1) * A( IK+JK*LDA ) = ONE * * Compute x := tau * A * v storing x in TAU(1:i) * CALL DSYMV( UPLO, J, TAUI, A( II+(JJ-1)*LDA ), $ LDA, A( II+JK*LDA ), 1, ZERO, $ TAU( JJ ), 1 ) * * Compute w := x - 1/2 * tau * (x'*v) * v * ALPHA = -HALF*TAUI*DDOT( J, TAU( JJ ), 1, $ A( II+JK*LDA ), 1 ) CALL DAXPY( J, ALPHA, A( II+JK*LDA ), 1, $ TAU( JJ ), 1 ) * * Apply the transformation as a rank-2 update: * A := A - v * w' - w * v' * CALL DSYR2( UPLO, J, -ONE, A( II+JK*LDA ), 1, $ TAU( JJ ), 1, A( II+(JJ-1)*LDA ), $ LDA ) A( IK+JK*LDA ) = E( JK+1 ) END IF * * Copy D, E, TAU to broadcast them columnwise. * D( JK+1 ) = A( IK+1+JK*LDA ) WORK( J+1 ) = D( JK+1 ) WORK( N+J+1 ) = E( JK+1 ) TAU( JK+1 ) = TAUI WORK( 2*N+J+1 ) = TAU( JK+1 ) * 10 CONTINUE D( JJ ) = A( II+(JJ-1)*LDA ) WORK( 1 ) = D( JJ ) WORK( N+1 ) = ZERO WORK( 2*N+1 ) = ZERO * CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1 ) * ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1, $ IAROW, IACOL ) DO 20 J = 2, N JN = JJ + J - 1 D( JN ) = WORK( J ) E( JN ) = WORK( N+J ) TAU( JN ) = WORK( 2*N+J ) 20 CONTINUE D( JJ ) = WORK( 1 ) END IF END IF * ELSE * * Process (IAROW, IACOL) owns block to be factorized * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Reduce the lower triangle of sub( A ) * DO 30 J = 1, N - 1 IK = II + J - 1 JK = JJ + J - 1 * * Generate elementary reflector H(i) = I - tau * v * v' * to annihilate A(IA+J-JA+2:IA+N-1,JA+J-1) * CALL DLARFG( N-J, A( IK+1+(JK-1)*LDA ), $ A( IK+2+(JK-1)*LDA ), 1, TAUI ) E( JK ) = A( IK+1+(JK-1)*LDA ) * IF( TAUI.NE.ZERO ) THEN * * Apply H(i) from both sides to * A(IA+J-JA+1:IA+N-1,JA+J+1:JA+N-1) * A( IK+1+(JK-1)*LDA ) = ONE * * Compute x := tau * A * v storing y in TAU(i:n-1) * CALL DSYMV( UPLO, N-J, TAUI, A( IK+1+JK*LDA ), $ LDA, A( IK+1+(JK-1)*LDA ), 1, $ ZERO, TAU( JK ), 1 ) * * Compute w := x - 1/2 * tau * (x'*v) * v * ALPHA = -HALF*TAUI*DDOT( N-J, TAU( JK ), 1, $ A( IK+1+(JK-1)*LDA ), 1 ) CALL DAXPY( N-J, ALPHA, A( IK+1+(JK-1)*LDA ), $ 1, TAU( JK ), 1 ) * * Apply the transformation as a rank-2 update: * A := A - v * w' - w * v' * CALL DSYR2( UPLO, N-J, -ONE, $ A( IK+1+(JK-1)*LDA ), 1, $ TAU( JK ), 1, A( IK+1+JK*LDA ), $ LDA ) A( IK+1+(JK-1)*LDA ) = E( JK ) END IF * * Copy D(JK), E(JK), TAU(JK) to broadcast them * columnwise. * D( JK ) = A( IK+(JK-1)*LDA ) WORK( J ) = D( JK ) WORK( N+J ) = E( JK ) TAU( JK ) = TAUI WORK( 2*N+J ) = TAU( JK ) 30 CONTINUE JN = JJ + N - 1 D( JN ) = A( II+N-1+(JN-1)*LDA ) WORK( N ) = D( JN ) TAU( JN ) = ZERO WORK( 2*N ) = ZERO * CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK, $ 1 ) * ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK, $ 1, IAROW, IACOL ) DO 40 J = 1, N - 1 JN = JJ + J - 1 D( JN ) = WORK( J ) E( JN ) = WORK( N+J ) TAU( JN ) = WORK( 2*N+J ) 40 CONTINUE JN = JJ + N - 1 D( JN ) = WORK( N ) TAU( JN ) = ZERO END IF END IF END IF * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDSYTD2 * END scalapack-2.0.2/SRC/pdsytrd.f000644 000766 000024 00000040161 10363532303 016200 0ustar00juliestaff000000 000000 SUBROUTINE PDSYTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDSYTRD reduces a real symmetric matrix sub( A ) to symmetric * tridiagonal form T by an orthogonal similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * symmetric distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the orthogonal matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP +1 ), 3 * NB ) * * where NB = MB_A = NB_A, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA .AND. IROFFA.EQ.0 ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, IINFO, IPW, $ IROFFA, J, JB, JX, K, KK, LWMIN, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PDLATRD, PDSYR2K, PDSYTD2, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = MAX( 1, NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) LWMIN = MAX( (NP+1)*NB, 3*NB ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 11 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYTRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPW = NP * NB + 1 * IF( UPPER ) THEN * * Reduce the upper triangle of sub( A ). * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, INDXG2P( JA+N-KK, $ NB, MYCOL, DESCA( CSRC_ ), NPCOL ), ICTXT, $ MAX( 1, NP ) ) * DO 10 K = N-KK+1, NB+1, -NB JB = MIN( N-K+1, NB ) I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part of * the matrix * CALL PDLATRD( UPLO, K+JB-1, JB, A, IA, JA, DESCA, D, E, TAU, $ WORK, 1, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(IA:I-1,JA:J-1), using an * update of the form: * A(IA:I-1,JA:J-1) := A(IA:I-1,JA:J-1) - V*W' - W*V' * CALL PDSYR2K( UPLO, 'No transpose', K-1, JB, -ONE, A, IA, J, $ DESCA, WORK, 1, 1, DESCW, ONE, A, IA, JA, $ DESCA ) * * Copy last superdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J, NB, 0, IACOL, NPCOL ), NQ ) CALL PDELSET( A, I-1, J, DESCA, E( JX ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PDSYTD2( UPLO, MIN( N, NB ), A, IA, JA, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) * ELSE * * Reduce the lower triangle of sub( A ) * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) * DO 20 K = 1, N-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part * of the matrix * CALL PDLATRD( UPLO, N-K+1, NB, A, I, J, DESCA, D, E, TAU, $ WORK, K, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(I+NB:IA+N-1,I+NB:IA+N-1), * using an update of the form: A(I+NB:IA+N-1,I+NB:IA+N-1) := * A(I+NB:IA+N-1,I+NB:IA+N-1) - V*W' - W*V' * CALL PDSYR2K( UPLO, 'No transpose', N-K-NB+1, NB, -ONE, A, $ I+NB, J, DESCA, WORK, K+NB, 1, DESCW, ONE, A, $ I+NB, J+NB, DESCA ) * * Copy last subdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J+NB-1, NB, 0, IACOL, NPCOL ), NQ ) CALL PDELSET( A, I+NB, J+NB-1, DESCA, E( JX ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + 1, NPCOL ) * 20 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PDSYTD2( UPLO, KK, A, IA+K-1, JA+K-1, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDSYTRD * END scalapack-2.0.2/SRC/pdsyttrd.f000644 000766 000024 00000122635 11750130340 016367 0ustar00juliestaff000000 000000 SUBROUTINE PDSYTTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * * Purpose * * ======= * * PDSYTTRD reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding * process and memory location. * * Let A be a generic term for any 2D block cyclicly distributed * array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- ----------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, * indicating the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to * distribute the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to * distribute the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the * first row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCp(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCp( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCq( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes * of its process row. * The values of LOCp() and LOCq() may be determined via a call to * the ScaLAPACK tool function, NUMROC: * LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCq(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1) * if UPLO = 'U', LOCq(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension * LOCq(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace) DOUBLE PRECISION array, dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal workspace * * LWORK (local input) INTEGER * The dimension of the array WORK. * LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + NPS * Where: * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) * ANB = PJLAENV( DESCA( CTXT_ ), 3, 'PDSYTTRD', 'L', 0, 0, * 0, 0 ) * * NUMROC is a ScaLAPACK tool function; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of * elementary reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of * elementary reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and * vi denotes an element of the vector defining H(i). * * Data storage requirements * ========================= * * PDSYTTRD is not intended to be called directly. All users are * encourage to call PDSYTRD which will then call PDHETTRD if * appropriate. A must be in cyclic format (i.e. MB = NB = 1), * the process grid must be square ( i.e. NPROW = NPCOL ) and * only lower triangular storage is supported. * * Local variables * =============== * * PDSYTTRD uses five local arrays: * WORK ( InV ) dimension ( NP, ANB+1): array V * WORK ( InH ) dimension ( NP, ANB+1): array H * WORK ( InVT ) dimension ( NQ, ANB+1): transpose of the array V * WORK ( InHT ) dimension ( NQ, ANB+1): transpose of the array H * WORK ( InVTT ) dimension ( NQ, 1): transpose of the array VT * * Arrays V and H are replicated across all processor columns. * Arrays V^T and H^T are replicated across all processor rows. * * WORK ( InVT ), or V^T, is stored as a tall skinny * array ( NQ x ANB-1 ) for efficiency. Since only the lower * triangular portion of A is updated, Av is computed as: * tril(A) * v + v^T * tril(A,-1). This is performed as * two local triangular matrix-vector multiplications (both in * MVR2) followed by a transpose and a sum across the columns. * In the local computation, WORK( InVT ) is used to compute * tril(A) * v and WORK( InV ) is used to compute * v^T * tril(A,-1) * * The following variables are global indices into A: * INDEX: The current global row and column number. * MAXINDEX: The global row and column for the first row and * column in the trailing block of A. * LIIB, LIJB: The first row, column in * * The following variables point into the arrays A, V, H, V^T, H^T: * BINDEX =INDEX-MININDEX: The column index in V, H, V^T, H^T. * LII: local index I: The local row number for row INDEX * LIJ: local index J: The local column number for column INDEX * LIIP1: local index I+1: The local row number for row INDEX+1 * LIJP1: local index J+1: The local col number for col INDEX+1 * LTLI: lower triangular local index I: The local row for the * upper left entry in tril( A(INDEX, INDEX) ) * LTLIP1: lower triangular local index I+1: The local row for the * upper left entry in tril( A(INDEX+1, INDEX+1) ) * * Details: The distinction between LII and LTLI (and between * LIIP1 and LTLIP1) is subtle. Within the current processor * column (i.e. MYCOL .eq. CURCOL) they are the same. However, * on some processors, A( LII, LIJ ) points to an element * above the diagonal, on these processors, LTLI = LII+1. * * The following variables give the number of rows and/or columns * in various matrices: * NP: The number of local rows in A( 1:N, 1:N ) * NQ: The number of local columns in A( 1:N, 1:N ) * NPM0: The number of local rows in A( INDEX:N, INDEX:N ) * NQM0: The number of local columns in A( INDEX:N, INDEX:N ) * NPM1: The number of local rows in A( INDEX+1:N, INDEX:N ) * NQM1: The number of local columns in A( INDEX+1:N, INDEX:N ) * LTNM0: The number of local rows & columns in * tril( A( INDEX:N, INDEX:N ) ) * LTNM1: The number of local rows & columns in * tril( A( INDEX+1:N, INDEX+1:N ) ) * NOTE: LTNM0 == LTNM1 on all processors except the diagonal * processors, i.e. those where MYCOL == MYROW. * * Invariants: * NP = NPM0 + LII - 1 * NQ = NQM0 + LIJ - 1 * NP = NPM1 + LIIP1 - 1 * NQ = NQM1 + LIJP1 - 1 * NP = LTLI + LTNM0 - 1 * NP = LTLIP1 + LTNM1 - 1 * * Temporary variables. The following variables are used within * a few lines after they are set and do hold state from one loop * iteration to the next: * * The matrix A: * The matrix A does not hold the same values that it would * in an unblocked code nor the values that it would hold in * in a blocked code. * * The value of A is confusing. It is easiest to state the * difference between trueA and A at the point that MVR2 is called, * so we will start there. * * Let trueA be the value that A would * have at a given point in an unblocked code and A * be the value that A has in this code at the same point. * * At the time of the call to MVR2, * trueA = A + V' * H + H' * V * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ). * * At the bottom of the inner loop, * trueA = A + V' * H + H' * V + v' * h + h' * v * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ) and * v = V( liip1:N, BINDEX+1 ) and * h = H( liip1:N, BINDEX+1 ) * * At the top of the loop, BINDEX gets incremented, hence: * trueA = A + V' * H + H' * V + v' * h + h' * v * where H = H( MAXINDEX:N, 1:BINDEX-1 ) and * V = V( MAXINDEX:N, 1:BINDEX-1 ) and * v = V( liip1:N, BINDEX ) and * h = H( liip1:N, BINDEX ) * * * A gets updated at the bottom of the outer loop * After this update, trueA = A + v' * h + h' * v * where v = V( liip1:N, BINDEX ) and * h = H( liip1:N, BINDEX ) and BINDEX = 0 * Indeed, the previous loop invariant as stated above for the * top of the loop still holds, but with BINDEX = 0, H and V * are null matrices. * * After the current column of A is updated, * trueA( INDEX, INDEX:N ) = A( INDEX, INDEX:N ) * the rest of A is untouched. * * After the current block column of A is updated, * trueA = A + V' * H + H' * V * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ) * * This brings us back to the point at which mvr2 is called. * * * Details of the parallelization: * * We delay spreading v across to all processor columns (which * would naturally happen at the bottom of the loop) in order to * combine the spread of v( : , i-1 ) with the spread of h( : , i ) * * In order to compute h( :, i ), we must update A( :, i ) * which means that the processor column owning A( :, i ) must * have: c, tau, v( i, i ) and h( i, i ). * * The traditional * way of computing v (and the one used in pzlatrd.f and * zlatrd.f) is: * v = tau * v * c = v' * h * alpha = - tau * c / 2 * v = v + alpha * h * However, the traditional way of computing v requires that tau * be broadcast to all processors in the current column (to compute * v = tau * v) and then a sum-to-all is required (to * compute v' * h ). We use the following formula instead: * c = v' * h * v = tau * ( v - c * tau' * h / 2 ) * The above formula allows tau to be spread down in the * same call to DGSUM2D which performs the sum-to-all of c. * * The computation of v, which could be performed in any processor * column (or other procesor subsets), is performed in the * processor column that owns A( :, i+1 ) so that A( :, i+1 ) * can be updated prior to spreading v across. * * We keep the block column of A up-to-date to minimize the * work required in updating the current column of A. Updating * the block column of A is reasonably load balanced whereas * updating the current column of A is not (only the current * processor column is involved). * * In the following overview of the steps performed, M in the * margin indicates message traffic and C indicates O(n^2 nb/sqrt(p)) * or more flops per processor. * * Inner loop: * A( index:n, index ) -= ( v * ht(bindex) + h * vt( bindex) ) *M h = house( A(index:n, index) ) *M Spread v, h across *M vt = v^T; ht = h^T * A( index+1:n, index+1:maxindex ) -= * ( v * ht(index+1:maxindex) + h *vt(index+1:maxindex) ) *C v = tril(A) * h; vt = ht * tril(A,-1) *MorC v = v - H*V*h - V*H*h *M v = v + vt^T *M c = v' * h * v = tau * ( v - c * tau' * h / 2 ) *C A = A - H*V - V*H * * * * ================================================================= * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D0 ) DOUBLE PRECISION Z_ONE, Z_NEGONE, Z_ZERO PARAMETER ( Z_ONE = 1.0D0, Z_NEGONE = -1.0D0, $ Z_ZERO = 0.0D0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * * * .. Local Scalars .. * * LOGICAL BALANCED, INTERLEAVE, TWOGEMMS, UPPER INTEGER ANB, BINDEX, CURCOL, CURROW, I, ICTXT, INDEX, $ INDEXA, INDEXINH, INDEXINV, INH, INHB, INHT, $ INHTB, INTMP, INV, INVB, INVT, INVTB, J, LDA, $ LDV, LDZG, LII, LIIB, LIIP1, LIJ, LIJB, LIJP1, $ LTLIP1, LTNM1, LWMIN, MAXINDEX, MININDEX, $ MYCOL, MYFIRSTROW, MYROW, MYSETNUM, NBZG, NP, $ NPB, NPCOL, NPM0, NPM1, NPROW, NPS, NPSET, NQ, $ NQB, NQM1, NUMROWS, NXTCOL, NXTROW, PBMAX, $ PBMIN, PBSIZE, PNB, ROWSPERPROC DOUBLE PRECISION ALPHA, BETA, C, CONJTOPH, CONJTOPV, NORM, $ ONEOVERBETA, SAFMAX, SAFMIN, TOPH, TOPNV, $ TOPTAU, TOPV * .. * .. Local Arrays .. * * * * INTEGER IDUM1( 1 ), IDUM2( 1 ) DOUBLE PRECISION CC( 3 ), DTMP( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DCOMBNRM2, DGEBR2D, $ DGEBS2D, DGEMM, DGEMV, DGERV2D, DGESD2D, $ DGSUM2D, DLAMOV, DSCAL, DTRMVT, PCHK1MAT, $ PDTREECOMB, PXERBLA * .. * .. External Functions .. * LOGICAL LSAME INTEGER ICEIL, NUMROC, PJLAENV DOUBLE PRECISION DNRM2, PDLAMCH EXTERNAL LSAME, ICEIL, NUMROC, PJLAENV, DNRM2, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD, SIGN, SQRT * .. * * * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * * * Further details * =============== * * At the top of the loop, v and nh have been computed but not * spread across. Hence, A is out-of-date even after the * rank 2k update. Furthermore, we compute the next v before * nh is spread across. * * I claim that if we used a sum-to-all on NV, by summing CC within * each column, that we could compute NV locally and could avoid * spreading V across. Bruce claims that sum-to-all can be made * to cost no more than sum-to-one on the Paragon. If that is * true, this would be a win. But, * the BLACS sum-to-all is just a sum-to-one followed by a broadcast, * and hence the present scheme is better for now. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * SAFMAX = SQRT( PDLAMCH( ICTXT, 'O' ) ) / N SAFMIN = SQRT( PDLAMCH( ICTXT, 'S' ) ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE * * Here we set execution options for PDSYTTRD * PNB = PJLAENV( ICTXT, 2, 'PDSYTTRD', 'L', 0, 0, 0, 0 ) ANB = PJLAENV( ICTXT, 3, 'PDSYTTRD', 'L', 0, 0, 0, 0 ) * INTERLEAVE = ( PJLAENV( ICTXT, 4, 'PDSYTTRD', 'L', 1, 0, 0, $ 0 ).EQ.1 ) TWOGEMMS = ( PJLAENV( ICTXT, 4, 'PDSYTTRD', 'L', 2, 0, 0, $ 0 ).EQ.1 ) BALANCED = ( PJLAENV( ICTXT, 4, 'PDSYTTRD', 'L', 3, 0, 0, $ 0 ).EQ.1 ) * CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) * * UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 .AND. DESCA( NB_ ).NE.1 ) $ INFO = 600 + NB_ IF( INFO.EQ.0 ) THEN * * * Here is the arithmetic: * Let maxnpq = max( np, nq, 2 * ANB ) * LDV = 4 * max( np, nq ) + 2 * LWMIN = 2 * ( ANB + 1 ) * LDV + MAX( np, 2 * ANB ) * = 2 * ( ANB + 1 ) * ( 4 * NPS + 2 ) + NPS * * This overestimates memory requirements when ANB > NP/2 * Memory requirements are lower when interleave = .false. * Hence, we could have two sets of memory requirements, * one for interleave and one for * * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS * WORK( 1 ) = DBLE( LWMIN ) IF( .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IA.NE.1 ) THEN INFO = -4 ELSE IF( JA.NE.1 ) THEN INFO = -5 ELSE IF( NPROW.NE.NPCOL ) THEN INFO = -( 600+CTXT_ ) ELSE IF( DESCA( DTYPE_ ).NE.1 ) THEN INFO = -( 600+DTYPE_ ) ELSE IF( DESCA( MB_ ).NE.1 ) THEN INFO = -( 600+MB_ ) ELSE IF( DESCA( NB_ ).NE.1 ) THEN INFO = -( 600+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.0 ) THEN INFO = -( 600+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.0 ) THEN INFO = -( 600+CSRC_ ) ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 * CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYTTRD', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * * Reduce the lower triangle of sub( A ) NP = NUMROC( N, 1, MYROW, 0, NPROW ) NQ = NUMROC( N, 1, MYCOL, 0, NPCOL ) * NXTROW = 0 NXTCOL = 0 * LIIP1 = 1 LIJP1 = 1 NPM1 = NP NQM1 = NQ * LDA = DESCA( LLD_ ) ICTXT = DESCA( CTXT_ ) * * * * Miscellaneous details: * Put tau, D and E in the right places * Check signs * Place all the arrays in WORK, control their placement * in memory. * * * * Loop invariants * A(LIIP1, LIJ) points to the first element of A(I+1,J) * NPM1,NQM1 = the number of rows, cols in A( LII+1:N,LIJ+1:N ) * A(LII:N,LIJ:N) is one step out of date. * proc( CURROW, CURCOL ) owns A(LII,LIJ) * proc( NXTROW, CURCOL ) owns A(LIIP1,LIJ) * INH = 1 * IF( INTERLEAVE ) THEN * * H and V are interleaved to minimize memory movement * LDV has to be twice as large to accomodate interleaving. * In addition, LDV is doubled again to allow v, h and * toptau to be spreaad across and transposed in a * single communication operation with minimum memory * movement. * * We could reduce LDV back to 2*MAX(NPM1,NQM1) * by increasing the memory movement required in * the spread and transpose of v, h and toptau. * However, since the non-interleaved path already * provides a mear minimum memory requirement option, * we did not provide this additional path. * LDV = 4*( MAX( NPM1, NQM1 ) ) + 2 * INH = 1 * INV = INH + LDV / 2 INVT = INH + ( ANB+1 )*LDV * INHT = INVT + LDV / 2 INTMP = INVT + LDV*( ANB+1 ) * ELSE LDV = MAX( NPM1, NQM1 ) * INHT = INH + LDV*( ANB+1 ) INV = INHT + LDV*( ANB+1 ) * * The code works without this +1, but only because of a * coincidence. Without the +1, WORK(INVT) gets trashed, but * WORK(INVT) is only used once and when it is used, it is * multiplied by WORK( INH ) which is zero. Hence, the fact * that WORK(INVT) is trashed has no effect. * INVT = INV + LDV*( ANB+1 ) + 1 INTMP = INVT + LDV*( 2*ANB ) * END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDSYTTRD', -INFO ) WORK( 1 ) = DBLE( LWMIN ) RETURN END IF * * * The satisfies the loop invariant: trueA = A - V * HT - H * VT, * (where V, H, VT and HT all have BINDEX+1 rows/columns) * the first ANB times through the loop. * * * * Setting either ( InH and InHT ) or InV to Z_ZERO * is adequate except in the face of NaNs. * * DO 10 I = 1, NP WORK( INH+I-1 ) = Z_ZERO WORK( INV+I-1 ) = Z_ZERO 10 CONTINUE DO 20 I = 1, NQ WORK( INHT+I-1 ) = Z_ZERO 20 CONTINUE * * * TOPNV = Z_ZERO * LTLIP1 = LIJP1 LTNM1 = NPM1 IF( MYCOL.GT.MYROW ) THEN LTLIP1 = LTLIP1 + 1 LTNM1 = LTNM1 - 1 END IF * * DO 210 MININDEX = 1, N - 1, ANB * * MAXINDEX = MIN( MININDEX+ANB-1, N ) LIJB = NUMROC( MAXINDEX, 1, MYCOL, 0, NPCOL ) + 1 LIIB = NUMROC( MAXINDEX, 1, MYROW, 0, NPROW ) + 1 * NQB = NQ - LIJB + 1 NPB = NP - LIIB + 1 INHTB = INHT + LIJB - 1 INVTB = INVT + LIJB - 1 INHB = INH + LIIB - 1 INVB = INV + LIIB - 1 * * * * DO 160 INDEX = MININDEX, MIN( MAXINDEX, N-1 ) * BINDEX = INDEX - MININDEX * CURROW = NXTROW CURCOL = NXTCOL * NXTROW = MOD( CURROW+1, NPROW ) NXTCOL = MOD( CURCOL+1, NPCOL ) * LII = LIIP1 LIJ = LIJP1 NPM0 = NPM1 * IF( MYROW.EQ.CURROW ) THEN NPM1 = NPM1 - 1 LIIP1 = LIIP1 + 1 END IF IF( MYCOL.EQ.CURCOL ) THEN NQM1 = NQM1 - 1 LIJP1 = LIJP1 + 1 LTLIP1 = LTLIP1 + 1 LTNM1 = LTNM1 - 1 END IF * * * * * V = NV, VT = NVT, H = NH, HT = NHT * * * Update the current column of A * * IF( MYCOL.EQ.CURCOL ) THEN * INDEXA = LII + ( LIJ-1 )*LDA INDEXINV = INV + LII - 1 + ( BINDEX-1 )*LDV INDEXINH = INH + LII - 1 + ( BINDEX-1 )*LDV CONJTOPH = WORK( INHT+LIJ-1+BINDEX*LDV ) CONJTOPV = TOPNV * IF( INDEX.GT.1 ) THEN DO 30 I = 0, NPM0 - 1 * A( INDEXA+I ) = A( INDEXA+I ) A( INDEXA+I ) = A( INDEXA+I ) - $ WORK( INDEXINV+LDV+I )*CONJTOPH - $ WORK( INDEXINH+LDV+I )*CONJTOPV 30 CONTINUE END IF * * END IF * * IF( MYCOL.EQ.CURCOL ) THEN * * Compute the householder vector * IF( MYROW.EQ.CURROW ) THEN DTMP( 2 ) = A( LII+( LIJ-1 )*LDA ) ELSE DTMP( 2 ) = ZERO END IF IF( MYROW.EQ.NXTROW ) THEN DTMP( 3 ) = A( LIIP1+( LIJ-1 )*LDA ) DTMP( 4 ) = ZERO ELSE DTMP( 3 ) = ZERO DTMP( 4 ) = ZERO END IF * NORM = DNRM2( NPM1, A( LIIP1+( LIJ-1 )*LDA ), 1 ) DTMP( 1 ) = NORM * * IF DTMP(5) = 1.0, NORM is too large and might cause * overflow, hence PDTREECOMB must be called. IF DTMP(5) * is zero on output, DTMP(1) can be trusted. * DTMP( 5 ) = ZERO IF( DTMP( 1 ).GE.SAFMAX .OR. DTMP( 1 ).LT.SAFMIN ) THEN DTMP( 5 ) = ONE DTMP( 1 ) = ZERO END IF * DTMP( 1 ) = DTMP( 1 )*DTMP( 1 ) CALL DGSUM2D( ICTXT, 'C', ' ', 5, 1, DTMP, 5, -1, $ CURCOL ) IF( DTMP( 5 ).EQ.ZERO ) THEN DTMP( 1 ) = SQRT( DTMP( 1 ) ) ELSE DTMP( 1 ) = NORM CALL PDTREECOMB( ICTXT, 'C', 1, DTMP, -1, MYCOL, $ DCOMBNRM2 ) END IF * NORM = DTMP( 1 ) * D( LIJ ) = DTMP( 2 ) IF( MYROW.EQ.CURROW .AND. MYCOL.EQ.CURCOL ) THEN A( LII+( LIJ-1 )*LDA ) = D( LIJ ) END IF * * ALPHA = DTMP( 3 ) * NORM = SIGN( NORM, ALPHA ) * IF( NORM.EQ.ZERO ) THEN TOPTAU = ZERO ELSE BETA = NORM + ALPHA TOPTAU = BETA / NORM ONEOVERBETA = 1.0D0 / BETA * CALL DSCAL( NPM1, ONEOVERBETA, $ A( LIIP1+( LIJ-1 )*LDA ), 1 ) END IF * IF( MYROW.EQ.NXTROW ) THEN A( LIIP1+( LIJ-1 )*LDA ) = Z_ONE END IF * TAU( LIJ ) = TOPTAU E( LIJ ) = -NORM * END IF * * * Spread v, nh, toptau across * DO 40 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+I ) = A( LIIP1+I+ $ ( LIJ-1 )*LDA ) 40 CONTINUE * IF( MYCOL.EQ.CURCOL ) THEN WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 ) = TOPTAU CALL DGEBS2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), $ NPM1+NPM1+1 ) ELSE CALL DGEBR2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), $ NPM1+NPM1+1, MYROW, CURCOL ) TOPTAU = WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 ) END IF DO 50 I = 0, NPM1 - 1 WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1- $ 1+BINDEX*LDV+NPM1+I ) 50 CONTINUE * IF( INDEX.LT.N ) THEN IF( MYROW.EQ.NXTROW .AND. MYCOL.EQ.CURCOL ) $ A( LIIP1+( LIJ-1 )*LDA ) = E( LIJ ) END IF * * Transpose v, nh * * IF( MYROW.EQ.MYCOL ) THEN DO 60 I = 0, NPM1 + NPM1 WORK( INVT+LIJP1-1+BINDEX*LDV+I ) = WORK( INV+LIIP1-1+ $ BINDEX*LDV+I ) 60 CONTINUE ELSE CALL DGESD2D( ICTXT, NPM1+NPM1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), NPM1+NPM1, $ MYCOL, MYROW ) CALL DGERV2D( ICTXT, NQM1+NQM1, 1, $ WORK( INVT+LIJP1-1+BINDEX*LDV ), NQM1+NQM1, $ MYCOL, MYROW ) END IF * DO 70 I = 0, NQM1 - 1 WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV+I ) = WORK( INVT+ $ LIJP1-1+BINDEX*LDV+NQM1+I ) 70 CONTINUE * * * Update the current block column of A * IF( INDEX.GT.1 ) THEN DO 90 J = LIJP1, LIJB - 1 DO 80 I = 0, NPM1 - 1 * A( LIIP1+I+( J-1 )*LDA ) = A( LIIP1+I+( J-1 )*LDA ) $ - WORK( INV+LIIP1-1+BINDEX*LDV+I )* $ WORK( INHT+J-1+BINDEX*LDV ) - $ WORK( INH+LIIP1-1+BINDEX*LDV+I )* $ WORK( INVT+J-1+BINDEX*LDV ) 80 CONTINUE 90 CONTINUE END IF * * * * Compute NV = A * NHT; NVT = A * NH * * These two lines are necessary because these elements * are not always involved in the calls to DTRMVT * for two reasons: * 1) On diagonal processors, the call to TRMVT * involves only LTNM1-1 elements * 2) On some processes, NQM1 < LTM1 or LIIP1 < LTLIP1 * and when the results are combined across all processes, * uninitialized values may be included. WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ) = Z_ZERO WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+NQM1-1 ) = Z_ZERO * * IF( MYROW.EQ.MYCOL ) THEN IF( LTNM1.GT.1 ) THEN CALL DTRMVT( 'L', LTNM1-1, $ A( LTLIP1+1+( LIJP1-1 )*LDA ), LDA, $ WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ), 1, $ WORK( INH+LTLIP1+1-1+( BINDEX+1 )*LDV ), $ 1, WORK( INV+LTLIP1+1-1+( BINDEX+1 )* $ LDV ), 1, WORK( INHT+LIJP1-1+( BINDEX+ $ 1 )*LDV ), 1 ) END IF DO 100 I = 1, LTNM1 WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV ) $ = WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV ) + $ A( LTLIP1+I-1+( LIJP1+I-1-1 )*LDA )* $ WORK( INH+LTLIP1+I-1-1+( BINDEX+1 )*LDV ) 100 CONTINUE ELSE IF( LTNM1.GT.0 ) $ CALL DTRMVT( 'L', LTNM1, A( LTLIP1+( LIJP1-1 )*LDA ), $ LDA, WORK( INVT+LIJP1-1+( BINDEX+1 )* $ LDV ), 1, WORK( INH+LTLIP1-1+( BINDEX+ $ 1 )*LDV ), 1, WORK( INV+LTLIP1-1+ $ ( BINDEX+1 )*LDV ), 1, $ WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV ), $ 1 ) * END IF * * * We take advantage of the fact that: * A * sum( B ) = sum ( A * B ) for matrices A,B * * trueA = A + V * HT + H * VT * hence: (trueA)v = Av' + V * HT * v + H * VT * v * VT * v = sum_p_in_NPROW ( VTp * v ) * H * VT * v = H * sum (VTp * v) = sum ( H * VTp * v ) * * v = v + V * HT * h + H * VT * h * * * * tmp = HT * nh1 DO 110 I = 1, 2*( BINDEX+1 ) WORK( INTMP-1+I ) = 0 110 CONTINUE * IF( BALANCED ) THEN NPSET = NPROW MYSETNUM = MYROW ROWSPERPROC = ICEIL( NQB, NPSET ) MYFIRSTROW = MIN( NQB+1, 1+ROWSPERPROC*MYSETNUM ) NUMROWS = MIN( ROWSPERPROC, NQB-MYFIRSTROW+1 ) * * * tmp = HT * v * CALL DGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE, $ WORK( INHTB+MYFIRSTROW-1 ), LDV, $ WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ), $ 1, Z_ZERO, WORK( INTMP ), 1 ) * tmp2 = VT * v CALL DGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE, $ WORK( INVTB+MYFIRSTROW-1 ), LDV, $ WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ), $ 1, Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 ) * * CALL DGSUM2D( ICTXT, 'C', ' ', 2*( BINDEX+1 ), 1, $ WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 ) ELSE * tmp = HT * v * CALL DGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INHTB ), $ LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1, $ Z_ZERO, WORK( INTMP ), 1 ) * tmp2 = VT * v CALL DGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INVTB ), $ LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1, $ Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 ) * END IF * * * IF( BALANCED ) THEN MYSETNUM = MYCOL * ROWSPERPROC = ICEIL( NPB, NPSET ) MYFIRSTROW = MIN( NPB+1, 1+ROWSPERPROC*MYSETNUM ) NUMROWS = MIN( ROWSPERPROC, NPB-MYFIRSTROW+1 ) * CALL DGSUM2D( ICTXT, 'R', ' ', 2*( BINDEX+1 ), 1, $ WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 ) * * * v = v + V * tmp IF( INDEX.GT.1. ) THEN CALL DGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE, $ WORK( INVB+MYFIRSTROW-1 ), LDV, $ WORK( INTMP ), 1, Z_ONE, $ WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )* $ LDV ), 1 ) * * v = v + H * tmp2 CALL DGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE, $ WORK( INHB+MYFIRSTROW-1 ), LDV, $ WORK( INTMP+BINDEX+1 ), 1, Z_ONE, $ WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )* $ LDV ), 1 ) END IF * ELSE * v = v + V * tmp CALL DGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INVB ), $ LDV, WORK( INTMP ), 1, Z_ONE, $ WORK( INVB+( BINDEX+1 )*LDV ), 1 ) * * * v = v + H * tmp2 CALL DGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INHB ), $ LDV, WORK( INTMP+BINDEX+1 ), 1, Z_ONE, $ WORK( INVB+( BINDEX+1 )*LDV ), 1 ) * END IF * * * Transpose NV and add it back into NVT * IF( MYROW.EQ.MYCOL ) THEN DO 120 I = 0, NQM1 - 1 WORK( INTMP+I ) = WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+ $ I ) 120 CONTINUE ELSE CALL DGESD2D( ICTXT, NQM1, 1, $ WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ), $ NQM1, MYCOL, MYROW ) CALL DGERV2D( ICTXT, NPM1, 1, WORK( INTMP ), NPM1, MYCOL, $ MYROW ) * END IF DO 130 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1- $ 1+( BINDEX+1 )*LDV+I ) + WORK( INTMP+I ) 130 CONTINUE * * Sum-to-one NV rowwise (within a row) * CALL DGSUM2D( ICTXT, 'R', ' ', NPM1, 1, $ WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ), NPM1, $ MYROW, NXTCOL ) * * * Dot product c = NV * NH * Sum-to-all c within next processor column * * IF( MYCOL.EQ.NXTCOL ) THEN CC( 1 ) = Z_ZERO DO 140 I = 0, NPM1 - 1 CC( 1 ) = CC( 1 ) + WORK( INV+LIIP1-1+( BINDEX+1 )* $ LDV+I )*WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+ $ I ) 140 CONTINUE IF( MYROW.EQ.NXTROW ) THEN CC( 2 ) = WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ) CC( 3 ) = WORK( INH+LIIP1-1+( BINDEX+1 )*LDV ) ELSE CC( 2 ) = Z_ZERO CC( 3 ) = Z_ZERO END IF CALL DGSUM2D( ICTXT, 'C', ' ', 3, 1, CC, 3, -1, NXTCOL ) * TOPV = CC( 2 ) C = CC( 1 ) TOPH = CC( 3 ) * TOPNV = TOPTAU*( TOPV-C*TOPTAU / 2*TOPH ) * * * Compute V = Tau * (V - C * Tau' / 2 * H ) * * DO 150 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = TOPTAU* $ ( WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I )-C*TOPTAU / $ 2*WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+I ) ) 150 CONTINUE * END IF * * 160 CONTINUE * * * Perform the rank2k update * IF( MAXINDEX.LT.N ) THEN * DO 170 I = 0, NPM1 - 1 WORK( INTMP+I ) = WORK( INH+LIIP1-1+ANB*LDV+I ) 170 CONTINUE * * * IF( .NOT.TWOGEMMS ) THEN IF( INTERLEAVE ) THEN LDZG = LDV / 2 ELSE CALL DLAMOV( 'A', LTNM1, ANB, WORK( INHT+LIJP1-1 ), $ LDV, WORK( INVT+LIJP1-1+ANB*LDV ), LDV ) * CALL DLAMOV( 'A', LTNM1, ANB, WORK( INV+LTLIP1-1 ), $ LDV, WORK( INH+LTLIP1-1+ANB*LDV ), LDV ) LDZG = LDV END IF NBZG = ANB*2 ELSE LDZG = LDV NBZG = ANB END IF * * DO 180 PBMIN = 1, LTNM1, PNB * PBSIZE = MIN( PNB, LTNM1-PBMIN+1 ) PBMAX = MIN( LTNM1, PBMIN+PNB-1 ) CALL DGEMM( 'N', 'C', PBSIZE, PBMAX, NBZG, Z_NEGONE, $ WORK( INH+LTLIP1-1+PBMIN-1 ), LDZG, $ WORK( INVT+LIJP1-1 ), LDZG, Z_ONE, $ A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA ) IF( TWOGEMMS ) THEN CALL DGEMM( 'N', 'C', PBSIZE, PBMAX, ANB, Z_NEGONE, $ WORK( INV+LTLIP1-1+PBMIN-1 ), LDZG, $ WORK( INHT+LIJP1-1 ), LDZG, Z_ONE, $ A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA ) END IF 180 CONTINUE * * * DO 190 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+I ) = WORK( INV+LIIP1-1+ANB*LDV+I ) WORK( INH+LIIP1-1+I ) = WORK( INTMP+I ) 190 CONTINUE DO 200 I = 0, NQM1 - 1 WORK( INHT+LIJP1-1+I ) = WORK( INHT+LIJP1-1+ANB*LDV+I ) 200 CONTINUE * * END IF * * End of the update A code * 210 CONTINUE * IF( MYCOL.EQ.NXTCOL ) THEN IF( MYROW.EQ.NXTROW ) THEN * D( NQ ) = A( NP+( NQ-1 )*LDA ) * CALL DGEBS2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1 ) ELSE CALL DGEBR2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1, NXTROW, $ NXTCOL ) END IF END IF * * * * WORK( 1 ) = DBLE( LWMIN ) RETURN * * End of PDSYTTRD * * END scalapack-2.0.2/SRC/pdtrcon.f000644 000766 000024 00000040363 10363532303 016164 0ustar00juliestaff000000 000000 SUBROUTINE PDTRCON( NORM, UPLO, DIAG, N, A, IA, JA, DESCA, RCOND, $ WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER IA, JA, INFO, LIWORK, LWORK, N DOUBLE PRECISION RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PDTRCON estimates the reciprocal of the condition number of a * triangular distributed matrix A(IA:IA+N-1,JA:JA+N-1), in either the * 1-norm or the infinity-norm. * * The norm of A(IA:IA+N-1,JA:JA+N-1) is computed and an estimate is * obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), then the reciprocal * of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies whether the 1-norm condition number or the * infinity-norm condition number is required: * = '1' or 'O': 1-norm; * = 'I': Infinity-norm. * * UPLO (global input) CHARACTER * = 'U': A(IA:IA+N-1,JA:JA+N-1) is upper triangular; * = 'L': A(IA:IA+N-1,JA:JA+N-1) is lower triangular. * * DIAG (global input) CHARACTER * = 'N': A(IA:IA+N-1,JA:JA+N-1) is non-unit triangular; * = 'U': A(IA:IA+N-1,JA:JA+N-1) is unit triangular. * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the triangular distributed * matrix A(IA:IA+N-1,JA:JA+N-1). If UPLO = 'U', the leading * N-by-N upper triangular part of this distributed matrix con- * tains the upper triangular matrix, and its strictly lower * triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of this ditributed * matrix contains the lower triangular matrix, and the strictly * upper triangular part is not referenced. If DIAG = 'U', the * diagonal elements of A(IA:IA+N-1,JA:JA+N-1) are also not * referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * RCOND (global output) DOUBLE PRECISION * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + LOCc(N+MOD(JA-1,NB_A)) * + MAX( 2, MAX( NB_A*MAX( 1, CEIL(NPROW-1,NPCOL) ), * LOCc(N+MOD(JA-1,NB_A)) + * NB_A*MAX( 1, CEIL(NPCOL-1,NPROW) ) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr(N+MOD(IA-1,MB_A)). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOUNIT, ONENRM, UPPER CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPN, IPV, IPW, $ IPX, IROFF, IV, IX, IXX, JJA, JV, JX, KASE, $ KASE1, LIWMIN, LWMIN, MYCOL, MYROW, NP, NPCOL, $ NPMOD, NPROW, NQ, NQMOD DOUBLE PRECISION AINVNM, ANORM, SCALE, SMLNUM DOUBLE PRECISION WMAX * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 5 ), $ IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGEBR2D, $ DGEBS2D, INFOG2L, PCHK1MAT, PDAMAX, $ PDLATRS, PDLACON, PDRSCL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PDLANTR EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH, $ PDLANTR * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 800 + CTXT_ ) ELSE CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) ONENRM = NORM.EQ.'1' .OR. LSAME( NORM, 'O' ) NOUNIT = LSAME( DIAG, 'N' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + NQMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = DBLE( LWMIN ) LIWMIN = NPMOD IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.ONENRM .AND. .NOT.LSAME( NORM, 'I' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF * IF( ONENRM ) THEN IDUM1( 1 ) = ICHAR( '1' ) ELSE IDUM1( 1 ) = ICHAR( 'I' ) END IF IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'U' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 11 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 13 CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 5, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDTRCON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * RCOND = ZERO SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' )*DBLE( MAX( 1, N ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPN = IPV + NP IPW = IPN + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Compute the norm of the triangular matrix A. * ANORM = PDLANTR( NORM, UPLO, DIAG, N, N, A, IA, JA, DESCA, WORK ) * * Continue only if ANORM > 0. * IF( ANORM.GT.ZERO ) THEN * * Estimate the norm of the inverse of A. * AINVNM = ZERO NORMIN = 'N' IF( ONENRM ) THEN KASE1 = 1 ELSE KASE1 = 2 END IF KASE = 0 10 CONTINUE CALL PDLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), $ IX, JX, DESCX, IWORK, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( KASE.EQ.KASE1 ) THEN * * Multiply by inv(A). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( UPLO, 'No transpose', DIAG, NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SCALE, WORK( IPN ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(A'). * DESCX( CSRC_ ) = IACOL CALL PDLATRS( UPLO, 'Transpose', DIAG, NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SCALE, WORK( IPN ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF NORMIN = 'Y' * * Multiply by 1/SCALE if doing so will not cause overflow. * IF( SCALE.NE.ONE ) THEN CALL PDAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, $ DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL DGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, $ 1 ) ELSE CALL DGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, $ 1, IAROW, MYCOL ) END IF END IF IF( SCALE.LT.ABS( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PDRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / ANORM ) / AINVNM END IF * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PDTRCON * END scalapack-2.0.2/SRC/pdtrord.f000644 000766 000024 00000472607 11750130340 016177 0ustar00juliestaff000000 000000 SUBROUTINE PDTRORD( COMPQ, SELECT, PARA, N, T, IT, JT, $ DESCT, Q, IQ, JQ, DESCQ, WR, WI, M, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER COMPQ INTEGER INFO, LIWORK, LWORK, M, N, $ IT, JT, IQ, JQ * .. * .. Array Arguments .. INTEGER SELECT( * ) INTEGER PARA( 6 ), DESCT( * ), DESCQ( * ), IWORK( * ) DOUBLE PRECISION Q( * ), T( * ), WI( * ), WORK( * ), WR( * ) * .. * * Purpose * ======= * * PDTRORD reorders the real Schur factorization of a real matrix * A = Q*T*Q**T, so that a selected cluster of eigenvalues appears * in the leading diagonal blocks of the upper quasi-triangular matrix * T, and the leading columns of Q form an orthonormal basis of the * corresponding right invariant subspace. * * T must be in Schur form (as returned by PDLAHQR), that is, block * upper triangular with 1-by-1 and 2-by-2 diagonal blocks. * * This subroutine uses a delay and accumulate procedure for performing * the off-diagonal updates (see references for details). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * * COMPQ (global input) CHARACTER*1 * = 'V': update the matrix Q of Schur vectors; * = 'N': do not update Q. * * SELECT (global input/output) INTEGER array, dimension (N) * SELECT specifies the eigenvalues in the selected cluster. To * select a real eigenvalue w(j), SELECT(j) must be set to 1. * To select a complex conjugate pair of eigenvalues * w(j) and w(j+1), corresponding to a 2-by-2 diagonal block, * either SELECT(j) or SELECT(j+1) or both must be set to 1; * a complex conjugate pair of eigenvalues must be * either both included in the cluster or both excluded. * On output, the (partial) reordering is displayed. * * PARA (global input) INTEGER*6 * Block parameters (some should be replaced by calls to * PILAENV and others by meaningful default values): * PARA(1) = maximum number of concurrent computational windows * allowed in the algorithm; * 0 < PARA(1) <= min(NPROW,NPCOL) must hold; * PARA(2) = number of eigenvalues in each window; * 0 < PARA(2) < PARA(3) must hold; * PARA(3) = window size; PARA(2) < PARA(3) < DESCT(MB_) * must hold; * PARA(4) = minimal percentage of flops required for * performing matrix-matrix multiplications instead * of pipelined orthogonal transformations; * 0 <= PARA(4) <= 100 must hold; * PARA(5) = width of block column slabs for row-wise * application of pipelined orthogonal * transformations in their factorized form; * 0 < PARA(5) <= DESCT(MB_) must hold. * PARA(6) = the maximum number of eigenvalues moved together * over a process border; in practice, this will be * approximately half of the cross border window size * 0 < PARA(6) <= PARA(2) must hold; * * N (global input) INTEGER * The order of the globally distributed matrix T. N >= 0. * * T (local input/output) DOUBLE PRECISION array, * dimension (LLD_T,LOCc(N)). * On entry, the local pieces of the global distributed * upper quasi-triangular matrix T, in Schur form. On exit, T is * overwritten by the local pieces of the reordered matrix T, * again in Schur form, with the selected eigenvalues in the * globally leading diagonal blocks. * * IT (global input) INTEGER * JT (global input) INTEGER * The row and column index in the global array T indicating the * first column of sub( T ). IT = JT = 1 must hold. * * DESCT (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the global distributed matrix T. * * Q (local input/output) DOUBLE PRECISION array, * dimension (LLD_Q,LOCc(N)). * On entry, if COMPQ = 'V', the local pieces of the global * distributed matrix Q of Schur vectors. * On exit, if COMPQ = 'V', Q has been postmultiplied by the * global orthogonal transformation matrix which reorders T; the * leading M columns of Q form an orthonormal basis for the * specified invariant subspace. * If COMPQ = 'N', Q is not referenced. * * IQ (global input) INTEGER * JQ (global input) INTEGER * The column index in the global array Q indicating the * first column of sub( Q ). IQ = JQ = 1 must hold. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the global distributed matrix Q. * * WR (global output) DOUBLE PRECISION array, dimension (N) * WI (global output) DOUBLE PRECISION array, dimension (N) * The real and imaginary parts, respectively, of the reordered * eigenvalues of T. The eigenvalues are in principle stored in * the same order as on the diagonal of T, with WR(i) = T(i,i) * and, if T(i:i+1,i:i+1) is a 2-by-2 diagonal block, WI(i) > 0 * and WI(i+1) = -WI(i). * Note also that if a complex eigenvalue is sufficiently * ill-conditioned, then its value may differ significantly * from its value before reordering. * * M (global output) INTEGER * The dimension of the specified invariant subspace. * 0 <= M <= N. * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued by PXERBLA. * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The dimension of the array IWORK. * * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value. * If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*1000+j), if the i-th * argument is a scalar and had an illegal value, then INFO = -i. * > 0: here we have several possibilites * *) Reordering of T failed because some eigenvalues are too * close to separate (the problem is very ill-conditioned); * T may have been partially reordered, and WR and WI * contain the eigenvalues in the same order as in T. * On exit, INFO = {the index of T where the swap failed}. * *) A 2-by-2 block to be reordered split into two 1-by-1 * blocks and the second block failed to swap with an * adjacent block. * On exit, INFO = {the index of T where the swap failed}. * *) If INFO = N+1, there is no valid BLACS context (see the * BLACS documentation for details). * In a future release this subroutine may distinguish between * the case 1 and 2 above. * * Additional requirements * ======================= * * The following alignment requirements must hold: * (a) DESCT( MB_ ) = DESCT( NB_ ) = DESCQ( MB_ ) = DESCQ( NB_ ) * (b) DESCT( RSRC_ ) = DESCQ( RSRC_ ) * (c) DESCT( CSRC_ ) = DESCQ( CSRC_ ) * * All matrices must be blocked by a block factor larger than or * equal to two (3). This is to simplify reordering across processor * borders in the presence of 2-by-2 blocks. * * Limitations * =========== * * This algorithm cannot work on submatrices of T and Q, i.e., * IT = JT = IQ = JQ = 1 must hold. This is however no limitation * since PDLAHQR does not compute Schur forms of submatrices anyway. * * References * ========== * * [1] Z. Bai and J. W. Demmel; On swapping diagonal blocks in real * Schur form, Linear Algebra Appl., 186:73--95, 1993. Also as * LAPACK Working Note 54. * * [2] D. Kressner; Block algorithms for reordering standard and * generalized Schur forms, ACM TOMS, 32(4):521-532, 2006. * Also LAPACK Working Note 171. * * [3] R. Granat, B. Kagstrom, and D. Kressner; Parallel eigenvalue * reordering in real Schur form, Concurrency and Computations: * Practice and Experience, 21(9):1225-1250, 2009. Also as * LAPACK Working Note 192. * * Parallel execution recommendations * ================================== * * Use a square grid, if possible, for maximum performance. The block * parameters in PARA should be kept well below the data distribution * block size. In particular, see [3] for recommended settings for * these parameters. * * In general, the parallel algorithm strives to perform as much work * as possible without crossing the block borders on the main block * diagonal. * * Contributors * ============ * * Implemented by Robert Granat, Dept. of Computing Science and HPC2N, * Umea University, Sweden, March 2007, * in collaboration with Bo Kagstrom and Daniel Kressner. * Modified by Meiyue Shao, October 2011. * * Revisions * ========= * * Please send bug-reports to granat@cs.umu.se * * Keywords * ======== * * Real Schur form, eigenvalue reordering * * ===================================================================== * .. * .. Parameters .. CHARACTER TOP INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ DOUBLE PRECISION ZERO, ONE PARAMETER ( TOP = '1-Tree', $ BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, $ ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, PAIR, SWAP, WANTQ, $ ISHH, FIRST, SKIP1CR, BORDER, LASTWAIT INTEGER NPROW, NPCOL, MYROW, MYCOL, NB, NPROCS, $ IERR, DIM1, INDX, LLDT, TRSRC, TCSRC, ILOC1, $ JLOC1, MYIERR, ICTXT, $ RSRC1, CSRC1, ILOC3, JLOC3, TRSRC3, $ TCSRC3, ILOC, JLOC, TRSRC4, TCSRC4, $ FLOPS, I, ILO, IHI, J, K, KK, KKS, $ KS, LIWMIN, LWMIN, MMULT, N1, N2, $ NCB, NDTRAF, NITRAF, NWIN, NUMWIN, PDTRAF, $ PITRAF, PDW, WINEIG, WINSIZ, LLDQ, $ RSRC, CSRC, ILILO, ILIHI, ILSEL, IRSRC, $ ICSRC, IPIW, IPW1, IPW2, IPW3, TIHI, TILO, $ LIHI, WINDOW, LILO, LSEL, BUFFER, $ NMWIN2, BUFFLEN, LROWS, LCOLS, ILOC2, JLOC2, $ WNEICR, WINDOW0, RSRC4, CSRC4, LIHI4, RSRC3, $ CSRC3, RSRC2, CSRC2, LIHIC, LIHI1, ILEN4, $ SELI4, ILEN1, DIM4, IPW4, QROWS, TROWS, $ TCOLS, IPW5, IPW6, IPW7, IPW8, JLOC4, $ EAST, WEST, ILOC4, SOUTH, NORTH, INDXS, $ ITT, JTT, ILEN, DLEN, INDXE, TRSRC1, TCSRC1, $ TRSRC2, TCSRC2, ILOS, DIR, TLIHI, TLILO, TLSEL, $ ROUND, LAST, WIN0S, WIN0E, WINE, MMAX, MMIN DOUBLE PRECISION ELEM, ELEM1, ELEM2, ELEM3, ELEM4, SN, CS, TMP, $ ELEM5 * .. * .. Local Arrays .. INTEGER IBUFF( 8 ), IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC, INDXG2P, INDXG2L EXTERNAL LSAME, NUMROC, INDXG2P, INDXG2L * .. * .. External Subroutines .. EXTERNAL PDLACPY, PXERBLA, PCHK1MAT, PCHK2MAT, $ DGEMM, DLAMOV, ILACPY, CHK1MAT, $ INFOG2L, DGSUM2D, DGESD2D, DGERV2D, DGEBS2D, $ DGEBR2D, IGSUM2D, BLACS_GRIDINFO, IGEBS2D, $ IGEBR2D, IGAMX2D, IGAMN2D, BDLAAPP, BDTREXC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT, MIN * .. * .. Local Functions .. INTEGER ICEIL * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCT( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL * * Test if grid is O.K., i.e., the context is valid. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = N+1 END IF * * Check if workspace query. * LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * Test dimensions for local sanity. * IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, INFO ) END IF * * Check the blocking sizes for alignment requirements. * IF( INFO.EQ.0 ) THEN IF( DESCT( MB_ ).NE.DESCT( NB_ ) ) INFO = -(1000*9 + MB_) END IF IF( INFO.EQ.0 ) THEN IF( DESCQ( MB_ ).NE.DESCQ( NB_ ) ) INFO = -(1000*13 + MB_) END IF IF( INFO.EQ.0 ) THEN IF( DESCT( MB_ ).NE.DESCQ( MB_ ) ) INFO = -(1000*9 + MB_) END IF * * Check the blocking sizes for minimum sizes. * IF( INFO.EQ.0 ) THEN IF( N.NE.DESCT( MB_ ) .AND. DESCT( MB_ ).LT.3 ) $ INFO = -(1000*9 + MB_) IF( N.NE.DESCQ( MB_ ) .AND. DESCQ( MB_ ).LT.3 ) $ INFO = -(1000*13 + MB_) END IF * * Check parameters in PARA. * NB = DESCT( MB_ ) IF( INFO.EQ.0 ) THEN IF( PARA(1).LT.1 .OR. PARA(1).GT.MIN(NPROW,NPCOL) ) $ INFO = -(1000 * 4 + 1) IF( PARA(2).LT.1 .OR. PARA(2).GE.PARA(3) ) $ INFO = -(1000 * 4 + 2) IF( PARA(3).LT.1 .OR. PARA(3).GT.NB ) $ INFO = -(1000 * 4 + 3) IF( PARA(4).LT.0 .OR. PARA(4).GT.100 ) $ INFO = -(1000 * 4 + 4) IF( PARA(5).LT.1 .OR. PARA(5).GT.NB ) $ INFO = -(1000 * 4 + 5) IF( PARA(6).LT.1 .OR. PARA(6).GT.PARA(2) ) $ INFO = -(1000 * 4 + 6) END IF * * Check requirements on IT, JT, IQ and JQ. * IF( INFO.EQ.0 ) THEN IF( IT.NE.1 ) INFO = -6 IF( JT.NE.IT ) INFO = -7 IF( IQ.NE.1 ) INFO = -10 IF( JQ.NE.IQ ) INFO = -11 END IF * * Test input parameters for global sanity. * IF( INFO.EQ.0 ) THEN CALL PCHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, 0, IDUM1, $ IDUM2, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL PCHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, 0, IDUM1, $ IDUM2, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL PCHK2MAT( N, 5, N, 5, IT, JT, DESCT, 9, N, 5, N, 5, $ IQ, JQ, DESCQ, 13, 0, IDUM1, IDUM2, INFO ) END IF * * Decode and test the input parameters. * IF( INFO.EQ.0 .OR. LQUERY ) THEN * WANTQ = LSAME( COMPQ, 'V' ) IF( N.LT.0 ) THEN INFO = -4 ELSE * * Extract local leading dimension. * LLDT = DESCT( LLD_ ) LLDQ = DESCQ( LLD_ ) * * Check the SELECT vector for consistency and set M to the * dimension of the specified invariant subspace. * M = 0 DO 10 K = 1, N IF( K.LT.N ) THEN CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ITT, JTT, TRSRC, TCSRC ) IF( MYROW.EQ.TRSRC .AND. MYCOL.EQ.TCSRC ) THEN ELEM = T( (JTT-1)*LLDT + ITT ) IF( ELEM.NE.ZERO ) THEN IF( SELECT(K).NE.0 .AND. $ SELECT(K+1).EQ.0 ) THEN * INFO = -2 SELECT(K+1) = 1 ELSEIF( SELECT(K).EQ.0 .AND. $ SELECT(K+1).NE.0 ) THEN * INFO = -2 SELECT(K) = 1 END IF END IF END IF END IF IF( SELECT(K).NE.0 ) M = M + 1 10 CONTINUE MMAX = M MMIN = M IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, MMAX, 1, -1, $ -1, -1, -1, -1 ) IF( NPROCS.GT.1 ) $ CALL IGAMN2D( ICTXT, 'All', TOP, 1, 1, MMIN, 1, -1, $ -1, -1, -1, -1 ) IF( MMAX.GT.MMIN ) THEN M = MMAX IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, N, 1, SELECT, N, $ -1, -1, -1, -1, -1 ) END IF * * Compute needed workspace. * N1 = M N2 = N - M * TROWS = NUMROC( N, NB, MYROW, DESCT(RSRC_), NPROW ) TCOLS = NUMROC( N, NB, MYCOL, DESCT(CSRC_), NPCOL ) LWMIN = N + 7*NB**2 + 2*TROWS*PARA( 3 ) + TCOLS*PARA( 3 ) + $ MAX( TROWS*PARA( 3 ), TCOLS*PARA( 3 ) ) LIWMIN = 5*PARA( 1 ) + PARA( 2 )*PARA( 3 ) - $ PARA( 2 ) * ( PARA( 2 ) + 1 ) / 2 * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF END IF END IF * * Global maximum on info. * IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, -1, -1, -1, $ -1, -1 ) * * Return if some argument is incorrect. * IF( INFO.NE.0 .AND. .NOT.LQUERY ) THEN M = 0 CALL PXERBLA( ICTXT, 'PDTRORD', -INFO ) RETURN ELSEIF( LQUERY ) THEN WORK( 1 ) = DBLE(LWMIN) IWORK( 1 ) = LIWMIN RETURN END IF * * Quick return if possible. * IF( M.EQ.N .OR. M.EQ.0 ) GO TO 545 * * Set parameters. * NUMWIN = PARA( 1 ) WINEIG = MAX( PARA( 2 ), 2 ) WINSIZ = MIN( MAX( PARA( 3 ), PARA( 2 )*2 ), NB ) MMULT = PARA( 4 ) NCB = PARA( 5 ) WNEICR = PARA( 6 ) * * Insert some pointers into INTEGER workspace. * * Information about all the active windows is stored * in IWORK( 1:5*NUMWIN ). Each processor has a copy. * LILO: start position * LIHI: stop position * LSEL: number of selected eigenvalues * RSRC: processor id (row) * CSRC: processor id (col) * IWORK( IPIW+ ) contain information of orthogonal transformations. * ILILO = 1 ILIHI = ILILO + NUMWIN ILSEL = ILIHI + NUMWIN IRSRC = ILSEL + NUMWIN ICSRC = IRSRC + NUMWIN IPIW = ICSRC + NUMWIN * * Insert some pointers into DOUBLE PRECISION workspace - for now we * only need two pointers. * IPW1 = 1 IPW2 = IPW1 + NB * * Collect the selected blocks at the top-left corner of T. * * Globally: ignore eigenvalues that are already in order. * ILO is a global variable and is kept updated to be consistent * throughout the process mesh. * ILO = 0 40 CONTINUE ILO = ILO + 1 IF( ILO.LE.N ) THEN IF( SELECT(ILO).NE.0 ) GO TO 40 END IF * * Globally: start the collection at the top of the matrix. Here, * IHI is a global variable and is kept updated to be consistent * throughout the process mesh. * IHI = N * * Globally: While ( ILO <= M ) do 50 CONTINUE * IF( ILO.LE.M ) THEN * * Depending on the value of ILO, find the diagonal block index J, * such that T(1+(J-1)*NB:1+J*NB,1+(J-1)*NB:1+J*NB) contains the * first unsorted eigenvalue. Check that J does not point to a * block with only one selected eigenvalue in the last position * which belongs to a splitted 2-by-2 block. * ILOS = ILO - 1 52 CONTINUE ILOS = ILOS + 1 IF( SELECT(ILOS).EQ.0 ) GO TO 52 IF( ILOS.LT.N ) THEN IF( SELECT(ILOS+1).NE.0 .AND. MOD(ILOS,NB).EQ.0 ) THEN CALL PDELGET( 'All', TOP, ELEM, T, ILOS+1, ILOS, DESCT ) IF( ELEM.NE.ZERO ) GO TO 52 END IF END IF J = ICEIL(ILOS,NB) * * Globally: Set start values of LILO and LIHI for all processes. * Choose also the number of selected eigenvalues at top of each * diagonal block such that the number of eigenvalues which remain * to be reordered is an integer multiple of WINEIG. * * All the information is saved into the INTEGER workspace such * that all processors are aware of each others operations. * * Compute the number of concurrent windows. * NMWIN2 = (ICEIL(IHI,NB)*NB - (ILO-MOD(ILO,NB)+1)+1) / NB NMWIN2 = MIN( MIN( NUMWIN, NMWIN2 ), ICEIL(N,NB) - J + 1 ) * * For all windows, set LSEL = 0 and find a proper start value of * LILO such that LILO points at the first non-selected entry in * the corresponding diagonal block of T. * DO 80 K = 1, NMWIN2 IWORK( ILSEL+K-1) = 0 IWORK( ILILO+K-1) = MAX( ILO, (J-1)*NB+(K-1)*NB+1 ) LILO = IWORK( ILILO+K-1 ) 82 CONTINUE IF( SELECT(LILO).NE.0 .AND. LILO.LT.(J+K-1)*NB ) THEN LILO = LILO + 1 IF( LILO.LE.N ) GO TO 82 END IF IWORK( ILILO+K-1 ) = LILO * * Fix each LILO to ensure that no 2-by-2 block is cut in top * of the submatrix (LILO:LIHI,LILO:LIHI). * LILO = IWORK(ILILO+K-1) IF( LILO.GT.NB ) THEN CALL PDELGET( 'All', TOP, ELEM, T, LILO, LILO-1, DESCT ) IF( ELEM.NE.ZERO ) THEN IF( LILO.LT.(J+K-1)*NB ) THEN IWORK(ILILO+K-1) = IWORK(ILILO+K-1) + 1 ELSE IWORK(ILILO+K-1) = IWORK(ILILO+K-1) - 1 END IF END IF END IF * * Set a proper LIHI value for each window. Also find the * processors corresponding to the corresponding windows. * IWORK( ILIHI+K-1 ) = IWORK( ILILO+K-1 ) IWORK( IRSRC+K-1 ) = INDXG2P( IWORK(ILILO+K-1), NB, MYROW, $ DESCT( RSRC_ ), NPROW ) IWORK( ICSRC+K-1 ) = INDXG2P( IWORK(ILILO+K-1), NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) TILO = IWORK(ILILO+K-1) TIHI = MIN( N, ICEIL( TILO, NB ) * NB ) DO 90 KK = TIHI, TILO, -1 IF( SELECT(KK).NE.0 ) THEN IWORK(ILIHI+K-1) = MAX(IWORK(ILIHI+K-1) , KK ) IWORK(ILSEL+K-1) = IWORK(ILSEL+K-1) + 1 IF( IWORK(ILSEL+K-1).GT.WINEIG ) THEN IWORK(ILIHI+K-1) = KK IWORK(ILSEL+K-1) = 1 END IF END IF 90 CONTINUE * * Fix each LIHI to avoid that bottom of window cuts 2-by-2 * block. We exclude such a block if located on block (process) * border and on window border or if an inclusion would cause * violation on the maximum number of eigenvalues to reorder * inside each window. If only on window border, we include it. * The excluded block is included automatically later when a * subcluster is reordered into the block from South-East. * LIHI = IWORK(ILIHI+K-1) IF( LIHI.LT.N ) THEN CALL PDELGET( 'All', TOP, ELEM, T, LIHI+1, LIHI, DESCT ) IF( ELEM.NE.ZERO ) THEN IF( ICEIL( LIHI, NB ) .NE. ICEIL( LIHI+1, NB ) .OR. $ IWORK( ILSEL+K-1 ).EQ.WINEIG ) THEN IWORK( ILIHI+K-1 ) = IWORK( ILIHI+K-1 ) - 1 IF( IWORK( ILSEL+K-1 ).GT.2 ) $ IWORK( ILSEL+K-1 ) = IWORK( ILSEL+K-1 ) - 1 ELSE IWORK( ILIHI+K-1 ) = IWORK( ILIHI+K-1 ) + 1 IF( SELECT(LIHI+1).NE.0 ) $ IWORK( ILSEL+K-1 ) = IWORK( ILSEL+K-1 ) + 1 END IF END IF END IF 80 CONTINUE * * Fix the special cases of LSEL = 0 and LILO = LIHI for each * window by assuring that the stop-condition for local reordering * is fulfilled directly. Do this by setting LIHI = startposition * for the corresponding block and LILO = LIHI + 1. * DO 85 K = 1, NMWIN2 LILO = IWORK( ILILO + K - 1 ) LIHI = IWORK( ILIHI + K - 1 ) LSEL = IWORK( ILSEL + K - 1 ) IF( LSEL.EQ.0 .OR. LILO.EQ.LIHI ) THEN LIHI = IWORK( ILIHI + K - 1 ) IWORK( ILIHI + K - 1 ) = (ICEIL(LIHI,NB)-1)*NB + 1 IWORK( ILILO + K - 1 ) = IWORK( ILIHI + K - 1 ) + 1 END IF 85 CONTINUE * * Associate all processors with the first computational window * that should be activated, if possible. * LILO = IHI LIHI = ILO LSEL = M FIRST = .TRUE. DO 95 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) IF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN TLILO = IWORK( ILILO + WINDOW - 1 ) TLIHI = IWORK( ILIHI + WINDOW - 1 ) TLSEL = IWORK( ILSEL + WINDOW - 1 ) IF( (.NOT. ( LIHI .GE. LILO + LSEL ) ) .AND. $ ( (TLIHI .GE. TLILO + TLSEL) .OR. FIRST ) ) THEN IF( FIRST ) FIRST = .FALSE. LILO = TLILO LIHI = TLIHI LSEL = TLSEL GO TO 97 END IF END IF 95 CONTINUE 97 CONTINUE * * Exclude all processors that are not involved in any * computational window right now. * IERR = 0 IF( LILO.EQ.IHI .AND. LIHI.EQ.ILO .AND. LSEL.EQ.M ) $ GO TO 114 * * Make sure all processors associated with a compuational window * enter the local reordering the first time. * FIRST = .TRUE. * * Globally for all computational windows: * While ( LIHI >= LILO + LSEL ) do ROUND = 1 130 CONTINUE IF( FIRST .OR. ( LIHI .GE. LILO + LSEL ) ) THEN * * Perform computations in parallel: loop through all * compuational windows, do local reordering and accumulate * transformations, broadcast them in the corresponding block * row and columns and compute the corresponding updates. * DO 110 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) * * The process on the block diagonal computes the * reordering. * IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN LILO = IWORK(ILILO+WINDOW-1) LIHI = IWORK(ILIHI+WINDOW-1) LSEL = IWORK(ILSEL+WINDOW-1) * * Compute the local value of I -- start position. * I = MAX( LILO, LIHI - WINSIZ + 1 ) * * Fix my I to avoid that top of window cuts a 2-by-2 * block. * IF( I.GT.LILO ) THEN CALL INFOG2L( I, I-1, DESCT, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC, CSRC ) IF( T( LLDT*(JLOC-1) + ILOC ).NE.ZERO ) $ I = I + 1 END IF * * Compute local indicies for submatrix to operate on. * CALL INFOG2L( I, I, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC1, JLOC1, RSRC, CSRC ) * * The active window is ( I:LIHI, I:LIHI ). Reorder * eigenvalues within this window and pipeline * transformations. * NWIN = LIHI - I + 1 KS = 0 PITRAF = IPIW PDTRAF = IPW2 * PAIR = .FALSE. DO 140 K = I, LIHI IF( PAIR ) THEN PAIR = .FALSE. ELSE SWAP = SELECT( K ).NE.0 IF( K.LT.LIHI ) THEN CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC, CSRC ) IF( T( LLDT*(JLOC-1) + ILOC ).NE.ZERO ) $ PAIR = .TRUE. END IF IF( SWAP ) THEN KS = KS + 1 * * Swap the K-th block to position I+KS-1. * IERR = 0 KK = K - I + 1 KKS = KS IF( KK.NE.KS ) THEN NITRAF = LIWORK - PITRAF + 1 NDTRAF = LWORK - PDTRAF + 1 CALL BDTREXC( NWIN, $ T(LLDT*(JLOC1-1) + ILOC1), LLDT, KK, $ KKS, NITRAF, IWORK( PITRAF ), NDTRAF, $ WORK( PDTRAF ), WORK(IPW1), IERR ) PITRAF = PITRAF + NITRAF PDTRAF = PDTRAF + NDTRAF * * Update array SELECT. * IF ( PAIR ) THEN DO 150 J = I+KK-1, I+KKS, -1 SELECT(J+1) = SELECT(J-1) 150 CONTINUE SELECT(I+KKS-1) = 1 SELECT(I+KKS) = 1 ELSE DO 160 J = I+KK-1, I+KKS, -1 SELECT(J) = SELECT(J-1) 160 CONTINUE SELECT(I+KKS-1) = 1 END IF * IF ( IERR.EQ.1 .OR. IERR.EQ.2 ) THEN * * Some blocks are too close to swap: * prepare to leave in a clean fashion. If * IERR.EQ.2, we must update SELECT to * account for the fact that the 2 by 2 * block to be reordered did split and the * first part of this block is already * reordered. * IF ( IERR.EQ.2 ) THEN SELECT( I+KKS-3 ) = 1 SELECT( I+KKS-1 ) = 0 KKS = KKS + 1 END IF * * Update off-diagonal blocks immediately. * GO TO 170 END IF KS = KKS END IF IF( PAIR ) $ KS = KS + 1 END IF END IF 140 CONTINUE END IF 110 CONTINUE 170 CONTINUE * * The on-diagonal processes save their information from the * local reordering in the integer buffer. This buffer is * broadcasted to updating processors, see below. * DO 175 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN IBUFF( 1 ) = I IBUFF( 2 ) = NWIN IBUFF( 3 ) = PITRAF IBUFF( 4 ) = KS IBUFF( 5 ) = PDTRAF IBUFF( 6 ) = NDTRAF ILEN = PITRAF - IPIW DLEN = PDTRAF - IPW2 IBUFF( 7 ) = ILEN IBUFF( 8 ) = DLEN END IF 175 CONTINUE * * For the updates with respect to the local reordering, we * organize the updates in two phases where the update * "direction" (controlled by the DIR variable below) is first * chosen to be the corresponding rows, then the corresponding * columns. * DO 1111 DIR = 1, 2 * * Broadcast information about the reordering and the * accumulated transformations: I, NWIN, PITRAF, NITRAF, * PDTRAF, NDTRAF. If no broadcast is performed, use an * artificial value of KS to prevent updating indicies for * windows already finished (use KS = -1). * DO 111 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) IF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN LILO = IWORK(ILILO+WINDOW-1) LIHI = IWORK(ILIHI+WINDOW-1) LSEL = IWORK(ILSEL+WINDOW-1) END IF IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) $ CALL IGEBS2D( ICTXT, 'Row', TOP, 8, 1, IBUFF, 8 ) IF( NPROW.GT.1 .AND. DIR.EQ.2 ) $ CALL IGEBS2D( ICTXT, 'Col', TOP, 8, 1, IBUFF, 8 ) ELSEIF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 .AND. MYROW.EQ.RSRC ) $ THEN IF( FIRST .OR. (LIHI .GE. LILO + LSEL) ) THEN CALL IGEBR2D( ICTXT, 'Row', TOP, 8, 1, IBUFF, 8, $ RSRC, CSRC ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) ELSE ILEN = 0 DLEN = 0 KS = -1 END IF END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 .AND. MYCOL.EQ.CSRC ) $ THEN IF( FIRST .OR. (LIHI .GE. LILO + LSEL) ) THEN CALL IGEBR2D( ICTXT, 'Col', TOP, 8, 1, IBUFF, 8, $ RSRC, CSRC ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) ELSE ILEN = 0 DLEN = 0 KS = -1 END IF END IF END IF * * Broadcast the accumulated transformations - copy all * information from IWORK(IPIW:PITRAF-1) and * WORK(IPW2:PDTRAF-1) to a buffer and broadcast this * buffer in the corresponding block row and column. On * arrival, copy the information back to the correct part of * the workspace. This step is avoided if no computations * were performed at the diagonal processor, i.e., * BUFFLEN = 0. * IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( BUFFLEN.NE.0 ) THEN DO 180 INDX = 1, ILEN WORK( BUFFER+INDX-1 ) = $ DBLE( IWORK(IPIW+INDX-1) ) 180 CONTINUE CALL DLAMOV( 'All', DLEN, 1, WORK( IPW2 ), $ DLEN, WORK(BUFFER+ILEN), DLEN ) IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) THEN CALL DGEBS2D( ICTXT, 'Row', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN ) END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 ) THEN CALL DGEBS2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN ) END IF END IF ELSEIF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 .AND. MYROW.EQ.RSRC ) $ THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( BUFFLEN.NE.0 ) THEN CALL DGEBR2D( ICTXT, 'Row', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC, CSRC ) END IF END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 .AND. MYCOL.EQ.CSRC ) $ THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( BUFFLEN.NE.0 ) THEN CALL DGEBR2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC, CSRC ) END IF END IF IF((NPCOL.GT.1.AND.DIR.EQ.1.AND.MYROW.EQ.RSRC).OR. $ (NPROW.GT.1.AND.DIR.EQ.2.AND.MYCOL.EQ.CSRC ) ) $ THEN IF( BUFFLEN.NE.0 ) THEN DO 190 INDX = 1, ILEN IWORK(IPIW+INDX-1) = $ INT(WORK( BUFFER+INDX-1 )) 190 CONTINUE CALL DLAMOV( 'All', DLEN, 1, $ WORK( BUFFER+ILEN ), DLEN, $ WORK( IPW2 ), DLEN ) END IF END IF END IF 111 CONTINUE * * Now really perform the updates by applying the orthogonal * transformations to the out-of-window parts of T and Q. This * step is avoided if no reordering was performed by the on- * diagonal processor from the beginning, i.e., BUFFLEN = 0. * * Count number of operations to decide whether to use * matrix-matrix multiplications for updating off-diagonal * parts or not. * DO 112 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) * IF( (MYROW.EQ.RSRC .AND. DIR.EQ.1 ).OR. $ (MYCOL.EQ.CSRC .AND. DIR.EQ.2 ) ) THEN LILO = IWORK(ILILO+WINDOW-1) LIHI = IWORK(ILIHI+WINDOW-1) LSEL = IWORK(ILSEL+WINDOW-1) * * Skip update part for current WINDOW if BUFFLEN = 0. * IF( BUFFLEN.EQ.0 ) GO TO 295 * NITRAF = PITRAF - IPIW ISHH = .FALSE. FLOPS = 0 DO 200 K = 1, NITRAF IF( IWORK( IPIW + K - 1 ).LE.NWIN ) THEN FLOPS = FLOPS + 6 ELSE FLOPS = FLOPS + 11 ISHH = .TRUE. END IF 200 CONTINUE * * Compute amount of work space necessary for performing * matrix-matrix multiplications. * PDW = BUFFER IPW3 = PDW + NWIN*NWIN ELSE FLOPS = 0 END IF * IF( FLOPS.NE.0 .AND. $ ( FLOPS*100 ) / ( 2*NWIN*NWIN ) .GE. MMULT ) THEN * * Update off-diagonal blocks and Q using matrix-matrix * multiplications; if there are no Householder * reflectors it is preferable to take the triangular * block structure of the transformation matrix into * account. * CALL DLASET( 'All', NWIN, NWIN, ZERO, ONE, $ WORK( PDW ), NWIN ) CALL BDLAAPP( 1, NWIN, NWIN, NCB, WORK( PDW ), NWIN, $ NITRAF, IWORK(IPIW), WORK( IPW2 ), WORK(IPW3) ) * IF( ISHH ) THEN * * Loop through the local blocks of the distributed * matrices T and Q and update them according to the * performed reordering. * * Update the columns of T and Q affected by the * reordering. * IF( DIR.EQ.2 ) THEN DO 210 INDX = 1, I-1, NB CALL INFOG2L( INDX, I, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN LROWS = MIN(NB,I-INDX) CALL DGEMM( 'No transpose', $ 'No transpose', LROWS, NWIN, NWIN, $ ONE, T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( PDW ), NWIN, ZERO, $ WORK(IPW3), LROWS ) CALL DLAMOV( 'All', LROWS, NWIN, $ WORK(IPW3), LROWS, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 210 CONTINUE IF( WANTQ ) THEN DO 220 INDX = 1, N, NB CALL INFOG2L( INDX, I, DESCQ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN LROWS = MIN(NB,N-INDX+1) CALL DGEMM( 'No transpose', $ 'No transpose', LROWS, NWIN, NWIN, $ ONE, Q((JLOC-1)*LLDQ+ILOC), LLDQ, $ WORK( PDW ), NWIN, ZERO, $ WORK(IPW3), LROWS ) CALL DLAMOV( 'All', LROWS, NWIN, $ WORK(IPW3), LROWS, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ ) END IF 220 CONTINUE END IF END IF * * Update the rows of T affected by the reordering * IF( DIR.EQ.1 ) THEN IF( LIHI.LT.N ) THEN IF( MOD(LIHI,NB).GT.0 ) THEN INDX = LIHI + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN LCOLS = MOD( MIN( NB-MOD(LIHI,NB), $ N-LIHI ), NB ) CALL DGEMM( 'Transpose', $ 'No Transpose', NWIN, LCOLS, NWIN, $ ONE, WORK( PDW ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT, ZERO, $ WORK(IPW3), NWIN ) CALL DLAMOV( 'All', NWIN, LCOLS, $ WORK(IPW3), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF END IF INDXS = ICEIL(LIHI,NB)*NB + 1 DO 230 INDX = INDXS, N, NB CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN LCOLS = MIN( NB, N-INDX+1 ) CALL DGEMM( 'Transpose', $ 'No Transpose', NWIN, LCOLS, NWIN, $ ONE, WORK( PDW ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT, ZERO, $ WORK(IPW3), NWIN ) CALL DLAMOV( 'All', NWIN, LCOLS, $ WORK(IPW3), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 230 CONTINUE END IF END IF ELSE * * The NWIN-by-NWIN matrix U containing the * accumulated orthogonal transformations has the * following structure: * * [ U11 U12 ] * U = [ ], * [ U21 U22 ] * * where U21 is KS-by-KS upper triangular and U12 is * (NWIN-KS)-by-(NWIN-KS) lower triangular. * * Update the columns of T and Q affected by the * reordering. * * Compute T2*U21 + T1*U11 in workspace. * IF( DIR.EQ.2 ) THEN DO 240 INDX = 1, I-1, NB CALL INFOG2L( INDX, I, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN JLOC1 = INDXG2L( I+NWIN-KS, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) LROWS = MIN(NB,I-INDX) CALL DLAMOV( 'All', LROWS, KS, $ T((JLOC1-1)*LLDT+ILOC ), LLDT, $ WORK(IPW3), LROWS ) CALL DTRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', LROWS, KS, ONE, $ WORK( PDW+NWIN-KS ), NWIN, $ WORK(IPW3), LROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', LROWS, KS, NWIN-KS, $ ONE, T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( PDW ), NWIN, ONE, WORK(IPW3), $ LROWS ) * * Compute T1*U12 + T2*U22 in workspace. * CALL DLAMOV( 'All', LROWS, NWIN-KS, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( IPW3+KS*LROWS ), LROWS ) CALL DTRMM( 'Right', 'Lower', $ 'No transpose', 'Non-unit', $ LROWS, NWIN-KS, ONE, $ WORK( PDW+NWIN*KS ), NWIN, $ WORK( IPW3+KS*LROWS ), LROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', LROWS, NWIN-KS, KS, $ ONE, T((JLOC1-1)*LLDT+ILOC), LLDT, $ WORK( PDW+NWIN*KS+NWIN-KS ), NWIN, $ ONE, WORK( IPW3+KS*LROWS ), LROWS ) * * Copy workspace to T. * CALL DLAMOV( 'All', LROWS, NWIN, $ WORK(IPW3), LROWS, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 240 CONTINUE IF( WANTQ ) THEN * * Compute Q2*U21 + Q1*U11 in workspace. * DO 250 INDX = 1, N, NB CALL INFOG2L( INDX, I, DESCQ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN JLOC1 = INDXG2L( I+NWIN-KS, NB, $ MYCOL, DESCQ( CSRC_ ), NPCOL ) LROWS = MIN(NB,N-INDX+1) CALL DLAMOV( 'All', LROWS, KS, $ Q((JLOC1-1)*LLDQ+ILOC ), LLDQ, $ WORK(IPW3), LROWS ) CALL DTRMM( 'Right', 'Upper', $ 'No transpose', 'Non-unit', $ LROWS, KS, ONE, $ WORK( PDW+NWIN-KS ), NWIN, $ WORK(IPW3), LROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', LROWS, KS, $ NWIN-KS, ONE, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ, $ WORK( PDW ), NWIN, ONE, $ WORK(IPW3), LROWS ) * * Compute Q1*U12 + Q2*U22 in workspace. * CALL DLAMOV( 'All', LROWS, NWIN-KS, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ, $ WORK( IPW3+KS*LROWS ), LROWS) CALL DTRMM( 'Right', 'Lower', $ 'No transpose', 'Non-unit', $ LROWS, NWIN-KS, ONE, $ WORK( PDW+NWIN*KS ), NWIN, $ WORK( IPW3+KS*LROWS ), LROWS) CALL DGEMM( 'No transpose', $ 'No transpose', LROWS, NWIN-KS, $ KS, ONE, Q((JLOC1-1)*LLDQ+ILOC), $ LLDQ, WORK(PDW+NWIN*KS+NWIN-KS), $ NWIN, ONE, WORK( IPW3+KS*LROWS ), $ LROWS ) * * Copy workspace to Q. * CALL DLAMOV( 'All', LROWS, NWIN, $ WORK(IPW3), LROWS, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ ) END IF 250 CONTINUE END IF END IF * IF( DIR.EQ.1 ) THEN IF ( LIHI.LT.N ) THEN * * Compute U21**T*T2 + U11**T*T1 in workspace. * IF( MOD(LIHI,NB).GT.0 ) THEN INDX = LIHI + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN ILOC1 = INDXG2L( I+NWIN-KS, NB, MYROW, $ DESCT( RSRC_ ), NPROW ) LCOLS = MOD( MIN( NB-MOD(LIHI,NB), $ N-LIHI ), NB ) CALL DLAMOV( 'All', KS, LCOLS, $ T((JLOC-1)*LLDT+ILOC1), LLDT, $ WORK(IPW3), NWIN ) CALL DTRMM( 'Left', 'Upper', $ 'Transpose', 'Non-unit', KS, $ LCOLS, ONE, WORK( PDW+NWIN-KS ), $ NWIN, WORK(IPW3), NWIN ) CALL DGEMM( 'Transpose', $ 'No transpose', KS, LCOLS, $ NWIN-KS, ONE, WORK(PDW), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT, ONE, $ WORK(IPW3), NWIN ) * * Compute U12**T*T1 + U22**T*T2 in * workspace. * CALL DLAMOV( 'All', NWIN-KS, LCOLS, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( IPW3+KS ), NWIN ) CALL DTRMM( 'Left', 'Lower', $ 'Transpose', 'Non-unit', $ NWIN-KS, LCOLS, ONE, $ WORK( PDW+NWIN*KS ), NWIN, $ WORK( IPW3+KS ), NWIN ) CALL DGEMM( 'Transpose', $ 'No Transpose', NWIN-KS, LCOLS, $ KS, ONE, $ WORK( PDW+NWIN*KS+NWIN-KS ), $ NWIN, T((JLOC-1)*LLDT+ILOC1), $ LLDT, ONE, WORK( IPW3+KS ), $ NWIN ) * * Copy workspace to T. * CALL DLAMOV( 'All', NWIN, LCOLS, $ WORK(IPW3), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF END IF INDXS = ICEIL(LIHI,NB)*NB + 1 DO 260 INDX = INDXS, N, NB CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN * * Compute U21**T*T2 + U11**T*T1 in * workspace. * ILOC1 = INDXG2L( I+NWIN-KS, NB, $ MYROW, DESCT( RSRC_ ), NPROW ) LCOLS = MIN( NB, N-INDX+1 ) CALL DLAMOV( 'All', KS, LCOLS, $ T((JLOC-1)*LLDT+ILOC1), LLDT, $ WORK(IPW3), NWIN ) CALL DTRMM( 'Left', 'Upper', $ 'Transpose', 'Non-unit', KS, $ LCOLS, ONE, $ WORK( PDW+NWIN-KS ), NWIN, $ WORK(IPW3), NWIN ) CALL DGEMM( 'Transpose', $ 'No transpose', KS, LCOLS, $ NWIN-KS, ONE, WORK(PDW), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT, ONE, $ WORK(IPW3), NWIN ) * * Compute U12**T*T1 + U22**T*T2 in * workspace. * CALL DLAMOV( 'All', NWIN-KS, LCOLS, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( IPW3+KS ), NWIN ) CALL DTRMM( 'Left', 'Lower', $ 'Transpose', 'Non-unit', $ NWIN-KS, LCOLS, ONE, $ WORK( PDW+NWIN*KS ), NWIN, $ WORK( IPW3+KS ), NWIN ) CALL DGEMM( 'Transpose', $ 'No Transpose', NWIN-KS, LCOLS, $ KS, ONE, $ WORK( PDW+NWIN*KS+NWIN-KS ), $ NWIN, T((JLOC-1)*LLDT+ILOC1), $ LLDT, ONE, WORK(IPW3+KS), NWIN ) * * Copy workspace to T. * CALL DLAMOV( 'All', NWIN, LCOLS, $ WORK(IPW3), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 260 CONTINUE END IF END IF END IF ELSEIF( FLOPS.NE.0 ) THEN * * Update off-diagonal blocks and Q using the pipelined * elementary transformations. * IF( DIR.EQ.2 ) THEN DO 270 INDX = 1, I-1, NB CALL INFOG2L( INDX, I, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN LROWS = MIN(NB,I-INDX) CALL BDLAAPP( 1, LROWS, NWIN, NCB, $ T((JLOC-1)*LLDT+ILOC ), LLDT, NITRAF, $ IWORK(IPIW), WORK( IPW2 ), $ WORK(IPW3) ) END IF 270 CONTINUE IF( WANTQ ) THEN DO 280 INDX = 1, N, NB CALL INFOG2L( INDX, I, DESCQ, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN LROWS = MIN(NB,N-INDX+1) CALL BDLAAPP( 1, LROWS, NWIN, NCB, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ, NITRAF, $ IWORK(IPIW), WORK( IPW2 ), $ WORK(IPW3) ) END IF 280 CONTINUE END IF END IF IF( DIR.EQ.1 ) THEN IF( LIHI.LT.N ) THEN IF( MOD(LIHI,NB).GT.0 ) THEN INDX = LIHI + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN LCOLS = MOD( MIN( NB-MOD(LIHI,NB), $ N-LIHI ), NB ) CALL BDLAAPP( 0, NWIN, LCOLS, NCB, $ T((JLOC-1)*LLDT+ILOC), LLDT, NITRAF, $ IWORK(IPIW), WORK( IPW2 ), $ WORK(IPW3) ) END IF END IF INDXS = ICEIL(LIHI,NB)*NB + 1 DO 290 INDX = INDXS, N, NB CALL INFOG2L( I, INDX, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN LCOLS = MIN( NB, N-INDX+1 ) CALL BDLAAPP( 0, NWIN, LCOLS, NCB, $ T((JLOC-1)*LLDT+ILOC), LLDT, NITRAF, $ IWORK(IPIW), WORK( IPW2 ), $ WORK(IPW3) ) END IF 290 CONTINUE END IF END IF END IF * * If I was not involved in the updates for the current * window or the window was fully processed, I go here and * try again for the next window. * 295 CONTINUE * * Update LIHI and LIHI depending on the number of * eigenvalues really moved - for on-diagonal processes we * do this update only once since each on-diagonal process * is only involved with one window at one time. The * indicies are updated in three cases: * 1) When some reordering was really performed * -- indicated by BUFFLEN > 0. * 2) When no selected eigenvalues was found in the * current window -- indicated by KS = 0. * 3) When some selected eigenvalues was found in the * current window but no one of them was moved * (KS > 0 and BUFFLEN = 0) * False index updating is avoided by sometimes setting * KS = -1. This will affect processors involved in more * than one window and where the first one ends up with * KS = 0 and for the second one is done already. * IF( MYROW.EQ.RSRC.AND.MYCOL.EQ.CSRC ) THEN IF( DIR.EQ.2 ) THEN IF( BUFFLEN.NE.0 .OR. KS.EQ.0 .OR. $ ( BUFFLEN.EQ.0 .AND. KS.GT.0 ) ) $ LIHI = I + KS - 1 IWORK( ILIHI+WINDOW-1 ) = LIHI IF( .NOT. LIHI.GE.LILO+LSEL ) THEN LILO = LILO + LSEL IWORK( ILILO+WINDOW-1 ) = LILO END IF END IF ELSEIF( MYROW.EQ.RSRC .AND. DIR.EQ.1 ) THEN IF( BUFFLEN.NE.0 .OR. KS.EQ.0 .OR. $ ( BUFFLEN.EQ.0 .AND. KS.GT.0 ) ) $ LIHI = I + KS - 1 IWORK( ILIHI+WINDOW-1 ) = LIHI IF( .NOT. LIHI.GE.LILO+LSEL ) THEN LILO = LILO + LSEL IWORK( ILILO+WINDOW-1 ) = LILO END IF ELSEIF( MYCOL.EQ.CSRC .AND. DIR.EQ.2 ) THEN IF( BUFFLEN.NE.0 .OR. KS.EQ.0 .OR. $ ( BUFFLEN.EQ.0 .AND. KS.GT.0 ) ) $ LIHI = I + KS - 1 IWORK( ILIHI+WINDOW-1 ) = LIHI IF( .NOT. LIHI.GE.LILO+LSEL ) THEN LILO = LILO + LSEL IWORK( ILILO+WINDOW-1 ) = LILO END IF END IF * 112 CONTINUE * * End of direction loop for updates with respect to local * reordering. * 1111 CONTINUE * * Associate each process with one of the corresponding * computational windows such that the test for another round * of local reordering is carried out properly. Since the * column updates were computed after the row updates, it is * sufficient to test for changing the association to the * window in the corresponding process row. * DO 113 WINDOW = 1, NMWIN2 RSRC = IWORK( IRSRC + WINDOW - 1 ) IF( MYROW.EQ.RSRC .AND. (.NOT. LIHI.GE.LILO+LSEL ) ) THEN LILO = IWORK( ILILO + WINDOW - 1 ) LIHI = IWORK( ILIHI + WINDOW - 1 ) LSEL = IWORK( ILSEL + WINDOW - 1 ) END IF 113 CONTINUE * * End While ( LIHI >= LILO + LSEL ) ROUND = ROUND + 1 IF( FIRST ) FIRST = .FALSE. GO TO 130 END IF * * All processors excluded from the local reordering go here. * 114 CONTINUE * * Barrier to collect the processes before proceeding. * CALL BLACS_BARRIER( ICTXT, 'All' ) * * Compute global maximum of IERR so that we know if some process * experienced a failure in the reordering. * MYIERR = IERR IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, IERR, 1, -1, $ -1, -1, -1, -1 ) * IF( IERR.NE.0 ) THEN * * When calling BDTREXC, the block at position I+KKS-1 failed * to swap. * IF( MYIERR.NE.0 ) INFO = MAX(1,I+KKS-1) IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, -1, $ -1, -1, -1, -1 ) GO TO 300 END IF * * Now, for each compuational window, move the selected * eigenvalues across the process border. Do this by forming the * processors into groups of four working together to bring the * window over the border. The processes are numbered as follows * * 1 | 2 * --+-- * 3 | 4 * * where '|' and '-' denotes the process (and block) borders. * This implies that the cluster to be reordered over the border * is held by process 4, process 1 will receive the cluster after * the reordering, process 3 holds the local (2,1)th element of a * 2-by-2 diagonal block located on the block border and process 2 * holds the closest off-diagonal part of the window that is * affected by the cross-border reordering. * * The active window is now ( I : LIHI[4], I : LIHI[4] ), where * I = MAX( ILO, LIHI - 2*MOD(LIHI,NB) ). If this active window is * too large compared to the value of PARA( 6 ), it will be * truncated in both ends such that a maximum of PARA( 6 ) * eigenvalues is reordered across the border this time. * * The active window will be collected and built in workspace at * process 1 and 4, which both compute the reordering and return * the updated parts to the corresponding processes 2-3. Next, the * accumulated transformations are broadcasted for updates in the * block rows and column that corresponds to the process rows and * columns where process 1 and 4 reside. * * The off-diagonal blocks are updated by the processes receiving * from the broadcasts of the orthogonal transformations. Since * the active window is split over the process borders, the * updates of T and Q requires that stripes of block rows of * columns are exchanged between neighboring processes in the * corresponding process rows and columns. * * First, form each group of processors involved in the * crossborder reordering. Do this in two (or three) phases: * 1) Reorder each odd window over the border. * 2) Reorder each even window over the border. * 3) Reorder the last odd window over the border, if it was not * processed in the first phase. * * When reordering the odd windows over the border, we must make * sure that no process row or column is involved in both the * first and the last window at the same time. This happens when * the total number of windows is odd, greater than one and equal * to the minumum process mesh dimension. Therefore the last odd * window may be reordered over the border at last. * LASTWAIT = NMWIN2.GT.1 .AND. MOD(NMWIN2,2).EQ.1 .AND. $ NMWIN2.EQ.MIN(NPROW,NPCOL) * LAST = 0 308 CONTINUE IF( LASTWAIT ) THEN IF( LAST.EQ.0 ) THEN WIN0S = 1 WIN0E = 2 WINE = NMWIN2 - 1 ELSE WIN0S = NMWIN2 WIN0E = NMWIN2 WINE = NMWIN2 END IF ELSE WIN0S = 1 WIN0E = 2 WINE = NMWIN2 END IF DO 310 WINDOW0 = WIN0S, WIN0E DO 320 WINDOW = WINDOW0, WINE, 2 * * Define the process holding the down-right part of the * window. * RSRC4 = IWORK(IRSRC+WINDOW-1) CSRC4 = IWORK(ICSRC+WINDOW-1) * * Define the other processes in the group of four. * RSRC3 = RSRC4 CSRC3 = MOD( CSRC4 - 1 + NPCOL, NPCOL ) RSRC2 = MOD( RSRC4 - 1 + NPROW, NPROW ) CSRC2 = CSRC4 RSRC1 = RSRC2 CSRC1 = CSRC3 IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) .OR. $ ( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN * * Compute the correct active window - for reordering * into a block that has not been active at all before, * we try to reorder as many of our eigenvalues over the * border as possible without knowing of the situation on * the other side - this may cause very few eigenvalues * to be reordered over the border this time (perhaps not * any) but this should be an initial problem. Anyway, * the bottom-right position of the block will be at * position LIHIC. * IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN LIHI4 = ( IWORK( ILILO + WINDOW - 1 ) + $ IWORK( ILIHI + WINDOW - 1 ) ) / 2 LIHIC = MIN(LIHI4,(ICEIL(LIHI4,NB)-1)*NB+WNEICR) * * Fix LIHIC to avoid that bottom of window cuts * 2-by-2 block and make sure all processors in the * group knows about the correct value. * IF( (.NOT. LIHIC.LE.NB) .AND. LIHIC.LT.N ) THEN ILOC = INDXG2L( LIHIC+1, NB, MYROW, $ DESCT( RSRC_ ), NPROW ) JLOC = INDXG2L( LIHIC, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) IF( T( (JLOC-1)*LLDT+ILOC ).NE.ZERO ) THEN IF( MOD( LIHIC, NB ).EQ.1 .OR. $ ( MOD( LIHIC, NB ).EQ.2 .AND. $ SELECT(LIHIC-2).EQ.0 ) ) $ THEN LIHIC = LIHIC + 1 ELSE LIHIC = LIHIC - 1 END IF END IF END IF IF( RSRC4.NE.RSRC1 .OR. CSRC4.NE.CSRC1 ) $ CALL IGESD2D( ICTXT, 1, 1, LIHIC, 1, RSRC1, $ CSRC1 ) IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) $ CALL IGESD2D( ICTXT, 1, 1, LIHIC, 1, RSRC2, $ CSRC2 ) IF( RSRC4.NE.RSRC3 .OR. CSRC4.NE.CSRC3 ) $ CALL IGESD2D( ICTXT, 1, 1, LIHIC, 1, RSRC3, $ CSRC3 ) END IF IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC4.NE.RSRC1 .OR. CSRC4.NE.CSRC1 ) $ CALL IGERV2D( ICTXT, 1, 1, LIHIC, 1, RSRC4, $ CSRC4 ) END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) $ CALL IGERV2D( ICTXT, 1, 1, LIHIC, 1, RSRC4, $ CSRC4 ) END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN IF( RSRC4.NE.RSRC3 .OR. CSRC4.NE.CSRC3 ) $ CALL IGERV2D( ICTXT, 1, 1, LIHIC, 1, RSRC4, $ CSRC4 ) END IF * * Avoid going over the border with the first window if * it resides in the block where the last global position * T(ILO,ILO) is or ILO has been updated to point to a * position right of T(LIHIC,LIHIC). * SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) * * Decide I, where to put top of window, such that top of * window does not cut 2-by-2 block. Make sure that we do * not end up in a situation where a 2-by-2 block * splitted on the border is left in its original place * -- this can cause infinite loops. * Remedy: make sure that the part of the window that * resides left to the border is at least of dimension * two (2) in case we have 2-by-2 blocks in top of the * cross border window. * * Also make sure all processors in the group knows about * the correct value of I. When skipping the crossborder * reordering, just set I = LIHIC. * IF( .NOT. SKIP1CR ) THEN IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( WINDOW.EQ.1 ) THEN LIHI1 = ILO ELSE LIHI1 = IWORK( ILIHI + WINDOW - 2 ) END IF I = MAX( LIHI1, $ MIN( LIHIC-2*MOD(LIHIC,NB) + 1, $ (ICEIL(LIHIC,NB)-1)*NB - 1 ) ) ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I-1, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) IF( T( (JLOC-1)*LLDT+ILOC ).NE.ZERO ) $ I = I - 1 IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) $ CALL IGESD2D( ICTXT, 1, 1, I, 1, RSRC4, $ CSRC4 ) IF( RSRC1.NE.RSRC2 .OR. CSRC1.NE.CSRC2 ) $ CALL IGESD2D( ICTXT, 1, 1, I, 1, RSRC2, $ CSRC2 ) IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) $ CALL IGESD2D( ICTXT, 1, 1, I, 1, RSRC3, $ CSRC3 ) END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN IF( RSRC1.NE.RSRC2 .OR. CSRC1.NE.CSRC2 ) $ CALL IGERV2D( ICTXT, 1, 1, I, 1, RSRC1, $ CSRC1 ) END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) $ CALL IGERV2D( ICTXT, 1, 1, I, 1, RSRC1, $ CSRC1 ) END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) $ CALL IGERV2D( ICTXT, 1, 1, I, 1, RSRC1, $ CSRC1 ) END IF ELSE I = LIHIC END IF * * Finalize computation of window size: active window is * now (I:LIHIC,I:LIHIC). * NWIN = LIHIC - I + 1 KS = 0 * * Skip rest of this part if appropriate. * IF( SKIP1CR ) GO TO 360 * * Divide workspace -- put active window in * WORK(IPW2:IPW2+NWIN**2-1) and orthogonal * transformations in WORK(IPW3:...). * CALL DLASET( 'All', NWIN, NWIN, ZERO, ZERO, $ WORK( IPW2 ), NWIN ) * PITRAF = IPIW IPW3 = IPW2 + NWIN*NWIN PDTRAF = IPW3 * * Exchange the current view of SELECT for the active * window between process 1 and 4 to make sure that * exactly the same job is performed for both processes. * IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) THEN ILEN4 = MOD(LIHIC,NB) SELI4 = ICEIL(I,NB)*NB+1 ILEN1 = NWIN - ILEN4 IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN CALL IGESD2D( ICTXT, ILEN1, 1, SELECT(I), $ ILEN1, RSRC4, CSRC4 ) CALL IGERV2D( ICTXT, ILEN4, 1, SELECT(SELI4), $ ILEN4, RSRC4, CSRC4 ) END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN CALL IGESD2D( ICTXT, ILEN4, 1, SELECT(SELI4), $ ILEN4, RSRC1, CSRC1 ) CALL IGERV2D( ICTXT, ILEN1, 1, SELECT(I), $ ILEN1, RSRC1, CSRC1 ) END IF END IF * * Form the active window by a series of point-to-point * sends and receives. * DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) CALL DLAMOV( 'All', DIM1, DIM1, $ T((JLOC-1)*LLDT+ILOC), LLDT, WORK(IPW2), $ NWIN ) IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) THEN CALL DGESD2D( ICTXT, DIM1, DIM1, $ WORK(IPW2), NWIN, RSRC4, CSRC4 ) CALL DGERV2D( ICTXT, DIM4, DIM4, $ WORK(IPW2+DIM1*NWIN+DIM1), NWIN, RSRC4, $ CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN ILOC = INDXG2L( I+DIM1, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I+DIM1, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) CALL DLAMOV( 'All', DIM4, DIM4, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK(IPW2+DIM1*NWIN+DIM1), NWIN ) IF( RSRC4.NE.RSRC1 .OR. CSRC4.NE.CSRC1 ) THEN CALL DGESD2D( ICTXT, DIM4, DIM4, $ WORK(IPW2+DIM1*NWIN+DIM1), NWIN, RSRC1, $ CSRC1 ) CALL DGERV2D( ICTXT, DIM1, DIM1, $ WORK(IPW2), NWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I+DIM1, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) CALL DLAMOV( 'All', DIM1, DIM4, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK(IPW2+DIM1*NWIN), NWIN ) IF( RSRC2.NE.RSRC1 .OR. CSRC2.NE.CSRC1 ) THEN CALL DGESD2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN IF( RSRC2.NE.RSRC4 .OR. CSRC2.NE.CSRC4 ) THEN CALL DGESD2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN ILOC = INDXG2L( I+DIM1, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I+DIM1-1, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) CALL DLAMOV( 'All', 1, 1, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN ) IF( RSRC3.NE.RSRC1 .OR. CSRC3.NE.CSRC1 ) THEN CALL DGESD2D( ICTXT, 1, 1, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN, $ RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN IF( RSRC3.NE.RSRC4 .OR. CSRC3.NE.CSRC4 ) THEN CALL DGESD2D( ICTXT, 1, 1, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN, $ RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC1.NE.RSRC2 .OR. CSRC1.NE.CSRC2 ) THEN CALL DGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC2, $ CSRC2 ) END IF IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) THEN CALL DGERV2D( ICTXT, 1, 1, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN, $ RSRC3, CSRC3 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) THEN CALL DGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC2, $ CSRC2 ) END IF IF( RSRC4.NE.RSRC3 .OR. CSRC4.NE.CSRC3 ) THEN CALL DGERV2D( ICTXT, 1, 1, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN, $ RSRC3, CSRC3 ) END IF END IF * * Compute the reordering (just as in the total local * case) and accumulate the transformations (ONLY * ON-DIAGONAL PROCESSES). * IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN PAIR = .FALSE. DO 330 K = I, LIHIC IF( PAIR ) THEN PAIR = .FALSE. ELSE SWAP = SELECT( K ).NE.0 IF( K.LT.LIHIC ) THEN ELEM = WORK(IPW2+(K-I)*NWIN+K-I+1) IF( ELEM.NE.ZERO ) $ PAIR = .TRUE. END IF IF( SWAP ) THEN KS = KS + 1 * * Swap the K-th block to position I+KS-1. * IERR = 0 KK = K - I + 1 KKS = KS IF( KK.NE.KS ) THEN NITRAF = LIWORK - PITRAF + 1 NDTRAF = LWORK - PDTRAF + 1 CALL BDTREXC( NWIN, WORK(IPW2), NWIN, $ KK, KKS, NITRAF, IWORK( PITRAF ), $ NDTRAF, WORK( PDTRAF ), $ WORK(IPW1), IERR ) PITRAF = PITRAF + NITRAF PDTRAF = PDTRAF + NDTRAF * * Update array SELECT. * IF ( PAIR ) THEN DO 340 J = I+KK-1, I+KKS, -1 SELECT(J+1) = SELECT(J-1) 340 CONTINUE SELECT(I+KKS-1) = 1 SELECT(I+KKS) = 1 ELSE DO 350 J = I+KK-1, I+KKS, -1 SELECT(J) = SELECT(J-1) 350 CONTINUE SELECT(I+KKS-1) = 1 END IF * IF ( IERR.EQ.1 .OR. IERR.EQ.2 ) THEN * IF ( IERR.EQ.2 ) THEN SELECT( I+KKS-3 ) = 1 SELECT( I+KKS-1 ) = 0 KKS = KKS + 1 END IF * GO TO 360 END IF KS = KKS END IF IF( PAIR ) $ KS = KS + 1 END IF END IF 330 CONTINUE END IF 360 CONTINUE * * Save information about the reordering. * IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN IBUFF( 1 ) = I IBUFF( 2 ) = NWIN IBUFF( 3 ) = PITRAF IBUFF( 4 ) = KS IBUFF( 5 ) = PDTRAF IBUFF( 6 ) = NDTRAF ILEN = PITRAF - IPIW + 1 DLEN = PDTRAF - IPW3 + 1 IBUFF( 7 ) = ILEN IBUFF( 8 ) = DLEN * * Put reordered data back into global matrix if a * reordering took place. * IF( .NOT. SKIP1CR ) THEN IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) CALL DLAMOV( 'All', DIM1, DIM1, WORK(IPW2), $ NWIN, T((JLOC-1)*LLDT+ILOC), LLDT ) END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN ILOC = INDXG2L( I+DIM1, NB, MYROW, $ DESCT( RSRC_ ), NPROW ) JLOC = INDXG2L( I+DIM1, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) CALL DLAMOV( 'All', DIM4, DIM4, $ WORK(IPW2+DIM1*NWIN+DIM1), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF END IF END IF * * Break if appropriate -- IBUFF(3:8) may now contain * nonsens, but that's no problem. The processors outside * the cross border group only needs to know about I and * NWIN to get a correct value of SKIP1CR (see below) and * to skip the cross border updates if necessary. * IF( WINDOW.EQ.1 .AND. SKIP1CR ) GO TO 325 * * Return reordered data to process 2 and 3. * IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) THEN CALL DGESD2D( ICTXT, 1, 1, $ WORK( IPW2+(DIM1-1)*NWIN+DIM1 ), NWIN, $ RSRC3, CSRC3 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) THEN CALL DGESD2D( ICTXT, DIM1, DIM4, $ WORK( IPW2+DIM1*NWIN), NWIN, RSRC2, $ CSRC2 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I+DIM1, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) IF( RSRC2.NE.RSRC4 .OR. CSRC2.NE.CSRC4 ) THEN CALL DGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC4, CSRC4 ) END IF CALL DLAMOV( 'All', DIM1, DIM4, $ WORK( IPW2+DIM1*NWIN ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN ILOC = INDXG2L( I+DIM1, NB, MYROW, $ DESCT( RSRC_ ), NPROW ) JLOC = INDXG2L( I+DIM1-1, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) IF( RSRC3.NE.RSRC1 .OR. CSRC3.NE.CSRC1 ) THEN CALL DGERV2D( ICTXT, 1, 1, $ WORK( IPW2+(DIM1-1)*NWIN+DIM1 ), NWIN, $ RSRC1, CSRC1 ) END IF T((JLOC-1)*LLDT+ILOC) = $ WORK( IPW2+(DIM1-1)*NWIN+DIM1 ) END IF END IF * 325 CONTINUE * 320 CONTINUE * * For the crossborder updates, we use the same directions as * in the local reordering case above. * DO 2222 DIR = 1, 2 * * Broadcast information about the reordering. * DO 321 WINDOW = WINDOW0, WINE, 2 RSRC4 = IWORK(IRSRC+WINDOW-1) CSRC4 = IWORK(ICSRC+WINDOW-1) RSRC1 = MOD( RSRC4 - 1 + NPROW, NPROW ) CSRC1 = MOD( CSRC4 - 1 + NPCOL, NPCOL ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) $ CALL IGEBS2D( ICTXT, 'Row', TOP, 8, 1, $ IBUFF, 8 ) IF( NPROW.GT.1 .AND. DIR.EQ.2 ) $ CALL IGEBS2D( ICTXT, 'Col', TOP, 8, 1, $ IBUFF, 8 ) SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) ELSEIF( MYROW.EQ.RSRC1 .OR. MYCOL.EQ.CSRC1 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 .AND. $ MYROW.EQ.RSRC1 ) THEN CALL IGEBR2D( ICTXT, 'Row', TOP, 8, 1, $ IBUFF, 8, RSRC1, CSRC1 ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) BUFFLEN = ILEN + DLEN IPW3 = IPW2 + NWIN*NWIN DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 LIHIC = NWIN + I - 1 SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 .AND. $ MYCOL.EQ.CSRC1 ) THEN CALL IGEBR2D( ICTXT, 'Col', TOP, 8, 1, $ IBUFF, 8, RSRC1, CSRC1 ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) BUFFLEN = ILEN + DLEN IPW3 = IPW2 + NWIN*NWIN DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 LIHIC = NWIN + I - 1 SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) END IF END IF IF( RSRC1.NE.RSRC4 ) THEN IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) $ CALL IGEBS2D( ICTXT, 'Row', TOP, 8, 1, $ IBUFF, 8 ) SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) ELSEIF( MYROW.EQ.RSRC4 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) THEN CALL IGEBR2D( ICTXT, 'Row', TOP, 8, 1, $ IBUFF, 8, RSRC4, CSRC4 ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) BUFFLEN = ILEN + DLEN IPW3 = IPW2 + NWIN*NWIN DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 LIHIC = NWIN + I - 1 SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) END IF END IF END IF IF( CSRC1.NE.CSRC4 ) THEN IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( NPROW.GT.1 .AND. DIR.EQ.2 ) $ CALL IGEBS2D( ICTXT, 'Col', TOP, 8, 1, $ IBUFF, 8 ) SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) ELSEIF( MYCOL.EQ.CSRC4 ) THEN IF( NPROW.GT.1 .AND. DIR.EQ.2 ) THEN CALL IGEBR2D( ICTXT, 'Col', TOP, 8, 1, $ IBUFF, 8, RSRC4, CSRC4 ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) BUFFLEN = ILEN + DLEN IPW3 = IPW2 + NWIN*NWIN DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 LIHIC = NWIN + I - 1 SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) END IF END IF END IF * * Skip rest of broadcasts and updates if appropriate. * IF( SKIP1CR ) GO TO 326 * * Broadcast the orthogonal transformations. * IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( (NPROW.GT.1 .AND. DIR.EQ.2) .OR. $ (NPCOL.GT.1 .AND. DIR.EQ.1) ) THEN DO 370 INDX = 1, ILEN WORK( BUFFER+INDX-1 ) = $ DBLE( IWORK(IPIW+INDX-1) ) 370 CONTINUE CALL DLAMOV( 'All', DLEN, 1, WORK( IPW3 ), $ DLEN, WORK(BUFFER+ILEN), DLEN ) END IF IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) THEN CALL DGEBS2D( ICTXT, 'Row', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN ) END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 ) THEN CALL DGEBS2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN ) END IF ELSEIF( MYROW.EQ.RSRC1 .OR. MYCOL.EQ.CSRC1 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 .AND. $ MYROW.EQ.RSRC1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN CALL DGEBR2D( ICTXT, 'Row', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC1, CSRC1 ) END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 .AND. $ MYCOL.EQ.CSRC1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN CALL DGEBR2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC1, CSRC1 ) END IF IF( (NPCOL.GT.1.AND.DIR.EQ.1.AND.MYROW.EQ.RSRC1) $ .OR. (NPROW.GT.1.AND.DIR.EQ.2.AND. $ MYCOL.EQ.CSRC1) ) THEN DO 380 INDX = 1, ILEN IWORK(IPIW+INDX-1) = $ INT( WORK( BUFFER+INDX-1 ) ) 380 CONTINUE CALL DLAMOV( 'All', DLEN, 1, $ WORK( BUFFER+ILEN ), DLEN, $ WORK( IPW3 ), DLEN ) END IF END IF IF( RSRC1.NE.RSRC4 ) THEN IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) THEN DO 390 INDX = 1, ILEN WORK( BUFFER+INDX-1 ) = $ DBLE( IWORK(IPIW+INDX-1) ) 390 CONTINUE CALL DLAMOV( 'All', DLEN, 1, WORK( IPW3 ), $ DLEN, WORK(BUFFER+ILEN), DLEN ) CALL DGEBS2D( ICTXT, 'Row', TOP, BUFFLEN, $ 1, WORK(BUFFER), BUFFLEN ) END IF ELSEIF( MYROW.EQ.RSRC4 .AND. DIR.EQ.1 .AND. $ NPCOL.GT.1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN CALL DGEBR2D( ICTXT, 'Row', TOP, BUFFLEN, $ 1, WORK(BUFFER), BUFFLEN, RSRC4, CSRC4 ) DO 400 INDX = 1, ILEN IWORK(IPIW+INDX-1) = $ INT( WORK( BUFFER+INDX-1 ) ) 400 CONTINUE CALL DLAMOV( 'All', DLEN, 1, $ WORK( BUFFER+ILEN ), DLEN, $ WORK( IPW3 ), DLEN ) END IF END IF IF( CSRC1.NE.CSRC4 ) THEN IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( NPROW.GT.1 .AND. DIR.EQ.2 ) THEN DO 395 INDX = 1, ILEN WORK( BUFFER+INDX-1 ) = $ DBLE( IWORK(IPIW+INDX-1) ) 395 CONTINUE CALL DLAMOV( 'All', DLEN, 1, WORK( IPW3 ), $ DLEN, WORK(BUFFER+ILEN), DLEN ) CALL DGEBS2D( ICTXT, 'Col', TOP, BUFFLEN, $ 1, WORK(BUFFER), BUFFLEN ) END IF ELSEIF( MYCOL.EQ.CSRC4 .AND. DIR.EQ.2 .AND. $ NPROW.GT.1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN CALL DGEBR2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC4, CSRC4 ) DO 402 INDX = 1, ILEN IWORK(IPIW+INDX-1) = $ INT( WORK( BUFFER+INDX-1 ) ) 402 CONTINUE CALL DLAMOV( 'All', DLEN, 1, $ WORK( BUFFER+ILEN ), DLEN, $ WORK( IPW3 ), DLEN ) END IF END IF * 326 CONTINUE * 321 CONTINUE * * Compute crossborder updates. * DO 322 WINDOW = WINDOW0, WINE, 2 IF( WINDOW.EQ.1 .AND. SKIP1CR ) GO TO 327 RSRC4 = IWORK(IRSRC+WINDOW-1) CSRC4 = IWORK(ICSRC+WINDOW-1) RSRC1 = MOD( RSRC4 - 1 + NPROW, NPROW ) CSRC1 = MOD( CSRC4 - 1 + NPCOL, NPCOL ) * * Prepare workspaces for updates: * IPW3 holds now the orthogonal transformations * IPW4 holds the explicit orthogonal matrix, if formed * IPW5 holds the crossborder block column of T * IPW6 holds the crossborder block row of T * IPW7 holds the crossborder block column of Q * (if WANTQ=.TRUE.) * IPW8 points to the leftover workspace used as lhs in * matrix multiplications * IF( ((MYCOL.EQ.CSRC1.OR.MYCOL.EQ.CSRC4).AND.DIR.EQ.2) $ .OR. ((MYROW.EQ.RSRC1.OR.MYROW.EQ.RSRC4).AND. $ DIR.EQ.1)) THEN IPW4 = BUFFER IF( DIR.EQ.2 ) THEN IF( WANTQ ) THEN QROWS = NUMROC( N, NB, MYROW, DESCQ( RSRC_ ), $ NPROW ) ELSE QROWS = 0 END IF TROWS = NUMROC( I-1, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) ELSE QROWS = 0 TROWS = 0 END IF IF( DIR.EQ.1 ) THEN TCOLS = NUMROC( N - (I+DIM1-1), NB, MYCOL, $ CSRC4, NPCOL ) IF( MYCOL.EQ.CSRC4 ) TCOLS = TCOLS - DIM4 ELSE TCOLS = 0 END IF IPW5 = IPW4 + NWIN*NWIN IPW6 = IPW5 + TROWS * NWIN IF( WANTQ ) THEN IPW7 = IPW6 + NWIN * TCOLS IPW8 = IPW7 + QROWS * NWIN ELSE IPW8 = IPW6 + NWIN * TCOLS END IF END IF * * Let each process row and column involved in the updates * exchange data in T and Q with their neighbours. * IF( DIR.EQ.2 ) THEN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) THEN DO 410 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, I, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC1, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', TROWS, DIM1, $ T((JLOC1-1)*LLDT+ILOC), LLDT, $ WORK(IPW5), TROWS ) IF( NPCOL.GT.1 ) THEN EAST = MOD( MYCOL + 1, NPCOL ) CALL DGESD2D( ICTXT, TROWS, DIM1, $ WORK(IPW5), TROWS, RSRC, $ EAST ) CALL DGERV2D( ICTXT, TROWS, DIM4, $ WORK(IPW5+TROWS*DIM1), TROWS, $ RSRC, EAST ) END IF END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, I+DIM1, $ DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC4, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', TROWS, DIM4, $ T((JLOC4-1)*LLDT+ILOC), LLDT, $ WORK(IPW5+TROWS*DIM1), TROWS ) IF( NPCOL.GT.1 ) THEN WEST = MOD( MYCOL-1+NPCOL, NPCOL ) CALL DGESD2D( ICTXT, TROWS, DIM4, $ WORK(IPW5+TROWS*DIM1), TROWS, $ RSRC, WEST ) CALL DGERV2D( ICTXT, TROWS, DIM1, $ WORK(IPW5), TROWS, RSRC, $ WEST ) END IF END IF END IF 410 CONTINUE END IF END IF * IF( DIR.EQ.1 ) THEN IF( MYROW.EQ.RSRC1 .OR. MYROW.EQ.RSRC4 ) THEN DO 420 INDX = 1, NPCOL IF( MYROW.EQ.RSRC1 ) THEN IF( INDX.EQ.1 ) THEN CALL INFOG2L( I, LIHIC+1, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC1, JLOC, $ RSRC1, CSRC ) ELSE CALL INFOG2L( I, $ (ICEIL(LIHIC,NB)+(INDX-2))*NB+1, $ DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC, RSRC1, CSRC ) END IF IF( MYCOL.EQ.CSRC ) THEN CALL DLAMOV( 'All', DIM1, TCOLS, $ T((JLOC-1)*LLDT+ILOC1), LLDT, $ WORK(IPW6), NWIN ) IF( NPROW.GT.1 ) THEN SOUTH = MOD( MYROW + 1, NPROW ) CALL DGESD2D( ICTXT, DIM1, TCOLS, $ WORK(IPW6), NWIN, SOUTH, $ CSRC ) CALL DGERV2D( ICTXT, DIM4, TCOLS, $ WORK(IPW6+DIM1), NWIN, SOUTH, $ CSRC ) END IF END IF END IF IF( MYROW.EQ.RSRC4 ) THEN IF( INDX.EQ.1 ) THEN CALL INFOG2L( I+DIM1, LIHIC+1, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, ILOC4, $ JLOC, RSRC4, CSRC ) ELSE CALL INFOG2L( I+DIM1, $ (ICEIL(LIHIC,NB)+(INDX-2))*NB+1, $ DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC, RSRC4, CSRC ) END IF IF( MYCOL.EQ.CSRC ) THEN CALL DLAMOV( 'All', DIM4, TCOLS, $ T((JLOC-1)*LLDT+ILOC4), LLDT, $ WORK(IPW6+DIM1), NWIN ) IF( NPROW.GT.1 ) THEN NORTH = MOD( MYROW-1+NPROW, NPROW ) CALL DGESD2D( ICTXT, DIM4, TCOLS, $ WORK(IPW6+DIM1), NWIN, NORTH, $ CSRC ) CALL DGERV2D( ICTXT, DIM1, TCOLS, $ WORK(IPW6), NWIN, NORTH, $ CSRC ) END IF END IF END IF 420 CONTINUE END IF END IF * IF( DIR.EQ.2 ) THEN IF( WANTQ ) THEN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) THEN DO 430 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, I, DESCQ, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC1, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', QROWS, DIM1, $ Q((JLOC1-1)*LLDQ+ILOC), LLDQ, $ WORK(IPW7), QROWS ) IF( NPCOL.GT.1 ) THEN EAST = MOD( MYCOL + 1, NPCOL ) CALL DGESD2D( ICTXT, QROWS, DIM1, $ WORK(IPW7), QROWS, RSRC, $ EAST ) CALL DGERV2D( ICTXT, QROWS, DIM4, $ WORK(IPW7+QROWS*DIM1), $ QROWS, RSRC, EAST ) END IF END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, I+DIM1, $ DESCQ, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC4, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', QROWS, DIM4, $ Q((JLOC4-1)*LLDQ+ILOC), LLDQ, $ WORK(IPW7+QROWS*DIM1), QROWS ) IF( NPCOL.GT.1 ) THEN WEST = MOD( MYCOL-1+NPCOL, $ NPCOL ) CALL DGESD2D( ICTXT, QROWS, DIM4, $ WORK(IPW7+QROWS*DIM1), $ QROWS, RSRC, WEST ) CALL DGERV2D( ICTXT, QROWS, DIM1, $ WORK(IPW7), QROWS, RSRC, $ WEST ) END IF END IF END IF 430 CONTINUE END IF END IF END IF * 327 CONTINUE * 322 CONTINUE * DO 323 WINDOW = WINDOW0, WINE, 2 RSRC4 = IWORK(IRSRC+WINDOW-1) CSRC4 = IWORK(ICSRC+WINDOW-1) RSRC1 = MOD( RSRC4 - 1 + NPROW, NPROW ) CSRC1 = MOD( CSRC4 - 1 + NPCOL, NPCOL ) FLOPS = 0 IF( ((MYCOL.EQ.CSRC1.OR.MYCOL.EQ.CSRC4).AND.DIR.EQ.2) $ .OR. ((MYROW.EQ.RSRC1.OR.MYROW.EQ.RSRC4).AND. $ DIR.EQ.1) ) THEN * * Skip this part of the updates if appropriate. * IF( WINDOW.EQ.1 .AND. SKIP1CR ) GO TO 328 * * Count number of operations to decide whether to use * matrix-matrix multiplications for updating * off-diagonal parts or not. * NITRAF = PITRAF - IPIW ISHH = .FALSE. DO 405 K = 1, NITRAF IF( IWORK( IPIW + K - 1 ).LE.NWIN ) THEN FLOPS = FLOPS + 6 ELSE FLOPS = FLOPS + 11 ISHH = .TRUE. END IF 405 CONTINUE * * Perform updates in parallel. * IF( FLOPS.NE.0 .AND. $ ( 2*FLOPS*100 )/( 2*NWIN*NWIN ) .GE. MMULT ) $ THEN * CALL DLASET( 'All', NWIN, NWIN, ZERO, ONE, $ WORK( IPW4 ), NWIN ) WORK(IPW8) = DBLE(MYROW) WORK(IPW8+1) = DBLE(MYCOL) CALL BDLAAPP( 1, NWIN, NWIN, NCB, WORK( IPW4 ), $ NWIN, NITRAF, IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) * * Test if sparsity structure of orthogonal matrix * can be exploited (see below). * IF( ISHH .OR. DIM1.NE.KS .OR. DIM4.NE.KS ) THEN * * Update the columns of T and Q affected by the * reordering. * IF( DIR.EQ.2 ) THEN DO 440 INDX = 1, MIN(I-1,1+(NPROW-1)*NB), $ NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DGEMM( 'No transpose', $ 'No transpose', TROWS, DIM1, $ NWIN, ONE, WORK( IPW5 ), $ TROWS, WORK( IPW4 ), NWIN, $ ZERO, WORK(IPW8), TROWS ) CALL DLAMOV( 'All', TROWS, DIM1, $ WORK(IPW8), TROWS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DGEMM( 'No transpose', $ 'No transpose', TROWS, DIM4, $ NWIN, ONE, WORK( IPW5 ), $ TROWS, $ WORK( IPW4+NWIN*DIM1 ), $ NWIN, ZERO, WORK(IPW8), $ TROWS ) CALL DLAMOV( 'All', TROWS, DIM4, $ WORK(IPW8), TROWS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF 440 CONTINUE * IF( WANTQ ) THEN DO 450 INDX = 1, MIN(N,1+(NPROW-1)*NB), $ NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCQ, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DGEMM( 'No transpose', $ 'No transpose', QROWS, $ DIM1, NWIN, ONE, $ WORK( IPW7 ), QROWS, $ WORK( IPW4 ), NWIN, $ ZERO, WORK(IPW8), $ QROWS ) CALL DLAMOV( 'All', QROWS, $ DIM1, WORK(IPW8), QROWS, $ Q((JLOC-1)*LLDQ+ILOC), $ LLDQ ) END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, $ DESCQ, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC, $ CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DGEMM( 'No transpose', $ 'No transpose', QROWS, $ DIM4, NWIN, ONE, $ WORK( IPW7 ), QROWS, $ WORK( IPW4+NWIN*DIM1 ), $ NWIN, ZERO, WORK(IPW8), $ QROWS ) CALL DLAMOV( 'All', QROWS, $ DIM4, WORK(IPW8), QROWS, $ Q((JLOC-1)*LLDQ+ILOC), $ LLDQ ) END IF END IF 450 CONTINUE END IF END IF * * Update the rows of T affected by the * reordering. * IF( DIR.EQ.1 ) THEN IF ( LIHIC.LT.N ) THEN IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC4 $ .AND.MOD(LIHIC,NB).NE.0 ) THEN INDX = LIHIC + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC4 ) CALL DGEMM( 'Transpose', $ 'No Transpose', DIM1, TCOLS, $ NWIN, ONE, WORK(IPW4), NWIN, $ WORK( IPW6 ), NWIN, ZERO, $ WORK(IPW8), DIM1 ) CALL DLAMOV( 'All', DIM1, TCOLS, $ WORK(IPW8), DIM1, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF IF( MYROW.EQ.RSRC4.AND.MYCOL.EQ.CSRC4 $ .AND.MOD(LIHIC,NB).NE.0 ) THEN INDX = LIHIC + 1 CALL INFOG2L( I+DIM1, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC4, CSRC4 ) CALL DGEMM( 'Transpose', $ 'No Transpose', DIM4, TCOLS, $ NWIN, ONE, $ WORK( IPW4+DIM1*NWIN ), NWIN, $ WORK( IPW6), NWIN, ZERO, $ WORK(IPW8), DIM4 ) CALL DLAMOV( 'All', DIM4, TCOLS, $ WORK(IPW8), DIM4, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF INDXS = ICEIL(LIHIC,NB)*NB + 1 INDXE = MIN(N,INDXS+(NPCOL-2)*NB) DO 460 INDX = INDXS, INDXE, NB IF( MYROW.EQ.RSRC1 ) THEN CALL INFOG2L( I, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC1, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL DGEMM( 'Transpose', $ 'No Transpose', DIM1, $ TCOLS, NWIN, ONE, $ WORK( IPW4 ), NWIN, $ WORK( IPW6 ), NWIN, $ ZERO, WORK(IPW8), DIM1 ) CALL DLAMOV( 'All', DIM1, $ TCOLS, WORK(IPW8), DIM1, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF IF( MYROW.EQ.RSRC4 ) THEN CALL INFOG2L( I+DIM1, INDX, $ DESCT, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC4, $ CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL DGEMM( 'Transpose', $ 'No Transpose', DIM4, $ TCOLS, NWIN, ONE, $ WORK( IPW4+NWIN*DIM1 ), $ NWIN, WORK( IPW6 ), $ NWIN, ZERO, WORK(IPW8), $ DIM4 ) CALL DLAMOV( 'All', DIM4, $ TCOLS, WORK(IPW8), DIM4, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF 460 CONTINUE END IF END IF ELSE * * The NWIN-by-NWIN matrix U containing the * accumulated orthogonal transformations has * the following structure: * * [ U11 U12 ] * U = [ ], * [ U21 U22 ] * * where U21 is KS-by-KS upper triangular and * U12 is (NWIN-KS)-by-(NWIN-KS) lower * triangular. For reordering over the border * the structure is only exploited when the * border cuts the columns of U conformally with * the structure itself. This happens exactly * when all eigenvalues in the subcluster was * moved to the other side of the border and * fits perfectly in their new positions, i.e., * the reordering stops when the last eigenvalue * to cross the border is reordered to the * position closest to the border. Tested by * checking is KS = DIM1 = DIM4 (see above). * This should hold quite often. But this branch * is entered only if all involved eigenvalues * are real. * * Update the columns of T and Q affected by the * reordering. * * Compute T2*U21 + T1*U11 on the left side of * the border. * IF( DIR.EQ.2 ) THEN INDXE = MIN(I-1,1+(NPROW-1)*NB) DO 470 INDX = 1, INDXE, NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', TROWS, KS, $ WORK( IPW5+TROWS*DIM4), $ TROWS, WORK(IPW8), TROWS ) CALL DTRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', TROWS, KS, $ ONE, WORK( IPW4+DIM4 ), $ NWIN, WORK(IPW8), TROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', TROWS, KS, $ DIM4, ONE, WORK( IPW5 ), $ TROWS, WORK( IPW4 ), NWIN, $ ONE, WORK(IPW8), TROWS ) CALL DLAMOV( 'All', TROWS, KS, $ WORK(IPW8), TROWS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF * * Compute T1*U12 + T2*U22 on the right * side of the border. * IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', TROWS, DIM4, $ WORK(IPW5), TROWS, $ WORK( IPW8 ), TROWS ) CALL DTRMM( 'Right', 'Lower', $ 'No transpose', $ 'Non-unit', TROWS, DIM4, $ ONE, WORK( IPW4+NWIN*KS ), $ NWIN, WORK( IPW8 ), TROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', TROWS, DIM4, $ KS, ONE, $ WORK( IPW5+TROWS*DIM4), $ TROWS, $ WORK( IPW4+NWIN*KS+DIM4 ), $ NWIN, ONE, WORK( IPW8 ), $ TROWS ) CALL DLAMOV( 'All', TROWS, DIM4, $ WORK(IPW8), TROWS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF 470 CONTINUE IF( WANTQ ) THEN * * Compute Q2*U21 + Q1*U11 on the left * side of border. * INDXE = MIN(N,1+(NPROW-1)*NB) DO 480 INDX = 1, INDXE, NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCQ, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', QROWS, KS, $ WORK( IPW7+QROWS*DIM4), $ QROWS, WORK(IPW8), $ QROWS ) CALL DTRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', QROWS, $ KS, ONE, $ WORK( IPW4+DIM4 ), NWIN, $ WORK(IPW8), QROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', QROWS, $ KS, DIM4, ONE, $ WORK( IPW7 ), QROWS, $ WORK( IPW4 ), NWIN, ONE, $ WORK(IPW8), QROWS ) CALL DLAMOV( 'All', QROWS, KS, $ WORK(IPW8), QROWS, $ Q((JLOC-1)*LLDQ+ILOC), $ LLDQ ) END IF END IF * * Compute Q1*U12 + Q2*U22 on the right * side of border. * IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, $ DESCQ, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC, $ CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL DLAMOV( 'All', QROWS, $ DIM4, WORK(IPW7), QROWS, $ WORK( IPW8 ), QROWS ) CALL DTRMM( 'Right', 'Lower', $ 'No transpose', $ 'Non-unit', QROWS, $ DIM4, ONE, $ WORK( IPW4+NWIN*KS ), $ NWIN, WORK( IPW8 ), $ QROWS ) CALL DGEMM( 'No transpose', $ 'No transpose', QROWS, $ DIM4, KS, ONE, $ WORK(IPW7+QROWS*(DIM4)), $ QROWS, $ WORK(IPW4+NWIN*KS+DIM4), $ NWIN, ONE, WORK( IPW8 ), $ QROWS ) CALL DLAMOV( 'All', QROWS, $ DIM4, WORK(IPW8), QROWS, $ Q((JLOC-1)*LLDQ+ILOC), $ LLDQ ) END IF END IF 480 CONTINUE END IF END IF * IF( DIR.EQ.1 ) THEN IF ( LIHIC.LT.N ) THEN * * Compute U21**T*T2 + U11**T*T1 on the * upper side of the border. * IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC4 $ .AND.MOD(LIHIC,NB).NE.0 ) THEN INDX = LIHIC + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC4 ) CALL DLAMOV( 'All', KS, TCOLS, $ WORK( IPW6+DIM4 ), NWIN, $ WORK(IPW8), KS ) CALL DTRMM( 'Left', 'Upper', $ 'Transpose', 'Non-unit', $ KS, TCOLS, ONE, $ WORK( IPW4+DIM4 ), NWIN, $ WORK(IPW8), KS ) CALL DGEMM( 'Transpose', $ 'No transpose', KS, TCOLS, $ DIM4, ONE, WORK(IPW4), NWIN, $ WORK(IPW6), NWIN, ONE, $ WORK(IPW8), KS ) CALL DLAMOV( 'All', KS, TCOLS, $ WORK(IPW8), KS, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF * * Compute U12**T*T1 + U22**T*T2 on the * lower side of the border. * IF( MYROW.EQ.RSRC4.AND.MYCOL.EQ.CSRC4 $ .AND.MOD(LIHIC,NB).NE.0 ) THEN INDX = LIHIC + 1 CALL INFOG2L( I+DIM1, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC4, CSRC4 ) CALL DLAMOV( 'All', DIM4, TCOLS, $ WORK( IPW6 ), NWIN, $ WORK( IPW8 ), DIM4 ) CALL DTRMM( 'Left', 'Lower', $ 'Transpose', 'Non-unit', $ DIM4, TCOLS, ONE, $ WORK( IPW4+NWIN*KS ), NWIN, $ WORK( IPW8 ), DIM4 ) CALL DGEMM( 'Transpose', $ 'No Transpose', DIM4, TCOLS, $ KS, ONE, $ WORK( IPW4+NWIN*KS+DIM4 ), $ NWIN, WORK( IPW6+DIM1 ), NWIN, $ ONE, WORK( IPW8), DIM4 ) CALL DLAMOV( 'All', DIM4, TCOLS, $ WORK(IPW8), DIM4, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF * * Compute U21**T*T2 + U11**T*T1 on upper * side on border. * INDXS = ICEIL(LIHIC,NB)*NB+1 INDXE = MIN(N,INDXS+(NPCOL-2)*NB) DO 490 INDX = INDXS, INDXE, NB IF( MYROW.EQ.RSRC1 ) THEN CALL INFOG2L( I, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC1, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL DLAMOV( 'All', KS, TCOLS, $ WORK( IPW6+DIM4 ), NWIN, $ WORK(IPW8), KS ) CALL DTRMM( 'Left', 'Upper', $ 'Transpose', $ 'Non-unit', KS, $ TCOLS, ONE, $ WORK( IPW4+DIM4 ), NWIN, $ WORK(IPW8), KS ) CALL DGEMM( 'Transpose', $ 'No transpose', KS, $ TCOLS, DIM4, ONE, $ WORK(IPW4), NWIN, $ WORK(IPW6), NWIN, ONE, $ WORK(IPW8), KS ) CALL DLAMOV( 'All', KS, TCOLS, $ WORK(IPW8), KS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF * * Compute U12**T*T1 + U22**T*T2 on * lower side of border. * IF( MYROW.EQ.RSRC4 ) THEN CALL INFOG2L( I+DIM1, INDX, $ DESCT, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC4, $ CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL DLAMOV( 'All', DIM4, $ TCOLS, WORK( IPW6 ), $ NWIN, WORK( IPW8 ), $ DIM4 ) CALL DTRMM( 'Left', 'Lower', $ 'Transpose', $ 'Non-unit', DIM4, $ TCOLS, ONE, $ WORK( IPW4+NWIN*KS ), $ NWIN, WORK( IPW8 ), $ DIM4 ) CALL DGEMM( 'Transpose', $ 'No Transpose', DIM4, $ TCOLS, KS, ONE, $ WORK(IPW4+NWIN*KS+DIM4), $ NWIN, WORK( IPW6+DIM1 ), $ NWIN, ONE, WORK( IPW8), $ DIM4 ) CALL DLAMOV( 'All', DIM4, $ TCOLS, WORK(IPW8), DIM4, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF 490 CONTINUE END IF END IF END IF ELSEIF( FLOPS.NE.0 ) THEN * * Update off-diagonal blocks and Q using the * pipelined elementary transformations. Now we * have a delicate problem: how to do this without * redundant work? For now, we let the processes * involved compute the whole crossborder block * rows and column saving only the part belonging * to the corresponding side of the border. To make * this a realistic alternative, we have modified * the ratio r_flops (see Reference [2] above) to * give more favor to the ordinary matrix * multiplication. * IF( DIR.EQ.2 ) THEN INDXE = MIN(I-1,1+(NPROW-1)*NB) DO 500 INDX = 1, INDXE, NB CALL INFOG2L( INDX, I, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN CALL BDLAAPP( 1, TROWS, NWIN, NCB, $ WORK(IPW5), TROWS, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL DLAMOV( 'All', TROWS, DIM1, $ WORK(IPW5), TROWS, $ T((JLOC-1)*LLDT+ILOC ), LLDT ) END IF CALL INFOG2L( INDX, I+DIM1, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN IF( NPCOL.GT.1 ) $ CALL BDLAAPP( 1, TROWS, NWIN, NCB, $ WORK(IPW5), TROWS, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL DLAMOV( 'All', TROWS, DIM4, $ WORK(IPW5+TROWS*DIM1), TROWS, $ T((JLOC-1)*LLDT+ILOC ), LLDT ) END IF 500 CONTINUE IF( WANTQ ) THEN INDXE = MIN(N,1+(NPROW-1)*NB) DO 510 INDX = 1, INDXE, NB CALL INFOG2L( INDX, I, DESCQ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN CALL BDLAAPP( 1, QROWS, NWIN, NCB, $ WORK(IPW7), QROWS, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL DLAMOV( 'All', QROWS, DIM1, $ WORK(IPW7), QROWS, $ Q((JLOC-1)*LLDQ+ILOC ), LLDQ ) END IF CALL INFOG2L( INDX, I+DIM1, DESCQ, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN IF( NPCOL.GT.1 ) $ CALL BDLAAPP( 1, QROWS, NWIN, $ NCB, WORK(IPW7), QROWS, $ NITRAF, IWORK(IPIW), $ WORK( IPW3 ), WORK(IPW8) ) CALL DLAMOV( 'All', QROWS, DIM4, $ WORK(IPW7+QROWS*DIM1), QROWS, $ Q((JLOC-1)*LLDQ+ILOC ), LLDQ ) END IF 510 CONTINUE END IF END IF * IF( DIR.EQ.1 ) THEN IF( LIHIC.LT.N ) THEN INDX = LIHIC + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC.AND. $ MOD(LIHIC,NB).NE.0 ) THEN CALL BDLAAPP( 0, NWIN, TCOLS, NCB, $ WORK( IPW6 ), NWIN, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL DLAMOV( 'All', DIM1, TCOLS, $ WORK( IPW6 ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF CALL INFOG2L( I+DIM1, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC.AND. $ MOD(LIHIC,NB).NE.0 ) THEN IF( NPROW.GT.1 ) $ CALL BDLAAPP( 0, NWIN, TCOLS, NCB, $ WORK( IPW6 ), NWIN, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL DLAMOV( 'All', DIM4, TCOLS, $ WORK( IPW6+DIM1 ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF INDXS = ICEIL(LIHIC,NB)*NB + 1 INDXE = MIN(N,INDXS+(NPCOL-2)*NB) DO 520 INDX = INDXS, INDXE, NB CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN CALL BDLAAPP( 0, NWIN, TCOLS, NCB, $ WORK(IPW6), NWIN, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL DLAMOV( 'All', DIM1, TCOLS, $ WORK( IPW6 ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF CALL INFOG2L( I+DIM1, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN IF( NPROW.GT.1 ) $ CALL BDLAAPP( 0, NWIN, TCOLS, $ NCB, WORK(IPW6), NWIN, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL DLAMOV( 'All', DIM4, TCOLS, $ WORK( IPW6+DIM1 ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 520 CONTINUE END IF END IF END IF END IF * 328 CONTINUE * 323 CONTINUE * * End of loops over directions (DIR). * 2222 CONTINUE * * End of loops over diagonal blocks for reordering over the * block diagonal. * 310 CONTINUE LAST = LAST + 1 IF( LASTWAIT .AND. LAST.LT.2 ) GO TO 308 * * Barrier to collect the processes before proceeding. * CALL BLACS_BARRIER( ICTXT, 'All' ) * * Compute global maximum of IERR so that we know if some process * experienced a failure in the reordering. * MYIERR = IERR IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, IERR, 1, -1, $ -1, -1, -1, -1 ) * IF( IERR.NE.0 ) THEN * * When calling BDTREXC, the block at position I+KKS-1 failed * to swap. * IF( MYIERR.NE.0 ) INFO = MAX(1,I+KKS-1) IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, -1, $ -1, -1, -1, -1 ) GO TO 300 END IF * * Do a global update of the SELECT vector. * DO 530 K = 1, N RSRC = INDXG2P( K, NB, MYROW, DESCT( RSRC_ ), NPROW ) CSRC = INDXG2P( K, NB, MYCOL, DESCT( CSRC_ ), NPCOL ) IF( MYROW.NE.RSRC .OR. MYCOL.NE.CSRC ) $ SELECT( K ) = 0 530 CONTINUE IF( NPROCS.GT.1 ) $ CALL IGSUM2D( ICTXT, 'All', TOP, N, 1, SELECT, N, -1, -1 ) * * Find the global minumum of ILO and IHI. * ILO = ILO - 1 523 CONTINUE ILO = ILO + 1 IF( ILO.LE.N ) THEN IF( SELECT(ILO).NE.0 ) GO TO 523 END IF IHI = IHI + 1 527 CONTINUE IHI = IHI - 1 IF( IHI.GE.1 ) THEN IF( SELECT(IHI).EQ.0 ) GO TO 527 END IF * * End While ( ILO <= M ) GO TO 50 END IF * 300 CONTINUE * * In case an error occured, do an additional global update of * SELECT. * IF( INFO.NE.0 ) THEN DO 540 K = 1, N RSRC = INDXG2P( K, NB, MYROW, DESCT( RSRC_ ), NPROW ) CSRC = INDXG2P( K, NB, MYCOL, DESCT( CSRC_ ), NPCOL ) IF( MYROW.NE.RSRC .OR. MYCOL.NE.CSRC ) $ SELECT( K ) = 0 540 CONTINUE IF( NPROCS.GT.1 ) $ CALL IGSUM2D( ICTXT, 'All', TOP, N, 1, SELECT, N, -1, -1 ) END IF * 545 CONTINUE * * Store the output eigenvalues in WR and WI: first let all the * processes compute the eigenvalue inside their diagonal blocks in * parallel, except for the eigenvalue located next to a block * border. After that, compute all eigenvalues located next to the * block borders. Finally, do a global summation over WR and WI so * that all processors receive the result. Notice: real eigenvalues * extracted from a non-canonical 2-by-2 block are not stored in * any particular order. * DO 550 K = 1, N WR( K ) = ZERO WI( K ) = ZERO 550 CONTINUE * * Loop 560: extract eigenvalues from the blocks which are not laid * out across a border of the processor mesh, except for those 1x1 * blocks on the border. * PAIR = .FALSE. DO 560 K = 1, N IF( .NOT. PAIR ) THEN BORDER = ( K.NE.N .AND. MOD( K, NB ).EQ.0 ) .OR. % ( K.NE.1 .AND. MOD( K, NB ).EQ.1 ) IF( .NOT. BORDER ) THEN CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC1, TRSRC1, TCSRC1 ) IF( MYROW.EQ.TRSRC1 .AND. MYCOL.EQ.TCSRC1 ) THEN ELEM1 = T((JLOC1-1)*LLDT+ILOC1) IF( K.LT.N ) THEN ELEM3 = T((JLOC1-1)*LLDT+ILOC1+1) ELSE ELEM3 = ZERO END IF IF( ELEM3.NE.ZERO ) THEN ELEM2 = T((JLOC1)*LLDT+ILOC1) ELEM4 = T((JLOC1)*LLDT+ILOC1+1) CALL DLANV2( ELEM1, ELEM2, ELEM3, ELEM4, $ WR( K ), WI( K ), WR( K+1 ), WI( K+1 ), SN, $ CS ) PAIR = .TRUE. ELSE IF( K.GT.1 ) THEN TMP = T((JLOC1-2)*LLDT+ILOC1) IF( TMP.NE.ZERO ) THEN ELEM1 = T((JLOC1-2)*LLDT+ILOC1-1) ELEM2 = T((JLOC1-1)*LLDT+ILOC1-1) ELEM3 = T((JLOC1-2)*LLDT+ILOC1) ELEM4 = T((JLOC1-1)*LLDT+ILOC1) CALL DLANV2( ELEM1, ELEM2, ELEM3, ELEM4, $ WR( K-1 ), WI( K-1 ), WR( K ), $ WI( K ), SN, CS ) ELSE WR( K ) = ELEM1 END IF ELSE WR( K ) = ELEM1 END IF END IF END IF END IF ELSE PAIR = .FALSE. END IF 560 CONTINUE * * Loop 570: extract eigenvalues from the blocks which are laid * out across a border of the processor mesh. The processors are * numbered as below: * * 1 | 2 * --+-- * 3 | 4 * DO 570 K = NB, N-1, NB CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC1, TRSRC1, TCSRC1 ) CALL INFOG2L( K, K+1, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC2, JLOC2, TRSRC2, TCSRC2 ) CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC3, JLOC3, TRSRC3, TCSRC3 ) CALL INFOG2L( K+1, K+1, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC4, TRSRC4, TCSRC4 ) IF( MYROW.EQ.TRSRC2 .AND. MYCOL.EQ.TCSRC2 ) THEN ELEM2 = T((JLOC2-1)*LLDT+ILOC2) IF( TRSRC1.NE.TRSRC2 .OR. TCSRC1.NE.TCSRC2 ) $ CALL DGESD2D( ICTXT, 1, 1, ELEM2, 1, TRSRC1, TCSRC1 ) END IF IF( MYROW.EQ.TRSRC3 .AND. MYCOL.EQ.TCSRC3 ) THEN ELEM3 = T((JLOC3-1)*LLDT+ILOC3) IF( TRSRC1.NE.TRSRC3 .OR. TCSRC1.NE.TCSRC3 ) $ CALL DGESD2D( ICTXT, 1, 1, ELEM3, 1, TRSRC1, TCSRC1 ) END IF IF( MYROW.EQ.TRSRC4 .AND. MYCOL.EQ.TCSRC4 ) THEN WORK(1) = T((JLOC4-1)*LLDT+ILOC4) IF( K+1.LT.N ) THEN WORK(2) = T((JLOC4-1)*LLDT+ILOC4+1) ELSE WORK(2) = ZERO END IF IF( TRSRC1.NE.TRSRC4 .OR. TCSRC1.NE.TCSRC4 ) $ CALL DGESD2D( ICTXT, 2, 1, WORK, 2, TRSRC1, TCSRC1 ) END IF IF( MYROW.EQ.TRSRC1 .AND. MYCOL.EQ.TCSRC1 ) THEN ELEM1 = T((JLOC1-1)*LLDT+ILOC1) IF( TRSRC1.NE.TRSRC2 .OR. TCSRC1.NE.TCSRC2 ) $ CALL DGERV2D( ICTXT, 1, 1, ELEM2, 1, TRSRC2, TCSRC2 ) IF( TRSRC1.NE.TRSRC3 .OR. TCSRC1.NE.TCSRC3 ) $ CALL DGERV2D( ICTXT, 1, 1, ELEM3, 1, TRSRC3, TCSRC3 ) IF( TRSRC1.NE.TRSRC4 .OR. TCSRC1.NE.TCSRC4 ) $ CALL DGERV2D( ICTXT, 2, 1, WORK, 2, TRSRC4, TCSRC4 ) ELEM4 = WORK(1) ELEM5 = WORK(2) IF( ELEM5.EQ.ZERO ) THEN IF( WR( K ).EQ.ZERO .AND. WI( K ).EQ.ZERO ) THEN CALL DLANV2( ELEM1, ELEM2, ELEM3, ELEM4, WR( K ), $ WI( K ), WR( K+1 ), WI( K+1 ), SN, CS ) ELSEIF( WR( K+1 ).EQ.ZERO .AND. WI( K+1 ).EQ.ZERO ) THEN WR( K+1 ) = ELEM4 END IF ELSEIF( WR( K ).EQ.ZERO .AND. WI( K ).EQ.ZERO ) THEN WR( K ) = ELEM1 END IF END IF 570 CONTINUE * IF( NPROCS.GT.1 ) THEN CALL DGSUM2D( ICTXT, 'All', TOP, N, 1, WR, N, -1, -1 ) CALL DGSUM2D( ICTXT, 'All', TOP, N, 1, WI, N, -1, -1 ) END IF * * Store storage requirements in workspaces. * WORK( 1 ) = DBLE(LWMIN) IWORK( 1 ) = LIWMIN * * Return to calling program. * RETURN * * End of PDTRORD * END * scalapack-2.0.2/SRC/pdtrrfs.f000644 000766 000024 00000073630 10363532303 016202 0ustar00juliestaff000000 000000 SUBROUTINE PDTRRFS( UPLO, TRANS, DIAG, N, NRHS, A, IA, JA, DESCA, $ B, IB, JB, DESCB, X, IX, JX, DESCX, FERR, $ BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, TRANS, UPLO INTEGER INFO, IA, IB, IX, JA, JB, JX, LIWORK, LWORK, $ N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ), IWORK( * ) DOUBLE PRECISION A( * ), B( * ), BERR( * ), FERR( * ), $ WORK( * ), X( * ) * .. * * Purpose * ======= * * PDTRRFS provides error bounds and backward error estimates for the * solution to a system of linear equations with a triangular * coefficient matrix. * * The solution matrix X must be computed by PDTRTRS or some other * means before entering this routine. PDTRRFS does not do iterative * refinement because doing so cannot improve the backward error. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations. * = 'N': sub( A ) * sub( X ) = sub( B ) (No transpose) * = 'T': sub( A )**T * sub( X ) = sub( B ) (Transpose) * = 'C': sub( A )**T * sub( X ) = sub( B ) * (Conjugate transpose = Transpose) * * * DIAG (global input) CHARACTER*1 * = 'N': sub( A ) is non-unit triangular; * = 'U': sub( A ) is unit triangular. * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of local dimension (LLD_A,LOCc(JA+N-1) ). This * array contains the local pieces of the original triangular * distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. * If DIAG = 'U', the diagonal elements of sub( A ) are also * not referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) DOUBLE PRECISION pointer into the local memory * to an array of local dimension (LLD_B, LOCc(JB+NRHS-1) ). * On entry, this array contains the the local pieces of the * right hand sides sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input) DOUBLE PRECISION pointer into the local memory * to an array of local dimension (LLD_X, LOCc(JX+NRHS-1) ). * On entry, this array contains the the local pieces of the * solution vectors sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). The estimated forward error bounds for * each solution vector of sub( X ). If XTRUE is the true * solution, FERR bounds the magnitude of the largest entry * in (sub( X ) - XTRUE) divided by the magnitude of the * largest entry in sub( X ). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*LOCr( N + MOD( IA-1, MB_A ) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr( N + MOD( IB-1, MB_B ) ). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices sub( X ) and sub( B ) should be * distributed the same way on the same processes. These conditions * ensure that sub( X ) and sub( B ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER CHARACTER TRANST INTEGER IAROW, IXBCOL, IXBROW, IXCOL, IXROW, ICOFFA, $ ICOFFB, ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, $ IIW, IOFFXB, IPB, IPR, IPV, IROFFA, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LIWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ DOUBLE PRECISION EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMX2D, $ DGEBR2D, DGEBS2D, INFOG2L, PCHK1MAT, $ PCHK2MAT, PDATRMV, PDAXPY, PDCOPY, $ PDLACON, PDTRMV, PDTRSV, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 900+CTXT_ ) ELSE CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IX, JX, DESCX, 17, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) NOUNIT = LSAME( DIAG, 'N' ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 3*NPMOD WORK( 1 ) = DBLE( LWMIN ) LIWMIN = NPMOD IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 ELSE IF( NRHS.LT.0 ) THEN INFO = -5 ELSE IF( IROFFA.NE.0 ) THEN INFO = -7 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -8 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 900+NB_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -11 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1300+MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -15 ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -16 ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 1700+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 1700+NB_ ) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 1700+CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -21 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 2 ) = ICHAR( 'T' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'U' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 21 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 23 CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, 0, IDUM1, IDUM2, $ INFO ) CALL PCHK2MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, N, 4, NRHS, 5, $ IX, JX, DESCX, 17, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDTRRFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = IPB + NP0 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = maximum number of nonzero entries in each row of A, plus 1 * NZ = N + 1 EPS = PDLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) )*DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 90 K = 0, JBRHS - 1 * * Compute residual R = B - op(A) * X, * where op(A) = A or A', depending on TRANS. * CALL PDCOPY( N, X, IX, JX+K, DESCX, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PDTRMV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) CALL PDAXPY( N, -ONE, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, $ JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the matrix * or vector Z. If the i-th component of the denominator is less * than SAFE2, then SAFE1 is added to the i-th components of the * numerator and denominator before dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 20 II = IIXB, IIXB + NP - 1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 20 CONTINUE END IF END IF * CALL PDATRMV( UPLO, TRANS, DIAG, N, ONE, A, IA, JA, DESCA, X, $ IX, JX+K, DESCX, 1, ONE, WORK( IPB ), IW, JW, $ DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIW - 1, IIW + NP - 2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 30 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Bound error from formula * * norm(X - XTRUE) / norm(X) .le. FERR = * norm( abs(inv(op(A)))* * ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(A)) is the inverse of op(A) * abs(Z) is the componentwise absolute value of the matrix or * vector Z * NZ is the maximum number of nonzeros in any row of A, plus 1 * EPS is machine epsilon * * The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) * is incremented by SAFE1 if the i-th component of * abs(op(A))*abs(X) + abs(B) is less than SAFE2. * * Use PDLACON to estimate the infinity-norm of the matrix * inv(op(A)) * diag(W), * where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW - 1, IIW + NP - 2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 40 CONTINUE END IF END IF * KASE = 0 50 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PDLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(A)'). * CALL PDTRSV( UPLO, TRANST, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 60 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 60 CONTINUE END IF END IF ELSE * * Multiply by inv(op(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF CALL PDTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) END IF GO TO 50 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIXB, IIXB + NP - 1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 80 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 90 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 180 J = JN + 1, JB + NRHS - 1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 170 K = 0, JBRHS - 1 * * Compute residual R = B - op(A) * X, * where op(A) = A or A', depending on TRANS. * CALL PDCOPY( N, X, IX, J+K, DESCX, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PDTRMV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) CALL PDAXPY( N, -ONE, B, IB, J+K, DESCB, 1, WORK( IPR ), $ IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 100 II = IIXB, IIXB + NP - 1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 100 CONTINUE END IF END IF * CALL PDATRMV( UPLO, TRANS, DIAG, N, ONE, A, IA, JA, DESCA, $ X, IX, J+K, DESCX, 1, ONE, WORK( IPB ), IW, $ JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 110 II = IIW - 1, IIW + NP - 2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 110 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Bound error from formula * * norm(X - XTRUE) / norm(X) .le. FERR = * norm( abs(inv(op(A)))* * ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) )))/norm(X) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(A)) is the inverse of op(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of A, * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) * is incremented by SAFE1 if the i-th component of * abs(op(A))*abs(X) + abs(B) is less than SAFE2. * * Use PDLACON to estimate the infinity-norm of the matrix * inv(op(A)) * diag(W), * where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIW - 1, IIW + NP - 2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 120 CONTINUE END IF END IF * KASE = 0 130 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PDLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(A)'). * CALL PDTRSV( UPLO, TRANST, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 140 CONTINUE END IF END IF ELSE * * Multiply by inv(op(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 150 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 150 CONTINUE END IF END IF CALL PDTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) END IF GO TO 130 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIXB, IIXB + NP - 1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 160 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, $ IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 170 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PDTRRFS * END scalapack-2.0.2/SRC/pdtrsen.f000644 000766 000024 00000065376 11705457544 016223 0ustar00juliestaff000000 000000 SUBROUTINE PDTRSEN( JOB, COMPQ, SELECT, PARA, N, T, IT, JT, $ DESCT, Q, IQ, JQ, DESCQ, WR, WI, M, S, SEP, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK computational routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER COMPQ, JOB INTEGER INFO, LIWORK, LWORK, M, N, $ IT, JT, IQ, JQ DOUBLE PRECISION S, SEP * .. * .. Array Arguments .. LOGICAL SELECT( N ) INTEGER PARA( 6 ), DESCT( * ), DESCQ( * ), IWORK( * ) DOUBLE PRECISION Q( * ), T( * ), WI( * ), WORK( * ), WR( * ) * .. * * Purpose * ======= * * PDTRSEN reorders the real Schur factorization of a real matrix * A = Q*T*Q**T, so that a selected cluster of eigenvalues appears * in the leading diagonal blocks of the upper quasi-triangular matrix * T, and the leading columns of Q form an orthonormal basis of the * corresponding right invariant subspace. The reordering is performed * by PDTRORD. * * Optionally the routine computes the reciprocal condition numbers of * the cluster of eigenvalues and/or the invariant subspace. SCASY * library is needed for condition estimation. * * T must be in Schur form (as returned by PDLAHQR), that is, block * upper triangular with 1-by-1 and 2-by-2 diagonal blocks. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * JOB (global input) CHARACTER*1 * Specifies whether condition numbers are required for the * cluster of eigenvalues (S) or the invariant subspace (SEP): * = 'N': none; * = 'E': for eigenvalues only (S); * = 'V': for invariant subspace only (SEP); * = 'B': for both eigenvalues and invariant subspace (S and * SEP). * * COMPQ (global input) CHARACTER*1 * = 'V': update the matrix Q of Schur vectors; * = 'N': do not update Q. * * SELECT (global input) LOGICAL array, dimension (N) * SELECT specifies the eigenvalues in the selected cluster. To * select a real eigenvalue w(j), SELECT(j) must be set to * .TRUE.. To select a complex conjugate pair of eigenvalues * w(j) and w(j+1), corresponding to a 2-by-2 diagonal block, * either SELECT(j) or SELECT(j+1) or both must be set to * .TRUE.; a complex conjugate pair of eigenvalues must be * either both included in the cluster or both excluded. * * PARA (global input) INTEGER*6 * Block parameters (some should be replaced by calls to * PILAENV and others by meaningful default values): * PARA(1) = maximum number of concurrent computational windows * allowed in the algorithm; * 0 < PARA(1) <= min(NPROW,NPCOL) must hold; * PARA(2) = number of eigenvalues in each window; * 0 < PARA(2) < PARA(3) must hold; * PARA(3) = window size; PARA(2) < PARA(3) < DESCT(MB_) * must hold; * PARA(4) = minimal percentage of flops required for * performing matrix-matrix multiplications instead * of pipelined orthogonal transformations; * 0 <= PARA(4) <= 100 must hold; * PARA(5) = width of block column slabs for row-wise * application of pipelined orthogonal * transformations in their factorized form; * 0 < PARA(5) <= DESCT(MB_) must hold. * PARA(6) = the maximum number of eigenvalues moved together * over a process border; in practice, this will be * approximately half of the cross border window size * 0 < PARA(6) <= PARA(2) must hold; * * N (global input) INTEGER * The order of the globally distributed matrix T. N >= 0. * * T (local input/output) DOUBLE PRECISION array, * dimension (LLD_T,LOCc(N)). * On entry, the local pieces of the global distributed * upper quasi-triangular matrix T, in Schur form. On exit, T is * overwritten by the local pieces of the reordered matrix T, * again in Schur form, with the selected eigenvalues in the * globally leading diagonal blocks. * * IT (global input) INTEGER * JT (global input) INTEGER * The row and column index in the global array T indicating the * first column of sub( T ). IT = JT = 1 must hold. * * DESCT (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the global distributed matrix T. * * Q (local input/output) DOUBLE PRECISION array, * dimension (LLD_Q,LOCc(N)). * On entry, if COMPQ = 'V', the local pieces of the global * distributed matrix Q of Schur vectors. * On exit, if COMPQ = 'V', Q has been postmultiplied by the * global orthogonal transformation matrix which reorders T; the * leading M columns of Q form an orthonormal basis for the * specified invariant subspace. * If COMPQ = 'N', Q is not referenced. * * IQ (global input) INTEGER * JQ (global input) INTEGER * The column index in the global array Q indicating the * first column of sub( Q ). IQ = JQ = 1 must hold. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the global distributed matrix Q. * * WR (global output) DOUBLE PRECISION array, dimension (N) * WI (global output) DOUBLE PRECISION array, dimension (N) * The real and imaginary parts, respectively, of the reordered * eigenvalues of T. The eigenvalues are in principle stored in * the same order as on the diagonal of T, with WR(i) = T(i,i) * and, if T(i:i+1,i:i+1) is a 2-by-2 diagonal block, WI(i) > 0 * and WI(i+1) = -WI(i). * Note also that if a complex eigenvalue is sufficiently * ill-conditioned, then its value may differ significantly * from its value before reordering. * * M (global output) INTEGER * The dimension of the specified invariant subspace. * 0 <= M <= N. * * S (global output) DOUBLE PRECISION * If JOB = 'E' or 'B', S is a lower bound on the reciprocal * condition number for the selected cluster of eigenvalues. * S cannot underestimate the true reciprocal condition number * by more than a factor of sqrt(N). If M = 0 or N, S = 1. * If JOB = 'N' or 'V', S is not referenced. * * SEP (global output) DOUBLE PRECISION * If JOB = 'V' or 'B', SEP is the estimated reciprocal * condition number of the specified invariant subspace. If * M = 0 or N, SEP = norm(T). * If JOB = 'N' or 'E', SEP is not referenced. * * WORK (local workspace/output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued by PXERBLA. * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The dimension of the array IWORK. * * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value. * If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*1000+j), if the i-th * argument is a scalar and had an illegal value, then INFO = -i. * > 0: here we have several possibilites * *) Reordering of T failed because some eigenvalues are too * close to separate (the problem is very ill-conditioned); * T may have been partially reordered, and WR and WI * contain the eigenvalues in the same order as in T. * On exit, INFO = {the index of T where the swap failed}. * *) A 2-by-2 block to be reordered split into two 1-by-1 * blocks and the second block failed to swap with an * adjacent block. * On exit, INFO = {the index of T where the swap failed}. * *) If INFO = N+1, there is no valid BLACS context (see the * BLACS documentation for details). * *) If INFO = N+2, the routines used in the calculation of * the condition numbers raised a positive warning flag * (see the documentation for PGESYCTD and PSYCTCON of the * SCASY library). * *) If INFO = N+3, PGESYCTD raised an input error flag; * please report this bug to the authors (see below). * If INFO = N+4, PSYCTCON raised an input error flag; * please report this bug to the authors (see below). * In a future release this subroutine may distinguish between * the case 1 and 2 above. * * Method * ====== * * This routine performs parallel eigenvalue reordering in real Schur * form by parallelizing the approach proposed in [3]. The condition * number estimation part is performed by using techniques and code * from SCASY, see http://www.cs.umu.se/research/parallel/scasy. * * Additional requirements * ======================= * * The following alignment requirements must hold: * (a) DESCT( MB_ ) = DESCT( NB_ ) = DESCQ( MB_ ) = DESCQ( NB_ ) * (b) DESCT( RSRC_ ) = DESCQ( RSRC_ ) * (c) DESCT( CSRC_ ) = DESCQ( CSRC_ ) * * All matrices must be blocked by a block factor larger than or * equal to two (3). This to simplify reordering across processor * borders in the presence of 2-by-2 blocks. * * Limitations * =========== * * This algorithm cannot work on submatrices of T and Q, i.e., * IT = JT = IQ = JQ = 1 must hold. This is however no limitation * since PDLAHQR does not compute Schur forms of submatrices anyway. * * References * ========== * * [1] Z. Bai and J. W. Demmel; On swapping diagonal blocks in real * Schur form, Linear Algebra Appl., 186:73--95, 1993. Also as * LAPACK Working Note 54. * * [2] Z. Bai, J. W. Demmel, and A. McKenney; On computing condition * numbers for the nonsymmetric eigenvalue problem, ACM Trans. * Math. Software, 19(2):202--223, 1993. Also as LAPACK Working * Note 13. * * [3] D. Kressner; Block algorithms for reordering standard and * generalized Schur forms, ACM TOMS, 32(4):521-532, 2006. * Also LAPACK Working Note 171. * * [4] R. Granat, B. Kagstrom, and D. Kressner; Parallel eigenvalue * reordering in real Schur form, Concurrency and Computations: * Practice and Experience, 21(9):1225-1250, 2009. Also as * LAPACK Working Note 192. * * Parallel execution recommendations * ================================== * * Use a square grid, if possible, for maximum performance. The block * parameters in PARA should be kept well below the data distribution * block size. In particular, see [3,4] for recommended settings for * these parameters. * * In general, the parallel algorithm strives to perform as much work * as possible without crossing the block borders on the main block * diagonal. * * Contributors * ============ * * Implemented by Robert Granat, Dept. of Computing Science and HPC2N, * Umea University, Sweden, March 2007, * in collaboration with Bo Kagstrom and Daniel Kressner. * Modified by Meiyue Shao, October 2011. * * Revisions * ========= * * Please send bug-reports to granat@cs.umu.se * * Keywords * ======== * * Real Schur form, eigenvalue reordering, Sylvester matrix equation * * ===================================================================== * .. * .. Parameters .. CHARACTER TOP INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ DOUBLE PRECISION ZERO, ONE PARAMETER ( TOP = '1-Tree', $ BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, $ ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, WANTBH, WANTQ, WANTS, WANTSP INTEGER ICOFFT12, ICTXT, IDUM1, IDUM2, IERR, ILOC1, $ IPW1, ITER, ITT, JLOC1, JTT, K, LIWMIN, LLDT, $ LLDQ, LWMIN, MMAX, MMIN, MYROW, MYCOL, N1, N2, $ NB, NOEXSY, NPCOL, NPROCS, NPROW, SPACE, $ T12ROWS, T12COLS, TCOLS, TCSRC, TROWS, TRSRC, $ WRK1, IWRK1, WRK2, IWRK2, WRK3, IWRK3 DOUBLE PRECISION DPDUM1, ELEM, EST, SCALE, RNORM * .. Local Arrays .. INTEGER DESCT12( DLEN_ ), MBNB2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLANGE EXTERNAL LSAME, NUMROC, PDLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCINIT, $ IGAMX2D, INFOG2L, PDLACPY, PDTRORD, PXERBLA, $ PCHK1MAT, PCHK2MAT * $ , PGESYCTD, PSYCTCON * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCT( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL * * Test if grid is O.K., i.e., the context is valid * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = N+1 END IF * * Check if workspace * LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * Test dimensions for local sanity * IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, INFO ) END IF * * Check the blocking sizes for alignment requirements * IF( INFO.EQ.0 ) THEN IF( DESCT( MB_ ).NE.DESCT( NB_ ) ) INFO = -(1000*9 + MB_) END IF IF( INFO.EQ.0 ) THEN IF( DESCQ( MB_ ).NE.DESCQ( NB_ ) ) INFO = -(1000*13 + MB_) END IF IF( INFO.EQ.0 ) THEN IF( DESCT( MB_ ).NE.DESCQ( MB_ ) ) INFO = -(1000*9 + MB_) END IF * * Check the blocking sizes for minimum sizes * IF( INFO.EQ.0 ) THEN IF( N.NE.DESCT( MB_ ) .AND. DESCT( MB_ ).LT.3 ) $ INFO = -(1000*9 + MB_) IF( N.NE.DESCQ( MB_ ) .AND. DESCQ( MB_ ).LT.3 ) $ INFO = -(1000*13 + MB_) END IF * * Check parameters in PARA * NB = DESCT( MB_ ) IF( INFO.EQ.0 ) THEN IF( PARA(1).LT.1 .OR. PARA(1).GT.MIN(NPROW,NPCOL) ) $ INFO = -(1000 * 4 + 1) IF( PARA(2).LT.1 .OR. PARA(2).GE.PARA(3) ) $ INFO = -(1000 * 4 + 2) IF( PARA(3).LT.1 .OR. PARA(3).GT.NB ) $ INFO = -(1000 * 4 + 3) IF( PARA(4).LT.0 .OR. PARA(4).GT.100 ) $ INFO = -(1000 * 4 + 4) IF( PARA(5).LT.1 .OR. PARA(5).GT.NB ) $ INFO = -(1000 * 4 + 5) IF( PARA(6).LT.1 .OR. PARA(6).GT.PARA(2) ) $ INFO = -(1000 * 4 + 6) END IF * * Check requirements on IT, JT, IQ and JQ * IF( INFO.EQ.0 ) THEN IF( IT.NE.1 ) INFO = -7 IF( JT.NE.IT ) INFO = -8 IF( IQ.NE.1 ) INFO = -11 IF( JQ.NE.IQ ) INFO = -12 END IF * * Test input parameters for global sanity * IF( INFO.EQ.0 ) THEN CALL PCHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, 0, IDUM1, $ IDUM2, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL PCHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, 0, IDUM1, $ IDUM2, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL PCHK2MAT( N, 5, N, 5, IT, JT, DESCT, 9, N, 5, N, 5, $ IQ, JQ, DESCQ, 13, 0, IDUM1, IDUM2, INFO ) END IF * * Decode and test the input parameters * IF( INFO.EQ.0 .OR. LQUERY ) THEN WANTBH = LSAME( JOB, 'B' ) WANTS = LSAME( JOB, 'E' ) .OR. WANTBH WANTSP = LSAME( JOB, 'V' ) .OR. WANTBH WANTQ = LSAME( COMPQ, 'V' ) * IF( .NOT.LSAME( JOB, 'N' ) .AND. .NOT.WANTS .AND. .NOT.WANTSP ) $ THEN INFO = -1 ELSEIF( .NOT.LSAME( COMPQ, 'N' ) .AND. .NOT.WANTQ ) THEN INFO = -2 ELSEIF( N.LT.0 ) THEN INFO = -4 ELSE * * Extract local leading dimension * LLDT = DESCT( LLD_ ) LLDQ = DESCQ( LLD_ ) * * Check the SELECT vector for consistency and set M to the * dimension of the specified invariant subspace. * M = 0 DO 10 K = 1, N * * IWORK(1:N) is an integer copy of SELECT. * IF( SELECT(K) ) THEN IWORK(K) = 1 ELSE IWORK(K) = 0 END IF IF( K.LT.N ) THEN CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ITT, JTT, TRSRC, TCSRC ) IF( MYROW.EQ.TRSRC .AND. MYCOL.EQ.TCSRC ) THEN ELEM = T( (JTT-1)*LLDT + ITT ) IF( ELEM.NE.ZERO ) THEN IF( SELECT(K) .AND. .NOT.SELECT(K+1) ) THEN * INFO = -3 IWORK(K+1) = 1 ELSEIF( .NOT.SELECT(K) .AND. SELECT(K+1) ) THEN * INFO = -3 IWORK(K) = 1 END IF END IF END IF END IF IF( SELECT(K) ) M = M + 1 10 CONTINUE MMAX = M MMIN = M IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, MMAX, 1, -1, $ -1, -1, -1, -1 ) IF( NPROCS.GT.1 ) $ CALL IGAMN2D( ICTXT, 'All', TOP, 1, 1, MMIN, 1, -1, $ -1, -1, -1, -1 ) IF( MMAX.GT.MMIN ) THEN M = MMAX IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, N, 1, IWORK, N, $ -1, -1, -1, -1, -1 ) END IF * * Set parameters for deep pipelining in parallel * Sylvester solver. * MBNB2( 1 ) = MIN( MAX( PARA( 3 ), PARA( 2 )*2 ), NB ) MBNB2( 2 ) = MBNB2( 1 ) * * Compute needed workspace * N1 = M N2 = N - M IF( WANTS ) THEN c CALL PGESYCTD( 'Solve', 'Schur', 'Schur', 'Notranspose', c $ 'Notranspose', -1, 'Demand', N1, N2, T, 1, 1, DESCT, c $ T, N1+1, N1+1, DESCT, T, 1, N1+1, DESCT, MBNB2, c $ WORK, -1, IWORK(N+1), -1, NOEXSY, SCALE, IERR ) WRK1 = INT(WORK(1)) IWRK1 = IWORK(N+1) WRK1 = 0 IWRK1 = 0 ELSE WRK1 = 0 IWRK1 = 0 END IF * IF( WANTSP ) THEN c CALL PSYCTCON( 'Notranspose', 'Notranspose', -1, c $ 'Demand', N1, N2, T, 1, 1, DESCT, T, N1+1, N1+1, c $ DESCT, MBNB2, WORK, -1, IWORK(N+1), -1, EST, ITER, c $ IERR ) WRK2 = INT(WORK(1)) IWRK2 = IWORK(N+1) WRK2 = 0 IWRK2 = 0 ELSE WRK2 = 0 IWRK2 = 0 END IF * TROWS = NUMROC( N, NB, MYROW, DESCT(RSRC_), NPROW ) TCOLS = NUMROC( N, NB, MYCOL, DESCT(CSRC_), NPCOL ) WRK3 = N + 7*NB**2 + 2*TROWS*PARA( 3 ) + TCOLS*PARA( 3 ) + $ MAX( TROWS*PARA( 3 ), TCOLS*PARA( 3 ) ) IWRK3 = 5*PARA( 1 ) + PARA(2)*PARA(3) - $ PARA(2) * (PARA(2) + 1 ) / 2 * IF( WANTSP ) THEN LWMIN = MAX( 1, MAX( WRK2, WRK3) ) LIWMIN = MAX( 1, MAX( IWRK2, IWRK3 ) )+N ELSE IF( LSAME( JOB, 'N' ) ) THEN LWMIN = MAX( 1, WRK3 ) LIWMIN = IWRK3+N ELSE IF( LSAME( JOB, 'E' ) ) THEN LWMIN = MAX( 1, MAX( WRK1, WRK3) ) LIWMIN = MAX( 1, MAX( IWRK1, IWRK3 ) )+N END IF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -20 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -22 END IF END IF END IF * * Global maximum on info * IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, -1, -1, -1, $ -1, -1 ) * * Return if some argument is incorrect * IF( INFO.NE.0 .AND. .NOT.LQUERY ) THEN M = 0 S = ONE SEP = ZERO CALL PXERBLA( ICTXT, 'PDTRSEN', -INFO ) RETURN ELSEIF( LQUERY ) THEN WORK( 1 ) = DBLE(LWMIN) IWORK( 1 ) = LIWMIN RETURN END IF * * Quick return if possible. * IF( M.EQ.N .OR. M.EQ.0 ) THEN IF( WANTS ) $ S = ONE IF( WANTSP ) $ SEP = PDLANGE( '1', N, N, T, IT, JT, DESCT, WORK ) GO TO 50 END IF * * Reorder the eigenvalues. * CALL PDTRORD( COMPQ, IWORK, PARA, N, T, IT, JT, $ DESCT, Q, IQ, JQ, DESCQ, WR, WI, M, WORK, LWORK, $ IWORK(N+1), LIWORK-N, INFO ) * IF( WANTS ) THEN * * Solve Sylvester equation T11*R - R*T2 = scale*T12 for R in * parallel. * * Copy T12 to workspace. * CALL INFOG2L( 1, N1+1, DESCT, NPROW, NPCOL, MYROW, $ MYCOL, ILOC1, JLOC1, TRSRC, TCSRC ) ICOFFT12 = MOD( N1, NB ) T12ROWS = NUMROC( N1, NB, MYROW, TRSRC, NPROW ) T12COLS = NUMROC( N2+ICOFFT12, NB, MYCOL, TCSRC, NPCOL ) CALL DESCINIT( DESCT12, N1, N2+ICOFFT12, NB, NB, TRSRC, $ TCSRC, ICTXT, MAX(1,T12ROWS), IERR ) CALL PDLACPY( 'All', N1, N2, T, 1, N1+1, DESCT, WORK, $ 1, 1+ICOFFT12, DESCT12 ) * * Solve the equation to get the solution in workspace. * SPACE = DESCT12( LLD_ ) * T12COLS IPW1 = 1 + SPACE c CALL PGESYCTD( 'Solve', 'Schur', 'Schur', 'Notranspose', c $ 'Notranspose', -1, 'Demand', N1, N2, T, 1, 1, DESCT, T, c $ N1+1, N1+1, DESCT, WORK, 1, 1+ICOFFT12, DESCT12, MBNB2, c $ WORK(IPW1), LWORK-SPACE+1, IWORK(N+1), LIWORK-N, NOEXSY, c $ SCALE, IERR ) IF( IERR.LT.0 ) THEN INFO = N+3 ELSE INFO = N+2 END IF * * Estimate the reciprocal of the condition number of the cluster * of eigenvalues. * RNORM = PDLANGE( 'Frobenius', N1, N2, WORK, 1, 1+ICOFFT12, $ DESCT12, DPDUM1 ) IF( RNORM.EQ.ZERO ) THEN S = ONE ELSE S = SCALE / ( SQRT( SCALE*SCALE / RNORM+RNORM )* $ SQRT( RNORM ) ) END IF END IF * IF( WANTSP ) THEN * * Estimate sep(T11,T21) in parallel. * c CALL PSYCTCON( 'Notranspose', 'Notranspose', -1, 'Demand', N1, c $ N2, T, 1, 1, DESCT, T, N1+1, N1+1, DESCT, MBNB2, WORK, c $ LWORK, IWORK(N+1), LIWORK-N, EST, ITER, IERR ) EST = EST * SQRT(DBLE(N1*N2)) SEP = ONE / EST IF( IERR.LT.0 ) THEN INFO = N+4 ELSE INFO = N+2 END IF END IF * * Return to calling program. * 50 CONTINUE * RETURN * * End of PDTRSEN * END * scalapack-2.0.2/SRC/pdtrti2.f000644 000766 000024 00000023664 10363532303 016110 0ustar00juliestaff000000 000000 SUBROUTINE PDTRTI2( UPLO, DIAG, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDTRTI2 computes the inverse of a real upper or lower triangular * block matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). This matrix should be * contained in one and only one process memory space (local operation). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * DIAG (global input) CHARACTER*1 * = 'N': sub( A ) is non-unit triangular * = 'U': sub( A ) is unit triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)), * this array contains the local pieces of the triangular matrix * sub( A ). If UPLO = 'U', the leading N-by-N upper triangular * part of the matrix sub( A ) contains the upper triangular * matrix, and the strictly lower triangular part of sub( A ) * is not referenced. If UPLO = 'L', the leading N-by-N lower * triangular part of the matrix sub( A ) contains the lower * triangular matrix, and the strictly upper triangular part * of sub( A ) is not referenced. If DIAG = 'U', the diagonal * elements of sub( A ) are also not referenced and are assumed * to be 1. On exit, the (triangular) inverse of the original * matrix, in the same storage format. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER IACOL, IAROW, ICTXT, ICURR, IDIAG, IIA, IOFFA, $ JJA, LDA, MYCOL, MYROW, NA, NPCOL, NPROW DOUBLE PRECISION AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DSCAL, $ DTRMV, INFOG2L, PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDTRTI2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Compute local indexes * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN * LDA = DESCA( LLD_ ) * IF( UPPER ) THEN * IOFFA = IIA + ( JJA - 1 ) * LDA ICURR = IOFFA + LDA * IF( NOUNIT ) THEN * * Compute inverse of upper non-unit triangular matrix. * A( IOFFA ) = ONE / A( IOFFA ) IDIAG = ICURR + 1 DO 10 NA = 1, N-1 A( IDIAG ) = ONE / A( IDIAG ) AJJ = -A( IDIAG ) * * Compute elements 1:j-1 of j-th column. * CALL DTRMV( 'Upper', 'No transpose', DIAG, NA, $ A( IOFFA ), LDA, A( ICURR ), 1 ) CALL DSCAL( NA, AJJ, A( ICURR ), 1 ) IDIAG = IDIAG + LDA + 1 ICURR = ICURR + LDA 10 CONTINUE * ELSE * * Compute inverse of upper unit triangular matrix. * DO 20 NA = 1, N-1 * * Compute elements 1:j-1 of j-th column. * CALL DTRMV( 'Upper', 'No transpose', DIAG, NA, $ A( IOFFA ), LDA, A( ICURR ), 1 ) CALL DSCAL( NA, -ONE, A( ICURR ), 1 ) ICURR = ICURR + LDA 20 CONTINUE * END IF * ELSE * ICURR = IIA + N - 1 + ( JJA + N - 2 ) * LDA IOFFA = ICURR - LDA * IF( NOUNIT ) THEN * * Compute inverse of lower non-unit triangular matrix. * A( ICURR ) = ONE / A( ICURR ) IDIAG = IOFFA - 1 DO 30 NA = 1, N-1 A( IDIAG ) = ONE / A( IDIAG ) AJJ = -A( IDIAG ) * * Compute elements j+1:n of j-th column. * CALL DTRMV( 'Lower', 'No transpose', DIAG, NA, $ A( ICURR ), LDA, A( IOFFA ), 1 ) CALL DSCAL( NA, AJJ, A( IOFFA ), 1 ) ICURR = IDIAG IDIAG = IDIAG - LDA - 1 IOFFA = IDIAG + 1 30 CONTINUE * ELSE * * Compute inverse of lower unit triangular matrix. * DO 40 NA = 1, N-1 * * Compute elements j+1:n of j-th column. * CALL DTRMV( 'Lower', 'No transpose', DIAG, NA, $ A( ICURR ), LDA, A( IOFFA ), 1 ) CALL DSCAL( NA, -ONE, A( IOFFA ), 1 ) ICURR = ICURR - LDA - 1 IOFFA = ICURR - LDA 40 CONTINUE * END IF * END IF * END IF * * End of PDTRTI2 * END scalapack-2.0.2/SRC/pdtrtri.f000644 000766 000024 00000030712 10363532303 016200 0ustar00juliestaff000000 000000 SUBROUTINE PDTRTRI( UPLO, DIAG, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDTRTRI computes the inverse of a upper or lower triangular * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the distributed matrix sub( A ) is upper * or lower triangular: * = 'U': Upper triangular, * = 'L': Lower triangular. * * DIAG (global input) CHARACTER * Specifies whether or not the distributed matrix sub( A ) * is unit triangular: * = 'N': Non-unit triangular, * = 'U': Unit triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * triangular matrix sub( A ). If UPLO = 'U', the leading * N-by-N upper triangular part of the matrix sub( A ) contains * the upper triangular matrix to be inverted, and the strictly * lower triangular part of sub( A ) is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * the matrix sub( A ) contains the lower triangular matrix, * and the strictly upper triangular part of sub( A ) is not * referenced. * On exit, the (triangular) inverse of the original matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, A(IA+K-1,JA+K-1) is exactly zero. The * triangular matrix sub( A ) is singular and its * inverse can not be computed. * * ==================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER I, ICOFF, ICTXT, IROFF, ICURCOL, ICURROW, $ IDUMMY, II, IOFFA, J, JB, JJ, JN, LDA, MYCOL, $ MYROW, NN, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMX2D, INFOG2L, $ PCHK1MAT, PDTRTI2, PDTRMM, PDTRSM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) * CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOUNIT ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 * CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDTRTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) IF( NOUNIT ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, ICURROW, ICURCOL ) * * Handle first block separately * JB = JN-JA+1 LDA = DESCA( LLD_ ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*LDA DO 10 I = 0, JB-1 IF( A( IOFFA ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I + 1 IOFFA = IOFFA + LDA + 1 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*LDA DO 20 I = 0, JB-1 IF( A( IOFFA ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = J + I - JA + 1 IOFFA = IOFFA + LDA + 1 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUMMY, $ IDUMMY, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN END IF * * Use blocked code * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix * JB = JN-JA+1 * * Handle first block of column separately * CALL PDTRTI2( UPLO, DIAG, JB, A, IA, JA, DESCA, INFO ) * * Loop over remaining block of columns * DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * * Compute rows 1:j-1 of current block column * CALL PDTRMM( 'Left', UPLO, 'No transpose', DIAG, J-JA, JB, $ ONE, A, IA, JA, DESCA, A, IA, J, DESCA ) CALL PDTRSM( 'Right', UPLO, 'No transpose', DIAG, J-JA, $ JB, -ONE, A, I, J, DESCA, A, IA, J, DESCA ) * * Compute inverse of current diagonal block * CALL PDTRTI2( UPLO, DIAG, JB, A, I, J, DESCA, INFO ) * 40 CONTINUE * ELSE * * Compute inverse of lower triangular matrix * NN = ( ( JA+N-2 ) / DESCA( NB_ ) )*DESCA( NB_ ) + 1 DO 50 J = NN, JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA IF( J+JB.LE.JA+N-1 ) THEN * * Compute rows j+jb:ja+n-1 of current block column * CALL PDTRMM( 'Left', UPLO, 'No transpose', DIAG, $ JA+N-J-JB, JB, ONE, A, I+JB, J+JB, DESCA, $ A, I+JB, J, DESCA ) CALL PDTRSM( 'Right', UPLO, 'No transpose', DIAG, $ JA+N-J-JB, JB, -ONE, A, I, J, DESCA, $ A, I+JB, J, DESCA ) END IF * * Compute inverse of current diagonal block * CALL PDTRTI2( UPLO, DIAG, JB, A, I, J, DESCA, INFO ) * 50 CONTINUE * * Handle the last block of columns separately * JB = JN-JA+1 IF( JA+JB.LE.JA+N-1 ) THEN * * Compute rows ja+jb:ja+n-1 of current block column * CALL PDTRMM( 'Left', UPLO, 'No transpose', DIAG, N-JB, JB, $ ONE, A, IA+JB, JA+JB, DESCA, A, IA+JB, JA, $ DESCA ) CALL PDTRSM( 'Right', UPLO, 'No transpose', DIAG, N-JB, JB, $ -ONE, A, IA, JA, DESCA, A, IA+JB, JA, DESCA ) END IF * * Compute inverse of current diagonal block * CALL PDTRTI2( UPLO, DIAG, JB, A, IA, JA, DESCA, INFO ) * END IF * RETURN * * End PDTRTRI * END scalapack-2.0.2/SRC/pdtrtrs.f000644 000766 000024 00000031353 10363532303 016214 0ustar00juliestaff000000 000000 SUBROUTINE PDTRTRS( UPLO, TRANS, DIAG, N, NRHS, A, IA, JA, DESCA, $ B, IB, JB, DESCB, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, TRANS, UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) DOUBLE PRECISION A( * ), B( * ) * .. * * Purpose * ======= * * PDTRTRS solves a triangular system of the form * * sub( A ) * X = sub( B ) or sub( A )**T * X = sub( B ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is a triangular * distributed matrix of order N, and B(IB:IB+N-1,JB:JB+NRHS-1) is an * N-by-NRHS distributed matrix denoted by sub( B ). A check is made * to verify that sub( A ) is nonsingular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': Solve sub( A ) * X = sub( B ) (No transpose) * = 'T': Solve sub( A )**T * X = sub( B ) (Transpose) * = 'C': Solve sub( A )**T * X = sub( B ) (Transpose) * * DIAG (global input) CHARACTER * = 'N': sub( A ) is non-unit triangular; * = 'U': sub( A ) is unit triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed matrix sub( B ). NRHS >= 0. * * A (local input) DOUBLE PRECISION pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1) ). This array * contains the local pieces of the distributed triangular * matrix sub( A ). If UPLO = 'U', the leading N-by-N upper * triangular part of sub( A ) contains the upper triangular * matrix, and the strictly lower triangular part of sub( A ) * is not referenced. If UPLO = 'L', the leading N-by-N lower * triangular part of sub( A ) contains the lower triangular * matrix, and the strictly upper triangular part of sub( A ) * is not referenced. If DIAG = 'U', the diagonal elements of * sub( A ) are also not referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the right hand side distributed matrix * sub( B ). On exit, if INFO = 0, sub( B ) is overwritten by * the solution matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, the i-th diagonal element of sub( A ) is * zero, indicating that the submatrix is singular and the * solutions X have not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN, NOUNIT, UPPER INTEGER I, IAROW, IBROW, ICOFFA, ICTXT, ICURCOL, $ ICURROW, IROFFA, IROFFB, IDUM, II, IOFFA, J, $ JBLK, JJ, JN, LDA, LL, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMX2D, INFOG2L, $ PCHK2MAT, PDTRSM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL ICEIL, INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -907 ELSE UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) NOTRAN = LSAME( TRANS, 'N' ) * CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA .OR. IROFFA.NE.0 ) THEN INFO = -8 ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IBROW ) THEN INFO = -11 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -904 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -1304 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 2 ) = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'D' ) END IF IDUM2( 3 ) = 3 CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 9, N, 4, NRHS, 5, $ IB, JB, DESCB, 13, 3, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDTRTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * IF( NOUNIT ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, ICURROW, ICURCOL ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) IOFFA = II + ( JJ - 1 ) * LDA * * Handle first block separately * JBLK = JN-JA+1 IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN LL = IOFFA DO 10 I = 0, JBLK-1 IF( A( LL ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I + 1 LL = IOFFA + LDA + 1 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JBLK IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JBLK*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JBLK = MIN( JA+N-J, DESCA( NB_ ) ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN LL = IOFFA DO 20 I = 0, JBLK-1 IF( A( LL ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = J + I - JA + 1 LL = IOFFA + LDA + 1 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JBLK IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JBLK*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUM, IDUM, $ -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN END IF * * Solve A * x = b or A' * x = b. * CALL PDTRSM( 'Left', UPLO, TRANS, DIAG, N, NRHS, ONE, A, IA, JA, $ DESCA, B, IB, JB, DESCB ) * RETURN * * End of PDTRTRS * END scalapack-2.0.2/SRC/pdtzrzf.f000644 000766 000024 00000031042 10363532303 016210 0ustar00juliestaff000000 000000 SUBROUTINE PDTZRZF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PDTZRZF reduces the M-by-N ( M<=N ) real upper trapezoidal matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper triangular form by means * of orthogonal transformations. * * The upper trapezoidal matrix sub( A ) is factored as * * sub( A ) = ( R 0 ) * Z, * * where Z is an N-by-N orthogonal matrix and R is an M-by-M upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) DOUBLE PRECISION pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the leading M-by-M * upper triangular part of sub( A ) contains the upper trian- * gular matrix R, and elements M+1 to N of the first M rows of * sub( A ), with the array TAU, represent the orthogonal matrix * Z as a product of M elementary reflectors. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) DOUBLE PRECISION array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The factorization is obtained by Householder's method. The kth * transformation matrix, Z( k ), which is used to introduce zeros into * the (m - k + 1)th row of sub( A ), is given in the form * * Z( k ) = ( I 0 ), * ( 0 T( k ) ) * * where * * T( k ) = I - tau*u( k )*u( k )', u( k ) = ( 1 ), * ( 0 ) * ( z( k ) ) * * tau is a scalar and z( k ) is an ( n - m ) element vector. * tau and z( k ) are chosen to annihilate the elements of the kth row * of sub( A ). * * The scalar tau is returned in the kth element of TAU and the vector * u( k ) in the kth row of sub( A ), such that the elements of z( k ) * are in a( k, m + 1 ), ..., a( k, n ). The elements of R are returned * in the upper triangular part of sub( A ). * * Z is given by * * Z = Z( 1 ) * Z( 2 ) * ... * Z( m ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IIA, IL, IN, IPW, $ IROFFA, J, JM1, L, LWMIN, MP0, MYCOL, MYROW, $ NPCOL, NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, INFOG1L, PCHK1MAT, $ PDLATRZ, PDLARZB, PDLARZT, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = DBLE( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -9 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDTZRZF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( M.EQ.N ) THEN * CALL INFOG1L( IA, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ IIA, IAROW ) IF( MYROW.EQ.IAROW ) $ MP0 = MP0 - IROFFA DO 10 I = IIA, IIA+MP0-1 TAU( I ) = ZERO 10 CONTINUE * ELSE * L = N-M JM1 = JA + MIN( M+1, N ) - 1 IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IL = MAX( ( (IA+M-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * * Use blocked code initially * DO 20 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) J = JA + I - IA * * Compute the complete orthogonal factorization of the current * block A(i:i+ib-1,j:ja+n-1) * CALL PDLATRZ( IB, JA+N-J, L, A, I, J, DESCA, TAU, WORK ) * IF( I.GT.IA ) THEN * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PDLARZT( 'Backward', 'Rowwise', L, IB, A, I, JM1, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:i-1,j:ja+n-1) from the right * CALL PDLARZB( 'Right', 'No transpose', 'Backward', $ 'Rowwise', I-IA, JA+N-J, IB, L, A, I, JM1, $ DESCA, WORK, A, IA, J, DESCA, WORK( IPW ) ) END IF * 20 CONTINUE * * Use unblocked code to factor the last or only block * CALL PDLATRZ( IN-IA+1, N, N-M, A, IA, JA, DESCA, TAU, WORK ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = DBLE( LWMIN ) * RETURN * * End of PDTZRZF * END scalapack-2.0.2/SRC/pdzsum1.f000644 000766 000024 00000021104 10363532303 016106 0ustar00juliestaff000000 000000 SUBROUTINE PDZSUM1( N, ASUM, X, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N DOUBLE PRECISION ASUM * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX*16 X( * ) * .. * * Purpose * ======= * * PDZSUM1 returns the sum of absolute values of a complex * distributed vector sub( X ) in ASUM, * * where sub( X ) denotes X(IX:IX+N-1,JX:JX), if INCX = 1, * X(IX:IX,JX:JX+N-1), if INCX = M_X. * * Based on PDZASUM from the Level 1 PBLAS. The change is * to use the 'genuine' absolute value. * * The serial version of this routine was originally contributed by * Nick Higham for use with ZLACON. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * When the result of a vector-oriented PBLAS call is a scalar, it will * be made available only within the scope which owns the vector(s) * being operated on. Let X be a generic term for the input vector(s). * Then, the processes which receive the answer will be (note that if * an operation involves more than one vector, the processes which re- * ceive the result will be the union of the following calculation for * each vector): * * If N = 1, M_X = 1 and INCX = 1, then one can't determine if a process * row or process column owns the vector operand, therefore only the * process of coordinate {RSRC_X, CSRC_X} receives the result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process part of this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process part of this column receives the result; * * Parameters * ========== * * N (global input) pointer to INTEGER * The number of components of the distributed vector sub( X ). * N >= 0. * * ASUM (local output) pointer to DOUBLE PRECISION * The sum of absolute values of the distributed vector sub( X ) * only in its scope. * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER CCTOP, RCTOP INTEGER ICOFF, ICTXT, IIX, IROFF, IXCOL, IXROW, JJX, $ LDX, MYCOL, MYROW, NP, NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGSUM2D, INFOG2L, PB_TOPGET * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION DZSUM1 EXTERNAL DZSUM1, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * ASUM = ZERO IF( N.LE.0 ) $ RETURN * LDX = DESCX( LLD_ ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) * IF( INCX.EQ.1 .AND. DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IXROW .AND. MYCOL.EQ.IXCOL ) THEN ASUM = ABS( X( IIX+(JJX-1)*LDX ) ) END IF RETURN END IF * IF( INCX.EQ.DESCX( M_ ) ) THEN * * X is distributed over a process row * IF( MYROW.EQ.IXROW ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', RCTOP ) ICOFF = MOD( JX-1, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ-ICOFF ASUM = DZSUM1( NQ, X( IIX+(JJX-1)*LDX ), LDX ) CALL DGSUM2D( ICTXT, 'Rowwise', RCTOP, 1, 1, ASUM, 1, $ -1, MYCOL ) END IF * ELSE * * X is distributed over a process column * IF( MYCOL.EQ.IXCOL ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', CCTOP ) IROFF = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP-IROFF ASUM = DZSUM1( NP, X( IIX+(JJX-1)*LDX ), 1 ) CALL DGSUM2D( ICTXT, 'Columnwise', CCTOP, 1, 1, ASUM, 1, $ -1, MYCOL ) END IF * END IF * RETURN * * End of PDZSUM1 * END scalapack-2.0.2/SRC/pilaenvx.f000644 000766 000024 00000046315 11705175572 016360 0ustar00juliestaff000000 000000 INTEGER FUNCTION PILAENVX( ICTXT, ISPEC, NAME, OPTS, N1, N2, N3, $ N4 ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER*( * ) NAME, OPTS INTEGER ICTXT, ISPEC, N1, N2, N3, N4 * .. * * Purpose * ======= * * PILAENVX is called from the ScaLAPACK routines to choose problem- * dependent parameters for the local environment. See ISPEC for a * description of the parameters. * * This version provides a set of parameters which should give good, * but not optimal, performance on many of the currently available * computers. Users are encouraged to modify this subroutine to set * the tuning parameters for their particular machine using the option * and problem size information in the arguments. * * This routine will not function correctly if it is converted to all * lower case. Converting it to all upper case is allowed. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * ISPEC (global input) INTEGER * Specifies the parameter to be returned as the value of * PILAENVX. * = 1: the optimal blocksize; if this value is 1, an unblocked * algorithm will give the best performance (unlikely). * = 2: the minimum block size for which the block routine * should be used; if the usable block size is less than * this value, an unblocked routine should be used. * = 3: the crossover point (in a block routine, for N less * than this value, an unblocked routine should be used) * = 4: the number of shifts, used in the nonsymmetric * eigenvalue routines (DEPRECATED) * = 5: the minimum column dimension for blocking to be used; * rectangular blocks must have dimension at least k by m, * where k is given by PILAENVX(2,...) and m by PILAENVX(5,...) * = 6: the crossover point for the SVD (when reducing an m by n * matrix to bidiagonal form, if max(m,n)/min(m,n) exceeds * this value, a QR factorization is used first to reduce * the matrix to a triangular form.) * = 7: the number of processors * = 8: the crossover point for the multishift QR method * for nonsymmetric eigenvalue problems (DEPRECATED) * = 9: maximum size of the subproblems at the bottom of the * computation tree in the divide-and-conquer algorithm * (used by xGELSD and xGESDD) * =10: ieee NaN arithmetic can be trusted not to trap * =11: infinity arithmetic can be trusted not to trap * 12 <= ISPEC <= 16: * PxHSEQR or one of its subroutines, * see PIPARMQ for detailed explanation * 17 <= ISPEC <= 22: * Parameters for PBxTRORD/PxHSEQR (not all), as follows: * =17: maximum number of concurrent computational windows; * =18: number of eigenvalues/bulges in each window; * =19: computational window size; * =20: minimal percentage of flops required for * performing matrix-matrix multiplications instead * of pipelined orthogonal transformations; * =21: width of block column slabs for row-wise * application of pipelined orthogonal * transformations in their factorized form; * =22: the maximum number of eigenvalues moved together * over a process border; * =23: the number of processors involved in AED; * =99: Maximum iteration chunksize in OpenMP parallelization * * NAME (global input) CHARACTER*(*) * The name of the calling subroutine, in either upper case or * lower case. * * OPTS (global input) CHARACTER*(*) * The character options to the subroutine NAME, concatenated * into a single character string. For example, UPLO = 'U', * TRANS = 'T', and DIAG = 'N' for a triangular routine would * be specified as OPTS = 'UTN'. * * N1 (global input) INTEGER * N2 (global input) INTEGER * N3 (global input) INTEGER * N4 (global input) INTEGER * Problem dimensions for the subroutine NAME; these may not all * be required. * * (PILAENVX) (global output) INTEGER * >= 0: the value of the parameter specified by ISPEC * < 0: if PILAENVX = -k, the k-th argument had an illegal value. * * Further Details * =============== * * The following conventions have been used when calling ILAENV from the * LAPACK routines: * 1) OPTS is a concatenation of all of the character options to * subroutine NAME, in the same order that they appear in the * argument list for NAME, even if they are not used in determining * the value of the parameter specified by ISPEC. * 2) The problem dimensions N1, N2, N3, N4 are specified in the order * that they appear in the argument list for NAME. N1 is used * first, N2 second, and so on, and unused problem dimensions are * passed a value of -1. * 3) The parameter value returned by ILAENV is checked for validity in * the calling subroutine. For example, ILAENV is used to retrieve * the optimal blocksize for STRTRI as follows: * * NB = ILAENV( 1, 'STRTRI', UPLO // DIAG, N, -1, -1, -1 ) * IF( NB.LE.1 ) NB = MAX( 1, N ) * * The same conventions will hold for this ScaLAPACK-style variant. * * ===================================================================== * * .. Local Scalars .. INTEGER I, IC, IZ, NB, NBMIN, NX, NPROW, NPCOL, MYROW, $ MYCOL LOGICAL CNAME, SNAME CHARACTER C1*1, C2*2, C4*2, C3*3, SUBNAM*6 * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, INT, MIN, REAL * .. * .. External Functions .. INTEGER IEEECK, PIPARMQ, ICEIL EXTERNAL IEEECK, PIPARMQ, ICEIL * .. * .. Executable Statements .. * IF( ISPEC.GT.23 ) GO TO 990 GO TO ( 10, 10, 10, 80, 90, 100, 110, 120, $ 130, 140, 150, 160, 160, 160, 160, 160, $ 170, 180, 190, 200, 210, 220, 230, 160)ISPEC * * Invalid value for ISPEC * PILAENVX = -1 RETURN * 10 CONTINUE * * Convert NAME to upper case if the first character is lower case. * PILAENVX = 1 SUBNAM = NAME IC = ICHAR( SUBNAM( 1: 1 ) ) IZ = ICHAR( 'Z' ) IF( IZ.EQ.90 .OR. IZ.EQ.122 ) THEN * * ASCII character set * IF( IC.GE.97 .AND. IC.LE.122 ) THEN SUBNAM( 1: 1 ) = CHAR( IC-32 ) DO 20 I = 2, 6 IC = ICHAR( SUBNAM( I: I ) ) IF( IC.GE.97 .AND. IC.LE.122 ) $ SUBNAM( I: I ) = CHAR( IC-32 ) 20 CONTINUE END IF * ELSE IF( IZ.EQ.233 .OR. IZ.EQ.169 ) THEN * * EBCDIC character set * IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR. $ ( IC.GE.145 .AND. IC.LE.153 ) .OR. $ ( IC.GE.162 .AND. IC.LE.169 ) ) THEN SUBNAM( 1: 1 ) = CHAR( IC+64 ) DO 30 I = 2, 6 IC = ICHAR( SUBNAM( I: I ) ) IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR. $ ( IC.GE.145 .AND. IC.LE.153 ) .OR. $ ( IC.GE.162 .AND. IC.LE.169 ) )SUBNAM( I: $ I ) = CHAR( IC+64 ) 30 CONTINUE END IF * ELSE IF( IZ.EQ.218 .OR. IZ.EQ.250 ) THEN * * Prime machines: ASCII+128 * IF( IC.GE.225 .AND. IC.LE.250 ) THEN SUBNAM( 1: 1 ) = CHAR( IC-32 ) DO 40 I = 2, 6 IC = ICHAR( SUBNAM( I: I ) ) IF( IC.GE.225 .AND. IC.LE.250 ) $ SUBNAM( I: I ) = CHAR( IC-32 ) 40 CONTINUE END IF END IF * C1 = SUBNAM( 1: 1 ) SNAME = C1.EQ.'S' .OR. C1.EQ.'D' CNAME = C1.EQ.'C' .OR. C1.EQ.'Z' IF( .NOT.( CNAME .OR. SNAME ) ) $ RETURN C2 = SUBNAM( 2: 3 ) C3 = SUBNAM( 4: 6 ) C4 = C3( 2: 3 ) * GO TO ( 50, 60, 70 )ISPEC * 50 CONTINUE * * ISPEC = 1: block size * * In these examples, separate code is provided for setting NB for * real and complex. We assume that NB will take the same value in * single or double precision. * NB = 1 * IF( C2.EQ.'GE' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF ELSE IF( C3.EQ.'QRF' .OR. C3.EQ.'RQF' .OR. C3.EQ.'LQF' .OR. $ C3.EQ.'QLF' ) THEN IF( SNAME ) THEN NB = 32 ELSE NB = 32 END IF ELSE IF( C3.EQ.'HRD' ) THEN IF( SNAME ) THEN NB = 32 ELSE NB = 32 END IF ELSE IF( C3.EQ.'BRD' ) THEN IF( SNAME ) THEN NB = 32 ELSE NB = 32 END IF ELSE IF( C3.EQ.'TRI' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF END IF ELSE IF( C2.EQ.'PO' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF END IF ELSE IF( C2.EQ.'SY' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF ELSE IF( SNAME .AND. C3.EQ.'TRD' ) THEN NB = 32 ELSE IF( SNAME .AND. C3.EQ.'GST' ) THEN NB = 64 END IF ELSE IF( CNAME .AND. C2.EQ.'HE' ) THEN IF( C3.EQ.'TRF' ) THEN NB = 64 ELSE IF( C3.EQ.'TRD' ) THEN NB = 32 ELSE IF( C3.EQ.'GST' ) THEN NB = 64 END IF ELSE IF( SNAME .AND. C2.EQ.'OR' ) THEN IF( C3( 1: 1 ).EQ.'G' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NB = 32 END IF ELSE IF( C3( 1: 1 ).EQ.'M' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NB = 32 END IF END IF ELSE IF( CNAME .AND. C2.EQ.'UN' ) THEN IF( C3( 1: 1 ).EQ.'G' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NB = 32 END IF ELSE IF( C3( 1: 1 ).EQ.'M' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NB = 32 END IF END IF ELSE IF( C2.EQ.'GB' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN IF( N4.LE.64 ) THEN NB = 1 ELSE NB = 32 END IF ELSE IF( N4.LE.64 ) THEN NB = 1 ELSE NB = 32 END IF END IF END IF ELSE IF( C2.EQ.'PB' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN IF( N2.LE.64 ) THEN NB = 1 ELSE NB = 32 END IF ELSE IF( N2.LE.64 ) THEN NB = 1 ELSE NB = 32 END IF END IF END IF ELSE IF( C2.EQ.'TR' ) THEN IF( C3.EQ.'TRI' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF END IF ELSE IF( C2.EQ.'LA' ) THEN IF( C3.EQ.'UUM' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF END IF ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN IF( C3.EQ.'EBZ' ) THEN NB = 1 END IF END IF PILAENVX = NB RETURN * 60 CONTINUE * * ISPEC = 2: minimum block size * NBMIN = 2 IF( C2.EQ.'GE' ) THEN IF( C3.EQ.'QRF' .OR. C3.EQ.'RQF' .OR. C3.EQ.'LQF' .OR. C3.EQ. $ 'QLF' ) THEN IF( SNAME ) THEN NBMIN = 2 ELSE NBMIN = 2 END IF ELSE IF( C3.EQ.'HRD' ) THEN IF( SNAME ) THEN NBMIN = 2 ELSE NBMIN = 2 END IF ELSE IF( C3.EQ.'BRD' ) THEN IF( SNAME ) THEN NBMIN = 2 ELSE NBMIN = 2 END IF ELSE IF( C3.EQ.'TRI' ) THEN IF( SNAME ) THEN NBMIN = 2 ELSE NBMIN = 2 END IF END IF ELSE IF( C2.EQ.'SY' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN NBMIN = 8 ELSE NBMIN = 8 END IF ELSE IF( SNAME .AND. C3.EQ.'TRD' ) THEN NBMIN = 2 END IF ELSE IF( CNAME .AND. C2.EQ.'HE' ) THEN IF( C3.EQ.'TRD' ) THEN NBMIN = 2 END IF ELSE IF( SNAME .AND. C2.EQ.'OR' ) THEN IF( C3( 1: 1 ).EQ.'G' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NBMIN = 2 END IF ELSE IF( C3( 1: 1 ).EQ.'M' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NBMIN = 2 END IF END IF ELSE IF( CNAME .AND. C2.EQ.'UN' ) THEN IF( C3( 1: 1 ).EQ.'G' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NBMIN = 2 END IF ELSE IF( C3( 1: 1 ).EQ.'M' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NBMIN = 2 END IF END IF END IF PILAENVX = NBMIN RETURN * 70 CONTINUE * * ISPEC = 3: crossover point * NX = 0 IF( C2.EQ.'GE' ) THEN IF( C3.EQ.'QRF' .OR. C3.EQ.'RQF' .OR. C3.EQ.'LQF' .OR. C3.EQ. $ 'QLF' ) THEN IF( SNAME ) THEN NX = 128 ELSE NX = 128 END IF ELSE IF( C3.EQ.'HRD' ) THEN IF( SNAME ) THEN NX = 128 ELSE NX = 128 END IF ELSE IF( C3.EQ.'BRD' ) THEN IF( SNAME ) THEN NX = 128 ELSE NX = 128 END IF END IF ELSE IF( C2.EQ.'SY' ) THEN IF( SNAME .AND. C3.EQ.'TRD' ) THEN NX = 32 END IF ELSE IF( CNAME .AND. C2.EQ.'HE' ) THEN IF( C3.EQ.'TRD' ) THEN NX = 32 END IF ELSE IF( SNAME .AND. C2.EQ.'OR' ) THEN IF( C3( 1: 1 ).EQ.'G' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NX = 128 END IF END IF ELSE IF( CNAME .AND. C2.EQ.'UN' ) THEN IF( C3( 1: 1 ).EQ.'G' ) THEN IF( C4.EQ.'QR' .OR. C4.EQ.'RQ' .OR. C4.EQ.'LQ' .OR. C4.EQ. $ 'QL' .OR. C4.EQ.'HR' .OR. C4.EQ.'TR' .OR. C4.EQ.'BR' ) $ THEN NX = 128 END IF END IF END IF PILAENVX = NX RETURN * 80 CONTINUE * * ISPEC = 4: number of shifts (used by xHSEQR) * PILAENVX = 6 RETURN * 90 CONTINUE * * ISPEC = 5: minimum column dimension (not used) * PILAENVX = 2 RETURN * 100 CONTINUE * * ISPEC = 6: crossover point for SVD (used by xGELSS and xGESVD) * PILAENVX = INT( REAL( MIN( N1, N2 ) )*1.6E0 ) RETURN * 110 CONTINUE * * ISPEC = 7: number of processors (not used) * PILAENVX = 1 RETURN * 120 CONTINUE * * ISPEC = 8: crossover point for multishift (used by xHSEQR) * PILAENVX = 50 RETURN * 130 CONTINUE * * ISPEC = 9: maximum size of the subproblems at the bottom of the * computation tree in the divide-and-conquer algorithm * (used by xGELSD and xGESDD) * PILAENVX = 25 RETURN * 140 CONTINUE * * ISPEC = 10: ieee NaN arithmetic can be trusted not to trap * * PILAENVX = 0 PILAENVX = 1 IF( PILAENVX.EQ.1 ) THEN PILAENVX = IEEECK( 0, 0.0, 1.0 ) END IF RETURN * 150 CONTINUE * * ISPEC = 11: infinity arithmetic can be trusted not to trap * * PILAENVX = 0 PILAENVX = 1 IF( PILAENVX.EQ.1 ) THEN PILAENVX = IEEECK( 1, 0.0, 1.0 ) END IF RETURN * 160 CONTINUE * * 12 <= ISPEC <= 16 or ISPEC = 24: xHSEQR or one of its subroutines. * PILAENVX = PIPARMQ( ICTXT, ISPEC, NAME, OPTS, N1, N2, N3, N4 ) RETURN * 170 CONTINUE * * ISPEC = 17: maximum number of independent computational windows * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) PILAENVX = MIN( ICEIL(N1,N2), MIN( NPROW, NPCOL) ) RETURN * 180 CONTINUE * * ISPEC = 18: number of eigenvalues in each window * PILAENVX = MIN(N2/2,40) RETURN * 190 CONTINUE * * ISPEC = 19: computational window size * PILAENVX = MIN(N2,80) RETURN * 200 CONTINUE * * ISPEC = 20: minimal percentage of flops required for * performing matrix-matrix multiplications instead of * pipelined orthogonal transformations * * PILAENVX = 50 RETURN * 210 CONTINUE * * ISPEC = 21: width of block column slabs for row-wise * application of pipelined orthogonal transformations in * their factorized form * * PILAENVX = MIN(N2,32) RETURN * 220 CONTINUE * * ISPEC = 22: maximum number of eigenvalues to bring over * the block border * * PILAENVX = MIN(N2/2,40) RETURN 230 CONTINUE * * ISPEC = 23: number of processors involved in AED * * PILAENVX = ICEIL(N1, ICEIL(384, N2)*N2) RETURN 990 CONTINUE * * ISPEC = 99: maximum chunksize of iterations in OpenMP * parallelization * PILAENVX = 32 RETURN * * End of PILAENVX * END scalapack-2.0.2/SRC/pilaver.f000644 000766 000024 00000002004 11705402646 016152 0ustar00juliestaff000000 000000 SUBROUTINE PILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) C C -- ScaLAPACK computational routine (version 2.0.1 ) -- C -- ScaLAPACK is a software package provided by Univ. of Tennessee, -- C -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- C January 2012 C C Purpose C ======= C C This subroutine return the ScaLAPACK version. C C Arguments C ========= C VERS_MAJOR (output) INTEGER C return the scalapack major version C VERS_MINOR (output) INTEGER C return the scalapack minor version from the major version C VERS_PATCH (output) INTEGER C return the scalapack patch version from the minor version C ===================================================================== C INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH C ===================================================================== VERS_MAJOR = 2 VERS_MINOR = 0 VERS_PATCH = 1 C ===================================================================== C RETURN END scalapack-2.0.2/SRC/piparmq.f000644 000766 000024 00000026754 11705175572 016210 0ustar00juliestaff000000 000000 INTEGER FUNCTION PIPARMQ( ICTXT, ISPEC, NAME, OPTS, N, ILO, IHI, $ LWORKNB ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER ICTXT, IHI, ILO, ISPEC, LWORKNB, N CHARACTER NAME*( * ), OPTS*( * ) * * Purpose * ======= * * This program sets problem and machine dependent parameters * useful for PxHSEQR and its subroutines. It is called whenever * PILAENVX is called with 12 <= ISPEC <= 16 * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, * indicating the global context of the operation. The * context itself is global, but the value of ICTXT is * local. * * ISPEC (global input) INTEGER * ISPEC specifies which tunable parameter PIPARMQ should * return. * * ISPEC=12: (INMIN) Matrices of order nmin or less * are sent directly to PxLAHQR, the implicit * double shift QR algorithm. NMIN must be * at least 11. * * ISPEC=13: (INWIN) Size of the deflation window. * This is best set greater than or equal to * the number of simultaneous shifts NS. * Larger matrices benefit from larger deflation * windows. * * ISPEC=14: (INIBL) Determines when to stop nibbling and * invest in an (expensive) multi-shift QR sweep. * If the aggressive early deflation subroutine * finds LD converged eigenvalues from an order * NW deflation window and LD.GT.(NW*NIBBLE)/100, * then the next QR sweep is skipped and early * deflation is applied immediately to the * remaining active diagonal block. Setting * PIPARMQ(ISPEC=14) = 0 causes PxLAQR0 to skip a * multi-shift QR sweep whenever early deflation * finds a converged eigenvalue. Setting * PIPARMQ(ISPEC=14) greater than or equal to 100 * prevents PxLAQR0 from skipping a multi-shift * QR sweep. * * ISPEC=15: (NSHFTS) The number of simultaneous shifts in * a multi-shift QR iteration. * * ISPEC=16: (IACC22) PIPARMQ is set to 1 or 2 with the * following meanings. * 1: During the multi-shift QR sweep, * PxLAQR5 and/or xLAQR6 accumulates reflections * and uses matrix-matrix multiply to update * the far-from-diagonal matrix entries. * 2: During the multi-shift QR sweep. * PxLAQR5 accumulates reflections and takes * advantage of 2-by-2 block structure during * matrix-matrix multiplies. * * ( IACC22=0 is valid in LAPACK but not here. * Householder reflections are always accumulated * for the performance consideration. * If xTRMM is slower than xGEMM or NB is small, * PIPARMQ(ISPEC=16)=1 may be more efficient than * PIPARMQ(ISPEC=16)=2 despite the greater level of * arithmetic work implied by the latter choice. ) * * NAME (global input) character string * Name of the calling subroutine * * OPTS (global input) character string * This is a concatenation of the string arguments to * TTQRE. * * N (global input) integer scalar * N is the order of the Hessenberg matrix H. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that H is already upper triangular * in rows and columns 1:ILO-1 and IHI+1:N. * * LWORKNB (global input) INTEGER * The amount of workspace available or the blockfactor. * * Further Details * =============== * * Little is known about how best to choose these parameters. * It is possible to use different values of the parameters * for each of PCHSEQR, PDHSEQR, PSHSEQR and PZHSEQR. * * It is probably best to choose different parameters for * different matrices and different parameters at different * times during the iteration, but this has not been fully * implemented --- yet. * * * The best choices of most of the parameters depend * in an ill-understood way on the relative execution * rate of PxLAQR3 and PxLAQR5 and on the nature of each * particular eigenvalue problem. Experiment may be the * only practical way to determine which choices are most * effective. * * Following is a list of default values supplied by PIPARMQ. * These defaults may be adjusted in order to attain better * performance in any particular computational environment. * * PIPARMQ(ISPEC=12) The PxLAQR1 vs PxLAQR0 crossover point. * Default: 220. (Must be at least 11.) * * PIPARMQ(ISPEC=13) Recommended deflation window size. * This depends on ILO, IHI and NS, the * number of simultaneous shifts returned * by PIPARMQ(ISPEC=15). The default for * (IHI-ILO+1).LE.500 is NS. The default * for (IHI-ILO+1).GT.500 is 3*NS/2. * * PIPARMQ(ISPEC=14) Nibble crossover point. * The default for the serial case is 14. * The default for the parallel case is * 335 * N**(-0.44) * NPROCS. * * PIPARMQ(ISPEC=15) Number of simultaneous shifts, NS. * a multi-shift QR iteration. * * If IHI-ILO+1 is ... * * greater than ...but less ... the * or equal to ... than default is * * 0 30 NS = 2+ * 30 60 NS = 4+ * 60 150 NS = 10 * 150 590 NS = ** * 590 3000 NS = 64 * 3000 6000 NS = 128 * 6000 12000 NS = 256 * 12000 24000 NS = 512 * 24000 48000 NS = 1024 * 48000 96000 NS = 2048 * 96000 INFINITY NS = 4096 * * (+) By default matrices of this order are * passed to the implicit double shift routine * PxLAQR1. See PIPARMQ(ISPEC=12) above. These * values of NS are used only in case of a rare * PxLAQR1 failure. * * (**) The asterisks (**) indicate an ad-hoc * function increasing from 10 to 64. * * PIPARMQ(ISPEC=16) Select structured matrix multiply. * (See ISPEC=16 above for details.) * Default: 3. * * ================================================================ * .. Parameters .. INTEGER INMIN, INWIN, INIBL, ISHFTS, IACC22 PARAMETER ( INMIN = 12, INWIN = 13, INIBL = 14, $ ISHFTS = 15, IACC22 = 16 ) INTEGER NMIN, NMIN2, K22MIN, KACMIN, NIBBLE, KNWSWP PARAMETER ( NMIN = 220, K22MIN = 14, KACMIN = 14, $ NIBBLE = 14, KNWSWP = 500, NMIN2 = 770 ) REAL TWO PARAMETER ( TWO = 2.0 ) * .. * .. Local Scalars .. INTEGER NH, NS, MYROW, MYCOL, NPROW, NPCOL, NP * .. * .. Intrinsic Functions .. INTRINSIC LOG, MAX, MOD, NINT, REAL * .. * .. External functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( ( ISPEC.EQ.ISHFTS ) .OR. ( ISPEC.EQ.INWIN ) .OR. $ ( ISPEC.EQ.IACC22 ) ) THEN * * ==== Set the number simultaneous shifts ==== * NH = IHI - ILO + 1 NS = 2 IF( NH.GE.30 ) $ NS = 4 IF( NH.GE.60 ) $ NS = 10 IF( NH.GE.150 ) $ NS = MAX( 10, NH / NINT( LOG( REAL( NH ) ) / LOG( TWO ) )) IF( NH.GE.590 ) $ NS = 64 IF( NH.GE.3000 ) $ NS = 128 IF( NH.GE.6000 ) $ NS = 256 IF( NH.GE.12000 ) $ NS = 512 IF( NH.GE.24000 ) $ NS = 1024 IF( NH.GE.48000 ) $ NS = 2048 IF( NH.GE.96000 ) $ NS = 4096 IF( NH.GE.192000 ) $ NS = 8192 IF( NH.GE.384000 ) $ NS = 16384 IF( NH.GE.768000 ) $ NS = 32768 IF( NH.GE.1000000 ) $ NS = ICEIL( NH, 25 ) NS = MAX( NS, 2*MIN(NPROW,NPCOL) ) NS = MAX( 2, NS-MOD( NS, 2 ) ) END IF * IF( ISPEC.EQ.INMIN ) THEN * * * ===== Submatrices of order smaller than NMIN*min(P_r,P_c) * . get sent to PxLAHQR, the classic ScaLAPACK algorithm. * . This must be at least 11. ==== * PIPARMQ = NMIN * MIN( NPROW, NPCOL ) * ELSE IF( ISPEC.EQ.INIBL ) THEN * * ==== INIBL: skip a multi-shift QR iteration and * . whenever aggressive early deflation finds * . at least (NIBBLE*(window size)/100) deflations. ==== * NP = MIN( NPROW, NPCOL ) IF( NP.EQ.1 ) THEN PIPARMQ = NIBBLE ELSE NH = IHI - ILO + 1 PIPARMQ = MIN( 100, $ CEILING( 335.0D+0 * NH**(-0.44D+0) * NP ) ) END IF * ELSE IF( ISPEC.EQ.ISHFTS ) THEN * * ==== NSHFTS: The number of simultaneous shifts ===== * PIPARMQ = NS * ELSE IF( ISPEC.EQ.INWIN ) THEN * * ==== NW: deflation window size. ==== * IF( NH.LE.KNWSWP ) THEN PIPARMQ = NS ELSE PIPARMQ = 3*NS / 2 END IF * ELSE IF( ISPEC.EQ.IACC22 ) THEN * * ==== IACC22: Whether to use 2-by-2 block structure while * . doing it. A small amount of work could be saved * . by making this choice dependent also upon the * . NH=IHI-ILO+1. * PIPARMQ = 1 c PIPARMQ = 0 c IF( NS.GE.KACMIN ) c $ PIPARMQ = 1 IF( NS.GE.K22MIN ) $ PIPARMQ = 2 * ELSE * ===== invalid value of ispec ===== PIPARMQ = -1 * END IF * * ==== End of PIPARMQ ==== * END scalapack-2.0.2/SRC/pjlaenv.f000644 000766 000024 00000025353 11622500733 016155 0ustar00juliestaff000000 000000 INTEGER FUNCTION PJLAENV( ICTXT, ISPEC, NAME, OPTS, N1, $ N2, N3, N4 ) * * -- ScaLAPACK test routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER*( * ) NAME, OPTS INTEGER ICTXT, ISPEC, N1, N2, N3, N4 * .. * * Purpose * * ======= * * PJLAENV is called from the ScaLAPACK symmetric and Hermitian * tailored eigen-routines to choose * problem-dependent parameters for the local environment. See ISPEC * for a description of the parameters. * * This version provides a set of parameters which should give good, * but not optimal, performance on many of the currently available * computers. Users are encouraged to modify this subroutine to set * the tuning parameters for their particular machine using the option * and problem size information in the arguments. * * This routine will not function correctly if it is converted to all * lower case. Converting it to all upper case is allowed. * * Arguments * ========= * * ISPEC (global input) INTEGER * Specifies the parameter to be returned as the value of * PJLAENV. * = 1: the data layout blocksize; * = 2: the panel blocking factor; * = 3: the algorithmic blocking factor; * = 4: execution path control; * = 5: maximum size for direct call to the LAPACK routine * * NAME (global input) CHARACTER*(*) * The name of the calling subroutine, in either upper case or * lower case. * * OPTS (global input) CHARACTER*(*) * The character options to the subroutine NAME, concatenated * into a single character string. For example, UPLO = 'U', * TRANS = 'T', and DIAG = 'N' for a triangular routine would * be specified as OPTS = 'UTN'. * * N1 (global input) INTEGER * N2 (global input) INTEGER * N3 (global input) INTEGER * N4 (global input) INTEGER * Problem dimensions for the subroutine NAME; these may not all * be required. * * At present, only N1 is used, and it (N1) is used only for * 'TTRD' * * (PJLAENV) (global or local output) INTEGER * >= 0: the value of the parameter specified by ISPEC * < 0: if PJLAENV = -k, the k-th argument had an illegal * value. * * Most parameters set via a call to PJLAENV must be identical * on all processors and hence PJLAENV will return the same * value to all procesors (i.e. global output). However some, * in particular, the panel blocking factor can be different * on each processor and hence PJLAENV can return different * values on different processors (i.e. local output). * * Further Details * =============== * * The following conventions have been used when calling PJLAENV from * the ScaLAPACK routines: * 1) OPTS is a concatenation of all of the character options to * subroutine NAME, in the same order that they appear in the * argument list for NAME, even if they are not used in determining * the value of the parameter specified by ISPEC. * 2) The problem dimensions N1, N2, N3, N4 are specified in the order * that they appear in the argument list for NAME. N1 is used * first, N2 second, and so on, and unused problem dimensions are * passed a value of -1. * 3) The parameter value returned by PJLAENV is checked for validity * in the calling subroutine. For example, PJLAENV is used to * retrieve the optimal blocksize for STRTRI as follows: * * NB = PJLAENV( 1, 'STRTRI', UPLO // DIAG, N, -1, -1, -1 ) * IF( NB.LE.1 ) NB = MAX( 1, N ) * * PJLAENV is patterned after ILAENV and keeps the same interface in * anticipation of future needs, even though PJLAENV is only sparsely * used at present in ScaLAPACK. Most ScaLAPACK codes use the input * data layout blocking factor as the algorithmic blocking factor - * hence there is no need or opportunity to set the algorithmic or * data decomposition blocking factor. * * pXYYtevx.f and pXYYtgvx.f and pXYYttrd.f are the only codes which * call PJLAENV in this release. pXYYtevx.f and pXYYtgvx.f redistribute * the data to the best data layout for each transformation. pXYYttrd.f * uses a data layout blocking factor of 1 and a * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL CNAME, GLOBAL, SNAME CHARACTER C1 CHARACTER*2 C2, C4 CHARACTER*3 C3 CHARACTER*8 SUBNAM INTEGER I, IC, IDUMM, IZ, MSZ, NB * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR * .. * * * .. External Subroutines .. EXTERNAL IGAMX2D * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * * GO TO ( 10, 10, 10, 10, 10 )ISPEC * * Invalid value for ISPEC * PJLAENV = -1 RETURN * 10 CONTINUE * * Convert NAME to upper case if the first character is lower case. * PJLAENV = 1 SUBNAM = NAME IC = ICHAR( SUBNAM( 1: 1 ) ) IZ = ICHAR( 'Z' ) IF( IZ.EQ.100 .OR. IZ.EQ.122 ) THEN * * ASCII character set * IF( IC.GE.97 .AND. IC.LE.122 ) THEN SUBNAM( 1: 1 ) = CHAR( IC-32 ) DO 20 I = 2, 6 IC = ICHAR( SUBNAM( I: I ) ) IF( IC.GE.97 .AND. IC.LE.122 ) $ SUBNAM( I: I ) = CHAR( IC-32 ) 20 CONTINUE END IF * ELSE IF( IZ.EQ.233 .OR. IZ.EQ.169 ) THEN * * EBCDIC character set * IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR. $ ( IC.GE.145 .AND. IC.LE.153 ) .OR. $ ( IC.GE.162 .AND. IC.LE.169 ) ) THEN SUBNAM( 1: 1 ) = CHAR( IC+64 ) DO 30 I = 2, 6 IC = ICHAR( SUBNAM( I: I ) ) IF( ( IC.GE.129 .AND. IC.LE.137 ) .OR. $ ( IC.GE.145 .AND. IC.LE.153 ) .OR. $ ( IC.GE.162 .AND. IC.LE.169 ) )SUBNAM( I: $ I ) = CHAR( IC+64 ) 30 CONTINUE END IF * ELSE IF( IZ.EQ.218 .OR. IZ.EQ.250 ) THEN * * Prime machines: ASCII+128 * IF( IC.GE.225 .AND. IC.LE.250 ) THEN SUBNAM( 1: 1 ) = CHAR( IC-32 ) DO 40 I = 2, 6 IC = ICHAR( SUBNAM( I: I ) ) IF( IC.GE.225 .AND. IC.LE.250 ) $ SUBNAM( I: I ) = CHAR( IC-32 ) 40 CONTINUE END IF END IF * C1 = SUBNAM( 2: 2 ) SNAME = C1.EQ.'S' .OR. C1.EQ.'D' CNAME = C1.EQ.'C' .OR. C1.EQ.'Z' IF( .NOT.( CNAME .OR. SNAME ) ) $ RETURN C2 = SUBNAM( 3: 4 ) C3 = SUBNAM( 5: 7 ) C4 = C3( 2: 3 ) * * This is to keep ftnchek happy * IF( ( N2+N3+N4 )*0.NE.0 ) THEN C4 = OPTS C3 = C4 END IF * GO TO ( 50, 60, 70, 80, 90 )ISPEC * 50 CONTINUE * * ISPEC = 1: data layout block size * (global - all processes must use the same value) * * In these examples, separate code is provided for setting NB for * real and complex. We assume that NB will take the same value in * single or double precision. * NB = 1 * IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'LLT' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF ELSE IF( C3.EQ.'TTR' ) THEN IF( SNAME ) THEN NB = 1 ELSE NB = 1 END IF ELSE IF( C3.EQ.'GST' ) THEN IF( SNAME ) THEN NB = 32 ELSE NB = 32 END IF ELSE IF( C3.EQ.'BCK' ) THEN IF( SNAME ) THEN NB = 32 ELSE NB = 32 END IF ELSE IF( C3.EQ.'TRS' ) THEN IF( SNAME ) THEN NB = 64 ELSE NB = 64 END IF END IF END IF * * PJLAENV = NB GLOBAL = .TRUE. GO TO 100 * 60 CONTINUE * * ISPEC = 2: panel blocking factor (Used only in PxyyTTRD) * (local - different processes may use different values) * NB = 16 IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'TTR' ) THEN IF( SNAME ) THEN NB = 32 ELSE NB = 32 END IF END IF END IF PJLAENV = NB GLOBAL = .FALSE. GO TO 100 * * 70 CONTINUE * * ISPEC = 3: algorithmic blocking factor (Used only in PxyyTTRD) * (global - all processes must use the same value) * NB = 1 IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'TTR' ) THEN IF( SNAME ) THEN NB = 16 ELSE NB = 16 END IF END IF END IF PJLAENV = NB GLOBAL = .TRUE. GO TO 100 * 80 CONTINUE * * ISPEC = 4: Execution path options (Used only in PxyyTTRD) * (global - all processes must use the same value) * PJLAENV = -4 IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'TTR' ) THEN * V and H interleaved (default is not interleaved) IF( N1.EQ.1 ) THEN PJLAENV = 1 END IF * * Two ZGEMMs (default is one ZGEMM) IF( N1.EQ.2 ) THEN PJLAENV = 0 END IF * Balanced Update (default is minimum communication update) IF( N1.EQ.3 ) THEN PJLAENV = 0 END IF END IF END IF GLOBAL = .TRUE. GO TO 100 * 90 CONTINUE * * ISPEC = 5: Minimum size to justify call to parallel code * (global - all processes must use the same value) * MSZ = 0 IF( C2.EQ.'SY' .OR. C2.EQ.'HE' ) THEN IF( C3.EQ.'TTR' ) THEN IF( SNAME ) THEN MSZ = 100 ELSE MSZ = 100 END IF END IF END IF PJLAENV = MSZ GLOBAL = .TRUE. GO TO 100 * 100 CONTINUE * IF( GLOBAL ) THEN IDUMM = 0 CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, PJLAENV, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) END IF * * * RETURN * * End of PJLAENV * END scalapack-2.0.2/SRC/pmpcol.f000644 000766 000024 00000006243 11750130340 016000 0ustar00juliestaff000000 000000 *********************************************************************** * * Auxiliary subroutine for eigenpair assignments * *********************************************************************** SUBROUTINE PMPCOL( MYPROC, NPROCS, IIL, NEEDIL, NEEDIU, $ PMYILS, PMYIUS, $ COLBRT, FRSTCL, LASTCL ) IMPLICIT NONE * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER FRSTCL, IIL, LASTCL, MYPROC, NEEDIL, NEEDIU, $ NPROCS LOGICAL COLBRT * .. * .. Array Arguments .. INTEGER PMYILS( * ), PMYIUS( * ) * .. * * Purpose * ======= * * Using the output from PMPIM2 and given the information on * eigenvalue clusters, PMPCOL finds the collaborators of MYPROC. * * Arguments * ========= * * MYPROC (input) INTEGER * The processor number, 0 <= MYPROC < NPROCS * * NPROCS (input) INTEGER * The total number of processors available * * IIL (input) INTEGER * The index of the leftmost eigenvalue in W * * NEEDIL (input) INTEGER * The leftmost position in W needed by MYPROC * * NEEDIU (input) INTEGER * The rightmost position in W needed by MYPROC * * PMYILS (input) INTEGER array * For each processor p, PMYILS(p) is the index * of the first eigenvalue in W to be computed * PMYILS(p) equals zero if p stays idle * * PMYIUS (input) INTEGER array * For each processor p, PMYIUS(p) is the index * of the last eigenvalue in W to be computed * PMYIUS(p) equals zero if p stays idle * * COLBRT (output) LOGICAL * TRUE if MYPROC collaborates. * * FRSTCL (output) INTEGER * LASTCL FIRST and LAST collaborator of MYPROC * MYPROC collaborates with * FRSTCL, ..., MYPROC-1, MYPROC+1, ...,LASTCL * If MYPROC == FRSTCL, there are no collaborators * on the left. IF MYPROC == LASTCL, there are no * collaborators on the right. * If FRSTCL == 0 and LASTCL = NPROCS-1, then * MYPROC collaborates with everybody * * .. Local Scalars .. INTEGER I, NEEDIIL, NEEDIIU * .. * .. Executable Statements .. * Compute global eigenvalue index from position in W NEEDIIL = NEEDIL + IIL - 1 NEEDIIU = NEEDIU + IIL - 1 * Find processor responsible for NEEDIL, this is the first * collaborator DO 1 I = 1, NPROCS IF( PMYILS(I).GT.NEEDIIL) GOTO 2 FRSTCL = I-1 1 CONTINUE 2 CONTINUE * Find processor responsible for NEEDIU, this is the last * collaborator DO 3 I = NPROCS,1,-1 IF( PMYIUS(I).LT.NEEDIIU ) THEN * Need to check special case: does this proc work at all? IF( PMYIUS(I).GT.0 ) $ GOTO 4 ENDIF LASTCL = I-1 3 CONTINUE 4 CONTINUE * Decide if there is a collaboration IF( (FRSTCL.LT.MYPROC).OR.(LASTCL.GT.MYPROC) ) THEN COLBRT = .TRUE. ELSE COLBRT = .FALSE. ENDIF RETURN END scalapack-2.0.2/SRC/pmpim2.f000644 000766 000024 00000004217 11750130340 015711 0ustar00juliestaff000000 000000 *********************************************************************** * * Auxiliary subroutine for eigenpair assignments * *********************************************************************** SUBROUTINE PMPIM2( IL, IU, NPROCS, PMYILS, PMYIUS ) IMPLICIT NONE * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER PMYILS( * ), PMYIUS( * ) * .. * .. Array Arguments .. INTEGER IL, IU, M, NPROCS, PRCCTR * .. * * Purpose * ======= * * PMPIM2 is the scheduling subroutine. * It computes for all processors the eigenpair range assignments. * * Arguments * ========= * * IL, IU (input) INTEGER * The range of eigenpairs to be computed * * NPROCS (input) INTEGER * The total number of processors available * * PMYILS (output) INTEGER array * For each processor p, PMYILS(p) is the index * of the first eigenvalue in W to be computed * PMYILS(p) equals zero if p stays idle * * PMYIUS (output) INTEGER array * For each processor p, PMYIUS(p) is the index * of the last eigenvalue in W to be computed * PMYIUS(p) equals zero if p stays idle * * .. Executable Statements .. M = IU - IL + 1 IF ( NPROCS.GT.M ) THEN DO 10 PRCCTR = 0, NPROCS-1 IF ( PRCCTR.LT.M ) THEN PMYILS(PRCCTR+1) = PRCCTR + IL PMYIUS(PRCCTR+1) = PRCCTR + IL ELSE PMYILS(PRCCTR+1) = 0 PMYIUS(PRCCTR+1) = 0 END IF 10 CONTINUE ELSE DO 20 PRCCTR = 0, NPROCS-1 PMYILS(PRCCTR+1) = (PRCCTR * (M / NPROCS)) + IL IF (PRCCTR.LT.MOD(M, NPROCS)) THEN PMYILS(PRCCTR+1) = PMYILS(PRCCTR+1) + PRCCTR PMYIUS(PRCCTR+1) = PMYILS(PRCCTR+1) + M / NPROCS ELSE PMYILS(PRCCTR+1) = PMYILS(PRCCTR+1) + MOD(M, NPROCS) PMYIUS(PRCCTR+1) = PMYILS(PRCCTR+1) + M / NPROCS - 1 END IF 20 CONTINUE END IF RETURN END scalapack-2.0.2/SRC/pscsum1.f000644 000766 000024 00000021065 10363532303 016104 0ustar00juliestaff000000 000000 SUBROUTINE PSCSUM1( N, ASUM, X, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N REAL ASUM * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX X( * ) * .. * * Purpose * ======= * * PSCSUM1 returns the sum of absolute values of a complex * distributed vector sub( X ) in ASUM, * * where sub( X ) denotes X(IX:IX+N-1,JX:JX), if INCX = 1, * X(IX:IX,JX:JX+N-1), if INCX = M_X. * * Based on PSCASUM from the Level 1 PBLAS. The change is * to use the 'genuine' absolute value. * * The serial version of this routine was originally contributed by * Nick Higham for use with CLACON. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * When the result of a vector-oriented PBLAS call is a scalar, it will * be made available only within the scope which owns the vector(s) * being operated on. Let X be a generic term for the input vector(s). * Then, the processes which receive the answer will be (note that if * an operation involves more than one vector, the processes which re- * ceive the result will be the union of the following calculation for * each vector): * * If N = 1, M_X = 1 and INCX = 1, then one can't determine if a process * row or process column owns the vector operand, therefore only the * process of coordinate {RSRC_X, CSRC_X} receives the result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process part of this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process part of this column receives the result; * * Parameters * ========== * * N (global input) pointer to INTEGER * The number of components of the distributed vector sub( X ). * N >= 0. * * ASUM (local output) pointer to REAL * The sum of absolute values of the distributed vector sub( X ) * only in its scope. * * X (local input) COMPLEX array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER CCTOP, RCTOP INTEGER ICOFF, ICTXT, IIX, IROFF, IXCOL, IXROW, JJX, $ LDX, MYCOL, MYROW, NP, NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SGSUM2D, PB_TOPGET * .. * .. External Functions .. INTEGER NUMROC REAL SCSUM1 EXTERNAL NUMROC, SCSUM1 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * ASUM = ZERO IF( N.LE.0 ) $ RETURN * LDX = DESCX( LLD_ ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) * IF( INCX.EQ.1 .AND. DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IXROW .AND. MYCOL.EQ.IXCOL ) THEN ASUM = ABS( X( IIX+(JJX-1)*LDX ) ) END IF RETURN END IF * IF( INCX.EQ.DESCX( M_ ) ) THEN * * X is distributed over a process row * IF( MYROW.EQ.IXROW ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', RCTOP ) ICOFF = MOD( JX-1, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ-ICOFF ASUM = SCSUM1( NQ, X( IIX+(JJX-1)*LDX ), LDX ) CALL SGSUM2D( ICTXT, 'Rowwise', RCTOP, 1, 1, ASUM, 1, $ -1, MYCOL ) END IF * ELSE * * X is distributed over a process column * IF( MYCOL.EQ.IXCOL ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', CCTOP ) IROFF = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP-IROFF ASUM = SCSUM1( NP, X( IIX+(JJX-1)*LDX ), 1 ) CALL SGSUM2D( ICTXT, 'Columnwise', CCTOP, 1, 1, ASUM, 1, $ -1, MYCOL ) END IF * END IF * RETURN * * End of PSCSUM1 * END scalapack-2.0.2/SRC/psdbsv.f000644 000766 000024 00000045113 10363532303 016012 0ustar00juliestaff000000 000000 SUBROUTINE PSDBSV( N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER BWL, BWU, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PSDBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * banded diagonally dominant-like distributed * matrix with bandwidth BWL, BWU. * * Gaussian elimination without pivoting * is used to factor a reordering * of the matrix into L U. * * See PSDBTRF and PSDBTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * +max((max(bwl,bwu)*NRHS), max(bwl,bwu)*max(bwl,bwu)) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PSDBTRF, PSDBTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PSDBTRF and PSDBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PSDBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * * Factor the matrix * CALL PSDBTRF( N, BWL, BWU, A, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PSDBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PSDBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSDBSV', -INFO ) RETURN END IF * RETURN * * End of PSDBSV * END scalapack-2.0.2/SRC/psdbtrf.f000644 000766 000024 00000126722 11750130340 016157 0ustar00juliestaff000000 000000 SUBROUTINE PSDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), AF( * ), WORK( * ) * .. * * Purpose * ======= * * PSDBTRF computes a LU factorization * of an N-by-N real banded * diagonally dominant-like distributed matrix * with bandwidth BWL, BWU: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PSDBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = L U * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSDBTRF and this is stored in AF. If a linear system * is to be solved using PSDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * max(bwl,bwu)*max(bwl,bwu) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * Last modified by: Peter Arbenz, Institute of Scientific Computing, * ETH, Zurich. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, I1, I2, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN, $ LEVEL_DIST, LLDA, MAX_BW, MBW2, MYCOL, MYROW, $ MY_NUM_COLS, NB, NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, OFST, PART_OFFSET, PART_SIZE, $ PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE, $ STORE_N_A, UP_PREV_TRI_SIZE_M, $ UP_PREV_TRI_SIZE_N, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, SAXPY, SDBTRF, $ DESC_CONVERT, SGEMM, SGEMV, SGERV2D, SGESD2D, $ SLAMOV, SLATCPY, STBTRS, STRMM, STRRV2D, $ STRSD2D, GLOBCHK, IGAMX2D, IGEBR2D, IGEBS2D, $ PXERBLA, RESHAPE * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 6*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX( BWL, BWU ) MBW2 = MAX_BW*MAX_BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -10 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 6*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -2 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -3 END IF * IF( LLDA.LT.( BWL+BWU+1 ) ) THEN INFO = -( 6*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 6*100+4 ) END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 6*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PSDBTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*MAX( BWL, BWU ) ) ) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, 'PSDBTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = NB*( BWL+BWU ) + 6*MAX( BWL, BWU )*MAX( BWL, BWU ) * IF( LAF.LT.LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PSDBTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = MAX( BWL, BWU )*MAX( BWL, BWU ) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, 'PSDBTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BWU PARAM_CHECK( 3, 1 ) = BWL PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 10 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSDBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 140 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW END IF * * Offset to workspace for Upper triangular factor * WORK_U = BWU*ODD_SIZE + 3*MBW2 * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * * Zero out space for work * DO 20 I = 1, WORK_SIZE_MIN WORK( I ) = ZERO 20 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL.GT.0 ) THEN PREV_TRI_SIZE_M = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) PREV_TRI_SIZE_N = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL-1, 0, $ NPCOL ) ) END IF * IF( MYCOL.GT.0 ) THEN UP_PREV_TRI_SIZE_M = MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) UP_PREV_TRI_SIZE_N = MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NPCOL-1 ) THEN NEXT_TRI_SIZE_M = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL+1, 0, $ NPCOL ) ) NEXT_TRI_SIZE_N = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL STRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, A( OFST+( MY_NUM_COLS-BWL )* $ LLDA+( BWL+BWU+1 ) ), LLDA-1, 0, MYCOL+1 ) * END IF * * * Factor main partition A_i = L_i {U_i} in each processor * CALL SDBTRF( ODD_SIZE, ODD_SIZE, BWL, BWU, A( OFST+1 ), LLDA, $ INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 30 END IF * * IF( MYCOL.LT.NP-1 ) THEN * * Apply factorization to lower connection block BL_i * transpose the connection block in preparation. * Apply factorization to upper connection block BU_i * Move the connection block in preparation. * CALL SLATCPY( 'U', BWL, BWL, A( ( OFST+( BWL+BWU+1 )+ $ ( ODD_SIZE-BWL )*LLDA ) ), LLDA-1, $ AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW ) CALL SLAMOV( 'L', BWU, BWU, A( ( OFST+1+ODD_SIZE*LLDA ) ), $ LLDA-1, AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW- $ BWU ), MAX_BW ) * * Perform the triangular system solve {L_i}{{BU'}_i} = {B_i} * CALL STBTRS( 'L', 'N', 'U', BWU, BWL, BWU, $ A( OFST+BWU+1+( ODD_SIZE-BWU )*LLDA ), LLDA, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ), $ MAX_BW, INFO ) * * Perform the triangular solve {U_i}^T{BL'}_i^T = {BL_i}^T * CALL STBTRS( 'U', 'T', 'N', BWL, BWU, BWL, $ A( OFST+1+( ODD_SIZE-BWL )*LLDA ), LLDA, $ AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW, $ INFO ) * * transpose resulting block to its location * in main storage. * CALL SLATCPY( 'L', BWL, BWL, AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW- $ BWL ), MAX_BW, A( ( OFST+( BWL+BWU+1 )+ $ ( ODD_SIZE-BWL )*LLDA ) ), LLDA-1 ) * * Move the resulting block back to its location in main storage. * CALL SLAMOV( 'L', BWU, BWU, AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+ $ MAX_BW-BWU ), MAX_BW, A( ( OFST+1+ODD_SIZE* $ LLDA ) ), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i} * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL SGEMM( 'T', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, ONE, $ A( OFST+ODD_SIZE*LLDA+1+BWU ), LLDA-1 ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 30 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for * off_diagonal block of reduced system. * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL STRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M, $ PREV_TRI_SIZE_N, AF( WORK_U+1 ), BWL, 0, $ MYCOL-1 ) * IF( INFO.EQ.0 ) THEN * * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ . * * Transpose transmitted triangular matrix $DL_i$ * DO 50 I1 = 1, BWL DO 40 I2 = I1 + 1, BWL AF( WORK_U+I2+( I1-1 )*BWL ) = AF( WORK_U+I1+( I2-1 )* $ BWL ) AF( WORK_U+I1+( I2-1 )*BWL ) = ZERO 40 CONTINUE 50 CONTINUE * DO 60 I1 = 2, ODD_SIZE I2 = MIN( I1-1, BWL ) CALL SGEMV( 'N', BWL, I2, -ONE, $ AF( WORK_U+1+( I1-1-I2 )*BWL ), BWL, $ A( OFST+BWU+1+I2+( I1-1-I2 )*LLDA ), LLDA-1, $ ONE, AF( WORK_U+1+( I1-1 )*BWL ), 1 ) 60 CONTINUE * * * Calculate the "spike" fillin, ${U_i}^T {{GL}_i}^T = {DU_i}^T$ * * * Copy D block into AF storage for solve. * CALL SLAMOV( 'L', UP_PREV_TRI_SIZE_N, UP_PREV_TRI_SIZE_M, $ A( OFST+1 ), LLDA-1, AF( 1 ), BWU ) * DO 80 I1 = 1, ODD_SIZE I2 = MIN( BWU, I1-1 ) CALL SGEMV( 'N', BWU, I2, -ONE, AF( ( I1-1-I2 )*BWU+1 ), $ BWU, A( OFST+BWU+1-I2+( I1-1 )*LLDA ), 1, $ ONE, AF( ( I1-1 )*BWU+1 ), 1 ) * DO 70 I = 1, BWU AF( ( I1-1 )*BWU+I ) = AF( ( I1-1 )*BWU+I ) / $ A( ( I1-1 )*LLDA+BWU+1 ) 70 CONTINUE 80 CONTINUE * * Calculate the update block for previous proc, E_i = GL_i{GU_i} * * * Zero out space in case result is smaller than storage block * DO 90 I = 1, MBW2 AF( ODD_SIZE*BWU+2*MBW2+I ) = ZERO 90 CONTINUE * CALL SGEMM( 'N', 'T', BWU, BWL, ODD_SIZE, -ONE, AF( 1 ), $ BWU, AF( WORK_U+1 ), BWL, ZERO, $ AF( 1+MAX( 0, BWL-BWU )+ODD_SIZE*BWU+( 2*MAX_BW+ $ MAX( 0, BWU-BWL ) )*MAX_BW ), MAX_BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0, $ MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * * Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage * as per requirements of BLAS routine DTRMM. * Since we have GU_i stored, * transpose HU_i to HU_i^T. * CALL SLAMOV( 'N', BWL, BWL, $ AF( WORK_U+( ODD_SIZE-BWL )*BWL+1 ), BWL, $ AF( ( ODD_SIZE )*BWU+1+( MAX_BW-BWL ) ), $ MAX_BW ) * CALL STRMM( 'R', 'U', 'T', 'N', BWL, BWL, -ONE, $ A( ( OFST+( BWL+BWU+1 )+( ODD_SIZE-BWL )* $ LLDA ) ), LLDA-1, AF( ( ODD_SIZE )*BWU+1+ $ ( MAX_BW-BWL ) ), MAX_BW ) * * * Copy matrix HL_i (the last bwu rows of GL_i^T) to AFU store * as per requirements of BLAS routine DTRMM. * Since we have GL_i^T stored, * transpose HL_i^T to HL_i. * CALL SLAMOV( 'N', BWU, BWU, AF( ( ODD_SIZE-BWU )*BWU+1 ), $ BWU, AF( WORK_U+( ODD_SIZE )*BWL+1+MAX_BW- $ BWU ), MAX_BW ) * CALL STRMM( 'R', 'L', 'N', 'N', BWU, BWU, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ AF( WORK_U+( ODD_SIZE )*BWL+1+MAX_BW-BWU ), $ MAX_BW ) * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 130 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 120 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ), $ MAX_BW, 0, MYCOL-1 ) * CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, 0, MYCOL-1 ) * END IF * * Copy last diagonal block into AF storage for subsequent * operations. * CALL SLAMOV( 'N', MAX_BW, MAX_BW, A( OFST+ODD_SIZE*LLDA+BWU+1 ), $ LLDA-1, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0, MYCOL+1 ) * * Add contribution to diagonal block * CALL SAXPY( MBW2, ONE, AF( ODD_SIZE*BWU+2*MBW2+1 ), 1, $ AF( ODD_SIZE*BWU+MBW2+1 ), 1 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 100 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 110 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * CALL SAXPY( MBW2, ONE, WORK( 1 ), 1, AF( ODD_SIZE*BWU+MBW2+1 ), $ 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * CALL SAXPY( MBW2, ONE, WORK( 1 ), 1, AF( ODD_SIZE*BWU+MBW2+1 ), $ 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 100 110 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL SDBTRF( MAX_BW, MAX_BW, MIN( MAX_BW-1, BWL ), $ MIN( MAX_BW-1, BWU ), AF( ODD_SIZE*BWU+MBW2+1- $ ( MIN( MAX_BW-1, BWU ) ) ), MAX_BW+1, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL SLAMOV( 'N', MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW ) * CALL SLAMOV( 'N', MAX_BW, MAX_BW, AF( WORK_U+ODD_SIZE*BWL+1 ), $ MAX_BW, AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ), $ MAX_BW, 0, COMM_PROC ) * CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, 0, $ COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL STBTRS( 'L', 'N', 'U', BWU, MIN( BWL, BWU-1 ), BWU, $ AF( ODD_SIZE*BWU+MBW2+1+( MAX_BW+1 )*( MAX_BW- $ BWU ) ), MAX_BW+1, AF( WORK_U+ODD_SIZE*BWL+1+ $ MAX_BW-BWU ), MAX_BW, INFO ) * * Modify lower off_diagonal block with diagonal block * * CALL STBTRS( 'U', 'T', 'N', BWL, MIN( BWU, BWL-1 ), BWL, $ AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, $ BWL-1 )+( MAX_BW+1 )*( MAX_BW-BWL ) ), $ MAX_BW+1, AF( ODD_SIZE*BWU+1+MAX_BW-BWL ), $ MAX_BW, INFO ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'T', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( ( ODD_SIZE )*BWU+1 ), MAX_BW, $ AF( WORK_U+( ODD_SIZE )*BWL+1 ), MAX_BW, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST / 2 ) * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * * * Since STBTRS has no "left-right" option, we must transpose * CALL SLATCPY( 'N', MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ WORK( 1 ), MAX_BW ) * CALL STBTRS( 'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), $ BWL, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ WORK( 1+MAX_BW*( MAX_BW-BWL ) ), MAX_BW, INFO ) * * Transpose back * CALL SLATCPY( 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW ) * * * * Since STBTRS has no "left-right" option, we must transpose * CALL SLATCPY( 'N', MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ WORK( 1 ), MAX_BW ) * CALL STBTRS( 'U', 'T', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), $ BWU, AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, $ MAX_BW-1 ) ), MAX_BW+1, $ WORK( 1+MAX_BW*( MAX_BW-BWU ) ), MAX_BW, INFO ) * * Transpose back * CALL SLATCPY( 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW ) * * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL SGEMM( 'N', 'T', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( ( ODD_SIZE )*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+( ODD_SIZE )*BWL+2*MBW2+1 ), MAX_BW, $ ZERO, WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL SGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ AF( ODD_SIZE*BWU+1 ), MAX_BW, ZERO, WORK( 1 ), $ MAX_BW ) * * Send contribution to offdiagonal block's owning processor. * CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ COMM_PROC ) * CALL SGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to offdiagonal block's owning processor. * CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0, $ COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 120 CONTINUE * * 130 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 140 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PSDBTRF * END scalapack-2.0.2/SRC/psdbtrs.f000644 000766 000024 00000064022 10363532303 016172 0ustar00juliestaff000000 000000 SUBROUTINE PSDBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PSDBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PSDBTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * banded diagonally dominant-like distributed * matrix with bandwidth BWL, BWU. * * Routine PSDBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSDBTRF and this is stored in AF. If a linear system * is to be solved using PSDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (max(bwl,bwu)*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NP_SAVE, PART_OFFSET, $ RETURN_CODE, STORE_M_B, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PSDBTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 11*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 11*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 11*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 11*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -15 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 8*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -3 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -4 END IF * IF( LLDA.LT.( BWL+BWU+1 ) ) THEN INFO = -( 8*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 8*100+4 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 11*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 11*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -5 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -7 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 8*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PSDBTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*MAX( BWL, BWU ) ) ) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, 'PSDBTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * WORK_SIZE_MIN = ( MAX( BWL, BWU )*NRHS ) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, 'PSDBTRS: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB( 5 ) PARAM_CHECK( 16, 1 ) = DESCB( 4 ) PARAM_CHECK( 15, 1 ) = DESCB( 3 ) PARAM_CHECK( 14, 1 ) = DESCB( 2 ) PARAM_CHECK( 13, 1 ) = DESCB( 1 ) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA( 5 ) PARAM_CHECK( 10, 1 ) = DESCA( 4 ) PARAM_CHECK( 9, 1 ) = DESCA( 3 ) PARAM_CHECK( 8, 1 ) = DESCA( 1 ) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSDBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 20 END IF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( TRANS, 'N' ) ) THEN * CALL PSDBTRSV( 'L', 'N', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PSDBTRSV( 'U', 'T', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * END IF * * Call backsolve routine * IF( ( LSAME( TRANS, 'C' ) ) .OR. ( LSAME( TRANS, 'T' ) ) ) THEN * CALL PSDBTRSV( 'L', 'T', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PSDBTRSV( 'U', 'N', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * END IF 10 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 20 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PSDBTRS * END scalapack-2.0.2/SRC/psdbtrsv.f000644 000766 000024 00000144451 11750130340 016361 0ustar00juliestaff000000 000000 SUBROUTINE PSDBTRSV( UPLO, TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PSDBTRSV solves a banded triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a banded * triangular matrix factor produced by the * Gaussian elimination code PD@(dom_pre)BTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T * is dictated by the user by the parameter TRANS. * * Routine PSDBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSDBTRF and this is stored in AF. If a linear system * is to be solved using PSDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (max(bwl,bwu)*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * Last modified by: Peter Arbenz, Institute of Scientific Computing, * ETH, Zurich. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MAX_BW, MBW2, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 18, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ SGEMM, SGERV2D, SGESD2D, SLAMOV, SMATADD, $ STBTRS, STRMM, GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 9*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 12*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 12*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 12*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 12*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX( BWL, BWU ) MBW2 = MAX_BW*MAX_BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -2 END IF * IF( LWORK.LT.-1 ) THEN INFO = -16 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -3 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 9*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -4 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -5 END IF * IF( LLDA.LT.( BWL+BWU+1 ) ) THEN INFO = -( 9*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 9*100+4 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 12*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 12*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -6 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -8 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 9*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PSDBTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*MAX( BWL, BWU ) ) ) THEN INFO = -( 9*100+4 ) CALL PXERBLA( ICTXT, 'PSDBTRSV, D&C alg.: NB too small', $ -INFO ) RETURN END IF * * WORK_SIZE_MIN = MAX( BWL, BWU )*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, 'PSDBTRSV: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 18, 1 ) = DESCB( 5 ) PARAM_CHECK( 17, 1 ) = DESCB( 4 ) PARAM_CHECK( 16, 1 ) = DESCB( 3 ) PARAM_CHECK( 15, 1 ) = DESCB( 2 ) PARAM_CHECK( 14, 1 ) = DESCB( 1 ) PARAM_CHECK( 13, 1 ) = IB PARAM_CHECK( 12, 1 ) = DESCA( 5 ) PARAM_CHECK( 11, 1 ) = DESCA( 4 ) PARAM_CHECK( 10, 1 ) = DESCA( 3 ) PARAM_CHECK( 9, 1 ) = DESCA( 1 ) PARAM_CHECK( 8, 1 ) = JA PARAM_CHECK( 7, 1 ) = NRHS PARAM_CHECK( 6, 1 ) = BWU PARAM_CHECK( 5, 1 ) = BWL PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 18, 2 ) = 1205 PARAM_CHECK( 17, 2 ) = 1204 PARAM_CHECK( 16, 2 ) = 1203 PARAM_CHECK( 15, 2 ) = 1202 PARAM_CHECK( 14, 2 ) = 1201 PARAM_CHECK( 13, 2 ) = 11 PARAM_CHECK( 12, 2 ) = 905 PARAM_CHECK( 11, 2 ) = 904 PARAM_CHECK( 10, 2 ) = 903 PARAM_CHECK( 9, 2 ) = 901 PARAM_CHECK( 8, 2 ) = 8 PARAM_CHECK( 7, 2 ) = 6 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 16 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 18, PARAM_CHECK, 18, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSDBTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 200 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW END IF * * Offset to workspace for Upper triangular factor * WORK_U = BWU*ODD_SIZE + 3*MBW2 * * * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * IF( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL STBTRS( UPLO, 'N', 'U', ODD_SIZE, BWL, NRHS, $ A( OFST+1+BWU ), LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL SLAMOV( 'N', BWL, NRHS, $ B( PART_OFFSET+ODD_SIZE-BWL+1 ), LLDB, $ WORK( 1 ), MAX_BW ) * CALL STRMM( 'L', 'U', 'N', 'N', BWL, NRHS, -ONE, $ A( ( OFST+( BWL+BWU+1 )+( ODD_SIZE-BWL )* $ LLDA ) ), LLDA-1, WORK( 1 ), MAX_BW ) * CALL SMATADD( BWL, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Clear garbage out of workspace block * DO 10 IDUM1 = 1, WORK_SIZE_MIN WORK( IDUM1 ) = 0.0 10 CONTINUE * * IF( MYCOL.NE.0 ) THEN * * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL SGEMM( 'N', 'N', BWU, NRHS, ODD_SIZE, -ONE, AF( 1 ), $ BWU, B( PART_OFFSET+1 ), LLDB, ZERO, $ WORK( 1+MAX_BW-BWU ), MAX_BW ) * END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL SMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 40 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 20 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 30 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * CALL SMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * CALL SMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 20 30 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL STBTRS( 'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), $ NRHS, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 190 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'T', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( ( ODD_SIZE )*BWU+1 ), MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL SGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 40 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 90 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 50 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 60 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 50 60 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL SGEMM( 'T', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( ( ODD_SIZE )*BWU+1 ), MAX_BW, WORK( 1 ), $ MAX_BW, ONE, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL STBTRS( 'L', 'T', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), $ NRHS, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 190 END IF * * * ***Modification Loop ******* * 70 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 80 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 70 80 CONTINUE * [End of GOTO Loop] * 90 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL SGEMM( 'T', 'N', ODD_SIZE, NRHS, BWU, -ONE, AF( 1 ), $ BWU, WORK( 1+MAX_BW-BWU ), MAX_BW, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL SLAMOV( 'N', BWL, NRHS, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, WORK( 1+MAX_BW-BWL ), MAX_BW ) * CALL STRMM( 'L', 'U', 'T', 'N', BWL, NRHS, -ONE, $ A( ( OFST+( BWL+BWU+1 )+( ODD_SIZE-BWL )* $ LLDA ) ), LLDA-1, WORK( 1+MAX_BW-BWL ), $ MAX_BW ) * CALL SMATADD( BWL, NRHS, ONE, WORK( 1+MAX_BW-BWL ), $ MAX_BW, ONE, B( PART_OFFSET+ODD_SIZE-BWL+ $ 1 ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL STBTRS( UPLO, 'T', 'U', ODD_SIZE, BWL, NRHS, $ A( OFST+1+BWU ), LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF( LSAME( TRANS, 'T' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL STBTRS( UPLO, 'T', 'N', ODD_SIZE, BWU, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL SLAMOV( 'N', BWU, NRHS, $ B( PART_OFFSET+ODD_SIZE-BWU+1 ), LLDB, $ WORK( 1 ), MAX_BW ) * CALL STRMM( 'L', 'L', 'T', 'N', BWU, NRHS, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ WORK( 1 ), MAX_BW ) * CALL SMATADD( BWU, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Clear garbage out of workspace block * DO 100 IDUM1 = 1, WORK_SIZE_MIN WORK( IDUM1 ) = 0.0 100 CONTINUE * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL SGEMM( 'N', 'N', BWL, NRHS, ODD_SIZE, -ONE, $ AF( WORK_U+1 ), BWL, B( PART_OFFSET+1 ), $ LLDB, ZERO, WORK( 1+MAX_BW-BWL ), MAX_BW ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL SMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 130 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 110 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 120 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * CALL SMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * CALL SMATADD( MAX_BW, NRHS, ONE, WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 110 120 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL STBTRS( 'U', 'T', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), $ NRHS, AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, $ MAX_BW-1 ) ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 190 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'T', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( WORK_U+( ODD_SIZE )*BWL+1 ), MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL SGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 130 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 180 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 140 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 150 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 140 150 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL SGEMM( 'T', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -ONE, $ AF( WORK_U+( ODD_SIZE )*BWL+1 ), MAX_BW, $ WORK( 1 ), MAX_BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL STBTRS( 'U', 'N', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), $ NRHS, AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, $ MAX_BW-1 ) ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 190 END IF * * * ***Modification Loop ******* * 160 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 170 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 160 170 CONTINUE * [End of GOTO Loop] * 180 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGERV2D( ICTXT, MAX_BW, NRHS, WORK( 1 ), MAX_BW, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL SGEMM( 'T', 'N', ODD_SIZE, NRHS, BWL, -ONE, $ AF( WORK_U+1 ), BWL, WORK( 1+MAX_BW-BWL ), $ MAX_BW, ONE, B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL SLAMOV( 'N', BWU, NRHS, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, WORK( 1+MAX_BW-BWU ), MAX_BW+BWL ) * CALL STRMM( 'L', 'L', 'N', 'N', BWU, NRHS, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ WORK( 1+MAX_BW-BWU ), MAX_BW+BWL ) * CALL SMATADD( BWU, NRHS, ONE, WORK( 1+MAX_BW-BWU ), $ MAX_BW+BWL, ONE, B( PART_OFFSET+ODD_SIZE- $ BWU+1 ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL STBTRS( UPLO, 'N', 'N', ODD_SIZE, BWU, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * END IF * End of "IF( LSAME( UPLO, 'L' ) )"... 190 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 200 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PSDBTRSV * END scalapack-2.0.2/SRC/psdtsv.f000644 000766 000024 00000046112 10363532303 016034 0ustar00juliestaff000000 000000 SUBROUTINE PSDTSV( N, NRHS, DL, D, DU, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL B( * ), D( * ), DL( * ), DU( * ), WORK( * ) * .. * * * Purpose * ======= * * PSDTSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * tridiagonal diagonally dominant-like distributed * matrix. * * Gaussian elimination without pivoting * is used to factor a reordering * of the matrix into L U. * * See PSDTTRF and PSDTTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) REAL pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) REAL pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) REAL pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (12*NPCOL+3*NB) * +max(10*NPCOL+4*NRHS, 8*NPCOL) * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PSDTTRF, PSDTTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PSDTTRF and PSDTTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSEIF( DESCA( DTYPE_ ) .EQ. 502 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PSDTSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (12*NPCOL+3*NB) * * Factor the matrix * CALL PSDTTRF( N, DL, D, DU, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PSDTSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PSDTTRS( 'N', N, NRHS, DL, D, DU, JA, DESCA, B, IB, DESCB, $ WORK, MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSDTSV', -INFO ) RETURN END IF * RETURN * * End of PSDTSV * END scalapack-2.0.2/SRC/psdttrf.f000644 000766 000024 00000105757 10363532303 016212 0ustar00juliestaff000000 000000 SUBROUTINE PSDTTRF( N, DL, D, DU, JA, DESCA, AF, LAF, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. INTEGER INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL AF( * ), D( * ), DL( * ), DU( * ), WORK( * ) * .. * * * Purpose * ======= * * PSDTTRF computes a LU factorization * of an N-by-N real tridiagonal * diagonally dominant-like distributed matrix * A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PSDTTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = L U * * where U is a tridiagonal upper triangular matrix and L is tridiagonal * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * DL (local input/local output) REAL pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) REAL pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) REAL pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSDTTRF and this is stored in AF. If a linear system * is to be solved using PSDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 8*NPCOL * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN, $ LEVEL_DIST, LLDA, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_N_A, $ TEMP, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, IGAMX2D, IGEBR2D, IGEBS2D, PXERBLA, $ RESHAPE, SDTTRF, SDTTRSV, SGERV2D, SGESD2D, $ STRRV2D, STRSD2D * .. * .. External Functions .. INTEGER NUMROC REAL SDOT EXTERNAL NUMROC, SDOT * .. * .. Intrinsic Functions .. INTRINSIC MOD, REAL * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 6*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -10 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 6*100+6 ) END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 6*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PSDTTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, 'PSDTTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = ( 12*NPCOL+3*NB ) * IF( LAF.LT.LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PSDTTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = 8*NPCOL * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, 'PSDTTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 7, 1 ) = DESCA( 5 ) PARAM_CHECK( 6, 1 ) = DESCA( 4 ) PARAM_CHECK( 5, 1 ) = DESCA( 3 ) PARAM_CHECK( 4, 1 ) = DESCA( 1 ) PARAM_CHECK( 3, 1 ) = JA PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 7, 2 ) = 605 PARAM_CHECK( 6, 2 ) = 604 PARAM_CHECK( 5, 2 ) = 603 PARAM_CHECK( 4, 2 ) = 601 PARAM_CHECK( 3, 2 ) = 5 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 10 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 7, PARAM_CHECK, 7, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSDTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 70 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * Offset to workspace for Upper triangular factor * WORK_U = INT_ONE*ODD_SIZE + 3 * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * IF( MYCOL.LT.NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL STRSD2D( ICTXT, 'U', 'N', 1, 1, $ DU( PART_OFFSET+ODD_SIZE+1 ), LLDA-1, 0, $ MYCOL+1 ) * END IF * * * Factor main partition A_i = L_i {U_i} in each processor * CALL SDTTRF( ODD_SIZE, DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 20 END IF * * IF( MYCOL.LT.NP-1 ) THEN * * Apply factorization to lower connection block BL_i * Apply factorization to upper connection block BU_i * * * Perform the triangular solve {U_i}^T{BL'}_i^T = {BL_i}^T * * DL( PART_OFFSET+ODD_SIZE+1 ) = ( DL( PART_OFFSET+ODD_SIZE+1 ) ) $ / ( D( PART_OFFSET+ODD_SIZE ) ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i} * * D( PART_OFFSET+ODD_SIZE+1 ) = D( PART_OFFSET+ODD_SIZE+1 ) - $ DL( PART_OFFSET+ODD_SIZE+1 )* $ DU( PART_OFFSET+ODD_SIZE ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 20 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * * Move entry that causes spike to auxiliary storage * AF( WORK_U+1 ) = ( DL( PART_OFFSET+1 ) ) * IF( INFO.EQ.0 ) THEN * * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ . * CALL SDTTRSV( 'L', 'N', ODD_SIZE, INT_ONE, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), AF( WORK_U+1 ), ODD_SIZE, $ INFO ) * * * Calculate the "spike" fillin, ${U_i}^T {{GL}_i}^T = {DU_i}^T$ * CALL STRRV2D( ICTXT, 'U', 'N', 1, 1, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * CALL SDTTRSV( 'U', 'T', ODD_SIZE, INT_ONE, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = GL_i{GU_i} * AF( ODD_SIZE+3 ) = -ONE*SDOT( ODD_SIZE, AF( 1 ), 1, $ AF( WORK_U+1 ), 1 ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+3 ), $ INT_ONE, 0, MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * AF( ODD_SIZE+1 ) = -ONE*( DL( PART_OFFSET+ODD_SIZE+1 )* $ AF( WORK_U+ODD_SIZE ) ) * * AF( WORK_U+( ODD_SIZE )+1 ) = -ONE* $ DU( PART_OFFSET+ODD_SIZE )*( AF( ODD_SIZE ) ) * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 60 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 50 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, AF( WORK_U+ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * END IF * * Copy last diagonal block into AF storage for subsequent * operations. * AF( ODD_SIZE+2 ) = REAL( D( PART_OFFSET+ODD_SIZE+1 ) ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL+1 ) * * Add contribution to diagonal block * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + AF( ODD_SIZE+3 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 30 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 40 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 30 40 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... IF( AF( ODD_SIZE+2 ).EQ.ZERO ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * AF( WORK_U+ODD_SIZE+3 ) = AF( ODD_SIZE+1 ) * AF( ODD_SIZE+3 ) = AF( WORK_U+ODD_SIZE+1 ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( WORK_U+ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify lower off_diagonal block with diagonal block * * AF( ODD_SIZE+1 ) = AF( ODD_SIZE+1 ) / ( AF( ODD_SIZE+2 ) ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * WORK( 1 ) = -ONE*( AF( ODD_SIZE+1 ) )* $ AF( WORK_U+( ODD_SIZE )+1 ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( WORK_U+ODD_SIZE+2+1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST / 2 ) * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * AF( ODD_SIZE+3 ) = AF( ODD_SIZE+3 ) / ( AF( ODD_SIZE+2 ) ) * * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*( AF( WORK_U+ODD_SIZE+3 ) ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * WORK( 1 ) = -ONE*AF( WORK_U+ODD_SIZE+3 )*AF( ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*AF( WORK_U+ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 50 CONTINUE * * 60 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 70 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PSDTTRF * END scalapack-2.0.2/SRC/psdttrs.f000644 000766 000024 00000066511 10363532303 016221 0ustar00juliestaff000000 000000 SUBROUTINE PSDTTRS( TRANS, N, NRHS, DL, D, DU, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL AF( * ), B( * ), D( * ), DL( * ), DU( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PSDTTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PSDTTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * tridiagonal diagonally dominant-like distributed * matrix. * * Routine PSDTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) REAL pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) REAL pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) REAL pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSDTTRF and this is stored in AF. If a linear system * is to be solved using PSDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 10*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ MY_NUM_COLS, NB, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, PART_OFFSET, PART_SIZE, RETURN_CODE, $ STORE_M_B, STORE_N_A, TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 15, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PSDTTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 11*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 11*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 11*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 11*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -15 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 8*100+6 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 11*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 11*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -3 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -7 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 8*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PSDTTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, 'PSDTTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * WORK_SIZE_MIN = 10*NPCOL + 4*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, 'PSDTTRS: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 15, 1 ) = DESCB( 5 ) PARAM_CHECK( 14, 1 ) = DESCB( 4 ) PARAM_CHECK( 13, 1 ) = DESCB( 3 ) PARAM_CHECK( 12, 1 ) = DESCB( 2 ) PARAM_CHECK( 11, 1 ) = DESCB( 1 ) PARAM_CHECK( 10, 1 ) = IB PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = NRHS PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 15, 2 ) = 1105 PARAM_CHECK( 14, 2 ) = 1104 PARAM_CHECK( 13, 2 ) = 1103 PARAM_CHECK( 12, 2 ) = 1102 PARAM_CHECK( 11, 2 ) = 1101 PARAM_CHECK( 10, 2 ) = 10 PARAM_CHECK( 9, 2 ) = 805 PARAM_CHECK( 8, 2 ) = 804 PARAM_CHECK( 7, 2 ) = 803 PARAM_CHECK( 6, 2 ) = 801 PARAM_CHECK( 5, 2 ) = 7 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 15, PARAM_CHECK, 15, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSDTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 20 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( TRANS, 'N' ) ) THEN * CALL PSDTTRSV( 'L', 'N', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ELSE * CALL PSDTTRSV( 'U', 'T', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * END IF * * Call backsolve routine * IF( ( LSAME( TRANS, 'C' ) ) .OR. ( LSAME( TRANS, 'T' ) ) ) THEN * CALL PSDTTRSV( 'L', 'T', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ELSE * CALL PSDTTRSV( 'U', 'N', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * END IF 10 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 20 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PSDTTRS * END scalapack-2.0.2/SRC/psdttrsv.f000644 000766 000024 00000141706 10363532303 016407 0ustar00juliestaff000000 000000 SUBROUTINE PSDTTRSV( UPLO, TRANS, N, NRHS, DL, D, DU, JA, DESCA, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL AF( * ), B( * ), D( * ), DL( * ), DU( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PSDTTRSV solves a tridiagonal triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a tridiagonal * triangular matrix factor produced by the * Gaussian elimination code PS@(dom_pre)TTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T * is dictated by the user by the parameter TRANS. * * Routine PSDTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) REAL pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) REAL pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) REAL pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSDTTRF and this is stored in AF. If a linear system * is to be solved using PSDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 10*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_M_B, STORE_N_A, $ TEMP, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PXERBLA, RESHAPE, SAXPY, SDTTRSV, $ SGEMM, SGERV2D, SGESD2D, SMATADD, STBTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 9*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 12*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 12*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 12*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 12*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -2 END IF * IF( LWORK.LT.-1 ) THEN INFO = -16 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -3 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 9*100+6 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 12*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 12*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -4 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -8 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 9*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PSDTTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 9*100+4 ) CALL PXERBLA( ICTXT, 'PSDTTRSV, D&C alg.: NB too small', $ -INFO ) RETURN END IF * * WORK_SIZE_MIN = INT_ONE*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, 'PSDTTRSV: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB( 5 ) PARAM_CHECK( 15, 1 ) = DESCB( 4 ) PARAM_CHECK( 14, 1 ) = DESCB( 3 ) PARAM_CHECK( 13, 1 ) = DESCB( 2 ) PARAM_CHECK( 12, 1 ) = DESCB( 1 ) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA( 5 ) PARAM_CHECK( 9, 1 ) = DESCA( 4 ) PARAM_CHECK( 8, 1 ) = DESCA( 3 ) PARAM_CHECK( 7, 1 ) = DESCA( 1 ) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1205 PARAM_CHECK( 15, 2 ) = 1204 PARAM_CHECK( 14, 2 ) = 1203 PARAM_CHECK( 13, 2 ) = 1202 PARAM_CHECK( 12, 2 ) = 1201 PARAM_CHECK( 11, 2 ) = 11 PARAM_CHECK( 10, 2 ) = 905 PARAM_CHECK( 9, 2 ) = 904 PARAM_CHECK( 8, 2 ) = 903 PARAM_CHECK( 7, 2 ) = 901 PARAM_CHECK( 6, 2 ) = 8 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 16 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSDTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 180 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * Offset to workspace for Upper triangular factor * WORK_U = INT_ONE*ODD_SIZE + 3 * * * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * IF( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL SDTTRSV( UPLO, 'N', ODD_SIZE, NRHS, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL SAXPY( NRHS, -DL( PART_OFFSET+ODD_SIZE+1 ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL SGEMM( 'T', 'N', INT_ONE, NRHS, ODD_SIZE, -ONE, $ AF( 1 ), ODD_SIZE, B( PART_OFFSET+1 ), LLDB, $ ZERO, WORK( 1+INT_ONE-INT_ONE ), INT_ONE ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL SMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 30 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 10 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 20 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * CALL SMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * CALL SMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 10 20 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL STBTRS( 'L', 'N', 'U', INT_ONE, $ MIN( INT_ONE, INT_ONE-1 ), NRHS, $ AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ( ODD_SIZE )*INT_ONE+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL SGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ODD_SIZE*INT_ONE+2+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 30 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 80 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 40 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 50 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 40 50 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL SGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ODD_SIZE*INT_ONE+2+1 ), INT_ONE, $ WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ( ODD_SIZE )*INT_ONE+1 ), INT_ONE, $ WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL STBTRS( 'L', 'T', 'U', INT_ONE, $ MIN( INT_ONE, INT_ONE-1 ), NRHS, $ AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * ***Modification Loop ******* * 60 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 70 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 60 70 CONTINUE * [End of GOTO Loop] * 80 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL SGEMM( 'N', 'N', ODD_SIZE, NRHS, INT_ONE, -ONE, $ AF( 1 ), ODD_SIZE, WORK( 1+INT_ONE-INT_ONE ), $ INT_ONE, ONE, B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL SAXPY( NRHS, -( DL( PART_OFFSET+ODD_SIZE+1 ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL SDTTRSV( UPLO, 'T', ODD_SIZE, NRHS, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF( LSAME( TRANS, 'T' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL SDTTRSV( UPLO, 'T', ODD_SIZE, NRHS, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL SAXPY( NRHS, -( DU( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL SGEMM( 'T', 'N', INT_ONE, NRHS, ODD_SIZE, -ONE, $ AF( WORK_U+1 ), ODD_SIZE, B( PART_OFFSET+1 ), $ LLDB, ZERO, WORK( 1+INT_ONE-INT_ONE ), $ INT_ONE ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL SMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 110 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 90 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 100 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * CALL SMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * CALL SMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 90 100 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL STBTRS( 'U', 'T', 'N', INT_ONE, $ MIN( INT_ONE, INT_ONE-1 ), NRHS, $ AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+( ODD_SIZE )*INT_ONE+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL SGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+ODD_SIZE*INT_ONE+2+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 110 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 160 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 120 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 130 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 120 130 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL SGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+ODD_SIZE*INT_ONE+2+1 ), INT_ONE, $ WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+( ODD_SIZE )*INT_ONE+1 ), INT_ONE, $ WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL STBTRS( 'U', 'N', 'N', INT_ONE, $ MIN( INT_ONE, INT_ONE-1 ), NRHS, $ AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * ***Modification Loop ******* * 140 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 150 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 140 150 CONTINUE * [End of GOTO Loop] * 160 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, $ 0, MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL SGEMM( 'N', 'N', ODD_SIZE, NRHS, INT_ONE, -ONE, $ AF( WORK_U+1 ), ODD_SIZE, $ WORK( 1+INT_ONE-INT_ONE ), INT_ONE, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL SAXPY( NRHS, -( DU( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL SDTTRSV( UPLO, 'N', ODD_SIZE, NRHS, $ DU( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * END IF * End of "IF( LSAME( UPLO, 'L' ) )"... 170 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 180 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PSDTTRSV * END scalapack-2.0.2/SRC/psgbsv.f000644 000766 000024 00000045357 10363532303 016027 0ustar00juliestaff000000 000000 SUBROUTINE PSGBSV( N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, B, IB, $ DESCB, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER BWL, BWU, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) REAL A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PSGBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * banded distributed * matrix with bandwidth BWL, BWU. * * Gaussian elimination with pivoting * is used to factor a reordering * of the matrix into P L U. * * See PSGBTRF and PSGBTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * +max(NRHS*(NB+2*bwl+4*bwu), 1) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * nonsingular, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * nonsingular, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PSGBTRF, PSGBTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PSGBTRF and PSGBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PSGBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * * Factor the matrix * CALL PSGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PSGBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PSGBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, B, IB, $ DESCB, WORK, MIN( LWORK, WS_FACTOR ), $ WORK( 1+WS_FACTOR), LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGBSV', -INFO ) RETURN END IF * RETURN * * End of PSGBSV * END scalapack-2.0.2/SRC/psgbtrf.f000644 000766 000024 00000110147 11750130340 016154 0ustar00juliestaff000000 000000 SUBROUTINE PSGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, AF, LAF, $ WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) REAL A( * ), AF( * ), WORK( * ) * .. * * Purpose * ======= * * PSGBTRF computes a LU factorization * of an N-by-N real banded * distributed matrix * with bandwidth BWL, BWU: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PSGBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) Q = L U * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P and Q are permutation matrices. * The matrix Q represents reordering of columns * for parallelism's sake, while P represents * reordering of rows for numerical stability using * classic partial pivoting. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSGBTRF and this is stored in AF. If a linear system * is to be solved using PSGBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 1 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * nonsingular, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * nonsingular, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * ===================================================================== * * Implemented for ScaLAPACK by: * Andrew J. Cleary, Livermore National Lab and University of Tenn., * and Markus Hegland, Australian National University. Feb., 1997. * Based on code written by : Peter Arbenz, ETH Zurich, 1996. * Last modified by: Peter Arbenz, Institute of Scientific Computing, * ETH, Zurich. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER APTR, BBPTR, BIPTR, BM, BM1, BM2, BMN, BN, BW, $ CSRC, DBPTR, FIRST_PROC, I, I1, I2, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, J, JA_NEW, JPTR, $ L, LAF_MIN, LBWL, LBWU, LDB, LDBB, LLDA, LM, $ LMJ, LN, LNJ, LPTR, MYCOL, MYROW, MY_NUM_COLS, $ NB, NEICOL, NP, NPACT, NPCOL, NPROW, NPSTR, $ NP_SAVE, NRHS, ODD_N, ODD_SIZE, ODPTR, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ SGBTRF, SGEMM, SGER, SGERV2D, SGESD2D, SGETRF, $ SLAMOV, SLASWP, SLATCPY, SSWAP, STRRV2D, $ STRSD2D, STRSM, GLOBCHK, IGAMX2D, IGEBR2D, $ IGEBS2D, PXERBLA, RESHAPE * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 6*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -11 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 6*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -2 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -3 END IF * IF( LLDA.LT.( 2*BWL+2*BWU+1 ) ) THEN INFO = -( 6*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 6*100+4 ) END IF * BW = BWU + BWL * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 6*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PSGBTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.( BWL+BWU+1 ) ) ) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, 'PSGBTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = ( NB+BWU )*( BWL+BWU ) + 6*( BWL+BWU )*( BWL+2*BWU ) * IF( LAF.LT.LAF_MIN ) THEN INFO = -9 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PSGBTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = 1 * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -11 * put minimum value of work into work( 1 ) WORK( 1 ) = WORK_SIZE_MIN CALL PXERBLA( ICTXT, 'PSGBTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BWU PARAM_CHECK( 3, 1 ) = BWL PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 11 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSGBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 210 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * DO 30 J = 1, ODD_SIZE DO 20 I = 1, BW A( I+( J-1 )*LLDA ) = ZERO 20 CONTINUE 30 CONTINUE * * Begin main code * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Transfer triangle B_i of local matrix to next processor * for fillin. Overlap the send with the factorization of A_i. * IF( MYCOL.LE.NPCOL-2 ) THEN * * The last processor does not need to send anything. * BIPTR = location of triangle B_i in memory BIPTR = ( NB-BW )*LLDA + 2*BW + 1 * CALL STRSD2D( ICTXT, 'U', 'N', $ MIN( BW, BWU+NUMROC( N, NB, MYCOL+1, 0, $ NPCOL ) ), BW, A( BIPTR ), LLDA-1, 0, MYCOL+1 ) * END IF * * Factor main partition P_i A_i = L_i U_i on each processor * * LBWL, LBWU: lower and upper bandwidth of local solver * Note that for MYCOL > 0 one has lower triangular blocks! * LM is the number of rows which is usually NB except for * MYCOL = 0 where it is BWU less and MYCOL=NPCOL-1 where it * is NR+BWU where NR is the number of columns on the last processor * Finally APTR is the pointer to the first element of A. As LAPACK * has a slightly different matrix format than Scalapack the pointer * has to be adjusted on processor MYCOL=0. * IF( MYCOL.NE.0 ) THEN LBWL = BW LBWU = 0 APTR = 1 ELSE LBWL = BWL LBWU = BWU APTR = 1 + BWU END IF * IF( MYCOL.NE.NPCOL-1 ) THEN LM = NB - LBWU LN = NB - BW ELSE IF( MYCOL.NE.0 ) THEN LM = ODD_SIZE + BWU LN = MAX( ODD_SIZE-BW, 0 ) ELSE LM = N LN = MAX( N-BW, 0 ) END IF * IF( LN.GT.0 ) THEN * CALL SGBTRF( LM, LN, LBWL, LBWU, A( APTR ), LLDA, IPIV, INFO ) * IF( INFO.NE.0 ) THEN INFO = INFO + NB*MYCOL GO TO 80 END IF * NRHS = BW LDB = LLDA - 1 * * Update the last BW columns of A_i (code modified from DGBTRS) * * Only the eliminations of unknowns > LN-BW have an effect on * the last BW columns. Loop over them... * DO 40 J = MAX( LN-BW+1, 1 ), LN * LMJ = MIN( LBWL, LM-J ) LNJ = MIN( BW, J+BW-LN+APTR-1 ) * L = IPIV( J ) * JPTR = J - ( LN+1 ) + 2*BW + 1 - LBWL + LN*LLDA * IF( L.NE.J ) THEN * * Element (L,LN+1) is swapped with element (J,LN+1) etc * Furthermore, the elements in the same row are LDB=LLDA-1 apart * The complicated formulas are to cope with the banded * data format: * LPTR = L - ( LN+1 ) + 2*BW + 1 - LBWL + LN*LLDA * CALL SSWAP( LNJ, A( LPTR ), LDB, A( JPTR ), LDB ) * END IF * * LPTR is the pointer to the beginning of the * coefficients of L * LPTR = BW + 1 + APTR + ( J-1 )*LLDA * CALL SGER( LMJ, LNJ, -ONE, A( LPTR ), 1, A( JPTR ), LDB, $ A( JPTR+1 ), LDB ) 40 CONTINUE * END IF * * Compute spike fill-in, L_i F_i = P_i B_{i-1} * * Receive triangle B_{i-1} from previous processor * IF( MYCOL.GT.0 ) THEN CALL STRRV2D( ICTXT, 'U', 'N', MIN( BW, LM ), BW, AF( 1 ), BW, $ 0, MYCOL-1 ) * * Transpose transmitted upper triangular (trapezoidal) matrix * DO 60 I2 = 1, MIN( BW, LM ) DO 50 I1 = I2 + 1, BW AF( I1+( I2-1 )*BW ) = AF( I2+( I1-1 )*BW ) AF( I2+( I1-1 )*BW ) = ZERO 50 CONTINUE 60 CONTINUE * * Permutation and forward elimination (triang. solve) * DO 70 J = 1, LN * LMJ = MIN( LBWL, LM-J ) L = IPIV( J ) * IF( L.NE.J ) THEN CALL SSWAP( BW, AF( ( L-1 )*BW+1 ), 1, $ AF( ( J-1 )*BW+1 ), 1 ) END IF * LPTR = BW + 1 + APTR + ( J-1 )*LLDA * CALL SGER( NRHS, LMJ, -ONE, AF( ( J-1 )*BW+1 ), 1, $ A( LPTR ), 1, AF( J*BW+1 ), BW ) * 70 CONTINUE * END IF * 80 CONTINUE * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Define the initial dimensions of the diagonal blocks * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW * IF( MYCOL.NE.NPCOL-1 ) THEN BM = BW - LBWU BN = BW ELSE BM = MIN( BW, ODD_SIZE ) + BWU BN = MIN( BW, ODD_SIZE ) END IF * * Pointer to first element of block bidiagonal matrix in AF * Leading dimension of block bidiagonal system * BBPTR = ( NB+BWU )*BW + 1 LDBB = 2*BW + BWU * * Copy from A and AF into block bidiagonal matrix (tail of AF) * * DBPTR = Pointer to diagonal blocks in A DBPTR = BW + 1 + LBWU + LN*LLDA * CALL SLAMOV( 'G', BM, BN, A( DBPTR ), LLDA-1, AF( BBPTR+BW*LDBB ), $ LDBB ) * * Zero out any junk entries that were copied * DO 100 J = 1, BM DO 90 I = J + LBWL, BM - 1 AF( BBPTR+BW*LDBB+( J-1 )*LDBB+I ) = ZERO 90 CONTINUE 100 CONTINUE * IF( MYCOL.NE.0 ) THEN * * ODPTR = Pointer to offdiagonal blocks in A * ODPTR = ( LM-BM )*BW + 1 CALL SLATCPY( 'G', BW, BM, AF( ODPTR ), BW, $ AF( BBPTR+2*BW*LDBB ), LDBB ) END IF * IF( NPCOL.EQ.1 ) THEN * * In this case the loop over the levels will not be * performed. CALL SGETRF( N-LN, N-LN, AF( BBPTR+BW*LDBB ), LDBB, $ IPIV( LN+1 ), INFO ) * END IF * * Loop over levels ... only occurs if npcol > 1 * * The two integers NPACT (nu. of active processors) and NPSTR * (stride between active processors) are used to control the * loop. * NPACT = NPCOL NPSTR = 1 * * Begin loop over levels * 110 CONTINUE IF( NPACT.LE.1 ) $ GO TO 190 * * Test if processor is active * IF( MOD( MYCOL, NPSTR ).EQ.0 ) THEN * * Send/Receive blocks * * IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN * * This node will potentially do more work later * NEICOL = MYCOL + NPSTR * IF( NEICOL / NPSTR.LT.NPACT-1 ) THEN BMN = BW ELSE IF( NEICOL / NPSTR.EQ.NPACT-1 ) THEN ODD_N = NUMROC( N, NB, NPCOL-1, 0, NPCOL ) BMN = MIN( BW, ODD_N ) + BWU ELSE * * Last processor skips to next level GO TO 180 END IF * * BM1 = M for 1st block on proc pair, BM2 2nd block * BM1 = BM BM2 = BMN * IF( NEICOL / NPSTR.LE.NPACT-1 ) THEN * CALL SGESD2D( ICTXT, BM, 2*BW, AF( BBPTR+BW*LDBB ), LDBB, $ 0, NEICOL ) * CALL SGERV2D( ICTXT, BMN, 2*BW, AF( BBPTR+BM ), LDBB, 0, $ NEICOL ) * IF( NPACT.EQ.2 ) THEN * * Copy diagonal block to align whole system * CALL SLAMOV( 'G', BMN, BW, AF( BBPTR+BM ), LDBB, $ AF( BBPTR+2*BW*LDBB+BM ), LDBB ) END IF * END IF * ELSE * * This node stops work after this stage -- an extra copy * is required to make the odd and even frontal matrices * look identical * NEICOL = MYCOL - NPSTR * IF( NEICOL.EQ.0 ) THEN BMN = BW - BWU ELSE BMN = BW END IF * BM1 = BMN BM2 = BM * CALL SGESD2D( ICTXT, BM, 2*BW, AF( BBPTR+BW*LDBB ), LDBB, 0, $ NEICOL ) * CALL SLAMOV( 'G', BM, 2*BW, AF( BBPTR+BW*LDBB ), LDBB, $ AF( BBPTR+BMN ), LDBB ) * DO 130 J = BBPTR + 2*BW*LDBB, BBPTR + 3*BW*LDBB - 1, LDBB DO 120 I = 0, LDBB - 1 AF( I+J ) = ZERO 120 CONTINUE 130 CONTINUE * CALL SGERV2D( ICTXT, BMN, 2*BW, AF( BBPTR+BW*LDBB ), LDBB, $ 0, NEICOL ) * IF( NPACT.EQ.2 ) THEN * * Copy diagonal block to align whole system * CALL SLAMOV( 'G', BM, BW, AF( BBPTR+BMN ), LDBB, $ AF( BBPTR+2*BW*LDBB+BMN ), LDBB ) END IF * END IF * * LU factorization with partial pivoting * IF( NPACT.NE.2 ) THEN * CALL SGETRF( BM+BMN, BW, AF( BBPTR+BW*LDBB ), LDBB, $ IPIV( LN+1 ), INFO ) * * Backsolve left side * DO 150 J = BBPTR, BBPTR + BW*LDBB - 1, LDBB DO 140 I = 0, BM1 - 1 AF( I+J ) = ZERO 140 CONTINUE 150 CONTINUE * CALL SLASWP( BW, AF( BBPTR ), LDBB, 1, BW, IPIV( LN+1 ), 1 ) * CALL STRSM( 'L', 'L', 'N', 'U', BW, BW, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, AF( BBPTR ), LDBB ) * * Use partial factors to update remainder * CALL SGEMM( 'N', 'N', BM+BMN-BW, BW, BW, -ONE, $ AF( BBPTR+BW*LDBB+BW ), LDBB, AF( BBPTR ), LDBB, $ ONE, AF( BBPTR+BW ), LDBB ) * * Backsolve right side * NRHS = BW * CALL SLASWP( NRHS, AF( BBPTR+2*BW*LDBB ), LDBB, 1, BW, $ IPIV( LN+1 ), 1 ) * CALL STRSM( 'L', 'L', 'N', 'U', BW, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, $ AF( BBPTR+2*BW*LDBB ), LDBB ) * * Use partial factors to update remainder * CALL SGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, -ONE, $ AF( BBPTR+BW*LDBB+BW ), LDBB, $ AF( BBPTR+2*BW*LDBB ), LDBB, ONE, $ AF( BBPTR+2*BW*LDBB+BW ), LDBB ) * * * Test if processor is active in next round * IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN * * Reset BM * BM = BM1 + BM2 - BW * * Local copying in the block bidiagonal area * * CALL SLAMOV( 'G', BM, BW, AF( BBPTR+BW ), LDBB, $ AF( BBPTR+BW*LDBB ), LDBB ) CALL SLAMOV( 'G', BM, BW, AF( BBPTR+2*BW*LDBB+BW ), LDBB, $ AF( BBPTR+2*BW*LDBB ), LDBB ) * * Zero out space that held original copy * DO 170 J = 0, BW - 1 DO 160 I = 0, BM - 1 AF( BBPTR+2*BW*LDBB+BW+J*LDBB+I ) = ZERO 160 CONTINUE 170 CONTINUE * END IF * ELSE * * Factor the final 2 by 2 block matrix * CALL SGETRF( BM+BMN, BM+BMN, AF( BBPTR+BW*LDBB ), LDBB, $ IPIV( LN+1 ), INFO ) END IF * END IF * * Last processor in an odd-sized NPACT skips to here * 180 CONTINUE * NPACT = ( NPACT+1 ) / 2 NPSTR = NPSTR*2 GO TO 110 * 190 CONTINUE * End loop over levels * 200 CONTINUE * If error was found in Phase 1, processors jump here. * * Free BLACS space used to hold standard-form grid. * ICTXT = ICTXT_SAVE IF( ICTXT.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 210 CONTINUE * If this processor did not hold part of the grid it * jumps here. * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PSGBTRF * END scalapack-2.0.2/SRC/psgbtrs.f000644 000766 000024 00000113070 11750130340 016167 0ustar00juliestaff000000 000000 SUBROUTINE PSGBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) REAL A( * ), AF( * ), B( * ), WORK( * ) * .. * * Purpose * ======= * * PSGBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PSGBTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * banded distributed * matrix with bandwidth BWL, BWU. * * Routine PSGBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSGBTRF and this is stored in AF. If a linear system * is to be solved using PSGBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * NRHS*(NB+2*bwl+4*bwu) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * ===================================================================== * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * ===================================================================== * * Implemented for ScaLAPACK by: * Andrew J. Cleary, Livermore National Lab and University of Tenn., * and Markus Hegland, Australian National University. Feb., 1997. * Based on code written by : Peter Arbenz, ETH Zurich, 1996. * Last modified by: Peter Arbenz, Institute of Scientific Computing, * ETH, Zurich. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC, $ FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, J, JA_NEW, L, LBWL, LBWU, LDBB, $ LDW, LLDA, LLDB, LM, LMJ, LN, LPTR, MYCOL, $ MYROW, NB, NEICOL, NP, NPACT, NPCOL, NPROW, $ NPSTR, NP_SAVE, ODD_SIZE, PART_OFFSET, $ RECOVERY_VAL, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN, WPTR * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, SCOPY, $ DESC_CONVERT, SGEMM, SGEMV, SGER, SGERV2D, $ SGESD2D, SGETRS, SLAMOV, SLASWP, SSCAL, SSWAP, $ STRSM, GLOBCHK, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 11*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 11*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 11*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 11*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -16 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 8*100+6 ) END IF * IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN INFO = -3 END IF * IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN INFO = -4 END IF * IF( LLDA.LT.( 2*BWL+2*BWU+1 ) ) THEN INFO = -( 8*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 8*100+4 ) END IF * BW = BWU + BWL * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 11*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 11*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -5 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -7 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 8*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PSGBTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.( BWL+BWU+1 ) ) ) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, 'PSGBTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check worksize * WORK_SIZE_MIN = NRHS*( NB+2*BWL+4*BWU ) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, 'PSGBTRS: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB( 5 ) PARAM_CHECK( 16, 1 ) = DESCB( 4 ) PARAM_CHECK( 15, 1 ) = DESCB( 3 ) PARAM_CHECK( 14, 1 ) = DESCB( 2 ) PARAM_CHECK( 13, 1 ) = DESCB( 1 ) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA( 5 ) PARAM_CHECK( 10, 1 ) = DESCA( 4 ) PARAM_CHECK( 9, 1 ) = DESCA( 3 ) PARAM_CHECK( 8, 1 ) = DESCA( 1 ) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 16 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSGBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 100 END IF * * * * Begin main code * * Move data into workspace - communicate/copy (overlap) * IF( MYCOL.LT.NPCOL-1 ) THEN CALL SGESD2D( ICTXT, BWU, NRHS, B( NB-BWU+1 ), LLDB, 0, $ MYCOL+1 ) END IF * IF( MYCOL.LT.NPCOL-1 ) THEN LM = NB - BWU ELSE LM = NB END IF * IF( MYCOL.GT.0 ) THEN WPTR = BWU + 1 ELSE WPTR = 1 END IF * LDW = NB + BWU + 2*BW + BWU * CALL SLAMOV( 'G', LM, NRHS, B( 1 ), LLDB, WORK( WPTR ), LDW ) * * Zero out rest of work * DO 20 J = 1, NRHS DO 10 L = WPTR + LM, LDW WORK( ( J-1 )*LDW+L ) = ZERO 10 CONTINUE 20 CONTINUE * IF( MYCOL.GT.0 ) THEN CALL SGERV2D( ICTXT, BWU, NRHS, WORK( 1 ), LDW, 0, MYCOL-1 ) END IF * ******************************************************************** * PHASE 1: Local computation phase -- Solve L*X = B ******************************************************************** * * Size of main (or odd) partition in each processor * ODD_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF( MYCOL.NE.0 ) THEN LBWL = BW LBWU = 0 APTR = 1 ELSE LBWL = BWL LBWU = BWU APTR = 1 + BWU END IF * IF( MYCOL.NE.NPCOL-1 ) THEN LM = NB - LBWU LN = NB - BW ELSE IF( MYCOL.NE.0 ) THEN LM = ODD_SIZE + BWU LN = MAX( ODD_SIZE-BW, 0 ) ELSE LM = N LN = MAX( N-BW, 0 ) END IF * DO 30 J = 1, LN * LMJ = MIN( LBWL, LM-J ) L = IPIV( J ) * IF( L.NE.J ) THEN CALL SSWAP( NRHS, WORK( L ), LDW, WORK( J ), LDW ) END IF * LPTR = BW + 1 + ( J-1 )*LLDA + APTR * CALL SGER( LMJ, NRHS, -ONE, A( LPTR ), 1, WORK( J ), LDW, $ WORK( J+1 ), LDW ) * 30 CONTINUE * ******************************************************************** * PHASE 2: Global computation phase -- Solve L*X = B ******************************************************************** * * Define the initial dimensions of the diagonal blocks * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW * IF( MYCOL.NE.NPCOL-1 ) THEN BM = BW - LBWU BN = BW ELSE BM = MIN( BW, ODD_SIZE ) + BWU BN = MIN( BW, ODD_SIZE ) END IF * * Pointer to first element of block bidiagonal matrix in AF * Leading dimension of block bidiagonal system * BBPTR = ( NB+BWU )*BW + 1 LDBB = 2*BW + BWU * IF( NPCOL.EQ.1 ) THEN * * In this case the loop over the levels will not be * performed. CALL SGETRS( 'N', N-LN, NRHS, AF( BBPTR+BW*LDBB ), LDBB, $ IPIV( LN+1 ), WORK( LN+1 ), LDW, INFO ) * END IF * * Loop over levels ... * * The two integers NPACT (nu. of active processors) and NPSTR * (stride between active processors) is used to control the * loop. * NPACT = NPCOL NPSTR = 1 * * Begin loop over levels 40 CONTINUE IF( NPACT.LE.1 ) $ GO TO 50 * * Test if processor is active IF( MOD( MYCOL, NPSTR ).EQ.0 ) THEN * * Send/Receive blocks * IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN * NEICOL = MYCOL + NPSTR * IF( NEICOL / NPSTR.LE.NPACT-1 ) THEN * IF( NEICOL / NPSTR.LT.NPACT-1 ) THEN BMN = BW ELSE BMN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) + $ BWU END IF * CALL SGESD2D( ICTXT, BM, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * IF( NPACT.NE.2 ) THEN * * Receive answers back from partner processor * CALL SGERV2D( ICTXT, BM+BMN-BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * BM = BM + BMN - BW * END IF * END IF * ELSE * NEICOL = MYCOL - NPSTR * IF( NEICOL.EQ.0 ) THEN BMN = BW - BWU ELSE BMN = BW END IF * CALL SLAMOV( 'G', BM, NRHS, WORK( LN+1 ), LDW, $ WORK( NB+BWU+BMN+1 ), LDW ) * CALL SGERV2D( ICTXT, BMN, NRHS, WORK( NB+BWU+1 ), LDW, 0, $ NEICOL ) * * and do the permutations and eliminations * IF( NPACT.NE.2 ) THEN * * Solve locally for BW variables * CALL SLASWP( NRHS, WORK( NB+BWU+1 ), LDW, 1, BW, $ IPIV( LN+1 ), 1 ) * CALL STRSM( 'L', 'L', 'N', 'U', BW, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+1 ), $ LDW ) * * Use soln just calculated to update RHS * CALL SGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, -ONE, $ AF( BBPTR+BW*LDBB+BW ), LDBB, $ WORK( NB+BWU+1 ), LDW, ONE, $ WORK( NB+BWU+1+BW ), LDW ) * * Give answers back to partner processor * CALL SGESD2D( ICTXT, BM+BMN-BW, NRHS, $ WORK( NB+BWU+1+BW ), LDW, 0, NEICOL ) * ELSE * * Finish up calculations for final level * CALL SLASWP( NRHS, WORK( NB+BWU+1 ), LDW, 1, BM+BMN, $ IPIV( LN+1 ), 1 ) * CALL STRSM( 'L', 'L', 'N', 'U', BM+BMN, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+1 ), $ LDW ) END IF * END IF * NPACT = ( NPACT+1 ) / 2 NPSTR = NPSTR*2 GO TO 40 * END IF * 50 CONTINUE * * ************************************** * BACKSOLVE ******************************************************************** * PHASE 2: Global computation phase -- Solve U*Y = X ******************************************************************** * IF( NPCOL.EQ.1 ) THEN * * In this case the loop over the levels will not be * performed. * In fact, the backsolve portion was done in the call to * SGETRS in the frontsolve. * END IF * * Compute variable needed to reverse loop structure in * reduced system. * RECOVERY_VAL = NPACT*NPSTR - NPCOL * * Loop over levels * Terminal values of NPACT and NPSTR from frontsolve are used * 60 CONTINUE IF( NPACT.GE.NPCOL ) $ GO TO 80 * NPSTR = NPSTR / 2 * NPACT = NPACT*2 * * Have to adjust npact for non-power-of-2 * NPACT = NPACT - MOD( ( RECOVERY_VAL / NPSTR ), 2 ) * * Find size of submatrix in this proc at this level * IF( MYCOL / NPSTR.LT.NPACT-1 ) THEN BN = BW ELSE BN = MIN( BW, NUMROC( N, NB, NPCOL-1, 0, NPCOL ) ) END IF * * If this processor is even in this level... * IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN * NEICOL = MYCOL + NPSTR * IF( NEICOL / NPSTR.LE.NPACT-1 ) THEN * IF( NEICOL / NPSTR.LT.NPACT-1 ) THEN BMN = BW BNN = BW ELSE BMN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) + BWU BNN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) END IF * IF( NPACT.GT.2 ) THEN * CALL SGESD2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * CALL SGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * ELSE * CALL SGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * END IF * END IF * ELSE * This processor is odd on this level * NEICOL = MYCOL - NPSTR * IF( NEICOL.EQ.0 ) THEN BMN = BW - BWU ELSE BMN = BW END IF * IF( NEICOL.LT.NPCOL-1 ) THEN BNN = BW ELSE BNN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) END IF * IF( NPACT.GT.2 ) THEN * * Move RHS to make room for received solutions * CALL SLAMOV( 'G', BW, NRHS, WORK( NB+BWU+1 ), LDW, $ WORK( NB+BWU+BW+1 ), LDW ) * CALL SGERV2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), LDW, 0, $ NEICOL ) * CALL SGEMM( 'N', 'N', BW, NRHS, BN, -ONE, AF( BBPTR ), LDBB, $ WORK( LN+1 ), LDW, ONE, WORK( NB+BWU+BW+1 ), $ LDW ) * * IF( MYCOL.GT.NPSTR ) THEN * CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( BBPTR+2*BW*LDBB ), LDBB, WORK( LN+BW+1 ), $ LDW, ONE, WORK( NB+BWU+BW+1 ), LDW ) * END IF * CALL STRSM( 'L', 'U', 'N', 'N', BW, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+BW+1 ), $ LDW ) * * Send new solution to neighbor * CALL SGESD2D( ICTXT, BW, NRHS, WORK( NB+BWU+BW+1 ), LDW, 0, $ NEICOL ) * * Copy new solution into expected place * CALL SLAMOV( 'G', BW, NRHS, WORK( NB+BWU+1+BW ), LDW, $ WORK( LN+BW+1 ), LDW ) * ELSE * * Solve with local diagonal block * CALL STRSM( 'L', 'U', 'N', 'N', BN+BNN, NRHS, ONE, $ AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+1 ), $ LDW ) * * Send new solution to neighbor * CALL SGESD2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ), LDW, 0, $ NEICOL ) * * Shift solutions into expected positions * CALL SLAMOV( 'G', BNN+BN-BW, NRHS, WORK( NB+BWU+1+BW ), LDW, $ WORK( LN+1 ), LDW ) * * IF( ( NB+BWU+1 ).NE.( LN+1+BW ) ) THEN * * Copy one row at a time since spaces may overlap * DO 70 J = 1, BW CALL SCOPY( NRHS, WORK( NB+BWU+J ), LDW, $ WORK( LN+BW+J ), LDW ) 70 CONTINUE * END IF * END IF * END IF * GO TO 60 * 80 CONTINUE * End of loop over levels * ******************************************************************** * PHASE 1: (Almost) Local computation phase -- Solve U*Y = X ******************************************************************** * * Reset BM to value it had before reduced system frontsolve... * IF( MYCOL.NE.NPCOL-1 ) THEN BM = BW - LBWU ELSE BM = MIN( BW, ODD_SIZE ) + BWU END IF * * First metastep is to account for the fillin blocks AF * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, BW, NRHS, WORK( NB-BW+1 ), LDW, 0, $ MYCOL+1 ) * END IF * IF( MYCOL.GT.0 ) THEN * CALL SGERV2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ), LDW, 0, $ MYCOL-1 ) * * Modify local right hand sides with received rhs's * CALL SGEMM( 'T', 'N', LM-BM, NRHS, BW, -ONE, AF( 1 ), BW, $ WORK( NB+BWU+1 ), LDW, ONE, WORK( 1 ), LDW ) * END IF * DO 90 J = LN, 1, -1 * LMJ = MIN( BW, ODD_SIZE-1 ) * LPTR = BW - 1 + J*LLDA + APTR * * In the following, the TRANS=T option is used to reverse * the order of multiplication, not as a true transpose * CALL SGEMV( 'T', LMJ, NRHS, -ONE, WORK( J+1 ), LDW, A( LPTR ), $ LLDA-1, ONE, WORK( J ), LDW ) * * Divide by diagonal element * CALL SSCAL( NRHS, ONE / A( LPTR-LLDA+1 ), WORK( J ), LDW ) 90 CONTINUE * * * CALL SLAMOV( 'G', ODD_SIZE, NRHS, WORK( 1 ), LDW, B( 1 ), LLDB ) * * Free BLACS space used to hold standard-form grid. * ICTXT = ICTXT_SAVE IF( ICTXT.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 100 CONTINUE * * Restore saved input parameters * NP = NP_SAVE * * Output worksize * WORK( 1 ) = WORK_SIZE_MIN * RETURN * * End of PSGBTRS * END scalapack-2.0.2/SRC/psgebal.f000644 000766 000024 00000033672 11705175572 016151 0ustar00juliestaff000000 000000 SUBROUTINE PSGEBAL( JOB, N, A, DESCA, ILO, IHI, SCALE, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK computational routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER JOB INTEGER IHI, ILO, INFO, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), SCALE( * ) * .. * * Purpose * ======= * * PSGEBAL balances a general real matrix A. This involves, first, * permuting A by a similarity transformation to isolate eigenvalues * in the first 1 to ILO-1 and last IHI+1 to N elements on the * diagonal; and second, applying a diagonal similarity transformation * to rows and columns ILO to IHI to make the rows and columns as * close in norm as possible. Both steps are optional. * * Balancing may reduce the 1-norm of the matrix, and improve the * accuracy of the computed eigenvalues and/or eigenvectors. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * JOB (global input) CHARACTER*1 * Specifies the operations to be performed on A: * = 'N': none: simply set ILO = 1, IHI = N, SCALE(I) = 1.0 * for i = 1,...,N; * = 'P': permute only; * = 'S': scale only; * = 'B': both permute and scale. * * N (global input) INTEGER * The order of the matrix A. N >= 0. * * A (local input/output) REAL array, dimension * (DESCA(LLD_,LOCc(N)) * On entry, the input matrix A. * On exit, A is overwritten by the balanced matrix. * If JOB = 'N', A is not referenced. * See Further Details. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ILO (global output) INTEGER * IHI (global output) INTEGER * ILO and IHI are set to integers such that on exit * A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N. * If JOB = 'N' or 'S', ILO = 1 and IHI = N. * * SCALE (global output) REAL array, dimension (N) * Details of the permutations and scaling factors applied to * A. If P(j) is the index of the row and column interchanged * with row and column j and D(j) is the scaling factor * applied to row and column j, then * SCALE(j) = P(j) for j = 1,...,ILO-1 * = D(j) for j = ILO,...,IHI * = P(j) for j = IHI+1,...,N. * The order in which the interchanges are made is N to IHI+1, * then 1 to ILO-1. * * INFO (global output) INTEGER * = 0: successful exit. * < 0: if INFO = -i, the i-th argument had an illegal value. * * Further Details * =============== * * The permutations consist of row and column interchanges which put * the matrix in the form * * ( T1 X Y ) * P A P = ( 0 B Z ) * ( 0 0 T2 ) * * where T1 and T2 are upper triangular matrices whose eigenvalues lie * along the diagonal. The column indices ILO and IHI mark the starting * and ending columns of the submatrix B. Balancing consists of applying * a diagonal similarity transformation inv(D) * B * D to make the * 1-norms of each row of B and its corresponding column nearly equal. * The output matrix is * * ( T1 X*D Y ) * ( 0 inv(D)*B*D inv(D)*Z ). * ( 0 0 T2 ) * * Information about the permutations P and the diagonal matrix D is * returned in the vector SCALE. * * This subroutine is based on the EISPACK routine BALANC. In principle, * the parallelism is extracted by using PBLAS and BLACS routines for * the permutation and balancing. * * Modified by Tzu-Yi Chen, Computer Science Division, University of * California at Berkeley, USA * * Parallel version by Robert Granat and Meiyue Shao, Department of * Computing Science and HPC2N, Umea University, Sweden * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) REAL SCLFAC PARAMETER ( SCLFAC = 2.0E+0 ) REAL FACTOR PARAMETER ( FACTOR = 0.95E+0 ) * .. * .. Local Scalars .. LOGICAL NOCONV INTEGER I, ICA, IEXC, IRA, J, K, L, M, LLDA, $ ICTXT, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ ARSRC, ACSRC REAL C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2, ELEM * .. * .. Local Arrays .. REAL CR( 2 ) * .. * .. External Functions .. LOGICAL SISNAN, LSAME INTEGER IDAMAX REAL SLAMCH EXTERNAL SISNAN, LSAME, SLAMCH * .. * .. External Subroutines .. EXTERNAL PSSCAL, PSSWAP, PSAMAX, PXERBLA, $ BLACS_GRIDINFO, CHK1MAT, SGSUM2D, $ INFOG2L, PSELGET * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. INFO = 0 ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * IF( .NOT.LSAME( JOB, 'N' ) .AND. .NOT.LSAME( JOB, 'P' ) .AND. $ .NOT.LSAME( JOB, 'S' ) .AND. .NOT.LSAME( JOB, 'B' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE CALL CHK1MAT( N, 2, N, 2, 1, 1, DESCA, 4, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( 'PSGEBAL', -INFO ) RETURN END IF * * Extract local leading dimension of A. * LLDA = DESCA( LLD_ ) * K = 1 L = N * IF( N.EQ.0 ) $ GO TO 210 * IF( LSAME( JOB, 'N' ) ) THEN DO 10 I = 1, N SCALE( I ) = ONE 10 CONTINUE GO TO 210 END IF * IF( LSAME( JOB, 'S' ) ) $ GO TO 120 * * Permutation to isolate eigenvalues if possible. * GO TO 50 * * Row and column exchange. * 20 CONTINUE SCALE( M ) = J IF( J.EQ.M ) $ GO TO 30 * CALL PSSWAP( L, A, 1, J, DESCA, 1, A, 1, M, DESCA, 1 ) CALL PSSWAP( N-K+1, A, J, K, DESCA, DESCA(M_), A, M, K, DESCA, $ DESCA(M_) ) * 30 CONTINUE GO TO ( 40, 80 )IEXC * * Search for rows isolating an eigenvalue and push them down. * 40 CONTINUE IF( L.EQ.1 ) $ GO TO 210 L = L - 1 * 50 CONTINUE DO 70 J = L, 1, -1 * DO 60 I = 1, L IF( I.EQ.J ) $ GO TO 60 * * All processors need the information to make correct decisions. * CALL PSELGET( 'All', '1-Tree', ELEM, A, J, I, DESCA ) IF( ELEM.NE.ZERO ) $ GO TO 70 60 CONTINUE * M = L IEXC = 1 GO TO 20 70 CONTINUE * GO TO 90 * * Search for columns isolating an eigenvalue and push them left. * 80 CONTINUE K = K + 1 * 90 CONTINUE DO 110 J = K, L * DO 100 I = K, L IF( I.EQ.J ) $ GO TO 100 * * All processors need the information to make correct decisions. * CALL PSELGET( 'All', '1-Tree', ELEM, A, I, J, DESCA ) IF( ELEM.NE.ZERO ) $ GO TO 110 100 CONTINUE * M = K IEXC = 2 GO TO 20 110 CONTINUE * 120 CONTINUE DO 130 I = K, L SCALE( I ) = ONE 130 CONTINUE * IF( LSAME( JOB, 'P' ) ) $ GO TO 210 * * Balance the submatrix in rows K to L. * * Iterative loop for norm reduction. * SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 140 CONTINUE NOCONV = .FALSE. * DO 200 I = K, L C = ZERO R = ZERO * * Compute local partial values of R and C in parallel and combine * with a call to the BLACS global summation routine distributing * information to all processors. * DO 150 J = K, L IF( J.EQ.I ) $ GO TO 150 CALL INFOG2L( J, I, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, ARSRC, ACSRC ) IF( MYROW.EQ.ARSRC .AND. MYCOL.EQ.ACSRC ) THEN C = C + ABS( A( II + (JJ-1)*LLDA ) ) END IF CALL INFOG2L( I, J, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, ARSRC, ACSRC ) IF( MYROW.EQ.ARSRC .AND. MYCOL.EQ.ACSRC ) THEN R = R + ABS( A( II + (JJ-1)*LLDA ) ) END IF 150 CONTINUE CR( 1 ) = C CR( 2 ) = R CALL SGSUM2D( ICTXT, 'All', '1-Tree', 2, 1, CR, 2, -1, -1 ) C = CR( 1 ) R = CR( 2 ) * * Find global maximum absolute values and indices in parallel. * CALL PSAMAX( L, CA, ICA, A, 1, I, DESCA, 1 ) CALL PSAMAX( N-K+1, RA, IRA, A, I, K, DESCA, DESCA(M_) ) * * Guard against zero C or R due to underflow. * IF( C.EQ.ZERO .OR. R.EQ.ZERO ) $ GO TO 200 G = R / SCLFAC F = ONE S = C + R 160 CONTINUE IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 IF( SISNAN( C+F+CA+R+G+RA ) ) THEN * * Exit if NaN to avoid infinite loop * INFO = -3 CALL PXERBLA( 'PSGEBAL', -INFO ) RETURN END IF F = F*SCLFAC C = C*SCLFAC CA = CA*SCLFAC R = R / SCLFAC G = G / SCLFAC RA = RA / SCLFAC GO TO 160 * 170 CONTINUE G = C / SCLFAC 180 CONTINUE IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 F = F / SCLFAC C = C / SCLFAC G = G / SCLFAC CA = CA / SCLFAC R = R*SCLFAC RA = RA*SCLFAC GO TO 180 * * Now balance. * 190 CONTINUE IF( ( C+R ).GE.FACTOR*S ) $ GO TO 200 IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN IF( F*SCALE( I ).LE.SFMIN1 ) $ GO TO 200 END IF IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN IF( SCALE( I ).GE.SFMAX1 / F ) $ GO TO 200 END IF G = ONE / F SCALE( I ) = SCALE( I )*F NOCONV = .TRUE. * CALL PSSCAL( N-K+1, G, A, I, K, DESCA, DESCA(M_) ) CALL PSSCAL( L, F, A, 1, I, DESCA, 1 ) * 200 CONTINUE * IF( NOCONV ) $ GO TO 140 * 210 CONTINUE ILO = K IHI = L * RETURN * * End of PSGEBAL * END scalapack-2.0.2/SRC/psgebd2.f000644 000766 000024 00000042466 10363532303 016047 0ustar00juliestaff000000 000000 SUBROUTINE PSGEBD2( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), D( * ), E( * ), TAUP( * ), TAUQ( * ), $ WORK( * ) * .. * * Purpose * ======= * * PSGEBD2 reduces a real general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper or lower bidiagonal * form B by an orthogonal transformation: Q' * sub( A ) * P = B. * * If M >= N, B is upper bidiagonal; if M < N, B is lower bidiagonal. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ). On exit, if M >= N, * the diagonal and the first superdiagonal of sub( A ) are * overwritten with the upper bidiagonal matrix B; the elements * below the diagonal, with the array TAUQ, represent the * orthogonal matrix Q as a product of elementary reflectors, * and the elements above the first superdiagonal, with the * array TAUP, represent the orthogonal matrix P as a product * of elementary reflectors. If M < N, the diagonal and the * first subdiagonal are overwritten with the lower bidiagonal * matrix B; the elements below the first subdiagonal, with the * array TAUQ, represent the orthogonal matrix Q as a product of * elementary reflectors, and the elements above the diagonal, * with the array TAUP, represent the orthogonal matrix P as a * product of elementary reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) REAL array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix Q. TAUQ * is tied to the distributed matrix A. See Further Details. * * TAUP (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix P. TAUP * is tied to the distributed matrix A. See Further Details. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( MpA0, NqA0 ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ) * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+IROFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * If m >= n, * * Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors; * v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1); * u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in * A(ia+i-1,ja+i+1:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, * * Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors; * v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); * u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in * A(ia+i-1,ja+i:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) * ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) * ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) * ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) * ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) * ( v1 v2 v3 v4 v5 ) * * where d and e denote diagonal and off-diagonal elements of B, vi * denotes an element of the vector defining H(i), and ui an element of * the vector defining G(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, II, IROFFA, J, $ JJ, K, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 REAL ALPHA * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DESCSET, $ INFOG2L, PSLARF, PSLARFG, PSELSET, $ PXERBLA, SGEBR2D, SGEBS2D, SLARFG * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Test the input parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) LWMIN = MAX( MPA0, NQA0 ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( IROFFA.NE.ICOFFA ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEBD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) * IF( M.EQ.1 .AND. N.EQ.1 ) THEN IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN I = II+(JJ-1)*DESCA( LLD_ ) CALL SLARFG( 1, A( I ), A( I ), 1, TAUQ( JJ ) ) D( JJ ) = A( I ) CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, D( JJ ), $ 1 ) CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TAUQ( JJ ), $ 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, D( JJ ), $ 1, IAROW, IACOL ) CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAUQ( JJ ), $ 1, IAROW, IACOL ) END IF END IF IF( MYROW.EQ.IAROW ) $ TAUP( II ) = ZERO RETURN END IF * ALPHA = ZERO * IF( M.GE.N ) THEN * * Reduce to upper bidiagonal form * CALL DESCSET( DESCD, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) DO 10 K = 1, N I = IA + K - 1 J = JA + K - 1 * * Generate elementary reflector H(j) to annihilate * A(ia+i:ia+m-1,j) * CALL PSLARFG( M-K+1, ALPHA, I, J, A, MIN( I+1, M+IA-1 ), $ J, DESCA, 1, TAUQ ) CALL PSELSET( D, 1, J, DESCD, ALPHA ) CALL PSELSET( A, I, J, DESCA, ONE ) * * Apply H(i) to A(i:ia+m-1,i+1:ja+n-1) from the left * CALL PSLARF( 'Left', M-K+1, N-K, A, I, J, DESCA, 1, TAUQ, A, $ I, J+1, DESCA, WORK ) CALL PSELSET( A, I, J, DESCA, ALPHA ) * IF( K.LT.N ) THEN * * Generate elementary reflector G(i) to annihilate * A(i,ja+j+1:ja+n-1) * CALL PSLARFG( N-K, ALPHA, I, J+1, A, I, $ MIN( J+2, JA+N-1 ), DESCA, DESCA( M_ ), $ TAUP ) CALL PSELSET( E, I, 1, DESCE, ALPHA ) CALL PSELSET( A, I, J+1, DESCA, ONE ) * * Apply G(i) to A(i+1:ia+m-1,i+1:ja+n-1) from the right * CALL PSLARF( 'Right', M-K, N-K, A, I, J+1, DESCA, $ DESCA( M_ ), TAUP, A, I+1, J+1, DESCA, $ WORK ) CALL PSELSET( A, I, J+1, DESCA, ALPHA ) ELSE CALL PSELSET( TAUP, I, 1, DESCE, ZERO ) END IF 10 CONTINUE * ELSE * * Reduce to lower bidiagonal form * CALL DESCSET( DESCD, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) DO 20 K = 1, M I = IA + K - 1 J = JA + K - 1 * * Generate elementary reflector G(i) to annihilate * A(i,ja+j:ja+n-1) * CALL PSLARFG( N-K+1, ALPHA, I, J, A, I, $ MIN( J+1, JA+N-1 ), DESCA, DESCA( M_ ), TAUP ) CALL PSELSET( D, I, 1, DESCD, ALPHA ) CALL PSELSET( A, I, J, DESCA, ONE ) * * Apply G(i) to A(i:ia+m-1,j:ja+n-1) from the right * CALL PSLARF( 'Right', M-K, N-K+1, A, I, J, DESCA, $ DESCA( M_ ), TAUP, A, MIN( I+1, IA+M-1 ), J, $ DESCA, WORK ) CALL PSELSET( A, I, J, DESCA, ALPHA ) * IF( K.LT.M ) THEN * * Generate elementary reflector H(i) to annihilate * A(i+2:ia+m-1,j) * CALL PSLARFG( M-K, ALPHA, I+1, J, A, $ MIN( I+2, IA+M-1 ), J, DESCA, 1, TAUQ ) CALL PSELSET( E, 1, J, DESCE, ALPHA ) CALL PSELSET( A, I+1, J, DESCA, ONE ) * * Apply H(i) to A(i+1:ia+m-1,j+1:ja+n-1) from the left * CALL PSLARF( 'Left', M-K, N-K, A, I+1, J, DESCA, 1, TAUQ, $ A, I+1, J+1, DESCA, WORK ) CALL PSELSET( A, I+1, J, DESCA, ALPHA ) ELSE CALL PSELSET( TAUQ, 1, J, DESCE, ZERO ) END IF 20 CONTINUE END IF * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGEBD2 * END scalapack-2.0.2/SRC/psgebrd.f000644 000766 000024 00000040073 10363532303 016137 0ustar00juliestaff000000 000000 SUBROUTINE PSGEBRD( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), D( * ), E( * ), TAUP( * ), TAUQ( * ), $ WORK( * ) * .. * * Purpose * ======= * * PSGEBRD reduces a real general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper or lower bidiagonal * form B by an orthogonal transformation: Q' * sub( A ) * P = B. * * If M >= N, B is upper bidiagonal; if M < N, B is lower bidiagonal. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ). On exit, if M >= N, * the diagonal and the first superdiagonal of sub( A ) are * overwritten with the upper bidiagonal matrix B; the elements * below the diagonal, with the array TAUQ, represent the * orthogonal matrix Q as a product of elementary reflectors, * and the elements above the first superdiagonal, with the * array TAUP, represent the orthogonal matrix P as a product * of elementary reflectors. If M < N, the diagonal and the * first subdiagonal are overwritten with the lower bidiagonal * matrix B; the elements below the first subdiagonal, with the * array TAUQ, represent the orthogonal matrix Q as a product of * elementary reflectors, and the elements above the diagonal, * with the array TAUP, represent the orthogonal matrix P as a * product of elementary reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) REAL array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix Q. TAUQ * is tied to the distributed matrix A. See Further Details. * * TAUP (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix P. TAUP * is tied to the distributed matrix A. See Further Details. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB*( MpA0 + NqA0 + 1 ) + NqA0 * * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * If m >= n, * * Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors; * v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1); * u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in * A(ia+i-1,ja+i+1:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, * * Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors; * v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); * u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in * A(ia+i-1,ja+i:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) * ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) * ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) * ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) * ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) * ( v1 v2 v3 v4 v5 ) * * where d and e denote diagonal and off-diagonal elements of B, vi * denotes an element of the vector defining H(i), and ui an element of * the vector defining G(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICTXT, IINFO, IOFF, IPW, IPY, $ IW, J, JB, JS, JW, K, L, LWMIN, MN, MP, MYCOL, $ MYROW, NB, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCWX( DLEN_ ), DESCWY( DLEN_ ), IDUM1( 1 ), $ IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PSELSET, PSGEBD2, PSGEMM, PSLABRD, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCA( MB_ ) IOFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) MP = NUMROC( M+IOFF, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+IOFF, NB, MYCOL, IACOL, NPCOL ) LWMIN = NB*( MP+NQ+1 ) + NQ * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( IOFF.NE.MOD( JA-1, DESCA( NB_ ) ) ) THEN INFO = -5 ELSE IF( NB.NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 12 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEBRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * MN = MIN( M, N ) IF( MN.EQ.0 ) $ RETURN * * Initialize parameters. * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPY = MP * NB + 1 IPW = NQ * NB + IPY * CALL DESCSET( DESCWX, M+IOFF, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCWY, NB, N+IOFF, NB, NB, IAROW, IACOL, ICTXT, $ NB ) * MP = NUMROC( M+IA-1, NB, MYROW, DESCA( RSRC_ ), NPROW ) NQ = NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) K = 1 JB = NB - IOFF IW = IOFF + 1 JW = IOFF + 1 * DO 10 L = 1, MN+IOFF-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce rows and columns i:i+nb-1 to bidiagonal form and return * the matrices X and Y which are needed to update the unreduced * part of the matrix. * CALL PSLABRD( M-K+1, N-K+1, JB, A, I, J, DESCA, D, E, TAUQ, $ TAUP, WORK, IW, JW, DESCWX, WORK( IPY ), IW, $ JW, DESCWY, WORK( IPW ) ) * * Update the trailing submatrix A(i+nb:ia+m-1,j+nb:ja+n-1), using * an update of the form A := A - V*Y' - X*U'. * CALL PSGEMM( 'No transpose', 'No transpose', M-K-JB+1, $ N-K-JB+1, JB, -ONE, A, I+JB, J, DESCA, $ WORK( IPY ), IW, JW+JB, DESCWY, ONE, A, I+JB, $ J+JB, DESCA ) CALL PSGEMM( 'No transpose', 'No transpose', M-K-JB+1, $ N-K-JB+1, JB, -ONE, WORK, IW+JB, JW, DESCWX, A, I, $ J+JB, DESCA, ONE, A, I+JB, J+JB, DESCA ) * * Copy last off-diagonal elements of B back into sub( A ). * IF( M.GE.N ) THEN JS = MIN( INDXG2L( I+JB-1, NB, 0, DESCA( RSRC_ ), NPROW ), $ MP ) IF( JS.GT.0 ) $ CALL PSELSET( A, I+JB-1, J+JB, DESCA, E( JS ) ) ELSE JS = MIN( INDXG2L( J+JB-1, NB, 0, DESCA( CSRC_ ), NPCOL ), $ NQ ) IF( JS.GT.0 ) $ CALL PSELSET( A, I+JB, J+JB-1, DESCA, E( JS ) ) END IF * K = K + JB JB = NB IW = 1 JW = 1 DESCWX( M_ ) = DESCWX( M_ ) - JB DESCWX( RSRC_ ) = MOD( DESCWX( RSRC_ ) + 1, NPROW ) DESCWX( CSRC_ ) = MOD( DESCWX( CSRC_ ) + 1, NPCOL ) DESCWY( N_ ) = DESCWY( N_ ) - JB DESCWY( RSRC_ ) = MOD( DESCWY( RSRC_ ) + 1, NPROW ) DESCWY( CSRC_ ) = MOD( DESCWY( CSRC_ ) + 1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the remainder of the matrix. * CALL PSGEBD2( M-K+1, N-K+1, A, IA+K-1, JA+K-1, DESCA, D, E, TAUQ, $ TAUP, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGEBRD * END scalapack-2.0.2/SRC/psgecon.f000644 000766 000024 00000037137 11252745702 016165 0ustar00juliestaff000000 000000 SUBROUTINE PSGECON( NORM, N, A, IA, JA, DESCA, ANORM, RCOND, WORK, $ LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, INFO, JA, LIWORK, LWORK, N REAL ANORM, RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSGECON estimates the reciprocal of the condition number of a general * distributed real matrix A(IA:IA+N-1,JA:JA+N-1), in either the 1-norm * or the infinity-norm, using the LU factorization computed by PSGETRF. * * An estimate is obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), and * the reciprocal of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies whether the 1-norm condition number or the * infinity-norm condition number is required: * = '1' or 'O': 1-norm * = 'I': Infinity-norm * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). On entry, * this array contains the local pieces of the factors L and U * from the factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U; the * unit diagonal elements of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ANORM (global input) REAL * If NORM = '1' or 'O', the 1-norm of the original distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * If NORM = 'I', the infinity-norm of the original distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * * RCOND (global output) REAL * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + 2*LOCc(N+MOD(JA-1,NB_A)) * + MAX( 2, MAX( NB_A*MAX( 1, CEIL(NPROW-1,NPCOL) ), * LOCc(N+MOD(JA-1,NB_A)) + * NB_A*MAX( 1, CEIL(NPCOL-1,NPROW) ) ). * * LOCr and LOCc values can be computed using the ScaLAPACK * tool function NUMROC; NPROW and NPCOL can be determined by * calling the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= MAX( 1, LOCr(N+MOD(IA-1,MB_A)) ). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, ONENRM CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPNL, IPNU, $ IPV, IPW, IPX, IROFF, IV, IX, IXX, JJA, JV, JX, $ KASE, KASE1, LIWMIN, LWMIN, MYCOL, MYROW, NP, $ NPCOL, NPMOD, NPROW, NQ, NQMOD REAL AINVNM, SCALE, SL, SMLNUM, SU, WMAX * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG2L, $ PCHK1MAT, PSAMAX, PSLATRS, PSLACON, $ PSRSCL, PB_TOPGET, PB_TOPSET, PXERBLA, SGEBR2D, $ SGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, ICHAR, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600 + CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN ONENRM = NORM.EQ.'1' .OR. LSAME( NORM, 'O' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + 2*NQMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = REAL( LWMIN ) LIWMIN = MAX( 1, NPMOD ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.ONENRM .AND. .NOT.LSAME( NORM, 'I' ) ) THEN INFO = -1 ELSE IF( ANORM.LT.ZERO ) THEN INFO = -7 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF * IF( ONENRM ) THEN IDUM1( 1 ) = ICHAR( '1' ) ELSE IDUM1( 1 ) = ICHAR( 'I' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 IF( LIWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 12 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGECON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * RCOND = ZERO IF( N.EQ.0 ) THEN RCOND = ONE RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN ELSE IF( N.EQ.1 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPNL = IPV + NP IPNU = IPNL + NQ IPW = IPNU + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Estimate the norm of inv(A). * AINVNM = ZERO NORMIN = 'N' IF( ONENRM ) THEN KASE1 = 1 ELSE KASE1 = 2 END IF KASE = 0 * 10 CONTINUE CALL PSLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), IX, JX, $ DESCX, IWORK, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( KASE.EQ.KASE1 ) THEN * * Multiply by inv(L). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( 'Lower', 'No transpose', 'Unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, WORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL * * Multiply by inv(U). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( 'Upper', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, WORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(U'). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( 'Upper', 'Transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, WORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL * * Multiply by inv(L'). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( 'Lower', 'Transpose', 'Unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, WORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF * * Divide X by 1/(SL*SU) if doing so will not cause overflow. * SCALE = SL*SU NORMIN = 'Y' IF( SCALE.NE.ONE ) THEN CALL PSAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL SGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1, $ IAROW, MYCOL ) END IF END IF IF( SCALE.LT.ABS( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PSRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / AINVNM ) / ANORM * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PSGECON * END scalapack-2.0.2/SRC/psgeequ.f000644 000766 000024 00000032412 10363532303 016160 0ustar00juliestaff000000 000000 SUBROUTINE PSGEEQU( M, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N REAL AMAX, COLCND, ROWCND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), C( * ), R( * ) * .. * * Purpose * ======= * * PSGEEQU computes row and column scalings intended to equilibrate an * M-by-N distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA:JA+N-1) and * reduce its condition number. R returns the row scale factors and C * the column scale factors, chosen to try to make the largest entry in * each row and column of the distributed matrix B with elements * B(i,j) = R(i) * A(i,j) * C(j) have absolute value 1. * * R(i) and C(j) are restricted to be between SMLNUM = smallest safe * number and BIGNUM = largest safe number. Use of these scaling * factors is not guaranteed to reduce the condition number of * sub( A ) but works well in practice. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ), the * local pieces of the M-by-N distributed matrix whose * equilibration factors are to be computed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * R (local output) REAL array, dimension LOCr(M_A) * If INFO = 0 or INFO > IA+M-1, R(IA:IA+M-1) contains the row * scale factors for sub( A ). R is aligned with the distributed * matrix A, and replicated across every process column. R is * tied to the distributed matrix A. * * C (local output) REAL array, dimension LOCc(N_A) * If INFO = 0, C(JA:JA+N-1) contains the column scale factors * for sub( A ). C is aligned with the distributed matrix A, and * replicated down every process row. C is tied to the distri- * buted matrix A. * * ROWCND (global output) REAL * If INFO = 0 or INFO > IA+M-1, ROWCND contains the ratio of * the smallest R(i) to the largest R(i) (IA <= i <= IA+M-1). * If ROWCND >= 0.1 and AMAX is neither too large nor too small, * it is not worth scaling by R(IA:IA+M-1). * * COLCND (global output) REAL * If INFO = 0, COLCND contains the ratio of the smallest C(j) * to the largest C(j) (JA <= j <= JA+N-1). If COLCND >= 0.1, it * is not worth scaling by C(JA:JA+N-1). * * AMAX (global output) REAL * Absolute value of largest distributed matrix element. If * AMAX is very close to overflow or very close to underflow, * the matrix should be scaled. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, and i is * <= M: the i-th row of the distributed matrix sub( A ) * is exactly zero, * > M: the (i-M)-th column of the distributed * matrix sub( A ) is exactly zero. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IDUMM, IIA, $ IOFFA, IROFF, J, JJA, LDA, MP, MYCOL, MYROW, $ NPCOL, NPROW, NQ REAL BIGNUM, RCMAX, RCMIN, SMLNUM * .. * .. Local Arrays .. INTEGER DESCC( DLEN_ ), DESCR( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, IGAMX2D, $ INFOG2L, PCHK1MAT, PB_TOPGET, PXERBLA, SGAMN2D, $ SGAMX2D * .. * .. External Functions .. INTEGER INDXL2G, NUMROC REAL PSLAMCH EXTERNAL INDXL2G, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 0, IDUMM, IDUMM, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEEQU', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) THEN ROWCND = ONE COLCND = ONE AMAX = ZERO RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * * Get machine constants and local indexes. * SMLNUM = PSLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * * Assign descriptors for R and C arrays * CALL DESCSET( DESCR, M, 1, DESCA( MB_ ), 1, 0, 0, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCC, 1, N, 1, DESCA( NB_ ), 0, 0, ICTXT, 1 ) * * Compute row scale factors. * DO 10 I = IIA, IIA+MP-1 R( I ) = ZERO 10 CONTINUE * * Find the maximum element in each row. * IOFFA = (JJA-1)*LDA DO 30 J = JJA, JJA+NQ-1 DO 20 I = IIA, IIA+MP-1 R( I ) = MAX( R( I ), ABS( A( IOFFA + I ) ) ) 20 CONTINUE IOFFA = IOFFA + LDA 30 CONTINUE CALL SGAMX2D( ICTXT, 'Rowwise', ROWCTOP, MP, 1, R( IIA ), $ MAX( 1, MP ), IDUMM, IDUMM, -1, -1, MYCOL ) * * Find the maximum and minimum scale factors. * RCMIN = BIGNUM RCMAX = ZERO DO 40 I = IIA, IIA+MP-1 RCMAX = MAX( RCMAX, R( I ) ) RCMIN = MIN( RCMIN, R( I ) ) 40 CONTINUE CALL SGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMAX, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) CALL SGAMN2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMIN, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) AMAX = RCMAX * IF( RCMIN.EQ.ZERO ) THEN * * Find the first zero scale factor and return an error code. * DO 50 I = IIA, IIA+MP-1 IF( R( I ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = INDXL2G( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) - IA + 1 50 CONTINUE CALL IGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, INFO, 1, $ IDUMM, IDUMM, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN ELSE * * Invert the scale factors. * DO 60 I = IIA, IIA+MP-1 R( I ) = ONE / MIN( MAX( R( I ), SMLNUM ), BIGNUM ) 60 CONTINUE * * Compute ROWCND = min(R(I)) / max(R(I)) * ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) * END IF * * Compute column scale factors * DO 70 J = JJA, JJA+NQ-1 C( J ) = ZERO 70 CONTINUE * * Find the maximum element in each column, * assuming the row scaling computed above. * IOFFA = (JJA-1)*LDA DO 90 J = JJA, JJA+NQ-1 DO 80 I = IIA, IIA+MP-1 C( J ) = MAX( C( J ), ABS( A( IOFFA + I ) )*R( I ) ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE CALL SGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, NQ, C( JJA ), $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) * * Find the maximum and minimum scale factors. * RCMIN = BIGNUM RCMAX = ZERO DO 100 J = JJA, JJA+NQ-1 RCMIN = MIN( RCMIN, C( J ) ) RCMAX = MAX( RCMAX, C( J ) ) 100 CONTINUE CALL SGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMAX, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) CALL SGAMN2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMIN, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) * IF( RCMIN.EQ.ZERO ) THEN * * Find the first zero scale factor and return an error code. * DO 110 J = JJA, JJA+NQ-1 IF( C( J ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = M + INDXL2G( J, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) - JA + 1 110 CONTINUE CALL IGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, INFO, 1, $ IDUMM, IDUMM, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN ELSE * * Invert the scale factors. * DO 120 J = JJA, JJA+NQ-1 C( J ) = ONE / MIN( MAX( C( J ), SMLNUM ), BIGNUM ) 120 CONTINUE * * Compute COLCND = min(C(J)) / max(C(J)) * COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) * END IF * RETURN * * End of PSGEEQU * END scalapack-2.0.2/SRC/psgehd2.f000644 000766 000024 00000026702 10363532303 016050 0ustar00juliestaff000000 000000 SUBROUTINE PSGEHD2( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEHD2 reduces a real general distributed matrix sub( A ) * to upper Hessenberg form H by an orthogonal similarity transforma- * tion: Q' * sub( A ) * Q = H, where * sub( A ) = A(IA+N-1:IA+N-1,JA+N-1:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows IA:IA+ILO-2 and IA+IHI:IA+N-1 and columns JA:JA+JLO-2 * and JA+JHI:JA+N-1. See Further Details. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) to be reduced. On exit, * the upper triangle and the first subdiagonal of sub( A ) are * overwritten with the upper Hessenberg matrix H, and the ele- * ments below the first subdiagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). Elements JA:JA+ILO-2 and JA+IHI:JA+N-2 of TAU are * set to zero. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB + MAX( NpA0, NB ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of (ihi-ilo) elementary * reflectors * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on * exit in A(ia+ilo+i:ia+ihi-1,ja+ilo+i-2), and tau in TAU(ja+ilo+i-2). * * The contents of A(IA:IA+N-1,JA:JA+N-1) are illustrated by the follo- * wing example, with n = 7, ilo = 2 and ihi = 6: * * on entry on exit * * ( a a a a a a a ) ( a a h h h h a ) * ( a a a a a a ) ( a h h h h a ) * ( a a a a a a ) ( h h h h h h ) * ( a a a a a a ) ( v2 h h h h h ) * ( a a a a a a ) ( v2 v3 h h h h ) * ( a a a a a a ) ( v2 v3 v4 h h h ) * ( a ) ( a ) * * where a denotes an element of the original matrix sub( A ), h denotes * a modified element of the upper Hessenberg matrix H, and vi denotes * an element of the vector defining H(ja+ilo+i-2). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IAROW, ICOFFA, ICTXT, IROFFA, J, K, LWMIN, $ MYCOL, MYROW, NPA0, NPCOL, NPROW REAL AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PSELSET, $ PSLARF, PSLARFG, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) NPA0 = NUMROC( IHI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = DESCA( NB_ ) + MAX( NPA0, DESCA( NB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -2 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEHD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * DO 10 K = ILO, IHI-1 I = IA + K - 1 J = JA + K - 1 * * Compute elementary reflector H(j) to annihilate * A(i+2:ihi+ia-1,j) * CALL PSLARFG( IHI-K, AII, I+1, J, A, MIN( I+2, N+IA-1 ), J, $ DESCA, 1, TAU ) CALL PSELSET( A, I+1, J, DESCA, ONE ) * * Apply H(k) to A(ia:ihi+ia-1,j+1:ihi+ja-1) from the right * CALL PSLARF( 'Right', IHI, IHI-K, A, I+1, J, DESCA, 1, TAU, A, $ IA, J+1, DESCA, WORK ) * * Apply H(j) to A(i+1:ia+ihi-1,j+1:ja+n-1) from the left * CALL PSLARF( 'Left', IHI-K, N-K, A, I+1, J, DESCA, 1, TAU, A, $ I+1, J+1, DESCA, WORK ) * CALL PSELSET( A, I+1, J, DESCA, AII ) 10 CONTINUE * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGEHD2 * END scalapack-2.0.2/SRC/psgehrd.f000644 000766 000024 00000035513 11654534541 016162 0ustar00juliestaff000000 000000 SUBROUTINE PSGEHRD( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEHRD reduces a real general distributed matrix sub( A ) * to upper Hessenberg form H by an orthogonal similarity transforma- * tion: Q' * sub( A ) * Q = H, where * sub( A ) = A(IA+N-1:IA+N-1,JA+N-1:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows IA:IA+ILO-2 and IA+IHI:IA+N-1 and columns JA:JA+ILO-2 * and JA+IHI:JA+N-1. See Further Details. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) to be reduced. On exit, * the upper triangle and the first subdiagonal of sub( A ) are * overwritten with the upper Hessenberg matrix H, and the ele- * ments below the first subdiagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). Elements JA:JA+ILO-2 and JA+IHI:JA+N-2 of TAU are * set to zero. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB*NB + NB*MAX( IHIP+1, IHLP+INLQ ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ), * ICOFFA = MOD( JA-1, NB ), IOFF = MOD( IA+ILO-2, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * ILROW = INDXG2P( IA+ILO-1, NB, MYROW, RSRC_A, NPROW ), * IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, ILROW, NPROW ), * ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, CSRC_A, NPCOL ), * INLQ = NUMROC( N-ILO+IOFF+1, NB, MYCOL, ILCOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of (ihi-ilo) elementary * reflectors * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:I) = 0, v(I+1) = 1 and v(IHI+1:N) = 0; v(I+2:IHI) is stored on * exit in A(IA+ILO+I:IA+IHI-1,JA+ILO+I-2), and tau in TAU(JA+ILO+I-2). * * The contents of A(IA:IA+N-1,JA:JA+N-1) are illustrated by the follow- * ing example, with N = 7, ILO = 2 and IHI = 6: * * on entry on exit * * ( a a a a a a a ) ( a a h h h h a ) * ( a a a a a a ) ( a h h h h a ) * ( a a a a a a ) ( h h h h h h ) * ( a a a a a a ) ( v2 h h h h h ) * ( a a a a a a ) ( v2 v3 h h h h ) * ( a a a a a a ) ( v2 v3 v4 h h h ) * ( a ) ( a ) * * where a denotes an element of the original matrix sub( A ), H denotes * a modified element of the upper Hessenberg matrix H, and vi denotes * an element of the vector defining H(JA+ILO+I-2). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, IB, ICOFFA, ICTXT, IHIP, $ IHLP, IIA, IINFO, ILCOL, ILROW, IMCOL, INLQ, $ IOFF, IPT, IPW, IPY, IROFFA, J, JJ, JJA, JY, $ K, L, LWMIN, MYCOL, MYROW, NB, NPCOL, NPROW, $ NQ REAL EI * .. * .. Local Arrays .. INTEGER DESCY( DLEN_ ), IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG1L, $ INFOG2L, PCHK1MAT, PSGEMM, PSGEHD2, $ PSLAHRD, PSLARFB, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC FLOAT, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, NB ) ICOFFA = MOD( JA-1, NB ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ) IOFF = MOD( IA+ILO-2, NB ) ILROW = INDXG2P( IA+ILO-1, NB, MYROW, DESCA( RSRC_ ), $ NPROW ) IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, ILROW, NPROW ) ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) INLQ = NUMROC( N-ILO+IOFF+1, NB, MYCOL, ILCOL, NPCOL ) LWMIN = NB*( NB + MAX( IHIP+1, IHLP+INLQ ) ) * WORK( 1 ) = FLOAT( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -2 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -3 C ELSE IF( IROFFA.NE.ICOFFA .OR. IROFFA.NE.0 ) THEN ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = ILO IDUM2( 1 ) = 2 IDUM1( 2 ) = IHI IDUM2( 2 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 10 CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEHRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Set elements JA:JA+ILO-2 and JA+JHI-1:JA+N-2 of TAU to zero. * NQ = NUMROC( JA+N-2, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL INFOG1L( JA+ILO-2, NB, NPCOL, MYCOL, DESCA( CSRC_ ), JJ, $ IMCOL ) DO 10 J = JJA, MIN( JJ, NQ ) TAU( J ) = ZERO 10 CONTINUE * CALL INFOG1L( JA+IHI-1, NB, NPCOL, MYCOL, DESCA( CSRC_ ), JJ, $ IMCOL ) DO 20 J = JJ, NQ TAU( J ) = ZERO 20 CONTINUE * * Quick return if possible * IF( IHI-ILO.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPT = 1 IPY = IPT + NB * NB IPW = IPY + IHIP * NB CALL DESCSET( DESCY, IHI+IROFFA, NB, NB, NB, IAROW, ILCOL, ICTXT, $ MAX( 1, IHIP ) ) * K = ILO IB = NB - IOFF JY = IOFF + 1 * * Loop over remaining block of columns * DO 30 L = 1, IHI-ILO+IOFF-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns j:j+ib-1 to Hessenberg form, returning the * matrices V and T of the block reflector H = I - V*T*V' * which performs the reduction, and also the matrix Y = A*V*T * CALL PSLAHRD( IHI, K, IB, A, IA, J, DESCA, TAU, WORK( IPT ), $ WORK( IPY ), 1, JY, DESCY, WORK( IPW ) ) * * Apply the block reflector H to A(ia:ia+ihi-1,j+ib:ja+ihi-1) * from the right, computing A := A - Y * V'. * V(i+ib,ib-1) must be set to 1. * CALL PSELSET2( EI, A, I+IB, J+IB-1, DESCA, ONE ) CALL PSGEMM( 'No transpose', 'Transpose', IHI, IHI-K-IB+1, IB, $ -ONE, WORK( IPY ), 1, JY, DESCY, A, I+IB, J, $ DESCA, ONE, A, IA, J+IB, DESCA ) CALL PSELSET( A, I+IB, J+IB-1, DESCA, EI ) * * Apply the block reflector H to A(i+1:ia+ihi-1,j+ib:ja+n-1) from * the left * CALL PSLARFB( 'Left', 'Transpose', 'Forward', 'Columnwise', $ IHI-K, N-K-IB+1, IB, A, I+1, J, DESCA, $ WORK( IPT ), A, I+1, J+IB, DESCA, WORK( IPY ) ) * K = K + IB IB = NB JY = 1 DESCY( CSRC_ ) = MOD( DESCY( CSRC_ ) + 1, NPCOL ) * 30 CONTINUE * * Use unblocked code to reduce the rest of the matrix * CALL PSGEHD2( N, K, IHI, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = FLOAT( LWMIN ) * RETURN * * End of PSGEHRD * END scalapack-2.0.2/SRC/psgelq2.f000644 000766 000024 00000024535 10363532303 016073 0ustar00juliestaff000000 000000 SUBROUTINE PSGELQ2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGELQ2 computes a LQ factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = L * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and below the diagonal of sub( A ) contain the M by min(M,N) * lower trapezoidal matrix L (L is lower triangular if M <= N); * the elements above the diagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * of the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia+k-1) H(ia+k-2) . . . H(ia), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1)=0 * and v(i) = 1; v(i+1:n) is stored on exit in A(ia+i-1,ja+i:ja+n-1), * and tau in TAU(ia+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, J, K, LWMIN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ REAL AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PSELSET, $ PSLARF, PSLARFG, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQ + MAX( 1, MP ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGELQ2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * K = MIN( M, N ) DO 10 I = IA, IA+K-1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * A(i,j+1:ja+n-1) * CALL PSLARFG( N-J+JA, AII, I, J, A, I, MIN( J+1, JA+N-1 ), $ DESCA, DESCA( M_ ), TAU ) * IF( I.LT.IA+M-1 ) THEN * * Apply H(i) to A(i+1:ia+m-1,j:ja+n-1) from the right * CALL PSELSET( A, I, J, DESCA, ONE ) CALL PSLARF( 'Right', M-I+IA-1, N-J+JA, A, I, J, DESCA, $ DESCA( M_ ), TAU, A, I+1, J, DESCA, WORK ) END IF CALL PSELSET( A, I, J, DESCA, AII ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGELQ2 * END scalapack-2.0.2/SRC/psgelqf.f000644 000766 000024 00000027410 10363532303 016152 0ustar00juliestaff000000 000000 SUBROUTINE PSGELQF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGELQF computes a LQ factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = L * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and below the diagonal of sub( A ) contain the M by min(M,N) * lower trapezoidal matrix L (L is lower triangular if M <= N); * the elements above the diagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * of the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia+k-1) H(ia+k-2) . . . H(ia), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1)=0 * and v(i) = 1; v(i+1:n) is stored on exit in A(ia+i-1,ja+i:ja+n-1), * and tau in TAU(ia+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IN, IPW, $ IROFF, J, K, LWMIN, MP0, MYCOL, MYROW, NPCOL, $ NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSGELQ2, $ PSLARFB, PSLARFT, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * * Handle the first block of rows separately * IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IB = IN - IA + 1 * * Compute the LQ factorization of the first block A(ia:in:ja:ja+n-1) * CALL PSGELQ2( IB, N, A, IA, JA, DESCA, TAU, WORK, LWORK, IINFO ) * IF( IA+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(ia) H(ia+1) . . . H(in) * CALL PSLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia+ib:ia+m-1,ja:ja+n-1) from the right * CALL PSLARFB( 'Right', 'No transpose', 'Forward', 'Rowwise', $ M-IB, N, IB, A, IA, JA, DESCA, WORK, A, IA+IB, $ JA, DESCA, WORK( IPW ) ) END IF * * Loop over the remaining blocks of rows * DO 10 I = IN+1, IA+K-1, DESCA( MB_ ) IB = MIN( K-I+IA, DESCA( MB_ ) ) J = JA + I - IA * * Compute the LQ factorization of the current block * A(i:i+ib-1:j:ja+n-1) * CALL PSGELQ2( IB, N-I+IA, A, I, J, DESCA, TAU, WORK, LWORK, $ IINFO ) * IF( I+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PSLARFT( 'Forward', 'Rowwise', N-I+IA, IB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(i+ib:ia+m-1,j:ja+n-1) from the right * CALL PSLARFB( 'Right', 'No transpose', 'Forward', 'Rowwise', $ M-I-IB+IA, N-J+JA, IB, A, I, J, DESCA, WORK, $ A, I+IB, J, DESCA, WORK( IPW ) ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGELQF * END scalapack-2.0.2/SRC/psgels.f000644 000766 000024 00000054376 11312467374 016033 0ustar00juliestaff000000 000000 SUBROUTINE PSGELS( TRANS, M, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), WORK( * ) * .. * * Purpose * ======= * * PSGELS solves overdetermined or underdetermined real linear * systems involving an M-by-N matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1), * or its transpose, using a QR or LQ factorization of sub( A ). It is * assumed that sub( A ) has full rank. * * The following options are provided: * * 1. If TRANS = 'N' and m >= n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || sub( B ) - sub( A )*X ||. * * 2. If TRANS = 'N' and m < n: find the minimum norm solution of * an underdetermined system sub( A ) * X = sub( B ). * * 3. If TRANS = 'T' and m >= n: find the minimum norm solution of * an undetermined system sub( A )**T * X = sub( B ). * * 4. If TRANS = 'T' and m < n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || sub( B ) - sub( A )**T * X ||. * * where sub( B ) denotes B( IB:IB+M-1, JB:JB+NRHS-1 ) when TRANS = 'N' * and B( IB:IB+N-1, JB:JB+NRHS-1 ) otherwise. Several right hand side * vectors b and solution vectors x can be handled in a single call; * When TRANS = 'N', the solution vectors are stored as the columns of * the N-by-NRHS right hand side matrix sub( B ) and the M-by-NRHS * right hand side matrix sub( B ) otherwise. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER * = 'N': the linear system involves sub( A ); * = 'T': the linear system involves sub( A )**T. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of * rows of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e. the number of columns * of the distributed submatrices sub( B ) and X. NRHS >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of local dimension * ( LLD_A, LOCc(JA+N-1) ). On entry, the M-by-N matrix A. * if M >= N, sub( A ) is overwritten by details of its QR * factorization as returned by PSGEQRF; * if M < N, sub( A ) is overwritten by details of its LQ * factorization as returned by PSGELQF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) REAL pointer into the * local memory to an array of local dimension * (LLD_B, LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the distributed matrix B of right hand side * vectors, stored columnwise; * sub( B ) is M-by-NRHS if TRANS='N', and N-by-NRHS otherwise. * On exit, sub( B ) is overwritten by the solution vectors, * stored columnwise: if TRANS = 'N' and M >= N, rows 1 to N * of sub( B ) contain the least squares solution vectors; the * residual sum of squares for the solution in each column is * given by the sum of squares of elements N+1 to M in that * column; if TRANS = 'N' and M < N, rows 1 to N of sub( B ) * contain the minimum norm solution vectors; if TRANS = 'T' * and M >= N, rows 1 to M of sub( B ) contain the minimum norm * solution vectors; if TRANS = 'T' and M < N, rows 1 to M of * sub( B ) contain the least squares solution vectors; the * residual sum of squares for the solution in each column is * given by the sum of squares of elements M+1 to N in that * column. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= LTAU + MAX( LWF, LWS ) where * If M >= N, then * LTAU = NUMROC( JA+MIN(M,N)-1, NB_A, MYCOL, CSRC_A, NPCOL ), * LWF = NB_A * ( MpA0 + NqA0 + NB_A ) * LWS = MAX( (NB_A*(NB_A-1))/2, (NRHSqB0 + MpB0)*NB_A ) + * NB_A * NB_A * Else * LTAU = NUMROC( IA+MIN(M,N)-1, MB_A, MYROW, RSRC_A, NPROW ), * LWF = MB_A * ( MpA0 + NqA0 + MB_A ) * LWS = MAX( (MB_A*(MB_A-1))/2, ( NpB0 + MAX( NqA0 + * NUMROC( NUMROC( N+IROFFB, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NRHSqB0 ) )*MB_A ) + * MB_A * MB_A * End if * * where LCMP = LCM / NPROW with LCM = ILCM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * MpB0 = NUMROC( M+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NpB0 = NUMROC( N+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NRHSqB0 = NUMROC( NRHS+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, TPSD INTEGER BROW, IACOL, IAROW, IASCL, IBCOL, IBROW, IBSCL, $ ICOFFA, ICOFFB, ICTXT, IPW, IROFFA, IROFFB, $ LCM, LCMP, LTAU, LWF, LWMIN, LWS, MPA0, MPB0, $ MYCOL, MYROW, NPB0, NPCOL, NPROW, NQA0, $ NRHSQB0, SCLLEN REAL ANRM, BIGNUM, BNRM, SMLNUM * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) REAL RWORK( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM INTEGER INDXG2P, NUMROC REAL PSLANGE, PSLAMCH EXTERNAL ILCM, INDXG2P, LSAME, NUMROC, PSLANGE, $ PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSGELQF, $ PSGEQRF, PSLABAD, PSLASCL, PSLASET, $ PSORMLQ, PSORMQR, PSTRSM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 800 + CTXT_ ) ELSE CALL CHK1MAT( M, 2, N, 3, IA, JA, DESCA, 8, INFO ) IF ( M .GE. N ) THEN CALL CHK1MAT( M, 2, NRHS, 4, IB, JB, DESCB, 12, INFO ) ELSE CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 12, INFO ) ENDIF IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( IA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( IB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) NRHSQB0 = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, $ NPCOL ) IF( M.GE.N ) THEN MPB0 = NUMROC( M+IROFFB, DESCB( MB_ ), MYROW, IBROW, $ NPROW ) LTAU = NUMROC( JA+MIN(M,N)-1, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) LWF = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) LWS = MAX( ( DESCA( NB_ )*( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPB0 + NRHSQB0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ )*DESCA( NB_ ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW NPB0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, $ NPROW ) LTAU = NUMROC( IA+MIN(M,N)-1, DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) LWF = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) LWS = MAX( ( DESCA( MB_ )*( DESCA( MB_ ) - 1 ) ) / 2, $ ( NPB0 + MAX( NQA0 + NUMROC( NUMROC( N+IROFFB, $ DESCA( MB_ ), 0, 0, NPROW ), DESCA( MB_ ), 0, 0, $ LCMP ), NRHSQB0 ) )*DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF LWMIN = LTAU + MAX( LWF, LWS ) WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) * TPSD = .TRUE. IF( LSAME( TRANS, 'N' ) ) $ TPSD = .FALSE. * IF( .NOT.( LSAME( TRANS, 'N' ) .OR. $ LSAME( TRANS, 'T' ) ) ) THEN INFO = -1 ELSE IF( M.LT.0 ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( M.GE.N .AND. IROFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( M.GE.N .AND. IAROW.NE.IBROW ) THEN INFO = -10 ELSE IF( M.LT.N .AND. ICOFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( M.GE.N .AND. DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1200 + MB_ ) ELSE IF( M.LT.N .AND. DESCA( NB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1200 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1200 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF END IF * IF( .NOT.TPSD ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IDUM1( 1 ) = ICHAR( 'T' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 14 CALL PCHK2MAT( M, 2, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, 4, $ IB, JB, DESCB, 12, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGELS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( MIN( M, N, NRHS ).EQ.0 ) THEN CALL PSLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, $ IB, JB, DESCB ) RETURN END IF * * Get machine parameters * SMLNUM = PSLAMCH( ICTXT, 'S' ) SMLNUM = SMLNUM / PSLAMCH( ICTXT, 'P' ) BIGNUM = ONE / SMLNUM CALL PSLABAD( ICTXT, SMLNUM, BIGNUM ) * * Scale A, B if max entry outside range [SMLNUM,BIGNUM] * ANRM = PSLANGE( 'M', M, N, A, IA, JA, DESCA, RWORK ) IASCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN * * Scale matrix norm up to SMLNUM * CALL PSLASCL( 'G', ANRM, SMLNUM, M, N, A, IA, JA, DESCA, $ INFO ) IASCL = 1 ELSE IF( ANRM.GT.BIGNUM ) THEN * * Scale matrix norm down to BIGNUM * CALL PSLASCL( 'G', ANRM, BIGNUM, M, N, A, IA, JA, DESCA, $ INFO ) IASCL = 2 ELSE IF( ANRM.EQ.ZERO ) THEN * * Matrix all zero. Return zero solution. * CALL PSLASET( 'F', MAX( M, N ), NRHS, ZERO, ZERO, B, IB, JB, $ DESCB ) GO TO 10 END IF * BROW = M IF( TPSD ) $ BROW = N * BNRM = PSLANGE( 'M', BROW, NRHS, B, IB, JB, DESCB, RWORK ) * IBSCL = 0 IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN * * Scale matrix norm up to SMLNUM * CALL PSLASCL( 'G', BNRM, SMLNUM, BROW, NRHS, B, IB, JB, $ DESCB, INFO ) IBSCL = 1 ELSE IF( BNRM.GT.BIGNUM ) THEN * * Scale matrix norm down to BIGNUM * CALL PSLASCL( 'G', BNRM, BIGNUM, BROW, NRHS, B, IB, JB, $ DESCB, INFO ) IBSCL = 2 END IF * IPW = LTAU + 1 * IF( M.GE.N ) THEN * * compute QR factorization of A * CALL PSGEQRF( M, N, A, IA, JA, DESCA, WORK, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least N, optimally N*NB * IF( .NOT.TPSD ) THEN * * Least-Squares Problem min || A * X - B || * * B(IB:IB+M-1,JB:JB+NRHS-1) := Q' * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PSORMQR( 'Left', 'Transpose', M, NRHS, N, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * * B(IB:IB+N-1,JB:JB+NRHS-1) := inv(R) * * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PSTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * SCLLEN = N * ELSE * * Overdetermined system of equations sub( A )' * X = sub( B ) * * sub( B ) := inv(R') * sub( B ) * CALL PSTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * B(IB+N:IB+M-1,JB:JB+NRHS-1) = ZERO * CALL PSLASET( 'All', M-N, NRHS, ZERO, ZERO, B, IB+N, JB, $ DESCB ) * * B(IB:IB+M-1,JB:JB+NRHS-1) := Q(1:N,:) * * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PSORMQR( 'Left', 'No transpose', M, NRHS, N, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * SCLLEN = M * END IF * ELSE * * Compute LQ factorization of sub( A ) * CALL PSGELQF( M, N, A, IA, JA, DESCA, WORK, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least M, optimally M*NB. * IF( .NOT.TPSD ) THEN * * underdetermined system of equations sub( A ) * X = sub( B ) * * B(IB:IB+M-1,JB:JB+NRHS-1) := inv(L) * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PSTRSM( 'Left', 'Lower', 'No transpose', 'Non-unit', M, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * B(IB+M:IB+N-1,JB:JB+NRHS-1) = 0 * CALL PSLASET( 'All', N-M, NRHS, ZERO, ZERO, B, IB+M, JB, $ DESCB ) * * B(IB:IB+N-1,JB:JB+NRHS-1) := Q(1:N,:)' * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PSORMLQ( 'Left', 'Transpose', N, NRHS, M, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * SCLLEN = N * ELSE * * overdetermined system min || A' * X - B || * * B(IB:IB+N-1,JB:JB+NRHS-1) := Q * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PSORMLQ( 'Left', 'No transpose', N, NRHS, M, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * * B(IB:IB+M-1,JB:JB+NRHS-1) := inv(L') * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PSTRSM( 'Left', 'Lower', 'Transpose', 'Non-unit', M, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * SCLLEN = M * END IF * END IF * * Undo scaling * IF( IASCL.EQ.1 ) THEN CALL PSLASCL( 'G', ANRM, SMLNUM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) ELSE IF( IASCL.EQ.2 ) THEN CALL PSLASCL( 'G', ANRM, BIGNUM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) END IF IF( IBSCL.EQ.1 ) THEN CALL PSLASCL( 'G', SMLNUM, BNRM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) ELSE IF( IBSCL.EQ.2 ) THEN CALL PSLASCL( 'G', BIGNUM, BNRM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) END IF * 10 CONTINUE * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGELS * END scalapack-2.0.2/SRC/psgeql2.f000644 000766 000024 00000030025 10363532303 016062 0ustar00juliestaff000000 000000 SUBROUTINE PSGEQL2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEQL2 computes a QL factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * L. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M >= N, the * lower triangle of the distributed submatrix * A( IA+M-N:IA+M-1, JA:JA+N-1 ) contains the N-by-N lower * triangular matrix L; if M <= N, the elements on and below * the (N-M)-th superdiagonal contain the M by N lower * trapezoidal matrix L; the remaining elements, with the * array TAU, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension LOCc(JA+N-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Mp0 + MAX( 1, Nq0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja+k-1) . . . H(ja+1) H(ja), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in * A(ia:ia+m-k+i-2,ja+n-k+i-1), and tau in TAU(ja+n-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICTXT, II, J, JJ, K, LWMIN, $ MP, MYCOL, MYROW, NPCOL, NPROW, NQ REAL AJJ, ALPHA * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PSELSET, PSLARF, PSLARFG, PB_TOPGET, $ PB_TOPSET, PXERBLA, SGEBR2D, SGEBS2D, $ SLARFG, SSCAL * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MP + MAX( 1, NQ ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEQL2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( DESCA( M_ ).EQ.1 ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) IACOL = INDXG2P( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN I = II+(JJ+NQ-2)*DESCA( LLD_ ) AJJ = A( I ) CALL SLARFG( 1, AJJ, A( I ), 1, TAU( JJ+NQ-1 ) ) IF( N.GT.1 ) THEN ALPHA = ONE - TAU( JJ+NQ-1 ) CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1 ) CALL SSCAL( NQ-1, ALPHA, A( II+(JJ-1)*DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ+NQ-1 ), 1 ) A( I ) = AJJ ELSE IF( N.GT.1 ) THEN CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, IAROW, IACOL ) CALL SSCAL( NQ, ALPHA, A( II+(JJ-1)*DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ+NQ-1 ), 1, IAROW, IACOL ) END IF * ELSE * K = MIN( M, N ) DO 10 J = JA+K-1, JA, -1 I = IA + J - JA * * Generate elementary reflector H(j) to annihilate * A(ia:i+m-k-1,j+n-k) * CALL PSLARFG( M-K+I-IA+1, AJJ, M-K+I, N-K+J, A, IA, $ N-K+J, DESCA, 1, TAU ) * * Apply H(j) to A(ia:i+m-k,ja:j+n-k-1) from the left * CALL PSELSET( A, I+M-K, J+N-K, DESCA, ONE ) CALL PSLARF( 'Left', M-K+I-IA+1, N-K+J-JA, A, IA, N-K+J, $ DESCA, 1, TAU, A, IA, JA, DESCA, WORK ) CALL PSELSET( A, I+M-K, J+N-K, DESCA, AJJ ) * 10 CONTINUE * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGEQL2 * END scalapack-2.0.2/SRC/psgeqlf.f000644 000766 000024 00000027251 10363532303 016155 0ustar00juliestaff000000 000000 SUBROUTINE PSGEQLF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEQLF computes a QL factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * L. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M >= N, the * lower triangle of the distributed submatrix * A( IA+M-N:IA+M-1, JA:JA+N-1 ) contains the N-by-N lower * triangular matrix L; if M <= N, the elements on and below * the (N-M)-th superdiagonal contain the M by N lower * trapezoidal matrix L; the remaining elements, with the * array TAU, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension LOCc(JA+N-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( Mp0 + Nq0 + NB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja+k-1) . . . H(ja+1) H(ja), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in * A(ia:ia+m-k+i-2,ja+n-k+i-1), and tau in TAU(ja+n-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, IINFO, ICTXT, IPW, J, JB, JL, JN, $ K, LWMIN, MP0, MU, MYCOL, MYROW, NPCOL, NPROW, $ NQ0, NU * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSGEQL2, $ PSLARFB, PSLARFT, PB_TOPGET, PB_TOPSET, $ PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MP0 + NQ0 + DESCA( NB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEQLF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JL = MAX( ( (JA+N-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( JL.GE.JN+1 ) THEN * * Use blocked code initially * DO 10 J = JL, JN+1, -DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Compute the QL factorization of the current block * A(ia:ia+m-n+j+jb-ja-1,j:j+jb-1) * CALL PSGEQL2( M-N+J+JB-JA, JB, A, IA, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * IF( J.GT.JA ) THEN * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PSLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:ia+m-n+j+jb-ja-1,ja:j-1) from the * left * CALL PSLARFB( 'Left', 'Transpose', 'Backward', $ 'Columnwise', M-N+J+JB-JA, J-JA, JB, A, IA, $ J, DESCA, WORK, A, IA, JA, DESCA, $ WORK( IPW ) ) END IF * 10 CONTINUE * MU = M - N + JN - JA + 1 NU = JN - JA + 1 * ELSE * MU = M NU = N * END IF * * Use unblocked code to factor the last or only block * IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL PSGEQL2( MU, NU, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGEQLF * END scalapack-2.0.2/SRC/psgeqpf.f000644 000766 000024 00000050517 10363532303 016162 0ustar00juliestaff000000 000000 SUBROUTINE PSGEQPF( M, N, A, IA, JA, DESCA, IPIV, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 14, 2000 * * .. Scalar Arguments .. INTEGER IA, JA, INFO, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEQPF computes a QR factorization with column pivoting of a * M-by-N distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1): * * sub( A ) * P = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, repre- * sent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension LOCc(JA+N-1). * On exit, if IPIV(I) = K, the local i-th column of sub( A )*P * was the global K-th column of sub( A ). IPIV is tied to the * distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX(3,Mp0 + Nq0) + LOCc(JA+N-1)+Nq0. * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * LOCc(JA+N-1) = NUMROC( JA+N-1, NB_A, MYCOL, CSRC_A, NPCOL ) * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(1) H(2) . . . H(n) * * Each H(i) has the form * * H = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 * and v(i) = 1; v(i+1:m) is stored on exit in A(ia+i-1:ia+m-1,ja+i-1). * * The matrix P is represented in jpvt as follows: If * jpvt(j) = i * then the jth column of P is the ith canonical unit vector. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFF, ICTXT, ICURROW, $ ICURCOL, II, IIA, IOFFA, IPN, IPCOL, IPW, $ IROFF, ITEMP, J, JB, JJ, JJA, JJPVT, JN, KB, $ K, KK, KSTART, KSTEP, LDA, LL, LWMIN, MN, MP, $ MYCOL, MYROW, NPCOL, NPROW, NQ, NQ0, PVT REAL AJJ, ALPHA, TEMP, TEMP2 * .. * .. Local Arrays .. INTEGER DESCN( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, IGERV2D, $ IGESD2D, INFOG1L, INFOG2L, PCHK1MAT, PSAMAX, $ PSELSET, PSLARF, PSLARFG, PSNRM2, $ PXERBLA, SCOPY, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D, SLARFG, SSWAP * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, IFIX, MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) NQ0 = NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) LWMIN = MAX( 3, MP + NQ ) + NQ0 + NQ * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -10 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEQPF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF MN = MIN( M, N ) * * Initialize the array of pivots * LDA = DESCA( LLD_ ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) KSTEP = NPCOL * DESCA( NB_ ) * IF( MYCOL.EQ.IACOL ) THEN * * Handle first block separately * JB = JN - JA + 1 DO 10 LL = JJA, JJA+JB-1 IPIV( LL ) = JA + LL - JJA 10 CONTINUE KSTART = JN + KSTEP - DESCA( NB_ ) * * Loop over remaining block of columns * DO 30 KK = JJA+JB, JJA+NQ-1, DESCA( NB_ ) KB = MIN( JJA+NQ-KK, DESCA( NB_ ) ) DO 20 LL = KK, KK+KB-1 IPIV( LL ) = KSTART+LL-KK+1 20 CONTINUE KSTART = KSTART + KSTEP 30 CONTINUE ELSE KSTART = JN + ( MOD( MYCOL-IACOL+NPCOL, NPCOL )-1 )* $ DESCA( NB_ ) DO 50 KK = JJA, JJA+NQ-1, DESCA( NB_ ) KB = MIN( JJA+NQ-KK, DESCA( NB_ ) ) DO 40 LL = KK, KK+KB-1 IPIV( LL ) = KSTART+LL-KK+1 40 CONTINUE KSTART = KSTART + KSTEP 50 CONTINUE END IF * * Initialize partial column norms, handle first block separately * CALL DESCSET( DESCN, 1, DESCA( N_ ), 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), ICTXT, 1 ) * IPN = 1 IPW = IPN + NQ0 + NQ JJ = IPN + JJA - 1 IF( MYCOL.EQ.IACOL ) THEN DO 60 KK = 0, JB-1 CALL PSNRM2( M, WORK( JJ+KK ), A, IA, JA+KK, DESCA, 1 ) WORK( NQ+JJ+KK ) = WORK( JJ+KK ) 60 CONTINUE JJ = JJ + JB END IF ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining blocks of columns * DO 80 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 70 KK = 0, JB-1 CALL PSNRM2( M, WORK( JJ+KK ), A, IA, J+KK, DESCA, 1 ) WORK( NQ+JJ+KK ) = WORK( JJ+KK ) 70 CONTINUE JJ = JJ + JB END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) 80 CONTINUE * * Compute factorization * DO 120 J = JA, JA+MN-1 I = IA + J - JA * CALL INFOG1L( J, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), $ JJ, ICURCOL ) K = JA + N - J IF( K.GT.1 ) THEN CALL PSAMAX( K, TEMP, PVT, WORK( IPN ), 1, J, DESCN, $ DESCN( M_ ) ) ELSE PVT = J END IF IF( J.NE.PVT ) THEN CALL INFOG1L( PVT, DESCA( NB_ ), NPCOL, MYCOL, $ DESCA( CSRC_ ), JJPVT, IPCOL ) IF( ICURCOL.EQ.IPCOL ) THEN IF( MYCOL.EQ.ICURCOL ) THEN CALL SSWAP( MP, A( IIA+(JJ-1)*LDA ), 1, $ A( IIA+(JJPVT-1)*LDA ), 1 ) ITEMP = IPIV( JJPVT ) IPIV( JJPVT ) = IPIV( JJ ) IPIV( JJ ) = ITEMP WORK( IPN+JJPVT-1 ) = WORK( IPN+JJ-1 ) WORK( IPN+NQ+JJPVT-1 ) = WORK( IPN+NQ+JJ-1 ) END IF ELSE IF( MYCOL.EQ.ICURCOL ) THEN * CALL SGESD2D( ICTXT, MP, 1, A( IIA+(JJ-1)*LDA ), LDA, $ MYROW, IPCOL ) WORK( IPW ) = REAL( IPIV( JJ ) ) WORK( IPW+1 ) = WORK( IPN + JJ - 1 ) WORK( IPW+2 ) = WORK( IPN + NQ + JJ - 1 ) CALL SGESD2D( ICTXT, 3, 1, WORK( IPW ), 3, MYROW, $ IPCOL ) * CALL SGERV2D( ICTXT, MP, 1, A( IIA+(JJ-1)*LDA ), LDA, $ MYROW, IPCOL ) CALL IGERV2D( ICTXT, 1, 1, IPIV( JJ ), 1, MYROW, $ IPCOL ) * ELSE IF( MYCOL.EQ.IPCOL ) THEN * CALL SGESD2D( ICTXT, MP, 1, A( IIA+(JJPVT-1)*LDA ), $ LDA, MYROW, ICURCOL ) CALL IGESD2D( ICTXT, 1, 1, IPIV( JJPVT ), 1, MYROW, $ ICURCOL ) * CALL SGERV2D( ICTXT, MP, 1, A( IIA+(JJPVT-1)*LDA ), $ LDA, MYROW, ICURCOL ) CALL SGERV2D( ICTXT, 3, 1, WORK( IPW ), 3, MYROW, $ ICURCOL ) IPIV( JJPVT ) = IFIX( WORK( IPW ) ) WORK( IPN+JJPVT-1 )= WORK( IPW+1 ) WORK( IPN+NQ+JJPVT-1 ) = WORK( IPW+2 ) * END IF * END IF * END IF * * Generate elementary reflector H(i) * CALL INFOG1L( I, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ II, ICURROW ) IF( DESCA( M_ ).EQ.1 ) THEN IF( MYROW.EQ.ICURROW ) THEN IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*DESCA( LLD_ ) AJJ = A( IOFFA ) CALL SLARFG( 1, AJJ, A( IOFFA ), 1, TAU( JJ ) ) IF( N.GT.1 ) THEN ALPHA = ONE - TAU( JJ ) CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1 ) CALL SSCAL( NQ-JJ, ALPHA, A( IOFFA+DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ ), 1 ) A( IOFFA ) = AJJ ELSE IF( N.GT.1 ) THEN CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, ICURROW, ICURCOL ) CALL SSCAL( NQ-JJ+1, ALPHA, A( I ), DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.ICURCOL ) THEN CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), $ 1, ICURROW, ICURCOL ) END IF * ELSE * CALL PSLARFG( M-J+JA, AJJ, I, J, A, MIN( I+1, IA+M-1 ), J, $ DESCA, 1, TAU ) IF( J.LT.JA+N-1 ) THEN * * Apply H(i) to A(ia+j-ja:ia+m-1,j+1:ja+n-1) from the left * CALL PSELSET( A, I, J, DESCA, ONE ) CALL PSLARF( 'Left', M-J+JA, JA+N-1-J, A, I, J, DESCA, $ 1, TAU, A, I, J+1, DESCA, WORK( IPW ) ) END IF CALL PSELSET( A, I, J, DESCA, AJJ ) * END IF * * Update partial columns norms * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 IF( MOD( J, DESCA( NB_ ) ).EQ.0 ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) IF( (JJA+NQ-JJ).GT.0 ) THEN IF( MYROW.EQ.ICURROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, JJA+NQ-JJ, $ A( II+( MIN( JJA+NQ-1, JJ )-1 )*LDA ), $ LDA ) CALL SCOPY( JJA+NQ-JJ, A( II+( MIN( JJA+NQ-1, JJ ) $ -1)*LDA ), LDA, WORK( IPW+MIN( JJA+NQ-1, $ JJ )-1 ), 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', JJA+NQ-JJ, 1, $ WORK( IPW+MIN( JJA+NQ-1, JJ )-1 ), $ MAX( 1, NQ ), ICURROW, MYCOL ) END IF END IF * JN = MIN( ICEIL( J+1, DESCA( NB_ ) ) * DESCA( NB_ ), $ JA + N - 1 ) IF( MYCOL.EQ.ICURCOL ) THEN DO 90 LL = JJ-1, JJ + JN - J - 2 IF( WORK( IPN+LL ).NE.ZERO ) THEN TEMP = ONE-( ABS( WORK( IPW+LL ) ) / $ WORK( IPN+LL ) )**2 TEMP = MAX( TEMP, ZERO ) TEMP2 = ONE + 0.05E+0*TEMP* $ ( WORK( IPN+LL ) / WORK( IPN+NQ+LL ) )**2 IF( TEMP2.EQ.ONE ) THEN IF( IA+M-1.GT.I ) THEN CALL PSNRM2( IA+M-I-1, WORK( IPN+LL ), A, I+1, $ J+LL-JJ+2, DESCA, 1 ) WORK( IPN+NQ+LL ) = WORK( IPN+LL ) ELSE WORK( IPN+LL ) = ZERO WORK( IPN+NQ+LL ) = ZERO END IF ELSE WORK( IPN+LL ) = WORK( IPN+LL ) * SQRT( TEMP ) END IF END IF 90 CONTINUE JJ = JJ + JN - J END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 110 K = JN+1, JA+N-1, DESCA( NB_ ) KB = MIN( JA+N-K, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 100 LL = JJ-1, JJ+KB-2 IF( WORK( IPN+LL ).NE.ZERO ) THEN TEMP = ONE-( ABS( WORK( IPW+LL ) ) / $ WORK( IPN+LL ) )**2 TEMP = MAX( TEMP, ZERO ) TEMP2 = ONE + 0.05E+0*TEMP* $ ( WORK( IPN+LL ) / WORK( IPN+NQ+LL ) )**2 IF( TEMP2.EQ.ONE ) THEN IF( IA+M-1.GT.I ) THEN CALL PSNRM2( IA+M-I-1, WORK( IPN+LL ), A, $ I+1, K+LL-JJ+1, DESCA, 1 ) WORK( IPN+NQ+LL ) = WORK( IPN+LL ) ELSE WORK( IPN+LL ) = ZERO WORK( IPN+NQ+LL ) = ZERO END IF ELSE WORK( IPN+LL ) = WORK( IPN+LL ) * SQRT( TEMP ) END IF END IF 100 CONTINUE JJ = JJ + KB END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * 110 CONTINUE * 120 CONTINUE * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGEQPF * END scalapack-2.0.2/SRC/psgeqr2.f000644 000766 000024 00000027437 10363532303 016105 0ustar00juliestaff000000 000000 SUBROUTINE PSGEQR2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEQR2 computes a QR factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Mp0 + MAX( 1, Nq0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(m,n). * * Each H(i) has the form * * H(j) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 * and v(i) = 1; v(i+1:m) is stored on exit in A(ia+i:ia+m-1,ja+i-1), * and tau in TAU(ja+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, II, IACOL, IAROW, ICTXT, J, JJ, K, LWMIN, $ MP, MYCOL, MYROW, NPCOL, NPROW, NQ REAL AJJ, ALPHA * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PSELSET, PSLARF, PSLARFG, PB_TOPGET, $ PB_TOPSET, PXERBLA, SGEBR2D, SGEBS2D, $ SLARFG, SSCAL * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MP + MAX( 1, NQ ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEQR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) I = II+(JJ-1)*DESCA( LLD_ ) IF( MYCOL.EQ.IACOL ) THEN AJJ = A( I ) CALL SLARFG( 1, AJJ, A( I ), 1, TAU( JJ ) ) IF( N.GT.1 ) THEN ALPHA = ONE - TAU( JJ ) CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1 ) CALL SSCAL( NQ-JJ, ALPHA, A( I+DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), $ 1 ) A( I ) = AJJ ELSE IF( N.GT.1 ) THEN CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, IAROW, IACOL ) CALL SSCAL( NQ-JJ+1, ALPHA, A( I ), DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), 1, $ IAROW, IACOL ) END IF * ELSE * K = MIN( M, N ) DO 10 J = JA, JA+K-1 I = IA + J - JA * * Generate elementary reflector H(j) to annihilate * A(i+1:ia+m-1,j) * CALL PSLARFG( M-J+JA, AJJ, I, J, A, MIN( I+1, IA+M-1 ), J, $ DESCA, 1, TAU ) IF( J.LT.JA+N-1 ) THEN * * Apply H(j)' to A(i:ia+m-1,j+1:ja+n-1) from the left * CALL PSELSET( A, I, J, DESCA, ONE ) * CALL PSLARF( 'Left', M-J+JA, N-J+JA-1, A, I, J, DESCA, 1, $ TAU, A, I, J+1, DESCA, WORK ) END IF CALL PSELSET( A, I, J, DESCA, AJJ ) * 10 CONTINUE * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGEQR2 * END scalapack-2.0.2/SRC/psgeqrf.f000644 000766 000024 00000027423 10363532303 016164 0ustar00juliestaff000000 000000 SUBROUTINE PSGEQRF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGEQRF computes a QR factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( Mp0 + Nq0 + NB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(m,n). * * Each H(i) has the form * * H(j) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 * and v(i) = 1; v(i+1:m) is stored on exit in A(ia+i:ia+m-1,ja+i-1), * and tau in TAU(ja+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IINFO, IPW, J, $ JB, JN, K, LWMIN, MP0, MYCOL, MYROW, NPCOL, $ NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSGEQR2, $ PSLARFB, PSLARFT, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MP0 + NQ0 + DESCA( NB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGEQRF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JB = JN - JA + 1 * * Compute the QR factorization of the first block A(ia:ia+m-1,ja:jn) * CALL PSGEQR2( M, JB, A, IA, JA, DESCA, TAU, WORK, LWORK, IINFO ) * IF( JA+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(ja) H(ja+1) . . . H(jn) * CALL PSLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:ia+m-1,ja+jb:ja+n-1) from the left * CALL PSLARFB( 'Left', 'Transpose', 'Forward', 'Columnwise', M, $ N-JB, JB, A, IA, JA, DESCA, WORK, A, IA, JA+JB, $ DESCA, WORK( IPW ) ) END IF * * Loop over the remaining blocks of columns * DO 10 J = JN+1, JA+K-1, DESCA( NB_ ) JB = MIN( K-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Compute the QR factorization of the current block * A(i:ia+m-1,j:j+jb-1) * CALL PSGEQR2( M-J+JA, JB, A, I, J, DESCA, TAU, WORK, LWORK, $ IINFO ) * IF( J+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PSLARFT( 'Forward', 'Columnwise', M-J+JA, JB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(i:ia+m-1,j+jb:ja+n-1) from the left * CALL PSLARFB( 'Left', 'Transpose', 'Forward', 'Columnwise', $ M-J+JA, N-J-JB+JA, JB, A, I, J, DESCA, WORK, $ A, I, J+JB, DESCA, WORK( IPW ) ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGEQRF * END scalapack-2.0.2/SRC/psgerfs.f000644 000766 000024 00000101774 10363532303 016170 0ustar00juliestaff000000 000000 SUBROUTINE PSGERFS( TRANS, N, NRHS, A, IA, JA, DESCA, AF, IAF, $ JAF, DESCAF, IPIV, B, IB, JB, DESCB, X, IX, $ JX, DESCX, FERR, BERR, WORK, LWORK, IWORK, $ LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IAF, IB, IX, INFO, JA, JAF, JB, JX, $ LIWORK, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ),IPIV( * ), IWORK( * ) REAL A( * ), AF( * ), B( * ), BERR( * ), FERR( * ), $ WORK( * ), X( * ) * .. * * Purpose * ======= * * PSGERFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solutions. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations. * = 'N': sub( A ) * sub( X ) = sub( B ) (No transpose) * = 'T': sub( A )**T * sub( X ) = sub( B ) (Transpose) * = 'C': sub( A )**T * sub( X ) = sub( B ) * (Conjugate transpose = Transpose) * * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) REAL pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input) REAL pointer into the local * memory to an array of local dimension (LLD_AF,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * factors of the matrix sub( A ) = P * L * U as computed by * PSGETRF. * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * IPIV (local input) INTEGER array of dimension LOCr(M_AF)+MB_AF. * This array contains the pivoting information as computed * by PSGETRF. IPIV(i) -> The global row local row i * was swapped with. This array is tied to the distributed * matrix A. * * B (local input) REAL pointer into the local * memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1)). This array contains the local * pieces of the distributed matrix of right hand sides * sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input and output) REAL pointer into the * local memory to an array of local dimension * (LLD_X,LOCc(JX+NRHS-1)). On entry, this array contains * the local pieces of the distributed matrix solution * sub( X ). On exit, the improved solution vectors. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). * The estimated forward error bound for each solution vector * of sub( X ). If XTRUE is the true solution corresponding * to sub( X ), FERR is an estimated upper bound for the * magnitude of the largest element in (sub( X ) - XTRUE) * divided by the magnitude of the largest element in sub( X ). * The estimate is as reliable as the estimate for RCOND, and * is almost always a slight overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*LOCr( N + MOD(IA-1,MB_A) ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr( N + MOD(IB-1,MB_B) ). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices op( A ) and op( AF ) (respectively * sub( X ) and sub( B ) ) should be distributed the same way on the * same processes. These conditions ensure that sub( A ) and sub( AF ) * (resp. sub( X ) and sub( B ) ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( AF ), sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IAF, DESCAF( MB_ ) ) = f( JAF, DESCAF( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) REAL TWO, THREE PARAMETER ( TWO = 2.0E+0, THREE = 3.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOTRAN CHARACTER TRANST INTEGER COUNT, IACOL, IAFCOL, IAFROW, IAROW, IXBCOL, $ IXBROW, IXCOL, IXROW, ICOFFA, ICOFFAF, ICOFFB, $ ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, IIW, $ IOFFXB, IPB, IPR, IPV, IROFFA, IROFFAF, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LIWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ REAL EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG2L, $ PCHK2MAT, PSAGEMV, PSAXPY, PSCOPY, $ PSGEMV, PSGETRS, PSLACON, PXERBLA, $ SGAMX2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * NOTRAN = LSAME( TRANS, 'N' ) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, N, 2, IAF, JAF, DESCAF, 11, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 16, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IX, JX, DESCX, 20, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFAF = MOD( JAF-1, DESCAF( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFCOL = INDXG2P( JAF, DESCAF( NB_ ), MYCOL, $ DESCAF( CSRC_ ), NPCOL ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 3 * NPMOD LIWMIN = NPMOD WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( ( .NOT.NOTRAN ) .AND. ( .NOT.LSAME( TRANS, 'T' ) ) .AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700 + NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCAF( MB_ ) ) THEN INFO = -( 1100 + MB_ ) ELSE IF( IROFFAF.NE.0 .OR. IAROW.NE.IAFROW ) THEN INFO = -9 ELSE IF( DESCA( NB_ ).NE.DESCAF( NB_ ) ) THEN INFO = -( 1100 + NB_ ) ELSE IF( ICOFFAF.NE.0 .OR. IACOL.NE.IAFCOL ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -( 1100 + CTXT_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -14 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1600 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1600 + CTXT_ ) ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 2000 + MB_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -18 ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 2000 + NB_ ) ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -19 ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 2000 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -24 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -26 END IF END IF * IF( NOTRAN ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 1 ) = ICHAR( 'T' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 IDUM1( 2 ) = N IDUM2( 2 ) = 2 IDUM1( 3 ) = NRHS IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 24 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 26 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, N, 2, IAF, $ JAF, DESCAF, 11, 5, IDUM1, IDUM2, INFO ) CALL PCHK2MAT( N, 2, NRHS, 3, IB, JB, DESCB, 16, N, 2, NRHS, 3, $ IX, JX, DESCX, 20, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGERFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = IPB + NP0 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = 1 + maximum number of nonzero entries in each row of sub( A ) * NZ = N + 1 EPS = PSLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) ) * DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 100 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 20 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X), * where op(sub(A)) = sub(A), or sub(A)' (A**T or A**H), * depending on TRANS. * CALL PSCOPY( N, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PSGEMV( TRANS, N, N, -ONE, A, IA, JA, DESCA, X, IX, $ JX+K, DESCX, 1, ONE, WORK( IPR ), IW, JW, $ DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIXB, IIXB + NP - 1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 30 CONTINUE END IF END IF * CALL PSAGEMV( TRANS, N, N, ONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, ONE, WORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 40 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during the * last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PSGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PSAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, IX, $ JX+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 20 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(op(sub(A))))* * ( abs(R) + NZ*EPS*( * abs(op(sub(A)))*abs(sub(X))+abs(sub(B)))))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(sub(A))) is the inverse of op(sub(A)) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(op(sub(A)))*abs(sub(X)) + abs(sub(B)) is less than * SAFE2. * * Use PSLACON to estimate the infinity-norm of the matrix * inv(op(sub(A))) * diag(W), where * W = abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 50 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 50 CONTINUE END IF END IF * KASE = 0 60 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PSLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(sub(A))'). * CALL PSGETRS( TRANST, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF ELSE * * Multiply by inv(op(sub(A)))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 80 CONTINUE END IF END IF * CALL PSGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 60 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 90 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 90 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 100 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 200 J = JN+1, JB+NRHS-1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 190 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 110 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X), * where op(sub(A)) = sub(A), or sub(A)' (A**T or A**H), * depending on TRANS. * CALL PSCOPY( N, B, IB, J+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PSGEMV( TRANS, N, N, -ONE, A, IA, JA, DESCA, X, $ IX, J+K, DESCX, 1, ONE, WORK( IPR ), IW, JW, $ DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) (abs(R(i))/(abs(op(sub(A)))*abs(sub(X)) + * abs(sub(B)))(i)) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIXB, IIXB+NP-1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 120 CONTINUE END IF END IF * CALL PSAGEMV( TRANS, N, N, ONE, A, IA, JA, DESCA, X, IX, $ J+K, DESCX, 1, ONE, WORK( IPB ), IW, JW, $ DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 )THEN DO 130 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 130 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.ICURCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during * the last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. $ COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PSGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PSAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, $ IX, J+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 110 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(op(sub(A))))* * ( abs(R) + NZ*EPS*( * abs(op(sub(A)))*abs(sub(X))+abs(sub(B)))))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(sub(A))) is the inverse of op(sub(A)) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(op(sub(A)))*abs(sub(X)) + abs(sub(B)) is less than * SAFE2. * * Use PSLACON to estimate the infinity-norm of the matrix * inv(op(sub(A))) * diag(W), where * W = abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 140 CONTINUE END IF END IF * KASE = 0 150 CONTINUE IF( MYCOL.EQ.ICURCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, ICURCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PSLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = ICURCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(sub(A))'). * CALL PSGETRS( TRANST, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 160 CONTINUE END IF END IF ELSE * * Multiply by inv(op(sub(A)))*diag(W). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 170 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 170 CONTINUE END IF END IF * CALL PSGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, $ INFO ) END IF GO TO 150 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 180 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 180 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, $ 1, IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 190 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 200 CONTINUE * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PSGERFS * END scalapack-2.0.2/SRC/psgerq2.f000644 000766 000024 00000024651 10363532303 016100 0ustar00juliestaff000000 000000 SUBROUTINE PSGERQ2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGERQ2 computes a RQ factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = R * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAU, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia) H(ia+1) . . . H(ia+k-1), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; v(1:n-k+i-1) is stored on exit in * A(ia+m-k+i-1,ja:ja+n-k+i-2), and tau in TAU(ia+m-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, J, K, LWMIN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ REAL AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PSELSET, PSLARF, PSLARFG, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQ + MAX( 1, MP ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGERQ2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * K = MIN( M, N ) DO 10 I = IA+K-1, IA, -1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * A(i+m-k,ja:j+n-k-1) * CALL PSLARFG( N-K+J-JA+1, AII, I+M-K, J+N-K, A, I+M-K, JA, $ DESCA, DESCA( M_ ), TAU ) * * Apply H(i) to A(ia:i+m-k-1,ja:j+n-k) from the right * CALL PSELSET( A, I+M-K, J+N-K, DESCA, ONE ) CALL PSLARF( 'Right', M-K+I-IA, N-K+J-JA+1, A, M-K+I, JA, $ DESCA, DESCA( M_ ), TAU, A, IA, JA, DESCA, WORK ) CALL PSELSET( A, I+M-K, J+N-K, DESCA, AII ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGERQ2 * END scalapack-2.0.2/SRC/psgerqf.f000644 000766 000024 00000027147 10363532303 016167 0ustar00juliestaff000000 000000 SUBROUTINE PSGERQF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSGERQF computes a RQ factorization of a real distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = R * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAU, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia) H(ia+1) . . . H(ia+k-1), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; v(1:n-k+i-1) is stored on exit in * A(ia+m-k+i-1,ja:ja+n-k+i-2), and tau in TAU(ia+m-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IL, IN, IPW, $ K, LWMIN, MP0, MU, MYCOL, MYROW, NPCOL, NPROW, $ NQ0, NU * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSGERQ2, $ PSLARFB, PSLARFT, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGERQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IL = MAX( ( (IA+M-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * IF( IL.GE.IN+1 ) THEN * * Use blocked code initially * DO 10 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Compute the RQ factorization of the current block * A(i:i+ib-1,ja:ja+n-m+i+ib-ia-1) * CALL PSGERQ2( IB, N-M+I+IB-IA, A, I, JA, DESCA, TAU, WORK, $ LWORK, IINFO ) * IF( I.GT.IA ) THEN * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PSLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, $ I, JA, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:i-1,ja:ja+n-m+i+ib-ia-1) from the * right * CALL PSLARFB( 'Right', 'No transpose', 'Backward', $ 'Rowwise', I-IA, N-M+I+IB-IA, IB, A, I, JA, $ DESCA, WORK, A, IA, JA, DESCA, $ WORK( IPW ) ) END IF * 10 CONTINUE * MU = IN - IA + 1 NU = N - M + IN - IA + 1 * ELSE * MU = M NU = N * END IF * * Use unblocked code to factor the last or only block * IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL PSGERQ2( MU, NU, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSGERQF * END scalapack-2.0.2/SRC/psgesv.f000644 000766 000024 00000023077 10367447133 016037 0ustar00juliestaff000000 000000 SUBROUTINE PSGESV( N, NRHS, A, IA, JA, DESCA, IPIV, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * Jan 30, 2006 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) REAL A( * ), B( * ) * .. * * Purpose * ======= * * PSGESV computes the solution to a real system of linear equations * * sub( A ) * X = sub( B ), * * where sub( A ) = A(IA:IA+N-1,JA:JA+N-1) is an N-by-N distributed * matrix and X and sub( B ) = B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS * distributed matrices. * * The LU decomposition with partial pivoting and row interchanges is * used to factor sub( A ) as sub( A ) = P * L * U, where P is a permu- * tation matrix, L is unit lower triangular, and U is upper triangular. * L and U are stored in sub( A ). The factored form of sub( A ) is then * used to solve the system of equations sub( A ) * X = sub( B ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the N-by-N distributed matrix * sub( A ) to be factored. On exit, this array contains the * local pieces of the factors L and U from the factorization * sub( A ) = P*L*U; the unit diagonal elements of L are not * stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * B (local input/local output) REAL pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, the right hand side * distributed matrix sub( B ). On exit, if INFO = 0, sub( B ) * is overwritten by the solution distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, so the solution could not be * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSGETRF, $ PSGETRS, PXERBLA * .. * .. External Functions .. INTEGER INDXG2P EXTERNAL INDXG2P * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( N, 1, NRHS, 2, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IF( IROFFA.NE.0 ) THEN INFO = -4 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( IBROW.NE.IAROW .OR. ICOFFA.NE.IROFFB ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1100+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(1100+CTXT_) END IF END IF CALL PCHK2MAT( N, 1, N, 1, IA, JA, DESCA, 6, N, 1, NRHS, 2, $ IB, JB, DESCB, 11, 0, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGESV', -INFO ) RETURN END IF * * Compute the LU factorization of sub( A ). * CALL PSGETRF( N, N, A, IA, JA, DESCA, IPIV, INFO ) * IF( INFO.EQ.0 ) THEN * * Solve the system sub( A ) * X = sub( B ), overwriting sub( B ) * with X. * CALL PSGETRS( 'No transpose', N, NRHS, A, IA, JA, DESCA, IPIV, $ B, IB, JB, DESCB, INFO ) * END IF * RETURN * * End of PSGESV * END scalapack-2.0.2/SRC/psgesvd.f000644 000766 000024 00000055154 10377355407 016207 0ustar00juliestaff000000 000000 SUBROUTINE PSGESVD(JOBU,JOBVT,M,N,A,IA,JA,DESCA,S,U,IU,JU,DESCU, + VT,IVT,JVT,DESCVT,WORK,LWORK,INFO) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Oak Ridge National Laboratory * and Univ. of California Berkeley. * Jan 2006 * * .. Scalar Arguments .. CHARACTER JOBU,JOBVT INTEGER IA,INFO,IU,IVT,JA,JU,JVT,LWORK,M,N * .. * .. Array Arguments .. INTEGER DESCA(*),DESCU(*),DESCVT(*) REAL A(*),U(*),VT(*),WORK(*) REAL S(*) * .. * * Purpose * ======= * * PSGESVD computes the singular value decomposition (SVD) of an * M-by-N matrix A, optionally computing the left and/or right * singular vectors. The SVD is written as * * A = U * SIGMA * transpose(V) * * where SIGMA is an M-by-N matrix which is zero except for its * min(M,N) diagonal elements, U is an M-by-M orthogonal matrix, and * V is an N-by-N orthogonal matrix. The diagonal elements of SIGMA * are the singular values of A and the columns of U and V are the * corresponding right and left singular vectors, respectively. The * singular values are returned in array S in decreasing order and * only the first min(M,N) columns of U and rows of VT = V**T are * computed. * * Notes * ===== * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, and * assume that its process grid has dimension r x c. LOCr( K ) denotes * the number of elements of K that a process would receive if K were * distributed over the r processes of its process column. Similarly, * LOCc( K ) denotes the number of elements of K that a process would * receive if K were distributed over the c processes of its process * row. The values of LOCr() and LOCc() may be determined via a call * to the ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MP = number of local rows in A and U * NQ = number of local columns in A and VT * SIZE = min( M, N ) * SIZEQ = number of local columns in U * SIZEP = number of local rows in VT * * JOBU (global input) CHARACTER*1 * Specifies options for computing U: * = 'V': the first SIZE columns of U (the left singular * vectors) are returned in the array U; * = 'N': no columns of U (no left singular vectors) are * computed. * * JOBVT (global input) CHARACTER*1 * Specifies options for computing V**T: * = 'V': the first SIZE rows of V**T (the right singular * vectors) are returned in the array VT; * = 'N': no rows of V**T (no right singular vectors) are * computed. * * M (global input) INTEGER * The number of rows of the input matrix A. M >= 0. * * N (global input) INTEGER * The number of columns of the input matrix A. N >= 0. * * A (local input/workspace) block cyclic REAL * array, * global dimension (M, N), local dimension (MP, NQ) * On exit, the contents of A are destroyed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix A. * * S (global output) REAL array, dimension SIZE * The singular values of A, sorted so that S(i) >= S(i+1). * * U (local output) REAL array, local dimension * (MP, SIZEQ), global dimension (M, SIZE) * if JOBU = 'V', U contains the first min(m,n) columns of U * if JOBU = 'N', U is not referenced. * * IU (global input) INTEGER * The row index in the global array U indicating the first * row of sub( U ). * * JU (global input) INTEGER * The column index in the global array U indicating the * first column of sub( U ). * * DESCU (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix U. * * VT (local output) REAL array, local dimension * (SIZEP, NQ), global dimension (SIZE, N). * If JOBVT = 'V', VT contains the first SIZE rows of * V**T. If JOBVT = 'N', VT is not referenced. * * IVT (global input) INTEGER * The row index in the global array VT indicating the first * row of sub( VT ). * * JVT (global input) INTEGER * The column index in the global array VT indicating the * first column of sub( VT ). * * DESCVT (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix VT. * * WORK (local workspace/output) REAL array, dimension * (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * LWORK >= 1 + 6*SIZEB + MAX(WATOBD, WBDTOSVD), * * where SIZEB = MAX(M,N), and WATOBD and WBDTOSVD refer, * respectively, to the workspace required to bidiagonalize * the matrix A and to go from the bidiagonal matrix to the * singular value decomposition U*S*VT. * * For WATOBD, the following holds: * * WATOBD = MAX(MAX(WPSLANGE,WPSGEBRD), * MAX(WPSLARED2D,WP(pre)LARED1D)), * * where WPSLANGE, WPSLARED1D, WPSLARED2D, WPSGEBRD are the * workspaces required respectively for the subprograms * PSLANGE, PSLARED1D, PSLARED2D, PSGEBRD. Using the * standard notation * * MP = NUMROC( M, MB, MYROW, DESCA( CTXT_ ), NPROW), * NQ = NUMROC( N, NB, MYCOL, DESCA( LLD_ ), NPCOL), * * the workspaces required for the above subprograms are * * WPSLANGE = MP, * WPSLARED1D = NQ0, * WPSLARED2D = MP0, * WPSGEBRD = NB*(MP + NQ + 1) + NQ, * * where NQ0 and MP0 refer, respectively, to the values obtained * at MYCOL = 0 and MYROW = 0. In general, the upper limit for * the workspace is given by a workspace required on * processor (0,0): * * WATOBD <= NB*(MP0 + NQ0 + 1) + NQ0. * * In case of a homogeneous process grid this upper limit can * be used as an estimate of the minimum workspace for every * processor. * * For WBDTOSVD, the following holds: * * WBDTOSVD = SIZE*(WANTU*NRU + WANTVT*NCVT) + * MAX(WSBDSQR, * MAX(WANTU*WPSORMBRQLN, WANTVT*WPSORMBRPRT)), * * where * * 1, if left(right) singular vectors are wanted * WANTU(WANTVT) = * 0, otherwise * * and WSBDSQR, WPSORMBRQLN and WPSORMBRPRT refer respectively * to the workspace required for the subprograms SBDSQR, * PSORMBR(QLN), and PSORMBR(PRT), where QLN and PRT are the * values of the arguments VECT, SIDE, and TRANS in the call * to PSORMBR. NRU is equal to the local number of rows of * the matrix U when distributed 1-dimensional "column" of * processes. Analogously, NCVT is equal to the local number * of columns of the matrix VT when distributed across * 1-dimensional "row" of processes. Calling the LAPACK * procedure SBDSQR requires * * WSBDSQR = MAX(1, 4*SIZE ) * * on every processor. Finally, * * WPSORMBRQLN = MAX( (NB*(NB-1))/2, (SIZEQ+MP)*NB)+NB*NB, * WPSORMBRPRT = MAX( (MB*(MB-1))/2, (SIZEP+NQ)*MB )+MB*MB, * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the work array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if SBDSQR did not converge * If INFO = MIN(M,N) + 1, then PSGESVD has detected * heterogeneity by finding that eigenvalues were not * identical across the process grid. In this case, the * accuracy of the results from PSGESVD cannot be * guaranteed. * * ===================================================================== * * The results of PSGEBRD, and therefore PSGESVD, may vary slightly * from run to run with the same input data. If repeatability is an * issue, call BLACS_SET with the appropriate option after defining * the process grid. * * Alignment requirements * ====================== * * The routine PSGESVD inherits the same alignement requirement as * the routine PSGEBRD, namely: * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * * ===================================================================== * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D,DLEN_,DTYPE_,CTXT_,M_,N_,MB_,NB_,RSRC_, + CSRC_,LLD_,ITHVAL PARAMETER (BLOCK_CYCLIC_2D=1,DLEN_=9,DTYPE_=1,CTXT_=2,M_=3,N_=4, + MB_=5,NB_=6,RSRC_=7,CSRC_=8,LLD_=9,ITHVAL=10) REAL ZERO,ONE PARAMETER (ZERO= (0.0E+0),ONE= (1.0E+0)) * .. * .. Local Scalars .. CHARACTER UPLO INTEGER CONTEXTC,CONTEXTR,I,INDD,INDD2,INDE,INDE2,INDTAUP,INDTAUQ, + INDU,INDV,INDWORK,IOFFD,IOFFE,ISCALE,J,K,LDU,LDVT,LLWORK, + LWMIN,MAXIM,MB,MP,MYPCOL,MYPCOLC,MYPCOLR,MYPROW,MYPROWC, + MYPROWR,NB,NCVT,NPCOL,NPCOLC,NPCOLR,NPROCS,NPROW,NPROWC, + NPROWR,NQ,NRU,SIZE,SIZEB,SIZEP,SIZEPOS,SIZEQ,WANTU,WANTVT, + WATOBD,WBDTOSVD,WSBDSQR,WPSGEBRD,WPSLANGE,WPSORMBRPRT, + WPSORMBRQLN REAL ANRM,BIGNUM,EPS,RMAX,RMIN,SAFMIN,SIGMA,SMLNUM * .. * .. Local Arrays .. INTEGER DESCTU(DLEN_),DESCTVT(DLEN_),IDUM1(3),IDUM2(3) REAL C(1,1) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL PSLAMCH,PSLANGE EXTERNAL LSAME,NUMROC,PDLAMCH,PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GET,BLACS_GRIDEXIT,BLACS_GRIDINFO,BLACS_GRIDINIT, + CHK1MAT,SBDSQR,DESCINIT,SGAMN2D,SGAMX2D,SSCAL,IGAMX2D, + IGEBR2D,IGEBS2D,PCHK1MAT,PSGEBRD,PSGEMR2D,PSLARED1D, + PSLARED2D,PSLASCL,PSLASET,PSORMBR,PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX,MIN,SQRT,REAL * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF (BLOCK_CYCLIC_2D*DTYPE_*LLD_*MB_*M_*NB_*N_.LT.0) RETURN * CALL BLACS_GRIDINFO(DESCA(CTXT_),NPROW,NPCOL,MYPROW,MYPCOL) ISCALE = 0 INFO = 0 * IF (NPROW.EQ.-1) THEN INFO = - (800+CTXT_) ELSE * SIZE = MIN(M,N) SIZEB = MAX(M,N) NPROCS = NPROW*NPCOL IF (M.GE.N) THEN IOFFD = JA - 1 IOFFE = IA - 1 SIZEPOS = 1 ELSE IOFFD = IA - 1 IOFFE = JA - 1 SIZEPOS = 3 END IF * IF (LSAME(JOBU,'V')) THEN WANTU = 1 ELSE WANTU = 0 END IF IF (LSAME(JOBVT,'V')) THEN WANTVT = 1 ELSE WANTVT = 0 END IF * CALL CHK1MAT(M,3,N,4,IA,JA,DESCA,8,INFO) IF (WANTU.EQ.1) THEN CALL CHK1MAT(M,3,SIZE,SIZEPOS,IU,JU,DESCU,13,INFO) END IF IF (WANTVT.EQ.1) THEN CALL CHK1MAT(SIZE,SIZEPOS,N,4,IVT,JVT,DESCVT,17,INFO) END IF CALL IGAMX2D(DESCA(CTXT_),'A',' ',1,1,INFO,1,1,1,-1,-1,0) * IF (INFO.EQ.0) THEN * * Set up pointers into the WORK array. * INDD = 2 INDE = INDD + SIZEB + IOFFD INDD2 = INDE + SIZEB + IOFFE INDE2 = INDD2 + SIZEB + IOFFD * INDTAUQ = INDE2 + SIZEB + IOFFE INDTAUP = INDTAUQ + SIZEB + JA - 1 INDWORK = INDTAUP + SIZEB + IA - 1 LLWORK = LWORK - INDWORK + 1 * * Initialize contexts for "column" and "row" process matrices. * CALL BLACS_GET(DESCA(CTXT_),10,CONTEXTC) CALL BLACS_GRIDINIT(CONTEXTC,'R',NPROCS,1) CALL BLACS_GRIDINFO(CONTEXTC,NPROWC,NPCOLC,MYPROWC, + MYPCOLC) CALL BLACS_GET(DESCA(CTXT_),10,CONTEXTR) CALL BLACS_GRIDINIT(CONTEXTR,'R',1,NPROCS) CALL BLACS_GRIDINFO(CONTEXTR,NPROWR,NPCOLR,MYPROWR, + MYPCOLR) * * Set local dimensions of matrices (this is for MB=NB=1). * NRU = NUMROC(M,1,MYPROWC,0,NPROCS) NCVT = NUMROC(N,1,MYPCOLR,0,NPROCS) NB = DESCA(NB_) MB = DESCA(MB_) MP = NUMROC(M,MB,MYPROW,DESCA(RSRC_),NPROW) NQ = NUMROC(N,NB,MYPCOL,DESCA(CSRC_),NPCOL) IF (WANTVT.EQ.1) THEN SIZEP = NUMROC(SIZE,DESCVT(MB_),MYPROW,DESCVT(RSRC_), + NPROW) ELSE SIZEP = 0 END IF IF (WANTU.EQ.1) THEN SIZEQ = NUMROC(SIZE,DESCU(NB_),MYPCOL,DESCU(CSRC_), + NPCOL) ELSE SIZEQ = 0 END IF * * Transmit MAX(NQ0, MP0). * IF (MYPROW.EQ.0 .AND. MYPCOL.EQ.0) THEN MAXIM = MAX(NQ,MP) CALL IGEBS2D(DESCA(CTXT_),'All',' ',1,1,MAXIM,1) ELSE CALL IGEBR2D(DESCA(CTXT_),'All',' ',1,1,MAXIM,1,0,0) END IF * WPSLANGE = MP WPSGEBRD = NB* (MP+NQ+1) + NQ WATOBD = MAX(MAX(WPSLANGE,WPSGEBRD),MAXIM) * WSBDSQR = MAX(1,4*SIZE) WPSORMBRQLN = MAX((NB* (NB-1))/2, (SIZEQ+MP)*NB) + NB*NB WPSORMBRPRT = MAX((MB* (MB-1))/2, (SIZEP+NQ)*MB) + MB*MB WBDTOSVD = SIZE* (WANTU*NRU+WANTVT*NCVT) + + MAX(WSBDSQR,MAX(WANTU*WPSORMBRQLN, + WANTVT*WPSORMBRPRT)) * * Finally, calculate required workspace. * LWMIN = 1 + 6*SIZEB + MAX(WATOBD,WBDTOSVD) WORK(1) = REAL(LWMIN) * IF (WANTU.NE.1 .AND. .NOT. (LSAME(JOBU,'N'))) THEN INFO = -1 ELSE IF (WANTVT.NE.1 .AND. .NOT. (LSAME(JOBVT,'N'))) THEN INFO = -2 ELSE IF (LWORK.LT.LWMIN .AND. LWORK.NE.-1) THEN INFO = -19 END IF * END IF * IDUM1(1) = WANTU IDUM1(2) = WANTVT IF (LWORK.EQ.-1) THEN IDUM1(3) = -1 ELSE IDUM1(3) = 1 END IF IDUM2(1) = 1 IDUM2(2) = 2 IDUM2(3) = 19 CALL PCHK1MAT(M,3,N,4,IA,JA,DESCA,8,3,IDUM1,IDUM2,INFO) IF (INFO.EQ.0) THEN IF (WANTU.EQ.1) THEN CALL PCHK1MAT(M,3,SIZE,4,IU,JU,DESCU,13,0,IDUM1,IDUM2, + INFO) END IF IF (WANTVT.EQ.1) THEN CALL PCHK1MAT(SIZE,3,N,4,IVT,JVT,DESCVT,17,0,IDUM1, + IDUM2,INFO) END IF END IF * END IF * IF (INFO.NE.0) THEN CALL PXERBLA(DESCA(CTXT_),'PSGESVD',-INFO) RETURN ELSE IF (LWORK.EQ.-1) THEN GO TO 40 END IF * * Quick return if possible. * IF (M.LE.0 .OR. N.LE.0) GO TO 40 * * Get machine constants. * SAFMIN = PSLAMCH(DESCA(CTXT_),'Safe minimum') EPS = PSLAMCH(DESCA(CTXT_),'Precision') SMLNUM = SAFMIN/EPS BIGNUM = ONE/SMLNUM RMIN = SQRT(SMLNUM) RMAX = MIN(SQRT(BIGNUM),ONE/SQRT(SQRT(SAFMIN))) * * Scale matrix to allowable range, if necessary. * ANRM = PSLANGE('1',M,N,A,IA,JA,DESCA,WORK(INDWORK)) IF (ANRM.GT.ZERO .AND. ANRM.LT.RMIN) THEN ISCALE = 1 SIGMA = RMIN/ANRM ELSE IF (ANRM.GT.RMAX) THEN ISCALE = 1 SIGMA = RMAX/ANRM END IF * IF (ISCALE.EQ.1) THEN CALL PSLASCL('G',ONE,SIGMA,M,N,A,IA,JA,DESCA,INFO) END IF * CALL PSGEBRD(M,N,A,IA,JA,DESCA,WORK(INDD),WORK(INDE), + WORK(INDTAUQ),WORK(INDTAUP),WORK(INDWORK),LLWORK, + INFO) * * Copy D and E to all processes. * Array D is in local array of dimension: * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * Array E is in local array of dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * IF (M.GE.N) THEN * Distribute D CALL PSLARED1D(N+IOFFD,IA,JA,DESCA,WORK(INDD),WORK(INDD2), + WORK(INDWORK),LLWORK) * Distribute E CALL PSLARED2D(M+IOFFE,IA,JA,DESCA,WORK(INDE),WORK(INDE2), + WORK(INDWORK),LLWORK) ELSE * Distribute D CALL PSLARED2D(M+IOFFD,IA,JA,DESCA,WORK(INDD),WORK(INDD2), + WORK(INDWORK),LLWORK) * Distribute E CALL PSLARED1D(N+IOFFE,IA,JA,DESCA,WORK(INDE),WORK(INDE2), + WORK(INDWORK),LLWORK) END IF * * Prepare for calling PSBDSQR. * IF (M.GE.N) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * INDU = INDWORK INDV = INDU + SIZE*NRU*WANTU INDWORK = INDV + SIZE*NCVT*WANTVT * LDU = MAX(1,NRU) LDVT = MAX(1,SIZE) * CALL DESCINIT(DESCTU,M,SIZE,1,1,0,0,CONTEXTC,LDU,INFO) CALL DESCINIT(DESCTVT,SIZE,N,1,1,0,0,CONTEXTR,LDVT,INFO) * IF (WANTU.EQ.1) THEN CALL PSLASET('Full',M,SIZE,ZERO,ONE,WORK(INDU),1,1,DESCTU) ELSE NRU = 0 END IF * IF (WANTVT.EQ.1) THEN CALL PSLASET('Full',SIZE,N,ZERO,ONE,WORK(INDV),1,1,DESCTVT) ELSE NCVT = 0 END IF * CALL SBDSQR(UPLO,SIZE,NCVT,NRU,0,WORK(INDD2+IOFFD), + WORK(INDE2+IOFFE),WORK(INDV),SIZE,WORK(INDU),LDU,C,1, + WORK(INDWORK),INFO) * * Redistribute elements of U and VT in the block-cyclic fashion. * IF (WANTU.EQ.1) CALL PSGEMR2D(M,SIZE,WORK(INDU),1,1,DESCTU,U,IU, + JU,DESCU,DESCU(CTXT_)) * IF (WANTVT.EQ.1) CALL PSGEMR2D(SIZE,N,WORK(INDV),1,1,DESCTVT,VT, + IVT,JVT,DESCVT,DESCVT(CTXT_)) * * Set to ZERO "non-square" elements of the larger matrices U, VT. * IF (M.GT.N .AND. WANTU.EQ.1) THEN CALL PSLASET('Full',M-SIZE,SIZE,ZERO,ZERO,U,IA+SIZE,JU,DESCU) ELSE IF (N.GT.M .AND. WANTVT.EQ.1) THEN CALL PSLASET('Full',SIZE,N-SIZE,ZERO,ZERO,VT,IVT,JVT+SIZE, + DESCVT) END IF * * Multiply Householder rotations from bidiagonalized matrix. * IF (WANTU.EQ.1) CALL PSORMBR('Q','L','N',M,SIZE,N,A,IA,JA,DESCA, + WORK(INDTAUQ),U,IU,JU,DESCU, + WORK(INDWORK),LLWORK,INFO) * IF (WANTVT.EQ.1) CALL PSORMBR('P','R','T',SIZE,N,M,A,IA,JA,DESCA, + WORK(INDTAUP),VT,IVT,JVT,DESCVT, + WORK(INDWORK),LLWORK,INFO) * * Copy singular values into output array S. * DO 10 I = 1,SIZE S(I) = WORK(INDD2+IOFFD+I-1) 10 CONTINUE * * If matrix was scaled, then rescale singular values appropriately. * IF (ISCALE.EQ.1) THEN CALL SSCAL(SIZE,ONE/SIGMA,S,1) END IF * * Compare every ith eigenvalue, or all if there are only a few, * across the process grid to check for heterogeneity. * IF (SIZE.LE.ITHVAL) THEN J = SIZE K = 1 ELSE J = SIZE/ITHVAL K = ITHVAL END IF * DO 20 I = 1,J WORK(I+INDE) = S((I-1)*K+1) WORK(I+INDD2) = S((I-1)*K+1) 20 CONTINUE * CALL SGAMN2D(DESCA(CTXT_),'a',' ',J,1,WORK(1+INDE),J,1,1,-1,-1,0) CALL SGAMX2D(DESCA(CTXT_),'a',' ',J,1,WORK(1+INDD2),J,1,1,-1,-1,0) * DO 30 I = 1,J IF ((WORK(I+INDE)-WORK(I+INDD2)).NE.ZERO) THEN INFO = SIZE + 1 END IF 30 CONTINUE * 40 CONTINUE * CALL BLACS_GRIDEXIT(CONTEXTC) CALL BLACS_GRIDEXIT(CONTEXTR) * * End of PSGESVD * RETURN END scalapack-2.0.2/SRC/psgesvx.f000644 000766 000024 00000104524 10363532303 016212 0ustar00juliestaff000000 000000 SUBROUTINE PSGESVX( FACT, TRANS, N, NRHS, A, IA, JA, DESCA, AF, $ IAF, JAF, DESCAF, IPIV, EQUED, R, C, B, IB, $ JB, DESCB, X, IX, JX, DESCX, RCOND, FERR, $ BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER EQUED, FACT, TRANS INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, LIWORK, $ LWORK, N, NRHS REAL RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IPIV( * ), IWORK( * ) REAL A( * ), AF( * ), B( * ), BERR( * ), C( * ), $ FERR( * ), R( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PSGESVX uses the LU factorization to compute the solution to a real * system of linear equations * * A(IA:IA+N-1,JA:JA+N-1) * X = B(IB:IB+N-1,JB:JB+NRHS-1), * * where A(IA:IA+N-1,JA:JA+N-1) is an N-by-N matrix and X and * B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS matrices. * * Error bounds on the solution and a condition estimate are also * provided. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Description * =========== * * In the following description, A denotes A(IA:IA+N-1,JA:JA+N-1), * B denotes B(IB:IB+N-1,JB:JB+NRHS-1) and X denotes * X(IX:IX+N-1,JX:JX+NRHS-1). * * The following steps are performed: * * 1. If FACT = 'E', real scaling factors are computed to equilibrate * the system: * TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B * TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B * TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') * or diag(C)*B (if TRANS = 'T' or 'C'). * * 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the * matrix A (after equilibration if FACT = 'E') as * A = P * L * U, * where P is a permutation matrix, L is a unit lower triangular * matrix, and U is upper triangular. * * 3. The factored form of A is used to estimate the condition number * of the matrix A. If the reciprocal of the condition number is * less than machine precision, steps 4-6 are skipped. * * 4. The system of equations is solved for X using the factored form * of A. * * 5. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 6. If FACT = 'E' and equilibration was used, the matrix X is * premultiplied by diag(C) (if TRANS = 'N') or diag(R) (if * TRANS = 'T' or 'C') so that it solves the original system * before equilibration. * * Arguments * ========= * * FACT (global input) CHARACTER * Specifies whether or not the factored form of the matrix * A(IA:IA+N-1,JA:JA+N-1) is supplied on entry, and if not, * whether the matrix A(IA:IA+N-1,JA:JA+N-1) should be * equilibrated before it is factored. * = 'F': On entry, AF(IAF:IAF+N-1,JAF:JAF+N-1) and IPIV con- * tain the factored form of A(IA:IA+N-1,JA:JA+N-1). * If EQUED is not 'N', the matrix * A(IA:IA+N-1,JA:JA+N-1) has been equilibrated with * scaling factors given by R and C. * A(IA:IA+N-1,JA:JA+N-1), AF(IAF:IAF+N-1,JAF:JAF+N-1), * and IPIV are not modified. * = 'N': The matrix A(IA:IA+N-1,JA:JA+N-1) will be copied to * AF(IAF:IAF+N-1,JAF:JAF+N-1) and factored. * = 'E': The matrix A(IA:IA+N-1,JA:JA+N-1) will be equili- * brated if necessary, then copied to * AF(IAF:IAF+N-1,JAF:JAF+N-1) and factored. * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': A(IA:IA+N-1,JA:JA+N-1) * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (No transpose) * = 'T': A(IA:IA+N-1,JA:JA+N-1)**T * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (Transpose) * = 'C': A(IA:IA+N-1,JA:JA+N-1)**H * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (Transpose) * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * NRHS (global input) INTEGER * The number of right-hand sides, i.e., the number of columns * of the distributed submatrices B(IB:IB+N-1,JB:JB+NRHS-1) and * X(IX:IX+N-1,JX:JX+NRHS-1). NRHS >= 0. * * A (local input/local output) REAL pointer into * the local memory to an array of local dimension * (LLD_A,LOCc(JA+N-1)). On entry, the N-by-N matrix * A(IA:IA+N-1,JA:JA+N-1). If FACT = 'F' and EQUED is not 'N', * then A(IA:IA+N-1,JA:JA+N-1) must have been equilibrated by * the scaling factors in R and/or C. A(IA:IA+N-1,JA:JA+N-1) is * not modified if FACT = 'F' or 'N', or if FACT = 'E' and * EQUED = 'N' on exit. * * On exit, if EQUED .ne. 'N', A(IA:IA+N-1,JA:JA+N-1) is scaled * as follows: * EQUED = 'R': A(IA:IA+N-1,JA:JA+N-1) := * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * EQUED = 'C': A(IA:IA+N-1,JA:JA+N-1) := * A(IA:IA+N-1,JA:JA+N-1) * diag(C) * EQUED = 'B': A(IA:IA+N-1,JA:JA+N-1) := * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * diag(C). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input or local output) REAL pointer * into the local memory to an array of local dimension * (LLD_AF,LOCc(JA+N-1)). If FACT = 'F', then * AF(IAF:IAF+N-1,JAF:JAF+N-1) is an input argument and on * entry contains the factors L and U from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U as computed by PSGETRF. * If EQUED .ne. 'N', then AF is the factored form of the * equilibrated matrix A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'N', then AF(IAF:IAF+N-1,JAF:JAF+N-1) is an output * argument and on exit returns the factors L and U from the * factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the original * matrix A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'E', then AF(IAF:IAF+N-1,JAF:JAF+N-1) is an output * argument and on exit returns the factors L and U from the * factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the equili- * brated matrix A(IA:IA+N-1,JA:JA+N-1) (see the description of * A(IA:IA+N-1,JA:JA+N-1) for the form of the equilibrated * matrix). * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * IPIV (local input or local output) INTEGER array, dimension * LOCr(M_A)+MB_A. If FACT = 'F', then IPIV is an input argu- * ment and on entry contains the pivot indices from the fac- * torization A(IA:IA+N-1,JA:JA+N-1) = P*L*U as computed by * PSGETRF; IPIV(i) -> The global row local row i was * swapped with. This array must be aligned with * A( IA:IA+N-1, * ). * * If FACT = 'N', then IPIV is an output argument and on exit * contains the pivot indices from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the original matrix * A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'E', then IPIV is an output argument and on exit * contains the pivot indices from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the equilibrated matrix * A(IA:IA+N-1,JA:JA+N-1). * * EQUED (global input or global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration (always true if FACT = 'N'). * = 'R': Row equilibration, i.e., A(IA:IA+N-1,JA:JA+N-1) has * been premultiplied by diag(R). * = 'C': Column equilibration, i.e., A(IA:IA+N-1,JA:JA+N-1) * has been postmultiplied by diag(C). * = 'B': Both row and column equilibration, i.e., * A(IA:IA+N-1,JA:JA+N-1) has been replaced by * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * diag(C). * EQUED is an input variable if FACT = 'F'; otherwise, it is an * output variable. * * R (local input or local output) REAL array, * dimension LOCr(M_A). * The row scale factors for A(IA:IA+N-1,JA:JA+N-1). * If EQUED = 'R' or 'B', A(IA:IA+N-1,JA:JA+N-1) is multiplied * on the left by diag(R); if EQUED='N' or 'C', R is not acces- * sed. R is an input variable if FACT = 'F'; otherwise, R is * an output variable. * If FACT = 'F' and EQUED = 'R' or 'B', each element of R must * be positive. * R is replicated in every process column, and is aligned * with the distributed matrix A. * * C (local input or local output) REAL array, * dimension LOCc(N_A). * The column scale factors for A(IA:IA+N-1,JA:JA+N-1). * If EQUED = 'C' or 'B', A(IA:IA+N-1,JA:JA+N-1) is multiplied * on the right by diag(C); if EQUED = 'N' or 'R', C is not * accessed. C is an input variable if FACT = 'F'; otherwise, * C is an output variable. If FACT = 'F' and EQUED = 'C' or * 'B', each element of C must be positive. * C is replicated in every process row, and is aligned with * the distributed matrix A. * * B (local input/local output) REAL pointer * into the local memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1) ). On entry, the N-by-NRHS right-hand * side matrix B(IB:IB+N-1,JB:JB+NRHS-1). On exit, if * EQUED = 'N', B(IB:IB+N-1,JB:JB+NRHS-1) is not modified; if * TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by * diag(R)*B(IB:IB+N-1,JB:JB+NRHS-1); if TRANS = 'T' or 'C' * and EQUED = 'C' or 'B', B(IB:IB+N-1,JB:JB+NRHS-1) is over- * written by diag(C)*B(IB:IB+N-1,JB:JB+NRHS-1). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input/local output) REAL pointer * into the local memory to an array of local dimension * (LLD_X, LOCc(JX+NRHS-1)). If INFO = 0, the N-by-NRHS * solution matrix X(IX:IX+N-1,JX:JX+NRHS-1) to the original * system of equations. Note that A(IA:IA+N-1,JA:JA+N-1) and * B(IB:IB+N-1,JB:JB+NRHS-1) are modified on exit if * EQUED .ne. 'N', and the solution to the equilibrated system * is inv(diag(C))*X(IX:IX+N-1,JX:JX+NRHS-1) if TRANS = 'N' * and EQUED = 'C' or 'B', or * inv(diag(R))*X(IX:IX+N-1,JX:JX+NRHS-1) if TRANS = 'T' or 'C' * and EQUED = 'R' or 'B'. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * RCOND (global output) REAL * The estimate of the reciprocal condition number of the matrix * A(IA:IA+N-1,JA:JA+N-1) after equilibration (if done). If * RCOND is less than the machine precision (in particular, if * RCOND = 0), the matrix is singular to working precision. * This condition is indicated by a return code of INFO > 0. * * FERR (local output) REAL array, dimension LOCc(N_B) * The estimated forward error bounds for each solution vector * X(j) (the j-th column of the solution matrix * X(IX:IX+N-1,JX:JX+NRHS-1). If XTRUE is the true solution, * FERR(j) bounds the magnitude of the largest entry in * (X(j) - XTRUE) divided by the magnitude of the largest entry * in X(j). The estimate is as reliable as the estimate for * RCOND, and is almost always a slight overestimate of the * true error. FERR is replicated in every process row, and is * aligned with the matrices B and X. * * BERR (local output) REAL array, dimension LOCc(N_B). * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any entry of A(IA:IA+N-1,JA:JA+N-1) or * B(IB:IB+N-1,JB:JB+NRHS-1) that makes X(j) an exact solution). * BERR is replicated in every process row, and is aligned * with the matrices B and X. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = MAX( PSGECON( LWORK ), PSGERFS( LWORK ) ) * + LOCr( N_A ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK = LOCr(N_A). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, and i is * <= N: U(IA+I-1,IA+I-1) is exactly zero. The * factorization has been completed, but the * factor U is exactly singular, so the solution * and error bounds could not be computed. * = N+1: RCOND is less than machine precision. The * factorization has been completed, but the * matrix is singular to working precision, and * the solution and error bounds have not been * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLEQU, EQUIL, LQUERY, NOFACT, NOTRAN, ROWEQU CHARACTER NORM INTEGER CONWRK, I, IACOL, IAROW, IAFROW, IBROW, IBCOL, $ ICOFFA, ICOFFB, ICOFFX, ICTXT, IDUMM, $ IIA, IIB, IIX, $ INFEQU, IROFFA, IROFFAF, IROFFB, $ IROFFX, IXCOL, IXROW, J, JJA, JJB, JJX, $ LCM, LCMQ, $ LIWMIN, LWMIN, MYCOL, MYROW, NP, NPCOL, NPROW, $ NQ, NQB, NRHSQ, RFSWRK REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, $ ROWCND, SMLNUM * .. * .. Local Arrays .. INTEGER CDESC( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ INFOG2L, PSGECON, PSGEEQU, PSGERFS, $ PSGETRF, PSGETRS, PSLACPY, $ PSLAQGE, PSCOPY, PXERBLA, SGEBR2D, $ SGEBS2D, SGAMN2D, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC REAL PSLAMCH, PSLANGE EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC, PSLANGE, $ PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(800+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 8, INFO ) IF( LSAME( FACT, 'F' ) ) $ CALL CHK1MAT( N, 3, N, 3, IAF, JAF, DESCAF, 12, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 20, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IX, JX, DESCX, 24, INFO ) NOFACT = LSAME( FACT, 'N' ) EQUIL = LSAME( FACT, 'E' ) NOTRAN = LSAME( TRANS, 'N' ) IF( NOFACT .OR. EQUIL ) THEN EQUED = 'N' ROWEQU = .FALSE. COLEQU = .FALSE. ELSE ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM END IF IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IIA, JJA, IAROW, IACOL ) NP = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFFA NQB = ICEIL( N+IROFFA, DESCA( NB_ )*NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL CONWRK = 2*NP + 2*NQ + MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQ + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) RFSWRK = 3*NP IF( LSAME( TRANS, 'N' ) ) THEN RFSWRK = RFSWRK + NP + NQ + $ ICEIL( NQB, LCMQ )*DESCA( NB_ ) ELSE IF( LSAME( TRANS, 'T' ).OR.LSAME( TRANS, 'C' ) ) THEN RFSWRK = RFSWRK + NP + NQ END IF LWMIN = MAX( CONWRK, RFSWRK ) WORK( 1 ) = REAL( LWMIN ) LIWMIN = NP IWORK( 1 ) = LIWMIN IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. $ .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT. LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( ICOFFA.NE.0 .OR. IROFFA.NE.ICOFFA ) THEN INFO = -7 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(800+NB_) ELSE IF( IAFROW.NE.IAROW ) THEN INFO = -10 ELSE IF( IROFFAF.NE.0 ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -(1200+CTXT_) ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN INFO = -13 ELSE IF( ROWEQU ) THEN RCMIN = BIGNUM RCMAX = ZERO DO 10 J = IIA, IIA + NP - 1 RCMIN = MIN( RCMIN, R( J ) ) RCMAX = MAX( RCMAX, R( J ) ) 10 CONTINUE CALL SGAMN2D( ICTXT, 'Columnwise', ' ', 1, 1, RCMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, RCMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( RCMIN.LE.ZERO ) THEN INFO = -14 ELSE IF( N.GT.0 ) THEN ROWCND = MAX( RCMIN, SMLNUM ) / $ MIN( RCMAX, BIGNUM ) ELSE ROWCND = ONE END IF END IF IF( COLEQU .AND. INFO.EQ.0 ) THEN RCMIN = BIGNUM RCMAX = ZERO DO 20 J = JJA, JJA+NQ-1 RCMIN = MIN( RCMIN, C( J ) ) RCMAX = MAX( RCMAX, C( J ) ) 20 CONTINUE CALL SGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, RCMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, RCMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( RCMIN.LE.ZERO ) THEN INFO = -15 ELSE IF( N.GT.0 ) THEN COLCND = MAX( RCMIN, SMLNUM ) / $ MIN( RCMAX, BIGNUM ) ELSE COLCND = ONE END IF END IF END IF END IF * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IBROW.NE.IAROW ) THEN INFO = -18 ELSE IF( IXROW.NE.IBROW ) THEN INFO = -22 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2000+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(2000+CTXT_) ELSE IF( DESCX( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2400+NB_) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -(2400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -29 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -31 END IF IDUM1( 1 ) = ICHAR( FACT ) IDUM2( 1 ) = 1 IDUM1( 2 ) = ICHAR( TRANS ) IDUM2( 2 ) = 2 IF( LSAME( FACT, 'F' ) ) THEN IDUM1( 3 ) = ICHAR( EQUED ) IDUM2( 3 ) = 14 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 29 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 31 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, $ NRHS, 4, IB, JB, DESCB, 20, 5, IDUM1, $ IDUM2, INFO ) ELSE IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 29 IF( LIWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 31 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, $ NRHS, 4, IB, JB, DESCB, 20, 4, IDUM1, $ IDUM2, INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGESVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * IF( EQUIL ) THEN * * Compute row and column scalings to equilibrate the matrix A. * CALL PSGEEQU( N, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, INFEQU ) IF( INFEQU.EQ.0 ) THEN * * Equilibrate the matrix. * CALL PSLAQGE( N, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, EQUED ) ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) END IF END IF * * Scale the right-hand side. * CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, $ JJB, IBROW, IBCOL ) NP = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) IF( MYROW.EQ.IBROW ) $ NP = NP-IROFFB IF( MYCOL.EQ.IBCOL ) $ NRHSQ = NRHSQ-ICOFFB * IF( NOTRAN ) THEN IF( ROWEQU ) THEN DO 40 J = JJB, JJB+NRHSQ-1 DO 30 I = IIB, IIB+NP-1 B( I+( J-1 )*DESCB( LLD_ ) ) = R( I )* $ B( I+( J-1 )*DESCB( LLD_ ) ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( COLEQU ) THEN * * Transpose the Column scale factors * CALL DESCSET( CDESC, 1, N+ICOFFA, 1, DESCA( NB_ ), MYROW, $ IACOL, ICTXT, 1 ) CALL PSCOPY( N, C, 1, JA, CDESC, CDESC( LLD_ ), WORK, IB, JB, $ DESCB, 1 ) IF( MYCOL.EQ.IBCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IIB ), $ DESCB( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IIB ), $ DESCB( LLD_ ), MYROW, IBCOL ) END IF DO 60 J = JJB, JJB+NRHSQ-1 DO 50 I = IIB, IIB+NP-1 B( I+( J-1 )*DESCB( LLD_ ) ) = WORK( I )* $ B( I+( J-1 )*DESCB( LLD_ ) ) 50 CONTINUE 60 CONTINUE END IF * IF( NOFACT.OR.EQUIL ) THEN * * Compute the LU factorization of A. * CALL PSLACPY( 'Full', N, N, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF ) CALL PSGETRF( N, N, AF, IAF, JAF, DESCAF, IPIV, INFO ) * * Return if INFO is non-zero. * IF( INFO.NE.0 ) THEN IF( INFO.GT.0 ) $ RCOND = ZERO RETURN END IF END IF * * Compute the norm of the matrix A. * IF( NOTRAN ) THEN NORM = '1' ELSE NORM = 'I' END IF ANORM = PSLANGE( NORM, N, N, A, IA, JA, DESCA, WORK ) * * Compute the reciprocal of the condition number of A. * CALL PSGECON( NORM, N, AF, IAF, JAF, DESCAF, ANORM, RCOND, WORK, $ LWORK, IWORK, LIWORK, INFO ) * * Return if the matrix is singular to working precision. * IF( RCOND.LT.PSLAMCH( ICTXT, 'Epsilon' ) ) THEN INFO = IA + N RETURN END IF * * Compute the solution matrix X. * CALL PSLACPY( 'Full', N, NRHS, B, IB, JB, DESCB, X, IX, JX, $ DESCX ) CALL PSGETRS( TRANS, N, NRHS, AF, IAF, JAF, DESCAF, IPIV, X, IX, $ JX, DESCX, INFO ) * * Use iterative refinement to improve the computed solution and * compute error bounds and backward error estimates for it. * CALL PSGERFS( TRANS, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, IPIV, B, IB, JB, DESCB, X, IX, JX, DESCX, $ FERR, BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * Transform the solution matrix X to a solution of the original * system. * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) NP = NUMROC( N+IROFFX, DESCX( MB_ ), MYROW, IXROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFFX, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYROW.EQ.IBROW ) $ NP = NP-IROFFX IF( MYCOL.EQ.IBCOL ) $ NRHSQ = NRHSQ-ICOFFX * IF( NOTRAN ) THEN IF( COLEQU ) THEN * * Transpose the column scaling factors * CALL DESCSET( CDESC, 1, N+ICOFFA, 1, DESCA( NB_ ), MYROW, $ IACOL, ICTXT, 1 ) CALL PSCOPY( N, C, 1, JA, CDESC, CDESC( LLD_ ), WORK, IX, $ JX, DESCX, 1 ) IF( MYCOL.EQ.IBCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, $ WORK( IIX ), DESCX( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, $ WORK( IIX ), DESCX( LLD_ ), MYROW, IBCOL ) END IF * DO 80 J = JJX, JJX+NRHSQ-1 DO 70 I = IIX, IIX+NP-1 X( I+( J-1 )*DESCX( LLD_ ) ) = WORK( I )* $ X( I+( J-1 )*DESCX( LLD_ ) ) 70 CONTINUE 80 CONTINUE DO 90 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / COLCND 90 CONTINUE END IF ELSE IF( ROWEQU ) THEN DO 110 J = JJX, JJX+NRHSQ-1 DO 100 I = IIX, IIX+NP-1 X( I+( J-1 )*DESCX( LLD_ ) ) = R( I )* $ X( I+( J-1 )*DESCX( LLD_ ) ) 100 CONTINUE 110 CONTINUE DO 120 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / ROWCND 120 CONTINUE END IF * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PSGESVX * END scalapack-2.0.2/SRC/psgetf2.f000644 000766 000024 00000022632 10363532303 016064 0ustar00juliestaff000000 000000 SUBROUTINE PSGETF2( M, N, A, IA, JA, DESCA, IPIV, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) REAL A( * ) * .. * * Purpose * ======= * * PSGETF2 computes an LU factorization of a general M-by-N * distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) using * partial pivoting with row interchanges. * * The factorization has the form sub( A ) = P * L * U, where P is a * permutation matrix, L is lower triangular with unit diagonal * elements (lower trapezoidal if m > n), and U is upper triangular * (upper trapezoidal if m < n). * * This is the right-looking Parallel Level 2 BLAS version of the * algorithm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires N <= NB_A-MOD(JA-1, NB_A) and square block * decomposition ( MB_A = NB_A ). * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). * NB_A-MOD(JA-1, NB_A) >= N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ). On exit, this array contains * the local pieces of the factors L and U from the factoriza- * tion sub( A ) = P*L*U; the unit diagonal elements of L are * not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, and division by zero will occur if * it is used to solve a system of equations. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. CHARACTER ROWBTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IIA, IROFF, J, $ JJA, MN, MYCOL, MYROW, NPCOL, NPROW REAL GMAX * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, IGEBR2D, $ IGEBS2D, INFOG2L, PSAMAX, PSGER, $ PSSCAL, PSSWAP, PB_TOPGET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( N+ICOFF.GT.DESCA( NB_ ) ) THEN INFO = -2 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGETF2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * MN = MIN( M, N ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) * IF( MYCOL.EQ.IACOL ) THEN DO 10 J = JA, JA+MN-1 I = IA + J - JA * * Find pivot and test for singularity. * CALL PSAMAX( M-J+JA, GMAX, IPIV( IIA+J-JA ), A, I, J, $ DESCA, 1 ) IF( GMAX.NE.ZERO ) THEN * * Apply the row interchanges to columns JA:JA+N-1 * CALL PSSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, $ IPIV( IIA+J-JA ), JA, DESCA, DESCA( M_ ) ) * * Compute elements I+1:IA+M-1 of J-th column. * IF( J-JA+1.LT.M ) $ CALL PSSCAL( M-J+JA-1, ONE / GMAX, A, I+1, J, $ DESCA, 1 ) ELSE IF( INFO.EQ.0 ) THEN INFO = J - JA + 1 END IF * * Update trailing submatrix * IF( J-JA+1.LT.MN ) THEN CALL PSGER( M-J+JA-1, N-J+JA-1, -ONE, A, I+1, J, DESCA, $ 1, A, I, J+1, DESCA, DESCA( M_ ), A, I+1, $ J+1, DESCA ) END IF 10 CONTINUE * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, MN, 1, IPIV( IIA ), $ MN ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, MN, 1, IPIV( IIA ), $ MN, MYROW, IACOL ) * END IF * RETURN * * End of PSGETF2 * END scalapack-2.0.2/SRC/psgetrf.f000644 000766 000024 00000026443 10363532303 016170 0ustar00juliestaff000000 000000 SUBROUTINE PSGETRF( M, N, A, IA, JA, DESCA, IPIV, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) REAL A( * ) * .. * * Purpose * ======= * * PSGETRF computes an LU factorization of a general M-by-N distributed * matrix sub( A ) = (IA:IA+M-1,JA:JA+N-1) using partial pivoting with * row interchanges. * * The factorization has the form sub( A ) = P * L * U, where P is a * permutation matrix, L is lower triangular with unit diagonal ele- * ments (lower trapezoidal if m > n), and U is upper triangular * (upper trapezoidal if m < n). L and U are stored in sub( A ). * * This is the right-looking Parallel Level 3 BLAS version of the * algorithm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be factored. On exit, this * array contains the local pieces of the factors L and U from * the factorization sub( A ) = P*L*U; the unit diagonal ele- * ments of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, and division by zero will occur if * it is used to solve a system of equations. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, COLCTOP, ROWBTOP INTEGER I, ICOFF, ICTXT, IINFO, IN, IROFF, J, JB, JN, $ MN, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMN2D, PCHK1MAT, $ PB_TOPGET, PB_TOPSET, PSGEMM, PSGETF2, $ PSLASWP, PSTRSM, PXERBLA * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 0, IDUM1, $ IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGETRF', -INFO ) RETURN END IF * * Quick return if possible * IF( DESCA( M_ ).EQ.1 ) THEN IPIV( 1 ) = 1 RETURN ELSE IF( M.EQ.0 .OR. N.EQ.0 ) THEN RETURN END IF * * Split-ring topology for the communication along process rows * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', ' ' ) * * Handle the first block of columns separately * MN = MIN( M, N ) IN = MIN( ICEIL( IA, DESCA( MB_ ) )*DESCA( MB_ ), IA+M-1 ) JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+MN-1 ) JB = JN - JA + 1 * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL PSGETF2( M, JB, A, IA, JA, DESCA, IPIV, INFO ) * IF( JB+1.LE.N ) THEN * * Apply interchanges to columns JN+1:JA+N-1. * CALL PSLASWP( 'Forward', 'Rows', N-JB, A, IA, JN+1, DESCA, $ IA, IN, IPIV ) * * Compute block row of U. * CALL PSTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-JB, ONE, A, IA, JA, DESCA, A, IA, JN+1, DESCA ) * IF( JB+1.LE.M ) THEN * * Update trailing submatrix. * CALL PSGEMM( 'No transpose', 'No transpose', M-JB, N-JB, JB, $ -ONE, A, IN+1, JA, DESCA, A, IA, JN+1, DESCA, $ ONE, A, IN+1, JN+1, DESCA ) * END IF END IF * * Loop over the remaining blocks of columns. * DO 10 J = JN+1, JA+MN-1, DESCA( NB_ ) JB = MIN( MN-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL PSGETF2( M-J+JA, JB, A, I, J, DESCA, IPIV, IINFO ) * IF( INFO.EQ.0 .AND. IINFO.GT.0 ) $ INFO = IINFO + J - JA * * Apply interchanges to columns JA:J-JA. * CALL PSLASWP( 'Forward', 'Rowwise', J-JA, A, IA, JA, DESCA, $ I, I+JB-1, IPIV ) * IF( J-JA+JB+1.LE.N ) THEN * * Apply interchanges to columns J+JB:JA+N-1. * CALL PSLASWP( 'Forward', 'Rowwise', N-J-JB+JA, A, IA, J+JB, $ DESCA, I, I+JB-1, IPIV ) * * Compute block row of U. * CALL PSTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-J-JB+JA, ONE, A, I, J, DESCA, A, I, J+JB, $ DESCA ) * IF( J-JA+JB+1.LE.M ) THEN * * Update trailing submatrix. * CALL PSGEMM( 'No transpose', 'No transpose', M-J-JB+JA, $ N-J-JB+JA, JB, -ONE, A, I+JB, J, DESCA, A, $ I, J+JB, DESCA, ONE, A, I+JB, J+JB, DESCA ) * END IF END IF * 10 CONTINUE * IF( INFO.EQ.0 ) $ INFO = MN + 1 CALL IGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, INFO, 1, IDUM1, IDUM2, $ -1, -1, MYCOL ) IF( INFO.EQ.MN+1 ) $ INFO = 0 * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * RETURN * * End of PSGETRF * END scalapack-2.0.2/SRC/psgetri.f000644 000766 000024 00000035165 10430435051 016171 0ustar00juliestaff000000 000000 SUBROUTINE PSGETRI( N, A, IA, JA, DESCA, IPIV, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7.4) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * v1.7.4: May 10, 2006 * v1.7: May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ), IWORK( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSGETRI computes the inverse of a distributed matrix using the LU * factorization computed by PSGETRF. This method inverts U and then * computes the inverse of sub( A ) = A(IA:IA+N-1,JA:JA+N-1) denoted * InvA by solving the system InvA*L = inv(U) for InvA. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the L and U obtained by the * factorization sub( A ) = P*L*U computed by PSGETRF. On * exit, if INFO = 0, sub( A ) contains the inverse of the * original distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension LOCr(M_A)+MB_A * keeps track of the pivoting information. IPIV(i) is the * global row index the local row i was swapped with. This * array is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = LOCr(N+MOD(IA-1,MB_A))*NB_A. WORK is used to keep a * copy of at most an entire column block of sub( A ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK used as workspace for * physically transposing the pivots. * LIWORK is local input and must be at least * if NPROW == NPCOL then * LIWORK = LOCc( N_A + MOD(JA-1, NB_A) ) + NB_A, * else * LIWORK = LOCc( N_A + MOD(JA-1, NB_A) ) + * MAX( CEIL(CEIL(LOCr(M_A)/MB_A)/(LCM/NPROW)), * NB_A ) * where LCM is the least common multiple of process * rows and columns (NPROW and NPCOL). * end if * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,IA+K-1) is exactly zero; the * matrix is singular and its inverse could not be * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IROFF, IW, J, $ JB, JN, LCM, LIWMIN, LWMIN, MP, MYCOL, MYROW, $ NN, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PSGEMM, PSLACPY, PSLASET, PSLAPIV, $ PSTRSM, PSTRTRI, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(500+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) LWMIN = NP * DESCA( NB_ ) * MP = NUMROC( DESCA( M_ ), DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) NQ = NUMROC( DESCA( N_ ), DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) IF( NPROW.EQ.NPCOL ) THEN LIWMIN = NQ + DESCA( NB_ ) ELSE * * Use the formula for the workspace given in PxLAPIV * to compute the minimum size LIWORK for IWORK * * The formula in PxLAPIV is * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + * MB_P * CEIL( CEIL(LOCr(M_P)/MB_P) / (LCM/NPROW) ) * * where * M_P is the global length of the pivot vector * MP = DESCA( M_ ) + DESCA( MB_ ) * NPROW * I_P is IA * I_P = IA * MB_P is the block size use for the block cyclic distribution of the * pivot vector * MB_P = DESCA (MB_ ) * LOCc ( . ) * NUMROC ( . , DESCA ( NB_ ), MYCOL, DESCA ( CSRC_ ), NPCOL ) * LOCr ( . ) * NUMROC ( . , DESCA ( MB_ ), MYROW, DESCA ( RSRC_ ), NPROW ) * CEIL ( X / Y ) * ICEIL( X, Y ) * LCM * LCM = ILCM( NPROW, NPCOL ) * LCM = ILCM( NPROW, NPCOL ) LIWMIN = NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW $ + MOD ( IA - 1, DESCA( MB_ ) ), DESCA ( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ) + $ MAX ( DESCA( MB_ ) * ICEIL ( ICEIL( $ NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW, $ DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ), $ DESCA( MB_ ) ), LCM / NPROW ), DESCA( NB_ ) ) * END IF * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -4 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(500+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -8 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 8 IF( LIWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGETRI', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Form inv(U). If INFO > 0 from PSTRTRI, then U is singular, * and the inverse is not computed. * CALL PSTRTRI( 'Upper', 'Non-unit', N, A, IA, JA, DESCA, INFO ) IF( INFO.GT.0 ) $ RETURN * * Define array descriptor for working array WORK * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) NN = ( ( JA+N-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1 IACOL = INDXG2P( NN, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCW, N+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, NP ) ) IW = IROFF + 1 * * Solve the equation inv(A)*L=inv(U) for inv(A) using blocked code. * DO 10 J = NN, JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * * Copy current block column of L to WORK and replace with zeros. * CALL PSLACPY( 'Lower', JA+N-1-J, JB, A, I+1, J, DESCA, $ WORK, IW+J-JA+1, 1, DESCW ) CALL PSLASET( 'Lower', JA+N-1-J, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Compute current block column of inv(A). * IF( J+JB.LE.JA+N-1 ) $ CALL PSGEMM( 'No transpose', 'No transpose', N, JB, $ JA+N-J-JB, -ONE, A, IA, J+JB, DESCA, WORK, $ IW+J+JB-JA, 1, DESCW, ONE, A, IA, J, DESCA ) CALL PSTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, $ ONE, WORK, IW+J-JA, 1, DESCW, A, IA, J, DESCA ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle the last block of columns separately * JB = JN-JA+1 * * Copy current block column of L to WORK and replace with zeros. * CALL PSLACPY( 'Lower', N-1, JB, A, IA+1, JA, DESCA, WORK, IW+1, $ 1, DESCW ) CALL PSLASET( 'Lower', N-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Compute current block column of inv(A). * IF( JA+JB.LE.JA+N-1 ) $ CALL PSGEMM( 'No transpose', 'No transpose', N, JB, $ N-JB, -ONE, A, IA, JA+JB, DESCA, WORK, IW+JB, 1, $ DESCW, ONE, A, IA, JA, DESCA ) CALL PSTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, $ ONE, WORK, IW, 1, DESCW, A, IA, JA, DESCA ) * * Use the row pivots and apply them to the columns of the global * matrix. * CALL DESCSET( DESCW, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ MP+DESCA( MB_ ) ) CALL PSLAPIV( 'Backward', 'Columns', 'Column', N, N, A, IA, $ JA, DESCA, IPIV, IA, 1, DESCW, IWORK ) * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PSGETRI * END scalapack-2.0.2/SRC/psgetrs.f000644 000766 000024 00000026070 10363532303 016201 0ustar00juliestaff000000 000000 SUBROUTINE PSGETRS( TRANS, N, NRHS, A, IA, JA, DESCA, IPIV, B, $ IB, JB, DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) REAL A( * ), B( * ) * .. * * Purpose * ======= * * PSGETRS solves a system of distributed linear equations * * op( sub( A ) ) * X = sub( B ) * * with a general N-by-N distributed matrix sub( A ) using the LU * factorization computed by PSGETRF. * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), op( A ) = A or A**T and * sub( B ) denotes B(IB:IB+N-1,JB:JB+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block data decomposition ( MB_A=NB_A ). * * Arguments * ========= * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': sub( A ) * X = sub( B ) (No transpose) * = 'T': sub( A )**T * X = sub( B ) (Transpose) * = 'C': sub( A )**T * X = sub( B ) (Transpose) * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) REAL pointer into the local * memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the factors * L and U from the factorization sub( A ) = P*L*U; the unit * diagonal elements of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * B (local input/local output) REAL pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, the right hand sides * sub( B ). On exit, sub( B ) is overwritten by the solution * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCIP( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ PSLAPIV, PSTRSM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE NOTRAN = LSAME( TRANS, 'N' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1200+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(1200+CTXT_) END IF END IF IF( NOTRAN ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 1 ) = ICHAR( 'T' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, 3, $ IB, JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGETRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * CALL DESCSET( DESCIP, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ DESCA( MB_ ) + NUMROC( DESCA( M_ ), DESCA( MB_ ), $ MYROW, DESCA( RSRC_ ), NPROW ) ) * IF( NOTRAN ) THEN * * Solve sub( A ) * X = sub( B ). * * Apply row interchanges to the right hand sides. * CALL PSLAPIV( 'Forward', 'Row', 'Col', N, NRHS, B, IB, JB, $ DESCB, IPIV, IA, 1, DESCIP, IDUM1 ) * * Solve L*X = sub( B ), overwriting sub( B ) with X. * CALL PSTRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve U*X = sub( B ), overwriting sub( B ) with X. * CALL PSTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) ELSE * * Solve sub( A )' * X = sub( B ). * * Solve U'*X = sub( B ), overwriting sub( B ) with X. * CALL PSTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve L'*X = sub( B ), overwriting sub( B ) with X. * CALL PSTRSM( 'Left', 'Lower', 'Transpose', 'Unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Apply row interchanges to the solution vectors. * CALL PSLAPIV( 'Backward', 'Row', 'Col', N, NRHS, B, IB, JB, $ DESCB, IPIV, IA, 1, DESCIP, IDUM1 ) * END IF * RETURN * * End of PSGETRS * END scalapack-2.0.2/SRC/psggqrf.f000644 000766 000024 00000036205 10363532303 016164 0ustar00juliestaff000000 000000 SUBROUTINE PSGGQRF( N, M, P, A, IA, JA, DESCA, TAUA, B, IB, JB, $ DESCB, TAUB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, P * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), TAUA( * ), TAUB( * ), WORK( * ) * .. * * Purpose * ======= * * PSGGQRF computes a generalized QR factorization of * an N-by-M matrix sub( A ) = A(IA:IA+N-1,JA:JA+M-1) and * an N-by-P matrix sub( B ) = B(IB:IB+N-1,JB:JB+P-1): * * sub( A ) = Q*R, sub( B ) = Q*T*Z, * * where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal * matrix, and R and T assume one of the forms: * * if N >= M, R = ( R11 ) M , or if N < M, R = ( R11 R12 ) N, * ( 0 ) N-M N M-N * M * * where R11 is upper triangular, and * * if N <= P, T = ( 0 T12 ) N, or if N > P, T = ( T11 ) N-P, * P-N N ( T21 ) P * P * * where T12 or T21 is upper triangular. * * In particular, if sub( B ) is square and nonsingular, the GQR * factorization of sub( A ) and sub( B ) implicitly gives the QR * factorization of inv( sub( B ) )* sub( A ): * * inv( sub( B ) )*sub( A )= Z'*(inv(T)*R) * * where inv( sub( B ) ) denotes the inverse of the matrix sub( B ), * and Z' denotes the transpose of matrix Z. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrices sub( A ) and sub( B ). N >= 0. * * M (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). M >= 0. * * P (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( B ). P >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+M-1)). * On entry, the local pieces of the N-by-M distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(N,M) by M * upper trapezoidal matrix R (R is upper triangular if N >= M); * the elements below the diagonal, with the array TAUA, * represent the orthogonal matrix Q as a product of min(N,M) * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAUA (local output) REAL, array, dimension * LOCc(JA+MIN(N,M)-1). This array contains the scalar factors * TAUA of the elementary reflectors which represent the * orthogonal matrix Q. TAUA is tied to the distributed matrix * A. (see Further Details). * * B (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+P-1)). * On entry, the local pieces of the N-by-P distributed matrix * sub( B ) which is to be factored. On exit, if N <= P, the * upper triangle of B(IB:IB+N-1,JB+P-N:JB+P-1) contains the * N by N upper triangular matrix T; if N > P, the elements on * and above the (N-P)-th subdiagonal contain the N by P upper * trapezoidal matrix T; the remaining elements, with the array * TAUB, represent the orthogonal matrix Z as a product of * elementary reflectors (see Further Details). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * TAUB (local output) REAL, array, dimension LOCr(IB+N-1) * This array contains the scalar factors of the elementary * reflectors which represent the orthogonal unitary matrix Z. * TAUB is tied to the distributed matrix B (see Further * Details). * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB_A * ( NpA0 + MqA0 + NB_A ), * MAX( (NB_A*(NB_A-1))/2, (PqB0 + NpB0)*NB_A ) + * NB_A * NB_A, * MB_B * ( NpB0 + PqB0 + MB_B ) ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * NpB0 = NUMROC( N+IROFFB, MB_B, MYROW, IBROW, NPROW ), * PqB0 = NUMROC( P+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(n,m). * * Each H(i) has the form * * H(i) = I - taua * v * v' * * where taua is a real scalar, and v is a real vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in * A(ia+i:ia+n-1,ja+i-1), and taua in TAUA(ja+i-1). * To form Q explicitly, use ScaLAPACK subroutine PSORGQR. * To use Q to update another matrix, use ScaLAPACK subroutine PSORMQR. * * The matrix Z is represented as a product of elementary reflectors * * Z = H(ib) H(ib+1) . . . H(ib+k-1), where k = min(n,p). * * Each H(i) has the form * * H(i) = I - taub * v * v' * * where taub is a real scalar, and v is a real vector with * v(p-k+i+1:p) = 0 and v(p-k+i) = 1; v(1:p-k+i-1) is stored on exit in * B(ib+n-k+i-1,jb:jb+p-k+i-2), and taub in TAUB(ib+n-k+i-1). * To form Z explicitly, use ScaLAPACK subroutine PSORGRQ. * To use Z to update another matrix, use ScaLAPACK subroutine PSORMRQ. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ) and sub( B ) must verify some * alignment properties, namely the following expression should be true: * * ( MB_A.EQ.MB_B .AND. IROFFA.EQ.IROFFB .AND. IAROW.EQ.IBROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, LWMIN, MQA0, MYCOL, $ MYROW, NPA0, NPB0, NPCOL, NPROW, PQB0 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSGEQRF, $ PSGERQF, PSORMQR, PXERBLA * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -707 ELSE CALL CHK1MAT( N, 1, M, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 1, P, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) NPB0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) PQB0 = NUMROC( P+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) LWMIN = MAX( DESCA( NB_ ) * ( NPA0 + MQA0 + DESCA( NB_ ) ), $ MAX( MAX( ( DESCA( NB_ )*( DESCA( NB_ ) - 1 ) ) / 2, $ ( PQB0 + NPB0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ), $ DESCB( MB_ ) * ( NPB0 + PQB0 + DESCB( MB_ ) ) ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( IAROW.NE.IBROW .OR. IROFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -1203 ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -1207 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 15 CALL PCHK2MAT( N, 1, M, 2, IA, JA, DESCA, 7, N, 1, P, 3, IB, $ JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGGQRF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * QR factorization of N-by-M matrix sub( A ): sub( A ) = Q*R * CALL PSGEQRF( N, M, A, IA, JA, DESCA, TAUA, WORK, LWORK, INFO ) LWMIN = INT( WORK( 1 ) ) * * Update sub( B ) := Q'*sub( B ). * CALL PSORMQR( 'Left', 'Transpose', N, P, MIN( N, M ), A, IA, JA, $ DESCA, TAUA, B, IB, JB, DESCB, WORK, LWORK, INFO ) LWMIN = MIN( LWMIN, INT( WORK( 1 ) ) ) * * RQ factorization of N-by-P matrix sub( B ): sub( B ) = T*Z. * CALL PSGERQF( N, P, B, IB, JB, DESCB, TAUB, WORK, LWORK, INFO ) WORK( 1 ) = REAL( MAX( LWMIN, INT( WORK( 1 ) ) ) ) * RETURN * * End of PSGGQRF * END scalapack-2.0.2/SRC/psggrqf.f000644 000766 000024 00000036237 10363532303 016171 0ustar00juliestaff000000 000000 SUBROUTINE PSGGRQF( M, P, N, A, IA, JA, DESCA, TAUA, B, IB, JB, $ DESCB, TAUB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, P * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), TAUA( * ), TAUB( * ), WORK( * ) * .. * * Purpose * ======= * * PSGGRQF computes a generalized RQ factorization of * an M-by-N matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) * and a P-by-N matrix sub( B ) = B(IB:IB+P-1,JB:JB+N-1): * * sub( A ) = R*Q, sub( B ) = Z*T*Q, * * where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal * matrix, and R and T assume one of the forms: * * if M <= N, R = ( 0 R12 ) M, or if M > N, R = ( R11 ) M-N, * N-M M ( R21 ) N * N * * where R12 or R21 is upper triangular, and * * if P >= N, T = ( T11 ) N , or if P < N, T = ( T11 T12 ) P, * ( 0 ) P-N P N-P * N * * where T11 is upper triangular. * * In particular, if sub( B ) is square and nonsingular, the GRQ * factorization of sub( A ) and sub( B ) implicitly gives the RQ * factorization of sub( A )*inv( sub( B ) ): * * sub( A )*inv( sub( B ) ) = (R*inv(T))*Z' * * where inv( sub( B ) ) denotes the inverse of the matrix sub( B ), * and Z' denotes the transpose of matrix Z. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of * rows of the distributed submatrix sub( A ). M >= 0. * * P (global input) INTEGER * The number of rows to be operated on i.e the number of * rows of the distributed submatrix sub( B ). P >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrices sub( A ) and sub( B ). * N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAUA, represent the orthogonal matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAUA (local output) REAL, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors which represent the orthogonal unitary matrix Q. * TAUA is tied to the distributed matrix A (see Further * Details). * * B (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+N-1)). * On entry, the local pieces of the P-by-N distributed matrix * sub( B ) which is to be factored. On exit, the elements on * and above the diagonal of sub( B ) contain the min(P,N) by N * upper trapezoidal matrix T (T is upper triangular if P >= N); * the elements below the diagonal, with the array TAUB, * represent the orthogonal matrix Z as a product of elementary * reflectors (see Further Details). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * TAUB (local output) REAL, array, dimension * LOCc(JB+MIN(P,N)-1). This array contains the scalar factors * TAUB of the elementary reflectors which represent the * orthogonal matrix Z. TAUB is tied to the distributed matrix * B (see Further Details). * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( MB_A * ( MpA0 + NqA0 + MB_A ), * MAX( (MB_A*(MB_A-1))/2, (PpB0 + NqB0)*MB_A ) + * MB_A * MB_A, * NB_B * ( PpB0 + NqB0 + NB_B ) ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * PpB0 = NUMROC( P+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NqB0 = NUMROC( N+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia) H(ia+1) . . . H(ia+k-1), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - taua * v * v' * * where taua is a real scalar, and v is a real vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; v(1:n-k+i-1) is stored on exit in * A(ia+m-k+i-1,ja:ja+n-k+i-2), and taua in TAUA(ia+m-k+i-1). * To form Q explicitly, use ScaLAPACK subroutine PSORGRQ. * To use Q to update another matrix, use ScaLAPACK subroutine PSORMRQ. * * The matrix Z is represented as a product of elementary reflectors * * Z = H(jb) H(jb+1) . . . H(jb+k-1), where k = min(p,n). * * Each H(i) has the form * * H(i) = I - taub * v * v' * * where taub is a real scalar, and v is a real vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:p) is stored on exit in * B(ib+i:ib+p-1,jb+i-1), and taub in TAUB(jb+i-1). * To form Z explicitly, use ScaLAPACK subroutine PSORGQR. * To use Z to update another matrix, use ScaLAPACK subroutine PSORMQR. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ) and sub( B ) must verify some * alignment properties, namely the following expression should be true: * * ( NB_A.EQ.NB_B .AND. ICOFFA.EQ.ICOFFB .AND. IACOL.EQ.IBCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. Local Scalars .. LOGICAL LQUERY INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, LWMIN, MPA0, MYCOL, $ MYROW, NPCOL, NPROW, NQA0, NQB0, PPB0 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSGEQRF, $ PSGERQF, PSORMRQ, PXERBLA * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -707 ELSE CALL CHK1MAT( M, 1, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( P, 2, N, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) PPB0 = NUMROC( P+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) NQB0 = NUMROC( N+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) LWMIN = MAX( DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ), $ MAX( MAX( ( DESCA( MB_ )*( DESCA( MB_ ) - 1 ) ) / 2, $ ( PPB0 + NQB0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ), $ DESCB( NB_ ) * ( PPB0 + NQB0 + DESCB( NB_ ) ) ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( IACOL.NE.IBCOL .OR. ICOFFA.NE.ICOFFB ) THEN INFO = -11 ELSE IF( DESCA( NB_ ).NE.DESCB( NB_ ) ) THEN INFO = -1204 ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -1207 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 15 CALL PCHK2MAT( M, 1, N, 3, IA, JA, DESCA, 7, P, 2, N, 3, IB, $ JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSGGRQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * RQ factorization of M-by-N matrix sub( A ): sub( A ) = R*Q * CALL PSGERQF( M, N, A, IA, JA, DESCA, TAUA, WORK, LWORK, INFO ) LWMIN = INT( WORK( 1 ) ) * * Update sub( B ) := sub( B )*Q' * CALL PSORMRQ( 'Right', 'Transpose', P, N, MIN( M, N ), A, $ MAX( IA, IA+M-N ), JA, DESCA, TAUA, B, IB, JB, $ DESCB, WORK, LWORK, INFO ) LWMIN = MAX( LWMIN, INT( WORK( 1 ) ) ) * * QR factorization of P-by-N matrix sub( B ): sub( B ) = Z*T * CALL PSGEQRF( P, N, B, IB, JB, DESCB, TAUB, WORK, LWORK, INFO ) WORK( 1 ) = REAL( MAX( LWMIN, INT( WORK( 1 ) ) ) ) * RETURN * * End of PSGGRQF * END scalapack-2.0.2/SRC/pshseqr.f000644 000766 000024 00000066741 11705175572 016224 0ustar00juliestaff000000 000000 SUBROUTINE PSHSEQR( JOB, COMPZ, N, ILO, IHI, H, DESCH, WR, WI, Z, $ DESCZ, WORK, LWORK, IWORK, LIWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK driver routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IHI, ILO, INFO, LWORK, LIWORK, N CHARACTER COMPZ, JOB * .. * .. Array Arguments .. INTEGER DESCH( * ) , DESCZ( * ), IWORK( * ) REAL H( * ), WI( N ), WORK( * ), WR( N ), Z( * ) * .. * Purpose * ======= * * PSHSEQR computes the eigenvalues of an upper Hessenberg matrix H * and, optionally, the matrices T and Z from the Schur decomposition * H = Z*T*Z**T, where T is an upper quasi-triangular matrix (the * Schur form), and Z is the orthogonal matrix of Schur vectors. * * Optionally Z may be postmultiplied into an input orthogonal * matrix Q so that this routine can give the Schur factorization * of a matrix A which has been reduced to the Hessenberg form H * by the orthogonal matrix Q: A = Q*H*Q**T = (QZ)*T*(QZ)**T. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * JOB (global input) CHARACTER*1 * = 'E': compute eigenvalues only; * = 'S': compute eigenvalues and the Schur form T. * * COMPZ (global input) CHARACTER*1 * = 'N': no Schur vectors are computed; * = 'I': Z is initialized to the unit matrix and the matrix Z * of Schur vectors of H is returned; * = 'V': Z must contain an orthogonal matrix Q on entry, and * the product Q*Z is returned. * * N (global input) INTEGER * The order of the Hessenberg matrix H (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that H is already upper triangular in rows * and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally * set by a previous call to PSGEBAL, and then passed to PSGEHRD * when the matrix output by PSGEBAL is reduced to Hessenberg * form. Otherwise ILO and IHI should be set to 1 and N * respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. * If N = 0, then ILO = 1 and IHI = 0. * * H (global input/output) REAL array, dimension * (DESCH(LLD_),*) * On entry, the upper Hessenberg matrix H. * On exit, if JOB = 'S', H is upper quasi-triangular in * rows and columns ILO:IHI, with 1-by-1 and 2-by-2 blocks on * the main diagonal. The 2-by-2 diagonal blocks (corresponding * to complex conjugate pairs of eigenvalues) are returned in * standard form, with H(i,i) = H(i+1,i+1) and * H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the * contents of H are unspecified on exit. * * DESCH (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix H. * * WR (global output) REAL array, dimension (N) * WI (global output) REAL array, dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If JOB = 'S', the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in H. * * Z (global input/output) REAL array. * If COMPZ = 'V', on entry Z must contain the current * matrix Z of accumulated transformations from, e.g., PSGEHRD, * and on exit Z has been updated; transformations are applied * only to the submatrix Z(ILO:IHI,ILO:IHI). * If COMPZ = 'N', Z is not referenced. * If COMPZ = 'I', on entry Z need not be set and on exit, * if INFO = 0, Z contains the orthogonal matrix Z of the Schur * vectors of H. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace) REAL array, dimension(LWORK) * * LWORK (local input) INTEGER * The length of the workspace array WORK. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the workspace array IWORK. * * INFO (output) INTEGER * = 0: successful exit * .LT. 0: if INFO = -i, the i-th argument had an illegal * value (see also below for -7777 and -8888). * .GT. 0: if INFO = i, PSHSEQR failed to compute all of * the eigenvalues. Elements 1:ilo-1 and i+1:n of WR * and WI contain those eigenvalues which have been * successfully computed. (Failures are rare.) * * If INFO .GT. 0 and JOB = 'E', then on exit, the * remaining unconverged eigenvalues are the eigen- * values of the upper Hessenberg matrix rows and * columns ILO through INFO of the final, output * value of H. * * If INFO .GT. 0 and JOB = 'S', then on exit * * (*) (initial value of H)*U = U*(final value of H) * * where U is an orthogonal matrix. The final * value of H is upper Hessenberg and quasi-triangular * in rows and columns INFO+1 through IHI. * * If INFO .GT. 0 and COMPZ = 'V', then on exit * * (final value of Z) = (initial value of Z)*U * * where U is the orthogonal matrix in (*) (regard- * less of the value of JOB.) * * If INFO .GT. 0 and COMPZ = 'I', then on exit * (final value of Z) = U * where U is the orthogonal matrix in (*) (regard- * less of the value of JOB.) * * If INFO .GT. 0 and COMPZ = 'N', then Z is not * accessed. * * = -7777: PSLAQR0 failed to converge and PSLAQR1 was called * instead. This could happen. Mostly due to a bug. * Please, send a bug report to the authors. * = -8888: PSLAQR1 failed to converge and PSLAQR0 was called * instead. This should not happen. * * ================================================================ * Based on contributions by * Robert Granat, Department of Computing Science and HPC2N, * Umea University, Sweden. * ================================================================ * * Restrictions: The block size in H and Z must be square and larger * than or equal to six (6) due to restrictions in PSLAQR1, PSLAQR5 * and SLAQR6. Moreover, H and Z need to be distributed identically * with the same context. * * ================================================================ * References: * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part I: Maintaining Well Focused * Shifts, and Level 3 Performance. * SIAM J. Matrix Anal. Appl., 23(4):929--947, 2002. * * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part II: Aggressive Early * Deflation. * SIAM J. Matrix Anal. Appl., 23(4):948--973, 2002. * * R. Granat, B. Kagstrom, and D. Kressner, * A Novel Parallel QR Algorithm for Hybrid Distributed Momory HPC * Systems. * SIAM J. Sci. Comput., 32(4):2345--2378, 2010. * * ================================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ LOGICAL CRSOVER PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, $ CRSOVER = .TRUE. ) INTEGER NTINY PARAMETER ( NTINY = 11 ) INTEGER NL PARAMETER ( NL = 49 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0e0, ONE = 1.0e0 ) * .. * .. Local Scalars .. INTEGER I, KBOT, NMIN, LLDH, LLDZ, ICTXT, NPROW, NPCOL, $ MYROW, MYCOL, HROWS, HCOLS, IPW, NH, NB, $ II, JJ, HRSRC, HCSRC, NPROCS, ILOC1, JLOC1, $ HRSRC1, HCSRC1, K, ILOC2, JLOC2, ILOC3, JLOC3, $ ILOC4, JLOC4, HRSRC2, HCSRC2, HRSRC3, HCSRC3, $ HRSRC4, HCSRC4, LIWKOPT LOGICAL INITZ, LQUERY, WANTT, WANTZ, PAIR, BORDER REAL TMP1, TMP2, TMP3, TMP4, DUM1, DUM2, DUM3, $ DUM4, ELEM1, ELEM2, ELEM3, ELEM4, $ CS, SN, ELEM5, TMP, LWKOPT * .. * .. Local Arrays .. INTEGER DESCH2( DLEN_ ) * .. * .. External Functions .. INTEGER PILAENVX, NUMROC, ICEIL LOGICAL LSAME EXTERNAL PILAENVX, LSAME, NUMROC, ICEIL * .. * .. External Subroutines .. EXTERNAL PSLACPY, PSLAQR1, PSLAQR0, PSLASET, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC FLOAT, MAX, MIN * .. * .. Executable Statements .. * * Decode and check the input parameters. * INFO = 0 ICTXT = DESCH( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL IF( NPROW.EQ.-1 ) INFO = -(600+CTXT_) IF( INFO.EQ.0 ) THEN WANTT = LSAME( JOB, 'S' ) INITZ = LSAME( COMPZ, 'I' ) WANTZ = INITZ .OR. LSAME( COMPZ, 'V' ) LLDH = DESCH( LLD_ ) LLDZ = DESCZ( LLD_ ) NB = DESCH( MB_ ) LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.LSAME( JOB, 'E' ) .AND. .NOT.WANTT ) THEN INFO = -1 ELSE IF( .NOT.LSAME( COMPZ, 'N' ) .AND. .NOT.WANTZ ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -4 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -5 ELSEIF( DESCZ( CTXT_ ).NE.DESCH( CTXT_ ) ) THEN INFO = -( 1000+CTXT_ ) ELSEIF( DESCH( MB_ ).NE.DESCH( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSEIF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1000+NB_ ) ELSEIF( DESCH( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1000+MB_ ) ELSEIF( DESCH( MB_ ).LT.6 ) THEN INFO = -( 700+NB_ ) ELSEIF( DESCZ( MB_ ).LT.6 ) THEN INFO = -( 1000+MB_ ) ELSE CALL CHK1MAT( N, 3, N, 3, 1, 1, DESCH, 7, INFO ) IF( INFO.EQ.0 ) $ CALL CHK1MAT( N, 3, N, 3, 1, 1, DESCZ, 11, INFO ) IF( INFO.EQ.0 ) $ CALL PCHK2MAT( N, 3, N, 3, 1, 1, DESCH, 7, N, 3, N, 3, $ 1, 1, DESCZ, 11, 0, IWORK, IWORK, INFO ) END IF END IF * * Compute required workspace. * CALL PSLAQR1( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILO, IHI, Z, DESCZ, WORK, -1, IWORK, -1, INFO ) LWKOPT = WORK(1) LIWKOPT = IWORK(1) CALL PSLAQR0( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILO, IHI, Z, DESCZ, WORK, -1, IWORK, -1, INFO, 0 ) IF( N.LT.NL ) THEN HROWS = NUMROC( NL, NB, MYROW, DESCH(RSRC_), NPROW ) HCOLS = NUMROC( NL, NB, MYCOL, DESCH(CSRC_), NPCOL ) WORK(1) = WORK(1) + FLOAT(2*HROWS*HCOLS) END IF LWKOPT = MAX( LWKOPT, WORK(1) ) LIWKOPT = MAX( LIWKOPT, IWORK(1) ) WORK(1) = LWKOPT IWORK(1) = LIWKOPT * IF( .NOT.LQUERY .AND. LWORK.LT.INT(LWKOPT) ) THEN INFO = -13 ELSEIF( .NOT.LQUERY .AND. LIWORK.LT.LIWKOPT ) THEN INFO = -15 END IF * IF( INFO.NE.0 ) THEN * * Quick return in case of invalid argument. * CALL PXERBLA( 'PSHSEQR', -INFO ) RETURN * ELSE IF( N.EQ.0 ) THEN * * Quick return in case N = 0; nothing to do. * RETURN * ELSE IF( LQUERY ) THEN * * Quick return in case of a workspace query. * RETURN * ELSE * * Copy eigenvalues isolated by PSGEBAL. * DO 10 I = 1, ILO - 1 CALL INFOG2L( I, I, DESCH, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, HRSRC, HCSRC ) IF( MYROW.EQ.HRSRC .AND. MYCOL.EQ.HCSRC ) THEN WR( I ) = H( (JJ-1)*LLDH + II ) ELSE WR( I ) = ZERO END IF WI( I ) = ZERO 10 CONTINUE IF( ILO.GT.1 ) $ CALL SGSUM2D( ICTXT, 'All', '1-Tree', ILO-1, 1, WR, N, -1, $ -1 ) DO 20 I = IHI + 1, N CALL INFOG2L( I, I, DESCH, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, HRSRC, HCSRC ) IF( MYROW.EQ.HRSRC .AND. MYCOL.EQ.HCSRC ) THEN WR( I ) = H( (JJ-1)*LLDH + II ) ELSE WR( I ) = ZERO END IF WI( I ) = ZERO 20 CONTINUE IF( IHI.LT.N ) $ CALL SGSUM2D( ICTXT, 'All', '1-Tree', N-IHI, 1, WR(IHI+1), $ N, -1, -1 ) * * Initialize Z, if requested. * IF( INITZ ) $ CALL PSLASET( 'A', N, N, ZERO, ONE, Z, 1, 1, DESCZ ) * * Quick return if possible. * NPROCS = NPROW*NPCOL IF( ILO.EQ.IHI ) THEN CALL INFOG2L( ILO, ILO, DESCH, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, HRSRC, HCSRC ) IF( MYROW.EQ.HRSRC .AND. MYCOL.EQ.HCSRC ) THEN WR( ILO ) = H( (JJ-1)*LLDH + II ) IF( NPROCS.GT.1 ) $ CALL SGEBS2D( ICTXT, 'All', '1-Tree', 1, 1, WR(ILO), $ 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', '1-Tree', 1, 1, WR(ILO), $ 1, HRSRC, HCSRC ) END IF WI( ILO ) = ZERO RETURN END IF * * PSLAQR1/PSLAQR0 crossover point. * NH = IHI-ILO+1 NMIN = PILAENVX( ICTXT, 12, 'PSHSEQR', $ JOB( : 1 ) // COMPZ( : 1 ), N, ILO, IHI, LWORK ) NMIN = MAX( NTINY, NMIN ) * * PSLAQR0 for big matrices; PSLAQR1 for small ones. * IF( (.NOT. CRSOVER .AND. NH.GT.NTINY) .OR. NH.GT.NMIN .OR. $ DESCH(RSRC_).NE.0 .OR. DESCH(CSRC_).NE.0 ) THEN CALL PSLAQR0( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILO, IHI, Z, DESCZ, WORK, LWORK, IWORK, LIWORK, INFO, $ 0 ) IF( INFO.GT.0 .AND. ( DESCH(RSRC_).NE.0 .OR. $ DESCH(CSRC_).NE.0 ) ) THEN * * A rare PSLAQR0 failure! PSLAQR1 sometimes succeeds * when PSLAQR0 fails. * KBOT = INFO CALL PSLAQR1( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, $ WI, ILO, IHI, Z, DESCZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) INFO = -7777 END IF ELSE * * Small matrix. * CALL PSLAQR1( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILO, IHI, Z, DESCZ, WORK, LWORK, IWORK, LIWORK, INFO ) * IF( INFO.GT.0 ) THEN * * A rare PSLAQR1 failure! PSLAQR0 sometimes succeeds * when PSLAQR1 fails. * KBOT = INFO * IF( N.GE.NL ) THEN * * Larger matrices have enough subdiagonal scratch * space to call PSLAQR0 directly. * CALL PSLAQR0( WANTT, WANTZ, N, ILO, KBOT, H, DESCH, $ WR, WI, ILO, IHI, Z, DESCZ, WORK, LWORK, $ IWORK, LIWORK, INFO, 0 ) ELSE * * Tiny matrices don't have enough subdiagonal * scratch space to benefit from PSLAQR0. Hence, * tiny matrices must be copied into a larger * array before calling PSLAQR0. * HROWS = NUMROC( NL, NB, MYROW, DESCH(RSRC_), NPROW ) HCOLS = NUMROC( NL, NB, MYCOL, DESCH(CSRC_), NPCOL ) CALL DESCINIT( DESCH2, NL, NL, NB, NB, DESCH(RSRC_), $ DESCH(CSRC_), ICTXT, MAX(1, HROWS), INFO ) CALL PSLACPY( 'All', N, N, H, 1, 1, DESCH, WORK, 1, $ 1, DESCH2 ) CALL PSELSET( WORK, N+1, N, DESCH2, ZERO ) CALL PSLASET( 'All', NL, NL-N, ZERO, ZERO, WORK, 1, $ N+1, DESCH2 ) IPW = 1 + DESCH2(LLD_)*HCOLS CALL PSLAQR0( WANTT, WANTZ, NL, ILO, KBOT, WORK, $ DESCH2, WR, WI, ILO, IHI, Z, DESCZ, $ WORK(IPW), LWORK-IPW+1, IWORK, $ LIWORK, INFO, 0 ) IF( WANTT .OR. INFO.NE.0 ) $ CALL PSLACPY( 'All', N, N, WORK, 1, 1, DESCH2, $ H, 1, 1, DESCH ) END IF INFO = -8888 END IF END IF * * Clear out the trash, if necessary. * IF( ( WANTT .OR. INFO.NE.0 ) .AND. N.GT.2 ) $ CALL PSLASET( 'L', N-2, N-2, ZERO, ZERO, H, 3, 1, DESCH ) * * Force any 2-by-2 blocks to be complex conjugate pairs of * eigenvalues by removing false such blocks. * DO 30 I = ILO, IHI-1 CALL PSELGET( 'All', ' ', TMP3, H, I+1, I, DESCH ) IF( TMP3.NE.0.0E+00 ) THEN CALL PSELGET( 'All', ' ', TMP1, H, I, I, DESCH ) CALL PSELGET( 'All', ' ', TMP2, H, I, I+1, DESCH ) CALL PSELGET( 'All', ' ', TMP4, H, I+1, I+1, DESCH ) CALL SLANV2( TMP1, TMP2, TMP3, TMP4, DUM1, DUM2, DUM3, $ DUM4, CS, SN ) IF( TMP3.EQ.0.0E+00 ) THEN IF( WANTT ) THEN IF( I+2.LE.N ) $ CALL PSROT( N-I-1, H, I, I+2, DESCH, $ DESCH(M_), H, I+1, I+2, DESCH, DESCH(M_), $ CS, SN, WORK, LWORK, INFO ) CALL PSROT( I-1, H, 1, I, DESCH, 1, H, 1, I+1, $ DESCH, 1, CS, SN, WORK, LWORK, INFO ) END IF IF( WANTZ ) THEN CALL PSROT( N, Z, 1, I, DESCZ, 1, Z, 1, I+1, DESCZ, $ 1, CS, SN, WORK, LWORK, INFO ) END IF CALL PSELSET( H, I, I, DESCH, TMP1 ) CALL PSELSET( H, I, I+1, DESCH, TMP2 ) CALL PSELSET( H, I+1, I, DESCH, TMP3 ) CALL PSELSET( H, I+1, I+1, DESCH, TMP4 ) END IF END IF 30 CONTINUE * * Read out eigenvalues: first let all the processes compute the * eigenvalue inside their diagonal blocks in parallel, except for * the eigenvalue located next to a block border. After that, * compute all eigenvalues located next to the block borders. * Finally, do a global summation over WR and WI so that all * processors receive the result. * DO 40 K = ILO, IHI WR( K ) = ZERO WI( K ) = ZERO 40 CONTINUE NB = DESCH( MB_ ) * * Loop 50: extract eigenvalues from the blocks which are not laid * out across a border of the processor mesh, except for those 1x1 * blocks on the border. * PAIR = .FALSE. DO 50 K = ILO, IHI IF( .NOT. PAIR ) THEN BORDER = MOD( K, NB ).EQ.0 .OR. ( K.NE.1 .AND. $ MOD( K, NB ).EQ.1 ) IF( .NOT. BORDER ) THEN CALL INFOG2L( K, K, DESCH, NPROW, NPCOL, MYROW, $ MYCOL, ILOC1, JLOC1, HRSRC1, HCSRC1 ) IF( MYROW.EQ.HRSRC1 .AND. MYCOL.EQ.HCSRC1 ) THEN ELEM1 = H((JLOC1-1)*LLDH+ILOC1) IF( K.LT.N ) THEN ELEM3 = H((JLOC1-1)*LLDH+ILOC1+1) ELSE ELEM3 = ZERO END IF IF( ELEM3.NE.ZERO ) THEN ELEM2 = H((JLOC1)*LLDH+ILOC1) ELEM4 = H((JLOC1)*LLDH+ILOC1+1) CALL SLANV2( ELEM1, ELEM2, ELEM3, ELEM4, $ WR( K ), WI( K ), WR( K+1 ), WI( K+1 ), $ SN, CS ) PAIR = .TRUE. ELSE IF( K.GT.1 ) THEN TMP = H((JLOC1-2)*LLDH+ILOC1) IF( TMP.NE.ZERO ) THEN ELEM1 = H((JLOC1-2)*LLDH+ILOC1-1) ELEM2 = H((JLOC1-1)*LLDH+ILOC1-1) ELEM3 = H((JLOC1-2)*LLDH+ILOC1) ELEM4 = H((JLOC1-1)*LLDH+ILOC1) CALL SLANV2( ELEM1, ELEM2, ELEM3, $ ELEM4, WR( K-1 ), WI( K-1 ), $ WR( K ), WI( K ), SN, CS ) ELSE WR( K ) = ELEM1 END IF ELSE WR( K ) = ELEM1 END IF END IF END IF END IF ELSE PAIR = .FALSE. END IF 50 CONTINUE * * Loop 60: extract eigenvalues from the blocks which are laid * out across a border of the processor mesh. The processors are * numbered as below: * * 1 | 2 * --+-- * 3 | 4 * DO 60 K = ICEIL(ILO,NB)*NB, IHI-1, NB CALL INFOG2L( K, K, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC1, HRSRC1, HCSRC1 ) CALL INFOG2L( K, K+1, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC2, JLOC2, HRSRC2, HCSRC2 ) CALL INFOG2L( K+1, K, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC3, JLOC3, HRSRC3, HCSRC3 ) CALL INFOG2L( K+1, K+1, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC4, HRSRC4, HCSRC4 ) IF( MYROW.EQ.HRSRC2 .AND. MYCOL.EQ.HCSRC2 ) THEN ELEM2 = H((JLOC2-1)*LLDH+ILOC2) IF( HRSRC1.NE.HRSRC2 .OR. HCSRC1.NE.HCSRC2 ) $ CALL SGESD2D( ICTXT, 1, 1, ELEM2, 1, HRSRC1, HCSRC1) END IF IF( MYROW.EQ.HRSRC3 .AND. MYCOL.EQ.HCSRC3 ) THEN ELEM3 = H((JLOC3-1)*LLDH+ILOC3) IF( HRSRC1.NE.HRSRC3 .OR. HCSRC1.NE.HCSRC3 ) $ CALL SGESD2D( ICTXT, 1, 1, ELEM3, 1, HRSRC1, HCSRC1) END IF IF( MYROW.EQ.HRSRC4 .AND. MYCOL.EQ.HCSRC4 ) THEN WORK(1) = H((JLOC4-1)*LLDH+ILOC4) IF( K+1.LT.N ) THEN WORK(2) = H((JLOC4-1)*LLDH+ILOC4+1) ELSE WORK(2) = ZERO END IF IF( HRSRC1.NE.HRSRC4 .OR. HCSRC1.NE.HCSRC4 ) $ CALL SGESD2D( ICTXT, 2, 1, WORK, 2, HRSRC1, HCSRC1 ) END IF IF( MYROW.EQ.HRSRC1 .AND. MYCOL.EQ.HCSRC1 ) THEN ELEM1 = H((JLOC1-1)*LLDH+ILOC1) IF( HRSRC1.NE.HRSRC2 .OR. HCSRC1.NE.HCSRC2 ) $ CALL SGERV2D( ICTXT, 1, 1, ELEM2, 1, HRSRC2, HCSRC2) IF( HRSRC1.NE.HRSRC3 .OR. HCSRC1.NE.HCSRC3 ) $ CALL SGERV2D( ICTXT, 1, 1, ELEM3, 1, HRSRC3, HCSRC3) IF( HRSRC1.NE.HRSRC4 .OR. HCSRC1.NE.HCSRC4 ) $ CALL SGERV2D( ICTXT, 2, 1, WORK, 2, HRSRC4, HCSRC4 ) ELEM4 = WORK(1) ELEM5 = WORK(2) IF( ELEM5.EQ.ZERO ) THEN IF( WR( K ).EQ.ZERO .AND. WI( K ).EQ.ZERO ) THEN CALL SLANV2( ELEM1, ELEM2, ELEM3, ELEM4, WR( K ), $ WI( K ), WR( K+1 ), WI( K+1 ), SN, CS ) ELSEIF( WR( K+1 ).EQ.ZERO .AND. WI( K+1 ).EQ.ZERO ) $ THEN WR( K+1 ) = ELEM4 END IF ELSEIF( WR( K ).EQ.ZERO .AND. WI( K ).EQ.ZERO ) $ THEN WR( K ) = ELEM1 END IF END IF 60 CONTINUE * IF( NPROCS.GT.1 ) THEN CALL SGSUM2D( ICTXT, 'All', ' ', IHI-ILO+1, 1, WR(ILO), N, $ -1, -1 ) CALL SGSUM2D( ICTXT, 'All', ' ', IHI-ILO+1, 1, WI(ILO), N, $ -1, -1 ) END IF * END IF * WORK(1) = LWKOPT IWORK(1) = LIWKOPT RETURN * * End of PSHSEQR * END scalapack-2.0.2/SRC/pslabad.f000644 000766 000024 00000004761 11622500733 016124 0ustar00juliestaff000000 000000 SUBROUTINE PSLABAD( ICTXT, SMALL, LARGE ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER ICTXT REAL LARGE, SMALL * .. * * Purpose * ======= * * PSLABAD takes as input the values computed by PSLAMCH for underflow * and overflow, and returns the square root of each of these values if * the log of LARGE is sufficiently large. This subroutine is intended * to identify machines with a large exponent range, such as the Crays, * and redefine the underflow and overflow limits to be the square roots * of the values computed by PSLAMCH. This subroutine is needed because * PSLAMCH does not compensate for poor arithmetic in the upper half of * the exponent range, as is found on a Cray. * * In addition, this routine performs a global minimization and maximi- * zation on these values, to support heterogeneous computing networks. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle in which the computation takes * place. * * SMALL (local input/local output) REAL * On entry, the underflow threshold as computed by PSLAMCH. * On exit, if LOG10(LARGE) is sufficiently large, the square * root of SMALL, otherwise unchanged. * * LARGE (local input/local output) REAL * On entry, the overflow threshold as computed by PSLAMCH. * On exit, if LOG10(LARGE) is sufficiently large, the square * root of LARGE, otherwise unchanged. * * ===================================================================== * * .. Local Scalars .. INTEGER IDUMM * .. * .. External Subroutines .. EXTERNAL SGAMN2D, SGAMX2D * .. * .. Intrinsic Functions .. INTRINSIC LOG10, SQRT * .. * .. Executable Statements .. * * If it looks like we're on a Cray, take the square root of * SMALL and LARGE to avoid overflow and underflow problems. * IF( LOG10( LARGE ).GT.2000. ) THEN SMALL = SQRT( SMALL ) LARGE = SQRT( LARGE ) END IF IDUMM = 0 * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, SMALL, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) CALL SGAMN2D( ICTXT, 'All', ' ', 1, 1, LARGE, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) * RETURN * * End of PSLABAD * END scalapack-2.0.2/SRC/pslabrd.f000644 000766 000024 00000052641 10363532303 016144 0ustar00juliestaff000000 000000 SUBROUTINE PSLABRD( M, N, NB, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ X, IX, JX, DESCX, Y, IY, JY, DESCY, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IX, IY, JA, JX, JY, M, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL A( * ), D( * ), E( * ), TAUP( * ), $ TAUQ( * ), X( * ), Y( * ), WORK( * ) * .. * * Purpose * ======= * * PSLABRD reduces the first NB rows and columns of a real general * M-by-N distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper * or lower bidiagonal form by an orthogonal transformation Q' * A * P, * and returns the matrices X and Y which are needed to apply the * transformation to the unreduced part of sub( A ). * * If M >= N, sub( A ) is reduced to upper bidiagonal form; if M < N, to * lower bidiagonal form. * * This is an auxiliary routine called by PSGEBRD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NB (global input) INTEGER * The number of leading rows and columns of sub( A ) to be * reduced. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ) to be reduced. On exit, * the first NB rows and columns of the matrix are overwritten; * the rest of the distributed matrix sub( A ) is unchanged. * If m >= n, elements on and below the diagonal in the first NB * columns, with the array TAUQ, represent the orthogonal * matrix Q as a product of elementary reflectors; and * elements above the diagonal in the first NB rows, with the * array TAUP, represent the orthogonal matrix P as a product * of elementary reflectors. * If m < n, elements below the diagonal in the first NB * columns, with the array TAUQ, represent the orthogonal * matrix Q as a product of elementary reflectors, and * elements on and above the diagonal in the first NB rows, * with the array TAUP, represent the orthogonal matrix P as * a product of elementary reflectors. * See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(ia+i-1,ja+i-1). D is tied to the distributed * matrix A. * * E (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(ia+i-1,ja+i) for i = 1,2,...,n-1; * if m < n, E(i) = A(ia+i,ja+i-1) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) REAL array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix Q. TAUQ * is tied to the distributed matrix A. See Further Details. * * TAUP (local output) REAL array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the orthogonal matrix P. TAUP * is tied to the distributed matrix A. See Further Details. * * X (local output) REAL pointer into the local memory * to an array of dimension (LLD_X,NB). On exit, the local * pieces of the distributed M-by-NB matrix * X(IX:IX+M-1,JX:JX+NB-1) required to update the unreduced * part of sub( A ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * Y (local output) REAL pointer into the local memory * to an array of dimension (LLD_Y,NB). On exit, the local * pieces of the distributed N-by-NB matrix * Y(IY:IY+N-1,JY:JY+NB-1) required to update the unreduced * part of sub( A ). * * IY (global input) INTEGER * The row index in the global array Y indicating the first * row of sub( Y ). * * JY (global input) INTEGER * The column index in the global array Y indicating the * first column of sub( Y ). * * DESCY (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Y. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK >= NB_A + NQ, with * * NQ = NUMROC( N+MOD( IA-1, NB_Y ), NB_Y, MYCOL, IACOL, NPCOL ) * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are real scalars, and v and u are real vectors. * * If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in * A(ia+i-1:ia+m-1,ja+i-1); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is * stored on exit in A(ia+i-1,ja+i:ja+n-1); tauq is stored in * TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); u(1:i-1) = 0, u(i) = 1, and u(i:n) is * stored on exit in A(ia+i-1,ja+i:ja+n-1); tauq is stored in * TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The elements of the vectors v and u together form the m-by-nb matrix * V and the nb-by-n matrix U' which are needed, with X and Y, to apply * the transformation to the unreduced part of the matrix, using a block * update of the form: sub( A ) := sub( A ) - V*Y' - X*U'. * * The contents of sub( A ) on exit are illustrated by the following * examples with nb = 2: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) * ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) * ( v1 v2 a a a ) ( v1 1 a a a a ) * ( v1 v2 a a a ) ( v1 v2 a a a a ) * ( v1 v2 a a a ) ( v1 v2 a a a a ) * ( v1 v2 a a a ) * * where a denotes an element of the original matrix which is unchanged, * vi denotes an element of the vector defining H(i), and ui an element * of the vector defining G(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, IPY, IW, J, JJ, $ JWY, K, MYCOL, MYROW, NPCOL, NPROW REAL ALPHA, TAU INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), $ DESCTP( DLEN_ ), DESCTQ( DLEN_ ), $ DESCW( DLEN_ ), DESCWY( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSCOPY, $ PSELGET, PSELSET, PSGEMV, PSLARFG, $ PSSCAL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) IPY = DESCA( MB_ ) + 1 IW = MOD( IA-1, DESCA( NB_ ) ) + 1 ALPHA = ZERO * CALL DESCSET( DESCWY, 1, N+MOD( IA-1, DESCY( NB_ ) ), 1, $ DESCA( NB_ ), IAROW, IACOL, ICTXT, 1 ) CALL DESCSET( DESCW, DESCA( MB_ ), 1, DESCA( MB_ ), 1, IAROW, $ IACOL, ICTXT, DESCA( MB_ ) ) CALL DESCSET( DESCTQ, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), IAROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCTP, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), IACOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) * IF( M.GE.N ) THEN * * Reduce to upper bidiagonal form * CALL DESCSET( DESCD, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) DO 10 K = 1, NB I = IA + K - 1 J = JA + K - 1 JWY = IW + K * * Update A(i:ia+m-1,j) * IF( K.GT.1 ) THEN CALL PSGEMV( 'No transpose', M-K+1, K-1, -ONE, A, I, JA, $ DESCA, Y, IY, JY+K-1, DESCY, 1, ONE, A, I, $ J, DESCA, 1 ) CALL PSGEMV( 'No transpose', M-K+1, K-1, -ONE, X, IX+K-1, $ JX, DESCX, A, IA, J, DESCA, 1, ONE, A, I, J, $ DESCA, 1 ) CALL PSELSET( A, I-1, J, DESCA, ALPHA ) END IF * * Generate reflection Q(i) to annihilate A(i+1:ia+m-1,j) * CALL PSLARFG( M-K+1, ALPHA, I, J, A, I+1, J, DESCA, 1, $ TAUQ ) CALL PSELSET( D, 1, J, DESCD, ALPHA ) CALL PSELSET( A, I, J, DESCA, ONE ) * * Compute Y(IA+I:IA+N-1,J) * CALL PSGEMV( 'Transpose', M-K+1, N-K, ONE, A, I, J+1, DESCA, $ A, I, J, DESCA, 1, ZERO, WORK( IPY ), 1, JWY, $ DESCWY, DESCWY( M_ ) ) CALL PSGEMV( 'Transpose', M-K+1, K-1, ONE, A, I, JA, DESCA, $ A, I, J, DESCA, 1, ZERO, WORK, IW, 1, DESCW, $ 1 ) CALL PSGEMV( 'Transpose', K-1, N-K, -ONE, Y, IY, JY+K, $ DESCY, WORK, IW, 1, DESCW, 1, ONE, WORK( IPY ), $ 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PSGEMV( 'Transpose', M-K+1, K-1, ONE, X, IX+K-1, JX, $ DESCX, A, I, J, DESCA, 1, ZERO, WORK, IW, 1, $ DESCW, 1 ) CALL PSGEMV( 'Transpose', K-1, N-K, -ONE, A, IA, J+1, DESCA, $ WORK, IW, 1, DESCW, 1, ONE, WORK( IPY ), 1, $ JWY, DESCWY, DESCWY( M_ ) ) * CALL PSELGET( 'Rowwise', ' ', TAU, TAUQ, 1, J, DESCTQ ) CALL PSSCAL( N-K, TAU, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PSCOPY( N-K, WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ), $ Y, IY+K-1, JY+K, DESCY, DESCY( M_ ) ) * * Update A(i,j+1:ja+n-1) * CALL PSGEMV( 'Transpose', K, N-K, -ONE, Y, IY, JY+K, DESCY, $ A, I, JA, DESCA, DESCA( M_ ), ONE, A, I, J+1, $ DESCA, DESCA( M_ ) ) CALL PSGEMV( 'Transpose', K-1, N-K, -ONE, A, IA, J+1, DESCA, $ X, IX+K-1, JX, DESCX, DESCX( M_ ), ONE, A, I, $ J+1, DESCA, DESCA( M_ ) ) CALL PSELSET( A, I, J, DESCA, ALPHA ) * * Generate reflection P(i) to annihilate A(i,j+2:ja+n-1) * CALL PSLARFG( N-K, ALPHA, I, J+1, A, I, $ MIN( J+2, N+JA-1 ), DESCA, DESCA( M_ ), TAUP ) CALL PSELSET( E, I, 1, DESCE, ALPHA ) CALL PSELSET( A, I, J+1, DESCA, ONE ) * * Compute X(I+1:IA+M-1,J) * CALL PSGEMV( 'No transpose', M-K, N-K, ONE, A, I+1, J+1, $ DESCA, A, I, J+1, DESCA, DESCA( M_ ), ZERO, X, $ IX+K, JX+K-1, DESCX, 1 ) CALL PSGEMV( 'No transpose', K, N-K, ONE, Y, IY, JY+K, $ DESCY, A, I, J+1, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PSGEMV( 'No transpose', M-K, K, -ONE, A, I+1, JA, $ DESCA, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) CALL PSGEMV( 'No transpose', K-1, N-K, ONE, A, IA, J+1, $ DESCA, A, I, J+1, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PSGEMV( 'No transpose', M-K, K-1, -ONE, X, IX+K, JX, $ DESCX, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) * CALL PSELGET( 'Columnwise', ' ', TAU, TAUP, I, 1, DESCTP ) CALL PSSCAL( M-K, TAU, X, IX+K, JX+K-1, DESCX, 1 ) 10 CONTINUE * ELSE * * Reduce to lower bidiagonal form * CALL DESCSET( DESCD, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) DO 20 K = 1, NB I = IA + K - 1 J = JA + K - 1 JWY = IW + K * * Update A(i,j:ja+n-1) * IF( K.GT.1 ) THEN CALL PSGEMV( 'Transpose', K-1, N-K+1, -ONE, Y, IY, $ JY+K-1, DESCY, A, I, JA, DESCA, DESCA( M_ ), $ ONE, A, I, J, DESCA, DESCA( M_ ) ) CALL PSGEMV( 'Transpose', K-1, N-K+1, -ONE, A, IA, J, $ DESCA, X, IX+K-1, JX, DESCX, DESCX( M_ ), $ ONE, A, I, J, DESCA, DESCA( M_ ) ) CALL PSELSET( A, I, J-1, DESCA, ALPHA ) END IF * * Generate reflection P(i) to annihilate A(i,j+1:ja+n-1) * CALL PSLARFG( N-K+1, ALPHA, I, J, A, I, J+1, DESCA, $ DESCA( M_ ), TAUP ) CALL PSELSET( D, I, 1, DESCD, ALPHA ) CALL PSELSET( A, I, J, DESCA, ONE ) * * Compute X(i+1:ia+m-1,j) * CALL PSGEMV( 'No transpose', M-K, N-K+1, ONE, A, I+1, J, $ DESCA, A, I, J, DESCA, DESCA( M_ ), ZERO, X, $ IX+K, JX+K-1, DESCX, 1 ) CALL PSGEMV( 'No transpose', K-1, N-K+1, ONE, Y, IY, JY+K-1, $ DESCY, A, I, J, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PSGEMV( 'No transpose', M-K, K-1, -ONE, A, I+1, JA, $ DESCA, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) CALL PSGEMV( 'No transpose', K-1, N-K+1, ONE, A, IA, J, $ DESCA, A, I, J, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PSGEMV( 'No transpose', M-K, K-1, -ONE, X, IX+K, JX, $ DESCX, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) * CALL PSELGET( 'Columnwise', ' ', TAU, TAUP, I, 1, DESCTP ) CALL PSSCAL( M-K, TAU, X, IX+K, JX+K-1, DESCX, 1 ) * * Update A(i+1:ia+m-1,j) * CALL PSGEMV( 'No transpose', M-K, K-1, -ONE, A, I+1, JA, $ DESCA, Y, IY, JY+K-1, DESCY, 1, ONE, A, I+1, J, $ DESCA, 1 ) CALL PSGEMV( 'No transpose', M-K, K, -ONE, X, IX+K, JX, $ DESCX, A, IA, J, DESCA, 1, ONE, A, I+1, J, $ DESCA, 1 ) CALL PSELSET( A, I, J, DESCA, ALPHA ) * * Generate reflection Q(i) to annihilate A(i+2:ia+m-1,j) * CALL PSLARFG( M-K, ALPHA, I+1, J, A, MIN( I+2, M+IA-1 ), $ J, DESCA, 1, TAUQ ) CALL PSELSET( E, 1, J, DESCE, ALPHA ) CALL PSELSET( A, I+1, J, DESCA, ONE ) * * Compute Y(ia+i:ia+n-1,j) * CALL PSGEMV( 'Transpose', M-K, N-K, ONE, A, I+1, J+1, DESCA, $ A, I+1, J, DESCA, 1, ZERO, WORK( IPY ), 1, $ JWY, DESCWY, DESCWY( M_ ) ) CALL PSGEMV( 'Transpose', M-K, K-1, ONE, A, I+1, JA, DESCA, $ A, I+1, J, DESCA, 1, ZERO, WORK, IW, 1, DESCW, $ 1 ) CALL PSGEMV( 'Transpose', K-1, N-K, -ONE, Y, IY, JY+K, $ DESCY, WORK, IW, 1, DESCW, 1, ONE, WORK( IPY ), $ 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PSGEMV( 'Transpose', M-K, K, ONE, X, IX+K, JX, DESCX, $ A, I+1, J, DESCA, 1, ZERO, WORK, IW, 1, DESCW, $ 1 ) CALL PSGEMV( 'Transpose', K, N-K, -ONE, A, IA, J+1, DESCA, $ WORK, IW, 1, DESCW, 1, ONE, WORK( IPY ), 1, $ JWY, DESCWY, DESCWY( M_ ) ) * CALL PSELGET( 'Rowwise', ' ', TAU, TAUQ, 1, J, DESCTQ ) CALL PSSCAL( N-K, TAU, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PSCOPY( N-K, WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ), $ Y, IY+K-1, JY+K, DESCY, DESCY( M_ ) ) 20 CONTINUE END IF * RETURN * * End of PSLABRD * END scalapack-2.0.2/SRC/pslacon.f000644 000766 000024 00000032265 10363532303 016154 0ustar00juliestaff000000 000000 SUBROUTINE PSLACON( N, V, IV, JV, DESCV, X, IX, JX, DESCX, ISGN, $ EST, KASE ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IV, IX, JV, JX, KASE, N REAL EST * .. * .. Array Arguments .. INTEGER DESCV( * ), DESCX( * ), ISGN( * ) REAL V( * ), X( * ) * .. * * Purpose * ======= * * PSLACON estimates the 1-norm of a square, real distributed matrix A. * Reverse communication is used for evaluating matrix-vector products. * X and V are aligned with the distributed matrix A, this information * is implicitly contained within IV, IX, DESCV, and DESCX. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vectors V and X. N >= 0. * * V (local workspace) REAL pointer into the local * memory to an array of dimension LOCr(N+MOD(IV-1,MB_V)). On * the final return, V = A*W, where EST = norm(V)/norm(W) * (W is not returned). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * X (local input/local output) REAL pointer into the * local memory to an array of dimension * LOCr(N+MOD(IX-1,MB_X)). On an intermediate return, X * should be overwritten by * A * X, if KASE=1, * A' * X, if KASE=2, * PSLACON must be re-called with all the other parameters * unchanged. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * ISGN (local workspace) INTEGER array, dimension * LOCr(N+MOD(IX-1,MB_X)). ISGN is aligned with X and V. * * * EST (global output) REAL * An estimate (a lower bound) for norm(A). * * KASE (local input/local output) INTEGER * On the initial call to PSLACON, KASE should be 0. * On an intermediate return, KASE will be 1 or 2, indicating * whether X should be overwritten by A * X or A' * X. * On the final return from PSLACON, KASE will again be 0. * * Further Details * =============== * * The serial version SLACON has been contributed by Nick Higham, * University of Manchester. It was originally named SONEST, dated * March 16, 1988. * * Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of * a real or complex matrix, with applications to condition estimation", * ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) REAL ZERO, ONE, TWO PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, IFLAG, IIVX, IMAXROW, IOFFVX, IROFF, $ ITER, IVXCOL, IVXROW, J, JLAST, JJVX, JUMP, $ K, MYCOL, MYROW, NP, NPCOL, NPROW REAL ALTSGN, ESTOLD, JLMAX, TEMP, XMAX * .. * .. Local Arrays .. REAL WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, INFOG2L, PSAMAX, $ PSASUM, PSELGET, SGEBR2D, $ SGEBS2D, SCOPY * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, INDXL2G, NUMROC EXTERNAL INDXG2L, INDXG2P, INDXL2G, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD, NINT, REAL, SIGN * .. * .. Save statement .. SAVE * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIVX, JJVX, IVXROW, IVXCOL ) IF( MYCOL.NE.IVXCOL ) $ RETURN IROFF = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IVXROW, NPROW ) IF( MYROW.EQ.IVXROW ) $ NP = NP - IROFF IOFFVX = IIVX + (JJVX-1)*DESCX( LLD_ ) * IF( KASE.EQ.0 ) THEN DO 10 I = IOFFVX, IOFFVX+NP-1 X( I ) = ONE / REAL( N ) 10 CONTINUE KASE = 1 JUMP = 1 RETURN END IF * GO TO ( 20, 40, 70, 110, 140 )JUMP * * ................ ENTRY (JUMP = 1) * FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X * 20 CONTINUE IF( N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN V( IOFFVX ) = X( IOFFVX ) EST = ABS( V( IOFFVX ) ) CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF * ... QUIT GO TO 150 END IF CALL PSASUM( N, EST, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF END IF * DO 30 I = IOFFVX, IOFFVX+NP-1 X( I ) = SIGN( ONE, X( I ) ) ISGN( I ) = NINT( X( I ) ) 30 CONTINUE KASE = 2 JUMP = 2 RETURN * * ................ ENTRY (JUMP = 2) * FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X * 40 CONTINUE CALL PSAMAX( N, XMAX, J, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN WORK( 1 ) = XMAX WORK( 2 ) = REAL( J ) CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2, $ IVXROW, MYCOL ) XMAX = WORK( 1 ) J = NINT( WORK( 2 ) ) END IF END IF ITER = 2 * * MAIN LOOP - ITERATIONS 2, 3,...,ITMAX * 50 CONTINUE DO 60 I = IOFFVX, IOFFVX+NP-1 X( I ) = ZERO 60 CONTINUE IMAXROW = INDXG2P( J, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) IF( MYROW.EQ.IMAXROW ) THEN I = INDXG2L( J, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) X( I ) = ONE END IF KASE = 1 JUMP = 3 RETURN * * ................ ENTRY (JUMP = 3) * X HAS BEEN OVERWRITTEN BY A*X * 70 CONTINUE CALL SCOPY( NP, X( IOFFVX ), 1, V( IOFFVX ), 1 ) ESTOLD = EST CALL PSASUM( N, EST, V, IV, JV, DESCV, 1 ) IF( DESCV( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF END IF IFLAG = 0 DO 80 I = IOFFVX, IOFFVX+NP-1 IF( NINT( SIGN( ONE, X( I ) ) ).NE.ISGN( I ) ) THEN IFLAG = 1 GO TO 90 END IF 80 CONTINUE * 90 CONTINUE CALL IGSUM2D( ICTXT, 'C', ' ', 1, 1, IFLAG, 1, -1, MYCOL ) * * REPEATED SIGN VECTOR DETECTED, HENCE ALGORITHM HAS CONVERGED. * ALONG WITH IT, TEST FOR CYCLING. * IF( IFLAG.EQ.0 .OR. EST.LE.ESTOLD ) $ GO TO 120 * DO 100 I = IOFFVX, IOFFVX+NP-1 X( I ) = SIGN( ONE, X( I ) ) ISGN( I ) = NINT( X( I ) ) 100 CONTINUE KASE = 2 JUMP = 4 RETURN * * ................ ENTRY (JUMP = 4) * X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X * 110 CONTINUE JLAST = J CALL PSAMAX( N, XMAX, J, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN WORK( 1 ) = XMAX WORK( 2 ) = REAL( J ) CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2, $ IVXROW, MYCOL ) XMAX = WORK( 1 ) J = NINT( WORK( 2 ) ) END IF END IF CALL PSELGET( 'Columnwise', ' ', JLMAX, X, JLAST, JX, DESCX ) IF( ( JLMAX.NE.ABS( XMAX ) ).AND.( ITER.LT.ITMAX ) ) THEN ITER = ITER + 1 GO TO 50 END IF * * ITERATION COMPLETE. FINAL STAGE. * 120 CONTINUE DO 130 I = IOFFVX, IOFFVX+NP-1 K = INDXL2G( I-IOFFVX+IIVX, DESCX( MB_ ), MYROW, $ DESCX( RSRC_ ), NPROW )-IX+1 IF( MOD( K, 2 ).EQ.0 ) THEN ALTSGN = -ONE ELSE ALTSGN = ONE END IF X( I ) = ALTSGN*( ONE+REAL( K-1 ) / REAL( N-1 ) ) 130 CONTINUE KASE = 1 JUMP = 5 RETURN * * ................ ENTRY (JUMP = 5) * X HAS BEEN OVERWRITTEN BY A*X * 140 CONTINUE CALL PSASUM( N, TEMP, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TEMP, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TEMP, 1, $ IVXROW, MYCOL ) END IF END IF TEMP = TWO*( TEMP / REAL( 3*N ) ) IF( TEMP.GT.EST ) THEN CALL SCOPY( NP, X( IOFFVX ), 1, V( IOFFVX ), 1 ) EST = TEMP END IF * 150 CONTINUE KASE = 0 * RETURN * * End of PSLACON * END scalapack-2.0.2/SRC/pslaconsb.f000644 000766 000024 00000050424 10363532303 016476 0ustar00juliestaff000000 000000 SUBROUTINE PSLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, BUF, $ LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER I, L, LWORK, M REAL H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), BUF( * ) * .. * * Purpose * ======= * * PSLACONSB looks for two consecutive small subdiagonal elements by * seeing the effect of starting a double shift QR iteration * given by H44, H33, & H43H34 and see if this would make a * subdiagonal negligible. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) REAL array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * M (global output) INTEGER * On exit, this yields the starting location of the QR double * shift. This will satisfy: L <= M <= I-2. * * H44 * H33 * H43H34 (global input) REAL * These three values are for the double shift QR iteration. * * BUF (local output) REAL array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 7*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Logic: * ====== * * Two consecutive small subdiagonal elements will stall * convergence of a double shift if their product is small * relatively even if each is not very small. Thus it is * necessary to scan the "tridiagonal portion of the matrix." In * the LAPACK algorithm DLAHQR, a loop of M goes from I-2 down to * L and examines * H(m,m),H(m+1,m+1),H(m+1,m),H(m,m+1),H(m-1,m-1),H(m,m-1), and * H(m+2,m-1). Since these elements may be on separate * processors, the first major loop (10) goes over the tridiagonal * and has each node store whatever values of the 7 it has that * the node owning H(m,m) does not. This will occur on a border * and can happen in no more than 3 locations per block assuming * square blocks. There are 5 buffers that each node stores these * values: a buffer to send diagonally down and right, a buffer * to send up, a buffer to send left, a buffer to send diagonally * up and left and a buffer to send right. Each of these buffers * is actually stored in one buffer BUF where BUF(ISTR1+1) starts * the first buffer, BUF(ISTR2+1) starts the second, etc.. After * the values are stored, if there are any values that a node * needs, they will be sent and received. Then the next major * loop passes over the data and searches for two consecutive * small subdiagonals. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IBUF1, IBUF2, IBUF3, IBUF4, $ IBUF5, ICOL1, II, IRCV1, IRCV2, IRCV3, IRCV4, $ IRCV5, IROW1, ISRC, ISTR1, ISTR2, ISTR3, ISTR4, $ ISTR5, JJ, JSRC, LDA, LEFT, MODKM1, MYCOL, $ MYROW, NPCOL, NPROW, NUM, RIGHT, UP REAL H00, H10, H11, H12, H21, H22, H33S, H44S, S, $ TST1, ULP, V1, V2, V3 * .. * .. External Functions .. INTEGER ILCM REAL PSLAMCH EXTERNAL ILCM, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGERV2D, SGESD2D, IGAMX2D, $ INFOG2L, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) ULP = PSLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 starts at BUF(ISTR1+1) and will contain IBUF1 elements * BUFFER2 starts at BUF(ISTR2+1) and will contain IBUF2 elements * BUFFER3 starts at BUF(ISTR3+1) and will contain IBUF3 elements * BUFFER4 starts at BUF(ISTR4+1) and will contain IBUF4 elements * BUFFER5 starts at BUF(ISTR5+1) and will contain IBUF5 elements * ISTR1 = 0 ISTR2 = ( ( I-L-1 ) / HBL ) IF( ISTR2*HBL.LT.( I-L-1 ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.7*ISTR2 ) THEN CALL PXERBLA( CONTXT, 'PSLACONSB', 10 ) RETURN END IF ISTR3 = 3*ISTR2 ISTR4 = ISTR3 + ISTR2 ISTR5 = ISTR3 + ISTR3 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) * * Copy our relevant pieces of triadiagonal that we owe into * 5 buffers to send to whomever owns H(M,M) as M moves diagonally * up the tridiagonal * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 IRCV1 = 0 IRCV2 = 0 IRCV3 = 0 IRCV4 = 0 IRCV5 = 0 DO 10 M = I - 2, L, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M-1,M-1) and send it diagonal down * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( M-1, M-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M ,M-1) and send it right * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF5 = IBUF5 + 1 BUF( ISTR5+IBUF5 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+1,M) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( MYROW.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M ,M+1) and send it left * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF3 = IBUF3 + 1 BUF( ISTR3+IBUF3 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M+1,M+1) & H(M+2,M+1) and send it * diagonally up * IF( ( UP.NE.MYROW ) .OR. ( LEFT.NE.MYCOL ) ) THEN CALL INFOG2L( M+1, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF4 = IBUF4 + 2 BUF( ISTR4+IBUF4-1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) BUF( ISTR4+IBUF4 ) = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+2,M+1) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * Add up the receives * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( M.GT.L ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M-1,M-1) from diagonal up * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) .AND. ( M.GT.L ) ) $ THEN * * We must receive H(M ,M-1) from left * IRCV5 = IRCV5 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+1,M ) from down * IRCV2 = IRCV2 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPCOL.GT.1 ) ) THEN * * We must receive H(M ,M+1) from right * IRCV3 = IRCV3 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M+1:M+2,M+1) from diagonal down * IRCV4 = IRCV4 + 2 END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+2,M+1) from down * IRCV2 = IRCV2 + 1 END IF END IF * * Possibly change owners (occurs only when MOD(M-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * * Send data on to the appropriate node if there is any data to send * IF( IBUF1.GT.0 ) THEN CALL SGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL SGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, UP, $ MYCOL ) END IF IF( IBUF3.GT.0 ) THEN CALL SGESD2D( CONTXT, IBUF3, 1, BUF( ISTR3+1 ), IBUF3, MYROW, $ LEFT ) END IF IF( IBUF4.GT.0 ) THEN CALL SGESD2D( CONTXT, IBUF4, 1, BUF( ISTR4+1 ), IBUF4, UP, $ LEFT ) END IF IF( IBUF5.GT.0 ) THEN CALL SGESD2D( CONTXT, IBUF5, 1, BUF( ISTR5+1 ), IBUF5, MYROW, $ RIGHT ) END IF * * Receive appropriate data if there is any * IF( IRCV1.GT.0 ) THEN CALL SGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL SGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, DOWN, $ MYCOL ) END IF IF( IRCV3.GT.0 ) THEN CALL SGERV2D( CONTXT, IRCV3, 1, BUF( ISTR3+1 ), IRCV3, MYROW, $ RIGHT ) END IF IF( IRCV4.GT.0 ) THEN CALL SGERV2D( CONTXT, IRCV4, 1, BUF( ISTR4+1 ), IRCV4, DOWN, $ RIGHT ) END IF IF( IRCV5.GT.0 ) THEN CALL SGERV2D( CONTXT, IRCV5, 1, BUF( ISTR5+1 ), IRCV5, MYROW, $ LEFT ) END IF * * Start main loop * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. $ ( MODKM1.NE.HBL-1 ) ) THEN CALL INFOG2L( I-2, I-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) END IF * * Look for two consecutive small subdiagonal elements. * DO 20 M = I - 2, L, -1 * * Determine the effect of starting the double-shift QR * iteration at row M, and see if this would make H(M,M-1) * negligible. * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H00 = BUF( ISTR1+IBUF1 ) ELSE H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF5 = IBUF5 + 1 H10 = BUF( ISTR5+IBUF5 ) ELSE H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF END IF IF( MODKM1.EQ.HBL-1 ) THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) H11 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( NUM.GT.1 ) THEN IBUF4 = IBUF4 + 2 H22 = BUF( ISTR4+IBUF4-1 ) V3 = BUF( ISTR4+IBUF4 ) ELSE H22 = A( ICOL1*LDA+IROW1+1 ) V3 = A( ( ICOL1+1 )*LDA+IROW1+1 ) END IF IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 H21 = BUF( ISTR2+IBUF2 ) ELSE H21 = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF3 = IBUF3 + 1 H12 = BUF( ISTR3+IBUF3 ) ELSE H12 = A( ICOL1*LDA+IROW1 ) END IF IF( M.GT.L ) THEN H00 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF * * Adjust ICOL1 for next iteration where MODKM1=HBL-2 * ICOL1 = ICOL1 + 1 END IF IF( MODKM1.EQ.HBL-2 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 V3 = BUF( ISTR2+IBUF2 ) ELSE V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) END IF H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.LT.HBL-2 ) .AND. ( MODKM1.GT.0 ) ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S IF( M.EQ.L ) $ GO TO 30 TST1 = ABS( V1 )*( ABS( H00 )+ABS( H11 )+ABS( H22 ) ) IF( ABS( H10 )*( ABS( V2 )+ABS( V3 ) ).LE.ULP*TST1 ) $ GO TO 30 * * Slide indices diagonally up one for next iteration * IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF IF( M.EQ.L ) THEN * * Stop regardless of which node we are * GO TO 30 END IF * * Possibly change owners if on border * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 20 CONTINUE 30 CONTINUE * CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, M, 1, L, L, -1, -1, -1 ) * RETURN * * End of PSLACONSB * END scalapack-2.0.2/SRC/pslacp2.f000644 000766 000024 00000037374 11750130340 016063 0ustar00juliestaff000000 000000 SUBROUTINE PSLACP2( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ) * .. * * Purpose * ======= * * PSLACP2 copies all or part of a distributed matrix A to another * distributed matrix B. No communication is performed, PSLACP2 * performs a local copy sub( A ) := sub( B ), where sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1) and sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * PSLACP2 requires that only dimension of the matrix operands is * distributed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) REAL pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ) set as follows: * * if UPLO = 'U', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=j, 1<=j<=N; * if UPLO = 'L', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * j<=i<=M, 1<=j<=N; * otherwise, B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=M, 1<=j<=N. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER HEIGHT, IACOL, IAROW, IBASE, IBCOL, IBROW, $ ICOFFA, IIA, IIAA, IIB, IIBB, IIBEGA, IIBEGB, $ IIENDA, IINXTA, IINXTB, ILEFT, IRIGHT, IROFFA, $ ITOP, JJA, JJAA, JJB, JJBB, JJBEGA, JJBEGB, $ JJENDA, JJNXTA, JJNXTB, LDA, LDB, MBA, MP, $ MPAA, MYCOL, MYDIST, MYROW, NBA, NPCOL, NPROW, $ NQ, NQAA, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SLAMOV * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, JJB, $ IBROW, IBCOL ) * MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, MBA ) ICOFFA = MOD( JA-1, NBA ) LDB = DESCB( LLD_ ) * IF( N.LE.( NBA-ICOFFA ) ) THEN * * It is assumed that the local columns JJA:JJA+N-1 of the matrix * A are in the same process column (IACOL). * * N * JJA JJA+N-1 * / --------------------- \ * IROFFA| | | | * \ |...................| | ( IAROW ) * IIA |x | | MBA = DESCA( MB_ ) * | x | | * |--x----------------| / * | x | * | x | ITOP * | x | | * | x | /-------\ * |-------x-----------| |-------x-----------| * | x | | x | * | x | | x | * | x | | x | * | x | | x | * |------------x------| |------------x------| * | x | \____________/ * | x | | * | x | IBASE * | x | * |-----------------x-| Local picture * | x| * | | * | | * | | * |-------------------| * | | * . . * . . * . (IACOL) . * IF( MYCOL.EQ.IACOL ) THEN * MP = NUMROC( M+IROFFA, MBA, MYROW, IAROW, NPROW ) IF( MP.LE.0 ) $ RETURN IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA MYDIST = MOD( MYROW-IAROW+NPROW, NPROW ) ITOP = MYDIST * MBA - IROFFA * IF( LSAME( UPLO, 'U' ) ) THEN * ITOP = MAX( 0, ITOP ) IIBEGA = IIA IIENDA = IIA + MP - 1 IINXTA = MIN( ICEIL( IIBEGA, MBA ) * MBA, IIENDA ) IIBEGB = IIB IINXTB = IIBEGB + IINXTA - IIBEGA * 10 CONTINUE IF( ( N-ITOP ).GT.0 ) THEN CALL SLAMOV( UPLO, IINXTA-IIBEGA+1, N-ITOP, $ A( IIBEGA+(JJA+ITOP-1)*LDA ), LDA, $ B( IIBEGB+(JJB+ITOP-1)*LDB ), LDB ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IIBEGA = IINXTA + 1 IINXTA = MIN( IINXTA+MBA, IIENDA ) IIBEGB = IINXTB + 1 IINXTB = IIBEGB + IINXTA - IIBEGA GO TO 10 END IF * ELSE IF( LSAME( UPLO, 'L' ) ) THEN * MPAA = MP IIAA = IIA JJAA = JJA IIBB = IIB JJBB = JJB IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 20 CONTINUE IF( JJAA.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL SLAMOV( 'All', MPAA, ITOP-JJAA+JJA, $ A( IIAA+(JJAA-1)*LDA ), LDA, $ B( IIBB+(JJBB-1)*LDB ), LDB ) CALL SLAMOV( UPLO, MPAA, HEIGHT, $ A( IIAA+(JJA+ITOP-1)*LDA ), LDA, $ B( IIBB+(JJB+ITOP-1)*LDB ), LDB ) MPAA = MAX( 0, MPAA - HEIGHT ) IIAA = IIAA + HEIGHT JJAA = JJA + IBASE IIBB = IIBB + HEIGHT JJBB = JJB + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 20 END IF * ELSE * CALL SLAMOV( 'All', MP, N, A( IIA+(JJA-1)*LDA ), $ LDA, B( IIB+(JJB-1)*LDB ), LDB ) * END IF * END IF * ELSE IF( M.LE.( MBA-IROFFA ) ) THEN * * It is assumed that the local rows IIA:IIA+M-1 of the matrix A * are in the same process row (IAROW). * * ICOFFA * / \JJA * IIA ------------------ .... -------- * | .x | | | / | | \ * | . x | | | ILEFT| | | | * | . x | | | | | | * | . x | | \ x | | * | . |x | | |x | | IRIGHT * | . | x | | | x | | * (IAROW) | . | x | | | x | | * | . | x| | | x| | * | . | x | | x / * | . | |x | | | * | . | | x | | | * | . | | x | | | * | . | | x| | | * IIA+M-1 ------------------ .... ------- * NB_A * (IACOL) Local picture * IF( MYROW.EQ.IAROW ) THEN * NQ = NUMROC( N+ICOFFA, NBA, MYCOL, IACOL, NPCOL ) IF( NQ.LE.0 ) $ RETURN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA MYDIST = MOD( MYCOL-IACOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBA - ICOFFA * IF( LSAME( UPLO, 'L' ) ) THEN * ILEFT = MAX( 0, ILEFT ) JJBEGA = JJA JJENDA = JJA + NQ - 1 JJNXTA = MIN( ICEIL( JJBEGA, NBA ) * NBA, JJENDA ) JJBEGB = JJB JJNXTB = JJBEGB + JJNXTA - JJBEGA * 30 CONTINUE IF( ( M-ILEFT ).GT.0 ) THEN CALL SLAMOV( UPLO, M-ILEFT, JJNXTA-JJBEGA+1, $ A( IIA+ILEFT+(JJBEGA-1)*LDA ), LDA, $ B( IIB+ILEFT+(JJBEGB-1)*LDB ), LDB ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA JJBEGA = JJNXTA +1 JJNXTA = MIN( JJNXTA+NBA, JJENDA ) JJBEGB = JJNXTB +1 JJNXTB = JJBEGB + JJNXTA - JJBEGA GO TO 30 END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * NQAA = NQ IIAA = IIA JJAA = JJA IIBB = IIB JJBB = JJB IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 40 CONTINUE IF( IIAA.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL SLAMOV( 'All', ILEFT-IIAA+IIA, NQAA, $ A( IIAA+(JJAA-1)*LDA ), LDA, $ B( IIBB+(JJBB-1)*LDB ), LDB ) CALL SLAMOV( UPLO, WIDE, NQAA, $ A( IIA+ILEFT+(JJAA-1)*LDA ), LDA, $ B( IIB+ILEFT+(JJBB-1)*LDB ), LDB ) NQAA = MAX( 0, NQAA - WIDE ) IIAA = IIA + IRIGHT JJAA = JJAA + WIDE IIBB = IIB + IRIGHT JJBB = JJBB + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 40 END IF * ELSE * CALL SLAMOV( 'All', M, NQ, A( IIA+(JJA-1)*LDA ), $ LDA, B( IIB+(JJB-1)*LDB ), LDB ) * END IF * END IF * END IF * RETURN * * End of PSLACP2 * END scalapack-2.0.2/SRC/pslacp3.f000644 000766 000024 00000030440 11654534541 016065 0ustar00juliestaff000000 000000 SUBROUTINE PSLACP3( M, I, A, DESCA, B, LDB, II, JJ, REV ) IMPLICIT NONE * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER I, II, JJ, LDB, M, REV * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), B( LDB, * ) * .. * * Purpose * ======= * * PSLACP3 is an auxiliary routine that copies from a global parallel * array into a local replicated array or vise versa. Notice that * the entire submatrix that is copied gets placed on one node or * more. The receiving node can be specified precisely, or all nodes * can receive, or just one row or column of nodes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * M is the order of the square submatrix that is copied. * M >= 0. * Unchanged on exit * * I (global input) INTEGER * A(I,I) is the global location that the copying starts from. * Unchanged on exit. * * A (global input/output) REAL array, dimension * (DESCA(LLD_),*) * On entry, the parallel matrix to be copied into or from. * On exit, if REV=1, the copied data. * Unchanged on exit if REV=0. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/output) REAL array of size (LDB,M) * If REV=0, this is the global portion of the array * A(I:I+M-1,I:I+M-1). * If REV=1, this is the unchanged on exit. * * LDB (local input) INTEGER * The leading dimension of B. * * II (global input) INTEGER * By using REV 0 & 1, data can be sent out and returned again. * If REV=0, then II is destination row index for the node(s) * receiving the replicated B. * If II>=0,JJ>=0, then node (II,JJ) receives the data * If II=-1,JJ>=0, then all rows in column JJ receive the * data * If II>=0,JJ=-1, then all cols in row II receive the data * If II=-1,JJ=-1, then all nodes receive the data * If REV<>0, then II is the source row index for the node(s) * sending the replicated B. * * JJ (global input) INTEGER * Similar description as II above * * REV (global input) INTEGER * Use REV = 0 to send global A into locally replicated B * (on node (II,JJ)). * Use REV <> 0 to send locally replicated B from node (II,JJ) * to its owner (which changes depending on its location in * A) into the global A. * * Implemented by: G. Henry, May 1, 1997 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0 ) * .. * .. Local Scalars .. INTEGER COL, CONTXT, HBL, IAFIRST, ICOL1, ICOL2, IDI, $ IDJ, IFIN, III, IROW1, IROW2, ISTOP, ISTOPI, $ ISTOPJ, ITMP, JAFIRST, JJJ, LDA, MYCOL, MYROW, $ NPCOL, NPROW, ROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGEBR2D, SGEBS2D, SGERV2D, $ SGESD2D, INFOG1L * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.LE.0 ) $ RETURN * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( REV.EQ.0 ) THEN DO 20 IDI = 1, M DO 10 IDJ = 1, M B( IDI, IDJ ) = ZERO 10 CONTINUE 20 CONTINUE END IF * IFIN = I + M - 1 * IF( MOD( I+HBL, HBL ).NE.0 ) THEN ISTOP = MIN( I+HBL-MOD( I+HBL, HBL ), IFIN ) ELSE ISTOP = I END IF IDJ = I ISTOPJ = ISTOP IF( IDJ.LE.IFIN ) THEN 30 CONTINUE IDI = I ISTOPI = ISTOP IF( IDI.LE.IFIN ) THEN 40 CONTINUE ROW = MOD( ( IDI-1 ) / HBL + IAFIRST, NPROW ) COL = MOD( ( IDJ-1 ) / HBL + JAFIRST, NPCOL ) CALL INFOG1L( IDI, HBL, NPROW, ROW, IAFIRST, IROW1, ITMP ) IROW2 = NUMROC( ISTOPI, HBL, ROW, IAFIRST, NPROW ) CALL INFOG1L( IDJ, HBL, NPCOL, COL, JAFIRST, ICOL1, ITMP ) ICOL2 = NUMROC( ISTOPJ, HBL, COL, JAFIRST, NPCOL ) IF( ( MYROW.EQ.ROW ) .AND. ( MYCOL.EQ.COL ) ) THEN IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to everyone * IF( REV.EQ.0 ) THEN CALL SGEBS2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.NE.-1 ) ) THEN * * Send the message to Column MYCOL which better be JJ * IF( REV.EQ.0 ) THEN CALL SGEBS2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to Row MYROW which better be II * IF( REV.EQ.0 ) THEN CALL SGEBS2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.NE.-1 ) .AND. $ ( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) ) THEN * * Recv/Send the message to (II,JJ) * IF( REV.EQ.0 ) THEN CALL SGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ A( ( ICOL1-1 )*LDA+IROW1 ), LDA, II, $ JJ ) ELSE CALL SGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, II, JJ ) END IF END IF IF( REV.EQ.0 ) THEN DO 60 JJJ = ICOL1, ICOL2 DO 50 III = IROW1, IROW2 B( IDI+III-IROW1+1-I, IDJ+JJJ-ICOL1+1-I ) $ = A( ( JJJ-1 )*LDA+III ) 50 CONTINUE 60 CONTINUE ELSE DO 80 JJJ = ICOL1, ICOL2 DO 70 III = IROW1, IROW2 A( ( JJJ-1 )*LDA+III ) = B( IDI+III-IROW1+1-I, $ IDJ+JJJ-ICOL1+1-I ) 70 CONTINUE 80 CONTINUE END IF ELSE IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL SGEBR2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL SGEBR2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL SGEBR2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL SGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) ELSE CALL SGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) * CALL SGESD2D(CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, * $ A((ICOL1-1)*LDA+IROW1),LDA, ROW, COL) END IF END IF END IF IDI = ISTOPI + 1 ISTOPI = MIN( ISTOPI+HBL, IFIN ) IF( IDI.LE.IFIN ) $ GO TO 40 END IF IDJ = ISTOPJ + 1 ISTOPJ = MIN( ISTOPJ+HBL, IFIN ) IF( IDJ.LE.IFIN ) $ GO TO 30 END IF RETURN * * End of PSLACP3 * END scalapack-2.0.2/SRC/pslacpy.f000644 000766 000024 00000022354 10363532303 016166 0ustar00juliestaff000000 000000 SUBROUTINE PSLACPY( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ) * .. * * Purpose * ======= * * PSLACPY copies all or part of a distributed matrix A to another * distributed matrix B. No communication is performed, PSLACPY * performs a local copy sub( A ) := sub( B ), where sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1) and sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) REAL pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ) set as follows: * * if UPLO = 'U', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=j, 1<=j<=N; * if UPLO = 'L', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * j<=i<=M, 1<=j<=N; * otherwise, B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=M, 1<=j<=N. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, IAA, IBB, IBLK, IN, ITMP, J, JAA, JBB, $ JBLK, JN, JTMP * .. * .. External Subroutines .. EXTERNAL PSLACP2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) * IF( M.LE.( DESCA( MB_ ) - MOD( IA-1, DESCA( MB_ ) ) ) .OR. $ N.LE.( DESCA( NB_ ) - MOD( JA-1, DESCA( NB_ ) ) ) ) THEN CALL PSLACP2( UPLO, M, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) ELSE * IF( LSAME( UPLO, 'U' ) ) THEN CALL PSLACP2( UPLO, IN-IA+1, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) IBB = IB + ITMP JBB = JB + ITMP JAA = JA + ITMP CALL PSLACP2( UPLO, IBLK, N-ITMP, A, I, JAA, DESCA, $ B, IBB, JBB, DESCB ) 10 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN CALL PSLACP2( UPLO, M, JN-JA+1, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) IBB = IB + JTMP JBB = JB + JTMP IAA = IA + JTMP CALL PSLACP2( UPLO, M-JTMP, JBLK, A, IAA, J, DESCA, $ B, IBB, JBB, DESCB ) 20 CONTINUE ELSE IF( M.LE.N ) THEN CALL PSLACP2( UPLO, IN-IA+1, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) IBB = IB+ITMP CALL PSLACP2( UPLO, IBLK, N, A, I, JA, DESCA, $ B, IBB, JB, DESCB ) 30 CONTINUE ELSE CALL PSLACP2( UPLO, M, JN-JA+1, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) JBB = JB+JTMP CALL PSLACP2( UPLO, M, JBLK, A, IA, J, DESCA, $ B, IB, JBB, DESCB ) 40 CONTINUE END IF END IF * END IF * RETURN * * End of PSLACPY * END scalapack-2.0.2/SRC/pslaed0.f000644 000766 000024 00000017426 10363532303 016047 0ustar00juliestaff000000 000000 SUBROUTINE PSLAED0( N, D, E, Q, IQ, JQ, DESCQ, WORK, IWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER INFO, IQ, JQ, N * .. * .. Array Arguments .. INTEGER DESCQ( * ), IWORK( * ) REAL D( * ), E( * ), Q( * ), WORK( * ) * .. * * Purpose * ======= * * PSLAED0 computes all eigenvalues and corresponding eigenvectors of a * symmetric tridiagonal matrix using the divide and conquer method. * * * Arguments * ========= * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * D (global input/output) REAL array, dimension (N) * On entry, the diagonal elements of the tridiagonal matrix. * On exit, if INFO = 0, the eigenvalues in descending order. * * E (global input/output) REAL array, dimension (N-1) * On entry, the subdiagonal elements of the tridiagonal matrix. * On exit, E has been destroyed. * * Q (local output) REAL array, * global dimension (N, N), * local dimension ( LLD_Q, LOCc(JQ+N-1)) * Q contains the orthonormal eigenvectors of the symmetric * tridiagonal matrix. * On output, Q is distributed across the P processes in block * cyclic format. * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * * WORK (local workspace ) REAL array, dimension (LWORK) * LWORK = 6*N + 2*NP*NQ, with * NP = NUMROC( N, MB_Q, MYROW, IQROW, NPROW ) * NQ = NUMROC( N, NB_Q, MYCOL, IQCOL, NPCOL ) * IQROW = INDXG2P( IQ, NB_Q, MYROW, RSRC_Q, NPROW ) * IQCOL = INDXG2P( JQ, MB_Q, MYCOL, CSRC_Q, NPCOL ) * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * LIWORK = 2 + 7*N + 8*NPCOL * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: The algorithm failed to compute the INFO/(N+1) th * eigenvalue while working on the submatrix lying in * global rows and columns mod(INFO,N+1). * * ===================================================================== * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ID, IDCOL, IDROW, IID, IINFO, IIQ, IM1, IM2, $ IPQ, IQCOL, IQROW, J, JJD, JJQ, LDQ, MATSIZ, $ MYCOL, MYROW, N1, NB, NBL, NBL1, NPCOL, NPROW, $ SUBPBS, TSUBPBS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSLAED1, PXERBLA, $ SGEBR2D, SGEBS2D, SGERV2D, SGESD2D, SSTEQR * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Test the input parameters. * CALL BLACS_GRIDINFO( DESCQ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 IF( DESCQ( NB_ ).GT.N .OR. N.LT.2 ) $ INFO = -1 IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCQ( CTXT_ ), 'PSLAED0', -INFO ) RETURN END IF * NB = DESCQ( NB_ ) LDQ = DESCQ( LLD_ ) CALL INFOG2L( IQ, JQ, DESCQ, NPROW, NPCOL, MYROW, MYCOL, IIQ, JJQ, $ IQROW, IQCOL ) * * Determine the size and placement of the submatrices, and save in * the leading elements of IWORK. * TSUBPBS = ( N-1 ) / NB + 1 IWORK( 1 ) = TSUBPBS SUBPBS = 1 10 CONTINUE IF( IWORK( SUBPBS ).GT.1 ) THEN DO 20 J = SUBPBS, 1, -1 IWORK( 2*J ) = ( IWORK( J )+1 ) / 2 IWORK( 2*J-1 ) = IWORK( J ) / 2 20 CONTINUE SUBPBS = 2*SUBPBS GO TO 10 END IF DO 30 J = 2, SUBPBS IWORK( J ) = IWORK( J ) + IWORK( J-1 ) 30 CONTINUE * * Divide the matrix into TSUBPBS submatrices of size at most NB * using rank-1 modifications (cuts). * DO 40 I = NB + 1, N, NB IM1 = I - 1 D( IM1 ) = D( IM1 ) - ABS( E( IM1 ) ) D( I ) = D( I ) - ABS( E( IM1 ) ) 40 CONTINUE * * Solve each submatrix eigenproblem at the bottom of the divide and * conquer tree. D is the same on each process. * DO 50 ID = 1, N, NB CALL INFOG2L( IQ-1+ID, JQ-1+ID, DESCQ, NPROW, NPCOL, MYROW, $ MYCOL, IID, JJD, IDROW, IDCOL ) MATSIZ = MIN( NB, N-ID+1 ) IF( MYROW.EQ.IDROW .AND. MYCOL.EQ.IDCOL ) THEN IPQ = IID + ( JJD-1 )*LDQ CALL SSTEQR( 'I', MATSIZ, D( ID ), E( ID ), Q( IPQ ), LDQ, $ WORK, INFO ) IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCQ( CTXT_ ), 'SSTEQR', -INFO ) RETURN END IF IF( MYROW.NE.IQROW .OR. MYCOL.NE.IQCOL ) THEN CALL SGESD2D( DESCQ( CTXT_ ), MATSIZ, 1, D( ID ), MATSIZ, $ IQROW, IQCOL ) END IF ELSE IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN CALL SGERV2D( DESCQ( CTXT_ ), MATSIZ, 1, D( ID ), MATSIZ, $ IDROW, IDCOL ) END IF 50 CONTINUE * IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN CALL SGEBS2D( DESCQ( CTXT_ ), 'A', ' ', N, 1, D, N ) ELSE CALL SGEBR2D( DESCQ( CTXT_ ), 'A', ' ', N, 1, D, N, IQROW, $ IQCOL ) END IF * * Successively merge eigensystems of adjacent submatrices * into eigensystem for the corresponding larger matrix. * * while ( SUBPBS > 1 ) * 60 CONTINUE IF( SUBPBS.GT.1 ) THEN IM2 = SUBPBS - 2 DO 80 I = 0, IM2, 2 IF( I.EQ.0 ) THEN NBL = IWORK( 2 ) NBL1 = IWORK( 1 ) IF( NBL1.EQ.0 ) $ GO TO 70 ID = 1 MATSIZ = MIN( N, NBL*NB ) N1 = NBL1*NB ELSE NBL = IWORK( I+2 ) - IWORK( I ) NBL1 = NBL / 2 IF( NBL1.EQ.0 ) $ GO TO 70 ID = IWORK( I )*NB + 1 MATSIZ = MIN( NB*NBL, N-ID+1 ) N1 = NBL1*NB END IF * * Merge lower order eigensystems (of size N1 and MATSIZ - N1) * into an eigensystem of size MATSIZ. * CALL PSLAED1( MATSIZ, N1, D( ID ), ID, Q, IQ, JQ, DESCQ, $ E( ID+N1-1 ), WORK, IWORK( SUBPBS+1 ), IINFO ) IF( IINFO.NE.0 ) THEN INFO = IINFO*( N+1 ) + ID END IF 70 CONTINUE IWORK( I / 2+1 ) = IWORK( I+2 ) 80 CONTINUE SUBPBS = SUBPBS / 2 GO TO 60 END IF * * end while * 90 CONTINUE RETURN * * End of PSLAED0 * END scalapack-2.0.2/SRC/pslaed1.f000644 000766 000024 00000022452 10363532303 016043 0ustar00juliestaff000000 000000 SUBROUTINE PSLAED1( N, N1, D, ID, Q, IQ, JQ, DESCQ, RHO, WORK, $ IWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER ID, INFO, IQ, JQ, N, N1 REAL RHO * .. * .. Array Arguments .. INTEGER DESCQ( * ), IWORK( * ) REAL D( * ), Q( * ), WORK( * ) * .. * * Purpose * ======= * * PSLAED1 computes the updated eigensystem of a diagonal * matrix after modification by a rank-one symmetric matrix, * in parallel. * * T = Q(in) ( D(in) + RHO * Z*Z' ) Q'(in) = Q(out) * D(out) * Q'(out) * * where Z = Q'u, u is a vector of length N with ones in the * N1 and N1 + 1 th elements and zeros elsewhere. * * The eigenvectors of the original matrix are stored in Q, and the * eigenvalues are in D. The algorithm consists of three stages: * * The first stage consists of deflating the size of the problem * when there are multiple eigenvalues or if there is a zero in * the Z vector. For each such occurence the dimension of the * secular equation problem is reduced by one. This stage is * performed by the routine PSLAED2. * * The second stage consists of calculating the updated * eigenvalues. This is done by finding the roots of the secular * equation via the routine SLAED4 (as called by PSLAED3). * This routine also calculates the eigenvectors of the current * problem. * * The final stage consists of computing the updated eigenvectors * directly using the updated eigenvalues. The eigenvectors for * the current problem are multiplied with the eigenvectors from * the overall problem. * * Arguments * ========= * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * * N1 (input) INTEGER * The location of the last eigenvalue in the leading * sub-matrix. * min(1,N) <= N1 <= N. * * D (global input/output) REAL array, dimension (N) * On entry,the eigenvalues of the rank-1-perturbed matrix. * On exit, the eigenvalues of the repaired matrix. * * ID (global input) INTEGER * Q's global row/col index, which points to the beginning * of the submatrix which is to be operated on. * * Q (local output) REAL array, * global dimension (N, N), * local dimension ( LLD_Q, LOCc(JQ+N-1)) * Q contains the orthonormal eigenvectors of the symmetric * tridiagonal matrix. * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * RHO (input) REAL * The subdiagonal entry used to create the rank-1 modification. * * WORK (local workspace/output) REAL array, * dimension 6*N + 2*NP*NQ * * IWORK (local workspace/output) INTEGER array, * dimension 7*N + 8*NPCOL + 2 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: The algorithm failed to compute the ith eigenvalue. * * ===================================================================== * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER COL, COLTYP, IBUF, ICTOT, ICTXT, IDLMDA, IIQ, $ INDCOL, INDROW, INDX, INDXC, INDXP, INDXR, INQ, $ IPQ, IPQ2, IPSM, IPU, IPWORK, IQ1, IQ2, IQCOL, $ IQQ, IQROW, IW, IZ, J, JC, JJ2C, JJC, JJQ, JNQ, $ K, LDQ, LDQ2, LDU, MYCOL, MYROW, NB, NN, NN1, $ NN2, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCQ2( DLEN_ ), DESCU( DLEN_ ) * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, INFOG1L, INFOG2L, $ PSGEMM, PSLAED2, PSLAED3, PSLAEDZ, PSLASET, $ PXERBLA, SCOPY * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * * Test the input parameters. * CALL BLACS_GRIDINFO( DESCQ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE IF( N.LT.0 ) THEN INFO = -1 ELSE IF( ID.GT.DESCQ( N_ ) ) THEN INFO = -4 ELSE IF( N1.GE.N ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCQ( CTXT_ ), 'PSLAED1', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * The following values are integer pointers which indicate * the portion of the workspace used by a particular array * in PSLAED2 and PSLAED3. * ICTXT = DESCQ( CTXT_ ) NB = DESCQ( NB_ ) LDQ = DESCQ( LLD_ ) * CALL INFOG2L( IQ-1+ID, JQ-1+ID, DESCQ, NPROW, NPCOL, MYROW, MYCOL, $ IIQ, JJQ, IQROW, IQCOL ) * NP = NUMROC( N, DESCQ( MB_ ), MYROW, IQROW, NPROW ) NQ = NUMROC( N, DESCQ( NB_ ), MYCOL, IQCOL, NPCOL ) * LDQ2 = MAX( NP, 1 ) LDU = LDQ2 * IZ = 1 IDLMDA = IZ + N IW = IDLMDA + N IPQ2 = IW + N IPU = IPQ2 + LDQ2*NQ IBUF = IPU + LDU*NQ * (IBUF est de taille 3*N au maximum) * ICTOT = 1 IPSM = ICTOT + NPCOL*4 INDX = IPSM + NPCOL*4 INDXC = INDX + N INDXP = INDXC + N INDCOL = INDXP + N COLTYP = INDCOL + N INDROW = COLTYP + N INDXR = INDROW + N * CALL DESCINIT( DESCQ2, N, N, NB, NB, IQROW, IQCOL, ICTXT, LDQ2, $ INFO ) CALL DESCINIT( DESCU, N, N, NB, NB, IQROW, IQCOL, ICTXT, LDU, $ INFO ) * * Form the z-vector which consists of the last row of Q_1 and the * first row of Q_2. * IPWORK = IDLMDA CALL PSLAEDZ( N, N1, ID, Q, IQ, JQ, LDQ, DESCQ, WORK( IZ ), $ WORK( IPWORK ) ) * * Deflate eigenvalues. * IPQ = IIQ + ( JJQ-1 )*LDQ CALL PSLAED2( ICTXT, K, N, N1, NB, D, IQROW, IQCOL, Q( IPQ ), LDQ, $ RHO, WORK( IZ ), WORK( IW ), WORK( IDLMDA ), $ WORK( IPQ2 ), LDQ2, WORK( IBUF ), IWORK( ICTOT ), $ IWORK( IPSM ), NPCOL, IWORK( INDX ), IWORK( INDXC ), $ IWORK( INDXP ), IWORK( INDCOL ), IWORK( COLTYP ), $ NN, NN1, NN2, IQ1, IQ2 ) * * Solve Secular Equation. * IF( K.NE.0 ) THEN CALL PSLASET( 'A', N, N, ZERO, ONE, WORK( IPU ), 1, 1, DESCU ) CALL PSLAED3( ICTXT, K, N, NB, D, IQROW, IQCOL, RHO, $ WORK( IDLMDA ), WORK( IW ), WORK( IZ ), $ WORK( IPU ), LDQ2, WORK( IBUF ), IWORK( INDX ), $ IWORK( INDCOL ), IWORK( INDROW ), IWORK( INDXR ), $ IWORK( INDXC ), IWORK( ICTOT ), NPCOL, INFO ) * * Compute the updated eigenvectors. * IQQ = MIN( IQ1, IQ2 ) IF( NN1.GT.0 ) THEN INQ = IQ - 1 + ID JNQ = JQ - 1 + ID + IQQ - 1 CALL PSGEMM( 'N', 'N', N1, NN, NN1, ONE, WORK( IPQ2 ), 1, $ IQ1, DESCQ2, WORK( IPU ), IQ1, IQQ, DESCU, $ ZERO, Q, INQ, JNQ, DESCQ ) END IF IF( NN2.GT.0 ) THEN INQ = IQ - 1 + ID + N1 JNQ = JQ - 1 + ID + IQQ - 1 CALL PSGEMM( 'N', 'N', N-N1, NN, NN2, ONE, WORK( IPQ2 ), $ N1+1, IQ2, DESCQ2, WORK( IPU ), IQ2, IQQ, $ DESCU, ZERO, Q, INQ, JNQ, DESCQ ) END IF * DO 10 J = K + 1, N JC = IWORK( INDX+J-1 ) CALL INFOG1L( JQ-1+JC, NB, NPCOL, MYCOL, IQCOL, JJC, COL ) CALL INFOG1L( JC, NB, NPCOL, MYCOL, IQCOL, JJ2C, COL ) IF( MYCOL.EQ.COL ) THEN IQ2 = IPQ2 + ( JJ2C-1 )*LDQ2 INQ = IPQ + ( JJC-1 )*LDQ CALL SCOPY( NP, WORK( IQ2 ), 1, Q( INQ ), 1 ) END IF 10 CONTINUE END IF * 20 CONTINUE RETURN * * End of PSLAED1 * END scalapack-2.0.2/SRC/pslaed2.f000644 000766 000024 00000034771 10363532303 016053 0ustar00juliestaff000000 000000 SUBROUTINE PSLAED2( ICTXT, K, N, N1, NB, D, DROW, DCOL, Q, LDQ, $ RHO, Z, W, DLAMDA, Q2, LDQ2, QBUF, CTOT, PSM, $ NPCOL, INDX, INDXC, INDXP, INDCOL, COLTYP, NN, $ NN1, NN2, IB1, IB2 ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER DCOL, DROW, IB1, IB2, ICTXT, K, LDQ, LDQ2, N, $ N1, NB, NN, NN1, NN2, NPCOL REAL RHO * .. * .. Array Arguments .. INTEGER COLTYP( * ), CTOT( 0: NPCOL-1, 4 ), $ INDCOL( N ), INDX( * ), INDXC( * ), INDXP( * ), $ PSM( 0: NPCOL-1, 4 ) REAL D( * ), DLAMDA( * ), Q( LDQ, * ), $ Q2( LDQ2, * ), QBUF( * ), W( * ), Z( * ) * .. * * Purpose * ======= * * PSLAED2 sorts the two sets of eigenvalues together into a single * sorted set. Then it tries to deflate the size of the problem. * There are two ways in which deflation can occur: when two or more * eigenvalues are close together or if there is a tiny entry in the * Z vector. For each such occurrence the order of the related secular * equation problem is reduced by one. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation on the matrix. The context itself is global. * * K (output) INTEGER * The number of non-deflated eigenvalues, and the order of the * related secular equation. 0 <= K <=N. * * N (input) INTEGER * The dimension of the symmetric tridiagonal matrix. N >= 0. * * N1 (input) INTEGER * The location of the last eigenvalue in the leading sub-matrix. * min(1,N) < N1 < N. * * NB (global input) INTEGER * The blocking factor used to distribute the columns of the * matrix. NB >= 1. * * D (input/output) REAL array, dimension (N) * On entry, D contains the eigenvalues of the two submatrices to * be combined. * On exit, D contains the trailing (N-K) updated eigenvalues * (those which were deflated) sorted into increasing order. * * DROW (global input) INTEGER * The process row over which the first row of the matrix D is * distributed. 0 <= DROW < NPROW. * * DCOL (global input) INTEGER * The process column over which the first column of the * matrix D is distributed. 0 <= DCOL < NPCOL. * * Q (input/output) REAL array, dimension (LDQ, N) * On entry, Q contains the eigenvectors of two submatrices in * the two square blocks with corners at (1,1), (N1,N1) * and (N1+1, N1+1), (N,N). * On exit, Q contains the trailing (N-K) updated eigenvectors * (those which were deflated) in its last N-K columns. * * LDQ (input) INTEGER * The leading dimension of the array Q. LDQ >= max(1,NQ). * * RHO (global input/output) REAL * On entry, the off-diagonal element associated with the rank-1 * cut which originally split the two submatrices which are now * being recombined. * On exit, RHO has been modified to the value required by * PSLAED3. * * Z (global input) REAL array, dimension (N) * On entry, Z contains the updating vector (the last * row of the first sub-eigenvector matrix and the first row of * the second sub-eigenvector matrix). * On exit, the contents of Z have been destroyed by the updating * process. * * DLAMDA (global output) REAL array, dimension (N) * A copy of the first K eigenvalues which will be used by * SLAED3 to form the secular equation. * * W (global output) REAL array, dimension (N) * The first k values of the final deflation-altered z-vector * which will be passed to SLAED3. * * Q2 (output) REAL array, dimension (LDQ2, NQ) * A copy of the first K eigenvectors which will be used by * * LDQ2 (input) INTEGER * The leading dimension of the array Q2. * * QBUF (workspace) REAL array, dimension 3*N * * CTOT (workspace) INTEGER array, dimension( NPCOL, 4) * * PSM (workspace) INTEGER array, dimension( NPCOL, 4) * * NPCOL (global input) INTEGER * The total number of columns over which the distributed * submatrix is distributed. * * INDX (workspace) INTEGER array, dimension (N) * The permutation used to sort the contents of DLAMDA into * ascending order. * * INDXC (output) INTEGER array, dimension (N) * The permutation used to arrange the columns of the deflated * Q matrix into three groups: the first group contains non-zero * elements only at and above N1, the second contains * non-zero elements only below N1, and the third is dense. * * INDXP (workspace) INTEGER array, dimension (N) * The permutation used to place deflated values of D at the end * of the array. INDXP(1:K) points to the nondeflated D-values * and INDXP(K+1:N) points to the deflated eigenvalues. * * INDCOL (workspace) INTEGER array, dimension (N) * * COLTYP (workspace/output) INTEGER array, dimension (N) * During execution, a label which will indicate which of the * following types a column in the Q2 matrix is: * 1 : non-zero in the upper half only; * 2 : dense; * 3 : non-zero in the lower half only; * 4 : deflated. * * NN (global output) INTEGER, the order of matrix U, (PSLAED1). * NN1 (global output) INTEGER, the order of matrix Q1, (PSLAED1). * NN2 (global output) INTEGER, the order of matrix Q2, (PSLAED1). * IB1 (global output) INTEGER, pointeur on Q1, (PSLAED1). * IB2 (global output) INTEGER, pointeur on Q2, (PSLAED1). * * ===================================================================== * * .. Parameters .. REAL MONE, ZERO, ONE, TWO, EIGHT PARAMETER ( MONE = -1.0E0, ZERO = 0.0E0, ONE = 1.0E0, $ TWO = 2.0E0, EIGHT = 8.0E0 ) * .. * .. Local Scalars .. INTEGER COL, CT, I, IAM, IE1, IE2, IMAX, INFO, J, JJQ2, $ JJS, JMAX, JS, K2, MYCOL, MYROW, N1P1, N2, NJ, $ NJCOL, NJJ, NP, NPROCS, NPROW, PJ, PJCOL, PJJ REAL C, EPS, S, T, TAU, TOL * .. * .. External Functions .. INTEGER INDXG2L, INDXL2G, ISAMAX, NUMROC REAL PSLAMCH, SLAPY2 EXTERNAL INDXG2L, INDXL2G, ISAMAX, NUMROC, PSLAMCH, $ SLAPY2 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, BLACS_PINFO, INFOG1L, SCOPY, $ SGERV2D, SGESD2D, SLAPST, SROT, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. External Functions .. * .. * .. Local Arrays .. INTEGER PTT( 4 ) * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL BLACS_PINFO( IAM, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NUMROC( N, NB, MYROW, DROW, NPROW ) * N2 = N - N1 N1P1 = N1 + 1 * IF( RHO.LT.ZERO ) THEN CALL SSCAL( N2, MONE, Z( N1P1 ), 1 ) END IF * * Normalize z so that norm(z) = 1. Since z is the concatenation of * two normalized vectors, norm2(z) = sqrt(2). * T = ONE / SQRT( TWO ) CALL SSCAL( N, T, Z, 1 ) * * RHO = ABS( norm(z)**2 * RHO ) * RHO = ABS( TWO*RHO ) * * Calculate the allowable deflation tolerance * IMAX = ISAMAX( N, Z, 1 ) JMAX = ISAMAX( N, D, 1 ) EPS = PSLAMCH( ICTXT, 'Epsilon' ) TOL = EIGHT*EPS*MAX( ABS( D( JMAX ) ), ABS( Z( IMAX ) ) ) * * If the rank-1 modifier is small enough, no more needs to be done * except to reorganize Q so that its columns correspond with the * elements in D. * IF( RHO*ABS( Z( IMAX ) ).LE.TOL ) THEN K = 0 GO TO 220 END IF * * If there are multiple eigenvalues then the problem deflates. Here * the number of equal eigenvalues are found. As each equal * eigenvalue is found, an elementary reflector is computed to rotate * the corresponding eigensubspace so that the corresponding * components of Z are zero in this new basis. * * CALL SLAPST( 'I', N, D, INDX, INFO ) * DO 10 I = 1, N1 COLTYP( I ) = 1 10 CONTINUE DO 20 I = N1P1, N COLTYP( I ) = 3 20 CONTINUE COL = DCOL DO 40 I = 1, N, NB DO 30 J = 0, NB - 1 IF( I+J.LE.N ) $ INDCOL( I+J ) = COL 30 CONTINUE COL = MOD( COL+1, NPCOL ) 40 CONTINUE * K = 0 K2 = N + 1 DO 50 J = 1, N NJ = INDX( J ) IF( RHO*ABS( Z( NJ ) ).LE.TOL ) THEN * * Deflate due to small z component. * K2 = K2 - 1 COLTYP( NJ ) = 4 INDXP( K2 ) = NJ IF( J.EQ.N ) $ GO TO 80 ELSE PJ = NJ GO TO 60 END IF 50 CONTINUE 60 CONTINUE J = J + 1 NJ = INDX( J ) IF( J.GT.N ) $ GO TO 80 IF( RHO*ABS( Z( NJ ) ).LE.TOL ) THEN * * Deflate due to small z component. * K2 = K2 - 1 COLTYP( NJ ) = 4 INDXP( K2 ) = NJ ELSE * * Check if eigenvalues are close enough to allow deflation. * S = Z( PJ ) C = Z( NJ ) * * Find sqrt(a**2+b**2) without overflow or * destructive underflow. * TAU = SLAPY2( C, S ) T = D( NJ ) - D( PJ ) C = C / TAU S = -S / TAU IF( ABS( T*C*S ).LE.TOL ) THEN * * Deflation is possible. * Z( NJ ) = TAU Z( PJ ) = ZERO IF( COLTYP( NJ ).NE.COLTYP( PJ ) ) $ COLTYP( NJ ) = 2 COLTYP( PJ ) = 4 CALL INFOG1L( NJ, NB, NPCOL, MYCOL, DCOL, NJJ, NJCOL ) CALL INFOG1L( PJ, NB, NPCOL, MYCOL, DCOL, PJJ, PJCOL ) IF( INDCOL( PJ ).EQ.INDCOL( NJ ) .AND. MYCOL.EQ.NJCOL ) THEN CALL SROT( NP, Q( 1, PJJ ), 1, Q( 1, NJJ ), 1, C, S ) ELSE IF( MYCOL.EQ.PJCOL ) THEN CALL SGESD2D( ICTXT, NP, 1, Q( 1, PJJ ), NP, MYROW, $ NJCOL ) CALL SGERV2D( ICTXT, NP, 1, QBUF, NP, MYROW, NJCOL ) CALL SROT( NP, Q( 1, PJJ ), 1, QBUF, 1, C, S ) ELSE IF( MYCOL.EQ.NJCOL ) THEN CALL SGESD2D( ICTXT, NP, 1, Q( 1, NJJ ), NP, MYROW, $ PJCOL ) CALL SGERV2D( ICTXT, NP, 1, QBUF, NP, MYROW, PJCOL ) CALL SROT( NP, QBUF, 1, Q( 1, NJJ ), 1, C, S ) END IF T = D( PJ )*C**2 + D( NJ )*S**2 D( NJ ) = D( PJ )*S**2 + D( NJ )*C**2 D( PJ ) = T K2 = K2 - 1 I = 1 70 CONTINUE IF( K2+I.LE.N ) THEN IF( D( PJ ).LT.D( INDXP( K2+I ) ) ) THEN INDXP( K2+I-1 ) = INDXP( K2+I ) INDXP( K2+I ) = PJ I = I + 1 GO TO 70 ELSE INDXP( K2+I-1 ) = PJ END IF ELSE INDXP( K2+I-1 ) = PJ END IF PJ = NJ ELSE K = K + 1 DLAMDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ PJ = NJ END IF END IF GO TO 60 80 CONTINUE * * Record the last eigenvalue. * K = K + 1 DLAMDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ * * Count up the total number of the various types of columns, then * form a permutation which positions the four column types into * four uniform groups (although one or more of these groups may be * empty). * DO 100 J = 1, 4 DO 90 I = 0, NPCOL - 1 CTOT( I, J ) = 0 90 CONTINUE PTT( J ) = 0 100 CONTINUE DO 110 J = 1, N CT = COLTYP( J ) COL = INDCOL( J ) CTOT( COL, CT ) = CTOT( COL, CT ) + 1 110 CONTINUE * * PSM(*) = Position in SubMatrix (of types 1 through 4) * DO 120 COL = 0, NPCOL - 1 PSM( COL, 1 ) = 1 PSM( COL, 2 ) = 1 + CTOT( COL, 1 ) PSM( COL, 3 ) = PSM( COL, 2 ) + CTOT( COL, 2 ) PSM( COL, 4 ) = PSM( COL, 3 ) + CTOT( COL, 3 ) 120 CONTINUE PTT( 1 ) = 1 DO 140 I = 2, 4 CT = 0 DO 130 J = 0, NPCOL - 1 CT = CT + CTOT( J, I-1 ) 130 CONTINUE PTT( I ) = PTT( I-1 ) + CT 140 CONTINUE * * Fill out the INDXC array so that the permutation which it induces * will place all type-1 columns first, all type-2 columns next, * then all type-3's, and finally all type-4's. * DO 150 J = 1, N JS = INDXP( J ) COL = INDCOL( JS ) CT = COLTYP( JS ) I = INDXL2G( PSM( COL, CT ), NB, COL, DCOL, NPCOL ) INDX( J ) = I INDXC( PTT( CT ) ) = I PSM( COL, CT ) = PSM( COL, CT ) + 1 PTT( CT ) = PTT( CT ) + 1 150 CONTINUE DO 160 J = 1, N JS = INDXP( J ) JJS = INDXG2L( JS, NB, J, J, NPCOL ) COL = INDCOL( JS ) IF( COL.EQ.MYCOL ) THEN I = INDX( J ) JJQ2 = INDXG2L( I, NB, J, J, NPCOL ) CALL SCOPY( NP, Q( 1, JJS ), 1, Q2( 1, JJQ2 ), 1 ) END IF 160 CONTINUE * * * The deflated eigenvalues and their corresponding vectors go back * into the last N - K slots of D and Q respectively. * CALL SCOPY( N, D, 1, Z, 1 ) DO 170 J = K + 1, N JS = INDXP( J ) I = INDX( J ) D( I ) = Z( JS ) 170 CONTINUE * PTT( 1 ) = 1 DO 190 I = 2, 4 CT = 0 DO 180 J = 0, NPCOL - 1 CT = CT + CTOT( J, I-1 ) 180 CONTINUE PTT( I ) = PTT( I-1 ) + CT 190 CONTINUE * * IB1 = INDXC( 1 ) IE1 = IB1 IB2 = INDXC( PTT( 2 ) ) IE2 = IB2 DO 200 I = 2, PTT( 3 ) - 1 IB1 = MIN( IB1, INDXC( I ) ) IE1 = MAX( IE1, INDXC( I ) ) 200 CONTINUE DO 210 I = PTT( 2 ), PTT( 4 ) - 1 IB2 = MIN( IB2, INDXC( I ) ) IE2 = MAX( IE2, INDXC( I ) ) 210 CONTINUE NN1 = IE1 - IB1 + 1 NN2 = IE2 - IB2 + 1 NN = MAX( IE1, IE2 ) - MIN( IB1, IB2 ) + 1 220 CONTINUE RETURN * * End of PSLAED2 * END scalapack-2.0.2/SRC/pslaed3.f000644 000766 000024 00000026366 10363532303 016055 0ustar00juliestaff000000 000000 SUBROUTINE PSLAED3( ICTXT, K, N, NB, D, DROW, DCOL, RHO, DLAMDA, $ W, Z, U, LDU, BUF, INDX, INDCOL, INDROW, $ INDXR, INDXC, CTOT, NPCOL, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER DCOL, DROW, ICTXT, INFO, K, LDU, N, NB, NPCOL REAL RHO * .. * .. Array Arguments .. INTEGER CTOT( 0: NPCOL-1, 4 ), INDCOL( * ), $ INDROW( * ), INDX( * ), INDXC( * ), INDXR( * ) REAL BUF( * ), D( * ), DLAMDA( * ), U( LDU, * ), $ W( * ), Z( * ) * .. * * Purpose * ======= * * PSLAED3 finds the roots of the secular equation, as defined by the * values in D, W, and RHO, between 1 and K. It makes the * appropriate calls to SLAED4 * * This code makes very mild assumptions about floating point * arithmetic. It will work on machines with a guard digit in * add/subtract, or on those binary machines without guard digits * which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. * It could conceivably fail on hexadecimal or decimal machines * without guard digits, but we know of none. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle, indicating the global context of * the operation on the matrix. The context itself is global. * * K (output) INTEGER * The number of non-deflated eigenvalues, and the order of the * related secular equation. 0 <= K <=N. * * N (input) INTEGER * The dimension of the symmetric tridiagonal matrix. N >= 0. * * NB (global input) INTEGER * The blocking factor used to distribute the columns of the * matrix. NB >= 1. * * D (input/output) REAL array, dimension (N) * On entry, D contains the eigenvalues of the two submatrices to * be combined. * On exit, D contains the trailing (N-K) updated eigenvalues * (those which were deflated) sorted into increasing order. * * DROW (global input) INTEGER * The process row over which the first row of the matrix D is * distributed. 0 <= DROW < NPROW. * * DCOL (global input) INTEGER * The process column over which the first column of the * matrix D is distributed. 0 <= DCOL < NPCOL. * * Q (input/output) REAL array, dimension (LDQ, N) * On entry, Q contains the eigenvectors of two submatrices in * the two square blocks with corners at (1,1), (N1,N1) * and (N1+1, N1+1), (N,N). * On exit, Q contains the trailing (N-K) updated eigenvectors * (those which were deflated) in its last N-K columns. * * LDQ (input) INTEGER * The leading dimension of the array Q. LDQ >= max(1,NQ). * * RHO (global input/output) REAL * On entry, the off-diagonal element associated with the rank-1 * cut which originally split the two submatrices which are now * being recombined. * On exit, RHO has been modified to the value required by * PSLAED3. * * DLAMDA (global output) REAL array, dimension (N) * A copy of the first K eigenvalues which will be used by * SLAED3 to form the secular equation. * * W (global output) REAL array, dimension (N) * The first k values of the final deflation-altered z-vector * which will be passed to SLAED3. * * Z (global input) REAL array, dimension (N) * On entry, Z contains the updating vector (the last * row of the first sub-eigenvector matrix and the first row of * the second sub-eigenvector matrix). * On exit, the contents of Z have been destroyed by the updating * process. * * U (global output) REAL array * global dimension (N, N), local dimension (LDU, NQ). * Q contains the orthonormal eigenvectors of the symmetric * tridiagonal matrix. * * LDU (input) INTEGER * The leading dimension of the array U. * * QBUF (workspace) REAL array, dimension 3*N * * * INDX (workspace) INTEGER array, dimension (N) * The permutation used to sort the contents of DLAMDA into * ascending order. * * INDCOL (workspace) INTEGER array, dimension (N) * * * INDROW (workspace) INTEGER array, dimension (N) * * * INDXR (workspace) INTEGER array, dimension (N) * * * INDXC (workspace) INTEGER array, dimension (N) * * CTOT (workspace) INTEGER array, dimension( NPCOL, 4) * * NPCOL (global input) INTEGER * The total number of columns over which the distributed * submatrix is distributed. * * INFO (output) INTEGER * = 0: successful exit. * < 0: if INFO = -i, the i-th argument had an illegal value. * > 0: The algorithm failed to compute the ith eigenvalue. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E0 ) * .. * .. Local Scalars .. INTEGER COL, GI, I, IINFO, IIU, IPD, IU, J, JJU, JU, $ KK, KL, KLC, KLR, MYCOL, MYKL, MYKLR, MYROW, $ NPROW, PDC, PDR, ROW REAL AUX, TEMP * .. * .. External Functions .. INTEGER INDXG2L REAL SLAMC3, SNRM2 EXTERNAL INDXG2L, SLAMC3, SNRM2 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SCOPY, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D, SLAED4 * .. * .. Intrinsic Functions .. INTRINSIC MOD, SIGN, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * IINFO = 0 * * Quick return if possible * IF( K.EQ.0 ) $ RETURN * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * ROW = DROW COL = DCOL DO 20 I = 1, N, NB DO 10 J = 0, NB - 1 INDROW( I+J ) = ROW INDCOL( I+J ) = COL 10 CONTINUE ROW = MOD( ROW+1, NPROW ) COL = MOD( COL+1, NPCOL ) 20 CONTINUE * MYKL = CTOT( MYCOL, 1 ) + CTOT( MYCOL, 2 ) + CTOT( MYCOL, 3 ) KLR = MYKL / NPROW IF( MYROW.EQ.DROW ) THEN MYKLR = KLR + MOD( MYKL, NPROW ) ELSE MYKLR = KLR END IF PDC = 1 COL = DCOL 30 CONTINUE IF( MYCOL.NE.COL ) THEN PDC = PDC + CTOT( COL, 1 ) + CTOT( COL, 2 ) + CTOT( COL, 3 ) COL = MOD( COL+1, NPCOL ) GO TO 30 END IF PDR = PDC KL = KLR + MOD( MYKL, NPROW ) ROW = DROW 40 CONTINUE IF( MYROW.NE.ROW ) THEN PDR = PDR + KL KL = KLR ROW = MOD( ROW+1, NPROW ) GO TO 40 END IF * DO 50 I = 1, K DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) Z( I ) = ONE 50 CONTINUE IF( MYKLR.GT.0 ) THEN KK = PDR DO 80 I = 1, MYKLR CALL SLAED4( K, KK, DLAMDA, W, BUF, RHO, BUF( K+I ), IINFO ) IF( IINFO.NE.0 ) THEN INFO = KK END IF * * ..Compute part of z * DO 60 J = 1, KK - 1 Z( J ) = Z( J )*( BUF( J ) / $ ( DLAMDA( J )-DLAMDA( KK ) ) ) 60 CONTINUE Z( KK ) = Z( KK )*BUF( KK ) DO 70 J = KK + 1, K Z( J ) = Z( J )*( BUF( J ) / $ ( DLAMDA( J )-DLAMDA( KK ) ) ) 70 CONTINUE KK = KK + 1 80 CONTINUE * IF( MYROW.NE.DROW ) THEN CALL SCOPY( K, Z, 1, BUF, 1 ) CALL SGESD2D( ICTXT, K+MYKLR, 1, BUF, K+MYKLR, DROW, MYCOL ) ELSE IPD = 2*K + 1 CALL SCOPY( MYKLR, BUF( K+1 ), 1, BUF( IPD ), 1 ) IF( KLR.GT.0 ) THEN IPD = MYKLR + IPD ROW = MOD( DROW+1, NPROW ) DO 100 I = 1, NPROW - 1 CALL SGERV2D( ICTXT, K+KLR, 1, BUF, K+KLR, ROW, $ MYCOL ) CALL SCOPY( KLR, BUF( K+1 ), 1, BUF( IPD ), 1 ) DO 90 J = 1, K Z( J ) = Z( J )*BUF( J ) 90 CONTINUE IPD = IPD + KLR ROW = MOD( ROW+1, NPROW ) 100 CONTINUE END IF END IF END IF * IF( MYROW.EQ.DROW ) THEN IF( MYCOL.NE.DCOL .AND. MYKL.NE.0 ) THEN CALL SCOPY( K, Z, 1, BUF, 1 ) CALL SCOPY( MYKL, BUF( 2*K+1 ), 1, BUF( K+1 ), 1 ) CALL SGESD2D( ICTXT, K+MYKL, 1, BUF, K+MYKL, MYROW, DCOL ) ELSE IF( MYCOL.EQ.DCOL ) THEN IPD = 2*K + 1 COL = DCOL KL = MYKL DO 120 I = 1, NPCOL - 1 IPD = IPD + KL COL = MOD( COL+1, NPCOL ) KL = CTOT( COL, 1 ) + CTOT( COL, 2 ) + CTOT( COL, 3 ) IF( KL.NE.0 ) THEN CALL SGERV2D( ICTXT, K+KL, 1, BUF, K+KL, MYROW, COL ) CALL SCOPY( KL, BUF( K+1 ), 1, BUF( IPD ), 1 ) DO 110 J = 1, K Z( J ) = Z( J )*BUF( J ) 110 CONTINUE END IF 120 CONTINUE DO 130 I = 1, K Z( I ) = SIGN( SQRT( -Z( I ) ), W( I ) ) 130 CONTINUE * END IF END IF * * Diffusion * IF( MYROW.EQ.DROW .AND. MYCOL.EQ.DCOL ) THEN CALL SCOPY( K, Z, 1, BUF, 1 ) CALL SCOPY( K, BUF( 2*K+1 ), 1, BUF( K+1 ), 1 ) CALL SGEBS2D( ICTXT, 'All', ' ', 2*K, 1, BUF, 2*K ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 2*K, 1, BUF, 2*K, DROW, DCOL ) CALL SCOPY( K, BUF, 1, Z, 1 ) END IF * * Copy of D at the good place * KLC = 0 KLR = 0 DO 140 I = 1, K GI = INDX( I ) D( GI ) = BUF( K+I ) COL = INDCOL( GI ) ROW = INDROW( GI ) IF( COL.EQ.MYCOL ) THEN KLC = KLC + 1 INDXC( KLC ) = I END IF IF( ROW.EQ.MYROW ) THEN KLR = KLR + 1 INDXR( KLR ) = I END IF 140 CONTINUE * * Compute eigenvectors of the modified rank-1 modification. * IF( MYKL.NE.0 ) THEN DO 180 J = 1, MYKL KK = INDXC( J ) JU = INDX( KK ) JJU = INDXG2L( JU, NB, J, J, NPCOL ) CALL SLAED4( K, KK, DLAMDA, W, BUF, RHO, AUX, IINFO ) IF( IINFO.NE.0 ) THEN INFO = KK END IF IF( K.EQ.1 .OR. K.EQ.2 ) THEN DO 150 I = 1, KLR KK = INDXR( I ) IU = INDX( KK ) IIU = INDXG2L( IU, NB, J, J, NPROW ) U( IIU, JJU ) = BUF( KK ) 150 CONTINUE GO TO 180 END IF * DO 160 I = 1, K BUF( I ) = Z( I ) / BUF( I ) 160 CONTINUE TEMP = SNRM2( K, BUF, 1 ) DO 170 I = 1, KLR KK = INDXR( I ) IU = INDX( KK ) IIU = INDXG2L( IU, NB, J, J, NPROW ) U( IIU, JJU ) = BUF( KK ) / TEMP 170 CONTINUE 180 CONTINUE END IF * 190 CONTINUE RETURN * * End of PSLAED3 * END scalapack-2.0.2/SRC/pslaedz.f000644 000766 000024 00000012252 10363532303 016151 0ustar00juliestaff000000 000000 SUBROUTINE PSLAEDZ( N, N1, ID, Q, IQ, JQ, LDQ, DESCQ, Z, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER ID, IQ, JQ, LDQ, N, N1 * .. * .. Array Arguments .. INTEGER DESCQ( * ) REAL Q( LDQ, * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSLAEDZ Form the z-vector which consists of the last row of Q_1 * and the first row of Q_2. * ===================================================================== * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. * INTEGER COL, I, IBUF, ICTXT, IIQ, IIZ1, IIZ2, IQCOL, $ IQROW, IZ, IZ1, IZ1COL, IZ1ROW, IZ2, IZ2COL, $ IZ2ROW, J, JJQ, JJZ1, JJZ2, MYCOL, MYROW, N2, $ NB, NBLOC, NPCOL, NPROW, NQ1, NQ2, ZSIZ * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SCOPY, SGEBR2D, $ SGEBS2D, SGERV2D, SGESD2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * ICTXT = DESCQ( CTXT_ ) NB = DESCQ( NB_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( ID, ID, DESCQ, NPROW, NPCOL, MYROW, MYCOL, IIQ, JJQ, $ IQROW, IQCOL ) N2 = N - N1 * * Form z1 which consist of the last row of Q1 * CALL INFOG2L( IQ-1+( ID+N1-1 ), JQ-1+ID, DESCQ, NPROW, NPCOL, $ MYROW, MYCOL, IIZ1, JJZ1, IZ1ROW, IZ1COL ) NQ1 = NUMROC( N1, NB, MYCOL, IZ1COL, NPCOL ) IF( ( MYROW.EQ.IZ1ROW ) .AND. ( NQ1.NE.0 ) ) THEN CALL SCOPY( NQ1, Q( IIZ1, JJZ1 ), LDQ, WORK, 1 ) IF( MYROW.NE.IQROW .OR. MYCOL.NE.IQCOL ) $ CALL SGESD2D( ICTXT, NQ1, 1, WORK, NQ1, IQROW, IQCOL ) END IF * * Proc (IQROW, IQCOL) receive the parts of z1 * IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN COL = IZ1COL DO 20 I = 0, NPCOL - 1 NQ1 = NUMROC( N1, NB, COL, IZ1COL, NPCOL ) IF( NQ1.GT.0 ) THEN IF( IZ1ROW.NE.IQROW .OR. COL.NE.IQCOL ) THEN IBUF = N1 + 1 CALL SGERV2D( ICTXT, NQ1, 1, WORK( IBUF ), NQ1, $ IZ1ROW, COL ) ELSE IBUF = 1 END IF IZ1 = 0 IZ = I*NB + 1 NBLOC = ( NQ1-1 ) / NB + 1 DO 10 J = 1, NBLOC ZSIZ = MIN( NB, NQ1-IZ1 ) CALL SCOPY( ZSIZ, WORK( IBUF+IZ1 ), 1, Z( IZ ), 1 ) IZ1 = IZ1 + NB IZ = IZ + NB*NPCOL 10 CONTINUE END IF COL = MOD( COL+1, NPCOL ) 20 CONTINUE END IF * * Form z2 which consist of the first row of Q2 * CALL INFOG2L( IQ-1+( ID+N1 ), JQ-1+( ID+N1 ), DESCQ, NPROW, NPCOL, $ MYROW, MYCOL, IIZ2, JJZ2, IZ2ROW, IZ2COL ) NQ2 = NUMROC( N2, NB, MYCOL, IZ2COL, NPCOL ) IF( ( MYROW.EQ.IZ2ROW ) .AND. ( NQ2.NE.0 ) ) THEN CALL SCOPY( NQ2, Q( IIZ2, JJZ2 ), LDQ, WORK, 1 ) IF( MYROW.NE.IQROW .OR. MYCOL.NE.IQCOL ) $ CALL SGESD2D( ICTXT, NQ2, 1, WORK, NQ2, IQROW, IQCOL ) END IF * * Proc (IQROW, IQCOL) receive the parts of z2 * IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN COL = IZ2COL DO 40 I = 0, NPCOL - 1 NQ2 = NUMROC( N2, NB, COL, IZ2COL, NPCOL ) IF( NQ2.GT.0 ) THEN IF( IQROW.NE.IZ2ROW .OR. IQCOL.NE.COL ) THEN IBUF = 1 + N2 CALL SGERV2D( ICTXT, NQ2, 1, WORK( IBUF ), NQ2, $ IZ2ROW, COL ) ELSE IBUF = 1 END IF IZ2 = 0 IZ = NB*I + N1 + 1 NBLOC = ( NQ2-1 ) / NB + 1 DO 30 J = 1, NBLOC ZSIZ = MIN( NB, NQ2-IZ2 ) CALL SCOPY( ZSIZ, WORK( IBUF+IZ2 ), 1, Z( IZ ), 1 ) IZ2 = IZ2 + NB IZ = IZ + NB*NPCOL 30 CONTINUE END IF COL = MOD( COL+1, NPCOL ) 40 CONTINUE END IF * * proc(IQROW,IQCOL) broadcast Z=(Z1,Z2) * IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', N, 1, Z, N ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', N, 1, Z, N, IQROW, IQCOL ) END IF * RETURN * * End of PSLAEDZ * * END scalapack-2.0.2/SRC/pslaevswp.f000644 000766 000024 00000025642 10363532303 016542 0ustar00juliestaff000000 000000 * * SUBROUTINE PSLAEVSWP( N, ZIN, LDZI, Z, IZ, JZ, DESCZ, NVS, KEY, $ WORK, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 15, 1997 * * .. Scalar Arguments .. INTEGER IZ, JZ, LDZI, LWORK, N * .. * .. Array Arguments .. INTEGER DESCZ( * ), KEY( * ), NVS( * ) REAL WORK( * ), Z( * ), ZIN( LDZI, * ) * .. * * Purpose * ======= * * PSLAEVSWP moves the eigenvectors (potentially unsorted) from * where they are computed, to a ScaLAPACK standard block cyclic * array, sorted so that the corresponding eigenvalues are sorted. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * N (global input) INTEGER * The order of the matrix A. N >= 0. * * ZIN (local input) REAL array, * dimension ( LDZI, NVS(iam) ) * The eigenvectors on input. Each eigenvector resides entirely * in one process. Each process holds a contiguous set of * NVS(iam) eigenvectors. The first eigenvector which the * process holds is: sum for i=[0,iam-1) of NVS(i) * * LDZI (locl input) INTEGER * leading dimension of the ZIN array * * Z (local output) REAL array * global dimension (N, N), local dimension (DESCZ(DLEN_), NQ) * The eigenvectors on output. The eigenvectors are distributed * in a block cyclic manner in both dimensions, with a * block size of NB. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * NVS (global input) INTEGER array, dimension( nprocs+1 ) * nvs(i) = number of processes * number of eigenvectors held by processes [0,i-1) * nvs(1) = number of eigen vectors held by [0,1-1) == 0 * nvs(nprocs+1) = number of eigen vectors held by [0,nprocs) == * total number of eigenvectors * * KEY (global input) INTEGER array, dimension( N ) * Indicates the actual index (after sorting) for each of the * eigenvectors. * * WORK (local workspace) REAL array, dimension (LWORK) * * LWORK (local input) INTEGER dimension of WORK * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CYCLIC_I, CYCLIC_J, DIST, I, IAM, II, INCII, J, $ MAXI, MAXII, MINI, MINII, MYCOL, MYROW, NB, $ NBUFSIZE, NPCOL, NPROCS, NPROW, PCOL, RECVCOL, $ RECVFROM, RECVROW, SENDCOL, SENDROW, SENDTO * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P EXTERNAL INDXG2L, INDXG2P * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGERV2D, SGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW + MYCOL*NPROW IAM = MYROW*NPCOL + MYCOL * NB = DESCZ( MB_ ) * NPROCS = NPROW*NPCOL * * If PxSTEIN operates on a sub-matrix of a global matrix, the * key [] that contains the indicies of the eigenvectors is refe- * renced to the dimensions of the sub-matrix and not the global * distrubited matrix. Because of this, PxLAEVSWP will incorrectly * map the eigenvectors to the global eigenvector matrix, Z, unless * the key[] elements are shifted as below. * DO 10 J = DESCZ( N_ ), 1, -1 KEY( J ) = KEY( J-JZ+1 ) + ( JZ-1 ) 10 CONTINUE * DO 110 DIST = 0, NPROCS - 1 * SENDTO = MOD( IAM+DIST, NPROCS ) RECVFROM = MOD( NPROCS+IAM-DIST, NPROCS ) * SENDROW = MOD( SENDTO, NPROW ) SENDCOL = SENDTO / NPROW RECVROW = MOD( RECVFROM, NPROW ) RECVCOL = RECVFROM / NPROW * SENDROW = SENDTO / NPCOL SENDCOL = MOD( SENDTO, NPCOL ) RECVROW = RECVFROM / NPCOL RECVCOL = MOD( RECVFROM, NPCOL ) * * Figure out what I have that process "sendto" wants * NBUFSIZE = 0 * * We are looping through the eigenvectors that I presently own. * DO 40 J = NVS( 1+IAM ) + JZ, NVS( 1+IAM+1 ) + JZ - 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( SENDCOL.EQ.PCOL ) THEN MINII = MOD( SENDROW+DESCZ( RSRC_ ), NPROW )* $ DESCZ( MB_ ) + 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 30 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) MAXI = MIN( II+DESCZ( MB_ )-1, N+IZ-1 ) DO 20 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 WORK( NBUFSIZE ) = ZIN( I+1-IZ, $ J-NVS( 1+IAM )+1-JZ ) 20 CONTINUE 30 CONTINUE END IF 40 CONTINUE * * IF( MYROW.NE.SENDROW .OR. MYCOL.NE.SENDCOL ) $ CALL SGESD2D( DESCZ( CTXT_ ), NBUFSIZE, 1, WORK, NBUFSIZE, $ SENDROW, SENDCOL ) * * * Figure out what process "recvfrom" has that I want * NBUFSIZE = 0 DO 70 J = NVS( 1+RECVFROM ) + JZ, $ NVS( 1+RECVFROM+1 ) + JZ - 1, 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.PCOL ) THEN MINII = MOD( MYROW+DESCZ( RSRC_ ), NPROW )*DESCZ( MB_ ) + $ 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 60 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) MAXI = MIN( II+NB-1, N+IZ-1 ) DO 50 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 50 CONTINUE 60 CONTINUE END IF 70 CONTINUE * * * IF( MYROW.NE.RECVROW .OR. MYCOL.NE.RECVCOL ) $ CALL SGERV2D( DESCZ( CTXT_ ), 1, NBUFSIZE, WORK, 1, RECVROW, $ RECVCOL ) * NBUFSIZE = 0 DO 100 J = NVS( 1+RECVFROM ) + JZ, $ NVS( 1+RECVFROM+1 ) + JZ - 1, 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.PCOL ) THEN CYCLIC_J = INDXG2L( KEY( J ), DESCZ( MB_ ), -1, -1, $ NPCOL ) CYCLIC_I = 1 MINII = MOD( MYROW+DESCZ( RSRC_ ), NPROW )*DESCZ( MB_ ) + $ 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 90 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) CYCLIC_I = INDXG2L( MINI, DESCZ( MB_ ), -1, -1, $ NPROW ) MAXI = MIN( II+NB-1, N+IZ-1 ) DO 80 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 Z( CYCLIC_I+( CYCLIC_J-1 )*DESCZ( LLD_ ) ) $ = WORK( NBUFSIZE ) CYCLIC_I = CYCLIC_I + 1 80 CONTINUE 90 CONTINUE END IF 100 CONTINUE * 110 CONTINUE RETURN * * End of PSLAEVSWP * END scalapack-2.0.2/SRC/pslahqr.f000644 000766 000024 00000261765 11750130340 016174 0ustar00juliestaff000000 000000 SUBROUTINE PSLAHQR( WANTT, WANTZ, N, ILO, IHI, A, DESCA, WR, WI, $ ILOZ, IHIZ, Z, DESCZ, WORK, LWORK, IWORK, $ ILWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, ILWORK, INFO, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) REAL A( * ), WI( * ), WORK( * ), WR( * ), Z( * ) * .. * * Purpose * ======= * * PSLAHQR is an auxiliary routine used to find the Schur decomposition * and or eigenvalues of a matrix already in Hessenberg form from * cols ILO to IHI. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix A (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that A is already upper quasi-triangular in * rows and columns IHI+1:N, and that A(ILO,ILO-1) = 0 (unless * ILO = 1). PSLAHQR works primarily with the Hessenberg * submatrix in rows and columns ILO to IHI, but applies * transformations to all of H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * A (global input/output) REAL array, dimension * (DESCA(LLD_),*) * On entry, the upper Hessenberg matrix A. * On exit, if WANTT is .TRUE., A is upper quasi-triangular in * rows and columns ILO:IHI, with any 2-by-2 or larger diagonal * blocks not yet in standard form. If WANTT is .FALSE., the * contents of A are unspecified on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WR (global replicated output) REAL array, * dimension (N) * WI (global replicated output) REAL array, * dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in A. A may be returned with * larger diagonal blocks until the next release. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (global input/output) REAL array. * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations accumulated by PDHSEQR, and on * exit Z has been updated; transformations are applied only to * the submatrix Z(ILOZ:IHIZ,ILO:IHI). * If WANTZ is .FALSE., Z is not referenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local output) REAL array of size LWORK * * LWORK (local input) INTEGER * WORK(LWORK) is a local array and LWORK is assumed big enough * so that LWORK >= 3*N + * MAX( 2*MAX(DESCZ(LLD_),DESCA(LLD_)) + 2*LOCc(N), * 7*Ceil(N/HBL)/LCM(NPROW,NPCOL)) ) * * IWORK (global and local input) INTEGER array of size ILWORK * * ILWORK (local input) INTEGER * This holds the some of the IBLK integer arrays. This is held * as a place holder for the next release. * * INFO (global output) INTEGER * < 0: parameter number -INFO incorrect or inconsistent * = 0: successful exit * > 0: PSLAHQR failed to compute all the eigenvalues ILO to IHI * in a total of 30*(IHI-ILO+1) iterations; if INFO = i, * elements i+1:ihi of WR and WI contain those eigenvalues * which have been successfully computed. * * Logic: * This algorithm is very similar to _LAHQR. Unlike _LAHQR, * instead of sending one double shift through the largest * unreduced submatrix, this algorithm sends multiple double shifts * and spaces them apart so that there can be parallelism across * several processor row/columns. Another critical difference is * that this algorithm aggregrates multiple transforms together in * order to apply them in a block fashion. * * Important Local Variables: * IBLK = The maximum number of bulges that can be computed. * Currently fixed. Future releases this won't be fixed. * HBL = The square block size (HBL=DESCA(MB_)=DESCA(NB_)) * ROTN = The number of transforms to block together * NBULGE = The number of bulges that will be attempted on the * current submatrix. * IBULGE = The current number of bulges started. * K1(*),K2(*) = The current bulge loops from K1(*) to K2(*). * * Subroutines: * This routine calls: * PSLACONSB -> To determine where to start each iteration * PSLAWIL -> Given the shift, get the transformation * SLASORTE -> Pair up eigenvalues so that reals are paired. * PSLACP3 -> Parallel array to local replicated array copy & * back. * SLAREF -> Row/column reflector applier. Core routine * here. * PSLASMSUB -> Finds negligible subdiagonal elements. * * Current Notes and/or Restrictions: * 1.) This code requires the distributed block size to be square * and at least six (6); unlike simpler codes like LU, this * algorithm is extremely sensitive to block size. Unwise * choices of too small a block size can lead to bad * performance. * 2.) This code requires A and Z to be distributed identically * and have identical contxts. * 3.) This release currently does not have a routine for * resolving the Schur blocks into regular 2x2 form after * this code is completed. Because of this, a significant * performance impact is required while the deflation is done * by sometimes a single column of processors. * 4.) This code does not currently block the initial transforms * so that none of the rows or columns for any bulge are * completed until all are started. To offset pipeline * start-up it is recommended that at least 2*LCM(NPROW,NPCOL) * bulges are used (if possible) * 5.) The maximum number of bulges currently supported is fixed at * 32. In future versions this will be limited only by the * incoming WORK array. * 6.) The matrix A must be in upper Hessenberg form. If elements * below the subdiagonal are nonzero, the resulting transforms * may be nonsimilar. This is also true with the LAPACK * routine. * 7.) For this release, it is assumed RSRC_=CSRC_=0 * 8.) Currently, all the eigenvalues are distributed to all the * nodes. Future releases will probably distribute the * eigenvalues by the column partitioning. * 9.) The internals of this routine are subject to change. * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE, HALF PARAMETER ( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) REAL CONST PARAMETER ( CONST = 1.50 ) INTEGER IBLK PARAMETER ( IBLK = 32 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, I, I1, I2, IAFIRST, IBULGE, $ ICBUF, ICOL, ICOL1, ICOL2, IDIA, IERR, II, $ IRBUF, IROW, IROW1, IROW2, ISPEC, ISTART, $ ISTARTCOL, ISTARTROW, ISTOP, ISUB, ISUP, $ ITERMAX, ITMP1, ITMP2, ITN, ITS, J, JAFIRST, $ JBLK, JJ, K, KI, L, LCMRC, LDA, LDZ, LEFT, $ LIHIH, LIHIZ, LILOH, LILOZ, LOCALI1, LOCALI2, $ LOCALK, LOCALM, M, MODKM1, MYCOL, MYROW, $ NBULGE, NH, NODE, NPCOL, NPROW, NR, NUM, NZ, $ RIGHT, ROTN, UP, VECSIDX REAL AVE, DISC, H00, H10, H11, H12, H21, H22, H33, $ H43H34, H44, OVFL, S, SMLNUM, SUM, T1, T1COPY, $ T2, T3, ULP, UNFL, V1SAVE, V2, V2SAVE, V3, $ V3SAVE, CS, SN * .. * .. Local Arrays .. INTEGER ICURCOL( IBLK ), ICURROW( IBLK ), K1( IBLK ), $ K2( IBLK ), KCOL( IBLK ), KP2COL( IBLK ), $ KP2ROW( IBLK ), KROW( IBLK ), LOCALK2( IBLK ) REAL S1( 2*IBLK, 2*IBLK ), SMALLA( 6, 6, IBLK ), $ VCOPY( 3 ) * .. * .. External Functions .. INTEGER ILCM, NUMROC REAL PSLAMCH EXTERNAL ILCM, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SCOPY, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D, SGSUM2D, SLAHQR, SLAREF, $ SLARFG, SLASORTE, IGAMN2D, INFOG1L, INFOG2L, $ PSLABAD, PSLACONSB, PSLACP3, PSLASMSUB, $ PSLAWIL, PXERBLA, SLANV2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SIGN, SQRT * .. * .. Executable Statements .. * INFO = 0 * ITERMAX = 30*( IHI-ILO+1 ) * ITERMAX = 0 IF( N.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL NUM = NPROW*NPCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) LCMRC = ILCM( NPROW, NPCOL ) * * Determine the number of columns we have so we can check workspace * LOCALK = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) JJ = N / HBL IF( JJ*HBL.LT.N ) $ JJ = JJ + 1 JJ = 7*JJ / LCMRC IF( LWORK.LT.3*N+MAX( 2*MAX( LDA, LDZ )+2*LOCALK, JJ ) ) THEN INFO = -15 END IF IF( DESCZ( CTXT_ ).NE.DESCA( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) END IF IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1300+NB_ ) END IF IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1300+MB_ ) END IF IF( ( DESCA( RSRC_ ).NE.0 ) .OR. ( DESCA( CSRC_ ).NE.0 ) ) THEN INFO = -( 700+RSRC_ ) END IF IF( ( DESCZ( RSRC_ ).NE.0 ) .OR. ( DESCZ( CSRC_ ).NE.0 ) ) THEN INFO = -( 1300+RSRC_ ) END IF IF( ( ILO.GT.N ) .OR. ( ILO.LT.1 ) ) THEN INFO = -4 END IF IF( ( IHI.GT.N ) .OR. ( IHI.LT.1 ) ) THEN INFO = -5 END IF IF( HBL.LT.5 ) THEN INFO = -( 700+MB_ ) END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PSLAHQR', -INFO ) RETURN END IF * * Set work array indices * VECSIDX = 0 IDIA = 3*N ISUB = 3*N ISUP = 3*N IRBUF = 3*N ICBUF = 3*N * * Find a value for ROTN * ROTN = HBL / 3 ROTN = MAX( ROTN, HBL-2 ) ROTN = MIN( ROTN, 1 ) * IF( ILO.EQ.IHI ) THEN CALL INFOG2L( ILO, ILO, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN WR( ILO ) = A( ( ICOL-1 )*LDA+IROW ) ELSE WR( ILO ) = ZERO END IF WI( ILO ) = ZERO RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * CALL INFOG1L( ILOZ, HBL, NPROW, MYROW, 0, LILOZ, LIHIZ ) LIHIZ = NUMROC( IHIZ, HBL, MYROW, 0, NPROW ) * * Set machine-dependent constants for the stopping criterion. * If NORM(H) <= SQRT(OVFL), overflow should not occur. * UNFL = PSLAMCH( CONTXT, 'SAFE MINIMUM' ) OVFL = ONE / UNFL CALL PSLABAD( CONTXT, UNFL, OVFL ) ULP = PSLAMCH( CONTXT, 'PRECISION' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = ITERMAX * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of our schur block size (<=2*IBLK). Each * iteration of the loop works with the active submatrix in rows * and columns L to I. Eigenvalues I+1 to IHI have already * converged. Either L = ILO or the global A(L,L-1) is negligible * so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 450 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 420 ITS = 0, ITN * * Look for a single small subdiagonal element. * CALL PSLASMSUB( A, DESCA, I, L, K, SMLNUM, WORK( IRBUF+1 ), $ LWORK-IRBUF ) L = K * IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * CALL INFOG2L( L, L-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN A( ( ICOL-1 )*LDA+IROW ) = ZERO END IF WORK( ISUB+L-1 ) = ZERO END IF * * Exit from loop if a submatrix of order 1 or 2 has split off. * M = L - 10 * IF ( L .GE. I - (2*IBLK-1) ) * IF ( L .GE. I - MAX(2*IBLK-1,HBL) ) IF( L.GE.I-1 ) $ GO TO 430 * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * * Copy submatrix of size 2*JBLK and prepare to do generalized * Wilkinson shift or an exceptional shift * JBLK = MIN( IBLK, ( ( I-L+1 ) / 2 )-1 ) IF( JBLK.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * JBLK = JBLK - MOD( JBLK, LCMRC ) END IF JBLK = MIN( JBLK, 2*LCMRC ) JBLK = MAX( JBLK, 1 ) * CALL PSLACP3( 2*JBLK, I-2*JBLK+1, A, DESCA, S1, 2*IBLK, -1, -1, $ 0 ) IF( ITS.EQ.20 .OR. ITS.EQ.40 ) THEN * * Exceptional shift. * DO 20 II = 2*JBLK, 2, -1 S1( II, II ) = CONST*( ABS( S1( II, II ) )+ $ ABS( S1( II, II-1 ) ) ) S1( II, II-1 ) = ZERO S1( II-1, II ) = ZERO 20 CONTINUE S1( 1, 1 ) = CONST*ABS( S1( 1, 1 ) ) ELSE CALL SLAHQR( .FALSE., .FALSE., 2*JBLK, 1, 2*JBLK, S1, $ 2*IBLK, WORK( IRBUF+1 ), WORK( ICBUF+1 ), 1, $ 2*JBLK, Z, LDZ, IERR ) * * Prepare to use Wilkinson's double shift * H44 = S1( 2*JBLK, 2*JBLK ) H33 = S1( 2*JBLK-1, 2*JBLK-1 ) H43H34 = S1( 2*JBLK-1, 2*JBLK )*S1( 2*JBLK, 2*JBLK-1 ) IF( ( JBLK.GT.1 ) .AND. ( ITS.GT.30 ) ) THEN S = S1( 2*JBLK-1, 2*JBLK-2 ) DISC = ( H33-H44 )*HALF DISC = DISC*DISC + H43H34 IF( DISC.GT.ZERO ) THEN * * Real roots: Use Wilkinson's shift twice * DISC = SQRT( DISC ) AVE = HALF*( H33+H44 ) IF( ABS( H33 )-ABS( H44 ).GT.ZERO ) THEN H33 = H33*H44 - H43H34 H44 = H33 / ( SIGN( DISC, AVE )+AVE ) ELSE H44 = SIGN( DISC, AVE ) + AVE END IF H33 = H44 H43H34 = ZERO END IF END IF END IF * * Look for two consecutive small subdiagonal elements: * PSLACONSB is the routine that does this. * c CALL PSLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, c $ WORK( IRBUF+1 ), LWORK-IRBUF ) * * Skip small submatrices * * IF ( M .GE. I - 5 ) * $ GO TO 80 * * In principle PSLACONSB needs to check all shifts to decide * whether two consecutive small subdiagonal entries are suitable * as the starting position of the bulge chasing phase. It can be * dangerous to check the first pair of shifts only. Moreover it * is quite rare to obtain an M which is much larger than L. This * process is a bit expensive compared with the benefit. * Therefore it is sensible to abandon this routine. Total amount * of communications is saved in average. * M = L * Double-shift QR step * * NBULGE is the number of bulges that will be attempted * ISTOP = MIN( M+ROTN-MOD( M, ROTN ), I-2 ) ISTOP = MIN( ISTOP, M+HBL-3-MOD( M-1, HBL ) ) ISTOP = MIN( ISTOP, I2-2 ) ISTOP = MAX( ISTOP, M ) NBULGE = ( I-1-ISTOP ) / HBL * * Do not exceed maximum determined. * NBULGE = MIN( NBULGE, JBLK ) IF( NBULGE.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * NBULGE = NBULGE - MOD( NBULGE, LCMRC ) END IF NBULGE = MAX( NBULGE, 1 ) * IF( ( ITS.NE.20 ) .AND. ( ITS.NE.40 ) .AND. ( NBULGE.GT.1 ) ) $ THEN * * sort the eigenpairs so that they are in twos for double * shifts. only call if several need sorting * CALL SLASORTE( S1( 2*( JBLK-NBULGE )+1, $ 2*( JBLK-NBULGE )+1 ), 2*IBLK, 2*NBULGE, $ WORK( IRBUF+1 ), IERR ) END IF * * IBULGE is the number of bulges going so far * IBULGE = 1 * * "A" row defs : main row transforms from LOCALK to LOCALI2 * CALL INFOG1L( M, HBL, NPCOL, MYCOL, 0, ITMP1, LOCALK ) LOCALK = NUMROC( N, HBL, MYCOL, 0, NPCOL ) CALL INFOG1L( 1, HBL, NPCOL, MYCOL, 0, ICOL1, LOCALI2 ) LOCALI2 = NUMROC( I2, HBL, MYCOL, 0, NPCOL ) * * "A" col defs : main col transforms from LOCALI1 to LOCALM * CALL INFOG1L( I1, HBL, NPROW, MYROW, 0, LOCALI1, ICOL1 ) ICOL1 = NUMROC( N, HBL, MYROW, 0, NPROW ) CALL INFOG1L( 1, HBL, NPROW, MYROW, 0, LOCALM, ICOL1 ) ICOL1 = NUMROC( MIN( M+3, I ), HBL, MYROW, 0, NPROW ) * * Which row & column will start the bulges * ISTARTROW = MOD( ( M+1 ) / HBL, NPROW ) + IAFIRST ISTARTCOL = MOD( ( M+1 ) / HBL, NPCOL ) + JAFIRST * CALL INFOG1L( M, HBL, NPROW, MYROW, 0, II, ITMP2 ) ITMP2 = NUMROC( N, HBL, MYROW, 0, NPROW ) CALL INFOG1L( M, HBL, NPCOL, MYCOL, 0, JJ, ITMP2 ) ITMP2 = NUMROC( N, HBL, MYCOL, 0, NPCOL ) CALL INFOG1L( 1, HBL, NPROW, MYROW, 0, ISTOP, KP2ROW( 1 ) ) KP2ROW( 1 ) = NUMROC( M+2, HBL, MYROW, 0, NPROW ) CALL INFOG1L( 1, HBL, NPCOL, MYCOL, 0, ISTOP, KP2COL( 1 ) ) KP2COL( 1 ) = NUMROC( M+2, HBL, MYCOL, 0, NPCOL ) * * Set all values for bulges. All bulges are stored in * intermediate steps as loops over KI. Their current "task" * over the global M to I-1 values is always K1(KI) to K2(KI). * However, because there are many bulges, K1(KI) & K2(KI) might * go past that range while later bulges (KI+1,KI+2,etc..) are * finishing up. * * Rules: * If MOD(K1(KI)-1,HBL) < HBL-2 then MOD(K2(KI)-1,HBL)= 0. * * K (global input) INTEGER * The offset for the reduction. Elements below the k-th * subdiagonal in the first NB columns are reduced to zero. * * NB (global input) INTEGER * The number of columns to be reduced. * * A (local input/local output) REAL pointer into * the local memory to an array of dimension (LLD_A, * LOCc(JA+N-K)). On entry, this array contains the the local * pieces of the N-by-(N-K+1) general distributed matrix * A(IA:IA+N-1,JA:JA+N-K). On exit, the elements on and above * the k-th subdiagonal in the first NB columns are overwritten * with the corresponding elements of the reduced distributed * matrix; the elements below the k-th subdiagonal, with the * array TAU, represent the matrix Q as a product of elementary * reflectors. The other columns of A(IA:IA+N-1,JA:JA+N-K) are * unchanged. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). TAU is tied to the distributed matrix A. * * T (local output) REAL array, dimension (NB_A,NB_A) * The upper triangular matrix T. * * Y (local output) REAL pointer into the local memory * to an array of dimension (LLD_Y,NB_A). On exit, this array * contains the local pieces of the N-by-NB distributed * matrix Y. LLD_Y >= LOCr(IA+N-1). * * IY (global input) INTEGER * The row index in the global array Y indicating the first * row of sub( Y ). * * JY (global input) INTEGER * The column index in the global array Y indicating the * first column of sub( Y ). * * DESCY (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Y. * * WORK (local workspace) REAL array, dimension (NB) * * Further Details * =============== * * The matrix Q is represented as a product of nb elementary reflectors * * Q = H(1) H(2) . . . H(nb). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in * A(ia+i+k:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The elements of the vectors v together form the (n-k+1)-by-nb matrix * V which is needed, with T and Y, to apply the transformation to the * unreduced part of the matrix, using an update of the form: * A(ia:ia+n-1,ja:ja+n-k) := (I-V*T*V')*(A(ia:ia+n-1,ja:ja+n-k)-Y*V'). * * The contents of A(ia:ia+n-1,ja:ja+n-k) on exit are illustrated by the * following example with n = 7, k = 3 and nb = 2: * * ( a h a a a ) * ( a h a a a ) * ( a h a a a ) * ( h h a a a ) * ( v1 h a a a ) * ( v1 v2 a a a ) * ( v1 v2 a a a ) * * where a denotes an element of the original matrix * A(ia:ia+n-1,ja:ja+n-k), h denotes a modified element of the upper * Hessenberg matrix H, and vi denotes an element of the vector * defining H(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL IPROC INTEGER I, IACOL, IAROW, ICTXT, IOFF, II, J, JJ, JL, $ JT, JW, L, MYROW, MYCOL, NPCOL, NPROW, NQ REAL EI * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSELSET, $ PSGEMV, PSLARFG, PSSCAL, SAXPY, $ SCOPY, SSCAL, STRMV * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.1 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA+K, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) * IPROC = ( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) NQ = NUMROC( N+JA-1, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - IOFF * EI = ZERO JW = IOFF + 1 CALL DESCSET( DESCW, 1, DESCA( MB_ ), 1, DESCA( MB_ ), IAROW, $ IACOL, ICTXT, 1 ) * DO 10 L = 1, NB I = IA + K + L - 2 J = JA + L - 1 * IF( L.GT.1 ) THEN * * Update A(ia:ia+n-1,j) * * Compute i-th column of A - Y * V' * CALL PSGEMV( 'No transpose', N, L-1, -ONE, Y, IY, JY, DESCY, $ A, I, JA, DESCA, DESCA( M_ ), ONE, A, IA, J, $ DESCA, 1 ) * * Apply I - V * T' * V' to this column (call it b) from the * left, using the last column of T as workspace * * Let V = ( V1 ) and b = ( b1 ) (first I-1 rows) * ( V2 ) ( b2 ) * * where V1 is unit lower triangular * * w := V1' * b1 * IF( IPROC ) THEN CALL SCOPY( L-1, A( (JJ+L-2)*DESCA( LLD_ )+II ), 1, $ WORK( JW ), 1 ) CALL STRMV( 'Lower', 'Transpose', 'Unit', L-1, $ A( (JJ-1)*DESCA( LLD_ )+II ), DESCA( LLD_ ), $ WORK( JW ), 1 ) END IF * * w := w + V2'*b2 * CALL PSGEMV( 'Transpose', N-K-L+1, L-1, ONE, A, I+1, JA, $ DESCA, A, I+1, J, DESCA, 1, ONE, WORK, 1, JW, $ DESCW, DESCW( M_ ) ) * * w := T'*w * IF( IPROC ) $ CALL STRMV( 'Upper', 'Transpose', 'Non-unit', L-1, T, $ DESCA( NB_ ), WORK( JW ), 1 ) * * b2 := b2 - V2*w * CALL PSGEMV( 'No transpose', N-K-L+1, L-1, -ONE, A, I+1, JA, $ DESCA, WORK, 1, JW, DESCW, DESCW( M_ ), ONE, $ A, I+1, J, DESCA, 1 ) * * b1 := b1 - V1*w * IF( IPROC ) THEN CALL STRMV( 'Lower', 'No transpose', 'Unit', L-1, $ A( (JJ-1)*DESCA( LLD_ )+II ), DESCA( LLD_ ), $ WORK( JW ), 1 ) CALL SAXPY( L-1, -ONE, WORK( JW ), 1, $ A( ( JJ+L-2 )*DESCA( LLD_ )+II ), 1 ) END IF CALL PSELSET( A, I, J-1, DESCA, EI ) END IF * * Generate the elementary reflector H(i) to annihilate * A(ia+k+i:ia+n-1,j) * CALL PSLARFG( N-K-L+1, EI, I+1, J, A, MIN( I+2, N+IA-1 ), J, $ DESCA, 1, TAU ) CALL PSELSET( A, I+1, J, DESCA, ONE ) * * Compute Y(iy:y+n-1,jy+l-1) * CALL PSGEMV( 'No transpose', N, N-K-L+1, ONE, A, IA, J+1, $ DESCA, A, I+1, J, DESCA, 1, ZERO, Y, IY, JY+L-1, $ DESCY, 1 ) CALL PSGEMV( 'Transpose', N-K-L+1, L-1, ONE, A, I+1, JA, DESCA, $ A, I+1, J, DESCA, 1, ZERO, WORK, 1, JW, DESCW, $ DESCW( M_ ) ) CALL PSGEMV( 'No transpose', N, L-1, -ONE, Y, IY, JY, DESCY, $ WORK, 1, JW, DESCW, DESCW( M_ ), ONE, Y, IY, $ JY+L-1, DESCY, 1 ) JL = MIN( JJ+L-1, JA+NQ-1 ) CALL PSSCAL( N, TAU( JL ), Y, IY, JY+L-1, DESCY, 1 ) * * Compute T(1:i,i) * IF( IPROC ) THEN JT = ( L-1 ) * DESCA( NB_ ) CALL SSCAL( L-1, -TAU( JL ), WORK( JW ), 1 ) CALL SCOPY( L-1, WORK( JW ), 1, T( JT+1 ), 1 ) CALL STRMV( 'Upper', 'No transpose', 'Non-unit', L-1, T, $ DESCA( NB_ ), T( JT+1 ), 1 ) T( JT+L ) = TAU( JL ) END IF 10 CONTINUE * CALL PSELSET( A, K+NB+IA-1, J, DESCA, EI ) * RETURN * * End of PSLAHRD * END scalapack-2.0.2/SRC/pslaiect.c000644 000766 000024 00000013767 11735440737 016341 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- ScaLAPACK routine (version 1.5) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * --------------------------------------------------------------------- */ /* * Include Files */ #include "pxsyevx.h" #include #include #define proto(x) () void pslasnbt_( int *ieflag ) { /* * * Purpose * ======= * * psalsnbt finds the position of the signbit of a single * precision floating point number. This routine assumes IEEE * arithmetic, and hence, tests only the 32nd bit as a possibility * for the sign bit. * * Note : For this release, we assume that sizeof(int) is 4 bytes. * * Note : If a compile time flag (NO_IEEE) indicates that the * machine does not have IEEE arithmetic, IEFLAG = 0 is returned. * * Arguments * ========= * * IEFLAG (output) INTEGER * This indicates the position of the signbit of any single * precision floating point number. * IEFLAG = 0 if the compile time flag, NO_IEEE, indicates * that the machine does not have IEEE arithmetic, or if * sizeof(int) is different from 4 bytes. * IEFLAG = 1 indicates that the sign bit is the 32nd bit. * * ===================================================================== * * .. Local Scalars .. */ float x; int negone=-1, errornum; unsigned int *ix; /* .. * .. Executable Statements .. */ #ifdef NO_IEEE *ieflag = 0; #else if(sizeof(int) != 4){ *ieflag = 0; return; } x = (float) -1.0; ix = (unsigned int *) &x; if( *ix == 0xbff00000 ) { *ieflag = 1; } else { *ieflag = 0; } #endif } void pslaiect_( float *sigma, int *n, float *d, int *count ) { /* * * Purpose * ======= * * pslaiect computes the number of negative eigenvalues of (A- SIGMA I). * This implementation of the Sturm Sequence loop exploits IEEE Arithmetic * and has no conditionals in the innermost loop. The signbit is assumed * to be bit 32. * * Note that all arguments are call-by-reference so that this routine * can be directly called from Fortran code. * * This is a ScaLAPACK internal subroutine and arguments are not * checked for unreasonable values. * * Arguments * ========= * * SIGMA (input) REAL * The shift. pslaiect finds the number of eigenvalues less * than equal to SIGMA. * * N (input) INTEGER * The order of the tridiagonal matrix T. N >= 1. * * D (input) REAL array, dimension (2*N - 1) * Contains the diagonals and the squares of the off-diagonal * elements of the tridiagonal matrix T. These elements are * assumed to be interleaved in memory for better cache * performance. The diagonal entries of T are in the entries * D(1),D(3),...,D(2*N-1), while the squares of the off-diagonal * entries are D(2),D(4),...,D(2*N-2). To avoid overflow, the * matrix must be scaled so that its largest entry is no greater * than overflow**(1/2) * underflow**(1/4) in absolute value, * and for greatest accuracy, it should not be much smaller * than that. * * COUNT (output) INTEGER * The count of the number of eigenvalues of T less than or * equal to SIGMA. * * ===================================================================== * * .. Local Scalars .. */ float lsigma, tmp, *pd, *pe2; int i; /* .. * .. Executable Statements .. */ lsigma = *sigma; pd = d; pe2 = d+1; tmp = *pd - lsigma; pd += 2; *count = (*((int *)&tmp) >> 31) & 1; for(i = 1;i < *n;i++){ tmp = *pd - *pe2/tmp - lsigma; pd += 2; pe2 += 2; *count += ((*((int *)&tmp)) >> 31) & 1; } } void pslachkieee_( int *isieee, float *rmax, float *rmin ) { /* * * Purpose * ======= * * pslachkieee performs a simple check to make sure that the features * of the IEEE standard that we rely on are implemented. In some * implementations, pslachkieee may not return. * * Note that all arguments are call-by-reference so that this routine * can be directly called from Fortran code. * * This is a ScaLAPACK internal subroutine and arguments are not * checked for unreasonable values. * * Arguments * ========= * * ISIEEE (local output) INTEGER * On exit, ISIEEE = 1 implies that all the features of the * IEEE standard that we rely on are implemented. * On exit, ISIEEE = 0 implies that some the features of the * IEEE standard that we rely on are missing. * * RMAX (local input) REAL * The overflow threshold ( = SLAMCH('O') ). * * RMIN (local input) REAL * The underflow threshold ( = SLAMCH('U') ). * * ===================================================================== * * .. Local Scalars .. */ float x, pinf, pzero, ninf, nzero; int ieflag, *ix, sbit1, sbit2, negone=-1, errornum; /* .. * .. Executable Statements .. */ pslasnbt_( &ieflag ); pinf = *rmax / *rmin; pzero = 1.0 / pinf; pinf = 1.0 / pzero; if( pzero != 0.0 ){ printf("pzero = %g should be zero\n",pzero); *isieee = 0; return ; } if( ieflag == 1 ){ sbit1 = (*((int *)&pzero) >> 31) & 1; sbit2 = (*((int *)&pinf) >> 31) & 1; } if( sbit1 == 1 ){ printf("Sign of positive infinity is incorrect\n"); *isieee = 0; } if( sbit2 == 1 ){ printf("Sign of positive zero is incorrect\n"); *isieee = 0; } ninf = -pinf; nzero = 1.0 / ninf; ninf = 1.0 / nzero; if( nzero != 0.0 ){ printf("nzero = %g should be zero\n",nzero); *isieee = 0; } if( ieflag == 1 ){ sbit1 = (*((int *)&nzero) >> 31) & 1; sbit2 = (*((int *)&ninf) >> 31) & 1; } if( sbit1 == 0 ){ printf("Sign of negative infinity is incorrect\n"); *isieee = 0; } if( sbit2 == 0 ){ printf("Sign of negative zero is incorrect\n"); *isieee = 0; } } scalapack-2.0.2/SRC/pslamch.f000644 000766 000024 00000005111 11622500733 016133 0ustar00juliestaff000000 000000 REAL FUNCTION PSLAMCH( ICTXT, CMACH ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER CMACH INTEGER ICTXT * .. * * Purpose * ======= * * PSLAMCH determines single precision machine parameters. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle in which the computation takes * place. * * CMACH (global input) CHARACTER*1 * Specifies the value to be returned by PSLAMCH: * = 'E' or 'e', PSLAMCH := eps * = 'S' or 's , PSLAMCH := sfmin * = 'B' or 'b', PSLAMCH := base * = 'P' or 'p', PSLAMCH := eps*base * = 'N' or 'n', PSLAMCH := t * = 'R' or 'r', PSLAMCH := rnd * = 'M' or 'm', PSLAMCH := emin * = 'U' or 'u', PSLAMCH := rmin * = 'L' or 'l', PSLAMCH := emax * = 'O' or 'o', PSLAMCH := rmax * * where * * eps = relative machine precision * sfmin = safe minimum, such that 1/sfmin does not overflow * base = base of the machine * prec = eps*base * t = number of (base) digits in the mantissa * rnd = 1.0 when rounding occurs in addition, 0.0 otherwise * emin = minimum exponent before (gradual) underflow * rmin = underflow threshold - base**(emin-1) * emax = largest exponent before overflow * rmax = overflow threshold - (base**emax)*(1-eps) * * ===================================================================== * * .. Local Scalars .. INTEGER IDUMM REAL TEMP * .. * .. External Subroutines .. EXTERNAL SGAMN2D, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH EXTERNAL LSAME, SLAMCH * .. * .. Executable Statements .. * TEMP = SLAMCH( CMACH ) IDUMM = 0 * IF( LSAME( CMACH, 'E' ).OR.LSAME( CMACH, 'S' ).OR. $ LSAME( CMACH, 'M' ).OR.LSAME( CMACH, 'U' ) ) THEN CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) ELSE IF( LSAME( CMACH, 'L' ).OR.LSAME( CMACH, 'O' ) ) THEN CALL SGAMN2D( ICTXT, 'All', ' ', 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) END IF * PSLAMCH = TEMP * * End of PSLAMCH * END scalapack-2.0.2/SRC/pslamr1d.f000644 000766 000024 00000010671 10363532303 016235 0ustar00juliestaff000000 000000 SUBROUTINE PSLAMR1D( N, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IA, IB, JA, JB, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ) * .. * * Bugs * ==== * * I am not sure that this works correctly when IB and JB are not equal * to 1. Indeed, I suspect that IB should always be set to 1 or ignored * with 1 used in its place. * * PSLAMR1D has not been tested except withint the contect of * PSSYPTRD, the prototype reduction to tridiagonal form code. * * Purpose * * ======= * * PSLAMR1D redistributes a one-dimensional row vector from one data * decomposition to another. * * This is an auxiliary routine called by PSSYTRD to redistribute D, E * and TAU. * * Notes * ===== * * Although all processes call PSGEMR2D, only the processes that own * the first column of A send data and only processes that own the * first column of B receive data. The calls to SGEBS2D/SGEBR2D * spread the data down. * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix to be transposed. * * A (local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LOCc(JA+N-1)). * On output, A is replicated across all processes in * this processor column. * * IA (global input) INTEGER * A's global row index, which points to the beginning of * the submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LOCc(JB+N-1)). * * IB (global input) INTEGER * B's global row index, NOT USED * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace) COMPLEX*16 array, dimension ( LWORK ) * * LWORK (local input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCAA( DLEN_ ), DESCBB( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PSGEMR2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return if possible * IF( N.LE.0 ) $ RETURN * DO 10 I = 1, DLEN_ DESCAA( I ) = DESCA( I ) DESCBB( I ) = DESCB( I ) 10 CONTINUE * DESCAA( M_ ) = 1 DESCBB( M_ ) = 1 DESCAA( LLD_ ) = 1 DESCBB( LLD_ ) = 1 * ICTXT = DESCB( CTXT_ ) CALL PSGEMR2D( 1, N, A, IA, JA, DESCAA, B, IB, JB, DESCBB, ICTXT ) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NQ = NUMROC( N, DESCB( NB_ ), MYCOL, 0, NPCOL ) * IF( MYROW.EQ.0 ) THEN CALL SGEBS2D( ICTXT, 'C', ' ', NQ, 1, B, NQ ) ELSE CALL SGEBR2D( ICTXT, 'C', ' ', NQ, 1, B, NQ, 0, MYCOL ) END IF * RETURN * * End of PSLAMR1D * END scalapack-2.0.2/SRC/pslamve.f000644 000766 000024 00000020050 11750130340 016145 0ustar00juliestaff000000 000000 SUBROUTINE PSLAMVE( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, DWORK ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), DWORK( * ) * .. * * Purpose * ======= * * PSLAMVE copies all or part of a distributed matrix A to another * distributed matrix B. There is no alignment assumptions at all * except that A and B are of the same size. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) REAL pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * DWORK (local workspace) REAL array * If UPLO = 'U' or UPLO = 'L' and number of processors > 1, * the length of DWORK is at least as large as the length of B. * Otherwise, DWORK is not referenced. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER, LOWER, FULL INTEGER ICTXT, NPROW, NPCOL, MYROW, MYCOL, MYPROC, $ NPROCS, AROWS, ACOLS, K, SPROC, SRSRC, SCSRC, $ RPROC, RRSRC, RCSRC, COUNT, J, I, IIA, JJA, $ IIB, JJB, BRSRC, BCSRC, RAROWS, RACOLS, $ INDEX, IDUM, NUMREC, NUMSND * .. * .. External Subroutines .. EXTERNAL SLAMOV, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC, INDXL2G EXTERNAL ICEIL, LSAME, NUMROC, INDXL2G * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Find underlying mesh properties. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Decode input parameters. * UPPER = LSAME( UPLO, 'U' ) IF( .NOT. UPPER ) LOWER = LSAME( UPLO, 'L' ) FULL = (.NOT. UPPER) .AND. (.NOT. LOWER) * * Assign indiviual numbers based on column major ordering. * NPROCS = NPROW*NPCOL * * Do redistribution operation. * IF( NPROCS.EQ.1 ) THEN CALL SLAMOV( UPLO, M, N, A((JA-1)*DESCA(LLD_)+IA), $ DESCA(LLD_), B((JB-1)*DESCB(LLD_)+IB), $ DESCB(LLD_) ) ELSEIF( FULL ) THEN CALL PSGEMR2D( M, N, A, IA, JA, DESCA, B, IB, JB, DESCB, $ ICTXT ) ELSE CALL PSGEMR2D( M, N, A, IA, JA, DESCA, DWORK, IB, JB, DESCB, $ ICTXT ) CALL PSLACPY( UPLO, M, N, DWORK, IB, JB, DESCB, B, IB, JB, $ DESCB ) END IF * RETURN * * End of PSLAMVE * END scalapack-2.0.2/SRC/pslange.f000644 000766 000024 00000026614 10363532303 016147 0ustar00juliestaff000000 000000 REAL FUNCTION PSLANGE( NORM, M, N, A, IA, JA, DESCA, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSLANGE returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * distributed matrix sub( A ) = A(IA:IA+M-1, JA:JA+N-1). * * PSLANGE returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+M-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PSLANGE as described * above. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). When M = 0, PSLANGE * is set to zero. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). When N = 0, * PSLANGE is set to zero. N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, ICOFF, IOFFA, $ IROFF, J, JJ, LDA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ REAL SCALE, SUM, VALUE * .. * .. Local Arrays .. REAL RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSTREECOMB, $ SCOMBSSQ, SGEBR2D, SGEBS2D, $ SGAMX2D, SGSUM2D, SLASSQ * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, NUMROC EXTERNAL LSAME, ISAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * IF( MIN( M, N ).EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO IF( NQ.GT.0 .AND. MP.GT.0 ) THEN IOFFA = (JJ-1)*LDA DO 20 J = JJ, JJ+NQ-1 DO 10 I = II, MP+II-1 VALUE = MAX( VALUE, ABS( A( IOFFA+I ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE END IF CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, J, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * * Find norm1( sub( A ) ). * IF( NQ.GT.0 ) THEN IOFFA = ( JJ - 1 ) * LDA DO 40 J = JJ, JJ+NQ-1 SUM = ZERO IF( MP.GT.0 ) THEN DO 30 I = II, MP+II-1 SUM = SUM + ABS( A( IOFFA+I ) ) 30 CONTINUE END IF IOFFA = IOFFA + LDA WORK( J-JJ+1 ) = SUM 40 CONTINUE END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( ISAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, J, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * * Find normI( sub( A ) ). * IF( MP.GT.0 ) THEN IOFFA = II + ( JJ - 1 ) * LDA DO 60 I = II, II+MP-1 SUM = ZERO IF( NQ.GT.0 ) THEN DO 50 J = IOFFA, IOFFA + NQ*LDA - 1, LDA SUM = SUM + ABS( A( J ) ) 50 CONTINUE END IF WORK( I-II+1 ) = SUM IOFFA = IOFFA + 1 60 CONTINUE END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, WORK, MAX( 1, MP ), $ MYROW, 0 ) * * Find maximum sum of rows for supnorm * IF( MYCOL.EQ.0 ) THEN IF( MP.GT.0 ) THEN VALUE = WORK( ISAMAX( MP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, I, $ J, -1, 0, 0 ) END IF * ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE IOFFA = II + ( JJ - 1 ) * LDA IF( NQ.GT.0 ) THEN DO 70 J = IOFFA, IOFFA + NQ*LDA - 1, LDA CALL SLASSQ( MP, A( J ), 1, SCALE, SUM ) 70 CONTINUE END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PSTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, SCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PSLANGE = VALUE * RETURN * * End of PSLANGE * END scalapack-2.0.2/SRC/pslanhs.f000644 000766 000024 00000062453 10363532303 016167 0ustar00juliestaff000000 000000 REAL FUNCTION PSLANHS( NORM, N, A, IA, JA, DESCA, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSLANHS returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * Hessenberg distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * PSLANHS returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PSLANHS as described * above. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PSLANHS is set to zero. N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, II, IIA, ICOFF, INXTROW, $ IOFFA, IROFF, J, JB, JJ, JJA, JN, KK, LDA, LL, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL SCALE, SUM, VALUE * .. * .. Local Arrays .. REAL RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSTREECOMB, $ SCOMBSSQ, SGEBR2D, SGEBS2D, $ SGAMX2D, SGSUM2D, SLASSQ * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ISAMAX, NUMROC EXTERNAL LSAME, ICEIL, ISAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) IOFFA = ( JJA - 1 ) * LDA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * VALUE = ZERO * * Find max(abs(A(i,j))). * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 20 LL = JJ, JJ+JB-1 DO 10 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 50 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 40 LL = JJ, JJ+JB-1 DO 30 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 50 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 70 LL = JJ, JJ + JB -1 DO 60 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 60 CONTINUE IOFFA = IOFFA + LDA 70 CONTINUE ELSE DO 90 LL = JJ, JJ+JB-1 DO 80 KK = IIA, MIN( II-1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ VALUE = MAX( VALUE, ABS( A( II+(JJ+JB-2)*LDA ) ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 140 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 110 LL = JJ, JJ + JB -1 DO 100 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE ELSE DO 130 LL = JJ, JJ + JB -1 DO 120 KK = IIA, MIN( II-1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 120 CONTINUE IOFFA = IOFFA + LDA 130 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ VALUE = MAX( VALUE, $ ABS( A( II+(JJ+JB-2)*LDA ) ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 140 CONTINUE * END IF * * Gather the intermediate results to process (0,0). * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, KK, LL, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * VALUE = ZERO II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 160 LL = JJ, JJ+JB-1 SUM = ZERO DO 150 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 150 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 160 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 190 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 180 LL = JJ, JJ+JB-1 SUM = ZERO DO 170 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 170 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 180 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 190 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 210 LL = JJ, JJ + JB -1 SUM = ZERO DO 200 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 200 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 210 CONTINUE ELSE DO 230 LL = JJ, JJ + JB -1 SUM = ZERO DO 220 KK = IIA, MIN( II-1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 220 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 230 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( JJ+JB-JJA ) = WORK( JJ+JB-JJA ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 280 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 250 LL = JJ, JJ + JB -1 SUM = ZERO DO 240 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 240 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 250 CONTINUE ELSE DO 270 LL = JJ, JJ + JB -1 SUM = ZERO DO 260 KK = IIA, MIN( II-1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 260 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 270 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( JJ+JB-JJA ) = WORK( JJ+JB-JJA ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 280 CONTINUE * END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( ISAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, KK, LL, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * DO 290 KK = IIA, IIA+NP-1 WORK( KK ) = ZERO 290 CONTINUE * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 310 LL = JJ, JJ+JB-1 DO 300 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 300 CONTINUE IOFFA = IOFFA + LDA 310 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 340 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 330 LL = JJ, JJ+JB-1 DO 320 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 320 CONTINUE IOFFA = IOFFA + LDA 330 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 340 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 360 LL = JJ, JJ + JB -1 DO 350 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 350 CONTINUE IOFFA = IOFFA + LDA 360 CONTINUE ELSE DO 380 LL = JJ, JJ + JB -1 DO 370 KK = IIA, MIN( II-1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 370 CONTINUE IOFFA = IOFFA + LDA 380 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( II-IIA+1 ) = WORK( II-IIA+1 ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 430 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 400 LL = JJ, JJ + JB -1 DO 390 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 390 CONTINUE IOFFA = IOFFA + LDA 400 CONTINUE ELSE DO 420 LL = JJ, JJ + JB -1 DO 410 KK = IIA, MIN( II-1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS(A(IOFFA+KK)) 410 CONTINUE IOFFA = IOFFA + LDA 420 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( II-IIA+1 ) = WORK( II-IIA+1 ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 430 CONTINUE * END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK, MAX( 1, NP ), $ MYROW, 0 ) * * Find maximum sum of rows for Infinity-norm * IF( MYCOL.EQ.0 ) THEN IF( NP.GT.0 ) THEN VALUE = WORK( ISAMAX( NP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, KK, $ LL, -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * SCALE = ZERO SUM = ONE II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 440 LL = JJ, JJ+JB-1 CALL SLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 440 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 460 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 450 LL = JJ, JJ+JB-1 CALL SLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 450 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 460 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 470 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 470 CONTINUE ELSE DO 480 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II-1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 480 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ CALL SLASSQ( 1, A( II+(JJ+JB-2)*LDA ), 1, $ SCALE, SUM ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 510 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 490 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 490 CONTINUE ELSE DO 500 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II-1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 500 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ CALL SLASSQ( 1, A( II+(JJ+JB-2)*LDA ), 1, $ SCALE, SUM ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 510 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PSTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, SCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PSLANHS = VALUE * RETURN * * End of PSLANHS * END scalapack-2.0.2/SRC/pslansy.f000644 000766 000024 00000070224 10363532303 016203 0ustar00juliestaff000000 000000 REAL FUNCTION PSLANSY( NORM, UPLO, N, A, IA, JA, $ DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSLANSY returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * real symmetric distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * PSLANSY returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PSLANSY as described * above. * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is to be referenced. * = 'U': Upper triangular part of sub( A ) is referenced, * = 'L': Lower triangular part of sub( A ) is referenced. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PSLANSY is set to zero. N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the symmetric distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular matrix which norm is * to be computed, and the strictly lower triangular part of * this matrix is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular matrix which norm is to be computed, and the * strictly upper triangular part of sub( A ) is not referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * 2*Nq0+Np0+LDW if NORM = '1', 'O', 'o', 'I' or 'i', * where LDW is given by: * IF( NPROW.NE.NPCOL ) THEN * LDW = MB_A*CEIL(CEIL(Np0/MB_A)/(LCM/NPROW)) * ELSE * LDW = 0 * END IF * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * * where LCM is the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) and CEIL denotes the ceiling * operation (ICEIL). * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * ICEIL, ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IAROW, IACOL, IB, ICOFF, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, IROFF, ICSR, ICSR0, $ IOFFA, IRSC, IRSC0, IRSR, IRSR0, JJ, JJA, K, $ LDA, LL, MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL SCALE, SUM, VALUE * .. * .. Local Arrays .. REAL RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PSCOL2ROW, PSTREECOMB, $ SAXPY, SCOMBSSQ, SGAMX2D, SGSUM2D, $ SGEBR2D, SGEBS2D, SLASSQ * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ISAMAX, NUMROC EXTERNAL ICEIL, ISAMAX, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters and local indexes. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) ICSR = 1 IRSR = ICSR + NQ IRSC = IRSR + NQ IF( MYROW.EQ.IAROW ) THEN IRSC0 = IRSC + IROFF NP = NP - IROFF ELSE IRSC0 = IRSC END IF IF( MYCOL.EQ.IACOL ) THEN ICSR0 = ICSR + ICOFF IRSR0 = IRSR + ICOFF NQ = NQ - ICOFF ELSE ICSR0 = ICSR IRSR0 = IRSR END IF IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+N-1 ) LDA = DESCA( LLD_ ) * * If the matrix is symmetric, we address only a triangular portion * of the matrix. A sum of row (column) i of the complete matrix * can be obtained by adding along row i and column i of the the * triangular matrix, stopping/starting at the diagonal, which is * the point of reflection. The pictures below demonstrate this. * In the following code, the row sums created by --- rows below are * refered to as ROWSUMS, and the column sums shown by | are refered * to as COLSUMS. Infinity-norm = 1-norm = ROWSUMS+COLSUMS. * * UPLO = 'U' UPLO = 'L' * ____i______ ___________ * |\ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \|------| i i|---\ | * | \ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * |__________\| |___|______\| * i * * II, JJ : local indices into array A * ICURROW : process row containing diagonal block * ICURCOL : process column containing diagonal block * IRSC0 : pointer to part of work used to store the ROWSUMS while * they are stored along a process column * IRSR0 : pointer to part of work used to store the ROWSUMS after * they have been transposed to be along a process row * II = IIA JJ = JJA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 20 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 10 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 10 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 20 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 40 K = II, II+IB-1 IF( JJ.LE.JJA+NQ-1 ) THEN DO 30 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 30 CONTINUE END IF IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 40 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining rows/columns of the matrix. * DO 90 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 60 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 50 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 50 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 60 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 80 K = II, II+IB-1 IF( JJ.LE.JJA+NQ-1 ) THEN DO 70 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 70 CONTINUE END IF IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 80 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 90 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 110 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.LE.IIA+NP-1 ) THEN DO 100 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 100 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 110 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 130 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 120 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 120 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 130 CONTINUE ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 180 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 150 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.LE.IIA+NP-1 ) THEN DO 140 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 140 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 150 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 170 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 160 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 160 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 170 CONTINUE ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * END IF * * Gather the result on process (IAROW,IACOL). * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, K, -1, $ IAROW, IACOL ) * ELSE IF( LSAME( NORM, 'I' ) .OR. LSAME( NORM, 'O' ) .OR. $ NORM.EQ.'1' ) THEN * * Find normI( sub( A ) ) ( = norm1( sub( A ) ), since sub( A ) is * symmetric). * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 200 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 190 LL = IIA, II-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 190 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 200 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 220 K = II, II+IB-1 SUM = ZERO IF( JJA+NQ.GT.JJ ) THEN DO 210 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 210 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 220 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining rows/columns of global matrix. * DO 270 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 240 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 230 LL = IIA, II-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 230 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 240 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 260 K = II, II+IB-1 SUM = ZERO IF( JJA+NQ.GT.JJ ) THEN DO 250 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 250 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 260 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 270 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = (JJ-1)*LDA DO 290 K = 0, IB-1 SUM = ZERO IF( IIA+NP.GT.II ) THEN DO 280 LL = II, IIA+NP-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 280 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 290 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 310 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 300 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 300 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 310 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 360 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 330 K = 0, IB-1 SUM = ZERO IF( IIA+NP.GT.II ) THEN DO 320 LL = II, IIA+NP-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 320 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 330 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 350 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 340 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 340 CONTINUE END IF WORK(K-IIA+IRSC0) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 350 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 360 CONTINUE END IF * * After calls to SGSUM2D, process row 0 will have global * COLSUMS and process column 0 will have global ROWSUMS. * Transpose ROWSUMS and add to COLSUMS to get global row/column * sum, the max of which is the infinity or 1 norm. * IF( MYCOL.EQ.IACOL ) $ NQ = NQ + ICOFF CALL SGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK( ICSR ), 1, $ IAROW, MYCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP + IROFF CALL SGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IRSC ), $ MAX( 1, NP ), MYROW, IACOL ) * CALL PSCOL2ROW( ICTXT, N, 1, DESCA( MB_ ), WORK( IRSC ), $ MAX( 1, NP ), WORK( IRSR ), MAX( 1, NQ ), $ IAROW, IACOL, IAROW, IACOL, WORK( IRSC+NP ) ) * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF CALL SAXPY( NQ, ONE, WORK( IRSR0 ), 1, WORK( ICSR0 ), 1 ) IF( NQ.LT.1 ) THEN VALUE = ZERO ELSE VALUE = WORK( ISAMAX( NQ, WORK( ICSR0 ), 1 ) ) END IF CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, K, $ -1, IAROW, IACOL ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE * * Add off-diagonal entries, first * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 370 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL SLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) $ II = II + 1 CALL SLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) 370 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 390 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 380 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL SLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) $ II = II + 1 CALL SLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) 380 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 400 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL SLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) $ II = II + 1 CALL SLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 400 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 420 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 410 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL SLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) $ II = II + 1 CALL SLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 410 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 420 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM * CALL PSTREECOMB( ICTXT, 'All', 2, RWORK, IAROW, IACOL, $ SCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to the other processes * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, IAROW, $ IACOL ) END IF * PSLANSY = VALUE * RETURN * * End of PSLANSY * END scalapack-2.0.2/SRC/pslantr.f000644 000766 000024 00000110574 10363532303 016200 0ustar00juliestaff000000 000000 REAL FUNCTION PSLANTR( NORM, UPLO, DIAG, M, N, A, $ IA, JA, DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSLANTR returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * trapezoidal or triangular distributed matrix sub( A ) denoting * A(IA:IA+M-1, JA:JA+N-1). * * PSLANTR returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with ia <= i <= ia+m-1, * ( and ja <= j <= ja+n-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PSLANTR as described * above. * * UPLO (global input) CHARACTER * Specifies whether the matrix sub( A ) is upper or lower * trapezoidal. * = 'U': Upper trapezoidal * = 'L': Lower trapezoidal * Note that sub( A ) is triangular instead of trapezoidal * if M = N. * * DIAG (global input) CHARACTER * Specifies whether or not the distributed matrix sub( A ) has * unit diagonal. * = 'N': Non-unit diagonal * = 'U': Unit diagonal * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). When M = 0, PSLANTR is * set to zero. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). When N = 0, * PSLANTR is set to zero. N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) REAL array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL UDIAG INTEGER IACOL, IAROW, ICTXT, II, IIA, ICOFF, IOFFA, $ IROFF, J, JB, JJ, JJA, JN, KK, LDA, LL, MP, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL SCALE, SUM, VALUE * .. * .. Local Arrays .. REAL RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSTREECOMB, $ SCOMBSSQ, SGEBR2D, SGEBS2D, $ SGAMX2D, SGSUM2D, SLASSQ * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ISAMAX, NUMROC EXTERNAL LSAME, ICEIL, ISAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * UDIAG = LSAME( DIAG, 'U' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) IOFFA = ( JJA - 1 ) * LDA * IF( MIN( M, N ).EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * IF( UDIAG ) THEN VALUE = ONE ELSE VALUE = ZERO END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 20 LL = JJ, JJ + JB -1 DO 10 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE ELSE DO 40 LL = JJ, JJ + JB -1 DO 30 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE END IF ELSE DO 60 LL = JJ, JJ + JB -1 DO 50 KK = IIA, MIN( II-1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 80 LL = JJ, JJ + JB -1 DO 70 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 70 CONTINUE IOFFA = IOFFA + LDA 80 CONTINUE ELSE DO 100 LL = JJ, JJ + JB -1 DO 90 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 90 CONTINUE IOFFA = IOFFA + LDA 100 CONTINUE END IF ELSE DO 120 LL = JJ, JJ + JB -1 DO 110 KK = IIA, MIN( II-1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 110 CONTINUE IOFFA = IOFFA + LDA 120 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 130 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 150 LL = JJ, JJ + JB -1 DO 140 KK = II+LL-JJ+1, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 140 CONTINUE IOFFA = IOFFA + LDA 150 CONTINUE ELSE DO 170 LL = JJ, JJ + JB -1 DO 160 KK = II+LL-JJ, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 160 CONTINUE IOFFA = IOFFA + LDA 170 CONTINUE END IF ELSE DO 190 LL = JJ, JJ + JB -1 DO 180 KK = II, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 180 CONTINUE IOFFA = IOFFA + LDA 190 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 260 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 210 LL = JJ, JJ + JB -1 DO 200 KK = II+LL-JJ+1, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 200 CONTINUE IOFFA = IOFFA + LDA 210 CONTINUE ELSE DO 230 LL = JJ, JJ + JB -1 DO 220 KK = II+LL-JJ, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 220 CONTINUE IOFFA = IOFFA + LDA 230 CONTINUE END IF ELSE DO 250 LL = JJ, JJ + JB -1 DO 240 KK = II, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 240 CONTINUE IOFFA = IOFFA + LDA 250 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 260 CONTINUE * END IF * * Gather the intermediate results to process (0,0). * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, KK, LL, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 280 LL = JJ, JJ + JB -1 SUM = ONE DO 270 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 270 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 280 CONTINUE ELSE DO 300 LL = JJ, JJ + JB -1 SUM = ZERO DO 290 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 290 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 300 CONTINUE END IF ELSE DO 320 LL = JJ, JJ + JB -1 SUM = ZERO DO 310 KK = IIA, MIN( II-1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 310 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 320 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 390 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 340 LL = JJ, JJ + JB -1 SUM = ONE DO 330 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 330 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 340 CONTINUE ELSE DO 360 LL = JJ, JJ + JB -1 SUM = ZERO DO 350 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 350 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 360 CONTINUE END IF ELSE DO 380 LL = JJ, JJ + JB -1 SUM = ZERO DO 370 KK = IIA, MIN( II-1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 370 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 380 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 410 LL = JJ, JJ + JB -1 SUM = ONE DO 400 KK = II+LL-JJ+1, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 400 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 410 CONTINUE ELSE DO 430 LL = JJ, JJ + JB -1 SUM = ZERO DO 420 KK = II+LL-JJ, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 420 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 430 CONTINUE END IF ELSE DO 450 LL = JJ, JJ + JB -1 SUM = ZERO DO 440 KK = II, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 440 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 450 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 520 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 470 LL = JJ, JJ + JB -1 SUM = ONE DO 460 KK = II+LL-JJ+1, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 460 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 470 CONTINUE ELSE DO 490 LL = JJ, JJ + JB -1 SUM = ZERO DO 480 KK = II+LL-JJ, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 480 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 490 CONTINUE END IF ELSE DO 510 LL = JJ, JJ + JB -1 SUM = ZERO DO 500 KK = II, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 500 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 510 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 520 CONTINUE * END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( ISAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, KK, LL, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * IF( LSAME( UPLO, 'U' ) ) THEN IF( UDIAG ) THEN DO 530 KK = IIA, IIA+MP-1 WORK( KK ) = ONE 530 CONTINUE ELSE DO 540 KK = IIA, IIA+MP-1 WORK( KK ) = ZERO 540 CONTINUE END IF ELSE IF( UDIAG ) THEN NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF DO 550 KK = IIA, IIA+NP-1 WORK( KK ) = ONE 550 CONTINUE DO 560 KK = IIA+NP, IIA+MP-1 WORK( KK ) = ZERO 560 CONTINUE ELSE DO 570 KK = IIA, IIA+MP-1 WORK( KK ) = ZERO 570 CONTINUE END IF END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 590 LL = JJ, JJ + JB -1 DO 580 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 580 CONTINUE IOFFA = IOFFA + LDA 590 CONTINUE ELSE DO 610 LL = JJ, JJ + JB -1 DO 600 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 600 CONTINUE IOFFA = IOFFA + LDA 610 CONTINUE END IF ELSE DO 630 LL = JJ, JJ + JB -1 DO 620 KK = IIA, MIN( II-1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 620 CONTINUE IOFFA = IOFFA + LDA 630 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 700 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 650 LL = JJ, JJ + JB -1 DO 640 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 640 CONTINUE IOFFA = IOFFA + LDA 650 CONTINUE ELSE DO 670 LL = JJ, JJ + JB -1 DO 660 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 660 CONTINUE IOFFA = IOFFA + LDA 670 CONTINUE END IF ELSE DO 690 LL = JJ, JJ + JB -1 DO 680 KK = IIA, MIN( II-1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 680 CONTINUE IOFFA = IOFFA + LDA 690 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 700 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 720 LL = JJ, JJ + JB -1 DO 710 KK = II+LL-JJ+1, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 710 CONTINUE IOFFA = IOFFA + LDA 720 CONTINUE ELSE DO 740 LL = JJ, JJ + JB -1 DO 730 KK = II+LL-JJ, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 730 CONTINUE IOFFA = IOFFA + LDA 740 CONTINUE END IF ELSE DO 760 LL = JJ, JJ + JB -1 DO 750 KK = II, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 750 CONTINUE IOFFA = IOFFA + LDA 760 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 830 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 780 LL = JJ, JJ + JB -1 DO 770 KK = II+LL-JJ+1, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 770 CONTINUE IOFFA = IOFFA + LDA 780 CONTINUE ELSE DO 800 LL = JJ, JJ + JB -1 DO 790 KK = II+LL-JJ, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 790 CONTINUE IOFFA = IOFFA + LDA 800 CONTINUE END IF ELSE DO 820 LL = JJ, JJ + JB -1 DO 810 KK = II, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 810 CONTINUE IOFFA = IOFFA + LDA 820 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 830 CONTINUE * END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, WORK, MAX( 1, MP ), $ MYROW, 0 ) * * Find maximum sum of rows for Infinity-norm * IF( MYCOL.EQ.0 ) THEN IF( MP.GT.0 ) THEN VALUE = WORK( ISAMAX( MP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, KK, $ LL, -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * IF( UDIAG ) THEN SCALE = ONE SUM = REAL( MIN( M, N ) ) / REAL( NPROW*NPCOL ) ELSE SCALE = ZERO SUM = ONE END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 840 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II+LL-JJ, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 840 CONTINUE ELSE DO 850 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II+LL-JJ+1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 850 CONTINUE END IF ELSE DO 860 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II-1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 860 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 900 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 870 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II+LL-JJ+1, IIA+MP-1 )- $ IIA+1, A( IIA+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 870 CONTINUE ELSE DO 880 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II+LL-JJ, IIA+MP-1 )- $ IIA+1, A( IIA+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 880 CONTINUE END IF ELSE DO 890 LL = JJ, JJ + JB -1 CALL SLASSQ( MIN( II-1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 890 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 900 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 910 LL = JJ, JJ + JB -1 CALL SLASSQ( IIA+MP-(II+LL-JJ+1), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 910 CONTINUE ELSE DO 920 LL = JJ, JJ + JB -1 CALL SLASSQ( IIA+MP-(II+LL-JJ), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 920 CONTINUE END IF ELSE DO 930 LL = JJ, JJ + JB -1 CALL SLASSQ( IIA+MP-II, A( II+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 930 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 970 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 940 LL = JJ, JJ + JB -1 CALL SLASSQ( IIA+MP-(II+LL-JJ+1), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 940 CONTINUE ELSE DO 950 LL = JJ, JJ + JB -1 CALL SLASSQ( IIA+MP-(II+LL-JJ), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 950 CONTINUE END IF ELSE DO 960 LL = JJ, JJ + JB -1 CALL SLASSQ( IIA+MP-II, A( II+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 960 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 970 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PSTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, SCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to every process in the grid. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PSLANTR = VALUE * RETURN * * End of PSLANTR * END scalapack-2.0.2/SRC/pslapiv.f000644 000766 000024 00000033641 10363532303 016172 0ustar00juliestaff000000 000000 SUBROUTINE PSLAPIV( DIREC, ROWCOL, PIVROC, M, N, A, IA, JA, $ DESCA, IPIV, IP, JP, DESCIP, IWORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER*1 DIREC, PIVROC, ROWCOL INTEGER IA, IP, JA, JP, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCIP( * ), IPIV( * ), IWORK( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLAPIV applies either P (permutation matrix indicated by IPIV) * or inv( P ) to a general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1), resulting in row or column * pivoting. The pivot vector may be distributed across a process row * or a column. The pivot vector should be aligned with the distributed * matrix A. This routine will transpose the pivot vector if necessary. * For example if the row pivots should be applied to the columns of * sub( A ), pass ROWCOL='C' and PIVROC='C'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Restrictions * ============ * * IPIV must always be a distributed vector (not a matrix). Thus: * IF( ROWPIV .EQ. 'C' ) THEN * JP must be 1 * ELSE * IP must be 1 * END IF * * The following restrictions apply when IPIV must be transposed: * IF( ROWPIV.EQ.'C' .AND. PIVROC.EQ.'C') THEN * DESCIP(MB_) must equal DESCA(NB_) * ELSE IF( ROWPIV.EQ.'R" .AND. PIVROC.EQ.'R') THEN * DESCIP(NB_) must equal DESCA(MB_) * END IF * * Arguments * ========= * * DIREC (global input) CHARACTER*1 * Specifies in which order the permutation is applied: * = 'F' (Forward) Applies pivots Forward from top of matrix. * Computes P*sub( A ). * = 'B' (Backward) Applies pivots Backward from bottom of * matrix. Computes inv( P )*sub( A ). * * ROWCOL (global input) CHARACTER*1 * Specifies if the rows or columns are to be permuted: * = 'R' Rows will be permuted, * = 'C' Columns will be permuted. * * PIVROC (global input) CHARACTER*1 * Specifies whether IPIV is distributed over a process row * or column: * = 'R' IPIV distributed over a process row * = 'C' IPIV distributed over a process column * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of * rows of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * distributed submatrix sub( A ) to which the row or column * interchanges will be applied. On exit, the local pieces * of the permuted distributed submatrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension (LIPIV) where LIPIV is * when ROWCOL='R' or 'r': * >= LOCr( IA+M-1 ) + MB_A if PIVROC='C' or 'c', * >= LOCc( M + MOD(JP-1,NB_P) ) if PIVROC='R' or 'r', and, * when ROWCOL='C' or 'c': * >= LOCr( N + MOD(IP-1,MB_P) ) if PIVROC='C' or 'c', * >= LOCc( JA+N-1 ) + NB_A if PIVROC='R' or 'r'. * This array contains the pivoting information. IPIV(i) is the * global row (column), local row (column) i was swapped with. * When ROWCOL='R' or 'r' and PIVROC='C' or 'c', or ROWCOL='C' * or 'c' and PIVROC='R' or 'r', the last piece of this array of * size MB_A (resp. NB_A) is used as workspace. In those cases, * this array is tied to the distributed matrix A. * * IP (global input) INTEGER * The row index in the global array P indicating the first * row of sub( P ). * * JP (global input) INTEGER * The column index in the global array P indicating the * first column of sub( P ). * * DESCIP (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed vector IPIV. * * IWORK (local workspace) INTEGER array, dimension (LDW) * where LDW is equal to the workspace necessary for * transposition, and the storage of the tranposed IPIV: * * Let LCM be the least common multiple of NPROW and NPCOL. * IF( ROWCOL.EQ.'R' .AND. PIVROC.EQ.'R' ) THEN * IF( NPROW.EQ.NPCOL ) THEN * LDW = LOCr( N_P + MOD(JP-1, NB_P) ) + NB_P * ELSE * LDW = LOCr( N_P + MOD(JP-1, NB_P) ) + * NB_P * CEIL( CEIL(LOCc(N_P)/NB_P) / (LCM/NPCOL) ) * END IF * ELSE IF( ROWCOL.EQ.'C' .AND. PIVROC.EQ.'C' ) THEN * IF( NPROW.EQ.NPCOL ) THEN * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + MB_P * ELSE * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + * MB_P * CEIL( CEIL(LOCr(M_P)/MB_P) / (LCM/NPROW) ) * END IF * ELSE * IWORK is not referenced. * END IF * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL ROWPVT INTEGER I, ICTXT, ICURCOL, ICURROW, IIP, ITMP, IPT, $ JJP, JPT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCPT( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, $ INFOG2L, PICOL2ROW, PIROW2COL, PSLAPV2 * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC, INDXG2P EXTERNAL LSAME, NUMROC, INDXG2P * .. * .. Intrinsic Functions .. INTRINSIC MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) ROWPVT = LSAME( ROWCOL, 'R' ) * * If we're pivoting the rows of sub( A ) * IF( ROWPVT ) THEN IF( M.LE.1 .OR. N.LT.1 ) $ RETURN * * If the pivot vector is already distributed correctly * IF( LSAME( PIVROC, 'C' ) ) THEN CALL PSLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * Otherwise, we must redistribute IPIV to match PSLAPV2 * ELSE * * Take IPIV distributed over row 0, and store it in * iwork, distributed over column 0 * IPT = MOD( JP-1, DESCA(MB_) ) DESCPT(M_) = M + IPT + NPROW*DESCA(MB_) DESCPT(N_) = 1 DESCPT(MB_) = DESCA(MB_) DESCPT(NB_) = 1 DESCPT(RSRC_) = INDXG2P( IA, DESCA(MB_), IA, DESCA(RSRC_), $ NPROW ) DESCPT(CSRC_) = MYCOL DESCPT(CTXT_) = ICTXT DESCPT(LLD_) = NUMROC( DESCPT(M_), DESCPT(MB_), MYROW, $ DESCPT(RSRC_), NPROW ) ITMP = NUMROC( DESCIP(N_), DESCIP(NB_), MYCOL, $ DESCIP(CSRC_), NPCOL ) CALL INFOG2L( IP, JP-IPT, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) CALL PIROW2COL( ICTXT, M+IPT, 1, DESCIP(NB_), IPIV(JJP), $ ITMP, IWORK, DESCPT(LLD_), 0, ICURCOL, $ DESCPT(RSRC_), $ MYCOL, IWORK(DESCPT(LLD_)-DESCPT(MB_)+1) ) * * Send column-distributed pivots to all columns * ITMP = DESCPT(LLD_) - DESCPT(MB_) IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'Row', ' ', ITMP, 1, IWORK, ITMP ) ELSE CALL IGEBR2D( ICTXT, 'Row', ' ', ITMP, 1, IWORK, ITMP, $ MYROW, 0 ) END IF * * Adjust pivots so they are relative to the start of IWORK, * not IPIV * IPT = IPT + 1 DO 10 I = 1, ITMP IWORK(I) = IWORK(I) - JP + IPT 10 CONTINUE CALL PSLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IWORK, $ IPT, 1, DESCPT ) END IF * * Otherwise, we're pivoting the columns of sub( A ) * ELSE IF( M.LT.1 .OR. N.LE.1 ) $ RETURN * * If the pivot vector is already distributed correctly * IF( LSAME( PIVROC, 'R' ) ) THEN CALL PSLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * Otherwise, we must redistribute IPIV to match PSLAPV2 * ELSE * * Take IPIV distributed over column 0, and store it in * iwork, distributed over row 0 * JPT = MOD( IP-1, DESCA(NB_) ) DESCPT(M_) = 1 DESCPT(N_) = N + JPT + NPCOL*DESCA(NB_) DESCPT(MB_) = 1 DESCPT(NB_) = DESCA(NB_) DESCPT(RSRC_) = MYROW DESCPT(CSRC_) = INDXG2P( JA, DESCA(NB_), JA, DESCA(CSRC_), $ NPCOL ) DESCPT(CTXT_) = ICTXT DESCPT(LLD_) = 1 CALL INFOG2L( IP-JPT, JP, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) ITMP = NUMROC( N+JPT, DESCPT(NB_), MYCOL, DESCPT(CSRC_), $ NPCOL ) CALL PICOL2ROW( ICTXT, N+JPT, 1, DESCIP(MB_), IPIV(IIP), $ DESCIP(LLD_), IWORK, MAX(1, ITMP), ICURROW, $ 0, 0, DESCPT(CSRC_), IWORK(ITMP+1) ) * * Send row-distributed pivots to all rows * IF( MYROW.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'Column', ' ', ITMP, 1, IWORK, $ ITMP ) ELSE CALL IGEBR2D( ICTXT, 'Column', ' ', ITMP, 1, IWORK, $ ITMP, 0, MYCOL ) END IF * * Adjust pivots so they are relative to the start of IWORK, * not IPIV * JPT = JPT + 1 DO 20 I = 1, ITMP IWORK(I) = IWORK(I) - IP + JPT 20 CONTINUE CALL PSLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IWORK, $ 1, JPT, DESCPT ) END IF END IF * RETURN * * End of PSLAPIV * END scalapack-2.0.2/SRC/pslapv2.f000644 000766 000024 00000036725 10363532303 016111 0ustar00juliestaff000000 000000 SUBROUTINE PSLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIREC, ROWCOL INTEGER IA, IP, JA, JP, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCIP( * ), IPIV( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLAPV2 applies either P (permutation matrix indicated by IPIV) * or inv( P ) to a M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1), resulting in row or column pivoting. The * pivot vector should be aligned with the distributed matrix A. For * pivoting the rows of sub( A ), IPIV should be distributed along a * process column and replicated over all process rows. Similarly, * IPIV should be distributed along a process row and replicated over * all process columns for column pivoting. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIREC (global input) CHARACTER * Specifies in which order the permutation is applied: * = 'F' (Forward) Applies pivots Forward from top of matrix. * Computes P * sub( A ); * = 'B' (Backward) Applies pivots Backward from bottom of * matrix. Computes inv( P ) * sub( A ). * * ROWCOL (global input) CHARACTER * Specifies if the rows or columns are to be permuted: * = 'R' Rows will be permuted, * = 'C' Columns will be permuted. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this local array contains the local pieces of the * distributed matrix sub( A ) to which the row or columns * interchanges will be applied. On exit, this array contains * the local pieces of the permuted distributed matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (input) INTEGER array, dimension >= LOCr(M_A)+MB_A if * ROWCOL = 'R', LOCc(N_A)+NB_A otherwise. It contains * the pivoting information. IPIV(i) is the global row (column), * local row (column) i was swapped with. The last piece of the * array of size MB_A (resp. NB_A) is used as workspace. IPIV is * tied to the distributed matrix A. * * IP (global input) INTEGER * IPIV's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JP (global input) INTEGER * IPIV's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCIP (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix IPIV. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL FORWRD, ROWPVT INTEGER I, IB, ICTXT, ICURCOL, ICURROW, IIP, IP1, ITMP, $ IPVWRK, J, JB, JJP, JP1, K, MA, MBA, MYCOL, $ MYROW, NBA, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBS2D, IGEBR2D, INFOG2L, $ PSSWAP * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * ROWPVT = LSAME( ROWCOL, 'R' ) IF( ROWPVT ) THEN IF( M.LE.1 .OR. N.LT.1 ) $ RETURN ELSE IF( M.LT.1 .OR. N.LE.1 ) $ RETURN END IF FORWRD = LSAME( DIREC, 'F' ) * * * Get grid and matrix parameters * MA = DESCA( M_ ) MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If I'm applying pivots from beginning to end (e.g., repeating * pivoting done earlier). Thus this section computes P * sub( A ). * IF( FORWRD ) THEN CALL INFOG2L( IP, JP, DESCIP, NPROW, NPCOL, MYROW, MYCOL, $ IIP, JJP, ICURROW, ICURCOL ) * * If I'm pivoting the rows of sub( A ) * IF( ROWPVT ) THEN IPVWRK = NUMROC( DESCIP( M_ ), DESCIP( MB_ ), MYROW, $ DESCIP( RSRC_ ), NPROW ) + 1 - $ DESCIP( MB_ ) * * Loop over rows of sub( A ) * I = IA IB = MIN( M, ICEIL( IA, MBA ) * MBA - IA + 1 ) 10 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process column * IF( MYROW.EQ.ICURROW ) THEN CALL IGEBS2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IIP ), IB ) ITMP = IIP IIP = IIP + IB ELSE ITMP = IPVWRK CALL IGEBR2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( ITMP ), IB, ICURROW, MYCOL ) END IF * * Pivot the block of rows * DO 20 K = I, I+IB-1 IP1 = IPIV( ITMP ) - IP + IA IF( IP1.NE.K ) $ CALL PSSWAP( N, A, K, JA, DESCA, MA, A, IP1, JA, $ DESCA, MA ) ITMP = ITMP + 1 20 CONTINUE * * Go on to next row of processes, increment row counter, * and figure number of rows to pivot next * ICURROW = MOD( ICURROW+1, NPROW ) I = I + IB IB = MIN( MBA, M-I+IA ) IF( IB .GT. 0 ) GOTO 10 * * If I am pivoting the columns of sub( A ) * ELSE IPVWRK = NUMROC( DESCIP( N_ ), DESCIP( NB_ ), MYCOL, $ DESCIP( CSRC_ ), NPCOL ) + 1 - $ DESCIP( NB_ ) * * Loop over columns of sub( A ) * J = JA JB = MIN( N, ICEIL( JA, NBA ) * NBA - JA + 1 ) 30 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process row * IF( MYCOL.EQ.ICURCOL ) THEN CALL IGEBS2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( JJP ), JB ) ITMP = JJP JJP = JJP + JB ELSE ITMP = IPVWRK CALL IGEBR2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( ITMP ), JB, MYROW, ICURCOL ) END IF * * Pivot the block of columns * DO 40 K = J, J+JB-1 JP1 = IPIV( ITMP ) - JP + JA IF( JP1.NE.K ) $ CALL PSSWAP( M, A, IA, K, DESCA, 1, A, IA, JP1, $ DESCA, 1 ) ITMP = ITMP + 1 40 CONTINUE * * Go on to next column of processes, increment column * counter, and figure number of columns to pivot next * ICURCOL = MOD( ICURCOL+1, NPCOL ) J = J + JB JB = MIN( NBA, N-J+JA ) IF( JB .GT. 0 ) GOTO 30 END IF * * If I want to apply pivots in reverse order, i.e. reversing * pivoting done earlier. Thus this section computes * inv( P ) * sub( A ). * ELSE * * If I'm pivoting the rows of sub( A ) * IF( ROWPVT ) THEN CALL INFOG2L( IP+M-1, JP, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) * IPVWRK = NUMROC( DESCIP( M_ ), DESCIP( MB_ ), MYROW, $ DESCIP( RSRC_ ), NPROW ) + 1 - $ DESCIP( MB_ ) * * If I'm not in the current process row, my IIP points out * past end of pivot vector (since I don't own a piece of the * last row). Adjust IIP so it points at last pivot entry. * IF( MYROW.NE.ICURROW ) IIP = IIP - 1 * * Loop over rows in reverse order, starting at last row * I = IA + M - 1 IB = MOD( I, MBA ) IF( IB .EQ. 0 ) IB = MBA IB = MIN( IB, M ) 50 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process column * IF( MYROW.EQ.ICURROW ) THEN ITMP = IIP IIP = IIP - IB CALL IGEBS2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IIP+1 ), IB ) ELSE CALL IGEBR2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IPVWRK ), IB, ICURROW, MYCOL ) ITMP = IPVWRK + IB - 1 END IF * * Pivot the block of rows * DO 60 K = I, I-IB+1, -1 IP1 = IPIV( ITMP ) - IP + IA IF( IP1.NE.K ) $ CALL PSSWAP( N, A, K, JA, DESCA, MA, A, IP1, JA, $ DESCA, MA ) ITMP = ITMP - 1 60 CONTINUE * * Go to previous row of processes, decrement row counter, * and figure number of rows to be pivoted next * ICURROW = MOD( NPROW+ICURROW-1, NPROW ) I = I - IB IB = MIN( MBA, I-IA+1 ) IF( IB .GT. 0 ) GOTO 50 * * Otherwise, I'm pivoting the columns of sub( A ) * ELSE CALL INFOG2L( IP, JP+N-1, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) IPVWRK = NUMROC( DESCIP( N_ ), DESCIP( NB_ ), MYCOL, $ DESCIP( CSRC_ ), NPCOL ) + 1 - $ DESCIP( NB_ ) * * If I'm not in the current process column, my JJP points out * past end of pivot vector (since I don't own a piece of the * last column). Adjust JJP so it points at last pivot entry. * IF( MYCOL.NE.ICURCOL ) JJP = JJP - 1 * * Loop over columns in reverse order starting at last column * J = JA + N - 1 JB = MOD( J, NBA ) IF( JB .EQ. 0 ) JB = NBA JB = MIN( JB, N ) 70 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process row * IF( MYCOL.EQ.ICURCOL ) THEN ITMP = JJP JJP = JJP - JB CALL IGEBS2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( JJP+1 ), JB ) ELSE CALL IGEBR2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( IPVWRK ), JB, MYROW, ICURCOL ) ITMP = IPVWRK + JB - 1 END IF * * Pivot a block of columns * DO 80 K = J, J-JB+1, -1 JP1 = IPIV( ITMP ) - JP + JA IF( JP1.NE.K ) $ CALL PSSWAP( M, A, IA, K, DESCA, 1, A, IA, JP1, $ DESCA, 1 ) ITMP = ITMP - 1 80 CONTINUE * * Go to previous row of processes, decrement row counter, * and figure number of rows to be pivoted next * ICURCOL = MOD( NPCOL+ICURCOL-1, NPCOL ) J = J - JB JB = MIN( NBA, J-JA+1 ) IF( JB .GT. 0 ) GOTO 70 END IF * END IF * RETURN * * End PSLAPV2 * END scalapack-2.0.2/SRC/pslaqge.f000644 000766 000024 00000023256 10363532303 016151 0ustar00juliestaff000000 000000 SUBROUTINE PSLAQGE( M, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, EQUED ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER EQUED INTEGER IA, JA, M, N REAL AMAX, COLCND, ROWCND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), C( * ), R( * ) * .. * * Purpose * ======= * * PSLAQGE equilibrates a general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) using the row and scaling * factors in the vectors R and C. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)) * containing on entry the M-by-N matrix sub( A ). On exit, * the equilibrated distributed matrix. See EQUED for the * form of the equilibrated distributed submatrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * R (local input) REAL array, dimension LOCr(M_A) * The row scale factors for sub( A ). R is aligned with the * distributed matrix A, and replicated across every process * column. R is tied to the distributed matrix A. * * C (local input) REAL array, dimension LOCc(N_A) * The column scale factors of sub( A ). C is aligned with the * distributed matrix A, and replicated down every process * row. C is tied to the distributed matrix A. * * ROWCND (global input) REAL * The global ratio of the smallest R(i) to the largest R(i), * IA <= i <= IA+M-1. * * COLCND (global input) REAL * The global ratio of the smallest C(i) to the largest C(i), * JA <= j <= JA+N-1. * * AMAX (global input) REAL * Absolute value of largest distributed submatrix entry. * * EQUED (global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration * = 'R': Row equilibration, i.e., sub( A ) has been pre- * multiplied by diag(R(IA:IA+M-1)), * = 'C': Column equilibration, i.e., sub( A ) has been post- * multiplied by diag(C(JA:JA+N-1)), * = 'B': Both row and column equilibration, i.e., sub( A ) * has been replaced by * diag(R(IA:IA+M-1)) * sub( A ) * diag(C(JA:JA+N-1)). * * Internal Parameters * =================== * * THRESH is a threshold value used to decide if row or column scaling * should be done based on the ratio of the row or column scaling * factors. If ROWCND < THRESH, row scaling is done, and if * COLCND < THRESH, column scaling is done. * * LARGE and SMALL are threshold values used to decide if row scaling * should be done based on the absolute size of the largest matrix * element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, THRESH PARAMETER ( ONE = 1.0E+0, THRESH = 0.1E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IIA, IOFFA, $ IROFF, J, JJA, LDA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ REAL CJ, LARGE, SMALL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC REAL PSLAMCH EXTERNAL NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) THEN EQUED = 'N' RETURN END IF * * Get grid parameters and compute local indexes * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * * Initialize LARGE and SMALL. * SMALL = PSLAMCH( ICTXT, 'Safe minimum' ) / $ PSLAMCH( ICTXT, 'Precision' ) LARGE = ONE / SMALL * IF( ROWCND.GE.THRESH .AND. AMAX.GE.SMALL .AND. AMAX.LE.LARGE ) $ THEN * * No row scaling * IF( COLCND.GE.THRESH ) THEN * * No column scaling * EQUED = 'N' * ELSE * * Column scaling * IOFFA = (JJA-1)*LDA DO 20 J = JJA, JJA+NQ-1 CJ = C( J ) DO 10 I = IIA, IIA+MP-1 A( IOFFA + I ) = CJ*A( IOFFA + I ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE EQUED = 'C' END IF * ELSE IF( COLCND.GE.THRESH ) THEN * * Row scaling, no column scaling * IOFFA = (JJA-1)*LDA DO 40 J = JJA, JJA+NQ-1 DO 30 I = IIA, IIA+MP-1 A( IOFFA + I ) = R( I )*A( IOFFA + I ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE EQUED = 'R' * ELSE * * Row and column scaling * IOFFA = (JJA-1)*LDA DO 60 J = JJA, JJA+NQ-1 CJ = C( J ) DO 50 I = IIA, IIA+MP-1 A( IOFFA + I ) = CJ*R( I )*A( IOFFA + I ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE EQUED = 'B' * END IF * RETURN * * End of PSLAQGE * END scalapack-2.0.2/SRC/pslaqr0.f000644 000766 000024 00000107740 11705175572 016114 0ustar00juliestaff000000 000000 RECURSIVE SUBROUTINE PSLAQR0( WANTT, WANTZ, N, ILO, IHI, H, $ DESCH, WR, WI, ILOZ, IHIZ, Z, DESCZ, WORK, LWORK, $ IWORK, LIWORK, INFO, RECLEVEL ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IHI, IHIZ, ILO, ILOZ, INFO, LIWORK, LWORK, N, $ RECLEVEL LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. INTEGER DESCH( * ), DESCZ( * ), IWORK( * ) REAL H( * ), WI( N ), WORK( * ), WR( N ), $ Z( * ) * .. * * Purpose * ======= * * PSLAQR0 computes the eigenvalues of a Hessenberg matrix H * and, optionally, the matrices T and Z from the Schur decomposition * H = Z*T*Z**T, where T is an upper quasi-triangular matrix (the * Schur form), and Z is the orthogonal matrix of Schur vectors. * * Optionally Z may be postmultiplied into an input orthogonal * matrix Q so that this routine can give the Schur factorization * of a matrix A which has been reduced to the Hessenberg form H * by the orthogonal matrix Q: * A = Q * H * Q**T = (QZ) * T * (QZ)**T. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix H (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that H is already upper triangular in rows * and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally * set by a previous call to PSGEBAL, and then passed to PSGEHRD * when the matrix output by PSGEBAL is reduced to Hessenberg * form. Otherwise ILO and IHI should be set to 1 and N * respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. * If N = 0, then ILO = 1 and IHI = 0. * * H (global input/output) REAL array, dimension * (DESCH(LLD_),*) * On entry, the upper Hessenberg matrix H. * On exit, if JOB = 'S', H is upper quasi-triangular in * rows and columns ILO:IHI, with 1-by-1 and 2-by-2 blocks on * the main diagonal. The 2-by-2 diagonal blocks (corresponding * to complex conjugate pairs of eigenvalues) are returned in * standard form, with H(i,i) = H(i+1,i+1) and * H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the * contents of H are unspecified on exit. * * DESCH (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix H. * * WR (global output) REAL array, dimension (N) * WI (global output) REAL array, dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If JOB = 'S', the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in H. * * Z (global input/output) REAL array. * If COMPZ = 'V', on entry Z must contain the current * matrix Z of accumulated transformations from, e.g., PSGEHRD, * and on exit Z has been updated; transformations are applied * only to the submatrix Z(ILO:IHI,ILO:IHI). * If COMPZ = 'N', Z is not referenced. * If COMPZ = 'I', on entry Z need not be set and on exit, * if INFO = 0, Z contains the orthogonal matrix Z of the Schur * vectors of H. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace) REAL array, dimension(DWORK) * * LWORK (local input) INTEGER * The length of the workspace array WORK. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the workspace array IWORK. * * INFO (output) INTEGER * = 0: successful exit * .LT. 0: if INFO = -i, the i-th argument had an illegal * value * .GT. 0: if INFO = i, PSLAQR0 failed to compute all of * the eigenvalues. Elements 1:ilo-1 and i+1:n of WR * and WI contain those eigenvalues which have been * successfully computed. (Failures are rare.) * * If INFO .GT. 0 and JOB = 'E', then on exit, the * remaining unconverged eigenvalues are the eigen- * values of the upper Hessenberg matrix rows and * columns ILO through INFO of the final, output * value of H. * * If INFO .GT. 0 and JOB = 'S', then on exit * * (*) (initial value of H)*U = U*(final value of H) * * where U is an orthogonal matrix. The final * value of H is upper Hessenberg and quasi-triangular * in rows and columns INFO+1 through IHI. * * If INFO .GT. 0 and COMPZ = 'V', then on exit * * (final value of Z) = (initial value of Z)*U * * where U is the orthogonal matrix in (*) (regard- * less of the value of JOB.) * * If INFO .GT. 0 and COMPZ = 'I', then on exit * (final value of Z) = U * where U is the orthogonal matrix in (*) (regard- * less of the value of JOB.) * * If INFO .GT. 0 and COMPZ = 'N', then Z is not * accessed. * * ================================================================ * Based on contributions by * Robert Granat, Department of Computing Science and HPC2N, * Umea University, Sweden. * ================================================================ * * Restrictions: The block size in H and Z must be square and larger * than or equal to six (6) due to restrictions in PSLAQR1, PSLAQR5 * and SLAQR6. Moreover, H and Z need to be distributed identically * with the same context. * * ================================================================ * References: * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part I: Maintaining Well Focused * Shifts, and Level 3 Performance. * SIAM J. Matrix Anal. Appl., 23(4):929--947, 2002. * * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part II: Aggressive Early * Deflation. * SIAM J. Matrix Anal. Appl., 23(4):948--973, 2002. * * R. Granat, B. Kagstrom, and D. Kressner, * A Novel Parallel QR Algorithm for Hybrid Distributed Momory HPC * Systems. * SIAM J. Sci. Comput., 32(4):2345--2378, 2010. * * ================================================================ * * .. Parameters .. * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by increasing the size of the * . deflation window after KEXNW iterations. ===== * * ==== Exceptional shifts: try to cure rare slow convergence * . with ad-hoc exceptional shifts every KEXSH iterations. * . The constants WILK1 and WILK2 are used to form the * . exceptional shifts. ==== * INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ INTEGER RECMAX PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, RECMAX = 3 ) INTEGER NTINY PARAMETER ( NTINY = 11 ) INTEGER KEXNW, KEXSH PARAMETER ( KEXNW = 5, KEXSH = 6 ) REAL WILK1, WILK2 PARAMETER ( WILK1 = 0.75E0, WILK2 = -0.4375E0 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0e0, ONE = 1.0e0 ) * .. * .. Local Scalars .. REAL AA, BB, CC, CS, DD, SN, SS, SWAP, ELEM, T0, $ ELEM1, ELEM2, ELEM3, ALPHA, SDSUM, STAMP INTEGER I, J, INF, IT, ITMAX, K, KACC22, KBOT, KDU, KS, $ KT, KTOP, KU, KV, KWH, KWTOP, KWV, LD, LS, $ LWKOPT, NDFL, NH, NHO, NIBBLE, NMIN, NS, NSMAX, $ NSR, NVE, NW, NWMAX, NWR, LLDH, LLDZ, II, JJ, $ ICTXT, NPROW, NPCOL, MYROW, MYCOL, IPV, IPT, $ IPW, IPWRK, VROWS, VCOLS, TROWS, TCOLS, WROWS, $ WCOLS, HRSRC, HCSRC, NB, IS, IE, NPROCS, KK, $ IROFFH, ICOFFH, HRSRC3, HCSRC3, NWIN, TOTIT, $ SWEEP, JW, TOTNS, LIWKOPT, NPMIN, ICTXT_NEW, $ MYROW_NEW, MYCOL_NEW LOGICAL NWINC, SORTED, LQUERY, RECURSION CHARACTER JBCMPZ*2 * .. * .. External Functions .. INTEGER PILAENVX, NUMROC, INDXG2P, ICEIL, BLACS_PNUM EXTERNAL PILAENVX, NUMROC, INDXG2P, ICEIL, BLACS_PNUM * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCT( DLEN_ ), DESCW( DLEN_ ), $ PMAP( 64*64 ) REAL ZDUM( 1, 1 ) * .. * .. External Subroutines .. EXTERNAL PSLACPY, PSLAQR1, SLANV2, PSLAQR3, PSLAQR5, $ PSELGET, SLAQR0, SLASET, PSGEMR2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, FLOAT, INT, MAX, MIN, MOD * .. * .. Executable Statements .. INFO = 0 ICTXT = DESCH( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL RECURSION = RECLEVEL .LT. RECMAX * * Quick return for N = 0: nothing to do. * IF( N.EQ.0 ) THEN WORK( 1 ) = ONE IWORK( 1 ) = 1 RETURN END IF * * Set up job flags for PILAENV. * IF( WANTT ) THEN JBCMPZ( 1: 1 ) = 'S' ELSE JBCMPZ( 1: 1 ) = 'E' END IF IF( WANTZ ) THEN JBCMPZ( 2: 2 ) = 'V' ELSE JBCMPZ( 2: 2 ) = 'N' END IF * * Check if workspace query * LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * Extract local leading dimensions and block factors of matrices * H and Z * LLDH = DESCH( LLD_ ) LLDZ = DESCZ( LLD_ ) NB = DESCH( MB_ ) * * Tiny (sub-) matrices must use PSLAQR1. (Stops recursion) * IF( N.LE.NTINY ) THEN * * Estimate optimal workspace. * CALL PSLAQR1( WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, $ ILOZ, IHIZ, Z, DESCZ, WORK, LWORK, IWORK, LIWORK, INFO ) LWKOPT = INT( WORK(1) ) LIWKOPT = IWORK(1) * * Completely local matrices uses LAPACK. (Stops recursion) * ELSEIF( N.LE.NB ) THEN IF( MYROW.EQ.DESCH(RSRC_) .AND. MYCOL.EQ.DESCH(CSRC_) ) THEN CALL SLAQR0( WANTT, WANTZ, N, ILO, IHI, H, DESCH(LLD_), $ WR, WI, ILOZ, IHIZ, Z, DESCZ(LLD_), WORK, LWORK, INFO ) IF( N.GT.2 ) $ CALL SLASET( 'L', N-2, N-2, ZERO, ZERO, H(3), $ DESCH(LLD_) ) LWKOPT = INT( WORK(1) ) LIWKOPT = 1 ELSE LWKOPT = 1 LIWKOPT = 1 END IF * * Do one more step of recursion * ELSE * * Zero out iteration and sweep counters for debugging purposes * TOTIT = 0 SWEEP = 0 TOTNS = 0 * * Use small bulge multi-shift QR with aggressive early * deflation on larger-than-tiny matrices. * * Hope for the best. * INFO = 0 * * NWR = recommended deflation window size. At this * point, N .GT. NTINY = 11, so there is enough * subdiagonal workspace for NWR.GE.2 as required. * (In fact, there is enough subdiagonal space for * NWR.GE.3.) * NWR = PILAENVX( ICTXT, 13, 'PSLAQR0', JBCMPZ, N, ILO, IHI, $ LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, NWR ) NW = NWR * * NSR = recommended number of simultaneous shifts. * At this point N .GT. NTINY = 11, so there is at * enough subdiagonal workspace for NSR to be even * and greater than or equal to two as required. * NWIN = PILAENVX( ICTXT, 19, 'PSLAQR0', JBCMPZ, N, NB, NB, NB ) NSR = PILAENVX( ICTXT, 15, 'PSLAQR0', JBCMPZ, N, ILO, IHI, $ MAX(NWIN,NB) ) NSR = MIN( NSR, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * Estimate optimal workspace * LWKOPT = 3*ICEIL(NWR,NPROW)*ICEIL(NWR,NPCOL) * * Workspace query call to PSLAQR3 * CALL PSLAQR3( WANTT, WANTZ, N, ILO, IHI, NWR+1, H, $ DESCH, ILOZ, IHIZ, Z, DESCZ, LS, LD, WR, WI, H, $ DESCH, N, H, DESCH, N, H, DESCH, WORK, -1, IWORK, $ LIWORK, RECLEVEL ) LWKOPT = LWKOPT + INT( WORK( 1 ) ) LIWKOPT = IWORK( 1 ) * * Workspace query call to PSLAQR5 * CALL PSLAQR5( WANTT, WANTZ, 2, N, 1, N, N, WR, WI, H, $ DESCH, ILOZ, IHIZ, Z, DESCZ, WORK, -1, IWORK, $ LIWORK ) * * Optimal workspace = MAX(PSLAQR3, PSLAQR5) * LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) LIWKOPT = MAX( LIWKOPT, IWORK( 1 ) ) * * Quick return in case of workspace query. * IF( LQUERY ) THEN WORK( 1 ) = FLOAT( LWKOPT ) IWORK( 1 ) = LIWKOPT RETURN END IF * * PSLAQR1/PSLAQR0 crossover point. * NMIN = PILAENVX( ICTXT, 12, 'PSLAQR0', JBCMPZ, N, ILO, IHI, $ LWORK ) NMIN = MAX( NTINY, NMIN ) * * Nibble crossover point. * NIBBLE = PILAENVX( ICTXT, 14, 'PSLAQR0', JBCMPZ, N, ILO, IHI, $ LWORK ) NIBBLE = MAX( 0, NIBBLE ) * * Accumulate reflections during ttswp? Use block * 2-by-2 structure during matrix-matrix multiply? * KACC22 = PILAENVX( ICTXT, 16, 'PSLAQR0', JBCMPZ, N, ILO, IHI, $ LWORK ) KACC22 = MAX( 1, KACC22 ) KACC22 = MIN( 2, KACC22 ) * * NWMAX = the largest possible deflation window for * which there is sufficient workspace. * NWMAX = MIN( ( N-1 ) / 3, LWORK / 2 ) * * NSMAX = the Largest number of simultaneous shifts * for which there is sufficient workspace. * NSMAX = MIN( ( N+6 ) / 9, LWORK - LWORK/3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * NDFL: an iteration count restarted at deflation. * NDFL = 1 * * ITMAX = iteration limit * ITMAX = MAX( 30, 2*KEXSH )*MAX( 10, ( IHI-ILO+1 ) ) * * Last row and column in the active block. * KBOT = IHI * * Main Loop. * DO 110 IT = 1, ITMAX TOTIT = TOTIT + 1 * * Done when KBOT falls below ILO. * IF( KBOT.LT.ILO ) $ GO TO 120 * * Locate active block. * DO 10 K = KBOT, ILO + 1, -1 CALL INFOG2L( K, K-1, DESCH, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, HRSRC, HCSRC ) IF( MYROW.EQ.HRSRC .AND. MYCOL.EQ.HCSRC ) THEN IF( H( II + (JJ-1)*LLDH ).EQ.ZERO ) $ GO TO 20 END IF 10 CONTINUE K = ILO 20 CONTINUE KTOP = K IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', '1-Tree', 1, 1, KTOP, 1, $ -1, -1, -1, -1, -1 ) * * Select deflation window size. * NH = KBOT - KTOP + 1 IF( NH.LE.NTINY ) THEN NW = NH ELSEIF( NDFL.LT.KEXNW .OR. NH.LT.NW ) THEN * * Typical deflation window. If possible and * advisable, nibble the entire active block. * If not, use size NWR or NWR+1 depending upon * which has the smaller corresponding subdiagonal * entry (a heuristic). * NWINC = .TRUE. IF( NH.LE.MIN( NMIN, NWMAX ) ) THEN NW = NH ELSE NW = MIN( NWR, NH, NWMAX ) IF( NW.LT.NWMAX ) THEN IF( NW.GE.NH-1 ) THEN NW = NH ELSE KWTOP = KBOT - NW + 1 CALL PSELGET( 'All', '1-Tree', ELEM1, H, KWTOP, $ KWTOP-1, DESCH ) CALL PSELGET( 'All', '1-Tree', ELEM2, H, $ KWTOP-1, KWTOP-2, DESCH ) IF( ABS( ELEM1 ).GT.ABS( ELEM2 ) ) NW = NW + 1 END IF END IF END IF ELSE * * Exceptional deflation window. If there have * been no deflations in KEXNW or more iterations, * then vary the deflation window size. At first, * because, larger windows are, in general, more * powerful than smaller ones, rapidly increase the * window up to the maximum reasonable and possible. * Then maybe try a slightly smaller window. * IF( NWINC .AND. NW.LT.MIN( NWMAX, NH ) ) THEN NW = MIN( NWMAX, NH, 2*NW ) ELSE NWINC = .FALSE. IF( NW.EQ.NH .AND. NH.GT.2 ) $ NW = NH - 1 END IF END IF * * Aggressive early deflation: * split workspace into * - an NW-by-NW work array V for orthogonal matrix * - an NW-by-at-least-NW-but-more-is-better * (NW-by-NHO) horizontal work array for Schur factor * - an at-least-NW-but-more-is-better (NVE-by-NW) * vertical work array for matrix multiplications * - align T, V and W with the deflation window * KV = N - NW + 1 KT = NW + 1 NHO = ( N-NW-1 ) - KT + 1 KWV = NW + 2 NVE = ( N-NW ) - KWV + 1 * JW = MIN( NW, KBOT-KTOP+1 ) KWTOP = KBOT - JW + 1 IROFFH = MOD( KWTOP - 1, NB ) ICOFFH = IROFFH HRSRC = INDXG2P( KWTOP, NB, MYROW, DESCH(RSRC_), NPROW ) HCSRC = INDXG2P( KWTOP, NB, MYCOL, DESCH(CSRC_), NPCOL ) VROWS = NUMROC( JW+IROFFH, NB, MYROW, HRSRC, NPROW ) VCOLS = NUMROC( JW+ICOFFH, NB, MYCOL, HCSRC, NPCOL ) CALL DESCINIT( DESCV, JW+IROFFH, JW+ICOFFH, NB, NB, $ HRSRC, HCSRC, ICTXT, MAX(1, VROWS), INFO ) * TROWS = NUMROC( JW+IROFFH, NB, MYROW, HRSRC, NPROW ) TCOLS = NUMROC( JW+ICOFFH, NB, MYCOL, HCSRC, NPCOL ) CALL DESCINIT( DESCT, JW+IROFFH, JW+ICOFFH, NB, NB, $ HRSRC, HCSRC, ICTXT, MAX(1, TROWS), INFO ) WROWS = NUMROC( JW+IROFFH, NB, MYROW, HRSRC, NPROW ) WCOLS = NUMROC( JW+ICOFFH, NB, MYCOL, HCSRC, NPCOL ) CALL DESCINIT( DESCW, JW+IROFFH, JW+ICOFFH, NB, NB, $ HRSRC, HCSRC, ICTXT, MAX(1, WROWS), INFO ) * IPV = 1 IPT = IPV + DESCV( LLD_ ) * VCOLS IPW = IPT + DESCT( LLD_ ) * TCOLS IPWRK = IPW + DESCW( LLD_ ) * WCOLS * * Aggressive early deflation * IWORK(1) = IT CALL PSLAQR3( WANTT, WANTZ, N, KTOP, KBOT, NW, H, $ DESCH, ILOZ, IHIZ, Z, DESCZ, LS, LD, WR, WI, $ WORK(IPV), DESCV, NHO, WORK(IPT), DESCT, NVE, $ WORK(IPW), DESCW, WORK(IPWRK), LWORK-IPWRK+1, $ IWORK, LIWORK, RECLEVEL ) * * Adjust KBOT accounting for new deflations. * KBOT = KBOT - LD * * KS points to the shifts. * KS = KBOT - LS + 1 * * Skip an expensive QR sweep if there is a (partly * heuristic) reason to expect that many eigenvalues * will deflate without it. Here, the QR sweep is * skipped if many eigenvalues have just been deflated * or if the remaining active block is small. * IF( ( LD.EQ.0 ) .OR. ( ( 100*LD.LE.NW*NIBBLE ) .AND. ( KBOT- $ KTOP+1.GT.MIN( NMIN, NWMAX ) ) ) ) THEN * * NS = nominal number of simultaneous shifts. * This may be lowered (slightly) if PSLAQR3 * did not provide that many shifts. * NS = MIN( NSMAX, NSR, MAX( 2, KBOT-KTOP ) ) NS = NS - MOD( NS, 2 ) * * If there have been no deflations * in a multiple of KEXSH iterations, * then try exceptional shifts. * Otherwise use shifts provided by * PSLAQR3 above or from the eigenvalues * of a trailing principal submatrix. * IF( MOD( NDFL, KEXSH ).EQ.0 ) THEN KS = KBOT - NS + 1 DO 30 I = KBOT, MAX( KS+1, KTOP+2 ), -2 CALL PSELGET( 'All', '1-Tree', ELEM1, H, I, I-1, $ DESCH ) CALL PSELGET( 'All', '1-Tree', ELEM2, H, I-1, I-2, $ DESCH ) CALL PSELGET( 'All', '1-Tree', ELEM3, H, I, I, $ DESCH ) SS = ABS( ELEM1 ) + ABS( ELEM2 ) AA = WILK1*SS + ELEM3 BB = SS CC = WILK2*SS DD = AA CALL SLANV2( AA, BB, CC, DD, WR( I-1 ), WI( I-1 ), $ WR( I ), WI( I ), CS, SN ) 30 CONTINUE IF( KS.EQ.KTOP ) THEN CALL PSELGET( 'All', '1-Tree', ELEM1, H, KS+1, $ KS+1, DESCH ) WR( KS+1 ) = ELEM1 WI( KS+1 ) = ZERO WR( KS ) = WR( KS+1 ) WI( KS ) = WI( KS+1 ) END IF ELSE * * Got NS/2 or fewer shifts? Use PSLAQR0 or * PSLAQR1 on a trailing principal submatrix to * get more. * IF( KBOT-KS+1.LE.NS / 2 ) THEN KS = KBOT - NS + 1 KT = N - NS + 1 NPMIN = PILAENVX( ICTXT, 23, 'PSLAQR0', 'EN', NS, $ NB, NPROW, NPCOL ) c c Temporarily force NPMIN <= 8 since only PSLAQR1 is used. c NPMIN = MIN(NPMIN, 8) IF( MIN(NPROW, NPCOL).LE.NPMIN+1 .OR. $ RECLEVEL.GE.1 ) THEN * * The window is large enough. Compute the Schur * decomposition with all processors. * IROFFH = MOD( KS - 1, NB ) ICOFFH = IROFFH IF( NS.GT.NMIN ) THEN HRSRC = INDXG2P( KS, NB, MYROW, DESCH(RSRC_), $ NPROW ) HCSRC = INDXG2P( KS, NB, MYROW, DESCH(CSRC_), $ NPCOL ) ELSE HRSRC = 0 HCSRC = 0 END IF TROWS = NUMROC( NS+IROFFH, NB, MYROW, HRSRC, $ NPROW ) TCOLS = NUMROC( NS+ICOFFH, NB, MYCOL, HCSRC, $ NPCOL ) CALL DESCINIT( DESCT, NS+IROFFH, NS+ICOFFH, NB, $ NB, HRSRC, HCSRC, ICTXT, MAX(1, TROWS), $ INFO ) IPT = 1 IPWRK = IPT + DESCT(LLD_) * TCOLS * IF( NS.GT.NMIN .AND. RECURSION ) THEN CALL PSLACPY( 'All', NS, NS, H, KS, KS, $ DESCH, WORK(IPT), 1+IROFFH, 1+ICOFFH, $ DESCT ) CALL PSLAQR0( .FALSE., .FALSE., IROFFH+NS, $ 1+IROFFH, IROFFH+NS, WORK(IPT), $ DESCT, WR( KS-IROFFH ), $ WI( KS-IROFFH ), 1, 1, ZDUM, $ DESCZ, WORK( IPWRK ), $ LWORK-IPWRK+1, IWORK, LIWORK, $ INF, RECLEVEL+1 ) ELSE CALL PSLAMVE( 'All', NS, NS, H, KS, KS, $ DESCH, WORK(IPT), 1+IROFFH, 1+ICOFFH, $ DESCT, WORK(IPWRK) ) CALL PSLAQR1( .FALSE., .FALSE., IROFFH+NS, $ 1+IROFFH, IROFFH+NS, WORK(IPT), $ DESCT, WR( KS-IROFFH ), $ WI( KS-IROFFH ), 1+IROFFH, IROFFH+NS, $ ZDUM, DESCZ, WORK( IPWRK ), $ LWORK-IPWRK+1, IWORK, LIWORK, INF ) END IF ELSE * * The window is too small. Redistribute the AED * window to a subgrid and do the computation on * the subgrid. * ICTXT_NEW = ICTXT DO 50 I = 0, NPMIN-1 DO 40 J = 0, NPMIN-1 PMAP( J+1+I*NPMIN ) = $ BLACS_PNUM( ICTXT, I, J ) 40 CONTINUE 50 CONTINUE CALL BLACS_GRIDMAP( ICTXT_NEW, PMAP, NPMIN, $ NPMIN, NPMIN ) CALL BLACS_GRIDINFO( ICTXT_NEW, NPMIN, NPMIN, $ MYROW_NEW, MYCOL_NEW ) IF( MYROW.GE.NPMIN .OR. MYCOL.GE.NPMIN ) $ ICTXT_NEW = -1 * IF( ICTXT_NEW.GE.0 ) THEN TROWS = NUMROC( NS, NB, MYROW_NEW, 0, NPMIN ) TCOLS = NUMROC( NS, NB, MYCOL_NEW, 0, NPMIN ) CALL DESCINIT( DESCT, NS, NS, NB, NB, 0, 0, $ ICTXT_NEW, MAX(1,TROWS), INFO ) IPT = 1 IPWRK = IPT + DESCT(LLD_) * TCOLS ELSE IPT = 1 IPWRK = 2 DESCT( CTXT_ ) = -1 INF = 0 END IF CALL PSGEMR2D( NS, NS, H, KS, KS, DESCH, $ WORK(IPT), 1, 1, DESCT, ICTXT ) * c c This part is still not perfect. c Either PSLAQR0 or PSLAQR1 can work, but not both. c c NMIN = PILAENVX( ICTXT_NEW, 12, 'PSLAQR0', c $ 'EN', NS, 1, NS, LWORK ) IF( ICTXT_NEW.GE.0 ) THEN c IF( NS.GT.NMIN .AND. RECLEVEL.LT.1 ) THEN c CALL PSLAQR0( .FALSE., .FALSE., NS, 1, c $ NS, WORK(IPT), DESCT, WR( KS ), c $ WI( KS ), 1, 1, ZDUM, DESCT, c $ WORK( IPWRK ), LWORK-IPWRK+1, IWORK, c $ LIWORK, INF, RECLEVEL+1 ) c ELSE CALL PSLAQR1( .FALSE., .FALSE., NS, 1, $ NS, WORK(IPT), DESCT, WR( KS ), $ WI( KS ), 1, NS, ZDUM, DESCT, $ WORK( IPWRK ), LWORK-IPWRK+1, IWORK, $ LIWORK, INF ) c END IF CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF IF( MYROW+MYCOL.GT.0 ) THEN DO 60 J = 0, NS-1 WR( KS+J ) = ZERO WI( KS+J ) = ZERO 60 CONTINUE END IF CALL IGAMN2D( ICTXT, 'All', '1-Tree', 1, 1, INF, $ 1, -1, -1, -1, -1, -1 ) CALL SGSUM2D( ICTXT, 'All', ' ', NS, 1, WR(KS), $ NS, -1, -1 ) CALL SGSUM2D( ICTXT, 'All', ' ', NS, 1, WI(KS), $ NS, -1, -1 ) END IF KS = KS + INF * * In case of a rare QR failure use * eigenvalues of the trailing 2-by-2 * principal submatrix. * IF( KS.GE.KBOT ) THEN CALL PSELGET( 'All', '1-Tree', AA, H, KBOT-1, $ KBOT-1, DESCH ) CALL PSELGET( 'All', '1-Tree', CC, H, KBOT, $ KBOT-1, DESCH ) CALL PSELGET( 'All', '1-Tree', BB, H, KBOT-1, $ KBOT, DESCH ) CALL PSELGET( 'All', '1-Tree', DD, H, KBOT, $ KBOT, DESCH ) CALL SLANV2( AA, BB, CC, DD, WR( KBOT-1 ), $ WI( KBOT-1 ), WR( KBOT ), $ WI( KBOT ), CS, SN ) KS = KBOT - 1 END IF END IF * IF( KBOT-KS+1.GT.NS ) THEN * * Sort the shifts (helps a little) * Bubble sort keeps complex conjugate * pairs together. * SORTED = .FALSE. DO 80 K = KBOT, KS + 1, -1 IF( SORTED ) $ GO TO 90 SORTED = .TRUE. DO 70 I = KS, K - 1 IF( ABS( WR( I ) )+ABS( WI( I ) ).LT. $ ABS( WR( I+1 ) )+ABS( WI( I+1 ) ) ) THEN SORTED = .FALSE. * SWAP = WR( I ) WR( I ) = WR( I+1 ) WR( I+1 ) = SWAP * SWAP = WI( I ) WI( I ) = WI( I+1 ) WI( I+1 ) = SWAP END IF 70 CONTINUE 80 CONTINUE 90 CONTINUE END IF * * Shuffle shifts into pairs of real shifts * and pairs of complex conjugate shifts * assuming complex conjugate shifts are * already adjacent to one another. (Yes, * they are.) * DO 100 I = KBOT, KS + 2, -2 IF( WI( I ).NE.-WI( I-1 ) ) THEN * SWAP = WR( I ) WR( I ) = WR( I-1 ) WR( I-1 ) = WR( I-2 ) WR( I-2 ) = SWAP * SWAP = WI( I ) WI( I ) = WI( I-1 ) WI( I-1 ) = WI( I-2 ) WI( I-2 ) = SWAP END IF 100 CONTINUE END IF * * If there are only two shifts and both are * real, then use only one. * IF( KBOT-KS+1.EQ.2 ) THEN IF( WI( KBOT ).EQ.ZERO ) THEN CALL PSELGET( 'All', '1-Tree', ELEM, H, KBOT, $ KBOT, DESCH ) IF( ABS( WR( KBOT )-ELEM ).LT. $ ABS( WR( KBOT-1 )-ELEM ) ) THEN WR( KBOT-1 ) = WR( KBOT ) ELSE WR( KBOT ) = WR( KBOT-1 ) END IF END IF END IF * * Use up to NS of the the smallest magnatiude * shifts. If there aren't NS shifts available, * then use them all, possibly dropping one to * make the number of shifts even. * NS = MIN( NS, KBOT-KS+1 ) NS = NS - MOD( NS, 2 ) KS = KBOT - NS + 1 * * Small-bulge multi-shift QR sweep. * TOTNS = TOTNS + NS SWEEP = SWEEP + 1 CALL PSLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, $ NS, WR( KS ), WI( KS ), H, DESCH, ILOZ, IHIZ, Z, $ DESCZ, WORK, LWORK, IWORK, LIWORK ) END IF * * Note progress (or the lack of it). * IF( LD.GT.0 ) THEN NDFL = 1 ELSE NDFL = NDFL + 1 END IF * * End of main loop. 110 CONTINUE * * Iteration limit exceeded. Set INFO to show where * the problem occurred and exit. * INFO = KBOT 120 CONTINUE END IF * * Return the optimal value of LWORK. * WORK( 1 ) = FLOAT( LWKOPT ) IWORK( 1 ) = LIWKOPT IF( .NOT. LQUERY ) THEN IWORK( 1 ) = TOTIT IWORK( 2 ) = SWEEP IWORK( 3 ) = TOTNS END IF RETURN * * End of PSLAQR0 * END scalapack-2.0.2/SRC/pslaqr1.f000644 000766 000024 00000300145 11705175572 016107 0ustar00juliestaff000000 000000 RECURSIVE SUBROUTINE PSLAQR1( WANTT, WANTZ, N, ILO, IHI, A, $ DESCA, WR, WI, ILOZ, IHIZ, Z, $ DESCZ, WORK, LWORK, IWORK, $ ILWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, ILWORK, INFO, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) REAL A( * ), WI( * ), WORK( * ), WR( * ), Z( * ) * .. * * Purpose * ======= * * PSLAQR1 is an auxiliary routine used to find the Schur decomposition * and or eigenvalues of a matrix already in Hessenberg form from * cols ILO to IHI. * * This is a modified version of PSLAHQR from ScaLAPACK version 1.7.3. * The following modifications were made: * o Recently removed workspace query functionality was added. * o Aggressive early deflation is implemented. * o Aggressive deflation (looking for two consecutive small * subdiagonal elements by PSLACONSB) is abandoned. * o The returned Schur form is now in canonical form, i.e., the * returned 2-by-2 blocks really correspond to complex conjugate * pairs of eigenvalues. * o For some reason, the original version of PSLAHQR sometimes did * not read out the converged eigenvalues correclty. This is now * fixed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix A (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that A is already upper quasi-triangular in * rows and columns IHI+1:N, and that A(ILO,ILO-1) = 0 (unless * ILO = 1). PSLAQR1 works primarily with the Hessenberg * submatrix in rows and columns ILO to IHI, but applies * transformations to all of H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * A (global input/output) REAL array, dimension * (DESCA(LLD_),*) * On entry, the upper Hessenberg matrix A. * On exit, if WANTT is .TRUE., A is upper quasi-triangular in * rows and columns ILO:IHI, with any 2-by-2 or larger diagonal * blocks not yet in standard form. If WANTT is .FALSE., the * contents of A are unspecified on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WR (global replicated output) REAL array, * dimension (N) * WI (global replicated output) REAL array, * dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in A. A may be returned with * larger diagonal blocks until the next release. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (global input/output) REAL array. * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations accumulated by PSHSEQR, and on * exit Z has been updated; transformations are applied only to * the submatrix Z(ILOZ:IHIZ,ILO:IHI). * If WANTZ is .FALSE., Z is not referenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local output) REAL array of size LWORK * * LWORK (local input) INTEGER * WORK(LWORK) is a local array and LWORK is assumed big enough * so that LWORK >= 6*N + 6*385*385 + * MAX( 2*MAX(DESCZ(LLD_),DESCA(LLD_)) + 2*LOCc(N), * 7*Ceil(N/HBL)/LCM(NPROW,NPCOL)) ) * * IWORK (global and local input) INTEGER array of size ILWORK * * ILWORK (local input) INTEGER * This holds the some of the IBLK integer arrays. This is held * as a place holder for the next release. * * INFO (global output) INTEGER * < 0: parameter number -INFO incorrect or inconsistent * = 0: successful exit * > 0: PSLAQR1 failed to compute all the eigenvalues ILO to IHI * in a total of 30*(IHI-ILO+1) iterations; if INFO = i, * elements i+1:ihi of WR and WI contain those eigenvalues * which have been successfully computed. * * Logic: * This algorithm is very similar to _LAHQR. Unlike _LAHQR, * instead of sending one double shift through the largest * unreduced submatrix, this algorithm sends multiple double shifts * and spaces them apart so that there can be parallelism across * several processor row/columns. Another critical difference is * that this algorithm aggregrates multiple transforms together in * order to apply them in a block fashion. * * Important Local Variables: * IBLK = The maximum number of bulges that can be computed. * Currently fixed. Future releases this won't be fixed. * HBL = The square block size (HBL=DESCA(MB_)=DESCA(NB_)) * ROTN = The number of transforms to block together * NBULGE = The number of bulges that will be attempted on the * current submatrix. * IBULGE = The current number of bulges started. * K1(*),K2(*) = The current bulge loops from K1(*) to K2(*). * * Subroutines: * This routine calls: * PSLAWIL -> Given the shift, get the transformation * SLASORTE -> Pair up eigenvalues so that reals are paired. * PSLACP3 -> Parallel array to local replicated array copy & * back. * SLAREF -> Row/column reflector applier. Core routine here. * PSLASMSUB -> Finds negligible subdiagonal elements. * * Current Notes and/or Restrictions: * 1.) This code requires the distributed block size to be square * and at least six (6); unlike simpler codes like LU, this * algorithm is extremely sensitive to block size. Unwise * choices of too small a block size can lead to bad * performance. * 2.) This code requires A and Z to be distributed identically * and have identical contxts. * 3.) This release currently does not have a routine for * resolving the Schur blocks into regular 2x2 form after * this code is completed. Because of this, a significant * performance impact is required while the deflation is done * by sometimes a single column of processors. * 4.) This code does not currently block the initial transforms * so that none of the rows or columns for any bulge are * completed until all are started. To offset pipeline * start-up it is recommended that at least 2*LCM(NPROW,NPCOL) * bulges are used (if possible) * 5.) The maximum number of bulges currently supported is fixed at * 32. In future versions this will be limited only by the * incoming WORK array. * 6.) The matrix A must be in upper Hessenberg form. If elements * below the subdiagonal are nonzero, the resulting transforms * may be nonsimilar. This is also true with the LAPACK * routine. * 7.) For this release, it is assumed RSRC_=CSRC_=0 * 8.) Currently, all the eigenvalues are distributed to all the * nodes. Future releases will probably distribute the * eigenvalues by the column partitioning. * 9.) The internals of this routine are subject to change. * * Implemented by: G. Henry, November 17, 1996 * * Modified by Robert Granat and Meiyue Shao, Department of Computing * Science and HPC2N, Umea University, Sweden * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE, HALF PARAMETER ( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) REAL CONST PARAMETER ( CONST = 1.50 ) INTEGER IBLK, LDS PARAMETER ( IBLK = 32, LDS = 12*IBLK+1 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, I, I1, I2, IAFIRST, IBULGE, $ ICBUF, ICOL, ICOL1, ICOL2, IERR, II, $ IRBUF, IROW, IROW1, IROW2, ISPEC, ISTART, $ ISTARTCOL, ISTARTROW, ISTOP, ISUB, $ ITERMAX, ITMP1, ITMP2, ITN, ITS, J, JAFIRST, $ JBLK, JJ, K, KI, L, LCMRC, LDA, LDZ, LEFT, $ LIHIH, LIHIZ, LILOH, LILOZ, LOCALI1, LOCALI2, $ LOCALK, LOCALM, M, MODKM1, MYCOL, MYROW, $ NBULGE, NH, NODE, NPCOL, NPROW, NR, NUM, NZ, $ RIGHT, ROTN, UP, VECSIDX, TOTIT, TOTNS, TOTSW, $ DBLK, NIBBLE, ND, NS, LTOP, LWKOPT, S1, S2, S3 REAL AVE, DISC, H00, H10, H11, H12, H21, H22, H33, $ H43H34, H44, OVFL, S, SMLNUM, SUM, T1, T1COPY, $ T2, T3, ULP, UNFL, V1SAVE, V2, V2SAVE, V3, $ V3SAVE, SN, CS, SWAP LOGICAL AED * .. * .. Local Arrays .. INTEGER ICURCOL( IBLK ), ICURROW( IBLK ), K1( IBLK ), $ K2( IBLK ), KCOL( IBLK ), KP2COL( IBLK ), $ KP2ROW( IBLK ), KROW( IBLK ), LOCALK2( IBLK ) REAL SMALLA( 6, 6, IBLK ), VCOPY( 3 ) * .. * .. External Functions .. INTEGER ILCM, NUMROC, ILAENV REAL PSLAMCH EXTERNAL ILCM, NUMROC, ILAENV, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SCOPY, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D, SGSUM2D, SLAHQR, SLAREF, $ SLARFG, SLASORTE, IGAMN2D, INFOG1L, INFOG2L, $ PSLABAD, PSLACP3, PSLASMSUB, $ PSLAWIL, PXERBLA, SLANV2, PSLAQR2, PSLAQR4 * .. * .. Intrinsic Functions .. INTRINSIC ABS, FLOAT, MAX, MIN, MOD, SIGN, SQRT * .. * .. Executable Statements .. * INFO = 0 * ITERMAX = 30*( IHI-ILO+1 ) IF( N.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL NUM = NPROW*NPCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) LCMRC = ILCM( NPROW, NPCOL ) TOTIT = 0 TOTNS = 0 TOTSW = 0 * * Determine the number of columns we have so we can check workspace * LOCALK = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) JJ = N / HBL IF( JJ*HBL.LT.N ) $ JJ = JJ + 1 JJ = 7*JJ / LCMRC LWKOPT = INT( 6*N+MAX( 3*MAX( LDA, LDZ )+2*LOCALK, JJ ) $ +6*LDS*LDS ) IF( LWORK.EQ.-1 .OR. ILWORK.EQ.-1 ) THEN WORK( 1 ) = FLOAT( LWKOPT ) RETURN ELSEIF( LWORK.LT.LWKOPT ) THEN INFO = -15 END IF IF( DESCZ( CTXT_ ).NE.DESCA( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) END IF IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1300+NB_ ) END IF IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1300+MB_ ) END IF IF( ( ILO.GT.N ) .OR. ( ILO.LT.1 ) ) THEN INFO = -4 END IF IF( ( IHI.GT.N ) .OR. ( IHI.LT.1 ) ) THEN INFO = -5 END IF IF( HBL.LT.5 ) THEN INFO = -( 700+MB_ ) END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PSLAQR1', -INFO ) WORK( 1 ) = FLOAT( LWKOPT ) RETURN END IF * * Set work array indices * S1 = 0 S2 = S1+LDS*LDS S3 = S2+LDS*LDS VECSIDX = S3+4*LDS*LDS ISUB = VECSIDX+3*N IRBUF = ISUB+N ICBUF = IRBUF+N * * Find a value for ROTN * ROTN = HBL / 3 ROTN = MAX( ROTN, HBL-2 ) ROTN = MIN( ROTN, 1 ) * IF( ILO.EQ.IHI ) THEN CALL INFOG2L( ILO, ILO, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN WR( ILO ) = A( ( ICOL-1 )*LDA+IROW ) ELSE WR( ILO ) = ZERO END IF WI( ILO ) = ZERO WORK( 1 ) = FLOAT( LWKOPT ) RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * * If the diagonal block is small enough, copy it to local memory and * call SLAHQR directly. * IF( NH .LE. LDS ) THEN CALL PSLAQR4( WANTT, WANTZ, N, ILO, IHI, A, DESCA, WR, WI, $ ILOZ, IHIZ, Z, DESCZ, WORK( S1+1 ), NH, $ WORK( S2+1 ), NH, WORK( S3+1 ), 4*LDS*LDS, $ INFO ) WORK( 1 ) = FLOAT( LWKOPT ) RETURN END IF * CALL INFOG1L( ILOZ, HBL, NPROW, MYROW, DESCZ(RSRC_), LILOZ, LIHIZ) LIHIZ = NUMROC( IHIZ, HBL, MYROW, DESCZ(RSRC_), NPROW ) * * Set machine-dependent constants for the stopping criterion. * If NORM(H) <= SQRT(OVFL), overflow should not occur. * UNFL = PSLAMCH( CONTXT, 'SAFE MINIMUM' ) OVFL = ONE / UNFL CALL PSLABAD( CONTXT, UNFL, OVFL ) ULP = PSLAMCH( CONTXT, 'PRECISION' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = ITERMAX * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of our schur block size (<=2*IBLK). Each * iteration of the loop works with the active submatrix in rows * and columns L to I. Eigenvalues I+1 to IHI have already * converged. Either L = ILO or the global A(L,L-1) is negligible * so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 450 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 420 ITS = 0, ITN TOTIT = TOTIT + 1 * * Look for a single small subdiagonal element. * CALL PSLASMSUB( A, DESCA, I, L, K, SMLNUM, WORK( IRBUF+1 ), $ LWORK-IRBUF ) L = K * IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * CALL INFOG2L( L, L-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN A( ( ICOL-1 )*LDA+IROW ) = ZERO END IF WORK( ISUB+L-1 ) = ZERO END IF * * Exit from loop if a small submatrix has split off. * M = L - 10 IF ( L .GT. I - LDS ) $ GO TO 430 * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * * Copy submatrix of size 2*JBLK and prepare to do generalized * Wilkinson shift or an exceptional shift * NH = I-L+1 AED = .TRUE. JBLK = MIN( IBLK, ( NH / 2 )-1 ) IF( JBLK.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * JBLK = JBLK - MOD( JBLK, LCMRC ) END IF JBLK = MIN( JBLK, 2*LCMRC ) JBLK = MAX( JBLK, 1 ) * IF( ITS.EQ.20 .OR. ITS.EQ.40 ) THEN * * Exceptional shift. * CALL PSLACP3( 2*JBLK, I-2*JBLK+1, A, DESCA, WORK( S1+1 ), $ LDS, -1, -1, 0 ) DO 20 II = 2*JBLK, 2, -1 WORK( S1+II+(II-1)*LDS ) = CONST*( $ ABS( WORK( S1+II+(II-1)*LDS ) )+ $ ABS( WORK( S1+II+(II-2)*LDS ) ) ) WORK( S1+II+(II-2)*LDS ) = ZERO WORK( S1+II-1+(II-1)*LDS ) = ZERO 20 CONTINUE WORK( S1+1 ) = CONST*ABS( WORK( S1+1 ) ) ELSE * * Aggressive early deflation. * IF( AED ) THEN DBLK = ILAENV( 13, 'DLAQR0', 'SV', N, L, I, 4*LDS*LDS ) DBLK = MAX( 2*JBLK, DBLK ) + 1 DBLK = MIN( NH, LDS, DBLK ) CALL PSLAQR2( WANTT, WANTZ, N, L, I, DBLK, A, DESCA, $ ILOZ, IHIZ, Z, DESCZ, NS, ND, WR, WI, $ WORK( S1+1 ), LDS, WORK( S2+1 ), DBLK, $ WORK( IRBUF+1 ), WORK( ICBUF+1 ), $ WORK( S3+1 ), 4*LDS*LDS ) * * Skip a QR sweep if enough eigenvalues are deflated. * NIBBLE = ILAENV( 14, 'DLAQR0', 'SV', N, L, I, 4*LDS*LDS ) NIBBLE = MAX( 0, NIBBLE ) I = I - ND DBLK = DBLK - ND IF( 100*ND .GT. NIBBLE*NH .OR. DBLK .LT. 2*JBLK ) GOTO 10 * * Use unconverged eigenvalues as shifts for the QR sweep. * (This option is turned off because of the quality of * these shifts are not so good.) * * IF( ND.GE.0 .AND. ND+DBLK.GE.64 ) THEN IF( .FALSE. ) THEN CALL SLASET( 'L', DBLK-1, DBLK-1, ZERO, ZERO, $ WORK( S1+2 ), LDS ) WORK( IRBUF+1 ) = WORK( S1+1 ) WORK( ICBUF+1 ) = ZERO * * Shuffle shifts into pairs of real shifts and pairs of * complex conjugate shifts assuming complex conjugate * shifts are already adjacent to one another. * DO 21 II = DBLK, 3, -2 IF( WORK( ICBUF+II ).NE.-WORK( ICBUF+II-1 ) ) THEN SWAP = WORK( IRBUF+II ) WORK( IRBUF+II ) = WORK( IRBUF+II-1 ) WORK( IRBUF+II-1 ) = WORK( IRBUF+II-2 ) WORK( IRBUF+II-2 ) = SWAP SWAP = WORK( ICBUF+II ) WORK( ICBUF+II ) = WORK( ICBUF+II-1 ) WORK( ICBUF+II-1 ) = WORK( ICBUF+II-2 ) WORK( ICBUF+II-2 ) = SWAP END IF 21 CONTINUE * * Copy undeflatable eigenvalues to the diagonal of S1. * II = 2 22 CONTINUE IF( WORK( ICBUF+II ) .EQ. ZERO ) THEN WORK( S1+II+(II-1)*LDS ) = WORK( IRBUF+II ) WORK( S1+II+(II-2)*LDS ) = ZERO II = II + 1 ELSE WORK( S1+II+(II-1)*LDS ) = WORK( IRBUF+II ) WORK( S1+II+1+II*LDS ) = WORK( IRBUF+II ) WORK( S1+II+1+(II-1)*LDS ) = WORK( ICBUF+II ) WORK( S1+II+II*LDS ) = -WORK( ICBUF+II ) II = II + 2 END IF IF( II .LE. DBLK ) GOTO 22 ELSE CALL SLAHQR( .FALSE., .FALSE., DBLK, 1, DBLK, $ WORK( S1+1 ), LDS, WORK( IRBUF+1 ), $ WORK( ICBUF+1 ), 1, DBLK, Z, LDZ, IERR ) END IF ELSE DBLK = 2*JBLK CALL PSLACP3( DBLK, I-DBLK+1, A, DESCA, WORK( S1+1 ), $ LDS, -1, -1, 0 ) CALL SLAHQR( .FALSE., .FALSE., DBLK, 1, DBLK, $ WORK( S1+1 ), LDS, WORK( IRBUF+1 ), $ WORK( ICBUF+1 ), 1, DBLK, Z, LDZ, IERR ) END IF TOTSW = TOTSW + 1 * * Prepare to use Wilkinson's double shift * H44 = WORK( S1+DBLK+(DBLK-1)*LDS ) H33 = WORK( S1+DBLK-1+(DBLK-2)*LDS ) H43H34 = WORK( S1+DBLK-1+(DBLK-1)*LDS )* $ WORK( S1+DBLK+(DBLK-2)*LDS ) IF( ( JBLK.GT.1 ) .AND. ( ITS.GT.30 ) ) THEN S = WORK( S1+DBLK-1+(DBLK-3)*LDS ) DISC = ( H33-H44 )*HALF DISC = DISC*DISC + H43H34 IF( DISC.GT.ZERO ) THEN * * Real roots: Use Wilkinson's shift twice * DISC = SQRT( DISC ) AVE = HALF*( H33+H44 ) IF( ABS( H33 )-ABS( H44 ).GT.ZERO ) THEN H33 = H33*H44 - H43H34 H44 = H33 / ( SIGN( DISC, AVE )+AVE ) ELSE H44 = SIGN( DISC, AVE ) + AVE END IF H33 = H44 H43H34 = ZERO END IF END IF END IF * * Look for two consecutive small subdiagonal elements: * PSLACONSB is the routine that does this. * * CALL PSLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, * $ WORK( IRBUF+1 ), LWORK-IRBUF ) * * Skip small submatrices * * IF ( M .GE. I - 5 ) * $ GO TO 80 * * In principle PSLACONSB needs to check all shifts to decide * whether two consecutive small subdiagonal entries are suitable * as the starting position of the bulge chasing phase. It can be * dangerous to check the first pair of shifts only. Moreover it * is quite rare to obtain an M which is much larger than L. This * process is a bit expensive compared with the benefit. * Therefore it is sensible to abandon this routine. Total amount * of communications is saved in average. * M = L * * Double-shift QR step * * NBULGE is the number of bulges that will be attempted * ISTOP = MIN( M+ROTN-MOD( M, ROTN ), I-2 ) ISTOP = MIN( ISTOP, M+HBL-3-MOD( M-1, HBL ) ) ISTOP = MIN( ISTOP, I2-2 ) ISTOP = MAX( ISTOP, M ) NBULGE = ( I-1-ISTOP ) / HBL * * Do not exceed maximum determined. * NBULGE = MIN( NBULGE, JBLK ) IF( NBULGE.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * NBULGE = NBULGE - MOD( NBULGE, LCMRC ) END IF NBULGE = MAX( NBULGE, 1 ) * TOTNS = TOTNS + NBULGE*2 * IF( ( ITS.NE.20 ) .AND. ( ITS.NE.40 ) .AND. ( NBULGE.GT.1 ) ) $ THEN * * sort the eigenpairs so that they are in twos for double * shifts. only call if several need sorting * * CALL SLASORTE( S1( 2*( JBLK-NBULGE )+1, * $ 2*( JBLK-NBULGE )+1 ), 3*IBLK, 2*NBULGE, * $ WORK( IRBUF+1 ), IERR ) CALL SLASORTE( WORK(S1+DBLK-2*NBULGE+1+(DBLK-2*NBULGE)*LDS), $ LDS, 2*NBULGE, WORK( IRBUF+1 ), IERR ) END IF * * IBULGE is the number of bulges going so far * IBULGE = 1 * * "A" row defs : main row transforms from LOCALK to LOCALI2 * CALL INFOG1L( M, HBL, NPCOL, MYCOL, DESCA(CSRC_),ITMP1,LOCALK ) LOCALK = NUMROC( N, HBL, MYCOL, DESCA(CSRC_), NPCOL ) CALL INFOG1L( 1, HBL, NPCOL, MYCOL,DESCA(CSRC_),ICOL1,LOCALI2 ) LOCALI2 = NUMROC( I2, HBL, MYCOL, DESCA(CSRC_), NPCOL ) * * "A" col defs : main col transforms from LOCALI1 to LOCALM * CALL INFOG1L( I1, HBL, NPROW,MYROW,DESCA(RSRC_),LOCALI1,ICOL1 ) ICOL1 = NUMROC( N, HBL, MYROW, DESCA(RSRC_), NPROW ) CALL INFOG1L( 1, HBL, NPROW, MYROW, DESCA(RSRC_),LOCALM,ICOL1 ) ICOL1 = NUMROC( MIN( M+3, I ), HBL, MYROW, DESCA(RSRC_),NPROW ) * * Which row & column will start the bulges * ISTARTROW = MOD( ( M+1 ) / HBL + IAFIRST, NPROW ) ISTARTCOL = MOD( ( M+1 ) / HBL + JAFIRST, NPCOL ) * CALL INFOG1L( M, HBL, NPROW, MYROW, DESCA(RSRC_), II, ITMP2 ) ITMP2 = NUMROC( N, HBL, MYROW, DESCA(RSRC_), NPROW ) CALL INFOG1L( M, HBL, NPCOL, MYCOL, DESCA(CSRC_), JJ, ITMP2 ) ITMP2 = NUMROC( N, HBL, MYCOL, DESCA(CSRC_), NPCOL ) CALL INFOG1L(1,HBL,NPROW,MYROW,DESCA(RSRC_),ISTOP,KP2ROW( 1 ) ) KP2ROW( 1 ) = NUMROC( M+2, HBL, MYROW, DESCA(RSRC_), NPROW ) CALL INFOG1L(1,HBL,NPCOL,MYCOL,DESCA(CSRC_),ISTOP,KP2COL( 1 ) ) KP2COL( 1 ) = NUMROC( M+2, HBL, MYCOL, DESCA(CSRC_), NPCOL ) * * Set all values for bulges. All bulges are stored in * intermediate steps as loops over KI. Their current "task" * over the global M to I-1 values is always K1(KI) to K2(KI). * However, because there are many bulges, K1(KI) & K2(KI) might * go past that range while later bulges (KI+1,KI+2,etc..) are * finishing up. * * Rules: * If MOD(K1(KI)-1,HBL) < HBL-2 then MOD(K2(KI)-1,HBL)= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * If .TRUE., then the Hessenberg matrix H is fully updated * so that the quasi-triangular Schur factor may be * computed (in cooperation with the calling subroutine). * If .FALSE., then only enough of H is updated to preserve * the eigenvalues. * * WANTZ (global input) LOGICAL * If .TRUE., then the orthogonal matrix Z is updated so * so that the orthogonal Schur factor may be computed * (in cooperation with the calling subroutine). * If .FALSE., then Z is not referenced. * * N (global input) INTEGER * The order of the matrix H and (if WANTZ is .TRUE.) the * order of the orthogonal matrix Z. * * KTOP (global input) INTEGER * KBOT (global input) INTEGER * It is assumed without a check that either * KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together * determine an isolated block along the diagonal of the * Hessenberg matrix. However, H(KTOP,KTOP-1)=0 is not * essentially necessary if WANTT is .TRUE. . * * NW (global input) INTEGER * Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). * Normally NW .GE. 3 if PSLAQR2 is called by PSLAQR1. * * A (local input/output) REAL array, dimension * (DESCH(LLD_),*) * On input the initial N-by-N section of A stores the * Hessenberg matrix undergoing aggressive early deflation. * On output A has been transformed by an orthogonal * similarity transformation, perturbed, and the returned * to Hessenberg form that (it is to be hoped) has some * zero subdiagonal entries. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. * * Z (input/output) REAL array, dimension * (DESCH(LLD_),*) * IF WANTZ is .TRUE., then on output, the orthogonal * similarity transformation mentioned above has been * accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right. * If WANTZ is .FALSE., then Z is unreferenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * NS (global output) INTEGER * The number of unconverged (ie approximate) eigenvalues * returned in SR and SI that may be used as shifts by the * calling subroutine. * * ND (global output) INTEGER * The number of converged eigenvalues uncovered by this * subroutine. * * SR (global output) REAL array, dimension KBOT * SI (global output) REAL array, dimension KBOT * On output, the real and imaginary parts of approximate * eigenvalues that may be used for shifts are stored in * SR(KBOT-ND-NS+1) through SR(KBOT-ND) and * SI(KBOT-ND-NS+1) through SI(KBOT-ND), respectively. * On proc #0, the real and imaginary parts of converged * eigenvalues are stored in SR(KBOT-ND+1) through SR(KBOT) and * SI(KBOT-ND+1) through SI(KBOT), respectively. On other * processors, these entries are set to zero. * * T (local workspace) REAL array, dimension LDT*NW. * * LDT (local input) INTEGER * The leading dimension of the array T. * LDT >= NW. * * V (local workspace) REAL array, dimension LDV*NW. * * LDV (local input) INTEGER * The leading dimension of the array V. * LDV >= NW. * * WR (local workspace) REAL array, dimension KBOT. * WI (local workspace) REAL array, dimension KBOT. * * WORK (local workspace) REAL array, dimension LWORK. * * LWORK (local input) INTEGER * WORK(LWORK) is a local array and LWORK is assumed big enough * so that LWORK >= NW*NW. * * ================================================================ * Implemented by * Meiyue Shao, Department of Computing Science and HPC2N, * Umea University, Sweden * * ================================================================ * References: * B. Kagstrom, D. Kressner, and M. Shao, * On Aggressive Early Deflation in Parallel Variants of the QR * Algorithm. * Para 2010, to appear. * * ================================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) * .. * .. Local Scalars .. INTEGER CONTXT, HBL, I, I1, I2, IAFIRST, ICOL, ICOL1, $ ICOL2, INFO, II, IROW, IROW1, IROW2, ITMP1, $ ITMP2, J, JAFIRST, JJ, K, L, LDA, LDZ, LLDTMP, $ MYCOL, MYROW, NODE, NPCOL, NPROW, DBLK, $ HSTEP, VSTEP, KKROW, KKCOL, KLN, LTOP, LEFT, $ RIGHT, UP, DOWN, D1, D2 * .. * .. Local Arrays .. INTEGER DESCT( 9 ), DESCV( 9 ), DESCWH( 9 ), $ DESCWV( 9 ) * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SLASET, $ SLAQR3, DESCINIT, PSGEMM, PSGEMR2D, SGEMM, $ SLAMOV, SGESD2D, SGERV2D, SGEBS2D, SGEBR2D, $ IGEBS2D, IGEBR2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 * IF( N.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) * * I1 and I2 are the indices of the first row and last column of A * to which transformations must be applied. * I = KBOT L = KTOP IF( WANTT ) THEN I1 = 1 I2 = N LTOP = 1 ELSE I1 = L I2 = I LTOP = L END IF * * Begin Aggressive Early Deflation. * DBLK = NW CALL INFOG2L( I-DBLK+1, I-DBLK+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF ( MYROW .EQ. II ) THEN CALL DESCINIT( DESCT, DBLK, DBLK, DBLK, DBLK, II, JJ, CONTXT, $ LDT, INFO ) CALL DESCINIT( DESCV, DBLK, DBLK, DBLK, DBLK, II, JJ, CONTXT, $ LDV, INFO ) ELSE CALL DESCINIT( DESCT, DBLK, DBLK, DBLK, DBLK, II, JJ, CONTXT, $ 1, INFO ) CALL DESCINIT( DESCV, DBLK, DBLK, DBLK, DBLK, II, JJ, CONTXT, $ 1, INFO ) END IF CALL PSGEMR2D( DBLK, DBLK, A, I-DBLK+1, I-DBLK+1, DESCA, T, 1, 1, $ DESCT, CONTXT ) IF ( MYROW .EQ. II .AND. MYCOL .EQ. JJ ) THEN CALL SLASET( 'All', DBLK, DBLK, ZERO, ONE, V, LDV ) CALL SLAQR3( .TRUE., .TRUE., DBLK, 1, DBLK, DBLK-1, T, LDT, 1, $ DBLK, V, LDV, NS, ND, WR, WI, WORK, DBLK, DBLK, $ WORK( DBLK*DBLK+1 ), DBLK, DBLK, WORK( 2*DBLK*DBLK+1 ), $ DBLK, WORK( 3*DBLK*DBLK+1 ), LWORK-3*DBLK*DBLK ) CALL SGEBS2D( CONTXT, 'All', ' ', DBLK, DBLK, V, LDV ) CALL IGEBS2D( CONTXT, 'All', ' ', 1, 1, ND, 1 ) ELSE CALL SGEBR2D( CONTXT, 'All', ' ', DBLK, DBLK, V, LDV, II, JJ ) CALL IGEBR2D( CONTXT, 'All', ' ', 1, 1, ND, 1, II, JJ ) END IF * IF( ND .GT. 0 ) THEN * * Copy the local matrix back to the diagonal block. * CALL PSGEMR2D( DBLK, DBLK, T, 1, 1, DESCT, A, I-DBLK+1, $ I-DBLK+1, DESCA, CONTXT ) * * Update T and Z. * IF( MOD( I-DBLK, HBL )+DBLK .LE. HBL ) THEN * * Simplest case: the deflation window is located on one * processor. * Call SGEMM directly to perform the update. * HSTEP = LWORK / DBLK VSTEP = HSTEP * * Update horizontal slab in A. * IF( WANTT ) THEN CALL INFOG2L( I-DBLK+1, I+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 10 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL SGEMM( 'T', 'N', DBLK, KLN, DBLK, ONE, V, $ LDV, A( IROW+(KKCOL-1)*LDA ), LDA, ZERO, WORK, $ DBLK ) CALL SLAMOV( 'A', DBLK, KLN, WORK, DBLK, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 10 CONTINUE END IF END IF * * Update vertical slab in A. * CALL INFOG2L( LTOP, I-DBLK+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( I-DBLK, I-DBLK+1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 20 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ZERO, WORK, $ KLN ) CALL SLAMOV( 'A', KLN, DBLK, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 20 CONTINUE END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN CALL INFOG2L( ILOZ, I-DBLK+1, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, I-DBLK+1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 30 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ZERO, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, DBLK, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 30 CONTINUE END IF END IF * ELSE IF( MOD( I-DBLK, HBL )+DBLK .LE. 2*HBL ) THEN * * More complicated case: the deflation window lay on a 2x2 * processor mesh. * Call SGEMM locally and communicate by pair. * D1 = HBL - MOD( I-DBLK, HBL ) D2 = DBLK - D1 HSTEP = LWORK / DBLK VSTEP = HSTEP * * Update horizontal slab in A. * IF( WANTT ) THEN CALL INFOG2L( I-DBLK+1, I+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW .EQ. UP ) THEN IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 40 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL SGEMM( 'T', 'N', DBLK, KLN, DBLK, ONE, V, $ DBLK, A( IROW+(KKCOL-1)*LDA ), LDA, ZERO, $ WORK, DBLK ) CALL SLAMOV( 'A', DBLK, KLN, WORK, DBLK, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 40 CONTINUE END IF ELSE IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 50 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL SGEMM( 'T', 'N', D2, KLN, D1, ONE, $ V( 1, D1+1 ), LDV, A( IROW+(KKCOL-1)*LDA ), $ LDA, ZERO, WORK( D1+1 ), DBLK ) CALL SGESD2D( CONTXT, D2, KLN, WORK( D1+1 ), $ DBLK, DOWN, MYCOL ) CALL SGERV2D( CONTXT, D1, KLN, WORK, DBLK, DOWN, $ MYCOL ) CALL SGEMM( 'T', 'N', D1, KLN, D1, ONE, $ V, LDV, A( IROW+(KKCOL-1)*LDA ), LDA, ONE, $ WORK, DBLK ) CALL SLAMOV( 'A', D1, KLN, WORK, DBLK, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 50 CONTINUE ELSE IF( UP .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 60 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL SGEMM( 'T', 'N', D1, KLN, D2, ONE, $ V( D1+1, 1 ), LDV, A( IROW+(KKCOL-1)*LDA ), $ LDA, ZERO, WORK, DBLK ) CALL SGESD2D( CONTXT, D1, KLN, WORK, DBLK, UP, $ MYCOL ) CALL SGERV2D( CONTXT, D2, KLN, WORK( D1+1 ), $ DBLK, UP, MYCOL ) CALL SGEMM( 'T', 'N', D2, KLN, D2, ONE, $ V( D1+1, D1+1 ), LDV, $ A( IROW+(KKCOL-1)*LDA ), LDA, ONE, $ WORK( D1+1 ), DBLK ) CALL SLAMOV( 'A', D2, KLN, WORK( D1+1 ), DBLK, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 60 CONTINUE END IF END IF END IF * * Update vertical slab in A. * CALL INFOG2L( LTOP, I-DBLK+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. LEFT ) THEN IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( I-DBLK, I-DBLK+1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 70 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ZERO, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, DBLK, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 70 CONTINUE END IF ELSE IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( I-DBLK, I-DBLK+1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 80 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, D2, D1, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, $ V( 1, D1+1 ), LDV, ZERO, WORK( 1+D1*KLN ), $ KLN ) CALL SGESD2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, RIGHT ) CALL SGERV2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ RIGHT ) CALL SGEMM( 'N', 'N', KLN, D1, D1, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ONE, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, D1, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 80 CONTINUE ELSE IF ( LEFT .EQ. JJ ) THEN CALL INFOG2L( I-DBLK, I-DBLK+1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 90 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, D1, D2, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( D1+1, 1 ), $ LDV, ZERO, WORK, KLN ) CALL SGESD2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ LEFT ) CALL SGERV2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, LEFT ) CALL SGEMM( 'N', 'N', KLN, D2, D2, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( D1+1, D1+1 ), $ LDV, ONE, WORK( 1+D1*KLN ), KLN ) CALL SLAMOV( 'A', KLN, D2, WORK( 1+D1*KLN ), KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 90 CONTINUE END IF END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN CALL INFOG2L( ILOZ, I-DBLK+1, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. LEFT ) THEN IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, I-DBLK+1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 100 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ZERO, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, DBLK, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 100 CONTINUE END IF ELSE IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, I-DBLK+1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 110 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, D2, D1, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, $ V( 1, D1+1 ), LDV, ZERO, WORK( 1+D1*KLN ), $ KLN ) CALL SGESD2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, RIGHT ) CALL SGERV2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ RIGHT ) CALL SGEMM( 'N', 'N', KLN, D1, D1, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ONE, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, D1, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 110 CONTINUE ELSE IF( LEFT .EQ. JJ ) THEN CALL INFOG2L( IHIZ, I-DBLK+1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 120 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, D1, D2, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, $ V( D1+1, 1 ), LDV, ZERO, WORK, KLN ) CALL SGESD2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ LEFT ) CALL SGERV2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, LEFT ) CALL SGEMM( 'N', 'N', KLN, D2, D2, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, $ V( D1+1, D1+1 ), LDV, ONE, $ WORK( 1+D1*KLN ), KLN ) CALL SLAMOV( 'A', KLN, D2, WORK( 1+D1*KLN ), $ KLN, Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 120 CONTINUE END IF END IF END IF * ELSE * * Most complicated case: the deflation window lay across the * border of the processor mesh. * Treat V as a distributed matrix and call PSGEMM. * HSTEP = LWORK / DBLK * NPCOL VSTEP = LWORK / DBLK * NPROW LLDTMP = NUMROC( DBLK, DBLK, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCV, DBLK, DBLK, DBLK, DBLK, 0, 0, CONTXT, $ LLDTMP, INFO ) CALL DESCINIT( DESCWH, DBLK, HSTEP, DBLK, LWORK / DBLK, 0, $ 0, CONTXT, LLDTMP, INFO ) * * Update horizontal slab in A. * IF( WANTT ) THEN DO 130 KKCOL = I+1, N, HSTEP KLN = MIN( HSTEP, N-KKCOL+1 ) CALL PSGEMM( 'T', 'N', DBLK, KLN, DBLK, ONE, V, 1, 1, $ DESCV, A, I-DBLK+1, KKCOL, DESCA, ZERO, WORK, 1, $ 1, DESCWH ) CALL PSGEMR2D( DBLK, KLN, WORK, 1, 1, DESCWH, A, $ I-DBLK+1, KKCOL, DESCA, CONTXT ) 130 CONTINUE END IF * * Update vertical slab in A. * DO 140 KKROW = LTOP, I-DBLK, VSTEP KLN = MIN( VSTEP, I-DBLK-KKROW+1 ) LLDTMP = NUMROC( KLN, LWORK / DBLK, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCWV, KLN, DBLK, LWORK / DBLK, DBLK, 0, $ 0, CONTXT, LLDTMP, INFO ) CALL PSGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, A, KKROW, $ I-DBLK+1, DESCA, V, 1, 1, DESCV, ZERO, WORK, 1, 1, $ DESCWV ) CALL PSGEMR2D( KLN, DBLK, WORK, 1, 1, DESCWV, A, KKROW, $ I-DBLK+1, DESCA, CONTXT ) 140 CONTINUE * * Update vertical slab in Z. * IF( WANTZ ) THEN DO 150 KKROW = ILOZ, IHIZ, VSTEP KLN = MIN( VSTEP, IHIZ-KKROW+1 ) LLDTMP = NUMROC( KLN, LWORK / DBLK, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCWV, KLN, DBLK, LWORK / DBLK, DBLK, $ 0, 0, CONTXT, LLDTMP, INFO ) CALL PSGEMM( 'N', 'N', KLN, DBLK, DBLK, ONE, Z, KKROW, $ I-DBLK+1, DESCZ, V, 1, 1, DESCV, ZERO, WORK, 1, $ 1, DESCWV ) CALL PSGEMR2D( KLN, DBLK, WORK, 1, 1, DESCWV, Z, $ KKROW, I-DBLK+1, DESCZ, CONTXT ) 150 CONTINUE END IF END IF * * Extract converged eigenvalues. * II = 0 160 CONTINUE IF( II .EQ. ND-1 .OR. WI( DBLK-II ) .EQ. ZERO ) THEN IF( NODE .EQ. 0 ) THEN SR( I-II ) = WR( DBLK-II ) ELSE SR( I-II ) = ZERO END IF SI( I-II ) = ZERO II = II + 1 ELSE IF( NODE .EQ. 0 ) THEN SR( I-II-1 ) = WR( DBLK-II-1 ) SR( I-II ) = WR( DBLK-II ) SI( I-II-1 ) = WI( DBLK-II-1 ) SI( I-II ) = WI( DBLK-II ) ELSE SR( I-II-1 ) = ZERO SR( I-II ) = ZERO SI( I-II-1 ) = ZERO SI( I-II ) = ZERO END IF II = II + 2 END IF IF( II .LT. ND ) GOTO 160 END IF * * END OF PSLAQR2 * END scalapack-2.0.2/SRC/pslaqr3.f000644 000766 000024 00000126657 11705457544 016131 0ustar00juliestaff000000 000000 RECURSIVE SUBROUTINE PSLAQR3( WANTT, WANTZ, N, KTOP, KBOT, NW, H, $ DESCH, ILOZ, IHIZ, Z, DESCZ, NS, ND, $ SR, SI, V, DESCV, NH, T, DESCT, NV, $ WV, DESCW, WORK, LWORK, IWORK, $ LIWORK, RECLEVEL ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IHIZ, ILOZ, KBOT, KTOP, LWORK, N, ND, NH, NS, $ NV, NW, LIWORK, RECLEVEL LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. INTEGER DESCH( * ), DESCZ( * ), DESCT( * ), DESCV( * ), $ DESCW( * ), IWORK( * ) REAL H( * ), SI( KBOT ), SR( KBOT ), T( * ), $ V( * ), WORK( * ), WV( * ), $ Z( * ) * .. * * Purpose * ======= * * Aggressive early deflation: * * This subroutine accepts as input an upper Hessenberg matrix H and * performs an orthogonal similarity transformation designed to detect * and deflate fully converged eigenvalues from a trailing principal * submatrix. On output H has been overwritten by a new Hessenberg * matrix that is a perturbation of an orthogonal similarity * transformation of H. It is to be hoped that the final version of H * has many zero subdiagonal entries. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * If .TRUE., then the Hessenberg matrix H is fully updated * so that the quasi-triangular Schur factor may be * computed (in cooperation with the calling subroutine). * If .FALSE., then only enough of H is updated to preserve * the eigenvalues. * * WANTZ (global input) LOGICAL * If .TRUE., then the orthogonal matrix Z is updated so * so that the orthogonal Schur factor may be computed * (in cooperation with the calling subroutine). * If .FALSE., then Z is not referenced. * * N (global input) INTEGER * The order of the matrix H and (if WANTZ is .TRUE.) the * order of the orthogonal matrix Z. * * KTOP (global input) INTEGER * It is assumed that either KTOP = 1 or H(KTOP,KTOP-1)=0. * KBOT and KTOP together determine an isolated block * along the diagonal of the Hessenberg matrix. * * KBOT (global input) INTEGER * It is assumed without a check that either * KBOT = N or H(KBOT+1,KBOT)=0. KBOT and KTOP together * determine an isolated block along the diagonal of the * Hessenberg matrix. * * NW (global input) INTEGER * Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). * * H (local input/output) REAL array, dimension * (DESCH(LLD_),*) * On input the initial N-by-N section of H stores the * Hessenberg matrix undergoing aggressive early deflation. * On output H has been transformed by an orthogonal * similarity transformation, perturbed, and the returned * to Hessenberg form that (it is to be hoped) has some * zero subdiagonal entries. * * DESCH (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix H. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. * * Z (input/output) REAL array, dimension * (DESCH(LLD_),*) * IF WANTZ is .TRUE., then on output, the orthogonal * similarity transformation mentioned above has been * accumulated into Z(ILOZ:IHIZ,ILO:IHI) from the right. * If WANTZ is .FALSE., then Z is unreferenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * NS (global output) INTEGER * The number of unconverged (ie approximate) eigenvalues * returned in SR and SI that may be used as shifts by the * calling subroutine. * * ND (global output) INTEGER * The number of converged eigenvalues uncovered by this * subroutine. * * SR (global output) REAL array, dimension KBOT * SI (global output) REAL array, dimension KBOT * On output, the real and imaginary parts of approximate * eigenvalues that may be used for shifts are stored in * SR(KBOT-ND-NS+1) through SR(KBOT-ND) and * SI(KBOT-ND-NS+1) through SI(KBOT-ND), respectively. * The real and imaginary parts of converged eigenvalues * are stored in SR(KBOT-ND+1) through SR(KBOT) and * SI(KBOT-ND+1) through SI(KBOT), respectively. * * V (global workspace) REAL array, dimension * (DESCV(LLD_),*) * An NW-by-NW distributed work array. * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * NH (input) INTEGER scalar * The number of columns of T. NH.GE.NW. * * T (global workspace) REAL array, dimension * (DESCV(LLD_),*) * * DESCT (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix T. * * NV (global input) INTEGER * The number of rows of work array WV available for * workspace. NV.GE.NW. * * WV (global workspace) REAL array, dimension * (DESCW(LLD_),*) * * DESCW (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix WV. * * WORK (local workspace) REAL array, dimension LWORK. * On exit, WORK(1) is set to an estimate of the optimal value * of LWORK for the given values of N, NW, KTOP and KBOT. * * LWORK (local input) INTEGER * The dimension of the work array WORK. LWORK = 2*NW * suffices, but greater efficiency may result from larger * values of LWORK. * * If LWORK = -1, then a workspace query is assumed; PSLAQR3 * only estimates the optimal workspace size for the given * values of N, NW, KTOP and KBOT. The estimate is returned * in WORK(1). No error message related to LWORK is issued * by XERBLA. Neither H nor Z are accessed. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the workspace array IWORK * * ================================================================ * Based on contributions by * Robert Granat and Meiyue Shao, * Department of Computing Science and HPC2N, * Umea University, Sweden * * ================================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ INTEGER RECMAX LOGICAL SORTGRAD PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, RECMAX = 3, $ SORTGRAD = .FALSE. ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) * .. * .. Local Scalars .. REAL AA, BB, BETA, CC, CS, DD, EVI, EVK, FOO, S, $ SAFMAX, SAFMIN, SMLNUM, SN, TAU, ULP, $ ELEM, ELEM1, ELEM2, ELEM3, R1, ANORM, RNORM, $ RESAED INTEGER I, IFST, ILST, INFO, INFQR, J, JW, K, KCOL, $ KEND, KLN, KROW, KWTOP, LTOP, LWK1, LWK2, LWK3, $ LWKOPT, NMIN, LLDH, LLDZ, LLDT, LLDV, LLDWV, $ ICTXT, NPROW, NMAX, NPCOL, MYROW, MYCOL, NB, $ IROFFH, M, RCOLS, TAUROWS, RROWS, TAUCOLS, $ ITAU, IR, IPW, NPROCS, MLOC, IROFFHH, $ ICOFFHH, HHRSRC, HHCSRC, HHROWS, HHCOLS, $ IROFFZZ, ICOFFZZ, ZZRSRC, ZZCSRC, ZZROWS, $ ZZCOLS, IERR, TZROWS0, TZCOLS0, IERR0, IPT0, $ IPZ0, IPW0, NB2, ROUND, LILST, KK, LILST0, $ IWRK1, RSRC, CSRC, LWK4, LWK5, IWRK2, LWK6, $ LWK7, LWK8, ILWKOPT, TZROWS, TZCOLS, NSEL, $ NPMIN, ICTXT_NEW, MYROW_NEW, MYCOL_NEW LOGICAL BULGE, SORTED, LQUERY * .. * .. Local Arrays .. INTEGER PAR( 6 ), DESCR( DLEN_ ), $ DESCTAU( DLEN_ ), DESCHH( DLEN_ ), $ DESCZZ( DLEN_ ), DESCTZ0( DLEN_ ), $ PMAP( 64*64 ) REAL DDUM( 1 ) * .. * .. External Functions .. REAL SLAMCH, PSLANGE INTEGER PILAENVX, NUMROC, INDXG2P, ICEIL, BLACS_PNUM EXTERNAL SLAMCH, PILAENVX, NUMROC, INDXG2P, PSLANGE, $ ICEIL, BLACS_PNUM * .. * .. External Subroutines .. EXTERNAL PSCOPY, PSGEHRD, PSGEMM, SLABAD, PSLACPY, $ PSLAQR1, SLANV2, PSLAQR0, PSLARF, PSLARFG, $ PSLASET, PSTRORD, PSELGET, PSELSET, $ PSLAMVE, BLACS_GRIDINFO, BLACS_GRIDMAP, $ BLACS_GRIDEXIT, PSGEMR2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, FLOAT, INT, MAX, MIN, SQRT * .. * .. Executable Statements .. ICTXT = DESCH( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL * * Extract local leading dimensions, blockfactors, offset for * keeping the alignment requirements and size of deflation window. * LLDH = DESCH( LLD_ ) LLDZ = DESCZ( LLD_ ) LLDT = DESCT( LLD_ ) LLDV = DESCV( LLD_ ) LLDWV = DESCW( LLD_ ) NB = DESCH( MB_ ) IROFFH = MOD( KTOP - 1, NB ) JW = MIN( NW, KBOT-KTOP+1 ) NSEL = NB+JW * * Extract environment variables for parallel eigenvalue reordering. * PAR(1) = PILAENVX(ICTXT, 17, 'PSLAQR3', 'SV', JW, NB, -1, -1) PAR(2) = PILAENVX(ICTXT, 18, 'PSLAQR3', 'SV', JW, NB, -1, -1) PAR(3) = PILAENVX(ICTXT, 19, 'PSLAQR3', 'SV', JW, NB, -1, -1) PAR(4) = PILAENVX(ICTXT, 20, 'PSLAQR3', 'SV', JW, NB, -1, -1) PAR(5) = PILAENVX(ICTXT, 21, 'PSLAQR3', 'SV', JW, NB, -1, -1) PAR(6) = PILAENVX(ICTXT, 22, 'PSLAQR3', 'SV', JW, NB, -1, -1) * * Check if workspace query. * LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * Estimate optimal workspace. * IF( JW.LE.2 ) THEN LWKOPT = 1 ELSE * * Workspace query calls to PSGEHRD and PSORMHR. * TAUROWS = NUMROC( 1, 1, MYCOL, DESCV(RSRC_), NPROW ) TAUCOLS = NUMROC( JW+IROFFH, NB, MYCOL, DESCV(CSRC_), $ NPCOL ) CALL PSGEHRD( JW, 1, JW, T, 1, 1, DESCT, WORK, WORK, -1, $ INFO ) LWK1 = INT( WORK( 1 ) ) + TAUROWS*TAUCOLS * * Workspace query call to PSORMHR. * CALL PSORMHR( 'Right', 'No', JW, JW, 1, JW, T, 1, 1, DESCT, $ WORK, V, 1, 1, DESCV, WORK, -1, INFO ) LWK2 = INT( WORK( 1 ) ) * * Workspace query call to PSLAQR0. * NMIN = PILAENVX( ICTXT, 12, 'PSLAQR3', 'SV', JW, 1, JW, LWORK ) NMAX = ( N-1 ) / 3 IF( JW+IROFFH.GT.NMIN .AND. JW+IROFFH.LE.NMAX $ .AND. RECLEVEL.LT.RECMAX ) THEN CALL PSLAQR0( .TRUE., .TRUE., JW+IROFFH, 1+IROFFH, $ JW+IROFFH, T, DESCT, SR, SI, 1, JW, V, DESCV, $ WORK, -1, IWORK, LIWORK-NSEL, INFQR, $ RECLEVEL+1 ) LWK3 = INT( WORK( 1 ) ) IWRK1 = IWORK( 1 ) ELSE RSRC = DESCT( RSRC_ ) CSRC = DESCT( CSRC_ ) DESCT( RSRC_ ) = 0 DESCT( CSRC_ ) = 0 CALL PSLAQR1( .TRUE., .TRUE., JW+IROFFH, 1, JW+IROFFH, T, $ DESCT, SR, SI, 1, JW+IROFFH, V, DESCV, WORK, -1, $ IWORK, LIWORK-NSEL, INFQR ) DESCT( RSRC_ ) = RSRC DESCT( CSRC_ ) = CSRC LWK3 = INT( WORK( 1 ) ) IWRK1 = IWORK( 1 ) END IF * * Workspace in case of alignment problems. * TZROWS0 = NUMROC( JW+IROFFH, NB, MYROW, 0, NPROW ) TZCOLS0 = NUMROC( JW+IROFFH, NB, MYCOL, 0, NPCOL ) LWK4 = 2 * TZROWS0*TZCOLS0 * * Workspace check for reordering. * CALL PSTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, 1, $ DESCT, V, 1, 1, DESCV, DDUM, DDUM, MLOC, WORK, -1, $ IWORK, LIWORK-NSEL, INFO ) LWK5 = INT( WORK( 1 ) ) IWRK2 = IWORK( 1 ) * * Extra workspace for reflecting back spike * (workspace for PSLARF approximated for simplicity). * RROWS = NUMROC( N+IROFFH, NB, MYROW, DESCV(RSRC_), NPROW ) RCOLS = NUMROC( 1, 1, MYCOL, DESCV(CSRC_), NPCOL ) LWK6 = RROWS*RCOLS + TAUROWS*TAUCOLS + $ 2*ICEIL(ICEIL(JW+IROFFH,NB),NPROW)*NB $ *ICEIL(ICEIL(JW+IROFFH,NB),NPCOL)*NB * * Extra workspace needed by PBLAS update calls * (also estimated for simplicity). * LWK7 = MAX( ICEIL(ICEIL(JW,NB),NPROW)*NB * $ ICEIL(ICEIL(N-KBOT,NB),NPCOL)*NB, $ ICEIL(ICEIL(IHIZ-ILOZ+1,NB),NPROW)*NB * $ ICEIL(ICEIL(JW,NB),NPCOL)*NB, $ ICEIL(ICEIL(KBOT-JW,NB),NPROW)*NB * $ ICEIL(ICEIL(JW,NB),NPCOL)*NB ) * * Residual check workspace. * TZROWS = NUMROC( JW+IROFFH, NB, MYROW, DESCT(RSRC_), NPROW ) TZCOLS = NUMROC( JW+IROFFH, NB, MYCOL, DESCT(CSRC_), NPCOL ) LWK8 = 2*TZROWS*TZCOLS * * Optimal workspace. * LWKOPT = MAX( LWK1, LWK2, LWK3+LWK4, LWK5, LWK6, LWK7, LWK8 ) ILWKOPT = MAX( IWRK1, IWRK2 ) END IF * * Quick return in case of workspace query. * WORK( 1 ) = FLOAT( LWKOPT ) * * IWORK(1:NSEL) is used as the array SELECT for PSTRORD. * IWORK( 1 ) = ILWKOPT + NSEL IF( LQUERY ) $ RETURN * * Nothing to do for an empty active block ... NS = 0 ND = 0 IF( KTOP.GT.KBOT ) $ RETURN * ... nor for an empty deflation window. * IF( NW.LT.1 ) $ RETURN * * Machine constants. * SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( FLOAT( N ) / ULP ) * * Setup deflation window. * JW = MIN( NW, KBOT-KTOP+1 ) KWTOP = KBOT - JW + 1 IF( KWTOP.EQ.KTOP ) THEN S = ZERO ELSE CALL PSELGET( 'All', '1-Tree', S, H, KWTOP, KWTOP-1, DESCH ) END IF * IF( KBOT.EQ.KWTOP ) THEN * * 1-by-1 deflation window: not much to do. * CALL PSELGET( 'All', '1-Tree', SR( KWTOP ), H, KWTOP, KWTOP, $ DESCH ) SI( KWTOP ) = ZERO NS = 1 ND = 0 IF( ABS( S ).LE.MAX( SMLNUM, ULP*ABS( SR( KWTOP ) ) ) ) $ THEN NS = 0 ND = 1 IF( KWTOP.GT.KTOP ) $ CALL PSELSET( H, KWTOP, KWTOP-1 , DESCH, ZERO ) END IF RETURN END IF * IF( KWTOP.EQ.KTOP .AND. KBOT-KWTOP.EQ.1 ) THEN * * 2-by-2 deflation window: a little more to do. * CALL PSELGET( 'All', '1-Tree', AA, H, KWTOP, KWTOP, DESCH ) CALL PSELGET( 'All', '1-Tree', BB, H, KWTOP, KWTOP+1, DESCH ) CALL PSELGET( 'All', '1-Tree', CC, H, KWTOP+1, KWTOP, DESCH ) CALL PSELGET( 'All', '1-Tree', DD, H, KWTOP+1, KWTOP+1, DESCH ) CALL SLANV2( AA, BB, CC, DD, SR(KWTOP), SI(KWTOP), $ SR(KWTOP+1), SI(KWTOP+1), CS, SN ) NS = 0 ND = 2 IF( CC.EQ.ZERO ) THEN I = KWTOP IF( I+2.LE.N .AND. WANTT ) $ CALL PSROT( N-I-1, H, I, I+2, DESCH, DESCH(M_), H, I+1, $ I+2, DESCH, DESCH(M_), CS, SN, WORK, LWORK, INFO ) IF( I.GT.1 ) $ CALL PSROT( I-1, H, 1, I, DESCH, 1, H, 1, I+1, DESCH, 1, $ CS, SN, WORK, LWORK, INFO ) IF( WANTZ ) $ CALL PSROT( IHIZ-ILOZ+1, Z, ILOZ, I, DESCZ, 1, Z, ILOZ, $ I+1, DESCZ, 1, CS, SN, WORK, LWORK, INFO ) CALL PSELSET( H, I, I, DESCH, AA ) CALL PSELSET( H, I, I+1, DESCH, BB ) CALL PSELSET( H, I+1, I, DESCH, CC ) CALL PSELSET( H, I+1, I+1, DESCH, DD ) END IF WORK( 1 ) = FLOAT( LWKOPT ) RETURN END IF * * Calculate new value for IROFFH in case deflation window * was adjusted. * IROFFH = MOD( KWTOP - 1, NB ) * * Adjust number of rows and columns of T matrix descriptor * to prepare for call to PDBTRORD. * DESCT( M_ ) = JW+IROFFH DESCT( N_ ) = JW+IROFFH * * Convert to spike-triangular form. (In case of a rare QR failure, * this routine continues to do aggressive early deflation using that * part of the deflation window that converged using INFQR here and * there to keep track.) * * Copy the trailing submatrix to the working space. * CALL PSLASET( 'All', IROFFH, JW+IROFFH, ZERO, ONE, T, 1, 1, $ DESCT ) CALL PSLASET( 'All', JW, IROFFH, ZERO, ZERO, T, 1+IROFFH, 1, $ DESCT ) CALL PSLACPY( 'All', 1, JW, H, KWTOP, KWTOP, DESCH, T, 1+IROFFH, $ 1+IROFFH, DESCT ) CALL PSLACPY( 'Upper', JW-1, JW-1, H, KWTOP+1, KWTOP, DESCH, T, $ 1+IROFFH+1, 1+IROFFH, DESCT ) IF( JW.GT.2 ) $ CALL PSLASET( 'Lower', JW-2, JW-2, ZERO, ZERO, T, 1+IROFFH+2, $ 1+IROFFH, DESCT ) CALL PSLACPY( 'All', JW-1, 1, H, KWTOP+1, KWTOP+JW-1, DESCH, T, $ 1+IROFFH+1, 1+IROFFH+JW-1, DESCT ) * * Initialize the working orthogonal matrix. * CALL PSLASET( 'All', JW+IROFFH, JW+IROFFH, ZERO, ONE, V, 1, 1, $ DESCV ) * * Compute the Schur form of T. * NPMIN = PILAENVX( ICTXT, 23, 'PSLAQR3', 'SV', JW, NB, NPROW, $ NPCOL ) NMIN = PILAENVX( ICTXT, 12, 'PSLAQR3', 'SV', JW, 1, JW, LWORK ) NMAX = ( N-1 ) / 3 IF( MIN(NPROW, NPCOL).LE.NPMIN+1 .OR. RECLEVEL.GE.1 ) THEN * * The AED window is large enough. * Compute the Schur decomposition with all processors. * IF( JW+IROFFH.GT.NMIN .AND. JW+IROFFH.LE.NMAX $ .AND. RECLEVEL.LT.RECMAX ) THEN CALL PSLAQR0( .TRUE., .TRUE., JW+IROFFH, 1+IROFFH, $ JW+IROFFH, T, DESCT, SR( KWTOP-IROFFH ), $ SI( KWTOP-IROFFH ), 1+IROFFH, JW+IROFFH, V, DESCV, $ WORK, LWORK, IWORK(NSEL+1), LIWORK-NSEL, INFQR, $ RECLEVEL+1 ) ELSE IF( DESCT(RSRC_).EQ.0 .AND. DESCT(CSRC_).EQ.0 ) THEN IF( JW+IROFFH.GT.DESCT( MB_ ) ) THEN CALL PSLAQR1( .TRUE., .TRUE., JW+IROFFH, 1, $ JW+IROFFH, T, DESCT, SR( KWTOP-IROFFH ), $ SI( KWTOP-IROFFH ), 1, JW+IROFFH, V, $ DESCV, WORK, LWORK, IWORK(NSEL+1), LIWORK-NSEL, $ INFQR ) ELSE IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL SLAHQR( .TRUE., .TRUE., JW+IROFFH, 1+IROFFH, $ JW+IROFFH, T, DESCT(LLD_), $ SR( KWTOP-IROFFH ), SI( KWTOP-IROFFH ), $ 1+IROFFH, JW+IROFFH, V, DESCV(LLD_), INFQR ) ELSE INFQR = 0 END IF IF( NPROCS.GT.1 ) $ CALL IGAMN2D( ICTXT, 'All', '1-Tree', 1, 1, INFQR, $ 1, -1, -1, -1, -1, -1 ) END IF ELSEIF( JW+IROFFH.LE.DESCT( MB_ ) ) THEN IF( MYROW.EQ.DESCT(RSRC_) .AND. MYCOL.EQ.DESCT(CSRC_) ) $ THEN CALL SLAHQR( .TRUE., .TRUE., JW+IROFFH, 1+IROFFH, $ JW+IROFFH, T, DESCT(LLD_), $ SR( KWTOP-IROFFH ), SI( KWTOP-IROFFH ), $ 1+IROFFH, JW+IROFFH, V, DESCV(LLD_), INFQR ) ELSE INFQR = 0 END IF IF( NPROCS.GT.1 ) $ CALL IGAMN2D( ICTXT, 'All', '1-Tree', 1, 1, INFQR, $ 1, -1, -1, -1, -1, -1 ) ELSE TZROWS0 = NUMROC( JW+IROFFH, NB, MYROW, 0, NPROW ) TZCOLS0 = NUMROC( JW+IROFFH, NB, MYCOL, 0, NPCOL ) CALL DESCINIT( DESCTZ0, JW+IROFFH, JW+IROFFH, NB, NB, 0, $ 0, ICTXT, MAX(1,TZROWS0), IERR0 ) IPT0 = 1 IPZ0 = IPT0 + MAX(1,TZROWS0)*TZCOLS0 IPW0 = IPZ0 + MAX(1,TZROWS0)*TZCOLS0 CALL PSLAMVE( 'All', JW+IROFFH, JW+IROFFH, T, 1, 1, $ DESCT, WORK(IPT0), 1, 1, DESCTZ0, WORK(IPW0) ) CALL PSLASET( 'All', JW+IROFFH, JW+IROFFH, ZERO, ONE, $ WORK(IPZ0), 1, 1, DESCTZ0 ) CALL PSLAQR1( .TRUE., .TRUE., JW+IROFFH, 1, $ JW+IROFFH, WORK(IPT0), DESCTZ0, $ SR( KWTOP-IROFFH ), SI( KWTOP-IROFFH ), $ 1, JW+IROFFH, WORK(IPZ0), $ DESCTZ0, WORK(IPW0), LWORK-IPW0+1, IWORK(NSEL+1), $ LIWORK-NSEL, INFQR ) CALL PSLAMVE( 'All', JW+IROFFH, JW+IROFFH, WORK(IPT0), 1, $ 1, DESCTZ0, T, 1, 1, DESCT, WORK(IPW0) ) CALL PSLAMVE( 'All', JW+IROFFH, JW+IROFFH, WORK(IPZ0), 1, $ 1, DESCTZ0, V, 1, 1, DESCV, WORK(IPW0) ) END IF END IF ELSE * * The AED window is too small. * Redistribute the AED window to a subgrid * and do the computation on the subgrid. * ICTXT_NEW = ICTXT DO 20 I = 0, NPMIN-1 DO 10 J = 0, NPMIN-1 PMAP( J+1+I*NPMIN ) = BLACS_PNUM( ICTXT, I, J ) 10 CONTINUE 20 CONTINUE CALL BLACS_GRIDMAP( ICTXT_NEW, PMAP, NPMIN, NPMIN, NPMIN ) CALL BLACS_GRIDINFO( ICTXT_NEW, NPMIN, NPMIN, MYROW_NEW, $ MYCOL_NEW ) IF( MYROW.GE.NPMIN .OR. MYCOL.GE.NPMIN ) ICTXT_NEW = -1 IF( ICTXT_NEW.GE.0 ) THEN TZROWS0 = NUMROC( JW, NB, MYROW_NEW, 0, NPMIN ) TZCOLS0 = NUMROC( JW, NB, MYCOL_NEW, 0, NPMIN ) CALL DESCINIT( DESCTZ0, JW, JW, NB, NB, 0, $ 0, ICTXT_NEW, MAX(1,TZROWS0), IERR0 ) IPT0 = 1 IPZ0 = IPT0 + MAX(1,TZROWS0)*MAX(1,TZCOLS0) IPW0 = IPZ0 + MAX(1,TZROWS0)*MAX(1,TZCOLS0) ELSE IPT0 = 1 IPZ0 = 2 IPW0 = 3 DESCTZ0( CTXT_ ) = -1 INFQR = 0 END IF CALL PSGEMR2D( JW, JW, T, 1+IROFFH, 1+IROFFH, DESCT, $ WORK(IPT0), 1, 1, DESCTZ0, ICTXT ) IF( ICTXT_NEW.GE.0 ) THEN CALL PSLASET( 'All', JW, JW, ZERO, ONE, WORK(IPZ0), 1, 1, $ DESCTZ0 ) NMIN = PILAENVX( ICTXT_NEW, 12, 'PSLAQR3', 'SV', JW, 1, JW, $ LWORK ) IF( JW.GT.NMIN .AND. JW.LE.NMAX .AND. RECLEVEL.LT.1 ) THEN CALL PSLAQR0( .TRUE., .TRUE., JW, 1, JW, WORK(IPT0), $ DESCTZ0, SR( KWTOP ), SI( KWTOP ), 1, JW, $ WORK(IPZ0), DESCTZ0, WORK(IPW0), LWORK-IPW0+1, $ IWORK(NSEL+1), LIWORK-NSEL, INFQR, $ RECLEVEL+1 ) ELSE CALL PSLAQR1( .TRUE., .TRUE., JW, 1, JW, WORK(IPT0), $ DESCTZ0, SR( KWTOP ), SI( KWTOP ), 1, JW, $ WORK(IPZ0), DESCTZ0, WORK(IPW0), LWORK-IPW0+1, $ IWORK(NSEL+1), LIWORK-NSEL, INFQR ) END IF END IF CALL PSGEMR2D( JW, JW, WORK(IPT0), 1, 1, DESCTZ0, T, 1+IROFFH, $ 1+IROFFH, DESCT, ICTXT ) CALL PSGEMR2D( JW, JW, WORK(IPZ0), 1, 1, DESCTZ0, V, 1+IROFFH, $ 1+IROFFH, DESCV, ICTXT ) IF( ICTXT_NEW.GE.0 ) $ CALL BLACS_GRIDEXIT( ICTXT_NEW ) IF( MYROW+MYCOL.GT.0 ) THEN DO 40 J = 0, JW-1 SR( KWTOP+J ) = ZERO SI( KWTOP+J ) = ZERO 40 CONTINUE END IF CALL IGAMN2D( ICTXT, 'All', '1-Tree', 1, 1, INFQR, 1, -1, -1, $ -1, -1, -1 ) CALL SGSUM2D( ICTXT, 'All', ' ', JW, 1, SR(KWTOP), JW, -1, -1 ) CALL SGSUM2D( ICTXT, 'All', ' ', JW, 1, SI(KWTOP), JW, -1, -1 ) END IF * * Adjust INFQR for offset from block border in submatrices. * IF( INFQR.NE.0 ) $ INFQR = INFQR - IROFFH * * PSTRORD needs a clean margin near the diagonal. * DO 50 J = 1, JW - 3 CALL PSELSET( T, J+2, J, DESCT, ZERO ) CALL PSELSET( T, J+3, J, DESCT, ZERO ) 50 CONTINUE IF( JW.GT.2 ) $ CALL PSELSET( T, JW, JW-2, DESCT, ZERO ) * * Check local residual for AED Schur decomposition. * RESAED = 0.0 * * Clean up the array SELECT for PSTRORD. * DO 60 J = 1, NSEL IWORK( J ) = 0 60 CONTINUE * * Set local M counter to zero. * MLOC = 0 * * Outer deflation detection loop (label 80). * In this loop a bunch of undeflatable eigenvalues * are moved simultaneously. * DO 70 J = 1, IROFFH + INFQR IWORK( J ) = 1 70 CONTINUE * NS = JW ILST = INFQR + 1 + IROFFH IF( ILST.GT.1 ) THEN CALL PSELGET( 'All', '1-Tree', ELEM, T, ILST, ILST-1, DESCT ) BULGE = ELEM.NE.ZERO IF( BULGE ) ILST = ILST+1 END IF * 80 CONTINUE IF( ILST.LE.NS+IROFFH ) THEN * * Find the top-left corner of the local window. * LILST = MAX(ILST,NS+IROFFH-NB+1) IF( LILST.GT.1 ) THEN CALL PSELGET( 'All', '1-Tree', ELEM, T, LILST, LILST-1, $ DESCT ) BULGE = ELEM.NE.ZERO IF( BULGE ) LILST = LILST+1 END IF * * Lock all eigenvalues outside the local window. * DO 90 J = IROFFH+1, LILST-1 IWORK( J ) = 1 90 CONTINUE LILST0 = LILST * * Inner deflation detection loop (label 100). * In this loop, the undeflatable eigenvalues are moved to the * top-left corner of the local window. * 100 CONTINUE IF( LILST.LE.NS+IROFFH ) THEN IF( NS.EQ.1 ) THEN BULGE = .FALSE. ELSE CALL PSELGET( 'All', '1-Tree', ELEM, T, NS+IROFFH, $ NS+IROFFH-1, DESCT ) BULGE = ELEM.NE.ZERO END IF * * Small spike tip test for deflation. * IF( .NOT.BULGE ) THEN * * Real eigenvalue. * CALL PSELGET( 'All', '1-Tree', ELEM, T, NS+IROFFH, $ NS+IROFFH, DESCT ) FOO = ABS( ELEM ) IF( FOO.EQ.ZERO ) $ FOO = ABS( S ) CALL PSELGET( 'All', '1-Tree', ELEM, V, 1+IROFFH, $ NS+IROFFH, DESCV ) IF( ABS( S*ELEM ).LE.MAX( SMLNUM, ULP*FOO ) ) THEN * * Deflatable. * NS = NS - 1 ELSE * * Undeflatable: move it up out of the way. * IFST = NS DO 110 J = LILST, JW+IROFFH IWORK( J ) = 0 110 CONTINUE IWORK( IFST+IROFFH ) = 1 CALL PSTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, $ 1, DESCT, V, 1, 1, DESCV, WORK, $ WORK(JW+IROFFH+1), MLOC, $ WORK(2*(JW+IROFFH)+1), LWORK-2*(JW+IROFFH), $ IWORK(NSEL+1), LIWORK-NSEL, INFO ) * * Adjust the array SELECT explicitly so that it does not * rely on the output of PSTRORD. * IWORK( IFST+IROFFH ) = 0 IWORK( LILST ) = 1 LILST = LILST + 1 * * In case of a rare exchange failure, adjust the * pointers ILST and LILST to the current place to avoid * unexpected behaviors. * IF( INFO.NE.0 ) THEN LILST = MAX(INFO, LILST) ILST = MAX(INFO, ILST) END IF END IF ELSE * * Complex conjugate pair. * CALL PSELGET( 'All', '1-Tree', ELEM1, T, NS+IROFFH, $ NS+IROFFH, DESCT ) CALL PSELGET( 'All', '1-Tree', ELEM2, T, NS+IROFFH, $ NS+IROFFH-1, DESCT ) CALL PSELGET( 'All', '1-Tree', ELEM3, T, NS+IROFFH-1, $ NS+IROFFH, DESCT ) FOO = ABS( ELEM1 ) + SQRT( ABS( ELEM2 ) )* $ SQRT( ABS( ELEM3 ) ) IF( FOO.EQ.ZERO ) $ FOO = ABS( S ) CALL PSELGET( 'All', '1-Tree', ELEM1, V, 1+IROFFH, $ NS+IROFFH, DESCV ) CALL PSELGET( 'All', '1-Tree', ELEM2, V, 1+IROFFH, $ NS+IROFFH-1, DESCV ) IF( MAX( ABS( S*ELEM1 ), ABS( S*ELEM2 ) ).LE. $ MAX( SMLNUM, ULP*FOO ) ) THEN * * Deflatable. * NS = NS - 2 ELSE * * Undeflatable: move them up out of the way. * IFST = NS DO 120 J = LILST, JW+IROFFH IWORK( J ) = 0 120 CONTINUE IWORK( IFST+IROFFH ) = 1 IWORK( IFST+IROFFH-1 ) = 1 CALL PSTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, $ 1, DESCT, V, 1, 1, DESCV, WORK, $ WORK(JW+IROFFH+1), MLOC, $ WORK(2*(JW+IROFFH)+1), LWORK-2*(JW+IROFFH), $ IWORK(NSEL+1), LIWORK-NSEL, INFO ) * * Adjust the array SELECT explicitly so that it does not * rely on the output of PSTRORD. * IWORK( IFST+IROFFH ) = 0 IWORK( IFST+IROFFH-1 ) = 0 IWORK( LILST ) = 1 IWORK( LILST+1 ) = 1 LILST = LILST + 2 * * In case of a rare exchange failure, adjust the * pointers ILST and LILST to the current place to avoid * unexpected behaviors. * IF( INFO.NE.0 ) THEN LILST = MAX(INFO, LILST) ILST = MAX(INFO, ILST) END IF END IF END IF * * End of inner deflation detection loop. * GO TO 100 END IF * * Unlock the eigenvalues outside the local window. * Then undeflatable eigenvalues are moved to the proper position. * DO 130 J = ILST, LILST0-1 IWORK( J ) = 0 130 CONTINUE CALL PSTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, 1, $ DESCT, V, 1, 1, DESCV, WORK, WORK(JW+IROFFH+1), $ M, WORK(2*(JW+IROFFH)+1), LWORK-2*(JW+IROFFH), $ IWORK(NSEL+1), LIWORK-NSEL, INFO ) ILST = M + 1 * * In case of a rare exchange failure, adjust the pointer ILST to * the current place to avoid unexpected behaviors. * IF( INFO.NE.0 ) $ ILST = MAX(INFO, ILST) * * End of outer deflation detection loop. * GO TO 80 END IF * * Post-reordering step: copy output eigenvalues to output. * CALL SCOPY( JW, WORK(1+IROFFH), 1, SR( KWTOP ), 1 ) CALL SCOPY( JW, WORK(JW+2*IROFFH+1), 1, SI( KWTOP ), 1 ) * * Check local residual for reordered AED Schur decomposition. * RESAED = 0.0 * * Return to Hessenberg form. * IF( NS.EQ.0 ) $ S = ZERO * IF( NS.LT.JW .AND. SORTGRAD ) THEN * * Sorting diagonal blocks of T improves accuracy for * graded matrices. Bubble sort deals well with exchange * failures. Eigenvalues/shifts from T are also restored. * ROUND = 0 SORTED = .FALSE. I = NS + 1 + IROFFH 140 CONTINUE IF( SORTED ) $ GO TO 180 SORTED = .TRUE. ROUND = ROUND + 1 * KEND = I - 1 I = INFQR + 1 + IROFFH IF( I.EQ.NS+IROFFH ) THEN K = I + 1 ELSE IF( SI( KWTOP-IROFFH + I-1 ).EQ.ZERO ) THEN K = I + 1 ELSE K = I + 2 END IF 150 CONTINUE IF( K.LE.KEND ) THEN IF( K.EQ.I+1 ) THEN EVI = ABS( SR( KWTOP-IROFFH+I-1 ) ) ELSE EVI = ABS( SR( KWTOP-IROFFH+I-1 ) ) + $ ABS( SI( KWTOP-IROFFH+I-1 ) ) END IF * IF( K.EQ.KEND ) THEN EVK = ABS( SR( KWTOP-IROFFH+K-1 ) ) ELSEIF( SI( KWTOP-IROFFH+K-1 ).EQ.ZERO ) THEN EVK = ABS( SR( KWTOP-IROFFH+K-1 ) ) ELSE EVK = ABS( SR( KWTOP-IROFFH+K-1 ) ) + $ ABS( SI( KWTOP-IROFFH+K-1 ) ) END IF * IF( EVI.GE.EVK ) THEN I = K ELSE MLOC = 0 SORTED = .FALSE. IFST = I ILST = K DO 160 J = 1, I-1 IWORK( J ) = 1 MLOC = MLOC + 1 160 CONTINUE IF( K.EQ.I+2 ) THEN IWORK( I ) = 0 IWORK(I+1) = 0 ELSE IWORK( I ) = 0 END IF IF( K.NE.KEND .AND. SI( KWTOP-IROFFH+K-1 ).NE.ZERO ) THEN IWORK( K ) = 1 IWORK(K+1) = 1 MLOC = MLOC + 2 ELSE IWORK( K ) = 1 IF( K.LT.KEND ) IWORK(K+1) = 0 MLOC = MLOC + 1 END IF DO 170 J = K+2, JW+IROFFH IWORK( J ) = 0 170 CONTINUE CALL PSTRORD( 'Vectors', IWORK, PAR, JW+IROFFH, T, 1, 1, $ DESCT, V, 1, 1, DESCV, WORK, WORK(JW+IROFFH+1), M, $ WORK(2*(JW+IROFFH)+1), LWORK-2*(JW+IROFFH), $ IWORK(NSEL+1), LIWORK-NSEL, IERR ) CALL SCOPY( JW, WORK(1+IROFFH), 1, SR( KWTOP ), 1 ) CALL SCOPY( JW, WORK(JW+2*IROFFH+1), 1, SI( KWTOP ), 1 ) IF( IERR.EQ.0 ) THEN I = ILST ELSE I = K END IF END IF IF( I.EQ.KEND ) THEN K = I + 1 ELSE IF( SI( KWTOP-IROFFH+I-1 ).EQ.ZERO ) THEN K = I + 1 ELSE K = I + 2 END IF GO TO 150 END IF GO TO 140 180 CONTINUE END IF * * Restore number of rows and columns of T matrix descriptor. * DESCT( M_ ) = NW+IROFFH DESCT( N_ ) = NH+IROFFH * IF( NS.LT.JW .OR. S.EQ.ZERO ) THEN IF( NS.GT.1 .AND. S.NE.ZERO ) THEN * * Reflect spike back into lower triangle. * RROWS = NUMROC( NS+IROFFH, NB, MYROW, DESCV(RSRC_), NPROW ) RCOLS = NUMROC( 1, 1, MYCOL, DESCV(CSRC_), NPCOL ) CALL DESCINIT( DESCR, NS+IROFFH, 1, NB, 1, DESCV(RSRC_), $ DESCV(CSRC_), ICTXT, MAX(1, RROWS), INFO ) TAUROWS = NUMROC( 1, 1, MYCOL, DESCV(RSRC_), NPROW ) TAUCOLS = NUMROC( JW+IROFFH, NB, MYCOL, DESCV(CSRC_), $ NPCOL ) CALL DESCINIT( DESCTAU, 1, JW+IROFFH, 1, NB, DESCV(RSRC_), $ DESCV(CSRC_), ICTXT, MAX(1, TAUROWS), INFO ) * IR = 1 ITAU = IR + DESCR( LLD_ ) * RCOLS IPW = ITAU + DESCTAU( LLD_ ) * TAUCOLS * CALL PSLASET( 'All', NS+IROFFH, 1, ZERO, ZERO, WORK(ITAU), $ 1, 1, DESCTAU ) * CALL PSCOPY( NS, V, 1+IROFFH, 1+IROFFH, DESCV, DESCV(M_), $ WORK(IR), 1+IROFFH, 1, DESCR, 1 ) CALL PSLARFG( NS, BETA, 1+IROFFH, 1, WORK(IR), 2+IROFFH, 1, $ DESCR, 1, WORK(ITAU+IROFFH) ) CALL PSELSET( WORK(IR), 1+IROFFH, 1, DESCR, ONE ) * CALL PSLASET( 'Lower', JW-2, JW-2, ZERO, ZERO, T, 3+IROFFH, $ 1+IROFFH, DESCT ) * CALL PSLARF( 'Left', NS, JW, WORK(IR), 1+IROFFH, 1, DESCR, $ 1, WORK(ITAU+IROFFH), T, 1+IROFFH, 1+IROFFH, $ DESCT, WORK( IPW ) ) CALL PSLARF( 'Right', NS, NS, WORK(IR), 1+IROFFH, 1, DESCR, $ 1, WORK(ITAU+IROFFH), T, 1+IROFFH, 1+IROFFH, $ DESCT, WORK( IPW ) ) CALL PSLARF( 'Right', JW, NS, WORK(IR), 1+IROFFH, 1, DESCR, $ 1, WORK(ITAU+IROFFH), V, 1+IROFFH, 1+IROFFH, $ DESCV, WORK( IPW ) ) * ITAU = 1 IPW = ITAU + DESCTAU( LLD_ ) * TAUCOLS CALL PSGEHRD( JW+IROFFH, 1+IROFFH, NS+IROFFH, T, 1, 1, $ DESCT, WORK(ITAU), WORK( IPW ), LWORK-IPW+1, INFO ) END IF * * Copy updated reduced window into place. * IF( KWTOP.GT.1 ) THEN CALL PSELGET( 'All', '1-Tree', ELEM, V, 1+IROFFH, $ 1+IROFFH, DESCV ) CALL PSELSET( H, KWTOP, KWTOP-1, DESCH, S*ELEM ) END IF CALL PSLACPY( 'Upper', JW-1, JW-1, T, 1+IROFFH+1, 1+IROFFH, $ DESCT, H, KWTOP+1, KWTOP, DESCH ) CALL PSLACPY( 'All', 1, JW, T, 1+IROFFH, 1+IROFFH, DESCT, H, $ KWTOP, KWTOP, DESCH ) CALL PSLACPY( 'All', JW-1, 1, T, 1+IROFFH+1, 1+IROFFH+JW-1, $ DESCT, H, KWTOP+1, KWTOP+JW-1, DESCH ) * * Accumulate orthogonal matrix in order to update * H and Z, if requested. * IF( NS.GT.1 .AND. S.NE.ZERO ) THEN CALL PSORMHR( 'Right', 'No', JW+IROFFH, NS+IROFFH, 1+IROFFH, $ NS+IROFFH, T, 1, 1, DESCT, WORK(ITAU), V, 1, $ 1, DESCV, WORK( IPW ), LWORK-IPW+1, INFO ) END IF * * Update vertical slab in H. * IF( WANTT ) THEN LTOP = 1 ELSE LTOP = KTOP END IF KLN = MAX( 0, KWTOP-LTOP ) IROFFHH = MOD( LTOP-1, NB ) ICOFFHH = MOD( KWTOP-1, NB ) HHRSRC = INDXG2P( LTOP, NB, MYROW, DESCH(RSRC_), NPROW ) HHCSRC = INDXG2P( KWTOP, NB, MYCOL, DESCH(CSRC_), NPCOL ) HHROWS = NUMROC( KLN+IROFFHH, NB, MYROW, HHRSRC, NPROW ) HHCOLS = NUMROC( JW+ICOFFHH, NB, MYCOL, HHCSRC, NPCOL ) CALL DESCINIT( DESCHH, KLN+IROFFHH, JW+ICOFFHH, NB, NB, $ HHRSRC, HHCSRC, ICTXT, MAX(1, HHROWS), IERR ) CALL PSGEMM( 'No', 'No', KLN, JW, JW, ONE, H, LTOP, $ KWTOP, DESCH, V, 1+IROFFH, 1+IROFFH, DESCV, ZERO, $ WORK, 1+IROFFHH, 1+ICOFFHH, DESCHH ) CALL PSLACPY( 'All', KLN, JW, WORK, 1+IROFFHH, 1+ICOFFHH, $ DESCHH, H, LTOP, KWTOP, DESCH ) * * Update horizontal slab in H. * IF( WANTT ) THEN KLN = N-KBOT IROFFHH = MOD( KWTOP-1, NB ) ICOFFHH = MOD( KBOT, NB ) HHRSRC = INDXG2P( KWTOP, NB, MYROW, DESCH(RSRC_), NPROW ) HHCSRC = INDXG2P( KBOT+1, NB, MYCOL, DESCH(CSRC_), NPCOL ) HHROWS = NUMROC( JW+IROFFHH, NB, MYROW, HHRSRC, NPROW ) HHCOLS = NUMROC( KLN+ICOFFHH, NB, MYCOL, HHCSRC, NPCOL ) CALL DESCINIT( DESCHH, JW+IROFFHH, KLN+ICOFFHH, NB, NB, $ HHRSRC, HHCSRC, ICTXT, MAX(1, HHROWS), IERR ) CALL PSGEMM( 'Tr', 'No', JW, KLN, JW, ONE, V, $ 1+IROFFH, 1+IROFFH, DESCV, H, KWTOP, KBOT+1, $ DESCH, ZERO, WORK, 1+IROFFHH, 1+ICOFFHH, DESCHH ) CALL PSLACPY( 'All', JW, KLN, WORK, 1+IROFFHH, 1+ICOFFHH, $ DESCHH, H, KWTOP, KBOT+1, DESCH ) END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN KLN = IHIZ-ILOZ+1 IROFFZZ = MOD( ILOZ-1, NB ) ICOFFZZ = MOD( KWTOP-1, NB ) ZZRSRC = INDXG2P( ILOZ, NB, MYROW, DESCZ(RSRC_), NPROW ) ZZCSRC = INDXG2P( KWTOP, NB, MYCOL, DESCZ(CSRC_), NPCOL ) ZZROWS = NUMROC( KLN+IROFFZZ, NB, MYROW, ZZRSRC, NPROW ) ZZCOLS = NUMROC( JW+ICOFFZZ, NB, MYCOL, ZZCSRC, NPCOL ) CALL DESCINIT( DESCZZ, KLN+IROFFZZ, JW+ICOFFZZ, NB, NB, $ ZZRSRC, ZZCSRC, ICTXT, MAX(1, ZZROWS), IERR ) CALL PSGEMM( 'No', 'No', KLN, JW, JW, ONE, Z, ILOZ, $ KWTOP, DESCZ, V, 1+IROFFH, 1+IROFFH, DESCV, $ ZERO, WORK, 1+IROFFZZ, 1+ICOFFZZ, DESCZZ ) CALL PSLACPY( 'All', KLN, JW, WORK, 1+IROFFZZ, 1+ICOFFZZ, $ DESCZZ, Z, ILOZ, KWTOP, DESCZ ) END IF END IF * * Return the number of deflations (ND) and the number of shifts (NS). * (Subtracting INFQR from the spike length takes care of the case of * a rare QR failure while calculating eigenvalues of the deflation * window.) * ND = JW - NS NS = NS - INFQR * * Return optimal workspace. * WORK( 1 ) = FLOAT( LWKOPT ) IWORK( 1 ) = ILWKOPT + NSEL * * End of PSLAQR3 * END scalapack-2.0.2/SRC/pslaqr4.f000644 000766 000024 00000064665 11750130340 016110 0ustar00juliestaff000000 000000 SUBROUTINE PSLAQR4( WANTT, WANTZ, N, ILO, IHI, A, DESCA, WR, WI, $ ILOZ, IHIZ, Z, DESCZ, T, LDT, V, LDV, WORK, $ LWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, INFO, LDT, LDV, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) REAL A( * ), T( LDT, * ), V( LDV, * ), WI( * ), $ WORK( * ), WR( * ), Z( * ) * .. * * Purpose * ======= * * PSLAQR4 is an auxiliary routine used to find the Schur decomposition * and or eigenvalues of a matrix already in Hessenberg form from cols * ILO to IHI. This routine requires that the active block is small * enough, i.e. IHI-ILO+1 .LE. LDT, so that it can be solved by LAPACK. * Normally, it is called by PSLAQR1. All the inputs are assumed to be * valid without checking. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix A (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that A is already upper quasi-triangular in * rows and columns IHI+1:N, and that A(ILO,ILO-1) = 0 (unless * ILO = 1). PSLAQR4 works primarily with the Hessenberg * submatrix in rows and columns ILO to IHI, but applies * transformations to all of H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * A (global input/output) REAL array, dimension * (DESCA(LLD_),*) * On entry, the upper Hessenberg matrix A. * On exit, if WANTT is .TRUE., A is upper quasi-triangular in * rows and columns ILO:IHI, with any 2-by-2 or larger diagonal * blocks not yet in standard form. If WANTT is .FALSE., the * contents of A are unspecified on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WR (global replicated output) REAL array, * dimension (N) * WI (global replicated output) REAL array, * dimension (N) * The real and imaginary parts, respectively, of the computed * eigenvalues ILO to IHI are stored in the corresponding * elements of WR and WI. If two eigenvalues are computed as a * complex conjugate pair, they are stored in consecutive * elements of WR and WI, say the i-th and (i+1)th, with * WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in A. A may be returned with * larger diagonal blocks until the next release. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (global input/output) REAL array. * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations accumulated by PDHSEQR, and on * exit Z has been updated; transformations are applied only to * the submatrix Z(ILOZ:IHIZ,ILO:IHI). * If WANTZ is .FALSE., Z is not referenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * T (local workspace) REAL array, dimension LDT*NW. * * LDT (local input) INTEGER * The leading dimension of the array T. * LDT >= IHI-ILO+1. * * V (local workspace) REAL array, dimension LDV*NW. * * LDV (local input) INTEGER * The leading dimension of the array V. * LDV >= IHI-ILO+1. * * WORK (local workspace) REAL array, dimension LWORK. * * LWORK (local input) INTEGER * The dimension of the work array WORK. * LWORK >= IHI-ILO+1. * WORK(LWORK) is a local array and LWORK is assumed big enough. * Typically LWORK >= 4*LDS*LDS if this routine is called by * PSLAQR1. (LDS = 385, see PSLAQR1) * * INFO (global output) INTEGER * < 0: parameter number -INFO incorrect or inconsistent; * = 0: successful exit; * > 0: PSLAQR4 failed to compute all the eigenvalues ILO to IHI * in a total of 30*(IHI-ILO+1) iterations; if INFO = i, * elements i+1:ihi of WR and WI contain those eigenvalues * which have been successfully computed. * * ================================================================ * Implemented by * Meiyue Shao, Department of Computing Science and HPC2N, * Umea University, Sweden * * ================================================================ * References: * B. Kagstrom, D. Kressner, and M. Shao, * On Aggressive Early Deflation in Parallel Variants of the QR * Algorithm. * Para 2010, to appear. * * ================================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) * .. * .. Local Scalars .. INTEGER CONTXT, HBL, I, I1, I2, IAFIRST, ICOL, ICOL1, $ ICOL2, II, IROW, IROW1, IROW2, ITMP1, ITMP2, $ IERR, J, JAFIRST, JJ, K, L, LDA, LDZ, LLDTMP, $ MYCOL, MYROW, NODE, NPCOL, NPROW, NH, NMIN, NZ, $ HSTEP, VSTEP, KKROW, KKCOL, KLN, LTOP, LEFT, $ RIGHT, UP, DOWN, D1, D2 * .. * .. Local Arrays .. INTEGER DESCT( 9 ), DESCV( 9 ), DESCWH( 9 ), $ DESCWV( 9 ) * .. * .. External Functions .. INTEGER NUMROC, ILAENV EXTERNAL NUMROC, ILAENV * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SLASET, $ SLAHQR, SLAQR4, DESCINIT, PSGEMM, PSGEMR2D, $ SGEMM, SLAMOV, SGESD2D, SGERV2D, $ SGEBS2D, SGEBR2D, IGEBS2D, IGEBR2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 IF( N.EQ.0 .OR. NH.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) * * I1 and I2 are the indices of the first row and last column of A * to which transformations must be applied. * I = IHI L = ILO IF( WANTT ) THEN I1 = 1 I2 = N LTOP = 1 ELSE I1 = L I2 = I LTOP = L END IF * * Copy the diagonal block to local and call LAPACK. * CALL INFOG2L( ILO, ILO, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF ( MYROW .EQ. II ) THEN CALL DESCINIT( DESCT, NH, NH, NH, NH, II, JJ, CONTXT, $ LDT, IERR ) CALL DESCINIT( DESCV, NH, NH, NH, NH, II, JJ, CONTXT, $ LDV, IERR ) ELSE CALL DESCINIT( DESCT, NH, NH, NH, NH, II, JJ, CONTXT, $ 1, IERR ) CALL DESCINIT( DESCV, NH, NH, NH, NH, II, JJ, CONTXT, $ 1, IERR ) END IF CALL PSGEMR2D( NH, NH, A, ILO, ILO, DESCA, T, 1, 1, DESCT, $ CONTXT ) IF ( MYROW .EQ. II .AND. MYCOL .EQ. JJ ) THEN CALL SLASET( 'All', NH, NH, ZERO, ONE, V, LDV ) NMIN = ILAENV( 12, 'SLAQR3', 'SV', NH, 1, NH, LWORK ) IF( NH .GT. NMIN ) THEN CALL SLAQR4( .TRUE., .TRUE., NH, 1, NH, T, LDT, WR( ILO ), $ WI( ILO ), 1, NH, V, LDV, WORK, LWORK, INFO ) * Clean up the scratch used by SLAQR4. CALL SLASET( 'L', NH-2, NH-2, ZERO, ZERO, T( 3, 1 ), LDT ) ELSE CALL SLAHQR( .TRUE., .TRUE., NH, 1, NH, T, LDT, WR( ILO ), $ WI( ILO ), 1, NH, V, LDV, INFO ) END IF CALL SGEBS2D( CONTXT, 'All', ' ', NH, NH, V, LDV ) CALL IGEBS2D( CONTXT, 'All', ' ', 1, 1, INFO, 1 ) ELSE CALL SGEBR2D( CONTXT, 'All', ' ', NH, NH, V, LDV, II, JJ ) CALL IGEBR2D( CONTXT, 'All', ' ', 1, 1, INFO, 1, II, JJ ) END IF IF( INFO .NE. 0 ) INFO = INFO+ILO-1 * * Copy the local matrix back to the diagonal block. * CALL PSGEMR2D( NH, NH, T, 1, 1, DESCT, A, ILO, ILO, DESCA, $ CONTXT ) * * Update T and Z. * IF( MOD( ILO-1, HBL )+NH .LE. HBL ) THEN * * Simplest case: the diagonal block is located on one processor. * Call SGEMM directly to perform the update. * HSTEP = LWORK / NH VSTEP = HSTEP * IF( WANTT ) THEN * * Update horizontal slab in A. * CALL INFOG2L( ILO, I+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 10 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL SGEMM( 'T', 'N', NH, KLN, NH, ONE, V, $ LDV, A( IROW+(KKCOL-1)*LDA ), LDA, ZERO, WORK, $ NH ) CALL SLAMOV( 'A', NH, KLN, WORK, NH, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 10 CONTINUE END IF * * Update vertical slab in A. * CALL INFOG2L( LTOP, ILO, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( ILO-1, ILO, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 20 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, NH, NH, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ZERO, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, NH, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 20 CONTINUE END IF END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN CALL INFOG2L( ILOZ, ILO, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, ILO, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 30 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, NH, NH, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ZERO, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, NH, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 30 CONTINUE END IF END IF * ELSE IF( MOD( ILO-1, HBL )+NH .LE. 2*HBL ) THEN * * More complicated case: the diagonal block lay on a 2x2 * processor mesh. * Call SGEMM locally and communicate by pair. * D1 = HBL - MOD( ILO-1, HBL ) D2 = NH - D1 HSTEP = LWORK / NH VSTEP = HSTEP * IF( WANTT ) THEN * * Update horizontal slab in A. * CALL INFOG2L( ILO, I+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW .EQ. UP ) THEN IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 40 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL SGEMM( 'T', 'N', NH, KLN, NH, ONE, V, $ NH, A( IROW+(KKCOL-1)*LDA ), LDA, ZERO, $ WORK, NH ) CALL SLAMOV( 'A', NH, KLN, WORK, NH, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 40 CONTINUE END IF ELSE IF( MYROW .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 50 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL SGEMM( 'T', 'N', D2, KLN, D1, ONE, $ V( 1, D1+1 ), LDV, A( IROW+(KKCOL-1)*LDA ), $ LDA, ZERO, WORK( D1+1 ), NH ) CALL SGESD2D( CONTXT, D2, KLN, WORK( D1+1 ), $ NH, DOWN, MYCOL ) CALL SGERV2D( CONTXT, D1, KLN, WORK, NH, DOWN, $ MYCOL ) CALL SGEMM( 'T', 'N', D1, KLN, D1, ONE, $ V, LDV, A( IROW+(KKCOL-1)*LDA ), LDA, ONE, $ WORK, NH ) CALL SLAMOV( 'A', D1, KLN, WORK, NH, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 50 CONTINUE ELSE IF( UP .EQ. II ) THEN ICOL1 = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) DO 60 KKCOL = ICOL, ICOL1, HSTEP KLN = MIN( HSTEP, ICOL1-KKCOL+1 ) CALL SGEMM( 'T', 'N', D1, KLN, D2, ONE, $ V( D1+1, 1 ), LDV, A( IROW+(KKCOL-1)*LDA ), $ LDA, ZERO, WORK, NH ) CALL SGESD2D( CONTXT, D1, KLN, WORK, NH, UP, $ MYCOL ) CALL SGERV2D( CONTXT, D2, KLN, WORK( D1+1 ), $ NH, UP, MYCOL ) CALL SGEMM( 'T', 'N', D2, KLN, D2, ONE, $ V( D1+1, D1+1 ), LDV, $ A( IROW+(KKCOL-1)*LDA ), LDA, ONE, $ WORK( D1+1 ), NH ) CALL SLAMOV( 'A', D2, KLN, WORK( D1+1 ), NH, $ A( IROW+(KKCOL-1)*LDA ), LDA ) 60 CONTINUE END IF END IF * * Update vertical slab in A. * CALL INFOG2L( LTOP, ILO, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. LEFT ) THEN IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( ILO-1, ILO, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 70 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, NH, NH, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, $ ZERO, WORK, KLN ) CALL SLAMOV( 'A', KLN, NH, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 70 CONTINUE END IF ELSE IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( ILO-1, ILO, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 80 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, D2, D1, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( 1, D1+1 ), $ LDV, ZERO, WORK( 1+D1*KLN ), KLN ) CALL SGESD2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, RIGHT ) CALL SGERV2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ RIGHT ) CALL SGEMM( 'N', 'N', KLN, D1, D1, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V, LDV, ONE, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, D1, WORK, KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 80 CONTINUE ELSE IF ( LEFT .EQ. JJ ) THEN CALL INFOG2L( ILO-1, ILO, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 90 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, D1, D2, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( D1+1, 1 ), $ LDV, ZERO, WORK, KLN ) CALL SGESD2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ LEFT ) CALL SGERV2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, LEFT ) CALL SGEMM( 'N', 'N', KLN, D2, D2, ONE, $ A( KKROW+(ICOL-1)*LDA ), LDA, V( D1+1, D1+1 ), $ LDV, ONE, WORK( 1+D1*KLN ), KLN ) CALL SLAMOV( 'A', KLN, D2, WORK( 1+D1*KLN ), KLN, $ A( KKROW+(ICOL-1)*LDA ), LDA ) 90 CONTINUE END IF END IF END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN CALL INFOG2L( ILOZ, ILO, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYCOL .EQ. LEFT ) THEN IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, ILO, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 100 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, NH, NH, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ZERO, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, NH, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 100 CONTINUE END IF ELSE IF( MYCOL .EQ. JJ ) THEN CALL INFOG2L( IHIZ, ILO, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 110 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, D2, D1, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V( 1, D1+1 ), $ LDV, ZERO, WORK( 1+D1*KLN ), KLN ) CALL SGESD2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, RIGHT ) CALL SGERV2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ RIGHT ) CALL SGEMM( 'N', 'N', KLN, D1, D1, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V, LDV, ONE, $ WORK, KLN ) CALL SLAMOV( 'A', KLN, D1, WORK, KLN, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 110 CONTINUE ELSE IF( LEFT .EQ. JJ ) THEN CALL INFOG2L( IHIZ, ILO, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, IROW1, ICOL1, ITMP1, ITMP2 ) IF( MYROW .NE. ITMP1 ) IROW1 = IROW1-1 DO 120 KKROW = IROW, IROW1, VSTEP KLN = MIN( VSTEP, IROW1-KKROW+1 ) CALL SGEMM( 'N', 'N', KLN, D1, D2, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, V( D1+1, 1 ), $ LDV, ZERO, WORK, KLN ) CALL SGESD2D( CONTXT, KLN, D1, WORK, KLN, MYROW, $ LEFT ) CALL SGERV2D( CONTXT, KLN, D2, WORK( 1+D1*KLN ), $ KLN, MYROW, LEFT ) CALL SGEMM( 'N', 'N', KLN, D2, D2, ONE, $ Z( KKROW+(ICOL-1)*LDZ ), LDZ, $ V( D1+1, D1+1 ), LDV, ONE, WORK( 1+D1*KLN ), $ KLN ) CALL SLAMOV( 'A', KLN, D2, WORK( 1+D1*KLN ), $ KLN, Z( KKROW+(ICOL-1)*LDZ ), LDZ ) 120 CONTINUE END IF END IF END IF * ELSE * * Most complicated case: the diagonal block lay across the border * of the processor mesh. * Treat V as a distributed matrix and call PSGEMM. * HSTEP = LWORK / NH * NPCOL VSTEP = LWORK / NH * NPROW LLDTMP = NUMROC( NH, NH, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCV, NH, NH, NH, NH, 0, 0, CONTXT, $ LLDTMP, IERR ) CALL DESCINIT( DESCWH, NH, HSTEP, NH, LWORK / NH, 0, 0, $ CONTXT, LLDTMP, IERR ) * IF( WANTT ) THEN * * Update horizontal slab in A. * DO 130 KKCOL = I+1, N, HSTEP KLN = MIN( HSTEP, N-KKCOL+1 ) CALL PSGEMM( 'T', 'N', NH, KLN, NH, ONE, V, 1, 1, $ DESCV, A, ILO, KKCOL, DESCA, ZERO, WORK, 1, 1, $ DESCWH ) CALL PSGEMR2D( NH, KLN, WORK, 1, 1, DESCWH, A, $ ILO, KKCOL, DESCA, CONTXT ) 130 CONTINUE * * Update vertical slab in A. * DO 140 KKROW = LTOP, ILO-1, VSTEP KLN = MIN( VSTEP, ILO-KKROW ) LLDTMP = NUMROC( KLN, LWORK / NH, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCWV, KLN, NH, LWORK / NH, NH, 0, 0, $ CONTXT, LLDTMP, IERR ) CALL PSGEMM( 'N', 'N', KLN, NH, NH, ONE, A, KKROW, $ ILO, DESCA, V, 1, 1, DESCV, ZERO, WORK, 1, 1, $ DESCWV ) CALL PSGEMR2D( KLN, NH, WORK, 1, 1, DESCWV, A, KKROW, $ ILO, DESCA, CONTXT ) 140 CONTINUE END IF * * Update vertical slab in Z. * IF( WANTZ ) THEN DO 150 KKROW = ILOZ, IHIZ, VSTEP KLN = MIN( VSTEP, IHIZ-KKROW+1 ) LLDTMP = NUMROC( KLN, LWORK / NH, MYROW, 0, NPROW ) LLDTMP = MAX( 1, LLDTMP ) CALL DESCINIT( DESCWV, KLN, NH, LWORK / NH, NH, 0, 0, $ CONTXT, LLDTMP, IERR ) CALL PSGEMM( 'N', 'N', KLN, NH, NH, ONE, Z, KKROW, $ ILO, DESCZ, V, 1, 1, DESCV, ZERO, WORK, 1, 1, $ DESCWV ) CALL PSGEMR2D( KLN, NH, WORK, 1, 1, DESCWV, Z, $ KKROW, ILO, DESCZ, CONTXT ) 150 CONTINUE END IF END IF * * END OF PSLAQR4 * END scalapack-2.0.2/SRC/pslaqr5.f000644 000766 000024 00000312140 11750130340 016071 0ustar00juliestaff000000 000000 SUBROUTINE PSLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, DESCH, ILOZ, IHIZ, Z, DESCZ, WORK, $ LWORK, IWORK, LIWORK ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, N, NSHFTS, $ LWORK, LIWORK LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. INTEGER DESCH( * ), DESCZ( * ), IWORK( * ) REAL H( * ), SI( * ), SR( * ), Z( * ), WORK( * ) * .. * * Purpose * ======= * * This auxiliary subroutine called by PSLAQR0 performs a * single small-bulge multi-shift QR sweep by chasing separated * groups of bulges along the main block diagonal of H. * * WANTT (global input) logical scalar * WANTT = .TRUE. if the quasi-triangular Schur factor * is being computed. WANTT is set to .FALSE. otherwise. * * WANTZ (global input) logical scalar * WANTZ = .TRUE. if the orthogonal Schur factor is being * computed. WANTZ is set to .FALSE. otherwise. * * KACC22 (global input) integer with value 0, 1, or 2. * Specifies the computation mode of far-from-diagonal * orthogonal updates. * = 1: PSLAQR5 accumulates reflections and uses matrix-matrix * multiply to update the far-from-diagonal matrix entries. * = 2: PSLAQR5 accumulates reflections, uses matrix-matrix * multiply to update the far-from-diagonal matrix entries, * and takes advantage of 2-by-2 block structure during * matrix multiplies. * * N (global input) integer scalar * N is the order of the Hessenberg matrix H upon which this * subroutine operates. * * KTOP (global input) integer scalar * KBOT (global input) integer scalar * These are the first and last rows and columns of an * isolated diagonal block upon which the QR sweep is to be * applied. It is assumed without a check that * either KTOP = 1 or H(KTOP,KTOP-1) = 0 * and * either KBOT = N or H(KBOT+1,KBOT) = 0. * * NSHFTS (global input) integer scalar * NSHFTS gives the number of simultaneous shifts. NSHFTS * must be positive and even. * * SR (global input) REAL array of size (NSHFTS) * SI (global input) REAL array of size (NSHFTS) * SR contains the real parts and SI contains the imaginary * parts of the NSHFTS shifts of origin that define the * multi-shift QR sweep. * * H (local input/output) REAL array of size * (DESCH(LLD_),*) * On input H contains a Hessenberg matrix. On output a * multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied * to the isolated diagonal block in rows and columns KTOP * through KBOT. * * DESCH (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix H. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N * * Z (local input/output) REAL array of size * (DESCZ(LLD_),*) * If WANTZ = .TRUE., then the QR Sweep orthogonal * similarity transformation is accumulated into * Z(ILOZ:IHIZ,ILO:IHI) from the right. * If WANTZ = .FALSE., then Z is unreferenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace) REAL array, dimension(DWORK) * * LWORK (local input) INTEGER * The length of the workspace array WORK. * * IWORK (local workspace) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The length of the workspace array IWORK. * * ================================================================ * Based on contributions by * Robert Granat, Department of Computing Science and HPC2N, * University of Umea, Sweden. * * ============================================================ * References: * K. Braman, R. Byers, and R. Mathias, * The Multi-Shift QR Algorithm Part I: Maintaining Well Focused * Shifts, and Level 3 Performance. * SIAM J. Matrix Anal. Appl., 23(4):929--947, 2002. * * R. Granat, B. Kagstrom, and D. Kressner, * A Novel Parallel QR Algorithm for Hybrid Distributed Momory HPC * Systems. * SIAM J. Sci. Comput., 32(4):2345--2378, 2010. * * ============================================================ * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0e0, ONE = 1.0e0 ) INTEGER NTINY PARAMETER ( NTINY = 11 ) * .. * .. Local Scalars .. REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP, TAU, ELEM, STAMP, DDUM, ORTH INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, $ NS, NU, LLDH, LLDZ, LLDU, LLDV, LLDW, LLDWH, $ INFO, ICTXT, NPROW, NPCOL, NB, IROFFH, ITOP, $ NWIN, MYROW, MYCOL, LNS, NUMWIN, LKACC22, $ LCHAIN, WIN, IDONEJOB, IPNEXT, ANMWIN, LENRBUF, $ LENCBUF, ICHOFF, LRSRC, LCSRC, LKTOP, LKBOT, $ II, JJ, SWIN, EWIN, LNWIN, DIM, LLKTOP, LLKBOT, $ IPV, IPU, IPH, IPW, KU, KWH, KWV, NVE, LKS, $ IDUM, NHO, DIR, WINID, INDX, ILOC, JLOC, RSRC1, $ CSRC1, RSRC2, CSRC2, RSRC3, CSRC3, RSRC4, IPUU, $ CSRC4, LROWS, LCOLS, INDXS, KS, JLOC1, ILOC1, $ LKTOP1, LKTOP2, WCHUNK, NUMCHUNK, ODDEVEN, $ CHUNKNUM, DIM1, DIM4, IPW3, HROWS, ZROWS, $ HCOLS, IPW1, IPW2, RSRC, EAST, JLOC4, ILOC4, $ WEST, CSRC, SOUTH, NORHT, INDXE, NORTH, $ IHH, IPIW, LKBOT1, NPROCS, LIROFFH, $ WINFIN, RWS3, CLS3, INDX2, HROWS2, $ ZROWS2, HCOLS2, MNRBUF, $ MXRBUF, MNCBUF, MXCBUF, LWKOPT LOGICAL BLK22, BMP22, INTRO, DONEJOB, ODDNPROW, $ ODDNPCOL, LQUERY, BCDONE CHARACTER JBCMPZ*2, JOB * .. * .. External Functions .. LOGICAL LSAME INTEGER PILAENVX, ICEIL, INDXG2P, INDXG2L, NUMROC REAL SLAMCH, SLANGE EXTERNAL SLAMCH, PILAENVX, ICEIL, INDXG2P, INDXG2L, $ NUMROC, LSAME, SLANGE * .. * .. Intrinsic Functions .. INTRINSIC ABS, FLOAT, MAX, MIN, MOD * .. * .. Local Arrays .. REAL VT( 3 ) * .. * .. External Subroutines .. EXTERNAL SGEMM, SLABAD, SLAMOV, SLAQR1, SLARFG, SLASET, $ STRMM, SLAQR6 * .. * .. Executable Statements .. * INFO = 0 ICTXT = DESCH( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL LLDH = DESCH( LLD_ ) LLDZ = DESCZ( LLD_ ) NB = DESCH( MB_ ) IROFFH = MOD( KTOP - 1, NB ) LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * If there are no shifts, then there is nothing to do. * IF( .NOT. LQUERY .AND. NSHFTS.LT.2 ) $ RETURN * * If the active block is empty or 1-by-1, then there * is nothing to do. * IF( .NOT. LQUERY .AND. KTOP.GE.KBOT ) $ RETURN * * Shuffle shifts into pairs of real shifts and pairs of * complex conjugate shifts assuming complex conjugate * shifts are already adjacent to one another. * IF( .NOT. LQUERY ) THEN DO 10 I = 1, NSHFTS - 2, 2 IF( SI( I ).NE.-SI( I+1 ) ) THEN * SWAP = SR( I ) SR( I ) = SR( I+1 ) SR( I+1 ) = SR( I+2 ) SR( I+2 ) = SWAP * SWAP = SI( I ) SI( I ) = SI( I+1 ) SI( I+1 ) = SI( I+2 ) SI( I+2 ) = SWAP END IF 10 CONTINUE END IF * * NSHFTS is supposed to be even, but if is odd, * then simply reduce it by one. The shuffle above * ensures that the dropped shift is real and that * the remaining shifts are paired. * NS = NSHFTS - MOD( NSHFTS, 2 ) * * Extract the size of the computational window. * NWIN = PILAENVX( ICTXT, 19, 'PSLAQR5', JBCMPZ, N, NB, NB, NB ) NWIN = MIN( NWIN, KBOT-KTOP+1 ) * * Adjust number of simultaneous shifts if it exceeds the limit * set by the number of diagonal blocks in the active submatrix * H(KTOP:KBOT,KTOP:KBOT). * NS = MAX( 2, MIN( NS, ICEIL( KBOT-KTOP+1, NB )*NWIN/3 ) ) NS = NS - MOD( NS, 2 ) * * Decide the number of simultaneous computational windows * from the number of shifts - each window should contain up to * (NWIN / 3) shifts. Also compute the number of shifts per * window and make sure that number is even. * LNS = MIN( MAX( 2, NWIN / 3 ), MAX( 2, NS / MIN(NPROW,NPCOL) ) ) LNS = LNS - MOD( LNS, 2 ) NUMWIN = MAX( 1, MIN( ICEIL( NS, LNS ), $ ICEIL( KBOT-KTOP+1, NB ) - 1 ) ) IF( NPROW.NE.NPCOL ) THEN NUMWIN = MIN( NUMWIN, MIN(NPROW,NPCOL) ) LNS = MIN( LNS, MAX( 2, NS / MIN(NPROW,NPCOL) ) ) LNS = LNS - MOD( LNS, 2 ) END IF * * Machine constants for deflation. * SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( FLOAT( N ) / ULP ) * * Use accumulated reflections to update far-from-diagonal * entries on a local level? * IF( LNS.LT.14 ) THEN LKACC22 = 1 ELSE LKACC22 = 2 END IF * * If so, exploit the 2-by-2 block structure? * ( Usually it is not efficient to exploit the 2-by-2 structure * because the block size is too small. ) * BLK22 = ( LNS.GT.2 ) .AND. ( KACC22.EQ.2 ) * * Clear trash. * IF( .NOT. LQUERY .AND. KTOP+2.LE.KBOT ) $ CALL PSELSET( H, KTOP+2, KTOP, DESCH, ZERO ) * * NBMPS = number of 2-shift bulges in each chain * NBMPS = LNS / 2 * * KDU = width of slab * KDU = 6*NBMPS - 3 * * LCHAIN = length of each chain * LCHAIN = 3 * NBMPS + 1 * * Check if workspace query. * IF( LQUERY ) THEN HROWS = NUMROC( N, NB, MYROW, DESCH(RSRC_), NPROW ) HCOLS = NUMROC( N, NB, MYCOL, DESCH(CSRC_), NPCOL ) LWKOPT = (5+2*NUMWIN)*NB**2 + 2*HROWS*NB + HCOLS*NB + $ MAX( HROWS*NB, HCOLS*NB ) WORK(1) = FLOAT(LWKOPT) IWORK(1) = 5*NUMWIN RETURN END IF * * Check if KTOP and KBOT are valid. * IF( KTOP.LT.1 .OR. KBOT.GT.N ) STOP * * Create and chase NUMWIN chains of NBMPS bulges. * * Set up window introduction. * ANMWIN = 0 INTRO = .TRUE. IPIW = 1 * * Main loop: * While-loop over the computational windows which is * terminated when all windows have been introduced, * chased down to the bottom of the considered submatrix * and chased off. * 20 CONTINUE * * Set up next window as long as we have less than the prescribed * number of windows. Each window is described an integer quadruple: * 1. Local value of KTOP (below denoted by LKTOP) * 2. Local value of KBOT (below denoted by LKBOT) * 3-4. Processor indices (LRSRC,LCSRC) associated with the window. * (5. Mark that decides if a window is fully processed or not) * * Notice - the next window is only introduced if the first block * in the active submatrix does not contain any other windows. * IF( ANMWIN.GT.0 ) THEN LKTOP = IWORK( 1+(ANMWIN-1)*5 ) ELSE LKTOP = KTOP END IF IF( INTRO .AND. (ANMWIN.EQ.0 .OR. LKTOP.GT.ICEIL(KTOP,NB)*NB) ) $ THEN ANMWIN = ANMWIN + 1 * * Structure of IWORK: * IWORK( 1+(WIN-1)*5 ): start position * IWORK( 2+(WIN-1)*5 ): stop position * IWORK( 3+(WIN-1)*5 ): processor row id * IWORK( 4+(WIN-1)*5 ): processor col id * IWORK( 5+(WIN-1)*5 ): window status (0, 1, or 2) * IWORK( 1+(ANMWIN-1)*5 ) = KTOP IWORK( 2+(ANMWIN-1)*5 ) = KTOP + $ MIN( NWIN,NB-IROFFH,KBOT-KTOP+1 ) - 1 IWORK( 3+(ANMWIN-1)*5 ) = INDXG2P( IWORK(1+(ANMWIN-1)*5), NB, $ MYROW, DESCH(RSRC_), NPROW ) IWORK( 4+(ANMWIN-1)*5 ) = INDXG2P( IWORK(2+(ANMWIN-1)*5), NB, $ MYCOL, DESCH(CSRC_), NPCOL ) IWORK( 5+(ANMWIN-1)*5 ) = 0 IPIW = 6+(ANMWIN-1)*5 IF( ANMWIN.EQ.NUMWIN ) INTRO = .FALSE. END IF * * Do-loop over the number of windows. * IPNEXT = 1 DONEJOB = .FALSE. IDONEJOB = 0 LENRBUF = 0 LENCBUF = 0 ICHOFF = 0 DO 40 WIN = 1, ANMWIN * * Extract window information to simplify the rest. * LRSRC = IWORK( 3+(WIN-1)*5 ) LCSRC = IWORK( 4+(WIN-1)*5 ) LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = LKBOT - LKTOP + 1 * * Check if anything to do for current window, i.e., if the local * chain of bulges has reached the next block border etc. * IF( IWORK(5+(WIN-1)*5).LT.2 .AND. LNWIN.GT.1 .AND. $ (LNWIN.GT.LCHAIN .OR. LKBOT.EQ.KBOT ) ) THEN LIROFFH = MOD(LKTOP-1,NB) SWIN = LKTOP-LIROFFH EWIN = MIN(KBOT,LKTOP-LIROFFH+NB-1) DIM = EWIN-SWIN+1 IF( DIM.LE.NTINY .AND. .NOT.LKBOT.EQ.KBOT ) THEN IWORK( 5+(WIN-1)*5 ) = 2 GO TO 45 END IF IDONEJOB = 1 IF( IWORK(5+(WIN-1)*5).EQ.0 ) THEN IWORK(5+(WIN-1)*5) = 1 END IF * * Let the process that owns the corresponding window do the * local bulge chase. * IF( MYROW.EQ.LRSRC .AND. MYCOL.EQ.LCSRC ) THEN * * Set the kind of job to do in SLAQR6: * 1. JOB = 'I': Introduce and chase bulges in window WIN * 2. JOB = 'C': Chase bulges from top to bottom of window WIN * 3. JOB = 'O': Chase bulges off window WIN * 4. JOB = 'A': All of 1-3 above is done - this will for * example happen for very small active * submatrices (like 2-by-2) * LLKBOT = LLKTOP + LNWIN - 1 IF( LKTOP.EQ.KTOP .AND. LKBOT.EQ.KBOT ) THEN JOB = 'All steps' ICHOFF = 1 ELSEIF( LKTOP.EQ.KTOP ) THEN JOB = 'Introduce and chase' ELSEIF( LKBOT.EQ.KBOT ) THEN JOB = 'Off-chase bulges' ICHOFF = 1 ELSE JOB = 'Chase bulges' END IF * * Copy submatrix of H corresponding to window WIN into * workspace and set out additional workspace for storing * orthogonal transformations. This submatrix must be at * least (NTINY+1)-by-(NTINY+1) to fit into SLAQR6 - if not, * abort and go for cross border bulge chasing with this * particular window. * II = INDXG2L( SWIN, NB, MYROW, DESCH(RSRC_), NPROW ) JJ = INDXG2L( SWIN, NB, MYCOL, DESCH(CSRC_), NPCOL ) LLKTOP = 1 + LIROFFH LLKBOT = LLKTOP + LNWIN - 1 * IPU = IPNEXT IPH = IPU + LNWIN**2 IPUU = IPH + MAX(NTINY+1,DIM)**2 IPV = IPUU + MAX(NTINY+1,DIM)**2 IPNEXT = IPH * IF( LSAME( JOB, 'A' ) .OR. LSAME( JOB, 'O' ) .AND. $ DIM.LT.NTINY+1 ) THEN CALL SLASET( 'All', NTINY+1, NTINY+1, ZERO, ONE, $ WORK(IPH), NTINY+1 ) END IF CALL SLAMOV( 'Upper', DIM, DIM, H(II+(JJ-1)*LLDH), LLDH, $ WORK(IPH), MAX(NTINY+1,DIM) ) CALL SCOPY( DIM-1, H(II+(JJ-1)*LLDH+1), LLDH+1, $ WORK(IPH+1), MAX(NTINY+1,DIM)+1 ) IF( LSAME( JOB, 'C' ) .OR. LSAME( JOB, 'O') ) THEN CALL SCOPY( DIM-2, H(II+(JJ-1)*LLDH+2), LLDH+1, $ WORK(IPH+2), MAX(NTINY+1,DIM)+1 ) CALL SCOPY( DIM-3, H(II+(JJ-1)*LLDH+3), LLDH+1, $ WORK(IPH+3), MAX(NTINY+1,DIM)+1 ) CALL SLASET( 'Lower', DIM-4, DIM-4, ZERO, $ ZERO, WORK(IPH+4), MAX(NTINY+1,DIM) ) ELSE CALL SLASET( 'Lower', DIM-2, DIM-2, ZERO, $ ZERO, WORK(IPH+2), MAX(NTINY+1,DIM) ) END IF * KU = MAX(NTINY+1,DIM) - KDU + 1 KWH = KDU + 1 NHO = ( MAX(NTINY+1,DIM)-KDU+1-4 ) - ( KDU+1 ) + 1 KWV = KDU + 4 NVE = MAX(NTINY+1,DIM) - KDU - KWV + 1 CALL SLASET( 'All', MAX(NTINY+1,DIM), $ MAX(NTINY+1,DIM), ZERO, ONE, WORK(IPUU), $ MAX(NTINY+1,DIM) ) * * Small-bulge multi-shift QR sweep. * LKS = MAX( 1, NS - WIN*LNS + 1 ) CALL SLAQR6( JOB, WANTT, .TRUE., LKACC22, $ MAX(NTINY+1,DIM), LLKTOP, LLKBOT, LNS, SR( LKS ), $ SI( LKS ), WORK(IPH), MAX(NTINY+1,DIM), LLKTOP, $ LLKBOT, WORK(IPUU), MAX(NTINY+1,DIM), WORK(IPU), $ 3, WORK( IPH+KU-1 ), $ MAX(NTINY+1,DIM), NVE, WORK( IPH+KWV-1 ), $ MAX(NTINY+1,DIM), NHO, WORK( IPH-1+KU+(KWH-1)* $ MAX(NTINY+1,DIM) ), MAX(NTINY+1,DIM) ) * * Copy submatrix of H back. * CALL SLAMOV( 'Upper', DIM, DIM, WORK(IPH), $ MAX(NTINY+1,DIM), H(II+(JJ-1)*LLDH), LLDH ) CALL SCOPY( DIM-1, WORK(IPH+1), MAX(NTINY+1,DIM)+1, $ H(II+(JJ-1)*LLDH+1), LLDH+1 ) IF( LSAME( JOB, 'I' ) .OR. LSAME( JOB, 'C' ) ) THEN CALL SCOPY( DIM-2, WORK(IPH+2), DIM+1, $ H(II+(JJ-1)*LLDH+2), LLDH+1 ) CALL SCOPY( DIM-3, WORK(IPH+3), DIM+1, $ H(II+(JJ-1)*LLDH+3), LLDH+1 ) ELSE CALL SLASET( 'Lower', DIM-2, DIM-2, ZERO, $ ZERO, H(II+(JJ-1)*LLDH+2), LLDH ) END IF * * Copy actual submatrix of U to the correct place * of the buffer. * CALL SLAMOV( 'All', LNWIN, LNWIN, $ WORK(IPUU+(MAX(NTINY+1,DIM)*LIROFFH)+LIROFFH), $ MAX(NTINY+1,DIM), WORK(IPU), LNWIN ) END IF * * In case the local submatrix was smaller than * (NTINY+1)-by-(NTINY+1) we go here and proceed. * 45 CONTINUE ELSE IWORK( 5+(WIN-1)*5 ) = 2 END IF * * Increment counter for buffers of orthogonal transformations. * IF( MYROW.EQ.LRSRC .OR. MYCOL.EQ.LCSRC ) THEN IF( IDONEJOB.EQ.1 .AND. IWORK(5+(WIN-1)*5).LT.2 ) THEN IF( MYROW.EQ.LRSRC ) LENRBUF = LENRBUF + LNWIN*LNWIN IF( MYCOL.EQ.LCSRC ) LENCBUF = LENCBUF + LNWIN*LNWIN END IF END IF 40 CONTINUE * * Did some work in the above do-loop? * CALL IGSUM2D( ICTXT, 'All', '1-Tree', 1, 1, IDONEJOB, 1, -1, -1 ) DONEJOB = IDONEJOB.GT.0 * * Chased off bulges from first window? * IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', '1-Tree', 1, 1, ICHOFF, 1, -1, $ -1, -1, -1, -1 ) * * If work was done in the do-loop over local windows, perform * updates, otherwise go for cross border bulge chasing and updates. * IF( DONEJOB ) THEN * * Broadcast orthogonal transformations. * 49 CONTINUE IF( LENRBUF.GT.0 .OR. LENCBUF.GT.0 ) THEN DO 50 DIR = 1, 2 BCDONE = .FALSE. DO 60 WIN = 1, ANMWIN IF( ( LENRBUF.EQ.0 .AND. LENCBUF.EQ.0 ) .OR. $ BCDONE ) GO TO 62 LRSRC = IWORK( 3+(WIN-1)*5 ) LCSRC = IWORK( 4+(WIN-1)*5 ) IF( MYROW.EQ.LRSRC .AND. MYCOL.EQ.LCSRC ) THEN IF( DIR.EQ.1 .AND. LENRBUF.GT.0 .AND. $ NPCOL.GT.1 ) THEN CALL SGEBS2D( ICTXT, 'Row', '1-Tree', LENRBUF, $ 1, WORK, LENRBUF ) ELSEIF( DIR.EQ.2 .AND. LENCBUF.GT.0 .AND. $ NPROW.GT.1 ) THEN CALL SGEBS2D( ICTXT, 'Col', '1-Tree', LENCBUF, $ 1, WORK, LENCBUF ) END IF IF( LENRBUF.GT.0 ) $ CALL SLAMOV( 'All', LENRBUF, 1, WORK, LENRBUF, $ WORK(1+LENRBUF), LENCBUF ) BCDONE = .TRUE. ELSEIF( MYROW.EQ.LRSRC .AND. DIR.EQ.1 ) THEN IF( LENRBUF.GT.0 .AND. NPCOL.GT.1 ) THEN CALL SGEBR2D( ICTXT, 'Row', '1-Tree', LENRBUF, $ 1, WORK, LENRBUF, LRSRC, LCSRC ) BCDONE = .TRUE. END IF ELSEIF( MYCOL.EQ.LCSRC .AND. DIR.EQ.2 ) THEN IF( LENCBUF.GT.0 .AND. NPROW.GT.1 ) THEN CALL SGEBR2D( ICTXT, 'Col', '1-Tree', LENCBUF, $ 1, WORK(1+LENRBUF), LENCBUF, LRSRC, LCSRC ) BCDONE = .TRUE. END IF END IF 62 CONTINUE 60 CONTINUE 50 CONTINUE END IF * * Compute updates - make sure to skip windows that was skipped * regarding local bulge chasing. * DO 65 DIR = 1, 2 WINID = 0 IF( DIR.EQ.1 ) THEN IPNEXT = 1 ELSE IPNEXT = 1 + LENRBUF END IF DO 70 WIN = 1, ANMWIN IF( IWORK( 5+(WIN-1)*5 ).EQ.2 ) GO TO 75 LRSRC = IWORK( 3+(WIN-1)*5 ) LCSRC = IWORK( 4+(WIN-1)*5 ) LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = LKBOT - LKTOP + 1 IF( (MYROW.EQ.LRSRC.AND.LENRBUF.GT.0.AND.DIR.EQ.1) .OR. $ (MYCOL.EQ.LCSRC.AND.LENCBUF.GT.0.AND.DIR.EQ.2 ) ) $ THEN * * Set up workspaces. * IPU = IPNEXT IPNEXT = IPU + LNWIN*LNWIN IPW = 1 + LENRBUF + LENCBUF LIROFFH = MOD(LKTOP-1,NB) WINID = WINID + 1 * * Recompute JOB to see if block structure of U could * possibly be exploited or not. * IF( LKTOP.EQ.KTOP .AND. LKBOT.EQ.KBOT ) THEN JOB = 'All steps' ELSEIF( LKTOP.EQ.KTOP ) THEN JOB = 'Introduce and chase' ELSEIF( LKBOT.EQ.KBOT ) THEN JOB = 'Off-chase bulges' ELSE JOB = 'Chase bulges' END IF END IF * * Use U to update far-from-diagonal entries in H. * If required, use U to update Z as well. * IF( .NOT. BLK22 .OR. .NOT. LSAME(JOB,'C') $ .OR. LNS.LE.2 ) THEN * IF( DIR.EQ.2 .AND. LENCBUF.GT.0 .AND. $ MYCOL.EQ.LCSRC ) THEN IF( WANTT ) THEN DO 80 INDX = 1, LKTOP-LIROFFH-1, NB CALL INFOG2L( INDX, LKTOP, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, RSRC1, $ CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN LROWS = MIN( NB, LKTOP-INDX ) CALL SGEMM('No transpose', 'No transpose', $ LROWS, LNWIN, LNWIN, ONE, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK( IPU ), LNWIN, ZERO, $ WORK(IPW), $ LROWS ) CALL SLAMOV( 'All', LROWS, LNWIN, $ WORK(IPW), LROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF 80 CONTINUE END IF IF( WANTZ ) THEN DO 90 INDX = 1, N, NB CALL INFOG2L( INDX, LKTOP, DESCZ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, RSRC1, $ CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN LROWS = MIN(NB,N-INDX+1) CALL SGEMM( 'No transpose', $ 'No transpose', LROWS, LNWIN, LNWIN, $ ONE, Z((JLOC-1)*LLDZ+ILOC), LLDZ, $ WORK( IPU ), LNWIN, ZERO, $ WORK(IPW), LROWS ) CALL SLAMOV( 'All', LROWS, LNWIN, $ WORK(IPW), LROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF 90 CONTINUE END IF END IF * * Update the rows of H affected by the bulge-chase. * IF( DIR.EQ.1 .AND. LENRBUF.GT.0 .AND. $ MYROW.EQ.LRSRC ) THEN IF( WANTT ) THEN IF( ICEIL(LKBOT,NB).EQ.ICEIL(KBOT,NB) ) THEN LCOLS = MIN(ICEIL(KBOT,NB)*NB,N) - KBOT ELSE LCOLS = 0 END IF IF( LCOLS.GT.0 ) THEN INDX = KBOT + 1 CALL INFOG2L( LKTOP, INDX, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN CALL SGEMM( 'Transpose', 'No Transpose', $ LNWIN, LCOLS, LNWIN, ONE, WORK(IPU), $ LNWIN, H((JLOC-1)*LLDH+ILOC), LLDH, $ ZERO, WORK(IPW), LNWIN ) CALL SLAMOV( 'All', LNWIN, LCOLS, $ WORK(IPW), LNWIN, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 93 CONTINUE INDXS = ICEIL(LKBOT,NB)*NB + 1 DO 95 INDX = INDXS, N, NB CALL INFOG2L( LKTOP, INDX, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN LCOLS = MIN( NB, N-INDX+1 ) CALL SGEMM( 'Transpose', 'No Transpose', $ LNWIN, LCOLS, LNWIN, ONE, WORK(IPU), $ LNWIN, H((JLOC-1)*LLDH+ILOC), LLDH, $ ZERO, WORK(IPW), $ LNWIN ) CALL SLAMOV( 'All', LNWIN, LCOLS, $ WORK(IPW), LNWIN, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF 95 CONTINUE END IF END IF ELSE KS = LNWIN-LNS/2*3 * * The LNWIN-by-LNWIN matrix U containing the accumulated * orthogonal transformations has the following structure: * * [ U11 U12 ] * U = [ ], * [ U21 U22 ] * * where U21 is KS-by-KS upper triangular and U12 is * (LNWIN-KS)-by-(LNWIN-KS) lower triangular. * Here, KS = LNS. * * Update the columns of H and Z affected by the bulge * chasing. * * Compute H2*U21 + H1*U11 in workspace. * IF( DIR.EQ.2 .AND. LENCBUF.GT.0 .AND. $ MYCOL.EQ.LCSRC ) THEN IF( WANTT ) THEN DO 100 INDX = 1, LKTOP-LIROFFH-1, NB CALL INFOG2L( INDX, LKTOP, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, RSRC1, $ CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN JLOC1 = INDXG2L( LKTOP+LNWIN-KS, NB, $ MYCOL, DESCH( CSRC_ ), NPCOL ) LROWS = MIN( NB, LKTOP-INDX ) CALL SLAMOV( 'All', LROWS, KS, $ H((JLOC1-1)*LLDH+ILOC ), LLDH, $ WORK(IPW), LROWS ) CALL STRMM( 'Right', 'Upper', $ 'No transpose','Non-unit', LROWS, $ KS, ONE, WORK( IPU+LNWIN-KS ), LNWIN, $ WORK(IPW), LROWS ) CALL SGEMM('No transpose', 'No transpose', $ LROWS, KS, LNWIN-KS, ONE, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK( IPU ), LNWIN, ONE, WORK(IPW), $ LROWS ) * * Compute H1*U12 + H2*U22 in workspace. * CALL SLAMOV( 'All', LROWS, LNWIN-KS, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK( IPW+KS*LROWS ), LROWS ) CALL STRMM( 'Right', 'Lower', $ 'No transpose', 'Non-Unit', $ LROWS, LNWIN-KS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW+KS*LROWS ), LROWS ) CALL SGEMM('No transpose', 'No transpose', $ LROWS, LNWIN-KS, KS, ONE, $ H((JLOC1-1)*LLDH+ILOC), LLDH, $ WORK( IPU+LNWIN*KS+LNWIN-KS ), LNWIN, $ ONE, WORK( IPW+KS*LROWS ), LROWS ) * * Copy workspace to H. * CALL SLAMOV( 'All', LROWS, LNWIN, $ WORK(IPW), LROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF 100 CONTINUE END IF * IF( WANTZ ) THEN * * Compute Z2*U21 + Z1*U11 in workspace. * DO 110 INDX = 1, N, NB CALL INFOG2L( INDX, LKTOP, DESCZ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, RSRC1, $ CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN JLOC1 = INDXG2L( LKTOP+LNWIN-KS, NB, $ MYCOL, DESCZ( CSRC_ ), NPCOL ) LROWS = MIN(NB,N-INDX+1) CALL SLAMOV( 'All', LROWS, KS, $ Z((JLOC1-1)*LLDZ+ILOC ), LLDZ, $ WORK(IPW), LROWS ) CALL STRMM( 'Right', 'Upper', $ 'No transpose', 'Non-unit', $ LROWS, KS, ONE, WORK( IPU+LNWIN-KS ), $ LNWIN, WORK(IPW), LROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', LROWS, KS, LNWIN-KS, $ ONE, Z((JLOC-1)*LLDZ+ILOC), LLDZ, $ WORK( IPU ), LNWIN, ONE, WORK(IPW), $ LROWS ) * * Compute Z1*U12 + Z2*U22 in workspace. * CALL SLAMOV( 'All', LROWS, LNWIN-KS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ, $ WORK( IPW+KS*LROWS ), LROWS) CALL STRMM( 'Right', 'Lower', $ 'No transpose', 'Non-unit', $ LROWS, LNWIN-KS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW+KS*LROWS ), LROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', LROWS, LNWIN-KS, KS, $ ONE, Z((JLOC1-1)*LLDZ+ILOC), LLDZ, $ WORK( IPU+LNWIN*KS+LNWIN-KS ), LNWIN, $ ONE, WORK( IPW+KS*LROWS ), $ LROWS ) * * Copy workspace to Z. * CALL SLAMOV( 'All', LROWS, LNWIN, $ WORK(IPW), LROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF 110 CONTINUE END IF END IF * IF( DIR.EQ.1 .AND. LENRBUF.GT.0 .AND. $ MYROW.EQ.LRSRC ) THEN IF( WANTT ) THEN INDXS = ICEIL(LKBOT,NB)*NB + 1 DO 120 INDX = INDXS, N, NB CALL INFOG2L( LKTOP, INDX, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) THEN * * Compute U21**T*H2 + U11**T*H1 in workspace. * ILOC1 = INDXG2L( LKTOP+LNWIN-KS, NB, $ MYROW, DESCH( RSRC_ ), NPROW ) LCOLS = MIN( NB, N-INDX+1 ) CALL SLAMOV( 'All', KS, LCOLS, $ H((JLOC-1)*LLDH+ILOC1), LLDH, $ WORK(IPW), LNWIN ) CALL STRMM( 'Left', 'Upper', 'Transpose', $ 'Non-unit', KS, LCOLS, ONE, $ WORK( IPU+LNWIN-KS ), LNWIN, $ WORK(IPW), LNWIN ) CALL SGEMM( 'Transpose', 'No transpose', $ KS, LCOLS, LNWIN-KS, ONE, WORK(IPU), $ LNWIN, H((JLOC-1)*LLDH+ILOC), LLDH, $ ONE, WORK(IPW), LNWIN ) * * Compute U12**T*H1 + U22**T*H2 in workspace. * CALL SLAMOV( 'All', LNWIN-KS, LCOLS, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK( IPW+KS ), LNWIN ) CALL STRMM( 'Left', 'Lower', 'Transpose', $ 'Non-unit', LNWIN-KS, LCOLS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW+KS ), LNWIN ) CALL SGEMM( 'Transpose', 'No Transpose', $ LNWIN-KS, LCOLS, KS, ONE, $ WORK( IPU+LNWIN*KS+LNWIN-KS ), LNWIN, $ H((JLOC-1)*LLDH+ILOC1), LLDH, $ ONE, WORK( IPW+KS ), LNWIN ) * * Copy workspace to H. * CALL SLAMOV( 'All', LNWIN, LCOLS, $ WORK(IPW), LNWIN, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF 120 CONTINUE END IF END IF END IF * * Update position information about current window. * IF( DIR.EQ.2 ) THEN IF( LKBOT.EQ.KBOT ) THEN LKTOP = KBOT+1 LKBOT = KBOT+1 IWORK( 1+(WIN-1)*5 ) = LKTOP IWORK( 2+(WIN-1)*5 ) = LKBOT IWORK( 5+(WIN-1)*5 ) = 2 ELSE LKTOP = MIN( LKTOP + LNWIN - LCHAIN, $ ICEIL( LKTOP, NB )*NB - LCHAIN + 1, $ KBOT ) IWORK( 1+(WIN-1)*5 ) = LKTOP LKBOT = MIN( LKBOT + LNWIN - LCHAIN, $ ICEIL( LKBOT, NB )*NB, KBOT ) IWORK( 2+(WIN-1)*5 ) = LKBOT LNWIN = LKBOT-LKTOP+1 IF( LNWIN.EQ.LCHAIN ) IWORK(5+(WIN-1)*5) = 2 END IF END IF 75 CONTINUE 70 CONTINUE 65 CONTINUE * * If bulges were chasen off from first window, the window is * removed. * IF( ICHOFF.GT.0 ) THEN DO 128 WIN = 2, ANMWIN IWORK( 1+(WIN-2)*5 ) = IWORK( 1+(WIN-1)*5 ) IWORK( 2+(WIN-2)*5 ) = IWORK( 2+(WIN-1)*5 ) IWORK( 3+(WIN-2)*5 ) = IWORK( 3+(WIN-1)*5 ) IWORK( 4+(WIN-2)*5 ) = IWORK( 4+(WIN-1)*5 ) IWORK( 5+(WIN-2)*5 ) = IWORK( 5+(WIN-1)*5 ) 128 CONTINUE ANMWIN = ANMWIN - 1 IPIW = 6+(ANMWIN-1)*5 END IF * * If we have no more windows, return. * IF( ANMWIN.LT.1 ) RETURN * ELSE * * Set up windows such that as many bulges as possible can be * moved over the border to the next block. Make sure that the * cross border window is at least (NTINY+1)-by-(NTINY+1), unless * we are chasing off the bulges from the last window. This is * accomplished by setting the bottom index LKBOT such that the * local window has the correct size. * * If LKBOT then becomes larger than KBOT, the endpoint of the whole * global submatrix, or LKTOP from a window located already residing * at the other side of the border, this is taken care of by some * dirty tricks. * DO 130 WIN = 1, ANMWIN LKTOP1 = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = MAX( 6, MIN( LKBOT - LKTOP1 + 1, LCHAIN ) ) LKBOT1 = MAX( MIN( KBOT, ICEIL(LKTOP1,NB)*NB+LCHAIN), $ MIN( KBOT, MIN( LKTOP1+2*LNWIN-1, $ (ICEIL(LKTOP1,NB)+1)*NB ) ) ) IWORK( 2+(WIN-1)*5 ) = LKBOT1 130 CONTINUE ICHOFF = 0 * * Keep a record over what windows that were moved over the borders * such that we can delay some windows due to lack of space on the * other side of the border; we do not want to leave any of the * bulges behind... * * IWORK( 5+(WIN-1)*5 ) = 0: window WIN has not been processed * IWORK( 5+(WIN-1)*5 ) = 1: window WIN is being processed (need to * know for updates) * IWORK( 5+(WIN-1)*5 ) = 2: window WIN has been fully processed * * So, start by marking all windows as not processed. * DO 135 WIN = 1, ANMWIN IWORK( 5+(WIN-1)*5 ) = 0 135 CONTINUE * * Do the cross border bulge-chase as follows: Start from the * first window (the one that is closest to be chased off the * diagonal of H) and take the odd windows first followed by the * even ones. To not get into hang-problems on processor meshes * with at least one odd dimension, the windows will in such a case * be processed in chunks of {the minimum odd process dimension}-1 * windows to avoid overlapping processor scopes in forming the * cross border computational windows and the cross border update * regions. * WCHUNK = MAX( 1, MIN( ANMWIN, NPROW-1, NPCOL-1 ) ) NUMCHUNK = ICEIL( ANMWIN, WCHUNK ) * * Based on the computed chunk of windows, start working with * crossborder bulge-chasing. Repeat this as long as there is * still work left to do (137 is a kind of do-while statement). * 137 CONTINUE * * Zero out LENRBUF and LENCBUF each time we restart this loop. * LENRBUF = 0 LENCBUF = 0 * DO 140 ODDEVEN = 1, MIN( 2, ANMWIN ) DO 150 CHUNKNUM = 1, NUMCHUNK IPNEXT = 1 DO 160 WIN = ODDEVEN+(CHUNKNUM-1)*WCHUNK, $ MIN(ANMWIN,MAX(1,ODDEVEN+(CHUNKNUM)*WCHUNK-1)), 2 * * Get position and size of the WIN:th active window and * make sure that we skip the cross border bulge for this * window if the window is not shared between several data * layout blocks (and processors). * * Also, delay windows that do not have sufficient size of * the other side of the border. Moreover, make sure to skip * windows that was already processed in the last round of * the do-while loop (137). * IF( IWORK( 5+(WIN-1)*5 ).EQ.2 ) GO TO 165 LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) IF( WIN.GT.1 ) THEN LKTOP2 = IWORK( 1+(WIN-2)*5 ) ELSE LKTOP2 = KBOT+1 END IF IF( ICEIL(LKTOP,NB).EQ.ICEIL(LKBOT,NB) .OR. $ LKBOT.GE.LKTOP2 ) GO TO 165 LNWIN = LKBOT - LKTOP + 1 IF( LNWIN.LE.NTINY .AND. LKBOT.NE.KBOT .AND. $ .NOT. MOD(LKBOT,NB).EQ.0 ) GO TO 165 * * If window is going to be processed, mark it as processed. * IWORK( 5+(WIN-1)*5 ) = 1 * * Extract processors for current cross border window, * as below: * * 1 | 2 * --+-- * 3 | 4 * RSRC1 = IWORK( 3+(WIN-1)*5 ) CSRC1 = IWORK( 4+(WIN-1)*5 ) RSRC2 = RSRC1 CSRC2 = MOD( CSRC1+1, NPCOL ) RSRC3 = MOD( RSRC1+1, NPROW ) CSRC3 = CSRC1 RSRC4 = MOD( RSRC1+1, NPROW ) CSRC4 = MOD( CSRC1+1, NPCOL ) * * Form group of four processors for cross border window. * IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) .OR. $ ( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN * * Compute the upper and lower parts of the active * window. * DIM1 = NB - MOD(LKTOP-1,NB) DIM4 = LNWIN - DIM1 * * Temporarily compute a new value of the size of the * computational window that is larger than or equal to * NTINY+1; call the *real* value DIM. * DIM = LNWIN LNWIN = MAX(NTINY+1,LNWIN) * * Divide workspace. * IPU = IPNEXT IPH = IPU + DIM**2 IPUU = IPH + LNWIN**2 IPV = IPUU + LNWIN**2 IPNEXT = IPH IF( DIM.LT.LNWIN ) THEN CALL SLASET( 'All', LNWIN, LNWIN, ZERO, $ ONE, WORK( IPH ), LNWIN ) ELSE CALL SLASET( 'All', DIM, DIM, ZERO, $ ZERO, WORK( IPH ), LNWIN ) END IF * * Form the active window. * IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN ILOC = INDXG2L( LKTOP, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL SLAMOV( 'All', DIM1, DIM1, $ H((JLOC-1)*LLDH+ILOC), LLDH, WORK(IPH), $ LNWIN ) IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) THEN * Proc#1 <==> Proc#4 CALL SGESD2D( ICTXT, DIM1, DIM1, $ WORK(IPH), LNWIN, RSRC4, CSRC4 ) CALL SGERV2D( ICTXT, DIM4, DIM4, $ WORK(IPH+DIM1*LNWIN+DIM1), $ LNWIN, RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN ILOC = INDXG2L( LKTOP+DIM1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL SLAMOV( 'All', DIM4, DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK(IPH+DIM1*LNWIN+DIM1), $ LNWIN ) IF( RSRC4.NE.RSRC1 .OR. CSRC4.NE.CSRC1 ) THEN * Proc#4 <==> Proc#1 CALL SGESD2D( ICTXT, DIM4, DIM4, $ WORK(IPH+DIM1*LNWIN+DIM1), $ LNWIN, RSRC1, CSRC1 ) CALL SGERV2D( ICTXT, DIM1, DIM1, $ WORK(IPH), LNWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN ILOC = INDXG2L( LKTOP, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL SLAMOV( 'All', DIM1, DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK(IPH+DIM1*LNWIN), LNWIN ) IF( RSRC2.NE.RSRC1 .OR. CSRC2.NE.CSRC1 ) THEN * Proc#2 ==> Proc#1 CALL SGESD2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN IF( RSRC2.NE.RSRC4 .OR. CSRC2.NE.CSRC4 ) THEN * Proc#2 ==> Proc#4 CALL SGESD2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN ILOC = INDXG2L( LKTOP+DIM1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1-1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL SLAMOV( 'All', 1, 1, $ H((JLOC-1)*LLDH+ILOC), LLDH, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN ) IF( RSRC3.NE.RSRC1 .OR. CSRC3.NE.CSRC1 ) THEN * Proc#3 ==> Proc#1 CALL SGESD2D( ICTXT, 1, 1, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN IF( RSRC3.NE.RSRC4 .OR. CSRC3.NE.CSRC4 ) THEN * Proc#3 ==> Proc#4 CALL SGESD2D( ICTXT, 1, 1, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN, RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC1.NE.RSRC2 .OR. CSRC1.NE.CSRC2 ) THEN * Proc#1 <== Proc#2 CALL SGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC2, CSRC2 ) END IF IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) THEN * Proc#1 <== Proc#3 CALL SGERV2D( ICTXT, 1, 1, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN, RSRC3, CSRC3 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) THEN * Proc#4 <== Proc#2 CALL SGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC2, CSRC2 ) END IF IF( RSRC4.NE.RSRC3 .OR. CSRC4.NE.CSRC3 ) THEN * Proc#4 <== Proc#3 CALL SGERV2D( ICTXT, 1, 1, $ WORK(IPH+(DIM1-1)*LNWIN+DIM1), $ LNWIN, RSRC3, CSRC3 ) END IF END IF * * Prepare for call to SLAQR6 - it could happen that no * bulges where introduced in the pre-cross border step * since the chain was too long to fit in the top-left * part of the cross border window. In such a case, the * bulges are introduced here instead. It could also * happen that the bottom-right part is too small to hold * the whole chain -- in such a case, the bulges are * chasen off immediately, as well. * IF( (MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1) .OR. $ (MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4) ) THEN IF( LKTOP.EQ.KTOP .AND. LKBOT.EQ.KBOT .AND. $ (DIM1.LE.LCHAIN .OR. DIM1.LE.NTINY ) ) THEN JOB = 'All steps' ICHOFF = 1 ELSEIF( LKTOP.EQ.KTOP .AND. $ ( DIM1.LE.LCHAIN .OR. DIM1.LE.NTINY ) ) THEN JOB = 'Introduce and chase' ELSEIF( LKBOT.EQ.KBOT ) THEN JOB = 'Off-chase bulges' ICHOFF = 1 ELSE JOB = 'Chase bulges' END IF KU = LNWIN - KDU + 1 KWH = KDU + 1 NHO = ( LNWIN-KDU+1-4 ) - ( KDU+1 ) + 1 KWV = KDU + 4 NVE = LNWIN - KDU - KWV + 1 CALL SLASET( 'All', LNWIN, LNWIN, $ ZERO, ONE, WORK(IPUU), LNWIN ) * * Small-bulge multi-shift QR sweep. * LKS = MAX(1, NS - WIN*LNS + 1) CALL SLAQR6( JOB, WANTT, .TRUE., LKACC22, LNWIN, $ 1, DIM, LNS, SR( LKS ), SI( LKS ), $ WORK(IPH), LNWIN, 1, DIM, $ WORK(IPUU), LNWIN, WORK(IPU), 3, $ WORK( IPH+KU-1 ), LNWIN, NVE, $ WORK( IPH+KWV-1 ), LNWIN, NHO, $ WORK( IPH-1+KU+(KWH-1)*LNWIN ), LNWIN ) * * Copy local submatrices of H back to global matrix. * IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN ILOC = INDXG2L( LKTOP, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL SLAMOV( 'All', DIM1, DIM1, WORK(IPH), $ LNWIN, H((JLOC-1)*LLDH+ILOC), $ LLDH ) END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN ILOC = INDXG2L( LKTOP+DIM1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) CALL SLAMOV( 'All', DIM4, DIM4, $ WORK(IPH+DIM1*LNWIN+DIM1), $ LNWIN, H((JLOC-1)*LLDH+ILOC), LLDH ) END IF * * Copy actual submatrix of U to the correct place of * the buffer. * CALL SLAMOV( 'All', DIM, DIM, $ WORK(IPUU), LNWIN, WORK(IPU), DIM ) END IF * * Return data to process 2 and 3. * RWS3 = MIN(3,DIM4) CLS3 = MIN(3,DIM1) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) THEN * Proc#1 ==> Proc#3 CALL SGESD2D( ICTXT, RWS3, CLS3, $ WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1 ), $ LNWIN, RSRC3, CSRC3 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) THEN * Proc#4 ==> Proc#2 CALL SGESD2D( ICTXT, DIM1, DIM4, $ WORK( IPH+DIM1*LNWIN), $ LNWIN, RSRC2, CSRC2 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN ILOC = INDXG2L( LKTOP, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) IF( RSRC2.NE.RSRC4 .OR. CSRC2.NE.CSRC4 ) THEN * Proc#2 <== Proc#4 CALL SGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPH+DIM1*LNWIN), $ LNWIN, RSRC4, CSRC4 ) END IF CALL SLAMOV( 'All', DIM1, DIM4, $ WORK( IPH+DIM1*LNWIN ), LNWIN, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN ILOC = INDXG2L( LKTOP+DIM1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) JLOC = INDXG2L( LKTOP+DIM1-CLS3, NB, MYCOL, $ DESCH( CSRC_ ), NPCOL ) IF( RSRC3.NE.RSRC1 .OR. CSRC3.NE.CSRC1 ) THEN * Proc#3 <== Proc#1 CALL SGERV2D( ICTXT, RWS3, CLS3, $ WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1 ), $ LNWIN, RSRC1, CSRC1 ) END IF CALL SLAMOV( 'Upper', RWS3, CLS3, $ WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1 ), $ LNWIN, H((JLOC-1)*LLDH+ILOC), $ LLDH ) IF( RWS3.GT.1 .AND. CLS3.GT.1 ) THEN ELEM = WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1+1 ) IF( ELEM.NE.ZERO ) THEN CALL SLAMOV( 'Lower', RWS3-1, CLS3-1, $ WORK( IPH+(DIM1-CLS3)*LNWIN+DIM1+1 ), $ LNWIN, H((JLOC-1)*LLDH+ILOC+1), LLDH ) END IF END IF END IF * * Restore correct value of LNWIN. * LNWIN = DIM * END IF * * Increment counter for buffers of orthogonal * transformations. * IF( MYROW.EQ.RSRC1 .OR. MYCOL.EQ.CSRC1 .OR. $ MYROW.EQ.RSRC4 .OR. MYCOL.EQ.CSRC4 ) THEN IF( MYROW.EQ.RSRC1 .OR. MYROW.EQ.RSRC4 ) $ LENRBUF = LENRBUF + LNWIN*LNWIN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) $ LENCBUF = LENCBUF + LNWIN*LNWIN END IF * * If no cross border bulge chasing was performed for the * current WIN:th window, the processor jump to this point * and consider the next one. * 165 CONTINUE * 160 CONTINUE * * Broadcast orthogonal transformations -- this will only happen * if the buffer associated with the orthogonal transformations * is not empty (controlled by LENRBUF, for row-wise * broadcasts, and LENCBUF, for column-wise broadcasts). * DO 170 DIR = 1, 2 BCDONE = .FALSE. DO 180 WIN = ODDEVEN+(CHUNKNUM-1)*WCHUNK, $ MIN(ANMWIN,MAX(1,ODDEVEN+(CHUNKNUM)*WCHUNK-1)), 2 IF( ( LENRBUF.EQ.0 .AND. LENCBUF.EQ.0 ) .OR. $ BCDONE ) GO TO 185 RSRC1 = IWORK( 3+(WIN-1)*5 ) CSRC1 = IWORK( 4+(WIN-1)*5 ) RSRC4 = MOD( RSRC1+1, NPROW ) CSRC4 = MOD( CSRC1+1, NPCOL ) IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN IF( DIR.EQ.1 .AND. LENRBUF.GT.0 .AND. $ NPCOL.GT.1 .AND. NPROCS.GT.2 ) THEN IF( MYROW.EQ.RSRC1 .OR. ( MYROW.EQ.RSRC4 $ .AND. RSRC4.NE.RSRC1 ) ) THEN CALL SGEBS2D( ICTXT, 'Row', '1-Tree', $ LENRBUF, 1, WORK, LENRBUF ) ELSE CALL SGEBR2D( ICTXT, 'Row', '1-Tree', $ LENRBUF, 1, WORK, LENRBUF, RSRC1, $ CSRC1 ) END IF ELSEIF( DIR.EQ.2 .AND. LENCBUF.GT.0 .AND. $ NPROW.GT.1 .AND. NPROCS.GT.2 ) THEN IF( MYCOL.EQ.CSRC1 .OR. ( MYCOL.EQ.CSRC4 $ .AND. CSRC4.NE.CSRC1 ) ) THEN CALL SGEBS2D( ICTXT, 'Col', '1-Tree', $ LENCBUF, 1, WORK, LENCBUF ) ELSE CALL SGEBR2D( ICTXT, 'Col', '1-Tree', $ LENCBUF, 1, WORK(1+LENRBUF), LENCBUF, $ RSRC1, CSRC1 ) END IF END IF IF( LENRBUF.GT.0 .AND. ( MYCOL.EQ.CSRC1 .OR. $ ( MYCOL.EQ.CSRC4 .AND. CSRC4.NE.CSRC1 ) ) ) $ CALL SLAMOV( 'All', LENRBUF, 1, WORK, LENRBUF, $ WORK(1+LENRBUF), LENCBUF ) BCDONE = .TRUE. ELSEIF( MYROW.EQ.RSRC1 .AND. DIR.EQ.1 ) THEN IF( LENRBUF.GT.0 .AND. NPCOL.GT.1 ) $ CALL SGEBR2D( ICTXT, 'Row', '1-Tree', LENRBUF, $ 1, WORK, LENRBUF, RSRC1, CSRC1 ) BCDONE = .TRUE. ELSEIF( MYCOL.EQ.CSRC1 .AND. DIR.EQ.2 ) THEN IF( LENCBUF.GT.0 .AND. NPROW.GT.1 ) $ CALL SGEBR2D( ICTXT, 'Col', '1-Tree', LENCBUF, $ 1, WORK(1+LENRBUF), LENCBUF, RSRC1, CSRC1 ) BCDONE = .TRUE. ELSEIF( MYROW.EQ.RSRC4 .AND. DIR.EQ.1 ) THEN IF( LENRBUF.GT.0 .AND. NPCOL.GT.1 ) $ CALL SGEBR2D( ICTXT, 'Row', '1-Tree', LENRBUF, $ 1, WORK, LENRBUF, RSRC4, CSRC4 ) BCDONE = .TRUE. ELSEIF( MYCOL.EQ.CSRC4 .AND. DIR.EQ.2 ) THEN IF( LENCBUF.GT.0 .AND. NPROW.GT.1 ) $ CALL SGEBR2D( ICTXT, 'Col', '1-Tree', LENCBUF, $ 1, WORK(1+LENRBUF), LENCBUF, RSRC4, CSRC4 ) BCDONE = .TRUE. END IF 185 CONTINUE 180 CONTINUE 170 CONTINUE * * Prepare for computing cross border updates by exchanging * data in cross border update regions in H and Z. * DO 190 DIR = 1, 2 WINID = 0 IPW3 = 1 DO 200 WIN = ODDEVEN+(CHUNKNUM-1)*WCHUNK, $ MIN(ANMWIN,MAX(1,ODDEVEN+(CHUNKNUM)*WCHUNK-1)), 2 IF( IWORK( 5+(WIN-1)*5 ).NE.1 ) GO TO 205 * * Make sure this part of the code is only executed when * there has been some work performed on the WIN:th * window. * LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) * * Extract processor indices associated with * the current window. * RSRC1 = IWORK( 3+(WIN-1)*5 ) CSRC1 = IWORK( 4+(WIN-1)*5 ) RSRC4 = MOD( RSRC1+1, NPROW ) CSRC4 = MOD( CSRC1+1, NPCOL ) * * Compute local number of rows and columns * of H and Z to exchange. * IF(((MYCOL.EQ.CSRC1.OR.MYCOL.EQ.CSRC4).AND.DIR.EQ.2) $ .OR.((MYROW.EQ.RSRC1.OR.MYROW.EQ.RSRC4).AND. $ DIR.EQ.1)) THEN WINID = WINID + 1 LNWIN = LKBOT - LKTOP + 1 IPU = IPNEXT DIM1 = NB - MOD(LKTOP-1,NB) DIM4 = LNWIN - DIM1 IPNEXT = IPU + LNWIN*LNWIN IF( DIR.EQ.2 ) THEN IF( WANTZ ) THEN ZROWS = NUMROC( N, NB, MYROW, DESCZ( RSRC_ ), $ NPROW ) ELSE ZROWS = 0 END IF IF( WANTT ) THEN HROWS = NUMROC( LKTOP-1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) ELSE HROWS = 0 END IF ELSE ZROWS = 0 HROWS = 0 END IF IF( DIR.EQ.1 ) THEN IF( WANTT ) THEN HCOLS = NUMROC( N - (LKTOP+DIM1-1), NB, $ MYCOL, CSRC4, NPCOL ) IF( MYCOL.EQ.CSRC4 ) HCOLS = HCOLS - DIM4 ELSE HCOLS = 0 END IF ELSE HCOLS = 0 END IF IPW = MAX( 1 + LENRBUF + LENCBUF, IPW3 ) IPW1 = IPW + HROWS * LNWIN IF( WANTZ ) THEN IPW2 = IPW1 + LNWIN * HCOLS IPW3 = IPW2 + ZROWS * LNWIN ELSE IPW3 = IPW1 + LNWIN * HCOLS END IF END IF * * Let each process row and column involved in the updates * exchange data in H and Z with their neighbours. * IF( DIR.EQ.2 .AND. WANTT .AND. LENCBUF.GT.0 ) THEN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) THEN DO 210 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, LKTOP, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC1, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', HROWS, DIM1, $ H((JLOC1-1)*LLDH+ILOC), LLDH, $ WORK(IPW), HROWS ) IF( NPCOL.GT.1 ) THEN EAST = MOD( MYCOL + 1, NPCOL ) CALL SGESD2D( ICTXT, HROWS, DIM1, $ WORK(IPW), HROWS, RSRC, EAST ) CALL SGERV2D( ICTXT, HROWS, DIM4, $ WORK(IPW+HROWS*DIM1), HROWS, $ RSRC, EAST ) END IF END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, LKTOP+DIM1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC4, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', HROWS, DIM4, $ H((JLOC4-1)*LLDH+ILOC), LLDH, $ WORK(IPW+HROWS*DIM1), HROWS ) IF( NPCOL.GT.1 ) THEN WEST = MOD( MYCOL - 1 + NPCOL, $ NPCOL ) CALL SGESD2D( ICTXT, HROWS, DIM4, $ WORK(IPW+HROWS*DIM1), HROWS, $ RSRC, WEST ) CALL SGERV2D( ICTXT, HROWS, DIM1, $ WORK(IPW), HROWS, RSRC, WEST ) END IF END IF END IF 210 CONTINUE END IF END IF * IF( DIR.EQ.1 .AND. WANTT .AND. LENRBUF.GT.0 ) THEN IF( MYROW.EQ.RSRC1 .OR. MYROW.EQ.RSRC4 ) THEN DO 220 INDX = 1, NPCOL IF( MYROW.EQ.RSRC1 ) THEN IF( INDX.EQ.1 ) THEN IF( LKBOT.LT.N ) THEN CALL INFOG2L( LKTOP, LKBOT+1, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC, RSRC1, CSRC ) ELSE CSRC = -1 END IF ELSEIF( MOD(LKBOT,NB).NE.0 ) THEN CALL INFOG2L( LKTOP, $ (ICEIL(LKBOT,NB)+(INDX-2))*NB+1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC, RSRC1, CSRC ) ELSE CALL INFOG2L( LKTOP, $ (ICEIL(LKBOT,NB)+(INDX-1))*NB+1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC, RSRC1, CSRC ) END IF IF( MYCOL.EQ.CSRC ) THEN CALL SLAMOV( 'All', DIM1, HCOLS, $ H((JLOC-1)*LLDH+ILOC1), LLDH, $ WORK(IPW1), LNWIN ) IF( NPROW.GT.1 ) THEN SOUTH = MOD( MYROW + 1, NPROW ) CALL SGESD2D( ICTXT, DIM1, HCOLS, $ WORK(IPW1), LNWIN, SOUTH, $ CSRC ) CALL SGERV2D( ICTXT, DIM4, HCOLS, $ WORK(IPW1+DIM1), LNWIN, SOUTH, $ CSRC ) END IF END IF END IF IF( MYROW.EQ.RSRC4 ) THEN IF( INDX.EQ.1 ) THEN IF( LKBOT.LT.N ) THEN CALL INFOG2L( LKTOP+DIM1, LKBOT+1, $ DESCH, NPROW, NPCOL, MYROW, $ MYCOL, ILOC4, JLOC, RSRC4, $ CSRC ) ELSE CSRC = -1 END IF ELSEIF( MOD(LKBOT,NB).NE.0 ) THEN CALL INFOG2L( LKTOP+DIM1, $ (ICEIL(LKBOT,NB)+(INDX-2))*NB+1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC, RSRC4, CSRC ) ELSE CALL INFOG2L( LKTOP+DIM1, $ (ICEIL(LKBOT,NB)+(INDX-1))*NB+1, $ DESCH, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC, RSRC4, CSRC ) END IF IF( MYCOL.EQ.CSRC ) THEN CALL SLAMOV( 'All', DIM4, HCOLS, $ H((JLOC-1)*LLDH+ILOC4), LLDH, $ WORK(IPW1+DIM1), LNWIN ) IF( NPROW.GT.1 ) THEN NORTH = MOD( MYROW - 1 + NPROW, $ NPROW ) CALL SGESD2D( ICTXT, DIM4, HCOLS, $ WORK(IPW1+DIM1), LNWIN, NORTH, $ CSRC ) CALL SGERV2D( ICTXT, DIM1, HCOLS, $ WORK(IPW1), LNWIN, NORTH, $ CSRC ) END IF END IF END IF 220 CONTINUE END IF END IF * IF( DIR.EQ.2 .AND. WANTZ .AND. LENCBUF.GT.0) THEN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) THEN DO 230 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, LKTOP, $ DESCZ, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC1, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', ZROWS, DIM1, $ Z((JLOC1-1)*LLDZ+ILOC), LLDZ, $ WORK(IPW2), ZROWS ) IF( NPCOL.GT.1 ) THEN EAST = MOD( MYCOL + 1, NPCOL ) CALL SGESD2D( ICTXT, ZROWS, DIM1, $ WORK(IPW2), ZROWS, RSRC, $ EAST ) CALL SGERV2D( ICTXT, ZROWS, DIM4, $ WORK(IPW2+ZROWS*DIM1), $ ZROWS, RSRC, EAST ) END IF END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, $ LKTOP+DIM1, DESCZ, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC4, RSRC, $ CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', ZROWS, DIM4, $ Z((JLOC4-1)*LLDZ+ILOC), LLDZ, $ WORK(IPW2+ZROWS*DIM1), ZROWS ) IF( NPCOL.GT.1 ) THEN WEST = MOD( MYCOL - 1 + NPCOL, $ NPCOL ) CALL SGESD2D( ICTXT, ZROWS, DIM4, $ WORK(IPW2+ZROWS*DIM1), $ ZROWS, RSRC, WEST ) CALL SGERV2D( ICTXT, ZROWS, DIM1, $ WORK(IPW2), ZROWS, RSRC, $ WEST ) END IF END IF END IF 230 CONTINUE END IF END IF * * If no exchanges was performed for the current window, * all processors jump to this point and try the next * one. * 205 CONTINUE * 200 CONTINUE * * Compute crossborder bulge-chase updates. * WINID = 0 IF( DIR.EQ.1 ) THEN IPNEXT = 1 ELSE IPNEXT = 1 + LENRBUF END IF IPW3 = 1 DO 240 WIN = ODDEVEN+(CHUNKNUM-1)*WCHUNK, $ MIN(ANMWIN,MAX(1,ODDEVEN+(CHUNKNUM)*WCHUNK-1)), 2 IF( IWORK( 5+(WIN-1)*5 ).NE.1 ) GO TO 245 * * Only perform this part of the code if there was really * some work performed on the WIN:th window. * LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = LKBOT - LKTOP + 1 * * Extract the processor indices associated with * the current window. * RSRC1 = IWORK( 3+(WIN-1)*5 ) CSRC1 = IWORK( 4+(WIN-1)*5 ) RSRC4 = MOD( RSRC1+1, NPROW ) CSRC4 = MOD( CSRC1+1, NPCOL ) * IF(((MYCOL.EQ.CSRC1.OR.MYCOL.EQ.CSRC4).AND.DIR.EQ.2) $ .OR.((MYROW.EQ.RSRC1.OR.MYROW.EQ.RSRC4).AND. $ DIR.EQ.1)) THEN * * Set up workspaces. * WINID = WINID + 1 LKTOP = IWORK( 1+(WIN-1)*5 ) LKBOT = IWORK( 2+(WIN-1)*5 ) LNWIN = LKBOT - LKTOP + 1 DIM1 = NB - MOD(LKTOP-1,NB) DIM4 = LNWIN - DIM1 IPU = IPNEXT + (WINID-1)*LNWIN*LNWIN IF( DIR.EQ.2 ) THEN IF( WANTZ ) THEN ZROWS = NUMROC( N, NB, MYROW, DESCZ( RSRC_ ), $ NPROW ) ELSE ZROWS = 0 END IF IF( WANTT ) THEN HROWS = NUMROC( LKTOP-1, NB, MYROW, $ DESCH( RSRC_ ), NPROW ) ELSE HROWS = 0 END IF ELSE ZROWS = 0 HROWS = 0 END IF IF( DIR.EQ.1 ) THEN IF( WANTT ) THEN HCOLS = NUMROC( N - (LKTOP+DIM1-1), NB, $ MYCOL, CSRC4, NPCOL ) IF( MYCOL.EQ.CSRC4 ) HCOLS = HCOLS - DIM4 ELSE HCOLS = 0 END IF ELSE HCOLS = 0 END IF * * IPW = local copy of overlapping column block of H * IPW1 = local copy of overlapping row block of H * IPW2 = local copy of overlapping column block of Z * IPW3 = workspace for right hand side of matrix * multiplication * IPW = MAX( 1 + LENRBUF + LENCBUF, IPW3 ) IPW1 = IPW + HROWS * LNWIN IF( WANTZ ) THEN IPW2 = IPW1 + LNWIN * HCOLS IPW3 = IPW2 + ZROWS * LNWIN ELSE IPW3 = IPW1 + LNWIN * HCOLS END IF * * Recompute job to see if special structure of U * could possibly be exploited. * IF( LKTOP.EQ.KTOP .AND. LKBOT.EQ.KBOT ) THEN JOB = 'All steps' ELSEIF( LKTOP.EQ.KTOP .AND. $ ( DIM1.LT.LCHAIN+1 .OR. DIM1.LE.NTINY ) ) $ THEN JOB = 'Introduce and chase' ELSEIF( LKBOT.EQ.KBOT ) THEN JOB = 'Off-chase bulges' ELSE JOB = 'Chase bulges' END IF END IF * * Test if to exploit sparsity structure of * orthogonal matrix U. * KS = DIM1+DIM4-LNS/2*3 IF( .NOT. BLK22 .OR. DIM1.NE.KS .OR. $ DIM4.NE.KS .OR. LSAME(JOB,'I') .OR. $ LSAME(JOB,'O') .OR. LNS.LE.2 ) THEN * * Update the columns of H and Z. * IF( DIR.EQ.2 .AND. WANTT .AND. LENCBUF.GT.0 ) THEN DO 250 INDX = 1, MIN(LKTOP-1,1+(NPROW-1)*NB), NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, LKTOP, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SGEMM( 'No transpose', $ 'No transpose', HROWS, DIM1, $ LNWIN, ONE, WORK( IPW ), HROWS, $ WORK( IPU ), LNWIN, ZERO, $ WORK(IPW3), HROWS ) CALL SLAMOV( 'All', HROWS, DIM1, $ WORK(IPW3), HROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, LKTOP+DIM1, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SGEMM( 'No transpose', $ 'No transpose', HROWS, DIM4, $ LNWIN, ONE, WORK( IPW ), HROWS, $ WORK( IPU+LNWIN*DIM1 ), LNWIN, $ ZERO, WORK(IPW3), HROWS ) CALL SLAMOV( 'All', HROWS, DIM4, $ WORK(IPW3), HROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 250 CONTINUE END IF * IF( DIR.EQ.2 .AND. WANTZ .AND. LENCBUF.GT.0 ) THEN DO 260 INDX = 1, MIN(N,1+(NPROW-1)*NB), NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, LKTOP, DESCZ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SGEMM( 'No transpose', $ 'No transpose', ZROWS, DIM1, $ LNWIN, ONE, WORK( IPW2 ), $ ZROWS, WORK( IPU ), LNWIN, $ ZERO, WORK(IPW3), ZROWS ) CALL SLAMOV( 'All', ZROWS, DIM1, $ WORK(IPW3), ZROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, LKTOP+DIM1, DESCZ, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SGEMM( 'No transpose', $ 'No transpose', ZROWS, DIM4, $ LNWIN, ONE, WORK( IPW2 ), $ ZROWS, $ WORK( IPU+LNWIN*DIM1 ), LNWIN, $ ZERO, WORK(IPW3), ZROWS ) CALL SLAMOV( 'All', ZROWS, DIM4, $ WORK(IPW3), ZROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF END IF 260 CONTINUE END IF * * Update the rows of H. * IF( DIR.EQ.1 .AND. WANTT .AND. LENRBUF.GT.0 ) THEN IF( LKBOT.LT.N ) THEN IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC4 .AND. $ MOD(LKBOT,NB).NE.0 ) THEN INDX = LKBOT + 1 CALL INFOG2L( LKTOP, INDX, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC4 ) CALL SGEMM( 'Transpose', 'No Transpose', $ DIM1, HCOLS, LNWIN, ONE, WORK(IPU), $ LNWIN, WORK( IPW1 ), LNWIN, ZERO, $ WORK(IPW3), DIM1 ) CALL SLAMOV( 'All', DIM1, HCOLS, $ WORK(IPW3), DIM1, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF IF( MYROW.EQ.RSRC4.AND.MYCOL.EQ.CSRC4 .AND. $ MOD(LKBOT,NB).NE.0 ) THEN INDX = LKBOT + 1 CALL INFOG2L( LKTOP+DIM1, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC4, CSRC4 ) CALL SGEMM( 'Transpose', 'No Transpose', $ DIM4, HCOLS, LNWIN, ONE, $ WORK( IPU+DIM1*LNWIN ), LNWIN, $ WORK( IPW1), LNWIN, ZERO, $ WORK(IPW3), DIM4 ) CALL SLAMOV( 'All', DIM4, HCOLS, $ WORK(IPW3), DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF INDXS = ICEIL(LKBOT,NB)*NB + 1 IF( MOD(LKBOT,NB).NE.0 ) THEN INDXE = MIN(N,INDXS+(NPCOL-2)*NB) ELSE INDXE = MIN(N,INDXS+(NPCOL-1)*NB) END IF DO 270 INDX = INDXS, INDXE, NB IF( MYROW.EQ.RSRC1 ) THEN CALL INFOG2L( LKTOP, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL SGEMM( 'Transpose', $ 'No Transpose', DIM1, HCOLS, $ LNWIN, ONE, WORK( IPU ), LNWIN, $ WORK( IPW1 ), LNWIN, ZERO, $ WORK(IPW3), DIM1 ) CALL SLAMOV( 'All', DIM1, HCOLS, $ WORK(IPW3), DIM1, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF IF( MYROW.EQ.RSRC4 ) THEN CALL INFOG2L( LKTOP+DIM1, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC4, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL SGEMM( 'Transpose', $ 'No Transpose', DIM4, HCOLS, $ LNWIN, ONE, $ WORK( IPU+LNWIN*DIM1 ), LNWIN, $ WORK( IPW1 ), LNWIN, $ ZERO, WORK(IPW3), DIM4 ) CALL SLAMOV( 'All', DIM4, HCOLS, $ WORK(IPW3), DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 270 CONTINUE END IF END IF ELSE * * Update the columns of H and Z. * * Compute H2*U21 + H1*U11 on the left side of the border. * IF( DIR.EQ.2 .AND. WANTT .AND. LENCBUF.GT.0 ) THEN INDXE = MIN(LKTOP-1,1+(NPROW-1)*NB) DO 280 INDX = 1, INDXE, NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, LKTOP, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', HROWS, KS, $ WORK( IPW+HROWS*DIM4), HROWS, $ WORK(IPW3), HROWS ) CALL STRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', HROWS, KS, ONE, $ WORK( IPU+DIM4 ), LNWIN, $ WORK(IPW3), HROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', HROWS, KS, DIM4, $ ONE, WORK( IPW ), HROWS, $ WORK( IPU ), LNWIN, ONE, $ WORK(IPW3), HROWS ) CALL SLAMOV( 'All', HROWS, KS, $ WORK(IPW3), HROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF * * Compute H1*U12 + H2*U22 on the right side of * the border. * IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, LKTOP+DIM1, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', HROWS, DIM4, $ WORK(IPW), HROWS, WORK( IPW3 ), $ HROWS ) CALL STRMM( 'Right', 'Lower', $ 'No transpose', $ 'Non-unit', HROWS, DIM4, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW3 ), HROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', HROWS, DIM4, KS, $ ONE, WORK( IPW+HROWS*DIM4), $ HROWS, $ WORK( IPU+LNWIN*KS+DIM4 ), LNWIN, $ ONE, WORK( IPW3 ), HROWS ) CALL SLAMOV( 'All', HROWS, DIM4, $ WORK(IPW3), HROWS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 280 CONTINUE END IF * IF( DIR.EQ.2 .AND. WANTZ .AND. LENCBUF.GT.0 ) THEN * * Compute Z2*U21 + Z1*U11 on the left side * of border. * INDXE = MIN(N,1+(NPROW-1)*NB) DO 290 INDX = 1, INDXE, NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCZ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', ZROWS, KS, $ WORK( IPW2+ZROWS*DIM4), $ ZROWS, WORK(IPW3), ZROWS ) CALL STRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', ZROWS, KS, ONE, $ WORK( IPU+DIM4 ), LNWIN, $ WORK(IPW3), ZROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', ZROWS, KS, $ DIM4, ONE, WORK( IPW2 ), $ ZROWS, WORK( IPU ), LNWIN, $ ONE, WORK(IPW3), ZROWS ) CALL SLAMOV( 'All', ZROWS, KS, $ WORK(IPW3), ZROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF END IF * * Compute Z1*U12 + Z2*U22 on the right side * of border. * IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, DESCZ, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', ZROWS, DIM4, $ WORK(IPW2), ZROWS, $ WORK( IPW3 ), ZROWS ) CALL STRMM( 'Right', 'Lower', $ 'No transpose', $ 'Non-unit', ZROWS, DIM4, $ ONE, WORK( IPU+LNWIN*KS ), $ LNWIN, WORK( IPW3 ), ZROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', ZROWS, DIM4, $ KS, ONE, $ WORK( IPW2+ZROWS*(DIM4)), $ ZROWS, $ WORK( IPU+LNWIN*KS+DIM4 ), $ LNWIN, ONE, WORK( IPW3 ), $ ZROWS ) CALL SLAMOV( 'All', ZROWS, DIM4, $ WORK(IPW3), ZROWS, $ Z((JLOC-1)*LLDZ+ILOC), LLDZ ) END IF END IF 290 CONTINUE END IF * IF( DIR.EQ.1 .AND. WANTT .AND. LENRBUF.GT.0) THEN IF ( LKBOT.LT.N ) THEN * * Compute U21**T*H2 + U11**T*H1 on the upper * side of the border. * IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC4.AND. $ MOD(LKBOT,NB).NE.0 ) THEN INDX = LKBOT + 1 CALL INFOG2L( LKTOP, INDX, DESCH, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC4 ) CALL SLAMOV( 'All', KS, HCOLS, $ WORK( IPW1+DIM4 ), LNWIN, $ WORK(IPW3), KS ) CALL STRMM( 'Left', 'Upper', 'Transpose', $ 'Non-unit', KS, HCOLS, ONE, $ WORK( IPU+DIM4 ), LNWIN, $ WORK(IPW3), KS ) CALL SGEMM( 'Transpose', 'No transpose', $ KS, HCOLS, DIM4, ONE, WORK(IPU), $ LNWIN, WORK(IPW1), LNWIN, $ ONE, WORK(IPW3), KS ) CALL SLAMOV( 'All', KS, HCOLS, $ WORK(IPW3), KS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF * * Compute U12**T*H1 + U22**T*H2 one the lower * side of the border. * IF( MYROW.EQ.RSRC4.AND.MYCOL.EQ.CSRC4.AND. $ MOD(LKBOT,NB).NE.0 ) THEN INDX = LKBOT + 1 CALL INFOG2L( LKTOP+DIM1, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC4, CSRC4 ) CALL SLAMOV( 'All', DIM4, HCOLS, $ WORK( IPW1 ), LNWIN, $ WORK( IPW3 ), DIM4 ) CALL STRMM( 'Left', 'Lower', 'Transpose', $ 'Non-unit', DIM4, HCOLS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW3 ), DIM4 ) CALL SGEMM( 'Transpose', 'No Transpose', $ DIM4, HCOLS, KS, ONE, $ WORK( IPU+LNWIN*KS+DIM4 ), LNWIN, $ WORK( IPW1+DIM1 ), LNWIN, $ ONE, WORK( IPW3), DIM4 ) CALL SLAMOV( 'All', DIM4, HCOLS, $ WORK(IPW3), DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF * * Compute U21**T*H2 + U11**T*H1 on upper side * on border. * INDXS = ICEIL(LKBOT,NB)*NB+1 IF( MOD(LKBOT,NB).NE.0 ) THEN INDXE = MIN(N,INDXS+(NPCOL-2)*NB) ELSE INDXE = MIN(N,INDXS+(NPCOL-1)*NB) END IF DO 300 INDX = INDXS, INDXE, NB IF( MYROW.EQ.RSRC1 ) THEN CALL INFOG2L( LKTOP, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL SLAMOV( 'All', KS, HCOLS, $ WORK( IPW1+DIM4 ), LNWIN, $ WORK(IPW3), KS ) CALL STRMM( 'Left', 'Upper', $ 'Transpose', 'Non-unit', $ KS, HCOLS, ONE, $ WORK( IPU+DIM4 ), LNWIN, $ WORK(IPW3), KS ) CALL SGEMM( 'Transpose', $ 'No transpose', KS, HCOLS, $ DIM4, ONE, WORK(IPU), LNWIN, $ WORK(IPW1), LNWIN, ONE, $ WORK(IPW3), KS ) CALL SLAMOV( 'All', KS, HCOLS, $ WORK(IPW3), KS, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF * * Compute U12**T*H1 + U22**T*H2 on lower * side of border. * IF( MYROW.EQ.RSRC4 ) THEN CALL INFOG2L( LKTOP+DIM1, INDX, DESCH, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC4, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL SLAMOV( 'All', DIM4, HCOLS, $ WORK( IPW1 ), LNWIN, $ WORK( IPW3 ), DIM4 ) CALL STRMM( 'Left', 'Lower', $ 'Transpose','Non-unit', $ DIM4, HCOLS, ONE, $ WORK( IPU+LNWIN*KS ), LNWIN, $ WORK( IPW3 ), DIM4 ) CALL SGEMM( 'Transpose', $ 'No Transpose', DIM4, HCOLS, $ KS, ONE, $ WORK( IPU+LNWIN*KS+DIM4 ), $ LNWIN, WORK( IPW1+DIM1 ), $ LNWIN, ONE, WORK( IPW3), $ DIM4 ) CALL SLAMOV( 'All', DIM4, HCOLS, $ WORK(IPW3), DIM4, $ H((JLOC-1)*LLDH+ILOC), LLDH ) END IF END IF 300 CONTINUE END IF END IF END IF * * Update window information - mark processed windows are * completed. * IF( DIR.EQ.2 ) THEN IF( LKBOT.EQ.KBOT ) THEN LKTOP = KBOT+1 LKBOT = KBOT+1 IWORK( 1+(WIN-1)*5 ) = LKTOP IWORK( 2+(WIN-1)*5 ) = LKBOT ELSE LKTOP = MIN( LKTOP + LNWIN - LCHAIN, $ MIN( KBOT, ICEIL( LKBOT, NB )*NB ) - $ LCHAIN + 1 ) IWORK( 1+(WIN-1)*5 ) = LKTOP LKBOT = MIN( MAX( LKBOT + LNWIN - LCHAIN, $ LKTOP + NWIN - 1), MIN( KBOT, $ ICEIL( LKBOT, NB )*NB ) ) IWORK( 2+(WIN-1)*5 ) = LKBOT END IF IF( IWORK( 5+(WIN-1)*5 ).EQ.1 ) $ IWORK( 5+(WIN-1)*5 ) = 2 IWORK( 3+(WIN-1)*5 ) = RSRC4 IWORK( 4+(WIN-1)*5 ) = CSRC4 END IF * * If nothing was done for the WIN:th window, all * processors come here and consider the next one * instead. * 245 CONTINUE 240 CONTINUE 190 CONTINUE 150 CONTINUE 140 CONTINUE * * Chased off bulges from first window? * IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', '1-Tree', 1, 1, ICHOFF, 1, $ -1, -1, -1, -1, -1 ) * * If the bulge was chasen off from first window it is removed. * IF( ICHOFF.GT.0 ) THEN DO 198 WIN = 2, ANMWIN IWORK( 1+(WIN-2)*5 ) = IWORK( 1+(WIN-1)*5 ) IWORK( 2+(WIN-2)*5 ) = IWORK( 2+(WIN-1)*5 ) IWORK( 3+(WIN-2)*5 ) = IWORK( 3+(WIN-1)*5 ) IWORK( 4+(WIN-2)*5 ) = IWORK( 4+(WIN-1)*5 ) 198 CONTINUE ANMWIN = ANMWIN - 1 IPIW = 6+(ANMWIN-1)*5 END IF * * If we have no more windows, return. * IF( ANMWIN.LT.1 ) RETURN * * Check for any more windows to bring over the border. * WINFIN = 0 DO 199 WIN = 1, ANMWIN WINFIN = WINFIN+IWORK( 5+(WIN-1)*5 ) 199 CONTINUE IF( WINFIN.LT.2*ANMWIN ) GO TO 137 * * Zero out process mark for each window - this is legal now when * the process starts over with local bulge-chasing etc. * DO 201 WIN = 1, ANMWIN IWORK( 5+(WIN-1)*5 ) = 0 201 CONTINUE * END IF * * Go back to local bulge-chase and see if there is more work to do. * GO TO 20 * * End of PSLAQR5 * END scalapack-2.0.2/SRC/pslaqsy.f000644 000766 000024 00000032003 10363532303 016177 0ustar00juliestaff000000 000000 SUBROUTINE PSLAQSY( UPLO, N, A, IA, JA, DESCA, SR, SC, SCOND, $ AMAX, EQUED ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER EQUED, UPLO INTEGER IA, JA, N REAL AMAX, SCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), SC( * ), SR( * ) * .. * * Purpose * ======= * * PSLAQSY equilibrates a symmetric distributed matrix * sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the scaling factors in the * vectors SR and SC. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric distributed matrix sub( A ) is to be referenced: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (input/output) REAL pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the distributed symmetric * matrix sub( A ). If UPLO = 'U', the leading N-by-N upper * triangular part of sub( A ) contains the upper triangular * part of the matrix, and the strictly lower triangular part * of sub( A ) is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular part of the matrix, and the strictly upper trian- * gular part of sub( A ) is not referenced. * On exit, if EQUED = 'Y', the equilibrated matrix: * diag(SR(IA:IA+N-1)) * sub( A ) * diag(SC(JA:JA+N-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * SR (local input) REAL array, dimension LOCr(M_A) * The scale factors for A(IA:IA+M-1,JA:JA+N-1). SR is aligned * with the distributed matrix A, and replicated across every * process column. SR is tied to the distributed matrix A. * * SC (local input) REAL array, dimension LOCc(N_A) * The scale factors for sub( A ). SC is aligned with the dis- * tributed matrix A, and replicated down every process row. * SC is tied to the distributed matrix A. * * SCOND (global input) REAL * Ratio of the smallest SR(i) (respectively SC(j)) to the * largest SR(i) (respectively SC(j)), with IA <= i <= IA+N-1 * and JA <= j <= JA+N-1. * * AMAX (global input) REAL * Absolute value of the largest distributed submatrix entry. * * EQUED (output) CHARACTER*1 * Specifies whether or not equilibration was done. * = 'N': No equilibration. * = 'Y': Equilibration was done, i.e., sub( A ) has been re- * placed by: * diag(SR(IA:IA+N-1)) * sub( A ) * diag(SC(JA:JA+N-1)). * * Internal Parameters * =================== * * THRESH is a threshold value used to decide if scaling should be done * based on the ratio of the scaling factors. If SCOND < THRESH, * scaling is done. * * LARGE and SMALL are threshold values used to decide if scaling should * be done based on the absolute size of the largest matrix element. * If AMAX > LARGE or AMAX < SMALL, scaling is done. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, THRESH PARAMETER ( ONE = 1.0E+0, THRESH = 0.1E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, II, IIA, IOFFA, IROFF, J, $ JB, JJ, JJA, JN, KK, LDA, LL, MYCOL, MYROW, NP, $ NPCOL, NPROW REAL CJ, LARGE, SMALL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC REAL PSLAMCH EXTERNAL ICEIL, LSAME, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) THEN EQUED = 'N' RETURN END IF * * Get grid parameters and compute local indexes * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) LDA = DESCA( LLD_ ) * * Initialize LARGE and SMALL. * SMALL = PSLAMCH( ICTXT, 'Safe minimum' ) / $ PSLAMCH( ICTXT, 'Precision' ) LARGE = ONE / SMALL * IF( SCOND.GE.THRESH .AND. AMAX.GE.SMALL .AND. AMAX.LE.LARGE ) THEN * * No equilibration * EQUED = 'N' * ELSE * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Replace A by diag(S) * A * diag(S). * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangle of A(IA:IA+N-1,JA:JA+N-1) is stored. * Handle first block separately * IOFFA = (JJ-1)*LDA IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 20 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 10 KK = IIA, II+LL-JJ+1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE ELSE IOFFA = IOFFA + JB*LDA END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 70 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 40 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 30 KK = IIA, II+LL-JJ+1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE ELSE DO 60 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 50 KK = IIA, II-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 70 CONTINUE * ELSE * * Lower triangle of A(IA:IA+N-1,JA:JA+N-1) is stored. * Handle first block separately * IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFF * IOFFA = (JJ-1)*LDA IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 90 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 80 KK = II+LL-JJ, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE ELSE DO 110 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 100 KK = II, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 160 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 130 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 120 KK = II+LL-JJ, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 120 CONTINUE IOFFA = IOFFA + LDA 130 CONTINUE ELSE DO 150 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 140 KK = II, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 140 CONTINUE IOFFA = IOFFA + LDA 150 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 160 CONTINUE * END IF * EQUED = 'Y' * END IF * RETURN * * End of PSLAQSY * END scalapack-2.0.2/SRC/pslared1d.f000644 000766 000024 00000015027 10367447133 016403 0ustar00juliestaff000000 000000 SUBROUTINE PSLARED1D( N, IA, JA, DESC, BYCOL, BYALL, WORK, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 12, 2005 * * .. Scalar Arguments .. INTEGER IA, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESC( * ) REAL BYALL( * ), BYCOL( * ), WORK( LWORK ) * .. * * Purpose * ======= * * PSLARED1D redistributes a 1D array * * It assumes that the input array, BYCOL, is distributed across * rows and that all process columns contain the same copy of * BYCOL. The output array, BYALL, will be identical on all processes * and will contain the entire array. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NP = Number of local rows in BYCOL() * * N (global input) INTEGER * The number of elements to be redistributed. N >= 0. * * IA (global input) INTEGER * IA must be equal to 1 * * JA (global input) INTEGER * JA must be equal to 1 * * DESC (global/local input) INTEGER Array of dimension DLEN_ * A 2D array descriptor, which describes BYCOL * * BYCOL (local input) distributed block cyclic REAL array * global dimension (N), local dimension (NP) * BYCOL is distributed across the process rows * All process columns are assumed to contain the same value * * BYALL (global output) REAL global dimension( N ) * local dimension (N) * BYALL is exactly duplicated on all processes * It contains the same values as BYCOL, but it is replicated * across all processes rather than being distributed * * BYALL(i) = BYCOL( NUMROC(i,DESC( NB_ ),MYROW,0,NPROW ) on the procs * whose MYROW == mod((i-1)/DESC( NB_ ),NPROW) * * WORK (local workspace) REAL dimension (LWORK) * Used to hold the buffers sent from one process to another * * LWORK (local input) INTEGER size of WORK array * LWORK >= NUMROC(N, DESC( NB_ ), 0, 0, NPCOL) * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ALLI, BUFLEN, I, II, MYCOL, MYROW, NB, NPCOL, $ NPROW, PCOL * .. * .. External Functions .. * INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. * EXTERNAL BLACS_GRIDINFO, SCOPY, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESC( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) NB = DESC( MB_ ) * DO 30 PCOL = 0, NPCOL - 1 BUFLEN = NUMROC( N, NB, PCOL, 0, NPCOL ) IF( MYCOL.EQ.PCOL ) THEN CALL SCOPY( BUFLEN, BYCOL, 1, WORK, 1 ) CALL SGEBS2D( DESC( CTXT_ ), 'R', ' ', 1, BUFLEN, WORK, 1 ) ELSE CALL SGEBR2D( DESC( CTXT_ ), 'R', ' ', 1, BUFLEN, WORK, 1, $ MYROW, PCOL ) END IF * ALLI = PCOL*NB DO 20 II = 1, BUFLEN, NB DO 10 I = 1, MIN( NB, BUFLEN-II+1 ) BYALL( ALLI+I ) = WORK( II-1+I ) 10 CONTINUE ALLI = ALLI + NB*NPCOL 20 CONTINUE 30 CONTINUE * RETURN * * End of PSLARED1D * END scalapack-2.0.2/SRC/pslared2d.f000644 000766 000024 00000015065 10367447133 016406 0ustar00juliestaff000000 000000 SUBROUTINE PSLARED2D( N, IA, JA, DESC, BYROW, BYALL, WORK, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 12, 2005 * * .. Scalar Arguments .. INTEGER IA, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESC( * ) REAL BYALL( * ), BYROW( * ), WORK( LWORK ) * .. * * Purpose * ======= * * PSLARED2D redistributes a 1D array * * It assumes that the input array, BYROW, is distributed across * columns and that all process rows contain the same copy of * BYROW. The output array, BYALL, will be identical on all processes * and will contain the entire array. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NP = Number of local rows in BYROW() * * N (global input) INTEGER * The number of elements to be redistributed. N >= 0. * * IA (global input) INTEGER * IA must be equal to 1 * * JA (global input) INTEGER * JA must be equal to 1 * * DESC (global/local input) INTEGER Array of dimension DLEN_ * A 2D array descriptor, which describes BYROW * * BYROW (local input) distributed block cyclic REAL array * global dimension (N), local dimension (NP) * BYROW is distributed across the process columns * All process rows are assumed to contain the same value * * BYALL (global output) REAL global dimension( N ) * local dimension (N) * BYALL is exactly duplicated on all processes * It contains the same values as BYROW, but it is replicated * across all processes rather than being distributed * * BYALL(i) = BYROW( NUMROC(i,DESC( MB_ ),MYCOL,0,NPCOL ) on the procs * whose MYCOL == mod((i-1)/DESC( MB_ ),NPCOL) * * WORK (local workspace) REAL dimension (LWORK) * Used to hold the buffers sent from one process to another * * LWORK (local input) INTEGER size of WORK array * LWORK >= NUMROC(N, DESC( MB_ ), 0, 0, NPROW) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ALLI, BUFLEN, I, II, MB, MYCOL, MYROW, NPCOL, $ NPROW, PROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SCOPY, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESC( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) MB = DESC( MB_ ) * DO 30 PROW = 0, NPROW - 1 BUFLEN = NUMROC( N, MB, PROW, 0, NPROW ) IF( MYROW.EQ.PROW ) THEN CALL SCOPY( BUFLEN, BYROW, 1, WORK, 1 ) CALL SGEBS2D( DESC( CTXT_ ), 'C', ' ', BUFLEN, 1, WORK, $ BUFLEN ) ELSE CALL SGEBR2D( DESC( CTXT_ ), 'C', ' ', BUFLEN, 1, WORK, $ BUFLEN, PROW, MYCOL ) END IF * ALLI = PROW*MB DO 20 II = 1, BUFLEN, MB DO 10 I = 1, MIN( MB, BUFLEN-II+1 ) BYALL( ALLI+I ) = WORK( II-1+I ) 10 CONTINUE ALLI = ALLI + MB*NPROW 20 CONTINUE 30 CONTINUE * RETURN * * End of PSLARED2D * END scalapack-2.0.2/SRC/pslarf.f000644 000766 000024 00000067677 10363532303 016023 0ustar00juliestaff000000 000000 SUBROUTINE PSLARF( SIDE, M, N, V, IV, JV, DESCV, INCV, TAU, $ C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) REAL C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PSLARF applies a real elementary reflector Q (or Q**T) to a real * M-by-N distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), from * either the left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a real scalar and v is a real vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also have the first row of sub( C ). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also have the first column of sub( C ) and * MOD(JV-1,NB_V) must be equal to MOD(JC-1,NB_C), if INCV = 1 only the * last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q * sub( C ), * = 'R': form sub( C ) * Q, Q = Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * V (local input) REAL pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+M-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+M-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+N-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+N-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) REAL, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q * sub( C ) if SIDE = 'L', or * sub( C ) * Q if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) REAL array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL, ICOFF, ICROW, ICTXT, IIC, IIV, IOFFC, $ IOFFV, IPW, IROFF, IVCOL, IVROW, JJC, JJV, LDC, $ LDV, MYCOL, MYROW, MP, NCC, NCV, NPCOL, NPROW, $ NQ, RDEST REAL TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PB_TOPGET, PBSTRNV, $ SCOPY, SGEBR2D, SGEBS2D, SGEMV, $ SGER, SGERV2D, SGESD2D, SGSUM2D, $ SLASET * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) JJC = MIN( JJC, NCC ) JJV = MIN( JJV, NCV ) IOFFC = IIC+(JJC-1)*LDC IOFFV = IIV+(JJV-1)*LDV * IROFF = MOD( IC-1, DESCC( MB_ ) ) ICOFF = MOD( JC-1, DESCC( NB_ ) ) MP = NUMROC( M+IROFF, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYROW.EQ.ICROW ) $ MP = MP - IROFF IF( MYCOL.EQ.ICCOL ) $ NQ = NQ - ICOFF * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFF) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFF) ) * IF( LSAME( SIDE, 'L' ) ) THEN * IF( CRBLCK ) THEN RDEST = ICROW ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V * IPW = MP+1 CALL PBSTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL SGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL SGER( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * TAULOC = TAU( JJV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL SGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, V( IOFFV ), 1, $ ZERO, WORK, 1 ) ELSE CALL SLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK, MAX( 1, NQ ) ) END IF CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK, MAX( 1, NQ ), RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL SGER( MP, NQ, -TAULOC, V( IOFFV ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL SCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL SGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL ) * ELSE IF( MYCOL.EQ.ICCOL ) THEN * IPW = MP+1 CALL SGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL SGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL SGER( MP, NQ, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V * IPW = MP+1 CALL PBSTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN IF( IOFFC.GT.0 ) $ CALL SGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( IOFFC.GT.0 ) $ CALL SGER( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL SCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK(IPW) = TAU( JJV ) CALL SGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( JJV ) * ELSE * IPW = MP+1 CALL SGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN IF( IOFFC.GT.0 ) $ CALL SGEMV( 'Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( IOFFC.GT.0 ) $ CALL SGER( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW ) THEN * TAULOC = TAU( IIV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL SGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, V( IOFFV ), LDV, $ ZERO, WORK, 1 ) ELSE CALL SLASET( 'All', MP, 1, ZERO, ZERO, $ WORK, MAX( 1, MP ) ) END IF CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK, MAX( 1, MP ), RDEST, ICCOL ) * * sub( C ) := sub( C ) - w * v' * IF( IOFFV.GT.0 .AND. IOFFC.GT.0 ) $ CALL SGER( MP, NQ, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW * IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 CALL SCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL SGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW ) THEN * IPW = NQ+1 CALL SGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL SGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL SGER( MP, NQ, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V * IPW = NQ+1 CALL PBSTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = TAU( JJV ) * ELSE * CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL SGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL SGER( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 IF( IOFFV.GT.0 ) $ CALL SCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL SGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( IIV ) * ELSE * IPW = NQ+1 CALL SGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL SGEMV( 'No Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * IF( IOFFC.GT.0 ) $ CALL SGER( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V * IPW = NQ+1 CALL PBSTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = TAU( JJV ) * ELSE * CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL SGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', MP, 1, ZERO, ZERO, WORK( IPW ), $ MAX( 1, MP ) ) END IF CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL SGER( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, 1, $ C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PSLARF * END scalapack-2.0.2/SRC/pslarfb.f000644 000766 000024 00000103570 11750130340 016140 0ustar00juliestaff000000 000000 SUBROUTINE PSLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, IV, $ JV, DESCV, T, C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, DIRECT, STOREV INTEGER IC, IV, JC, JV, K, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) REAL C( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PSLARFB applies a real block reflector Q or its transpose Q**T to a * real distributed M-by-N matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1) * from the left or the right. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * DIRECT (global input) CHARACTER * Indicates how Q is formed from a product of elementary * reflectors * = 'F': Q = H(1) H(2) . . . H(k) (Forward) * = 'B': Q = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Indicates how the vectors which define the elementary * reflectors are stored: * = 'C': Columnwise * = 'R': Rowwise * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The order of the matrix T (= the number of elementary * reflectors whose product defines the block reflector). * * V (local input) REAL pointer into the local memory * to an array of dimension ( LLD_V, LOCc(JV+K-1) ) if * STOREV = 'C', ( LLD_V, LOCc(JV+M-1)) if STOREV = 'R' and * SIDE = 'L', ( LLD_V, LOCc(JV+N-1) ) if STOREV = 'R' and * SIDE = 'R'. It contains the local pieces of the distributed * vectors V representing the Householder transformation. * See further details. * If STOREV = 'C' and SIDE = 'L', LLD_V >= MAX(1,LOCr(IV+M-1)); * if STOREV = 'C' and SIDE = 'R', LLD_V >= MAX(1,LOCr(IV+N-1)); * if STOREV = 'R', LLD_V >= LOCr(IV+K-1). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * T (local input) REAL array, dimension MB_V by MB_V * if STOREV = 'R' and NB_V by NB_V if STOREV = 'C'. The trian- * gular matrix T in the representation of the block reflector. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the M-by-N distributed matrix sub( C ). On exit, * sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) or * sub( C )*Q or sub( C )*Q'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) REAL array, dimension (LWORK) * If STOREV = 'C', * if SIDE = 'L', * LWORK >= ( NqC0 + MpC0 ) * K * else if SIDE = 'R', * LWORK >= ( NqC0 + MAX( NpV0 + NUMROC( NUMROC( N+ICOFFC, * NB_V, 0, 0, NPCOL ), NB_V, 0, 0, LCMQ ), * MpC0 ) ) * K * end if * else if STOREV = 'R', * if SIDE = 'L', * LWORK >= ( MpC0 + MAX( MqV0 + NUMROC( NUMROC( M+IROFFC, * MB_V, 0, 0, NPROW ), MB_V, 0, 0, LCMP ), * NqC0 ) ) * K * else if SIDE = 'R', * LWORK >= ( MpC0 + NqC0 ) * K * end if * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFV = MOD( IV-1, MB_V ), ICOFFV = MOD( JV-1, NB_V ), * IVROW = INDXG2P( IV, MB_V, MYROW, RSRC_V, NPROW ), * IVCOL = INDXG2P( JV, NB_V, MYCOL, CSRC_V, NPCOL ), * MqV0 = NUMROC( M+ICOFFV, NB_V, MYCOL, IVCOL, NPCOL ), * NpV0 = NUMROC( N+IROFFV, MB_V, MYROW, IVROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NpC0 = NUMROC( N+ICOFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If STOREV = 'Columnwise' * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if STOREV = 'Rowwise' * If SIDE = 'Left', * ( NB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL FORWARD CHARACTER COLBTOP, ROWBTOP, TRANST, UPLO INTEGER HEIGHT, IBASE, ICCOL, ICOFFC, ICOFFV, ICROW, $ ICTXT, II, IIBEG, IIC, IIEND, IINXT, IIV, $ ILASTCOL, ILASTROW, ILEFT, IOFF, IOFFC, IOFFV, $ IPT, IPV, IPW, IPW1, IRIGHT, IROFFC, IROFFV, $ ITOP, IVCOL, IVROW, JJ, JJBEG, JJC, JJEND, $ JJNXT, JJV, KP, KQ, LDC, LDV, LV, LW, MBV, MPC, $ MPC0, MQV, MQV0, MYCOL, MYDIST, MYROW, NBV, $ NPV, NPV0, NPCOL, NPROW, NQC, NQC0, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG1L, INFOG2L, PB_TOPGET, $ PBSTRAN, SGEBR2D, SGEBS2D, SGEMM, $ SGSUM2D, SLAMOV, SLASET, STRBR2D, $ STRBS2D, STRMM * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 .OR. K.LE.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( TRANS, 'N' ) ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF FORWARD = LSAME( DIRECT, 'F' ) IF( FORWARD ) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) MBV = DESCV( MB_ ) NBV = DESCV( NB_ ) IROFFV = MOD( IV-1, MBV ) ICOFFV = MOD( JV-1, NBV ) MPC = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYCOL.EQ.ICCOL ) $ NQC = NQC - ICOFFC IF( MYROW.EQ.ICROW ) $ MPC = MPC - IROFFC JJC = MIN( JJC, MAX( 1, JJC+NQC-1 ) ) JJV = MIN( JJV, MAX( 1, NUMROC( DESCV( N_ ), NBV, MYCOL, $ DESCV( CSRC_ ), NPCOL ) ) ) IOFFC = IIC + ( JJC-1 ) * LDC IOFFV = IIV + ( JJV-1 ) * LDV * IF( LSAME( STOREV, 'C' ) ) THEN * * V is stored columnwise * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * Locally V( IOFFV ) is MPV x K, C( IOFFC ) is MPC x NQC * WORK( IPV ) is MPC x K = V( IOFFV ), MPC = MPV * WORK( IPW ) is NQC x K = C( IOFFC )' * V( IOFFV ) * IPV = 1 IPW = IPV + MPC * K LV = MAX( 1, MPC ) LW = MAX( 1, NQC ) * * Broadcast V to the other process columns. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ROWBTOP, MPC, K, $ V( IOFFV ), LDV ) IF( MYROW.EQ.IVROW ) $ CALL STRBS2D( ICTXT, 'Rowwise', ROWBTOP, UPLO, $ 'Non unit', K, K, T, NBV ) CALL SLAMOV( 'All', MPC, K, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ROWBTOP, MPC, K, $ WORK( IPV ), LV, MYROW, IVCOL ) IF( MYROW.EQ.IVROW ) $ CALL STRBR2D( ICTXT, 'Rowwise', ROWBTOP, UPLO, $ 'Non unit', K, K, T, NBV, MYROW, IVCOL ) END IF * IF( FORWARD ) THEN * * WORK(IPV) = ( V1 ) where V1 is unit lower triangular, * ( V2 ) zeroes upper triangular part of V1 * MYDIST = MOD( MYROW-IVROW+NPROW, NPROW ) ITOP = MAX( 0, MYDIST*MBV - IROFFV ) IIBEG = IIV IIEND = IIBEG + MPC - 1 IINXT = MIN( ICEIL( IIBEG, MBV )*MBV, IIEND ) * 10 CONTINUE IF( K-ITOP .GT.0 ) THEN CALL SLASET( 'Upper', IINXT-IIBEG+1, K-ITOP, ZERO, $ ONE, WORK( IPV+IIBEG-IIV+ITOP*LV ), LV ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IROFFV IIBEG = IINXT + 1 IINXT = MIN( IINXT+MBV, IIEND ) GO TO 10 END IF * ELSE * * WORK(IPV) = ( V1 ) where V2 is unit upper triangular, * ( V2 ) zeroes lower triangular part of V2 * JJ = JJV IOFF = MOD( IV+M-K-1, MBV ) CALL INFOG1L( IV+M-K, MBV, NPROW, MYROW, DESCV( RSRC_ ), $ II, ILASTROW ) KP = NUMROC( K+IOFF, MBV, MYROW, ILASTROW, NPROW ) IF( MYROW.EQ.ILASTROW ) $ KP = KP - IOFF MYDIST = MOD( MYROW-ILASTROW+NPROW, NPROW ) ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP+MBV, K ) ITOP = MIN( MAX( 0, ITOP ), K ) * 20 CONTINUE IF( JJ.LE.( JJV+K-1 ) ) THEN HEIGHT = IBASE - ITOP CALL SLASET( 'All', KP, ITOP-JJ+JJV, ZERO, ZERO, $ WORK( IPV+II-IIV+(JJ-JJV)*LV ), LV ) CALL SLASET( 'Lower', KP, HEIGHT, ZERO, ONE, $ WORK( IPV+II-IIV+ITOP*LV ), LV ) KP = MAX( 0, KP - HEIGHT ) II = II + HEIGHT JJ = JJV + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP + MBV, K ) ITOP = MIN( ITOP, K ) GO TO 20 END IF * END IF * * WORK( IPW ) = C( IOFFC )' * V (NQC x MPC x K) -> NQC x K * IF( MPC.GT.0 ) THEN CALL SGEMM( 'Transpose', 'No transpose', NQC, K, MPC, $ ONE, C( IOFFC ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL SLASET( 'All', NQC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQC, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * CALL STRMM( 'Right', UPLO, TRANST, 'Non unit', NQC, K, $ ONE, T, NBV, WORK( IPW ), LW ) CALL SGEBS2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C C - V * W' * C( IOFFC ) = C( IOFFC ) - WORK( IPV ) * WORK( IPW )' * MPC x NQC MPC x K K x NQC * CALL SGEMM( 'No transpose', 'Transpose', MPC, NQC, K, -ONE, $ WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC ), LDC ) * ELSE * * Form sub( C )*Q or sub( C )*Q' * * ICOFFC = IROFFV is required by the current transposition * routine PBSTRAN * NPV0 = NUMROC( N+IROFFV, MBV, MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) THEN NPV = NPV0 - IROFFV ELSE NPV = NPV0 END IF IF( MYCOL.EQ.ICCOL ) THEN NQC0 = NQC + ICOFFC ELSE NQC0 = NQC END IF * * Locally V( IOFFV ) is NPV x K C( IOFFC ) is MPC x NQC * WORK( IPV ) is K x NQC0 = [ . V( IOFFV ) ]' * WORK( IPW ) is NPV0 x K = [ . V( IOFFV )' ]' * WORK( IPT ) is the workspace for PBSTRAN * IPV = 1 IPW = IPV + K * NQC0 IPT = IPW + NPV0 * K LV = MAX( 1, K ) LW = MAX( 1, NPV0 ) * IF( MYCOL.EQ.IVCOL ) THEN IF( MYROW.EQ.IVROW ) THEN CALL SLASET( 'All', IROFFV, K, ZERO, ZERO, $ WORK( IPW ), LW ) IPW1 = IPW + IROFFV CALL SLAMOV( 'All', NPV, K, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) ELSE IPW1 = IPW CALL SLAMOV( 'All', NPV, K, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) END IF * IF( FORWARD ) THEN * * WORK(IPW) = ( . V1' V2' )' where V1 is unit lower * triangular, zeroes upper triangular part of V1 * MYDIST = MOD( MYROW-IVROW+NPROW, NPROW ) ITOP = MAX( 0, MYDIST*MBV - IROFFV ) IIBEG = IIV IIEND = IIBEG + NPV - 1 IINXT = MIN( ICEIL( IIBEG, MBV )*MBV, IIEND ) * 30 CONTINUE IF( ( K-ITOP ).GT.0 ) THEN CALL SLASET( 'Upper', IINXT-IIBEG+1, K-ITOP, ZERO, $ ONE, WORK( IPW1+IIBEG-IIV+ITOP*LW ), $ LW ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IROFFV IIBEG = IINXT + 1 IINXT = MIN( IINXT+MBV, IIEND ) GO TO 30 END IF * ELSE * * WORK( IPW ) = ( . V1' V2' )' where V2 is unit upper * triangular, zeroes lower triangular part of V2. * JJ = JJV CALL INFOG1L( IV+N-K, MBV, NPROW, MYROW, $ DESCV( RSRC_ ), II, ILASTROW ) IOFF = MOD( IV+N-K-1, MBV ) KP = NUMROC( K+IOFF, MBV, MYROW, ILASTROW, NPROW ) IF( MYROW.EQ.ILASTROW ) $ KP = KP - IOFF MYDIST = MOD( MYROW-ILASTROW+NPROW, NPROW ) ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP+MBV, K ) ITOP = MIN( MAX( 0, ITOP ), K ) * 40 CONTINUE IF( JJ.LE.( JJV+K-1 ) ) THEN HEIGHT = IBASE - ITOP CALL SLASET( 'All', KP, ITOP-JJ+JJV, ZERO, ZERO, $ WORK( IPW1+II-IIV+(JJ-JJV)*LW ), LW ) CALL SLASET( 'Lower', KP, HEIGHT, ZERO, ONE, $ WORK( IPW1+II-IIV+ITOP*LW ), LW ) KP = MAX( 0, KP - HEIGHT ) II = II + HEIGHT JJ = JJV + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP + MBV, K ) ITOP = MIN( ITOP, K ) GO TO 40 END IF END IF END IF * CALL PBSTRAN( ICTXT, 'Columnwise', 'Transpose', N+IROFFV, K, $ MBV, WORK( IPW ), LW, ZERO, WORK( IPV ), LV, $ IVROW, IVCOL, -1, ICCOL, WORK( IPT ) ) * * WORK( IPV ) = ( . V' ) -> WORK( IPV ) = V' is K x NQC * IF( MYCOL.EQ.ICCOL ) $ IPV = IPV + ICOFFC * LV * * WORK( IPW ) becomes MPC x K = C( IOFFC ) * V * WORK( IPW ) = C( IOFFC ) * V (MPC x NQC x K) -> MPC x K * LW = MAX( 1, MPC ) * IF( NQC.GT.0 ) THEN CALL SGEMM( 'No transpose', 'Transpose', MPC, K, NQC, $ ONE, C( IOFFC ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL SLASET( 'All', MPC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN IF( MYROW.EQ.IVROW ) THEN * * Broadcast the block reflector to the other rows. * CALL STRBS2D( ICTXT, 'Columnwise', ' ', UPLO, $ 'Non unit', K, K, T, NBV ) ELSE CALL STRBR2D( ICTXT, 'Columnwise', ' ', UPLO, $ 'Non unit', K, K, T, NBV, IVROW, MYCOL ) END IF CALL STRMM( 'Right', UPLO, TRANS, 'Non unit', MPC, K, $ ONE, T, NBV, WORK( IPW ), LW ) * CALL SGEBS2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C C - W * V' * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC x NQC MPC x K K x NQC * CALL SGEMM( 'No transpose', 'No transpose', MPC, NQC, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC ), LDC ) END IF * ELSE * * V is stored rowwise * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * IROFFC = ICOFFV is required by the current transposition * routine PBSTRAN * MQV0 = NUMROC( M+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN MQV = MQV0 - ICOFFV ELSE MQV = MQV0 END IF IF( MYROW.EQ.ICROW ) THEN MPC0 = MPC + IROFFC ELSE MPC0 = MPC END IF * * Locally V( IOFFV ) is K x MQV, C( IOFFC ) is MPC x NQC * WORK( IPV ) is MPC0 x K = [ . V( IOFFV ) ]' * WORK( IPW ) is K x MQV0 = [ . V( IOFFV ) ] * WORK( IPT ) is the workspace for PBSTRAN * IPV = 1 IPW = IPV + MPC0 * K IPT = IPW + K * MQV0 LV = MAX( 1, MPC0 ) LW = MAX( 1, K ) * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN CALL SLASET( 'All', K, ICOFFV, ZERO, ZERO, $ WORK( IPW ), LW ) IPW1 = IPW + ICOFFV * LW CALL SLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) ELSE IPW1 = IPW CALL SLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) END IF * IF( FORWARD ) THEN * * WORK( IPW ) = ( . V1 V2 ) where V1 is unit upper * triangular, zeroes lower triangular part of V1 * MYDIST = MOD( MYCOL-IVCOL+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBV - ICOFFV ) JJBEG = JJV JJEND = JJV + MQV - 1 JJNXT = MIN( ICEIL( JJBEG, NBV ) * NBV, JJEND ) * 50 CONTINUE IF( ( K-ILEFT ).GT.0 ) THEN CALL SLASET( 'Lower', K-ILEFT, JJNXT-JJBEG+1, ZERO, $ ONE, $ WORK( IPW1+ILEFT+(JJBEG-JJV)*LW ), $ LW ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - ICOFFV JJBEG = JJNXT + 1 JJNXT = MIN( JJNXT+NBV, JJEND ) GO TO 50 END IF * ELSE * * WORK( IPW ) = ( . V1 V2 ) where V2 is unit lower * triangular, zeroes upper triangular part of V2. * II = IIV CALL INFOG1L( JV+M-K, NBV, NPCOL, MYCOL, $ DESCV( CSRC_ ), JJ, ILASTCOL ) IOFF = MOD( JV+M-K-1, NBV ) KQ = NUMROC( K+IOFF, NBV, MYCOL, ILASTCOL, NPCOL ) IF( MYCOL.EQ.ILASTCOL ) $ KQ = KQ - IOFF MYDIST = MOD( MYCOL-ILASTCOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT+NBV, K ) ILEFT = MIN( MAX( 0, ILEFT ), K ) * 60 CONTINUE IF( II.LE.( IIV+K-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL SLASET( 'All', ILEFT-II+IIV, KQ, ZERO, ZERO, $ WORK( IPW1+II-IIV+(JJ-JJV)*LW ), LW ) CALL SLASET( 'Upper', WIDE, KQ, ZERO, ONE, $ WORK( IPW1+ILEFT+(JJ-JJV)*LW ), LW ) KQ = MAX( 0, KQ - WIDE ) II = IIV + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT + NBV, K ) ILEFT = MIN( ILEFT, K ) GO TO 60 END IF END IF END IF * * WORK( IPV ) = WORK( IPW )' (replicated) is MPC0 x K * CALL PBSTRAN( ICTXT, 'Rowwise', 'Transpose', K, M+ICOFFV, $ NBV, WORK( IPW ), LW, ZERO, WORK( IPV ), LV, $ IVROW, IVCOL, ICROW, -1, WORK( IPT ) ) * * WORK( IPV ) = ( . V )' -> WORK( IPV ) = V' is MPC x K * IF( MYROW.EQ.ICROW ) $ IPV = IPV + IROFFC * * WORK( IPW ) becomes NQC x K = C( IOFFC )' * V' * WORK( IPW ) = C( IOFFC )' * V' (NQC x MPC x K) -> NQC x K * LW = MAX( 1, NQC ) * IF( MPC.GT.0 ) THEN CALL SGEMM( 'Transpose', 'No transpose', NQC, K, MPC, $ ONE, C( IOFFC ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL SLASET( 'All', NQC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQC, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN * * Broadcast the block reflector to the other columns. * CALL STRBS2D( ICTXT, 'Rowwise', ' ', UPLO, 'Non unit', $ K, K, T, MBV ) ELSE CALL STRBR2D( ICTXT, 'Rowwise', ' ', UPLO, 'Non unit', $ K, K, T, MBV, MYROW, IVCOL ) END IF CALL STRMM( 'Right', UPLO, TRANST, 'Non unit', NQC, K, $ ONE, T, MBV, WORK( IPW ), LW ) * CALL SGEBS2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C C - V' * W' * C( IOFFC ) = C( IOFFC ) - WORK( IPV ) * WORK( IPW )' * MPC x NQC MPC x K K x NQC * CALL SGEMM( 'No transpose', 'Transpose', MPC, NQC, K, -ONE, $ WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC ), LDC ) * ELSE * * Form Q*sub( C ) or Q'*sub( C ) * * Locally V( IOFFV ) is K x NQV, C( IOFFC ) is MPC x NQC * WORK( IPV ) is K x NQV = V( IOFFV ), NQV = NQC * WORK( IPW ) is MPC x K = C( IOFFC ) * V( IOFFV )' * IPV = 1 IPW = IPV + K * NQC LV = MAX( 1, K ) LW = MAX( 1, MPC ) * * Broadcast V to the other process rows. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) IF( MYROW.EQ.IVROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', COLBTOP, K, NQC, $ V( IOFFV ), LDV ) IF( MYCOL.EQ.IVCOL ) $ CALL STRBS2D( ICTXT, 'Columnwise', COLBTOP, UPLO, $ 'Non unit', K, K, T, MBV ) CALL SLAMOV( 'All', K, NQC, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', COLBTOP, K, NQC, $ WORK( IPV ), LV, IVROW, MYCOL ) IF( MYCOL.EQ.IVCOL ) $ CALL STRBR2D( ICTXT, 'Columnwise', COLBTOP, UPLO, $ 'Non unit', K, K, T, MBV, IVROW, MYCOL ) END IF * IF( FORWARD ) THEN * * WORK(IPW) = ( V1 V2 ) where V1 is unit upper * triangular, zeroes lower triangular part of V1 * MYDIST = MOD( MYCOL-IVCOL+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBV - ICOFFV ) JJBEG = JJV JJEND = JJV + NQC - 1 JJNXT = MIN( ICEIL( JJBEG, NBV ) * NBV, JJEND ) * 70 CONTINUE IF( ( K-ILEFT ).GT.0 ) THEN CALL SLASET( 'Lower', K-ILEFT, JJNXT-JJBEG+1, ZERO, $ ONE, WORK( IPV+ILEFT+(JJBEG-JJV)*LV ), $ LV ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - ICOFFV JJBEG = JJNXT + 1 JJNXT = MIN( JJNXT+NBV, JJEND ) GO TO 70 END IF * ELSE * * WORK( IPW ) = ( . V1 V2 ) where V2 is unit lower * triangular, zeroes upper triangular part of V2. * II = IIV CALL INFOG1L( JV+N-K, NBV, NPCOL, MYCOL, DESCV( CSRC_ ), $ JJ, ILASTCOL ) IOFF = MOD( JV+N-K-1, NBV ) KQ = NUMROC( K+IOFF, NBV, MYCOL, ILASTCOL, NPCOL ) IF( MYCOL.EQ.ILASTCOL ) $ KQ = KQ - IOFF MYDIST = MOD( MYCOL-ILASTCOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT+NBV, K ) ILEFT = MIN( MAX( 0, ILEFT ), K ) * 80 CONTINUE IF( II.LE.( IIV+K-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL SLASET( 'All', ILEFT-II+IIV, KQ, ZERO, ZERO, $ WORK( IPV+II-IIV+(JJ-JJV)*LV ), LV ) CALL SLASET( 'Upper', WIDE, KQ, ZERO, ONE, $ WORK( IPV+ILEFT+(JJ-JJV)*LV ), LV ) KQ = MAX( 0, KQ - WIDE ) II = IIV + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT + NBV, K ) ILEFT = MIN( ILEFT, K ) GO TO 80 END IF * END IF * * WORK( IPV ) is K x NQC = V = V( IOFFV ) * WORK( IPW ) = C( IOFFC ) * V' (MPC x NQC x K) -> MPC x K * IF( NQC.GT.0 ) THEN CALL SGEMM( 'No Transpose', 'Transpose', MPC, K, NQC, $ ONE, C( IOFFC ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL SLASET( 'All', MPC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN CALL STRMM( 'Right', UPLO, TRANS, 'Non unit', MPC, K, $ ONE, T, MBV, WORK( IPW ), LW ) CALL SGEBS2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C C - W * V * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC x NQC MPC x K K x NQC * CALL SGEMM( 'No transpose', 'No transpose', MPC, NQC, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC ), LDC ) * END IF * END IF * RETURN * * End of PSLARFB * END scalapack-2.0.2/SRC/pslarfg.f000644 000766 000024 00000023237 10363532303 016152 0ustar00juliestaff000000 000000 SUBROUTINE PSLARFG( N, ALPHA, IAX, JAX, X, IX, JX, DESCX, INCX, $ TAU ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAX, INCX, IX, JAX, JX, N REAL ALPHA * .. * .. Array Arguments .. INTEGER DESCX( * ) REAL TAU( * ), X( * ) * .. * * Purpose * ======= * * PSLARFG generates a real elementary reflector H of order n, such * that * * H * sub( X ) = H * ( x(iax,jax) ) = ( alpha ), H' * H = I. * ( x ) ( 0 ) * * where alpha is a scalar, and sub( X ) is an (N-1)-element real * distributed vector X(IX:IX+N-2,JX) if INCX = 1 and X(IX,JX:JX+N-2) if * INCX = DESCX(M_). H is represented in the form * * H = I - tau * ( 1 ) * ( 1 v' ) , * ( v ) * * where tau is a real scalar and v is a real (N-1)-element * vector. * * If the elements of sub( X ) are all zero, then tau = 0 and H is * taken to be the unit matrix. * * Otherwise 1 <= tau <= 2. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) INTEGER * The global order of the elementary reflector. N >= 0. * * ALPHA (local output) REAL * On exit, alpha is computed in the process scope having the * vector sub( X ). * * IAX (global input) INTEGER * The global row index in X of X(IAX,JAX). * * JAX (global input) INTEGER * The global column index in X of X(IAX,JAX). * * X (local input/local output) REAL, pointer into the * local memory to an array of dimension (LLD_X,*). This array * contains the local pieces of the distributed vector sub( X ). * Before entry, the incremented array sub( X ) must contain * the vector x. On exit, it is overwritten with the vector v. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * TAU (local output) REAL, array, dimension LOCc(JX) * if INCX = 1, and LOCr(IX) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER ICTXT, IIAX, INDXTAU, IXCOL, IXROW, J, JJAX, $ KNT, MYCOL, MYROW, NPCOL, NPROW REAL BETA, RSAFMN, SAFMIN, XNORM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSNRM2, SGEBR2D, $ SGEBS2D, PSSCAL * .. * .. External Functions .. REAL SLAMCH, SLAPY2 EXTERNAL SLAMCH, SLAPY2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, SIGN * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is distributed across a process row. * CALL INFOG2L( IX, JAX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIAX, JJAX, IXROW, IXCOL ) * IF( MYROW.NE.IXROW ) $ RETURN * * Broadcast X(IAX,JAX) across the process row. * IF( MYCOL.EQ.IXCOL ) THEN J = IIAX+(JJAX-1)*DESCX( LLD_ ) CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, X( J ), 1 ) ALPHA = X( J ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1, $ MYROW, IXCOL ) END IF * INDXTAU = IIAX * ELSE * * sub( X ) is distributed across a process column. * CALL INFOG2L( IAX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIAX, JJAX, IXROW, IXCOL ) * IF( MYCOL.NE.IXCOL ) $ RETURN * * Broadcast X(IAX,JAX) across the process column. * IF( MYROW.EQ.IXROW ) THEN J = IIAX+(JJAX-1)*DESCX( LLD_ ) CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, X( J ), 1 ) ALPHA = X( J ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, ALPHA, 1, $ IXROW, MYCOL ) END IF * INDXTAU = JJAX * END IF * IF( N.LE.0 ) THEN TAU( INDXTAU ) = ZERO RETURN END IF * CALL PSNRM2( N-1, XNORM, X, IX, JX, DESCX, INCX ) * IF( XNORM.EQ.ZERO ) THEN * * H = I * TAU( INDXTAU ) = ZERO * ELSE * * General case * BETA = -SIGN( SLAPY2( ALPHA, XNORM ), ALPHA ) SAFMIN = SLAMCH( 'S' ) RSAFMN = ONE / SAFMIN IF( ABS( BETA ).LT.SAFMIN ) THEN * * XNORM, BETA may be inaccurate; scale X and recompute them * KNT = 0 10 CONTINUE KNT = KNT + 1 CALL PSSCAL( N-1, RSAFMN, X, IX, JX, DESCX, INCX ) BETA = BETA*RSAFMN ALPHA = ALPHA*RSAFMN IF( ABS( BETA ).LT.SAFMIN ) $ GO TO 10 * * New BETA is at most 1, at least SAFMIN * CALL PSNRM2( N-1, XNORM, X, IX, JX, DESCX, INCX ) BETA = -SIGN( SLAPY2( ALPHA, XNORM ), ALPHA ) TAU( INDXTAU ) = ( BETA-ALPHA ) / BETA CALL PSSCAL( N-1, ONE/(ALPHA-BETA), X, IX, JX, DESCX, INCX ) * * If ALPHA is subnormal, it may lose relative accuracy * ALPHA = BETA DO 20 J = 1, KNT ALPHA = ALPHA*SAFMIN 20 CONTINUE ELSE TAU( INDXTAU ) = ( BETA-ALPHA ) / BETA CALL PSSCAL( N-1, ONE/(ALPHA-BETA), X, IX, JX, DESCX, INCX ) ALPHA = BETA END IF END IF * RETURN * * End of PSLARFG * END scalapack-2.0.2/SRC/pslarft.f000644 000766 000024 00000044300 10363532303 016161 0ustar00juliestaff000000 000000 SUBROUTINE PSLARFT( DIRECT, STOREV, N, K, V, IV, JV, DESCV, TAU, $ T, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIRECT, STOREV INTEGER IV, JV, K, N * .. * .. Array Arguments .. INTEGER DESCV( * ) REAL TAU( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PSLARFT forms the triangular factor T of a real block reflector H * of order n, which is defined as a product of k elementary reflectors. * * If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; * * If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. * * If STOREV = 'C', the vector which defines the elementary reflector * H(i) is stored in the i-th column of the distributed matrix V, and * * H = I - V * T * V' * * If STOREV = 'R', the vector which defines the elementary reflector * H(i) is stored in the i-th row of the distributed matrix V, and * * H = I - V' * T * V * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIRECT (global input) CHARACTER*1 * Specifies the order in which the elementary reflectors are * multiplied to form the block reflector: * = 'F': H = H(1) H(2) . . . H(k) (Forward) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER*1 * Specifies how the vectors which define the elementary * reflectors are stored (see also Further Details): * = 'C': columnwise * = 'R': rowwise * * N (global input) INTEGER * The order of the block reflector H. N >= 0. * * K (global input) INTEGER * The order of the triangular factor T (= the number of * elementary reflectors). 1 <= K <= MB_V (= NB_V). * * V (input/output) REAL pointer into the local memory * to an array of local dimension (LOCr(IV+N-1),LOCc(JV+K-1)) * if STOREV = 'C', and (LOCr(IV+K-1),LOCc(JV+N-1)) if * STOREV = 'R'. The distributed matrix V contains the * Householder vectors. See further details. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * TAU (local input) REAL, array, dimension LOCr(IV+K-1) * if INCV = M_V, and LOCc(JV+K-1) otherwise. This array * contains the Householder scalars related to the Householder * vectors. TAU is tied to the distributed matrix V. * * T (local output) REAL array, dimension (NB_V,NB_V) * if STOREV = 'Col', and (MB_V,MB_V) otherwise. It contains * the k-by-k triangular factor of the block reflector asso- * ciated with V. If DIRECT = 'F', T is upper triangular; * if DIRECT = 'B', T is lower triangular. * * WORK (local workspace) REAL array, * dimension (K*(K-1)/2) * * Further Details * =============== * * The shape of the matrix V and the storage of the vectors which define * the H(i) is best illustrated by the following example with n = 5 and * k = 3. The elements equal to 1 are not stored; the corresponding * array elements are modified but restored on exit. The rest of the * array is not used. * * DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': * * V( IV:IV+N-1, ( 1 ) V( IV:IV+K-1, ( 1 v1 v1 v1 v1 ) * JV:JV+K-1 ) = ( v1 1 ) JV:JV+N-1 ) = ( 1 v2 v2 v2 ) * ( v1 v2 1 ) ( 1 v3 v3 ) * ( v1 v2 v3 ) * ( v1 v2 v3 ) * * DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': * * V( IV:IV+N-1, ( v1 v2 v3 ) V( IV:IV+K-1, ( v1 v1 1 ) * JV:JV+K-1 ) = ( v1 v2 v3 ) JV:JV+N-1 ) = ( v2 v2 v2 1 ) * ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) * ( 1 v3 ) * ( 1 ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL FORWARD INTEGER ICOFF, ICTXT, II, IIV, IROFF, IVCOL, IVROW, $ ITMP0, ITMP1, IW, JJ, JJV, LDV, MICOL, MIROW, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ REAL VII * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SCOPY, SGEMV, $ SGSUM2D, SLASET, STRMV * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 .OR. K.LE.0 ) $ RETURN * ICTXT = DESCV( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * FORWARD = LSAME( DIRECT, 'F' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, $ IIV, JJV, IVROW, IVCOL ) * IF( LSAME( STOREV, 'C' ) .AND. MYCOL.EQ.IVCOL ) THEN * IW = 1 LDV = DESCV( LLD_ ) IROFF = MOD( IV-1, DESCV( MB_ ) ) * IF( FORWARD ) THEN * * DIRECT = 'Forward', STOREV = 'Columnwise' * NP = NUMROC( N+IROFF, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) THEN NP = NP - IROFF II = IIV + 1 ELSE II = IIV END IF IF( IROFF+1.EQ.DESCV( MB_ ) ) THEN MIROW = MOD( IVROW+1, NPROW ) ELSE MIROW = IVROW END IF ITMP0 = 0 * DO 10 JJ = JJV+1, JJV+K-1 * IF( MYROW.EQ.MIROW ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( jv+i-1 ) * * V(iv+i-1:iv+n-1,jv:jv+i-2)' * V(iv+i-1:iv+n-1,jv+i-1) * ITMP0 = ITMP0 + 1 IF( NP-II+IIV.GT.0 ) THEN CALL SGEMV( 'Transpose', NP-II+IIV, ITMP0, $ -TAU( JJ ), V( II+(JJV-1)*LDV ), LDV, $ V( II+(JJ-1)*LDV ), 1, ZERO, $ WORK( IW ), 1 ) ELSE CALL SLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF * IW = IW + ITMP0 IF( MYROW.EQ.MIROW ) THEN V( II+(JJ-1)*LDV ) = VII II = II + 1 END IF * IF( MOD( IV+ITMP0, DESCV( MB_ ) ).EQ.0 ) $ MIROW = MOD( MIROW+1, NPROW ) * 10 CONTINUE * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', IW-1, 1, WORK, IW-1, $ IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * IW = 1 ITMP0 = 0 ITMP1 = 1 * T( ITMP1 ) = TAU( JJV ) * DO 20 JJ = JJV+1, JJV+K-1 * * T(1:j-1,j) = T(1:j-1,1:j-1) * T(1:j-1,j) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 + DESCV( NB_ ) CALL SCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL STRMV( 'Upper', 'No transpose', 'Non-unit', $ ITMP0, T, DESCV( NB_ ), T( ITMP1 ), 1 ) T(ITMP1+ITMP0) = TAU( JJ ) * 20 CONTINUE * END IF * ELSE * * DIRECT = 'Backward', STOREV = 'Columnwise' * NP = NUMROC( N+IROFF-1, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ NP = NP - IROFF MIROW = INDXG2P( IV+N-2, DESCV( MB_ ), MYROW, $ DESCV( RSRC_ ), NPROW ) II = IIV + NP - 1 ITMP0 = 0 * DO 30 JJ = JJV+K-2, JJV, -1 * IF( MYROW.EQ.MIROW ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( jv+i-1 ) * * V(iv:iv+n-k+i-1,jv+i:jv+k-1)' * V(iv:iv+n-k+i-1,jv+i-1) * ITMP0 = ITMP0 + 1 IF( II-IIV+1.GT.0 ) THEN CALL SGEMV( 'Transpose', II-IIV+1, ITMP0, -TAU( JJ ), $ V( IIV+JJ*LDV ), LDV, $ V( IIV+(JJ-1)*LDV ), 1, ZERO, $ WORK( IW ), 1 ) ELSE CALL SLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF * IW = IW + ITMP0 IF( MYROW.EQ.MIROW ) THEN V( II+(JJ-1)*LDV ) = VII II = II - 1 END IF * IF( MOD( IV+N-ITMP0-2, DESCV(MB_) ).EQ.0 ) $ MIROW = MOD( MIROW+NPROW-1, NPROW ) * 30 CONTINUE * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', IW-1, 1, WORK, IW-1, $ IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( NB_ ) * T( ITMP1-1 ) = TAU( JJV+K-1 ) * DO 40 JJ = JJV+K-2, JJV, -1 * * T(j+1:k,j) = T(j+1:k,j+1:k) * T(j+1:k,j) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( NB_ ) - 1 CALL SCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL STRMV( 'Lower', 'No transpose', 'Non-unit', $ ITMP0, T( ITMP1+DESCV( NB_ ) ), $ DESCV( NB_ ), T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( JJ ) * 40 CONTINUE * END IF * END IF * ELSE IF( LSAME( STOREV, 'R' ) .AND. MYROW.EQ.IVROW ) THEN * IW = 1 LDV = DESCV( LLD_ ) ICOFF = MOD( JV-1, DESCV( NB_ ) ) * IF( FORWARD ) THEN * * DIRECT = 'Forward', STOREV = 'Rowwise' * NQ = NUMROC( N+ICOFF, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN NQ = NQ - ICOFF JJ = JJV + 1 ELSE JJ = JJV END IF IF( ICOFF+1.EQ.DESCV( NB_ ) ) THEN MICOL = MOD( IVCOL+1, NPCOL ) ELSE MICOL = IVCOL END IF ITMP0 = 0 * DO 50 II = IIV+1, IIV+K-1 * IF( MYCOL.EQ.MICOL ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( iv+i-1 ) * * V(iv+i-1,jv+i-1:jv+n-1) * V(iv:iv+i-2,jv+i-1:jv+n-1)' * ITMP0 = ITMP0 + 1 IF( NQ-JJ+JJV.GT.0 ) THEN CALL SGEMV( 'No transpose', ITMP0, NQ-JJ+JJV, $ -TAU(II), V( IIV+(JJ-1)*LDV ), LDV, $ V( II+(JJ-1)*LDV ), LDV, ZERO, $ WORK( IW ), 1 ) ELSE CALL SLASET( 'All', ITMP0, 1, ZERO, ZERO, $ WORK( IW ), ITMP0 ) END IF * IW = IW + ITMP0 IF( MYCOL.EQ.MICOL ) THEN V( II+(JJ-1)*LDV ) = VII JJ = JJ + 1 END IF * IF( MOD( JV+ITMP0, DESCV( NB_ ) ).EQ.0 ) $ MICOL = MOD( MICOL+1, NPCOL ) * 50 CONTINUE * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = 1 * T( ITMP1 ) = TAU( IIV ) * DO 60 II = IIV+1, IIV+K-1 * * T(1:i-1,i) = T(1:i-1,1:i-1) * T(1:i-1,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 + DESCV( MB_ ) CALL SCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL STRMV( 'Upper', 'No transpose', 'Non-unit', $ ITMP0, T, DESCV( MB_ ), T( ITMP1 ), 1 ) T( ITMP1+ITMP0 ) = TAU( II ) * 60 CONTINUE * END IF * ELSE * * DIRECT = 'Backward', STOREV = 'Rowwise' * NQ = NUMROC( N+ICOFF-1, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQ = NQ - ICOFF MICOL = INDXG2P( JV+N-2, DESCV( NB_ ), MYCOL, $ DESCV( CSRC_ ), NPCOL ) JJ = JJV + NQ - 1 ITMP0 = 0 * DO 70 II = IIV+K-2, IIV, -1 * IF( MYCOL.EQ.MICOL ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(i+1:k,i) = -tau( iv+i-1 ) * * V(iv+i:iv+k-1,jv:jv+n-k+i-1)' * V(iv+i-1,jv:jv+n-k+i-1)' * ITMP0 = ITMP0 + 1 IF( JJ-JJV+1.GT.0 ) THEN CALL SGEMV( 'No transpose', ITMP0, JJ-JJV+1, $ -TAU( II ), V( II+1+(JJV-1)*LDV ), LDV, $ V( II+(JJV-1)*LDV ), LDV, ZERO, $ WORK( IW ), 1 ) ELSE CALL SLASET( 'All', ITMP0, 1, ZERO, ZERO, $ WORK( IW ), ITMP0 ) END IF * IW = IW + ITMP0 IF( MYCOL.EQ.MICOL ) THEN V( II+(JJ-1)*LDV ) = VII JJ = JJ - 1 END IF * IF( MOD( JV+N-ITMP0-2, DESCV( NB_ ) ).EQ.0 ) $ MICOL = MOD( MICOL+NPCOL-1, NPCOL ) * 70 CONTINUE * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( MB_ ) * T( ITMP1-1 ) = TAU( IIV+K-1 ) * DO 80 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = T(i+1:k,i+1:k) * T(i+1:k,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( MB_ ) - 1 CALL SCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL STRMV( 'Lower', 'No transpose', 'Non-unit', $ ITMP0, T( ITMP1+DESCV( MB_ ) ), $ DESCV( MB_ ), T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( II ) * 80 CONTINUE * END IF * END IF * END IF * RETURN * * End of PSLARFT * END scalapack-2.0.2/SRC/pslarz.f000644 000766 000024 00000101713 10363532303 016023 0ustar00juliestaff000000 000000 SUBROUTINE PSLARZ( SIDE, M, N, L, V, IV, JV, DESCV, INCV, TAU, C, $ IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) REAL C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PSLARZ applies a real elementary reflector Q (or Q**T) to a real * M-by-N distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), from * either the left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a real scalar and v is a real vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Q is a product of k elementary reflectors as returned by PSTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also own C(IC+M-L,JC:JC+N-1). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC+N-L-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also own C(IC:IC+M-1,JC+N-L) and * MOD(JV-1,NB_V) must be equal to MOD(JC+N-L-1,NB_C), if INCV = 1 only * the last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q * sub( C ), * = 'R': form sub( C ) * Q, Q = Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) REAL pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+L-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+L-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) REAL, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q * sub( C ) if SIDE = 'L', or * sub( C ) * Q if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) REAL array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK, LEFT CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIC1, IIC2, IIV, IOFFC1, $ IOFFC2, IOFFV, IPW, IROFFC1, IROFFC2, IROFFV, $ IVCOL, IVROW, JJC1, JJC2, JJV, LDC, LDV, MPC2, $ MPV, MYCOL, MYROW, NCC, NCV, NPCOL, NPROW, $ NQC2, NQV, RDEST REAL TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PB_TOPGET, PBSTRNV, $ SAXPY, SCOPY, SGEBR2D, SGEBS2D, $ SGEMV, SGER, SGERV2D, SGESD2D, $ SGSUM2D, SLASET * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * LEFT = LSAME( SIDE, 'L' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) IROFFV = MOD( IV-1, DESCV( NB_ ) ) MPV = NUMROC( L+IROFFV, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ MPV = MPV - IROFFV ICOFFV = MOD( JV-1, DESCV( NB_ ) ) NQV = NUMROC( L+ICOFFV, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, NCV ) IOFFV = IIV+(JJV-1)*LDV NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC1, JJC1, ICROW1, ICCOL1 ) IROFFC1 = MOD( IC-1, DESCC( MB_ ) ) ICOFFC1 = MOD( JC-1, DESCC( NB_ ) ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NCC ) ) IOFFC1 = IIC1 + ( JJC1-1 ) * LDC * IF( LEFT ) THEN CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, DESCC( MB_ ) ) ICOFFC2 = MOD( JC-1, DESCC( NB_ ) ) NQC2 = NUMROC( N+ICOFFC2, DESCC( NB_ ), MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 ELSE CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC-1, DESCC( MB_ ) ) MPC2 = NUMROC( M+IROFFC2, DESCC( MB_ ), MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = MOD( JC+N-L-1, DESCC( NB_ ) ) END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NCC ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFFC1) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFFC1) ) * IF( LEFT ) THEN * IF( CRBLCK ) THEN RDEST = ICROW2 ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V (ICOFFV = IROFFC2) * IPW = MPV+1 CALL PBSTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL SGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL SGER( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL2 ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * TAULOC = TAU( JJV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL SGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, V( IOFFV ), 1, $ ZERO, WORK, 1 ) ELSE CALL SLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK, MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK, MAX( 1, NQC2 ) ) * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK, MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, -TAULOC, WORK, $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL SGER( MPV, NQC2, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL2 * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL SCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL SGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL2 ) * ELSE IF( MYCOL.EQ.ICCOL2 ) THEN * IPW = MPV+1 CALL SGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL SGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), $ RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL SGER( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V (ICOFFV=IROFFC2) * IPW = MPV+1 CALL PBSTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL SGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL SGER( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL SCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL SGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( JJV ) * ELSE * IPW = MPV+1 CALL SGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL SGEMV( 'Transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL SAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL SGER( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW2 ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW2 ) THEN * TAULOC = TAU( IIV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL SGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, V( IOFFV ), $ LDV, ZERO, WORK, 1 ) ELSE CALL SLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK, MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK, 1 ) * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK, MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) * IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, -TAULOC, WORK, 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * IF( MPC2.GT.0 .AND. NQV.GT.0 ) $ CALL SGER( MPC2, NQV, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC2 ), $ LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW2 * IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL SCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL SGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW2, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW2 ) THEN * IPW = NQV+1 CALL SGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL SGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), $ RDEST, ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL SGER( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V (IROFFV = ICOFFC2) * IPW = NQV+1 CALL PBSTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW2 ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = TAU( JJV ) * ELSE * CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL SGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL SGER( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL SCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL SGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( IIV ) * ELSE * IPW = NQV+1 CALL SGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL SGEMV( 'No Transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL SGER( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V (ICOFFC2=IROFFV) * IPW = NQV+1 CALL PBSTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = TAU( JJV ) * ELSE * CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL SGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL SLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL SAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL SGER( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PSLARZ * END scalapack-2.0.2/SRC/pslarzb.f000644 000766 000024 00000055577 11745552113 016213 0ustar00juliestaff000000 000000 SUBROUTINE PSLARZB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, V, $ IV, JV, DESCV, T, C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 14, 2000 * * .. Scalar Arguments .. CHARACTER DIRECT, SIDE, STOREV, TRANS INTEGER IC, IV, JC, JV, K, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) REAL C( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PSLARZB applies a real block reflector Q or its transpose Q**T to * a real distributed M-by-N matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1) * from the left or the right. * * Q is a product of k elementary reflectors as returned by PSTZRZF. * * Currently, only STOREV = 'R' and DIRECT = 'B' are supported. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * DIRECT (global input) CHARACTER * Indicates how H is formed from a product of elementary * reflectors * = 'F': H = H(1) H(2) . . . H(k) (Forward, not supported yet) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Indicates how the vectors which define the elementary * reflectors are stored: * = 'C': Columnwise (not supported yet) * = 'R': Rowwise * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The order of the matrix T (= the number of elementary * reflectors whose product defines the block reflector). * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) REAL pointer into the local memory * to an array of dimension (LLD_V, LOCc(JV+M-1)) if SIDE = 'L', * (LLD_V, LOCc(JV+N-1)) if SIDE = 'R'. It contains the local * pieces of the distributed vectors V representing the * Householder transformation as returned by PSTZRZF. * LLD_V >= LOCr(IV+K-1). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * T (local input) REAL array, dimension MB_V by MB_V * The lower triangular matrix T in the representation of the * block reflector. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the M-by-N distributed matrix sub( C ). On exit, * sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) or * sub( C )*Q or sub( C )*Q'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) REAL array, dimension (LWORK) * If STOREV = 'C', * if SIDE = 'L', * LWORK >= ( NqC0 + MpC0 ) * K * else if SIDE = 'R', * LWORK >= ( NqC0 + MAX( NpV0 + NUMROC( NUMROC( N+ICOFFC, * NB_V, 0, 0, NPCOL ), NB_V, 0, 0, LCMQ ), * MpC0 ) ) * K * end if * else if STOREV = 'R', * if SIDE = 'L', * LWORK >= ( MpC0 + MAX( MqV0 + NUMROC( NUMROC( M+IROFFC, * MB_V, 0, 0, NPROW ), MB_V, 0, 0, LCMP ), * NqC0 ) ) * K * else if SIDE = 'R', * LWORK >= ( MpC0 + NqC0 ) * K * end if * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFV = MOD( IV-1, MB_V ), ICOFFV = MOD( JV-1, NB_V ), * IVROW = INDXG2P( IV, MB_V, MYROW, RSRC_V, NPROW ), * IVCOL = INDXG2P( JV, NB_V, MYCOL, CSRC_V, NPCOL ), * MqV0 = NUMROC( M+ICOFFV, NB_V, MYCOL, IVCOL, NPCOL ), * NpV0 = NUMROC( N+IROFFV, MB_V, MYROW, IVROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NpC0 = NUMROC( N+ICOFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If STOREV = 'Columnwise' * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if STOREV = 'Rowwise' * If SIDE = 'Left', * ( NB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LEFT CHARACTER COLBTOP, TRANST INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIBEG, IIC1, IIC2, $ IIEND, IINXT, IIV, ILEFT, INFO, IOFFC2, IOFFV, $ IPT, IPV, IPW, IROFFC1, IROFFC2, ITOP, IVCOL, $ IVROW, JJBEG, JJEND, JJNXT, JJC1, JJC2, JJV, $ LDC, LDV, LV, LW, MBC, MBV, MPC1, MPC2, MPC20, $ MQV, MQV0, MYCOL, MYDIST, MYROW, NBC, NBV, $ NPCOL, NPROW, NQC1, NQC2, NQCALL, NQV * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, INFOG2L, $ PBSMATADD, PBSTRAN, PB_TOPGET, PXERBLA, $ SGEBR2D, SGEBS2D, SGEMM, $ SGSUM2D, SLAMOV, SLASET, STRBR2D, $ STRBS2D, STRMM * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 .OR. K.LE.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check for currently supported options * INFO = 0 IF( .NOT.LSAME( DIRECT, 'B' ) ) THEN INFO = -3 ELSE IF( .NOT.LSAME( STOREV, 'R' ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSLARZB', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * LEFT = LSAME( SIDE, 'L' ) IF( LSAME( TRANS, 'N' ) ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) MBV = DESCV( MB_ ) NBV = DESCV( NB_ ) ICOFFV = MOD( JV-1, NBV ) NQV = NUMROC( L+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, MAX( 1, NUMROC( DESCV( N_ ), NBV, MYCOL, $ DESCV( CSRC_ ), NPCOL ) ) ) IOFFV = IIV + ( JJV-1 ) * LDV MBC = DESCC( MB_ ) NBC = DESCC( NB_ ) NQCALL = NUMROC( DESCC( N_ ), NBC, MYCOL, DESCC( CSRC_ ), NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC1, $ JJC1, ICROW1, ICCOL1 ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NQCALL ) ) * IF( LEFT ) THEN IROFFC1 = MOD( IC-1, MBC ) MPC1 = NUMROC( K+IROFFC1, MBC, MYROW, ICROW1, NPROW ) IF( MYROW.EQ.ICROW1 ) $ MPC1 = MPC1 - IROFFC1 ICOFFC1 = MOD( JC-1, NBC ) NQC1 = NUMROC( N+ICOFFC1, NBC, MYCOL, ICCOL1, NPCOL ) IF( MYCOL.EQ.ICCOL1 ) $ NQC1 = NQC1 - ICOFFC1 CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, MBC ) MPC2 = NUMROC( L+IROFFC2, MBC, MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = ICOFFC1 NQC2 = NQC1 ELSE IROFFC1 = MOD( IC-1, MBC ) MPC1 = NUMROC( M+IROFFC1, MBC, MYROW, ICROW1, NPROW ) IF( MYROW.EQ.ICROW1 ) $ MPC1 = MPC1 - IROFFC1 ICOFFC1 = MOD( JC-1, NBC ) NQC1 = NUMROC( K+ICOFFC1, NBC, MYCOL, ICCOL1, NPCOL ) IF( MYCOL.EQ.ICCOL1 ) $ NQC1 = NQC1 - ICOFFC1 CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = IROFFC1 MPC2 = MPC1 ICOFFC2 = MOD( JC+N-L-1, NBC ) NQC2 = NUMROC( L+ICOFFC2, NBC, MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NQCALL ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * IROFFC2 = ICOFFV is required by the current transposition * routine PBSTRAN * MQV0 = NUMROC( M+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN MQV = MQV0 - ICOFFV ELSE MQV = MQV0 END IF IF( MYROW.EQ.ICROW2 ) THEN MPC20 = MPC2 + IROFFC2 ELSE MPC20 = MPC2 END IF * * Locally V( IOFFV ) is K x MQV, C( IOFFC2 ) is MPC2 x NQC2 * WORK( IPV ) is MPC20 x K = [ . V( IOFFV ) ]' * WORK( IPW ) is K x MQV0 = [ . V( IOFFV ) ] * WORK( IPT ) is the workspace for PBSTRAN * IPV = 1 IPW = IPV + MPC20 * K IPT = IPW + K * MQV0 LV = MAX( 1, MPC20 ) LW = MAX( 1, K ) * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN CALL SLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW+ICOFFV*LW ), LW ) ELSE CALL SLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW ), LW ) END IF END IF * * WORK( IPV ) = WORK( IPW )' (replicated) is MPC20 x K * CALL PBSTRAN( ICTXT, 'Rowwise', 'Transpose', K, M+ICOFFV, $ DESCV( NB_ ), WORK( IPW ), LW, ZERO, $ WORK( IPV ), LV, IVROW, IVCOL, ICROW2, -1, $ WORK( IPT ) ) * * WORK( IPV ) = ( . V )' -> WORK( IPV ) = V' is MPC2 x K * IF( MYROW.EQ.ICROW2 ) $ IPV = IPV + IROFFC2 * * WORK( IPW ) becomes NQC2 x K = C( IOFFC2 )' * V' * WORK( IPW ) = C( IOFFC2 )' * V' (NQC2 x MPC2 x K) -> NQC2 x K * LW = MAX( 1, NQC2 ) * IF( MPC2.GT.0 ) THEN CALL SGEMM( 'Transpose', 'No transpose', NQC2, K, MPC2, $ ONE, C( IOFFC2 ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL SLASET( 'All', NQC2, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * * WORK( IPW ) = WORK( IPW ) + C1 ( NQC1 = NQC2 ) * IF( MPC1.GT.0 ) THEN MYDIST = MOD( MYROW-ICROW1+NPROW, NPROW ) ITOP = MAX( 0, MYDIST * MBC - IROFFC1 ) IIBEG = IIC1 IIEND = IIC1 + MPC1 - 1 IINXT = MIN( ICEIL( IIBEG, MBC ) * MBC, IIEND ) * 10 CONTINUE IF( IIBEG.LE.IINXT ) THEN CALL PBSMATADD( ICTXT, 'Transpose', NQC2, IINXT-IIBEG+1, $ ONE, C( IIBEG+(JJC1-1)*LDC ), LDC, ONE, $ WORK( IPW+ITOP ), LW ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBC - IROFFC1 IIBEG = IINXT +1 IINXT = MIN( IINXT+MBC, IIEND ) GO TO 10 END IF END IF * CALL SGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN * * Broadcast the block reflector to the other columns. * CALL STRBS2D( ICTXT, 'Rowwise', ' ', 'Lower', 'Non unit', $ K, K, T, MBV ) ELSE CALL STRBR2D( ICTXT, 'Rowwise', ' ', 'Lower', 'Non unit', $ K, K, T, MBV, MYROW, IVCOL ) END IF CALL STRMM( 'Right', 'Lower', TRANST, 'Non unit', NQC2, K, $ ONE, T, MBV, WORK( IPW ), LW ) * CALL SGEBS2D( ICTXT, 'Columnwise', ' ', NQC2, K, $ WORK( IPW ), LW ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', NQC2, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C1 = C1 - WORK( IPW ) * IF( MPC1.GT.0 ) THEN MYDIST = MOD( MYROW-ICROW1+NPROW, NPROW ) ITOP = MAX( 0, MYDIST * MBC - IROFFC1 ) IIBEG = IIC1 IIEND = IIC1 + MPC1 - 1 IINXT = MIN( ICEIL( IIBEG, MBC ) * MBC, IIEND ) * 20 CONTINUE IF( IIBEG.LE.IINXT ) THEN CALL PBSMATADD( ICTXT, 'Transpose', IINXT-IIBEG+1, NQC2, $ -ONE, WORK( IPW+ITOP ), LW, ONE, $ C( IIBEG+(JJC1-1)*LDC ), LDC ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBC - IROFFC1 IIBEG = IINXT +1 IINXT = MIN( IINXT+MBC, IIEND ) GO TO 20 END IF END IF * * C2 C2 - V' * W' * C( IOFFC2 ) = C( IOFFC2 ) - WORK( IPV ) * WORK( IPW )' * MPC2 x NQC2 MPC2 x K K x NQC2 * CALL SGEMM( 'No transpose', 'Transpose', MPC2, NQC2, K, -ONE, $ WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC2 ), LDC ) * ELSE * * Form sub( C ) * Q or sub( C ) * Q' * * Locally V( IOFFV ) is K x NQV, C( IOFFC2 ) is MPC2 x NQC2 * WORK( IPV ) is K x NQV = V( IOFFV ), NQV = NQC2 * WORK( IPW ) is MPC2 x K = C( IOFFC2 ) * V( IOFFV )' * IPV = 1 IPW = IPV + K * NQC2 LV = MAX( 1, K ) LW = MAX( 1, MPC2 ) * * Broadcast V to the other process rows. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) IF( MYROW.EQ.IVROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', COLBTOP, K, NQC2, $ V( IOFFV ), LDV ) IF( MYCOL.EQ.IVCOL ) $ CALL STRBS2D( ICTXT, 'Columnwise', COLBTOP, 'Lower', $ 'Non unit', K, K, T, MBV ) CALL SLAMOV( 'All', K, NQC2, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', COLBTOP, K, NQC2, $ WORK( IPV ), LV, IVROW, MYCOL ) IF( MYCOL.EQ.IVCOL ) $ CALL STRBR2D( ICTXT, 'Columnwise', COLBTOP, 'Lower', $ 'Non unit', K, K, T, MBV, IVROW, MYCOL ) END IF * * WORK( IPV ) is K x NQC2 = V = V( IOFFV ) * WORK( IPW ) = C( IOFFC2 ) * V' (MPC2 x NQC2 x K) -> MPC2 x K * IF( NQC2.GT.0 ) THEN CALL SGEMM( 'No Transpose', 'Transpose', MPC2, K, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL SLASET( 'All', MPC2, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * * WORK( IPW ) = WORK( IPW ) + C1 ( MPC1 = MPC2 ) * IF( NQC1.GT.0 ) THEN MYDIST = MOD( MYCOL-ICCOL1+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBC - ICOFFC1 ) JJBEG = JJC1 JJEND = JJC1 + NQC1 - 1 JJNXT = MIN( ICEIL( JJBEG, NBC ) * NBC, JJEND ) * 30 CONTINUE IF( JJBEG.LE.JJNXT ) THEN CALL PBSMATADD( ICTXT, 'No transpose', MPC2, $ JJNXT-JJBEG+1, ONE, $ C( IIC1+(JJBEG-1)*LDC ), LDC, ONE, $ WORK( IPW+ILEFT*LW ), LW ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBC - ICOFFC1 JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBC, JJEND ) GO TO 30 END IF END IF * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN CALL STRMM( 'Right', 'Lower', TRANS, 'Non unit', MPC2, K, $ ONE, T, MBV, WORK( IPW ), LW ) CALL SGEBS2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C1 = C1 - WORK( IPW ) * IF( NQC1.GT.0 ) THEN MYDIST = MOD( MYCOL-ICCOL1+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBC - ICOFFC1 ) JJBEG = JJC1 JJEND = JJC1 + NQC1 - 1 JJNXT = MIN( ICEIL( JJBEG, NBC ) * NBC, JJEND ) * 40 CONTINUE IF( JJBEG.LE.JJNXT ) THEN CALL PBSMATADD( ICTXT, 'No transpose', MPC2, $ JJNXT-JJBEG+1, -ONE, $ WORK( IPW+ILEFT*LW ), LW, ONE, $ C( IIC1+(JJBEG-1)*LDC ), LDC ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBC - ICOFFC1 JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBC, JJEND ) GO TO 40 END IF END IF * * C2 C2 - W * V * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC2 x NQC2 MPC2 x K K x NQC2 * IF( IOFFC2.GT.0 ) $ CALL SGEMM( 'No transpose', 'No transpose', MPC2, NQC2, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC2 ), LDC ) * END IF * RETURN * * End of PSLARZB * END scalapack-2.0.2/SRC/pslarzt.f000644 000766 000024 00000025640 10363532303 016213 0ustar00juliestaff000000 000000 SUBROUTINE PSLARZT( DIRECT, STOREV, N, K, V, IV, JV, DESCV, TAU, $ T, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIRECT, STOREV INTEGER IV, JV, K, N * .. * .. Array Arguments .. INTEGER DESCV( * ) REAL TAU( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PSLARZT forms the triangular factor T of a real block reflector * H of order > n, which is defined as a product of k elementary * reflectors as returned by PSTZRZF. * * If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; * * If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. * * If STOREV = 'C', the vector which defines the elementary reflector * H(i) is stored in the i-th column of the array V, and * * H = I - V * T * V' * * If STOREV = 'R', the vector which defines the elementary reflector * H(i) is stored in the i-th row of the array V, and * * H = I - V' * T * V * * Currently, only STOREV = 'R' and DIRECT = 'B' are supported. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIRECT (global input) CHARACTER * Specifies the order in which the elementary reflectors are * multiplied to form the block reflector: * = 'F': H = H(1) H(2) . . . H(k) (Forward, not supported yet) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Specifies how the vectors which define the elementary * reflectors are stored (see also Further Details): * = 'C': columnwise (not supported yet) * = 'R': rowwise * * N (global input) INTEGER * The number of meaningful entries of the block reflector H. * N >= 0. * * K (global input) INTEGER * The order of the triangular factor T (= the number of * elementary reflectors). 1 <= K <= MB_V (= NB_V). * * V (input/output) REAL pointer into the local memory * to an array of local dimension (LOCr(IV+K-1),LOCc(JV+N-1)). * The distributed matrix V contains the Householder vectors. * See further details. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * TAU (local input) REAL, array, dimension LOCr(IV+K-1) * if INCV = M_V, and LOCc(JV+K-1) otherwise. This array * contains the Householder scalars related to the Householder * vectors. TAU is tied to the distributed matrix V. * * T (local output) REAL array, dimension (MB_V,MB_V) * It contains the k-by-k triangular factor of the block * reflector associated with V. T is lower triangular. * * WORK (local workspace) REAL array, * dimension (K*(K-1)/2) * * Further Details * =============== * * The shape of the matrix V and the storage of the vectors which define * the H(i) is best illustrated by the following example with n = 5 and * k = 3. The elements equal to 1 are not stored; the corresponding * array elements are modified but restored on exit. The rest of the * array is not used. * * DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': * * ______V_____ * ( v1 v2 v3 ) / \ * ( v1 v2 v3 ) ( v1 v1 v1 v1 v1 . . . . 1 ) * V = ( v1 v2 v3 ) ( v2 v2 v2 v2 v2 . . . 1 ) * ( v1 v2 v3 ) ( v3 v3 v3 v3 v3 . . 1 ) * ( v1 v2 v3 ) * . . . * . . . * 1 . . * 1 . * 1 * * DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': * * ______V_____ * 1 / \ * . 1 ( 1 . . . . v1 v1 v1 v1 v1 ) * . . 1 ( . 1 . . . v2 v2 v2 v2 v2 ) * . . . ( . . 1 . . v3 v3 v3 v3 v3 ) * . . . * ( v1 v2 v3 ) * ( v1 v2 v3 ) * V = ( v1 v2 v3 ) * ( v1 v2 v3 ) * ( v1 v2 v3 ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER ICOFF, ICTXT, II, IIV, INFO, IVCOL, IVROW, $ ITMP0, ITMP1, IW, JJV, LDV, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, INFOG2L, PXERBLA, $ SCOPY, SGEMV, SGSUM2D, SLASET, $ STRMV * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCV( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check for currently supported options * INFO = 0 IF( .NOT.LSAME( DIRECT, 'B' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( STOREV, 'R' ) ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSLARZT', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, $ IIV, JJV, IVROW, IVCOL ) * IF( MYROW.EQ.IVROW ) THEN IW = 1 ITMP0 = 0 LDV = DESCV( LLD_ ) ICOFF = MOD( JV-1, DESCV( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQ = NQ - ICOFF * DO 10 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = -tau( iv+i-1 ) * * V(iv+i:iv+k-1,jv:jv+n-1) * V(iv+i-1,jv:jv+n-1)' * ITMP0 = ITMP0 + 1 IF( NQ.GT.0 ) THEN CALL SGEMV( 'No transpose', ITMP0, NQ, -TAU( II ), $ V( II+1+(JJV-1)*LDV ), LDV, $ V( II+(JJV-1)*LDV ), LDV, ZERO, WORK( IW ), $ 1 ) ELSE CALL SLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF IW = IW + ITMP0 * 10 CONTINUE * CALL SGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( MB_ ) * T( ITMP1-1 ) = TAU( IIV+K-1 ) * DO 20 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = T(i+1:k,i+1:k) * T(i+1:k,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( MB_ ) - 1 CALL SCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL STRMV( 'Lower', 'No transpose', 'Non-unit', ITMP0, $ T( ITMP1+DESCV( MB_ ) ), DESCV( MB_ ), $ T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( II ) * 20 CONTINUE * END IF * END IF * RETURN * * End of PSLARZT * END scalapack-2.0.2/SRC/pslascl.f000644 000766 000024 00000043002 11556766441 016166 0ustar00juliestaff000000 000000 SUBROUTINE PSLASCL( TYPE, CFROM, CTO, M, N, A, IA, JA, DESCA, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TYPE INTEGER IA, INFO, JA, M, N REAL CFROM, CTO * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLASCL multiplies the M-by-N real distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1) by the real scalar CTO/CFROM. This * is done without over/underflow as long as the final result * CTO * A(I,J) / CFROM does not over/underflow. TYPE specifies that * sub( A ) may be full, upper triangular, lower triangular or upper * Hessenberg. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TYPE (global input) CHARACTER * TYPE indices the storage type of the input distributed * matrix. * = 'G': sub( A ) is a full matrix, * = 'L': sub( A ) is a lower triangular matrix, * = 'U': sub( A ) is an upper triangular matrix, * = 'H': sub( A ) is an upper Hessenberg matrix. * * CFROM (global input) REAL * CTO (global input) REAL * The distributed matrix sub( A ) is multiplied by CTO/CFROM. * A(I,J) is computed without over/underflow if the final * result CTO * A(I,J) / CFROM can be represented without * over/underflow. CFROM must be nonzero. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * matrix sub( A ). On exit, this array contains the local * pieces of the distributed matrix multiplied by CTO/CFROM. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) * .. * .. Local Scalars .. LOGICAL DONE INTEGER IACOL, IAROW, ICOFFA, ICTXT, ICURCOL, ICURROW, $ IIA, II, INXTROW, IOFFA, IROFFA, ITYPE, J, JB, $ JJA, JJ, JN, KK, LDA, LL, MYCOL, MYROW, MP, $ NPCOL, NPROW, NQ REAL BIGNUM, CFROM1, CFROMC, CTO1, CTOC, MUL, SMLNUM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, INFOG2L, PXERBLA * .. * .. External Functions .. LOGICAL LSAME, SISNAN INTEGER ICEIL, NUMROC REAL PSLAMCH EXTERNAL SISNAN, ICEIL, LSAME, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * IF( NPROW.EQ.-1 ) THEN INFO = -907 ELSE INFO = 0 CALL CHK1MAT( M, 4, N, 6, IA, JA, DESCA, 9, INFO ) IF( INFO.EQ.0 ) THEN IF( LSAME( TYPE, 'G' ) ) THEN ITYPE = 0 ELSE IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 ELSE ITYPE = -1 END IF IF( ITYPE.EQ.-1 ) THEN INFO = -1 ELSE IF( CFROM.EQ.ZERO .OR. SISNAN(CFROM) ) THEN INFO = -4 ELSE IF( SISNAN(CTO) ) THEN INFO = -5 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSLASCL', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) $ RETURN * * Get machine parameters * SMLNUM = PSLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM * CFROMC = CFROM CTOC = CTO * * Compute local indexes * LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * 10 CONTINUE CFROM1 = CFROMC*SMLNUM IF( CFROM1.EQ.CFROMC ) THEN ! CFROMC is an inf. Multiply by a correctly signed zero for ! finite CTOC, or a NaN if CTOC is infinite. MUL = CTOC / CFROMC DONE = .TRUE. CTO1 = CTOC ELSE CTO1 = CTOC / BIGNUM IF( CTO1.EQ.CTOC ) THEN ! CTOC is either 0 or an inf. In both cases, CTOC itself ! serves as the correct multiplication factor. MUL = CTOC DONE = .TRUE. CFROMC = ONE ELSE IF( ABS( CFROM1 ).GT.ABS( CTOC ) .AND. CTOC.NE.ZERO ) THEN MUL = SMLNUM DONE = .FALSE. CFROMC = CFROM1 ELSE IF( ABS( CTO1 ).GT.ABS( CFROMC ) ) THEN MUL = BIGNUM DONE = .FALSE. CTOC = CTO1 ELSE MUL = CTOC / CFROMC DONE = .TRUE. END IF END IF * IOFFA = ( JJA - 1 ) * LDA ICURROW = IAROW ICURCOL = IACOL * IF( ITYPE.EQ.0 ) THEN * * Full matrix * DO 30 JJ = JJA, JJA+NQ-1 DO 20 II = IIA, IIA+MP-1 A( IOFFA+II ) = A( IOFFA+II ) * MUL 20 CONTINUE IOFFA = IOFFA + LDA 30 CONTINUE * ELSE IF( ITYPE.EQ.1 ) THEN * * Lower triangular matrix * II = IIA JJ = JJA JB = JN-JA+1 * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 50 LL = JJ, JJ + JB -1 DO 40 KK = II+LL-JJ, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 40 CONTINUE IOFFA = IOFFA + LDA 50 CONTINUE ELSE DO 70 LL = JJ, JJ + JB -1 DO 60 KK = II, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 60 CONTINUE IOFFA = IOFFA + LDA 70 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 120 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 90 LL = JJ, JJ + JB -1 DO 80 KK = II+LL-JJ, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE ELSE DO 110 LL = JJ, JJ + JB -1 DO 100 KK = II, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 120 CONTINUE * ELSE IF( ITYPE.EQ.2 ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JB = JN-JA+1 * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 140 LL = JJ, JJ + JB -1 DO 130 KK = IIA, MIN(II+LL-JJ,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 130 CONTINUE IOFFA = IOFFA + LDA 140 CONTINUE ELSE DO 160 LL = JJ, JJ + JB -1 DO 150 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 150 CONTINUE IOFFA = IOFFA + LDA 160 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 210 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 180 LL = JJ, JJ + JB -1 DO 170 KK = IIA, MIN(II+LL-JJ,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 170 CONTINUE IOFFA = IOFFA + LDA 180 CONTINUE ELSE DO 200 LL = JJ, JJ + JB -1 DO 190 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 190 CONTINUE IOFFA = IOFFA + LDA 200 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 210 CONTINUE * ELSE IF( ITYPE.EQ.3 ) THEN * * Upper Hessenberg matrix * II = IIA JJ = JJA JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.ICURCOL ) THEN DO 230 LL = JJ, JJ+JB-1 DO 220 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 220 CONTINUE IOFFA = IOFFA + LDA 230 CONTINUE JJ = JJ + JB END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 260 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 250 LL = JJ, JJ+JB-1 DO 240 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 240 CONTINUE IOFFA = IOFFA + LDA 250 CONTINUE JJ = JJ + JB END IF * II = II + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) * 260 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( ICURROW+1, NPROW ) IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 280 LL = JJ, JJ + JB -1 DO 270 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 270 CONTINUE IOFFA = IOFFA + LDA 280 CONTINUE ELSE DO 300 LL = JJ, JJ + JB -1 DO 290 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 290 CONTINUE IOFFA = IOFFA + LDA 300 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+MP-1 ) $ A( II+(JJ+JB-2)*LDA ) = A( II+(JJ+JB-2)*LDA ) * MUL END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = INXTROW ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 350 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 320 LL = JJ, JJ + JB -1 DO 310 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 310 CONTINUE IOFFA = IOFFA + LDA 320 CONTINUE ELSE DO 340 LL = JJ, JJ + JB -1 DO 330 KK = IIA, MIN( II-1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 330 CONTINUE IOFFA = IOFFA + LDA 340 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+MP-1 ) $ A( II+(JJ+JB-2)*LDA ) = A( II+(JJ+JB-2)*LDA ) * $ MUL END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = INXTROW ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 350 CONTINUE * END IF * END IF * IF( .NOT.DONE ) $ GO TO 10 * RETURN * * End of PSLASCL * END scalapack-2.0.2/SRC/pslase2.f000644 000766 000024 00000037352 10363532303 016070 0ustar00juliestaff000000 000000 SUBROUTINE PSLASE2( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLASE2 initializes an M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1) to BETA on the diagonal and ALPHA on the * offdiagonals. PSLASE2 requires that only dimension of the matrix * operand is distributed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * set: * = 'U': Upper triangular part is set; the strictly lower * triangular part of sub( A ) is not changed; * = 'L': Lower triangular part is set; the strictly upper * triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * ALPHA (global input) REAL * The constant to which the offdiagonal elements are to be * set. * * BETA (global input) REAL * The constant to which the diagonal elements are to be set. * * A (local output) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ) * to be set. On exit, the leading M-by-N submatrix sub( A ) * is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER HEIGHT, IACOL, IAROW, IBASE, ICOFFA, II, IIA, $ IIBEG, IIEND, IINXT, ILEFT, IRIGHT, IROFFA, $ ITOP, JJ, JJA, JJBEG, JJEND, JJNXT, LDA, MBA, $ MP, MPA, MYCOL, MYDIST, MYROW, NBA, NPCOL, $ NPROW, NQ, NQA, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SLASET * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, MBA ) ICOFFA = MOD( JA-1, NBA ) * IF( N.LE.( NBA-ICOFFA ) ) THEN * * It is assumed that the local columns JJA:JJA+N-1 of the matrix * A are in the same process column (IACOL). * * N * JJA JJA+N-1 * / --------------------- \ * IROFFA| | | | * \ |...................| | ( IAROW ) * IIA |x | | MB_A * | x | | * |--x----------------| / * | x | * | x | ITOP * | x | | * | x | /-------\ * |-------x-----------| |-------x-----------| * | x | | x | * | x | | x | * | x | | x | * | x | | x | * |------------x------| |------------x------| * | x | \____________/ * | x | | * | x | IBASE * | x | * |-----------------x-| Local picture * | x| * | | * | | * | | * |-------------------| * | | * . . * . . * . (IACOL) . * IF( MYCOL.EQ.IACOL ) THEN * MPA = NUMROC( M+IROFFA, MBA, MYROW, IAROW, NPROW ) IF( MPA.LE.0 ) $ RETURN IF( MYROW.EQ.IAROW ) $ MPA = MPA - IROFFA MYDIST = MOD( MYROW-IAROW+NPROW, NPROW ) ITOP = MYDIST * MBA - IROFFA * IF( LSAME( UPLO, 'U' ) ) THEN * ITOP = MAX( 0, ITOP ) IIBEG = IIA IIEND = IIA + MPA - 1 IINXT = MIN( ICEIL( IIBEG, MBA ) * MBA, IIEND ) * 10 CONTINUE IF( ( N-ITOP ).GT.0 ) THEN CALL SLASET( UPLO, IINXT-IIBEG+1, N-ITOP, ALPHA, BETA, $ A( IIBEG+(JJA+ITOP-1)*LDA ), LDA ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IIBEG = IINXT +1 IINXT = MIN( IINXT+MBA, IIEND ) GO TO 10 END IF * ELSE IF( LSAME( UPLO, 'L' ) ) THEN * II = IIA JJ = JJA MP = MPA IBASE = MIN( ITOP+MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 20 CONTINUE IF( JJ.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL SLASET( 'All', MP, ITOP-JJ+JJA, ALPHA, ALPHA, $ A( II+(JJ-1)*LDA ), LDA ) CALL SLASET( UPLO, MP, HEIGHT, ALPHA, BETA, $ A( II+(JJA+ITOP-1)*LDA ), LDA ) MP = MAX( 0, MP - HEIGHT ) II = II + HEIGHT JJ = JJA + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 20 END IF * ELSE * II = IIA JJ = JJA MP = MPA IBASE = MIN( ITOP+MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 30 CONTINUE IF( JJ.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL SLASET( 'All', MPA, ITOP-JJ+JJA, ALPHA, ALPHA, $ A( IIA+(JJ-1)*LDA ), LDA ) CALL SLASET( 'All', MPA-MP, HEIGHT, ALPHA, ALPHA, $ A( IIA+(JJA+ITOP-1)*LDA ), LDA ) CALL SLASET( 'All', MP, HEIGHT, ALPHA, BETA, $ A( II+(JJA+ITOP-1)*LDA ), LDA ) MP = MAX( 0, MP - HEIGHT ) II = II + HEIGHT JJ = JJA + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 30 END IF * END IF * END IF * ELSE IF( M.LE.( MBA-IROFFA ) ) THEN * * It is assumed that the local rows IIA:IIA+M-1 of the matrix A * are in the same process row (IAROW). * * ICOFFA * / \JJA * IIA ------------------ .... -------- * | .x | | | / | | \ * | . x | | | ILEFT| | | | * | . x | | | | | | * | . x | | \ x | | * | . |x | | |x | | IRIGHT * | . | x | | | x | | * (IAROW) | . | x | | | x | | * | . | x| | | x| | * | . | x | | x / * | . | |x | | | * | . | | x | | | * | . | | x | | | * | . | | x| | | * IIA+M-1 ------------------ .... ------- * NB_A * (IACOL) Local picture * IF( MYROW.EQ.IAROW ) THEN * NQA = NUMROC( N+ICOFFA, NBA, MYCOL, IACOL, NPCOL ) IF( NQA.LE.0 ) $ RETURN IF( MYCOL.EQ.IACOL ) $ NQA = NQA - ICOFFA MYDIST = MOD( MYCOL-IACOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBA - ICOFFA * IF( LSAME( UPLO, 'L' ) ) THEN * ILEFT = MAX( 0, ILEFT ) JJBEG = JJA JJEND = JJA + NQA - 1 JJNXT = MIN( ICEIL( JJBEG, NBA ) * NBA, JJEND ) * 40 CONTINUE IF( ( M-ILEFT ).GT.0 ) THEN CALL SLASET( UPLO, M-ILEFT, JJNXT-JJBEG+1, ALPHA, $ BETA, A( IIA+ILEFT+(JJBEG-1)*LDA ), LDA ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBA, JJEND ) GO TO 40 END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * II = IIA JJ = JJA NQ = NQA IRIGHT = MIN( ILEFT+NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 50 CONTINUE IF( II.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL SLASET( 'All', ILEFT-II+IIA, NQ, ALPHA, ALPHA, $ A( II+(JJ-1)*LDA ), LDA ) CALL SLASET( UPLO, WIDE, NQ, ALPHA, BETA, $ A( IIA+ILEFT+(JJ-1)*LDA ), LDA ) NQ = MAX( 0, NQ - WIDE ) II = IIA + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 50 END IF * ELSE * II = IIA JJ = JJA NQ = NQA IRIGHT = MIN( ILEFT+NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 60 CONTINUE IF( II.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL SLASET( 'All', ILEFT-II+IIA, NQA, ALPHA, ALPHA, $ A( II+(JJA-1)*LDA ), LDA ) CALL SLASET( 'All', WIDE, NQA-NQ, ALPHA, ALPHA, $ A( IIA+ILEFT+(JJA-1)*LDA ), LDA ) CALL SLASET( 'All', WIDE, NQ, ALPHA, BETA, $ A( IIA+ILEFT+(JJ-1)*LDA ), LDA ) NQ = MAX( 0, NQ - WIDE ) II = IIA + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 60 END IF * END IF * END IF * END IF * RETURN * * End of PSLASE2 * END scalapack-2.0.2/SRC/pslaset.f000644 000766 000024 00000021667 10363532303 016174 0ustar00juliestaff000000 000000 SUBROUTINE PSLASET( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLASET initializes an M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1) to BETA on the diagonal and ALPHA on the * offdiagonals. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * set: * = 'U': Upper triangular part is set; the strictly lower * triangular part of sub( A ) is not changed; * = 'L': Lower triangular part is set; the strictly upper * triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * ALPHA (global input) REAL * The constant to which the offdiagonal elements are to be * set. * * BETA (global input) REAL * The constant to which the diagonal elements are to be set. * * A (local output) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ) * to be set. On exit, the leading M-by-N submatrix sub( A ) * is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, IAA, IBLK, IN, ITMP, J, JAA, JBLK, JN, JTMP * .. * .. External Subroutines .. EXTERNAL PSLASE2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( M.LE.( DESCA( MB_ ) - MOD( IA-1, DESCA( MB_ ) ) ) .OR. $ N.LE.( DESCA( NB_ ) - MOD( JA-1, DESCA( NB_ ) ) ) ) THEN CALL PSLASE2( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) ELSE * IF( LSAME( UPLO, 'U' ) ) THEN IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) CALL PSLASE2( UPLO, IN-IA+1, N, ALPHA, BETA, A, IA, JA, $ DESCA ) DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) JAA = JA + ITMP CALL PSLASE2( UPLO, IBLK, N-ITMP, ALPHA, BETA, $ A, I, JAA, DESCA ) 10 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) CALL PSLASE2( UPLO, M, JN-JA+1, ALPHA, BETA, A, IA, JA, $ DESCA ) DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) IAA = IA + JTMP CALL PSLASE2( UPLO, M-JTMP, JBLK, ALPHA, BETA, A, IAA, $ J, DESCA ) 20 CONTINUE ELSE IF( M.LE.N ) THEN IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), $ IA+M-1 ) CALL PSLASE2( UPLO, IN-IA+1, N, ALPHA, BETA, A, IA, $ JA, DESCA ) DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) CALL PSLASE2( UPLO, IBLK, I-IA, ALPHA, ALPHA, A, I, $ JA, DESCA ) CALL PSLASE2( UPLO, IBLK, N-I+IA, ALPHA, BETA, A, I, $ JA+I-IA, DESCA ) 30 CONTINUE ELSE JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), $ JA+N-1 ) CALL PSLASE2( UPLO, M, JN-JA+1, ALPHA, BETA, A, IA, $ JA, DESCA ) DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) CALL PSLASE2( UPLO, J-JA, JBLK, ALPHA, ALPHA, A, IA, $ J, DESCA ) CALL PSLASE2( UPLO, M-J+JA, JBLK, ALPHA, BETA, A, $ IA+J-JA, J, DESCA ) 40 CONTINUE END IF END IF * END IF * RETURN * * End of PSLASET * END scalapack-2.0.2/SRC/pslasmsub.f000644 000766 000024 00000031403 11654534541 016531 0ustar00juliestaff000000 000000 SUBROUTINE PSLASMSUB( A, DESCA, I, L, K, SMLNUM, BUF, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER I, K, L, LWORK REAL SMLNUM * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), BUF( * ) * .. * * Purpose * ======= * * PSLASMSUB looks for a small subdiagonal element from the bottom * of the matrix that it can safely set to zero. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) REAL array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * K (global output) INTEGER * On exit, this yields the bottom portion of the unreduced * submatrix. This will satisfy: L <= M <= I-1. * * SMLNUM (global input) REAL * On entry, a "small number" for the given matrix. * Unchanged on exit. * * BUF (local output) REAL array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 2*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * This code is basically a parallelization of the following snip * of LAPACK code from SLAHQR: * * Look for a single small subdiagonal element. * * DO 20 K = I, L + 1, -1 * TST1 = ABS( H( K-1, K-1 ) ) + ABS( H( K, K ) ) * IF( TST1.EQ.ZERO ) * $ TST1 = DLANHS( '1', I-L+1, H( L, L ), LDH, WORK ) * IF( ABS( H( K, K-1 ) ).LE.MAX( ULP*TST1, SMLNUM ) ) * $ GO TO 30 * 20 CONTINUE * 30 CONTINUE * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IAFIRST, IBUF1, IBUF2, $ ICOL1, ICOL2, II, III, IRCV1, IRCV2, IROW1, $ IROW2, ISRC, ISTR1, ISTR2, ITMP1, ITMP2, $ JAFIRST, JJ, JJJ, JSRC, LDA, LEFT, MODKM1, $ MYCOL, MYROW, NPCOL, NPROW, NUM, RIGHT, UP REAL H10, H11, H22, TST1, ULP * .. * .. External Functions .. INTEGER ILCM, NUMROC REAL PSLAMCH EXTERNAL ILCM, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGERV2D, SGESD2D, IGAMX2D, $ INFOG1L, INFOG2L * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) ULP = PSLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 STARTS AT BUF(ISTR1+1) AND WILL CONTAINS IBUF1 ELEMENTS * BUFFER2 STARTS AT BUF(ISTR2+1) AND WILL CONTAINS IBUF2 ELEMENTS * ISTR1 = 0 ISTR2 = ( ( I-L ) / HBL ) IF( ISTR2*HBL.LT.( I-L ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.2*ISTR2 ) THEN * * Error! * RETURN END IF CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * COPY OUR RELEVANT PIECES OF TRIADIAGONAL THAT WE OWE INTO * 2 BUFFERS TO SEND TO WHOMEVER OWNS H(K,K) AS K MOVES DIAGONALLY * UP THE TRIDIAGONAL * IBUF1 = 0 IBUF2 = 0 IRCV1 = 0 IRCV2 = 0 DO 10 K = I, L + 1, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K-1,K-1) AND SEND IT DIAGONAL DOWN * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K ,K-1) AND SEND IT RIGHT * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( K, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * ADD UP THE RECEIVES * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT. $ 1 ) ) ) THEN * * WE MUST RECEIVE H(K-1,K-1) FROM DIAGONAL UP * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) ) THEN * * WE MUST RECEIVE H(K ,K-1) FROM LEFT * IRCV2 = IRCV2 + 1 END IF END IF * * POSSIBLY CHANGE OWNERS (OCCURS ONLY WHEN MOD(K-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * SEND DATA ON TO THE APPROPRIATE NODE IF THERE IS ANY DATA TO SEND * IF( IBUF1.GT.0 ) THEN CALL SGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL SGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, MYROW, $ RIGHT ) END IF * * RECEIVE APPROPRIATE DATA IF THERE IS ANY * IF( IRCV1.GT.0 ) THEN CALL SGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL SGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, MYROW, $ LEFT ) END IF * * START MAIN LOOP * IBUF1 = 0 IBUF2 = 0 CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * LOOK FOR A SINGLE SMALL SUBDIAGONAL ELEMENT. * * Start loop for subdiagonal search * DO 40 K = I, L + 1, -1 IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN * * Grab information from WORK array * IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H11 = BUF( ISTR1+IBUF1 ) ELSE H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF2 = IBUF2 + 1 H10 = BUF( ISTR2+IBUF2 ) ELSE H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF ELSE * * Information is local * H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF H22 = A( ( ICOL1-1 )*LDA+IROW1 ) TST1 = ABS( H11 ) + ABS( H22 ) IF( TST1.EQ.ZERO ) THEN * * FIND SOME NORM OF THE LOCAL H(L:I,L:I) * CALL INFOG1L( L, HBL, NPROW, MYROW, IAFIRST, ITMP1, III ) IROW2 = NUMROC( I, HBL, MYROW, IAFIRST, NPROW ) CALL INFOG1L( L, HBL, NPCOL, MYCOL, JAFIRST, ITMP2, III ) ICOL2 = NUMROC( I, HBL, MYCOL, JAFIRST, NPCOL ) DO 30 III = ITMP1, IROW2 DO 20 JJJ = ITMP2, ICOL2 TST1 = TST1 + ABS( A( ( JJJ-1 )*LDA+III ) ) 20 CONTINUE 30 CONTINUE END IF IF( ABS( H10 ).LE.MAX( ULP*TST1, SMLNUM ) ) $ GO TO 50 IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 IF( ( MODKM1.EQ.HBL-1 ) .AND. ( K.GT.2 ) ) THEN II = MOD( II+NPROW-1, NPROW ) JJ = MOD( JJ+NPCOL-1, NPCOL ) CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ITMP1, ITMP2 ) END IF 40 CONTINUE 50 CONTINUE CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, K, 1, ITMP1, ITMP2, -1, $ -1, -1 ) RETURN * * End of PSLASMSUB * END scalapack-2.0.2/SRC/pslasrt.f000644 000766 000024 00000020657 11750130340 016203 0ustar00juliestaff000000 000000 SUBROUTINE PSLASRT( ID, N, D, Q, IQ, JQ, DESCQ, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER ID INTEGER INFO, IQ, JQ, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCQ( * ), IWORK( * ) REAL D( * ), Q( * ), WORK( * ) * .. * * Purpose * ======= * * PSLASRT Sort the numbers in D in increasing order and the * corresponding vectors in Q. * * Arguments * ========= * * ID (global input) CHARACTER*1 * = 'I': sort D in increasing order; * = 'D': sort D in decreasing order. (NOT IMPLEMENTED YET) * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( Q ). N >= 0. * * D (global input/output) REAL array, dimmension (N) * On exit, the number in D are sorted in increasing order. * * Q (local input) REAL pointer into the local memory * to an array of dimension (LLD_Q, LOCc(JQ+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IQ (global input) INTEGER * The row index in the global array A indicating the first * row of sub( Q ). * * JQ (global input) INTEGER * The column index in the global array A indicating the * first column of sub( Q ). * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK = MAX( N, NP * ( NB + NQ )) * where * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * NQ = NUMROC( N, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK = N + 2*NB + 2*NPCOL * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CL, COL, DUMMY, I, ICTXT, IID, IIQ, INDCOL, $ INDX, INDXC, INDXG, IPQ, IPQ2, IPW, IPWORK, J, $ JJQ, K, L, LDQ, LEND, LIWMIN, LWMIN, MYCOL, $ MYROW, NB, ND, NP, NPCOL, NPROW, NQ, PSQ, QCOL, $ QTOT, SBUF * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PXERBLA, SCOPY, $ SGERV2D, SGESD2D, SLAMOV, SLAPST * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * IF( N.EQ.0 ) $ RETURN * CALL BLACS_GRIDINFO( DESCQ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 1, N, 1, IQ, JQ, DESCQ, 6, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCQ( NB_ ) LDQ = DESCQ( LLD_ ) NP = NUMROC( N, NB, MYROW, DESCQ( RSRC_ ), NPROW ) NQ = NUMROC( N, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) LWMIN = MAX( N, NP*( NB+NQ ) ) LIWMIN = N + 2*( NB+NPCOL ) IF( .NOT.LSAME( ID, 'I' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -9 ELSE IF( LIWORK.LT.LIWMIN ) THEN INFO = -11 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSLASRT', -INFO ) RETURN END IF * * Set Pointers * INDXC = 1 INDX = INDXC + N INDXG = INDX INDCOL = INDXG + NB QTOT = INDCOL + NB PSQ = QTOT + NPCOL * IID = 1 IPQ2 = 1 IPW = IPQ2 + NP*NQ * DUMMY = 0 IIQ = INDXG2L( IQ, NB, DUMMY, DUMMY, NPROW ) * * Sort the eigenvalues in D * CALL SLAPST( 'I', N, D, IWORK( INDX ), INFO ) * DO 10 L = 0, N - 1 WORK( IID+L ) = D( IWORK( INDX+L ) ) IWORK( INDXC-1+IWORK( INDX+L ) ) = IID + L 10 CONTINUE CALL SCOPY( N, WORK, 1, D, 1 ) * ND = 0 20 CONTINUE IF( ND.LT.N ) THEN LEND = MIN( NB, N-ND ) J = JQ + ND QCOL = INDXG2P( J, NB, DUMMY, DESCQ( CSRC_ ), NPCOL ) K = 0 DO 30 L = 0, LEND - 1 I = JQ - 1 + IWORK( INDXC+ND+L ) CL = INDXG2P( I, NB, DUMMY, DESCQ( CSRC_ ), NPCOL ) IWORK( INDCOL+L ) = CL IF( MYCOL.EQ.CL ) THEN IWORK( INDXG+K ) = IWORK( INDXC+ND+L ) K = K + 1 END IF 30 CONTINUE * IF( MYCOL.EQ.QCOL ) THEN DO 40 CL = 0, NPCOL - 1 IWORK( QTOT+CL ) = 0 40 CONTINUE DO 50 L = 0, LEND - 1 IWORK( QTOT+IWORK( INDCOL+L ) ) = IWORK( QTOT+ $ IWORK( INDCOL+L ) ) + 1 50 CONTINUE IWORK( PSQ ) = 1 DO 60 CL = 1, NPCOL - 1 IWORK( PSQ+CL ) = IWORK( PSQ+CL-1 ) + IWORK( QTOT+CL-1 ) 60 CONTINUE DO 70 L = 0, LEND - 1 CL = IWORK( INDCOL+L ) I = JQ + ND + L JJQ = INDXG2L( I, NB, DUMMY, DUMMY, NPCOL ) IPQ = IIQ + ( JJQ-1 )*LDQ IPWORK = IPW + ( IWORK( PSQ+CL )-1 )*NP CALL SCOPY( NP, Q( IPQ ), 1, WORK( IPWORK ), 1 ) IWORK( PSQ+CL ) = IWORK( PSQ+CL ) + 1 70 CONTINUE IWORK( PSQ ) = 1 DO 80 CL = 1, NPCOL - 1 IWORK( PSQ+CL ) = IWORK( PSQ+CL-1 ) + IWORK( QTOT+CL-1 ) 80 CONTINUE DO 90 L = 0, K - 1 I = IWORK( INDXG+L ) JJQ = INDXG2L( I, NB, DUMMY, DUMMY, NPCOL ) IPQ = IPQ2 + ( JJQ-1 )*NP IPWORK = IPW + ( IWORK( PSQ+MYCOL )-1 )*NP CALL SCOPY( NP, WORK( IPWORK ), 1, WORK( IPQ ), 1 ) IWORK( PSQ+MYCOL ) = IWORK( PSQ+MYCOL ) + 1 90 CONTINUE DO 100 CL = 1, NPCOL - 1 COL = MOD( MYCOL+CL, NPCOL ) SBUF = IWORK( QTOT+COL ) IF( SBUF.NE.0 ) THEN IPWORK = IPW + ( IWORK( PSQ+COL )-1 )*NP CALL SGESD2D( DESCQ( CTXT_ ), NP, SBUF, $ WORK( IPWORK ), NP, MYROW, COL ) END IF 100 CONTINUE * ELSE * IF( K.NE.0 ) THEN CALL SGERV2D( DESCQ( CTXT_ ), NP, K, WORK( IPW ), NP, $ MYROW, QCOL ) DO 110 L = 0, K - 1 I = JQ - 1 + IWORK( INDXG+L ) JJQ = INDXG2L( I, NB, DUMMY, DUMMY, NPCOL ) IPQ = 1 + ( JJQ-1 )*NP IPWORK = IPW + L*NP CALL SCOPY( NP, WORK( IPWORK ), 1, WORK( IPQ ), 1 ) 110 CONTINUE END IF END IF ND = ND + NB GO TO 20 END IF CALL SLAMOV( 'Full', NP, NQ, WORK, NP, Q( IIQ ), LDQ ) * * End of PSLASRT * END scalapack-2.0.2/SRC/pslassq.f000644 000766 000024 00000022377 10363532303 016206 0ustar00juliestaff000000 000000 SUBROUTINE PSLASSQ( N, X, IX, JX, DESCX, INCX, SCALE, SUMSQ ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N REAL SCALE, SUMSQ * .. * .. Array Arguments .. INTEGER DESCX( * ) REAL X( * ) * .. * * Purpose * ======= * * PSLASSQ returns the values scl and smsq such that * * ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, * * where x( i ) = sub( X ) = X( IX+(JX-1)*DESCX(M_)+(i-1)*INCX ). * The value of sumsq is assumed to be non-negative and scl returns the * value * * scl = max( scale, abs( x( i ) ) ). * * scale and sumsq must be supplied in SCALE and SUMSQ respectively. * SCALE and SUMSQ are overwritten by scl and ssq respectively. * * The routine makes only one pass through the vector sub( X ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * The result are only available in the scope of sub( X ), i.e if * sub( X ) is distributed along a process row, the correct results are * only available in this process row of the grid. Similarly if sub( X ) * is distributed along a process column, the correct results are only * available in this process column of the grid. * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vector sub( X ). * * X (input) REAL * The vector for which a scaled sum of squares is computed. * x( i ) = X(IX+(JX-1)*M_X +(i-1)*INCX ), 1 <= i <= n. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * SCALE (local input/local output) REAL * On entry, the value scale in the equation above. * On exit, SCALE is overwritten with scl , the scaling factor * for the sum of squares. * * SUMSQ (local input/local output) REAL * On entry, the value sumsq in the equation above. * On exit, SUMSQ is overwritten with smsq , the basic sum of * squares from which scl has been factored out. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, ICOFF, ICTXT, IIX, IOFF, IROFF, IXCOL, $ IXROW, JJX, LDX, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ REAL TEMP1 * .. * .. Local Arrays .. REAL WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSTREECOMB, SCOMBSSQ * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) * LDX = DESCX( LLD_ ) IF( INCX.EQ.DESCX( M_ ) ) THEN * * X is rowwise distributed. * IF( MYROW.NE.IXROW ) $ RETURN ICOFF = MOD( JX, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ - ICOFF * * Code direct from LAPACK's SLASSQ, (save subroutine call) * IF( NQ.GT.0 ) THEN IOFF = IIX + ( JJX - 1 ) * LDX DO 10 I = 1, NQ IF( X( IOFF ).NE.ZERO ) THEN TEMP1 = ABS( X( IOFF ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ * ( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IOFF = IOFF + LDX 10 CONTINUE END IF * * Take local result and find global * WORK( 1 ) = SCALE WORK( 2 ) = SUMSQ * CALL PSTREECOMB( ICTXT, 'Rowwise', 2, WORK, -1, IXCOL, $ SCOMBSSQ ) * SCALE = WORK( 1 ) SUMSQ = WORK( 2 ) * ELSE IF( INCX.EQ.1 ) THEN * * X is columnwise distributed. * IF( MYCOL.NE.IXCOL ) $ RETURN IROFF = MOD( IX, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFF * * Code direct from LAPACK's SLASSQ, (save subroutine call) * IF( NP.GT.0 ) THEN IOFF = IIX + ( JJX - 1 ) * LDX DO 20 I = 1, NP IF( X( IOFF ).NE.ZERO ) THEN TEMP1 = ABS( X( IOFF ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ*( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IOFF = IOFF + 1 20 CONTINUE END IF * * Take local result and find global * WORK( 1 ) = SCALE WORK( 2 ) = SUMSQ * CALL PSTREECOMB( ICTXT, 'Columnwise', 2, WORK, -1, IXCOL, $ SCOMBSSQ ) * SCALE = WORK( 1 ) SUMSQ = WORK( 2 ) * END IF * RETURN * * End of PSLASSQ * END scalapack-2.0.2/SRC/pslaswp.f000644 000766 000024 00000020334 10363532303 016200 0ustar00juliestaff000000 000000 SUBROUTINE PSLASWP( DIREC, ROWCOL, N, A, IA, JA, DESCA, K1, K2, $ IPIV ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIREC, ROWCOL INTEGER IA, JA, K1, K2, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) REAL A( * ) * .. * * Purpose: * ======== * * PSLASWP performs a series of row or column interchanges on * the distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1). One * interchange is initiated for each of rows or columns K1 trough K2 of * sub( A ). This routine assumes that the pivoting information has * already been broadcast along the process row or column. * Also note that this routine will only work for K1-K2 being in the * same MB (or NB) block. If you want to pivot a full matrix, use * PSLAPIV. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIREC (global input) CHARACTER * Specifies in which order the permutation is applied: * = 'F' (Forward) * = 'B' (Backward) * * ROWCOL (global input) CHARACTER * Specifies if the rows or columns are permuted: * = 'R' (Rows) * = 'C' (Columns) * * N (global input) INTEGER * If ROWCOL = 'R', the length of the rows of the distributed * matrix A(*,JA:JA+N-1) to be permuted; * If ROWCOL = 'C', the length of the columns of the distributed * matrix A(IA:IA+N-1,*) to be permuted. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, * ). * On entry, this array contains the local pieces of the distri- * buted matrix to which the row/columns interchanges will be * applied. On exit the permuted distributed matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * K1 (global input) INTEGER * The first element of IPIV for which a row or column inter- * change will be done. * * K2 (global input) INTEGER * The last element of IPIV for which a row or column inter- * change will be done. * * IPIV (local input) INTEGER array, dimension LOCr(M_A)+MB_A for * row pivoting and LOCc(N_A)+NB_A for column pivoting. This * array is tied to the matrix A, IPIV(K) = L implies rows * (or columns) K and L are to be interchanged. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICURCOL, ICURROW, IIA, IP, J, JJA, JP, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PSSWAP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( ROWCOL, 'R' ) ) THEN IF( LSAME( DIREC, 'F' ) ) THEN CALL INFOG2L( K1, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 10 I = K1, K2 IP = IPIV( IIA+I-K1 ) IF( IP.NE.I ) $ CALL PSSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP, $ JA, DESCA, DESCA( M_ ) ) 10 CONTINUE ELSE CALL INFOG2L( K2, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 20 I = K2, K1, -1 IP = IPIV( IIA+I-K1 ) IF( IP.NE.I ) $ CALL PSSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP, $ JA, DESCA, DESCA( M_ ) ) 20 CONTINUE END IF ELSE IF( LSAME( DIREC, 'F' ) ) THEN CALL INFOG2L( IA, K1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 30 J = K1, K2 JP = IPIV( JJA+J-K1 ) IF( JP.NE.J ) $ CALL PSSWAP( N, A, IA, J, DESCA, 1, A, IA, JP, $ DESCA, 1 ) 30 CONTINUE ELSE CALL INFOG2L( IA, K2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 40 J = K2, K1, -1 JP = IPIV( JJA+J-K1 ) IF( JP.NE.J ) $ CALL PSSWAP( N, A, IA, J, DESCA, 1, A, IA, JP, $ DESCA, 1 ) 40 CONTINUE END IF END IF * RETURN * * End PSLASWP * END scalapack-2.0.2/SRC/pslatra.f000644 000766 000024 00000015442 10363532303 016161 0ustar00juliestaff000000 000000 REAL FUNCTION PSLATRA( N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLATRA computes the trace of an N-by-N distributed matrix sub( A ) * denoting A( IA:IA+N-1, JA:JA+N-1 ). The result is left on every * process of the grid. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix the trace * is to be computed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ==================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER ICURCOL, ICURROW, II, IOFFA, J, JB, JJ, JN, $ LDA, LL, MYCOL, MYROW, NPCOL, NPROW REAL TRACE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SGSUM2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * TRACE = ZERO IF( N.EQ.0 ) THEN PSLATRA = TRACE RETURN END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ ICURROW, ICURCOL ) * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 LDA = DESCA( LLD_ ) IOFFA = II + ( JJ - 1 ) * LDA * * Handle first diagonal block separately * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 10 LL = IOFFA, IOFFA + (JB-1)*(LDA+1), LDA+1 TRACE = TRACE + A( LL ) 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JB IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JB*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over the remaining block of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 20 LL = IOFFA, IOFFA + (JB-1)*(LDA+1), LDA+1 TRACE = TRACE + A( LL ) 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JB IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JB*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE * CALL SGSUM2D( DESCA( CTXT_ ), 'All', ' ', 1, 1, TRACE, 1, -1, $ MYCOL ) * PSLATRA = TRACE * RETURN * * End of PSLATRA * END scalapack-2.0.2/SRC/pslatrd.f000644 000766 000024 00000041511 10363532303 016160 0ustar00juliestaff000000 000000 SUBROUTINE PSLATRD( UPLO, N, NB, A, IA, JA, DESCA, D, E, TAU, W, $ IW, JW, DESCW, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IW, JA, JW, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCW( * ) REAL A( * ), D( * ), E( * ), TAU( * ), W( * ), $ WORK( * ) * .. * * Purpose * ======= * * PSLATRD reduces NB rows and columns of a real symmetric distributed * matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) to symmetric tridiagonal * form by an orthogonal similarity transformation Q' * sub( A ) * Q, * and returns the matrices V and W which are needed to apply the * transformation to the unreduced part of sub( A ). * * If UPLO = 'U', PSLATRD reduces the last NB rows and columns of a * matrix, of which the upper triangle is supplied; * if UPLO = 'L', PSLATRD reduces the first NB rows and columns of a * matrix, of which the lower triangle is supplied. * * This is an auxiliary routine called by PSSYTRD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NB (global input) INTEGER * The number of rows and columns to be reduced. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * symmetric distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. * On exit, if UPLO = 'U', the last NB columns have been reduced * to tridiagonal form, with the diagonal elements overwriting * the diagonal elements of sub( A ); the elements above the * diagonal with the array TAU, represent the orthogonal matrix * Q as a product of elementary reflectors. If UPLO = 'L', the * first NB columns have been reduced to tridiagonal form, with * the diagonal elements overwriting the diagonal elements of * sub( A ); the elements below the diagonal with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors; See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * W (local output) REAL pointer into the local memory * to an array of dimension (LLD_W,NB_W), This array contains * the local pieces of the N-by-NB_W matrix W required to * update the unreduced part of sub( A ). * * IW (global input) INTEGER * The row index in the global array W indicating the first * row of sub( W ). * * JW (global input) INTEGER * The column index in the global array W indicating the * first column of sub( W ). * * DESCW (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix W. * * WORK (local workspace) REAL array, dimension (NB_A) * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n) H(n-1) . . . H(n-nb+1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(nb). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The elements of the vectors v together form the N-by-NB matrix V * which is needed, with W, to apply the transformation to the unreduced * part of the matrix, using a symmetric rank-2k update of the form: * sub( A ) := sub( A ) - V*W' - W*V'. * * The contents of A on exit are illustrated by the following examples * with n = 5 and nb = 2: * * if UPLO = 'U': if UPLO = 'L': * * ( a a a v4 v5 ) ( d ) * ( a a v4 v5 ) ( 1 d ) * ( a 1 v5 ) ( v1 1 a ) * ( d 1 ) ( v1 v2 a a ) * ( d ) ( v1 v2 a a a ) * * where d denotes a diagonal element of the reduced matrix, a denotes * an element of the original matrix that is unchanged, and vi denotes * an element of the vector defining H(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL HALF, ONE, ZERO PARAMETER ( HALF = 0.5E+0, ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, J, JJ, JP, JWK, K, $ KW, MYCOL, MYROW, NPCOL, NPROW, NQ REAL ALPHA * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCWK( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PSAXPY, $ PSDOT, PSELGET, PSELSET, PSGEMV, $ PSLARFG, PSSCAL, PSSYMV, SGEBR2D, $ SGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NQ = MAX( 1, NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) CALL DESCSET( DESCD, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * IF( LSAME( UPLO, 'U' ) ) THEN * CALL INFOG2L( N+IA-NB, N+JA-NB, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, IAROW, IACOL ) CALL DESCSET( DESCWK, 1, DESCW( NB_ ), 1, DESCW( NB_ ), IAROW, $ IACOL, ICTXT, 1 ) CALL DESCSET( DESCE, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * * Reduce last NB columns of upper triangle * DO 10 J = JA+N-1, JA+N-NB, -1 I = IA + J - JA K = J - JA + 1 KW = MOD( K-1, DESCA( MB_ ) ) + 1 * * Update A(IA:I,I) * CALL PSGEMV( 'No transpose', K, N-K, -ONE, A, IA, J+1, $ DESCA, W, IW+K-1, JW+KW, DESCW, DESCW( M_ ), $ ONE, A, IA, J, DESCA, 1 ) CALL PSGEMV( 'No transpose', K, N-K, -ONE, W, IW, JW+KW, $ DESCW, A, I, J+1, DESCA, DESCA( M_ ), ONE, A, $ IA, J, DESCA, 1 ) IF( N-K.GT.0 ) $ CALL PSELSET( A, I, J+1, DESCA, E( JP ) ) * * Generate elementary reflector H(i) to annihilate * A(IA:I-2,I) * JP = MIN( JJ+KW-1, NQ ) CALL PSLARFG( K-1, E( JP ), I-1, J, A, IA, J, DESCA, 1, $ TAU ) CALL PSELSET( A, I-1, J, DESCA, ONE ) * * Compute W(IW:IW+K-2,JW+KW-1) * CALL PSSYMV( 'Upper', K-1, ONE, A, IA, JA, DESCA, A, IA, J, $ DESCA, 1, ZERO, W, IW, JW+KW-1, DESCW, 1 ) * JWK = MOD( K-1, DESCWK( NB_ ) ) + 2 CALL PSGEMV( 'Transpose', K-1, N-K, ONE, W, IW, JW+KW, $ DESCW, A, IA, J, DESCA, 1, ZERO, WORK, 1, JWK, $ DESCWK, DESCWK( M_ ) ) CALL PSGEMV( 'No transpose', K-1, N-K, -ONE, A, IA, J+1, $ DESCA, WORK, 1, JWK, DESCWK, DESCWK( M_ ), ONE, $ W, IW, JW+KW-1, DESCW, 1 ) CALL PSGEMV( 'Transpose', K-1, N-K, ONE, A, IA, J+1, DESCA, $ A, IA, J, DESCA, 1, ZERO, WORK, 1, JWK, DESCWK, $ DESCWK( M_ ) ) CALL PSGEMV( 'No transpose', K-1, N-K, -ONE, W, IW, JW+KW, $ DESCW, WORK, 1, JWK, DESCWK, DESCWK( M_ ), ONE, $ W, IW, JW+KW-1, DESCW, 1 ) CALL PSSCAL( K-1, TAU( JP ), W, IW, JW+KW-1, DESCW, 1 ) * CALL PSDOT( K-1, ALPHA, W, IW, JW+KW-1, DESCW, 1, A, IA, J, $ DESCA, 1 ) IF( MYCOL.EQ.IACOL ) $ ALPHA = -HALF*TAU( JP )*ALPHA CALL PSAXPY( K-1, ALPHA, A, IA, J, DESCA, 1, W, IW, JW+KW-1, $ DESCW, 1 ) IF( MYCOL.EQ.IACOL ) THEN CALL PSELGET( 'E', ' ', D( JP ), A, I, J, DESCA ) END IF * 10 CONTINUE * ELSE * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL DESCSET( DESCWK, 1, DESCW( NB_ ), 1, DESCW( NB_ ), IAROW, $ IACOL, ICTXT, 1 ) CALL DESCSET( DESCE, 1, JA+N-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * * Reduce first NB columns of lower triangle * DO 20 J = JA, JA+NB-1 I = IA + J - JA K = J - JA + 1 * * Update A(J:JA+N-1,J) * CALL PSGEMV( 'No transpose', N-K+1, K-1, -ONE, A, I, JA, $ DESCA, W, IW+K-1, JW, DESCW, DESCW( M_ ), ONE, $ A, I, J, DESCA, 1 ) CALL PSGEMV( 'No transpose', N-K+1, K-1, -ONE, W, IW+K-1, $ JW, DESCW, A, I, JA, DESCA, DESCA( M_ ), ONE, $ A, I, J, DESCA, 1 ) IF( K.GT.1 ) $ CALL PSELSET( A, I, J-1, DESCA, E( JP ) ) * * * Generate elementary reflector H(i) to annihilate * A(I+2:IA+N-1,I) * JP = MIN( JJ+K-1, NQ ) CALL PSLARFG( N-K, E( JP ), I+1, J, A, I+2, J, DESCA, 1, $ TAU ) CALL PSELSET( A, I+1, J, DESCA, ONE ) * * Compute W(IW+K:IW+N-1,JW+K-1) * CALL PSSYMV( 'Lower', N-K, ONE, A, I+1, J+1, DESCA, A, I+1, $ J, DESCA, 1, ZERO, W, IW+K, JW+K-1, DESCW, 1 ) * CALL PSGEMV( 'Transpose', N-K, K-1, ONE, W, IW+K, JW, DESCW, $ A, I+1, J, DESCA, 1, ZERO, WORK, 1, 1, DESCWK, $ DESCWK( M_ ) ) CALL PSGEMV( 'No transpose', N-K, K-1, -ONE, A, I+1, JA, $ DESCA, WORK, 1, 1, DESCWK, DESCWK( M_ ), ONE, W, $ IW+K, JW+K-1, DESCW, 1 ) CALL PSGEMV( 'Transpose', N-K, K-1, ONE, A, I+1, JA, DESCA, $ A, I+1, J, DESCA, 1, ZERO, WORK, 1, 1, DESCWK, $ DESCWK( M_ ) ) CALL PSGEMV( 'No transpose', N-K, K-1, -ONE, W, IW+K, JW, $ DESCW, WORK, 1, 1, DESCWK, DESCWK( M_ ), ONE, W, $ IW+K, JW+K-1, DESCW, 1 ) CALL PSSCAL( N-K, TAU( JP ), W, IW+K, JW+K-1, DESCW, 1 ) CALL PSDOT( N-K, ALPHA, W, IW+K, JW+K-1, DESCW, 1, A, I+1, $ J, DESCA, 1 ) IF( MYCOL.EQ.IACOL ) $ ALPHA = -HALF*TAU( JP )*ALPHA CALL PSAXPY( N-K, ALPHA, A, I+1, J, DESCA, 1, W, IW+K, $ JW+K-1, DESCW, 1 ) IF( MYCOL.EQ.IACOL ) THEN CALL PSELGET( 'E', ' ', D( JP ), A, I, J, DESCA ) END IF * 20 CONTINUE * END IF * * Broadcast columnwise the diagonal elements into D. * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, NB, D( JJ ), 1 ) ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, NB, D( JJ ), 1, $ IAROW, MYCOL ) END IF END IF * RETURN * * End of PSLATRD * END scalapack-2.0.2/SRC/pslatrs.f000644 000766 000024 00000005314 10363532303 016200 0ustar00juliestaff000000 000000 SUBROUTINE PSLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, IA, $ JA, DESCA, X, IX, JX, DESCX, SCALE, CNORM, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, NORMIN, TRANS, UPLO INTEGER IA, IX, JA, JX, N REAL SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) REAL A( * ), CNORM( * ), $ X( * ), WORK( * ) * .. * * Purpose * ======= * * PSLATRS solves a triangular system. This routine in unfinished * at this time, but will be part of the next release. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * * .. Local Scalars .. INTEGER ICTXT, IIX, IROFF, JJX, MYCOL, MYROW, NP, $ NPCOL, NPROW, LDX, IXCOL, IXROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGEBR2D, SGEBS2D, INFOG2L, $ PSTRSV * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * ***** NO SCALING ***** Call PSTRSV for all cases ***** * SCALE = ONE CALL PSTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, $ DESCX, 1 ) * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) LDX = DESCX( LLD_ ) IROFF = MOD( IX-1, DESCX(MB_) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IXCOL ) THEN CALL SGEBS2D( ICTXT, 'R', ' ', NP, 1, X( IIX+(JJX-1)*LDX ), $ LDX ) ELSE CALL SGEBR2D( ICTXT, 'R', ' ', NP, 1, X( IIX+(JJX-1)*LDX ), $ LDX, MYROW, IXCOL ) END IF * RETURN * * End of PSLATRS * END scalapack-2.0.2/SRC/pslatrz.f000644 000766 000024 00000021560 10363532303 016210 0ustar00juliestaff000000 000000 SUBROUTINE PSLATRZ( M, N, L, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, L, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSLATRZ reduces the M-by-N ( M<=N ) real upper trapezoidal matrix * sub( A ) = [ A(IA:IA+M-1,JA:JA+M-1) A(IA:IA+M-1,JA+N-L:JA+N-1) ] to * upper triangular form by means of orthogonal transformations. * * The upper trapezoidal matrix sub( A ) is factored as * * sub( A ) = ( R 0 ) * Z, * * where Z is an N-by-N orthogonal matrix and R is an M-by-M upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. L > 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the leading M-by-M * upper triangular part of sub( A ) contains the upper trian- * gular matrix R, and elements N-L+1 to N of the first M rows * of sub( A ), with the array TAU, represent the orthogonal * matrix Z as a product of M elementary reflectors. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace) REAL array, dimension (LWORK) * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Further Details * =============== * * The factorization is obtained by Householder's method. The kth * transformation matrix, Z( k ), which is used to introduce zeros into * the (m - k + 1)th row of sub( A ), is given in the form * * Z( k ) = ( I 0 ), * ( 0 T( k ) ) * * where * * T( k ) = I - tau*u( k )*u( k )', u( k ) = ( 1 ), * ( 0 ) * ( z( k ) ) * * tau is a scalar and z( k ) is an ( n - m ) element vector. * tau and z( k ) are chosen to annihilate the elements of the kth row * of sub( A ). * * The scalar tau is returned in the kth element of TAU and the vector * u( k ) in the kth row of sub( A ), such that the elements of z( k ) * are in a( k, m + 1 ), ..., a( k, n ). The elements of R are returned * in the upper triangular part of sub( A ). * * Z is given by * * Z = Z( 1 ) * Z( 2 ) * ... * Z( m ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IAROW, ICTXT, II, J, J1, MP, MYCOL, MYROW, $ NPCOL, NPROW REAL AII * .. * .. External Subroutines .. EXTERNAL INFOG1L, PSELSET, PSLARFG, PSLARZ * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * MP = NUMROC( IA+M-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) * IF( M.EQ.N ) THEN * CALL INFOG1L( IA, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ II, IAROW ) DO 10 I = II, MP TAU( I ) = ZERO 10 CONTINUE * ELSE * J1 = JA + N - L DO 20 I = IA+M-1, IA, -1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * [ A(i, j) A(i,j1:ja+n-1) ] * CALL PSLARFG( L+1, AII, I, J, A, I, J1, DESCA, DESCA( M_ ), $ TAU ) * * Apply H(i) to A(ia:i-1,j:ja+n-1) from the right * CALL PSLARZ( 'Right', I-IA, JA+N-J, L, A, I, J1, DESCA, $ DESCA( M_ ), TAU, A, IA, J, DESCA, WORK ) CALL PSELSET( A, I, J, DESCA, AII ) * 20 CONTINUE * END IF * RETURN * * End of PSLATRZ * END scalapack-2.0.2/SRC/pslauu2.f000644 000766 000024 00000017545 10363532303 016114 0ustar00juliestaff000000 000000 SUBROUTINE PSLAUU2( UPLO, N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLAUU2 computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in sub( A ). * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in sub( A ). * * This is the unblocked form of the algorithm, calling Level 2 BLAS. * No communication is performed by this routine, the matrix to operate * on should be strictly local to one process. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the triangular factor stored in the matrix * sub( A ) is upper or lower triangular: * = 'U': Upper triangular, * = 'L': Lower triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the order of the triangular factor U or L. N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor L or U. * On exit, if UPLO = 'U', the upper triangle of the distributed * matrix sub( A ) is overwritten with the upper triangle of the * product U * U'; if UPLO = 'L', the lower triangle of sub( A ) * is overwritten with the lower triangle of the product L' * L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICURR, IDIAG, IIA, IOFFA, JJA, $ LDA, MYCOL, MYROW, NA, NPCOL, NPROW REAL AII * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, SGEMV, SSCAL * .. * .. External Functions .. LOGICAL LSAME REAL SDOT EXTERNAL LSAME, SDOT * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Get grid parameters and compute local indexes * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * IF( LSAME( UPLO, 'U' ) ) THEN * * Compute the product U * U'. * DO 10 NA = N-1, 1, -1 AII = A( IDIAG ) ICURR = IDIAG + LDA A( IDIAG ) = AII*AII + SDOT( NA, A( ICURR ), LDA, $ A( ICURR ), LDA ) CALL SGEMV( 'No transpose', N-NA-1, NA, ONE, $ A( IOFFA+LDA ), LDA, A( ICURR ), LDA, AII, $ A( IOFFA ), 1 ) IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + LDA 10 CONTINUE AII = A( IDIAG ) CALL SSCAL( N, AII, A( IOFFA ), 1 ) * ELSE * * Compute the product L' * L. * DO 20 NA = 1, N-1 AII = A( IDIAG ) ICURR = IDIAG + 1 A(IDIAG) = AII*AII + SDOT( N-NA, A( ICURR ), 1, $ A( ICURR ), 1 ) CALL SGEMV( 'Transpose', N-NA, NA-1, ONE, A( IOFFA+1 ), $ LDA, A( ICURR ), 1, AII, A( IOFFA ), LDA ) IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + 1 20 CONTINUE AII = A( IDIAG ) CALL SSCAL( N, AII, A( IOFFA ), LDA ) * END IF * END IF * RETURN * * End of PSLAUU2 * END scalapack-2.0.2/SRC/pslauum.f000644 000766 000024 00000020634 10363532303 016200 0ustar00juliestaff000000 000000 SUBROUTINE PSLAUUM( UPLO, N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLAUUM computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in sub( A ). * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in sub( A ). * * This is the blocked form of the algorithm, calling Level 3 PBLAS. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the triangular factor stored in the * distributed matrix sub( A ) is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the triangular factor U or L. N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor L or U. * On exit, if UPLO = 'U', the upper triangle of the distributed * matrix sub( A ) is overwritten with the upper triangle of the * product U * U'; if UPLO = 'L', the lower triangle of sub( A ) * is overwritten with the lower triangle of the product L' * L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J, JB, JN * .. * .. External Subroutines .. EXTERNAL PSGEMM, PSLAUU2, PSTRMM, PSSYRK * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) IF( LSAME( UPLO, 'U' ) ) THEN * * Compute the product U * U'. * * Handle first block separately * JB = JN-JA+1 CALL PSLAUU2( 'Upper', JB, A, IA, JA, DESCA ) IF( JB.LE.N-1 ) THEN CALL PSSYRK( 'Upper', 'No transpose', JB, N-JB, ONE, A, IA, $ JA+JB, DESCA, ONE, A, IA, JA, DESCA ) END IF * * Loop over remaining block of columns * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA CALL PSTRMM( 'Right', 'Upper', 'Transpose', 'Non-unit', $ J-JA, JB, ONE, A, I, J, DESCA, A, IA, J, $ DESCA ) CALL PSLAUU2( 'Upper', JB, A, I, J, DESCA ) IF( J+JB.LE.JA+N-1 ) THEN CALL PSGEMM( 'No transpose', 'Transpose', J-JA, JB, $ N-J-JB+JA, ONE, A, IA, J+JB, DESCA, A, I, $ J+JB, DESCA, ONE, A, IA, J, DESCA ) CALL PSSYRK( 'Upper', 'No transpose', JB, N-J-JB+JA, ONE, $ A, I, J+JB, DESCA, ONE, A, I, J, DESCA ) END IF 10 CONTINUE ELSE * * Compute the product L' * L. * * Handle first block separately * JB = JN-JA+1 CALL PSLAUU2( 'Lower', JB, A, IA, JA, DESCA ) IF( JB.LE.N-1 ) THEN CALL PSSYRK( 'Lower', 'Transpose', JB, N-JB, ONE, A, IA+JB, $ JA, DESCA, ONE, A, IA, JA, DESCA ) END IF * * Loop over remaining block of columns * DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA CALL PSTRMM( 'Left', 'Lower', 'Transpose', 'Non-unit', JB, $ J-JA, ONE, A, I, J, DESCA, A, I, JA, DESCA ) CALL PSLAUU2( 'Lower', JB, A, I, J, DESCA ) IF( J+JB.LE.JA+N-1 ) THEN CALL PSGEMM( 'Transpose', 'No transpose', JB, J-JA, $ N-J-JB+JA, ONE, A, I+JB, J, DESCA, A, I+JB, $ JA, DESCA, ONE, A, I, JA, DESCA ) CALL PSSYRK( 'Lower', 'Transpose', JB, N-J-JB+JA, ONE, $ A, I+JB, J, DESCA, ONE, A, I, J, DESCA ) END IF 20 CONTINUE END IF * RETURN * * End of PSLAUUM * END scalapack-2.0.2/SRC/pslawil.f000644 000766 000024 00000023506 10363532303 016166 0ustar00juliestaff000000 000000 SUBROUTINE PSLAWIL( II, JJ, M, A, DESCA, H44, H33, H43H34, V ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER II, JJ, M REAL H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), V( * ) * .. * * Purpose * ======= * * PSLAWIL gets the transform given by H44,H33, & H43H34 into V * starting at row M. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * II (global input) INTEGER * Row owner of H(M+2,M+2) * * JJ (global input) INTEGER * Column owner of H(M+2,M+2) * * M (global input) INTEGER * On entry, this is where the transform starts (row M.) * Unchanged on exit. * * A (global input) REAL array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * Unchanged on exit. * * H44 * H33 * H43H34 (global input) REAL * These three values are for the double shift QR iteration. * Unchanged on exit. * * V (global output) REAL array of size 3. * Contains the transform on output. * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, ICOL, IROW, JSRC, LDA, LEFT, $ MODKM1, MYCOL, MYROW, NPCOL, NPROW, NUM, RIGHT, $ RSRC, UP REAL H11, H12, H21, H22, H33S, H44S, S, V1, V2, V3 * .. * .. Local Arrays .. REAL BUF( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGERV2D, SGESD2D, INFOG2L * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * On node (II,JJ) collect all DIA,SUP,SUB info from M, M+1 * MODKM1 = MOD( M+1, HBL ) IF( MODKM1.EQ.0 ) THEN IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) CALL SGESD2D( CONTXT, 1, 1, BUF, 1, II, JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) BUF( 2 ) = A( ( ICOL-1 )*LDA+IROW+1 ) BUF( 3 ) = A( ICOL*LDA+IROW ) BUF( 4 ) = A( ICOL*LDA+IROW+1 ) CALL SGESD2D( CONTXT, 4, 1, BUF, 4, II, JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NPCOL.GT.1 ) THEN CALL SGERV2D( CONTXT, 1, 1, V3, 1, MYROW, LEFT ) ELSE V3 = A( ( ICOL-2 )*LDA+IROW ) END IF IF( NUM.GT.1 ) THEN CALL SGERV2D( CONTXT, 4, 1, BUF, 4, UP, LEFT ) H11 = BUF( 1 ) H21 = BUF( 2 ) H12 = BUF( 3 ) H22 = BUF( 4 ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) END IF END IF END IF IF( MODKM1.EQ.1 ) THEN IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) CALL SGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. ( NPROW.GT.1 ) ) $ THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL SGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL SGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NUM.GT.1 ) THEN CALL SGERV2D( CONTXT, 1, 1, H11, 1, UP, LEFT ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) END IF IF( NPROW.GT.1 ) THEN CALL SGERV2D( CONTXT, 1, 1, H12, 1, UP, MYCOL ) ELSE H12 = A( ( ICOL-2 )*LDA+IROW-2 ) END IF IF( NPCOL.GT.1 ) THEN CALL SGERV2D( CONTXT, 1, 1, H21, 1, MYROW, LEFT ) ELSE H21 = A( ( ICOL-3 )*LDA+IROW-1 ) END IF H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF END IF IF( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) $ RETURN * IF( MODKM1.GT.1 ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF * H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 * RETURN * * End of PSLAWIL * END scalapack-2.0.2/SRC/psorg2l.f000644 000766 000024 00000025565 10363532303 016112 0ustar00juliestaff000000 000000 SUBROUTINE PSORG2L( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORG2L generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the last N columns of a product of K elementary reflectors of order M * * Q = H(k) . . . H(2) H(1) * * as returned by PSGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA+N-K <= j <= JA+N-1, * as returned by PSGEQLF in the K columns of its distributed * matrix argument A(IA:*,JA+N-K:JA+N-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PSGEQLF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MpA0 + MAX( 1, NqA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, J, JJ, LWMIN, MPA0, MYCOL, $ MYROW, NPCOL, NPROW, NQA0 REAL TAUJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PSELSET, $ PSLARF, PSLASET, PSSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MPA0 + MAX( 1, NQA0 ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORG2L', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Initialise columns ja:ja+n-k-1 to columns of the unit matrix * CALL PSLASET( 'All', M-N, N-K, ZERO, ZERO, A, IA, JA, DESCA ) CALL PSLASET( 'All', N, N-K, ZERO, ONE, A, IA+M-N, JA, DESCA ) * TAUJ = ZERO NQA0 = MAX( 1, NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) ) DO 10 J = JA+N-K, JA+N-1 * * Apply H(j) to A(ia:ia+m-n+j-ja,ja:j) from the left * CALL PSELSET( A, IA+M-N+J-JA, J, DESCA, ONE ) CALL PSLARF( 'Left', M-N+J-JA+1, J-JA, A, IA, J, DESCA, 1, TAU, $ A, IA, JA, DESCA, WORK ) * JJ = INDXG2L( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) IACOL = INDXG2P( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ TAUJ = TAU( MIN( JJ, NQA0 ) ) CALL PSSCAL( M-N+J-JA, -TAUJ, A, IA, J, DESCA, 1 ) CALL PSELSET( A, IA+M-N+J-JA, J, DESCA, ONE-TAUJ ) * * Set A(ia+m-n+j-ja+1:ia+m-1,j) to zero * CALL PSLASET( 'All', JA+N-1-J, 1, ZERO, ZERO, A, IA+M-N+J-JA+1, $ J, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORG2L * END scalapack-2.0.2/SRC/psorg2r.f000644 000766 000024 00000025634 10363532303 016115 0ustar00juliestaff000000 000000 SUBROUTINE PSORG2R( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORG2R generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the first N columns of a product of K elementary reflectors of order * M * * Q = H(1) H(2) . . . H(k) * * as returned by PSGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA <= j <= JA+K-1, as * returned by PSGEQRF in the K columns of its array * argument A(IA:*,JA:JA+K-1). On exit, this array contains * the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PSGEQRF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MpA0 + MAX( 1, NqA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, J, JJ, KQ, LWMIN, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 REAL TAUJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PSELSET, $ PSLARF, PSLASET, PSSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MPA0 + MAX( 1, NQA0 ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORG2R', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Initialise columns ja+k:ja+n-1 to columns of the unit matrix * CALL PSLASET( 'All', K, N-K, ZERO, ZERO, A, IA, JA+K, DESCA ) CALL PSLASET( 'All', M-K, N-K, ZERO, ONE, A, IA+K, JA+K, DESCA ) * TAUJ = ZERO KQ = MAX( 1, NUMROC( JA+K-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) DO 10 J = JA+K-1, JA, -1 * * Apply H(j) to A(ia+j-ja:ia+m-1,j:ja+n-1) from the left * IF( J.LT.JA+N-1 ) THEN CALL PSELSET( A, IA+J-JA, J, DESCA, ONE ) CALL PSLARF( 'Left', M-J+JA, JA+N-J-1, A, IA+J-JA, J, DESCA, $ 1, TAU, A, IA+J-JA, J+1, DESCA, WORK ) END IF * JJ = INDXG2L( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) IACOL = INDXG2P( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ TAUJ = TAU( MIN( JJ, KQ ) ) IF( J-JA.LT.M-1 ) $ CALL PSSCAL( M-J+JA-1, -TAUJ, A, IA+J-JA+1, J, DESCA, 1 ) CALL PSELSET( A, IA+J-JA, J, DESCA, ONE-TAUJ ) * * Set A(ia:ia+j-ja-1,j) to zero * CALL PSLASET( 'All', J-JA, 1, ZERO, ZERO, A, IA, J, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORG2R * END scalapack-2.0.2/SRC/psorgl2.f000644 000766 000024 00000026013 10363532303 016077 0ustar00juliestaff000000 000000 SUBROUTINE PSORGL2( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORGL2 generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as * the first M rows of a product of K elementary reflectors of order N * * Q = H(k) . . . H(2) H(1) * * as returned by PSGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA <= i <= IA+K-1, as * returned by PSGELQF in the K rows of its distributed matrix * argument A(IA:IA+K-1,JA:*). On exit, this array contains the * local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCr(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSGELQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NqA0 + MAX( 1, MpA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, II, J, KP, LWMIN, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 REAL TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PSELSET, $ PSLARF, PSLASET, PSSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQA0 + MAX( 1, MPA0 ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORGL2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * IF( K.LT.M ) THEN * * Initialise rows ia+k:ia+m-1 to rows of the unit matrix * CALL PSLASET( 'All', M-K, K, ZERO, ZERO, A, IA+K, JA, DESCA ) CALL PSLASET( 'All', M-K, N-K, ZERO, ONE, A, IA+K, JA+K, $ DESCA ) * END IF * TAUI = ZERO KP = NUMROC( IA+K-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) * DO 10 I = IA+K-1, IA, -1 * * Apply H(i) to A(i:ia+m-1,ja+i-ia:ja+n-1) from the right * J = JA + I - IA II = INDXG2L( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IAROW = INDXG2P( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IF( MYROW.EQ.IAROW ) $ TAUI = TAU( MIN( II, KP ) ) IF( J.LT.JA+N-1 ) THEN IF( I.LT.IA+M-1 ) THEN CALL PSELSET( A, I, J, DESCA, ONE ) CALL PSLARF( 'Right', M-I+IA-1, N-J+JA, A, I, J, DESCA, $ DESCA( M_ ), TAU, A, I+1, J, DESCA, WORK ) END IF CALL PSSCAL( N-J+JA-1, -TAUI, A, I, J+1, DESCA, $ DESCA( M_ ) ) END IF CALL PSELSET( A, I, J, DESCA, ONE-TAUI ) * * Set A(i,ja:j-1) to zero * CALL PSLASET( 'All', 1, J-JA, ZERO, ZERO, A, I, JA, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORGL2 * END scalapack-2.0.2/SRC/psorglq.f000644 000766 000024 00000030622 10363532303 016177 0ustar00juliestaff000000 000000 SUBROUTINE PSORGLQ( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORGLQ generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as * the first M rows of a product of K elementary reflectors of order N * * Q = H(k) . . . H(2) H(1) * * as returned by PSGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA <= i <= IA+K-1, as * returned by PSGELQF in the K rows of its distributed matrix * argument A(IA:IA+K-1,JA:*). On exit, this array contains the * local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCr(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSGELQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( MpA0 + NqA0 + MB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IL, IN, IPW, $ J, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSLARFB, $ PSLARFT, PSLASET, PSORGL2, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORGLQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IL = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * CALL PSLASET( 'All', IA+M-IL, IL-IA, ZERO, ZERO, A, IL, JA, $ DESCA ) * * Use unblocked code for the last or only block. * CALL PSORGL2( IA+M-IL, N-IL+IA, IA+K-IL, A, IL, JA+IL-IA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Is there at least one block of rows to loop over ? * IF( IL.GT.IN+1 ) THEN * * Use blocked code * DO 10 I = IL-DESCA( MB_ ), IN+1, -DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) J = JA + I - IA * IF( I+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PSLARFT( 'Forward', 'Rowwise', N-I+IA, IB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(i+ib:ia+m-1,j:ja+n-1) from the right * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', $ M-I-IB+IA, N-I+IA, IB, A, I, J, DESCA, $ WORK, A, I+IB, J, DESCA, WORK( IPW ) ) END IF * * Apply H' to columns j:ja+n-1 of current block * CALL PSORGL2( IB, N-I+IA, IB, A, I, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set columns ia:i-1 of current block to zero * CALL PSLASET( 'All', IB, I-IA, ZERO, ZERO, A, I, JA, DESCA ) 10 CONTINUE * END IF * * Handle first block separately * IF( IL.GT.IA ) THEN * IB = IN - IA + 1 * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PSLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia+ib:ia+m-1,ja:ja+n-1) from the right * CALL PSLARFB( 'Right', 'Transpose', 'Forward', 'Rowwise', M-IB, $ N, IB, A, IA, JA, DESCA, WORK, A, IA+IB, JA, $ DESCA, WORK( IPW ) ) * * Apply H' to columns ja:ja+n-1 of current block * CALL PSORGL2( IB, N, IB, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORGLQ * END scalapack-2.0.2/SRC/psorgql.f000644 000766 000024 00000026745 10363532303 016212 0ustar00juliestaff000000 000000 SUBROUTINE PSORGQL( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORGQL generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the last N columns of a product of K elementary reflectors of order M * * Q = H(k) . . . H(2) H(1) * * as returned by PSGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA+N-K <= j <= JA+N-1, * as returned by PSGEQLF in the K columns of its distributed * matrix argument A(IA:*,JA+N-K:JA+N-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PSGEQLF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( NqA0 + MpA0 + NB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IINFO, IPW, J, JB, JN, $ LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSLARFB, $ PSLARFT, PSLASET, PSORG2L, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORGQL', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * IPW = DESCA( NB_ )*DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Set A(ia+m-n+jn-ja+1:ia-m+1,ja:jn) to zero. * CALL PSLASET( 'All', N-JN+JA-1, JN-JA+1, ZERO, ZERO, A, $ IA+M-N+JN-JA+1, JA, DESCA ) * * Use unblocked code for the first or only block. * CALL PSORG2L( M-N+JN-JA+1, JN-JA+1, JN-JA-N+K+1, A, IA, JA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Use blocked code * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PSLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:ia+m-n+j+jb-ja-1,ja:j-1) from the left * CALL PSLARFB( 'Left', 'No transpose', 'Backward', $ 'Columnwise', M-N+J+JB-JA, J-JA, JB, A, IA, $ J, DESCA, WORK, A, IA, JA, DESCA, WORK( IPW ) ) * * Apply H to rows ia:m-k+i+ib-1 of current block * CALL PSORG2L( M-N+J+JB-JA, JB, JB, A, IA, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows ia+m-n+j+jb-ja:ia+m-1,j:j+jb-1 of current block to * zero * CALL PSLASET( 'All', N-J-JB+JA, JB, ZERO, ZERO, A, $ IA+M-N+J+JB-JA, J, DESCA ) * 10 CONTINUE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORGQL * END scalapack-2.0.2/SRC/psorgqr.f000644 000766 000024 00000030676 10363532303 016216 0ustar00juliestaff000000 000000 SUBROUTINE PSORGQR( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORGQR generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the first N columns of a product of K elementary reflectors of order * M * * Q = H(1) H(2) . . . H(k) * * as returned by PSGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA <= j <= JA+K-1, as * returned by PSGEQRF in the K columns of its distributed * matrix argument A(IA:*,JA:JA+K-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(JA+K-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PSGEQRF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( NqA0 + MpA0 + NB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICTXT, IINFO, IPW, J, JB, JL, $ JN, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSLARFB, $ PSLARFT, PSLASET, PSORG2R, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORGQR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * IPW = DESCA( NB_ )*DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JL = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * CALL PSLASET( 'All', JL-JA, JA+N-JL, ZERO, ZERO, A, IA, JL, $ DESCA ) * * Use unblocked code for the last or only block. * CALL PSORG2R( M-JL+JA, JA+N-JL, JA+K-JL, A, IA+JL-JA, JL, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Is there at least one block of columns to loop over ? * IF( JL.GT.JN+1 ) THEN * * Use blocked code * DO 10 J = JL-DESCA( NB_ ), JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * IF( J+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PSLARFT( 'Forward', 'Columnwise', M-I+IA, JB, A, I, $ J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(i:ia+m-1,j+jb:ja+n-1) from the left * CALL PSLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-I+IA, N-J-JB+JA, JB, A, I, $ J, DESCA, WORK, A, I, J+JB, DESCA, $ WORK( IPW ) ) END IF * * Apply H to rows i:ia+m-1 of current block * CALL PSORG2R( M-I+IA, JB, JB, A, I, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows ia:i-1 of current block to zero * CALL PSLASET( 'All', I-IA, JB, ZERO, ZERO, A, IA, J, DESCA ) * 10 CONTINUE * END IF * * Handle first block separately * IF( JL.GT.JA ) THEN * JB = JN - JA + 1 * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PSLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:ia+m-1,ja+jb:ja+n-1) from the left * CALL PSLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M, N-JB, JB, A, IA, JA, DESCA, WORK, A, IA, $ JA+JB, DESCA, WORK( IPW ) ) * * Apply H to rows ia:ia+m-1 of current block * CALL PSORG2R( M, JB, JB, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * END IF * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORGQR * END scalapack-2.0.2/SRC/psorgr2.f000644 000766 000024 00000025603 10363532303 016111 0ustar00juliestaff000000 000000 SUBROUTINE PSORGR2( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORGR2 generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as the * last M rows of a product of K elementary reflectors of order N * * Q = H(1) H(2) . . . H(k) * * as returned by PSGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA+M-K <= i <= IA+M-1, as * returned by PSGERQF in the K rows of its distributed * matrix argument A(IA+M-K:IA+M-1,JA:*). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCr(IA+M-1) * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSGERQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NqA0 + MAX( 1, MpA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, II, LWMIN, MP, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 REAL TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PSELSET, $ PSLARF, PSLASET, PSSCAL, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQA0 + MAX( 1, MPA0 ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORGR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * IF( K.LT.M ) THEN * * Initialise rows ia:ia+m-k-1 to rows of the unit matrix * CALL PSLASET( 'All', M-K, N-M, ZERO, ZERO, A, IA, JA, DESCA ) CALL PSLASET( 'All', M-K, M, ZERO, ONE, A, IA, JA+N-M, DESCA ) * END IF * TAUI = ZERO MP = NUMROC( IA+M-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) * DO 10 I = IA+M-K, IA+M-1 * * Apply H(i) to A(ia:i,ja:ja+n-k+i-1) from the right * CALL PSELSET( A, I, JA+N-M+I-IA, DESCA, ONE ) CALL PSLARF( 'Right', I-IA, I-IA+N-M+1, A, I, JA, DESCA, $ DESCA( M_ ), TAU, A, IA, JA, DESCA, WORK ) II = INDXG2L( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IAROW = INDXG2P( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IF( MYROW.EQ.IAROW ) $ TAUI = TAU( MIN( II, MP ) ) CALL PSSCAL( I-IA+N-M, -TAUI, A, I, JA, DESCA, DESCA( M_ ) ) CALL PSELSET( A, I, JA+N-M+I-IA, DESCA, ONE-TAUI ) * * Set A(i,ja+n-m+i-ia+1:ja+n-1) to zero * CALL PSLASET( 'All', 1, IA+M-1-I, ZERO, ZERO, A, I, $ JA+N-M+I-IA+1, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORGR2 * END scalapack-2.0.2/SRC/psorgrq.f000644 000766 000024 00000026750 10363532303 016214 0ustar00juliestaff000000 000000 SUBROUTINE PSORGRQ( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORGRQ generates an M-by-N real distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as the * last M rows of a product of K elementary reflectors of order N * * Q = H(1) H(2) . . . H(k) * * as returned by PSGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. * N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA+M-K <= i <= IA+M-1, as * returned by PSGERQF in the K rows of its distributed * matrix argument A(IA+M-K:IA+M-1,JA:*). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCr(IA+M-1) * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSGERQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( MpA0 + NqA0 + MB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IN, IPW, $ LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSLARFB, $ PSLARFT, PSLASET, PSORGR2, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORGRQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) )*DESCA( MB_ ), IA+M-1 ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * * Set A(ia:in,ja+n-m+in-ia+1:ja-n+1) to zero. * CALL PSLASET( 'All', IN-IA+1, M-IN+IA-1, ZERO, ZERO, A, IA, $ JA+N-M+IN-IA+1, DESCA ) * * Use unblocked code for the first or only block. * CALL PSORGR2( IN-IA+1, N-M+IN-IA+1, IN-IA-M+K+1, A, IA, JA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Use blocked code * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PSLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, I, JA, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:i-1,ja:ja+n-m+i+ib-ia-1) from the right * CALL PSLARFB( 'Right', 'Transpose', 'Backward', 'Rowwise', $ I-IA, N-M+I+IB-IA, IB, A, I, JA, DESCA, WORK, A, $ IA, JA, DESCA, WORK( IPW ) ) * * Apply H' to columns ja:ja+n-m+i+ib-ia-1 of current block * CALL PSORGR2( IB, N-M+I+IB-IA, IB, A, I, JA, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows i:i+ib-1,ja+n-m+i+ib-ia:ja+n-1 of current block to * zero * CALL PSLASET( 'All', IB, M-I-IB+IA, ZERO, ZERO, A, I, $ JA+N-M+I+IB-IA, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORGRQ * END scalapack-2.0.2/SRC/psorm2l.f000644 000766 000024 00000040414 10363532303 016106 0ustar00juliestaff000000 000000 SUBROUTINE PSORM2L( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORM2L overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PSGEQLF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PSGEQLF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ), * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PSGEQLF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( 1, NqC0 ); * if SIDE = 'R', LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( * NUMROC( N+ICOFFC,NB_A,0,0,NPCOL ),NB_A,0,0,LCMQ ) ); * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICCOL, ICOFFC, ICROW, ICTXT, ICC, $ II, IROFFA, IROFFC, J, J1, J2, J3, JCC, JJ, $ LCM, LCMQ, LWMIN, MI, MP, MPC0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 REAL AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PSELSET, PSELSET2, PSLARF, PB_TOPGET, $ PB_TOPSET, PXERBLA, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D, SSCAL * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MPC0 + MAX( 1, NQC0 ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = NQC0 + MAX( MAX( 1, MPC0 ), NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ) ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORM2L', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, ICC, $ JCC, ICROW, ICCOL ) IF( LEFT ) THEN IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JC+N-1, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) THEN AJJ = ONE - TAU( JJ ) CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1 ) CALL SSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1, $ IAROW, IACOL ) CALL SSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN AJJ = ONE - TAU( JJ ) END IF * IF( IACOL.NE.ICCOL ) THEN IF( MYCOL.EQ.IACOL ) $ CALL SGESD2D( ICTXT, 1, 1, AJJ, 1, MYROW, ICCOL ) IF( MYCOL.EQ.ICCOL ) $ CALL SGERV2D( ICTXT, 1, 1, AJJ, 1, MYROW, IACOL ) END IF * IF( MYCOL.EQ.ICCOL ) THEN MP = NUMROC( IC+M-1, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) CALL SSCAL( MP-ICC+1, AJJ, C( ICC+(JCC-1)* $ DESCC( LLD_ ) ), 1 ) END IF * END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( LEFT .AND. NOTRAN .OR. .NOT.LEFT .AND. .NOT.NOTRAN ) THEN J1 = JA J2 = JA+K-1 J3 = 1 ELSE J1 = JA+K-1 J2 = JA J3 = -1 END IF * IF( LEFT ) THEN NI = N IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M END IF * DO 10 J = J1, J2, J3 * IF( LEFT ) THEN * * H(j) or H(j)' is applied to C(ic:ic+m-k+j-ja,jc:jc+n-1) * MI = M - K + J - JA + 1 ELSE * * H(j) or H(j)' is applied to C(ic:ic+m-1,jc:jc+n-k+j-ja) * NI = N - K + J - JA + 1 END IF * * Apply H(j) or H(j)' * CALL PSELSET2( AJJ, A, IA+NQ-K+J-JA, J, DESCA, ONE ) CALL PSLARF( SIDE, MI, NI, A, IA, J, DESCA, 1, TAU, C, IC, $ JC, DESCC, WORK ) CALL PSELSET( A, IA+NQ-K+J-JA, J, DESCA, AJJ ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORM2L * END scalapack-2.0.2/SRC/psorm2r.f000644 000766 000024 00000040551 10363532303 016116 0ustar00juliestaff000000 000000 SUBROUTINE PSORM2R( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORM2R overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of k elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PSGEQRF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PSGEQRF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ); * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PSGEQRF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( 1, NqC0 ); * if SIDE = 'R', LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( * NUMROC( N+ICOFFC,NB_A,0,0,NPCOL ),NB_A,0,0,LCMQ ) ); * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICCOL, ICOFFC, ICROW, ICTXT, ICC, $ II, IROFFA, IROFFC, J, J1, J2, J3, JCC, JJ, $ LCM, LCMQ, LWMIN, MI, MP, MPC0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 REAL AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PSELSET, PSELSET2, PSLARF, PB_TOPGET, $ PB_TOPSET, PXERBLA, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D, SSCAL * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MPC0 + MAX( 1, NQC0 ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = NQC0 + MAX( MAX( 1, MPC0 ), NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ) ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORM2R', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, ICC, $ JCC, ICROW, ICCOL ) IF( LEFT ) THEN IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JC+N-1, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) THEN AJJ = ONE - TAU( JJ ) CALL SGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1 ) CALL SSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1, $ IAROW, IACOL ) CALL SSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN AJJ = ONE - TAU( JJ ) END IF * IF( IACOL.NE.ICCOL ) THEN IF( MYCOL.EQ.IACOL ) $ CALL SGESD2D( ICTXT, 1, 1, AJJ, 1, MYROW, ICCOL ) IF( MYCOL.EQ.ICCOL ) $ CALL SGERV2D( ICTXT, 1, 1, AJJ, 1, MYROW, IACOL ) END IF * IF( MYCOL.EQ.ICCOL ) THEN MP = NUMROC( IC+M-1, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) CALL SSCAL( MP-ICC+1, AJJ, C( ICC+(JCC-1)* $ DESCC( LLD_ ) ), 1 ) END IF * END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) THEN J1 = JA J2 = JA+K-1 J3 = 1 ELSE J1 = JA+K-1 J2 = JA J3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M ICC = IC END IF * DO 10 J = J1, J2, J3 IF( LEFT ) THEN * * H(j) or H(j)' is applied to C(ic+j-ja:ic+m-1,jc:jc+n-1) * MI = M - J + JA ICC = IC + J - JA ELSE * * H(j) or H(j)' is applied to C(ic:ic+m-1,jc+j-ja:jc+n-1) * NI = N - J + JA JCC = JC + J - JA END IF * * Apply H(j) or H(j)' * CALL PSELSET2( AJJ, A, IA+J-JA, J, DESCA, ONE ) CALL PSLARF( SIDE, MI, NI, A, IA+J-JA, J, DESCA, 1, TAU, C, $ ICC, JCC, DESCC, WORK ) CALL PSELSET( A, IA+J-JA, J, DESCA, AJJ ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORM2R * END scalapack-2.0.2/SRC/psormbr.f000644 000766 000024 00000054276 10363532303 016207 0ustar00juliestaff000000 000000 SUBROUTINE PSORMBR( VECT, SIDE, TRANS, M, N, K, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, VECT INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * If VECT = 'Q', PSORMBR overwrites the general real distributed M-by-N * matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * If VECT = 'P', PSORMBR overwrites sub( C ) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': P * sub( C ) sub( C ) * P * TRANS = 'T': P**T * sub( C ) sub( C ) * P**T * * Here Q and P**T are the orthogonal distributed matrices determined by * PSGEBRD when reducing a real distributed matrix A(IA:*,JA:*) to * bidiagonal form: A(IA:*,JA:*) = Q * B * P**T. Q and P**T are defined * as products of elementary reflectors H(i) and G(i) respectively. * * Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the * order of the orthogonal matrix Q or P**T that is applied. * * If VECT = 'Q', A(IA:*,JA:*) is assumed to have been an NQ-by-K * matrix: * if nq >= k, Q = H(1) H(2) . . . H(k); * if nq < k, Q = H(1) H(2) . . . H(nq-1). * * If VECT = 'P', A(IA:*,JA:*) is assumed to have been a K-by-NQ * matrix: * if k < nq, P = G(1) G(2) . . . G(k); * if k >= nq, P = G(1) G(2) . . . G(nq-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * VECT (global input) CHARACTER * = 'Q': apply Q or Q**T; * = 'P': apply P or P**T. * * SIDE (global input) CHARACTER * = 'L': apply Q, Q**T, P or P**T from the Left; * = 'R': apply Q, Q**T, P or P**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q or P; * = 'T': Transpose, apply Q**T or P**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * If VECT = 'Q', the number of columns in the original * distributed matrix reduced by PSGEBRD. * If VECT = 'P', the number of rows in the original * distributed matrix reduced by PSGEBRD. * K >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+MIN(NQ,K)-1)) if * VECT='Q', and (LLD_A,LOCc(JA+NQ-1)) if VECT = 'P'. NQ = M * if SIDE = 'L', and NQ = N otherwise. The vectors which * define the elementary reflectors H(i) and G(i), whose * products determine the matrices Q and P, as returned by * PSGEBRD. * If VECT = 'Q', LLD_A >= max(1,LOCr(IA+NQ-1)); * if VECT = 'P', LLD_A >= max(1,LOCr(IA+MIN(NQ,K)-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL array, dimension * LOCc(JA+MIN(NQ,K)-1) if VECT = 'Q', LOCr(IA+MIN(NQ,K)-1) if * VECT = 'P', TAU(i) must contain the scalar factor of the * elementary reflector H(i) or G(i), which determines Q or P, * as returned by PDGEBRD in its array argument TAUQ or TAUP. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, if VECT='Q', sub( C ) is overwritten by Q*sub( C ) * or Q'*sub( C ) or sub( C )*Q' or sub( C )*Q; if VECT='P, * sub( C ) is overwritten by P*sub( C ) or P'*sub( C ) or * sub( C )*P or sub( C )*P'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * NQ = M; * if( (VECT = 'Q' and NQ >= K) or (VECT <> 'Q' and NQ > K) ), * IAA=IA; JAA=JA; MI=M; NI=N; ICC=IC; JCC=JC; * else * IAA=IA+1; JAA=JA; MI=M-1; NI=N; ICC=IC+1; JCC=JC; * end if * else if SIDE = 'R', * NQ = N; * if( (VECT = 'Q' and NQ >= K) or (VECT <> 'Q' and NQ > K) ), * IAA=IA; JAA=JA; MI=M; NI=N; ICC=IC; JCC=JC; * else * IAA=IA; JAA=JA+1; MI=M; NI=N-1; ICC=IC; JCC=JC+1; * end if * end if * * If VECT = 'Q', * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * else if VECT <> 'Q', * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( MI+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * end if * * where LCMP = LCM / NPROW, LCMQ = LCM / NPCOL, with * LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JAA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( MI+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If VECT = 'Q', * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * else * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL APPLYQ, LEFT, LQUERY, NOTRAN CHARACTER TRANST INTEGER IAA, IACOL, IAROW, ICC, ICCOL, ICOFFA, ICOFFC, $ ICROW, ICTXT, IINFO, IROFFA, IROFFC, JAA, JCC, $ LCM, LCMP, LCMQ, LWMIN, MI, MPC0, MQA0, MYCOL, $ MYROW, NI, NPA0, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSORMLQ, $ PSORMQR, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(1000+CTXT_) ELSE APPLYQ = LSAME( VECT, 'Q' ) LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q or P * IF( LEFT ) THEN NQ = M IF( ( APPLYQ .AND. NQ.GE.K ) .OR. $ ( .NOT.APPLYQ .AND. NQ.GT.K ) ) THEN IAA = IA JAA = JA MI = M NI = N ICC = IC JCC = JC ELSE IAA = IA + 1 JAA = JA MI = M - 1 NI = N ICC = IC + 1 JCC = JC END IF * IF( APPLYQ ) THEN CALL CHK1MAT( M, 4, K, 6, IA, JA, DESCA, 10, INFO ) ELSE CALL CHK1MAT( K, 6, M, 4, IA, JA, DESCA, 10, INFO ) END IF ELSE NQ = N IF( ( APPLYQ .AND. NQ.GE.K ) .OR. $ ( .NOT.APPLYQ .AND. NQ.GT.K ) ) THEN IAA = IA JAA = JA MI = M NI = N ICC = IC JCC = JC ELSE IAA = IA JAA = JA + 1 MI = M NI = N - 1 ICC = IC JCC = JC + 1 END IF * IF( APPLYQ ) THEN CALL CHK1MAT( N, 5, K, 6, IA, JA, DESCA, 10, INFO ) ELSE CALL CHK1MAT( K, 6, N, 5, IA, JA, DESCA, 10, INFO ) END IF END IF CALL CHK1MAT( M, 4, N, 5, IC, JC, DESCC, 15, INFO ) * IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) ICOFFA = MOD( JAA-1, DESCA( NB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JAA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( APPLYQ ) THEN IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF ELSE * IF( LEFT ) THEN MQA0 = NUMROC( MI+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ MI+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.APPLYQ .AND. .NOT.LSAME( VECT, 'P' ) ) THEN INFO = -1 ELSE IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -2 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -3 ELSE IF( K.LT.0 ) THEN INFO = -6 ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( APPLYQ .AND. LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( APPLYQ .AND. LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -13 ELSE IF( .NOT.APPLYQ .AND. LEFT .AND. $ ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ IROFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( APPLYQ .AND. LEFT .AND. $ DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( .NOT.APPLYQ .AND. LEFT .AND. $ DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF * IF( APPLYQ ) THEN IDUM1( 1 ) = ICHAR( 'Q' ) ELSE IDUM1( 1 ) = ICHAR( 'P' ) END IF IDUM2( 1 ) = 1 IF( LEFT ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'R' ) END IF IDUM2( 2 ) = 2 IF( NOTRAN ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'T' ) END IF IDUM2( 3 ) = 3 IDUM1( 4 ) = K IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( APPLYQ ) THEN IF( LEFT ) THEN CALL PCHK2MAT( M, 4, K, 6, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( N, 5, K, 6, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) END IF ELSE IF( LEFT ) THEN CALL PCHK2MAT( K, 6, M, 4, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( K, 6, N, 5, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMBR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( APPLYQ ) THEN * * Apply Q * IF( NQ.GE.K ) THEN * * Q was determined by a call to PSGEBRD with nq >= k * CALL PSORMQR( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) ELSE IF( NQ.GT.1 ) THEN * * Q was determined by a call to PSGEBRD with nq < k * CALL PSORMQR( SIDE, TRANS, MI, NI, NQ-1, A, IA+1, JA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) END IF ELSE * * Apply P * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF IF( NQ.GT.K ) THEN * * P was determined by a call to PSGEBRD with nq > k * CALL PSORMLQ( SIDE, TRANST, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) ELSE IF( NQ.GT.1 ) THEN * * P was determined by a call to PSGEBRD with nq <= k * CALL PSORMLQ( SIDE, TRANST, MI, NI, NQ-1, A, IA, JA+1, $ DESCA, TAU, C, ICC, JCC, DESCC, WORK, LWORK, $ IINFO ) END IF END IF * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMBR * END scalapack-2.0.2/SRC/psormhr.f000644 000766 000024 00000036423 10363532303 016207 0ustar00juliestaff000000 000000 SUBROUTINE PSORMHR( SIDE, TRANS, M, N, ILO, IHI, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, IHI, ILO, INFO, JA, JC, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORMHR overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix of order nq, with * nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the * product of IHI-ILO elementary reflectors, as returned by PSGEHRD: * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * ILO and IHI must have the same values as in the previous call * of PSGEHRD. Q is equal to the unit matrix except in the * distributed submatrix Q(ia+ilo:ia+ihi-1,ia+ilo:ja+ihi-1). * If SIDE = 'L', 1 <= ILO <= IHI <= max(1,M); * if SIDE = 'R', 1 <= ILO <= IHI <= max(1,N); * ILO and IHI are relative indexes. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE = 'R'. The vectors which * define the elementary reflectors, as returned by PSGEHRD. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(JA+M-2) * if SIDE = 'L', and LOCc(JA+N-2) if SIDE = 'R'. This array * contains the scalar factors TAU(j) of the elementary * reflectors H(j) as returned by PSGEHRD. TAU is tied to * the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * * IAA = IA + ILO; JAA = JA+ILO-1; * If SIDE = 'L', * MI = IHI-ILO; NI = N; ICC = IC + ILO; JCC = JC; * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * MI = M; NI = IHI-ILO; ICC = IC; JCC = JC + ILO; * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN INTEGER IAA, IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, $ IINFO, IROFFA, IROFFC, JAA, JCC, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NH, NI, NPA0, $ NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSORMQR, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 NH = IHI - ILO IF( NPROW.EQ.-1 ) THEN INFO = -(1000+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) IAA = IA + ILO JAA = JA + ILO - 1 * * NQ is the order of Q * IF( LEFT ) THEN NQ = M MI = NH NI = N ICC = IC + ILO JCC = JC CALL CHK1MAT( M, 3, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N MI = M NI = NH ICC = IC JCC = JC + ILO CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, NQ ) ) THEN INFO = -5 ELSE IF( IHI.LT.MIN( ILO, NQ ) .OR. IHI.GT.NQ ) THEN INFO = -6 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = ILO IDUM2( 3 ) = 5 IDUM1( 4 ) = IHI IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, M, 3, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMHR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. NH.EQ.0 ) $ RETURN * CALL PSORMQR( SIDE, TRANS, MI, NI, NH, A, IAA, JAA, DESCA, TAU, $ C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMHR * END scalapack-2.0.2/SRC/psorml2.f000644 000766 000024 00000035144 10363532303 016112 0ustar00juliestaff000000 000000 SUBROUTINE PSORML2( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORML2 overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PSGELQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= max(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PSGELQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSGELQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IROFFC, JCC, LCM, LCMP, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPCOL, $ NPROW, NQ, NQC0 REAL AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PSELSET, $ PSELSET2, PSLARF, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORML2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN .OR. .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K -1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC ELSE MI = M ICC = IC CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(i:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H(i) or H(i)' * CALL PSELSET2( AII, A, I, JA+I-IA, DESCA, ONE ) CALL PSLARF( SIDE, MI, NI, A, I, JA+I-IA, DESCA, DESCA( M_ ), $ TAU, C, ICC, JCC, DESCC, WORK ) CALL PSELSET( A, I, JA+I-IA, DESCA, AII ) * 10 CONTINUE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORML2 * END scalapack-2.0.2/SRC/psormlq.f000644 000766 000024 00000041741 10363532303 016211 0ustar00juliestaff000000 000000 SUBROUTINE PSORMLQ( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORMLQ overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PSGELQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= max(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PSGELQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSGELQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, JCC, $ LCM, LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSLARFB, $ PSLARFT, PSORML2, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMLQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC ELSE MI = M ICC = IC CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. NOTRAN ) .OR. ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) $ CALL PSORML2( SIDE, TRANS, M, N, I1-IA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PSLARFT( 'Forward', 'Rowwise', NQ-I+IA, IB, A, I, JA+I-IA, $ DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+i-ia:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H or H' * CALL PSLARFB( SIDE, TRANST, 'Forward', 'Rowwise', MI, NI, IB, $ A, I, JA+I-IA, DESCA, WORK, C, ICC, JCC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. ( .NOT.LEFT .AND. NOTRAN ) ) $ CALL PSORML2( SIDE, TRANS, M, N, I2-IA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMLQ * END scalapack-2.0.2/SRC/psormql.f000644 000766 000024 00000042175 10363532303 016213 0ustar00juliestaff000000 000000 SUBROUTINE PSORMQL( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORMQL overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PSGEQLF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PSGEQLF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ), * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PSGEQLF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( N+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IAROW, ICCOL, ICOFFC, ICROW, ICTXT, IINFO, IPW, $ IROFFA, IROFFC, J, J1, J2, J3, JB, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPA0, NPCOL, $ NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSLARFB, $ PSLARFT, PSORM2L, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMQL', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN J1 = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+K-1 ) + 1 J2 = JA+K-1 J3 = DESCA( NB_ ) ELSE J1 = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) J2 = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+K-1 ) + 1 J3 = -DESCA( NB_ ) END IF * IF( LEFT ) THEN NI = N IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M END IF * * Use unblocked code for the first block if necessary * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN JB = J1 - JA IF( LEFT ) THEN MI = M - K + JB ELSE NI = N - K + JB END IF CALL PSORM2L( SIDE, TRANS, MI, NI, JB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 DO 10 J = J1, J2, J3 JB = MIN( DESCA( NB_ ), K-J+JA ) * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PSLARFT( 'Backward', 'Columnwise', NQ-K+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic:ic+m-k+j+jb-ja-1,jc:jc+n-1) * MI = M - K + J + JB - JA ELSE * * H or H' is applied to C(ic:ic+m-1,jc:jc+n-k+j+jb-ja-1) * NI = N - K + J + JB - JA END IF * * Apply H or H' * CALL PSLARFB( SIDE, TRANS, 'Backward', 'Columnwise', MI, NI, $ JB, A, IA, J, DESCA, WORK, C, IC, JC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN JB = J2 - JA IF( LEFT ) THEN MI = M - K + JB ELSE NI = N - K + JB END IF CALL PSORM2L( SIDE, TRANS, MI, NI, JB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMQL * END scalapack-2.0.2/SRC/psormqr.f000644 000766 000024 00000042010 10363532303 016205 0ustar00juliestaff000000 000000 SUBROUTINE PSORMQR( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORMQR overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of k elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PSGEQRF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PSGEQRF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ); * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PSGEQRF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( N+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, IINFO, $ IPW, IROFFA, IROFFC, J, J1, J2, J3, JB, JCC, $ LCM, LCMQ, LWMIN, MI, MPC0, MYCOL, MYROW, NI, $ NPA0, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSLARFB, $ PSLARFT, PSORM2R, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMQR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN J1 = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) $ + 1 J2 = JA+K-1 J3 = DESCA( NB_ ) ELSE J1 = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) J2 = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) $ + 1 J3 = -DESCA( NB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M ICC = IC END IF * * Use unblocked code for the first block if necessary * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. ( .NOT.LEFT .AND. NOTRAN ) ) $ CALL PSORM2R( SIDE, TRANS, M, N, J1-JA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 DO 10 J = J1, J2, J3 JB = MIN( DESCA( NB_ ), K-J+JA ) * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PSLARFT( 'Forward', 'Columnwise', NQ-J+JA, JB, A, $ IA+J-JA, J, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+j-ja:ic+m-1,jc:jc+n-1) * MI = M - J + JA ICC = IC + J - JA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+j-ja:jc+n-1) * NI = N - J + JA JCC = JC + J - JA END IF * * Apply H or H' * CALL PSLARFB( SIDE, TRANS, 'Forward', 'Columnwise', MI, NI, $ JB, A, IA+J-JA, J, DESCA, WORK, C, ICC, JCC, $ DESCC, WORK( IPW ) ) 10 CONTINUE * * Use unblocked code for the last block if necessary * IF( ( LEFT .AND. NOTRAN ) .OR. ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) $ CALL PSORM2R( SIDE, TRANS, M, N, J2-JA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMQR * END scalapack-2.0.2/SRC/psormr2.f000644 000766 000024 00000034447 10363532303 016125 0ustar00juliestaff000000 000000 SUBROUTINE PSORMR2( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORMR2 overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PSGERQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PSGERQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSGERQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICCOL, ICOFFA, ICOFFC, $ ICROW, ICTXT, IROFFC, LCM, LCMP, LWMIN, MI, $ MPC0, MYCOL, MYROW, NI, NPCOL, NPROW, NQ, NQC0 REAL AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PSELSET, $ PSELSET2, PSLARF, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K - 1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N ELSE MI = M CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(ic:ic+m-k+i-ia,jc:jc+n-1) * MI = M - K + I - IA + 1 ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc:jc+n-k+i-ia+1) * NI = N - K + I - IA + 1 END IF * * Apply H(i) or H(i)' * CALL PSELSET2( AII, A, I, JA+NQ-K+I-IA, DESCA, ONE ) CALL PSLARF( SIDE, MI, NI, A, I, JA, DESCA, DESCA( M_ ), $ TAU, C, IC, JC, DESCC, WORK ) CALL PSELSET( A, I, JA+NQ-K+I-IA, DESCA, AII ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMR2 * END scalapack-2.0.2/SRC/psormr3.f000644 000766 000024 00000035047 10363532303 016123 0ustar00juliestaff000000 000000 SUBROUTINE PSORMR3( SIDE, TRANS, M, N, K, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, L, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORMR3 overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PSTZRZF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PSTZRZF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSTZRZF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IROFFC, JAA, JCC, LCM, $ LCMP, LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPCOL, $ NPROW, NQ, NQC0 * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PSLARZ, $ PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( L.LT.0 .OR. L.GT.NQ ) THEN INFO = -6 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMR3', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K - 1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC JAA = JA + M - L ELSE MI = M ICC = IC JAA = JA + N - L CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(ic+i-ia:icc+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H(i) or H(i)' * CALL PSLARZ( SIDE, MI, NI, L, A, I, JAA, DESCA, DESCA( M_ ), $ TAU, C, ICC, JCC, DESCC, WORK ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMR3 * END scalapack-2.0.2/SRC/psormrq.f000644 000766 000024 00000042757 11663037655 016245 0ustar00juliestaff000000 000000 SUBROUTINE PSORMRQ( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORMRQ overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PSGERQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PSGERQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSGERQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN, RIGHT, TRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, LCM, $ LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, MYROW, NI, $ NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSLARFB, $ PSLARFT, PSORMR2, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE IF( LSAME( SIDE, 'L' ) ) THEN LEFT = .TRUE. RIGHT = .FALSE. ELSE LEFT = .FALSE. RIGHT = .TRUE. END IF IF( LSAME( TRANS, 'N' ) ) THEN NOTRAN = .TRUE. TRAN = .FALSE. ELSE NOTRAN = .FALSE. TRAN = .TRUE. END IF * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 9, M, 3, N, 4, $ IC, JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 9, M, 3, N, 4, $ IC, JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMRQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N ELSE MI = M CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I1 - IA IF( LEFT ) THEN MI = M - K + IB ELSE NI = N - K + IB END IF CALL PSORMR2( SIDE, TRANS, MI, NI, IB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PSLARFT( 'Backward', 'Rowwise', NQ-K+I+IB-IA, IB, $ A, I, JA, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic:ic+m-k+i+ib-ia-1,jc:jc+n-1) * MI = M - K + I + IB - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc:jc+n-k+i+ib-ia-1) * NI = N - K + I + IB - IA END IF * * Apply H or H' * CALL PSLARFB( SIDE, TRANST, 'Backward', 'Rowwise', MI, NI, $ IB, A, I, JA, DESCA, WORK, C, IC, JC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( RIGHT .AND. TRAN ) .OR. $ ( LEFT .AND. NOTRAN ) ) THEN IB = I2 - IA IF( LEFT ) THEN MI = M - K + IB ELSE NI = N - K + IB END IF CALL PSORMR2( SIDE, TRANS, MI, NI, IB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMRQ * END scalapack-2.0.2/SRC/psormrz.f000644 000766 000024 00000043251 10363532303 016226 0ustar00juliestaff000000 000000 SUBROUTINE PSORMRZ( SIDE, TRANS, M, N, K, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, L, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORMRZ overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix defined as the * product of K elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PSTZRZF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PSTZRZF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PSTZRZF. * TAU is tied to the distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, JAA, $ JCC, LCM, LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, $ MYROW, NI, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSLARZB, $ PSLARZT, PSORMR3, PB_TOPGET, PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -6 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'T' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IDUM1( 4 ) = L IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMRZ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC JAA = JA + M - L ELSE MI = M ICC = IC JAA = JA + N - L CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I1 - IA IF( LEFT ) THEN MI = M ELSE NI = N END IF CALL PSORMR3( SIDE, TRANS, MI, NI, IB, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PSLARZT( 'Backward', 'Rowwise', L, IB, A, I, JAA, DESCA, $ TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+i-ia:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H or H' * CALL PSLARZB( SIDE, TRANST, 'Backward', 'Rowwise', MI, NI, IB, $ L, A, I, JAA, DESCA, WORK, C, ICC, JCC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I2 - IA IF( LEFT ) THEN MI = M ELSE NI = N END IF CALL PSORMR3( SIDE, TRANS, MI, NI, IB, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMRZ * END scalapack-2.0.2/SRC/psormtr.f000644 000766 000024 00000040363 10363532303 016221 0ustar00juliestaff000000 000000 SUBROUTINE PSORMTR( SIDE, UPLO, TRANS, M, N, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, UPLO INTEGER IA, IC, INFO, JA, JC, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSORMTR overwrites the general real M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'T': Q**T * sub( C ) sub( C ) * Q**T * * where Q is a real orthogonal distributed matrix of order nq, with * nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the * product of nq-1 elementary reflectors, as returned by PSSYTRD: * * if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1); * * if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**T from the Left; * = 'R': apply Q or Q**T from the Right. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(IA:*,JA:*) contains elementary * reflectors from PSSYTRD; * = 'L': Lower triangle of A(IA:*,JA:*) contains elementary * reflectors from PSSYTRD. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'T': Transpose, apply Q**T. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * or (LLD_A,LOCc(JA+N-1)) if SIDE = 'R'. The vectors which * define the elementary reflectors, as returned by PSSYTRD. * If SIDE = 'L', LLD_A >= max(1,LOCr(IA+M-1)); * if SIDE = 'R', LLD_A >= max(1,LOCr(IA+N-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) REAL array, dimension LTAU, where * if SIDE = 'L' and UPLO = 'U', LTAU = LOCc(M_A), * if SIDE = 'L' and UPLO = 'L', LTAU = LOCc(JA+M-2), * if SIDE = 'R' and UPLO = 'U', LTAU = LOCc(N_A), * if SIDE = 'R' and UPLO = 'L', LTAU = LOCc(JA+N-2). * TAU(i) must contain the scalar factor of the elementary * reflector H(i), as returned by PSSYTRD. TAU is tied to the * distributed matrix A. * * C (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * * If UPLO = 'U', * IAA = IA, JAA = JA+1, ICC = IC, JCC = JC; * else UPLO = 'L', * IAA = IA+1, JAA = JA; * if SIDE = 'L', * ICC = IC+1; JCC = JC; * else * ICC = IC; JCC = JC+1; * end if * end if * * If SIDE = 'L', * MI = M-1; NI = N; * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * MI = M; MI = N-1; * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN, UPPER INTEGER IAA, IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, $ IINFO, IROFFA, IROFFC, JAA, JCC, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPA0, NPCOL, $ NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSORMQL, $ PSORMQR, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) UPPER = LSAME( UPLO, 'U' ) * IF( UPPER ) THEN IAA = IA JAA = JA+1 ICC = IC JCC = JC ELSE IAA = IA+1 JAA = JA IF( LEFT ) THEN ICC = IC + 1 JCC = JC ELSE ICC = IC JCC = JC + 1 END IF END IF * * NQ is the order of Q * IF( LEFT ) THEN NQ = M MI = M - 1 NI = N CALL CHK1MAT( MI, 4, NQ-1, 4, IAA, JAA, DESCA, 9, INFO ) ELSE NQ = N MI = M NI = N - 1 CALL CHK1MAT( NI, 5, NQ-1, 5, IAA, JAA, DESCA, 9, INFO ) END IF CALL CHK1MAT( MI, 4, NI, 5, ICC, JCC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -3 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 IF( NOTRAN ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'T' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( MI, 4, NQ-1, 4, IAA, JAA, DESCA, 9, MI, 4, $ NI, 5, ICC, JCC, DESCC, 14, 4, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( NI, 5, NQ-1, 5, IAA, JAA, DESCA, 9, MI, 4, $ NI, 5, ICC, JCC, DESCC, 14, 4, IDUM1, IDUM2, $ INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSORMTR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. NQ.EQ.1 ) $ RETURN * IF( UPPER ) THEN * * Q was determined by a call to PSSYTRD with UPLO = 'U' * CALL PSORMQL( SIDE, TRANS, MI, NI, NQ-1, A, IAA, JAA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * ELSE * * Q was determined by a call to PSSYTRD with UPLO = 'L' * CALL PSORMQR( SIDE, TRANS, MI, NI, NQ-1, A, IAA, JAA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * END IF * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSORMTR * END scalapack-2.0.2/SRC/pspbsv.f000644 000766 000024 00000045034 10363532303 016030 0ustar00juliestaff000000 000000 SUBROUTINE PSPBSV( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PSPBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * banded symmetric positive definite distributed * matrix with bandwidth BW. * * Cholesky factorization is used to factor a reordering of * the matrix into L L'. * * See PSPBTRF and PSPBTRS for details. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (NB+2*bw)*bw * +max((bw*NRHS), bw*bw) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PSPBTRF, PSPBTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PSPBTRF and PSPBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PSPBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (NB+2*BW)*BW * * Factor the matrix * CALL PSPBTRF( UPLO, N, BW, A, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PSPBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PSPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPBSV', -INFO ) RETURN END IF * RETURN * * End of PSPBSV * END scalapack-2.0.2/SRC/pspbtrf.f000644 000766 000024 00000141431 11750130340 016165 0ustar00juliestaff000000 000000 SUBROUTINE PSPBTRF( UPLO, N, BW, A, JA, DESCA, AF, LAF, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), AF( * ), WORK( * ) * .. * * * Purpose * ======= * * PSPBTRF computes a Cholesky factorization * of an N-by-N real banded * symmetric positive definite distributed matrix * with bandwidth BW: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PSPBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = U' U , if UPLO = 'U', or * * P A(1:N, JA:JA+N-1) P^T = L L', if UPLO = 'L' * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSPBTRF and this is stored in AF. If a linear system * is to be solved using PSPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * bw*bw * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM1, IDUM3, JA_NEW, $ LAF_MIN, LEVEL_DIST, LLDA, MBW2, MYCOL, MYROW, $ MY_NUM_COLS, NB, NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, OFST, PART_OFFSET, PART_SIZE, $ PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE, $ STORE_N_A, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, IGAMX2D, IGEBR2D, IGEBS2D, PXERBLA, $ RESHAPE, SAXPY, SGEMM, SGERV2D, SGESD2D, $ SLAMOV, SLATCPY, SPBTRF, SPOTRF, SSYRK, STBTRS, $ STRMM, STRRV2D, STRSD2D, STRSM, STRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 6*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * * Pre-calculate bw^2 * MBW2 = BW*BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -10 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 6*100+6 ) END IF * IF( ( BW.GT.N-1 ) .OR. ( BW.LT.0 ) ) THEN INFO = -3 END IF * IF( LLDA.LT.( BW+1 ) ) THEN INFO = -( 6*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 6*100+4 ) END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 6*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PSPBTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*BW ) ) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, 'PSPBTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = ( NB+2*BW )*BW * IF( LAF.LT.LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PSPBTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = BW*BW * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, 'PSPBTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 10 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSPBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 120 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW END IF * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * * Zero out space for work * DO 20 I = 1, WORK_SIZE_MIN WORK( I ) = ZERO 20 CONTINUE * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL.GT.0 ) THEN PREV_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) PREV_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL-1, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NPCOL-1 ) THEN NEXT_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL+1, 0, $ NPCOL ) ) NEXT_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL STRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, A( OFST+ODD_SIZE*LLDA+( BW+ $ 1 ) ), LLDA-1, 0, MYCOL+1 ) * END IF * * * Factor main partition A_i = L_i {L_i}^T in each processor * CALL SPBTRF( UPLO, ODD_SIZE, BW, A( OFST+1 ), LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 30 END IF * * IF( MYCOL.LT.NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * transpose the connection block in preparation. * CALL SLATCPY( 'U', BW, BW, A( ( OFST+( BW+1 )+( ODD_SIZE- $ BW )*LLDA ) ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW ) * * Perform the triangular system solve {L_i}{{B'}_i}^T = {B_i}^T * CALL STRTRS( 'L', 'N', 'N', BW, BW, $ A( OFST+1+( ODD_SIZE-BW )*LLDA ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO ) * * * transpose resulting block to its location * in main storage. * CALL SLATCPY( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), $ BW, A( ( OFST+( BW+1 )+( ODD_SIZE-BW )* $ LLDA ) ), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^T * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL SSYRK( UPLO, 'T', BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE, $ A( OFST+1+ODD_SIZE*LLDA ), LLDA-1 ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 30 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bw)*bw+1 ) and use for * off_diagonal block of reduced system. * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL STRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M, $ PREV_TRI_SIZE_N, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF( INFO.EQ.0 ) THEN * * Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ . * CALL STBTRS( 'L', 'N', 'N', ODD_SIZE, BW, BW, $ A( OFST+1 ), LLDA, AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = G_i{G_i}^T * CALL SSYRK( 'L', 'T', BW, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, ZERO, AF( 1+( ODD_SIZE+2*BW )*BW ), $ BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL SGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * {F_i}^T = {H_i}{{B'}_i}^T * * Copy matrix H_i (the last bw cols of G_i) to AF storage * as per requirements of BLAS routine STRMM. * Since we have G_i^T stored, transpose * H_i^T to H_i. * CALL SLATCPY( 'N', BW, BW, AF( ODD_SIZE-BW+1 ), $ ODD_SIZE, AF( ( ODD_SIZE )*BW+1 ), BW ) * CALL STRMM( 'R', 'U', 'T', 'N', BW, BW, -ONE, $ A( ( OFST+( BW+1 )+( ODD_SIZE-BW )* $ LLDA ) ), LLDA-1, AF( ( ODD_SIZE )*BW+1 ), $ BW ) * * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, $ 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 110 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 60 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL SGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0, $ MYCOL-1 ) * END IF * * Copy last diagonal block into AF storage for subsequent * operations. * CALL SLAMOV( 'N', BW, BW, A( OFST+ODD_SIZE*LLDA+1 ), LLDA-1, $ AF( ODD_SIZE*BW+MBW2+1 ), BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ 0, MYCOL+1 ) * * Add contribution to diagonal block * CALL SAXPY( MBW2, ONE, AF( ODD_SIZE*BW+2*MBW2+1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 40 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 50 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL SGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * CALL SAXPY( MBW2, ONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL SGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * CALL SAXPY( MBW2, ONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 40 50 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL SPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), BW, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL SLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0, $ COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL STRSM( 'L', 'L', 'N', 'N', BW, BW, ONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL SSYRK( 'L', 'T', BW, BW, -ONE, AF( ( ODD_SIZE )*BW+1 ), $ BW, ZERO, WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * CALL STRSM( 'R', 'L', 'T', 'N', BW, BW, ONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL SSYRK( 'L', 'N', BW, BW, -ONE, $ AF( ( ODD_SIZE+2*BW )*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL SGEMM( 'N', 'N', BW, BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW, ZERO, WORK( 1 ), $ BW ) * * Send contribution to offdiagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 60 CONTINUE * ELSE * * CASE UPLO = 'U' * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL.GT.0 ) THEN PREV_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) PREV_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL-1, 0, $ NPCOL ) ) END IF * IF( MYCOL.LT.NPCOL-1 ) THEN NEXT_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL+1, 0, $ NPCOL ) ) NEXT_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0, $ NPCOL ) ) END IF * * * * Factor main partition A_i^T = U_i {U_i}^T in each processor * CALL SPBTRF( UPLO, ODD_SIZE, BW, A( OFST+1 ), LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 70 END IF * * IF( MYCOL.LT.NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * Move the connection block in preparation. * CALL SLAMOV( 'L', BW, BW, A( ( OFST+1+ODD_SIZE*LLDA ) ), $ LLDA-1, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW ) * * * Perform the triangular solve {L_i}{{B'}_i}^T = {B_i}^T * CALL STRTRS( 'U', 'T', 'N', BW, BW, $ A( OFST+BW+1+( ODD_SIZE-BW )*LLDA ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO ) * * Move the resulting block back to its location in main storage. * CALL SLAMOV( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), $ BW, A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i^T = {C_i}^T-{{B'}_i}^T{{B'}_i} * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL SSYRK( UPLO, 'T', BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE, $ A( OFST+BW+1+ODD_SIZE*LLDA ), LLDA-1 ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 70 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bw)*bw+1 ) and use for * off_diagonal block of reduced system. * * Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ . * * * Copy D block into AF storage for solve. * CALL SLATCPY( 'L', PREV_TRI_SIZE_N, PREV_TRI_SIZE_M, $ A( OFST+1 ), LLDA-1, AF( 1 ), ODD_SIZE ) * IF( INFO.EQ.0 ) THEN * CALL STBTRS( 'U', 'T', 'N', ODD_SIZE, BW, BW, $ A( OFST+1 ), LLDA, AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = G_i{G_i}^T * CALL SSYRK( 'L', 'T', BW, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, ZERO, AF( 1+( ODD_SIZE+2*BW )*BW ), $ BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL SGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * {F_i}^T = {H_i}{{B'}_i}^T * * Copy matrix H_i (the last bw cols of G_i) to AF storage * as per requirements of BLAS routine STRMM. * Since we have G_i^T stored, transpose * H_i^T to H_i. * CALL SLATCPY( 'N', BW, BW, AF( ODD_SIZE-BW+1 ), $ ODD_SIZE, AF( ( ODD_SIZE )*BW+1 ), BW ) * CALL STRMM( 'R', 'L', 'N', 'N', BW, BW, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ AF( ( ODD_SIZE )*BW+1 ), BW ) * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, $ 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 110 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 100 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL SGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0, $ MYCOL-1 ) * END IF * * Transpose last diagonal block into AF storage for subsequent * operations. * CALL SLATCPY( 'U', BW, BW, A( OFST+ODD_SIZE*LLDA+1+BW ), $ LLDA-1, AF( ODD_SIZE*BW+MBW2+1 ), BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ 0, MYCOL+1 ) * * Add contribution to diagonal block * CALL SAXPY( MBW2, ONE, AF( ODD_SIZE*BW+2*MBW2+1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 80 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 90 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL SGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * CALL SAXPY( MBW2, ONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL SGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * CALL SAXPY( MBW2, ONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 80 90 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL SPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), BW, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL SLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0, $ COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL STRSM( 'L', 'L', 'N', 'N', BW, BW, ONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL SSYRK( 'L', 'T', BW, BW, -ONE, AF( ( ODD_SIZE )*BW+1 ), $ BW, ZERO, WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * CALL STRSM( 'R', 'L', 'T', 'N', BW, BW, ONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL SSYRK( 'L', 'N', BW, BW, -ONE, $ AF( ( ODD_SIZE+2*BW )*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL SGEMM( 'N', 'N', BW, BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW, ZERO, WORK( 1 ), $ BW ) * * Send contribution to offdiagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0, $ COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 100 CONTINUE * END IF * 110 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 120 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PSPBTRF * END scalapack-2.0.2/SRC/pspbtrs.f000644 000766 000024 00000063145 10363532303 016213 0ustar00juliestaff000000 000000 SUBROUTINE PSPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, $ AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PSPBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PSPBTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * banded symmetric positive definite distributed * matrix with bandwidth BW. * Depending on the value of UPLO, A stores either U or L in the equn * A(1:N, JA:JA+N-1) = U'*U or L*L' as computed by PSPBTRF. * * Routine PSPBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSPBTRF and this is stored in AF. If a linear system * is to be solved using PSPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (bw*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NP_SAVE, PART_OFFSET, $ RETURN_CODE, STORE_M_B, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PSPBTRSV, PXERBLA, RESHAPE * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 7*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 10*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 10*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 10*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 10*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -14 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 7*100+6 ) END IF * IF( ( BW.GT.N-1 ) .OR. ( BW.LT.0 ) ) THEN INFO = -3 END IF * IF( LLDA.LT.( BW+1 ) ) THEN INFO = -( 7*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 7*100+4 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 10*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 10*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -3 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -6 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 7*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, 'PSPBTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*BW ) ) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, 'PSPBTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * WORK_SIZE_MIN = ( BW*NRHS ) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, 'PSPBTRS: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB( 5 ) PARAM_CHECK( 15, 1 ) = DESCB( 4 ) PARAM_CHECK( 14, 1 ) = DESCB( 3 ) PARAM_CHECK( 13, 1 ) = DESCB( 2 ) PARAM_CHECK( 12, 1 ) = DESCB( 1 ) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA( 5 ) PARAM_CHECK( 9, 1 ) = DESCA( 4 ) PARAM_CHECK( 8, 1 ) = DESCA( 3 ) PARAM_CHECK( 7, 1 ) = DESCA( 1 ) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1005 PARAM_CHECK( 15, 2 ) = 1004 PARAM_CHECK( 14, 2 ) = 1003 PARAM_CHECK( 13, 2 ) = 1002 PARAM_CHECK( 12, 2 ) = 1001 PARAM_CHECK( 11, 2 ) = 9 PARAM_CHECK( 10, 2 ) = 705 PARAM_CHECK( 9, 2 ) = 704 PARAM_CHECK( 8, 2 ) = 703 PARAM_CHECK( 7, 2 ) = 701 PARAM_CHECK( 6, 2 ) = 6 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSPBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 20 END IF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PSPBTRSV( 'L', 'N', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PSPBTRSV( 'U', 'T', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * END IF * * Call backsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PSPBTRSV( 'L', 'T', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PSPBTRSV( 'U', 'N', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * END IF 10 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 20 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PSPBTRS * END scalapack-2.0.2/SRC/pspbtrsv.f000644 000766 000024 00000140710 11750130340 016367 0ustar00juliestaff000000 000000 SUBROUTINE PSPBTRSV( UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B, $ IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PSPBTRSV solves a banded triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a banded * triangular matrix factor produced by the * Cholesky factorization code PSPBTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T * is dictated by the user by the parameter TRANS. * * Routine PSPBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T; * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) REAL pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSPBTRF and this is stored in AF. If a linear system * is to be solved using PSPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (bw*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MBW2, MYCOL, MYROW, MY_NUM_COLS, NB, NP, $ NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PXERBLA, RESHAPE, SGEMM, SGERV2D, $ SGESD2D, SLAMOV, SMATADD, STBTRS, STRMM, STRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 11*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 11*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 11*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 11*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * * Pre-calculate bw^2 * MBW2 = BW*BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'T' ) ELSE INFO = -2 END IF * IF( LWORK.LT.-1 ) THEN INFO = -14 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -3 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 8*100+6 ) END IF * IF( ( BW.GT.N-1 ) .OR. ( BW.LT.0 ) ) THEN INFO = -4 END IF * IF( LLDA.LT.( BW+1 ) ) THEN INFO = -( 8*100+6 ) END IF * IF( NB.LE.0 ) THEN INFO = -( 8*100+4 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 11*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 11*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -5 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -7 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 8*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PSPBTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*BW ) ) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, 'PSPBTRSV, D&C alg.: NB too small', $ -INFO ) RETURN END IF * * WORK_SIZE_MIN = BW*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, 'PSPBTRSV: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB( 5 ) PARAM_CHECK( 16, 1 ) = DESCB( 4 ) PARAM_CHECK( 15, 1 ) = DESCB( 3 ) PARAM_CHECK( 14, 1 ) = DESCB( 2 ) PARAM_CHECK( 13, 1 ) = DESCB( 1 ) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA( 5 ) PARAM_CHECK( 10, 1 ) = DESCA( 4 ) PARAM_CHECK( 9, 1 ) = DESCA( 3 ) PARAM_CHECK( 8, 1 ) = DESCA( 1 ) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BW PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 14 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSPBTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 180 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW END IF * * * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * IF( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL STBTRS( UPLO, 'N', 'N', ODD_SIZE, BW, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL SLAMOV( 'N', BW, NRHS, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB, $ WORK( 1 ), BW ) * CALL STRMM( 'L', 'U', 'N', 'N', BW, NRHS, -ONE, $ A( ( OFST+( BW+1 )+( ODD_SIZE-BW )*LLDA ) ), $ LLDA-1, WORK( 1 ), BW ) * CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL SGEMM( 'T', 'N', BW, NRHS, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, ZERO, $ WORK( 1+BW-BW ), BW ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 30 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 10 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 20 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 10 20 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL STRTRS( 'L', 'N', 'N', BW, NRHS, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'T', 'N', BW, NRHS, BW, -ONE, $ AF( ( ODD_SIZE )*BW+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 30 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 80 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 40 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 50 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 40 50 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL SGEMM( 'T', 'N', BW, NRHS, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, WORK( 1 ), $ BW, ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( ( ODD_SIZE )*BW+1 ), BW, WORK( 1 ), BW, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL STRTRS( 'L', 'T', 'N', BW, NRHS, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * ***Modification Loop ******* * 60 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 70 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 60 70 CONTINUE * [End of GOTO Loop] * 80 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL SGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -ONE, AF( 1 ), $ ODD_SIZE, WORK( 1+BW-BW ), BW, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL SLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, WORK( 1+BW-BW ), BW ) * CALL STRMM( 'L', 'U', 'T', 'N', BW, NRHS, -ONE, $ A( ( OFST+( BW+1 )+( ODD_SIZE-BW )*LLDA ) ), $ LLDA-1, WORK( 1+BW-BW ), BW ) * CALL SMATADD( BW, NRHS, ONE, WORK( 1+BW-BW ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL STBTRS( UPLO, 'T', 'N', ODD_SIZE, BW, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF( LSAME( TRANS, 'T' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL STBTRS( UPLO, 'T', 'N', ODD_SIZE, BW, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL SLAMOV( 'N', BW, NRHS, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB, $ WORK( 1 ), BW ) * CALL STRMM( 'L', 'L', 'T', 'N', BW, NRHS, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ WORK( 1 ), BW ) * CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL SGEMM( 'T', 'N', BW, NRHS, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, ZERO, $ WORK( 1+BW-BW ), BW ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 110 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 90 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 100 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 90 100 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL STRTRS( 'L', 'N', 'N', BW, NRHS, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'T', 'N', BW, NRHS, BW, -ONE, $ AF( ( ODD_SIZE )*BW+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 110 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 160 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 120 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 130 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 120 130 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) $ THEN * * Receive solution from processor to left * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL SGEMM( 'T', 'N', BW, NRHS, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, WORK( 1 ), $ BW, ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE, $ AF( ( ODD_SIZE )*BW+1 ), BW, WORK( 1 ), BW, $ ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL STRTRS( 'L', 'T', 'N', BW, NRHS, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 170 END IF * * * ***Modification Loop ******* * 140 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 150 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 140 150 CONTINUE * [End of GOTO Loop] * 160 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL SGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -ONE, AF( 1 ), $ ODD_SIZE, WORK( 1+BW-BW ), BW, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL SLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, WORK( 1+BW-BW ), BW ) * CALL STRMM( 'L', 'L', 'N', 'N', BW, NRHS, -ONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ WORK( 1+BW-BW ), BW ) * CALL SMATADD( BW, NRHS, ONE, WORK( 1+BW-BW ), BW, ONE, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL STBTRS( UPLO, 'N', 'N', ODD_SIZE, BW, NRHS, $ A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB, $ INFO ) * END IF * End of "IF( LSAME( TRANS, 'N' ) )"... * * END IF * End of "IF( LSAME( UPLO, 'L' ) )"... 170 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 180 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PSPBTRSV * END scalapack-2.0.2/SRC/pspocon.f000644 000766 000024 00000036274 10363532303 016202 0ustar00juliestaff000000 000000 SUBROUTINE PSPOCON( UPLO, N, A, IA, JA, DESCA, ANORM, RCOND, WORK, $ LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LIWORK, LWORK, N REAL ANORM, RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSPOCON estimates the reciprocal of the condition number (in the * 1-norm) of a real symmetric positive definite distributed matrix * using the Cholesky factorization A = U**T*U or A = L*L**T computed by * PSPOTRF. * * An estimate is obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), and * the reciprocal of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the factor stored in * A(IA:IA+N-1,JA:JA+N-1) is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) REAL pointer into the local memory to * an array of dimension ( LLD_A, LOCc(JA+N-1) ). On entry, this * array contains the local pieces of the factors L or U from * the Cholesky factorization A(IA:IA+N-1,JA:JA+N-1) = U'*U or * L*L', as computed by PSPOTRF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ANORM (global input) REAL * The 1-norm (or infinity-norm) of the symmetric distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * * RCOND (global output) REAL * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + 2*LOCc(N+MOD(JA-1,NB_A))+ * MAX( 2, MAX(NB_A*CEIL(NPROW-1,NPCOL),LOCc(N+MOD(JA-1,NB_A)) + * NB_A*CEIL(NPCOL-1,NPROW)) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr(N+MOD(IA-1,MB_A)). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPNL, IPNU, $ IPV, IPW, IPX, IROFF, IV, IX, IXX, JJA, JV, $ JX, KASE, LIWMIN, LWMIN, MYCOL, MYROW, NP, $ NPCOL, NPROW, NPMOD, NQ, NQMOD REAL AINVNM, SCALE, SL, SU, SMLNUM REAL WMAX * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG2L, $ PCHK1MAT, PSAMAX, PSLATRS, PSLACON, $ PSRSCL, PB_TOPGET, PB_TOPSET, PXERBLA, SGEBR2D, $ SGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, ICHAR, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + 2*NQMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = REAL( LWMIN ) LIWMIN = NPMOD IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( ANORM.LT.ZERO ) THEN INFO = -7 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN IWORK( 1 ) = LIWMIN INFO = -12 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 IF( LIWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 12 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPOCON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * RCOND = ZERO IF( N.EQ.0 ) THEN RCOND = ONE RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN ELSE IF( N.EQ.1 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPNL = IPV + NP IPNU = IPNL + NQ IPW = IPNU + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Estimate the 1-norm (or I-norm) of inv(A). * AINVNM = ZERO KASE = 0 NORMIN = 'N' * 10 CONTINUE CALL PSLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), IX, JX, $ DESCX, IWORK, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( UPPER ) THEN * * Multiply by inv(U'). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( 'Upper', 'Transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, WORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL NORMIN = 'Y' * * Multiply by inv(U). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( 'Upper', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, WORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(L). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( 'Lower', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, WORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL NORMIN = 'Y' * * Multiply by inv(L'). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( 'Lower', 'Transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, WORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF * * Multiply by 1/SCALE if doing so will not cause overflow. * SCALE = SL*SU IF( SCALE.NE.ONE ) THEN CALL PSAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL SGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1 ) ELSE CALL SGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1, $ IAROW, MYCOL ) END IF END IF IF( SCALE.LT.ABS( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PSRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / AINVNM ) / ANORM * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PSPOCON * END scalapack-2.0.2/SRC/pspoequ.f000644 000766 000024 00000031170 10363532303 016203 0ustar00juliestaff000000 000000 SUBROUTINE PSPOEQU( N, A, IA, JA, DESCA, SR, SC, SCOND, AMAX, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, N REAL AMAX, SCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), SC( * ), SR( * ) * .. * * Purpose * ======= * * PSPOEQU computes row and column scalings intended to * equilibrate a distributed symmetric positive definite matrix * sub( A ) = A(IA:IA+N-1,JA:JA+N-1) and reduce its condition number * (with respect to the two-norm). SR and SC contain the scale * factors, S(i) = 1/sqrt(A(i,i)), chosen so that the scaled distri- * buted matrix B with elements B(i,j) = S(i)*A(i,j)*S(j) has ones on * the diagonal. This choice of SR and SC puts the condition number * of B within a factor N of the smallest possible condition number * over all possible diagonal scalings. * * The scaling factor are stored along process rows in SR and along * process columns in SC. The duplication of information simplifies * greatly the application of the factors. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) REAL pointer into the local memory to an * array of local dimension ( LLD_A, LOCc(JA+N-1) ), the * N-by-N symmetric positive definite distributed matrix * sub( A ) whose scaling factors are to be computed. Only the * diagonal elements of sub( A ) are referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * SR (local output) REAL array, dimension LOCr(M_A) * If INFO = 0, SR(IA:IA+N-1) contains the row scale factors * for sub( A ). SR is aligned with the distributed matrix A, * and replicated across every process column. SR is tied to the * distributed matrix A. * * SC (local output) REAL array, dimension LOCc(N_A) * If INFO = 0, SC(JA:JA+N-1) contains the column scale factors * for A(IA:IA+M-1,JA:JA+N-1). SC is aligned with the distribu- * ted matrix A, and replicated down every process row. SC is * tied to the distributed matrix A. * * SCOND (global output) REAL * If INFO = 0, SCOND contains the ratio of the smallest SR(i) * (or SC(j)) to the largest SR(i) (or SC(j)), with * IA <= i <= IA+N-1 and JA <= j <= JA+N-1. If SCOND >= 0.1 * and AMAX is neither too large nor too small, it is not worth * scaling by SR (or SC). * * AMAX (global output) REAL * Absolute value of largest matrix element. If AMAX is very * close to overflow or very close to underflow, the matrix * should be scaled. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the K-th diagonal entry of sub( A ) is * nonpositive. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. CHARACTER ALLCTOP, COLCTOP, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURCOL, ICURROW, $ IDUMM, II, IIA, IOFFA, IOFFD, IROFF, J, JB, JJ, $ JJA, JN, LDA, LL, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ REAL AII, SMIN * .. * .. Local Arrays .. INTEGER DESCSC( DLEN_ ), DESCSR( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, IGAMN2D, $ INFOG2L, PCHK1MAT, PB_TOPGET, PXERBLA, $ SGAMN2D, SGAMX2D, SGSUM2D * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PSLAMCH EXTERNAL ICEIL, NUMROC, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(500+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, INFO ) CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, 0, IDUMM, IDUMM, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPOEQU', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN SCOND = ONE AMAX = ZERO RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'All', ALLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * * Compute some local indexes * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) * * Assign descriptors for SR and SC arrays * CALL DESCSET( DESCSR, N, 1, DESCA( MB_ ), 1, 0, 0, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCSC, 1, N, 1, DESCA( NB_ ), 0, 0, ICTXT, 1 ) * * Initialize the scaling factors to zero. * DO 10 II = IIA, IIA+NP-1 SR( II ) = ZERO 10 CONTINUE * DO 20 JJ = JJA, JJA+NQ-1 SC( JJ ) = ZERO 20 CONTINUE * * Find the minimum and maximum diagonal elements. * Handle first block separately. * II = IIA JJ = JJA JB = JN-JA+1 SMIN = ONE / PSLAMCH( ICTXT, 'S' ) AMAX = ZERO * IOFFA = II+(JJ-1)*LDA IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN IOFFD = IOFFA DO 30 LL = 0, JB-1 AII = A( IOFFD ) SR( II+LL ) = AII SC( JJ+LL ) = AII SMIN = MIN( SMIN, AII ) AMAX = MAX( AMAX, AII ) IF( AII.LE.ZERO .AND. INFO.EQ.0 ) $ INFO = LL + 1 IOFFD = IOFFD + LDA + 1 30 CONTINUE END IF * IF( MYROW.EQ.IAROW ) THEN II = II + JB IOFFA = IOFFA + JB END IF IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + JB IOFFA = IOFFA + JB*LDA END IF ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 50 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFD = IOFFA DO 40 LL = 0, JB-1 AII = A( IOFFD ) SR( II+LL ) = AII SC( JJ+LL ) = AII SMIN = MIN( SMIN, AII ) AMAX = MAX( AMAX, AII ) IF( AII.LE.ZERO .AND. INFO.EQ.0 ) $ INFO = J + LL - JA + 1 IOFFD = IOFFD + LDA + 1 40 CONTINUE END IF * IF( MYROW.EQ.ICURROW ) THEN II = II + JB IOFFA = IOFFA + JB END IF IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + JB IOFFA = IOFFA + JB*LDA END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 50 CONTINUE * * Compute scaling factors * CALL SGSUM2D( ICTXT, 'Columnwise', COLCTOP, 1, NQ, SC( JJA ), $ 1, -1, MYCOL ) CALL SGSUM2D( ICTXT, 'Rowwise', ROWCTOP, NP, 1, SR( IIA ), $ MAX( 1, NP ), -1, MYCOL ) * CALL SGAMX2D( ICTXT, 'All', ALLCTOP, 1, 1, AMAX, 1, IDUMM, IDUMM, $ -1, -1, MYCOL ) CALL SGAMN2D( ICTXT, 'All', ALLCTOP, 1, 1, SMIN, 1, IDUMM, IDUMM, $ -1, -1, MYCOL ) * IF( SMIN.LE.ZERO ) THEN * * Find the first non-positive diagonal element and return. * CALL IGAMN2D( ICTXT, 'All', ALLCTOP, 1, 1, INFO, 1, II, JJ, -1, $ -1, MYCOL ) RETURN * ELSE * * Set the scale factors to the reciprocals * of the diagonal elements. * DO 60 II = IIA, IIA+NP-1 SR( II ) = ONE / SQRT( SR( II ) ) 60 CONTINUE * DO 70 JJ = JJA, JJA+NQ-1 SC( JJ ) = ONE / SQRT( SC( JJ ) ) 70 CONTINUE * * Compute SCOND = min(S(I)) / max(S(I)) * SCOND = SQRT( SMIN ) / SQRT( AMAX ) * END IF * RETURN * * End of PSPOEQU * END scalapack-2.0.2/SRC/psporfs.f000644 000766 000024 00000100503 10363532303 016200 0ustar00juliestaff000000 000000 SUBROUTINE PSPORFS( UPLO, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, B, IB, JB, DESCB, X, IX, JX, DESCX, $ FERR, BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, $ LIWORK, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IWORK( * ) REAL A( * ), AF( * ), B( * ), $ BERR( * ), FERR( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PSPORFS improves the computed solution to a system of linear * equations when the coefficient matrix is symmetric positive definite * and provides error bounds and backward error estimates for the * solutions. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) REAL pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1) ). * This array contains the local pieces of the N-by-N symmetric * distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input) REAL pointer into the local memory * to an array of local dimension (LLD_AF,LOCc(JA+N-1)). * On entry, this array contains the factors L or U from the * Cholesky factorization sub( A ) = L*L**T or U**T*U, as * computed by PSPOTRF. * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * B (local input) REAL pointer into the local memory * to an array of local dimension (LLD_B, LOCc(JB+NRHS-1) ). * On entry, this array contains the the local pieces of the * right hand sides sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input) REAL pointer into the local memory * to an array of local dimension (LLD_X, LOCc(JX+NRHS-1) ). * On entry, this array contains the the local pieces of the * solution vectors sub( X ). On exit, it contains the * improved solution vectors. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). * The estimated forward error bound for each solution vector * of sub( X ). If XTRUE is the true solution corresponding * to sub( X ), FERR is an estimated upper bound for the * magnitude of the largest element in (sub( X ) - XTRUE) * divided by the magnitude of the largest element in sub( X ). * The estimate is as reliable as the estimate for RCOND, and * is almost always a slight overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*LOCr( N + MOD( IA-1, MB_A ) ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr( N + MOD( IB-1, MB_B ) ). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices op( A ) and op( AF ) (respectively * sub( X ) and sub( B ) ) should be distributed the same way on the * same processes. These conditions ensure that sub( A ) and sub( AF ) * (resp. sub( X ) and sub( B ) ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( AF ), sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IAF, DESCAF( MB_ ) ) = f( JAF, DESCAF( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) REAL TWO, THREE PARAMETER ( TWO = 2.0E+0, THREE = 3.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER COUNT, IACOL, IAFCOL, IAFROW, IAROW, IXBCOL, $ IXBROW, IXCOL, IXROW, ICOFFA, ICOFFAF, ICOFFB, $ ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, IIW, $ IOFFXB, IPB, IPR, IPV, IROFFA, IROFFAF, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LIWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ REAL EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG2L, $ PCHK2MAT, PSASYMV, PSAXPY, PSCOPY, $ PSLACON, PSPOTRS, PSSYMV, PXERBLA, $ SGAMX2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, N, 2, IAF, JAF, DESCAF, 11, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 15, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IX, JX, DESCX, 19, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFAF = MOD( JAF-1, DESCAF( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFCOL = INDXG2P( JAF, DESCAF( NB_ ), MYCOL, $ DESCAF( CSRC_ ), NPCOL ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 3 * NPMOD LIWMIN = NPMOD WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700 + NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCAF( MB_ ) ) THEN INFO = -( 1100 + MB_ ) ELSE IF( IROFFAF.NE.0 .OR. IAROW.NE.IAFROW ) THEN INFO = -9 ELSE IF( DESCA( NB_ ).NE.DESCAF( NB_ ) ) THEN INFO = -( 1100 + NB_ ) ELSE IF( ICOFFAF.NE.0 .OR. IACOL.NE.IAFCOL ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -( 1100 + CTXT_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -13 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1500 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1500 + CTXT_ ) ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 1900 + MB_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -17 ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 1900 + NB_ ) ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -18 ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 1900 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -25 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IDUM1( 2 ) = N IDUM2( 2 ) = 2 IDUM1( 3 ) = NRHS IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 23 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 25 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, N, 2, IAF, $ JAF, DESCAF, 11, 0, IDUM1, IDUM2, INFO ) CALL PCHK2MAT( N, 2, NRHS, 3, IB, JB, DESCB, 15, N, 2, NRHS, 3, $ IX, JX, DESCX, 19, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPORFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = IPB + NP0 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = 1 + maximum number of nonzero entries in each row of sub( A ) * NZ = N + 1 EPS = PSLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) ) * DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 100 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 20 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X) * CALL PSCOPY( N, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PSSYMV( UPLO, N, -ONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, ONE, WORK( IPR ), IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i))/(abs(sub(A))*abs(sub(X))+abs(sub(B)) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to * the i-th components of the numerator and denominator * before dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIXB, IIXB + NP - 1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 30 CONTINUE END IF END IF * CALL PSASYMV( UPLO, N, ONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, ONE, WORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 40 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J) is larger than machine epsilon, and * 2) BERR(J) decreased by at least a factor of 2 during the * last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PSPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PSAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, IX, $ JX+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 20 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(sub(A)))* * ( abs(R) + * NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B)) ))) / norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(sub(A)) is the inverse of sub(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(sub(A))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(sub(A))*abs(sub(X)) + abs(sub(B)) is less than SAFE2. * * Use PSLACON to estimate the infinity-norm of the matrix * inv(sub(A)) * diag(W), where * W = abs(R) + NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B))))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 50 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 50 CONTINUE END IF END IF * KASE = 0 60 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PSLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(sub(A)'). * CALL PSPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF ELSE * * Multiply by inv(sub(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 80 CONTINUE END IF END IF * CALL PSPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 60 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 90 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 90 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 100 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 200 J = JN+1, JB+NRHS-1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 190 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 110 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub( B ) - sub( A )*sub( X ). * CALL PSCOPY( N, B, IB, J+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PSSYMV( UPLO, N, -ONE, A, IA, JA, DESCA, X, IX, J+K, $ DESCX, 1, ONE, WORK( IPR ), IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / * ( abs(sub(A))*abs(sub(X)) + abs(sub(B)) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIXB, IIXB+NP-1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 120 CONTINUE END IF END IF * CALL PSASYMV( UPLO, N, ONE, A, IA, JA, DESCA, X, IX, J+K, $ DESCX, 1, ONE, WORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 )THEN DO 130 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 130 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.ICURCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during * the last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. $ COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PSPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PSAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, $ IX, J+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 110 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(sub(A)))* * ( abs(R) + NZ*EPS*( * abs(sub(A))*abs(sub(X))+abs(sub(B)) )))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(sub(A)) is the inverse of sub(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of abs(R)+NZ*EPS*(abs(sub(A))*abs(sub(X)) * +abs(sub(B))) is incremented by SAFE1 if the i-th component * of abs(sub(A))*abs(sub(X)) + abs(sub(B)) is less than SAFE2. * * Use PSLACON to estimate the infinity-norm of the matrix * inv(sub(A)) * diag(W), where * W = abs(R) + NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B))))) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW-1, IIW+NP-2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 140 CONTINUE END IF END IF * KASE = 0 150 CONTINUE IF( MYCOL.EQ.ICURCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, ICURCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PSLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = ICURCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(sub(A)'). * CALL PSPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 160 CONTINUE END IF END IF ELSE * * Multiply by inv(sub(A))*diag(W). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 170 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 170 CONTINUE END IF END IF * CALL PSPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 150 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 180 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 180 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, $ IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 190 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 200 CONTINUE * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PSPORFS * END scalapack-2.0.2/SRC/psposv.f000644 000766 000024 00000024311 10363532303 016040 0ustar00juliestaff000000 000000 SUBROUTINE PSPOSV( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ) * .. * * Purpose * ======= * * PSPOSV computes the solution to a real system of linear equations * * sub( A ) * X = sub( B ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is an N-by-N * symmetric distributed positive definite matrix and X and sub( B ) * denoting B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS distributed * matrices. * * The Cholesky decomposition is used to factor sub( A ) as * * sub( A ) = U**T * U, if UPLO = 'U', or * * sub( A ) = L * L**T, if UPLO = 'L', * * where U is an upper triangular matrix and L is a lower triangular * matrix. The factored form of sub( A ) is then used to solve the * system of equations. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if INFO = 0, this array contains the * local pieces of the factor U or L from the Cholesky factori- * zation sub( A ) = U**T*U or L*L**T. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_B,LOC(JB+NRHS-1)). * On entry, the local pieces of the right hand sides distribu- * ted matrix sub( B ). On exit, if INFO = 0, sub( B ) is over- * written with the solution distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed, and the * solution has not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSPOTRF, $ PSPOTRS, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1000+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, $ 3, IB, JB, DESCB, 11, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPOSV', -INFO ) RETURN END IF * * Compute the Cholesky factorization sub( A ) = U'*U or L*L'. * CALL PSPOTRF( UPLO, N, A, IA, JA, DESCA, INFO ) * IF( INFO.EQ.0 ) THEN * * Solve the system sub( A ) * X = sub( B ) overwriting sub( B ) * with X. * CALL PSPOTRS( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * END IF * RETURN * * End of PSPOSV * END scalapack-2.0.2/SRC/psposvx.f000644 000766 000024 00000065567 10363532303 016252 0ustar00juliestaff000000 000000 SUBROUTINE PSPOSVX( FACT, UPLO, N, NRHS, A, IA, JA, DESCA, AF, $ IAF, JAF, DESCAF, EQUED, SR, SC, B, IB, JB, $ DESCB, X, IX, JX, DESCX, RCOND, FERR, BERR, $ WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER EQUED, FACT, UPLO INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, LIWORK, $ LWORK, N, NRHS REAL RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IWORK( * ) REAL A( * ), AF( * ), $ B( * ), BERR( * ), FERR( * ), $ SC( * ), SR( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PSPOSVX uses the Cholesky factorization A = U**T*U or A = L*L**T to * compute the solution to a real system of linear equations * * A(IA:IA+N-1,JA:JA+N-1) * X = B(IB:IB+N-1,JB:JB+NRHS-1), * * where A(IA:IA+N-1,JA:JA+N-1) is an N-by-N matrix and X and * B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS matrices. * * Error bounds on the solution and a condition estimate are also * provided. In the following comments Y denotes Y(IY:IY+M-1,JY:JY+K-1) * a M-by-K matrix where Y can be A, AF, B and X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Description * =========== * * The following steps are performed: * * 1. If FACT = 'E', real scaling factors are computed to equilibrate * the system: * diag(SR) * A * diag(SC) * inv(diag(SC)) * X = diag(SR) * B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(SR)*A*diag(SC) and B by diag(SR)*B. * * 2. If FACT = 'N' or 'E', the Cholesky decomposition is used to * factor the matrix A (after equilibration if FACT = 'E') as * A = U**T* U, if UPLO = 'U', or * A = L * L**T, if UPLO = 'L', * where U is an upper triangular matrix and L is a lower triangular * matrix. * * 3. The factored form of A is used to estimate the condition number * of the matrix A. If the reciprocal of the condition number is * less than machine precision, steps 4-6 are skipped. * * 4. The system of equations is solved for X using the factored form * of A. * * 5. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 6. If equilibration was used, the matrix X is premultiplied by * diag(SR) so that it solves the original system before * equilibration. * * Arguments * ========= * * FACT (global input) CHARACTER * Specifies whether or not the factored form of the matrix A is * supplied on entry, and if not, whether the matrix A should be * equilibrated before it is factored. * = 'F': On entry, AF contains the factored form of A. * If EQUED = 'Y', the matrix A has been equilibrated * with scaling factors given by S. A and AF will not * be modified. * = 'N': The matrix A will be copied to AF and factored. * = 'E': The matrix A will be equilibrated if necessary, then * copied to AF and factored. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrices B and X. NRHS >= 0. * * A (local input/local output) REAL pointer into * the local memory to an array of local dimension * ( LLD_A, LOCc(JA+N-1) ). * On entry, the symmetric matrix A, except if FACT = 'F' and * EQUED = 'Y', then A must contain the equilibrated matrix * diag(SR)*A*diag(SC). If UPLO = 'U', the leading * N-by-N upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. A is not modified if * FACT = 'F' or 'N', or if FACT = 'E' and EQUED = 'N' on exit. * * On exit, if FACT = 'E' and EQUED = 'Y', A is overwritten by * diag(SR)*A*diag(SC). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input or local output) REAL pointer * into the local memory to an array of local dimension * ( LLD_AF, LOCc(JA+N-1)). * If FACT = 'F', then AF is an input argument and on entry * contains the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T, in the same storage * format as A. If EQUED .ne. 'N', then AF is the factored form * of the equilibrated matrix diag(SR)*A*diag(SC). * * If FACT = 'N', then AF is an output argument and on exit * returns the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T of the original * matrix A. * * If FACT = 'E', then AF is an output argument and on exit * returns the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T of the equilibrated * matrix A (see the description of A for the form of the * equilibrated matrix). * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * EQUED (global input/global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration (always true if FACT = 'N'). * = 'Y': Equilibration was done, i.e., A has been replaced by * diag(SR) * A * diag(SC). * EQUED is an input variable if FACT = 'F'; otherwise, it is an * output variable. * * SR (local input/local output) REAL array, * dimension (LLD_A) * The scale factors for A distributed across process rows; * not accessed if EQUED = 'N'. SR is an input variable if * FACT = 'F'; otherwise, SR is an output variable. * If FACT = 'F' and EQUED = 'Y', each element of SR must be * positive. * * SC (local input/local output) REAL array, * dimension (LOC(N_A)) * The scale factors for A distributed across * process columns; not accessed if EQUED = 'N'. SC is an input * variable if FACT = 'F'; otherwise, SC is an output variable. * If FACT = 'F' and EQUED = 'Y', each element of SC must be * positive. * * B (local input/local output) REAL pointer into * the local memory to an array of local dimension * ( LLD_B, LOCc(JB+NRHS-1) ). * On entry, the N-by-NRHS right-hand side matrix B. * On exit, if EQUED = 'N', B is not modified; if TRANS = 'N' * and EQUED = 'R' or 'B', B is overwritten by diag(R)*B; if * TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is overwritten * by diag(C)*B. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input/local output) REAL pointer into * the local memory to an array of local dimension * ( LLD_X, LOCc(JX+NRHS-1) ). * If INFO = 0, the N-by-NRHS solution matrix X to the original * system of equations. Note that A and B are modified on exit * if EQUED .ne. 'N', and the solution to the equilibrated * system is inv(diag(SC))*X if TRANS = 'N' and EQUED = 'C' or * 'B', or inv(diag(SR))*X if TRANS = 'T' or 'C' and EQUED = 'R' * or 'B'. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * RCOND (global output) REAL * The estimate of the reciprocal condition number of the matrix * A after equilibration (if done). If RCOND is less than the * machine precision (in particular, if RCOND = 0), the matrix * is singular to working precision. This condition is * indicated by a return code of INFO > 0, and the solution and * error bounds are not computed. * * FERR (local output) REAL array, dimension (LOC(N_B)) * The estimated forward error bounds for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution, FERR(j) bounds the magnitude * of the largest entry in (X(j) - XTRUE) divided by * the magnitude of the largest entry in X(j). The quality of * the error bound depends on the quality of the estimate of * norm(inv(A)) computed in the code; if the estimate of * norm(inv(A)) is accurate, the error bound is guaranteed. * * BERR (local output) REAL array, dimension (LOC(N_B)) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any entry of A or B that makes X(j) an exact solution). * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = MAX( PSPOCON( LWORK ), PSPORFS( LWORK ) ) * + LOCr( N_A ). * LWORK = 3*DESCA( LLD_ ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK = DESCA( LLD_ ) * LIWORK = LOCr(N_A). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, and i is * <= N: if INFO = i, the leading minor of order i of A * is not positive definite, so the factorization * could not be completed, and the solution and error * bounds could not be computed. * = N+1: RCOND is less than machine precision. The * factorization has been completed, but the matrix * is singular to working precision, and the solution * and error bounds have not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL EQUIL, LQUERY, NOFACT, RCEQU INTEGER I, IACOL, IAROW, IAFROW, IBROW, IBCOL, ICOFF, $ ICOFFA, ICTXT, IDUMM, IIA, IIB, IIX, INFEQU, $ IROFF, IROFFA, IROFFAF, IROFFB, IROFFX, IXCOL, $ IXROW, J, JJA, JJB, JJX, LDB, LDX, LIWMIN, $ LWMIN, MYCOL, MYROW, NP, NPCOL, NPROW, NRHSQ, $ NQ REAL AMAX, ANORM, BIGNUM, SCOND, SMAX, SMIN, SMLNUM * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, INFOG2L, $ PSPOCON, PSPOEQU, PSPORFS, $ PSPOTRF, PSPOTRS, $ PSLACPY, PSLAQSY, PXERBLA, $ SGAMN2D, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC REAL PSLANSY, PSLAMCH EXTERNAL INDXG2P, LSAME, NUMROC, PSLANSY, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(800+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 8, INFO ) IF( LSAME( FACT, 'F' ) ) $ CALL CHK1MAT( N, 3, N, 3, IAF, JAF, DESCAF, 12, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 20, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IIA, JJA, IAROW, IACOL ) NP = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFFA LWMIN = 3*DESCA( LLD_ ) LIWMIN = NP NOFACT = LSAME( FACT, 'N' ) EQUIL = LSAME( FACT, 'E' ) IF( NOFACT .OR. EQUIL ) THEN EQUED = 'N' RCEQU = .FALSE. ELSE RCEQU = LSAME( EQUED, 'Y' ) SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM END IF IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. $ .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( ICOFFA.NE.0 .OR. IROFFA.NE.ICOFFA ) THEN INFO = -7 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(800+NB_) ELSE IF( IAFROW.NE.IAROW ) THEN INFO = -10 ELSE IF( IROFFAF.NE.0 ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -(1200+CTXT_) ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. $ ( RCEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN INFO = -13 ELSE IF( RCEQU ) THEN * SMIN = BIGNUM SMAX = ZERO DO 10 J = IIA, IIA + NP - 1 SMIN = MIN( SMIN, SR( J ) ) SMAX = MAX( SMAX, SR( J ) ) 10 CONTINUE CALL SGAMN2D( ICTXT, 'Columnwise', ' ', 1, 1, SMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, SMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( SMIN.LE.ZERO ) THEN INFO = -14 ELSE IF( N.GT.0 ) THEN SCOND = MAX( SMIN, SMLNUM ) / MIN( SMAX, BIGNUM ) ELSE SCOND = ONE END IF END IF END IF END IF * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IBROW.NE.IAROW ) THEN INFO = -18 ELSE IF( IXROW.NE.IBROW ) THEN INFO = -22 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2000+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(2000+CTXT_) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -(2400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -28 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -30 END IF IDUM1( 1 ) = ICHAR( FACT ) IDUM2( 1 ) = 1 IDUM1( 2 ) = ICHAR( UPLO ) IDUM2( 2 ) = 2 IF( LSAME( FACT, 'F' ) ) THEN IDUM1( 3 ) = ICHAR( EQUED ) IDUM2( 3 ) = 13 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 28 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 30 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, $ 4, IB, JB, DESCB, 19, 5, IDUM1, IDUM2, $ INFO ) ELSE IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 28 IF( LIWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 30 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, $ 4, IB, JB, DESCB, 19, 4, IDUM1, IDUM2, $ INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPOSVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * IF( EQUIL ) THEN * * Compute row and column scalings to equilibrate the matrix A. * CALL PSPOEQU( N, A, IA, JA, DESCA, SR, SC, SCOND, AMAX, $ INFEQU ) * IF( INFEQU.EQ.0 ) THEN * * Equilibrate the matrix. * CALL PSLAQSY( UPLO, N, A, IA, JA, DESCA, SR, SC, SCOND, $ AMAX, EQUED ) RCEQU = LSAME( EQUED, 'Y' ) END IF END IF * * Scale the right-hand side. * CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, $ JJB, IBROW, IBCOL ) LDB = DESCB( LLD_ ) IROFF = MOD( IB-1, DESCB( MB_ ) ) ICOFF = MOD( JB-1, DESCB( NB_ ) ) NP = NUMROC( N+IROFF, DESCB( MB_ ), MYROW, IBROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFF, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) IF( MYROW.EQ.IBROW ) NP = NP-IROFF IF( MYCOL.EQ.IBCOL ) NRHSQ = NRHSQ-ICOFF * IF( RCEQU ) THEN DO 30 J = JJB, JJB+NRHSQ-1 DO 20 I = IIB, IIB+NP-1 B( I + ( J-1 )*LDB ) = SR( I )*B( I + ( J-1 )*LDB ) 20 CONTINUE 30 CONTINUE END IF * IF( NOFACT .OR. EQUIL ) THEN * * Compute the Cholesky factorization A = U'*U or A = L*L'. * CALL PSLACPY( 'Full', N, N, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF ) CALL PSPOTRF( UPLO, N, AF, IAF, JAF, DESCAF, INFO ) * * Return if INFO is non-zero. * IF( INFO.NE.0 ) THEN IF( INFO.GT.0 ) $ RCOND = ZERO RETURN END IF END IF * * Compute the norm of the matrix A. * ANORM = PSLANSY( '1', UPLO, N, A, IA, JA, DESCA, WORK ) * * Compute the reciprocal of the condition number of A. * CALL PSPOCON( UPLO, N, AF, IAF, JAF, DESCAF, ANORM, RCOND, WORK, $ LWORK, IWORK, LIWORK, INFO ) * * Return if the matrix is singular to working precision. * IF( RCOND.LT.PSLAMCH( ICTXT, 'Epsilon' ) ) THEN INFO = IA + N RETURN END IF * * Compute the solution matrix X. * CALL PSLACPY( 'Full', N, NRHS, B, IB, JB, DESCB, X, IX, JX, $ DESCX ) CALL PSPOTRS( UPLO, N, NRHS, AF, IAF, JAF, DESCAF, X, IX, JX, $ DESCX, INFO ) * * Use iterative refinement to improve the computed solution and * compute error bounds and backward error estimates for it. * CALL PSPORFS( UPLO, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, B, IB, JB, DESCB, X, IX, JX, DESCX, FERR, $ BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * Transform the solution matrix X to a solution of the original * system. * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) LDX = DESCX( LLD_ ) IROFF = MOD( IX-1, DESCX( MB_ ) ) ICOFF = MOD( JX-1, DESCX( NB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYROW.EQ.IBROW ) NP = NP-IROFF IF( MYCOL.EQ.IBCOL ) NRHSQ = NRHSQ-ICOFF * IF( RCEQU ) THEN DO 50 J = JJX, JJX+NRHSQ-1 DO 40 I = IIX, IIX+NP-1 X( I + ( J-1 )*LDX ) = SR( I )*X( I + ( J-1 )*LDX ) 40 CONTINUE 50 CONTINUE DO 60 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / SCOND 60 CONTINUE END IF * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN RETURN * * End of PSPOSVX * END scalapack-2.0.2/SRC/pspotf2.f000644 000766 000024 00000030536 10363532303 016111 0ustar00juliestaff000000 000000 SUBROUTINE PSPOTF2( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSPOTF2 computes the Cholesky factorization of a real symmetric * positive definite distributed matrix sub( A )=A(IA:IA+N-1,JA:JA+N-1). * * The factorization has the form * * sub( A ) = U' * U , if UPLO = 'U', or * * sub( A ) = L * L', if UPLO = 'L', * * where U is an upper triangular matrix and L is lower triangular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires N <= NB_A-MOD(JA-1, NB_A) and square block * decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if UPLO = 'U', the upper triangular * part of the distributed matrix contains the Cholesky factor * U, if UPLO = 'L', the lower triangular part of the distribu- * ted matrix contains the Cholesky factor L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURR, IDIAG, IIA, $ IOFFA, IROFF, J, JJA, LDA, MYCOL, MYROW, $ NPCOL, NPROW REAL AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, IGEBR2D, $ IGEBS2D, INFOG2L, PB_TOPGET, PXERBLA, SGEMV, $ SSCAL * .. * .. Intrinsic Functions .. INTRINSIC MOD, SQRT * .. * .. External Functions .. LOGICAL LSAME REAL SDOT EXTERNAL LSAME, SDOT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N+ICOFF.GT.DESCA( NB_ ) ) THEN INFO = -2 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPOTF2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Compute local information * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF ( UPPER ) THEN * * Process (IAROW, IACOL) owns block to be factorized * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN * * Compute the Cholesky factorization A = U'*U. * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * DO 10 J = JA, JA+N-1 * * Compute U(J,J) and test for non-positive-definiteness. * AJJ = A( IDIAG ) - $ SDOT( J-JA, A( IOFFA ), 1, A( IOFFA ), 1 ) IF( AJJ.LE.ZERO ) THEN A( IDIAG ) = AJJ INFO = J - JA + 1 GO TO 20 END IF AJJ = SQRT( AJJ ) A( IDIAG ) = AJJ * * Compute elements J+1:JA+N-1 of row J. * IF( J.LT.JA+N-1 ) THEN ICURR = IDIAG + LDA CALL SGEMV( 'Transpose', J-JA, JA+N-J-1, -ONE, $ A( IOFFA+LDA ), LDA, A( IOFFA ), 1, $ ONE, A( ICURR ), LDA ) CALL SSCAL( N-J+JA-1, ONE / AJJ, A( ICURR ), LDA ) END IF IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + LDA 10 CONTINUE * 20 CONTINUE * * Broadcast INFO to all processes in my IAROW. * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1, $ MYROW, IACOL ) END IF * * IAROW bcasts along columns so that everyone has INFO * CALL IGEBS2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, 1, $ IAROW, MYCOL ) * END IF * ELSE * * Process (IAROW, IACOL) owns block to be factorized * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Compute the Cholesky factorization A = L*L'. * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * DO 30 J = JA, JA+N-1 * * Compute L(J,J) and test for non-positive-definiteness. * AJJ = A( IDIAG ) - $ SDOT( J-JA, A( IOFFA ), LDA, A( IOFFA ), LDA ) IF ( AJJ.LE.ZERO ) THEN A( IDIAG ) = AJJ INFO = J - JA + 1 GO TO 40 END IF AJJ = SQRT( AJJ ) A( IDIAG ) = AJJ * * Compute elements J+1:JA+N-1 of column J. * IF( J.LT.JA+N-1 ) THEN ICURR = IDIAG + 1 CALL SGEMV( 'No transpose', JA+N-J-1, J-JA, -ONE, $ A( IOFFA+1 ), LDA, A( IOFFA ), LDA, $ ONE, A( ICURR ), 1 ) CALL SSCAL( JA+N-J-1, ONE / AJJ, A( ICURR ), 1 ) END IF IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + 1 30 CONTINUE * 40 CONTINUE * * Broadcast INFO to everyone in IACOL * CALL IGEBS2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, $ 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, $ 1, IAROW, MYCOL ) * END IF * * IACOL bcasts INFO along rows so that everyone has it * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1, $ MYROW, IACOL ) * END IF * END IF * RETURN * * End of PSPOTF2 * END scalapack-2.0.2/SRC/pspotrf.f000644 000766 000024 00000031412 10363532303 016203 0ustar00juliestaff000000 000000 SUBROUTINE PSPOTRF( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSPOTRF computes the Cholesky factorization of an N-by-N real * symmetric positive definite distributed matrix sub( A ) denoting * A(IA:IA+N-1, JA:JA+N-1). * * The factorization has the form * * sub( A ) = U' * U , if UPLO = 'U', or * * sub( A ) = L * L', if UPLO = 'L', * * where U is an upper triangular matrix and L is lower triangular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if UPLO = 'U', the upper triangular * part of the distributed matrix contains the Cholesky factor * U, if UPLO = 'L', the lower triangular part of the distribu- * ted matrix contains the Cholesky factor L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER I, ICOFF, ICTXT, IROFF, J, JB, JN, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PSPOTF2, PSSYRK, PSTRSM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPOTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( UPPER ) THEN * * Split-ring topology for the communication along process * columns, 1-tree topology along process rows. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'S-ring' ) * * A is upper triangular, compute Cholesky factorization A = U'*U. * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA(NB_), JA+N-1 ) JB = JN - JA + 1 * * Perform unblocked Cholesky factorization on JB block * CALL PSPOTF2( UPLO, JB, A, IA, JA, DESCA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 * IF( JB+1.LE.N ) THEN * * Form the row panel of U using the triangular solver * CALL PSTRSM( 'Left', UPLO, 'Transpose', 'Non-Unit', $ JB, N-JB, ONE, A, IA, JA, DESCA, A, IA, JA+JB, $ DESCA ) * * Update the trailing matrix, A = A - U'*U * CALL PSSYRK( UPLO, 'Transpose', N-JB, JB, -ONE, A, IA, $ JA+JB, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) END IF * * Loop over remaining block of columns * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Perform unblocked Cholesky factorization on JB block * CALL PSPOTF2( UPLO, JB, A, I, J, DESCA, INFO ) IF( INFO.NE.0 ) THEN INFO = INFO + J - JA GO TO 30 END IF * IF( J-JA+JB+1.LE.N ) THEN * * Form the row panel of U using the triangular solver * CALL PSTRSM( 'Left', UPLO, 'Transpose', 'Non-Unit', $ JB, N-J-JB+JA, ONE, A, I, J, DESCA, A, $ I, J+JB, DESCA ) * * Update the trailing matrix, A = A - U'*U * CALL PSSYRK( UPLO, 'Transpose', N-J-JB+JA, JB, $ -ONE, A, I, J+JB, DESCA, ONE, A, I+JB, $ J+JB, DESCA ) END IF 10 CONTINUE * ELSE * * 1-tree topology for the communication along process columns, * Split-ring topology along process rows. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * A is lower triangular, compute Cholesky factorization A = L*L' * (right-looking) * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) JB = JN - JA + 1 * * Perform unblocked Cholesky factorization on JB block * CALL PSPOTF2( UPLO, JB, A, IA, JA, DESCA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 * IF( JB+1.LE.N ) THEN * * Form the column panel of L using the triangular solver * CALL PSTRSM( 'Right', UPLO, 'Transpose', 'Non-Unit', $ N-JB, JB, ONE, A, IA, JA, DESCA, A, IA+JB, JA, $ DESCA ) * * Update the trailing matrix, A = A - L*L' * CALL PSSYRK( UPLO, 'No Transpose', N-JB, JB, -ONE, A, IA+JB, $ JA, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * END IF * DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Perform unblocked Cholesky factorization on JB block * CALL PSPOTF2( UPLO, JB, A, I, J, DESCA, INFO ) IF( INFO.NE.0 ) THEN INFO = INFO + J - JA GO TO 30 END IF * IF( J-JA+JB+1.LE.N ) THEN * * Form the column panel of L using the triangular solver * CALL PSTRSM( 'Right', UPLO, 'Transpose', 'Non-Unit', $ N-J-JB+JA, JB, ONE, A, I, J, DESCA, A, I+JB, $ J, DESCA ) * * Update the trailing matrix, A = A - L*L' * CALL PSSYRK( UPLO, 'No Transpose', N-J-JB+JA, JB, -ONE, $ A, I+JB, J, DESCA, ONE, A, I+JB, J+JB, $ DESCA ) * END IF 20 CONTINUE * END IF * 30 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PSPOTRF * END scalapack-2.0.2/SRC/pspotri.f000644 000766 000024 00000017145 10363532303 016215 0ustar00juliestaff000000 000000 SUBROUTINE PSPOTRI( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSPOTRI computes the inverse of a real symmetric positive definite * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the * Cholesky factorization sub( A ) = U**T*U or L*L**T computed by * PSPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor U or L * from the Cholesky factorization of the distributed matrix * sub( A ) = U**T*U or L*L**T, as computed by PSPOTRF. * On exit, the local pieces of the upper or lower triangle of * the (symmetric) inverse of sub( A ), overwriting the input * factor U or L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, the (i,i) element of the factor U or L is * zero, and the inverse could not be computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER ICOFF, ICTXT, IROFF, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSLAUUM, $ PSTRTRI, PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.NE.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPOTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Invert the triangular Cholesky factor U or L. * CALL PSTRTRI( UPLO, 'Non-unit', N, A, IA, JA, DESCA, INFO ) * IF( INFO.GT.0 ) $ RETURN * * Form inv(U)*inv(U)' or inv(L)'*inv(L). * CALL PSLAUUM( UPLO, N, A, IA, JA, DESCA ) * RETURN * * End of PSPOTRI * END scalapack-2.0.2/SRC/pspotrs.f000644 000766 000024 00000023715 10363532303 016227 0ustar00juliestaff000000 000000 SUBROUTINE PSPOTRS( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ) * .. * * Purpose * ======= * * PSPOTRS solves a system of linear equations * * sub( A ) * X = sub( B ) * A(IA:IA+N-1,JA:JA+N-1)*X = B(IB:IB+N-1,JB:JB+NRHS-1) * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is a N-by-N * symmetric positive definite distributed matrix using the Cholesky * factorization sub( A ) = U**T*U or L*L**T computed by PSPOTRF. * sub( B ) denotes the distributed matrix B(IB:IB+N-1,JB:JB+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) REAL pointer into local memory to * an array of dimension (LLD_A, LOCc(JA+N-1)). On entry, this * array contains the factors L or U from the Cholesky facto- * rization sub( A ) = L*L**T or U**T*U, as computed by PSPOTRF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) REAL pointer into the * local memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * the local pieces of the right hand sides sub( B ). * On exit, this array contains the local pieces of the solution * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IAROW, IBROW, ICTXT, IROFFA, IROFFB, ICOFFA, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSTRSM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 11, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1100+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, $ 3, IB, JB, DESCB, 11, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPOTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Solve sub( A ) * X = sub( B ) where sub( A ) = U'*U. * * Solve U'*X = sub( B ), overwriting sub( B ) with X. * CALL PSTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve U*X = sub( B ), overwriting sub( B ) with X. * CALL PSTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) ELSE * * Solve sub( A ) *X = sub( B ) where sub( A ) = L*L'. * * Solve L*X = sub( B ), overwriting sub( B ) with X. * CALL PSTRSM( 'Left', 'Lower', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve L'*X = sub( B ), overwriting sub( B ) with X. * CALL PSTRSM( 'Left', 'Lower', 'Transpose', 'Non-unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) END IF * RETURN * * End of PSPOTRS * END scalapack-2.0.2/SRC/psptsv.f000644 000766 000024 00000045224 10363532303 016053 0ustar00juliestaff000000 000000 SUBROUTINE PSPTSV( N, NRHS, D, E, JA, DESCA, B, IB, DESCB, WORK, $ LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL B( * ), D( * ), E( * ), WORK( * ) * .. * * * Purpose * ======= * * PSPTSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N real * tridiagonal symmetric positive definite distributed * matrix. * * Cholesky factorization is used to factor a reordering of * the matrix into L L'. * * See PSPTTRF and PSPTTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) REAL pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) REAL pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (12*NPCOL + 3*NB) * +max((10+2*min(100,NRHS))*NPCOL+4*NRHS, 8*NPCOL) * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PSPTTRF, PSPTTRS, PXERBLA * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PSPTTRF and PSPTTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSEIF( DESCA( DTYPE_ ) .EQ. 502 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 5*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PSPTSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (12*NPCOL + 3*NB) * * Factor the matrix * CALL PSPTTRF( N, D, E, JA, DESCA, WORK, MIN( LWORK, WS_FACTOR ), $ WORK( 1+WS_FACTOR ), LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PSPTSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PSPTTRS( N, NRHS, D, E, JA, DESCA, B, IB, DESCB, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSPTSV', -INFO ) RETURN END IF * RETURN * * End of PSPTSV * END scalapack-2.0.2/SRC/pspttrf.f000644 000766 000024 00000103731 10363532303 016214 0ustar00juliestaff000000 000000 SUBROUTINE PSPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. INTEGER INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL AF( * ), D( * ), E( * ), WORK( * ) * .. * * * Purpose * ======= * * PSPTTRF computes a Cholesky factorization * of an N-by-N real tridiagonal * symmetric positive definite distributed matrix * A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PSPTTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = U' D U or * * P A(1:N, JA:JA+N-1) P^T = L D L', * * where U is a tridiagonal upper triangular matrix and L is tridiagonal * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * D (local input/local output) REAL pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) REAL pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSPTTRF and this is stored in AF. If a linear system * is to be solved using PSPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 8*NPCOL * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, INT_TEMP, JA_NEW, $ LAF_MIN, LEVEL_DIST, LLDA, MYCOL, MYROW, $ MY_NUM_COLS, NB, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, PART_OFFSET, PART_SIZE, RETURN_CODE, $ STORE_N_A, TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, IGAMX2D, IGEBR2D, IGEBS2D, PXERBLA, $ RESHAPE, SGERV2D, SGESD2D, SPTTRF, SPTTRSV, $ STRRV2D, STRSD2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MOD, REAL * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 5*100+2 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -9 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 5*100+6 ) END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 5*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PSPTTRF, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 5*100+4 ) CALL PXERBLA( ICTXT, 'PSPTTRF, D&C alg.: NB too small', -INFO ) RETURN END IF * * * Check auxiliary storage size * LAF_MIN = ( 12*NPCOL+3*NB ) * IF( LAF.LT.LAF_MIN ) THEN INFO = -7 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, 'PSPTTRF: auxiliary storage error ', $ -INFO ) RETURN END IF * * Check worksize * WORK_SIZE_MIN = 8*NPCOL * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -9 CALL PXERBLA( ICTXT, 'PSPTTRF: worksize error ', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 7, 1 ) = DESCA( 5 ) PARAM_CHECK( 6, 1 ) = DESCA( 4 ) PARAM_CHECK( 5, 1 ) = DESCA( 3 ) PARAM_CHECK( 4, 1 ) = DESCA( 1 ) PARAM_CHECK( 3, 1 ) = JA PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 7, 2 ) = 505 PARAM_CHECK( 6, 2 ) = 504 PARAM_CHECK( 5, 2 ) = 503 PARAM_CHECK( 4, 2 ) = 501 PARAM_CHECK( 3, 2 ) = 4 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 9 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 7, PARAM_CHECK, 7, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSPTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 90 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = ZERO 10 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * IF( MYCOL.LT.NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL STRSD2D( ICTXT, 'U', 'N', 1, 1, $ E( PART_OFFSET+ODD_SIZE+1 ), LLDA-1, 0, MYCOL+1 ) * END IF * * * Factor main partition A_i = L_i {L_i}^T in each processor * Or A_i = {U_i}^T {U_i} if E is the upper superdiagonal * CALL SPTTRF( ODD_SIZE, D( PART_OFFSET+1 ), E( PART_OFFSET+1 ), $ INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL + 1 GO TO 20 END IF * * IF( MYCOL.LT.NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * * Perform the triangular system solve {L_i}{{B'}_i}^T = {B_i}^T * by dividing B_i by diagonal element * E( PART_OFFSET+ODD_SIZE ) = E( PART_OFFSET+ODD_SIZE ) / $ D( PART_OFFSET+ODD_SIZE ) * * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^T * D( PART_OFFSET+ODD_SIZE+1 ) = D( PART_OFFSET+ODD_SIZE+1 ) - $ D( PART_OFFSET+ODD_SIZE )* $ ( E( PART_OFFSET+ODD_SIZE )* $ ( E( PART_OFFSET+ODD_SIZE ) ) ) * END IF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 20 CONTINUE * If the processor could not locally factor, it jumps here. * IF( MYCOL.NE.0 ) THEN * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL STRRV2D( ICTXT, 'U', 'N', 1, 1, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF( INFO.EQ.0 ) THEN * * Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ . * CALL SPTTRSV( 'N', ODD_SIZE, INT_ONE, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), AF( 1 ), ODD_SIZE, INFO ) * * Divide by D * DO 30 I = 1, ODD_SIZE AF( I ) = AF( I ) / D( PART_OFFSET+I ) 30 CONTINUE * * * Calculate the update block for previous proc, E_i = G_i{G_i}^T * * * Since there is no element-by-element vector multiplication in * the BLAS, this loop must be hardwired in without a BLAS call * INT_TEMP = ODD_SIZE*INT_ONE + 2 + 1 AF( INT_TEMP ) = 0 * DO 40 I = 1, ODD_SIZE AF( INT_TEMP ) = AF( INT_TEMP ) - $ D( PART_OFFSET+I )*( AF( I )* $ ( AF( I ) ) ) 40 CONTINUE * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+3 ), $ INT_ONE, 0, MYCOL-1 ) * * IF( MYCOL.LT.NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in transpose form. * {F_i}^T = {H_i}{{B'}_i}^T * AF( ODD_SIZE+1 ) = -D( PART_OFFSET+ODD_SIZE )* $ ( E( PART_OFFSET+ODD_SIZE )* $ AF( ODD_SIZE ) ) * * END IF * END IF * End of "if ( MYCOL .ne. 0 )..." * END IF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * IF( INFO.NE.0 ) THEN GO TO 80 END IF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 70 END IF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * END IF * * Copy last diagonal block into AF storage for subsequent * operations. * AF( ODD_SIZE+2 ) = REAL( D( PART_OFFSET+ODD_SIZE+1 ) ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL+1 ) * * Add contribution to diagonal block * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + AF( ODD_SIZE+3 ) * END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 50 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 60 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 ) * END IF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 50 60 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... IF( AF( ODD_SIZE+2 ).EQ.ZERO ) THEN INFO = NPCOL + MYCOL END IF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST.EQ.1 ) THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * AF( ODD_SIZE+3 ) = AF( ODD_SIZE+1 ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST / 2 END IF * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * IF( INFO.EQ.0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * AF( ODD_SIZE+1 ) = AF( ODD_SIZE+1 ) / AF( ODD_SIZE+2 ) * END IF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * WORK( 1 ) = -ONE*AF( ODD_SIZE+1 )*AF( ODD_SIZE+2 )* $ ( AF( ODD_SIZE+1 ) ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST.GT.1 ) THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST / 2 ) * END IF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * AF( ODD_SIZE+3 ) = ( AF( ODD_SIZE+3 ) ) / AF( ODD_SIZE+2 ) * END IF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*AF( ODD_SIZE+2 )* $ ( AF( ODD_SIZE+3 ) ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST END IF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*AF( ODD_SIZE+2 )* $ AF( ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * END IF * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 70 CONTINUE * * 80 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 90 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0, $ 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) END IF * * RETURN * * End of PSPTTRF * END scalapack-2.0.2/SRC/pspttrs.f000644 000766 000024 00000064237 10363532303 016240 0ustar00juliestaff000000 000000 SUBROUTINE PSPTTRS( N, NRHS, D, E, JA, DESCA, B, IB, DESCB, AF, $ LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL AF( * ), B( * ), D( * ), E( * ), WORK( * ) * .. * * * Purpose * ======= * * PSPTTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PSPTTRF. * A(1:N, JA:JA+N-1) is an N-by-N real * tridiagonal symmetric positive definite distributed * matrix. * * Routine PSPTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) REAL pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) REAL pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * IMPORTANT NOTE: The current version of this code supports * only IB=JA * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSPTTRF and this is stored in AF. If a linear system * is to be solved using PSPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (10+2*min(100,NRHS))*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, I, ICTXT, ICTXT_NEW, $ ICTXT_SAVE, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, $ MYROW, MY_NUM_COLS, NB, NP, NPCOL, NPROW, $ NP_SAVE, ODD_SIZE, PART_OFFSET, PART_SIZE, $ RETURN_CODE, STORE_M_B, STORE_N_A, TEMP, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 14, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PSPTTRSV, PXERBLA, RESHAPE, SSCAL * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD, REAL * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 5*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 8*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 8*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 8*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 8*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LWORK.LT.-1 ) THEN INFO = -12 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -1 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 5*100+6 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 8*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 8*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -2 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -4 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 5*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, 'PSPTTRS, D&C alg.: only 1 block per proc' $ , -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 5*100+4 ) CALL PXERBLA( ICTXT, 'PSPTTRS, D&C alg.: NB too small', -INFO ) RETURN END IF * * WORK_SIZE_MIN = ( 10+2*MIN( 100, NRHS ) )*NPCOL + 4*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -12 CALL PXERBLA( ICTXT, 'PSPTTRS: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 14, 1 ) = DESCB( 5 ) PARAM_CHECK( 13, 1 ) = DESCB( 4 ) PARAM_CHECK( 12, 1 ) = DESCB( 3 ) PARAM_CHECK( 11, 1 ) = DESCB( 2 ) PARAM_CHECK( 10, 1 ) = DESCB( 1 ) PARAM_CHECK( 9, 1 ) = IB PARAM_CHECK( 8, 1 ) = DESCA( 5 ) PARAM_CHECK( 7, 1 ) = DESCA( 4 ) PARAM_CHECK( 6, 1 ) = DESCA( 3 ) PARAM_CHECK( 5, 1 ) = DESCA( 1 ) PARAM_CHECK( 4, 1 ) = JA PARAM_CHECK( 3, 1 ) = NRHS PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 14, 2 ) = 905 PARAM_CHECK( 13, 2 ) = 904 PARAM_CHECK( 12, 2 ) = 903 PARAM_CHECK( 11, 2 ) = 902 PARAM_CHECK( 10, 2 ) = 901 PARAM_CHECK( 9, 2 ) = 8 PARAM_CHECK( 8, 2 ) = 505 PARAM_CHECK( 7, 2 ) = 504 PARAM_CHECK( 6, 2 ) = 503 PARAM_CHECK( 5, 2 ) = 501 PARAM_CHECK( 4, 2 ) = 4 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 12 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 14, PARAM_CHECK, 14, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSPTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 30 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * * CALL PSPTTRSV( 'L', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * * Divide by the main diagonal: B <- D^{-1} B * * The main partition is first * DO 10 I = PART_OFFSET + 1, PART_OFFSET + ODD_SIZE CALL SSCAL( NRHS, REAL( ONE / D( I ) ), B( I ), LLDB ) 10 CONTINUE * * Reduced system is next * IF( MYCOL.LT.NPCOL-1 ) THEN I = PART_OFFSET + ODD_SIZE + 1 CALL SSCAL( NRHS, ONE / AF( ODD_SIZE+2 ), B( I ), LLDB ) END IF * * Call backsolve routine * * CALL PSPTTRSV( 'U', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) 20 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 30 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PSPTTRS * END scalapack-2.0.2/SRC/pspttrsv.f000644 000766 000024 00000110170 10363532303 016412 0ustar00juliestaff000000 000000 SUBROUTINE PSPTTRSV( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, $ AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 3, 2000 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL AF( * ), B( * ), D( * ), E( * ), WORK( * ) * .. * * * Purpose * ======= * * PSPTTRSV solves a tridiagonal triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a tridiagonal * triangular matrix factor produced by the * Cholesky factorization code PSPTTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T * is dictated by the user by the parameter TRANS. * * Routine PSPTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) REAL pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) REAL pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) REAL pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) REAL array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PSPTTRF and this is stored in AF. If a linear system * is to be solved using PSPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * REAL temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (10+2*min(100,NRHS))*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, JA_NEW, LEVEL_DIST, LLDA, LLDB, $ MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_M_B, STORE_N_A, $ TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 15, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PXERBLA, RESHAPE, SAXPY, SGEMM, $ SGERV2D, SGESD2D, SMATADD, SPTTRSV, STRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP.EQ.502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 END IF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 7*100+2 ) END IF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE.NE.0 ) THEN INFO = -( 10*100+2 ) END IF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN INFO = -( 10*100+2 ) END IF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN INFO = -( 10*100+4 ) END IF * * Source processor must be the same * IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN INFO = -( 10*100+5 ) END IF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW*NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK.LT.-1 ) THEN INFO = -14 ELSE IF( LWORK.EQ.-1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 END IF * IF( N.LT.0 ) THEN INFO = -2 END IF * IF( N+JA-1.GT.STORE_N_A ) THEN INFO = -( 7*100+6 ) END IF * IF( N+IB-1.GT.STORE_M_B ) THEN INFO = -( 10*100+3 ) END IF * IF( LLDB.LT.NB ) THEN INFO = -( 10*100+6 ) END IF * IF( NRHS.LT.0 ) THEN INFO = -3 END IF * * Current alignment restriction * IF( JA.NE.IB ) THEN INFO = -6 END IF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW.NE.1 ) THEN INFO = -( 7*100+2 ) END IF * IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PSPTTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN END IF * IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, 'PSPTTRSV, D&C alg.: NB too small', $ -INFO ) RETURN END IF * * WORK_SIZE_MIN = INT_ONE*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK.LT.WORK_SIZE_MIN ) THEN IF( LWORK.NE.-1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, 'PSPTTRSV: worksize error', -INFO ) END IF RETURN END IF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 15, 1 ) = DESCB( 5 ) PARAM_CHECK( 14, 1 ) = DESCB( 4 ) PARAM_CHECK( 13, 1 ) = DESCB( 3 ) PARAM_CHECK( 12, 1 ) = DESCB( 2 ) PARAM_CHECK( 11, 1 ) = DESCB( 1 ) PARAM_CHECK( 10, 1 ) = IB PARAM_CHECK( 9, 1 ) = DESCA( 5 ) PARAM_CHECK( 8, 1 ) = DESCA( 4 ) PARAM_CHECK( 7, 1 ) = DESCA( 3 ) PARAM_CHECK( 6, 1 ) = DESCA( 1 ) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = NRHS PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 15, 2 ) = 1005 PARAM_CHECK( 14, 2 ) = 1004 PARAM_CHECK( 13, 2 ) = 1003 PARAM_CHECK( 12, 2 ) = 1002 PARAM_CHECK( 11, 2 ) = 1001 PARAM_CHECK( 10, 2 ) = 9 PARAM_CHECK( 9, 2 ) = 705 PARAM_CHECK( 8, 2 ) = 704 PARAM_CHECK( 7, 2 ) = 703 PARAM_CHECK( 6, 2 ) = 701 PARAM_CHECK( 5, 2 ) = 6 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO*DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 15, PARAM_CHECK, 15, PARAM_CHECK( 1, 3 ), $ INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PSPTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) ) * IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN PART_OFFSET = PART_OFFSET + NB END IF * IF( MYCOL.LT.CSRC ) THEN PART_OFFSET = PART_OFFSET - NB END IF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 ) / NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC, $ INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW.LT.0 ) THEN GO TO 100 END IF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF( MYCOL.EQ.0 ) THEN PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE ) END IF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF( MYCOL.LT.NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE END IF * * * * Begin main code * IF( LSAME( UPLO, 'L' ) ) THEN * * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL SPTTRSV( 'N', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL SAXPY( NRHS, -E( PART_OFFSET+ODD_SIZE ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL SGEMM( 'T', 'N', 1, NRHS, ODD_SIZE, -ONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, ZERO, $ WORK( 1+INT_ONE-1 ), INT_ONE ) END IF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL+1 ) * * Combine contribution to locally stored right hand sides * CALL SMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 30 END IF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 10 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 20 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * CALL SMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * CALL SMATADD( INT_ONE, NRHS, ONE, WORK( 1 ), INT_ONE, ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * END IF * LEVEL_DIST = LEVEL_DIST*2 * GO TO 10 20 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL STRTRS( 'L', 'N', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 90 END IF * * * * ********* IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ( ODD_SIZE )*1+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL SGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ODD_SIZE*1+2+1 ), INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO, $ WORK( 1 ), INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL SGESD2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 30 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL.EQ.NPCOL-1 ) THEN GO TO 80 END IF * * Determine number of steps in tree loop * LEVEL_DIST = 1 40 CONTINUE IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 ) $ GO TO 50 * LEVEL_DIST = LEVEL_DIST*2 * GO TO 40 50 CONTINUE * * IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND. $ ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL SGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ODD_SIZE*1+2+1 ), INT_ONE, WORK( 1 ), $ INT_ONE, ONE, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) END IF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN * * Receive solution from processor to right * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL SGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -ONE, $ AF( ( ODD_SIZE )*1+1 ), INT_ONE, WORK( 1 ), $ INT_ONE, ONE, B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * END IF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL STRTRS( 'L', 'T', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 90 END IF * * * ***Modification Loop ******* * 60 CONTINUE IF( LEVEL_DIST.EQ.1 ) $ GO TO 70 * LEVEL_DIST = LEVEL_DIST / 2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+LEVEL_DIST ) * END IF * * Send solution to left * IF( MYCOL-LEVEL_DIST.GE.0 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL-LEVEL_DIST ) * END IF * GO TO 60 70 CONTINUE * [End of GOTO Loop] * 80 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL.LT.NPCOL-1 ) THEN * CALL SGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0, $ MYCOL+1 ) * END IF * * Receive modifications to processor's right hand sides * IF( MYCOL.GT.0 ) THEN * CALL SGERV2D( ICTXT, INT_ONE, NRHS, WORK( 1 ), INT_ONE, 0, $ MYCOL-1 ) * END IF * * * ********************************************** * Local computation phase ********************************************** * IF( MYCOL.NE.0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL SGEMM( 'N', 'N', ODD_SIZE, NRHS, 1, -ONE, AF( 1 ), $ ODD_SIZE, WORK( 1+INT_ONE-1 ), INT_ONE, ONE, $ B( PART_OFFSET+1 ), LLDB ) * END IF * * IF( MYCOL.LT.NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL SAXPY( NRHS, -( E( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * END IF * * Use main partition in each processor to solve locally * CALL SPTTRSV( 'T', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * END IF * End of "IF( LSAME( UPLO, 'L' ) )"... 90 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 100 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PSPTTRSV * END scalapack-2.0.2/SRC/psrot.f000644 000766 000024 00000041442 11705175572 015675 0ustar00juliestaff000000 000000 SUBROUTINE PSROT( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, $ INCY, CS, SN, WORK, LWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER N, IX, JX, INCX, IY, JY, INCY, LWORK, INFO REAL CS, SN * .. * .. Array Arguments .. INTEGER DESCX( * ), DESCY( * ) REAL X( * ), Y( * ), WORK( * ) * .. * * Purpose * ======= * PSROT applies a planar rotation defined by CS and SN to the * two distributed vectors sub(X) and sub(Y). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of elements to operate on when applying the planar * rotation to X and Y. N>=0. * * X (local input/local output) DOUBLE PRECSION array of dimension * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. If INCX = 1, then it is required * that IX = IY. 1 <= IX <= M_X. * * JX (global input) INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. If INCX = M_X, then it is required * that JX = JY. 1 <= IX <= N_X. * * DESCX (global and local input) INTEGER array of dimension 9 * The array descriptor of the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * Moreover, it must hold that INCX = M_X if INCY = M_Y and * that INCX = 1 if INCY = 1. * * Y (local input/local output) DOUBLE PRECSION array of dimension * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * * IY (global input) INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. If INCY = 1, then it is required * that IY = IX. 1 <= IY <= M_Y. * * JY (global input) INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. If INCY = M_X, then it is required * that JY = JX. 1 <= JY <= N_Y. * * DESCY (global and local input) INTEGER array of dimension 9 * The array descriptor of the distributed matrix Y. * * INCY (global input) INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * Moreover, it must hold that INCY = M_Y if INCX = M_X and * that INCY = 1 if INCX = 1. * * CS (global input) REAL * SN (global input) REAL * The parameters defining the properties of the planar * rotation. It must hold that 0 <= CS,SN <= 1 and that * SN**2 + CS**2 = 1. The latter is hardly checked in * finite precision arithmetics. * * WORK (local input) REAL array of dimension LWORK * Local workspace area. * * LWORK (local input) INTEGER * The length of the workspace array WORK. * If INCX = 1 and INCY = 1, then LWORK = 2*MB_X * * If LWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the WORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value. * If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then INFO = -i. * * Additional requirements * ======================= * * The following alignment requirements must hold: * (a) DESCX( MB_ ) = DESCY( MB_ ) and DESCX( NB_ ) = DESCY( NB_ ) * (b) DESCX( RSRC_ ) = DESCY( RSRC_ ) * (c) DESCX( CSRC_ ) = DESCY( CSRC_ ) * * ===================================================================== * * Written by Robert Granat, May 15, 2007. * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY, LEFT, RIGHT INTEGER ICTXT, NPROW, NPCOL, MYROW, MYCOL, NPROCS, $ MB, NB, XYROWS, XYCOLS, RSRC1, RSRC2, CSRC1, $ CSRC2, ICOFFXY, IROFFXY, MNWRK, LLDX, LLDY, $ INDX, JXX, XLOC1, XLOC2, RSRC, CSRC, YLOC1, $ YLOC2, JYY, IXX, IYY * .. * .. External Functions .. INTEGER NUMROC, INDXG2P, INDXG2L EXTERNAL NUMROC, INDXG2P, INDXG2L * .. * .. External Subroutines .. EXTERNAL SROT * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Local Functions .. INTEGER ICEIL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL * * Test and decode parameters * LQUERY = LWORK.EQ.-1 INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSEIF( IX.LT.1 .OR. IX.GT.DESCX(M_) ) THEN INFO = -3 ELSEIF( JX.LT.1 .OR. JX.GT.DESCX(N_) ) THEN INFO = -4 ELSEIF( INCX.NE.1 .AND. INCX.NE.DESCX(M_) ) THEN INFO = -6 ELSEIF( IY.LT.1 .OR. IY.GT.DESCY(M_) ) THEN INFO = -8 ELSEIF( JY.LT.1 .OR. JY.GT.DESCY(N_) ) THEN INFO = -9 ELSEIF( INCY.NE.1 .AND. INCY.NE.DESCY(M_) ) THEN INFO = -11 ELSEIF( (INCX.EQ.DESCX(M_) .AND. INCY.NE.DESCY(M_)) .OR. $ (INCX.EQ.1 .AND. INCY.NE.1 ) ) THEN INFO = -11 ELSEIF( (INCX.EQ.1 .AND. INCY.EQ.1) .AND. $ IX.NE.IY ) THEN INFO = -8 ELSEIF( (INCX.EQ.DESCX(M_) .AND. INCY.EQ.DESCY(M_)) .AND. $ JX.NE.JY ) THEN INFO = -9 END IF * * Compute the direction of the planar rotation * LEFT = INCX.EQ.DESCX(M_) .AND. INCY.EQ.DESCY(M_) RIGHT = INCX.EQ.1 .AND. INCY.EQ.1 * * Check blocking factors and root processor * IF( INFO.EQ.0 ) THEN IF( LEFT .AND. DESCX(NB_).NE.DESCY(NB_) ) THEN INFO = -(100*5 + NB_) END IF IF( RIGHT .AND. DESCX(MB_).NE.DESCY(NB_) ) THEN INFO = -(100*10 + MB_) END IF END IF IF( INFO.EQ.0 ) THEN IF( LEFT .AND. DESCX(CSRC_).NE.DESCY(CSRC_) ) THEN INFO = -(100*5 + CSRC_) END IF IF( RIGHT .AND. DESCX(RSRC_).NE.DESCY(RSRC_) ) THEN INFO = -(100*10 + RSRC_) END IF END IF * * Compute workspace * MB = DESCX( MB_ ) NB = DESCX( NB_ ) IF( LEFT ) THEN RSRC1 = INDXG2P( IX, MB, MYROW, DESCX(RSRC_), NPROW ) RSRC2 = INDXG2P( IY, MB, MYROW, DESCY(RSRC_), NPROW ) CSRC = INDXG2P( JX, NB, MYCOL, DESCX(CSRC_), NPCOL ) ICOFFXY = MOD( JX - 1, NB ) XYCOLS = NUMROC( N+ICOFFXY, NB, MYCOL, CSRC, NPCOL ) IF( ( MYROW.EQ.RSRC1 .OR. MYROW.EQ.RSRC2 ) .AND. $ MYCOL.EQ.CSRC ) XYCOLS = XYCOLS - ICOFFXY IF( RSRC1.NE.RSRC2 ) THEN MNWRK = XYCOLS ELSE MNWRK = 0 END IF ELSEIF( RIGHT ) THEN CSRC1 = INDXG2P( JX, NB, MYCOL, DESCX(CSRC_), NPCOL ) CSRC2 = INDXG2P( JY, NB, MYCOL, DESCY(CSRC_), NPCOL ) RSRC = INDXG2P( IX, MB, MYROW, DESCX(RSRC_), NPROW ) IROFFXY = MOD( IX - 1, MB ) XYROWS = NUMROC( N+IROFFXY, MB, MYROW, RSRC, NPROW ) IF( ( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC2 ) .AND. $ MYROW.EQ.RSRC ) XYROWS = XYROWS - IROFFXY IF( CSRC1.NE.CSRC2 ) THEN MNWRK = XYROWS ELSE MNWRK = 0 END IF END IF IF( INFO.EQ.0 ) THEN IF( .NOT.LQUERY . AND. LWORK.LT.MNWRK ) INFO = -15 END IF * * Return if some argument is incorrect * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSROT', -INFO ) RETURN ELSEIF( LQUERY ) THEN WORK( 1 ) = FLOAT(MNWRK) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN RETURN END IF * * Extract local leading dimensions * LLDX = DESCX( LLD_ ) LLDY = DESCY( LLD_ ) * * If we have only one process, use the corresponding LAPACK * routine and return * IF( NPROCS.EQ.1 ) THEN IF( LEFT ) THEN CALL SROT( N, X((JX-1)*LLDX+IX), LLDX, Y((JY-1)*LLDY+IY), $ LLDY, CS, SN ) ELSEIF( RIGHT ) THEN CALL SROT( N, X((JX-1)*LLDX+IX), 1, Y((JY-1)*LLDY+IY), $ 1, CS, SN ) END IF RETURN END IF * * Exchange data between processors if necessary and perform planar * rotation * IF( LEFT ) THEN DO 10 INDX = 1, NPCOL IF( MYROW.EQ.RSRC1 .AND. XYCOLS.GT.0 ) THEN IF( INDX.EQ.1 ) THEN JXX = JX ELSE JXX = JX-ICOFFXY + (INDX-1)*NB END IF CALL INFOG2L( IX, JXX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, XLOC1, XLOC2, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN IF( RSRC1.NE.RSRC2 ) THEN CALL SGESD2D( ICTXT, 1, XYCOLS, $ X((XLOC2-1)*LLDX+XLOC1), LLDX, $ RSRC2, CSRC ) CALL SGERV2D( ICTXT, 1, XYCOLS, WORK, 1, $ RSRC2, CSRC ) CALL SROT( XYCOLS, X((XLOC2-1)*LLDX+XLOC1), $ LLDX, WORK, 1, CS, SN ) ELSE CALL INFOG2L( IY, JXX, DESCY, NPROW, NPCOL, $ MYROW, MYCOL, YLOC1, YLOC2, RSRC, $ CSRC ) CALL SROT( XYCOLS, X((XLOC2-1)*LLDX+XLOC1), $ LLDX, Y((YLOC2-1)*LLDY+YLOC1), LLDY, CS, $ SN ) END IF END IF END IF IF( MYROW.EQ.RSRC2 .AND. RSRC1.NE.RSRC2 ) THEN IF( INDX.EQ.1 ) THEN JYY = JY ELSE JYY = JY-ICOFFXY + (INDX-1)*NB END IF CALL INFOG2L( IY, JYY, DESCY, NPROW, NPCOL, MYROW, $ MYCOL, YLOC1, YLOC2, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN CALL SGESD2D( ICTXT, 1, XYCOLS, $ Y((YLOC2-1)*LLDY+YLOC1), LLDY, $ RSRC1, CSRC ) CALL SGERV2D( ICTXT, 1, XYCOLS, WORK, 1, $ RSRC1, CSRC ) CALL SROT( XYCOLS, WORK, 1, Y((YLOC2-1)*LLDY+YLOC1), $ LLDY, CS, SN ) END IF END IF 10 CONTINUE ELSEIF( RIGHT ) THEN DO 20 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 .AND. XYROWS.GT.0 ) THEN IF( INDX.EQ.1 ) THEN IXX = IX ELSE IXX = IX-IROFFXY + (INDX-1)*MB END IF CALL INFOG2L( IXX, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, XLOC1, XLOC2, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN IF( CSRC1.NE.CSRC2 ) THEN CALL SGESD2D( ICTXT, XYROWS, 1, $ X((XLOC2-1)*LLDX+XLOC1), LLDX, $ RSRC, CSRC2 ) CALL SGERV2D( ICTXT, XYROWS, 1, WORK, XYROWS, $ RSRC, CSRC2 ) CALL SROT( XYROWS, X((XLOC2-1)*LLDX+XLOC1), $ 1, WORK, 1, CS, SN ) ELSE CALL INFOG2L( IXX, JY, DESCY, NPROW, NPCOL, $ MYROW, MYCOL, YLOC1, YLOC2, RSRC, $ CSRC ) CALL SROT( XYROWS, X((XLOC2-1)*LLDX+XLOC1), $ 1, Y((YLOC2-1)*LLDY+YLOC1), 1, CS, $ SN ) END IF END IF END IF IF( MYCOL.EQ.CSRC2 .AND. CSRC1.NE.CSRC2 ) THEN IF( INDX.EQ.1 ) THEN IYY = IY ELSE IYY = IY-IROFFXY + (INDX-1)*MB END IF CALL INFOG2L( IYY, JY, DESCY, NPROW, NPCOL, MYROW, $ MYCOL, YLOC1, YLOC2, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN CALL SGESD2D( ICTXT, XYROWS, 1, $ Y((YLOC2-1)*LLDY+YLOC1), LLDY, $ RSRC, CSRC1 ) CALL SGERV2D( ICTXT, XYROWS, 1, WORK, XYROWS, $ RSRC, CSRC1 ) CALL SROT( XYROWS, WORK, 1, Y((YLOC2-1)*LLDY+YLOC1), $ 1, CS, SN ) END IF END IF 20 CONTINUE END IF * * Store minimum workspace requirements in WORK-array and return * WORK( 1 ) = FLOAT(MNWRK) RETURN * * End of PSROT * END scalapack-2.0.2/SRC/psrscl.f000644 000766 000024 00000016256 10363532303 016025 0ustar00juliestaff000000 000000 SUBROUTINE PSRSCL( N, SA, SX, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N REAL SA * .. * .. Array Arguments .. INTEGER DESCX( * ) REAL SX( * ) * .. * * Purpose * ======= * * PSRSCL multiplies an N-element real distributed vector sub( X ) by * the real scalar 1/a. This is done without overflow or underflow as * long as the final result sub( X )/a does not overflow or underflow. * * where sub( X ) denotes X(IX:IX+N-1,JX:JX), if INCX = 1, * X(IX:IX,JX:JX+N-1), if INCX = M_X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) pointer to INTEGER * The number of components of the distributed vector sub( X ). * N >= 0. * * SA (global input) REAL * The scalar a which is used to divide each component of * sub( X ). SA must be >= 0, or the subroutine will divide by * zero. * * SX (local input/local output) REAL array * containing the local pieces of a distributed matrix of * dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL DONE INTEGER ICTXT, MYCOL, MYROW, NPCOL, NPROW REAL BIGNUM, CDEN, CDEN1, CNUM, CNUM1, MUL, SMLNUM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PSLABAD, PSSCAL * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Get machine parameters * SMLNUM = PSLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM CALL PSLABAD( ICTXT, SMLNUM, BIGNUM ) * * Initialize the denominator to SA and the numerator to 1. * CDEN = SA CNUM = ONE * 10 CONTINUE CDEN1 = CDEN*SMLNUM CNUM1 = CNUM / BIGNUM IF( ABS( CDEN1 ).GT.ABS( CNUM ) .AND. CNUM.NE.ZERO ) THEN * * Pre-multiply sub( X ) by SMLNUM if CDEN is large compared to * CNUM. * MUL = SMLNUM DONE = .FALSE. CDEN = CDEN1 ELSE IF( ABS( CNUM1 ).GT.ABS( CDEN ) ) THEN * * Pre-multiply sub( X ) by BIGNUM if CDEN is small compared to * CNUM. * MUL = BIGNUM DONE = .FALSE. CNUM = CNUM1 ELSE * * Multiply sub( X ) by CNUM / CDEN and return. * MUL = CNUM / CDEN DONE = .TRUE. END IF * * Scale the vector sub( X ) by MUL * CALL PSSCAL( N, MUL, SX, IX, JX, DESCX, INCX ) * IF( .NOT.DONE ) $ GO TO 10 * RETURN * * End of PSRSCL * END scalapack-2.0.2/SRC/psstebz.f000644 000766 000024 00000143774 10363532303 016217 0ustar00juliestaff000000 000000 SUBROUTINE PSSTEBZ( ICTXT, RANGE, ORDER, N, VL, VU, IL, IU, $ ABSTOL, D, E, M, NSPLIT, W, IBLOCK, ISPLIT, $ WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER ORDER, RANGE INTEGER ICTXT, IL, INFO, IU, LIWORK, LWORK, M, N, $ NSPLIT REAL ABSTOL, VL, VU * .. * .. Array Arguments .. INTEGER IBLOCK( * ), ISPLIT( * ), IWORK( * ) REAL D( * ), E( * ), W( * ), WORK( * ) * .. * * Purpose * ======= * * PSSTEBZ computes the eigenvalues of a symmetric tridiagonal matrix in * parallel. The user may ask for all eigenvalues, all eigenvalues in * the interval [VL, VU], or the eigenvalues indexed IL through IU. A * static partitioning of work is done at the beginning of PSSTEBZ which * results in all processes finding an (almost) equal number of * eigenvalues. * * NOTE : It is assumed that the user is on an IEEE machine. If the user * is not on an IEEE mchine, set the compile time flag NO_IEEE * to 1 (in SLmake.inc). The features of IEEE arithmetic that * are needed for the "fast" Sturm Count are : (a) infinity * arithmetic (b) the sign bit of a double precision floating * point number is assumed be in the 32nd or 64th bit position * (c) the sign of negative zero. * * See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal * Matrix", Report CS41, Computer Science Dept., Stanford * University, July 21, 1966. * * Arguments * ========= * * ICTXT (global input) INTEGER * The BLACS context handle. * * RANGE (global input) CHARACTER * Specifies which eigenvalues are to be found. * = 'A': ("All") all eigenvalues will be found. * = 'V': ("Value") all eigenvalues in the interval * [VL, VU] will be found. * = 'I': ("Index") the IL-th through IU-th eigenvalues (of the * entire matrix) will be found. * * ORDER (global input) CHARACTER * Specifies the order in which the eigenvalues and their block * numbers are stored in W and IBLOCK. * = 'B': ("By Block") the eigenvalues will be grouped by * split-off block (see IBLOCK, ISPLIT) and * ordered from smallest to largest within * the block. * = 'E': ("Entire matrix") * the eigenvalues for the entire matrix * will be ordered from smallest to largest. * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Eigenvalues less than VL will not be * returned. Not referenced if RANGE='A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Eigenvalues greater than VU will not be * returned. VU must be greater than VL. Not referenced if * RANGE='A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL must be at least 1. * Not referenced if RANGE='A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. IU must be at least IL * and no greater than N. Not referenced if RANGE='A' or 'V'. * * ABSTOL (global input) REAL * The absolute tolerance for the eigenvalues. An eigenvalue * (or cluster) is considered to be located if it has been * determined to lie in an interval whose width is ABSTOL or * less. If ABSTOL is less than or equal to zero, then ULP*|T| * will be used, where |T| means the 1-norm of T. * Eigenvalues will be computed most accurately when ABSTOL is * set to the underflow threshold SLAMCH('U'), not zero. * Note : If eigenvectors are desired later by inverse iteration * ( PSSTEIN ), ABSTOL should be set to 2*PSLAMCH('S'). * * D (global input) REAL array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. To * avoid overflow, the matrix must be scaled so that its largest * entry is no greater than overflow**(1/2) * underflow**(1/4) * in absolute value, and for greatest accuracy, it should not * be much smaller than that. * * E (global input) REAL array, dimension (N-1) * The (n-1) off-diagonal elements of the tridiagonal matrix T. * To avoid overflow, the matrix must be scaled so that its * largest entry is no greater than overflow**(1/2) * * underflow**(1/4) in absolute value, and for greatest * accuracy, it should not be much smaller than that. * * M (global output) INTEGER * The actual number of eigenvalues found. 0 <= M <= N. * (See also the description of INFO=2) * * NSPLIT (global output) INTEGER * The number of diagonal blocks in the matrix T. * 1 <= NSPLIT <= N. * * W (global output) REAL array, dimension (N) * On exit, the first M elements of W contain the eigenvalues * on all processes. * * IBLOCK (global output) INTEGER array, dimension (N) * At each row/column j where E(j) is zero or small, the * matrix T is considered to split into a block diagonal * matrix. On exit IBLOCK(i) specifies which block (from 1 * to the number of blocks) the eigenvalue W(i) belongs to. * NOTE: in the (theoretically impossible) event that bisection * does not converge for some or all eigenvalues, INFO is set * to 1 and the ones for which it did not are identified by a * negative block number. * * ISPLIT (global output) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N. * (Only the first NSPLIT elements will actually be used, but * since the user cannot know a priori what value NSPLIT will * have, N words must be reserved for ISPLIT.) * * WORK (local workspace) REAL array, dimension ( MAX( 5*N, 7 ) ) * * LWORK (local input) INTEGER * size of array WORK must be >= MAX( 5*N, 7 ) * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace) INTEGER array, dimension ( MAX( 4*N, 14 ) ) * * LIWORK (local input) INTEGER * size of array IWORK must be >= MAX( 4*N, 14, NPROCS ) * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0 : successful exit * < 0 : if INFO = -i, the i-th argument had an illegal value * > 0 : some or all of the eigenvalues failed to converge or * were not computed: * = 1 : Bisection failed to converge for some eigenvalues; * these eigenvalues are flagged by a negative block * number. The effect is that the eigenvalues may not * be as accurate as the absolute and relative * tolerances. This is generally caused by arithmetic * which is less accurate than PSLAMCH says. * = 2 : There is a mismatch between the number of * eigenvalues output and the number desired. * = 3 : RANGE='i', and the Gershgorin interval initially * used was incorrect. No eigenvalues were computed. * Probable cause: your machine has sloppy floating * point arithmetic. * Cure: Increase the PARAMETER "FUDGE", recompile, * and try again. * * Internal Parameters * =================== * * RELFAC REAL, default = 2.0 * The relative tolerance. An interval [a,b] lies within * "relative tolerance" if b-a < RELFAC*ulp*max(|a|,|b|), * where "ulp" is the machine precision (distance from 1 to * the next larger floating point number.) * * FUDGE REAL, default = 2.0 * A "fudge factor" to widen the Gershgorin intervals. Ideally, * a value of 1 should work, but on machines with sloppy * arithmetic, this needs to be larger. The default for * publicly released versions should be large enough to handle * the worst machine around. Note that this has no effect * on the accuracy of the solution. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, ICHAR, MAX, MIN, MOD, REAL * .. * .. External Functions .. LOGICAL LSAME INTEGER BLACS_PNUM REAL PSLAMCH EXTERNAL LSAME, BLACS_PNUM, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_FREEBUFF, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDMAP, GLOBCHK, $ IGEBR2D, IGEBS2D, IGERV2D, IGESD2D, IGSUM2D, $ PSLAEBZ, PSLAIECT, PSLAPDCT, PSLASNBT, PXERBLA, $ SGEBR2D, SGEBS2D, SGERV2D, SGESD2D, SLASRT2 * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER BIGNUM, DESCMULT PARAMETER ( BIGNUM = 10000, DESCMULT = 100 ) REAL ZERO, ONE, TWO, FIVE, HALF PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0, $ FIVE = 5.0E+0, HALF = 1.0E+0 / TWO ) REAL FUDGE, RELFAC PARAMETER ( FUDGE = 2.0E+0, RELFAC = 2.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER BLKNO, FOUND, I, IBEGIN, IEFLAG, IEND, IFRST, $ IINFO, ILAST, ILOAD, IM, IMYLOAD, IN, INDRIW1, $ INDRIW2, INDRW1, INDRW2, INXTLOAD, IOFF, $ IORDER, IOUT, IRANGE, IRECV, IREM, ITMP1, $ ITMP2, J, JB, K, LAST, LEXTRA, LREQ, MYCOL, $ MYROW, NALPHA, NBETA, NCMP, NEIGINT, NEXT, NGL, $ NGLOB, NGU, NINT, NPCOL, NPROW, OFFSET, $ ONEDCONTEXT, P, PREV, REXTRA, RREQ, SELF, $ TORECV REAL ALPHA, ATOLI, BETA, BNORM, DRECV, DSEND, GL, $ GU, INITVL, INITVU, LSAVE, MID, PIVMIN, RELTOL, $ SAFEMN, TMP1, TMP2, TNORM, ULP * .. * .. Local Arrays .. INTEGER IDUM( 5, 2 ) * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Set up process grid * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 M = 0 * * Decode RANGE * IF( LSAME( RANGE, 'A' ) ) THEN IRANGE = 1 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IRANGE = 2 ELSE IF( LSAME( RANGE, 'I' ) ) THEN IRANGE = 3 ELSE IRANGE = 0 END IF * * Decode ORDER * IF( LSAME( ORDER, 'B' ) ) THEN IORDER = 2 ELSE IF( LSAME( ORDER, 'E' ) .OR. LSAME( ORDER, 'A' ) ) THEN IORDER = 1 ELSE IORDER = 0 END IF * * Check for Errors * IF( NPROW.EQ.-1 ) THEN INFO = -1 ELSE * * Get machine constants * SAFEMN = PSLAMCH( ICTXT, 'S' ) ULP = PSLAMCH( ICTXT, 'P' ) RELTOL = ULP*RELFAC IDUM( 1, 1 ) = ICHAR( RANGE ) IDUM( 1, 2 ) = 2 IDUM( 2, 1 ) = ICHAR( ORDER ) IDUM( 2, 2 ) = 3 IDUM( 3, 1 ) = N IDUM( 3, 2 ) = 4 NGLOB = 5 IF( IRANGE.EQ.3 ) THEN IDUM( 4, 1 ) = IL IDUM( 4, 2 ) = 7 IDUM( 5, 1 ) = IU IDUM( 5, 2 ) = 8 ELSE IDUM( 4, 1 ) = 0 IDUM( 4, 2 ) = 0 IDUM( 5, 1 ) = 0 IDUM( 5, 2 ) = 0 END IF IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WORK( 1 ) = ABSTOL IF( IRANGE.EQ.2 ) THEN WORK( 2 ) = VL WORK( 3 ) = VU ELSE WORK( 2 ) = ZERO WORK( 3 ) = ZERO END IF CALL SGEBS2D( ICTXT, 'ALL', ' ', 3, 1, WORK, 3 ) ELSE CALL SGEBR2D( ICTXT, 'ALL', ' ', 3, 1, WORK, 3, 0, 0 ) END IF LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IRANGE.EQ.0 ) THEN INFO = -2 ELSE IF( IORDER.EQ.0 ) THEN INFO = -3 ELSE IF( IRANGE.EQ.2 .AND. VL.GE.VU ) THEN INFO = -5 ELSE IF( IRANGE.EQ.3 .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, $ N ) ) ) THEN INFO = -6 ELSE IF( IRANGE.EQ.3 .AND. ( IU.LT.MIN( N, $ IL ) .OR. IU.GT.N ) ) THEN INFO = -7 ELSE IF( LWORK.LT.MAX( 5*N, 7 ) .AND. .NOT.LQUERY ) THEN INFO = -18 ELSE IF( LIWORK.LT.MAX( 4*N, 14, NPROW*NPCOL ) .AND. .NOT. $ LQUERY ) THEN INFO = -20 ELSE IF( IRANGE.EQ.2 .AND. ( ABS( WORK( 2 )-VL ).GT.FIVE* $ ULP*ABS( VL ) ) ) THEN INFO = -5 ELSE IF( IRANGE.EQ.2 .AND. ( ABS( WORK( 3 )-VU ).GT.FIVE* $ ULP*ABS( VU ) ) ) THEN INFO = -6 ELSE IF( ABS( WORK( 1 )-ABSTOL ).GT.FIVE*ULP*ABS( ABSTOL ) ) $ THEN INFO = -9 END IF END IF IF( INFO.EQ.0 ) $ INFO = BIGNUM CALL GLOBCHK( ICTXT, NGLOB, IDUM, 5, IWORK, INFO ) IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF END IF WORK( 1 ) = REAL( MAX( 5*N, 7 ) ) IWORK( 1 ) = MAX( 4*N, 14, NPROW*NPCOL ) * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSTEBZ', -INFO ) RETURN ELSE IF( LWORK.EQ.-1 .AND. LIWORK.EQ.-1 ) THEN RETURN END IF * * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * K = 1 DO 20 I = 0, NPROW - 1 DO 10 J = 0, NPCOL - 1 IWORK( K ) = BLACS_PNUM( ICTXT, I, J ) K = K + 1 10 CONTINUE 20 CONTINUE * P = NPROW*NPCOL NPROW = 1 NPCOL = P * CALL BLACS_GET( ICTXT, 10, ONEDCONTEXT ) CALL BLACS_GRIDMAP( ONEDCONTEXT, IWORK, NPROW, NPROW, NPCOL ) CALL BLACS_GRIDINFO( ONEDCONTEXT, I, J, K, SELF ) * * Simplifications: * IF( IRANGE.EQ.3 .AND. IL.EQ.1 .AND. IU.EQ.N ) $ IRANGE = 1 * NEXT = MOD( SELF+1, P ) PREV = MOD( P+SELF-1, P ) * * Compute squares of off-diagonals, splitting points and pivmin. * Interleave diagonals and off-diagonals. * INDRW1 = MAX( 2*N, 4 ) INDRW2 = INDRW1 + 2*N INDRIW1 = MAX( 2*N, 8 ) NSPLIT = 1 WORK( INDRW1+2*N ) = ZERO PIVMIN = ONE * DO 30 I = 1, N - 1 TMP1 = E( I )**2 J = 2*I WORK( INDRW1+J-1 ) = D( I ) IF( ABS( D( I+1 )*D( I ) )*ULP**2+SAFEMN.GT.TMP1 ) THEN ISPLIT( NSPLIT ) = I NSPLIT = NSPLIT + 1 WORK( INDRW1+J ) = ZERO ELSE WORK( INDRW1+J ) = TMP1 PIVMIN = MAX( PIVMIN, TMP1 ) END IF 30 CONTINUE WORK( INDRW1+2*N-1 ) = D( N ) ISPLIT( NSPLIT ) = N PIVMIN = PIVMIN*SAFEMN * * Compute Gershgorin interval [gl,gu] for entire matrix * GU = D( 1 ) GL = D( 1 ) TMP1 = ZERO * DO 40 I = 1, N - 1 TMP2 = ABS( E( I ) ) GU = MAX( GU, D( I )+TMP1+TMP2 ) GL = MIN( GL, D( I )-TMP1-TMP2 ) TMP1 = TMP2 40 CONTINUE GU = MAX( GU, D( N )+TMP1 ) GL = MIN( GL, D( N )-TMP1 ) TNORM = MAX( ABS( GL ), ABS( GU ) ) GL = GL - FUDGE*TNORM*ULP*N - FUDGE*TWO*PIVMIN GU = GU + FUDGE*TNORM*ULP*N + FUDGE*PIVMIN * IF( ABSTOL.LE.ZERO ) THEN ATOLI = ULP*TNORM ELSE ATOLI = ABSTOL END IF * * Find out if on an IEEE machine, the sign bit is the * 32nd bit (Big Endian) or the 64th bit (Little Endian) * IF( IRANGE.EQ.1 .OR. NSPLIT.EQ.1 ) THEN CALL PSLASNBT( IEFLAG ) ELSE IEFLAG = 0 END IF LEXTRA = 0 REXTRA = 0 * * Form Initial Interval containing desired eigenvalues * IF( IRANGE.EQ.1 ) THEN INITVL = GL INITVU = GU WORK( 1 ) = GL WORK( 2 ) = GU IWORK( 1 ) = 0 IWORK( 2 ) = N IFRST = 1 ILAST = N ELSE IF( IRANGE.EQ.2 ) THEN IF( VL.GT.GL ) THEN IF( IEFLAG.EQ.0 ) THEN CALL PSLAPDCT( VL, N, WORK( INDRW1+1 ), PIVMIN, IFRST ) ELSE CALL PSLAIECT( VL, N, WORK( INDRW1+1 ), IFRST ) END IF IFRST = IFRST + 1 INITVL = VL ELSE INITVL = GL IFRST = 1 END IF IF( VU.LT.GU ) THEN IF( IEFLAG.EQ.0 ) THEN CALL PSLAPDCT( VU, N, WORK( INDRW1+1 ), PIVMIN, ILAST ) ELSE CALL PSLAIECT( VU, N, WORK( INDRW1+1 ), ILAST ) END IF INITVU = VU ELSE INITVU = GU ILAST = N END IF WORK( 1 ) = INITVL WORK( 2 ) = INITVU IWORK( 1 ) = IFRST - 1 IWORK( 2 ) = ILAST ELSE IF( IRANGE.EQ.3 ) THEN WORK( 1 ) = GL WORK( 2 ) = GU IWORK( 1 ) = 0 IWORK( 2 ) = N IWORK( 5 ) = IL - 1 IWORK( 6 ) = IU CALL PSLAEBZ( 0, N, 2, 1, ATOLI, RELTOL, PIVMIN, $ WORK( INDRW1+1 ), IWORK( 5 ), WORK, IWORK, NINT, $ LSAVE, IEFLAG, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 3 GO TO 230 END IF IF( NINT.GT.1 ) THEN IF( IWORK( 5 ).EQ.IL-1 ) THEN WORK( 2 ) = WORK( 4 ) IWORK( 2 ) = IWORK( 4 ) ELSE WORK( 1 ) = WORK( 3 ) IWORK( 1 ) = IWORK( 3 ) END IF IF( IWORK( 1 ).LT.0 .OR. IWORK( 1 ).GT.IL-1 .OR. $ IWORK( 2 ).LE.MIN( IU-1, IWORK( 1 ) ) .OR. $ IWORK( 2 ).GT.N ) THEN INFO = 3 GO TO 230 END IF END IF LEXTRA = IL - 1 - IWORK( 1 ) REXTRA = IWORK( 2 ) - IU INITVL = WORK( 1 ) INITVU = WORK( 2 ) IFRST = IL ILAST = IU END IF * NVL = IFRST - 1 * NVU = ILAST GL = INITVL GU = INITVU NGL = IWORK( 1 ) NGU = IWORK( 2 ) IM = 0 FOUND = 0 INDRIW2 = INDRIW1 + NGU - NGL IEND = 0 IF( IFRST.GT.ILAST ) $ GO TO 100 IF( IFRST.EQ.1 .AND. ILAST.EQ.N ) $ IRANGE = 1 * * Find Eigenvalues -- Loop Over Blocks * DO 90 JB = 1, NSPLIT IOFF = IEND IBEGIN = IOFF + 1 IEND = ISPLIT( JB ) IN = IEND - IOFF IF( JB.NE.1 ) THEN IF( IRANGE.NE.1 ) THEN FOUND = IM * * Find total number of eigenvalues found thus far * CALL IGSUM2D( ONEDCONTEXT, 'All', ' ', 1, 1, FOUND, 1, $ -1, -1 ) ELSE FOUND = IOFF END IF END IF * IF( SELF.GE.P ) * $ GO TO 30 IF( IN.NE.N ) THEN * * Compute Gershgorin interval [gl,gu] for split matrix * GU = D( IBEGIN ) GL = D( IBEGIN ) TMP1 = ZERO * DO 50 J = IBEGIN, IEND - 1 TMP2 = ABS( E( J ) ) GU = MAX( GU, D( J )+TMP1+TMP2 ) GL = MIN( GL, D( J )-TMP1-TMP2 ) TMP1 = TMP2 50 CONTINUE * GU = MAX( GU, D( IEND )+TMP1 ) GL = MIN( GL, D( IEND )-TMP1 ) BNORM = MAX( ABS( GL ), ABS( GU ) ) GL = GL - FUDGE*BNORM*ULP*IN - FUDGE*PIVMIN GU = GU + FUDGE*BNORM*ULP*IN + FUDGE*PIVMIN * * Compute ATOLI for the current submatrix * IF( ABSTOL.LE.ZERO ) THEN ATOLI = ULP*BNORM ELSE ATOLI = ABSTOL END IF * IF( GL.LT.INITVL ) THEN GL = INITVL IF( IEFLAG.EQ.0 ) THEN CALL PSLAPDCT( GL, IN, WORK( INDRW1+2*IOFF+1 ), $ PIVMIN, NGL ) ELSE CALL PSLAIECT( GL, IN, WORK( INDRW1+2*IOFF+1 ), NGL ) END IF ELSE NGL = 0 END IF IF( GU.GT.INITVU ) THEN GU = INITVU IF( IEFLAG.EQ.0 ) THEN CALL PSLAPDCT( GU, IN, WORK( INDRW1+2*IOFF+1 ), $ PIVMIN, NGU ) ELSE CALL PSLAIECT( GU, IN, WORK( INDRW1+2*IOFF+1 ), NGU ) END IF ELSE NGU = IN END IF IF( NGL.GE.NGU ) $ GO TO 90 WORK( 1 ) = GL WORK( 2 ) = GU IWORK( 1 ) = NGL IWORK( 2 ) = NGU END IF OFFSET = FOUND - NGL BLKNO = JB * * Do a static partitioning of work so that each process * has to find an (almost) equal number of eigenvalues * NCMP = NGU - NGL ILOAD = NCMP / P IREM = NCMP - ILOAD*P ITMP1 = MOD( SELF-FOUND, P ) IF( ITMP1.LT.0 ) $ ITMP1 = ITMP1 + P IF( ITMP1.LT.IREM ) THEN IMYLOAD = ILOAD + 1 ELSE IMYLOAD = ILOAD END IF IF( IMYLOAD.EQ.0 ) THEN GO TO 90 ELSE IF( IN.EQ.1 ) THEN WORK( INDRW2+IM+1 ) = WORK( INDRW1+2*IOFF+1 ) IWORK( INDRIW1+IM+1 ) = BLKNO IWORK( INDRIW2+IM+1 ) = OFFSET + 1 IM = IM + 1 GO TO 90 ELSE INXTLOAD = ILOAD ITMP2 = MOD( SELF+1-FOUND, P ) IF( ITMP2.LT.0 ) $ ITMP2 = ITMP2 + P IF( ITMP2.LT.IREM ) $ INXTLOAD = INXTLOAD + 1 LREQ = NGL + ITMP1*ILOAD + MIN( IREM, ITMP1 ) RREQ = LREQ + IMYLOAD IWORK( 5 ) = LREQ IWORK( 6 ) = RREQ TMP1 = WORK( 1 ) ITMP1 = IWORK( 1 ) CALL PSLAEBZ( 1, IN, 1, 1, ATOLI, RELTOL, PIVMIN, $ WORK( INDRW1+2*IOFF+1 ), IWORK( 5 ), WORK, $ IWORK, NINT, LSAVE, IEFLAG, IINFO ) ALPHA = WORK( 1 ) BETA = WORK( 2 ) NALPHA = IWORK( 1 ) NBETA = IWORK( 2 ) DSEND = BETA IF( NBETA.GT.RREQ+INXTLOAD ) THEN NBETA = RREQ DSEND = ALPHA END IF LAST = MOD( FOUND+MIN( NGU-NGL, P )-1, P ) IF( LAST.LT.0 ) $ LAST = LAST + P IF( SELF.NE.LAST ) THEN CALL SGESD2D( ONEDCONTEXT, 1, 1, DSEND, 1, 0, NEXT ) CALL IGESD2D( ONEDCONTEXT, 1, 1, NBETA, 1, 0, NEXT ) END IF IF( SELF.NE.MOD( FOUND, P ) ) THEN CALL SGERV2D( ONEDCONTEXT, 1, 1, DRECV, 1, 0, PREV ) CALL IGERV2D( ONEDCONTEXT, 1, 1, IRECV, 1, 0, PREV ) ELSE DRECV = TMP1 IRECV = ITMP1 END IF WORK( 1 ) = MAX( LSAVE, DRECV ) IWORK( 1 ) = IRECV ALPHA = MAX( ALPHA, WORK( 1 ) ) NALPHA = MAX( NALPHA, IRECV ) IF( BETA-ALPHA.LE.MAX( ATOLI, RELTOL*MAX( ABS( ALPHA ), $ ABS( BETA ) ) ) ) THEN MID = HALF*( ALPHA+BETA ) DO 60 J = OFFSET + NALPHA + 1, OFFSET + NBETA WORK( INDRW2+IM+1 ) = MID IWORK( INDRIW1+IM+1 ) = BLKNO IWORK( INDRIW2+IM+1 ) = J IM = IM + 1 60 CONTINUE WORK( 2 ) = ALPHA IWORK( 2 ) = NALPHA END IF END IF NEIGINT = IWORK( 2 ) - IWORK( 1 ) IF( NEIGINT.LE.0 ) $ GO TO 90 * * Call the main computational routine * CALL PSLAEBZ( 2, IN, NEIGINT, 1, ATOLI, RELTOL, PIVMIN, $ WORK( INDRW1+2*IOFF+1 ), IWORK, WORK, IWORK, $ IOUT, LSAVE, IEFLAG, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 1 END IF DO 80 I = 1, IOUT MID = HALF*( WORK( 2*I-1 )+WORK( 2*I ) ) IF( I.GT.IOUT-IINFO ) $ BLKNO = -BLKNO DO 70 J = OFFSET + IWORK( 2*I-1 ) + 1, $ OFFSET + IWORK( 2*I ) WORK( INDRW2+IM+1 ) = MID IWORK( INDRIW1+IM+1 ) = BLKNO IWORK( INDRIW2+IM+1 ) = J IM = IM + 1 70 CONTINUE 80 CONTINUE 90 CONTINUE * * Find out total number of eigenvalues computed * 100 CONTINUE M = IM CALL IGSUM2D( ONEDCONTEXT, 'ALL', ' ', 1, 1, M, 1, -1, -1 ) * * Move the eigenvalues found to their final destinations * DO 130 I = 1, P IF( SELF.EQ.I-1 ) THEN CALL IGEBS2D( ONEDCONTEXT, 'ALL', ' ', 1, 1, IM, 1 ) IF( IM.NE.0 ) THEN CALL IGEBS2D( ONEDCONTEXT, 'ALL', ' ', IM, 1, $ IWORK( INDRIW2+1 ), IM ) CALL SGEBS2D( ONEDCONTEXT, 'ALL', ' ', IM, 1, $ WORK( INDRW2+1 ), IM ) CALL IGEBS2D( ONEDCONTEXT, 'ALL', ' ', IM, 1, $ IWORK( INDRIW1+1 ), IM ) DO 110 J = 1, IM W( IWORK( INDRIW2+J ) ) = WORK( INDRW2+J ) IBLOCK( IWORK( INDRIW2+J ) ) = IWORK( INDRIW1+J ) 110 CONTINUE END IF ELSE CALL IGEBR2D( ONEDCONTEXT, 'ALL', ' ', 1, 1, TORECV, 1, 0, $ I-1 ) IF( TORECV.NE.0 ) THEN CALL IGEBR2D( ONEDCONTEXT, 'ALL', ' ', TORECV, 1, IWORK, $ TORECV, 0, I-1 ) CALL SGEBR2D( ONEDCONTEXT, 'ALL', ' ', TORECV, 1, WORK, $ TORECV, 0, I-1 ) CALL IGEBR2D( ONEDCONTEXT, 'ALL', ' ', TORECV, 1, $ IWORK( N+1 ), TORECV, 0, I-1 ) DO 120 J = 1, TORECV W( IWORK( J ) ) = WORK( J ) IBLOCK( IWORK( J ) ) = IWORK( N+J ) 120 CONTINUE END IF END IF 130 CONTINUE IF( NSPLIT.GT.1 .AND. IORDER.EQ.1 ) THEN * * Sort the eigenvalues * * DO 140 I = 1, M IWORK( M+I ) = I 140 CONTINUE CALL SLASRT2( 'I', M, W, IWORK( M+1 ), IINFO ) DO 150 I = 1, M IWORK( I ) = IBLOCK( I ) 150 CONTINUE DO 160 I = 1, M IBLOCK( I ) = IWORK( IWORK( M+I ) ) 160 CONTINUE END IF IF( IRANGE.EQ.3 .AND. ( LEXTRA.GT.0 .OR. REXTRA.GT.0 ) ) THEN * * Discard unwanted eigenvalues (occurs only when RANGE = 'I', * and eigenvalues IL, and/or IU are in a cluster) * DO 170 I = 1, M WORK( I ) = W( I ) IWORK( I ) = I IWORK( M+I ) = I 170 CONTINUE DO 190 I = 1, LEXTRA ITMP1 = I DO 180 J = I + 1, M IF( WORK( J ).LT.WORK( ITMP1 ) ) THEN ITMP1 = J END IF 180 CONTINUE TMP1 = WORK( I ) WORK( I ) = WORK( ITMP1 ) WORK( ITMP1 ) = TMP1 IWORK( IWORK( M+ITMP1 ) ) = I IWORK( IWORK( M+I ) ) = ITMP1 ITMP2 = IWORK( M+I ) IWORK( M+I ) = IWORK( M+ITMP1 ) IWORK( M+ITMP1 ) = ITMP2 190 CONTINUE DO 210 I = 1, REXTRA ITMP1 = M - I + 1 DO 200 J = M - I, LEXTRA + 1, -1 IF( WORK( J ).GT.WORK( ITMP1 ) ) THEN ITMP1 = J END IF 200 CONTINUE TMP1 = WORK( M-I+1 ) WORK( M-I+1 ) = WORK( ITMP1 ) WORK( ITMP1 ) = TMP1 IWORK( IWORK( M+ITMP1 ) ) = M - I + 1 IWORK( IWORK( 2*M-I+1 ) ) = ITMP1 ITMP2 = IWORK( 2*M-I+1 ) IWORK( 2*M-I+1 ) = IWORK( M+ITMP1 ) IWORK( M+ITMP1 ) = ITMP2 * IWORK( ITMP1 ) = 1 210 CONTINUE J = 0 DO 220 I = 1, M IF( IWORK( I ).GT.LEXTRA .AND. IWORK( I ).LE.M-REXTRA ) THEN J = J + 1 W( J ) = WORK( IWORK( I ) ) IBLOCK( J ) = IBLOCK( I ) END IF 220 CONTINUE M = M - LEXTRA - REXTRA END IF IF( M.NE.ILAST-IFRST+1 ) THEN INFO = 2 END IF * 230 CONTINUE CALL BLACS_FREEBUFF( ONEDCONTEXT, 1 ) CALL BLACS_GRIDEXIT( ONEDCONTEXT ) RETURN * * End of PSSTEBZ * END * SUBROUTINE PSLAEBZ( IJOB, N, MMAX, MINP, ABSTOL, RELTOL, PIVMIN, $ D, NVAL, INTVL, INTVLCT, MOUT, LSAVE, IEFLAG, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * * .. Scalar Arguments .. INTEGER IEFLAG, IJOB, INFO, MINP, MMAX, MOUT, N REAL ABSTOL, LSAVE, PIVMIN, RELTOL * .. * .. Array Arguments .. INTEGER INTVLCT( * ), NVAL( * ) REAL D( * ), INTVL( * ) * .. * * Purpose * ======= * * PSLAEBZ contains the iteration loop which computes the eigenvalues * contained in the input intervals [ INTVL(2*j-1), INTVL(2*j) ] where * j = 1,...,MINP. It uses and computes the function N(w), which is * the count of eigenvalues of a symmetric tridiagonal matrix less than * or equal to its argument w. * * This is a ScaLAPACK internal subroutine and arguments are not * checked for unreasonable values. * * Arguments * ========= * * IJOB (input) INTEGER * Specifies the computation done by PSLAEBZ * = 0 : Find an interval with desired values of N(w) at the * endpoints of the interval. * = 1 : Find a floating point number contained in the initial * interval with a desired value of N(w). * = 2 : Perform bisection iteration to find eigenvalues of T. * * N (input) INTEGER * The order of the tridiagonal matrix T. N >= 1. * * MMAX (input) INTEGER * The maximum number of intervals that may be generated. If * more than MMAX intervals are generated, then PSLAEBZ will * quit with INFO = MMAX+1. * * MINP (input) INTEGER * The initial number of intervals. MINP <= MMAX. * * ABSTOL (input) REAL * The minimum (absolute) width of an interval. When an interval * is narrower than ABSTOL, or than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be sufficiently * small, i.e., converged. * This must be at least zero. * * RELTOL (input) REAL * The minimum relative width of an interval. When an interval * is narrower than ABSTOL, or than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be sufficiently * small, i.e., converged. * Note : This should be at least radix*machine epsilon. * * PIVMIN (input) REAL * The minimum absolute of a "pivot" in the "paranoid" * implementation of the Sturm sequence loop. This must be at * least max_j |e(j)^2| *safe_min, and at least safe_min, where * safe_min is at least the smallest number that can divide 1.0 * without overflow. * See PSLAPDCT for the "paranoid" implementation of the Sturm * sequence loop. * * D (input) REAL array, dimension (2*N - 1) * Contains the diagonals and the squares of the off-diagonal * elements of the tridiagonal matrix T. These elements are * assumed to be interleaved in memory for better cache * performance. The diagonal entries of T are in the entries * D(1),D(3),...,D(2*N-1), while the squares of the off-diagonal * entries are D(2),D(4),...,D(2*N-2). To avoid overflow, the * matrix must be scaled so that its largest entry is no greater * than overflow**(1/2) * underflow**(1/4) in absolute value, * and for greatest accuracy, it should not be much smaller * than that. * * NVAL (input/output) INTEGER array, dimension (4) * If IJOB = 0, the desired values of N(w) are in NVAL(1) and * NVAL(2). * If IJOB = 1, NVAL(2) is the desired value of N(w). * If IJOB = 2, not referenced. * This array will, in general, be reordered on output. * * INTVL (input/output) REAL array, dimension (2*MMAX) * The endpoints of the intervals. INTVL(2*j-1) is the left * endpoint of the j-th interval, and INTVL(2*j) is the right * endpoint of the j-th interval. The input intervals will, * in general, be modified, split and reordered by the * calculation. * On input, INTVL contains the MINP input intervals. * On output, INTVL contains the converged intervals. * * INTVLCT (input/output) INTEGER array, dimension (2*MMAX) * The counts at the endpoints of the intervals. INTVLCT(2*j-1) * is the count at the left endpoint of the j-th interval, i.e., * the function value N(INTVL(2*j-1)), and INTVLCT(2*j) is the * count at the right endpoint of the j-th interval. * On input, INTVLCT contains the counts at the endpoints of * the MINP input intervals. * On output, INTVLCT contains the counts at the endpoints of * the converged intervals. * * MOUT (output) INTEGER * The number of intervals output. * * LSAVE (output) REAL * If IJOB = 0 or 2, not referenced. * If IJOB = 1, this is the largest floating point number * encountered which has count N(w) = NVAL(1). * * IEFLAG (input) INTEGER * A flag which indicates whether N(w) should be speeded up by * exploiting IEEE Arithmetic. * * INFO (output) INTEGER * = 0 : All intervals converged. * = 1 - MMAX : The last INFO intervals did not converge. * = MMAX + 1 : More than MMAX intervals were generated. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN * .. * .. External Subroutines .. EXTERNAL PSLAECV, PSLAIECT, PSLAPDCT * .. * .. Parameters .. REAL ZERO, TWO, HALF PARAMETER ( ZERO = 0.0E+0, TWO = 2.0E+0, $ HALF = 1.0E+0 / TWO ) * .. * .. Local Scalars .. INTEGER I, ITMAX, J, K, KF, KL, KLNEW, L, LCNT, LREQ, $ NALPHA, NBETA, NMID, RCNT, RREQ REAL ALPHA, BETA, MID * .. * .. Executable Statements .. * KF = 1 KL = MINP + 1 INFO = 0 IF( INTVL( 2 )-INTVL( 1 ).LE.ZERO ) THEN INFO = MINP MOUT = KF RETURN END IF IF( IJOB.EQ.0 ) THEN * * Check if some input intervals have "converged" * CALL PSLAECV( 0, KF, KL, INTVL, INTVLCT, NVAL, $ MAX( ABSTOL, PIVMIN ), RELTOL ) IF( KF.GE.KL ) $ GO TO 60 * * Compute upper bound on number of iterations needed * ITMAX = INT( ( LOG( INTVL( 2 )-INTVL( 1 )+PIVMIN )- $ LOG( PIVMIN ) ) / LOG( TWO ) ) + 2 * * Iteration Loop * DO 20 I = 1, ITMAX KLNEW = KL DO 10 J = KF, KL - 1 K = 2*J * * Bisect the interval and find the count at that point * MID = HALF*( INTVL( K-1 )+INTVL( K ) ) IF( IEFLAG.EQ.0 ) THEN CALL PSLAPDCT( MID, N, D, PIVMIN, NMID ) ELSE CALL PSLAIECT( MID, N, D, NMID ) END IF LREQ = NVAL( K-1 ) RREQ = NVAL( K ) IF( KL.EQ.1 ) $ NMID = MIN( INTVLCT( K ), $ MAX( INTVLCT( K-1 ), NMID ) ) IF( NMID.LE.NVAL( K-1 ) ) THEN INTVL( K-1 ) = MID INTVLCT( K-1 ) = NMID END IF IF( NMID.GE.NVAL( K ) ) THEN INTVL( K ) = MID INTVLCT( K ) = NMID END IF IF( NMID.GT.LREQ .AND. NMID.LT.RREQ ) THEN L = 2*KLNEW INTVL( L-1 ) = MID INTVL( L ) = INTVL( K ) INTVLCT( L-1 ) = NVAL( K ) INTVLCT( L ) = INTVLCT( K ) INTVL( K ) = MID INTVLCT( K ) = NVAL( K-1 ) NVAL( L-1 ) = NVAL( K ) NVAL( L ) = NVAL( L-1 ) NVAL( K ) = NVAL( K-1 ) KLNEW = KLNEW + 1 END IF 10 CONTINUE KL = KLNEW CALL PSLAECV( 0, KF, KL, INTVL, INTVLCT, NVAL, $ MAX( ABSTOL, PIVMIN ), RELTOL ) IF( KF.GE.KL ) $ GO TO 60 20 CONTINUE ELSE IF( IJOB.EQ.1 ) THEN ALPHA = INTVL( 1 ) BETA = INTVL( 2 ) NALPHA = INTVLCT( 1 ) NBETA = INTVLCT( 2 ) LSAVE = ALPHA LREQ = NVAL( 1 ) RREQ = NVAL( 2 ) 30 CONTINUE IF( NBETA.NE.RREQ .AND. BETA-ALPHA.GT. $ MAX( ABSTOL, RELTOL*MAX( ABS( ALPHA ), ABS( BETA ) ) ) ) $ THEN * * Bisect the interval and find the count at that point * MID = HALF*( ALPHA+BETA ) IF( IEFLAG.EQ.0 ) THEN CALL PSLAPDCT( MID, N, D, PIVMIN, NMID ) ELSE CALL PSLAIECT( MID, N, D, NMID ) END IF NMID = MIN( NBETA, MAX( NALPHA, NMID ) ) IF( NMID.GE.RREQ ) THEN BETA = MID NBETA = NMID ELSE ALPHA = MID NALPHA = NMID IF( NMID.EQ.LREQ ) $ LSAVE = ALPHA END IF GO TO 30 END IF KL = KF INTVL( 1 ) = ALPHA INTVL( 2 ) = BETA INTVLCT( 1 ) = NALPHA INTVLCT( 2 ) = NBETA ELSE IF( IJOB.EQ.2 ) THEN * * Check if some input intervals have "converged" * CALL PSLAECV( 1, KF, KL, INTVL, INTVLCT, NVAL, $ MAX( ABSTOL, PIVMIN ), RELTOL ) IF( KF.GE.KL ) $ GO TO 60 * * Compute upper bound on number of iterations needed * ITMAX = INT( ( LOG( INTVL( 2 )-INTVL( 1 )+PIVMIN )- $ LOG( PIVMIN ) ) / LOG( TWO ) ) + 2 * * Iteration Loop * DO 50 I = 1, ITMAX KLNEW = KL DO 40 J = KF, KL - 1 K = 2*J MID = HALF*( INTVL( K-1 )+INTVL( K ) ) IF( IEFLAG.EQ.0 ) THEN CALL PSLAPDCT( MID, N, D, PIVMIN, NMID ) ELSE CALL PSLAIECT( MID, N, D, NMID ) END IF LCNT = INTVLCT( K-1 ) RCNT = INTVLCT( K ) NMID = MIN( RCNT, MAX( LCNT, NMID ) ) * * Form New Interval(s) * IF( NMID.EQ.LCNT ) THEN INTVL( K-1 ) = MID ELSE IF( NMID.EQ.RCNT ) THEN INTVL( K ) = MID ELSE IF( KLNEW.LT.MMAX+1 ) THEN L = 2*KLNEW INTVL( L-1 ) = MID INTVL( L ) = INTVL( K ) INTVLCT( L-1 ) = NMID INTVLCT( L ) = INTVLCT( K ) INTVL( K ) = MID INTVLCT( K ) = NMID KLNEW = KLNEW + 1 ELSE INFO = MMAX + 1 RETURN END IF 40 CONTINUE KL = KLNEW CALL PSLAECV( 1, KF, KL, INTVL, INTVLCT, NVAL, $ MAX( ABSTOL, PIVMIN ), RELTOL ) IF( KF.GE.KL ) $ GO TO 60 50 CONTINUE END IF 60 CONTINUE INFO = MAX( KL-KF, 0 ) MOUT = KL - 1 RETURN * * End of PSLAEBZ * END * * SUBROUTINE PSLAECV( IJOB, KF, KL, INTVL, INTVLCT, NVAL, ABSTOL, $ RELTOL ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * * .. Scalar Arguments .. INTEGER IJOB, KF, KL REAL ABSTOL, RELTOL * .. * .. Array Arguments .. INTEGER INTVLCT( * ), NVAL( * ) REAL INTVL( * ) * .. * * Purpose * ======= * * PSLAECV checks if the input intervals [ INTVL(2*i-1), INTVL(2*i) ], * i = KF, ... , KL-1, have "converged". * PSLAECV modifies KF to be the index of the last converged interval, * i.e., on output, all intervals [ INTVL(2*i-1), INTVL(2*i) ], i < KF, * have converged. Note that the input intervals may be reordered by * PSLAECV. * * This is a SCALAPACK internal procedure and arguments are not checked * for unreasonable values. * * Arguments * ========= * * IJOB (input) INTEGER * Specifies the criterion for "convergence" of an interval. * = 0 : When an interval is narrower than ABSTOL, or than * RELTOL times the larger (in magnitude) endpoint, then * it is considered to have "converged". * = 1 : When an interval is narrower than ABSTOL, or than * RELTOL times the larger (in magnitude) endpoint, or if * the counts at the endpoints are identical to the counts * specified by NVAL ( see NVAL ) then the interval is * considered to have "converged". * * KF (input/output) INTEGER * On input, the index of the first input interval is 2*KF-1. * On output, the index of the last converged interval * is 2*KF-3. * * KL (input) INTEGER * The index of the last input interval is 2*KL-3. * * INTVL (input/output) REAL array, dimension (2*(KL-KF)) * The endpoints of the intervals. INTVL(2*j-1) is the left * oendpoint f the j-th interval, and INTVL(2*j) is the right * endpoint of the j-th interval. The input intervals will, * in general, be reordered on output. * On input, INTVL contains the KL-KF input intervals. * On output, INTVL contains the converged intervals, 1 thru' * KF-1, and the unconverged intervals, KF thru' KL-1. * * INTVLCT (input/output) INTEGER array, dimension (2*(KL-KF)) * The counts at the endpoints of the intervals. INTVLCT(2*j-1) * is the count at the left endpoint of the j-th interval, i.e., * the function value N(INTVL(2*j-1)), and INTVLCT(2*j) is the * count at the right endpoint of the j-th interval. This array * will, in general, be reordered on output. * See the comments in PSLAEBZ for more on the function N(w). * * NVAL (input/output) INTEGER array, dimension (2*(KL-KF)) * The desired counts, N(w), at the endpoints of the * corresponding intervals. This array will, in general, * be reordered on output. * * ABSTOL (input) REAL * The minimum (absolute) width of an interval. When an interval * is narrower than ABSTOL, or than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be sufficiently * small, i.e., converged. * Note : This must be at least zero. * * RELTOL (input) REAL * The minimum relative width of an interval. When an interval * is narrower than ABSTOL, or than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be sufficiently * small, i.e., converged. * Note : This should be at least radix*machine epsilon. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Local Scalars .. LOGICAL CONDN INTEGER I, ITMP1, ITMP2, J, K, KFNEW REAL TMP1, TMP2, TMP3, TMP4 * .. * .. Executable Statements .. * KFNEW = KF DO 10 I = KF, KL - 1 K = 2*I TMP3 = INTVL( K-1 ) TMP4 = INTVL( K ) TMP1 = ABS( TMP4-TMP3 ) TMP2 = MAX( ABS( TMP3 ), ABS( TMP4 ) ) CONDN = TMP1.LT.MAX( ABSTOL, RELTOL*TMP2 ) IF( IJOB.EQ.0 ) $ CONDN = CONDN .OR. ( ( INTVLCT( K-1 ).EQ.NVAL( K-1 ) ) .AND. $ INTVLCT( K ).EQ.NVAL( K ) ) IF( CONDN ) THEN IF( I.GT.KFNEW ) THEN * * Reorder Intervals * J = 2*KFNEW TMP1 = INTVL( K-1 ) TMP2 = INTVL( K ) ITMP1 = INTVLCT( K-1 ) ITMP2 = INTVLCT( K ) INTVL( K-1 ) = INTVL( J-1 ) INTVL( K ) = INTVL( J ) INTVLCT( K-1 ) = INTVLCT( J-1 ) INTVLCT( K ) = INTVLCT( J ) INTVL( J-1 ) = TMP1 INTVL( J ) = TMP2 INTVLCT( J-1 ) = ITMP1 INTVLCT( J ) = ITMP2 IF( IJOB.EQ.0 ) THEN ITMP1 = NVAL( K-1 ) NVAL( K-1 ) = NVAL( J-1 ) NVAL( J-1 ) = ITMP1 ITMP1 = NVAL( K ) NVAL( K ) = NVAL( J ) NVAL( J ) = ITMP1 END IF END IF KFNEW = KFNEW + 1 END IF 10 CONTINUE KF = KFNEW RETURN * * End of PSLAECV * END * SUBROUTINE PSLAPDCT( SIGMA, N, D, PIVMIN, COUNT ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * * .. Scalar Arguments .. INTEGER COUNT, N REAL PIVMIN, SIGMA * .. * .. Array Arguments .. REAL D( * ) * .. * * Purpose * ======= * * PSLAPDCT counts the number of negative eigenvalues of (T - SIGMA I). * This implementation of the Sturm Sequence loop has conditionals in * the innermost loop to avoid overflow and determine the sign of a * floating point number. PSLAPDCT will be referred to as the "paranoid" * implementation of the Sturm Sequence loop. * * This is a SCALAPACK internal procedure and arguments are not checked * for unreasonable values. * * Arguments * ========= * * SIGMA (input) REAL * The shift. PSLAPDCT finds the number of eigenvalues of T less * than or equal to SIGMA. * * N (input) INTEGER * The order of the tridiagonal matrix T. N >= 1. * * D (input) REAL array, dimension (2*N - 1) * Contains the diagonals and the squares of the off-diagonal * elements of the tridiagonal matrix T. These elements are * assumed to be interleaved in memory for better cache * performance. The diagonal entries of T are in the entries * D(1),D(3),...,D(2*N-1), while the squares of the off-diagonal * entries are D(2),D(4),...,D(2*N-2). To avoid overflow, the * matrix must be scaled so that its largest entry is no greater * than overflow**(1/2) * underflow**(1/4) in absolute value, * and for greatest accuracy, it should not be much smaller * than that. * * PIVMIN (input) REAL * The minimum absolute of a "pivot" in this "paranoid" * implementation of the Sturm sequence loop. This must be at * least max_j |e(j)^2| *safe_min, and at least safe_min, where * safe_min is at least the smallest number that can divide 1.0 * without overflow. * * COUNT (output) INTEGER * The count of the number of eigenvalues of T less than or * equal to SIGMA. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I REAL TMP * .. * .. Executable Statements .. * TMP = D( 1 ) - SIGMA IF( ABS( TMP ).LE.PIVMIN ) $ TMP = -PIVMIN COUNT = 0 IF( TMP.LE.ZERO ) $ COUNT = 1 DO 10 I = 3, 2*N - 1, 2 TMP = D( I ) - D( I-1 ) / TMP - SIGMA IF( ABS( TMP ).LE.PIVMIN ) $ TMP = -PIVMIN IF( TMP.LE.ZERO ) $ COUNT = COUNT + 1 10 CONTINUE * RETURN * * End of PSLAPDCT * END scalapack-2.0.2/SRC/psstedc.f000644 000766 000024 00000022741 10363532303 016160 0ustar00juliestaff000000 000000 SUBROUTINE PSSTEDC( COMPZ, N, D, E, Q, IQ, JQ, DESCQ, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * .. Scalar Arguments .. CHARACTER COMPZ INTEGER INFO, IQ, JQ, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCQ( * ), IWORK( * ) REAL D( * ), E( * ), Q( * ), WORK( * ) * .. * * Purpose * ======= * PSSTEDC computes all eigenvalues and eigenvectors of a * symmetric tridiagonal matrix in parallel, using the divide and * conquer algorithm. * * This code makes very mild assumptions about floating point * arithmetic. It will work on machines with a guard digit in * add/subtract, or on those binary machines without guard digits * which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. * It could conceivably fail on hexadecimal or decimal machines * without guard digits, but we know of none. See SLAED3 for details. * * Arguments * ========= * * COMPZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only. (NOT IMPLEMENTED YET) * = 'I': Compute eigenvectors of tridiagonal matrix also. * = 'V': Compute eigenvectors of original dense symmetric * matrix also. On entry, Z contains the orthogonal * matrix used to reduce the original matrix to * tridiagonal form. (NOT IMPLEMENTED YET) * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * D (global input/output) REAL array, dimension (N) * On entry, the diagonal elements of the tridiagonal matrix. * On exit, if INFO = 0, the eigenvalues in descending order. * * E (global input/output) REAL array, dimension (N-1) * On entry, the subdiagonal elements of the tridiagonal matrix. * On exit, E has been destroyed. * * Q (local output) REAL array, * local dimension ( LLD_Q, LOCc(JQ+N-1)) * Q contains the orthonormal eigenvectors of the symmetric * tridiagonal matrix. * On output, Q is distributed across the P processes in block * cyclic format. * * IQ (global input) INTEGER * Q's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JQ (global input) INTEGER * Q's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * * WORK (local workspace/output) REAL array, * dimension (LWORK) * On output, WORK(1) returns the workspace needed. * * LWORK (local input/output) INTEGER, * the dimension of the array WORK. * LWORK = 6*N + 2*NP*NQ * NP = NUMROC( N, NB, MYROW, DESCQ( RSRC_ ), NPROW ) * NQ = NUMROC( N, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) * * If LWORK = -1, the LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the WORK array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * On exit, if LIWORK > 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. * LIWORK = 2 + 7*N + 8*NPCOL * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: The algorithm failed to compute the INFO/(N+1) th * eigenvalue while working on the submatrix lying in * global rows and columns mod(INFO,N+1). * * Further Details * ======= ======= * * Contributed by Francoise Tisseur, University of Manchester. * * Reference: F. Tisseur and J. Dongarra, "A Parallel Divide and * Conquer Algorithm for the Symmetric Eigenvalue Problem * on Distributed Memory Architectures", * SIAM J. Sci. Comput., 6:20 (1999), pp. 2223--2236. * (see also LAPACK Working Note 132) * http://www.netlib.org/lapack/lawns/lawn132.ps * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER ICOFFQ, IIQ, IPQ, IQCOL, IQROW, IROFFQ, JJQ, $ LDQ, LIWMIN, LWMIN, MYCOL, MYROW, NB, NP, $ NPCOL, NPROW, NQ REAL ORGNRM * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC REAL SLANST EXTERNAL INDXG2P, LSAME, NUMROC, SLANST * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, INFOG2L, PSLAED0, $ PSLASRT, PXERBLA, SLASCL, SSTEDC * .. * .. Intrinsic Functions .. INTRINSIC MOD, REAL * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Test the input parameters. * CALL BLACS_GRIDINFO( DESCQ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) LDQ = DESCQ( LLD_ ) NB = DESCQ( NB_ ) NP = NUMROC( N, NB, MYROW, DESCQ( RSRC_ ), NPROW ) NQ = NUMROC( N, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IQ, JQ, DESCQ, 8, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCQ( NB_ ) IROFFQ = MOD( IQ-1, DESCQ( MB_ ) ) ICOFFQ = MOD( JQ-1, DESCQ( NB_ ) ) IQROW = INDXG2P( IQ, NB, MYROW, DESCQ( RSRC_ ), NPROW ) IQCOL = INDXG2P( JQ, NB, MYCOL, DESCQ( CSRC_ ), NPCOL ) LWMIN = 6*N + 2*NP*NQ LIWMIN = 2 + 7*N + 8*NPCOL WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( .NOT.LSAME( COMPZ, 'I' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( IROFFQ.NE.ICOFFQ .OR. ICOFFQ.NE.0 ) THEN INFO = -5 ELSE IF( DESCQ( MB_ ).NE.DESCQ( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCQ( CTXT_ ), 'PSSTEDC', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return * IF( N.EQ.0 ) $ GO TO 10 CALL INFOG2L( IQ, JQ, DESCQ, NPROW, NPCOL, MYROW, MYCOL, IIQ, JJQ, $ IQROW, IQCOL ) IF( N.EQ.1 ) THEN IF( MYROW.EQ.IQROW .AND. MYCOL.EQ.IQCOL ) $ Q( 1 ) = ONE GO TO 10 END IF * * If N is smaller than the minimum divide size NB, then * solve the problem with the serial divide and conquer * code locally. * IF( N.LE.NB ) THEN IF( ( MYROW.EQ.IQROW ) .AND. ( MYCOL.EQ.IQCOL ) ) THEN IPQ = IIQ + ( JJQ-1 )*LDQ CALL SSTEDC( 'I', N, D, E, Q( IPQ ), LDQ, WORK, LWORK, $ IWORK, LIWORK, INFO ) IF( INFO.NE.0 ) THEN INFO = ( N+1 ) + N GO TO 10 END IF END IF GO TO 10 END IF * * If P=NPROW*NPCOL=1, solve the problem with SSTEDC. * IF( NPCOL*NPROW.EQ.1 ) THEN IPQ = IIQ + ( JJQ-1 )*LDQ CALL SSTEDC( 'I', N, D, E, Q( IPQ ), LDQ, WORK, LWORK, IWORK, $ LIWORK, INFO ) GO TO 10 END IF * * Scale matrix to allowable range, if necessary. * ORGNRM = SLANST( 'M', N, D, E ) IF( ORGNRM.NE.ZERO ) THEN CALL SLASCL( 'G', 0, 0, ORGNRM, ONE, N, 1, D, N, INFO ) CALL SLASCL( 'G', 0, 0, ORGNRM, ONE, N-1, 1, E, N-1, INFO ) END IF * CALL PSLAED0( N, D, E, Q, IQ, JQ, DESCQ, WORK, IWORK, INFO ) * * Sort eigenvalues and corresponding eigenvectors * CALL PSLASRT( 'I', N, D, Q, IQ, JQ, DESCQ, WORK, LWORK, IWORK, $ LIWORK, INFO ) * * Scale back. * IF( ORGNRM.NE.ZERO ) $ CALL SLASCL( 'G', 0, 0, ONE, ORGNRM, N, 1, D, N, INFO ) * 10 CONTINUE * IF( LWORK.GT.0 ) $ WORK( 1 ) = REAL( LWMIN ) IF( LIWORK.GT.0 ) $ IWORK( 1 ) = LIWMIN RETURN * * End of PSSTEDC * END scalapack-2.0.2/SRC/psstein.f000644 000766 000024 00000060306 10602576752 016213 0ustar00juliestaff000000 000000 SUBROUTINE PSSTEIN( N, D, E, M, W, IBLOCK, ISPLIT, ORFAC, Z, IZ, $ JZ, DESCZ, WORK, LWORK, IWORK, LIWORK, IFAIL, $ ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER INFO, IZ, JZ, LIWORK, LWORK, M, N REAL ORFAC * .. * .. Array Arguments .. INTEGER DESCZ( * ), IBLOCK( * ), ICLUSTR( * ), $ IFAIL( * ), ISPLIT( * ), IWORK( * ) REAL D( * ), E( * ), GAP( * ), W( * ), WORK( * ), $ Z( * ) * .. * * Purpose * ======= * * PSSTEIN computes the eigenvectors of a symmetric tridiagonal matrix * in parallel, using inverse iteration. The eigenvectors found * correspond to user specified eigenvalues. PSSTEIN does not * orthogonalize vectors that are on different processes. The extent * of orthogonalization is controlled by the input parameter LWORK. * Eigenvectors that are to be orthogonalized are computed by the same * process. PSSTEIN decides on the allocation of work among the * processes and then calls SSTEIN2 (modified LAPACK routine) on each * individual process. If insufficient workspace is allocated, the * expected orthogonalization may not be done. * * Note : If the eigenvectors obtained are not orthogonal, increase * LWORK and run the code again. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * P = NPROW * NPCOL is the total number of processes * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * D (global input) REAL array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. * * E (global input) REAL array, dimension (N-1) * The (n-1) off-diagonal elements of the tridiagonal matrix T. * * M (global input) INTEGER * The total number of eigenvectors to be found. 0 <= M <= N. * * W (global input/global output) REAL array, dim (M) * On input, the first M elements of W contain all the * eigenvalues for which eigenvectors are to be computed. The * eigenvalues should be grouped by split-off block and ordered * from smallest to largest within the block (The output array * W from PSSTEBZ with ORDER='b' is expected here). This * array should be replicated on all processes. * On output, the first M elements contain the input * eigenvalues in ascending order. * * Note : To obtain orthogonal vectors, it is best if * eigenvalues are computed to highest accuracy ( this can be * done by setting ABSTOL to the underflow threshold = * SLAMCH('U') --- ABSTOL is an input parameter * to PSSTEBZ ) * * IBLOCK (global input) INTEGER array, dimension (N) * The submatrix indices associated with the corresponding * eigenvalues in W -- 1 for eigenvalues belonging to the * first submatrix from the top, 2 for those belonging to * the second submatrix, etc. (The output array IBLOCK * from PSSTEBZ is expected here). * * ISPLIT (global input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N (The output array * ISPLIT from PSSTEBZ is expected here.) * * ORFAC (global input) REAL * ORFAC specifies which eigenvectors should be orthogonalized. * Eigenvectors that correspond to eigenvalues which are within * ORFAC*||T|| of each other are to be orthogonalized. * However, if the workspace is insufficient (see LWORK), this * tolerance may be decreased until all eigenvectors to be * orthogonalized can be stored in one process. * No orthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) REAL array, * dimension (DESCZ(DLEN_), N/npcol + NB) * Z contains the computed eigenvectors associated with the * specified eigenvalues. Any vector which fails to converge is * set to its current iterate after MAXITS iterations ( See * SSTEIN2 ). * On output, Z is distributed across the P processes in block * cyclic format. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace/global output) REAL array, * dimension ( LWORK ) * On output, WORK(1) gives a lower bound on the * workspace ( LWORK ) that guarantees the user desired * orthogonalization (see ORFAC). * Note that this may overestimate the minimum workspace needed. * * LWORK (local input) integer * LWORK controls the extent of orthogonalization which can be * done. The number of eigenvectors for which storage is * allocated on each process is * NVEC = floor(( LWORK- max(5*N,NP00*MQ00) )/N). * Eigenvectors corresponding to eigenvalue clusters of size * NVEC - ceil(M/P) + 1 are guaranteed to be orthogonal ( the * orthogonality is similar to that obtained from SSTEIN2). * Note : LWORK must be no smaller than: * max(5*N,NP00*MQ00) + ceil(M/P)*N, * and should have the same input value on all processes. * It is the minimum value of LWORK input on different processes * that is significant. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/global output) INTEGER array, * dimension ( 3*N+P+1 ) * On return, IWORK(1) contains the amount of integer workspace * required. * On return, the IWORK(2) through IWORK(P+2) indicate * the eigenvectors computed by each process. Process I computes * eigenvectors indexed IWORK(I+2)+1 thru' IWORK(I+3). * * LIWORK (local input) INTEGER * Size of array IWORK. Must be >= 3*N + P + 1 * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (global output) integer array, dimension (M) * On normal exit, all elements of IFAIL are zero. * If one or more eigenvectors fail to converge after MAXITS * iterations (as in SSTEIN), then INFO > 0 is returned. * If mod(INFO,M+1)>0, then * for I=1 to mod(INFO,M+1), the eigenvector * corresponding to the eigenvalue W(IFAIL(I)) failed to * converge ( W refers to the array of eigenvalues on output ). * * ICLUSTR (global output) integer array, dimension (2*P) * This output array contains indices of eigenvectors * corresponding to a cluster of eigenvalues that could not be * orthogonalized due to insufficient workspace (see LWORK, * ORFAC and INFO). Eigenvectors corresponding to clusters of * eigenvalues indexed ICLUSTR(2*I-1) to ICLUSTR(2*I), I = 1 to * INFO/(M+1), could not be orthogonalized due to lack of * workspace. Hence the eigenvectors corresponding to these * clusters may not be orthogonal. ICLUSTR is a zero terminated * array --- ( ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0 ) * if and only if K is the number of clusters. * * GAP (global output) REAL array, dimension (P) * This output array contains the gap between eigenvalues whose * eigenvectors could not be orthogonalized. The INFO/M output * values in this array correspond to the INFO/(M+1) clusters * indicated by the array ICLUSTR. As a result, the dot product * between eigenvectors corresponding to the I^th cluster may be * as high as ( O(n)*macheps ) / GAP(I). * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * < 0 : if INFO = -I, the I-th argument had an illegal value * > 0 : if mod(INFO,M+1) = I, then I eigenvectors failed to * converge in MAXITS iterations. Their indices are * stored in the array IFAIL. * if INFO/(M+1) = I, then eigenvectors corresponding to * I clusters of eigenvalues could not be orthogonalized * due to insufficient workspace. The indices of the * clusters are stored in the array ICLUSTR. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, REAL * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMN2D, IGEBR2D, $ IGEBS2D, PCHK1MAT, PSLAEVSWP, PXERBLA, SGEBR2D, $ SGEBS2D, SLASRT2, SSTEIN2 * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, NEGONE, ODM1, FIVE, ODM3, ODM18 PARAMETER ( ZERO = 0.0E+0, NEGONE = -1.0E+0, $ ODM1 = 1.0E-1, FIVE = 5.0E+0, ODM3 = 1.0E-3, $ ODM18 = 1.0E-18 ) * .. * .. Local Scalars .. LOGICAL LQUERY, SORTED INTEGER B1, BN, BNDRY, CLSIZ, COL, I, IFIRST, IINFO, $ ILAST, IM, INDRW, ITMP, J, K, LGCLSIZ, LLWORK, $ LOAD, LOCINFO, MAXVEC, MQ00, MYCOL, MYROW, $ NBLK, NERR, NEXT, NP00, NPCOL, NPROW, NVS, $ OLNBLK, P, ROW, SELF, TILL, TOTERR REAL DIFF, MINGAP, ONENRM, ORGFAC, ORTOL, TMPFAC * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) SELF = MYROW*NPCOL + MYCOL * * Make sure that we belong to this context (before calling PCHK1MAT) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1200+CTXT_ ) ELSE * * Make sure that NPROW>0 and NPCOL>0 before calling NUMROC * CALL CHK1MAT( N, 1, N, 1, IZ, JZ, DESCZ, 12, INFO ) IF( INFO.EQ.0 ) THEN * * Now we know that our context is good enough to * perform the rest of the checks * NP00 = NUMROC( N, DESCZ( MB_ ), 0, 0, NPROW ) MQ00 = NUMROC( M, DESCZ( NB_ ), 0, 0, NPCOL ) P = NPROW*NPCOL * * Compute the maximum number of vectors per process * LLWORK = LWORK CALL IGAMN2D( DESCZ( CTXT_ ), 'A', ' ', 1, 1, LLWORK, 1, 1, $ 1, -1, -1, -1 ) INDRW = MAX( 5*N, NP00*MQ00 ) IF( N.NE.0 ) $ MAXVEC = ( LLWORK-INDRW ) / N LOAD = ICEIL( M, P ) IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN TMPFAC = ORFAC CALL SGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, TMPFAC, $ 1 ) ELSE CALL SGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, TMPFAC, $ 1, 0, 0 ) END IF * LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( M.LT.0 .OR. M.GT.N ) THEN INFO = -4 ELSE IF( MAXVEC.LT.LOAD .AND. .NOT.LQUERY ) THEN INFO = -14 ELSE IF( LIWORK.LT.3*N+P+1 .AND. .NOT.LQUERY ) THEN INFO = -16 ELSE DO 10 I = 2, M IF( IBLOCK( I ).LT.IBLOCK( I-1 ) ) THEN INFO = -6 GO TO 20 END IF IF( IBLOCK( I ).EQ.IBLOCK( I-1 ) .AND. W( I ).LT. $ W( I-1 ) ) THEN INFO = -5 GO TO 20 END IF 10 CONTINUE 20 CONTINUE IF( INFO.EQ.0 ) THEN IF( ABS( TMPFAC-ORFAC ).GT.FIVE*ABS( TMPFAC ) ) $ INFO = -8 END IF END IF * END IF IDUM1( 1 ) = M IDUM2( 1 ) = 4 CALL PCHK1MAT( N, 1, N, 1, IZ, JZ, DESCZ, 12, 1, IDUM1, IDUM2, $ INFO ) WORK( 1 ) = REAL( MAX( 5*N, NP00*MQ00 )+ICEIL( M, P )*N ) IWORK( 1 ) = 3*N + P + 1 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCZ( CTXT_ ), 'PSSTEIN', -INFO ) RETURN ELSE IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) THEN RETURN END IF * DO 30 I = 1, M IFAIL( I ) = 0 30 CONTINUE DO 40 I = 1, P + 1 IWORK( I ) = 0 40 CONTINUE DO 50 I = 1, P GAP( I ) = NEGONE ICLUSTR( 2*I-1 ) = 0 ICLUSTR( 2*I ) = 0 50 CONTINUE * * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) $ RETURN * IF( ORFAC.GE.ZERO ) THEN TMPFAC = ORFAC ELSE TMPFAC = ODM3 END IF ORGFAC = TMPFAC * * Allocate the work among the processes * ILAST = M / LOAD IF( MOD( M, LOAD ).EQ.0 ) $ ILAST = ILAST - 1 OLNBLK = -1 NVS = 0 NEXT = 1 IM = 0 ONENRM = ZERO DO 100 I = 0, ILAST - 1 NEXT = NEXT + LOAD J = NEXT - 1 IF( J.GT.NVS ) THEN NBLK = IBLOCK( NEXT ) IF( NBLK.EQ.IBLOCK( NEXT-1 ) .AND. NBLK.NE.OLNBLK ) THEN * * Compute orthogonalization criterion * IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 60 J = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( J ) )+ABS( E( J-1 ) )+ $ ABS( E( J ) ) ) 60 CONTINUE OLNBLK = NBLK END IF TILL = NVS + MAXVEC 70 CONTINUE J = NEXT - 1 IF( TMPFAC.GT.ODM18 ) THEN ORTOL = TMPFAC*ONENRM DO 80 J = NEXT - 1, MIN( TILL, M-1 ) IF( IBLOCK( J+1 ).NE.IBLOCK( J ) .OR. W( J+1 )- $ W( J ).GE.ORTOL ) THEN GO TO 90 END IF 80 CONTINUE IF( J.EQ.M .AND. TILL.GE.M ) $ GO TO 90 TMPFAC = TMPFAC*ODM1 GO TO 70 END IF 90 CONTINUE J = MIN( J, TILL ) END IF IF( SELF.EQ.I ) $ IM = MAX( 0, J-NVS ) * IWORK( I+1 ) = NVS NVS = MAX( J, NVS ) 100 CONTINUE IF( SELF.EQ.ILAST ) $ IM = M - NVS IWORK( ILAST+1 ) = NVS DO 110 I = ILAST + 2, P + 1 IWORK( I ) = M 110 CONTINUE * CLSIZ = 1 LGCLSIZ = 1 ILAST = 0 NBLK = 0 BNDRY = 2 K = 1 DO 140 I = 1, M IF( IBLOCK( I ).NE.NBLK ) THEN NBLK = IBLOCK( I ) IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 120 J = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( J ) )+ABS( E( J-1 ) )+ $ ABS( E( J ) ) ) 120 CONTINUE * END IF IF( I.GT.1 ) THEN DIFF = W( I ) - W( I-1 ) IF( IBLOCK( I ).NE.IBLOCK( I-1 ) .OR. I.EQ.M .OR. DIFF.GT. $ ORGFAC*ONENRM ) THEN IFIRST = ILAST IF( I.EQ.M ) THEN IF( IBLOCK( M ).NE.IBLOCK( M-1 ) .OR. DIFF.GT.ORGFAC* $ ONENRM ) THEN ILAST = M - 1 ELSE ILAST = M END IF ELSE ILAST = I - 1 END IF CLSIZ = ILAST - IFIRST IF( CLSIZ.GT.1 ) THEN IF( LGCLSIZ.LT.CLSIZ ) $ LGCLSIZ = CLSIZ MINGAP = ONENRM 130 CONTINUE IF( BNDRY.GT.P+1 ) $ GO TO 150 IF( IWORK( BNDRY ).GT.IFIRST .AND. IWORK( BNDRY ).LT. $ ILAST ) THEN MINGAP = MIN( W( IWORK( BNDRY )+1 )- $ W( IWORK( BNDRY ) ), MINGAP ) ELSE IF( IWORK( BNDRY ).GE.ILAST ) THEN IF( MINGAP.LT.ONENRM ) THEN ICLUSTR( 2*K-1 ) = IFIRST + 1 ICLUSTR( 2*K ) = ILAST GAP( K ) = MINGAP / ONENRM K = K + 1 END IF GO TO 140 END IF BNDRY = BNDRY + 1 GO TO 130 END IF END IF END IF 140 CONTINUE 150 CONTINUE INFO = ( K-1 )*( M+1 ) * * Call SSTEIN2 to find the eigenvectors * CALL SSTEIN2( N, D, E, IM, W( IWORK( SELF+1 )+1 ), $ IBLOCK( IWORK( SELF+1 )+1 ), ISPLIT, ORGFAC, $ WORK( INDRW+1 ), N, WORK, IWORK( P+2 ), $ IFAIL( IWORK( SELF+1 )+1 ), LOCINFO ) * * Redistribute the eigenvector matrix to conform with the block * cyclic distribution of the input matrix * * DO 160 I = 1, M IWORK( P+1+I ) = I 160 CONTINUE * CALL SLASRT2( 'I', M, W, IWORK( P+2 ), IINFO ) * DO 170 I = 1, M IWORK( M+P+1+IWORK( P+1+I ) ) = I 170 CONTINUE * * DO 180 I = 1, LOCINFO ITMP = IWORK( SELF+1 ) + I IFAIL( ITMP ) = IFAIL( ITMP ) + ITMP - I IFAIL( ITMP ) = IWORK( M+P+1+IFAIL( ITMP ) ) 180 CONTINUE * DO 190 I = 1, K - 1 ICLUSTR( 2*I-1 ) = IWORK( M+P+1+ICLUSTR( 2*I-1 ) ) ICLUSTR( 2*I ) = IWORK( M+P+1+ICLUSTR( 2*I ) ) 190 CONTINUE * * * Still need to apply the above permutation to IFAIL * * TOTERR = 0 DO 210 I = 1, P IF( SELF.EQ.I-1 ) THEN CALL IGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, LOCINFO, 1 ) IF( LOCINFO.NE.0 ) THEN CALL IGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', LOCINFO, 1, $ IFAIL( IWORK( I )+1 ), LOCINFO ) DO 200 J = 1, LOCINFO IFAIL( TOTERR+J ) = IFAIL( IWORK( I )+J ) 200 CONTINUE TOTERR = TOTERR + LOCINFO END IF ELSE * ROW = ( I-1 ) / NPCOL COL = MOD( I-1, NPCOL ) * CALL IGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, NERR, 1, $ ROW, COL ) IF( NERR.NE.0 ) THEN CALL IGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', NERR, 1, $ IFAIL( TOTERR+1 ), NERR, ROW, COL ) TOTERR = TOTERR + NERR END IF END IF 210 CONTINUE INFO = INFO + TOTERR * * CALL PSLAEVSWP( N, WORK( INDRW+1 ), N, Z, IZ, JZ, DESCZ, IWORK, $ IWORK( M+P+2 ), WORK, INDRW ) * DO 220 I = 2, P IWORK( I ) = IWORK( M+P+1+IWORK( I ) ) 220 CONTINUE * * * Sort the IWORK array * * 230 CONTINUE SORTED = .TRUE. DO 240 I = 2, P - 1 IF( IWORK( I ).GT.IWORK( I+1 ) ) THEN ITMP = IWORK( I+1 ) IWORK( I+1 ) = IWORK( I ) IWORK( I ) = ITMP SORTED = .FALSE. END IF 240 CONTINUE IF( .NOT.SORTED ) $ GO TO 230 * DO 250 I = P + 1, 1, -1 IWORK( I+1 ) = IWORK( I ) 250 CONTINUE * WORK( 1 ) = ( LGCLSIZ+LOAD-1 )*N + INDRW IWORK( 1 ) = 3*N + P + 1 * * End of PSSTEIN * END scalapack-2.0.2/SRC/pssyev.f000644 000766 000024 00000052560 11640165744 016060 0ustar00juliestaff000000 000000 SUBROUTINE PSSYEV( JOBZ, UPLO, N, A, IA, JA, DESCA, W, $ Z, IZ, JZ, DESCZ, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER JOBZ, UPLO INTEGER IA, INFO, IZ, JA, JZ, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) REAL A( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSSYEV computes all eigenvalues and, optionally, eigenvectors * of a real symmetric matrix A by calling the recommended sequence * of ScaLAPACK routines. * * In its present form, PSSYEV assumes a homogeneous system and makes * no checks for consistency of the eigenvalues or eigenvectors across * the different processes. Because of this, it is possible that a * heterogeneous system may return incorrect results without any error * messages. * * Notes * ===== * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distributed * matrix A. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of A. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of A. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of the * distributed matrix A. * LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic DOUBLE PRECISION array, * global dimension (N, N), local dimension ( LLD_A, * LOCc(JA+N-1) ) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PSSYEV cannot guarantee * correct error reporting. * * W (global output) REAL array, dimension (N) * If INFO=0, the eigenvalues in ascending order. * * Z (local output) REAL array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) REAL array, * dimension (LWORK) * Version 1.0: on output, WORK(1) returns the workspace * needed to guarantee completion. * If the input parameters are incorrect, WORK(1) may also be * incorrect. * * If JOBZ='N' WORK(1) = minimal=optimal amount of workspace * If JOBZ='V' WORK(1) = minimal workspace required to * generate all the eigenvectors. * * * LWORK (local input) INTEGER * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= 5*N + SIZESYTRD + 1 * where * SIZESYTRD = The workspace requirement for PSSYTRD * and is MAX( NB * ( NP +1 ), 3 * NB ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * * QRMEM = 2*N-2 * LWMIN = 5*N + N*LDC + MAX( SIZEMQRLEFT, QRMEM ) + 1 * * Variable definitions: * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( RSRC_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP = NUMROC( NN, NB, 0, 0, NPROW ) * NQ = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) * NRC = NUMROC( N, NB, MYPROWC, 0, NPROCS) * LDC = MAX( 1, NRC ) * SIZEMQRLEFT = The workspace requirement for PSORMTR * when it's SIDE argument is 'L'. * * With MYPROWC defined when a new context is created as: * CALL BLACS_GET( DESCA( CTXT_ ), 0, CONTEXTC ) * CALL BLACS_GRIDINIT( CONTEXTC, 'R', NPROCS, 1 ) * CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, * MYPCOLC ) * * If LWORK = -1, the LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the WORK array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = 1 through N, the i(th) eigenvalue did not * converge in SSTEQR2 after a total of 30*N iterations. * If INFO = N+1, then PSSYEV has detected heterogeneity * by finding that eigenvalues were not identical across * the process grid. In this case, the accuracy of * the results from PSSYEV cannot be guaranteed. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and Z(IZ:IZ+M-1,JZ:JZ+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * ( MB_A.EQ.NB_A.EQ.MB_Z .AND. IROFFA.EQ.IROFFZ .AND. IROFFA.EQ.0 .AND. * IAROW.EQ.IZROW ) * where * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * Version 1.4 limitations: * DESCA(MB_) = DESCA(NB_) * DESCA(M_) = DESCZ(M_) * DESCA(N_) = DESCZ(N_) * DESCA(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL FIVE, ONE, TEN, ZERO PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, $ TEN = 10.0E+0, FIVE = 5.0E+0 ) INTEGER IERREIN, IERRCLS, IERRSPC, IERREBZ, ITHVAL PARAMETER ( IERREIN = 1, IERRCLS = 2, IERRSPC = 4, $ IERREBZ = 8, ITHVAL = 10 ) * .. * .. Local Scalars .. LOGICAL LOWER, WANTZ INTEGER CONTEXTC, CSRC_A, I, IACOL, IAROW, ICOFFA, $ IINFO, INDD, INDD2, INDE, INDE2, INDTAU, $ INDWORK, INDWORK2, IROFFA, IROFFZ, ISCALE, $ IZROW, J, K, LDC, LLWORK, LWMIN, MB_A, MB_Z, $ MYCOL, MYPCOLC, MYPROWC, MYROW, NB, NB_A, NB_Z, $ NP, NPCOL, NPCOLC, NPROCS, NPROW, NPROWC, NQ, $ NRC, QRMEM, RSRC_A, RSRC_Z, SIZEMQRLEFT, $ SIZESYTRD REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, $ SMLNUM * .. * .. Local Arrays .. INTEGER DESCQR( 9 ), IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC, SL_GRIDRESHAPE REAL PSLAMCH, PSLANSY EXTERNAL LSAME, NUMROC, PSLAMCH, PSLANSY, $ SL_GRIDRESHAPE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, CHK1MAT, $ DESCINIT, PCHK1MAT, PCHK2MAT, PSELGET, $ PSGEMR2D, PSLASCL, PSLASET, PSORMTR, PSSYTRD, $ PXERBLA, SCOPY, SSCAL, SSTEQR2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, ICHAR, INT, MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return * IF( N.EQ.0 ) RETURN * * Test the input arguments. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 * WANTZ = LSAME( JOBZ, 'V' ) IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE IF( WANTZ ) THEN IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) END IF END IF IF( INFO .EQ. 0 ) THEN CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 3, N, 3, IZ, JZ, DESCZ, 12, INFO ) * IF( INFO.EQ.0 ) THEN * * Get machine constants. * SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe minimum' ) EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * NPROCS = NPROW*NPCOL NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A LOWER = LSAME( UPLO, 'L' ) * RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( 1, NB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, MB_A, MYCOL, CSRC_A, NPCOL ) NP = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) IF( WANTZ ) THEN NB_Z = DESCZ( NB_ ) MB_Z = DESCZ( MB_ ) RSRC_Z = DESCZ( RSRC_ ) IROFFZ = MOD( IZ-1, MB_A ) IZROW = INDXG2P( 1, NB_A, MYROW, RSRC_Z, NPROW ) SIZEMQRLEFT = MAX( ( NB_A*( NB_A-1 ) ) / 2, ( NP+NQ )* $ NB_A ) + NB_A*NB_A ELSE SIZEMQRLEFT = 0 IROFFZ = 0 IZROW = 0 END IF SIZESYTRD = MAX( NB * ( NP +1 ), 3 * NB ) * * Initialize the context of the single column distributed * matrix required by SSTEQR2. This specific distribution * allows each process to do 1/pth of the work updating matrix * Q during SSTEQR2 and achieve some parallelization to an * otherwise serial subroutine. * LDC = 0 IF( WANTZ ) THEN CONTEXTC = SL_GRIDRESHAPE( DESCA( CTXT_ ), 0, 1, 1, $ NPROCS, 1 ) CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, $ MYPCOLC ) NRC = NUMROC( N, NB_A, MYPROWC, 0, NPROCS) LDC = MAX( 1, NRC ) CALL DESCINIT( DESCQR, N, N, NB, NB, 0, 0, CONTEXTC, $ LDC, INFO ) END IF * * Set up pointers into the WORK array * INDTAU = 1 INDE = INDTAU + N INDD = INDE + N INDD2 = INDD + N INDE2 = INDD2 + N INDWORK = INDE2 + N INDWORK2 = INDWORK + N*LDC LLWORK = LWORK - INDWORK + 1 * * Compute the total amount of space needed * QRMEM = 2*N-2 IF( WANTZ ) THEN LWMIN = 5*N + N*LDC + MAX( SIZEMQRLEFT, QRMEM ) + 1 ELSE LWMIN = 5*N + SIZESYTRD + 1 END IF * END IF IF( INFO.EQ.0 ) THEN IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -14 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( WANTZ ) THEN IF( IROFFA.NE.IROFFZ ) THEN INFO = -10 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -10 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 1200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 1200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 1200+RSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) ENDIF END IF END IF IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 3 IF( LSAME( JOBZ, 'V' ) ) THEN CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, $ IZ, JZ, DESCZ, 12, 3, IDUM1, IDUM2, INFO ) ELSE CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 3, IDUM1, $ IDUM2, INFO ) END IF * * Write the required workspace for lwork queries. * WORK( 1 ) = REAL( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PSSYEV', -INFO ) IF( WANTZ ) CALL BLACS_GRIDEXIT( CONTEXTC ) RETURN ELSE IF( LWORK .EQ. -1 ) THEN IF( WANTZ ) CALL BLACS_GRIDEXIT( CONTEXTC ) RETURN END IF * * Scale matrix to allowable range, if necessary. * ISCALE = 0 * ANRM = PSLANSY( 'M', UPLO, N, A, IA, JA, DESCA, WORK( INDWORK ) ) * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM END IF * IF( ISCALE.EQ.1 ) THEN CALL PSLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) END IF * * Reduce symmetric matrix to tridiagonal form. * CALL PSSYTRD( UPLO, N, A, IA, JA, DESCA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * Copy the values of D, E to all processes. * DO 10 I=1,N CALL PSELGET( 'A', ' ', WORK(INDD2+I-1), A, $ I+IA-1, I+JA-1, DESCA ) 10 CONTINUE IF( LSAME( UPLO, 'U') ) THEN DO 20 I=1,N-1 CALL PSELGET( 'A', ' ', WORK(INDE2+I-1), A, $ I+IA-1, I+JA, DESCA ) 20 CONTINUE ELSE DO 30 I=1,N-1 CALL PSELGET( 'A', ' ', WORK(INDE2+I-1), A, $ I+IA, I+JA-1, DESCA ) 30 CONTINUE ENDIF * IF( WANTZ ) THEN * CALL PSLASET( 'Full', N, N, ZERO, ONE, WORK( INDWORK ), 1, 1, $ DESCQR ) * * SSTEQR2 is a modified version of LAPACK's SSTEQR. The * modifications allow each process to perform partial updates * to matrix Q. * CALL SSTEQR2( 'I', N, WORK( INDD2 ), WORK( INDE2 ), $ WORK( INDWORK ), LDC, NRC, WORK( INDWORK2 ), $ INFO ) * CALL PSGEMR2D( N, N, WORK( INDWORK ), 1, 1, DESCQR, Z, IA, JA, $ DESCZ, CONTEXTC ) * CALL PSORMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) * ELSE * CALL SSTEQR2( 'N', N, WORK( INDD2 ), WORK( INDE2 ), $ WORK( INDWORK ), 1, 1, WORK( INDWORK2 ), $ INFO ) ENDIF * * Copy eigenvalues from workspace to output array * CALL SCOPY( N, WORK( INDD2 ), 1, W, 1 ) * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE .EQ. 1 ) THEN CALL SSCAL( N, ONE / SIGMA, W, 1 ) END IF * * Free up resources * IF( WANTZ ) THEN CALL BLACS_GRIDEXIT( CONTEXTC ) END IF * * Compare every ith eigenvalue, or all if there are only a few, * across the process grid to check for heterogeneity. * IF( N.LE.ITHVAL ) THEN J = N K = 1 ELSE J = N/ITHVAL K = ITHVAL END IF * DO 40 I = 1, J WORK( I+INDTAU ) = W( (I-1)*K+1 ) WORK( I+INDE ) = W( (I-1)*K+1 ) 40 CONTINUE * CALL SGAMN2D( DESCA( CTXT_ ), 'a', ' ', J, 1, WORK( 1+INDTAU ), $ J, 1, 1, -1, -1, 0 ) CALL SGAMX2D( DESCA( CTXT_ ), 'a', ' ', J, 1, WORK( 1+INDE ), $ J, 1, 1, -1, -1, 0 ) * DO 50 I = 1, J IF( INFO.EQ.0 .AND. ( WORK( I+INDTAU )-WORK( I+INDE ) $ .NE. ZERO ) )THEN INFO = N+1 END IF 50 CONTINUE * RETURN * * End of PSSYEV * END scalapack-2.0.2/SRC/pssyevd.f000644 000766 000024 00000031236 10363532303 016207 0ustar00juliestaff000000 000000 SUBROUTINE PSSYEVD( JOBZ, UPLO, N, A, IA, JA, DESCA, W, Z, IZ, JZ, $ DESCZ, WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 14, 2000 * * .. Scalar Arguments .. CHARACTER JOBZ, UPLO INTEGER IA, INFO, IZ, JA, JZ, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) REAL A( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSSYEVD computes all the eigenvalues and eigenvectors * of a real symmetric matrix A by calling the recommended sequence * of ScaLAPACK routines. * * In its present form, PSSYEVD assumes a homogeneous system and makes * no checks for consistency of the eigenvalues or eigenvectors across * the different processes. Because of this, it is possible that a * heterogeneous system may return incorrect results without any error * messages. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; (NOT IMPLEMENTED YET) * = 'V': Compute eigenvalues and eigenvectors. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/workspace) block cyclic REAL array, * global dimension (N, N), local dimension ( LLD_A, * LOCc(JA+N-1) ) * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * W (global output) REAL array, dimension (N) * If INFO=0, the eigenvalues in ascending order. * * Z (local output) REAL array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * Z contains the orthonormal eigenvectors * of the symmetric matrix A. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) REAL array, * dimension (LWORK) * On output, WORK(1) returns the workspace required. * * LWORK (local input) INTEGER * LWORK >= MAX( 1+6*N+2*NP*NQ, TRILWMIN ) + 2*N * TRILWMIN = 3*N + MAX( NB*( NP+1 ), 3*NB ) * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) * NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * * If LWORK = -1, the LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the WORK array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * On exit, if LIWORK > 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. * LIWORK = 7*N + 8*NPCOL + 2 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: The algorithm failed to compute the INFO/(N+1) th * eigenvalue while working on the submatrix lying in * global rows and columns mod(INFO,N+1). * * Alignment requirements * ====================== * * The distributed submatrices sub( A ), sub( Z ) must verify * some alignment properties, namely the following expression * should be true: * ( MB_A.EQ.NB_A.EQ.MB_Z.EQ.NB_Z .AND. IROFFA.EQ.ICOFFA .AND. * IROFFA.EQ.0 .AND.IROFFA.EQ.IROFFZ. AND. IAROW.EQ.IZROW) * with IROFFA = MOD( IA-1, MB_A ) * and ICOFFA = MOD( JA-1, NB_A ). * * Further Details * ======= ======= * * Contributed by Francoise Tisseur, University of Manchester. * * Reference: F. Tisseur and J. Dongarra, "A Parallel Divide and * Conquer Algorithm for the Symmetric Eigenvalue Problem * on Distributed Memory Architectures", * SIAM J. Sci. Comput., 6:20 (1999), pp. 2223--2236. * (see also LAPACK Working Note 132) * http://www.netlib.org/lapack/lawns/lawn132.ps * * ===================================================================== * * .. Parameters .. * INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER IACOL, IAROW, ICOFFA, ICOFFZ, ICTXT, IINFO, $ INDD, INDE, INDE2, INDTAU, INDWORK, INDWORK2, $ IROFFA, IROFFZ, ISCALE, LIWMIN, LLWORK, $ LLWORK2, LWMIN, MYCOL, MYROW, NB, NP, NPCOL, $ NPROW, NQ, OFFSET, TRILWMIN REAL ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, $ SMLNUM * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC REAL PSLAMCH, PSLANSY EXTERNAL LSAME, INDXG2P, NUMROC, PSLAMCH, PSLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSLARED1D, $ PSLASCL, PSLASET, PSORMTR, PSSTEDC, PSSYTRD, $ PXERBLA, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return * IF( N.EQ.0 ) $ RETURN * * Test the input arguments. * CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IZ, JZ, DESCZ, 12, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFZ = MOD( IZ-1, DESCZ( MB_ ) ) ICOFFZ = MOD( JZ-1, DESCZ( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) TRILWMIN = 3*N + MAX( NB*( NP+1 ), 3*NB ) LWMIN = MAX( 1+6*N+2*NP*NQ, TRILWMIN ) + 2*N LIWMIN = 7*N + 8*NPCOL + 2 WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN IF( .NOT.LSAME( JOBZ, 'V' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( IROFFA.NE.IROFFZ .OR. ICOFFA.NE.ICOFFZ ) THEN INFO = -10 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 1200+M_ ) ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1200+NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1200+MB_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 1200+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 1200+CSRC_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -14 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 2 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 14 CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYEVD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Set up pointers into the WORK array * INDTAU = 1 INDE = INDTAU + N INDD = INDE + N INDE2 = INDD + N INDWORK = INDE2 + N LLWORK = LWORK - INDWORK + 1 INDWORK2 = INDD LLWORK2 = LWORK - INDWORK2 + 1 * * Scale matrix to allowable range, if necessary. * ISCALE = 0 SAFMIN = PSLAMCH( DESCA( CTXT_ ), 'Safe minimum' ) EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) ANRM = PSLANSY( 'M', UPLO, N, A, IA, JA, DESCA, WORK( INDWORK ) ) * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM END IF * IF( ISCALE.EQ.1 ) THEN CALL PSLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) END IF * * Reduce symmetric matrix to tridiagonal form. * * CALL PSSYTRD( UPLO, N, A, IA, JA, DESCA, WORK( INDD ), $ WORK( INDE2 ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * Copy the values of D, E to all processes. * CALL PSLARED1D( N, IA, JA, DESCA, WORK( INDD ), W, $ WORK( INDWORK ), LLWORK ) * CALL PSLARED1D( N, IA, JA, DESCA, WORK( INDE2 ), WORK( INDE ), $ WORK( INDWORK ), LLWORK ) * CALL PSLASET( 'Full', N, N, ZERO, ONE, Z, 1, 1, DESCZ ) * IF( UPPER ) THEN OFFSET = 1 ELSE OFFSET = 0 END IF CALL PSSTEDC( 'I', N, W, WORK( INDE+OFFSET ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK2 ), LLWORK2, IWORK, LIWORK, INFO ) * CALL PSORMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, WORK( INDWORK2 ), $ LLWORK2, IINFO ) * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL SSCAL( N, ONE / SIGMA, W, 1 ) END IF * RETURN * * End of PSSYEVD * END scalapack-2.0.2/SRC/pssyevr.f000644 000766 000024 00000127656 11750130340 016235 0ustar00juliestaff000000 000000 SUBROUTINE PSSYEVR( JOBZ, RANGE, UPLO, N, A, IA, JA, $ DESCA, VL, VU, IL, IU, M, NZ, W, Z, IZ, $ JZ, DESCZ, WORK, LWORK, IWORK, LIWORK, $ INFO ) IMPLICIT NONE * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, INFO, IU, IZ, JA, JZ, LIWORK, LWORK, M, $ N, NZ REAL VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) REAL A( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSSYEVR computes selected eigenvalues and, optionally, eigenvectors * of a real symmetric matrix A distributed in 2D blockcyclic format * by calling the recommended sequence of ScaLAPACK routines. * * First, the matrix A is reduced to real symmetric tridiagonal form. * Then, the eigenproblem is solved using the parallel MRRR algorithm. * Last, if eigenvectors have been computed, a backtransformation is done. * * Upon successful completion, each processor stores a copy of all computed * eigenvalues in W. The eigenvector matrix Z is stored in * 2D blockcyclic format distributed over all processors. * * Note that subsets of eigenvalues/vectors can be selected by * specifying a range of values or a range of indices for the desired * eigenvalues. * * For constructive feedback and comments, please contact cvoemel@lbl.gov * C. Voemel * * Arguments * ========= * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0 * * A (local input/workspace) 2D block cyclic REAL array, * global dimension (N, N), * local dimension ( LLD_A, LOCc(JA+N-1) ), * (see Notes below for more detailed explanation of 2d arrays) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * DESCA (global and local input) INTEGER array of dimension DLEN=9. * The array descriptor for the distributed matrix A. * The descriptor stores details about the 2D block-cyclic * storage, see the notes below. * If DESCA is incorrect, PSSYEVR cannot guarantee * correct error reporting. * Also note the array alignment requirements specified below. * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A'. * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M * * W (global output) REAL array, dimension (N) * Upon successful exit, the first M entries contain the selected * eigenvalues in ascending order. * * Z (local output) REAL array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * (see Notes below for more detailed explanation of 2d arrays) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * The context DESCZ( CTXT_ ) must equal DESCA( CTXT_ ). * Also note the array alignment requirements specified below. * * WORK (local workspace/output) REAL array, * dimension (LWORK) * On return, WORK(1) contains the optimal amount of * workspace required for efficient execution. * if JOBZ='N' WORK(1) = optimal amount of workspace * required to compute the eigenvalues. * if JOBZ='V' WORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors. * * LWORK (local input) INTEGER * Size of WORK, must be at least 3. * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= 2 + 5*N + MAX( 12 * NN, NB * ( NP0 + 1 ) ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required is: * LWORK >= 2 + 5*N + MAX( 18*NN, NP0 * MQ0 + 2 * NB * NB ) + * (2 + ICEIL( NEIG, NPROW*NPCOL))*NN * * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance for all work arrays. Each of * these values is returned in the first entry of the * corresponding work arrays, and no error message is issued by * PXERBLA. * Note that in a workspace query, for performance the optimal * workspace LWOPT is returned rather than the minimum necessary * WORKSPACE LWMIN. For very small matrices, LWOPT >> LWMIN. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * * Let NNP = MAX( N, NPROW*NPCOL + 1, 4 ). Then: * LIWORK >= 12*NNP + 2*N when the eigenvectors are desired * LIWORK >= 10*NNP + 2*N when only the eigenvalues have to be computed * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA, * or DESCZ for the descriptor of Z, etc. * The length of a ScaLAPACK descriptor is nine. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * PSSYEVR assumes IEEE 754 standard compliant arithmetic. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and Z(IZ:IZ+M-1,JZ:JZ+N-1) * must satisfy the following alignment properties: * * 1.Identical (quadratic) dimension: * DESCA(M_) = DESCZ(M_) = DESCA(N_) = DESCZ(N_) * 2.Quadratic conformal blocking: * DESCA(MB_) = DESCA(NB_) = DESCZ(MB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * 3.MOD( IA-1, MB_A ) = MOD( IZ-1, MB_Z ) = 0 * 4.IAROW = IZROW * * * .. Parameters .. INTEGER CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_ PARAMETER ( CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8 ) REAL ZERO PARAMETER ( ZERO = 0.0E0 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, COLBRT, DOBCST, FINISH, FIRST, INDEIG, $ LOWER, LQUERY, VALEIG, VSTART, WANTZ INTEGER ANB, DOL, DOU, DSTCOL, DSTROW, EIGCNT, FRSTCL, $ I, IAROW, ICTXT, IIL, IINDERR, IINDWLC, IINFO, $ IIU, IM, INDD, INDD2, INDE, INDE2, INDERR, $ INDILU, INDRW, INDTAU, INDWLC, INDWORK, IPIL, $ IPIU, IPROC, IZROW, LASTCL, LENGTHI, LENGTHI2, $ LIWMIN, LLWORK, LWMIN, LWOPT, MAXCLS, MQ00, $ MYCOL, MYIL, MYIU, MYPROC, MYROW, MZ, NB, $ NDEPTH, NEEDIL, NEEDIU, NNP, NP00, NPCOL, $ NPROCS, NPROW, NPS, NSPLIT, NSYTRD_LWOPT, $ OFFSET, PARITY, RLENGTHI, RLENGTHI2, RSTARTI, $ SIZE1, SIZE2, SQNPC, SRCCOL, SRCROW, STARTI, $ ZOFFSET REAL PIVMIN, SAFMIN, SCALE, VLL, VUU, WL, $ WU * * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PJLAENV, $ PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGEBR2D, IGEBS2D, $ IGERV2D, IGESD2D, IGSUM2D, PCHK1MAT, PCHK2MAT, $ PSELGET, PSLAEVSWP, PSLARED1D, PSORMTR, $ PSSYNTRD, PXERBLA, SCOPY, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D, SLARRC, SLASRT2, $ SSTEGR2A, SSTEGR2B, SSTEGR2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, ICHAR, INT, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * INFO = 0 *********************************************************************** * * Decode character arguments to find out what the code should do * *********************************************************************** WANTZ = LSAME( JOBZ, 'V' ) LOWER = LSAME( UPLO, 'L' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) *********************************************************************** * * GET MACHINE PARAMETERS * *********************************************************************** ICTXT = DESCA( CTXT_ ) SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) *********************************************************************** * * Set up pointers into the WORK array * *********************************************************************** INDTAU = 1 INDD = INDTAU + N INDE = INDD + N + 1 INDD2 = INDE + N + 1 INDE2 = INDD2 + N INDWORK = INDE2 + N LLWORK = LWORK - INDWORK + 1 *********************************************************************** * * BLACS PROCESSOR GRID SETUP * *********************************************************************** CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW * NPCOL MYPROC = MYROW * NPCOL + MYCOL IF( NPROW.EQ.-1 ) THEN INFO = -( 800+CTXT_ ) ELSE IF( WANTZ ) THEN IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF *********************************************************************** * * COMPUTE REAL WORKSPACE * *********************************************************************** IF ( ALLEIG ) THEN MZ = N ELSE IF ( INDEIG ) THEN MZ = IU - IL + 1 ELSE * Take upper bound for VALEIG case MZ = N END IF * NB = DESCA( NB_ ) IF ( WANTZ ) THEN NP00 = NUMROC( N, NB, 0, 0, NPROW ) MQ00 = NUMROC( MZ, NB, 0, 0, NPCOL ) INDRW = INDWORK + MAX(18*N, NP00*MQ00 + 2*NB*NB) LWMIN = INDRW - 1 + (ICEIL(MZ, NPROCS) + 2)*N ELSE INDRW = INDWORK + 12*N LWMIN = INDRW - 1 END IF * The code that validates the input requires 3 workspace entries LWMIN = MAX(3, LWMIN) LWOPT = LWMIN ANB = PJLAENV( ICTXT, 3, 'PSSYTTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( REAL( NPROCS ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NSYTRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS LWOPT = MAX( LWOPT, 5*N+NSYTRD_LWOPT ) * SIZE1 = INDRW - INDWORK *********************************************************************** * * COMPUTE INTEGER WORKSPACE * *********************************************************************** NNP = MAX( N, NPROCS+1, 4 ) IF ( WANTZ ) THEN LIWMIN = 12*NNP + 2*N ELSE LIWMIN = 10*NNP + 2*N END IF *********************************************************************** * * Set up pointers into the IWORK array * *********************************************************************** * Pointer to eigenpair distribution over processors INDILU = LIWMIN - 2*NPROCS + 1 SIZE2 = INDILU - 2*N *********************************************************************** * * Test the input arguments. * *********************************************************************** IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 21, INFO ) * IF( INFO.EQ.0 ) THEN IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -3 ELSE IF( MOD( IA-1, DESCA( MB_ ) ).NE.0 ) THEN INFO = -6 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -10 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -11 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -12 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -21 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 800+NB_ ) END IF IF( WANTZ ) THEN IAROW = INDXG2P( 1, DESCA( NB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) IZROW = INDXG2P( 1, DESCA( NB_ ), MYROW, $ DESCZ( RSRC_ ), NPROW ) IF( IAROW.NE.IZROW ) THEN INFO = -19 ELSE IF( MOD( IA-1, DESCA( MB_ ) ).NE. $ MOD( IZ-1, DESCZ( MB_ ) ) ) THEN INFO = -19 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2100+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2100+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2100+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2100+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2100+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2100+CSRC_ ) ELSE IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( ALLEIG ) THEN IDUM1( 3 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 3 ) = ICHAR( 'I' ) ELSE IDUM1( 3 ) = ICHAR( 'V' ) END IF IDUM2( 3 ) = 3 IF( LQUERY ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 4 IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 8, N, 4, N, 4, IZ, $ JZ, DESCZ, 21, 4, IDUM1, IDUM2, INFO ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 4, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = REAL( LWOPT ) IWORK( 1 ) = LIWMIN END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYEVR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF *********************************************************************** * * Quick return if possible * *********************************************************************** IF( N.EQ.0 ) THEN IF( WANTZ ) THEN NZ = 0 END IF M = 0 WORK( 1 ) = REAL( LWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF IF( VALEIG ) THEN VLL = VL VUU = VU ELSE VLL = ZERO VUU = ZERO END IF * * No scaling done here, leave this to MRRR kernel. * Scale tridiagonal rather than full matrix. * *********************************************************************** * * REDUCE SYMMETRIC MATRIX TO TRIDIAGONAL FORM. * *********************************************************************** CALL PSSYNTRD( UPLO, N, A, IA, JA, DESCA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'PSSYNTRD', -IINFO ) RETURN END IF *********************************************************************** * * DISTRIBUTE TRIDIAGONAL TO ALL PROCESSORS * *********************************************************************** OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. $ DESCA( RSRC_ ).EQ.0 .AND. DESCA( CSRC_ ).EQ.0 ) $ THEN CALL PSLARED1D( N, IA, JA, DESCA, WORK( INDD ), WORK( INDD2 ), $ WORK( INDWORK ), LLWORK ) * CALL PSLARED1D( N, IA, JA, DESCA, WORK( INDE ), WORK( INDE2 ), $ WORK( INDWORK ), LLWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PSELGET( 'A', ' ', WORK( INDD2+I-1 ), A, I+IA-1, $ I+JA-1, DESCA ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PSELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA-1, $ I+JA, DESCA ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PSELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA, $ I+JA-1, DESCA ) 30 CONTINUE END IF END IF *********************************************************************** * * SET IIL, IIU * *********************************************************************** IF ( ALLEIG ) THEN IIL = 1 IIU = N ELSE IF ( INDEIG ) THEN IIL = IL IIU = IU ELSE IF ( VALEIG ) THEN CALL SLARRC('T', N, VLL, VUU, WORK( INDD2 ), $ WORK( INDE2 + OFFSET ), SAFMIN, EIGCNT, IIL, IIU, INFO) * Refine upper bound N that was taken MZ = EIGCNT IIL = IIL + 1 ENDIF IF(MZ.EQ.0) THEN M = 0 IF( WANTZ ) THEN NZ = 0 END IF WORK( 1 ) = REAL( LWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF MYIL = 0 MYIU = 0 M = 0 IM = 0 *********************************************************************** * * COMPUTE WORK ASSIGNMENTS * *********************************************************************** * * Each processor computes the work assignments for all processors * CALL PMPIM2( IIL, IIU, NPROCS, $ IWORK(INDILU), IWORK(INDILU+NPROCS) ) * * Find local work assignment * MYIL = IWORK(INDILU+MYPROC) MYIU = IWORK(INDILU+NPROCS+MYPROC) ZOFFSET = MAX(0, MYIL - IIL - 1) FIRST = ( MYIL .EQ. IIL ) *********************************************************************** * * CALLS TO MRRR KERNEL * *********************************************************************** IF(.NOT.WANTZ) THEN * * Compute eigenvalues only. * IINFO = 0 IF ( MYIL.GT.0 ) THEN DOL = 1 DOU = MYIU - MYIL + 1 CALL SSTEGR2( JOBZ, 'I', N, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), VLL, VUU, MYIL, MYIU, $ IM, W( 1 ), WORK( INDRW ), N, $ MYIU - MYIL + 1, $ IWORK( 1 ), WORK( INDWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, $ DOL, DOU, ZOFFSET, IINFO ) * SSTEGR2 zeroes out the entire W array, so we can't just give * it the part of W we need. So here we copy the W entries into * their correct location DO 49 I = 1, IM W( MYIL-IIL+I ) = W( I ) 49 CONTINUE * W( MYIL ) is at W( MYIL - IIL + 1 ) * W( X ) is at W(X - IIL + 1 ) END IF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'SSTEGR2', -IINFO ) RETURN END IF ELSEIF ( WANTZ .AND. NPROCS.EQ.1 ) THEN * * Compute eigenvalues and -vectors, but only on one processor * IINFO = 0 IF ( MYIL.GT.0 ) THEN DOL = MYIL - IIL + 1 DOU = MYIU - IIL + 1 CALL SSTEGR2( JOBZ, 'I', N, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), VLL, VUU, IIL, IIU, $ IM, W( 1 ), WORK( INDRW ), N, $ N, $ IWORK( 1 ), WORK( INDWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, DOU, $ ZOFFSET, IINFO ) ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'SSTEGR2', -IINFO ) RETURN END IF ELSEIF ( WANTZ ) THEN * * Compute representations in parallel. * Share eigenvalue computation for root between all processors * Then compute the eigenvectors. * IINFO = 0 * Part 1. compute root representations and root eigenvalues IF ( MYIL.GT.0 ) THEN DOL = MYIL - IIL + 1 DOU = MYIU - IIL + 1 CALL SSTEGR2A( JOBZ, 'I', N, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), VLL, VUU, IIL, IIU, $ IM, W( 1 ), WORK( INDRW ), N, $ N, WORK( INDWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, $ DOU, NEEDIL, NEEDIU, $ INDERR, NSPLIT, PIVMIN, SCALE, WL, WU, $ IINFO ) ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'SSTEGR2A', -IINFO ) RETURN END IF * * The second part of parallel MRRR, the representation tree * construction begins. Upon successful completion, the * eigenvectors have been computed. This is indicated by * the flag FINISH. * VSTART = .TRUE. FINISH = (MYIL.LE.0) C Part 2. Share eigenvalues and uncertainties between all processors IINDERR = INDWORK + INDERR - 1 * * * There are currently two ways to communicate eigenvalue information * using the BLACS. * 1.) BROADCAST * 2.) POINT2POINT between collaborators (those processors working * jointly on a cluster. * For efficiency, BROADCAST has been disabled. * At a later stage, other more efficient communication algorithms * might be implemented, e. g. group or tree-based communication. * DOBCST = .FALSE. IF(DOBCST) THEN * First gather everything on the first processor. * Then use BROADCAST-based communication DO 45 I = 2, NPROCS IF (MYPROC .EQ. (I - 1)) THEN DSTROW = 0 DSTCOL = 0 STARTI = DOL IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF (( STARTI.GE.1 ) .AND. ( LENGTHI.GE.1 )) THEN LENGTHI2 = 2*LENGTHI * Copy eigenvalues into communication buffer CALL SCOPY(LENGTHI,W( STARTI ),1, $ WORK( INDD ), 1) * Copy uncertainties into communication buffer CALL SCOPY(LENGTHI,WORK( IINDERR+STARTI-1 ),1, $ WORK( INDD+LENGTHI ), 1) * send buffer CALL SGESD2D( ICTXT, LENGTHI2, $ 1, WORK( INDD ), LENGTHI2, $ DSTROW, DSTCOL ) END IF ELSE IF (MYPROC .EQ. 0) THEN SRCROW = (I-1) / NPCOL SRCCOL = MOD(I-1, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) STARTI = IWORK(1) LENGTHI = IWORK(2) IF (( STARTI.GE.1 ) .AND. ( LENGTHI.GE.1 )) THEN LENGTHI2 = 2*LENGTHI * receive buffer CALL SGERV2D( ICTXT, LENGTHI2, 1, $ WORK(INDD), LENGTHI2, SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL SCOPY( LENGTHI, WORK(INDD), 1, $ W( STARTI ), 1) * copy uncertainties (errors) from communication buffer CALL SCOPY(LENGTHI,WORK(INDD+LENGTHI),1, $ WORK( IINDERR+STARTI-1 ), 1) END IF END IF 45 CONTINUE LENGTHI = IIU - IIL + 1 LENGTHI2 = LENGTHI * 2 IF (MYPROC .EQ. 0) THEN * Broadcast eigenvalues and errors to all processors CALL SCOPY(LENGTHI,W ,1, WORK( INDD ), 1) CALL SCOPY(LENGTHI,WORK( IINDERR ),1, $ WORK( INDD+LENGTHI ), 1) CALL SGEBS2D( ICTXT, 'A', ' ', LENGTHI2, 1, $ WORK(INDD), LENGTHI2 ) ELSE SRCROW = 0 SRCCOL = 0 CALL SGEBR2D( ICTXT, 'A', ' ', LENGTHI2, 1, $ WORK(INDD), LENGTHI2, SRCROW, SRCCOL ) CALL SCOPY( LENGTHI, WORK(INDD), 1, W, 1) CALL SCOPY(LENGTHI,WORK(INDD+LENGTHI),1, $ WORK( IINDERR ), 1) END IF ELSE * * Enable point2point communication between collaborators * * Find collaborators of MYPROC IF( (NPROCS.GT.1).AND.(MYIL.GT.0) ) THEN CALL PMPCOL( MYPROC, NPROCS, IIL, NEEDIL, NEEDIU, $ IWORK(INDILU), IWORK(INDILU+NPROCS), $ COLBRT, FRSTCL, LASTCL ) ELSE COLBRT = .FALSE. ENDIF IF(COLBRT) THEN * If the processor collaborates with others, * communicate information. DO 47 IPROC = FRSTCL, LASTCL IF (MYPROC .EQ. IPROC) THEN STARTI = DOL IWORK(1) = STARTI LENGTHI = MYIU - MYIL + 1 IWORK(2) = LENGTHI IF ((STARTI.GE.1) .AND. (LENGTHI.GE.1)) THEN * Copy eigenvalues into communication buffer CALL SCOPY(LENGTHI,W( STARTI ),1, $ WORK(INDD), 1) * Copy uncertainties into communication buffer CALL SCOPY(LENGTHI, $ WORK( IINDERR+STARTI-1 ),1, $ WORK(INDD+LENGTHI), 1) ENDIF DO 46 I = FRSTCL, LASTCL IF(I.EQ.MYPROC) GOTO 46 DSTROW = I/ NPCOL DSTCOL = MOD(I, NPCOL) CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1) .AND. (LENGTHI.GE.1)) THEN LENGTHI2 = 2*LENGTHI * send buffer CALL SGESD2D( ICTXT, LENGTHI2, $ 1, WORK(INDD), LENGTHI2, $ DSTROW, DSTCOL ) END IF 46 CONTINUE ELSE SRCROW = IPROC / NPCOL SRCCOL = MOD(IPROC, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) RSTARTI = IWORK(1) RLENGTHI = IWORK(2) IF ((RSTARTI.GE.1 ) .AND. (RLENGTHI.GE.1 )) THEN RLENGTHI2 = 2*RLENGTHI CALL SGERV2D( ICTXT, RLENGTHI2, 1, $ WORK(INDE), RLENGTHI2, $ SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL SCOPY( RLENGTHI, WORK(INDE), 1, $ W( RSTARTI ), 1) * copy uncertainties (errors) from communication buffer CALL SCOPY(RLENGTHI,WORK(INDE+RLENGTHI),1, $ WORK( IINDERR+RSTARTI-1 ), 1) END IF END IF 47 CONTINUE ENDIF ENDIF * * Part 3. Compute representation tree and eigenvectors. * What follows is a loop in which the tree * is constructed in parallel from top to bottom, * on level at a time, until all eigenvectors * have been computed. * 100 CONTINUE IF ( MYIL.GT.0 ) THEN CALL SSTEGR2B( JOBZ, N, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), $ IM, W( 1 ), WORK( INDRW ), N, N, $ IWORK( 1 ), WORK( INDWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, $ DOU, NEEDIL, NEEDIU, INDWLC, $ PIVMIN, SCALE, WL, WU, $ VSTART, FINISH, $ MAXCLS, NDEPTH, PARITY, ZOFFSET, IINFO ) IINDWLC = INDWORK + INDWLC - 1 IF(.NOT.FINISH) THEN IF((NEEDIL.LT.DOL).OR.(NEEDIU.GT.DOU)) THEN CALL PMPCOL( MYPROC, NPROCS, IIL, NEEDIL, NEEDIU, $ IWORK(INDILU), IWORK(INDILU+NPROCS), $ COLBRT, FRSTCL, LASTCL ) ELSE COLBRT = .FALSE. FRSTCL = MYPROC LASTCL = MYPROC ENDIF * * Check if this processor collaborates, i.e. * communication is needed. * IF(COLBRT) THEN DO 147 IPROC = FRSTCL, LASTCL IF (MYPROC .EQ. IPROC) THEN STARTI = DOL IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN * Copy eigenvalues into communication buffer CALL SCOPY(LENGTHI, $ WORK( IINDWLC+STARTI-1 ),1, $ WORK(INDD), 1) * Copy uncertainties into communication buffer CALL SCOPY(LENGTHI, $ WORK( IINDERR+STARTI-1 ),1, $ WORK(INDD+LENGTHI), 1) ENDIF DO 146 I = FRSTCL, LASTCL IF(I.EQ.MYPROC) GOTO 146 DSTROW = I/ NPCOL DSTCOL = MOD(I, NPCOL) CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN LENGTHI2 = 2*LENGTHI * send buffer CALL SGESD2D( ICTXT, LENGTHI2, $ 1, WORK(INDD), LENGTHI2, $ DSTROW, DSTCOL ) END IF 146 CONTINUE ELSE SRCROW = IPROC / NPCOL SRCCOL = MOD(IPROC, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) RSTARTI = IWORK(1) RLENGTHI = IWORK(2) IF ((RSTARTI.GE.1).AND.(RLENGTHI.GE.1)) THEN RLENGTHI2 = 2*RLENGTHI CALL SGERV2D( ICTXT,RLENGTHI2, 1, $ WORK(INDE),RLENGTHI2, $ SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL SCOPY(RLENGTHI, WORK(INDE), 1, $ WORK( IINDWLC+RSTARTI-1 ), 1) * copy uncertainties (errors) from communication buffer CALL SCOPY(RLENGTHI,WORK(INDE+RLENGTHI),1, $ WORK( IINDERR+RSTARTI-1 ), 1) END IF END IF 147 CONTINUE ENDIF GOTO 100 ENDIF ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'SSTEGR2B', -IINFO ) RETURN END IF * ENDIF * *********************************************************************** * * MAIN PART ENDS HERE * *********************************************************************** * *********************************************************************** * * ALLGATHER: EACH PROCESSOR SENDS ITS EIGENVALUES TO THE FIRST ONE, * THEN THE FIRST PROCESSOR BROADCASTS ALL EIGENVALUES * *********************************************************************** * DO 50 I = 2, NPROCS IF (MYPROC .EQ. (I - 1)) THEN DSTROW = 0 DSTCOL = 0 STARTI = MYIL - IIL + 1 IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN CALL SGESD2D( ICTXT, LENGTHI, $ 1, W( STARTI ), LENGTHI, $ DSTROW, DSTCOL ) ENDIF ELSE IF (MYPROC .EQ. 0) THEN SRCROW = (I-1) / NPCOL SRCCOL = MOD(I-1, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) STARTI = IWORK(1) LENGTHI = IWORK(2) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN CALL SGERV2D( ICTXT, LENGTHI, 1, $ W( STARTI ), LENGTHI, SRCROW, SRCCOL ) ENDIF ENDIF 50 CONTINUE * Accumulate M from all processors M = IM CALL IGSUM2D( ICTXT, 'A', ' ', 1, 1, M, 1, -1, -1 ) * Broadcast eigenvalues to all processors IF (MYPROC .EQ. 0) THEN * Send eigenvalues CALL SGEBS2D( ICTXT, 'A', ' ', M, 1, W, M ) ELSE SRCROW = 0 SRCCOL = 0 CALL SGEBR2D( ICTXT, 'A', ' ', M, 1, $ W, M, SRCROW, SRCCOL ) END IF * * Sort the eigenvalues and keep permutation in IWORK to * sort the eigenvectors accordingly * DO 160 I = 1, M IWORK( NPROCS+1+I ) = I 160 CONTINUE CALL SLASRT2( 'I', M, W, IWORK( NPROCS+2 ), IINFO ) IF (IINFO.NE.0) THEN CALL PXERBLA( ICTXT, 'SLASRT2', -IINFO ) RETURN END IF *********************************************************************** * * TRANSFORM Z FROM 1D WORKSPACE INTO 2D BLOCKCYCLIC STORAGE * *********************************************************************** IF ( WANTZ ) THEN DO 170 I = 1, M IWORK( M+NPROCS+1+IWORK( NPROCS+1+I ) ) = I 170 CONTINUE * Store NVS in IWORK(1:NPROCS+1) for PSLAEVSWP IWORK( 1 ) = 0 DO 180 I = 1, NPROCS * Find IL and IU for processor i-1 * Has already been computed by PMPIM2 and stored IPIL = IWORK(INDILU+I-1) IPIU = IWORK(INDILU+NPROCS+I-1) IF (IPIL .EQ. 0) THEN IWORK( I + 1 ) = IWORK( I ) ELSE IWORK( I + 1 ) = IWORK( I ) + IPIU - IPIL + 1 ENDIF 180 CONTINUE IF ( FIRST ) THEN CALL PSLAEVSWP(N, WORK( INDRW ), N, Z, IZ, JZ, $ DESCZ, IWORK( 1 ), IWORK( NPROCS+M+2 ), WORK( INDWORK ), $ INDRW - INDWORK ) ELSE CALL PSLAEVSWP(N, WORK( INDRW + N ), N, Z, IZ, JZ, $ DESCZ, IWORK( 1 ), IWORK( NPROCS+M+2 ), WORK( INDWORK ), $ INDRW - INDWORK ) END IF * NZ = M * *********************************************************************** * * Compute eigenvectors of A from eigenvectors of T * *********************************************************************** IF( NZ.GT.0 ) THEN CALL PSORMTR( 'L', UPLO, 'N', N, NZ, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), SIZE1, IINFO ) END IF IF (IINFO.NE.0) THEN CALL PXERBLA( ICTXT, 'PSORMTR', -IINFO ) RETURN END IF * END IF * WORK( 1 ) = REAL( LWOPT ) IWORK( 1 ) = LIWMIN RETURN * * End of PSSYEVR * END scalapack-2.0.2/SRC/pssyevx.f000644 000766 000024 00000115052 11605326344 016240 0ustar00juliestaff000000 000000 SUBROUTINE PSSYEVX( JOBZ, RANGE, UPLO, N, A, IA, JA, DESCA, VL, $ VU, IL, IU, ABSTOL, M, NZ, W, ORFAC, Z, IZ, $ JZ, DESCZ, WORK, LWORK, IWORK, LIWORK, IFAIL, $ ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, INFO, IU, IZ, JA, JZ, LIWORK, LWORK, M, $ N, NZ REAL ABSTOL, ORFAC, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), ICLUSTR( * ), $ IFAIL( * ), IWORK( * ) REAL A( * ), GAP( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PSSYEVX computes selected eigenvalues and, optionally, eigenvectors * of a real symmetric matrix A by calling the recommended sequence * of ScaLAPACK routines. Eigenvalues/vectors can be selected by * specifying a range of values or a range of indices for the desired * eigenvalues. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * PSSYEVX assumes IEEE 754 standard compliant arithmetic. To port * to a system which does not have IEEE 754 arithmetic, modify * the appropriate SLmake.inc file to include the compiler switch * -DNO_IEEE. This switch only affects the compilation of pslaiect.c. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic REAL array, * global dimension (N, N), * local dimension ( LLD_A, LOCc(JA+N-1) ) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PSSYEVX cannot guarantee * correct error reporting. * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * ABSTOL (global input) REAL * If JOBZ='V', setting ABSTOL to PSLAMCH( CONTEXT, 'U') yields * the most orthogonal eigenvectors. * * The absolute error tolerance for the eigenvalues. * An approximate eigenvalue is accepted as converged * when it is determined to lie in an interval [a,b] * of width less than or equal to * * ABSTOL + EPS * max( |a|,|b| ) , * * where EPS is the machine precision. If ABSTOL is less than * or equal to zero, then EPS*norm(T) will be used in its place, * where norm(T) is the 1-norm of the tridiagonal matrix * obtained by reducing A to tridiagonal form. * * Eigenvalues will be computed most accurately when ABSTOL is * set to twice the underflow threshold 2*PSLAMCH('S') not zero. * If this routine returns with ((MOD(INFO,2).NE.0) .OR. * (MOD(INFO/8,2).NE.0)), indicating that some eigenvalues or * eigenvectors did not converge, try setting ABSTOL to * 2*PSLAMCH('S'). * * See "Computing Small Singular Values of Bidiagonal Matrices * with Guaranteed High Relative Accuracy," by Demmel and * Kahan, LAPACK Working Note #3. * * See "On the correctness of Parallel Bisection in Floating * Point" by Demmel, Dhillon and Ren, LAPACK Working Note #70 * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M unless the user supplies * insufficient space and PSSYEVX is not able to detect this * before beginning computation. To get all the eigenvectors * requested, the user must supply both sufficient * space to hold the eigenvectors in Z (M .LE. DESCZ(N_)) * and sufficient workspace to compute them. (See LWORK below.) * PSSYEVX is always able to detect insufficient space without * computation unless RANGE .EQ. 'V'. * * W (global output) REAL array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * ORFAC (global input) REAL * Specifies which eigenvectors should be reorthogonalized. * Eigenvectors that correspond to eigenvalues which are within * tol=ORFAC*norm(A) of each other are to be reorthogonalized. * However, if the workspace is insufficient (see LWORK), * tol may be decreased until all eigenvectors to be * reorthogonalized can be stored in one process. * No reorthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) REAL array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. If an eigenvector * fails to converge, then that column of Z contains the latest * approximation to the eigenvector, and the index of the * eigenvector is returned in IFAIL. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) REAL array, * dimension max(3,LWORK) * On return, WORK(1) contains the optimal amount of * workspace required for efficient execution. * if JOBZ='N' WORK(1) = optimal amount of workspace * required to compute eigenvalues efficiently * if JOBZ='V' WORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors * efficiently with no guarantee on orthogonality. * If RANGE='V', it is assumed that all eigenvectors * may be required. * * LWORK (local input) INTEGER * Size of WORK * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= 5 * N + MAX( 5 * NN, NB * ( NP0 + 1 ) ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * LWORK >= 5*N + MAX( 5*NN, NP0 * MQ0 + 2 * NB * NB ) + * ICEIL( NEIG, NPROW*NPCOL)*NN * * The computed eigenvectors may not be orthogonal if the * minimal workspace is supplied and ORFAC is too small. * If you want to guarantee orthogonality (at the cost * of potentially poor performance) you should add * the following to LWORK: * (CLUSTERSIZE-1)*N * where CLUSTERSIZE is the number of eigenvalues in the * largest cluster, where a cluster is defined as a set of * close eigenvalues: { W(K),...,W(K+CLUSTERSIZE-1) | * W(J+1) <= W(J) + ORFAC*2*norm(A) } * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * When LWORK is too small: * If LWORK is too small to guarantee orthogonality, * PSSYEVX attempts to maintain orthogonality in * the clusters with the smallest * spacing between the eigenvalues. * If LWORK is too small to compute all the eigenvectors * requested, no computation is performed and INFO=-23 * is returned. Note that when RANGE='V', PSSYEVX does * not know how many eigenvectors are requested until * the eigenvalues are computed. Therefore, when RANGE='V' * and as long as LWORK is large enough to allow PSSYEVX to * compute the eigenvalues, PSSYEVX will compute the * eigenvalues and as many eigenvectors as it can. * * Relationship between workspace, orthogonality & performance: * Greater performance can be achieved if adequate workspace * is provided. On the other hand, in some situations, * performance can decrease as the workspace provided * increases above the workspace amount shown below: * * For optimal performance, greater workspace may be * needed, i.e. * LWORK >= MAX( LWORK, 5*N + NSYTRD_LWOPT ) * Where: * LWORK, as defined previously, depends upon the number * of eigenvectors requested, and * NSYTRD_LWOPT = N + 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 3 ) * NPS * * ANB = PJLAENV( DESCA( CTXT_), 3, 'PSSYTTRD', 'L', * 0, 0, 0, 0) * SQNPC = INT( SQRT( DBLE( NPROW * NPCOL ) ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by * calling the subroutine BLACS_GRIDINFO. * * For large N, no extra workspace is needed, however the * biggest boost in performance comes for small N, so it * is wise to provide the extra workspace (typically less * than a Megabyte per process). * * If CLUSTERSIZE >= N/SQRT(NPROW*NPCOL), then providing * enough space to compute all the eigenvectors * orthogonally will cause serious degradation in * performance. In the limit (i.e. CLUSTERSIZE = N-1) * PSSTEIN will perform no better than SSTEIN on 1 * processor. * For CLUSTERSIZE = N/SQRT(NPROW*NPCOL) reorthogonalizing * all eigenvectors will increase the total execution time * by a factor of 2 or more. * For CLUSTERSIZE > N/SQRT(NPROW*NPCOL) execution time will * grow as the square of the cluster size, all other factors * remaining equal and assuming enough workspace. Less * workspace means less reorthogonalization but faster * execution. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance for all work arrays. Each of * these values is returned in the first entry of the * corresponding work arrays, and no error message is issued by * PXERBLA. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * LIWORK >= 6 * NNP * Where: * NNP = MAX( N, NPROW*NPCOL + 1, 4 ) * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If (MOD(INFO,2).NE.0) on exit, then * IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * * ICLUSTR (global output) integer array, dimension (2*NPROW*NPCOL) * This array contains indices of eigenvectors corresponding to * a cluster of eigenvalues that could not be reorthogonalized * due to insufficient workspace (see LWORK, ORFAC and INFO). * Eigenvectors corresponding to clusters of eigenvalues indexed * ICLUSTR(2*I-1) to ICLUSTR(2*I), could not be * reorthogonalized due to lack of workspace. Hence the * eigenvectors corresponding to these clusters may not be * orthogonal. ICLUSTR() is a zero terminated array. * (ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0) if and only if * K is the number of clusters * ICLUSTR is not referenced if JOBZ = 'N' * * GAP (global output) REAL array, * dimension (NPROW*NPCOL) * This array contains the gap between eigenvalues whose * eigenvectors could not be reorthogonalized. The output * values in this array correspond to the clusters indicated * by the array ICLUSTR. As a result, the dot product between * eigenvectors correspoding to the I^th cluster may be as high * as ( C * n ) / GAP(I) where C is a small constant. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: if (MOD(INFO,2).NE.0), then one or more eigenvectors * failed to converge. Their indices are stored * in IFAIL. Ensure ABSTOL=2.0*PSLAMCH( 'U' ) * Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding * to one or more clusters of eigenvalues could not be * reorthogonalized because of insufficient workspace. * The indices of the clusters are stored in the array * ICLUSTR. * if (MOD(INFO/4,2).NE.0), then space limit prevented * PSSYEVX from computing all of the eigenvectors * between VL and VU. The number of eigenvectors * computed is returned in NZ. * if (MOD(INFO/8,2).NE.0), then PSSTEBZ failed to compute * eigenvalues. Ensure ABSTOL=2.0*PSLAMCH( 'U' ) * Send e-mail to scalapack@cs.utk.edu * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * ( MB_A.EQ.NB_A.EQ.MB_Z .AND. IROFFA.EQ.IROFFZ .AND. IROFFA.EQ.0 .AND. * IAROW.EQ.IZROW ) * where * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * Differences between PSSYEVX and SSYEVX * ====================================== * * A, LDA -> A, IA, JA, DESCA * Z, LDZ -> Z, IZ, JZ, DESCZ * WORKSPACE needs are larger for PSSYEVX. * LIWORK parameter added * * ORFAC, ICLUSTER() and GAP() parameters added * meaning of INFO is changed * * Functional differences: * PSSYEVX does not promise orthogonality for eigenvectors associated * with tighly clustered eigenvalues. * PSSYEVX does not reorthogonalize eigenvectors * that are on different processes. The extent of reorthogonalization * is controlled by the input parameter LWORK. * * Version 1.4 limitations: * DESCA(MB_) = DESCA(NB_) * DESCA(M_) = DESCZ(M_) * DESCA(N_) = DESCZ(N_) * DESCA(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE, TEN, FIVE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TEN = 10.0E+0, $ FIVE = 5.0E+0 ) INTEGER IERREIN, IERRCLS, IERRSPC, IERREBZ PARAMETER ( IERREIN = 1, IERRCLS = 2, IERRSPC = 4, $ IERREBZ = 8 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, QUICKRETURN, $ VALEIG, WANTZ CHARACTER ORDER INTEGER ANB, CSRC_A, I, IAROW, ICOFFA, ICTXT, IINFO, $ INDD, INDD2, INDE, INDE2, INDIBL, INDISP, $ INDTAU, INDWORK, IROFFA, IROFFZ, ISCALE, $ ISIZESTEBZ, ISIZESTEIN, IZROW, LALLWORK, $ LIWMIN, LLWORK, LWMIN, LWOPT, MAXEIGS, MB_A, $ MQ0, MYCOL, MYROW, NB, NB_A, NEIG, NN, NNP, $ NP0, NPCOL, NPROCS, NPROW, NPS, NSPLIT, $ NSYTRD_LWOPT, NZZ, OFFSET, RSRC_A, RSRC_Z, $ SIZEORMTR, SIZESTEIN, SIZESYEVX, SQNPC REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, $ SIGMA, SMLNUM, VLL, VUU * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV REAL PSLAMCH, PSLANSY EXTERNAL LSAME, ICEIL, INDXG2P, NUMROC, PJLAENV, $ PSLAMCH, PSLANSY * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMN2D, PCHK1MAT, $ PCHK2MAT, PSELGET, PSLARED1D, PSLASCL, PSORMTR, $ PSSTEBZ, PSSTEIN, PSSYNTRD, PXERBLA, SGEBR2D, $ SGEBS2D, SLASRT, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, INT, MAX, MIN, MOD, REAL, $ SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * QUICKRETURN = ( N.EQ.0 ) * * Test the input arguments. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 * WANTZ = LSAME( JOBZ, 'V' ) IF( NPROW.EQ.-1 ) THEN INFO = -( 800+CTXT_ ) ELSE IF( WANTZ ) THEN IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 21, INFO ) * IF( INFO.EQ.0 ) THEN * * Get machine constants. * SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) EPS = PSLAMCH( ICTXT, 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * NPROCS = NPROW*NPCOL LOWER = LSAME( UPLO, 'L' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) * * Set up pointers into the WORK array * INDTAU = 1 INDE = INDTAU + N INDD = INDE + N INDD2 = INDD + N INDE2 = INDD2 + N INDWORK = INDE2 + N LLWORK = LWORK - INDWORK + 1 * * Set up pointers into the IWORK array * ISIZESTEIN = 3*N + NPROCS + 1 ISIZESTEBZ = MAX( 4*N, 14, NPROCS ) INDIBL = ( MAX( ISIZESTEIN, ISIZESTEBZ ) ) + 1 INDISP = INDIBL + N * * Compute the total amount of space needed * LQUERY = .FALSE. IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) $ LQUERY = .TRUE. * NNP = MAX( N, NPROCS+1, 4 ) LIWMIN = 6*NNP * NPROCS = NPROW*NPCOL NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A NN = MAX( N, NB, 2 ) * RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( 1, NB_A, MYROW, RSRC_A, NPROW ) NP0 = NUMROC( N+IROFFA, NB, 0, 0, NPROW ) MQ0 = NUMROC( N+ICOFFA, NB, 0, 0, NPCOL ) IF( WANTZ ) THEN RSRC_Z = DESCZ( RSRC_ ) IROFFZ = MOD( IZ-1, MB_A ) IZROW = INDXG2P( 1, NB_A, MYROW, RSRC_Z, NPROW ) ELSE IROFFZ = 0 IZROW = 0 END IF * IF( ( .NOT.WANTZ ) .OR. ( VALEIG .AND. ( .NOT.LQUERY ) ) ) $ THEN LWMIN = 5*N + MAX( 5*NN, NB*( NP0+1 ) ) IF( WANTZ ) THEN MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) LWOPT = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) ELSE LWOPT = LWMIN END IF NEIG = 0 ELSE IF( ALLEIG .OR. VALEIG ) THEN NEIG = N ELSE IF( INDEIG ) THEN NEIG = IU - IL + 1 END IF MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) LWMIN = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN LWOPT = LWMIN * END IF * * Compute how much workspace is needed to use the * new TRD code * ANB = PJLAENV( ICTXT, 3, 'PSSYTTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NSYTRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS LWOPT = MAX( LWOPT, 5*N+NSYTRD_LWOPT ) * END IF IF( INFO.EQ.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WORK( 1 ) = ABSTOL IF( VALEIG ) THEN WORK( 2 ) = VL WORK( 3 ) = VU ELSE WORK( 2 ) = ZERO WORK( 3 ) = ZERO END IF CALL SGEBS2D( ICTXT, 'ALL', ' ', 3, 1, WORK, 3 ) ELSE CALL SGEBR2D( ICTXT, 'ALL', ' ', 3, 1, WORK, 3, 0, 0 ) END IF IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -3 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -10 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -11 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -12 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -23 ELSE IF( LIWORK.LT.LIWMIN .AND. LIWORK.NE.-1 ) THEN INFO = -25 ELSE IF( VALEIG .AND. ( ABS( WORK( 2 )-VL ).GT.FIVE*EPS* $ ABS( VL ) ) ) THEN INFO = -9 ELSE IF( VALEIG .AND. ( ABS( WORK( 3 )-VU ).GT.FIVE*EPS* $ ABS( VU ) ) ) THEN INFO = -10 ELSE IF( ABS( WORK( 1 )-ABSTOL ).GT.FIVE*EPS*ABS( ABSTOL ) ) $ THEN INFO = -13 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 800+NB_ ) END IF IF( WANTZ ) THEN IF( IROFFA.NE.IROFFZ ) THEN INFO = -19 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -19 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2100+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2100+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2100+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2100+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2100+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2100+CSRC_ ) ELSE IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF END IF IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( ALLEIG ) THEN IDUM1( 3 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 3 ) = ICHAR( 'I' ) ELSE IDUM1( 3 ) = ICHAR( 'V' ) END IF IDUM2( 3 ) = 3 IF( LQUERY ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 4 IF( WANTZ ) THEN CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 8, N, 4, N, 4, IZ, $ JZ, DESCZ, 21, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 4, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = REAL( LWOPT ) IWORK( 1 ) = LIWMIN END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYEVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( QUICKRETURN ) THEN IF( WANTZ ) THEN NZ = 0 ICLUSTR( 1 ) = 0 END IF M = 0 WORK( 1 ) = REAL( LWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF * * Scale matrix to allowable range, if necessary. * ABSTLL = ABSTOL ISCALE = 0 IF( VALEIG ) THEN VLL = VL VUU = VU ELSE VLL = ZERO VUU = ZERO END IF * ANRM = PSLANSY( 'M', UPLO, N, A, IA, JA, DESCA, WORK( INDWORK ) ) * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ANRM = ANRM*SIGMA ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM ANRM = ANRM*SIGMA END IF * IF( ISCALE.EQ.1 ) THEN CALL PSLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) IF( ABSTOL.GT.0 ) $ ABSTLL = ABSTOL*SIGMA IF( VALEIG ) THEN VLL = VL*SIGMA VUU = VU*SIGMA IF( VUU.EQ.VLL ) THEN VUU = VUU + 2*MAX( ABS( VUU )*EPS, SAFMIN ) END IF END IF END IF * * Call PSSYNTRD to reduce symmetric matrix to tridiagonal form. * LALLWORK = LLWORK * CALL PSSYNTRD( UPLO, N, A, IA, JA, DESCA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * * Copy the values of D, E to all processes * * Here PxLARED1D is used to redistribute the tridiagonal matrix. * PxLARED1D, however, doesn't yet work with arbritary matrix * distributions so we have PxELGET as a backup. * OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. RSRC_A.EQ.0 .AND. CSRC_A.EQ.0 ) $ THEN CALL PSLARED1D( N, IA, JA, DESCA, WORK( INDD ), WORK( INDD2 ), $ WORK( INDWORK ), LLWORK ) * CALL PSLARED1D( N, IA, JA, DESCA, WORK( INDE ), WORK( INDE2 ), $ WORK( INDWORK ), LLWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PSELGET( 'A', ' ', WORK( INDD2+I-1 ), A, I+IA-1, $ I+JA-1, DESCA ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PSELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA-1, $ I+JA, DESCA ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PSELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA, $ I+JA-1, DESCA ) 30 CONTINUE END IF END IF * * Call PSSTEBZ and, if eigenvectors are desired, PSSTEIN. * IF( WANTZ ) THEN ORDER = 'B' ELSE ORDER = 'E' END IF * CALL PSSTEBZ( ICTXT, RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, $ WORK( INDD2 ), WORK( INDE2+OFFSET ), M, NSPLIT, W, $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWORK ), $ LLWORK, IWORK( 1 ), ISIZESTEBZ, IINFO ) * * * IF PSSTEBZ fails, the error propogates to INFO, but * we do not propogate the eigenvalue(s) which failed because: * 1) This should never happen if the user specifies * ABSTOL = 2 * PSLAMCH( 'U' ) * 2) PSSTEIN will confirm/deny whether the eigenvalues are * close enough. * IF( IINFO.NE.0 ) THEN INFO = INFO + IERREBZ DO 40 I = 1, M IWORK( INDIBL+I-1 ) = ABS( IWORK( INDIBL+I-1 ) ) 40 CONTINUE END IF IF( WANTZ ) THEN * IF( VALEIG ) THEN * * Compute the maximum number of eigenvalues that we can * compute in the * workspace that we have, and that we can store in Z. * * Loop through the possibilities looking for the largest * NZ that we can feed to PSSTEIN and PSORMTR * * Since all processes must end up with the same value * of NZ, we first compute the minimum of LALLWORK * CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LALLWORK, 1, 1, 1, -1, $ -1, -1 ) * MAXEIGS = DESCZ( N_ ) * DO 50 NZ = MIN( MAXEIGS, M ), 0, -1 MQ0 = NUMROC( NZ, NB, 0, 0, NPCOL ) SIZESTEIN = ICEIL( NZ, NPROCS )*N + MAX( 5*N, NP0*MQ0 ) SIZEORMTR = MAX( ( NB*( NB-1 ) ) / 2, ( MQ0+NP0 )*NB ) + $ NB*NB * SIZESYEVX = MAX( SIZESTEIN, SIZEORMTR ) IF( SIZESYEVX.LE.LALLWORK ) $ GO TO 60 50 CONTINUE 60 CONTINUE ELSE NZ = M END IF NZ = MAX( NZ, 0 ) IF( NZ.NE.M ) THEN INFO = INFO + IERRSPC * DO 70 I = 1, M IFAIL( I ) = 0 70 CONTINUE * * The following code handles a rare special case * - NZ .NE. M means that we don't have enough room to store * all the vectors. * - NSPLIT .GT. 1 means that the matrix split * In this case, we cannot simply take the first NZ eigenvalues * because PSSTEBZ sorts the eigenvalues by block when * a split occurs. So, we have to make another call to * PSSTEBZ with a new upper limit - VUU. * IF( NSPLIT.GT.1 ) THEN CALL SLASRT( 'I', M, W, IINFO ) NZZ = 0 IF( NZ.GT.0 ) THEN * VUU = W( NZ ) - TEN*( EPS*ANRM+SAFMIN ) IF( VLL.GE.VUU ) THEN NZZ = 0 ELSE CALL PSSTEBZ( ICTXT, RANGE, ORDER, N, VLL, VUU, IL, $ IU, ABSTLL, WORK( INDD2 ), $ WORK( INDE2+OFFSET ), NZZ, NSPLIT, W, $ IWORK( INDIBL ), IWORK( INDISP ), $ WORK( INDWORK ), LLWORK, IWORK( 1 ), $ ISIZESTEBZ, IINFO ) END IF * IF( MOD( INFO / IERREBZ, 1 ).EQ.0 ) THEN IF( NZZ.GT.NZ .OR. IINFO.NE.0 ) THEN INFO = INFO + IERREBZ END IF END IF END IF NZ = MIN( NZ, NZZ ) * END IF END IF CALL PSSTEIN( N, WORK( INDD2 ), WORK( INDE2+OFFSET ), NZ, W, $ IWORK( INDIBL ), IWORK( INDISP ), ORFAC, Z, IZ, $ JZ, DESCZ, WORK( INDWORK ), LALLWORK, IWORK( 1 ), $ ISIZESTEIN, IFAIL, ICLUSTR, GAP, IINFO ) * IF( IINFO.GE.NZ+1 ) $ INFO = INFO + IERRCLS IF( MOD( IINFO, NZ+1 ).NE.0 ) $ INFO = INFO + IERREIN * * Z = Q * Z * * IF( NZ.GT.0 ) THEN CALL PSORMTR( 'L', UPLO, 'N', N, NZ, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) END IF * END IF * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL SSCAL( M, ONE / SIGMA, W, 1 ) END IF * WORK( 1 ) = REAL( LWOPT ) IWORK( 1 ) = LIWMIN * RETURN * * End of PSSYEVX * END scalapack-2.0.2/SRC/pssygs2.f000644 000766 000024 00000036315 10363532303 016127 0ustar00juliestaff000000 000000 * * SUBROUTINE PSSYGS2( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ) * .. * * Purpose * ======= * * PSSYGS2 reduces a real symmetric-definite generalized eigenproblem * to standard form. * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**T)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**T) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**T or L**T*sub( A )*L. * * sub( B ) must have been previously factorized as U**T*U or L*L**T by * PSPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**T)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**T); * = 2 or 3: compute U*sub( A )*U**T or L**T*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**T*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**T. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) REAL pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PSPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, HALF PARAMETER ( ONE = 1.0E+0, HALF = 0.5E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IIA, IIB, IOFFA, IOFFB, IROFFA, IROFFB, $ JJA, JJB, K, LDA, LDB, MYCOL, MYROW, NPCOL, $ NPROW REAL AKK, BKK, CT * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PXERBLA, SAXPY, SSCAL, SSYR2, STRMV, STRSV * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL LSAME, INDXG2P * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( N+ICOFFA.GT.DESCA( NB_ ) ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYGS2', -INFO ) CALL BLACS_EXIT( ICTXT ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. ( MYROW.NE.IAROW .OR. MYCOL.NE.IACOL ) ) $ RETURN * * Compute local information * LDA = DESCA( LLD_ ) LDB = DESCB( LLD_ ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIB, JJB, $ IBROW, IBCOL ) * IF( IBTYPE.EQ.1 ) THEN * IF( UPPER ) THEN * IOFFA = IIA + JJA*LDA IOFFB = IIB + JJB*LDB * * Compute inv(U')*sub( A )*inv(U) * DO 10 K = 1, N * * Update the upper triangle of * A(ia+k-1:ia+n-a,ia+k-1:ia+n-1) * AKK = A( IOFFA-LDA ) BKK = B( IOFFB-LDB ) AKK = AKK / BKK**2 A( IOFFA-LDA ) = AKK IF( K.LT.N ) THEN CALL SSCAL( N-K, ONE / BKK, A( IOFFA ), LDA ) CT = -HALF*AKK CALL SAXPY( N-K, CT, B( IOFFB ), LDB, A( IOFFA ), $ LDA ) CALL SSYR2( UPLO, N-K, -ONE, A( IOFFA ), LDA, $ B( IOFFB ), LDB, A( IOFFA+1 ), LDA ) CALL SAXPY( N-K, CT, B( IOFFB ), LDB, A( IOFFA ), $ LDA ) CALL STRSV( UPLO, 'Transpose', 'Non-unit', N-K, $ B( IOFFB+1 ), LDB, A( IOFFA ), LDA ) END IF * * A( IOFFA ) -> A( K, K+1 ) * B( IOFFB ) -> B( K, K+1 ) * IOFFA = IOFFA + LDA + 1 IOFFB = IOFFB + LDB + 1 * 10 CONTINUE * ELSE * IOFFA = IIA + 1 + ( JJA-1 )*LDA IOFFB = IIB + 1 + ( JJB-1 )*LDB * * Compute inv(L)*sub( A )*inv(L') * DO 20 K = 1, N * * Update the lower triangle of * A(ia+k-1:ia+n-a,ia+k-1:ia+n-1) * AKK = A( IOFFA-1 ) BKK = B( IOFFB-1 ) AKK = AKK / BKK**2 A( IOFFA-1 ) = AKK * IF( K.LT.N ) THEN CALL SSCAL( N-K, ONE / BKK, A( IOFFA ), 1 ) CT = -HALF*AKK CALL SAXPY( N-K, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL SSYR2( UPLO, N-K, -ONE, A( IOFFA ), 1, $ B( IOFFB ), 1, A( IOFFA+LDA ), LDA ) CALL SAXPY( N-K, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL STRSV( UPLO, 'No transpose', 'Non-unit', N-K, $ B( IOFFB+LDB ), LDB, A( IOFFA ), 1 ) END IF * * A( IOFFA ) -> A( K+1, K ) * B( IOFFB ) -> B( K+1, K ) * IOFFA = IOFFA + LDA + 1 IOFFB = IOFFB + LDB + 1 * 20 CONTINUE * END IF * ELSE * IF( UPPER ) THEN * IOFFA = IIA + ( JJA-1 )*LDA IOFFB = IIB + ( JJB-1 )*LDB * * Compute U*sub( A )*U' * DO 30 K = 1, N * * Update the upper triangle of A(ia:ia+k-1,ja:ja+k-1) * AKK = A( IOFFA+K-1 ) BKK = B( IOFFB+K-1 ) CALL STRMV( UPLO, 'No transpose', 'Non-unit', K-1, $ B( IIB+( JJB-1 )*LDB ), LDB, A( IOFFA ), 1 ) CT = HALF*AKK CALL SAXPY( K-1, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL SSYR2( UPLO, K-1, ONE, A( IOFFA ), 1, B( IOFFB ), 1, $ A( IIA+( JJA-1 )*LDA ), LDA ) CALL SAXPY( K-1, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL SSCAL( K-1, BKK, A( IOFFA ), 1 ) A( IOFFA+K-1 ) = AKK*BKK**2 * * A( IOFFA ) -> A( 1, K ) * B( IOFFB ) -> B( 1, K ) * IOFFA = IOFFA + LDA IOFFB = IOFFB + LDB * 30 CONTINUE * ELSE * IOFFA = IIA + ( JJA-1 )*LDA IOFFB = IIB + ( JJB-1 )*LDB * * Compute L'*sub( A )*L * DO 40 K = 1, N * * Update the lower triangle of A(ia:ia+k-1,ja:ja+k-1) * AKK = A( IOFFA+( K-1 )*LDA ) BKK = B( IOFFB+( K-1 )*LDB ) CALL STRMV( UPLO, 'Transpose', 'Non-unit', K-1, $ B( IIB+( JJB-1 )*LDB ), LDB, A( IOFFA ), $ LDA ) CT = HALF*AKK CALL SAXPY( K-1, CT, B( IOFFB ), LDB, A( IOFFA ), LDA ) CALL SSYR2( UPLO, K-1, ONE, A( IOFFA ), LDA, B( IOFFB ), $ LDB, A( IIA+( JJA-1 )*LDA ), LDA ) CALL SAXPY( K-1, CT, B( IOFFB ), LDB, A( IOFFA ), LDA ) CALL SSCAL( K-1, BKK, A( IOFFA ), LDA ) A( IOFFA+( K-1 )*LDA ) = AKK*BKK**2 * * A( IOFFA ) -> A( K, 1 ) * B( IOFFB ) -> B( K, 1 ) * IOFFA = IOFFA + 1 IOFFB = IOFFB + 1 * 40 CONTINUE * END IF * END IF * RETURN * * End of PSSYGS2 * END scalapack-2.0.2/SRC/pssygst.f000644 000766 000024 00000041774 10363532303 016236 0ustar00juliestaff000000 000000 * * SUBROUTINE PSSYGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, N REAL SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ) * .. * * Purpose * ======= * * PSSYGST reduces a real symmetric-definite generalized eigenproblem * to standard form. * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**T)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**T) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**T or L**T*sub( A )*L. * * sub( B ) must have been previously factorized as U**T*U or L*L**T by * PSPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**T)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**T); * = 2 or 3: compute U*sub( A )*U**T or L**T*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**T*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**T. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) REAL pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PSPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * SCALE (global output) REAL * Amount by which the eigenvalues should be scaled to * compensate for the scaling performed in this routine. * At present, SCALE is always returned as 1.0, it is * returned here to allow for future enhancement. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, HALF PARAMETER ( ONE = 1.0E+0, HALF = 0.5E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, K, KB, MYCOL, MYROW, NB, $ NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PSSYGS2, $ PSSYMM, PSSYR2K, PSTRMM, PSTRSM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL LSAME, ICEIL, INDXG2P * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * SCALE = ONE * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IB, $ JB, DESCB, 11, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYGST', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( IBTYPE.EQ.1 ) THEN IF( UPPER ) THEN * * Compute inv(U')*sub( A )*inv(U) * K = 1 NB = DESCA( NB_ ) KB = MIN( ICEIL( JA, NB )*NB, JA+N-1 ) - JA + 1 * 10 CONTINUE * * Update the upper triangle of A(ia+k-1:ia+n-1,ja+k-1:ja+n-1) * CALL PSSYGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, IB+K-1, DESCB, INFO ) IF( K+KB.LE.N ) THEN CALL PSTRSM( 'Left', UPLO, 'Transpose', 'Non-unit', KB, $ N-K-KB+1, ONE, B, IB+K-1, JB+K-1, DESCB, A, $ IA+K-1, JA+K+KB-1, DESCA ) CALL PSSYMM( 'Left', UPLO, KB, N-K-KB+1, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K-1, JB+K+KB-1, $ DESCB, ONE, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PSSYR2K( UPLO, 'Transpose', N-K-KB+1, KB, -ONE, A, $ IA+K-1, JA+K+KB-1, DESCA, B, IB+K-1, $ JB+K+KB-1, DESCB, ONE, A, IA+K+KB-1, $ JA+K+KB-1, DESCA ) CALL PSSYMM( 'Left', UPLO, KB, N-K-KB+1, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K-1, JB+K+KB-1, $ DESCB, ONE, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PSTRSM( 'Right', UPLO, 'No transpose', 'Non-unit', $ KB, N-K-KB+1, ONE, B, IB+K+KB-1, JB+K+KB-1, $ DESCB, A, IA+K-1, JA+K+KB-1, DESCA ) END IF K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 10 * ELSE * * Compute inv(L)*sub( A )*inv(L') * K = 1 NB = DESCA( MB_ ) KB = MIN( ICEIL( IA, NB )*NB, IA+N-1 ) - IA + 1 * 20 CONTINUE * * Update the lower triangle of A(ia+k-1:ia+n-1,ja+k-1:ja+n-1) * CALL PSSYGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) IF( K+KB.LE.N ) THEN CALL PSTRSM( 'Right', UPLO, 'Transpose', 'Non-unit', $ N-K-KB+1, KB, ONE, B, IB+K-1, JB+K-1, DESCB, $ A, IA+K+KB-1, JA+K-1, DESCA ) CALL PSSYMM( 'Right', UPLO, N-K-KB+1, KB, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K+KB-1, JB+K-1, $ DESCB, ONE, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PSSYR2K( UPLO, 'No transpose', N-K-KB+1, KB, -ONE, $ A, IA+K+KB-1, JA+K-1, DESCA, B, IB+K+KB-1, $ JB+K-1, DESCB, ONE, A, IA+K+KB-1, $ JA+K+KB-1, DESCA ) CALL PSSYMM( 'Right', UPLO, N-K-KB+1, KB, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K+KB-1, JB+K-1, $ DESCB, ONE, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PSTRSM( 'Left', UPLO, 'No transpose', 'Non-unit', $ N-K-KB+1, KB, ONE, B, IB+K+KB-1, JB+K+KB-1, $ DESCB, A, IA+K+KB-1, JA+K-1, DESCA ) END IF K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 20 * END IF * ELSE * IF( UPPER ) THEN * * Compute U*sub( A )*U' * K = 1 NB = DESCA( NB_ ) KB = MIN( ICEIL( JA, NB )*NB, JA+N-1 ) - JA + 1 * 30 CONTINUE * * Update the upper triangle of A(ia:ia+k+kb-2,ja:ja+k+kb-2) * CALL PSTRMM( 'Left', UPLO, 'No transpose', 'Non-unit', K-1, $ KB, ONE, B, IB, JB, DESCB, A, IA, JA+K-1, $ DESCA ) CALL PSSYMM( 'Right', UPLO, K-1, KB, HALF, A, IA+K-1, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, ONE, A, $ IA, JA+K-1, DESCA ) CALL PSSYR2K( UPLO, 'No transpose', K-1, KB, ONE, A, IA, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, ONE, A, $ IA, JA, DESCA ) CALL PSSYMM( 'Right', UPLO, K-1, KB, HALF, A, IA+K-1, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, ONE, A, $ IA, JA+K-1, DESCA ) CALL PSTRMM( 'Right', UPLO, 'Transpose', 'Non-unit', K-1, $ KB, ONE, B, IB+K-1, JB+K-1, DESCB, A, IA, $ JA+K-1, DESCA ) CALL PSSYGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) * K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 30 * ELSE * * Compute L'*sub( A )*L * K = 1 NB = DESCA( MB_ ) KB = MIN( ICEIL( IA, NB )*NB, IA+N-1 ) - IA + 1 * 40 CONTINUE * * Update the lower triangle of A(ia:ia+k+kb-2,ja:ja+k+kb-2) * CALL PSTRMM( 'Right', UPLO, 'No transpose', 'Non-unit', KB, $ K-1, ONE, B, IB, JB, DESCB, A, IA+K-1, JA, $ DESCA ) CALL PSSYMM( 'Left', UPLO, KB, K-1, HALF, A, IA+K-1, JA+K-1, $ DESCA, B, IB+K-1, JB, DESCB, ONE, A, IA+K-1, $ JA, DESCA ) CALL PSSYR2K( UPLO, 'Transpose', K-1, KB, ONE, A, IA+K-1, $ JA, DESCA, B, IB+K-1, JB, DESCB, ONE, A, IA, $ JA, DESCA ) CALL PSSYMM( 'Left', UPLO, KB, K-1, HALF, A, IA+K-1, JA+K-1, $ DESCA, B, IB+K-1, JB, DESCB, ONE, A, IA+K-1, $ JA, DESCA ) CALL PSTRMM( 'Left', UPLO, 'Transpose', 'Non-unit', KB, K-1, $ ONE, B, IB+K-1, JB+K-1, DESCB, A, IA+K-1, JA, $ DESCA ) CALL PSSYGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) * K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 40 * END IF * END IF * RETURN * * End of PSSYGST * END scalapack-2.0.2/SRC/pssygvx.f000644 000766 000024 00000104552 10377154001 016237 0ustar00juliestaff000000 000000 SUBROUTINE PSSYGVX( IBTYPE, JOBZ, RANGE, UPLO, N, A, IA, JA, $ DESCA, B, IB, JB, DESCB, VL, VU, IL, IU, $ ABSTOL, M, NZ, W, ORFAC, Z, IZ, JZ, DESCZ, $ WORK, LWORK, IWORK, LIWORK, IFAIL, ICLUSTR, $ GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IB, IBTYPE, IL, INFO, IU, IZ, JA, JB, JZ, $ LIWORK, LWORK, M, N, NZ REAL ABSTOL, ORFAC, VL, VU * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCB( * ), DESCZ( * ), $ ICLUSTR( * ), IFAIL( * ), IWORK( * ) REAL A( * ), B( * ), GAP( * ), W( * ), WORK( * ), $ Z( * ) * .. * * Purpose * * ======= * * PSSYGVX computes all the eigenvalues, and optionally, * the eigenvectors * of a real generalized SY-definite eigenproblem, of the form * sub( A )*x=(lambda)*sub( B )*x, sub( A )*sub( B )x=(lambda)*x, or * sub( B )*sub( A )*x=(lambda)*x. * Here sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ) is assumed to be * SY, and sub( B ) denoting B( IB:IB+N-1, JB:JB+N-1 ) is assumed * to be symmetric positive definite. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * JOBZ (global input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * = 'U': Upper triangles of sub( A ) and sub( B ) are stored; * = 'L': Lower triangles of sub( A ) and sub( B ) are stored. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix. * * On exit, if JOBZ = 'V', then if INFO = 0, sub( A ) contains * the distributed matrix Z of eigenvectors. The eigenvectors * are normalized as follows: * if IBTYPE = 1 or 2, Z**T*sub( B )*Z = I; * if IBTYPE = 3, Z**T*inv( sub( B ) )*Z = I. * If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') * or the lower triangle (if UPLO='L') of sub( A ), including * the diagonal, is destroyed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PSSYGVX cannot guarantee * correct error reporting. * * B (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( B ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( B ) contains * the upper triangular part of the matrix. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( B ) contains * the lower triangular part of the matrix. * * On exit, if INFO <= N, the part of sub( B ) containing the * matrix is overwritten by the triangular factor U or L from * the Cholesky factorization sub( B ) = U**T*U or * sub( B ) = L*L**T. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * DESCB( CTXT_ ) must equal DESCA( CTXT_ ) * * VL (global input) REAL * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) REAL * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * ABSTOL (global input) REAL * If JOBZ='V', setting ABSTOL to PSLAMCH( CONTEXT, 'U') yields * the most orthogonal eigenvectors. * * The absolute error tolerance for the eigenvalues. * An approximate eigenvalue is accepted as converged * when it is determined to lie in an interval [a,b] * of width less than or equal to * * ABSTOL + EPS * max( |a|,|b| ) , * * where EPS is the machine precision. If ABSTOL is less than * or equal to zero, then EPS*norm(T) will be used in its place, * where norm(T) is the 1-norm of the tridiagonal matrix * obtained by reducing A to tridiagonal form. * * Eigenvalues will be computed most accurately when ABSTOL is * set to twice the underflow threshold 2*PSLAMCH('S') not zero. * If this routine returns with ((MOD(INFO,2).NE.0) .OR. * (MOD(INFO/8,2).NE.0)), indicating that some eigenvalues or * eigenvectors did not converge, try setting ABSTOL to * 2*PSLAMCH('S'). * * See "Computing Small Singular Values of Bidiagonal Matrices * with Guaranteed High Relative Accuracy," by Demmel and * Kahan, LAPACK Working Note #3. * * See "On the correctness of Parallel Bisection in Floating * Point" by Demmel, Dhillon and Ren, LAPACK Working Note #70 * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M unless the user supplies * insufficient space and PSSYGVX is not able to detect this * before beginning computation. To get all the eigenvectors * requested, the user must supply both sufficient * space to hold the eigenvectors in Z (M .LE. DESCZ(N_)) * and sufficient workspace to compute them. (See LWORK below.) * PSSYGVX is always able to detect insufficient space without * computation unless RANGE .EQ. 'V'. * * W (global output) REAL array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * ORFAC (global input) REAL * Specifies which eigenvectors should be reorthogonalized. * Eigenvectors that correspond to eigenvalues which are within * tol=ORFAC*norm(A) of each other are to be reorthogonalized. * However, if the workspace is insufficient (see LWORK), * tol may be decreased until all eigenvectors to be * reorthogonalized can be stored in one process. * No reorthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) REAL array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. If an eigenvector * fails to converge, then that column of Z contains the latest * approximation to the eigenvector, and the index of the * eigenvector is returned in IFAIL. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * The row index in the global array Z indicating the first * row of sub( Z ). * * JZ (global input) INTEGER * The column index in the global array Z indicating the * first column of sub( Z ). * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) REAL array, * dimension max(3,LWORK) * if JOBZ='N' WORK(1) = optimal amount of workspace * required to compute eigenvalues efficiently * if JOBZ='V' WORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors * efficiently with no guarantee on orthogonality. * If RANGE='V', it is assumed that all eigenvectors * may be required. * * LWORK (local input) INTEGER * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= 5 * N + MAX( 5 * NN, NB * ( NP0 + 1 ) ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * LWORK >= 5 * N + MAX( 5*NN, NP0 * MQ0 + 2 * NB * NB ) + * ICEIL( NEIG, NPROW*NPCOL)*NN * * The computed eigenvectors may not be orthogonal if the * minimal workspace is supplied and ORFAC is too small. * If you want to guarantee orthogonality (at the cost * of potentially poor performance) you should add * the following to LWORK: * (CLUSTERSIZE-1)*N * where CLUSTERSIZE is the number of eigenvalues in the * largest cluster, where a cluster is defined as a set of * close eigenvalues: { W(K),...,W(K+CLUSTERSIZE-1) | * W(J+1) <= W(J) + ORFAC*2*norm(A) } * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = DESCZ( MB_ ) = * DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * When LWORK is too small: * If LWORK is too small to guarantee orthogonality, * PSSYGVX attempts to maintain orthogonality in * the clusters with the smallest * spacing between the eigenvalues. * If LWORK is too small to compute all the eigenvectors * requested, no computation is performed and INFO=-23 * is returned. Note that when RANGE='V', PSSYGVX does * not know how many eigenvectors are requested until * the eigenvalues are computed. Therefore, when RANGE='V' * and as long as LWORK is large enough to allow PSSYGVX to * compute the eigenvalues, PSSYGVX will compute the * eigenvalues and as many eigenvectors as it can. * * Relationship between workspace, orthogonality & performance: * Greater performance can be achieved if adequate workspace * is provided. On the other hand, in some situations, * performance can decrease as the workspace provided * increases above the workspace amount shown below: * * For optimal performance, greater workspace may be * needed, i.e. * LWORK >= MAX( LWORK, 5 * N + NSYTRD_LWOPT, * NSYGST_LWOPT ) * Where: * LWORK, as defined previously, depends upon the number * of eigenvectors requested, and * NSYTRD_LWOPT = N + 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 3 ) * NPS * NSYGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB * * ANB = PJLAENV( DESCA( CTXT_), 3, 'PSSYTTRD', 'L', * 0, 0, 0, 0) * SQNPC = INT( SQRT( DBLE( NPROW * NPCOL ) ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * NB = DESCA( MB_ ) * NP0 = NUMROC( N, NB, 0, 0, NPROW ) * NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by * calling the subroutine BLACS_GRIDINFO. * * For large N, no extra workspace is needed, however the * biggest boost in performance comes for small N, so it * is wise to provide the extra workspace (typically less * than a Megabyte per process). * * If CLUSTERSIZE >= N/SQRT(NPROW*NPCOL), then providing * enough space to compute all the eigenvectors * orthogonally will cause serious degradation in * performance. In the limit (i.e. CLUSTERSIZE = N-1) * PSSTEIN will perform no better than SSTEIN on 1 processor. * For CLUSTERSIZE = N/SQRT(NPROW*NPCOL) reorthogonalizing * all eigenvectors will increase the total execution time * by a factor of 2 or more. * For CLUSTERSIZE > N/SQRT(NPROW*NPCOL) execution time will * grow as the square of the cluster size, all other factors * remaining equal and assuming enough workspace. Less * workspace means less reorthogonalization but faster * execution. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance on all work arrays. * Each of these values is returned in the first entry of the * corresponding work array, and no error message is issued by * PXERBLA. * * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * LIWORK >= 6 * NNP * Where: * NNP = MAX( N, NPROW*NPCOL + 1, 4 ) * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (output) INTEGER array, dimension (N) * IFAIL provides additional information when INFO .NE. 0 * If (MOD(INFO/16,2).NE.0) then IFAIL(1) indicates the order of * the smallest minor which is not positive definite. * If (MOD(INFO,2).NE.0) on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * * If neither of the above error conditions hold and JOBZ = 'V', * then the first M elements of IFAIL are set to zero. * * ICLUSTR (global output) integer array, dimension (2*NPROW*NPCOL) * This array contains indices of eigenvectors corresponding to * a cluster of eigenvalues that could not be reorthogonalized * due to insufficient workspace (see LWORK, ORFAC and INFO). * Eigenvectors corresponding to clusters of eigenvalues indexed * ICLUSTR(2*I-1) to ICLUSTR(2*I), could not be * reorthogonalized due to lack of workspace. Hence the * eigenvectors corresponding to these clusters may not be * orthogonal. ICLUSTR() is a zero terminated array. * (ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0) if and only if * K is the number of clusters * ICLUSTR is not referenced if JOBZ = 'N' * * GAP (global output) REAL array, * dimension (NPROW*NPCOL) * This array contains the gap between eigenvalues whose * eigenvectors could not be reorthogonalized. The output * values in this array correspond to the clusters indicated * by the array ICLUSTR. As a result, the dot product between * eigenvectors correspoding to the I^th cluster may be as high * as ( C * n ) / GAP(I) where C is a small constant. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: if (MOD(INFO,2).NE.0), then one or more eigenvectors * failed to converge. Their indices are stored * in IFAIL. Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding * to one or more clusters of eigenvalues could not be * reorthogonalized because of insufficient workspace. * The indices of the clusters are stored in the array * ICLUSTR. * if (MOD(INFO/4,2).NE.0), then space limit prevented * PSSYGVX from computing all of the eigenvectors * between VL and VU. The number of eigenvectors * computed is returned in NZ. * if (MOD(INFO/8,2).NE.0), then PSSTEBZ failed to * compute eigenvalues. * Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/16,2).NE.0), then B was not positive * definite. IFAIL(1) indicates the order of * the smallest minor which is not positive definite. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*), C(IC:IC+M-1,JC:JC+N-1), * and B( IB:IB+N-1, JB:JB+N-1 ) must verify some alignment properties, * namely the following expressions should be true: * * DESCA(MB_) = DESCA(NB_) * IA = IB = IZ * JA = IB = JZ * DESCA(M_) = DESCB(M_) =DESCZ(M_) * DESCA(N_) = DESCB(N_)= DESCZ(N_) * DESCA(MB_) = DESCB(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCB(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCB(RSRC_) = DESCZ(RSRC_) * DESCA(CSRC_) = DESCB(CSRC_) = DESCZ(CSRC_) * MOD( IA-1, DESCA( MB_ ) ) = 0 * MOD( JA-1, DESCA( NB_ ) ) = 0 * MOD( IB-1, DESCB( MB_ ) ) = 0 * MOD( JB-1, DESCB( NB_ ) ) = 0 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) REAL FIVE, ZERO PARAMETER ( FIVE = 5.0E+0, ZERO = 0.0E+0 ) INTEGER IERRNPD PARAMETER ( IERRNPD = 16 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LQUERY, UPPER, VALEIG, WANTZ CHARACTER TRANS INTEGER ANB, IACOL, IAROW, IBCOL, IBROW, ICOFFA, $ ICOFFB, ICTXT, IROFFA, IROFFB, LIWMIN, LWMIN, $ LWOPT, MQ0, MYCOL, MYROW, NB, NEIG, NN, NP0, $ NPCOL, NPROW, NPS, NQ0, NSYGST_LWOPT, $ NSYTRD_LWOPT, SQNPC REAL EPS, SCALE * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV REAL PSLAMCH EXTERNAL LSAME, ICEIL, INDXG2P, NUMROC, PJLAENV, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PCHK2MAT, $ PSPOTRF, PSSYEVX, PSSYNGST, PSTRMM, PSTRSM, $ PXERBLA, SGEBR2D, SGEBS2D, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, ICHAR, INT, MAX, MIN, MOD, REAL, $ SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 900+CTXT_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2600+CTXT_ ) ELSE * * Get machine constants. * EPS = PSLAMCH( DESCA( CTXT_ ), 'Precision' ) * WANTZ = LSAME( JOBZ, 'V' ) UPPER = LSAME( UPLO, 'U' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, N, 4, IB, JB, DESCB, 13, INFO ) CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 26, INFO ) IF( INFO.EQ.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WORK( 1 ) = ABSTOL IF( VALEIG ) THEN WORK( 2 ) = VL WORK( 3 ) = VU ELSE WORK( 2 ) = ZERO WORK( 3 ) = ZERO END IF CALL SGEBS2D( DESCA( CTXT_ ), 'ALL', ' ', 3, 1, WORK, 3 ) ELSE CALL SGEBR2D( DESCA( CTXT_ ), 'ALL', ' ', 3, 1, WORK, 3, $ 0, 0 ) END IF IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) * * Compute the total amount of space needed * LQUERY = .FALSE. IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) $ LQUERY = .TRUE. * LIWMIN = 6*MAX( N, ( NPROW*NPCOL )+1, 4 ) * NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * IF( ( .NOT.WANTZ ) .OR. ( VALEIG .AND. ( .NOT.LQUERY ) ) ) $ THEN LWMIN = 5*N + MAX( 5*NN, NB*( NP0+1 ) ) IF( WANTZ ) THEN MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) LWOPT = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) ELSE LWOPT = LWMIN END IF NEIG = 0 ELSE IF( ALLEIG .OR. VALEIG ) THEN NEIG = N ELSE IF( INDEIG ) THEN NEIG = IU - IL + 1 END IF MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) LWMIN = 5*N + MAX( 5*NN, NP0*MQ0+2*NB*NB ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN LWOPT = LWMIN * END IF * * Compute how much workspace is needed to use the * new TRD and GST algorithms * ANB = PJLAENV( ICTXT, 3, 'PSSYTTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NSYTRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS NB = DESCA( MB_ ) NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) NSYGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB LWOPT = MAX( LWOPT, N+NSYTRD_LWOPT, NSYGST_LWOPT ) * * Version 1.0 Limitations * IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -2 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -3 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -5 ELSE IF( IROFFA.NE.0 ) THEN INFO = -7 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -8 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 900+NB_ ) ELSE IF( DESCA( M_ ).NE.DESCB( M_ ) ) THEN INFO = -( 1300+M_ ) ELSE IF( DESCA( N_ ).NE.DESCB( N_ ) ) THEN INFO = -( 1300+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1300+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCB( NB_ ) ) THEN INFO = -( 1300+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCB( RSRC_ ) ) THEN INFO = -( 1300+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCB( CSRC_ ) ) THEN INFO = -( 1300+CSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2200+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2200+CSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2200+CTXT_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -11 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -12 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -15 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -16 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -17 ELSE IF( VALEIG .AND. ( ABS( WORK( 2 )-VL ).GT.FIVE*EPS* $ ABS( VL ) ) ) THEN INFO = -14 ELSE IF( VALEIG .AND. ( ABS( WORK( 3 )-VU ).GT.FIVE*EPS* $ ABS( VU ) ) ) THEN INFO = -15 ELSE IF( ABS( WORK( 1 )-ABSTOL ).GT.FIVE*EPS*ABS( ABSTOL ) ) $ THEN INFO = -18 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -28 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -30 END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( WANTZ ) THEN IDUM1( 2 ) = ICHAR( 'V' ) ELSE IDUM1( 2 ) = ICHAR( 'N' ) END IF IDUM2( 2 ) = 2 IF( UPPER ) THEN IDUM1( 3 ) = ICHAR( 'U' ) ELSE IDUM1( 3 ) = ICHAR( 'L' ) END IF IDUM2( 3 ) = 3 IF( ALLEIG ) THEN IDUM1( 4 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 4 ) = ICHAR( 'I' ) ELSE IDUM1( 4 ) = ICHAR( 'V' ) END IF IDUM2( 4 ) = 4 IF( LQUERY ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 5 CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 9, N, 4, N, 4, IB, $ JB, DESCB, 13, 5, IDUM1, IDUM2, INFO ) CALL PCHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 26, 0, IDUM1, IDUM2, $ INFO ) END IF * IWORK( 1 ) = LIWMIN WORK( 1 ) = REAL( LWOPT ) * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYGVX ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Form a Cholesky factorization of sub( B ). * CALL PSPOTRF( UPLO, N, B, IB, JB, DESCB, INFO ) IF( INFO.NE.0 ) THEN IWORK( 1 ) = LIWMIN WORK( 1 ) = REAL( LWOPT ) IFAIL( 1 ) = INFO INFO = IERRNPD RETURN END IF * * Transform problem to standard eigenvalue problem and solve. * CALL PSSYNGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, WORK, LWORK, INFO ) CALL PSSYEVX( JOBZ, RANGE, UPLO, N, A, IA, JA, DESCA, VL, VU, IL, $ IU, ABSTOL, M, NZ, W, ORFAC, Z, IZ, JZ, DESCZ, WORK, $ LWORK, IWORK, LIWORK, IFAIL, ICLUSTR, GAP, INFO ) * IF( WANTZ ) THEN * * Backtransform eigenvectors to the original problem. * NEIG = M IF( IBTYPE.EQ.1 .OR. IBTYPE.EQ.2 ) THEN * * For sub( A )*x=(lambda)*sub( B )*x and * sub( A )*sub( B )*x=(lambda)*x; backtransform eigenvectors: * x = inv(L)'*y or inv(U)*y * IF( UPPER ) THEN TRANS = 'N' ELSE TRANS = 'T' END IF * CALL PSTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, $ B, IB, JB, DESCB, Z, IZ, JZ, DESCZ ) * ELSE IF( IBTYPE.EQ.3 ) THEN * * For sub( B )*sub( A )*x=(lambda)*x; * backtransform eigenvectors: x = L*y or U'*y * IF( UPPER ) THEN TRANS = 'T' ELSE TRANS = 'N' END IF * CALL PSTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, $ B, IB, JB, DESCB, Z, IZ, JZ, DESCZ ) END IF END IF * IF( SCALE.NE.ONE ) THEN CALL SSCAL( N, SCALE, W, 1 ) END IF * IWORK( 1 ) = LIWMIN WORK( 1 ) = REAL( LWOPT ) RETURN * * End of PSSYGVX * END scalapack-2.0.2/SRC/pssyngst.f000644 000766 000024 00000041416 10363532303 016405 0ustar00juliestaff000000 000000 SUBROUTINE PSSYNGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, LWORK, N REAL SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ), WORK( * ) * .. * * Purpose * * ======= * * PSSYNGST reduces a complex Hermitian-definite generalized * eigenproblem to standard form. * * PSSYNGST performs the same function as PSHEGST, but is based on * rank 2K updates, which are faster and more scalable than * triangular solves (the basis of PSSYNGST). * * PSSYNGST calls PSHEGST when UPLO='U', hence PSHENGST provides * improved performance only when UPLO='L', IBTYPE=1. * * PSSYNGST also calls PSHEGST when insufficient workspace is * provided, hence PSSYNGST provides improved * performance only when LWORK >= 2 * NP0 * NB + NQ0 * NB + NB * NB * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**H or L**H*sub( A )*L. * * sub( B ) must have been previously factorized as U**H*U or L*L**H by * PSPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H); * = 2 or 3: compute U*sub( A )*U**H or L**H*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**H*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**H. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) REAL pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PSPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * SCALE (global output) REAL * Amount by which the eigenvalues should be scaled to * compensate for the scaling performed in this routine. * At present, SCALE is always returned as 1.0, it is * returned here to allow for future enhancement. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP0 +1 ), 3 * NB ) * * When IBTYPE = 1 and UPLO = 'L', PSSYNGST provides improved * performance when LWORK >= 2 * NP0 * NB + NQ0 * NB + NB * NB * * where NB = MB_A = NB_A, * NP0 = NUMROC( N, NB, 0, 0, NPROW ), * NQ0 = NUMROC( N, NB, 0, 0, NPROW ), * * NUMROC ia a ScaLAPACK tool functions * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the * optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * * .. Parameters .. REAL ONEHALF, ONE, MONE PARAMETER ( ONEHALF = 0.5E0, ONE = 1.0E0, MONE = -1.0E0 ) INTEGER DLEN_, CTXT_, MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( DLEN_ = 9, CTXT_ = 2, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER I, IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, INDAA, INDG, INDR, INDRT, IROFFA, $ IROFFB, J, K, KB, LWMIN, LWOPT, MYCOL, MYROW, $ NB, NP0, NPCOL, NPK, NPROW, NQ0, POSTK * .. * .. Local Arrays .. INTEGER DESCAA( DLEN_ ), DESCG( DLEN_ ), $ DESCR( DLEN_ ), DESCRT( DLEN_ ), IDUM1( 2 ), $ IDUM2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL LSAME, INDXG2P, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ PSGEMM, PSLACPY, PSSYGST, PSSYMM, PSSYR2K, $ PSTRSM, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) SCALE = 1.0E0 * NB = DESCA( MB_ ) * * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) LWMIN = MAX( NB*( NP0+1 ), 3*NB ) IF( IBTYPE.EQ.1 .AND. .NOT.UPPER ) THEN LWOPT = 2*NP0*NB + NQ0*NB + NB*NB ELSE LWOPT = LWMIN END IF WORK( 1 ) = REAL( LWOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IB, $ JB, DESCB, 11, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYNGST', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * IF( IBTYPE.NE.1 .OR. UPPER .OR. LWORK.LT.LWOPT ) THEN CALL PSSYGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, INFO ) RETURN END IF * CALL DESCSET( DESCG, N, NB, NB, NB, IAROW, IACOL, ICTXT, NP0 ) CALL DESCSET( DESCR, N, NB, NB, NB, IAROW, IACOL, ICTXT, NP0 ) CALL DESCSET( DESCRT, NB, N, NB, NB, IAROW, IACOL, ICTXT, NB ) CALL DESCSET( DESCAA, NB, NB, NB, NB, IAROW, IACOL, ICTXT, NB ) * INDG = 1 INDR = INDG + DESCG( LLD_ )*NB INDAA = INDR + DESCR( LLD_ )*NB INDRT = INDAA + DESCAA( LLD_ )*NB * DO 30 K = 1, N, NB * KB = MIN( N-K+1, NB ) POSTK = K + KB NPK = N - POSTK + 1 * * CALL PSLACPY( 'A', N-POSTK+1, KB, B, POSTK+IB-1, K+JB-1, DESCB, $ WORK( INDG ), POSTK, 1, DESCG ) CALL PSLACPY( 'A', N-POSTK+1, KB, A, POSTK+IA-1, K+JA-1, DESCA, $ WORK( INDR ), POSTK, 1, DESCR ) CALL PSLACPY( 'A', KB, K-1, A, K+IA-1, JA, DESCA, $ WORK( INDRT ), 1, 1, DESCRT ) * CALL PSLACPY( 'L', KB, KB, A, K+IA-1, K+JA-1, DESCA, $ WORK( INDR ), K, 1, DESCR ) CALL PSTRSM( 'Right', 'L', 'N', 'N', NPK, KB, MONE, B, K+IB-1, $ K+JB-1, DESCB, WORK( INDG ), POSTK, 1, DESCG ) * CALL PSSYMM( 'Right', 'L', NPK, KB, ONEHALF, A, K+IA-1, K+JA-1, $ DESCA, WORK( INDG ), POSTK, 1, DESCG, ONE, $ WORK( INDR ), POSTK, 1, DESCR ) * CALL PSSYR2K( 'Lower', 'No T', NPK, KB, ONE, WORK( INDG ), $ POSTK, 1, DESCG, WORK( INDR ), POSTK, 1, DESCR, $ ONE, A, POSTK+IA-1, POSTK+JA-1, DESCA ) * CALL PSGEMM( 'No T', 'No Conj', NPK, K-1, KB, ONE, $ WORK( INDG ), POSTK, 1, DESCG, WORK( INDRT ), 1, $ 1, DESCRT, ONE, A, POSTK+IA-1, JA, DESCA ) * CALL PSSYMM( 'Right', 'L', NPK, KB, ONE, WORK( INDR ), K, 1, $ DESCR, WORK( INDG ), POSTK, 1, DESCG, ONE, A, $ POSTK+IA-1, K+JA-1, DESCA ) * CALL PSTRSM( 'Left', 'Lower', 'No Conj', 'Non-unit', KB, K-1, $ ONE, B, K+IB-1, K+JB-1, DESCB, A, K+IA-1, JA, $ DESCA ) * CALL PSLACPY( 'L', KB, KB, A, K+IA-1, K+JA-1, DESCA, $ WORK( INDAA ), 1, 1, DESCAA ) * IF( MYROW.EQ.DESCAA( RSRC_ ) .AND. MYCOL.EQ.DESCAA( CSRC_ ) ) $ THEN DO 20 I = 1, KB DO 10 J = 1, I WORK( INDAA+J-1+( I-1 )*DESCAA( LLD_ ) ) $ = WORK( INDAA+I-1+( J-1 )*DESCAA( LLD_ ) ) 10 CONTINUE 20 CONTINUE END IF * CALL PSTRSM( 'Left', 'Lower', 'No Conj', 'Non-unit', KB, KB, $ ONE, B, K+IB-1, K+JB-1, DESCB, WORK( INDAA ), 1, $ 1, DESCAA ) * CALL PSTRSM( 'Right', 'Lower', 'Conj', 'Non-unit', KB, KB, ONE, $ B, K+IB-1, K+JB-1, DESCB, WORK( INDAA ), 1, 1, $ DESCAA ) * CALL PSLACPY( 'L', KB, KB, WORK( INDAA ), 1, 1, DESCAA, A, $ K+IA-1, K+JA-1, DESCA ) * CALL PSTRSM( 'Right', 'Lower', 'Conj', 'Non-unit', NPK, KB, $ ONE, B, K+IB-1, K+JB-1, DESCB, A, POSTK+IA-1, $ K+JA-1, DESCA ) * DESCR( CSRC_ ) = MOD( DESCR( CSRC_ )+1, NPCOL ) DESCG( CSRC_ ) = MOD( DESCG( CSRC_ )+1, NPCOL ) DESCRT( RSRC_ ) = MOD( DESCRT( RSRC_ )+1, NPROW ) DESCAA( RSRC_ ) = MOD( DESCAA( RSRC_ )+1, NPROW ) DESCAA( CSRC_ ) = MOD( DESCAA( CSRC_ )+1, NPCOL ) 30 CONTINUE * WORK( 1 ) = REAL( LWOPT ) * RETURN END scalapack-2.0.2/SRC/pssyntrd.f000644 000766 000024 00000050433 10363532303 016400 0ustar00juliestaff000000 000000 SUBROUTINE PSSYNTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * Bugs * ==== * * * Support for UPLO='U' is limited to calling the old, slow, PSSYTRD * code. * * * Purpose * ======= * * PSSYNTRD is a prototype version of PSSYTRD which uses tailored * codes (either the serial, SSYTRD, or the parallel code, PSSYTTRD) * when the workspace provided by the user is adequate. * * * PSSYNTRD reduces a real symmetric matrix sub( A ) to symmetric * tridiagonal form T by an orthogonal similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Features * ======== * * PSSYNTRD is faster than PSSYTRD on almost all matrices, * particularly small ones (i.e. N < 500 * sqrt(P) ), provided that * enough workspace is available to use the tailored codes. * * The tailored codes provide performance that is essentially * independent of the input data layout. * * The tailored codes place no restrictions on IA, JA, MB or NB. * At present, IA, JA, MB and NB are restricted to those values allowed * by PSSYTRD to keep the interface simple. These restrictions are * documented below. (Search for "restrictions".) * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * symmetric distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the orthogonal matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK( 1 ) returns the optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP +1 ), 3 * NB ) * * For optimal performance, greater workspace is needed, i.e. * LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS + 4 ) * NPS * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PSSYTTRD', 'L', 0, 0, 0, 0 ) * SQNPC = INT( SQRT( REAL( NPROW * NPCOL ) ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA .AND. IROFFA.EQ.0 ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER COLCTOP, ROWCTOP INTEGER ANB, CTXTB, I, IACOL, IAROW, ICOFFA, ICTXT, $ IINFO, INDB, INDD, INDE, INDTAU, INDW, IPW, $ IROFFA, J, JB, JX, K, KK, LLWORK, LWMIN, MINSZ, $ MYCOL, MYCOLB, MYROW, MYROWB, NB, NP, NPCOL, $ NPCOLB, NPROW, NPROWB, NPS, NQ, ONEPMIN, SQNPC, $ TTLWMIN * .. * .. Local Arrays .. INTEGER DESCB( DLEN_ ), DESCW( DLEN_ ), IDUM1( 2 ), $ IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, CHK1MAT, DESCSET, IGAMN2D, $ PCHK1MAT, PSELSET, PSLAMR1D, PSLATRD, PSSYR2K, $ PSSYTD2, PSSYTTRD, PSTRMR2D, PB_TOPGET, $ PB_TOPSET, PXERBLA, SSYTRD * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC, PJLAENV EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC, PJLAENV * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, INT, MAX, MIN, MOD, REAL, SQRT * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = MAX( 1, NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) LWMIN = MAX( ( NP+1 )*NB, 3*NB ) ANB = PJLAENV( ICTXT, 3, 'PSSYTTRD', 'L', 0, 0, 0, 0 ) MINSZ = PJLAENV( ICTXT, 5, 'PSSYTTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( REAL( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) TTLWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS * WORK( 1 ) = REAL( TTLWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 * * The following two restrictions are not necessary provided * that either of the tailored codes are used. * ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 600+NB_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 11 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYNTRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * ONEPMIN = N*N + 3*N + 1 LLWORK = LWORK CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LLWORK, 1, 1, -1, -1, -1, $ -1 ) * * * * Use the serial, LAPACK, code: STRD on small matrices if we * we have enough space. * NPROWB = 0 IF( ( N.LT.MINSZ .OR. SQNPC.EQ.1 ) .AND. LLWORK.GE.ONEPMIN .AND. $ .NOT.UPPER ) THEN NPROWB = 1 NPS = N ELSE IF( LLWORK.GE.TTLWMIN .AND. .NOT.UPPER ) THEN NPROWB = SQNPC END IF END IF * IF( NPROWB.GE.1 ) THEN NPCOLB = NPROWB SQNPC = NPROWB INDB = 1 INDD = INDB + NPS*NPS INDE = INDD + NPS INDTAU = INDE + NPS INDW = INDTAU + NPS LLWORK = LLWORK - INDW + 1 * CALL BLACS_GET( ICTXT, 10, CTXTB ) CALL BLACS_GRIDINIT( CTXTB, 'Row major', SQNPC, SQNPC ) CALL BLACS_GRIDINFO( CTXTB, NPROWB, NPCOLB, MYROWB, MYCOLB ) CALL DESCSET( DESCB, N, N, 1, 1, 0, 0, CTXTB, NPS ) * CALL PSTRMR2D( UPLO, 'N', N, N, A, IA, JA, DESCA, WORK( INDB ), $ 1, 1, DESCB, ICTXT ) * * * Only those processors in context CTXTB are needed for a while * IF( NPROWB.GT.0 ) THEN * IF( NPROWB.EQ.1 ) THEN CALL SSYTRD( UPLO, N, WORK( INDB ), NPS, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDW ), $ LLWORK, INFO ) ELSE * CALL PSSYTTRD( 'L', N, WORK( INDB ), 1, 1, DESCB, $ WORK( INDD ), WORK( INDE ), $ WORK( INDTAU ), WORK( INDW ), LLWORK, $ INFO ) * END IF END IF * * All processors participate in moving the data back to the * way that PSSYNTRD expects it. * CALL PSLAMR1D( N-1, WORK( INDE ), 1, 1, DESCB, E, 1, JA, $ DESCA ) * CALL PSLAMR1D( N, WORK( INDD ), 1, 1, DESCB, D, 1, JA, DESCA ) * CALL PSLAMR1D( N, WORK( INDTAU ), 1, 1, DESCB, TAU, 1, JA, $ DESCA ) * CALL PSTRMR2D( UPLO, 'N', N, N, WORK( INDB ), 1, 1, DESCB, A, $ IA, JA, DESCA, ICTXT ) * IF( MYROWB.GE.0 ) $ CALL BLACS_GRIDEXIT( CTXTB ) * ELSE * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPW = NP*NB + 1 * IF( UPPER ) THEN * * Reduce the upper triangle of sub( A ). * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, $ INDXG2P( JA+N-KK, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ), ICTXT, MAX( 1, NP ) ) * DO 10 K = N - KK + 1, NB + 1, -NB JB = MIN( N-K+1, NB ) I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part of * the matrix * CALL PSLATRD( UPLO, K+JB-1, JB, A, IA, JA, DESCA, D, E, $ TAU, WORK, 1, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(IA:I-1,JA:J-1), using an * update of the form: * A(IA:I-1,JA:J-1) := A(IA:I-1,JA:J-1) - V*W' - W*V' * CALL PSSYR2K( UPLO, 'No transpose', K-1, JB, -ONE, A, IA, $ J, DESCA, WORK, 1, 1, DESCW, ONE, A, IA, $ JA, DESCA ) * * Copy last superdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J, NB, 0, IACOL, NPCOL ), NQ ) CALL PSELSET( A, I-1, J, DESCA, E( JX ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+NPCOL-1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PSSYTD2( UPLO, MIN( N, NB ), A, IA, JA, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) * ELSE * * Reduce the lower triangle of sub( A ) * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) * DO 20 K = 1, N - NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part * of the matrix * CALL PSLATRD( UPLO, N-K+1, NB, A, I, J, DESCA, D, E, TAU, $ WORK, K, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(I+NB:IA+N-1,I+NB:IA+N-1), * using an update of the form: A(I+NB:IA+N-1,I+NB:IA+N-1) := * A(I+NB:IA+N-1,I+NB:IA+N-1) - V*W' - W*V' * CALL PSSYR2K( UPLO, 'No transpose', N-K-NB+1, NB, -ONE, $ A, I+NB, J, DESCA, WORK, K+NB, 1, DESCW, $ ONE, A, I+NB, J+NB, DESCA ) * * Copy last subdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J+NB-1, NB, 0, IACOL, NPCOL ), NQ ) CALL PSELSET( A, I+NB, J+NB-1, DESCA, E( JX ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * 20 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PSSYTD2( UPLO, KK, A, IA+K-1, JA+K-1, DESCA, D, E, TAU, $ WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * END IF * WORK( 1 ) = REAL( TTLWMIN ) * RETURN * * End of PSSYNTRD * END scalapack-2.0.2/SRC/pssytd2.f000644 000766 000024 00000042136 10363532303 016123 0ustar00juliestaff000000 000000 SUBROUTINE PSSYTD2( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSSYTD2 reduces a real symmetric matrix sub( A ) to symmetric * tridiagonal form T by an orthogonal similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * symmetric distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the orthogonal matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*N. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL HALF, ONE, ZERO PARAMETER ( HALF = 0.5E+0, ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER IACOL, IAROW, ICOFFA, ICTXT, II, IK, IROFFA, J, $ JJ, JK, JN, LDA, LWMIN, MYCOL, MYROW, NPCOL, $ NPROW REAL ALPHA, TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PXERBLA, SAXPY, SGEBR2D, SGEBS2D, $ SLARFG, SSYMV, SSYR2 * .. * .. External Functions .. LOGICAL LSAME REAL SDOT EXTERNAL LSAME, SDOT * .. * .. Intrinsic Functions .. INTRINSIC REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) LWMIN = 3 * N * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYTD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Compute local information * LDA = DESCA( LLD_ ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) * IF( UPPER ) THEN * * Process(IAROW, IACOL) owns block to be reduced * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Reduce the upper triangle of sub( A ) * DO 10 J = N-1, 1, -1 IK = II + J - 1 JK = JJ + J - 1 * * Generate elementary reflector H(i) = I - tau * v * v' * to annihilate A(IA:IA+J-1,JA:JA+J-1) * CALL SLARFG( J, A( IK+JK*LDA ), A( II+JK*LDA ), 1, $ TAUI ) E( JK+1 ) = A( IK+JK*LDA ) * IF( TAUI.NE.ZERO ) THEN * * Apply H(i) from both sides to * A(IA:IA+J-1,JA:JA+J-1) * A( IK+JK*LDA ) = ONE * * Compute x := tau * A * v storing x in TAU(1:i) * CALL SSYMV( UPLO, J, TAUI, A( II+(JJ-1)*LDA ), $ LDA, A( II+JK*LDA ), 1, ZERO, $ TAU( JJ ), 1 ) * * Compute w := x - 1/2 * tau * (x'*v) * v * ALPHA = -HALF*TAUI*SDOT( J, TAU( JJ ), 1, $ A( II+JK*LDA ), 1 ) CALL SAXPY( J, ALPHA, A( II+JK*LDA ), 1, $ TAU( JJ ), 1 ) * * Apply the transformation as a rank-2 update: * A := A - v * w' - w * v' * CALL SSYR2( UPLO, J, -ONE, A( II+JK*LDA ), 1, $ TAU( JJ ), 1, A( II+(JJ-1)*LDA ), $ LDA ) A( IK+JK*LDA ) = E( JK+1 ) END IF * * Copy D, E, TAU to broadcast them columnwise. * D( JK+1 ) = A( IK+1+JK*LDA ) WORK( J+1 ) = D( JK+1 ) WORK( N+J+1 ) = E( JK+1 ) TAU( JK+1 ) = TAUI WORK( 2*N+J+1 ) = TAU( JK+1 ) * 10 CONTINUE D( JJ ) = A( II+(JJ-1)*LDA ) WORK( 1 ) = D( JJ ) WORK( N+1 ) = ZERO WORK( 2*N+1 ) = ZERO * CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1 ) * ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1, $ IAROW, IACOL ) DO 20 J = 2, N JN = JJ + J - 1 D( JN ) = WORK( J ) E( JN ) = WORK( N+J ) TAU( JN ) = WORK( 2*N+J ) 20 CONTINUE D( JJ ) = WORK( 1 ) END IF END IF * ELSE * * Process (IAROW, IACOL) owns block to be factorized * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Reduce the lower triangle of sub( A ) * DO 30 J = 1, N - 1 IK = II + J - 1 JK = JJ + J - 1 * * Generate elementary reflector H(i) = I - tau * v * v' * to annihilate A(IA+J-JA+2:IA+N-1,JA+J-1) * CALL SLARFG( N-J, A( IK+1+(JK-1)*LDA ), $ A( IK+2+(JK-1)*LDA ), 1, TAUI ) E( JK ) = A( IK+1+(JK-1)*LDA ) * IF( TAUI.NE.ZERO ) THEN * * Apply H(i) from both sides to * A(IA+J-JA+1:IA+N-1,JA+J+1:JA+N-1) * A( IK+1+(JK-1)*LDA ) = ONE * * Compute x := tau * A * v storing y in TAU(i:n-1) * CALL SSYMV( UPLO, N-J, TAUI, A( IK+1+JK*LDA ), $ LDA, A( IK+1+(JK-1)*LDA ), 1, $ ZERO, TAU( JK ), 1 ) * * Compute w := x - 1/2 * tau * (x'*v) * v * ALPHA = -HALF*TAUI*SDOT( N-J, TAU( JK ), 1, $ A( IK+1+(JK-1)*LDA ), 1 ) CALL SAXPY( N-J, ALPHA, A( IK+1+(JK-1)*LDA ), $ 1, TAU( JK ), 1 ) * * Apply the transformation as a rank-2 update: * A := A - v * w' - w * v' * CALL SSYR2( UPLO, N-J, -ONE, $ A( IK+1+(JK-1)*LDA ), 1, $ TAU( JK ), 1, A( IK+1+JK*LDA ), $ LDA ) A( IK+1+(JK-1)*LDA ) = E( JK ) END IF * * Copy D(JK), E(JK), TAU(JK) to broadcast them * columnwise. * D( JK ) = A( IK+(JK-1)*LDA ) WORK( J ) = D( JK ) WORK( N+J ) = E( JK ) TAU( JK ) = TAUI WORK( 2*N+J ) = TAU( JK ) 30 CONTINUE JN = JJ + N - 1 D( JN ) = A( II+N-1+(JN-1)*LDA ) WORK( N ) = D( JN ) TAU( JN ) = ZERO WORK( 2*N ) = ZERO * CALL SGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK, $ 1 ) * ELSE CALL SGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK, $ 1, IAROW, IACOL ) DO 40 J = 1, N - 1 JN = JJ + J - 1 D( JN ) = WORK( J ) E( JN ) = WORK( N+J ) TAU( JN ) = WORK( 2*N+J ) 40 CONTINUE JN = JJ + N - 1 D( JN ) = WORK( N ) TAU( JN ) = ZERO END IF END IF END IF * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSSYTD2 * END scalapack-2.0.2/SRC/pssytrd.f000644 000766 000024 00000040066 10363532303 016223 0ustar00juliestaff000000 000000 SUBROUTINE PSSYTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSSYTRD reduces a real symmetric matrix sub( A ) to symmetric * tridiagonal form T by an orthogonal similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * symmetric distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the orthogonal matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the orthogonal matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP +1 ), 3 * NB ) * * where NB = MB_A = NB_A, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a real scalar, and v is a real vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA .AND. IROFFA.EQ.0 ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, IINFO, IPW, $ IROFFA, J, JB, JX, K, KK, LWMIN, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PSLATRD, PSSYR2K, PSSYTD2, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = MAX( 1, NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) LWMIN = MAX( (NP+1)*NB, 3*NB ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 11 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYTRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPW = NP * NB + 1 * IF( UPPER ) THEN * * Reduce the upper triangle of sub( A ). * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, INDXG2P( JA+N-KK, $ NB, MYCOL, DESCA( CSRC_ ), NPCOL ), ICTXT, $ MAX( 1, NP ) ) * DO 10 K = N-KK+1, NB+1, -NB JB = MIN( N-K+1, NB ) I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part of * the matrix * CALL PSLATRD( UPLO, K+JB-1, JB, A, IA, JA, DESCA, D, E, TAU, $ WORK, 1, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(IA:I-1,JA:J-1), using an * update of the form: * A(IA:I-1,JA:J-1) := A(IA:I-1,JA:J-1) - V*W' - W*V' * CALL PSSYR2K( UPLO, 'No transpose', K-1, JB, -ONE, A, IA, J, $ DESCA, WORK, 1, 1, DESCW, ONE, A, IA, JA, $ DESCA ) * * Copy last superdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J, NB, 0, IACOL, NPCOL ), NQ ) CALL PSELSET( A, I-1, J, DESCA, E( JX ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PSSYTD2( UPLO, MIN( N, NB ), A, IA, JA, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) * ELSE * * Reduce the lower triangle of sub( A ) * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) * DO 20 K = 1, N-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part * of the matrix * CALL PSLATRD( UPLO, N-K+1, NB, A, I, J, DESCA, D, E, TAU, $ WORK, K, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(I+NB:IA+N-1,I+NB:IA+N-1), * using an update of the form: A(I+NB:IA+N-1,I+NB:IA+N-1) := * A(I+NB:IA+N-1,I+NB:IA+N-1) - V*W' - W*V' * CALL PSSYR2K( UPLO, 'No transpose', N-K-NB+1, NB, -ONE, A, $ I+NB, J, DESCA, WORK, K+NB, 1, DESCW, ONE, A, $ I+NB, J+NB, DESCA ) * * Copy last subdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J+NB-1, NB, 0, IACOL, NPCOL ), NQ ) CALL PSELSET( A, I+NB, J+NB-1, DESCA, E( JX ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + 1, NPCOL ) * 20 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PSSYTD2( UPLO, KK, A, IA+K-1, JA+K-1, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSSYTRD * END scalapack-2.0.2/SRC/pssyttrd.f000644 000766 000024 00000122476 11750130340 016411 0ustar00juliestaff000000 000000 SUBROUTINE PSSYTTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), D( * ), E( * ), TAU( * ), WORK( * ) * .. * * Purpose * * ======= * * PSSYTTRD reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding * process and memory location. * * Let A be a generic term for any 2D block cyclicly distributed * array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- ----------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, * indicating the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to * distribute the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to * distribute the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the * first row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCp(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCp( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCq( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes * of its process row. * The values of LOCp() and LOCq() may be determined via a call to * the ScaLAPACK tool function, NUMROC: * LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCq(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) REAL array, dim LOCq(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) REAL array, dim LOCq(JA+N-1) * if UPLO = 'U', LOCq(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) REAL, array, dimension * LOCq(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace) REAL array, dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal workspace * * LWORK (local input) INTEGER * The dimension of the array WORK. * LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + NPS * Where: * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) * ANB = PJLAENV( DESCA( CTXT_ ), 3, 'PSSYTTRD', 'L', 0, 0, * 0, 0 ) * * NUMROC is a ScaLAPACK tool function; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of * elementary reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of * elementary reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and * vi denotes an element of the vector defining H(i). * * Data storage requirements * ========================= * * PSSYTTRD is not intended to be called directly. All users are * encourage to call PSSYTRD which will then call PSHETTRD if * appropriate. A must be in cyclic format (i.e. MB = NB = 1), * the process grid must be square ( i.e. NPROW = NPCOL ) and * only lower triangular storage is supported. * * Local variables * =============== * * PSSYTTRD uses five local arrays: * WORK ( InV ) dimension ( NP, ANB+1): array V * WORK ( InH ) dimension ( NP, ANB+1): array H * WORK ( InVT ) dimension ( NQ, ANB+1): transpose of the array V * WORK ( InHT ) dimension ( NQ, ANB+1): transpose of the array H * WORK ( InVTT ) dimension ( NQ, 1): transpose of the array VT * * Arrays V and H are replicated across all processor columns. * Arrays V^T and H^T are replicated across all processor rows. * * WORK ( InVT ), or V^T, is stored as a tall skinny * array ( NQ x ANB-1 ) for efficiency. Since only the lower * triangular portion of A is updated, Av is computed as: * tril(A) * v + v^T * tril(A,-1). This is performed as * two local triangular matrix-vector multiplications (both in * MVR2) followed by a transpose and a sum across the columns. * In the local computation, WORK( InVT ) is used to compute * tril(A) * v and WORK( InV ) is used to compute * v^T * tril(A,-1) * * The following variables are global indices into A: * INDEX: The current global row and column number. * MAXINDEX: The global row and column for the first row and * column in the trailing block of A. * LIIB, LIJB: The first row, column in * * The following variables point into the arrays A, V, H, V^T, H^T: * BINDEX =INDEX-MININDEX: The column index in V, H, V^T, H^T. * LII: local index I: The local row number for row INDEX * LIJ: local index J: The local column number for column INDEX * LIIP1: local index I+1: The local row number for row INDEX+1 * LIJP1: local index J+1: The local col number for col INDEX+1 * LTLI: lower triangular local index I: The local row for the * upper left entry in tril( A(INDEX, INDEX) ) * LTLIP1: lower triangular local index I+1: The local row for the * upper left entry in tril( A(INDEX+1, INDEX+1) ) * * Details: The distinction between LII and LTLI (and between * LIIP1 and LTLIP1) is subtle. Within the current processor * column (i.e. MYCOL .eq. CURCOL) they are the same. However, * on some processors, A( LII, LIJ ) points to an element * above the diagonal, on these processors, LTLI = LII+1. * * The following variables give the number of rows and/or columns * in various matrices: * NP: The number of local rows in A( 1:N, 1:N ) * NQ: The number of local columns in A( 1:N, 1:N ) * NPM0: The number of local rows in A( INDEX:N, INDEX:N ) * NQM0: The number of local columns in A( INDEX:N, INDEX:N ) * NPM1: The number of local rows in A( INDEX+1:N, INDEX:N ) * NQM1: The number of local columns in A( INDEX+1:N, INDEX:N ) * LTNM0: The number of local rows & columns in * tril( A( INDEX:N, INDEX:N ) ) * LTNM1: The number of local rows & columns in * tril( A( INDEX+1:N, INDEX+1:N ) ) * NOTE: LTNM0 == LTNM1 on all processors except the diagonal * processors, i.e. those where MYCOL == MYROW. * * Invariants: * NP = NPM0 + LII - 1 * NQ = NQM0 + LIJ - 1 * NP = NPM1 + LIIP1 - 1 * NQ = NQM1 + LIJP1 - 1 * NP = LTLI + LTNM0 - 1 * NP = LTLIP1 + LTNM1 - 1 * * Temporary variables. The following variables are used within * a few lines after they are set and do hold state from one loop * iteration to the next: * * The matrix A: * The matrix A does not hold the same values that it would * in an unblocked code nor the values that it would hold in * in a blocked code. * * The value of A is confusing. It is easiest to state the * difference between trueA and A at the point that MVR2 is called, * so we will start there. * * Let trueA be the value that A would * have at a given point in an unblocked code and A * be the value that A has in this code at the same point. * * At the time of the call to MVR2, * trueA = A + V' * H + H' * V * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ). * * At the bottom of the inner loop, * trueA = A + V' * H + H' * V + v' * h + h' * v * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ) and * v = V( liip1:N, BINDEX+1 ) and * h = H( liip1:N, BINDEX+1 ) * * At the top of the loop, BINDEX gets incremented, hence: * trueA = A + V' * H + H' * V + v' * h + h' * v * where H = H( MAXINDEX:N, 1:BINDEX-1 ) and * V = V( MAXINDEX:N, 1:BINDEX-1 ) and * v = V( liip1:N, BINDEX ) and * h = H( liip1:N, BINDEX ) * * * A gets updated at the bottom of the outer loop * After this update, trueA = A + v' * h + h' * v * where v = V( liip1:N, BINDEX ) and * h = H( liip1:N, BINDEX ) and BINDEX = 0 * Indeed, the previous loop invariant as stated above for the * top of the loop still holds, but with BINDEX = 0, H and V * are null matrices. * * After the current column of A is updated, * trueA( INDEX, INDEX:N ) = A( INDEX, INDEX:N ) * the rest of A is untouched. * * After the current block column of A is updated, * trueA = A + V' * H + H' * V * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ) * * This brings us back to the point at which mvr2 is called. * * * Details of the parallelization: * * We delay spreading v across to all processor columns (which * would naturally happen at the bottom of the loop) in order to * combine the spread of v( : , i-1 ) with the spread of h( : , i ) * * In order to compute h( :, i ), we must update A( :, i ) * which means that the processor column owning A( :, i ) must * have: c, tau, v( i, i ) and h( i, i ). * * The traditional * way of computing v (and the one used in pzlatrd.f and * zlatrd.f) is: * v = tau * v * c = v' * h * alpha = - tau * c / 2 * v = v + alpha * h * However, the traditional way of computing v requires that tau * be broadcast to all processors in the current column (to compute * v = tau * v) and then a sum-to-all is required (to * compute v' * h ). We use the following formula instead: * c = v' * h * v = tau * ( v - c * tau' * h / 2 ) * The above formula allows tau to be spread down in the * same call to SGSUM2D which performs the sum-to-all of c. * * The computation of v, which could be performed in any processor * column (or other procesor subsets), is performed in the * processor column that owns A( :, i+1 ) so that A( :, i+1 ) * can be updated prior to spreading v across. * * We keep the block column of A up-to-date to minimize the * work required in updating the current column of A. Updating * the block column of A is reasonably load balanced whereas * updating the current column of A is not (only the current * processor column is involved). * * In the following overview of the steps performed, M in the * margin indicates message traffic and C indicates O(n^2 nb/sqrt(p)) * or more flops per processor. * * Inner loop: * A( index:n, index ) -= ( v * ht(bindex) + h * vt( bindex) ) *M h = house( A(index:n, index) ) *M Spread v, h across *M vt = v^T; ht = h^T * A( index+1:n, index+1:maxindex ) -= * ( v * ht(index+1:maxindex) + h *vt(index+1:maxindex) ) *C v = tril(A) * h; vt = ht * tril(A,-1) *MorC v = v - H*V*h - V*H*h *M v = v + vt^T *M c = v' * h * v = tau * ( v - c * tau' * h / 2 ) *C A = A - H*V - V*H * * * * ================================================================= * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E0 ) REAL Z_ONE, Z_NEGONE, Z_ZERO PARAMETER ( Z_ONE = 1.0E0, Z_NEGONE = -1.0E0, $ Z_ZERO = 0.0E0 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * * * .. Local Scalars .. * * LOGICAL BALANCED, INTERLEAVE, TWOGEMMS, UPPER INTEGER ANB, BINDEX, CURCOL, CURROW, I, ICTXT, INDEX, $ INDEXA, INDEXINH, INDEXINV, INH, INHB, INHT, $ INHTB, INTMP, INV, INVB, INVT, INVTB, J, LDA, $ LDV, LDZG, LII, LIIB, LIIP1, LIJ, LIJB, LIJP1, $ LTLIP1, LTNM1, LWMIN, MAXINDEX, MININDEX, $ MYCOL, MYFIRSTROW, MYROW, MYSETNUM, NBZG, NP, $ NPB, NPCOL, NPM0, NPM1, NPROW, NPS, NPSET, NQ, $ NQB, NQM1, NUMROWS, NXTCOL, NXTROW, PBMAX, $ PBMIN, PBSIZE, PNB, ROWSPERPROC REAL ALPHA, BETA, C, NORM, ONEOVERBETA, SAFMAX, $ SAFMIN, TOPH, TOPNV, TOPTAU, TOPV, TTOPH, TTOPV * .. * .. Local Arrays .. * * * * INTEGER IDUM1( 1 ), IDUM2( 1 ) REAL CC( 3 ), DTMP( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PSTREECOMB, $ PXERBLA, SCOMBNRM2, SGEBR2D, SGEBS2D, SGEMM, $ SGEMV, SGERV2D, SGESD2D, SGSUM2D, SLAMOV, $ SSCAL, STRMVT * .. * .. External Functions .. * LOGICAL LSAME INTEGER ICEIL, NUMROC, PJLAENV REAL PSLAMCH, SNRM2 EXTERNAL LSAME, ICEIL, NUMROC, PJLAENV, PSLAMCH, SNRM2 * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD, REAL, SIGN, SQRT * .. * * * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * * * Further details * =============== * * At the top of the loop, v and nh have been computed but not * spread across. Hence, A is out-of-date even after the * rank 2k update. Furthermore, we compute the next v before * nh is spread across. * * I claim that if we used a sum-to-all on NV, by summing CC within * each column, that we could compute NV locally and could avoid * spreading V across. Bruce claims that sum-to-all can be made * to cost no more than sum-to-one on the Paragon. If that is * true, this would be a win. But, * the BLACS sum-to-all is just a sum-to-one followed by a broadcast, * and hence the present scheme is better for now. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * SAFMAX = SQRT( PSLAMCH( ICTXT, 'O' ) ) / N SAFMIN = SQRT( PSLAMCH( ICTXT, 'S' ) ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE * * Here we set execution options for PSSYTTRD * PNB = PJLAENV( ICTXT, 2, 'PSSYTTRD', 'L', 0, 0, 0, 0 ) ANB = PJLAENV( ICTXT, 3, 'PSSYTTRD', 'L', 0, 0, 0, 0 ) * INTERLEAVE = ( PJLAENV( ICTXT, 4, 'PSSYTTRD', 'L', 1, 0, 0, $ 0 ).EQ.1 ) TWOGEMMS = ( PJLAENV( ICTXT, 4, 'PSSYTTRD', 'L', 2, 0, 0, $ 0 ).EQ.1 ) BALANCED = ( PJLAENV( ICTXT, 4, 'PSSYTTRD', 'L', 3, 0, 0, $ 0 ).EQ.1 ) * CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) * * UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 .AND. DESCA( NB_ ).NE.1 ) $ INFO = 600 + NB_ IF( INFO.EQ.0 ) THEN * * * Here is the arithmetic: * Let maxnpq = max( np, nq, 2 * ANB ) * LDV = 4 * max( np, nq ) + 2 * LWMIN = 2 * ( ANB + 1 ) * LDV + MAX( np, 2 * ANB ) * = 2 * ( ANB + 1 ) * ( 4 * NPS + 2 ) + NPS * * This overestimates memory requirements when ANB > NP/2 * Memory requirements are lower when interleave = .false. * Hence, we could have two sets of memory requirements, * one for interleave and one for * * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS * WORK( 1 ) = REAL( LWMIN ) IF( .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IA.NE.1 ) THEN INFO = -4 ELSE IF( JA.NE.1 ) THEN INFO = -5 ELSE IF( NPROW.NE.NPCOL ) THEN INFO = -( 600+CTXT_ ) ELSE IF( DESCA( DTYPE_ ).NE.1 ) THEN INFO = -( 600+DTYPE_ ) ELSE IF( DESCA( MB_ ).NE.1 ) THEN INFO = -( 600+MB_ ) ELSE IF( DESCA( NB_ ).NE.1 ) THEN INFO = -( 600+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.0 ) THEN INFO = -( 600+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.0 ) THEN INFO = -( 600+CSRC_ ) ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 * CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYTTRD', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * * Reduce the lower triangle of sub( A ) NP = NUMROC( N, 1, MYROW, 0, NPROW ) NQ = NUMROC( N, 1, MYCOL, 0, NPCOL ) * NXTROW = 0 NXTCOL = 0 * LIIP1 = 1 LIJP1 = 1 NPM1 = NP NQM1 = NQ * LDA = DESCA( LLD_ ) ICTXT = DESCA( CTXT_ ) * * * * Miscellaneous details: * Put tau, D and E in the right places * Check signs * Place all the arrays in WORK, control their placement * in memory. * * * * Loop invariants * A(LIIP1, LIJ) points to the first element of A(I+1,J) * NPM1,NQM1 = the number of rows, cols in A( LII+1:N,LIJ+1:N ) * A(LII:N,LIJ:N) is one step out of date. * proc( CURROW, CURCOL ) owns A(LII,LIJ) * proc( NXTROW, CURCOL ) owns A(LIIP1,LIJ) * INH = 1 * IF( INTERLEAVE ) THEN * * H and V are interleaved to minimize memory movement * LDV has to be twice as large to accomodate interleaving. * In addition, LDV is doubled again to allow v, h and * toptau to be spreaad across and transposed in a * single communication operation with minimum memory * movement. * * We could reduce LDV back to 2*MAX(NPM1,NQM1) * by increasing the memory movement required in * the spread and transpose of v, h and toptau. * However, since the non-interleaved path already * provides a mear minimum memory requirement option, * we did not provide this additional path. * LDV = 4*( MAX( NPM1, NQM1 ) ) + 2 * INH = 1 * INV = INH + LDV / 2 INVT = INH + ( ANB+1 )*LDV * INHT = INVT + LDV / 2 INTMP = INVT + LDV*( ANB+1 ) * ELSE LDV = MAX( NPM1, NQM1 ) * INHT = INH + LDV*( ANB+1 ) INV = INHT + LDV*( ANB+1 ) * * The code works without this +1, but only because of a * coincidence. Without the +1, WORK(INVT) gets trashed, but * WORK(INVT) is only used once and when it is used, it is * multiplied by WORK( INH ) which is zero. Hence, the fact * that WORK(INVT) is trashed has no effect. * INVT = INV + LDV*( ANB+1 ) + 1 INTMP = INVT + LDV*( 2*ANB ) * END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSSYTTRD', -INFO ) WORK( 1 ) = REAL( LWMIN ) RETURN END IF * * * The satisfies the loop invariant: trueA = A - V * HT - H * VT, * (where V, H, VT and HT all have BINDEX+1 rows/columns) * the first ANB times through the loop. * * * * Setting either ( InH and InHT ) or InV to Z_ZERO * is adequate except in the face of NaNs. * * DO 10 I = 1, NP WORK( INH+I-1 ) = Z_ZERO WORK( INV+I-1 ) = Z_ZERO 10 CONTINUE DO 20 I = 1, NQ WORK( INHT+I-1 ) = Z_ZERO 20 CONTINUE * * * TOPNV = Z_ZERO * LTLIP1 = LIJP1 LTNM1 = NPM1 IF( MYCOL.GT.MYROW ) THEN LTLIP1 = LTLIP1 + 1 LTNM1 = LTNM1 - 1 END IF * * DO 210 MININDEX = 1, N - 1, ANB * * MAXINDEX = MIN( MININDEX+ANB-1, N ) LIJB = NUMROC( MAXINDEX, 1, MYCOL, 0, NPCOL ) + 1 LIIB = NUMROC( MAXINDEX, 1, MYROW, 0, NPROW ) + 1 * NQB = NQ - LIJB + 1 NPB = NP - LIIB + 1 INHTB = INHT + LIJB - 1 INVTB = INVT + LIJB - 1 INHB = INH + LIIB - 1 INVB = INV + LIIB - 1 * * * * DO 160 INDEX = MININDEX, MIN( MAXINDEX, N-1 ) * BINDEX = INDEX - MININDEX * CURROW = NXTROW CURCOL = NXTCOL * NXTROW = MOD( CURROW+1, NPROW ) NXTCOL = MOD( CURCOL+1, NPCOL ) * LII = LIIP1 LIJ = LIJP1 NPM0 = NPM1 * IF( MYROW.EQ.CURROW ) THEN NPM1 = NPM1 - 1 LIIP1 = LIIP1 + 1 END IF IF( MYCOL.EQ.CURCOL ) THEN NQM1 = NQM1 - 1 LIJP1 = LIJP1 + 1 LTLIP1 = LTLIP1 + 1 LTNM1 = LTNM1 - 1 END IF * * * * * V = NV, VT = NVT, H = NH, HT = NHT * * * Update the current column of A * * IF( MYCOL.EQ.CURCOL ) THEN * INDEXA = LII + ( LIJ-1 )*LDA INDEXINV = INV + LII - 1 + ( BINDEX-1 )*LDV INDEXINH = INH + LII - 1 + ( BINDEX-1 )*LDV TTOPH = WORK( INHT+LIJ-1+BINDEX*LDV ) TTOPV = TOPNV * IF( INDEX.GT.1 ) THEN DO 30 I = 0, NPM0 - 1 * A( INDEXA+I ) = A( INDEXA+I ) A( INDEXA+I ) = A( INDEXA+I ) - $ WORK( INDEXINV+LDV+I )*TTOPH - $ WORK( INDEXINH+LDV+I )*TTOPV 30 CONTINUE END IF * * END IF * * IF( MYCOL.EQ.CURCOL ) THEN * * Compute the householder vector * IF( MYROW.EQ.CURROW ) THEN DTMP( 2 ) = A( LII+( LIJ-1 )*LDA ) ELSE DTMP( 2 ) = ZERO END IF IF( MYROW.EQ.NXTROW ) THEN DTMP( 3 ) = A( LIIP1+( LIJ-1 )*LDA ) DTMP( 4 ) = ZERO ELSE DTMP( 3 ) = ZERO DTMP( 4 ) = ZERO END IF * NORM = SNRM2( NPM1, A( LIIP1+( LIJ-1 )*LDA ), 1 ) DTMP( 1 ) = NORM * * IF DTMP(5) = 1.0, NORM is too large and might cause * overflow, hence PSTREECOMB must be called. IF DTMP(5) * is zero on output, DTMP(1) can be trusted. * DTMP( 5 ) = ZERO IF( DTMP( 1 ).GE.SAFMAX .OR. DTMP( 1 ).LT.SAFMIN ) THEN DTMP( 5 ) = ONE DTMP( 1 ) = ZERO END IF * DTMP( 1 ) = DTMP( 1 )*DTMP( 1 ) CALL SGSUM2D( ICTXT, 'C', ' ', 5, 1, DTMP, 5, -1, $ CURCOL ) IF( DTMP( 5 ).EQ.ZERO ) THEN DTMP( 1 ) = SQRT( DTMP( 1 ) ) ELSE DTMP( 1 ) = NORM CALL PSTREECOMB( ICTXT, 'C', 1, DTMP, -1, MYCOL, $ SCOMBNRM2 ) END IF * NORM = DTMP( 1 ) * D( LIJ ) = DTMP( 2 ) IF( MYROW.EQ.CURROW .AND. MYCOL.EQ.CURCOL ) THEN A( LII+( LIJ-1 )*LDA ) = D( LIJ ) END IF * * ALPHA = DTMP( 3 ) * NORM = SIGN( NORM, ALPHA ) * IF( NORM.EQ.ZERO ) THEN TOPTAU = ZERO ELSE BETA = NORM + ALPHA TOPTAU = BETA / NORM ONEOVERBETA = 1.0E0 / BETA * CALL SSCAL( NPM1, ONEOVERBETA, $ A( LIIP1+( LIJ-1 )*LDA ), 1 ) END IF * IF( MYROW.EQ.NXTROW ) THEN A( LIIP1+( LIJ-1 )*LDA ) = Z_ONE END IF * TAU( LIJ ) = TOPTAU E( LIJ ) = -NORM * END IF * * * Spread v, nh, toptau across * DO 40 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+I ) = A( LIIP1+I+ $ ( LIJ-1 )*LDA ) 40 CONTINUE * IF( MYCOL.EQ.CURCOL ) THEN WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 ) = TOPTAU CALL SGEBS2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), $ NPM1+NPM1+1 ) ELSE CALL SGEBR2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), $ NPM1+NPM1+1, MYROW, CURCOL ) TOPTAU = WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 ) END IF DO 50 I = 0, NPM1 - 1 WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1- $ 1+BINDEX*LDV+NPM1+I ) 50 CONTINUE * IF( INDEX.LT.N ) THEN IF( MYROW.EQ.NXTROW .AND. MYCOL.EQ.CURCOL ) $ A( LIIP1+( LIJ-1 )*LDA ) = E( LIJ ) END IF * * Transpose v, nh * * IF( MYROW.EQ.MYCOL ) THEN DO 60 I = 0, NPM1 + NPM1 WORK( INVT+LIJP1-1+BINDEX*LDV+I ) = WORK( INV+LIIP1-1+ $ BINDEX*LDV+I ) 60 CONTINUE ELSE CALL SGESD2D( ICTXT, NPM1+NPM1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), NPM1+NPM1, $ MYCOL, MYROW ) CALL SGERV2D( ICTXT, NQM1+NQM1, 1, $ WORK( INVT+LIJP1-1+BINDEX*LDV ), NQM1+NQM1, $ MYCOL, MYROW ) END IF * DO 70 I = 0, NQM1 - 1 WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV+I ) = WORK( INVT+ $ LIJP1-1+BINDEX*LDV+NQM1+I ) 70 CONTINUE * * * Update the current block column of A * IF( INDEX.GT.1 ) THEN DO 90 J = LIJP1, LIJB - 1 DO 80 I = 0, NPM1 - 1 * A( LIIP1+I+( J-1 )*LDA ) = A( LIIP1+I+( J-1 )*LDA ) $ - WORK( INV+LIIP1-1+BINDEX*LDV+I )* $ WORK( INHT+J-1+BINDEX*LDV ) - $ WORK( INH+LIIP1-1+BINDEX*LDV+I )* $ WORK( INVT+J-1+BINDEX*LDV ) 80 CONTINUE 90 CONTINUE END IF * * * * Compute NV = A * NHT; NVT = A * NH * * These two lines are necessary because these elements * are not always involved in the calls to STRMVT * for two reasons: * 1) On diagonal processors, the call to TRMVT * involves only LTNM1-1 elements * 2) On some processes, NQM1 < LTM1 or LIIP1 < LTLIP1 * and when the results are combined across all processes, * uninitialized values may be included. WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ) = Z_ZERO WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+NQM1-1 ) = Z_ZERO * * IF( MYROW.EQ.MYCOL ) THEN IF( LTNM1.GT.1 ) THEN CALL STRMVT( 'L', LTNM1-1, $ A( LTLIP1+1+( LIJP1-1 )*LDA ), LDA, $ WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ), 1, $ WORK( INH+LTLIP1+1-1+( BINDEX+1 )*LDV ), $ 1, WORK( INV+LTLIP1+1-1+( BINDEX+1 )* $ LDV ), 1, WORK( INHT+LIJP1-1+( BINDEX+ $ 1 )*LDV ), 1 ) END IF DO 100 I = 1, LTNM1 WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV ) $ = WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV ) + $ A( LTLIP1+I-1+( LIJP1+I-1-1 )*LDA )* $ WORK( INH+LTLIP1+I-1-1+( BINDEX+1 )*LDV ) 100 CONTINUE ELSE IF( LTNM1.GT.0 ) $ CALL STRMVT( 'L', LTNM1, A( LTLIP1+( LIJP1-1 )*LDA ), $ LDA, WORK( INVT+LIJP1-1+( BINDEX+1 )* $ LDV ), 1, WORK( INH+LTLIP1-1+( BINDEX+ $ 1 )*LDV ), 1, WORK( INV+LTLIP1-1+ $ ( BINDEX+1 )*LDV ), 1, $ WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV ), $ 1 ) * END IF * * * We take advantage of the fact that: * A * sum( B ) = sum ( A * B ) for matrices A,B * * trueA = A + V * HT + H * VT * hence: (trueA)v = Av' + V * HT * v + H * VT * v * VT * v = sum_p_in_NPROW ( VTp * v ) * H * VT * v = H * sum (VTp * v) = sum ( H * VTp * v ) * * v = v + V * HT * h + H * VT * h * * * * tmp = HT * nh1 DO 110 I = 1, 2*( BINDEX+1 ) WORK( INTMP-1+I ) = 0 110 CONTINUE * IF( BALANCED ) THEN NPSET = NPROW MYSETNUM = MYROW ROWSPERPROC = ICEIL( NQB, NPSET ) MYFIRSTROW = MIN( NQB+1, 1+ROWSPERPROC*MYSETNUM ) NUMROWS = MIN( ROWSPERPROC, NQB-MYFIRSTROW+1 ) * * * tmp = HT * v * CALL SGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE, $ WORK( INHTB+MYFIRSTROW-1 ), LDV, $ WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ), $ 1, Z_ZERO, WORK( INTMP ), 1 ) * tmp2 = VT * v CALL SGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE, $ WORK( INVTB+MYFIRSTROW-1 ), LDV, $ WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ), $ 1, Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 ) * * CALL SGSUM2D( ICTXT, 'C', ' ', 2*( BINDEX+1 ), 1, $ WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 ) ELSE * tmp = HT * v * CALL SGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INHTB ), $ LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1, $ Z_ZERO, WORK( INTMP ), 1 ) * tmp2 = VT * v CALL SGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INVTB ), $ LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1, $ Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 ) * END IF * * * IF( BALANCED ) THEN MYSETNUM = MYCOL * ROWSPERPROC = ICEIL( NPB, NPSET ) MYFIRSTROW = MIN( NPB+1, 1+ROWSPERPROC*MYSETNUM ) NUMROWS = MIN( ROWSPERPROC, NPB-MYFIRSTROW+1 ) * CALL SGSUM2D( ICTXT, 'R', ' ', 2*( BINDEX+1 ), 1, $ WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 ) * * * v = v + V * tmp IF( INDEX.GT.1. ) THEN CALL SGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE, $ WORK( INVB+MYFIRSTROW-1 ), LDV, $ WORK( INTMP ), 1, Z_ONE, $ WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )* $ LDV ), 1 ) * * v = v + H * tmp2 CALL SGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE, $ WORK( INHB+MYFIRSTROW-1 ), LDV, $ WORK( INTMP+BINDEX+1 ), 1, Z_ONE, $ WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )* $ LDV ), 1 ) END IF * ELSE * v = v + V * tmp CALL SGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INVB ), $ LDV, WORK( INTMP ), 1, Z_ONE, $ WORK( INVB+( BINDEX+1 )*LDV ), 1 ) * * * v = v + H * tmp2 CALL SGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INHB ), $ LDV, WORK( INTMP+BINDEX+1 ), 1, Z_ONE, $ WORK( INVB+( BINDEX+1 )*LDV ), 1 ) * END IF * * * Transpose NV and add it back into NVT * IF( MYROW.EQ.MYCOL ) THEN DO 120 I = 0, NQM1 - 1 WORK( INTMP+I ) = WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+ $ I ) 120 CONTINUE ELSE CALL SGESD2D( ICTXT, NQM1, 1, $ WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ), $ NQM1, MYCOL, MYROW ) CALL SGERV2D( ICTXT, NPM1, 1, WORK( INTMP ), NPM1, MYCOL, $ MYROW ) * END IF DO 130 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1- $ 1+( BINDEX+1 )*LDV+I ) + WORK( INTMP+I ) 130 CONTINUE * * Sum-to-one NV rowwise (within a row) * CALL SGSUM2D( ICTXT, 'R', ' ', NPM1, 1, $ WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ), NPM1, $ MYROW, NXTCOL ) * * * Dot product c = NV * NH * Sum-to-all c within next processor column * * IF( MYCOL.EQ.NXTCOL ) THEN CC( 1 ) = Z_ZERO DO 140 I = 0, NPM1 - 1 CC( 1 ) = CC( 1 ) + WORK( INV+LIIP1-1+( BINDEX+1 )* $ LDV+I )*WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+ $ I ) 140 CONTINUE IF( MYROW.EQ.NXTROW ) THEN CC( 2 ) = WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ) CC( 3 ) = WORK( INH+LIIP1-1+( BINDEX+1 )*LDV ) ELSE CC( 2 ) = Z_ZERO CC( 3 ) = Z_ZERO END IF CALL SGSUM2D( ICTXT, 'C', ' ', 3, 1, CC, 3, -1, NXTCOL ) * TOPV = CC( 2 ) C = CC( 1 ) TOPH = CC( 3 ) * TOPNV = TOPTAU*( TOPV-C*TOPTAU / 2*TOPH ) * * * Compute V = Tau * (V - C * Tau' / 2 * H ) * * DO 150 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = TOPTAU* $ ( WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I )-C*TOPTAU / $ 2*WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+I ) ) 150 CONTINUE * END IF * * 160 CONTINUE * * * Perform the rank2k update * IF( MAXINDEX.LT.N ) THEN * DO 170 I = 0, NPM1 - 1 WORK( INTMP+I ) = WORK( INH+LIIP1-1+ANB*LDV+I ) 170 CONTINUE * * * IF( .NOT.TWOGEMMS ) THEN IF( INTERLEAVE ) THEN LDZG = LDV / 2 ELSE CALL SLAMOV( 'A', LTNM1, ANB, WORK( INHT+LIJP1-1 ), $ LDV, WORK( INVT+LIJP1-1+ANB*LDV ), LDV ) * CALL SLAMOV( 'A', LTNM1, ANB, WORK( INV+LTLIP1-1 ), $ LDV, WORK( INH+LTLIP1-1+ANB*LDV ), LDV ) LDZG = LDV END IF NBZG = ANB*2 ELSE LDZG = LDV NBZG = ANB END IF * * DO 180 PBMIN = 1, LTNM1, PNB * PBSIZE = MIN( PNB, LTNM1-PBMIN+1 ) PBMAX = MIN( LTNM1, PBMIN+PNB-1 ) CALL SGEMM( 'N', 'C', PBSIZE, PBMAX, NBZG, Z_NEGONE, $ WORK( INH+LTLIP1-1+PBMIN-1 ), LDZG, $ WORK( INVT+LIJP1-1 ), LDZG, Z_ONE, $ A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA ) IF( TWOGEMMS ) THEN CALL SGEMM( 'N', 'C', PBSIZE, PBMAX, ANB, Z_NEGONE, $ WORK( INV+LTLIP1-1+PBMIN-1 ), LDZG, $ WORK( INHT+LIJP1-1 ), LDZG, Z_ONE, $ A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA ) END IF 180 CONTINUE * * * DO 190 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+I ) = WORK( INV+LIIP1-1+ANB*LDV+I ) WORK( INH+LIIP1-1+I ) = WORK( INTMP+I ) 190 CONTINUE DO 200 I = 0, NQM1 - 1 WORK( INHT+LIJP1-1+I ) = WORK( INHT+LIJP1-1+ANB*LDV+I ) 200 CONTINUE * * END IF * * End of the update A code * 210 CONTINUE * IF( MYCOL.EQ.NXTCOL ) THEN IF( MYROW.EQ.NXTROW ) THEN * D( NQ ) = A( NP+( NQ-1 )*LDA ) * CALL SGEBS2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1 ) ELSE CALL SGEBR2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1, NXTROW, $ NXTCOL ) END IF END IF * * * * WORK( 1 ) = REAL( LWMIN ) RETURN * * End of PSSYTTRD * * END scalapack-2.0.2/SRC/pstrcon.f000644 000766 000024 00000040317 10363532303 016202 0ustar00juliestaff000000 000000 SUBROUTINE PSTRCON( NORM, UPLO, DIAG, N, A, IA, JA, DESCA, RCOND, $ WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER IA, JA, INFO, LIWORK, LWORK, N REAL RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), IWORK( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PSTRCON estimates the reciprocal of the condition number of a * triangular distributed matrix A(IA:IA+N-1,JA:JA+N-1), in either the * 1-norm or the infinity-norm. * * The norm of A(IA:IA+N-1,JA:JA+N-1) is computed and an estimate is * obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), then the reciprocal * of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies whether the 1-norm condition number or the * infinity-norm condition number is required: * = '1' or 'O': 1-norm; * = 'I': Infinity-norm. * * UPLO (global input) CHARACTER * = 'U': A(IA:IA+N-1,JA:JA+N-1) is upper triangular; * = 'L': A(IA:IA+N-1,JA:JA+N-1) is lower triangular. * * DIAG (global input) CHARACTER * = 'N': A(IA:IA+N-1,JA:JA+N-1) is non-unit triangular; * = 'U': A(IA:IA+N-1,JA:JA+N-1) is unit triangular. * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the triangular distributed * matrix A(IA:IA+N-1,JA:JA+N-1). If UPLO = 'U', the leading * N-by-N upper triangular part of this distributed matrix con- * tains the upper triangular matrix, and its strictly lower * triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of this ditributed * matrix contains the lower triangular matrix, and the strictly * upper triangular part is not referenced. If DIAG = 'U', the * diagonal elements of A(IA:IA+N-1,JA:JA+N-1) are also not * referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * RCOND (global output) REAL * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + LOCc(N+MOD(JA-1,NB_A)) * + MAX( 2, MAX( NB_A*MAX( 1, CEIL(NPROW-1,NPCOL) ), * LOCc(N+MOD(JA-1,NB_A)) + * NB_A*MAX( 1, CEIL(NPCOL-1,NPROW) ) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr(N+MOD(IA-1,MB_A)). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOUNIT, ONENRM, UPPER CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPN, IPV, IPW, $ IPX, IROFF, IV, IX, IXX, JJA, JV, JX, KASE, $ KASE1, LIWMIN, LWMIN, MYCOL, MYROW, NP, NPCOL, $ NPMOD, NPROW, NQ, NQMOD REAL AINVNM, ANORM, SCALE, SMLNUM REAL WMAX * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 5 ), $ IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG2L, $ PCHK1MAT, PSAMAX, PSLATRS, PSLACON, $ PSRSCL, PB_TOPGET, PB_TOPSET, PXERBLA, SGEBR2D, $ SGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH, PSLANTR EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH, $ PSLANTR * .. * .. Intrinsic Functions .. INTRINSIC ABS, ICHAR, MAX, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 800 + CTXT_ ) ELSE CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) ONENRM = NORM.EQ.'1' .OR. LSAME( NORM, 'O' ) NOUNIT = LSAME( DIAG, 'N' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + NQMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = REAL( LWMIN ) LIWMIN = NPMOD IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.ONENRM .AND. .NOT.LSAME( NORM, 'I' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF * IF( ONENRM ) THEN IDUM1( 1 ) = ICHAR( '1' ) ELSE IDUM1( 1 ) = ICHAR( 'I' ) END IF IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'U' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 11 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 13 CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 5, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSTRCON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * RCOND = ZERO SMLNUM = PSLAMCH( ICTXT, 'Safe minimum' )*REAL( MAX( 1, N ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPN = IPV + NP IPW = IPN + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Compute the norm of the triangular matrix A. * ANORM = PSLANTR( NORM, UPLO, DIAG, N, N, A, IA, JA, DESCA, WORK ) * * Continue only if ANORM > 0. * IF( ANORM.GT.ZERO ) THEN * * Estimate the norm of the inverse of A. * AINVNM = ZERO NORMIN = 'N' IF( ONENRM ) THEN KASE1 = 1 ELSE KASE1 = 2 END IF KASE = 0 10 CONTINUE CALL PSLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), $ IX, JX, DESCX, IWORK, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( KASE.EQ.KASE1 ) THEN * * Multiply by inv(A). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( UPLO, 'No transpose', DIAG, NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SCALE, WORK( IPN ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(A'). * DESCX( CSRC_ ) = IACOL CALL PSLATRS( UPLO, 'Transpose', DIAG, NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SCALE, WORK( IPN ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF NORMIN = 'Y' * * Multiply by 1/SCALE if doing so will not cause overflow. * IF( SCALE.NE.ONE ) THEN CALL PSAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, $ DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL SGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, $ 1 ) ELSE CALL SGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, $ 1, IAROW, MYCOL ) END IF END IF IF( SCALE.LT.ABS( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PSRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / ANORM ) / AINVNM END IF * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PSTRCON * END scalapack-2.0.2/SRC/pstrord.f000644 000766 000024 00000472517 11750130340 016216 0ustar00juliestaff000000 000000 SUBROUTINE PSTRORD( COMPQ, SELECT, PARA, N, T, IT, JT, $ DESCT, Q, IQ, JQ, DESCQ, WR, WI, M, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK computational routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER COMPQ INTEGER INFO, LIWORK, LWORK, M, N, $ IT, JT, IQ, JQ * .. * .. Array Arguments .. INTEGER SELECT( * ) INTEGER PARA( 6 ), DESCT( * ), DESCQ( * ), IWORK( * ) REAL Q( * ), T( * ), WI( * ), WORK( * ), WR( * ) * .. * * Purpose * ======= * * PSTRORD reorders the real Schur factorization of a real matrix * A = Q*T*Q**T, so that a selected cluster of eigenvalues appears * in the leading diagonal blocks of the upper quasi-triangular matrix * T, and the leading columns of Q form an orthonormal basis of the * corresponding right invariant subspace. * * T must be in Schur form (as returned by PSLAHQR), that is, block * upper triangular with 1-by-1 and 2-by-2 diagonal blocks. * * This subroutine uses a delay and accumulate procedure for performing * the off-diagonal updates (see references for details). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * * COMPQ (global input) CHARACTER*1 * = 'V': update the matrix Q of Schur vectors; * = 'N': do not update Q. * * SELECT (global input/output) INTEGER array, dimension (N) * SELECT specifies the eigenvalues in the selected cluster. To * select a real eigenvalue w(j), SELECT(j) must be set to 1. * To select a complex conjugate pair of eigenvalues * w(j) and w(j+1), corresponding to a 2-by-2 diagonal block, * either SELECT(j) or SELECT(j+1) or both must be set to 1; * a complex conjugate pair of eigenvalues must be * either both included in the cluster or both excluded. * On output, the (partial) reordering is displayed. * * PARA (global input) INTEGER*6 * Block parameters (some should be replaced by calls to * PILAENV and others by meaningful default values): * PARA(1) = maximum number of concurrent computational windows * allowed in the algorithm; * 0 < PARA(1) <= min(NPROW,NPCOL) must hold; * PARA(2) = number of eigenvalues in each window; * 0 < PARA(2) < PARA(3) must hold; * PARA(3) = window size; PARA(2) < PARA(3) < DESCT(MB_) * must hold; * PARA(4) = minimal percentage of flops required for * performing matrix-matrix multiplications instead * of pipelined orthogonal transformations; * 0 <= PARA(4) <= 100 must hold; * PARA(5) = width of block column slabs for row-wise * application of pipelined orthogonal * transformations in their factorized form; * 0 < PARA(5) <= DESCT(MB_) must hold. * PARA(6) = the maximum number of eigenvalues moved together * over a process border; in practice, this will be * approximately half of the cross border window size * 0 < PARA(6) <= PARA(2) must hold; * * N (global input) INTEGER * The order of the globally distributed matrix T. N >= 0. * * T (local input/output) REAL array, * dimension (LLD_T,LOCc(N)). * On entry, the local pieces of the global distributed * upper quasi-triangular matrix T, in Schur form. On exit, T is * overwritten by the local pieces of the reordered matrix T, * again in Schur form, with the selected eigenvalues in the * globally leading diagonal blocks. * * IT (global input) INTEGER * JT (global input) INTEGER * The row and column index in the global array T indicating the * first column of sub( T ). IT = JT = 1 must hold. * * DESCT (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the global distributed matrix T. * * Q (local input/output) REAL array, * dimension (LLD_Q,LOCc(N)). * On entry, if COMPQ = 'V', the local pieces of the global * distributed matrix Q of Schur vectors. * On exit, if COMPQ = 'V', Q has been postmultiplied by the * global orthogonal transformation matrix which reorders T; the * leading M columns of Q form an orthonormal basis for the * specified invariant subspace. * If COMPQ = 'N', Q is not referenced. * * IQ (global input) INTEGER * JQ (global input) INTEGER * The column index in the global array Q indicating the * first column of sub( Q ). IQ = JQ = 1 must hold. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the global distributed matrix Q. * * WR (global output) REAL array, dimension (N) * WI (global output) REAL array, dimension (N) * The real and imaginary parts, respectively, of the reordered * eigenvalues of T. The eigenvalues are in principle stored in * the same order as on the diagonal of T, with WR(i) = T(i,i) * and, if T(i:i+1,i:i+1) is a 2-by-2 diagonal block, WI(i) > 0 * and WI(i+1) = -WI(i). * Note also that if a complex eigenvalue is sufficiently * ill-conditioned, then its value may differ significantly * from its value before reordering. * * M (global output) INTEGER * The dimension of the specified invariant subspace. * 0 <= M <= N. * * WORK (local workspace/output) REAL array, * dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued by PXERBLA. * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The dimension of the array IWORK. * * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value. * If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*1000+j), if the i-th * argument is a scalar and had an illegal value, then INFO = -i. * > 0: here we have several possibilites * *) Reordering of T failed because some eigenvalues are too * close to separate (the problem is very ill-conditioned); * T may have been partially reordered, and WR and WI * contain the eigenvalues in the same order as in T. * On exit, INFO = {the index of T where the swap failed}. * *) A 2-by-2 block to be reordered split into two 1-by-1 * blocks and the second block failed to swap with an * adjacent block. * On exit, INFO = {the index of T where the swap failed}. * *) If INFO = N+1, there is no valid BLACS context (see the * BLACS documentation for details). * In a future release this subroutine may distinguish between * the case 1 and 2 above. * * Additional requirements * ======================= * * The following alignment requirements must hold: * (a) DESCT( MB_ ) = DESCT( NB_ ) = DESCQ( MB_ ) = DESCQ( NB_ ) * (b) DESCT( RSRC_ ) = DESCQ( RSRC_ ) * (c) DESCT( CSRC_ ) = DESCQ( CSRC_ ) * * All matrices must be blocked by a block factor larger than or * equal to two (3). This is to simplify reordering across processor * borders in the presence of 2-by-2 blocks. * * Limitations * =========== * * This algorithm cannot work on submatrices of T and Q, i.e., * IT = JT = IQ = JQ = 1 must hold. This is however no limitation * since PDLAHQR does not compute Schur forms of submatrices anyway. * * References * ========== * * [1] Z. Bai and J. W. Demmel; On swapping diagonal blocks in real * Schur form, Linear Algebra Appl., 186:73--95, 1993. Also as * LAPACK Working Note 54. * * [2] D. Kressner; Block algorithms for reordering standard and * generalized Schur forms, ACM TOMS, 32(4):521-532, 2006. * Also LAPACK Working Note 171. * * [3] R. Granat, B. Kagstrom, and D. Kressner; Parallel eigenvalue * reordering in real Schur form, Concurrency and Computations: * Practice and Experience, 21(9):1225-1250, 2009. Also as * LAPACK Working Note 192. * * Parallel execution recommendations * ================================== * * Use a square grid, if possible, for maximum performance. The block * parameters in PARA should be kept well below the data distribution * block size. In particular, see [3] for recommended settings for * these parameters. * * In general, the parallel algorithm strives to perform as much work * as possible without crossing the block borders on the main block * diagonal. * * Contributors * ============ * * Implemented by Robert Granat, Dept. of Computing Science and HPC2N, * Umea University, Sweden, March 2007, * in collaboration with Bo Kagstrom and Daniel Kressner. * Modified by Meiyue Shao, October 2011. * * Revisions * ========= * * Please send bug-reports to granat@cs.umu.se * * Keywords * ======== * * Real Schur form, eigenvalue reordering * * ===================================================================== * .. * .. Parameters .. CHARACTER TOP INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ REAL ZERO, ONE PARAMETER ( TOP = '1-Tree', $ BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, $ ZERO = 0.0, ONE = 1.0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, PAIR, SWAP, WANTQ, $ ISHH, FIRST, SKIP1CR, BORDER, LASTWAIT INTEGER NPROW, NPCOL, MYROW, MYCOL, NB, NPROCS, $ IERR, DIM1, INDX, LLDT, TRSRC, TCSRC, ILOC1, $ JLOC1, MYIERR, ICTXT, $ RSRC1, CSRC1, ILOC3, JLOC3, TRSRC3, $ TCSRC3, ILOC, JLOC, TRSRC4, TCSRC4, $ FLOPS, I, ILO, IHI, J, K, KK, KKS, $ KS, LIWMIN, LWMIN, MMULT, N1, N2, $ NCB, NDTRAF, NITRAF, NWIN, NUMWIN, PDTRAF, $ PITRAF, PDW, WINEIG, WINSIZ, LLDQ, $ RSRC, CSRC, ILILO, ILIHI, ILSEL, IRSRC, $ ICSRC, IPIW, IPW1, IPW2, IPW3, TIHI, TILO, $ LIHI, WINDOW, LILO, LSEL, BUFFER, $ NMWIN2, BUFFLEN, LROWS, LCOLS, ILOC2, JLOC2, $ WNEICR, WINDOW0, RSRC4, CSRC4, LIHI4, RSRC3, $ CSRC3, RSRC2, CSRC2, LIHIC, LIHI1, ILEN4, $ SELI4, ILEN1, DIM4, IPW4, QROWS, TROWS, $ TCOLS, IPW5, IPW6, IPW7, IPW8, JLOC4, $ EAST, WEST, ILOC4, SOUTH, NORTH, INDXS, $ ITT, JTT, ILEN, DLEN, INDXE, TRSRC1, TCSRC1, $ TRSRC2, TCSRC2, ILOS, DIR, TLIHI, TLILO, TLSEL, $ ROUND, LAST, WIN0S, WIN0E, WINE, MMAX, MMIN REAL ELEM, ELEM1, ELEM2, ELEM3, ELEM4, SN, CS, TMP, $ ELEM5 * .. * .. Local Arrays .. INTEGER IBUFF( 8 ), IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC, INDXG2P, INDXG2L EXTERNAL LSAME, NUMROC, INDXG2P, INDXG2L * .. * .. External Subroutines .. EXTERNAL PSLACPY, PXERBLA, PCHK1MAT, PCHK2MAT, $ SGEMM, SLAMOV, ILACPY, CHK1MAT, $ INFOG2L, DGSUM2D, SGESD2D, SGERV2D, SGEBS2D, $ SGEBR2D, IGSUM2D, BLACS_GRIDINFO, IGEBS2D, $ IGEBR2D, IGAMX2D, IGAMN2D, BSLAAPP, BDTREXC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT, MIN * .. * .. Local Functions .. INTEGER ICEIL * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCT( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL * * Test if grid is O.K., i.e., the context is valid. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = N+1 END IF * * Check if workspace query. * LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * Test dimensions for local sanity. * IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, INFO ) END IF * * Check the blocking sizes for alignment requirements. * IF( INFO.EQ.0 ) THEN IF( DESCT( MB_ ).NE.DESCT( NB_ ) ) INFO = -(1000*9 + MB_) END IF IF( INFO.EQ.0 ) THEN IF( DESCQ( MB_ ).NE.DESCQ( NB_ ) ) INFO = -(1000*13 + MB_) END IF IF( INFO.EQ.0 ) THEN IF( DESCT( MB_ ).NE.DESCQ( MB_ ) ) INFO = -(1000*9 + MB_) END IF * * Check the blocking sizes for minimum sizes. * IF( INFO.EQ.0 ) THEN IF( N.NE.DESCT( MB_ ) .AND. DESCT( MB_ ).LT.3 ) $ INFO = -(1000*9 + MB_) IF( N.NE.DESCQ( MB_ ) .AND. DESCQ( MB_ ).LT.3 ) $ INFO = -(1000*13 + MB_) END IF * * Check parameters in PARA. * NB = DESCT( MB_ ) IF( INFO.EQ.0 ) THEN IF( PARA(1).LT.1 .OR. PARA(1).GT.MIN(NPROW,NPCOL) ) $ INFO = -(1000 * 4 + 1) IF( PARA(2).LT.1 .OR. PARA(2).GE.PARA(3) ) $ INFO = -(1000 * 4 + 2) IF( PARA(3).LT.1 .OR. PARA(3).GT.NB ) $ INFO = -(1000 * 4 + 3) IF( PARA(4).LT.0 .OR. PARA(4).GT.100 ) $ INFO = -(1000 * 4 + 4) IF( PARA(5).LT.1 .OR. PARA(5).GT.NB ) $ INFO = -(1000 * 4 + 5) IF( PARA(6).LT.1 .OR. PARA(6).GT.PARA(2) ) $ INFO = -(1000 * 4 + 6) END IF * * Check requirements on IT, JT, IQ and JQ. * IF( INFO.EQ.0 ) THEN IF( IT.NE.1 ) INFO = -6 IF( JT.NE.IT ) INFO = -7 IF( IQ.NE.1 ) INFO = -10 IF( JQ.NE.IQ ) INFO = -11 END IF * * Test input parameters for global sanity. * IF( INFO.EQ.0 ) THEN CALL PCHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, 0, IDUM1, $ IDUM2, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL PCHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, 0, IDUM1, $ IDUM2, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL PCHK2MAT( N, 5, N, 5, IT, JT, DESCT, 9, N, 5, N, 5, $ IQ, JQ, DESCQ, 13, 0, IDUM1, IDUM2, INFO ) END IF * * Decode and test the input parameters. * IF( INFO.EQ.0 .OR. LQUERY ) THEN * WANTQ = LSAME( COMPQ, 'V' ) IF( N.LT.0 ) THEN INFO = -4 ELSE * * Extract local leading dimension. * LLDT = DESCT( LLD_ ) LLDQ = DESCQ( LLD_ ) * * Check the SELECT vector for consistency and set M to the * dimension of the specified invariant subspace. * M = 0 DO 10 K = 1, N IF( K.LT.N ) THEN CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ITT, JTT, TRSRC, TCSRC ) IF( MYROW.EQ.TRSRC .AND. MYCOL.EQ.TCSRC ) THEN ELEM = T( (JTT-1)*LLDT + ITT ) IF( ELEM.NE.ZERO ) THEN IF( SELECT(K).NE.0 .AND. $ SELECT(K+1).EQ.0 ) THEN * INFO = -2 SELECT(K+1) = 1 ELSEIF( SELECT(K).EQ.0 .AND. $ SELECT(K+1).NE.0 ) THEN * INFO = -2 SELECT(K) = 1 END IF END IF END IF END IF IF( SELECT(K).NE.0 ) M = M + 1 10 CONTINUE MMAX = M MMIN = M IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, MMAX, 1, -1, $ -1, -1, -1, -1 ) IF( NPROCS.GT.1 ) $ CALL IGAMN2D( ICTXT, 'All', TOP, 1, 1, MMIN, 1, -1, $ -1, -1, -1, -1 ) IF( MMAX.GT.MMIN ) THEN M = MMAX IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, N, 1, SELECT, N, $ -1, -1, -1, -1, -1 ) END IF * * Compute needed workspace. * N1 = M N2 = N - M * TROWS = NUMROC( N, NB, MYROW, DESCT(RSRC_), NPROW ) TCOLS = NUMROC( N, NB, MYCOL, DESCT(CSRC_), NPCOL ) LWMIN = N + 7*NB**2 + 2*TROWS*PARA( 3 ) + TCOLS*PARA( 3 ) + $ MAX( TROWS*PARA( 3 ), TCOLS*PARA( 3 ) ) LIWMIN = 5*PARA( 1 ) + PARA( 2 )*PARA( 3 ) - $ PARA( 2 ) * ( PARA( 2 ) + 1 ) / 2 * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF END IF END IF * * Global maximum on info. * IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, -1, -1, -1, $ -1, -1 ) * * Return if some argument is incorrect. * IF( INFO.NE.0 .AND. .NOT.LQUERY ) THEN M = 0 CALL PXERBLA( ICTXT, 'PSTRORD', -INFO ) RETURN ELSEIF( LQUERY ) THEN WORK( 1 ) = FLOAT(LWMIN) IWORK( 1 ) = LIWMIN RETURN END IF * * Quick return if possible. * IF( M.EQ.N .OR. M.EQ.0 ) GO TO 545 * * Set parameters. * NUMWIN = PARA( 1 ) WINEIG = MAX( PARA( 2 ), 2 ) WINSIZ = MIN( MAX( PARA( 3 ), PARA( 2 )*2 ), NB ) MMULT = PARA( 4 ) NCB = PARA( 5 ) WNEICR = PARA( 6 ) * * Insert some pointers into INTEGER workspace. * * Information about all the active windows is stored * in IWORK( 1:5*NUMWIN ). Each processor has a copy. * LILO: start position * LIHI: stop position * LSEL: number of selected eigenvalues * RSRC: processor id (row) * CSRC: processor id (col) * IWORK( IPIW+ ) contain information of orthogonal transformations. * ILILO = 1 ILIHI = ILILO + NUMWIN ILSEL = ILIHI + NUMWIN IRSRC = ILSEL + NUMWIN ICSRC = IRSRC + NUMWIN IPIW = ICSRC + NUMWIN * * Insert some pointers into REAL workspace - for now we * only need two pointers. * IPW1 = 1 IPW2 = IPW1 + NB * * Collect the selected blocks at the top-left corner of T. * * Globally: ignore eigenvalues that are already in order. * ILO is a global variable and is kept updated to be consistent * throughout the process mesh. * ILO = 0 40 CONTINUE ILO = ILO + 1 IF( ILO.LE.N ) THEN IF( SELECT(ILO).NE.0 ) GO TO 40 END IF * * Globally: start the collection at the top of the matrix. Here, * IHI is a global variable and is kept updated to be consistent * throughout the process mesh. * IHI = N * * Globally: While ( ILO <= M ) do 50 CONTINUE * IF( ILO.LE.M ) THEN * * Depending on the value of ILO, find the diagonal block index J, * such that T(1+(J-1)*NB:1+J*NB,1+(J-1)*NB:1+J*NB) contains the * first unsorted eigenvalue. Check that J does not point to a * block with only one selected eigenvalue in the last position * which belongs to a splitted 2-by-2 block. * ILOS = ILO - 1 52 CONTINUE ILOS = ILOS + 1 IF( SELECT(ILOS).EQ.0 ) GO TO 52 IF( ILOS.LT.N ) THEN IF( SELECT(ILOS+1).NE.0 .AND. MOD(ILOS,NB).EQ.0 ) THEN CALL PSELGET( 'All', TOP, ELEM, T, ILOS+1, ILOS, DESCT ) IF( ELEM.NE.ZERO ) GO TO 52 END IF END IF J = ICEIL(ILOS,NB) * * Globally: Set start values of LILO and LIHI for all processes. * Choose also the number of selected eigenvalues at top of each * diagonal block such that the number of eigenvalues which remain * to be reordered is an integer multiple of WINEIG. * * All the information is saved into the INTEGER workspace such * that all processors are aware of each others operations. * * Compute the number of concurrent windows. * NMWIN2 = (ICEIL(IHI,NB)*NB - (ILO-MOD(ILO,NB)+1)+1) / NB NMWIN2 = MIN( MIN( NUMWIN, NMWIN2 ), ICEIL(N,NB) - J + 1 ) * * For all windows, set LSEL = 0 and find a proper start value of * LILO such that LILO points at the first non-selected entry in * the corresponding diagonal block of T. * DO 80 K = 1, NMWIN2 IWORK( ILSEL+K-1) = 0 IWORK( ILILO+K-1) = MAX( ILO, (J-1)*NB+(K-1)*NB+1 ) LILO = IWORK( ILILO+K-1 ) 82 CONTINUE IF( SELECT(LILO).NE.0 .AND. LILO.LT.(J+K-1)*NB ) THEN LILO = LILO + 1 IF( LILO.LE.N ) GO TO 82 END IF IWORK( ILILO+K-1 ) = LILO * * Fix each LILO to ensure that no 2-by-2 block is cut in top * of the submatrix (LILO:LIHI,LILO:LIHI). * LILO = IWORK(ILILO+K-1) IF( LILO.GT.NB ) THEN CALL PSELGET( 'All', TOP, ELEM, T, LILO, LILO-1, DESCT ) IF( ELEM.NE.ZERO ) THEN IF( LILO.LT.(J+K-1)*NB ) THEN IWORK(ILILO+K-1) = IWORK(ILILO+K-1) + 1 ELSE IWORK(ILILO+K-1) = IWORK(ILILO+K-1) - 1 END IF END IF END IF * * Set a proper LIHI value for each window. Also find the * processors corresponding to the corresponding windows. * IWORK( ILIHI+K-1 ) = IWORK( ILILO+K-1 ) IWORK( IRSRC+K-1 ) = INDXG2P( IWORK(ILILO+K-1), NB, MYROW, $ DESCT( RSRC_ ), NPROW ) IWORK( ICSRC+K-1 ) = INDXG2P( IWORK(ILILO+K-1), NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) TILO = IWORK(ILILO+K-1) TIHI = MIN( N, ICEIL( TILO, NB ) * NB ) DO 90 KK = TIHI, TILO, -1 IF( SELECT(KK).NE.0 ) THEN IWORK(ILIHI+K-1) = MAX(IWORK(ILIHI+K-1) , KK ) IWORK(ILSEL+K-1) = IWORK(ILSEL+K-1) + 1 IF( IWORK(ILSEL+K-1).GT.WINEIG ) THEN IWORK(ILIHI+K-1) = KK IWORK(ILSEL+K-1) = 1 END IF END IF 90 CONTINUE * * Fix each LIHI to avoid that bottom of window cuts 2-by-2 * block. We exclude such a block if located on block (process) * border and on window border or if an inclusion would cause * violation on the maximum number of eigenvalues to reorder * inside each window. If only on window border, we include it. * The excluded block is included automatically later when a * subcluster is reordered into the block from South-East. * LIHI = IWORK(ILIHI+K-1) IF( LIHI.LT.N ) THEN CALL PSELGET( 'All', TOP, ELEM, T, LIHI+1, LIHI, DESCT ) IF( ELEM.NE.ZERO ) THEN IF( ICEIL( LIHI, NB ) .NE. ICEIL( LIHI+1, NB ) .OR. $ IWORK( ILSEL+K-1 ).EQ.WINEIG ) THEN IWORK( ILIHI+K-1 ) = IWORK( ILIHI+K-1 ) - 1 IF( IWORK( ILSEL+K-1 ).GT.2 ) $ IWORK( ILSEL+K-1 ) = IWORK( ILSEL+K-1 ) - 1 ELSE IWORK( ILIHI+K-1 ) = IWORK( ILIHI+K-1 ) + 1 IF( SELECT(LIHI+1).NE.0 ) $ IWORK( ILSEL+K-1 ) = IWORK( ILSEL+K-1 ) + 1 END IF END IF END IF 80 CONTINUE * * Fix the special cases of LSEL = 0 and LILO = LIHI for each * window by assuring that the stop-condition for local reordering * is fulfilled directly. Do this by setting LIHI = startposition * for the corresponding block and LILO = LIHI + 1. * DO 85 K = 1, NMWIN2 LILO = IWORK( ILILO + K - 1 ) LIHI = IWORK( ILIHI + K - 1 ) LSEL = IWORK( ILSEL + K - 1 ) IF( LSEL.EQ.0 .OR. LILO.EQ.LIHI ) THEN LIHI = IWORK( ILIHI + K - 1 ) IWORK( ILIHI + K - 1 ) = (ICEIL(LIHI,NB)-1)*NB + 1 IWORK( ILILO + K - 1 ) = IWORK( ILIHI + K - 1 ) + 1 END IF 85 CONTINUE * * Associate all processors with the first computational window * that should be activated, if possible. * LILO = IHI LIHI = ILO LSEL = M FIRST = .TRUE. DO 95 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) IF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN TLILO = IWORK( ILILO + WINDOW - 1 ) TLIHI = IWORK( ILIHI + WINDOW - 1 ) TLSEL = IWORK( ILSEL + WINDOW - 1 ) IF( (.NOT. ( LIHI .GE. LILO + LSEL ) ) .AND. $ ( (TLIHI .GE. TLILO + TLSEL) .OR. FIRST ) ) THEN IF( FIRST ) FIRST = .FALSE. LILO = TLILO LIHI = TLIHI LSEL = TLSEL GO TO 97 END IF END IF 95 CONTINUE 97 CONTINUE * * Exclude all processors that are not involved in any * computational window right now. * IERR = 0 IF( LILO.EQ.IHI .AND. LIHI.EQ.ILO .AND. LSEL.EQ.M ) $ GO TO 114 * * Make sure all processors associated with a compuational window * enter the local reordering the first time. * FIRST = .TRUE. * * Globally for all computational windows: * While ( LIHI >= LILO + LSEL ) do ROUND = 1 130 CONTINUE IF( FIRST .OR. ( LIHI .GE. LILO + LSEL ) ) THEN * * Perform computations in parallel: loop through all * compuational windows, do local reordering and accumulate * transformations, broadcast them in the corresponding block * row and columns and compute the corresponding updates. * DO 110 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) * * The process on the block diagonal computes the * reordering. * IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN LILO = IWORK(ILILO+WINDOW-1) LIHI = IWORK(ILIHI+WINDOW-1) LSEL = IWORK(ILSEL+WINDOW-1) * * Compute the local value of I -- start position. * I = MAX( LILO, LIHI - WINSIZ + 1 ) * * Fix my I to avoid that top of window cuts a 2-by-2 * block. * IF( I.GT.LILO ) THEN CALL INFOG2L( I, I-1, DESCT, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC, CSRC ) IF( T( LLDT*(JLOC-1) + ILOC ).NE.ZERO ) $ I = I + 1 END IF * * Compute local indicies for submatrix to operate on. * CALL INFOG2L( I, I, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC1, JLOC1, RSRC, CSRC ) * * The active window is ( I:LIHI, I:LIHI ). Reorder * eigenvalues within this window and pipeline * transformations. * NWIN = LIHI - I + 1 KS = 0 PITRAF = IPIW PDTRAF = IPW2 * PAIR = .FALSE. DO 140 K = I, LIHI IF( PAIR ) THEN PAIR = .FALSE. ELSE SWAP = SELECT( K ).NE.0 IF( K.LT.LIHI ) THEN CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC, CSRC ) IF( T( LLDT*(JLOC-1) + ILOC ).NE.ZERO ) $ PAIR = .TRUE. END IF IF( SWAP ) THEN KS = KS + 1 * * Swap the K-th block to position I+KS-1. * IERR = 0 KK = K - I + 1 KKS = KS IF( KK.NE.KS ) THEN NITRAF = LIWORK - PITRAF + 1 NDTRAF = LWORK - PDTRAF + 1 CALL BSTREXC( NWIN, $ T(LLDT*(JLOC1-1) + ILOC1), LLDT, KK, $ KKS, NITRAF, IWORK( PITRAF ), NDTRAF, $ WORK( PDTRAF ), WORK(IPW1), IERR ) PITRAF = PITRAF + NITRAF PDTRAF = PDTRAF + NDTRAF * * Update array SELECT. * IF ( PAIR ) THEN DO 150 J = I+KK-1, I+KKS, -1 SELECT(J+1) = SELECT(J-1) 150 CONTINUE SELECT(I+KKS-1) = 1 SELECT(I+KKS) = 1 ELSE DO 160 J = I+KK-1, I+KKS, -1 SELECT(J) = SELECT(J-1) 160 CONTINUE SELECT(I+KKS-1) = 1 END IF * IF ( IERR.EQ.1 .OR. IERR.EQ.2 ) THEN * * Some blocks are too close to swap: * prepare to leave in a clean fashion. If * IERR.EQ.2, we must update SELECT to * account for the fact that the 2 by 2 * block to be reordered did split and the * first part of this block is already * reordered. * IF ( IERR.EQ.2 ) THEN SELECT( I+KKS-3 ) = 1 SELECT( I+KKS-1 ) = 0 KKS = KKS + 1 END IF * * Update off-diagonal blocks immediately. * GO TO 170 END IF KS = KKS END IF IF( PAIR ) $ KS = KS + 1 END IF END IF 140 CONTINUE END IF 110 CONTINUE 170 CONTINUE * * The on-diagonal processes save their information from the * local reordering in the integer buffer. This buffer is * broadcasted to updating processors, see below. * DO 175 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN IBUFF( 1 ) = I IBUFF( 2 ) = NWIN IBUFF( 3 ) = PITRAF IBUFF( 4 ) = KS IBUFF( 5 ) = PDTRAF IBUFF( 6 ) = NDTRAF ILEN = PITRAF - IPIW DLEN = PDTRAF - IPW2 IBUFF( 7 ) = ILEN IBUFF( 8 ) = DLEN END IF 175 CONTINUE * * For the updates with respect to the local reordering, we * organize the updates in two phases where the update * "direction" (controlled by the DIR variable below) is first * chosen to be the corresponding rows, then the corresponding * columns. * DO 1111 DIR = 1, 2 * * Broadcast information about the reordering and the * accumulated transformations: I, NWIN, PITRAF, NITRAF, * PDTRAF, NDTRAF. If no broadcast is performed, use an * artificial value of KS to prevent updating indicies for * windows already finished (use KS = -1). * DO 111 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) IF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN LILO = IWORK(ILILO+WINDOW-1) LIHI = IWORK(ILIHI+WINDOW-1) LSEL = IWORK(ILSEL+WINDOW-1) END IF IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) $ CALL IGEBS2D( ICTXT, 'Row', TOP, 8, 1, IBUFF, 8 ) IF( NPROW.GT.1 .AND. DIR.EQ.2 ) $ CALL IGEBS2D( ICTXT, 'Col', TOP, 8, 1, IBUFF, 8 ) ELSEIF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 .AND. MYROW.EQ.RSRC ) $ THEN IF( FIRST .OR. (LIHI .GE. LILO + LSEL) ) THEN CALL IGEBR2D( ICTXT, 'Row', TOP, 8, 1, IBUFF, 8, $ RSRC, CSRC ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) ELSE ILEN = 0 DLEN = 0 KS = -1 END IF END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 .AND. MYCOL.EQ.CSRC ) $ THEN IF( FIRST .OR. (LIHI .GE. LILO + LSEL) ) THEN CALL IGEBR2D( ICTXT, 'Col', TOP, 8, 1, IBUFF, 8, $ RSRC, CSRC ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) ELSE ILEN = 0 DLEN = 0 KS = -1 END IF END IF END IF * * Broadcast the accumulated transformations - copy all * information from IWORK(IPIW:PITRAF-1) and * WORK(IPW2:PDTRAF-1) to a buffer and broadcast this * buffer in the corresponding block row and column. On * arrival, copy the information back to the correct part of * the workspace. This step is avoided if no computations * were performed at the diagonal processor, i.e., * BUFFLEN = 0. * IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( BUFFLEN.NE.0 ) THEN DO 180 INDX = 1, ILEN WORK( BUFFER+INDX-1 ) = $ FLOAT( IWORK(IPIW+INDX-1) ) 180 CONTINUE CALL SLAMOV( 'All', DLEN, 1, WORK( IPW2 ), $ DLEN, WORK(BUFFER+ILEN), DLEN ) IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) THEN CALL SGEBS2D( ICTXT, 'Row', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN ) END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 ) THEN CALL SGEBS2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN ) END IF END IF ELSEIF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 .AND. MYROW.EQ.RSRC ) $ THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( BUFFLEN.NE.0 ) THEN CALL SGEBR2D( ICTXT, 'Row', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC, CSRC ) END IF END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 .AND. MYCOL.EQ.CSRC ) $ THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( BUFFLEN.NE.0 ) THEN CALL SGEBR2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC, CSRC ) END IF END IF IF((NPCOL.GT.1.AND.DIR.EQ.1.AND.MYROW.EQ.RSRC).OR. $ (NPROW.GT.1.AND.DIR.EQ.2.AND.MYCOL.EQ.CSRC ) ) $ THEN IF( BUFFLEN.NE.0 ) THEN DO 190 INDX = 1, ILEN IWORK(IPIW+INDX-1) = $ INT(WORK( BUFFER+INDX-1 )) 190 CONTINUE CALL SLAMOV( 'All', DLEN, 1, $ WORK( BUFFER+ILEN ), DLEN, $ WORK( IPW2 ), DLEN ) END IF END IF END IF 111 CONTINUE * * Now really perform the updates by applying the orthogonal * transformations to the out-of-window parts of T and Q. This * step is avoided if no reordering was performed by the on- * diagonal processor from the beginning, i.e., BUFFLEN = 0. * * Count number of operations to decide whether to use * matrix-matrix multiplications for updating off-diagonal * parts or not. * DO 112 WINDOW = 1, NMWIN2 RSRC = IWORK(IRSRC+WINDOW-1) CSRC = IWORK(ICSRC+WINDOW-1) * IF( (MYROW.EQ.RSRC .AND. DIR.EQ.1 ).OR. $ (MYCOL.EQ.CSRC .AND. DIR.EQ.2 ) ) THEN LILO = IWORK(ILILO+WINDOW-1) LIHI = IWORK(ILIHI+WINDOW-1) LSEL = IWORK(ILSEL+WINDOW-1) * * Skip update part for current WINDOW if BUFFLEN = 0. * IF( BUFFLEN.EQ.0 ) GO TO 295 * NITRAF = PITRAF - IPIW ISHH = .FALSE. FLOPS = 0 DO 200 K = 1, NITRAF IF( IWORK( IPIW + K - 1 ).LE.NWIN ) THEN FLOPS = FLOPS + 6 ELSE FLOPS = FLOPS + 11 ISHH = .TRUE. END IF 200 CONTINUE * * Compute amount of work space necessary for performing * matrix-matrix multiplications. * PDW = BUFFER IPW3 = PDW + NWIN*NWIN ELSE FLOPS = 0 END IF * IF( FLOPS.NE.0 .AND. $ ( FLOPS*100 ) / ( 2*NWIN*NWIN ) .GE. MMULT ) THEN * * Update off-diagonal blocks and Q using matrix-matrix * multiplications; if there are no Householder * reflectors it is preferable to take the triangular * block structure of the transformation matrix into * account. * CALL SLASET( 'All', NWIN, NWIN, ZERO, ONE, $ WORK( PDW ), NWIN ) CALL BSLAAPP( 1, NWIN, NWIN, NCB, WORK( PDW ), NWIN, $ NITRAF, IWORK(IPIW), WORK( IPW2 ), WORK(IPW3) ) * IF( ISHH ) THEN * * Loop through the local blocks of the distributed * matrices T and Q and update them according to the * performed reordering. * * Update the columns of T and Q affected by the * reordering. * IF( DIR.EQ.2 ) THEN DO 210 INDX = 1, I-1, NB CALL INFOG2L( INDX, I, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN LROWS = MIN(NB,I-INDX) CALL SGEMM( 'No transpose', $ 'No transpose', LROWS, NWIN, NWIN, $ ONE, T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( PDW ), NWIN, ZERO, $ WORK(IPW3), LROWS ) CALL SLAMOV( 'All', LROWS, NWIN, $ WORK(IPW3), LROWS, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 210 CONTINUE IF( WANTQ ) THEN DO 220 INDX = 1, N, NB CALL INFOG2L( INDX, I, DESCQ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN LROWS = MIN(NB,N-INDX+1) CALL SGEMM( 'No transpose', $ 'No transpose', LROWS, NWIN, NWIN, $ ONE, Q((JLOC-1)*LLDQ+ILOC), LLDQ, $ WORK( PDW ), NWIN, ZERO, $ WORK(IPW3), LROWS ) CALL SLAMOV( 'All', LROWS, NWIN, $ WORK(IPW3), LROWS, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ ) END IF 220 CONTINUE END IF END IF * * Update the rows of T affected by the reordering * IF( DIR.EQ.1 ) THEN IF( LIHI.LT.N ) THEN IF( MOD(LIHI,NB).GT.0 ) THEN INDX = LIHI + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN LCOLS = MOD( MIN( NB-MOD(LIHI,NB), $ N-LIHI ), NB ) CALL SGEMM( 'Transpose', $ 'No Transpose', NWIN, LCOLS, NWIN, $ ONE, WORK( PDW ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT, ZERO, $ WORK(IPW3), NWIN ) CALL SLAMOV( 'All', NWIN, LCOLS, $ WORK(IPW3), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF END IF INDXS = ICEIL(LIHI,NB)*NB + 1 DO 230 INDX = INDXS, N, NB CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN LCOLS = MIN( NB, N-INDX+1 ) CALL SGEMM( 'Transpose', $ 'No Transpose', NWIN, LCOLS, NWIN, $ ONE, WORK( PDW ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT, ZERO, $ WORK(IPW3), NWIN ) CALL SLAMOV( 'All', NWIN, LCOLS, $ WORK(IPW3), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 230 CONTINUE END IF END IF ELSE * * The NWIN-by-NWIN matrix U containing the * accumulated orthogonal transformations has the * following structure: * * [ U11 U12 ] * U = [ ], * [ U21 U22 ] * * where U21 is KS-by-KS upper triangular and U12 is * (NWIN-KS)-by-(NWIN-KS) lower triangular. * * Update the columns of T and Q affected by the * reordering. * * Compute T2*U21 + T1*U11 in workspace. * IF( DIR.EQ.2 ) THEN DO 240 INDX = 1, I-1, NB CALL INFOG2L( INDX, I, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN JLOC1 = INDXG2L( I+NWIN-KS, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) LROWS = MIN(NB,I-INDX) CALL SLAMOV( 'All', LROWS, KS, $ T((JLOC1-1)*LLDT+ILOC ), LLDT, $ WORK(IPW3), LROWS ) CALL STRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', LROWS, KS, ONE, $ WORK( PDW+NWIN-KS ), NWIN, $ WORK(IPW3), LROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', LROWS, KS, NWIN-KS, $ ONE, T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( PDW ), NWIN, ONE, WORK(IPW3), $ LROWS ) * * Compute T1*U12 + T2*U22 in workspace. * CALL SLAMOV( 'All', LROWS, NWIN-KS, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( IPW3+KS*LROWS ), LROWS ) CALL STRMM( 'Right', 'Lower', $ 'No transpose', 'Non-unit', $ LROWS, NWIN-KS, ONE, $ WORK( PDW+NWIN*KS ), NWIN, $ WORK( IPW3+KS*LROWS ), LROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', LROWS, NWIN-KS, KS, $ ONE, T((JLOC1-1)*LLDT+ILOC), LLDT, $ WORK( PDW+NWIN*KS+NWIN-KS ), NWIN, $ ONE, WORK( IPW3+KS*LROWS ), LROWS ) * * Copy workspace to T. * CALL SLAMOV( 'All', LROWS, NWIN, $ WORK(IPW3), LROWS, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 240 CONTINUE IF( WANTQ ) THEN * * Compute Q2*U21 + Q1*U11 in workspace. * DO 250 INDX = 1, N, NB CALL INFOG2L( INDX, I, DESCQ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN JLOC1 = INDXG2L( I+NWIN-KS, NB, $ MYCOL, DESCQ( CSRC_ ), NPCOL ) LROWS = MIN(NB,N-INDX+1) CALL SLAMOV( 'All', LROWS, KS, $ Q((JLOC1-1)*LLDQ+ILOC ), LLDQ, $ WORK(IPW3), LROWS ) CALL STRMM( 'Right', 'Upper', $ 'No transpose', 'Non-unit', $ LROWS, KS, ONE, $ WORK( PDW+NWIN-KS ), NWIN, $ WORK(IPW3), LROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', LROWS, KS, $ NWIN-KS, ONE, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ, $ WORK( PDW ), NWIN, ONE, $ WORK(IPW3), LROWS ) * * Compute Q1*U12 + Q2*U22 in workspace. * CALL SLAMOV( 'All', LROWS, NWIN-KS, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ, $ WORK( IPW3+KS*LROWS ), LROWS) CALL STRMM( 'Right', 'Lower', $ 'No transpose', 'Non-unit', $ LROWS, NWIN-KS, ONE, $ WORK( PDW+NWIN*KS ), NWIN, $ WORK( IPW3+KS*LROWS ), LROWS) CALL SGEMM( 'No transpose', $ 'No transpose', LROWS, NWIN-KS, $ KS, ONE, Q((JLOC1-1)*LLDQ+ILOC), $ LLDQ, WORK(PDW+NWIN*KS+NWIN-KS), $ NWIN, ONE, WORK( IPW3+KS*LROWS ), $ LROWS ) * * Copy workspace to Q. * CALL SLAMOV( 'All', LROWS, NWIN, $ WORK(IPW3), LROWS, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ ) END IF 250 CONTINUE END IF END IF * IF( DIR.EQ.1 ) THEN IF ( LIHI.LT.N ) THEN * * Compute U21**T*T2 + U11**T*T1 in workspace. * IF( MOD(LIHI,NB).GT.0 ) THEN INDX = LIHI + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN ILOC1 = INDXG2L( I+NWIN-KS, NB, MYROW, $ DESCT( RSRC_ ), NPROW ) LCOLS = MOD( MIN( NB-MOD(LIHI,NB), $ N-LIHI ), NB ) CALL SLAMOV( 'All', KS, LCOLS, $ T((JLOC-1)*LLDT+ILOC1), LLDT, $ WORK(IPW3), NWIN ) CALL STRMM( 'Left', 'Upper', $ 'Transpose', 'Non-unit', KS, $ LCOLS, ONE, WORK( PDW+NWIN-KS ), $ NWIN, WORK(IPW3), NWIN ) CALL SGEMM( 'Transpose', $ 'No transpose', KS, LCOLS, $ NWIN-KS, ONE, WORK(PDW), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT, ONE, $ WORK(IPW3), NWIN ) * * Compute U12**T*T1 + U22**T*T2 in * workspace. * CALL SLAMOV( 'All', NWIN-KS, LCOLS, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( IPW3+KS ), NWIN ) CALL STRMM( 'Left', 'Lower', $ 'Transpose', 'Non-unit', $ NWIN-KS, LCOLS, ONE, $ WORK( PDW+NWIN*KS ), NWIN, $ WORK( IPW3+KS ), NWIN ) CALL SGEMM( 'Transpose', $ 'No Transpose', NWIN-KS, LCOLS, $ KS, ONE, $ WORK( PDW+NWIN*KS+NWIN-KS ), $ NWIN, T((JLOC-1)*LLDT+ILOC1), $ LLDT, ONE, WORK( IPW3+KS ), $ NWIN ) * * Copy workspace to T. * CALL SLAMOV( 'All', NWIN, LCOLS, $ WORK(IPW3), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF END IF INDXS = ICEIL(LIHI,NB)*NB + 1 DO 260 INDX = INDXS, N, NB CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC1 ) $ THEN * * Compute U21**T*T2 + U11**T*T1 in * workspace. * ILOC1 = INDXG2L( I+NWIN-KS, NB, $ MYROW, DESCT( RSRC_ ), NPROW ) LCOLS = MIN( NB, N-INDX+1 ) CALL SLAMOV( 'All', KS, LCOLS, $ T((JLOC-1)*LLDT+ILOC1), LLDT, $ WORK(IPW3), NWIN ) CALL STRMM( 'Left', 'Upper', $ 'Transpose', 'Non-unit', KS, $ LCOLS, ONE, $ WORK( PDW+NWIN-KS ), NWIN, $ WORK(IPW3), NWIN ) CALL SGEMM( 'Transpose', $ 'No transpose', KS, LCOLS, $ NWIN-KS, ONE, WORK(PDW), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT, ONE, $ WORK(IPW3), NWIN ) * * Compute U12**T*T1 + U22**T*T2 in * workspace. * CALL SLAMOV( 'All', NWIN-KS, LCOLS, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK( IPW3+KS ), NWIN ) CALL STRMM( 'Left', 'Lower', $ 'Transpose', 'Non-unit', $ NWIN-KS, LCOLS, ONE, $ WORK( PDW+NWIN*KS ), NWIN, $ WORK( IPW3+KS ), NWIN ) CALL SGEMM( 'Transpose', $ 'No Transpose', NWIN-KS, LCOLS, $ KS, ONE, $ WORK( PDW+NWIN*KS+NWIN-KS ), $ NWIN, T((JLOC-1)*LLDT+ILOC1), $ LLDT, ONE, WORK(IPW3+KS), NWIN ) * * Copy workspace to T. * CALL SLAMOV( 'All', NWIN, LCOLS, $ WORK(IPW3), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 260 CONTINUE END IF END IF END IF ELSEIF( FLOPS.NE.0 ) THEN * * Update off-diagonal blocks and Q using the pipelined * elementary transformations. * IF( DIR.EQ.2 ) THEN DO 270 INDX = 1, I-1, NB CALL INFOG2L( INDX, I, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN LROWS = MIN(NB,I-INDX) CALL BSLAAPP( 1, LROWS, NWIN, NCB, $ T((JLOC-1)*LLDT+ILOC ), LLDT, NITRAF, $ IWORK(IPIW), WORK( IPW2 ), $ WORK(IPW3) ) END IF 270 CONTINUE IF( WANTQ ) THEN DO 280 INDX = 1, N, NB CALL INFOG2L( INDX, I, DESCQ, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN LROWS = MIN(NB,N-INDX+1) CALL BSLAAPP( 1, LROWS, NWIN, NCB, $ Q((JLOC-1)*LLDQ+ILOC), LLDQ, NITRAF, $ IWORK(IPIW), WORK( IPW2 ), $ WORK(IPW3) ) END IF 280 CONTINUE END IF END IF IF( DIR.EQ.1 ) THEN IF( LIHI.LT.N ) THEN IF( MOD(LIHI,NB).GT.0 ) THEN INDX = LIHI + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN LCOLS = MOD( MIN( NB-MOD(LIHI,NB), $ N-LIHI ), NB ) CALL BSLAAPP( 0, NWIN, LCOLS, NCB, $ T((JLOC-1)*LLDT+ILOC), LLDT, NITRAF, $ IWORK(IPIW), WORK( IPW2 ), $ WORK(IPW3) ) END IF END IF INDXS = ICEIL(LIHI,NB)*NB + 1 DO 290 INDX = INDXS, N, NB CALL INFOG2L( I, INDX, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ILOC, JLOC, RSRC1, CSRC1 ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) $ THEN LCOLS = MIN( NB, N-INDX+1 ) CALL BSLAAPP( 0, NWIN, LCOLS, NCB, $ T((JLOC-1)*LLDT+ILOC), LLDT, NITRAF, $ IWORK(IPIW), WORK( IPW2 ), $ WORK(IPW3) ) END IF 290 CONTINUE END IF END IF END IF * * If I was not involved in the updates for the current * window or the window was fully processed, I go here and * try again for the next window. * 295 CONTINUE * * Update LIHI and LIHI depending on the number of * eigenvalues really moved - for on-diagonal processes we * do this update only once since each on-diagonal process * is only involved with one window at one time. The * indicies are updated in three cases: * 1) When some reordering was really performed * -- indicated by BUFFLEN > 0. * 2) When no selected eigenvalues was found in the * current window -- indicated by KS = 0. * 3) When some selected eigenvalues was found in the * current window but no one of them was moved * (KS > 0 and BUFFLEN = 0) * False index updating is avoided by sometimes setting * KS = -1. This will affect processors involved in more * than one window and where the first one ends up with * KS = 0 and for the second one is done already. * IF( MYROW.EQ.RSRC.AND.MYCOL.EQ.CSRC ) THEN IF( DIR.EQ.2 ) THEN IF( BUFFLEN.NE.0 .OR. KS.EQ.0 .OR. $ ( BUFFLEN.EQ.0 .AND. KS.GT.0 ) ) $ LIHI = I + KS - 1 IWORK( ILIHI+WINDOW-1 ) = LIHI IF( .NOT. LIHI.GE.LILO+LSEL ) THEN LILO = LILO + LSEL IWORK( ILILO+WINDOW-1 ) = LILO END IF END IF ELSEIF( MYROW.EQ.RSRC .AND. DIR.EQ.1 ) THEN IF( BUFFLEN.NE.0 .OR. KS.EQ.0 .OR. $ ( BUFFLEN.EQ.0 .AND. KS.GT.0 ) ) $ LIHI = I + KS - 1 IWORK( ILIHI+WINDOW-1 ) = LIHI IF( .NOT. LIHI.GE.LILO+LSEL ) THEN LILO = LILO + LSEL IWORK( ILILO+WINDOW-1 ) = LILO END IF ELSEIF( MYCOL.EQ.CSRC .AND. DIR.EQ.2 ) THEN IF( BUFFLEN.NE.0 .OR. KS.EQ.0 .OR. $ ( BUFFLEN.EQ.0 .AND. KS.GT.0 ) ) $ LIHI = I + KS - 1 IWORK( ILIHI+WINDOW-1 ) = LIHI IF( .NOT. LIHI.GE.LILO+LSEL ) THEN LILO = LILO + LSEL IWORK( ILILO+WINDOW-1 ) = LILO END IF END IF * 112 CONTINUE * * End of direction loop for updates with respect to local * reordering. * 1111 CONTINUE * * Associate each process with one of the corresponding * computational windows such that the test for another round * of local reordering is carried out properly. Since the * column updates were computed after the row updates, it is * sufficient to test for changing the association to the * window in the corresponding process row. * DO 113 WINDOW = 1, NMWIN2 RSRC = IWORK( IRSRC + WINDOW - 1 ) IF( MYROW.EQ.RSRC .AND. (.NOT. LIHI.GE.LILO+LSEL ) ) THEN LILO = IWORK( ILILO + WINDOW - 1 ) LIHI = IWORK( ILIHI + WINDOW - 1 ) LSEL = IWORK( ILSEL + WINDOW - 1 ) END IF 113 CONTINUE * * End While ( LIHI >= LILO + LSEL ) ROUND = ROUND + 1 IF( FIRST ) FIRST = .FALSE. GO TO 130 END IF * * All processors excluded from the local reordering go here. * 114 CONTINUE * * Barrier to collect the processes before proceeding. * CALL BLACS_BARRIER( ICTXT, 'All' ) * * Compute global maximum of IERR so that we know if some process * experienced a failure in the reordering. * MYIERR = IERR IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, IERR, 1, -1, $ -1, -1, -1, -1 ) * IF( IERR.NE.0 ) THEN * * When calling BDTREXC, the block at position I+KKS-1 failed * to swap. * IF( MYIERR.NE.0 ) INFO = MAX(1,I+KKS-1) IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, -1, $ -1, -1, -1, -1 ) GO TO 300 END IF * * Now, for each compuational window, move the selected * eigenvalues across the process border. Do this by forming the * processors into groups of four working together to bring the * window over the border. The processes are numbered as follows * * 1 | 2 * --+-- * 3 | 4 * * where '|' and '-' denotes the process (and block) borders. * This implies that the cluster to be reordered over the border * is held by process 4, process 1 will receive the cluster after * the reordering, process 3 holds the local (2,1)th element of a * 2-by-2 diagonal block located on the block border and process 2 * holds the closest off-diagonal part of the window that is * affected by the cross-border reordering. * * The active window is now ( I : LIHI[4], I : LIHI[4] ), where * I = MAX( ILO, LIHI - 2*MOD(LIHI,NB) ). If this active window is * too large compared to the value of PARA( 6 ), it will be * truncated in both ends such that a maximum of PARA( 6 ) * eigenvalues is reordered across the border this time. * * The active window will be collected and built in workspace at * process 1 and 4, which both compute the reordering and return * the updated parts to the corresponding processes 2-3. Next, the * accumulated transformations are broadcasted for updates in the * block rows and column that corresponds to the process rows and * columns where process 1 and 4 reside. * * The off-diagonal blocks are updated by the processes receiving * from the broadcasts of the orthogonal transformations. Since * the active window is split over the process borders, the * updates of T and Q requires that stripes of block rows of * columns are exchanged between neighboring processes in the * corresponding process rows and columns. * * First, form each group of processors involved in the * crossborder reordering. Do this in two (or three) phases: * 1) Reorder each odd window over the border. * 2) Reorder each even window over the border. * 3) Reorder the last odd window over the border, if it was not * processed in the first phase. * * When reordering the odd windows over the border, we must make * sure that no process row or column is involved in both the * first and the last window at the same time. This happens when * the total number of windows is odd, greater than one and equal * to the minumum process mesh dimension. Therefore the last odd * window may be reordered over the border at last. * LASTWAIT = NMWIN2.GT.1 .AND. MOD(NMWIN2,2).EQ.1 .AND. $ NMWIN2.EQ.MIN(NPROW,NPCOL) * LAST = 0 308 CONTINUE IF( LASTWAIT ) THEN IF( LAST.EQ.0 ) THEN WIN0S = 1 WIN0E = 2 WINE = NMWIN2 - 1 ELSE WIN0S = NMWIN2 WIN0E = NMWIN2 WINE = NMWIN2 END IF ELSE WIN0S = 1 WIN0E = 2 WINE = NMWIN2 END IF DO 310 WINDOW0 = WIN0S, WIN0E DO 320 WINDOW = WINDOW0, WINE, 2 * * Define the process holding the down-right part of the * window. * RSRC4 = IWORK(IRSRC+WINDOW-1) CSRC4 = IWORK(ICSRC+WINDOW-1) * * Define the other processes in the group of four. * RSRC3 = RSRC4 CSRC3 = MOD( CSRC4 - 1 + NPCOL, NPCOL ) RSRC2 = MOD( RSRC4 - 1 + NPROW, NPROW ) CSRC2 = CSRC4 RSRC1 = RSRC2 CSRC1 = CSRC3 IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) .OR. $ ( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN * * Compute the correct active window - for reordering * into a block that has not been active at all before, * we try to reorder as many of our eigenvalues over the * border as possible without knowing of the situation on * the other side - this may cause very few eigenvalues * to be reordered over the border this time (perhaps not * any) but this should be an initial problem. Anyway, * the bottom-right position of the block will be at * position LIHIC. * IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN LIHI4 = ( IWORK( ILILO + WINDOW - 1 ) + $ IWORK( ILIHI + WINDOW - 1 ) ) / 2 LIHIC = MIN(LIHI4,(ICEIL(LIHI4,NB)-1)*NB+WNEICR) * * Fix LIHIC to avoid that bottom of window cuts * 2-by-2 block and make sure all processors in the * group knows about the correct value. * IF( (.NOT. LIHIC.LE.NB) .AND. LIHIC.LT.N ) THEN ILOC = INDXG2L( LIHIC+1, NB, MYROW, $ DESCT( RSRC_ ), NPROW ) JLOC = INDXG2L( LIHIC, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) IF( T( (JLOC-1)*LLDT+ILOC ).NE.ZERO ) THEN IF( MOD( LIHIC, NB ).EQ.1 .OR. $ ( MOD( LIHIC, NB ).EQ.2 .AND. $ SELECT(LIHIC-2).EQ.0 ) ) $ THEN LIHIC = LIHIC + 1 ELSE LIHIC = LIHIC - 1 END IF END IF END IF IF( RSRC4.NE.RSRC1 .OR. CSRC4.NE.CSRC1 ) $ CALL IGESD2D( ICTXT, 1, 1, LIHIC, 1, RSRC1, $ CSRC1 ) IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) $ CALL IGESD2D( ICTXT, 1, 1, LIHIC, 1, RSRC2, $ CSRC2 ) IF( RSRC4.NE.RSRC3 .OR. CSRC4.NE.CSRC3 ) $ CALL IGESD2D( ICTXT, 1, 1, LIHIC, 1, RSRC3, $ CSRC3 ) END IF IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC4.NE.RSRC1 .OR. CSRC4.NE.CSRC1 ) $ CALL IGERV2D( ICTXT, 1, 1, LIHIC, 1, RSRC4, $ CSRC4 ) END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) $ CALL IGERV2D( ICTXT, 1, 1, LIHIC, 1, RSRC4, $ CSRC4 ) END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN IF( RSRC4.NE.RSRC3 .OR. CSRC4.NE.CSRC3 ) $ CALL IGERV2D( ICTXT, 1, 1, LIHIC, 1, RSRC4, $ CSRC4 ) END IF * * Avoid going over the border with the first window if * it resides in the block where the last global position * T(ILO,ILO) is or ILO has been updated to point to a * position right of T(LIHIC,LIHIC). * SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) * * Decide I, where to put top of window, such that top of * window does not cut 2-by-2 block. Make sure that we do * not end up in a situation where a 2-by-2 block * splitted on the border is left in its original place * -- this can cause infinite loops. * Remedy: make sure that the part of the window that * resides left to the border is at least of dimension * two (2) in case we have 2-by-2 blocks in top of the * cross border window. * * Also make sure all processors in the group knows about * the correct value of I. When skipping the crossborder * reordering, just set I = LIHIC. * IF( .NOT. SKIP1CR ) THEN IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( WINDOW.EQ.1 ) THEN LIHI1 = ILO ELSE LIHI1 = IWORK( ILIHI + WINDOW - 2 ) END IF I = MAX( LIHI1, $ MIN( LIHIC-2*MOD(LIHIC,NB) + 1, $ (ICEIL(LIHIC,NB)-1)*NB - 1 ) ) ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I-1, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) IF( T( (JLOC-1)*LLDT+ILOC ).NE.ZERO ) $ I = I - 1 IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) $ CALL IGESD2D( ICTXT, 1, 1, I, 1, RSRC4, $ CSRC4 ) IF( RSRC1.NE.RSRC2 .OR. CSRC1.NE.CSRC2 ) $ CALL IGESD2D( ICTXT, 1, 1, I, 1, RSRC2, $ CSRC2 ) IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) $ CALL IGESD2D( ICTXT, 1, 1, I, 1, RSRC3, $ CSRC3 ) END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN IF( RSRC1.NE.RSRC2 .OR. CSRC1.NE.CSRC2 ) $ CALL IGERV2D( ICTXT, 1, 1, I, 1, RSRC1, $ CSRC1 ) END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) $ CALL IGERV2D( ICTXT, 1, 1, I, 1, RSRC1, $ CSRC1 ) END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) $ CALL IGERV2D( ICTXT, 1, 1, I, 1, RSRC1, $ CSRC1 ) END IF ELSE I = LIHIC END IF * * Finalize computation of window size: active window is * now (I:LIHIC,I:LIHIC). * NWIN = LIHIC - I + 1 KS = 0 * * Skip rest of this part if appropriate. * IF( SKIP1CR ) GO TO 360 * * Divide workspace -- put active window in * WORK(IPW2:IPW2+NWIN**2-1) and orthogonal * transformations in WORK(IPW3:...). * CALL SLASET( 'All', NWIN, NWIN, ZERO, ZERO, $ WORK( IPW2 ), NWIN ) * PITRAF = IPIW IPW3 = IPW2 + NWIN*NWIN PDTRAF = IPW3 * * Exchange the current view of SELECT for the active * window between process 1 and 4 to make sure that * exactly the same job is performed for both processes. * IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) THEN ILEN4 = MOD(LIHIC,NB) SELI4 = ICEIL(I,NB)*NB+1 ILEN1 = NWIN - ILEN4 IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN CALL IGESD2D( ICTXT, ILEN1, 1, SELECT(I), $ ILEN1, RSRC4, CSRC4 ) CALL IGERV2D( ICTXT, ILEN4, 1, SELECT(SELI4), $ ILEN4, RSRC4, CSRC4 ) END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN CALL IGESD2D( ICTXT, ILEN4, 1, SELECT(SELI4), $ ILEN4, RSRC1, CSRC1 ) CALL IGERV2D( ICTXT, ILEN1, 1, SELECT(I), $ ILEN1, RSRC1, CSRC1 ) END IF END IF * * Form the active window by a series of point-to-point * sends and receives. * DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) CALL SLAMOV( 'All', DIM1, DIM1, $ T((JLOC-1)*LLDT+ILOC), LLDT, WORK(IPW2), $ NWIN ) IF( RSRC1.NE.RSRC4 .OR. CSRC1.NE.CSRC4 ) THEN CALL SGESD2D( ICTXT, DIM1, DIM1, $ WORK(IPW2), NWIN, RSRC4, CSRC4 ) CALL SGERV2D( ICTXT, DIM4, DIM4, $ WORK(IPW2+DIM1*NWIN+DIM1), NWIN, RSRC4, $ CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN ILOC = INDXG2L( I+DIM1, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I+DIM1, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) CALL SLAMOV( 'All', DIM4, DIM4, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK(IPW2+DIM1*NWIN+DIM1), NWIN ) IF( RSRC4.NE.RSRC1 .OR. CSRC4.NE.CSRC1 ) THEN CALL SGESD2D( ICTXT, DIM4, DIM4, $ WORK(IPW2+DIM1*NWIN+DIM1), NWIN, RSRC1, $ CSRC1 ) CALL SGERV2D( ICTXT, DIM1, DIM1, $ WORK(IPW2), NWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I+DIM1, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) CALL SLAMOV( 'All', DIM1, DIM4, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK(IPW2+DIM1*NWIN), NWIN ) IF( RSRC2.NE.RSRC1 .OR. CSRC2.NE.CSRC1 ) THEN CALL SGESD2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN IF( RSRC2.NE.RSRC4 .OR. CSRC2.NE.CSRC4 ) THEN CALL SGESD2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN ILOC = INDXG2L( I+DIM1, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I+DIM1-1, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) CALL SLAMOV( 'All', 1, 1, $ T((JLOC-1)*LLDT+ILOC), LLDT, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN ) IF( RSRC3.NE.RSRC1 .OR. CSRC3.NE.CSRC1 ) THEN CALL SGESD2D( ICTXT, 1, 1, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN, $ RSRC1, CSRC1 ) END IF END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN IF( RSRC3.NE.RSRC4 .OR. CSRC3.NE.CSRC4 ) THEN CALL SGESD2D( ICTXT, 1, 1, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN, $ RSRC4, CSRC4 ) END IF END IF IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC1.NE.RSRC2 .OR. CSRC1.NE.CSRC2 ) THEN CALL SGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC2, $ CSRC2 ) END IF IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) THEN CALL SGERV2D( ICTXT, 1, 1, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN, $ RSRC3, CSRC3 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) THEN CALL SGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC2, $ CSRC2 ) END IF IF( RSRC4.NE.RSRC3 .OR. CSRC4.NE.CSRC3 ) THEN CALL SGERV2D( ICTXT, 1, 1, $ WORK(IPW2+(DIM1-1)*NWIN+DIM1), NWIN, $ RSRC3, CSRC3 ) END IF END IF * * Compute the reordering (just as in the total local * case) and accumulate the transformations (ONLY * ON-DIAGONAL PROCESSES). * IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN PAIR = .FALSE. DO 330 K = I, LIHIC IF( PAIR ) THEN PAIR = .FALSE. ELSE SWAP = SELECT( K ).NE.0 IF( K.LT.LIHIC ) THEN ELEM = WORK(IPW2+(K-I)*NWIN+K-I+1) IF( ELEM.NE.ZERO ) $ PAIR = .TRUE. END IF IF( SWAP ) THEN KS = KS + 1 * * Swap the K-th block to position I+KS-1. * IERR = 0 KK = K - I + 1 KKS = KS IF( KK.NE.KS ) THEN NITRAF = LIWORK - PITRAF + 1 NDTRAF = LWORK - PDTRAF + 1 CALL BSTREXC( NWIN, WORK(IPW2), NWIN, $ KK, KKS, NITRAF, IWORK( PITRAF ), $ NDTRAF, WORK( PDTRAF ), $ WORK(IPW1), IERR ) PITRAF = PITRAF + NITRAF PDTRAF = PDTRAF + NDTRAF * * Update array SELECT. * IF ( PAIR ) THEN DO 340 J = I+KK-1, I+KKS, -1 SELECT(J+1) = SELECT(J-1) 340 CONTINUE SELECT(I+KKS-1) = 1 SELECT(I+KKS) = 1 ELSE DO 350 J = I+KK-1, I+KKS, -1 SELECT(J) = SELECT(J-1) 350 CONTINUE SELECT(I+KKS-1) = 1 END IF * IF ( IERR.EQ.1 .OR. IERR.EQ.2 ) THEN * IF ( IERR.EQ.2 ) THEN SELECT( I+KKS-3 ) = 1 SELECT( I+KKS-1 ) = 0 KKS = KKS + 1 END IF * GO TO 360 END IF KS = KKS END IF IF( PAIR ) $ KS = KS + 1 END IF END IF 330 CONTINUE END IF 360 CONTINUE * * Save information about the reordering. * IF( ( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) .OR. $ ( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) ) THEN IBUFF( 1 ) = I IBUFF( 2 ) = NWIN IBUFF( 3 ) = PITRAF IBUFF( 4 ) = KS IBUFF( 5 ) = PDTRAF IBUFF( 6 ) = NDTRAF ILEN = PITRAF - IPIW + 1 DLEN = PDTRAF - IPW3 + 1 IBUFF( 7 ) = ILEN IBUFF( 8 ) = DLEN * * Put reordered data back into global matrix if a * reordering took place. * IF( .NOT. SKIP1CR ) THEN IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I, NB, MYCOL, DESCT( CSRC_ ), $ NPCOL ) CALL SLAMOV( 'All', DIM1, DIM1, WORK(IPW2), $ NWIN, T((JLOC-1)*LLDT+ILOC), LLDT ) END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN ILOC = INDXG2L( I+DIM1, NB, MYROW, $ DESCT( RSRC_ ), NPROW ) JLOC = INDXG2L( I+DIM1, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) CALL SLAMOV( 'All', DIM4, DIM4, $ WORK(IPW2+DIM1*NWIN+DIM1), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF END IF END IF * * Break if appropriate -- IBUFF(3:8) may now contain * nonsens, but that's no problem. The processors outside * the cross border group only needs to know about I and * NWIN to get a correct value of SKIP1CR (see below) and * to skip the cross border updates if necessary. * IF( WINDOW.EQ.1 .AND. SKIP1CR ) GO TO 325 * * Return reordered data to process 2 and 3. * IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( RSRC1.NE.RSRC3 .OR. CSRC1.NE.CSRC3 ) THEN CALL SGESD2D( ICTXT, 1, 1, $ WORK( IPW2+(DIM1-1)*NWIN+DIM1 ), NWIN, $ RSRC3, CSRC3 ) END IF END IF IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( RSRC4.NE.RSRC2 .OR. CSRC4.NE.CSRC2 ) THEN CALL SGESD2D( ICTXT, DIM1, DIM4, $ WORK( IPW2+DIM1*NWIN), NWIN, RSRC2, $ CSRC2 ) END IF END IF IF( MYROW.EQ.RSRC2 .AND. MYCOL.EQ.CSRC2 ) THEN ILOC = INDXG2L( I, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) JLOC = INDXG2L( I+DIM1, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) IF( RSRC2.NE.RSRC4 .OR. CSRC2.NE.CSRC4 ) THEN CALL SGERV2D( ICTXT, DIM1, DIM4, $ WORK(IPW2+DIM1*NWIN), NWIN, RSRC4, CSRC4 ) END IF CALL SLAMOV( 'All', DIM1, DIM4, $ WORK( IPW2+DIM1*NWIN ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF IF( MYROW.EQ.RSRC3 .AND. MYCOL.EQ.CSRC3 ) THEN ILOC = INDXG2L( I+DIM1, NB, MYROW, $ DESCT( RSRC_ ), NPROW ) JLOC = INDXG2L( I+DIM1-1, NB, MYCOL, $ DESCT( CSRC_ ), NPCOL ) IF( RSRC3.NE.RSRC1 .OR. CSRC3.NE.CSRC1 ) THEN CALL SGERV2D( ICTXT, 1, 1, $ WORK( IPW2+(DIM1-1)*NWIN+DIM1 ), NWIN, $ RSRC1, CSRC1 ) END IF T((JLOC-1)*LLDT+ILOC) = $ WORK( IPW2+(DIM1-1)*NWIN+DIM1 ) END IF END IF * 325 CONTINUE * 320 CONTINUE * * For the crossborder updates, we use the same directions as * in the local reordering case above. * DO 2222 DIR = 1, 2 * * Broadcast information about the reordering. * DO 321 WINDOW = WINDOW0, WINE, 2 RSRC4 = IWORK(IRSRC+WINDOW-1) CSRC4 = IWORK(ICSRC+WINDOW-1) RSRC1 = MOD( RSRC4 - 1 + NPROW, NPROW ) CSRC1 = MOD( CSRC4 - 1 + NPCOL, NPCOL ) IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) $ CALL IGEBS2D( ICTXT, 'Row', TOP, 8, 1, $ IBUFF, 8 ) IF( NPROW.GT.1 .AND. DIR.EQ.2 ) $ CALL IGEBS2D( ICTXT, 'Col', TOP, 8, 1, $ IBUFF, 8 ) SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) ELSEIF( MYROW.EQ.RSRC1 .OR. MYCOL.EQ.CSRC1 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 .AND. $ MYROW.EQ.RSRC1 ) THEN CALL IGEBR2D( ICTXT, 'Row', TOP, 8, 1, $ IBUFF, 8, RSRC1, CSRC1 ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) BUFFLEN = ILEN + DLEN IPW3 = IPW2 + NWIN*NWIN DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 LIHIC = NWIN + I - 1 SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 .AND. $ MYCOL.EQ.CSRC1 ) THEN CALL IGEBR2D( ICTXT, 'Col', TOP, 8, 1, $ IBUFF, 8, RSRC1, CSRC1 ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) BUFFLEN = ILEN + DLEN IPW3 = IPW2 + NWIN*NWIN DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 LIHIC = NWIN + I - 1 SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) END IF END IF IF( RSRC1.NE.RSRC4 ) THEN IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) $ CALL IGEBS2D( ICTXT, 'Row', TOP, 8, 1, $ IBUFF, 8 ) SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) ELSEIF( MYROW.EQ.RSRC4 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) THEN CALL IGEBR2D( ICTXT, 'Row', TOP, 8, 1, $ IBUFF, 8, RSRC4, CSRC4 ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) BUFFLEN = ILEN + DLEN IPW3 = IPW2 + NWIN*NWIN DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 LIHIC = NWIN + I - 1 SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) END IF END IF END IF IF( CSRC1.NE.CSRC4 ) THEN IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN IF( NPROW.GT.1 .AND. DIR.EQ.2 ) $ CALL IGEBS2D( ICTXT, 'Col', TOP, 8, 1, $ IBUFF, 8 ) SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) ELSEIF( MYCOL.EQ.CSRC4 ) THEN IF( NPROW.GT.1 .AND. DIR.EQ.2 ) THEN CALL IGEBR2D( ICTXT, 'Col', TOP, 8, 1, $ IBUFF, 8, RSRC4, CSRC4 ) I = IBUFF( 1 ) NWIN = IBUFF( 2 ) PITRAF = IBUFF( 3 ) KS = IBUFF( 4 ) PDTRAF = IBUFF( 5 ) NDTRAF = IBUFF( 6 ) ILEN = IBUFF( 7 ) DLEN = IBUFF( 8 ) BUFFLEN = ILEN + DLEN IPW3 = IPW2 + NWIN*NWIN DIM1 = NB - MOD(I-1,NB) DIM4 = NWIN - DIM1 LIHIC = NWIN + I - 1 SKIP1CR = WINDOW.EQ.1 .AND. $ ICEIL(LIHIC,NB).LE.ICEIL(ILO,NB) END IF END IF END IF * * Skip rest of broadcasts and updates if appropriate. * IF( SKIP1CR ) GO TO 326 * * Broadcast the orthogonal transformations. * IF( MYROW.EQ.RSRC1 .AND. MYCOL.EQ.CSRC1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( (NPROW.GT.1 .AND. DIR.EQ.2) .OR. $ (NPCOL.GT.1 .AND. DIR.EQ.1) ) THEN DO 370 INDX = 1, ILEN WORK( BUFFER+INDX-1 ) = $ FLOAT( IWORK(IPIW+INDX-1) ) 370 CONTINUE CALL SLAMOV( 'All', DLEN, 1, WORK( IPW3 ), $ DLEN, WORK(BUFFER+ILEN), DLEN ) END IF IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) THEN CALL SGEBS2D( ICTXT, 'Row', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN ) END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 ) THEN CALL SGEBS2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN ) END IF ELSEIF( MYROW.EQ.RSRC1 .OR. MYCOL.EQ.CSRC1 ) THEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 .AND. $ MYROW.EQ.RSRC1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN CALL SGEBR2D( ICTXT, 'Row', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC1, CSRC1 ) END IF IF( NPROW.GT.1 .AND. DIR.EQ.2 .AND. $ MYCOL.EQ.CSRC1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN CALL SGEBR2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC1, CSRC1 ) END IF IF( (NPCOL.GT.1.AND.DIR.EQ.1.AND.MYROW.EQ.RSRC1) $ .OR. (NPROW.GT.1.AND.DIR.EQ.2.AND. $ MYCOL.EQ.CSRC1) ) THEN DO 380 INDX = 1, ILEN IWORK(IPIW+INDX-1) = $ INT( WORK( BUFFER+INDX-1 ) ) 380 CONTINUE CALL SLAMOV( 'All', DLEN, 1, $ WORK( BUFFER+ILEN ), DLEN, $ WORK( IPW3 ), DLEN ) END IF END IF IF( RSRC1.NE.RSRC4 ) THEN IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) THEN DO 390 INDX = 1, ILEN WORK( BUFFER+INDX-1 ) = $ FLOAT( IWORK(IPIW+INDX-1) ) 390 CONTINUE CALL SLAMOV( 'All', DLEN, 1, WORK( IPW3 ), $ DLEN, WORK(BUFFER+ILEN), DLEN ) CALL SGEBS2D( ICTXT, 'Row', TOP, BUFFLEN, $ 1, WORK(BUFFER), BUFFLEN ) END IF ELSEIF( MYROW.EQ.RSRC4 .AND. DIR.EQ.1 .AND. $ NPCOL.GT.1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN CALL SGEBR2D( ICTXT, 'Row', TOP, BUFFLEN, $ 1, WORK(BUFFER), BUFFLEN, RSRC4, CSRC4 ) DO 400 INDX = 1, ILEN IWORK(IPIW+INDX-1) = $ INT( WORK( BUFFER+INDX-1 ) ) 400 CONTINUE CALL SLAMOV( 'All', DLEN, 1, $ WORK( BUFFER+ILEN ), DLEN, $ WORK( IPW3 ), DLEN ) END IF END IF IF( CSRC1.NE.CSRC4 ) THEN IF( MYROW.EQ.RSRC4 .AND. MYCOL.EQ.CSRC4 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN IF( NPROW.GT.1 .AND. DIR.EQ.2 ) THEN DO 395 INDX = 1, ILEN WORK( BUFFER+INDX-1 ) = $ FLOAT( IWORK(IPIW+INDX-1) ) 395 CONTINUE CALL SLAMOV( 'All', DLEN, 1, WORK( IPW3 ), $ DLEN, WORK(BUFFER+ILEN), DLEN ) CALL SGEBS2D( ICTXT, 'Col', TOP, BUFFLEN, $ 1, WORK(BUFFER), BUFFLEN ) END IF ELSEIF( MYCOL.EQ.CSRC4 .AND. DIR.EQ.2 .AND. $ NPROW.GT.1 ) THEN BUFFER = PDTRAF BUFFLEN = DLEN + ILEN CALL SGEBR2D( ICTXT, 'Col', TOP, BUFFLEN, 1, $ WORK(BUFFER), BUFFLEN, RSRC4, CSRC4 ) DO 402 INDX = 1, ILEN IWORK(IPIW+INDX-1) = $ INT( WORK( BUFFER+INDX-1 ) ) 402 CONTINUE CALL SLAMOV( 'All', DLEN, 1, $ WORK( BUFFER+ILEN ), DLEN, $ WORK( IPW3 ), DLEN ) END IF END IF * 326 CONTINUE * 321 CONTINUE * * Compute crossborder updates. * DO 322 WINDOW = WINDOW0, WINE, 2 IF( WINDOW.EQ.1 .AND. SKIP1CR ) GO TO 327 RSRC4 = IWORK(IRSRC+WINDOW-1) CSRC4 = IWORK(ICSRC+WINDOW-1) RSRC1 = MOD( RSRC4 - 1 + NPROW, NPROW ) CSRC1 = MOD( CSRC4 - 1 + NPCOL, NPCOL ) * * Prepare workspaces for updates: * IPW3 holds now the orthogonal transformations * IPW4 holds the explicit orthogonal matrix, if formed * IPW5 holds the crossborder block column of T * IPW6 holds the crossborder block row of T * IPW7 holds the crossborder block column of Q * (if WANTQ=.TRUE.) * IPW8 points to the leftover workspace used as lhs in * matrix multiplications * IF( ((MYCOL.EQ.CSRC1.OR.MYCOL.EQ.CSRC4).AND.DIR.EQ.2) $ .OR. ((MYROW.EQ.RSRC1.OR.MYROW.EQ.RSRC4).AND. $ DIR.EQ.1)) THEN IPW4 = BUFFER IF( DIR.EQ.2 ) THEN IF( WANTQ ) THEN QROWS = NUMROC( N, NB, MYROW, DESCQ( RSRC_ ), $ NPROW ) ELSE QROWS = 0 END IF TROWS = NUMROC( I-1, NB, MYROW, DESCT( RSRC_ ), $ NPROW ) ELSE QROWS = 0 TROWS = 0 END IF IF( DIR.EQ.1 ) THEN TCOLS = NUMROC( N - (I+DIM1-1), NB, MYCOL, $ CSRC4, NPCOL ) IF( MYCOL.EQ.CSRC4 ) TCOLS = TCOLS - DIM4 ELSE TCOLS = 0 END IF IPW5 = IPW4 + NWIN*NWIN IPW6 = IPW5 + TROWS * NWIN IF( WANTQ ) THEN IPW7 = IPW6 + NWIN * TCOLS IPW8 = IPW7 + QROWS * NWIN ELSE IPW8 = IPW6 + NWIN * TCOLS END IF END IF * * Let each process row and column involved in the updates * exchange data in T and Q with their neighbours. * IF( DIR.EQ.2 ) THEN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) THEN DO 410 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, I, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC1, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', TROWS, DIM1, $ T((JLOC1-1)*LLDT+ILOC), LLDT, $ WORK(IPW5), TROWS ) IF( NPCOL.GT.1 ) THEN EAST = MOD( MYCOL + 1, NPCOL ) CALL SGESD2D( ICTXT, TROWS, DIM1, $ WORK(IPW5), TROWS, RSRC, $ EAST ) CALL SGERV2D( ICTXT, TROWS, DIM4, $ WORK(IPW5+TROWS*DIM1), TROWS, $ RSRC, EAST ) END IF END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, I+DIM1, $ DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC4, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', TROWS, DIM4, $ T((JLOC4-1)*LLDT+ILOC), LLDT, $ WORK(IPW5+TROWS*DIM1), TROWS ) IF( NPCOL.GT.1 ) THEN WEST = MOD( MYCOL-1+NPCOL, NPCOL ) CALL SGESD2D( ICTXT, TROWS, DIM4, $ WORK(IPW5+TROWS*DIM1), TROWS, $ RSRC, WEST ) CALL SGERV2D( ICTXT, TROWS, DIM1, $ WORK(IPW5), TROWS, RSRC, $ WEST ) END IF END IF END IF 410 CONTINUE END IF END IF * IF( DIR.EQ.1 ) THEN IF( MYROW.EQ.RSRC1 .OR. MYROW.EQ.RSRC4 ) THEN DO 420 INDX = 1, NPCOL IF( MYROW.EQ.RSRC1 ) THEN IF( INDX.EQ.1 ) THEN CALL INFOG2L( I, LIHIC+1, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC1, JLOC, $ RSRC1, CSRC ) ELSE CALL INFOG2L( I, $ (ICEIL(LIHIC,NB)+(INDX-2))*NB+1, $ DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC, RSRC1, CSRC ) END IF IF( MYCOL.EQ.CSRC ) THEN CALL SLAMOV( 'All', DIM1, TCOLS, $ T((JLOC-1)*LLDT+ILOC1), LLDT, $ WORK(IPW6), NWIN ) IF( NPROW.GT.1 ) THEN SOUTH = MOD( MYROW + 1, NPROW ) CALL SGESD2D( ICTXT, DIM1, TCOLS, $ WORK(IPW6), NWIN, SOUTH, $ CSRC ) CALL SGERV2D( ICTXT, DIM4, TCOLS, $ WORK(IPW6+DIM1), NWIN, SOUTH, $ CSRC ) END IF END IF END IF IF( MYROW.EQ.RSRC4 ) THEN IF( INDX.EQ.1 ) THEN CALL INFOG2L( I+DIM1, LIHIC+1, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, ILOC4, $ JLOC, RSRC4, CSRC ) ELSE CALL INFOG2L( I+DIM1, $ (ICEIL(LIHIC,NB)+(INDX-2))*NB+1, $ DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC, RSRC4, CSRC ) END IF IF( MYCOL.EQ.CSRC ) THEN CALL SLAMOV( 'All', DIM4, TCOLS, $ T((JLOC-1)*LLDT+ILOC4), LLDT, $ WORK(IPW6+DIM1), NWIN ) IF( NPROW.GT.1 ) THEN NORTH = MOD( MYROW-1+NPROW, NPROW ) CALL SGESD2D( ICTXT, DIM4, TCOLS, $ WORK(IPW6+DIM1), NWIN, NORTH, $ CSRC ) CALL SGERV2D( ICTXT, DIM1, TCOLS, $ WORK(IPW6), NWIN, NORTH, $ CSRC ) END IF END IF END IF 420 CONTINUE END IF END IF * IF( DIR.EQ.2 ) THEN IF( WANTQ ) THEN IF( MYCOL.EQ.CSRC1 .OR. MYCOL.EQ.CSRC4 ) THEN DO 430 INDX = 1, NPROW IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, I, DESCQ, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC1, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', QROWS, DIM1, $ Q((JLOC1-1)*LLDQ+ILOC), LLDQ, $ WORK(IPW7), QROWS ) IF( NPCOL.GT.1 ) THEN EAST = MOD( MYCOL + 1, NPCOL ) CALL SGESD2D( ICTXT, QROWS, DIM1, $ WORK(IPW7), QROWS, RSRC, $ EAST ) CALL SGERV2D( ICTXT, QROWS, DIM4, $ WORK(IPW7+QROWS*DIM1), $ QROWS, RSRC, EAST ) END IF END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( 1+(INDX-1)*NB, I+DIM1, $ DESCQ, NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC4, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', QROWS, DIM4, $ Q((JLOC4-1)*LLDQ+ILOC), LLDQ, $ WORK(IPW7+QROWS*DIM1), QROWS ) IF( NPCOL.GT.1 ) THEN WEST = MOD( MYCOL-1+NPCOL, $ NPCOL ) CALL SGESD2D( ICTXT, QROWS, DIM4, $ WORK(IPW7+QROWS*DIM1), $ QROWS, RSRC, WEST ) CALL SGERV2D( ICTXT, QROWS, DIM1, $ WORK(IPW7), QROWS, RSRC, $ WEST ) END IF END IF END IF 430 CONTINUE END IF END IF END IF * 327 CONTINUE * 322 CONTINUE * DO 323 WINDOW = WINDOW0, WINE, 2 RSRC4 = IWORK(IRSRC+WINDOW-1) CSRC4 = IWORK(ICSRC+WINDOW-1) RSRC1 = MOD( RSRC4 - 1 + NPROW, NPROW ) CSRC1 = MOD( CSRC4 - 1 + NPCOL, NPCOL ) FLOPS = 0 IF( ((MYCOL.EQ.CSRC1.OR.MYCOL.EQ.CSRC4).AND.DIR.EQ.2) $ .OR. ((MYROW.EQ.RSRC1.OR.MYROW.EQ.RSRC4).AND. $ DIR.EQ.1) ) THEN * * Skip this part of the updates if appropriate. * IF( WINDOW.EQ.1 .AND. SKIP1CR ) GO TO 328 * * Count number of operations to decide whether to use * matrix-matrix multiplications for updating * off-diagonal parts or not. * NITRAF = PITRAF - IPIW ISHH = .FALSE. DO 405 K = 1, NITRAF IF( IWORK( IPIW + K - 1 ).LE.NWIN ) THEN FLOPS = FLOPS + 6 ELSE FLOPS = FLOPS + 11 ISHH = .TRUE. END IF 405 CONTINUE * * Perform updates in parallel. * IF( FLOPS.NE.0 .AND. $ ( 2*FLOPS*100 )/( 2*NWIN*NWIN ) .GE. MMULT ) $ THEN * CALL SLASET( 'All', NWIN, NWIN, ZERO, ONE, $ WORK( IPW4 ), NWIN ) WORK(IPW8) = FLOAT(MYROW) WORK(IPW8+1) = FLOAT(MYCOL) CALL BSLAAPP( 1, NWIN, NWIN, NCB, WORK( IPW4 ), $ NWIN, NITRAF, IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) * * Test if sparsity structure of orthogonal matrix * can be exploited (see below). * IF( ISHH .OR. DIM1.NE.KS .OR. DIM4.NE.KS ) THEN * * Update the columns of T and Q affected by the * reordering. * IF( DIR.EQ.2 ) THEN DO 440 INDX = 1, MIN(I-1,1+(NPROW-1)*NB), $ NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SGEMM( 'No transpose', $ 'No transpose', TROWS, DIM1, $ NWIN, ONE, WORK( IPW5 ), $ TROWS, WORK( IPW4 ), NWIN, $ ZERO, WORK(IPW8), TROWS ) CALL SLAMOV( 'All', TROWS, DIM1, $ WORK(IPW8), TROWS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SGEMM( 'No transpose', $ 'No transpose', TROWS, DIM4, $ NWIN, ONE, WORK( IPW5 ), $ TROWS, $ WORK( IPW4+NWIN*DIM1 ), $ NWIN, ZERO, WORK(IPW8), $ TROWS ) CALL SLAMOV( 'All', TROWS, DIM4, $ WORK(IPW8), TROWS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF 440 CONTINUE * IF( WANTQ ) THEN DO 450 INDX = 1, MIN(N,1+(NPROW-1)*NB), $ NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCQ, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SGEMM( 'No transpose', $ 'No transpose', QROWS, $ DIM1, NWIN, ONE, $ WORK( IPW7 ), QROWS, $ WORK( IPW4 ), NWIN, $ ZERO, WORK(IPW8), $ QROWS ) CALL SLAMOV( 'All', QROWS, $ DIM1, WORK(IPW8), QROWS, $ Q((JLOC-1)*LLDQ+ILOC), $ LLDQ ) END IF END IF IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, $ DESCQ, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC, $ CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SGEMM( 'No transpose', $ 'No transpose', QROWS, $ DIM4, NWIN, ONE, $ WORK( IPW7 ), QROWS, $ WORK( IPW4+NWIN*DIM1 ), $ NWIN, ZERO, WORK(IPW8), $ QROWS ) CALL SLAMOV( 'All', QROWS, $ DIM4, WORK(IPW8), QROWS, $ Q((JLOC-1)*LLDQ+ILOC), $ LLDQ ) END IF END IF 450 CONTINUE END IF END IF * * Update the rows of T affected by the * reordering. * IF( DIR.EQ.1 ) THEN IF ( LIHIC.LT.N ) THEN IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC4 $ .AND.MOD(LIHIC,NB).NE.0 ) THEN INDX = LIHIC + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC4 ) CALL SGEMM( 'Transpose', $ 'No Transpose', DIM1, TCOLS, $ NWIN, ONE, WORK(IPW4), NWIN, $ WORK( IPW6 ), NWIN, ZERO, $ WORK(IPW8), DIM1 ) CALL SLAMOV( 'All', DIM1, TCOLS, $ WORK(IPW8), DIM1, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF IF( MYROW.EQ.RSRC4.AND.MYCOL.EQ.CSRC4 $ .AND.MOD(LIHIC,NB).NE.0 ) THEN INDX = LIHIC + 1 CALL INFOG2L( I+DIM1, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC4, CSRC4 ) CALL SGEMM( 'Transpose', $ 'No Transpose', DIM4, TCOLS, $ NWIN, ONE, $ WORK( IPW4+DIM1*NWIN ), NWIN, $ WORK( IPW6), NWIN, ZERO, $ WORK(IPW8), DIM4 ) CALL SLAMOV( 'All', DIM4, TCOLS, $ WORK(IPW8), DIM4, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF INDXS = ICEIL(LIHIC,NB)*NB + 1 INDXE = MIN(N,INDXS+(NPCOL-2)*NB) DO 460 INDX = INDXS, INDXE, NB IF( MYROW.EQ.RSRC1 ) THEN CALL INFOG2L( I, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC1, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL SGEMM( 'Transpose', $ 'No Transpose', DIM1, $ TCOLS, NWIN, ONE, $ WORK( IPW4 ), NWIN, $ WORK( IPW6 ), NWIN, $ ZERO, WORK(IPW8), DIM1 ) CALL SLAMOV( 'All', DIM1, $ TCOLS, WORK(IPW8), DIM1, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF IF( MYROW.EQ.RSRC4 ) THEN CALL INFOG2L( I+DIM1, INDX, $ DESCT, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC4, $ CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL SGEMM( 'Transpose', $ 'No Transpose', DIM4, $ TCOLS, NWIN, ONE, $ WORK( IPW4+NWIN*DIM1 ), $ NWIN, WORK( IPW6 ), $ NWIN, ZERO, WORK(IPW8), $ DIM4 ) CALL SLAMOV( 'All', DIM4, $ TCOLS, WORK(IPW8), DIM4, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF 460 CONTINUE END IF END IF ELSE * * The NWIN-by-NWIN matrix U containing the * accumulated orthogonal transformations has * the following structure: * * [ U11 U12 ] * U = [ ], * [ U21 U22 ] * * where U21 is KS-by-KS upper triangular and * U12 is (NWIN-KS)-by-(NWIN-KS) lower * triangular. For reordering over the border * the structure is only exploited when the * border cuts the columns of U conformally with * the structure itself. This happens exactly * when all eigenvalues in the subcluster was * moved to the other side of the border and * fits perfectly in their new positions, i.e., * the reordering stops when the last eigenvalue * to cross the border is reordered to the * position closest to the border. Tested by * checking is KS = DIM1 = DIM4 (see above). * This should hold quite often. But this branch * is entered only if all involved eigenvalues * are real. * * Update the columns of T and Q affected by the * reordering. * * Compute T2*U21 + T1*U11 on the left side of * the border. * IF( DIR.EQ.2 ) THEN INDXE = MIN(I-1,1+(NPROW-1)*NB) DO 470 INDX = 1, INDXE, NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', TROWS, KS, $ WORK( IPW5+TROWS*DIM4), $ TROWS, WORK(IPW8), TROWS ) CALL STRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', TROWS, KS, $ ONE, WORK( IPW4+DIM4 ), $ NWIN, WORK(IPW8), TROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', TROWS, KS, $ DIM4, ONE, WORK( IPW5 ), $ TROWS, WORK( IPW4 ), NWIN, $ ONE, WORK(IPW8), TROWS ) CALL SLAMOV( 'All', TROWS, KS, $ WORK(IPW8), TROWS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF * * Compute T1*U12 + T2*U22 on the right * side of the border. * IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC, CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', TROWS, DIM4, $ WORK(IPW5), TROWS, $ WORK( IPW8 ), TROWS ) CALL STRMM( 'Right', 'Lower', $ 'No transpose', $ 'Non-unit', TROWS, DIM4, $ ONE, WORK( IPW4+NWIN*KS ), $ NWIN, WORK( IPW8 ), TROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', TROWS, DIM4, $ KS, ONE, $ WORK( IPW5+TROWS*DIM4), $ TROWS, $ WORK( IPW4+NWIN*KS+DIM4 ), $ NWIN, ONE, WORK( IPW8 ), $ TROWS ) CALL SLAMOV( 'All', TROWS, DIM4, $ WORK(IPW8), TROWS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF 470 CONTINUE IF( WANTQ ) THEN * * Compute Q2*U21 + Q1*U11 on the left * side of border. * INDXE = MIN(N,1+(NPROW-1)*NB) DO 480 INDX = 1, INDXE, NB IF( MYCOL.EQ.CSRC1 ) THEN CALL INFOG2L( INDX, I, DESCQ, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC, CSRC1 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', QROWS, KS, $ WORK( IPW7+QROWS*DIM4), $ QROWS, WORK(IPW8), $ QROWS ) CALL STRMM( 'Right', 'Upper', $ 'No transpose', $ 'Non-unit', QROWS, $ KS, ONE, $ WORK( IPW4+DIM4 ), NWIN, $ WORK(IPW8), QROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', QROWS, $ KS, DIM4, ONE, $ WORK( IPW7 ), QROWS, $ WORK( IPW4 ), NWIN, ONE, $ WORK(IPW8), QROWS ) CALL SLAMOV( 'All', QROWS, KS, $ WORK(IPW8), QROWS, $ Q((JLOC-1)*LLDQ+ILOC), $ LLDQ ) END IF END IF * * Compute Q1*U12 + Q2*U22 on the right * side of border. * IF( MYCOL.EQ.CSRC4 ) THEN CALL INFOG2L( INDX, I+DIM1, $ DESCQ, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC, $ CSRC4 ) IF( MYROW.EQ.RSRC ) THEN CALL SLAMOV( 'All', QROWS, $ DIM4, WORK(IPW7), QROWS, $ WORK( IPW8 ), QROWS ) CALL STRMM( 'Right', 'Lower', $ 'No transpose', $ 'Non-unit', QROWS, $ DIM4, ONE, $ WORK( IPW4+NWIN*KS ), $ NWIN, WORK( IPW8 ), $ QROWS ) CALL SGEMM( 'No transpose', $ 'No transpose', QROWS, $ DIM4, KS, ONE, $ WORK(IPW7+QROWS*(DIM4)), $ QROWS, $ WORK(IPW4+NWIN*KS+DIM4), $ NWIN, ONE, WORK( IPW8 ), $ QROWS ) CALL SLAMOV( 'All', QROWS, $ DIM4, WORK(IPW8), QROWS, $ Q((JLOC-1)*LLDQ+ILOC), $ LLDQ ) END IF END IF 480 CONTINUE END IF END IF * IF( DIR.EQ.1 ) THEN IF ( LIHIC.LT.N ) THEN * * Compute U21**T*T2 + U11**T*T1 on the * upper side of the border. * IF( MYROW.EQ.RSRC1.AND.MYCOL.EQ.CSRC4 $ .AND.MOD(LIHIC,NB).NE.0 ) THEN INDX = LIHIC + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC1, CSRC4 ) CALL SLAMOV( 'All', KS, TCOLS, $ WORK( IPW6+DIM4 ), NWIN, $ WORK(IPW8), KS ) CALL STRMM( 'Left', 'Upper', $ 'Transpose', 'Non-unit', $ KS, TCOLS, ONE, $ WORK( IPW4+DIM4 ), NWIN, $ WORK(IPW8), KS ) CALL SGEMM( 'Transpose', $ 'No transpose', KS, TCOLS, $ DIM4, ONE, WORK(IPW4), NWIN, $ WORK(IPW6), NWIN, ONE, $ WORK(IPW8), KS ) CALL SLAMOV( 'All', KS, TCOLS, $ WORK(IPW8), KS, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF * * Compute U12**T*T1 + U22**T*T2 on the * lower side of the border. * IF( MYROW.EQ.RSRC4.AND.MYCOL.EQ.CSRC4 $ .AND.MOD(LIHIC,NB).NE.0 ) THEN INDX = LIHIC + 1 CALL INFOG2L( I+DIM1, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC4, CSRC4 ) CALL SLAMOV( 'All', DIM4, TCOLS, $ WORK( IPW6 ), NWIN, $ WORK( IPW8 ), DIM4 ) CALL STRMM( 'Left', 'Lower', $ 'Transpose', 'Non-unit', $ DIM4, TCOLS, ONE, $ WORK( IPW4+NWIN*KS ), NWIN, $ WORK( IPW8 ), DIM4 ) CALL SGEMM( 'Transpose', $ 'No Transpose', DIM4, TCOLS, $ KS, ONE, $ WORK( IPW4+NWIN*KS+DIM4 ), $ NWIN, WORK( IPW6+DIM1 ), NWIN, $ ONE, WORK( IPW8), DIM4 ) CALL SLAMOV( 'All', DIM4, TCOLS, $ WORK(IPW8), DIM4, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF * * Compute U21**T*T2 + U11**T*T1 on upper * side on border. * INDXS = ICEIL(LIHIC,NB)*NB+1 INDXE = MIN(N,INDXS+(NPCOL-2)*NB) DO 490 INDX = INDXS, INDXE, NB IF( MYROW.EQ.RSRC1 ) THEN CALL INFOG2L( I, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, $ ILOC, JLOC, RSRC1, CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL SLAMOV( 'All', KS, TCOLS, $ WORK( IPW6+DIM4 ), NWIN, $ WORK(IPW8), KS ) CALL STRMM( 'Left', 'Upper', $ 'Transpose', $ 'Non-unit', KS, $ TCOLS, ONE, $ WORK( IPW4+DIM4 ), NWIN, $ WORK(IPW8), KS ) CALL SGEMM( 'Transpose', $ 'No transpose', KS, $ TCOLS, DIM4, ONE, $ WORK(IPW4), NWIN, $ WORK(IPW6), NWIN, ONE, $ WORK(IPW8), KS ) CALL SLAMOV( 'All', KS, TCOLS, $ WORK(IPW8), KS, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF * * Compute U12**T*T1 + U22**T*T2 on * lower side of border. * IF( MYROW.EQ.RSRC4 ) THEN CALL INFOG2L( I+DIM1, INDX, $ DESCT, NPROW, NPCOL, MYROW, $ MYCOL, ILOC, JLOC, RSRC4, $ CSRC ) IF( MYCOL.EQ.CSRC ) THEN CALL SLAMOV( 'All', DIM4, $ TCOLS, WORK( IPW6 ), $ NWIN, WORK( IPW8 ), $ DIM4 ) CALL STRMM( 'Left', 'Lower', $ 'Transpose', $ 'Non-unit', DIM4, $ TCOLS, ONE, $ WORK( IPW4+NWIN*KS ), $ NWIN, WORK( IPW8 ), $ DIM4 ) CALL SGEMM( 'Transpose', $ 'No Transpose', DIM4, $ TCOLS, KS, ONE, $ WORK(IPW4+NWIN*KS+DIM4), $ NWIN, WORK( IPW6+DIM1 ), $ NWIN, ONE, WORK( IPW8), $ DIM4 ) CALL SLAMOV( 'All', DIM4, $ TCOLS, WORK(IPW8), DIM4, $ T((JLOC-1)*LLDT+ILOC), $ LLDT ) END IF END IF 490 CONTINUE END IF END IF END IF ELSEIF( FLOPS.NE.0 ) THEN * * Update off-diagonal blocks and Q using the * pipelined elementary transformations. Now we * have a delicate problem: how to do this without * redundant work? For now, we let the processes * involved compute the whole crossborder block * rows and column saving only the part belonging * to the corresponding side of the border. To make * this a realistic alternative, we have modified * the ratio r_flops (see Reference [2] above) to * give more favor to the ordinary matrix * multiplication. * IF( DIR.EQ.2 ) THEN INDXE = MIN(I-1,1+(NPROW-1)*NB) DO 500 INDX = 1, INDXE, NB CALL INFOG2L( INDX, I, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN CALL BSLAAPP( 1, TROWS, NWIN, NCB, $ WORK(IPW5), TROWS, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL SLAMOV( 'All', TROWS, DIM1, $ WORK(IPW5), TROWS, $ T((JLOC-1)*LLDT+ILOC ), LLDT ) END IF CALL INFOG2L( INDX, I+DIM1, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN IF( NPCOL.GT.1 ) $ CALL BSLAAPP( 1, TROWS, NWIN, NCB, $ WORK(IPW5), TROWS, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL SLAMOV( 'All', TROWS, DIM4, $ WORK(IPW5+TROWS*DIM1), TROWS, $ T((JLOC-1)*LLDT+ILOC ), LLDT ) END IF 500 CONTINUE IF( WANTQ ) THEN INDXE = MIN(N,1+(NPROW-1)*NB) DO 510 INDX = 1, INDXE, NB CALL INFOG2L( INDX, I, DESCQ, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN CALL BSLAAPP( 1, QROWS, NWIN, NCB, $ WORK(IPW7), QROWS, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL SLAMOV( 'All', QROWS, DIM1, $ WORK(IPW7), QROWS, $ Q((JLOC-1)*LLDQ+ILOC ), LLDQ ) END IF CALL INFOG2L( INDX, I+DIM1, DESCQ, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN IF( NPCOL.GT.1 ) $ CALL BSLAAPP( 1, QROWS, NWIN, $ NCB, WORK(IPW7), QROWS, $ NITRAF, IWORK(IPIW), $ WORK( IPW3 ), WORK(IPW8) ) CALL SLAMOV( 'All', QROWS, DIM4, $ WORK(IPW7+QROWS*DIM1), QROWS, $ Q((JLOC-1)*LLDQ+ILOC ), LLDQ ) END IF 510 CONTINUE END IF END IF * IF( DIR.EQ.1 ) THEN IF( LIHIC.LT.N ) THEN INDX = LIHIC + 1 CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC.AND. $ MOD(LIHIC,NB).NE.0 ) THEN CALL BSLAAPP( 0, NWIN, TCOLS, NCB, $ WORK( IPW6 ), NWIN, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL SLAMOV( 'All', DIM1, TCOLS, $ WORK( IPW6 ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF CALL INFOG2L( I+DIM1, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC.AND. $ MOD(LIHIC,NB).NE.0 ) THEN IF( NPROW.GT.1 ) $ CALL BSLAAPP( 0, NWIN, TCOLS, NCB, $ WORK( IPW6 ), NWIN, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL SLAMOV( 'All', DIM4, TCOLS, $ WORK( IPW6+DIM1 ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF INDXS = ICEIL(LIHIC,NB)*NB + 1 INDXE = MIN(N,INDXS+(NPCOL-2)*NB) DO 520 INDX = INDXS, INDXE, NB CALL INFOG2L( I, INDX, DESCT, NPROW, $ NPCOL, MYROW, MYCOL, ILOC, JLOC, $ RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN CALL BSLAAPP( 0, NWIN, TCOLS, NCB, $ WORK(IPW6), NWIN, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL SLAMOV( 'All', DIM1, TCOLS, $ WORK( IPW6 ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF CALL INFOG2L( I+DIM1, INDX, DESCT, $ NPROW, NPCOL, MYROW, MYCOL, ILOC, $ JLOC, RSRC, CSRC ) IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) $ THEN IF( NPROW.GT.1 ) $ CALL BSLAAPP( 0, NWIN, TCOLS, $ NCB, WORK(IPW6), NWIN, NITRAF, $ IWORK(IPIW), WORK( IPW3 ), $ WORK(IPW8) ) CALL SLAMOV( 'All', DIM4, TCOLS, $ WORK( IPW6+DIM1 ), NWIN, $ T((JLOC-1)*LLDT+ILOC), LLDT ) END IF 520 CONTINUE END IF END IF END IF END IF * 328 CONTINUE * 323 CONTINUE * * End of loops over directions (DIR). * 2222 CONTINUE * * End of loops over diagonal blocks for reordering over the * block diagonal. * 310 CONTINUE LAST = LAST + 1 IF( LASTWAIT .AND. LAST.LT.2 ) GO TO 308 * * Barrier to collect the processes before proceeding. * CALL BLACS_BARRIER( ICTXT, 'All' ) * * Compute global maximum of IERR so that we know if some process * experienced a failure in the reordering. * MYIERR = IERR IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, IERR, 1, -1, $ -1, -1, -1, -1 ) * IF( IERR.NE.0 ) THEN * * When calling BDTREXC, the block at position I+KKS-1 failed * to swap. * IF( MYIERR.NE.0 ) INFO = MAX(1,I+KKS-1) IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, -1, $ -1, -1, -1, -1 ) GO TO 300 END IF * * Do a global update of the SELECT vector. * DO 530 K = 1, N RSRC = INDXG2P( K, NB, MYROW, DESCT( RSRC_ ), NPROW ) CSRC = INDXG2P( K, NB, MYCOL, DESCT( CSRC_ ), NPCOL ) IF( MYROW.NE.RSRC .OR. MYCOL.NE.CSRC ) $ SELECT( K ) = 0 530 CONTINUE IF( NPROCS.GT.1 ) $ CALL IGSUM2D( ICTXT, 'All', TOP, N, 1, SELECT, N, -1, -1 ) * * Find the global minumum of ILO and IHI. * ILO = ILO - 1 523 CONTINUE ILO = ILO + 1 IF( ILO.LE.N ) THEN IF( SELECT(ILO).NE.0 ) GO TO 523 END IF IHI = IHI + 1 527 CONTINUE IHI = IHI - 1 IF( IHI.GE.1 ) THEN IF( SELECT(IHI).EQ.0 ) GO TO 527 END IF * * End While ( ILO <= M ) GO TO 50 END IF * 300 CONTINUE * * In case an error occured, do an additional global update of * SELECT. * IF( INFO.NE.0 ) THEN DO 540 K = 1, N RSRC = INDXG2P( K, NB, MYROW, DESCT( RSRC_ ), NPROW ) CSRC = INDXG2P( K, NB, MYCOL, DESCT( CSRC_ ), NPCOL ) IF( MYROW.NE.RSRC .OR. MYCOL.NE.CSRC ) $ SELECT( K ) = 0 540 CONTINUE IF( NPROCS.GT.1 ) $ CALL IGSUM2D( ICTXT, 'All', TOP, N, 1, SELECT, N, -1, -1 ) END IF * 545 CONTINUE * * Store the output eigenvalues in WR and WI: first let all the * processes compute the eigenvalue inside their diagonal blocks in * parallel, except for the eigenvalue located next to a block * border. After that, compute all eigenvalues located next to the * block borders. Finally, do a global summation over WR and WI so * that all processors receive the result. Notice: real eigenvalues * extracted from a non-canonical 2-by-2 block are not stored in * any particular order. * DO 550 K = 1, N WR( K ) = ZERO WI( K ) = ZERO 550 CONTINUE * * Loop 560: extract eigenvalues from the blocks which are not laid * out across a border of the processor mesh, except for those 1x1 * blocks on the border. * PAIR = .FALSE. DO 560 K = 1, N IF( .NOT. PAIR ) THEN BORDER = ( K.NE.N .AND. MOD( K, NB ).EQ.0 ) .OR. % ( K.NE.1 .AND. MOD( K, NB ).EQ.1 ) IF( .NOT. BORDER ) THEN CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC1, TRSRC1, TCSRC1 ) IF( MYROW.EQ.TRSRC1 .AND. MYCOL.EQ.TCSRC1 ) THEN ELEM1 = T((JLOC1-1)*LLDT+ILOC1) IF( K.LT.N ) THEN ELEM3 = T((JLOC1-1)*LLDT+ILOC1+1) ELSE ELEM3 = ZERO END IF IF( ELEM3.NE.ZERO ) THEN ELEM2 = T((JLOC1)*LLDT+ILOC1) ELEM4 = T((JLOC1)*LLDT+ILOC1+1) CALL SLANV2( ELEM1, ELEM2, ELEM3, ELEM4, $ WR( K ), WI( K ), WR( K+1 ), WI( K+1 ), SN, $ CS ) PAIR = .TRUE. ELSE IF( K.GT.1 ) THEN TMP = T((JLOC1-2)*LLDT+ILOC1) IF( TMP.NE.ZERO ) THEN ELEM1 = T((JLOC1-2)*LLDT+ILOC1-1) ELEM2 = T((JLOC1-1)*LLDT+ILOC1-1) ELEM3 = T((JLOC1-2)*LLDT+ILOC1) ELEM4 = T((JLOC1-1)*LLDT+ILOC1) CALL SLANV2( ELEM1, ELEM2, ELEM3, ELEM4, $ WR( K-1 ), WI( K-1 ), WR( K ), $ WI( K ), SN, CS ) ELSE WR( K ) = ELEM1 END IF ELSE WR( K ) = ELEM1 END IF END IF END IF END IF ELSE PAIR = .FALSE. END IF 560 CONTINUE * * Loop 570: extract eigenvalues from the blocks which are laid * out across a border of the processor mesh. The processors are * numbered as below: * * 1 | 2 * --+-- * 3 | 4 * DO 570 K = NB, N-1, NB CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC1, JLOC1, TRSRC1, TCSRC1 ) CALL INFOG2L( K, K+1, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC2, JLOC2, TRSRC2, TCSRC2 ) CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC3, JLOC3, TRSRC3, TCSRC3 ) CALL INFOG2L( K+1, K+1, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ ILOC4, JLOC4, TRSRC4, TCSRC4 ) IF( MYROW.EQ.TRSRC2 .AND. MYCOL.EQ.TCSRC2 ) THEN ELEM2 = T((JLOC2-1)*LLDT+ILOC2) IF( TRSRC1.NE.TRSRC2 .OR. TCSRC1.NE.TCSRC2 ) $ CALL SGESD2D( ICTXT, 1, 1, ELEM2, 1, TRSRC1, TCSRC1 ) END IF IF( MYROW.EQ.TRSRC3 .AND. MYCOL.EQ.TCSRC3 ) THEN ELEM3 = T((JLOC3-1)*LLDT+ILOC3) IF( TRSRC1.NE.TRSRC3 .OR. TCSRC1.NE.TCSRC3 ) $ CALL SGESD2D( ICTXT, 1, 1, ELEM3, 1, TRSRC1, TCSRC1 ) END IF IF( MYROW.EQ.TRSRC4 .AND. MYCOL.EQ.TCSRC4 ) THEN WORK(1) = T((JLOC4-1)*LLDT+ILOC4) IF( K+1.LT.N ) THEN WORK(2) = T((JLOC4-1)*LLDT+ILOC4+1) ELSE WORK(2) = ZERO END IF IF( TRSRC1.NE.TRSRC4 .OR. TCSRC1.NE.TCSRC4 ) $ CALL SGESD2D( ICTXT, 2, 1, WORK, 2, TRSRC1, TCSRC1 ) END IF IF( MYROW.EQ.TRSRC1 .AND. MYCOL.EQ.TCSRC1 ) THEN ELEM1 = T((JLOC1-1)*LLDT+ILOC1) IF( TRSRC1.NE.TRSRC2 .OR. TCSRC1.NE.TCSRC2 ) $ CALL SGERV2D( ICTXT, 1, 1, ELEM2, 1, TRSRC2, TCSRC2 ) IF( TRSRC1.NE.TRSRC3 .OR. TCSRC1.NE.TCSRC3 ) $ CALL SGERV2D( ICTXT, 1, 1, ELEM3, 1, TRSRC3, TCSRC3 ) IF( TRSRC1.NE.TRSRC4 .OR. TCSRC1.NE.TCSRC4 ) $ CALL SGERV2D( ICTXT, 2, 1, WORK, 2, TRSRC4, TCSRC4 ) ELEM4 = WORK(1) ELEM5 = WORK(2) IF( ELEM5.EQ.ZERO ) THEN IF( WR( K ).EQ.ZERO .AND. WI( K ).EQ.ZERO ) THEN CALL SLANV2( ELEM1, ELEM2, ELEM3, ELEM4, WR( K ), $ WI( K ), WR( K+1 ), WI( K+1 ), SN, CS ) ELSEIF( WR( K+1 ).EQ.ZERO .AND. WI( K+1 ).EQ.ZERO ) THEN WR( K+1 ) = ELEM4 END IF ELSEIF( WR( K ).EQ.ZERO .AND. WI( K ).EQ.ZERO ) THEN WR( K ) = ELEM1 END IF END IF 570 CONTINUE * IF( NPROCS.GT.1 ) THEN CALL SGSUM2D( ICTXT, 'All', TOP, N, 1, WR, N, -1, -1 ) CALL SGSUM2D( ICTXT, 'All', TOP, N, 1, WI, N, -1, -1 ) END IF * * Store storage requirements in workspaces. * WORK( 1 ) = FLOAT(LWMIN) IWORK( 1 ) = LIWMIN * * Return to calling program. * RETURN * * End of PSTRORD * END * scalapack-2.0.2/SRC/pstrrfs.f000644 000766 000024 00000073520 10363532303 016217 0ustar00juliestaff000000 000000 SUBROUTINE PSTRRFS( UPLO, TRANS, DIAG, N, NRHS, A, IA, JA, DESCA, $ B, IB, JB, DESCB, X, IX, JX, DESCX, FERR, $ BERR, WORK, LWORK, IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, TRANS, UPLO INTEGER INFO, IA, IB, IX, JA, JB, JX, LIWORK, LWORK, $ N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ), IWORK( * ) REAL A( * ), B( * ), BERR( * ), FERR( * ), $ WORK( * ), X( * ) * .. * * Purpose * ======= * * PSTRRFS provides error bounds and backward error estimates for the * solution to a system of linear equations with a triangular * coefficient matrix. * * The solution matrix X must be computed by PSTRTRS or some other * means before entering this routine. PSTRRFS does not do iterative * refinement because doing so cannot improve the backward error. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations. * = 'N': sub( A ) * sub( X ) = sub( B ) (No transpose) * = 'T': sub( A )**T * sub( X ) = sub( B ) (Transpose) * = 'C': sub( A )**T * sub( X ) = sub( B ) * (Conjugate transpose = Transpose) * * * DIAG (global input) CHARACTER*1 * = 'N': sub( A ) is non-unit triangular; * = 'U': sub( A ) is unit triangular. * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) REAL pointer into the local memory * to an array of local dimension (LLD_A,LOCc(JA+N-1) ). This * array contains the local pieces of the original triangular * distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. * If DIAG = 'U', the diagonal elements of sub( A ) are also * not referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) REAL pointer into the local memory * to an array of local dimension (LLD_B, LOCc(JB+NRHS-1) ). * On entry, this array contains the the local pieces of the * right hand sides sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input) REAL pointer into the local memory * to an array of local dimension (LLD_X, LOCc(JX+NRHS-1) ). * On entry, this array contains the the local pieces of the * solution vectors sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). The estimated forward error bounds for * each solution vector of sub( X ). If XTRUE is the true * solution, FERR bounds the magnitude of the largest entry * in (sub( X ) - XTRUE) divided by the magnitude of the * largest entry in sub( X ). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) REAL array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*LOCr( N + MOD( IA-1, MB_A ) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK. * LIWORK is local input and must be at least * LIWORK >= LOCr( N + MOD( IB-1, MB_B ) ). * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices sub( X ) and sub( B ) should be * distributed the same way on the same processes. These conditions * ensure that sub( X ) and sub( B ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER CHARACTER TRANST INTEGER IAROW, IXBCOL, IXBROW, IXCOL, IXROW, ICOFFA, $ ICOFFB, ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, $ IIW, IOFFXB, IPB, IPR, IPV, IROFFA, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LIWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ REAL EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC REAL PSLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PSLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG2L, $ PCHK1MAT, PCHK2MAT, PSATRMV, PSAXPY, $ PSCOPY, PSLACON, PSTRSV, PSTRMV, $ PXERBLA, SGAMX2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, ICHAR, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 900+CTXT_ ) ELSE CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IX, JX, DESCX, 17, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) NOUNIT = LSAME( DIAG, 'N' ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 3*NPMOD WORK( 1 ) = REAL( LWMIN ) LIWMIN = NPMOD IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 ELSE IF( NRHS.LT.0 ) THEN INFO = -5 ELSE IF( IROFFA.NE.0 ) THEN INFO = -7 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -8 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 900+NB_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -11 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1300+MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -15 ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -16 ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 1700+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 1700+NB_ ) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 1700+CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -21 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 2 ) = ICHAR( 'T' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'U' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 21 IF( LIWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 23 CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, 0, IDUM1, IDUM2, $ INFO ) CALL PCHK2MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, N, 4, NRHS, 5, $ IX, JX, DESCX, 17, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSTRRFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * IF( NOTRAN ) THEN TRANST = 'T' ELSE TRANST = 'N' END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = IPB + NP0 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = maximum number of nonzero entries in each row of A, plus 1 * NZ = N + 1 EPS = PSLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PSLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) )*DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 90 K = 0, JBRHS - 1 * * Compute residual R = B - op(A) * X, * where op(A) = A or A', depending on TRANS. * CALL PSCOPY( N, X, IX, JX+K, DESCX, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PSTRMV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) CALL PSAXPY( N, -ONE, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, $ JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the matrix * or vector Z. If the i-th component of the denominator is less * than SAFE2, then SAFE1 is added to the i-th components of the * numerator and denominator before dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 20 II = IIXB, IIXB + NP - 1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 20 CONTINUE END IF END IF * CALL PSATRMV( UPLO, TRANS, DIAG, N, ONE, A, IA, JA, DESCA, X, $ IX, JX+K, DESCX, 1, ONE, WORK( IPB ), IW, JW, $ DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIW - 1, IIW + NP - 2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 30 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Bound error from formula * * norm(X - XTRUE) / norm(X) .le. FERR = * norm( abs(inv(op(A)))* * ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(A)) is the inverse of op(A) * abs(Z) is the componentwise absolute value of the matrix or * vector Z * NZ is the maximum number of nonzeros in any row of A, plus 1 * EPS is machine epsilon * * The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) * is incremented by SAFE1 if the i-th component of * abs(op(A))*abs(X) + abs(B) is less than SAFE2. * * Use PSLACON to estimate the infinity-norm of the matrix * inv(op(A)) * diag(W), * where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW - 1, IIW + NP - 2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 40 CONTINUE END IF END IF * KASE = 0 50 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PSLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(A)'). * CALL PSTRSV( UPLO, TRANST, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 60 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 60 CONTINUE END IF END IF ELSE * * Multiply by inv(op(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = WORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF CALL PSTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) END IF GO TO 50 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIXB, IIXB + NP - 1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 80 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 90 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 180 J = JN + 1, JB + NRHS - 1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 170 K = 0, JBRHS - 1 * * Compute residual R = B - op(A) * X, * where op(A) = A or A', depending on TRANS. * CALL PSCOPY( N, X, IX, J+K, DESCX, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PSTRMV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) CALL PSAXPY( N, -ONE, B, IB, J+K, DESCB, 1, WORK( IPR ), $ IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 100 II = IIXB, IIXB + NP - 1 WORK( IIW+II-IIXB ) = ABS( B( II+IOFFXB ) ) 100 CONTINUE END IF END IF * CALL PSATRMV( UPLO, TRANS, DIAG, N, ONE, A, IA, JA, DESCA, $ X, IX, J+K, DESCX, 1, ONE, WORK( IPB ), IW, $ JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 110 II = IIW - 1, IIW + NP - 2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, ABS( WORK( IPR+II ) ) / $ WORK( IPB+II ) ) ELSE S = MAX( S, ( ABS( WORK( IPR+II ) )+SAFE1 ) / $ ( WORK( IPB+II )+SAFE1 ) ) END IF 110 CONTINUE END IF END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Bound error from formula * * norm(X - XTRUE) / norm(X) .le. FERR = * norm( abs(inv(op(A)))* * ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) )))/norm(X) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(A)) is the inverse of op(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of A, * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) * is incremented by SAFE1 if the i-th component of * abs(op(A))*abs(X) + abs(B) is less than SAFE2. * * Use PSLACON to estimate the infinity-norm of the matrix * inv(op(A)) * diag(W), * where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIW - 1, IIW + NP - 2 IF( WORK( IPB+II ).GT.SAFE2 ) THEN WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) ELSE WORK( IPB+II ) = ABS( WORK( IPR+II ) ) + $ NZ*EPS*WORK( IPB+II ) + SAFE1 END IF 120 CONTINUE END IF END IF * KASE = 0 130 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL SGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL SGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PSLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, IWORK, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(A)'). * CALL PSTRSV( UPLO, TRANST, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 140 CONTINUE END IF END IF ELSE * * Multiply by inv(op(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 150 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = WORK( IPB+II )* $ WORK( IPR+II ) 150 CONTINUE END IF END IF CALL PSTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) END IF GO TO 130 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIXB, IIXB + NP - 1 LSTRES = MAX( LSTRES, ABS( X( IOFFXB+II ) ) ) 160 CONTINUE END IF CALL SGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, $ IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 170 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * WORK( 1 ) = REAL( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PSTRRFS * END scalapack-2.0.2/SRC/pstrsen.f000644 000766 000024 00000065232 11705457544 016231 0ustar00juliestaff000000 000000 SUBROUTINE PSTRSEN( JOB, COMPQ, SELECT, PARA, N, T, IT, JT, $ DESCT, Q, IQ, JQ, DESCQ, WR, WI, M, S, SEP, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK computational routine (version 2.0.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * Univ. of Colorado Denver and University of California, Berkeley. * January, 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER COMPQ, JOB INTEGER INFO, LIWORK, LWORK, M, N, $ IT, JT, IQ, JQ REAL S, SEP * .. * .. Array Arguments .. LOGICAL SELECT( N ) INTEGER PARA( 6 ), DESCT( * ), DESCQ( * ), IWORK( * ) REAL Q( * ), T( * ), WI( * ), WORK( * ), WR( * ) * .. * * Purpose * ======= * * PSTRSEN reorders the real Schur factorization of a real matrix * A = Q*T*Q**T, so that a selected cluster of eigenvalues appears * in the leading diagonal blocks of the upper quasi-triangular matrix * T, and the leading columns of Q form an orthonormal basis of the * corresponding right invariant subspace. The reordering is performed * by PSTRORD. * * Optionally the routine computes the reciprocal condition numbers of * the cluster of eigenvalues and/or the invariant subspace. SCASY * library is needed for condition estimation. * * T must be in Schur form (as returned by PSLAHQR), that is, block * upper triangular with 1-by-1 and 2-by-2 diagonal blocks. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * JOB (global input) CHARACTER*1 * Specifies whether condition numbers are required for the * cluster of eigenvalues (S) or the invariant subspace (SEP): * = 'N': none; * = 'E': for eigenvalues only (S); * = 'V': for invariant subspace only (SEP); * = 'B': for both eigenvalues and invariant subspace (S and * SEP). * * COMPQ (global input) CHARACTER*1 * = 'V': update the matrix Q of Schur vectors; * = 'N': do not update Q. * * SELECT (global input) LOGICAL array, dimension (N) * SELECT specifies the eigenvalues in the selected cluster. To * select a real eigenvalue w(j), SELECT(j) must be set to * .TRUE.. To select a complex conjugate pair of eigenvalues * w(j) and w(j+1), corresponding to a 2-by-2 diagonal block, * either SELECT(j) or SELECT(j+1) or both must be set to * .TRUE.; a complex conjugate pair of eigenvalues must be * either both included in the cluster or both excluded. * * PARA (global input) INTEGER*6 * Block parameters (some should be replaced by calls to * PILAENV and others by meaningful default values): * PARA(1) = maximum number of concurrent computational windows * allowed in the algorithm; * 0 < PARA(1) <= min(NPROW,NPCOL) must hold; * PARA(2) = number of eigenvalues in each window; * 0 < PARA(2) < PARA(3) must hold; * PARA(3) = window size; PARA(2) < PARA(3) < DESCT(MB_) * must hold; * PARA(4) = minimal percentage of flops required for * performing matrix-matrix multiplications instead * of pipelined orthogonal transformations; * 0 <= PARA(4) <= 100 must hold; * PARA(5) = width of block column slabs for row-wise * application of pipelined orthogonal * transformations in their factorized form; * 0 < PARA(5) <= DESCT(MB_) must hold. * PARA(6) = the maximum number of eigenvalues moved together * over a process border; in practice, this will be * approximately half of the cross border window size * 0 < PARA(6) <= PARA(2) must hold; * * N (global input) INTEGER * The order of the globally distributed matrix T. N >= 0. * * T (local input/output) REAL array, * dimension (LLD_T,LOCc(N)). * On entry, the local pieces of the global distributed * upper quasi-triangular matrix T, in Schur form. On exit, T is * overwritten by the local pieces of the reordered matrix T, * again in Schur form, with the selected eigenvalues in the * globally leading diagonal blocks. * * IT (global input) INTEGER * JT (global input) INTEGER * The row and column index in the global array T indicating the * first column of sub( T ). IT = JT = 1 must hold. * * DESCT (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the global distributed matrix T. * * Q (local input/output) REAL array, * dimension (LLD_Q,LOCc(N)). * On entry, if COMPQ = 'V', the local pieces of the global * distributed matrix Q of Schur vectors. * On exit, if COMPQ = 'V', Q has been postmultiplied by the * global orthogonal transformation matrix which reorders T; the * leading M columns of Q form an orthonormal basis for the * specified invariant subspace. * If COMPQ = 'N', Q is not referenced. * * IQ (global input) INTEGER * JQ (global input) INTEGER * The column index in the global array Q indicating the * first column of sub( Q ). IQ = JQ = 1 must hold. * * DESCQ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the global distributed matrix Q. * * WR (global output) REAL array, dimension (N) * WI (global output) REAL array, dimension (N) * The real and imaginary parts, respectively, of the reordered * eigenvalues of T. The eigenvalues are in principle stored in * the same order as on the diagonal of T, with WR(i) = T(i,i) * and, if T(i:i+1,i:i+1) is a 2-by-2 diagonal block, WI(i) > 0 * and WI(i+1) = -WI(i). * Note also that if a complex eigenvalue is sufficiently * ill-conditioned, then its value may differ significantly * from its value before reordering. * * M (global output) INTEGER * The dimension of the specified invariant subspace. * 0 <= M <= N. * * S (global output) REAL * If JOB = 'E' or 'B', S is a lower bound on the reciprocal * condition number for the selected cluster of eigenvalues. * S cannot underestimate the true reciprocal condition number * by more than a factor of sqrt(N). If M = 0 or N, S = 1. * If JOB = 'N' or 'V', S is not referenced. * * SEP (global output) REAL * If JOB = 'V' or 'B', SEP is the estimated reciprocal * condition number of the specified invariant subspace. If * M = 0 or N, SEP = norm(T). * If JOB = 'N' or 'E', SEP is not referenced. * * WORK (local workspace/output) REAL array, * dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued by PXERBLA. * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * * LIWORK (local input) INTEGER * The dimension of the array IWORK. * * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value. * If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*1000+j), if the i-th * argument is a scalar and had an illegal value, then INFO = -i. * > 0: here we have several possibilites * *) Reordering of T failed because some eigenvalues are too * close to separate (the problem is very ill-conditioned); * T may have been partially reordered, and WR and WI * contain the eigenvalues in the same order as in T. * On exit, INFO = {the index of T where the swap failed}. * *) A 2-by-2 block to be reordered split into two 1-by-1 * blocks and the second block failed to swap with an * adjacent block. * On exit, INFO = {the index of T where the swap failed}. * *) If INFO = N+1, there is no valid BLACS context (see the * BLACS documentation for details). * *) If INFO = N+2, the routines used in the calculation of * the condition numbers raised a positive warning flag * (see the documentation for PGESYCTD and PSYCTCON of the * SCASY library). * *) If INFO = N+3, PGESYCTD raised an input error flag; * please report this bug to the authors (see below). * If INFO = N+4, PSYCTCON raised an input error flag; * please report this bug to the authors (see below). * In a future release this subroutine may distinguish between * the case 1 and 2 above. * * Method * ====== * * This routine performs parallel eigenvalue reordering in real Schur * form by parallelizing the approach proposed in [3]. The condition * number estimation part is performed by using techniques and code * from SCASY, see http://www.cs.umu.se/research/parallel/scasy. * * Additional requirements * ======================= * * The following alignment requirements must hold: * (a) DESCT( MB_ ) = DESCT( NB_ ) = DESCQ( MB_ ) = DESCQ( NB_ ) * (b) DESCT( RSRC_ ) = DESCQ( RSRC_ ) * (c) DESCT( CSRC_ ) = DESCQ( CSRC_ ) * * All matrices must be blocked by a block factor larger than or * equal to two (3). This to simplify reordering across processor * borders in the presence of 2-by-2 blocks. * * Limitations * =========== * * This algorithm cannot work on submatrices of T and Q, i.e., * IT = JT = IQ = JQ = 1 must hold. This is however no limitation * since PSLAHQR does not compute Schur forms of submatrices anyway. * * References * ========== * * [1] Z. Bai and J. W. Demmel; On swapping diagonal blocks in real * Schur form, Linear Algebra Appl., 186:73--95, 1993. Also as * LAPACK Working Note 54. * * [2] Z. Bai, J. W. Demmel, and A. McKenney; On computing condition * numbers for the nonsymmetric eigenvalue problem, ACM Trans. * Math. Software, 19(2):202--223, 1993. Also as LAPACK Working * Note 13. * * [3] D. Kressner; Block algorithms for reordering standard and * generalized Schur forms, ACM TOMS, 32(4):521-532, 2006. * Also LAPACK Working Note 171. * * [4] R. Granat, B. Kagstrom, and D. Kressner; Parallel eigenvalue * reordering in real Schur form, Concurrency and Computations: * Practice and Experience, 21(9):1225-1250, 2009. Also as * LAPACK Working Note 192. * * Parallel execution recommendations * ================================== * * Use a square grid, if possible, for maximum performance. The block * parameters in PARA should be kept well below the data distribution * block size. In particular, see [3,4] for recommended settings for * these parameters. * * In general, the parallel algorithm strives to perform as much work * as possible without crossing the block borders on the main block * diagonal. * * Contributors * ============ * * Implemented by Robert Granat, Dept. of Computing Science and HPC2N, * Umea University, Sweden, March 2007, * in collaboration with Bo Kagstrom and Daniel Kressner. * Modified by Meiyue Shao, October 2011. * * Revisions * ========= * * Please send bug-reports to granat@cs.umu.se * * Keywords * ======== * * Real Schur form, eigenvalue reordering, Sylvester matrix equation * * ===================================================================== * .. * .. Parameters .. CHARACTER TOP INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ REAL ZERO, ONE PARAMETER ( TOP = '1-Tree', $ BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9, $ ZERO = 0.0, ONE = 1.0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, WANTBH, WANTQ, WANTS, WANTSP INTEGER ICOFFT12, ICTXT, IDUM1, IDUM2, IERR, ILOC1, $ IPW1, ITER, ITT, JLOC1, JTT, K, LIWMIN, LLDT, $ LLDQ, LWMIN, MMAX, MMIN, MYROW, MYCOL, N1, N2, $ NB, NOEXSY, NPCOL, NPROCS, NPROW, SPACE, $ T12ROWS, T12COLS, TCOLS, TCSRC, TROWS, TRSRC, $ WRK1, IWRK1, WRK2, IWRK2, WRK3, IWRK3 REAL DPDUM1, ELEM, EST, SCALE, RNORM * .. Local Arrays .. INTEGER DESCT12( DLEN_ ), MBNB2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC REAL PSLANGE EXTERNAL LSAME, NUMROC, PSLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCINIT, $ IGAMX2D, INFOG2L, PSLACPY, PSTRORD, PXERBLA, $ PCHK1MAT, PCHK2MAT * $ , PGESYCTD, PSYCTCON * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCT( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW*NPCOL * * Test if grid is O.K., i.e., the context is valid * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = N+1 END IF * * Check if workspace * LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1 * * Test dimensions for local sanity * IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, INFO ) END IF * * Check the blocking sizes for alignment requirements * IF( INFO.EQ.0 ) THEN IF( DESCT( MB_ ).NE.DESCT( NB_ ) ) INFO = -(1000*9 + MB_) END IF IF( INFO.EQ.0 ) THEN IF( DESCQ( MB_ ).NE.DESCQ( NB_ ) ) INFO = -(1000*13 + MB_) END IF IF( INFO.EQ.0 ) THEN IF( DESCT( MB_ ).NE.DESCQ( MB_ ) ) INFO = -(1000*9 + MB_) END IF * * Check the blocking sizes for minimum sizes * IF( INFO.EQ.0 ) THEN IF( N.NE.DESCT( MB_ ) .AND. DESCT( MB_ ).LT.3 ) $ INFO = -(1000*9 + MB_) IF( N.NE.DESCQ( MB_ ) .AND. DESCQ( MB_ ).LT.3 ) $ INFO = -(1000*13 + MB_) END IF * * Check parameters in PARA * NB = DESCT( MB_ ) IF( INFO.EQ.0 ) THEN IF( PARA(1).LT.1 .OR. PARA(1).GT.MIN(NPROW,NPCOL) ) $ INFO = -(1000 * 4 + 1) IF( PARA(2).LT.1 .OR. PARA(2).GE.PARA(3) ) $ INFO = -(1000 * 4 + 2) IF( PARA(3).LT.1 .OR. PARA(3).GT.NB ) $ INFO = -(1000 * 4 + 3) IF( PARA(4).LT.0 .OR. PARA(4).GT.100 ) $ INFO = -(1000 * 4 + 4) IF( PARA(5).LT.1 .OR. PARA(5).GT.NB ) $ INFO = -(1000 * 4 + 5) IF( PARA(6).LT.1 .OR. PARA(6).GT.PARA(2) ) $ INFO = -(1000 * 4 + 6) END IF * * Check requirements on IT, JT, IQ and JQ * IF( INFO.EQ.0 ) THEN IF( IT.NE.1 ) INFO = -7 IF( JT.NE.IT ) INFO = -8 IF( IQ.NE.1 ) INFO = -11 IF( JQ.NE.IQ ) INFO = -12 END IF * * Test input parameters for global sanity * IF( INFO.EQ.0 ) THEN CALL PCHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, 0, IDUM1, $ IDUM2, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL PCHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, 0, IDUM1, $ IDUM2, INFO ) END IF IF( INFO.EQ.0 ) THEN CALL PCHK2MAT( N, 5, N, 5, IT, JT, DESCT, 9, N, 5, N, 5, $ IQ, JQ, DESCQ, 13, 0, IDUM1, IDUM2, INFO ) END IF * * Decode and test the input parameters * IF( INFO.EQ.0 .OR. LQUERY ) THEN WANTBH = LSAME( JOB, 'B' ) WANTS = LSAME( JOB, 'E' ) .OR. WANTBH WANTSP = LSAME( JOB, 'V' ) .OR. WANTBH WANTQ = LSAME( COMPQ, 'V' ) * IF( .NOT.LSAME( JOB, 'N' ) .AND. .NOT.WANTS .AND. .NOT.WANTSP ) $ THEN INFO = -1 ELSEIF( .NOT.LSAME( COMPQ, 'N' ) .AND. .NOT.WANTQ ) THEN INFO = -2 ELSEIF( N.LT.0 ) THEN INFO = -4 ELSE * * Extract local leading dimension * LLDT = DESCT( LLD_ ) LLDQ = DESCQ( LLD_ ) * * Check the SELECT vector for consistency and set M to the * dimension of the specified invariant subspace. * M = 0 DO 10 K = 1, N * * IWORK(1:N) is an integer copy of SELECT. * IF( SELECT(K) ) THEN IWORK(K) = 1 ELSE IWORK(K) = 0 END IF IF( K.LT.N ) THEN CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL, $ MYROW, MYCOL, ITT, JTT, TRSRC, TCSRC ) IF( MYROW.EQ.TRSRC .AND. MYCOL.EQ.TCSRC ) THEN ELEM = T( (JTT-1)*LLDT + ITT ) IF( ELEM.NE.ZERO ) THEN IF( SELECT(K) .AND. .NOT.SELECT(K+1) ) THEN * INFO = -3 IWORK(K+1) = 1 ELSEIF( .NOT.SELECT(K) .AND. SELECT(K+1) ) THEN * INFO = -3 IWORK(K) = 1 END IF END IF END IF END IF IF( SELECT(K) ) M = M + 1 10 CONTINUE MMAX = M MMIN = M IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, MMAX, 1, -1, $ -1, -1, -1, -1 ) IF( NPROCS.GT.1 ) $ CALL IGAMN2D( ICTXT, 'All', TOP, 1, 1, MMIN, 1, -1, $ -1, -1, -1, -1 ) IF( MMAX.GT.MMIN ) THEN M = MMAX IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, N, 1, IWORK, N, $ -1, -1, -1, -1, -1 ) END IF * * Set parameters for deep pipelining in parallel * Sylvester solver. * MBNB2( 1 ) = MIN( MAX( PARA( 3 ), PARA( 2 )*2 ), NB ) MBNB2( 2 ) = MBNB2( 1 ) * * Compute needed workspace * N1 = M N2 = N - M IF( WANTS ) THEN c CALL PGESYCTD( 'Solve', 'Schur', 'Schur', 'Notranspose', c $ 'Notranspose', -1, 'Demand', N1, N2, T, 1, 1, DESCT, c $ T, N1+1, N1+1, DESCT, T, 1, N1+1, DESCT, MBNB2, c $ WORK, -1, IWORK(N+1), -1, NOEXSY, SCALE, IERR ) WRK1 = INT(WORK(1)) IWRK1 = IWORK(N+1) WRK1 = 0 IWRK1 = 0 ELSE WRK1 = 0 IWRK1 = 0 END IF * IF( WANTSP ) THEN c CALL PSYCTCON( 'Notranspose', 'Notranspose', -1, c $ 'Demand', N1, N2, T, 1, 1, DESCT, T, N1+1, N1+1, c $ DESCT, MBNB2, WORK, -1, IWORK(N+1), -1, EST, ITER, c $ IERR ) WRK2 = INT(WORK(1)) IWRK2 = IWORK(N+1) WRK2 = 0 IWRK2 = 0 ELSE WRK2 = 0 IWRK2 = 0 END IF * TROWS = NUMROC( N, NB, MYROW, DESCT(RSRC_), NPROW ) TCOLS = NUMROC( N, NB, MYCOL, DESCT(CSRC_), NPCOL ) WRK3 = N + 7*NB**2 + 2*TROWS*PARA( 3 ) + TCOLS*PARA( 3 ) + $ MAX( TROWS*PARA( 3 ), TCOLS*PARA( 3 ) ) IWRK3 = 5*PARA( 1 ) + PARA(2)*PARA(3) - $ PARA(2) * (PARA(2) + 1 ) / 2 * IF( WANTSP ) THEN LWMIN = MAX( 1, MAX( WRK2, WRK3) ) LIWMIN = MAX( 1, MAX( IWRK2, IWRK3 ) )+N ELSE IF( LSAME( JOB, 'N' ) ) THEN LWMIN = MAX( 1, WRK3 ) LIWMIN = IWRK3+N ELSE IF( LSAME( JOB, 'E' ) ) THEN LWMIN = MAX( 1, MAX( WRK1, WRK3) ) LIWMIN = MAX( 1, MAX( IWRK1, IWRK3 ) )+N END IF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -20 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -22 END IF END IF END IF * * Global maximum on info * IF( NPROCS.GT.1 ) $ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, -1, -1, -1, $ -1, -1 ) * * Return if some argument is incorrect * IF( INFO.NE.0 .AND. .NOT.LQUERY ) THEN M = 0 S = ONE SEP = ZERO CALL PXERBLA( ICTXT, 'PSTRSEN', -INFO ) RETURN ELSEIF( LQUERY ) THEN WORK( 1 ) = FLOAT(LWMIN) IWORK( 1 ) = LIWMIN RETURN END IF * * Quick return if possible. * IF( M.EQ.N .OR. M.EQ.0 ) THEN IF( WANTS ) $ S = ONE IF( WANTSP ) $ SEP = PSLANGE( '1', N, N, T, IT, JT, DESCT, WORK ) GO TO 50 END IF * * Reorder the eigenvalues. * CALL PSTRORD( COMPQ, IWORK, PARA, N, T, IT, JT, $ DESCT, Q, IQ, JQ, DESCQ, WR, WI, M, WORK, LWORK, $ IWORK(N+1), LIWORK-N, INFO ) * IF( WANTS ) THEN * * Solve Sylvester equation T11*R - R*T2 = scale*T12 for R in * parallel. * * Copy T12 to workspace. * CALL INFOG2L( 1, N1+1, DESCT, NPROW, NPCOL, MYROW, $ MYCOL, ILOC1, JLOC1, TRSRC, TCSRC ) ICOFFT12 = MOD( N1, NB ) T12ROWS = NUMROC( N1, NB, MYROW, TRSRC, NPROW ) T12COLS = NUMROC( N2+ICOFFT12, NB, MYCOL, TCSRC, NPCOL ) CALL DESCINIT( DESCT12, N1, N2+ICOFFT12, NB, NB, TRSRC, $ TCSRC, ICTXT, MAX(1,T12ROWS), IERR ) CALL PSLACPY( 'All', N1, N2, T, 1, N1+1, DESCT, WORK, $ 1, 1+ICOFFT12, DESCT12 ) * * Solve the equation to get the solution in workspace. * SPACE = DESCT12( LLD_ ) * T12COLS IPW1 = 1 + SPACE c CALL PGESYCTD( 'Solve', 'Schur', 'Schur', 'Notranspose', c $ 'Notranspose', -1, 'Demand', N1, N2, T, 1, 1, DESCT, T, c $ N1+1, N1+1, DESCT, WORK, 1, 1+ICOFFT12, DESCT12, MBNB2, c $ WORK(IPW1), LWORK-SPACE+1, IWORK(N+1), LIWORK-N, NOEXSY, c $ SCALE, IERR ) IF( IERR.LT.0 ) THEN INFO = N+3 ELSE INFO = N+2 END IF * * Estimate the reciprocal of the condition number of the cluster * of eigenvalues. * RNORM = PSLANGE( 'Frobenius', N1, N2, WORK, 1, 1+ICOFFT12, $ DESCT12, DPDUM1 ) IF( RNORM.EQ.ZERO ) THEN S = ONE ELSE S = SCALE / ( SQRT( SCALE*SCALE / RNORM+RNORM )* $ SQRT( RNORM ) ) END IF END IF * IF( WANTSP ) THEN * * Estimate sep(T11,T21) in parallel. * c CALL PSYCTCON( 'Notranspose', 'Notranspose', -1, 'Demand', N1, c $ N2, T, 1, 1, DESCT, T, N1+1, N1+1, DESCT, MBNB2, WORK, c $ LWORK, IWORK(N+1), LIWORK-N, EST, ITER, IERR ) EST = EST * SQRT(FLOAT(N1*N2)) SEP = ONE / EST IF( IERR.LT.0 ) THEN INFO = N+4 ELSE INFO = N+2 END IF END IF * * Return to calling program. * 50 CONTINUE * RETURN * * End of PSTRSEN * END * scalapack-2.0.2/SRC/pstrti2.f000644 000766 000024 00000023650 10363532303 016122 0ustar00juliestaff000000 000000 SUBROUTINE PSTRTI2( UPLO, DIAG, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSTRTI2 computes the inverse of a real upper or lower triangular * block matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). This matrix should be * contained in one and only one process memory space (local operation). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * DIAG (global input) CHARACTER*1 * = 'N': sub( A ) is non-unit triangular * = 'U': sub( A ) is unit triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)), * this array contains the local pieces of the triangular matrix * sub( A ). If UPLO = 'U', the leading N-by-N upper triangular * part of the matrix sub( A ) contains the upper triangular * matrix, and the strictly lower triangular part of sub( A ) * is not referenced. If UPLO = 'L', the leading N-by-N lower * triangular part of the matrix sub( A ) contains the lower * triangular matrix, and the strictly upper triangular part * of sub( A ) is not referenced. If DIAG = 'U', the diagonal * elements of sub( A ) are also not referenced and are assumed * to be 1. On exit, the (triangular) inverse of the original * matrix, in the same storage format. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER IACOL, IAROW, ICTXT, ICURR, IDIAG, IIA, IOFFA, $ JJA, LDA, MYCOL, MYROW, NA, NPCOL, NPROW REAL AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PXERBLA, SSCAL, STRMV * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSTRTI2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Compute local indexes * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN * LDA = DESCA( LLD_ ) * IF( UPPER ) THEN * IOFFA = IIA + ( JJA - 1 ) * LDA ICURR = IOFFA + LDA * IF( NOUNIT ) THEN * * Compute inverse of upper non-unit triangular matrix. * A( IOFFA ) = ONE / A( IOFFA ) IDIAG = ICURR + 1 DO 10 NA = 1, N-1 A( IDIAG ) = ONE / A( IDIAG ) AJJ = -A( IDIAG ) * * Compute elements 1:j-1 of j-th column. * CALL STRMV( 'Upper', 'No transpose', DIAG, NA, $ A( IOFFA ), LDA, A( ICURR ), 1 ) CALL SSCAL( NA, AJJ, A( ICURR ), 1 ) IDIAG = IDIAG + LDA + 1 ICURR = ICURR + LDA 10 CONTINUE * ELSE * * Compute inverse of upper unit triangular matrix. * DO 20 NA = 1, N-1 * * Compute elements 1:j-1 of j-th column. * CALL STRMV( 'Upper', 'No transpose', DIAG, NA, $ A( IOFFA ), LDA, A( ICURR ), 1 ) CALL SSCAL( NA, -ONE, A( ICURR ), 1 ) ICURR = ICURR + LDA 20 CONTINUE * END IF * ELSE * ICURR = IIA + N - 1 + ( JJA + N - 2 ) * LDA IOFFA = ICURR - LDA * IF( NOUNIT ) THEN * * Compute inverse of lower non-unit triangular matrix. * A( ICURR ) = ONE / A( ICURR ) IDIAG = IOFFA - 1 DO 30 NA = 1, N-1 A( IDIAG ) = ONE / A( IDIAG ) AJJ = -A( IDIAG ) * * Compute elements j+1:n of j-th column. * CALL STRMV( 'Lower', 'No transpose', DIAG, NA, $ A( ICURR ), LDA, A( IOFFA ), 1 ) CALL SSCAL( NA, AJJ, A( IOFFA ), 1 ) ICURR = IDIAG IDIAG = IDIAG - LDA - 1 IOFFA = IDIAG + 1 30 CONTINUE * ELSE * * Compute inverse of lower unit triangular matrix. * DO 40 NA = 1, N-1 * * Compute elements j+1:n of j-th column. * CALL STRMV( 'Lower', 'No transpose', DIAG, NA, $ A( ICURR ), LDA, A( IOFFA ), 1 ) CALL SSCAL( NA, -ONE, A( IOFFA ), 1 ) ICURR = ICURR - LDA - 1 IOFFA = ICURR - LDA 40 CONTINUE * END IF * END IF * END IF * * End of PSTRTI2 * END scalapack-2.0.2/SRC/pstrtri.f000644 000766 000024 00000030676 10363532303 016230 0ustar00juliestaff000000 000000 SUBROUTINE PSTRTRI( UPLO, DIAG, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSTRTRI computes the inverse of a upper or lower triangular * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the distributed matrix sub( A ) is upper * or lower triangular: * = 'U': Upper triangular, * = 'L': Lower triangular. * * DIAG (global input) CHARACTER * Specifies whether or not the distributed matrix sub( A ) * is unit triangular: * = 'N': Non-unit triangular, * = 'U': Unit triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * triangular matrix sub( A ). If UPLO = 'U', the leading * N-by-N upper triangular part of the matrix sub( A ) contains * the upper triangular matrix to be inverted, and the strictly * lower triangular part of sub( A ) is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * the matrix sub( A ) contains the lower triangular matrix, * and the strictly upper triangular part of sub( A ) is not * referenced. * On exit, the (triangular) inverse of the original matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, A(IA+K-1,JA+K-1) is exactly zero. The * triangular matrix sub( A ) is singular and its * inverse can not be computed. * * ==================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER I, ICOFF, ICTXT, IROFF, ICURCOL, ICURROW, $ IDUMMY, II, IOFFA, J, JB, JJ, JN, LDA, MYCOL, $ MYROW, NN, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMX2D, INFOG2L, $ PCHK1MAT, PSTRTI2, PSTRMM, PSTRSM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) * CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOUNIT ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 * CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSTRTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) IF( NOUNIT ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, ICURROW, ICURCOL ) * * Handle first block separately * JB = JN-JA+1 LDA = DESCA( LLD_ ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*LDA DO 10 I = 0, JB-1 IF( A( IOFFA ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I + 1 IOFFA = IOFFA + LDA + 1 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*LDA DO 20 I = 0, JB-1 IF( A( IOFFA ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = J + I - JA + 1 IOFFA = IOFFA + LDA + 1 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUMMY, $ IDUMMY, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN END IF * * Use blocked code * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix * JB = JN-JA+1 * * Handle first block of column separately * CALL PSTRTI2( UPLO, DIAG, JB, A, IA, JA, DESCA, INFO ) * * Loop over remaining block of columns * DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * * Compute rows 1:j-1 of current block column * CALL PSTRMM( 'Left', UPLO, 'No transpose', DIAG, J-JA, JB, $ ONE, A, IA, JA, DESCA, A, IA, J, DESCA ) CALL PSTRSM( 'Right', UPLO, 'No transpose', DIAG, J-JA, $ JB, -ONE, A, I, J, DESCA, A, IA, J, DESCA ) * * Compute inverse of current diagonal block * CALL PSTRTI2( UPLO, DIAG, JB, A, I, J, DESCA, INFO ) * 40 CONTINUE * ELSE * * Compute inverse of lower triangular matrix * NN = ( ( JA+N-2 ) / DESCA( NB_ ) )*DESCA( NB_ ) + 1 DO 50 J = NN, JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA IF( J+JB.LE.JA+N-1 ) THEN * * Compute rows j+jb:ja+n-1 of current block column * CALL PSTRMM( 'Left', UPLO, 'No transpose', DIAG, $ JA+N-J-JB, JB, ONE, A, I+JB, J+JB, DESCA, $ A, I+JB, J, DESCA ) CALL PSTRSM( 'Right', UPLO, 'No transpose', DIAG, $ JA+N-J-JB, JB, -ONE, A, I, J, DESCA, $ A, I+JB, J, DESCA ) END IF * * Compute inverse of current diagonal block * CALL PSTRTI2( UPLO, DIAG, JB, A, I, J, DESCA, INFO ) * 50 CONTINUE * * Handle the last block of columns separately * JB = JN-JA+1 IF( JA+JB.LE.JA+N-1 ) THEN * * Compute rows ja+jb:ja+n-1 of current block column * CALL PSTRMM( 'Left', UPLO, 'No transpose', DIAG, N-JB, JB, $ ONE, A, IA+JB, JA+JB, DESCA, A, IA+JB, JA, $ DESCA ) CALL PSTRSM( 'Right', UPLO, 'No transpose', DIAG, N-JB, JB, $ -ONE, A, IA, JA, DESCA, A, IA+JB, JA, DESCA ) END IF * * Compute inverse of current diagonal block * CALL PSTRTI2( UPLO, DIAG, JB, A, IA, JA, DESCA, INFO ) * END IF * RETURN * * End PSTRTRI * END scalapack-2.0.2/SRC/pstrtrs.f000644 000766 000024 00000031323 10363532303 016230 0ustar00juliestaff000000 000000 SUBROUTINE PSTRTRS( UPLO, TRANS, DIAG, N, NRHS, A, IA, JA, DESCA, $ B, IB, JB, DESCB, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, TRANS, UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) REAL A( * ), B( * ) * .. * * Purpose * ======= * * PSTRTRS solves a triangular system of the form * * sub( A ) * X = sub( B ) or sub( A )**T * X = sub( B ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is a triangular * distributed matrix of order N, and B(IB:IB+N-1,JB:JB+NRHS-1) is an * N-by-NRHS distributed matrix denoted by sub( B ). A check is made * to verify that sub( A ) is nonsingular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': Solve sub( A ) * X = sub( B ) (No transpose) * = 'T': Solve sub( A )**T * X = sub( B ) (Transpose) * = 'C': Solve sub( A )**T * X = sub( B ) (Transpose) * * DIAG (global input) CHARACTER * = 'N': sub( A ) is non-unit triangular; * = 'U': sub( A ) is unit triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed matrix sub( B ). NRHS >= 0. * * A (local input) REAL pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1) ). This array * contains the local pieces of the distributed triangular * matrix sub( A ). If UPLO = 'U', the leading N-by-N upper * triangular part of sub( A ) contains the upper triangular * matrix, and the strictly lower triangular part of sub( A ) * is not referenced. If UPLO = 'L', the leading N-by-N lower * triangular part of sub( A ) contains the lower triangular * matrix, and the strictly upper triangular part of sub( A ) * is not referenced. If DIAG = 'U', the diagonal elements of * sub( A ) are also not referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) REAL pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the right hand side distributed matrix * sub( B ). On exit, if INFO = 0, sub( B ) is overwritten by * the solution matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, the i-th diagonal element of sub( A ) is * zero, indicating that the submatrix is singular and the * solutions X have not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN, NOUNIT, UPPER INTEGER I, IAROW, IBROW, ICOFFA, ICTXT, ICURCOL, $ ICURROW, IROFFA, IROFFB, IDUM, II, IOFFA, J, $ JBLK, JJ, JN, LDA, LL, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMX2D, INFOG2L, $ PCHK2MAT, PSTRSM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL ICEIL, INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -907 ELSE UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) NOTRAN = LSAME( TRANS, 'N' ) * CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA .OR. IROFFA.NE.0 ) THEN INFO = -8 ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IBROW ) THEN INFO = -11 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -904 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -1304 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 2 ) = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'D' ) END IF IDUM2( 3 ) = 3 CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 9, N, 4, NRHS, 5, $ IB, JB, DESCB, 13, 3, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSTRTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * IF( NOUNIT ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, ICURROW, ICURCOL ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) IOFFA = II + ( JJ - 1 ) * LDA * * Handle first block separately * JBLK = JN-JA+1 IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN LL = IOFFA DO 10 I = 0, JBLK-1 IF( A( LL ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I + 1 LL = IOFFA + LDA + 1 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JBLK IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JBLK*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JBLK = MIN( JA+N-J, DESCA( NB_ ) ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN LL = IOFFA DO 20 I = 0, JBLK-1 IF( A( LL ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = J + I - JA + 1 LL = IOFFA + LDA + 1 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JBLK IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JBLK*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUM, IDUM, $ -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN END IF * * Solve A * x = b or A' * x = b. * CALL PSTRSM( 'Left', UPLO, TRANS, DIAG, N, NRHS, ONE, A, IA, JA, $ DESCA, B, IB, JB, DESCB ) * RETURN * * End of PSTRTRS * END scalapack-2.0.2/SRC/pstzrzf.f000644 000766 000024 00000030777 10363532303 016245 0ustar00juliestaff000000 000000 SUBROUTINE PSTZRZF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PSTZRZF reduces the M-by-N ( M<=N ) real upper trapezoidal matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper triangular form by means * of orthogonal transformations. * * The upper trapezoidal matrix sub( A ) is factored as * * sub( A ) = ( R 0 ) * Z, * * where Z is an N-by-N orthogonal matrix and R is an M-by-M upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) REAL pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the leading M-by-M * upper triangular part of sub( A ) contains the upper trian- * gular matrix R, and elements M+1 to N of the first M rows of * sub( A ), with the array TAU, represent the orthogonal matrix * Z as a product of M elementary reflectors. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) REAL, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) REAL array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The factorization is obtained by Householder's method. The kth * transformation matrix, Z( k ), which is used to introduce zeros into * the (m - k + 1)th row of sub( A ), is given in the form * * Z( k ) = ( I 0 ), * ( 0 T( k ) ) * * where * * T( k ) = I - tau*u( k )*u( k )', u( k ) = ( 1 ), * ( 0 ) * ( z( k ) ) * * tau is a scalar and z( k ) is an ( n - m ) element vector. * tau and z( k ) are chosen to annihilate the elements of the kth row * of sub( A ). * * The scalar tau is returned in the kth element of TAU and the vector * u( k ) in the kth row of sub( A ), such that the elements of z( k ) * are in a( k, m + 1 ), ..., a( k, n ). The elements of R are returned * in the upper triangular part of sub( A ). * * Z is given by * * Z = Z( 1 ) * Z( 2 ) * ... * Z( m ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IIA, IL, IN, IPW, $ IROFFA, J, JM1, L, LWMIN, MP0, MYCOL, MYROW, $ NPCOL, NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, INFOG1L, PCHK1MAT, $ PSLATRZ, PSLARZB, PSLARZT, PB_TOPGET, $ PB_TOPSET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = REAL( LWMIN ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -9 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSTZRZF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( M.EQ.N ) THEN * CALL INFOG1L( IA, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ IIA, IAROW ) IF( MYROW.EQ.IAROW ) $ MP0 = MP0 - IROFFA DO 10 I = IIA, IIA+MP0-1 TAU( I ) = ZERO 10 CONTINUE * ELSE * L = N-M JM1 = JA + MIN( M+1, N ) - 1 IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IL = MAX( ( (IA+M-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * * Use blocked code initially * DO 20 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) J = JA + I - IA * * Compute the complete orthogonal factorization of the current * block A(i:i+ib-1,j:ja+n-1) * CALL PSLATRZ( IB, JA+N-J, L, A, I, J, DESCA, TAU, WORK ) * IF( I.GT.IA ) THEN * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PSLARZT( 'Backward', 'Rowwise', L, IB, A, I, JM1, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:i-1,j:ja+n-1) from the right * CALL PSLARZB( 'Right', 'No transpose', 'Backward', $ 'Rowwise', I-IA, JA+N-J, IB, L, A, I, JM1, $ DESCA, WORK, A, IA, J, DESCA, WORK( IPW ) ) END IF * 20 CONTINUE * * Use unblocked code to factor the last or only block * CALL PSLATRZ( IN-IA+1, N, N-M, A, IA, JA, DESCA, TAU, WORK ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = REAL( LWMIN ) * RETURN * * End of PSTZRZF * END scalapack-2.0.2/SRC/pxsyevx.h000644 000766 000024 00000004521 10363532303 016237 0ustar00juliestaff000000 000000 /* * These macros define how C routines will be called. ADD_ assumes that * they will be called by fortran, which expects C routines to have an * underscore postfixed to the name (Suns, and the Intel expect this). * NOCHANGE indicates that fortran will be calling, and that it expects * the name called by fortran to be identical to that compiled by the C * (RS6K's do this). UPCASE says it expects C routines called by fortran * to be in all upcase (CRAY wants this). */ #define ADD_ 0 #define NOCHANGE 1 #define UPCASE 2 #define C_CALL 3 #ifdef UpCase #define F77_CALL_C UPCASE #endif #ifdef NoChange #define F77_CALL_C NOCHANGE #endif #ifdef Add_ #define F77_CALL_C ADD_ #endif #ifndef F77_CALL_C #define F77_CALL_C ADD_ #endif #if (F77_CALL_C == ADD_) /* * These defines set up the naming scheme required to have a fortran 77 * routine call a C routine * No redefinition necessary to have following Fortran to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void pdgemm_(...) * * This is the default. */ #endif #if (F77_CALL_C == UPCASE) /* * These defines set up the naming scheme required to have a fortran 77 * routine call a C routine * following Fortran to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void PDGEMM(...) */ /* TOOLS */ #define pdlasnbt_ PDLASNBT #define pdlachkieee_ PDLACHKIEEE #define pdlaiectl_ PDLAIECTL #define pdlaiectb_ PDLAIECTB #define pslasnbt_ PSLASNBT #define pslachkieee_ PSLACHKIEEE #define pslaiect_ PSLAIECT #endif #if (F77_CALL_C == NOCHANGE) /* * These defines set up the naming scheme required to have a fortran 77 * routine call a C routine * for following Fortran to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void pdgemm(...) */ /* TOOLS */ #define pdlasnbt_ pdlasnbt #define pdlachkieee_ pdlachkieee #define pdlaiectl_ pdlaiectl #define pdlaiectb_ pdlaiectb #define pslasnbt_ pslasnbt #define pslachkieee_ pslachkieee #define pslaiect_ pslaiect #endif scalapack-2.0.2/SRC/pzaxpy.c000644 000766 000024 00000036552 10363532303 016050 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * Mark R. Fahey * August 2000 * This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0 * which fixes a bug in the incx=1 and incy=1 case. * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzaxpy_( n, alpha, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy ) /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; complex16 * alpha; /* .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZAXPY adds one distributed vector to another, * * sub( Y ) := sub( Y ) + alpha * sub( X ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * Parameters * ========== * * N (global input) pointer to INTEGER. * The length of the distributed vectors to be added. N >= 0. * * ALPHA (global input) pointer to COMPLEX*16 * The scalar used to multiply each component of sub( X ). * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input/local output) COMPLEX*16 array * containing the local pieces of a distributed matrix of * dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * On exit sub( Y ) is overwritten by sub( Y ) + alpha*sub( X ). * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * ===================================================================== * * .. Local Scalars .. */ int ictxt, info, iix, iiy, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, lcmq, mycol, myrow, nn, np, np0, nprow, npcol, nq, nq0, nz, ione=1, tmp1, wksz; complex16 one, tmp, zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgerv2d_(); void zgesd2d_(); void pbchkvect(); void pberror_(); char * getpbbuf(); F_VOID_FCT zaxpy_(); F_VOID_FCT zcopy_(); F_VOID_FCT pbztrnv_(); F_INTG_FCT ilcm_(); F_INTG_FCT numroc_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(600+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -10; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -9; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1100+MB_+1); } else { info = -12; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -10; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -9; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1100+MB_+1); } else { info = -12; } } else { info = -7; } } if( ictxt != desc_Y[CTXT_] ) info = -(1100+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZAXPY", &info ); return; } /* * Quick return if possible. */ if( *n == 0 ) return; /* * y <- y + alpha * x */ if( *n == 1 ) { if( ( myrow == iyrow ) && ( mycol == iycol ) ) { if( ( myrow != ixrow ) || ( mycol != ixcol ) ) zgerv2d_( &ictxt, n, n, &tmp, n, &ixrow, &ixcol ); else tmp = X[iix-1+(jjx-1)*desc_X[LLD_]]; zaxpy_( n, alpha, &tmp, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n ); } else if( ( myrow == ixrow ) && ( mycol == ixcol ) ) zgesd2d_( &ictxt, n, n, &X[iix-1+(jjx-1)*desc_X[LLD_]], n, &iyrow, &iycol ); return; } one.re = ONE; one.im = ZERO; zero.re = ZERO; zero.im = ZERO; if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) zaxpy_( &nq, alpha, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); } else { if( myrow == ixrow ) zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); else if( myrow == iyrow ) { buff = (complex16 *)getpbbuf( "PZAXPY", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zaxpy_( &nq, alpha, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) zaxpy_( &np, alpha, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); } else { if( mycol == ixcol ) zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); else if( mycol == iycol ) { buff = (complex16 *)getpbbuf( "PZAXPY", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zaxpy_( &np, alpha, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmq = lcm / npcol; nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[NB_]; nq0 = MYROC0( tmp1, nn, desc_Y[NB_], npcol ); tmp1 = nq0 / desc_Y[NB_]; wksz = np + MYROC0( tmp1, nq0, desc_Y[NB_], lcmq ); buff = (complex16 *)getpbbuf( "PZAXPY", wksz*sizeof(complex16) ); if( myrow == ixrow ) np -= nz; if( mycol == ixcol ) { zcopy_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, incx ); zscal_( &np, alpha, buff, incx ); } pbztrnv_( &ictxt, C2F_CHAR( "C" ), C2F_CHAR( "T" ), n, &desc_X[MB_], &nz, buff, incx, &one, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &ixcol, &iyrow, &iycol, buff+np ); } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*iy-1) % desc_Y[MB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_Y[MB_], &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZAXPY", wksz*sizeof(complex16) ); pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { if( myrow == iyrow ) np -= nz; zaxpy_( &np, alpha, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); } } } } scalapack-2.0.2/SRC/pzdbsv.f000644 000766 000024 00000045364 10363532303 016031 0ustar00juliestaff000000 000000 SUBROUTINE PZDBSV( N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER BWL, BWU, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PZDBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * banded diagonally dominant-like distributed * matrix with bandwidth BWL, BWU. * * Gaussian elimination without pivoting * is used to factor a reordering * of the matrix into L U. * * See PZDBTRF and PZDBTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * +max((max(bwl,bwu)*NRHS), max(bwl,bwu)*max(bwl,bwu)) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PXERBLA, PZDBTRF, PZDBTRS * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PZDBTRF and PZDBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PZDBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * * Factor the matrix * CALL PZDBTRF( N, BWL, BWU, A, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PZDBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PZDBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZDBSV', -INFO ) RETURN END IF * RETURN * * End of PZDBSV * END scalapack-2.0.2/SRC/pzdbtrf.f000644 000766 000024 00000126110 11750130340 016155 0ustar00juliestaff000000 000000 SUBROUTINE PZDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), AF( * ), WORK( * ) * .. * * * Purpose * ======= * * PZDBTRF computes a LU factorization * of an N-by-N complex banded * diagonally dominant-like distributed matrix * with bandwidth BWL, BWU: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PZDBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = L U * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZDBTRF and this is stored in AF. If a linear system * is to be solved using PZDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * max(bwl,bwu)*max(bwl,bwu) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN, $ LEVEL_DIST, LLDA, MAX_BW, MBW2, MYCOL, MYROW, $ MY_NUM_COLS, NB, NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, OFST, PART_OFFSET, PART_SIZE, $ PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE, $ STORE_N_A, UP_PREV_TRI_SIZE_M, $ UP_PREV_TRI_SIZE_N, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ DESC_CONVERT, GLOBCHK, PXERBLA, RESHAPE, ZAXPY, $ ZGEMM, ZGERV2D, ZGESD2D, ZLAMOV, ZLATCPY, $ ZPBTRF, ZPOTRF, ZSYRK, ZTBTRS, ZTRMM, ZTRRV2D, $ ZTRSD2D, ZTRSM, ZTRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) MBW2 = MAX_BW * MAX_BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LWORK .LT. -1) THEN INFO = -10 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -1 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -2 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 6*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, $ 'PZDBTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PZDBTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU) * IF( LAF .LT. LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PZDBTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = MAX(BWL,BWU)*MAX(BWL,BWU) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, $ 'PZDBTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BWU PARAM_CHECK( 3, 1 ) = BWL PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 10 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZDBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * Offset to workspace for Upper triangular factor * WORK_U = BWU*ODD_SIZE + 3*MBW2 * * * Zero out space for fillin * DO 10 I=1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * * Zero out space for work * DO 20 I=1, WORK_SIZE_MIN WORK( I ) = CZERO 20 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN PREV_TRI_SIZE_M= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) PREV_TRI_SIZE_N=MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .GT. 0 ) THEN UP_PREV_TRI_SIZE_M= MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) UP_PREV_TRI_SIZE_N=MIN( BWU, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN NEXT_TRI_SIZE_M=MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) NEXT_TRI_SIZE_N= MIN( BWL, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF ( MYCOL .LT. NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL ZTRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, $ A( OFST+(MY_NUM_COLS-BWL)*LLDA+(BWL+BWU+1) ), $ LLDA-1, 0, MYCOL+1 ) * ENDIF * * * Factor main partition A_i = L_i {U_i} in each processor * CALL ZDBTRF( ODD_SIZE, ODD_SIZE, BWL, BWU, A( OFST + 1), $ LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1500 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * * Apply factorization to lower connection block BL_i * conjugate transpose the connection block in preparation. * Apply factorization to upper connection block BU_i * Move the connection block in preparation. * CALL ZLATCPY( 'U', BWL, BWL, $ A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )), $ LLDA-1, AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), $ MAX_BW ) CALL ZLAMOV( 'L', BWU, BWU, A( ( OFST+1+ODD_SIZE*LLDA ) ), $ LLDA-1, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ), $ MAX_BW ) * * Perform the triangular system solve {L_i}{{BU'}_i} = {B_i} * CALL ZTBTRS( 'L', 'N', 'U', BWU, BWL, BWU, $ A( OFST+BWU+1+(ODD_SIZE-BWU )*LLDA ), LLDA, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ), $ MAX_BW, INFO ) * * Perform the triangular solve {U_i}^C{BL'}_i^C = {BL_i}^C * CALL ZTBTRS( 'U', 'C', 'N', BWL, BWU, BWL, $ A( OFST+1+(ODD_SIZE-BWL)*LLDA ), LLDA, $ AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW, $ INFO ) * * conjugate transpose resulting block to its location * in main storage. * CALL ZLATCPY( 'L', BWL, BWL, $ AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW, $ A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )), $ LLDA-1 ) * * Move the resulting block back to its location in main storage. * CALL ZLAMOV( 'L', BWU, BWU, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ), $ MAX_BW, A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i} * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL ZGEMM( 'C', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE , $ AF( ODD_SIZE*BWU+2*MBW2+1), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1), MAX_BW, CONE, $ A( OFST+ODD_SIZE*LLDA+1+BWU ), LLDA-1 ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1500 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for * off_diagonal block of reduced system. * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL ZTRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M, $ PREV_TRI_SIZE_N, AF( WORK_U+1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF (INFO.EQ.0) THEN * * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ . * CALL ZTBTRS( 'L', 'N', 'U', ODD_SIZE, BWL, BWL, $ A( OFST + BWU+1 ), LLDA, AF( WORK_U+1 ), $ ODD_SIZE, INFO ) * * * Calculate the "spike" fillin, ${U_i}^C {{GL}_i}^C = {DU_i}^C$ * * * Copy D block into AF storage for solve. * CALL ZLATCPY( 'L', UP_PREV_TRI_SIZE_N, UP_PREV_TRI_SIZE_M, $ A( OFST+1 ), LLDA-1, AF( 1 ), ODD_SIZE ) * CALL ZTBTRS( 'U', 'C', 'N', ODD_SIZE, BWU, BWU, $ A( OFST + 1 ), LLDA, $ AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = GL_i{GU_i} * * * Zero out space in case result is smaller than storage block * DO 30 I=1, MBW2 AF( ODD_SIZE*BWU+2*MBW2+I ) = CZERO 30 CONTINUE * CALL ZGEMM( 'C', 'N', BWU, BWL, ODD_SIZE, $ -CONE, AF( 1 ), ODD_SIZE, $ AF( WORK_U+1 ), ODD_SIZE, CZERO, $ AF( 1+MAX(0,BWL-BWU)+ODD_SIZE*BWU+ $ (2*MAX_BW+MAX(0,BWU-BWL))*MAX_BW), $ MAX_BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL ZGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0, $ MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * * Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage * as per requirements of BLAS routine ZTRMM. * Since we have GU_i stored, * conjugate transpose HU_i to HU_i^C. * CALL ZLATCPY( 'N', BWL, BWL, $ AF( WORK_U+ODD_SIZE-BWL+1 ), ODD_SIZE, $ AF( (ODD_SIZE)*BWU+1+(MAX_BW-BWL) ), $ MAX_BW ) * CALL ZTRMM( 'R', 'U', 'C', 'N', BWL, BWL, -CONE, $ A( ( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA ) ), $ LLDA-1, AF( (ODD_SIZE)*BWU+1+(MAX_BW-BWL) ), $ MAX_BW ) * * * Copy matrix HL_i (the last bwu rows of GL_i^C) to AFU store * as per requirements of BLAS routine ZTRMM. * Since we have GL_i^C stored, * conjugate transpose HL_i^C to HL_i. * CALL ZLATCPY( 'N', BWU, BWU, $ AF( ODD_SIZE-BWU+1 ), ODD_SIZE, $ AF( WORK_U+(ODD_SIZE)*BWL+1+MAX_BW-BWU ), $ MAX_BW ) * CALL ZTRMM( 'R', 'L', 'N', 'N', BWU, BWU, -CONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ AF( WORK_U+(ODD_SIZE)*BWL+1+MAX_BW-BWU ), $ MAX_BW ) * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL ZGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+1 ), $ MAX_BW, 0, MYCOL-1 ) * CALL ZGESD2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), $ MAX_BW, 0, MYCOL-1 ) * ENDIF * * Copy last diagonal block into AF storage for subsequent * operations. * CALL ZLAMOV( 'N', MAX_BW, MAX_BW, $ A( OFST+ODD_SIZE*LLDA+BWU+1 ), $ LLDA-1, AF( ODD_SIZE*BWU+MBW2+1 ), $ MAX_BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ MAX_BW, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * CALL ZAXPY( MBW2, CONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ 1, AF( ODD_SIZE*BWU+MBW2+1 ), 1 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL ZGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * CALL ZAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BWU+MBW2+1 ), 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL ZGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * CALL ZAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BWU+MBW2+1 ), 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL ZDBTRF( MAX_BW, MAX_BW, MIN( MAX_BW-1, BWL ), $ MIN( MAX_BW-1, BWU ), AF( ODD_SIZE*BWU+MBW2+1 $ -( MIN( MAX_BW-1, BWU ))), MAX_BW+1, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL ZLAMOV( 'N', MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ), $ MAX_BW, AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), $ MAX_BW ) * CALL ZLAMOV( 'N', MAX_BW, MAX_BW, AF( WORK_U+ODD_SIZE*BWL+1 ), $ MAX_BW, AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL ZGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+1 ), $ MAX_BW, 0, COMM_PROC ) * CALL ZGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), $ MAX_BW, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL ZTBTRS( $ 'L', 'N', 'U', BWU, MIN( BWL, BWU-1 ), BWU, $ AF( ODD_SIZE*BWU+ $ MBW2+1+(MAX_BW+1)*(MAX_BW-BWU)), MAX_BW+1, $ AF( WORK_U+ODD_SIZE*BWL+1+MAX_BW-BWU), MAX_BW, INFO ) * * Modify lower off_diagonal block with diagonal block * * CALL ZTBTRS( $ 'U', 'C', 'N', BWL, MIN( BWU, BWL-1 ), BWL, $ AF( ODD_SIZE*BWU+ $ MBW2+1-MIN( BWU, BWL-1 )+(MAX_BW+1)*(MAX_BW-BWL)), MAX_BW+1, $ AF( ODD_SIZE*BWU+1+MAX_BW-BWL), MAX_BW, INFO ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'C', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE, $ AF( (ODD_SIZE)*BWU+1 ), MAX_BW, $ AF( WORK_U+(ODD_SIZE)*BWL+1 ), MAX_BW, CZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL ZGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST/2 ) * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL ZGERV2D( ICTXT, MAX_BW, MAX_BW, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * * * Since ZTBTRS has no "left-right" option, we must transpose * CALL ZLATCPY( 'N', MAX_BW, MAX_BW, AF( WORK_U+ODD_SIZE*BWL+ $ 2*MBW2+1), MAX_BW, WORK( 1 ), MAX_BW ) * CALL ZTBTRS( $ 'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), BWL, $ AF( ODD_SIZE*BWU+MBW2+1), MAX_BW+1, $ WORK( 1+MAX_BW*(MAX_BW-BWL) ), MAX_BW, INFO ) * * Transpose back * CALL ZLATCPY( $ 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+ $ 2*MBW2+1), MAX_BW ) * * * * Since ZTBTRS has no "left-right" option, we must transpose * CALL ZLATCPY( 'N', MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+ $ 2*MBW2+1), MAX_BW, WORK( 1 ), MAX_BW ) * CALL ZTBTRS( $ 'U', 'C', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), BWU, $ AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, MAX_BW-1 )), MAX_BW+1, $ WORK( 1+MAX_BW*(MAX_BW-BWU) ), MAX_BW, INFO ) * * Transpose back * CALL ZLATCPY( $ 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ AF( ODD_SIZE*BWU+ $ 2*MBW2+1), MAX_BW ) * * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL ZGEMM( 'N', 'C', MAX_BW, MAX_BW, MAX_BW, -CONE, $ AF( (ODD_SIZE)*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+(ODD_SIZE)*BWL+2*MBW2+1 ), MAX_BW, $ CZERO, WORK( 1 ), MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL ZGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, $ AF( ODD_SIZE*BWU+1 ), MAX_BW, CZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to offdiagonal block's owning processor. * CALL ZGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ 0, COMM_PROC ) * CALL ZGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, $ AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, CZERO, $ WORK( 1 ), MAX_BW ) * * Send contribution to offdiagonal block's owning processor. * CALL ZGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * * 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PZDBTRF * END scalapack-2.0.2/SRC/pzdbtrs.f000644 000766 000024 00000064617 10363532303 016213 0ustar00juliestaff000000 000000 SUBROUTINE PZDBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 7, 2001 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PZDBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PZDBTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * banded diagonally dominant-like distributed * matrix with bandwidth BWL, BWU. * * Routine PZDBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZDBTRF and this is stored in AF. If a linear system * is to be solved using PZDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (max(bwl,bwu)*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NP_SAVE, PART_OFFSET, $ RETURN_CODE, STORE_M_B, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, PXERBLA, $ PZDBTRSV, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -5 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PZDBTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PZDBTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ (MAX(BWL,BWU)*NRHS) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, $ 'PZDBTRS: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZDBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( TRANS, 'N' ) ) THEN * CALL PZDBTRSV( 'L', 'N', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PZDBTRSV( 'U', 'C', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ENDIF * * Call backsolve routine * IF( LSAME( TRANS, 'C' ) ) THEN * CALL PZDBTRSV( 'L', 'C', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PZDBTRSV( 'U', 'N', N, BWL, BWU, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ENDIF 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PZDBTRS * END scalapack-2.0.2/SRC/pzdbtrsv.f000644 000766 000024 00000145137 11750130340 016372 0ustar00juliestaff000000 000000 SUBROUTINE PZDBTRSV( UPLO, TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PZDBTRSV solves a banded triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a banded * triangular matrix factor produced by the * Gaussian elimination code PZ@(dom_pre)BTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H * is dictated by the user by the parameter TRANS. * * Routine PZDBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bwl+bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZDBTRF and this is stored in AF. If a linear system * is to be solved using PZDBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (max(bwl,bwu)*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*MAX(BWL,BWU) * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MAX_BW, MBW2, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 18, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ DESC_CONVERT, GLOBCHK, PXERBLA, RESHAPE, ZGEMM, $ ZGERV2D, ZGESD2D, ZLAMOV, ZMATADD, ZTBTRS, $ ZTRMM, ZTRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 9*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 12*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 12*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 12*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 12*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * * Size of separator blocks is maximum of bandwidths * MAX_BW = MAX(BWL,BWU) MBW2 = MAX_BW * MAX_BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -2 END IF * IF( LWORK .LT. -1) THEN INFO = -16 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -3 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 9*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -4 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -5 ENDIF * IF( LLDA .LT. (BWL+BWU+1) ) THEN INFO = -( 9*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 9*100 + 4 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 12*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 12*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -6 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -8 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 9*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PZDBTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN INFO = -( 9*100+4 ) CALL PXERBLA( ICTXT, $ 'PZDBTRSV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ MAX(BWL,BWU)*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, $ 'PZDBTRSV: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 18, 1 ) = DESCB(5) PARAM_CHECK( 17, 1 ) = DESCB(4) PARAM_CHECK( 16, 1 ) = DESCB(3) PARAM_CHECK( 15, 1 ) = DESCB(2) PARAM_CHECK( 14, 1 ) = DESCB(1) PARAM_CHECK( 13, 1 ) = IB PARAM_CHECK( 12, 1 ) = DESCA(5) PARAM_CHECK( 11, 1 ) = DESCA(4) PARAM_CHECK( 10, 1 ) = DESCA(3) PARAM_CHECK( 9, 1 ) = DESCA(1) PARAM_CHECK( 8, 1 ) = JA PARAM_CHECK( 7, 1 ) = NRHS PARAM_CHECK( 6, 1 ) = BWU PARAM_CHECK( 5, 1 ) = BWL PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 18, 2 ) = 1205 PARAM_CHECK( 17, 2 ) = 1204 PARAM_CHECK( 16, 2 ) = 1203 PARAM_CHECK( 15, 2 ) = 1202 PARAM_CHECK( 14, 2 ) = 1201 PARAM_CHECK( 13, 2 ) = 11 PARAM_CHECK( 12, 2 ) = 905 PARAM_CHECK( 11, 2 ) = 904 PARAM_CHECK( 10, 2 ) = 903 PARAM_CHECK( 9, 2 ) = 901 PARAM_CHECK( 8, 2 ) = 8 PARAM_CHECK( 7, 2 ) = 6 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 16 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 18, PARAM_CHECK, 18, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZDBTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - MAX_BW ENDIF * * Offset to workspace for Upper triangular factor * WORK_U = BWU*ODD_SIZE + 3*MBW2 * * * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * IF ( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL ZTBTRS( UPLO, 'N', 'U', ODD_SIZE, $ BWL, NRHS, $ A( OFST+1+BWU ), LLDA, $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL ZLAMOV( 'N', BWL, NRHS, $ B( PART_OFFSET+ODD_SIZE-BWL+1), LLDB, $ WORK( 1 ), MAX_BW ) * CALL ZTRMM( 'L', 'U', 'N', 'N', BWL, NRHS, -CONE, $ A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )), $ LLDA-1, WORK( 1 ), MAX_BW ) * CALL ZMATADD( BWL, NRHS, CONE, WORK( 1 ), MAX_BW, $ CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * Clear garbage out of workspace block * DO 10 IDUM1=1, WORK_SIZE_MIN WORK( IDUM1 )=0.0 10 CONTINUE * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL ZGEMM( 'C', 'N', BWU, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+MAX_BW-BWU ), MAX_BW ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL ZMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * CALL ZMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * CALL ZMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL ZTBTRS( 'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), NRHS, $ AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'C', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( (ODD_SIZE)*BWU+1 ), $ MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL ZGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 27 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 26 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 27 26 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL ZGEMM( 'C', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( ODD_SIZE*BWU+2*MBW2+1 ), $ MAX_BW, $ WORK( 1 ), $ MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( (ODD_SIZE)*BWU+1 ), $ MAX_BW, $ WORK( 1 ), $ MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL ZTBTRS( 'L', 'C', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), NRHS, $ AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 22 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 21 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 22 21 CONTINUE * [End of GOTO Loop] * 24 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL ZGEMM( 'N', 'N', ODD_SIZE, NRHS, BWU, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+MAX_BW-BWU ), MAX_BW, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL ZLAMOV( 'N', BWL, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB, $ WORK( 1+MAX_BW-BWL ), MAX_BW ) * CALL ZTRMM( 'L', 'U', 'C', 'N', BWL, NRHS, -CONE, $ A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )), $ LLDA-1, WORK( 1+MAX_BW-BWL ), MAX_BW ) * CALL ZMATADD( BWL, NRHS, CONE, WORK( 1+MAX_BW-BWL ), MAX_BW, $ CONE, B( PART_OFFSET+ODD_SIZE-BWL+1 ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL ZTBTRS( UPLO, 'C', 'U', ODD_SIZE, $ BWL, NRHS, $ A( OFST+1+BWU ), $ LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF ( LSAME( TRANS, 'C' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL ZTBTRS( UPLO, 'C', 'N', ODD_SIZE, $ BWU, NRHS, $ A( OFST+1 ), LLDA, $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL ZLAMOV( 'N', BWU, NRHS, $ B( PART_OFFSET+ODD_SIZE-BWU+1), LLDB, $ WORK( 1 ), MAX_BW ) * CALL ZTRMM( 'L', 'L', 'C', 'N', BWU, NRHS, -CONE, $ A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1, WORK( 1 ), $ MAX_BW ) * CALL ZMATADD( BWU, NRHS, CONE, WORK( 1 ), MAX_BW, $ CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * Clear garbage out of workspace block * DO 20 IDUM1=1, WORK_SIZE_MIN WORK( IDUM1 )=0.0 20 CONTINUE * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL ZGEMM( 'C', 'N', BWL, NRHS, ODD_SIZE, -CONE, $ AF( WORK_U+1 ), ODD_SIZE, B( PART_OFFSET+1 ), $ LLDB, CZERO, WORK( 1+MAX_BW-BWL ), MAX_BW ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL ZMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 44 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 42 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 41 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * CALL ZMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * CALL ZMATADD( MAX_BW, NRHS, CONE, $ WORK( 1 ), MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 42 41 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL ZTBTRS( 'U', 'C', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), NRHS, $ AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, MAX_BW-1 ) ), $ MAX_BW+1, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'C', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( WORK_U+(ODD_SIZE)*BWL+1 ), $ MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL ZGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), $ MAX_BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ MAX_BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 44 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 54 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 57 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 56 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 57 56 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL ZGEMM( 'C', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), $ MAX_BW, $ WORK( 1 ), $ MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), $ MAX_BW, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'N', 'N', MAX_BW, NRHS, MAX_BW, -CONE, $ AF( WORK_U+(ODD_SIZE)*BWL+1 ), $ MAX_BW, $ WORK( 1 ), $ MAX_BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL ZTBTRS( 'U', 'N', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), NRHS, $ AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, MAX_BW-1 ) ), $ MAX_BW+1, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 52 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 51 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 52 51 CONTINUE * [End of GOTO Loop] * 54 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGESD2D( ICTXT, MAX_BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGERV2D( ICTXT, MAX_BW, NRHS, $ WORK( 1 ), MAX_BW, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL ZGEMM( 'N', 'N', ODD_SIZE, NRHS, BWL, -CONE, $ AF( WORK_U+1 ), ODD_SIZE, WORK( 1+MAX_BW-BWL ), $ MAX_BW, CONE, B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL ZLAMOV( 'N', BWU, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB, $ WORK( 1+MAX_BW-BWU ), MAX_BW+BWL ) * CALL ZTRMM( 'L', 'L', 'N', 'N', BWU, NRHS, -CONE, $ A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1, $ WORK( 1+MAX_BW-BWU ), MAX_BW+BWL ) * CALL ZMATADD( BWU, NRHS, CONE, WORK( 1+MAX_BW-BWU ), $ MAX_BW+BWL, CONE, $ B( PART_OFFSET+ODD_SIZE-BWU+1 ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL ZTBTRS( UPLO, 'N', 'N', ODD_SIZE, $ BWU, NRHS, $ A( OFST+1 ), $ LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ENDIF * End of "IF( LSAME( UPLO, 'L' ) )"... 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PZDBTRSV * END scalapack-2.0.2/SRC/pzdotc.c000644 000766 000024 00000052071 10363532303 016012 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * Mark R. Fahey * August 2000 * This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0 * which fixes a bug in the incx=1 and incy=1 case. * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzdotc_( n, dotc, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy ) /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; complex16 * dotc; /* .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZDOTC forms the dot product of two distributed vectors, * * dotc := sub( X )**H * sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * * Parameters * ========== * * N (global input) pointer to INTEGER * The length of the distributed vectors to be multiplied. * N >= 0. * * DOTC (local output) pointer to COMPLEX*16 * The dot product of sub( X ) and sub( Y ) only in their scope. * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * ===================================================================== * * .. Local Scalars .. */ char * cbtop, * cctop, * rbtop, * rctop; int ictxt, iix, iiy, info, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, mone=-1, mycol, myrow, nn, np, np0, nprow, npcol, nq, nz, ione=1, tmp1, wksz; complex16 xwork[1], ywork[1], zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgebr2d_(); void zgebs2d_(); void zgerv2d_(); void zgesd2d_(); void zgsum2d_(); void pbchkvect(); void pberror_(); char * getpbbuf(); char * ptop(); F_VOID_FCT pbztrnv_(); F_VOID_FCT zzdotc_(); F_INTG_FCT ilcm_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(600+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -10; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -9; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1100+MB_+1); } else { info = -12; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -10; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -9; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1100+MB_+1); } else { info = -12; } } else { info = -7; } } if( ictxt != desc_Y[CTXT_] ) info = -(1100+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZDOTC", &info ); return; } /* * Quick return if possible. */ dotc->re = ZERO; dotc->im = ZERO; zero.re = ZERO; zero.im = ZERO; if( *n == 0 ) return; /* * dot <- x^{h} * y */ if( *n == 1 ) { if( ( myrow == ixrow ) && ( mycol == ixcol ) ) { buff = &X[iix-1+(jjx-1)*desc_X[LLD_]]; if( ( myrow != iyrow ) || ( mycol != iycol ) ) { zgesd2d_( &ictxt, n, n, buff, n, &iyrow, &iycol ); zgerv2d_( &ictxt, n, n, ywork, n, &iyrow, &iycol ); } else *ywork = Y[iiy-1+(jjy-1)*desc_Y[LLD_]]; zzdotc_( n, dotc, buff, n, ywork, n ); } else if( ( myrow == iyrow ) && ( mycol == iycol ) ) { zgesd2d_( &ictxt, n, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, &ixrow, &ixcol ); zgerv2d_( &ictxt, n, n, xwork, n, &ixrow, &ixcol ); zzdotc_( n, dotc, xwork, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n ); } if( ( *incx == desc_X[M_] ) && ( desc_X[M_] != 1 ) ) { if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &ixcol ); } } } else if( ( *incx == 1 ) && ( desc_X[M_] != 1 ) ) { if( mycol == ixcol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == ixrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione, &ixrow, &mycol ); } } } if( ( *incy == desc_Y[M_] ) && ( desc_Y[M_] != 1 ) ) { if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &iycol ); } } } else if( ( *incy == 1 ) && ( desc_Y[M_] != 1 ) ) { if( mycol == iycol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == iyrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione, &iyrow, &mycol ); } } } return; } if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zzdotc_( &nq, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } else { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTC", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotc_( &nq, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } else if( myrow == iyrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTC", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotc_( &nq, dotc, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotc_( &np, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } else { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); buff = (complex16 *)getpbbuf( "PZDOTC", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &iycol ); zzdotc_( &np, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } else if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); buff = (complex16 *)getpbbuf( "PZDOTC", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zgesd2d_( &ictxt, &np, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &myrow, &ixcol ); zzdotc_( &np, dotc, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmp = lcm / nprow; nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_X[MB_], nprow ); tmp1 = np0 / desc_X[MB_]; wksz = MYROC0( tmp1, np0, desc_X[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTC", wksz*sizeof(complex16) ); if( mycol == iycol ) jjy -= nz; if( myrow == ixrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &zero, buff, &ione, &iyrow, &iycol, &ixrow, &ixcol, buff+np ); if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotc_( &np, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &ixcol ); } } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*jx-1) % desc_X[NB_]; nn = *n + nz; tmp1 = nn / desc_X[MB_]; np = numroc_( &nn, desc_Y+MB_, &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTC", wksz*sizeof(complex16) ); if( myrow == iyrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotc_( &np, dotc, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &iycol ); } } } } scalapack-2.0.2/SRC/pzdotu.c000644 000766 000024 00000052111 10363532303 016027 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * Mark R. Fahey * August 2000 * This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0 * which fixes a bug in the incx=1 and incy=1 case. * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzdotu_( n, dotu, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy ) /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; complex16 * dotu; /* .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZDOTU forms the dot product of two distributed vectors, * * dotu := sub( X )**T * sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * * Parameters * ========== * * N (global input) pointer to INTEGER * The length of the distributed vectors to be multiplied. * N >= 0. * * DOTU (local output) pointer to COMPLEX*16 * The dot product of sub( X ) and sub( Y ) only in their scope. * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * ===================================================================== * * .. Local Scalars .. */ char * cbtop, * cctop, * rbtop, * rctop; int ictxt, iix, iiy, info, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, mone=-1, mycol, myrow, nn, np, np0, nprow, npcol, nq, nz, ione=1, tmp1, wksz; complex16 xwork[1], ywork[1], zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgebr2d_(); void zgebs2d_(); void zgerv2d_(); void zgesd2d_(); void zgsum2d_(); void pbchkvect(); void pberror_(); char * getpbbuf(); char * ptop(); F_VOID_FCT pbztrnv_(); F_VOID_FCT zzdotu_(); F_INTG_FCT ilcm_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(600+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -10; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -9; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1100+MB_+1); } else { info = -12; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -10; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -9; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1100+MB_+1); } else { info = -12; } } else { info = -7; } } if( ictxt != desc_Y[CTXT_] ) info = -(1100+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZDOTU", &info ); return; } /* * Quick return if possible. */ dotu->re = ZERO; dotu->im = ZERO; zero.re = ZERO; zero.im = ZERO; if( *n == 0 ) return; /* * dot <- x^{t} * y */ if( *n == 1 ) { if( ( myrow == ixrow ) && ( mycol == ixcol ) ) { buff = &X[iix-1+(jjx-1)*desc_X[LLD_]]; if( ( myrow != iyrow ) || ( mycol != iycol ) ) { zgesd2d_( &ictxt, n, n, buff, n, &iyrow, &iycol ); zgerv2d_( &ictxt, n, n, ywork, n, &iyrow, &iycol ); } else *ywork = Y[iiy-1+(jjy-1)*desc_Y[LLD_]]; zzdotu_( n, dotu, buff, n, ywork, n ); } else if( ( myrow == iyrow ) && ( mycol == iycol ) ) { zgesd2d_( &ictxt, n, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, &ixrow, &ixcol ); zgerv2d_( &ictxt, n, n, xwork, n, &ixrow, &ixcol ); zzdotu_( n, dotu, xwork, n, &Y[iiy-1+(jjx-1)*desc_X[LLD_]], n ); } if( ( *incx == desc_X[M_] ) && ( desc_X[M_] != 1 ) ) { if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &ixcol ); } } } else if( ( *incx == 1 ) && ( desc_X[M_] != 1 ) ) { if( mycol == ixcol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == ixrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione, &ixrow, &mycol ); } } } if( ( *incy == desc_Y[M_] ) && ( desc_Y[M_] != 1 ) ) { if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &iycol ); } } } else if( ( *incy == 1 ) && ( desc_Y[M_] != 1 ) ) { if( mycol == iycol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == iyrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione, &iyrow, &mycol ); } } } return; } if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zzdotu_( &nq, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } else { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTU", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotu_( &nq, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } else if( myrow == iyrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTU", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotu_( &nq, dotu, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotu_( &np, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } else { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); buff = (complex16 *)getpbbuf( "PZDOTU", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &iycol ); zzdotu_( &np, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } else if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zgesd2d_( &ictxt, &np, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &myrow, &ixcol ); buff = (complex16 *)getpbbuf( "PZDOTU", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zzdotu_( &np, dotu, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmp = lcm / nprow; nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_X[MB_], nprow ); tmp1 = np0 / desc_X[MB_]; wksz = MYROC0( tmp1, np0, desc_X[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTU", wksz*sizeof(complex16) ); if( mycol == iycol ) jjy -= nz; if( myrow == ixrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &zero, buff, &ione, &iyrow, &iycol, &ixrow, &ixcol, buff+np ); if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotu_( &np, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &ixcol ); } } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*jx-1) % desc_X[NB_]; nn = *n + nz; tmp1 = nn / desc_X[MB_]; np = numroc_( &nn, desc_Y+MB_, &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTU", wksz*sizeof(complex16) ); if( myrow == iyrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotu_( &np, dotu, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &iycol ); } } } } scalapack-2.0.2/SRC/pzdrscl.f000644 000766 000024 00000016304 10363532303 016172 0ustar00juliestaff000000 000000 SUBROUTINE PZDRSCL( N, SA, SX, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N DOUBLE PRECISION SA * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX*16 SX( * ) * .. * * Purpose * ======= * * PZDRSCL multiplies an N-element complex distributed vector * sub( X ) by the real scalar 1/a. This is done without overflow or * underflow as long as the final sub( X )/a does not overflow or * underflow. * * where sub( X ) denotes X(IX:IX+N-1,JX:JX), if INCX = 1, * X(IX:IX,JX:JX+N-1), if INCX = M_X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) pointer to INTEGER * The number of components of the distributed vector sub( X ). * N >= 0. * * SA (global input) DOUBLE PRECISION * The scalar a which is used to divide each component of * sub( X ). SA must be >= 0, or the subroutine will divide by * zero. * * SX (local input/local output) COMPLEX*16 array * containing the local pieces of a distributed matrix of * dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL DONE INTEGER ICTXT, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION BIGNUM, CDEN, CDEN1, CNUM, CNUM1, MUL, SMLNUM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PDLABAD, PZDSCAL * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Get machine parameters * SMLNUM = PDLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM CALL PDLABAD( ICTXT, SMLNUM, BIGNUM ) * * Initialize the denominator to SA and the numerator to 1. * CDEN = SA CNUM = ONE * 10 CONTINUE CDEN1 = CDEN*SMLNUM CNUM1 = CNUM / BIGNUM IF( ABS( CDEN1 ).GT.ABS( CNUM ) .AND. CNUM.NE.ZERO ) THEN * * Pre-multiply sub( X ) by SMLNUM if CDEN is large compared to * CNUM. * MUL = SMLNUM DONE = .FALSE. CDEN = CDEN1 ELSE IF( ABS( CNUM1 ).GT.ABS( CDEN ) ) THEN * * Pre-multiply sub( X ) by BIGNUM if CDEN is small compared to * CNUM. * MUL = BIGNUM DONE = .FALSE. CNUM = CNUM1 ELSE * * Multiply sub( X ) by CNUM / CDEN and return. * MUL = CNUM / CDEN DONE = .TRUE. END IF * * Scale the vector sub( X ) by MUL * CALL PZDSCAL( N, MUL, SX, IX, JX, DESCX, INCX ) * IF( .NOT.DONE ) $ GO TO 10 * RETURN * * End of PZDRSCL * END scalapack-2.0.2/SRC/pzdtsv.f000644 000766 000024 00000046377 10363532303 016060 0ustar00juliestaff000000 000000 SUBROUTINE PZDTSV( N, NRHS, DL, D, DU, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 B( * ), D( * ), DL( * ), DU( * ), WORK( * ) * .. * * * Purpose * ======= * * PZDTSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * tridiagonal diagonally dominant-like distributed * matrix. * * Gaussian elimination without pivoting * is used to factor a reordering * of the matrix into L U. * * See PZDTTRF and PZDTTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (12*NPCOL+3*NB) * +max(10*NPCOL+4*NRHS, 8*NPCOL) * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PXERBLA, PZDTTRF, PZDTTRS * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PZDTTRF and PZDTTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSEIF( DESCA( DTYPE_ ) .EQ. 502 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PZDTSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (12*NPCOL+3*NB) * * Factor the matrix * CALL PZDTTRF( N, DL, D, DU, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PZDTSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PZDTTRS( 'N', N, NRHS, DL, D, DU, JA, DESCA, B, IB, DESCB, $ WORK, MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZDTSV', -INFO ) RETURN END IF * RETURN * * End of PZDTSV * END scalapack-2.0.2/SRC/pzdttrf.f000644 000766 000024 00000107437 11750130340 016212 0ustar00juliestaff000000 000000 SUBROUTINE PZDTTRF( N, DL, D, DU, JA, DESCA, AF, LAF, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 AF( * ), D( * ), DL( * ), DU( * ), WORK( * ) * .. * * * Purpose * ======= * * PZDTTRF computes a LU factorization * of an N-by-N complex tridiagonal * diagonally dominant-like distributed matrix * A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PZDTTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = L U * * where U is a tridiagonal upper triangular matrix and L is tridiagonal * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * DL (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZDTTRF and this is stored in AF. If a linear system * is to be solved using PZDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 8*NPCOL * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * diagonally dominant-like, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * stably factorable wo/interchanges, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN, $ LEVEL_DIST, LLDA, MYCOL, MYROW, MY_NUM_COLS, $ NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_N_A, $ TEMP, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ DESC_CONVERT, GLOBCHK, PXERBLA, RESHAPE, ZAXPY, $ ZGEMM, ZGERV2D, ZGESD2D, ZLAMOV, ZLATCPY, $ ZPBTRF, ZPOTRF, ZSYRK, ZTBTRS, ZTRMM, ZTRRV2D, $ ZTRSD2D, ZTRSM, ZTRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC COMPLEX*16 ZDOTC EXTERNAL LSAME, NUMROC, ZDOTC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LWORK .LT. -1) THEN INFO = -10 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -1 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, $ 'PZDTTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PZDTTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = (12*NPCOL+3*NB) * IF( LAF .LT. LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PZDTTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = 8*NPCOL * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, $ 'PZDTTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 7, 1 ) = DESCA(5) PARAM_CHECK( 6, 1 ) = DESCA(4) PARAM_CHECK( 5, 1 ) = DESCA(3) PARAM_CHECK( 4, 1 ) = DESCA(1) PARAM_CHECK( 3, 1 ) = JA PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 7, 2 ) = 605 PARAM_CHECK( 6, 2 ) = 604 PARAM_CHECK( 5, 2 ) = 603 PARAM_CHECK( 4, 2 ) = 601 PARAM_CHECK( 3, 2 ) = 5 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 10 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 7, PARAM_CHECK, 7, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZDTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * Offset to workspace for Upper triangular factor * WORK_U = INT_ONE*ODD_SIZE + 3 * * * Zero out space for fillin * DO 10 I=1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * IF ( MYCOL .LT. NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL ZTRSD2D( ICTXT, 'U', 'N', 1, 1, $ DU( PART_OFFSET+ODD_SIZE+1 ), LLDA-1, 0, $ MYCOL+1 ) * ENDIF * * * Factor main partition A_i = L_i {U_i} in each processor * CALL ZDTTRF( ODD_SIZE, DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1500 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * * Apply factorization to lower connection block BL_i * Apply factorization to upper connection block BU_i * * * Perform the triangular solve {U_i}^C{BL'}_i^C = {BL_i}^C * * DL( PART_OFFSET+ODD_SIZE+1 ) = $ ( DL( PART_OFFSET+ODD_SIZE+1 ) ) $ / ( D( PART_OFFSET+ODD_SIZE ) ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i} * * D( PART_OFFSET+ODD_SIZE+1 ) = D( PART_OFFSET+ODD_SIZE+1 )- $ DL( PART_OFFSET+ODD_SIZE+1 )*DU( PART_OFFSET+ODD_SIZE ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1500 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * * Move entry that causes spike to auxiliary storage * AF( WORK_U+1 ) = ( DL( PART_OFFSET+1 ) ) * IF (INFO.EQ.0) THEN * * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ . * CALL ZDTTRSV( 'L', 'N', ODD_SIZE, INT_ONE, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), AF( WORK_U+1 ), ODD_SIZE, $ INFO ) * * * Calculate the "spike" fillin, ${U_i}^C {{GL}_i}^C = {DU_i}^C$ * CALL ZTRRV2D( ICTXT, 'U', 'N', 1, 1, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * AF( 1 ) = DCONJG( AF( 1 ) ) * CALL ZDTTRSV( 'U', 'C', ODD_SIZE, INT_ONE, $ DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ), $ DU( PART_OFFSET+1 ), $ AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = GL_i{GU_i} * AF( ODD_SIZE+3 ) = -CONE * $ ZDOTC( ODD_SIZE, AF( 1 ), 1, AF( WORK_U+1 ), 1 ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+3 ), $ INT_ONE, 0, MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * AF( ODD_SIZE+1 ) = -CONE $ * DCONJG( DL( PART_OFFSET+ODD_SIZE+1 ) $ * AF( WORK_U+ODD_SIZE ) ) * * AF(WORK_U+(ODD_SIZE)+1 ) = -CONE $ * DU( PART_OFFSET+ODD_SIZE ) $ * DCONJG( AF( ODD_SIZE ) ) * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, $ AF( WORK_U+ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * ENDIF * * Copy last diagonal block into AF storage for subsequent * operations. * AF( ODD_SIZE+2 ) = $ DCMPLX( D( PART_OFFSET+ODD_SIZE+1 ) ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+AF( ODD_SIZE+3 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+WORK( 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+WORK( 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... IF( AF( ODD_SIZE+2 ) .EQ. CZERO ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * AF( WORK_U+ODD_SIZE+3 ) = AF( ODD_SIZE+1 ) * AF( ODD_SIZE+3 ) = AF( WORK_U+ODD_SIZE+1 ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( WORK_U+ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify lower off_diagonal block with diagonal block * * AF( ODD_SIZE+1 ) = AF( ODD_SIZE+1 ) $ / DCONJG( AF( ODD_SIZE+2 ) ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * WORK( 1 ) = -ONE*DCONJG( AF( ODD_SIZE+1 ) )* $ AF( WORK_U+(ODD_SIZE)+1 ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( WORK_U+ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST/2 ) * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * AF( ODD_SIZE+3 ) = AF( ODD_SIZE+3 ) $ / ( AF( ODD_SIZE+2 ) ) * * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 ) $ *DCONJG( AF( WORK_U+ODD_SIZE+3 ) ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * WORK( 1 ) = -ONE*AF( WORK_U+ODD_SIZE+3 ) $ * AF( ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 ) $ * AF( WORK_U+ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * * 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PZDTTRF * END scalapack-2.0.2/SRC/pzdttrs.f000644 000766 000024 00000067332 10363532303 016232 0ustar00juliestaff000000 000000 SUBROUTINE PZDTTRS( TRANS, N, NRHS, DL, D, DU, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 7, 2001 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 AF( * ), B( * ), D( * ), DL( * ), DU( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PZDTTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PZDTTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * tridiagonal diagonally dominant-like distributed * matrix. * * Routine PZDTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZDTTRF and this is stored in AF. If a linear system * is to be solved using PZDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 10*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ MY_NUM_COLS, NB, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, PART_OFFSET, PART_SIZE, $ RETURN_CODE, STORE_M_B, STORE_N_A, TEMP, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 15, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, PXERBLA, $ PZDTTRSV, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC COMPLEX*16 ZDOTC EXTERNAL LSAME, NUMROC, ZDOTC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -3 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PZDTTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PZDTTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ 10*NPCOL+4*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, $ 'PZDTTRS: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 15, 1 ) = DESCB(5) PARAM_CHECK( 14, 1 ) = DESCB(4) PARAM_CHECK( 13, 1 ) = DESCB(3) PARAM_CHECK( 12, 1 ) = DESCB(2) PARAM_CHECK( 11, 1 ) = DESCB(1) PARAM_CHECK( 10, 1 ) = IB PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = NRHS PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 15, 2 ) = 1105 PARAM_CHECK( 14, 2 ) = 1104 PARAM_CHECK( 13, 2 ) = 1103 PARAM_CHECK( 12, 2 ) = 1102 PARAM_CHECK( 11, 2 ) = 1101 PARAM_CHECK( 10, 2 ) = 10 PARAM_CHECK( 9, 2 ) = 805 PARAM_CHECK( 8, 2 ) = 804 PARAM_CHECK( 7, 2 ) = 803 PARAM_CHECK( 6, 2 ) = 801 PARAM_CHECK( 5, 2 ) = 7 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 15 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 15, PARAM_CHECK, 15, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZDTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( TRANS, 'N' ) ) THEN * CALL PZDTTRSV( 'L', 'N', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ELSE * CALL PZDTTRSV( 'U', 'C', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ENDIF * * Call backsolve routine * IF( LSAME( TRANS, 'C' ) ) THEN * CALL PZDTTRSV( 'L', 'C', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ELSE * CALL PZDTTRSV( 'U', 'N', N, NRHS, DL( PART_OFFSET+1 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), JA_NEW, $ DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, WORK, $ LWORK, INFO ) * ENDIF 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PZDTTRS * END scalapack-2.0.2/SRC/pzdttrsv.f000644 000766 000024 00000142127 11750130340 016410 0ustar00juliestaff000000 000000 SUBROUTINE PZDTTRSV( UPLO, TRANS, N, NRHS, DL, D, DU, JA, DESCA, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 AF( * ), B( * ), D( * ), DL( * ), DU( * ), $ WORK( * ) * .. * * * Purpose * ======= * * PZDTTRSV solves a tridiagonal triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a tridiagonal * triangular matrix factor produced by the * Gaussian elimination code PZ@(dom_pre)TTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H * is dictated by the user by the parameter TRANS. * * Routine PZDTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * DL (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the lower diagonal of the * matrix. Globally, DL(1) is not referenced, and DL must be * aligned with D. * Must be of size >= DESCA( NB_ ). * On exit, this array contains information containing the * factors of the matrix. * * D (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * DU (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZDTTRF and this is stored in AF. If a linear system * is to be solved using PZDTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * 2*(NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 10*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_M_B, STORE_N_A, $ TEMP, WORK_SIZE_MIN, WORK_U * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ DESC_CONVERT, GLOBCHK, PXERBLA, RESHAPE, ZGEMM, $ ZGERV2D, ZGESD2D, ZLAMOV, ZMATADD, ZTBTRS, $ ZTRMM, ZTRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC COMPLEX*16 ZDOTC EXTERNAL LSAME, NUMROC, ZDOTC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 9*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 12*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 12*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 12*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 12*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -2 END IF * IF( LWORK .LT. -1) THEN INFO = -16 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -3 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 9*100 + 6 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 12*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 12*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -4 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -8 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 9*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PZDTTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 9*100+4 ) CALL PXERBLA( ICTXT, $ 'PZDTTRSV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ INT_ONE*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, $ 'PZDTTRSV: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1205 PARAM_CHECK( 15, 2 ) = 1204 PARAM_CHECK( 14, 2 ) = 1203 PARAM_CHECK( 13, 2 ) = 1202 PARAM_CHECK( 12, 2 ) = 1201 PARAM_CHECK( 11, 2 ) = 11 PARAM_CHECK( 10, 2 ) = 905 PARAM_CHECK( 9, 2 ) = 904 PARAM_CHECK( 8, 2 ) = 903 PARAM_CHECK( 7, 2 ) = 901 PARAM_CHECK( 6, 2 ) = 8 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 16 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZDTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * Offset to workspace for Upper triangular factor * WORK_U = INT_ONE*ODD_SIZE + 3 * * * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * IF ( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL ZDTTRSV( UPLO, 'N', ODD_SIZE, NRHS, DL( PART_OFFSET+2 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL ZAXPY( NRHS, -DL( PART_OFFSET+ODD_SIZE+1 ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, ODD_SIZE, -CONE, $ AF( 1 ), ODD_SIZE, B( PART_OFFSET+1 ), LLDB, $ CZERO, WORK( 1+INT_ONE-INT_ONE ), INT_ONE ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL ZTBTRS( 'L', 'N', 'U', INT_ONE, MIN( INT_ONE, INT_ONE-1 ), $ NRHS, AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*INT_ONE+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL ZGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*INT_ONE+2+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 27 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 26 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 27 26 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*INT_ONE+2+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*INT_ONE+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL ZTBTRS( 'L', 'C', 'U', INT_ONE, MIN( INT_ONE, INT_ONE-1 ), $ NRHS, AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 22 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 21 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 22 21 CONTINUE * [End of GOTO Loop] * 24 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL ZGEMM( 'N', 'N', ODD_SIZE, NRHS, INT_ONE, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+INT_ONE-INT_ONE ), INT_ONE, $ CONE, B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL ZAXPY( NRHS, -DCONJG( DL( PART_OFFSET+ODD_SIZE+1 ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL ZDTTRSV( UPLO, 'C', ODD_SIZE, NRHS, DL( PART_OFFSET+2 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), $ B( PART_OFFSET+1 ), LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF ( LSAME( TRANS, 'C' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL ZDTTRSV( UPLO, 'C', ODD_SIZE, NRHS, DL( PART_OFFSET+2 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL ZAXPY( NRHS, -DCONJG( DU( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, ODD_SIZE, -CONE, $ AF( WORK_U+1 ), ODD_SIZE, B( PART_OFFSET+1 ), $ LLDB, CZERO, WORK( 1+INT_ONE-INT_ONE ), $ INT_ONE ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 44 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 42 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 41 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 42 41 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL ZTBTRS( 'U', 'C', 'N', INT_ONE, MIN( INT_ONE, INT_ONE-1 ), $ NRHS, AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+(ODD_SIZE)*INT_ONE+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL ZGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+ODD_SIZE*INT_ONE+2+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 44 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 54 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 57 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 56 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 57 56 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+ODD_SIZE*INT_ONE+2+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+(ODD_SIZE)*INT_ONE+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL ZTBTRS( 'U', 'N', 'N', INT_ONE, MIN( INT_ONE, INT_ONE-1 ), $ NRHS, AF( ODD_SIZE+2 ), INT_ONE+1, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 52 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 51 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 52 51 CONTINUE * [End of GOTO Loop] * 54 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL ZGEMM( 'N', 'N', ODD_SIZE, NRHS, INT_ONE, -CONE, $ AF( WORK_U+1 ), ODD_SIZE, $ WORK( 1+INT_ONE-INT_ONE ), INT_ONE, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL ZAXPY( NRHS, -( DU( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL ZDTTRSV( UPLO, 'N', ODD_SIZE, NRHS, DU( PART_OFFSET+2 ), $ D( PART_OFFSET+1 ), DU( PART_OFFSET+1 ), $ B( PART_OFFSET+1 ), LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ENDIF * End of "IF( LSAME( UPLO, 'L' ) )"... 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PZDTTRSV * END scalapack-2.0.2/SRC/pzgbsv.f000644 000766 000024 00000045630 10363532303 016030 0ustar00juliestaff000000 000000 SUBROUTINE PZGBSV( N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, B, IB, $ DESCB, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER BWL, BWU, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) COMPLEX*16 A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PZGBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * banded distributed * matrix with bandwidth BWL, BWU. * * Gaussian elimination with pivoting * is used to factor a reordering * of the matrix into P L U. * * See PZGBTRF and PZGBTRS for details. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * +max(NRHS*(NB+2*bwl+4*bwu), 1) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * nonsingular, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * nonsingular, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PXERBLA, PZGBTRF, PZGBTRS * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PZGBTRF and PZGBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PZGBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * * Factor the matrix * CALL PZGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PZGBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PZGBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, B, IB, $ DESCB, WORK, MIN( LWORK, WS_FACTOR ), $ WORK( 1+WS_FACTOR), LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGBSV', -INFO ) RETURN END IF * RETURN * * End of PZGBSV * END scalapack-2.0.2/SRC/pzgbtrf.f000644 000766 000024 00000107647 11750130340 016176 0ustar00juliestaff000000 000000 SUBROUTINE PZGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, AF, LAF, $ WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX*16 A( * ), AF( * ), WORK( * ) * .. * * * Purpose * ======= * * PZGBTRF computes a LU factorization * of an N-by-N complex banded * distributed matrix * with bandwidth BWL, BWU: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PZGBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) Q = L U * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P and Q are permutation matrices. * The matrix Q represents reordering of columns * for parallelism's sake, while P represents * reordering of rows for numerical stability using * classic partial pivoting. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZGBTRF and this is stored in AF. If a linear system * is to be solved using PZGBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 1 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * nonsingular, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * nonsingular, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Implemented for ScaLAPACK by: * Andrew J. Cleary, Livermore National Lab and University of Tenn., * and Markus Hegland, Australian Natonal University. Feb., 1997. * Based on code written by : Peter Arbenz, ETH Zurich, 1996. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER APTR, BBPTR, BIPTR, BM, BM1, BM2, BMN, BN, BW, $ CSRC, DBPTR, FIRST_PROC, I, ICTXT, ICTXT_NEW, $ ICTXT_SAVE, IDUM3, J, JA_NEW, JPTR, L, LAF_MIN, $ LBWL, LBWU, LDB, LDBB, LLDA, LM, LMJ, LN, LNJ, $ LPTR, MYCOL, MYROW, MY_NUM_COLS, NB, NEICOL, $ NP, NPACT, NPCOL, NPROW, NPSTR, NP_SAVE, NRHS, $ ODD_N, ODD_SIZE, ODPTR, OFST, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT, $ GLOBCHK, PXERBLA, RESHAPE, ZAXPY, ZGEMM, $ ZGERV2D, ZGESD2D, ZLAMOV, ZLATCPY, ZPBTRF, $ ZPOTRF, ZSYRK, ZTBTRS, ZTRMM, ZTRRV2D, ZTRSD2D, $ ZTRSM, ZTRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LWORK .LT. -1) THEN INFO = -11 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -1 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -2 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (2*BWL+2*BWU+1) ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 6*100 + 4 ) ENDIF * BW = BWU+BWL * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, $ 'PZGBTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.(BWL+BWU+1) )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PZGBTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = (NB+BWU)*(BWL+BWU)+6*(BWL+BWU)*(BWL+2*BWU) * IF( LAF .LT. LAF_MIN ) THEN INFO = -9 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PZGBTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = 1 * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -11 * put minimum value of work into work( 1 ) WORK( 1 ) = WORK_SIZE_MIN CALL PXERBLA( ICTXT, $ 'PZGBTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BWU PARAM_CHECK( 3, 1 ) = BWL PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 11 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZGBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * * Zero out space for fillin * DO 10 I = 1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * DO 9 J = 1, ODD_SIZE DO 8 I = 1, BW A( I+(J-1)*LLDA ) = CZERO 8 CONTINUE 9 CONTINUE * * Begin main code * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Transfer triangle B_i of local matrix to next processor * for fillin. Overlap the send with the factorization of A_i. * IF (MYCOL .LE. NPCOL-2) THEN * * The last processor does not need to send anything. * BIPTR = location of triangle B_i in memory BIPTR = (NB-BW)*LLDA + 2*BW+1 * CALL ZTRSD2D( ICTXT, 'U', 'N', $ MIN( BW, BWU+NUMROC( N, NB, MYCOL+1, 0, NPCOL ) ), $ BW, A(BIPTR), LLDA-1, 0, MYCOL+1) * ENDIF * * Factor main partition P_i A_i = L_i U_i on each processor * * LBWL, LBWU: lower and upper bandwidth of local solver * Note that for MYCOL > 0 one has lower triangular blocks! * LM is the number of rows which is usually NB except for * MYCOL = 0 where it is BWU less and MYCOL=NPCOL-1 where it * is NR+BWU where NR is the number of columns on the last processor * Finally APTR is the pointer to the first element of A. As LAPACK * has a slightly different matrix format than Scalapack the pointer * has to be adjusted on processor MYCOL=0. * IF (MYCOL .NE. 0) THEN LBWL = BW LBWU = 0 APTR = 1 ELSE LBWL = BWL LBWU = BWU APTR = 1+BWU ENDIF * IF (MYCOL .NE. NPCOL-1) THEN LM = NB - LBWU LN = NB - BW ELSE IF (MYCOL .NE. 0) THEN LM = ODD_SIZE + BWU LN = MAX(ODD_SIZE-BW,0) ELSE LM = N LN = MAX( N-BW, 0 ) ENDIF * IF (LN .GT. 0) THEN * CALL ZGBTRF(LM,LN, LBWL,LBWU, A(APTR),LLDA, IPIV, INFO) * IF( INFO.NE.0 ) THEN INFO = INFO + NB*MYCOL GO TO 90 END IF * NRHS = BW LDB = LLDA-1 * * Update the last BW columns of A_i (code modified from ZGBTRS) * * Only the eliminations of unknowns > LN-BW have an effect on * the last BW columns. Loop over them... * DO 23 J = MAX(LN-BW+1,1), LN * LMJ = MIN( LBWL, LM-J ) LNJ = MIN( BW, J+BW-LN+APTR-1 ) * L = IPIV( J ) * JPTR = J-(LN+1)+2*BW+1-LBWL + LN*LLDA * IF( L.NE.J ) THEN * * Element (L,LN+1) is swapped with element (J,LN+1) etc * Furthermore, the elements in the same row are LDB=LLDA-1 apart * The complicated formulas are to cope with the banded * data format: * LPTR = L-(LN+1)+2*BW+1-LBWL + LN*LLDA * CALL ZSWAP( LNJ, A(LPTR),LDB, A(JPTR), LDB ) * ENDIF * * LPTR is the pointer to the beginning of the * coefficients of L * LPTR = BW+1+APTR + (J-1)*LLDA * CALL ZGERU(LMJ,LNJ,-CONE, A(LPTR),1, A(JPTR),LDB, $ A(JPTR+1),LDB) 23 CONTINUE * ENDIF * * Compute spike fill-in, L_i F_i = P_i B_{i-1} * * Receive triangle B_{i-1} from previous processor * IF (MYCOL .GT. 0) THEN * CALL ZTRRV2D( ICTXT, 'U', 'N', MIN(BW, LM), BW, AF( 1 ), $ LM, 0, MYCOL-1) * * * Permutation and forward elimination (triang. solve) * DO 24 J = 1, LN * LMJ = MIN( LBWL, LM-J ) L = IPIV( J ) * IF( L .NE. J ) THEN * CALL ZSWAP(NRHS, AF(L), LM, AF(J), LM ) ENDIF * LPTR = BW+1+APTR + (J-1)*LLDA * CALL ZGERU( LMJ,NRHS, -CONE, A(LPTR),1, $ AF(J), LM, AF(J+1), LM) * 24 CONTINUE * ENDIF * 90 CONTINUE * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Define the initial dimensions of the diagonal blocks * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW * IF (MYCOL .NE. NPCOL-1) THEN BM = BW - LBWU BN = BW ELSE BM = MIN(BW,ODD_SIZE) + BWU BN = MIN(BW,ODD_SIZE) ENDIF * * Pointer to first element of block bidiagonal matrix in AF * Leading dimension of block bidiagonal system * BBPTR = (NB+BWU)*BW + 1 LDBB = 2*BW + BWU * * Copy from A and AF into block bidiagonal matrix (tail of AF) * * DBPTR = Pointer to diagonal blocks in A DBPTR = BW+1 + LBWU + LN*LLDA * CALL ZLAMOV('G',BM,BN, A(DBPTR),LLDA-1, $ AF(BBPTR + BW*LDBB),LDBB) * * Zero out any junk entries that were copied * DO 870 J=1, BM DO 880 I=J+LBWL, BM-1 AF( BBPTR+BW*LDBB+(J-1)*LDBB+I ) = CZERO 880 CONTINUE 870 CONTINUE * IF (MYCOL .NE. 0) THEN * * ODPTR = Pointer to offdiagonal blocks in A * ODPTR = LM-BM+1 CALL ZLAMOV('G',BM,BW, AF(ODPTR),LM, $ AF(BBPTR +2*BW*LDBB),LDBB) ENDIF * IF (NPCOL.EQ.1) THEN * * In this case the loop over the levels will not be * performed. CALL ZGETRF( N-LN, N-LN, AF(BBPTR+BW*LDBB), LDBB, $ IPIV(LN+1), INFO) * ENDIF * * Loop over levels ... only occurs if npcol > 1 * * The two integers NPACT (nu. of active processors) and NPSTR * (stride between active processors) are used to control the * loop. * NPACT = NPCOL NPSTR = 1 * * Begin loop over levels * 200 IF (NPACT .LE. 1) GOTO 300 * * Test if processor is active * IF (MOD(MYCOL,NPSTR) .EQ. 0) THEN * * Send/Receive blocks * * IF (MOD(MYCOL,2*NPSTR) .EQ. 0) THEN * * This node will potentially do more work later * NEICOL = MYCOL + NPSTR * IF (NEICOL/NPSTR .LT. NPACT-1) THEN BMN = BW ELSE IF (NEICOL/NPSTR .EQ. NPACT-1) THEN ODD_N = NUMROC(N, NB, NPCOL-1, 0, NPCOL) BMN = MIN(BW,ODD_N) + BWU ELSE * * Last processor skips to next level GOTO 250 ENDIF * * BM1 = M for 1st block on proc pair, BM2 2nd block * BM1 = BM BM2 = BMN * IF (NEICOL/NPSTR .LE. NPACT-1 )THEN * CALL ZGESD2D( ICTXT, BM, 2*BW, AF(BBPTR+BW*LDBB), $ LDBB, 0, NEICOL ) * CALL ZGERV2D( ICTXT, BMN, 2*BW, AF(BBPTR+BM), $ LDBB, 0, NEICOL) * IF( NPACT .EQ. 2 ) THEN * * Copy diagonal block to align whole system * CALL ZLAMOV( 'G', BMN, BW, AF( BBPTR+BM ), $ LDBB, AF( BBPTR+2*BW*LDBB+BM ), LDBB ) ENDIF * ENDIF * ELSE * * This node stops work after this stage -- an extra copy * is required to make the odd and even frontal matrices * look identical * NEICOL = MYCOL - NPSTR * IF (NEICOL .EQ. 0) THEN BMN = BW - BWU ELSE BMN = BW ENDIF * BM1 = BMN BM2 = BM * CALL ZGESD2D( ICTXT, BM, 2*BW, AF(BBPTR+BW*LDBB), $ LDBB, 0, NEICOL ) * CALL ZLAMOV('G',BM, 2*BW, AF(BBPTR+BW*LDBB),LDBB, $ AF(BBPTR+BMN),LDBB) * DO 31 J=BBPTR+2*BW*LDBB, BBPTR+3*BW*LDBB-1, LDBB DO 32 I=0, LDBB-1 AF(I+J) = CZERO 32 CONTINUE 31 CONTINUE * CALL ZGERV2D( ICTXT, BMN, 2*BW, AF(BBPTR+BW*LDBB), $ LDBB, 0, NEICOL) * IF( NPACT .EQ. 2 ) THEN * * Copy diagonal block to align whole system * CALL ZLAMOV( 'G', BM, BW, AF( BBPTR+BMN ), $ LDBB, AF( BBPTR+2*BW*LDBB+BMN ), LDBB ) ENDIF * ENDIF * * LU factorization with partial pivoting * IF (NPACT .NE. 2) THEN * CALL ZGETRF(BM+BMN, BW, AF(BBPTR+BW*LDBB), LDBB, $ IPIV(LN+1), INFO) * * Backsolve left side * DO 301 J=BBPTR,BBPTR+BW*LDBB-1, LDBB DO 302 I=0, BM1-1 AF(I+J) = CZERO 302 CONTINUE 301 CONTINUE * CALL ZLASWP(BW, AF(BBPTR), LDBB, 1, BW, $ IPIV(LN+1), 1) * CALL ZTRSM('L','L','N','U', BW, BW, CONE, $ AF(BBPTR+BW*LDBB), LDBB, AF(BBPTR), LDBB) * * Use partial factors to update remainder * CALL ZGEMM( 'N', 'N', BM+BMN-BW, BW, BW, $ -CONE, AF(BBPTR+BW*LDBB+BW), LDBB, $ AF( BBPTR ), LDBB, CONE, $ AF( BBPTR+BW ), LDBB ) * * Backsolve right side * NRHS = BW * CALL ZLASWP(NRHS, AF(BBPTR+2*BW*LDBB), LDBB, 1, BW, $ IPIV(LN+1), 1) * CALL ZTRSM('L','L','N','U', BW, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, AF(BBPTR+2*BW*LDBB), LDBB) * * Use partial factors to update remainder * CALL ZGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, $ -CONE, AF(BBPTR+BW*LDBB+BW), LDBB, $ AF( BBPTR+2*BW*LDBB ), LDBB, CONE, $ AF( BBPTR+2*BW*LDBB+BW ), LDBB ) * * * Test if processor is active in next round * IF (MOD(MYCOL,2*NPSTR) .EQ. 0) THEN * * Reset BM * BM = BM1+BM2-BW * * Local copying in the block bidiagonal area * * CALL ZLAMOV('G',BM,BW, $ AF(BBPTR+BW), $ LDBB, AF(BBPTR+BW*LDBB), LDBB) CALL ZLAMOV('G',BM,BW, $ AF(BBPTR+2*BW*LDBB+BW), $ LDBB, AF(BBPTR+2*BW*LDBB), LDBB) * * Zero out space that held original copy * DO 1020 J=0, BW-1 DO 1021 I=0, BM-1 AF(BBPTR+2*BW*LDBB+BW+J*LDBB+I) = CZERO 1021 CONTINUE 1020 CONTINUE * ENDIF * ELSE * * Factor the final 2 by 2 block matrix * CALL ZGETRF(BM+BMN,BM+BMN, AF(BBPTR+BW*LDBB), LDBB, $ IPIV(LN+1), INFO) ENDIF * ENDIF * * Last processor in an odd-sized NPACT skips to here * 250 CONTINUE * NPACT = (NPACT + 1)/2 NPSTR = NPSTR * 2 GOTO 200 * 300 CONTINUE * End loop over levels * 1000 CONTINUE * If error was found in Phase 1, processors jump here. * * Free BLACS space used to hold standard-form grid. * ICTXT = ICTXT_SAVE IF( ICTXT.NE.ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) END IF * 1234 CONTINUE * If this processor did not hold part of the grid it * jumps here. * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PZGBTRF * END * scalapack-2.0.2/SRC/pzgbtrs.f000644 000766 000024 00000112740 11750130340 016201 0ustar00juliestaff000000 000000 SUBROUTINE PZGBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV, $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER BWU, BWL, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV(*) COMPLEX*16 A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PZGBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PZGBTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * banded distributed * matrix with bandwidth BWL, BWU. * * Routine PZGBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BWL (global input) INTEGER * Number of subdiagonals. 0 <= BWL <= N-1 * * BWU (global input) INTEGER * Number of superdiagonals. 0 <= BWU <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N unsymmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * IPIV (local output) INTEGER array, dimension >= DESCA( NB ). * Pivot indices for local factorizations. * Users *should not* alter the contents between * factorization and solve. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZGBTRF and this is stored in AF. If a linear system * is to be solved using PZGBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * NRHS*(NB+2*bwl+4*bwu) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= (BWL+BWU)+1 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (max(bwl,bwu)* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Implemented for ScaLAPACK by: * Andrew J. Cleary, Livermore National Lab and University of Tenn., * and Marbwus Hegland, Australian Natonal University. Feb., 1997. * Based on code written by : Peter Arbenz, ETH Zurich, 1996. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC, $ FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM2, IDUM3, J, JA_NEW, L, LBWL, LBWU, LDBB, $ LDW, LLDA, LLDB, LM, LMJ, LN, LPTR, MYCOL, $ MYROW, NB, NEICOL, NP, NPACT, NPCOL, NPROW, $ NPSTR, NP_SAVE, ODD_SIZE, PART_OFFSET, $ RECOVERY_VAL, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN, WPTR * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, PXERBLA, $ RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -16 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BWL .GT. N-1 ) .OR. $ ( BWL .LT. 0 ) ) THEN INFO = -3 ENDIF * IF(( BWU .GT. N-1 ) .OR. $ ( BWU .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (2*BWL+2*BWU+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * BW = BWU+BWL * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -5 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PZGBTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.(BWL+BWU+1) )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PZGBTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check worksize * WORK_SIZE_MIN = NRHS*(NB+2*BWL+4*BWU) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -16 CALL PXERBLA( ICTXT, $ 'PZGBTRS: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BWU PARAM_CHECK( 4, 1 ) = BWL PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM2 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 16 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZGBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * * * Begin main code * * Move data into workspace - communicate/copy (overlap) * IF (MYCOL .LT. NPCOL-1) THEN CALL ZGESD2D( ICTXT, BWU, NRHS, B(NB-BWU+1), LLDB, $ 0, MYCOL + 1) ENDIF * IF (MYCOL .LT. NPCOL-1) THEN LM = NB-BWU ELSE LM = NB ENDIF * IF (MYCOL .GT. 0) THEN WPTR = BWU+1 ELSE WPTR = 1 ENDIF * LDW = NB+BWU + 2*BW+BWU * CALL ZLAMOV( 'G', LM, NRHS, B(1), LLDB, WORK( WPTR ), LDW ) * * Zero out rest of work * DO 1501 J=1, NRHS DO 1502 L=WPTR+LM, LDW WORK( (J-1)*LDW+L ) = CZERO 1502 CONTINUE 1501 CONTINUE * IF (MYCOL .GT. 0) THEN CALL ZGERV2D( ICTXT, BWU, NRHS, WORK(1), LDW, $ 0, MYCOL-1) ENDIF * ******************************************************************** * PHASE 1: Local computation phase -- Solve L*X = B ******************************************************************** * * Size of main (or odd) partition in each processor * ODD_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL ) * IF (MYCOL .NE. 0) THEN LBWL = BW LBWU = 0 APTR = 1 ELSE LBWL = BWL LBWU = BWU APTR = 1+BWU ENDIF * IF (MYCOL .NE. NPCOL-1) THEN LM = NB - LBWU LN = NB - BW ELSE IF (MYCOL .NE. 0) THEN LM = ODD_SIZE + BWU LN = MAX(ODD_SIZE-BW,0) ELSE LM = N LN = MAX( N-BW, 0 ) ENDIF * DO 21 J = 1, LN * LMJ = MIN(LBWL,LM-J) L = IPIV( J ) * IF( L.NE.J ) THEN CALL ZSWAP(NRHS, WORK(L), LDW, WORK(J), LDW) ENDIF * LPTR = BW+1 + (J-1)*LLDA + APTR * CALL ZGERU(LMJ,NRHS,-CONE, A(LPTR),1, WORK(J),LDW, $ WORK(J+1),LDW) * 21 CONTINUE * ******************************************************************** * PHASE 2: Global computation phase -- Solve L*X = B ******************************************************************** * * Define the initial dimensions of the diagonal blocks * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW * IF (MYCOL .NE. NPCOL-1) THEN BM = BW - LBWU BN = BW ELSE BM = MIN(BW,ODD_SIZE) + BWU BN = MIN(BW,ODD_SIZE) ENDIF * * Pointer to first element of block bidiagonal matrix in AF * Leading dimension of block bidiagonal system * BBPTR = (NB+BWU)*BW + 1 LDBB = 2*BW + BWU * IF (NPCOL.EQ.1) THEN * * In this case the loop over the levels will not be * performed. CALL ZGETRS( 'N', N-LN, NRHS, AF(BBPTR+BW*LDBB), LDBB, $ IPIV(LN+1), WORK( LN+1 ), LDW, INFO) * ENDIF * * Loop over levels ... * * The two integers NPACT (nu. of active processors) and NPSTR * (stride between active processors) is used to control the * loop. * NPACT = NPCOL NPSTR = 1 * * Begin loop over levels 200 IF (NPACT .LE. 1) GOTO 300 * * Test if processor is active IF (MOD(MYCOL,NPSTR) .EQ. 0) THEN * * Send/Receive blocks * IF (MOD(MYCOL,2*NPSTR) .EQ. 0) THEN * NEICOL = MYCOL + NPSTR * IF (NEICOL/NPSTR .LE. NPACT-1) THEN * IF (NEICOL/NPSTR .LT. NPACT-1) THEN BMN = BW ELSE BMN = MIN(BW,NUMROC(N, NB, NEICOL, 0, NPCOL))+BWU ENDIF * CALL ZGESD2D( ICTXT, BM, NRHS, $ WORK(LN+1), LDW, 0, NEICOL ) * IF( NPACT .NE. 2 )THEN * * Receive answers back from partner processor * CALL ZGERV2D(ICTXT, BM+BMN-BW, NRHS, $ WORK( LN+1 ), LDW, 0, NEICOL ) * BM = BM+BMN-BW * ENDIF * ENDIF * ELSE * NEICOL = MYCOL - NPSTR * IF (NEICOL .EQ. 0) THEN BMN = BW - BWU ELSE BMN = BW ENDIF * CALL ZLAMOV( 'G', BM, NRHS, WORK(LN+1), LDW, $ WORK(NB+BWU+BMN+1), LDW ) * CALL ZGERV2D( ICTXT, BMN, NRHS, WORK( NB+BWU+1 ), $ LDW, 0, NEICOL ) * * and do the permutations and eliminations * IF (NPACT .NE. 2) THEN * * Solve locally for BW variables * CALL ZLASWP( NRHS, WORK(NB+BWU+1), LDW, 1, BW, $ IPIV(LN+1), 1) * CALL ZTRSM('L','L','N','U', BW, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW) * * Use soln just calculated to update RHS * CALL ZGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, $ -CONE, AF(BBPTR+BW*LDBB+BW), LDBB, $ WORK(NB+BWU+1), LDW, $ CONE, WORK(NB+BWU+1+BW), LDW ) * * Give answers back to partner processor * CALL ZGESD2D( ICTXT, BM+BMN-BW, NRHS, $ WORK(NB+BWU+1+BW), LDW, 0, NEICOL ) * ELSE * * Finish up calculations for final level * CALL ZLASWP( NRHS, WORK(NB+BWU+1), LDW, 1, BM+BMN, $ IPIV(LN+1), 1) * CALL ZTRSM('L','L','N','U', BM+BMN, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW) ENDIF * ENDIF * NPACT = (NPACT + 1)/2 NPSTR = NPSTR * 2 GOTO 200 * ENDIF * 300 CONTINUE * * ************************************** * BACKSOLVE ******************************************************************** * PHASE 2: Global computation phase -- Solve U*Y = X ******************************************************************** * IF (NPCOL.EQ.1) THEN * * In this case the loop over the levels will not be * performed. * In fact, the backsolve portion was done in the call to * ZGETRS in the frontsolve. * ENDIF * * Compute variable needed to reverse loop structure in * reduced system. * RECOVERY_VAL = NPACT*NPSTR - NPCOL * * Loop over levels * Terminal values of NPACT and NPSTR from frontsolve are used * 2200 IF( NPACT .GE. NPCOL ) GOTO 2300 * NPSTR = NPSTR/2 * NPACT = NPACT*2 * * Have to adjust npact for non-power-of-2 * NPACT = NPACT-MOD( (RECOVERY_VAL/NPSTR), 2 ) * * Find size of submatrix in this proc at this level * IF( MYCOL/NPSTR .LT. NPACT-1 ) THEN BN = BW ELSE BN = MIN(BW, NUMROC(N, NB, NPCOL-1, 0, NPCOL) ) ENDIF * * If this processor is even in this level... * IF( MOD( MYCOL, 2*NPSTR ) .EQ. 0 ) THEN * NEICOL = MYCOL+NPSTR * IF( NEICOL/NPSTR .LE. NPACT-1 ) THEN * IF( NEICOL/NPSTR .LT. NPACT-1 ) THEN BMN = BW BNN = BW ELSE BMN = MIN(BW,NUMROC(N, NB, NEICOL, 0, NPCOL))+BWU BNN = MIN(BW, NUMROC(N, NB, NEICOL, 0, NPCOL) ) ENDIF * IF( NPACT .GT. 2 ) THEN * CALL ZGESD2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * CALL ZGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * ELSE * CALL ZGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * ENDIF * ENDIF * ELSE * This processor is odd on this level * NEICOL = MYCOL - NPSTR * IF (NEICOL .EQ. 0) THEN BMN = BW - BWU ELSE BMN = BW ENDIF * IF( NEICOL .LT. NPCOL-1 ) THEN BNN = BW ELSE BNN = MIN(BW, NUMROC(N, NB, NEICOL, 0, NPCOL) ) ENDIF * IF( NPACT .GT. 2 ) THEN * * Move RHS to make room for received solutions * CALL ZLAMOV( 'G', BW, NRHS, WORK(NB+BWU+1), $ LDW, WORK(NB+BWU+BW+1), LDW ) * CALL ZGERV2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), $ LDW, 0, NEICOL ) * CALL ZGEMM( 'N', 'N', BW, NRHS, BN, $ -CONE, AF(BBPTR), LDBB, $ WORK(LN+1), LDW, $ CONE, WORK(NB+BWU+BW+1), LDW ) * * IF( MYCOL .GT. NPSTR ) THEN * CALL ZGEMM( 'N', 'N', BW, NRHS, BW, $ -CONE, AF(BBPTR+2*BW*LDBB), LDBB, $ WORK(LN+BW+1), LDW, $ CONE, WORK(NB+BWU+BW+1), LDW ) * ENDIF * CALL ZTRSM('L','U','N','N', BW, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+BW+1), LDW) * * Send new solution to neighbor * CALL ZGESD2D( ICTXT, BW, NRHS, $ WORK( NB+BWU+BW+1 ), LDW, 0, NEICOL ) * * Copy new solution into expected place * CALL ZLAMOV( 'G', BW, NRHS, WORK(NB+BWU+1+BW), $ LDW, WORK(LN+BW+1), LDW ) * ELSE * * Solve with local diagonal block * CALL ZTRSM( 'L','U','N','N', BN+BNN, NRHS, CONE, $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW) * * Send new solution to neighbor * CALL ZGESD2D( ICTXT, BW, NRHS, $ WORK(NB+BWU+1), LDW, 0, NEICOL ) * * Shift solutions into expected positions * CALL ZLAMOV( 'G', BNN+BN-BW, NRHS, WORK(NB+BWU+1+BW), $ LDW, WORK(LN+1), LDW ) * * IF( (NB+BWU+1) .NE. (LN+1+BW) ) THEN * * Copy one row at a time since spaces may overlap * DO 1064 J=1, BW CALL ZCOPY( NRHS, WORK(NB+BWU+J), LDW, $ WORK(LN+BW+J), LDW ) 1064 CONTINUE * ENDIF * ENDIF * ENDIF * GOTO 2200 * 2300 CONTINUE * End of loop over levels * ******************************************************************** * PHASE 1: (Almost) Local computation phase -- Solve U*Y = X ******************************************************************** * * Reset BM to value it had before reduced system frontsolve... * IF (MYCOL .NE. NPCOL-1) THEN BM = BW - LBWU ELSE BM = MIN(BW,ODD_SIZE) + BWU ENDIF * * First metastep is to account for the fillin blocks AF * IF( MYCOL .LT. NPCOL-1 ) THEN * CALL ZGESD2D( ICTXT, BW, NRHS, WORK( NB-BW+1 ), $ LDW, 0, MYCOL+1 ) * ENDIF * IF( MYCOL .GT. 0 ) THEN * CALL ZGERV2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ), $ LDW, 0, MYCOL-1 ) * * Modify local right hand sides with received rhs's * CALL ZGEMM( 'N', 'N', LM-BM, NRHS, BW, -CONE, $ AF( 1 ), LM, WORK( NB+BWU+1 ), LDW, CONE, $ WORK( 1 ), LDW ) * ENDIF * DO 2021 J = LN, 1, -1 * LMJ = MIN( BW, ODD_SIZE-1 ) * LPTR = BW-1+J*LLDA+APTR * * In the following, the TRANS=T option is used to reverse * the order of multiplication, not as a true transpose * CALL ZGEMV( 'T', LMJ, NRHS, -CONE, WORK( J+1), LDW, $ A( LPTR ), LLDA-1, CONE, WORK( J ), LDW ) * * Divide by diagonal element * CALL ZSCAL( NRHS, CONE/A( LPTR-LLDA+1 ), $ WORK( J ), LDW ) 2021 CONTINUE * * * CALL ZLAMOV( 'G', ODD_SIZE, NRHS, WORK( 1 ), LDW, $ B( 1 ), LLDB ) * * Free BLACS space used to hold standard-form grid. * ICTXT = ICTXT_SAVE IF( ICTXT .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * NP = NP_SAVE * * Output worksize * WORK( 1 ) = WORK_SIZE_MIN * RETURN * * End of PZGBTRS * END scalapack-2.0.2/SRC/pzgebd2.f000644 000766 000024 00000043615 10363532303 016053 0ustar00juliestaff000000 000000 SUBROUTINE PZGEBD2( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION D( * ), E( * ) COMPLEX*16 A( * ), TAUP( * ), TAUQ( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEBD2 reduces a complex general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper or lower bidiagonal * form B by an unitary transformation: Q' * sub( A ) * P = B. * * If M >= N, B is upper bidiagonal; if M < N, B is lower bidiagonal. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ). On exit, if M >= N, * the diagonal and the first superdiagonal of sub( A ) are * overwritten with the upper bidiagonal matrix B; the elements * below the diagonal, with the array TAUQ, represent the * unitary matrix Q as a product of elementary reflectors, and * the elements above the first superdiagonal, with the array * TAUP, represent the orthogonal matrix P as a product of * elementary reflectors. If M < N, the diagonal and the first * subdiagonal are overwritten with the lower bidiagonal * matrix B; the elements below the first subdiagonal, with the * array TAUQ, represent the unitary matrix Q as a product of * elementary reflectors, and the elements above the diagonal, * with the array TAUP, represent the orthogonal matrix P as a * product of elementary reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) COMPLEX*16 array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUQ is * tied to the distributed matrix A. See Further Details. * * TAUP (local output) COMPLEX*16 array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix P. TAUP is * tied to the distributed matrix A. See Further Details. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( MpA0, NqA0 ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ) * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+IROFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * If m >= n, * * Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors; * v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1); * u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in * A(ia+i-1,ja+i+1:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, * * Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors; * v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); * u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in * A(ia+i-1,ja+i:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) * ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) * ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) * ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) * ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) * ( v1 v2 v3 v4 v5 ) * * where d and e denote diagonal and off-diagonal elements of B, vi * denotes an element of the vector defining H(i), and ui an element of * the vector defining G(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, II, IROFFA, J, $ JJ, K, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 COMPLEX*16 ALPHA * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, DESCSET, $ DGEBR2D, DGEBS2D, INFOG2L, PXERBLA, $ PDELSET, PZELSET, PZLACGV, PZLARF, $ PZLARFC, PZLARFG, ZGEBR2D, ZGEBS2D, $ ZLARFG * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) LWMIN = MAX( MPA0, NQA0 ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IROFFA.NE.ICOFFA ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEBD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) * IF( M.EQ.1 .AND. N.EQ.1 ) THEN IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN I = II+(JJ-1)*DESCA( LLD_ ) CALL ZLARFG( 1, A( I ), A( I ), 1, TAUQ( JJ ) ) D( JJ ) = DBLE( A( I ) ) CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, D( JJ ), $ 1 ) CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TAUQ( JJ ), $ 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, D( JJ ), $ 1, IAROW, IACOL ) CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAUQ( JJ ), $ 1, IAROW, IACOL ) END IF END IF IF( MYROW.EQ.IAROW ) $ TAUP( II ) = ZERO RETURN END IF * ALPHA = ZERO * IF( M.GE.N ) THEN * * Reduce to upper bidiagonal form * CALL DESCSET( DESCD, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) DO 10 K = 1, N I = IA + K - 1 J = JA + K - 1 * * Generate elementary reflector H(j) to annihilate * A(ia+i:ia+m-1,j) * CALL PZLARFG( M-K+1, ALPHA, I, J, A, MIN( I+1, M+IA-1 ), $ J, DESCA, 1, TAUQ ) CALL PDELSET( D, 1, J, DESCD, DBLE( ALPHA ) ) CALL PZELSET( A, I, J, DESCA, ONE ) * * Apply H(i) to A(i:ia+m-1,i+1:ja+n-1) from the left * CALL PZLARFC( 'Left', M-K+1, N-K, A, I, J, DESCA, 1, TAUQ, $ A, I, J+1, DESCA, WORK ) CALL PZELSET( A, I, J, DESCA, DCMPLX( DBLE( ALPHA ) ) ) * IF( K.LT.N ) THEN * * Generate elementary reflector G(i) to annihilate * A(i,ja+j+1:ja+n-1) * CALL PZLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PZLARFG( N-K, ALPHA, I, J+1, A, I, $ MIN( J+2, JA+N-1 ), DESCA, DESCA( M_ ), $ TAUP ) CALL PDELSET( E, I, 1, DESCE, DBLE( ALPHA ) ) CALL PZELSET( A, I, J+1, DESCA, ONE ) * * Apply G(i) to A(i+1:ia+m-1,i+1:ja+n-1) from the right * CALL PZLARF( 'Right', M-K, N-K, A, I, J+1, DESCA, $ DESCA( M_ ), TAUP, A, I+1, J+1, DESCA, $ WORK ) CALL PZELSET( A, I, J+1, DESCA, DCMPLX( DBLE( ALPHA ) ) ) CALL PZLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) ELSE CALL PZELSET( TAUP, I, 1, DESCE, ZERO ) END IF 10 CONTINUE * ELSE * * Reduce to lower bidiagonal form * CALL DESCSET( DESCD, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) DO 20 K = 1, M I = IA + K - 1 J = JA + K - 1 * * Generate elementary reflector G(i) to annihilate * A(i,ja+j:ja+n-1) * CALL PZLACGV( N-K+1, A, I, J, DESCA, DESCA( M_ ) ) CALL PZLARFG( N-K+1, ALPHA, I, J, A, I, $ MIN( J+1, JA+N-1 ), DESCA, DESCA( M_ ), TAUP ) CALL PDELSET( D, I, 1, DESCD, DBLE( ALPHA ) ) CALL PZELSET( A, I, J, DESCA, ONE ) * * Apply G(i) to A(i:ia+m-1,j:ja+n-1) from the right * CALL PZLARF( 'Right', M-K, N-K+1, A, I, J, DESCA, $ DESCA( M_ ), TAUP, A, MIN( I+1, IA+M-1 ), J, $ DESCA, WORK ) CALL PZELSET( A, I, J, DESCA, DCMPLX( DBLE( ALPHA ) ) ) CALL PZLACGV( N-K+1, A, I, J, DESCA, DESCA( M_ ) ) * IF( K.LT.M ) THEN * * Generate elementary reflector H(i) to annihilate * A(i+2:ia+m-1,j) * CALL PZLARFG( M-K, ALPHA, I+1, J, A, $ MIN( I+2, IA+M-1 ), J, DESCA, 1, TAUQ ) CALL PDELSET( E, 1, J, DESCE, DBLE( ALPHA ) ) CALL PZELSET( A, I+1, J, DESCA, ONE ) * * Apply H(i) to A(i+1:ia+m-1,j+1:ja+n-1) from the left * CALL PZLARFC( 'Left', M-K, N-K, A, I+1, J, DESCA, 1, $ TAUQ, A, I+1, J+1, DESCA, WORK ) CALL PZELSET( A, I+1, J, DESCA, DCMPLX( DBLE( ALPHA ) ) ) ELSE CALL PZELSET( TAUQ, 1, J, DESCE, ZERO ) END IF 20 CONTINUE END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGEBD2 * END scalapack-2.0.2/SRC/pzgebrd.f000644 000766 000024 00000040254 10363532303 016147 0ustar00juliestaff000000 000000 SUBROUTINE PZGEBRD( M, N, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION D( * ), E( * ) COMPLEX*16 A( * ), TAUP( * ), TAUQ( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEBRD reduces a complex general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper or lower bidiagonal * form B by an unitary transformation: Q' * sub( A ) * P = B. * * If M >= N, B is upper bidiagonal; if M < N, B is lower bidiagonal. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ). On exit, if M >= N, * the diagonal and the first superdiagonal of sub( A ) are * overwritten with the upper bidiagonal matrix B; the elements * below the diagonal, with the array TAUQ, represent the * unitary matrix Q as a product of elementary reflectors, and * the elements above the first superdiagonal, with the array * TAUP, represent the orthogonal matrix P as a product of * elementary reflectors. If M < N, the diagonal and the first * subdiagonal are overwritten with the lower bidiagonal * matrix B; the elements below the first subdiagonal, with the * array TAUQ, represent the unitary matrix Q as a product of * elementary reflectors, and the elements above the diagonal, * with the array TAUP, represent the orthogonal matrix P as a * product of elementary reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; * if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) COMPLEX*16 array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUQ is * tied to the distributed matrix A. See Further Details. * * TAUP (local output) COMPLEX*16 array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix P. TAUP is * tied to the distributed matrix A. See Further Details. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB*( MpA0 + NqA0 + 1 ) + NqA0 * * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, NB, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * If m >= n, * * Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors; * v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1); * u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in * A(ia+i-1,ja+i+1:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, * * Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors; * v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); * u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in * A(ia+i-1,ja+i:ja+n-1); * tauq is stored in TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) * ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) * ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) * ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) * ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) * ( v1 v2 v3 v4 v5 ) * * where d and e denote diagonal and off-diagonal elements of B, vi * denotes an element of the vector defining H(i), and ui an element of * the vector defining G(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICTXT, IINFO, IOFF, IPW, IPY, $ IW, J, JB, JS, JW, K, L, LWMIN, MN, MP, MYCOL, $ MYROW, NB, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCWX( DLEN_ ), DESCWY( DLEN_ ), IDUM1( 1 ), $ IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZGEBD2, PZGEMM, PZLABRD * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCA( MB_ ) IOFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) MP = NUMROC( M+IOFF, NB, MYROW, IAROW, NPROW ) NQ = NUMROC( N+IOFF, NB, MYCOL, IACOL, NPCOL ) LWMIN = NB*( MP+NQ+1 ) + NQ * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IOFF.NE.MOD( JA-1, DESCA( NB_ ) ) ) THEN INFO = -5 ELSE IF( NB.NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 12 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEBRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * MN = MIN( M, N ) IF( MN.EQ.0 ) $ RETURN * * Initialize parameters. * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPY = MP * NB + 1 IPW = NQ * NB + IPY * CALL DESCSET( DESCWX, M+IOFF, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCWY, NB, N+IOFF, NB, NB, IAROW, IACOL, ICTXT, $ NB ) * MP = NUMROC( M+IA-1, NB, MYROW, DESCA( RSRC_ ), NPROW ) NQ = NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) K = 1 JB = NB - IOFF IW = IOFF + 1 JW = IOFF + 1 * DO 10 L = 1, MN+IOFF-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce rows and columns i:i+nb-1 to bidiagonal form and return * the matrices X and Y which are needed to update the unreduced * part of the matrix. * CALL PZLABRD( M-K+1, N-K+1, JB, A, I, J, DESCA, D, E, TAUQ, $ TAUP, WORK, IW, JW, DESCWX, WORK( IPY ), IW, $ JW, DESCWY, WORK( IPW ) ) * * Update the trailing submatrix A(i+nb:ia+m-1,j+nb:ja+n-1), using * an update of the form A := A - V*Y' - X*U'. * CALL PZGEMM( 'No transpose', 'No transpose', M-K-JB+1, $ N-K-JB+1, JB, -ONE, A, I+JB, J, DESCA, $ WORK( IPY ), IW, JW+JB, DESCWY, ONE, A, I+JB, $ J+JB, DESCA ) CALL PZGEMM( 'No transpose', 'No transpose', M-K-JB+1, $ N-K-JB+1, JB, -ONE, WORK, IW+JB, JW, DESCWX, A, I, $ J+JB, DESCA, ONE, A, I+JB, J+JB, DESCA ) * * Copy last off-diagonal elements of B back into sub( A ). * IF( M.GE.N ) THEN JS = MIN( INDXG2L( I+JB-1, NB, 0, DESCA( RSRC_ ), NPROW ), $ MP ) IF( JS.GT.0 ) $ CALL PZELSET( A, I+JB-1, J+JB, DESCA, DCMPLX( E( JS ) ) ) ELSE JS = MIN( INDXG2L( J+JB-1, NB, 0, DESCA( CSRC_ ), NPCOL ), $ NQ ) IF( JS.GT.0 ) $ CALL PZELSET( A, I+JB, J+JB-1, DESCA, DCMPLX( E( JS ) ) ) END IF * K = K + JB JB = NB IW = 1 JW = 1 DESCWX( M_ ) = DESCWX( M_ ) - JB DESCWX( RSRC_ ) = MOD( DESCWX( RSRC_ ) + 1, NPROW ) DESCWX( CSRC_ ) = MOD( DESCWX( CSRC_ ) + 1, NPCOL ) DESCWY( N_ ) = DESCWY( N_ ) - JB DESCWY( RSRC_ ) = MOD( DESCWY( RSRC_ ) + 1, NPROW ) DESCWY( CSRC_ ) = MOD( DESCWY( CSRC_ ) + 1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the remainder of the matrix. * CALL PZGEBD2( M-K+1, N-K+1, A, IA+K-1, JA+K-1, DESCA, D, E, TAUQ, $ TAUP, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGEBRD * END scalapack-2.0.2/SRC/pzgecon.f000644 000766 000024 00000037527 11252745702 016177 0ustar00juliestaff000000 000000 SUBROUTINE PZGECON( NORM, N, A, IA, JA, DESCA, ANORM, RCOND, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, INFO, JA, LRWORK, LWORK, N DOUBLE PRECISION ANORM, RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION RWORK( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZGECON estimates the reciprocal of the condition number of a general * distributed complex matrix A(IA:IA+N-1,JA:JA+N-1), in either the * 1-norm or the infinity-norm, using the LU factorization computed by * PZGETRF. * * An estimate is obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), and * the reciprocal of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies whether the 1-norm condition number or the * infinity-norm condition number is required: * = '1' or 'O': 1-norm * = 'I': Infinity-norm * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). On entry, * this array contains the local pieces of the factors L and U * from the factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U; the * unit diagonal elements of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ANORM (global input) DOUBLE PRECISION * If NORM = '1' or 'O', the 1-norm of the original distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * If NORM = 'I', the infinity-norm of the original distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * * RCOND (global output) DOUBLE PRECISION * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + * MAX( 2, MAX(NB_A*CEIL(NPROW-1,NPCOL),LOCc(N+MOD(JA-1,NB_A)) + * NB_A*CEIL(NPCOL-1,NPROW)) ). * * LOCr and LOCc values can be computed using the ScaLAPACK * tool function NUMROC; NPROW and NPCOL can be determined by * calling the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= MAX( 1, 2*LOCc(N+MOD(JA-1,NB_A)) ). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, ONENRM CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPNL, IPNU, $ IPV, IPW, IPX, IROFF, IV, IX, IXX, JJA, JV, JX, $ KASE, KASE1, LRWMIN, LWMIN, MYCOL, MYROW, NP, $ NPCOL, NPMOD, NPROW, NQ, NQMOD DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU COMPLEX*16 WMAX, ZDUM * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG2L, $ PCHK1MAT, PB_TOPGET, PB_TOPSET, PXERBLA, $ PZAMAX, PZLATRS, PZLACON, PZDRSCL, $ ZGEBR2D, ZGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, ICHAR, MAX, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600 + CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN ONENRM = NORM.EQ.'1' .OR. LSAME( NORM, 'O' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = DBLE( LWMIN ) LRWMIN = MAX( 1, 2*NQMOD ) RWORK( 1 ) = DBLE( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.ONENRM .AND. .NOT.LSAME( NORM, 'I' ) ) THEN INFO = -1 ELSE IF( ANORM.LT.ZERO ) THEN INFO = -7 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF * IF( ONENRM ) THEN IDUM1( 1 ) = ICHAR( '1' ) ELSE IDUM1( 1 ) = ICHAR( 'I' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 IF( LRWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 12 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGECON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * RCOND = ZERO IF( N.EQ.0 ) THEN RCOND = ONE RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN ELSE IF( N.EQ.1 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPW = IPV + NP IPNL = 1 IPNU = IPNL + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Estimate the norm of inv(A). * AINVNM = ZERO NORMIN = 'N' IF( ONENRM ) THEN KASE1 = 1 ELSE KASE1 = 2 END IF KASE = 0 * 10 CONTINUE CALL PZLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), IX, JX, $ DESCX, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( KASE.EQ.KASE1 ) THEN * * Multiply by inv(L). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( 'Lower', 'No transpose', 'Unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, RWORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL * * Multiply by inv(U). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( 'Upper', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, RWORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(U'). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( 'Upper', 'Conjugate transpose', 'Non-unit', $ NORMIN, N, A, IA, JA, DESCA, WORK( IPX ), IX, $ JX, DESCX, SU, RWORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL * * Multiply by inv(L'). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( 'Lower', 'Conjugate transpose', 'Unit', $ NORMIN, N, A, IA, JA, DESCA, WORK( IPX ), $ IX, JX, DESCX, SL, RWORK( IPNL ), $ WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF * * Divide X by 1/(SL*SU) if doing so will not cause overflow. * SCALE = SL*SU NORMIN = 'Y' IF( SCALE.NE.ONE ) THEN CALL PZAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL ZGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1 ) ELSE CALL ZGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1, $ IAROW, MYCOL ) END IF END IF IF( SCALE.LT.CABS1( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PZDRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / AINVNM ) / ANORM * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PZGECON * END scalapack-2.0.2/SRC/pzgeequ.f000644 000766 000024 00000033120 10363532303 016164 0ustar00juliestaff000000 000000 SUBROUTINE PZGEEQU( M, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N DOUBLE PRECISION AMAX, COLCND, ROWCND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION C( * ), R( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZGEEQU computes row and column scalings intended to equilibrate an * M-by-N distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA:JA+N-1) and * reduce its condition number. R returns the row scale factors and C * the column scale factors, chosen to try to make the largest entry in * each row and column of the distributed matrix B with elements * B(i,j) = R(i) * A(i,j) * C(j) have absolute value 1. * * R(i) and C(j) are restricted to be between SMLNUM = smallest safe * number and BIGNUM = largest safe number. Use of these scaling * factors is not guaranteed to reduce the condition number of * sub( A ) but works well in practice. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ), the * local pieces of the M-by-N distributed matrix whose * equilibration factors are to be computed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * R (local output) DOUBLE PRECISION array, dimension LOCr(M_A) * If INFO = 0 or INFO > IA+M-1, R(IA:IA+M-1) contains the row * scale factors for sub( A ). R is aligned with the distributed * matrix A, and replicated across every process column. R is * tied to the distributed matrix A. * * C (local output) DOUBLE PRECISION array, dimension LOCc(N_A) * If INFO = 0, C(JA:JA+N-1) contains the column scale factors * for sub( A ). C is aligned with the distributed matrix A, and * replicated down every process row. C is tied to the distri- * buted matrix A. * * ROWCND (global output) DOUBLE PRECISION * If INFO = 0 or INFO > IA+M-1, ROWCND contains the ratio of * the smallest R(i) to the largest R(i) (IA <= i <= IA+M-1). * If ROWCND >= 0.1 and AMAX is neither too large nor too small, * it is not worth scaling by R(IA:IA+M-1). * * COLCND (global output) DOUBLE PRECISION * If INFO = 0, COLCND contains the ratio of the smallest C(j) * to the largest C(j) (JA <= j <= JA+N-1). If COLCND >= 0.1, it * is not worth scaling by C(JA:JA+N-1). * * AMAX (global output) DOUBLE PRECISION * Absolute value of largest distributed matrix element. If * AMAX is very close to overflow or very close to underflow, * the matrix should be scaled. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, and i is * <= M: the i-th row of the distributed matrix sub( A ) * is exactly zero, * > M: the (i-M)-th column of the distributed * matrix sub( A ) is exactly zero. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IDUMM, IIA, $ IOFFA, IROFF, J, JJA, LDA, MP, MYCOL, MYROW, $ NPCOL, NPROW, NQ DOUBLE PRECISION BIGNUM, RCMAX, RCMIN, SMLNUM COMPLEX*16 ZDUM * .. * .. Local Arrays .. INTEGER DESCC( DLEN_ ), DESCR( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMN2D, $ DGAMX2D, IGAMX2D, INFOG2L, PCHK1MAT, PB_TOPGET, $ PXERBLA * .. * .. External Functions .. INTEGER INDXL2G, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL INDXL2G, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX, MIN, MOD * .. * .. Statement Functions .. DOUBLE PRECISION ZABS1 * .. * .. Statement Function definitions .. ZABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 0, IDUMM, IDUMM, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEEQU', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) THEN ROWCND = ONE COLCND = ONE AMAX = ZERO RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * * Get machine constants and local indexes. * SMLNUM = PDLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * * Assign descriptors for R and C arrays * CALL DESCSET( DESCR, M, 1, DESCA( MB_ ), 1, 0, 0, ICTXT, $ MAX( 1, MP ) ) CALL DESCSET( DESCC, 1, N, 1, DESCA( NB_ ), 0, 0, ICTXT, 1 ) * * Compute row scale factors. * DO 10 I = IIA, IIA+MP-1 R( I ) = ZERO 10 CONTINUE * * Find the maximum element in each row. * IOFFA = (JJA-1)*LDA DO 30 J = JJA, JJA+NQ-1 DO 20 I = IIA, IIA+MP-1 R( I ) = MAX( R( I ), ZABS1( A( IOFFA + I ) ) ) 20 CONTINUE IOFFA = IOFFA + LDA 30 CONTINUE CALL DGAMX2D( ICTXT, 'Rowwise', ROWCTOP, MP, 1, R( IIA ), $ MAX( 1, MP ), IDUMM, IDUMM, -1, -1, MYCOL ) * * Find the maximum and minimum scale factors. * RCMIN = BIGNUM RCMAX = ZERO DO 40 I = IIA, IIA+MP-1 RCMAX = MAX( RCMAX, R( I ) ) RCMIN = MIN( RCMIN, R( I ) ) 40 CONTINUE CALL DGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMAX, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) CALL DGAMN2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMIN, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) AMAX = RCMAX * IF( RCMIN.EQ.ZERO ) THEN * * Find the first zero scale factor and return an error code. * DO 50 I = IIA, IIA+MP-1 IF( R( I ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = INDXL2G( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) - IA + 1 50 CONTINUE CALL IGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, INFO, 1, $ IDUMM, IDUMM, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN ELSE * * Invert the scale factors. * DO 60 I = IIA, IIA+MP-1 R( I ) = ONE / MIN( MAX( R( I ), SMLNUM ), BIGNUM ) 60 CONTINUE * * Compute ROWCND = min(R(I)) / max(R(I)) * ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) * END IF * * Compute column scale factors * DO 70 J = JJA, JJA+NQ-1 C( J ) = ZERO 70 CONTINUE * * Find the maximum element in each column, * assuming the row scaling computed above. * IOFFA = (JJA-1)*LDA DO 90 J = JJA, JJA+NQ-1 DO 80 I = IIA, IIA+MP-1 C( J ) = MAX( C( J ), ZABS1( A( IOFFA + I ) )*R( I ) ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE CALL DGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, NQ, C( JJA ), $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) * * Find the maximum and minimum scale factors. * RCMIN = BIGNUM RCMAX = ZERO DO 100 J = JJA, JJA+NQ-1 RCMIN = MIN( RCMIN, C( J ) ) RCMAX = MAX( RCMAX, C( J ) ) 100 CONTINUE CALL DGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMAX, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) CALL DGAMN2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, RCMIN, 1, IDUMM, $ IDUMM, -1, -1, MYCOL ) * IF( RCMIN.EQ.ZERO ) THEN * * Find the first zero scale factor and return an error code. * DO 110 J = JJA, JJA+NQ-1 IF( C( J ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = M + INDXL2G( J, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) - JA + 1 110 CONTINUE CALL IGAMX2D( ICTXT, 'Columnwise', COLCTOP, 1, 1, INFO, 1, $ IDUMM, IDUMM, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN ELSE * * Invert the scale factors. * DO 120 J = JJA, JJA+NQ-1 C( J ) = ONE / MIN( MAX( C( J ), SMLNUM ), BIGNUM ) 120 CONTINUE * * Compute COLCND = min(C(J)) / max(C(J)) * COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) * END IF * RETURN * * End of PZGEEQU * END scalapack-2.0.2/SRC/pzgehd2.f000644 000766 000024 00000027007 10363532303 016056 0ustar00juliestaff000000 000000 SUBROUTINE PZGEHD2( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEHD2 reduces a complex general distributed matrix sub( A ) * to upper Hessenberg form H by an unitary similarity transformation: * Q' * sub( A ) * Q = H, where * sub( A ) = A(IA+N-1:IA+N-1,JA+N-1:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows IA:IA+ILO-2 and IA+IHI:IA+N-1 and columns JA:JA+JLO-2 * and JA+JHI:JA+N-1. See Further Details. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) to be reduced. On exit, * the upper triangle and the first subdiagonal of sub( A ) are * overwritten with the upper Hessenberg matrix H, and the ele- * ments below the first subdiagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16 array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). Elements JA:JA+ILO-2 and JA+IHI:JA+N-2 of TAU are * set to zero. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB + MAX( NpA0, NB ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of (ihi-ilo) elementary * reflectors * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on * exit in A(ia+ilo+i:ia+ihi-1,ja+ilo+i-2), and tau in TAU(ja+ilo+i-2). * * The contents of A(IA:IA+N-1,JA:JA+N-1) are illustrated by the follo- * wing example, with n = 7, ilo = 2 and ihi = 6: * * on entry on exit * * ( a a a a a a a ) ( a a h h h h a ) * ( a a a a a a ) ( a h h h h a ) * ( a a a a a a ) ( h h h h h h ) * ( a a a a a a ) ( v2 h h h h h ) * ( a a a a a a ) ( v2 v3 h h h h ) * ( a a a a a a ) ( v2 v3 v4 h h h ) * ( a ) ( a ) * * where a denotes an element of the original matrix sub( A ), h denotes * a modified element of the upper Hessenberg matrix H, and vi denotes * an element of the vector defining H(ja+ilo+i-2). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IAROW, ICOFFA, ICTXT, IROFFA, J, K, LWMIN, $ MYCOL, MYROW, NPA0, NPCOL, NPROW COMPLEX*16 AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, PXERBLA, $ PZELSET, PZLARF, PZLARFC, PZLARFG * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) NPA0 = NUMROC( IHI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = DESCA( NB_ ) + MAX( NPA0, DESCA( NB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -2 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEHD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * DO 10 K = ILO, IHI-1 I = IA + K - 1 J = JA + K - 1 * * Compute elementary reflector H(j) to annihilate * A(i+2:ihi+ia-1,j) * CALL PZLARFG( IHI-K, AII, I+1, J, A, MIN( I+2, N+IA-1 ), J, $ DESCA, 1, TAU ) CALL PZELSET( A, I+1, J, DESCA, ONE ) * * Apply H(k) to A(ia:ihi+ia-1,j+1:ihi+ja-1) from the right * CALL PZLARF( 'Right', IHI, IHI-K, A, I+1, J, DESCA, 1, TAU, A, $ IA, J+1, DESCA, WORK ) * * Apply H(j) to A(i+1:ia+ihi-1,j+1:ja+n-1) from the left * CALL PZLARFC( 'Left', IHI-K, N-K, A, I+1, J, DESCA, 1, TAU, A, $ I+1, J+1, DESCA, WORK ) * CALL PZELSET( A, I+1, J, DESCA, AII ) 10 CONTINUE * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGEHD2 * END scalapack-2.0.2/SRC/pzgehrd.f000644 000766 000024 00000035600 10363532303 016154 0ustar00juliestaff000000 000000 SUBROUTINE PZGEHRD( N, ILO, IHI, A, IA, JA, DESCA, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, IHI, ILO, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEHRD reduces a complex general distributed matrix sub( A ) * to upper Hessenberg form H by an unitary similarity transformation: * Q' * sub( A ) * Q = H, where * sub( A ) = A(IA+N-1:IA+N-1,JA+N-1:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that sub( A ) is already upper triangular in * rows IA:IA+ILO-2 and IA+IHI:IA+N-1 and columns JA:JA+ILO-2 * and JA+IHI:JA+N-1. See Further Details. If N > 0, * 1 <= ILO <= IHI <= N; otherwise set ILO = 1, IHI = N. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * general distributed matrix sub( A ) to be reduced. On exit, * the upper triangle and the first subdiagonal of sub( A ) are * overwritten with the upper Hessenberg matrix H, and the ele- * ments below the first subdiagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16 array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). Elements JA:JA+ILO-2 and JA+IHI:JA+N-2 of TAU are * set to zero. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB*NB + NB*MAX( IHIP+1, IHLP+INLQ ) * * where NB = MB_A = NB_A, IROFFA = MOD( IA-1, NB ), * ICOFFA = MOD( JA-1, NB ), IOFF = MOD( IA+ILO-2, NB ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ), * IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ), * ILROW = INDXG2P( IA+ILO-1, NB, MYROW, RSRC_A, NPROW ), * IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, ILROW, NPROW ), * ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, CSRC_A, NPCOL ), * INLQ = NUMROC( N-ILO+IOFF+1, NB, MYCOL, ILCOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of (ihi-ilo) elementary * reflectors * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:I) = 0, v(I+1) = 1 and v(IHI+1:N) = 0; v(I+2:IHI) is stored on * exit in A(IA+ILO+I:IA+IHI-1,JA+ILO+I-2), and tau in TAU(JA+ILO+I-2). * * The contents of A(IA:IA+N-1,JA:JA+N-1) are illustrated by the follow- * ing example, with N = 7, ILO = 2 and IHI = 6: * * on entry on exit * * ( a a a a a a a ) ( a a h h h h a ) * ( a a a a a a ) ( a h h h h a ) * ( a a a a a a ) ( h h h h h h ) * ( a a a a a a ) ( v2 h h h h h ) * ( a a a a a a ) ( v2 v3 h h h h ) * ( a a a a a a ) ( v2 v3 v4 h h h ) * ( a ) ( a ) * * where a denotes an element of the original matrix sub( A ), H denotes * a modified element of the upper Hessenberg matrix H, and vi denotes * an element of the vector defining H(JA+ILO+I-2). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, IB, ICOFFA, ICTXT, IHIP, $ IHLP, IIA, IINFO, ILCOL, ILROW, IMCOL, INLQ, $ IOFF, IPT, IPW, IPY, IROFFA, J, JJ, JJA, JY, $ K, L, LWMIN, MYCOL, MYROW, NB, NPCOL, NPROW, $ NQ COMPLEX*16 EI * .. * .. Local Arrays .. INTEGER DESCY( DLEN_ ), IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG1L, $ INFOG2L, PCHK1MAT, PB_TOPGET, PB_TOPSET, $ PXERBLA, PZGEMM, PZGEHD2, PZLAHRD, PZLARFB * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, NB ) ICOFFA = MOD( JA-1, NB ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) IHIP = NUMROC( IHI+IROFFA, NB, MYROW, IAROW, NPROW ) IOFF = MOD( IA+ILO-2, NB ) ILROW = INDXG2P( IA+ILO-1, NB, MYROW, DESCA( RSRC_ ), $ NPROW ) IHLP = NUMROC( IHI-ILO+IOFF+1, NB, MYROW, ILROW, NPROW ) ILCOL = INDXG2P( JA+ILO-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) INLQ = NUMROC( N-ILO+IOFF+1, NB, MYCOL, ILCOL, NPCOL ) LWMIN = NB*( NB + MAX( IHIP+1, IHLP+INLQ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, N ) ) THEN INFO = -2 ELSE IF( IHI.LT.MIN( ILO, N ) .OR. IHI.GT.N ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA .OR. IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = ILO IDUM2( 1 ) = 2 IDUM1( 2 ) = IHI IDUM2( 2 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 10 CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEHRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Set elements JA:JA+ILO-2 and JA+JHI-1:JA+N-2 of TAU to zero. * NQ = NUMROC( JA+N-2, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) CALL INFOG1L( JA+ILO-2, NB, NPCOL, MYCOL, DESCA( CSRC_ ), JJ, $ IMCOL ) DO 10 J = JJA, MIN( JJ, NQ ) TAU( J ) = ZERO 10 CONTINUE * CALL INFOG1L( JA+IHI-1, NB, NPCOL, MYCOL, DESCA( CSRC_ ), JJ, $ IMCOL ) DO 20 J = JJ, NQ TAU( J ) = ZERO 20 CONTINUE * * Quick return if possible * IF( IHI-ILO.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPT = 1 IPY = IPT + NB * NB IPW = IPY + IHIP * NB CALL DESCSET( DESCY, IHI+IROFFA, NB, NB, NB, IAROW, ILCOL, ICTXT, $ MAX( 1, IHIP ) ) * K = ILO IB = NB - IOFF JY = IOFF + 1 * * Loop over remaining block of columns * DO 30 L = 1, IHI-ILO+IOFF-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns j:j+ib-1 to Hessenberg form, returning the * matrices V and T of the block reflector H = I - V*T*V' * which performs the reduction, and also the matrix Y = A*V*T * CALL PZLAHRD( IHI, K, IB, A, IA, J, DESCA, TAU, WORK( IPT ), $ WORK( IPY ), 1, JY, DESCY, WORK( IPW ) ) * * Apply the block reflector H to A(ia:ia+ihi-1,j+ib:ja+ihi-1) * from the right, computing A := A - Y * V'. * V(i+ib,ib-1) must be set to 1. * CALL PZELSET2( EI, A, I+IB, J+IB-1, DESCA, ONE ) CALL PZGEMM( 'No transpose', 'Conjugate transpose', IHI, $ IHI-K-IB+1, IB, -ONE, WORK( IPY ), 1, JY, DESCY, $ A, I+IB, J, DESCA, ONE, A, IA, J+IB, DESCA ) CALL PZELSET( A, I+IB, J+IB-1, DESCA, EI ) * * Apply the block reflector H to A(i+1:ia+ihi-1,j+ib:ja+n-1) from * the left * CALL PZLARFB( 'Left', 'Conjugate transpose', 'Forward', $ 'Columnwise', IHI-K, N-K-IB+1, IB, A, I+1, J, $ DESCA, WORK( IPT ), A, I+1, J+IB, DESCA, $ WORK( IPY ) ) * K = K + IB IB = NB JY = 1 DESCY( CSRC_ ) = MOD( DESCY( CSRC_ ) + 1, NPCOL ) * 30 CONTINUE * * Use unblocked code to reduce the rest of the matrix * CALL PZGEHD2( N, K, IHI, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGEHRD * END scalapack-2.0.2/SRC/pzgelq2.f000644 000766 000024 00000025105 10363532303 016074 0ustar00juliestaff000000 000000 SUBROUTINE PZGELQ2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGELQ2 computes a LQ factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = L * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and below the diagonal of sub( A ) contain the M by min(M,N) * lower trapezoidal matrix L (L is lower triangular if M <= N); * the elements above the diagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * of the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia+k-1)' H(ia+k-2)' . . . H(ia)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in * A(ia+i-1,ja+i:ja+n-1), and tau in TAU(ia+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, J, K, LWMIN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ COMPLEX*16 AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZLACGV, PZLARF, PZLARFG * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQ + MAX( 1, MP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGELQ2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * K = MIN( M, N ) DO 10 I = IA, IA+K-1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * A(i,j+1:ja+n-1) * CALL PZLACGV( N-J+JA, A, I, J, DESCA, DESCA( M_ ) ) CALL PZLARFG( N-J+JA, AII, I, J, A, I, MIN( J+1, JA+N-1 ), $ DESCA, DESCA( M_ ), TAU ) * IF( I.LT.IA+M-1 ) THEN * * Apply H(i) to A(i+1:ia+m-1,j:ja+n-1) from the right * CALL PZELSET( A, I, J, DESCA, ONE ) CALL PZLARF( 'Right', M-I+IA-1, N-J+JA, A, I, J, DESCA, $ DESCA( M_ ), TAU, A, I+1, J, DESCA, WORK ) END IF CALL PZELSET( A, I, J, DESCA, AII ) CALL PZLACGV( N-J+JA, A, I, J, DESCA, DESCA( M_ ) ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGELQ2 * END scalapack-2.0.2/SRC/pzgelqf.f000644 000766 000024 00000027541 10363532303 016166 0ustar00juliestaff000000 000000 SUBROUTINE PZGELQF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGELQF computes a LQ factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = L * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and below the diagonal of sub( A ) contain the M by min(M,N) * lower trapezoidal matrix L (L is lower triangular if M <= N); * the elements above the diagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCr(IA+MIN(M,N)-1). This array contains the scalar factors * of the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia+k-1)' H(ia+k-2)' . . . H(ia)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in * A(ia+i-1,ja+i:ja+n-1), and tau in TAU(ia+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IN, IPW, $ IROFF, J, K, LWMIN, MP0, MYCOL, MYROW, NPCOL, $ NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZGELQ2, PZLARFB, $ PZLARFT * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * * Handle the first block of rows separately * IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IB = IN - IA + 1 * * Compute the LQ factorization of the first block A(ia:in:ja:ja+n-1) * CALL PZGELQ2( IB, N, A, IA, JA, DESCA, TAU, WORK, LWORK, IINFO ) * IF( IA+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(ia) H(ia+1) . . . H(in) * CALL PZLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia+ib:ia+m-1,ja:ja+n-1) from the right * CALL PZLARFB( 'Right', 'No transpose', 'Forward', 'Rowwise', $ M-IB, N, IB, A, IA, JA, DESCA, WORK, A, IA+IB, $ JA, DESCA, WORK( IPW ) ) END IF * * Loop over the remaining blocks of rows * DO 10 I = IN+1, IA+K-1, DESCA( MB_ ) IB = MIN( K-I+IA, DESCA( MB_ ) ) J = JA + I - IA * * Compute the LQ factorization of the current block * A(i:i+ib-1:j:ja+n-1) * CALL PZGELQ2( IB, N-I+IA, A, I, J, DESCA, TAU, WORK, LWORK, $ IINFO ) * IF( I+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PZLARFT( 'Forward', 'Rowwise', N-I+IA, IB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(i+ib:ia+m-1,j:ja+n-1) from the right * CALL PZLARFB( 'Right', 'No transpose', 'Forward', 'Rowwise', $ M-I-IB+IA, N-J+JA, IB, A, I, J, DESCA, WORK, $ A, I+IB, J, DESCA, WORK( IPW ) ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGELQF * END scalapack-2.0.2/SRC/pzgels.f000644 000766 000024 00000055142 11312467374 016032 0ustar00juliestaff000000 000000 SUBROUTINE PZGELS( TRANS, M, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ), WORK( * ) * .. * * Purpose * ======= * * PZGELS solves overdetermined or underdetermined complex linear * systems involving an M-by-N matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1), * or its conjugate-transpose, using a QR or LQ factorization of * sub( A ). It is assumed that sub( A ) has full rank. * * The following options are provided: * * 1. If TRANS = 'N' and m >= n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || sub( B ) - sub( A )*X ||. * * 2. If TRANS = 'N' and m < n: find the minimum norm solution of * an underdetermined system sub( A ) * X = sub( B ). * * 3. If TRANS = 'C' and m >= n: find the minimum norm solution of * an undetermined system sub( A )**H * X = sub( B ). * * 4. If TRANS = 'C' and m < n: find the least squares solution of * an overdetermined system, i.e., solve the least squares problem * minimize || sub( B ) - sub( A )**H * X ||. * * where sub( B ) denotes B( IB:IB+M-1, JB:JB+NRHS-1 ) when TRANS = 'N' * and B( IB:IB+N-1, JB:JB+NRHS-1 ) otherwise. Several right hand side * vectors b and solution vectors x can be handled in a single call; * When TRANS = 'N', the solution vectors are stored as the columns of * the N-by-NRHS right hand side matrix sub( B ) and the M-by-NRHS * right hand side matrix sub( B ) otherwise. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TRANS (global input) CHARACTER * = 'N': the linear system involves sub( A ); * = 'C': the linear system involves sub( A )**H. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of * rows of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e. the number of columns * of the distributed submatrices sub( B ) and X. NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of local dimension * ( LLD_A, LOCc(JA+N-1) ). On entry, the M-by-N matrix A. * if M >= N, sub( A ) is overwritten by details of its QR * factorization as returned by PZGEQRF; * if M < N, sub( A ) is overwritten by details of its LQ * factorization as returned by PZGELQF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of local dimension * (LLD_B, LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the distributed matrix B of right hand side * vectors, stored columnwise; * sub( B ) is M-by-NRHS if TRANS='N', and N-by-NRHS otherwise. * On exit, sub( B ) is overwritten by the solution vectors, * stored columnwise: if TRANS = 'N' and M >= N, rows 1 to N * of sub( B ) contain the least squares solution vectors; the * residual sum of squares for the solution in each column is * given by the sum of squares of elements N+1 to M in that * column; if TRANS = 'N' and M < N, rows 1 to N of sub( B ) * contain the minimum norm solution vectors; if TRANS = 'C' * and M >= N, rows 1 to M of sub( B ) contain the minimum norm * solution vectors; if TRANS = 'C' and M < N, rows 1 to M of * sub( B ) contain the least squares solution vectors; the * residual sum of squares for the solution in each column is * given by the sum of squares of elements M+1 to N in that * column. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= LTAU + MAX( LWF, LWS ) where * If M >= N, then * LTAU = NUMROC( JA+MIN(M,N)-1, NB_A, MYCOL, CSRC_A, NPCOL ), * LWF = NB_A * ( MpA0 + NqA0 + NB_A ) * LWS = MAX( (NB_A*(NB_A-1))/2, (NRHSqB0 + MpB0)*NB_A ) + * NB_A * NB_A * Else * LTAU = NUMROC( IA+MIN(M,N)-1, MB_A, MYROW, RSRC_A, NPROW ), * LWF = MB_A * ( MpA0 + NqA0 + MB_A ) * LWS = MAX( (MB_A*(MB_A-1))/2, ( NpB0 + MAX( NqA0 + * NUMROC( NUMROC( N+IROFFB, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NRHSqB0 ) )*MB_A ) + * MB_A * MB_A * End if * * where LCMP = LCM / NPROW with LCM = ILCM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * MpB0 = NUMROC( M+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NpB0 = NUMROC( N+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NRHSqB0 = NUMROC( NRHS+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, TPSD INTEGER BROW, IACOL, IAROW, IASCL, IBCOL, IBROW, IBSCL, $ ICOFFA, ICOFFB, ICTXT, IPW, IROFFA, IROFFB, $ LCM, LCMP, LTAU, LWF, LWMIN, LWS, MPA0, MPB0, $ MYCOL, MYROW, NPB0, NPCOL, NPROW, NQA0, $ NRHSQB0, SCLLEN DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) DOUBLE PRECISION RWORK( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL ILCM, INDXG2P, LSAME, NUMROC, PDLAMCH, $ PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PZGELQF, $ PZGEQRF, PDLABAD, PZLASCL, PZLASET, $ PZTRSM, PZUNMLQ, PZUNMQR, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 800 + CTXT_ ) ELSE CALL CHK1MAT( M, 2, N, 3, IA, JA, DESCA, 8, INFO ) IF ( M .GE. N ) THEN CALL CHK1MAT( M, 2, NRHS, 4, IB, JB, DESCB, 12, INFO ) ELSE CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 12, INFO ) ENDIF IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( IA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) * IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( IB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) NRHSQB0 = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, $ NPCOL ) IF( M.GE.N ) THEN MPB0 = NUMROC( M+IROFFB, DESCB( MB_ ), MYROW, IBROW, $ NPROW ) LTAU = NUMROC( JA+MIN(M,N)-1, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) LWF = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) LWS = MAX( ( DESCA( NB_ )*( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPB0 + NRHSQB0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ )*DESCA( NB_ ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW NPB0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, $ NPROW ) LTAU = NUMROC( IA+MIN(M,N)-1, DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) LWF = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) LWS = MAX( ( DESCA( MB_ )*( DESCA( MB_ ) - 1 ) ) / 2, $ ( NPB0 + MAX( NQA0 + NUMROC( NUMROC( N+IROFFB, $ DESCA( MB_ ), 0, 0, NPROW ), DESCA( MB_ ), 0, 0, $ LCMP ), NRHSQB0 ) )*DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF LWMIN = LTAU + MAX( LWF, LWS ) WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) * TPSD = .TRUE. IF( LSAME( TRANS, 'N' ) ) $ TPSD = .FALSE. * IF( .NOT.( LSAME( TRANS, 'N' ) .OR. $ LSAME( TRANS, 'C' ) ) ) THEN INFO = -1 ELSE IF( M.LT.0 ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( M.GE.N .AND. IROFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( M.GE.N .AND. IAROW.NE.IBROW ) THEN INFO = -10 ELSE IF( M.LT.N .AND. ICOFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( M.GE.N .AND. DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1200 + MB_ ) ELSE IF( M.LT.N .AND. DESCA( NB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1200 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1200 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF END IF * IF( .NOT.TPSD ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 14 CALL PCHK2MAT( M, 2, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, 4, $ IB, JB, DESCB, 12, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGELS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( MIN( M, N, NRHS ).EQ.0 ) THEN CALL PZLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, $ IB, JB, DESCB ) RETURN END IF * * Get machine parameters * SMLNUM = PDLAMCH( ICTXT, 'S' ) SMLNUM = SMLNUM / PDLAMCH( ICTXT, 'P' ) BIGNUM = ONE / SMLNUM CALL PDLABAD( ICTXT, SMLNUM, BIGNUM ) * * Scale A, B if max entry outside range [SMLNUM,BIGNUM] * ANRM = PZLANGE( 'M', M, N, A, IA, JA, DESCA, RWORK ) IASCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN * * Scale matrix norm up to SMLNUM * CALL PZLASCL( 'G', ANRM, SMLNUM, M, N, A, IA, JA, DESCA, $ INFO ) IASCL = 1 ELSE IF( ANRM.GT.BIGNUM ) THEN * * Scale matrix norm down to BIGNUM * CALL PZLASCL( 'G', ANRM, BIGNUM, M, N, A, IA, JA, DESCA, $ INFO ) IASCL = 2 ELSE IF( ANRM.EQ.ZERO ) THEN * * Matrix all zero. Return zero solution. * CALL PZLASET( 'F', MAX( M, N ), NRHS, CZERO, CZERO, B, IB, $ JB, DESCB ) GO TO 10 END IF * BROW = M IF( TPSD ) $ BROW = N * BNRM = PZLANGE( 'M', BROW, NRHS, B, IB, JB, DESCB, RWORK ) * IBSCL = 0 IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN * * Scale matrix norm up to SMLNUM * CALL PZLASCL( 'G', BNRM, SMLNUM, BROW, NRHS, B, IB, JB, $ DESCB, INFO ) IBSCL = 1 ELSE IF( BNRM.GT.BIGNUM ) THEN * * Scale matrix norm down to BIGNUM * CALL PZLASCL( 'G', BNRM, BIGNUM, BROW, NRHS, B, IB, JB, $ DESCB, INFO ) IBSCL = 2 END IF * IPW = LTAU + 1 * IF( M.GE.N ) THEN * * compute QR factorization of A * CALL PZGEQRF( M, N, A, IA, JA, DESCA, WORK, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least N, optimally N*NB * IF( .NOT.TPSD ) THEN * * Least-Squares Problem min || A * X - B || * * B(IB:IB+M-1,JB:JB+NRHS-1) := Q' * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PZUNMQR( 'Left', 'Conjugate transpose', M, NRHS, N, A, $ IA, JA, DESCA, WORK, B, IB, JB, DESCB, $ WORK( IPW ), LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * * B(IB:IB+N-1,JB:JB+NRHS-1) := inv(R) * * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PZTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, CONE, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * SCLLEN = N * ELSE * * Overdetermined system of equations sub( A )' * X = sub( B ) * * sub( B ) := inv(R') * sub( B ) * CALL PZTRSM( 'Left', 'Upper', 'Conjugate transpose', $ 'Non-unit', N, NRHS, CONE, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) * * B(IB+N:IB+M-1,JB:JB+NRHS-1) = ZERO * CALL PZLASET( 'All', M-N, NRHS, CZERO, CZERO, B, IB+N, JB, $ DESCB ) * * B(IB:IB+M-1,JB:JB+NRHS-1) := Q(1:N,:) * * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PZUNMQR( 'Left', 'No transpose', M, NRHS, N, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * SCLLEN = M * END IF * ELSE * * Compute LQ factorization of sub( A ) * CALL PZGELQF( M, N, A, IA, JA, DESCA, WORK, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least M, optimally M*NB. * IF( .NOT.TPSD ) THEN * * underdetermined system of equations sub( A ) * X = sub( B ) * * B(IB:IB+M-1,JB:JB+NRHS-1) := inv(L) * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PZTRSM( 'Left', 'Lower', 'No transpose', 'Non-unit', M, $ NRHS, CONE, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * B(IB+M:IB+N-1,JB:JB+NRHS-1) = 0 * CALL PZLASET( 'All', N-M, NRHS, CZERO, CZERO, B, IB+M, JB, $ DESCB ) * * B(IB:IB+N-1,JB:JB+NRHS-1) := Q(1:N,:)' * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PZUNMLQ( 'Left', 'Conjugate transpose', N, NRHS, M, A, $ IA, JA, DESCA, WORK, B, IB, JB, DESCB, $ WORK( IPW ), LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * SCLLEN = N * ELSE * * overdetermined system min || A' * X - B || * * B(IB:IB+N-1,JB:JB+NRHS-1) := Q * B(IB:IB+N-1,JB:JB+NRHS-1) * CALL PZUNMLQ( 'Left', 'No transpose', N, NRHS, M, A, IA, JA, $ DESCA, WORK, B, IB, JB, DESCB, WORK( IPW ), $ LWORK-LTAU, INFO ) * * workspace at least NRHS, optimally NRHS*NB * * B(IB:IB+M-1,JB:JB+NRHS-1) := inv(L') * * B(IB:IB+M-1,JB:JB+NRHS-1) * CALL PZTRSM( 'Left', 'Lower', 'Conjugate transpose', $ 'Non-unit', M, NRHS, CONE, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) * SCLLEN = M * END IF * END IF * * Undo scaling * IF( IASCL.EQ.1 ) THEN CALL PZLASCL( 'G', ANRM, SMLNUM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) ELSE IF( IASCL.EQ.2 ) THEN CALL PZLASCL( 'G', ANRM, BIGNUM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) END IF IF( IBSCL.EQ.1 ) THEN CALL PZLASCL( 'G', SMLNUM, BNRM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) ELSE IF( IBSCL.EQ.2 ) THEN CALL PZLASCL( 'G', BIGNUM, BNRM, SCLLEN, NRHS, B, IB, JB, $ DESCB, INFO ) END IF * 10 CONTINUE * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGELS * END scalapack-2.0.2/SRC/pzgeql2.f000644 000766 000024 00000030143 10363532303 016072 0ustar00juliestaff000000 000000 SUBROUTINE PZGEQL2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEQL2 computes a QL factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * L. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M >= N, the * lower triangle of the distributed submatrix * A( IA+M-N:IA+M-1, JA:JA+N-1 ) contains the N-by-N lower * triangular matrix L; if M <= N, the elements on and below * the (N-M)-th superdiagonal contain the M by N lower * trapezoidal matrix L; the remaining elements, with the * array TAU, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension LOCc(JA+N-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Mp0 + MAX( 1, Nq0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja+k-1) . . . H(ja+1) H(ja), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in * A(ia:ia+m-k+i-2,ja+n-k+i-1), and tau in TAU(ja+n-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICTXT, II, J, JJ, K, LWMIN, $ MP, MYCOL, MYROW, NPCOL, NPROW, NQ COMPLEX*16 AJJ, ALPHA * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZLARFC, PZLARFG, ZGEBR2D, ZGEBS2D, $ ZLARFG, ZSCAL * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MP + MAX( 1, NQ ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEQL2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( DESCA( M_ ).EQ.1 ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) IACOL = INDXG2P( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN I = II+(JJ+NQ-2)*DESCA( LLD_ ) AJJ = A( I ) CALL ZLARFG( 1, AJJ, A( I ), 1, TAU( JJ+NQ-1 ) ) IF( N.GT.1 ) THEN ALPHA = ONE - DCONJG( TAU( JJ+NQ-1 ) ) CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1 ) CALL ZSCAL( NQ-1, ALPHA, A( II+(JJ-1)*DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ+NQ-1 ), 1 ) A( I ) = AJJ ELSE IF( N.GT.1 ) THEN CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, IAROW, IACOL ) CALL ZSCAL( NQ, ALPHA, A( II+(JJ-1)*DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ+NQ-1 ), 1, IAROW, IACOL ) END IF * ELSE * K = MIN( M, N ) DO 10 J = JA+K-1, JA, -1 I = IA + J - JA * * Generate elementary reflector H(j) to annihilate * A(ia:i+m-k-1,j+n-k) * CALL PZLARFG( M-K+I-IA+1, AJJ, M-K+I, N-K+J, A, IA, $ N-K+J, DESCA, 1, TAU ) * * Apply H(j)' to A(ia:i+m-k,ja:j+n-k-1) from the left * CALL PZELSET( A, I+M-K, J+N-K, DESCA, ONE ) CALL PZLARFC( 'Left', M-K+I-IA+1, N-K+J-JA, A, IA, N-K+J, $ DESCA, 1, TAU, A, IA, JA, DESCA, WORK ) CALL PZELSET( A, I+M-K, J+N-K, DESCA, AJJ ) * 10 CONTINUE * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGEQL2 * END scalapack-2.0.2/SRC/pzgeqlf.f000644 000766 000024 00000027347 10363532303 016172 0ustar00juliestaff000000 000000 SUBROUTINE PZGEQLF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEQLF computes a QL factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * L. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M >= N, the * lower triangle of the distributed submatrix * A( IA+M-N:IA+M-1, JA:JA+N-1 ) contains the N-by-N lower * triangular matrix L; if M <= N, the elements on and below * the (N-M)-th superdiagonal contain the M by N lower * trapezoidal matrix L; the remaining elements, with the * array TAU, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension LOCc(JA+N-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( Mp0 + Nq0 + NB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja+k-1) . . . H(ja+1) H(ja), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in * A(ia:ia+m-k+i-2,ja+n-k+i-1), and tau in TAU(ja+n-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, IINFO, ICTXT, IPW, J, JB, JL, JN, $ K, LWMIN, MP0, MU, MYCOL, MYROW, NPCOL, NPROW, $ NQ0, NU * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZGEQL2, PZLARFB, $ PZLARFT * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MP0 + NQ0 + DESCA( NB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEQLF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JL = MAX( ( (JA+N-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( JL.GE.JN+1 ) THEN * * Use blocked code initially * DO 10 J = JL, JN+1, -DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Compute the QL factorization of the current block * A(ia:ia+m-n+j+jb-ja-1,j:j+jb-1) * CALL PZGEQL2( M-N+J+JB-JA, JB, A, IA, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * IF( J.GT.JA ) THEN * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PZLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:ia+m-n+j+jb-ja-1,ja:j-1) from the * left * CALL PZLARFB( 'Left', 'Conjugate transpose', 'Backward', $ 'Columnwise', M-N+J+JB-JA, J-JA, JB, A, IA, $ J, DESCA, WORK, A, IA, JA, DESCA, $ WORK( IPW ) ) END IF * 10 CONTINUE * MU = M - N + JN - JA + 1 NU = JN - JA + 1 * ELSE * MU = M NU = N * END IF * * Use unblocked code to factor the last or only block * IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL PZGEQL2( MU, NU, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGEQLF * END scalapack-2.0.2/SRC/pzgeqpf.f000644 000766 000024 00000052432 10363532303 016167 0ustar00juliestaff000000 000000 SUBROUTINE PZGEQPF( M, N, A, IA, JA, DESCA, IPIV, TAU, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 14, 2000 * * .. Scalar Arguments .. INTEGER IA, JA, INFO, LRWORK, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) DOUBLE PRECISION RWORK( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEQPF computes a QR factorization with column pivoting of a * M-by-N distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1): * * sub( A ) * P = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, repre- * sent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension LOCc(JA+N-1). * On exit, if IPIV(I) = K, the local i-th column of sub( A )*P * was the global K-th column of sub( A ). IPIV is tied to the * distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX(3,Mp0 + Nq0). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCc(JA+N-1)+Nq0. * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * LOCc(JA+N-1) = NUMROC( JA+N-1, NB_A, MYCOL, CSRC_A, NPCOL ) * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(1) H(2) . . . H(n) * * Each H(i) has the form * * H = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in * A(ia+i-1:ia+m-1,ja+i-1). * * The matrix P is represented in jpvt as follows: If * jpvt(j) = i * then the jth column of P is the ith canonical unit vector. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFF, ICTXT, ICURROW, $ ICURCOL, II, IIA, IOFFA, IPCOL, IROFF, ITEMP, $ J, JB, JJ, JJA, JJPVT, JN, KB, K, KK, KSTART, $ KSTEP, LDA, LL, LRWMIN, LWMIN, MN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ, NQ0, PVT DOUBLE PRECISION TEMP, TEMP2 COMPLEX*16 AJJ, ALPHA * .. * .. Local Arrays .. INTEGER DESCN( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, IGERV2D, $ IGESD2D, INFOG1L, INFOG2L, PCHK1MAT, PDAMAX, $ PDZNRM2, PXERBLA, PZELSET, $ PZLARFC, PZLARFG, ZCOPY, ZGEBR2D, $ ZGEBS2D, ZGERV2D, ZGESD2D, ZLARFG, $ ZSWAP * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DCMPLX, DCONJG, IDINT, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) NQ0 = NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) LWMIN = MAX( 3, MP + NQ ) LRWMIN = NQ0 + NQ * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) RWORK( 1 ) = DBLE( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 10 IF( LRWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 12 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEQPF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF MN = MIN( M, N ) * * Initialize the array of pivots * LDA = DESCA( LLD_ ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) KSTEP = NPCOL * DESCA( NB_ ) * IF( MYCOL.EQ.IACOL ) THEN * * Handle first block separately * JB = JN - JA + 1 DO 10 LL = JJA, JJA+JB-1 IPIV( LL ) = JA + LL - JJA 10 CONTINUE KSTART = JN + KSTEP - DESCA( NB_ ) * * Loop over remaining block of columns * DO 30 KK = JJA+JB, JJA+NQ-1, DESCA( NB_ ) KB = MIN( JJA+NQ-KK, DESCA( NB_ ) ) DO 20 LL = KK, KK+KB-1 IPIV( LL ) = KSTART+LL-KK+1 20 CONTINUE KSTART = KSTART + KSTEP 30 CONTINUE ELSE KSTART = JN + ( MOD( MYCOL-IACOL+NPCOL, NPCOL )-1 )* $ DESCA( NB_ ) DO 50 KK = JJA, JJA+NQ-1, DESCA( NB_ ) KB = MIN( JJA+NQ-KK, DESCA( NB_ ) ) DO 40 LL = KK, KK+KB-1 IPIV( LL ) = KSTART+LL-KK+1 40 CONTINUE KSTART = KSTART + KSTEP 50 CONTINUE END IF * * Initialize partial column norms, handle first block separately * CALL DESCSET( DESCN, 1, DESCA( N_ ), 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), ICTXT, 1 ) * JJ = JJA IF( MYCOL.EQ.IACOL ) THEN DO 60 KK = 0, JB-1 CALL PDZNRM2( M, RWORK( JJ+KK ), A, IA, JA+KK, DESCA, 1 ) RWORK( NQ+JJ+KK ) = RWORK( JJ+KK ) 60 CONTINUE JJ = JJ + JB END IF ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining blocks of columns * DO 80 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 70 KK = 0, JB-1 CALL PDZNRM2( M, RWORK( JJ+KK ), A, IA, J+KK, DESCA, 1 ) RWORK( NQ+JJ+KK ) = RWORK( JJ+KK ) 70 CONTINUE JJ = JJ + JB END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) 80 CONTINUE * * Compute factorization * DO 120 J = JA, JA+MN-1 I = IA + J - JA * CALL INFOG1L( J, DESCA( NB_ ), NPCOL, MYCOL, DESCA( CSRC_ ), $ JJ, ICURCOL ) K = JA + N - J IF( K.GT.1 ) THEN CALL PDAMAX( K, TEMP, PVT, RWORK, 1, J, DESCN, $ DESCN( M_ ) ) ELSE PVT = J END IF IF( J.NE.PVT ) THEN CALL INFOG1L( PVT, DESCA( NB_ ), NPCOL, MYCOL, $ DESCA( CSRC_ ), JJPVT, IPCOL ) IF( ICURCOL.EQ.IPCOL ) THEN IF( MYCOL.EQ.ICURCOL ) THEN CALL ZSWAP( MP, A( IIA+(JJ-1)*LDA ), 1, $ A( IIA+(JJPVT-1)*LDA ), 1 ) ITEMP = IPIV( JJPVT ) IPIV( JJPVT ) = IPIV( JJ ) IPIV( JJ ) = ITEMP RWORK( JJPVT ) = RWORK( JJ ) RWORK( NQ+JJPVT ) = RWORK( NQ+JJ ) END IF ELSE IF( MYCOL.EQ.ICURCOL ) THEN * CALL ZGESD2D( ICTXT, MP, 1, A( IIA+(JJ-1)*LDA ), LDA, $ MYROW, IPCOL ) WORK( 1 ) = DCMPLX( DBLE( IPIV( JJ ) ) ) WORK( 2 ) = DCMPLX( RWORK( JJ ) ) WORK( 3 ) = DCMPLX( RWORK( JJ + NQ ) ) CALL ZGESD2D( ICTXT, 3, 1, WORK, 3, MYROW, IPCOL ) * CALL ZGERV2D( ICTXT, MP, 1, A( IIA+(JJ-1)*LDA ), LDA, $ MYROW, IPCOL ) CALL IGERV2D( ICTXT, 1, 1, IPIV( JJ ), 1, MYROW, $ IPCOL ) * ELSE IF( MYCOL.EQ.IPCOL ) THEN * CALL ZGESD2D( ICTXT, MP, 1, A( IIA+(JJPVT-1)*LDA ), $ LDA, MYROW, ICURCOL ) CALL IGESD2D( ICTXT, 1, 1, IPIV( JJPVT ), 1, MYROW, $ ICURCOL ) * CALL ZGERV2D( ICTXT, MP, 1, A( IIA+(JJPVT-1)*LDA ), $ LDA, MYROW, ICURCOL ) CALL ZGERV2D( ICTXT, 3, 1, WORK, 3, MYROW, ICURCOL ) IPIV( JJPVT ) = IDINT( DBLE( WORK( 1 ) ) ) RWORK( JJPVT ) = DBLE( WORK( 2 ) ) RWORK( JJPVT+NQ ) = DBLE( WORK( 3 ) ) * END IF * END IF * END IF * * Generate elementary reflector H(i) * CALL INFOG1L( I, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ II, ICURROW ) IF( DESCA( M_ ).EQ.1 ) THEN IF( MYROW.EQ.ICURROW ) THEN IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*DESCA( LLD_ ) AJJ = A( IOFFA ) CALL ZLARFG( 1, AJJ, A( IOFFA ), 1, TAU( JJ ) ) IF( N.GT.1 ) THEN ALPHA = CMPLX( ONE ) - DCONJG( TAU( JJ ) ) CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1 ) CALL ZSCAL( NQ-JJ, ALPHA, A( IOFFA+DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( JJ ), 1 ) A( IOFFA ) = AJJ ELSE IF( N.GT.1 ) THEN CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, ICURROW, ICURCOL ) CALL ZSCAL( NQ-JJ+1, ALPHA, A( I ), DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.ICURCOL ) THEN CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), $ 1, ICURROW, ICURCOL ) END IF * ELSE * CALL PZLARFG( M-J+JA, AJJ, I, J, A, MIN( I+1, IA+M-1 ), J, $ DESCA, 1, TAU ) IF( J.LT.JA+N-1 ) THEN * * Apply H(i) to A(ia+j-ja:ia+m-1,j+1:ja+n-1) from the left * CALL PZELSET( A, I, J, DESCA, DCMPLX( ONE ) ) CALL PZLARFC( 'Left', M-J+JA, JA+N-1-J, A, I, J, DESCA, $ 1, TAU, A, I, J+1, DESCA, WORK ) END IF CALL PZELSET( A, I, J, DESCA, AJJ ) * END IF * * Update partial columns norms * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 IF( MOD( J, DESCA( NB_ ) ).EQ.0 ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) IF( (JJA+NQ-JJ).GT.0 ) THEN IF( MYROW.EQ.ICURROW ) THEN CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, JJA+NQ-JJ, $ A( II+( MIN( JJA+NQ-1, JJ )-1 )*LDA ), $ LDA ) CALL ZCOPY( JJA+NQ-JJ, A( II+( MIN( JJA+NQ-1, JJ ) $ -1)*LDA ), LDA, WORK( MIN( JJA+NQ-1, JJ ) ), $ 1 ) ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', JJA+NQ-JJ, 1, $ WORK( MIN( JJA+NQ-1, JJ ) ), MAX( 1, NQ ), $ ICURROW, MYCOL ) END IF END IF * JN = MIN( ICEIL( J+1, DESCA( NB_ ) ) * DESCA( NB_ ), $ JA + N - 1 ) IF( MYCOL.EQ.ICURCOL ) THEN DO 90 LL = JJ, JJ + JN - J - 1 IF( RWORK( LL ).NE.ZERO ) THEN TEMP = ONE-( ABS( WORK( LL ) ) / RWORK( LL ) )**2 TEMP = MAX( TEMP, ZERO ) TEMP2 = ONE + 0.05D+0*TEMP* $ ( RWORK( LL ) / RWORK( NQ+LL ) )**2 IF( TEMP2.EQ.ONE ) THEN IF( IA+M-1.GT.I ) THEN CALL PDZNRM2( IA+M-I-1, RWORK( LL ), A, $ I+1, J+LL-JJ, DESCA, 1 ) RWORK( NQ+LL ) = RWORK( LL ) ELSE RWORK( LL ) = ZERO RWORK( NQ+LL ) = ZERO END IF ELSE RWORK( LL ) = RWORK( LL ) * SQRT( TEMP ) END IF END IF 90 CONTINUE JJ = JJ + JN - J END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 110 K = JN+1, JA+N-1, DESCA( NB_ ) KB = MIN( JA+N-K, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 100 LL = JJ, JJ+KB-1 IF( RWORK(LL).NE.ZERO ) THEN TEMP = ONE-( ABS( WORK( LL ) ) / RWORK( LL ) )**2 TEMP = MAX( TEMP, ZERO ) TEMP2 = ONE + 0.05D+0*TEMP* $ ( RWORK( LL ) / RWORK( NQ+LL ) )**2 IF( TEMP2.EQ.ONE ) THEN IF( IA+M-1.GT.I ) THEN CALL PDZNRM2( IA+M-I-1, RWORK( LL ), A, $ I+1, K+LL-JJ, DESCA, 1 ) RWORK( NQ+LL ) = RWORK( LL ) ELSE RWORK( LL ) = ZERO RWORK( NQ+LL ) = ZERO END IF ELSE RWORK( LL ) = RWORK( LL ) * SQRT( TEMP ) END IF END IF 100 CONTINUE JJ = JJ + KB END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * 110 CONTINUE * 120 CONTINUE * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) RWORK( 1 ) = DBLE( LRWMIN ) * RETURN * * End of PZGEQPF * END scalapack-2.0.2/SRC/pzgeqr2.f000644 000766 000024 00000027564 10363532303 016115 0ustar00juliestaff000000 000000 SUBROUTINE PZGEQR2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEQR2 computes a QR factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Mp0 + MAX( 1, Nq0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(m,n). * * Each H(i) has the form * * H(j) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1), and tau in TAU(ja+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, II, IACOL, IAROW, ICTXT, J, JJ, K, LWMIN, $ MP, MYCOL, MYROW, NPCOL, NPROW, NQ COMPLEX*16 AJJ, ALPHA * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZLARFC, PZLARFG, ZGEBR2D, ZGEBS2D, $ ZLARFG, ZSCAL * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MP + MAX( 1, NQ ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEQR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) I = II+(JJ-1)*DESCA( LLD_ ) IF( MYCOL.EQ.IACOL ) THEN AJJ = A( I ) CALL ZLARFG( 1, AJJ, A( I ), 1, TAU( JJ ) ) IF( N.GT.1 ) THEN ALPHA = ONE - DCONJG( TAU( JJ ) ) CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1 ) CALL ZSCAL( NQ-JJ, ALPHA, A( I+DESCA( LLD_ ) ), $ DESCA( LLD_ ) ) END IF CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), $ 1 ) A( I ) = AJJ ELSE IF( N.GT.1 ) THEN CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, $ 1, IAROW, IACOL ) CALL ZSCAL( NQ-JJ+1, ALPHA, A( I ), DESCA( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAU( JJ ), 1, $ IAROW, IACOL ) END IF * ELSE * K = MIN( M, N ) DO 10 J = JA, JA+K-1 I = IA + J - JA * * Generate elementary reflector H(j) to annihilate * A(i+1:ia+m-1,j) * CALL PZLARFG( M-J+JA, AJJ, I, J, A, MIN( I+1, IA+M-1 ), J, $ DESCA, 1, TAU ) IF( J.LT.JA+N-1 ) THEN * * Apply H(j)' to A(i:ia+m-1,j+1:ja+n-1) from the left * CALL PZELSET( A, I, J, DESCA, ONE ) * CALL PZLARFC( 'Left', M-J+JA, N-J+JA-1, A, I, J, DESCA, $ 1, TAU, A, I, J+1, DESCA, WORK ) END IF CALL PZELSET( A, I, J, DESCA, AJJ ) * 10 CONTINUE * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGEQR2 * END scalapack-2.0.2/SRC/pzgeqrf.f000644 000766 000024 00000027564 10363532303 016201 0ustar00juliestaff000000 000000 SUBROUTINE PZGEQRF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGEQRF computes a QR factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = Q * R. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(M,N) by N * upper trapezoidal matrix R (R is upper triangular if M >= N); * the elements below the diagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCc(JA+MIN(M,N)-1). This array contains the scalar factors * TAU of the elementary reflectors. TAU is tied to the * distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( Mp0 + Nq0 + NB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(m,n). * * Each H(i) has the form * * H(j) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in * A(ia+i:ia+m-1,ja+i-1), and tau in TAU(ja+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IINFO, IPW, J, $ JB, JN, K, LWMIN, MP0, MYCOL, MYROW, NPCOL, $ NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZGEQR2, PZLARFB, $ PZLARFT * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MP0 + NQ0 + DESCA( NB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGEQRF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JB = JN - JA + 1 * * Compute the QR factorization of the first block A(ia:ia+m-1,ja:jn) * CALL PZGEQR2( M, JB, A, IA, JA, DESCA, TAU, WORK, LWORK, IINFO ) * IF( JA+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(ja) H(ja+1) . . . H(jn) * CALL PZLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:ia+m-1,ja+jb:ja+n-1) from the left * CALL PZLARFB( 'Left', 'Conjugate transpose', 'Forward', $ 'Columnwise', M, N-JB, JB, A, IA, JA, DESCA, $ WORK, A, IA, JA+JB, DESCA, WORK( IPW ) ) END IF * * Loop over the remaining blocks of columns * DO 10 J = JN+1, JA+K-1, DESCA( NB_ ) JB = MIN( K-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Compute the QR factorization of the current block * A(i:ia+m-1,j:j+jb-1) * CALL PZGEQR2( M-J+JA, JB, A, I, J, DESCA, TAU, WORK, LWORK, $ IINFO ) * IF( J+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PZLARFT( 'Forward', 'Columnwise', M-J+JA, JB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(i:ia+m-1,j+jb:ja+n-1) from the left * CALL PZLARFB( 'Left', 'Conjugate transpose', 'Forward', $ 'Columnwise', M-J+JA, N-J-JB+JA, JB, A, I, J, $ DESCA, WORK, A, I, J+JB, DESCA, WORK( IPW ) ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGEQRF * END scalapack-2.0.2/SRC/pzgerfs.f000644 000766 000024 00000102736 10363532303 016176 0ustar00juliestaff000000 000000 SUBROUTINE PZGERFS( TRANS, N, NRHS, A, IA, JA, DESCA, AF, IAF, $ JAF, DESCAF, IPIV, B, IB, JB, DESCB, X, IX, $ JX, DESCX, FERR, BERR, WORK, LWORK, RWORK, $ LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IAF, IB, IX, INFO, JA, JAF, JB, JX, $ LRWORK, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IPIV( * ) DOUBLE PRECISION BERR( * ), FERR( * ), RWORK( * ) COMPLEX*16 A( * ), AF( * ), B( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PZGERFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solutions. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations. * = 'N': sub( A ) * sub( X ) = sub( B ) (No transpose) * = 'T': sub( A )**T * sub( X ) = sub( B ) (Transpose) * = 'C': sub( A )**H * sub( X ) = sub( B ) * (Conjugate transpose) * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) COMPLEX*16 pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input) COMPLEX*16 pointer into the local * memory to an array of local dimension (LLD_AF,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * factors of the matrix sub( A ) = P * L * U as computed by * PZGETRF. * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * IPIV (local input) INTEGER array of dimension LOCr(M_AF)+MB_AF. * This array contains the pivoting information as computed * by PZGETRF. IPIV(i) -> The global row local row i * was swapped with. This array is tied to the distributed * matrix A. * * B (local input) COMPLEX*16 pointer into the local * memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1)). This array contains the local * pieces of the distributed matrix of right hand sides * sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input and output) COMPLEX*16 pointer into the * local memory to an array of local dimension * (LLD_X,LOCc(JX+NRHS-1)). On entry, this array contains * the local pieces of the distributed matrix solution * sub( X ). On exit, the improved solution vectors. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). * The estimated forward error bound for each solution vector * of sub( X ). If XTRUE is the true solution corresponding * to sub( X ), FERR is an estimated upper bound for the * magnitude of the largest element in (sub( X ) - XTRUE) * divided by the magnitude of the largest element in sub( X ). * The estimate is as reliable as the estimate for RCOND, and * is almost always a slight overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr( N + MOD(IA-1,MB_A) ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCr( N + MOD(IB-1,MB_B) ). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices op( A ) and op( AF ) (respectively * sub( X ) and sub( B ) ) should be distributed the same way on the * same processes. These conditions ensure that sub( A ) and sub( AF ) * (resp. sub( X ) and sub( B ) ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( AF ), sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IAF, DESCAF( MB_ ) ) = f( JAF, DESCAF( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) DOUBLE PRECISION ZERO, RONE, TWO, THREE PARAMETER ( ZERO = 0.0D+0, RONE = 1.0D+0, TWO = 2.0D+0, $ THREE = 3.0D+0 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOTRAN CHARACTER TRANSN, TRANST INTEGER COUNT, IACOL, IAFCOL, IAFROW, IAROW, IXBCOL, $ IXBROW, IXCOL, IXROW, ICOFFA, ICOFFAF, ICOFFB, $ ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, IIW, $ IOFFXB, IPB, IPR, IPV, IROFFA, IROFFAF, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LRWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ DOUBLE PRECISION EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN COMPLEX*16 ZDUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMX2D, $ INFOG2L, PCHK2MAT, PXERBLA, PZAGEMV, PZAXPY, $ PZCOPY, PZGEMV, PZGETRS, PZLACON, $ ZGEBR2D, ZGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DIMAG, ICHAR, MAX, MIN, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * NOTRAN = LSAME( TRANS, 'N' ) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, N, 2, IAF, JAF, DESCAF, 11, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 16, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IX, JX, DESCX, 20, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFAF = MOD( JAF-1, DESCAF( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFCOL = INDXG2P( JAF, DESCAF( NB_ ), MYCOL, $ DESCAF( CSRC_ ), NPCOL ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 2 * NPMOD LRWMIN = NPMOD WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) RWORK( 1 ) = DBLE( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( ( .NOT.NOTRAN ) .AND. ( .NOT.LSAME( TRANS, 'T' ) ) .AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700 + NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCAF( MB_ ) ) THEN INFO = -( 1100 + MB_ ) ELSE IF( IROFFAF.NE.0 .OR. IAROW.NE.IAFROW ) THEN INFO = -9 ELSE IF( DESCA( NB_ ).NE.DESCAF( NB_ ) ) THEN INFO = -( 1100 + NB_ ) ELSE IF( ICOFFAF.NE.0 .OR. IACOL.NE.IAFCOL ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -( 1100 + CTXT_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -14 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1600 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1600 + CTXT_ ) ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 2000 + MB_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -18 ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 2000 + NB_ ) ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -19 ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 2000 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -24 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -26 END IF END IF * IF( NOTRAN ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 1 ) = ICHAR( 'T' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 IDUM1( 2 ) = N IDUM2( 2 ) = 2 IDUM1( 3 ) = NRHS IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 24 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 26 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, N, 2, IAF, $ JAF, DESCAF, 11, 5, IDUM1, IDUM2, INFO ) CALL PCHK2MAT( N, 2, NRHS, 3, IB, JB, DESCB, 16, N, 2, NRHS, 3, $ IX, JX, DESCX, 20, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGERFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * IF( NOTRAN ) THEN TRANSN = 'N' TRANST = 'C' ELSE TRANSN = 'C' TRANST = 'N' END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = 1 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = 1 + maximum number of nonzero entries in each row of sub( A ) * NZ = N + 1 EPS = PDLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) ) * DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 100 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 20 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X), * where op(sub(A)) = sub(A), or sub(A)' (A**T or A**H), * depending on TRANS. * CALL PZCOPY( N, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PZGEMV( TRANS, N, N, -ONE, A, IA, JA, DESCA, X, IX, $ JX+K, DESCX, 1, ONE, WORK( IPR ), IW, JW, $ DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIXB, IIXB + NP - 1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 30 CONTINUE END IF END IF * CALL PZAGEMV( TRANS, N, N, RONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, RONE, RWORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 40 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during the * last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PZGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PZAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, IX, $ JX+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 20 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(op(sub(A))))* * ( abs(R) + NZ*EPS*( * abs(op(sub(A)))*abs(sub(X))+abs(sub(B)))))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(sub(A))) is the inverse of op(sub(A)) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(op(sub(A)))*abs(sub(X)) + abs(sub(B)) is less than * SAFE2. * * Use PZLACON to estimate the infinity-norm of the matrix * inv(op(sub(A))) * diag(W), where * W = abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 50 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 50 CONTINUE END IF END IF * KASE = 0 60 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PZLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(sub(A))'). * CALL PZGETRS( TRANST, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF ELSE * * Multiply by inv(op(sub(A)))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 80 CONTINUE END IF END IF * CALL PZGETRS( TRANSN, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 60 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 90 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 90 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 100 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 200 J = JN+1, JB+NRHS-1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 190 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 110 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X), * where op(sub(A)) = sub(A), or sub(A)' (A**T or A**H), * depending on TRANS. * CALL PZCOPY( N, B, IB, J+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PZGEMV( TRANS, N, N, -ONE, A, IA, JA, DESCA, X, $ IX, J+K, DESCX, 1, ONE, WORK( IPR ), IW, JW, $ DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) (abs(R(i))/(abs(op(sub(A)))*abs(sub(X)) + * abs(sub(B)))(i)) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIXB, IIXB+NP-1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 120 CONTINUE END IF END IF * CALL PZAGEMV( TRANS, N, N, RONE, A, IA, JA, DESCA, X, IX, $ J+K, DESCX, 1, RONE, RWORK( IPB ), IW, JW, $ DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 )THEN DO 130 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 130 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.ICURCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during * the last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. $ COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PZGETRS( TRANS, N, 1, AF, IAF, JAF, DESCAF, IPIV, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PZAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, $ IX, J+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 110 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(op(sub(A))))* * ( abs(R) + NZ*EPS*( * abs(op(sub(A)))*abs(sub(X))+abs(sub(B)))))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(sub(A))) is the inverse of op(sub(A)) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(op(sub(A)))*abs(sub(X)) + abs(sub(B)) is less than * SAFE2. * * Use PZLACON to estimate the infinity-norm of the matrix * inv(op(sub(A))) * diag(W), where * W = abs(R)+NZ*EPS*(abs(op(sub(A)))*abs(sub(X))+abs(sub(B))). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 140 CONTINUE END IF END IF * KASE = 0 150 CONTINUE IF( MYCOL.EQ.ICURCOL ) THEN CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, ICURCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PZLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = ICURCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(sub(A))'). * CALL PZGETRS( TRANST, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 160 CONTINUE END IF END IF ELSE * * Multiply by inv(op(sub(A)))*diag(W). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 170 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 170 CONTINUE END IF END IF * CALL PZGETRS( TRANSN, N, 1, AF, IAF, JAF, DESCAF, $ IPIV, WORK( IPR ), IW, JW, DESCW, $ INFO ) END IF GO TO 150 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 180 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 180 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, $ 1, IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 190 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 200 CONTINUE * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) RWORK( 1 ) = DBLE( LRWMIN ) * RETURN * * End of PZGERFS * END scalapack-2.0.2/SRC/pzgerq2.f000644 000766 000024 00000025174 10363532303 016110 0ustar00juliestaff000000 000000 SUBROUTINE PZGERQ2( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGERQ2 computes a RQ factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = R * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAU, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia)' H(ia+1)' . . . H(ia+k-1)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; conjg(v(1:n-k+i-1)) is stored on * exit in A(ia+m-k+i-1,ja:ja+n-k+i-2), and tau in TAU(ia+m-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, J, K, LWMIN, MP, MYCOL, $ MYROW, NPCOL, NPROW, NQ COMPLEX*16 AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZLACGV, PZLARF, PZLARFG * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQ + MAX( 1, MP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGERQ2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * K = MIN( M, N ) DO 10 I = IA+K-1, IA, -1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * A(i+m-k,ja:j+n-k-1) * CALL PZLACGV( N-K+J-JA+1, A, I+M-K, JA, DESCA, DESCA( M_ ) ) CALL PZLARFG( N-K+J-JA+1, AII, I+M-K, J+N-K, A, I+M-K, JA, $ DESCA, DESCA( M_ ), TAU ) * * Apply H(i) to A(ia:i+m-k-1,ja:j+n-k) from the right * CALL PZELSET( A, I+M-K, J+N-K, DESCA, ONE ) CALL PZLARF( 'Right', M-K+I-IA, N-K+J-JA+1, A, M-K+I, JA, $ DESCA, DESCA( M_ ), TAU, A, IA, JA, DESCA, WORK ) CALL PZELSET( A, I+M-K, J+N-K, DESCA, AII ) CALL PZLACGV( N-K+J-JA+1, A, I+M-K, JA, DESCA, DESCA( M_ ) ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGERQ2 * END scalapack-2.0.2/SRC/pzgerqf.f000644 000766 000024 00000027276 10363532303 016201 0ustar00juliestaff000000 000000 SUBROUTINE PZGERQF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZGERQF computes a RQ factorization of a complex distributed M-by-N * matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) = R * Q. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAU, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia)' H(ia+1)' . . . H(ia+k-1)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; conjg(v(1:n-k+i-1)) is stored on * exit in A(ia+m-k+i-1,ja:ja+n-k+i-2), and tau in TAU(ia+m-k+i-1). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IL, IN, IPW, $ K, LWMIN, MP0, MU, MYCOL, MYROW, NPCOL, NPROW, $ NQ0, NU * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZGERQ2, PZLARFB, $ PZLARFT * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -9 END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGERQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * K = MIN( M, N ) IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IL = MAX( ( (IA+M-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * IF( IL.GE.IN+1 ) THEN * * Use blocked code initially * DO 10 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Compute the RQ factorization of the current block * A(i:i+ib-1,ja:ja+n-m+i+ib-ia-1) * CALL PZGERQ2( IB, N-M+I+IB-IA, A, I, JA, DESCA, TAU, WORK, $ LWORK, IINFO ) * IF( I.GT.IA ) THEN * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PZLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, $ I, JA, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:i-1,ja:ja+n-m+i+ib-ia-1) from the * right * CALL PZLARFB( 'Right', 'No transpose', 'Backward', $ 'Rowwise', I-IA, N-M+I+IB-IA, IB, A, I, JA, $ DESCA, WORK, A, IA, JA, DESCA, $ WORK( IPW ) ) END IF * 10 CONTINUE * MU = IN - IA + 1 NU = N - M + IN - IA + 1 * ELSE * MU = M NU = N * END IF * * Use unblocked code to factor the last or only block * IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL PZGERQ2( MU, NU, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZGERQF * END scalapack-2.0.2/SRC/pzgesv.f000644 000766 000024 00000023116 10367447133 016040 0ustar00juliestaff000000 000000 SUBROUTINE PZGESV( N, NRHS, A, IA, JA, DESCA, IPIV, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * Jan 30, 2006 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) COMPLEX*16 A( * ), B( * ) * .. * * Purpose * ======= * * PZGESV computes the solution to a complex system of linear equations * * sub( A ) * X = sub( B ), * * where sub( A ) = A(IA:IA+N-1,JA:JA+N-1) is an N-by-N distributed * matrix and X and sub( B ) = B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS * distributed matrices. * * The LU decomposition with partial pivoting and row interchanges is * used to factor sub( A ) as sub( A ) = P * L * U, where P is a permu- * tation matrix, L is unit lower triangular, and U is upper triangular. * L and U are stored in sub( A ). The factored form of sub( A ) is then * used to solve the system of equations sub( A ) * X = sub( B ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the N-by-N distributed matrix * sub( A ) to be factored. On exit, this array contains the * local pieces of the factors L and U from the factorization * sub( A ) = P*L*U; the unit diagonal elements of L are not * stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, the right hand side * distributed matrix sub( B ). On exit, if INFO = 0, sub( B ) * is overwritten by the solution distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, so the solution could not be * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PXERBLA, $ PZGETRF, PZGETRS * .. * .. External Functions .. INTEGER INDXG2P EXTERNAL INDXG2P * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( N, 1, NRHS, 2, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IF( IROFFA.NE.0 ) THEN INFO = -4 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( IBROW.NE.IAROW .OR. ICOFFA.NE.IROFFB ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1100+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(1100+CTXT_) END IF END IF CALL PCHK2MAT( N, 1, N, 1, IA, JA, DESCA, 6, N, 1, NRHS, 2, $ IB, JB, DESCB, 11, 0, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGESV', -INFO ) RETURN END IF * * Compute the LU factorization of sub( A ). * CALL PZGETRF( N, N, A, IA, JA, DESCA, IPIV, INFO ) * IF( INFO.EQ.0 ) THEN * * Solve the system sub( A ) * X = sub( B ), overwriting sub( B ) * with X. * CALL PZGETRS( 'No transpose', N, NRHS, A, IA, JA, DESCA, IPIV, $ B, IB, JB, DESCB, INFO ) * END IF * RETURN * * End of PZGESV * END scalapack-2.0.2/SRC/pzgesvd.f000644 000766 000024 00000056063 10377355407 016216 0ustar00juliestaff000000 000000 SUBROUTINE PZGESVD(JOBU,JOBVT,M,N,A,IA,JA,DESCA,S,U,IU,JU,DESCU, + VT,IVT,JVT,DESCVT,WORK,LWORK,RWORK,INFO) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Oak Ridge National Laboratory * and Univ. of California Berkeley. * Jan 2006 * * .. Scalar Arguments .. CHARACTER JOBU,JOBVT INTEGER IA,INFO,IU,IVT,JA,JU,JVT,LWORK,M,N * .. * .. Array Arguments .. INTEGER DESCA(*),DESCU(*),DESCVT(*) COMPLEX*16 A(*),U(*),VT(*),WORK(*) DOUBLE PRECISION S(*) DOUBLE PRECISION RWORK(*) * .. * * Purpose * ======= * * PZGESVD computes the singular value decomposition (SVD) of an * M-by-N matrix A, optionally computing the left and/or right * singular vectors. The SVD is written as * * A = U * SIGMA * transpose(V) * * where SIGMA is an M-by-N matrix which is zero except for its * min(M,N) diagonal elements, U is an M-by-M orthogonal matrix, and * V is an N-by-N orthogonal matrix. The diagonal elements of SIGMA * are the singular values of A and the columns of U and V are the * corresponding right and left singular vectors, respectively. The * singular values are returned in array S in decreasing order and * only the first min(M,N) columns of U and rows of VT = V**T are * computed. * * Notes * ===== * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, and * assume that its process grid has dimension r x c. LOCr( K ) denotes * the number of elements of K that a process would receive if K were * distributed over the r processes of its process column. Similarly, * LOCc( K ) denotes the number of elements of K that a process would * receive if K were distributed over the c processes of its process * row. The values of LOCr() and LOCc() may be determined via a call * to the ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * MP = number of local rows in A and U * NQ = number of local columns in A and VT * SIZE = min( M, N ) * SIZEQ = number of local columns in U * SIZEP = number of local rows in VT * * JOBU (global input) CHARACTER*1 * Specifies options for computing U: * = 'V': the first SIZE columns of U (the left singular * vectors) are returned in the array U; * = 'N': no columns of U (no left singular vectors) are * computed. * * JOBVT (global input) CHARACTER*1 * Specifies options for computing V**T: * = 'V': the first SIZE rows of V**T (the right singular * vectors) are returned in the array VT; * = 'N': no rows of V**T (no right singular vectors) are * computed. * * M (global input) INTEGER * The number of rows of the input matrix A. M >= 0. * * N (global input) INTEGER * The number of columns of the input matrix A. N >= 0. * * A (local input/workspace) block cyclic COMPLEX*16 * array, * global dimension (M, N), local dimension (MP, NQ) * On exit, the contents of A are destroyed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix A. * * S (global output) DOUBLE PRECISION array, dimension SIZE * The singular values of A, sorted so that S(i) >= S(i+1). * * U (local output) COMPLEX*16 array, local dimension * (MP, SIZEQ), global dimension (M, SIZE) * if JOBU = 'V', U contains the first min(m,n) columns of U * if JOBU = 'N', U is not referenced. * * IU (global input) INTEGER * The row index in the global array U indicating the first * row of sub( U ). * * JU (global input) INTEGER * The column index in the global array U indicating the * first column of sub( U ). * * DESCU (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix U. * * VT (local output) COMPLEX*16 array, local dimension * (SIZEP, NQ), global dimension (SIZE, N). * If JOBVT = 'V', VT contains the first SIZE rows of * V**T. If JOBVT = 'N', VT is not referenced. * * IVT (global input) INTEGER * The row index in the global array VT indicating the first * row of sub( VT ). * * JVT (global input) INTEGER * The column index in the global array VT indicating the * first column of sub( VT ). * * DESCVT (global input) INTEGER array of dimension DLEN_ * The array descriptor for the distributed matrix VT. * * WORK (local workspace/output) COMPLEX*16 array, dimension * (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (local input) INTEGER * The dimension of the array WORK. * * LWORK >= 1 + 2*SIZEB + MAX(WATOBD, WBDTOSVD), * * where SIZEB = MAX(M,N), and WATOBD and WBDTOSVD refer, * respectively, to the workspace required to bidiagonalize * the matrix A and to go from the bidiagonal matrix to the * singular value decomposition U*S*VT. * * For WATOBD, the following holds: * * WATOBD = MAX(MAX(WPZLANGE,WPZGEBRD), * MAX(WPZLARED2D,WP(pre)LARED1D)), * * where WPZLANGE, WPZLARED1D, WPZLARED2D, WPZGEBRD are the * workspaces required respectively for the subprograms * PZLANGE, PDLARED1D, PDLARED2D, PZGEBRD. Using the * standard notation * * MP = NUMROC( M, MB, MYROW, DESCA( CTXT_ ), NPROW), * NQ = NUMROC( N, NB, MYCOL, DESCA( LLD_ ), NPCOL), * * the workspaces required for the above subprograms are * * WPZLANGE = MP, * WPDLARED1D = NQ0, * WPDLARED2D = MP0, * WPZGEBRD = NB*(MP + NQ + 1) + NQ, * * where NQ0 and MP0 refer, respectively, to the values obtained * at MYCOL = 0 and MYROW = 0. In general, the upper limit for * the workspace is given by a workspace required on * processor (0,0): * * WATOBD <= NB*(MP0 + NQ0 + 1) + NQ0. * * In case of a homogeneous process grid this upper limit can * be used as an estimate of the minimum workspace for every * processor. * * For WBDTOSVD, the following holds: * * WBDTOSVD = SIZE*(WANTU*NRU + WANTVT*NCVT) + * MAX(WZBDSQR, * MAX(WANTU*WPZORMBRQLN, WANTVT*WPZORMBRPRT)), * * where * * 1, if left(right) singular vectors are wanted * WANTU(WANTVT) = * 0, otherwise * * and WZBDSQR, WPZORMBRQLN and WPZORMBRPRT refer respectively * to the workspace required for the subprograms ZBDSQR, * PZUNMBR(QLN), and PZUNMBR(PRT), where QLN and PRT are the * values of the arguments VECT, SIDE, and TRANS in the call * to PZUNMBR. NRU is equal to the local number of rows of * the matrix U when distributed 1-dimensional "column" of * processes. Analogously, NCVT is equal to the local number * of columns of the matrix VT when distributed across * 1-dimensional "row" of processes. Calling the LAPACK * procedure ZBDSQR requires * * WZBDSQR = MAX(1, 4*SIZE ) * * on every processor. Finally, * * WPZORMBRQLN = MAX( (NB*(NB-1))/2, (SIZEQ+MP)*NB)+NB*NB, * WPZORMBRPRT = MAX( (MB*(MB-1))/2, (SIZEP+NQ)*MB )+MB*MB, * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the work array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * RWORK (workspace) REAL array, dimension (1+4*SIZEB) * On exit, if INFO = 0, RWORK(1) returns the necessary size * for RWORK. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if ZBDSQR did not converge * If INFO = MIN(M,N) + 1, then PZGESVD has detected * heterogeneity by finding that eigenvalues were not * identical across the process grid. In this case, the * accuracy of the results from PZGESVD cannot be * guaranteed. * * ===================================================================== * * The results of PZGEBRD, and therefore PZGESVD, may vary slightly * from run to run with the same input data. If repeatability is an * issue, call BLACS_SET with the appropriate option after defining * the process grid. * * Alignment requirements * ====================== * * The routine PZGESVD inherits the same alignement requirement as * the routine PZGEBRD, namely: * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expressions should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) * where NB = MB_A = NB_A, * IROFFA = MOD( IA-1, NB ), ICOFFA = MOD( JA-1, NB ), * * ===================================================================== * * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D,DLEN_,DTYPE_,CTXT_,M_,N_,MB_,NB_,RSRC_, + CSRC_,LLD_,ITHVAL PARAMETER (BLOCK_CYCLIC_2D=1,DLEN_=9,DTYPE_=1,CTXT_=2,M_=3,N_=4, + MB_=5,NB_=6,RSRC_=7,CSRC_=8,LLD_=9,ITHVAL=10) COMPLEX*16 ZERO,ONE PARAMETER (ZERO= ((0.0D+0,0.0D+0)),ONE= ((1.0D+0,0.0D+0))) DOUBLE PRECISION DZERO,DONE PARAMETER (DZERO=0.0D+0,DONE=1.0D+0) * .. * .. Local Scalars .. CHARACTER UPLO INTEGER CONTEXTC,CONTEXTR,I,INDD,INDD2,INDE,INDE2,INDTAUP,INDTAUQ, + INDU,INDV,INDWORK,IOFFD,IOFFE,ISCALE,J,K,LDU,LDVT,LLWORK, + LWMIN,MAXIM,MB,MP,MYPCOL,MYPCOLC,MYPCOLR,MYPROW,MYPROWC, + MYPROWR,NB,NCVT,NPCOL,NPCOLC,NPCOLR,NPROCS,NPROW,NPROWC, + NPROWR,NQ,NRU,SIZE,SIZEB,SIZEP,SIZEPOS,SIZEQ,WANTU,WANTVT, + WATOBD,WBDTOSVD,WZBDSQR,WPZGEBRD,WPZLANGE,WPZORMBRPRT, + WPZORMBRQLN DOUBLE PRECISION ANRM,BIGNUM,EPS,RMAX,RMIN,SAFMIN,SIGMA,SMLNUM * .. * .. Local Arrays .. INTEGER DESCTU(DLEN_),DESCTVT(DLEN_),IDUM1(3),IDUM2(3) DOUBLE PRECISION C(1,1) * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC DOUBLE PRECISION PDLAMCH,PZLANGE EXTERNAL LSAME,NUMROC,PDLAMCH,PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GET,BLACS_GRIDEXIT,BLACS_GRIDINFO,BLACS_GRIDINIT, + CHK1MAT,ZBDSQR,DESCINIT,DGAMN2D,DGAMX2D,DSCAL,IGAMX2D, + IGEBR2D,IGEBS2D,PCHK1MAT,PZGEBRD,PZGEMR2D,PDLARED1D, + PDLARED2D,PZLASCL,PZLASET,PZUNMBR,PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX,MIN,SQRT,DBLE INTRINSIC DCMPLX * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF (BLOCK_CYCLIC_2D*DTYPE_*LLD_*MB_*M_*NB_*N_.LT.0) RETURN * CALL BLACS_GRIDINFO(DESCA(CTXT_),NPROW,NPCOL,MYPROW,MYPCOL) ISCALE = 0 INFO = 0 * IF (NPROW.EQ.-1) THEN INFO = - (800+CTXT_) ELSE * SIZE = MIN(M,N) SIZEB = MAX(M,N) NPROCS = NPROW*NPCOL IF (M.GE.N) THEN IOFFD = JA - 1 IOFFE = IA - 1 SIZEPOS = 1 ELSE IOFFD = IA - 1 IOFFE = JA - 1 SIZEPOS = 3 END IF * IF (LSAME(JOBU,'V')) THEN WANTU = 1 ELSE WANTU = 0 END IF IF (LSAME(JOBVT,'V')) THEN WANTVT = 1 ELSE WANTVT = 0 END IF * CALL CHK1MAT(M,3,N,4,IA,JA,DESCA,8,INFO) IF (WANTU.EQ.1) THEN CALL CHK1MAT(M,3,SIZE,SIZEPOS,IU,JU,DESCU,13,INFO) END IF IF (WANTVT.EQ.1) THEN CALL CHK1MAT(SIZE,SIZEPOS,N,4,IVT,JVT,DESCVT,17,INFO) END IF CALL IGAMX2D(DESCA(CTXT_),'A',' ',1,1,INFO,1,1,1,-1,-1,0) * IF (INFO.EQ.0) THEN * * Set up pointers into the WORK array. * INDD = 2 INDE = INDD + SIZEB + IOFFD INDD2 = INDE + SIZEB + IOFFE INDE2 = INDD2 + SIZEB + IOFFD * INDTAUQ = 2 INDTAUP = INDTAUQ + SIZEB + JA - 1 INDWORK = INDTAUP + SIZEB + IA - 1 LLWORK = LWORK - INDWORK + 1 * * Initialize contexts for "column" and "row" process matrices. * CALL BLACS_GET(DESCA(CTXT_),10,CONTEXTC) CALL BLACS_GRIDINIT(CONTEXTC,'R',NPROCS,1) CALL BLACS_GRIDINFO(CONTEXTC,NPROWC,NPCOLC,MYPROWC, + MYPCOLC) CALL BLACS_GET(DESCA(CTXT_),10,CONTEXTR) CALL BLACS_GRIDINIT(CONTEXTR,'R',1,NPROCS) CALL BLACS_GRIDINFO(CONTEXTR,NPROWR,NPCOLR,MYPROWR, + MYPCOLR) * * Set local dimensions of matrices (this is for MB=NB=1). * NRU = NUMROC(M,1,MYPROWC,0,NPROCS) NCVT = NUMROC(N,1,MYPCOLR,0,NPROCS) NB = DESCA(NB_) MB = DESCA(MB_) MP = NUMROC(M,MB,MYPROW,DESCA(RSRC_),NPROW) NQ = NUMROC(N,NB,MYPCOL,DESCA(CSRC_),NPCOL) IF (WANTVT.EQ.1) THEN SIZEP = NUMROC(SIZE,DESCVT(MB_),MYPROW,DESCVT(RSRC_), + NPROW) ELSE SIZEP = 0 END IF IF (WANTU.EQ.1) THEN SIZEQ = NUMROC(SIZE,DESCU(NB_),MYPCOL,DESCU(CSRC_), + NPCOL) ELSE SIZEQ = 0 END IF * * Transmit MAX(NQ0, MP0). * IF (MYPROW.EQ.0 .AND. MYPCOL.EQ.0) THEN MAXIM = MAX(NQ,MP) CALL IGEBS2D(DESCA(CTXT_),'All',' ',1,1,MAXIM,1) ELSE CALL IGEBR2D(DESCA(CTXT_),'All',' ',1,1,MAXIM,1,0,0) END IF * WPZLANGE = MP WPZGEBRD = NB* (MP+NQ+1) + NQ WATOBD = MAX(MAX(WPZLANGE,WPZGEBRD),MAXIM) * WZBDSQR = MAX(1,4*SIZE) WPZORMBRQLN = MAX((NB* (NB-1))/2, (SIZEQ+MP)*NB) + NB*NB WPZORMBRPRT = MAX((MB* (MB-1))/2, (SIZEP+NQ)*MB) + MB*MB WBDTOSVD = SIZE* (WANTU*NRU+WANTVT*NCVT) + + MAX(WZBDSQR,MAX(WANTU*WPZORMBRQLN, + WANTVT*WPZORMBRPRT)) * * Finally, calculate required workspace. * LWMIN = 1 + 2*SIZEB + MAX(WATOBD,WBDTOSVD) WORK(1) = DCMPLX(LWMIN,0D+00) RWORK(1) = DBLE(1+4*SIZEB) * IF (WANTU.NE.1 .AND. .NOT. (LSAME(JOBU,'N'))) THEN INFO = -1 ELSE IF (WANTVT.NE.1 .AND. .NOT. (LSAME(JOBVT,'N'))) THEN INFO = -2 ELSE IF (LWORK.LT.LWMIN .AND. LWORK.NE.-1) THEN INFO = -19 END IF * END IF * IDUM1(1) = WANTU IDUM1(2) = WANTVT IF (LWORK.EQ.-1) THEN IDUM1(3) = -1 ELSE IDUM1(3) = 1 END IF IDUM2(1) = 1 IDUM2(2) = 2 IDUM2(3) = 19 CALL PCHK1MAT(M,3,N,4,IA,JA,DESCA,8,3,IDUM1,IDUM2,INFO) IF (INFO.EQ.0) THEN IF (WANTU.EQ.1) THEN CALL PCHK1MAT(M,3,SIZE,4,IU,JU,DESCU,13,0,IDUM1,IDUM2, + INFO) END IF IF (WANTVT.EQ.1) THEN CALL PCHK1MAT(SIZE,3,N,4,IVT,JVT,DESCVT,17,0,IDUM1, + IDUM2,INFO) END IF END IF * END IF * IF (INFO.NE.0) THEN CALL PXERBLA(DESCA(CTXT_),'PZGESVD',-INFO) RETURN ELSE IF (LWORK.EQ.-1) THEN GO TO 40 END IF * * Quick return if possible. * IF (M.LE.0 .OR. N.LE.0) GO TO 40 * * Get machine constants. * SAFMIN = PDLAMCH(DESCA(CTXT_),'Safe minimum') EPS = PDLAMCH(DESCA(CTXT_),'Precision') SMLNUM = SAFMIN/EPS BIGNUM = DONE/SMLNUM RMIN = SQRT(SMLNUM) RMAX = MIN(SQRT(BIGNUM),DONE/SQRT(SQRT(SAFMIN))) * * Scale matrix to allowable range, if necessary. * ANRM = PZLANGE('1',M,N,A,IA,JA,DESCA,WORK(INDWORK)) IF (ANRM.GT.DZERO .AND. ANRM.LT.RMIN) THEN ISCALE = 1 SIGMA = RMIN/ANRM ELSE IF (ANRM.GT.RMAX) THEN ISCALE = 1 SIGMA = RMAX/ANRM END IF * IF (ISCALE.EQ.1) THEN CALL PZLASCL('G',DONE,SIGMA,M,N,A,IA,JA,DESCA,INFO) END IF * CALL PZGEBRD(M,N,A,IA,JA,DESCA,RWORK(INDD),RWORK(INDE), + WORK(INDTAUQ),WORK(INDTAUP),WORK(INDWORK),LLWORK, + INFO) * * Copy D and E to all processes. * Array D is in local array of dimension: * LOCc(JA+MIN(M,N)-1) if M >= N; LOCr(IA+MIN(M,N)-1) otherwise. * Array E is in local array of dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * IF (M.GE.N) THEN * Distribute D CALL PDLARED1D(N+IOFFD,IA,JA,DESCA,RWORK(INDD),RWORK(INDD2), + WORK(INDWORK),LLWORK) * Distribute E CALL PDLARED2D(M+IOFFE,IA,JA,DESCA,RWORK(INDE),RWORK(INDE2), + WORK(INDWORK),LLWORK) ELSE * Distribute D CALL PDLARED2D(M+IOFFD,IA,JA,DESCA,RWORK(INDD),RWORK(INDD2), + WORK(INDWORK),LLWORK) * Distribute E CALL PDLARED1D(N+IOFFE,IA,JA,DESCA,RWORK(INDE),RWORK(INDE2), + WORK(INDWORK),LLWORK) END IF * * Prepare for calling PZBDSQR. * IF (M.GE.N) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * INDU = INDWORK INDV = INDU + SIZE*NRU*WANTU INDWORK = INDV + SIZE*NCVT*WANTVT * LDU = MAX(1,NRU) LDVT = MAX(1,SIZE) * CALL DESCINIT(DESCTU,M,SIZE,1,1,0,0,CONTEXTC,LDU,INFO) CALL DESCINIT(DESCTVT,SIZE,N,1,1,0,0,CONTEXTR,LDVT,INFO) * IF (WANTU.EQ.1) THEN CALL PZLASET('Full',M,SIZE,ZERO,ONE,WORK(INDU),1,1,DESCTU) ELSE NRU = 0 END IF * IF (WANTVT.EQ.1) THEN CALL PZLASET('Full',SIZE,N,ZERO,ONE,WORK(INDV),1,1,DESCTVT) ELSE NCVT = 0 END IF * CALL ZBDSQR(UPLO,SIZE,NCVT,NRU,0,RWORK(INDD2+IOFFD), + RWORK(INDE2+IOFFE),WORK(INDV),SIZE,WORK(INDU),LDU,C,1, + WORK(INDWORK),INFO) * * Redistribute elements of U and VT in the block-cyclic fashion. * IF (WANTU.EQ.1) CALL PZGEMR2D(M,SIZE,WORK(INDU),1,1,DESCTU,U,IU, + JU,DESCU,DESCU(CTXT_)) * IF (WANTVT.EQ.1) CALL PZGEMR2D(SIZE,N,WORK(INDV),1,1,DESCTVT,VT, + IVT,JVT,DESCVT,DESCVT(CTXT_)) * * Set to ZERO "non-square" elements of the larger matrices U, VT. * IF (M.GT.N .AND. WANTU.EQ.1) THEN CALL PZLASET('Full',M-SIZE,SIZE,ZERO,ZERO,U,IA+SIZE,JU,DESCU) ELSE IF (N.GT.M .AND. WANTVT.EQ.1) THEN CALL PZLASET('Full',SIZE,N-SIZE,ZERO,ZERO,VT,IVT,JVT+SIZE, + DESCVT) END IF * * Multiply Householder rotations from bidiagonalized matrix. * IF (WANTU.EQ.1) CALL PZUNMBR('Q','L','N',M,SIZE,N,A,IA,JA,DESCA, + WORK(INDTAUQ),U,IU,JU,DESCU, + WORK(INDWORK),LLWORK,INFO) * IF (WANTVT.EQ.1) CALL PZUNMBR('P','R','C',SIZE,N,M,A,IA,JA,DESCA, + WORK(INDTAUP),VT,IVT,JVT,DESCVT, + WORK(INDWORK),LLWORK,INFO) * * Copy singular values into output array S. * DO 10 I = 1,SIZE S(I) = RWORK(INDD2+IOFFD+I-1) 10 CONTINUE * * If matrix was scaled, then rescale singular values appropriately. * IF (ISCALE.EQ.1) THEN CALL DSCAL(SIZE,ONE/SIGMA,S,1) END IF * * Compare every ith eigenvalue, or all if there are only a few, * across the process grid to check for heterogeneity. * IF (SIZE.LE.ITHVAL) THEN J = SIZE K = 1 ELSE J = SIZE/ITHVAL K = ITHVAL END IF * DO 20 I = 1,J RWORK(I+INDE) = S((I-1)*K+1) RWORK(I+INDD2) = S((I-1)*K+1) 20 CONTINUE * CALL DGAMN2D(DESCA(CTXT_),'a',' ',J,1,RWORK(1+INDE),J,1,1,-1,-1,0) CALL DGAMX2D(DESCA(CTXT_),'a',' ',J,1,RWORK(1+INDD2),J,1,1,-1,-1, + 0) * DO 30 I = 1,J IF ((RWORK(I+INDE)-RWORK(I+INDD2)).NE.DZERO) THEN INFO = SIZE + 1 END IF 30 CONTINUE * 40 CONTINUE * CALL BLACS_GRIDEXIT(CONTEXTC) CALL BLACS_GRIDEXIT(CONTEXTR) * * End of PZGESVD * RETURN END scalapack-2.0.2/SRC/pzgesvx.f000644 000766 000024 00000105004 10363532303 016213 0ustar00juliestaff000000 000000 SUBROUTINE PZGESVX( FACT, TRANS, N, NRHS, A, IA, JA, DESCA, AF, $ IAF, JAF, DESCAF, IPIV, EQUED, R, C, B, IB, $ JB, DESCB, X, IX, JX, DESCX, RCOND, FERR, $ BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER EQUED, FACT, TRANS INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, LRWORK, $ LWORK, N, NRHS DOUBLE PRECISION RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ), IPIV( * ) DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), $ RWORK( * ) COMPLEX*16 A( * ), AF( * ), B( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PZGESVX uses the LU factorization to compute the solution to a * complex system of linear equations * * A(IA:IA+N-1,JA:JA+N-1) * X = B(IB:IB+N-1,JB:JB+NRHS-1), * * where A(IA:IA+N-1,JA:JA+N-1) is an N-by-N matrix and X and * B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS matrices. * * Error bounds on the solution and a condition estimate are also * provided. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Description * =========== * * In the following description, A denotes A(IA:IA+N-1,JA:JA+N-1), * B denotes B(IB:IB+N-1,JB:JB+NRHS-1) and X denotes * X(IX:IX+N-1,JX:JX+NRHS-1). * * The following steps are performed: * * 1. If FACT = 'E', real scaling factors are computed to equilibrate * the system: * TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B * TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B * TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') * or diag(C)*B (if TRANS = 'T' or 'C'). * * 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the * matrix A (after equilibration if FACT = 'E') as * A = P * L * U, * where P is a permutation matrix, L is a unit lower triangular * matrix, and U is upper triangular. * * 3. The factored form of A is used to estimate the condition number * of the matrix A. If the reciprocal of the condition number is * less than machine precision, steps 4-6 are skipped. * * 4. The system of equations is solved for X using the factored form * of A. * * 5. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 6. If FACT = 'E' and equilibration was used, the matrix X is * premultiplied by diag(C) (if TRANS = 'N') or diag(R) (if * TRANS = 'T' or 'C') so that it solves the original system * before equilibration. * * Arguments * ========= * * FACT (global input) CHARACTER * Specifies whether or not the factored form of the matrix * A(IA:IA+N-1,JA:JA+N-1) is supplied on entry, and if not, * whether the matrix A(IA:IA+N-1,JA:JA+N-1) should be * equilibrated before it is factored. * = 'F': On entry, AF(IAF:IAF+N-1,JAF:JAF+N-1) and IPIV con- * tain the factored form of A(IA:IA+N-1,JA:JA+N-1). * If EQUED is not 'N', the matrix * A(IA:IA+N-1,JA:JA+N-1) has been equilibrated with * scaling factors given by R and C. * A(IA:IA+N-1,JA:JA+N-1), AF(IAF:IAF+N-1,JAF:JAF+N-1), * and IPIV are not modified. * = 'N': The matrix A(IA:IA+N-1,JA:JA+N-1) will be copied to * AF(IAF:IAF+N-1,JAF:JAF+N-1) and factored. * = 'E': The matrix A(IA:IA+N-1,JA:JA+N-1) will be equili- * brated if necessary, then copied to * AF(IAF:IAF+N-1,JAF:JAF+N-1) and factored. * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': A(IA:IA+N-1,JA:JA+N-1) * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (No transpose) * = 'T': A(IA:IA+N-1,JA:JA+N-1)**T * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (Transpose) * = 'C': A(IA:IA+N-1,JA:JA+N-1)**H * X(IX:IX+N-1,JX:JX+NRHS-1) * = B(IB:IB+N-1,JB:JB+NRHS-1) (Conjugate transpose) * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * NRHS (global input) INTEGER * The number of right-hand sides, i.e., the number of columns * of the distributed submatrices B(IB:IB+N-1,JB:JB+NRHS-1) and * X(IX:IX+N-1,JX:JX+NRHS-1). NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * the local memory to an array of local dimension * (LLD_A,LOCc(JA+N-1)). On entry, the N-by-N matrix * A(IA:IA+N-1,JA:JA+N-1). If FACT = 'F' and EQUED is not 'N', * then A(IA:IA+N-1,JA:JA+N-1) must have been equilibrated by * the scaling factors in R and/or C. A(IA:IA+N-1,JA:JA+N-1) is * not modified if FACT = 'F' or 'N', or if FACT = 'E' and * EQUED = 'N' on exit. * * On exit, if EQUED .ne. 'N', A(IA:IA+N-1,JA:JA+N-1) is scaled * as follows: * EQUED = 'R': A(IA:IA+N-1,JA:JA+N-1) := * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * EQUED = 'C': A(IA:IA+N-1,JA:JA+N-1) := * A(IA:IA+N-1,JA:JA+N-1) * diag(C) * EQUED = 'B': A(IA:IA+N-1,JA:JA+N-1) := * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * diag(C). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input or local output) COMPLEX*16 pointer * into the local memory to an array of local dimension * (LLD_AF,LOCc(JA+N-1)). If FACT = 'F', then * AF(IAF:IAF+N-1,JAF:JAF+N-1) is an input argument and on * entry contains the factors L and U from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U as computed by PZGETRF. * If EQUED .ne. 'N', then AF is the factored form of the * equilibrated matrix A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'N', then AF(IAF:IAF+N-1,JAF:JAF+N-1) is an output * argument and on exit returns the factors L and U from the * factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the original * matrix A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'E', then AF(IAF:IAF+N-1,JAF:JAF+N-1) is an output * argument and on exit returns the factors L and U from the * factorization A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the equili- * brated matrix A(IA:IA+N-1,JA:JA+N-1) (see the description of * A(IA:IA+N-1,JA:JA+N-1) for the form of the equilibrated * matrix). * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * IPIV (local input or local output) INTEGER array, dimension * LOCr(M_A)+MB_A. If FACT = 'F', then IPIV is an input argu- * ment and on entry contains the pivot indices from the fac- * torization A(IA:IA+N-1,JA:JA+N-1) = P*L*U as computed by * PZGETRF; IPIV(i) -> The global row local row i was * swapped with. This array must be aligned with * A( IA:IA+N-1, * ). * * If FACT = 'N', then IPIV is an output argument and on exit * contains the pivot indices from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the original matrix * A(IA:IA+N-1,JA:JA+N-1). * * If FACT = 'E', then IPIV is an output argument and on exit * contains the pivot indices from the factorization * A(IA:IA+N-1,JA:JA+N-1) = P*L*U of the equilibrated matrix * A(IA:IA+N-1,JA:JA+N-1). * * EQUED (global input or global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration (always true if FACT = 'N'). * = 'R': Row equilibration, i.e., A(IA:IA+N-1,JA:JA+N-1) has * been premultiplied by diag(R). * = 'C': Column equilibration, i.e., A(IA:IA+N-1,JA:JA+N-1) * has been postmultiplied by diag(C). * = 'B': Both row and column equilibration, i.e., * A(IA:IA+N-1,JA:JA+N-1) has been replaced by * diag(R) * A(IA:IA+N-1,JA:JA+N-1) * diag(C). * EQUED is an input variable if FACT = 'F'; otherwise, it is an * output variable. * * R (local input or local output) DOUBLE PRECISION array, * dimension LOCr(M_A). * The row scale factors for A(IA:IA+N-1,JA:JA+N-1). * If EQUED = 'R' or 'B', A(IA:IA+N-1,JA:JA+N-1) is multiplied * on the left by diag(R); if EQUED='N' or 'C', R is not acces- * sed. R is an input variable if FACT = 'F'; otherwise, R is * an output variable. * If FACT = 'F' and EQUED = 'R' or 'B', each element of R must * be positive. * R is replicated in every process column, and is aligned * with the distributed matrix A. * * C (local input or local output) DOUBLE PRECISION array, * dimension LOCc(N_A). * The column scale factors for A(IA:IA+N-1,JA:JA+N-1). * If EQUED = 'C' or 'B', A(IA:IA+N-1,JA:JA+N-1) is multiplied * on the right by diag(C); if EQUED = 'N' or 'R', C is not * accessed. C is an input variable if FACT = 'F'; otherwise, * C is an output variable. If FACT = 'F' and EQUED = 'C' or * 'B', each element of C must be positive. * C is replicated in every process row, and is aligned with * the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer * into the local memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1) ). On entry, the N-by-NRHS right-hand * side matrix B(IB:IB+N-1,JB:JB+NRHS-1). On exit, if * EQUED = 'N', B(IB:IB+N-1,JB:JB+NRHS-1) is not modified; if * TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by * diag(R)*B(IB:IB+N-1,JB:JB+NRHS-1); if TRANS = 'T' or 'C' * and EQUED = 'C' or 'B', B(IB:IB+N-1,JB:JB+NRHS-1) is over- * written by diag(C)*B(IB:IB+N-1,JB:JB+NRHS-1). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input/local output) COMPLEX*16 pointer * into the local memory to an array of local dimension * (LLD_X, LOCc(JX+NRHS-1)). If INFO = 0, the N-by-NRHS * solution matrix X(IX:IX+N-1,JX:JX+NRHS-1) to the original * system of equations. Note that A(IA:IA+N-1,JA:JA+N-1) and * B(IB:IB+N-1,JB:JB+NRHS-1) are modified on exit if * EQUED .ne. 'N', and the solution to the equilibrated system * is inv(diag(C))*X(IX:IX+N-1,JX:JX+NRHS-1) if TRANS = 'N' * and EQUED = 'C' or 'B', or * inv(diag(R))*X(IX:IX+N-1,JX:JX+NRHS-1) if TRANS = 'T' or 'C' * and EQUED = 'R' or 'B'. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * RCOND (global output) DOUBLE PRECISION * The estimate of the reciprocal condition number of the matrix * A(IA:IA+N-1,JA:JA+N-1) after equilibration (if done). If * RCOND is less than the machine precision (in particular, if * RCOND = 0), the matrix is singular to working precision. * This condition is indicated by a return code of INFO > 0. * * FERR (local output) DOUBLE PRECISION array, dimension LOCc(N_B) * The estimated forward error bounds for each solution vector * X(j) (the j-th column of the solution matrix * X(IX:IX+N-1,JX:JX+NRHS-1). If XTRUE is the true solution, * FERR(j) bounds the magnitude of the largest entry in * (X(j) - XTRUE) divided by the magnitude of the largest entry * in X(j). The estimate is as reliable as the estimate for * RCOND, and is almost always a slight overestimate of the * true error. FERR is replicated in every process row, and is * aligned with the matrices B and X. * * BERR (local output) DOUBLE PRECISION array, dimension LOCc(N_B). * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any entry of A(IA:IA+N-1,JA:JA+N-1) or * B(IB:IB+N-1,JB:JB+NRHS-1) that makes X(j) an exact solution). * BERR is replicated in every process row, and is aligned * with the matrices B and X. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = MAX( PZGECON( LWORK ), PZGERFS( LWORK ) ) * + LOCr( N_A ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK = 2*LOCc(N_A). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, and i is * <= N: U(IA+I-1,IA+I-1) is exactly zero. The * factorization has been completed, but the * factor U is exactly singular, so the solution * and error bounds could not be computed. * = N+1: RCOND is less than machine precision. The * factorization has been completed, but the * matrix is singular to working precision, and * the solution and error bounds have not been * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLEQU, EQUIL, LQUERY, NOFACT, NOTRAN, ROWEQU CHARACTER NORM INTEGER CONWRK, I, IACOL, IAROW, IAFROW, IBROW, IBCOL, $ ICOFFA, ICOFFB, ICOFFX, ICTXT, IDUMM, $ IIA, IIB, IIX, $ INFEQU, IROFFA, IROFFAF, IROFFB, $ IROFFX, IXCOL, IXROW, J, JJA, JJB, JJX, $ LCM, LCMQ, $ LRWMIN, LWMIN, MYCOL, MYROW, NP, NPCOL, NPROW, $ NQ, NQB, NRHSQ, RFSWRK DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, $ ROWCND, SMLNUM * .. * .. Local Arrays .. INTEGER CDESC( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ DGEBR2D, DGEBS2D, DGAMN2D, $ DGAMX2D, INFOG2L, PDCOPY, PXERBLA, $ PZGECON, PZGEEQU, PZGERFS, $ PZGETRF, PZGETRS, PZLACPY, PZLAQGE * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC, PZLANGE, $ PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC DBLE, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(800+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 8, INFO ) IF( LSAME( FACT, 'F' ) ) $ CALL CHK1MAT( N, 3, N, 3, IAF, JAF, DESCAF, 12, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 20, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IX, JX, DESCX, 24, INFO ) NOFACT = LSAME( FACT, 'N' ) EQUIL = LSAME( FACT, 'E' ) NOTRAN = LSAME( TRANS, 'N' ) IF( NOFACT .OR. EQUIL ) THEN EQUED = 'N' ROWEQU = .FALSE. COLEQU = .FALSE. ELSE ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM END IF IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IIA, JJA, IAROW, IACOL ) NP = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFFA NQB = ICEIL( N+IROFFA, DESCA( NB_ )*NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL CONWRK = 2*NP + 2*NQ + MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQ + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) RFSWRK = 3*NP IF( LSAME( TRANS, 'N' ) ) THEN RFSWRK = RFSWRK + NP + NQ + $ ICEIL( NQB, LCMQ )*DESCA( NB_ ) ELSE IF( LSAME( TRANS, 'T' ).OR.LSAME( TRANS, 'C' ) ) THEN RFSWRK = RFSWRK + NP + NQ END IF LWMIN = MAX( CONWRK, RFSWRK ) LRWMIN = MAX( 2*NQ, NP ) RWORK( 1 ) = DBLE( LRWMIN ) IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. $ .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT. LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( ICOFFA.NE.0 .OR. IROFFA.NE.ICOFFA ) THEN INFO = -7 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(800+NB_) ELSE IF( IAFROW.NE.IAROW ) THEN INFO = -10 ELSE IF( IROFFAF.NE.0 ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -(1200+CTXT_) ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN INFO = -13 ELSE IF( ROWEQU ) THEN RCMIN = BIGNUM RCMAX = ZERO DO 10 J = IIA, IIA + NP - 1 RCMIN = MIN( RCMIN, R( J ) ) RCMAX = MAX( RCMAX, R( J ) ) 10 CONTINUE CALL DGAMN2D( ICTXT, 'Columnwise', ' ', 1, 1, RCMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, RCMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( RCMIN.LE.ZERO ) THEN INFO = -14 ELSE IF( N.GT.0 ) THEN ROWCND = MAX( RCMIN, SMLNUM ) / $ MIN( RCMAX, BIGNUM ) ELSE ROWCND = ONE END IF END IF IF( COLEQU .AND. INFO.EQ.0 ) THEN RCMIN = BIGNUM RCMAX = ZERO DO 20 J = JJA, JJA+NQ-1 RCMIN = MIN( RCMIN, C( J ) ) RCMAX = MAX( RCMAX, C( J ) ) 20 CONTINUE CALL DGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, RCMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, RCMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( RCMIN.LE.ZERO ) THEN INFO = -15 ELSE IF( N.GT.0 ) THEN COLCND = MAX( RCMIN, SMLNUM ) / $ MIN( RCMAX, BIGNUM ) ELSE COLCND = ONE END IF END IF END IF END IF * WORK( 1 ) = DBLE( LWMIN ) RWORK( 1 ) = DBLE( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IBROW.NE.IAROW ) THEN INFO = -18 ELSE IF( IXROW.NE.IBROW ) THEN INFO = -22 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2000+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(2000+CTXT_) ELSE IF( DESCX( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2400+NB_) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -(2400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -29 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -31 END IF IDUM1( 1 ) = ICHAR( FACT ) IDUM2( 1 ) = 1 IDUM1( 2 ) = ICHAR( TRANS ) IDUM2( 2 ) = 2 IF( LSAME( FACT, 'F' ) ) THEN IDUM1( 3 ) = ICHAR( EQUED ) IDUM2( 3 ) = 14 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 29 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 31 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, $ NRHS, 4, IB, JB, DESCB, 20, 5, IDUM1, $ IDUM2, INFO ) ELSE IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 29 IF( LRWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 31 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, $ NRHS, 4, IB, JB, DESCB, 20, 4, IDUM1, $ IDUM2, INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGESVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * IF( EQUIL ) THEN * * Compute row and column scalings to equilibrate the matrix A. * CALL PZGEEQU( N, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, INFEQU ) IF( INFEQU.EQ.0 ) THEN * * Equilibrate the matrix. * CALL PZLAQGE( N, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, EQUED ) ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) END IF END IF * * Scale the right-hand side. * CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, $ JJB, IBROW, IBCOL ) NP = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) IF( MYROW.EQ.IBROW ) $ NP = NP-IROFFB IF( MYCOL.EQ.IBCOL ) $ NRHSQ = NRHSQ-ICOFFB * IF( NOTRAN ) THEN IF( ROWEQU ) THEN DO 40 J = JJB, JJB+NRHSQ-1 DO 30 I = IIB, IIB+NP-1 B( I+( J-1 )*DESCB( LLD_ ) ) = R( I )* $ B( I+( J-1 )*DESCB( LLD_ ) ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( COLEQU ) THEN * * Transpose the Column scale factors * CALL DESCSET( CDESC, 1, N+ICOFFA, 1, DESCA( NB_ ), MYROW, $ IACOL, ICTXT, 1 ) CALL PDCOPY( N, C, 1, JA, CDESC, CDESC( LLD_ ), RWORK, IB, JB, $ DESCB, 1 ) IF( MYCOL.EQ.IBCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, RWORK( IIB ), $ DESCB( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, RWORK( IIB ), $ DESCB( LLD_ ), MYROW, IBCOL ) END IF DO 60 J = JJB, JJB+NRHSQ-1 DO 50 I = IIB, IIB+NP-1 B( I+( J-1 )*DESCB( LLD_ ) ) = RWORK( I )* $ B( I+( J-1 )*DESCB( LLD_ ) ) 50 CONTINUE 60 CONTINUE END IF * IF( NOFACT.OR.EQUIL ) THEN * * Compute the LU factorization of A. * CALL PZLACPY( 'Full', N, N, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF ) CALL PZGETRF( N, N, AF, IAF, JAF, DESCAF, IPIV, INFO ) * * Return if INFO is non-zero. * IF( INFO.NE.0 ) THEN IF( INFO.GT.0 ) $ RCOND = ZERO RETURN END IF END IF * * Compute the norm of the matrix A. * IF( NOTRAN ) THEN NORM = '1' ELSE NORM = 'I' END IF ANORM = PZLANGE( NORM, N, N, A, IA, JA, DESCA, RWORK ) * * Compute the reciprocal of the condition number of A. * CALL PZGECON( NORM, N, AF, IAF, JAF, DESCAF, ANORM, RCOND, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * Return if the matrix is singular to working precision. * IF( RCOND.LT.PDLAMCH( ICTXT, 'Epsilon' ) ) THEN INFO = IA + N RETURN END IF * * Compute the solution matrix X. * CALL PZLACPY( 'Full', N, NRHS, B, IB, JB, DESCB, X, IX, JX, $ DESCX ) CALL PZGETRS( TRANS, N, NRHS, AF, IAF, JAF, DESCAF, IPIV, X, IX, $ JX, DESCX, INFO ) * * Use iterative refinement to improve the computed solution and * compute error bounds and backward error estimates for it. * CALL PZGERFS( TRANS, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, IPIV, B, IB, JB, DESCB, X, IX, JX, DESCX, $ FERR, BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * Transform the solution matrix X to a solution of the original * system. * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) NP = NUMROC( N+IROFFX, DESCX( MB_ ), MYROW, IXROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFFX, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYROW.EQ.IBROW ) $ NP = NP-IROFFX IF( MYCOL.EQ.IBCOL ) $ NRHSQ = NRHSQ-ICOFFX * IF( NOTRAN ) THEN IF( COLEQU ) THEN * * Transpose the column scaling factors * CALL DESCSET( CDESC, 1, N+ICOFFA, 1, DESCA( NB_ ), MYROW, $ IACOL, ICTXT, 1 ) CALL PDCOPY( N, C, 1, JA, CDESC, CDESC( LLD_ ), RWORK, IX, $ JX, DESCX, 1 ) IF( MYCOL.EQ.IBCOL ) THEN CALL DGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, $ RWORK( IIX ), DESCX( LLD_ ) ) ELSE CALL DGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, $ RWORK( IIX ), DESCX( LLD_ ), MYROW, $ IBCOL ) END IF * DO 80 J = JJX, JJX+NRHSQ-1 DO 70 I = IIX, IIX+NP-1 X( I+( J-1 )*DESCX( LLD_ ) ) = RWORK( I )* $ X( I+( J-1 )*DESCX( LLD_ ) ) 70 CONTINUE 80 CONTINUE DO 90 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / COLCND 90 CONTINUE END IF ELSE IF( ROWEQU ) THEN DO 110 J = JJX, JJX+NRHSQ-1 DO 100 I = IIX, IIX+NP-1 X( I+( J-1 )*DESCX( LLD_ ) ) = R( I )* $ X( I+( J-1 )*DESCX( LLD_ ) ) 100 CONTINUE 110 CONTINUE DO 120 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / ROWCND 120 CONTINUE END IF * WORK( 1 ) = DBLE( LWMIN ) RWORK( 1 ) = DBLE( LRWMIN ) * RETURN * * End of PZGESVX * END scalapack-2.0.2/SRC/pzgetf2.f000644 000766 000024 00000022644 10363532303 016076 0ustar00juliestaff000000 000000 SUBROUTINE PZGETF2( M, N, A, IA, JA, DESCA, IPIV, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZGETF2 computes an LU factorization of a general M-by-N * distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) using * partial pivoting with row interchanges. * * The factorization has the form sub( A ) = P * L * U, where P is a * permutation matrix, L is lower triangular with unit diagonal * elements (lower trapezoidal if m > n), and U is upper triangular * (upper trapezoidal if m < n). * * This is the right-looking Parallel Level 2 BLAS version of the * algorithm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires N <= NB_A-MOD(JA-1, NB_A) and square block * decomposition ( MB_A = NB_A ). * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). * NB_A-MOD(JA-1, NB_A) >= N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ). On exit, this array contains * the local pieces of the factors L and U from the factoriza- * tion sub( A ) = P*L*U; the unit diagonal elements of L are * not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, and division by zero will occur if * it is used to solve a system of equations. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER ROWBTOP INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IIA, IROFF, J, $ JJA, MN, MYCOL, MYROW, NPCOL, NPROW COMPLEX*16 GMAX * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, IGEBR2D, $ IGEBS2D, INFOG2L, PB_TOPGET, PXERBLA, PZAMAX, $ PZGERU, PZSCAL, PZSWAP * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( N+ICOFF.GT.DESCA( NB_ ) ) THEN INFO = -2 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGETF2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * MN = MIN( M, N ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) * IF( MYCOL.EQ.IACOL ) THEN DO 10 J = JA, JA+MN-1 I = IA + J - JA * * Find pivot and test for singularity. * CALL PZAMAX( M-J+JA, GMAX, IPIV( IIA+J-JA ), A, I, J, $ DESCA, 1 ) IF( GMAX.NE.ZERO ) THEN * * Apply the row interchanges to columns JA:JA+N-1 * CALL PZSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, $ IPIV( IIA+J-JA ), JA, DESCA, DESCA( M_ ) ) * * Compute elements I+1:IA+M-1 of J-th column. * IF( J-JA+1.LT.M ) $ CALL PZSCAL( M-J+JA-1, ONE / GMAX, A, I+1, J, $ DESCA, 1 ) ELSE IF( INFO.EQ.0 ) THEN INFO = J - JA + 1 END IF * * Update trailing submatrix * IF( J-JA+1.LT.MN ) THEN CALL PZGERU( M-J+JA-1, N-J+JA-1, -ONE, A, I+1, J, DESCA, $ 1, A, I, J+1, DESCA, DESCA( M_ ), A, I+1, $ J+1, DESCA ) END IF 10 CONTINUE * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, MN, 1, IPIV( IIA ), $ MN ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, MN, 1, IPIV( IIA ), $ MN, MYROW, IACOL ) * END IF * RETURN * * End of PZGETF2 * END scalapack-2.0.2/SRC/pzgetrf.f000644 000766 000024 00000026451 10363532303 016176 0ustar00juliestaff000000 000000 SUBROUTINE PZGETRF( M, N, A, IA, JA, DESCA, IPIV, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZGETRF computes an LU factorization of a general M-by-N distributed * matrix sub( A ) = (IA:IA+M-1,JA:JA+N-1) using partial pivoting with * row interchanges. * * The factorization has the form sub( A ) = P * L * U, where P is a * permutation matrix, L is lower triangular with unit diagonal ele- * ments (lower trapezoidal if m > n), and U is upper triangular * (upper trapezoidal if m < n). L and U are stored in sub( A ). * * This is the right-looking Parallel Level 3 BLAS version of the * algorithm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the M-by-N * distributed matrix sub( A ) to be factored. On exit, this * array contains the local pieces of the factors L and U from * the factorization sub( A ) = P*L*U; the unit diagonal ele- * ments of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local output) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,JA+K-1) is exactly zero. * The factorization has been completed, but the factor U * is exactly singular, and division by zero will occur if * it is used to solve a system of equations. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. CHARACTER COLBTOP, COLCTOP, ROWBTOP INTEGER I, ICOFF, ICTXT, IINFO, IN, IROFF, J, JB, JN, $ MN, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMN2D, PCHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZGEMM, PZGETF2, $ PZLASWP, PZTRSM * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 0, IDUM1, $ IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGETRF', -INFO ) RETURN END IF * * Quick return if possible * IF( DESCA( M_ ).EQ.1 ) THEN IPIV( 1 ) = 1 RETURN ELSE IF( M.EQ.0 .OR. N.EQ.0 ) THEN RETURN END IF * * Split-ring topology for the communication along process rows * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', ' ' ) * * Handle the first block of columns separately * MN = MIN( M, N ) IN = MIN( ICEIL( IA, DESCA( MB_ ) )*DESCA( MB_ ), IA+M-1 ) JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+MN-1 ) JB = JN - JA + 1 * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL PZGETF2( M, JB, A, IA, JA, DESCA, IPIV, INFO ) * IF( JB+1.LE.N ) THEN * * Apply interchanges to columns JN+1:JA+N-1. * CALL PZLASWP( 'Forward', 'Rows', N-JB, A, IA, JN+1, DESCA, $ IA, IN, IPIV ) * * Compute block row of U. * CALL PZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-JB, ONE, A, IA, JA, DESCA, A, IA, JN+1, DESCA ) * IF( JB+1.LE.M ) THEN * * Update trailing submatrix. * CALL PZGEMM( 'No transpose', 'No transpose', M-JB, N-JB, JB, $ -ONE, A, IN+1, JA, DESCA, A, IA, JN+1, DESCA, $ ONE, A, IN+1, JN+1, DESCA ) * END IF END IF * * Loop over the remaining blocks of columns. * DO 10 J = JN+1, JA+MN-1, DESCA( NB_ ) JB = MIN( MN-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL PZGETF2( M-J+JA, JB, A, I, J, DESCA, IPIV, IINFO ) * IF( INFO.EQ.0 .AND. IINFO.GT.0 ) $ INFO = IINFO + J - JA * * Apply interchanges to columns JA:J-JA. * CALL PZLASWP( 'Forward', 'Rowwise', J-JA, A, IA, JA, DESCA, $ I, I+JB-1, IPIV ) * IF( J-JA+JB+1.LE.N ) THEN * * Apply interchanges to columns J+JB:JA+N-1. * CALL PZLASWP( 'Forward', 'Rowwise', N-J-JB+JA, A, IA, J+JB, $ DESCA, I, I+JB-1, IPIV ) * * Compute block row of U. * CALL PZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-J-JB+JA, ONE, A, I, J, DESCA, A, I, J+JB, $ DESCA ) * IF( J-JA+JB+1.LE.M ) THEN * * Update trailing submatrix. * CALL PZGEMM( 'No transpose', 'No transpose', M-J-JB+JA, $ N-J-JB+JA, JB, -ONE, A, I+JB, J, DESCA, A, $ I, J+JB, DESCA, ONE, A, I+JB, J+JB, DESCA ) * END IF END IF * 10 CONTINUE * IF( INFO.EQ.0 ) $ INFO = MN + 1 CALL IGAMN2D( ICTXT, 'Rowwise', ' ', 1, 1, INFO, 1, IDUM1, IDUM2, $ -1, -1, MYCOL ) IF( INFO.EQ.MN+1 ) $ INFO = 0 * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * RETURN * * End of PZGETRF * END scalapack-2.0.2/SRC/pzgetri.f000644 000766 000024 00000035201 10430435051 016167 0ustar00juliestaff000000 000000 SUBROUTINE PZGETRI( N, A, IA, JA, DESCA, IPIV, WORK, LWORK, $ IWORK, LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7.4) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * v1.7.4: May 10, 2006 * v1.7: May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LIWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ), IWORK( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZGETRI computes the inverse of a distributed matrix using the LU * factorization computed by PZGETRF. This method inverts U and then * computes the inverse of sub( A ) = A(IA:IA+N-1,JA:JA+N-1) denoted * InvA by solving the system InvA*L = inv(U) for InvA. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the L and U obtained by the * factorization sub( A ) = P*L*U computed by PZGETRF. On * exit, if INFO = 0, sub( A ) contains the inverse of the * original distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension LOCr(M_A)+MB_A * keeps track of the pivoting information. IPIV(i) is the * global row index the local row i was swapped with. This * array is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = LOCr(N+MOD(IA-1,MB_A))*NB_A. WORK is used to keep a * copy of at most an entire column block of sub( A ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/local output) INTEGER array, * dimension (LIWORK) * On exit, IWORK(1) returns the minimal and optimal LIWORK. * * LIWORK (local or global input) INTEGER * The dimension of the array IWORK used as workspace for * physically transposing the pivots. * LIWORK is local input and must be at least * if NPROW == NPCOL then * LIWORK = LOCc( N_A + MOD(JA-1, NB_A) ) + NB_A, * else * LIWORK = LOCc( N_A + MOD(JA-1, NB_A) ) + * MAX( CEIL(CEIL(LOCr(M_A)/MB_A)/(LCM/NPROW)), * NB_A ) * where LCM is the least common multiple of process * rows and columns (NPROW and NPCOL). * end if * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, U(IA+K-1,IA+K-1) is exactly zero; the * matrix is singular and its inverse could not be * computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IROFF, IW, J, $ JB, JN, LCM, LIWMIN, LWMIN, MP, MYCOL, MYROW, $ NN, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PZGEMM, PZLACPY, PZLASET, PZLAPIV, $ PZTRSM, PZTRTRI, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(500+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) LWMIN = NP * DESCA( NB_ ) * MP = NUMROC( DESCA( M_ ), DESCA( MB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) NQ = NUMROC( DESCA( N_ ), DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) IF( NPROW.EQ.NPCOL ) THEN LIWMIN = NQ + DESCA( NB_ ) ELSE * * Use the formula for the workspace given in PxLAPIV * to compute the minimum size LIWORK for IWORK * * The formula in PxLAPIV is * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + * MB_P * CEIL( CEIL(LOCr(M_P)/MB_P) / (LCM/NPROW) ) * * where * M_P is the global length of the pivot vector * MP = DESCA( M_ ) + DESCA( MB_ ) * NPROW * I_P is IA * I_P = IA * MB_P is the block size use for the block cyclic distribution of the * pivot vector * MB_P = DESCA (MB_ ) * LOCc ( . ) * NUMROC ( . , DESCA ( NB_ ), MYCOL, DESCA ( CSRC_ ), NPCOL ) * LOCr ( . ) * NUMROC ( . , DESCA ( MB_ ), MYROW, DESCA ( RSRC_ ), NPROW ) * CEIL ( X / Y ) * ICEIL( X, Y ) * LCM * LCM = ILCM( NPROW, NPCOL ) * LCM = ILCM( NPROW, NPCOL ) LIWMIN = NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW $ + MOD ( IA - 1, DESCA( MB_ ) ), DESCA ( NB_ ), $ MYCOL, DESCA( CSRC_ ), NPCOL ) + $ MAX ( DESCA( MB_ ) * ICEIL ( ICEIL( $ NUMROC( DESCA( M_ ) + DESCA( MB_ ) * NPROW, $ DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ), $ DESCA( MB_ ) ), LCM / NPROW ), DESCA( NB_ ) ) * END IF * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -4 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(500+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -8 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 8 IF( LIWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGETRI', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Form inv(U). If INFO > 0 from PZTRTRI, then U is singular, * and the inverse is not computed. * CALL PZTRTRI( 'Upper', 'Non-unit', N, A, IA, JA, DESCA, INFO ) IF( INFO.GT.0 ) $ RETURN * * Define array descriptor for working array WORK * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) NN = ( ( JA+N-2 ) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1 IACOL = INDXG2P( NN, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) CALL DESCSET( DESCW, N+IROFF, DESCA( NB_ ), DESCA( MB_ ), $ DESCA( NB_ ), IAROW, IACOL, ICTXT, MAX( 1, NP ) ) IW = IROFF + 1 * * Solve the equation inv(A)*L=inv(U) for inv(A) using blocked code. * DO 10 J = NN, JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * * Copy current block column of L to WORK and replace with zeros. * CALL PZLACPY( 'Lower', JA+N-1-J, JB, A, I+1, J, DESCA, $ WORK, IW+J-JA+1, 1, DESCW ) CALL PZLASET( 'Lower', JA+N-1-J, JB, ZERO, ZERO, A, I+1, J, $ DESCA ) * * Compute current block column of inv(A). * IF( J+JB.LE.JA+N-1 ) $ CALL PZGEMM( 'No transpose', 'No transpose', N, JB, $ JA+N-J-JB, -ONE, A, IA, J+JB, DESCA, WORK, $ IW+J+JB-JA, 1, DESCW, ONE, A, IA, J, DESCA ) CALL PZTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, $ ONE, WORK, IW+J-JA, 1, DESCW, A, IA, J, DESCA ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Handle the last block of columns separately * JB = JN-JA+1 * * Copy current block column of L to WORK and replace with zeros. * CALL PZLACPY( 'Lower', N-1, JB, A, IA+1, JA, DESCA, WORK, IW+1, $ 1, DESCW ) CALL PZLASET( 'Lower', N-1, JB, ZERO, ZERO, A, IA+1, JA, DESCA ) * * Compute current block column of inv(A). * IF( JA+JB.LE.JA+N-1 ) $ CALL PZGEMM( 'No transpose', 'No transpose', N, JB, $ N-JB, -ONE, A, IA, JA+JB, DESCA, WORK, IW+JB, 1, $ DESCW, ONE, A, IA, JA, DESCA ) CALL PZTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, $ ONE, WORK, IW, 1, DESCW, A, IA, JA, DESCA ) * * Use the row pivots and apply them to the columns of the global * matrix. * CALL DESCSET( DESCW, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ MP+DESCA( MB_ ) ) CALL PZLAPIV( 'Backward', 'Columns', 'Column', N, N, A, IA, $ JA, DESCA, IPIV, IA, 1, DESCW, IWORK ) * WORK( 1 ) = DBLE( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PZGETRI * END scalapack-2.0.2/SRC/pzgetrs.f000644 000766 000024 00000026110 10363532303 016203 0ustar00juliestaff000000 000000 SUBROUTINE PZGETRS( TRANS, N, NRHS, A, IA, JA, DESCA, IPIV, B, $ IB, JB, DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), IPIV( * ) COMPLEX*16 A( * ), B( * ) * .. * * Purpose * ======= * * PZGETRS solves a system of distributed linear equations * * op( sub( A ) ) * X = sub( B ) * * with a general N-by-N distributed matrix sub( A ) using the LU * factorization computed by PZGETRF. * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), op( A ) = A, A**T or A**H * and sub( B ) denotes B(IB:IB+N-1,JB:JB+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block data decomposition ( MB_A=NB_A ). * * Arguments * ========= * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': sub( A ) * X = sub( B ) (No transpose) * = 'T': sub( A )**T * X = sub( B ) (Transpose) * = 'C': sub( A )**H * X = sub( B ) (Conjugate transpose) * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) COMPLEX*16 pointer into the local * memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the factors * L and U from the factorization sub( A ) = P*L*U; the unit * diagonal elements of L are not stored. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension ( LOCr(M_A)+MB_A ) * This array contains the pivoting information. * IPIV(i) -> The global row local row i was swapped with. * This array is tied to the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, the right hand sides * sub( B ). On exit, sub( B ) is overwritten by the solution * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCIP( DLEN_ ), IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ PXERBLA, PZLAPIV, PZTRSM * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE NOTRAN = LSAME( TRANS, 'N' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1200+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(1200+CTXT_) END IF END IF IF( NOTRAN ) THEN IDUM1( 1 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 1 ) = ICHAR( 'T' ) ELSE IDUM1( 1 ) = ICHAR( 'C' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, 3, $ IB, JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGETRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * CALL DESCSET( DESCIP, DESCA( M_ ) + DESCA( MB_ )*NPROW, 1, $ DESCA( MB_ ), 1, DESCA( RSRC_ ), MYCOL, ICTXT, $ DESCA( MB_ ) + NUMROC( DESCA( M_ ), DESCA( MB_ ), $ MYROW, DESCA( RSRC_ ), NPROW ) ) * IF( NOTRAN ) THEN * * Solve sub( A ) * X = sub( B ). * * Apply row interchanges to the right hand sides. * CALL PZLAPIV( 'Forward', 'Row', 'Col', N, NRHS, B, IB, JB, $ DESCB, IPIV, IA, 1, DESCIP, IDUM1 ) * * Solve L*X = sub( B ), overwriting sub( B ) with X. * CALL PZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve U*X = sub( B ), overwriting sub( B ) with X. * CALL PZTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) ELSE * * Solve sub( A )' * X = sub( B ). * * Solve U'*X = sub( B ), overwriting sub( B ) with X. * CALL PZTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, NRHS, $ ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve L'*X = sub( B ), overwriting sub( B ) with X. * CALL PZTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, ONE, $ A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Apply row interchanges to the solution vectors. * CALL PZLAPIV( 'Backward', 'Row', 'Col', N, NRHS, B, IB, JB, $ DESCB, IPIV, IA, 1, DESCIP, IDUM1 ) * END IF * RETURN * * End of PZGETRS * END scalapack-2.0.2/SRC/pzggqrf.f000644 000766 000024 00000036330 10363532303 016172 0ustar00juliestaff000000 000000 SUBROUTINE PZGGQRF( N, M, P, A, IA, JA, DESCA, TAUA, B, IB, JB, $ DESCB, TAUB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, P * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ), TAUA( * ), TAUB( * ), WORK( * ) * .. * * Purpose * ======= * * PZGGQRF computes a generalized QR factorization of * an N-by-M matrix sub( A ) = A(IA:IA+N-1,JA:JA+M-1) and * an N-by-P matrix sub( B ) = B(IB:IB+N-1,JB:JB+P-1): * * sub( A ) = Q*R, sub( B ) = Q*T*Z, * * where Q is an N-by-N unitary matrix, Z is a P-by-P unitary matrix, * and R and T assume one of the forms: * * if N >= M, R = ( R11 ) M , or if N < M, R = ( R11 R12 ) N, * ( 0 ) N-M N M-N * M * * where R11 is upper triangular, and * * if N <= P, T = ( 0 T12 ) N, or if N > P, T = ( T11 ) N-P, * P-N N ( T21 ) P * P * * where T12 or T21 is upper triangular. * * In particular, if sub( B ) is square and nonsingular, the GQR * factorization of sub( A ) and sub( B ) implicitly gives the QR * factorization of inv( sub( B ) )* sub( A ): * * inv( sub( B ) )*sub( A )= Z'*(inv(T)*R) * * where inv( sub( B ) ) denotes the inverse of the matrix sub( B ), * and Z' denotes the conjugate transpose of matrix Z. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrices sub( A ) and sub( B ). N >= 0. * * M (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). M >= 0. * * P (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( B ). P >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+M-1)). * On entry, the local pieces of the N-by-M distributed matrix * sub( A ) which is to be factored. On exit, the elements on * and above the diagonal of sub( A ) contain the min(N,M) by M * upper trapezoidal matrix R (R is upper triangular if N >= M); * the elements below the diagonal, with the array TAUA, * represent the unitary matrix Q as a product of min(N,M) * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAUA (local output) COMPLEX*16, array, dimension * LOCc(JA+MIN(N,M)-1). This array contains the scalar factors * TAUA of the elementary reflectors which represent the unitary * matrix Q. TAUA is tied to the distributed matrix A. (see * Further Details). * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+P-1)). * On entry, the local pieces of the N-by-P distributed matrix * sub( B ) which is to be factored. On exit, if N <= P, the * upper triangle of B(IB:IB+N-1,JB+P-N:JB+P-1) contains the * N by N upper triangular matrix T; if N > P, the elements on * and above the (N-P)-th subdiagonal contain the N by P upper * trapezoidal matrix T; the remaining elements, with the array * TAUB, represent the unitary matrix Z as a product of * elementary reflectors (see Further Details). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * TAUB (local output) COMPLEX*16, array, dimension LOCr(IB+N-1) * This array contains the scalar factors of the elementary * reflectors which represent the unitary matrix Z. TAUB is * tied to the distributed matrix B (see Further Details). * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB_A * ( NpA0 + MqA0 + NB_A ), * MAX( (NB_A*(NB_A-1))/2, (PqB0 + NpB0)*NB_A ) + * NB_A * NB_A, * MB_B * ( NpB0 + PqB0 + MB_B ) ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * NpB0 = NUMROC( N+IROFFB, MB_B, MYROW, IBROW, NPROW ), * PqB0 = NUMROC( P+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ja) H(ja+1) . . . H(ja+k-1), where k = min(n,m). * * Each H(i) has the form * * H(i) = I - taua * v * v' * * where taua is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in * A(ia+i:ia+n-1,ja+i-1), and taua in TAUA(ja+i-1). * To form Q explicitly, use ScaLAPACK subroutine PZUNGQR. * To use Q to update another matrix, use ScaLAPACK subroutine PZUNMQR. * * The matrix Z is represented as a product of elementary reflectors * * Z = H(ib)' H(ib+1)' . . . H(ib+k-1)', where k = min(n,p). * * Each H(i) has the form * * H(i) = I - taub * v * v' * * where taub is a complex scalar, and v is a complex vector with * v(p-k+i+1:p) = 0 and v(p-k+i) = 1; conjg(v(1:p-k+i-1)) is stored on * exit in B(ib+n-k+i-1,jb:jb+p-k+i-2), and taub in TAUB(ib+n-k+i-1). * To form Z explicitly, use ScaLAPACK subroutine PZUNGRQ. * To use Z to update another matrix, use ScaLAPACK subroutine PZUNMRQ. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ) and sub( B ) must verify some * alignment properties, namely the following expression should be true: * * ( MB_A.EQ.MB_B .AND. IROFFA.EQ.IROFFB .AND. IAROW.EQ.IBROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, LWMIN, MQA0, MYCOL, $ MYROW, NPA0, NPB0, NPCOL, NPROW, PQB0 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PXERBLA, $ PZGEQRF, PZGERQF, PZUNMQR * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, INT, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -707 ELSE CALL CHK1MAT( N, 1, M, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 1, P, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) NPB0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) PQB0 = NUMROC( P+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) LWMIN = MAX( DESCA( NB_ ) * ( NPA0 + MQA0 + DESCA( NB_ ) ), $ MAX( MAX( ( DESCA( NB_ )*( DESCA( NB_ ) - 1 ) ) / 2, $ ( PQB0 + NPB0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ), $ DESCB( MB_ ) * ( NPB0 + PQB0 + DESCB( MB_ ) ) ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IAROW.NE.IBROW .OR. IROFFA.NE.IROFFB ) THEN INFO = -10 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -1203 ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -1207 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 15 CALL PCHK2MAT( N, 1, M, 2, IA, JA, DESCA, 7, N, 1, P, 3, IB, $ JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGGQRF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * QR factorization of N-by-M matrix sub( A ): sub( A ) = Q*R * CALL PZGEQRF( N, M, A, IA, JA, DESCA, TAUA, WORK, LWORK, INFO ) LWMIN = INT( WORK( 1 ) ) * * Update sub( B ) := Q'*sub( B ). * CALL PZUNMQR( 'Left', 'Conjugate Transpose', N, P, MIN( N, M ), A, $ IA, JA, DESCA, TAUA, B, IB, JB, DESCB, WORK, LWORK, $ INFO ) LWMIN = MIN( LWMIN, INT( WORK( 1 ) ) ) * * RQ factorization of N-by-P matrix sub( B ): sub( B ) = T*Z. * CALL PZGERQF( N, P, B, IB, JB, DESCB, TAUB, WORK, LWORK, INFO ) WORK( 1 ) = DCMPLX( DBLE( MAX( LWMIN, INT( WORK( 1 ) ) ) ) ) * RETURN * * End of PZGGQRF * END scalapack-2.0.2/SRC/pzggrqf.f000644 000766 000024 00000036336 10363532303 016200 0ustar00juliestaff000000 000000 SUBROUTINE PZGGRQF( M, P, N, A, IA, JA, DESCA, TAUA, B, IB, JB, $ DESCB, TAUB, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IB, INFO, JA, JB, LWORK, M, N, P * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ), TAUA( * ), TAUB( * ), WORK( * ) * .. * * Purpose * ======= * * PZGGRQF computes a generalized RQ factorization of * an M-by-N matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) * and a P-by-N matrix sub( B ) = B(IB:IB+P-1,JB:JB+N-1): * * sub( A ) = R*Q, sub( B ) = Z*T*Q, * * where Q is an N-by-N unitary matrix, Z is a P-by-P unitary matrix, * and R and T assume one of the forms: * * if M <= N, R = ( 0 R12 ) M, or if M > N, R = ( R11 ) M-N, * N-M M ( R21 ) N * N * * where R12 or R21 is upper triangular, and * * if P >= N, T = ( T11 ) N , or if P < N, T = ( T11 T12 ) P, * ( 0 ) P-N P N-P * N * * where T11 is upper triangular. * * In particular, if sub( B ) is square and nonsingular, the GRQ * factorization of sub( A ) and sub( B ) implicitly gives the RQ * factorization of sub( A )*inv( sub( B ) ): * * sub( A )*inv( sub( B ) ) = (R*inv(T))*Z' * * where inv( sub( B ) ) denotes the inverse of the matrix sub( B ), * and Z' denotes the conjugate transpose of matrix Z. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of * rows of the distributed submatrix sub( A ). M >= 0. * * P (global input) INTEGER * The number of rows to be operated on i.e the number of * rows of the distributed submatrix sub( B ). P >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrices sub( A ) and sub( B ). * N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, if M <= N, the * upper triangle of A( IA:IA+M-1, JA+N-M:JA+N-1 ) contains the * M by M upper triangular matrix R; if M >= N, the elements on * and above the (M-N)-th subdiagonal contain the M by N upper * trapezoidal matrix R; the remaining elements, with the array * TAUA, represent the unitary matrix Q as a product of * elementary reflectors (see Further Details). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAUA (local output) COMPLEX*16, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUA is * tied to the distributed matrix A (see Further Details). * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+N-1)). * On entry, the local pieces of the P-by-N distributed matrix * sub( B ) which is to be factored. On exit, the elements on * and above the diagonal of sub( B ) contain the min(P,N) by N * upper trapezoidal matrix T (T is upper triangular if P >= N); * the elements below the diagonal, with the array TAUB, * represent the unitary matrix Z as a product of elementary * reflectors (see Further Details). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * TAUB (local output) COMPLEX*16, array, dimension * LOCc(JB+MIN(P,N)-1). This array contains the scalar factors * TAUB of the elementary reflectors which represent the unitary * matrix Z. TAUB is tied to the distributed matrix B (see * Further Details). * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( MB_A * ( MpA0 + NqA0 + MB_A ), * MAX( (MB_A*(MB_A-1))/2, (PpB0 + NqB0)*MB_A ) + * MB_A * MB_A, * NB_B * ( PpB0 + NqB0 + NB_B ) ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFB = MOD( IB-1, MB_B ), ICOFFB = MOD( JB-1, NB_B ), * IBROW = INDXG2P( IB, MB_B, MYROW, RSRC_B, NPROW ), * IBCOL = INDXG2P( JB, NB_B, MYCOL, CSRC_B, NPCOL ), * PpB0 = NUMROC( P+IROFFB, MB_B, MYROW, IBROW, NPROW ), * NqB0 = NUMROC( N+ICOFFB, NB_B, MYCOL, IBCOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(ia)' H(ia+1)' . . . H(ia+k-1)', where k = min(m,n). * * Each H(i) has the form * * H(i) = I - taua * v * v' * * where taua is a complex scalar, and v is a complex vector with * v(n-k+i+1:n) = 0 and v(n-k+i) = 1; conjg(v(1:n-k+i-1)) is stored on * exit in A(ia+m-k+i-1,ja:ja+n-k+i-2), and taua in TAUA(ia+m-k+i-1). * To form Q explicitly, use ScaLAPACK subroutine PZUNGRQ. * To use Q to update another matrix, use ScaLAPACK subroutine PZUNMRQ. * * The matrix Z is represented as a product of elementary reflectors * * Z = H(jb) H(jb+1) . . . H(jb+k-1), where k = min(p,n). * * Each H(i) has the form * * H(i) = I - taub * v * v' * * where taub is a complex scalar, and v is a complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:p) is stored on exit in * B(ib+i:ib+p-1,jb+i-1), and taub in TAUB(jb+i-1). * To form Z explicitly, use ScaLAPACK subroutine PZUNGQR. * To use Z to update another matrix, use ScaLAPACK subroutine PZUNMQR. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ) and sub( B ) must verify some * alignment properties, namely the following expression should be true: * * ( NB_A.EQ.NB_B .AND. ICOFFA.EQ.ICOFFB .AND. IACOL.EQ.IBCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. Local Scalars .. LOGICAL LQUERY INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, LWMIN, MPA0, MYCOL, $ MYROW, NPCOL, NPROW, NQA0, NQB0, PPB0 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PXERBLA, $ PZGEQRF, PZGERQF, PZUNMRQ * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Functions .. INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, INT, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -707 ELSE CALL CHK1MAT( M, 1, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( P, 2, N, 3, IB, JB, DESCB, 12, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) PPB0 = NUMROC( P+IROFFB, DESCB( MB_ ), MYROW, IBROW, NPROW ) NQB0 = NUMROC( N+ICOFFB, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) LWMIN = MAX( DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ), $ MAX( MAX( ( DESCA( MB_ )*( DESCA( MB_ ) - 1 ) ) / 2, $ ( PPB0 + NQB0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ), $ DESCB( NB_ ) * ( PPB0 + NQB0 + DESCB( NB_ ) ) ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IACOL.NE.IBCOL .OR. ICOFFA.NE.ICOFFB ) THEN INFO = -11 ELSE IF( DESCA( NB_ ).NE.DESCB( NB_ ) ) THEN INFO = -1204 ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -1207 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF END IF IF( LWORK.EQ.-1 ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 15 CALL PCHK2MAT( M, 1, N, 3, IA, JA, DESCA, 7, P, 2, N, 3, IB, $ JB, DESCB, 12, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZGGRQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * RQ factorization of M-by-N matrix sub( A ): sub( A ) = R*Q * CALL PZGERQF( M, N, A, IA, JA, DESCA, TAUA, WORK, LWORK, INFO ) LWMIN = INT( WORK( 1 ) ) * * Update sub( B ) := sub( B )*Q' * CALL PZUNMRQ( 'Right', 'Conjugate Transpose', P, N, MIN( M, N ), $ A, MAX( IA, IA+M-N ), JA, DESCA, TAUA, B, IB, JB, $ DESCB, WORK, LWORK, INFO ) LWMIN = MAX( LWMIN, INT( WORK( 1 ) ) ) * * QR factorization of P-by-N matrix sub( B ): sub( B ) = Z*T * CALL PZGEQRF( P, N, B, IB, JB, DESCB, TAUB, WORK, LWORK, INFO ) WORK( 1 ) = DCMPLX( DBLE( MAX( LWMIN, INT( WORK( 1 ) ) ) ) ) * RETURN * * End of PZGGRQF * END scalapack-2.0.2/SRC/pzheev.f000644 000766 000024 00000055630 11657236177 016040 0ustar00juliestaff000000 000000 SUBROUTINE PZHEEV( JOBZ, UPLO, N, A, IA, JA, DESCA, W, Z, IZ, JZ, $ DESCZ, WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 14, 2001 * * .. Scalar Arguments .. CHARACTER JOBZ, UPLO INTEGER IA, INFO, IZ, JA, JZ, LRWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) DOUBLE PRECISION RWORK( * ), W( * ) COMPLEX*16 A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZHEEV computes selected eigenvalues and, optionally, eigenvectors * of a complex Hermitian matrix A by calling the recommended sequence * of ScaLAPACK routines. * * In its present form, PZHEEV assumes a homogeneous system and makes * only spot checks of the consistency of the eigenvalues across the * different processes. Because of this, it is possible that a * heterogeneous system may return incorrect results without any error * messages. * * Notes * ===== * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distributed * matrix A. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of A. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of A. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of the * distributed matrix A. * LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic COMPLEX*16 array, * global dimension (N, N), local dimension ( LLD_A, * LOCc(JA+N-1) ) * * On entry, the Hermitian matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the Hermitian matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * Hermitian matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PZHEEV cannot guarantee * correct error reporting. * * W (global output) DOUBLE PRECISION array, dimension (N) * If INFO=0, the eigenvalues in ascending order. * * Z (local output) COMPLEX*16 array, * global dimension (N, N), * local dimension (LLD_Z, LOCc(JZ+N-1)) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX*16 array, * dimension (LWORK) * On output, WORK(1) returns the workspace needed to guarantee * completion. If the input parameters are incorrect, WORK(1) * may also be incorrect. * * If JOBZ='N' WORK(1) = minimal workspace for eigenvalues only. * If JOBZ='V' WORK(1) = minimal workspace required to * generate all the eigenvectors. * * * LWORK (local input) INTEGER * See below for definitions of variables used to define LWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LWORK >= MAX( NB*( NP0+1 ), 3 ) +3*N * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required: * LWORK >= (NP0 + NQ0 + NB)*NB + 3*N + N^2 * * Variable definitions: * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * NQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) * * If LWORK = -1, the LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the WORK array. The required workspace is returned * as the first element of WORK and no error message is issued * by PXERBLA. * * RWORK (local workspace/output) COMPLEX*16 array, * dimension (LRWORK) * On output RWORK(1) returns the * DOUBLE PRECISION workspace needed to * guarantee completion. If the input parameters are incorrect, * RWORK(1) may also be incorrect. * * LRWORK (local input) INTEGER * Size of RWORK array. * If eigenvectors are desired (JOBZ = 'V') then * LRWORK >= 2*N + 2*N-2 * If eigenvectors are not desired (JOBZ = 'N') then * LRWORK >= 2*N * * If LRWORK = -1, the LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * size for the RWORK array. The required workspace is returned * as the first element of RWORK and no error message is issued * by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = 1 through N, the i(th) eigenvalue did not * converge in ZSTEQR2 after a total of 30*N iterations. * If INFO = N+1, then PZHEEV has detected heterogeneity * by finding that eigenvalues were not identical across * the process grid. In this case, the accuracy of * the results from PZHEEV cannot be guaranteed. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * ( MB_A.EQ.NB_A.EQ.MB_Z .AND. IROFFA.EQ.IROFFZ .AND. IROFFA.EQ.0 .AND. * IAROW.EQ.IZROW ) * where * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * Version 1.4 limitations: * DESCA(MB_) = DESCA(NB_) * DESCA(M_) = DESCZ(M_) * DESCA(N_) = DESCZ(N_) * DESCA(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) INTEGER ITHVAL PARAMETER ( ITHVAL = 10 ) * .. * .. Local Scalars .. LOGICAL LOWER, WANTZ INTEGER CONTEXTC, CSRC_A, I, IACOL, IAROW, ICOFFA, $ IINFO, INDD, INDE, INDRD, INDRE, INDRWORK, $ INDTAU, INDWORK, INDWORK2, IROFFA, IROFFZ, $ ISCALE, IZROW, J, K, LDC, LLRWORK, LLWORK, $ LRMIN, LRWMIN, LWMIN, MB_A, MB_Z, MYCOL, $ MYPCOLC, MYPROWC, MYROW, NB, NB_A, NB_Z, NP0, $ NPCOL, NPCOLC, NPROCS, NPROW, NPROWC, NQ0, NRC, $ RSIZEZSTEQR2, RSRC_A, RSRC_Z, SIZEPZHETRD, $ SIZEPZUNMTR, SIZEZSTEQR2 DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, $ SMLNUM * .. * .. Local Arrays .. INTEGER DESCQR( 10 ), IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC, SL_GRIDRESHAPE DOUBLE PRECISION PDLAMCH, PZLANHE EXTERNAL LSAME, INDXG2P, NUMROC, SL_GRIDRESHAPE, $ PDLAMCH, PZLANHE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDEXIT, BLACS_GRIDINFO, CHK1MAT, DCOPY, $ DESCINIT, DGAMN2D, DGAMX2D, DSCAL, PCHK1MAT, $ PCHK2MAT, PXERBLA, PZELGET, PZGEMR2D, PZHETRD, $ PZLASCL, PZLASET, PZUNMTR, ZSTEQR2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, ICHAR, INT, MAX, MIN, MOD, $ SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return * IF( N.EQ.0 ) $ RETURN * * Test the input arguments. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 * * Initialize pointer to some safe value * INDTAU = 1 INDD = 1 INDE = 1 INDWORK = 1 INDWORK2 = 1 * INDRE = 1 INDRD = 1 INDRWORK = 1 * WANTZ = LSAME( JOBZ, 'V' ) IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE IF( WANTZ ) THEN IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) END IF END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 3, N, 3, IZ, JZ, DESCZ, 12, INFO ) * IF( INFO.EQ.0 ) THEN * * Get machine constants. * SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe minimum' ) EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * NPROCS = NPROW*NPCOL NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A LOWER = LSAME( UPLO, 'L' ) * RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( 1, NB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( 1, MB_A, MYCOL, CSRC_A, NPCOL ) NP0 = NUMROC( N+IROFFA, NB, MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+ICOFFA, NB, MYCOL, IACOL, NPCOL ) IF( WANTZ ) THEN NB_Z = DESCZ( NB_ ) MB_Z = DESCZ( MB_ ) RSRC_Z = DESCZ( RSRC_ ) IROFFZ = MOD( IZ-1, MB_A ) IZROW = INDXG2P( 1, NB_A, MYROW, RSRC_Z, NPROW ) ELSE IROFFZ = 0 IZROW = 0 END IF * * COMPLEX*16 work space for PZHETRD * CALL PZHETRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDE ), WORK( INDTAU ), $ WORK( INDWORK ), -1, IINFO ) SIZEPZHETRD = INT( ABS( WORK( 1 ) ) ) * * COMPLEX*16 work space for PZUNMTR * IF( WANTZ ) THEN CALL PZUNMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), -1, IINFO ) SIZEPZUNMTR = INT( ABS( WORK( 1 ) ) ) ELSE SIZEPZUNMTR = 0 END IF * * DOUBLE PRECISION work space for ZSTEQR2 * IF( WANTZ ) THEN RSIZEZSTEQR2 = MIN( 1, 2*N-2 ) ELSE RSIZEZSTEQR2 = 0 END IF * * Initialize the context of the single column distributed * matrix required by ZSTEQR2. This specific distribution * allows each process to do 1/pth of the work updating matrix * Q during ZSTEQR2 and achieve some parallelization to an * otherwise serial subroutine. * LDC = 0 IF( WANTZ ) THEN CONTEXTC = SL_GRIDRESHAPE( DESCA( CTXT_ ), 0, 1, 1, $ NPROCS, 1 ) CALL BLACS_GRIDINFO( CONTEXTC, NPROWC, NPCOLC, MYPROWC, $ MYPCOLC ) NRC = NUMROC( N, NB_A, MYPROWC, 0, NPROCS ) LDC = MAX( 1, NRC ) CALL DESCINIT( DESCQR, N, N, NB, NB, 0, 0, CONTEXTC, LDC, $ INFO ) END IF * * COMPLEX*16 work space for ZSTEQR2 * IF( WANTZ ) THEN SIZEZSTEQR2 = N*LDC ELSE SIZEZSTEQR2 = 0 END IF * * Set up pointers into the WORK array * INDTAU = 1 INDD = INDTAU + N INDE = INDD + N INDWORK = INDE + N INDWORK2 = INDWORK + N*LDC LLWORK = LWORK - INDWORK + 1 * * Set up pointers into the RWORK array * INDRE = 1 INDRD = INDRE + N INDRWORK = INDRD + N LLRWORK = LRWORK - INDRWORK + 1 * * Compute the total amount of space needed * LRWMIN = 2*N + RSIZEZSTEQR2 LWMIN = 3*N + MAX( SIZEPZHETRD, SIZEPZUNMTR, SIZEZSTEQR2 ) * END IF IF( INFO.EQ.0 ) THEN IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -14 ELSE IF( LRWORK.LT.LRWMIN .AND. LRWORK.NE.-1 ) THEN INFO = -16 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( WANTZ ) THEN IF( IROFFA.NE.IROFFZ ) THEN INFO = -10 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -10 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 1200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 1200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 1200+RSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) END IF END IF END IF IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 3 IF( WANTZ ) THEN CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IZ, $ JZ, DESCZ, 12, 3, IDUM1, IDUM2, INFO ) ELSE CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 3, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = DCMPLX( LWMIN ) RWORK( 1 ) = DBLE( LRWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PZHEEV', -INFO ) IF( WANTZ ) $ CALL BLACS_GRIDEXIT( CONTEXTC ) RETURN ELSE IF( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) THEN IF( WANTZ ) $ CALL BLACS_GRIDEXIT( CONTEXTC ) RETURN END IF * * Scale matrix to allowable range, if necessary. * ISCALE = 0 * ANRM = PZLANHE( 'M', UPLO, N, A, IA, JA, DESCA, $ RWORK( INDRWORK ) ) * * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM END IF * IF( ISCALE.EQ.1 ) THEN CALL PZLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) END IF * * Reduce Hermitian matrix to tridiagonal form. * CALL PZHETRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * Copy the values of D, E to all processes. * DO 10 I = 1, N CALL PZELGET( 'A', ' ', WORK( INDD+I-1 ), A, I+IA-1, I+JA-1, $ DESCA ) RWORK( INDRD+I-1 ) = DBLE( WORK( INDD+I-1 ) ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PZELGET( 'A', ' ', WORK( INDE+I-1 ), A, I+IA-1, I+JA, $ DESCA ) RWORK( INDRE+I-1 ) = DBLE( WORK( INDE+I-1 ) ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PZELGET( 'A', ' ', WORK( INDE+I-1 ), A, I+IA, I+JA-1, $ DESCA ) RWORK( INDRE+I-1 ) = DBLE( WORK( INDE+I-1 ) ) 30 CONTINUE END IF * IF( WANTZ ) THEN * CALL PZLASET( 'Full', N, N, CZERO, CONE, WORK( INDWORK ), 1, 1, $ DESCQR ) * * ZSTEQR2 is a modified version of LAPACK's CSTEQR. The * modifications allow each process to perform partial updates * to matrix Q. * CALL ZSTEQR2( 'I', N, RWORK( INDRD ), RWORK( INDRE ), $ WORK( INDWORK ), LDC, NRC, RWORK( INDRWORK ), $ INFO ) * CALL PZGEMR2D( N, N, WORK( INDWORK ), 1, 1, DESCQR, Z, IA, JA, $ DESCZ, CONTEXTC ) * CALL PZUNMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) * ELSE * CALL ZSTEQR2( 'N', N, RWORK( INDRD ), RWORK( INDRE ), $ WORK( INDWORK ), 1, 1, RWORK( INDRWORK ), INFO ) END IF * * Copy eigenvalues from workspace to output array * CALL DCOPY( N, RWORK( INDD ), 1, W, 1 ) * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL DSCAL( N, ONE / SIGMA, W, 1 ) END IF * WORK( 1 ) = DBLE( LWMIN ) * * Free up resources * IF( WANTZ ) THEN CALL BLACS_GRIDEXIT( CONTEXTC ) END IF * * Compare every ith eigenvalue, or all if there are only a few, * across the process grid to check for heterogeneity. * IF( N.LE.ITHVAL ) THEN J = N K = 1 ELSE J = N / ITHVAL K = ITHVAL END IF * LRMIN = INT( RWORK( 1 ) ) INDTAU = 0 INDE = INDTAU + J DO 40 I = 1, J RWORK( I+INDTAU ) = W( ( I-1 )*K+1 ) RWORK( I+INDE ) = W( ( I-1 )*K+1 ) 40 CONTINUE * CALL DGAMN2D( DESCA( CTXT_ ), 'All', ' ', J, 1, RWORK( 1+INDTAU ), $ J, 1, 1, -1, -1, 0 ) CALL DGAMX2D( DESCA( CTXT_ ), 'All', ' ', J, 1, RWORK( 1+INDE ), $ J, 1, 1, -1, -1, 0 ) * DO 50 I = 1, J IF( INFO.EQ.0 .AND. ( RWORK( I+INDTAU )-RWORK( I+INDE ).NE. $ ZERO ) ) THEN INFO = N + 1 END IF 50 CONTINUE RWORK( 1 ) = LRMIN * RETURN * * End of PZHEEV * END scalapack-2.0.2/SRC/pzheevd.f000644 000766 000024 00000037347 10363532303 016170 0ustar00juliestaff000000 000000 SUBROUTINE PZHEEVD( JOBZ, UPLO, N, A, IA, JA, DESCA, W, Z, IZ, JZ, $ DESCZ, WORK, LWORK, RWORK, LRWORK, IWORK, $ LIWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 25, 2002 * * .. Scalar Arguments .. CHARACTER JOBZ, UPLO INTEGER IA, INFO, IZ, JA, JZ, LIWORK, LRWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) DOUBLE PRECISION RWORK( * ), W( * ) COMPLEX*16 A( * ), WORK( * ), Z( * ) * * * Purpose * ======= * * PZHEEVD computes all the eigenvalues and eigenvectors of a Hermitian * matrix A by using a divide and conquer algorithm. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; (NOT IMPLEMENTED YET) * = 'V': Compute eigenvalues and eigenvectors. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic COMPLEX*16 array, * global dimension (N, N), local dimension ( LLD_A, * LOCc(JA+N-1) ) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PZHEEV cannot guarantee * correct error reporting. * * W (global output) DOUBLE PRECISION array, dimension (N) * If INFO=0, the eigenvalues in ascending order. * * Z (local output) COMPLEX*16 array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * Z contains the orthonormal eigenvectors of the matrix A. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX*16 array, * dimension (LWORK) * On output, WORK(1) returns the workspace needed for the * computation. * * LWORK (local input) INTEGER * If eigenvectors are requested: * LWORK = N + ( NP0 + MQ0 + NB ) * NB, * with NP0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine calculates the size for all * work arrays. Each of these values is returned in the first * entry of the corresponding work array, and no error message * is issued by PXERBLA. * * RWORK (local workspace/output) DOUBLE PRECISION array, * dimension (LRWORK) * On output RWORK(1) returns the real workspace needed to * guarantee completion. If the input parameters are incorrect, * RWORK(1) may also be incorrect. * * LRWORK (local input) INTEGER * Size of RWORK array. * LRWORK >= 1 + 9*N + 3*NP*NQ, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) * NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) * * IWORK (local workspace/output) INTEGER array, dimension (LIWORK) * On output IWORK(1) returns the integer workspace needed. * * LIWORK (input) INTEGER * The dimension of the array IWORK. * LIWORK = 7*N + 8*NPCOL + 2 * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = 1 through N, the i(th) eigenvalue did not * converge in PDLAED3. * * Alignment requirements * ====================== * * The distributed submatrices sub( A ), sub( Z ) must verify * some alignment properties, namely the following expression * should be true: * ( MB_A.EQ.NB_A.EQ.MB_Z.EQ.NB_Z .AND. IROFFA.EQ.ICOFFA .AND. * IROFFA.EQ.0 .AND.IROFFA.EQ.IROFFZ. AND. IAROW.EQ.IZROW) * with IROFFA = MOD( IA-1, MB_A ) * and ICOFFA = MOD( JA-1, NB_A ). * * Further Details * ======= ======= * * Contributed by Francoise Tisseur, University of Manchester. * * Reference: F. Tisseur and J. Dongarra, "A Parallel Divide and * Conquer Algorithm for the Symmetric Eigenvalue Problem * on Distributed Memory Architectures", * SIAM J. Sci. Comput., 6:20 (1999), pp. 2223--2236. * (see also LAPACK Working Note 132) * http://www.netlib.org/lapack/lawns/lawn132.ps * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LOWER, LQUERY INTEGER CSRC_A, I, IACOL, IAROW, ICOFFA, IINFO, IIZ, $ INDD, INDE, INDE2, INDRWORK, INDTAU, INDWORK, $ INDZ, IPR, IPZ, IROFFA, IROFFZ, ISCALE, IZCOL, $ IZROW, J, JJZ, LDR, LDZ, LIWMIN, LLRWORK, $ LLWORK, LRWMIN, LWMIN, MB_A, MYCOL, MYROW, NB, $ NB_A, NN, NP0, NPCOL, NPROW, NQ, NQ0, OFFSET, $ RSRC_A DOUBLE PRECISION ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, $ SMLNUM * .. * .. Local Arrays .. INTEGER DESCRZ( 9 ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC DOUBLE PRECISION PZLANHE, PDLAMCH EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC, PZLANHE, $ PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCINIT, INFOG2L, $ PZELGET, PZHETRD, PCHK2MAT, PZLASCL, PZLASET, $ PZUNMTR, PDLARED1D, PDLASET, PDSTEDC, PXERBLA, $ DSCAL * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, ICHAR, MAX, MIN, MOD, DBLE, SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * INFO = 0 * * Quick return * IF( N.EQ.0 ) $ RETURN * * Test the input arguments. * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) CALL CHK1MAT( N, 2, N, 2, IZ, JZ, DESCZ, 11, INFO ) IF( INFO.EQ.0 ) THEN LOWER = LSAME( UPLO, 'L' ) NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( IA, NB_A, MYROW, RSRC_A, NPROW ) IACOL = INDXG2P( JA, MB_A, MYCOL, CSRC_A, NPCOL ) NP0 = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ0 = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) IROFFZ = MOD( IZ-1, MB_A ) CALL INFOG2L( IZ, JZ, DESCZ, NPROW, NPCOL, MYROW, MYCOL, $ IIZ, JJZ, IZROW, IZCOL ) LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * * Compute the total amount of space needed * NN = MAX( N, NB, 2 ) NQ = NUMROC( NN, NB, 0, 0, NPCOL ) LWMIN = N + ( NP0+NQ+NB )*NB LRWMIN = 1 + 9*N + 3*NP0*NQ0 LIWMIN = 7*N + 8*NPCOL + 2 WORK( 1 ) = DCMPLX( LWMIN ) RWORK( 1 ) = DBLE( LRWMIN ) IWORK( 1 ) = LIWMIN IF( .NOT.LSAME( JOBZ, 'V' ) ) THEN INFO = -1 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -14 ELSE IF( LRWORK.LT.LRWMIN .AND. LRWORK.NE.-1 ) THEN INFO = -16 ELSE IF( IROFFA.NE.0 ) THEN INFO = -4 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFA.NE.IROFFZ ) THEN INFO = -10 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -10 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 1200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 1200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 1200+RSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 1200+CTXT_ ) END IF END IF IF( LOWER ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'U' ) END IF IDUM2( 1 ) = 2 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 14 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IZ, $ JZ, DESCZ, 11, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCA( CTXT_ ), 'PZHEEVD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Get machine constants. * SAFMIN = PDLAMCH( DESCA( CTXT_ ), 'Safe minimum' ) EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * * Set up pointers into the WORK array * INDTAU = 1 INDWORK = INDTAU + N LLWORK = LWORK - INDWORK + 1 * * Set up pointers into the RWORK array * INDE = 1 INDD = INDE + N INDE2 = INDD + N INDRWORK = INDE2 + N LLRWORK = LRWORK - INDRWORK + 1 * * Scale matrix to allowable range, if necessary. * ISCALE = 0 * ANRM = PZLANHE( 'M', UPLO, N, A, IA, JA, DESCA, $ RWORK( INDRWORK ) ) * * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM END IF * IF( ISCALE.EQ.1 ) THEN CALL PZLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) END IF * * Reduce Hermitian matrix to tridiagonal form. * CALL PZHETRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDE2 ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, IINFO ) * * Copy the values of D, E to all processes * * Here PxLARED1D is used to redistribute the tridiagonal matrix. * PxLARED1D, however, doesn't yet workMx Mawith arbritary matrix * distributions so we have PxELGET as a backup. * OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. RSRC_A.EQ.0 .AND. CSRC_A.EQ.0 ) $ THEN CALL PDLARED1D( N, IA, JA, DESCA, RWORK( INDD ), W, $ RWORK( INDRWORK ), LLRWORK ) * CALL PDLARED1D( N, IA, JA, DESCA, RWORK( INDE2 ), $ RWORK( INDE ), RWORK( INDRWORK ), LLRWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PZELGET( 'A', ' ', WORK( INDWORK ), A, I+IA-1, I+JA-1, $ DESCA ) W( I ) = DBLE( WORK( INDWORK ) ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PZELGET( 'A', ' ', WORK( INDWORK ), A, I+IA-1, I+JA, $ DESCA ) RWORK( INDE+I-1 ) = DBLE( WORK( INDWORK ) ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PZELGET( 'A', ' ', WORK( INDWORK ), A, I+IA, I+JA-1, $ DESCA ) RWORK( INDE+I-1 ) = DBLE( WORK( INDWORK ) ) 30 CONTINUE END IF END IF * * Call PDSTEDC to compute eigenvalues and eigenvectors. * INDZ = INDE + N INDRWORK = INDZ + NP0*NQ0 LLRWORK = LRWORK - INDRWORK + 1 LDR = MAX( 1, NP0 ) CALL DESCINIT( DESCRZ, DESCZ( M_ ), DESCZ( N_ ), DESCZ( MB_ ), $ DESCZ( NB_ ), DESCZ( RSRC_ ), DESCZ( CSRC_ ), $ DESCZ( CTXT_ ), LDR, INFO ) CALL PZLASET( 'Full', N, N, CZERO, CONE, Z, IZ, JZ, DESCZ ) CALL PDLASET( 'Full', N, N, ZERO, ONE, RWORK( INDZ ), 1, 1, $ DESCRZ ) CALL PDSTEDC( 'I', N, W, RWORK( INDE+OFFSET ), RWORK( INDZ ), IZ, $ JZ, DESCRZ, RWORK( INDRWORK ), LLRWORK, IWORK, $ LIWORK, IINFO ) * LDZ = DESCZ( LLD_ ) LDR = DESCRZ( LLD_ ) IIZ = INDXG2L( IZ, NB, MYROW, MYROW, NPROW ) JJZ = INDXG2L( JZ, NB, MYCOL, MYCOL, NPCOL ) IPZ = IIZ + ( JJZ-1 )*LDZ IPR = INDZ - 1 + IIZ + ( JJZ-1 )*LDR DO 50 J = 0, NQ0 - 1 DO 40 I = 0, NP0 - 1 Z( IPZ+I+J*LDZ ) = RWORK( IPR+I+J*LDR ) 40 CONTINUE 50 CONTINUE * * Z = Q * Z * CALL PZUNMTR( 'L', UPLO, 'N', N, N, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, WORK( INDWORK ), $ LLWORK, IINFO ) * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL DSCAL( N, ONE / SIGMA, W, 1 ) END IF * WORK( 1 ) = DCMPLX( LWMIN ) RWORK( 1 ) = DBLE( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN * * End of PZHEEVD * END scalapack-2.0.2/SRC/pzheevr.f000644 000766 000024 00000134042 11750130340 016170 0ustar00juliestaff000000 000000 SUBROUTINE PZHEEVR( JOBZ, RANGE, UPLO, N, A, IA, JA, $ DESCA, VL, VU, IL, IU, M, NZ, W, Z, IZ, $ JZ, DESCZ, $ WORK, LWORK, RWORK, LRWORK, IWORK, LIWORK, $ INFO ) IMPLICIT NONE * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, INFO, IU, IZ, JA, JZ, LIWORK, LRWORK, $ LWORK, M, N, NZ DOUBLE PRECISION VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) DOUBLE PRECISION W( * ), RWORK( * ) COMPLEX*16 A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZHEEVR computes selected eigenvalues and, optionally, eigenvectors * of a complex Hermitian matrix A distributed in 2D blockcyclic format * by calling the recommended sequence of ScaLAPACK routines. * * First, the matrix A is reduced to real symmetric tridiagonal form. * Then, the eigenproblem is solved using the parallel MRRR algorithm. * Last, if eigenvectors have been computed, a backtransformation is done. * * Upon successful completion, each processor stores a copy of all computed * eigenvalues in W. The eigenvector matrix Z is stored in * 2D blockcyclic format distributed over all processors. * * For constructive feedback and comments, please contact cvoemel@lbl.gov * C. Voemel * * * Arguments * ========= * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0 * * A (local input/workspace) 2D block cyclic COMPLEX*16 array, * global dimension (N, N), * local dimension ( LLD_A, LOCc(JA+N-1) ) * (see Notes below for more detailed explanation of 2d arrays) * * On entry, the symmetric matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the symmetric matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * symmetric matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * (The ScaLAPACK descriptor length is DLEN_ = 9.) * The array descriptor for the distributed matrix A. * The descriptor stores details about the 2D block-cyclic * storage, see the notes below. * If DESCA is incorrect, PZHEEVR cannot work correctly. * Also note the array alignment requirements specified below * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A'. * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M * * W (global output) DOUBLE PRECISION array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * Z (local output) COMPLEX*16 array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * It should be set to 1 when operating on a full matrix. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX*16 array, * dimension (LWORK) * WORK(1) returns workspace adequate workspace to allow * optimal performance. * * LWORK (local input) INTEGER * Size of WORK array, must be at least 3. * If only eigenvalues are requested: * LWORK >= N + MAX( NB * ( NP00 + 1 ), NB * 3 ) * If eigenvectors are requested: * LWORK >= N + ( NP00 + MQ00 + NB ) * NB * For definitions of NP00 & MQ00, see LRWORK. * * For optimal performance, greater workspace is needed, i.e. * LWORK >= MAX( LWORK, NHETRD_LWORK ) * Where LWORK is as defined above, and * NHETRD_LWORK = N + 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 1 ) * NPS * * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) * SQNPC = SQRT( DBLE( NPROW * NPCOL ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the * optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * NOTE THAT FOR OPTIMAL PERFORMANCE, LWOPT IS RETURNED * (THE OPTIMUM WORKSPACE) RATHER THAN THE MINIMUM NECESSARY * WORKSPACE LWMIN WHEN A WORKSPACE QUERY IS ISSUED. * FOR VERY SMALL MATRICES, LWOPT >> LWMIN. * * RWORK (local workspace/output) DOUBLE PRECISION array, * dimension (LRWORK) * On return, RWORK(1) contains the optimal amount of * workspace required for efficient execution. * if JOBZ='N' RWORK(1) = optimal amount of workspace * required to compute the eigenvalues. * if JOBZ='V' RWORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors. * * LRWORK (local input) INTEGER * Size of RWORK, must be at least 3. * See below for definitions of variables used to define LRWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LRWORK >= 2 + 5 * N + MAX( 12 * N, NB * ( NP00 + 1 ) ) * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required is: * LRWORK >= 2 + 5 * N + MAX( 18*N, NP00 * MQ00 + 2 * NB * NB ) + * (2 + ICEIL( NEIG, NPROW*NPCOL))*N * * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP00 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ00 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance for all work arrays. Each of * these values is returned in the first entry of the * corresponding work arrays, and no error message is issued by * PXERBLA. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * * Let NNP = MAX( N, NPROW*NPCOL + 1, 4 ). Then: * LIWORK >= 12*NNP + 2*N when the eigenvectors are desired * LIWORK >= 10*NNP + 2*N when only the eigenvalues have to be computed * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * PZHEEVR assumes IEEE 754 standard compliant arithmetic. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and Z(IZ:IZ+M-1,JZ:JZ+N-1) * must satisfy the following alignment properties: * * 1.Identical (quadratic) dimension: * DESCA(M_) = DESCZ(M_) = DESCA(N_) = DESCZ(N_) * 2.Quadratic conformal blocking: * DESCA(MB_) = DESCA(NB_) = DESCZ(MB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * 3.MOD( IA-1, MB_A ) = MOD( IZ-1, MB_Z ) = 0 * 4.IAROW = IZROW * * * .. Parameters .. INTEGER CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_ PARAMETER ( CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, COLBRT, DOBCST, FINISH, FIRST, INDEIG, $ LOWER, LQUERY, VALEIG, VSTART, WANTZ INTEGER ANB, DOL, DOU, DSTCOL, DSTROW, EIGCNT, FRSTCL, $ I, IAROW, ICTXT, IIL, IINDERR, IINDWLC, IINFO, $ IIU, IM, INDD, INDD2, INDE, INDE2, INDERR, $ INDILU, INDRTAU, INDRW, INDRWORK, INDTAU, $ INDWLC, INDWORK, IPIL, IPIU, IPROC, IZROW, $ LASTCL, LENGTHI, LENGTHI2, LIWMIN, LLRWORK, $ LLWORK, LRWMIN, LRWOPT, LWMIN, LWOPT, MAXCLS, $ MQ00, MYCOL, MYIL, MYIU, MYPROC, MYROW, MZ, NB, $ NDEPTH, NEEDIL, NEEDIU, NHETRD_LWOPT, NNP, $ NP00, NPCOL, NPROCS, NPROW, NPS, NSPLIT, $ OFFSET, PARITY, RLENGTHI, RLENGTHI2, RSTARTI, $ SIZE1, SIZE2, SQNPC, SRCCOL, SRCROW, STARTI, $ ZOFFSET DOUBLE PRECISION PIVMIN, SAFMIN, SCALE, VLL, VUU, WL, $ WU * * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH, $ PJLAENV * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DCOPY, DGEBR2D, $ DGEBS2D, DGERV2D, DGESD2D, DLARRC, DLASRT2, $ DSTEGR2A, DSTEGR2B, DSTEGR2, IGEBR2D, $ IGEBS2D, IGERV2D, IGESD2D, IGSUM2D, PCHK1MAT, $ PCHK2MAT, PDLARED1D, PXERBLA, PZELGET, $ PZHENTRD, PZLAEVSWP, PZUNMTR * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, ICHAR, INT, MAX, MIN, MOD, $ SQRT * .. * .. Executable Statements .. * INFO = 0 *********************************************************************** * * Decode character arguments to find out what the code should do * *********************************************************************** WANTZ = LSAME( JOBZ, 'V' ) LOWER = LSAME( UPLO, 'L' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) *********************************************************************** * * GET MACHINE PARAMETERS * *********************************************************************** ICTXT = DESCA( CTXT_ ) SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) *********************************************************************** * * Set up pointers into the (complex) WORK array * *********************************************************************** INDTAU = 1 INDWORK = INDTAU + N LLWORK = LWORK - INDWORK + 1 *********************************************************************** * * Set up pointers into the RWORK array * *********************************************************************** INDRTAU = 1 INDD = INDRTAU + N INDE = INDD + N + 1 INDD2 = INDE + N + 1 INDE2 = INDD2 + N INDRWORK = INDE2 + N LLRWORK = LRWORK - INDRWORK + 1 *********************************************************************** * * BLACS PROCESSOR GRID SETUP * *********************************************************************** CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NPROCS = NPROW * NPCOL MYPROC = MYROW * NPCOL + MYCOL IF( NPROW.EQ.-1 ) THEN INFO = -( 800+CTXT_ ) ELSE IF( WANTZ ) THEN IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF *********************************************************************** * * COMPUTE REAL WORKSPACE * *********************************************************************** IF ( ALLEIG ) THEN MZ = N ELSE IF ( INDEIG ) THEN MZ = IU - IL + 1 ELSE * Take upper bound for VALEIG case MZ = N END IF * NB = DESCA( NB_ ) NP00 = NUMROC( N, NB, 0, 0, NPROW ) MQ00 = NUMROC( MZ, NB, 0, 0, NPCOL ) IF ( WANTZ ) THEN INDRW = INDRWORK + MAX(18*N, NP00*MQ00 + 2*NB*NB) LRWMIN = INDRW - 1 + (ICEIL(MZ, NPROCS) + 2)*N LWMIN = N + MAX((NP00 + MQ00 + NB) * NB, 3 * NB) ELSE INDRW = INDRWORK + 12*N LRWMIN = INDRW - 1 LWMIN = N + MAX( NB*( NP00 + 1 ), 3 * NB ) END IF * The code that validates the input requires 3 workspace entries LRWMIN = MAX(3, LRWMIN) LRWOPT = LRWMIN LWMIN = MAX(3, LWMIN) LWOPT = LWMIN * ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROCS ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS LWOPT = MAX( LWOPT, N+NHETRD_LWOPT ) * SIZE1 = INDRW - INDRWORK *********************************************************************** * * COMPUTE INTEGER WORKSPACE * *********************************************************************** NNP = MAX( N, NPROCS+1, 4 ) IF ( WANTZ ) THEN LIWMIN = 12*NNP + 2*N ELSE LIWMIN = 10*NNP + 2*N END IF *********************************************************************** * * Set up pointers into the IWORK array * *********************************************************************** * Pointer to eigenpair distribution over processors INDILU = LIWMIN - 2*NPROCS + 1 SIZE2 = INDILU - 2*N *********************************************************************** * * Test the input arguments. * *********************************************************************** IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 21, INFO ) * IF( INFO.EQ.0 ) THEN IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -3 ELSE IF( MOD( IA-1, DESCA( MB_ ) ).NE.0 ) THEN INFO = -6 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -10 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -11 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N )) $ THEN INFO = -12 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -21 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -25 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 800+NB_ ) END IF IF( WANTZ ) THEN IAROW = INDXG2P( 1, DESCA( NB_ ), MYROW, $ DESCA( RSRC_ ), NPROW ) IZROW = INDXG2P( 1, DESCA( NB_ ), MYROW, $ DESCZ( RSRC_ ), NPROW ) IF( IAROW.NE.IZROW ) THEN INFO = -19 ELSE IF( MOD( IA-1, DESCA( MB_ ) ).NE. $ MOD( IZ-1, DESCZ( MB_ ) ) ) THEN INFO = -19 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2100+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2100+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2100+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2100+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2100+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2100+CSRC_ ) ELSE IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( ALLEIG ) THEN IDUM1( 3 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 3 ) = ICHAR( 'I' ) ELSE IDUM1( 3 ) = ICHAR( 'V' ) END IF IDUM2( 3 ) = 3 IF( LQUERY ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 4 IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 8, N, 4, N, 4,IZ, $ JZ, DESCZ, 21, 4, IDUM1, IDUM2, INFO ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 4, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = DCMPLX( LWOPT ) RWORK( 1 ) = DBLE( LRWOPT ) IWORK( 1 ) = LIWMIN END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHEEVR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF *********************************************************************** * * Quick return if possible * *********************************************************************** IF( N.EQ.0 ) THEN IF( WANTZ ) THEN NZ = 0 END IF M = 0 WORK( 1 ) = DCMPLX( LWOPT ) RWORK( 1 ) = DBLE( LRWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF IF( VALEIG ) THEN VLL = VL VUU = VU ELSE VLL = ZERO VUU = ZERO END IF * * No scaling done here, leave this to MRRR kernel. * Scale tridiagonal rather than full matrix. * *********************************************************************** * * REDUCE MATRIX TO REAL SYMMETRIC TRIDIAGONAL FORM. * *********************************************************************** CALL PZHENTRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, RWORK( INDRWORK ), LLRWORK,IINFO ) IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'PZHENTRD', -IINFO ) RETURN END IF *********************************************************************** * * DISTRIBUTE TRIDIAGONAL TO ALL PROCESSORS * *********************************************************************** OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. $ DESCA( RSRC_ ).EQ.0 .AND. DESCA( CSRC_ ).EQ.0 ) $ THEN CALL PDLARED1D( N, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDD2 ), RWORK( INDRWORK ), LLRWORK ) * CALL PDLARED1D( N, IA, JA, DESCA, RWORK( INDE ), $ RWORK( INDE2 ), RWORK( INDRWORK ), LLRWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PZELGET( 'A', ' ', WORK( INDWORK ), A, $ I+IA-1, I+JA-1, DESCA ) RWORK( INDD2+I-1 ) = DBLE( WORK( INDWORK ) ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PZELGET( 'A', ' ', WORK( INDWORK ), A, $ I+IA-1, I+JA, DESCA ) RWORK( INDE2+I-1 ) = DBLE( WORK( INDWORK ) ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PZELGET( 'A', ' ', WORK( INDWORK ), A, $ I+IA, I+JA-1, DESCA ) RWORK( INDE2+I-1 ) = DBLE( WORK( INDWORK ) ) 30 CONTINUE END IF END IF *********************************************************************** * * SET IIL, IIU * *********************************************************************** IF ( ALLEIG ) THEN IIL = 1 IIU = N ELSE IF ( INDEIG ) THEN IIL = IL IIU = IU ELSE IF ( VALEIG ) THEN CALL DLARRC('T', N, VLL, VUU, RWORK( INDD2 ), $ RWORK( INDE2 + OFFSET ), SAFMIN, EIGCNT, IIL, IIU, INFO) * Refine upper bound N that was taken MZ = EIGCNT IIL = IIL + 1 ENDIF IF(MZ.EQ.0) THEN M = 0 IF( WANTZ ) THEN NZ = 0 END IF WORK( 1 ) = DBLE( LWOPT ) IWORK( 1 ) = LIWMIN RETURN END IF MYIL = 0 MYIU = 0 M = 0 IM = 0 *********************************************************************** * * COMPUTE WORK ASSIGNMENTS * *********************************************************************** * * Each processor computes the work assignments for all processors * CALL PMPIM2( IIL, IIU, NPROCS, $ IWORK(INDILU), IWORK(INDILU+NPROCS) ) * * Find local work assignment * MYIL = IWORK(INDILU+MYPROC) MYIU = IWORK(INDILU+NPROCS+MYPROC) ZOFFSET = MAX(0, MYIL - IIL - 1) FIRST = ( MYIL .EQ. IIL ) *********************************************************************** * * CALLS TO MRRR KERNEL * *********************************************************************** IF(.NOT.WANTZ) THEN * * Compute eigenvalues only. * IINFO = 0 IF ( MYIL.GT.0 ) THEN DOL = 1 DOU = MYIU - MYIL + 1 CALL DSTEGR2( JOBZ, 'I', N, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), VLL, VUU, MYIL, MYIU, $ IM, W( 1 ), RWORK( INDRW ), N, $ MYIU - MYIL + 1, $ IWORK( 1 ), RWORK( INDRWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, $ DOL, DOU, ZOFFSET, IINFO ) * DSTEGR2 zeroes out the entire W array, so we can't just give * it the part of W we need. So here we copy the W entries into * their correct location DO 49 I = 1, IM W( MYIL-IIL+I ) = W( I ) 49 CONTINUE * W( MYIL ) is at W( MYIL - IIL + 1 ) * W( X ) is at W(X - IIL + 1 ) END IF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'DSTEGR2', -IINFO ) RETURN END IF ELSEIF ( WANTZ .AND. NPROCS.EQ.1 ) THEN * * Compute eigenvalues and -vectors, but only on one processor * IINFO = 0 IF ( MYIL.GT.0 ) THEN DOL = MYIL - IIL + 1 DOU = MYIU - IIL + 1 CALL DSTEGR2( JOBZ, 'I', N, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), VLL, VUU, IIL, IIU, $ IM, W( 1 ), RWORK( INDRW ), N, $ N, $ IWORK( 1 ), RWORK( INDRWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, DOU, $ ZOFFSET, IINFO ) ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'DSTEGR2', -IINFO ) RETURN END IF ELSEIF ( WANTZ ) THEN * Compute representations in parallel. * Share eigenvalue computation for root between all processors * Then compute the eigenvectors. IINFO = 0 * Part 1. compute root representations and root eigenvalues IF ( MYIL.GT.0 ) THEN DOL = MYIL - IIL + 1 DOU = MYIU - IIL + 1 CALL DSTEGR2A( JOBZ, 'I', N, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), VLL, VUU, IIL, IIU, $ IM, W( 1 ), RWORK( INDRW ), N, $ N, RWORK( INDRWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, $ DOU, NEEDIL, NEEDIU, $ INDERR, NSPLIT, PIVMIN, SCALE, WL, WU, $ IINFO ) ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'DSTEGR2A', -IINFO ) RETURN END IF * * The second part of parallel MRRR, the representation tree * construction begins. Upon successful completion, the * eigenvectors have been computed. This is indicated by * the flag FINISH. * VSTART = .TRUE. FINISH = (MYIL.LE.0) C Part 2. Share eigenvalues and uncertainties between all processors IINDERR = INDRWORK + INDERR - 1 * * * There are currently two ways to communicate eigenvalue information * using the BLACS. * 1.) BROADCAST * 2.) POINT2POINT between collaborators (those processors working * jointly on a cluster. * For efficiency, BROADCAST has been disabled. * At a later stage, other more efficient communication algorithms * might be implemented, e. g. group or tree-based communication. DOBCST = .FALSE. IF(DOBCST) THEN * First gather everything on the first processor. * Then use BROADCAST-based communication DO 45 I = 2, NPROCS IF (MYPROC .EQ. (I - 1)) THEN DSTROW = 0 DSTCOL = 0 STARTI = DOL IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF (( STARTI.GE.1 ) .AND. ( LENGTHI.GE.1 )) THEN LENGTHI2 = 2*LENGTHI * Copy eigenvalues into communication buffer CALL DCOPY(LENGTHI,W( STARTI ),1, $ RWORK( INDD ), 1) * Copy uncertainties into communication buffer CALL DCOPY(LENGTHI,RWORK(IINDERR+STARTI-1),1, $ RWORK( INDD+LENGTHI ), 1) * send buffer CALL DGESD2D( ICTXT, LENGTHI2, $ 1, RWORK( INDD ), LENGTHI2, $ DSTROW, DSTCOL ) END IF ELSE IF (MYPROC .EQ. 0) THEN SRCROW = (I-1) / NPCOL SRCCOL = MOD(I-1, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) STARTI = IWORK(1) LENGTHI = IWORK(2) IF (( STARTI.GE.1 ) .AND. ( LENGTHI.GE.1 )) THEN LENGTHI2 = 2*LENGTHI * receive buffer CALL DGERV2D( ICTXT, LENGTHI2, 1, $ RWORK(INDD), LENGTHI2, SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL DCOPY( LENGTHI, RWORK(INDD), 1, $ W( STARTI ), 1) * copy uncertainties (errors) from communication buffer CALL DCOPY(LENGTHI,RWORK(INDD+LENGTHI),1, $ RWORK( IINDERR+STARTI-1 ), 1) END IF END IF 45 CONTINUE LENGTHI = IIU - IIL + 1 LENGTHI2 = LENGTHI * 2 IF (MYPROC .EQ. 0) THEN * Broadcast eigenvalues and errors to all processors CALL DCOPY(LENGTHI,W ,1, RWORK( INDD ), 1) CALL DCOPY(LENGTHI,RWORK( IINDERR ),1, $ RWORK( INDD+LENGTHI ), 1) CALL DGEBS2D( ICTXT, 'A', ' ', LENGTHI2, 1, $ RWORK(INDD), LENGTHI2 ) ELSE SRCROW = 0 SRCCOL = 0 CALL DGEBR2D( ICTXT, 'A', ' ', LENGTHI2, 1, $ RWORK(INDD), LENGTHI2, SRCROW, SRCCOL ) CALL DCOPY( LENGTHI, RWORK(INDD), 1, W, 1) CALL DCOPY(LENGTHI,RWORK(INDD+LENGTHI),1, $ RWORK( IINDERR ), 1) END IF ELSE * Enable point2point communication between collaborators * Find collaborators of MYPROC IF( (NPROCS.GT.1).AND.(MYIL.GT.0) ) THEN CALL PMPCOL( MYPROC, NPROCS, IIL, NEEDIL, NEEDIU, $ IWORK(INDILU), IWORK(INDILU+NPROCS), $ COLBRT, FRSTCL, LASTCL ) ELSE COLBRT = .FALSE. ENDIF IF(COLBRT) THEN * If the processor collaborates with others, * communicate information. DO 47 IPROC = FRSTCL, LASTCL IF (MYPROC .EQ. IPROC) THEN STARTI = DOL IWORK(1) = STARTI LENGTHI = MYIU - MYIL + 1 IWORK(2) = LENGTHI IF ((STARTI.GE.1) .AND. (LENGTHI.GE.1)) THEN * Copy eigenvalues into communication buffer CALL DCOPY(LENGTHI,W( STARTI ),1, $ RWORK(INDD), 1) * Copy uncertainties into communication buffer CALL DCOPY(LENGTHI, $ RWORK( IINDERR+STARTI-1 ),1, $ RWORK(INDD+LENGTHI), 1) ENDIF DO 46 I = FRSTCL, LASTCL IF(I.EQ.MYPROC) GOTO 46 DSTROW = I/ NPCOL DSTCOL = MOD(I, NPCOL) CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1) .AND. (LENGTHI.GE.1)) THEN LENGTHI2 = 2*LENGTHI * send buffer CALL DGESD2D( ICTXT, LENGTHI2, $ 1, RWORK(INDD), LENGTHI2, $ DSTROW, DSTCOL ) END IF 46 CONTINUE ELSE SRCROW = IPROC / NPCOL SRCCOL = MOD(IPROC, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) RSTARTI = IWORK(1) RLENGTHI = IWORK(2) IF ((RSTARTI.GE.1 ) .AND. (RLENGTHI.GE.1 )) THEN RLENGTHI2 = 2*RLENGTHI CALL DGERV2D( ICTXT, RLENGTHI2, 1, $ RWORK(INDE), RLENGTHI2, $ SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL DCOPY( RLENGTHI,RWORK(INDE), 1, $ W( RSTARTI ), 1) * copy uncertainties (errors) from communication buffer CALL DCOPY(RLENGTHI,RWORK(INDE+RLENGTHI),1, $ RWORK( IINDERR+RSTARTI-1 ), 1) END IF END IF 47 CONTINUE ENDIF ENDIF * Part 3. Compute representation tree and eigenvectors. * What follows is a loop in which the tree * is constructed in parallel from top to bottom, * on level at a time, until all eigenvectors * have been computed. * 100 CONTINUE IF ( MYIL.GT.0 ) THEN CALL DSTEGR2B( JOBZ, N, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), $ IM, W( 1 ), RWORK( INDRW ), N, N, $ IWORK( 1 ), RWORK( INDRWORK ), SIZE1, $ IWORK( 2*N+1 ), SIZE2, DOL, $ DOU, NEEDIL, NEEDIU, INDWLC, $ PIVMIN, SCALE, WL, WU, $ VSTART, FINISH, $ MAXCLS, NDEPTH, PARITY, ZOFFSET, IINFO ) IINDWLC = INDRWORK + INDWLC - 1 IF(.NOT.FINISH) THEN IF((NEEDIL.LT.DOL).OR.(NEEDIU.GT.DOU)) THEN CALL PMPCOL( MYPROC, NPROCS, IIL, NEEDIL, NEEDIU, $ IWORK(INDILU), IWORK(INDILU+NPROCS), $ COLBRT, FRSTCL, LASTCL ) ELSE COLBRT = .FALSE. FRSTCL = MYPROC LASTCL = MYPROC ENDIF * * Check if this processor collaborates, i.e. * communication is needed. * IF(COLBRT) THEN DO 147 IPROC = FRSTCL, LASTCL IF (MYPROC .EQ. IPROC) THEN STARTI = DOL IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN * Copy eigenvalues into communication buffer CALL DCOPY(LENGTHI, $ RWORK( IINDWLC+STARTI-1 ),1, $ RWORK(INDD), 1) * Copy uncertainties into communication buffer CALL DCOPY(LENGTHI, $ RWORK( IINDERR+STARTI-1 ),1, $ RWORK(INDD+LENGTHI), 1) ENDIF DO 146 I = FRSTCL, LASTCL IF(I.EQ.MYPROC) GOTO 146 DSTROW = I/ NPCOL DSTCOL = MOD(I, NPCOL) CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN LENGTHI2 = 2*LENGTHI * send buffer CALL DGESD2D( ICTXT, LENGTHI2, $ 1, RWORK(INDD), LENGTHI2, $ DSTROW, DSTCOL ) END IF 146 CONTINUE ELSE SRCROW = IPROC / NPCOL SRCCOL = MOD(IPROC, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) RSTARTI = IWORK(1) RLENGTHI = IWORK(2) IF ((RSTARTI.GE.1).AND.(RLENGTHI.GE.1)) THEN RLENGTHI2 = 2*RLENGTHI CALL DGERV2D( ICTXT,RLENGTHI2, 1, $ RWORK(INDE),RLENGTHI2, $ SRCROW, SRCCOL ) * copy eigenvalues from communication buffer CALL DCOPY(RLENGTHI,RWORK(INDE), 1, $ RWORK( IINDWLC+RSTARTI-1 ), 1) * copy uncertainties (errors) from communication buffer CALL DCOPY(RLENGTHI,RWORK(INDE+RLENGTHI), $ 1,RWORK( IINDERR+RSTARTI-1 ), 1) END IF END IF 147 CONTINUE ENDIF GOTO 100 ENDIF ENDIF IF (IINFO .NE. 0) THEN CALL PXERBLA( ICTXT, 'DSTEGR2B', -IINFO ) RETURN END IF * ENDIF * *********************************************************************** * * MAIN PART ENDS HERE * *********************************************************************** * *********************************************************************** * * ALLGATHER: EACH PROCESSOR SENDS ITS EIGENVALUES TO THE FIRST ONE, * THEN THE FIRST PROCESSOR BROADCASTS ALL EIGENVALUES * *********************************************************************** DO 50 I = 2, NPROCS IF (MYPROC .EQ. (I - 1)) THEN DSTROW = 0 DSTCOL = 0 STARTI = MYIL - IIL + 1 IWORK(1) = STARTI IF(MYIL.GT.0) THEN LENGTHI = MYIU - MYIL + 1 ELSE LENGTHI = 0 ENDIF IWORK(2) = LENGTHI CALL IGESD2D( ICTXT, 2, 1, IWORK, 2, $ DSTROW, DSTCOL ) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN CALL DGESD2D( ICTXT, LENGTHI, $ 1, W( STARTI ), LENGTHI, $ DSTROW, DSTCOL ) ENDIF ELSE IF (MYPROC .EQ. 0) THEN SRCROW = (I-1) / NPCOL SRCCOL = MOD(I-1, NPCOL) CALL IGERV2D( ICTXT, 2, 1, IWORK, 2, $ SRCROW, SRCCOL ) STARTI = IWORK(1) LENGTHI = IWORK(2) IF ((STARTI.GE.1).AND.(LENGTHI.GE.1)) THEN CALL DGERV2D( ICTXT, LENGTHI, 1, $ W( STARTI ), LENGTHI, SRCROW, SRCCOL ) ENDIF ENDIF 50 CONTINUE * Accumulate M from all processors M = IM CALL IGSUM2D( ICTXT, 'A', ' ', 1, 1, M, 1, -1, -1 ) * Broadcast eigenvalues to all processors IF (MYPROC .EQ. 0) THEN * Send eigenvalues CALL DGEBS2D( ICTXT, 'A', ' ', M, 1, W, M ) ELSE SRCROW = 0 SRCCOL = 0 CALL DGEBR2D( ICTXT, 'A', ' ', M, 1, $ W, M, SRCROW, SRCCOL ) END IF * * Sort the eigenvalues and keep permutation in IWORK to * sort the eigenvectors accordingly * DO 160 I = 1, M IWORK( NPROCS+1+I ) = I 160 CONTINUE CALL DLASRT2( 'I', M, W, IWORK( NPROCS+2 ), IINFO ) IF (IINFO.NE.0) THEN CALL PXERBLA( ICTXT, 'DLASRT2', -IINFO ) RETURN END IF *********************************************************************** * * TRANSFORM Z FROM 1D WORKSPACE INTO 2D BLOCKCYCLIC STORAGE * *********************************************************************** IF ( WANTZ ) THEN DO 170 I = 1, M IWORK( M+NPROCS+1+IWORK( NPROCS+1+I ) ) = I 170 CONTINUE * Store NVS in IWORK(1:NPROCS+1) for PZLAEVSWP IWORK( 1 ) = 0 DO 180 I = 1, NPROCS * Find IL and IU for processor i-1 * Has already been computed by PMPIM2 and stored IPIL = IWORK(INDILU+I-1) IPIU = IWORK(INDILU+NPROCS+I-1) IF (IPIL .EQ. 0) THEN IWORK( I + 1 ) = IWORK( I ) ELSE IWORK( I + 1 ) = IWORK( I ) + IPIU - IPIL + 1 ENDIF 180 CONTINUE IF ( FIRST ) THEN CALL PZLAEVSWP(N, RWORK( INDRW ), N, Z, IZ, JZ, $ DESCZ, IWORK( 1 ), IWORK( NPROCS+M+2 ), RWORK( INDRWORK ), $ SIZE1 ) ELSE CALL PZLAEVSWP(N, RWORK( INDRW + N ), N, Z, IZ, JZ, $ DESCZ, IWORK( 1 ), IWORK( NPROCS+M+2 ), RWORK( INDRWORK ), $ SIZE1 ) END IF * NZ = M * *********************************************************************** * * Compute eigenvectors of A from eigenvectors of T * *********************************************************************** IF( NZ.GT.0 ) THEN CALL PZUNMTR( 'L', UPLO, 'N', N, NZ, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) END IF IF (IINFO.NE.0) THEN CALL PXERBLA( ICTXT, 'PZUNMTR', -IINFO ) RETURN END IF * END IF * WORK( 1 ) = DCMPLX( LWOPT ) RWORK( 1 ) = DBLE( LRWOPT ) IWORK( 1 ) = LIWMIN RETURN * * End of PZHEEVR * END scalapack-2.0.2/SRC/pzheevx.f000644 000766 000024 00000117002 11657236177 016220 0ustar00juliestaff000000 000000 SUBROUTINE PZHEEVX( JOBZ, RANGE, UPLO, N, A, IA, JA, DESCA, VL, $ VU, IL, IU, ABSTOL, M, NZ, W, ORFAC, Z, IZ, $ JZ, DESCZ, WORK, LWORK, RWORK, LRWORK, IWORK, $ LIWORK, IFAIL, ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IL, INFO, IU, IZ, JA, JZ, LIWORK, LRWORK, $ LWORK, M, N, NZ DOUBLE PRECISION ABSTOL, ORFAC, VL, VU * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), ICLUSTR( * ), $ IFAIL( * ), IWORK( * ) DOUBLE PRECISION GAP( * ), RWORK( * ), W( * ) COMPLEX*16 A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZHEEVX computes selected eigenvalues and, optionally, eigenvectors * of a complex hermitian matrix A by calling the recommended sequence * of ScaLAPACK routines. Eigenvalues/vectors can be selected by * specifying a range of values or a range of indices for the desired * eigenvalues. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * PZHEEVX assumes IEEE 754 standard compliant arithmetic. To port * to a system which does not have IEEE 754 arithmetic, modify * the appropriate SLmake.inc file to include the compiler switch * -DNO_IEEE. This switch only affects the compilation of pdlaiect.c. * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * JOBZ (global input) CHARACTER*1 * Specifies whether or not to compute the eigenvectors: * = 'N': Compute eigenvalues only. * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns of the matrix A. N >= 0. * * A (local input/workspace) block cyclic COMPLEX*16 array, * global dimension (N, N), * local dimension ( LLD_A, LOCc(JA+N-1) ) * * On entry, the Hermitian matrix A. If UPLO = 'U', only the * upper triangular part of A is used to define the elements of * the Hermitian matrix. If UPLO = 'L', only the lower * triangular part of A is used to define the elements of the * Hermitian matrix. * * On exit, the lower triangle (if UPLO='L') or the upper * triangle (if UPLO='U') of A, including the diagonal, is * destroyed. * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PZHEEVX cannot guarantee * correct error reporting. * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * ABSTOL (global input) DOUBLE PRECISION * If JOBZ='V', setting ABSTOL to PDLAMCH( CONTEXT, 'U') yields * the most orthogonal eigenvectors. * * The absolute error tolerance for the eigenvalues. * An approximate eigenvalue is accepted as converged * when it is determined to lie in an interval [a,b] * of width less than or equal to * * ABSTOL + EPS * max( |a|,|b| ) , * * where EPS is the machine precision. If ABSTOL is less than * or equal to zero, then EPS*norm(T) will be used in its place, * where norm(T) is the 1-norm of the tridiagonal matrix * obtained by reducing A to tridiagonal form. * * Eigenvalues will be computed most accurately when ABSTOL is * set to twice the underflow threshold 2*PDLAMCH('S') not zero. * If this routine returns with ((MOD(INFO,2).NE.0) .OR. * (MOD(INFO/8,2).NE.0)), indicating that some eigenvalues or * eigenvectors did not converge, try setting ABSTOL to * 2*PDLAMCH('S'). * * See "Computing Small Singular Values of Bidiagonal Matrices * with Guaranteed High Relative Accuracy," by Demmel and * Kahan, LAPACK Working Note #3. * * See "On the correctness of Parallel Bisection in Floating * Point" by Demmel, Dhillon and Ren, LAPACK Working Note #70 * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M unless the user supplies * insufficient space and PZHEEVX is not able to detect this * before beginning computation. To get all the eigenvectors * requested, the user must supply both sufficient * space to hold the eigenvectors in Z (M .LE. DESCZ(N_)) * and sufficient workspace to compute them. (See LWORK below.) * PZHEEVX is always able to detect insufficient space without * computation unless RANGE .EQ. 'V'. * * W (global output) DOUBLE PRECISION array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * ORFAC (global input) DOUBLE PRECISION * Specifies which eigenvectors should be reorthogonalized. * Eigenvectors that correspond to eigenvalues which are within * tol=ORFAC*norm(A) of each other are to be reorthogonalized. * However, if the workspace is insufficient (see LWORK), * tol may be decreased until all eigenvectors to be * reorthogonalized can be stored in one process. * No reorthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) COMPLEX*16 array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. If an eigenvector * fails to converge, then that column of Z contains the latest * approximation to the eigenvector, and the index of the * eigenvector is returned in IFAIL. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX*16 array, * dimension (LWORK) * WORK(1) returns workspace adequate workspace to allow * optimal performance. * * LWORK (local input) INTEGER * Size of WORK array. If only eigenvalues are requested: * LWORK >= N + MAX( NB * ( NP0 + 1 ), 3 ) * If eigenvectors are requested: * LWORK >= N + ( NP0 + MQ0 + NB ) * NB * with NQ0 = NUMROC( NN, NB, 0, 0, NPCOL ). * * For optimal performance, greater workspace is needed, i.e. * LWORK >= MAX( LWORK, NHETRD_LWORK ) * Where LWORK is as defined above, and * NHETRD_LWORK = N + 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 1 ) * NPS * * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) * SQNPC = SQRT( DBLE( NPROW * NPCOL ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the * optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/output) DOUBLE PRECISION array, * dimension max(3,LRWORK) * On return, WORK(1) contains the optimal amount of * workspace required for efficient execution. * if JOBZ='N' RWORK(1) = optimal amount of workspace * required to compute eigenvalues efficiently * if JOBZ='V' RWORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors * efficiently with no guarantee on orthogonality. * If RANGE='V', it is assumed that all eigenvectors * may be required. * * LRWORK (local input) INTEGER * Size of RWORK * See below for definitions of variables used to define LRWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LRWORK >= 5 * NN + 4 * N * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * LRWORK >= 4*N + MAX( 5*NN, NP0 * MQ0 ) + * ICEIL( NEIG, NPROW*NPCOL)*NN * * The computed eigenvectors may not be orthogonal if the * minimal workspace is supplied and ORFAC is too small. * If you want to guarantee orthogonality (at the cost * of potentially poor performance) you should add * the following to LRWORK: * (CLUSTERSIZE-1)*N * where CLUSTERSIZE is the number of eigenvalues in the * largest cluster, where a cluster is defined as a set of * close eigenvalues: { W(K),...,W(K+CLUSTERSIZE-1) | * W(J+1) <= W(J) + ORFAC*2*norm(A) } * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = * DESCZ( MB_ ) = DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * When LRWORK is too small: * If LRWORK is too small to guarantee orthogonality, * PZHEEVX attempts to maintain orthogonality in * the clusters with the smallest * spacing between the eigenvalues. * If LRWORK is too small to compute all the eigenvectors * requested, no computation is performed and INFO=-25 * is returned. Note that when RANGE='V', PZHEEVX does * not know how many eigenvectors are requested until * the eigenvalues are computed. Therefore, when RANGE='V' * and as long as LRWORK is large enough to allow PZHEEVX to * compute the eigenvalues, PZHEEVX will compute the * eigenvalues and as many eigenvectors as it can. * * Relationship between workspace, orthogonality & performance: * If CLUSTERSIZE >= N/SQRT(NPROW*NPCOL), then providing * enough space to compute all the eigenvectors * orthogonally will cause serious degradation in * performance. In the limit (i.e. CLUSTERSIZE = N-1) * PZSTEIN will perform no better than ZSTEIN on 1 * processor. * For CLUSTERSIZE = N/SQRT(NPROW*NPCOL) reorthogonalizing * all eigenvectors will increase the total execution time * by a factor of 2 or more. * For CLUSTERSIZE > N/SQRT(NPROW*NPCOL) execution time will * grow as the square of the cluster size, all other factors * remaining equal and assuming enough workspace. Less * workspace means less reorthogonalization but faster * execution. * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the size * required for optimal performance for all work arrays. Each of * these values is returned in the first entry of the * corresponding work arrays, and no error message is issued by * PXERBLA. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * LIWORK >= 6 * NNP * Where: * NNP = MAX( N, NPROW*NPCOL + 1, 4 ) * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (global output) INTEGER array, dimension (N) * If JOBZ = 'V', then on normal exit, the first M elements of * IFAIL are zero. If (MOD(INFO,2).NE.0) on exit, then * IFAIL contains the * indices of the eigenvectors that failed to converge. * If JOBZ = 'N', then IFAIL is not referenced. * * ICLUSTR (global output) integer array, dimension (2*NPROW*NPCOL) * This array contains indices of eigenvectors corresponding to * a cluster of eigenvalues that could not be reorthogonalized * due to insufficient workspace (see LWORK, ORFAC and INFO). * Eigenvectors corresponding to clusters of eigenvalues indexed * ICLUSTR(2*I-1) to ICLUSTR(2*I), could not be * reorthogonalized due to lack of workspace. Hence the * eigenvectors corresponding to these clusters may not be * orthogonal. ICLUSTR() is a zero terminated array. * (ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0) if and only if * K is the number of clusters * ICLUSTR is not referenced if JOBZ = 'N' * * GAP (global output) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * This array contains the gap between eigenvalues whose * eigenvectors could not be reorthogonalized. The output * values in this array correspond to the clusters indicated * by the array ICLUSTR. As a result, the dot product between * eigenvectors correspoding to the I^th cluster may be as high * as ( C * n ) / GAP(I) where C is a small constant. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: if (MOD(INFO,2).NE.0), then one or more eigenvectors * failed to converge. Their indices are stored * in IFAIL. Ensure ABSTOL=2.0*PDLAMCH( 'U' ) * Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding * to one or more clusters of eigenvalues could not be * reorthogonalized because of insufficient workspace. * The indices of the clusters are stored in the array * ICLUSTR. * if (MOD(INFO/4,2).NE.0), then space limit prevented * PZHEEVX from computing all of the eigenvectors * between VL and VU. The number of eigenvectors * computed is returned in NZ. * if (MOD(INFO/8,2).NE.0), then PZSTEBZ failed to compute * eigenvalues. Ensure ABSTOL=2.0*PDLAMCH( 'U' ) * Send e-mail to scalapack@cs.utk.edu * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * ( MB_A.EQ.NB_A.EQ.MB_Z .AND. IROFFA.EQ.IROFFZ .AND. IROFFA.EQ.0 .AND. * IAROW.EQ.IZROW ) * where * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * Differences between PZHEEVX and ZHEEVX * ====================================== * * A, LDA -> A, IA, JA, DESCA * Z, LDZ -> Z, IZ, JZ, DESCZ * WORKSPACE needs are larger for PZHEEVX. * LIWORK parameter added * * ORFAC, ICLUSTER() and GAP() parameters added * meaning of INFO is changed * * Functional differences: * PZHEEVX does not promise orthogonality for eigenvectors associated * with tighly clustered eigenvalues. * PZHEEVX does not reorthogonalize eigenvectors * that are on different processes. The extent of reorthogonalization * is controlled by the input parameter LWORK. * * Version 1.4 limitations: * DESCA(MB_) = DESCA(NB_) * DESCA(M_) = DESCZ(M_) * DESCA(N_) = DESCZ(N_) * DESCA(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCZ(RSRC_) * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE, TEN, FIVE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TEN = 10.0D+0, $ FIVE = 5.0D+0 ) INTEGER IERREIN, IERRCLS, IERRSPC, IERREBZ PARAMETER ( IERREIN = 1, IERRCLS = 2, IERRSPC = 4, $ IERREBZ = 8 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LOWER, LQUERY, QUICKRETURN, $ VALEIG, WANTZ CHARACTER ORDER INTEGER ANB, CSRC_A, I, IAROW, ICOFFA, ICTXT, IINFO, $ INDD, INDD2, INDE, INDE2, INDIBL, INDISP, $ INDRWORK, INDTAU, INDWORK, IROFFA, IROFFZ, $ ISCALE, ISIZESTEBZ, ISIZESTEIN, IZROW, $ LALLWORK, LIWMIN, LLRWORK, LLWORK, LRWMIN, $ LRWOPT, LWMIN, LWOPT, MAXEIGS, MB_A, MQ0, $ MYCOL, MYROW, NB, NB_A, NEIG, NHETRD_LWOPT, NN, $ NNP, NP0, NPCOL, NPROCS, NPROW, NPS, NQ0, $ NSPLIT, NZZ, OFFSET, RSRC_A, RSRC_Z, SIZEHEEVX, $ SIZESTEIN, SQNPC DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, $ SIGMA, SMLNUM, VLL, VUU * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV DOUBLE PRECISION PDLAMCH, PZLANHE EXTERNAL LSAME, ICEIL, INDXG2P, NUMROC, PJLAENV, $ PDLAMCH, PZLANHE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DGEBR2D, DGEBS2D, $ DLASRT, DSCAL, IGAMN2D, PCHK1MAT, PCHK2MAT, $ PDLARED1D, PDSTEBZ, PXERBLA, PZELGET, PZHENTRD, $ PZLASCL, PZSTEIN, PZUNMTR * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, ICHAR, INT, MAX, MIN, MOD, $ SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * QUICKRETURN = ( N.EQ.0 ) * * Test the input arguments. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) INFO = 0 * WANTZ = LSAME( JOBZ, 'V' ) IF( NPROW.EQ.-1 ) THEN INFO = -( 800+CTXT_ ) ELSE IF( WANTZ ) THEN IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF IF( INFO.EQ.0 ) THEN CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( WANTZ ) $ CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 21, INFO ) * IF( INFO.EQ.0 ) THEN * * Get machine constants. * SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) EPS = PDLAMCH( ICTXT, 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * NPROCS = NPROW*NPCOL LOWER = LSAME( UPLO, 'L' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) * * Set up pointers into the WORK array * INDTAU = 1 INDWORK = INDTAU + N LLWORK = LWORK - INDWORK + 1 * * Set up pointers into the RWORK array * INDE = 1 INDD = INDE + N INDD2 = INDD + N INDE2 = INDD2 + N INDRWORK = INDE2 + N LLRWORK = LRWORK - INDRWORK + 1 * * Set up pointers into the IWORK array * ISIZESTEIN = 3*N + NPROCS + 1 ISIZESTEBZ = MAX( 4*N, 14, NPROCS ) INDIBL = ( MAX( ISIZESTEIN, ISIZESTEBZ ) ) + 1 INDISP = INDIBL + N * * Compute the total amount of space needed * LQUERY = .FALSE. IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) $ LQUERY = .TRUE. * NNP = MAX( N, NPROCS+1, 4 ) LIWMIN = 6*NNP * NPROCS = NPROW*NPCOL NB_A = DESCA( NB_ ) MB_A = DESCA( MB_ ) NB = NB_A NN = MAX( N, NB, 2 ) * RSRC_A = DESCA( RSRC_ ) CSRC_A = DESCA( CSRC_ ) IROFFA = MOD( IA-1, MB_A ) ICOFFA = MOD( JA-1, NB_A ) IAROW = INDXG2P( 1, NB_A, MYROW, RSRC_A, NPROW ) NP0 = NUMROC( N+IROFFA, NB, 0, 0, NPROW ) MQ0 = NUMROC( N+ICOFFA, NB, 0, 0, NPCOL ) IF( WANTZ ) THEN RSRC_Z = DESCZ( RSRC_ ) IROFFZ = MOD( IZ-1, MB_A ) IZROW = INDXG2P( 1, NB_A, MYROW, RSRC_Z, NPROW ) ELSE IROFFZ = 0 IZROW = 0 END IF * IF( ( .NOT.WANTZ ) .OR. ( VALEIG .AND. ( .NOT.LQUERY ) ) ) $ THEN LWMIN = N + MAX( NB*( NP0+1 ), 3 ) LWOPT = LWMIN LRWMIN = 5*NN + 4*N IF( WANTZ ) THEN MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) LRWOPT = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( N, NPROW*NPCOL )*NN ELSE LRWOPT = LRWMIN END IF NEIG = 0 ELSE IF( ALLEIG .OR. VALEIG ) THEN NEIG = N ELSE IF( INDEIG ) THEN NEIG = IU - IL + 1 END IF MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) NQ0 = NUMROC( NN, NB, 0, 0, NPCOL ) LWMIN = N + ( NP0+NQ0+NB )*NB LRWMIN = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN LRWOPT = LRWMIN LWOPT = LWMIN * END IF * * Compute how much workspace is needed to use the * new TRD code * ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS LWOPT = MAX( LWOPT, N+NHETRD_LWOPT ) * END IF IF( INFO.EQ.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN RWORK( 1 ) = ABSTOL IF( VALEIG ) THEN RWORK( 2 ) = VL RWORK( 3 ) = VU ELSE RWORK( 2 ) = ZERO RWORK( 3 ) = ZERO END IF CALL DGEBS2D( ICTXT, 'ALL', ' ', 3, 1, RWORK, 3 ) ELSE CALL DGEBR2D( ICTXT, 'ALL', ' ', 3, 1, RWORK, 3, 0, 0 ) END IF IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( .NOT.( LOWER .OR. LSAME( UPLO, 'U' ) ) ) THEN INFO = -3 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -10 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -11 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -12 ELSE IF( LWORK.LT.LWMIN .AND. LWORK.NE.-1 ) THEN INFO = -23 ELSE IF( LRWORK.LT.LRWMIN .AND. LRWORK.NE.-1 ) THEN INFO = -25 ELSE IF( LIWORK.LT.LIWMIN .AND. LIWORK.NE.-1 ) THEN INFO = -27 ELSE IF( VALEIG .AND. ( ABS( RWORK( 2 )-VL ).GT.FIVE*EPS* $ ABS( VL ) ) ) THEN INFO = -9 ELSE IF( VALEIG .AND. ( ABS( RWORK( 3 )-VU ).GT.FIVE*EPS* $ ABS( VU ) ) ) THEN INFO = -10 ELSE IF( ABS( RWORK( 1 )-ABSTOL ).GT.FIVE*EPS* $ ABS( ABSTOL ) ) THEN INFO = -13 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 800+NB_ ) END IF IF( WANTZ ) THEN IF( IROFFA.NE.IROFFZ ) THEN INFO = -19 ELSE IF( IAROW.NE.IZROW ) THEN INFO = -19 ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2100+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2100+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2100+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2100+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2100+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2100+CSRC_ ) ELSE IF( ICTXT.NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2100+CTXT_ ) END IF END IF END IF IF( WANTZ ) THEN IDUM1( 1 ) = ICHAR( 'V' ) ELSE IDUM1( 1 ) = ICHAR( 'N' ) END IF IDUM2( 1 ) = 1 IF( LOWER ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 IF( ALLEIG ) THEN IDUM1( 3 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 3 ) = ICHAR( 'I' ) ELSE IDUM1( 3 ) = ICHAR( 'V' ) END IF IDUM2( 3 ) = 3 IF( LQUERY ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 4 IF( WANTZ ) THEN CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 8, N, 4, N, 4, IZ, $ JZ, DESCZ, 21, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 4, IDUM1, $ IDUM2, INFO ) END IF WORK( 1 ) = DCMPLX( LWOPT ) RWORK( 1 ) = DBLE( LRWOPT ) IWORK( 1 ) = LIWMIN END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHEEVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( QUICKRETURN ) THEN IF( WANTZ ) THEN NZ = 0 ICLUSTR( 1 ) = 0 END IF M = 0 WORK( 1 ) = DCMPLX( LWOPT ) RWORK( 1 ) = DBLE( LRWMIN ) IWORK( 1 ) = LIWMIN RETURN END IF * * Scale matrix to allowable range, if necessary. * ABSTLL = ABSTOL ISCALE = 0 IF( VALEIG ) THEN VLL = VL VUU = VU ELSE VLL = ZERO VUU = ZERO END IF * ANRM = PZLANHE( 'M', UPLO, N, A, IA, JA, DESCA, $ RWORK( INDRWORK ) ) * IF( ANRM.GT.ZERO .AND. ANRM.LT.RMIN ) THEN ISCALE = 1 SIGMA = RMIN / ANRM ANRM = ANRM*SIGMA ELSE IF( ANRM.GT.RMAX ) THEN ISCALE = 1 SIGMA = RMAX / ANRM ANRM = ANRM*SIGMA END IF * IF( ISCALE.EQ.1 ) THEN CALL PZLASCL( UPLO, ONE, SIGMA, N, N, A, IA, JA, DESCA, IINFO ) IF( ABSTOL.GT.0 ) $ ABSTLL = ABSTOL*SIGMA IF( VALEIG ) THEN VLL = VL*SIGMA VUU = VU*SIGMA IF( VUU.EQ.VLL ) THEN VUU = VUU + 2*MAX( ABS( VUU )*EPS, SAFMIN ) END IF END IF END IF * * Call PZHENTRD to reduce Hermitian matrix to tridiagonal form. * LALLWORK = LLRWORK * CALL PZHENTRD( UPLO, N, A, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDE ), WORK( INDTAU ), WORK( INDWORK ), $ LLWORK, RWORK( INDRWORK ), LLRWORK, IINFO ) * * * Copy the values of D, E to all processes * * Here PxLARED1D is used to redistribute the tridiagonal matrix. * PxLARED1D, however, doesn't yet work with arbritary matrix * distributions so we have PxELGET as a backup. * OFFSET = 0 IF( IA.EQ.1 .AND. JA.EQ.1 .AND. RSRC_A.EQ.0 .AND. CSRC_A.EQ.0 ) $ THEN CALL PDLARED1D( N, IA, JA, DESCA, RWORK( INDD ), $ RWORK( INDD2 ), RWORK( INDRWORK ), LLRWORK ) * CALL PDLARED1D( N, IA, JA, DESCA, RWORK( INDE ), $ RWORK( INDE2 ), RWORK( INDRWORK ), LLRWORK ) IF( .NOT.LOWER ) $ OFFSET = 1 ELSE DO 10 I = 1, N CALL PZELGET( 'A', ' ', WORK( INDD2+I-1 ), A, I+IA-1, $ I+JA-1, DESCA ) RWORK( INDD2+I-1 ) = DBLE( WORK( INDD2+I-1 ) ) 10 CONTINUE IF( LSAME( UPLO, 'U' ) ) THEN DO 20 I = 1, N - 1 CALL PZELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA-1, $ I+JA, DESCA ) RWORK( INDE2+I-1 ) = DBLE( WORK( INDE2+I-1 ) ) 20 CONTINUE ELSE DO 30 I = 1, N - 1 CALL PZELGET( 'A', ' ', WORK( INDE2+I-1 ), A, I+IA, $ I+JA-1, DESCA ) RWORK( INDE2+I-1 ) = DBLE( WORK( INDE2+I-1 ) ) 30 CONTINUE END IF END IF * * Call PDSTEBZ and, if eigenvectors are desired, PZSTEIN. * IF( WANTZ ) THEN ORDER = 'B' ELSE ORDER = 'E' END IF * CALL PDSTEBZ( ICTXT, RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, $ RWORK( INDD2 ), RWORK( INDE2+OFFSET ), M, NSPLIT, W, $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWORK ), $ LLRWORK, IWORK( 1 ), ISIZESTEBZ, IINFO ) * * * IF PDSTEBZ fails, the error propogates to INFO, but * we do not propogate the eigenvalue(s) which failed because: * 1) This should never happen if the user specifies * ABSTOL = 2 * PDLAMCH( 'U' ) * 2) PDSTEIN will confirm/deny whether the eigenvalues are * close enough. * IF( IINFO.NE.0 ) THEN INFO = INFO + IERREBZ DO 40 I = 1, M IWORK( INDIBL+I-1 ) = ABS( IWORK( INDIBL+I-1 ) ) 40 CONTINUE END IF IF( WANTZ ) THEN * IF( VALEIG ) THEN * * Compute the maximum number of eigenvalues that we can * compute in the * workspace that we have, and that we can store in Z. * * Loop through the possibilities looking for the largest * NZ that we can feed to PZSTEIN and PZUNMTR * * Since all processes must end up with the same value * of NZ, we first compute the minimum of LALLWORK * CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LALLWORK, 1, 1, 1, -1, $ -1, -1 ) * MAXEIGS = DESCZ( N_ ) * DO 50 NZ = MIN( MAXEIGS, M ), 0, -1 MQ0 = NUMROC( NZ, NB, 0, 0, NPCOL ) SIZESTEIN = ICEIL( NZ, NPROCS )*N + MAX( 5*N, NP0*MQ0 ) SIZEHEEVX = SIZESTEIN IF( SIZEHEEVX.LE.LALLWORK ) $ GO TO 60 50 CONTINUE 60 CONTINUE ELSE NZ = M END IF NZ = MAX( NZ, 0 ) IF( NZ.NE.M ) THEN INFO = INFO + IERRSPC * DO 70 I = 1, M IFAIL( I ) = 0 70 CONTINUE * * The following code handles a rare special case * - NZ .NE. M means that we don't have enough room to store * all the vectors. * - NSPLIT .GT. 1 means that the matrix split * In this case, we cannot simply take the first NZ eigenvalues * because PDSTEBZ sorts the eigenvalues by block when * a split occurs. So, we have to make another call to * PDSTEBZ with a new upper limit - VUU. * IF( NSPLIT.GT.1 ) THEN CALL DLASRT( 'I', M, W, IINFO ) NZZ = 0 IF( NZ.GT.0 ) THEN * VUU = W( NZ ) - TEN*( EPS*ANRM+SAFMIN ) IF( VLL.GE.VUU ) THEN NZZ = 0 ELSE CALL PDSTEBZ( ICTXT, RANGE, ORDER, N, VLL, VUU, IL, $ IU, ABSTLL, RWORK( INDD2 ), $ RWORK( INDE2+OFFSET ), NZZ, NSPLIT, $ W, IWORK( INDIBL ), IWORK( INDISP ), $ RWORK( INDRWORK ), LLRWORK, $ IWORK( 1 ), ISIZESTEBZ, IINFO ) END IF * IF( MOD( INFO / IERREBZ, 1 ).EQ.0 ) THEN IF( NZZ.GT.NZ .OR. IINFO.NE.0 ) THEN INFO = INFO + IERREBZ END IF END IF END IF NZ = MIN( NZ, NZZ ) * END IF END IF CALL PZSTEIN( N, RWORK( INDD2 ), RWORK( INDE2+OFFSET ), NZ, W, $ IWORK( INDIBL ), IWORK( INDISP ), ORFAC, Z, IZ, $ JZ, DESCZ, RWORK( INDRWORK ), LALLWORK, $ IWORK( 1 ), ISIZESTEIN, IFAIL, ICLUSTR, GAP, $ IINFO ) * IF( IINFO.GE.NZ+1 ) $ INFO = INFO + IERRCLS IF( MOD( IINFO, NZ+1 ).NE.0 ) $ INFO = INFO + IERREIN * * Z = Q * Z * * IF( NZ.GT.0 ) THEN CALL PZUNMTR( 'L', UPLO, 'N', N, NZ, A, IA, JA, DESCA, $ WORK( INDTAU ), Z, IZ, JZ, DESCZ, $ WORK( INDWORK ), LLWORK, IINFO ) END IF * END IF * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( ISCALE.EQ.1 ) THEN CALL DSCAL( M, ONE / SIGMA, W, 1 ) END IF * WORK( 1 ) = DCMPLX( LWOPT ) RWORK( 1 ) = DBLE( LRWOPT ) IWORK( 1 ) = LIWMIN * RETURN * * End of PZHEEVX * END scalapack-2.0.2/SRC/pzhegs2.f000644 000766 000024 00000037561 10363532303 016103 0ustar00juliestaff000000 000000 * * SUBROUTINE PZHEGS2( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ) * .. * * Purpose * ======= * * PZHEGS2 reduces a complex Hermitian-definite generalized eigenproblem * to standard form. * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**H or L**H*sub( A )*L. * * sub( B ) must have been previously factorized as U**H*U or L*L**H by * PZPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H); * = 2 or 3: compute U*sub( A )*U**H or L**H*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**H*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**H. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PZPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, HALF PARAMETER ( ONE = 1.0D+0, HALF = 0.5D+0 ) COMPLEX*16 CONE PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IIA, IIB, IOFFA, IOFFB, IROFFA, IROFFB, $ JJA, JJB, K, LDA, LDB, MYCOL, MYROW, NPCOL, $ NPROW DOUBLE PRECISION AKK, BKK COMPLEX*16 CT * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PXERBLA, ZAXPY, ZDSCAL, ZHER2, ZLACGV, ZTRMV, $ ZTRSV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL LSAME, INDXG2P * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( N+ICOFFA.GT.DESCA( NB_ ) ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHEGS2', -INFO ) CALL BLACS_EXIT( ICTXT ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. ( MYROW.NE.IAROW .OR. MYCOL.NE.IACOL ) ) $ RETURN * * Compute local information * LDA = DESCA( LLD_ ) LDB = DESCB( LLD_ ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIB, JJB, $ IBROW, IBCOL ) * IF( IBTYPE.EQ.1 ) THEN * IF( UPPER ) THEN * IOFFA = IIA + JJA*LDA IOFFB = IIB + JJB*LDB * * Compute inv(U')*sub( A )*inv(U) * DO 10 K = 1, N * * Update the upper triangle of * A(ia+k-1:ia+n-a,ia+k-1:ia+n-1) * AKK = DBLE( A( IOFFA-LDA ) ) BKK = DBLE( B( IOFFB-LDB ) ) AKK = AKK / BKK**2 A( IOFFA-LDA ) = AKK IF( K.LT.N ) THEN CALL ZDSCAL( N-K, ONE / BKK, A( IOFFA ), LDA ) CT = -HALF*AKK CALL ZLACGV( N-K, A( IOFFA ), LDA ) CALL ZLACGV( N-K, B( IOFFB ), LDB ) CALL ZAXPY( N-K, CT, B( IOFFB ), LDB, A( IOFFA ), $ LDA ) CALL ZHER2( UPLO, N-K, -CONE, A( IOFFA ), LDA, $ B( IOFFB ), LDB, A( IOFFA+1 ), LDA ) CALL ZAXPY( N-K, CT, B( IOFFB ), LDB, A( IOFFA ), $ LDA ) CALL ZLACGV( N-K, B( IOFFB ), LDB ) CALL ZTRSV( UPLO, 'Conjugate transpose', 'Non-unit', $ N-K, B( IOFFB+1 ), LDB, A( IOFFA ), LDA ) CALL ZLACGV( N-K, A( IOFFA ), LDA ) END IF * * A( IOFFA ) -> A( K, K+1 ) * B( IOFFB ) -> B( K, K+1 ) * IOFFA = IOFFA + LDA + 1 IOFFB = IOFFB + LDB + 1 * 10 CONTINUE * ELSE * IOFFA = IIA + 1 + ( JJA-1 )*LDA IOFFB = IIB + 1 + ( JJB-1 )*LDB * * Compute inv(L)*sub( A )*inv(L') * DO 20 K = 1, N * * Update the lower triangle of * A(ia+k-1:ia+n-a,ia+k-1:ia+n-1) * AKK = DBLE( A( IOFFA-1 ) ) BKK = DBLE( B( IOFFB-1 ) ) AKK = AKK / BKK**2 A( IOFFA-1 ) = AKK * IF( K.LT.N ) THEN CALL ZDSCAL( N-K, ONE / BKK, A( IOFFA ), 1 ) CT = -HALF*AKK CALL ZAXPY( N-K, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL ZHER2( UPLO, N-K, -CONE, A( IOFFA ), 1, $ B( IOFFB ), 1, A( IOFFA+LDA ), LDA ) CALL ZAXPY( N-K, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL ZTRSV( UPLO, 'No transpose', 'Non-unit', N-K, $ B( IOFFB+LDB ), LDB, A( IOFFA ), 1 ) END IF * * A( IOFFA ) -> A( K+1, K ) * B( IOFFB ) -> B( K+1, K ) * IOFFA = IOFFA + LDA + 1 IOFFB = IOFFB + LDB + 1 * 20 CONTINUE * END IF * ELSE * IF( UPPER ) THEN * IOFFA = IIA + ( JJA-1 )*LDA IOFFB = IIB + ( JJB-1 )*LDB * * Compute U*sub( A )*U' * DO 30 K = 1, N * * Update the upper triangle of A(ia:ia+k-1,ja:ja+k-1) * AKK = DBLE( A( IOFFA+K-1 ) ) BKK = DBLE( B( IOFFB+K-1 ) ) CALL ZTRMV( UPLO, 'No transpose', 'Non-unit', K-1, $ B( IIB+( JJB-1 )*LDB ), LDB, A( IOFFA ), 1 ) CT = HALF*AKK CALL ZAXPY( K-1, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL ZHER2( UPLO, K-1, CONE, A( IOFFA ), 1, B( IOFFB ), $ 1, A( IIA+( JJA-1 )*LDA ), LDA ) CALL ZAXPY( K-1, CT, B( IOFFB ), 1, A( IOFFA ), 1 ) CALL ZDSCAL( K-1, BKK, A( IOFFA ), 1 ) A( IOFFA+K-1 ) = AKK*BKK**2 * * A( IOFFA ) -> A( 1, K ) * B( IOFFB ) -> B( 1, K ) * IOFFA = IOFFA + LDA IOFFB = IOFFB + LDB * 30 CONTINUE * ELSE * IOFFA = IIA + ( JJA-1 )*LDA IOFFB = IIB + ( JJB-1 )*LDB * * Compute L'*sub( A )*L * DO 40 K = 1, N * * Update the lower triangle of A(ia:ia+k-1,ja:ja+k-1) * AKK = DBLE( A( IOFFA+( K-1 )*LDA ) ) BKK = DBLE( B( IOFFB+( K-1 )*LDB ) ) CALL ZLACGV( K-1, A( IOFFA ), LDA ) CALL ZTRMV( UPLO, 'Conjugate transpose', 'Non-unit', K-1, $ B( IIB+( JJB-1 )*LDB ), LDB, A( IOFFA ), $ LDA ) CT = HALF*AKK CALL ZLACGV( K-1, B( IOFFB ), LDB ) CALL ZAXPY( K-1, CT, B( IOFFB ), LDB, A( IOFFA ), LDA ) CALL ZHER2( UPLO, K-1, CONE, A( IOFFA ), LDA, B( IOFFB ), $ LDB, A( IIA+( JJA-1 )*LDA ), LDA ) CALL ZAXPY( K-1, CT, B( IOFFB ), LDB, A( IOFFA ), LDA ) CALL ZLACGV( K-1, B( IOFFB ), LDB ) CALL ZDSCAL( K-1, BKK, A( IOFFA ), LDA ) CALL ZLACGV( K-1, A( IOFFA ), LDA ) A( IOFFA+( K-1 )*LDA ) = AKK*BKK**2 * * A( IOFFA ) -> A( K, 1 ) * B( IOFFB ) -> B( K, 1 ) * IOFFA = IOFFA + 1 IOFFB = IOFFB + 1 * 40 CONTINUE * END IF * END IF * RETURN * * End of PZHEGS2 * END scalapack-2.0.2/SRC/pzhegst.f000644 000766 000024 00000042341 10363532303 016175 0ustar00juliestaff000000 000000 * * SUBROUTINE PZHEGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, N DOUBLE PRECISION SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ) * .. * * Purpose * ======= * * PZHEGST reduces a complex Hermitian-definite generalized eigenproblem * to standard form. * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**H or L**H*sub( A )*L. * * sub( B ) must have been previously factorized as U**H*U or L*L**H by * PZPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H); * = 2 or 3: compute U*sub( A )*U**H or L**H*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**H*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**H. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PZPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * SCALE (global output) DOUBLE PRECISION * Amount by which the eigenvalues should be scaled to * compensate for the scaling performed in this routine. * At present, SCALE is always returned as 1.0, it is * returned here to allow for future enhancement. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) COMPLEX*16 CONE, HALF PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), $ HALF = ( 0.5D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, IROFFA, IROFFB, K, KB, MYCOL, MYROW, NB, $ NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PXERBLA, $ PZHEGS2, PZHEMM, PZHER2K, PZTRMM, PZTRSM * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL LSAME, ICEIL, INDXG2P * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * SCALE = ONE * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IB, $ JB, DESCB, 11, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHEGST', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( IBTYPE.EQ.1 ) THEN IF( UPPER ) THEN * * Compute inv(U')*sub( A )*inv(U) * K = 1 NB = DESCA( NB_ ) KB = MIN( ICEIL( JA, NB )*NB, JA+N-1 ) - JA + 1 * 10 CONTINUE * * Update the upper triangle of A(ia+k-1:ia+n-1,ja+k-1:ja+n-1) * CALL PZHEGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, IB+K-1, DESCB, INFO ) IF( K+KB.LE.N ) THEN CALL PZTRSM( 'Left', UPLO, 'Conjugate Transpose', $ 'Non-unit', KB, N-K-KB+1, CONE, B, IB+K-1, $ JB+K-1, DESCB, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PZHEMM( 'Left', UPLO, KB, N-K-KB+1, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K-1, JB+K+KB-1, $ DESCB, CONE, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PZHER2K( UPLO, 'Conjugate Transpose', N-K-KB+1, KB, $ -CONE, A, IA+K-1, JA+K+KB-1, DESCA, B, $ IB+K-1, JB+K+KB-1, DESCB, ONE, A, $ IA+K+KB-1, JA+K+KB-1, DESCA ) CALL PZHEMM( 'Left', UPLO, KB, N-K-KB+1, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K-1, JB+K+KB-1, $ DESCB, CONE, A, IA+K-1, JA+K+KB-1, DESCA ) CALL PZTRSM( 'Right', UPLO, 'No transpose', 'Non-unit', $ KB, N-K-KB+1, CONE, B, IB+K+KB-1, JB+K+KB-1, $ DESCB, A, IA+K-1, JA+K+KB-1, DESCA ) END IF K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 10 * ELSE * * Compute inv(L)*sub( A )*inv(L') * K = 1 NB = DESCA( MB_ ) KB = MIN( ICEIL( IA, NB )*NB, IA+N-1 ) - IA + 1 * 20 CONTINUE * * Update the lower triangle of A(ia+k-1:ia+n-1,ja+k-1:ja+n-1) * CALL PZHEGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) IF( K+KB.LE.N ) THEN CALL PZTRSM( 'Right', UPLO, 'Conjugate transpose', $ 'Non-unit', N-K-KB+1, KB, CONE, B, IB+K-1, $ JB+K-1, DESCB, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PZHEMM( 'Right', UPLO, N-K-KB+1, KB, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K+KB-1, JB+K-1, $ DESCB, CONE, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PZHER2K( UPLO, 'No transpose', N-K-KB+1, KB, -CONE, $ A, IA+K+KB-1, JA+K-1, DESCA, B, IB+K+KB-1, $ JB+K-1, DESCB, ONE, A, IA+K+KB-1, $ JA+K+KB-1, DESCA ) CALL PZHEMM( 'Right', UPLO, N-K-KB+1, KB, -HALF, A, $ IA+K-1, JA+K-1, DESCA, B, IB+K+KB-1, JB+K-1, $ DESCB, CONE, A, IA+K+KB-1, JA+K-1, DESCA ) CALL PZTRSM( 'Left', UPLO, 'No transpose', 'Non-unit', $ N-K-KB+1, KB, CONE, B, IB+K+KB-1, JB+K+KB-1, $ DESCB, A, IA+K+KB-1, JA+K-1, DESCA ) END IF K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 20 * END IF * ELSE * IF( UPPER ) THEN * * Compute U*sub( A )*U' * K = 1 NB = DESCA( NB_ ) KB = MIN( ICEIL( JA, NB )*NB, JA+N-1 ) - JA + 1 * 30 CONTINUE * * Update the upper triangle of A(ia:ia+k+kb-2,ja:ja+k+kb-2) * CALL PZTRMM( 'Left', UPLO, 'No transpose', 'Non-unit', K-1, $ KB, CONE, B, IB, JB, DESCB, A, IA, JA+K-1, $ DESCA ) CALL PZHEMM( 'Right', UPLO, K-1, KB, HALF, A, IA+K-1, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, CONE, A, $ IA, JA+K-1, DESCA ) CALL PZHER2K( UPLO, 'No transpose', K-1, KB, CONE, A, IA, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, ONE, A, $ IA, JA, DESCA ) CALL PZHEMM( 'Right', UPLO, K-1, KB, HALF, A, IA+K-1, $ JA+K-1, DESCA, B, IB, JB+K-1, DESCB, CONE, A, $ IA, JA+K-1, DESCA ) CALL PZTRMM( 'Right', UPLO, 'Conjugate transpose', $ 'Non-unit', K-1, KB, CONE, B, IB+K-1, JB+K-1, $ DESCB, A, IA, JA+K-1, DESCA ) CALL PZHEGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) * K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 30 * ELSE * * Compute L'*sub( A )*L * K = 1 NB = DESCA( MB_ ) KB = MIN( ICEIL( IA, NB )*NB, IA+N-1 ) - IA + 1 * 40 CONTINUE * * Update the lower triangle of A(ia:ia+k+kb-2,ja:ja+k+kb-2) * CALL PZTRMM( 'Right', UPLO, 'No transpose', 'Non-unit', KB, $ K-1, CONE, B, IB, JB, DESCB, A, IA+K-1, JA, $ DESCA ) CALL PZHEMM( 'Left', UPLO, KB, K-1, HALF, A, IA+K-1, JA+K-1, $ DESCA, B, IB+K-1, JB, DESCB, CONE, A, IA+K-1, $ JA, DESCA ) CALL PZHER2K( UPLO, 'Conjugate transpose', K-1, KB, CONE, A, $ IA+K-1, JA, DESCA, B, IB+K-1, JB, DESCB, ONE, $ A, IA, JA, DESCA ) CALL PZHEMM( 'Left', UPLO, KB, K-1, HALF, A, IA+K-1, JA+K-1, $ DESCA, B, IB+K-1, JB, DESCB, CONE, A, IA+K-1, $ JA, DESCA ) CALL PZTRMM( 'Left', UPLO, 'Conjugate transpose', $ 'Non-unit', KB, K-1, CONE, B, IB+K-1, JB+K-1, $ DESCB, A, IA+K-1, JA, DESCA ) CALL PZHEGS2( IBTYPE, UPLO, KB, A, IA+K-1, JA+K-1, DESCA, B, $ IB+K-1, JB+K-1, DESCB, INFO ) * K = K + KB KB = MIN( N-K+1, NB ) * IF( K.LE.N ) $ GO TO 40 * END IF * END IF * RETURN * * End of PZHEGST * END scalapack-2.0.2/SRC/pzhegvx.f000644 000766 000024 00000106111 10377154001 016200 0ustar00juliestaff000000 000000 SUBROUTINE PZHEGVX( IBTYPE, JOBZ, RANGE, UPLO, N, A, IA, JA, $ DESCA, B, IB, JB, DESCB, VL, VU, IL, IU, $ ABSTOL, M, NZ, W, ORFAC, Z, IZ, JZ, DESCZ, $ WORK, LWORK, RWORK, LRWORK, IWORK, LIWORK, $ IFAIL, ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE, UPLO INTEGER IA, IB, IBTYPE, IL, INFO, IU, IZ, JA, JB, JZ, $ LIWORK, LRWORK, LWORK, M, N, NZ DOUBLE PRECISION ABSTOL, ORFAC, VL, VU * .. * .. Array Arguments .. * INTEGER DESCA( * ), DESCB( * ), DESCZ( * ), $ ICLUSTR( * ), IFAIL( * ), IWORK( * ) DOUBLE PRECISION GAP( * ), RWORK( * ), W( * ) COMPLEX*16 A( * ), B( * ), WORK( * ), Z( * ) * .. * * Purpose * * ======= * * PZHEGVX computes all the eigenvalues, and optionally, * the eigenvectors * of a complex generalized Hermitian-definite eigenproblem, of the form * sub( A )*x=(lambda)*sub( B )*x, sub( A )*sub( B )x=(lambda)*x, or * sub( B )*sub( A )*x=(lambda)*x. * Here sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ) is assumed to be * Hermitian, and sub( B ) denoting B( IB:IB+N-1, JB:JB+N-1 ) is assumed * to be Hermitian positive definite. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * IBTYPE (global input) INTEGER * Specifies the problem type to be solved: * = 1: sub( A )*x = (lambda)*sub( B )*x * = 2: sub( A )*sub( B )*x = (lambda)*x * = 3: sub( B )*sub( A )*x = (lambda)*x * * JOBZ (global input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (global input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the interval [VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * UPLO (global input) CHARACTER*1 * = 'U': Upper triangles of sub( A ) and sub( B ) are stored; * = 'L': Lower triangles of sub( A ) and sub( B ) are stored. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix. * * On exit, if JOBZ = 'V', then if INFO = 0, sub( A ) contains * the distributed matrix Z of eigenvectors. The eigenvectors * are normalized as follows: * if IBTYPE = 1 or 2, Z**H*sub( B )*Z = I; * if IBTYPE = 3, Z**H*inv( sub( B ) )*Z = I. * If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') * or the lower triangle (if UPLO='L') of sub( A ), including * the diagonal, is destroyed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * If DESCA( CTXT_ ) is incorrect, PZHEGVX cannot guarantee * correct error reporting. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_B, LOCc(JB+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( B ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( B ) contains * the upper triangular part of the matrix. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( B ) contains * the lower triangular part of the matrix. * * On exit, if INFO <= N, the part of sub( B ) containing the * matrix is overwritten by the triangular factor U or L from * the Cholesky factorization sub( B ) = U**H*U or * sub( B ) = L*L**H. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * DESCB( CTXT_ ) must equal DESCA( CTXT_ ) * * VL (global input) DOUBLE PRECISION * If RANGE='V', the lower bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * VU (global input) DOUBLE PRECISION * If RANGE='V', the upper bound of the interval to be searched * for eigenvalues. Not referenced if RANGE = 'A' or 'I'. * * IL (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * smallest eigenvalue to be returned. IL >= 1. * Not referenced if RANGE = 'A' or 'V'. * * IU (global input) INTEGER * If RANGE='I', the index (from smallest to largest) of the * largest eigenvalue to be returned. min(IL,N) <= IU <= N. * Not referenced if RANGE = 'A' or 'V'. * * ABSTOL (global input) DOUBLE PRECISION * If JOBZ='V', setting ABSTOL to PDLAMCH( CONTEXT, 'U') yields * the most orthogonal eigenvectors. * * The absolute error tolerance for the eigenvalues. * An approximate eigenvalue is accepted as converged * when it is determined to lie in an interval [a,b] * of width less than or equal to * * ABSTOL + EPS * max( |a|,|b| ) , * * where EPS is the machine precision. If ABSTOL is less than * or equal to zero, then EPS*norm(T) will be used in its place, * where norm(T) is the 1-norm of the tridiagonal matrix * obtained by reducing A to tridiagonal form. * * Eigenvalues will be computed most accurately when ABSTOL is * set to twice the underflow threshold 2*PDLAMCH('S') not zero. * If this routine returns with ((MOD(INFO,2).NE.0) .OR. * (MOD(INFO/8,2).NE.0)), indicating that some eigenvalues or * eigenvectors did not converge, try setting ABSTOL to * 2*PDLAMCH('S'). * * See "Computing Small Singular Values of Bidiagonal Matrices * with Guaranteed High Relative Accuracy," by Demmel and * Kahan, LAPACK Working Note #3. * * See "On the correctness of Parallel Bisection in Floating * Point" by Demmel, Dhillon and Ren, LAPACK Working Note #70 * * M (global output) INTEGER * Total number of eigenvalues found. 0 <= M <= N. * * NZ (global output) INTEGER * Total number of eigenvectors computed. 0 <= NZ <= M. * The number of columns of Z that are filled. * If JOBZ .NE. 'V', NZ is not referenced. * If JOBZ .EQ. 'V', NZ = M unless the user supplies * insufficient space and PZHEGVX is not able to detect this * before beginning computation. To get all the eigenvectors * requested, the user must supply both sufficient * space to hold the eigenvectors in Z (M .LE. DESCZ(N_)) * and sufficient workspace to compute them. (See LWORK below.) * PZHEGVX is always able to detect insufficient space without * computation unless RANGE .EQ. 'V'. * * W (global output) DOUBLE PRECISION array, dimension (N) * On normal exit, the first M entries contain the selected * eigenvalues in ascending order. * * ORFAC (global input) DOUBLE PRECISION * Specifies which eigenvectors should be reorthogonalized. * Eigenvectors that correspond to eigenvalues which are within * tol=ORFAC*norm(A) of each other are to be reorthogonalized. * However, if the workspace is insufficient (see LWORK), * tol may be decreased until all eigenvectors to be * reorthogonalized can be stored in one process. * No reorthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) COMPLEX*16 array, * global dimension (N, N), * local dimension ( LLD_Z, LOCc(JZ+N-1) ) * If JOBZ = 'V', then on normal exit the first M columns of Z * contain the orthonormal eigenvectors of the matrix * corresponding to the selected eigenvalues. If an eigenvector * fails to converge, then that column of Z contains the latest * approximation to the eigenvector, and the index of the * eigenvector is returned in IFAIL. * If JOBZ = 'N', then Z is not referenced. * * IZ (global input) INTEGER * The row index in the global array Z indicating the first * row of sub( Z ). * * JZ (global input) INTEGER * The column index in the global array Z indicating the * first column of sub( Z ). * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * DESCZ( CTXT_ ) must equal DESCA( CTXT_ ) * * WORK (local workspace/output) COMPLEX*16 array, * dimension (LWORK) * WORK(1) returns the optimal workspace. * * LWORK (local input) INTEGER * Size of WORK array. If only eigenvalues are requested: * LWORK >= N + MAX( NB * ( NP0 + 1 ), 3 ) * If eigenvectors are requested: * LWORK >= N + ( NP0 + MQ0 + NB ) * NB * with NQ0 = NUMROC( NN, NB, 0, 0, NPCOL ). * * For optimal performance, greater workspace is needed, i.e. * LWORK >= MAX( LWORK, N + NHETRD_LWOPT, * NHEGST_LWOPT ) * Where LWORK is as defined above, and * NHETRD_LWORK = 2*( ANB+1 )*( 4*NPS+2 ) + * ( NPS + 1 ) * NPS * NHEGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB * * NB = DESCA( MB_ ) * NP0 = NUMROC( N, NB, 0, 0, NPROW ) * NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) * SQNPC = SQRT( DBLE( NPROW * NPCOL ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the optimal * size for all work arrays. Each of these values is returned * in the first entry of the correspondingwork array, and no * error message is issued by PXERBLA. * * RWORK (local workspace/output) DOUBLE PRECISION array, * dimension max(3,LRWORK) * On return, RWORK(1) contains the amount of workspace * required for optimal efficiency * if JOBZ='N' RWORK(1) = optimal amount of workspace * required to compute eigenvalues efficiently * if JOBZ='V' RWORK(1) = optimal amount of workspace * required to compute eigenvalues and eigenvectors * efficiently with no guarantee on orthogonality. * If RANGE='V', it is assumed that all eigenvectors * may be required when computing optimal workspace. * * LRWORK (local input) INTEGER * Size of RWORK * See below for definitions of variables used to define LRWORK. * If no eigenvectors are requested (JOBZ = 'N') then * LRWORK >= 5 * NN + 4 * N * If eigenvectors are requested (JOBZ = 'V' ) then * the amount of workspace required to guarantee that all * eigenvectors are computed is: * LRWORK >= 4*N + MAX( 5*NN, NP0 * MQ0 ) + * ICEIL( NEIG, NPROW*NPCOL)*NN * * The computed eigenvectors may not be orthogonal if the * minimal workspace is supplied and ORFAC is too small. * If you want to guarantee orthogonality (at the cost * of potentially poor performance) you should add * the following to LRWORK: * (CLUSTERSIZE-1)*N * where CLUSTERSIZE is the number of eigenvalues in the * largest cluster, where a cluster is defined as a set of * close eigenvalues: { W(K),...,W(K+CLUSTERSIZE-1) | * W(J+1) <= W(J) + ORFAC*2*norm(A) } * Variable definitions: * NEIG = number of eigenvectors requested * NB = DESCA( MB_ ) = DESCA( NB_ ) = DESCZ( MB_ ) = * DESCZ( NB_ ) * NN = MAX( N, NB, 2 ) * DESCA( RSRC_ ) = DESCA( NB_ ) = DESCZ( RSRC_ ) = * DESCZ( CSRC_ ) = 0 * NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) * ICEIL( X, Y ) is a ScaLAPACK function returning * ceiling(X/Y) * * When LRWORK is too small: * If LRWORK is too small to guarantee orthogonality, * PZHEGVX attempts to maintain orthogonality in * the clusters with the smallest * spacing between the eigenvalues. * If LRWORK is too small to compute all the eigenvectors * requested, no computation is performed and INFO=-25 * is returned. Note that when RANGE='V', PZHEGVX does * not know how many eigenvectors are requested until * the eigenvalues are computed. Therefore, when RANGE='V' * and as long as LRWORK is large enough to allow PZHEGVX to * compute the eigenvalues, PZHEGVX will compute the * eigenvalues and as many eigenvectors as it can. * * Relationship between workspace, orthogonality & performance: * If CLUSTERSIZE >= N/SQRT(NPROW*NPCOL), then providing * enough space to compute all the eigenvectors * orthogonally will cause serious degradation in * performance. In the limit (i.e. CLUSTERSIZE = N-1) * PZSTEIN will perform no better than ZSTEIN on 1 processor. * For CLUSTERSIZE = N/SQRT(NPROW*NPCOL) reorthogonalizing * all eigenvectors will increase the total execution time * by a factor of 2 or more. * For CLUSTERSIZE > N/SQRT(NPROW*NPCOL) execution time will * grow as the square of the cluster size, all other factors * remaining equal and assuming enough workspace. Less * workspace means less reorthogonalization but faster * execution. * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace) INTEGER array * On return, IWORK(1) contains the amount of integer workspace * required. * * LIWORK (local input) INTEGER * size of IWORK * LIWORK >= 6 * NNP * Where: * NNP = MAX( N, NPROW*NPCOL + 1, 4 ) * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (output) INTEGER array, dimension (N) * IFAIL provides additional information when INFO .NE. 0 * If (MOD(INFO/16,2).NE.0) then IFAIL(1) indicates the order of * the smallest minor which is not positive definite. * If (MOD(INFO,2).NE.0) on exit, then IFAIL contains the * indices of the eigenvectors that failed to converge. * * If neither of the above error conditions hold and JOBZ = 'V', * then the first M elements of IFAIL are set to zero. * * ICLUSTR (global output) integer array, dimension (2*NPROW*NPCOL) * This array contains indices of eigenvectors corresponding to * a cluster of eigenvalues that could not be reorthogonalized * due to insufficient workspace (see LWORK, ORFAC and INFO). * Eigenvectors corresponding to clusters of eigenvalues indexed * ICLUSTR(2*I-1) to ICLUSTR(2*I), could not be * reorthogonalized due to lack of workspace. Hence the * eigenvectors corresponding to these clusters may not be * orthogonal. ICLUSTR() is a zero terminated array. * (ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0) if and only if * K is the number of clusters * ICLUSTR is not referenced if JOBZ = 'N' * * GAP (global output) DOUBLE PRECISION array, * dimension (NPROW*NPCOL) * This array contains the gap between eigenvalues whose * eigenvectors could not be reorthogonalized. The output * values in this array correspond to the clusters indicated * by the array ICLUSTR. As a result, the dot product between * eigenvectors correspoding to the I^th cluster may be as high * as ( C * n ) / GAP(I) where C is a small constant. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: if (MOD(INFO,2).NE.0), then one or more eigenvectors * failed to converge. Their indices are stored * in IFAIL. Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/2,2).NE.0),then eigenvectors corresponding * to one or more clusters of eigenvalues could not be * reorthogonalized because of insufficient workspace. * The indices of the clusters are stored in the array * ICLUSTR. * if (MOD(INFO/4,2).NE.0), then space limit prevented * PZHEGVX from computing all of the eigenvectors * between VL and VU. The number of eigenvectors * computed is returned in NZ. * if (MOD(INFO/8,2).NE.0), then PZSTEBZ failed to * compute eigenvalues. * Send e-mail to scalapack@cs.utk.edu * if (MOD(INFO/16,2).NE.0), then B was not positive * definite. IFAIL(1) indicates the order of * the smallest minor which is not positive definite. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*), C(IC:IC+M-1,JC:JC+N-1), * and B( IB:IB+N-1, JB:JB+N-1 ) must verify some alignment properties, * namely the following expressions should be true: * * DESCA(MB_) = DESCA(NB_) * IA = IB = IZ * JA = IB = JZ * DESCA(M_) = DESCB(M_) =DESCZ(M_) * DESCA(N_) = DESCB(N_)= DESCZ(N_) * DESCA(MB_) = DESCB(MB_) = DESCZ(MB_) * DESCA(NB_) = DESCB(NB_) = DESCZ(NB_) * DESCA(RSRC_) = DESCB(RSRC_) = DESCZ(RSRC_) * DESCA(CSRC_) = DESCB(CSRC_) = DESCZ(CSRC_) * MOD( IA-1, DESCA( MB_ ) ) = 0 * MOD( JA-1, DESCA( NB_ ) ) = 0 * MOD( IB-1, DESCB( MB_ ) ) = 0 * MOD( JB-1, DESCB( NB_ ) ) = 0 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION FIVE, ZERO PARAMETER ( FIVE = 5.0D+0, ZERO = 0.0D+0 ) INTEGER IERRNPD PARAMETER ( IERRNPD = 16 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LQUERY, UPPER, VALEIG, WANTZ CHARACTER TRANS INTEGER ANB, IACOL, IAROW, IBCOL, IBROW, ICOFFA, $ ICOFFB, ICTXT, IROFFA, IROFFB, LIWMIN, LRWMIN, $ LRWOPT, LWMIN, LWOPT, MQ0, MYCOL, MYROW, NB, $ NEIG, NHEGST_LWOPT, NHETRD_LWOPT, NN, NP0, $ NPCOL, NPROW, NPS, NQ0, SQNPC DOUBLE PRECISION EPS, SCALE * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC, PJLAENV DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, ICEIL, INDXG2P, NUMROC, PJLAENV, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DGEBR2D, DGEBS2D, $ DSCAL, PCHK1MAT, PCHK2MAT, PXERBLA, PZHEEVX, $ PZHENGST, PZPOTRF, PZTRMM, PZTRSM * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, ICHAR, INT, MAX, MIN, MOD, $ SQRT * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 900+CTXT_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2600+CTXT_ ) ELSE * * Get machine constants. * EPS = PDLAMCH( DESCA( CTXT_ ), 'Precision' ) * WANTZ = LSAME( JOBZ, 'V' ) UPPER = LSAME( UPLO, 'U' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, N, 4, IB, JB, DESCB, 13, INFO ) CALL CHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 26, INFO ) IF( INFO.EQ.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN RWORK( 1 ) = ABSTOL IF( VALEIG ) THEN RWORK( 2 ) = VL RWORK( 3 ) = VU ELSE RWORK( 2 ) = ZERO RWORK( 3 ) = ZERO END IF CALL DGEBS2D( DESCA( CTXT_ ), 'ALL', ' ', 3, 1, RWORK, $ 3 ) ELSE CALL DGEBR2D( DESCA( CTXT_ ), 'ALL', ' ', 3, 1, RWORK, 3, $ 0, 0 ) END IF IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) * * Compute the total amount of space needed * LQUERY = .FALSE. IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) $ LQUERY = .TRUE. * LIWMIN = 6*MAX( N, ( NPROW*NPCOL )+1, 4 ) * NB = DESCA( MB_ ) NN = MAX( N, NB, 2 ) NP0 = NUMROC( NN, NB, 0, 0, NPROW ) * IF( ( .NOT.WANTZ ) .OR. ( VALEIG .AND. ( .NOT.LQUERY ) ) ) $ THEN LWMIN = N + MAX( NB*( NP0+1 ), 3 ) LWOPT = LWMIN LRWMIN = 5*NN + 4*N IF( WANTZ ) THEN MQ0 = NUMROC( MAX( N, NB, 2 ), NB, 0, 0, NPCOL ) LRWOPT = 4*N + MAX( 5*NN, NP0*MQ0 ) ELSE LRWOPT = LRWMIN END IF NEIG = 0 ELSE IF( ALLEIG .OR. VALEIG ) THEN NEIG = N ELSE IF( INDEIG ) THEN NEIG = IU - IL + 1 END IF MQ0 = NUMROC( MAX( NEIG, NB, 2 ), NB, 0, 0, NPCOL ) LWMIN = N + ( NP0+MQ0+NB )*NB LWOPT = LWMIN LRWMIN = 4*N + MAX( 5*NN, NP0*MQ0 ) + $ ICEIL( NEIG, NPROW*NPCOL )*NN LRWOPT = LRWMIN * END IF * * Compute how much workspace is needed to use the * new TRD and GST algorithms * ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) NHETRD_LWOPT = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+4 )*NPS NB = DESCA( MB_ ) NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) NHEGST_LWOPT = 2*NP0*NB + NQ0*NB + NB*NB LWOPT = MAX( LWOPT, N+NHETRD_LWOPT, NHEGST_LWOPT ) * * Version 1.0 Limitations * IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -2 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -3 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -5 ELSE IF( IROFFA.NE.0 ) THEN INFO = -7 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -8 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 900+NB_ ) ELSE IF( DESCA( M_ ).NE.DESCB( M_ ) ) THEN INFO = -( 1300+M_ ) ELSE IF( DESCA( N_ ).NE.DESCB( N_ ) ) THEN INFO = -( 1300+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1300+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCB( NB_ ) ) THEN INFO = -( 1300+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCB( RSRC_ ) ) THEN INFO = -( 1300+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCB( CSRC_ ) ) THEN INFO = -( 1300+CSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( DESCA( M_ ).NE.DESCZ( M_ ) ) THEN INFO = -( 2200+M_ ) ELSE IF( DESCA( N_ ).NE.DESCZ( N_ ) ) THEN INFO = -( 2200+N_ ) ELSE IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 2200+MB_ ) ELSE IF( DESCA( NB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 2200+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.DESCZ( RSRC_ ) ) THEN INFO = -( 2200+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.DESCZ( CSRC_ ) ) THEN INFO = -( 2200+CSRC_ ) ELSE IF( DESCA( CTXT_ ).NE.DESCZ( CTXT_ ) ) THEN INFO = -( 2200+CTXT_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -11 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -12 ELSE IF( VALEIG .AND. N.GT.0 .AND. VU.LE.VL ) THEN INFO = -15 ELSE IF( INDEIG .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -16 ELSE IF( INDEIG .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -17 ELSE IF( VALEIG .AND. ( ABS( RWORK( 2 )-VL ).GT.FIVE*EPS* $ ABS( VL ) ) ) THEN INFO = -14 ELSE IF( VALEIG .AND. ( ABS( RWORK( 3 )-VU ).GT.FIVE*EPS* $ ABS( VU ) ) ) THEN INFO = -15 ELSE IF( ABS( RWORK( 1 )-ABSTOL ).GT.FIVE*EPS* $ ABS( ABSTOL ) ) THEN INFO = -18 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -28 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -30 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -32 END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( WANTZ ) THEN IDUM1( 2 ) = ICHAR( 'V' ) ELSE IDUM1( 2 ) = ICHAR( 'N' ) END IF IDUM2( 2 ) = 2 IF( UPPER ) THEN IDUM1( 3 ) = ICHAR( 'U' ) ELSE IDUM1( 3 ) = ICHAR( 'L' ) END IF IDUM2( 3 ) = 3 IF( ALLEIG ) THEN IDUM1( 4 ) = ICHAR( 'A' ) ELSE IF( INDEIG ) THEN IDUM1( 4 ) = ICHAR( 'I' ) ELSE IDUM1( 4 ) = ICHAR( 'V' ) END IF IDUM2( 4 ) = 4 IF( LQUERY ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 5 CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 9, N, 4, N, 4, IB, $ JB, DESCB, 13, 5, IDUM1, IDUM2, INFO ) CALL PCHK1MAT( N, 4, N, 4, IZ, JZ, DESCZ, 26, 0, IDUM1, IDUM2, $ INFO ) END IF * IWORK( 1 ) = LIWMIN WORK( 1 ) = DCMPLX( DBLE( LWOPT ) ) RWORK( 1 ) = DBLE( LRWOPT ) * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHEGVX ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Form a Cholesky factorization of sub( B ). * CALL PZPOTRF( UPLO, N, B, IB, JB, DESCB, INFO ) IF( INFO.NE.0 ) THEN IWORK( 1 ) = LIWMIN WORK( 1 ) = DCMPLX( DBLE( LWOPT ) ) RWORK( 1 ) = DBLE( LRWOPT ) IFAIL( 1 ) = INFO INFO = IERRNPD RETURN END IF * * Transform problem to standard eigenvalue problem and solve. * CALL PZHENGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, WORK, LWORK, INFO ) CALL PZHEEVX( JOBZ, RANGE, UPLO, N, A, IA, JA, DESCA, VL, VU, IL, $ IU, ABSTOL, M, NZ, W, ORFAC, Z, IZ, JZ, DESCZ, WORK, $ LWORK, RWORK, LRWORK, IWORK, LIWORK, IFAIL, ICLUSTR, $ GAP, INFO ) * IF( WANTZ ) THEN * * Backtransform eigenvectors to the original problem. * NEIG = M IF( IBTYPE.EQ.1 .OR. IBTYPE.EQ.2 ) THEN * * For sub( A )*x=(lambda)*sub( B )*x and * sub( A )*sub( B )*x=(lambda)*x; backtransform eigenvectors: * x = inv(L)'*y or inv(U)*y * IF( UPPER ) THEN TRANS = 'N' ELSE TRANS = 'C' END IF * CALL PZTRSM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, $ B, IB, JB, DESCB, Z, IZ, JZ, DESCZ ) * ELSE IF( IBTYPE.EQ.3 ) THEN * * For sub( B )*sub( A )*x=(lambda)*x; * backtransform eigenvectors: x = L*y or U'*y * IF( UPPER ) THEN TRANS = 'C' ELSE TRANS = 'N' END IF * CALL PZTRMM( 'Left', UPLO, TRANS, 'Non-unit', N, NEIG, ONE, $ B, IB, JB, DESCB, Z, IZ, JZ, DESCZ ) END IF END IF * IF( SCALE.NE.ONE ) THEN CALL DSCAL( N, SCALE, W, 1 ) END IF * IWORK( 1 ) = LIWMIN WORK( 1 ) = DCMPLX( DBLE( LWOPT ) ) RWORK( 1 ) = DBLE( LRWOPT ) RETURN * * End of PZHEGVX * END scalapack-2.0.2/SRC/pzhengst.f000644 000766 000024 00000041760 10363532303 016357 0ustar00juliestaff000000 000000 SUBROUTINE PZHENGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, IBTYPE, INFO, JA, JB, LWORK, N DOUBLE PRECISION SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ), WORK( * ) * .. * * Purpose * * ======= * * PZHENGST reduces a complex Hermitian-definite generalized * eigenproblem to standard form. * * PZHENGST performs the same function as PZHEGST, but is based on * rank 2K updates, which are faster and more scalable than * triangular solves (the basis of PZHENGST). * * PZHENGST calls PZHEGST when UPLO='U', hence PZHENGST provides * improved performance only when UPLO='L', IBTYPE=1. * * PZHENGST also calls PZHEGST when insufficient workspace is * provided, hence PZHENGST provides improved * performance only when LWORK >= 2 * NP0 * NB + NQ0 * NB + NB * NB * * In the following sub( A ) denotes A( IA:IA+N-1, JA:JA+N-1 ) and * sub( B ) denotes B( IB:IB+N-1, JB:JB+N-1 ). * * If IBTYPE = 1, the problem is sub( A )*x = lambda*sub( B )*x, * and sub( A ) is overwritten by inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H) * * If IBTYPE = 2 or 3, the problem is sub( A )*sub( B )*x = lambda*x or * sub( B )*sub( A )*x = lambda*x, and sub( A ) is overwritten by * U*sub( A )*U**H or L**H*sub( A )*L. * * sub( B ) must have been previously factorized as U**H*U or L*L**H by * PZPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * IBTYPE (global input) INTEGER * = 1: compute inv(U**H)*sub( A )*inv(U) or * inv(L)*sub( A )*inv(L**H); * = 2 or 3: compute U*sub( A )*U**H or L**H*sub( A )*L. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored and sub( B ) is * factored as U**H*U; * = 'L': Lower triangle of sub( A ) is stored and sub( B ) is * factored as L*L**H. * * N (global input) INTEGER * The order of the matrices sub( A ) and sub( B ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ). If UPLO = 'U', * the leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains * the lower triangular part of the matrix, and its strictly * upper triangular part is not referenced. * * On exit, if INFO = 0, the transformed matrix, stored in the * same format as sub( A ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1)). On entry, * this array contains the local pieces of the triangular factor * from the Cholesky factorization of sub( B ), as returned by * PZPOTRF. * * IB (global input) INTEGER * B's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * SCALE (global output) DOUBLE PRECISION * Amount by which the eigenvalues should be scaled to * compensate for the scaling performed in this routine. * At present, SCALE is always returned as 1.0, it is * returned here to allow for future enhancement. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP0 +1 ), 3 * NB ) * * When IBTYPE = 1 and UPLO = 'L', PZHENGST provides improved * performance when LWORK >= 2 * NP0 * NB + NQ0 * NB + NB * NB * * where NB = MB_A = NB_A, * NP0 = NUMROC( N, NB, 0, 0, NPROW ), * NQ0 = NUMROC( N, NB, 0, 0, NPROW ), * * NUMROC ia a ScaLAPACK tool functions * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the * optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * * .. Parameters .. COMPLEX*16 ONEHALF, ONE, MONE DOUBLE PRECISION RONE PARAMETER ( ONEHALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ), $ MONE = ( -1.0D0, 0.0D0 ), RONE = 1.0D0 ) INTEGER DLEN_, CTXT_, MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( DLEN_ = 9, CTXT_ = 2, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER I, IACOL, IAROW, IBCOL, IBROW, ICOFFA, ICOFFB, $ ICTXT, INDAA, INDG, INDR, INDRT, IROFFA, $ IROFFB, J, K, KB, LWMIN, LWOPT, MYCOL, MYROW, $ NB, NP0, NPCOL, NPK, NPROW, NQ0, POSTK * .. * .. Local Arrays .. INTEGER DESCAA( DLEN_ ), DESCG( DLEN_ ), $ DESCR( DLEN_ ), DESCRT( DLEN_ ), IDUM1( 2 ), $ IDUM2( 2 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL LSAME, INDXG2P, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK2MAT, $ PXERBLA, PZGEMM, PZHEGST, PZHEMM, PZHER2K, $ PZLACPY, PZTRSM * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) SCALE = 1.0D0 * NB = DESCA( MB_ ) * * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 700+CTXT_ ) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 3, N, 3, IB, JB, DESCB, 11, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IBCOL = INDXG2P( JB, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) NP0 = NUMROC( N, NB, 0, 0, NPROW ) NQ0 = NUMROC( N, NB, 0, 0, NPCOL ) LWMIN = MAX( NB*( NP0+1 ), 3*NB ) IF( IBTYPE.EQ.1 .AND. .NOT.UPPER ) THEN LWOPT = 2*NP0*NB + NQ0*NB + NB*NB ELSE LWOPT = LWMIN END IF WORK( 1 ) = DCMPLX( DBLE( LWOPT ) ) LQUERY = ( LWORK.EQ.-1 ) IF( IBTYPE.LT.1 .OR. IBTYPE.GT.3 ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( ICOFFB.NE.0 .OR. IBCOL.NE.IACOL ) THEN INFO = -10 ELSE IF( DESCB( MB_ ).NE.DESCA( MB_ ) ) THEN INFO = -( 1100+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 1100+NB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1100+CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF IDUM1( 1 ) = IBTYPE IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 7, N, 3, N, 3, IB, $ JB, DESCB, 11, 2, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHENGST', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * IF( IBTYPE.NE.1 .OR. UPPER .OR. LWORK.LT.LWOPT ) THEN CALL PZHEGST( IBTYPE, UPLO, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB, SCALE, INFO ) RETURN END IF * CALL DESCSET( DESCG, N, NB, NB, NB, IAROW, IACOL, ICTXT, NP0 ) CALL DESCSET( DESCR, N, NB, NB, NB, IAROW, IACOL, ICTXT, NP0 ) CALL DESCSET( DESCRT, NB, N, NB, NB, IAROW, IACOL, ICTXT, NB ) CALL DESCSET( DESCAA, NB, NB, NB, NB, IAROW, IACOL, ICTXT, NB ) * INDG = 1 INDR = INDG + DESCG( LLD_ )*NB INDAA = INDR + DESCR( LLD_ )*NB INDRT = INDAA + DESCAA( LLD_ )*NB * DO 30 K = 1, N, NB * KB = MIN( N-K+1, NB ) POSTK = K + KB NPK = N - POSTK + 1 * * CALL PZLACPY( 'A', N-POSTK+1, KB, B, POSTK+IB-1, K+JB-1, DESCB, $ WORK( INDG ), POSTK, 1, DESCG ) CALL PZLACPY( 'A', N-POSTK+1, KB, A, POSTK+IA-1, K+JA-1, DESCA, $ WORK( INDR ), POSTK, 1, DESCR ) CALL PZLACPY( 'A', KB, K-1, A, K+IA-1, JA, DESCA, $ WORK( INDRT ), 1, 1, DESCRT ) * CALL PZLACPY( 'L', KB, KB, A, K+IA-1, K+JA-1, DESCA, $ WORK( INDR ), K, 1, DESCR ) CALL PZTRSM( 'Right', 'L', 'N', 'N', NPK, KB, MONE, B, K+IB-1, $ K+JB-1, DESCB, WORK( INDG ), POSTK, 1, DESCG ) * CALL PZHEMM( 'Right', 'L', NPK, KB, ONEHALF, A, K+IA-1, K+JA-1, $ DESCA, WORK( INDG ), POSTK, 1, DESCG, ONE, $ WORK( INDR ), POSTK, 1, DESCR ) * CALL PZHER2K( 'Lower', 'No T', NPK, KB, ONE, WORK( INDG ), $ POSTK, 1, DESCG, WORK( INDR ), POSTK, 1, DESCR, $ RONE, A, POSTK+IA-1, POSTK+JA-1, DESCA ) * CALL PZGEMM( 'No T', 'No Conj', NPK, K-1, KB, ONE, $ WORK( INDG ), POSTK, 1, DESCG, WORK( INDRT ), 1, $ 1, DESCRT, ONE, A, POSTK+IA-1, JA, DESCA ) * CALL PZHEMM( 'Right', 'L', NPK, KB, ONE, WORK( INDR ), K, 1, $ DESCR, WORK( INDG ), POSTK, 1, DESCG, ONE, A, $ POSTK+IA-1, K+JA-1, DESCA ) * CALL PZTRSM( 'Left', 'Lower', 'No Conj', 'Non-unit', KB, K-1, $ ONE, B, K+IB-1, K+JB-1, DESCB, A, K+IA-1, JA, $ DESCA ) * CALL PZLACPY( 'L', KB, KB, A, K+IA-1, K+JA-1, DESCA, $ WORK( INDAA ), 1, 1, DESCAA ) * IF( MYROW.EQ.DESCAA( RSRC_ ) .AND. MYCOL.EQ.DESCAA( CSRC_ ) ) $ THEN DO 20 I = 1, KB DO 10 J = 1, I WORK( INDAA+J-1+( I-1 )*DESCAA( LLD_ ) ) $ = DCONJG( WORK( INDAA+I-1+( J-1 )* $ DESCAA( LLD_ ) ) ) 10 CONTINUE 20 CONTINUE END IF * CALL PZTRSM( 'Left', 'Lower', 'No Conj', 'Non-unit', KB, KB, $ ONE, B, K+IB-1, K+JB-1, DESCB, WORK( INDAA ), 1, $ 1, DESCAA ) * CALL PZTRSM( 'Right', 'Lower', 'Conj', 'Non-unit', KB, KB, ONE, $ B, K+IB-1, K+JB-1, DESCB, WORK( INDAA ), 1, 1, $ DESCAA ) * CALL PZLACPY( 'L', KB, KB, WORK( INDAA ), 1, 1, DESCAA, A, $ K+IA-1, K+JA-1, DESCA ) * CALL PZTRSM( 'Right', 'Lower', 'Conj', 'Non-unit', NPK, KB, $ ONE, B, K+IB-1, K+JB-1, DESCB, A, POSTK+IA-1, $ K+JA-1, DESCA ) * DESCR( CSRC_ ) = MOD( DESCR( CSRC_ )+1, NPCOL ) DESCG( CSRC_ ) = MOD( DESCG( CSRC_ )+1, NPCOL ) DESCRT( RSRC_ ) = MOD( DESCRT( RSRC_ )+1, NPROW ) DESCAA( RSRC_ ) = MOD( DESCAA( RSRC_ )+1, NPROW ) DESCAA( CSRC_ ) = MOD( DESCAA( CSRC_ )+1, NPCOL ) 30 CONTINUE * WORK( 1 ) = DCMPLX( DBLE( LWOPT ) ) * RETURN END scalapack-2.0.2/SRC/pzhentrd.f000644 000766 000024 00000053230 10363532303 016346 0ustar00juliestaff000000 000000 SUBROUTINE PZHENTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LRWORK, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION D( * ), E( * ), RWORK( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * Bugs * ==== * * * Support for UPLO='U' is limited to calling the old, slow, PZHETRD * code. * * * Purpose * * ======= * * PZHENTRD is a prototype version of PZHETRD which uses tailored * codes (either the serial, ZHETRD, or the parallel code, PZHETTRD) * when the workspace provided by the user is adequate. * * * PZHENTRD reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Features * ======== * * PZHENTRD is faster than PZHETRD on almost all matrices, * particularly small ones (i.e. N < 500 * sqrt(P) ), provided that * enough workspace is available to use the tailored codes. * * The tailored codes provide performance that is essentially * independent of the input data layout. * * The tailored codes place no restrictions on IA, JA, MB or NB. * At present, IA, JA, MB and NB are restricted to those values allowed * by PZHETRD to keep the interface simple. These restrictions are * documented below. (Search for "restrictions".) * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK( 1 ) returns the optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP +1 ), 3 * NB ) * * For optimal performance, greater workspace is needed, i.e. * LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS + 4 ) * NPS * ICTXT = DESCA( CTXT_ ) * ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) * SQNPC = INT( SQRT( DBLE( NPROW * NPCOL ) ) ) * NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) * * NUMROC is a ScaLAPACK tool functions; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * * RWORK (local workspace/local output) COMPLEX*16 array, * dimension (LRWORK) * On exit, RWORK( 1 ) returns the optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= 1 * * For optimal performance, greater workspace is needed, i.e. * LRWORK >= MAX( 2 * N ) * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA .AND. IROFFA.EQ.0 ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) COMPLEX*16 CONE PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER COLCTOP, ROWCTOP INTEGER ANB, CTXTB, I, IACOL, IAROW, ICOFFA, ICTXT, $ IINFO, INDB, INDRD, INDRE, INDTAU, INDW, IPW, $ IROFFA, J, JB, JX, K, KK, LLRWORK, LLWORK, $ LRWMIN, LWMIN, MINSZ, MYCOL, MYCOLB, MYROW, $ MYROWB, NB, NP, NPCOL, NPCOLB, NPROW, NPROWB, $ NPS, NQ, ONEPMIN, ONEPRMIN, SQNPC, TTLRWMIN, $ TTLWMIN * .. * .. Local Arrays .. INTEGER DESCB( DLEN_ ), DESCW( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, CHK1MAT, DESCSET, IGAMN2D, $ PCHK1MAT, PDLAMR1D, PB_TOPGET, PB_TOPSET, $ PXERBLA, PZELSET, PZHER2K, PZHETD2, PZHETTRD, $ PZLAMR1D, PZLATRD, PZTRMR2D, ZHETRD * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC, PJLAENV EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC, PJLAENV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, INT, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = MAX( 1, NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) LWMIN = MAX( ( NP+1 )*NB, 3*NB ) ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) MINSZ = PJLAENV( ICTXT, 5, 'PZHETTRD', 'L', 0, 0, 0, 0 ) SQNPC = INT( SQRT( DBLE( NPROW*NPCOL ) ) ) NPS = MAX( NUMROC( N, 1, 0, 0, SQNPC ), 2*ANB ) TTLWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + ( NPS+2 )*NPS LRWMIN = 1 TTLRWMIN = 2*NPS * WORK( 1 ) = DCMPLX( DBLE( TTLWMIN ) ) RWORK( 1 ) = DBLE( TTLRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 * * The following two restrictions are not necessary provided * that either of the tailored codes are used. * ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 600+NB_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 11 IF( LRWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 13 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHENTRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * ONEPMIN = N*N + 3*N + 1 LLWORK = LWORK CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LLWORK, 1, 1, -1, -1, -1, $ -1 ) * ONEPRMIN = 2*N LLRWORK = LRWORK CALL IGAMN2D( ICTXT, 'A', ' ', 1, 1, LLRWORK, 1, 1, -1, -1, -1, $ -1 ) * * * Use the serial, LAPACK, code: ZTRD on small matrices if we * we have enough space. * NPROWB = 0 IF( ( N.LT.MINSZ .OR. SQNPC.EQ.1 ) .AND. LLWORK.GE.ONEPMIN .AND. $ LLRWORK.GE.ONEPRMIN .AND. .NOT.UPPER ) THEN NPROWB = 1 NPS = N ELSE IF( LLWORK.GE.TTLWMIN .AND. LLRWORK.GE.TTLRWMIN .AND. .NOT. $ UPPER ) THEN NPROWB = SQNPC END IF END IF * IF( NPROWB.GE.1 ) THEN NPCOLB = NPROWB SQNPC = NPROWB INDB = 1 INDRD = 1 INDRE = INDRD + NPS INDTAU = INDB + NPS*NPS INDW = INDTAU + NPS LLWORK = LLWORK - INDW + 1 * CALL BLACS_GET( ICTXT, 10, CTXTB ) CALL BLACS_GRIDINIT( CTXTB, 'Row major', SQNPC, SQNPC ) CALL BLACS_GRIDINFO( CTXTB, NPROWB, NPCOLB, MYROWB, MYCOLB ) CALL DESCSET( DESCB, N, N, 1, 1, 0, 0, CTXTB, NPS ) * CALL PZTRMR2D( UPLO, 'N', N, N, A, IA, JA, DESCA, WORK( INDB ), $ 1, 1, DESCB, ICTXT ) * * * Only those processors in context CTXTB are needed for a while * IF( NPROWB.GT.0 ) THEN * IF( NPROWB.EQ.1 ) THEN CALL ZHETRD( UPLO, N, WORK( INDB ), NPS, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), $ WORK( INDW ), LLWORK, INFO ) ELSE * CALL PZHETTRD( 'L', N, WORK( INDB ), 1, 1, DESCB, $ RWORK( INDRD ), RWORK( INDRE ), $ WORK( INDTAU ), WORK( INDW ), LLWORK, $ INFO ) * END IF END IF * * All processors participate in moving the data back to the * way that PZHENTRD expects it. * CALL PDLAMR1D( N-1, RWORK( INDRE ), 1, 1, DESCB, E, 1, JA, $ DESCA ) * CALL PDLAMR1D( N, RWORK( INDRD ), 1, 1, DESCB, D, 1, JA, $ DESCA ) * CALL PZLAMR1D( N, WORK( INDTAU ), 1, 1, DESCB, TAU, 1, JA, $ DESCA ) * CALL PZTRMR2D( UPLO, 'N', N, N, WORK( INDB ), 1, 1, DESCB, A, $ IA, JA, DESCA, ICTXT ) * IF( MYROWB.GE.0 ) $ CALL BLACS_GRIDEXIT( CTXTB ) * ELSE * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPW = NP*NB + 1 * IF( UPPER ) THEN * * Reduce the upper triangle of sub( A ). * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, $ INDXG2P( JA+N-KK, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ), ICTXT, MAX( 1, NP ) ) * DO 10 K = N - KK + 1, NB + 1, -NB JB = MIN( N-K+1, NB ) I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part of * the matrix * CALL PZLATRD( UPLO, K+JB-1, JB, A, IA, JA, DESCA, D, E, $ TAU, WORK, 1, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(IA:I-1,JA:J-1), using an * update of the form: * A(IA:I-1,JA:J-1) := A(IA:I-1,JA:J-1) - V*W' - W*V' * CALL PZHER2K( UPLO, 'No transpose', K-1, JB, -CONE, A, $ IA, J, DESCA, WORK, 1, 1, DESCW, ONE, A, $ IA, JA, DESCA ) * * Copy last superdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J, NB, 0, IACOL, NPCOL ), NQ ) CALL PZELSET( A, I-1, J, DESCA, DCMPLX( E( JX ) ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+NPCOL-1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PZHETD2( UPLO, MIN( N, NB ), A, IA, JA, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) * ELSE * * Reduce the lower triangle of sub( A ) * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) * DO 20 K = 1, N - NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part * of the matrix * CALL PZLATRD( UPLO, N-K+1, NB, A, I, J, DESCA, D, E, TAU, $ WORK, K, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(I+NB:IA+N-1,I+NB:IA+N-1), * using an update of the form: A(I+NB:IA+N-1,I+NB:IA+N-1) := * A(I+NB:IA+N-1,I+NB:IA+N-1) - V*W' - W*V' * CALL PZHER2K( UPLO, 'No transpose', N-K-NB+1, NB, -CONE, $ A, I+NB, J, DESCA, WORK, K+NB, 1, DESCW, $ ONE, A, I+NB, J+NB, DESCA ) * * Copy last subdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J+NB-1, NB, 0, IACOL, NPCOL ), NQ ) CALL PZELSET( A, I+NB, J+NB-1, DESCA, DCMPLX( E( JX ) ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * 20 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PZHETD2( UPLO, KK, A, IA+K-1, JA+K-1, DESCA, D, E, TAU, $ WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * END IF * WORK( 1 ) = DCMPLX( DBLE( TTLWMIN ) ) RWORK( 1 ) = DBLE( TTLRWMIN ) * RETURN * * End of PZHENTRD * END scalapack-2.0.2/SRC/pzhetd2.f000644 000766 000024 00000043137 10363532303 016075 0ustar00juliestaff000000 000000 SUBROUTINE PZHETD2( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION D( * ), E( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZHETD2 reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 3*N. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 HALF, ONE, ZERO PARAMETER ( HALF = ( 0.5D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER IACOL, IAROW, ICOFFA, ICTXT, II, IK, IROFFA, J, $ JJ, JK, JN, LDA, LWMIN, MYCOL, MYROW, NPCOL, $ NPROW COMPLEX*16 ALPHA, TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PXERBLA, ZAXPY, ZGEBR2D, ZGEBS2D, $ ZHEMV, ZHER2, ZLARFG * .. * .. External Functions .. LOGICAL LSAME COMPLEX*16 ZDOTC EXTERNAL LSAME, ZDOTC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) LWMIN = 3 * N * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.ICOFFA ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHETD2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Compute local information * LDA = DESCA( LLD_ ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) * IF( UPPER ) THEN * * Process(IAROW, IACOL) owns block to be reduced * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Reduce the upper triangle of sub( A ) * IK = II+N-1+(JJ+N-2)*LDA A( IK ) = DBLE( A( IK ) ) DO 10 J = N-1, 1, -1 IK = II + J - 1 JK = JJ + J - 1 * * Generate elementary reflector H(i) = I - tau * v * v' * to annihilate A(IA:IA+J-1,JA:JA+J-1) * ALPHA = A( IK+JK*LDA ) CALL ZLARFG( J, ALPHA, A( II+JK*LDA ), 1, TAUI ) E( JK+1 ) = DBLE( ALPHA ) * IF( TAUI.NE.ZERO ) THEN * * Apply H(i) from both sides to * A(IA:IA+J-1,JA:JA+J-1) * A( IK+JK*LDA ) = ONE * * Compute x := tau * A * v storing x in TAU(1:i) * CALL ZHEMV( UPLO, J, TAUI, A( II+(JJ-1)*LDA ), $ LDA, A( II+JK*LDA ), 1, ZERO, $ TAU( JJ ), 1 ) * * Compute w := x - 1/2 * tau * (x'*v) * v * ALPHA = -HALF*TAUI*ZDOTC( J, TAU( JJ ), 1, $ A( II+JK*LDA ), 1 ) CALL ZAXPY( J, ALPHA, A( II+JK*LDA ), 1, $ TAU( JJ ), 1 ) * * Apply the transformation as a rank-2 update: * A := A - v * w' - w * v' * CALL ZHER2( UPLO, J, -ONE, A( II+JK*LDA ), 1, $ TAU( JJ ), 1, A( II+(JJ-1)*LDA ), $ LDA ) END IF * * Copy D, E, TAU to broadcast them columnwise. * A( IK+JK*LDA ) = DCMPLX( E( JK+1 ) ) D( JK+1 ) = DBLE( A( IK+1+JK*LDA ) ) WORK( J+1 ) = DCMPLX( D( JK+1 ) ) WORK( N+J+1 ) = DCMPLX( E( JK+1 ) ) TAU( JK+1 ) = TAUI WORK( 2*N+J+1 ) = TAU( JK+1 ) * 10 CONTINUE D( JJ ) = DBLE( A( II+(JJ-1)*LDA ) ) WORK( 1 ) = DCMPLX( D( JJ ) ) WORK( N+1 ) = ZERO WORK( 2*N+1 ) = ZERO * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1 ) * ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N, WORK, 1, $ IAROW, IACOL ) DO 20 J = 2, N JN = JJ + J - 1 D( JN ) = DBLE( WORK( J ) ) E( JN ) = DBLE( WORK( N+J ) ) TAU( JN ) = WORK( 2*N+J ) 20 CONTINUE D( JJ ) = DBLE( WORK( 1 ) ) END IF END IF * ELSE * * Process (IAROW, IACOL) owns block to be factorized * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Reduce the lower triangle of sub( A ) * A( II+(JJ-1)*LDA ) = DBLE( A( II+(JJ-1)*LDA ) ) DO 30 J = 1, N - 1 IK = II + J - 1 JK = JJ + J - 1 * * Generate elementary reflector H(i) = I - tau * v * v' * to annihilate A(IA+J-JA+2:IA+N-1,JA+J-1) * ALPHA = A( IK+1+(JK-1)*LDA ) CALL ZLARFG( N-J, ALPHA, A( IK+2+(JK-1)*LDA ), 1, $ TAUI ) E( JK ) = DBLE( ALPHA ) * IF( TAUI.NE.ZERO ) THEN * * Apply H(i) from both sides to * A(IA+J-JA+1:IA+N-1,JA+J+1:JA+N-1) * A( IK+1+(JK-1)*LDA ) = ONE * * Compute x := tau * A * v storing y in TAU(i:n-1) * CALL ZHEMV( UPLO, N-J, TAUI, A( IK+1+JK*LDA ), $ LDA, A( IK+1+(JK-1)*LDA ), 1, $ ZERO, TAU( JK ), 1 ) * * Compute w := x - 1/2 * tau * (x'*v) * v * ALPHA = -HALF*TAUI*ZDOTC( N-J, TAU( JK ), 1, $ A( IK+1+(JK-1)*LDA ), 1 ) CALL ZAXPY( N-J, ALPHA, A( IK+1+(JK-1)*LDA ), $ 1, TAU( JK ), 1 ) * * Apply the transformation as a rank-2 update: * A := A - v * w' - w * v' * CALL ZHER2( UPLO, N-J, -ONE, $ A( IK+1+(JK-1)*LDA ), 1, $ TAU( JK ), 1, A( IK+1+JK*LDA ), $ LDA ) END IF * * Copy D(JK), E(JK), TAU(JK) to broadcast them * columnwise. * A( IK+1+(JK-1)*LDA ) = DCMPLX( E( JK ) ) D( JK ) = DBLE( A( IK+(JK-1)*LDA ) ) WORK( J ) = DCMPLX( D( JK ) ) WORK( N+J ) = DCMPLX( E( JK ) ) TAU( JK ) = TAUI WORK( 2*N+J ) = TAU( JK ) 30 CONTINUE JN = JJ + N - 1 D( JN ) = DBLE( A( II+N-1+(JN-1)*LDA ) ) WORK( N ) = DCMPLX( D( JN ) ) TAU( JN ) = ZERO WORK( 2*N ) = ZERO * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK, $ 1 ) * ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 3*N-1, WORK, $ 1, IAROW, IACOL ) DO 40 J = 1, N - 1 JN = JJ + J - 1 D( JN ) = DBLE( WORK( J ) ) E( JN ) = DBLE( WORK( N+J ) ) TAU( JN ) = WORK( 2*N+J ) 40 CONTINUE JN = JJ + N - 1 D( JN ) = DBLE( WORK( N ) ) TAU( JN ) = ZERO END IF END IF END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZHETD2 * END scalapack-2.0.2/SRC/pzhetrd.f000644 000766 000024 00000040404 10363532303 016167 0ustar00juliestaff000000 000000 SUBROUTINE PZHETRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION D( * ), E( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZHETRD reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MAX( NB * ( NP +1 ), 3 * NB ) * * where NB = MB_A = NB_A, * NP = NUMROC( N, NB, MYROW, IAROW, NPROW ), * IAROW = INDXG2P( IA, NB, MYROW, RSRC_A, NPROW ). * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and vi * denotes an element of the vector defining H(i). * * Alignment requirements * ====================== * * The distributed submatrix sub( A ) must verify some alignment proper- * ties, namely the following expression should be true: * ( MB_A.EQ.NB_A .AND. IROFFA.EQ.ICOFFA .AND. IROFFA.EQ.0 ) with * IROFFA = MOD( IA-1, MB_A ) and ICOFFA = MOD( JA-1, NB_A ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) COMPLEX*16 CONE PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER COLCTOP, ROWCTOP INTEGER I, IACOL, IAROW, ICOFFA, ICTXT, IINFO, IPW, $ IROFFA, J, JB, JX, K, KK, LWMIN, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, PCHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZHER2K, $ PZHETD2, PZLATRD * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL LSAME, INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN NB = DESCA( NB_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IAROW = INDXG2P( IA, NB, MYROW, DESCA( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, NB, MYCOL, DESCA( CSRC_ ), NPCOL ) NP = NUMROC( N, NB, MYROW, IAROW, NPROW ) NQ = MAX( 1, NUMROC( N+JA-1, NB, MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) LWMIN = MAX( (NP+1)*NB, 3*NB ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.ICOFFA .OR. ICOFFA.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 11 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHETRD', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * IPW = NP * NB + 1 * IF( UPPER ) THEN * * Reduce the upper triangle of sub( A ). * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, INDXG2P( JA+N-KK, $ NB, MYCOL, DESCA( CSRC_ ), NPCOL ), ICTXT, $ MAX( 1, NP ) ) * DO 10 K = N-KK+1, NB+1, -NB JB = MIN( N-K+1, NB ) I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part of * the matrix * CALL PZLATRD( UPLO, K+JB-1, JB, A, IA, JA, DESCA, D, E, TAU, $ WORK, 1, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(IA:I-1,JA:J-1), using an * update of the form: * A(IA:I-1,JA:J-1) := A(IA:I-1,JA:J-1) - V*W' - W*V' * CALL PZHER2K( UPLO, 'No transpose', K-1, JB, -CONE, A, IA, $ J, DESCA, WORK, 1, 1, DESCW, ONE, A, IA, JA, $ DESCA ) * * Copy last superdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J, NB, 0, IACOL, NPCOL ), NQ ) CALL PZELSET( A, I-1, J, DESCA, DCMPLX( E( JX ) ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + NPCOL - 1, NPCOL ) * 10 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PZHETD2( UPLO, MIN( N, NB ), A, IA, JA, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) * ELSE * * Reduce the lower triangle of sub( A ) * KK = MOD( JA+N-1, NB ) IF( KK.EQ.0 ) $ KK = NB CALL DESCSET( DESCW, N, NB, NB, NB, IAROW, IACOL, ICTXT, $ MAX( 1, NP ) ) * DO 20 K = 1, N-NB, NB I = IA + K - 1 J = JA + K - 1 * * Reduce columns I:I+NB-1 to tridiagonal form and form * the matrix W which is needed to update the unreduced part * of the matrix * CALL PZLATRD( UPLO, N-K+1, NB, A, I, J, DESCA, D, E, TAU, $ WORK, K, 1, DESCW, WORK( IPW ) ) * * Update the unreduced submatrix A(I+NB:IA+N-1,I+NB:IA+N-1), * using an update of the form: A(I+NB:IA+N-1,I+NB:IA+N-1) := * A(I+NB:IA+N-1,I+NB:IA+N-1) - V*W' - W*V' * CALL PZHER2K( UPLO, 'No transpose', N-K-NB+1, NB, -CONE, A, $ I+NB, J, DESCA, WORK, K+NB, 1, DESCW, ONE, A, $ I+NB, J+NB, DESCA ) * * Copy last subdiagonal element back into sub( A ) * JX = MIN( INDXG2L( J+NB-1, NB, 0, IACOL, NPCOL ), NQ ) CALL PZELSET( A, I+NB, J+NB-1, DESCA, DCMPLX( E( JX ) ) ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ ) + 1, NPCOL ) * 20 CONTINUE * * Use unblocked code to reduce the last or only block * CALL PZHETD2( UPLO, KK, A, IA+K-1, JA+K-1, DESCA, D, E, $ TAU, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZHETRD * END scalapack-2.0.2/SRC/pzhettrd.f000644 000766 000024 00000123431 11750130340 016351 0ustar00juliestaff000000 000000 SUBROUTINE PZHETTRD( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION D( * ), E( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * * ======= * * PZHETTRD reduces a complex Hermitian matrix sub( A ) to Hermitian * tridiagonal form T by an unitary similarity transformation: * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding * process and memory location. * * Let A be a generic term for any 2D block cyclicly distributed * array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- ----------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, * indicating the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to * distribute the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to * distribute the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the * first row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCp(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCp( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCq( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes * of its process row. * The values of LOCp() and LOCq() may be determined via a call to * the ScaLAPACK tool function, NUMROC: * LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCq(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. On exit, if UPLO = 'U', * the diagonal and first superdiagonal of sub( A ) are over- * written by the corresponding elements of the tridiagonal * matrix T, and the elements above the first superdiagonal, * with the array TAU, represent the unitary matrix Q as a * product of elementary reflectors; if UPLO = 'L', the diagonal * and first subdiagonal of sub( A ) are overwritten by the * corresponding elements of the tridiagonal matrix T, and the * elements below the first subdiagonal, with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1) * if UPLO = 'U', LOCq(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCq(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * On exit, WORK( 1 ) returns the minimal and optimal workspace * * LWORK (local input) INTEGER * The dimension of the array WORK. * LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + NPS * Where: * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) * ANB = PJLAENV( DESCA( CTXT_ ), 3, 'PZHETTRD', 'L', 0, 0, * 0, 0 ) * * NUMROC is a ScaLAPACK tool function; * PJLAENV is a ScaLAPACK envionmental inquiry function * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of * elementary reflectors * * Q = H(n-1) . . . H(2) H(1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of * elementary reflectors * * Q = H(1) H(2) . . . H(n-1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The contents of sub( A ) on exit are illustrated by the following * examples with n = 5: * * if UPLO = 'U': if UPLO = 'L': * * ( d e v2 v3 v4 ) ( d ) * ( d e v3 v4 ) ( e d ) * ( d e v4 ) ( v1 e d ) * ( d e ) ( v1 v2 e d ) * ( d ) ( v1 v2 v3 e d ) * * where d and e denote diagonal and off-diagonal elements of T, and * vi denotes an element of the vector defining H(i). * * Data storage requirements * ========================= * * PZHETTRD is not intended to be called directly. All users are * encourage to call PZHETRD which will then call PZHETTRD if * appropriate. A must be in cyclic format (i.e. MB = NB = 1), * the process grid must be square ( i.e. NPROW = NPCOL ) and * only lower triangular storage is supported. * * Local variables * =============== * * PZHETTRD uses five local arrays: * WORK ( InV ) dimension ( NP, ANB+1): array V * WORK ( InH ) dimension ( NP, ANB+1): array H * WORK ( InVT ) dimension ( NQ, ANB+1): transpose of the array V * WORK ( InHT ) dimension ( NQ, ANB+1): transpose of the array H * WORK ( InVTT ) dimension ( NQ, 1): transpose of the array VT * * Arrays V and H are replicated across all processor columns. * Arrays V^T and H^T are replicated across all processor rows. * * WORK ( InVT ), or V^T, is stored as a tall skinny * array ( NQ x ANB-1 ) for efficiency. Since only the lower * triangular portion of A is updated, Av is computed as: * tril(A) * v + v^T * tril(A,-1). This is performed as * two local triangular matrix-vector multiplications (both in * MVR2) followed by a transpose and a sum across the columns. * In the local computation, WORK( InVT ) is used to compute * tril(A) * v and WORK( InV ) is used to compute * v^T * tril(A,-1) * * The following variables are global indices into A: * INDEX: The current global row and column number. * MAXINDEX: The global row and column for the first row and * column in the trailing block of A. * LIIB, LIJB: The first row, column in * * The following variables point into the arrays A, V, H, V^T, H^T: * BINDEX =INDEX-MININDEX: The column index in V, H, V^T, H^T. * LII: local index I: The local row number for row INDEX * LIJ: local index J: The local column number for column INDEX * LIIP1: local index I+1: The local row number for row INDEX+1 * LIJP1: local index J+1: The local col number for col INDEX+1 * LTLI: lower triangular local index I: The local row for the * upper left entry in tril( A(INDEX, INDEX) ) * LTLIP1: lower triangular local index I+1: The local row for the * upper left entry in tril( A(INDEX+1, INDEX+1) ) * * Details: The distinction between LII and LTLI (and between * LIIP1 and LTLIP1) is subtle. Within the current processor * column (i.e. MYCOL .eq. CURCOL) they are the same. However, * on some processors, A( LII, LIJ ) points to an element * above the diagonal, on these processors, LTLI = LII+1. * * The following variables give the number of rows and/or columns * in various matrices: * NP: The number of local rows in A( 1:N, 1:N ) * NQ: The number of local columns in A( 1:N, 1:N ) * NPM0: The number of local rows in A( INDEX:N, INDEX:N ) * NQM0: The number of local columns in A( INDEX:N, INDEX:N ) * NPM1: The number of local rows in A( INDEX+1:N, INDEX:N ) * NQM1: The number of local columns in A( INDEX+1:N, INDEX:N ) * LTNM0: The number of local rows & columns in * tril( A( INDEX:N, INDEX:N ) ) * LTNM1: The number of local rows & columns in * tril( A( INDEX+1:N, INDEX+1:N ) ) * NOTE: LTNM0 == LTNM1 on all processors except the diagonal * processors, i.e. those where MYCOL == MYROW. * * Invariants: * NP = NPM0 + LII - 1 * NQ = NQM0 + LIJ - 1 * NP = NPM1 + LIIP1 - 1 * NQ = NQM1 + LIJP1 - 1 * NP = LTLI + LTNM0 - 1 * NP = LTLIP1 + LTNM1 - 1 * * Temporary variables. The following variables are used within * a few lines after they are set and do hold state from one loop * iteration to the next: * * The matrix A: * The matrix A does not hold the same values that it would * in an unblocked code nor the values that it would hold in * in a blocked code. * * The value of A is confusing. It is easiest to state the * difference between trueA and A at the point that MVR2 is called, * so we will start there. * * Let trueA be the value that A would * have at a given point in an unblocked code and A * be the value that A has in this code at the same point. * * At the time of the call to MVR2, * trueA = A + V' * H + H' * V * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ). * * At the bottom of the inner loop, * trueA = A + V' * H + H' * V + v' * h + h' * v * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ) and * v = V( liip1:N, BINDEX+1 ) and * h = H( liip1:N, BINDEX+1 ) * * At the top of the loop, BINDEX gets incremented, hence: * trueA = A + V' * H + H' * V + v' * h + h' * v * where H = H( MAXINDEX:N, 1:BINDEX-1 ) and * V = V( MAXINDEX:N, 1:BINDEX-1 ) and * v = V( liip1:N, BINDEX ) and * h = H( liip1:N, BINDEX ) * * * A gets updated at the bottom of the outer loop * After this update, trueA = A + v' * h + h' * v * where v = V( liip1:N, BINDEX ) and * h = H( liip1:N, BINDEX ) and BINDEX = 0 * Indeed, the previous loop invariant as stated above for the * top of the loop still holds, but with BINDEX = 0, H and V * are null matrices. * * After the current column of A is updated, * trueA( INDEX, INDEX:N ) = A( INDEX, INDEX:N ) * the rest of A is untouched. * * After the current block column of A is updated, * trueA = A + V' * H + H' * V * where H = H( MAXINDEX:N, 1:BINDEX ) and * V = V( MAXINDEX:N, 1:BINDEX ) * * This brings us back to the point at which mvr2 is called. * * * Details of the parallelization: * * We delay spreading v across to all processor columns (which * would naturally happen at the bottom of the loop) in order to * combine the spread of v( : , i-1 ) with the spread of h( : , i ) * * In order to compute h( :, i ), we must update A( :, i ) * which means that the processor column owning A( :, i ) must * have: c, tau, v( i, i ) and h( i, i ). * * The traditional * way of computing v (and the one used in pzlatrd.f and * zlatrd.f) is: * v = tau * v * c = v' * h * alpha = - tau * c / 2 * v = v + alpha * h * However, the traditional way of computing v requires that tau * be broadcast to all processors in the current column (to compute * v = tau * v) and then a sum-to-all is required (to * compute v' * h ). We use the following formula instead: * c = v' * h * v = tau * ( v - c * tau' * h / 2 ) * The above formula allows tau to be spread down in the * same call to DGSUM2D which performs the sum-to-all of c. * * The computation of v, which could be performed in any processor * column (or other procesor subsets), is performed in the * processor column that owns A( :, i+1 ) so that A( :, i+1 ) * can be updated prior to spreading v across. * * We keep the block column of A up-to-date to minimize the * work required in updating the current column of A. Updating * the block column of A is reasonably load balanced whereas * updating the current column of A is not (only the current * processor column is involved). * * In the following overview of the steps performed, M in the * margin indicates message traffic and C indicates O(n^2 nb/sqrt(p)) * or more flops per processor. * * Inner loop: * A( index:n, index ) -= ( v * ht(bindex) + h * vt( bindex) ) *M h = house( A(index:n, index) ) *M Spread v, h across *M vt = v^T; ht = h^T * A( index+1:n, index+1:maxindex ) -= * ( v * ht(index+1:maxindex) + h *vt(index+1:maxindex) ) *C v = tril(A) * h; vt = ht * tril(A,-1) *MorC v = v - H*V*h - V*H*h *M v = v + vt^T *M c = v' * h * v = tau * ( v - c * tau' * h / 2 ) *C A = A - H*V - V*H * * * * ================================================================= * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D0 ) COMPLEX*16 Z_ONE, Z_NEGONE, Z_ZERO PARAMETER ( Z_ONE = 1.0D0, Z_NEGONE = -1.0D0, $ Z_ZERO = 0.0D0 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * * * .. Local Scalars .. * * LOGICAL BALANCED, INTERLEAVE, TWOGEMMS, UPPER INTEGER ANB, BINDEX, CURCOL, CURROW, I, ICTXT, INDEX, $ INDEXA, INDEXINH, INDEXINV, INH, INHB, INHT, $ INHTB, INTMP, INV, INVB, INVT, INVTB, J, LDA, $ LDV, LDZG, LII, LIIB, LIIP1, LIJ, LIJB, LIJP1, $ LTLIP1, LTNM1, LWMIN, MAXINDEX, MININDEX, $ MYCOL, MYFIRSTROW, MYROW, MYSETNUM, NBZG, NP, $ NPB, NPCOL, NPM0, NPM1, NPROW, NPS, NPSET, NQ, $ NQB, NQM1, NUMROWS, NXTCOL, NXTROW, PBMAX, $ PBMIN, PBSIZE, PNB, ROWSPERPROC DOUBLE PRECISION NORM, SAFMAX, SAFMIN COMPLEX*16 ALPHA, BETA, C, CONJTOPH, CONJTOPV, $ ONEOVERBETA, TOPH, TOPNV, TOPTAU, TOPV * .. * .. Local Arrays .. * * * * INTEGER IDUM1( 1 ), IDUM2( 1 ) DOUBLE PRECISION DTMP( 5 ) COMPLEX*16 CC( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DCOMBNRM2, DGEBR2D, $ DGEBS2D, DGSUM2D, PCHK1MAT, PDTREECOMB, $ PXERBLA, ZGEBR2D, ZGEBS2D, ZGEMM, ZGEMV, $ ZGERV2D, ZGESD2D, ZGSUM2D, ZLAMOV, ZSCAL, $ ZTRMVT * .. * .. External Functions .. * LOGICAL LSAME INTEGER ICEIL, NUMROC, PJLAENV DOUBLE PRECISION DZNRM2, PDLAMCH EXTERNAL LSAME, ICEIL, NUMROC, PJLAENV, DZNRM2, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, DIMAG, ICHAR, MAX, MIN, $ MOD, SIGN, SQRT * .. * * * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * * * Further details * =============== * * At the top of the loop, v and nh have been computed but not * spread across. Hence, A is out-of-date even after the * rank 2k update. Furthermore, we compute the next v before * nh is spread across. * * I claim that if we used a sum-to-all on NV, by summing CC within * each column, that we could compute NV locally and could avoid * spreading V across. Bruce claims that sum-to-all can be made * to cost no more than sum-to-one on the Paragon. If that is * true, this would be a win. But, * the BLACS sum-to-all is just a sum-to-one followed by a broadcast, * and hence the present scheme is better for now. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * SAFMAX = SQRT( PDLAMCH( ICTXT, 'O' ) ) / N SAFMIN = SQRT( PDLAMCH( ICTXT, 'S' ) ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 600+CTXT_ ) ELSE * * Here we set execution options for PZHETTRD * PNB = PJLAENV( ICTXT, 2, 'PZHETTRD', 'L', 0, 0, 0, 0 ) ANB = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 ) * INTERLEAVE = ( PJLAENV( ICTXT, 4, 'PZHETTRD', 'L', 1, 0, 0, $ 0 ).EQ.1 ) TWOGEMMS = ( PJLAENV( ICTXT, 4, 'PZHETTRD', 'L', 2, 0, 0, $ 0 ).EQ.1 ) BALANCED = ( PJLAENV( ICTXT, 4, 'PZHETTRD', 'L', 3, 0, 0, $ 0 ).EQ.1 ) * CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) * * UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 .AND. DESCA( NB_ ).NE.1 ) $ INFO = 600 + NB_ IF( INFO.EQ.0 ) THEN * * * Here is the arithmetic: * Let maxnpq = max( np, nq, 2 * ANB ) * LDV = 4 * max( np, nq ) + 2 * LWMIN = 2 * ( ANB + 1 ) * LDV + MAX( np, 2 * ANB ) * = 2 * ( ANB + 1 ) * ( 4 * NPS + 2 ) + NPS * * This overestimates memory requirements when ANB > NP/2 * Memory requirements are lower when interleave = .false. * Hence, we could have two sets of memory requirements, * one for interleave and one for * * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS * WORK( 1 ) = DCMPLX( LWMIN ) IF( .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IA.NE.1 ) THEN INFO = -4 ELSE IF( JA.NE.1 ) THEN INFO = -5 ELSE IF( NPROW.NE.NPCOL ) THEN INFO = -( 600+CTXT_ ) ELSE IF( DESCA( DTYPE_ ).NE.1 ) THEN INFO = -( 600+DTYPE_ ) ELSE IF( DESCA( MB_ ).NE.1 ) THEN INFO = -( 600+MB_ ) ELSE IF( DESCA( NB_ ).NE.1 ) THEN INFO = -( 600+NB_ ) ELSE IF( DESCA( RSRC_ ).NE.0 ) THEN INFO = -( 600+RSRC_ ) ELSE IF( DESCA( CSRC_ ).NE.0 ) THEN INFO = -( 600+CSRC_ ) ELSE IF( LWORK.LT.LWMIN ) THEN INFO = -11 END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 * CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHETTRD', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * * Reduce the lower triangle of sub( A ) NP = NUMROC( N, 1, MYROW, 0, NPROW ) NQ = NUMROC( N, 1, MYCOL, 0, NPCOL ) * NXTROW = 0 NXTCOL = 0 * LIIP1 = 1 LIJP1 = 1 NPM1 = NP NQM1 = NQ * LDA = DESCA( LLD_ ) ICTXT = DESCA( CTXT_ ) * * * * Miscellaneous details: * Put tau, D and E in the right places * Check signs * Place all the arrays in WORK, control their placement * in memory. * * * * Loop invariants * A(LIIP1, LIJ) points to the first element of A(I+1,J) * NPM1,NQM1 = the number of rows, cols in A( LII+1:N,LIJ+1:N ) * A(LII:N,LIJ:N) is one step out of date. * proc( CURROW, CURCOL ) owns A(LII,LIJ) * proc( NXTROW, CURCOL ) owns A(LIIP1,LIJ) * INH = 1 * IF( INTERLEAVE ) THEN * * H and V are interleaved to minimize memory movement * LDV has to be twice as large to accomodate interleaving. * In addition, LDV is doubled again to allow v, h and * toptau to be spreaad across and transposed in a * single communication operation with minimum memory * movement. * * We could reduce LDV back to 2*MAX(NPM1,NQM1) * by increasing the memory movement required in * the spread and transpose of v, h and toptau. * However, since the non-interleaved path already * provides a mear minimum memory requirement option, * we did not provide this additional path. * LDV = 4*( MAX( NPM1, NQM1 ) ) + 2 * INH = 1 * INV = INH + LDV / 2 INVT = INH + ( ANB+1 )*LDV * INHT = INVT + LDV / 2 INTMP = INVT + LDV*( ANB+1 ) * ELSE LDV = MAX( NPM1, NQM1 ) * INHT = INH + LDV*( ANB+1 ) INV = INHT + LDV*( ANB+1 ) * * The code works without this +1, but only because of a * coincidence. Without the +1, WORK(INVT) gets trashed, but * WORK(INVT) is only used once and when it is used, it is * multiplied by WORK( INH ) which is zero. Hence, the fact * that WORK(INVT) is trashed has no effect. * INVT = INV + LDV*( ANB+1 ) + 1 INTMP = INVT + LDV*( 2*ANB ) * END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZHETTRD', -INFO ) WORK( 1 ) = DCMPLX( LWMIN ) RETURN END IF * * * The satisfies the loop invariant: trueA = A - V * HT - H * VT, * (where V, H, VT and HT all have BINDEX+1 rows/columns) * the first ANB times through the loop. * * * * Setting either ( InH and InHT ) or InV to Z_ZERO * is adequate except in the face of NaNs. * * DO 10 I = 1, NP WORK( INH+I-1 ) = Z_ZERO WORK( INV+I-1 ) = Z_ZERO 10 CONTINUE DO 20 I = 1, NQ WORK( INHT+I-1 ) = Z_ZERO 20 CONTINUE * * * TOPNV = Z_ZERO * LTLIP1 = LIJP1 LTNM1 = NPM1 IF( MYCOL.GT.MYROW ) THEN LTLIP1 = LTLIP1 + 1 LTNM1 = LTNM1 - 1 END IF * * DO 210 MININDEX = 1, N - 1, ANB * * MAXINDEX = MIN( MININDEX+ANB-1, N ) LIJB = NUMROC( MAXINDEX, 1, MYCOL, 0, NPCOL ) + 1 LIIB = NUMROC( MAXINDEX, 1, MYROW, 0, NPROW ) + 1 * NQB = NQ - LIJB + 1 NPB = NP - LIIB + 1 INHTB = INHT + LIJB - 1 INVTB = INVT + LIJB - 1 INHB = INH + LIIB - 1 INVB = INV + LIIB - 1 * * * * DO 160 INDEX = MININDEX, MIN( MAXINDEX, N-1 ) * BINDEX = INDEX - MININDEX * CURROW = NXTROW CURCOL = NXTCOL * NXTROW = MOD( CURROW+1, NPROW ) NXTCOL = MOD( CURCOL+1, NPCOL ) * LII = LIIP1 LIJ = LIJP1 NPM0 = NPM1 * IF( MYROW.EQ.CURROW ) THEN NPM1 = NPM1 - 1 LIIP1 = LIIP1 + 1 END IF IF( MYCOL.EQ.CURCOL ) THEN NQM1 = NQM1 - 1 LIJP1 = LIJP1 + 1 LTLIP1 = LTLIP1 + 1 LTNM1 = LTNM1 - 1 END IF * * * * * V = NV, VT = NVT, H = NH, HT = NHT * * * Update the current column of A * * IF( MYCOL.EQ.CURCOL ) THEN * INDEXA = LII + ( LIJ-1 )*LDA INDEXINV = INV + LII - 1 + ( BINDEX-1 )*LDV INDEXINH = INH + LII - 1 + ( BINDEX-1 )*LDV CONJTOPH = DCONJG( WORK( INHT+LIJ-1+BINDEX*LDV ) ) CONJTOPV = DCONJG( TOPNV ) * IF( INDEX.GT.1 ) THEN DO 30 I = 0, NPM0 - 1 * A( INDEXA+I ) = A( INDEXA+I ) A( INDEXA+I ) = A( INDEXA+I ) - $ WORK( INDEXINV+LDV+I )*CONJTOPH - $ WORK( INDEXINH+LDV+I )*CONJTOPV 30 CONTINUE END IF * * END IF * * IF( MYCOL.EQ.CURCOL ) THEN * * Compute the householder vector * IF( MYROW.EQ.CURROW ) THEN DTMP( 2 ) = DBLE( A( LII+( LIJ-1 )*LDA ) ) ELSE DTMP( 2 ) = ZERO END IF IF( MYROW.EQ.NXTROW ) THEN DTMP( 3 ) = DBLE( A( LIIP1+( LIJ-1 )*LDA ) ) DTMP( 4 ) = DIMAG( A( LIIP1+( LIJ-1 )*LDA ) ) ELSE DTMP( 3 ) = ZERO DTMP( 4 ) = ZERO END IF * NORM = DZNRM2( NPM1, A( LIIP1+( LIJ-1 )*LDA ), 1 ) DTMP( 1 ) = NORM * * IF DTMP(5) = 1.0, NORM is too large and might cause * overflow, hence PDTREECOMB must be called. IF DTMP(5) * is zero on output, DTMP(1) can be trusted. * DTMP( 5 ) = ZERO IF( DTMP( 1 ).GE.SAFMAX .OR. DTMP( 1 ).LT.SAFMIN ) THEN DTMP( 5 ) = ONE DTMP( 1 ) = ZERO END IF * DTMP( 1 ) = DTMP( 1 )*DTMP( 1 ) CALL DGSUM2D( ICTXT, 'C', ' ', 5, 1, DTMP, 5, -1, $ CURCOL ) IF( DTMP( 5 ).EQ.ZERO ) THEN DTMP( 1 ) = SQRT( DTMP( 1 ) ) ELSE DTMP( 1 ) = NORM CALL PDTREECOMB( ICTXT, 'C', 1, DTMP, -1, MYCOL, $ DCOMBNRM2 ) END IF * NORM = DTMP( 1 ) * D( LIJ ) = DTMP( 2 ) IF( MYROW.EQ.CURROW .AND. MYCOL.EQ.CURCOL ) THEN A( LII+( LIJ-1 )*LDA ) = DCMPLX( D( LIJ ), ZERO ) END IF * * ALPHA = DCMPLX( DTMP( 3 ), DTMP( 4 ) ) * NORM = SIGN( NORM, DBLE( ALPHA ) ) * IF( NORM.EQ.ZERO ) THEN TOPTAU = ZERO ELSE BETA = NORM + ALPHA TOPTAU = BETA / NORM ONEOVERBETA = 1.0D0 / BETA * CALL ZSCAL( NPM1, ONEOVERBETA, $ A( LIIP1+( LIJ-1 )*LDA ), 1 ) END IF * IF( MYROW.EQ.NXTROW ) THEN A( LIIP1+( LIJ-1 )*LDA ) = Z_ONE END IF * TAU( LIJ ) = TOPTAU E( LIJ ) = -NORM * END IF * * * Spread v, nh, toptau across * DO 40 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+I ) = A( LIIP1+I+ $ ( LIJ-1 )*LDA ) 40 CONTINUE * IF( MYCOL.EQ.CURCOL ) THEN WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 ) = TOPTAU CALL ZGEBS2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), $ NPM1+NPM1+1 ) ELSE CALL ZGEBR2D( ICTXT, 'R', ' ', NPM1+NPM1+1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), $ NPM1+NPM1+1, MYROW, CURCOL ) TOPTAU = WORK( INV+LIIP1-1+BINDEX*LDV+NPM1+NPM1 ) END IF DO 50 I = 0, NPM1 - 1 WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1- $ 1+BINDEX*LDV+NPM1+I ) 50 CONTINUE * IF( INDEX.LT.N ) THEN IF( MYROW.EQ.NXTROW .AND. MYCOL.EQ.CURCOL ) $ A( LIIP1+( LIJ-1 )*LDA ) = E( LIJ ) END IF * * Transpose v, nh * * IF( MYROW.EQ.MYCOL ) THEN DO 60 I = 0, NPM1 + NPM1 WORK( INVT+LIJP1-1+BINDEX*LDV+I ) = WORK( INV+LIIP1-1+ $ BINDEX*LDV+I ) 60 CONTINUE ELSE CALL ZGESD2D( ICTXT, NPM1+NPM1, 1, $ WORK( INV+LIIP1-1+BINDEX*LDV ), NPM1+NPM1, $ MYCOL, MYROW ) CALL ZGERV2D( ICTXT, NQM1+NQM1, 1, $ WORK( INVT+LIJP1-1+BINDEX*LDV ), NQM1+NQM1, $ MYCOL, MYROW ) END IF * DO 70 I = 0, NQM1 - 1 WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV+I ) = WORK( INVT+ $ LIJP1-1+BINDEX*LDV+NQM1+I ) 70 CONTINUE * * * Update the current block column of A * IF( INDEX.GT.1 ) THEN DO 90 J = LIJP1, LIJB - 1 DO 80 I = 0, NPM1 - 1 * A( LIIP1+I+( J-1 )*LDA ) = A( LIIP1+I+( J-1 )*LDA ) $ - WORK( INV+LIIP1-1+BINDEX*LDV+I )* $ DCONJG( WORK( INHT+J-1+BINDEX*LDV ) ) - $ WORK( INH+LIIP1-1+BINDEX*LDV+I )* $ DCONJG( WORK( INVT+J-1+BINDEX*LDV ) ) 80 CONTINUE 90 CONTINUE END IF * * * * Compute NV = A * NHT; NVT = A * NH * * These two lines are necessary because these elements * are not always involved in the calls to ZTRMVT * for two reasons: * 1) On diagonal processors, the call to TRMVT * involves only LTNM1-1 elements * 2) On some processes, NQM1 < LTM1 or LIIP1 < LTLIP1 * and when the results are combined across all processes, * uninitialized values may be included. WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ) = Z_ZERO WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+NQM1-1 ) = Z_ZERO * * IF( MYROW.EQ.MYCOL ) THEN IF( LTNM1.GT.1 ) THEN CALL ZTRMVT( 'L', LTNM1-1, $ A( LTLIP1+1+( LIJP1-1 )*LDA ), LDA, $ WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ), 1, $ WORK( INH+LTLIP1+1-1+( BINDEX+1 )*LDV ), $ 1, WORK( INV+LTLIP1+1-1+( BINDEX+1 )* $ LDV ), 1, WORK( INHT+LIJP1-1+( BINDEX+ $ 1 )*LDV ), 1 ) END IF DO 100 I = 1, LTNM1 WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV ) $ = WORK( INVT+LIJP1+I-1-1+( BINDEX+1 )*LDV ) + $ A( LTLIP1+I-1+( LIJP1+I-1-1 )*LDA )* $ WORK( INH+LTLIP1+I-1-1+( BINDEX+1 )*LDV ) 100 CONTINUE ELSE IF( LTNM1.GT.0 ) $ CALL ZTRMVT( 'L', LTNM1, A( LTLIP1+( LIJP1-1 )*LDA ), $ LDA, WORK( INVT+LIJP1-1+( BINDEX+1 )* $ LDV ), 1, WORK( INH+LTLIP1-1+( BINDEX+ $ 1 )*LDV ), 1, WORK( INV+LTLIP1-1+ $ ( BINDEX+1 )*LDV ), 1, $ WORK( INHT+LIJP1-1+( BINDEX+1 )*LDV ), $ 1 ) * END IF * * * We take advantage of the fact that: * A * sum( B ) = sum ( A * B ) for matrices A,B * * trueA = A + V * HT + H * VT * hence: (trueA)v = Av' + V * HT * v + H * VT * v * VT * v = sum_p_in_NPROW ( VTp * v ) * H * VT * v = H * sum (VTp * v) = sum ( H * VTp * v ) * * v = v + V * HT * h + H * VT * h * * * * tmp = HT * nh1 DO 110 I = 1, 2*( BINDEX+1 ) WORK( INTMP-1+I ) = 0 110 CONTINUE * IF( BALANCED ) THEN NPSET = NPROW MYSETNUM = MYROW ROWSPERPROC = ICEIL( NQB, NPSET ) MYFIRSTROW = MIN( NQB+1, 1+ROWSPERPROC*MYSETNUM ) NUMROWS = MIN( ROWSPERPROC, NQB-MYFIRSTROW+1 ) * * * tmp = HT * v * CALL ZGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE, $ WORK( INHTB+MYFIRSTROW-1 ), LDV, $ WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ), $ 1, Z_ZERO, WORK( INTMP ), 1 ) * tmp2 = VT * v CALL ZGEMV( 'C', NUMROWS, BINDEX+1, Z_ONE, $ WORK( INVTB+MYFIRSTROW-1 ), LDV, $ WORK( INHTB+MYFIRSTROW-1+( BINDEX+1 )*LDV ), $ 1, Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 ) * * CALL ZGSUM2D( ICTXT, 'C', ' ', 2*( BINDEX+1 ), 1, $ WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 ) ELSE * tmp = HT * v * CALL ZGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INHTB ), $ LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1, $ Z_ZERO, WORK( INTMP ), 1 ) * tmp2 = VT * v CALL ZGEMV( 'C', NQB, BINDEX+1, Z_ONE, WORK( INVTB ), $ LDV, WORK( INHTB+( BINDEX+1 )*LDV ), 1, $ Z_ZERO, WORK( INTMP+BINDEX+1 ), 1 ) * END IF * * * IF( BALANCED ) THEN MYSETNUM = MYCOL * ROWSPERPROC = ICEIL( NPB, NPSET ) MYFIRSTROW = MIN( NPB+1, 1+ROWSPERPROC*MYSETNUM ) NUMROWS = MIN( ROWSPERPROC, NPB-MYFIRSTROW+1 ) * CALL ZGSUM2D( ICTXT, 'R', ' ', 2*( BINDEX+1 ), 1, $ WORK( INTMP ), 2*( BINDEX+1 ), -1, -1 ) * * * v = v + V * tmp IF( INDEX.GT.1. ) THEN CALL ZGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE, $ WORK( INVB+MYFIRSTROW-1 ), LDV, $ WORK( INTMP ), 1, Z_ONE, $ WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )* $ LDV ), 1 ) * * v = v + H * tmp2 CALL ZGEMV( 'N', NUMROWS, BINDEX+1, Z_NEGONE, $ WORK( INHB+MYFIRSTROW-1 ), LDV, $ WORK( INTMP+BINDEX+1 ), 1, Z_ONE, $ WORK( INVB+MYFIRSTROW-1+( BINDEX+1 )* $ LDV ), 1 ) END IF * ELSE * v = v + V * tmp CALL ZGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INVB ), $ LDV, WORK( INTMP ), 1, Z_ONE, $ WORK( INVB+( BINDEX+1 )*LDV ), 1 ) * * * v = v + H * tmp2 CALL ZGEMV( 'N', NPB, BINDEX+1, Z_NEGONE, WORK( INHB ), $ LDV, WORK( INTMP+BINDEX+1 ), 1, Z_ONE, $ WORK( INVB+( BINDEX+1 )*LDV ), 1 ) * END IF * * * Transpose NV and add it back into NVT * IF( MYROW.EQ.MYCOL ) THEN DO 120 I = 0, NQM1 - 1 WORK( INTMP+I ) = WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV+ $ I ) 120 CONTINUE ELSE CALL ZGESD2D( ICTXT, NQM1, 1, $ WORK( INVT+LIJP1-1+( BINDEX+1 )*LDV ), $ NQM1, MYCOL, MYROW ) CALL ZGERV2D( ICTXT, NPM1, 1, WORK( INTMP ), NPM1, MYCOL, $ MYROW ) * END IF DO 130 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = WORK( INV+LIIP1- $ 1+( BINDEX+1 )*LDV+I ) + WORK( INTMP+I ) 130 CONTINUE * * Sum-to-one NV rowwise (within a row) * CALL ZGSUM2D( ICTXT, 'R', ' ', NPM1, 1, $ WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ), NPM1, $ MYROW, NXTCOL ) * * * Dot product c = NV * NH * Sum-to-all c within next processor column * * IF( MYCOL.EQ.NXTCOL ) THEN CC( 1 ) = Z_ZERO DO 140 I = 0, NPM1 - 1 CC( 1 ) = CC( 1 ) + DCONJG( WORK( INV+LIIP1-1+ $ ( BINDEX+1 )*LDV+I ) )* $ WORK( INH+LIIP1-1+( BINDEX+1 )*LDV+I ) 140 CONTINUE IF( MYROW.EQ.NXTROW ) THEN CC( 2 ) = WORK( INV+LIIP1-1+( BINDEX+1 )*LDV ) CC( 3 ) = WORK( INH+LIIP1-1+( BINDEX+1 )*LDV ) ELSE CC( 2 ) = Z_ZERO CC( 3 ) = Z_ZERO END IF CALL ZGSUM2D( ICTXT, 'C', ' ', 3, 1, CC, 3, -1, NXTCOL ) * TOPV = CC( 2 ) C = CC( 1 ) TOPH = CC( 3 ) * TOPNV = TOPTAU*( TOPV-C*DCONJG( TOPTAU ) / 2*TOPH ) * * * Compute V = Tau * (V - C * Tau' / 2 * H ) * * DO 150 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I ) = TOPTAU* $ ( WORK( INV+LIIP1-1+( BINDEX+1 )*LDV+I )-C* $ DCONJG( TOPTAU ) / 2*WORK( INH+LIIP1-1+( BINDEX+ $ 1 )*LDV+I ) ) 150 CONTINUE * END IF * * 160 CONTINUE * * * Perform the rank2k update * IF( MAXINDEX.LT.N ) THEN * DO 170 I = 0, NPM1 - 1 WORK( INTMP+I ) = WORK( INH+LIIP1-1+ANB*LDV+I ) 170 CONTINUE * * * IF( .NOT.TWOGEMMS ) THEN IF( INTERLEAVE ) THEN LDZG = LDV / 2 ELSE CALL ZLAMOV( 'A', LTNM1, ANB, WORK( INHT+LIJP1-1 ), $ LDV, WORK( INVT+LIJP1-1+ANB*LDV ), LDV ) * CALL ZLAMOV( 'A', LTNM1, ANB, WORK( INV+LTLIP1-1 ), $ LDV, WORK( INH+LTLIP1-1+ANB*LDV ), LDV ) LDZG = LDV END IF NBZG = ANB*2 ELSE LDZG = LDV NBZG = ANB END IF * * DO 180 PBMIN = 1, LTNM1, PNB * PBSIZE = MIN( PNB, LTNM1-PBMIN+1 ) PBMAX = MIN( LTNM1, PBMIN+PNB-1 ) CALL ZGEMM( 'N', 'C', PBSIZE, PBMAX, NBZG, Z_NEGONE, $ WORK( INH+LTLIP1-1+PBMIN-1 ), LDZG, $ WORK( INVT+LIJP1-1 ), LDZG, Z_ONE, $ A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA ) IF( TWOGEMMS ) THEN CALL ZGEMM( 'N', 'C', PBSIZE, PBMAX, ANB, Z_NEGONE, $ WORK( INV+LTLIP1-1+PBMIN-1 ), LDZG, $ WORK( INHT+LIJP1-1 ), LDZG, Z_ONE, $ A( LTLIP1+PBMIN-1+( LIJP1-1 )*LDA ), LDA ) END IF 180 CONTINUE * * * DO 190 I = 0, NPM1 - 1 WORK( INV+LIIP1-1+I ) = WORK( INV+LIIP1-1+ANB*LDV+I ) WORK( INH+LIIP1-1+I ) = WORK( INTMP+I ) 190 CONTINUE DO 200 I = 0, NQM1 - 1 WORK( INHT+LIJP1-1+I ) = WORK( INHT+LIJP1-1+ANB*LDV+I ) 200 CONTINUE * * END IF * * End of the update A code * 210 CONTINUE * IF( MYCOL.EQ.NXTCOL ) THEN IF( MYROW.EQ.NXTROW ) THEN * D( NQ ) = DBLE( A( NP+( NQ-1 )*LDA ) ) A( NP+( NQ-1 )*LDA ) = D( NQ ) * CALL DGEBS2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1 ) ELSE CALL DGEBR2D( ICTXT, 'C', ' ', 1, 1, D( NQ ), 1, NXTROW, $ NXTCOL ) END IF END IF * * * * WORK( 1 ) = DCMPLX( LWMIN ) RETURN * * End of PZHETTRD * * END scalapack-2.0.2/SRC/pzlabrd.f000644 000766 000024 00000055423 10363532303 016154 0ustar00juliestaff000000 000000 SUBROUTINE PZLABRD( M, N, NB, A, IA, JA, DESCA, D, E, TAUQ, TAUP, $ X, IX, JX, DESCX, Y, IY, JY, DESCY, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, IX, IY, JA, JX, JY, M, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION D( * ), E( * ) COMPLEX*16 A( * ), TAUP( * ), TAUQ( * ), X( * ), Y( * ), $ WORK( * ) * .. * * Purpose * ======= * * PZLABRD reduces the first NB rows and columns of a complex general * M-by-N distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper * or lower bidiagonal form by an unitary transformation Q' * A * P, and * returns the matrices X and Y which are needed to apply the transfor- * mation to the unreduced part of sub( A ). * * If M >= N, sub( A ) is reduced to upper bidiagonal form; if M < N, to * lower bidiagonal form. * * This is an auxiliary routine called by PZGEBRD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * NB (global input) INTEGER * The number of leading rows and columns of sub( A ) to be * reduced. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * general distributed matrix sub( A ) to be reduced. On exit, * the first NB rows and columns of the matrix are overwritten; * the rest of the distributed matrix sub( A ) is unchanged. * If m >= n, elements on and below the diagonal in the first NB * columns, with the array TAUQ, represent the unitary * matrix Q as a product of elementary reflectors; and * elements above the diagonal in the first NB rows, with the * array TAUP, represent the unitary matrix P as a product * of elementary reflectors. * If m < n, elements below the diagonal in the first NB * columns, with the array TAUQ, represent the unitary * matrix Q as a product of elementary reflectors, and * elements on and above the diagonal in the first NB rows, * with the array TAUP, represent the unitary matrix P as * a product of elementary reflectors. * See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-1) otherwise. * The distributed diagonal elements of the bidiagonal matrix * B: D(i) = A(ia+i-1,ja+i-1). D is tied to the distributed * matrix A. * * E (local output) DOUBLE PRECISION array, dimension * LOCr(IA+MIN(M,N)-1) if M >= N; LOCc(JA+MIN(M,N)-2) otherwise. * The distributed off-diagonal elements of the bidiagonal * distributed matrix B: * if m >= n, E(i) = A(ia+i-1,ja+i) for i = 1,2,...,n-1; * if m < n, E(i) = A(ia+i,ja+i-1) for i = 1,2,...,m-1. * E is tied to the distributed matrix A. * * TAUQ (local output) COMPLEX*16 array dimension * LOCc(JA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix Q. TAUQ is * tied to the distributed matrix A. See Further Details. * * TAUP (local output) COMPLEX*16 array, dimension * LOCr(IA+MIN(M,N)-1). The scalar factors of the elementary * reflectors which represent the unitary matrix P. TAUP is * tied to the distributed matrix A. See Further Details. * * X (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_X,NB). On exit, the local * pieces of the distributed M-by-NB matrix * X(IX:IX+M-1,JX:JX+NB-1) required to update the unreduced * part of sub( A ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * Y (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_Y,NB). On exit, the local * pieces of the distributed N-by-NB matrix * Y(IY:IY+N-1,JY:JY+NB-1) required to update the unreduced * part of sub( A ). * * IY (global input) INTEGER * The row index in the global array Y indicating the first * row of sub( Y ). * * JY (global input) INTEGER * The column index in the global array Y indicating the * first column of sub( Y ). * * DESCY (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Y. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK >= NB_A + NQ, with * * NQ = NUMROC( N+MOD( IA-1, NB_Y ), NB_Y, MYCOL, IACOL, NPCOL ) * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ) * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Further Details * =============== * * The matrices Q and P are represented as products of elementary * reflectors: * * Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) * * Each H(i) and G(i) has the form: * * H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' * * where tauq and taup are complex scalars, and v and u are complex * vectors. * * If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in * A(ia+i-1:ia+m-1,ja+i-1); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is * stored on exit in A(ia+i-1,ja+i:ja+n-1); tauq is stored in * TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in * A(ia+i+1:ia+m-1,ja+i-1); u(1:i-1) = 0, u(i) = 1, and u(i:n) is * stored on exit in A(ia+i-1,ja+i:ja+n-1); tauq is stored in * TAUQ(ja+i-1) and taup in TAUP(ia+i-1). * * The elements of the vectors v and u together form the m-by-nb matrix * V and the nb-by-n matrix U' which are needed, with X and Y, to apply * the transformation to the unreduced part of the matrix, using a block * update of the form: sub( A ) := sub( A ) - V*Y' - X*U'. * * The contents of sub( A ) on exit are illustrated by the following * examples with nb = 2: * * m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): * * ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) * ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) * ( v1 v2 a a a ) ( v1 1 a a a a ) * ( v1 v2 a a a ) ( v1 v2 a a a a ) * ( v1 v2 a a a ) ( v1 v2 a a a a ) * ( v1 v2 a a a ) * * where a denotes an element of the original matrix which is unchanged, * vi denotes an element of the vector defining H(i), and ui an element * of the vector defining G(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, IPY, IW, J, JJ, $ JWY, K, MYCOL, MYROW, NPCOL, NPROW COMPLEX*16 ALPHA, TAU INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), $ DESCTP( DLEN_ ), DESCTQ( DLEN_ ), $ DESCW( DLEN_ ), DESCWY( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PDELSET, $ PZCOPY, PZELGET, PZELSET, PZGEMV, $ PZLACGV, PZLARFG, PZSCAL * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) IPY = DESCA( MB_ ) + 1 IW = MOD( IA-1, DESCA( NB_ ) ) + 1 ALPHA = ZERO * CALL DESCSET( DESCWY, 1, N+MOD( IA-1, DESCY( NB_ ) ), 1, $ DESCA( NB_ ), IAROW, IACOL, ICTXT, 1 ) CALL DESCSET( DESCW, DESCA( MB_ ), 1, DESCA( MB_ ), 1, IAROW, $ IACOL, ICTXT, DESCA( MB_ ) ) CALL DESCSET( DESCTQ, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), IAROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCTP, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), IACOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) * IF( M.GE.N ) THEN * * Reduce to upper bidiagonal form * CALL DESCSET( DESCD, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) CALL DESCSET( DESCE, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) DO 10 K = 1, NB I = IA + K - 1 J = JA + K - 1 JWY = IW + K * * Update A(i:ia+m-1,j) * IF( K.GT.1 ) THEN CALL PZGEMV( 'No transpose', M-K+1, K-1, -ONE, A, I, JA, $ DESCA, Y, IY, JY+K-1, DESCY, 1, ONE, A, I, $ J, DESCA, 1 ) CALL PZGEMV( 'No transpose', M-K+1, K-1, -ONE, X, IX+K-1, $ JX, DESCX, A, IA, J, DESCA, 1, ONE, A, I, J, $ DESCA, 1 ) CALL PZELSET( A, I-1, J, DESCA, ALPHA ) END IF * * Generate reflection Q(i) to annihilate A(i+1:ia+m-1,j) * CALL PZLARFG( M-K+1, ALPHA, I, J, A, I+1, J, DESCA, 1, $ TAUQ ) CALL PDELSET( D, 1, J, DESCD, DBLE( ALPHA ) ) CALL PZELSET( A, I, J, DESCA, ONE ) * * Compute Y(IA+I:IA+N-1,J) * CALL PZGEMV( 'Conjugate transpose', M-K+1, N-K, ONE, A, I, $ J+1, DESCA, A, I, J, DESCA, 1, ZERO, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PZGEMV( 'Conjugate transpose', M-K+1, K-1, ONE, A, I, $ JA, DESCA, A, I, J, DESCA, 1, ZERO, WORK, IW, $ 1, DESCW, 1 ) CALL PZGEMV( 'Conjugate transpose', K-1, N-K, -ONE, Y, IY, $ JY+K, DESCY, WORK, IW, 1, DESCW, 1, ONE, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PZGEMV( 'Conjugate transpose', M-K+1, K-1, ONE, X, $ IX+K-1, JX, DESCX, A, I, J, DESCA, 1, ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PZGEMV( 'Conjugate transpose', K-1, N-K, -ONE, A, IA, $ J+1, DESCA, WORK, IW, 1, DESCW, 1, ONE, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) * CALL PZELGET( 'Rowwise', ' ', TAU, TAUQ, 1, J, DESCTQ ) CALL PZSCAL( N-K, TAU, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PZLACGV( N-K, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PZCOPY( N-K, WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ), $ Y, IY+K-1, JY+K, DESCY, DESCY( M_ ) ) * * Update A(i,j+1:ja+n-1) * CALL PZLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PZLACGV( K, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZGEMV( 'Conjugate transpose', K, N-K, -ONE, Y, IY, $ JY+K, DESCY, A, I, JA, DESCA, DESCA( M_ ), ONE, $ A, I, J+1, DESCA, DESCA( M_ ) ) CALL PZLACGV( K, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZLACGV( K-1, X, IX+K-1, JX, DESCX, DESCX( M_ ) ) CALL PZGEMV( 'Conjugate transpose', K-1, N-K, -ONE, A, IA, $ J+1, DESCA, X, IX+K-1, JX, DESCX, DESCX( M_ ), $ ONE, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PZLACGV( K-1, X, IX+K-1, JX, DESCX, DESCX( M_ ) ) CALL PZELSET( A, I, J, DESCA, DCMPLX( DBLE( ALPHA ) ) ) * * Generate reflection P(i) to annihilate A(i,j+2:ja+n-1) * CALL PZLARFG( N-K, ALPHA, I, J+1, A, I, $ MIN( J+2, N+JA-1 ), DESCA, DESCA( M_ ), TAUP ) CALL PDELSET( E, I, 1, DESCE, DBLE( ALPHA ) ) CALL PZELSET( A, I, J+1, DESCA, ONE ) * * Compute X(I+1:IA+M-1,J) * CALL PZGEMV( 'No transpose', M-K, N-K, ONE, A, I+1, J+1, $ DESCA, A, I, J+1, DESCA, DESCA( M_ ), ZERO, X, $ IX+K, JX+K-1, DESCX, 1 ) CALL PZGEMV( 'No transpose', K, N-K, ONE, Y, IY, JY+K, $ DESCY, A, I, J+1, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PZGEMV( 'No transpose', M-K, K, -ONE, A, I+1, JA, $ DESCA, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) CALL PZGEMV( 'No transpose', K-1, N-K, ONE, A, IA, J+1, $ DESCA, A, I, J+1, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PZGEMV( 'No transpose', M-K, K-1, -ONE, X, IX+K, JX, $ DESCX, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) * CALL PZELGET( 'Columnwise', ' ', TAU, TAUP, I, 1, DESCTP ) CALL PZSCAL( M-K, TAU, X, IX+K, JX+K-1, DESCX, 1 ) CALL PZLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) 10 CONTINUE * ELSE * * Reduce to lower bidiagonal form * CALL DESCSET( DESCD, IA+MIN(M,N)-1, 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, DESCA( CTXT_ ), $ DESCA( LLD_ ) ) CALL DESCSET( DESCE, 1, JA+MIN(M,N)-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) DO 20 K = 1, NB I = IA + K - 1 J = JA + K - 1 JWY = IW + K * * Update A(i,j:ja+n-1) * CALL PZLACGV( N-K+1, A, I, J, DESCA, DESCA( M_ ) ) IF( K.GT.1 ) THEN CALL PZLACGV( K-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZGEMV( 'Conjugate transpose', K-1, N-K+1, -ONE, Y, $ IY, JY+K-1, DESCY, A, I, JA, DESCA, $ DESCA( M_ ), ONE, A, I, J, DESCA, $ DESCA( M_ ) ) CALL PZLACGV( K-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZLACGV( K-1, X, IX+K-1, JX, DESCX, DESCX( M_ ) ) CALL PZGEMV( 'Conjugate transpose', K-1, N-K+1, -ONE, A, $ IA, J, DESCA, X, IX+K-1, JX, DESCX, $ DESCX( M_ ), ONE, A, I, J, DESCA, $ DESCA( M_ ) ) CALL PZLACGV( K-1, X, IX+K-1, JX, DESCX, DESCX( M_ ) ) CALL PZELSET( A, I, J-1, DESCA, DCMPLX( DBLE( ALPHA ) ) ) END IF * * Generate reflection P(i) to annihilate A(i,j+1:ja+n-1) * CALL PZLARFG( N-K+1, ALPHA, I, J, A, I, J+1, DESCA, $ DESCA( M_ ), TAUP ) CALL PDELSET( D, I, 1, DESCD, DBLE( ALPHA ) ) CALL PZELSET( A, I, J, DESCA, ONE ) * * Compute X(i+1:ia+m-1,j) * CALL PZGEMV( 'No transpose', M-K, N-K+1, ONE, A, I+1, J, $ DESCA, A, I, J, DESCA, DESCA( M_ ), ZERO, X, $ IX+K, JX+K-1, DESCX, 1 ) CALL PZGEMV( 'No transpose', K-1, N-K+1, ONE, Y, IY, JY+K-1, $ DESCY, A, I, J, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PZGEMV( 'No transpose', M-K, K-1, -ONE, A, I+1, JA, $ DESCA, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) CALL PZGEMV( 'No transpose', K-1, N-K+1, ONE, A, IA, J, $ DESCA, A, I, J, DESCA, DESCA( M_ ), ZERO, $ WORK, IW, 1, DESCW, 1 ) CALL PZGEMV( 'No transpose', M-K, K-1, -ONE, X, IX+K, JX, $ DESCX, WORK, IW, 1, DESCW, 1, ONE, X, IX+K, $ JX+K-1, DESCX, 1 ) * CALL PZELGET( 'Columnwise', ' ', TAU, TAUP, I, 1, DESCTP ) CALL PZSCAL( M-K, TAU, X, IX+K, JX+K-1, DESCX, 1 ) CALL PZLACGV( N-K+1, A, I, J, DESCA, DESCA( M_ ) ) * * Update A(i+1:ia+m-1,j) * CALL PZGEMV( 'No transpose', M-K, K-1, -ONE, A, I+1, JA, $ DESCA, Y, IY, JY+K-1, DESCY, 1, ONE, A, I+1, J, $ DESCA, 1 ) CALL PZGEMV( 'No transpose', M-K, K, -ONE, X, IX+K, JX, $ DESCX, A, IA, J, DESCA, 1, ONE, A, I+1, J, $ DESCA, 1 ) CALL PZELSET( A, I, J, DESCA, ALPHA ) * * Generate reflection Q(i) to annihilate A(i+2:ia+m-1,j) * CALL PZLARFG( M-K, ALPHA, I+1, J, A, MIN( I+2, M+IA-1 ), $ J, DESCA, 1, TAUQ ) CALL PDELSET( E, 1, J, DESCE, DBLE( ALPHA ) ) CALL PZELSET( A, I+1, J, DESCA, ONE ) * * Compute Y(ia+i:ia+n-1,j) * CALL PZGEMV( 'Conjugate transpose', M-K, N-K, ONE, A, I+1, $ J+1, DESCA, A, I+1, J, DESCA, 1, ZERO, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PZGEMV( 'Conjugate transpose', M-K, K-1, ONE, A, I+1, $ JA, DESCA, A, I+1, J, DESCA, 1, ZERO, WORK, IW, $ 1, DESCW, 1 ) CALL PZGEMV( 'Conjugate transpose', K-1, N-K, -ONE, Y, IY, $ JY+K, DESCY, WORK, IW, 1, DESCW, 1, ONE, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) CALL PZGEMV( 'Conjugate transpose', M-K, K, ONE, X, IX+K, $ JX, DESCX, A, I+1, J, DESCA, 1, ZERO, WORK, IW, $ 1, DESCW, 1 ) CALL PZGEMV( 'Conjugate transpose', K, N-K, -ONE, A, IA, $ J+1, DESCA, WORK, IW, 1, DESCW, 1, ONE, $ WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ) ) * CALL PZELGET( 'Rowwise', ' ', TAU, TAUQ, 1, J, DESCTQ ) CALL PZSCAL( N-K, TAU, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PZLACGV( N-K, WORK( IPY ), 1, JWY, DESCWY, $ DESCWY( M_ ) ) CALL PZCOPY( N-K, WORK( IPY ), 1, JWY, DESCWY, DESCWY( M_ ), $ Y, IY+K-1, JY+K, DESCY, DESCY( M_ ) ) 20 CONTINUE END IF * RETURN * * End of PZLABRD * END scalapack-2.0.2/SRC/pzlacgv.f000644 000766 000024 00000015412 10363532303 016156 0ustar00juliestaff000000 000000 SUBROUTINE PZLACGV( N, X, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INCX, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX*16 X( * ) * .. * * Purpose * ======= * * PZLACGV conjugates a complex vector of length N, sub( X ), where * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = DESCX( M_ ) and * X(IX:IX+N-1,JX) if INCX = 1, and * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vector sub( X ). * * X (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_X,*). * On entry the vector to be conjugated * x( i ) = X(IX+(JX-1)*M_X +(i-1)*INCX ), 1 <= i <= N. * On exit the conjugated vector. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICOFFX, ICTXT, IIX, IOFFX, IROFFX, IXCOL, $ IXROW, JJX, LDX, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DCONJG, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) * LDX = DESCX( LLD_ ) IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is rowwise distributed. * IF( MYROW.NE.IXROW ) $ RETURN ICOFFX = MOD( JX-1, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFFX, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ - ICOFFX * IF( NQ.GT.0 ) THEN IOFFX = IIX+(JJX-1)*LDX DO 10 I = 1, NQ X( IOFFX ) = DCONJG( X( IOFFX ) ) IOFFX = IOFFX + LDX 10 CONTINUE END IF * ELSE IF( INCX.EQ.1 ) THEN * * sub( X ) is columnwise distributed. * IF( MYCOL.NE.IXCOL ) $ RETURN IROFFX = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFFX, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFFX * IF( NP.GT.0 ) THEN IOFFX = IIX+(JJX-1)*LDX DO 20 I = IOFFX, IOFFX+NP-1 X( I ) = DCONJG( X( I ) ) 20 CONTINUE END IF * END IF * RETURN * * End of PZLACGV * END scalapack-2.0.2/SRC/pzlacon.f000644 000766 000024 00000032371 10363532303 016161 0ustar00juliestaff000000 000000 SUBROUTINE PZLACON( N, V, IV, JV, DESCV, X, IX, JX, DESCX, EST, $ KASE ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IV, IX, JV, JX, KASE, N DOUBLE PRECISION EST * .. * .. Array Arguments .. INTEGER DESCV( * ), DESCX( * ) COMPLEX*16 V( * ), X( * ) * .. * * Purpose * ======= * * PZLACON estimates the 1-norm of a square, complex distributed matrix * A. Reverse communication is used for evaluating matrix-vector * products. X and V are aligned with the distributed matrix A, this * information is implicitly contained within IV, IX, DESCV, and DESCX. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vectors V and X. N >= 0. * * V (local workspace) COMPLEX*16 pointer into the local * memory to an array of dimension LOCr(N+MOD(IV-1,MB_V)). On * the final return, V = A*W, where EST = norm(V)/norm(W) * (W is not returned). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * X (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension * LOCr(N+MOD(IX-1,MB_X)). On an intermediate return, X * should be overwritten by * A * X, if KASE=1, * A' * X, if KASE=2, * where A' is the conjugate transpose of A, and PZLACON must * be re-called with all the other parameters unchanged. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * * EST (global output) DOUBLE PRECISION * An estimate (a lower bound) for norm(A). * * KASE (local input/local output) INTEGER * On the initial call to PZLACON, KASE should be 0. * On an intermediate return, KASE will be 1 or 2, indicating * whether X should be overwritten by A * X or A' * X. * On the final return from PZLACON, KASE will again be 0. * * Further Details * =============== * * The serial version ZLACON has been contributed by Nick Higham, * University of Manchester. It was originally named SONEST, dated * March 16, 1988. * * Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of * a real or complex matrix, with applications to condition estimation", * ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) DOUBLE PRECISION ONE, TWO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, ICTXT, IIVX, IMAXROW, IOFFVX, IROFF, ITER, $ IVXCOL, IVXROW, J, JLAST, JJVX, JUMP, K, $ MYCOL, MYROW, NP, NPCOL, NPROW DOUBLE PRECISION ALTSGN, ESTOLD, SAFMIN, TEMP COMPLEX*16 JLMAX, XMAX * .. * .. Local Arrays .. COMPLEX*16 WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, DGEBR2D, $ DGEBS2D, PDZSUM1, PZELGET, $ PZMAX1, ZCOPY, ZGEBR2D, ZGEBS2D * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, INDXL2G, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL INDXG2L, INDXG2P, INDXL2G, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX * .. * .. Save statement .. SAVE * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIVX, JJVX, IVXROW, IVXCOL ) IF( MYCOL.NE.IVXCOL ) $ RETURN IROFF = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IVXROW, NPROW ) IF( MYROW.EQ.IVXROW ) $ NP = NP - IROFF IOFFVX = IIVX + (JJVX-1)*DESCX( LLD_ ) * SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) IF( KASE.EQ.0 ) THEN DO 10 I = IOFFVX, IOFFVX+NP-1 X( I ) = DCMPLX( ONE / DBLE( N ) ) 10 CONTINUE KASE = 1 JUMP = 1 RETURN END IF * GO TO ( 20, 40, 70, 90, 120 )JUMP * * ................ ENTRY (JUMP = 1) * FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X * 20 CONTINUE IF( N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN V( IOFFVX ) = X( IOFFVX ) EST = ABS( V( IOFFVX ) ) CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF * ... QUIT GO TO 130 END IF CALL PDZSUM1( N, EST, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF END IF * DO 30 I = IOFFVX, IOFFVX+NP-1 IF( ABS( X( I ) ).GT.SAFMIN ) THEN X( I ) = X( I ) / DCMPLX( ABS( X( I ) ) ) ELSE X( I ) = CONE END IF 30 CONTINUE KASE = 2 JUMP = 2 RETURN * * ................ ENTRY (JUMP = 2) * FIRST ITERATION. X HAS BEEN OVERWRITTEN BY CTRANS(A)*X * 40 CONTINUE CALL PZMAX1( N, XMAX, J, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN WORK( 1 ) = XMAX WORK( 2 ) = DCMPLX( DBLE( J ) ) CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2 ) ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2, $ IVXROW, MYCOL ) XMAX = WORK( 1 ) J = NINT( DBLE( WORK( 2 ) ) ) END IF END IF ITER = 2 * * MAIN LOOP - ITERATIONS 2, 3,...,ITMAX * 50 CONTINUE DO 60 I = IOFFVX, IOFFVX+NP-1 X( I ) = CZERO 60 CONTINUE IMAXROW = INDXG2P( J, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) IF( MYROW.EQ.IMAXROW ) THEN I = INDXG2L( J, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) X( I ) = CONE END IF KASE = 1 JUMP = 3 RETURN * * ................ ENTRY (JUMP = 3) * X HAS BEEN OVERWRITTEN BY A*X * 70 CONTINUE CALL ZCOPY( NP, X( IOFFVX ), 1, V( IOFFVX ), 1 ) ESTOLD = EST CALL PDZSUM1( N, EST, V, IV, JV, DESCV, 1 ) IF( DESCV( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, EST, 1, $ IVXROW, MYCOL ) END IF END IF * * TEST FOR CYCLING IF( EST.LE.ESTOLD ) $ GO TO 100 * DO 80 I = IOFFVX, IOFFVX+NP-1 IF( ABS( X( I ) ).GT.SAFMIN ) THEN X( I ) = X( I ) / DCMPLX( ABS( X( I ) ) ) ELSE X( I ) = CONE END IF 80 CONTINUE KASE = 2 JUMP = 4 RETURN * * ................ ENTRY (JUMP = 4) * X HAS BEEN OVERWRITTEN BY CTRANS(A)*X * 90 CONTINUE JLAST = J CALL PZMAX1( N, XMAX, J, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN WORK( 1 ) = XMAX WORK( 2 ) = DCMPLX( DBLE( J ) ) CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2 ) ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 2, 1, WORK, 2, $ IVXROW, MYCOL ) XMAX = WORK( 1 ) J = NINT( DBLE( WORK( 2 ) ) ) END IF END IF CALL PZELGET( 'Columnwise', ' ', JLMAX, X, JLAST, JX, DESCX ) IF( ( DBLE( JLMAX ).NE.ABS( DBLE( XMAX ) ) ).AND. $ ( ITER.LT.ITMAX ) ) THEN ITER = ITER + 1 GO TO 50 END IF * * ITERATION COMPLETE. FINAL STAGE. * 100 CONTINUE DO 110 I = IOFFVX, IOFFVX+NP-1 K = INDXL2G( I-IOFFVX+IIVX, DESCX( MB_ ), MYROW, $ DESCX( RSRC_ ), NPROW )-IX+1 IF( MOD( K, 2 ).EQ.0 ) THEN ALTSGN = -ONE ELSE ALTSGN = ONE END IF X( I ) = DCMPLX( ALTSGN*( ONE+DBLE( K-1 ) / DBLE( N-1 ) ) ) 110 CONTINUE KASE = 1 JUMP = 5 RETURN * * ................ ENTRY (JUMP = 5) * X HAS BEEN OVERWRITTEN BY A*X * 120 CONTINUE CALL PDZSUM1( N, TEMP, X, IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN IF( MYROW.EQ.IVXROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, TEMP, 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TEMP, 1, $ IVXROW, MYCOL ) END IF END IF TEMP = TWO*( TEMP / DBLE( 3*N ) ) IF( TEMP.GT.EST ) THEN CALL ZCOPY( NP, X( IOFFVX ), 1, V( IOFFVX ), 1 ) EST = TEMP END IF * 130 CONTINUE KASE = 0 * RETURN * * End of PZLACON * END scalapack-2.0.2/SRC/pzlaconsb.f000644 000766 000024 00000051135 10363532303 016505 0ustar00juliestaff000000 000000 SUBROUTINE PZLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, BUF, $ LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. INTEGER I, L, LWORK, M COMPLEX*16 H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), BUF( * ) * .. * * Purpose * ======= * * PZLACONSB looks for two consecutive small subdiagonal elements by * seeing the effect of starting a double shift QR iteration * given by H44, H33, & H43H34 and see if this would make a * subdiagonal negligible. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * M (global output) INTEGER * On exit, this yields the starting location of the QR double * shift. This will satisfy: L <= M <= I-2. * * H44 * H33 * H43H34 (global input) COMPLEX*16 * These three values are for the double shift QR iteration. * * BUF (local output) COMPLEX*16 array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 7*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Logic: * ====== * * Two consecutive small subdiagonal elements will stall * convergence of a double shift if their product is small * relatively even if each is not very small. Thus it is * necessary to scan the "tridiagonal portion of the matrix." In * the LAPACK algorithm ZLAHQR, a loop of M goes from I-2 down to * L and examines * H(m,m),H(m+1,m+1),H(m+1,m),H(m,m+1),H(m-1,m-1),H(m,m-1), and * H(m+2,m-1). Since these elements may be on separate * processors, the first major loop (10) goes over the tridiagonal * and has each node store whatever values of the 7 it has that * the node owning H(m,m) does not. This will occur on a border * and can happen in no more than 3 locations per block assuming * square blocks. There are 5 buffers that each node stores these * values: a buffer to send diagonally down and right, a buffer * to send up, a buffer to send left, a buffer to send diagonally * up and left and a buffer to send right. Each of these buffers * is actually stored in one buffer BUF where BUF(ISTR1+1) starts * the first buffer, BUF(ISTR2+1) starts the second, etc.. After * the values are stored, if there are any values that a node * needs, they will be sent and received. Then the next major * loop passes over the data and searches for two consecutive * small subdiagonals. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IBUF1, IBUF2, IBUF3, IBUF4, $ IBUF5, ICOL1, II, IRCV1, IRCV2, IRCV3, IRCV4, $ IRCV5, IROW1, ISRC, ISTR1, ISTR2, ISTR3, ISTR4, $ ISTR5, JJ, JSRC, LDA, LEFT, MODKM1, MYCOL, $ MYROW, NPCOL, NPROW, NUM, RIGHT, UP DOUBLE PRECISION S, TST1, ULP COMPLEX*16 CDUM, H00, H10, H11, H12, H21, H22, H33S, H44S, $ V1, V2, V3 * .. * .. External Functions .. INTEGER ILCM DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, INFOG2L, PXERBLA, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 starts at BUF(ISTR1+1) and will contain IBUF1 elements * BUFFER2 starts at BUF(ISTR2+1) and will contain IBUF2 elements * BUFFER3 starts at BUF(ISTR3+1) and will contain IBUF3 elements * BUFFER4 starts at BUF(ISTR4+1) and will contain IBUF4 elements * BUFFER5 starts at BUF(ISTR5+1) and will contain IBUF5 elements * ISTR1 = 0 ISTR2 = ( ( I-L-1 ) / HBL ) IF( ISTR2*HBL.LT.( I-L-1 ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.7*ISTR2 ) THEN CALL PXERBLA( CONTXT, 'PZLACONSB', 10 ) RETURN END IF ISTR3 = 3*ISTR2 ISTR4 = ISTR3 + ISTR2 ISTR5 = ISTR3 + ISTR3 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) * * Copy our relevant pieces of triadiagonal that we owe into * 5 buffers to send to whomever owns H(M,M) as M moves diagonally * up the tridiagonal * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 IRCV1 = 0 IRCV2 = 0 IRCV3 = 0 IRCV4 = 0 IRCV5 = 0 DO 10 M = I - 2, L, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M-1,M-1) and send it diagonal down * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( M-1, M-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M ,M-1) and send it right * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF5 = IBUF5 + 1 BUF( ISTR5+IBUF5 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+1,M) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( MYROW.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M ,M+1) and send it left * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF3 = IBUF3 + 1 BUF( ISTR3+IBUF3 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M+1,M+1) & H(M+2,M+1) and send it * diagonally up * IF( ( UP.NE.MYROW ) .OR. ( LEFT.NE.MYCOL ) ) THEN CALL INFOG2L( M+1, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF4 = IBUF4 + 2 BUF( ISTR4+IBUF4-1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) BUF( ISTR4+IBUF4 ) = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+2,M+1) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * Add up the receives * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( M.GT.L ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M-1,M-1) from diagonal up * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) .AND. ( M.GT.L ) ) $ THEN * * We must receive H(M ,M-1) from left * IRCV5 = IRCV5 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+1,M ) from down * IRCV2 = IRCV2 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPCOL.GT.1 ) ) THEN * * We must receive H(M ,M+1) from right * IRCV3 = IRCV3 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M+1:M+2,M+1) from diagonal down * IRCV4 = IRCV4 + 2 END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+2,M+1) from down * IRCV2 = IRCV2 + 1 END IF END IF * * Possibly change owners (occurs only when MOD(M-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * * Send data on to the appropriate node if there is any data to send * IF( IBUF1.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, UP, $ MYCOL ) END IF IF( IBUF3.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF3, 1, BUF( ISTR3+1 ), IBUF3, MYROW, $ LEFT ) END IF IF( IBUF4.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF4, 1, BUF( ISTR4+1 ), IBUF4, UP, $ LEFT ) END IF IF( IBUF5.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF5, 1, BUF( ISTR5+1 ), IBUF5, MYROW, $ RIGHT ) END IF * * Receive appropriate data if there is any * IF( IRCV1.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, DOWN, $ MYCOL ) END IF IF( IRCV3.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV3, 1, BUF( ISTR3+1 ), IRCV3, MYROW, $ RIGHT ) END IF IF( IRCV4.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV4, 1, BUF( ISTR4+1 ), IRCV4, DOWN, $ RIGHT ) END IF IF( IRCV5.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV5, 1, BUF( ISTR5+1 ), IRCV5, MYROW, $ LEFT ) END IF * * Start main loop * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. $ ( MODKM1.NE.HBL-1 ) ) THEN CALL INFOG2L( I-2, I-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) END IF * * Look for two consecutive small subdiagonal elements. * DO 20 M = I - 2, L, -1 * * Determine the effect of starting the double-shift QR * iteration at row M, and see if this would make H(M,M-1) * negligible. * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H00 = BUF( ISTR1+IBUF1 ) ELSE H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF5 = IBUF5 + 1 H10 = BUF( ISTR5+IBUF5 ) ELSE H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF END IF IF( MODKM1.EQ.HBL-1 ) THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) H11 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( NUM.GT.1 ) THEN IBUF4 = IBUF4 + 2 H22 = BUF( ISTR4+IBUF4-1 ) V3 = BUF( ISTR4+IBUF4 ) ELSE H22 = A( ICOL1*LDA+IROW1+1 ) V3 = A( ( ICOL1+1 )*LDA+IROW1+1 ) END IF IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 H21 = BUF( ISTR2+IBUF2 ) ELSE H21 = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF3 = IBUF3 + 1 H12 = BUF( ISTR3+IBUF3 ) ELSE H12 = A( ICOL1*LDA+IROW1 ) END IF IF( M.GT.L ) THEN H00 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF * * Adjust ICOL1 for next iteration where MODKM1=HBL-2 * ICOL1 = ICOL1 + 1 END IF IF( MODKM1.EQ.HBL-2 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 V3 = BUF( ISTR2+IBUF2 ) ELSE V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) END IF H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.LT.HBL-2 ) .AND. ( MODKM1.GT.0 ) ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S IF( M.EQ.L ) $ GO TO 30 TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).LE.ULP*TST1 ) $ GO TO 30 * * Slide indices diagonally up one for next iteration * IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF IF( M.EQ.L ) THEN * * Stop regardless of which node we are * GO TO 30 END IF * * Possibly change owners if on border * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 20 CONTINUE 30 CONTINUE * CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, M, 1, L, L, -1, -1, -1 ) * RETURN * * End of PZLACONSB * END scalapack-2.0.2/SRC/pzlacp2.f000644 000766 000024 00000037410 11750130340 016061 0ustar00juliestaff000000 000000 SUBROUTINE PZLACP2( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ) * .. * * Purpose * ======= * * PZLACP2 copies all or part of a distributed matrix A to another * distributed matrix B. No communication is performed, PZLACP2 * performs a local copy sub( A ) := sub( B ), where sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1) and sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * PZLACP2 requires that only dimension of the matrix operands is * distributed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ) set as follows: * * if UPLO = 'U', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=j, 1<=j<=N; * if UPLO = 'L', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * j<=i<=M, 1<=j<=N; * otherwise, B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=M, 1<=j<=N. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER HEIGHT, IACOL, IAROW, IBASE, IBCOL, IBROW, $ ICOFFA, IIA, IIAA, IIB, IIBB, IIBEGA, IIBEGB, $ IIENDA, IINXTA, IINXTB, ILEFT, IRIGHT, IROFFA, $ ITOP, JJA, JJAA, JJB, JJBB, JJBEGA, JJBEGB, $ JJENDA, JJNXTA, JJNXTB, LDA, LDB, MBA, MP, $ MPAA, MYCOL, MYDIST, MYROW, NBA, NPCOL, NPROW, $ NQ, NQAA, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, ZLAMOV * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, JJB, $ IBROW, IBCOL ) * MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, MBA ) ICOFFA = MOD( JA-1, NBA ) LDB = DESCB( LLD_ ) * IF( N.LE.( NBA-ICOFFA ) ) THEN * * It is assumed that the local columns JJA:JJA+N-1 of the matrix * A are in the same process column (IACOL). * * N * JJA JJA+N-1 * / --------------------- \ * IROFFA| | | | * \ |...................| | ( IAROW ) * IIA |x | | MBA = DESCA( MB_ ) * | x | | * |--x----------------| / * | x | * | x | ITOP * | x | | * | x | /-------\ * |-------x-----------| |-------x-----------| * | x | | x | * | x | | x | * | x | | x | * | x | | x | * |------------x------| |------------x------| * | x | \____________/ * | x | | * | x | IBASE * | x | * |-----------------x-| Local picture * | x| * | | * | | * | | * |-------------------| * | | * . . * . . * . (IACOL) . * IF( MYCOL.EQ.IACOL ) THEN * MP = NUMROC( M+IROFFA, MBA, MYROW, IAROW, NPROW ) IF( MP.LE.0 ) $ RETURN IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA MYDIST = MOD( MYROW-IAROW+NPROW, NPROW ) ITOP = MYDIST * MBA - IROFFA * IF( LSAME( UPLO, 'U' ) ) THEN * ITOP = MAX( 0, ITOP ) IIBEGA = IIA IIENDA = IIA + MP - 1 IINXTA = MIN( ICEIL( IIBEGA, MBA ) * MBA, IIENDA ) IIBEGB = IIB IINXTB = IIBEGB + IINXTA - IIBEGA * 10 CONTINUE IF( ( N-ITOP ).GT.0 ) THEN CALL ZLAMOV( UPLO, IINXTA-IIBEGA+1, N-ITOP, $ A( IIBEGA+(JJA+ITOP-1)*LDA ), LDA, $ B( IIBEGB+(JJB+ITOP-1)*LDB ), LDB ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IIBEGA = IINXTA + 1 IINXTA = MIN( IINXTA+MBA, IIENDA ) IIBEGB = IINXTB + 1 IINXTB = IIBEGB + IINXTA - IIBEGA GO TO 10 END IF * ELSE IF( LSAME( UPLO, 'L' ) ) THEN * MPAA = MP IIAA = IIA JJAA = JJA IIBB = IIB JJBB = JJB IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 20 CONTINUE IF( JJAA.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL ZLAMOV( 'All', MPAA, ITOP-JJAA+JJA, $ A( IIAA+(JJAA-1)*LDA ), LDA, $ B( IIBB+(JJBB-1)*LDB ), LDB ) CALL ZLAMOV( UPLO, MPAA, HEIGHT, $ A( IIAA+(JJA+ITOP-1)*LDA ), LDA, $ B( IIBB+(JJB+ITOP-1)*LDB ), LDB ) MPAA = MAX( 0, MPAA - HEIGHT ) IIAA = IIAA + HEIGHT JJAA = JJA + IBASE IIBB = IIBB + HEIGHT JJBB = JJB + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 20 END IF * ELSE * CALL ZLAMOV( 'All', MP, N, A( IIA+(JJA-1)*LDA ), $ LDA, B( IIB+(JJB-1)*LDB ), LDB ) * END IF * END IF * ELSE IF( M.LE.( MBA-IROFFA ) ) THEN * * It is assumed that the local rows IIA:IIA+M-1 of the matrix A * are in the same process row (IAROW). * * ICOFFA * / \JJA * IIA ------------------ .... -------- * | .x | | | / | | \ * | . x | | | ILEFT| | | | * | . x | | | | | | * | . x | | \ x | | * | . |x | | |x | | IRIGHT * | . | x | | | x | | * (IAROW) | . | x | | | x | | * | . | x| | | x| | * | . | x | | x / * | . | |x | | | * | . | | x | | | * | . | | x | | | * | . | | x| | | * IIA+M-1 ------------------ .... ------- * NB_A * (IACOL) Local picture * IF( MYROW.EQ.IAROW ) THEN * NQ = NUMROC( N+ICOFFA, NBA, MYCOL, IACOL, NPCOL ) IF( NQ.LE.0 ) $ RETURN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA MYDIST = MOD( MYCOL-IACOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBA - ICOFFA * IF( LSAME( UPLO, 'L' ) ) THEN * ILEFT = MAX( 0, ILEFT ) JJBEGA = JJA JJENDA = JJA + NQ - 1 JJNXTA = MIN( ICEIL( JJBEGA, NBA ) * NBA, JJENDA ) JJBEGB = JJB JJNXTB = JJBEGB + JJNXTA - JJBEGA * 30 CONTINUE IF( ( M-ILEFT ).GT.0 ) THEN CALL ZLAMOV( UPLO, M-ILEFT, JJNXTA-JJBEGA+1, $ A( IIA+ILEFT+(JJBEGA-1)*LDA ), LDA, $ B( IIB+ILEFT+(JJBEGB-1)*LDB ), LDB ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA JJBEGA = JJNXTA +1 JJNXTA = MIN( JJNXTA+NBA, JJENDA ) JJBEGB = JJNXTB +1 JJNXTB = JJBEGB + JJNXTA - JJBEGA GO TO 30 END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * NQAA = NQ IIAA = IIA JJAA = JJA IIBB = IIB JJBB = JJB IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 40 CONTINUE IF( IIAA.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL ZLAMOV( 'All', ILEFT-IIAA+IIA, NQAA, $ A( IIAA+(JJAA-1)*LDA ), LDA, $ B( IIBB+(JJBB-1)*LDB ), LDB ) CALL ZLAMOV( UPLO, WIDE, NQAA, $ A( IIA+ILEFT+(JJAA-1)*LDA ), LDA, $ B( IIB+ILEFT+(JJBB-1)*LDB ), LDB ) NQAA = MAX( 0, NQAA - WIDE ) IIAA = IIA + IRIGHT JJAA = JJAA + WIDE IIBB = IIB + IRIGHT JJBB = JJBB + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 40 END IF * ELSE * CALL ZLAMOV( 'All', M, NQ, A( IIA+(JJA-1)*LDA ), $ LDA, B( IIB+(JJB-1)*LDB ), LDB ) * END IF * END IF * END IF * RETURN * * End of PZLACP2 * END scalapack-2.0.2/SRC/pzlacp3.f000644 000766 000024 00000030241 10363532303 016061 0ustar00juliestaff000000 000000 SUBROUTINE PZLACP3( M, I, A, DESCA, B, LDB, II, JJ, REV ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. INTEGER I, II, JJ, LDB, M, REV * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), B( LDB, * ) * .. * * Purpose * ======= * * PZLACP3 is an auxiliary routine that copies from a global parallel * array into a local replicated array or vise versa. Notice that * the entire submatrix that is copied gets placed on one node or * more. The receiving node can be specified precisely, or all nodes * can receive, or just one row or column of nodes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * M is the order of the square submatrix that is copied. * M >= 0. * Unchanged on exit * * I (global input) INTEGER * A(I,I) is the global location that the copying starts from. * Unchanged on exit. * * A (global input/output) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the parallel matrix to be copied into or from. * On exit, if REV=1, the copied data. * Unchanged on exit if REV=0. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/output) COMPLEX*16 array of size (LDB,M) * If REV=0, this is the global portion of the array * A(I:I+M-1,I:I+M-1). * If REV=1, this is the unchanged on exit. * * LDB (local input) INTEGER * The leading dimension of B. * * II (global input) INTEGER * By using REV 0 & 1, data can be sent out and returned again. * If REV=0, then II is destination row index for the node(s) * receiving the replicated B. * If II>=0,JJ>=0, then node (II,JJ) receives the data * If II=-1,JJ>=0, then all rows in column JJ receive the * data * If II>=0,JJ=-1, then all cols in row II receive the data * If II=-1,JJ=-1, then all nodes receive the data * If REV<>0, then II is the source row index for the node(s) * sending the replicated B. * * JJ (global input) INTEGER * Similar description as II above * * REV (global input) INTEGER * Use REV = 0 to send global A into locally replicated B * (on node (II,JJ)). * Use REV <> 0 to send locally replicated B from node (II,JJ) * to its owner (which changes depending on its location in * A) into the global A. * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER COL, CONTXT, HBL, ICOL1, ICOL2, IDI, IDJ, IFIN, $ III, IROW1, IROW2, ISTOP, ISTOPI, ISTOPJ, ITMP, $ JJJ, LDA, MYCOL, MYROW, NPCOL, NPROW, ROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG1L, ZGEBR2D, ZGEBS2D, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.LE.0 ) $ RETURN * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( REV.EQ.0 ) THEN DO 20 IDI = 1, M DO 10 IDJ = 1, M B( IDI, IDJ ) = ZERO 10 CONTINUE 20 CONTINUE END IF * IFIN = I + M - 1 * IF( MOD( I+HBL, HBL ).NE.0 ) THEN ISTOP = MIN( I+HBL-MOD( I+HBL, HBL ), IFIN ) ELSE ISTOP = I END IF IDJ = I ISTOPJ = ISTOP IF( IDJ.LE.IFIN ) THEN 30 CONTINUE IDI = I ISTOPI = ISTOP IF( IDI.LE.IFIN ) THEN 40 CONTINUE ROW = MOD( ( IDI-1 ) / HBL, NPROW ) COL = MOD( ( IDJ-1 ) / HBL, NPCOL ) CALL INFOG1L( IDI, HBL, NPROW, ROW, 0, IROW1, ITMP ) IROW2 = NUMROC( ISTOPI, HBL, ROW, 0, NPROW ) CALL INFOG1L( IDJ, HBL, NPCOL, COL, 0, ICOL1, ITMP ) ICOL2 = NUMROC( ISTOPJ, HBL, COL, 0, NPCOL ) IF( ( MYROW.EQ.ROW ) .AND. ( MYCOL.EQ.COL ) ) THEN IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to everyone * IF( REV.EQ.0 ) THEN CALL ZGEBS2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.NE.-1 ) ) THEN * * Send the message to Column MYCOL which better be JJ * IF( REV.EQ.0 ) THEN CALL ZGEBS2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to Row MYROW which better be II * IF( REV.EQ.0 ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.NE.-1 ) .AND. $ ( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) ) THEN * * Recv/Send the message to (II,JJ) * IF( REV.EQ.0 ) THEN CALL ZGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ A( ( ICOL1-1 )*LDA+IROW1 ), LDA, II, $ JJ ) ELSE CALL ZGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, II, JJ ) END IF END IF IF( REV.EQ.0 ) THEN DO 60 JJJ = ICOL1, ICOL2 DO 50 III = IROW1, IROW2 B( IDI+III-IROW1+1-I, IDJ+JJJ-ICOL1+1-I ) $ = A( ( JJJ-1 )*LDA+III ) 50 CONTINUE 60 CONTINUE ELSE DO 80 JJJ = ICOL1, ICOL2 DO 70 III = IROW1, IROW2 A( ( JJJ-1 )*LDA+III ) = B( IDI+III-IROW1+1-I, $ IDJ+JJJ-ICOL1+1-I ) 70 CONTINUE 80 CONTINUE END IF ELSE IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL ZGEBR2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL ZGEBR2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL ZGEBR2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL ZGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) ELSE CALL ZGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) * CALL ZGESD2D(CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, * $ A((ICOL1-1)*LDA+IROW1),LDA, ROW, COL) END IF END IF END IF IDI = ISTOPI + 1 ISTOPI = MIN( ISTOPI+HBL, IFIN ) IF( IDI.LE.IFIN ) $ GO TO 40 END IF IDJ = ISTOPJ + 1 ISTOPJ = MIN( ISTOPJ+HBL, IFIN ) IF( IDJ.LE.IFIN ) $ GO TO 30 END IF RETURN * * End of PZLACP3 * END scalapack-2.0.2/SRC/pzlacpy.f000644 000766 000024 00000022370 10363532303 016173 0ustar00juliestaff000000 000000 SUBROUTINE PZLACPY( UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, $ DESCB ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, JA, JB, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ) * .. * * Purpose * ======= * * PZLACPY copies all or part of a distributed matrix A to another * distributed matrix B. No communication is performed, PZLACPY * performs a local copy sub( A ) := sub( B ), where sub( A ) denotes * A(IA:IA+M-1,JA:JA+N-1) and sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * copied: * = 'U': Upper triangular part is copied; the strictly * lower triangular part of sub( A ) is not referenced; * = 'L': Lower triangular part is copied; the strictly * upper triangular part of sub( A ) is not referenced; * Otherwise: All of the matrix sub( A ) is copied. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix sub( A ) * to be copied from. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_B, LOCc(JB+N-1) ). This array * contains on exit the local pieces of the distributed matrix * sub( B ) set as follows: * * if UPLO = 'U', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=j, 1<=j<=N; * if UPLO = 'L', B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * j<=i<=M, 1<=j<=N; * otherwise, B(IB+i-1,JB+j-1) = A(IA+i-1,JA+j-1), * 1<=i<=M, 1<=j<=N. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, IAA, IBB, IBLK, IN, ITMP, J, JAA, JBB, $ JBLK, JN, JTMP * .. * .. External Subroutines .. EXTERNAL PZLACP2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) * IF( M.LE.( DESCA( MB_ ) - MOD( IA-1, DESCA( MB_ ) ) ) .OR. $ N.LE.( DESCA( NB_ ) - MOD( JA-1, DESCA( NB_ ) ) ) ) THEN CALL PZLACP2( UPLO, M, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) ELSE * IF( LSAME( UPLO, 'U' ) ) THEN CALL PZLACP2( UPLO, IN-IA+1, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) IBB = IB + ITMP JBB = JB + ITMP JAA = JA + ITMP CALL PZLACP2( UPLO, IBLK, N-ITMP, A, I, JAA, DESCA, $ B, IBB, JBB, DESCB ) 10 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN CALL PZLACP2( UPLO, M, JN-JA+1, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) IBB = IB + JTMP JBB = JB + JTMP IAA = IA + JTMP CALL PZLACP2( UPLO, M-JTMP, JBLK, A, IAA, J, DESCA, $ B, IBB, JBB, DESCB ) 20 CONTINUE ELSE IF( M.LE.N ) THEN CALL PZLACP2( UPLO, IN-IA+1, N, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) IBB = IB+ITMP CALL PZLACP2( UPLO, IBLK, N, A, I, JA, DESCA, $ B, IBB, JB, DESCB ) 30 CONTINUE ELSE CALL PZLACP2( UPLO, M, JN-JA+1, A, IA, JA, DESCA, $ B, IB, JB, DESCB ) DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) JBB = JB+JTMP CALL PZLACP2( UPLO, M, JBLK, A, IA, J, DESCA, $ B, IB, JBB, DESCB ) 40 CONTINUE END IF END IF * END IF * RETURN * * End of PZLACPY * END scalapack-2.0.2/SRC/pzlaevswp.f000644 000766 000024 00000025767 10363532303 016561 0ustar00juliestaff000000 000000 * * SUBROUTINE PZLAEVSWP( N, ZIN, LDZI, Z, IZ, JZ, DESCZ, NVS, KEY, $ RWORK, LRWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 15, 1997 * * .. Scalar Arguments .. INTEGER IZ, JZ, LDZI, LRWORK, N * .. * .. Array Arguments .. INTEGER DESCZ( * ), KEY( * ), NVS( * ) DOUBLE PRECISION RWORK( * ), ZIN( LDZI, * ) COMPLEX*16 Z( * ) * .. * * Purpose * ======= * * PZLAEVSWP moves the eigenvectors (potentially unsorted) from * where they are computed, to a ScaLAPACK standard block cyclic * array, sorted so that the corresponding eigenvalues are sorted. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * NP = the number of rows local to a given process. * NQ = the number of columns local to a given process. * * N (global input) INTEGER * The order of the matrix A. N >= 0. * * ZIN (local input) DOUBLE PRECISION array, * dimension ( LDZI, NVS(iam) ) * The eigenvectors on input. Each eigenvector resides entirely * in one process. Each process holds a contiguous set of * NVS(iam) eigenvectors. The first eigenvector which the * process holds is: sum for i=[0,iam-1) of NVS(i) * * LDZI (locl input) INTEGER * leading dimension of the ZIN array * * Z (local output) COMPLEX*16 array * global dimension (N, N), local dimension (DESCZ(DLEN_), NQ) * The eigenvectors on output. The eigenvectors are distributed * in a block cyclic manner in both dimensions, with a * block size of NB. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * NVS (global input) INTEGER array, dimension( nprocs+1 ) * nvs(i) = number of processes * number of eigenvectors held by processes [0,i-1) * nvs(1) = number of eigen vectors held by [0,1-1) == 0 * nvs(nprocs+1) = number of eigen vectors held by [0,nprocs) == * total number of eigenvectors * * KEY (global input) INTEGER array, dimension( N ) * Indicates the actual index (after sorting) for each of the * eigenvectors. * * RWORK (local workspace) DOUBLE PRECISION array, dimension (LRWORK) * * LRWORK (local input) INTEGER dimension of RWORK * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CYCLIC_I, CYCLIC_J, DIST, I, IAM, II, INCII, J, $ MAXI, MAXII, MINI, MINII, MYCOL, MYROW, NB, $ NBUFSIZE, NPCOL, NPROCS, NPROW, PCOL, RECVCOL, $ RECVFROM, RECVROW, SENDCOL, SENDROW, SENDTO * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P EXTERNAL INDXG2L, INDXG2P * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGERV2D, DGESD2D * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW + MYCOL*NPROW IAM = MYROW*NPCOL + MYCOL * NB = DESCZ( MB_ ) * NPROCS = NPROW*NPCOL * * If PxSTEIN operates on a sub-matrix of a global matrix, the * key [] that contains the indicies of the eigenvectors is refe- * renced to the dimensions of the sub-matrix and not the global * distrubited matrix. Because of this, PxLAEVSWP will incorrectly * map the eigenvectors to the global eigenvector matrix, Z, unless * the key[] elements are shifted as below. * DO 10 J = DESCZ( N_ ), 1, -1 KEY( J ) = KEY( J-JZ+1 ) + ( JZ-1 ) 10 CONTINUE * DO 110 DIST = 0, NPROCS - 1 * SENDTO = MOD( IAM+DIST, NPROCS ) RECVFROM = MOD( NPROCS+IAM-DIST, NPROCS ) * SENDROW = MOD( SENDTO, NPROW ) SENDCOL = SENDTO / NPROW RECVROW = MOD( RECVFROM, NPROW ) RECVCOL = RECVFROM / NPROW * SENDROW = SENDTO / NPCOL SENDCOL = MOD( SENDTO, NPCOL ) RECVROW = RECVFROM / NPCOL RECVCOL = MOD( RECVFROM, NPCOL ) * * Figure out what I have that process "sendto" wants * NBUFSIZE = 0 * * We are looping through the eigenvectors that I presently own. * DO 40 J = NVS( 1+IAM ) + JZ, NVS( 1+IAM+1 ) + JZ - 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( SENDCOL.EQ.PCOL ) THEN MINII = MOD( SENDROW+DESCZ( RSRC_ ), NPROW )* $ DESCZ( MB_ ) + 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 30 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) MAXI = MIN( II+DESCZ( MB_ )-1, N+IZ-1 ) DO 20 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 RWORK( NBUFSIZE ) = ZIN( I+1-IZ, $ J-NVS( 1+IAM )+1-JZ ) 20 CONTINUE 30 CONTINUE END IF 40 CONTINUE * * IF( MYROW.NE.SENDROW .OR. MYCOL.NE.SENDCOL ) $ CALL DGESD2D( DESCZ( CTXT_ ), NBUFSIZE, 1, RWORK, NBUFSIZE, $ SENDROW, SENDCOL ) * * * Figure out what process "recvfrom" has that I want * NBUFSIZE = 0 DO 70 J = NVS( 1+RECVFROM ) + JZ, $ NVS( 1+RECVFROM+1 ) + JZ - 1, 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.PCOL ) THEN MINII = MOD( MYROW+DESCZ( RSRC_ ), NPROW )*DESCZ( MB_ ) + $ 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 60 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) MAXI = MIN( II+NB-1, N+IZ-1 ) DO 50 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 50 CONTINUE 60 CONTINUE END IF 70 CONTINUE * * * IF( MYROW.NE.RECVROW .OR. MYCOL.NE.RECVCOL ) $ CALL DGERV2D( DESCZ( CTXT_ ), 1, NBUFSIZE, RWORK, 1, $ RECVROW, RECVCOL ) * NBUFSIZE = 0 DO 100 J = NVS( 1+RECVFROM ) + JZ, $ NVS( 1+RECVFROM+1 ) + JZ - 1, 1 PCOL = INDXG2P( KEY( J ), DESCZ( NB_ ), -1, DESCZ( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.PCOL ) THEN CYCLIC_J = INDXG2L( KEY( J ), DESCZ( MB_ ), -1, -1, $ NPCOL ) CYCLIC_I = 1 MINII = MOD( MYROW+DESCZ( RSRC_ ), NPROW )*DESCZ( MB_ ) + $ 1 MAXII = DESCZ( M_ ) INCII = DESCZ( MB_ )*NPROW DO 90 II = MINII, MAXII, INCII MINI = MAX( II, IZ ) CYCLIC_I = INDXG2L( MINI, DESCZ( MB_ ), -1, -1, $ NPROW ) MAXI = MIN( II+NB-1, N+IZ-1 ) DO 80 I = MINI, MAXI, 1 NBUFSIZE = NBUFSIZE + 1 Z( CYCLIC_I+( CYCLIC_J-1 )*DESCZ( LLD_ ) ) $ = DCMPLX( RWORK( NBUFSIZE ) ) CYCLIC_I = CYCLIC_I + 1 80 CONTINUE 90 CONTINUE END IF 100 CONTINUE * 110 CONTINUE RETURN * * End of PZLAEVSWP * END scalapack-2.0.2/SRC/pzlahqr.f000644 000766 000024 00000333323 10602576752 016211 0ustar00juliestaff000000 000000 SUBROUTINE PZLAHQR( WANTT, WANTZ, N, ILO, IHI, A, DESCA, W, ILOZ, $ IHIZ, Z, DESCZ, WORK, LWORK, IWORK, ILWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7.3) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * 1.7.3: March 22, 2006 * modification suggested by Mark Fahey and Greg Henry * 1.7.0: July 31, 2001 * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, ILWORK, INFO, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) COMPLEX*16 A( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZLAHQR is an auxiliary routine used to find the Schur decomposition * and or eigenvalues of a matrix already in Hessenberg form from * cols ILO to IHI. * If Z = I, and WANTT=WANTZ=.TRUE., H gets replaced with Z'HZ, * with Z'Z=I, and H in Schur form. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCp(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCp( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCq( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCp() and LOCq() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix A (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that A is already upper quasi-triangular in * rows and columns IHI+1:N, and that A(ILO,ILO-1) = 0 (unless * ILO = 1). PZLAHQR works primarily with the Hessenberg * submatrix in rows and columns ILO to IHI, but applies * transformations to all of H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * A (global input/output) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the upper Hessenberg matrix A. * On exit, if WANTT is .TRUE., A is upper triangular in rows * and columns ILO:IHI. If WANTT is .FALSE., the contents of * A are unspecified on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * W (global replicated output) COMPLEX*16 array, dimension (N) * The computed eigenvalues ILO to IHI are stored in the * corresponding elements of W. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in A. A may be returned with * larger diagonal blocks until the next release. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (global input/output) COMPLEX*16 array. * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations accumulated by PZHSEQR, and on * exit Z has been updated; transformations are applied only to * the submatrix Z(ILOZ:IHIZ,ILO:IHI). * If WANTZ is .FALSE., Z is not referenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local output) COMPLEX*16 array of size LWORK * (Unless LWORK=-1, in which case WORK must be at least size 1) * * LWORK (local input) INTEGER * WORK(LWORK) is a local array and LWORK is assumed big enough * so that LWORK >= 3*N + * MAX( 2*MAX(DESCZ(LLD_),DESCA(LLD_)) + 2*LOCq(N), * 7*Ceil(N/HBL)/LCM(NPROW,NPCOL)) + * MAX( 2*N, (8*LCM(NPROW,NPCOL)+2)**2 ) * If LWORK=-1, then WORK(1) gets set to the above number and * the code returns immediately. * * IWORK (global and local input) INTEGER array of size ILWORK * This will hold some of the IBLK integer arrays. * This is held as a place holder for a future release. * Currently unreferenced. * * ILWORK (local input) INTEGER * This will hold the size of the IWORK array. * This is held as a place holder for a future release. * Currently unreferenced. * * INFO (global output) INTEGER * < 0: parameter number -INFO incorrect or inconsistent * = 0: successful exit * > 0: PZLAHQR failed to compute all the eigenvalues ILO to IHI * in a total of 30*(IHI-ILO+1) iterations; if INFO = i, * elements i+1:ihi of W contains those eigenvalues * which have been successfully computed. * * Logic: * This algorithm is very similar to DLAHQR. Unlike DLAHQR, * instead of sending one double shift through the largest * unreduced submatrix, this algorithm sends multiple double shifts * and spaces them apart so that there can be parallelism across * several processor row/columns. Another critical difference is * that this algorithm aggregrates multiple transforms together in * order to apply them in a block fashion. * * Important Local Variables: * IBLK = The maximum number of bulges that can be computed. * Currently fixed. Future releases this won't be fixed. * HBL = The square block size (HBL=DESCA(MB_)=DESCA(NB_)) * ROTN = The number of transforms to block together * NBULGE = The number of bulges that will be attempted on the * current submatrix. * IBULGE = The current number of bulges started. * K1(*),K2(*) = The current bulge loops from K1(*) to K2(*). * * Subroutines: * From LAPACK, this routine calls: * ZLAHQR -> Serial QR used to determine shifts and * eigenvalues * ZLARFG -> Determine the Householder transforms * * This ScaLAPACK, this routine calls: * PZLACONSB -> To determine where to start each iteration * ZLAMSH -> Sends multiple shifts through a small * submatrix to see how the consecutive * subdiagonals change (if PZLACONSB indicates * we can start a run in the middle) * PZLAWIL -> Given the shift, get the transformation * PZLACP3 -> Parallel array to local replicated array copy * & back. * ZLAREF -> Row/column reflector applier. Core routine * here. * PZLASMSUB -> Finds negligible subdiagonal elements. * * Current Notes and/or Restrictions: * 1.) This code requires the distributed block size to be square * and at least six (6); unlike simpler codes like LU, this * algorithm is extremely sensitive to block size. Unwise * choices of too small a block size can lead to bad * performance. * 2.) This code requires A and Z to be distributed identically * and have identical contxts. A future version may allow Z to * have a different contxt to 1D row map it to all nodes (so no * communication on Z is necessary.) * 3.) This code does not currently block the initial transforms * so that none of the rows or columns for any bulge are * completed until all are started. To offset pipeline * start-up it is recommended that at least 2*LCM(NPROW,NPCOL) * bulges are used (if possible) * 4.) The maximum number of bulges currently supported is fixed at * 32. In future versions this will be limited only by the * incoming WORK and IWORK array. * 5.) The matrix A must be in upper Hessenberg form. If elements * below the subdiagonal are nonzero, the resulting transforms * may be nonsimilar. This is also true with the LAPACK * routine ZLAHQR. * 6.) For this release, this code has only been tested for * RSRC_=CSRC_=0, but it has been written for the general case. * 7.) Currently, all the eigenvalues are distributed to all the * nodes. Future releases will probably distribute the * eigenvalues by the column partitioning. * 8.) The internals of this routine are subject to change. * 9.) To optimize this for your architecture, try tuning ZLAREF. * 10.) This code has only been tested for WANTZ = .TRUE. and may * behave unpredictably for WANTZ set to .FALSE. * * Further Details * =============== * * Contributed by Mark Fahey, June, 2000. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION RONE PARAMETER ( RONE = 1.0D+0 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) DOUBLE PRECISION CONST PARAMETER ( CONST = 1.50D+0 ) INTEGER IBLK PARAMETER ( IBLK = 32 ) * .. * .. Local Scalars .. LOGICAL SKIP INTEGER CONTXT, DOWN, HBL, I, I1, I2, IAFIRST, IBULGE, $ ICBUF, ICOL, ICOL1, ICOL2, IDIA, IERR, II, $ IRBUF, IROW, IROW1, IROW2, ISPEC, ISTART, $ ISTARTCOL, ISTARTROW, ISTOP, ISUB, ISUP, $ ITERMAX, ITMP1, ITMP2, ITN, ITS, IZBUF, J, $ JAFIRST, JBLK, JJ, K, KI, L, LCMRC, LDA, LDZ, $ LEFT, LIHIH, LIHIZ, LILOH, LILOZ, LOCALI1, $ LOCALI2, LOCALK, LOCALM, M, MODKM1, MYCOL, $ MYROW, NBULGE, NH, NODE, NPCOL, NPROW, NQ, NR, $ NUM, NZ, RIGHT, ROTN, UP, VECSIDX DOUBLE PRECISION CS, OVFL, S, SMLNUM, ULP, UNFL COMPLEX*16 CDUM, H10, H11, H22, H33, H43H34, H44, SN, SUM, $ T1, T1COPY, T2, T3, V1SAVE, V2, V2SAVE, V3, $ V3SAVE * .. * .. Local Arrays .. INTEGER ICURCOL( IBLK ), ICURROW( IBLK ), K1( IBLK ), $ K2( IBLK ), KCOL( IBLK ), KP2COL( IBLK ), $ KP2ROW( IBLK ), KROW( IBLK ) COMPLEX*16 S1( 2*IBLK, 2*IBLK ), SMALLA( 6, 6, IBLK ), $ VCOPY( 3 ) * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMN2D, IGEBR2D, IGEBS2D, $ INFOG1L, INFOG2L, PDLABAD, PXERBLA, PZLACONSB, $ PZLACP3, PZLASMSUB, PZLAWIL, PZROT, ZCOPY, $ ZGEBR2D, ZGEBS2D, ZGERV2D, ZGESD2D, ZGSUM2D, $ ZLAHQR2, ZLAMSH, ZLANV2, ZLAREF, ZLARFG * .. * .. Intrinsic Functions .. * INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * INFO = 0 * ITERMAX = 30*( IHI-ILO+1 ) IF( N.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL NUM = NPROW*NPCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) LCMRC = ILCM( NPROW, NPCOL ) IF( ( NPROW.LE.3 ) .OR. ( NPCOL.LE.3 ) ) THEN SKIP = .TRUE. ELSE SKIP = .FALSE. END IF * * Determine the number of columns we have so we can check workspace * NQ = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) JJ = N / HBL IF( JJ*HBL.LT.N ) $ JJ = JJ + 1 JJ = 7*JJ / LCMRC JJ = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, JJ ) JJ = JJ + MAX( 2*N, ( 8*LCMRC+2 )**2 ) IF( LWORK.EQ.-1 ) THEN WORK( 1 ) = JJ RETURN END IF IF( LWORK.LT.JJ ) THEN INFO = -14 END IF IF( DESCZ( CTXT_ ).NE.DESCA( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) END IF IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1300+NB_ ) END IF IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1300+MB_ ) END IF IF( ( DESCA( RSRC_ ).NE.0 ) .OR. ( DESCA( CSRC_ ).NE.0 ) ) THEN INFO = -( 700+RSRC_ ) END IF IF( ( DESCZ( RSRC_ ).NE.0 ) .OR. ( DESCZ( CSRC_ ).NE.0 ) ) THEN INFO = -( 1300+RSRC_ ) END IF IF( ( ILO.GT.N ) .OR. ( ILO.LT.1 ) ) THEN INFO = -4 END IF IF( ( IHI.GT.N ) .OR. ( IHI.LT.1 ) ) THEN INFO = -5 END IF IF( HBL.LT.5 ) THEN INFO = -( 700+MB_ ) END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PZLAHQR', -INFO ) RETURN END IF * * Set work array indices * VECSIDX = 0 IDIA = 3*N ISUB = 3*N ISUP = 3*N IRBUF = 3*N ICBUF = 3*N IZBUF = 5*N * * Find a value for ROTN * ROTN = HBL / 3 ROTN = MIN( ROTN, HBL-2 ) ROTN = MAX( ROTN, 1 ) * IF( ILO.EQ.IHI ) THEN CALL INFOG2L( ILO, ILO, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN W( ILO ) = A( ( ICOL-1 )*LDA+IROW ) ELSE W( ILO ) = ZERO END IF RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * CALL INFOG1L( ILOZ, HBL, NPROW, MYROW, IAFIRST, LILOZ, LIHIZ ) LIHIZ = NUMROC( IHIZ, HBL, MYROW, IAFIRST, NPROW ) * * Set machine-dependent constants for the stopping criterion. * If NORM(H) <= SQRT(OVFL), overflow should not occur. * UNFL = PDLAMCH( CONTXT, 'SAFE MINIMUM' ) OVFL = RONE / UNFL CALL PDLABAD( CONTXT, UNFL, OVFL ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = ITERMAX * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of our schur block size (<=2*IBLK). Each * iteration of the loop works with the active submatrix in rows * and columns L to I. Eigenvalues I+1 to IHI have already * converged. Either L = ILO or the global A(L,L-1) is negligible * so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 570 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 540 ITS = 0, ITN * * Look for a single small subdiagonal element. * CALL PZLASMSUB( A, DESCA, I, L, K, SMLNUM, WORK( IRBUF+1 ), $ LWORK-IRBUF ) L = K * IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * CALL INFOG2L( L, L-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN A( ( ICOL-1 )*LDA+IROW ) = ZERO END IF WORK( ISUB+L-1 ) = ZERO END IF * * Exit from loop if a submatrix of order 1 or 2 has split off. * IF( WANTT ) THEN * For Schur form, use 2x2 blocks IF( L.GE.I-1 ) THEN GO TO 550 END IF ELSE * If we don't want the Schur form, use bigger blocks. IF( L.GE.I-( 2*IBLK-1 ) ) THEN GO TO 550 END IF END IF * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * * Copy submatrix of size 2*JBLK and prepare to do generalized * Wilkinson shift or an exceptional shift * JBLK = MIN( IBLK, ( ( I-L+1 ) / 2 )-1 ) IF( JBLK.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * JBLK = JBLK - MOD( JBLK, LCMRC ) END IF JBLK = MIN( JBLK, 2*LCMRC ) JBLK = MAX( JBLK, 1 ) * CALL PZLACP3( 2*JBLK, I-2*JBLK+1, A, DESCA, S1, 2*IBLK, -1, -1, $ 0 ) IF( ( ITS.EQ.20 .OR. ITS.EQ.40 ) .AND. ( JBLK.GT.1 ) ) THEN * * Exceptional shift. * DO 20 II = 2*JBLK, 2, -1 S1( II, II ) = CONST*( CABS1( S1( II, II ) )+ $ CABS1( S1( II, II-1 ) ) ) S1( II, II-1 ) = ZERO S1( II-1, II ) = ZERO 20 CONTINUE S1( 1, 1 ) = CONST*CABS1( S1( 1, 1 ) ) ELSE CALL ZLAHQR2( .FALSE., .FALSE., 2*JBLK, 1, 2*JBLK, S1, $ 2*IBLK, WORK( IRBUF+1 ), 1, 2*JBLK, Z, LDZ, $ IERR ) * * Prepare to use Wilkinson's double shift * H44 = S1( 2*JBLK, 2*JBLK ) H33 = S1( 2*JBLK-1, 2*JBLK-1 ) H43H34 = S1( 2*JBLK-1, 2*JBLK )*S1( 2*JBLK, 2*JBLK-1 ) * END IF * * Look for two consecutive small subdiagonal elements: * PZLACONSB is the routine that does this. * CALL PZLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, $ WORK( IRBUF+1 ), LWORK-IRBUF ) * * Double-shift QR step * * NBULGE is the number of bulges that will be attempted * ISTOP = MIN( M+ROTN-1-MOD( M-( M / HBL )*HBL-1, ROTN ), I-2 ) ISTOP = MIN( ISTOP, M+HBL-3-MOD( M-1, HBL ) ) ISTOP = MIN( ISTOP, I2-2 ) ISTOP = MAX( ISTOP, M ) NBULGE = ( I-1-ISTOP ) / HBL * * Do not exceed maximum determined. * NBULGE = MIN( NBULGE, JBLK ) IF( NBULGE.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * NBULGE = NBULGE - MOD( NBULGE, LCMRC ) END IF NBULGE = MAX( NBULGE, 1 ) * * If we are starting in the middle because of consecutive small * subdiagonal elements, we need to see how many bulges we * can send through without breaking the consecutive small * subdiagonal property. * IF( ( NBULGE.GT.1 ) .AND. ( M.GT.L ) ) THEN * * Copy a chunk of elements from global A(M-1:,M-1:) * CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ITMP1, ITMP2 ) II = MIN( 4*NBULGE+2, N-M+2 ) CALL PZLACP3( II, M-1, A, DESCA, WORK( IRBUF+1 ), II, ITMP1, $ ITMP2, 0 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN * * Find a new NBULGE based on the bulges we have. * CALL ZLAMSH( S1, 2*IBLK, NBULGE, JBLK, WORK( IRBUF+1 ), $ II, II, ULP ) IF( NUM.GT.1 ) THEN CALL IGEBS2D( CONTXT, 'ALL', ' ', 1, 1, NBULGE, 1 ) END IF ELSE * * Everyone needs to receive the new NBULGE * CALL IGEBR2D( CONTXT, 'ALL', ' ', 1, 1, NBULGE, 1, ITMP1, $ ITMP2 ) END IF END IF * * IBULGE is the number of bulges going so far * IBULGE = 1 * * "A" row defs : main row transforms from LOCALK to LOCALI2 * CALL INFOG1L( M, HBL, NPCOL, MYCOL, JAFIRST, ITMP1, LOCALK ) LOCALK = NQ CALL INFOG1L( 1, HBL, NPCOL, MYCOL, JAFIRST, ICOL1, LOCALI2 ) LOCALI2 = NUMROC( I2, HBL, MYCOL, JAFIRST, NPCOL ) * * "A" col defs : main col transforms from LOCALI1 to LOCALM * CALL INFOG1L( I1, HBL, NPROW, MYROW, IAFIRST, LOCALI1, ICOL1 ) CALL INFOG1L( 1, HBL, NPROW, MYROW, IAFIRST, LOCALM, ICOL1 ) ICOL1 = NUMROC( MIN( M+3, I ), HBL, MYROW, IAFIRST, NPROW ) * * Which row & column will start the bulges * ISTARTROW = MOD( ( M+1 ) / HBL, NPROW ) + IAFIRST ISTARTCOL = MOD( ( M+1 ) / HBL, NPCOL ) + JAFIRST * CALL INFOG1L( M, HBL, NPROW, MYROW, IAFIRST, II, ITMP2 ) CALL INFOG1L( M, HBL, NPCOL, MYCOL, JAFIRST, JJ, ITMP2 ) CALL INFOG1L( 1, HBL, NPROW, MYROW, IAFIRST, ISTOP, $ KP2ROW( 1 ) ) KP2ROW( 1 ) = NUMROC( M+2, HBL, MYROW, IAFIRST, NPROW ) CALL INFOG1L( 1, HBL, NPCOL, MYCOL, JAFIRST, ISTOP, $ KP2COL( 1 ) ) KP2COL( 1 ) = NUMROC( M+2, HBL, MYCOL, JAFIRST, NPCOL ) * * Set all values for bulges. All bulges are stored in * intermediate steps as loops over KI. Their current "task" * over the global M to I-1 values is always K1(KI) to K2(KI). * However, because there are many bulges, K1(KI) & K2(KI) might * go past that range while later bulges (KI+1,KI+2,etc..) are * finishing up. Even if ROTN=1, in order to minimize border * communication sometimes K1(KI)=HBL-2 & K2(KI)=HBL-1 so both * border messages can be handled at once. * * Rules: * If MOD(K1(KI)-1,HBL) < HBL-2 then MOD(K2(KI)-1,HBL)= 0. * * K (global input) INTEGER * The offset for the reduction. Elements below the k-th * subdiagonal in the first NB columns are reduced to zero. * * NB (global input) INTEGER * The number of columns to be reduced. * * A (local input/local output) COMPLEX*16 pointer into * the local memory to an array of dimension (LLD_A, * LOCc(JA+N-K)). On entry, this array contains the the local * pieces of the N-by-(N-K+1) general distributed matrix * A(IA:IA+N-1,JA:JA+N-K). On exit, the elements on and above * the k-th subdiagonal in the first NB columns are overwritten * with the corresponding elements of the reduced distributed * matrix; the elements below the k-th subdiagonal, with the * array TAU, represent the matrix Q as a product of elementary * reflectors. The other columns of A(IA:IA+N-1,JA:JA+N-K) are * unchanged. See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16 array, dimension LOCc(JA+N-2) * The scalar factors of the elementary reflectors (see Further * Details). TAU is tied to the distributed matrix A. * * T (local output) COMPLEX*16 array, dimension (NB_A,NB_A) * The upper triangular matrix T. * * Y (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_Y,NB_A). On exit, this array * contains the local pieces of the N-by-NB distributed * matrix Y. LLD_Y >= LOCr(IA+N-1). * * IY (global input) INTEGER * The row index in the global array Y indicating the first * row of sub( Y ). * * JY (global input) INTEGER * The column index in the global array Y indicating the * first column of sub( Y ). * * DESCY (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Y. * * WORK (local workspace) COMPLEX*16 array, dimension (NB) * * Further Details * =============== * * The matrix Q is represented as a product of nb elementary reflectors * * Q = H(1) H(2) . . . H(nb). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in * A(ia+i+k:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The elements of the vectors v together form the (n-k+1)-by-nb matrix * V which is needed, with T and Y, to apply the transformation to the * unreduced part of the matrix, using an update of the form: * A(ia:ia+n-1,ja:ja+n-k) := (I-V*T*V')*(A(ia:ia+n-1,ja:ja+n-k)-Y*V'). * * The contents of A(ia:ia+n-1,ja:ja+n-k) on exit are illustrated by the * following example with n = 7, k = 3 and nb = 2: * * ( a h a a a ) * ( a h a a a ) * ( a h a a a ) * ( h h a a a ) * ( v1 h a a a ) * ( v1 v2 a a a ) * ( v1 v2 a a a ) * * where a denotes an element of the original matrix * A(ia:ia+n-1,ja:ja+n-k), h denotes a modified element of the upper * Hessenberg matrix H, and vi denotes an element of the vector * defining H(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL IPROC INTEGER I, IACOL, IAROW, ICTXT, IOFF, II, J, JJ, JL, $ JT, JW, L, MYROW, MYCOL, NPCOL, NPROW, NQ COMPLEX*16 EI * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PZELSET, $ PZGEMV, PZLACGV, PZLARFG, PZSCAL, $ ZAXPY, ZCOPY, ZSCAL, ZTRMV * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.1 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA+K, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) * IPROC = ( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) NQ = NUMROC( N+JA-1, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - IOFF * EI = ZERO JW = IOFF + 1 CALL DESCSET( DESCW, 1, DESCA( MB_ ), 1, DESCA( MB_ ), IAROW, $ IACOL, ICTXT, 1 ) * DO 10 L = 1, NB I = IA + K + L - 2 J = JA + L - 1 * IF( L.GT.1 ) THEN * * Update A(ia:ia+n-1,j) * * Compute i-th column of A - Y * V' * CALL PZLACGV( L-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZGEMV( 'No transpose', N, L-1, -ONE, Y, IY, JY, DESCY, $ A, I, JA, DESCA, DESCA( M_ ), ONE, A, IA, J, $ DESCA, 1 ) CALL PZLACGV( L-1, A, I, JA, DESCA, DESCA( M_ ) ) * * Apply I - V * T' * V' to this column (call it b) from the * left, using the last column of T as workspace * * Let V = ( V1 ) and b = ( b1 ) (first I-1 rows) * ( V2 ) ( b2 ) * * where V1 is unit lower triangular * * w := V1' * b1 * IF( IPROC ) THEN CALL ZCOPY( L-1, A( (JJ+L-2)*DESCA( LLD_ )+II ), 1, $ WORK( JW ), 1 ) CALL ZTRMV( 'Lower', 'Conjugate transpose', 'Unit', L-1, $ A( (JJ-1)*DESCA( LLD_ )+II ), DESCA( LLD_ ), $ WORK( JW ), 1 ) END IF * * w := w + V2'*b2 * CALL PZGEMV( 'Conjugate transpose', N-K-L+1, L-1, ONE, A, $ I+1, JA, DESCA, A, I+1, J, DESCA, 1, ONE, WORK, $ 1, JW, DESCW, DESCW( M_ ) ) * * w := T'*w * IF( IPROC ) $ CALL ZTRMV( 'Upper', 'Conjugate transpose', 'Non-unit', $ L-1, T, DESCA( NB_ ), WORK( JW ), 1 ) * * b2 := b2 - V2*w * CALL PZGEMV( 'No transpose', N-K-L+1, L-1, -ONE, A, I+1, JA, $ DESCA, WORK, 1, JW, DESCW, DESCW( M_ ), ONE, $ A, I+1, J, DESCA, 1 ) * * b1 := b1 - V1*w * IF( IPROC ) THEN CALL ZTRMV( 'Lower', 'No transpose', 'Unit', L-1, $ A( (JJ-1)*DESCA( LLD_ )+II ), DESCA( LLD_ ), $ WORK( JW ), 1 ) CALL ZAXPY( L-1, -ONE, WORK( JW ), 1, $ A( ( JJ+L-2 )*DESCA( LLD_ )+II ), 1 ) END IF CALL PZELSET( A, I, J-1, DESCA, EI ) END IF * * Generate the elementary reflector H(i) to annihilate * A(ia+k+i:ia+n-1,j) * CALL PZLARFG( N-K-L+1, EI, I+1, J, A, MIN( I+2, N+IA-1 ), J, $ DESCA, 1, TAU ) CALL PZELSET( A, I+1, J, DESCA, ONE ) * * Compute Y(iy:y+n-1,jy+l-1) * CALL PZGEMV( 'No transpose', N, N-K-L+1, ONE, A, IA, J+1, $ DESCA, A, I+1, J, DESCA, 1, ZERO, Y, IY, JY+L-1, $ DESCY, 1 ) CALL PZGEMV( 'Conjugate transpose', N-K-L+1, L-1, ONE, A, I+1, $ JA, DESCA, A, I+1, J, DESCA, 1, ZERO, WORK, 1, JW, $ DESCW, DESCW( M_ ) ) CALL PZGEMV( 'No transpose', N, L-1, -ONE, Y, IY, JY, DESCY, $ WORK, 1, JW, DESCW, DESCW( M_ ), ONE, Y, IY, $ JY+L-1, DESCY, 1 ) JL = MIN( JJ+L-1, JA+NQ-1 ) CALL PZSCAL( N, TAU( JL ), Y, IY, JY+L-1, DESCY, 1 ) * * Compute T(1:i,i) * IF( IPROC ) THEN JT = ( L-1 ) * DESCA( NB_ ) CALL ZSCAL( L-1, -TAU( JL ), WORK( JW ), 1 ) CALL ZCOPY( L-1, WORK( JW ), 1, T( JT+1 ), 1 ) CALL ZTRMV( 'Upper', 'No transpose', 'Non-unit', L-1, T, $ DESCA( NB_ ), T( JT+1 ), 1 ) T( JT+L ) = TAU( JL ) END IF 10 CONTINUE * CALL PZELSET( A, K+NB+IA-1, J, DESCA, EI ) * RETURN * * End of PZLAHRD * END scalapack-2.0.2/SRC/pzlamr1d.f000644 000766 000024 00000010671 10363532303 016244 0ustar00juliestaff000000 000000 SUBROUTINE PZLAMR1D( N, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * October 15, 1999 * * .. Scalar Arguments .. INTEGER IA, IB, JA, JB, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ) * .. * * Bugs * ==== * * I am not sure that this works correctly when IB and JB are not equal * to 1. Indeed, I suspect that IB should always be set to 1 or ignored * with 1 used in its place. * * PZLAMR1D has not been tested except withint the contect of * PZHEPTRD, the prototype reduction to tridiagonal form code. * * Purpose * * ======= * * PZLAMR1D redistributes a one-dimensional row vector from one data * decomposition to another. * * This is an auxiliary routine called by PZHETRD to redistribute D, E * and TAU. * * Notes * ===== * * Although all processes call PZGEMR2D, only the processes that own * the first column of A send data and only processes that own the * first column of B receive data. The calls to ZGEBS2D/ZGEBR2D * spread the data down. * * Arguments * ========= * * N (global input) INTEGER * The size of the matrix to be transposed. * * A (local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LOCc(JA+N-1)). * On output, A is replicated across all processes in * this processor column. * * IA (global input) INTEGER * A's global row index, which points to the beginning of * the submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LOCc(JB+N-1)). * * IB (global input) INTEGER * B's global row index, NOT USED * * JB (global input) INTEGER * B's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * WORK (local workspace) COMPLEX*16 array, dimension ( LWORK ) * * LWORK (local input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCAA( DLEN_ ), DESCBB( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PZGEMR2D, ZGEBR2D, ZGEBS2D * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Executable Statements .. * This is just to keep ftnchek and toolpack/1 happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * * Quick return if possible * IF( N.LE.0 ) $ RETURN * DO 10 I = 1, DLEN_ DESCAA( I ) = DESCA( I ) DESCBB( I ) = DESCB( I ) 10 CONTINUE * DESCAA( M_ ) = 1 DESCBB( M_ ) = 1 DESCAA( LLD_ ) = 1 DESCBB( LLD_ ) = 1 * ICTXT = DESCB( CTXT_ ) CALL PZGEMR2D( 1, N, A, IA, JA, DESCAA, B, IB, JB, DESCBB, ICTXT ) * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NQ = NUMROC( N, DESCB( NB_ ), MYCOL, 0, NPCOL ) * IF( MYROW.EQ.0 ) THEN CALL ZGEBS2D( ICTXT, 'C', ' ', NQ, 1, B, NQ ) ELSE CALL ZGEBR2D( ICTXT, 'C', ' ', NQ, 1, B, NQ, 0, MYCOL ) END IF * RETURN * * End of PZLAMR1D * END scalapack-2.0.2/SRC/pzlange.f000644 000766 000024 00000026666 10363532303 016165 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PZLANGE( NORM, M, N, A, IA, JA, DESCA, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION WORK( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLANGE returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * distributed matrix sub( A ) = A(IA:IA+M-1, JA:JA+N-1). * * PZLANGE returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+M-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PZLANGE as described * above. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). When M = 0, PZLANGE * is set to zero. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). When N = 0, * PZLANGE is set to zero. N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the distributed matrix sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, ICOFF, IOFFA, $ IROFF, J, JJ, LDA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ DOUBLE PRECISION SCALE, SUM, VALUE * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOMBSSQ, DGEBR2D, $ DGEBS2D, DGAMX2D, DGSUM2D, INFOG2L, $ PDTREECOMB, ZLASSQ * .. * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX, NUMROC EXTERNAL LSAME, IDAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * IF( MIN( M, N ).EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO IF( NQ.GT.0 .AND. MP.GT.0 ) THEN IOFFA = (JJ-1)*LDA DO 20 J = JJ, JJ+NQ-1 DO 10 I = II, MP+II-1 VALUE = MAX( VALUE, ABS( A( IOFFA+I ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE END IF CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, J, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * * Find norm1( sub( A ) ). * IF( NQ.GT.0 ) THEN IOFFA = ( JJ - 1 ) * LDA DO 40 J = JJ, JJ+NQ-1 SUM = ZERO IF( MP.GT.0 ) THEN DO 30 I = II, MP+II-1 SUM = SUM + ABS( A( IOFFA+I ) ) 30 CONTINUE END IF IOFFA = IOFFA + LDA WORK( J-JJ+1 ) = SUM 40 CONTINUE END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( IDAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, J, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * * Find normI( sub( A ) ). * IF( MP.GT.0 ) THEN IOFFA = II + ( JJ - 1 ) * LDA DO 60 I = II, II+MP-1 SUM = ZERO IF( NQ.GT.0 ) THEN DO 50 J = IOFFA, IOFFA + NQ*LDA - 1, LDA SUM = SUM + ABS( A( J ) ) 50 CONTINUE END IF WORK( I-II+1 ) = SUM IOFFA = IOFFA + 1 60 CONTINUE END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, WORK, MAX( 1, MP ), $ MYROW, 0 ) * * Find maximum sum of rows for supnorm * IF( MYCOL.EQ.0 ) THEN IF( MP.GT.0 ) THEN VALUE = WORK( IDAMAX( MP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, I, $ J, -1, 0, 0 ) END IF * ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE IOFFA = II + ( JJ - 1 ) * LDA IF( NQ.GT.0 ) THEN DO 70 J = IOFFA, IOFFA + NQ*LDA - 1, LDA CALL ZLASSQ( MP, A( J ), 1, SCALE, SUM ) 70 CONTINUE END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PDTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, DCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PZLANGE = VALUE * RETURN * * End of PZLANGE * END scalapack-2.0.2/SRC/pzlanhe.f000644 000766 000024 00000102511 10363532303 016146 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PZLANHE( NORM, UPLO, N, A, IA, JA, $ DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION WORK( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLANHE returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * complex hermitian distributed matrix sub(A) = A(IA:IA+N-1,JA:JA+N-1). * * PZLANHE returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PZLANHE as described * above. * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * hermitian matrix sub( A ) is to be referenced. * = 'U': Upper triangular part of sub( A ) is referenced, * = 'L': Lower triangular part of sub( A ) is referenced. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PZLANHE is set to zero. N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the hermitian distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular matrix which norm is * to be computed, and the strictly lower triangular part of * this matrix is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular matrix which norm is to be computed, and the * strictly upper triangular part of sub( A ) is not referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * 2*Nq0+Np0+LDW if NORM = '1', 'O', 'o', 'I' or 'i', * where LDW is given by: * IF( NPROW.NE.NPCOL ) THEN * LDW = MB_A*CEIL(CEIL(Np0/MB_A)/(LCM/NPROW)) * ELSE * LDW = 0 * END IF * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * * where LCM is the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) and CEIL denotes the ceiling * operation (ICEIL). * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * ICEIL, ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IAROW, IACOL, IB, ICOFF, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, IROFF, ICSR, ICSR0, $ IOFFA, IRSC, IRSC0, IRSR, IRSR0, JJ, JJA, K, $ LDA, LL, MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION ABSA, SCALE, SUM, VALUE * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DAXPY, DCOMBSSQ, $ DGAMX2D, DGSUM2D, DGEBR2D, $ DGEBS2D, PDCOL2ROW, PDTREECOMB, $ ZLASSQ * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, IDAMAX, NUMROC EXTERNAL ICEIL, IDAMAX, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters and local indexes. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) ICSR = 1 IRSR = ICSR + NQ IRSC = IRSR + NQ IF( MYROW.EQ.IAROW ) THEN IRSC0 = IRSC + IROFF NP = NP - IROFF ELSE IRSC0 = IRSC END IF IF( MYCOL.EQ.IACOL ) THEN ICSR0 = ICSR + ICOFF IRSR0 = IRSR + ICOFF NQ = NQ - ICOFF ELSE ICSR0 = ICSR IRSR0 = IRSR END IF IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+N-1 ) LDA = DESCA( LLD_ ) * * If the matrix is Hermitian, we address only a triangular portion * of the matrix. A sum of row (column) i of the complete matrix * can be obtained by adding along row i and column i of the the * triangular matrix, stopping/starting at the diagonal, which is * the point of reflection. The pictures below demonstrate this. * In the following code, the row sums created by --- rows below are * refered to as ROWSUMS, and the column sums shown by | are refered * to as COLSUMS. Infinity-norm = 1-norm = ROWSUMS+COLSUMS. * * UPLO = 'U' UPLO = 'L' * ____i______ ___________ * |\ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \|------| i i|---\ | * | \ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * |__________\| |___|______\| * i * * II, JJ : local indices into array A * ICURROW : process row containing diagonal block * ICURCOL : process column containing diagonal block * IRSC0 : pointer to part of work used to store the ROWSUMS while * they are stored along a process column * IRSR0 : pointer to part of work used to store the ROWSUMS after * they have been transposed to be along a process row * II = IIA JJ = JJA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 20 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 10 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 10 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 20 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 40 K = II, II+IB-1 IF( MYCOL.EQ.IACOL ) THEN IF( JJ.LE.JJA+NQ-1 ) THEN VALUE = MAX( VALUE, $ ABS( DBLE( A( K+(JJ-1)*LDA ) ) ) ) DO 30 LL = JJ*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 30 CONTINUE END IF ELSE IF( JJ.LE.JJA+NQ-1 ) THEN DO 35 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 35 CONTINUE END IF END IF IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 40 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining rows/columns of the matrix. * DO 90 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 60 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 50 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 50 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 60 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 80 K = II, II+IB-1 IF( MYCOL.EQ.ICURCOL ) THEN IF( JJ.LE.JJA+NQ-1 ) THEN VALUE = MAX( VALUE, $ ABS( DBLE( A( K+(JJ-1)*LDA ) ) ) ) DO 70 LL = JJ*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 70 CONTINUE END IF ELSE IF( JJ.LE.JJA+NQ-1 ) THEN DO 75 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 75 CONTINUE END IF END IF IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 80 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 90 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 110 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( MYROW.EQ.IAROW ) THEN IF( II.LE.IIA+NP-1 ) THEN VALUE = MAX( VALUE, ABS( DBLE( A( II+K ) ) ) ) DO 100 LL = II+1, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 100 CONTINUE END IF ELSE IF( II.LE.IIA+NP-1 ) THEN DO 105 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 105 CONTINUE END IF END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 110 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 130 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 120 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 120 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 130 CONTINUE ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 180 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 150 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( MYROW.EQ.ICURROW ) THEN IF( II.LE.IIA+NP-1 ) THEN VALUE = MAX( VALUE, $ ABS( DBLE( A( II+K ) ) ) ) DO 140 LL = II+1, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 140 CONTINUE END IF ELSE IF( II.LE.IIA+NP-1 ) THEN DO 145 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 145 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 150 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 170 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 160 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 160 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 170 CONTINUE ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * END IF * * Gather the result on process (IAROW,IACOL). * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, K, -1, $ IAROW, IACOL ) * ELSE IF( LSAME( NORM, 'I' ) .OR. LSAME( NORM, 'O' ) .OR. $ NORM.EQ.'1' ) THEN * * Find normI( sub( A ) ) ( = norm1( sub( A ) ), since sub( A ) is * hermitian). * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 200 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 190 LL = IIA, II-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 190 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 200 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 220 K = II, II+IB-1 SUM = ZERO IF( MYCOL.EQ.IACOL ) THEN IF( JJA+NQ.GT.JJ ) THEN SUM = ABS( DBLE( A( K+(JJ-1)*LDA ) ) ) DO 210 LL = JJ*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 210 CONTINUE END IF ELSE IF( JJA+NQ.GT.JJ ) THEN DO 215 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 215 CONTINUE END IF END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 220 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining rows/columns of global matrix. * DO 270 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 240 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 230 LL = IIA, II-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 230 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 240 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 260 K = II, II+IB-1 SUM = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( JJA+NQ.GT.JJ ) THEN SUM = ABS( DBLE( A( K+(JJ-1)*LDA ) ) ) DO 250 LL = JJ*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 250 CONTINUE END IF ELSE IF( JJA+NQ.GT.JJ ) THEN DO 255 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 255 CONTINUE END IF END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 260 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 270 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = (JJ-1)*LDA DO 290 K = 0, IB-1 SUM = ZERO IF( MYROW.EQ.IAROW ) THEN IF( IIA+NP.GT.II ) THEN SUM = ABS( DBLE( A( IOFFA+II ) ) ) DO 280 LL = II+1, IIA+NP-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 280 CONTINUE END IF ELSE DO 285 LL = II, IIA+NP-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 285 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 290 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 310 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 300 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 300 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 310 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 360 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 330 K = 0, IB-1 SUM = ZERO IF( MYROW.EQ.ICURROW ) THEN IF( IIA+NP.GT.II ) THEN SUM = ABS( DBLE( A( II+IOFFA ) ) ) DO 320 LL = II+1, IIA+NP-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 320 CONTINUE ELSE IF( II.EQ.IIA+NP-1 ) THEN SUM = ABS( DBLE( A( II+IOFFA ) ) ) END IF ELSE DO 325 LL = II, IIA+NP-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 325 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 330 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 350 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 340 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 340 CONTINUE END IF WORK(K-IIA+IRSC0) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 350 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 360 CONTINUE END IF * * After calls to DGSUM2D, process row 0 will have global * COLSUMS and process column 0 will have global ROWSUMS. * Transpose ROWSUMS and add to COLSUMS to get global row/column * sum, the max of which is the infinity or 1 norm. * IF( MYCOL.EQ.IACOL ) $ NQ = NQ + ICOFF CALL DGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK( ICSR ), 1, $ IAROW, MYCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP + IROFF CALL DGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IRSC ), $ MAX( 1, NP ), MYROW, IACOL ) * CALL PDCOL2ROW( ICTXT, N, 1, DESCA( MB_ ), WORK( IRSC ), $ MAX( 1, NP ), WORK( IRSR ), MAX( 1, NQ ), $ IAROW, IACOL, IAROW, IACOL, WORK( IRSC+NP ) ) * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF CALL DAXPY( NQ, ONE, WORK( IRSR0 ), 1, WORK( ICSR0 ), 1 ) IF( NQ.LT.1 ) THEN VALUE = ZERO ELSE VALUE = WORK( IDAMAX( NQ, WORK( ICSR0 ), 1 ) ) END IF CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, K, $ -1, IAROW, IACOL ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE * * Add off-diagonal entries, first * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 370 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL ZLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) CALL ZLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) THEN IF( DBLE( A( II+K ) ).NE.ZERO ) THEN ABSA = ABS( DBLE( A( II+K ) ) ) IF( SCALE.LT.ABSA ) THEN SUM = ONE + SUM * ( SCALE / ABSA )**2 SCALE = ABSA ELSE SUM = SUM + ( ABSA / SCALE )**2 END IF END IF II = II + 1 END IF 370 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 390 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 380 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL ZLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) CALL ZLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) THEN IF( DBLE( A( II+K ) ).NE.ZERO ) THEN ABSA = ABS( DBLE( A( II+K ) ) ) IF( SCALE.LT.ABSA ) THEN SUM = ONE + SUM * ( SCALE / ABSA )**2 SCALE = ABSA ELSE SUM = SUM + ( ABSA / SCALE )**2 END IF END IF II = II + 1 END IF 380 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 400 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( MYROW.EQ.IAROW ) THEN IF( DBLE( A( II+K ) ).NE.ZERO ) THEN ABSA = ABS( DBLE( A( II+K ) ) ) IF( SCALE.LT.ABSA ) THEN SUM = ONE + SUM * ( SCALE / ABSA )**2 SCALE = ABSA ELSE SUM = SUM + ( ABSA / SCALE )**2 END IF END IF II = II + 1 END IF CALL ZLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) CALL ZLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 400 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 420 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 410 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( MYROW.EQ.ICURROW ) THEN IF( DBLE( A( II+K ) ).NE.ZERO ) THEN ABSA = ABS( DBLE( A( II+K ) ) ) IF( SCALE.LT.ABSA ) THEN SUM = ONE + SUM * ( SCALE / ABSA )**2 SCALE = ABSA ELSE SUM = SUM + ( ABSA / SCALE )**2 END IF END IF II = II + 1 END IF CALL ZLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) CALL ZLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 410 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 420 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM * CALL PDTREECOMB( ICTXT, 'All', 2, RWORK, IAROW, IACOL, $ DCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to the other processes * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, IAROW, $ IACOL ) END IF * PZLANHE = VALUE * RETURN * * End of PZLANHE * END scalapack-2.0.2/SRC/pzlanhs.f000644 000766 000024 00000062525 10363532303 016176 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PZLANHS( NORM, N, A, IA, JA, DESCA, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION WORK( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLANHS returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * Hessenberg distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * PZLANHS returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PZLANHS as described * above. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PZLANHS is set to zero. N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, II, IIA, ICOFF, INXTROW, $ IOFFA, IROFF, J, JB, JJ, JJA, JN, KK, LDA, LL, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION SCALE, SUM, VALUE * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOMBSSQ, DGEBR2D, $ DGEBS2D, DGAMX2D, DGSUM2D, $ INFOG2L, PDTREECOMB, ZLASSQ * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, IDAMAX, NUMROC EXTERNAL LSAME, ICEIL, IDAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) IOFFA = ( JJA - 1 ) * LDA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * VALUE = ZERO * * Find max(abs(A(i,j))). * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 20 LL = JJ, JJ+JB-1 DO 10 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 50 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 40 LL = JJ, JJ+JB-1 DO 30 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 50 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 70 LL = JJ, JJ + JB -1 DO 60 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 60 CONTINUE IOFFA = IOFFA + LDA 70 CONTINUE ELSE DO 90 LL = JJ, JJ+JB-1 DO 80 KK = IIA, MIN( II-1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ VALUE = MAX( VALUE, ABS( A( II+(JJ+JB-2)*LDA ) ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 140 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 110 LL = JJ, JJ + JB -1 DO 100 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE ELSE DO 130 LL = JJ, JJ + JB -1 DO 120 KK = IIA, MIN( II-1, IIA+NP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 120 CONTINUE IOFFA = IOFFA + LDA 130 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ VALUE = MAX( VALUE, $ ABS( A( II+(JJ+JB-2)*LDA ) ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 140 CONTINUE * END IF * * Gather the intermediate results to process (0,0). * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, KK, LL, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * VALUE = ZERO II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 160 LL = JJ, JJ+JB-1 SUM = ZERO DO 150 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 150 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 160 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 190 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 180 LL = JJ, JJ+JB-1 SUM = ZERO DO 170 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 170 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 180 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 190 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 210 LL = JJ, JJ + JB -1 SUM = ZERO DO 200 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 200 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 210 CONTINUE ELSE DO 230 LL = JJ, JJ + JB -1 SUM = ZERO DO 220 KK = IIA, MIN( II-1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 220 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 230 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( JJ+JB-JJA ) = WORK( JJ+JB-JJA ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 280 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 250 LL = JJ, JJ + JB -1 SUM = ZERO DO 240 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 240 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 250 CONTINUE ELSE DO 270 LL = JJ, JJ + JB -1 SUM = ZERO DO 260 KK = IIA, MIN( II-1, IIA+NP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 260 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 270 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( JJ+JB-JJA ) = WORK( JJ+JB-JJA ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 280 CONTINUE * END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( IDAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, KK, LL, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * DO 290 KK = IIA, IIA+NP-1 WORK( KK ) = ZERO 290 CONTINUE * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 310 LL = JJ, JJ+JB-1 DO 300 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 300 CONTINUE IOFFA = IOFFA + LDA 310 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 340 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 330 LL = JJ, JJ+JB-1 DO 320 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 320 CONTINUE IOFFA = IOFFA + LDA 330 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 340 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 360 LL = JJ, JJ + JB -1 DO 350 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 350 CONTINUE IOFFA = IOFFA + LDA 360 CONTINUE ELSE DO 380 LL = JJ, JJ + JB -1 DO 370 KK = IIA, MIN( II-1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 370 CONTINUE IOFFA = IOFFA + LDA 380 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( II-IIA+1 ) = WORK( II-IIA+1 ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 430 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 400 LL = JJ, JJ + JB -1 DO 390 KK = IIA, MIN( II+LL-JJ+1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 390 CONTINUE IOFFA = IOFFA + LDA 400 CONTINUE ELSE DO 420 LL = JJ, JJ + JB -1 DO 410 KK = IIA, MIN( II-1, IIA+NP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS(A(IOFFA+KK)) 410 CONTINUE IOFFA = IOFFA + LDA 420 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ WORK( II-IIA+1 ) = WORK( II-IIA+1 ) + $ ABS( A( II+(JJ+JB-2)*LDA ) ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 430 CONTINUE * END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK, MAX( 1, NP ), $ MYROW, 0 ) * * Find maximum sum of rows for Infinity-norm * IF( MYCOL.EQ.0 ) THEN IF( NP.GT.0 ) THEN VALUE = WORK( IDAMAX( NP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, KK, $ LL, -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * SCALE = ZERO SUM = ONE II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.IACOL ) THEN DO 440 LL = JJ, JJ+JB-1 CALL ZLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 440 CONTINUE JJ = JJ + JB END IF * IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 460 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN DO 450 LL = JJ, JJ+JB-1 CALL ZLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 450 CONTINUE JJ = JJ + JB END IF * II = II + JB IACOL = MOD( IACOL+1, NPCOL ) * 460 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( IAROW+1, NPROW ) IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 470 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 470 CONTINUE ELSE DO 480 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II-1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 480 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ CALL ZLASSQ( 1, A( II+(JJ+JB-2)*LDA ), 1, $ SCALE, SUM ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 510 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 490 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II+LL-JJ+1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 490 CONTINUE ELSE DO 500 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II-1, IIA+NP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 500 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+NP-1 ) $ CALL ZLASSQ( 1, A( II+(JJ+JB-2)*LDA ), 1, $ SCALE, SUM ) END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = INXTROW IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 510 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PDTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, DCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PZLANHS = VALUE * RETURN * * End of PZLANHS * END scalapack-2.0.2/SRC/pzlansy.f000644 000766 000024 00000070330 10363532303 016210 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PZLANSY( NORM, UPLO, N, A, IA, JA, $ DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION WORK( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLANSY returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * real symmetric distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * PZLANSY returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with IA <= i <= IA+N-1, * ( and JA <= j <= JA+N-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PZLANSY as described * above. * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric matrix sub( A ) is to be referenced. * = 'U': Upper triangular part of sub( A ) is referenced, * = 'L': Lower triangular part of sub( A ) is referenced. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * number of rows and columns of the distributed submatrix * sub( A ). When N = 0, PZLANSY is set to zero. N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1)) containing the * local pieces of the symmetric distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular matrix which norm is * to be computed, and the strictly lower triangular part of * this matrix is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular matrix which norm is to be computed, and the * strictly upper triangular part of sub( A ) is not referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * 2*Nq0+Np0+LDW if NORM = '1', 'O', 'o', 'I' or 'i', * where LDW is given by: * IF( NPROW.NE.NPCOL ) THEN * LDW = MB_A*CEIL(CEIL(Np0/MB_A)/(LCM/NPROW)) * ELSE * LDW = 0 * END IF * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * * where LCM is the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) and CEIL denotes the ceiling * operation (ICEIL). * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Np0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * ICEIL, ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IAROW, IACOL, IB, ICOFF, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, IROFF, ICSR, ICSR0, $ IOFFA, IRSC, IRSC0, IRSR, IRSR0, JJ, JJA, K, $ LDA, LL, MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION SCALE, SUM, VALUE * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DAXPY, DCOMBSSQ, $ DGAMX2D, DGSUM2D, DGEBR2D, $ DGEBS2D, PDCOL2ROW, PDTREECOMB, $ ZLASSQ * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, IDAMAX, NUMROC EXTERNAL ICEIL, IDAMAX, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters and local indexes. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) * IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) ICSR = 1 IRSR = ICSR + NQ IRSC = IRSR + NQ IF( MYROW.EQ.IAROW ) THEN IRSC0 = IRSC + IROFF NP = NP - IROFF ELSE IRSC0 = IRSC END IF IF( MYCOL.EQ.IACOL ) THEN ICSR0 = ICSR + ICOFF IRSR0 = IRSR + ICOFF NQ = NQ - ICOFF ELSE ICSR0 = ICSR IRSR0 = IRSR END IF IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+N-1 ) LDA = DESCA( LLD_ ) * * If the matrix is symmetric, we address only a triangular portion * of the matrix. A sum of row (column) i of the complete matrix * can be obtained by adding along row i and column i of the the * triangular matrix, stopping/starting at the diagonal, which is * the point of reflection. The pictures below demonstrate this. * In the following code, the row sums created by --- rows below are * refered to as ROWSUMS, and the column sums shown by | are refered * to as COLSUMS. Infinity-norm = 1-norm = ROWSUMS+COLSUMS. * * UPLO = 'U' UPLO = 'L' * ____i______ ___________ * |\ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \|------| i i|---\ | * | \ | | |\ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * | \ | | | \ | * |__________\| |___|______\| * i * * II, JJ : local indices into array A * ICURROW : process row containing diagonal block * ICURCOL : process column containing diagonal block * IRSC0 : pointer to part of work used to store the ROWSUMS while * they are stored along a process column * IRSR0 : pointer to part of work used to store the ROWSUMS after * they have been transposed to be along a process row * II = IIA JJ = JJA * IF( N.EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 20 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 10 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 10 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 20 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 40 K = II, II+IB-1 IF( JJ.LE.JJA+NQ-1 ) THEN DO 30 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 30 CONTINUE END IF IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 40 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over the remaining rows/columns of the matrix. * DO 90 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 60 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.GT.IIA ) THEN DO 50 LL = IIA, II-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 50 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 60 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 80 K = II, II+IB-1 IF( JJ.LE.JJA+NQ-1 ) THEN DO 70 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( K+LL ) ) ) 70 CONTINUE END IF IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 80 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 90 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLMAXS * IF( MYCOL.EQ.IACOL ) THEN DO 110 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.LE.IIA+NP-1 ) THEN DO 100 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 100 CONTINUE END IF IF( MYROW.EQ.IAROW ) $ II = II + 1 110 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.IAROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.IAROW ) THEN DO 130 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 120 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 120 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 130 CONTINUE ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 180 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLMAXS * IF( MYCOL.EQ.ICURCOL ) THEN DO 150 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA IF( II.LE.IIA+NP-1 ) THEN DO 140 LL = II, IIA+NP-1 VALUE = MAX( VALUE, ABS( A( LL+K ) ) ) 140 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + 1 150 CONTINUE * * Reset local indices so we can find ROWMAXS * IF( MYROW.EQ.ICURROW ) $ II = II - IB END IF * * Find ROWMAXS * IF( MYROW.EQ.ICURROW ) THEN DO 170 K = 0, IB-1 IF( JJ.GT.JJA ) THEN DO 160 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA VALUE = MAX( VALUE, ABS( A( II+LL ) ) ) 160 CONTINUE END IF II = II + 1 IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 170 CONTINUE ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * END IF * * Gather the result on process (IAROW,IACOL). * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, I, K, -1, $ IAROW, IACOL ) * ELSE IF( LSAME( NORM, 'I' ) .OR. LSAME( NORM, 'O' ) .OR. $ NORM.EQ.'1' ) THEN * * Find normI( sub( A ) ) ( = norm1( sub( A ) ), since sub( A ) is * symmetric). * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 200 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 190 LL = IIA, II-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 190 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 200 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 220 K = II, II+IB-1 SUM = ZERO IF( JJA+NQ.GT.JJ ) THEN DO 210 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 210 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 220 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining rows/columns of global matrix. * DO 270 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 240 K = 0, IB-1 SUM = ZERO IF( II.GT.IIA ) THEN DO 230 LL = IIA, II-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 230 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 240 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 260 K = II, II+IB-1 SUM = ZERO IF( JJA+NQ.GT.JJ ) THEN DO 250 LL = (JJ-1)*LDA, (JJA+NQ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 250 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 260 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 270 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * * Find COLSUMS * IF( MYCOL.EQ.IACOL ) THEN IOFFA = (JJ-1)*LDA DO 290 K = 0, IB-1 SUM = ZERO IF( IIA+NP.GT.II ) THEN DO 280 LL = II, IIA+NP-1 SUM = SUM + ABS( A( IOFFA+LL ) ) 280 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.IAROW ) $ II = II + 1 290 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.IAROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.IAROW ) THEN DO 310 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 300 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 300 CONTINUE END IF WORK( K-IIA+IRSC0 ) = SUM IF( MYCOL.EQ.IACOL ) $ JJ = JJ + 1 310 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 360 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * * Find COLSUMS * IF( MYCOL.EQ.ICURCOL ) THEN IOFFA = ( JJ - 1 ) * LDA DO 330 K = 0, IB-1 SUM = ZERO IF( IIA+NP.GT.II ) THEN DO 320 LL = II, IIA+NP-1 SUM = SUM + ABS( A( LL+IOFFA ) ) 320 CONTINUE END IF IOFFA = IOFFA + LDA WORK( JJ+K-JJA+ICSR0 ) = SUM IF( MYROW.EQ.ICURROW ) $ II = II + 1 330 CONTINUE * * Reset local indices so we can find ROWSUMS * IF( MYROW.EQ.ICURROW ) $ II = II - IB * END IF * * Find ROWSUMS * IF( MYROW.EQ.ICURROW ) THEN DO 350 K = II, II+IB-1 SUM = ZERO IF( JJ.GT.JJA ) THEN DO 340 LL = (JJA-1)*LDA, (JJ-2)*LDA, LDA SUM = SUM + ABS( A( K+LL ) ) 340 CONTINUE END IF WORK(K-IIA+IRSC0) = SUM IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + 1 350 CONTINUE II = II + IB ELSE IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 360 CONTINUE END IF * * After calls to DGSUM2D, process row 0 will have global * COLSUMS and process column 0 will have global ROWSUMS. * Transpose ROWSUMS and add to COLSUMS to get global row/column * sum, the max of which is the infinity or 1 norm. * IF( MYCOL.EQ.IACOL ) $ NQ = NQ + ICOFF CALL DGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK( ICSR ), 1, $ IAROW, MYCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP + IROFF CALL DGSUM2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IRSC ), $ MAX( 1, NP ), MYROW, IACOL ) * CALL PDCOL2ROW( ICTXT, N, 1, DESCA( MB_ ), WORK( IRSC ), $ MAX( 1, NP ), WORK( IRSR ), MAX( 1, NQ ), $ IAROW, IACOL, IAROW, IACOL, WORK( IRSC+NP ) ) * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF CALL DAXPY( NQ, ONE, WORK( IRSR0 ), 1, WORK( ICSR0 ), 1 ) IF( NQ.LT.1 ) THEN VALUE = ZERO ELSE VALUE = WORK( IDAMAX( NQ, WORK( ICSR0 ), 1 ) ) END IF CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, I, K, $ -1, IAROW, IACOL ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * * Find normF( sub( A ) ). * SCALE = ZERO SUM = ONE * * Add off-diagonal entries, first * IF( LSAME( UPLO, 'U' ) ) THEN * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 370 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL ZLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) $ II = II + 1 CALL ZLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) 370 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 390 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 380 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL ZLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) $ II = II + 1 CALL ZLASSQ( II-IIA, A( IIA+K ), 1, SCALE, SUM ) 380 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Handle first block separately * IB = IN-IA+1 * IF( MYCOL.EQ.IACOL ) THEN DO 400 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL ZLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) IF( MYROW.EQ.IAROW ) $ II = II + 1 CALL ZLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 400 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.IAROW ) THEN II = II + IB END IF * ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over rows/columns of global matrix. * DO 420 I = IN+1, IA+N-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+N-I ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 410 K = (JJ-1)*LDA, (JJ+IB-2)*LDA, LDA CALL ZLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) IF( MYROW.EQ.ICURROW ) $ II = II + 1 CALL ZLASSQ( IIA+NP-II, A( II+K ), 1, SCALE, SUM ) 410 CONTINUE * JJ = JJ + IB ELSE IF( MYROW.EQ.ICURROW ) THEN II = II + IB END IF * ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 420 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM * CALL PDTREECOMB( ICTXT, 'All', 2, RWORK, IAROW, IACOL, $ DCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to the other processes * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, IAROW, $ IACOL ) END IF * PZLANSY = VALUE * RETURN * * End of PZLANSY * END scalapack-2.0.2/SRC/pzlantr.f000644 000766 000024 00000110646 10363532303 016207 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION PZLANTR( NORM, UPLO, DIAG, M, N, A, $ IA, JA, DESCA, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER IA, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION WORK( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLANTR returns the value of the one norm, or the Frobenius norm, * or the infinity norm, or the element of largest absolute value of a * trapezoidal or triangular distributed matrix sub( A ) denoting * A(IA:IA+M-1, JA:JA+N-1). * * PZLANTR returns the value * * ( max(abs(A(i,j))), NORM = 'M' or 'm' with ia <= i <= ia+m-1, * ( and ja <= j <= ja+n-1, * ( * ( norm1( sub( A ) ), NORM = '1', 'O' or 'o' * ( * ( normI( sub( A ) ), NORM = 'I' or 'i' * ( * ( normF( sub( A ) ), NORM = 'F', 'f', 'E' or 'e' * * where norm1 denotes the one norm of a matrix (maximum column sum), * normI denotes the infinity norm of a matrix (maximum row sum) and * normF denotes the Frobenius norm of a matrix (square root of sum of * squares). Note that max(abs(A(i,j))) is not a matrix norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies the value to be returned in PZLANTR as described * above. * * UPLO (global input) CHARACTER * Specifies whether the matrix sub( A ) is upper or lower * trapezoidal. * = 'U': Upper trapezoidal * = 'L': Lower trapezoidal * Note that sub( A ) is triangular instead of trapezoidal * if M = N. * * DIAG (global input) CHARACTER * Specifies whether or not the distributed matrix sub( A ) has * unit diagonal. * = 'N': Non-unit diagonal * = 'U': Unit diagonal * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). When M = 0, PZLANTR is * set to zero. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). When N = 0, * PZLANTR is set to zero. N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A, LOCc(JA+N-1) ) containing * the local pieces of sub( A ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * WORK (local workspace) DOUBLE PRECISION array dimension (LWORK) * LWORK >= 0 if NORM = 'M' or 'm' (not referenced), * Nq0 if NORM = '1', 'O' or 'o', * Mp0 if NORM = 'I' or 'i', * 0 if NORM = 'F', 'f', 'E' or 'e' (not referenced), * where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL UDIAG INTEGER IACOL, IAROW, ICTXT, II, IIA, ICOFF, IOFFA, $ IROFF, J, JB, JJ, JJA, JN, KK, LDA, LL, MP, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ DOUBLE PRECISION SCALE, SUM, VALUE * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOMBSSQ, DGEBR2D, $ DGEBS2D, DGAMX2D, DGSUM2D, INFOG2L, $ PDTREECOMB, ZLASSQ * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, IDAMAX, NUMROC EXTERNAL LSAME, ICEIL, IDAMAX, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * UDIAG = LSAME( DIAG, 'U' ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) IOFFA = ( JJA - 1 ) * LDA * IF( MIN( M, N ).EQ.0 ) THEN * VALUE = ZERO * ELSE IF( LSAME( NORM, 'M' ) ) THEN * * Find max(abs(A(i,j))). * IF( UDIAG ) THEN VALUE = ONE ELSE VALUE = ZERO END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 20 LL = JJ, JJ + JB -1 DO 10 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE ELSE DO 40 LL = JJ, JJ + JB -1 DO 30 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE END IF ELSE DO 60 LL = JJ, JJ + JB -1 DO 50 KK = IIA, MIN( II-1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 80 LL = JJ, JJ + JB -1 DO 70 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 70 CONTINUE IOFFA = IOFFA + LDA 80 CONTINUE ELSE DO 100 LL = JJ, JJ + JB -1 DO 90 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 90 CONTINUE IOFFA = IOFFA + LDA 100 CONTINUE END IF ELSE DO 120 LL = JJ, JJ + JB -1 DO 110 KK = IIA, MIN( II-1, IIA+MP-1 ) VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 110 CONTINUE IOFFA = IOFFA + LDA 120 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 130 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 150 LL = JJ, JJ + JB -1 DO 140 KK = II+LL-JJ+1, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 140 CONTINUE IOFFA = IOFFA + LDA 150 CONTINUE ELSE DO 170 LL = JJ, JJ + JB -1 DO 160 KK = II+LL-JJ, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 160 CONTINUE IOFFA = IOFFA + LDA 170 CONTINUE END IF ELSE DO 190 LL = JJ, JJ + JB -1 DO 180 KK = II, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 180 CONTINUE IOFFA = IOFFA + LDA 190 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 260 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 210 LL = JJ, JJ + JB -1 DO 200 KK = II+LL-JJ+1, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 200 CONTINUE IOFFA = IOFFA + LDA 210 CONTINUE ELSE DO 230 LL = JJ, JJ + JB -1 DO 220 KK = II+LL-JJ, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 220 CONTINUE IOFFA = IOFFA + LDA 230 CONTINUE END IF ELSE DO 250 LL = JJ, JJ + JB -1 DO 240 KK = II, IIA+MP-1 VALUE = MAX( VALUE, ABS( A( IOFFA+KK ) ) ) 240 CONTINUE IOFFA = IOFFA + LDA 250 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 260 CONTINUE * END IF * * Gather the intermediate results to process (0,0). * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, KK, LL, -1, $ 0, 0 ) * ELSE IF( LSAME( NORM, 'O' ) .OR. NORM.EQ.'1' ) THEN * VALUE = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 280 LL = JJ, JJ + JB -1 SUM = ONE DO 270 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 270 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 280 CONTINUE ELSE DO 300 LL = JJ, JJ + JB -1 SUM = ZERO DO 290 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 290 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 300 CONTINUE END IF ELSE DO 320 LL = JJ, JJ + JB -1 SUM = ZERO DO 310 KK = IIA, MIN( II-1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 310 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 320 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 390 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 340 LL = JJ, JJ + JB -1 SUM = ONE DO 330 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 330 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 340 CONTINUE ELSE DO 360 LL = JJ, JJ + JB -1 SUM = ZERO DO 350 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 350 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 360 CONTINUE END IF ELSE DO 380 LL = JJ, JJ + JB -1 SUM = ZERO DO 370 KK = IIA, MIN( II-1, IIA+MP-1 ) SUM = SUM + ABS( A( IOFFA+KK ) ) 370 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 380 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 390 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 410 LL = JJ, JJ + JB -1 SUM = ONE DO 400 KK = II+LL-JJ+1, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 400 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 410 CONTINUE ELSE DO 430 LL = JJ, JJ + JB -1 SUM = ZERO DO 420 KK = II+LL-JJ, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 420 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 430 CONTINUE END IF ELSE DO 450 LL = JJ, JJ + JB -1 SUM = ZERO DO 440 KK = II, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 440 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 450 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 520 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 470 LL = JJ, JJ + JB -1 SUM = ONE DO 460 KK = II+LL-JJ+1, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 460 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 470 CONTINUE ELSE DO 490 LL = JJ, JJ + JB -1 SUM = ZERO DO 480 KK = II+LL-JJ, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 480 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 490 CONTINUE END IF ELSE DO 510 LL = JJ, JJ + JB -1 SUM = ZERO DO 500 KK = II, IIA+MP-1 SUM = SUM + ABS( A( IOFFA+KK ) ) 500 CONTINUE IOFFA = IOFFA + LDA WORK( LL-JJA+1 ) = SUM 510 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 520 CONTINUE * END IF * * Find sum of global matrix columns and store on row 0 of * process grid * CALL DGSUM2D( ICTXT, 'Columnwise', ' ', 1, NQ, WORK, 1, $ 0, MYCOL ) * * Find maximum sum of columns for 1-norm * IF( MYROW.EQ.0 ) THEN IF( NQ.GT.0 ) THEN VALUE = WORK( IDAMAX( NQ, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Rowwise', ' ', 1, 1, VALUE, 1, KK, LL, $ -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'I' ) ) THEN * IF( LSAME( UPLO, 'U' ) ) THEN IF( UDIAG ) THEN DO 530 KK = IIA, IIA+MP-1 WORK( KK ) = ONE 530 CONTINUE ELSE DO 540 KK = IIA, IIA+MP-1 WORK( KK ) = ZERO 540 CONTINUE END IF ELSE IF( UDIAG ) THEN NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF DO 550 KK = IIA, IIA+NP-1 WORK( KK ) = ONE 550 CONTINUE DO 560 KK = IIA+NP, IIA+MP-1 WORK( KK ) = ZERO 560 CONTINUE ELSE DO 570 KK = IIA, IIA+MP-1 WORK( KK ) = ZERO 570 CONTINUE END IF END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 590 LL = JJ, JJ + JB -1 DO 580 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 580 CONTINUE IOFFA = IOFFA + LDA 590 CONTINUE ELSE DO 610 LL = JJ, JJ + JB -1 DO 600 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 600 CONTINUE IOFFA = IOFFA + LDA 610 CONTINUE END IF ELSE DO 630 LL = JJ, JJ + JB -1 DO 620 KK = IIA, MIN( II-1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 620 CONTINUE IOFFA = IOFFA + LDA 630 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 700 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 650 LL = JJ, JJ + JB -1 DO 640 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 640 CONTINUE IOFFA = IOFFA + LDA 650 CONTINUE ELSE DO 670 LL = JJ, JJ + JB -1 DO 660 KK = IIA, MIN( II+LL-JJ, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 660 CONTINUE IOFFA = IOFFA + LDA 670 CONTINUE END IF ELSE DO 690 LL = JJ, JJ + JB -1 DO 680 KK = IIA, MIN( II-1, IIA+MP-1 ) WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 680 CONTINUE IOFFA = IOFFA + LDA 690 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 700 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 720 LL = JJ, JJ + JB -1 DO 710 KK = II+LL-JJ+1, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 710 CONTINUE IOFFA = IOFFA + LDA 720 CONTINUE ELSE DO 740 LL = JJ, JJ + JB -1 DO 730 KK = II+LL-JJ, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 730 CONTINUE IOFFA = IOFFA + LDA 740 CONTINUE END IF ELSE DO 760 LL = JJ, JJ + JB -1 DO 750 KK = II, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 750 CONTINUE IOFFA = IOFFA + LDA 760 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 830 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 780 LL = JJ, JJ + JB -1 DO 770 KK = II+LL-JJ+1, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 770 CONTINUE IOFFA = IOFFA + LDA 780 CONTINUE ELSE DO 800 LL = JJ, JJ + JB -1 DO 790 KK = II+LL-JJ, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 790 CONTINUE IOFFA = IOFFA + LDA 800 CONTINUE END IF ELSE DO 820 LL = JJ, JJ + JB -1 DO 810 KK = II, IIA+MP-1 WORK( KK-IIA+1 ) = WORK( KK-IIA+1 ) + $ ABS( A( IOFFA+KK ) ) 810 CONTINUE IOFFA = IOFFA + LDA 820 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 830 CONTINUE * END IF * * Find sum of global matrix rows and store on column 0 of * process grid * CALL DGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, WORK, MAX( 1, MP ), $ MYROW, 0 ) * * Find maximum sum of rows for Infinity-norm * IF( MYCOL.EQ.0 ) THEN IF( MP.GT.0 ) THEN VALUE = WORK( IDAMAX( MP, WORK, 1 ) ) ELSE VALUE = ZERO END IF CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, VALUE, 1, KK, $ LL, -1, 0, 0 ) END IF * ELSE IF( LSAME( NORM, 'F' ) .OR. LSAME( NORM, 'E' ) ) THEN * IF( UDIAG ) THEN SCALE = ONE SUM = DBLE( MIN( M, N ) ) / DBLE( NPROW*NPCOL ) ELSE SCALE = ZERO SUM = ONE END IF * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 840 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II+LL-JJ, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 840 CONTINUE ELSE DO 850 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II+LL-JJ+1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 850 CONTINUE END IF ELSE DO 860 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II-1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 860 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 900 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 870 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II+LL-JJ+1, IIA+MP-1 )- $ IIA+1, A( IIA+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 870 CONTINUE ELSE DO 880 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II+LL-JJ, IIA+MP-1 )- $ IIA+1, A( IIA+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 880 CONTINUE END IF ELSE DO 890 LL = JJ, JJ + JB -1 CALL ZLASSQ( MIN( II-1, IIA+MP-1 )-IIA+1, $ A( IIA+IOFFA ), 1, SCALE, SUM ) IOFFA = IOFFA + LDA 890 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 900 CONTINUE * ELSE * * Lower triangular matrix * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 910 LL = JJ, JJ + JB -1 CALL ZLASSQ( IIA+MP-(II+LL-JJ+1), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 910 CONTINUE ELSE DO 920 LL = JJ, JJ + JB -1 CALL ZLASSQ( IIA+MP-(II+LL-JJ), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 920 CONTINUE END IF ELSE DO 930 LL = JJ, JJ + JB -1 CALL ZLASSQ( IIA+MP-II, A( II+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 930 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 970 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN IF( UDIAG ) THEN DO 940 LL = JJ, JJ + JB -1 CALL ZLASSQ( IIA+MP-(II+LL-JJ+1), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 940 CONTINUE ELSE DO 950 LL = JJ, JJ + JB -1 CALL ZLASSQ( IIA+MP-(II+LL-JJ), $ A( II+LL-JJ+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 950 CONTINUE END IF ELSE DO 960 LL = JJ, JJ + JB -1 CALL ZLASSQ( IIA+MP-II, A( II+IOFFA ), 1, SCALE, $ SUM ) IOFFA = IOFFA + LDA 960 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 970 CONTINUE * END IF * * Perform the global scaled sum * RWORK( 1 ) = SCALE RWORK( 2 ) = SUM CALL PDTREECOMB( ICTXT, 'All', 2, RWORK, 0, 0, DCOMBSSQ ) VALUE = RWORK( 1 ) * SQRT( RWORK( 2 ) ) * END IF * * Broadcast the result to every process in the grid. * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1 ) ELSE CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, VALUE, 1, 0, 0 ) END IF * PZLANTR = VALUE * RETURN * * End of PZLANTR * END scalapack-2.0.2/SRC/pzlapiv.f000644 000766 000024 00000033647 10363532303 016207 0ustar00juliestaff000000 000000 SUBROUTINE PZLAPIV( DIREC, ROWCOL, PIVROC, M, N, A, IA, JA, $ DESCA, IPIV, IP, JP, DESCIP, IWORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER*1 DIREC, PIVROC, ROWCOL INTEGER IA, IP, JA, JP, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCIP( * ), IPIV( * ), IWORK( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLAPIV applies either P (permutation matrix indicated by IPIV) * or inv( P ) to a general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1), resulting in row or column * pivoting. The pivot vector may be distributed across a process row * or a column. The pivot vector should be aligned with the distributed * matrix A. This routine will transpose the pivot vector if necessary. * For example if the row pivots should be applied to the columns of * sub( A ), pass ROWCOL='C' and PIVROC='C'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Restrictions * ============ * * IPIV must always be a distributed vector (not a matrix). Thus: * IF( ROWPIV .EQ. 'C' ) THEN * JP must be 1 * ELSE * IP must be 1 * END IF * * The following restrictions apply when IPIV must be transposed: * IF( ROWPIV.EQ.'C' .AND. PIVROC.EQ.'C') THEN * DESCIP(MB_) must equal DESCA(NB_) * ELSE IF( ROWPIV.EQ.'R" .AND. PIVROC.EQ.'R') THEN * DESCIP(NB_) must equal DESCA(MB_) * END IF * * Arguments * ========= * * DIREC (global input) CHARACTER*1 * Specifies in which order the permutation is applied: * = 'F' (Forward) Applies pivots Forward from top of matrix. * Computes P*sub( A ). * = 'B' (Backward) Applies pivots Backward from bottom of * matrix. Computes inv( P )*sub( A ). * * ROWCOL (global input) CHARACTER*1 * Specifies if the rows or columns are to be permuted: * = 'R' Rows will be permuted, * = 'C' Columns will be permuted. * * PIVROC (global input) CHARACTER*1 * Specifies whether IPIV is distributed over a process row * or column: * = 'R' IPIV distributed over a process row * = 'C' IPIV distributed over a process column * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of * rows of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * distributed submatrix sub( A ) to which the row or column * interchanges will be applied. On exit, the local pieces * of the permuted distributed submatrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (local input) INTEGER array, dimension (LIPIV) where LIPIV is * when ROWCOL='R' or 'r': * >= LOCr( IA+M-1 ) + MB_A if PIVROC='C' or 'c', * >= LOCc( M + MOD(JP-1,NB_P) ) if PIVROC='R' or 'r', and, * when ROWCOL='C' or 'c': * >= LOCr( N + MOD(IP-1,MB_P) ) if PIVROC='C' or 'c', * >= LOCc( JA+N-1 ) + NB_A if PIVROC='R' or 'r'. * This array contains the pivoting information. IPIV(i) is the * global row (column), local row (column) i was swapped with. * When ROWCOL='R' or 'r' and PIVROC='C' or 'c', or ROWCOL='C' * or 'c' and PIVROC='R' or 'r', the last piece of this array of * size MB_A (resp. NB_A) is used as workspace. In those cases, * this array is tied to the distributed matrix A. * * IP (global input) INTEGER * The row index in the global array P indicating the first * row of sub( P ). * * JP (global input) INTEGER * The column index in the global array P indicating the * first column of sub( P ). * * DESCIP (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed vector IPIV. * * IWORK (local workspace) INTEGER array, dimension (LDW) * where LDW is equal to the workspace necessary for * transposition, and the storage of the tranposed IPIV: * * Let LCM be the least common multiple of NPROW and NPCOL. * IF( ROWCOL.EQ.'R' .AND. PIVROC.EQ.'R' ) THEN * IF( NPROW.EQ.NPCOL ) THEN * LDW = LOCr( N_P + MOD(JP-1, NB_P) ) + NB_P * ELSE * LDW = LOCr( N_P + MOD(JP-1, NB_P) ) + * NB_P * CEIL( CEIL(LOCc(N_P)/NB_P) / (LCM/NPCOL) ) * END IF * ELSE IF( ROWCOL.EQ.'C' .AND. PIVROC.EQ.'C' ) THEN * IF( NPROW.EQ.NPCOL ) THEN * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + MB_P * ELSE * LDW = LOCc( M_P + MOD(IP-1, MB_P) ) + * MB_P * CEIL( CEIL(LOCr(M_P)/MB_P) / (LCM/NPROW) ) * END IF * ELSE * IWORK is not referenced. * END IF * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL ROWPVT INTEGER I, ICTXT, ICURCOL, ICURROW, IIP, ITMP, IPT, $ JJP, JPT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER DESCPT( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, $ INFOG2L, PICOL2ROW, PIROW2COL, PZLAPV2 * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC, INDXG2P EXTERNAL LSAME, NUMROC, INDXG2P * .. * .. Intrinsic Functions .. INTRINSIC MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) ROWPVT = LSAME( ROWCOL, 'R' ) * * If we're pivoting the rows of sub( A ) * IF( ROWPVT ) THEN IF( M.LE.1 .OR. N.LT.1 ) $ RETURN * * If the pivot vector is already distributed correctly * IF( LSAME( PIVROC, 'C' ) ) THEN CALL PZLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * Otherwise, we must redistribute IPIV to match PZLAPV2 * ELSE * * Take IPIV distributed over row 0, and store it in * iwork, distributed over column 0 * IPT = MOD( JP-1, DESCA(MB_) ) DESCPT(M_) = M + IPT + NPROW*DESCA(MB_) DESCPT(N_) = 1 DESCPT(MB_) = DESCA(MB_) DESCPT(NB_) = 1 DESCPT(RSRC_) = INDXG2P( IA, DESCA(MB_), IA, DESCA(RSRC_), $ NPROW ) DESCPT(CSRC_) = MYCOL DESCPT(CTXT_) = ICTXT DESCPT(LLD_) = NUMROC( DESCPT(M_), DESCPT(MB_), MYROW, $ DESCPT(RSRC_), NPROW ) ITMP = NUMROC( DESCIP(N_), DESCIP(NB_), MYCOL, $ DESCIP(CSRC_), NPCOL ) CALL INFOG2L( IP, JP-IPT, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) CALL PIROW2COL( ICTXT, M+IPT, 1, DESCIP(NB_), IPIV(JJP), $ ITMP, IWORK, DESCPT(LLD_), 0, ICURCOL, $ DESCPT(RSRC_), $ MYCOL, IWORK(DESCPT(LLD_)-DESCPT(MB_)+1) ) * * Send column-distributed pivots to all columns * ITMP = DESCPT(LLD_) - DESCPT(MB_) IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'Row', ' ', ITMP, 1, IWORK, ITMP ) ELSE CALL IGEBR2D( ICTXT, 'Row', ' ', ITMP, 1, IWORK, ITMP, $ MYROW, 0 ) END IF * * Adjust pivots so they are relative to the start of IWORK, * not IPIV * IPT = IPT + 1 DO 10 I = 1, ITMP IWORK(I) = IWORK(I) - JP + IPT 10 CONTINUE CALL PZLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IWORK, $ IPT, 1, DESCPT ) END IF * * Otherwise, we're pivoting the columns of sub( A ) * ELSE IF( M.LT.1 .OR. N.LE.1 ) $ RETURN * * If the pivot vector is already distributed correctly * IF( LSAME( PIVROC, 'R' ) ) THEN CALL PZLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * Otherwise, we must redistribute IPIV to match PZLAPV2 * ELSE * * Take IPIV distributed over column 0, and store it in * iwork, distributed over row 0 * JPT = MOD( IP-1, DESCA(NB_) ) DESCPT(M_) = 1 DESCPT(N_) = N + JPT + NPCOL*DESCA(NB_) DESCPT(MB_) = 1 DESCPT(NB_) = DESCA(NB_) DESCPT(RSRC_) = MYROW DESCPT(CSRC_) = INDXG2P( JA, DESCA(NB_), JA, DESCA(CSRC_), $ NPCOL ) DESCPT(CTXT_) = ICTXT DESCPT(LLD_) = 1 CALL INFOG2L( IP-JPT, JP, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) ITMP = NUMROC( N+JPT, DESCPT(NB_), MYCOL, DESCPT(CSRC_), $ NPCOL ) CALL PICOL2ROW( ICTXT, N+JPT, 1, DESCIP(MB_), IPIV(IIP), $ DESCIP(LLD_), IWORK, MAX(1, ITMP), ICURROW, $ 0, 0, DESCPT(CSRC_), IWORK(ITMP+1) ) * * Send row-distributed pivots to all rows * IF( MYROW.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'Column', ' ', ITMP, 1, IWORK, $ ITMP ) ELSE CALL IGEBR2D( ICTXT, 'Column', ' ', ITMP, 1, IWORK, $ ITMP, 0, MYCOL ) END IF * * Adjust pivots so they are relative to the start of IWORK, * not IPIV * JPT = JPT + 1 DO 20 I = 1, ITMP IWORK(I) = IWORK(I) - IP + JPT 20 CONTINUE CALL PZLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IWORK, $ 1, JPT, DESCPT ) END IF END IF * RETURN * * End of PZLAPIV * END scalapack-2.0.2/SRC/pzlapv2.f000644 000766 000024 00000036733 10363532303 016117 0ustar00juliestaff000000 000000 SUBROUTINE PZLAPV2( DIREC, ROWCOL, M, N, A, IA, JA, DESCA, IPIV, $ IP, JP, DESCIP ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIREC, ROWCOL INTEGER IA, IP, JA, JP, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCIP( * ), IPIV( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLAPV2 applies either P (permutation matrix indicated by IPIV) * or inv( P ) to a M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1), resulting in row or column pivoting. The * pivot vector should be aligned with the distributed matrix A. For * pivoting the rows of sub( A ), IPIV should be distributed along a * process column and replicated over all process rows. Similarly, * IPIV should be distributed along a process row and replicated over * all process columns for column pivoting. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIREC (global input) CHARACTER * Specifies in which order the permutation is applied: * = 'F' (Forward) Applies pivots Forward from top of matrix. * Computes P * sub( A ); * = 'B' (Backward) Applies pivots Backward from bottom of * matrix. Computes inv( P ) * sub( A ). * * ROWCOL (global input) CHARACTER * Specifies if the rows or columns are to be permuted: * = 'R' Rows will be permuted, * = 'C' Columns will be permuted. * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this local array contains the local pieces of the * distributed matrix sub( A ) to which the row or columns * interchanges will be applied. On exit, this array contains * the local pieces of the permuted distributed matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IPIV (input) INTEGER array, dimension >= LOCr(M_A)+MB_A if * ROWCOL = 'R', LOCc(N_A)+NB_A otherwise. It contains * the pivoting information. IPIV(i) is the global row (column), * local row (column) i was swapped with. The last piece of the * array of size MB_A (resp. NB_A) is used as workspace. IPIV is * tied to the distributed matrix A. * * IP (global input) INTEGER * IPIV's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JP (global input) INTEGER * IPIV's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCIP (global and local input) INTEGER array of dimension 8 * The array descriptor for the distributed matrix IPIV. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL FORWRD, ROWPVT INTEGER I, IB, ICTXT, ICURCOL, ICURROW, IIP, IP1, ITMP, $ IPVWRK, J, JB, JJP, JP1, K, MA, MBA, MYCOL, $ MYROW, NBA, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBS2D, IGEBR2D, INFOG2L, $ PZSWAP * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * ROWPVT = LSAME( ROWCOL, 'R' ) IF( ROWPVT ) THEN IF( M.LE.1 .OR. N.LT.1 ) $ RETURN ELSE IF( M.LT.1 .OR. N.LE.1 ) $ RETURN END IF FORWRD = LSAME( DIREC, 'F' ) * * * Get grid and matrix parameters * MA = DESCA( M_ ) MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * If I'm applying pivots from beginning to end (e.g., repeating * pivoting done earlier). Thus this section computes P * sub( A ). * IF( FORWRD ) THEN CALL INFOG2L( IP, JP, DESCIP, NPROW, NPCOL, MYROW, MYCOL, $ IIP, JJP, ICURROW, ICURCOL ) * * If I'm pivoting the rows of sub( A ) * IF( ROWPVT ) THEN IPVWRK = NUMROC( DESCIP( M_ ), DESCIP( MB_ ), MYROW, $ DESCIP( RSRC_ ), NPROW ) + 1 - $ DESCIP( MB_ ) * * Loop over rows of sub( A ) * I = IA IB = MIN( M, ICEIL( IA, MBA ) * MBA - IA + 1 ) 10 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process column * IF( MYROW.EQ.ICURROW ) THEN CALL IGEBS2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IIP ), IB ) ITMP = IIP IIP = IIP + IB ELSE ITMP = IPVWRK CALL IGEBR2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( ITMP ), IB, ICURROW, MYCOL ) END IF * * Pivot the block of rows * DO 20 K = I, I+IB-1 IP1 = IPIV( ITMP ) - IP + IA IF( IP1.NE.K ) $ CALL PZSWAP( N, A, K, JA, DESCA, MA, A, IP1, JA, $ DESCA, MA ) ITMP = ITMP + 1 20 CONTINUE * * Go on to next row of processes, increment row counter, * and figure number of rows to pivot next * ICURROW = MOD( ICURROW+1, NPROW ) I = I + IB IB = MIN( MBA, M-I+IA ) IF( IB .GT. 0 ) GOTO 10 * * If I am pivoting the columns of sub( A ) * ELSE IPVWRK = NUMROC( DESCIP( N_ ), DESCIP( NB_ ), MYCOL, $ DESCIP( CSRC_ ), NPCOL ) + 1 - $ DESCIP( NB_ ) * * Loop over columns of sub( A ) * J = JA JB = MIN( N, ICEIL( JA, NBA ) * NBA - JA + 1 ) 30 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process row * IF( MYCOL.EQ.ICURCOL ) THEN CALL IGEBS2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( JJP ), JB ) ITMP = JJP JJP = JJP + JB ELSE ITMP = IPVWRK CALL IGEBR2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( ITMP ), JB, MYROW, ICURCOL ) END IF * * Pivot the block of columns * DO 40 K = J, J+JB-1 JP1 = IPIV( ITMP ) - JP + JA IF( JP1.NE.K ) $ CALL PZSWAP( M, A, IA, K, DESCA, 1, A, IA, JP1, $ DESCA, 1 ) ITMP = ITMP + 1 40 CONTINUE * * Go on to next column of processes, increment column * counter, and figure number of columns to pivot next * ICURCOL = MOD( ICURCOL+1, NPCOL ) J = J + JB JB = MIN( NBA, N-J+JA ) IF( JB .GT. 0 ) GOTO 30 END IF * * If I want to apply pivots in reverse order, i.e. reversing * pivoting done earlier. Thus this section computes * inv( P ) * sub( A ). * ELSE * * If I'm pivoting the rows of sub( A ) * IF( ROWPVT ) THEN CALL INFOG2L( IP+M-1, JP, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) * IPVWRK = NUMROC( DESCIP( M_ ), DESCIP( MB_ ), MYROW, $ DESCIP( RSRC_ ), NPROW ) + 1 - $ DESCIP( MB_ ) * * If I'm not in the current process row, my IIP points out * past end of pivot vector (since I don't own a piece of the * last row). Adjust IIP so it points at last pivot entry. * IF( MYROW.NE.ICURROW ) IIP = IIP - 1 * * Loop over rows in reverse order, starting at last row * I = IA + M - 1 IB = MOD( I, MBA ) IF( IB .EQ. 0 ) IB = MBA IB = MIN( IB, M ) 50 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process column * IF( MYROW.EQ.ICURROW ) THEN ITMP = IIP IIP = IIP - IB CALL IGEBS2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IIP+1 ), IB ) ELSE CALL IGEBR2D( ICTXT, 'Columnwise', ' ', IB, 1, $ IPIV( IPVWRK ), IB, ICURROW, MYCOL ) ITMP = IPVWRK + IB - 1 END IF * * Pivot the block of rows * DO 60 K = I, I-IB+1, -1 IP1 = IPIV( ITMP ) - IP + IA IF( IP1.NE.K ) $ CALL PZSWAP( N, A, K, JA, DESCA, MA, A, IP1, JA, $ DESCA, MA ) ITMP = ITMP - 1 60 CONTINUE * * Go to previous row of processes, decrement row counter, * and figure number of rows to be pivoted next * ICURROW = MOD( NPROW+ICURROW-1, NPROW ) I = I - IB IB = MIN( MBA, I-IA+1 ) IF( IB .GT. 0 ) GOTO 50 * * Otherwise, I'm pivoting the columns of sub( A ) * ELSE CALL INFOG2L( IP, JP+N-1, DESCIP, NPROW, NPCOL, MYROW, $ MYCOL, IIP, JJP, ICURROW, ICURCOL ) IPVWRK = NUMROC( DESCIP( N_ ), DESCIP( NB_ ), MYCOL, $ DESCIP( CSRC_ ), NPCOL ) + 1 - $ DESCIP( NB_ ) * * If I'm not in the current process column, my JJP points out * past end of pivot vector (since I don't own a piece of the * last column). Adjust JJP so it points at last pivot entry. * IF( MYCOL.NE.ICURCOL ) JJP = JJP - 1 * * Loop over columns in reverse order starting at last column * J = JA + N - 1 JB = MOD( J, NBA ) IF( JB .EQ. 0 ) JB = NBA JB = MIN( JB, N ) 70 CONTINUE * * Find local pointer into IPIV, and broadcast this block's * pivot information to everyone in process row * IF( MYCOL.EQ.ICURCOL ) THEN ITMP = JJP JJP = JJP - JB CALL IGEBS2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( JJP+1 ), JB ) ELSE CALL IGEBR2D( ICTXT, 'Rowwise', ' ', JB, 1, $ IPIV( IPVWRK ), JB, MYROW, ICURCOL ) ITMP = IPVWRK + JB - 1 END IF * * Pivot a block of columns * DO 80 K = J, J-JB+1, -1 JP1 = IPIV( ITMP ) - JP + JA IF( JP1.NE.K ) $ CALL PZSWAP( M, A, IA, K, DESCA, 1, A, IA, JP1, $ DESCA, 1 ) ITMP = ITMP - 1 80 CONTINUE * * Go to previous row of processes, decrement row counter, * and figure number of rows to be pivoted next * ICURCOL = MOD( NPCOL+ICURCOL-1, NPCOL ) J = J - JB JB = MIN( NBA, J-JA+1 ) IF( JB .GT. 0 ) GOTO 70 END IF * END IF * RETURN * * End PZLAPV2 * END scalapack-2.0.2/SRC/pzlaqge.f000644 000766 000024 00000023410 10363532303 016150 0ustar00juliestaff000000 000000 SUBROUTINE PZLAQGE( M, N, A, IA, JA, DESCA, R, C, ROWCND, COLCND, $ AMAX, EQUED ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER EQUED INTEGER IA, JA, M, N DOUBLE PRECISION AMAX, COLCND, ROWCND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION C( * ), R( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLAQGE equilibrates a general M-by-N distributed matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) using the row and scaling * factors in the vectors R and C. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)) * containing on entry the M-by-N matrix sub( A ). On exit, * the equilibrated distributed matrix. See EQUED for the * form of the equilibrated distributed submatrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * R (local input) DOUBLE PRECISION array, dimension LOCr(M_A) * The row scale factors for sub( A ). R is aligned with the * distributed matrix A, and replicated across every process * column. R is tied to the distributed matrix A. * * C (local input) DOUBLE PRECISION array, dimension LOCc(N_A) * The column scale factors of sub( A ). C is aligned with the * distributed matrix A, and replicated down every process * row. C is tied to the distributed matrix A. * * ROWCND (global input) DOUBLE PRECISION * The global ratio of the smallest R(i) to the largest R(i), * IA <= i <= IA+M-1. * * COLCND (global input) DOUBLE PRECISION * The global ratio of the smallest C(i) to the largest C(i), * JA <= j <= JA+N-1. * * AMAX (global input) DOUBLE PRECISION * Absolute value of largest distributed submatrix entry. * * EQUED (global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration * = 'R': Row equilibration, i.e., sub( A ) has been pre- * multiplied by diag(R(IA:IA+M-1)), * = 'C': Column equilibration, i.e., sub( A ) has been post- * multiplied by diag(C(JA:JA+N-1)), * = 'B': Both row and column equilibration, i.e., sub( A ) * has been replaced by * diag(R(IA:IA+M-1)) * sub( A ) * diag(C(JA:JA+N-1)). * * Internal Parameters * =================== * * THRESH is a threshold value used to decide if row or column scaling * should be done based on the ratio of the row or column scaling * factors. If ROWCND < THRESH, row scaling is done, and if * COLCND < THRESH, column scaling is done. * * LARGE and SMALL are threshold values used to decide if row scaling * should be done based on the absolute size of the largest matrix * element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, THRESH PARAMETER ( ONE = 1.0D+0, THRESH = 0.1D+0 ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICOFF, ICTXT, IIA, IOFFA, $ IROFF, J, JJA, LDA, MP, MYCOL, MYROW, NPCOL, $ NPROW, NQ DOUBLE PRECISION CJ, LARGE, SMALL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. INTEGER NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) THEN EQUED = 'N' RETURN END IF * * Get grid parameters and compute local indexes * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) MP = NUMROC( M+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF LDA = DESCA( LLD_ ) * * Initialize LARGE and SMALL. * SMALL = PDLAMCH( ICTXT, 'Safe minimum' ) / $ PDLAMCH( ICTXT, 'Precision' ) LARGE = ONE / SMALL * IF( ROWCND.GE.THRESH .AND. AMAX.GE.SMALL .AND. AMAX.LE.LARGE ) $ THEN * * No row scaling * IF( COLCND.GE.THRESH ) THEN * * No column scaling * EQUED = 'N' * ELSE * * Column scaling * IOFFA = (JJA-1)*LDA DO 20 J = JJA, JJA+NQ-1 CJ = C( J ) DO 10 I = IIA, IIA+MP-1 A( IOFFA + I ) = CJ*A( IOFFA + I ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE EQUED = 'C' END IF * ELSE IF( COLCND.GE.THRESH ) THEN * * Row scaling, no column scaling * IOFFA = (JJA-1)*LDA DO 40 J = JJA, JJA+NQ-1 DO 30 I = IIA, IIA+MP-1 A( IOFFA + I ) = R( I )*A( IOFFA + I ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE EQUED = 'R' * ELSE * * Row and column scaling * IOFFA = (JJA-1)*LDA DO 60 J = JJA, JJA+NQ-1 CJ = C( J ) DO 50 I = IIA, IIA+MP-1 A( IOFFA + I ) = CJ*R( I )*A( IOFFA + I ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE EQUED = 'B' * END IF * RETURN * * End of PZLAQGE * END scalapack-2.0.2/SRC/pzlaqsy.f000644 000766 000024 00000032121 10363532303 016207 0ustar00juliestaff000000 000000 SUBROUTINE PZLAQSY( UPLO, N, A, IA, JA, DESCA, SR, SC, SCOND, $ AMAX, EQUED ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER EQUED, UPLO INTEGER IA, JA, N DOUBLE PRECISION AMAX, SCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION SC( * ), SR( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLAQSY equilibrates a symmetric distributed matrix * sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the scaling factors in the * vectors SR and SC. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * symmetric distributed matrix sub( A ) is to be referenced: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (input/output) COMPLEX*16 pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1)). * On entry, the local pieces of the distributed symmetric * matrix sub( A ). If UPLO = 'U', the leading N-by-N upper * triangular part of sub( A ) contains the upper triangular * part of the matrix, and the strictly lower triangular part * of sub( A ) is not referenced. If UPLO = 'L', the leading * N-by-N lower triangular part of sub( A ) contains the lower * triangular part of the matrix, and the strictly upper trian- * gular part of sub( A ) is not referenced. * On exit, if EQUED = 'Y', the equilibrated matrix: * diag(SR(IA:IA+N-1)) * sub( A ) * diag(SC(JA:JA+N-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * SR (local input) DOUBLE PRECISION array, dimension LOCr(M_A) * The scale factors for A(IA:IA+M-1,JA:JA+N-1). SR is aligned * with the distributed matrix A, and replicated across every * process column. SR is tied to the distributed matrix A. * * SC (local input) DOUBLE PRECISION array, dimension LOCc(N_A) * The scale factors for sub( A ). SC is aligned with the dis- * tributed matrix A, and replicated down every process row. * SC is tied to the distributed matrix A. * * SCOND (global input) DOUBLE PRECISION * Ratio of the smallest SR(i) (respectively SC(j)) to the * largest SR(i) (respectively SC(j)), with IA <= i <= IA+N-1 * and JA <= j <= JA+N-1. * * AMAX (global input) DOUBLE PRECISION * Absolute value of the largest distributed submatrix entry. * * EQUED (output) CHARACTER*1 * Specifies whether or not equilibration was done. * = 'N': No equilibration. * = 'Y': Equilibration was done, i.e., sub( A ) has been re- * placed by: * diag(SR(IA:IA+N-1)) * sub( A ) * diag(SC(JA:JA+N-1)). * * Internal Parameters * =================== * * THRESH is a threshold value used to decide if scaling should be done * based on the ratio of the scaling factors. If SCOND < THRESH, * scaling is done. * * LARGE and SMALL are threshold values used to decide if scaling should * be done based on the absolute size of the largest matrix element. * If AMAX > LARGE or AMAX < SMALL, scaling is done. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, THRESH PARAMETER ( ONE = 1.0D+0, THRESH = 0.1D+0 ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICTXT, II, IIA, IOFFA, IROFF, J, $ JB, JJ, JJA, JN, KK, LDA, LL, MYCOL, MYROW, NP, $ NPCOL, NPROW DOUBLE PRECISION CJ, LARGE, SMALL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, LSAME, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) THEN EQUED = 'N' RETURN END IF * * Get grid parameters and compute local indexes * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) LDA = DESCA( LLD_ ) * * Initialize LARGE and SMALL. * SMALL = PDLAMCH( ICTXT, 'Safe minimum' ) / $ PDLAMCH( ICTXT, 'Precision' ) LARGE = ONE / SMALL * IF( SCOND.GE.THRESH .AND. AMAX.GE.SMALL .AND. AMAX.LE.LARGE ) THEN * * No equilibration * EQUED = 'N' * ELSE * II = IIA JJ = JJA JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 * * Replace A by diag(S) * A * diag(S). * IF( LSAME( UPLO, 'U' ) ) THEN * * Upper triangle of A(IA:IA+N-1,JA:JA+N-1) is stored. * Handle first block separately * IOFFA = (JJ-1)*LDA IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 20 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 10 KK = IIA, II+LL-JJ+1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 10 CONTINUE IOFFA = IOFFA + LDA 20 CONTINUE ELSE IOFFA = IOFFA + JB*LDA END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 70 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 40 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 30 KK = IIA, II+LL-JJ+1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 30 CONTINUE IOFFA = IOFFA + LDA 40 CONTINUE ELSE DO 60 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 50 KK = IIA, II-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 50 CONTINUE IOFFA = IOFFA + LDA 60 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 70 CONTINUE * ELSE * * Lower triangle of A(IA:IA+N-1,JA:JA+N-1) is stored. * Handle first block separately * IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFF * IOFFA = (JJ-1)*LDA IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 90 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 80 KK = II+LL-JJ, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE ELSE DO 110 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 100 KK = II, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining block of columns * DO 160 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN DO 130 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 120 KK = II+LL-JJ, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 120 CONTINUE IOFFA = IOFFA + LDA 130 CONTINUE ELSE DO 150 LL = JJ, JJ + JB -1 CJ = SC( LL ) DO 140 KK = II, IIA+NP-1 A( IOFFA + KK ) = CJ*SR( KK )*A( IOFFA + KK ) 140 CONTINUE IOFFA = IOFFA + LDA 150 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.IAROW ) $ II = II + JB IAROW = MOD( IAROW+1, NPROW ) IACOL = MOD( IACOL+1, NPCOL ) * 160 CONTINUE * END IF * EQUED = 'Y' * END IF * RETURN * * End of PZLAQSY * END scalapack-2.0.2/SRC/pzlarf.f000644 000766 000024 00000070150 10363532303 016006 0ustar00juliestaff000000 000000 SUBROUTINE PZLARF( SIDE, M, N, V, IV, JV, DESCV, INCV, TAU, $ C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX*16 C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PZLARF applies a complex elementary reflector Q to a complex M-by-N * distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), from either the * left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a complex scalar and v is a complex vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also have the first row of sub( C ). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also have the first column of sub( C ) and * MOD(JV-1,NB_V) must be equal to MOD(JC-1,NB_C), if INCV = 1 only the * last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q * sub( C ), * = 'R': form sub( C ) * Q. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * V (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+M-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+M-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+N-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+N-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q * sub( C ) if SIDE = 'L', or * sub( C ) * Q if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL, ICOFF, ICROW, ICTXT, IIC, IIV, IOFFC, $ IOFFV, IPW, IROFF, IVCOL, IVROW, JJC, JJV, LDC, $ LDV, MYCOL, MYROW, MP, NCC, NCV, NPCOL, NPROW, $ NQ, RDEST COMPLEX*16 TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PB_TOPGET, PBZTRNV, $ ZCOPY, ZGEBR2D, ZGEBS2D, ZGEMV, $ ZGERC, ZGERV2D, ZGESD2D, ZGSUM2D, $ ZLASET * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) JJC = MIN( JJC, NCC ) JJV = MIN( JJV, NCV ) IOFFC = IIC+(JJC-1)*LDC IOFFV = IIV+(JJV-1)*LDV * IROFF = MOD( IC-1, DESCC( MB_ ) ) ICOFF = MOD( JC-1, DESCC( NB_ ) ) MP = NUMROC( M+IROFF, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYROW.EQ.ICROW ) $ MP = MP - IROFF IF( MYCOL.EQ.ICCOL ) $ NQ = NQ - ICOFF * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFF) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFF) ) * IF( LSAME( SIDE, 'L' ) ) THEN * IF( CRBLCK ) THEN RDEST = ICROW ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V * IPW = MP+1 CALL PBZTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * TAULOC = TAU( JJV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MP, NQ, $ ONE, C( IOFFC ), LDC, V( IOFFV ), 1, $ ZERO, WORK, 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK, MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK, MAX( 1, NQ ), RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL ZGERC( MP, NQ, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL ZCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL ZGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL ) * ELSE IF( MYCOL.EQ.ICCOL ) THEN * IPW = MP+1 CALL ZGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MP, NQ, $ ONE, C( IOFFC ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V * IPW = MP+1 CALL PBZTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN IF( IOFFC.GT.0 ) $ CALL ZGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( IOFFC.GT.0 ) $ CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL ZCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK(IPW) = TAU( JJV ) CALL ZGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( JJV ) * ELSE * IPW = MP+1 CALL ZGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN IF( IOFFC.GT.0 ) $ CALL ZGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( IOFFC.GT.0 ) $ CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW ) THEN * TAULOC = TAU( IIV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, V( IOFFV ), LDV, $ ZERO, WORK, 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, $ WORK, MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK, MAX( 1, MP ), RDEST, ICCOL ) * * sub( C ) := sub( C ) - w * v' * IF( IOFFV.GT.0 .AND. IOFFC.GT.0 ) $ CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC ), $ LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW * IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 CALL ZCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL ZGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW ) THEN * IPW = NQ+1 CALL ZGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V * IPW = NQ+1 CALL PBZTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = TAU( JJV ) * ELSE * CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 IF( IOFFV.GT.0 ) $ CALL ZCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL ZGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( IIV ) * ELSE * IPW = NQ+1 CALL ZGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * IF( IOFFC.GT.0 ) $ CALL ZGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V * IPW = NQ+1 CALL PBZTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = TAU( JJV ) * ELSE * CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, WORK( IPW ), $ MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, 1, $ C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PZLARF * END scalapack-2.0.2/SRC/pzlarfb.f000644 000766 000024 00000104140 11750130340 016141 0ustar00juliestaff000000 000000 SUBROUTINE PZLARFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, V, IV, $ JV, DESCV, T, C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, DIRECT, STOREV INTEGER IC, IV, JC, JV, K, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX*16 C( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PZLARFB applies a complex block reflector Q or its conjugate * transpose Q**H to a complex M-by-N distributed matrix sub( C ) * denoting C(IC:IC+M-1,JC:JC+N-1), from the left or the right. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * DIRECT (global input) CHARACTER * Indicates how Q is formed from a product of elementary * reflectors * = 'F': Q = H(1) H(2) . . . H(k) (Forward) * = 'B': Q = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Indicates how the vectors which define the elementary * reflectors are stored: * = 'C': Columnwise * = 'R': Rowwise * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The order of the matrix T (= the number of elementary * reflectors whose product defines the block reflector). * * V (local input) COMPLEX*16 pointer into the local memory * to an array of dimension ( LLD_V, LOCc(JV+K-1) ) if * STOREV = 'C', ( LLD_V, LOCc(JV+M-1)) if STOREV = 'R' and * SIDE = 'L', ( LLD_V, LOCc(JV+N-1) ) if STOREV = 'R' and * SIDE = 'R'. It contains the local pieces of the distributed * vectors V representing the Householder transformation. * See further details. * If STOREV = 'C' and SIDE = 'L', LLD_V >= MAX(1,LOCr(IV+M-1)); * if STOREV = 'C' and SIDE = 'R', LLD_V >= MAX(1,LOCr(IV+N-1)); * if STOREV = 'R', LLD_V >= LOCr(IV+K-1). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * T (local input) COMPLEX*16 array, dimension MB_V by MB_V * if STOREV = 'R' and NB_V by NB_V if STOREV = 'C'. The trian- * gular matrix T in the representation of the block reflector. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the M-by-N distributed matrix sub( C ). On exit, * sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) or * sub( C )*Q or sub( C )*Q'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * If STOREV = 'C', * if SIDE = 'L', * LWORK >= ( NqC0 + MpC0 ) * K * else if SIDE = 'R', * LWORK >= ( NqC0 + MAX( NpV0 + NUMROC( NUMROC( N+ICOFFC, * NB_V, 0, 0, NPCOL ), NB_V, 0, 0, LCMQ ), * MpC0 ) ) * K * end if * else if STOREV = 'R', * if SIDE = 'L', * LWORK >= ( MpC0 + MAX( MqV0 + NUMROC( NUMROC( M+IROFFC, * MB_V, 0, 0, NPROW ), MB_V, 0, 0, LCMP ), * NqC0 ) ) * K * else if SIDE = 'R', * LWORK >= ( MpC0 + NqC0 ) * K * end if * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFV = MOD( IV-1, MB_V ), ICOFFV = MOD( JV-1, NB_V ), * IVROW = INDXG2P( IV, MB_V, MYROW, RSRC_V, NPROW ), * IVCOL = INDXG2P( JV, NB_V, MYCOL, CSRC_V, NPCOL ), * MqV0 = NUMROC( M+ICOFFV, NB_V, MYCOL, IVCOL, NPCOL ), * NpV0 = NUMROC( N+IROFFV, MB_V, MYROW, IVROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NpC0 = NUMROC( N+ICOFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If STOREV = 'Columnwise' * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if STOREV = 'Rowwise' * If SIDE = 'Left', * ( NB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL FORWARD CHARACTER COLBTOP, ROWBTOP, TRANST, UPLO INTEGER HEIGHT, IBASE, ICCOL, ICOFFC, ICOFFV, ICROW, $ ICTXT, II, IIBEG, IIC, IIEND, IINXT, IIV, $ ILASTCOL, ILASTROW, ILEFT, IOFF, IOFFC, IOFFV, $ IPT, IPV, IPW, IPW1, IRIGHT, IROFFC, IROFFV, $ ITOP, IVCOL, IVROW, JJ, JJBEG, JJC, JJEND, $ JJNXT, JJV, KP, KQ, LDC, LDV, LV, LW, MBV, MPC, $ MPC0, MQV, MQV0, MYCOL, MYDIST, MYROW, NBV, $ NPV, NPV0, NPCOL, NPROW, NQC, NQC0, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG1L, INFOG2L, PB_TOPGET, $ PBZTRAN, ZGEBR2D, ZGEBS2D, ZGEMM, $ ZGSUM2D, ZLAMOV, ZLASET, ZTRBR2D, $ ZTRBS2D, ZTRMM * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 .OR. K.LE.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( TRANS, 'N' ) ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF FORWARD = LSAME( DIRECT, 'F' ) IF( FORWARD ) THEN UPLO = 'U' ELSE UPLO = 'L' END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) MBV = DESCV( MB_ ) NBV = DESCV( NB_ ) IROFFV = MOD( IV-1, MBV ) ICOFFV = MOD( JV-1, NBV ) MPC = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYCOL.EQ.ICCOL ) $ NQC = NQC - ICOFFC IF( MYROW.EQ.ICROW ) $ MPC = MPC - IROFFC JJC = MIN( JJC, MAX( 1, JJC+NQC-1 ) ) JJV = MIN( JJV, MAX( 1, NUMROC( DESCV( N_ ), NBV, MYCOL, $ DESCV( CSRC_ ), NPCOL ) ) ) IOFFC = IIC + ( JJC-1 ) * LDC IOFFV = IIV + ( JJV-1 ) * LDV * IF( LSAME( STOREV, 'C' ) ) THEN * * V is stored columnwise * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * Locally V( IOFFV ) is MPV x K, C( IOFFC ) is MPC x NQC * WORK( IPV ) is MPC x K = V( IOFFV ), MPC = MPV * WORK( IPW ) is NQC x K = C( IOFFC )' * V( IOFFV ) * IPV = 1 IPW = IPV + MPC * K LV = MAX( 1, MPC ) LW = MAX( 1, NQC ) * * Broadcast V to the other process columns. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN CALL ZGEBS2D( ICTXT, 'Rowwise', ROWBTOP, MPC, K, $ V( IOFFV ), LDV ) IF( MYROW.EQ.IVROW ) $ CALL ZTRBS2D( ICTXT, 'Rowwise', ROWBTOP, UPLO, $ 'Non unit', K, K, T, NBV ) CALL ZLAMOV( 'All', MPC, K, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ROWBTOP, MPC, K, $ WORK( IPV ), LV, MYROW, IVCOL ) IF( MYROW.EQ.IVROW ) $ CALL ZTRBR2D( ICTXT, 'Rowwise', ROWBTOP, UPLO, $ 'Non unit', K, K, T, NBV, MYROW, IVCOL ) END IF * IF( FORWARD ) THEN * * WORK(IPV) = ( V1 ) where V1 is unit lower triangular, * ( V2 ) zeroes upper triangular part of V1 * MYDIST = MOD( MYROW-IVROW+NPROW, NPROW ) ITOP = MAX( 0, MYDIST*MBV - IROFFV ) IIBEG = IIV IIEND = IIBEG + MPC - 1 IINXT = MIN( ICEIL( IIBEG, MBV )*MBV, IIEND ) * 10 CONTINUE IF( K-ITOP .GT.0 ) THEN CALL ZLASET( 'Upper', IINXT-IIBEG+1, K-ITOP, ZERO, $ ONE, WORK( IPV+IIBEG-IIV+ITOP*LV ), LV ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IROFFV IIBEG = IINXT + 1 IINXT = MIN( IINXT+MBV, IIEND ) GO TO 10 END IF * ELSE * * WORK(IPV) = ( V1 ) where V2 is unit upper triangular, * ( V2 ) zeroes lower triangular part of V2 * JJ = JJV IOFF = MOD( IV+M-K-1, MBV ) CALL INFOG1L( IV+M-K, MBV, NPROW, MYROW, DESCV( RSRC_ ), $ II, ILASTROW ) KP = NUMROC( K+IOFF, MBV, MYROW, ILASTROW, NPROW ) IF( MYROW.EQ.ILASTROW ) $ KP = KP - IOFF MYDIST = MOD( MYROW-ILASTROW+NPROW, NPROW ) ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP+MBV, K ) ITOP = MIN( MAX( 0, ITOP ), K ) * 20 CONTINUE IF( JJ.LE.( JJV+K-1 ) ) THEN HEIGHT = IBASE - ITOP CALL ZLASET( 'All', KP, ITOP-JJ+JJV, ZERO, ZERO, $ WORK( IPV+II-IIV+(JJ-JJV)*LV ), LV ) CALL ZLASET( 'Lower', KP, HEIGHT, ZERO, ONE, $ WORK( IPV+II-IIV+ITOP*LV ), LV ) KP = MAX( 0, KP - HEIGHT ) II = II + HEIGHT JJ = JJV + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP + MBV, K ) ITOP = MIN( ITOP, K ) GO TO 20 END IF * END IF * * WORK( IPW ) = C( IOFFC )' * V (NQC x MPC x K) -> NQC x K * IF( MPC.GT.0 ) THEN CALL ZGEMM( 'Conjugate transpose', 'No transpose', NQC, $ K, MPC, ONE, C( IOFFC ), LDC, WORK( IPV ), LV, $ ZERO, WORK( IPW ), LW ) ELSE CALL ZLASET( 'All', NQC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * CALL ZTRMM( 'Right', UPLO, TRANST, 'Non unit', NQC, K, $ ONE, T, NBV, WORK( IPW ), LW ) CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW ) ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C C - V * W' * C( IOFFC ) = C( IOFFC ) - WORK( IPV ) * WORK( IPW )' * MPC x NQC MPC x K K x NQC * CALL ZGEMM( 'No transpose', 'Conjugate transpose', MPC, NQC, $ K, -ONE, WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC ), LDC ) * ELSE * * Form sub( C )*Q or sub( C )*Q' * * ICOFFC = IROFFV is required by the current transposition * routine PBZTRAN * NPV0 = NUMROC( N+IROFFV, MBV, MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) THEN NPV = NPV0 - IROFFV ELSE NPV = NPV0 END IF IF( MYCOL.EQ.ICCOL ) THEN NQC0 = NQC + ICOFFC ELSE NQC0 = NQC END IF * * Locally V( IOFFV ) is NPV x K C( IOFFC ) is MPC x NQC * WORK( IPV ) is K x NQC0 = [ . V( IOFFV ) ]' * WORK( IPW ) is NPV0 x K = [ . V( IOFFV )' ]' * WORK( IPT ) is the workspace for PBZTRAN * IPV = 1 IPW = IPV + K * NQC0 IPT = IPW + NPV0 * K LV = MAX( 1, K ) LW = MAX( 1, NPV0 ) * IF( MYCOL.EQ.IVCOL ) THEN IF( MYROW.EQ.IVROW ) THEN CALL ZLASET( 'All', IROFFV, K, ZERO, ZERO, $ WORK( IPW ), LW ) IPW1 = IPW + IROFFV CALL ZLAMOV( 'All', NPV, K, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) ELSE IPW1 = IPW CALL ZLAMOV( 'All', NPV, K, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) END IF * IF( FORWARD ) THEN * * WORK(IPW) = ( . V1' V2' )' where V1 is unit lower * triangular, zeroes upper triangular part of V1 * MYDIST = MOD( MYROW-IVROW+NPROW, NPROW ) ITOP = MAX( 0, MYDIST*MBV - IROFFV ) IIBEG = IIV IIEND = IIBEG + NPV - 1 IINXT = MIN( ICEIL( IIBEG, MBV )*MBV, IIEND ) * 30 CONTINUE IF( ( K-ITOP ).GT.0 ) THEN CALL ZLASET( 'Upper', IINXT-IIBEG+1, K-ITOP, ZERO, $ ONE, WORK( IPW1+IIBEG-IIV+ITOP*LW ), $ LW ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IROFFV IIBEG = IINXT + 1 IINXT = MIN( IINXT+MBV, IIEND ) GO TO 30 END IF * ELSE * * WORK( IPW ) = ( . V1' V2' )' where V2 is unit upper * triangular, zeroes lower triangular part of V2. * JJ = JJV CALL INFOG1L( IV+N-K, MBV, NPROW, MYROW, $ DESCV( RSRC_ ), II, ILASTROW ) IOFF = MOD( IV+N-K-1, MBV ) KP = NUMROC( K+IOFF, MBV, MYROW, ILASTROW, NPROW ) IF( MYROW.EQ.ILASTROW ) $ KP = KP - IOFF MYDIST = MOD( MYROW-ILASTROW+NPROW, NPROW ) ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP+MBV, K ) ITOP = MIN( MAX( 0, ITOP ), K ) * 40 CONTINUE IF( JJ.LE.( JJV+K-1 ) ) THEN HEIGHT = IBASE - ITOP CALL ZLASET( 'All', KP, ITOP-JJ+JJV, ZERO, ZERO, $ WORK( IPW1+II-IIV+(JJ-JJV)*LW ), LW ) CALL ZLASET( 'Lower', KP, HEIGHT, ZERO, ONE, $ WORK( IPW1+II-IIV+ITOP*LW ), LW ) KP = MAX( 0, KP - HEIGHT ) II = II + HEIGHT JJ = JJV + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBV - IOFF IBASE = MIN( ITOP + MBV, K ) ITOP = MIN( ITOP, K ) GO TO 40 END IF END IF END IF * CALL PBZTRAN( ICTXT, 'Columnwise', 'Conjugate transpose', $ N+IROFFV, K, MBV, WORK( IPW ), LW, ZERO, $ WORK( IPV ), LV, IVROW, IVCOL, -1, ICCOL, $ WORK( IPT ) ) * * WORK( IPV ) = ( . V' ) -> WORK( IPV ) = V' is K x NQC * IF( MYCOL.EQ.ICCOL ) $ IPV = IPV + ICOFFC * LV * * WORK( IPW ) becomes MPC x K = C( IOFFC ) * V * WORK( IPW ) = C( IOFFC ) * V (MPC x NQC x K) -> MPC x K * LW = MAX( 1, MPC ) * IF( NQC.GT.0 ) THEN CALL ZGEMM( 'No transpose', 'Conjugate transpose', MPC, $ K, NQC, ONE, C( IOFFC ), LDC, WORK( IPV ), $ LV, ZERO, WORK( IPW ), LW ) ELSE CALL ZLASET( 'All', MPC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN IF( MYROW.EQ.IVROW ) THEN * * Broadcast the block reflector to the other rows. * CALL ZTRBS2D( ICTXT, 'Columnwise', ' ', UPLO, $ 'Non unit', K, K, T, NBV ) ELSE CALL ZTRBR2D( ICTXT, 'Columnwise', ' ', UPLO, $ 'Non unit', K, K, T, NBV, IVROW, MYCOL ) END IF CALL ZTRMM( 'Right', UPLO, TRANS, 'Non unit', MPC, K, $ ONE, T, NBV, WORK( IPW ), LW ) * CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C C - W * V' * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC x NQC MPC x K K x NQC * CALL ZGEMM( 'No transpose', 'No transpose', MPC, NQC, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC ), LDC ) END IF * ELSE * * V is stored rowwise * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * IROFFC = ICOFFV is required by the current transposition * routine PBZTRAN * MQV0 = NUMROC( M+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN MQV = MQV0 - ICOFFV ELSE MQV = MQV0 END IF IF( MYROW.EQ.ICROW ) THEN MPC0 = MPC + IROFFC ELSE MPC0 = MPC END IF * * Locally V( IOFFV ) is K x MQV, C( IOFFC ) is MPC x NQC * WORK( IPV ) is MPC0 x K = [ . V( IOFFV ) ]' * WORK( IPW ) is K x MQV0 = [ . V( IOFFV ) ] * WORK( IPT ) is the workspace for PBZTRAN * IPV = 1 IPW = IPV + MPC0 * K IPT = IPW + K * MQV0 LV = MAX( 1, MPC0 ) LW = MAX( 1, K ) * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN CALL ZLASET( 'All', K, ICOFFV, ZERO, ZERO, $ WORK( IPW ), LW ) IPW1 = IPW + ICOFFV * LW CALL ZLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) ELSE IPW1 = IPW CALL ZLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW1 ), LW ) END IF * IF( FORWARD ) THEN * * WORK( IPW ) = ( . V1 V2 ) where V1 is unit upper * triangular, zeroes lower triangular part of V1 * MYDIST = MOD( MYCOL-IVCOL+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBV - ICOFFV ) JJBEG = JJV JJEND = JJV + MQV - 1 JJNXT = MIN( ICEIL( JJBEG, NBV ) * NBV, JJEND ) * 50 CONTINUE IF( ( K-ILEFT ).GT.0 ) THEN CALL ZLASET( 'Lower', K-ILEFT, JJNXT-JJBEG+1, ZERO, $ ONE, $ WORK( IPW1+ILEFT+(JJBEG-JJV)*LW ), $ LW ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - ICOFFV JJBEG = JJNXT + 1 JJNXT = MIN( JJNXT+NBV, JJEND ) GO TO 50 END IF * ELSE * * WORK( IPW ) = ( . V1 V2 ) where V2 is unit lower * triangular, zeroes upper triangular part of V2. * II = IIV CALL INFOG1L( JV+M-K, NBV, NPCOL, MYCOL, $ DESCV( CSRC_ ), JJ, ILASTCOL ) IOFF = MOD( JV+M-K-1, NBV ) KQ = NUMROC( K+IOFF, NBV, MYCOL, ILASTCOL, NPCOL ) IF( MYCOL.EQ.ILASTCOL ) $ KQ = KQ - IOFF MYDIST = MOD( MYCOL-ILASTCOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT+NBV, K ) ILEFT = MIN( MAX( 0, ILEFT ), K ) * 60 CONTINUE IF( II.LE.( IIV+K-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL ZLASET( 'All', ILEFT-II+IIV, KQ, ZERO, ZERO, $ WORK( IPW1+II-IIV+(JJ-JJV)*LW ), LW ) CALL ZLASET( 'Upper', WIDE, KQ, ZERO, ONE, $ WORK( IPW1+ILEFT+(JJ-JJV)*LW ), LW ) KQ = MAX( 0, KQ - WIDE ) II = IIV + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT + NBV, K ) ILEFT = MIN( ILEFT, K ) GO TO 60 END IF END IF END IF * * WORK( IPV ) = WORK( IPW )' (replicated) is MPC0 x K * CALL PBZTRAN( ICTXT, 'Rowwise', 'Conjugate transpose', K, $ M+ICOFFV, NBV, WORK( IPW ), LW, ZERO, $ WORK( IPV ), LV, IVROW, IVCOL, ICROW, -1, $ WORK( IPT ) ) * * WORK( IPV ) = ( . V )' -> WORK( IPV ) = V' is MPC x K * IF( MYROW.EQ.ICROW ) $ IPV = IPV + IROFFC * * WORK( IPW ) becomes NQC x K = C( IOFFC )' * V' * WORK( IPW ) = C( IOFFC )' * V' (NQC x MPC x K) -> NQC x K * LW = MAX( 1, NQC ) * IF( MPC.GT.0 ) THEN CALL ZGEMM( 'Conjugate transpose', 'No transpose', NQC, $ K, MPC, ONE, C( IOFFC ), LDC, WORK( IPV ), $ LV, ZERO, WORK( IPW ), LW ) ELSE CALL ZLASET( 'All', NQC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN * * Broadcast the block reflector to the other columns. * CALL ZTRBS2D( ICTXT, 'Rowwise', ' ', UPLO, 'Non unit', $ K, K, T, MBV ) ELSE CALL ZTRBR2D( ICTXT, 'Rowwise', ' ', UPLO, 'Non unit', $ K, K, T, MBV, MYROW, IVCOL ) END IF CALL ZTRMM( 'Right', UPLO, TRANST, 'Non unit', NQC, K, $ ONE, T, MBV, WORK( IPW ), LW ) * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW ) ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', NQC, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C C - V' * W' * C( IOFFC ) = C( IOFFC ) - WORK( IPV ) * WORK( IPW )' * MPC x NQC MPC x K K x NQC * CALL ZGEMM( 'No transpose', 'Conjugate transpose', MPC, NQC, $ K, -ONE, WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC ), LDC ) * ELSE * * Form Q*sub( C ) or Q'*sub( C ) * * Locally V( IOFFV ) is K x NQV, C( IOFFC ) is MPC x NQC * WORK( IPV ) is K x NQV = V( IOFFV ), NQV = NQC * WORK( IPW ) is MPC x K = C( IOFFC ) * V( IOFFV )' * IPV = 1 IPW = IPV + K * NQC LV = MAX( 1, K ) LW = MAX( 1, MPC ) * * Broadcast V to the other process rows. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) IF( MYROW.EQ.IVROW ) THEN CALL ZGEBS2D( ICTXT, 'Columnwise', COLBTOP, K, NQC, $ V( IOFFV ), LDV ) IF( MYCOL.EQ.IVCOL ) $ CALL ZTRBS2D( ICTXT, 'Columnwise', COLBTOP, UPLO, $ 'Non unit', K, K, T, MBV ) CALL ZLAMOV( 'All', K, NQC, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', COLBTOP, K, NQC, $ WORK( IPV ), LV, IVROW, MYCOL ) IF( MYCOL.EQ.IVCOL ) $ CALL ZTRBR2D( ICTXT, 'Columnwise', COLBTOP, UPLO, $ 'Non unit', K, K, T, MBV, IVROW, MYCOL ) END IF * IF( FORWARD ) THEN * * WORK(IPW) = ( V1 V2 ) where V1 is unit upper * triangular, zeroes lower triangular part of V1 * MYDIST = MOD( MYCOL-IVCOL+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBV - ICOFFV ) JJBEG = JJV JJEND = JJV + NQC - 1 JJNXT = MIN( ICEIL( JJBEG, NBV ) * NBV, JJEND ) * 70 CONTINUE IF( ( K-ILEFT ).GT.0 ) THEN CALL ZLASET( 'Lower', K-ILEFT, JJNXT-JJBEG+1, ZERO, $ ONE, WORK( IPV+ILEFT+(JJBEG-JJV)*LV ), $ LV ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - ICOFFV JJBEG = JJNXT + 1 JJNXT = MIN( JJNXT+NBV, JJEND ) GO TO 70 END IF * ELSE * * WORK( IPW ) = ( . V1 V2 ) where V2 is unit lower * triangular, zeroes upper triangular part of V2. * II = IIV CALL INFOG1L( JV+N-K, NBV, NPCOL, MYCOL, DESCV( CSRC_ ), $ JJ, ILASTCOL ) IOFF = MOD( JV+N-K-1, NBV ) KQ = NUMROC( K+IOFF, NBV, MYCOL, ILASTCOL, NPCOL ) IF( MYCOL.EQ.ILASTCOL ) $ KQ = KQ - IOFF MYDIST = MOD( MYCOL-ILASTCOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT+NBV, K ) ILEFT = MIN( MAX( 0, ILEFT ), K ) * 80 CONTINUE IF( II.LE.( IIV+K-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL ZLASET( 'All', ILEFT-II+IIV, KQ, ZERO, ZERO, $ WORK( IPV+II-IIV+(JJ-JJV)*LV ), LV ) CALL ZLASET( 'Upper', WIDE, KQ, ZERO, ONE, $ WORK( IPV+ILEFT+(JJ-JJV)*LV ), LV ) KQ = MAX( 0, KQ - WIDE ) II = IIV + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBV - IOFF IRIGHT = MIN( ILEFT + NBV, K ) ILEFT = MIN( ILEFT, K ) GO TO 80 END IF * END IF * * WORK( IPV ) is K x NQC = V = V( IOFFV ) * WORK( IPW ) = C( IOFFC ) * V' (MPC x NQC x K) -> MPC x K * IF( NQC.GT.0 ) THEN CALL ZGEMM( 'No transpose', 'Conjugate transpose', MPC, $ K, NQC, ONE, C( IOFFC ), LDC, WORK( IPV ), $ LV, ZERO, WORK( IPW ), LW ) ELSE CALL ZLASET( 'All', MPC, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN CALL ZTRMM( 'Right', UPLO, TRANS, 'Non unit', MPC, K, $ ONE, T, MBV, WORK( IPW ), LW ) CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', MPC, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C C - W * V * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * WORK( IPV ) * MPC x NQC MPC x K K x NQC * CALL ZGEMM( 'No transpose', 'No transpose', MPC, NQC, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC ), LDC ) * END IF * END IF * RETURN * * End of PZLARFB * END scalapack-2.0.2/SRC/pzlarfc.f000644 000766 000024 00000070104 10363532303 016150 0ustar00juliestaff000000 000000 SUBROUTINE PZLARFC( SIDE, M, N, V, IV, JV, DESCV, INCV, TAU, $ C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX*16 C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PZLARFC applies a complex elementary reflector Q**H to a * complex M-by-N distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), * from either the left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a complex scalar and v is a complex vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also have the first row of sub( C ). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also have the first column of sub( C ) and * MOD(JV-1,NB_V) must be equal to MOD(JC-1,NB_C), if INCV = 1 only the * last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q**H * sub( C ), * = 'R': form sub( C ) * Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * V (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+M-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+M-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+N-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+N-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q**H * sub( C ) if SIDE = 'L', or * sub( C ) * Q**H if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL, ICOFF, ICROW, ICTXT, IIC, IIV, IOFFC, $ IOFFV, IPW, IROFF, IVCOL, IVROW, JJC, JJV, LDC, $ LDV, MYCOL, MYROW, MP, NCC, NCV, NPCOL, NPROW, $ NQ, RDEST COMPLEX*16 TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PB_TOPGET, PBZTRNV, $ ZCOPY, ZGEBR2D, ZGEBS2D, ZGEMV, $ ZGERC, ZGERV2D, ZGESD2D, ZGSUM2D, $ ZLASET * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC, JJC, $ ICROW, ICCOL ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDC = DESCC( LLD_ ) LDV = DESCV( LLD_ ) IIC = MIN( IIC, LDC ) IIV = MIN( IIV, LDV ) JJC = MIN( JJC, NCC ) JJV = MIN( JJV, NCV ) IOFFC = IIC+(JJC-1)*LDC IOFFV = IIV+(JJV-1)*LDV * IROFF = MOD( IC-1, DESCC( MB_ ) ) ICOFF = MOD( JC-1, DESCC( NB_ ) ) MP = NUMROC( M+IROFF, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) IF( MYROW.EQ.ICROW ) $ MP = MP - IROFF IF( MYCOL.EQ.ICCOL ) $ NQ = NQ - ICOFF * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFF) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFF) ) * IF( LSAME( SIDE, 'L' ) ) THEN * IF( CRBLCK ) THEN RDEST = ICROW ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V * IPW = MP+1 CALL PBZTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = DCONJG( TAU( IIV ) ) * ELSE * CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) TAULOC = DCONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL ) THEN * TAULOC = DCONJG( TAU( JJV ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MP, NQ, $ ONE, C( IOFFC ), LDC, V( IOFFV ), 1, $ ZERO, WORK, 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK, MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK, MAX( 1, NQ ), RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL ZGERC( MP, NQ, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL ZCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL ZGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL ) * ELSE IF( MYCOL.EQ.ICCOL ) THEN * IPW = MP+1 CALL ZGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = DCONJG( WORK( IPW ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MP, NQ, $ ONE, C( IOFFC ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V * IPW = MP+1 CALL PBZTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFF, V( IOFFV ), LDV, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = DCONJG( TAU( IIV ) ) * ELSE * CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) TAULOC = DCONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), 1, $ C( IOFFC ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MP+1 CALL ZCOPY( MP, V( IOFFV ), 1, WORK, 1 ) WORK(IPW) = TAU( JJV ) CALL ZGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = DCONJG( TAU( JJV ) ) * ELSE * IPW = MP+1 CALL ZGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = DCONJG( WORK( IPW ) ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MP.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQ, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQ ) ) END IF CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQ, 1, $ WORK( IPW ), MAX( 1, NQ ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, WORK( IPW ), 1, $ C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW ) THEN * TAULOC = DCONJG( TAU( IIV ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, V( IOFFV ), LDV, $ ZERO, WORK, 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, $ WORK, MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK, MAX( 1, MP ), RDEST, ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MP, NQ, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW * IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 CALL ZCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL ZGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW ) THEN * IPW = NQ+1 CALL ZGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = DCONJG( WORK( IPW ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V * IPW = NQ+1 CALL PBZTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = DCONJG( TAU( JJV ) ) * ELSE * CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) TAULOC = DCONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQ+1 CALL ZCOPY( NQ, V( IOFFV ), LDV, WORK, 1 ) WORK(IPW) = TAU( IIV ) CALL ZGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = DCONJG( TAU( IIV ) ) * ELSE * IPW = NQ+1 CALL ZGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = DCONJG( WORK( IPW ) ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No Transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, 1, $ C( IOFFC ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V * IPW = NQ+1 CALL PBZTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFF, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = DCONJG( TAU( JJV ) ) * ELSE * CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) TAULOC = DCONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQ.GT.0 ) THEN CALL ZGEMV( 'No transpose', MP, NQ, ONE, $ C( IOFFC ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MP, 1, ZERO, ZERO, WORK( IPW ), $ MAX( 1, MP ) ) END IF CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MP, 1, $ WORK( IPW ), MAX( 1, MP ), RDEST, $ ICCOL ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MP, NQ, -TAULOC, WORK( IPW ), 1, WORK, 1, $ C( IOFFC ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PZLARFC * END scalapack-2.0.2/SRC/pzlarfg.f000644 000766 000024 00000024401 10363532303 016153 0ustar00juliestaff000000 000000 SUBROUTINE PZLARFG( N, ALPHA, IAX, JAX, X, IX, JX, DESCX, INCX, $ TAU ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IAX, INCX, IX, JAX, JX, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX*16 TAU( * ), X( * ) * .. * * Purpose * ======= * * PZLARFG generates a complex elementary reflector H of order n, such * that * * H * sub( X ) = H * ( x(iax,jax) ) = ( alpha ), H' * H = I. * ( x ) ( 0 ) * * where alpha is a real scalar, and sub( X ) is an (N-1)-element * complex distributed vector X(IX:IX+N-2,JX) if INCX = 1 and * X(IX,JX:JX+N-2) if INCX = DESCX(M_). H is represented in the form * * H = I - tau * ( 1 ) * ( 1 v' ) , * ( v ) * * where tau is a complex scalar and v is a complex (N-1)-element * vector. Note that H is not Hermitian. * * If the elements of sub( X ) are all zero and X(IAX,JAX) is real, * then tau = 0 and H is taken to be the unit matrix. * * Otherwise 1 <= real(tau) <= 2 and abs(tau-1) <= 1. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Arguments * ========= * * N (global input) INTEGER * The global order of the elementary reflector. N >= 0. * * ALPHA (local output) COMPLEX*16 * On exit, alpha is computed in the process scope having the * vector sub( X ). * * IAX (global input) INTEGER * The global row index in X of X(IAX,JAX). * * JAX (global input) INTEGER * The global column index in X of X(IAX,JAX). * * X (local input/local output) COMPLEX*16, pointer into the * local memory to an array of dimension (LLD_X,*). This array * contains the local pieces of the distributed vector sub( X ). * Before entry, the incremented array sub( X ) must contain * the vector x. On exit, it is overwritten with the vector v. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * TAU (local output) COMPLEX*16, array, dimension LOCc(JX) * if INCX = 1, and LOCr(IX) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix X. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER ICTXT, IIAX, INDXTAU, IXCOL, IXROW, J, JJAX, $ KNT, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ALPHI, ALPHR, BETA, RSAFMN, SAFMIN, XNORM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PDZNRM2, $ ZGEBR2D, ZGEBS2D, PZSCAL, $ PZDSCAL * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLAPY3 COMPLEX*16 ZLADIV EXTERNAL DLAMCH, DLAPY3, ZLADIV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DIMAG, SIGN * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is distributed across a process row. * CALL INFOG2L( IX, JAX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIAX, JJAX, IXROW, IXCOL ) * IF( MYROW.NE.IXROW ) $ RETURN * * Broadcast X(IAX,JAX) across the process row. * IF( MYCOL.EQ.IXCOL ) THEN J = IIAX+(JJAX-1)*DESCX( LLD_ ) CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, X( J ), 1 ) ALPHA = X( J ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, ALPHA, 1, $ MYROW, IXCOL ) END IF * INDXTAU = IIAX * ELSE * * sub( X ) is distributed across a process column. * CALL INFOG2L( IAX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIAX, JJAX, IXROW, IXCOL ) * IF( MYCOL.NE.IXCOL ) $ RETURN * * Broadcast X(IAX,JAX) across the process column. * IF( MYROW.EQ.IXROW ) THEN J = IIAX+(JJAX-1)*DESCX( LLD_ ) CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, X( J ), 1 ) ALPHA = X( J ) ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, ALPHA, 1, $ IXROW, MYCOL ) END IF * INDXTAU = JJAX * END IF * IF( N.LE.0 ) THEN TAU( INDXTAU ) = ZERO RETURN END IF * CALL PDZNRM2( N-1, XNORM, X, IX, JX, DESCX, INCX ) ALPHR = DBLE( ALPHA ) ALPHI = DIMAG( ALPHA ) * IF( XNORM.EQ.ZERO .AND. ALPHI.EQ.ZERO ) THEN * * H = I * TAU( INDXTAU ) = ZERO * ELSE * * General case * BETA = -SIGN( DLAPY3( ALPHR, ALPHI, XNORM ), ALPHR ) SAFMIN = DLAMCH( 'S' ) RSAFMN = ONE / SAFMIN IF( ABS( BETA ).LT.SAFMIN ) THEN * * XNORM, BETA may be inaccurate; scale X and recompute them * KNT = 0 10 CONTINUE KNT = KNT + 1 CALL PZDSCAL( N-1, RSAFMN, X, IX, JX, DESCX, INCX ) BETA = BETA*RSAFMN ALPHI = ALPHI*RSAFMN ALPHR = ALPHR*RSAFMN IF( ABS( BETA ).LT.SAFMIN ) $ GO TO 10 * * New BETA is at most 1, at least SAFMIN * CALL PDZNRM2( N-1, XNORM, X, IX, JX, DESCX, INCX ) ALPHA = DCMPLX( ALPHR, ALPHI ) BETA = -SIGN( DLAPY3( ALPHR, ALPHI, XNORM ), ALPHR ) TAU( INDXTAU ) = DCMPLX( ( BETA-ALPHR ) / BETA, $ -ALPHI / BETA ) ALPHA = ZLADIV( DCMPLX( ONE ), ALPHA-BETA ) CALL PZSCAL( N-1, ALPHA, X, IX, JX, DESCX, INCX ) * * If ALPHA is subnormal, it may lose relative accuracy * ALPHA = BETA DO 20 J = 1, KNT ALPHA = ALPHA*SAFMIN 20 CONTINUE ELSE TAU( INDXTAU ) = DCMPLX( ( BETA-ALPHR ) / BETA, $ -ALPHI / BETA ) ALPHA = ZLADIV( DCMPLX( ONE ), ALPHA-BETA ) CALL PZSCAL( N-1, ALPHA, X, IX, JX, DESCX, INCX ) ALPHA = BETA END IF END IF * RETURN * * End of PZLARFG * END scalapack-2.0.2/SRC/pzlarft.f000644 000766 000024 00000045072 10363532303 016177 0ustar00juliestaff000000 000000 SUBROUTINE PZLARFT( DIRECT, STOREV, N, K, V, IV, JV, DESCV, TAU, $ T, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIRECT, STOREV INTEGER IV, JV, K, N * .. * .. Array Arguments .. INTEGER DESCV( * ) COMPLEX*16 TAU( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PZLARFT forms the triangular factor T of a complex block reflector H * of order n, which is defined as a product of k elementary reflectors. * * If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; * * If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. * * If STOREV = 'C', the vector which defines the elementary reflector * H(i) is stored in the i-th column of the distributed matrix V, and * * H = I - V * T * V' * * If STOREV = 'R', the vector which defines the elementary reflector * H(i) is stored in the i-th row of the distributed matrix V, and * * H = I - V' * T * V * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIRECT (global input) CHARACTER*1 * Specifies the order in which the elementary reflectors are * multiplied to form the block reflector: * = 'F': H = H(1) H(2) . . . H(k) (Forward) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER*1 * Specifies how the vectors which define the elementary * reflectors are stored (see also Further Details): * = 'C': columnwise * = 'R': rowwise * * N (global input) INTEGER * The order of the block reflector H. N >= 0. * * K (global input) INTEGER * The order of the triangular factor T (= the number of * elementary reflectors). 1 <= K <= MB_V (= NB_V). * * V (input/output) COMPLEX*16 pointer into the local memory * to an array of local dimension (LOCr(IV+N-1),LOCc(JV+K-1)) * if STOREV = 'C', and (LOCr(IV+K-1),LOCc(JV+N-1)) if * STOREV = 'R'. The distributed matrix V contains the * Householder vectors. See further details. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * TAU (local input) COMPLEX*16, array, dimension LOCr(IV+K-1) * if INCV = M_V, and LOCc(JV+K-1) otherwise. This array * contains the Householder scalars related to the Householder * vectors. TAU is tied to the distributed matrix V. * * T (local output) COMPLEX*16 array, dimension (NB_V,NB_V) * if STOREV = 'Col', and (MB_V,MB_V) otherwise. It contains * the k-by-k triangular factor of the block reflector asso- * ciated with V. If DIRECT = 'F', T is upper triangular; * if DIRECT = 'B', T is lower triangular. * * WORK (local workspace) COMPLEX*16 array, * dimension (K*(K-1)/2) * * Further Details * =============== * * The shape of the matrix V and the storage of the vectors which define * the H(i) is best illustrated by the following example with n = 5 and * k = 3. The elements equal to 1 are not stored; the corresponding * array elements are modified but restored on exit. The rest of the * array is not used. * * DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': * * V( IV:IV+N-1, ( 1 ) V( IV:IV+K-1, ( 1 v1 v1 v1 v1 ) * JV:JV+K-1 ) = ( v1 1 ) JV:JV+N-1 ) = ( 1 v2 v2 v2 ) * ( v1 v2 1 ) ( 1 v3 v3 ) * ( v1 v2 v3 ) * ( v1 v2 v3 ) * * DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': * * V( IV:IV+N-1, ( v1 v2 v3 ) V( IV:IV+K-1, ( v1 v1 1 ) * JV:JV+K-1 ) = ( v1 v2 v3 ) JV:JV+N-1 ) = ( v2 v2 v2 1 ) * ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) * ( 1 v3 ) * ( 1 ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL FORWARD INTEGER ICOFF, ICTXT, II, IIV, IROFF, IVCOL, IVROW, $ ITMP0, ITMP1, IW, JJ, JJV, LDV, MICOL, MIROW, $ MYCOL, MYROW, NP, NPCOL, NPROW, NQ COMPLEX*16 VII * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, ZCOPY, ZGEMV, $ ZGSUM2D, ZLACGV, ZLASET, ZTRMV * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC EXTERNAL INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 .OR. K.LE.0 ) $ RETURN * ICTXT = DESCV( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * FORWARD = LSAME( DIRECT, 'F' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, $ IIV, JJV, IVROW, IVCOL ) * IF( LSAME( STOREV, 'C' ) .AND. MYCOL.EQ.IVCOL ) THEN * IW = 1 LDV = DESCV( LLD_ ) IROFF = MOD( IV-1, DESCV( MB_ ) ) * IF( FORWARD ) THEN * * DIRECT = 'Forward', STOREV = 'Columnwise' * NP = NUMROC( N+IROFF, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) THEN NP = NP - IROFF II = IIV + 1 ELSE II = IIV END IF IF( IROFF+1.EQ.DESCV( MB_ ) ) THEN MIROW = MOD( IVROW+1, NPROW ) ELSE MIROW = IVROW END IF ITMP0 = 0 * DO 10 JJ = JJV+1, JJV+K-1 * IF( MYROW.EQ.MIROW ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( jv+i-1 ) * * V(iv+i-1:iv+n-1,jv:jv+i-2)' * V(iv+i-1:iv+n-1,jv+i-1) * ITMP0 = ITMP0 + 1 IF( NP-II+IIV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', NP-II+IIV, ITMP0, $ -TAU( JJ ), V( II+(JJV-1)*LDV ), LDV, $ V( II+(JJ-1)*LDV ), 1, ZERO, $ WORK( IW ), 1 ) ELSE CALL ZLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF * IW = IW + ITMP0 IF( MYROW.EQ.MIROW ) THEN V( II+(JJ-1)*LDV ) = VII II = II + 1 END IF * IF( MOD( IV+ITMP0, DESCV( MB_ ) ).EQ.0 ) $ MIROW = MOD( MIROW+1, NPROW ) * 10 CONTINUE * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', IW-1, 1, WORK, IW-1, $ IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * IW = 1 ITMP0 = 0 ITMP1 = 1 * T( ITMP1 ) = TAU( JJV ) * DO 20 JJ = JJV+1, JJV+K-1 * * T(1:j-1,j) = T(1:j-1,1:j-1) * T(1:j-1,j) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 + DESCV( NB_ ) CALL ZCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL ZTRMV( 'Upper', 'No transpose', 'Non-unit', $ ITMP0, T, DESCV( NB_ ), T( ITMP1 ), 1 ) T(ITMP1+ITMP0) = TAU( JJ ) * 20 CONTINUE * END IF * ELSE * * DIRECT = 'Backward', STOREV = 'Columnwise' * NP = NUMROC( N+IROFF-1, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ NP = NP - IROFF MIROW = INDXG2P( IV+N-2, DESCV( MB_ ), MYROW, $ DESCV( RSRC_ ), NPROW ) II = IIV + NP - 1 ITMP0 = 0 * DO 30 JJ = JJV+K-2, JJV, -1 * IF( MYROW.EQ.MIROW ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( jv+i-1 ) * * V(iv:iv+n-k+i-1,jv+i:jv+k-1)' * V(iv:iv+n-k+i-1,jv+i-1) * ITMP0 = ITMP0 + 1 IF( II-IIV+1.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', II-IIV+1, ITMP0, $ -TAU( JJ ), V( IIV+JJ*LDV ), LDV, $ V( IIV+(JJ-1)*LDV ), 1, ZERO, $ WORK( IW ), 1 ) ELSE CALL ZLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF * IW = IW + ITMP0 IF( MYROW.EQ.MIROW ) THEN V( II+(JJ-1)*LDV ) = VII II = II - 1 END IF * IF( MOD( IV+N-ITMP0-2, DESCV(MB_) ).EQ.0 ) $ MIROW = MOD( MIROW+NPROW-1, NPROW ) * 30 CONTINUE * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', IW-1, 1, WORK, IW-1, $ IVROW, MYCOL ) * IF( MYROW.EQ.IVROW ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( NB_ ) * T( ITMP1-1 ) = TAU( JJV+K-1 ) * DO 40 JJ = JJV+K-2, JJV, -1 * * T(j+1:k,j) = T(j+1:k,j+1:k) * T(j+1:k,j) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( NB_ ) - 1 CALL ZCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL ZTRMV( 'Lower', 'No transpose', 'Non-unit', $ ITMP0, T( ITMP1+DESCV( NB_ ) ), $ DESCV( NB_ ), T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( JJ ) * 40 CONTINUE * END IF * END IF * ELSE IF( LSAME( STOREV, 'R' ) .AND. MYROW.EQ.IVROW ) THEN * IW = 1 LDV = DESCV( LLD_ ) ICOFF = MOD( JV-1, DESCV( NB_ ) ) * IF( FORWARD ) THEN * * DIRECT = 'Forward', STOREV = 'Rowwise' * NQ = NUMROC( N+ICOFF, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN NQ = NQ - ICOFF JJ = JJV + 1 ELSE JJ = JJV END IF IF( ICOFF+1.EQ.DESCV( NB_ ) ) THEN MICOL = MOD( IVCOL+1, NPCOL ) ELSE MICOL = IVCOL END IF ITMP0 = 0 * DO 50 II = IIV+1, IIV+K-1 * IF( MYCOL.EQ.MICOL ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(1:i-1,i) = -tau( iv+i-1 ) * * V(iv+i-1,jv+i-1:jv+n-1) * V(iv:iv+i-2,jv+i-1:jv+n-1)' * ITMP0 = ITMP0 + 1 IF( NQ-JJ+JJV.GT.0 ) THEN CALL ZLACGV( NQ-JJ+JJV, V( II+(JJ-1)*LDV ), LDV ) CALL ZGEMV( 'No transpose', ITMP0, NQ-JJ+JJV, $ -TAU(II), V( IIV+(JJ-1)*LDV ), LDV, $ V( II+(JJ-1)*LDV ), LDV, ZERO, $ WORK( IW ), 1 ) CALL ZLACGV( NQ-JJ+JJV, V( II+(JJ-1)*LDV ), LDV ) ELSE CALL ZLASET( 'All', ITMP0, 1, ZERO, ZERO, $ WORK( IW ), ITMP0 ) END IF * IW = IW + ITMP0 IF( MYCOL.EQ.MICOL ) THEN V( II+(JJ-1)*LDV ) = VII JJ = JJ + 1 END IF * IF( MOD( JV+ITMP0, DESCV( NB_ ) ).EQ.0 ) $ MICOL = MOD( MICOL+1, NPCOL ) * 50 CONTINUE * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = 1 * T( ITMP1 ) = TAU( IIV ) * DO 60 II = IIV+1, IIV+K-1 * * T(1:i-1,i) = T(1:i-1,1:i-1) * T(1:i-1,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 + DESCV( MB_ ) CALL ZCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL ZTRMV( 'Upper', 'No transpose', 'Non-unit', $ ITMP0, T, DESCV( MB_ ), T( ITMP1 ), 1 ) T( ITMP1+ITMP0 ) = TAU( II ) * 60 CONTINUE * END IF * ELSE * * DIRECT = 'Backward', STOREV = 'Rowwise' * NQ = NUMROC( N+ICOFF-1, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQ = NQ - ICOFF MICOL = INDXG2P( JV+N-2, DESCV( NB_ ), MYCOL, $ DESCV( CSRC_ ), NPCOL ) JJ = JJV + NQ - 1 ITMP0 = 0 * DO 70 II = IIV+K-2, IIV, -1 * IF( MYCOL.EQ.MICOL ) THEN VII = V( II+(JJ-1)*LDV ) V( II+(JJ-1)*LDV ) = ONE END IF * * T(i+1:k,i) = -tau( iv+i-1 ) * * V(iv+i:iv+k-1,jv:jv+n-k+i-1)' * V(iv+i-1,jv:jv+n-k+i-1)' * ITMP0 = ITMP0 + 1 IF( JJ-JJV+1.GT.0 ) THEN CALL ZLACGV( JJ-JJV+1, V( II+(JJV-1)*LDV ), LDV ) CALL ZGEMV( 'No transpose', ITMP0, JJ-JJV+1, $ -TAU( II ), V( II+1+(JJV-1)*LDV ), LDV, $ V( II+(JJV-1)*LDV ), LDV, ZERO, $ WORK( IW ), 1 ) CALL ZLACGV( JJ-JJV+1, V( II+(JJV-1)*LDV ), LDV ) ELSE CALL ZLASET( 'All', ITMP0, 1, ZERO, ZERO, $ WORK( IW ), ITMP0 ) END IF * IW = IW + ITMP0 IF( MYCOL.EQ.MICOL ) THEN V( II+(JJ-1)*LDV ) = VII JJ = JJ - 1 END IF * IF( MOD( JV+N-ITMP0-2, DESCV( NB_ ) ).EQ.0 ) $ MICOL = MOD( MICOL+NPCOL-1, NPCOL ) * 70 CONTINUE * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( MB_ ) * T( ITMP1-1 ) = TAU( IIV+K-1 ) * DO 80 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = T(i+1:k,i+1:k) * T(i+1:k,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( MB_ ) - 1 CALL ZCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL ZTRMV( 'Lower', 'No transpose', 'Non-unit', $ ITMP0, T( ITMP1+DESCV( MB_ ) ), $ DESCV( MB_ ), T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( II ) * 80 CONTINUE * END IF * END IF * END IF * RETURN * * End of PZLARFT * END scalapack-2.0.2/SRC/pzlarz.f000644 000766 000024 00000102120 10363532303 016023 0ustar00juliestaff000000 000000 SUBROUTINE PZLARZ( SIDE, M, N, L, V, IV, JV, DESCV, INCV, TAU, C, $ IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX*16 C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PZLARZ applies a complex elementary reflector Q to a complex M-by-N * distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), from either the * left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a complex scalar and v is a complex vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Q is a product of k elementary reflectors as returned by PZTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also own C(IC+M-L,JC:JC+N-1). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC+N-L-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also own C(IC:IC+M-1,JC+N-L) and * MOD(JV-1,NB_V) must be equal to MOD(JC+N-L-1,NB_C), if INCV = 1 only * the last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q * sub( C ), * = 'R': form sub( C ) * Q. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+L-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+L-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q * sub( C ) if SIDE = 'L', or * sub( C ) * Q if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK, LEFT CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIC1, IIC2, IIV, IOFFC1, $ IOFFC2, IOFFV, IPW, IROFFC1, IROFFC2, IROFFV, $ IVCOL, IVROW, JJC1, JJC2, JJV, LDC, LDV, MPC2, $ MPV, MYCOL, MYROW, NCC, NCV, NPCOL, NPROW, $ NQC2, NQV, RDEST COMPLEX*16 TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PB_TOPGET, PBZTRNV, $ ZAXPY, ZCOPY, ZGEBR2D, ZGEBS2D, $ ZGEMV, ZGERC, ZGERV2D, ZGESD2D, $ ZGSUM2D, ZLASET * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * LEFT = LSAME( SIDE, 'L' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) IROFFV = MOD( IV-1, DESCV( NB_ ) ) MPV = NUMROC( L+IROFFV, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ MPV = MPV - IROFFV ICOFFV = MOD( JV-1, DESCV( NB_ ) ) NQV = NUMROC( L+ICOFFV, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, NCV ) IOFFV = IIV+(JJV-1)*LDV NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC1, JJC1, ICROW1, ICCOL1 ) IROFFC1 = MOD( IC-1, DESCC( MB_ ) ) ICOFFC1 = MOD( JC-1, DESCC( NB_ ) ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NCC ) ) IOFFC1 = IIC1 + ( JJC1-1 ) * LDC * IF( LEFT ) THEN CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, DESCC( MB_ ) ) ICOFFC2 = MOD( JC-1, DESCC( NB_ ) ) NQC2 = NUMROC( N+ICOFFC2, DESCC( NB_ ), MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 ELSE CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC-1, DESCC( MB_ ) ) MPC2 = NUMROC( M+IROFFC2, DESCC( MB_ ), MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = MOD( JC+N-L-1, DESCC( NB_ ) ) END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NCC ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFFC1) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFFC1) ) * IF( LEFT ) THEN * IF( CRBLCK ) THEN RDEST = ICROW2 ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V (ICOFFV = IROFFC2) * IPW = MPV+1 CALL PBZTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL2 ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * TAULOC = TAU( JJV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, V( IOFFV ), $ 1, ZERO, WORK, 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK, MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK, MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK, MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK, $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL2 * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL ZCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL ZGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL2 ) * ELSE IF( MYCOL.EQ.ICCOL2 ) THEN * IPW = MPV+1 CALL ZGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), $ RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V (ICOFFV=IROFFC2) * IPW = MPV+1 CALL PBZTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = TAU( IIV ) * ELSE * CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL ZCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL ZGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( JJV ) * ELSE * IPW = MPV+1 CALL ZGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW2 ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW2 ) THEN * TAULOC = TAU( IIV ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, V( IOFFV ), $ LDV, ZERO, WORK, 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK, MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK, 1 ) * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK, MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) * IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK, 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * IF( MPC2.GT.0 .AND. NQV.GT.0 ) $ CALL ZGERC( MPC2, NQV, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC2 ), $ LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW2 * IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL ZCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL ZGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW2, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW2 ) THEN * IPW = NQV+1 CALL ZGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = WORK( IPW ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), $ RDEST, ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V (IROFFV = ICOFFC2) * IPW = NQV+1 CALL PBZTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW2 ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = TAU( JJV ) * ELSE * CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL ZCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL ZGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = TAU( IIV ) * ELSE * IPW = NQV+1 CALL ZGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = WORK( IPW ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No Transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V (ICOFFC2=IROFFV) * IPW = NQV+1 CALL PBZTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = TAU( JJV ) * ELSE * CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PZLARZ * END scalapack-2.0.2/SRC/pzlarzb.f000644 000766 000024 00000056600 11750130340 016174 0ustar00juliestaff000000 000000 SUBROUTINE PZLARZB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, V, $ IV, JV, DESCV, T, C, IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER DIRECT, SIDE, STOREV, TRANS INTEGER IC, IV, JC, JV, K, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX*16 C( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PZLARZB applies a complex block reflector Q or its conjugate * transpose Q**H to a complex M-by-N distributed matrix sub( C ) * denoting C(IC:IC+M-1,JC:JC+N-1), from the left or the right. * * Q is a product of k elementary reflectors as returned by PZTZRZF. * * Currently, only STOREV = 'R' and DIRECT = 'B' are supported. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * DIRECT (global input) CHARACTER * Indicates how H is formed from a product of elementary * reflectors * = 'F': H = H(1) H(2) . . . H(k) (Forward, not supported yet) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Indicates how the vectors which define the elementary * reflectors are stored: * = 'C': Columnwise (not supported yet) * = 'R': Rowwise * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The order of the matrix T (= the number of elementary * reflectors whose product defines the block reflector). * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_V, LOCc(JV+M-1)) if SIDE = 'L', * (LLD_V, LOCc(JV+N-1)) if SIDE = 'R'. It contains the local * pieces of the distributed vectors V representing the * Householder transformation as returned by PZTZRZF. * LLD_V >= LOCr(IV+K-1). * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * T (local input) COMPLEX*16 array, dimension MB_V by MB_V * The lower triangular matrix T in the representation of the * block reflector. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the M-by-N distributed matrix sub( C ). On exit, * sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) or * sub( C )*Q or sub( C )*Q'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * If STOREV = 'C', * if SIDE = 'L', * LWORK >= ( NqC0 + MpC0 ) * K * else if SIDE = 'R', * LWORK >= ( NqC0 + MAX( NpV0 + NUMROC( NUMROC( N+ICOFFC, * NB_V, 0, 0, NPCOL ), NB_V, 0, 0, LCMQ ), * MpC0 ) ) * K * end if * else if STOREV = 'R', * if SIDE = 'L', * LWORK >= ( MpC0 + MAX( MqV0 + NUMROC( NUMROC( M+IROFFC, * MB_V, 0, 0, NPROW ), MB_V, 0, 0, LCMP ), * NqC0 ) ) * K * else if SIDE = 'R', * LWORK >= ( MpC0 + NqC0 ) * K * end if * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFV = MOD( IV-1, MB_V ), ICOFFV = MOD( JV-1, NB_V ), * IVROW = INDXG2P( IV, MB_V, MYROW, RSRC_V, NPROW ), * IVCOL = INDXG2P( JV, NB_V, MYCOL, CSRC_V, NPCOL ), * MqV0 = NUMROC( M+ICOFFV, NB_V, MYCOL, IVCOL, NPCOL ), * NpV0 = NUMROC( N+IROFFV, MB_V, MYROW, IVROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NpC0 = NUMROC( N+ICOFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If STOREV = 'Columnwise' * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if STOREV = 'Rowwise' * If SIDE = 'Left', * ( NB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT CHARACTER COLBTOP, TRANST INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIBEG, IIC1, IIC2, $ IIEND, IINXT, IIV, ILEFT, INFO, IOFFC2, IOFFV, $ IPT, IPV, IPW, IROFFC1, IROFFC2, ITOP, IVCOL, $ IVROW, J, JJBEG, JJEND, JJNXT, JJC1, JJC2, JJV, $ LDC, LDV, LV, LW, MBC, MBV, MPC1, MPC2, MPC20, $ MQV, MQV0, MYCOL, MYDIST, MYROW, NBC, NBV, $ NPCOL, NPROW, NQC1, NQC2, NQCALL, NQV * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, INFOG2L, $ PBZMATADD, PB_TOPGET, PXERBLA, PBZTRAN, $ ZGEBR2D, ZGEBS2D, ZGEMM, $ ZGSUM2D, ZLACGV, ZLAMOV, ZLASET, $ ZTRBR2D, ZTRBS2D, ZTRMM * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 .OR. K.LE.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check for currently supported options * INFO = 0 IF( .NOT.LSAME( DIRECT, 'B' ) ) THEN INFO = -3 ELSE IF( .NOT.LSAME( STOREV, 'R' ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZLARZB', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * LEFT = LSAME( SIDE, 'L' ) IF( LSAME( TRANS, 'N' ) ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) MBV = DESCV( MB_ ) NBV = DESCV( NB_ ) ICOFFV = MOD( JV-1, NBV ) NQV = NUMROC( L+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, MAX( 1, NUMROC( DESCV( N_ ), NBV, MYCOL, $ DESCV( CSRC_ ), NPCOL ) ) ) IOFFV = IIV + ( JJV-1 ) * LDV MBC = DESCC( MB_ ) NBC = DESCC( NB_ ) NQCALL = NUMROC( DESCC( N_ ), NBC, MYCOL, DESCC( CSRC_ ), NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, IIC1, $ JJC1, ICROW1, ICCOL1 ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NQCALL ) ) * IF( LEFT ) THEN IROFFC1 = MOD( IC-1, MBC ) MPC1 = NUMROC( K+IROFFC1, MBC, MYROW, ICROW1, NPROW ) IF( MYROW.EQ.ICROW1 ) $ MPC1 = MPC1 - IROFFC1 ICOFFC1 = MOD( JC-1, NBC ) NQC1 = NUMROC( N+ICOFFC1, NBC, MYCOL, ICCOL1, NPCOL ) IF( MYCOL.EQ.ICCOL1 ) $ NQC1 = NQC1 - ICOFFC1 CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, MBC ) MPC2 = NUMROC( L+IROFFC2, MBC, MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = ICOFFC1 NQC2 = NQC1 ELSE IROFFC1 = MOD( IC-1, MBC ) MPC1 = NUMROC( M+IROFFC1, MBC, MYROW, ICROW1, NPROW ) IF( MYROW.EQ.ICROW1 ) $ MPC1 = MPC1 - IROFFC1 ICOFFC1 = MOD( JC-1, NBC ) NQC1 = NUMROC( K+ICOFFC1, NBC, MYCOL, ICCOL1, NPCOL ) IF( MYCOL.EQ.ICCOL1 ) $ NQC1 = NQC1 - ICOFFC1 CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = IROFFC1 MPC2 = MPC1 ICOFFC2 = MOD( JC+N-L-1, NBC ) NQC2 = NUMROC( L+ICOFFC2, NBC, MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NQCALL ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * IF( LSAME( SIDE, 'L' ) ) THEN * * Form Q*sub( C ) or Q'*sub( C ) * * IROFFC2 = ICOFFV is required by the current transposition * routine PBZTRAN * MQV0 = NUMROC( M+ICOFFV, NBV, MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) THEN MQV = MQV0 - ICOFFV ELSE MQV = MQV0 END IF IF( MYROW.EQ.ICROW2 ) THEN MPC20 = MPC2 + IROFFC2 ELSE MPC20 = MPC2 END IF * * Locally V( IOFFV ) is K x MQV, C( IOFFC2 ) is MPC2 x NQC2 * WORK( IPV ) is MPC20 x K = [ . V( IOFFV ) ]' * WORK( IPW ) is K x MQV0 = [ . V( IOFFV ) ] * WORK( IPT ) is the workspace for PBZTRAN * IPV = 1 IPW = IPV + MPC20 * K IPT = IPW + K * MQV0 LV = MAX( 1, MPC20 ) LW = MAX( 1, K ) * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN CALL ZLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW+ICOFFV*LW ), LW ) ELSE CALL ZLAMOV( 'All', K, MQV, V( IOFFV ), LDV, $ WORK( IPW ), LW ) END IF END IF * * WORK( IPV ) = WORK( IPW )' (replicated) is MPC20 x K * CALL PBZTRAN( ICTXT, 'Rowwise', 'Conjugate transpose', K, $ M+ICOFFV, DESCV( NB_ ), WORK( IPW ), LW, ZERO, $ WORK( IPV ), LV, IVROW, IVCOL, ICROW2, -1, $ WORK( IPT ) ) * * WORK( IPV ) = ( . V )' -> WORK( IPV ) = V' is MPC2 x K * IF( MYROW.EQ.ICROW2 ) $ IPV = IPV + IROFFC2 * * WORK( IPW ) becomes NQC2 x K = C( IOFFC2 )' * V' * WORK( IPW ) = C( IOFFC2 )' * V' (NQC2 x MPC2 x K) -> NQC2 x K * LW = MAX( 1, NQC2 ) * IF( MPC2.GT.0 ) THEN CALL ZGEMM( 'Transpose', 'No transpose', NQC2, K, MPC2, $ ONE, C( IOFFC2 ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL ZLASET( 'All', NQC2, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * * WORK( IPW ) = WORK( IPW ) + C1 ( NQC1 = NQC2 ) * IF( MPC1.GT.0 ) THEN MYDIST = MOD( MYROW-ICROW1+NPROW, NPROW ) ITOP = MAX( 0, MYDIST * MBC - IROFFC1 ) IIBEG = IIC1 IIEND = IIC1 + MPC1 - 1 IINXT = MIN( ICEIL( IIBEG, MBC ) * MBC, IIEND ) * 10 CONTINUE IF( IIBEG.LE.IINXT ) THEN CALL PBZMATADD( ICTXT, 'Transpose', NQC2, IINXT-IIBEG+1, $ ONE, C( IIBEG+(JJC1-1)*LDC ), LDC, ONE, $ WORK( IPW+ITOP ), LW ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBC - IROFFC1 IIBEG = IINXT +1 IINXT = MIN( IINXT+MBC, IIEND ) GO TO 10 END IF END IF * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, K, WORK( IPW ), $ LW, IVROW, MYCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYROW.EQ.IVROW ) THEN IF( MYCOL.EQ.IVCOL ) THEN * * Broadcast the block reflector to the other columns. * CALL ZTRBS2D( ICTXT, 'Rowwise', ' ', 'Lower', 'Non unit', $ K, K, T, MBV ) ELSE CALL ZTRBR2D( ICTXT, 'Rowwise', ' ', 'Lower', 'Non unit', $ K, K, T, MBV, MYROW, IVCOL ) END IF CALL ZTRMM( 'Right', 'Lower', TRANST, 'Non unit', NQC2, K, $ ONE, T, MBV, WORK( IPW ), LW ) * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', NQC2, K, $ WORK( IPW ), LW ) ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', NQC2, K, $ WORK( IPW ), LW, IVROW, MYCOL ) END IF * * C1 = C1 - WORK( IPW ) * IF( MPC1.GT.0 ) THEN MYDIST = MOD( MYROW-ICROW1+NPROW, NPROW ) ITOP = MAX( 0, MYDIST * MBC - IROFFC1 ) IIBEG = IIC1 IIEND = IIC1 + MPC1 - 1 IINXT = MIN( ICEIL( IIBEG, MBC ) * MBC, IIEND ) * 20 CONTINUE IF( IIBEG.LE.IINXT ) THEN CALL PBZMATADD( ICTXT, 'Transpose', IINXT-IIBEG+1, NQC2, $ -ONE, WORK( IPW+ITOP ), LW, ONE, $ C( IIBEG+(JJC1-1)*LDC ), LDC ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBC - IROFFC1 IIBEG = IINXT +1 IINXT = MIN( IINXT+MBC, IIEND ) GO TO 20 END IF END IF * * C2 C2 - V' * W' * C( IOFFC2 ) = C( IOFFC2 ) - WORK( IPV ) * WORK( IPW )' * MPC2 x NQC2 MPC2 x K K x NQC2 * DO 30 J = 1, K CALL ZLACGV( MPC2, WORK( IPV+(J-1)*LV ), 1 ) 30 CONTINUE CALL ZGEMM( 'No transpose', 'Transpose', MPC2, NQC2, K, -ONE, $ WORK( IPV ), LV, WORK( IPW ), LW, ONE, $ C( IOFFC2 ), LDC ) * ELSE * * Form sub( C ) * Q or sub( C ) * Q' * * Locally V( IOFFV ) is K x NQV, C( IOFFC2 ) is MPC2 x NQC2 * WORK( IPV ) is K x NQV = V( IOFFV ), NQV = NQC2 * WORK( IPW ) is MPC2 x K = C( IOFFC2 ) * V( IOFFV )' * IPV = 1 IPW = IPV + K * NQC2 LV = MAX( 1, K ) LW = MAX( 1, MPC2 ) * * Broadcast V to the other process rows. * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) IF( MYROW.EQ.IVROW ) THEN CALL ZGEBS2D( ICTXT, 'Columnwise', COLBTOP, K, NQC2, $ V( IOFFV ), LDV ) IF( MYCOL.EQ.IVCOL ) $ CALL ZTRBS2D( ICTXT, 'Columnwise', COLBTOP, 'Lower', $ 'Non unit', K, K, T, MBV ) CALL ZLAMOV( 'All', K, NQC2, V( IOFFV ), LDV, WORK( IPV ), $ LV ) ELSE CALL ZGEBR2D( ICTXT, 'Columnwise', COLBTOP, K, NQC2, $ WORK( IPV ), LV, IVROW, MYCOL ) IF( MYCOL.EQ.IVCOL ) $ CALL ZTRBR2D( ICTXT, 'Columnwise', COLBTOP, 'Lower', $ 'Non unit', K, K, T, MBV, IVROW, MYCOL ) END IF * * WORK( IPV ) is K x NQC2 = V = V( IOFFV ) * WORK( IPW ) = C( IOFFC2 ) * V' (MPC2 x NQC2 x K) -> MPC2 x K * IF( NQC2.GT.0 ) THEN CALL ZGEMM( 'No Transpose', 'Transpose', MPC2, K, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK( IPV ), LV, ZERO, $ WORK( IPW ), LW ) ELSE CALL ZLASET( 'All', MPC2, K, ZERO, ZERO, WORK( IPW ), LW ) END IF * * WORK( IPW ) = WORK( IPW ) + C1 ( MPC1 = MPC2 ) * IF( NQC1.GT.0 ) THEN MYDIST = MOD( MYCOL-ICCOL1+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBC - ICOFFC1 ) JJBEG = JJC1 JJEND = JJC1 + NQC1 - 1 JJNXT = MIN( ICEIL( JJBEG, NBC ) * NBC, JJEND ) * 40 CONTINUE IF( JJBEG.LE.JJNXT ) THEN CALL PBZMATADD( ICTXT, 'No transpose', MPC2, $ JJNXT-JJBEG+1, ONE, $ C( IIC1+(JJBEG-1)*LDC ), LDC, ONE, $ WORK( IPW+ILEFT*LW ), LW ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBC - ICOFFC1 JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBC, JJEND ) GO TO 40 END IF END IF * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW, MYROW, IVCOL ) * * WORK( IPW ) = WORK( IPW ) * T' or WORK( IPW ) * T * IF( MYCOL.EQ.IVCOL ) THEN DO 50 J = 1, K CALL ZLACGV( K-J+1, T( J+(J-1)*MBV ), 1 ) 50 CONTINUE CALL ZTRMM( 'Right', 'Lower', TRANS, 'Non unit', MPC2, K, $ ONE, T, MBV, WORK( IPW ), LW ) CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW ) DO 60 J = 1, K CALL ZLACGV( K-J+1, T( J+(J-1)*MBV ), 1 ) 60 CONTINUE ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', MPC2, K, WORK( IPW ), $ LW, MYROW, IVCOL ) END IF * * C1 = C1 - WORK( IPW ) * IF( NQC1.GT.0 ) THEN MYDIST = MOD( MYCOL-ICCOL1+NPCOL, NPCOL ) ILEFT = MAX( 0, MYDIST * NBC - ICOFFC1 ) JJBEG = JJC1 JJEND = JJC1 + NQC1 - 1 JJNXT = MIN( ICEIL( JJBEG, NBC ) * NBC, JJEND ) * 70 CONTINUE IF( JJBEG.LE.JJNXT ) THEN CALL PBZMATADD( ICTXT, 'No transpose', MPC2, $ JJNXT-JJBEG+1, -ONE, $ WORK( IPW+ILEFT*LW ), LW, ONE, $ C( IIC1+(JJBEG-1)*LDC ), LDC ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBC - ICOFFC1 JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBC, JJEND ) GO TO 70 END IF END IF * * C2 C2 - W * conjg( V ) * C( IOFFC ) = C( IOFFC ) - WORK( IPW ) * conjg( WORK( IPV ) ) * MPC2 x NQC2 MPC2 x K K x NQC2 * DO 80 J = 1, NQC2 CALL ZLACGV( K, WORK( IPV+(J-1)*LV ), 1 ) 80 CONTINUE IF( IOFFC2.GT.0 ) $ CALL ZGEMM( 'No transpose', 'No transpose', MPC2, NQC2, K, $ -ONE, WORK( IPW ), LW, WORK( IPV ), LV, ONE, $ C( IOFFC2 ), LDC ) * END IF * RETURN * * End of PZLARZB * END scalapack-2.0.2/SRC/pzlarzc.f000644 000766 000024 00000102455 10363532303 016201 0ustar00juliestaff000000 000000 SUBROUTINE PZLARZC( SIDE, M, N, L, V, IV, JV, DESCV, INCV, TAU, C, $ IC, JC, DESCC, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE INTEGER IC, INCV, IV, JC, JV, L, M, N * .. * .. Array Arguments .. INTEGER DESCC( * ), DESCV( * ) COMPLEX*16 C( * ), TAU( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PZLARZC applies a complex elementary reflector Q**H to a * complex M-by-N distributed matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1), * from either the left or the right. Q is represented in the form * * Q = I - tau * v * v' * * where tau is a complex scalar and v is a complex vector. * * If tau = 0, then Q is taken to be the unit matrix. * * Q is a product of k elementary reflectors as returned by PZTZRZF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * Restrictions * ============ * * If SIDE = 'Left' and INCV = 1, then the row process having the first * entry V(IV,JV) must also own C(IC+M-L,JC:JC+N-1). Moreover, * MOD(IV-1,MB_V) must be equal to MOD(IC+N-L-1,MB_C), if INCV=M_V, only * the last equality must be satisfied. * * If SIDE = 'Right' and INCV = M_V then the column process having the * first entry V(IV,JV) must also own C(IC:IC+M-1,JC+N-L) and * MOD(JV-1,NB_V) must be equal to MOD(JC+N-L-1,NB_C), if INCV = 1 only * the last equality must be satisfied. * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': form Q**H * sub( C ), * = 'R': form sub( C ) * Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * V (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_V,*) containing the local * pieces of the distributed vectors V representing the * Householder transformation Q, * V(IV:IV+L-1,JV) if SIDE = 'L' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'L' and INCV = M_V, * V(IV:IV+L-1,JV) if SIDE = 'R' and INCV = 1, * V(IV,JV:JV+L-1) if SIDE = 'R' and INCV = M_V, * * The vector v in the representation of Q. V is not used if * TAU = 0. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * INCV (global input) INTEGER * The global increment for the elements of V. Only two values * of INCV are supported in this version, namely 1 and M_V. * INCV must not be zero. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JV) if * INCV = 1, and LOCr(IV) otherwise. This array contains the * Householder scalars related to the Householder vectors. * TAU is tied to the distributed matrix V. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C, LOCc(JC+N-1) ), * containing the local pieces of sub( C ). On exit, sub( C ) * is overwritten by the Q**H * sub( C ) if SIDE = 'L', or * sub( C ) * Q**H if SIDE = 'R'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * If INCV = 1, * if SIDE = 'L', * if IVCOL = ICCOL, * LWORK >= NqC0 * else * LWORK >= MpC0 + MAX( 1, NqC0 ) * end if * else if SIDE = 'R', * LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( NUMROC( * N+ICOFFC,NB_V,0,0,NPCOL ),NB_V,0,0,LCMQ ) ) * end if * else if INCV = M_V, * if SIDE = 'L', * LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( NUMROC( * M+IROFFC,MB_V,0,0,NPROW ),MB_V,0,0,LCMP ) ) * else if SIDE = 'R', * if IVROW = ICROW, * LWORK >= MpC0 * else * LWORK >= NqC0 + MAX( 1, MpC0 ) * end if * end if * end if * * where LCM is the least common multiple of NPROW and NPCOL and * LCM = ILCM( NPROW, NPCOL ), LCMP = LCM / NPROW, * LCMQ = LCM / NPCOL, * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Alignment requirements * ====================== * * The distributed submatrices V(IV:*, JV:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * MB_V = NB_V, * * If INCV = 1, * If SIDE = 'Left', * ( MB_V.EQ.MB_C .AND. IROFFV.EQ.IROFFC .AND. IVROW.EQ.ICROW ) * If SIDE = 'Right', * ( MB_V.EQ.NB_A .AND. MB_V.EQ.NB_C .AND. IROFFV.EQ.ICOFFC ) * else if INCV = M_V, * If SIDE = 'Left', * ( MB_V.EQ.NB_V .AND. MB_V.EQ.MB_C .AND. ICOFFV.EQ.IROFFC ) * If SIDE = 'Right', * ( NB_V.EQ.NB_C .AND. ICOFFV.EQ.ICOFFC .AND. IVCOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CCBLCK, CRBLCK, LEFT CHARACTER COLBTOP, ROWBTOP INTEGER ICCOL1, ICCOL2, ICOFFC1, ICOFFC2, ICOFFV, $ ICROW1, ICROW2, ICTXT, IIC1, IIC2, IIV, IOFFC1, $ IOFFC2, IOFFV, IPW, IROFFC1, IROFFC2, IROFFV, $ IVCOL, IVROW, JJC1, JJC2, JJV, LDC, LDV, MPC2, $ MPV, MYCOL, MYROW, NCC, NCV, NPCOL, NPROW, $ NQC2, NQV, RDEST COMPLEX*16 TAULOC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PB_TOPGET, PBZTRNV, $ ZAXPY, ZCOPY, ZGEBR2D, ZGEBS2D, $ ZGEMV, ZGERC, ZGERV2D, ZGESD2D, $ ZGSUM2D, ZLASET * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Get grid parameters. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * LEFT = LSAME( SIDE, 'L' ) CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, IIV, JJV, $ IVROW, IVCOL ) IROFFV = MOD( IV-1, DESCV( NB_ ) ) MPV = NUMROC( L+IROFFV, DESCV( MB_ ), MYROW, IVROW, NPROW ) IF( MYROW.EQ.IVROW ) $ MPV = MPV - IROFFV ICOFFV = MOD( JV-1, DESCV( NB_ ) ) NQV = NUMROC( L+ICOFFV, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQV = NQV - ICOFFV LDV = DESCV( LLD_ ) NCV = NUMROC( DESCV( N_ ), DESCV( NB_ ), MYCOL, DESCV( CSRC_ ), $ NPCOL ) LDV = DESCV( LLD_ ) IIV = MIN( IIV, LDV ) JJV = MIN( JJV, NCV ) IOFFV = IIV+(JJV-1)*LDV NCC = NUMROC( DESCC( N_ ), DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC1, JJC1, ICROW1, ICCOL1 ) IROFFC1 = MOD( IC-1, DESCC( MB_ ) ) ICOFFC1 = MOD( JC-1, DESCC( NB_ ) ) LDC = DESCC( LLD_ ) IIC1 = MIN( IIC1, LDC ) JJC1 = MIN( JJC1, MAX( 1, NCC ) ) IOFFC1 = IIC1 + ( JJC1-1 ) * LDC * IF( LEFT ) THEN CALL INFOG2L( IC+M-L, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC+M-L-1, DESCC( MB_ ) ) ICOFFC2 = MOD( JC-1, DESCC( NB_ ) ) NQC2 = NUMROC( N+ICOFFC2, DESCC( NB_ ), MYCOL, ICCOL2, NPCOL ) IF( MYCOL.EQ.ICCOL2 ) $ NQC2 = NQC2 - ICOFFC2 ELSE CALL INFOG2L( IC, JC+N-L, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC2, JJC2, ICROW2, ICCOL2 ) IROFFC2 = MOD( IC-1, DESCC( MB_ ) ) MPC2 = NUMROC( M+IROFFC2, DESCC( MB_ ), MYROW, ICROW2, NPROW ) IF( MYROW.EQ.ICROW2 ) $ MPC2 = MPC2 - IROFFC2 ICOFFC2 = MOD( JC+N-L-1, DESCC( NB_ ) ) END IF IIC2 = MIN( IIC2, LDC ) JJC2 = MIN( JJC2, NCC ) IOFFC2 = IIC2 + ( JJC2-1 ) * LDC * * Is sub( C ) only distributed over a process row ? * CRBLCK = ( M.LE.(DESCC( MB_ )-IROFFC1) ) * * Is sub( C ) only distributed over a process column ? * CCBLCK = ( N.LE.(DESCC( NB_ )-ICOFFC1) ) * IF( LEFT ) THEN * IF( CRBLCK ) THEN RDEST = ICROW2 ELSE RDEST = -1 END IF * IF( CCBLCK ) THEN * * sub( C ) is distributed over a process column * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose row vector V (ICOFFV = IROFFC2) * IPW = MPV+1 CALL PBZTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * IF( MYROW.EQ.IVROW ) THEN * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = DCONJG( TAU( IIV ) ) * ELSE * CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAULOC, 1, IVROW, MYCOL ) TAULOC = DCONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * V is a column vector * IF( IVCOL.EQ.ICCOL2 ) THEN * * Perform the local computation within a process column * IF( MYCOL.EQ.ICCOL2 ) THEN * TAULOC = DCONJG( TAU( JJV ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, V( IOFFV ), $ 1, ZERO, WORK, 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK, MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK, MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK, MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK, $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, V( IOFFV ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process column ICCOL2 * IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL ZCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL ZGESD2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ ICCOL2 ) * ELSE IF( MYCOL.EQ.ICCOL2 ) THEN * IPW = MPV+1 CALL ZGERV2D( ICTXT, IPW, 1, WORK, IPW, MYROW, $ IVCOL ) TAULOC = DCONJG( WORK( IPW ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, $ ONE, C( IOFFC2 ), LDC, WORK, 1, $ ZERO, WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), $ RDEST, MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), $ LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, WORK, 1, $ WORK( IPW ), 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Transpose and broadcast row vector V (ICOFFV=IROFFC2) * IPW = MPV+1 CALL PBZTRNV( ICTXT, 'Rowwise', 'Transpose', M, $ DESCV( NB_ ), IROFFC2, V( IOFFV ), LDV, $ ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, -1, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.IVROW ) THEN * CALL ZGEBS2D( ICTXT, 'Columnwise', ' ', 1, 1, $ TAU( IIV ), 1 ) TAULOC = DCONJG( TAU( IIV ) ) * ELSE * CALL ZGEBR2D( ICTXT, 'Columnwise', ' ', 1, 1, TAULOC, $ 1, IVROW, MYCOL ) TAULOC = DCONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Broadcast column vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) IF( MYCOL.EQ.IVCOL ) THEN * IPW = MPV+1 CALL ZCOPY( MPV, V( IOFFV ), 1, WORK, 1 ) WORK( IPW ) = TAU( JJV ) CALL ZGEBS2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, $ WORK, IPW ) TAULOC = DCONJG( TAU( JJV ) ) * ELSE * IPW = MPV+1 CALL ZGEBR2D( ICTXT, 'Rowwise', ROWBTOP, IPW, 1, WORK, $ IPW, MYROW, IVCOL ) TAULOC = DCONJG( WORK( IPW ) ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C )' * v * IF( MPV.GT.0 ) THEN CALL ZGEMV( 'Conjugate transpose', MPV, NQC2, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', NQC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, NQC2 ) ) END IF IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, ONE, C( IOFFC1 ), LDC, $ WORK( IPW ), MAX( 1, NQC2 ) ) * CALL ZGSUM2D( ICTXT, 'Columnwise', ' ', NQC2, 1, $ WORK( IPW ), MAX( 1, NQC2 ), RDEST, $ MYCOL ) * * sub( C ) := sub( C ) - v * w' * IF( MYROW.EQ.ICROW1 ) $ CALL ZAXPY( NQC2, -TAULOC, WORK( IPW ), $ MAX( 1, NQC2 ), C( IOFFC1 ), LDC ) CALL ZGERC( MPV, NQC2, -TAULOC, WORK, 1, WORK( IPW ), $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * IF( CCBLCK ) THEN RDEST = MYROW ELSE RDEST = -1 END IF * IF( CRBLCK ) THEN * * sub( C ) is distributed over a process row * IF( DESCV( M_ ).EQ.INCV ) THEN * * V is a row vector * IF( IVROW.EQ.ICROW2 ) THEN * * Perform the local computation within a process row * IF( MYROW.EQ.ICROW2 ) THEN * TAULOC = DCONJG( TAU( IIV ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, V( IOFFV ), $ LDV, ZERO, WORK, 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK, MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK, 1 ) * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK, MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) * IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK, 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MPC2, NQV, -TAULOC, WORK, 1, $ V( IOFFV ), LDV, C( IOFFC2 ), LDC ) END IF * END IF * ELSE * * Send V and TAU to the process row ICROW2 * IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL ZCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL ZGESD2D( ICTXT, IPW, 1, WORK, IPW, ICROW2, $ MYCOL ) * ELSE IF( MYROW.EQ.ICROW2 ) THEN * IPW = NQV+1 CALL ZGERV2D( ICTXT, IPW, 1, WORK, IPW, IVROW, $ MYCOL ) TAULOC = DCONJG( WORK( IPW ) ) * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), $ RDEST, ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * Transpose column vector V (IROFFV = ICOFFC2) * IPW = NQV+1 CALL PBZTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, ICROW2, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYROW.EQ.ICROW2 ) THEN * IF( MYCOL.EQ.IVCOL ) THEN * CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, $ TAU( JJV ), 1 ) TAULOC = DCONJG( TAU( JJV ) ) * ELSE * CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, $ 1, MYROW, IVCOL ) TAULOC = DCONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, $ WORK, 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * ELSE * * sub( C ) is a proper distributed matrix * IF( DESCV( M_ ).EQ.INCV ) THEN * * Broadcast row vector V * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ COLBTOP ) IF( MYROW.EQ.IVROW ) THEN * IPW = NQV+1 CALL ZCOPY( NQV, V( IOFFV ), LDV, WORK, 1 ) WORK( IPW ) = TAU( IIV ) CALL ZGEBS2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW ) TAULOC = DCONJG( TAU( IIV ) ) * ELSE * IPW = NQV+1 CALL ZGEBR2D( ICTXT, 'Columnwise', COLBTOP, IPW, 1, $ WORK, IPW, IVROW, MYCOL ) TAULOC = DCONJG( WORK( IPW ) ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No Transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * ELSE * * Transpose and broadcast column vector V (ICOFFC2=IROFFV) * IPW = NQV+1 CALL PBZTRNV( ICTXT, 'Columnwise', 'Transpose', N, $ DESCV( MB_ ), ICOFFC2, V( IOFFV ), 1, ZERO, $ WORK, 1, IVROW, IVCOL, -1, ICCOL2, $ WORK( IPW ) ) * * Perform the local computation within a process column * IF( MYCOL.EQ.IVCOL ) THEN * CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, TAU( JJV ), $ 1 ) TAULOC = DCONJG( TAU( JJV ) ) * ELSE * CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, TAULOC, 1, $ MYROW, IVCOL ) TAULOC = DCONJG( TAULOC ) * END IF * IF( TAULOC.NE.ZERO ) THEN * * w := sub( C ) * v * IF( NQV.GT.0 ) THEN CALL ZGEMV( 'No transpose', MPC2, NQV, ONE, $ C( IOFFC2 ), LDC, WORK, 1, ZERO, $ WORK( IPW ), 1 ) ELSE CALL ZLASET( 'All', MPC2, 1, ZERO, ZERO, $ WORK( IPW ), MAX( 1, MPC2 ) ) END IF IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, ONE, C( IOFFC1 ), 1, $ WORK( IPW ), 1 ) CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', MPC2, 1, $ WORK( IPW ), MAX( 1, MPC2 ), RDEST, $ ICCOL2 ) IF( MYCOL.EQ.ICCOL1 ) $ CALL ZAXPY( MPC2, -TAULOC, WORK( IPW ), 1, $ C( IOFFC1 ), 1 ) * * sub( C ) := sub( C ) - w * v' * CALL ZGERC( MPC2, NQV, -TAULOC, WORK( IPW ), 1, WORK, $ 1, C( IOFFC2 ), LDC ) END IF * END IF * END IF * END IF * RETURN * * End of PZLARZC * END scalapack-2.0.2/SRC/pzlarzt.f000644 000766 000024 00000026105 10363532303 016217 0ustar00juliestaff000000 000000 SUBROUTINE PZLARZT( DIRECT, STOREV, N, K, V, IV, JV, DESCV, TAU, $ T, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIRECT, STOREV INTEGER IV, JV, K, N * .. * .. Array Arguments .. INTEGER DESCV( * ) COMPLEX*16 TAU( * ), T( * ), V( * ), WORK( * ) * .. * * Purpose * ======= * * PZLARZT forms the triangular factor T of a complex block reflector * H of order > n, which is defined as a product of k elementary * reflectors as returned by PZTZRZF. * * If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; * * If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. * * If STOREV = 'C', the vector which defines the elementary reflector * H(i) is stored in the i-th column of the array V, and * * H = I - V * T * V' * * If STOREV = 'R', the vector which defines the elementary reflector * H(i) is stored in the i-th row of the array V, and * * H = I - V' * T * V * * Currently, only STOREV = 'R' and DIRECT = 'B' are supported. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIRECT (global input) CHARACTER * Specifies the order in which the elementary reflectors are * multiplied to form the block reflector: * = 'F': H = H(1) H(2) . . . H(k) (Forward, not supported yet) * = 'B': H = H(k) . . . H(2) H(1) (Backward) * * STOREV (global input) CHARACTER * Specifies how the vectors which define the elementary * reflectors are stored (see also Further Details): * = 'C': columnwise (not supported yet) * = 'R': rowwise * * N (global input) INTEGER * The number of meaningful entries of the block reflector H. * N >= 0. * * K (global input) INTEGER * The order of the triangular factor T (= the number of * elementary reflectors). 1 <= K <= MB_V (= NB_V). * * V (input/output) COMPLEX*16 pointer into the local memory * to an array of local dimension (LOCr(IV+K-1),LOCc(JV+N-1)). * The distributed matrix V contains the Householder vectors. * See further details. * * IV (global input) INTEGER * The row index in the global array V indicating the first * row of sub( V ). * * JV (global input) INTEGER * The column index in the global array V indicating the * first column of sub( V ). * * DESCV (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix V. * * TAU (local input) COMPLEX*16, array, dimension LOCr(IV+K-1) * if INCV = M_V, and LOCc(JV+K-1) otherwise. This array * contains the Householder scalars related to the Householder * vectors. TAU is tied to the distributed matrix V. * * T (local output) COMPLEX*16 array, dimension (MB_V,MB_V) * It contains the k-by-k triangular factor of the block * reflector associated with V. T is lower triangular. * * WORK (local workspace) COMPLEX*16 array, * dimension (K*(K-1)/2) * * Further Details * =============== * * The shape of the matrix V and the storage of the vectors which define * the H(i) is best illustrated by the following example with n = 5 and * k = 3. The elements equal to 1 are not stored; the corresponding * array elements are modified but restored on exit. The rest of the * array is not used. * * DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': * * ______V_____ * ( v1 v2 v3 ) / \ * ( v1 v2 v3 ) ( v1 v1 v1 v1 v1 . . . . 1 ) * V = ( v1 v2 v3 ) ( v2 v2 v2 v2 v2 . . . 1 ) * ( v1 v2 v3 ) ( v3 v3 v3 v3 v3 . . 1 ) * ( v1 v2 v3 ) * . . . * . . . * 1 . . * 1 . * 1 * * DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': * * ______V_____ * 1 / \ * . 1 ( 1 . . . . v1 v1 v1 v1 v1 ) * . . 1 ( . 1 . . . v2 v2 v2 v2 v2 ) * . . . ( . . 1 . . v3 v3 v3 v3 v3 ) * . . . * ( v1 v2 v3 ) * ( v1 v2 v3 ) * V = ( v1 v2 v3 ) * ( v1 v2 v3 ) * ( v1 v2 v3 ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER ICOFF, ICTXT, II, IIV, INFO, IVCOL, IVROW, $ ITMP0, ITMP1, IW, JJV, LDV, MYCOL, MYROW, $ NPCOL, NPROW, NQ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, INFOG2L, PXERBLA, $ ZCOPY, ZGEMV, ZGSUM2D, ZLACGV, $ ZLASET, ZTRMV * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCV( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check for currently supported options * INFO = 0 IF( .NOT.LSAME( DIRECT, 'B' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( STOREV, 'R' ) ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZLARZT', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * CALL INFOG2L( IV, JV, DESCV, NPROW, NPCOL, MYROW, MYCOL, $ IIV, JJV, IVROW, IVCOL ) * IF( MYROW.EQ.IVROW ) THEN IW = 1 ITMP0 = 0 LDV = DESCV( LLD_ ) ICOFF = MOD( JV-1, DESCV( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCV( NB_ ), MYCOL, IVCOL, NPCOL ) IF( MYCOL.EQ.IVCOL ) $ NQ = NQ - ICOFF * DO 10 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = -tau( iv+i-1 ) * * V(iv+i:iv+k-1,jv:jv+n-1) * V(iv+i-1,jv:jv+n-1)' * ITMP0 = ITMP0 + 1 IF( NQ.GT.0 ) THEN CALL ZLACGV( NQ, V( II+(JJV-1)*LDV ), LDV ) CALL ZGEMV( 'No transpose', ITMP0, NQ, -TAU( II ), $ V( II+1+(JJV-1)*LDV ), LDV, $ V( II+(JJV-1)*LDV ), LDV, ZERO, WORK( IW ), $ 1 ) CALL ZLACGV( NQ, V( II+(JJV-1)*LDV ), LDV ) ELSE CALL ZLASET( 'All', ITMP0, 1, ZERO, ZERO, WORK( IW ), $ ITMP0 ) END IF IW = IW + ITMP0 * 10 CONTINUE * CALL ZGSUM2D( ICTXT, 'Rowwise', ' ', IW-1, 1, WORK, IW-1, $ MYROW, IVCOL ) * IF( MYCOL.EQ.IVCOL ) THEN * IW = 1 ITMP0 = 0 ITMP1 = K + 1 + (K-1) * DESCV( MB_ ) * T( ITMP1-1 ) = TAU( IIV+K-1 ) * DO 20 II = IIV+K-2, IIV, -1 * * T(i+1:k,i) = T(i+1:k,i+1:k) * T(i+1:k,i) * ITMP0 = ITMP0 + 1 ITMP1 = ITMP1 - DESCV( MB_ ) - 1 CALL ZCOPY( ITMP0, WORK( IW ), 1, T( ITMP1 ), 1 ) IW = IW + ITMP0 * CALL ZTRMV( 'Lower', 'No transpose', 'Non-unit', ITMP0, $ T( ITMP1+DESCV( MB_ ) ), DESCV( MB_ ), $ T( ITMP1 ), 1 ) T( ITMP1-1 ) = TAU( II ) * 20 CONTINUE * END IF * END IF * RETURN * * End of PZLARZT * END scalapack-2.0.2/SRC/pzlascl.f000644 000766 000024 00000043043 11552067542 016172 0ustar00juliestaff000000 000000 SUBROUTINE PZLASCL( TYPE, CFROM, CTO, M, N, A, IA, JA, DESCA, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER TYPE INTEGER IA, INFO, JA, M, N DOUBLE PRECISION CFROM, CTO * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLASCL multiplies the M-by-N complex distributed matrix sub( A ) * denoting A(IA:IA+M-1,JA:JA+N-1) by the real scalar CTO/CFROM. This * is done without over/underflow as long as the final result * CTO * A(I,J) / CFROM does not over/underflow. TYPE specifies that * sub( A ) may be full, upper triangular, lower triangular or upper * Hessenberg. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * TYPE (global input) CHARACTER * TYPE indices the storage type of the input distributed * matrix. * = 'G': sub( A ) is a full matrix, * = 'L': sub( A ) is a lower triangular matrix, * = 'U': sub( A ) is an upper triangular matrix, * = 'H': sub( A ) is an upper Hessenberg matrix. * * CFROM (global input) DOUBLE PRECISION * CTO (global input) DOUBLE PRECISION * The distributed matrix sub( A ) is multiplied by CTO/CFROM. * A(I,J) is computed without over/underflow if the final * result CTO * A(I,J) / CFROM can be represented without * over/underflow. CFROM must be nonzero. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * This array contains the local pieces of the distributed * matrix sub( A ). On exit, this array contains the local * pieces of the distributed matrix multiplied by CTO/CFROM. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. LOGICAL DONE INTEGER IACOL, IAROW, ICOFFA, ICTXT, ICURCOL, ICURROW, $ IIA, II, INXTROW, IOFFA, IROFFA, ITYPE, J, JB, $ JJA, JJ, JN, KK, LDA, LL, MYCOL, MYROW, MP, $ NPCOL, NPROW, NQ DOUBLE PRECISION BIGNUM, CFROM1, CFROMC, CTO1, CTOC, MUL, SMLNUM * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, INFOG2L, PXERBLA * .. * .. External Functions .. LOGICAL LSAME, DISNAN INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL DISNAN, ICEIL, LSAME, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * IF( NPROW.EQ.-1 ) THEN INFO = -907 ELSE INFO = 0 CALL CHK1MAT( M, 4, N, 6, IA, JA, DESCA, 9, INFO ) IF( INFO.EQ.0 ) THEN IF( LSAME( TYPE, 'G' ) ) THEN ITYPE = 0 ELSE IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 ELSE ITYPE = -1 END IF IF( ITYPE.EQ.-1 ) THEN INFO = -1 ELSE IF( CFROM.EQ.ZERO .OR. DISNAN(CFROM) ) THEN INFO = -4 ELSE IF( DISNAN(CTO) ) THEN INFO = -5 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZLASCL', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) $ RETURN * * Get machine parameters * SMLNUM = PDLAMCH( ICTXT, 'S' ) BIGNUM = ONE / SMLNUM * CFROMC = CFROM CTOC = CTO * * Compute local indexes * LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MP = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ MP = MP - IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFFA * 10 CONTINUE CFROM1 = CFROMC*SMLNUM IF( CFROM1.EQ.CFROMC ) THEN ! CFROMC is an inf. Multiply by a correctly signed zero for ! finite CTOC, or a NaN if CTOC is infinite. MUL = CTOC / CFROMC DONE = .TRUE. CTO1 = CTOC ELSE CTO1 = CTOC / BIGNUM IF( CTO1.EQ.CTOC ) THEN ! CTOC is either 0 or an inf. In both cases, CTOC itself ! serves as the correct multiplication factor. MUL = CTOC DONE = .TRUE. CFROMC = ONE ELSE IF( ABS( CFROM1 ).GT.ABS( CTOC ) .AND. CTOC.NE.ZERO ) THEN MUL = SMLNUM DONE = .FALSE. CFROMC = CFROM1 ELSE IF( ABS( CTO1 ).GT.ABS( CFROMC ) ) THEN MUL = BIGNUM DONE = .FALSE. CTOC = CTO1 ELSE MUL = CTOC / CFROMC DONE = .TRUE. END IF END IF * IOFFA = ( JJA - 1 ) * LDA ICURROW = IAROW ICURCOL = IACOL * IF( ITYPE.EQ.0 ) THEN * * Full matrix * DO 30 JJ = JJA, JJA+NQ-1 DO 20 II = IIA, IIA+MP-1 A( IOFFA+II ) = A( IOFFA+II ) * MUL 20 CONTINUE IOFFA = IOFFA + LDA 30 CONTINUE * ELSE IF( ITYPE.EQ.1 ) THEN * * Lower triangular matrix * II = IIA JJ = JJA JB = JN-JA+1 * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 50 LL = JJ, JJ + JB -1 DO 40 KK = II+LL-JJ, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 40 CONTINUE IOFFA = IOFFA + LDA 50 CONTINUE ELSE DO 70 LL = JJ, JJ + JB -1 DO 60 KK = II, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 60 CONTINUE IOFFA = IOFFA + LDA 70 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 120 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 90 LL = JJ, JJ + JB -1 DO 80 KK = II+LL-JJ, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 80 CONTINUE IOFFA = IOFFA + LDA 90 CONTINUE ELSE DO 110 LL = JJ, JJ + JB -1 DO 100 KK = II, IIA+MP-1 A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 100 CONTINUE IOFFA = IOFFA + LDA 110 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 120 CONTINUE * ELSE IF( ITYPE.EQ.2 ) THEN * * Upper triangular matrix * II = IIA JJ = JJA JB = JN-JA+1 * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 140 LL = JJ, JJ + JB -1 DO 130 KK = IIA, MIN(II+LL-JJ,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 130 CONTINUE IOFFA = IOFFA + LDA 140 CONTINUE ELSE DO 160 LL = JJ, JJ + JB -1 DO 150 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 150 CONTINUE IOFFA = IOFFA + LDA 160 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 210 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 180 LL = JJ, JJ + JB -1 DO 170 KK = IIA, MIN(II+LL-JJ,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 170 CONTINUE IOFFA = IOFFA + LDA 180 CONTINUE ELSE DO 200 LL = JJ, JJ + JB -1 DO 190 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 190 CONTINUE IOFFA = IOFFA + LDA 200 CONTINUE END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 210 CONTINUE * ELSE IF( ITYPE.EQ.3 ) THEN * * Upper Hessenberg matrix * II = IIA JJ = JJA JB = JN-JA+1 * * Only one process row * IF( NPROW.EQ.1 ) THEN * * Handle first block of columns separately * IF( MYCOL.EQ.ICURCOL ) THEN DO 230 LL = JJ, JJ+JB-1 DO 220 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 220 CONTINUE IOFFA = IOFFA + LDA 230 CONTINUE JJ = JJ + JB END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 260 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN DO 250 LL = JJ, JJ+JB-1 DO 240 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK )*MUL 240 CONTINUE IOFFA = IOFFA + LDA 250 CONTINUE JJ = JJ + JB END IF * II = II + JB ICURCOL = MOD( ICURCOL+1, NPCOL ) * 260 CONTINUE * ELSE * * Handle first block of columns separately * INXTROW = MOD( ICURROW+1, NPROW ) IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 280 LL = JJ, JJ + JB -1 DO 270 KK = IIA, MIN(II+LL-JJ+1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 270 CONTINUE IOFFA = IOFFA + LDA 280 CONTINUE ELSE DO 300 LL = JJ, JJ + JB -1 DO 290 KK = IIA, MIN(II-1,IIA+MP-1) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 290 CONTINUE IOFFA = IOFFA + LDA 300 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+MP-1 ) $ A( II+(JJ+JB-2)*LDA ) = A( II+(JJ+JB-2)*LDA ) * MUL END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = INXTROW ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining block of columns * DO 350 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( MYROW.EQ.ICURROW ) THEN DO 320 LL = JJ, JJ + JB -1 DO 310 KK = IIA, MIN( II+LL-JJ+1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 310 CONTINUE IOFFA = IOFFA + LDA 320 CONTINUE ELSE DO 340 LL = JJ, JJ + JB -1 DO 330 KK = IIA, MIN( II-1, IIA+MP-1 ) A( IOFFA+KK ) = A( IOFFA+KK ) * MUL 330 CONTINUE IOFFA = IOFFA + LDA 340 CONTINUE IF( MYROW.EQ.INXTROW .AND. II.LE.IIA+MP-1 ) $ A( II+(JJ+JB-2)*LDA ) = A( II+(JJ+JB-2)*LDA ) * $ MUL END IF JJ = JJ + JB END IF * IF( MYROW.EQ.ICURROW ) $ II = II + JB ICURROW = INXTROW ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 350 CONTINUE * END IF * END IF * IF( .NOT.DONE ) $ GO TO 10 * RETURN * * End of PZLASCL * END scalapack-2.0.2/SRC/pzlase2.f000644 000766 000024 00000037374 10363532303 016103 0ustar00juliestaff000000 000000 SUBROUTINE PZLASE2( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLASE2 initializes an M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1) to BETA on the diagonal and ALPHA on the * offdiagonals. PZLASE2 requires that only dimension of the matrix * operand is distributed. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * set: * = 'U': Upper triangular part is set; the strictly lower * triangular part of sub( A ) is not changed; * = 'L': Lower triangular part is set; the strictly upper * triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * ALPHA (global input) COMPLEX*16 * The constant to which the offdiagonal elements are to be * set. * * BETA (global input) COMPLEX*16 * The constant to which the diagonal elements are to be set. * * A (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ) * to be set. On exit, the leading M-by-N submatrix sub( A ) * is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER HEIGHT, IACOL, IAROW, IBASE, ICOFFA, II, IIA, $ IIBEG, IIEND, IINXT, ILEFT, IRIGHT, IROFFA, $ ITOP, JJ, JJA, JJBEG, JJEND, JJNXT, LDA, MBA, $ MP, MPA, MYCOL, MYDIST, MYROW, NBA, NPCOL, $ NPROW, NQ, NQA, WIDE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, ZLASET * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL ICEIL, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) MBA = DESCA( MB_ ) NBA = DESCA( NB_ ) LDA = DESCA( LLD_ ) IROFFA = MOD( IA-1, MBA ) ICOFFA = MOD( JA-1, NBA ) * IF( N.LE.( NBA-ICOFFA ) ) THEN * * It is assumed that the local columns JJA:JJA+N-1 of the matrix * A are in the same process column (IACOL). * * N * JJA JJA+N-1 * / --------------------- \ * IROFFA| | | | * \ |...................| | ( IAROW ) * IIA |x | | MB_A * | x | | * |--x----------------| / * | x | * | x | ITOP * | x | | * | x | /-------\ * |-------x-----------| |-------x-----------| * | x | | x | * | x | | x | * | x | | x | * | x | | x | * |------------x------| |------------x------| * | x | \____________/ * | x | | * | x | IBASE * | x | * |-----------------x-| Local picture * | x| * | | * | | * | | * |-------------------| * | | * . . * . . * . (IACOL) . * IF( MYCOL.EQ.IACOL ) THEN * MPA = NUMROC( M+IROFFA, MBA, MYROW, IAROW, NPROW ) IF( MPA.LE.0 ) $ RETURN IF( MYROW.EQ.IAROW ) $ MPA = MPA - IROFFA MYDIST = MOD( MYROW-IAROW+NPROW, NPROW ) ITOP = MYDIST * MBA - IROFFA * IF( LSAME( UPLO, 'U' ) ) THEN * ITOP = MAX( 0, ITOP ) IIBEG = IIA IIEND = IIA + MPA - 1 IINXT = MIN( ICEIL( IIBEG, MBA ) * MBA, IIEND ) * 10 CONTINUE IF( ( N-ITOP ).GT.0 ) THEN CALL ZLASET( UPLO, IINXT-IIBEG+1, N-ITOP, ALPHA, BETA, $ A( IIBEG+(JJA+ITOP-1)*LDA ), LDA ) MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IIBEG = IINXT +1 IINXT = MIN( IINXT+MBA, IIEND ) GO TO 10 END IF * ELSE IF( LSAME( UPLO, 'L' ) ) THEN * II = IIA JJ = JJA MP = MPA IBASE = MIN( ITOP+MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 20 CONTINUE IF( JJ.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL ZLASET( 'All', MP, ITOP-JJ+JJA, ALPHA, ALPHA, $ A( II+(JJ-1)*LDA ), LDA ) CALL ZLASET( UPLO, MP, HEIGHT, ALPHA, BETA, $ A( II+(JJA+ITOP-1)*LDA ), LDA ) MP = MAX( 0, MP - HEIGHT ) II = II + HEIGHT JJ = JJA + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 20 END IF * ELSE * II = IIA JJ = JJA MP = MPA IBASE = MIN( ITOP+MBA, N ) ITOP = MIN( MAX( 0, ITOP ), N ) * 30 CONTINUE IF( JJ.LE.( JJA+N-1 ) ) THEN HEIGHT = IBASE - ITOP CALL ZLASET( 'All', MPA, ITOP-JJ+JJA, ALPHA, ALPHA, $ A( IIA+(JJ-1)*LDA ), LDA ) CALL ZLASET( 'All', MPA-MP, HEIGHT, ALPHA, ALPHA, $ A( IIA+(JJA+ITOP-1)*LDA ), LDA ) CALL ZLASET( 'All', MP, HEIGHT, ALPHA, BETA, $ A( II+(JJA+ITOP-1)*LDA ), LDA ) MP = MAX( 0, MP - HEIGHT ) II = II + HEIGHT JJ = JJA + IBASE MYDIST = MYDIST + NPROW ITOP = MYDIST * MBA - IROFFA IBASE = MIN( ITOP + MBA, N ) ITOP = MIN( ITOP, N ) GO TO 30 END IF * END IF * END IF * ELSE IF( M.LE.( MBA-IROFFA ) ) THEN * * It is assumed that the local rows IIA:IIA+M-1 of the matrix A * are in the same process row (IAROW). * * ICOFFA * / \JJA * IIA ------------------ .... -------- * | .x | | | / | | \ * | . x | | | ILEFT| | | | * | . x | | | | | | * | . x | | \ x | | * | . |x | | |x | | IRIGHT * | . | x | | | x | | * (IAROW) | . | x | | | x | | * | . | x| | | x| | * | . | x | | x / * | . | |x | | | * | . | | x | | | * | . | | x | | | * | . | | x| | | * IIA+M-1 ------------------ .... ------- * NB_A * (IACOL) Local picture * IF( MYROW.EQ.IAROW ) THEN * NQA = NUMROC( N+ICOFFA, NBA, MYCOL, IACOL, NPCOL ) IF( NQA.LE.0 ) $ RETURN IF( MYCOL.EQ.IACOL ) $ NQA = NQA - ICOFFA MYDIST = MOD( MYCOL-IACOL+NPCOL, NPCOL ) ILEFT = MYDIST * NBA - ICOFFA * IF( LSAME( UPLO, 'L' ) ) THEN * ILEFT = MAX( 0, ILEFT ) JJBEG = JJA JJEND = JJA + NQA - 1 JJNXT = MIN( ICEIL( JJBEG, NBA ) * NBA, JJEND ) * 40 CONTINUE IF( ( M-ILEFT ).GT.0 ) THEN CALL ZLASET( UPLO, M-ILEFT, JJNXT-JJBEG+1, ALPHA, $ BETA, A( IIA+ILEFT+(JJBEG-1)*LDA ), LDA ) MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA JJBEG = JJNXT +1 JJNXT = MIN( JJNXT+NBA, JJEND ) GO TO 40 END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * II = IIA JJ = JJA NQ = NQA IRIGHT = MIN( ILEFT+NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 50 CONTINUE IF( II.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL ZLASET( 'All', ILEFT-II+IIA, NQ, ALPHA, ALPHA, $ A( II+(JJ-1)*LDA ), LDA ) CALL ZLASET( UPLO, WIDE, NQ, ALPHA, BETA, $ A( IIA+ILEFT+(JJ-1)*LDA ), LDA ) NQ = MAX( 0, NQ - WIDE ) II = IIA + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 50 END IF * ELSE * II = IIA JJ = JJA NQ = NQA IRIGHT = MIN( ILEFT+NBA, M ) ILEFT = MIN( MAX( 0, ILEFT ), M ) * 60 CONTINUE IF( II.LE.( IIA+M-1 ) ) THEN WIDE = IRIGHT - ILEFT CALL ZLASET( 'All', ILEFT-II+IIA, NQA, ALPHA, ALPHA, $ A( II+(JJA-1)*LDA ), LDA ) CALL ZLASET( 'All', WIDE, NQA-NQ, ALPHA, ALPHA, $ A( IIA+ILEFT+(JJA-1)*LDA ), LDA ) CALL ZLASET( 'All', WIDE, NQ, ALPHA, BETA, $ A( IIA+ILEFT+(JJ-1)*LDA ), LDA ) NQ = MAX( 0, NQ - WIDE ) II = IIA + IRIGHT JJ = JJ + WIDE MYDIST = MYDIST + NPCOL ILEFT = MYDIST * NBA - ICOFFA IRIGHT = MIN( ILEFT + NBA, M ) ILEFT = MIN( ILEFT, M ) GO TO 60 END IF * END IF * END IF * END IF * RETURN * * End of PZLASE2 * END scalapack-2.0.2/SRC/pzlaset.f000644 000766 000024 00000021711 10363532303 016171 0ustar00juliestaff000000 000000 SUBROUTINE PZLASET( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLASET initializes an M-by-N distributed matrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1) to BETA on the diagonal and ALPHA on the * offdiagonals. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies the part of the distributed matrix sub( A ) to be * set: * = 'U': Upper triangular part is set; the strictly lower * triangular part of sub( A ) is not changed; * = 'L': Lower triangular part is set; the strictly upper * triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( A ). N >= 0. * * ALPHA (global input) COMPLEX*16 * The constant to which the offdiagonal elements are to be * set. * * BETA (global input) COMPLEX*16 * The constant to which the diagonal elements are to be set. * * A (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1)). This array * contains the local pieces of the distributed matrix sub( A ) * to be set. On exit, the leading M-by-N submatrix sub( A ) * is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, IAA, IBLK, IN, ITMP, J, JAA, JBLK, JN, JTMP * .. * .. External Subroutines .. EXTERNAL PZLASE2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( M.LE.( DESCA( MB_ ) - MOD( IA-1, DESCA( MB_ ) ) ) .OR. $ N.LE.( DESCA( NB_ ) - MOD( JA-1, DESCA( NB_ ) ) ) ) THEN CALL PZLASE2( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) ELSE * IF( LSAME( UPLO, 'U' ) ) THEN IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) CALL PZLASE2( UPLO, IN-IA+1, N, ALPHA, BETA, A, IA, JA, $ DESCA ) DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) JAA = JA + ITMP CALL PZLASE2( UPLO, IBLK, N-ITMP, ALPHA, BETA, $ A, I, JAA, DESCA ) 10 CONTINUE ELSE IF( LSAME( UPLO, 'L' ) ) THEN JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) CALL PZLASE2( UPLO, M, JN-JA+1, ALPHA, BETA, A, IA, JA, $ DESCA ) DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) IAA = IA + JTMP CALL PZLASE2( UPLO, M-JTMP, JBLK, ALPHA, BETA, A, IAA, $ J, DESCA ) 20 CONTINUE ELSE IF( M.LE.N ) THEN IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), $ IA+M-1 ) CALL PZLASE2( UPLO, IN-IA+1, N, ALPHA, BETA, A, IA, $ JA, DESCA ) DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) ITMP = I-IA IBLK = MIN( DESCA( MB_ ), M-ITMP ) CALL PZLASE2( UPLO, IBLK, I-IA, ALPHA, ALPHA, A, I, $ JA, DESCA ) CALL PZLASE2( UPLO, IBLK, N-I+IA, ALPHA, BETA, A, I, $ JA+I-IA, DESCA ) 30 CONTINUE ELSE JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), $ JA+N-1 ) CALL PZLASE2( UPLO, M, JN-JA+1, ALPHA, BETA, A, IA, $ JA, DESCA ) DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JTMP = J-JA JBLK = MIN( DESCA( NB_ ), N-JTMP ) CALL PZLASE2( UPLO, J-JA, JBLK, ALPHA, ALPHA, A, IA, $ J, DESCA ) CALL PZLASE2( UPLO, M-J+JA, JBLK, ALPHA, BETA, A, $ IA+J-JA, J, DESCA ) 40 CONTINUE END IF END IF * END IF * RETURN * * End of PZLASET * END scalapack-2.0.2/SRC/pzlasmsub.f000644 000766 000024 00000031637 10602576752 016553 0ustar00juliestaff000000 000000 SUBROUTINE PZLASMSUB( A, DESCA, I, L, K, SMLNUM, BUF, LWORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. INTEGER I, K, L, LWORK DOUBLE PRECISION SMLNUM * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), BUF( * ) * .. * * Purpose * ======= * * PZLASMSUB looks for a small subdiagonal element from the bottom * of the matrix that it can safely set to zero. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) COMPLEX*16 array, dimension (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * K (global output) INTEGER * On exit, this yields the bottom portion of the unreduced * submatrix. This will satisfy: L <= M <= I-1. * * SMLNUM (global input) DOUBLE PRECISION * On entry, a "small number" for the given matrix. * Unchanged on exit. * * BUF (local output) COMPLEX*16 array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 2*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * This code is basically a parallelization of the following snip * of LAPACK code from ZLAHQR: * * Look for a single small subdiagonal element. * * DO 20 K = I, L + 1, -1 * TST1 = CABS1( H( K-1, K-1 ) ) + CABS1( H( K, K ) ) * IF( TST1.EQ.ZERO ) * $ TST1 = ZLANHS( '1', I-L+1, H( L, L ), LDH, WORK ) * IF( CABS1( H( K, K-1 ) ).LE.MAX( ULP*TST1, SMLNUM ) ) * $ GO TO 30 * 20 CONTINUE * 30 CONTINUE * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IBUF1, IBUF2, ICOL1, ICOL2, $ II, III, IRCV1, IRCV2, IROW1, IROW2, ISRC, $ ISTR1, ISTR2, ITMP1, ITMP2, JJ, JJJ, JSRC, LDA, $ LEFT, MODKM1, MYCOL, MYROW, NPCOL, NPROW, NUM, $ RIGHT, UP DOUBLE PRECISION TST1, ULP COMPLEX*16 CDUM, H10, H11, H22 * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, INFOG1L, INFOG2L, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 STARTS AT BUF(ISTR1+1) AND WILL CONTAINS IBUF1 ELEMENTS * BUFFER2 STARTS AT BUF(ISTR2+1) AND WILL CONTAINS IBUF2 ELEMENTS * ISTR1 = 0 ISTR2 = ( ( I-L ) / HBL ) IF( ISTR2*HBL.LT.( I-L ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.2*ISTR2 ) THEN * * Error! * RETURN END IF CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * COPY OUR RELEVANT PIECES OF TRIADIAGONAL THAT WE OWE INTO * 2 BUFFERS TO SEND TO WHOMEVER OWNS H(K,K) AS K MOVES DIAGONALLY * UP THE TRIDIAGONAL * IBUF1 = 0 IBUF2 = 0 IRCV1 = 0 IRCV2 = 0 DO 10 K = I, L + 1, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K-1,K-1) AND SEND IT DIAGONAL DOWN * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K ,K-1) AND SEND IT RIGHT * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( K, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * ADD UP THE RECEIVES * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT. $ 1 ) ) ) THEN * * WE MUST RECEIVE H(K-1,K-1) FROM DIAGONAL UP * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) ) THEN * * WE MUST RECEIVE H(K ,K-1) FROM LEFT * IRCV2 = IRCV2 + 1 END IF END IF * * POSSIBLY CHANGE OWNERS (OCCURS ONLY WHEN MOD(K-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * SEND DATA ON TO THE APPROPRIATE NODE IF THERE IS ANY DATA TO SEND * IF( IBUF1.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, MYROW, $ RIGHT ) END IF * * RECEIVE APPROPRIATE DATA IF THERE IS ANY * IF( IRCV1.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, MYROW, $ LEFT ) END IF * * START MAIN LOOP * IBUF1 = 0 IBUF2 = 0 CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * LOOK FOR A SINGLE SMALL SUBDIAGONAL ELEMENT. * * Start loop for subdiagonal search * DO 40 K = I, L + 1, -1 IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN * * Grab information from WORK array * IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H11 = BUF( ISTR1+IBUF1 ) ELSE H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF2 = IBUF2 + 1 H10 = BUF( ISTR2+IBUF2 ) ELSE H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF ELSE * * Information is local * H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF H22 = A( ( ICOL1-1 )*LDA+IROW1 ) TST1 = CABS1( H11 ) + CABS1( H22 ) IF( TST1.EQ.ZERO ) THEN * * FIND SOME NORM OF THE LOCAL H(L:I,L:I) * CALL INFOG1L( L, HBL, NPROW, MYROW, 0, ITMP1, III ) IROW2 = NUMROC( I, HBL, MYROW, 0, NPROW ) CALL INFOG1L( L, HBL, NPCOL, MYCOL, 0, ITMP2, III ) ICOL2 = NUMROC( I, HBL, MYCOL, 0, NPCOL ) DO 30 III = ITMP1, IROW2 DO 20 JJJ = ITMP2, ICOL2 TST1 = TST1 + CABS1( A( ( JJJ-1 )*LDA+III ) ) 20 CONTINUE 30 CONTINUE END IF IF( CABS1( H10 ).LE.MAX( ULP*TST1, SMLNUM ) ) $ GO TO 50 IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 IF( ( MODKM1.EQ.HBL-1 ) .AND. ( K.GT.2 ) ) THEN II = MOD( II+NPROW-1, NPROW ) JJ = MOD( JJ+NPCOL-1, NPCOL ) CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ITMP1, ITMP2 ) END IF 40 CONTINUE 50 CONTINUE CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, K, 1, ITMP1, ITMP2, -1, $ -1, -1 ) RETURN * * End of PZLASMSUB * END scalapack-2.0.2/SRC/pzlassq.f000644 000766 000024 00000024346 10363532303 016213 0ustar00juliestaff000000 000000 SUBROUTINE PZLASSQ( N, X, IX, JX, DESCX, INCX, SCALE, SUMSQ ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IX, INCX, JX, N DOUBLE PRECISION SCALE, SUMSQ * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX*16 X( * ) * .. * * Purpose * ======= * * PZLASSQ returns the values scl and smsq such that * * ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, * * where x( i ) = sub( X ) = abs( X( IX+(JX-1)*DESCX(M_)+(i-1)*INCX ) ). * The value of sumsq is assumed to be at least unity and the value of * ssq will then satisfy * * 1.0 .le. ssq .le. ( sumsq + 2*n ). * * scale is assumed to be non-negative and scl returns the value * * scl = max( scale, abs( real( x( i ) ) ), abs( aimag( x( i ) ) ) ), * i * * scale and sumsq must be supplied in SCALE and SUMSQ respectively. * SCALE and SUMSQ are overwritten by scl and ssq respectively. * * The routine makes only one pass through the vector sub( X ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * The result are only available in the scope of sub( X ), i.e if * sub( X ) is distributed along a process row, the correct results are * only available in this process row of the grid. Similarly if sub( X ) * is distributed along a process column, the correct results are only * available in this process column of the grid. * * Arguments * ========= * * N (global input) INTEGER * The length of the distributed vector sub( X ). * * X (input) COMPLEX*16 * The vector for which a scaled sum of squares is computed. * x( i ) = X(IX+(JX-1)*M_X +(i-1)*INCX ), 1 <= i <= n. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * SCALE (local input/local output) DOUBLE PRECISION * On entry, the value scale in the equation above. * On exit, SCALE is overwritten with scl , the scaling factor * for the sum of squares. * * SUMSQ (local input/local output) DOUBLE PRECISION * On entry, the value sumsq in the equation above. * On exit, SUMSQ is overwritten with smsq , the basic sum of * squares from which scl has been factored out. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, ICOFF, ICTXT, IIX, IOFF, IROFF, IXCOL, $ IXROW, JJX, LDX, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ DOUBLE PRECISION TEMP1 * .. * .. Local Arrays .. DOUBLE PRECISION WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOMBSSQ, INFOG2L, PDTREECOMB * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Figure local indexes * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) * LDX = DESCX( LLD_ ) IF( INCX.EQ.DESCX( M_ ) ) THEN * * X is rowwise distributed. * IF( MYROW.NE.IXROW ) $ RETURN ICOFF = MOD( JX, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ - ICOFF * * Code direct from LAPACK's ZLASSQ, (save subroutine call) * IF( NQ.GT.0 ) THEN IOFF = IIX + ( JJX - 1 ) * LDX DO 10 I = 1, NQ IF( DBLE( X( IOFF ) ).NE.ZERO ) THEN TEMP1 = ABS( DBLE( X( IOFF ) ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ * ( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IF( DIMAG( X( IOFF ) ).NE.ZERO ) THEN TEMP1 = ABS( DIMAG( X( IOFF ) ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ * ( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IOFF = IOFF + LDX 10 CONTINUE END IF * * Take local result and find global * WORK( 1 ) = SCALE WORK( 2 ) = SUMSQ * CALL PDTREECOMB( ICTXT, 'Rowwise', 2, WORK, -1, IXCOL, $ DCOMBSSQ ) * SCALE = WORK( 1 ) SUMSQ = WORK( 2 ) * ELSE IF( INCX.EQ.1 ) THEN * * X is columnwise distributed. * IF( MYCOL.NE.IXCOL ) $ RETURN IROFF = MOD( IX, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFF * * Code direct from LAPACK's ZLASSQ, (save subroutine call) * IF( NP.GT.0 ) THEN IOFF = IIX + ( JJX - 1 ) * LDX DO 20 I = 1, NP IF( DBLE( X( IOFF ) ).NE.ZERO ) THEN TEMP1 = ABS( DBLE( X( IOFF ) ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ*( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IF( DIMAG( X( IOFF ) ).NE.ZERO ) THEN TEMP1 = ABS( DIMAG( X( IOFF ) ) ) IF( SCALE.LT.TEMP1 ) THEN SUMSQ = 1 + SUMSQ*( SCALE / TEMP1 )**2 SCALE = TEMP1 ELSE SUMSQ = SUMSQ + ( TEMP1 / SCALE )**2 END IF END IF IOFF = IOFF + 1 20 CONTINUE END IF * * Take local result and find global * WORK( 1 ) = SCALE WORK( 2 ) = SUMSQ * CALL PDTREECOMB( ICTXT, 'Columnwise', 2, WORK, -1, IXCOL, $ DCOMBSSQ ) * SCALE = WORK( 1 ) SUMSQ = WORK( 2 ) * END IF * RETURN * * End of PZLASSQ * END scalapack-2.0.2/SRC/pzlaswp.f000644 000766 000024 00000020342 10363532303 016206 0ustar00juliestaff000000 000000 SUBROUTINE PZLASWP( DIREC, ROWCOL, N, A, IA, JA, DESCA, K1, K2, $ IPIV ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIREC, ROWCOL INTEGER IA, JA, K1, K2, N * .. * .. Array Arguments .. INTEGER DESCA( * ), IPIV( * ) COMPLEX*16 A( * ) * .. * * Purpose: * ======== * * PZLASWP performs a series of row or column interchanges on * the distributed matrix sub( A ) = A(IA:IA+M-1,JA:JA+N-1). One * interchange is initiated for each of rows or columns K1 trough K2 of * sub( A ). This routine assumes that the pivoting information has * already been broadcast along the process row or column. * Also note that this routine will only work for K1-K2 being in the * same MB (or NB) block. If you want to pivot a full matrix, use * PZLAPIV. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * DIREC (global input) CHARACTER * Specifies in which order the permutation is applied: * = 'F' (Forward) * = 'B' (Backward) * * ROWCOL (global input) CHARACTER * Specifies if the rows or columns are permuted: * = 'R' (Rows) * = 'C' (Columns) * * N (global input) INTEGER * If ROWCOL = 'R', the length of the rows of the distributed * matrix A(*,JA:JA+N-1) to be permuted; * If ROWCOL = 'C', the length of the columns of the distributed * matrix A(IA:IA+N-1,*) to be permuted. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, * ). * On entry, this array contains the local pieces of the distri- * buted matrix to which the row/columns interchanges will be * applied. On exit the permuted distributed matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * K1 (global input) INTEGER * The first element of IPIV for which a row or column inter- * change will be done. * * K2 (global input) INTEGER * The last element of IPIV for which a row or column inter- * change will be done. * * IPIV (local input) INTEGER array, dimension LOCr(M_A)+MB_A for * row pivoting and LOCc(N_A)+NB_A for column pivoting. This * array is tied to the matrix A, IPIV(K) = L implies rows * (or columns) K and L are to be interchanged. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER I, ICURCOL, ICURROW, IIA, IP, J, JJA, JP, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, PZSWAP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( ROWCOL, 'R' ) ) THEN IF( LSAME( DIREC, 'F' ) ) THEN CALL INFOG2L( K1, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 10 I = K1, K2 IP = IPIV( IIA+I-K1 ) IF( IP.NE.I ) $ CALL PZSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP, $ JA, DESCA, DESCA( M_ ) ) 10 CONTINUE ELSE CALL INFOG2L( K2, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 20 I = K2, K1, -1 IP = IPIV( IIA+I-K1 ) IF( IP.NE.I ) $ CALL PZSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP, $ JA, DESCA, DESCA( M_ ) ) 20 CONTINUE END IF ELSE IF( LSAME( DIREC, 'F' ) ) THEN CALL INFOG2L( IA, K1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 30 J = K1, K2 JP = IPIV( JJA+J-K1 ) IF( JP.NE.J ) $ CALL PZSWAP( N, A, IA, J, DESCA, 1, A, IA, JP, $ DESCA, 1 ) 30 CONTINUE ELSE CALL INFOG2L( IA, K2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, ICURROW, ICURCOL ) DO 40 J = K2, K1, -1 JP = IPIV( JJA+J-K1 ) IF( JP.NE.J ) $ CALL PZSWAP( N, A, IA, J, DESCA, 1, A, IA, JP, $ DESCA, 1 ) 40 CONTINUE END IF END IF * RETURN * * End PZLASWP * END scalapack-2.0.2/SRC/pzlatra.f000644 000766 000024 00000015450 10363532303 016167 0ustar00juliestaff000000 000000 COMPLEX*16 FUNCTION PZLATRA( N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLATRA computes the trace of an N-by-N distributed matrix sub( A ) * denoting A( IA:IA+N-1, JA:JA+N-1 ). The result is left on every * process of the grid. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the distributed matrix the trace * is to be computed. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ==================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER ICURCOL, ICURROW, II, IOFFA, J, JB, JJ, JN, $ LDA, LL, MYCOL, MYROW, NPCOL, NPROW COMPLEX*16 TRACE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, ZGSUM2D * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * TRACE = ZERO IF( N.EQ.0 ) THEN PZLATRA = TRACE RETURN END IF * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, $ ICURROW, ICURCOL ) * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) JB = JN-JA+1 LDA = DESCA( LLD_ ) IOFFA = II + ( JJ - 1 ) * LDA * * Handle first diagonal block separately * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 10 LL = IOFFA, IOFFA + (JB-1)*(LDA+1), LDA+1 TRACE = TRACE + A( LL ) 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JB IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JB*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over the remaining block of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN DO 20 LL = IOFFA, IOFFA + (JB-1)*(LDA+1), LDA+1 TRACE = TRACE + A( LL ) 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JB IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JB*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE * CALL ZGSUM2D( DESCA( CTXT_ ), 'All', ' ', 1, 1, TRACE, 1, -1, $ MYCOL ) * PZLATRA = TRACE * RETURN * * End of PZLATRA * END scalapack-2.0.2/SRC/pzlatrd.f000644 000766 000024 00000044275 10363532303 016201 0ustar00juliestaff000000 000000 SUBROUTINE PZLATRD( UPLO, N, NB, A, IA, JA, DESCA, D, E, TAU, W, $ IW, JW, DESCW, WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IW, JA, JW, N, NB * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCW( * ) DOUBLE PRECISION D( * ), E( * ) COMPLEX*16 A( * ), TAU( * ), W( * ), WORK( * ) * .. * * Purpose * ======= * * PZLATRD reduces NB rows and columns of a complex Hermitian * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) to complex * tridiagonal form by an unitary similarity transformation * Q' * sub( A ) * Q, and returns the matrices V and W which are * needed to apply the transformation to the unreduced part of sub( A ). * * If UPLO = 'U', PZLATRD reduces the last NB rows and columns of a * matrix, of which the upper triangle is supplied; * if UPLO = 'L', PZLATRD reduces the first NB rows and columns of a * matrix, of which the lower triangle is supplied. * * This is an auxiliary routine called by PZHETRD. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NB (global input) INTEGER * The number of rows and columns to be reduced. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * Hermitian distributed matrix sub( A ). If UPLO = 'U', the * leading N-by-N upper triangular part of sub( A ) contains * the upper triangular part of the matrix, and its strictly * lower triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of sub( A ) contains the * lower triangular part of the matrix, and its strictly upper * triangular part is not referenced. * On exit, if UPLO = 'U', the last NB columns have been reduced * to tridiagonal form, with the diagonal elements overwriting * the diagonal elements of sub( A ); the elements above the * diagonal with the array TAU, represent the unitary matrix Q * as a product of elementary reflectors. If UPLO = 'L', the * first NB columns have been reduced to tridiagonal form, with * the diagonal elements overwriting the diagonal elements of * sub( A ); the elements below the diagonal with the array TAU, * represent the unitary matrix Q as a product of elementary * reflectors; See Further Details. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * D (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * The diagonal elements of the tridiagonal matrix T: * D(i) = A(i,i). D is tied to the distributed matrix A. * * E (local output) DOUBLE PRECISION array, dimension LOCc(JA+N-1) * if UPLO = 'U', LOCc(JA+N-2) otherwise. The off-diagonal * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the * distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension * LOCc(JA+N-1). This array contains the scalar factors TAU of * the elementary reflectors. TAU is tied to the distributed * matrix A. * * W (local output) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_W,NB_W), This array contains * the local pieces of the N-by-NB_W matrix W required to * update the unreduced part of sub( A ). * * IW (global input) INTEGER * The row index in the global array W indicating the first * row of sub( W ). * * JW (global input) INTEGER * The column index in the global array W indicating the * first column of sub( W ). * * DESCW (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix W. * * WORK (local workspace) COMPLEX*16 array, dimension (NB_A) * * Further Details * =============== * * If UPLO = 'U', the matrix Q is represented as a product of elementary * reflectors * * Q = H(n) H(n-1) . . . H(n-nb+1). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). * * If UPLO = 'L', the matrix Q is represented as a product of elementary * reflectors * * Q = H(1) H(2) . . . H(nb). * * Each H(i) has the form * * H(i) = I - tau * v * v' * * where tau is a complex scalar, and v is a complex vector with * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). * * The elements of the vectors v together form the N-by-NB matrix V * which is needed, with W, to apply the transformation to the unreduced * part of the matrix, using a Hermitian rank-2k update of the form: * sub( A ) := sub( A ) - V*W' - W*V'. * * The contents of A on exit are illustrated by the following examples * with n = 5 and nb = 2: * * if UPLO = 'U': if UPLO = 'L': * * ( a a a v4 v5 ) ( d ) * ( a a v4 v5 ) ( 1 d ) * ( a 1 v5 ) ( v1 1 a ) * ( d 1 ) ( v1 v2 a a ) * ( d ) ( v1 v2 a a a ) * * where d denotes a diagonal element of the reduced matrix, a denotes * an element of the original matrix that is unchanged, and vi denotes * an element of the vector defining H(i). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 HALF, ONE, ZERO PARAMETER ( HALF = ( 0.5D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, ICTXT, II, J, JJ, JP, JWK, K, $ KW, MYCOL, MYROW, NPCOL, NPROW, NQ COMPLEX*16 AII, ALPHA, BETA * .. * .. Local Arrays .. INTEGER DESCD( DLEN_ ), DESCE( DLEN_ ), DESCWK( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, DGEBR2D, DGEBS2D, $ INFOG2L, PDELSET, PZAXPY, PZDOTC, $ PZELGET, PZELSET, PZGEMV, PZHEMV, $ PZLACGV, PZLARFG, PZSCAL * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NQ = MAX( 1, NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) CALL DESCSET( DESCD, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) AII = ZERO BETA = ZERO * IF( LSAME( UPLO, 'U' ) ) THEN * CALL INFOG2L( N+IA-NB, N+JA-NB, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, II, JJ, IAROW, IACOL ) CALL DESCSET( DESCWK, 1, DESCW( NB_ ), 1, DESCW( NB_ ), IAROW, $ IACOL, ICTXT, 1 ) CALL DESCSET( DESCE, 1, JA+N-1, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * * Reduce last NB columns of upper triangle * DO 10 J = JA+N-1, JA+N-NB, -1 I = IA + J - JA K = J - JA + 1 KW = MOD( K-1, DESCA( MB_ ) ) + 1 * * Update A(IA:I,I) * CALL PZELGET( 'E', ' ', AII, A, I, J, DESCA ) CALL PZELSET( A, I, J, DESCA, DCMPLX( DBLE( AII ) ) ) CALL PZLACGV( N-K, W, IW+K-1, JW+KW, DESCW, DESCW( M_ ) ) CALL PZGEMV( 'No transpose', K, N-K, -ONE, A, IA, J+1, $ DESCA, W, IW+K-1, JW+KW, DESCW, DESCW( M_ ), $ ONE, A, IA, J, DESCA, 1 ) CALL PZLACGV( N-K, W, IW+K-1, JW+KW, DESCW, DESCW( M_ ) ) CALL PZLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PZGEMV( 'No transpose', K, N-K, -ONE, W, IW, JW+KW, $ DESCW, A, I, J+1, DESCA, DESCA( M_ ), ONE, A, $ IA, J, DESCA, 1 ) CALL PZLACGV( N-K, A, I, J+1, DESCA, DESCA( M_ ) ) CALL PZELGET( 'E', ' ', AII, A, I, J, DESCA ) CALL PZELSET( A, I, J, DESCA, DCMPLX( DBLE( AII ) ) ) IF( N-K.GT.0 ) $ CALL PZELSET( A, I, J+1, DESCA, DCMPLX( E( JP ) ) ) * * Generate elementary reflector H(i) to annihilate * A(IA:I-2,I) * JP = MIN( JJ+KW-1, NQ ) CALL PZLARFG( K-1, BETA, I-1, J, A, IA, J, DESCA, 1, $ TAU ) CALL PDELSET( E, 1, J, DESCE, DBLE( BETA ) ) CALL PZELSET( A, I-1, J, DESCA, ONE ) * * Compute W(IW:IW+K-2,JW+KW-1) * CALL PZHEMV( 'Upper', K-1, ONE, A, IA, JA, DESCA, A, IA, J, $ DESCA, 1, ZERO, W, IW, JW+KW-1, DESCW, 1 ) * JWK = MOD( K-1, DESCWK( NB_ ) ) + 2 CALL PZGEMV( 'Conjugate transpose', K-1, N-K, ONE, W, IW, $ JW+KW, DESCW, A, IA, J, DESCA, 1, ZERO, WORK, $ 1, JWK, DESCWK, DESCWK( M_ ) ) CALL PZGEMV( 'No transpose', K-1, N-K, -ONE, A, IA, J+1, $ DESCA, WORK, 1, JWK, DESCWK, DESCWK( M_ ), ONE, $ W, IW, JW+KW-1, DESCW, 1 ) CALL PZGEMV( 'Conjugate transpose', K-1, N-K, ONE, A, IA, $ J+1, DESCA, A, IA, J, DESCA, 1, ZERO, WORK, 1, $ JWK, DESCWK, DESCWK( M_ ) ) CALL PZGEMV( 'No transpose', K-1, N-K, -ONE, W, IW, JW+KW, $ DESCW, WORK, 1, JWK, DESCWK, DESCWK( M_ ), ONE, $ W, IW, JW+KW-1, DESCW, 1 ) CALL PZSCAL( K-1, TAU( JP ), W, IW, JW+KW-1, DESCW, 1 ) * CALL PZDOTC( K-1, ALPHA, W, IW, JW+KW-1, DESCW, 1, A, IA, J, $ DESCA, 1 ) IF( MYCOL.EQ.IACOL ) $ ALPHA = -HALF*TAU( JP )*ALPHA CALL PZAXPY( K-1, ALPHA, A, IA, J, DESCA, 1, W, IW, JW+KW-1, $ DESCW, 1 ) CALL PZELGET( 'E', ' ', BETA, A, I, J, DESCA ) CALL PDELSET( D, 1, J, DESCD, DBLE( BETA ) ) * 10 CONTINUE * ELSE * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL DESCSET( DESCWK, 1, DESCW( NB_ ), 1, DESCW( NB_ ), IAROW, $ IACOL, ICTXT, 1 ) CALL DESCSET( DESCE, 1, JA+N-2, 1, DESCA( NB_ ), MYROW, $ DESCA( CSRC_ ), DESCA( CTXT_ ), 1 ) * * Reduce first NB columns of lower triangle * DO 20 J = JA, JA+NB-1 I = IA + J - JA K = J - JA + 1 * * Update A(J:JA+N-1,J) * CALL PZELGET( 'E', ' ', AII, A, I, J, DESCA ) CALL PZELSET( A, I, J, DESCA, DCMPLX( DBLE( AII ) ) ) CALL PZLACGV( K-1, W, IW+K-1, JW, DESCW, DESCW( M_ ) ) CALL PZGEMV( 'No transpose', N-K+1, K-1, -ONE, A, I, JA, $ DESCA, W, IW+K-1, JW, DESCW, DESCW( M_ ), ONE, $ A, I, J, DESCA, 1 ) CALL PZLACGV( K-1, W, IW+K-1, JW, DESCW, DESCW( M_ ) ) CALL PZLACGV( K-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZGEMV( 'No transpose', N-K+1, K-1, -ONE, W, IW+K-1, $ JW, DESCW, A, I, JA, DESCA, DESCA( M_ ), ONE, $ A, I, J, DESCA, 1 ) CALL PZLACGV( K-1, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZELGET( 'E', ' ', AII, A, I, J, DESCA ) CALL PZELSET( A, I, J, DESCA, DCMPLX( DBLE( AII ) ) ) IF( K.GT.1 ) $ CALL PZELSET( A, I, J-1, DESCA, DCMPLX( E( JP ) ) ) * * * Generate elementary reflector H(i) to annihilate * A(I+2:IA+N-1,I) * JP = MIN( JJ+K-1, NQ ) CALL PZLARFG( N-K, BETA, I+1, J, A, I+2, J, DESCA, 1, $ TAU ) CALL PDELSET( E, 1, J, DESCE, DBLE( BETA ) ) CALL PZELSET( A, I+1, J, DESCA, ONE ) * * Compute W(IW+K:IW+N-1,JW+K-1) * CALL PZHEMV( 'Lower', N-K, ONE, A, I+1, J+1, DESCA, A, I+1, $ J, DESCA, 1, ZERO, W, IW+K, JW+K-1, DESCW, 1 ) * CALL PZGEMV( 'Conjugate Transpose', N-K, K-1, ONE, W, IW+K, $ JW, DESCW, A, I+1, J, DESCA, 1, ZERO, WORK, 1, $ 1, DESCWK, DESCWK( M_ ) ) CALL PZGEMV( 'No transpose', N-K, K-1, -ONE, A, I+1, JA, $ DESCA, WORK, 1, 1, DESCWK, DESCWK( M_ ), ONE, W, $ IW+K, JW+K-1, DESCW, 1 ) CALL PZGEMV( 'Conjugate transpose', N-K, K-1, ONE, A, I+1, $ JA, DESCA, A, I+1, J, DESCA, 1, ZERO, WORK, 1, $ 1, DESCWK, DESCWK( M_ ) ) CALL PZGEMV( 'No transpose', N-K, K-1, -ONE, W, IW+K, JW, $ DESCW, WORK, 1, 1, DESCWK, DESCWK( M_ ), ONE, W, $ IW+K, JW+K-1, DESCW, 1 ) CALL PZSCAL( N-K, TAU( JP ), W, IW+K, JW+K-1, DESCW, 1 ) CALL PZDOTC( N-K, ALPHA, W, IW+K, JW+K-1, DESCW, 1, A, I+1, $ J, DESCA, 1 ) IF( MYCOL.EQ.IACOL ) $ ALPHA = -HALF*TAU( JP )*ALPHA CALL PZAXPY( N-K, ALPHA, A, I+1, J, DESCA, 1, W, IW+K, $ JW+K-1, DESCW, 1 ) CALL PZELGET( 'E', ' ', BETA, A, I, J, DESCA ) CALL PDELSET( D, 1, J, DESCD, DBLE( BETA ) ) * 20 CONTINUE * END IF * * Broadcast columnwise the diagonal elements into D. * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN CALL DGEBS2D( ICTXT, 'Columnwise', ' ', 1, NB, D( JJ ), 1 ) ELSE CALL DGEBR2D( ICTXT, 'Columnwise', ' ', 1, NB, D( JJ ), 1, $ IAROW, MYCOL ) END IF END IF * RETURN * * End of PZLATRD * END scalapack-2.0.2/SRC/pzlatrs.f000644 000766 000024 00000005313 10363532303 016206 0ustar00juliestaff000000 000000 SUBROUTINE PZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, IA, $ JA, DESCA, X, IX, JX, DESCX, SCALE, CNORM, $ WORK ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, NORMIN, TRANS, UPLO INTEGER IA, IX, JA, JX, N DOUBLE PRECISION SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION CNORM( * ) COMPLEX*16 A( * ), X( * ), WORK( * ) * .. * * Purpose * ======= * * PZLATRS solves a triangular system. This routine in unfinished * at this time, but will be part of the next release. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * * .. Local Scalars .. INTEGER ICTXT, IIX, IROFF, JJX, MYCOL, MYROW, NP, $ NPCOL, NPROW, LDX, IXCOL, IXROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, ZGEBR2D, ZGEBS2D, INFOG2L, $ PZTRSV * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * ***** NO SCALING ***** Call PZTRSV for all cases ***** * SCALE = ONE CALL PZTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, $ DESCX, 1 ) * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) LDX = DESCX( LLD_ ) IROFF = MOD( IX-1, DESCX(MB_) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IXCOL ) THEN CALL ZGEBS2D( ICTXT, 'R', ' ', NP, 1, X( IIX+(JJX-1)*LDX ), $ LDX ) ELSE CALL ZGEBR2D( ICTXT, 'R', ' ', NP, 1, X( IIX+(JJX-1)*LDX ), $ LDX, MYROW, IXCOL ) END IF * RETURN * * End of PZLATRS * END scalapack-2.0.2/SRC/pzlatrz.f000644 000766 000024 00000022727 10363532303 016225 0ustar00juliestaff000000 000000 SUBROUTINE PZLATRZ( M, N, L, A, IA, JA, DESCA, TAU, WORK ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER IA, JA, L, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZLATRZ reduces the M-by-N ( M<=N ) complex upper trapezoidal * matrix sub( A ) = [A(IA:IA+M-1,JA:JA+M-1) A(IA:IA+M-1,JA+N-L:JA+N-1)] * to upper triangular form by means of unitary transformations. * * The upper trapezoidal matrix sub( A ) is factored as * * sub( A ) = ( R 0 ) * Z, * * where Z is an N-by-N unitary matrix and R is an M-by-M upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. L > 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the leading M-by-M * upper triangular part of sub( A ) contains the upper trian- * gular matrix R, and elements N-L+1 to N of the first M rows * of sub( A ), with the array TAU, represent the unitary matrix * Z as a product of M elementary reflectors. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK) * LWORK >= Nq0 + MAX( 1, Mp0 ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * Further Details * =============== * * The factorization is obtained by Householder's method. The kth * transformation matrix, Z( k ), whose conjugate transpose is used to * introduce zeros into the (m - k + 1)th row of sub( A ), is given in * the form * * Z( k ) = ( I 0 ), * ( 0 T( k ) ) * * where * * T( k ) = I - tau*u( k )*u( k )', u( k ) = ( 1 ), * ( 0 ) * ( z( k ) ) * * tau is a scalar and z( k ) is an ( n - m ) element vector. * tau and z( k ) are chosen to annihilate the elements of the kth row * of sub( A ). * * The scalar tau is returned in the kth element of TAU and the vector * u( k ) in the kth row of sub( A ), such that the elements of z( k ) * are in a( k, m + 1 ), ..., a( k, n ). The elements of R are returned * in the upper triangular part of sub( A ). * * Z is given by * * Z = Z( 1 ) * Z( 2 ) * ... * Z( m ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IAROW, ICTXT, II, J, J1, MP, MYCOL, MYROW, $ NPCOL, NPROW COMPLEX*16 AII * .. * .. Local Arrays .. INTEGER DESCTAU( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL DESCSET, INFOG1L, PZELSET, PZLACGV, $ PZLARFG, PZLARZ * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX * .. * .. Executable Statements .. * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * MP = NUMROC( IA+M-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) * CALL DESCSET( DESCTAU, DESCA( M_ ), 1, DESCA( MB_ ), 1, $ DESCA( RSRC_ ), MYCOL, ICTXT, MAX( 1, MP ) ) * IF( M.EQ.N ) THEN * CALL INFOG1L( IA, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ II, IAROW ) DO 10 I = II, MP TAU( I ) = ZERO 10 CONTINUE * ELSE * AII = ZERO * J1 = JA + N - L DO 20 I = IA+M-1, IA, -1 J = JA + I - IA * * Generate elementary reflector H(i) to annihilate * [ A(i, j) A(i,j1:ja+n-1) ] * CALL PZLACGV( 1, A, I, J, DESCA, DESCA( M_ ) ) CALL PZLACGV( L, A, I, J1, DESCA, DESCA( M_ ) ) CALL PZLARFG( L+1, AII, I, J, A, I, J1, DESCA, DESCA( M_ ), $ TAU ) * * Apply H(i) to A(ia:i-1,j:ja+n-1) from the right * CALL PZLARZ( 'Right', I-IA, JA+N-J, L, A, I, J1, DESCA, $ DESCA( M_ ), TAU, A, IA, J, DESCA, WORK ) CALL PZELSET( A, I, J, DESCA, DCONJG( AII ) ) * 20 CONTINUE * CALL PZLACGV( M, TAU, IA, 1, DESCTAU, 1 ) * END IF * RETURN * * End of PZLATRZ * END scalapack-2.0.2/SRC/pzlattrs.f000644 000766 000024 00000130322 10363532303 016371 0ustar00juliestaff000000 000000 SUBROUTINE PZLATTRS( UPLO, TRANS, DIAG, NORMIN, N, A, IA, JA, $ DESCA, X, IX, JX, DESCX, SCALE, CNORM, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. CHARACTER DIAG, NORMIN, TRANS, UPLO INTEGER IA, INFO, IX, JA, JX, N DOUBLE PRECISION SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION CNORM( * ) COMPLEX*16 A( * ), X( * ) * .. * * Purpose * ======= * * PZLATTRS solves one of the triangular systems * * A * x = s*b, A**T * x = s*b, or A**H * x = s*b, * * with scaling to prevent overflow. Here A is an upper or lower * triangular matrix, A**T denotes the transpose of A, A**H denotes the * conjugate transpose of A, x and b are n-element vectors, and s is a * scaling factor, usually less than or equal to 1, chosen so that the * components of x will be less than the overflow threshold. If the * unscaled problem will not cause overflow, the Level 2 PBLAS routine * PZTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j) * then s is set to 0 and a non-trivial solution to A*x = 0 is returned. * * This is very slow relative to PZTRSV. This should only be used * when scaling is necessary to control overflow, or when it is modified * to scale better. * Notes * * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the matrix A is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * TRANS (global input) CHARACTER*1 * Specifies the operation applied to A. * = 'N': Solve A * x = s*b (No transpose) * = 'T': Solve A**T * x = s*b (Transpose) * = 'C': Solve A**H * x = s*b (Conjugate transpose) * * DIAG (global input) CHARACTER*1 * Specifies whether or not the matrix A is unit triangular. * = 'N': Non-unit triangular * = 'U': Unit triangular * * NORMIN (global input) CHARACTER*1 * Specifies whether CNORM has been set or not. * = 'Y': CNORM contains the column norms on entry * = 'N': CNORM is not set on entry. On exit, the norms will * be computed and stored in CNORM. * * N (global input) INTEGER * The order of the matrix A. N >= 0. * * A (local input) COMPLEX*16 array, dimension (DESCA(LLD_),*) * The triangular matrix A. If UPLO = 'U', the leading n by n * upper triangular part of the array A contains the upper * triangular matrix, and the strictly lower triangular part of * A is not referenced. If UPLO = 'L', the leading n by n lower * triangular part of the array A contains the lower triangular * matrix, and the strictly upper triangular part of A is not * referenced. If DIAG = 'U', the diagonal elements of A are * also not referenced and are assumed to be 1. * * IA (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix A to operate on. * * JA (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix A to operate on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input/output) COMPLEX*16 array, * dimension (DESCX(LLD_),*) * On entry, the right hand side b of the triangular system. * On exit, X is overwritten by the solution vector x. * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * SCALE (global output) DOUBLE PRECISION * The scaling factor s for the triangular system * A * x = s*b, A**T * x = s*b, or A**H * x = s*b. * If SCALE = 0, the matrix A is singular or badly scaled, and * the vector x is an exact or approximate solution to A*x = 0. * * CNORM (global input or global output) DOUBLE PRECISION array, * dimension (N) * If NORMIN = 'Y', CNORM is an input argument and CNORM(j) * contains the norm of the off-diagonal part of the j-th column * of A. If TRANS = 'N', CNORM(j) must be greater than or equal * to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) * must be greater than or equal to the 1-norm. * * If NORMIN = 'N', CNORM is an output argument and CNORM(j) * returns the 1-norm of the offdiagonal part of the j-th column * of A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * Further Details * ======= ======= * * A rough bound on x is computed; if that is less than overflow, PZTRSV * is called, otherwise, specific code is used which checks for possible * overflow or divide-by-zero at every operation. * * A columnwise scheme is used for solving A*x = b. The basic algorithm * if A is lower triangular is * * x[1:n] := b[1:n] * for j = 1, ..., n * x(j) := x(j) / A(j,j) * x[j+1:n] := x[j+1:n] - x(j) * A[j+1:n,j] * end * * Define bounds on the components of x after j iterations of the loop: * M(j) = bound on x[1:j] * G(j) = bound on x[j+1:n] * Initially, let M(0) = 0 and G(0) = max{x(i), i=1,...,n}. * * Then for iteration j+1 we have * M(j+1) <= G(j) / | A(j+1,j+1) | * G(j+1) <= G(j) + M(j+1) * | A[j+2:n,j+1] | * <= G(j) ( 1 + CNORM(j+1) / | A(j+1,j+1) | ) * * where CNORM(j+1) is greater than or equal to the infinity-norm of * column j+1 of A, not counting the diagonal. Hence * * G(j) <= G(0) product ( 1 + CNORM(i) / | A(i,i) | ) * 1<=i<=j * and * * |x(j)| <= ( G(0) / |A(j,j)| ) product ( 1 + CNORM(i) / |A(i,i)| ) * 1<=i< j * * Since |x(j)| <= M(j), we use the Level 2 PBLAS routine PZTRSV if the * reciprocal of the largest M(j), j=1,..,n, is larger than * max(underflow, 1/overflow). * * The bound on x(j) is also used to determine when a step in the * columnwise method can be performed without fear of overflow. If * the computed bound is greater than a large constant, x is scaled to * prevent overflow, but if the bound overflows, x is set to 0, x(j) to * 1, and scale to 0, and a non-trivial solution to A*x = 0 is found. * * Similarly, a row-wise scheme is used to solve A**T *x = b or * A**H *x = b. The basic algorithm for A upper triangular is * * for j = 1, ..., n * x(j) := ( b(j) - A[1:j-1,j]' * x[1:j-1] ) / A(j,j) * end * * We simultaneously compute two bounds * G(j) = bound on ( b(i) - A[1:i-1,i]' * x[1:i-1] ), 1<=i<=j * M(j) = bound on x(i), 1<=i<=j * * The initial values are G(0) = 0, M(0) = max{b(i), i=1,..,n}, and we * add the constraint G(j) >= G(j-1) and M(j) >= M(j-1) for j >= 1. * Then the bound on x(j) is * * M(j) <= M(j-1) * ( 1 + CNORM(j) ) / | A(j,j) | * * <= M(0) * product ( ( 1 + CNORM(i) ) / |A(i,i)| ) * 1<=i<=j * * and we can safely call PZTRSV if 1/M(n) and 1/G(n) are both greater * than max(underflow, 1/overflow). * * Last modified by: Mark R. Fahey, August 2000 * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, $ TWO = 2.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL NOTRAN, NOUNIT, UPPER INTEGER CONTXT, CSRC, I, ICOL, ICOLX, IMAX, IROW, $ IROWX, ITMP1, ITMP1X, ITMP2, ITMP2X, J, JFIRST, $ JINC, JLAST, LDA, LDX, MB, MYCOL, MYROW, NB, $ NPCOL, NPROW, RSRC DOUBLE PRECISION BIGNUM, GROW, REC, SMLNUM, TJJ, TMAX, TSCAL, $ XBND, XJ, XMAX COMPLEX*16 CSUMJ, TJJS, USCAL, XJTMP, ZDUM * .. * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX DOUBLE PRECISION PDLAMCH COMPLEX*16 ZLADIV EXTERNAL LSAME, IDAMAX, PDLAMCH, ZLADIV * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGSUM2D, DSCAL, INFOG2L, $ PDLABAD, PDZASUM, PXERBLA, PZAMAX, PZAXPY, $ PZDOTC, PZDOTU, PZDSCAL, PZLASET, PZSCAL, $ PZTRSV, ZGEBR2D, ZGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN * .. * .. Statement Functions .. DOUBLE PRECISION CABS1, CABS2 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) CABS2( ZDUM ) = ABS( DBLE( ZDUM ) / 2.D0 ) + $ ABS( DIMAG( ZDUM ) / 2.D0 ) * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) NOUNIT = LSAME( DIAG, 'N' ) * CONTXT = DESCA( CTXT_ ) RSRC = DESCA( RSRC_ ) CSRC = DESCA( CSRC_ ) MB = DESCA( MB_ ) NB = DESCA( NB_ ) LDA = DESCA( LLD_ ) LDX = DESCX( LLD_ ) * * Test the input parameters. * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. $ LSAME( NORMIN, 'N' ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -5 END IF * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( INFO.NE.0 ) THEN CALL PXERBLA( CONTXT, 'PZLATTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * SMLNUM = PDLAMCH( CONTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM CALL PDLABAD( CONTXT, SMLNUM, BIGNUM ) SMLNUM = SMLNUM / PDLAMCH( CONTXT, 'Precision' ) BIGNUM = ONE / SMLNUM SCALE = ONE * * IF( LSAME( NORMIN, 'N' ) ) THEN * * Compute the 1-norm of each column, not including the diagonal. * IF( UPPER ) THEN * * A is upper triangular. * CNORM( 1 ) = ZERO DO 10 J = 2, N CALL PDZASUM( J-1, CNORM( J ), A, IA, JA+J-1, DESCA, 1 ) 10 CONTINUE ELSE * * A is lower triangular. * DO 20 J = 1, N - 1 CALL PDZASUM( N-J, CNORM( J ), A, IA+J, JA+J-1, DESCA, $ 1 ) 20 CONTINUE CNORM( N ) = ZERO END IF CALL DGSUM2D( CONTXT, 'Row', ' ', N, 1, CNORM, 1, -1, -1 ) END IF * * Scale the column norms by TSCAL if the maximum element in CNORM is * greater than BIGNUM/2. * IMAX = IDAMAX( N, CNORM, 1 ) TMAX = CNORM( IMAX ) IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE TSCAL = HALF / ( SMLNUM*TMAX ) CALL DSCAL( N, TSCAL, CNORM, 1 ) END IF * * Compute a bound on the computed solution vector to see if the * Level 2 PBLAS routine PZTRSV can be used. * XMAX = ZERO CALL PZAMAX( N, ZDUM, IMAX, X, IX, JX, DESCX, 1 ) XMAX = CABS2( ZDUM ) CALL DGSUM2D( CONTXT, 'Row', ' ', 1, 1, XMAX, 1, -1, -1 ) XBND = XMAX * IF( NOTRAN ) THEN * * Compute the growth in A * x = b. * IF( UPPER ) THEN JFIRST = N JLAST = 1 JINC = -1 ELSE JFIRST = 1 JLAST = N JINC = 1 END IF * IF( TSCAL.NE.ONE ) THEN GROW = ZERO GO TO 50 END IF * IF( NOUNIT ) THEN * * A is non-unit triangular. * * Compute GROW = 1/G(j) and XBND = 1/M(j). * Initially, G(0) = max{x(i), i=1,...,n}. * GROW = HALF / MAX( XBND, SMLNUM ) XBND = GROW DO 30 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 50 * * TJJS = A( J, J ) CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN TJJS = A( ( ICOL-1 )*LDA+IROW ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF TJJ = CABS1( TJJS ) * IF( TJJ.GE.SMLNUM ) THEN * * M(j) = G(j-1) / abs(A(j,j)) * XBND = MIN( XBND, MIN( ONE, TJJ )*GROW ) ELSE * * M(j) could overflow, set XBND to 0. * XBND = ZERO END IF * IF( TJJ+CNORM( J ).GE.SMLNUM ) THEN * * G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) * GROW = GROW*( TJJ / ( TJJ+CNORM( J ) ) ) ELSE * * G(j) could overflow, set GROW to 0. * GROW = ZERO END IF 30 CONTINUE GROW = XBND ELSE * * A is unit triangular. * * Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. * GROW = MIN( ONE, HALF / MAX( XBND, SMLNUM ) ) DO 40 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 50 * * G(j) = G(j-1)*( 1 + CNORM(j) ) * GROW = GROW*( ONE / ( ONE+CNORM( J ) ) ) 40 CONTINUE END IF 50 CONTINUE * ELSE * * Compute the growth in A**T * x = b or A**H * x = b. * IF( UPPER ) THEN JFIRST = 1 JLAST = N JINC = 1 ELSE JFIRST = N JLAST = 1 JINC = -1 END IF * IF( TSCAL.NE.ONE ) THEN GROW = ZERO GO TO 80 END IF * IF( NOUNIT ) THEN * * A is non-unit triangular. * * Compute GROW = 1/G(j) and XBND = 1/M(j). * Initially, M(0) = max{x(i), i=1,...,n}. * GROW = HALF / MAX( XBND, SMLNUM ) XBND = GROW DO 60 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 80 * * G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) * XJ = ONE + CNORM( J ) GROW = MIN( GROW, XBND / XJ ) * * TJJS = A( J, J ) CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN TJJS = A( ( ICOL-1 )*LDA+IROW ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF TJJ = CABS1( TJJS ) * IF( TJJ.GE.SMLNUM ) THEN * * M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) * IF( XJ.GT.TJJ ) $ XBND = XBND*( TJJ / XJ ) ELSE * * M(j) could overflow, set XBND to 0. * XBND = ZERO END IF 60 CONTINUE GROW = MIN( GROW, XBND ) ELSE * * A is unit triangular. * * Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. * GROW = MIN( ONE, HALF / MAX( XBND, SMLNUM ) ) DO 70 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 80 * * G(j) = ( 1 + CNORM(j) )*G(j-1) * XJ = ONE + CNORM( J ) GROW = GROW / XJ 70 CONTINUE END IF 80 CONTINUE END IF * IF( ( GROW*TSCAL ).GT.SMLNUM ) THEN * * Use the Level 2 PBLAS solve if the reciprocal of the bound on * elements of X is not too small. * CALL PZTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, $ DESCX, 1 ) ELSE * * Use a Level 1 PBLAS solve, scaling intermediate results. * IF( XMAX.GT.BIGNUM*HALF ) THEN * * Scale X so that its components are less than or equal to * BIGNUM in absolute value. * SCALE = ( BIGNUM*HALF ) / XMAX CALL PZDSCAL( N, SCALE, X, IX, JX, DESCX, 1 ) XMAX = BIGNUM ELSE XMAX = XMAX*TWO END IF * IF( NOTRAN ) THEN * * Solve A * x = b * DO 100 J = JFIRST, JLAST, JINC * * Compute x(j) = b(j) / A(j,j), scaling x if necessary. * * XJ = CABS1( X( J ) ) CALL INFOG2L( IX+J-1, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, IROWX, ICOLX, ITMP1X, ITMP2X ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) THEN XJTMP = X( IROWX ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1, $ ITMP1X, ITMP2X ) END IF XJ = CABS1( XJTMP ) IF( NOUNIT ) THEN * TJJS = A( J, J )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN TJJS = A( ( ICOL-1 )*LDA+IROW )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL IF( TSCAL.EQ.ONE ) $ GO TO 90 END IF TJJ = CABS1( TJJS ) IF( TJJ.GT.SMLNUM ) THEN * * abs(A(j,j)) > SMLNUM: * IF( TJJ.LT.ONE ) THEN IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by 1/b(j). * REC = ONE / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * X( J ) = ZLADIV( X( J ), TJJS ) * XJ = CABS1( X( J ) ) XJTMP = ZLADIV( XJTMP, TJJS ) XJ = CABS1( XJTMP ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF ELSE IF( TJJ.GT.ZERO ) THEN * * 0 < abs(A(j,j)) <= SMLNUM: * IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM * to avoid overflow when dividing by A(j,j). * REC = ( TJJ*BIGNUM ) / XJ IF( CNORM( J ).GT.ONE ) THEN * * Scale by 1/CNORM(j) to avoid overflow when * multiplying x(j) times column j. * REC = REC / CNORM( J ) END IF CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF * X( J ) = ZLADIV( X( J ), TJJS ) * XJ = CABS1( X( J ) ) XJTMP = ZLADIV( XJTMP, TJJS ) XJ = CABS1( XJTMP ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF ELSE * * A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and * scale = 0, and compute a solution to A*x = 0. * CALL PZLASET( ' ', N, 1, CZERO, CZERO, X, IX, JX, $ DESCX ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = CONE END IF XJTMP = CONE XJ = ONE SCALE = ZERO XMAX = ZERO END IF 90 CONTINUE * * Scale x if necessary to avoid overflow when adding a * multiple of column j of A. * IF( XJ.GT.ONE ) THEN REC = ONE / XJ IF( CNORM( J ).GT.( BIGNUM-XMAX )*REC ) THEN * * Scale x by 1/(2*abs(x(j))). * REC = REC*HALF CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC END IF ELSE IF( XJ*CNORM( J ).GT.( BIGNUM-XMAX ) ) THEN * * Scale x by 1/2. * CALL PZDSCAL( N, HALF, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*HALF SCALE = SCALE*HALF END IF * IF( UPPER ) THEN IF( J.GT.1 ) THEN * * Compute the update * x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) * ZDUM = -XJTMP*TSCAL CALL PZAXPY( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1, X, $ IX, JX, DESCX, 1 ) CALL PZAMAX( J-1, ZDUM, IMAX, X, IX, JX, DESCX, 1 ) XMAX = CABS1( ZDUM ) CALL DGSUM2D( CONTXT, 'Row', ' ', 1, 1, XMAX, 1, $ -1, -1 ) END IF ELSE IF( J.LT.N ) THEN * * Compute the update * x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) * ZDUM = -XJTMP*TSCAL CALL PZAXPY( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) CALL PZAMAX( N-J, ZDUM, I, X, IX+J, JX, DESCX, 1 ) XMAX = CABS1( ZDUM ) CALL DGSUM2D( CONTXT, 'Row', ' ', 1, 1, XMAX, 1, $ -1, -1 ) END IF END IF 100 CONTINUE * ELSE IF( LSAME( TRANS, 'T' ) ) THEN * * Solve A**T * x = b * DO 120 J = JFIRST, JLAST, JINC * * Compute x(j) = b(j) - sum A(k,j)*x(k). * k<>j * * XJ = CABS1( X( J ) ) CALL INFOG2L( IX+J-1, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, IROWX, ICOLX, ITMP1X, ITMP2X ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) THEN XJTMP = X( IROWX ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1, $ ITMP1X, ITMP2X ) END IF XJ = CABS1( XJTMP ) USCAL = DCMPLX( TSCAL ) REC = ONE / MAX( XMAX, ONE ) IF( CNORM( J ).GT.( BIGNUM-XJ )*REC ) THEN * * If x(j) could overflow, scale x by 1/(2*XMAX). * REC = REC*HALF IF( NOUNIT ) THEN * TJJS = A( J, J )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, $ ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = A( ( ICOL-1 )*LDA+IROW )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, $ 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL END IF TJJ = CABS1( TJJS ) IF( TJJ.GT.ONE ) THEN * * Divide by A(j,j) when scaling x if A(j,j) > 1. * REC = MIN( ONE, REC*TJJ ) USCAL = ZLADIV( USCAL, TJJS ) END IF IF( REC.LT.ONE ) THEN CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * CSUMJ = CZERO IF( USCAL.EQ.CONE ) THEN * * If the scaling needed for A in the dot product is 1, * call PZDOTU to perform the dot product. * IF( UPPER ) THEN CALL PZDOTU( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ELSE IF( J.LT.N ) THEN CALL PZDOTU( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF ELSE * * Otherwise, scale column of A by USCAL before dot * product. Below is not the best way to do it. * IF( UPPER ) THEN * DO 130 I = 1, J - 1 * CSUMJ = CSUMJ + ( A( I, J )*USCAL )*X( I ) * 130 CONTINUE ZDUM = DCONJG( USCAL ) CALL PZSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) CALL PZDOTU( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ZDUM = ZLADIV( ZDUM, USCAL ) CALL PZSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) ELSE IF( J.LT.N ) THEN * DO 140 I = J + 1, N * CSUMJ = CSUMJ + ( A( I, J )*USCAL )*X( I ) * 140 CONTINUE ZDUM = DCONJG( USCAL ) CALL PZSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) CALL PZDOTU( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) ZDUM = ZLADIV( ZDUM, USCAL ) CALL PZSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF END IF * IF( USCAL.EQ.DCMPLX( TSCAL ) ) THEN * * Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) * was not used to scale the dotproduct. * * X( J ) = X( J ) - CSUMJ * XJ = CABS1( X( J ) ) XJTMP = XJTMP - CSUMJ XJ = CABS1( XJTMP ) * IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) * $ X( IROWX ) = XJTMP IF( NOUNIT ) THEN * TJJS = A( J, J )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, $ ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = A( ( ICOL-1 )*LDA+IROW )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, $ 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL IF( TSCAL.EQ.ONE ) $ GO TO 110 END IF * * Compute x(j) = x(j) / A(j,j), scaling if necessary. * TJJ = CABS1( TJJS ) IF( TJJ.GT.SMLNUM ) THEN * * abs(A(j,j)) > SMLNUM: * IF( TJJ.LT.ONE ) THEN IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale X by 1/abs(x(j)). * REC = ONE / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * X( J ) = ZLADIV( X( J ), TJJS ) XJTMP = ZLADIV( XJTMP, TJJS ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF ELSE IF( TJJ.GT.ZERO ) THEN * * 0 < abs(A(j,j)) <= SMLNUM: * IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. * REC = ( TJJ*BIGNUM ) / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF * X( J ) = ZLADIV( X( J ), TJJS ) XJTMP = ZLADIV( XJTMP, TJJS ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF ELSE * * A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and * scale = 0 and compute a solution to A**T *x = 0. * CALL PZLASET( ' ', N, 1, CZERO, CZERO, X, IX, JX, $ DESCX ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = CONE END IF XJTMP = CONE SCALE = ZERO XMAX = ZERO END IF 110 CONTINUE ELSE * * Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot * product has already been divided by 1/A(j,j). * * X( J ) = ZLADIV( X( J ), TJJS ) - CSUMJ XJTMP = ZLADIV( XJTMP, TJJS ) - CSUMJ IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ THEN X( IROWX ) = XJTMP END IF END IF XMAX = MAX( XMAX, CABS1( XJTMP ) ) 120 CONTINUE * ELSE * * Solve A**H * x = b * DO 140 J = JFIRST, JLAST, JINC * * Compute x(j) = b(j) - sum A(k,j)*x(k). * k<>j * CALL INFOG2L( IX+J-1, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, IROWX, ICOLX, ITMP1X, ITMP2X ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) THEN XJTMP = X( IROWX ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1, $ ITMP1X, ITMP2X ) END IF XJ = CABS1( XJTMP ) USCAL = TSCAL REC = ONE / MAX( XMAX, ONE ) IF( CNORM( J ).GT.( BIGNUM-XJ )*REC ) THEN * * If x(j) could overflow, scale x by 1/(2*XMAX). * REC = REC*HALF IF( NOUNIT ) THEN * TJJS = DCONJG( A( J, J ) )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, $ ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = DCONJG( A( ( ICOL-1 )*LDA+IROW ) )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, $ 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL END IF TJJ = CABS1( TJJS ) IF( TJJ.GT.ONE ) THEN * * Divide by A(j,j) when scaling x if A(j,j) > 1. * REC = MIN( ONE, REC*TJJ ) USCAL = ZLADIV( USCAL, TJJS ) END IF IF( REC.LT.ONE ) THEN CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * CSUMJ = CZERO IF( USCAL.EQ.CONE ) THEN * * If the scaling needed for A in the dot product is 1, * call PZDOTC to perform the dot product. * IF( UPPER ) THEN CALL PZDOTC( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ELSE IF( J.LT.N ) THEN CALL PZDOTC( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF ELSE * * Otherwise, scale column of A by USCAL before dot * product. Below is not the best way to do it. * IF( UPPER ) THEN * DO 180 I = 1, J - 1 * CSUMJ = CSUMJ + ( DCONJG( A( I, J ) )*USCAL )* * $ X( I ) * 180 CONTINUE ZDUM = DCONJG( USCAL ) CALL PZSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) CALL PZDOTC( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ZDUM = ZLADIV( CONE, ZDUM ) CALL PZSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) ELSE IF( J.LT.N ) THEN * DO 190 I = J + 1, N * CSUMJ = CSUMJ + ( DCONJG( A( I, J ) )*USCAL )* * $ X( I ) * 190 CONTINUE ZDUM = DCONJG( USCAL ) CALL PZSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) CALL PZDOTC( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) ZDUM = ZLADIV( CONE, ZDUM ) CALL PZSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF END IF * IF( USCAL.EQ.DCMPLX( TSCAL ) ) THEN * * Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) * was not used to scale the dotproduct. * * X( J ) = X( J ) - CSUMJ * XJ = CABS1( X( J ) ) XJTMP = XJTMP - CSUMJ XJ = CABS1( XJTMP ) * IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) * $ X( IROWX ) = XJTMP IF( NOUNIT ) THEN * TJJS = DCONJG( A( J, J ) )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, $ ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = DCONJG( A( ( ICOL-1 )*LDA+IROW ) )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, $ 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL IF( TSCAL.EQ.ONE ) $ GO TO 130 END IF * * Compute x(j) = x(j) / A(j,j), scaling if necessary. * TJJ = CABS1( TJJS ) IF( TJJ.GT.SMLNUM ) THEN * * abs(A(j,j)) > SMLNUM: * IF( TJJ.LT.ONE ) THEN IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale X by 1/abs(x(j)). * REC = ONE / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * X( J ) = ZLADIV( X( J ), TJJS ) XJTMP = ZLADIV( XJTMP, TJJS ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = XJTMP ELSE IF( TJJ.GT.ZERO ) THEN * * 0 < abs(A(j,j)) <= SMLNUM: * IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. * REC = ( TJJ*BIGNUM ) / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF * X( J ) = ZLADIV( X( J ), TJJS ) XJTMP = ZLADIV( XJTMP, TJJS ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = XJTMP ELSE * * A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and * scale = 0 and compute a solution to A**H *x = 0. * CALL PZLASET( ' ', N, 1, CZERO, CZERO, X, IX, JX, $ DESCX ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = CONE XJTMP = CONE SCALE = ZERO XMAX = ZERO END IF 130 CONTINUE ELSE * * Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot * product has already been divided by 1/A(j,j). * * X( J ) = ZLADIV( X( J ), TJJS ) - CSUMJ XJTMP = ZLADIV( XJTMP, TJJS ) - CSUMJ IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = XJTMP END IF XMAX = MAX( XMAX, CABS1( XJTMP ) ) 140 CONTINUE END IF SCALE = SCALE / TSCAL END IF * * Scale the column norms by 1/TSCAL for return. * IF( TSCAL.NE.ONE ) THEN CALL DSCAL( N, ONE / TSCAL, CNORM, 1 ) END IF * RETURN * * End of PZLATTRS * END scalapack-2.0.2/SRC/pzlauu2.f000644 000766 000024 00000020425 10363532303 016112 0ustar00juliestaff000000 000000 SUBROUTINE PZLAUU2( UPLO, N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLAUU2 computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in sub( A ). * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in sub( A ). * * This is the unblocked form of the algorithm, calling Level 2 BLAS. * No communication is performed by this routine, the matrix to operate * on should be strictly local to one process. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the triangular factor stored in the matrix * sub( A ) is upper or lower triangular: * = 'U': Upper triangular, * = 'L': Lower triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the order of the triangular factor U or L. N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor L or U. * On exit, if UPLO = 'U', the upper triangle of the distributed * matrix sub( A ) is overwritten with the upper triangle of the * product U * U'; if UPLO = 'L', the lower triangle of sub( A ) * is overwritten with the lower triangle of the product L' * L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER IACOL, IAROW, ICURR, IDIAG, IIA, IOFFA, JJA, $ LDA, MYCOL, MYROW, NA, NPCOL, NPROW DOUBLE PRECISION AII * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, ZDSCAL, ZGEMV, $ ZLACGV * .. * .. External Functions .. LOGICAL LSAME COMPLEX*16 ZDOTC EXTERNAL LSAME, ZDOTC * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, DBLE * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Get grid parameters and compute local indexes * CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * IF( LSAME( UPLO, 'U' ) ) THEN * * Compute the product U * U'. * DO 10 NA = N-1, 1, -1 AII = A( IDIAG ) ICURR = IDIAG + LDA A( IDIAG ) = AII*AII + DBLE( ZDOTC( NA, A( ICURR ), LDA, $ A( ICURR ), LDA ) ) CALL ZLACGV( NA, A( ICURR ), LDA ) CALL ZGEMV( 'No transpose', N-NA-1, NA, ONE, $ A( IOFFA+LDA ), LDA, A( ICURR ), LDA, $ DCMPLX( AII ), A( IOFFA ), 1 ) CALL ZLACGV( NA, A( ICURR ), LDA ) IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + LDA 10 CONTINUE AII = A( IDIAG ) CALL ZDSCAL( N, AII, A( IOFFA ), 1 ) * ELSE * * Compute the product L' * L. * DO 20 NA = 1, N-1 AII = A( IDIAG ) ICURR = IDIAG + 1 A( IDIAG ) = AII*AII + DBLE( ZDOTC( N-NA, A( ICURR ), 1, $ A( ICURR ), 1 ) ) CALL ZLACGV( NA-1, A( IOFFA ), LDA ) CALL ZGEMV( 'Conjugate transpose', N-NA, NA-1, ONE, $ A( IOFFA+1 ), LDA, A( ICURR ), 1, $ DCMPLX( AII ), A( IOFFA ), LDA ) CALL ZLACGV( NA-1, A( IOFFA ), LDA ) IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + 1 20 CONTINUE AII = A( IDIAG ) CALL ZDSCAL( N, AII, A( IOFFA ), LDA ) * END IF * END IF * RETURN * * End of PZLAUU2 * END scalapack-2.0.2/SRC/pzlauum.f000644 000766 000024 00000021175 10363532303 016210 0ustar00juliestaff000000 000000 SUBROUTINE PZLAUUM( UPLO, N, A, IA, JA, DESCA ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLAUUM computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in sub( A ). * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in sub( A ). * * This is the blocked form of the algorithm, calling Level 3 PBLAS. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the triangular factor stored in the * distributed matrix sub( A ) is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the triangular factor U or L. N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor L or U. * On exit, if UPLO = 'U', the upper triangle of the distributed * matrix sub( A ) is overwritten with the upper triangle of the * product U * U'; if UPLO = 'L', the lower triangle of sub( A ) * is overwritten with the lower triangle of the product L' * L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) COMPLEX*16 CONE PARAMETER ( CONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J, JB, JN * .. * .. External Subroutines .. EXTERNAL PZGEMM, PZHERK, PZLAUU2, PZTRMM * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) IF( LSAME( UPLO, 'U' ) ) THEN * * Compute the product U * U'. * * Handle first block separately * JB = JN-JA+1 CALL PZLAUU2( 'Upper', JB, A, IA, JA, DESCA ) IF( JB.LE.N-1 ) THEN CALL PZHERK( 'Upper', 'No transpose', JB, N-JB, ONE, A, IA, $ JA+JB, DESCA, ONE, A, IA, JA, DESCA ) END IF * * Loop over remaining block of columns * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA CALL PZTRMM( 'Right', 'Upper', 'Conjugate transpose', $ 'Non-unit', J-JA, JB, CONE, A, I, J, DESCA, $ A, IA, J, DESCA ) CALL PZLAUU2( 'Upper', JB, A, I, J, DESCA ) IF( J+JB.LE.JA+N-1 ) THEN CALL PZGEMM( 'No transpose', 'Conjugate transpose', $ J-JA, JB, N-J-JB+JA, CONE, A, IA, J+JB, $ DESCA, A, I, J+JB, DESCA, CONE, A, IA, $ J, DESCA ) CALL PZHERK( 'Upper', 'No transpose', JB, N-J-JB+JA, ONE, $ A, I, J+JB, DESCA, ONE, A, I, J, DESCA ) END IF 10 CONTINUE ELSE * * Compute the product L' * L. * * Handle first block separately * JB = JN-JA+1 CALL PZLAUU2( 'Lower', JB, A, IA, JA, DESCA ) IF( JB.LE.N-1 ) THEN CALL PZHERK( 'Lower', 'Conjugate transpose', JB, N-JB, ONE, $ A, IA+JB, JA, DESCA, ONE, A, IA, JA, DESCA ) END IF * * Loop over remaining block of columns * DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA CALL PZTRMM( 'Left', 'Lower', 'Conjugate Transpose', $ 'Non-unit', JB, J-JA, CONE, A, I, J, DESCA, A, $ I, JA, DESCA ) CALL PZLAUU2( 'Lower', JB, A, I, J, DESCA ) IF( J+JB.LE.JA+N-1 ) THEN CALL PZGEMM( 'Conjugate transpose', 'No transpose', JB, $ J-JA, N-J-JB+JA, CONE, A, I+JB, J, DESCA, $ A, I+JB, JA, DESCA, CONE, A, I, JA, DESCA ) CALL PZHERK( 'Lower', 'Conjugate transpose', JB, $ N-J-JB+JA, ONE, A, I+JB, J, DESCA, ONE, $ A, I, J, DESCA ) END IF 20 CONTINUE END IF * RETURN * * End of PZLAUUM * END scalapack-2.0.2/SRC/pzlawil.f000644 000766 000024 00000024203 10363532303 016170 0ustar00juliestaff000000 000000 SUBROUTINE PZLAWIL( II, JJ, M, A, DESCA, H44, H33, H43H34, V ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. INTEGER II, JJ, M COMPLEX*16 H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), V( * ) * .. * * Purpose * ======= * * PZLAWIL gets the transform given by H44,H33, & H43H34 into V * starting at row M. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * II (global input) INTEGER * Row owner of H(M+2,M+2) * * JJ (global input) INTEGER * Column owner of H(M+2,M+2) * * M (global input) INTEGER * On entry, this is where the transform starts (row M.) * Unchanged on exit. * * A (global input) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * Unchanged on exit. * * H44 * H33 * H43H34 (global input) COMPLEX*16 * These three values are for the double shift QR iteration. * Unchanged on exit. * * V (global output) COMPLEX*16 array of size 3. * Contains the transform on ouput. * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, ICOL, IROW, JSRC, LDA, LEFT, $ MODKM1, MYCOL, MYROW, NPCOL, NPROW, NUM, RIGHT, $ RSRC, UP DOUBLE PRECISION S COMPLEX*16 CDUM, H11, H12, H21, H22, H33S, H44S, V1, V2, $ V3 * .. * .. Local Arrays .. COMPLEX*16 BUF( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * On node (II,JJ) collect all DIA,SUP,SUB info from M, M+1 * MODKM1 = MOD( M+1, HBL ) IF( MODKM1.EQ.0 ) THEN IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) CALL ZGESD2D( CONTXT, 1, 1, BUF, 1, II, JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) BUF( 2 ) = A( ( ICOL-1 )*LDA+IROW+1 ) BUF( 3 ) = A( ICOL*LDA+IROW ) BUF( 4 ) = A( ICOL*LDA+IROW+1 ) CALL ZGESD2D( CONTXT, 4, 1, BUF, 4, II, JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NPCOL.GT.1 ) THEN CALL ZGERV2D( CONTXT, 1, 1, V3, 1, MYROW, LEFT ) ELSE V3 = A( ( ICOL-2 )*LDA+IROW ) END IF IF( NUM.GT.1 ) THEN CALL ZGERV2D( CONTXT, 4, 1, BUF, 4, UP, LEFT ) H11 = BUF( 1 ) H21 = BUF( 2 ) H12 = BUF( 3 ) H22 = BUF( 4 ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) END IF END IF END IF IF( MODKM1.EQ.1 ) THEN IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) CALL ZGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. ( NPROW.GT.1 ) ) $ THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL ZGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL ZGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NUM.GT.1 ) THEN CALL ZGERV2D( CONTXT, 1, 1, H11, 1, UP, LEFT ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) END IF IF( NPROW.GT.1 ) THEN CALL ZGERV2D( CONTXT, 1, 1, H12, 1, UP, MYCOL ) ELSE H12 = A( ( ICOL-2 )*LDA+IROW-2 ) END IF IF( NPCOL.GT.1 ) THEN CALL ZGERV2D( CONTXT, 1, 1, H21, 1, MYROW, LEFT ) ELSE H21 = A( ( ICOL-3 )*LDA+IROW-1 ) END IF H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF END IF IF( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) $ RETURN * IF( MODKM1.GT.1 ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF * H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 * RETURN * * End of PZLAWIL * END scalapack-2.0.2/SRC/pzmax1.f000644 000766 000024 00000033305 10363532303 015731 0ustar00juliestaff000000 000000 SUBROUTINE PZMAX1( N, AMAX, INDX, X, IX, JX, DESCX, INCX ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INDX, INCX, IX, JX, N COMPLEX*16 AMAX * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX*16 X( * ) * .. * * Purpose * ======= * * PZMAX1 computes the global index of the maximum element in absolute * value of a distributed vector sub( X ). The global index is returned * in INDX and the value is returned in AMAX, * * where sub( X ) denotes X(IX:IX+N-1,JX) if INCX = 1, * X(IX,JX:JX+N-1) if INCX = M_X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be viewed as a subclass of matrices, a * distributed vector is considered to be a distributed matrix. * * When the result of a vector-oriented PBLAS call is a scalar, it will * be made available only within the scope which owns the vector(s) * being operated on. Let X be a generic term for the input vector(s). * Then, the processes which receive the answer will be (note that if * an operation involves more than one vector, the processes which re- * ceive the result will be the union of the following calculation for * each vector): * * If N = 1, M_X = 1 and INCX = 1, then one can't determine if a process * row or process column owns the vector operand, therefore only the * process of coordinate {RSRC_X, CSRC_X} receives the result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process part of this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process part of this column receives the result; * * Based on PZAMAX from Level 1 PBLAS. The change is to use the * 'genuine' absolute value. * * The serial version was contributed to LAPACK by Nick Higham for use * with ZLACON. * * Arguments * ========= * * N (global input) pointer to INTEGER * The number of components of the distributed vector sub( X ). * N >= 0. * * AMAX (global output) pointer to DOUBLE PRECISION * The absolute value of the largest entry of the distributed * vector sub( X ) only in the scope of sub( X ). * * INDX (global output) pointer to INTEGER * The global index of the element of the distributed vector * sub( X ) whose real part has maximum absolute value. * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * INCX (global input) INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * INCX must not be zero. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER CBTOP, CCTOP, RBTOP, RCTOP INTEGER ICOFF, ICTXT, IDUMM, IIX, IROFF, IXCOL, IXROW, $ JJX, LCINDX, LDX, MAXPOS, MYCOL, MYROW, NP, $ NPCOL, NPROW, NQ * .. * .. Local Arrays .. COMPLEX*16 WORK( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGEBR2D, IGEBS2D, INFOG2L, $ PB_TOPGET, PZTREECOMB, ZCOMBAMAX1, ZGAMX2D * .. * .. External Functions .. LOGICAL LSAME INTEGER IZMAX1, INDXL2G, NUMROC EXTERNAL IZMAX1, INDXL2G, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, MOD, NINT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible. * INDX = 0 AMAX = ZERO IF( N.LE.0 ) $ RETURN * * Retrieve local information for vector X. * LDX = DESCX( LLD_ ) CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, JJX, $ IXROW, IXCOL ) * IF( INCX.EQ.1 .AND. DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN INDX = JX AMAX = X( IIX+(JJX-1)*LDX ) RETURN END IF * * Find the maximum value and its index * IF( INCX.EQ.DESCX( M_ ) ) THEN * IF( MYROW.EQ.IXROW ) THEN * ICOFF = MOD( JX-1, DESCX( NB_ ) ) NQ = NUMROC( N+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYCOL.EQ.IXCOL ) $ NQ = NQ-ICOFF * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', RBTOP ) * IF( LSAME( RBTOP, ' ' ) ) THEN * IF( NQ.GT.0 ) THEN LCINDX = JJX-1+IZMAX1( NQ, X( IIX+(JJX-1)*LDX ), LDX ) WORK( 1 ) = X( IIX+(LCINDX-1)*LDX ) WORK( 2 ) = DCMPLX( DBLE( INDXL2G( LCINDX, $ DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), NPCOL ) ) ) ELSE WORK( 1 ) = ZERO WORK( 2 ) = ZERO END IF * CALL PZTREECOMB( ICTXT, 'Row', 2, WORK, -1, MYCOL, $ ZCOMBAMAX1 ) * AMAX = WORK( 1 ) IF( AMAX.EQ.ZERO ) THEN INDX = JX ELSE INDX = NINT( DBLE( WORK( 2 ) ) ) END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', RCTOP ) * IF( NQ.GT.0 ) THEN LCINDX = JJX-1+IZMAX1( NQ, X( IIX+(JJX-1)*LDX ), LDX ) AMAX = X( IIX + (LCINDX-1)*LDX ) ELSE AMAX = ZERO END IF * * Find the maximum value * CALL ZGAMX2D( ICTXT, 'Rowwise', RCTOP, 1, 1, AMAX, 1, $ IDUMM, MAXPOS, 1, -1, MYROW ) * IF( AMAX.NE.ZERO ) THEN * * Broadcast corresponding global index * IF( MYCOL.EQ.MAXPOS ) THEN INDX = INDXL2G( LCINDX, DESCX( NB_ ), MYCOL, $ DESCX( CSRC_ ), NPCOL ) CALL IGEBS2D( ICTXT, 'Rowwise', RBTOP, 1, 1, INDX, $ 1 ) ELSE CALL IGEBR2D( ICTXT, 'Rowwise', RBTOP, 1, 1, INDX, $ 1, MYROW, MAXPOS ) END IF * ELSE * INDX = JX * END IF * END IF * END IF * ELSE * IF( MYCOL.EQ.IXCOL ) THEN * IROFF = MOD( IX-1, DESCX( MB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) IF( MYROW.EQ.IXROW ) $ NP = NP-IROFF * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) * IF( LSAME( CBTOP, ' ' ) ) THEN * IF( NP.GT.0 ) THEN LCINDX = IIX-1+IZMAX1( NP, X( IIX+(JJX-1)*LDX ), 1 ) WORK( 1 ) = X( LCINDX + (JJX-1)*LDX ) WORK( 2 ) = DCMPLX( DBLE( INDXL2G( LCINDX, $ DESCX( MB_ ), MYROW, DESCX( RSRC_ ), NPROW ) ) ) ELSE WORK( 1 ) = ZERO WORK( 2 ) = ZERO END IF * CALL PZTREECOMB( ICTXT, 'Column', 2, WORK, -1, MYCOL, $ ZCOMBAMAX1 ) * AMAX = WORK( 1 ) IF( AMAX.EQ.ZERO ) THEN INDX = IX ELSE INDX = NINT( DBLE( WORK( 2 ) ) ) END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', CCTOP ) * IF( NP.GT.0 ) THEN LCINDX = IIX-1+IZMAX1( NP, X( IIX+(JJX-1)*LDX ), 1 ) AMAX = X( LCINDX + (JJX-1)*LDX ) ELSE AMAX = ZERO END IF * * Find the maximum value * CALL ZGAMX2D( ICTXT, 'Columnwise', CCTOP, 1, 1, AMAX, 1, $ MAXPOS, IDUMM, 1, -1, MYCOL ) * IF( AMAX.NE.ZERO ) THEN * * Broadcast corresponding global index * IF( MYROW.EQ.MAXPOS ) THEN INDX = INDXL2G( LCINDX, DESCX( MB_ ), MYROW, $ DESCX( RSRC_ ), NPROW ) CALL IGEBS2D( ICTXT, 'Columnwise', CBTOP, 1, 1, $ INDX, 1 ) ELSE CALL IGEBR2D( ICTXT, 'Columnwise', CBTOP, 1, 1, $ INDX, 1, MAXPOS, MYCOL ) END IF * ELSE * INDX = IX * END IF * END IF * END IF * END IF * RETURN * * End of PZMAX1 * END * SUBROUTINE ZCOMBAMAX1 ( V1, V2 ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Array Arguments .. COMPLEX*16 V1( 2 ), V2( 2 ) * .. * * Purpose * ======= * * ZCOMBAMAX1 finds the element having maximum real part absolute * value as well as its corresponding globl index. * * Arguments * ========= * * V1 (local input/local output) COMPLEX*16 array of * dimension 2. The first maximum absolute value element and * its global index. V1(1) = AMAX, V1(2) = INDX. * * V2 (local input) COMPLEX*16 array of dimension 2. * The second maximum absolute value element and its global * index. V2(1) = AMAX, V2(2) = INDX. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, DBLE * .. * .. Executable Statements .. * IF( ABS( DBLE( V1( 1 ) ) ).LT.ABS( DBLE( V2( 1 ) ) ) ) THEN V1( 1 ) = V2( 1 ) V1( 2 ) = V2( 2 ) END IF * RETURN * * End of ZCOMBAMAX1 * END scalapack-2.0.2/SRC/pzpbsv.f000644 000766 000024 00000045305 10363532303 016040 0ustar00juliestaff000000 000000 SUBROUTINE PZPBSV( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PZPBSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * banded symmetric positive definite distributed * matrix with bandwidth BW. * * Cholesky factorization is used to factor a reordering of * the matrix into L L'. * * See PZPBTRF and PZPBTRS for details. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (NB+2*bw)*bw * +max((bw*NRHS), bw*bw) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PXERBLA, PZPBTRF, PZPBTRS * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PZPBTRF and PZPBTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 6*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PZPBSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (NB+2*BW)*BW * * Factor the matrix * CALL PZPBTRF( UPLO, N, BW, A, JA, DESCA, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR ), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PZPBSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PZPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPBSV', -INFO ) RETURN END IF * RETURN * * End of PZPBSV * END scalapack-2.0.2/SRC/pzpbtrf.f000644 000766 000024 00000141650 11750130340 016177 0ustar00juliestaff000000 000000 SUBROUTINE PZPBTRF( UPLO, N, BW, A, JA, DESCA, AF, LAF, WORK, $ LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), AF( * ), WORK( * ) * .. * * * Purpose * ======= * * PZPBTRF computes a Cholesky factorization * of an N-by-N complex banded * symmetric positive definite distributed matrix * with bandwidth BW: A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PZPBTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = U' U , if UPLO = 'U', or * * P A(1:N, JA:JA+N-1) P^T = L L', if UPLO = 'L' * * where U is a banded upper triangular matrix and L is banded * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed matrix * A(1:N, JA:JA+N-1) to be factored. * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * On exit, this array contains information containing details * of the factorization. * Note that permutations are performed on the matrix, so that * the factors returned are different from those returned * by LAPACK. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZPBTRF and this is stored in AF. If a linear system * is to be solved using PZPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * bw*bw * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM1, IDUM3, JA_NEW, $ LAF_MIN, LEVEL_DIST, LLDA, MBW2, MYCOL, MYROW, $ MY_NUM_COLS, NB, NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, OFST, PART_OFFSET, PART_SIZE, $ PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE, $ STORE_N_A, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ DESC_CONVERT, GLOBCHK, PXERBLA, RESHAPE, ZAXPY, $ ZGEMM, ZGERV2D, ZGESD2D, ZLAMOV, ZLATCPY, $ ZPBTRF, ZPOTRF, ZSYRK, ZTBTRS, ZTRMM, ZTRRV2D, $ ZTRSD2D, ZTRSM, ZTRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * * Pre-calculate bw^2 * MBW2 = BW * BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -10 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 6*100 + 4 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PZPBTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PZPBTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = (NB+2*BW)*BW * IF( LAF .LT. LAF_MIN ) THEN INFO = -8 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PZPBTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = BW*BW * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -10 CALL PXERBLA( ICTXT, $ 'PZPBTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 10 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZPBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW ENDIF * * * Zero out space for fillin * DO 10 I=1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * * Zero out space for work * DO 20 I=1, WORK_SIZE_MIN WORK( I ) = CZERO 20 CONTINUE * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN PREV_TRI_SIZE_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) PREV_TRI_SIZE_N=MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN NEXT_TRI_SIZE_M=MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) NEXT_TRI_SIZE_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * IF ( MYCOL .LT. NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL ZTRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M, $ NEXT_TRI_SIZE_N, A( OFST+ODD_SIZE*LLDA+(BW+1) ), $ LLDA-1, 0, MYCOL+1 ) * ENDIF * * * Factor main partition A_i = L_i {L_i}^C in each processor * CALL ZPBTRF( UPLO, ODD_SIZE, BW, A( OFST + 1), $ LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1500 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * conjugate transpose the connection block in preparation. * CALL ZLATCPY( 'U', BW, BW, $ A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW ) * * Perform the triangular system solve {L_i}{{B'}_i}^C = {B_i}^C * CALL ZTRTRS( 'L', 'N', 'N', BW, BW, $ A( OFST+1+(ODD_SIZE-BW)*LLDA ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO ) * * * conjugate transpose resulting block to its location * in main storage. * CALL ZLATCPY( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), $ BW, A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), $ LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^C * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL ZHERK( UPLO, 'C', BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE, $ A( OFST+1+ODD_SIZE*LLDA ), LLDA-1 ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1500 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bw)*bw+1 ) and use for * off_diagonal block of reduced system. * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL ZTRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M, $ PREV_TRI_SIZE_N, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF (INFO.EQ.0) THEN * * Calculate the "spike" fillin, ${L_i} {{G}_i}^C = {D_i}$ . * CALL ZTBTRS( 'L', 'N', 'N', ODD_SIZE, BW, BW, A( OFST + 1 ), $ LLDA, AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = G_i{G_i}^C * CALL ZHERK( 'L', 'C', BW, ODD_SIZE, $ -ONE, AF( 1 ), ODD_SIZE, ZERO, $ AF( 1 + (ODD_SIZE+2*BW)*BW), BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL ZGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ 0, MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * {F_i}^C = {H_i}{{B'}_i}^C * * Copy matrix H_i (the last bw cols of G_i) to AF storage * as per requirements of BLAS routine ZTRMM. * Since we have G_i^C stored, conjugate transpose * H_i^C to H_i. * CALL ZLATCPY( 'N', BW, BW, $ AF( ODD_SIZE-BW+1 ), ODD_SIZE, $ AF( (ODD_SIZE)*BW+1), BW ) * CALL ZTRMM( 'R', 'U', 'C', 'N', BW, BW, -CONE, $ A( ( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA ) ), LLDA-1, $ AF( (ODD_SIZE)*BW+1 ), BW ) * * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL ZGESD2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+1 ), $ BW, 0, MYCOL-1 ) * ENDIF * * Copy last diagonal block into AF storage for subsequent * operations. * CALL ZLAMOV( 'N', BW, BW, $ A( OFST+ODD_SIZE*LLDA+1 ), $ LLDA-1, AF( ODD_SIZE*BW+MBW2+1 ), $ BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * CALL ZAXPY( MBW2, CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ 1, AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL ZGERV2D( ICTXT, BW, BW, WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * CALL ZAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL ZGERV2D( ICTXT, BW, BW, WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * CALL ZAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL ZPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL ZLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL ZGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+1 ), $ BW, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL ZTRSM( 'L', 'L', 'N', 'N', BW, BW, CONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL ZHERK( 'L', 'C', BW, BW, -ONE, $ AF( (ODD_SIZE)*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL ZGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * CALL ZTRSM( 'R', 'L', 'C', 'N', BW, BW, CONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL ZHERK( 'L', 'N', BW, BW, -ONE, $ AF( (ODD_SIZE+2*BW)*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL ZGEMM( 'N', 'N', BW, BW, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW, CZERO, WORK( 1 ), $ BW ) * * Send contribution to offdiagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * * CASE UPLO = 'U' * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * * Sizes of the extra triangles communicated bewtween processors * IF( MYCOL .GT. 0 ) THEN PREV_TRI_SIZE_M= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) PREV_TRI_SIZE_N=MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) ) ENDIF * IF( MYCOL .LT. NPCOL-1 ) THEN NEXT_TRI_SIZE_M=MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) ) NEXT_TRI_SIZE_N= MIN( BW, $ NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) ) ENDIF * * * * Factor main partition A_i^C = U_i {U_i}^C in each processor * CALL ZPBTRF( UPLO, ODD_SIZE, BW, A( OFST + 1), $ LLDA, INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1600 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * Move the connection block in preparation. * CALL ZLAMOV( 'L', BW, BW, A( ( OFST+1+ODD_SIZE*LLDA ) ), $ LLDA-1, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW ) * * * Perform the triangular solve {L_i}{{B'}_i}^C = {B_i}^C * CALL ZTRTRS( 'U', 'C', 'N', BW, BW, $ A( OFST+BW+1+(ODD_SIZE-BW)*LLDA ), LLDA-1, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO ) * * Move the resulting block back to its location in main storage. * CALL ZLAMOV( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), $ BW, A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1 ) * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i^C = {C_i}^C-{{B'}_i}^C{{B'}_i} * * The following method uses more flops than necessary but * does not necessitate the writing of a new BLAS routine. * * CALL ZHERK( UPLO, 'C', BW, BW, -ONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE, $ A( OFST+BW+1+ODD_SIZE*LLDA ), LLDA-1 ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1600 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * Discard temporary matrix stored beginning in * AF( (odd_size+2*bw)*bw+1 ) and use for * off_diagonal block of reduced system. * * Calculate the "spike" fillin, ${L_i} {{G}_i}^C = {D_i}$ . * * * Copy D block into AF storage for solve. * CALL ZLATCPY( 'L', PREV_TRI_SIZE_N, PREV_TRI_SIZE_M, $ A( OFST+1 ), LLDA-1, AF( 1 ), ODD_SIZE ) * IF ( INFO.EQ.0 ) THEN * CALL ZTBTRS( 'U', 'C', 'N', ODD_SIZE, BW, BW, $ A( OFST + 1 ), LLDA, $ AF( 1 ), ODD_SIZE, INFO ) * * * Calculate the update block for previous proc, E_i = G_i{G_i}^C * CALL ZHERK( 'L', 'C', BW, ODD_SIZE, $ -ONE, AF( 1 ), ODD_SIZE, ZERO, $ AF( 1 + (ODD_SIZE+2*BW)*BW), BW ) * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL ZGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ 0, MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * {F_i}^C = {H_i}{{B'}_i}^C * * Copy matrix H_i (the last bw cols of G_i) to AF storage * as per requirements of BLAS routine ZTRMM. * Since we have G_i^C stored, conjugate transpose * H_i^C to H_i. * CALL ZLATCPY( 'N', BW, BW, $ AF( ODD_SIZE-BW+1 ), ODD_SIZE, $ AF( (ODD_SIZE)*BW+1), BW ) * CALL ZTRMM( 'R', 'L', 'N', 'N', BW, BW, -CONE, $ A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1, $ AF( (ODD_SIZE)*BW+1 ), BW ) * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL ZGESD2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+1 ), $ BW, 0, MYCOL-1 ) * ENDIF * * Transpose last diagonal block into AF storage for subsequent * operations. * CALL ZLATCPY( 'U', BW, BW, $ A( OFST+ ODD_SIZE*LLDA+1+BW ), $ LLDA-1, AF( ODD_SIZE*BW+MBW2+1 ), $ BW ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * CALL ZAXPY( MBW2, CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ 1, AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 22 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 21 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL ZGERV2D( ICTXT, BW, BW, WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * CALL ZAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL ZGERV2D( ICTXT, BW, BW, WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * CALL ZAXPY( MBW2, CONE, WORK( 1 ), 1, $ AF( ODD_SIZE*BW+MBW2+1 ), 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 22 21 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... * * Factor diagonal block * CALL ZPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, INFO ) * IF( INFO.NE.0 ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * CALL ZLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL ZGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+1 ), $ BW, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * CALL ZTRSM( 'L', 'L', 'N', 'N', BW, BW, CONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * CALL ZHERK( 'L', 'C', BW, BW, -ONE, $ AF( (ODD_SIZE)*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL ZGERV2D( ICTXT, BW, BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * CALL ZTRSM( 'R', 'L', 'C', 'N', BW, BW, CONE, $ AF( ODD_SIZE*BW+MBW2+1 ), BW, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW ) * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * CALL ZHERK( 'L', 'N', BW, BW, -ONE, $ AF( (ODD_SIZE+2*BW)*BW+1 ), BW, ZERO, $ WORK( 1 ), BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * CALL ZGEMM( 'N', 'N', BW, BW, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), BW, $ AF( ODD_SIZE*BW+1 ), BW, CZERO, WORK( 1 ), $ BW ) * * Send contribution to offdiagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 24 CONTINUE * ENDIF * 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PZPBTRF * END scalapack-2.0.2/SRC/pzpbtrs.f000644 000766 000024 00000064075 10363532303 016225 0ustar00juliestaff000000 000000 SUBROUTINE PZPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, $ AF, LAF, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 7, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PZPBTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PZPBTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * banded symmetric positive definite distributed * matrix with bandwidth BW. * Depending on the value of UPLO, A stores either U or L in the equn * A(1:N, JA:JA+N-1) = U'*U or L*L' as computed by PZPBTRF. * * Routine PZPBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZPBTRF and this is stored in AF. If a linear system * is to be solved using PZPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (bw*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM3, JA_NEW, LLDA, LLDB, MYCOL, MYROW, $ NB, NP, NPCOL, NPROW, NP_SAVE, PART_OFFSET, $ RETURN_CODE, STORE_M_B, STORE_N_A, $ WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, PXERBLA, $ PZPBTRSV, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 7*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 10*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 10*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 10*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 10*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -14 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -3 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 7*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 7*100 + 4 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 10*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 10*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -3 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -6 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 7*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PZPBTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 7*100+4 ) CALL PXERBLA( ICTXT, $ 'PZPBTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ (BW*NRHS) * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, $ 'PZPBTRS: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = BW PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1005 PARAM_CHECK( 15, 2 ) = 1004 PARAM_CHECK( 14, 2 ) = 1003 PARAM_CHECK( 13, 2 ) = 1002 PARAM_CHECK( 12, 2 ) = 1001 PARAM_CHECK( 11, 2 ) = 9 PARAM_CHECK( 10, 2 ) = 705 PARAM_CHECK( 9, 2 ) = 704 PARAM_CHECK( 8, 2 ) = 703 PARAM_CHECK( 7, 2 ) = 701 PARAM_CHECK( 6, 2 ) = 6 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 14 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZPBTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PZPBTRSV( 'L', 'N', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PZPBTRSV( 'U', 'C', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ENDIF * * Call backsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PZPBTRSV( 'L', 'C', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ELSE * CALL PZPBTRSV( 'U', 'N', N, BW, NRHS, A( PART_OFFSET+1 ), $ JA_NEW, DESCA_1XP, B, IB, DESCB_PX1, AF, LAF, $ WORK, LWORK, INFO ) * ENDIF 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PZPBTRS * END scalapack-2.0.2/SRC/pzpbtrsv.f000644 000766 000024 00000141660 11750130340 016403 0ustar00juliestaff000000 000000 SUBROUTINE PZPBTRSV( UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B, $ IB, DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), AF( * ), B( * ), WORK( * ) * .. * * * Purpose * ======= * * PZPBTRSV solves a banded triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a banded * triangular matrix factor produced by the * Cholesky factorization code PZPBTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H * is dictated by the user by the parameter TRANS. * * Routine PZPBTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * BW (global input) INTEGER * Number of subdiagonals in L or U. 0 <= BW <= N-1 * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * local memory to an array with first dimension * LLD_A >=(bw+1) (stored in DESCA). * On entry, this array contains the local pieces of the * N-by-N symmetric banded distributed Cholesky factor L or * L^T A(1:N, JA:JA+N-1). * This local portion is stored in the packed banded format * used in LAPACK. Please see the Notes below and the * ScaLAPACK manual for more detail on the format of * distributed matrices. * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9 . * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZPBTRF and this is stored in AF. If a linear system * is to be solved using PZPBTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2*bw)*bw * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (bw*NRHS) * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2*BW * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the banded matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small (BW* (P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: banded codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK NO OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * LLD_A (local) DESCA( 6 ) The leading dimension of the local array * storing the local blocks of the distri- * buted array A. Minimum value of LLD_A * depends on TYPE_A. * TYPE_A = 501: LLD_A >= * size of undistributed dimension, 1. * TYPE_A = 502: LLD_A >=NB_A, 1. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MBW2, MYCOL, MYROW, MY_NUM_COLS, NB, NP, $ NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 17, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ DESC_CONVERT, GLOBCHK, PXERBLA, RESHAPE, ZGEMM, $ ZGERV2D, ZGESD2D, ZLAMOV, ZMATADD, ZTBTRS, $ ZTRMM, ZTRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * * Pre-calculate bw^2 * MBW2 = BW * BW * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -2 END IF * IF( LWORK .LT. -1) THEN INFO = -14 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -3 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF(( BW .GT. N-1 ) .OR. $ ( BW .LT. 0 ) ) THEN INFO = -4 ENDIF * IF( LLDA .LT. (BW+1) ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( NB .LE. 0 ) THEN INFO = -( 8*100 + 4 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -5 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PZPBTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PZPBTRSV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ BW*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -14 CALL PXERBLA( ICTXT, $ 'PZPBTRSV: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 17, 1 ) = DESCB(5) PARAM_CHECK( 16, 1 ) = DESCB(4) PARAM_CHECK( 15, 1 ) = DESCB(3) PARAM_CHECK( 14, 1 ) = DESCB(2) PARAM_CHECK( 13, 1 ) = DESCB(1) PARAM_CHECK( 12, 1 ) = IB PARAM_CHECK( 11, 1 ) = DESCA(5) PARAM_CHECK( 10, 1 ) = DESCA(4) PARAM_CHECK( 9, 1 ) = DESCA(3) PARAM_CHECK( 8, 1 ) = DESCA(1) PARAM_CHECK( 7, 1 ) = JA PARAM_CHECK( 6, 1 ) = NRHS PARAM_CHECK( 5, 1 ) = BW PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 17, 2 ) = 1105 PARAM_CHECK( 16, 2 ) = 1104 PARAM_CHECK( 15, 2 ) = 1103 PARAM_CHECK( 14, 2 ) = 1102 PARAM_CHECK( 13, 2 ) = 1101 PARAM_CHECK( 12, 2 ) = 10 PARAM_CHECK( 11, 2 ) = 805 PARAM_CHECK( 10, 2 ) = 804 PARAM_CHECK( 9, 2 ) = 803 PARAM_CHECK( 8, 2 ) = 801 PARAM_CHECK( 7, 2 ) = 7 PARAM_CHECK( 6, 2 ) = 5 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 14 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZPBTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Offset in elements * OFST = PART_OFFSET*LLDA * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - BW ENDIF * * * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * IF ( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL ZTBTRS( UPLO, 'N', 'N', ODD_SIZE, $ BW, NRHS, $ A( OFST+1 ), LLDA, $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL ZLAMOV( 'N', BW, NRHS, $ B( PART_OFFSET+ODD_SIZE-BW+1), LLDB, $ WORK( 1 ), BW ) * CALL ZTRMM( 'L', 'U', 'N', 'N', BW, NRHS, -CONE, $ A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), LLDA-1, $ WORK( 1 ), BW ) * CALL ZMATADD( BW, NRHS, CONE, WORK( 1 ), BW, $ CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL ZGEMM( 'C', 'N', BW, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+BW-BW ), BW ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL ZMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * CALL ZMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * CALL ZMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL ZTRTRS( 'L', 'N', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'C', 'N', BW, NRHS, BW, -CONE, $ AF( (ODD_SIZE)*BW+1 ), $ BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL ZGEMM( 'N', 'N', BW, NRHS, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 27 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 26 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 27 26 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL ZGEMM( 'C', 'N', BW, NRHS, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, $ WORK( 1 ), $ BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'N', 'N', BW, NRHS, BW, -CONE, $ AF( (ODD_SIZE)*BW+1 ), $ BW, $ WORK( 1 ), $ BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL ZTRTRS( 'L', 'C', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 22 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 21 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 22 21 CONTINUE * [End of GOTO Loop] * 24 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL ZGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+BW-BW ), BW, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL ZLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB, $ WORK( 1+BW-BW ), BW ) * CALL ZTRMM( 'L', 'U', 'C', 'N', BW, NRHS, -CONE, $ A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), LLDA-1, $ WORK( 1+BW-BW ), BW ) * CALL ZMATADD( BW, NRHS, CONE, WORK( 1+BW-BW ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL ZTBTRS( UPLO, 'C', 'N', ODD_SIZE, $ BW, NRHS, $ A( OFST+1 ), $ LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF ( LSAME( TRANS, 'C' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL ZTBTRS( UPLO, 'C', 'N', ODD_SIZE, $ BW, NRHS, $ A( OFST+1 ), LLDA, $ B( PART_OFFSET+1 ), LLDB, INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL ZLAMOV( 'N', BW, NRHS, $ B( PART_OFFSET+ODD_SIZE-BW+1), LLDB, $ WORK( 1 ), BW ) * CALL ZTRMM( 'L', 'L', 'C', 'N', BW, NRHS, -CONE, $ A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1, WORK( 1 ), $ BW ) * CALL ZMATADD( BW, NRHS, CONE, WORK( 1 ), BW, $ CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL ZGEMM( 'C', 'N', BW, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+BW-BW ), BW ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL ZMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 44 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 42 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 41 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * CALL ZMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * CALL ZMATADD( BW, NRHS, CONE, $ WORK( 1 ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 42 41 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL ZTRTRS( 'L', 'N', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'C', 'N', BW, NRHS, BW, -CONE, $ AF( (ODD_SIZE)*BW+1 ), $ BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL ZGEMM( 'N', 'N', BW, NRHS, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ BW ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 44 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 54 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 57 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 56 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 57 56 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL ZGEMM( 'C', 'N', BW, NRHS, BW, -CONE, $ AF( ODD_SIZE*BW+2*MBW2+1 ), $ BW, $ WORK( 1 ), $ BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), $ BW, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'N', 'N', BW, NRHS, BW, -CONE, $ AF( (ODD_SIZE)*BW+1 ), $ BW, $ WORK( 1 ), $ BW, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL ZTRTRS( 'L', 'C', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ), $ BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 52 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 51 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 52 51 CONTINUE * [End of GOTO Loop] * 54 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGESD2D( ICTXT, BW, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGERV2D( ICTXT, BW, NRHS, $ WORK( 1 ), BW, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL ZGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+BW-BW ), BW, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * * * First copy and multiply it into temporary storage, * then use it on RHS * CALL ZLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB, $ WORK( 1+BW-BW ), BW ) * CALL ZTRMM( 'L', 'L', 'N', 'N', BW, NRHS, -CONE, $ A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1, $ WORK( 1+BW-BW ), BW ) * CALL ZMATADD( BW, NRHS, CONE, WORK( 1+BW-BW ), BW, CONE, $ B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL ZTBTRS( UPLO, 'N', 'N', ODD_SIZE, $ BW, NRHS, $ A( OFST+1 ), $ LLDA, B( PART_OFFSET+1 ), $ LLDB, INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ENDIF * End of "IF( LSAME( UPLO, 'L' ) )"... 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PZPBTRSV * END scalapack-2.0.2/SRC/pzpocon.f000644 000766 000024 00000036720 10363532303 016205 0ustar00juliestaff000000 000000 SUBROUTINE PZPOCON( UPLO, N, A, IA, JA, DESCA, ANORM, RCOND, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, LRWORK, LWORK, N DOUBLE PRECISION ANORM, RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION RWORK( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZPOCON estimates the reciprocal of the condition number (in the * 1-norm) of a complex Hermitian positive definite distributed matrix * using the Cholesky factorization A = U**H*U or A = L*L**H computed by * PZPOTRF. * * An estimate is obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), and * the reciprocal of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the factor stored in * A(IA:IA+N-1,JA:JA+N-1) is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory to * an array of dimension ( LLD_A, LOCc(JA+N-1) ). On entry, this * array contains the local pieces of the factors L or U from * the Cholesky factorization A(IA:IA+N-1,JA:JA+N-1) = U'*U or * L*L', as computed by PZPOTRF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * ANORM (global input) DOUBLE PRECISION * The 1-norm (or infinity-norm) of the hermitian distributed * matrix A(IA:IA+N-1,JA:JA+N-1). * * RCOND (global output) DOUBLE PRECISION * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + * MAX( 2, MAX(NB_A*MAX(1,CEIL(P-1,Q)),LOCc(N+MOD(JA-1,NB_A)) + * NB_A*MAX(1,CEIL(Q-1,P))) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= 2*LOCc(N+MOD(JA-1,NB_A)). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPNL, IPNU, $ IPV, IPW, IPX, IROFF, IV, IX, IXX, JJA, JV, $ JX, KASE, LRWMIN, LWMIN, MYCOL, MYROW, NP, $ NPCOL, NPROW, NPMOD, NQ, NQMOD DOUBLE PRECISION AINVNM, SCALE, SL, SU, SMLNUM COMPLEX*16 WMAX, ZDUM * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 3 ), $ IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG2L, $ PB_TOPGET, PB_TOPSET, PXERBLA, PCHK1MAT, $ PZAMAX, PZLATRS, PZLACON, PZDRSCL, $ ZGEBR2D, ZGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, ICHAR, MAX, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = DBLE( LWMIN ) LRWMIN = 2*NQMOD RWORK( 1 ) = DBLE( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( ANORM.LT.ZERO ) THEN INFO = -7 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -12 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 IF( LRWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 12 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 3, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPOCON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * RCOND = ZERO IF( N.EQ.0 ) THEN RCOND = ONE RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN ELSE IF( N.EQ.1 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPW = IPV + NP IPNL = 1 IPNU = IPNL + NQ * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Estimate the 1-norm (or I-norm) of inv(A). * AINVNM = ZERO KASE = 0 NORMIN = 'N' * 10 CONTINUE CALL PZLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), IX, JX, $ DESCX, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( UPPER ) THEN * * Multiply by inv(U'). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( 'Upper', 'Conjugate transpose', 'Non-unit', $ NORMIN, N, A, IA, JA, DESCA, WORK( IPX ), $ IX, JX, DESCX, SL, RWORK( IPNL ), $ WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL NORMIN = 'Y' * * Multiply by inv(U). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( 'Upper', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SU, RWORK( IPNU ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(L). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( 'Lower', 'No transpose', 'Non-unit', NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SL, RWORK( IPNL ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL NORMIN = 'Y' * * Multiply by inv(L'). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( 'Lower', 'Conjugate transpose', 'Non-unit', $ NORMIN, N, A, IA, JA, DESCA, WORK( IPX ), $ IX, JX, DESCX, SU, RWORK( IPNU ), $ WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF * * Multiply by 1/SCALE if doing so will not cause overflow. * SCALE = SL*SU IF( SCALE.NE.ONE ) THEN CALL PZAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL ZGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1 ) ELSE CALL ZGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, 1, $ IAROW, MYCOL ) END IF END IF IF( SCALE.LT.CABS1( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PZDRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / AINVNM ) / ANORM * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PZPOCON * END scalapack-2.0.2/SRC/pzpoequ.f000644 000766 000024 00000031333 10363532303 016213 0ustar00juliestaff000000 000000 SUBROUTINE PZPOEQU( N, A, IA, JA, DESCA, SR, SC, SCOND, AMAX, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, N DOUBLE PRECISION AMAX, SCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION SC( * ), SR( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZPOEQU computes row and column scalings intended to * equilibrate a distributed Hermitian positive definite matrix * sub( A ) = A(IA:IA+N-1,JA:JA+N-1) and reduce its condition number * (with respect to the two-norm). SR and SC contain the scale * factors, S(i) = 1/sqrt(A(i,i)), chosen so that the scaled distri- * buted matrix B with elements B(i,j) = S(i)*A(i,j)*S(j) has ones on * the diagonal. This choice of SR and SC puts the condition number * of B within a factor N of the smallest possible condition number * over all possible diagonal scalings. * * The scaling factor are stored along process rows in SR and along * process columns in SC. The duplication of information simplifies * greatly the application of the factors. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory to an * array of local dimension ( LLD_A, LOCc(JA+N-1) ), the * N-by-N Hermitian positive definite distributed matrix * sub( A ) whose scaling factors are to be computed. Only the * diagonal elements of sub( A ) are referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * SR (local output) DOUBLE PRECISION array, dimension LOCr(M_A) * If INFO = 0, SR(IA:IA+N-1) contains the row scale factors * for sub( A ). SR is aligned with the distributed matrix A, * and replicated across every process column. SR is tied to the * distributed matrix A. * * SC (local output) DOUBLE PRECISION array, dimension LOCc(N_A) * If INFO = 0, SC(JA:JA+N-1) contains the column scale factors * for A(IA:IA+M-1,JA:JA+N-1). SC is aligned with the distribu- * ted matrix A, and replicated down every process row. SC is * tied to the distributed matrix A. * * SCOND (global output) DOUBLE PRECISION * If INFO = 0, SCOND contains the ratio of the smallest SR(i) * (or SC(j)) to the largest SR(i) (or SC(j)), with * IA <= i <= IA+N-1 and JA <= j <= JA+N-1. If SCOND >= 0.1 * and AMAX is neither too large nor too small, it is not worth * scaling by SR (or SC). * * AMAX (global output) DOUBLE PRECISION * Absolute value of largest matrix element. If AMAX is very * close to overflow or very close to underflow, the matrix * should be scaled. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the K-th diagonal entry of sub( A ) is * nonpositive. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. CHARACTER ALLCTOP, COLCTOP, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURCOL, ICURROW, $ IDUMM, II, IIA, IOFFA, IOFFD, IROFF, J, JB, JJ, $ JJA, JN, LDA, LL, MYCOL, MYROW, NP, NPCOL, $ NPROW, NQ DOUBLE PRECISION AII, SMIN * .. * .. Local Arrays .. INTEGER DESCSC( DLEN_ ), DESCSR( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMN2D, $ DGAMX2D, DGSUM2D, IGAMN2D, INFOG2L, $ PCHK1MAT, PB_TOPGET, PXERBLA * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, NUMROC, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(500+CTXT_) ELSE CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, INFO ) CALL PCHK1MAT( N, 1, N, 1, IA, JA, DESCA, 5, 0, IDUMM, IDUMM, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPOEQU', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN SCOND = ONE AMAX = ZERO RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'All', ALLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) * * Compute some local indexes * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ = NUMROC( N+ICOFF, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF IF( MYCOL.EQ.IACOL ) $ NQ = NQ - ICOFF JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) * * Assign descriptors for SR and SC arrays * CALL DESCSET( DESCSR, N, 1, DESCA( MB_ ), 1, 0, 0, ICTXT, $ MAX( 1, NP ) ) CALL DESCSET( DESCSC, 1, N, 1, DESCA( NB_ ), 0, 0, ICTXT, 1 ) * * Initialize the scaling factors to zero. * DO 10 II = IIA, IIA+NP-1 SR( II ) = ZERO 10 CONTINUE * DO 20 JJ = JJA, JJA+NQ-1 SC( JJ ) = ZERO 20 CONTINUE * * Find the minimum and maximum diagonal elements. * Handle first block separately. * II = IIA JJ = JJA JB = JN-JA+1 SMIN = ONE / PDLAMCH( ICTXT, 'S' ) AMAX = ZERO * IOFFA = II+(JJ-1)*LDA IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN IOFFD = IOFFA DO 30 LL = 0, JB-1 AII = DBLE( A( IOFFD ) ) SR( II+LL ) = AII SC( JJ+LL ) = AII SMIN = MIN( SMIN, AII ) AMAX = MAX( AMAX, AII ) IF( AII.LE.ZERO .AND. INFO.EQ.0 ) $ INFO = LL + 1 IOFFD = IOFFD + LDA + 1 30 CONTINUE END IF * IF( MYROW.EQ.IAROW ) THEN II = II + JB IOFFA = IOFFA + JB END IF IF( MYCOL.EQ.IACOL ) THEN JJ = JJ + JB IOFFA = IOFFA + JB*LDA END IF ICURROW = MOD( IAROW+1, NPROW ) ICURCOL = MOD( IACOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 50 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) * IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFD = IOFFA DO 40 LL = 0, JB-1 AII = DBLE( A( IOFFD ) ) SR( II+LL ) = AII SC( JJ+LL ) = AII SMIN = MIN( SMIN, AII ) AMAX = MAX( AMAX, AII ) IF( AII.LE.ZERO .AND. INFO.EQ.0 ) $ INFO = J + LL - JA + 1 IOFFD = IOFFD + LDA + 1 40 CONTINUE END IF * IF( MYROW.EQ.ICURROW ) THEN II = II + JB IOFFA = IOFFA + JB END IF IF( MYCOL.EQ.ICURCOL ) THEN JJ = JJ + JB IOFFA = IOFFA + JB*LDA END IF ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * 50 CONTINUE * * Compute scaling factors * CALL DGSUM2D( ICTXT, 'Columnwise', COLCTOP, 1, NQ, SC( JJA ), $ 1, -1, MYCOL ) CALL DGSUM2D( ICTXT, 'Rowwise', ROWCTOP, NP, 1, SR( IIA ), $ MAX( 1, NP ), -1, MYCOL ) * CALL DGAMX2D( ICTXT, 'All', ALLCTOP, 1, 1, AMAX, 1, IDUMM, IDUMM, $ -1, -1, MYCOL ) CALL DGAMN2D( ICTXT, 'All', ALLCTOP, 1, 1, SMIN, 1, IDUMM, IDUMM, $ -1, -1, MYCOL ) * IF( SMIN.LE.ZERO ) THEN * * Find the first non-positive diagonal element and return. * CALL IGAMN2D( ICTXT, 'All', ALLCTOP, 1, 1, INFO, 1, II, JJ, -1, $ -1, MYCOL ) RETURN * ELSE * * Set the scale factors to the reciprocals * of the diagonal elements. * DO 60 II = IIA, IIA+NP-1 SR( II ) = ONE / SQRT( SR( II ) ) 60 CONTINUE * DO 70 JJ = JJA, JJA+NQ-1 SC( JJ ) = ONE / SQRT( SC( JJ ) ) 70 CONTINUE * * Compute SCOND = min(S(I)) / max(S(I)) * SCOND = SQRT( SMIN ) / SQRT( AMAX ) * END IF * RETURN * * End of PZPOEQU * END scalapack-2.0.2/SRC/pzporfs.f000644 000766 000024 00000101414 10363532303 016211 0ustar00juliestaff000000 000000 SUBROUTINE PZPORFS( UPLO, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, B, IB, JB, DESCB, X, IX, JX, DESCX, $ FERR, BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, $ LRWORK, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), $ DESCX( * ) COMPLEX*16 A( * ), AF( * ), B( * ), WORK( * ), X( * ) DOUBLE PRECISION BERR( * ), FERR( * ), RWORK( * ) * .. * * Purpose * ======= * * PZPORFS improves the computed solution to a system of linear * equations when the coefficient matrix is Hermitian positive definite * and provides error bounds and backward error estimates for the * solutions. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix sub( A ) is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) COMPLEX*16 pointer into the local * memory to an array of local dimension (LLD_A,LOCc(JA+N-1) ). * This array contains the local pieces of the N-by-N Hermitian * distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input) COMPLEX*16 pointer into the local memory * to an array of local dimension (LLD_AF,LOCc(JA+N-1)). * On entry, this array contains the factors L or U from the * Cholesky factorization sub( A ) = L*L**H or U**H*U, as * computed by PZPOTRF. * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * B (local input) COMPLEX*16 pointer into the local memory * to an array of local dimension (LLD_B, LOCc(JB+NRHS-1) ). * On entry, this array contains the the local pieces of the * right hand sides sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input) COMPLEX*16 pointer into the local memory * to an array of local dimension (LLD_X, LOCc(JX+NRHS-1) ). * On entry, this array contains the the local pieces of the * solution vectors sub( X ). On exit, it contains the * improved solution vectors. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). * The estimated forward error bound for each solution vector * of sub( X ). If XTRUE is the true solution corresponding * to sub( X ), FERR is an estimated upper bound for the * magnitude of the largest element in (sub( X ) - XTRUE) * divided by the magnitude of the largest element in sub( X ). * The estimate is as reliable as the estimate for RCOND, and * is almost always a slight overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr( N + MOD( IA-1, MB_A ) ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCr( N + MOD( IB-1, MB_B ) ). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices op( A ) and op( AF ) (respectively * sub( X ) and sub( B ) ) should be distributed the same way on the * same processes. These conditions ensure that sub( A ) and sub( AF ) * (resp. sub( X ) and sub( B ) ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( AF ), sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IAF, DESCAF( MB_ ) ) = f( JAF, DESCAF( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER ITMAX PARAMETER ( ITMAX = 5 ) DOUBLE PRECISION ZERO, RONE, TWO, THREE PARAMETER ( ZERO = 0.0D+0, RONE = 1.0D+0, TWO = 2.0D+0, $ THREE = 3.0D+0 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER INTEGER COUNT, IACOL, IAFCOL, IAFROW, IAROW, IXBCOL, $ IXBROW, IXCOL, IXROW, ICOFFA, ICOFFAF, ICOFFB, $ ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, IIW, $ IOFFXB, IPB, IPR, IPV, IROFFA, IROFFAF, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LRWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ DOUBLE PRECISION EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN COMPLEX*16 ZDUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMX2D, $ INFOG2L, PCHK2MAT, PXERBLA, PZAHEMV, PZAXPY, $ PZCOPY, PZHEMV, PZPOTRS, PZLACON, $ ZGEBR2D, ZGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DIMAG, ICHAR, MAX, MIN, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, N, 2, IAF, JAF, DESCAF, 11, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 15, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IX, JX, DESCX, 19, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFAF = MOD( JAF-1, DESCAF( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFCOL = INDXG2P( JAF, DESCAF( NB_ ), MYCOL, $ DESCAF( CSRC_ ), NPCOL ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 2 * NPMOD LRWMIN = NPMOD WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) RWORK( 1 ) = DBLE( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700 + NB_ ) ELSE IF( DESCA( MB_ ).NE.DESCAF( MB_ ) ) THEN INFO = -( 1100 + MB_ ) ELSE IF( IROFFAF.NE.0 .OR. IAROW.NE.IAFROW ) THEN INFO = -9 ELSE IF( DESCA( NB_ ).NE.DESCAF( NB_ ) ) THEN INFO = -( 1100 + NB_ ) ELSE IF( ICOFFAF.NE.0 .OR. IACOL.NE.IAFCOL ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -( 1100 + CTXT_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -13 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1500 + MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1500 + CTXT_ ) ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 1900 + MB_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -17 ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 1900 + NB_ ) ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -18 ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 1900 + CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -25 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IDUM1( 2 ) = N IDUM2( 2 ) = 2 IDUM1( 3 ) = NRHS IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 23 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 25 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, N, 2, IAF, $ JAF, DESCAF, 11, 0, IDUM1, IDUM2, INFO ) CALL PCHK2MAT( N, 2, NRHS, 3, IB, JB, DESCB, 15, N, 2, NRHS, 3, $ IX, JX, DESCX, 19, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPORFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = 1 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = 1 + maximum number of nonzero entries in each row of sub( A ) * NZ = N + 1 EPS = PDLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) ) * DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 100 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 20 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub(B) - op(sub(A)) * sub(X) * CALL PZCOPY( N, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PZHEMV( UPLO, N, -ONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, ONE, WORK( IPR ), IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i))/(abs(sub(A))*abs(sub(X))+abs(sub(B)) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to * the i-th components of the numerator and denominator * before dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIXB, IIXB + NP - 1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 30 CONTINUE END IF END IF * CALL PZAHEMV( UPLO, N, RONE, A, IA, JA, DESCA, X, IX, JX+K, $ DESCX, 1, RONE, RWORK( IPB ), IW, JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 40 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J) is larger than machine epsilon, and * 2) BERR(J) decreased by at least a factor of 2 during the * last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PZPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PZAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, IX, $ JX+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 20 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(sub(A)))* * ( abs(R) + * NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B)) ))) / norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(sub(A)) is the inverse of sub(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(sub(A))*abs(sub(X))+abs(sub(B))) * is incremented by SAFE1 if the i-th component of * abs(sub(A))*abs(sub(X)) + abs(sub(B)) is less than SAFE2. * * Use PZLACON to estimate the infinity-norm of the matrix * inv(sub(A)) * diag(W), where * W = abs(R) + NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B))))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 50 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 50 CONTINUE END IF END IF * KASE = 0 60 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PZLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(sub(A)'). * CALL PZPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF ELSE * * Multiply by inv(sub(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 80 CONTINUE END IF END IF * CALL PZPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 60 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 90 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 90 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 100 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 200 J = JN+1, JB+NRHS-1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 190 K = 0, JBRHS-1 * COUNT = 1 LSTRES = THREE 110 CONTINUE * * Loop until stopping criterion is satisfied. * * Compute residual R = sub( B ) - sub( A )*sub( X ). * CALL PZCOPY( N, B, IB, J+K, DESCB, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PZHEMV( UPLO, N, -ONE, A, IA, JA, DESCA, X, IX, J+K, $ DESCX, 1, ONE, WORK( IPR ), IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / * ( abs(sub(A))*abs(sub(X)) + abs(sub(B)) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIXB, IIXB+NP-1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 120 CONTINUE END IF END IF * CALL PZAHEMV( UPLO, N, RONE, A, IA, JA, DESCA, X, IX, J+K, $ DESCX, 1, RONE, RWORK( IPB ), IW, JW, DESCW, $ 1 ) * S = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 )THEN DO 130 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 130 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.ICURCOL ) $ BERR( JJFBE ) = S * * Test stopping criterion. Continue iterating if * 1) The residual BERR(J+K) is larger than machine epsilon, * and * 2) BERR(J+K) decreased by at least a factor of 2 during * the last iteration, and * 3) At most ITMAX iterations tried. * IF( S.GT.EPS .AND. TWO*S.LE.LSTRES .AND. $ COUNT.LE.ITMAX ) THEN * * Update solution and try again. * CALL PZPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) CALL PZAXPY( N, ONE, WORK( IPR ), IW, JW, DESCW, 1, X, $ IX, J+K, DESCX, 1 ) LSTRES = S COUNT = COUNT + 1 GO TO 110 END IF * * Bound error from formula * * norm(sub(X) - XTRUE) / norm(sub(X)) .le. FERR = * norm( abs(inv(sub(A)))* * ( abs(R) + NZ*EPS*( * abs(sub(A))*abs(sub(X))+abs(sub(B)) )))/norm(sub(X)) * * where * norm(Z) is the magnitude of the largest component of Z * inv(sub(A)) is the inverse of sub(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of sub(A), * plus 1 * EPS is machine epsilon * * The i-th component of abs(R)+NZ*EPS*(abs(sub(A))*abs(sub(X)) * +abs(sub(B))) is incremented by SAFE1 if the i-th component * of abs(sub(A))*abs(sub(X)) + abs(sub(B)) is less than SAFE2. * * Use PZLACON to estimate the infinity-norm of the matrix * inv(sub(A)) * diag(W), where * W = abs(R) + NZ*EPS*( abs(sub(A))*abs(sub(X))+abs(sub(B))))) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW-1, IIW+NP-2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 140 CONTINUE END IF END IF * KASE = 0 150 CONTINUE IF( MYCOL.EQ.ICURCOL ) THEN CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, ICURCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PZLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = ICURCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(sub(A)'). * CALL PZPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 160 CONTINUE END IF END IF ELSE * * Multiply by inv(sub(A))*diag(W). * IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 170 II = IIW-1, IIW+NP-2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 170 CONTINUE END IF END IF * CALL PZPOTRS( UPLO, N, 1, AF, IAF, JAF, DESCAF, $ WORK( IPR ), IW, JW, DESCW, INFO ) END IF GO TO 150 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.ICURCOL ) THEN IF( NP.GT.0 ) THEN DO 180 II = IIXB, IIXB+NP-1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 180 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, $ IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 190 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 200 CONTINUE * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) RWORK( 1 ) = DBLE( LRWMIN ) * RETURN * * End of PZPORFS * END scalapack-2.0.2/SRC/pzposv.f000644 000766 000024 00000024330 10363532303 016050 0ustar00juliestaff000000 000000 SUBROUTINE PZPOSV( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ) * .. * * Purpose * ======= * * PZPOSV computes the solution to a complex system of linear equations * * sub( A ) * X = sub( B ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is an N-by-N * hermitian distributed positive definite matrix and X and sub( B ) * denoting B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS distributed * matrices. * * The Cholesky decomposition is used to factor sub( A ) as * * sub( A ) = U**H * U, if UPLO = 'U', or * * sub( A ) = L * L**H, if UPLO = 'L', * * where U is an upper triangular matrix and L is a lower triangular * matrix. The factored form of sub( A ) is then used to solve the * system of equations. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if INFO = 0, this array contains the * local pieces of the factor U or L from the Cholesky factori- * zation sub( A ) = U**H*U or L*L**H. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_B,LOC(JB+NRHS-1)). * On entry, the local pieces of the right hand sides distribu- * ted matrix sub( B ). On exit, if INFO = 0, sub( B ) is over- * written with the solution distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed, and the * solution has not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IAROW, IBROW, ICOFFA, ICTXT, IROFFA, IROFFB, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PXERBLA, $ PZPOTRF, PZPOTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1000+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, $ 3, IB, JB, DESCB, 11, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPOSV', -INFO ) RETURN END IF * * Compute the Cholesky factorization sub( A ) = U'*U or L*L'. * CALL PZPOTRF( UPLO, N, A, IA, JA, DESCA, INFO ) * IF( INFO.EQ.0 ) THEN * * Solve the system sub( A ) * X = sub( B ) overwriting sub( B ) * with X. * CALL PZPOTRS( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * END IF * RETURN * * End of PZPOSV * END scalapack-2.0.2/SRC/pzposvx.f000644 000766 000024 00000065714 10363532303 016253 0ustar00juliestaff000000 000000 SUBROUTINE PZPOSVX( FACT, UPLO, N, NRHS, A, IA, JA, DESCA, AF, $ IAF, JAF, DESCAF, EQUED, SR, SC, B, IB, JB, $ DESCB, X, IX, JX, DESCX, RCOND, FERR, BERR, $ WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER EQUED, FACT, UPLO INTEGER IA, IAF, IB, INFO, IX, JA, JAF, JB, JX, LRWORK, $ LWORK, N, NRHS DOUBLE PRECISION RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCAF( * ), DESCB( * ), DESCX( * ) DOUBLE PRECISION BERR( * ), FERR( * ), SC( * ), $ SR( * ), RWORK( * ) COMPLEX*16 A( * ), AF( * ), $ B( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PZPOSVX uses the Cholesky factorization A = U**H*U or A = L*L**H to * compute the solution to a complex system of linear equations * * A(IA:IA+N-1,JA:JA+N-1) * X = B(IB:IB+N-1,JB:JB+NRHS-1), * * where A(IA:IA+N-1,JA:JA+N-1) is an N-by-N matrix and X and * B(IB:IB+N-1,JB:JB+NRHS-1) are N-by-NRHS matrices. * * Error bounds on the solution and a condition estimate are also * provided. In the following comments Y denotes Y(IY:IY+M-1,JY:JY+K-1) * a M-by-K matrix where Y can be A, AF, B and X. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Description * =========== * * The following steps are performed: * * 1. If FACT = 'E', real scaling factors are computed to equilibrate * the system: * diag(SR) * A * diag(SC) * inv(diag(SC)) * X = diag(SR) * B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(SR)*A*diag(SC) and B by diag(SR)*B. * * 2. If FACT = 'N' or 'E', the Cholesky decomposition is used to * factor the matrix A (after equilibration if FACT = 'E') as * A = U**T* U, if UPLO = 'U', or * A = L * L**T, if UPLO = 'L', * where U is an upper triangular matrix and L is a lower triangular * matrix. * * 3. The factored form of A is used to estimate the condition number * of the matrix A. If the reciprocal of the condition number is * less than machine precision, steps 4-6 are skipped. * * 4. The system of equations is solved for X using the factored form * of A. * * 5. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 6. If equilibration was used, the matrix X is premultiplied by * diag(SR) so that it solves the original system before * equilibration. * * Arguments * ========= * * FACT (global input) CHARACTER * Specifies whether or not the factored form of the matrix A is * supplied on entry, and if not, whether the matrix A should be * equilibrated before it is factored. * = 'F': On entry, AF contains the factored form of A. * If EQUED = 'Y', the matrix A has been equilibrated * with scaling factors given by S. A and AF will not * be modified. * = 'N': The matrix A will be copied to AF and factored. * = 'E': The matrix A will be equilibrated if necessary, then * copied to AF and factored. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrices B and X. NRHS >= 0. * * A (local input/local output) COMPLEX*16 pointer into * the local memory to an array of local dimension * ( LLD_A, LOCc(JA+N-1) ). * On entry, the Hermitian matrix A, except if FACT = 'F' and * EQUED = 'Y', then A must contain the equilibrated matrix * diag(SR)*A*diag(SC). If UPLO = 'U', the leading * N-by-N upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. A is not modified if * FACT = 'F' or 'N', or if FACT = 'E' and EQUED = 'N' on exit. * * On exit, if FACT = 'E' and EQUED = 'Y', A is overwritten by * diag(SR)*A*diag(SC). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * AF (local input or local output) COMPLEX*16 pointer * into the local memory to an array of local dimension * ( LLD_AF, LOCc(JA+N-1)). * If FACT = 'F', then AF is an input argument and on entry * contains the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T, in the same storage * format as A. If EQUED .ne. 'N', then AF is the factored form * of the equilibrated matrix diag(SR)*A*diag(SC). * * If FACT = 'N', then AF is an output argument and on exit * returns the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T of the original * matrix A. * * If FACT = 'E', then AF is an output argument and on exit * returns the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T of the equilibrated * matrix A (see the description of A for the form of the * equilibrated matrix). * * IAF (global input) INTEGER * The row index in the global array AF indicating the first * row of sub( AF ). * * JAF (global input) INTEGER * The column index in the global array AF indicating the * first column of sub( AF ). * * DESCAF (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix AF. * * EQUED (global input/global output) CHARACTER * Specifies the form of equilibration that was done. * = 'N': No equilibration (always true if FACT = 'N'). * = 'Y': Equilibration was done, i.e., A has been replaced by * diag(SR) * A * diag(SC). * EQUED is an input variable if FACT = 'F'; otherwise, it is an * output variable. * * SR (local input/local output) COMPLEX*16 array, * dimension (LLD_A) * The scale factors for A distributed across process rows; * not accessed if EQUED = 'N'. SR is an input variable if * FACT = 'F'; otherwise, SR is an output variable. * If FACT = 'F' and EQUED = 'Y', each element of SR must be * positive. * * SC (local input/local output) COMPLEX*16 array, * dimension (LOC(N_A)) * The scale factors for A distributed across * process columns; not accessed if EQUED = 'N'. SC is an input * variable if FACT = 'F'; otherwise, SC is an output variable. * If FACT = 'F' and EQUED = 'Y', each element of SC must be * positive. * * B (local input/local output) COMPLEX*16 pointer into * the local memory to an array of local dimension * ( LLD_B, LOCc(JB+NRHS-1) ). * On entry, the N-by-NRHS right-hand side matrix B. * On exit, if EQUED = 'N', B is not modified; if TRANS = 'N' * and EQUED = 'R' or 'B', B is overwritten by diag(R)*B; if * TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is overwritten * by diag(C)*B. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input/local output) COMPLEX*16 pointer into * the local memory to an array of local dimension * ( LLD_X, LOCc(JX+NRHS-1) ). * If INFO = 0, the N-by-NRHS solution matrix X to the original * system of equations. Note that A and B are modified on exit * if EQUED .ne. 'N', and the solution to the equilibrated * system is inv(diag(SC))*X if TRANS = 'N' and EQUED = 'C' or * 'B', or inv(diag(SR))*X if TRANS = 'T' or 'C' and EQUED = 'R' * or 'B'. * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * RCOND (global output) DOUBLE PRECISION * The estimate of the reciprocal condition number of the matrix * A after equilibration (if done). If RCOND is less than the * machine precision (in particular, if RCOND = 0), the matrix * is singular to working precision. This condition is * indicated by a return code of INFO > 0, and the solution and * error bounds are not computed. * * FERR (local output) DOUBLE PRECISION array, dimension (LOC(N_B)) * The estimated forward error bounds for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution, FERR(j) bounds the magnitude * of the largest entry in (X(j) - XTRUE) divided by * the magnitude of the largest entry in X(j). The quality of * the error bound depends on the quality of the estimate of * norm(inv(A)) computed in the code; if the estimate of * norm(inv(A)) is accurate, the error bound is guaranteed. * * BERR (local output) DOUBLE PRECISION array, dimension (LOC(N_B)) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any entry of A or B that makes X(j) an exact solution). * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK = MAX( PZPOCON( LWORK ), PZPORFS( LWORK ) ) * + LOCr( N_A ). * LWORK = 3*DESCA( LLD_ ) * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK = 2*LOCc(N_A). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, and i is * <= N: if INFO = i, the leading minor of order i of A * is not positive definite, so the factorization * could not be completed, and the solution and error * bounds could not be computed. * = N+1: RCOND is less than machine precision. The * factorization has been completed, but the matrix * is singular to working precision, and the solution * and error bounds have not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL EQUIL, LQUERY, NOFACT, RCEQU INTEGER I, IACOL, IAROW, IAFROW, IBROW, IBCOL, ICOFF, $ ICOFFA, ICTXT, IDUMM, IIA, IIB, IIX, INFEQU, $ IROFF, IROFFA, IROFFAF, IROFFB, IROFFX, IXCOL, $ IXROW, J, JJA, JJB, JJX, LDB, LDX, LRWMIN, $ LWMIN, MYCOL, MYROW, NP, NPCOL, NPROW, NRHSQ, $ NQ DOUBLE PRECISION AMAX, ANORM, BIGNUM, SCOND, SMAX, SMIN, SMLNUM * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, $ DGAMN2D, DGAMX2D, INFOG2L, $ PXERBLA, PZPOCON, PZPOEQU, $ PZPORFS, PZPOTRF, PZPOTRS, $ PZLACPY, PZLAQSY * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PZLANHE EXTERNAL PDLAMCH, INDXG2P, LSAME, NUMROC, PZLANHE * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(800+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 8, INFO ) IF( LSAME( FACT, 'F' ) ) $ CALL CHK1MAT( N, 3, N, 3, IAF, JAF, DESCAF, 12, INFO ) CALL CHK1MAT( N, 3, NRHS, 4, IB, JB, DESCB, 20, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IAFROW = INDXG2P( IAF, DESCAF( MB_ ), MYROW, $ DESCAF( RSRC_ ), NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFAF = MOD( IAF-1, DESCAF( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IIA, JJA, IAROW, IACOL ) NP = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP-IROFFA NQ = NUMROC( N+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, NPCOL ) IF( MYCOL.EQ.IACOL ) $ NQ = NQ-ICOFFA LWMIN = 3*DESCA( LLD_ ) LRWMIN = MAX( 2*NQ, NP ) NOFACT = LSAME( FACT, 'N' ) EQUIL = LSAME( FACT, 'E' ) IF( NOFACT .OR. EQUIL ) THEN EQUED = 'N' RCEQU = .FALSE. ELSE RCEQU = LSAME( EQUED, 'Y' ) SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM END IF IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. $ .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( IROFFA.NE.0 ) THEN INFO = -6 ELSE IF( ICOFFA.NE.0 .OR. IROFFA.NE.ICOFFA ) THEN INFO = -7 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(800+NB_) ELSE IF( IAFROW.NE.IAROW ) THEN INFO = -10 ELSE IF( IROFFAF.NE.0 ) THEN INFO = -10 ELSE IF( ICTXT.NE.DESCAF( CTXT_ ) ) THEN INFO = -(1200+CTXT_) ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. $ ( RCEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN INFO = -13 ELSE IF( RCEQU ) THEN * SMIN = BIGNUM SMAX = ZERO DO 10 J = IIA, IIA + NP - 1 SMIN = MIN( SMIN, SR( J ) ) SMAX = MAX( SMAX, SR( J ) ) 10 CONTINUE CALL DGAMN2D( ICTXT, 'Columnwise', ' ', 1, 1, SMIN, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'Columnwise', ' ', 1, 1, SMAX, $ 1, IDUMM, IDUMM, -1, -1, MYCOL ) IF( SMIN.LE.ZERO ) THEN INFO = -14 ELSE IF( N.GT.0 ) THEN SCOND = MAX( SMIN, SMLNUM ) / MIN( SMAX, BIGNUM ) ELSE SCOND = ONE END IF END IF END IF END IF * WORK( 1 ) = DBLE( LWMIN ) RWORK( 1 ) = DBLE( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( INFO.EQ.0 ) THEN IF( IBROW.NE.IAROW ) THEN INFO = -18 ELSE IF( IXROW.NE.IBROW ) THEN INFO = -22 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(2000+NB_) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -(2000+CTXT_) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -(2400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -28 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -30 END IF IDUM1( 1 ) = ICHAR( FACT ) IDUM2( 1 ) = 1 IDUM1( 2 ) = ICHAR( UPLO ) IDUM2( 2 ) = 2 IF( LSAME( FACT, 'F' ) ) THEN IDUM1( 3 ) = ICHAR( EQUED ) IDUM2( 3 ) = 13 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 28 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 30 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, $ 4, IB, JB, DESCB, 19, 5, IDUM1, IDUM2, $ INFO ) ELSE IF( LWORK.EQ.-1 ) THEN IDUM1( 3 ) = -1 ELSE IDUM1( 3 ) = 1 END IF IDUM2( 3 ) = 28 IF( LRWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 30 CALL PCHK2MAT( N, 3, N, 3, IA, JA, DESCA, 8, N, 3, NRHS, $ 4, IB, JB, DESCB, 19, 4, IDUM1, IDUM2, $ INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPOSVX', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * IF( EQUIL ) THEN * * Compute row and column scalings to equilibrate the matrix A. * CALL PZPOEQU( N, A, IA, JA, DESCA, SR, SC, SCOND, AMAX, $ INFEQU ) * IF( INFEQU.EQ.0 ) THEN * * Equilibrate the matrix. * CALL PZLAQSY( UPLO, N, A, IA, JA, DESCA, SR, SC, SCOND, $ AMAX, EQUED ) RCEQU = LSAME( EQUED, 'Y' ) END IF END IF * * Scale the right-hand side. * CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, IIB, $ JJB, IBROW, IBCOL ) LDB = DESCB( LLD_ ) IROFF = MOD( IB-1, DESCB( MB_ ) ) ICOFF = MOD( JB-1, DESCB( NB_ ) ) NP = NUMROC( N+IROFF, DESCB( MB_ ), MYROW, IBROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFF, DESCB( NB_ ), MYCOL, IBCOL, NPCOL ) IF( MYROW.EQ.IBROW ) NP = NP-IROFF IF( MYCOL.EQ.IBCOL ) NRHSQ = NRHSQ-ICOFF * IF( RCEQU ) THEN DO 30 J = JJB, JJB+NRHSQ-1 DO 20 I = IIB, IIB+NP-1 B( I + ( J-1 )*LDB ) = SR( I )*B( I + ( J-1 )*LDB ) 20 CONTINUE 30 CONTINUE END IF * IF( NOFACT .OR. EQUIL ) THEN * * Compute the Cholesky factorization A = U'*U or A = L*L'. * CALL PZLACPY( 'Full', N, N, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF ) CALL PZPOTRF( UPLO, N, AF, IAF, JAF, DESCAF, INFO ) * * Return if INFO is non-zero. * IF( INFO.NE.0 ) THEN IF( INFO.GT.0 ) $ RCOND = ZERO RETURN END IF END IF * * Compute the norm of the matrix A. * ANORM = PZLANHE( '1', UPLO, N, A, IA, JA, DESCA, RWORK ) * * Compute the reciprocal of the condition number of A. * CALL PZPOCON( UPLO, N, AF, IAF, JAF, DESCAF, ANORM, RCOND, WORK, $ LWORK, RWORK, LRWORK, INFO ) * * Return if the matrix is singular to working precision. * IF( RCOND.LT.PDLAMCH( ICTXT, 'Epsilon' ) ) THEN INFO = IA + N RETURN END IF * * Compute the solution matrix X. * CALL PZLACPY( 'Full', N, NRHS, B, IB, JB, DESCB, X, IX, JX, $ DESCX ) CALL PZPOTRS( UPLO, N, NRHS, AF, IAF, JAF, DESCAF, X, IX, JX, $ DESCX, INFO ) * * Use iterative refinement to improve the computed solution and * compute error bounds and backward error estimates for it. * CALL PZPORFS( UPLO, N, NRHS, A, IA, JA, DESCA, AF, IAF, JAF, $ DESCAF, B, IB, JB, DESCB, X, IX, JX, DESCX, FERR, $ BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * Transform the solution matrix X to a solution of the original * system. * CALL INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) LDX = DESCX( LLD_ ) IROFF = MOD( IX-1, DESCX( MB_ ) ) ICOFF = MOD( JX-1, DESCX( NB_ ) ) NP = NUMROC( N+IROFF, DESCX( MB_ ), MYROW, IXROW, NPROW ) NRHSQ = NUMROC( NRHS+ICOFF, DESCX( NB_ ), MYCOL, IXCOL, NPCOL ) IF( MYROW.EQ.IBROW ) NP = NP-IROFF IF( MYCOL.EQ.IBCOL ) NRHSQ = NRHSQ-ICOFF * IF( RCEQU ) THEN DO 50 J = JJX, JJX+NRHSQ-1 DO 40 I = IIX, IIX+NP-1 X( I + ( J-1 )*LDX ) = SR( I )*X( I + ( J-1 )*LDX ) 40 CONTINUE 50 CONTINUE DO 60 J = JJX, JJX+NRHSQ-1 FERR( J ) = FERR( J ) / SCOND 60 CONTINUE END IF * WORK( 1 ) = DBLE( LWMIN ) RWORK( 1 ) = DBLE( LRWMIN ) RETURN * * End of PZPOSVX * END scalapack-2.0.2/SRC/pzpotf2.f000644 000766 000024 00000031337 10363532303 016120 0ustar00juliestaff000000 000000 SUBROUTINE PZPOTF2( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZPOTF2 computes the Cholesky factorization of a complex hermitian * positive definite distributed matrix sub( A )=A(IA:IA+N-1,JA:JA+N-1). * * The factorization has the form * * sub( A ) = U' * U , if UPLO = 'U', or * * sub( A ) = L * L', if UPLO = 'L', * * where U is an upper triangular matrix and L is lower triangular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires N <= NB_A-MOD(JA-1, NB_A) and square block * decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N symmetric distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if UPLO = 'U', the upper triangular * part of the distributed matrix contains the Cholesky factor * U, if UPLO = 'L', the lower triangular part of the distribu- * ted matrix contains the Cholesky factor L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) COMPLEX*16 CONE PARAMETER ( CONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, ICURR, IDIAG, IIA, $ IOFFA, IROFF, J, JJA, LDA, MYCOL, MYROW, $ NPCOL, NPROW DOUBLE PRECISION AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, IGEBR2D, $ IGEBS2D, INFOG2L, PB_TOPGET, PXERBLA, ZGEMV, $ ZLACGV, ZDSCAL * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MOD, SQRT * .. * .. External Functions .. LOGICAL LSAME COMPLEX*16 ZDOTC EXTERNAL LSAME, ZDOTC * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N+ICOFF.GT.DESCA( NB_ ) ) THEN INFO = -2 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPOTF2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Compute local information * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF ( UPPER ) THEN * * Process (IAROW, IACOL) owns block to be factorized * IF( MYROW.EQ.IAROW ) THEN IF( MYCOL.EQ.IACOL ) THEN * * Compute the Cholesky factorization A = U'*U. * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * DO 10 J = JA, JA+N-1 * * Compute U(J,J) and test for non-positive-definiteness. * AJJ = DBLE( A( IDIAG ) ) - $ ZDOTC( J-JA, A( IOFFA ), 1, A( IOFFA ), 1 ) IF( AJJ.LE.ZERO ) THEN A( IDIAG ) = AJJ INFO = J - JA + 1 GO TO 20 END IF AJJ = SQRT( AJJ ) A( IDIAG ) = AJJ * * Compute elements J+1:JA+N-1 of row J. * IF( J.LT.JA+N-1 ) THEN ICURR = IDIAG + LDA CALL ZLACGV( J-JA, A( IOFFA ), 1 ) CALL ZGEMV( 'Transpose', J-JA, JA+N-J-1, -CONE, $ A( IOFFA+LDA ), LDA, A( IOFFA ), 1, $ CONE, A( ICURR ), LDA ) CALL ZLACGV( J-JA, A( IOFFA ), 1 ) CALL ZDSCAL( JA+N-J-1, ONE / AJJ, A( ICURR ), $ LDA ) END IF IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + LDA 10 CONTINUE * 20 CONTINUE * * Broadcast INFO to all processes in my IAROW. * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1, $ MYROW, IACOL ) END IF * * IAROW bcasts along columns so that everyone has INFO * CALL IGEBS2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, 1, $ IAROW, MYCOL ) * END IF * ELSE * * Process (IAROW, IACOL) owns block to be factorized * IF( MYCOL.EQ.IACOL ) THEN IF( MYROW.EQ.IAROW ) THEN * * Compute the Cholesky factorization A = L*L'. * LDA = DESCA( LLD_ ) IDIAG = IIA + ( JJA - 1 ) * LDA IOFFA = IDIAG * DO 30 J = JA, JA+N-1 * * Compute L(J,J) and test for non-positive-definiteness. * AJJ = DBLE( A( IDIAG ) ) - $ ZDOTC( J-JA, A( IOFFA ), LDA, A( IOFFA ), LDA ) IF ( AJJ.LE.ZERO ) THEN A( IDIAG ) = AJJ INFO = J - JA + 1 GO TO 40 END IF AJJ = SQRT( AJJ ) A( IDIAG ) = AJJ * * Compute elements J+1:JA+N-1 of column J. * IF( J.LT.JA+N-1 ) THEN ICURR = IDIAG + 1 CALL ZLACGV( J-JA, A( IOFFA ), LDA ) CALL ZGEMV( 'No transpose', JA+N-J-1, J-JA, -CONE, $ A( IOFFA+1 ), LDA, A( IOFFA ), LDA, $ CONE, A( ICURR ), 1 ) CALL ZLACGV( J-JA, A( IOFFA ), LDA ) CALL ZDSCAL( JA+N-J-1, ONE / AJJ, A( ICURR ), 1 ) END IF IDIAG = IDIAG + LDA + 1 IOFFA = IOFFA + 1 30 CONTINUE * 40 CONTINUE * * Broadcast INFO to everyone in IACOL * CALL IGEBS2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, $ 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Columnwise', COLBTOP, 1, 1, INFO, $ 1, IAROW, MYCOL ) * END IF * * IACOL bcasts INFO along rows so that everyone has it * CALL IGEBS2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1 ) * ELSE * CALL IGEBR2D( ICTXT, 'Rowwise', ROWBTOP, 1, 1, INFO, 1, $ MYROW, IACOL ) * END IF * END IF * RETURN * * End of PZPOTF2 * END scalapack-2.0.2/SRC/pzpotrf.f000644 000766 000024 00000031650 10363532303 016216 0ustar00juliestaff000000 000000 SUBROUTINE PZPOTRF( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZPOTRF computes the Cholesky factorization of an N-by-N complex * hermitian positive definite distributed matrix sub( A ) denoting * A(IA:IA+N-1, JA:JA+N-1). * * The factorization has the form * * sub( A ) = U' * U , if UPLO = 'U', or * * sub( A ) = L * L', if UPLO = 'L', * * where U is an upper triangular matrix and L is lower triangular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * N-by-N Hermitian distributed matrix sub( A ) to be factored. * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. On exit, if UPLO = 'U', the upper triangular * part of the distributed matrix contains the Cholesky factor * U, if UPLO = 'L', the lower triangular part of the distribu- * ted matrix contains the Cholesky factor L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, the leading minor of order K, * A(IA:IA+K-1,JA:JA+K-1) is not positive definite, and * the factorization could not be completed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) COMPLEX*16 CONE PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER CHARACTER COLBTOP, ROWBTOP INTEGER I, ICOFF, ICTXT, IROFF, J, JB, JN, MYCOL, $ MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZPOTF2, PZHERK, $ PZTRSM * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFF.NE.0 ) THEN INFO = -4 ELSE IF( ICOFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPOTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( UPPER ) THEN * * Split-ring topology for the communication along process * columns, 1-tree topology along process rows. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'S-ring' ) * * A is upper triangular, compute Cholesky factorization A = U'*U. * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA(NB_), JA+N-1 ) JB = JN - JA + 1 * * Perform unblocked Cholesky factorization on JB block * CALL PZPOTF2( UPLO, JB, A, IA, JA, DESCA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 * IF( JB+1.LE.N ) THEN * * Form the row panel of U using the triangular solver * CALL PZTRSM( 'Left', UPLO, 'Conjugate transpose', $ 'Non-Unit', JB, N-JB, CONE, A, IA, JA, DESCA, $ A, IA, JA+JB, DESCA ) * * Update the trailing matrix, A = A - U'*U * CALL PZHERK( UPLO, 'Conjugate transpose', N-JB, JB, -ONE, A, $ IA, JA+JB, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) END IF * * Loop over remaining block of columns * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Perform unblocked Cholesky factorization on JB block * CALL PZPOTF2( UPLO, JB, A, I, J, DESCA, INFO ) IF( INFO.NE.0 ) THEN INFO = INFO + J - JA GO TO 30 END IF * IF( J-JA+JB+1.LE.N ) THEN * * Form the row panel of U using the triangular solver * CALL PZTRSM( 'Left', UPLO, 'Conjugate transpose', $ 'Non-Unit', JB, N-J-JB+JA, CONE, A, I, J, $ DESCA, A, I, J+JB, DESCA ) * * Update the trailing matrix, A = A - U'*U * CALL PZHERK( UPLO, 'Conjugate transpose', N-J-JB+JA, JB, $ -ONE, A, I, J+JB, DESCA, ONE, A, I+JB, $ J+JB, DESCA ) END IF 10 CONTINUE * ELSE * * 1-tree topology for the communication along process columns, * Split-ring topology along process rows. * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'S-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * A is lower triangular, compute Cholesky factorization A = L*L' * (right-looking) * * Handle the first block of columns separately * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) JB = JN - JA + 1 * * Perform unblocked Cholesky factorization on JB block * CALL PZPOTF2( UPLO, JB, A, IA, JA, DESCA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 * IF( JB+1.LE.N ) THEN * * Form the column panel of L using the triangular solver * CALL PZTRSM( 'Right', UPLO, 'Conjugate transpose', $ 'Non-Unit', N-JB, JB, CONE, A, IA, JA, DESCA, $ A, IA+JB, JA, DESCA ) * * Update the trailing matrix, A = A - L*L' * CALL PZHERK( UPLO, 'No Transpose', N-JB, JB, -ONE, A, IA+JB, $ JA, DESCA, ONE, A, IA+JB, JA+JB, DESCA ) * END IF * DO 20 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( N-J+JA, DESCA( NB_ ) ) I = IA + J - JA * * Perform unblocked Cholesky factorization on JB block * CALL PZPOTF2( UPLO, JB, A, I, J, DESCA, INFO ) IF( INFO.NE.0 ) THEN INFO = INFO + J - JA GO TO 30 END IF * IF( J-JA+JB+1.LE.N ) THEN * * Form the column panel of L using the triangular solver * CALL PZTRSM( 'Right', UPLO, 'Conjugate transpose', $ 'Non-Unit', N-J-JB+JA, JB, CONE, A, I, J, $ DESCA, A, I+JB, J, DESCA ) * * Update the trailing matrix, A = A - L*L' * CALL PZHERK( UPLO, 'No Transpose', N-J-JB+JA, JB, -ONE, $ A, I+JB, J, DESCA, ONE, A, I+JB, J+JB, $ DESCA ) * END IF 20 CONTINUE * END IF * 30 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * RETURN * * End of PZPOTRF * END scalapack-2.0.2/SRC/pzpotri.f000644 000766 000024 00000017156 10363532303 016226 0ustar00juliestaff000000 000000 SUBROUTINE PZPOTRI( UPLO, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZPOTRI computes the inverse of a complex Hermitian positive definite * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the * Cholesky factorization sub( A ) = U**H*U or L*L**H computed by * PZPOTRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the triangular factor U or L * from the Cholesky factorization of the distributed matrix * sub( A ) = U**H*U or L*L**H, as computed by PZPOTRF. * On exit, the local pieces of the upper or lower triangle of * the (Hermitian) inverse of sub( A ), overwriting the input * factor U or L. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, the (i,i) element of the factor U or L is * zero, and the inverse could not be computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER ICOFF, ICTXT, IROFF, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PXERBLA, $ PZLAUUM, PZTRTRI * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.NE.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -5 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(600+NB_) END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK1MAT( N, 2, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPOTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Invert the triangular Cholesky factor U or L. * CALL PZTRTRI( UPLO, 'Non-unit', N, A, IA, JA, DESCA, INFO ) * IF( INFO.GT.0 ) $ RETURN * * Form inv(U)*inv(U)' or inv(L)'*inv(L). * CALL PZLAUUM( UPLO, N, A, IA, JA, DESCA ) * RETURN * * End of PZPOTRI * END scalapack-2.0.2/SRC/pzpotrs.f000644 000766 000024 00000024025 10363532303 016231 0ustar00juliestaff000000 000000 SUBROUTINE PZPOTRS( UPLO, N, NRHS, A, IA, JA, DESCA, B, IB, JB, $ DESCB, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ) * .. * * Purpose * ======= * * PZPOTRS solves a system of linear equations * * sub( A ) * X = sub( B ) * A(IA:IA+N-1,JA:JA+N-1)*X = B(IB:IB+N-1,JB:JB+NRHS-1) * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is a N-by-N * hermitian positive definite distributed matrix using the Cholesky * factorization sub( A ) = U**H*U or L*L**H computed by PZPOTRF. * sub( B ) denotes the distributed matrix B(IB:IB+N-1,JB:JB+NRHS-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * This routine requires square block decomposition ( MB_A = NB_A ). * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of sub( A ) is stored; * = 'L': Lower triangle of sub( A ) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix sub( B ). NRHS >= 0. * * A (local input) COMPLEX*16 pointer into local memory to * an array of dimension (LLD_A, LOCc(JA+N-1)). On entry, this * array contains the factors L or U from the Cholesky facto- * rization sub( A ) = L*L**H or U**H*U, as computed by PZPOTRF. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of local dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * the local pieces of the right hand sides sub( B ). * On exit, this array contains the local pieces of the solution * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER IAROW, IBROW, ICTXT, IROFFA, IROFFB, ICOFFA, $ MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PXERBLA, $ PZTRSM * .. * .. External Functions .. LOGICAL LSAME INTEGER INDXG2P EXTERNAL INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MOD * .. * .. Executable Statements .. * * Get grid parameters. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 2, N, 2, IA, JA, DESCA, 7, INFO ) CALL CHK1MAT( N, 2, NRHS, 3, IB, JB, DESCB, 11, INFO ) UPPER = LSAME( UPLO, 'U' ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IF ( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( IROFFA.NE.0 ) THEN INFO = -5 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) ELSE IF( IROFFB.NE.0 .OR. IBROW.NE.IAROW ) THEN INFO = -9 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(1100+NB_) END IF END IF IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 CALL PCHK2MAT( N, 2, N, 2, IA, JA, DESCA, 7, N, 2, NRHS, $ 3, IB, JB, DESCB, 11, 1, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPOTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Solve sub( A ) * X = sub( B ) where sub( A ) = U'*U. * * Solve U'*X = sub( B ), overwriting sub( B ) with X. * CALL PZTRSM( 'Left', 'Upper', 'Conjugate transpose', $ 'Non-unit', N, NRHS, ONE, A, IA, JA, DESCA, B, IB, $ JB, DESCB ) * * Solve U*X = sub( B ), overwriting sub( B ) with X. * CALL PZTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) ELSE * * Solve sub( A ) *X = sub( B ) where sub( A ) = L*L'. * * Solve L*X = sub( B ), overwriting sub( B ) with X. * CALL PZTRSM( 'Left', 'Lower', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, IA, JA, DESCA, B, IB, JB, DESCB ) * * Solve L'*X = sub( B ), overwriting sub( B ) with X. * CALL PZTRSM( 'Left', 'Lower', 'Conjugate transpose', $ 'Non-unit', N, NRHS, ONE, A, IA, JA, DESCA, B, IB, $ JB, DESCB ) END IF * RETURN * * End of PZPOTRS * END scalapack-2.0.2/SRC/pzptsv.f000644 000766 000024 00000046055 10363532303 016065 0ustar00juliestaff000000 000000 SUBROUTINE PZPTSV( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, $ WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IB, INFO, JA, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 B( * ), E( * ), WORK( * ) DOUBLE PRECISION D( * ) * .. * * * Purpose * ======= * * PZPTSV solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is an N-by-N complex * tridiagonal symmetric positive definite distributed * matrix. * * Cholesky factorization is used to factor a reordering of * the matrix into L L'. * * See PZPTTRF and PZPTTRS for details. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (12*NPCOL + 3*NB) * +max((10+2*min(100,NRHS))*NPCOL+4*NRHS, 8*NPCOL) * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW, $ WS_FACTOR * .. * .. External Subroutines .. EXTERNAL PXERBLA, PZPTTRF, PZPTTRS * .. * .. Executable Statements .. * * Note: to avoid duplication, most error checking is not performed * in this routine and is left to routines * PZPTTRF and PZPTTRS. * * Begin main code * INFO = 0 * * Get block size to calculate workspace requirements * IF( DESCA( DTYPE_ ) .EQ. BLOCK_CYCLIC_2D ) THEN NB = DESCA( NB_ ) ICTXT = DESCA( CTXT_ ) ELSEIF( DESCA( DTYPE_ ) .EQ. 501 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSEIF( DESCA( DTYPE_ ) .EQ. 502 ) THEN NB = DESCA( 4 ) ICTXT = DESCA( 2 ) ELSE INFO = -( 5*100 + DTYPE_ ) CALL PXERBLA( ICTXT, $ 'PZPTSV', $ -INFO ) RETURN ENDIF * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * * Size needed for AF in factorization * WS_FACTOR = (12*NPCOL + 3*NB) * * Factor the matrix * CALL PZPTTRF( N, D, E, JA, DESCA, WORK, MIN( LWORK, WS_FACTOR ), $ WORK( 1+WS_FACTOR ), LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN IF( INFO .LT. 0 ) THEN CALL PXERBLA( ICTXT, 'PZPTSV', -INFO ) ENDIF RETURN END IF * * Solve the system using the factorization * CALL PZPTTRS( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, WORK, $ MIN( LWORK, WS_FACTOR ), WORK( 1+WS_FACTOR), $ LWORK-WS_FACTOR, INFO ) * * Check info for error conditions * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZPTSV', -INFO ) RETURN END IF * RETURN * * End of PZPTSV * END scalapack-2.0.2/SRC/pzpttrf.f000644 000766 000024 00000105105 11750130340 016214 0ustar00juliestaff000000 000000 SUBROUTINE PZPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. INTEGER INFO, JA, LAF, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 AF( * ), E( * ), WORK( * ) DOUBLE PRECISION D( * ) * .. * * * Purpose * ======= * * PZPTTRF computes a Cholesky factorization * of an N-by-N complex tridiagonal * symmetric positive definite distributed matrix * A(1:N, JA:JA+N-1). * Reordering is used to increase parallelism in the factorization. * This reordering results in factors that are DIFFERENT from those * produced by equivalent sequential codes. These factors cannot * be used directly by users; however, they can be used in * subsequent calls to PZPTTRS to solve linear systems. * * The factorization has the form * * P A(1:N, JA:JA+N-1) P^T = U' D U or * * P A(1:N, JA:JA+N-1) P^T = L D L', * * where U is a tridiagonal upper triangular matrix and L is tridiagonal * lower triangular, and P is a permutation matrix. * * ===================================================================== * * Arguments * ========= * * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * D (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZPTTRF and this is stored in AF. If a linear system * is to be solved using PZPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * 8*NPCOL * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K<=NPROCS, the submatrix stored on processor * INFO and factored locally was not * positive definite, and * the factorization was not completed. * If INFO = K>NPROCS, the submatrix stored on processor * INFO-NPROCS representing interactions with other * processors was not * positive definite, * and the factorization was not completed. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT, $ ICTXT_NEW, ICTXT_SAVE, IDUM3, INT_TEMP, JA_NEW, $ LAF_MIN, LEVEL_DIST, LLDA, MYCOL, MYROW, $ MY_NUM_COLS, NB, NP, NPCOL, NPROW, NP_SAVE, $ ODD_SIZE, PART_OFFSET, PART_SIZE, RETURN_CODE, $ STORE_N_A, TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ DESC_CONVERT, GLOBCHK, PXERBLA, RESHAPE, ZAXPY, $ ZGEMM, ZGERV2D, ZGESD2D, ZLAMOV, ZLATCPY, $ ZPBTRF, ZPOTRF, ZSYRK, ZTBTRS, ZTRMM, ZTRRV2D, $ ZTRSD2D, ZTRSM, ZTRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 5*100 + 2 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LWORK .LT. -1) THEN INFO = -9 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -1 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 5*100 + 6 ) ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 5*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 1 ) CALL PXERBLA( ICTXT, $ 'PZPTTRF, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 5*100+4 ) CALL PXERBLA( ICTXT, $ 'PZPTTRF, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * * Check auxiliary storage size * LAF_MIN = (12*NPCOL + 3*NB) * IF( LAF .LT. LAF_MIN ) THEN INFO = -7 * put minimum value of laf into AF( 1 ) AF( 1 ) = LAF_MIN CALL PXERBLA( ICTXT, $ 'PZPTTRF: auxiliary storage error ', $ -INFO ) RETURN ENDIF * * Check worksize * WORK_SIZE_MIN = 8*NPCOL * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -9 CALL PXERBLA( ICTXT, $ 'PZPTTRF: worksize error ', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 7, 1 ) = DESCA(5) PARAM_CHECK( 6, 1 ) = DESCA(4) PARAM_CHECK( 5, 1 ) = DESCA(3) PARAM_CHECK( 4, 1 ) = DESCA(1) PARAM_CHECK( 3, 1 ) = JA PARAM_CHECK( 2, 1 ) = N PARAM_CHECK( 1, 1 ) = IDUM3 * PARAM_CHECK( 7, 2 ) = 505 PARAM_CHECK( 6, 2 ) = 504 PARAM_CHECK( 5, 2 ) = 503 PARAM_CHECK( 4, 2 ) = 501 PARAM_CHECK( 3, 2 ) = 4 PARAM_CHECK( 2, 2 ) = 1 PARAM_CHECK( 1, 2 ) = 9 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 7, PARAM_CHECK, 7, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZPTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * * Zero out space for fillin * DO 10 I=1, LAF_MIN AF( I ) = CZERO 10 CONTINUE * * Begin main code * * ******************************************************************** * PHASE 1: Local computation phase. ******************************************************************** * * IF ( MYCOL .LT. NP-1 ) THEN * Transfer last triangle D_i of local matrix to next processor * which needs it to calculate fillin due to factorization of * its main (odd) block A_i. * Overlap the send with the factorization of A_i. * CALL ZTRSD2D( ICTXT, 'U', 'N', 1, 1, $ E( PART_OFFSET+ODD_SIZE+1 ), LLDA-1, 0, $ MYCOL+1 ) * ENDIF * * * Factor main partition A_i = L_i {L_i}^C in each processor * Or A_i = {U_i}^C {U_i} if E is the upper superdiagonal * CALL ZPTTRF( ODD_SIZE, D( PART_OFFSET+1 ), E( PART_OFFSET+1 ), $ INFO ) * IF( INFO.NE.0 ) THEN INFO = MYCOL+1 GOTO 1500 ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Apply factorization to odd-even connection block B_i * * * Perform the triangular system solve {L_i}{{B'}_i}^C = {B_i}^C * by dividing B_i by diagonal element * E( PART_OFFSET+ODD_SIZE ) = E( PART_OFFSET+ODD_SIZE )/ $ D( PART_OFFSET+ODD_SIZE ) * * * * Compute contribution to diagonal block(s) of reduced system. * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^C * D( PART_OFFSET+ODD_SIZE+1 ) = D( PART_OFFSET+ODD_SIZE+1 )- $ D( PART_OFFSET+ODD_SIZE )*DBLE( E( PART_OFFSET+ODD_SIZE )* $ DCONJG( E( PART_OFFSET+ODD_SIZE ) ) ) * ENDIF * End of "if ( MYCOL .lt. NP-1 )..." loop * * 1500 CONTINUE * If the processor could not locally factor, it jumps here. * IF ( MYCOL .NE. 0 ) THEN * * Receive previously transmitted matrix section, which forms * the right-hand-side for the triangular solve that calculates * the "spike" fillin. * * CALL ZTRRV2D( ICTXT, 'U', 'N', 1, 1, AF( 1 ), ODD_SIZE, 0, $ MYCOL-1 ) * IF (INFO.EQ.0) THEN * * Calculate the "spike" fillin, ${L_i} {{G}_i}^C = {D_i}$ . * CALL ZPTTRSV( 'L', 'N', ODD_SIZE, INT_ONE, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), AF( 1 ), ODD_SIZE, INFO ) * * Divide by D * DO 20 I=1, ODD_SIZE AF( I ) = AF( I )/D( PART_OFFSET+I ) 20 CONTINUE * * * Calculate the update block for previous proc, E_i = G_i{G_i}^C * * * Since there is no element-by-element vector multiplication in * the BLAS, this loop must be hardwired in without a BLAS call * INT_TEMP = ODD_SIZE*INT_ONE+2+1 AF( INT_TEMP ) = 0 * DO 30 I=1, ODD_SIZE AF( INT_TEMP ) = AF( INT_TEMP )-D( PART_OFFSET+I )* $ ( AF( I )*DCONJG( AF( I ) ) ) 30 CONTINUE * * * Initiate send of E_i to previous processor to overlap * with next computation. * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+3 ), $ INT_ONE, 0, MYCOL-1 ) * * IF ( MYCOL .LT. NP-1 ) THEN * * Calculate off-diagonal block(s) of reduced system. * Note: for ease of use in solution of reduced system, store * L's off-diagonal block in conjugate transpose form. * {F_i}^C = {H_i}{{B'}_i}^C * AF( ODD_SIZE+1 ) = $ - D( PART_OFFSET+ODD_SIZE ) $ * DCONJG( E( PART_OFFSET+ODD_SIZE ) $ * AF( ODD_SIZE ) ) * * ENDIF * ENDIF * End of "if ( MYCOL .ne. 0 )..." * ENDIF * End of "if (info.eq.0) then" * * * Check to make sure no processors have found errors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * IF ( INFO.NE.0 ) THEN GOTO 1000 ENDIF * No errors found, continue * * ******************************************************************** * PHASE 2: Formation and factorization of Reduced System. ******************************************************************** * * Gather up local sections of reduced system * * * The last processor does not participate in the factorization of * the reduced system, having sent its E_i already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * Initiate send of off-diag block(s) to overlap with next part. * Off-diagonal block needed on neighboring processor to start * algorithm. * IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+1 ), $ INT_ONE, 0, MYCOL-1 ) * ENDIF * * Copy last diagonal block into AF storage for subsequent * operations. * AF( ODD_SIZE+2 ) = $ DCMPLX( D( PART_OFFSET+ODD_SIZE+1 ) ) * * Receive cont. to diagonal block that is stored on this proc. * IF( MYCOL.LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, $ MYCOL+1 ) * * Add contribution to diagonal block * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+AF( ODD_SIZE+3 ) * ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to diagonal block from the left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+WORK( 1 ) * ENDIF * * Receive and add contribution to diagonal block from the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 )+WORK( 1 ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * ********************************* * Calculate and use this proc's blocks to modify other procs'... IF( AF( ODD_SIZE+2 ) .EQ. CZERO ) THEN INFO = NPCOL + MYCOL ENDIF * * **************************************************************** * Receive offdiagonal block from processor to right. * If this is the first group of processors, the receive comes * from a different processor than otherwise. * IF( LEVEL_DIST .EQ. 1 )THEN COMM_PROC = MYCOL + 1 * * Move block into place that it will be expected to be for * calcs. * AF( ODD_SIZE+3 ) = AF( ODD_SIZE+1 ) * ELSE COMM_PROC = MYCOL + LEVEL_DIST/2 ENDIF * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+1 ), $ INT_ONE, 0, COMM_PROC ) * IF( INFO .EQ. 0 ) THEN * * * Modify upper off_diagonal block with diagonal block * * AF( ODD_SIZE+1 ) = AF( ODD_SIZE+1 )/AF( ODD_SIZE+2 ) * ENDIF * End of "if ( info.eq.0 ) then" * * Calculate contribution from this block to next diagonal block * WORK( 1 ) = -ONE*AF( ODD_SIZE+1 )*AF( ODD_SIZE+2 ) $ *DCONJG( AF( ODD_SIZE+1 ) ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * **************************************************************** * Receive off_diagonal block from left and use to finish with this * processor. * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * IF( LEVEL_DIST .GT. 1)THEN * * Receive offdiagonal block(s) from proc level_dist/2 to the * left * CALL ZGERV2D( ICTXT, INT_ONE, INT_ONE, $ AF( ODD_SIZE+2+1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST/2 ) * ENDIF * * IF( INFO.EQ.0 ) THEN * * Use diagonal block(s) to modify this offdiagonal block * AF( ODD_SIZE+3 ) = ( AF( ODD_SIZE+3 ) ) $ /AF( ODD_SIZE+2 ) * ENDIF * End of "if( info.eq.0 ) then" * * Use offdiag block(s) to calculate modification to diag block * of processor to the left * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*AF( ODD_SIZE+2 ) $ *DCONJG( AF( ODD_SIZE+3 ) ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, MYCOL-LEVEL_DIST ) * * ******************************************************* * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN * * Decide which processor offdiagonal block(s) goes to * IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN COMM_PROC = MYCOL + LEVEL_DIST ELSE COMM_PROC = MYCOL - LEVEL_DIST ENDIF * * Use offdiagonal blocks to calculate offdiag * block to send to neighboring processor. Depending * on circumstances, may need to transpose the matrix. * WORK( 1 ) = -ONE*AF( ODD_SIZE+3 ) $ * AF( ODD_SIZE+2 ) $ * AF( ODD_SIZE+1 ) * * Send contribution to offdiagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, $ 0, COMM_PROC ) * ENDIF * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * * 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * Make INFO consistent across processors * CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, $ -1, 0, 0 ) * IF( MYCOL.EQ.0 ) THEN CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 ) ELSE CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 ) ENDIF * * RETURN * * End of PZPTTRF * END scalapack-2.0.2/SRC/pzpttrs.f000644 000766 000024 00000067243 10363532303 016247 0ustar00juliestaff000000 000000 SUBROUTINE PZPTTRS( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, $ AF, LAF, WORK, LWORK, INFO ) * * * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * August 7, 2001 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 AF( * ), B( * ), E( * ), WORK( * ) DOUBLE PRECISION D( * ) * .. * * * Purpose * ======= * * PZPTTRS solves a system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors * stored in A(1:N,JA:JA+N-1) and AF by PZPTTRF. * A(1:N, JA:JA+N-1) is an N-by-N complex * tridiagonal symmetric positive definite distributed * matrix. * Depending on the value of UPLO, A stores either U or L in the equn * A(1:N, JA:JA+N-1) = U'D *U or L*D L' as computed by PZPTTRF. * * Routine PZPTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * IMPORTANT NOTE: The current version of this code supports * only IB=JA * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZPTTRF and this is stored in AF. If a linear system * is to be solved using PZPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (10+2*min(100,NRHS))*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, I, ICTXT, ICTXT_NEW, $ ICTXT_SAVE, IDUM1, IDUM3, JA_NEW, LLDA, LLDB, $ MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, $ PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B, $ STORE_N_A, TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 15, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, PXERBLA, $ PZPTTRSV, RESHAPE * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 6*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 9*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 9*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 9*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 9*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LWORK .LT. -1) THEN INFO = -13 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -2 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 6*100 + 6 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 9*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 9*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -3 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -5 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 6*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 2 ) CALL PXERBLA( ICTXT, $ 'PZPTTRS, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 6*100+4 ) CALL PXERBLA( ICTXT, $ 'PZPTTRS, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ (10+2*MIN(100,NRHS))*NPCOL+4*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -13 CALL PXERBLA( ICTXT, $ 'PZPTTRS: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 15, 1 ) = DESCB(5) PARAM_CHECK( 14, 1 ) = DESCB(4) PARAM_CHECK( 13, 1 ) = DESCB(3) PARAM_CHECK( 12, 1 ) = DESCB(2) PARAM_CHECK( 11, 1 ) = DESCB(1) PARAM_CHECK( 10, 1 ) = IB PARAM_CHECK( 9, 1 ) = DESCA(5) PARAM_CHECK( 8, 1 ) = DESCA(4) PARAM_CHECK( 7, 1 ) = DESCA(3) PARAM_CHECK( 6, 1 ) = DESCA(1) PARAM_CHECK( 5, 1 ) = JA PARAM_CHECK( 4, 1 ) = NRHS PARAM_CHECK( 3, 1 ) = N PARAM_CHECK( 2, 1 ) = IDUM3 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 15, 2 ) = 905 PARAM_CHECK( 14, 2 ) = 904 PARAM_CHECK( 13, 2 ) = 903 PARAM_CHECK( 12, 2 ) = 902 PARAM_CHECK( 11, 2 ) = 901 PARAM_CHECK( 10, 2 ) = 8 PARAM_CHECK( 9, 2 ) = 605 PARAM_CHECK( 8, 2 ) = 604 PARAM_CHECK( 7, 2 ) = 603 PARAM_CHECK( 6, 2 ) = 601 PARAM_CHECK( 5, 2 ) = 5 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 2 PARAM_CHECK( 2, 2 ) = 13 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 15, PARAM_CHECK, 15, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZPTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * * * Begin main code * INFO = 0 * * Call frontsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PZPTTRSV( 'L', 'N', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * ELSE * CALL PZPTTRSV( 'U', 'C', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * ENDIF * * Divide by the main diagonal: B <- D^{-1} B * * The main partition is first * DO 10 I=PART_OFFSET+1, PART_OFFSET+ODD_SIZE CALL ZSCAL( NRHS, DCMPLX( CONE/D( I ) ), B( I ), LLDB ) 10 CONTINUE * * Reduced system is next * IF( MYCOL .LT. NPCOL-1 ) THEN I=PART_OFFSET+ODD_SIZE+1 CALL ZSCAL( NRHS, CONE/AF( ODD_SIZE+2 ), B( I ), LLDB ) ENDIF * * Call backsolve routine * IF( LSAME( UPLO, 'L' ) ) THEN * CALL PZPTTRSV( 'L', 'C', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * ELSE * CALL PZPTTRSV( 'U', 'N', N, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), JA_NEW, DESCA_1XP, B, IB, $ DESCB_PX1, AF, LAF, WORK, LWORK, INFO ) * ENDIF 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PZPTTRS * END scalapack-2.0.2/SRC/pzpttrsv.f000644 000766 000024 00000140132 11750130340 016416 0ustar00juliestaff000000 000000 SUBROUTINE PZPTTRSV( UPLO, TRANS, N, NRHS, D, E, JA, DESCA, B, IB, $ DESCB, AF, LAF, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 AF( * ), B( * ), E( * ), WORK( * ) DOUBLE PRECISION D( * ) * .. * * * Purpose * ======= * * PZPTTRSV solves a tridiagonal triangular system of linear equations * * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) * or * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS) * * where A(1:N, JA:JA+N-1) is a tridiagonal * triangular matrix factor produced by the * Cholesky factorization code PZPTTRF * and is stored in A(1:N,JA:JA+N-1) and AF. * The matrix stored in A(1:N, JA:JA+N-1) is either * upper or lower triangular according to UPLO, * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H * is dictated by the user by the parameter TRANS. * * Routine PZPTTRF MUST be called first. * * ===================================================================== * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored; * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored. * * TRANS (global input) CHARACTER * = 'N': Solve with A(1:N, JA:JA+N-1); * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) ); * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed submatrix B(IB:IB+N-1, 1:NRHS). * NRHS >= 0. * * D (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the main diagonal of the * matrix. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * E (local input/local output) COMPLEX*16 pointer to local * part of global vector storing the upper diagonal of the * matrix. Globally, DU(n) is not referenced, and DU must be * aligned with D. * On exit, this array contains information containing the * factors of the matrix. * Must be of size >= DESCA( NB_ ). * * JA (global input) INTEGER * The index in the global array A that points to the start of * the matrix to be operated on (which may be either all of A * or a submatrix of A). * * DESCA (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_A=501 or 502), DLEN >= 7; * if 2D type (DTYPE_A=1), DLEN >= 9. * The array descriptor for the distributed matrix A. * Contains information of mapping of A to memory. Please * see NOTES below for full description and options. * * B (local input/local output) COMPLEX*16 pointer into * local memory to an array of local lead dimension lld_b>=NB. * On entry, this array contains the * the local pieces of the right hand sides * B(IB:IB+N-1, 1:NRHS). * On exit, this contains the local piece of the solutions * distributed matrix X. * * IB (global input) INTEGER * The row index in the global array B that points to the first * row of the matrix to be operated on (which may be either * all of B or a submatrix of B). * * DESCB (global and local input) INTEGER array of dimension DLEN. * if 1D type (DTYPE_B=502), DLEN >=7; * if 2D type (DTYPE_B=1), DLEN >= 9. * The array descriptor for the distributed matrix B. * Contains information of mapping of B to memory. Please * see NOTES below for full description and options. * * AF (local output) COMPLEX*16 array, dimension LAF. * Auxiliary Fillin Space. * Fillin is created during the factorization routine * PZPTTRF and this is stored in AF. If a linear system * is to be solved using PZPTTRS after the factorization * routine, AF *must not be altered* after the factorization. * * LAF (local input) INTEGER * Size of user-input Auxiliary Fillin space AF. Must be >= * (NB+2) * If LAF is not large enough, an error code will be returned * and the minimum acceptable size will be returned in AF( 1 ) * * WORK (local workspace/local output) * COMPLEX*16 temporary workspace. This space may * be overwritten in between calls to routines. WORK must be * the size given in LWORK. * On exit, WORK( 1 ) contains the minimal LWORK. * * LWORK (local input or global input) INTEGER * Size of user-input workspace WORK. * If LWORK is too small, the minimal acceptable size will be * returned in WORK(1) and an error code is returned. LWORK>= * (10+2*min(100,NRHS))*NPCOL+4*NRHS * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * * Restrictions * ============ * * The following are restrictions on the input parameters. Some of these * are temporary and will be removed in future releases, while others * may reflect fundamental technical limitations. * * Non-cyclic restriction: VERY IMPORTANT! * P*NB>= mod(JA-1,NB)+N. * The mapping for matrices must be blocked, reflecting the nature * of the divide and conquer algorithm as a task-parallel algorithm. * This formula in words is: no processor may have more than one * chunk of the matrix. * * Blocksize cannot be too small: * If the matrix spans more than one processor, the following * restriction on NB, the size of each block on each processor, * must hold: * NB >= 2 * The bulk of parallel computation is done on the matrix of size * O(NB) on each processor. If this is too small, divide and conquer * is a poor choice of algorithm. * * Submatrix reference: * JA = IB * Alignment restriction that prevents unnecessary communication. * * * ===================================================================== * * * Notes * ===== * * If the factorization routine and the solve routine are to be called * separately (to solve various sets of righthand sides using the same * coefficient matrix), the auxiliary space AF *must not be altered* * between calls to the factorization routine and the solve routine. * * The best algorithm for solving banded and tridiagonal linear systems * depends on a variety of parameters, especially the bandwidth. * Currently, only algorithms designed for the case N/P >> bw are * implemented. These go by many names, including Divide and Conquer, * Partitioning, domain decomposition-type, etc. * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C * algorithms are the appropriate choice. * * Algorithm description: Divide and Conquer * * The Divide and Conqer algorithm assumes the matrix is narrowly * banded compared with the number of equations. In this situation, * it is best to distribute the input matrix A one-dimensionally, * with columns atomic and rows divided amongst the processes. * The basic algorithm divides the tridiagonal matrix up into * P pieces with one stored on each processor, * and then proceeds in 2 phases for the factorization or 3 for the * solution of a linear system. * 1) Local Phase: * The individual pieces are factored independently and in * parallel. These factors are applied to the matrix creating * fillin, which is stored in a non-inspectable way in auxiliary * space AF. Mathematically, this is equivalent to reordering * the matrix A as P A P^T and then factoring the principal * leading submatrix of size equal to the sum of the sizes of * the matrices factored on each processor. The factors of * these submatrices overwrite the corresponding parts of A * in memory. * 2) Reduced System Phase: * A small ((P-1)) system is formed representing * interaction of the larger blocks, and is stored (as are its * factors) in the space AF. A parallel Block Cyclic Reduction * algorithm is used. For a linear system, a parallel front solve * followed by an analagous backsolve, both using the structure * of the factored matrix, are performed. * 3) Backsubsitution Phase: * For a linear system, a local backsubstitution is performed on * each processor in parallel. * * * Descriptors * =========== * * Descriptors now have *types* and differ from ScaLAPACK 1.0. * * Note: tridiagonal codes can use either the old two dimensional * or new one-dimensional descriptors, though the processor grid in * both cases *must be one-dimensional*. We describe both types below. * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * One-dimensional descriptors: * * One-dimensional descriptors are a new addition to ScaLAPACK since * version 1.0. They simplify and shorten the descriptor for 1D * arrays. * * Since ScaLAPACK supports two-dimensional arrays as the fundamental * object, we allow 1D arrays to be distributed either over the * first dimension of the array (as if the grid were P-by-1) or the * 2nd dimension (as if the grid were 1-by-P). This choice is * indicated by the descriptor type (501 or 502) * as described below. * However, for tridiagonal matrices, since the objects being * distributed are the individual vectors storing the diagonals, we * have adopted the convention that both the P-by-1 descriptor and * the 1-by-P descriptor are allowed and are equivalent for * tridiagonal matrices. Thus, for tridiagonal matrices, * DTYPE_A = 501 or 502 can be used interchangeably * without any other change. * We require that the distributed vectors storing the diagonals of a * tridiagonal matrix be aligned with each other. Because of this, a * single descriptor, DESCA, serves to describe the distribution of * of all diagonals simultaneously. * * IMPORTANT NOTE: the actual BLACS grid represented by the * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P * irrespective of which one-dimensional descriptor type * (501 or 502) is input. * This routine will interpret the grid properly either way. * ScaLAPACK routines *do not support intercontext operations* so that * the grid passed to a single ScaLAPACK routine *must be the same* * for all array descriptors passed to that routine. * * NOTE: In all cases where 1D descriptors are used, 2D descriptors * may also be used, since a one-dimensional array is a special case * of a two-dimensional array with one dimension of size unity. * The two-dimensional array used in this case *must* be of the * proper orientation: * If the appropriate one-dimensional descriptor is DTYPEA=501 * (1 by P type), then the two dimensional descriptor must * have a CTXT value that refers to a 1 by P BLACS grid; * If the appropriate one-dimensional descriptor is DTYPEA=502 * (P by 1 type), then the two dimensional descriptor must * have a CTXT value that refers to a P by 1 BLACS grid. * * * Summary of allowed descriptors, types, and BLACS grids: * DTYPE 501 502 1 1 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1 * ----------------------------------------------------- * A OK OK OK NO * B NO OK NO OK * * Note that a consequence of this chart is that it is not possible * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead * to opposite requirements for the orientation of the BLACS grid, * and as noted before, the *same* BLACS context must be used in * all descriptors in a single ScaLAPACK subroutine call. * * Let A be a generic term for any 1D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- ---------- ------------------------------------------ * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, * TYPE_A = 501: 1-by-P grid. * TYPE_A = 502: P-by-1 grid. * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * N_A (global) DESCA( 3 ) The size of the array dimension being * distributed. * NB_A (global) DESCA( 4 ) The blocking factor used to distribute * the distributed dimension of the array. * SRC_A (global) DESCA( 5 ) The process row or column over which the * first row or column of the array * is distributed. * Ignored DESCA( 6 ) Ignored for tridiagonal matrices. * Reserved DESCA( 7 ) Reserved for future use. * * * * ===================================================================== * * Code Developer: Andrew J. Cleary, University of Tennessee. * Current address: Lawrence Livermore National Labs. * This version released: August, 2001. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER INT_ONE PARAMETER ( INT_ONE = 1 ) INTEGER DESCMULT, BIGNUM PARAMETER (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE, $ IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA, $ LLDB, MYCOL, MYROW, MY_NUM_COLS, NB, NP, NPCOL, $ NPROW, NP_SAVE, ODD_SIZE, PART_OFFSET, $ PART_SIZE, RETURN_CODE, STORE_M_B, STORE_N_A, $ TEMP, WORK_SIZE_MIN * .. * .. Local Arrays .. INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ), $ PARAM_CHECK( 16, 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ DESC_CONVERT, GLOBCHK, PXERBLA, RESHAPE, ZGEMM, $ ZGERV2D, ZGESD2D, ZLAMOV, ZMATADD, ZTBTRS, $ ZTRMM, ZTRTRS * .. * .. External Functions .. LOGICAL LSAME INTEGER NUMROC EXTERNAL LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 * * Convert descriptor into standard form for easy access to * parameters, check that grid is of right shape. * DESCA_1XP( 1 ) = 501 DESCB_PX1( 1 ) = 502 * TEMP = DESCA( DTYPE_ ) IF( TEMP .EQ. 502 ) THEN * Temporarily set the descriptor type to 1xP type DESCA( DTYPE_ ) = 501 ENDIF * CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE ) * DESCA( DTYPE_ ) = TEMP * IF( RETURN_CODE .NE. 0) THEN INFO = -( 8*100 + 2 ) ENDIF * CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE ) * IF( RETURN_CODE .NE. 0) THEN INFO = -( 11*100 + 2 ) ENDIF * * Consistency checks for DESCA and DESCB. * * Context must be the same IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN INFO = -( 11*100 + 2 ) ENDIF * * These are alignment restrictions that may or may not be removed * in future releases. -Andy Cleary, April 14, 1996. * * Block sizes must be the same IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN INFO = -( 11*100 + 4 ) ENDIF * * Source processor must be the same * IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN INFO = -( 11*100 + 5 ) ENDIF * * Get values out of descriptor for use in code. * ICTXT = DESCA_1XP( 2 ) CSRC = DESCA_1XP( 5 ) NB = DESCA_1XP( 4 ) LLDA = DESCA_1XP( 6 ) STORE_N_A = DESCA_1XP( 3 ) LLDB = DESCB_PX1( 6 ) STORE_M_B = DESCB_PX1( 3 ) * * Get grid parameters * * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) NP = NPROW * NPCOL * * * IF( LSAME( UPLO, 'U' ) ) THEN IDUM1 = ICHAR( 'U' ) ELSE IF ( LSAME( UPLO, 'L' ) ) THEN IDUM1 = ICHAR( 'L' ) ELSE INFO = -1 END IF * IF( LSAME( TRANS, 'N' ) ) THEN IDUM2 = ICHAR( 'N' ) ELSE IF ( LSAME( TRANS, 'C' ) ) THEN IDUM2 = ICHAR( 'C' ) ELSE INFO = -2 END IF * IF( LWORK .LT. -1) THEN INFO = -15 ELSE IF ( LWORK .EQ. -1 ) THEN IDUM3 = -1 ELSE IDUM3 = 1 ENDIF * IF( N .LT. 0 ) THEN INFO = -3 ENDIF * IF( N+JA-1 .GT. STORE_N_A ) THEN INFO = -( 8*100 + 6 ) ENDIF * IF( N+IB-1 .GT. STORE_M_B ) THEN INFO = -( 11*100 + 3 ) ENDIF * IF( LLDB .LT. NB ) THEN INFO = -( 11*100 + 6 ) ENDIF * IF( NRHS .LT. 0 ) THEN INFO = -4 ENDIF * * Current alignment restriction * IF( JA .NE. IB) THEN INFO = -7 ENDIF * * Argument checking that is specific to Divide & Conquer routine * IF( NPROW .NE. 1 ) THEN INFO = -( 8*100+2 ) ENDIF * IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN INFO = -( 3 ) CALL PXERBLA( ICTXT, $ 'PZPTTRSV, D&C alg.: only 1 block per proc', $ -INFO ) RETURN ENDIF * IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*INT_ONE )) THEN INFO = -( 8*100+4 ) CALL PXERBLA( ICTXT, $ 'PZPTTRSV, D&C alg.: NB too small', $ -INFO ) RETURN ENDIF * * WORK_SIZE_MIN = $ INT_ONE*NRHS * WORK( 1 ) = WORK_SIZE_MIN * IF( LWORK .LT. WORK_SIZE_MIN ) THEN IF( LWORK .NE. -1 ) THEN INFO = -15 CALL PXERBLA( ICTXT, $ 'PZPTTRSV: worksize error', $ -INFO ) ENDIF RETURN ENDIF * * Pack params and positions into arrays for global consistency check * PARAM_CHECK( 16, 1 ) = DESCB(5) PARAM_CHECK( 15, 1 ) = DESCB(4) PARAM_CHECK( 14, 1 ) = DESCB(3) PARAM_CHECK( 13, 1 ) = DESCB(2) PARAM_CHECK( 12, 1 ) = DESCB(1) PARAM_CHECK( 11, 1 ) = IB PARAM_CHECK( 10, 1 ) = DESCA(5) PARAM_CHECK( 9, 1 ) = DESCA(4) PARAM_CHECK( 8, 1 ) = DESCA(3) PARAM_CHECK( 7, 1 ) = DESCA(1) PARAM_CHECK( 6, 1 ) = JA PARAM_CHECK( 5, 1 ) = NRHS PARAM_CHECK( 4, 1 ) = N PARAM_CHECK( 3, 1 ) = IDUM3 PARAM_CHECK( 2, 1 ) = IDUM2 PARAM_CHECK( 1, 1 ) = IDUM1 * PARAM_CHECK( 16, 2 ) = 1105 PARAM_CHECK( 15, 2 ) = 1104 PARAM_CHECK( 14, 2 ) = 1103 PARAM_CHECK( 13, 2 ) = 1102 PARAM_CHECK( 12, 2 ) = 1101 PARAM_CHECK( 11, 2 ) = 10 PARAM_CHECK( 10, 2 ) = 805 PARAM_CHECK( 9, 2 ) = 804 PARAM_CHECK( 8, 2 ) = 803 PARAM_CHECK( 7, 2 ) = 801 PARAM_CHECK( 6, 2 ) = 7 PARAM_CHECK( 5, 2 ) = 4 PARAM_CHECK( 4, 2 ) = 3 PARAM_CHECK( 3, 2 ) = 14 PARAM_CHECK( 2, 2 ) = 2 PARAM_CHECK( 1, 2 ) = 1 * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the * descriptor multiplier. * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Check consistency across processors * CALL GLOBCHK( ICTXT, 16, PARAM_CHECK, 16, $ PARAM_CHECK( 1, 3 ), INFO ) * * Prepare output: set info = 0 if no error, and divide by DESCMULT * if error is not in a descriptor entry. * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN INFO = -INFO / DESCMULT ELSE INFO = -INFO END IF * IF( INFO.LT.0 ) THEN CALL PXERBLA( ICTXT, 'PZPTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( NRHS.EQ.0 ) $ RETURN * * * Adjust addressing into matrix space to properly get into * the beginning part of the relevant data * PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) ) * IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN PART_OFFSET = PART_OFFSET + NB ENDIF * IF ( MYCOL .LT. CSRC ) THEN PART_OFFSET = PART_OFFSET - NB ENDIF * * Form a new BLACS grid (the "standard form" grid) with only procs * holding part of the matrix, of size 1xNP where NP is adjusted, * starting at csrc=0, with JA modified to reflect dropped procs. * * First processor to hold part of the matrix: * FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL ) * * Calculate new JA one while dropping off unused processors. * JA_NEW = MOD( JA-1, NB ) + 1 * * Save and compute new value of NP * NP_SAVE = NP NP = ( JA_NEW+N-2 )/NB + 1 * * Call utility routine that forms "standard-form" grid * CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, $ FIRST_PROC, INT_ONE, NP ) * * Use new context from standard grid as context. * ICTXT_SAVE = ICTXT ICTXT = ICTXT_NEW DESCA_1XP( 2 ) = ICTXT_NEW DESCB_PX1( 2 ) = ICTXT_NEW * * Get information about new grid. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Drop out processors that do not have part of the matrix. * IF( MYROW .LT. 0 ) THEN GOTO 1234 ENDIF * * ******************************** * Values reused throughout routine * * User-input value of partition size * PART_SIZE = NB * * Number of columns in each processor * MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) * * Offset in columns to beginning of main partition in each proc * IF ( MYCOL .EQ. 0 ) THEN PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE ) MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE ) ENDIF * * Size of main (or odd) partition in each processor * ODD_SIZE = MY_NUM_COLS IF ( MYCOL .LT. NP-1 ) THEN ODD_SIZE = ODD_SIZE - INT_ONE ENDIF * * * * Begin main code * IF ( LSAME( UPLO, 'L' ) ) THEN * IF ( LSAME( TRANS, 'N' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL ZPTTRSV( UPLO, 'N', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL ZAXPY( NRHS, -E( PART_OFFSET+ODD_SIZE ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL ZGEMM( 'C', 'N', 1, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+INT_ONE-1 ), INT_ONE ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 14 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 12 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 12 11 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL ZTRTRS( 'L', 'N', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*1+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL ZGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*1+2+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 14 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 24 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 27 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 26 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 27 26 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*1+2+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'N', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*1+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL ZTRTRS( 'L', 'C', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 22 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 21 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 22 21 CONTINUE * [End of GOTO Loop] * 24 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL ZGEMM( 'N', 'N', ODD_SIZE, NRHS, 1, -CONE, AF( 1 ), $ ODD_SIZE, WORK( 1+INT_ONE-1 ), INT_ONE, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL ZAXPY( NRHS, -DCONJG( E( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL ZPTTRSV( UPLO, 'C', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ELSE *************************************************************** * CASE UPLO = 'U' * *************************************************************** IF ( LSAME( TRANS, 'C' ) ) THEN * * Frontsolve * * ****************************************** * Local computation phase ****************************************** * * Use main partition in each processor to solve locally * CALL ZPTTRSV( UPLO, 'C', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL ZAXPY( NRHS, -DCONJG( E( PART_OFFSET+ODD_SIZE ) ), $ B( PART_OFFSET+ODD_SIZE ), LLDB, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB ) * ENDIF * * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution to previous * processor's righthand-side. * CALL ZGEMM( 'T', 'N', 1, NRHS, ODD_SIZE, -CONE, AF( 1 ), $ ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO, $ WORK( 1+INT_ONE-1 ), INT_ONE ) ENDIF * * ************************************************ * Formation and solution of reduced system ************************************************ * * * Send modifications to prior processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL + 1 ) * * Combine contribution to locally stored right hand sides * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * * The last processor does not participate in the solution of the * reduced system, having sent its contribution already. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 44 ENDIF * * * ************************************* * Modification Loop * * The distance for sending and receiving for each level starts * at 1 for the first level. LEVEL_DIST = 1 * * Do until this proc is needed to modify other procs' equations * 42 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 41 * * Receive and add contribution to righthand sides from left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * * Receive and add contribution to righthand sides from right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * CALL ZMATADD( INT_ONE, NRHS, CONE, $ WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE + 1 ), LLDB ) * ENDIF * LEVEL_DIST = LEVEL_DIST*2 * GOTO 42 41 CONTINUE * [End of GOTO Loop] * * * * ********************************* * Calculate and use this proc's blocks to modify other procs * * Solve with diagonal block * CALL ZTRTRS( 'L', 'N', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * * ********* IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*1+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * ************ IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * * Use offdiagonal block to calculate modification to diag block * of processor to the left * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*1+2+1 ), $ INT_ONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, CZERO, $ WORK( 1 ), $ INT_ONE ) * * Send contribution to diagonal block's owning processor. * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * 44 CONTINUE * ELSE * ******************** BACKSOLVE ************************************* * ******************************************************************** * .. Begin reduced system phase of algorithm .. ******************************************************************** * * * * The last processor does not participate in the solution of the * reduced system and just waits to receive its solution. IF( MYCOL .EQ. NPCOL-1 ) THEN GOTO 54 ENDIF * * Determine number of steps in tree loop * LEVEL_DIST = 1 57 CONTINUE IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 56 * LEVEL_DIST = LEVEL_DIST*2 * GOTO 57 56 CONTINUE * * IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND. $ ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN * * Receive solution from processor to left * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL-LEVEL_DIST ) * * * Use offdiagonal block to calculate modification to RHS stored * on this processor * CALL ZGEMM( 'T', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( ODD_SIZE*1+2+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) ENDIF * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." * * IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN * * Receive solution from processor to right * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), $ INT_ONE, 0, MYCOL+LEVEL_DIST ) * * Calculate contribution from this block to next diagonal block * CALL ZGEMM( 'C', 'N', INT_ONE, NRHS, INT_ONE, -CONE, $ AF( (ODD_SIZE)*1+1 ), $ INT_ONE, $ WORK( 1 ), $ INT_ONE, CONE, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB ) * ENDIF * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." * * * Solve with diagonal block * CALL ZTRTRS( 'L', 'C', 'U', INT_ONE, NRHS, AF( ODD_SIZE+2 ), $ INT_ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO ) * IF( INFO.NE.0 ) THEN GO TO 1000 ENDIF * * * ***Modification Loop ******* * 52 CONTINUE IF( LEVEL_DIST .EQ. 1 ) GOTO 51 * LEVEL_DIST = LEVEL_DIST/2 * * Send solution to the right * IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL+LEVEL_DIST ) * ENDIF * * Send solution to left * IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), $ LLDB, 0, MYCOL-LEVEL_DIST ) * ENDIF * GOTO 52 51 CONTINUE * [End of GOTO Loop] * 54 CONTINUE * [Processor npcol - 1 jumped to here to await next stage] * ******************************* * Reduced system has been solved, communicate solutions to nearest * neighbors in preparation for local computation phase. * * * Send elements of solution to next proc * IF( MYCOL .LT. NPCOL-1) THEN * CALL ZGESD2D( ICTXT, INT_ONE, NRHS, $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ 0, MYCOL +1 ) * ENDIF * * Receive modifications to processor's right hand sides * IF( MYCOL .GT. 0) THEN * CALL ZGERV2D( ICTXT, INT_ONE, NRHS, $ WORK( 1 ), INT_ONE, $ 0, MYCOL - 1 ) * ENDIF * * * ********************************************** * Local computation phase ********************************************** * IF ( MYCOL .NE. 0 ) THEN * Use the "spike" fillin to calculate contribution from previous * processor's solution. * CALL ZGEMM( 'C', 'N', ODD_SIZE, NRHS, 1, -CONE, AF( 1 ), $ INT_ONE, WORK( 1 ), INT_ONE, CONE, $ B( PART_OFFSET+1 ), LLDB ) * ENDIF * * IF ( MYCOL .LT. NP-1 ) THEN * Use factorization of odd-even connection block to modify * locally stored portion of right hand side(s) * CALL ZAXPY( NRHS, -E( PART_OFFSET+ODD_SIZE ), $ B( PART_OFFSET+ODD_SIZE+1 ), LLDB, $ B( PART_OFFSET+ODD_SIZE ), LLDB ) * ENDIF * * Use main partition in each processor to solve locally * CALL ZPTTRSV( UPLO, 'N', ODD_SIZE, NRHS, D( PART_OFFSET+1 ), $ E( PART_OFFSET+1 ), B( PART_OFFSET+1 ), LLDB, $ INFO ) * ENDIF * End of "IF( LSAME( TRANS, 'N' ) )"... * * ENDIF * End of "IF( LSAME( UPLO, 'L' ) )"... 1000 CONTINUE * * * Free BLACS space used to hold standard-form grid. * IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN CALL BLACS_GRIDEXIT( ICTXT_NEW ) ENDIF * 1234 CONTINUE * * Restore saved input parameters * ICTXT = ICTXT_SAVE NP = NP_SAVE * * Output minimum worksize * WORK( 1 ) = WORK_SIZE_MIN * * RETURN * * End of PZPTTRSV * END scalapack-2.0.2/SRC/pzrot.c000644 000766 000024 00000042305 10602576752 015700 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- Mark R. Fahey * June 28, 2000 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzrot_( n, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy, c, s ) /* * Mark Fahey * June 22, 2000 */ /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; double * c; complex16 * s; /* * .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZROT applies a plane rotation, where the cos (C) is real and the * sin (S) is complex, and the vectors CX and CY are complex, i.e., * * [ sub( X ) ] := [ C S ] [ sub( X ) ] * [ sub( Y ) ] := [ -conjg(S) C ] [ sub( Y ) ] * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y, * * and where C*C + S*CONJG(S) = 1.0. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * Arguments * ========= * * N (input) INTEGER * The number of elements in the vectors CX and CY. * * X (local input) COMPLEX array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * On output, CX is overwritten with C*X + S*Y. * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input) COMPLEX array containing the local * pieces of a distributed matrix of dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * On output, CY is overwritten with -CONJG(S)*X + C*Y. * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * C (input) pointer to DOUBLE * S (input) pointer COMPLEX * C and S define a rotation * [ C S ] * [ -conjg(S) C ] * where C*C + S*CONJG(S) = 1.0. * * ===================================================================== * * .. Local Scalars .. */ int ictxt, iix, iiy, info, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, mycol, myrow, nn, np, np0, nprow, npcol, nq, nz, ione=1, tmp1, wksz; complex16 xwork[1], ywork[1], zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgerv2d_(); void zgesd2d_(); void pbchkvect(); void PB_Cabort(); char * getpbbuf(); F_INTG_FCT pbztrnv_(); F_INTG_FCT zrot_(); F_INTG_FCT ilcm_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(500+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 5, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 10, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -9; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1000+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -8; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1000+MB_+1); } else { info = -11; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -9; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1000+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -8; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1000+MB_+1); } else { info = -11; } } else { info = -6; } } if( ictxt != desc_Y[CTXT_] ) info = -(1000+CTXT_+1); } } if( info ) { PB_Cabort( ictxt, "PZROT", info ); return; } /* if( info ) { pberror_( &ictxt, "PZROT", &info ); return; } */ /* * Quick return if possible. */ zero.re = ZERO; zero.im = ZERO; if( *n == 0 ) return; /* * rotation */ if( *n == 1 ) { if( ( myrow == ixrow ) && ( mycol == ixcol ) ) { buff = &X[iix-1+(jjx-1)*desc_X[LLD_]]; if( ( myrow != iyrow ) || ( mycol != iycol ) ) { zgesd2d_( &ictxt, n, n, buff, n, &iyrow, &iycol ); zgerv2d_( &ictxt, n, n, ywork, n, &iyrow, &iycol ); } else *ywork = Y[iiy-1+(jjy-1)*desc_Y[LLD_]]; zrot_( n, buff, n, ywork, n, c, s ); X[iix-1+(jjx-1)*desc_X[LLD_]] = *buff; if( ( myrow == iyrow ) && ( mycol == iycol ) ) Y[iiy-1+(jjy-1)*desc_Y[LLD_]] = *ywork; } else if( ( myrow == iyrow ) && ( mycol == iycol ) ) { zgesd2d_( &ictxt, n, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, &ixrow, &ixcol ); zgerv2d_( &ictxt, n, n, xwork, n, &ixrow, &ixcol ); zrot_( n, xwork, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, c, s ); } return; } if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) { zrot_( &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], c, s ); } } else { if( myrow == ixrow ) { zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); buff = (complex16 *)getpbbuf( "PZROT", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &nq, &iyrow, &mycol ); zrot_( &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], buff, &ione, c, s ); } else if( myrow == iyrow ) { zgesd2d_( &ictxt, &ione, &nq, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &mycol ); buff = (complex16 *)getpbbuf( "PZROT", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &nq, &ixrow, &mycol ); zrot_( &nq, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], c, s ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) { zrot_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy, c, s ); } } else { if( mycol == ixcol ) { zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); buff = (complex16 *)getpbbuf( "PZROT", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &np, &myrow, &iycol ); zrot_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione, c, s ); } else if( mycol == iycol ) { zgesd2d_( &ictxt, &np, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &myrow, &ixcol ); buff = (complex16 *)getpbbuf( "PZROT", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &np, &myrow, &ixcol ); zrot_( &np, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy, c, s ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmp = lcm / nprow; nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_X[MB_], nprow ); tmp1 = np0 / desc_X[MB_]; wksz = MYROC0( tmp1, np0, desc_X[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZROT", wksz*sizeof(complex16) ); if( mycol == iycol ) jjy -= nz; if( myrow == ixrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &zero, buff, &ione, &iyrow, &iycol, &ixrow, &ixcol, buff+np ); if( mycol == ixcol ) { zrot_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione, c, s ); } pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, buff, &ione, &zero, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &ixcol, &iyrow, &iycol, buff+np ); } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*jx-1) % desc_X[NB_]; nn = *n + nz; tmp1 = nn / desc_X[MB_]; np = numroc_( &nn, desc_Y+MB_, &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZROT", wksz*sizeof(complex16) ); if( myrow == iyrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { zrot_( &np, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy, c, s ); } pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, buff, &ione, &zero, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &iycol, &ixrow, &ixcol, buff+np ); } } } scalapack-2.0.2/SRC/pzstein.f000644 000766 000024 00000060423 10602576752 016222 0ustar00juliestaff000000 000000 SUBROUTINE PZSTEIN( N, D, E, M, W, IBLOCK, ISPLIT, ORFAC, Z, IZ, $ JZ, DESCZ, WORK, LWORK, IWORK, LIWORK, IFAIL, $ ICLUSTR, GAP, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 15, 1997 * * .. Scalar Arguments .. INTEGER INFO, IZ, JZ, LIWORK, LWORK, M, N DOUBLE PRECISION ORFAC * .. * .. Array Arguments .. INTEGER DESCZ( * ), IBLOCK( * ), ICLUSTR( * ), $ IFAIL( * ), ISPLIT( * ), IWORK( * ) DOUBLE PRECISION D( * ), E( * ), GAP( * ), W( * ), WORK( * ) COMPLEX*16 Z( * ) * .. * * Purpose * ======= * * PZSTEIN computes the eigenvectors of a symmetric tridiagonal matrix * in parallel, using inverse iteration. The eigenvectors found * correspond to user specified eigenvalues. PZSTEIN does not * orthogonalize vectors that are on different processes. The extent * of orthogonalization is controlled by the input parameter LWORK. * Eigenvectors that are to be orthogonalized are computed by the same * process. PZSTEIN decides on the allocation of work among the * processes and then calls DSTEIN2 (modified LAPACK routine) on each * individual process. If insufficient workspace is allocated, the * expected orthogonalization may not be done. * * Note : If the eigenvectors obtained are not orthogonal, increase * LWORK and run the code again. * * Notes * ===== * * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * * Arguments * ========= * * P = NPROW * NPCOL is the total number of processes * * N (global input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * D (global input) DOUBLE PRECISION array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. * * E (global input) DOUBLE PRECISION array, dimension (N-1) * The (n-1) off-diagonal elements of the tridiagonal matrix T. * * M (global input) INTEGER * The total number of eigenvectors to be found. 0 <= M <= N. * * W (global input/global output) DOUBLE PRECISION array, dim (M) * On input, the first M elements of W contain all the * eigenvalues for which eigenvectors are to be computed. The * eigenvalues should be grouped by split-off block and ordered * from smallest to largest within the block (The output array * W from PDSTEBZ with ORDER='b' is expected here). This * array should be replicated on all processes. * On output, the first M elements contain the input * eigenvalues in ascending order. * * Note : To obtain orthogonal vectors, it is best if * eigenvalues are computed to highest accuracy ( this can be * done by setting ABSTOL to the underflow threshold = * DLAMCH('U') --- ABSTOL is an input parameter * to PDSTEBZ ) * * IBLOCK (global input) INTEGER array, dimension (N) * The submatrix indices associated with the corresponding * eigenvalues in W -- 1 for eigenvalues belonging to the * first submatrix from the top, 2 for those belonging to * the second submatrix, etc. (The output array IBLOCK * from PDSTEBZ is expected here). * * ISPLIT (global input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N (The output array * ISPLIT from PDSTEBZ is expected here.) * * ORFAC (global input) DOUBLE PRECISION * ORFAC specifies which eigenvectors should be orthogonalized. * Eigenvectors that correspond to eigenvalues which are within * ORFAC*||T|| of each other are to be orthogonalized. * However, if the workspace is insufficient (see LWORK), this * tolerance may be decreased until all eigenvectors to be * orthogonalized can be stored in one process. * No orthogonalization will be done if ORFAC equals zero. * A default value of 10^-3 is used if ORFAC is negative. * ORFAC should be identical on all processes. * * Z (local output) COMPLEX*16 array, * dimension (DESCZ(DLEN_), N/npcol + NB) * Z contains the computed eigenvectors associated with the * specified eigenvalues. Any vector which fails to converge is * set to its current iterate after MAXITS iterations ( See * DSTEIN2 ). * On output, Z is distributed across the P processes in block * cyclic format. * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local workspace/global output) DOUBLE PRECISION array, * dimension ( LWORK ) * On output, WORK(1) gives a lower bound on the * workspace ( LWORK ) that guarantees the user desired * orthogonalization (see ORFAC). * Note that this may overestimate the minimum workspace needed. * * LWORK (local input) integer * LWORK controls the extent of orthogonalization which can be * done. The number of eigenvectors for which storage is * allocated on each process is * NVEC = floor(( LWORK- max(5*N,NP00*MQ00) )/N). * Eigenvectors corresponding to eigenvalue clusters of size * NVEC - ceil(M/P) + 1 are guaranteed to be orthogonal ( the * orthogonality is similar to that obtained from ZSTEIN2). * Note : LWORK must be no smaller than: * max(5*N,NP00*MQ00) + ceil(M/P)*N, * and should have the same input value on all processes. * It is the minimum value of LWORK input on different processes * that is significant. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IWORK (local workspace/global output) INTEGER array, * dimension ( 3*N+P+1 ) * On return, IWORK(1) contains the amount of integer workspace * required. * On return, the IWORK(2) through IWORK(P+2) indicate * the eigenvectors computed by each process. Process I computes * eigenvectors indexed IWORK(I+2)+1 thru' IWORK(I+3). * * LIWORK (local input) INTEGER * Size of array IWORK. Must be >= 3*N + P + 1 * * If LIWORK = -1, then LIWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * IFAIL (global output) integer array, dimension (M) * On normal exit, all elements of IFAIL are zero. * If one or more eigenvectors fail to converge after MAXITS * iterations (as in ZSTEIN), then INFO > 0 is returned. * If mod(INFO,M+1)>0, then * for I=1 to mod(INFO,M+1), the eigenvector * corresponding to the eigenvalue W(IFAIL(I)) failed to * converge ( W refers to the array of eigenvalues on output ). * * ICLUSTR (global output) integer array, dimension (2*P) * This output array contains indices of eigenvectors * corresponding to a cluster of eigenvalues that could not be * orthogonalized due to insufficient workspace (see LWORK, * ORFAC and INFO). Eigenvectors corresponding to clusters of * eigenvalues indexed ICLUSTR(2*I-1) to ICLUSTR(2*I), I = 1 to * INFO/(M+1), could not be orthogonalized due to lack of * workspace. Hence the eigenvectors corresponding to these * clusters may not be orthogonal. ICLUSTR is a zero terminated * array --- ( ICLUSTR(2*K).NE.0 .AND. ICLUSTR(2*K+1).EQ.0 ) * if and only if K is the number of clusters. * * GAP (global output) DOUBLE PRECISION array, dimension (P) * This output array contains the gap between eigenvalues whose * eigenvectors could not be orthogonalized. The INFO/M output * values in this array correspond to the INFO/(M+1) clusters * indicated by the array ICLUSTR. As a result, the dot product * between eigenvectors corresponding to the I^th cluster may be * as high as ( O(n)*macheps ) / GAP(I). * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * < 0 : if INFO = -I, the I-th argument had an illegal value * > 0 : if mod(INFO,M+1) = I, then I eigenvectors failed to * converge in MAXITS iterations. Their indices are * stored in the array IFAIL. * if INFO/(M+1) = I, then eigenvectors corresponding to * I clusters of eigenvalues could not be orthogonalized * due to insufficient workspace. The indices of the * clusters are stored in the array ICLUSTR. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. External Functions .. INTEGER ICEIL, NUMROC EXTERNAL ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DGEBR2D, DGEBS2D, $ DLASRT2, DSTEIN2, IGAMN2D, IGEBR2D, IGEBS2D, $ PCHK1MAT, PXERBLA, PZLAEVSWP * .. * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, NEGONE, ODM1, FIVE, ODM3, ODM18 PARAMETER ( ZERO = 0.0D+0, NEGONE = -1.0D+0, $ ODM1 = 1.0D-1, FIVE = 5.0D+0, ODM3 = 1.0D-3, $ ODM18 = 1.0D-18 ) * .. * .. Local Scalars .. LOGICAL LQUERY, SORTED INTEGER B1, BN, BNDRY, CLSIZ, COL, I, IFIRST, IINFO, $ ILAST, IM, INDRW, ITMP, J, K, LGCLSIZ, LLWORK, $ LOAD, LOCINFO, MAXVEC, MQ00, MYCOL, MYROW, $ NBLK, NERR, NEXT, NP00, NPCOL, NPROW, NVS, $ OLNBLK, P, ROW, SELF, TILL, TOTERR DOUBLE PRECISION DIFF, MINGAP, ONENRM, ORGFAC, ORTOL, TMPFAC * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. Executable Statements .. * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CALL BLACS_GRIDINFO( DESCZ( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) SELF = MYROW*NPCOL + MYCOL * * Make sure that we belong to this context (before calling PCHK1MAT) * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1200+CTXT_ ) ELSE * * Make sure that NPROW>0 and NPCOL>0 before calling NUMROC * CALL CHK1MAT( N, 1, N, 1, IZ, JZ, DESCZ, 12, INFO ) IF( INFO.EQ.0 ) THEN * * Now we know that our context is good enough to * perform the rest of the checks * NP00 = NUMROC( N, DESCZ( MB_ ), 0, 0, NPROW ) MQ00 = NUMROC( M, DESCZ( NB_ ), 0, 0, NPCOL ) P = NPROW*NPCOL * * Compute the maximum number of vectors per process * LLWORK = LWORK CALL IGAMN2D( DESCZ( CTXT_ ), 'A', ' ', 1, 1, LLWORK, 1, 1, $ 1, -1, -1, -1 ) INDRW = MAX( 5*N, NP00*MQ00 ) IF( N.NE.0 ) $ MAXVEC = ( LLWORK-INDRW ) / N LOAD = ICEIL( M, P ) IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN TMPFAC = ORFAC CALL DGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, TMPFAC, $ 1 ) ELSE CALL DGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, TMPFAC, $ 1, 0, 0 ) END IF * LQUERY = ( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) IF( M.LT.0 .OR. M.GT.N ) THEN INFO = -4 ELSE IF( MAXVEC.LT.LOAD .AND. .NOT.LQUERY ) THEN INFO = -14 ELSE IF( LIWORK.LT.3*N+P+1 .AND. .NOT.LQUERY ) THEN INFO = -16 ELSE DO 10 I = 2, M IF( IBLOCK( I ).LT.IBLOCK( I-1 ) ) THEN INFO = -6 GO TO 20 END IF IF( IBLOCK( I ).EQ.IBLOCK( I-1 ) .AND. W( I ).LT. $ W( I-1 ) ) THEN INFO = -5 GO TO 20 END IF 10 CONTINUE 20 CONTINUE IF( INFO.EQ.0 ) THEN IF( ABS( TMPFAC-ORFAC ).GT.FIVE*ABS( TMPFAC ) ) $ INFO = -8 END IF END IF * END IF IDUM1( 1 ) = M IDUM2( 1 ) = 4 CALL PCHK1MAT( N, 1, N, 1, IZ, JZ, DESCZ, 12, 1, IDUM1, IDUM2, $ INFO ) WORK( 1 ) = DBLE( MAX( 5*N, NP00*MQ00 )+ICEIL( M, P )*N ) IWORK( 1 ) = 3*N + P + 1 END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( DESCZ( CTXT_ ), 'PZSTEIN', -INFO ) RETURN ELSE IF( LWORK.EQ.-1 .OR. LIWORK.EQ.-1 ) THEN RETURN END IF * DO 30 I = 1, M IFAIL( I ) = 0 30 CONTINUE DO 40 I = 1, P + 1 IWORK( I ) = 0 40 CONTINUE DO 50 I = 1, P GAP( I ) = NEGONE ICLUSTR( 2*I-1 ) = 0 ICLUSTR( 2*I ) = 0 50 CONTINUE * * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) $ RETURN * IF( ORFAC.GE.ZERO ) THEN TMPFAC = ORFAC ELSE TMPFAC = ODM3 END IF ORGFAC = TMPFAC * * Allocate the work among the processes * ILAST = M / LOAD IF( MOD( M, LOAD ).EQ.0 ) $ ILAST = ILAST - 1 OLNBLK = -1 NVS = 0 NEXT = 1 IM = 0 ONENRM = ZERO DO 100 I = 0, ILAST - 1 NEXT = NEXT + LOAD J = NEXT - 1 IF( J.GT.NVS ) THEN NBLK = IBLOCK( NEXT ) IF( NBLK.EQ.IBLOCK( NEXT-1 ) .AND. NBLK.NE.OLNBLK ) THEN * * Compute orthogonalization criterion * IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 60 J = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( J ) )+ABS( E( J-1 ) )+ $ ABS( E( J ) ) ) 60 CONTINUE OLNBLK = NBLK END IF TILL = NVS + MAXVEC 70 CONTINUE J = NEXT - 1 IF( TMPFAC.GT.ODM18 ) THEN ORTOL = TMPFAC*ONENRM DO 80 J = NEXT - 1, MIN( TILL, M-1 ) IF( IBLOCK( J+1 ).NE.IBLOCK( J ) .OR. W( J+1 )- $ W( J ).GE.ORTOL ) THEN GO TO 90 END IF 80 CONTINUE IF( J.EQ.M .AND. TILL.GE.M ) $ GO TO 90 TMPFAC = TMPFAC*ODM1 GO TO 70 END IF 90 CONTINUE J = MIN( J, TILL ) END IF IF( SELF.EQ.I ) $ IM = MAX( 0, J-NVS ) * IWORK( I+1 ) = NVS NVS = MAX( J, NVS ) 100 CONTINUE IF( SELF.EQ.ILAST ) $ IM = M - NVS IWORK( ILAST+1 ) = NVS DO 110 I = ILAST + 2, P + 1 IWORK( I ) = M 110 CONTINUE * CLSIZ = 1 LGCLSIZ = 1 ILAST = 0 NBLK = 0 BNDRY = 2 K = 1 DO 140 I = 1, M IF( IBLOCK( I ).NE.NBLK ) THEN NBLK = IBLOCK( I ) IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 120 J = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( J ) )+ABS( E( J-1 ) )+ $ ABS( E( J ) ) ) 120 CONTINUE * END IF IF( I.GT.1 ) THEN DIFF = W( I ) - W( I-1 ) IF( IBLOCK( I ).NE.IBLOCK( I-1 ) .OR. I.EQ.M .OR. DIFF.GT. $ ORGFAC*ONENRM ) THEN IFIRST = ILAST IF( I.EQ.M ) THEN IF( IBLOCK( M ).NE.IBLOCK( M-1 ) .OR. DIFF.GT.ORGFAC* $ ONENRM ) THEN ILAST = M - 1 ELSE ILAST = M END IF ELSE ILAST = I - 1 END IF CLSIZ = ILAST - IFIRST IF( CLSIZ.GT.1 ) THEN IF( LGCLSIZ.LT.CLSIZ ) $ LGCLSIZ = CLSIZ MINGAP = ONENRM 130 CONTINUE IF( BNDRY.GT.P+1 ) $ GO TO 150 IF( IWORK( BNDRY ).GT.IFIRST .AND. IWORK( BNDRY ).LT. $ ILAST ) THEN MINGAP = MIN( W( IWORK( BNDRY )+1 )- $ W( IWORK( BNDRY ) ), MINGAP ) ELSE IF( IWORK( BNDRY ).GE.ILAST ) THEN IF( MINGAP.LT.ONENRM ) THEN ICLUSTR( 2*K-1 ) = IFIRST + 1 ICLUSTR( 2*K ) = ILAST GAP( K ) = MINGAP / ONENRM K = K + 1 END IF GO TO 140 END IF BNDRY = BNDRY + 1 GO TO 130 END IF END IF END IF 140 CONTINUE 150 CONTINUE INFO = ( K-1 )*( M+1 ) * * Call DSTEIN2 to find the eigenvectors * CALL DSTEIN2( N, D, E, IM, W( IWORK( SELF+1 )+1 ), $ IBLOCK( IWORK( SELF+1 )+1 ), ISPLIT, ORGFAC, $ WORK( INDRW+1 ), N, WORK, IWORK( P+2 ), $ IFAIL( IWORK( SELF+1 )+1 ), LOCINFO ) * * Redistribute the eigenvector matrix to conform with the block * cyclic distribution of the input matrix * * DO 160 I = 1, M IWORK( P+1+I ) = I 160 CONTINUE * CALL DLASRT2( 'I', M, W, IWORK( P+2 ), IINFO ) * DO 170 I = 1, M IWORK( M+P+1+IWORK( P+1+I ) ) = I 170 CONTINUE * * DO 180 I = 1, LOCINFO ITMP = IWORK( SELF+1 ) + I IFAIL( ITMP ) = IFAIL( ITMP ) + ITMP - I IFAIL( ITMP ) = IWORK( M+P+1+IFAIL( ITMP ) ) 180 CONTINUE * DO 190 I = 1, K - 1 ICLUSTR( 2*I-1 ) = IWORK( M+P+1+ICLUSTR( 2*I-1 ) ) ICLUSTR( 2*I ) = IWORK( M+P+1+ICLUSTR( 2*I ) ) 190 CONTINUE * * * Still need to apply the above permutation to IFAIL * * TOTERR = 0 DO 210 I = 1, P IF( SELF.EQ.I-1 ) THEN CALL IGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, LOCINFO, 1 ) IF( LOCINFO.NE.0 ) THEN CALL IGEBS2D( DESCZ( CTXT_ ), 'ALL', ' ', LOCINFO, 1, $ IFAIL( IWORK( I )+1 ), LOCINFO ) DO 200 J = 1, LOCINFO IFAIL( TOTERR+J ) = IFAIL( IWORK( I )+J ) 200 CONTINUE TOTERR = TOTERR + LOCINFO END IF ELSE * ROW = ( I-1 ) / NPCOL COL = MOD( I-1, NPCOL ) * CALL IGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', 1, 1, NERR, 1, $ ROW, COL ) IF( NERR.NE.0 ) THEN CALL IGEBR2D( DESCZ( CTXT_ ), 'ALL', ' ', NERR, 1, $ IFAIL( TOTERR+1 ), NERR, ROW, COL ) TOTERR = TOTERR + NERR END IF END IF 210 CONTINUE INFO = INFO + TOTERR * * CALL PZLAEVSWP( N, WORK( INDRW+1 ), N, Z, IZ, JZ, DESCZ, IWORK, $ IWORK( M+P+2 ), WORK, INDRW ) * DO 220 I = 2, P IWORK( I ) = IWORK( M+P+1+IWORK( I ) ) 220 CONTINUE * * * Sort the IWORK array * * 230 CONTINUE SORTED = .TRUE. DO 240 I = 2, P - 1 IF( IWORK( I ).GT.IWORK( I+1 ) ) THEN ITMP = IWORK( I+1 ) IWORK( I+1 ) = IWORK( I ) IWORK( I ) = ITMP SORTED = .FALSE. END IF 240 CONTINUE IF( .NOT.SORTED ) $ GO TO 230 * DO 250 I = P + 1, 1, -1 IWORK( I+1 ) = IWORK( I ) 250 CONTINUE * WORK( 1 ) = ( LGCLSIZ+LOAD-1 )*N + INDRW IWORK( 1 ) = 3*N + P + 1 * * End of PZSTEIN * END scalapack-2.0.2/SRC/pztrcon.f000644 000766 000024 00000040446 10363532303 016214 0ustar00juliestaff000000 000000 SUBROUTINE PZTRCON( NORM, UPLO, DIAG, N, A, IA, JA, DESCA, RCOND, $ WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER IA, JA, INFO, LRWORK, LWORK, N DOUBLE PRECISION RCOND * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION RWORK( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PZTRCON estimates the reciprocal of the condition number of a * triangular distributed matrix A(IA:IA+N-1,JA:JA+N-1), in either the * 1-norm or the infinity-norm. * * The norm of A(IA:IA+N-1,JA:JA+N-1) is computed and an estimate is * obtained for norm(inv(A(IA:IA+N-1,JA:JA+N-1))), then the reciprocal * of the condition number is computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * NORM (global input) CHARACTER * Specifies whether the 1-norm condition number or the * infinity-norm condition number is required: * = '1' or 'O': 1-norm; * = 'I': Infinity-norm. * * UPLO (global input) CHARACTER * = 'U': A(IA:IA+N-1,JA:JA+N-1) is upper triangular; * = 'L': A(IA:IA+N-1,JA:JA+N-1) is lower triangular. * * DIAG (global input) CHARACTER * = 'N': A(IA:IA+N-1,JA:JA+N-1) is non-unit triangular; * = 'U': A(IA:IA+N-1,JA:JA+N-1) is unit triangular. * * N (global input) INTEGER * The order of the distributed matrix A(IA:IA+N-1,JA:JA+N-1). * N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension ( LLD_A, LOCc(JA+N-1) ). This array * contains the local pieces of the triangular distributed * matrix A(IA:IA+N-1,JA:JA+N-1). If UPLO = 'U', the leading * N-by-N upper triangular part of this distributed matrix con- * tains the upper triangular matrix, and its strictly lower * triangular part is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of this ditributed * matrix contains the lower triangular matrix, and the strictly * upper triangular part is not referenced. If DIAG = 'U', the * diagonal elements of A(IA:IA+N-1,JA:JA+N-1) are also not * referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * RCOND (global output) DOUBLE PRECISION * The reciprocal of the condition number of the distributed * matrix A(IA:IA+N-1,JA:JA+N-1), computed as * RCOND = 1 / ( norm( A(IA:IA+N-1,JA:JA+N-1) ) * * norm( inv(A(IA:IA+N-1,JA:JA+N-1)) ) ). * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr(N+MOD(IA-1,MB_A)) + * MAX( 2, MAX(NB_A*CEIL(P-1,Q),LOCc(N+MOD(JA-1,NB_A)) + * NB_A*CEIL(Q-1,P)) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCc(N+MOD(JA-1,NB_A)). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOUNIT, ONENRM, UPPER CHARACTER CBTOP, COLCTOP, NORMIN, ROWCTOP INTEGER IACOL, IAROW, ICOFF, ICTXT, IIA, IPN, IPV, IPW, $ IPX, IROFF, IV, IX, IXX, JJA, JV, JX, KASE, $ KASE1, LRWMIN, LWMIN, MYCOL, MYROW, NP, NPCOL, $ NPMOD, NPROW, NQMOD DOUBLE PRECISION AINVNM, ANORM, SCALE, SMLNUM COMPLEX*16 WMAX, ZDUM * .. * .. Local Arrays .. INTEGER DESCV( DLEN_ ), DESCX( DLEN_ ), IDUM1( 5 ), $ IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, INFOG2L, $ PB_TOPGET, PB_TOPSET, PXERBLA, PCHK1MAT, $ PZAMAX, PZLATRS, PZLACON, PZDRSCL, $ ZGEBR2D, ZGEBS2D * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH, PZLANTR EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH, $ PZLANTR * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, ICHAR, MAX, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 800 + CTXT_ ) ELSE CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) ONENRM = NORM.EQ.'1' .OR. LSAME( NORM, 'O' ) NOUNIT = LSAME( DIAG, 'N' ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N + MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQMOD = NUMROC( N + MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = 2*NPMOD + $ MAX( 2, MAX( DESCA( NB_ )* $ MAX( 1, ICEIL( NPROW-1, NPCOL ) ), NQMOD + $ DESCA( NB_ )* $ MAX( 1, ICEIL( NPCOL-1, NPROW ) ) ) ) WORK( 1 ) = DBLE( LWMIN ) LRWMIN = NQMOD RWORK( 1 ) = DBLE( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.ONENRM .AND. .NOT.LSAME( NORM, 'I' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF END IF * IF( ONENRM ) THEN IDUM1( 1 ) = ICHAR( '1' ) ELSE IDUM1( 1 ) = ICHAR( 'I' ) END IF IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'U' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 11 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 13 CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 8, 5, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZTRCON', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) THEN RCOND = ONE RETURN END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPGET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', '1-tree' ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', '1-tree' ) * RCOND = ZERO SMLNUM = PDLAMCH( ICTXT, 'Safe minimum' )*DBLE( MAX( 1, N ) ) CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IV = IROFF + 1 IX = IV JV = ICOFF + 1 JX = JV * IPX = 1 IPV = IPX + NP IPW = IPV + NP IPN = 1 * CALL DESCSET( DESCV, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) CALL DESCSET( DESCX, N+IROFF, 1, DESCA( MB_ ), 1, IAROW, MYCOL, $ ICTXT, MAX( 1, NP ) ) * * Compute the norm of the triangular matrix A. * ANORM = PZLANTR( NORM, UPLO, DIAG, N, N, A, IA, JA, DESCA, RWORK ) * * Continue only if ANORM > 0. * IF( ANORM.GT.ZERO ) THEN * * Estimate the norm of the inverse of A. * AINVNM = ZERO NORMIN = 'N' IF( ONENRM ) THEN KASE1 = 1 ELSE KASE1 = 2 END IF KASE = 0 10 CONTINUE CALL PZLACON( N, WORK( IPV ), IV, JV, DESCV, WORK( IPX ), $ IX, JX, DESCX, AINVNM, KASE ) IF( KASE.NE.0 ) THEN IF( KASE.EQ.KASE1 ) THEN * * Multiply by inv(A). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( UPLO, 'No transpose', DIAG, NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SCALE, RWORK( IPN ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL ELSE * * Multiply by inv(A'). * DESCX( CSRC_ ) = IACOL CALL PZLATRS( UPLO, 'Conjugate transpose', DIAG, NORMIN, $ N, A, IA, JA, DESCA, WORK( IPX ), IX, JX, $ DESCX, SCALE, RWORK( IPN ), WORK( IPW ) ) DESCX( CSRC_ ) = MYCOL END IF NORMIN = 'Y' * * Multiply by 1/SCALE if doing so will not cause overflow. * IF( SCALE.NE.ONE ) THEN CALL PZAMAX( N, WMAX, IXX, WORK( IPX ), IX, JX, $ DESCX, 1 ) IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', $ CBTOP ) IF( MYROW.EQ.IAROW ) THEN CALL ZGEBS2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, $ 1 ) ELSE CALL ZGEBR2D( ICTXT, 'Column', CBTOP, 1, 1, WMAX, $ 1, IAROW, MYCOL ) END IF END IF IF( SCALE.LT.CABS1( WMAX )*SMLNUM .OR. SCALE.EQ.ZERO ) $ GO TO 20 CALL PZDRSCL( N, SCALE, WORK( IPX ), IX, JX, DESCX, 1 ) END IF GO TO 10 END IF * * Compute the estimate of the reciprocal condition number. * IF( AINVNM.NE.ZERO ) $ RCOND = ( ONE / ANORM ) / AINVNM END IF * 20 CONTINUE * CALL PB_TOPSET( ICTXT, 'Combine', 'Columnwise', COLCTOP ) CALL PB_TOPSET( ICTXT, 'Combine', 'Rowwise', ROWCTOP ) * RETURN * * End of PZTRCON * END scalapack-2.0.2/SRC/pztrevc.f000644 000766 000024 00000052653 10602576752 016231 0ustar00juliestaff000000 000000 SUBROUTINE PZTREVC( SIDE, HOWMNY, SELECT, N, T, DESCT, VL, DESCVL, $ VR, DESCVR, MM, M, WORK, RWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * July 31, 2001 * * .. Scalar Arguments .. CHARACTER HOWMNY, SIDE INTEGER INFO, M, MM, N * .. * .. Array Arguments .. LOGICAL SELECT( * ) INTEGER DESCT( * ), DESCVL( * ), DESCVR( * ) DOUBLE PRECISION RWORK( * ) COMPLEX*16 T( * ), VL( * ), VR( * ), WORK( * ) * .. * * Purpose * ======= * * PZTREVC computes some or all of the right and/or left eigenvectors of * a complex upper triangular matrix T in parallel. * * The right eigenvector x and the left eigenvector y of T corresponding * to an eigenvalue w are defined by: * * T*x = w*x, y'*T = w*y' * * where y' denotes the conjugate transpose of the vector y. * * If all eigenvectors are requested, the routine may either return the * matrices X and/or Y of right or left eigenvectors of T, or the * products Q*X and/or Q*Y, where Q is an input unitary * matrix. If T was obtained from the Schur factorization of an * original matrix A = Q*T*Q', then Q*X and Q*Y are the matrices of * right or left eigenvectors of A. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * = 'R': compute right eigenvectors only; * = 'L': compute left eigenvectors only; * = 'B': compute both right and left eigenvectors. * * HOWMNY (global input) CHARACTER*1 * = 'A': compute all right and/or left eigenvectors; * = 'B': compute all right and/or left eigenvectors, * and backtransform them using the input matrices * supplied in VR and/or VL; * = 'S': compute selected right and/or left eigenvectors, * specified by the logical array SELECT. * * SELECT (global input) LOGICAL array, dimension (N) * If HOWMNY = 'S', SELECT specifies the eigenvectors to be * computed. * If HOWMNY = 'A' or 'B', SELECT is not referenced. * To select the eigenvector corresponding to the j-th * eigenvalue, SELECT(j) must be set to .TRUE.. * * N (global input) INTEGER * The order of the matrix T. N >= 0. * * T (global input/output) COMPLEX*16 array, dimension * (DESCT(LLD_),*) * The upper triangular matrix T. T is modified, but restored * on exit. * * DESCT (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix T. * * VL (global input/output) COMPLEX*16 array, dimension * (DESCVL(LLD_),MM) * On entry, if SIDE = 'L' or 'B' and HOWMNY = 'B', VL must * contain an N-by-N matrix Q (usually the unitary matrix Q of * Schur vectors returned by ZHSEQR). * On exit, if SIDE = 'L' or 'B', VL contains: * if HOWMNY = 'A', the matrix Y of left eigenvectors of T; * if HOWMNY = 'B', the matrix Q*Y; * if HOWMNY = 'S', the left eigenvectors of T specified by * SELECT, stored consecutively in the columns * of VL, in the same order as their * eigenvalues. * If SIDE = 'R', VL is not referenced. * * DESCVL (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix VL. * * VR (global input/output) COMPLEX*16 array, dimension * (DESCVR(LLD_),MM) * On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must * contain an N-by-N matrix Q (usually the unitary matrix Q of * Schur vectors returned by ZHSEQR). * On exit, if SIDE = 'R' or 'B', VR contains: * if HOWMNY = 'A', the matrix X of right eigenvectors of T; * if HOWMNY = 'B', the matrix Q*X; * if HOWMNY = 'S', the right eigenvectors of T specified by * SELECT, stored consecutively in the columns * of VR, in the same order as their * eigenvalues. * If SIDE = 'L', VR is not referenced. * * DESCVR (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix VR. * * MM (global input) INTEGER * The number of columns in the arrays VL and/or VR. MM >= M. * * M (global output) INTEGER * The number of columns in the arrays VL and/or VR actually * used to store the eigenvectors. If HOWMNY = 'A' or 'B', M * is set to N. Each selected eigenvector occupies one * column. * * WORK (local workspace) COMPLEX*16 array, * dimension ( 2*DESCT(LLD_) ) * Additional workspace may be required if PZLATTRS is updated * to use WORK. * * RWORK (local workspace) DOUBLE PRECISION array, * dimension ( DESCT(LLD_) ) * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Further Details * =============== * * The algorithm used in this program is basically backward (forward) * substitution. It is the hope that scaling would be used to make the * the code robust against possible overflow. But scaling has not yet * been implemented in PZLATTRS which is called by this routine to solve * the triangular systems. PZLATTRS just calls PZTRSV. * * Each eigenvector is normalized so that the element of largest * magnitude has magnitude 1; here the magnitude of a complex number * (x,y) is taken to be |x| + |y|. * * Further Details * =============== * * Implemented by Mark R. Fahey, June, 2000 * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL ALLV, BOTHV, LEFTV, OVER, RIGHTV, SOMEV INTEGER CONTXT, CSRC, I, ICOL, II, IROW, IS, ITMP1, $ ITMP2, J, K, KI, LDT, LDVL, LDVR, LDW, MB, $ MYCOL, MYROW, NB, NPCOL, NPROW, RSRC REAL SELF DOUBLE PRECISION OVFL, REMAXD, SCALE, SMIN, SMLNUM, ULP, UNFL COMPLEX*16 CDUM, REMAXC, SHIFT * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, DGSUM2D, IGAMN2D, $ INFOG2L, PDLABAD, PDZASUM, PXERBLA, PZAMAX, $ PZCOPY, PZDSCAL, PZGEMV, PZLASET, PZLATTRS, $ ZGSUM2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CONTXT = DESCT( CTXT_ ) RSRC = DESCT( RSRC_ ) CSRC = DESCT( CSRC_ ) MB = DESCT( MB_ ) NB = DESCT( NB_ ) LDT = DESCT( LLD_ ) LDW = LDT LDVR = DESCVR( LLD_ ) LDVL = DESCVL( LLD_ ) * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) SELF = MYROW*NPCOL + MYCOL * * Decode and test the input parameters * BOTHV = LSAME( SIDE, 'B' ) RIGHTV = LSAME( SIDE, 'R' ) .OR. BOTHV LEFTV = LSAME( SIDE, 'L' ) .OR. BOTHV * ALLV = LSAME( HOWMNY, 'A' ) OVER = LSAME( HOWMNY, 'B' ) .OR. LSAME( HOWMNY, 'O' ) SOMEV = LSAME( HOWMNY, 'S' ) * * Set M to the number of columns required to store the selected * eigenvectors. * IF( SOMEV ) THEN M = 0 DO 10 J = 1, N IF( SELECT( J ) ) $ M = M + 1 10 CONTINUE ELSE M = N END IF * INFO = 0 IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 ELSE IF( .NOT.ALLV .AND. .NOT.OVER .AND. .NOT.SOMEV ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -4 ELSE IF( MM.LT.M ) THEN INFO = -11 END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PZTREVC', -INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * Set the constants to control overflow. * UNFL = PDLAMCH( CONTXT, 'Safe minimum' ) OVFL = ONE / UNFL CALL PDLABAD( CONTXT, UNFL, OVFL ) ULP = PDLAMCH( CONTXT, 'Precision' ) SMLNUM = UNFL*( N / ULP ) * * Store the diagonal elements of T in working array WORK( LDW+1 ). * DO 20 I = 1, N CALL INFOG2L( I, I, DESCT, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN WORK( LDW+IROW ) = T( ( ICOL-1 )*LDT+IROW ) END IF 20 CONTINUE * * Compute 1-norm of each column of strictly upper triangular * part of T to control overflow in triangular solver. Computed, * but not used. For use in PZLATTRS. * RWORK( 1 ) = ZERO DO 30 J = 2, N CALL PDZASUM( J-1, RWORK( J ), T, 1, J, DESCT, 1 ) 30 CONTINUE * I replicate the norms in RWORK. Should they be distributed * over the process rows? CALL DGSUM2D( CONTXT, 'Row', ' ', N, 1, RWORK, N, -1, -1 ) * IF( RIGHTV ) THEN * * Compute right eigenvectors. * * Need to set the distribution pattern of WORK * CALL DESCINIT( DESCW, N, 1, NB, 1, RSRC, CSRC, CONTXT, LDW, $ INFO ) * IS = M DO 70 KI = N, 1, -1 * IF( SOMEV ) THEN IF( .NOT.SELECT( KI ) ) $ GO TO 70 END IF * SMIN = ZERO SHIFT = CZERO CALL INFOG2L( KI, KI, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN SHIFT = T( ( ICOL-1 )*LDT+IROW ) SMIN = MAX( ULP*( CABS1( SHIFT ) ), SMLNUM ) END IF CALL DGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SMIN, 1, -1, -1 ) CALL ZGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SHIFT, 1, -1, -1 ) * CALL INFOG2L( 1, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN WORK( 1 ) = CONE END IF * * Form right-hand side. Distribute rhs onto first column * of processor grid. * IF( KI.GT.1 ) THEN CALL PZCOPY( KI-1, T, 1, KI, DESCT, 1, WORK, 1, 1, DESCW, $ 1 ) END IF DO 40 K = 1, KI - 1 CALL INFOG2L( K, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = -WORK( IROW ) END IF 40 CONTINUE * * Solve the triangular system: * (T(1:KI-1,1:KI-1) - T(KI,KI))*X = SCALE*WORK. * DO 50 K = 1, KI - 1 CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( ( ICOL-1 )*LDT+IROW ) = T( ( ICOL-1 )*LDT+IROW ) - $ SHIFT IF( CABS1( T( ( ICOL-1 )*LDT+IROW ) ).LT.SMIN ) THEN T( ( ICOL-1 )*LDT+IROW ) = DCMPLX( SMIN ) END IF END IF 50 CONTINUE * IF( KI.GT.1 ) THEN CALL PZLATTRS( 'Upper', 'No transpose', 'Non-unit', 'Y', $ KI-1, T, 1, 1, DESCT, WORK, 1, 1, DESCW, $ SCALE, RWORK, INFO ) CALL INFOG2L( KI, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = DCMPLX( SCALE ) END IF END IF * * Copy the vector x or Q*x to VR and normalize. * IF( .NOT.OVER ) THEN CALL PZCOPY( KI, WORK, 1, 1, DESCW, 1, VR, 1, IS, DESCVR, $ 1 ) * CALL PZAMAX( KI, REMAXC, II, VR, 1, IS, DESCVR, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PZDSCAL( KI, REMAXD, VR, 1, IS, DESCVR, 1 ) * CALL PZLASET( ' ', N-KI, 1, CZERO, CZERO, VR, KI+1, IS, $ DESCVR ) ELSE IF( KI.GT.1 ) $ CALL PZGEMV( 'N', N, KI-1, CONE, VR, 1, 1, DESCVR, $ WORK, 1, 1, DESCW, 1, DCMPLX( SCALE ), $ VR, 1, KI, DESCVR, 1 ) * CALL PZAMAX( N, REMAXC, II, VR, 1, KI, DESCVR, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PZDSCAL( N, REMAXD, VR, 1, KI, DESCVR, 1 ) END IF * * Set back the original diagonal elements of T. * DO 60 K = 1, KI - 1 CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( ( ICOL-1 )*LDT+IROW ) = WORK( LDW+IROW ) END IF 60 CONTINUE * IS = IS - 1 70 CONTINUE END IF * IF( LEFTV ) THEN * * Compute left eigenvectors. * * Need to set the distribution pattern of WORK * CALL DESCINIT( DESCW, N, 1, MB, 1, RSRC, CSRC, CONTXT, LDW, $ INFO ) * IS = 1 DO 110 KI = 1, N * IF( SOMEV ) THEN IF( .NOT.SELECT( KI ) ) $ GO TO 110 END IF * SMIN = ZERO SHIFT = CZERO CALL INFOG2L( KI, KI, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN SHIFT = T( ( ICOL-1 )*LDT+IROW ) SMIN = MAX( ULP*( CABS1( SHIFT ) ), SMLNUM ) END IF CALL DGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SMIN, 1, -1, -1 ) CALL ZGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SHIFT, 1, -1, -1 ) * CALL INFOG2L( N, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN WORK( IROW ) = CONE END IF * * Form right-hand side. * IF( KI.LT.N ) THEN CALL PZCOPY( N-KI, T, KI, KI+1, DESCT, N, WORK, KI+1, 1, $ DESCW, 1 ) END IF DO 80 K = KI + 1, N CALL INFOG2L( K, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = -DCONJG( WORK( IROW ) ) END IF 80 CONTINUE * * Solve the triangular system: * (T(KI+1:N,KI+1:N) - T(KI,KI))'*X = SCALE*WORK. * DO 90 K = KI + 1, N CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( ( ICOL-1 )*LDT+IROW ) = T( ( ICOL-1 )*LDT+IROW ) - $ SHIFT IF( CABS1( T( ( ICOL-1 )*LDT+IROW ) ).LT.SMIN ) $ T( ( ICOL-1 )*LDT+IROW ) = DCMPLX( SMIN ) END IF 90 CONTINUE * IF( KI.LT.N ) THEN CALL PZLATTRS( 'Upper', 'Conjugate transpose', 'Nonunit', $ 'Y', N-KI, T, KI+1, KI+1, DESCT, WORK, $ KI+1, 1, DESCW, SCALE, RWORK, INFO ) CALL INFOG2L( KI, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = DCMPLX( SCALE ) END IF END IF * * Copy the vector x or Q*x to VL and normalize. * IF( .NOT.OVER ) THEN CALL PZCOPY( N-KI+1, WORK, KI, 1, DESCW, 1, VL, KI, IS, $ DESCVL, 1 ) * CALL PZAMAX( N-KI+1, REMAXC, II, VL, KI, IS, DESCVL, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PZDSCAL( N-KI+1, REMAXD, VL, KI, IS, DESCVL, 1 ) * CALL PZLASET( ' ', KI-1, 1, CZERO, CZERO, VL, 1, IS, $ DESCVL ) ELSE IF( KI.LT.N ) $ CALL PZGEMV( 'N', N, N-KI, CONE, VL, 1, KI+1, DESCVL, $ WORK, KI+1, 1, DESCW, 1, DCMPLX( SCALE ), $ VL, 1, KI, DESCVL, 1 ) * CALL PZAMAX( N, REMAXC, II, VL, 1, KI, DESCVL, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PZDSCAL( N, REMAXD, VL, 1, KI, DESCVL, 1 ) END IF * * Set back the original diagonal elements of T. * DO 100 K = KI + 1, N CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( ( ICOL-1 )*LDT+IROW ) = WORK( LDW+IROW ) END IF 100 CONTINUE * IS = IS + 1 110 CONTINUE END IF * RETURN * * End of PZTREVC * END scalapack-2.0.2/SRC/pztrrfs.f000644 000766 000024 00000074463 10363532303 016235 0ustar00juliestaff000000 000000 SUBROUTINE PZTRRFS( UPLO, TRANS, DIAG, N, NRHS, A, IA, JA, DESCA, $ B, IB, JB, DESCB, X, IX, JX, DESCX, FERR, $ BERR, WORK, LWORK, RWORK, LRWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, TRANS, UPLO INTEGER INFO, IA, IB, IX, JA, JB, JX, LRWORK, LWORK, $ N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCX( * ) DOUBLE PRECISION BERR( * ), FERR( * ), RWORK( * ) COMPLEX*16 A( * ), B( * ), WORK( * ), X( * ) * .. * * Purpose * ======= * * PZTRRFS provides error bounds and backward error estimates for the * solution to a system of linear equations with a triangular * coefficient matrix. * * The solution matrix X must be computed by PZTRTRS or some other * means before entering this routine. PZTRRFS does not do iterative * refinement because doing so cannot improve the backward error. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * In the following comments, sub( A ), sub( X ) and sub( B ) denote * respectively A(IA:IA+N-1,JA:JA+N-1), X(IX:IX+N-1,JX:JX+NRHS-1) and * B(IB:IB+N-1,JB:JB+NRHS-1). * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * TRANS (global input) CHARACTER*1 * Specifies the form of the system of equations. * = 'N': sub( A ) * sub( X ) = sub( B ) (No transpose) * = 'T': sub( A )**T * sub( X ) = sub( B ) (Transpose) * = 'C': sub( A )**H * sub( X ) = sub( B ) * (Conjugate transpose) * * DIAG (global input) CHARACTER*1 * = 'N': sub( A ) is non-unit triangular; * = 'U': sub( A ) is unit triangular. * * N (global input) INTEGER * The order of the matrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrices sub( B ) and sub( X ). NRHS >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of local dimension (LLD_A,LOCc(JA+N-1) ). This * array contains the local pieces of the original triangular * distributed matrix sub( A ). * If UPLO = 'U', the leading N-by-N upper triangular part of * sub( A ) contains the upper triangular part of the matrix, * and its strictly lower triangular part is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * sub( A ) contains the lower triangular part of the distribu- * ted matrix, and its strictly upper triangular part is not * referenced. * If DIAG = 'U', the diagonal elements of sub( A ) are also * not referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input) COMPLEX*16 pointer into the local memory * to an array of local dimension (LLD_B, LOCc(JB+NRHS-1) ). * On entry, this array contains the the local pieces of the * right hand sides sub( B ). * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * X (local input) COMPLEX*16 pointer into the local memory * to an array of local dimension (LLD_X, LOCc(JX+NRHS-1) ). * On entry, this array contains the the local pieces of the * solution vectors sub( X ). * * IX (global input) INTEGER * The row index in the global array X indicating the first * row of sub( X ). * * JX (global input) INTEGER * The column index in the global array X indicating the * first column of sub( X ). * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * FERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). The estimated forward error bounds for * each solution vector of sub( X ). If XTRUE is the true * solution, FERR bounds the magnitude of the largest entry * in (sub( X ) - XTRUE) divided by the magnitude of the * largest entry in sub( X ). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * This array is tied to the distributed matrix X. * * BERR (local output) DOUBLE PRECISION array of local dimension * LOCc(JB+NRHS-1). The componentwise relative backward * error of each solution vector (i.e., the smallest re- * lative change in any entry of sub( A ) or sub( B ) * that makes sub( X ) an exact solution). * This array is tied to the distributed matrix X. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= 2*LOCr( N + MOD( IA-1, MB_A ) ). * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * RWORK (local workspace/local output) DOUBLE PRECISION array, * dimension (LRWORK) * On exit, RWORK(1) returns the minimal and optimal LRWORK. * * LRWORK (local or global input) INTEGER * The dimension of the array RWORK. * LRWORK is local input and must be at least * LRWORK >= LOCr( N + MOD( IB-1, MB_B ) ). * * If LRWORK = -1, then LRWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Notes * ===== * * This routine temporarily returns when N <= 1. * * The distributed submatrices sub( X ) and sub( B ) should be * distributed the same way on the same processes. These conditions * ensure that sub( X ) and sub( B ) are "perfectly" aligned. * * Moreover, this routine requires the distributed submatrices sub( A ), * sub( X ), and sub( B ) to be aligned on a block boundary, * i.e., if f(x,y) = MOD( x-1, y ): * f( IA, DESCA( MB_ ) ) = f( JA, DESCA( NB_ ) ) = 0, * f( IB, DESCB( MB_ ) ) = f( JB, DESCB( NB_ ) ) = 0, and * f( IX, DESCX( MB_ ) ) = f( JX, DESCX( NB_ ) ) = 0. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, RONE PARAMETER ( ZERO = 0.0D+0, RONE = 1.0D+0 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER CHARACTER TRANSN, TRANST INTEGER IAROW, IXBCOL, IXBROW, IXCOL, IXROW, ICOFFA, $ ICOFFB, ICOFFX, ICTXT, ICURCOL, IDUM, II, IIXB, $ IIW, IOFFXB, IPB, IPR, IPV, IROFFA, IROFFB, $ IROFFX, IW, J, JBRHS, JJ, JJFBE, JJXB, JN, JW, $ K, KASE, LDXB, LRWMIN, LWMIN, MYCOL, MYRHS, $ MYROW, NP, NP0, NPCOL, NPMOD, NPROW, NZ DOUBLE PRECISION EPS, EST, LSTRES, S, SAFE1, SAFE2, SAFMIN COMPLEX*16 ZDUM * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ), IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, INDXG2P, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, DESCSET, DGAMX2D, $ INFOG2L, PCHK1MAT, PCHK2MAT, PXERBLA, PZATRMV, $ PZAXPY, PZCOPY, PZLACON, PZTRMV, $ PZTRSV, ZGEBR2D, ZGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DIMAG, ICHAR, MAX, MIN, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters. * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 900+CTXT_ ) ELSE CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IX, JX, DESCX, 17, INFO ) IF( INFO.EQ.0 ) THEN UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) NOUNIT = LSAME( DIAG, 'N' ) IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) ICOFFB = MOD( JB-1, DESCB( NB_ ) ) IROFFX = MOD( IX-1, DESCX( MB_ ) ) ICOFFX = MOD( JX-1, DESCX( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) CALL INFOG2L( IB, JB, DESCB, NPROW, NPCOL, MYROW, MYCOL, $ IIXB, JJXB, IXBROW, IXBCOL ) IXROW = INDXG2P( IX, DESCX( MB_ ), MYROW, DESCX( RSRC_ ), $ NPROW ) IXCOL = INDXG2P( JX, DESCX( NB_ ), MYCOL, DESCX( CSRC_ ), $ NPCOL ) NPMOD = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LWMIN = 2*NPMOD WORK( 1 ) = DBLE( LWMIN ) LRWMIN = NPMOD RWORK( 1 ) = DBLE( LRWMIN ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 ELSE IF( NRHS.LT.0 ) THEN INFO = -5 ELSE IF( IROFFA.NE.0 ) THEN INFO = -7 ELSE IF( ICOFFA.NE.0 ) THEN INFO = -8 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 900+NB_ ) ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IXBROW ) THEN INFO = -11 ELSE IF( DESCA( MB_ ).NE.DESCB( MB_ ) ) THEN INFO = -( 1300+MB_ ) ELSE IF( ICTXT.NE.DESCB( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) ELSE IF( IROFFX.NE.0 .OR. IXBROW.NE.IXROW ) THEN INFO = -15 ELSE IF( ICOFFB.NE.ICOFFX .OR. IXBCOL.NE.IXCOL ) THEN INFO = -16 ELSE IF( DESCB( MB_ ).NE.DESCX( MB_ ) ) THEN INFO = -( 1700+MB_ ) ELSE IF( DESCB( NB_ ).NE.DESCX( NB_ ) ) THEN INFO = -( 1700+NB_ ) ELSE IF( ICTXT.NE.DESCX( CTXT_ ) ) THEN INFO = -( 1700+CTXT_ ) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -21 ELSE IF( LRWORK.LT.LRWMIN .AND. .NOT.LQUERY ) THEN INFO = -23 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 2 ) = ICHAR( 'T' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'U' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 21 IF( LRWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 23 CALL PCHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, 0, IDUM1, IDUM2, $ INFO ) CALL PCHK2MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, N, 4, NRHS, 5, $ IX, JX, DESCX, 17, 5, IDUM1, IDUM2, INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZTRRFS', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * JJFBE = JJXB MYRHS = NUMROC( JB+NRHS-1, DESCB( NB_ ), MYCOL, DESCB( CSRC_ ), $ NPCOL ) * * Quick return if possible * IF( N.LE.1 .OR. NRHS.EQ.0 ) THEN DO 10 JJ = JJFBE, MYRHS FERR( JJ ) = ZERO BERR( JJ ) = ZERO 10 CONTINUE RETURN END IF * IF( NOTRAN ) THEN TRANSN = 'N' TRANST = 'C' ELSE TRANSN = 'C' TRANST = 'N' END IF * NP0 = NUMROC( N+IROFFB, DESCB( MB_ ), MYROW, IXBROW, NPROW ) CALL DESCSET( DESCW, N+IROFFB, 1, DESCA( MB_ ), 1, IXBROW, IXBCOL, $ ICTXT, MAX( 1, NP0 ) ) IPB = 1 IPR = 1 IPV = IPR + NP0 IF( MYROW.EQ.IXBROW ) THEN IIW = 1 + IROFFB NP = NP0 - IROFFB ELSE IIW = 1 NP = NP0 END IF IW = 1 + IROFFB JW = 1 LDXB = DESCB( LLD_ ) IOFFXB = ( JJXB-1 )*LDXB * * NZ = maximum number of nonzero entries in each row of A, plus 1 * NZ = N + 1 EPS = PDLAMCH( ICTXT, 'Epsilon' ) SAFMIN = PDLAMCH( ICTXT, 'Safe minimum' ) SAFE1 = NZ*SAFMIN SAFE2 = SAFE1 / EPS JN = MIN( ICEIL( JB, DESCB( NB_ ) )*DESCB( NB_ ), JB+NRHS-1 ) * * Handle first block separately * JBRHS = JN - JB + 1 DO 90 K = 0, JBRHS - 1 * * Compute residual R = B - op(A) * X, * where op(A) = A or A', depending on TRANS. * CALL PZCOPY( N, X, IX, JX+K, DESCX, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PZTRMV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) CALL PZAXPY( N, -ONE, B, IB, JB+K, DESCB, 1, WORK( IPR ), IW, $ JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the matrix * or vector Z. If the i-th component of the denominator is less * than SAFE2, then SAFE1 is added to the i-th components of the * numerator and denominator before dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 20 II = IIXB, IIXB + NP - 1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 20 CONTINUE END IF END IF * CALL PZATRMV( UPLO, TRANS, DIAG, N, RONE, A, IA, JA, DESCA, X, $ IX, JX+K, DESCX, 1, RONE, RWORK( IPB ), IW, JW, $ DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 30 II = IIW - 1, IIW + NP - 2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 30 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Bound error from formula * * norm(X - XTRUE) / norm(X) .le. FERR = * norm( abs(inv(op(A)))* * ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(A)) is the inverse of op(A) * abs(Z) is the componentwise absolute value of the matrix or * vector Z * NZ is the maximum number of nonzeros in any row of A, plus 1 * EPS is machine epsilon * * The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) * is incremented by SAFE1 if the i-th component of * abs(op(A))*abs(X) + abs(B) is less than SAFE2. * * Use PZLACON to estimate the infinity-norm of the matrix * inv(op(A)) * diag(W), * where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 40 II = IIW - 1, IIW + NP - 2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 40 CONTINUE END IF END IF * KASE = 0 50 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PZLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(A)'). * CALL PZTRSV( UPLO, TRANST, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 60 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 60 CONTINUE END IF END IF ELSE * * Multiply by inv(op(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 70 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = RWORK( IPB+II )*WORK( IPR+II ) 70 CONTINUE END IF END IF CALL PZTRSV( UPLO, TRANSN, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) END IF GO TO 50 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 80 II = IIXB, IIXB + NP - 1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 80 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, IDUM, $ IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 90 CONTINUE * ICURCOL = MOD( IXBCOL+1, NPCOL ) * * Do for each right hand side * DO 180 J = JN + 1, JB + NRHS - 1, DESCB( NB_ ) JBRHS = MIN( JB+NRHS-J, DESCB( NB_ ) ) DESCW( CSRC_ ) = ICURCOL * DO 170 K = 0, JBRHS - 1 * * Compute residual R = B - op(A) * X, * where op(A) = A or A', depending on TRANS. * CALL PZCOPY( N, X, IX, J+K, DESCX, 1, WORK( IPR ), IW, JW, $ DESCW, 1 ) CALL PZTRMV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) CALL PZAXPY( N, -ONE, B, IB, J+K, DESCB, 1, WORK( IPR ), $ IW, JW, DESCW, 1 ) * * Compute componentwise relative backward error from formula * * max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) * * where abs(Z) is the componentwise absolute value of the * matrix or vector Z. If the i-th component of the * denominator is less than SAFE2, then SAFE1 is added to the * i-th components of the numerator and denominator before * dividing. * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 100 II = IIXB, IIXB + NP - 1 RWORK( IIW+II-IIXB ) = CABS1( B( II+IOFFXB ) ) 100 CONTINUE END IF END IF * CALL PZATRMV( UPLO, TRANS, DIAG, N, RONE, A, IA, JA, DESCA, $ X, IX, J+K, DESCX, 1, RONE, RWORK( IPB ), IW, $ JW, DESCW, 1 ) * S = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 110 II = IIW - 1, IIW + NP - 2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN S = MAX( S, CABS1( WORK( IPR+II ) ) / $ RWORK( IPB+II ) ) ELSE S = MAX( S, ( CABS1( WORK( IPR+II ) )+SAFE1 ) / $ ( RWORK( IPB+II )+SAFE1 ) ) END IF 110 CONTINUE END IF END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, S, 1, IDUM, IDUM, 1, $ -1, MYCOL ) IF( MYCOL.EQ.IXBCOL ) $ BERR( JJFBE ) = S * * Bound error from formula * * norm(X - XTRUE) / norm(X) .le. FERR = * norm( abs(inv(op(A)))* * ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) )))/norm(X) * * where * norm(Z) is the magnitude of the largest component of Z * inv(op(A)) is the inverse of op(A) * abs(Z) is the componentwise absolute value of the matrix * or vector Z * NZ is the maximum number of nonzeros in any row of A, * plus 1 * EPS is machine epsilon * * The i-th component of * abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) * is incremented by SAFE1 if the i-th component of * abs(op(A))*abs(X) + abs(B) is less than SAFE2. * * Use PZLACON to estimate the infinity-norm of the matrix * inv(op(A)) * diag(W), * where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 120 II = IIW - 1, IIW + NP - 2 IF( RWORK( IPB+II ).GT.SAFE2 ) THEN RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) ELSE RWORK( IPB+II ) = CABS1( WORK( IPR+II ) ) + $ NZ*EPS*RWORK( IPB+II ) + SAFE1 END IF 120 CONTINUE END IF END IF * KASE = 0 130 CONTINUE IF( MYCOL.EQ.IXBCOL ) THEN CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ) ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', NP, 1, WORK( IPR ), $ DESCW( LLD_ ), MYROW, IXBCOL ) END IF DESCW( CSRC_ ) = MYCOL CALL PZLACON( N, WORK( IPV ), IW, JW, DESCW, WORK( IPR ), $ IW, JW, DESCW, EST, KASE ) DESCW( CSRC_ ) = IXBCOL * IF( KASE.NE.0 ) THEN IF( KASE.EQ.1 ) THEN * * Multiply by diag(W)*inv(op(A)'). * CALL PZTRSV( UPLO, TRANST, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 140 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 140 CONTINUE END IF END IF ELSE * * Multiply by inv(op(A))*diag(W). * IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 150 II = IIW - 1, IIW + NP - 2 WORK( IPR+II ) = RWORK( IPB+II )* $ WORK( IPR+II ) 150 CONTINUE END IF END IF CALL PZTRSV( UPLO, TRANSN, DIAG, N, A, IA, JA, DESCA, $ WORK( IPR ), IW, JW, DESCW, 1 ) END IF GO TO 130 END IF * * Normalize error. * LSTRES = ZERO IF( MYCOL.EQ.IXBCOL ) THEN IF( NP.GT.0 ) THEN DO 160 II = IIXB, IIXB + NP - 1 LSTRES = MAX( LSTRES, CABS1( X( IOFFXB+II ) ) ) 160 CONTINUE END IF CALL DGAMX2D( ICTXT, 'Column', ' ', 1, 1, LSTRES, 1, $ IDUM, IDUM, 1, -1, MYCOL ) IF( LSTRES.NE.ZERO ) $ FERR( JJFBE ) = EST / LSTRES * JJXB = JJXB + 1 JJFBE = JJFBE + 1 IOFFXB = IOFFXB + LDXB * END IF * 170 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 180 CONTINUE * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) RWORK( 1 ) = DBLE( LRWMIN ) * RETURN * * End of PZTRRFS * END scalapack-2.0.2/SRC/pztrti2.f000644 000766 000024 00000023661 10363532303 016133 0ustar00juliestaff000000 000000 SUBROUTINE PZTRTI2( UPLO, DIAG, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZTRTI2 computes the inverse of a complex upper or lower triangular * block matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). This matrix should be * contained in one and only one process memory space (local operation). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * DIAG (global input) CHARACTER*1 * = 'N': sub( A ) is non-unit triangular * = 'U': sub( A ) is unit triangular * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)), * this array contains the local pieces of the triangular matrix * sub( A ). If UPLO = 'U', the leading N-by-N upper triangular * part of the matrix sub( A ) contains the upper triangular * matrix, and the strictly lower triangular part of sub( A ) * is not referenced. If UPLO = 'L', the leading N-by-N lower * triangular part of the matrix sub( A ) contains the lower * triangular matrix, and the strictly upper triangular part * of sub( A ) is not referenced. If DIAG = 'U', the diagonal * elements of sub( A ) are also not referenced and are assumed * to be 1. On exit, the (triangular) inverse of the original * matrix, in the same storage format. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER IACOL, IAROW, ICTXT, ICURR, IDIAG, IIA, IOFFA, $ JJA, LDA, MYCOL, MYROW, NA, NPCOL, NPROW COMPLEX*16 AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PXERBLA, ZSCAL, ZTRMV * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZTRTI2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN END IF * * Compute local indexes * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.IACOL ) THEN * LDA = DESCA( LLD_ ) * IF( UPPER ) THEN * IOFFA = IIA + ( JJA - 1 ) * LDA ICURR = IOFFA + LDA * IF( NOUNIT ) THEN * * Compute inverse of upper non-unit triangular matrix. * A( IOFFA ) = ONE / A( IOFFA ) IDIAG = ICURR + 1 DO 10 NA = 1, N-1 A( IDIAG ) = ONE / A( IDIAG ) AJJ = -A( IDIAG ) * * Compute elements 1:j-1 of j-th column. * CALL ZTRMV( 'Upper', 'No transpose', DIAG, NA, $ A( IOFFA ), LDA, A( ICURR ), 1 ) CALL ZSCAL( NA, AJJ, A( ICURR ), 1 ) IDIAG = IDIAG + LDA + 1 ICURR = ICURR + LDA 10 CONTINUE * ELSE * * Compute inverse of upper unit triangular matrix. * DO 20 NA = 1, N-1 * * Compute elements 1:j-1 of j-th column. * CALL ZTRMV( 'Upper', 'No transpose', DIAG, NA, $ A( IOFFA ), LDA, A( ICURR ), 1 ) CALL ZSCAL( NA, -ONE, A( ICURR ), 1 ) ICURR = ICURR + LDA 20 CONTINUE * END IF * ELSE * ICURR = IIA + N - 1 + ( JJA + N - 2 ) * LDA IOFFA = ICURR - LDA * IF( NOUNIT ) THEN * * Compute inverse of lower non-unit triangular matrix. * A( ICURR ) = ONE / A( ICURR ) IDIAG = IOFFA - 1 DO 30 NA = 1, N-1 A( IDIAG ) = ONE / A( IDIAG ) AJJ = -A( IDIAG ) * * Compute elements j+1:n of j-th column. * CALL ZTRMV( 'Lower', 'No transpose', DIAG, NA, $ A( ICURR ), LDA, A( IOFFA ), 1 ) CALL ZSCAL( NA, AJJ, A( IOFFA ), 1 ) ICURR = IDIAG IDIAG = IDIAG - LDA - 1 IOFFA = IDIAG + 1 30 CONTINUE * ELSE * * Compute inverse of lower unit triangular matrix. * DO 40 NA = 1, N-1 * * Compute elements j+1:n of j-th column. * CALL ZTRMV( 'Lower', 'No transpose', DIAG, NA, $ A( ICURR ), LDA, A( IOFFA ), 1 ) CALL ZSCAL( NA, -ONE, A( IOFFA ), 1 ) ICURR = ICURR - LDA - 1 IOFFA = ICURR - LDA 40 CONTINUE * END IF * END IF * END IF * * End of PZTRTI2 * END scalapack-2.0.2/SRC/pztrtri.f000644 000766 000024 00000030704 10363532303 016227 0ustar00juliestaff000000 000000 SUBROUTINE PZTRTRI( UPLO, DIAG, N, A, IA, JA, DESCA, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER IA, INFO, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZTRTRI computes the inverse of a upper or lower triangular * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * Specifies whether the distributed matrix sub( A ) is upper * or lower triangular: * = 'U': Upper triangular, * = 'L': Lower triangular. * * DIAG (global input) CHARACTER * Specifies whether or not the distributed matrix sub( A ) * is unit triangular: * = 'N': Non-unit triangular, * = 'U': Unit triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on, i.e. the * order of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the * triangular matrix sub( A ). If UPLO = 'U', the leading * N-by-N upper triangular part of the matrix sub( A ) contains * the upper triangular matrix to be inverted, and the strictly * lower triangular part of sub( A ) is not referenced. * If UPLO = 'L', the leading N-by-N lower triangular part of * the matrix sub( A ) contains the lower triangular matrix, * and the strictly upper triangular part of sub( A ) is not * referenced. * On exit, the (triangular) inverse of the original matrix. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = K, A(IA+K-1,JA+K-1) is exactly zero. The * triangular matrix sub( A ) is singular and its * inverse can not be computed. * * ==================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER I, ICOFF, ICTXT, IROFF, ICURCOL, ICURROW, $ IDUMMY, II, IOFFA, J, JB, JJ, JN, LDA, MYCOL, $ MYROW, NN, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMX2D, INFOG2L, $ PCHK1MAT, PXERBLA, PZTRTI2, PZTRMM, $ PZTRSM * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) * CALL CHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IROFF = MOD( IA-1, DESCA( MB_ ) ) ICOFF = MOD( JA-1, DESCA( NB_ ) ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( IROFF.NE.ICOFF .OR. IROFF.NE.0 ) THEN INFO = -6 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -(700+NB_) END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOUNIT ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'U' ) END IF IDUM2( 2 ) = 2 * CALL PCHK1MAT( N, 3, N, 3, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZTRTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) IF( NOUNIT ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, ICURROW, ICURCOL ) * * Handle first block separately * JB = JN-JA+1 LDA = DESCA( LLD_ ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*LDA DO 10 I = 0, JB-1 IF( A( IOFFA ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I + 1 IOFFA = IOFFA + LDA + 1 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN IOFFA = II+(JJ-1)*LDA DO 20 I = 0, JB-1 IF( A( IOFFA ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = J + I - JA + 1 IOFFA = IOFFA + LDA + 1 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ II = II + JB IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUMMY, $ IDUMMY, -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN END IF * * Use blocked code * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix * JB = JN-JA+1 * * Handle first block of column separately * CALL PZTRTI2( UPLO, DIAG, JB, A, IA, JA, DESCA, INFO ) * * Loop over remaining block of columns * DO 40 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * * Compute rows 1:j-1 of current block column * CALL PZTRMM( 'Left', UPLO, 'No transpose', DIAG, J-JA, JB, $ ONE, A, IA, JA, DESCA, A, IA, J, DESCA ) CALL PZTRSM( 'Right', UPLO, 'No transpose', DIAG, J-JA, $ JB, -ONE, A, I, J, DESCA, A, IA, J, DESCA ) * * Compute inverse of current diagonal block * CALL PZTRTI2( UPLO, DIAG, JB, A, I, J, DESCA, INFO ) * 40 CONTINUE * ELSE * * Compute inverse of lower triangular matrix * NN = ( ( JA+N-2 ) / DESCA( NB_ ) )*DESCA( NB_ ) + 1 DO 50 J = NN, JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA IF( J+JB.LE.JA+N-1 ) THEN * * Compute rows j+jb:ja+n-1 of current block column * CALL PZTRMM( 'Left', UPLO, 'No transpose', DIAG, $ JA+N-J-JB, JB, ONE, A, I+JB, J+JB, DESCA, $ A, I+JB, J, DESCA ) CALL PZTRSM( 'Right', UPLO, 'No transpose', DIAG, $ JA+N-J-JB, JB, -ONE, A, I, J, DESCA, $ A, I+JB, J, DESCA ) END IF * * Compute inverse of current diagonal block * CALL PZTRTI2( UPLO, DIAG, JB, A, I, J, DESCA, INFO ) * 50 CONTINUE * * Handle the last block of columns separately * JB = JN-JA+1 IF( JA+JB.LE.JA+N-1 ) THEN * * Compute rows ja+jb:ja+n-1 of current block column * CALL PZTRMM( 'Left', UPLO, 'No transpose', DIAG, N-JB, JB, $ ONE, A, IA+JB, JA+JB, DESCA, A, IA+JB, JA, $ DESCA ) CALL PZTRSM( 'Right', UPLO, 'No transpose', DIAG, N-JB, JB, $ -ONE, A, IA, JA, DESCA, A, IA+JB, JA, DESCA ) END IF * * Compute inverse of current diagonal block * CALL PZTRTI2( UPLO, DIAG, JB, A, IA, JA, DESCA, INFO ) * END IF * RETURN * * End PZTRTRI * END scalapack-2.0.2/SRC/pztrtrs.f000644 000766 000024 00000031441 10363532303 016240 0ustar00juliestaff000000 000000 SUBROUTINE PZTRTRS( UPLO, TRANS, DIAG, N, NRHS, A, IA, JA, DESCA, $ B, IB, JB, DESCB, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER DIAG, TRANS, UPLO INTEGER IA, IB, INFO, JA, JB, N, NRHS * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ) COMPLEX*16 A( * ), B( * ) * .. * * Purpose * ======= * * PZTRTRS solves a triangular system of the form * * sub( A ) * X = sub( B ) or sub( A )**T * X = sub( B ) or * * sub( A )**H * X = sub( B ), * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1) and is a triangular * distributed matrix of order N, and B(IB:IB+N-1,JB:JB+NRHS-1) is an * N-by-NRHS distributed matrix denoted by sub( B ). A check is made * to verify that sub( A ) is nonsingular. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER * = 'U': sub( A ) is upper triangular; * = 'L': sub( A ) is lower triangular. * * TRANS (global input) CHARACTER * Specifies the form of the system of equations: * = 'N': Solve sub( A ) * X = sub( B ) (No transpose) * = 'T': Solve sub( A )**T * X = sub( B ) (Transpose) * = 'C': Solve sub( A )**H * X = sub( B ) (Conjugate transpose) * * DIAG (global input) CHARACTER * = 'N': sub( A ) is non-unit triangular; * = 'U': sub( A ) is unit triangular. * * N (global input) INTEGER * The number of rows and columns to be operated on i.e the * order of the distributed submatrix sub( A ). N >= 0. * * NRHS (global input) INTEGER * The number of right hand sides, i.e., the number of columns * of the distributed matrix sub( B ). NRHS >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+N-1) ). This array * contains the local pieces of the distributed triangular * matrix sub( A ). If UPLO = 'U', the leading N-by-N upper * triangular part of sub( A ) contains the upper triangular * matrix, and the strictly lower triangular part of sub( A ) * is not referenced. If UPLO = 'L', the leading N-by-N lower * triangular part of sub( A ) contains the lower triangular * matrix, and the strictly upper triangular part of sub( A ) * is not referenced. If DIAG = 'U', the diagonal elements of * sub( A ) are also not referenced and are assumed to be 1. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension * (LLD_B,LOCc(JB+NRHS-1)). On entry, this array contains the * local pieces of the right hand side distributed matrix * sub( B ). On exit, if INFO = 0, sub( B ) is overwritten by * the solution matrix X. * * IB (global input) INTEGER * The row index in the global array B indicating the first * row of sub( B ). * * JB (global input) INTEGER * The column index in the global array B indicating the * first column of sub( B ). * * DESCB (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix B. * * INFO (output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * > 0: If INFO = i, the i-th diagonal element of sub( A ) is * zero, indicating that the submatrix is singular and the * solutions X have not been computed. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN, NOUNIT, UPPER INTEGER I, IAROW, IBROW, ICOFFA, ICTXT, ICURCOL, $ ICURROW, IROFFA, IROFFB, IDUM, II, IOFFA, J, $ JBLK, JJ, JN, LDA, LL, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. Local Arrays .. INTEGER IDUM1( 3 ), IDUM2( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, IGAMX2D, INFOG2L, $ PCHK2MAT, PXERBLA, PZTRSM * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, INDXG2P EXTERNAL ICEIL, INDXG2P, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ICHAR, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -907 ELSE UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) NOTRAN = LSAME( TRANS, 'N' ) * CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 9, INFO ) CALL CHK1MAT( N, 4, NRHS, 5, IB, JB, DESCB, 13, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFB = MOD( IB-1, DESCB( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IBROW = INDXG2P( IB, DESCB( MB_ ), MYROW, DESCB( RSRC_ ), $ NPROW ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( IROFFA.NE.ICOFFA .OR. IROFFA.NE.0 ) THEN INFO = -8 ELSE IF( IROFFA.NE.IROFFB .OR. IAROW.NE.IBROW ) THEN INFO = -11 ELSE IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -904 ELSE IF( DESCB( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -1304 END IF END IF * IF( UPPER ) THEN IDUM1( 1 ) = ICHAR( 'U' ) ELSE IDUM1( 1 ) = ICHAR( 'L' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IF( LSAME( TRANS, 'T' ) ) THEN IDUM1( 2 ) = ICHAR( 'T' ) ELSE IF( LSAME( TRANS, 'C' ) ) THEN IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IF( NOUNIT ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'D' ) END IF IDUM2( 3 ) = 3 CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 9, N, 4, NRHS, 5, $ IB, JB, DESCB, 13, 3, IDUM1, IDUM2, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZTRTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * IF( NOUNIT ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ II, JJ, ICURROW, ICURCOL ) JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) IOFFA = II + ( JJ - 1 ) * LDA * * Handle first block separately * JBLK = JN-JA+1 IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN LL = IOFFA DO 10 I = 0, JBLK-1 IF( A( LL ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I + 1 LL = IOFFA + LDA + 1 10 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JBLK IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JBLK*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining blocks of columns * DO 30 J = JN+1, JA+N-1, DESCA( NB_ ) JBLK = MIN( JA+N-J, DESCA( NB_ ) ) IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN LL = IOFFA DO 20 I = 0, JBLK-1 IF( A( LL ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = J + I - JA + 1 LL = IOFFA + LDA + 1 20 CONTINUE END IF IF( MYROW.EQ.ICURROW ) $ IOFFA = IOFFA + JBLK IF( MYCOL.EQ.ICURCOL ) $ IOFFA = IOFFA + JBLK*LDA ICURROW = MOD( ICURROW+1, NPROW ) ICURCOL = MOD( ICURCOL+1, NPCOL ) 30 CONTINUE CALL IGAMX2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, IDUM, IDUM, $ -1, -1, MYCOL ) IF( INFO.NE.0 ) $ RETURN END IF * * Solve A * x = b, A**T * x = b, or A**H * x = b. * CALL PZTRSM( 'Left', UPLO, TRANS, DIAG, N, NRHS, ONE, A, IA, JA, $ DESCA, B, IB, JB, DESCB ) * RETURN * * End of PZTRTRS * END scalapack-2.0.2/SRC/pztzrzf.f000644 000766 000024 00000031112 10363532303 016234 0ustar00juliestaff000000 000000 SUBROUTINE PZTZRZF( M, N, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZTZRZF reduces the M-by-N ( M<=N ) complex upper trapezoidal matrix * sub( A ) = A(IA:IA+M-1,JA:JA+N-1) to upper triangular form by means * of unitary transformations. * * The upper trapezoidal matrix sub( A ) is factored as * * sub( A ) = ( R 0 ) * Z, * * where Z is an N-by-N unitary matrix and R is an M-by-M upper * triangular matrix. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on, i.e. the number of rows * of the distributed submatrix sub( A ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on, i.e. the number of * columns of the distributed submatrix sub( A ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A, LOCc(JA+N-1)). * On entry, the local pieces of the M-by-N distributed matrix * sub( A ) which is to be factored. On exit, the leading M-by-M * upper triangular part of sub( A ) contains the upper trian- * gular matrix R, and elements M+1 to N of the first M rows of * sub( A ), with the array TAU, represent the unitary matrix Z * as a product of M elementary reflectors. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local output) COMPLEX*16, array, dimension LOCr(IA+M-1) * This array contains the scalar factors of the elementary * reflectors. TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( Mp0 + Nq0 + MB_A ), where * * IROFF = MOD( IA-1, MB_A ), ICOFF = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * Mp0 = NUMROC( M+IROFF, MB_A, MYROW, IAROW, NPROW ), * Nq0 = NUMROC( N+ICOFF, NB_A, MYCOL, IACOL, NPCOL ), * * and NUMROC, INDXG2P are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Further Details * =============== * * The factorization is obtained by Householder's method. The kth * transformation matrix, Z( k ), whose conjugate transpose is used to * introduce zeros into the (m - k + 1)th row of sub( A ), is given in * the form * * Z( k ) = ( I 0 ), * ( 0 T( k ) ) * * where * * T( k ) = I - tau*u( k )*u( k )', u( k ) = ( 1 ), * ( 0 ) * ( z( k ) ) * * tau is a scalar and z( k ) is an ( n - m ) element vector. * tau and z( k ) are chosen to annihilate the elements of the kth row * of sub( A ). * * The scalar tau is returned in the kth element of TAU and the vector * u( k ) in the kth row of sub( A ), such that the elements of z( k ) * are in a( k, m + 1 ), ..., a( k, n ). The elements of R are returned * in the upper triangular part of sub( A ). * * Z is given by * * Z = Z( 1 ) * Z( 2 ) * ... * Z( m ). * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IIA, IL, IN, IPW, $ IROFFA, J, JM1, L, LWMIN, MP0, MYCOL, MYROW, $ NPCOL, NPROW, NQ0 * .. * .. Local Arrays .. INTEGER IDUM1( 1 ), IDUM2( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, INFOG1L, PCHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZLATRZ, $ PZLARZB, PZLARZT * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(600+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MP0 = NUMROC( M+IROFFA, DESCA( MB_ ), MYROW, IAROW, NPROW ) NQ0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MP0 + NQ0 + DESCA( MB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -9 END IF END IF IF( LQUERY ) THEN IDUM1( 1 ) = -1 ELSE IDUM1( 1 ) = 1 END IF IDUM2( 1 ) = 9 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 6, 1, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZTZRZF', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( M.EQ.N ) THEN * CALL INFOG1L( IA, DESCA( MB_ ), NPROW, MYROW, DESCA( RSRC_ ), $ IIA, IAROW ) IF( MYROW.EQ.IAROW ) $ MP0 = MP0 - IROFFA DO 10 I = IIA, IIA+MP0-1 TAU( I ) = ZERO 10 CONTINUE * ELSE * L = N-M JM1 = JA + MIN( M+1, N ) - 1 IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 ) IL = MAX( ( (IA+M-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * * Use blocked code initially * DO 20 I = IL, IN+1, -DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) J = JA + I - IA * * Compute the complete orthogonal factorization of the current * block A(i:i+ib-1,j:ja+n-1) * CALL PZLATRZ( IB, JA+N-J, L, A, I, J, DESCA, TAU, WORK ) * IF( I.GT.IA ) THEN * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PZLARZT( 'Backward', 'Rowwise', L, IB, A, I, JM1, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:i-1,j:ja+n-1) from the right * CALL PZLARZB( 'Right', 'No transpose', 'Backward', $ 'Rowwise', I-IA, JA+N-J, IB, L, A, I, JM1, $ DESCA, WORK, A, IA, J, DESCA, WORK( IPW ) ) END IF * 20 CONTINUE * * Use unblocked code to factor the last or only block * CALL PZLATRZ( IN-IA+1, N, N-M, A, IA, JA, DESCA, TAU, WORK ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZTZRZF * END scalapack-2.0.2/SRC/pzung2l.f000644 000766 000024 00000025732 10363532303 016117 0ustar00juliestaff000000 000000 SUBROUTINE PZUNG2L( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNG2L generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the last N columns of a product of K elementary reflectors of order M * * Q = H(k) . . . H(2) H(1) * * as returned by PZGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA+N-K <= j <= JA+N-1, * as returned by PZGEQLF in the K columns of its distributed * matrix argument A(IA:*,JA+N-K:JA+N-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PZGEQLF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MpA0 + MAX( 1, NqA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, J, JJ, LWMIN, MPA0, MYCOL, $ MYROW, NPCOL, NPROW, NQA0 COMPLEX*16 TAUJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, PZLARF, $ PZLASET, PZSCAL * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MPA0 + MAX( 1, NQA0 ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNG2L', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Initialise columns ja:ja+n-k-1 to columns of the unit matrix * CALL PZLASET( 'All', M-N, N-K, ZERO, ZERO, A, IA, JA, DESCA ) CALL PZLASET( 'All', N, N-K, ZERO, ONE, A, IA+M-N, JA, DESCA ) * TAUJ = ZERO NQA0 = MAX( 1, NUMROC( JA+N-1, DESCA( NB_ ), MYCOL, $ DESCA( CSRC_ ), NPCOL ) ) DO 10 J = JA+N-K, JA+N-1 * * Apply H(j) to A(ia:ia+m-n+j-ja,ja:j) from the left * CALL PZELSET( A, IA+M-N+J-JA, J, DESCA, ONE ) CALL PZLARF( 'Left', M-N+J-JA+1, J-JA, A, IA, J, DESCA, 1, TAU, $ A, IA, JA, DESCA, WORK ) * JJ = INDXG2L( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) IACOL = INDXG2P( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ TAUJ = TAU( MIN( JJ, NQA0 ) ) CALL PZSCAL( M-N+J-JA, -TAUJ, A, IA, J, DESCA, 1 ) CALL PZELSET( A, IA+M-N+J-JA, J, DESCA, ONE-TAUJ ) * * Set A(ia+m-n+j-ja+1:ia+m-1,j) to zero * CALL PZLASET( 'All', JA+N-1-J, 1, ZERO, ZERO, A, IA+M-N+J-JA+1, $ J, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNG2L * END scalapack-2.0.2/SRC/pzung2r.f000644 000766 000024 00000026001 10363532303 016113 0ustar00juliestaff000000 000000 SUBROUTINE PZUNG2R( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNG2R generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the first N columns of a product of K elementary reflectors of order * M * * Q = H(1) H(2) . . . H(k) * * as returned by PZGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA <= j <= JA+K-1, as * returned by PZGEQRF in the K columns of its array * argument A(IA:*,JA:JA+K-1). On exit, this array contains * the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PZGEQRF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MpA0 + MAX( 1, NqA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, J, JJ, KQ, LWMIN, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 COMPLEX*16 TAUJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, PZLARF, $ PZLASET, PZSCAL * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = MPA0 + MAX( 1, NQA0 ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNG2R', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Initialise columns ja+k:ja+n-1 to columns of the unit matrix * CALL PZLASET( 'All', K, N-K, ZERO, ZERO, A, IA, JA+K, DESCA ) CALL PZLASET( 'All', M-K, N-K, ZERO, ONE, A, IA+K, JA+K, DESCA ) * TAUJ = ZERO KQ = MAX( 1, NUMROC( JA+K-1, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ) DO 10 J = JA+K-1, JA, -1 * * Apply H(j) to A(ia+j-ja:ia+m-1,j:ja+n-1) from the left * IF( J.LT.JA+N-1 ) THEN CALL PZELSET( A, IA+J-JA, J, DESCA, ONE ) CALL PZLARF( 'Left', M-J+JA, JA+N-J-1, A, IA+J-JA, J, DESCA, $ 1, TAU, A, IA+J-JA, J+1, DESCA, WORK ) END IF * JJ = INDXG2L( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL ) IACOL = INDXG2P( J, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) $ TAUJ = TAU( MIN( JJ, KQ ) ) IF( J-JA.LT.M-1 ) $ CALL PZSCAL( M-J+JA-1, -TAUJ, A, IA+J-JA+1, J, DESCA, 1 ) CALL PZELSET( A, IA+J-JA, J, DESCA, ONE-TAUJ ) * * Set A(ia:ia+j-ja-1,j) to zero * CALL PZLASET( 'All', J-JA, 1, ZERO, ZERO, A, IA, J, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNG2R * END scalapack-2.0.2/SRC/pzungl2.f000644 000766 000024 00000026432 10363532303 016115 0ustar00juliestaff000000 000000 SUBROUTINE PZUNGL2( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNGL2 generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as * the first M rows of a product of K elementary reflectors of order N * * Q = H(k)' . . . H(2)' H(1)' * * as returned by PZGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA <= i <= IA+K-1, as * returned by PZGELQF in the K rows of its distributed matrix * argument A(IA:IA+K-1,JA:*). On exit, this array contains the * local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCr(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZGELQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NqA0 + MAX( 1, MpA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, II, J, KP, LWMIN, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 COMPLEX*16 TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZLACGV, PZLARFC, PZLASET, PZSCAL * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQA0 + MAX( 1, MPA0 ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNGL2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * IF( K.LT.M ) THEN * * Initialise rows ia+k:ia+m-1 to rows of the unit matrix * CALL PZLASET( 'All', M-K, K, ZERO, ZERO, A, IA+K, JA, DESCA ) CALL PZLASET( 'All', M-K, N-K, ZERO, ONE, A, IA+K, JA+K, $ DESCA ) * END IF * TAUI = ZERO KP = NUMROC( IA+K-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) * DO 10 I = IA+K-1, IA, -1 * * Apply H(i)' to A(i:ia+m-1,ja+i-ia:ja+n-1) from the right * J = JA + I - IA II = INDXG2L( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IAROW = INDXG2P( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IF( MYROW.EQ.IAROW ) $ TAUI = TAU( MIN( II, KP ) ) IF( J.LT.JA+N-1 ) THEN CALL PZLACGV( N-J+JA-1, A, I, J+1, DESCA, DESCA( M_ ) ) IF( I.LT.IA+M-1 ) THEN CALL PZELSET( A, I, J, DESCA, ONE ) CALL PZLARFC( 'Right', M-I+IA-1, N-J+JA, A, I, J, DESCA, $ DESCA( M_ ), TAU, A, I+1, J, DESCA, WORK ) END IF CALL PZSCAL( N-J+JA-1, -TAUI, A, I, J+1, DESCA, $ DESCA( M_ ) ) CALL PZLACGV( N-J+JA-1, A, I, J+1, DESCA, DESCA( M_ ) ) END IF CALL PZELSET( A, I, J, DESCA, ONE-DCONJG( TAUI ) ) * * Set A(i,ja:j-1) to zero * CALL PZLASET( 'All', 1, J-JA, ZERO, ZERO, A, I, JA, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNGL2 * END scalapack-2.0.2/SRC/pzunglq.f000644 000766 000024 00000031003 10363532303 016202 0ustar00juliestaff000000 000000 SUBROUTINE PZUNGLQ( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNGLQ generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as * the first M rows of a product of K elementary reflectors of order N * * Q = H(k)' . . . H(2)' H(1)' * * as returned by PZGELQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA <= i <= IA+K-1, as * returned by PZGELQF in the K rows of its distributed matrix * argument A(IA:IA+K-1,JA:*). On exit, this array contains the * local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCr(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZGELQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( MpA0 + NqA0 + MB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IL, IN, IPW, $ J, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZLARFB, PZLARFT, $ PZLASET, PZUNGL2 * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNGLQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) IL = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) * CALL PZLASET( 'All', IA+M-IL, IL-IA, ZERO, ZERO, A, IL, JA, $ DESCA ) * * Use unblocked code for the last or only block. * CALL PZUNGL2( IA+M-IL, N-IL+IA, IA+K-IL, A, IL, JA+IL-IA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Is there at least one block of rows to loop over ? * IF( IL.GT.IN+1 ) THEN * * Use blocked code * DO 10 I = IL-DESCA( MB_ ), IN+1, -DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) J = JA + I - IA * IF( I+IB.LE.IA+M-1 ) THEN * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PZLARFT( 'Forward', 'Rowwise', N-I+IA, IB, A, I, J, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(i+ib:ia+m-1,j:ja+n-1) from the right * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-I-IB+IA, N-I+IA, IB, A, I, J, $ DESCA, WORK, A, I+IB, J, DESCA, $ WORK( IPW ) ) END IF * * Apply H' to columns j:ja+n-1 of current block * CALL PZUNGL2( IB, N-I+IA, IB, A, I, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set columns ia:i-1 of current block to zero * CALL PZLASET( 'All', IB, I-IA, ZERO, ZERO, A, I, JA, DESCA ) 10 CONTINUE * END IF * * Handle first block separately * IF( IL.GT.IA ) THEN * IB = IN - IA + 1 * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PZLARFT( 'Forward', 'Rowwise', N, IB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia+ib:ia+m-1,ja:ja+n-1) from the right * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Forward', $ 'Rowwise', M-IB, N, IB, A, IA, JA, DESCA, WORK, $ A, IA+IB, JA, DESCA, WORK( IPW ) ) * * Apply H' to columns ja:ja+n-1 of current block * CALL PZUNGL2( IB, N, IB, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNGLQ * END scalapack-2.0.2/SRC/pzungql.f000644 000766 000024 00000027042 10363532303 016212 0ustar00juliestaff000000 000000 SUBROUTINE PZUNGQL( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNGQL generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the last N columns of a product of K elementary reflectors of order M * * Q = H(k) . . . H(2) H(1) * * as returned by PZGEQLF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA+N-K <= j <= JA+N-1, * as returned by PZGEQLF in the K columns of its distributed * matrix argument A(IA:*,JA+N-K:JA+N-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PZGEQLF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( NqA0 + MpA0 + NB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICTXT, IINFO, IPW, J, JB, JN, $ LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZLARFB, PZLARFT, $ PZLASET, PZUNG2L * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNGQL', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * IPW = DESCA( NB_ )*DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA+N-K, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * * Set A(ia+m-n+jn-ja+1:ia-m+1,ja:jn) to zero. * CALL PZLASET( 'All', N-JN+JA-1, JN-JA+1, ZERO, ZERO, A, $ IA+M-N+JN-JA+1, JA, DESCA ) * * Use unblocked code for the first or only block. * CALL PZUNG2L( M-N+JN-JA+1, JN-JA+1, JN-JA-N+K+1, A, IA, JA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Use blocked code * DO 10 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PZLARFT( 'Backward', 'Columnwise', M-N+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:ia+m-n+j+jb-ja-1,ja:j-1) from the left * CALL PZLARFB( 'Left', 'No transpose', 'Backward', $ 'Columnwise', M-N+J+JB-JA, J-JA, JB, A, IA, $ J, DESCA, WORK, A, IA, JA, DESCA, WORK( IPW ) ) * * Apply H to rows ia:m-k+i+ib-1 of current block * CALL PZUNG2L( M-N+J+JB-JA, JB, JB, A, IA, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows ia+m-n+j+jb-ja:ia+m-1,j:j+jb-1 of current block to * zero * CALL PZLASET( 'All', N-J-JB+JA, JB, ZERO, ZERO, A, $ IA+M-N+J+JB-JA, J, DESCA ) * 10 CONTINUE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNGQL * END scalapack-2.0.2/SRC/pzungqr.f000644 000766 000024 00000030773 10363532303 016225 0ustar00juliestaff000000 000000 SUBROUTINE PZUNGQR( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNGQR generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal columns, which is defined as * the first N columns of a product of K elementary reflectors of order * M * * Q = H(1) H(2) . . . H(k) * * as returned by PZGEQRF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. M >= N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. N >= K >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the j-th column must contain the vector which * defines the elementary reflector H(j), JA <= j <= JA+K-1, as * returned by PZGEQRF in the K columns of its distributed * matrix argument A(IA:*,JA:JA+K-1). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JA+K-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PZGEQRF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NB_A * ( NqA0 + MpA0 + NB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, ICTXT, IINFO, IPW, J, JB, JL, $ JN, LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, $ NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZLARFB, PZLARFT, $ PZLASET, PZUNG2R * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( NB_ ) * ( MPA0 + NQA0 + DESCA( NB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.GT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.N ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNGQR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( N.LE.0 ) $ RETURN * IPW = DESCA( NB_ )*DESCA( NB_ ) + 1 JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) JL = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) * CALL PZLASET( 'All', JL-JA, JA+N-JL, ZERO, ZERO, A, IA, JL, $ DESCA ) * * Use unblocked code for the last or only block. * CALL PZUNG2R( M-JL+JA, JA+N-JL, JA+K-JL, A, IA+JL-JA, JL, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Is there at least one block of columns to loop over ? * IF( JL.GT.JN+1 ) THEN * * Use blocked code * DO 10 J = JL-DESCA( NB_ ), JN+1, -DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) I = IA + J - JA * IF( J+JB.LE.JA+N-1 ) THEN * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PZLARFT( 'Forward', 'Columnwise', M-I+IA, JB, A, I, $ J, DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H to A(i:ia+m-1,j+jb:ja+n-1) from the left * CALL PZLARFB( 'Left', 'No transpose', 'Forward', $ 'Columnwise', M-I+IA, N-J-JB+JA, JB, A, I, $ J, DESCA, WORK, A, I, J+JB, DESCA, $ WORK( IPW ) ) END IF * * Apply H to rows i:ia+m-1 of current block * CALL PZUNG2R( M-I+IA, JB, JB, A, I, J, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows ia:i-1 of current block to zero * CALL PZLASET( 'All', I-IA, JB, ZERO, ZERO, A, IA, J, DESCA ) * 10 CONTINUE * END IF * * Handle first block separately * IF( JL.GT.JA ) THEN * JB = JN - JA + 1 * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PZLARFT( 'Forward', 'Columnwise', M, JB, A, IA, JA, DESCA, $ TAU, WORK, WORK( IPW ) ) * * Apply H to A(ia:ia+m-1,ja+jb:ja+n-1) from the left * CALL PZLARFB( 'Left', 'No transpose', 'Forward', 'Columnwise', $ M, N-JB, JB, A, IA, JA, DESCA, WORK, A, IA, $ JA+JB, DESCA, WORK( IPW ) ) * * Apply H to rows ia:ia+m-1 of current block * CALL PZUNG2R( M, JB, JB, A, IA, JA, DESCA, TAU, WORK, LWORK, $ IINFO ) * END IF * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNGQR * END scalapack-2.0.2/SRC/pzungr2.f000644 000766 000024 00000026213 10363532303 016120 0ustar00juliestaff000000 000000 SUBROUTINE PZUNGR2( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNGR2 generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as the * last M rows of a product of K elementary reflectors of order N * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PZGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA+M-K <= i <= IA+M-1, as * returned by PZGERQF in the K rows of its distributed * matrix argument A(IA+M-K:IA+M-1,JA:*). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCr(IA+M-1) * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZGERQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= NqA0 + MAX( 1, MpA0 ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, I, ICTXT, II, LWMIN, MP, MPA0, $ MYCOL, MYROW, NPCOL, NPROW, NQA0 COMPLEX*16 TAUI * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZLACGV, PZLARFC, PZLASET, PZSCAL * .. * .. External Functions .. INTEGER INDXG2L, INDXG2P, NUMROC EXTERNAL INDXG2L, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = NQA0 + MAX( 1, MPA0 ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNGR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * IF( K.LT.M ) THEN * * Initialise rows ia:ia+m-k-1 to rows of the unit matrix * CALL PZLASET( 'All', M-K, N-M, ZERO, ZERO, A, IA, JA, DESCA ) CALL PZLASET( 'All', M-K, M, ZERO, ONE, A, IA, JA+N-M, DESCA ) * END IF * TAUI = ZERO MP = NUMROC( IA+M-1, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) * DO 10 I = IA+M-K, IA+M-1 * * Apply H(i)' to A(ia:i,ja:ja+n-m+i-ia) from the right * CALL PZLACGV( I-IA+N-M, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZELSET( A, I, JA+N-M+I-IA, DESCA, ONE ) CALL PZLARFC( 'Right', I-IA, I-IA+N-M+1, A, I, JA, DESCA, $ DESCA( M_ ), TAU, A, IA, JA, DESCA, WORK ) II = INDXG2L( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW ) IAROW = INDXG2P( I, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IF( MYROW.EQ.IAROW ) $ TAUI = TAU( MIN( II, MP ) ) CALL PZSCAL( I-IA+N-M, -TAUI, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZLACGV( I-IA+N-M, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZELSET( A, I, JA+N-M+I-IA, DESCA, ONE-DCONJG( TAUI ) ) * * Set A(i,ja+n-m+i-ia+1:ja+n-1) to zero * CALL PZLASET( 'All', 1, IA+M-1-I, ZERO, ZERO, A, I, $ JA+N-M+I-IA+1, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNGR2 * END scalapack-2.0.2/SRC/pzungrq.f000644 000766 000024 00000027062 10363532303 016222 0ustar00juliestaff000000 000000 SUBROUTINE PZUNGRQ( M, N, K, A, IA, JA, DESCA, TAU, WORK, LWORK, $ INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNGRQ generates an M-by-N complex distributed matrix Q denoting * A(IA:IA+M-1,JA:JA+N-1) with orthonormal rows, which is defined as the * last M rows of a product of K elementary reflectors of order N * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PZGERQF. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix Q. M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix Q. * N >= M >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. M >= K >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, the i-th row must contain the vector which defines * the elementary reflector H(i), IA+M-K <= i <= IA+M-1, as * returned by PZGERQF in the K rows of its distributed * matrix argument A(IA+M-K:IA+M-1,JA:*). On exit, this array * contains the local pieces of the M-by-N distributed matrix Q. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCr(IA+M-1) * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZGERQF. * TAU is tied to the distributed matrix A. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * LWORK >= MB_A * ( MpA0 + NqA0 + MB_A ), where * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MpA0 = NUMROC( M+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LQUERY CHARACTER COLBTOP, ROWBTOP INTEGER I, IACOL, IAROW, IB, ICTXT, IINFO, IN, IPW, $ LWMIN, MPA0, MYCOL, MYROW, NPCOL, NPROW, NQA0 * .. * .. Local Arrays .. INTEGER IDUM1( 2 ), IDUM2( 2 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZLARFB, PZLARFT, $ PZLASET, PZUNGR2 * .. * .. External Functions .. INTEGER ICEIL, INDXG2P, NUMROC EXTERNAL ICEIL, INDXG2P, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(700+CTXT_) ELSE CALL CHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, INFO ) IF( INFO.EQ.0 ) THEN IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) MPA0 = NUMROC( M+MOD( IA-1, DESCA( MB_ ) ), DESCA( MB_ ), $ MYROW, IAROW, NPROW ) NQA0 = NUMROC( N+MOD( JA-1, DESCA( NB_ ) ), DESCA( NB_ ), $ MYCOL, IACOL, NPCOL ) LWMIN = DESCA( MB_ ) * ( MPA0 + NQA0 + DESCA( MB_ ) ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.M ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.M ) THEN INFO = -3 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF END IF IDUM1( 1 ) = K IDUM2( 1 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 2 ) = -1 ELSE IDUM1( 2 ) = 1 END IF IDUM2( 2 ) = 10 CALL PCHK1MAT( M, 1, N, 2, IA, JA, DESCA, 7, 2, IDUM1, IDUM2, $ INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNGRQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.LE.0 ) $ RETURN * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 IN = MIN( ICEIL( IA+M-K, DESCA( MB_ ) )*DESCA( MB_ ), IA+M-1 ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) * * Set A(ia:in,ja+n-m+in-ia+1:ja-n+1) to zero. * CALL PZLASET( 'All', IN-IA+1, M-IN+IA-1, ZERO, ZERO, A, IA, $ JA+N-M+IN-IA+1, DESCA ) * * Use unblocked code for the first or only block. * CALL PZUNGR2( IN-IA+1, N-M+IN-IA+1, IN-IA-M+K+1, A, IA, JA, DESCA, $ TAU, WORK, LWORK, IINFO ) * * Use blocked code * DO 10 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PZLARFT( 'Backward', 'Rowwise', N-M+I+IB-IA, IB, A, I, JA, $ DESCA, TAU, WORK, WORK( IPW ) ) * * Apply H' to A(ia:i-1,ja:ja+n-m+i+ib-ia-1) from the right * CALL PZLARFB( 'Right', 'Conjugate transpose', 'Backward', $ 'Rowwise', I-IA, N-M+I+IB-IA, IB, A, I, JA, $ DESCA, WORK, A, IA, JA, DESCA, WORK( IPW ) ) * * Apply H' to columns ja:ja+n-m+i+ib-ia-1 of current block * CALL PZUNGR2( IB, N-M+I+IB-IA, IB, A, I, JA, DESCA, TAU, WORK, $ LWORK, IINFO ) * * Set rows i:i+ib-1,ja+n-m+i+ib-ia:ja+n-1 of current block to * zero * CALL PZLASET( 'All', IB, M-I-IB+IA, ZERO, ZERO, A, I, $ JA+N-M+I+IB-IA, DESCA ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNGRQ * END scalapack-2.0.2/SRC/pzunm2l.f000644 000766 000024 00000041465 10363532303 016126 0ustar00juliestaff000000 000000 SUBROUTINE PZUNM2L( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNM2L overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PZGEQLF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PZGEQLF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ), * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PZGEQLF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( 1, NqC0 ); * if SIDE = 'R', LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( * NUMROC( N+ICOFFC,NB_A,0,0,NPCOL ),NB_A,0,0,LCMQ ) ); * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICCOL, ICOFFC, ICROW, ICTXT, ICC, $ II, IROFFA, IROFFC, J, J1, J2, J3, JCC, JJ, $ LCM, LCMQ, LWMIN, MI, MP, MPC0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 COMPLEX*16 AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZELSET2, PZLARF, PZLARFC, ZGEBR2D, ZGEBS2D, $ ZGERV2D, ZGESD2D, ZSCAL * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MPC0 + MAX( 1, NQC0 ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = NQC0 + MAX( MAX( 1, MPC0 ), NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ) ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNM2L', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, ICC, $ JCC, ICROW, ICCOL ) IF( LEFT ) THEN IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JC+N-1, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) THEN IF( NOTRAN ) THEN AJJ = ONE - TAU( JJ ) ELSE AJJ = ONE - DCONJG( TAU( JJ ) ) END IF CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1 ) CALL ZSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1, $ IAROW, IACOL ) CALL ZSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN IF( NOTRAN ) THEN AJJ = ONE - TAU( JJ ) ELSE AJJ = ONE - DCONJG( TAU( JJ ) ) END IF END IF * IF( IACOL.NE.ICCOL ) THEN IF( MYCOL.EQ.IACOL ) $ CALL ZGESD2D( ICTXT, 1, 1, AJJ, 1, MYROW, ICCOL ) IF( MYCOL.EQ.ICCOL ) $ CALL ZGERV2D( ICTXT, 1, 1, AJJ, 1, MYROW, IACOL ) END IF * IF( MYCOL.EQ.ICCOL ) THEN MP = NUMROC( IC+M-1, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) CALL ZSCAL( MP-ICC+1, AJJ, C( ICC+(JCC-1)* $ DESCC( LLD_ ) ), 1 ) END IF * END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( LEFT .AND. NOTRAN .OR. .NOT.LEFT .AND. .NOT.NOTRAN ) THEN J1 = JA J2 = JA+K-1 J3 = 1 ELSE J1 = JA+K-1 J2 = JA J3 = -1 END IF * IF( LEFT ) THEN NI = N IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M END IF * DO 10 J = J1, J2, J3 * IF( LEFT ) THEN * * H(j) or H(j)' is applied to C(ic:ic+m-k+j-ja,jc:jc+n-1) * MI = M - K + J - JA + 1 ELSE * * H(j) or H(j)' is applied to C(ic:ic+m-1,jc:jc+n-k+j-ja) * NI = N - K + J - JA + 1 END IF * * Apply H(j) or H(j)' * CALL PZELSET2( AJJ, A, IA+NQ-K+J-JA, J, DESCA, ONE ) IF( NOTRAN ) THEN CALL PZLARF( SIDE, MI, NI, A, IA, J, DESCA, 1, TAU, C, $ IC, JC, DESCC, WORK ) ELSE CALL PZLARFC( SIDE, MI, NI, A, IA, J, DESCA, 1, TAU, C, $ IC, JC, DESCC, WORK ) END IF CALL PZELSET( A, IA+NQ-K+J-JA, J, DESCA, AJJ ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNM2L * END scalapack-2.0.2/SRC/pzunm2r.f000644 000766 000024 00000041632 10363532303 016130 0ustar00juliestaff000000 000000 SUBROUTINE PZUNM2R( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNM2R overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of k elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PZGEQRF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PZGEQRF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ); * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PZGEQRF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( 1, NqC0 ); * if SIDE = 'R', LWORK >= NqC0 + MAX( MAX( 1, MpC0 ), NUMROC( * NUMROC( N+ICOFFC,NB_A,0,0,NPCOL ),NB_A,0,0,LCMQ ) ); * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IACOL, IAROW, ICCOL, ICOFFC, ICROW, ICTXT, ICC, $ II, IROFFA, IROFFC, J, J1, J2, J3, JCC, JJ, $ LCM, LCMQ, LWMIN, MI, MP, MPC0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 COMPLEX*16 AJJ * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, INFOG2L, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZELSET2, PZLARF, PZLARFC, ZGEBR2D, ZGEBS2D, $ ZGERV2D, ZGESD2D, ZSCAL * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MPC0 + MAX( 1, NQC0 ) ELSE LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = NQC0 + MAX( MAX( 1, MPC0 ), NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ) ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNM2R', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * IF( DESCA( M_ ).EQ.1 ) THEN CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, IAROW, IACOL ) CALL INFOG2L( IC, JC, DESCC, NPROW, NPCOL, MYROW, MYCOL, ICC, $ JCC, ICROW, ICCOL ) IF( LEFT ) THEN IF( MYROW.EQ.IAROW ) THEN NQ = NUMROC( JC+N-1, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) IF( MYCOL.EQ.IACOL ) THEN IF( NOTRAN ) THEN AJJ = ONE - TAU( JJ ) ELSE AJJ = ONE - DCONJG( TAU( JJ ) ) END IF CALL ZGEBS2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1 ) CALL ZSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) ELSE CALL ZGEBR2D( ICTXT, 'Rowwise', ' ', 1, 1, AJJ, 1, $ IAROW, IACOL ) CALL ZSCAL( NQ-JCC+1, AJJ, $ C( ICC+(JCC-1)*DESCC( LLD_ ) ), $ DESCC( LLD_ ) ) END IF END IF ELSE IF( MYCOL.EQ.IACOL ) THEN IF( NOTRAN ) THEN AJJ = ONE - TAU( JJ ) ELSE AJJ = ONE - DCONJG( TAU( JJ ) ) END IF END IF * IF( IACOL.NE.ICCOL ) THEN IF( MYCOL.EQ.IACOL ) $ CALL ZGESD2D( ICTXT, 1, 1, AJJ, 1, MYROW, ICCOL ) IF( MYCOL.EQ.ICCOL ) $ CALL ZGERV2D( ICTXT, 1, 1, AJJ, 1, MYROW, IACOL ) END IF * IF( MYCOL.EQ.ICCOL ) THEN MP = NUMROC( IC+M-1, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) CALL ZSCAL( MP-ICC+1, AJJ, C( ICC+(JCC-1)* $ DESCC( LLD_ ) ), 1 ) END IF * END IF * ELSE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) THEN J1 = JA J2 = JA+K-1 J3 = 1 ELSE J1 = JA+K-1 J2 = JA J3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M ICC = IC END IF * DO 10 J = J1, J2, J3 IF( LEFT ) THEN * * H(j) or H(j)' is applied to C(ic+j-ja:ic+m-1,jc:jc+n-1) * MI = M - J + JA ICC = IC + J - JA ELSE * * H(j) or H(j)' is applied to C(ic:ic+m-1,jc+j-ja:jc+n-1) * NI = N - J + JA JCC = JC + J - JA END IF * * Apply H(j) or H(j)' * CALL PZELSET2( AJJ, A, IA+J-JA, J, DESCA, ONE ) IF( NOTRAN ) THEN CALL PZLARF( SIDE, MI, NI, A, IA+J-JA, J, DESCA, 1, TAU, $ C, ICC, JCC, DESCC, WORK ) ELSE CALL PZLARFC( SIDE, MI, NI, A, IA+J-JA, J, DESCA, 1, TAU, $ C, ICC, JCC, DESCC, WORK ) END IF CALL PZELSET( A, IA+J-JA, J, DESCA, AJJ ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNM2R * END scalapack-2.0.2/SRC/pzunmbr.f000644 000766 000024 00000054374 10363532303 016217 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMBR( VECT, SIDE, TRANS, M, N, K, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, VECT INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * If VECT = 'Q', PZUNMBR overwrites the general complex distributed * M-by-N matrix sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * If VECT = 'P', PZUNMBR overwrites sub( C ) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': P * sub( C ) sub( C ) * P * TRANS = 'C': P**H * sub( C ) sub( C ) * P**H * * Here Q and P**H are the unitary distributed matrices determined by * PZGEBRD when reducing a complex distributed matrix A(IA:*,JA:*) to * bidiagonal form: A(IA:*,JA:*) = Q * B * P**H. Q and P**H are defined * as products of elementary reflectors H(i) and G(i) respectively. * * Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the * order of the unitary matrix Q or P**H that is applied. * * If VECT = 'Q', A(IA:*,JA:*) is assumed to have been an NQ-by-K * matrix: * if nq >= k, Q = H(1) H(2) . . . H(k); * if nq < k, Q = H(1) H(2) . . . H(nq-1). * * If VECT = 'P', A(IA:*,JA:*) is assumed to have been a K-by-NQ * matrix: * if k < nq, P = G(1) G(2) . . . G(k); * if k >= nq, P = G(1) G(2) . . . G(nq-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * VECT (global input) CHARACTER * = 'Q': apply Q or Q**H; * = 'P': apply P or P**H. * * SIDE (global input) CHARACTER * = 'L': apply Q, Q**H, P or P**H from the Left; * = 'R': apply Q, Q**H, P or P**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q or P; * = 'C': Conjugate transpose, apply Q**H or P**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * If VECT = 'Q', the number of columns in the original * distributed matrix reduced by PZGEBRD. * If VECT = 'P', the number of rows in the original * distributed matrix reduced by PZGEBRD. * K >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+MIN(NQ,K)-1)) if * VECT='Q', and (LLD_A,LOCc(JA+NQ-1)) if VECT = 'P'. NQ = M * if SIDE = 'L', and NQ = N otherwise. The vectors which * define the elementary reflectors H(i) and G(i), whose * products determine the matrices Q and P, as returned by * PZGEBRD. * If VECT = 'Q', LLD_A >= max(1,LOCr(IA+NQ-1)); * if VECT = 'P', LLD_A >= max(1,LOCr(IA+MIN(NQ,K)-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16 array, dimension * LOCc(JA+MIN(NQ,K)-1) if VECT = 'Q', LOCr(IA+MIN(NQ,K)-1) if * VECT = 'P', TAU(i) must contain the scalar factor of the * elementary reflector H(i) or G(i), which determines Q or P, * as returned by PDGEBRD in its array argument TAUQ or TAUP. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, if VECT='Q', sub( C ) is overwritten by Q*sub( C ) * or Q'*sub( C ) or sub( C )*Q' or sub( C )*Q; if VECT='P, * sub( C ) is overwritten by P*sub( C ) or P'*sub( C ) or * sub( C )*P or sub( C )*P'. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * NQ = M; * if( (VECT = 'Q' and NQ >= K) or (VECT <> 'Q' and NQ > K) ), * IAA=IA; JAA=JA; MI=M; NI=N; ICC=IC; JCC=JC; * else * IAA=IA+1; JAA=JA; MI=M-1; NI=N; ICC=IC+1; JCC=JC; * end if * else if SIDE = 'R', * NQ = N; * if( (VECT = 'Q' and NQ >= K) or (VECT <> 'Q' and NQ > K) ), * IAA=IA; JAA=JA; MI=M; NI=N; ICC=IC; JCC=JC; * else * IAA=IA; JAA=JA+1; MI=M; NI=N-1; ICC=IC; JCC=JC+1; * end if * end if * * If VECT = 'Q', * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * else if VECT <> 'Q', * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( MI+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * end if * * where LCMP = LCM / NPROW, LCMQ = LCM / NPCOL, with * LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JAA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( MI+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If VECT = 'Q', * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * else * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * end if * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL APPLYQ, LEFT, LQUERY, NOTRAN CHARACTER TRANST INTEGER IAA, IACOL, IAROW, ICC, ICCOL, ICOFFA, ICOFFC, $ ICROW, ICTXT, IINFO, IROFFA, IROFFC, JAA, JCC, $ LCM, LCMP, LCMQ, LWMIN, MI, MPC0, MQA0, MYCOL, $ MYROW, NI, NPA0, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK1MAT, PXERBLA, $ PZUNMLQ, PZUNMQR * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(1000+CTXT_) ELSE APPLYQ = LSAME( VECT, 'Q' ) LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q or P * IF( LEFT ) THEN NQ = M IF( ( APPLYQ .AND. NQ.GE.K ) .OR. $ ( .NOT.APPLYQ .AND. NQ.GT.K ) ) THEN IAA = IA JAA = JA MI = M NI = N ICC = IC JCC = JC ELSE IAA = IA + 1 JAA = JA MI = M - 1 NI = N ICC = IC + 1 JCC = JC END IF * IF( APPLYQ ) THEN CALL CHK1MAT( M, 4, K, 6, IA, JA, DESCA, 10, INFO ) ELSE CALL CHK1MAT( K, 6, M, 4, IA, JA, DESCA, 10, INFO ) END IF ELSE NQ = N IF( ( APPLYQ .AND. NQ.GE.K ) .OR. $ ( .NOT.APPLYQ .AND. NQ.GT.K ) ) THEN IAA = IA JAA = JA MI = M NI = N ICC = IC JCC = JC ELSE IAA = IA JAA = JA + 1 MI = M NI = N - 1 ICC = IC JCC = JC + 1 END IF * IF( APPLYQ ) THEN CALL CHK1MAT( N, 5, K, 6, IA, JA, DESCA, 10, INFO ) ELSE CALL CHK1MAT( K, 6, N, 5, IA, JA, DESCA, 10, INFO ) END IF END IF CALL CHK1MAT( M, 4, N, 5, IC, JC, DESCC, 15, INFO ) * IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) ICOFFA = MOD( JAA-1, DESCA( NB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JAA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( APPLYQ ) THEN IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF ELSE * IF( LEFT ) THEN MQA0 = NUMROC( MI+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ MI+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.APPLYQ .AND. .NOT.LSAME( VECT, 'P' ) ) THEN INFO = -1 ELSE IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -2 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -3 ELSE IF( K.LT.0 ) THEN INFO = -6 ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( APPLYQ .AND. LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( APPLYQ .AND. LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -13 ELSE IF( .NOT.APPLYQ .AND. LEFT .AND. $ ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ IROFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( APPLYQ .AND. LEFT .AND. $ DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( .NOT.APPLYQ .AND. LEFT .AND. $ DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( .NOT.APPLYQ .AND. .NOT.LEFT .AND. $ DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF * IF( APPLYQ ) THEN IDUM1( 1 ) = ICHAR( 'Q' ) ELSE IDUM1( 1 ) = ICHAR( 'P' ) END IF IDUM2( 1 ) = 1 IF( LEFT ) THEN IDUM1( 2 ) = ICHAR( 'L' ) ELSE IDUM1( 2 ) = ICHAR( 'R' ) END IF IDUM2( 2 ) = 2 IF( NOTRAN ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'C' ) END IF IDUM2( 3 ) = 3 IDUM1( 4 ) = K IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( APPLYQ ) THEN IF( LEFT ) THEN CALL PCHK2MAT( M, 4, K, 6, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( N, 5, K, 6, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) END IF ELSE IF( LEFT ) THEN CALL PCHK2MAT( K, 6, M, 4, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( K, 6, N, 5, IA, JA, DESCA, 10, M, 4, N, $ 5, IC, JC, DESCC, 15, 5, IDUM1, IDUM2, $ INFO ) END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMBR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( APPLYQ ) THEN * * Apply Q * IF( NQ.GE.K ) THEN * * Q was determined by a call to PZGEBRD with nq >= k * CALL PZUNMQR( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) ELSE IF( NQ.GT.1 ) THEN * * Q was determined by a call to PZGEBRD with nq < k * CALL PZUNMQR( SIDE, TRANS, MI, NI, NQ-1, A, IA+1, JA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) END IF ELSE * * Apply P * IF( NOTRAN ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF IF( NQ.GT.K ) THEN * * P was determined by a call to PZGEBRD with nq > k * CALL PZUNMLQ( SIDE, TRANST, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) ELSE IF( NQ.GT.1 ) THEN * * P was determined by a call to PZGEBRD with nq <= k * CALL PZUNMLQ( SIDE, TRANST, MI, NI, NQ-1, A, IA, JA+1, $ DESCA, TAU, C, ICC, JCC, DESCC, WORK, LWORK, $ IINFO ) END IF END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMBR * END scalapack-2.0.2/SRC/pzunmhr.f000644 000766 000024 00000036470 10363532303 016222 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMHR( SIDE, TRANS, M, N, ILO, IHI, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, IHI, ILO, INFO, JA, JC, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNMHR overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix of order nq, with * nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the * product of IHI-ILO elementary reflectors, as returned by PZGEHRD: * * Q = H(ilo) H(ilo+1) . . . H(ihi-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * ILO and IHI must have the same values as in the previous call * of PZGEHRD. Q is equal to the unit matrix except in the * distributed submatrix Q(ia+ilo:ia+ihi-1,ia+ilo:ja+ihi-1). * If SIDE = 'L', 1 <= ILO <= IHI <= max(1,M); * if SIDE = 'R', 1 <= ILO <= IHI <= max(1,N); * ILO and IHI are relative indexes. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE = 'R'. The vectors which * define the elementary reflectors, as returned by PZGEHRD. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JA+M-2) * if SIDE = 'L', and LOCc(JA+N-2) if SIDE = 'R'. This array * contains the scalar factors TAU(j) of the elementary * reflectors H(j) as returned by PZGEHRD. TAU is tied to * the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * * IAA = IA + ILO; JAA = JA+ILO-1; * If SIDE = 'L', * MI = IHI-ILO; NI = N; ICC = IC + ILO; JCC = JC; * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * MI = M; NI = IHI-ILO; ICC = IC; JCC = JC + ILO; * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN INTEGER IAA, IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, $ IINFO, IROFFA, IROFFC, JAA, JCC, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NH, NI, NPA0, $ NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PXERBLA, $ PZUNMQR * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 NH = IHI - ILO IF( NPROW.EQ.-1 ) THEN INFO = -(1000+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) IAA = IA + ILO JAA = JA + ILO - 1 * * NQ is the order of Q * IF( LEFT ) THEN NQ = M MI = NH NI = N ICC = IC + ILO JCC = JC CALL CHK1MAT( M, 3, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N MI = M NI = NH ICC = IC JCC = JC + ILO CALL CHK1MAT( N, 4, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( ILO.LT.1 .OR. ILO.GT.MAX( 1, NQ ) ) THEN INFO = -5 ELSE IF( IHI.LT.MIN( ILO, NQ ) .OR. IHI.GT.NQ ) THEN INFO = -6 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1500+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = ILO IDUM2( 3 ) = 5 IDUM1( 4 ) = IHI IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, M, 3, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, N, 4, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMHR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. NH.EQ.0 ) $ RETURN * CALL PZUNMQR( SIDE, TRANS, MI, NI, NH, A, IAA, JAA, DESCA, TAU, $ C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMHR * END scalapack-2.0.2/SRC/pzunml2.f000644 000766 000024 00000036246 10363532303 016127 0ustar00juliestaff000000 000000 SUBROUTINE PZUNML2( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNML2 overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(k)' . . . H(2)' H(1)' * * as returned by PZGELQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= max(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PZGELQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZGELQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IROFFC, JCC, LCM, LCMP, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPCOL, $ NPROW, NQ, NQC0 COMPLEX*16 AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZELSET2, PZLACGV, PZLARF, PZLARFC * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNML2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN .OR. .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K -1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC ELSE MI = M ICC = IC CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(i:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H(i) or H(i)' * IF( I-IA+1.LT.NQ ) $ CALL PZLACGV( NQ-I+IA-1, A, I, JA+I-IA+1, DESCA, $ DESCA( M_ ) ) CALL PZELSET2( AII, A, I, JA+I-IA, DESCA, ONE ) IF( NOTRAN ) THEN CALL PZLARFC( SIDE, MI, NI, A, I, JA+I-IA, DESCA, $ DESCA( M_ ), TAU, C, ICC, JCC, DESCC, WORK ) ELSE CALL PZLARF( SIDE, MI, NI, A, I, JA+I-IA, DESCA, $ DESCA( M_ ), TAU, C, ICC, JCC, DESCC, WORK ) END IF CALL PZELSET( A, I, JA+I-IA, DESCA, AII ) IF( I-IA+1.LT.NQ ) $ CALL PZLACGV( NQ-I+IA-1, A, I, JA+I-IA+1, DESCA, $ DESCA( M_ ) ) * 10 CONTINUE * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNML2 * END scalapack-2.0.2/SRC/pzunmlq.f000644 000766 000024 00000042076 10363532303 016224 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMLQ( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNMLQ overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(k)' . . . H(2)' H(1)' * * as returned by PZGELQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= max(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PZGELQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZGELQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, JCC, $ LCM, LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, MYROW, $ NI, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZLARFB, PZLARFT, $ PZUNML2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMLQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC ELSE MI = M ICC = IC CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. NOTRAN ) .OR. ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) $ CALL PZUNML2( SIDE, TRANS, M, N, I1-IA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * IPW = DESCA( MB_ ) * DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i) H(i+1) . . . H(i+ib-1) * CALL PZLARFT( 'Forward', 'Rowwise', NQ-I+IA, IB, A, I, JA+I-IA, $ DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+i-ia:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H or H' * CALL PZLARFB( SIDE, TRANST, 'Forward', 'Rowwise', MI, NI, IB, $ A, I, JA+I-IA, DESCA, WORK, C, ICC, JCC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. ( .NOT.LEFT .AND. NOTRAN ) ) $ CALL PZUNML2( SIDE, TRANS, M, N, I2-IA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMLQ * END scalapack-2.0.2/SRC/pzunmql.f000644 000766 000024 00000042324 10363532303 016220 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMQL( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNMQL overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(k) . . . H(2) H(1) * * as returned by PZGEQLF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PZGEQLF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ), * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JA+N-1) * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PZGEQLF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( N+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IAROW, ICCOL, ICOFFC, ICROW, ICTXT, IINFO, IPW, $ IROFFA, IROFFC, J, J1, J2, J3, JB, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPA0, NPCOL, $ NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZLARFB, PZLARFT, $ PZUNM2L * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMQL', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN J1 = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+K-1 ) + 1 J2 = JA+K-1 J3 = DESCA( NB_ ) ELSE J1 = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) J2 = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+K-1 ) + 1 J3 = -DESCA( NB_ ) END IF * IF( LEFT ) THEN NI = N IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M END IF * * Use unblocked code for the first block if necessary * IF( ( LEFT .AND. NOTRAN ) .OR. $ ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) THEN JB = J1 - JA IF( LEFT ) THEN MI = M - K + JB ELSE NI = N - K + JB END IF CALL PZUNM2L( SIDE, TRANS, MI, NI, JB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 DO 10 J = J1, J2, J3 JB = MIN( DESCA( NB_ ), K-J+JA ) * * Form the triangular factor of the block reflector * H = H(j+jb-1) . . . H(j+1) H(j) * CALL PZLARFT( 'Backward', 'Columnwise', NQ-K+J+JB-JA, JB, $ A, IA, J, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic:ic+m-k+j+jb-ja-1,jc:jc+n-1) * MI = M - K + J + JB - JA ELSE * * H or H' is applied to C(ic:ic+m-1,jc:jc+n-k+j+jb-ja-1) * NI = N - K + J + JB - JA END IF * * Apply H or H' * CALL PZLARFB( SIDE, TRANS, 'Backward', 'Columnwise', MI, NI, $ JB, A, IA, J, DESCA, WORK, C, IC, JC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN JB = J2 - JA IF( LEFT ) THEN MI = M - K + JB ELSE NI = N - K + JB END IF CALL PZUNM2L( SIDE, TRANS, MI, NI, JB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMQL * END scalapack-2.0.2/SRC/pzunmqr.f000644 000766 000024 00000042142 10363532303 016224 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMQR( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNMQR overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of k elementary reflectors * * Q = H(1) H(2) . . . H(k) * * as returned by PZGEQRF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+K-1)). On entry, the * j-th column must contain the vector which defines the elemen- * tary reflector H(j), JA <= j <= JA+K-1, as returned by * PZGEQRF in the K columns of its distributed matrix * argument A(IA:*,JA:JA+K-1). A(IA:*,JA:JA+K-1) is modified by * the routine but restored on exit. * If SIDE = 'L', LLD_A >= MAX( 1, LOCr(IA+M-1) ); * if SIDE = 'R', LLD_A >= MAX( 1, LOCr(IA+N-1) ). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(JA+K-1). * This array contains the scalar factors TAU(j) of the * elementary reflectors H(j) as returned by PZGEQRF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( N+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, IINFO, $ IPW, IROFFA, IROFFC, J, J1, J2, J3, JB, JCC, $ LCM, LCMQ, LWMIN, MI, MPC0, MYCOL, MYROW, NI, $ NPA0, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZLARFB, PZLARFT, $ PZUNM2R * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( M, 3, K, 5, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( N, 4, K, 5, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IA-1, DESCA( MB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( N+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ N+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( M, 3, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( N, 4, K, 5, IA, JA, DESCA, 9, M, 3, N, 4, IC, $ JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMQR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN J1 = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) $ + 1 J2 = JA+K-1 J3 = DESCA( NB_ ) ELSE J1 = MAX( ( (JA+K-2) / DESCA( NB_ ) ) * DESCA( NB_ ) + 1, JA ) J2 = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+K-1 ) $ + 1 J3 = -DESCA( NB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'D-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', 'I-ring' ) END IF CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', ' ' ) ELSE MI = M ICC = IC END IF * * Use unblocked code for the first block if necessary * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. ( .NOT.LEFT .AND. NOTRAN ) ) $ CALL PZUNM2R( SIDE, TRANS, M, N, J1-JA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * IPW = DESCA( NB_ ) * DESCA( NB_ ) + 1 DO 10 J = J1, J2, J3 JB = MIN( DESCA( NB_ ), K-J+JA ) * * Form the triangular factor of the block reflector * H = H(j) H(j+1) . . . H(j+jb-1) * CALL PZLARFT( 'Forward', 'Columnwise', NQ-J+JA, JB, A, $ IA+J-JA, J, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+j-ja:ic+m-1,jc:jc+n-1) * MI = M - J + JA ICC = IC + J - JA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+j-ja:jc+n-1) * NI = N - J + JA JCC = JC + J - JA END IF * * Apply H or H' * CALL PZLARFB( SIDE, TRANS, 'Forward', 'Columnwise', MI, NI, $ JB, A, IA+J-JA, J, DESCA, WORK, C, ICC, JCC, $ DESCC, WORK( IPW ) ) 10 CONTINUE * * Use unblocked code for the last block if necessary * IF( ( LEFT .AND. NOTRAN ) .OR. ( .NOT.LEFT .AND. .NOT.NOTRAN ) ) $ CALL PZUNM2R( SIDE, TRANS, M, N, J2-JA, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMQR * END scalapack-2.0.2/SRC/pzunmr2.f000644 000766 000024 00000035341 10363532303 016130 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMR2( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNMR2 overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PZGERQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PZGERQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZGERQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICCOL, ICOFFA, ICOFFC, $ ICROW, ICTXT, IROFFC, LCM, LCMP, LWMIN, MI, $ MPC0, MYCOL, MYROW, NI, NPCOL, NPROW, NQ, NQC0 COMPLEX*16 AII * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZELSET, $ PZELSET2, PZLACGV, PZLARF, PZLARFC * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMR2', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K - 1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N ELSE MI = M CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(ic:ic+m-k+i-ia,jc:jc+n-1) * MI = M - K + I - IA + 1 ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc:jc+n-k+i-ia+1) * NI = N - K + I - IA + 1 END IF * * Apply H(i) or H(i)' * CALL PZLACGV( NQ-K+I-IA, A, I, JA, DESCA, DESCA( M_ ) ) CALL PZELSET2( AII, A, I, JA+NQ-K+I-IA, DESCA, ONE ) IF( NOTRAN ) THEN CALL PZLARFC( SIDE, MI, NI, A, I, JA, DESCA, DESCA( M_ ), $ TAU, C, IC, JC, DESCC, WORK ) ELSE CALL PZLARF( SIDE, MI, NI, A, I, JA, DESCA, DESCA( M_ ), $ TAU, C, IC, JC, DESCC, WORK ) END IF CALL PZELSET( A, I, JA+NQ-K+I-IA, DESCA, AII ) CALL PZLACGV( NQ-K+I-IA, A, I, JA, DESCA, DESCA( M_ ) ) * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMR2 * END scalapack-2.0.2/SRC/pzunmr3.f000644 000766 000024 00000035466 10363532303 016141 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMR3( SIDE, TRANS, M, N, K, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, L, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNMR3 overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PZTZRZF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PZTZRZF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZTZRZF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * If SIDE = 'L', LWORK >= MpC0 + MAX( MAX( 1, NqC0 ), NUMROC( * NUMROC( M+IROFFC,MB_A,0,0,NPROW ),MB_A,0,0,LCMP ) ); * if SIDE = 'R', LWORK >= NqC0 + MAX( 1, MpC0 ); * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP INTEGER I, I1, I2, I3, IACOL, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IROFFC, JAA, JCC, LCM, $ LCMP, LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPCOL, $ NPROW, NQ, NQC0 * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GRIDINFO, CHK1MAT, $ PB_TOPGET, PB_TOPSET, PXERBLA, PZLARZ, PZLARZC * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MPC0 + MAX( MAX( 1, NQC0 ), NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ) ) ELSE LWMIN = NQC0 + MAX( 1, MPC0 ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( L.LT.0 .OR. L.GT.NQ ) THEN INFO = -6 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMR3', -INFO ) CALL BLACS_ABORT( ICTXT, 1 ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN .OR. .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = IA I2 = IA + K - 1 I3 = 1 ELSE I1 = IA + K - 1 I2 = IA I3 = -1 END IF * IF( LEFT ) THEN NI = N JCC = JC JAA = JA + M - L ELSE MI = M ICC = IC JAA = JA + N - L CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * DO 10 I = I1, I2, I3 IF( LEFT ) THEN * * H(i) or H(i)' is applied to C(ic+i-ia:icc+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H(i) or H(i)' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H(i) or H(i)' * IF( NOTRAN ) THEN CALL PZLARZ( SIDE, MI, NI, L, A, I, JAA, DESCA, DESCA( M_ ), $ TAU, C, ICC, JCC, DESCC, WORK ) ELSE CALL PZLARZC( SIDE, MI, NI, L, A, I, JAA, DESCA, $ DESCA( M_ ), TAU, C, ICC, JCC, DESCC, WORK ) END IF * 10 CONTINUE * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMR3 * END scalapack-2.0.2/SRC/pzunmrq.f000644 000766 000024 00000043114 11663037655 016242 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMRQ( SIDE, TRANS, M, N, K, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNMRQ overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PZGERQF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PZGERQF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZGERQF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN, RIGHT, TRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, LCM, $ LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, MYROW, NI, $ NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZLARFB, PZLARFT, $ PZUNMR2 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE IF( LSAME( SIDE, 'L' ) ) THEN LEFT = .TRUE. RIGHT = .FALSE. ELSE LEFT = .FALSE. RIGHT = .TRUE. END IF IF( LSAME( TRANS, 'N' ) ) THEN NOTRAN = .TRUE. TRAN = .FALSE. ELSE NOTRAN = .FALSE. TRAN = .TRUE. END IF * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 9, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 9, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1400+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 9, M, 3, N, 4, $ IC, JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 9, M, 3, N, 4, $ IC, JC, DESCC, 14, 4, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMRQ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N ELSE MI = M CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I1 - IA IF( LEFT ) THEN MI = M - K + IB ELSE NI = N - K + IB END IF CALL PZUNMR2( SIDE, TRANS, MI, NI, IB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PZLARFT( 'Backward', 'Rowwise', NQ-K+I+IB-IA, IB, $ A, I, JA, DESCA, TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic:ic+m-k+i+ib-ia-1,jc:jc+n-1) * MI = M - K + I + IB - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc:jc+n-k+i+ib-ia-1) * NI = N - K + I + IB - IA END IF * * Apply H or H' * CALL PZLARFB( SIDE, TRANST, 'Backward', 'Rowwise', MI, NI, $ IB, A, I, JA, DESCA, WORK, C, IC, JC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( RIGHT .AND. TRAN ) .OR. $ ( LEFT .AND. NOTRAN ) ) THEN IB = I2 - IA IF( LEFT ) THEN MI = M - K + IB ELSE NI = N - K + IB END IF CALL PZUNMR2( SIDE, TRANS, MI, NI, IB, A, IA, JA, DESCA, TAU, $ C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMRQ * END scalapack-2.0.2/SRC/pzunmrz.f000644 000766 000024 00000043406 10363532303 016241 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMRZ( SIDE, TRANS, M, N, K, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 25, 2001 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS INTEGER IA, IC, INFO, JA, JC, K, L, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNMRZ overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix defined as the * product of K elementary reflectors * * Q = H(1)' H(2)' . . . H(k)' * * as returned by PZTZRZF. Q is of order M if SIDE = 'L' and of order N * if SIDE = 'R'. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * K (global input) INTEGER * The number of elementary reflectors whose product defines the * matrix Q. If SIDE = 'L', M >= K >= 0, if SIDE = 'R', * N >= K >= 0. * * L (global input) INTEGER * The columns of the distributed submatrix sub( A ) containing * the meaningful part of the Householder reflectors. * If SIDE = 'L', M >= L >= 0, if SIDE = 'R', N >= L >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * and (LLD_A,LOCc(JA+N-1)) if SIDE='R', where * LLD_A >= MAX(1,LOCr(IA+K-1)); On entry, the i-th row must * contain the vector which defines the elementary reflector * H(i), IA <= i <= IA+K-1, as returned by PZTZRZF in the * K rows of its distributed matrix argument A(IA:IA+K-1,JA:*). * A(IA:IA+K-1,JA:*) is modified by the routine but restored on * exit. * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16, array, dimension LOCc(IA+K-1). * This array contains the scalar factors TAU(i) of the * elementary reflectors H(i) as returned by PZTZRZF. * TAU is tied to the distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * if SIDE = 'L', * LWORK >= MAX( (MB_A*(MB_A-1))/2, ( MpC0 + MAX( MqA0 + * NUMROC( NUMROC( M+IROFFC, MB_A, 0, 0, NPROW ), * MB_A, 0, 0, LCMP ), NqC0 ) )*MB_A ) + * MB_A * MB_A * else if SIDE = 'R', * LWORK >= MAX( (MB_A*(MB_A-1))/2, (MpC0 + NqC0)*MB_A ) + * MB_A * MB_A * end if * * where LCMP = LCM / NPROW with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IA-1, MB_A ), ICOFFA = MOD( JA-1, NB_A ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * MqA0 = NUMROC( M+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * IROFFC = MOD( IC-1, MB_C ), ICOFFC = MOD( JC-1, NB_C ), * ICROW = INDXG2P( IC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( M+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( N+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( NB_A.EQ.MB_C .AND. ICOFFA.EQ.IROFFC ) * If SIDE = 'R', * ( NB_A.EQ.NB_C .AND. ICOFFA.EQ.ICOFFC .AND. IACOL.EQ.ICCOL ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN CHARACTER COLBTOP, ROWBTOP, TRANST INTEGER I, I1, I2, I3, IACOL, IB, ICC, ICCOL, ICOFFA, $ ICOFFC, ICROW, ICTXT, IINFO, IPW, IROFFC, JAA, $ JCC, LCM, LCMP, LWMIN, MI, MPC0, MQA0, MYCOL, $ MYROW, NI, NPCOL, NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 5 ), IDUM2( 5 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PB_TOPGET, $ PB_TOPSET, PXERBLA, PZLARZB, PZLARZT, $ PZUNMR3 * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, ILCM, INDXG2P, NUMROC EXTERNAL ICEIL, ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MIN, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * NQ is the order of Q * IF( LEFT ) THEN NQ = M CALL CHK1MAT( K, 5, M, 3, IA, JA, DESCA, 10, INFO ) ELSE NQ = N CALL CHK1MAT( K, 5, N, 4, IA, JA, DESCA, 10, INFO ) END IF CALL CHK1MAT( M, 3, N, 4, IC, JC, DESCC, 15, INFO ) IF( INFO.EQ.0 ) THEN ICOFFA = MOD( JA-1, DESCA( NB_ ) ) IROFFC = MOD( IC-1, DESCC( MB_ ) ) ICOFFC = MOD( JC-1, DESCC( NB_ ) ) IACOL = INDXG2P( JA, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), $ NPCOL ) ICROW = INDXG2P( IC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( M+IROFFC, DESCC( MB_ ), MYROW, ICROW, NPROW ) NQC0 = NUMROC( N+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, NPCOL ) * IF( LEFT ) THEN MQA0 = NUMROC( M+ICOFFA, DESCA( NB_ ), MYCOL, IACOL, $ NPCOL ) LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) $ / 2, ( MPC0 + MAX( MQA0 + NUMROC( NUMROC( $ M+IROFFC, DESCA( MB_ ), 0, 0, NPROW ), $ DESCA( MB_ ), 0, 0, LCMP ), NQC0 ) ) * $ DESCA( MB_ ) ) + DESCA( MB_ ) * DESCA( MB_ ) ELSE LWMIN = MAX( ( DESCA( MB_ ) * ( DESCA( MB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( MB_ ) ) + $ DESCA( MB_ ) * DESCA( MB_ ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -5 ELSE IF( K.LT.0 .OR. K.GT.NQ ) THEN INFO = -6 ELSE IF( LEFT .AND. DESCA( NB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1000+NB_) ELSE IF( LEFT .AND. ICOFFA.NE.IROFFC ) THEN INFO = -13 ELSE IF( .NOT.LEFT .AND. ICOFFA.NE.ICOFFC ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. IACOL.NE.ICCOL ) THEN INFO = -14 ELSE IF( .NOT.LEFT .AND. DESCA( NB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(1500+NB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1500+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 END IF END IF IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( NOTRAN ) THEN IDUM1( 2 ) = ICHAR( 'N' ) ELSE IDUM1( 2 ) = ICHAR( 'C' ) END IF IDUM2( 2 ) = 2 IDUM1( 3 ) = K IDUM2( 3 ) = 5 IDUM1( 4 ) = L IDUM2( 4 ) = 6 IF( LWORK.EQ.-1 ) THEN IDUM1( 5 ) = -1 ELSE IDUM1( 5 ) = 1 END IF IDUM2( 5 ) = 17 IF( LEFT ) THEN CALL PCHK2MAT( K, 5, M, 3, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) ELSE CALL PCHK2MAT( K, 5, N, 4, IA, JA, DESCA, 10, M, 3, N, 4, $ IC, JC, DESCC, 15, 5, IDUM1, IDUM2, INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMRZ', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. K.EQ.0 ) $ RETURN * CALL PB_TOPGET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPGET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN I1 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I2 = IA + K - 1 I3 = DESCA( MB_ ) ELSE I1 = MAX( ( (IA+K-2) / DESCA( MB_ ) ) * DESCA( MB_ ) + 1, IA ) I2 = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+K-1 ) $ + 1 I3 = -DESCA( MB_ ) END IF * IF( LEFT ) THEN NI = N JCC = JC JAA = JA + M - L ELSE MI = M ICC = IC JAA = JA + N - L CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ' ' ) IF( NOTRAN ) THEN CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'I-ring' ) ELSE CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', 'D-ring' ) END IF END IF * IF( NOTRAN ) THEN TRANST = 'C' ELSE TRANST = 'N' END IF * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I1 - IA IF( LEFT ) THEN MI = M ELSE NI = N END IF CALL PZUNMR3( SIDE, TRANS, MI, NI, IB, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * IPW = DESCA( MB_ )*DESCA( MB_ ) + 1 DO 10 I = I1, I2, I3 IB = MIN( DESCA( MB_ ), K-I+IA ) * * Form the triangular factor of the block reflector * H = H(i+ib-1) . . . H(i+1) H(i) * CALL PZLARZT( 'Backward', 'Rowwise', L, IB, A, I, JAA, DESCA, $ TAU, WORK, WORK( IPW ) ) IF( LEFT ) THEN * * H or H' is applied to C(ic+i-ia:ic+m-1,jc:jc+n-1) * MI = M - I + IA ICC = IC + I - IA ELSE * * H or H' is applied to C(ic:ic+m-1,jc+i-ia:jc+n-1) * NI = N - I + IA JCC = JC + I - IA END IF * * Apply H or H' * CALL PZLARZB( SIDE, TRANST, 'Backward', 'Rowwise', MI, NI, IB, $ L, A, I, JAA, DESCA, WORK, C, ICC, JCC, DESCC, $ WORK( IPW ) ) 10 CONTINUE * IF( ( LEFT .AND. .NOT.NOTRAN ) .OR. $ ( .NOT.LEFT .AND. NOTRAN ) ) THEN IB = I2 - IA IF( LEFT ) THEN MI = M ELSE NI = N END IF CALL PZUNMR3( SIDE, TRANS, MI, NI, IB, L, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, IINFO ) END IF * CALL PB_TOPSET( ICTXT, 'Broadcast', 'Rowwise', ROWBTOP ) CALL PB_TOPSET( ICTXT, 'Broadcast', 'Columnwise', COLBTOP ) * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMRZ * END scalapack-2.0.2/SRC/pzunmtr.f000644 000766 000024 00000040463 10363532303 016233 0ustar00juliestaff000000 000000 SUBROUTINE PZUNMTR( SIDE, UPLO, TRANS, M, N, A, IA, JA, DESCA, $ TAU, C, IC, JC, DESCC, WORK, LWORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER SIDE, TRANS, UPLO INTEGER IA, IC, INFO, JA, JC, LWORK, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), TAU( * ), WORK( * ) * .. * * Purpose * ======= * * PZUNMTR overwrites the general complex M-by-N distributed matrix * sub( C ) = C(IC:IC+M-1,JC:JC+N-1) with * * SIDE = 'L' SIDE = 'R' * TRANS = 'N': Q * sub( C ) sub( C ) * Q * TRANS = 'C': Q**H * sub( C ) sub( C ) * Q**H * * where Q is a complex unitary distributed matrix of order nq, with * nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the * product of nq-1 elementary reflectors, as returned by PZHETRD: * * if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1); * * if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1). * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER * = 'L': apply Q or Q**H from the Left; * = 'R': apply Q or Q**H from the Right. * * UPLO (global input) CHARACTER * = 'U': Upper triangle of A(IA:*,JA:*) contains elementary * reflectors from PZHETRD; * = 'L': Lower triangle of A(IA:*,JA:*) contains elementary * reflectors from PZHETRD. * * TRANS (global input) CHARACTER * = 'N': No transpose, apply Q; * = 'C': Conjugate transpose, apply Q**H. * * M (global input) INTEGER * The number of rows to be operated on i.e the number of rows * of the distributed submatrix sub( C ). M >= 0. * * N (global input) INTEGER * The number of columns to be operated on i.e the number of * columns of the distributed submatrix sub( C ). N >= 0. * * A (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_A,LOCc(JA+M-1)) if SIDE='L', * or (LLD_A,LOCc(JA+N-1)) if SIDE = 'R'. The vectors which * define the elementary reflectors, as returned by PZHETRD. * If SIDE = 'L', LLD_A >= max(1,LOCr(IA+M-1)); * if SIDE = 'R', LLD_A >= max(1,LOCr(IA+N-1)). * * IA (global input) INTEGER * The row index in the global array A indicating the first * row of sub( A ). * * JA (global input) INTEGER * The column index in the global array A indicating the * first column of sub( A ). * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * TAU (local input) COMPLEX*16 array, dimension LTAU, where * if SIDE = 'L' and UPLO = 'U', LTAU = LOCc(M_A), * if SIDE = 'L' and UPLO = 'L', LTAU = LOCc(JA+M-2), * if SIDE = 'R' and UPLO = 'U', LTAU = LOCc(N_A), * if SIDE = 'R' and UPLO = 'L', LTAU = LOCc(JA+N-2). * TAU(i) must contain the scalar factor of the elementary * reflector H(i), as returned by PZHETRD. TAU is tied to the * distributed matrix A. * * C (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_C,LOCc(JC+N-1)). * On entry, the local pieces of the distributed matrix sub(C). * On exit, sub( C ) is overwritten by Q*sub( C ) or Q'*sub( C ) * or sub( C )*Q' or sub( C )*Q. * * IC (global input) INTEGER * The row index in the global array C indicating the first * row of sub( C ). * * JC (global input) INTEGER * The column index in the global array C indicating the * first column of sub( C ). * * DESCC (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix C. * * WORK (local workspace/local output) COMPLEX*16 array, * dimension (LWORK) * On exit, WORK(1) returns the minimal and optimal LWORK. * * LWORK (local or global input) INTEGER * The dimension of the array WORK. * LWORK is local input and must be at least * * If UPLO = 'U', * IAA = IA, JAA = JA+1, ICC = IC, JCC = JC; * else UPLO = 'L', * IAA = IA+1, JAA = JA; * if SIDE = 'L', * ICC = IC+1; JCC = JC; * else * ICC = IC; JCC = JC+1; * end if * end if * * If SIDE = 'L', * MI = M-1; NI = N; * LWORK >= MAX( (NB_A*(NB_A-1))/2, (NqC0 + MpC0)*NB_A ) + * NB_A * NB_A * else if SIDE = 'R', * MI = M; MI = N-1; * LWORK >= MAX( (NB_A*(NB_A-1))/2, ( NqC0 + MAX( NpA0 + * NUMROC( NUMROC( NI+ICOFFC, NB_A, 0, 0, NPCOL ), * NB_A, 0, 0, LCMQ ), MpC0 ) )*NB_A ) + * NB_A * NB_A * end if * * where LCMQ = LCM / NPCOL with LCM = ICLM( NPROW, NPCOL ), * * IROFFA = MOD( IAA-1, MB_A ), ICOFFA = MOD( JAA-1, NB_A ), * IAROW = INDXG2P( IAA, MB_A, MYROW, RSRC_A, NPROW ), * NpA0 = NUMROC( NI+IROFFA, MB_A, MYROW, IAROW, NPROW ), * * IROFFC = MOD( ICC-1, MB_C ), ICOFFC = MOD( JCC-1, NB_C ), * ICROW = INDXG2P( ICC, MB_C, MYROW, RSRC_C, NPROW ), * ICCOL = INDXG2P( JCC, NB_C, MYCOL, CSRC_C, NPCOL ), * MpC0 = NUMROC( MI+IROFFC, MB_C, MYROW, ICROW, NPROW ), * NqC0 = NUMROC( NI+ICOFFC, NB_C, MYCOL, ICCOL, NPCOL ), * * ILCM, INDXG2P and NUMROC are ScaLAPACK tool functions; * MYROW, MYCOL, NPROW and NPCOL can be determined by calling * the subroutine BLACS_GRIDINFO. * * If LWORK = -1, then LWORK is global input and a workspace * query is assumed; the routine only calculates the minimum * and optimal size for all work arrays. Each of these * values is returned in the first entry of the corresponding * work array, and no error message is issued by PXERBLA. * * * INFO (global output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * Alignment requirements * ====================== * * The distributed submatrices A(IA:*, JA:*) and C(IC:IC+M-1,JC:JC+N-1) * must verify some alignment properties, namely the following * expressions should be true: * * If SIDE = 'L', * ( MB_A.EQ.MB_C .AND. IROFFA.EQ.IROFFC .AND. IAROW.EQ.ICROW ) * If SIDE = 'R', * ( MB_A.EQ.NB_C .AND. IROFFA.EQ.ICOFFC ) * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL LEFT, LQUERY, NOTRAN, UPPER INTEGER IAA, IAROW, ICC, ICCOL, ICOFFC, ICROW, ICTXT, $ IINFO, IROFFA, IROFFC, JAA, JCC, LCM, LCMQ, $ LWMIN, MI, MPC0, MYCOL, MYROW, NI, NPA0, NPCOL, $ NPROW, NQ, NQC0 * .. * .. Local Arrays .. INTEGER IDUM1( 4 ), IDUM2( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CHK1MAT, PCHK2MAT, PXERBLA, $ PZUNMQL, PZUNMQR * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, INDXG2P, NUMROC EXTERNAL ILCM, INDXG2P, LSAME, NUMROC * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, ICHAR, MAX, MOD * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -(900+CTXT_) ELSE LEFT = LSAME( SIDE, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) UPPER = LSAME( UPLO, 'U' ) * IF( UPPER ) THEN IAA = IA JAA = JA+1 ICC = IC JCC = JC ELSE IAA = IA+1 JAA = JA IF( LEFT ) THEN ICC = IC + 1 JCC = JC ELSE ICC = IC JCC = JC + 1 END IF END IF * * NQ is the order of Q * IF( LEFT ) THEN NQ = M MI = M - 1 NI = N CALL CHK1MAT( MI, 4, NQ-1, 4, IAA, JAA, DESCA, 9, INFO ) ELSE NQ = N MI = M NI = N - 1 CALL CHK1MAT( NI, 5, NQ-1, 5, IAA, JAA, DESCA, 9, INFO ) END IF CALL CHK1MAT( MI, 4, NI, 5, ICC, JCC, DESCC, 14, INFO ) IF( INFO.EQ.0 ) THEN IROFFA = MOD( IAA-1, DESCA( MB_ ) ) IROFFC = MOD( ICC-1, DESCC( MB_ ) ) ICOFFC = MOD( JCC-1, DESCC( NB_ ) ) IAROW = INDXG2P( IAA, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), $ NPROW ) ICROW = INDXG2P( ICC, DESCC( MB_ ), MYROW, DESCC( RSRC_ ), $ NPROW ) ICCOL = INDXG2P( JCC, DESCC( NB_ ), MYCOL, DESCC( CSRC_ ), $ NPCOL ) MPC0 = NUMROC( MI+IROFFC, DESCC( MB_ ), MYROW, ICROW, $ NPROW ) NQC0 = NUMROC( NI+ICOFFC, DESCC( NB_ ), MYCOL, ICCOL, $ NPCOL ) * IF( LEFT ) THEN LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) / 2, $ ( MPC0 + NQC0 ) * DESCA( NB_ ) ) + $ DESCA( NB_ ) * DESCA( NB_ ) ELSE NPA0 = NUMROC( NI+IROFFA, DESCA( MB_ ), MYROW, IAROW, $ NPROW ) LCM = ILCM( NPROW, NPCOL ) LCMQ = LCM / NPCOL LWMIN = MAX( ( DESCA( NB_ ) * ( DESCA( NB_ ) - 1 ) ) $ / 2, ( NQC0 + MAX( NPA0 + NUMROC( NUMROC( $ NI+ICOFFC, DESCA( NB_ ), 0, 0, NPCOL ), $ DESCA( NB_ ), 0, 0, LCMQ ), MPC0 ) ) * $ DESCA( NB_ ) ) + DESCA( NB_ ) * DESCA( NB_ ) END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.LEFT .AND. .NOT.LSAME( SIDE, 'R' ) ) THEN INFO = -1 ELSE IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -2 ELSE IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = -3 ELSE IF( .NOT.LEFT .AND. DESCA( MB_ ).NE.DESCC( NB_ ) ) THEN INFO = -(900+NB_) ELSE IF( LEFT .AND. IROFFA.NE.IROFFC ) THEN INFO = -12 ELSE IF( LEFT .AND. IAROW.NE.ICROW ) THEN INFO = -12 ELSE IF( .NOT.LEFT .AND. IROFFA.NE.ICOFFC ) THEN INFO = -13 ELSE IF( LEFT .AND. DESCA( MB_ ).NE.DESCC( MB_ ) ) THEN INFO = -(1400+MB_) ELSE IF( ICTXT.NE.DESCC( CTXT_ ) ) THEN INFO = -(1400+CTXT_) ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF END IF * IF( LEFT ) THEN IDUM1( 1 ) = ICHAR( 'L' ) ELSE IDUM1( 1 ) = ICHAR( 'R' ) END IF IDUM2( 1 ) = 1 IF( UPPER ) THEN IDUM1( 2 ) = ICHAR( 'U' ) ELSE IDUM1( 2 ) = ICHAR( 'L' ) END IF IDUM2( 2 ) = 2 IF( NOTRAN ) THEN IDUM1( 3 ) = ICHAR( 'N' ) ELSE IDUM1( 3 ) = ICHAR( 'C' ) END IF IDUM2( 3 ) = 3 IF( LWORK.EQ.-1 ) THEN IDUM1( 4 ) = -1 ELSE IDUM1( 4 ) = 1 END IF IDUM2( 4 ) = 16 IF( LEFT ) THEN CALL PCHK2MAT( MI, 4, NQ-1, 4, IAA, JAA, DESCA, 9, MI, 4, $ NI, 5, ICC, JCC, DESCC, 14, 4, IDUM1, IDUM2, $ INFO ) ELSE CALL PCHK2MAT( NI, 5, NQ-1, 5, IAA, JAA, DESCA, 9, MI, 4, $ NI, 5, ICC, JCC, DESCC, 14, 4, IDUM1, IDUM2, $ INFO ) END IF END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZUNMTR', -INFO ) RETURN ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 .OR. NQ.EQ.1 ) $ RETURN * IF( UPPER ) THEN * * Q was determined by a call to PZHETRD with UPLO = 'U' * CALL PZUNMQL( SIDE, TRANS, MI, NI, NQ-1, A, IA, JAA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * ELSE * * Q was determined by a call to PZHETRD with UPLO = 'L' * CALL PZUNMQR( SIDE, TRANS, MI, NI, NQ-1, A, IAA, JAA, DESCA, $ TAU, C, ICC, JCC, DESCC, WORK, LWORK, IINFO ) * END IF * WORK( 1 ) = DCMPLX( DBLE( LWMIN ) ) * RETURN * * End of PZUNMTR * END scalapack-2.0.2/SRC/sdbtf2.f000644 000766 000024 00000012353 11657111056 015702 0ustar00juliestaff000000 000000 SUBROUTINE SDBTF2( M, N, KL, KU, AB, LDAB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Modified by Andrew J. Cleary in November, 96 from: * -- LAPACK auxiliary routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * August 6, 1991 * * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDAB, M, N * .. * .. Array Arguments .. REAL AB( LDAB, * ) * .. * * Purpose * ======= * * Sdbtrf computes an LU factorization of a real m-by-n band matrix A * without using partial pivoting with row interchanges. * * This is the unblocked version of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of subdiagonals within the band of A. KL >= 0. * * KU (input) INTEGER * The number of superdiagonals within the band of A. KU >= 0. * * AB (input/output) REAL array, dimension (LDAB,N) * On entry, the matrix A in band storage, in rows KL+1 to * 2*KL+KU+1; rows 1 to KL of the array need not be set. * The j-th column of A is stored in the j-th column of the * array AB as follows: * AB(kl+ku+1+i-j,j) = A(i,j) for max(1,j-ku)<=i<=min(m,j+kl) * * On exit, details of the factorization: U is stored as an * upper triangular band matrix with KL+KU superdiagonals in * rows 1 to KL+KU+1, and the multipliers used during the * factorization are stored in rows KL+KU+2 to 2*KL+KU+1. * See below for further details. * * LDAB (input) INTEGER * The leading dimension of the array AB. LDAB >= 2*KL+KU+1. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = +i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * Further Details * =============== * * The band storage scheme is illustrated by the following example, when * M = N = 6, KL = 2, KU = 1: * * On entry: On exit: * * * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 * a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 * a21 a32 a43 a54 a65 * m21 m32 m43 m54 m65 * * a31 a42 a53 a64 * * m31 m42 m53 m64 * * * * Array elements marked * are not used by the routine; elements marked * + need not be set on entry, but are required by the routine to store * elements of U, because of fill-in resulting from the row * interchanges. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER J, JP, JU, KM, KV * .. * .. External Functions .. INTEGER ISAMAX EXTERNAL ISAMAX * .. * .. External Subroutines .. EXTERNAL SGER, SSCAL, SSWAP * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * KV is the number of superdiagonals in the factor U, allowing for * fill-in. * KV = KU * * Test the input parameters. * INFO = 0 *ECA IF( M.LT.0 ) THEN *ECA INFO = -1 *ECA ELSE IF( N.LT.0 ) THEN *ECA INFO = -2 *ECA ELSE IF( KL.LT.0 ) THEN *ECA INFO = -3 *ECA ELSE IF( KU.LT.0 ) THEN *ECA INFO = -4 *ECA ELSE IF( LDAB.LT.KL+KV+1 ) THEN *ECA INFO = -6 *ECA END IF *ECA IF( INFO.NE.0 ) THEN *ECA CALL XERBLA( 'SDBTF2', -INFO ) *ECA RETURN *ECA END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Gaussian elimination without partial pivoting * * JU is the index of the last column affected by the current stage * of the factorization. * JU = 1 * DO 40 J = 1, MIN( M, N ) * * Test for singularity. KM is the number of * subdiagonal elements in the current column. * KM = MIN( KL, M-J ) JP = 1 IF( AB( KV+1, J ).NE.ZERO ) THEN JU = MAX( JU, MIN( J+KU, N ) ) * IF( KM.GT.0 ) THEN * * Compute multipliers. * CALL SSCAL( KM, ONE / AB( KU+1, J ), AB( KU+2, J ), 1 ) * * Update trailing submatrix within the band. * IF( JU.GT.J ) THEN CALL SGER( KM, JU-J, -ONE, AB( KU+2, J ), 1, $ AB( KU, J+1 ), LDAB-1, AB( KU+1, J+1 ), $ LDAB-1 ) END IF END IF ELSE * IF( INFO.EQ.0 ) $ INFO = J END IF 40 CONTINUE RETURN * * End of SDBTF2 * END scalapack-2.0.2/SRC/sdbtrf.f000644 000766 000024 00000025110 11657111056 015775 0ustar00juliestaff000000 000000 SUBROUTINE SDBTRF( M, N, KL, KU, AB, LDAB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * August, 1996. * Modified from SGBTRF: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * August 6, 1991 * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDAB, M, N * .. * .. Array Arguments .. REAL AB( LDAB, * ) * .. * * Purpose * ======= * * Sdbtrf computes an LU factorization of a real m-by-n band matrix A * without using partial pivoting or row interchanges. * * This is the blocked version of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of subdiagonals within the band of A. KL >= 0. * * KU (input) INTEGER * The number of superdiagonals within the band of A. KU >= 0. * * AB (input/output) REAL array, dimension (LDAB,N) * On entry, the matrix A in band storage, in rows KL+1 to * 2*KL+KU+1; rows 1 to KL of the array need not be set. * The j-th column of A is stored in the j-th column of the * array AB as follows: * AB(kl+ku+1+i-j,j) = A(i,j) for max(1,j-ku)<=i<=min(m,j+kl) * * On exit, details of the factorization: U is stored as an * upper triangular band matrix with KL+KU superdiagonals in * rows 1 to KL+KU+1, and the multipliers used during the * factorization are stored in rows KL+KU+2 to 2*KL+KU+1. * See below for further details. * * LDAB (input) INTEGER * The leading dimension of the array AB. LDAB >= 2*KL+KU+1. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = +i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * Further Details * =============== * * The band storage scheme is illustrated by the following example, when * M = N = 6, KL = 2, KU = 1: * * On entry: On exit: * * * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 * a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 * a21 a32 a43 a54 a65 * m21 m32 m43 m54 m65 * * a31 a42 a53 a64 * * m31 m42 m53 m64 * * * * Array elements marked * are not used by the routine. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0 ) PARAMETER ( ZERO = 0.0E+0 ) INTEGER NBMAX, LDWORK PARAMETER ( NBMAX = 64, LDWORK = NBMAX+1 ) * .. * .. Local Scalars .. INTEGER I, I2, I3, II, J, J2, J3, JB, JJ, JM, JP, $ JU, KM, KV, NB, NW * .. * .. Local Arrays .. REAL WORK13( LDWORK, NBMAX ), $ WORK31( LDWORK, NBMAX ) * .. * .. External Functions .. INTEGER ILAENV, ISAMAX EXTERNAL ILAENV, ISAMAX * .. * .. External Subroutines .. EXTERNAL SCOPY, SDBTF2, SGEMM, SGER, SSCAL, $ SSWAP, STRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * KV is the number of superdiagonals in the factor U * KV = KU * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( KL.LT.0 ) THEN INFO = -3 ELSE IF( KU.LT.0 ) THEN INFO = -4 ELSE IF( LDAB.LT.MIN( MIN( KL+KV+1,M ),N ) ) THEN INFO = -6 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SDBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Determine the block size for this environment * NB = ILAENV( 1, 'SDBTRF', ' ', M, N, KL, KU ) * * The block size must not exceed the limit set by the size of the * local arrays WORK13 and WORK31. * NB = MIN( NB, NBMAX ) * IF( NB.LE.1 .OR. NB.GT.KL ) THEN * * Use unblocked code * CALL SDBTF2( M, N, KL, KU, AB, LDAB, INFO ) ELSE * * Use blocked code * * Zero the superdiagonal elements of the work array WORK13 * DO 20 J = 1, NB DO 10 I = 1, J - 1 WORK13( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * * Zero the subdiagonal elements of the work array WORK31 * DO 40 J = 1, NB DO 30 I = J + 1, NB WORK31( I, J ) = ZERO 30 CONTINUE 40 CONTINUE * * JU is the index of the last column affected by the current * stage of the factorization * JU = 1 * DO 180 J = 1, MIN( M, N ), NB JB = MIN( NB, MIN( M, N )-J+1 ) * * The active part of the matrix is partitioned * * A11 A12 A13 * A21 A22 A23 * A31 A32 A33 * * Here A11, A21 and A31 denote the current block of JB columns * which is about to be factorized. The number of rows in the * partitioning are JB, I2, I3 respectively, and the numbers * of columns are JB, J2, J3. The superdiagonal elements of A13 * and the subdiagonal elements of A31 lie outside the band. * I2 = MIN( KL-JB, M-J-JB+1 ) I3 = MIN( JB, M-J-KL+1 ) * * J2 and J3 are computed after JU has been updated. * * Factorize the current block of JB columns * DO 80 JJ = J, J + JB - 1 * * Find pivot and test for singularity. KM is the number of * subdiagonal elements in the current column. * KM = MIN( KL, M-JJ ) JP = 1 IF( AB( KV+JP, JJ ).NE.ZERO ) THEN JU = MAX( JU, MIN( JJ+KU+JP-1, N ) ) * * Compute multipliers * CALL SSCAL( KM, ONE / AB( KV+1, JJ ), AB( KV+2, JJ ), $ 1 ) * * Update trailing submatrix within the band and within * the current block. JM is the index of the last column * which needs to be updated. * JM = MIN( JU, J+JB-1 ) IF( JM.GT.JJ ) THEN CALL SGER( KM, JM-JJ, -ONE, AB( KV+2, JJ ), 1, $ AB( KV, JJ+1 ), LDAB-1, $ AB( KV+1, JJ+1 ), LDAB-1 ) END IF END IF * * Copy current column of A31 into the work array WORK31 * NW = MIN( JJ-J+1, I3 ) IF( NW.GT.0 ) $ CALL SCOPY( NW, AB( KV+KL+1-JJ+J, JJ ), 1, $ WORK31( 1, JJ-J+1 ), 1 ) 80 CONTINUE IF( J+JB.LE.N ) THEN * * Apply the row interchanges to the other blocks. * J2 = MIN( JU-J+1, KV ) - JB J3 = MAX( 0, JU-J-KV+1 ) * * Update the relevant part of the trailing submatrix * IF( J2.GT.0 ) THEN * * Update A12 * CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', $ JB, J2, ONE, AB( KV+1, J ), LDAB-1, $ AB( KV+1-JB, J+JB ), LDAB-1 ) * IF( I2.GT.0 ) THEN * * Update A22 * CALL SGEMM( 'No transpose', 'No transpose', I2, J2, $ JB, -ONE, AB( KV+1+JB, J ), LDAB-1, $ AB( KV+1-JB, J+JB ), LDAB-1, ONE, $ AB( KV+1, J+JB ), LDAB-1 ) END IF * IF( I3.GT.0 ) THEN * * Update A32 * CALL SGEMM( 'No transpose', 'No transpose', I3, J2, $ JB, -ONE, WORK31, LDWORK, $ AB( KV+1-JB, J+JB ), LDAB-1, ONE, $ AB( KV+KL+1-JB, J+JB ), LDAB-1 ) END IF END IF * IF( J3.GT.0 ) THEN * * Copy the lower triangle of A13 into the work array * WORK13 * DO 130 JJ = 1, J3 DO 120 II = JJ, JB WORK13( II, JJ ) = AB( II-JJ+1, JJ+J+KV-1 ) 120 CONTINUE 130 CONTINUE * * Update A13 in the work array * CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', $ JB, J3, ONE, AB( KV+1, J ), LDAB-1, $ WORK13, LDWORK ) * IF( I2.GT.0 ) THEN * * Update A23 * CALL SGEMM( 'No transpose', 'No transpose', I2, J3, $ JB, -ONE, AB( KV+1+JB, J ), LDAB-1, $ WORK13, LDWORK, ONE, AB( 1+JB, J+KV ), $ LDAB-1 ) END IF * IF( I3.GT.0 ) THEN * * Update A33 * CALL SGEMM( 'No transpose', 'No transpose', I3, J3, $ JB, -ONE, WORK31, LDWORK, WORK13, $ LDWORK, ONE, AB( 1+KL, J+KV ), LDAB-1 ) END IF * * Copy the lower triangle of A13 back into place * DO 150 JJ = 1, J3 DO 140 II = JJ, JB AB( II-JJ+1, JJ+J+KV-1 ) = WORK13( II, JJ ) 140 CONTINUE 150 CONTINUE END IF ELSE END IF * * copy the upper triangle of A31 back into place * DO 170 JJ = J + JB - 1, J, -1 * * Copy the current column of A31 back into place * NW = MIN( I3, JJ-J+1 ) IF( NW.GT.0 ) $ CALL SCOPY( NW, WORK31( 1, JJ-J+1 ), 1, $ AB( KV+KL+1-JJ+J, JJ ), 1 ) 170 CONTINUE 180 CONTINUE END IF * RETURN * * End of SDBTRF * END scalapack-2.0.2/SRC/sdttrf.f000644 000766 000024 00000006473 11657111056 016032 0ustar00juliestaff000000 000000 SUBROUTINE SDTTRF( N, DL, D, DU, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, November 1996. * Modified from SGTTRF: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. INTEGER INFO, N * .. * .. Array Arguments .. REAL D( * ), DL( * ), DU( * ) * .. * * Purpose * ======= * * SDTTRF computes an LU factorization of a complex tridiagonal matrix A * using elimination without partial pivoting. * * The factorization has the form * A = L * U * where L is a product of unit lower bidiagonal * matrices and U is upper triangular with nonzeros in only the main * diagonal and first superdiagonal. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * DL (input/output) COMPLEX array, dimension (N-1) * On entry, DL must contain the (n-1) subdiagonal elements of * A. * On exit, DL is overwritten by the (n-1) multipliers that * define the matrix L from the LU factorization of A. * * D (input/output) COMPLEX array, dimension (N) * On entry, D must contain the diagonal elements of A. * On exit, D is overwritten by the n diagonal elements of the * upper triangular matrix U from the LU factorization of A. * * DU (input/output) COMPLEX array, dimension (N-1) * On entry, DU must contain the (n-1) superdiagonal elements * of A. * On exit, DU is overwritten by the (n-1) elements of the first * superdiagonal of U. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Local Scalars .. INTEGER I REAL FACT * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Executable Statements .. * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 CALL XERBLA( 'SDTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * DO 20 I = 1, N - 1 IF( DL( I ).EQ.ZERO ) THEN * * Subdiagonal is zero, no elimination is required. * IF( D( I ).EQ.ZERO .AND. INFO.EQ.0 ) $ INFO = I ELSE * FACT = DL( I ) / D( I ) DL( I ) = FACT D( I+1 ) = D( I+1 ) - FACT*DU( I ) END IF 20 CONTINUE IF( D( N ).EQ.ZERO .AND. INFO.EQ.0 ) THEN INFO = N RETURN END IF * RETURN * * End of SDTTRF * END scalapack-2.0.2/SRC/sdttrsv.f000644 000766 000024 00000011565 11657111056 016233 0ustar00juliestaff000000 000000 SUBROUTINE SDTTRSV( UPLO, TRANS, N, NRHS, DL, D, DU, $ B, LDB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * August, 1996. * Modified from SGTTRS: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. CHARACTER UPLO, TRANS INTEGER INFO, LDB, N, NRHS * .. * .. Array Arguments .. REAL B( LDB, * ), D( * ), DL( * ), DU( * ) * .. * * Purpose * ======= * * SDTTRSV solves one of the systems of equations * L * X = B, L**T * X = B, or L**H * X = B, * U * X = B, U**T * X = B, or U**H * X = B, * with factors of the tridiagonal matrix A from the LU factorization * computed by SDTTRF. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether to solve with L or U. * * TRANS (input) CHARACTER * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * DL (input) COMPLEX array, dimension (N-1) * The (n-1) multipliers that define the matrix L from the * LU factorization of A. * * D (input) COMPLEX array, dimension (N) * The n diagonal elements of the upper triangular matrix U from * the LU factorization of A. * * DU (input) COMPLEX array, dimension (N-1) * The (n-1) elements of the first superdiagonal of U. * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, B is overwritten by the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Local Scalars .. LOGICAL LOWER, NOTRAN INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) LOWER = LSAME( UPLO, 'L' ) IF( .NOT.LOWER .AND. .NOT.LSAME( UPLO, 'U' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( N, 1 ) ) THEN INFO = -9 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SDTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( NOTRAN ) THEN * IF( LOWER ) THEN * Solve L*X = B, overwriting B with X. * DO 35 J = 1, NRHS * * Solve L*x = b. * DO 10 I = 1, N - 1 B( I+1, J ) = B( I+1, J ) - DL( I )*B( I, J ) 10 CONTINUE 35 CONTINUE * ELSE * Solve U*x = b. * DO 30 J = 1, NRHS B( N, J ) = B( N, J ) / D( N ) IF( N.GT.1 ) $ B( N-1, J ) = ( B( N-1, J )-DU( N-1 )*B( N, J ) ) / $ D( N-1 ) DO 20 I = N - 2, 1, -1 B( I, J ) = ( B( I, J )-DU( I )*B( I+1, J ) ) / D( I ) 20 CONTINUE 30 CONTINUE * ENDIF * ELSE * IF( .NOT. LOWER ) THEN * Solve U**T * X = B, overwriting B with X. * DO 65 J = 1, NRHS * * Solve U**T * x = b. * B( 1, J ) = B( 1, J ) / D( 1 ) IF( N.GT.1 ) $ B( 2, J ) = ( B( 2, J )-DU( 1 )*B( 1, J ) ) / D( 2 ) DO 40 I = 3, N B( I, J ) = ( B( I, J )-DU( I-1 )*B( I-1, J ) ) / D( I ) 40 CONTINUE 65 CONTINUE * ELSE * * Solve L**T * X = B, overwriting B with X. DO 60 J = 1, NRHS * * Solve L**T * x = b. * DO 50 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - DL( I )*B( I+1, J ) 50 CONTINUE 60 CONTINUE ENDIF END IF * * End of SDTTRSV * END scalapack-2.0.2/SRC/slamov.c000644 000766 000024 00000000252 11745567264 016025 0ustar00juliestaff000000 000000 // // slamov.c // // Written by Lee Killough 04/19/2012 // #define TYPE float #define FUNC "SLAMOV" #define LAMOV slamov_ #define LACPY slacpy_ #include "lamov.h" scalapack-2.0.2/SRC/slamsh.f000644 000766 000024 00000021207 10363532303 015776 0ustar00juliestaff000000 000000 SUBROUTINE SLAMSH ( S, LDS, NBULGE, JBLK, H, LDH, N, ULP ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER LDS, NBULGE, JBLK, LDH, N REAL ULP * .. * .. Array Arguments .. REAL S(LDS,*), H(LDH,*) * .. * * Purpose * ======= * * SLAMSH sends multiple shifts through a small (single node) matrix to * see how consecutive small subdiagonal elements are modified by * subsequent shifts in an effort to maximize the number of bulges * that can be sent through. * SLAMSH should only be called when there are multiple shifts/bulges * (NBULGE > 1) and the first shift is starting in the middle of an * unreduced Hessenberg matrix because of two or more consecutive small * subdiagonal elements. * * Arguments * ========= * * S (local input/output) REAL array, (LDS,*) * On entry, the matrix of shifts. Only the 2x2 diagonal of S is * referenced. It is assumed that S has JBLK double shifts * (size 2). * On exit, the data is rearranged in the best order for * applying. * * LDS (local input) INTEGER * On entry, the leading dimension of S. Unchanged on exit. * 1 < NBULGE <= JBLK <= LDS/2 * * NBULGE (local input/output) INTEGER * On entry, the number of bulges to send through H ( >1 ). * NBULGE should be less than the maximum determined (JBLK). * 1 < NBULGE <= JBLK <= LDS/2 * On exit, the maximum number of bulges that can be sent * through. * * JBLK (local input) INTEGER * On entry, the number of shifts determined for S. * Unchanged on exit. * * H (local input/output) REAL array (LDH,N) * On entry, the local matrix to apply the shifts on. * H should be aligned so that the starting row is 2. * On exit, the data is destroyed. * * LDS (local input) INTEGER * On entry, the leading dimension of S. Unchanged on exit. * * N (local input) INTEGER * On entry, the size of H. If all the bulges are expected to * go through, N should be at least 4*NBULGE+2. * Otherwise, NBULGE may be reduced by this routine. * * ULP (local input) REAL * On entry, machine precision * Unchanged on exit. * * Implemented by: G. Henry, May 1, 1997 * * ===================================================================== * * .. Parameters .. REAL ZERO, TEN PARAMETER ( ZERO = 0.0E+0, TEN = 10.0E+0 ) * .. * .. Local Scalars .. INTEGER K, IBULGE, M, NR, J, IVAL, I REAL H44, H33, H43H34, H11, H22, H21, H12, H44S, $ H33S, V1, V2, V3, H00, H10, TST1, T1, T2, T3, $ SUM, S1, DVAL * .. * .. Local Arrays .. REAL V(3) * .. * .. External Subroutines .. EXTERNAL SLARFG, SCOPY * .. * .. Intrinsic Functions .. INTRINSIC MAX, ABS * .. * .. Executable Statements .. * M = 2 DO 10 IBULGE = 1, NBULGE H44 = S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2) H33 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) H43H34 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2)* $ S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = ABS( V1 )*( ABS( H00 )+ABS( H11 )+ABS( H22 ) ) IF( ABS( H10 )*( ABS( V2 )+ABS( V3 ) ).GT.ULP*TST1 ) THEN * Find minimum DVAL = (ABS(H10)*(ABS(V2)+ABS(V3))) / (ULP*TST1) IVAL = IBULGE DO 15 I = IBULGE+1, NBULGE H44 = S(2*JBLK-2*I+2, 2*JBLK-2*I+2) H33 = S(2*JBLK-2*I+1,2*JBLK-2*I+1) H43H34 = S(2*JBLK-2*I+1,2*JBLK-2*I+2)* $ S(2*JBLK-2*I+2, 2*JBLK-2*I+1) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = ABS( V1 )*( ABS( H00 )+ABS( H11 )+ABS( H22 ) ) IF ( (DVAL.GT.(ABS(H10)*(ABS(V2)+ABS(V3)))/(ULP*TST1)) $ .AND. ( DVAL .GT. 1.D0 ) ) THEN DVAL = (ABS(H10)*(ABS(V2)+ABS(V3))) / (ULP*TST1) IVAL = I END IF 15 CONTINUE IF ( (DVAL .LT. TEN) .AND. (IVAL .NE. IBULGE) ) THEN H44 = S(2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+2) H33 = S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+1) H43H34 = S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+2) H10 = S(2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1) S(2*JBLK-2*IVAL+2,2*JBLK-2*IVAL+2) = $ S(2*JBLK-2*IBULGE+2,2*JBLK-2*IBULGE+2) S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+1) = $ S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+2) = $ S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2) S(2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1) = $ S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2) = H44 S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) = H33 S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2) = H43H34 S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) = H10 END IF H44 = S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2) H33 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) H43H34 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2)* $ S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = ABS( V1 ) + ABS( V2 ) + ABS( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = ABS( V1 )*( ABS( H00 )+ABS( H11 )+ABS( H22 ) ) END IF IF( ABS( H10 )*( ABS( V2 )+ABS( V3 ) ).GT.TEN*ULP*TST1 ) THEN * IBULGE better not be 1 here or we have a bug! NBULGE = MAX(IBULGE -1,1) RETURN END IF DO 120 K = M, N - 1 NR = MIN( 3, N-K+1 ) IF( K.GT.M ) $ CALL SCOPY( NR, H( K, K-1 ), 1, V, 1 ) CALL SLARFG( NR, V( 1 ), V( 2 ), 1, T1 ) IF( K.GT.M ) THEN H( K, K-1 ) = V( 1 ) H( K+1, K-1 ) = ZERO IF( K.LT.N-1 ) $ H( K+2, K-1 ) = ZERO ELSE H( K, K-1 ) = -H( K, K-1 ) END IF V2 = V( 2 ) T2 = T1*V2 IF( NR.EQ.3 ) THEN V3 = V( 3 ) T3 = T1*V3 DO 60 J = K, N SUM = H( K, J ) + V2*H( K+1, J ) + V3*H( K+2, J ) H( K, J ) = H( K, J ) - SUM*T1 H( K+1, J ) = H( K+1, J ) - SUM*T2 H( K+2, J ) = H( K+2, J ) - SUM*T3 60 CONTINUE DO 70 J = 1, MIN( K+3, N ) SUM = H( J, K ) + V2*H( J, K+1 ) + V3*H( J, K+2 ) H( J, K ) = H( J, K ) - SUM*T1 H( J, K+1 ) = H( J, K+1 ) - SUM*T2 H( J, K+2 ) = H( J, K+2 ) - SUM*T3 70 CONTINUE END IF 120 CONTINUE 10 CONTINUE * RETURN END scalapack-2.0.2/SRC/slapst.f000644 000766 000024 00000015004 10363532303 016013 0ustar00juliestaff000000 000000 SUBROUTINE SLAPST( ID, N, D, INDX, INFO ) * * -- ScaLAPACK auxiliary routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. CHARACTER ID INTEGER INFO, N * .. * .. Array Arguments .. INTEGER INDX( * ) REAL D( * ) * .. * * Purpose * ======= * SLAPST is a modified version of the LAPACK routine SLASRT. * * Define a permutation INDX that sorts the numbers in D * in increasing order (if ID = 'I') or * in decreasing order (if ID = 'D' ). * * Use Quick Sort, reverting to Insertion sort on arrays of * size <= 20. Dimension of STACK limits N to about 2**32. * * Arguments * ========= * * ID (input) CHARACTER*1 * = 'I': sort D in increasing order; * = 'D': sort D in decreasing order. * * N (input) INTEGER * The length of the array D. * * D (input) REAL array, dimension (N) * The array to be sorted. * * INDX (ouput) INTEGER array, dimension (N). * The permutation which sorts the array D. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. INTEGER SELECT PARAMETER ( SELECT = 20 ) * .. * .. Local Scalars .. INTEGER DIR, ENDD, I, ITMP, J, START, STKPNT REAL D1, D2, D3, DMNMX * .. * .. Local Arrays .. INTEGER STACK( 2, 32 ) * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input paramters. * INFO = 0 DIR = -1 IF( LSAME( ID, 'D' ) ) THEN DIR = 0 ELSE IF( LSAME( ID, 'I' ) ) THEN DIR = 1 END IF IF( DIR.EQ.-1 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAPST', -INFO ) RETURN END IF * * Quick return if possible * IF( N.LE.1 ) $ RETURN * DO 10 I = 1, N INDX( I ) = I 10 CONTINUE * STKPNT = 1 STACK( 1, 1 ) = 1 STACK( 2, 1 ) = N 20 CONTINUE START = STACK( 1, STKPNT ) ENDD = STACK( 2, STKPNT ) STKPNT = STKPNT - 1 IF( ENDD-START.LE.SELECT .AND. ENDD-START.GT.0 ) THEN * * Do Insertion sort on D( START:ENDD ) * IF( DIR.EQ.0 ) THEN * * Sort into decreasing order * DO 40 I = START + 1, ENDD DO 30 J = I, START + 1, -1 IF( D( INDX( J ) ).GT.D( INDX( J-1 ) ) ) THEN ITMP = INDX( J ) INDX( J ) = INDX( J-1 ) INDX( J-1 ) = ITMP ELSE GO TO 40 END IF 30 CONTINUE 40 CONTINUE * ELSE * * Sort into increasing order * DO 60 I = START + 1, ENDD DO 50 J = I, START + 1, -1 IF( D( INDX( J ) ).LT.D( INDX( J-1 ) ) ) THEN ITMP = INDX( J ) INDX( J ) = INDX( J-1 ) INDX( J-1 ) = ITMP ELSE GO TO 60 END IF 50 CONTINUE 60 CONTINUE * END IF * ELSE IF( ENDD-START.GT.SELECT ) THEN * * Partition D( START:ENDD ) and stack parts, largest one first * * Choose partition entry as median of 3 * D1 = D( INDX( START ) ) D2 = D( INDX( ENDD ) ) I = ( START+ENDD ) / 2 D3 = D( INDX( I ) ) IF( D1.LT.D2 ) THEN IF( D3.LT.D1 ) THEN DMNMX = D1 ELSE IF( D3.LT.D2 ) THEN DMNMX = D3 ELSE DMNMX = D2 END IF ELSE IF( D3.LT.D2 ) THEN DMNMX = D2 ELSE IF( D3.LT.D1 ) THEN DMNMX = D3 ELSE DMNMX = D1 END IF END IF * IF( DIR.EQ.0 ) THEN * * Sort into decreasing order * I = START - 1 J = ENDD + 1 70 CONTINUE 80 CONTINUE J = J - 1 IF( D( INDX( J ) ).LT.DMNMX ) $ GO TO 80 90 CONTINUE I = I + 1 IF( D( INDX( I ) ).GT.DMNMX ) $ GO TO 90 IF( I.LT.J ) THEN ITMP = INDX( I ) INDX( I ) = INDX( J ) INDX( J ) = ITMP GO TO 70 END IF IF( J-START.GT.ENDD-J-1 ) THEN STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD ELSE STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J END IF ELSE * * Sort into increasing order * I = START - 1 J = ENDD + 1 100 CONTINUE 110 CONTINUE J = J - 1 IF( D( INDX( J ) ).GT.DMNMX ) $ GO TO 110 120 CONTINUE I = I + 1 IF( D( INDX( I ) ).LT.DMNMX ) $ GO TO 120 IF( I.LT.J ) THEN ITMP = INDX( I ) INDX( I ) = INDX( J ) INDX( J ) = ITMP GO TO 100 END IF IF( J-START.GT.ENDD-J-1 ) THEN STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD ELSE STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J END IF END IF END IF IF( STKPNT.GT.0 ) $ GO TO 20 RETURN * * End of SLAPST * END scalapack-2.0.2/SRC/slaqr6.f000644 000766 000024 00000102110 11750130340 015704 0ustar00juliestaff000000 000000 SUBROUTINE SLAQR6( JOB, WANTT, WANTZ, KACC22, N, KTOP, KBOT, $ NSHFTS, SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, $ V, LDV, U, LDU, NV, WV, LDWV, NH, WH, LDWH ) * * Contribution from the Department of Computing Science and HPC2N, * Umea University, Sweden * * -- ScaLAPACK auxiliary routine (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER JOB INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV, $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. REAL H( LDH, * ), SI( * ), SR( * ), U( LDU, * ), $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ), $ Z( LDZ, * ) * .. * * This auxiliary subroutine called by PSLAQR5 performs a * single small-bulge multi-shift QR sweep, moving the chain * of bulges from top to bottom in the submatrix * H(KTOP:KBOT,KTOP:KBOT), collecting the transformations in the * matrix HV *or* accumulating the transformations in the matrix * Z (see below). * * This is a modified version of DLAQR5 from LAPACK 3.1. * * ====================================================================== * * JOB (input) character scalar * Set the kind of job to do in SLAQR6, as follows: * JOB = 'I': Introduce and chase bulges in submatrix * JOB = 'C': Chase bulges from top to bottom of submatrix * JOB = 'O': Chase bulges off submatrix * * WANTT (input) logical scalar * WANTT = .true. if the quasi-triangular Schur factor * is being computed. WANTT is set to .false. otherwise. * * WANTZ (input) logical scalar * WANTZ = .true. if the orthogonal Schur factor is being * computed. WANTZ is set to .false. otherwise. * * KACC22 (input) integer with value 0, 1, or 2. * Specifies the computation mode of far-from-diagonal * orthogonal updates. * = 0: SLAQR6 does not accumulate reflections and does not * use matrix-matrix multiply to update far-from-diagonal * matrix entries. * = 1: SLAQR6 accumulates reflections and uses matrix-matrix * multiply to update the far-from-diagonal matrix entries. * = 2: SLAQR6 accumulates reflections, uses matrix-matrix * multiply to update the far-from-diagonal matrix entries, * and takes advantage of 2-by-2 block structure during * matrix multiplies. * * N (input) integer scalar * N is the order of the Hessenberg matrix H upon which this * subroutine operates. * * KTOP (input) integer scalar * KBOT (input) integer scalar * These are the first and last rows and columns of an * isolated diagonal block upon which the QR sweep is to be * applied. It is assumed without a check that * either KTOP = 1 or H(KTOP,KTOP-1) = 0 * and * either KBOT = N or H(KBOT+1,KBOT) = 0. * * NSHFTS (input) integer scalar * NSHFTS gives the number of simultaneous shifts. NSHFTS * must be positive and even. * * SR (input) REAL array of size (NSHFTS) * SI (input) REAL array of size (NSHFTS) * SR contains the real parts and SI contains the imaginary * parts of the NSHFTS shifts of origin that define the * multi-shift QR sweep. * * H (input/output) REAL array of size (LDH,N) * On input H contains a Hessenberg matrix. On output a * multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied * to the isolated diagonal block in rows and columns KTOP * through KBOT. * * LDH (input) integer scalar * LDH is the leading dimension of H just as declared in the * calling procedure. LDH.GE.MAX(1,N). * * ILOZ (input) INTEGER * IHIZ (input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N * * Z (input/output) REAL array of size (LDZ,IHI) * If WANTZ = .TRUE., then the QR Sweep orthogonal * similarity transformation is accumulated into * Z(ILOZ:IHIZ,ILO:IHI) from the right. * If WANTZ = .FALSE., then Z is unreferenced. * * LDZ (input) integer scalar * LDA is the leading dimension of Z just as declared in * the calling procedure. LDZ.GE.N. * * V (workspace) REAL array of size (LDV,NSHFTS/2) * * LDV (input) integer scalar * LDV is the leading dimension of V as declared in the * calling procedure. LDV.GE.3. * * U (workspace) REAL array of size * (LDU,3*NSHFTS-3) * * LDU (input) integer scalar * LDU is the leading dimension of U just as declared in the * in the calling subroutine. LDU.GE.3*NSHFTS-3. * * NH (input) integer scalar * NH is the number of columns in array WH available for * workspace. NH.GE.1 is required for usage of this * workspace, otherwise the updates of the far-from-diagonal * elements will be updated without level 3 BLAS. * * WH (workspace) REAL array of size (LDWH,NH) * * LDWH (input) integer scalar * Leading dimension of WH just as declared in the * calling procedure. LDWH.GE.3*NSHFTS-3. * * NV (input) integer scalar * NV is the number of rows in WV agailable for workspace. * NV.GE.1 is required for usage of this * workspace, otherwise the updates of the far-from-diagonal * elements will be updated without level 3 BLAS. * * WV (workspace) REAL array of size * (LDWV,3*NSHFTS-3) * * LDWV (input) integer scalar * LDWV is the leading dimension of WV as declared in the * in the calling subroutine. LDWV.GE.NV. * * * ================================================================ * Based on contributions by * Karen Braman and Ralph Byers, Department of Mathematics, * University of Kansas, USA * * Robert Granat, Department of Computing Science and HPC2N, * Umea University, Sweden * * ============================================================ * Reference: * * K. Braman, R. Byers and R. Mathias, The Multi-Shift QR * Algorithm Part I: Maintaining Well Focused Shifts, and * Level 3 Performance, SIAM Journal of Matrix Analysis, * volume 23, pages 929--947, 2002. * * ============================================================ * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0e0, ONE = 1.0e0 ) * .. * .. Local Scalars .. REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, $ NS, NU, SINCOL, EINCOL, UINCOL, IPHV, CHUNK, $ THREADS, JLEN2, JCOL2, GCHUNK, JROW2, MAXCHUNK LOGICAL ACCUM, BLK22, BMP22, INTRO, CHASE, OFF, ALL * .. * .. External Functions .. LOGICAL LSAME INTEGER PILAENVX REAL SLAMCH EXTERNAL LSAME, SLAMCH, PILAENVX * .. * .. Intrinsic Functions .. * INTRINSIC ABS, FLOAT, MAX, MIN, MOD * .. * .. Local Arrays .. REAL VT( 3 ) * .. * .. External Subroutines .. EXTERNAL SGEMM, SLABAD, SLAMOV, SLAQR1, SLARFG, SLASET, $ STRMM * .. * .. Executable Statements .. * * ==== If there are no shifts, then there is nothing to do. ==== * IF( NSHFTS.LT.2 ) $ RETURN * * ==== If the active block is empty or 1-by-1, then there * . is nothing to do. ==== * IF( KTOP.GE.KBOT ) $ RETURN THREADS = 1 * * ==== Shuffle shifts into pairs of real shifts and pairs * . of complex conjugate shifts assuming complex * . conjugate shifts are already adjacent to one * . another. ==== * DO 10 I = 1, NSHFTS - 2, 2 IF( SI( I ).NE.-SI( I+1 ) ) THEN * SWAP = SR( I ) SR( I ) = SR( I+1 ) SR( I+1 ) = SR( I+2 ) SR( I+2 ) = SWAP * SWAP = SI( I ) SI( I ) = SI( I+1 ) SI( I+1 ) = SI( I+2 ) SI( I+2 ) = SWAP END IF 10 CONTINUE * * ==== NSHFTS is supposed to be even, but if it is odd, * . then simply reduce it by one. The shuffle above * . ensures that the dropped shift is real and that * . the remaining shifts are paired. ==== * NS = NSHFTS - MOD( NSHFTS, 2 ) * * ==== Machine constants for deflation ==== * SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( FLOAT( N ) / ULP ) * * ==== Use accumulated reflections to update far-from-diagonal * . entries ? This is only performed if both NH and NV is * greater than 1. ==== * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) ACCUM = ACCUM .AND. NH.GE.1 .AND. NV.GE.1 * * ==== If so, exploit the 2-by-2 block structure? ==== * BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) * * ==== Decode JOB ==== * ALL = LSAME( JOB, 'A' ) IF( .NOT. ALL ) $ INTRO = LSAME( JOB, 'I' ) IF( .NOT. ALL .AND. .NOT. INTRO ) $ CHASE = LSAME( JOB, 'C' ) IF( .NOT. ALL .AND. .NOT. INTRO .AND. .NOT. CHASE ) THEN OFF = LSAME( JOB, 'O' ) IF( .NOT. OFF ) $ RETURN END IF * * ==== clear trash ==== * IF( INTRO.OR.ALL .AND. KTOP+2.LE.KBOT ) $ H( KTOP+2, KTOP ) = ZERO * * ==== NBMPS = number of 2-shift bulges in the chain ==== * NBMPS = NS / 2 * * ==== KDU = width of slab ==== * KDU = 6*NBMPS - 3 * * Set loop limits for bulge-chasing depending on working mode * IF( ALL ) THEN SINCOL = 3*( 1-NBMPS ) + KTOP - 1 EINCOL = KBOT - 2 UINCOL = 3*NBMPS - 2 ELSEIF( INTRO ) THEN SINCOL = 3*( 1-NBMPS ) + KTOP - 1 EINCOL = KBOT - 3*NBMPS - 1 UINCOL = 3*NBMPS - 2 ELSEIF( CHASE ) THEN SINCOL = KTOP EINCOL = KBOT - 3*NBMPS - 1 UINCOL = 3*NBMPS - 2 ELSEIF( OFF ) THEN SINCOL = KTOP EINCOL = KBOT - 2 UINCOL = 3*NBMPS - 2 END IF IPHV = 0 * * ==== Create and/or chase chains of NBMPS bulges ==== * DO 220 INCOL = SINCOL, EINCOL, UINCOL NDCOL = MIN( INCOL + KDU, EINCOL ) IF( ACCUM ) $ CALL SLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge * . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The * . following loop chases a 3*NBMPS column long chain of * . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * DO 150 KRCOL = INCOL, MIN( EINCOL, INCOL+3*NBMPS-3, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small * . 2-by-2 bulge, if there is room. The inactive bulges * . (if any) must wait until the active bulges have moved * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) M22 = MBOT + 1 BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * DO 20 M = MTOP, MBOT K = KRCOL + 3*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL SLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), $ V( 1, M ) ) ALPHA = V( 1, M ) CALL SLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL SLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) * * ==== A Bulge may collapse because of vigilant * . deflation or destructive underflow. In the * . underflow case, try the two-small-subdiagonals * . trick to try to reinflate the bulge. ==== * IF( H( K+3, K ).NE.ZERO .OR. H( K+3, K+1 ).NE. $ ZERO .OR. H( K+3, K+2 ).EQ.ZERO ) THEN * * ==== Typical case: not collapsed (yet). ==== * H( K+1, K ) = BETA H( K+2, K ) = ZERO H( K+3, K ) = ZERO ELSE * * ==== Atypical case: collapsed. Attempt to * . reintroduce ignoring H(K+1,K) and H(K+2,K). * . If the fill resulting from the new * . reflector is too large, then abandon it. * . Otherwise, use the new one. ==== * CALL SLAQR1( 3, H( K+1, K+1 ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), $ VT ) ALPHA = VT( 1 ) CALL SLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) ) REFSUM = VT( 1 )*( H( K+1, K )+VT( 2 )* $ H( K+2, K ) ) * IF( ABS( H( K+2, K )-REFSUM*VT( 2 ) )+ $ ABS( REFSUM*VT( 3 ) ).GT.ULP* $ ( ABS( H( K, K ) )+ABS( H( K+1, $ K+1 ) )+ABS( H( K+2, K+2 ) ) ) ) THEN * * ==== Starting a new bulge here would * . create non-negligible fill. Use * . the old one with trepidation. ==== * H( K+1, K ) = BETA H( K+2, K ) = ZERO H( K+3, K ) = ZERO ELSE * * ==== Stating a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== * H( K+1, K ) = H( K+1, K ) - REFSUM H( K+2, K ) = ZERO H( K+3, K ) = ZERO V( 1, M ) = VT( 1 ) V( 2, M ) = VT( 2 ) V( 3, M ) = VT( 3 ) END IF END IF END IF 20 CONTINUE * * ==== Generate a 2-by-2 reflection, if needed. ==== * K = KRCOL + 3*( M22-1 ) IF( BMP22 ) THEN IF( K.EQ.KTOP-1 ) THEN CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), $ V( 1, M22 ) ) BETA = V( 1, M22 ) CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) ELSE BETA = H( K+1, K ) V( 2, M22 ) = H( K+2, K ) CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) H( K+1, K ) = BETA H( K+2, K ) = ZERO END IF ELSE * * ==== Initialize V(1,M22) here to avoid possible undefined * . variable problems later. ==== * V( 1, M22 ) = ZERO END IF * * ==== Multiply H by reflections from the left ==== * IF( ACCUM ) THEN JBOT = MIN( MAX(INCOL+KDU,NDCOL), KBOT ) ELSE IF( WANTT ) THEN JBOT = N ELSE JBOT = KBOT END IF DO 40 J = MAX( KTOP, KRCOL ), JBOT MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) DO 30 M = MTOP, MEND K = KRCOL + 3*( M-1 ) REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) H( K+1, J ) = H( K+1, J ) - REFSUM H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) 30 CONTINUE 40 CONTINUE IF( BMP22 ) THEN K = KRCOL + 3*( M22-1 ) DO 50 J = MAX( K+1, KTOP ), JBOT REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* $ H( K+2, J ) ) H( K+1, J ) = H( K+1, J ) - REFSUM H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) 50 CONTINUE END IF * * ==== Multiply H by reflections from the right. * . Delay filling in the last row until the * . vigilant deflation check is complete. ==== * IF( ACCUM ) THEN JTOP = MAX( KTOP, INCOL ) ELSE IF( WANTT ) THEN JTOP = 1 ELSE JTOP = KTOP END IF DO 90 M = MTOP, MBOT IF( V( 1, M ).NE.ZERO ) THEN K = KRCOL + 3*( M-1 ) DO 60 J = JTOP, MIN( KBOT, K+3 ) REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) H( J, K+1 ) = H( J, K+1 ) - REFSUM H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) 60 CONTINUE * IF( ACCUM ) THEN * * ==== Accumulate U. (If necessary, update Z later * . with with an efficient matrix-matrix * . multiply.) ==== * KMS = K - INCOL DO 70 J = MAX( 1, KTOP-INCOL ), KDU REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) 70 CONTINUE ELSE IF( WANTZ ) THEN * * ==== U is not accumulated, so update Z * . now by multiplying by reflections * . from the right. ==== * DO 80 J = ILOZ, IHIZ REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) Z( J, K+1 ) = Z( J, K+1 ) - REFSUM Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) 80 CONTINUE END IF END IF 90 CONTINUE * * ==== Special case: 2-by-2 reflection (if needed) ==== * K = KRCOL + 3*( M22-1 ) IF( BMP22 ) THEN IF( V( 1, M22 ).NE.ZERO ) THEN DO 100 J = JTOP, MIN( KBOT, K+3 ) REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* $ H( J, K+2 ) ) H( J, K+1 ) = H( J, K+1 ) - REFSUM H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) 100 CONTINUE * IF( ACCUM ) THEN KMS = K - INCOL DO 110 J = MAX( 1, KTOP-INCOL ), KDU REFSUM = V( 1, M22 )*( U( J, KMS+1 ) + $ V( 2, M22 )*U( J, KMS+2 ) ) U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM U( J, KMS+2 ) = U( J, KMS+2 ) - $ REFSUM*V( 2, M22 ) 110 CONTINUE ELSE IF( WANTZ ) THEN DO 120 J = ILOZ, IHIZ REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* $ Z( J, K+2 ) ) Z( J, K+1 ) = Z( J, K+1 ) - REFSUM Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) 120 CONTINUE END IF END IF END IF * * ==== Vigilant deflation check ==== * MSTART = MTOP IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) $ MSTART = MSTART + 1 MEND = MBOT IF( BMP22 ) $ MEND = MEND + 1 IF( KRCOL.EQ.KBOT-2 ) $ MEND = MEND + 1 DO 130 M = MSTART, MEND K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals * . criterion and the Ahues & Tisseur (LAWN 122, 1997) * . criteria both be satisfied. The latter improves * . accuracy in some examples. Falling back on an * . alternate convergence criterion when TST1 or TST2 * . is zero (as done here) is traditional but probably * . unnecessary. ==== * IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN IF( K.GE.KTOP+1 ) $ TST1 = TST1 + ABS( H( K, K-1 ) ) IF( K.GE.KTOP+2 ) $ TST1 = TST1 + ABS( H( K, K-2 ) ) IF( K.GE.KTOP+3 ) $ TST1 = TST1 + ABS( H( K, K-3 ) ) IF( K.LE.KBOT-2 ) $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) IF( K.LE.KBOT-3 ) $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) IF( K.LE.KBOT-4 ) $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) END IF IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) $ THEN H12 = MAX( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) H21 = MIN( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) H11 = MAX( ABS( H( K+1, K+1 ) ), $ ABS( H( K, K )-H( K+1, K+1 ) ) ) H22 = MIN( ABS( H( K+1, K+1 ) ), $ ABS( H( K, K )-H( K+1, K+1 ) ) ) SCL = H11 + H12 TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF 130 CONTINUE * * ==== Fill in the last row of each bulge. ==== * MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) DO 140 M = MTOP, MEND K = KRCOL + 3*( M-1 ) REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) H( K+4, K+1 ) = -REFSUM H( K+4, K+2 ) = -REFSUM*V( 2, M ) H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) 140 CONTINUE * * ==== End of near-the-diagonal bulge chase. ==== * 150 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as * . well. ==== * IF( ACCUM ) THEN IF( WANTT ) THEN JTOP = 1 JBOT = N ELSE JTOP = KTOP JBOT = KBOT END IF K1 = MAX( 1, KTOP-INCOL ) NU = ( KDU-MAX( 0, MAX(INCOL+KDU,NDCOL)-KBOT ) ) - K1 + 1 IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) .OR. $ NU.LT.KDU ) THEN * * ==== Updates not exploiting the 2-by-2 block * . structure of U. K1 and NU keep track of * . the location and size of U in the special * . cases of introducing bulges and chasing * . bulges off the bottom. In these special * . cases and in case the number of shifts * . is NS = 2, there is no 2-by-2 block * . structure to exploit. ==== * * ==== Horizontal Multiply ==== * DO 160 JCOL = MIN(MAX(INCOL+KDU,NDCOL),KBOT)+ 1, JBOT, NH JLEN = MIN( NH, JBOT-JCOL+1 ) CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, $ LDWH ) CALL SLAMOV( 'ALL', NU, JLEN, WH, LDWH, $ H( INCOL+K1, JCOL ), LDH ) 160 CONTINUE * * ==== Vertical multiply ==== * DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL SLAMOV( 'ALL', JLEN, NU, WV, LDWV, $ H( JROW, INCOL+K1 ), LDH ) 170 CONTINUE * * ==== Z multiply (also vertical) ==== * IF( WANTZ ) THEN DO 180 JROW = ILOZ, IHIZ, NV JLEN = MIN( NV, IHIZ-JROW+1 ) CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL SLAMOV( 'ALL', JLEN, NU, WV, LDWV, $ Z( JROW, INCOL+K1 ), LDZ ) 180 CONTINUE END IF ELSE * * ==== Updates exploiting U's 2-by-2 block structure. * . (I2, I4, J2, J4 are the last rows and columns * . of the blocks.) ==== * I2 = ( KDU+1 ) / 2 I4 = KDU J2 = I4 - I2 J4 = KDU * * ==== KZS and KNZ deal with the band of zeros * . along the diagonal of one of the triangular * . blocks. ==== * KZS = ( J4-J2 ) - ( NS+1 ) KNZ = NS + 1 * * ==== Horizontal multiply ==== * DO 190 JCOL = MIN(MAX(INCOL+KDU,NDCOL),KBOT)+ 1, JBOT, NH JLEN = MIN( NH, JBOT-JCOL+1 ) * * ==== Copy bottom of H to top+KZS of scratch ==== * (The first KZS rows get multiplied by zero.) ==== * CALL SLAMOV( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), $ LDH, WH( KZS+1, 1 ), LDWH ) CALL SLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) * * ==== Multiply by U21' ==== * CALL STRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), $ LDWH ) * * ==== Multiply top of H by U11' ==== * CALL SGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) * * ==== Copy top of H to bottom of WH ==== * CALL SLAMOV( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, $ WH( I2+1, 1 ), LDWH ) * * ==== Multiply by U21' ==== * CALL STRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) * * ==== Multiply by U22 ==== * CALL SGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, $ U( J2+1, I2+1 ), LDU, $ H( INCOL+1+J2, JCOL ), LDH, ONE, $ WH( I2+1, 1 ), LDWH ) * * ==== Copy it back ==== * CALL SLAMOV( 'ALL', KDU, JLEN, WH, LDWH, $ H( INCOL+1, JCOL ), LDH ) 190 CONTINUE * * ==== Vertical multiply ==== * DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) * * ==== Copy right of H to scratch (the first KZS * . columns get multiplied by zero) ==== * CALL SLAMOV( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), $ LDH, WV( 1, 1+KZS ), LDWV ) CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) * * ==== Multiply by U21 ==== * CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), $ LDWV ) * * ==== Multiply by U11 ==== * CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, $ LDWV ) * * ==== Copy left of H to right of scratch ==== * CALL SLAMOV( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, $ WV( 1, 1+I2 ), LDWV ) * * ==== Multiply by U21 ==== * CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) * * ==== Multiply by U22 ==== * CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, $ H( JROW, INCOL+1+J2 ), LDH, $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), $ LDWV ) * * ==== Copy it back ==== * CALL SLAMOV( 'ALL', JLEN, KDU, WV, LDWV, $ H( JROW, INCOL+1 ), LDH ) 200 CONTINUE * * ==== Multiply Z (also vertical) ==== * IF( WANTZ ) THEN DO 210 JROW = ILOZ, IHIZ, NV JLEN = MIN( NV, IHIZ-JROW+1 ) * * ==== Copy right of Z to left of scratch (first * . KZS columns get multiplied by zero) ==== * CALL SLAMOV( 'ALL', JLEN, KNZ, $ Z( JROW, INCOL+1+J2 ), LDZ, $ WV( 1, 1+KZS ), LDWV ) * * ==== Multiply by U12 ==== * CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, $ LDWV ) CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), $ LDWV ) * * ==== Multiply by U11 ==== * CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, $ WV, LDWV ) * * ==== Copy left of Z to right of scratch ==== * CALL SLAMOV( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), $ LDZ, WV( 1, 1+I2 ), LDWV ) * * ==== Multiply by U21 ==== * CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), $ LDWV ) * * ==== Multiply by U22 ==== * CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, $ Z( JROW, INCOL+1+J2 ), LDZ, $ U( J2+1, I2+1 ), LDU, ONE, $ WV( 1, 1+I2 ), LDWV ) * * ==== Copy the result back to Z ==== * CALL SLAMOV( 'ALL', JLEN, KDU, WV, LDWV, $ Z( JROW, INCOL+1 ), LDZ ) 210 CONTINUE END IF END IF END IF 220 CONTINUE * * ==== Clear out workspaces and return. ==== * IF( N.GE.5 ) $ CALL SLASET( 'Lower', N-4, N-4, ZERO, ZERO, H(5,1), LDH ) * * ==== End of SLAQR6 ==== * END scalapack-2.0.2/SRC/slar1va.f000644 000766 000024 00000033002 11657111056 016061 0ustar00juliestaff000000 000000 SUBROUTINE SLAR1VA(N, B1, BN, LAMBDA, D, L, LD, LLD, $ PIVMIN, GAPTOL, Z, WANTNC, NEGCNT, ZTZ, MINGMA, $ R, ISUPPZ, NRMINV, RESID, RQCORR, WORK ) * IMPLICIT NONE * * -- ScaLAPACK computational routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * * .. Scalar Arguments .. LOGICAL WANTNC INTEGER B1, BN, N, NEGCNT, R REAL GAPTOL, LAMBDA, MINGMA, NRMINV, PIVMIN, RESID, $ RQCORR, ZTZ * .. * .. Array Arguments .. INTEGER ISUPPZ( * ) REAL D( * ), L( * ), LD( * ), LLD( * ), $ WORK( * ) REAL Z( * ) * * Purpose * ======= * * SLAR1VA computes the (scaled) r-th column of the inverse of * the sumbmatrix in rows B1 through BN of the tridiagonal matrix * L D L^T - sigma I. When sigma is close to an eigenvalue, the * computed vector is an accurate eigenvector. Usually, r corresponds * to the index where the eigenvector is largest in magnitude. * The following steps accomplish this computation : * (a) Stationary qd transform, L D L^T - sigma I = L(+) D(+) L(+)^T, * (b) Progressive qd transform, L D L^T - sigma I = U(-) D(-) U(-)^T, * (c) Computation of the diagonal elements of the inverse of * L D L^T - sigma I by combining the above transforms, and choosing * r as the index where the diagonal of the inverse is (one of the) * largest in magnitude. * (d) Computation of the (scaled) r-th column of the inverse using the * twisted factorization obtained by combining the top part of the * the stationary and the bottom part of the progressive transform. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix L D L^T. * * B1 (input) INTEGER * First index of the submatrix of L D L^T. * * BN (input) INTEGER * Last index of the submatrix of L D L^T. * * LAMBDA (input) REAL * The shift. In order to compute an accurate eigenvector, * LAMBDA should be a good approximation to an eigenvalue * of L D L^T. * * L (input) REAL array, dimension (N-1) * The (n-1) subdiagonal elements of the unit bidiagonal matrix * L, in elements 1 to N-1. * * D (input) REAL array, dimension (N) * The n diagonal elements of the diagonal matrix D. * * LD (input) REAL array, dimension (N-1) * The n-1 elements L(i)*D(i). * * LLD (input) REAL array, dimension (N-1) * The n-1 elements L(i)*L(i)*D(i). * * PIVMIN (input) REAL * The minimum pivot in the Sturm sequence. * * GAPTOL (input) REAL * Tolerance that indicates when eigenvector entries are negligible * w.r.t. their contribution to the residual. * * Z (input/output) REAL array, dimension (N) * On input, all entries of Z must be set to 0. * On output, Z contains the (scaled) r-th column of the * inverse. The scaling is such that Z(R) equals 1. * * WANTNC (input) LOGICAL * Specifies whether NEGCNT has to be computed. * * NEGCNT (output) INTEGER * If WANTNC is .TRUE. then NEGCNT = the number of pivots < pivmin * in the matrix factorization L D L^T, and NEGCNT = -1 otherwise. * * ZTZ (output) REAL * The square of the 2-norm of Z. * * MINGMA (output) REAL * The reciprocal of the largest (in magnitude) diagonal * element of the inverse of L D L^T - sigma I. * * R (input/output) INTEGER * The twist index for the twisted factorization used to * compute Z. * On input, 0 <= R <= N. If R is input as 0, R is set to * the index where (L D L^T - sigma I)^{-1} is largest * in magnitude. If 1 <= R <= N, R is unchanged. * On output, R contains the twist index used to compute Z. * Ideally, R designates the position of the maximum entry in the * eigenvector. * * ISUPPZ (output) INTEGER array, dimension (2) * The support of the vector in Z, i.e., the vector Z is * nonzero only in elements ISUPPZ(1) through ISUPPZ( 2 ). * * NRMINV (output) REAL * NRMINV = 1/SQRT( ZTZ ) * * RESID (output) REAL * The residual of the FP vector. * RESID = ABS( MINGMA )/SQRT( ZTZ ) * * RQCORR (output) REAL * The Rayleigh Quotient correction to LAMBDA. * RQCORR = MINGMA*TMP * * WORK (workspace) REAL array, dimension (4*N) * * Further Details * =============== * * Based on contributions by * Beresford Parlett, University of California, Berkeley, USA * Jim Demmel, University of California, Berkeley, USA * Inderjit Dhillon, University of Texas, Austin, USA * Osni Marques, LBNL/NERSC, USA * Christof Voemel, University of California, Berkeley, USA * * ===================================================================== * * .. Parameters .. INTEGER BLKLEN PARAMETER ( BLKLEN = 16 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) * .. * .. Local Scalars .. LOGICAL SAWNAN1, SAWNAN2 INTEGER BI, I, INDLPL, INDP, INDS, INDUMN, NB, NEG1, $ NEG2, NX, R1, R2, TO REAL ABSZCUR, ABSZPREV, DMINUS, DPLUS, EPS, $ S, TMP, ZPREV * .. * .. External Functions .. LOGICAL SISNAN REAL SLAMCH EXTERNAL SISNAN, SLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, REAL * .. * .. Executable Statements .. * EPS = SLAMCH( 'Precision' ) IF( R.EQ.0 ) THEN R1 = B1 R2 = BN ELSE R1 = R R2 = R END IF * Storage for LPLUS INDLPL = 0 * Storage for UMINUS INDUMN = N INDS = 2*N + 1 INDP = 3*N + 1 IF( B1.EQ.1 ) THEN WORK( INDS ) = ZERO ELSE WORK( INDS+B1-1 ) = LLD( B1-1 ) END IF * * Compute the stationary transform (using the differential form) * until the index R2. * SAWNAN1 = .FALSE. NEG1 = 0 S = WORK( INDS+B1-1 ) - LAMBDA DO 50 I = B1, R1 - 1 DPLUS = D( I ) + S WORK( INDLPL+I ) = LD( I ) / DPLUS IF(DPLUS.LT.ZERO) NEG1 = NEG1 + 1 WORK( INDS+I ) = S*WORK( INDLPL+I )*L( I ) S = WORK( INDS+I ) - LAMBDA 50 CONTINUE SAWNAN1 = SISNAN( S ) IF( SAWNAN1 ) GOTO 60 DO 51 I = R1, R2 - 1 DPLUS = D( I ) + S WORK( INDLPL+I ) = LD( I ) / DPLUS WORK( INDS+I ) = S*WORK( INDLPL+I )*L( I ) S = WORK( INDS+I ) - LAMBDA 51 CONTINUE SAWNAN1 = SISNAN( S ) * 60 CONTINUE IF( SAWNAN1 ) THEN * Runs a slower version of the above loop if a NaN is detected NEG1 = 0 S = WORK( INDS+B1-1 ) - LAMBDA DO 70 I = B1, R1 - 1 DPLUS = D( I ) + S IF(ABS(DPLUS).LT.PIVMIN) DPLUS = -PIVMIN WORK( INDLPL+I ) = LD( I ) / DPLUS IF(DPLUS.LT.ZERO) NEG1 = NEG1 + 1 WORK( INDS+I ) = S*WORK( INDLPL+I )*L( I ) IF( WORK( INDLPL+I ).EQ.ZERO ) $ WORK( INDS+I ) = LLD( I ) S = WORK( INDS+I ) - LAMBDA 70 CONTINUE DO 71 I = R1, R2 - 1 DPLUS = D( I ) + S IF(ABS(DPLUS).LT.PIVMIN) DPLUS = -PIVMIN WORK( INDLPL+I ) = LD( I ) / DPLUS WORK( INDS+I ) = S*WORK( INDLPL+I )*L( I ) IF( WORK( INDLPL+I ).EQ.ZERO ) $ WORK( INDS+I ) = LLD( I ) S = WORK( INDS+I ) - LAMBDA 71 CONTINUE END IF * * Compute the progressive transform (using the differential form) * until the index R1 * SAWNAN2 = .FALSE. NEG2 = 0 WORK( INDP+BN-1 ) = D( BN ) - LAMBDA DO 80 I = BN - 1, R1, -1 DMINUS = LLD( I ) + WORK( INDP+I ) TMP = D( I ) / DMINUS IF(DMINUS.LT.ZERO) NEG2 = NEG2 + 1 WORK( INDUMN+I ) = L( I )*TMP WORK( INDP+I-1 ) = WORK( INDP+I )*TMP - LAMBDA 80 CONTINUE TMP = WORK( INDP+R1-1 ) SAWNAN2 = SISNAN( TMP ) IF( SAWNAN2 ) THEN * Runs a slower version of the above loop if a NaN is detected NEG2 = 0 DO 100 I = BN-1, R1, -1 DMINUS = LLD( I ) + WORK( INDP+I ) IF(ABS(DMINUS).LT.PIVMIN) DMINUS = -PIVMIN TMP = D( I ) / DMINUS IF(DMINUS.LT.ZERO) NEG2 = NEG2 + 1 WORK( INDUMN+I ) = L( I )*TMP WORK( INDP+I-1 ) = WORK( INDP+I )*TMP - LAMBDA IF( TMP.EQ.ZERO ) $ WORK( INDP+I-1 ) = D( I ) - LAMBDA 100 CONTINUE END IF * * Find the index (from R1 to R2) of the largest (in magnitude) * diagonal element of the inverse * MINGMA = WORK( INDS+R1-1 ) + WORK( INDP+R1-1 ) IF( MINGMA.LT.ZERO ) NEG1 = NEG1 + 1 IF( WANTNC ) THEN NEGCNT = NEG1 + NEG2 ELSE NEGCNT = -1 ENDIF IF( ABS(MINGMA).EQ.ZERO ) $ MINGMA = EPS*WORK( INDS+R1-1 ) R = R1 DO 110 I = R1, R2 - 1 TMP = WORK( INDS+I ) + WORK( INDP+I ) IF( TMP.EQ.ZERO ) $ TMP = EPS*WORK( INDS+I ) IF( ABS( TMP ).LE.ABS( MINGMA ) ) THEN MINGMA = TMP R = I + 1 END IF 110 CONTINUE * * Compute the FP vector: solve N^T v = e_r * ISUPPZ( 1 ) = B1 ISUPPZ( 2 ) = BN Z( R ) = ONE ZTZ = ONE * * Compute the FP vector upwards from R * NB = INT((R-B1)/BLKLEN) NX = R-NB*BLKLEN IF( .NOT.SAWNAN1 ) THEN DO 210 BI = R-1, NX, -BLKLEN TO = BI-BLKLEN+1 DO 205 I = BI, TO, -1 Z( I ) = -( WORK(INDLPL+I)*Z(I+1) ) ZTZ = ZTZ + Z( I )*Z( I ) 205 CONTINUE IF( ABS(Z(TO)).LT.EPS .AND. $ ABS(Z(TO+1)).LT.EPS ) THEN ISUPPZ(1) = TO GOTO 220 ENDIF 210 CONTINUE DO 215 I = NX-1, B1, -1 Z( I ) = -( WORK(INDLPL+I)*Z(I+1) ) ZTZ = ZTZ + Z( I )*Z( I ) 215 CONTINUE 220 CONTINUE ELSE * Run slower loop if NaN occurred. DO 230 BI = R-1, NX, -BLKLEN TO = BI-BLKLEN+1 DO 225 I = BI, TO, -1 IF( Z( I+1 ).EQ.ZERO ) THEN Z( I ) = -( LD( I+1 ) / LD( I ) )*Z( I+2 ) ELSE Z( I ) = -( WORK( INDLPL+I )*Z( I+1 ) ) END IF ZTZ = ZTZ + Z( I )*Z( I ) 225 CONTINUE IF( ABS(Z(TO)).LT.EPS .AND. $ ABS(Z(TO+1)).LT.EPS ) THEN ISUPPZ(1) = TO GOTO 240 ENDIF 230 CONTINUE DO 235 I = NX-1, B1, -1 IF( Z( I+1 ).EQ.ZERO ) THEN Z( I ) = -( LD( I+1 ) / LD( I ) )*Z( I+2 ) ELSE Z( I ) = -( WORK( INDLPL+I )*Z( I+1 ) ) END IF ZTZ = ZTZ + Z( I )*Z( I ) 235 CONTINUE 240 CONTINUE ENDIF DO 245 I= B1, (ISUPPZ(1)-1) Z(I) = ZERO 245 CONTINUE * Compute the FP vector downwards from R in blocks of size BLKLEN IF( .NOT.SAWNAN2 ) THEN DO 260 BI = R+1, BN, BLKLEN TO = BI+BLKLEN-1 IF ( TO.LE.BN ) THEN DO 250 I = BI, TO Z(I) = -(WORK(INDUMN+I-1)*Z(I-1)) ZTZ = ZTZ + Z( I )*Z( I ) 250 CONTINUE IF( ABS(Z(TO)).LE.EPS .AND. $ ABS(Z(TO-1)).LE.EPS ) THEN ISUPPZ(2) = TO GOTO 265 ENDIF ELSE DO 255 I = BI, BN Z(I) = -(WORK(INDUMN+I-1)*Z(I-1)) ZTZ = ZTZ + Z( I )*Z( I ) 255 CONTINUE ENDIF 260 CONTINUE 265 CONTINUE ELSE * Run slower loop if NaN occurred. DO 280 BI = R+1, BN, BLKLEN TO = BI+BLKLEN-1 IF ( TO.LE.BN ) THEN DO 270 I = BI, TO ZPREV = Z(I-1) ABSZPREV = ABS(ZPREV) IF( ZPREV.NE.ZERO ) THEN Z(I)= -(WORK(INDUMN+I-1)*ZPREV) ELSE Z(I)= -(LD(I-2)/LD(I-1))*Z(I-2) END IF ABSZCUR = ABS(Z(I)) ZTZ = ZTZ + ABSZCUR**2 270 CONTINUE IF( ABSZCUR.LT.EPS .AND. $ ABSZPREV.LT.EPS ) THEN ISUPPZ(2) = I GOTO 285 ENDIF ELSE DO 275 I = BI, BN ZPREV = Z(I-1) ABSZPREV = ABS(ZPREV) IF( ZPREV.NE.ZERO ) THEN Z(I)= -(WORK(INDUMN+I-1)*ZPREV) ELSE Z(I)= -(LD(I-2)/LD(I-1))*Z(I-2) END IF ABSZCUR = ABS(Z(I)) ZTZ = ZTZ + ABSZCUR**2 275 CONTINUE ENDIF 280 CONTINUE 285 CONTINUE END IF DO 290 I= ISUPPZ(2)+1,BN Z(I) = ZERO 290 CONTINUE * * Compute quantities for convergence test * TMP = ONE / ZTZ NRMINV = SQRT( TMP ) RESID = ABS( MINGMA )*NRMINV RQCORR = MINGMA*TMP * RETURN * * End of SLAR1VA * END scalapack-2.0.2/SRC/slaref.f000644 000766 000024 00000030353 11654534541 015777 0ustar00juliestaff000000 000000 SUBROUTINE SLAREF( TYPE, A, LDA, WANTZ, Z, LDZ, BLOCK, IROW1, $ ICOL1, ISTART, ISTOP, ITMP1, ITMP2, LILOZ, $ LIHIZ, VECS, V2, V3, T1, T2, T3 ) IMPLICIT NONE * * -- ScaLAPACK auxiliary routine (version 1.5) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. LOGICAL BLOCK, WANTZ CHARACTER TYPE INTEGER ICOL1, IROW1, ISTART, ISTOP, ITMP1, ITMP2, LDA, $ LDZ, LIHIZ, LILOZ REAL T1, T2, T3, V2, V3 * .. * .. Array Arguments .. REAL A( LDA, * ), VECS( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * SLAREF applies one or several Householder reflectors of size 3 * to one or two matrices (if column is specified) on either their * rows or columns. * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * If 'R': Apply reflectors to the rows of the matrix * (apply from left) * Otherwise: Apply reflectors to the columns of the matrix * Unchanged on exit. * * A (global input/output) REAL array, (LDA,*) * On entry, the matrix to receive the reflections. * The updated matrix on exit. * * LDA (local input) INTEGER * On entry, the leading dimension of A. Unchanged on exit. * * WANTZ (global input) LOGICAL * If .TRUE., then apply any column reflections to Z as well. * If .FALSE., then do no additional work on Z. * * Z (global input/output) REAL array, (LDZ,*) * On entry, the second matrix to receive column reflections. * This is changed only if WANTZ is set. * * LDZ (local input) INTEGER * On entry, the leading dimension of Z. Unchanged on exit. * * BLOCK (global input) LOGICAL * If .TRUE., then apply several reflectors at once and read * their data from the VECS array. * If .FALSE., apply the single reflector given by V2, V3, * T1, T2, and T3. * * IROW1 (local input/output) INTEGER * On entry, the local row element of A. * Undefined on output. * * * ICOL1 (local input/output) INTEGER * On entry, the local column element of A. * Undefined on output. * * ISTART (global input) INTEGER * Specifies the "number" of the first reflector. This is * used as an index into VECS if BLOCK is set. * ISTART is ignored if BLOCK is .FALSE.. * * ISTOP (global input) INTEGER * Specifies the "number" of the last reflector. This is * used as an index into VECS if BLOCK is set. * ISTOP is ignored if BLOCK is .FALSE.. * * ITMP1 (local input) INTEGER * Starting range into A. For rows, this is the local * first column. For columns, this is the local first row. * * ITMP2 (local input) INTEGER * Ending range into A. For rows, this is the local last * column. For columns, this is the local last row. * * LILOZ * LIHIZ (local input) INTEGER * These serve the same purpose as ITMP1,ITMP2 but for Z * when WANTZ is set. * * VECS (global input) REAL array of size 3*N (matrix * size) * This holds the size 3 reflectors one after another and this * is only accessed when BLOCK is .TRUE. * * V2 * V3 * T1 * T2 * T3 (global input/output) REAL * This holds information on a single size 3 Householder * reflector and is read when BLOCK is .FALSE., and * overwritten when BLOCK is .TRUE. * * Implemented by: G. Henry, May 1, 1997 * * ===================================================================== * * .. Local Scalars .. INTEGER J, K REAL H11, H22, SUM, T12, T13, T22, T23, T32, T33, $ V22, V23, V32, V33, A1, A2, A3, A4, A5, B1, $ B2, B3, B4, B5, TMP1, TMP2, TMP3, SUM1, SUM2, $ SUM3, A11, A22 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * IF( LSAME( TYPE, 'R' ) ) THEN IF( BLOCK ) THEN DO 30 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 10 J = ITMP1, ITMP2-MOD(ITMP2-ITMP1+1,2), 2 A1 = A ( IROW1 , J ) A2 = A ( IROW1+1, J ) A3 = A ( IROW1+2, J ) A4 = A ( IROW1+3, J ) A5 = A ( IROW1+4, J ) B1 = A ( IROW1 , J+1 ) B2 = A ( IROW1+1, J+1 ) B3 = A ( IROW1+2, J+1 ) B4 = A ( IROW1+3, J+1 ) B5 = A ( IROW1+4, J+1 ) SUM1 = A1 + V2*A2 + V3*A3 A( IROW1 , J ) = A1 - SUM1 * T1 H11 = A2 - SUM1 * T2 H22 = A3 - SUM1 * T3 TMP1 = B1 + V2*B2 + V3*B3 A( IROW1 , J+1 ) = B1 - TMP1 * T1 A11 = B2 - TMP1 * T2 A22 = B3 - TMP1 * T3 SUM2 = H11 + V22*H22 + V32*A4 A( IROW1+1, J ) = H11 - SUM2 * T12 H11 = H22 - SUM2 * T22 H22 = A4 - SUM2 * T32 TMP2 = A11 + V22*A22 + V32*B4 A( IROW1+1, J+1 ) = A11 - TMP2 * T12 A11 = A22 - TMP2 * T22 A22 = B4 - TMP2 * T32 SUM3 = H11 + V23*H22 + V33*A5 A( IROW1+2, J ) = H11 - SUM3 * T13 A( IROW1+3, J ) = H22 - SUM3 * T23 A( IROW1+4, J ) = A5 - SUM3 * T33 TMP3 = A11 + V23*A22 + V33*B5 A( IROW1+2, J+1 ) = A11 - TMP3 * T13 A( IROW1+3, J+1 ) = A22 - TMP3 * T23 A( IROW1+4, J+1 ) = B5 - TMP3 * T33 10 CONTINUE DO 20 J = ITMP2-MOD(ITMP2-ITMP1+1,2)+1, ITMP2 SUM = A( IROW1, J ) + V2*A( IROW1+1, J ) + $ V3*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM*T1 H11 = A( IROW1+1, J ) - SUM*T2 H22 = A( IROW1+2, J ) - SUM*T3 SUM = H11 + V22*H22 + V32*A( IROW1+3, J ) A( IROW1+1, J ) = H11 - SUM*T12 H11 = H22 - SUM*T22 H22 = A( IROW1+3, J ) - SUM*T32 SUM = H11 + V23*H22 + V33*A( IROW1+4, J ) A( IROW1+2, J ) = H11 - SUM*T13 A( IROW1+3, J ) = H22 - SUM*T23 A( IROW1+4, J ) = A( IROW1+4, J ) - SUM*T33 20 CONTINUE IROW1 = IROW1 + 3 30 CONTINUE DO 50 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 40 J = ITMP1, ITMP2 SUM = A( IROW1, J ) + V2*A( IROW1+1, J ) + $ V3*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM*T1 A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*T2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*T3 40 CONTINUE IROW1 = IROW1 + 1 50 CONTINUE ELSE DO 60 J = ITMP1, ITMP2 SUM = A( IROW1, J ) + V2*A( IROW1+1, J ) + $ V3*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM*T1 A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*T2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*T3 60 CONTINUE END IF ELSE * * Do column transforms * IF( BLOCK ) THEN DO 90 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 70 J = ITMP1, ITMP2 SUM = A( J, ICOL1 ) + V2*A( J, ICOL1+1 ) + $ V3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM*T1 H11 = A( J, ICOL1+1 ) - SUM*T2 H22 = A( J, ICOL1+2 ) - SUM*T3 SUM = H11 + V22*H22 + V32*A( J, ICOL1+3 ) A( J, ICOL1+1 ) = H11 - SUM*T12 H11 = H22 - SUM*T22 H22 = A( J, ICOL1+3 ) - SUM*T32 SUM = H11 + V23*H22 + V33*A( J, ICOL1+4 ) A( J, ICOL1+2 ) = H11 - SUM*T13 A( J, ICOL1+3 ) = H22 - SUM*T23 A( J, ICOL1+4 ) = A( J, ICOL1+4 ) - SUM*T33 70 CONTINUE IF( WANTZ ) THEN DO 80 J = LILOZ, LIHIZ SUM = Z( J, ICOL1 ) + V2*Z( J, ICOL1+1 ) + $ V3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM*T1 H11 = Z( J, ICOL1+1 ) - SUM*T2 H22 = Z( J, ICOL1+2 ) - SUM*T3 SUM = H11 + V22*H22 + V32*Z( J, ICOL1+3 ) Z( J, ICOL1+1 ) = H11 - SUM*T12 H11 = H22 - SUM*T22 H22 = Z( J, ICOL1+3 ) - SUM*T32 SUM = H11 + V23*H22 + V33*Z( J, ICOL1+4 ) Z( J, ICOL1+2 ) = H11 - SUM*T13 Z( J, ICOL1+3 ) = H22 - SUM*T23 Z( J, ICOL1+4 ) = Z( J, ICOL1+4 ) - SUM*T33 80 CONTINUE END IF ICOL1 = ICOL1 + 3 90 CONTINUE DO 120 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 100 J = ITMP1, ITMP2 SUM = A( J, ICOL1 ) + V2*A( J, ICOL1+1 ) + $ V3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM*T1 A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*T2 A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*T3 100 CONTINUE IF( WANTZ ) THEN DO 110 J = LILOZ, LIHIZ SUM = Z( J, ICOL1 ) + V2*Z( J, ICOL1+1 ) + $ V3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM*T1 Z( J, ICOL1+1 ) = Z( J, ICOL1+1 ) - SUM*T2 Z( J, ICOL1+2 ) = Z( J, ICOL1+2 ) - SUM*T3 110 CONTINUE END IF ICOL1 = ICOL1 + 1 120 CONTINUE ELSE DO 130 J = ITMP1, ITMP2 SUM = A( J, ICOL1 ) + V2*A( J, ICOL1+1 ) + $ V3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM*T1 A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*T2 A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*T3 130 CONTINUE END IF END IF RETURN * * End of SLAREF * END scalapack-2.0.2/SRC/slarrb2.f000644 000766 000024 00000047071 11657111056 016072 0ustar00juliestaff000000 000000 SUBROUTINE SLARRB2( N, D, LLD, IFIRST, ILAST, RTOL1, $ RTOL2, OFFSET, W, WGAP, WERR, WORK, IWORK, $ PIVMIN, LGPVMN, LGSPDM, TWIST, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER IFIRST, ILAST, INFO, N, OFFSET, TWIST REAL LGPVMN, LGSPDM, PIVMIN, $ RTOL1, RTOL2 * .. * .. Array Arguments .. INTEGER IWORK( * ) REAL D( * ), LLD( * ), W( * ), $ WERR( * ), WGAP( * ), WORK( * ) * .. * * Purpose * ======= * * Given the relatively robust representation(RRR) L D L^T, SLARRB2 * does "limited" bisection to refine the eigenvalues of L D L^T, * W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial * guesses for these eigenvalues are input in W, the corresponding estimate * of the error in these guesses and their gaps are input in WERR * and WGAP, respectively. During bisection, intervals * [left, right] are maintained by storing their mid-points and * semi-widths in the arrays W and WERR respectively. * * NOTE: * There are very few minor differences between SLARRB from LAPACK * and this current subroutine SLARRB2. * The most important reason for creating this nearly identical copy * is profiling: in the ScaLAPACK MRRR algorithm, eigenvalue computation * using SLARRB2 is used for refinement in the construction of * the representation tree, as opposed to the initial computation of the * eigenvalues for the root RRR which uses SLARRB. When profiling, * this allows an easy quantification of refinement work vs. computing * eigenvalues of the root. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix. * * D (input) REAL array, dimension (N) * The N diagonal elements of the diagonal matrix D. * * LLD (input) REAL array, dimension (N-1) * The (N-1) elements L(i)*L(i)*D(i). * * IFIRST (input) INTEGER * The index of the first eigenvalue to be computed. * * ILAST (input) INTEGER * The index of the last eigenvalue to be computed. * * RTOL1 (input) REAL * RTOL2 (input) REAL * Tolerance for the convergence of the bisection intervals. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) * where GAP is the (estimated) distance to the nearest * eigenvalue. * * OFFSET (input) INTEGER * Offset for the arrays W, WGAP and WERR, i.e., the IFIRST-OFFSET * through ILAST-OFFSET elements of these arrays are to be used. * * W (input/output) REAL array, dimension (N) * On input, W( IFIRST-OFFSET ) through W( ILAST-OFFSET ) are * estimates of the eigenvalues of L D L^T indexed IFIRST through ILAST. * On output, these estimates are refined. * * WGAP (input/output) REAL array, dimension (N-1) * On input, the (estimated) gaps between consecutive * eigenvalues of L D L^T, i.e., WGAP(I-OFFSET) is the gap between * eigenvalues I and I+1. Note that if IFIRST.EQ.ILAST * then WGAP(IFIRST-OFFSET) must be set to ZERO. * On output, these gaps are refined. * * WERR (input/output) REAL array, dimension (N) * On input, WERR( IFIRST-OFFSET ) through WERR( ILAST-OFFSET ) are * the errors in the estimates of the corresponding elements in W. * On output, these errors are refined. * * WORK (workspace) REAL array, dimension (4*N) * Workspace. * * IWORK (workspace) INTEGER array, dimension (2*N) * Workspace. * * PIVMIN (input) REAL * The minimum pivot in the sturm sequence. * * LGPVMN (input) REAL * Logarithm of PIVMIN, precomputed. * * LGSPDM (input) REAL * Logarithm of the spectral diameter, precomputed. * * TWIST (input) INTEGER * The twist index for the twisted factorization that is used * for the negcount. * TWIST = N: Compute negcount from L D L^T - LAMBDA I = L+ D+ L+^T * TWIST = 1: Compute negcount from L D L^T - LAMBDA I = U- D- U-^T * TWIST = R: Compute negcount from L D L^T - LAMBDA I = N(r) D(r) N(r) * * INFO (output) INTEGER * Error flag. * * .. Parameters .. REAL ZERO, TWO, HALF PARAMETER ( ZERO = 0.0E0, TWO = 2.0E0, $ HALF = 0.5E0 ) INTEGER MAXITR * .. * .. Local Scalars .. INTEGER I, I1, II, INDLLD, IP, ITER, J, K, NEGCNT, $ NEXT, NINT, OLNINT, PREV, R REAL BACK, CVRGD, GAP, LEFT, LGAP, MID, MNWDTH, $ RGAP, RIGHT, SAVGAP, TMP, WIDTH LOGICAL PARANOID * .. * .. External Functions .. LOGICAL SISNAN REAL SLAMCH INTEGER SLANEG2A EXTERNAL SISNAN, SLAMCH, $ SLANEG2A * * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * INFO = 0 * * Turn on paranoid check for rounding errors * invalidating uncertainty intervals of eigenvalues * PARANOID = .TRUE. * MAXITR = INT( ( LGSPDM - LGPVMN ) / LOG( TWO ) ) + 2 MNWDTH = TWO * PIVMIN * R = TWIST * INDLLD = 2*N DO 5 J = 1, N-1 I=2*J WORK(INDLLD+I-1) = D(J) WORK(INDLLD+I) = LLD(J) 5 CONTINUE WORK(INDLLD+2*N-1) = D(N) * IF((R.LT.1).OR.(R.GT.N)) R = N * * Initialize unconverged intervals in [ WORK(2*I-1), WORK(2*I) ]. * The Sturm Count, Count( WORK(2*I-1) ) is arranged to be I-1, while * Count( WORK(2*I) ) is stored in IWORK( 2*I ). The integer IWORK( 2*I-1 ) * for an unconverged interval is set to the index of the next unconverged * interval, and is -1 or 0 for a converged interval. Thus a linked * list of unconverged intervals is set up. * I1 = IFIRST * The number of unconverged intervals NINT = 0 * The last unconverged interval found PREV = 0 RGAP = WGAP( I1-OFFSET ) DO 75 I = I1, ILAST K = 2*I II = I - OFFSET LEFT = W( II ) - WERR( II ) RIGHT = W( II ) + WERR( II ) LGAP = RGAP RGAP = WGAP( II ) GAP = MIN( LGAP, RGAP ) IF((ABS(LEFT).LE.16*PIVMIN).OR.(ABS(RIGHT).LE.16*PIVMIN)) $ THEN INFO = -1 RETURN ENDIF IF( PARANOID ) THEN * Make sure that [LEFT,RIGHT] contains the desired eigenvalue * Compute negcount from dstqds facto L+D+L+^T = L D L^T - LEFT * * Do while( NEGCNT(LEFT).GT.I-1 ) * BACK = WERR( II ) 20 CONTINUE NEGCNT = SLANEG2A( N, WORK(INDLLD+1), LEFT, PIVMIN, R ) IF( NEGCNT.GT.I-1 ) THEN LEFT = LEFT - BACK BACK = TWO*BACK GO TO 20 END IF * * Do while( NEGCNT(RIGHT).LT.I ) * Compute negcount from dstqds facto L+D+L+^T = L D L^T - RIGHT * BACK = WERR( II ) 50 CONTINUE NEGCNT = SLANEG2A( N, WORK(INDLLD+1),RIGHT, PIVMIN, R ) IF( NEGCNT.LT.I ) THEN RIGHT = RIGHT + BACK BACK = TWO*BACK GO TO 50 END IF ENDIF WIDTH = HALF*ABS( LEFT - RIGHT ) TMP = MAX( ABS( LEFT ), ABS( RIGHT ) ) CVRGD = MAX(RTOL1*GAP,RTOL2*TMP) IF( WIDTH.LE.CVRGD .OR. WIDTH.LE.MNWDTH ) THEN * This interval has already converged and does not need refinement. * (Note that the gaps might change through refining the * eigenvalues, however, they can only get bigger.) * Remove it from the list. IWORK( K-1 ) = -1 * Make sure that I1 always points to the first unconverged interval IF((I.EQ.I1).AND.(I.LT.ILAST)) I1 = I + 1 IF((PREV.GE.I1).AND.(I.LE.ILAST)) IWORK( 2*PREV-1 ) = I + 1 ELSE * unconverged interval found PREV = I NINT = NINT + 1 IWORK( K-1 ) = I + 1 IWORK( K ) = NEGCNT END IF WORK( K-1 ) = LEFT WORK( K ) = RIGHT 75 CONTINUE * * Do while( NINT.GT.0 ), i.e. there are still unconverged intervals * and while (ITER.LT.MAXITR) * ITER = 0 80 CONTINUE PREV = I1 - 1 I = I1 OLNINT = NINT DO 100 IP = 1, OLNINT K = 2*I II = I - OFFSET RGAP = WGAP( II ) LGAP = RGAP IF(II.GT.1) LGAP = WGAP( II-1 ) GAP = MIN( LGAP, RGAP ) NEXT = IWORK( K-1 ) LEFT = WORK( K-1 ) RIGHT = WORK( K ) MID = HALF*( LEFT + RIGHT ) * semiwidth of interval WIDTH = RIGHT - MID TMP = MAX( ABS( LEFT ), ABS( RIGHT ) ) CVRGD = MAX(RTOL1*GAP,RTOL2*TMP) IF( ( WIDTH.LE.CVRGD ) .OR. ( WIDTH.LE.MNWDTH ).OR. $ ( ITER.EQ.MAXITR ) )THEN * reduce number of unconverged intervals NINT = NINT - 1 * Mark interval as converged. IWORK( K-1 ) = 0 IF( I1.EQ.I ) THEN I1 = NEXT ELSE * Prev holds the last unconverged interval previously examined IF(PREV.GE.I1) IWORK( 2*PREV-1 ) = NEXT END IF I = NEXT GO TO 100 END IF PREV = I * * Perform one bisection step * NEGCNT = SLANEG2A( N, WORK(INDLLD+1), MID, PIVMIN, R ) IF( NEGCNT.LE.I-1 ) THEN WORK( K-1 ) = MID ELSE WORK( K ) = MID END IF I = NEXT 100 CONTINUE ITER = ITER + 1 * do another loop if there are still unconverged intervals * However, in the last iteration, all intervals are accepted * since this is the best we can do. IF( ( NINT.GT.0 ).AND.(ITER.LE.MAXITR) ) GO TO 80 * * * At this point, all the intervals have converged * * save this gap to restore it after the loop SAVGAP = WGAP( ILAST-OFFSET ) * LEFT = WORK( 2*IFIRST-1 ) DO 110 I = IFIRST, ILAST K = 2*I II = I - OFFSET * RIGHT is the right boundary of this current interval RIGHT = WORK( K ) * All intervals marked by '0' have been refined. IF( IWORK( K-1 ).EQ.0 ) THEN W( II ) = HALF*( LEFT+RIGHT ) WERR( II ) = RIGHT - W( II ) END IF * Left is the boundary of the next interval LEFT = WORK( K +1 ) WGAP( II ) = MAX( ZERO, LEFT - RIGHT ) 110 CONTINUE * restore the last gap which was overwritten by garbage WGAP( ILAST-OFFSET ) = SAVGAP RETURN * * End of SLARRB2 * END * * * FUNCTION SLANEG2( N, D, LLD, SIGMA, PIVMIN, R ) * IMPLICIT NONE * INTEGER SLANEG2 * * .. Scalar Arguments .. INTEGER N, R REAL PIVMIN, SIGMA * .. * .. Array Arguments .. REAL D( * ), LLD( * ) * REAL ZERO PARAMETER ( ZERO = 0.0E0 ) INTEGER BLKLEN PARAMETER ( BLKLEN = 2048 ) * .. * .. Local Scalars .. INTEGER BJ, J, NEG1, NEG2, NEGCNT, TO REAL DMINUS, DPLUS, GAMMA, P, S, T, TMP, XSAV LOGICAL SAWNAN * .. * .. External Functions .. LOGICAL SISNAN EXTERNAL SISNAN NEGCNT = 0 * * I) upper part: L D L^T - SIGMA I = L+ D+ L+^T * run dstqds block-wise to avoid excessive work when NaNs occur * S = ZERO DO 210 BJ = 1, R-1, BLKLEN NEG1 = 0 XSAV = S TO = BJ+BLKLEN-1 IF ( TO.LE.R-1 ) THEN DO 21 J = BJ, TO T = S - SIGMA DPLUS = D( J ) + T IF( DPLUS.LT.ZERO ) NEG1=NEG1 + 1 S = T*LLD( J ) / DPLUS 21 CONTINUE ELSE DO 22 J = BJ, R-1 T = S - SIGMA DPLUS = D( J ) + T IF( DPLUS.LT.ZERO ) NEG1=NEG1 + 1 S = T*LLD( J ) / DPLUS 22 CONTINUE ENDIF SAWNAN = SISNAN( S ) * IF( SAWNAN ) THEN NEG1 = 0 S = XSAV TO = BJ+BLKLEN-1 IF ( TO.LE.R-1 ) THEN DO 23 J = BJ, TO T = S - SIGMA DPLUS = D( J ) + T IF(ABS(DPLUS).LT.PIVMIN) $ DPLUS = -PIVMIN TMP = LLD( J ) / DPLUS IF( DPLUS.LT.ZERO ) $ NEG1 = NEG1 + 1 S = T*TMP IF( TMP.EQ.ZERO ) S = LLD( J ) 23 CONTINUE ELSE DO 24 J = BJ, R-1 T = S - SIGMA DPLUS = D( J ) + T IF(ABS(DPLUS).LT.PIVMIN) $ DPLUS = -PIVMIN TMP = LLD( J ) / DPLUS IF( DPLUS.LT.ZERO ) NEG1=NEG1+1 S = T*TMP IF( TMP.EQ.ZERO ) S = LLD( J ) 24 CONTINUE ENDIF END IF NEGCNT = NEGCNT + NEG1 210 CONTINUE * * II) lower part: L D L^T - SIGMA I = U- D- U-^T * P = D( N ) - SIGMA DO 230 BJ = N-1, R, -BLKLEN NEG2 = 0 XSAV = P TO = BJ-BLKLEN+1 IF ( TO.GE.R ) THEN DO 25 J = BJ, TO, -1 DMINUS = LLD( J ) + P IF( DMINUS.LT.ZERO ) NEG2=NEG2+1 TMP = P / DMINUS P = TMP * D( J ) - SIGMA 25 CONTINUE ELSE DO 26 J = BJ, R, -1 DMINUS = LLD( J ) + P IF( DMINUS.LT.ZERO ) NEG2=NEG2+1 TMP = P / DMINUS P = TMP * D( J ) - SIGMA 26 CONTINUE ENDIF SAWNAN = SISNAN( P ) * IF( SAWNAN ) THEN NEG2 = 0 P = XSAV TO = BJ-BLKLEN+1 IF ( TO.GE.R ) THEN DO 27 J = BJ, TO, -1 DMINUS = LLD( J ) + P IF(ABS(DMINUS).LT.PIVMIN) $ DMINUS = -PIVMIN TMP = D( J ) / DMINUS IF( DMINUS.LT.ZERO ) $ NEG2 = NEG2 + 1 P = P*TMP - SIGMA IF( TMP.EQ.ZERO ) $ P = D( J ) - SIGMA 27 CONTINUE ELSE DO 28 J = BJ, R, -1 DMINUS = LLD( J ) + P IF(ABS(DMINUS).LT.PIVMIN) $ DMINUS = -PIVMIN TMP = D( J ) / DMINUS IF( DMINUS.LT.ZERO ) $ NEG2 = NEG2 + 1 P = P*TMP - SIGMA IF( TMP.EQ.ZERO ) $ P = D( J ) - SIGMA 28 CONTINUE ENDIF END IF NEGCNT = NEGCNT + NEG2 230 CONTINUE * * III) Twist index * GAMMA = S + P IF( GAMMA.LT.ZERO ) NEGCNT = NEGCNT+1 SLANEG2 = NEGCNT END * * * FUNCTION SLANEG2A( N, DLLD, SIGMA, PIVMIN, R ) * IMPLICIT NONE * INTEGER SLANEG2A * * .. Scalar Arguments .. INTEGER N, R REAL PIVMIN, SIGMA * .. * .. Array Arguments .. REAL DLLD( * ) * REAL ZERO PARAMETER ( ZERO = 0.0E0 ) INTEGER BLKLEN PARAMETER ( BLKLEN = 512 ) * * .. * .. Intrinsic Functions .. INTRINSIC INT * .. * .. Local Scalars .. INTEGER BJ, I, J, NB, NEG1, NEG2, NEGCNT, NX REAL DMINUS, DPLUS, GAMMA, P, S, T, TMP, XSAV LOGICAL SAWNAN * .. * .. External Functions .. LOGICAL SISNAN EXTERNAL SISNAN NEGCNT = 0 * * I) upper part: L D L^T - SIGMA I = L+ D+ L+^T * run dstqds block-wise to avoid excessive work when NaNs occur, * first in chunks of size BLKLEN and then the remainder * NB = INT((R-1)/BLKLEN) NX = NB*BLKLEN S = ZERO DO 210 BJ = 1, NX, BLKLEN NEG1 = 0 XSAV = S DO 21 J = BJ, BJ+BLKLEN-1 I = 2*J T = S - SIGMA DPLUS = DLLD( I-1 ) + T IF( DPLUS.LT.ZERO ) NEG1=NEG1 + 1 S = T*DLLD( I ) / DPLUS 21 CONTINUE SAWNAN = SISNAN( S ) * IF( SAWNAN ) THEN NEG1 = 0 S = XSAV DO 23 J = BJ, BJ+BLKLEN-1 I = 2*J T = S - SIGMA DPLUS = DLLD( I-1 ) + T IF(ABS(DPLUS).LT.PIVMIN) $ DPLUS = -PIVMIN TMP = DLLD( I ) / DPLUS IF( DPLUS.LT.ZERO ) $ NEG1 = NEG1 + 1 S = T*TMP IF( TMP.EQ.ZERO ) S = DLLD( I ) 23 CONTINUE END IF NEGCNT = NEGCNT + NEG1 210 CONTINUE * NEG1 = 0 XSAV = S DO 22 J = NX+1, R-1 I = 2*J T = S - SIGMA DPLUS = DLLD( I-1 ) + T IF( DPLUS.LT.ZERO ) NEG1=NEG1 + 1 S = T*DLLD( I ) / DPLUS 22 CONTINUE SAWNAN = SISNAN( S ) * IF( SAWNAN ) THEN NEG1 = 0 S = XSAV DO 24 J = NX+1, R-1 I = 2*J T = S - SIGMA DPLUS = DLLD( I-1 ) + T IF(ABS(DPLUS).LT.PIVMIN) $ DPLUS = -PIVMIN TMP = DLLD( I ) / DPLUS IF( DPLUS.LT.ZERO ) NEG1=NEG1+1 S = T*TMP IF( TMP.EQ.ZERO ) S = DLLD( I ) 24 CONTINUE ENDIF NEGCNT = NEGCNT + NEG1 * * II) lower part: L D L^T - SIGMA I = U- D- U-^T * NB = INT((N-R)/BLKLEN) NX = N-NB*BLKLEN P = DLLD( 2*N-1 ) - SIGMA DO 230 BJ = N-1, NX, -BLKLEN NEG2 = 0 XSAV = P DO 25 J = BJ, BJ-BLKLEN+1, -1 I = 2*J DMINUS = DLLD( I ) + P IF( DMINUS.LT.ZERO ) NEG2=NEG2+1 TMP = P / DMINUS P = TMP * DLLD( I-1 ) - SIGMA 25 CONTINUE SAWNAN = SISNAN( P ) * IF( SAWNAN ) THEN NEG2 = 0 P = XSAV DO 27 J = BJ, BJ-BLKLEN+1, -1 I = 2*J DMINUS = DLLD( I ) + P IF(ABS(DMINUS).LT.PIVMIN) $ DMINUS = -PIVMIN TMP = DLLD( I-1 ) / DMINUS IF( DMINUS.LT.ZERO ) $ NEG2 = NEG2 + 1 P = P*TMP - SIGMA IF( TMP.EQ.ZERO ) $ P = DLLD( I-1 ) - SIGMA 27 CONTINUE END IF NEGCNT = NEGCNT + NEG2 230 CONTINUE NEG2 = 0 XSAV = P DO 26 J = NX-1, R, -1 I = 2*J DMINUS = DLLD( I ) + P IF( DMINUS.LT.ZERO ) NEG2=NEG2+1 TMP = P / DMINUS P = TMP * DLLD( I-1 ) - SIGMA 26 CONTINUE SAWNAN = SISNAN( P ) * IF( SAWNAN ) THEN NEG2 = 0 P = XSAV DO 28 J = NX-1, R, -1 I = 2*J DMINUS = DLLD( I ) + P IF(ABS(DMINUS).LT.PIVMIN) $ DMINUS = -PIVMIN TMP = DLLD( I-1 ) / DMINUS IF( DMINUS.LT.ZERO ) $ NEG2 = NEG2 + 1 P = P*TMP - SIGMA IF( TMP.EQ.ZERO ) $ P = DLLD( I-1 ) - SIGMA 28 CONTINUE END IF NEGCNT = NEGCNT + NEG2 * * III) Twist index * GAMMA = S + P IF( GAMMA.LT.ZERO ) NEGCNT = NEGCNT+1 SLANEG2A = NEGCNT END scalapack-2.0.2/SRC/slarrd2.f000644 000766 000024 00000060026 11657111056 016067 0ustar00juliestaff000000 000000 SUBROUTINE SLARRD2( RANGE, ORDER, N, VL, VU, IL, IU, GERS, $ RELTOL, D, E, E2, PIVMIN, NSPLIT, ISPLIT, $ M, W, WERR, WL, WU, IBLOCK, INDEXW, $ WORK, IWORK, DOL, DOU, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * * .. Scalar Arguments .. CHARACTER ORDER, RANGE INTEGER DOL, DOU, IL, INFO, IU, M, N, NSPLIT REAL PIVMIN, RELTOL, VL, VU, WL, WU * .. * .. Array Arguments .. INTEGER IBLOCK( * ), INDEXW( * ), $ ISPLIT( * ), IWORK( * ) REAL D( * ), E( * ), E2( * ), $ GERS( * ), W( * ), WERR( * ), WORK( * ) * .. * * Purpose * ======= * * SLARRD2 computes the eigenvalues of a symmetric tridiagonal * matrix T to limited initial accuracy. This is an auxiliary code to be * called from SLARRE2A. * * SLARRD2 has been created using the LAPACK code SLARRD * which itself stems from SSTEBZ. The motivation for creating * SLARRD2 is efficiency: When computing eigenvalues in parallel * and the input tridiagonal matrix splits into blocks, SLARRD2 * can skip over blocks which contain none of the eigenvalues from * DOL to DOU for which the processor responsible. In extreme cases (such * as large matrices consisting of many blocks of small size, e.g. 2x2, * the gain can be substantial. * * Arguments * ========= * * RANGE (input) CHARACTER * = 'A': ("All") all eigenvalues will be found. * = 'V': ("Value") all eigenvalues in the half-open interval * (VL, VU] will be found. * = 'I': ("Index") the IL-th through IU-th eigenvalues (of the * entire matrix) will be found. * * ORDER (input) CHARACTER * = 'B': ("By Block") the eigenvalues will be grouped by * split-off block (see IBLOCK, ISPLIT) and * ordered from smallest to largest within * the block. * = 'E': ("Entire matrix") * the eigenvalues for the entire matrix * will be ordered from smallest to * largest. * * N (input) INTEGER * The order of the tridiagonal matrix T. N >= 0. * * VL (input) REAL * VU (input) REAL * If RANGE='V', the lower and upper bounds of the interval to * be searched for eigenvalues. Eigenvalues less than or equal * to VL, or greater than VU, will not be returned. VL < VU. * Not referenced if RANGE = 'A' or 'I'. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. * Not referenced if RANGE = 'A' or 'V'. * * GERS (input) REAL array, dimension (2*N) * The N Gerschgorin intervals (the i-th Gerschgorin interval * is (GERS(2*i-1), GERS(2*i)). * * RELTOL (input) REAL * The minimum relative width of an interval. When an interval * is narrower than RELTOL times the larger (in * magnitude) endpoint, then it is considered to be * sufficiently small, i.e., converged. Note: this should * always be at least radix*machine epsilon. * * D (input) REAL array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. * * E (input) REAL array, dimension (N-1) * The (n-1) off-diagonal elements of the tridiagonal matrix T. * * E2 (input) REAL array, dimension (N-1) * The (n-1) squared off-diagonal elements of the tridiagonal matrix T. * * PIVMIN (input) REAL * The minimum pivot allowed in the sturm sequence for T. * * NSPLIT (input) INTEGER * The number of diagonal blocks in the matrix T. * 1 <= NSPLIT <= N. * * ISPLIT (input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N. * (Only the first NSPLIT elements will actually be used, but * since the user cannot know a priori what value NSPLIT will * have, N words must be reserved for ISPLIT.) * * M (output) INTEGER * The actual number of eigenvalues found. 0 <= M <= N. * (See also the description of INFO=2,3.) * * W (output) REAL array, dimension (N) * On exit, the first M elements of W will contain the * eigenvalue approximations. SLARRD2 computes an interval * I_j = (a_j, b_j] that includes eigenvalue j. The eigenvalue * approximation is given as the interval midpoint * W(j)= ( a_j + b_j)/2. The corresponding error is bounded by * WERR(j) = abs( a_j - b_j)/2 * * WERR (output) REAL array, dimension (N) * The error bound on the corresponding eigenvalue approximation * in W. * * WL (output) REAL * WU (output) REAL * The interval (WL, WU] contains all the wanted eigenvalues. * If RANGE='V', then WL=VL and WU=VU. * If RANGE='A', then WL and WU are the global Gerschgorin bounds * on the spectrum. * If RANGE='I', then WL and WU are computed by SLAEBZ from the * index range specified. * * IBLOCK (output) INTEGER array, dimension (N) * At each row/column j where E(j) is zero or small, the * matrix T is considered to split into a block diagonal * matrix. On exit, if INFO = 0, IBLOCK(i) specifies to which * block (from 1 to the number of blocks) the eigenvalue W(i) * belongs. (SLARRD2 may use the remaining N-M elements as * workspace.) * * INDEXW (output) INTEGER array, dimension (N) * The indices of the eigenvalues within each block (submatrix); * for example, INDEXW(i)= j and IBLOCK(i)=k imply that the * i-th eigenvalue W(i) is the j-th eigenvalue in block k. * * WORK (workspace) REAL array, dimension (4*N) * * IWORK (workspace) INTEGER array, dimension (3*N) * * DOL (input) INTEGER * DOU (input) INTEGER * If the user wants to work on only a selected part of the * representation tree, he can specify an index range DOL:DOU. * Otherwise, the setting DOL=1, DOU=N should be applied. * Note that DOL and DOU refer to the order in which the eigenvalues * are stored in W. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: some or all of the eigenvalues failed to converge or * were not computed: * =1 or 3: Bisection failed to converge for some * eigenvalues; these eigenvalues are flagged by a * negative block number. The effect is that the * eigenvalues may not be as accurate as the * absolute and relative tolerances. This is * generally caused by unexpectedly inaccurate * arithmetic. * =2 or 3: RANGE='I' only: Not all of the eigenvalues * IL:IU were found. * Effect: M < IU+1-IL * Cause: non-monotonic arithmetic, causing the * Sturm sequence to be non-monotonic. * Cure: recalculate, using RANGE='A', and pick * out eigenvalues IL:IU. In some cases, * increasing the PARAMETER "FUDGE" may * make things work. * = 4: RANGE='I', and the Gershgorin interval * initially used was too small. No eigenvalues * were computed. * Probable cause: your machine has sloppy * floating-point arithmetic. * Cure: Increase the PARAMETER "FUDGE", * recompile, and try again. * * Internal Parameters * =================== * * FUDGE REAL , default = 2 originally, increased to 10. * A "fudge factor" to widen the Gershgorin intervals. Ideally, * a value of 1 should work, but on machines with sloppy * arithmetic, this needs to be larger. The default for * publicly released versions should be large enough to handle * the worst machine around. Note that this has no effect * on accuracy of the solution. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE, TWO, HALF, FUDGE PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, $ TWO = 2.0E0, HALF = ONE/TWO, $ FUDGE = 10.0E0 ) * .. * .. Local Scalars .. LOGICAL NCNVRG, TOOFEW INTEGER I, IB, IBEGIN, IDISCL, IDISCU, IE, IEND, IINFO, $ IM, IN, IOFF, IORDER, IOUT, IRANGE, ITMAX, $ ITMP1, ITMP2, IW, IWOFF, J, JBLK, JDISC, JE, $ JEE, NB, NWL, NWU REAL ATOLI, EPS, GL, GU, RTOLI, SPDIAM, TMP1, TMP2, $ TNORM, UFLOW, WKILL, WLU, WUL * .. * .. Local Arrays .. INTEGER IDUMMA( 1 ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV REAL SLAMCH EXTERNAL LSAME, ILAENV, SLAMCH * .. * .. External Subroutines .. EXTERNAL SLAEBZ * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN, SQRT * .. * .. Executable Statements .. * INFO = 0 * * Decode RANGE * IF( LSAME( RANGE, 'A' ) ) THEN IRANGE = 1 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IRANGE = 2 ELSE IF( LSAME( RANGE, 'I' ) ) THEN IRANGE = 3 ELSE IRANGE = 0 END IF * * Decode ORDER * IF( LSAME( ORDER, 'B' ) ) THEN IORDER = 2 ELSE IF( LSAME( ORDER, 'E' ) ) THEN IORDER = 1 ELSE IORDER = 0 END IF * * Check for Errors * IF( IRANGE.LE.0 ) THEN INFO = -1 ELSE IF( IORDER.LE.0 ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IRANGE.EQ.2 ) THEN IF( VL.GE.VU ) $ INFO = -5 ELSE IF( IRANGE.EQ.3 .AND. ( IL.LT.1 .OR. IL.GT.MAX( 1, N ) ) ) $ THEN INFO = -6 ELSE IF( IRANGE.EQ.3 .AND. ( IU.LT.MIN( N, IL ) .OR. IU.GT.N ) ) $ THEN INFO = -7 END IF * IF( INFO.NE.0 ) THEN RETURN END IF * Initialize error flags INFO = 0 NCNVRG = .FALSE. TOOFEW = .FALSE. * Quick return if possible M = 0 IF( N.EQ.0 ) RETURN * Simplification: IF( IRANGE.EQ.3 .AND. IL.EQ.1 .AND. IU.EQ.N ) IRANGE = 1 * Get machine constants EPS = SLAMCH( 'P' ) UFLOW = SLAMCH( 'U' ) * Special Case when N=1 * Treat case of 1x1 matrix for quick return IF( N.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR. $ ((IRANGE.EQ.2).AND.(D(1).GT.VL).AND.(D(1).LE.VU)).OR. $ ((IRANGE.EQ.3).AND.(IL.EQ.1).AND.(IU.EQ.1)) ) THEN M = 1 W(1) = D(1) * The computation error of the eigenvalue is zero WERR(1) = ZERO IBLOCK( 1 ) = 1 INDEXW( 1 ) = 1 ENDIF RETURN END IF * NB is the minimum vector length for vector bisection, or 0 * if only scalar is to be done. NB = ILAENV( 1, 'SSTEBZ', ' ', N, -1, -1, -1 ) IF( NB.LE.1 ) NB = 0 * Find global spectral radius GL = D(1) GU = D(1) DO 5 I = 1,N GL = MIN( GL, GERS( 2*I - 1)) GU = MAX( GU, GERS(2*I) ) 5 CONTINUE * Compute global Gerschgorin bounds and spectral diameter TNORM = MAX( ABS( GL ), ABS( GU ) ) GL = GL - FUDGE*TNORM*EPS*N - FUDGE*TWO*PIVMIN GU = GU + FUDGE*TNORM*EPS*N + FUDGE*TWO*PIVMIN SPDIAM = GU - GL * Input arguments for SLAEBZ: * The relative tolerance. An interval (a,b] lies within * "relative tolerance" if b-a < RELTOL*max(|a|,|b|), RTOLI = RELTOL ATOLI = FUDGE*TWO*UFLOW + FUDGE*TWO*PIVMIN IF( IRANGE.EQ.3 ) THEN * RANGE='I': Compute an interval containing eigenvalues * IL through IU. The initial interval [GL,GU] from the global * Gerschgorin bounds GL and GU is refined by SLAEBZ. ITMAX = INT( ( LOG( TNORM+PIVMIN )-LOG( PIVMIN ) ) / $ LOG( TWO ) ) + 2 WORK( N+1 ) = GL WORK( N+2 ) = GL WORK( N+3 ) = GU WORK( N+4 ) = GU WORK( N+5 ) = GL WORK( N+6 ) = GU IWORK( 1 ) = -1 IWORK( 2 ) = -1 IWORK( 3 ) = N + 1 IWORK( 4 ) = N + 1 IWORK( 5 ) = IL - 1 IWORK( 6 ) = IU * CALL SLAEBZ( 3, ITMAX, N, 2, 2, NB, ATOLI, RTOLI, PIVMIN, $ D, E, E2, IWORK( 5 ), WORK( N+1 ), WORK( N+5 ), IOUT, $ IWORK, W, IBLOCK, IINFO ) IF( IINFO .NE. 0 ) THEN INFO = IINFO RETURN END IF * On exit, output intervals may not be ordered by ascending negcount IF( IWORK( 6 ).EQ.IU ) THEN WL = WORK( N+1 ) WLU = WORK( N+3 ) NWL = IWORK( 1 ) WU = WORK( N+4 ) WUL = WORK( N+2 ) NWU = IWORK( 4 ) ELSE WL = WORK( N+2 ) WLU = WORK( N+4 ) NWL = IWORK( 2 ) WU = WORK( N+3 ) WUL = WORK( N+1 ) NWU = IWORK( 3 ) END IF * On exit, the interval [WL, WLU] contains a value with negcount NWL, * and [WUL, WU] contains a value with negcount NWU. IF( NWL.LT.0 .OR. NWL.GE.N .OR. NWU.LT.1 .OR. NWU.GT.N ) THEN INFO = 4 RETURN END IF ELSEIF( IRANGE.EQ.2 ) THEN WL = VL WU = VU ELSEIF( IRANGE.EQ.1 ) THEN WL = GL WU = GU ENDIF * Find Eigenvalues -- Loop Over blocks and recompute NWL and NWU. * NWL accumulates the number of eigenvalues .le. WL, * NWU accumulates the number of eigenvalues .le. WU M = 0 IEND = 0 INFO = 0 NWL = 0 NWU = 0 * DO 70 JBLK = 1, NSPLIT IOFF = IEND IBEGIN = IOFF + 1 IEND = ISPLIT( JBLK ) IN = IEND - IOFF * IF( IRANGE.EQ.1 ) THEN IF( (IEND.LT.DOL).OR.(IBEGIN.GT.DOU) ) THEN * the local block contains none of eigenvalues that matter * to this processor NWU = NWU + IN DO 30 J = 1, IN M = M + 1 IBLOCK( M ) = JBLK 30 CONTINUE GO TO 70 END IF END IF IF( IN.EQ.1 ) THEN * 1x1 block IF( WL.GE.D( IBEGIN )-PIVMIN ) $ NWL = NWL + 1 IF( WU.GE.D( IBEGIN )-PIVMIN ) $ NWU = NWU + 1 IF( IRANGE.EQ.1 .OR. ( WL.LT.D( IBEGIN )-PIVMIN .AND. WU.GE. $ D( IBEGIN )-PIVMIN ) ) THEN M = M + 1 W( M ) = D( IBEGIN ) WERR(M) = ZERO * The gap for a single block doesn't matter for the later * algorithm and is assigned an arbitrary large value IBLOCK( M ) = JBLK INDEXW( M ) = 1 END IF ELSE * General Case - block of size IN > 2 * Compute local Gerschgorin interval and use it as the initial * interval for SLAEBZ GU = D( IBEGIN ) GL = D( IBEGIN ) TMP1 = ZERO DO 40 J = IBEGIN, IEND GL = MIN( GL, GERS( 2*J - 1)) GU = MAX( GU, GERS(2*J) ) 40 CONTINUE SPDIAM = GU - GL GL = GL - FUDGE*TNORM*EPS*IN - FUDGE*PIVMIN GU = GU + FUDGE*TNORM*EPS*IN + FUDGE*PIVMIN * IF( IRANGE.GT.1 ) THEN IF( GU.LT.WL ) THEN * the local block contains none of the wanted eigenvalues NWL = NWL + IN NWU = NWU + IN GO TO 70 END IF * refine search interval if possible, only range (WL,WU] matters GL = MAX( GL, WL ) GU = MIN( GU, WU ) IF( GL.GE.GU ) $ GO TO 70 END IF * Find negcount of initial interval boundaries GL and GU WORK( N+1 ) = GL WORK( N+IN+1 ) = GU CALL SLAEBZ( 1, 0, IN, IN, 1, NB, ATOLI, RTOLI, PIVMIN, $ D( IBEGIN ), E( IBEGIN ), E2( IBEGIN ), $ IDUMMA, WORK( N+1 ), WORK( N+2*IN+1 ), IM, $ IWORK, W( M+1 ), IBLOCK( M+1 ), IINFO ) IF( IINFO .NE. 0 ) THEN INFO = IINFO RETURN END IF * NWL = NWL + IWORK( 1 ) NWU = NWU + IWORK( IN+1 ) IWOFF = M - IWORK( 1 ) * Compute Eigenvalues ITMAX = INT( ( LOG( GU-GL+PIVMIN )-LOG( PIVMIN ) ) / $ LOG( TWO ) ) + 2 CALL SLAEBZ( 2, ITMAX, IN, IN, 1, NB, ATOLI, RTOLI, PIVMIN, $ D( IBEGIN ), E( IBEGIN ), E2( IBEGIN ), $ IDUMMA, WORK( N+1 ), WORK( N+2*IN+1 ), IOUT, $ IWORK, W( M+1 ), IBLOCK( M+1 ), IINFO ) IF( IINFO .NE. 0 ) THEN INFO = IINFO RETURN END IF * * Copy eigenvalues into W and IBLOCK * Use -JBLK for block number for unconverged eigenvalues. * Loop over the number of output intervals from SLAEBZ DO 60 J = 1, IOUT * eigenvalue approximation is middle point of interval TMP1 = HALF*( WORK( J+N )+WORK( J+IN+N ) ) * semi length of error interval TMP2 = HALF*ABS( WORK( J+N )-WORK( J+IN+N ) ) IF( J.GT.IOUT-IINFO ) THEN * Flag non-convergence. NCNVRG = .TRUE. IB = -JBLK ELSE IB = JBLK END IF DO 50 JE = IWORK( J ) + 1 + IWOFF, $ IWORK( J+IN ) + IWOFF W( JE ) = TMP1 WERR( JE ) = TMP2 INDEXW( JE ) = JE - IWOFF IBLOCK( JE ) = IB 50 CONTINUE 60 CONTINUE * M = M + IM END IF 70 CONTINUE * If RANGE='I', then (WL,WU) contains eigenvalues NWL+1,...,NWU * If NWL+1 < IL or NWU > IU, discard extra eigenvalues. IF( IRANGE.EQ.3 ) THEN IDISCL = IL - 1 - NWL IDISCU = NWU - IU * IF( IDISCL.GT.0 ) THEN IM = 0 DO 80 JE = 1, M * Remove some of the smallest eigenvalues from the left so that * at the end IDISCL =0. Move all eigenvalues up to the left. IF( W( JE ).LE.WLU .AND. IDISCL.GT.0 ) THEN IDISCL = IDISCL - 1 ELSE IM = IM + 1 W( IM ) = W( JE ) WERR( IM ) = WERR( JE ) INDEXW( IM ) = INDEXW( JE ) IBLOCK( IM ) = IBLOCK( JE ) END IF 80 CONTINUE M = IM END IF IF( IDISCU.GT.0 ) THEN * Remove some of the largest eigenvalues from the right so that * at the end IDISCU =0. Move all eigenvalues up to the left. IM=M+1 DO 81 JE = M, 1, -1 IF( W( JE ).GE.WUL .AND. IDISCU.GT.0 ) THEN IDISCU = IDISCU - 1 ELSE IM = IM - 1 W( IM ) = W( JE ) WERR( IM ) = WERR( JE ) INDEXW( IM ) = INDEXW( JE ) IBLOCK( IM ) = IBLOCK( JE ) END IF 81 CONTINUE JEE = 0 DO 82 JE = IM, M JEE = JEE + 1 W( JEE ) = W( JE ) WERR( JEE ) = WERR( JE ) INDEXW( JEE ) = INDEXW( JE ) IBLOCK( JEE ) = IBLOCK( JE ) 82 CONTINUE M = M-IM+1 END IF IF( IDISCL.GT.0 .OR. IDISCU.GT.0 ) THEN * Code to deal with effects of bad arithmetic. (If N(w) is * monotone non-decreasing, this should never happen.) * Some low eigenvalues to be discarded are not in (WL,WLU], * or high eigenvalues to be discarded are not in (WUL,WU] * so just kill off the smallest IDISCL/largest IDISCU * eigenvalues, by marking the corresponding IBLOCK = 0 IF( IDISCL.GT.0 ) THEN WKILL = WU DO 100 JDISC = 1, IDISCL IW = 0 DO 90 JE = 1, M IF( IBLOCK( JE ).NE.0 .AND. $ ( W( JE ).LT.WKILL .OR. IW.EQ.0 ) ) THEN IW = JE WKILL = W( JE ) END IF 90 CONTINUE IBLOCK( IW ) = 0 100 CONTINUE END IF IF( IDISCU.GT.0 ) THEN WKILL = WL DO 120 JDISC = 1, IDISCU IW = 0 DO 110 JE = 1, M IF( IBLOCK( JE ).NE.0 .AND. $ ( W( JE ).GE.WKILL .OR. IW.EQ.0 ) ) THEN IW = JE WKILL = W( JE ) END IF 110 CONTINUE IBLOCK( IW ) = 0 120 CONTINUE END IF * Now erase all eigenvalues with IBLOCK set to zero IM = 0 DO 130 JE = 1, M IF( IBLOCK( JE ).NE.0 ) THEN IM = IM + 1 W( IM ) = W( JE ) WERR( IM ) = WERR( JE ) INDEXW( IM ) = INDEXW( JE ) IBLOCK( IM ) = IBLOCK( JE ) END IF 130 CONTINUE M = IM END IF IF( IDISCL.LT.0 .OR. IDISCU.LT.0 ) THEN TOOFEW = .TRUE. END IF END IF * IF(( IRANGE.EQ.1 .AND. M.NE.N ).OR. $ ( IRANGE.EQ.3 .AND. M.NE.IU-IL+1 ) ) THEN TOOFEW = .TRUE. END IF * If ORDER='B',(IBLOCK = 2), do nothing the eigenvalues are already sorted * by block. * If ORDER='E',(IBLOCK = 1), sort the eigenvalues from smallest to largest IF( IORDER.EQ.1 .AND. NSPLIT.GT.1 ) THEN DO 150 JE = 1, M - 1 IE = 0 TMP1 = W( JE ) DO 140 J = JE + 1, M IF( W( J ).LT.TMP1 ) THEN IE = J TMP1 = W( J ) END IF 140 CONTINUE IF( IE.NE.0 ) THEN TMP2 = WERR( IE ) ITMP1 = IBLOCK( IE ) ITMP2 = INDEXW( IE ) W( IE ) = W( JE ) WERR( IE ) = WERR( JE ) IBLOCK( IE ) = IBLOCK( JE ) INDEXW( IE ) = INDEXW( JE ) W( JE ) = TMP1 WERR( JE ) = TMP2 IBLOCK( JE ) = ITMP1 INDEXW( JE ) = ITMP2 END IF 150 CONTINUE END IF * INFO = 0 IF( NCNVRG ) $ INFO = INFO + 1 IF( TOOFEW ) $ INFO = INFO + 2 RETURN * * End of SLARRD2 * END scalapack-2.0.2/SRC/slarre2.f000644 000766 000024 00000071001 11657111056 016063 0ustar00juliestaff000000 000000 SUBROUTINE SLARRE2( RANGE, N, VL, VU, IL, IU, D, E, E2, $ RTOL1, RTOL2, SPLTOL, NSPLIT, ISPLIT, $ M, DOL, DOU, $ W, WERR, WGAP, IBLOCK, INDEXW, GERS, PIVMIN, $ WORK, IWORK, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * * .. Scalar Arguments .. CHARACTER RANGE INTEGER DOL, DOU, IL, INFO, IU, M, N, NSPLIT REAL PIVMIN, RTOL1, RTOL2, SPLTOL, VL, VU * .. * .. Array Arguments .. INTEGER IBLOCK( * ), ISPLIT( * ), IWORK( * ), $ INDEXW( * ) REAL D( * ), E( * ), E2( * ), GERS( * ), $ W( * ),WERR( * ), WGAP( * ), WORK( * ) * * Purpose * ======= * * To find the desired eigenvalues of a given real symmetric * tridiagonal matrix T, SLARRE2 sets, via SLARRA, * "small" off-diagonal elements to zero. For each block T_i, it finds * (a) a suitable shift at one end of the block's spectrum, * (b) the root RRR, T_i - sigma_i I = L_i D_i L_i^T, and * (c) eigenvalues of each L_i D_i L_i^T. * The representations and eigenvalues found are then returned to * SSTEGR2 to compute the eigenvectors T. * * SLARRE2 is more suitable for parallel computation than the * original LAPACK code for computing the root RRR and its eigenvalues. * When computing eigenvalues in parallel and the input tridiagonal * matrix splits into blocks, SLARRE2 * can skip over blocks which contain none of the eigenvalues from * DOL to DOU for which the processor responsible. In extreme cases (such * as large matrices consisting of many blocks of small size, e.g. 2x2, * the gain can be substantial. * * Arguments * ========= * * RANGE (input) CHARACTER * = 'A': ("All") all eigenvalues will be found. * = 'V': ("Value") all eigenvalues in the half-open interval * (VL, VU] will be found. * = 'I': ("Index") the IL-th through IU-th eigenvalues (of the * entire matrix) will be found. * * N (input) INTEGER * The order of the matrix. N > 0. * * VL (input/output) REAL * VU (input/output) REAL * If RANGE='V', the lower and upper bounds for the eigenvalues. * Eigenvalues less than or equal to VL, or greater than VU, * will not be returned. VL < VU. * If RANGE='I' or ='A', SLARRE2 computes bounds on the desired * part of the spectrum. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N. * * D (input/output) REAL array, dimension (N) * On entry, the N diagonal elements of the tridiagonal * matrix T. * On exit, the N diagonal elements of the diagonal * matrices D_i. * * E (input/output) REAL array, dimension (N) * On entry, the first (N-1) entries contain the subdiagonal * elements of the tridiagonal matrix T; E(N) need not be set. * On exit, E contains the subdiagonal elements of the unit * bidiagonal matrices L_i. The entries E( ISPLIT( I ) ), * 1 <= I <= NSPLIT, contain the base points sigma_i on output. * * E2 (input/output) REAL array, dimension (N) * On entry, the first (N-1) entries contain the SQUARES of the * subdiagonal elements of the tridiagonal matrix T; * E2(N) need not be set. * On exit, the entries E2( ISPLIT( I ) ), * 1 <= I <= NSPLIT, have been set to zero * * RTOL1 (input) REAL * RTOL2 (input) REAL * Parameters for bisection. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) * * SPLTOL (input) REAL * The threshold for splitting. * * NSPLIT (output) INTEGER * The number of blocks T splits into. 1 <= NSPLIT <= N. * * ISPLIT (output) INTEGER array, dimension (N) * The splitting points, at which T breaks up into blocks. * The first block consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N. * * M (output) INTEGER * The total number of eigenvalues (of all L_i D_i L_i^T) * found. * * DOL (input) INTEGER * DOU (input) INTEGER * If the user wants to work on only a selected part of the * representation tree, he can specify an index range DOL:DOU. * Otherwise, the setting DOL=1, DOU=N should be applied. * Note that DOL and DOU refer to the order in which the eigenvalues * are stored in W. * * W (output) REAL array, dimension (N) * The first M elements contain the eigenvalues. The * eigenvalues of each of the blocks, L_i D_i L_i^T, are * sorted in ascending order ( SLARRE2 may use the * remaining N-M elements as workspace). * Note that immediately after exiting this routine, only * the eigenvalues from position DOL:DOU in W might be * reliable on this processor * when the eigenvalue computation is done in parallel. * * WERR (output) REAL array, dimension (N) * The error bound on the corresponding eigenvalue in W. * Note that immediately after exiting this routine, only * the uncertainties from position DOL:DOU in WERR might be * reliable on this processor * when the eigenvalue computation is done in parallel. * * WGAP (output) REAL array, dimension (N) * The separation from the right neighbor eigenvalue in W. * The gap is only with respect to the eigenvalues of the same block * as each block has its own representation tree. * Exception: at the right end of a block we store the left gap * Note that immediately after exiting this routine, only * the gaps from position DOL:DOU in WGAP might be * reliable on this processor * when the eigenvalue computation is done in parallel. * * IBLOCK (output) INTEGER array, dimension (N) * The indices of the blocks (submatrices) associated with the * corresponding eigenvalues in W; IBLOCK(i)=1 if eigenvalue * W(i) belongs to the first block from the top, =2 if W(i) * belongs to the second block, etc. * * INDEXW (output) INTEGER array, dimension (N) * The indices of the eigenvalues within each block (submatrix); * for example, INDEXW(i)= 10 and IBLOCK(i)=2 imply that the * i-th eigenvalue W(i) is the 10-th eigenvalue in block 2 * * GERS (output) REAL array, dimension (2*N) * The N Gerschgorin intervals (the i-th Gerschgorin interval * is (GERS(2*i-1), GERS(2*i)). * * PIVMIN (output) DOUBLE PRECISION * The minimum pivot in the sturm sequence for T. * * WORK (workspace) REAL array, dimension (6*N) * Workspace. * * IWORK (workspace) INTEGER array, dimension (5*N) * Workspace. * * INFO (output) INTEGER * = 0: successful exit * > 0: A problem occured in SLARRE2. * < 0: One of the called subroutines signaled an internal problem. * Needs inspection of the corresponding parameter IINFO * for further information. * * =-1: Problem in SLARRD. * = 2: No base representation could be found in MAXTRY iterations. * Increasing MAXTRY and recompilation might be a remedy. * =-3: Problem in SLARRB when computing the refined root * representation for SLASQ2. * =-4: Problem in SLARRB when preforming bisection on the * desired part of the spectrum. * =-5: Problem in SLASQ2. * =-6: Problem in SLASQ2. * * ===================================================================== * * .. Parameters .. REAL FAC, FOUR, FOURTH, FUDGE, HALF, HNDRD, $ MAXGROWTH, ONE, PERT, TWO, ZERO PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, $ TWO = 2.0E0, FOUR=4.0E0, $ HNDRD = 100.0E0, $ PERT = 4.0E0, $ HALF = ONE/TWO, FOURTH = ONE/FOUR, FAC= HALF, $ MAXGROWTH = 64.0E0, FUDGE = 2.0E0 ) INTEGER MAXTRY PARAMETER ( MAXTRY = 6 ) * .. * .. Local Scalars .. LOGICAL FORCEB, NOREP, RNDPRT, USEDQD INTEGER CNT, CNT1, CNT2, I, IBEGIN, IDUM, IEND, IINFO, $ IN, INDL, INDU, IRANGE, J, JBLK, MB, MM, $ WBEGIN, WEND REAL AVGAP, BSRTOL, CLWDTH, DMAX, DPIVOT, EABS, $ EMAX, EOLD, EPS, GL, GU, ISLEFT, ISRGHT, RTL, $ RTOL, S1, S2, SAFMIN, SGNDEF, SIGMA, SPDIAM, $ TAU, TMP, TMP1 * .. * .. Local Arrays .. INTEGER ISEED( 4 ) * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH EXTERNAL SLAMCH, LSAME * .. * .. External Subroutines .. EXTERNAL SCOPY, SLARNV, SLARRA, SLARRB, SLARRC, $ SLARRD, SLASQ2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * INFO = 0 * Dis-/Enable a small random perturbation of the root representation RNDPRT = .TRUE. * * Decode RANGE * IF( LSAME( RANGE, 'A' ) ) THEN IRANGE = 1 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IRANGE = 2 ELSE IF( LSAME( RANGE, 'I' ) ) THEN IRANGE = 3 END IF M = 0 * Get machine constants SAFMIN = SLAMCH( 'S' ) EPS = SLAMCH( 'P' ) * Set parameters RTL = HNDRD*EPS BSRTOL = 1.0E-1 * Treat case of 1x1 matrix for quick return IF( N.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR. $ ((IRANGE.EQ.2).AND.(D(1).GT.VL).AND.(D(1).LE.VU)).OR. $ ((IRANGE.EQ.3).AND.(IL.EQ.1).AND.(IU.EQ.1)) ) THEN M = 1 W(1) = D(1) * The computation error of the eigenvalue is zero WERR(1) = ZERO WGAP(1) = ZERO IBLOCK( 1 ) = 1 INDEXW( 1 ) = 1 GERS(1) = D( 1 ) GERS(2) = D( 1 ) ENDIF * store the shift for the initial RRR, which is zero in this case E(1) = ZERO RETURN END IF * General case: tridiagonal matrix of order > 1 * * Init WERR, WGAP. Compute Gerschgorin intervals and spectral diameter. * Compute maximum off-diagonal entry and pivmin. GL = D(1) GU = D(1) EOLD = ZERO EMAX = ZERO E(N) = ZERO DO 5 I = 1,N WERR(I) = ZERO WGAP(I) = ZERO EABS = ABS( E(I) ) IF( EABS .GE. EMAX ) THEN EMAX = EABS END IF TMP1 = EABS + EOLD GERS( 2*I-1) = D(I) - TMP1 GL = MIN( GL, GERS( 2*I - 1)) GERS( 2*I ) = D(I) + TMP1 GU = MAX( GU, GERS(2*I) ) EOLD = EABS 5 CONTINUE * The minimum pivot allowed in the sturm sequence for T PIVMIN = SAFMIN * MAX( ONE, EMAX**2 ) * Compute spectral diameter. The Gerschgorin bounds give an * estimate that is wrong by at most a factor of SQRT(2) SPDIAM = GU - GL * Compute splitting points CALL SLARRA( N, D, E, E2, SPLTOL, SPDIAM, $ NSPLIT, ISPLIT, IINFO ) * Can force use of bisection instead of faster DQDS FORCEB = .FALSE. IF( (IRANGE.EQ.1) .AND. (.NOT. FORCEB) ) THEN * Set interval [VL,VU] that contains all eigenvalues VL = GL VU = GU ELSE * We call SLARRD to find crude approximations to the eigenvalues * in the desired range. In case IRANGE = 3, we also obtain the * interval (VL,VU] that contains all the wanted eigenvalues. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.RTOL*MAX(ABS(LEFT),ABS(RIGHT)) * SLARRD needs a WORK of size 4*N, IWORK of size 3*N CALL SLARRD( RANGE, 'B', N, VL, VU, IL, IU, GERS, $ BSRTOL, D, E, E2, PIVMIN, NSPLIT, ISPLIT, $ MM, W, WERR, VL, VU, IBLOCK, INDEXW, $ WORK, IWORK, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF * Make sure that the entries M+1 to N in W, WERR, IBLOCK, INDEXW are 0 DO 14 I = MM+1,N W( I ) = ZERO WERR( I ) = ZERO IBLOCK( I ) = 0 INDEXW( I ) = 0 14 CONTINUE END IF *** * Loop over unreduced blocks IBEGIN = 1 WBEGIN = 1 DO 170 JBLK = 1, NSPLIT IEND = ISPLIT( JBLK ) IN = IEND - IBEGIN + 1 * 1 X 1 block IF( IN.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR.( (IRANGE.EQ.2).AND. $ ( D( IBEGIN ).GT.VL ).AND.( D( IBEGIN ).LE.VU ) ) $ .OR. ( (IRANGE.EQ.3).AND.(IBLOCK(WBEGIN).EQ.JBLK)) $ ) THEN M = M + 1 W( M ) = D( IBEGIN ) WERR(M) = ZERO * The gap for a single block doesn't matter for the later * algorithm and is assigned an arbitrary large value WGAP(M) = ZERO IBLOCK( M ) = JBLK INDEXW( M ) = 1 WBEGIN = WBEGIN + 1 ENDIF * E( IEND ) holds the shift for the initial RRR E( IEND ) = ZERO IBEGIN = IEND + 1 GO TO 170 END IF * * Blocks of size larger than 1x1 * * E( IEND ) will hold the shift for the initial RRR, for now set it =0 E( IEND ) = ZERO * * Find local outer bounds GL,GU for the block GL = D(IBEGIN) GU = D(IBEGIN) DO 15 I = IBEGIN , IEND GL = MIN( GERS( 2*I-1 ), GL ) GU = MAX( GERS( 2*I ), GU ) 15 CONTINUE SPDIAM = GU - GL IF(.NOT. ((IRANGE.EQ.1).AND.(.NOT.FORCEB)) ) THEN * Count the number of eigenvalues in the current block. MB = 0 DO 20 I = WBEGIN,MM IF( IBLOCK(I).EQ.JBLK ) THEN MB = MB+1 ELSE GOTO 21 ENDIF 20 CONTINUE 21 CONTINUE IF( MB.EQ.0) THEN * No eigenvalue in the current block lies in the desired range * E( IEND ) holds the shift for the initial RRR E( IEND ) = ZERO IBEGIN = IEND + 1 GO TO 170 ELSE * Decide whether dqds or bisection is more efficient USEDQD = ( (MB .GT. FAC*IN) .AND. (.NOT.FORCEB) ) WEND = WBEGIN + MB - 1 * Calculate gaps for the current block * In later stages, when representations for individual * eigenvalues are different, we use SIGMA = E( IEND ). SIGMA = ZERO DO 30 I = WBEGIN, WEND - 1 WGAP( I ) = MAX( ZERO, $ W(I+1)-WERR(I+1) - (W(I)+WERR(I)) ) 30 CONTINUE WGAP( WEND ) = MAX( ZERO, $ VU - SIGMA - (W( WEND )+WERR( WEND ))) * Find local index of the first and last desired evalue. INDL = INDEXW(WBEGIN) INDU = INDEXW( WEND ) ENDIF ELSE * MB = number of eigenvalues to compute MB = IN WEND = WBEGIN + MB - 1 INDL = 1 INDU = IN ENDIF IF( (WEND.LT.DOL).OR.(WBEGIN.GT.DOU) ) THEN * if this subblock contains no desired eigenvalues, * skip the computation of this representation tree IBEGIN = IEND + 1 WBEGIN = WEND + 1 M = M + INDU - INDL + 1 GO TO 170 END IF * Find approximations to the extremal eigenvalues of the block CALL SLARRK( IN, 1, GL, GU, D(IBEGIN), $ E2(IBEGIN), PIVMIN, RTL, TMP, TMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF ISLEFT = MAX(GL, TMP - TMP1 $ - HNDRD * EPS* ABS(TMP - TMP1)) CALL SLARRK( IN, IN, GL, GU, D(IBEGIN), $ E2(IBEGIN), PIVMIN, RTL, TMP, TMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF ISRGHT = MIN(GU, TMP + TMP1 $ + HNDRD * EPS * ABS(TMP + TMP1)) IF(( (IRANGE.EQ.1) .AND. (.NOT. FORCEB) ).OR.USEDQD) THEN * Case of DQDS * Improve the estimate of the spectral diameter SPDIAM = ISRGHT - ISLEFT ELSE * Case of bisection * Find approximations to the wanted extremal eigenvalues ISLEFT = MAX(GL, W(WBEGIN) - WERR(WBEGIN) $ - HNDRD * EPS*ABS(W(WBEGIN)- WERR(WBEGIN) )) ISRGHT = MIN(GU,W(WEND) + WERR(WEND) $ + HNDRD * EPS * ABS(W(WEND)+ WERR(WEND))) ENDIF * Decide whether the base representation for the current block * L_JBLK D_JBLK L_JBLK^T = T_JBLK - sigma_JBLK I * should be on the left or the right end of the current block. * The strategy is to shift to the end which is "more populated" * Furthermore, decide whether to use DQDS for the computation of * the eigenvalue approximations at the end of SLARRE2 or bisection. * dqds is chosen if all eigenvalues are desired or the number of * eigenvalues to be computed is large compared to the blocksize. IF( ( IRANGE.EQ.1 ) .AND. (.NOT.FORCEB) ) THEN * If all the eigenvalues have to be computed, we use dqd USEDQD = .TRUE. * INDL is the local index of the first eigenvalue to compute INDL = 1 INDU = IN * MB = number of eigenvalues to compute MB = IN WEND = WBEGIN + MB - 1 * Define 1/4 and 3/4 points of the spectrum S1 = ISLEFT + FOURTH * SPDIAM S2 = ISRGHT - FOURTH * SPDIAM ELSE * SLARRD has computed IBLOCK and INDEXW for each eigenvalue * approximation. * choose sigma IF( USEDQD ) THEN S1 = ISLEFT + FOURTH * SPDIAM S2 = ISRGHT - FOURTH * SPDIAM ELSE TMP = MIN(ISRGHT,VU) - MAX(ISLEFT,VL) S1 = MAX(ISLEFT,VL) + FOURTH * TMP S2 = MIN(ISRGHT,VU) - FOURTH * TMP ENDIF ENDIF * Compute the negcount at the 1/4 and 3/4 points IF(MB.GT.1) THEN CALL SLARRC( 'T', IN, S1, S2, D(IBEGIN), $ E(IBEGIN), PIVMIN, CNT, CNT1, CNT2, IINFO) ENDIF IF(MB.EQ.1) THEN SIGMA = GL SGNDEF = ONE ELSEIF( CNT1 - INDL .GE. INDU - CNT2 ) THEN IF( ( IRANGE.EQ.1 ) .AND. (.NOT.FORCEB) ) THEN SIGMA = MAX(ISLEFT,GL) ELSEIF( USEDQD ) THEN * use Gerschgorin bound as shift to get pos def matrix * for dqds SIGMA = ISLEFT ELSE * use approximation of the first desired eigenvalue of the * block as shift SIGMA = MAX(ISLEFT,VL) ENDIF SGNDEF = ONE ELSE IF( ( IRANGE.EQ.1 ) .AND. (.NOT.FORCEB) ) THEN SIGMA = MIN(ISRGHT,GU) ELSEIF( USEDQD ) THEN * use Gerschgorin bound as shift to get neg def matrix * for dqds SIGMA = ISRGHT ELSE * use approximation of the first desired eigenvalue of the * block as shift SIGMA = MIN(ISRGHT,VU) ENDIF SGNDEF = -ONE ENDIF * An initial SIGMA has been chosen that will be used for computing * T - SIGMA I = L D L^T * Define the increment TAU of the shift in case the initial shift * needs to be refined to obtain a factorization with not too much * element growth. IF( USEDQD ) THEN TAU = SPDIAM*EPS*N + TWO*PIVMIN TAU = MAX(TAU,EPS*ABS(SIGMA)) ELSE IF(MB.GT.1) THEN CLWDTH = W(WEND) + WERR(WEND) - W(WBEGIN) - WERR(WBEGIN) AVGAP = ABS(CLWDTH / REAL(WEND-WBEGIN)) IF( SGNDEF.EQ.ONE ) THEN TAU = HALF*MAX(WGAP(WBEGIN),AVGAP) TAU = MAX(TAU,WERR(WBEGIN)) ELSE TAU = HALF*MAX(WGAP(WEND-1),AVGAP) TAU = MAX(TAU,WERR(WEND)) ENDIF ELSE TAU = WERR(WBEGIN) ENDIF ENDIF * DO 80 IDUM = 1, MAXTRY * Compute L D L^T factorization of tridiagonal matrix T - sigma I. * Store D in WORK(1:IN), L in WORK(IN+1:2*IN), and reciprocals of * pivots in WORK(2*IN+1:3*IN) DPIVOT = D( IBEGIN ) - SIGMA WORK( 1 ) = DPIVOT DMAX = ABS( WORK(1) ) J = IBEGIN DO 70 I = 1, IN - 1 WORK( 2*IN+I ) = ONE / WORK( I ) TMP = E( J )*WORK( 2*IN+I ) WORK( IN+I ) = TMP DPIVOT = ( D( J+1 )-SIGMA ) - TMP*E( J ) WORK( I+1 ) = DPIVOT DMAX = MAX( DMAX, ABS(DPIVOT) ) J = J + 1 70 CONTINUE * check for element growth IF( DMAX .GT. MAXGROWTH*SPDIAM ) THEN NOREP = .TRUE. ELSE NOREP = .FALSE. ENDIF IF(NOREP) THEN * Note that in the case of IRANGE=1, we use the Gerschgorin * shift which makes the matrix definite. So we should end up * here really only in the case of IRANGE = 2,3 IF( IDUM.EQ.MAXTRY-1 ) THEN IF( SGNDEF.EQ.ONE ) THEN * The fudged Gerschgorin shift should succeed SIGMA = $ GL - FUDGE*SPDIAM*EPS*N - FUDGE*TWO*PIVMIN ELSE SIGMA = $ GU + FUDGE*SPDIAM*EPS*N + FUDGE*TWO*PIVMIN END IF ELSE SIGMA = SIGMA - SGNDEF * TAU TAU = TWO * TAU END IF ELSE * an initial RRR is found GO TO 83 END IF 80 CONTINUE * if the program reaches this point, no base representation could be * found in MAXTRY iterations. INFO = 2 RETURN 83 CONTINUE * At this point, we have found an initial base representation * T - SIGMA I = L D L^T with not too much element growth. * Store the shift. E( IEND ) = SIGMA * Store D and L. CALL SCOPY( IN, WORK, 1, D( IBEGIN ), 1 ) CALL SCOPY( IN-1, WORK( IN+1 ), 1, E( IBEGIN ), 1 ) IF(RNDPRT .AND. MB.GT.1 ) THEN * * Perturb each entry of the base representation by a small * (but random) relative amount to overcome difficulties with * glued matrices. * DO 122 I = 1, 4 ISEED( I ) = 1 122 CONTINUE CALL SLARNV(2, ISEED, 2*IN-1, WORK(1)) DO 125 I = 1,IN-1 D(IBEGIN+I-1) = D(IBEGIN+I-1)*(ONE+EPS*PERT*WORK(I)) E(IBEGIN+I-1) = E(IBEGIN+I-1)*(ONE+EPS*PERT*WORK(IN+I)) 125 CONTINUE D(IEND) = D(IEND)*(ONE+EPS*FOUR*WORK(IN)) * ENDIF * * Don't update the Gerschgorin intervals because keeping track * of the updates would be too much work in SLARRV. * We update W instead and use it to locate the proper Gerschgorin * intervals. * Compute the required eigenvalues of L D L' by bisection or dqds IF ( .NOT.USEDQD ) THEN * If SLARRD has been used, shift the eigenvalue approximations * according to their representation. This is necessary for * a uniform SLARRV since dqds computes eigenvalues of the * shifted representation. In SLARRV, W will always hold the * UNshifted eigenvalue approximation. DO 134 J=WBEGIN,WEND W(J) = W(J) - SIGMA WERR(J) = WERR(J) + ABS(W(J)) * EPS 134 CONTINUE * call SLARRB to reduce eigenvalue error of the approximations * from SLARRD DO 135 I = IBEGIN, IEND-1 WORK( I ) = D( I ) * E( I )**2 135 CONTINUE * use bisection to find EV from INDL to INDU CALL SLARRB(IN, D(IBEGIN), WORK(IBEGIN), $ INDL, INDU, RTOL1, RTOL2, INDL-1, $ W(WBEGIN), WGAP(WBEGIN), WERR(WBEGIN), $ WORK( 2*N+1 ), IWORK, PIVMIN, SPDIAM, $ IN, IINFO ) IF( IINFO .NE. 0 ) THEN INFO = -4 RETURN END IF * SLARRB computes all gaps correctly except for the last one * Record distance to VU/GU WGAP( WEND ) = MAX( ZERO, $ ( VU-SIGMA ) - ( W( WEND ) + WERR( WEND ) ) ) DO 138 I = INDL, INDU M = M + 1 IBLOCK(M) = JBLK INDEXW(M) = I 138 CONTINUE ELSE * Call dqds to get all eigs (and then possibly delete unwanted * eigenvalues). * Note that dqds finds the eigenvalues of the L D L^T representation * of T to high relative accuracy. High relative accuracy * might be lost when the shift of the RRR is subtracted to obtain * the eigenvalues of T. However, T is not guaranteed to define its * eigenvalues to high relative accuracy anyway. * Set RTOL to the order of the tolerance used in SLASQ2 * This is an ESTIMATED error, the worst case bound is 4*N*EPS * which is usually too large and requires unnecessary work to be * done by bisection when computing the eigenvectors RTOL = LOG(REAL(IN)) * FOUR * EPS J = IBEGIN DO 140 I = 1, IN - 1 WORK( 2*I-1 ) = ABS( D( J ) ) WORK( 2*I ) = E( J )*E( J )*WORK( 2*I-1 ) J = J + 1 140 CONTINUE WORK( 2*IN-1 ) = ABS( D( IEND ) ) WORK( 2*IN ) = ZERO CALL SLASQ2( IN, WORK, IINFO ) IF( IINFO .NE. 0 ) THEN * If IINFO = -5 then an index is part of a tight cluster * and should be changed. The index is in IWORK(1) and the * gap is in WORK(N+1) INFO = -5 RETURN ELSE * Test that all eigenvalues are positive as expected DO 149 I = 1, IN IF( WORK( I ).LT.ZERO ) THEN INFO = -6 RETURN ENDIF 149 CONTINUE END IF IF( SGNDEF.GT.ZERO ) THEN DO 150 I = INDL, INDU M = M + 1 W( M ) = WORK( IN-I+1 ) IBLOCK( M ) = JBLK INDEXW( M ) = I 150 CONTINUE ELSE DO 160 I = INDL, INDU M = M + 1 W( M ) = -WORK( I ) IBLOCK( M ) = JBLK INDEXW( M ) = I 160 CONTINUE END IF DO 165 I = M - MB + 1, M * the value of RTOL below should be the tolerance in SLASQ2 WERR( I ) = RTOL * ABS( W(I) ) 165 CONTINUE DO 166 I = M - MB + 1, M - 1 * compute the right gap between the intervals WGAP( I ) = MAX( ZERO, $ W(I+1)-WERR(I+1) - (W(I)+WERR(I)) ) 166 CONTINUE WGAP( M ) = MAX( ZERO, $ ( VU-SIGMA ) - ( W( M ) + WERR( M ) ) ) END IF * proceed with next block IBEGIN = IEND + 1 WBEGIN = WEND + 1 170 CONTINUE * RETURN * * end of SLARRE2 * END scalapack-2.0.2/SRC/slarre2a.f000644 000766 000024 00000066774 11657111056 016251 0ustar00juliestaff000000 000000 SUBROUTINE SLARRE2A( RANGE, N, VL, VU, IL, IU, D, E, E2, $ RTOL1, RTOL2, SPLTOL, NSPLIT, ISPLIT, $ M, DOL, DOU, NEEDIL, NEEDIU, $ W, WERR, WGAP, IBLOCK, INDEXW, GERS, $ SDIAM, PIVMIN, WORK, IWORK, MINRGP, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER RANGE INTEGER DOL, DOU, IL, INFO, IU, M, N, NSPLIT, $ NEEDIL, NEEDIU REAL MINRGP, PIVMIN, RTOL1, RTOL2, SPLTOL, VL, VU * .. * .. Array Arguments .. INTEGER IBLOCK( * ), ISPLIT( * ), IWORK( * ), $ INDEXW( * ) REAL D( * ), E( * ), E2( * ), GERS( * ), $ SDIAM( * ), W( * ),WERR( * ), $ WGAP( * ), WORK( * ) * * Purpose * ======= * * To find the desired eigenvalues of a given real symmetric * tridiagonal matrix T, SLARRE2 sets any "small" off-diagonal * elements to zero, and for each unreduced block T_i, it finds * (a) a suitable shift at one end of the block's spectrum, * (b) the base representation, T_i - sigma_i I = L_i D_i L_i^T, and * (c) eigenvalues of each L_i D_i L_i^T. * * NOTE: * The algorithm obtains a crude picture of all the wanted eigenvalues * (as selected by RANGE). However, to reduce work and improve scalability, * only the eigenvalues DOL to DOU are refined. Furthermore, if the matrix * splits into blocks, RRRs for blocks that do not contain eigenvalues * from DOL to DOU are skipped. * The DQDS algorithm (subroutine SLASQ2) is not used, unlike in * the sequential case. Instead, eigenvalues are computed in parallel to some * figures using bisection. * * Arguments * ========= * * RANGE (input) CHARACTER * = 'A': ("All") all eigenvalues will be found. * = 'V': ("Value") all eigenvalues in the half-open interval * (VL, VU] will be found. * = 'I': ("Index") the IL-th through IU-th eigenvalues (of the * entire matrix) will be found. * * N (input) INTEGER * The order of the matrix. N > 0. * * VL (input/output) REAL * VU (input/output) REAL * If RANGE='V', the lower and upper bounds for the eigenvalues. * Eigenvalues less than or equal to VL, or greater than VU, * will not be returned. VL < VU. * If RANGE='I' or ='A', SLARRE2A computes bounds on the desired * part of the spectrum. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N. * * D (input/output) REAL array, dimension (N) * On entry, the N diagonal elements of the tridiagonal * matrix T. * On exit, the N diagonal elements of the diagonal * matrices D_i. * * E (input/output) REAL array, dimension (N) * On entry, the first (N-1) entries contain the subdiagonal * elements of the tridiagonal matrix T; E(N) need not be set. * On exit, E contains the subdiagonal elements of the unit * bidiagonal matrices L_i. The entries E( ISPLIT( I ) ), * 1 <= I <= NSPLIT, contain the base points sigma_i on output. * * E2 (input/output) REAL array, dimension (N) * On entry, the first (N-1) entries contain the SQUARES of the * subdiagonal elements of the tridiagonal matrix T; * E2(N) need not be set. * On exit, the entries E2( ISPLIT( I ) ), * 1 <= I <= NSPLIT, have been set to zero * * RTOL1 (input) REAL * RTOL2 (input) REAL * Parameters for bisection. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) * * SPLTOL (input) REAL * The threshold for splitting. * * NSPLIT (output) INTEGER * The number of blocks T splits into. 1 <= NSPLIT <= N. * * ISPLIT (output) INTEGER array, dimension (N) * The splitting points, at which T breaks up into blocks. * The first block consists of rows/columns 1 to ISPLIT(1), * the second of rows/columns ISPLIT(1)+1 through ISPLIT(2), * etc., and the NSPLIT-th consists of rows/columns * ISPLIT(NSPLIT-1)+1 through ISPLIT(NSPLIT)=N. * * M (output) INTEGER * The total number of eigenvalues (of all L_i D_i L_i^T) * found. * * DOL (input) INTEGER * DOU (input) INTEGER * If the user wants to work on only a selected part of the * representation tree, he can specify an index range DOL:DOU. * Otherwise, the setting DOL=1, DOU=N should be applied. * Note that DOL and DOU refer to the order in which the eigenvalues * are stored in W. * * NEEDIL (output) INTEGER * NEEDIU (output) INTEGER * The indices of the leftmost and rightmost eigenvalues * of the root node RRR which are * needed to accurately compute the relevant part of the * representation tree. * * W (output) REAL array, dimension (N) * The first M elements contain the eigenvalues. The * eigenvalues of each of the blocks, L_i D_i L_i^T, are * sorted in ascending order ( SLARRE2A may use the * remaining N-M elements as workspace). * Note that immediately after exiting this routine, only * the eigenvalues from position DOL:DOU in W are * reliable on this processor * because the eigenvalue computation is done in parallel. * * WERR (output) REAL array, dimension (N) * The error bound on the corresponding eigenvalue in W. * Note that immediately after exiting this routine, only * the uncertainties from position DOL:DOU in WERR are * reliable on this processor * because the eigenvalue computation is done in parallel. * * WGAP (output) REAL array, dimension (N) * The separation from the right neighbor eigenvalue in W. * The gap is only with respect to the eigenvalues of the same block * as each block has its own representation tree. * Exception: at the right end of a block we store the left gap * Note that immediately after exiting this routine, only * the gaps from position DOL:DOU in WGAP are * reliable on this processor * because the eigenvalue computation is done in parallel. * * IBLOCK (output) INTEGER array, dimension (N) * The indices of the blocks (submatrices) associated with the * corresponding eigenvalues in W; IBLOCK(i)=1 if eigenvalue * W(i) belongs to the first block from the top, =2 if W(i) * belongs to the second block, etc. * * INDEXW (output) INTEGER array, dimension (N) * The indices of the eigenvalues within each block (submatrix); * for example, INDEXW(i)= 10 and IBLOCK(i)=2 imply that the * i-th eigenvalue W(i) is the 10-th eigenvalue in block 2 * * GERS (output) REAL array, dimension (2*N) * The N Gerschgorin intervals (the i-th Gerschgorin interval * is (GERS(2*i-1), GERS(2*i)). * * PIVMIN (output) DOUBLE PRECISION * The minimum pivot in the sturm sequence for T. * * WORK (workspace) REAL array, dimension (6*N) * Workspace. * * IWORK (workspace) INTEGER array, dimension (5*N) * Workspace. * * MINRGP (input) REAL * The minimum relativ gap threshold to decide whether an eigenvalue * or a cluster boundary is reached. * * INFO (output) INTEGER * = 0: successful exit * > 0: A problem occured in SLARRE2A. * < 0: One of the called subroutines signaled an internal problem. * Needs inspection of the corresponding parameter IINFO * for further information. * * =-1: Problem in SLARRD2. * = 2: No base representation could be found in MAXTRY iterations. * Increasing MAXTRY and recompilation might be a remedy. * =-3: Problem in SLARRB2 when computing the refined root * representation * =-4: Problem in SLARRB2 when preforming bisection on the * desired part of the spectrum. * = -9 Problem: M < DOU-DOL+1, that is the code found fewer * eigenvalues than it was supposed to * * * ===================================================================== * * .. Parameters .. REAL FAC, FOUR, FOURTH, FUDGE, HALF, HNDRD, $ MAXGROWTH, ONE, PERT, TWO, ZERO PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, $ TWO = 2.0E0, FOUR=4.0E0, $ HNDRD = 100.0E0, $ PERT = 4.0E0, $ HALF = ONE/TWO, FOURTH = ONE/FOUR, FAC= HALF, $ MAXGROWTH = 64.0E0, FUDGE = 2.0E0 ) INTEGER MAXTRY PARAMETER ( MAXTRY = 6 ) * .. * .. Local Scalars .. LOGICAL NOREP, RNDPRT, USEDQD INTEGER CNT, CNT1, CNT2, I, IBEGIN, IDUM, IEND, IINFO, $ IN, INDL, INDU, IRANGE, J, JBLK, MB, MM, $ MYINDL, MYINDU, MYWBEG, MYWEND, WBEGIN, WEND REAL AVGAP, BSRTOL, CLWDTH, DMAX, DPIVOT, EABS, $ EMAX, EOLD, EPS, GL, GU, ISLEFT, ISRGHT, $ LGPVMN, LGSPDM, RTL, S1, S2, SAFMIN, SGNDEF, $ SIGMA, SPDIAM, TAU, TMP, TMP1, TMP2 * .. * .. Local Arrays .. INTEGER ISEED( 4 ) * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH EXTERNAL SLAMCH, LSAME * .. * .. External Subroutines .. EXTERNAL SCOPY, SLARNV, SLARRA, SLARRB2, $ SLARRC, SLARRD2 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * INFO = 0 * Dis-/Enable a small random perturbation of the root representation RNDPRT = .TRUE. * * Decode RANGE * IF( LSAME( RANGE, 'A' ) ) THEN IRANGE = 1 ELSE IF( LSAME( RANGE, 'V' ) ) THEN IRANGE = 2 ELSE IF( LSAME( RANGE, 'I' ) ) THEN IRANGE = 3 END IF M = 0 * Get machine constants SAFMIN = SLAMCH( 'S' ) EPS = SLAMCH( 'P' ) * Set parameters RTL = HNDRD*EPS BSRTOL = 1.0E-1 * Treat case of 1x1 matrix for quick return IF( N.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR. $ ((IRANGE.EQ.2).AND.(D(1).GT.VL).AND.(D(1).LE.VU)).OR. $ ((IRANGE.EQ.3).AND.(IL.EQ.1).AND.(IU.EQ.1)) ) THEN M = 1 W(1) = D(1) * The computation error of the eigenvalue is zero WERR(1) = ZERO WGAP(1) = ZERO IBLOCK( 1 ) = 1 INDEXW( 1 ) = 1 GERS(1) = D( 1 ) GERS(2) = D( 1 ) ENDIF * store the shift for the initial RRR, which is zero in this case E(1) = ZERO RETURN END IF * General case: tridiagonal matrix of order > 1 * Init WERR, WGAP. DO 1 I =1,N WERR(I) = ZERO 1 CONTINUE DO 2 I =1,N WGAP(I) = ZERO 2 CONTINUE * Compute Gerschgorin intervals and spectral diameter. * Compute maximum off-diagonal entry and pivmin. GL = D(1) GU = D(1) EOLD = ZERO EMAX = ZERO E(N) = ZERO DO 5 I = 1,N EABS = ABS( E(I) ) IF( EABS .GE. EMAX ) THEN EMAX = EABS END IF TMP = EABS + EOLD EOLD = EABS TMP1 = D(I) - TMP TMP2 = D(I) + TMP GL = MIN( GL, TMP1 ) GU = MAX( GU, TMP2 ) GERS( 2*I-1) = TMP1 GERS( 2*I ) = TMP2 5 CONTINUE * The minimum pivot allowed in the sturm sequence for T PIVMIN = SAFMIN * MAX( ONE, EMAX**2 ) * Compute spectral diameter. The Gerschgorin bounds give an * estimate that is wrong by at most a factor of SQRT(2) SPDIAM = GU - GL * Compute splitting points CALL SLARRA( N, D, E, E2, SPLTOL, SPDIAM, $ NSPLIT, ISPLIT, IINFO ) IF( IRANGE.EQ.1 ) THEN * Set interval [VL,VU] that contains all eigenvalues VL = GL VU = GU ENDIF * We call SLARRD2 to find crude approximations to the eigenvalues * in the desired range. In case IRANGE = 3, we also obtain the * interval (VL,VU] that contains all the wanted eigenvalues. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.RTOL*MAX(ABS(LEFT),ABS(RIGHT)) * SLARRD2 needs a WORK of size 4*N, IWORK of size 3*N CALL SLARRD2( RANGE, 'B', N, VL, VU, IL, IU, GERS, $ BSRTOL, D, E, E2, PIVMIN, NSPLIT, ISPLIT, $ MM, W, WERR, VL, VU, IBLOCK, INDEXW, $ WORK, IWORK, DOL, DOU, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF * Make sure that the entries M+1 to N in W, WERR, IBLOCK, INDEXW are 0 DO 14 I = MM+1,N W( I ) = ZERO WERR( I ) = ZERO IBLOCK( I ) = 0 INDEXW( I ) = 0 14 CONTINUE *** * Loop over unreduced blocks IBEGIN = 1 WBEGIN = 1 DO 170 JBLK = 1, NSPLIT IEND = ISPLIT( JBLK ) IN = IEND - IBEGIN + 1 * 1 X 1 block IF( IN.EQ.1 ) THEN IF( (IRANGE.EQ.1).OR.( (IRANGE.EQ.2).AND. $ ( D( IBEGIN ).GT.VL ).AND.( D( IBEGIN ).LE.VU ) ) $ .OR. ( (IRANGE.EQ.3).AND.(IBLOCK(WBEGIN).EQ.JBLK)) $ ) THEN M = M + 1 W( M ) = D( IBEGIN ) WERR(M) = ZERO * The gap for a single block doesn't matter for the later * algorithm and is assigned an arbitrary large value WGAP(M) = ZERO IBLOCK( M ) = JBLK INDEXW( M ) = 1 WBEGIN = WBEGIN + 1 ENDIF * E( IEND ) holds the shift for the initial RRR E( IEND ) = ZERO IBEGIN = IEND + 1 GO TO 170 END IF * * Blocks of size larger than 1x1 * * E( IEND ) will hold the shift for the initial RRR, for now set it =0 E( IEND ) = ZERO IF( ( IRANGE.EQ.1 ) .OR. $ ((IRANGE.EQ.3).AND.(IL.EQ.1.AND.IU.EQ.N)) ) THEN * MB = number of eigenvalues to compute MB = IN WEND = WBEGIN + MB - 1 INDL = 1 INDU = IN ELSE * Count the number of eigenvalues in the current block. MB = 0 DO 20 I = WBEGIN,MM IF( IBLOCK(I).EQ.JBLK ) THEN MB = MB+1 ELSE GOTO 21 ENDIF 20 CONTINUE 21 CONTINUE IF( MB.EQ.0) THEN * No eigenvalue in the current block lies in the desired range * E( IEND ) holds the shift for the initial RRR E( IEND ) = ZERO IBEGIN = IEND + 1 GO TO 170 ENDIF * WEND = WBEGIN + MB - 1 * Find local index of the first and last desired evalue. INDL = INDEXW(WBEGIN) INDU = INDEXW( WEND ) ENDIF * IF( (WEND.LT.DOL).OR.(WBEGIN.GT.DOU) ) THEN * if this subblock contains no desired eigenvalues, * skip the computation of this representation tree IBEGIN = IEND + 1 WBEGIN = WEND + 1 M = M + MB GO TO 170 END IF * IF(.NOT. ( IRANGE.EQ.1 ) ) THEN * At this point, the sequential code decides * whether dqds or bisection is more efficient. * Note: in the parallel code, we do not use dqds. * However, we do not change the shift strategy * if USEDQD is TRUE, then the same shift is used as for * the sequential code when it uses dqds. * USEDQD = ( MB .GT. FAC*IN ) * * Calculate gaps for the current block * In later stages, when representations for individual * eigenvalues are different, we use SIGMA = E( IEND ). SIGMA = ZERO DO 30 I = WBEGIN, WEND - 1 WGAP( I ) = MAX( ZERO, $ W(I+1)-WERR(I+1) - (W(I)+WERR(I)) ) 30 CONTINUE WGAP( WEND ) = MAX( ZERO, $ VU - SIGMA - (W( WEND )+WERR( WEND ))) ENDIF * * Find local outer bounds GL,GU for the block GL = D(IBEGIN) GU = D(IBEGIN) DO 15 I = IBEGIN , IEND GL = MIN( GERS( 2*I-1 ), GL ) GU = MAX( GERS( 2*I ), GU ) 15 CONTINUE SPDIAM = GU - GL * Save local spectral diameter for later use SDIAM(JBLK) = SPDIAM * Find approximations to the extremal eigenvalues of the block CALL SLARRK( IN, 1, GL, GU, D(IBEGIN), $ E2(IBEGIN), PIVMIN, RTL, TMP, TMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF ISLEFT = MAX(GL, TMP - TMP1 $ - HNDRD * EPS* ABS(TMP - TMP1)) CALL SLARRK( IN, IN, GL, GU, D(IBEGIN), $ E2(IBEGIN), PIVMIN, RTL, TMP, TMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF ISRGHT = MIN(GU, TMP + TMP1 $ + HNDRD * EPS * ABS(TMP + TMP1)) IF( ( IRANGE.EQ.1 ).OR.USEDQD ) THEN * Case of DQDS shift * Improve the estimate of the spectral diameter SPDIAM = ISRGHT - ISLEFT ELSE * Case of bisection * Find approximations to the wanted extremal eigenvalues ISLEFT = MAX(GL, W(WBEGIN) - WERR(WBEGIN) $ - HNDRD * EPS*ABS(W(WBEGIN)- WERR(WBEGIN) )) ISRGHT = MIN(GU,W(WEND) + WERR(WEND) $ + HNDRD * EPS * ABS(W(WEND)+ WERR(WEND))) ENDIF * Decide whether the base representation for the current block * L_JBLK D_JBLK L_JBLK^T = T_JBLK - sigma_JBLK I * should be on the left or the right end of the current block. * The strategy is to shift to the end which is "more populated" IF( IRANGE.EQ.1 ) THEN * If all the eigenvalues have to be computed, we use dqd USEDQD = .TRUE. * INDL is the local index of the first eigenvalue to compute INDL = 1 INDU = IN * MB = number of eigenvalues to compute MB = IN WEND = WBEGIN + MB - 1 * Define 1/4 and 3/4 points of the spectrum S1 = ISLEFT + FOURTH * SPDIAM S2 = ISRGHT - FOURTH * SPDIAM ELSE * SLARRD2 has computed IBLOCK and INDEXW for each eigenvalue * approximation. * choose sigma IF( USEDQD ) THEN S1 = ISLEFT + FOURTH * SPDIAM S2 = ISRGHT - FOURTH * SPDIAM ELSE TMP = MIN(ISRGHT,VU) - MAX(ISLEFT,VL) S1 = MAX(ISLEFT,VL) + FOURTH * TMP S2 = MIN(ISRGHT,VU) - FOURTH * TMP ENDIF ENDIF * Compute the negcount at the 1/4 and 3/4 points IF(MB.GT.2) THEN CALL SLARRC( 'T', IN, S1, S2, D(IBEGIN), $ E(IBEGIN), PIVMIN, CNT, CNT1, CNT2, IINFO) ENDIF IF(MB.LE.2) THEN SIGMA = GL SGNDEF = ONE ELSEIF( CNT1 - INDL .GE. INDU - CNT2 ) THEN IF( IRANGE.EQ.1 ) THEN SIGMA = MAX(ISLEFT,GL) ELSEIF( USEDQD ) THEN * use Gerschgorin bound as shift to get pos def matrix SIGMA = ISLEFT ELSE * use approximation of the first desired eigenvalue of the * block as shift SIGMA = MAX(ISLEFT,VL) ENDIF SGNDEF = ONE ELSE IF( IRANGE.EQ.1 ) THEN SIGMA = MIN(ISRGHT,GU) ELSEIF( USEDQD ) THEN * use Gerschgorin bound as shift to get neg def matrix * for dqds SIGMA = ISRGHT ELSE * use approximation of the first desired eigenvalue of the * block as shift SIGMA = MIN(ISRGHT,VU) ENDIF SGNDEF = -ONE ENDIF * An initial SIGMA has been chosen that will be used for computing * T - SIGMA I = L D L^T * Define the increment TAU of the shift in case the initial shift * needs to be refined to obtain a factorization with not too much * element growth. IF( USEDQD ) THEN TAU = SPDIAM*EPS*N + TWO*PIVMIN TAU = MAX(TAU,EPS*ABS(SIGMA)) ELSE IF(MB.GT.1) THEN CLWDTH = W(WEND) + WERR(WEND) - W(WBEGIN) - WERR(WBEGIN) AVGAP = ABS(CLWDTH / REAL(WEND-WBEGIN)) IF( SGNDEF.EQ.ONE ) THEN TAU = HALF*MAX(WGAP(WBEGIN),AVGAP) TAU = MAX(TAU,WERR(WBEGIN)) ELSE TAU = HALF*MAX(WGAP(WEND-1),AVGAP) TAU = MAX(TAU,WERR(WEND)) ENDIF ELSE TAU = WERR(WBEGIN) ENDIF ENDIF * DO 80 IDUM = 1, MAXTRY * Compute L D L^T factorization of tridiagonal matrix T - sigma I. * Store D in WORK(1:IN), L in WORK(IN+1:2*IN), and reciprocals of * pivots in WORK(2*IN+1:3*IN) DPIVOT = D( IBEGIN ) - SIGMA WORK( 1 ) = DPIVOT DMAX = ABS( WORK(1) ) J = IBEGIN DO 70 I = 1, IN - 1 WORK( 2*IN+I ) = ONE / WORK( I ) TMP = E( J )*WORK( 2*IN+I ) WORK( IN+I ) = TMP DPIVOT = ( D( J+1 )-SIGMA ) - TMP*E( J ) WORK( I+1 ) = DPIVOT DMAX = MAX( DMAX, ABS(DPIVOT) ) J = J + 1 70 CONTINUE * check for element growth IF( DMAX .GT. MAXGROWTH*SPDIAM ) THEN NOREP = .TRUE. ELSE NOREP = .FALSE. ENDIF IF(NOREP) THEN * Note that in the case of IRANGE=1, we use the Gerschgorin * shift which makes the matrix definite. So we should end up * here really only in the case of IRANGE = 2,3 IF( IDUM.EQ.MAXTRY-1 ) THEN IF( SGNDEF.EQ.ONE ) THEN * The fudged Gerschgorin shift should succeed SIGMA = $ GL - FUDGE*SPDIAM*EPS*N - FUDGE*TWO*PIVMIN ELSE SIGMA = $ GU + FUDGE*SPDIAM*EPS*N + FUDGE*TWO*PIVMIN END IF ELSE SIGMA = SIGMA - SGNDEF * TAU TAU = TWO * TAU END IF ELSE * an initial RRR is found GO TO 83 END IF 80 CONTINUE * if the program reaches this point, no base representation could be * found in MAXTRY iterations. INFO = 2 RETURN 83 CONTINUE * At this point, we have found an initial base representation * T - SIGMA I = L D L^T with not too much element growth. * Store the shift. E( IEND ) = SIGMA * Store D and L. CALL SCOPY( IN, WORK, 1, D( IBEGIN ), 1 ) CALL SCOPY( IN-1, WORK( IN+1 ), 1, E( IBEGIN ), 1 ) IF(RNDPRT .AND. MB.GT.1 ) THEN * * Perturb each entry of the base representation by a small * (but random) relative amount to overcome difficulties with * glued matrices. * DO 122 I = 1, 4 ISEED( I ) = 1 122 CONTINUE CALL SLARNV(2, ISEED, 2*IN-1, WORK(1)) DO 125 I = 1,IN-1 D(IBEGIN+I-1) = D(IBEGIN+I-1)*(ONE+EPS*PERT*WORK(2*I-1)) E(IBEGIN+I-1) = E(IBEGIN+I-1)*(ONE+EPS*PERT*WORK(2*I)) 125 CONTINUE D(IEND) = D(IEND)*(ONE+EPS*PERT*WORK(2*IN-1)) * ENDIF * * Compute the required eigenvalues of L D L' by bisection * Shift the eigenvalue approximations * according to the shift of their representation. DO 134 J=WBEGIN,WEND W(J) = W(J) - SIGMA WERR(J) = WERR(J) + ABS(W(J)) * EPS 134 CONTINUE * call SLARRB2 to reduce eigenvalue error of the approximations * from SLARRD2 DO 135 I = IBEGIN, IEND-1 WORK( I ) = D( I ) * E( I )**2 135 CONTINUE * use bisection to find EV from INDL to INDU INDL = INDEXW( WBEGIN ) INDU = INDEXW( WEND ) * * Indicate that the current block contains eigenvalues that * are potentially needed later. * NEEDIL = MIN(NEEDIL,WBEGIN) NEEDIU = MAX(NEEDIU,WEND) * * For the parallel distributed case, only compute * those eigenvalues that have to be computed as indicated by DOL, DOU * MYWBEG = MAX(WBEGIN,DOL) MYWEND = MIN(WEND,DOU) * IF(MYWBEG.GT.WBEGIN) THEN * This is the leftmost block containing wanted eigenvalues * as well as unwanted ones. To save on communication, * check if NEEDIL can be increased even further: * on the left end, only the eigenvalues of the cluster * including MYWBEG are needed DO 136 I = WBEGIN, MYWBEG-1 IF ( WGAP(I).GE.MINRGP*ABS(W(I)) ) THEN NEEDIL = MAX(I+1,NEEDIL) ENDIF 136 CONTINUE ENDIF IF(MYWEND.LT.WEND) THEN * This is the rightmost block containing wanted eigenvalues * as well as unwanted ones. To save on communication, * Check if NEEDIU can be decreased even further. DO 137 I = MYWEND,WEND-1 IF ( WGAP(I).GE.MINRGP*ABS(W(I)) ) THEN NEEDIU = MIN(I,NEEDIU) GOTO 138 ENDIF 137 CONTINUE 138 CONTINUE ENDIF * * Only compute eigenvalues from MYINDL to MYINDU * instead of INDL to INDU * MYINDL = INDEXW( MYWBEG ) MYINDU = INDEXW( MYWEND ) * LGPVMN = LOG( PIVMIN ) LGSPDM = LOG( SPDIAM + PIVMIN ) CALL SLARRB2(IN, D(IBEGIN), WORK(IBEGIN), $ MYINDL, MYINDU, RTOL1, RTOL2, MYINDL-1, $ W(MYWBEG), WGAP(MYWBEG), WERR(MYWBEG), $ WORK( 2*N+1 ), IWORK, PIVMIN, $ LGPVMN, LGSPDM, IN, IINFO ) IF( IINFO .NE. 0 ) THEN INFO = -4 RETURN END IF * SLARRB2 computes all gaps correctly except for the last one * Record distance to VU/GU WGAP( WEND ) = MAX( ZERO, $ ( VU-SIGMA ) - ( W( WEND ) + WERR( WEND ) ) ) DO 140 I = INDL, INDU M = M + 1 IBLOCK(M) = JBLK INDEXW(M) = I 140 CONTINUE * * proceed with next block IBEGIN = IEND + 1 WBEGIN = WEND + 1 170 CONTINUE * IF (M.LT.DOU-DOL+1) THEN INFO = -9 ENDIF RETURN * * end of SLARRE2A * END scalapack-2.0.2/SRC/slarrf2.f000644 000766 000024 00000026214 11657111056 016072 0ustar00juliestaff000000 000000 SUBROUTINE SLARRF2( N, D, L, LD, CLSTRT, CLEND, $ CLMID1, CLMID2, W, WGAP, WERR, TRYMID, $ SPDIAM, CLGAPL, CLGAPR, PIVMIN, SIGMA, $ DPLUS, LPLUS, WORK, INFO ) * * -- ScaLAPACK computational routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER CLSTRT, CLEND, CLMID1, CLMID2, INFO, N REAL CLGAPL, CLGAPR, PIVMIN, SIGMA, SPDIAM LOGICAL TRYMID * .. * .. Array Arguments .. REAL D( * ), DPLUS( * ), L( * ), LD( * ), $ LPLUS( * ), W( * ), WGAP( * ), WERR( * ), WORK( * ) * .. * * Purpose * ======= * * Given the initial representation L D L^T and its cluster of close * eigenvalues (in a relative measure), W( CLSTRT ), W( CLSTRT+1 ), ... * W( CLEND ), SLARRF2 finds a new relatively robust representation * L D L^T - SIGMA I = L(+) D(+) L(+)^T such that at least one of the * eigenvalues of L(+) D(+) L(+)^T is relatively isolated. * * This is an enhanced version of SLARRF that also tries shifts in * the middle of the cluster, should there be a large gap, in order to * break large clusters into at least two pieces. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix (subblock, if the matrix splitted). * * D (input) REAL array, dimension (N) * The N diagonal elements of the diagonal matrix D. * * L (input) REAL array, dimension (N-1) * The (N-1) subdiagonal elements of the unit bidiagonal * matrix L. * * LD (input) REAL array, dimension (N-1) * The (N-1) elements L(i)*D(i). * * CLSTRT (input) INTEGER * The index of the first eigenvalue in the cluster. * * CLEND (input) INTEGER * The index of the last eigenvalue in the cluster. * * CLMID1,2(input) INTEGER * The index of a middle eigenvalue pair with large gap * * W (input) REAL array, dimension >= (CLEND-CLSTRT+1) * The eigenvalue APPROXIMATIONS of L D L^T in ascending order. * W( CLSTRT ) through W( CLEND ) form the cluster of relatively * close eigenalues. * * WGAP (input/output) REAL array, dimension >= (CLEND-CLSTRT+1) * The separation from the right neighbor eigenvalue in W. * * WERR (input) REAL array, dimension >= (CLEND-CLSTRT+1) * WERR contain the semiwidth of the uncertainty * interval of the corresponding eigenvalue APPROXIMATION in W * * SPDIAM (input) estimate of the spectral diameter obtained from the * Gerschgorin intervals * * CLGAPL, CLGAPR (input) absolute gap on each end of the cluster. * Set by the calling routine to protect against shifts too close * to eigenvalues outside the cluster. * * PIVMIN (input) DOUBLE PRECISION * The minimum pivot allowed in the sturm sequence. * * SIGMA (output) REAL * The shift used to form L(+) D(+) L(+)^T. * * DPLUS (output) REAL array, dimension (N) * The N diagonal elements of the diagonal matrix D(+). * * LPLUS (output) REAL array, dimension (N-1) * The first (N-1) elements of LPLUS contain the subdiagonal * elements of the unit bidiagonal matrix L(+). * * WORK (workspace) REAL array, dimension (2*N) * Workspace. * * Further Details * =============== * * Based on contributions by * Beresford Parlett, University of California, Berkeley, USA * Jim Demmel, University of California, Berkeley, USA * Inderjit Dhillon, University of Texas, Austin, USA * Osni Marques, LBNL/NERSC, USA * Christof Voemel, University of California, Berkeley, USA * * ===================================================================== * * .. Parameters .. REAL FOUR, MAXGROWTH1, MAXGROWTH2, ONE, QUART, TWO PARAMETER ( ONE = 1.0E0, TWO = 2.0E0, $ FOUR = 4.0E0, QUART = 0.25E0, $ MAXGROWTH1 = 8.E0, $ MAXGROWTH2 = 8.E0 ) * .. * .. Local Scalars .. LOGICAL DORRR1, NOFAIL, SAWNAN1, SAWNAN2, TRYRRR1 INTEGER BI,I,J,KTRY,KTRYMAX,SLEFT,SRIGHT,SMID,SHIFT PARAMETER ( KTRYMAX = 1, SMID =0, SLEFT = 1, SRIGHT = 2 ) * DSTQDS loops will be blocked to detect NaNs earlier if they occur INTEGER BLKLEN PARAMETER ( BLKLEN = 512 ) REAL AVGAP, BESTSHIFT, CLWDTH, EPS, FACT, FAIL, $ FAIL2, GROWTHBOUND, LDELTA, LDMAX, LEASTGROWTH, $ LSIGMA, MAX1, MAX2, MINGAP, MSIGMA1, MSIGMA2, $ OLDP, PROD, RDELTA, RDMAX, RRR1, RRR2, RSIGMA, $ S, TMP, ZNM2 * .. * .. External Functions .. LOGICAL SISNAN REAL SLAMCH EXTERNAL SISNAN, SLAMCH * .. * .. External Subroutines .. EXTERNAL SCOPY * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * INFO = 0 FACT = REAL(2**KTRYMAX) EPS = SLAMCH( 'Precision' ) SHIFT = 0 * Decide whether the code should accept the best among all * representations despite large element growth or signal INFO=1 NOFAIL = .TRUE. * * Compute the average gap length of the cluster CLWDTH = ABS(W(CLEND)-W(CLSTRT)) + WERR(CLEND) + WERR(CLSTRT) AVGAP = CLWDTH / REAL(CLEND-CLSTRT) MINGAP = MIN(CLGAPL, CLGAPR) * Initial values for shifts to both ends of cluster LSIGMA = MIN(W( CLSTRT ),W( CLEND )) - WERR( CLSTRT ) RSIGMA = MAX(W( CLSTRT ),W( CLEND )) + WERR( CLEND ) MSIGMA1 = W( CLMID1 ) + WERR( CLMID1 ) MSIGMA2 = W( CLMID2 ) - WERR( CLMID2 ) * Use a small fudge to make sure that we really shift to the outside LSIGMA = LSIGMA - ABS(LSIGMA)* TWO * EPS RSIGMA = RSIGMA + ABS(RSIGMA)* TWO * EPS * Compute upper bounds for how much to back off the initial shifts LDMAX = QUART * MINGAP + TWO * PIVMIN RDMAX = QUART * MINGAP + TWO * PIVMIN LDELTA = MAX(AVGAP,WGAP( CLSTRT ))/FACT RDELTA = MAX(AVGAP,WGAP( CLEND-1 ))/FACT * * Initialize the record of the best representation found * S = SLAMCH( 'S' ) LEASTGROWTH = ONE / S FAIL = REAL(N-1)*MINGAP/(SPDIAM*EPS) FAIL2 = REAL(N-1)*MINGAP/(SPDIAM*SQRT(EPS)) GROWTHBOUND = MAXGROWTH1*SPDIAM * * Set default best shift * BESTSHIFT = LSIGMA IF(.NOT.TRYMID) GOTO 4 * * Try shifts in the middle * SHIFT = SMID DO 3 J=1,2 SAWNAN1 = .FALSE. IF(J.EQ.1) THEN * Try left middle point SIGMA = MSIGMA1 ELSE * Try left middle point SIGMA = MSIGMA2 ENDIF S = -SIGMA DPLUS( 1 ) = D( 1 ) + S MAX1 = ABS( DPLUS( 1 ) ) DO 2 BI = 1, N-1, BLKLEN DO 1 I = BI, MIN( BI+BLKLEN-1, N-1) LPLUS( I ) = LD( I ) / DPLUS( I ) S = S*LPLUS( I )*L( I ) - SIGMA DPLUS( I+1 ) = D( I+1 ) + S MAX1 = MAX( MAX1,ABS(DPLUS(I+1)) ) 1 CONTINUE SAWNAN1=SAWNAN1 .OR. SISNAN(MAX1) IF (SAWNAN1) GOTO 3 2 CONTINUE IF( .NOT.SAWNAN1 ) THEN IF( MAX1.LE.GROWTHBOUND ) THEN GOTO 100 ELSE IF( MAX1.LE.LEASTGROWTH ) THEN LEASTGROWTH = MAX1 BESTSHIFT = SIGMA ENDIF ENDIF 3 CONTINUE 4 CONTINUE * * Shifts in the middle not tried or not succeeded * Find best shift on the outside of the cluster * * while (KTRY <= KTRYMAX) KTRY = 0 * * * 5 CONTINUE * Compute element growth when shifting to both ends of the cluster * accept shift if there is no element growth at one of the two ends * Left end SAWNAN1 = .FALSE. S = -LSIGMA DPLUS( 1 ) = D( 1 ) + S MAX1 = ABS( DPLUS( 1 ) ) DO 12 BI = 1, N-1, BLKLEN DO 11 I = BI, MIN( BI+BLKLEN-1, N-1) LPLUS( I ) = LD( I ) / DPLUS( I ) S = S*LPLUS( I )*L( I ) - LSIGMA DPLUS( I+1 ) = D( I+1 ) + S MAX1 = MAX( MAX1,ABS(DPLUS(I+1)) ) 11 CONTINUE SAWNAN1=SAWNAN1 .OR. SISNAN(MAX1) IF (SAWNAN1) GOTO 13 12 CONTINUE IF( .NOT.SAWNAN1 ) THEN IF( MAX1.LE.GROWTHBOUND ) THEN SIGMA = LSIGMA SHIFT = SLEFT GOTO 100 ELSE IF( MAX1.LE.LEASTGROWTH ) THEN LEASTGROWTH = MAX1 BESTSHIFT = LSIGMA ENDIF ENDIF 13 CONTINUE * Right end SAWNAN2 = .FALSE. S = -RSIGMA WORK( 1 ) = D( 1 ) + S MAX2 = ABS( WORK( 1 ) ) DO 22 BI = 1, N-1, BLKLEN DO 21 I = BI, MIN( BI+BLKLEN-1, N-1) WORK( N+I ) = LD( I ) / WORK( I ) S = S*WORK( N+I )*L( I ) - RSIGMA WORK( I+1 ) = D( I+1 ) + S MAX2 = MAX( MAX2,ABS(WORK(I+1)) ) 21 CONTINUE SAWNAN2=SAWNAN2 .OR. SISNAN(MAX2) IF (SAWNAN2) GOTO 23 22 CONTINUE IF( .NOT.SAWNAN2 ) THEN IF( MAX2.LE.GROWTHBOUND ) THEN SIGMA = RSIGMA SHIFT = SRIGHT GOTO 100 ELSE IF( MAX2.LE.LEASTGROWTH ) THEN LEASTGROWTH = MAX2 BESTSHIFT = RSIGMA ENDIF ENDIF 23 CONTINUE * If we are at this point, both shifts led to too much element growth 50 CONTINUE IF (KTRY.LT.KTRYMAX) THEN * If we are here, both shifts failed also the RRR test. * Back off to the outside LSIGMA = MAX( LSIGMA - LDELTA, $ LSIGMA - LDMAX) RSIGMA = MIN( RSIGMA + RDELTA, $ RSIGMA + RDMAX ) LDELTA = TWO * LDELTA RDELTA = TWO * RDELTA * Ensure that we do not back off too much of the initial shifts LDELTA = MIN(LDMAX,LDELTA) RDELTA = MIN(RDMAX,RDELTA) KTRY = KTRY + 1 GOTO 5 ELSE * None of the representations investigated satisfied our * criteria. Take the best one we found. IF((LEASTGROWTH.LT.FAIL).OR.NOFAIL) THEN LSIGMA = BESTSHIFT SAWNAN1 = .FALSE. S = -LSIGMA DPLUS( 1 ) = D( 1 ) + S DO 6 I = 1, N - 1 LPLUS( I ) = LD( I ) / DPLUS( I ) S = S*LPLUS( I )*L( I ) - LSIGMA DPLUS( I+1 ) = D( I+1 ) + S IF(ABS(DPLUS(I+1)).LT.PIVMIN) THEN DPLUS(I+1) = -PIVMIN ENDIF 6 CONTINUE SIGMA = LSIGMA SHIFT = SLEFT GOTO 100 ELSE INFO = 1 RETURN ENDIF END IF 100 CONTINUE IF (SHIFT.EQ.SLEFT .OR. SHIFT.EQ.SMID ) THEN ELSEIF (SHIFT.EQ.SRIGHT) THEN * store new L and D back into DPLUS, LPLUS CALL SCOPY( N, WORK, 1, DPLUS, 1 ) CALL SCOPY( N-1, WORK(N+1), 1, LPLUS, 1 ) ENDIF RETURN * * End of SLARRF2 * END scalapack-2.0.2/SRC/slarrv2.f000644 000766 000024 00000143305 11657111056 016113 0ustar00juliestaff000000 000000 SUBROUTINE SLARRV2( N, VL, VU, D, L, PIVMIN, $ ISPLIT, M, DOL, DOU, NEEDIL, NEEDIU, $ MINRGP, RTOL1, RTOL2, W, WERR, WGAP, $ IBLOCK, INDEXW, GERS, SDIAM, $ Z, LDZ, ISUPPZ, $ WORK, IWORK, VSTART, FINISH, $ MAXCLS, NDEPTH, PARITY, ZOFFSET, INFO ) * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. INTEGER DOL, DOU, INFO, LDZ, M, N, MAXCLS, $ NDEPTH, NEEDIL, NEEDIU, PARITY, ZOFFSET REAL MINRGP, PIVMIN, RTOL1, RTOL2, VL, VU LOGICAL VSTART, FINISH * .. * .. Array Arguments .. INTEGER IBLOCK( * ), INDEXW( * ), ISPLIT( * ), $ ISUPPZ( * ), IWORK( * ) REAL D( * ), GERS( * ), L( * ), SDIAM( * ), $ W( * ), WERR( * ), $ WGAP( * ), WORK( * ) REAL Z( LDZ, * ) * * Purpose * ======= * * SLARRV2 computes the eigenvectors of the tridiagonal matrix * T = L D L^T given L, D and APPROXIMATIONS to the eigenvalues of L D L^T. * The input eigenvalues should have been computed by SLARRE2A * or by precious calls to SLARRV2. * * The major difference between the parallel and the sequential construction * of the representation tree is that in the parallel case, not all eigenvalues * of a given cluster might be computed locally. Other processors might "own" * and refine part of an eigenvalue cluster. This is crucial for scalability. * Thus there might be communication necessary before the current level of the * representation tree can be parsed. * * Please note: * 1. The calling sequence has two additional INTEGER parameters, * DOL and DOU, that should satisfy M>=DOU>=DOL>=1. * These parameters are only relevant for the case JOBZ = 'V'. * SLARRV2 ONLY computes the eigenVECTORS * corresponding to eigenvalues DOL through DOU in W. (That is, * instead of computing the eigenvectors belonging to W(1) * through W(M), only the eigenvectors belonging to eigenvalues * W(DOL) through W(DOU) are computed. In this case, only the * eigenvalues DOL:DOU are guaranteed to be accurately refined * to all figures by Rayleigh-Quotient iteration. * * 2. The additional arguments VSTART, FINISH, NDEPTH, PARITY, ZOFFSET * are included as a thread-safe implementation equivalent to SAVE variables. * These variables store details about the local representation tree which is * computed layerwise. For scalability reasons, eigenvalues belonging to the * locally relevant representation tree might be computed on other processors. * These need to be communicated before the inspection of the RRRs can proceed * on any given layer. * Note that only when the variable FINISH is true, the computation has ended * All eigenpairs between DOL and DOU have been computed. M is set = DOU - DOL + 1. * * 3. SLARRV2 needs more workspace in Z than the sequential SLARRV. * It is used to store the conformal embedding of the local representation tree. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix. N >= 0. * * VL (input) REAL * VU (input) REAL * Lower and upper bounds of the interval that contains the desired * eigenvalues. VL < VU. Needed to compute gaps on the left or right * end of the extremal eigenvalues in the desired RANGE. * VU is currently not used but kept as parameter in case needed. * * D (input/output) REAL array, dimension (N) * On entry, the N diagonal elements of the diagonal matrix D. * On exit, D is overwritten. * * L (input/output) REAL array, dimension (N) * On entry, the (N-1) subdiagonal elements of the unit * bidiagonal matrix L are in elements 1 to N-1 of L * (if the matrix is not splitted.) At the end of each block * is stored the corresponding shift as given by SLARRE. * On exit, L is overwritten. * * PIVMIN (in) DOUBLE PRECISION * The minimum pivot allowed in the sturm sequence. * * ISPLIT (input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into blocks. * The first block consists of rows/columns 1 to * ISPLIT( 1 ), the second of rows/columns ISPLIT( 1 )+1 * through ISPLIT( 2 ), etc. * * M (input) INTEGER * The total number of input eigenvalues. 0 <= M <= N. * * DOL (input) INTEGER * DOU (input) INTEGER * If the user wants to compute only selected eigenvectors from all * the eigenvalues supplied, he can specify an index range DOL:DOU. * Or else the setting DOL=1, DOU=M should be applied. * Note that DOL and DOU refer to the order in which the eigenvalues * are stored in W. * If the user wants to compute only selected eigenpairs, then * the columns DOL-1 to DOU+1 of the eigenvector space Z contain the * computed eigenvectors. All other columns of Z are set to zero. * If DOL > 1, then Z(:,DOL-1-ZOFFSET) is used. * If DOU < M, then Z(:,DOU+1-ZOFFSET) is used. * * * NEEDIL (input/output) INTEGER * NEEDIU (input/output) INTEGER * Describe which are the left and right outermost eigenvalues * that still need to be included in the computation. These indices * indicate whether eigenvalues from other processors are needed to * correctly compute the conformally embedded representation tree. * When DOL<=NEEDIL<=NEEDIU<=DOU, all required eigenvalues are local * to the processor and no communication is required to compute its * part of the representation tree. * * MINRGP (input) REAL * The minimum relativ gap threshold to decide whether an eigenvalue * or a cluster boundary is reached. * * RTOL1 (input) REAL * RTOL2 (input) REAL * Parameters for bisection. * An interval [LEFT,RIGHT] has converged if * RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) * * W (input/output) REAL array, dimension (N) * The first M elements of W contain the APPROXIMATE eigenvalues for * which eigenvectors are to be computed. The eigenvalues * should be grouped by split-off block and ordered from * smallest to largest within the block. (The output array * W from SSTEGR2A is expected here.) Furthermore, they are with * respect to the shift of the corresponding root representation * for their block. On exit, * W holds those UNshifted eigenvalues * for which eigenvectors have already been computed. * * WERR (input/output) REAL array, dimension (N) * The first M elements contain the semiwidth of the uncertainty * interval of the corresponding eigenvalue in W * * WGAP (input/output) REAL array, dimension (N) * The separation from the right neighbor eigenvalue in W. * * IBLOCK (input) INTEGER array, dimension (N) * The indices of the blocks (submatrices) associated with the * corresponding eigenvalues in W; IBLOCK(i)=1 if eigenvalue * W(i) belongs to the first block from the top, =2 if W(i) * belongs to the second block, etc. * * INDEXW (input) INTEGER array, dimension (N) * The indices of the eigenvalues within each block (submatrix); * for example, INDEXW(i)= 10 and IBLOCK(i)=2 imply that the * i-th eigenvalue W(i) is the 10-th eigenvalue in the second block. * * GERS (input) REAL array, dimension (2*N) * The N Gerschgorin intervals (the i-th Gerschgorin interval * is (GERS(2*i-1), GERS(2*i)). The Gerschgorin intervals should * be computed from the original UNshifted matrix. * Currently NOT used but kept as parameter in case it becomes * needed in the future. * * SDIAM (input) REAL array, dimension (N) * The spectral diameters for all unreduced blocks. * * Z (output) REAL array, dimension (LDZ, max(1,M) ) * If INFO = 0, the first M columns of Z contain the * orthonormal eigenvectors of the matrix T * corresponding to the input eigenvalues, with the i-th * column of Z holding the eigenvector associated with W(i). * In the distributed version, only a subset of columns * is accessed, see DOL,DOU and ZOFFSET. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * JOBZ = 'V', LDZ >= max(1,N). * * ISUPPZ (output) INTEGER array, dimension ( 2*max(1,M) ) * The support of the eigenvectors in Z, i.e., the indices * indicating the nonzero elements in Z. The I-th eigenvector * is nonzero only in elements ISUPPZ( 2*I-1 ) through * ISUPPZ( 2*I ). * * WORK (workspace) REAL array, dimension (12*N) * * IWORK (workspace) INTEGER array, dimension (7*N) * * VSTART (input/output) LOGICAL * .TRUE. on initialization, set to .FALSE. afterwards. * * FINISH (input/output) LOGICAL * A flag that indicates whether all eigenpairs have been computed. * * MAXCLS (input/output) INTEGER * The largest cluster worked on by this processor in the * representation tree. * * NDEPTH (input/output) INTEGER * The current depth of the representation tree. Set to * zero on initial pass, changed when the deeper levels of * the representation tree are generated. * * PARITY (input/output) INTEGER * An internal parameter needed for the storage of the * clusters on the current level of the representation tree. * * ZOFFSET (input) INTEGER * Offset for storing the eigenpairs when Z is distributed * in 1D-cyclic fashion. * * INFO (output) INTEGER * = 0: successful exit * * > 0: A problem occured in SLARRV2. * < 0: One of the called subroutines signaled an internal problem. * Needs inspection of the corresponding parameter IINFO * for further information. * * =-1: Problem in SLARRB2 when refining a child's eigenvalues. * =-2: Problem in SLARRF2 when computing the RRR of a child. * When a child is inside a tight cluster, it can be difficult * to find an RRR. A partial remedy from the user's point of * view is to make the parameter MINRGP smaller and recompile. * However, as the orthogonality of the computed vectors is * proportional to 1/MINRGP, the user should be aware that * he might be trading in precision when he decreases MINRGP. * =-3: Problem in SLARRB2 when refining a single eigenvalue * after the Rayleigh correction was rejected. * = 5: The Rayleigh Quotient Iteration failed to converge to * full accuracy in MAXITR steps. * * ===================================================================== * * .. Parameters .. INTEGER MAXITR, USE30, USE31, USE32A, USE32B PARAMETER ( MAXITR = 10, USE30=30, USE31=31, $ USE32A=3210, USE32B = 3211 ) REAL ZERO, ONE, TWO, THREE, FOUR, HALF PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, $ TWO = 2.0E0, THREE = 3.0E0, $ FOUR = 4.0E0, HALF = 0.5E0) * .. * .. Local Arrays .. INTEGER SPLACE( 4 ) * .. * .. Local Scalars .. LOGICAL DELREF, ESKIP, NEEDBS, ONLYLC, STP2II, TRYMID, $ TRYRQC, USEDBS, USEDRQ INTEGER I, IBEGIN, IEND, II, IINCLS, IINDC1, IINDC2, $ IINDWK, IINFO, IM, IN, INDEIG, INDLD, INDLLD, $ INDWRK, ISUPMN, ISUPMX, ITER, ITMP1, ITWIST, J, $ JBLK, K, KK, MINIWSIZE, MINWSIZE, MYWFST, $ MYWLST, NCLUS, NEGCNT, NEWCLS, NEWFST, NEWFTT, $ NEWLST, NEWSIZ, OFFSET, OLDCLS, OLDFST, OLDIEN, $ OLDLST, OLDNCL, P, Q, VRTREE, WBEGIN, WEND, $ WINDEX, WINDMN, WINDPL, ZFROM, ZINDEX, ZTO, $ ZUSEDL, ZUSEDU, ZUSEDW REAL AVGAP, BSTRES, BSTW, ENUFGP, EPS, FUDGE, GAP, $ GAPTOL, LAMBDA, LEFT, LGAP, LGPVMN, LGSPDM, $ LOG_IN, MGAP, MINGMA, MYERR, NRMINV, NXTERR, $ ORTOL, RESID, RGAP, RIGHT, RLTL30, RQCORR, $ RQTOL, SAVEGP, SGNDEF, SIGMA, SPDIAM, SSIGMA, $ TAU, TMP, TOL, ZTZ * .. * .. External Functions .. REAL SLAMCH REAL SDOT, SNRM2 EXTERNAL SDOT, SLAMCH, SNRM2 * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SLAR1VA, SLARRB2, $ SLARRF2, SLASET, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, MAX, MIN, SQRT * .. * .. Executable Statements .. * .. INFO = 0 * The first N entries of WORK are reserved for the eigenvalues INDLD = N+1 INDLLD= 2*N+1 INDWRK= 3*N+1 MINWSIZE = 12 * N * IWORK(IINCLS+JBLK) holds the number of clusters on the current level * of the reptree for block JBLK IINCLS = 0 * IWORK(IINDC1+1:IINC2+N) are used to store the clusters of the current * layer and the one above. IINDC1 = N IINDC2 = 2*N IINDWK = 3*N + 1 MINIWSIZE = 7 * N EPS = SLAMCH( 'Precision' ) RQTOL = TWO * EPS TRYRQC = .TRUE. * Decide which representation tree criterion to use * USE30 = Lapack 3.0 criterion * USE31 = LAPACK 3.1 criterion * USE32A = two criteria, determines singletons with USE31, and groups with avgap. * USE32B = two criteria, determines singletons with USE31, and groups with USE30. VRTREE = USE32A * LGPVMN = LOG( PIVMIN ) IF(VSTART) THEN * * PREPROCESSING, DONE ONLY IN THE FIRST CALL * VSTART = .FALSE. * MAXCLS = 1 * Set delayed eigenvalue refinement * In order to enable more parallelism, refinement * must be done immediately and cannot be delayed until * the next representation tree level. DELREF = .FALSE. DO 1 I= 1,MINWSIZE WORK( I ) = ZERO 1 CONTINUE DO 2 I= 1,MINIWSIZE IWORK( I ) = 0 2 CONTINUE ZUSEDL = 1 IF(DOL.GT.1) THEN * Set lower bound for use of Z ZUSEDL = DOL-1 ENDIF ZUSEDU = M IF(DOU.LT.M) THEN * Set lower bound for use of Z ZUSEDU = DOU+1 ENDIF * The width of the part of Z that is used ZUSEDW = ZUSEDU - ZUSEDL + 1 * CALL SLASET( 'Full', N, ZUSEDW, ZERO, ZERO, $ Z(1,(ZUSEDL-ZOFFSET)), LDZ ) * Initialize NDEPTH, the current depth of the representation tree NDEPTH = 0 * Initialize parity PARITY = 1 * Go through blocks, initialize data structures IBEGIN = 1 WBEGIN = 1 DO 10 JBLK = 1, IBLOCK( M ) IEND = ISPLIT( JBLK ) SIGMA = L( IEND ) WEND = WBEGIN - 1 3 CONTINUE IF( WEND.LT.M ) THEN IF( IBLOCK( WEND+1 ).EQ.JBLK ) THEN WEND = WEND + 1 GO TO 3 END IF END IF IF( WEND.LT.WBEGIN ) THEN IWORK( IINCLS + JBLK ) = 0 IBEGIN = IEND + 1 GO TO 10 ELSEIF( (WEND.LT.DOL).OR.(WBEGIN.GT.DOU) ) THEN IWORK( IINCLS + JBLK ) = 0 IBEGIN = IEND + 1 WBEGIN = WEND + 1 GO TO 10 END IF * The number of eigenvalues in the current block IM = WEND - WBEGIN + 1 * This is for a 1x1 block IF( IBEGIN.EQ.IEND ) THEN IWORK( IINCLS + JBLK ) = 0 Z( IBEGIN, (WBEGIN-ZOFFSET) ) = ONE ISUPPZ( 2*WBEGIN-1 ) = IBEGIN ISUPPZ( 2*WBEGIN ) = IBEGIN W( WBEGIN ) = W( WBEGIN ) + SIGMA WORK( WBEGIN ) = W( WBEGIN ) IBEGIN = IEND + 1 WBEGIN = WBEGIN + 1 GO TO 10 END IF CALL SCOPY( IM, W( WBEGIN ), 1, & WORK( WBEGIN ), 1 ) * We store in W the eigenvalue approximations w.r.t. the original * matrix T. DO 5 I=1,IM W(WBEGIN+I-1) = W(WBEGIN+I-1)+SIGMA 5 CONTINUE * Initialize cluster counter for this block IWORK( IINCLS + JBLK ) = 1 IWORK( IINDC1+IBEGIN ) = 1 IWORK( IINDC1+IBEGIN+1 ) = IM * IBEGIN = IEND + 1 WBEGIN = WEND + 1 10 CONTINUE * ENDIF * Init NEEDIL and NEEDIU NEEDIL = DOU NEEDIU = DOL * Here starts the main loop * Only one pass through the loop is done until no collaboration * with other processors is needed. 40 CONTINUE PARITY = 1 - PARITY * For each block, build next level of representation tree * if there are still remaining clusters IBEGIN = 1 WBEGIN = 1 DO 170 JBLK = 1, IBLOCK( M ) IEND = ISPLIT( JBLK ) SIGMA = L( IEND ) * Find the eigenvectors of the submatrix indexed IBEGIN * through IEND. IF(M.EQ.N) THEN * all eigenpairs are computed WEND = IEND ELSE * count how many wanted eigenpairs are in this block WEND = WBEGIN - 1 15 CONTINUE IF( WEND.LT.M ) THEN IF( IBLOCK( WEND+1 ).EQ.JBLK ) THEN WEND = WEND + 1 GO TO 15 END IF END IF ENDIF OLDNCL = IWORK( IINCLS + JBLK ) IF( OLDNCL.EQ.0 ) THEN IBEGIN = IEND + 1 WBEGIN = WEND + 1 GO TO 170 END IF * OLDIEN is the last index of the previous block OLDIEN = IBEGIN - 1 * Calculate the size of the current block IN = IEND - IBEGIN + 1 * The number of eigenvalues in the current block IM = WEND - WBEGIN + 1 * Find local spectral diameter of the block SPDIAM = SDIAM(JBLK) LGSPDM = LOG( SPDIAM + PIVMIN ) * Compute ORTOL parameter, similar to SSTEIN ORTOL = SPDIAM*1.0E-3 * Compute average gap AVGAP = SPDIAM/REAL(IN-1) * Compute the minimum of average gap and ORTOL parameter * This can used as a lower bound for acceptable separation * between eigenvalues ENUFGP = MIN(ORTOL,AVGAP) * Any 1x1 block has been treated before * loop while( OLDNCLS.GT.0 ) * generate the next representation tree level for the current block IF( OLDNCL.GT.0 ) THEN * This is a crude protection against infinitely deep trees IF( NDEPTH.GT.M ) THEN INFO = -2 RETURN ENDIF * breadth first processing of the current level of the representation * tree: OLDNCL = number of clusters on current level * NCLUS is the number of clusters for the next level of the reptree * reset NCLUS to count the number of child clusters NCLUS = 0 * LOG_IN = LOG(REAL(IN)) * RLTL30 = MIN( 1.0E-2, ONE / REAL( IN ) ) * IF( PARITY.EQ.0 ) THEN OLDCLS = IINDC1+IBEGIN-1 NEWCLS = IINDC2+IBEGIN-1 ELSE OLDCLS = IINDC2+IBEGIN-1 NEWCLS = IINDC1+IBEGIN-1 END IF * Process the clusters on the current level DO 150 I = 1, OLDNCL J = OLDCLS + 2*I * OLDFST, OLDLST = first, last index of current cluster. * cluster indices start with 1 and are relative * to WBEGIN when accessing W, WGAP, WERR, Z OLDFST = IWORK( J-1 ) OLDLST = IWORK( J ) IF( NDEPTH.GT.0 ) THEN * Retrieve relatively robust representation (RRR) of cluster * that has been computed at the previous level * The RRR is stored in Z and overwritten once the eigenvectors * have been computed or when the cluster is refined IF((DOL.EQ.1).AND.(DOU.EQ.M)) THEN * Get representation from location of the leftmost evalue * of the cluster J = WBEGIN + OLDFST - 1 ELSE IF(WBEGIN+OLDFST-1.LT.DOL) THEN * Get representation from the left end of Z array J = DOL - 1 ELSEIF(WBEGIN+OLDFST-1.GT.DOU) THEN * Get representation from the right end of Z array J = DOU ELSE J = WBEGIN + OLDFST - 1 ENDIF ENDIF CALL SCOPY( IN, Z( IBEGIN, (J-ZOFFSET) ), $ 1, D( IBEGIN ), 1 ) CALL SCOPY( IN-1, Z( IBEGIN, (J+1-ZOFFSET) ), $ 1, L( IBEGIN ),1 ) SIGMA = Z( IEND, (J+1-ZOFFSET) ) * Set the corresponding entries in Z to zero CALL SLASET( 'Full', IN, 2, ZERO, ZERO, $ Z( IBEGIN, (J-ZOFFSET) ), LDZ ) END IF * Compute DL and DLL of current RRR DO 50 J = IBEGIN, IEND-1 TMP = D( J )*L( J ) WORK( INDLD-1+J ) = TMP WORK( INDLLD-1+J ) = TMP*L( J ) 50 CONTINUE IF( NDEPTH.GT.0 .AND. DELREF ) THEN * P and Q are index of the first and last eigenvalue to compute * within the current block P = INDEXW( WBEGIN-1+OLDFST ) Q = INDEXW( WBEGIN-1+OLDLST ) * Offset for the arrays WORK, WGAP and WERR, i.e., th P-OFFSET * thru' Q-OFFSET elements of these arrays are to be used. C OFFSET = P-OLDFST OFFSET = INDEXW( WBEGIN ) - 1 * perform limited bisection (if necessary) to get approximate * eigenvalues to the precision needed. CALL SLARRB2( IN, D( IBEGIN ), $ WORK(INDLLD+IBEGIN-1), $ P, Q, RTOL1, RTOL2, OFFSET, $ WORK(WBEGIN),WGAP(WBEGIN),WERR(WBEGIN), $ WORK( INDWRK ), IWORK( IINDWK ), $ PIVMIN, LGPVMN, LGSPDM, IN, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF * We also recompute the extremal gaps. W holds all eigenvalues * of the unshifted matrix and must be used for computation * of WGAP, the entries of WORK might stem from RRRs with * different shifts. The gaps from WBEGIN-1+OLDFST to * WBEGIN-1+OLDLST are correctly computed in SLARRB2. * However, we only allow the gaps to become greater since * this is what should happen when we decrease WERR IF( OLDFST.GT.1) THEN WGAP( WBEGIN+OLDFST-2 ) = $ MAX(WGAP(WBEGIN+OLDFST-2), $ W(WBEGIN+OLDFST-1)-WERR(WBEGIN+OLDFST-1) $ - W(WBEGIN+OLDFST-2)-WERR(WBEGIN+OLDFST-2) ) ENDIF IF( WBEGIN + OLDLST -1 .LT. WEND ) THEN WGAP( WBEGIN+OLDLST-1 ) = $ MAX(WGAP(WBEGIN+OLDLST-1), $ W(WBEGIN+OLDLST)-WERR(WBEGIN+OLDLST) $ - W(WBEGIN+OLDLST-1)-WERR(WBEGIN+OLDLST-1) ) ENDIF * Each time the eigenvalues in WORK get refined, we store * the newly found approximation with all shifts applied in W DO 53 J=OLDFST,OLDLST W(WBEGIN+J-1) = WORK(WBEGIN+J-1)+SIGMA 53 CONTINUE ELSEIF( (NDEPTH.EQ.0) .OR. (.NOT.DELREF) ) THEN * Some of the eigenvalues might have been computed on * other processors * Recompute gaps for this cluster * (all eigenvalues have the same * representation, i.e. the same shift, so this is easy) DO 54 J = OLDFST, OLDLST-1 MYERR = WERR(WBEGIN + J - 1) NXTERR = WERR(WBEGIN + J ) WGAP(WBEGIN+J-1) = MAX(WGAP(WBEGIN+J-1), $ ( WORK(WBEGIN+J) - NXTERR ) $ - ( WORK(WBEGIN+J-1) + MYERR ) $ ) 54 CONTINUE END IF * * Process the current node. * NEWFST = OLDFST DO 140 J = OLDFST, OLDLST IF( J.EQ.OLDLST ) THEN * we are at the right end of the cluster, this is also the * boundary of the child cluster NEWLST = J ELSE IF (VRTREE.EQ.USE30) THEN IF(WGAP( WBEGIN + J -1).GE. $ RLTL30 * ABS(WORK(WBEGIN + J -1)) ) THEN * the right relgap is big enough by the Lapack 3.0 criterion NEWLST = J ELSE * inside a child cluster, the relative gap is not * big enough. GOTO 140 ENDIF ELSE IF (VRTREE.EQ.USE31) THEN IF ( WGAP( WBEGIN + J -1).GE. $ MINRGP* ABS( WORK(WBEGIN + J -1) ) ) THEN * the right relgap is big enough by the Lapack 3.1 criterion * (NEWFST,..,NEWLST) is well separated from the following NEWLST = J ELSE * inside a child cluster, the relative gap is not * big enough. GOTO 140 ENDIF ELSE IF (VRTREE.EQ.USE32A) THEN IF( (J.EQ.OLDFST).AND.( WGAP(WBEGIN+J-1).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ) ) THEN * the right relgap is big enough by the Lapack 3.1 criterion * Found a singleton NEWLST = J ELSE IF( (J.GT.OLDFST).AND.(J.EQ.NEWFST).AND. $ ( WGAP(WBEGIN+J-2).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ).AND. $ ( WGAP(WBEGIN+J-1).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ) $ ) THEN * Found a singleton NEWLST = J ELSE IF( (J.GT.NEWFST).AND.WGAP(WBEGIN+J-1).GE. $ (MINRGP*ABS(WORK(WBEGIN+J-1)) ) ) $ THEN * the right relgap is big enough by the Lapack 3.1 criterion NEWLST = J ELSE IF((J.GT.NEWFST).AND.(J+1.LT.OLDLST).AND. $ (WGAP(WBEGIN+J-1).GE.ENUFGP)) $ THEN * the right gap is bigger than ENUFGP * Care needs to be taken with this criterion to make * sure it does not create a remaining `false' singleton NEWLST = J ELSE * inside a child cluster, the relative gap is not * big enough. GOTO 140 ENDIF ELSE IF (VRTREE.EQ.USE32B) THEN IF( (J.EQ.OLDFST).AND.( WGAP(WBEGIN+J-1).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ) ) THEN * the right relgap is big enough by the Lapack 3.1 criterion * Found a singleton NEWLST = J ELSE IF( (J.GT.OLDFST).AND.(J.EQ.NEWFST).AND. $ ( WGAP(WBEGIN+J-2).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ).AND. $ ( WGAP(WBEGIN+J-1).GE. $ MINRGP* ABS(WORK(WBEGIN+J-1)) ) $ ) THEN * Found a singleton NEWLST = J ELSE IF( (J.GT.NEWFST).AND.WGAP(WBEGIN+J-1).GE. $ (MINRGP*ABS(WORK(WBEGIN+J-1)) ) ) $ THEN * the right relgap is big enough by the Lapack 3.1 criterion NEWLST = J ELSE IF((J.GT.NEWFST).AND.(J+1.LT.OLDLST).AND. $ (WGAP( WBEGIN + J -1).GE. $ RLTL30 * ABS(WORK(WBEGIN + J -1)) )) $ THEN * the right relgap is big enough by the Lapack 3.0 criterion * Care needs to be taken with this criterion to make * sure it does not create a remaining `false' singleton NEWLST = J ELSE * inside a child cluster, the relative gap is not * big enough. GOTO 140 ENDIF END IF END IF * Compute size of child cluster found NEWSIZ = NEWLST - NEWFST + 1 MAXCLS = MAX( NEWSIZ, MAXCLS ) * NEWFTT is the place in Z where the new RRR or the computed * eigenvector is to be stored IF((DOL.EQ.1).AND.(DOU.EQ.M)) THEN * Store representation at location of the leftmost evalue * of the cluster NEWFTT = WBEGIN + NEWFST - 1 ELSE IF(WBEGIN+NEWFST-1.LT.DOL) THEN * Store representation at the left end of Z array NEWFTT = DOL - 1 ELSEIF(WBEGIN+NEWFST-1.GT.DOU) THEN * Store representation at the right end of Z array NEWFTT = DOU ELSE NEWFTT = WBEGIN + NEWFST - 1 ENDIF ENDIF * FOR 1D-DISTRIBUTED Z, COMPUTE NEWFTT SHIFTED BY ZOFFSET NEWFTT = NEWFTT - ZOFFSET IF( NEWSIZ.GT.1) THEN * * Current child is not a singleton but a cluster. * * IF((WBEGIN+NEWLST-1.LT.DOL).OR. $ (WBEGIN+NEWFST-1.GT.DOU)) THEN * if the cluster contains no desired eigenvalues * skip the computation of that branch of the rep. tree GOTO 139 ENDIF * Compute left and right cluster gap. * IF( NEWFST.EQ.1 ) THEN LGAP = MAX( ZERO, $ W(WBEGIN)-WERR(WBEGIN) - VL ) ELSE LGAP = WGAP( WBEGIN+NEWFST-2 ) ENDIF RGAP = WGAP( WBEGIN+NEWLST-1 ) * * For larger clusters, record the largest gap observed * somewhere near the middle of the cluster as a possible * alternative position for a shift when TRYMID is TRUE * MGAP = ZERO IF(NEWSIZ.GE.50) THEN KK = NEWFST DO 545 K =NEWFST+NEWSIZ/3,NEWLST-NEWSIZ/3 IF(MGAP.LT.WGAP( WBEGIN+K-1 )) THEN KK = K MGAP = WGAP( WBEGIN+K-1 ) ENDIF 545 CONTINUE ENDIF * * Record the left- and right-most eigenvalues needed * for the next level of the representation tree NEEDIL = MIN(NEEDIL,WBEGIN+NEWFST-1) NEEDIU = MAX(NEEDIU,WBEGIN+NEWLST-1) * * Check if middle gap is large enough to shift there * GAP = MIN(LGAP,RGAP) TRYMID = (MGAP.GT.GAP) SPLACE(1) = NEWFST SPLACE(2) = NEWLST IF(TRYMID) THEN SPLACE(3) = KK SPLACE(4) = KK+1 ELSE SPLACE(3) = NEWFST SPLACE(4) = NEWLST ENDIF * * Compute left- and rightmost eigenvalue of child * to high precision in order to shift as close * as possible and obtain as large relative gaps * as possible * DO 55 K =1,4 P = INDEXW( WBEGIN-1+SPLACE(K) ) OFFSET = INDEXW( WBEGIN ) - 1 CALL SLARRB2( IN, D(IBEGIN), $ WORK( INDLLD+IBEGIN-1 ),P,P, $ RQTOL, RQTOL, OFFSET, $ WORK(WBEGIN),WGAP(WBEGIN), $ WERR(WBEGIN),WORK( INDWRK ), $ IWORK( IINDWK ), $ PIVMIN, LGPVMN, LGSPDM, IN, IINFO ) 55 CONTINUE * * Compute RRR of child cluster. * Note that the new RRR is stored in Z * C SLARRF2 needs LWORK = 2*N CALL SLARRF2( IN, D( IBEGIN ), L( IBEGIN ), $ WORK(INDLD+IBEGIN-1), $ SPLACE(1), SPLACE(2), $ SPLACE(3), SPLACE(4), WORK(WBEGIN), $ WGAP(WBEGIN), WERR(WBEGIN), TRYMID, $ SPDIAM, LGAP, RGAP, PIVMIN, TAU, $ Z( IBEGIN, NEWFTT ), $ Z( IBEGIN, NEWFTT+1 ), $ WORK( INDWRK ), IINFO ) IF( IINFO.EQ.0 ) THEN * a new RRR for the cluster was found by SLARRF2 * update shift and store it SSIGMA = SIGMA + TAU Z( IEND, NEWFTT+1 ) = SSIGMA * WORK() are the midpoints and WERR() the semi-width * Note that the entries in W are unchanged. DO 116 K = NEWFST, NEWLST FUDGE = $ THREE*EPS*ABS(WORK(WBEGIN+K-1)) WORK( WBEGIN + K - 1 ) = $ WORK( WBEGIN + K - 1) - TAU FUDGE = FUDGE + $ FOUR*EPS*ABS(WORK(WBEGIN+K-1)) * Fudge errors WERR( WBEGIN + K - 1 ) = $ WERR( WBEGIN + K - 1 ) + FUDGE 116 CONTINUE NCLUS = NCLUS + 1 K = NEWCLS + 2*NCLUS IWORK( K-1 ) = NEWFST IWORK( K ) = NEWLST * IF(.NOT.DELREF) THEN ONLYLC = .TRUE. * IF(ONLYLC) THEN MYWFST = MAX(WBEGIN-1+NEWFST,DOL-1) MYWLST = MIN(WBEGIN-1+NEWLST,DOU+1) ELSE MYWFST = WBEGIN-1+NEWFST MYWLST = WBEGIN-1+NEWLST ENDIF * Compute LLD of new RRR DO 5000 K = IBEGIN, IEND-1 WORK( INDWRK-1+K ) = $ Z(K,NEWFTT)* $ (Z(K,NEWFTT+1)**2) 5000 CONTINUE * P and Q are index of the first and last * eigenvalue to compute within the new cluster P = INDEXW( MYWFST ) Q = INDEXW( MYWLST ) * Offset for the arrays WORK, WGAP and WERR OFFSET = INDEXW( WBEGIN ) - 1 * perform limited bisection (if necessary) to get approximate * eigenvalues to the precision needed. CALL SLARRB2( IN, $ Z(IBEGIN, NEWFTT ), $ WORK(INDWRK+IBEGIN-1), $ P, Q, RTOL1, RTOL2, OFFSET, $ WORK(WBEGIN),WGAP(WBEGIN),WERR(WBEGIN), $ WORK( INDWRK+N ), IWORK( IINDWK ), $ PIVMIN, LGPVMN, LGSPDM, IN, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -1 RETURN ENDIF * Each time the eigenvalues in WORK get refined, we store * the newly found approximation with all shifts applied in W DO 5003 K=NEWFST,NEWLST W(WBEGIN+K-1) = WORK(WBEGIN+K-1)+SSIGMA 5003 CONTINUE ENDIF * ELSE INFO = -2 RETURN ENDIF ELSE * * Compute eigenvector of singleton * ITER = 0 * TOL = FOUR * LOG_IN * EPS * K = NEWFST WINDEX = WBEGIN + K - 1 ZINDEX = WINDEX - ZOFFSET WINDMN = MAX(WINDEX - 1,1) WINDPL = MIN(WINDEX + 1,M) LAMBDA = WORK( WINDEX ) * Check if eigenvector computation is to be skipped IF((WINDEX.LT.DOL).OR. $ (WINDEX.GT.DOU)) THEN ESKIP = .TRUE. GOTO 125 ELSE ESKIP = .FALSE. ENDIF LEFT = WORK( WINDEX ) - WERR( WINDEX ) RIGHT = WORK( WINDEX ) + WERR( WINDEX ) INDEIG = INDEXW( WINDEX ) IF( K .EQ. 1) THEN LGAP = EPS*MAX(ABS(LEFT),ABS(RIGHT)) ELSE LGAP = WGAP(WINDMN) ENDIF IF( K .EQ. IM) THEN RGAP = EPS*MAX(ABS(LEFT),ABS(RIGHT)) ELSE RGAP = WGAP(WINDEX) ENDIF GAP = MIN( LGAP, RGAP ) IF(( K .EQ. 1).OR.(K .EQ. IM)) THEN GAPTOL = ZERO ELSE GAPTOL = GAP * EPS ENDIF ISUPMN = IN ISUPMX = 1 * Update WGAP so that it holds the minimum gap * to the left or the right. This is crucial in the * case where bisection is used to ensure that the * eigenvalue is refined up to the required precision. * The correct value is restored afterwards. SAVEGP = WGAP(WINDEX) WGAP(WINDEX) = GAP * We want to use the Rayleigh Quotient Correction * as often as possible since it converges quadratically * when we are close enough to the desired eigenvalue. * However, the Rayleigh Quotient can have the wrong sign * and lead us away from the desired eigenvalue. In this * case, the best we can do is to use bisection. USEDBS = .FALSE. USEDRQ = .FALSE. * Bisection is initially turned off unless it is forced NEEDBS = .NOT.TRYRQC * Reset ITWIST ITWIST = 0 120 CONTINUE * Check if bisection should be used to refine eigenvalue IF(NEEDBS) THEN * Take the bisection as new iterate USEDBS = .TRUE. * Temporary copy of twist index needed ITMP1 = ITWIST OFFSET = INDEXW( WBEGIN ) - 1 CALL SLARRB2( IN, D(IBEGIN), $ WORK(INDLLD+IBEGIN-1),INDEIG,INDEIG, $ ZERO, TWO*EPS, OFFSET, $ WORK(WBEGIN),WGAP(WBEGIN), $ WERR(WBEGIN),WORK( INDWRK ), $ IWORK( IINDWK ), $ PIVMIN, LGPVMN, LGSPDM, ITMP1, IINFO ) IF( IINFO.NE.0 ) THEN INFO = -3 RETURN ENDIF LAMBDA = WORK( WINDEX ) * Reset twist index from inaccurate LAMBDA to * force computation of true MINGMA ITWIST = 0 ENDIF * Given LAMBDA, compute the eigenvector. CALL SLAR1VA( IN, 1, IN, LAMBDA, D(IBEGIN), $ L( IBEGIN ), WORK(INDLD+IBEGIN-1), $ WORK(INDLLD+IBEGIN-1), $ PIVMIN, GAPTOL, Z( IBEGIN, ZINDEX), $ .NOT.USEDBS, NEGCNT, ZTZ, MINGMA, $ ITWIST, ISUPPZ( 2*WINDEX-1 ), $ NRMINV, RESID, RQCORR, WORK( INDWRK ) ) IF(ITER .EQ. 0) THEN BSTRES = RESID BSTW = LAMBDA ELSEIF(RESID.LT.BSTRES) THEN BSTRES = RESID BSTW = LAMBDA ENDIF ISUPMN = MIN(ISUPMN,ISUPPZ( 2*WINDEX-1 )) ISUPMX = MAX(ISUPMX,ISUPPZ( 2*WINDEX )) ITER = ITER + 1 * * Convergence test for Rayleigh-Quotient iteration * (omitted when Bisection has been used) * IF( RESID.GT.TOL*GAP .AND. ABS( RQCORR ).GT. $ RQTOL*ABS( LAMBDA ) .AND. .NOT. USEDBS) $ THEN * We need to check that the RQCORR update doesn't * move the eigenvalue away from the desired one and * towards a neighbor. -> protection with bisection IF(INDEIG.LE.NEGCNT) THEN * The wanted eigenvalue lies to the left SGNDEF = -ONE ELSE * The wanted eigenvalue lies to the right SGNDEF = ONE ENDIF * We only use the RQCORR if it improves the * the iterate reasonably. IF( ( RQCORR*SGNDEF.GE.ZERO ) $ .AND.( LAMBDA + RQCORR.LE. RIGHT) $ .AND.( LAMBDA + RQCORR.GE. LEFT) $ ) THEN USEDRQ = .TRUE. * Store new midpoint of bisection interval in WORK IF(SGNDEF.EQ.ONE) THEN * The current LAMBDA is on the left of the true * eigenvalue LEFT = LAMBDA ELSE * The current LAMBDA is on the right of the true * eigenvalue RIGHT = LAMBDA ENDIF WORK( WINDEX ) = $ HALF * (RIGHT + LEFT) * Take RQCORR since it has the correct sign and * improves the iterate reasonably LAMBDA = LAMBDA + RQCORR * Update width of error interval WERR( WINDEX ) = $ HALF * (RIGHT-LEFT) ELSE NEEDBS = .TRUE. ENDIF IF(RIGHT-LEFT.LT.RQTOL*ABS(LAMBDA)) THEN * The eigenvalue is computed to bisection accuracy * compute eigenvector and stop USEDBS = .TRUE. GOTO 120 ELSEIF( ITER.LT.MAXITR ) THEN GOTO 120 ELSEIF( ITER.EQ.MAXITR ) THEN NEEDBS = .TRUE. GOTO 120 ELSE INFO = 5 RETURN END IF ELSE STP2II = .FALSE. IF(USEDRQ .AND. USEDBS .AND. $ BSTRES.LE.RESID) THEN LAMBDA = BSTW STP2II = .TRUE. ENDIF IF (STP2II) THEN CALL SLAR1VA( IN, 1, IN, LAMBDA, $ D( IBEGIN ), L( IBEGIN ), $ WORK(INDLD+IBEGIN-1), $ WORK(INDLLD+IBEGIN-1), $ PIVMIN, GAPTOL, $ Z( IBEGIN, ZINDEX ), $ .NOT.USEDBS, NEGCNT, ZTZ, MINGMA, $ ITWIST, $ ISUPPZ( 2*WINDEX-1 ), $ NRMINV, RESID, RQCORR, WORK( INDWRK ) ) ENDIF WORK( WINDEX ) = LAMBDA END IF * * Compute FP-vector support w.r.t. whole matrix * ISUPPZ( 2*WINDEX-1 ) = ISUPPZ( 2*WINDEX-1 )+OLDIEN ISUPPZ( 2*WINDEX ) = ISUPPZ( 2*WINDEX )+OLDIEN ZFROM = ISUPPZ( 2*WINDEX-1 ) ZTO = ISUPPZ( 2*WINDEX ) ISUPMN = ISUPMN + OLDIEN ISUPMX = ISUPMX + OLDIEN * Ensure vector is ok if support in the RQI has changed IF(ISUPMN.LT.ZFROM) THEN DO 122 II = ISUPMN,ZFROM-1 Z( II, ZINDEX ) = ZERO 122 CONTINUE ENDIF IF(ISUPMX.GT.ZTO) THEN DO 123 II = ZTO+1,ISUPMX Z( II, ZINDEX ) = ZERO 123 CONTINUE ENDIF CALL SSCAL( ZTO-ZFROM+1, NRMINV, $ Z( ZFROM, ZINDEX ), 1 ) 125 CONTINUE * Update W W( WINDEX ) = LAMBDA+SIGMA * Recompute the gaps on the left and right * But only allow them to become larger and not * smaller (which can only happen through "bad" * cancellation and doesn't reflect the theory * where the initial gaps are underestimated due * to WERR being too crude.) IF(.NOT.ESKIP) THEN IF( K.GT.1) THEN WGAP( WINDMN ) = MAX( WGAP(WINDMN), $ W(WINDEX)-WERR(WINDEX) $ - W(WINDMN)-WERR(WINDMN) ) ENDIF IF( WINDEX.LT.WEND ) THEN WGAP( WINDEX ) = MAX( SAVEGP, $ W( WINDPL )-WERR( WINDPL ) $ - W( WINDEX )-WERR( WINDEX) ) ENDIF ENDIF ENDIF * here ends the code for the current child * 139 CONTINUE * Proceed to any remaining child nodes NEWFST = J + 1 140 CONTINUE 150 CONTINUE * Store number of clusters IWORK( IINCLS + JBLK ) = NCLUS * END IF IBEGIN = IEND + 1 WBEGIN = WEND + 1 170 CONTINUE * * Check if everything is done: no clusters left for * this processor in any block * FINISH = .TRUE. DO 180 JBLK = 1, IBLOCK( M ) FINISH = FINISH .AND. (IWORK(IINCLS + JBLK).EQ.0) 180 CONTINUE IF(.NOT.FINISH) THEN NDEPTH = NDEPTH + 1 IF((NEEDIL.GE.DOL).AND.(NEEDIU.LE.DOU)) THEN * Once this processor's part of the * representation tree consists exclusively of eigenvalues * between DOL and DOU, it can work independently from all * others. GOTO 40 ENDIF ENDIF * RETURN * * End of SLARRV2 * END scalapack-2.0.2/SRC/slasorte.f000644 000766 000024 00000010007 10602576752 016353 0ustar00juliestaff000000 000000 SUBROUTINE SLASORTE( S, LDS, J, OUT, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * December 31, 1998 * * .. Scalar Arguments .. INTEGER INFO, J, LDS * .. * .. Array Arguments .. REAL OUT( J, * ), S( LDS, * ) * .. * * Purpose * ======= * * SLASORTE sorts eigenpairs so that real eigenpairs are together and * complex are together. This way one can employ 2x2 shifts easily * since every 2nd subdiagonal is guaranteed to be zero. * This routine does no parallel work. * * Arguments * ========= * * S (local input/output) REAL array, dimension LDS * On entry, a matrix already in Schur form. * On exit, the diagonal blocks of S have been rewritten to pair * the eigenvalues. The resulting matrix is no longer * similar to the input. * * LDS (local input) INTEGER * On entry, the leading dimension of the local array S. * Unchanged on exit. * * J (local input) INTEGER * On entry, the order of the matrix S. * Unchanged on exit. * * OUT (local input/output) REAL array, dimension Jx2 * This is the work buffer required by this routine. * * INFO (local input) INTEGER * This is set if the input matrix had an odd number of real * eigenvalues and things couldn't be paired or if the input * matrix S was not originally in Schur form. * 0 indicates successful completion. * * Implemented by: G. Henry, November 17, 1996 * * ===================================================================== * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER BOT, I, LAST, TOP * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * LAST = J TOP = 1 BOT = J INFO = 0 DO 10 I = J - 1, 1, -1 IF( S( I+1, I ).EQ.ZERO ) THEN IF( LAST-I.EQ.2 ) THEN OUT( BOT-1, 1 ) = S( I+1, I+1 ) OUT( BOT, 2 ) = S( I+2, I+2 ) OUT( BOT-1, 2 ) = S( I+1, I+2 ) OUT( BOT, 1 ) = S( I+2, I+1 ) BOT = BOT - 2 END IF IF( LAST-I.EQ.1 ) THEN IF( MOD( TOP, 2 ).EQ.1 ) THEN * * FIRST OF A PAIR * IF( ( I.EQ.J-1 ) .OR. ( I.EQ.1 ) ) THEN OUT( TOP, 1 ) = S( I+1, I+1 ) ELSE OUT( TOP, 1 ) = S( I+1, I+1 ) END IF OUT( TOP, 2 ) = ZERO ELSE * * SECOND OF A PAIR * IF( ( I.EQ.J-1 ) .OR. ( I.EQ.1 ) ) THEN OUT( TOP, 2 ) = S( I+1, I+1 ) ELSE OUT( TOP, 2 ) = S( I+1, I+1 ) END IF OUT( TOP, 1 ) = ZERO END IF TOP = TOP + 1 END IF IF( LAST-I.GT.2 ) THEN INFO = I RETURN END IF LAST = I END IF 10 CONTINUE IF( LAST.EQ.2 ) THEN * * GRAB LAST DOUBLE PAIR * OUT( BOT-1, 1 ) = S( 1, 1 ) OUT( BOT, 2 ) = S( 2, 2 ) OUT( BOT-1, 2 ) = S( 1, 2 ) OUT( BOT, 1 ) = S( 2, 1 ) BOT = BOT - 2 END IF IF( LAST.EQ.1 .and. mod(top, 2) .eq. 0 ) THEN * * GRAB SECOND PART OF LAST PAIR * OUT(TOP, 2) = s(1,1) OUT(TOP, 1) = zero TOP = TOP + 1 END IF IF( TOP-1.NE.BOT ) THEN INFO = -BOT RETURN END IF * * Overwrite the S diagonals * DO 20 I = 1, J, 2 S( I, I ) = OUT( I, 1 ) S( I+1, I ) = OUT( I+1, 1 ) S( I, I+1 ) = OUT( I, 2 ) S( I+1, I+1 ) = OUT( I+1, 2 ) 20 CONTINUE * RETURN * * End of SLASORTE * END scalapack-2.0.2/SRC/slasrt2.f000644 000766 000024 00000016127 10363532303 016106 0ustar00juliestaff000000 000000 * * SUBROUTINE SLASRT2( ID, N, D, KEY, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. CHARACTER ID INTEGER INFO, N * .. * .. Array Arguments .. INTEGER KEY( * ) REAL D( * ) * .. * * Purpose * ======= * * Sort the numbers in D in increasing order (if ID = 'I') or * in decreasing order (if ID = 'D' ). * * Use Quick Sort, reverting to Insertion sort on arrays of * size <= 20. Dimension of STACK limits N to about 2**32. * * Arguments * ========= * * ID (input) CHARACTER*1 * = 'I': sort D in increasing order; * = 'D': sort D in decreasing order. * * N (input) INTEGER * The length of the array D. * * D (input/output) REAL array, dimension (N) * On entry, the array to be sorted. * On exit, D has been sorted into increasing order * (D(1) <= ... <= D(N) ) or into decreasing order * (D(1) >= ... >= D(N) ), depending on ID. * * KEY (input/output) INTEGER array, dimension (N) * On entry, KEY contains a key to each of the entries in D() * Typically, KEY(I) = I for all I * On exit, KEY is permuted in exactly the same manner as * D() was permuted from input to output * Therefore, if KEY(I) = I for all I upon input, then * D_out(I) = D_in(KEY(I)) * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. INTEGER SELECT PARAMETER ( SELECT = 20 ) * .. * .. Local Scalars .. INTEGER DIR, ENDD, I, J, START, STKPNT, TMPKEY REAL D1, D2, D3, DMNMX, TMP * .. * .. Local Arrays .. INTEGER STACK( 2, 32 ) * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input paramters. * * INFO = 0 DIR = -1 IF( LSAME( ID, 'D' ) ) THEN DIR = 0 ELSE IF( LSAME( ID, 'I' ) ) THEN DIR = 1 END IF IF( DIR.EQ.-1 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLASRT2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.LE.1 ) $ RETURN * STKPNT = 1 STACK( 1, 1 ) = 1 STACK( 2, 1 ) = N 10 CONTINUE START = STACK( 1, STKPNT ) ENDD = STACK( 2, STKPNT ) STKPNT = STKPNT - 1 IF( ENDD-START.GT.0 ) THEN * * Do Insertion sort on D( START:ENDD ) * IF( DIR.EQ.0 ) THEN * * Sort into decreasing order * DO 30 I = START + 1, ENDD DO 20 J = I, START + 1, -1 IF( D( J ).GT.D( J-1 ) ) THEN DMNMX = D( J ) D( J ) = D( J-1 ) D( J-1 ) = DMNMX TMPKEY = KEY( J ) KEY( J ) = KEY( J-1 ) KEY( J-1 ) = TMPKEY ELSE GO TO 30 END IF 20 CONTINUE 30 CONTINUE * ELSE * * Sort into increasing order * DO 50 I = START + 1, ENDD DO 40 J = I, START + 1, -1 IF( D( J ).LT.D( J-1 ) ) THEN DMNMX = D( J ) D( J ) = D( J-1 ) D( J-1 ) = DMNMX TMPKEY = KEY( J ) KEY( J ) = KEY( J-1 ) KEY( J-1 ) = TMPKEY ELSE GO TO 50 END IF 40 CONTINUE 50 CONTINUE * END IF * ELSE IF( ENDD-START.GT.SELECT ) THEN * * Partition D( START:ENDD ) and stack parts, largest one first * * Choose partition entry as median of 3 * D1 = D( START ) D2 = D( ENDD ) I = ( START+ENDD ) / 2 D3 = D( I ) IF( D1.LT.D2 ) THEN IF( D3.LT.D1 ) THEN DMNMX = D1 ELSE IF( D3.LT.D2 ) THEN DMNMX = D3 ELSE DMNMX = D2 END IF ELSE IF( D3.LT.D2 ) THEN DMNMX = D2 ELSE IF( D3.LT.D1 ) THEN DMNMX = D3 ELSE DMNMX = D1 END IF END IF * IF( DIR.EQ.0 ) THEN * * Sort into decreasing order * I = START - 1 J = ENDD + 1 60 CONTINUE 70 CONTINUE J = J - 1 IF( D( J ).LT.DMNMX ) $ GO TO 70 80 CONTINUE I = I + 1 IF( D( I ).GT.DMNMX ) $ GO TO 80 IF( I.LT.J ) THEN TMP = D( I ) D( I ) = D( J ) D( J ) = TMP TMPKEY = KEY( J ) KEY( J ) = KEY( I ) KEY( I ) = TMPKEY GO TO 60 END IF IF( J-START.GT.ENDD-J-1 ) THEN STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD ELSE STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J END IF ELSE * * Sort into increasing order * I = START - 1 J = ENDD + 1 90 CONTINUE 100 CONTINUE J = J - 1 IF( D( J ).GT.DMNMX ) $ GO TO 100 110 CONTINUE I = I + 1 IF( D( I ).LT.DMNMX ) $ GO TO 110 IF( I.LT.J ) THEN TMP = D( I ) D( I ) = D( J ) D( J ) = TMP TMPKEY = KEY( J ) KEY( J ) = KEY( I ) KEY( I ) = TMPKEY GO TO 90 END IF IF( J-START.GT.ENDD-J-1 ) THEN STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD ELSE STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = J + 1 STACK( 2, STKPNT ) = ENDD STKPNT = STKPNT + 1 STACK( 1, STKPNT ) = START STACK( 2, STKPNT ) = J END IF END IF END IF IF( STKPNT.GT.0 ) $ GO TO 10 * * RETURN * * End of SLASRT2 * END scalapack-2.0.2/SRC/spttrsv.f000644 000766 000024 00000007113 11657111056 016241 0ustar00juliestaff000000 000000 SUBROUTINE SPTTRSV( TRANS, N, NRHS, D, E, B, LDB, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * November, 1996. * Modified from SPTTRS: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. CHARACTER TRANS INTEGER INFO, LDB, N, NRHS * .. * .. Array Arguments .. REAL D( * ) REAL B( LDB, * ), E( * ) * .. * * Purpose * ======= * * SPTTRSV solves one of the triangular systems * L**T* X = B, or L * X = B, * where L is the Cholesky factor of a Hermitian positive * definite tridiagonal matrix A such that * A = L*D*L**H (computed by SPTTRF). * * Arguments * ========= * * TRANS (input) CHARACTER * Specifies the form of the system of equations: * = 'N': L * X = B (No transpose) * = 'T': L**T * X = B (Transpose) * * N (input) INTEGER * The order of the tridiagonal matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * D (input) REAL array, dimension (N) * The n diagonal elements of the diagonal matrix D from the * factorization computed by SPTTRF. * * E (input) COMPLEX array, dimension (N-1) * The (n-1) off-diagonal elements of the unit bidiagonal * factor U or L from the factorization computed by SPTTRF * (see UPLO). * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Local Scalars .. LOGICAL NOTRAN INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input arguments. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SPTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN IF( NOTRAN ) THEN * DO 60 J = 1, NRHS * * Solve L * x = b. * DO 40 I = 2, N B( I, J ) = B( I, J ) - B( I-1, J )*E( I-1 ) 40 CONTINUE 60 CONTINUE * ELSE * DO 65 J = 1, NRHS * * Solve L**H * x = b. * DO 50 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - $ B( I+1, J )*( E( I ) ) 50 CONTINUE 65 CONTINUE ENDIF * RETURN * * End of SPTTRS * END scalapack-2.0.2/SRC/sstegr2.f000644 000766 000024 00000044346 11657111056 016116 0ustar00juliestaff000000 000000 SUBROUTINE SSTEGR2( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, $ M, W, Z, LDZ, NZC, ISUPPZ, WORK, LWORK, IWORK, $ LIWORK, DOL, DOU, ZOFFSET, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE INTEGER DOL, DOU, IL, INFO, IU, $ LDZ, NZC, LIWORK, LWORK, M, N, ZOFFSET REAL VL, VU * .. * .. Array Arguments .. INTEGER ISUPPZ( * ), IWORK( * ) REAL D( * ), E( * ), W( * ), WORK( * ) REAL Z( LDZ, * ) * .. * * Purpose * ======= * * SSTEGR2 computes selected eigenvalues and, optionally, eigenvectors * of a real symmetric tridiagonal matrix T. It is invoked in the * ScaLAPACK MRRR driver PSSYEVR and the corresponding Hermitian * version either when only eigenvalues are to be computed, or when only * a single processor is used (the sequential-like case). * * SSTEGR2 has been adapted from LAPACK's SSTEGR. Please note the * following crucial changes. * * 1. The calling sequence has two additional INTEGER parameters, * DOL and DOU, that should satisfy M>=DOU>=DOL>=1. * SSTEGR2 ONLY computes the eigenpairs * corresponding to eigenvalues DOL through DOU in W. (That is, * instead of computing the eigenpairs belonging to W(1) * through W(M), only the eigenvectors belonging to eigenvalues * W(DOL) through W(DOU) are computed. In this case, only the * eigenvalues DOL:DOU are guaranteed to be fully accurate. * * 2. M is NOT the number of eigenvalues specified by RANGE, but is * M = DOU - DOL + 1. This concerns the case where only eigenvalues * are computed, but on more than one processor. Thus, in this case * M refers to the number of eigenvalues computed on this processor. * * 3. The arrays W and Z might not contain all the wanted eigenpairs * locally, instead this information is distributed over other * processors. * * Arguments * ========= * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the half-open interval (VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) REAL array, dimension (N) * On entry, the N diagonal elements of the tridiagonal matrix * T. On exit, D is overwritten. * * E (input/output) REAL array, dimension (N) * On entry, the (N-1) subdiagonal elements of the tridiagonal * matrix T in elements 1 to N-1 of E. E(N) need not be set on * input, but is used internally as workspace. * On exit, E is overwritten. * * VL (input) REAL * VU (input) REAL * If RANGE='V', the lower and upper bounds of the interval to * be searched for eigenvalues. VL < VU. * Not referenced if RANGE = 'A' or 'I'. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N, if N > 0. * Not referenced if RANGE = 'A' or 'V'. * * M (output) INTEGER * Globally summed over all processors, M equals * the total number of eigenvalues found. 0 <= M <= N. * If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. * The local output equals M = DOU - DOL + 1. * * W (output) REAL array, dimension (N) * The first M elements contain the selected eigenvalues in * ascending order. Note that immediately after exiting this * routine, only the eigenvalues from * position DOL:DOU are to reliable on this processor * because the eigenvalue computation is done in parallel. * Other processors will hold reliable information on other * parts of the W array. This information is communicated in * the ScaLAPACK driver. * * Z (output) REAL array, dimension (LDZ, max(1,M) ) * If JOBZ = 'V', and if INFO = 0, then the first M columns of Z * contain some of the orthonormal eigenvectors of the matrix T * corresponding to the selected eigenvalues, with the i-th * column of Z holding the eigenvector associated with W(i). * If JOBZ = 'N', then Z is not referenced. * Note: the user must ensure that at least max(1,M) columns are * supplied in the array Z; if RANGE = 'V', the exact value of M * is not known in advance and can be computed with a workspace * query by setting NZC = -1, see below. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * JOBZ = 'V', then LDZ >= max(1,N). * * NZC (input) INTEGER * The number of eigenvectors to be held in the array Z. * If RANGE = 'A', then NZC >= max(1,N). * If RANGE = 'V', then NZC >= the number of eigenvalues in (VL,VU]. * If RANGE = 'I', then NZC >= IU-IL+1. * If NZC = -1, then a workspace query is assumed; the * routine calculates the number of columns of the array Z that * are needed to hold the eigenvectors. * This value is returned as the first entry of the Z array, and * no error message related to NZC is issued. * * ISUPPZ (output) INTEGER ARRAY, dimension ( 2*max(1,M) ) * The support of the eigenvectors in Z, i.e., the indices * indicating the nonzero elements in Z. The i-th computed eigenvector * is nonzero only in elements ISUPPZ( 2*i-1 ) through * ISUPPZ( 2*i ). This is relevant in the case when the matrix * is split. ISUPPZ is only set if N>2. * * WORK (workspace/output) REAL array, dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal * (and minimal) LWORK. * * LWORK (input) INTEGER * The dimension of the array WORK. LWORK >= max(1,18*N) * if JOBZ = 'V', and LWORK >= max(1,12*N) if JOBZ = 'N'. * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued. * * IWORK (workspace/output) INTEGER array, dimension (LIWORK) * On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. LIWORK >= max(1,10*N) * if the eigenvectors are desired, and LIWORK >= max(1,8*N) * if only the eigenvalues are to be computed. * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued. * * DOL (input) INTEGER * DOU (input) INTEGER * From the eigenvalues W(1:M), only eigenvectors * Z(:,DOL) to Z(:,DOU) are computed. * If DOL > 1, then Z(:,DOL-1-ZOFFSET) is used and overwritten. * If DOU < M, then Z(:,DOU+1-ZOFFSET) is used and overwritten. * * ZOFFSET (input) INTEGER * Offset for storing the eigenpairs when Z is distributed * in 1D-cyclic fashion * * INFO (output) INTEGER * On exit, INFO * = 0: successful exit * other:if INFO = -i, the i-th argument had an illegal value * if INFO = 10X, internal error in SLARRE2, * if INFO = 20X, internal error in SLARRV. * Here, the digit X = ABS( IINFO ) < 10, where IINFO is * the nonzero error code returned by SLARRE2 or * SLARRV, respectively. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE, FOUR, MINRGP PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, $ FOUR = 4.0E0, $ MINRGP = 3.0E-3 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY INTEGER I, IIL, IINDBL, IINDW, IINDWK, IINFO, IINSPL, $ IIU, INDE2, INDERR, INDGP, INDGRS, INDWRK, $ ITMP, ITMP2, J, JJ, LIWMIN, LWMIN, NSPLIT, $ NZCMIN REAL BIGNUM, EPS, PIVMIN, RMAX, RMIN, RTOL1, RTOL2, $ SAFMIN, SCALE, SMLNUM, THRESH, TMP, TNRM, WL, $ WU * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH, SLANST EXTERNAL LSAME, SLAMCH, SLANST * .. * .. External Subroutines .. EXTERNAL SCOPY, SLAE2, SLAEV2, SLARRC, SLARRE2, $ SLARRV, SLASRT, SSCAL, SSWAP * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, REAL, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * WANTZ = LSAME( JOBZ, 'V' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) * SSTEGR2 needs WORK of size 6*N, IWORK of size 3*N. * In addition, SLARRE2 needs WORK of size 6*N, IWORK of size 5*N. * Furthermore, SLARRV needs WORK of size 12*N, IWORK of size 7*N. IF( WANTZ ) THEN LWMIN = 18*N LIWMIN = 10*N ELSE * need less workspace if only the eigenvalues are wanted LWMIN = 12*N LIWMIN = 8*N ENDIF WL = ZERO WU = ZERO IIL = 0 IIU = 0 IF( VALEIG ) THEN * We do not reference VL, VU in the cases RANGE = 'I','A' * The interval (WL, WU] contains all the wanted eigenvalues. * It is either given by the user or computed in SLARRE2. WL = VL WU = VU ELSEIF( INDEIG ) THEN * We do not reference IL, IU in the cases RANGE = 'V','A' IIL = IL IIU = IU ENDIF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( VALEIG .AND. N.GT.0 .AND. WU.LE.WL ) THEN INFO = -7 ELSE IF( INDEIG .AND. ( IIL.LT.1 .OR. IIL.GT.N ) ) THEN INFO = -8 ELSE IF( INDEIG .AND. ( IIU.LT.IIL .OR. IIU.GT.N ) ) THEN INFO = -9 ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN INFO = -13 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * * Get machine constants. * SAFMIN = SLAMCH( 'Safe minimum' ) EPS = SLAMCH( 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * IF( INFO.EQ.0 ) THEN WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * IF( WANTZ .AND. ALLEIG ) THEN NZCMIN = N IIL = 1 IIU = N ELSE IF( WANTZ .AND. VALEIG ) THEN CALL SLARRC( 'T', N, VL, VU, D, E, SAFMIN, $ NZCMIN, ITMP, ITMP2, INFO ) IIL = ITMP+1 IIU = ITMP2 ELSE IF( WANTZ .AND. INDEIG ) THEN NZCMIN = IIU-IIL+1 ELSE * WANTZ .EQ. FALSE. NZCMIN = 0 ENDIF IF( ZQUERY .AND. INFO.EQ.0 ) THEN Z( 1,1 ) = NZCMIN ELSE IF( NZC.LT.NZCMIN .AND. .NOT.ZQUERY ) THEN INFO = -14 END IF END IF IF ( WANTZ ) THEN IF ( DOL.LT.1 .OR. DOL.GT.NZCMIN ) THEN INFO = -20 ENDIF IF ( DOU.LT.1 .OR. DOU.GT.NZCMIN .OR. DOU.LT.DOL) THEN INFO = -21 ENDIF ENDIF IF( INFO.NE.0 ) THEN * C Disable sequential error handler C for parallel case C CALL XERBLA( 'SSTEGR2', -INFO ) * RETURN ELSE IF( LQUERY .OR. ZQUERY ) THEN RETURN END IF * * Quick return if possible * M = 0 IF( N.EQ.0 ) $ RETURN * IF( N.EQ.1 ) THEN IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = D( 1 ) ELSE IF( WL.LT.D( 1 ) .AND. WU.GE.D( 1 ) ) THEN M = 1 W( 1 ) = D( 1 ) END IF END IF IF( WANTZ ) $ Z( 1, 1 ) = ONE RETURN END IF * INDGRS = 1 INDERR = 2*N + 1 INDGP = 3*N + 1 INDE2 = 5*N + 1 INDWRK = 6*N + 1 * IINSPL = 1 IINDBL = N + 1 IINDW = 2*N + 1 IINDWK = 3*N + 1 * * Scale matrix to allowable range, if necessary. * SCALE = ONE TNRM = SLANST( 'M', N, D, E ) IF( TNRM.GT.ZERO .AND. TNRM.LT.RMIN ) THEN SCALE = RMIN / TNRM ELSE IF( TNRM.GT.RMAX ) THEN SCALE = RMAX / TNRM END IF IF( SCALE.NE.ONE ) THEN CALL SSCAL( N, SCALE, D, 1 ) CALL SSCAL( N-1, SCALE, E, 1 ) TNRM = TNRM*SCALE IF( VALEIG ) THEN * If eigenvalues in interval have to be found, * scale (WL, WU] accordingly WL = WL*SCALE WU = WU*SCALE ENDIF END IF * * Compute the desired eigenvalues of the tridiagonal after splitting * into smaller subblocks if the corresponding off-diagonal elements * are small * THRESH is the splitting parameter for SLARRE2 * A negative THRESH forces the old splitting criterion based on the * size of the off-diagonal. A positive THRESH switches to splitting * which preserves relative accuracy. * IINFO = -1 * Set the splitting criterion IF (IINFO.EQ.0) THEN THRESH = EPS ELSE THRESH = -EPS ENDIF * * Store the squares of the offdiagonal values of T DO 5 J = 1, N-1 WORK( INDE2+J-1 ) = E(J)**2 5 CONTINUE * Set the tolerance parameters for bisection IF( .NOT.WANTZ ) THEN * SLARRE2 computes the eigenvalues to full precision. RTOL1 = FOUR * EPS RTOL2 = FOUR * EPS ELSE * SLARRE2 computes the eigenvalues to less than full precision. * SLARRV will refine the eigenvalue approximations, and we can * need less accurate initial bisection in SLARRE2. * Note: these settings do only affect the subset case and SLARRE2 RTOL1 = SQRT(EPS) RTOL2 = MAX( SQRT(EPS)*5.0E-3, FOUR * EPS ) ENDIF CALL SLARRE2( RANGE, N, WL, WU, IIL, IIU, D, E, $ WORK(INDE2), RTOL1, RTOL2, THRESH, NSPLIT, $ IWORK( IINSPL ), M, DOL, DOU, $ W, WORK( INDERR ), $ WORK( INDGP ), IWORK( IINDBL ), $ IWORK( IINDW ), WORK( INDGRS ), PIVMIN, $ WORK( INDWRK ), IWORK( IINDWK ), IINFO ) IF( IINFO.NE.0 ) THEN INFO = 100 + ABS( IINFO ) RETURN END IF * Note that if RANGE .NE. 'V', SLARRE2 computes bounds on the desired * part of the spectrum. All desired eigenvalues are contained in * (WL,WU] IF( WANTZ ) THEN * * Compute the desired eigenvectors corresponding to the computed * eigenvalues * CALL SLARRV( N, WL, WU, D, E, $ PIVMIN, IWORK( IINSPL ), M, $ DOL, DOU, MINRGP, RTOL1, RTOL2, $ W, WORK( INDERR ), WORK( INDGP ), IWORK( IINDBL ), $ IWORK( IINDW ), WORK( INDGRS ), Z, LDZ, $ ISUPPZ, WORK( INDWRK ), IWORK( IINDWK ), IINFO ) IF( IINFO.NE.0 ) THEN INFO = 200 + ABS( IINFO ) RETURN END IF ELSE * SLARRE2 computes eigenvalues of the (shifted) root representation * SLARRV returns the eigenvalues of the unshifted matrix. * However, if the eigenvectors are not desired by the user, we need * to apply the corresponding shifts from SLARRE2 to obtain the * eigenvalues of the original matrix. DO 20 J = 1, M ITMP = IWORK( IINDBL+J-1 ) W( J ) = W( J ) + E( IWORK( IINSPL+ITMP-1 ) ) 20 CONTINUE END IF * * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( SCALE.NE.ONE ) THEN CALL SSCAL( M, ONE / SCALE, W, 1 ) END IF * * Correct M if needed * IF ( WANTZ ) THEN IF( DOL.NE.1 .OR. DOU.NE.M ) THEN M = DOU - DOL +1 ENDIF ENDIF * * If eigenvalues are not in increasing order, then sort them, * possibly along with eigenvectors. * IF( NSPLIT.GT.1 ) THEN IF( .NOT. WANTZ ) THEN CALL SLASRT( 'I', DOU - DOL +1, W(DOL), IINFO ) IF( IINFO.NE.0 ) THEN INFO = 3 RETURN END IF ELSE DO 60 J = DOL, DOU - 1 I = 0 TMP = W( J ) DO 50 JJ = J + 1, M IF( W( JJ ).LT.TMP ) THEN I = JJ TMP = W( JJ ) END IF 50 CONTINUE IF( I.NE.0 ) THEN W( I ) = W( J ) W( J ) = TMP IF( WANTZ ) THEN CALL SSWAP( N, Z( 1, I-ZOFFSET ), $ 1, Z( 1, J-ZOFFSET ), 1 ) ITMP = ISUPPZ( 2*I-1 ) ISUPPZ( 2*I-1 ) = ISUPPZ( 2*J-1 ) ISUPPZ( 2*J-1 ) = ITMP ITMP = ISUPPZ( 2*I ) ISUPPZ( 2*I ) = ISUPPZ( 2*J ) ISUPPZ( 2*J ) = ITMP END IF END IF 60 CONTINUE END IF ENDIF * WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN RETURN * * End of SSTEGR2 * END scalapack-2.0.2/SRC/sstegr2a.f000644 000766 000024 00000041150 11657111056 016245 0ustar00juliestaff000000 000000 SUBROUTINE SSTEGR2A( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, $ M, W, Z, LDZ, NZC, WORK, LWORK, IWORK, $ LIWORK, DOL, DOU, NEEDIL, NEEDIU, $ INDERR, NSPLIT, PIVMIN, SCALE, WL, WU, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER JOBZ, RANGE INTEGER DOL, DOU, IL, INDERR, INFO, IU, LDZ, LIWORK, $ LWORK, M, N, NEEDIL, NEEDIU, NSPLIT, NZC REAL PIVMIN, SCALE, VL, VU, WL, WU * .. * .. Array Arguments .. INTEGER IWORK( * ) REAL D( * ), E( * ), W( * ), WORK( * ) REAL Z( LDZ, * ) * .. * * Purpose * ======= * * SSTEGR2A computes selected eigenvalues and initial representations. * needed for eigenvector computations in SSTEGR2B. It is invoked in the * ScaLAPACK MRRR driver PSSYEVR and the corresponding Hermitian * version when both eigenvalues and eigenvectors are computed in parallel. * on multiple processors. For this case, SSTEGR2A implements the FIRST * part of the MRRR algorithm, parallel eigenvalue computation and finding * the root RRR. At the end of SSTEGR2A, * other processors might have a part of the spectrum that is needed to * continue the computation locally. Once this eigenvalue information has * been received by the processor, the computation can then proceed by calling * the SECOND part of the parallel MRRR algorithm, SSTEGR2B. * * Please note: * 1. The calling sequence has two additional INTEGER parameters, * (compared to LAPACK's SSTEGR), these are * DOL and DOU and should satisfy M>=DOU>=DOL>=1. * These parameters are only relevant for the case JOBZ = 'V'. * * Globally invoked over all processors, SSTEGR2A computes * ALL the eigenVALUES specified by RANGE. * RANGE= 'A': all eigenvalues will be found. * = 'V': all eigenvalues in (VL,VU] will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * SSTEGR2A LOCALLY only computes the eigenvalues * corresponding to eigenvalues DOL through DOU in W. (That is, * instead of computing the eigenvectors belonging to W(1) * through W(M), only the eigenvectors belonging to eigenvalues * W(DOL) through W(DOU) are computed. In this case, only the * eigenvalues DOL:DOU are guaranteed to be fully accurate. * * 2. M is NOT the number of eigenvalues specified by RANGE, but it is * M = DOU - DOL + 1. Instead, M refers to the number of eigenvalues computed on * this processor. * * 3. While no eigenvectors are computed in SSTEGR2A itself (this is * done later in SSTEGR2B), the interface * If JOBZ = 'V' then, depending on RANGE and DOL, DOU, SSTEGR2A * might need more workspace in Z then the original SSTEGR. * In particular, the arrays W and Z might not contain all the wanted eigenpairs * locally, instead this information is distributed over other * processors. * * Arguments * ========= * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * RANGE (input) CHARACTER*1 * = 'A': all eigenvalues will be found. * = 'V': all eigenvalues in the half-open interval (VL,VU] * will be found. * = 'I': the IL-th through IU-th eigenvalues will be found. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) REAL array, dimension (N) * On entry, the N diagonal elements of the tridiagonal matrix * T. On exit, D is overwritten. * * E (input/output) REAL array, dimension (N) * On entry, the (N-1) subdiagonal elements of the tridiagonal * matrix T in elements 1 to N-1 of E. E(N) need not be set on * input, but is used internally as workspace. * On exit, E is overwritten. * * VL (input) REAL * VU (input) REAL * If RANGE='V', the lower and upper bounds of the interval to * be searched for eigenvalues. VL < VU. * Not referenced if RANGE = 'A' or 'I'. * * IL (input) INTEGER * IU (input) INTEGER * If RANGE='I', the indices (in ascending order) of the * smallest and largest eigenvalues to be returned. * 1 <= IL <= IU <= N, if N > 0. * Not referenced if RANGE = 'A' or 'V'. * * M (output) INTEGER * Globally summed over all processors, M equals * the total number of eigenvalues found. 0 <= M <= N. * If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. * The local output equals M = DOU - DOL + 1. * * W (output) REAL array, dimension (N) * The first M elements contain approximations to the selected * eigenvalues in ascending order. Note that immediately after * exiting this routine, only the eigenvalues from * position DOL:DOU are to reliable on this processor * because the eigenvalue computation is done in parallel. * The other entries outside DOL:DOU are very crude preliminary * approximations. Other processors hold reliable information on * these other parts of the W array. * This information is communicated in the ScaLAPACK driver. * * Z (output) REAL array, dimension (LDZ, max(1,M) ) * SSTEGR2A does not compute eigenvectors, this is done * in SSTEGR2B. The argument Z as well as all related * other arguments only appear to keep the interface consistent * and to signal to the user that this subroutine is meant to * be used when eigenvectors are computed. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * JOBZ = 'V', then LDZ >= max(1,N). * * NZC (input) INTEGER * The number of eigenvectors to be held in the array Z. * If RANGE = 'A', then NZC >= max(1,N). * If RANGE = 'V', then NZC >= the number of eigenvalues in (VL,VU]. * If RANGE = 'I', then NZC >= IU-IL+1. * If NZC = -1, then a workspace query is assumed; the * routine calculates the number of columns of the array Z that * are needed to hold the eigenvectors. * This value is returned as the first entry of the Z array, and * no error message related to NZC is issued. * * WORK (workspace/output) REAL array, dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal * (and minimal) LWORK. * * LWORK (input) INTEGER * The dimension of the array WORK. LWORK >= max(1,18*N) * if JOBZ = 'V', and LWORK >= max(1,12*N) if JOBZ = 'N'. * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued. * * IWORK (workspace/output) INTEGER array, dimension (LIWORK) * On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. LIWORK >= max(1,10*N) * if the eigenvectors are desired, and LIWORK >= max(1,8*N) * if only the eigenvalues are to be computed. * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued. * * DOL (input) INTEGER * DOU (input) INTEGER * From all the eigenvalues W(1:M), only eigenvalues * W(DOL:DOU) are computed. * * NEEDIL (output) INTEGER * NEEDIU (output) INTEGER * The indices of the leftmost and rightmost eigenvalues * needed to accurately compute the relevant part of the * representation tree. This information can be used to * find out which processors have the relevant eigenvalue * information needed so that it can be communicated. * * INDERR (output) INTEGER * INDERR points to the place in the work space where * the eigenvalue uncertainties (errors) are stored. * * NSPLIT (output) INTEGER * The number of blocks T splits into. 1 <= NSPLIT <= N. * * PIVMIN (output) REAL * The minimum pivot in the sturm sequence for T. * * SCALE (output) REAL * The scaling factor for the tridiagonal T. * * WL (output) REAL * WU (output) REAL * The interval (WL, WU] contains all the wanted eigenvalues. * It is either given by the user or computed in SLARRE2A. * * INFO (output) INTEGER * On exit, INFO * = 0: successful exit * other:if INFO = -i, the i-th argument had an illegal value * if INFO = 10X, internal error in SLARRE2A, * Here, the digit X = ABS( IINFO ) < 10, where IINFO is * the nonzero error code returned by SLARRE2A. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE, FOUR, MINRGP PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, $ FOUR = 4.0E0, $ MINRGP = 3.0E-3 ) * .. * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY INTEGER IIL, IINDBL, IINDW, IINDWK, IINFO, IINSPL, IIU, $ INDE2, INDGP, INDGRS, INDSDM, INDWRK, ITMP, $ ITMP2, J, LIWMIN, LWMIN, NZCMIN REAL BIGNUM, EPS, RMAX, RMIN, RTOL1, RTOL2, SAFMIN, $ SMLNUM, THRESH, TNRM * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH, SLANST EXTERNAL LSAME, SLAMCH, SLANST * .. * .. External Subroutines .. EXTERNAL SLARRC, SLARRE2A, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, REAL, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * WANTZ = LSAME( JOBZ, 'V' ) ALLEIG = LSAME( RANGE, 'A' ) VALEIG = LSAME( RANGE, 'V' ) INDEIG = LSAME( RANGE, 'I' ) * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) * SSTEGR2A needs WORK of size 6*N, IWORK of size 3*N. * In addition, SLARRE2A needs WORK of size 6*N, IWORK of size 5*N. * Furthermore, SLARRV2 needs WORK of size 12*N, IWORK of size 7*N. * Workspace is kept consistent with SSTEGR2B even though * SLARRV2 is not called here. IF( WANTZ ) THEN LWMIN = 18*N LIWMIN = 10*N ELSE * need less workspace if only the eigenvalues are wanted LWMIN = 12*N LIWMIN = 8*N ENDIF WL = ZERO WU = ZERO IIL = 0 IIU = 0 IF( VALEIG ) THEN * We do not reference VL, VU in the cases RANGE = 'I','A' * The interval (WL, WU] contains all the wanted eigenvalues. * It is either given by the user or computed in SLARRE2A. WL = VL WU = VU ELSEIF( INDEIG ) THEN * We do not reference IL, IU in the cases RANGE = 'V','A' IIL = IL IIU = IU ENDIF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( ALLEIG .OR. VALEIG .OR. INDEIG ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( VALEIG .AND. N.GT.0 .AND. WU.LE.WL ) THEN INFO = -7 ELSE IF( INDEIG .AND. ( IIL.LT.1 .OR. IIL.GT.N ) ) THEN INFO = -8 ELSE IF( INDEIG .AND. ( IIU.LT.IIL .OR. IIU.GT.N ) ) THEN INFO = -9 ELSE IF( LDZ.LT.1 .OR. ( WANTZ .AND. LDZ.LT.N ) ) THEN INFO = -13 ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -17 ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * * Get machine constants. * SAFMIN = SLAMCH( 'Safe minimum' ) EPS = SLAMCH( 'Precision' ) SMLNUM = SAFMIN / EPS BIGNUM = ONE / SMLNUM RMIN = SQRT( SMLNUM ) RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * IF( INFO.EQ.0 ) THEN WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * IF( WANTZ .AND. ALLEIG ) THEN NZCMIN = N IIL = 1 IIU = N ELSE IF( WANTZ .AND. VALEIG ) THEN CALL SLARRC( 'T', N, VL, VU, D, E, SAFMIN, $ NZCMIN, ITMP, ITMP2, INFO ) IIL = ITMP+1 IIU = ITMP2 ELSE IF( WANTZ .AND. INDEIG ) THEN NZCMIN = IIU-IIL+1 ELSE * WANTZ .EQ. FALSE. NZCMIN = 0 ENDIF IF( ZQUERY .AND. INFO.EQ.0 ) THEN Z( 1,1 ) = NZCMIN ELSE IF( NZC.LT.NZCMIN .AND. .NOT.ZQUERY ) THEN INFO = -14 END IF END IF IF ( WANTZ ) THEN IF ( DOL.LT.1 .OR. DOL.GT.NZCMIN ) THEN INFO = -20 ENDIF IF ( DOU.LT.1 .OR. DOU.GT.NZCMIN .OR. DOU.LT.DOL) THEN INFO = -21 ENDIF ENDIF IF( INFO.NE.0 ) THEN * C Disable sequential error handler C for parallel case C CALL XERBLA( 'SSTEGR2A', -INFO ) * RETURN ELSE IF( LQUERY .OR. ZQUERY ) THEN RETURN END IF * Initialize NEEDIL and NEEDIU, these values are changed in SLARRE2A NEEDIL = DOU NEEDIU = DOL * * Quick return if possible * M = 0 IF( N.EQ.0 ) $ RETURN * IF( N.EQ.1 ) THEN IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = D( 1 ) ELSE IF( WL.LT.D( 1 ) .AND. WU.GE.D( 1 ) ) THEN M = 1 W( 1 ) = D( 1 ) END IF END IF IF( WANTZ ) $ Z( 1, 1 ) = ONE RETURN END IF * INDGRS = 1 INDERR = 2*N + 1 INDGP = 3*N + 1 INDSDM = 4*N + 1 INDE2 = 5*N + 1 INDWRK = 6*N + 1 * IINSPL = 1 IINDBL = N + 1 IINDW = 2*N + 1 IINDWK = 3*N + 1 * * Scale matrix to allowable range, if necessary. * SCALE = ONE TNRM = SLANST( 'M', N, D, E ) IF( TNRM.GT.ZERO .AND. TNRM.LT.RMIN ) THEN SCALE = RMIN / TNRM ELSE IF( TNRM.GT.RMAX ) THEN SCALE = RMAX / TNRM END IF IF( SCALE.NE.ONE ) THEN CALL SSCAL( N, SCALE, D, 1 ) CALL SSCAL( N-1, SCALE, E, 1 ) TNRM = TNRM*SCALE IF( VALEIG ) THEN * If eigenvalues in interval have to be found, * scale (WL, WU] accordingly WL = WL*SCALE WU = WU*SCALE ENDIF END IF * * Compute the desired eigenvalues of the tridiagonal after splitting * into smaller subblocks if the corresponding off-diagonal elements * are small * THRESH is the splitting parameter for SLARRA in SLARRE2A * A negative THRESH forces the old splitting criterion based on the * size of the off-diagonal. THRESH = -EPS IINFO = 0 * Store the squares of the offdiagonal values of T DO 5 J = 1, N-1 WORK( INDE2+J-1 ) = E(J)**2 5 CONTINUE * Set the tolerance parameters for bisection IF( .NOT.WANTZ ) THEN * SLARRE2A computes the eigenvalues to full precision. RTOL1 = FOUR * EPS RTOL2 = FOUR * EPS ELSE * SLARRE2A computes the eigenvalues to less than full precision. * SLARRV2 will refine the eigenvalue approximations, and we can * need less accurate initial bisection in SLARRE2A. RTOL1 = FOUR*SQRT(EPS) RTOL2 = MAX( SQRT(EPS)*5.0E-3, FOUR * EPS ) ENDIF CALL SLARRE2A( RANGE, N, WL, WU, IIL, IIU, D, E, $ WORK(INDE2), RTOL1, RTOL2, THRESH, NSPLIT, $ IWORK( IINSPL ), M, DOL, DOU, NEEDIL, NEEDIU, $ W, WORK( INDERR ), $ WORK( INDGP ), IWORK( IINDBL ), $ IWORK( IINDW ), WORK( INDGRS ), $ WORK( INDSDM ), PIVMIN, $ WORK( INDWRK ), IWORK( IINDWK ), $ MINRGP, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 100 + ABS( IINFO ) RETURN END IF * Note that if RANGE .NE. 'V', SLARRE2A computes bounds on the desired * part of the spectrum. All desired eigenvalues are contained in * (WL,WU] RETURN * * End of SSTEGR2A * END scalapack-2.0.2/SRC/sstegr2b.f000644 000766 000024 00000031055 11657111056 016251 0ustar00juliestaff000000 000000 SUBROUTINE SSTEGR2B( JOBZ, N, D, E, $ M, W, Z, LDZ, NZC, ISUPPZ, WORK, LWORK, IWORK, $ LIWORK, DOL, DOU, NEEDIL, NEEDIU, $ INDWLC, PIVMIN, SCALE, WL, WU, $ VSTART, FINISH, MAXCLS, $ NDEPTH, PARITY, ZOFFSET, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * July 4, 2010 * IMPLICIT NONE * * .. Scalar Arguments .. CHARACTER JOBZ INTEGER DOL, DOU, INDWLC, INFO, LDZ, LIWORK, LWORK, M, $ MAXCLS, N, NDEPTH, NEEDIL, NEEDIU, NZC, PARITY, $ ZOFFSET REAL PIVMIN, SCALE, WL, WU LOGICAL VSTART, FINISH * .. * .. Array Arguments .. INTEGER ISUPPZ( * ), IWORK( * ) REAL D( * ), E( * ), W( * ), WORK( * ) REAL Z( LDZ, * ) * .. * * Purpose * ======= * * SSTEGR2B should only be called after a call to SSTEGR2A. * From eigenvalues and initial representations computed by SSTEGR2A, * SSTEGR2B computes the selected eigenvalues and eigenvectors * of the real symmetric tridiagonal matrix in parallel * on multiple processors. It is potentially invoked multiple times * on a given processor because the locally relevant representation tree * might depend on spectral information that is "owned" by other processors * and might need to be communicated. * * Please note: * 1. The calling sequence has two additional INTEGER parameters, * DOL and DOU, that should satisfy M>=DOU>=DOL>=1. * These parameters are only relevant for the case JOBZ = 'V'. * SSTEGR2B ONLY computes the eigenVECTORS * corresponding to eigenvalues DOL through DOU in W. (That is, * instead of computing the eigenvectors belonging to W(1) * through W(M), only the eigenvectors belonging to eigenvalues * W(DOL) through W(DOU) are computed. In this case, only the * eigenvalues DOL:DOU are guaranteed to be accurately refined * to all figures by Rayleigh-Quotient iteration. * * 2. The additional arguments VSTART, FINISH, NDEPTH, PARITY, ZOFFSET * are included as a thread-safe implementation equivalent to SAVE variables. * These variables store details about the local representation tree which is * computed layerwise. For scalability reasons, eigenvalues belonging to the * locally relevant representation tree might be computed on other processors. * These need to be communicated before the inspection of the RRRs can proceed * on any given layer. * Note that only when the variable FINISH is true, the computation has ended * All eigenpairs between DOL and DOU have been computed. M is set = DOU - DOL + 1. * * 3. SSTEGR2B needs more workspace in Z than the sequential SSTEGR. * It is used to store the conformal embedding of the local representation tree. * * Arguments * ========= * * JOBZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only; * = 'V': Compute eigenvalues and eigenvectors. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) REAL array, dimension (N) * On entry, the N diagonal elements of the tridiagonal matrix * T. On exit, D is overwritten. * * E (input/output) REAL array, dimension (N) * On entry, the (N-1) subdiagonal elements of the tridiagonal * matrix T in elements 1 to N-1 of E. E(N) need not be set on * input, but is used internally as workspace. * On exit, E is overwritten. * * M (input) INTEGER * The total number of eigenvalues found * in SSTEGR2A. 0 <= M <= N. * * W (input) REAL array, dimension (N) * The first M elements contain approximations to the selected * eigenvalues in ascending order. Note that only the eigenvalues from * the locally relevant part of the representation tree, that is * all the clusters that include eigenvalues from DOL:DOU, are reliable * on this processor. (It does not need to know about any others anyway.) * * Z (output) REAL array, dimension (LDZ, max(1,M) ) * If JOBZ = 'V', and if INFO = 0, then * a subset of the first M columns of Z * contain the orthonormal eigenvectors of the matrix T * corresponding to the selected eigenvalues, with the i-th * column of Z holding the eigenvector associated with W(i). * See DOL, DOU for more information. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * JOBZ = 'V', then LDZ >= max(1,N). * * NZC (input) INTEGER * The number of eigenvectors to be held in the array Z. * * ISUPPZ (output) INTEGER ARRAY, dimension ( 2*max(1,M) ) * The support of the eigenvectors in Z, i.e., the indices * indicating the nonzero elements in Z. The i-th computed eigenvector * is nonzero only in elements ISUPPZ( 2*i-1 ) through * ISUPPZ( 2*i ). This is relevant in the case when the matrix * is split. ISUPPZ is only set if N>2. * * WORK (workspace/output) REAL array, dimension (LWORK) * On exit, if INFO = 0, WORK(1) returns the optimal * (and minimal) LWORK. * * LWORK (input) INTEGER * The dimension of the array WORK. LWORK >= max(1,18*N) * if JOBZ = 'V', and LWORK >= max(1,12*N) if JOBZ = 'N'. * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued. * * IWORK (workspace/output) INTEGER array, dimension (LIWORK) * On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. * * LIWORK (input) INTEGER * The dimension of the array IWORK. LIWORK >= max(1,10*N) * if the eigenvectors are desired, and LIWORK >= max(1,8*N) * if only the eigenvalues are to be computed. * If LIWORK = -1, then a workspace query is assumed; the * routine only calculates the optimal size of the IWORK array, * returns this value as the first entry of the IWORK array, and * no error message related to LIWORK is issued. * * DOL (input) INTEGER * DOU (input) INTEGER * From the eigenvalues W(1:M), only eigenvectors * Z(:,DOL) to Z(:,DOU) are computed. * If DOL > 1, then Z(:,DOL-1-ZOFFSET) is used and overwritten. * If DOU < M, then Z(:,DOU+1-ZOFFSET) is used and overwritten. * * NEEDIL (input/output) INTEGER * NEEDIU (input/output) INTEGER * Describes which are the left and right outermost eigenvalues * still to be computed. Initially computed by SLARRE2A, * modified in the course of the algorithm. * * INDWLC (output) REAL * Pointer into the workspace, location where the local * eigenvalue representations are stored. ("Local eigenvalues" * are those relative to the individual shifts of the RRRs.) * * PIVMIN (input) REAL * The minimum pivot in the sturm sequence for T. * * SCALE (input) REAL * The scaling factor for T. Used for unscaling the eigenvalues * at the very end of the algorithm. * * WL (input) REAL * WU (input) REAL * The interval (WL, WU] contains all the wanted eigenvalues. * * VSTART (input/output) LOGICAL * .TRUE. on initialization, set to .FALSE. afterwards. * * FINISH (input/output) LOGICAL * indicates whether all eigenpairs have been computed * * MAXCLS (input/output) INTEGER * The largest cluster worked on by this processor in the * representation tree. * * NDEPTH (input/output) INTEGER * The current depth of the representation tree. Set to * zero on initial pass, changed when the deeper levels of * the representation tree are generated. * * PARITY (input/output) INTEGER * An internal parameter needed for the storage of the * clusters on the current level of the representation tree. * * ZOFFSET (input) INTEGER * Offset for storing the eigenpairs when Z is distributed * in 1D-cyclic fashion * * INFO (output) INTEGER * On exit, INFO * = 0: successful exit * other:if INFO = -i, the i-th argument had an illegal value * if INFO = 20X, internal error in SLARRV2. * Here, the digit X = ABS( IINFO ) < 10, where IINFO is * the nonzero error code returned by SLARRV2. * * .. Parameters .. REAL ONE, FOUR, MINRGP PARAMETER ( ONE = 1.0E0, $ FOUR = 4.0E0, $ MINRGP = 3.0E-3 ) * .. * .. Local Scalars .. LOGICAL LQUERY, WANTZ, ZQUERY INTEGER IINDBL, IINDW, IINDWK, IINFO, IINSPL, INDERR, $ INDGP, INDGRS, INDSDM, INDWRK, ITMP, J, LIWMIN, $ LWMIN REAL EPS, RTOL1, RTOL2 * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH, SLANST EXTERNAL LSAME, SLAMCH, SLANST * .. * .. External Subroutines .. EXTERNAL SLARRV2, SSCAL * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, REAL, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * WANTZ = LSAME( JOBZ, 'V' ) * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) * SSTEGR2B needs WORK of size 6*N, IWORK of size 3*N. * In addition, SLARRE2A needed WORK of size 6*N, IWORK of size 5*N. * Workspace is kept consistent even though SLARRE2A is not called here. * Furthermore, SLARRV2 needs WORK of size 12*N, IWORK of size 7*N. IF( WANTZ ) THEN LWMIN = 18*N LIWMIN = 10*N ELSE * need less workspace if only the eigenvalues are wanted LWMIN = 12*N LIWMIN = 8*N ENDIF * INFO = 0 * * Get machine constants. * EPS = SLAMCH( 'Precision' ) * IF( (N.EQ.0).OR.(N.EQ.1) ) THEN FINISH = .TRUE. RETURN ENDIF IF(ZQUERY.OR.LQUERY) $ RETURN * INDGRS = 1 INDERR = 2*N + 1 INDGP = 3*N + 1 INDSDM = 4*N + 1 INDWRK = 6*N + 1 INDWLC = INDWRK * IINSPL = 1 IINDBL = N + 1 IINDW = 2*N + 1 IINDWK = 3*N + 1 * Set the tolerance parameters for bisection RTOL1 = FOUR*SQRT(EPS) RTOL2 = MAX( SQRT(EPS)*5.0E-3, FOUR * EPS ) IF( WANTZ ) THEN * * Compute the desired eigenvectors corresponding to the computed * eigenvalues * CALL SLARRV2( N, WL, WU, D, E, $ PIVMIN, IWORK( IINSPL ), M, $ DOL, DOU, NEEDIL, NEEDIU, MINRGP, RTOL1, RTOL2, $ W, WORK( INDERR ), WORK( INDGP ), IWORK( IINDBL ), $ IWORK( IINDW ), WORK( INDGRS ), $ WORK( INDSDM ), Z, LDZ, $ ISUPPZ, WORK( INDWRK ), IWORK( IINDWK ), $ VSTART, FINISH, $ MAXCLS, NDEPTH, PARITY, ZOFFSET, IINFO ) IF( IINFO.NE.0 ) THEN INFO = 200 + ABS( IINFO ) RETURN END IF * ELSE * SLARRE2A computed eigenvalues of the (shifted) root representation * SLARRV2 returns the eigenvalues of the unshifted matrix. * However, if the eigenvectors are not desired by the user, we need * to apply the corresponding shifts from SLARRE2A to obtain the * eigenvalues of the original matrix. DO 30 J = 1, M ITMP = IWORK( IINDBL+J-1 ) W( J ) = W( J ) + E( IWORK( IINSPL+ITMP-1 ) ) 30 CONTINUE * FINISH = .TRUE. * END IF * IF(FINISH) THEN * All eigenpairs have been computed * * If matrix was scaled, then rescale eigenvalues appropriately. * IF( SCALE.NE.ONE ) THEN CALL SSCAL( M, ONE / SCALE, W, 1 ) END IF * * Correct M if needed * IF ( WANTZ ) THEN IF( DOL.NE.1 .OR. DOU.NE.M ) THEN M = DOU - DOL +1 ENDIF ENDIF * * No sorting of eigenpairs is done here, done later in the * calling subroutine * WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN ENDIF RETURN * * End of SSTEGR2B * END scalapack-2.0.2/SRC/sstein2.f000644 000766 000024 00000026375 10363532303 016111 0ustar00juliestaff000000 000000 * * SUBROUTINE SSTEIN2( N, D, E, M, W, IBLOCK, ISPLIT, ORFAC, Z, LDZ, $ WORK, IWORK, IFAIL, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * May 1, 1997 * * .. Scalar Arguments .. INTEGER INFO, LDZ, M, N REAL ORFAC * .. * .. Array Arguments .. INTEGER IBLOCK( * ), IFAIL( * ), ISPLIT( * ), $ IWORK( * ) REAL D( * ), E( * ), W( * ), WORK( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * SSTEIN2 computes the eigenvectors of a real symmetric tridiagonal * matrix T corresponding to specified eigenvalues, using inverse * iteration. * * The maximum number of iterations allowed for each eigenvector is * specified by an internal parameter MAXITS (currently set to 5). * * Arguments * ========= * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input) REAL array, dimension (N) * The n diagonal elements of the tridiagonal matrix T. * * E (input) REAL array, dimension (N) * The (n-1) subdiagonal elements of the tridiagonal matrix * T, in elements 1 to N-1. E(N) need not be set. * * M (input) INTEGER * The number of eigenvectors to be found. 0 <= M <= N. * * W (input) REAL array, dimension (N) * The first M elements of W contain the eigenvalues for * which eigenvectors are to be computed. The eigenvalues * should be grouped by split-off block and ordered from * smallest to largest within the block. ( The output array * W from SSTEBZ with ORDER = 'B' is expected here. ) * * IBLOCK (input) INTEGER array, dimension (N) * The submatrix indices associated with the corresponding * eigenvalues in W; IBLOCK(i)=1 if eigenvalue W(i) belongs to * the first submatrix from the top, =2 if W(i) belongs to * the second submatrix, etc. ( The output array IBLOCK * from SSTEBZ is expected here. ) * * ISPLIT (input) INTEGER array, dimension (N) * The splitting points, at which T breaks up into submatrices. * The first submatrix consists of rows/columns 1 to * ISPLIT( 1 ), the second of rows/columns ISPLIT( 1 )+1 * through ISPLIT( 2 ), etc. * ( The output array ISPLIT from SSTEBZ is expected here. ) * * ORFAC (input) REAL * ORFAC specifies which eigenvectors should be * orthogonalized. Eigenvectors that correspond to eigenvalues * which are within ORFAC*||T|| of each other are to be * orthogonalized. * * Z (output) REAL array, dimension (LDZ, M) * The computed eigenvectors. The eigenvector associated * with the eigenvalue W(i) is stored in the i-th column of * Z. Any vector which fails to converge is set to its current * iterate after MAXITS iterations. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= max(1,N). * * WORK (workspace) REAL array, dimension (5*N) * * IWORK (workspace) INTEGER array, dimension (N) * * IFAIL (output) INTEGER array, dimension (M) * On normal exit, all elements of IFAIL are zero. * If one or more eigenvectors fail to converge after * MAXITS iterations, then their indices are stored in * array IFAIL. * * INFO (output) INTEGER * = 0: successful exit. * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, then i eigenvectors failed to converge * in MAXITS iterations. Their indices are stored in * array IFAIL. * * Internal Parameters * =================== * * MAXITS INTEGER, default = 5 * The maximum number of iterations performed. * * EXTRA INTEGER, default = 2 * The number of iterations performed after norm growth * criterion is satisfied, should be at least 1. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE, TEN, ODM1 PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TEN = 1.0E+1, $ ODM1 = 1.0E-1 ) INTEGER MAXITS, EXTRA PARAMETER ( MAXITS = 5, EXTRA = 2 ) * .. * .. Local Scalars .. INTEGER B1, BLKSIZ, BN, GPIND, I, IINFO, INDRV1, $ INDRV2, INDRV3, INDRV4, INDRV5, ITS, J, J1, $ JBLK, JMAX, NBLK, NRMCHK REAL EPS, EPS1, NRM, ONENRM, ORTOL, PERTOL, SCL, $ SEP, STPCRT, TOL, XJ, XJM, ZTR * .. * .. Local Arrays .. INTEGER ISEED( 4 ) * .. * .. External Functions .. INTEGER ISAMAX REAL SASUM, SDOT, SLAMCH, SNRM2 EXTERNAL ISAMAX, SASUM, SDOT, SLAMCH, SNRM2 * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SLAGTF, SLAGTS, SLARNV, SSCAL, $ XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 DO 10 I = 1, M IFAIL( I ) = 0 10 CONTINUE * IF( N.LT.0 ) THEN INFO = -1 ELSE IF( M.LT.0 .OR. M.GT.N ) THEN INFO = -4 ELSE IF( ORFAC.LT.ZERO ) THEN INFO = -8 ELSE IF( LDZ.LT.MAX( 1, N ) ) THEN INFO = -10 ELSE DO 20 J = 2, M IF( IBLOCK( J ).LT.IBLOCK( J-1 ) ) THEN INFO = -6 GO TO 30 END IF IF( IBLOCK( J ).EQ.IBLOCK( J-1 ) .AND. W( J ).LT.W( J-1 ) ) $ THEN INFO = -5 GO TO 30 END IF 20 CONTINUE 30 CONTINUE END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSTEIN2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. M.EQ.0 ) THEN RETURN ELSE IF( N.EQ.1 ) THEN Z( 1, 1 ) = ONE RETURN END IF * * Get machine constants. * EPS = SLAMCH( 'Precision' ) * * Initialize seed for random number generator SLARNV. * DO 40 I = 1, 4 ISEED( I ) = 1 40 CONTINUE * * Initialize pointers. * INDRV1 = 0 INDRV2 = INDRV1 + N INDRV3 = INDRV2 + N INDRV4 = INDRV3 + N INDRV5 = INDRV4 + N * * Compute eigenvectors of matrix blocks. * J1 = 1 DO 160 NBLK = 1, IBLOCK( M ) * * Find starting and ending indices of block nblk. * IF( NBLK.EQ.1 ) THEN B1 = 1 ELSE B1 = ISPLIT( NBLK-1 ) + 1 END IF BN = ISPLIT( NBLK ) BLKSIZ = BN - B1 + 1 IF( BLKSIZ.EQ.1 ) $ GO TO 60 GPIND = J1 * * Compute reorthogonalization criterion and stopping criterion. * ONENRM = ABS( D( B1 ) ) + ABS( E( B1 ) ) ONENRM = MAX( ONENRM, ABS( D( BN ) )+ABS( E( BN-1 ) ) ) DO 50 I = B1 + 1, BN - 1 ONENRM = MAX( ONENRM, ABS( D( I ) )+ABS( E( I-1 ) )+ $ ABS( E( I ) ) ) 50 CONTINUE ORTOL = ORFAC*ONENRM * STPCRT = SQRT( ODM1 / BLKSIZ ) * * Loop through eigenvalues of block nblk. * 60 CONTINUE JBLK = 0 DO 150 J = J1, M IF( IBLOCK( J ).NE.NBLK ) THEN J1 = J GO TO 160 END IF JBLK = JBLK + 1 XJ = W( J ) * * Skip all the work if the block size is one. * IF( BLKSIZ.EQ.1 ) THEN WORK( INDRV1+1 ) = ONE GO TO 120 END IF * * If eigenvalues j and j-1 are too close, add a relatively * small perturbation. * IF( JBLK.GT.1 ) THEN EPS1 = ABS( EPS*XJ ) PERTOL = TEN*EPS1 SEP = XJ - XJM IF( SEP.LT.PERTOL ) $ XJ = XJM + PERTOL END IF * ITS = 0 NRMCHK = 0 * * Get random starting vector. * CALL SLARNV( 2, ISEED, BLKSIZ, WORK( INDRV1+1 ) ) * * Copy the matrix T so it won't be destroyed in factorization. * CALL SCOPY( BLKSIZ, D( B1 ), 1, WORK( INDRV4+1 ), 1 ) CALL SCOPY( BLKSIZ-1, E( B1 ), 1, WORK( INDRV2+2 ), 1 ) CALL SCOPY( BLKSIZ-1, E( B1 ), 1, WORK( INDRV3+1 ), 1 ) * * Compute LU factors with partial pivoting ( PT = LU ) * TOL = ZERO CALL SLAGTF( BLKSIZ, WORK( INDRV4+1 ), XJ, WORK( INDRV2+2 ), $ WORK( INDRV3+1 ), TOL, WORK( INDRV5+1 ), IWORK, $ IINFO ) * * Update iteration count. * 70 CONTINUE ITS = ITS + 1 IF( ITS.GT.MAXITS ) $ GO TO 100 * * Normalize and scale the righthand side vector Pb. * SCL = BLKSIZ*ONENRM*MAX( EPS, $ ABS( WORK( INDRV4+BLKSIZ ) ) ) / $ SASUM( BLKSIZ, WORK( INDRV1+1 ), 1 ) CALL SSCAL( BLKSIZ, SCL, WORK( INDRV1+1 ), 1 ) * * Solve the system LU = Pb. * CALL SLAGTS( -1, BLKSIZ, WORK( INDRV4+1 ), WORK( INDRV2+2 ), $ WORK( INDRV3+1 ), WORK( INDRV5+1 ), IWORK, $ WORK( INDRV1+1 ), TOL, IINFO ) * * Reorthogonalize by modified Gram-Schmidt if eigenvalues are * close enough. * IF( JBLK.EQ.1 ) $ GO TO 90 IF( ABS( XJ-XJM ).GT.ORTOL ) $ GPIND = J * IF( GPIND.NE.J ) THEN DO 80 I = GPIND, J - 1 ZTR = -SDOT( BLKSIZ, WORK( INDRV1+1 ), 1, Z( B1, I ), $ 1 ) CALL SAXPY( BLKSIZ, ZTR, Z( B1, I ), 1, $ WORK( INDRV1+1 ), 1 ) 80 CONTINUE END IF * * Check the infinity norm of the iterate. * 90 CONTINUE JMAX = ISAMAX( BLKSIZ, WORK( INDRV1+1 ), 1 ) NRM = ABS( WORK( INDRV1+JMAX ) ) * * Continue for additional iterations after norm reaches * stopping criterion. * IF( NRM.LT.STPCRT ) $ GO TO 70 NRMCHK = NRMCHK + 1 IF( NRMCHK.LT.EXTRA+1 ) $ GO TO 70 * GO TO 110 * * If stopping criterion was not satisfied, update info and * store eigenvector number in array ifail. * 100 CONTINUE INFO = INFO + 1 IFAIL( INFO ) = J * * Accept iterate as jth eigenvector. * 110 CONTINUE SCL = ONE / SNRM2( BLKSIZ, WORK( INDRV1+1 ), 1 ) JMAX = ISAMAX( BLKSIZ, WORK( INDRV1+1 ), 1 ) IF( WORK( INDRV1+JMAX ).LT.ZERO ) $ SCL = -SCL CALL SSCAL( BLKSIZ, SCL, WORK( INDRV1+1 ), 1 ) 120 CONTINUE DO 130 I = 1, N Z( I, J ) = ZERO 130 CONTINUE DO 140 I = 1, BLKSIZ Z( B1+I-1, J ) = WORK( INDRV1+I ) 140 CONTINUE * * Save the shift to check eigenvalue spacing at next * iteration. * XJM = XJ * 150 CONTINUE 160 CONTINUE * RETURN * * End of SSTEIN2 * END scalapack-2.0.2/SRC/ssteqr2.f000644 000766 000024 00000032624 10363532303 016117 0ustar00juliestaff000000 000000 SUBROUTINE SSTEQR2( COMPZ, N, D, E, Z, LDZ, NR, WORK, INFO ) * * -- LAPACK routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER COMPZ INTEGER INFO, LDZ, N, NR * .. * .. Array Arguments .. REAL D( * ), E( * ), WORK( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * SSTEQR2 is a modified version of LAPACK routine SSTEQR. * SSTEQR2 computes all eigenvalues and, optionally, eigenvectors of a * symmetric tridiagonal matrix using the implicit QL or QR method. * running SSTEQR2 to perform updates on a distributed matrix Q. * Proper usage of SSTEQR2 can be gleaned from examination of ScaLAPACK's * PSSYEV. * * Arguments * ========= * * COMPZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only. * = 'I': Compute eigenvalues and eigenvectors of the * tridiagonal matrix. Z must be initialized to the * identity matrix by PDLASET or DLASET prior to entering * this subroutine. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) REAL array, dimension (N) * On entry, the diagonal elements of the tridiagonal matrix. * On exit, if INFO = 0, the eigenvalues in ascending order. * * E (input/output) REAL array, dimension (N-1) * On entry, the (n-1) subdiagonal elements of the tridiagonal * matrix. * On exit, E has been destroyed. * * Z (local input/local output) REAL array, global * dimension (N, N), local dimension (LDZ, NR). * On entry, if COMPZ = 'V', then Z contains the orthogonal * matrix used in the reduction to tridiagonal form. * On exit, if INFO = 0, then if COMPZ = 'V', Z contains the * orthonormal eigenvectors of the original symmetric matrix, * and if COMPZ = 'I', Z contains the orthonormal eigenvectors * of the symmetric tridiagonal matrix. * If COMPZ = 'N', then Z is not referenced. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * eigenvectors are desired, then LDZ >= max(1,N). * * NR (input) INTEGER * NR = MAX(1, NUMROC( N, NB, MYPROW, 0, NPROCS ) ). * If COMPZ = 'N', then NR is not referenced. * * WORK (workspace) REAL array, dimension (max(1,2*N-2)) * If COMPZ = 'N', then WORK is not referenced. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: the algorithm has failed to find all the eigenvalues in * a total of 30*N iterations; if INFO = i, then i * elements of E have not converged to zero; on exit, D * and E contain the elements of a symmetric tridiagonal * matrix which is orthogonally similar to the original * matrix. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE, TWO, THREE PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, TWO = 2.0E0, $ THREE = 3.0E0 ) INTEGER MAXIT PARAMETER ( MAXIT = 30 ) * .. * .. Local Scalars .. INTEGER I, ICOMPZ, II, ISCALE, J, JTOT, K, L, L1, LEND, $ LENDM1, LENDP1, LENDSV, LM1, LSV, M, MM, MM1, $ NM1, NMAXIT REAL ANORM, B, C, EPS, EPS2, F, G, P, R, RT1, RT2, $ S, SAFMAX, SAFMIN, SSFMAX, SSFMIN, TST * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH, SLANST, SLAPY2 EXTERNAL LSAME, SLAMCH, SLANST, SLAPY2 * .. * .. External Subroutines .. EXTERNAL SLAE2, SLAEV2, SLARTG, SLASCL, SLASR, $ SLASRT, SSWAP, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SIGN, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 * IF( LSAME( COMPZ, 'N' ) ) THEN ICOMPZ = 0 ELSE IF( LSAME( COMPZ, 'I' ) ) THEN ICOMPZ = 1 ELSE ICOMPZ = -1 END IF IF( ICOMPZ.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( ICOMPZ.GT.0 .AND. LDZ.LT.MAX( 1, NR ) ) THEN INFO = -6 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSTEQR2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( N.EQ.1 ) THEN IF( ICOMPZ.EQ.1 ) $ Z( 1, 1 ) = ONE RETURN END IF * * Determine the unit roundoff and over/underflow thresholds. * EPS = SLAMCH( 'E' ) EPS2 = EPS**2 SAFMIN = SLAMCH( 'S' ) SAFMAX = ONE / SAFMIN SSFMAX = SQRT( SAFMAX ) / THREE SSFMIN = SQRT( SAFMIN ) / EPS2 * * Compute the eigenvalues and eigenvectors of the tridiagonal * matrix. * NMAXIT = N*MAXIT JTOT = 0 * * Determine where the matrix splits and choose QL or QR iteration * for each block, according to whether top or bottom diagonal * element is smaller. * L1 = 1 NM1 = N - 1 * 10 CONTINUE IF( L1.GT.N ) $ GO TO 160 IF( L1.GT.1 ) $ E( L1-1 ) = ZERO IF( L1.LE.NM1 ) THEN DO 20 M = L1, NM1 TST = ABS( E( M ) ) IF( TST.EQ.ZERO ) $ GO TO 30 IF( TST.LE.( SQRT( ABS( D( M ) ) )*SQRT( ABS( D( M+ $ 1 ) ) ) )*EPS ) THEN E( M ) = ZERO GO TO 30 END IF 20 CONTINUE END IF M = N * 30 CONTINUE L = L1 LSV = L LEND = M LENDSV = LEND L1 = M + 1 IF( LEND.EQ.L ) $ GO TO 10 * * Scale submatrix in rows and columns L to LEND * ANORM = SLANST( 'I', LEND-L+1, D( L ), E( L ) ) ISCALE = 0 IF( ANORM.EQ.ZERO ) $ GO TO 10 IF( ANORM.GT.SSFMAX ) THEN ISCALE = 1 CALL SLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L+1, 1, D( L ), N, $ INFO ) CALL SLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L, 1, E( L ), N, $ INFO ) ELSE IF( ANORM.LT.SSFMIN ) THEN ISCALE = 2 CALL SLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L+1, 1, D( L ), N, $ INFO ) CALL SLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L, 1, E( L ), N, $ INFO ) END IF * * Choose between QL and QR iteration * IF( ABS( D( LEND ) ).LT.ABS( D( L ) ) ) THEN LEND = LSV L = LENDSV END IF * IF( LEND.GT.L ) THEN * * QL Iteration * * Look for small subdiagonal element. * 40 CONTINUE IF( L.NE.LEND ) THEN LENDM1 = LEND - 1 DO 50 M = L, LENDM1 TST = ABS( E( M ) )**2 IF( TST.LE.( EPS2*ABS( D( M ) ) )*ABS( D( M+1 ) )+ $ SAFMIN )GO TO 60 50 CONTINUE END IF * M = LEND * 60 CONTINUE IF( M.LT.LEND ) $ E( M ) = ZERO P = D( L ) IF( M.EQ.L ) $ GO TO 80 * * If remaining matrix is 2-by-2, use SLAE2 or SLAEV2 * to compute its eigensystem. * IF( M.EQ.L+1 ) THEN IF( ICOMPZ.GT.0 ) THEN CALL SLAEV2( D( L ), E( L ), D( L+1 ), RT1, RT2, C, S ) WORK( L ) = C WORK( N-1+L ) = S CALL SLASR( 'R', 'V', 'B', NR, 2, WORK( L ), $ WORK( N-1+L ), Z( 1, L ), LDZ ) ELSE CALL SLAE2( D( L ), E( L ), D( L+1 ), RT1, RT2 ) END IF D( L ) = RT1 D( L+1 ) = RT2 E( L ) = ZERO L = L + 2 IF( L.LE.LEND ) $ GO TO 40 GO TO 140 END IF * IF( JTOT.EQ.NMAXIT ) $ GO TO 140 JTOT = JTOT + 1 * * Form shift. * G = ( D( L+1 )-P ) / ( TWO*E( L ) ) R = SLAPY2( G, ONE ) G = D( M ) - P + ( E( L ) / ( G+SIGN( R, G ) ) ) * S = ONE C = ONE P = ZERO * * Inner loop * MM1 = M - 1 DO 70 I = MM1, L, -1 F = S*E( I ) B = C*E( I ) CALL SLARTG( G, F, C, S, R ) IF( I.NE.M-1 ) $ E( I+1 ) = R G = D( I+1 ) - P R = ( D( I )-G )*S + TWO*C*B P = S*R D( I+1 ) = G + P G = C*R - B * * If eigenvectors are desired, then save rotations. * IF( ICOMPZ.GT.0 ) THEN WORK( I ) = C WORK( N-1+I ) = -S END IF * 70 CONTINUE * * If eigenvectors are desired, then apply saved rotations. * IF( ICOMPZ.GT.0 ) THEN MM = M - L + 1 CALL SLASR( 'R', 'V', 'B', NR, MM, WORK( L ), WORK( N-1+L ), $ Z( 1, L ), LDZ ) END IF * D( L ) = D( L ) - P E( L ) = G GO TO 40 * * Eigenvalue found. * 80 CONTINUE D( L ) = P * L = L + 1 IF( L.LE.LEND ) $ GO TO 40 GO TO 140 * ELSE * * QR Iteration * * Look for small superdiagonal element. * 90 CONTINUE IF( L.NE.LEND ) THEN LENDP1 = LEND + 1 DO 100 M = L, LENDP1, -1 TST = ABS( E( M-1 ) )**2 IF( TST.LE.( EPS2*ABS( D( M ) ) )*ABS( D( M-1 ) )+ $ SAFMIN )GO TO 110 100 CONTINUE END IF * M = LEND * 110 CONTINUE IF( M.GT.LEND ) $ E( M-1 ) = ZERO P = D( L ) IF( M.EQ.L ) $ GO TO 130 * * If remaining matrix is 2-by-2, use SLAE2 or SLAEV2 * to compute its eigensystem. * IF( M.EQ.L-1 ) THEN IF( ICOMPZ.GT.0 ) THEN CALL SLAEV2( D( L-1 ), E( L-1 ), D( L ), RT1, RT2, C, S ) WORK( M ) = C WORK( N-1+M ) = S CALL SLASR( 'R', 'V', 'F', NR, 2, WORK( M ), $ WORK( N-1+M ), Z( 1, L-1 ), LDZ ) ELSE CALL SLAE2( D( L-1 ), E( L-1 ), D( L ), RT1, RT2 ) END IF D( L-1 ) = RT1 D( L ) = RT2 E( L-1 ) = ZERO L = L - 2 IF( L.GE.LEND ) $ GO TO 90 GO TO 140 END IF * IF( JTOT.EQ.NMAXIT ) $ GO TO 140 JTOT = JTOT + 1 * * Form shift. * G = ( D( L-1 )-P ) / ( TWO*E( L-1 ) ) R = SLAPY2( G, ONE ) G = D( M ) - P + ( E( L-1 ) / ( G+SIGN( R, G ) ) ) * S = ONE C = ONE P = ZERO * * Inner loop * LM1 = L - 1 DO 120 I = M, LM1 F = S*E( I ) B = C*E( I ) CALL SLARTG( G, F, C, S, R ) IF( I.NE.M ) $ E( I-1 ) = R G = D( I ) - P R = ( D( I+1 )-G )*S + TWO*C*B P = S*R D( I ) = G + P G = C*R - B * * If eigenvectors are desired, then save rotations. * IF( ICOMPZ.GT.0 ) THEN WORK( I ) = C WORK( N-1+I ) = S END IF * 120 CONTINUE * * If eigenvectors are desired, then apply saved rotations. * IF( ICOMPZ.GT.0 ) THEN MM = L - M + 1 CALL SLASR( 'R', 'V', 'F', NR, MM, WORK( M ), WORK( N-1+M ), $ Z( 1, M ), LDZ ) END IF * D( L ) = D( L ) - P E( LM1 ) = G GO TO 90 * * Eigenvalue found. * 130 CONTINUE D( L ) = P * L = L - 1 IF( L.GE.LEND ) $ GO TO 90 GO TO 140 * END IF * * Undo scaling if necessary * 140 CONTINUE IF( ISCALE.EQ.1 ) THEN CALL SLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV+1, 1, $ D( LSV ), N, INFO ) CALL SLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV, 1, E( LSV ), $ N, INFO ) ELSE IF( ISCALE.EQ.2 ) THEN CALL SLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV+1, 1, $ D( LSV ), N, INFO ) CALL SLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV, 1, E( LSV ), $ N, INFO ) END IF * * Check for no convergence to an eigenvalue after a total * of N*MAXIT iterations. * IF( JTOT.LT.NMAXIT ) $ GO TO 10 DO 150 I = 1, N - 1 IF( E( I ).NE.ZERO ) $ INFO = INFO + 1 150 CONTINUE GO TO 190 * * Order eigenvalues and eigenvectors. * 160 CONTINUE IF( ICOMPZ.EQ.0 ) THEN * * Use Quick Sort * CALL SLASRT( 'I', N, D, INFO ) * ELSE * * Use Selection Sort to minimize swaps of eigenvectors * DO 180 II = 2, N I = II - 1 K = I P = D( I ) DO 170 J = II, N IF( D( J ).LT.P ) THEN K = J P = D( J ) END IF 170 CONTINUE IF( K.NE.I ) THEN D( K ) = D( I ) D( I ) = P CALL SSWAP( NR, Z( 1, I ), 1, Z( 1, K ), 1 ) END IF 180 CONTINUE END IF * 190 CONTINUE RETURN * * End of SSTEQR2 * END scalapack-2.0.2/SRC/strmvt.f000644 000766 000024 00000011123 10363532303 016042 0ustar00juliestaff000000 000000 SUBROUTINE STRMVT( UPLO, N, T, LDT, X, INCX, Y, INCY, W, INCW, Z, $ INCZ ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCW, INCX, INCY, INCZ, LDT, N * .. * .. Array Arguments .. REAL T( LDT, * ), W( * ), X( * ), Y( * ), Z( * ) * .. * * Purpose * ======= * * STRMVT performs the matrix-vector operations * * x := T' *y, and w := T *z, * * where x is an n element vector and T is an n by n * upper or lower triangular matrix. * * Arguments * ========= * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * T - REAL array of DIMENSION ( LDT, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array T must contain the upper * triangular matrix and the strictly lower triangular part of * T is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array T must contain the lower * triangular matrix and the strictly upper triangular part of * T is not referenced. * * LDT - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * On exit, X = T' * y * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * W - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCW ) ). * On exit, W = T * z * * INCW - INTEGER. * On entry, INCW specifies the increment for the elements of * W. INCW must not be zero. * Unchanged on exit. * * Z - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCZ ) ). * Before entry, the incremented array Z must contain the n * element vector z. Unchanged on exit. * * INCY - INTEGER. * On entrz, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * * .. Local Scalars .. INTEGER INFO * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SCOPY, STRMV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN INFO = 4 ELSE IF( INCW.EQ.0 ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 10 ELSE IF( INCZ.EQ.0 ) THEN INFO = 12 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'STRMVT', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * * IF( INCX.NE.1 .OR. INCY.NE.1 .OR. INCW.NE.1 .OR. INCZ.NE.1 .OR. $ .TRUE. ) THEN CALL SCOPY( N, Y, INCY, X, INCX ) CALL STRMV( UPLO, 'C', 'N', N, T, LDT, X, INCX ) CALL SCOPY( N, Z, INCZ, W, INCW ) CALL STRMV( UPLO, 'N', 'N', N, T, LDT, W, INCW ) RETURN END IF * RETURN * * End of STRMVT. * END scalapack-2.0.2/SRC/tools.h000644 000766 000024 00000002200 10363532303 015641 0ustar00juliestaff000000 000000 #include "./pblas.h" #ifdef __STDC__ typedef void (*CPYPTR)(int, int, float *, int, float *, int); #define SLVOID void #else typedef void (*CPYPTR)(); #define SLVOID char #endif #define ErrPrnt fprintf(stderr, "line %d of file %s\n",__LINE__, __FILE__); #define Mdescset(desc, m, n, mb, nb, rsrc, csrc, ictxt, lld) \ { \ (desc)[DT_] = BLOCK_CYCLIC_2D; \ (desc)[CTXT_] = (ictxt); \ (desc)[M_] = (m); \ (desc)[N_] = (n); \ (desc)[MB_] = (mb); \ (desc)[NB_] = (nb); \ (desc)[RSRC_] = (rsrc); \ (desc)[CSRC_] = (csrc); \ (desc)[LLD_] = (lld); \ } #define MCindxg2p(IG, nb, srcproc, nprocs) \ ( ((srcproc) + (IG)/(nb)) % nprocs ) typedef struct {double r, i;} DCOMPLEX; typedef struct {float r, i;} SCOMPLEX; #define Mmalloc(M_ptr, M_type, M_elt, M_i, M_ctxt) \ { \ void pberror_(); \ (M_ptr) = ( M_type * ) malloc((M_elt)*(sizeof(M_type))); \ if (!(M_ptr)) \ { \ if ((M_elt) > 0) \ { \ (M_i) = 1; \ fprintf(stderr, "Not enough memory on line %d of file %s!!\n", \ __LINE__, __FILE__); \ pberror_(&(M_ctxt), __FILE__, &(M_i)); \ } \ } \ } scalapack-2.0.2/SRC/zdbtf2.f000644 000766 000024 00000012602 11657111056 015706 0ustar00juliestaff000000 000000 SUBROUTINE ZDBTF2( M, N, KL, KU, AB, LDAB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Modified by Andrew J. Cleary in November, 96 from: * -- LAPACK auxiliary routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * August 6, 1991 * * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDAB, M, N * .. * .. Array Arguments .. COMPLEX*16 AB( LDAB, * ) * .. * * Purpose * ======= * * Zdbtrf computes an LU factorization of a real m-by-n band matrix A * without using partial pivoting with row interchanges. * * This is the unblocked version of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of subdiagonals within the band of A. KL >= 0. * * KU (input) INTEGER * The number of superdiagonals within the band of A. KU >= 0. * * AB (input/output) COMPLEX*16 array, dimension (LDAB,N) * On entry, the matrix A in band storage, in rows KL+1 to * 2*KL+KU+1; rows 1 to KL of the array need not be set. * The j-th column of A is stored in the j-th column of the * array AB as follows: * AB(kl+ku+1+i-j,j) = A(i,j) for max(1,j-ku)<=i<=min(m,j+kl) * * On exit, details of the factorization: U is stored as an * upper triangular band matrix with KL+KU superdiagonals in * rows 1 to KL+KU+1, and the multipliers used during the * factorization are stored in rows KL+KU+2 to 2*KL+KU+1. * See below for further details. * * LDAB (input) INTEGER * The leading dimension of the array AB. LDAB >= 2*KL+KU+1. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = +i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * Further Details * =============== * * The band storage scheme is illustrated by the following example, when * M = N = 6, KL = 2, KU = 1: * * On entry: On exit: * * * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 * a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 * a21 a32 a43 a54 a65 * m21 m32 m43 m54 m65 * * a31 a42 a53 a64 * * m31 m42 m53 m64 * * * * Array elements marked * are not used by the routine; elements marked * + need not be set on entry, but are required by the routine to store * elements of U, because of fill-in resulting from the row * interchanges. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER J, JP, JU, KM, KV * .. * .. External Functions .. INTEGER ISAMAX EXTERNAL ISAMAX * .. * .. External Subroutines .. EXTERNAL ZGERU, ZSCAL, ZSWAP * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * KV is the number of superdiagonals in the factor U, allowing for * fill-in. * KV = KU * * Test the input parameters. * INFO = 0 *ECA IF( M.LT.0 ) THEN *ECA INFO = -1 *ECA ELSE IF( N.LT.0 ) THEN *ECA INFO = -2 *ECA ELSE IF( KL.LT.0 ) THEN *ECA INFO = -3 *ECA ELSE IF( KU.LT.0 ) THEN *ECA INFO = -4 *ECA ELSE IF( LDAB.LT.KL+KV+1 ) THEN *ECA INFO = -6 *ECA END IF *ECA IF( INFO.NE.0 ) THEN *ECA CALL XERBLA( 'ZDBTF2', -INFO ) *ECA RETURN *ECA END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Gaussian elimination without partial pivoting * * JU is the index of the last column affected by the current stage * of the factorization. * JU = 1 * DO 40 J = 1, MIN( M, N ) * * Test for singularity. KM is the number of * subdiagonal elements in the current column. * KM = MIN( KL, M-J ) JP = 1 IF( AB( KV+1, J ).NE.ZERO ) THEN JU = MAX( JU, MIN( J+KU, N ) ) * IF( KM.GT.0 ) THEN * * Compute multipliers. * CALL ZSCAL( KM, ONE / AB( KU+1, J ), AB( KU+2, J ), 1 ) * * Update trailing submatrix within the band. * IF( JU.GT.J ) THEN CALL ZGERU( KM, JU-J, -CONE, AB( KU+2, J ), 1, $ AB( KU, J+1 ), LDAB-1, AB( KU+1, J+1 ), $ LDAB-1 ) END IF END IF ELSE * IF( INFO.EQ.0 ) $ INFO = J END IF 40 CONTINUE RETURN * * End of ZDBTF2 * END scalapack-2.0.2/SRC/zdbtrf.f000644 000766 000024 00000025351 11657111056 016013 0ustar00juliestaff000000 000000 SUBROUTINE ZDBTRF( M, N, KL, KU, AB, LDAB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * August, 1996. * Modified from ZGBTRF: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * August 6, 1991 * * .. Scalar Arguments .. INTEGER INFO, KL, KU, LDAB, M, N * .. * .. Array Arguments .. COMPLEX*16 AB( LDAB, * ) * .. * * Purpose * ======= * * Zdbtrf computes an LU factorization of a real m-by-n band matrix A * without using partial pivoting or row interchanges. * * This is the blocked version of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * KL (input) INTEGER * The number of subdiagonals within the band of A. KL >= 0. * * KU (input) INTEGER * The number of superdiagonals within the band of A. KU >= 0. * * AB (input/output) REAL array, dimension (LDAB,N) * On entry, the matrix A in band storage, in rows KL+1 to * 2*KL+KU+1; rows 1 to KL of the array need not be set. * The j-th column of A is stored in the j-th column of the * array AB as follows: * AB(kl+ku+1+i-j,j) = A(i,j) for max(1,j-ku)<=i<=min(m,j+kl) * * On exit, details of the factorization: U is stored as an * upper triangular band matrix with KL+KU superdiagonals in * rows 1 to KL+KU+1, and the multipliers used during the * factorization are stored in rows KL+KU+2 to 2*KL+KU+1. * See below for further details. * * LDAB (input) INTEGER * The leading dimension of the array AB. LDAB >= 2*KL+KU+1. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = +i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * Further Details * =============== * * The band storage scheme is illustrated by the following example, when * M = N = 6, KL = 2, KU = 1: * * On entry: On exit: * * * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 * a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 * a21 a32 a43 a54 a65 * m21 m32 m43 m54 m65 * * a31 a42 a53 a64 * * m31 m42 m53 m64 * * * * Array elements marked * are not used by the routine. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0 ) PARAMETER ( ZERO = 0.0D+0 ) COMPLEX*16 CONE, CZERO PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER NBMAX, LDWORK PARAMETER ( NBMAX = 64, LDWORK = NBMAX+1 ) * .. * .. Local Scalars .. INTEGER I, I2, I3, II, J, J2, J3, JB, JJ, JM, JP, $ JU, KM, KV, NB, NW * .. * .. Local Arrays .. COMPLEX*16 WORK13( LDWORK, NBMAX ), $ WORK31( LDWORK, NBMAX ) * .. * .. External Functions .. INTEGER ILAENV, ISAMAX EXTERNAL ILAENV, ISAMAX * .. * .. External Subroutines .. EXTERNAL ZCOPY, ZDBTF2, ZGEMM, ZGERU, ZSCAL, $ ZSWAP, ZTRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * KV is the number of superdiagonals in the factor U * KV = KU * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( KL.LT.0 ) THEN INFO = -3 ELSE IF( KU.LT.0 ) THEN INFO = -4 ELSE IF( LDAB.LT.MIN( MIN( KL+KV+1,M ),N ) ) THEN INFO = -6 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZDBTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Determine the block size for this environment * NB = ILAENV( 1, 'ZDBTRF', ' ', M, N, KL, KU ) * * The block size must not exceed the limit set by the size of the * local arrays WORK13 and WORK31. * NB = MIN( NB, NBMAX ) * IF( NB.LE.1 .OR. NB.GT.KL ) THEN * * Use unblocked code * CALL ZDBTF2( M, N, KL, KU, AB, LDAB, INFO ) ELSE * * Use blocked code * * Zero the superdiagonal elements of the work array WORK13 * DO 20 J = 1, NB DO 10 I = 1, J - 1 WORK13( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * * Zero the subdiagonal elements of the work array WORK31 * DO 40 J = 1, NB DO 30 I = J + 1, NB WORK31( I, J ) = ZERO 30 CONTINUE 40 CONTINUE * * JU is the index of the last column affected by the current * stage of the factorization * JU = 1 * DO 180 J = 1, MIN( M, N ), NB JB = MIN( NB, MIN( M, N )-J+1 ) * * The active part of the matrix is partitioned * * A11 A12 A13 * A21 A22 A23 * A31 A32 A33 * * Here A11, A21 and A31 denote the current block of JB columns * which is about to be factorized. The number of rows in the * partitioning are JB, I2, I3 respectively, and the numbers * of columns are JB, J2, J3. The superdiagonal elements of A13 * and the subdiagonal elements of A31 lie outside the band. * I2 = MIN( KL-JB, M-J-JB+1 ) I3 = MIN( JB, M-J-KL+1 ) * * J2 and J3 are computed after JU has been updated. * * Factorize the current block of JB columns * DO 80 JJ = J, J + JB - 1 * * Find pivot and test for singularity. KM is the number of * subdiagonal elements in the current column. * KM = MIN( KL, M-JJ ) JP = 1 IF( AB( KV+JP, JJ ).NE.ZERO ) THEN JU = MAX( JU, MIN( JJ+KU+JP-1, N ) ) * * Compute multipliers * CALL ZSCAL( KM, ONE / AB( KV+1, JJ ), AB( KV+2, JJ ), $ 1 ) * * Update trailing submatrix within the band and within * the current block. JM is the index of the last column * which needs to be updated. * JM = MIN( JU, J+JB-1 ) IF( JM.GT.JJ ) THEN CALL ZGERU( KM, JM-JJ, -CONE, AB( KV+2, JJ ), 1, $ AB( KV, JJ+1 ), LDAB-1, $ AB( KV+1, JJ+1 ), LDAB-1 ) END IF END IF * * Copy current column of A31 into the work array WORK31 * NW = MIN( JJ-J+1, I3 ) IF( NW.GT.0 ) $ CALL ZCOPY( NW, AB( KV+KL+1-JJ+J, JJ ), 1, $ WORK31( 1, JJ-J+1 ), 1 ) 80 CONTINUE IF( J+JB.LE.N ) THEN * * Apply the row interchanges to the other blocks. * J2 = MIN( JU-J+1, KV ) - JB J3 = MAX( 0, JU-J-KV+1 ) * * Update the relevant part of the trailing submatrix * IF( J2.GT.0 ) THEN * * Update A12 * CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', $ JB, J2, CONE, AB( KV+1, J ), LDAB-1, $ AB( KV+1-JB, J+JB ), LDAB-1 ) * IF( I2.GT.0 ) THEN * * Update A22 * CALL ZGEMM( 'No transpose', 'No transpose', I2, J2, $ JB, -CONE, AB( KV+1+JB, J ), LDAB-1, $ AB( KV+1-JB, J+JB ), LDAB-1, CONE, $ AB( KV+1, J+JB ), LDAB-1 ) END IF * IF( I3.GT.0 ) THEN * * Update A32 * CALL ZGEMM( 'No transpose', 'No transpose', I3, J2, $ JB, -CONE, WORK31, LDWORK, $ AB( KV+1-JB, J+JB ), LDAB-1, CONE, $ AB( KV+KL+1-JB, J+JB ), LDAB-1 ) END IF END IF * IF( J3.GT.0 ) THEN * * Copy the lower triangle of A13 into the work array * WORK13 * DO 130 JJ = 1, J3 DO 120 II = JJ, JB WORK13( II, JJ ) = AB( II-JJ+1, JJ+J+KV-1 ) 120 CONTINUE 130 CONTINUE * * Update A13 in the work array * CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', $ JB, J3, CONE, AB( KV+1, J ), LDAB-1, $ WORK13, LDWORK ) * IF( I2.GT.0 ) THEN * * Update A23 * CALL ZGEMM( 'No transpose', 'No transpose', I2, J3, $ JB, -CONE, AB( KV+1+JB, J ), LDAB-1, $ WORK13, LDWORK, CONE, AB( 1+JB, J+KV ), $ LDAB-1 ) END IF * IF( I3.GT.0 ) THEN * * Update A33 * CALL ZGEMM( 'No transpose', 'No transpose', I3, J3, $ JB, -CONE, WORK31, LDWORK, WORK13, $ LDWORK, CONE, AB( 1+KL, J+KV ), LDAB-1 ) END IF * * Copy the lower triangle of A13 back into place * DO 150 JJ = 1, J3 DO 140 II = JJ, JB AB( II-JJ+1, JJ+J+KV-1 ) = WORK13( II, JJ ) 140 CONTINUE 150 CONTINUE END IF ELSE END IF * * copy the upper triangle of A31 back into place * DO 170 JJ = J + JB - 1, J, -1 * * Copy the current column of A31 back into place * NW = MIN( I3, JJ-J+1 ) IF( NW.GT.0 ) $ CALL ZCOPY( NW, WORK31( 1, JJ-J+1 ), 1, $ AB( KV+KL+1-JJ+J, JJ ), 1 ) 170 CONTINUE 180 CONTINUE END IF * RETURN * * End of ZDBTRF * END scalapack-2.0.2/SRC/zdttrf.f000644 000766 000024 00000006514 11657111056 016035 0ustar00juliestaff000000 000000 SUBROUTINE ZDTTRF( N, DL, D, DU, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, November 1996. * Modified from ZGTTRF: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. INTEGER INFO, N * .. * .. Array Arguments .. COMPLEX*16 D( * ), DL( * ), DU( * ) * .. * * Purpose * ======= * * ZDTTRF computes an LU factorization of a complex tridiagonal matrix A * using elimination without partial pivoting. * * The factorization has the form * A = L * U * where L is a product of unit lower bidiagonal * matrices and U is upper triangular with nonzeros in only the main * diagonal and first superdiagonal. * * Arguments * ========= * * N (input) INTEGER * The order of the matrix A. N >= 0. * * DL (input/output) COMPLEX array, dimension (N-1) * On entry, DL must contain the (n-1) subdiagonal elements of * A. * On exit, DL is overwritten by the (n-1) multipliers that * define the matrix L from the LU factorization of A. * * D (input/output) COMPLEX array, dimension (N) * On entry, D must contain the diagonal elements of A. * On exit, D is overwritten by the n diagonal elements of the * upper triangular matrix U from the LU factorization of A. * * DU (input/output) COMPLEX array, dimension (N-1) * On entry, DU must contain the (n-1) superdiagonal elements * of A. * On exit, DU is overwritten by the (n-1) elements of the first * superdiagonal of U. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Local Scalars .. INTEGER I COMPLEX*16 FACT * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Parameters .. COMPLEX*16 CZERO PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 CALL XERBLA( 'ZDTTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * DO 20 I = 1, N - 1 IF( DL( I ).EQ.CZERO ) THEN * * Subdiagonal is zero, no elimination is required. * IF( D( I ).EQ.CZERO .AND. INFO.EQ.0 ) $ INFO = I ELSE * FACT = DL( I ) / D( I ) DL( I ) = FACT D( I+1 ) = D( I+1 ) - FACT*DU( I ) END IF 20 CONTINUE IF( D( N ).EQ.CZERO .AND. INFO.EQ.0 ) THEN INFO = N RETURN END IF * RETURN * * End of ZDTTRF * END scalapack-2.0.2/SRC/zdttrsv.f000644 000766 000024 00000013361 11657111056 016236 0ustar00juliestaff000000 000000 SUBROUTINE ZDTTRSV( UPLO, TRANS, N, NRHS, DL, D, DU, $ B, LDB, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * August, 1996. * Modified from ZGTTRS: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. CHARACTER UPLO, TRANS INTEGER INFO, LDB, N, NRHS * .. * .. Array Arguments .. COMPLEX*16 B( LDB, * ), D( * ), DL( * ), DU( * ) * .. * * Purpose * ======= * * ZDTTRSV solves one of the systems of equations * L * X = B, L**T * X = B, or L**H * X = B, * U * X = B, U**T * X = B, or U**H * X = B, * with factors of the tridiagonal matrix A from the LU factorization * computed by ZDTTRF. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether to solve with L or U. * * TRANS (input) CHARACTER * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * DL (input) COMPLEX array, dimension (N-1) * The (n-1) multipliers that define the matrix L from the * LU factorization of A. * * D (input) COMPLEX array, dimension (N) * The n diagonal elements of the upper triangular matrix U from * the LU factorization of A. * * DU (input) COMPLEX array, dimension (N-1) * The (n-1) elements of the first superdiagonal of U. * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, B is overwritten by the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Local Scalars .. LOGICAL LOWER, NOTRAN INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) LOWER = LSAME( UPLO, 'L' ) IF( .NOT.LOWER .AND. .NOT.LSAME( UPLO, 'U' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( N, 1 ) ) THEN INFO = -9 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZDTTRSV', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( NOTRAN ) THEN * IF( LOWER ) THEN * Solve L*X = B, overwriting B with X. * DO 35 J = 1, NRHS * * Solve L*x = b. * DO 10 I = 1, N - 1 B( I+1, J ) = B( I+1, J ) - DL( I )*B( I, J ) 10 CONTINUE 35 CONTINUE * ELSE * Solve U*x = b. * DO 30 J = 1, NRHS B( N, J ) = B( N, J ) / D( N ) IF( N.GT.1 ) $ B( N-1, J ) = ( B( N-1, J )-DU( N-1 )*B( N, J ) ) / $ D( N-1 ) DO 20 I = N - 2, 1, -1 B( I, J ) = ( B( I, J )-DU( I )*B( I+1, J ) ) / D( I ) 20 CONTINUE 30 CONTINUE * ENDIF * ELSE IF( LSAME( TRANS, 'T' ) ) THEN * IF( .NOT. LOWER ) THEN * Solve U**T * X = B, overwriting B with X. * DO 65 J = 1, NRHS * * Solve U**T * x = b. * B( 1, J ) = B( 1, J ) / D( 1 ) IF( N.GT.1 ) $ B( 2, J ) = ( B( 2, J )-DU( 1 )*B( 1, J ) ) / D( 2 ) DO 40 I = 3, N B( I, J ) = ( B( I, J )-DU( I-1 )*B( I-1, J ) ) / D( I ) 40 CONTINUE 65 CONTINUE * ELSE * * Solve L**T * X = B, overwriting B with X. DO 60 J = 1, NRHS * * Solve L**T * x = b. * DO 50 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - DL( I )*B( I+1, J ) 50 CONTINUE 60 CONTINUE ENDIF ELSE * IF( .NOT. LOWER ) THEN * Solve U**H * X = B, overwriting B with X. * DO 95 J = 1, NRHS * * Solve U**H * x = b. * B( 1, J ) = B( 1, J ) / DCONJG( D( 1 ) ) IF( N.GT.1 ) $ B( 2, J ) = ( B( 2, J )-DCONJG( DU( 1 ) )*B( 1, J ) ) / $ DCONJG( D( 2 ) ) DO 70 I = 3, N B( I, J ) = ( B( I, J ) $ -DCONJG( DU( I-1 ) )*B( I-1, J ) ) / $ DCONJG( D( I ) ) 70 CONTINUE 95 CONTINUE * ELSE * * Solve L**H * X = B, overwriting B with X. DO 90 J = 1, NRHS * * Solve L**H * x = b. * DO 80 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - DCONJG( DL( I ) )*B( I+1, J ) 80 CONTINUE 90 CONTINUE ENDIF END IF * * End of ZDTTRSV * END scalapack-2.0.2/SRC/zlahqr2.f000644 000766 000024 00000034527 10363532303 016103 0ustar00juliestaff000000 000000 SUBROUTINE ZLAHQR2( WANTT, WANTZ, N, ILO, IHI, H, LDH, W, ILOZ, $ IHIZ, Z, LDZ, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 22, 2000 * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, INFO, LDH, LDZ, N * .. * .. Array Arguments .. COMPLEX*16 H( LDH, * ), W( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * ZLAHQR2 is an auxiliary routine called by ZHSEQR to update the * eigenvalues and Schur decomposition already computed by ZHSEQR, by * dealing with the Hessenberg submatrix in rows and columns ILO to IHI. * This version of ZLAHQR (not the standard LAPACK version) uses a * double-shift algorithm (like LAPACK's DLAHQR). * Unlike the standard LAPACK convention, this does not assume the * subdiagonal is real, nor does it work to preserve this quality if * given. * * Arguments * ========= * * WANTT (input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (input) INTEGER * The order of the matrix H. N >= 0. * * ILO (input) INTEGER * IHI (input) INTEGER * It is assumed that H is already upper triangular in rows and * columns IHI+1:N, and that H(ILO,ILO-1) = 0 (unless ILO = 1). * ZLAHQR works primarily with the Hessenberg submatrix in rows * and columns ILO to IHI, but applies transformations to all of * H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * H (input/output) COMPLEX*16 array, dimension (LDH,N) * On entry, the upper Hessenberg matrix H. * On exit, if WANTT is .TRUE., H is upper triangular in rows * and columns ILO:IHI. If WANTT is .FALSE., the contents of H * are unspecified on exit. * * LDH (input) INTEGER * The leading dimension of the array H. LDH >= max(1,N). * * W (output) COMPLEX*16 array, dimension (N) * The computed eigenvalues ILO to IHI are stored in the * corresponding elements of W. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in H, with W(i) = H(i,i). * * ILOZ (input) INTEGER * IHIZ (input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (input/output) COMPLEX*16 array, dimension (LDZ,N) * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations, and on exit Z has been updated; * transformations are applied only to the submatrix * Z(ILOZ:IHIZ,ILO:IHI). If WANTZ is .FALSE., Z is not * referenced. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * > 0: if INFO = i, ZLAHQR failed to compute all the * eigenvalues ILO to IHI in a total of 30*(IHI-ILO+1) * iterations; elements i+1:ihi of W contain those * eigenvalues which have been successfully computed. * * Further Details * =============== * * Modified by Mark R. Fahey, June, 2000 * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D+0, RONE = 1.0D+0 ) DOUBLE PRECISION DAT1, DAT2 PARAMETER ( DAT1 = 0.75D+0, DAT2 = -0.4375D+0 ) * .. * .. Local Scalars .. INTEGER I, I1, I2, ITN, ITS, J, K, L, M, NH, NR, NZ DOUBLE PRECISION CS, OVFL, S, SMLNUM, TST1, ULP, UNFL COMPLEX*16 CDUM, H00, H10, H11, H12, H21, H22, H33, H33S, $ H43H34, H44, H44S, SN, SUM, T1, T2, T3, V1, V2, $ V3 * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 1 ) COMPLEX*16 V( 3 ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, ZLANHS EXTERNAL DLAMCH, ZLANHS * .. * .. External Subroutines .. EXTERNAL DLABAD, ZCOPY, ZLANV2, ZLARFG, ZROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( N.EQ.0 ) $ RETURN IF( ILO.EQ.IHI ) THEN W( ILO ) = H( ILO, ILO ) RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * * Set machine-dependent constants for the stopping criterion. * If norm(H) <= sqrt(OVFL), overflow should not occur. * UNFL = DLAMCH( 'Safe minimum' ) OVFL = RONE / UNFL CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Precision' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = 30*NH * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of 1 or 2. Each iteration of the loop works * with the active submatrix in rows and columns L to I. * Eigenvalues I+1 to IHI have already converged. Either L = ILO, or * H(L,L-1) is negligible so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 150 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 130 ITS = 0, ITN * * Look for a single small subdiagonal element. * DO 20 K = I, L + 1, -1 TST1 = CABS1( H( K-1, K-1 ) ) + CABS1( H( K, K ) ) IF( TST1.EQ.RZERO ) $ TST1 = ZLANHS( '1', I-L+1, H( L, L ), LDH, RWORK ) IF( CABS1( H( K, K-1 ) ).LE.MAX( ULP*TST1, SMLNUM ) ) $ GO TO 30 20 CONTINUE 30 CONTINUE L = K IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * H( L, L-1 ) = ZERO END IF * * Exit from loop if a submatrix of order 1 or 2 has split off. * IF( L.GE.I-1 ) $ GO TO 140 * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * IF( ITS.EQ.10 .OR. ITS.EQ.20 ) THEN * * Exceptional shift. * * S = ABS( DBLE( H( I,I-1 ) ) ) + ABS( DBLE( H( I-1,I-2 ) ) ) S = CABS1( H( I, I-1 ) ) + CABS1( H( I-1, I-2 ) ) H44 = DAT1*S H33 = H44 H43H34 = DAT2*S*S ELSE * * Prepare to use Wilkinson's shift. * H44 = H( I, I ) H33 = H( I-1, I-1 ) H43H34 = H( I, I-1 )*H( I-1, I ) END IF * * Look for two consecutive small subdiagonal elements. * DO 40 M = I - 2, L, -1 * * Determine the effect of starting the double-shift QR * iteration at row M, and see if this would make H(M,M-1) * negligible. * H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S = CABS1( V1 ) + CABS1( V2 ) + ABS( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 IF( M.EQ.L ) $ GO TO 50 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).LE.ULP*TST1 ) $ GO TO 50 40 CONTINUE 50 CONTINUE * * Double-shift QR step * DO 120 K = M, I - 1 * * The first iteration of this loop determines a reflection G * from the vector V and applies it from left and right to H, * thus creating a nonzero bulge below the subdiagonal. * * Each subsequent iteration determines a reflection G to * restore the Hessenberg form in the (K-1)th column, and thus * chases the bulge one step toward the bottom of the active * submatrix. NR is the order of G * NR = MIN( 3, I-K+1 ) IF( K.GT.M ) $ CALL ZCOPY( NR, H( K, K-1 ), 1, V, 1 ) CALL ZLARFG( NR, V( 1 ), V( 2 ), 1, T1 ) IF( K.GT.M ) THEN H( K, K-1 ) = V( 1 ) H( K+1, K-1 ) = ZERO IF( K.LT.I-1 ) $ H( K+2, K-1 ) = ZERO ELSE IF( M.GT.L ) THEN * The real double-shift code uses H( K, K-1 ) = -H( K, K-1 ) * instead of the following. H( K, K-1 ) = H( K, K-1 ) - DCONJG( T1 )*H( K, K-1 ) END IF V2 = V( 2 ) T2 = T1*V2 IF( NR.EQ.3 ) THEN V3 = V( 3 ) T3 = T1*V3 * * Apply G from the left to transform the rows of the matrix * in columns K to I2. * DO 60 J = K, I2 SUM = DCONJG( T1 )*H( K, J ) + $ DCONJG( T2 )*H( K+1, J ) + $ DCONJG( T3 )*H( K+2, J ) H( K, J ) = H( K, J ) - SUM H( K+1, J ) = H( K+1, J ) - SUM*V2 H( K+2, J ) = H( K+2, J ) - SUM*V3 60 CONTINUE * * Apply G from the right to transform the columns of the * matrix in rows I1 to min(K+3,I). * DO 70 J = I1, MIN( K+3, I ) SUM = T1*H( J, K ) + T2*H( J, K+1 ) + T3*H( J, K+2 ) H( J, K ) = H( J, K ) - SUM H( J, K+1 ) = H( J, K+1 ) - SUM*DCONJG( V2 ) H( J, K+2 ) = H( J, K+2 ) - SUM*DCONJG( V3 ) 70 CONTINUE * IF( WANTZ ) THEN * * Accumulate transformations in the matrix Z * DO 80 J = ILOZ, IHIZ SUM = T1*Z( J, K ) + T2*Z( J, K+1 ) + $ T3*Z( J, K+2 ) Z( J, K ) = Z( J, K ) - SUM Z( J, K+1 ) = Z( J, K+1 ) - SUM*DCONJG( V2 ) Z( J, K+2 ) = Z( J, K+2 ) - SUM*DCONJG( V3 ) 80 CONTINUE END IF ELSE IF( NR.EQ.2 ) THEN * * Apply G from the left to transform the rows of the matrix * in columns K to I2. * DO 90 J = K, I2 SUM = DCONJG( T1 )*H( K, J ) + $ DCONJG( T2 )*H( K+1, J ) H( K, J ) = H( K, J ) - SUM H( K+1, J ) = H( K+1, J ) - SUM*V2 90 CONTINUE * * Apply G from the right to transform the columns of the * matrix in rows I1 to min(K+2,I). * DO 100 J = I1, MIN( K+2, I ) SUM = T1*H( J, K ) + T2*H( J, K+1 ) H( J, K ) = H( J, K ) - SUM H( J, K+1 ) = H( J, K+1 ) - SUM*DCONJG( V2 ) 100 CONTINUE * IF( WANTZ ) THEN * * Accumulate transformations in the matrix Z * DO 110 J = ILOZ, IHIZ SUM = T1*Z( J, K ) + T2*Z( J, K+1 ) Z( J, K ) = Z( J, K ) - SUM Z( J, K+1 ) = Z( J, K+1 ) - SUM*DCONJG( V2 ) 110 CONTINUE END IF END IF * * Since at the start of the QR step we have for M > L * H( K, K-1 ) = H( K, K-1 ) - DCONJG( T1 )*H( K, K-1 ) * then we don't need to do the following * IF( K.EQ.M .AND. M.GT.L ) THEN * If the QR step was started at row M > L because two * consecutive small subdiagonals were found, then H(M,M-1) * must also be updated by a factor of (1-T1). * TEMP = ONE - T1 * H( m, m-1 ) = H( m, m-1 )*DCONJG( TEMP ) * END IF 120 CONTINUE * * Ensure that H(I,I-1) is real. * 130 CONTINUE * * Failure to converge in remaining number of iterations * INFO = I RETURN * 140 CONTINUE * IF( L.EQ.I ) THEN * * H(I,I-1) is negligible: one eigenvalue has converged. * W( I ) = H( I, I ) * ELSE IF( L.EQ.I-1 ) THEN * * H(I-1,I-2) is negligible: a pair of eigenvalues have converged. * * Transform the 2-by-2 submatrix to standard Schur form, * and compute and store the eigenvalues. * CALL ZLANV2( H( I-1, I-1 ), H( I-1, I ), H( I, I-1 ), $ H( I, I ), W( I-1 ), W( I ), CS, SN ) * IF( WANTT ) THEN * * Apply the transformation to the rest of H. * IF( I2.GT.I ) $ CALL ZROT( I2-I, H( I-1, I+1 ), LDH, H( I, I+1 ), LDH, $ CS, SN ) CALL ZROT( I-I1-1, H( I1, I-1 ), 1, H( I1, I ), 1, CS, $ DCONJG( SN ) ) END IF IF( WANTZ ) THEN * * Apply the transformation to Z. * CALL ZROT( NZ, Z( ILOZ, I-1 ), 1, Z( ILOZ, I ), 1, CS, $ DCONJG( SN ) ) END IF * END IF * * Decrement number of remaining iterations, and return to start of * the main loop with new value of I. * ITN = ITN - ITS I = L - 1 GO TO 10 * 150 CONTINUE RETURN * * End of ZLAHQR2 * END scalapack-2.0.2/SRC/zlamov.c000644 000766 000024 00000000256 11745567264 016040 0ustar00juliestaff000000 000000 // // zlamov.c // // Written by Lee Killough 04/19/2012 // #define TYPE complex16 #define FUNC "ZLAMOV" #define LAMOV zlamov_ #define LACPY zlacpy_ #include "lamov.h" scalapack-2.0.2/SRC/zlamsh.f000644 000766 000024 00000022754 10363532303 016015 0ustar00juliestaff000000 000000 SUBROUTINE ZLAMSH( S, LDS, NBULGE, JBLK, H, LDH, N, ULP ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * May 28, 1999 * * .. Scalar Arguments .. INTEGER JBLK, LDH, LDS, N, NBULGE DOUBLE PRECISION ULP * .. * .. Array Arguments .. COMPLEX*16 H( LDH, * ), S( LDS, * ) * .. * * Purpose * ======= * * ZLAMSH sends multiple shifts through a small (single node) matrix to * see how consecutive small subdiagonal elements are modified by * subsequent shifts in an effort to maximize the number of bulges * that can be sent through. * ZLAMSH should only be called when there are multiple shifts/bulges * (NBULGE > 1) and the first shift is starting in the middle of an * unreduced Hessenberg matrix because of two or more consecutive * small subdiagonal elements. * * Arguments * ========= * * S (local input/output) COMPLEX*16 array, ( LDS,* ) * On entry, the matrix of shifts. Only the 2x2 diagonal of S * is referenced. It is assumed that S has JBLK double shifts * (size 2). * On exit, the data is rearranged in the best order for * applying. * * LDS (local input) INTEGER * On entry, the leading dimension of S. Unchanged on exit. * 1 < NBULGE <= JBLK <= LDS/2 * * NBULGE (local input/output) INTEGER * On entry, the number of bulges to send through H ( >1 ). * NBULGE should be less than the maximum determined (JBLK). * 1 < NBULGE <= JBLK <= LDS/2 * On exit, the maximum number of bulges that can be sent * through. * * JBLK (local input) INTEGER * On entry, the number of shifts determined for S. * Unchanged on exit. * * H (local input/output) COMPLEX*16 array ( LDH,N ) * On entry, the local matrix to apply the shifts on. * H should be aligned so that the starting row is 2. * On exit, the data is destroyed. * * LDH (local input) INTEGER * On entry, the leading dimension of H. Unchanged on exit. * * N (local input) INTEGER * On entry, the size of H. If all the bulges are expected to * go through, N should be at least 4*NBULGE+2. * Otherwise, NBULGE may be reduced by this routine. * * ULP (local input) DOUBLE PRECISION * On entry, machine precision * Unchanged on exit. * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION RONE, TEN PARAMETER ( RONE = 1.0D+0, TEN = 10.0D+0 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IBULGE, IVAL, J, K, M, NR DOUBLE PRECISION DVAL, S1, TST1 COMPLEX*16 CDUM, H00, H10, H11, H12, H21, H22, H33, H33S, $ H43H34, H44, H44S, SUM, T1, T2, T3, V1, V2, V3 * .. * .. Local Arrays .. COMPLEX*16 V( 3 ) * .. * .. External Subroutines .. EXTERNAL ZCOPY, ZLARFG * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * M = 2 DO 50 IBULGE = 1, NBULGE H44 = S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2 ) H33 = S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+1 ) H43H34 = S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+2 )* $ S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1 ) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).GT.ULP*TST1 ) THEN * Find minimum DVAL = ( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ) ) / $ ( ULP*TST1 ) IVAL = IBULGE DO 10 I = IBULGE + 1, NBULGE H44 = S( 2*JBLK-2*I+2, 2*JBLK-2*I+2 ) H33 = S( 2*JBLK-2*I+1, 2*JBLK-2*I+1 ) H43H34 = S( 2*JBLK-2*I+1, 2*JBLK-2*I+2 )* $ S( 2*JBLK-2*I+2, 2*JBLK-2*I+1 ) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF( ( DVAL.GT.( CABS1( H10 )*( CABS1( V2 )+ $ CABS1( V3 ) ) ) / ( ULP*TST1 ) ) .AND. $ ( DVAL.GT.RONE ) ) THEN DVAL = ( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ) ) / $ ( ULP*TST1 ) IVAL = I END IF 10 CONTINUE IF( ( DVAL.LT.TEN ) .AND. ( IVAL.NE.IBULGE ) ) THEN H44 = S( 2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+2 ) H33 = S( 2*JBLK-2*IVAL+1, 2*JBLK-2*IVAL+1 ) H43H34 = S( 2*JBLK-2*IVAL+1, 2*JBLK-2*IVAL+2 ) H10 = S( 2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1 ) S( 2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+2 ) = S( 2*JBLK-2* $ IBULGE+2, 2*JBLK-2*IBULGE+2 ) S( 2*JBLK-2*IVAL+1, 2*JBLK-2*IVAL+1 ) = S( 2*JBLK-2* $ IBULGE+1, 2*JBLK-2*IBULGE+1 ) S( 2*JBLK-2*IVAL+1, 2*JBLK-2*IVAL+2 ) = S( 2*JBLK-2* $ IBULGE+1, 2*JBLK-2*IBULGE+2 ) S( 2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1 ) = S( 2*JBLK-2* $ IBULGE+2, 2*JBLK-2*IBULGE+1 ) S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2 ) = H44 S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+1 ) = H33 S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+2 ) = H43H34 S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1 ) = H10 END IF H44 = S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2 ) H33 = S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+1 ) H43H34 = S( 2*JBLK-2*IBULGE+1, 2*JBLK-2*IBULGE+2 )* $ S( 2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1 ) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) END IF IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).GT.TEN*ULP*TST1 ) $ THEN * IBULGE better not be 1 here or we have a bug! NBULGE = MAX( IBULGE-1, 1 ) RETURN END IF DO 40 K = M, N - 1 NR = MIN( 3, N-K+1 ) IF( K.GT.M ) $ CALL ZCOPY( NR, H( K, K-1 ), 1, V, 1 ) CALL ZLARFG( NR, V( 1 ), V( 2 ), 1, T1 ) IF( K.GT.M ) THEN H( K, K-1 ) = V( 1 ) H( K+1, K-1 ) = ZERO IF( K.LT.N-1 ) $ H( K+2, K-1 ) = ZERO ELSE * H(m,m-1) must be updated, * H( K, K-1 ) = H( K, K-1 ) - DCONJG( T1 )*H( K, K-1 ) END IF V2 = V( 2 ) T2 = T1*V2 IF( NR.EQ.3 ) THEN V3 = V( 3 ) T3 = T1*V3 DO 20 J = K, N SUM = DCONJG( T1 )*H( K, J ) + $ DCONJG( T2 )*H( K+1, J ) + $ DCONJG( T3 )*H( K+2, J ) H( K, J ) = H( K, J ) - SUM H( K+1, J ) = H( K+1, J ) - SUM*V2 H( K+2, J ) = H( K+2, J ) - SUM*V3 20 CONTINUE DO 30 J = 1, MIN( K+3, N ) SUM = T1*H( J, K ) + T2*H( J, K+1 ) + T3*H( J, K+2 ) H( J, K ) = H( J, K ) - SUM H( J, K+1 ) = H( J, K+1 ) - SUM*DCONJG( V2 ) H( J, K+2 ) = H( J, K+2 ) - SUM*DCONJG( V3 ) 30 CONTINUE END IF 40 CONTINUE 50 CONTINUE * RETURN * * End of ZLAMSH * END scalapack-2.0.2/SRC/zlanv2.f000644 000766 000024 00000006740 10363532303 015730 0ustar00juliestaff000000 000000 SUBROUTINE ZLANV2( A, B, C, D, RT1, RT2, CS, SN ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * May 28, 1999 * * .. Scalar Arguments .. DOUBLE PRECISION CS COMPLEX*16 A, B, C, D, RT1, RT2, SN * .. * * Purpose * ======= * * ZLANV2 computes the Schur factorization of a complex 2-by-2 * nonhermitian matrix in standard form: * * [ A B ] = [ CS -SN ] [ AA BB ] [ CS SN ] * [ C D ] [ SN CS ] [ 0 DD ] [-SN CS ] * * Arguments * ========= * * A (input/output) COMPLEX*16 * B (input/output) COMPLEX*16 * C (input/output) COMPLEX*16 * D (input/output) COMPLEX*16 * On entry, the elements of the input matrix. * On exit, they are overwritten by the elements of the * standardised Schur form. * * RT1 (output) COMPLEX*16 * RT2 (output) COMPLEX*16 * The two eigenvalues. * * CS (output) DOUBLE PRECISION * SN (output) COMPLEX*16 * Parameters of the rotation matrix. * * Further Details * =============== * * Implemented by Mark R. Fahey, May 28, 1999 * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION RZERO, HALF, RONE PARAMETER ( RZERO = 0.0D+0, HALF = 0.5D+0, $ RONE = 1.0D+0 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. COMPLEX*16 AA, BB, DD, T, TEMP, TEMP2, U, X, Y * .. * .. External Functions .. COMPLEX*16 ZLADIV EXTERNAL ZLADIV * .. * .. External Subroutines .. EXTERNAL ZLARTG * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, DIMAG, SQRT * .. * .. Executable Statements .. * * Initialize CS and SN * CS = RONE SN = ZERO * IF( C.EQ.ZERO ) THEN GO TO 10 * ELSE IF( B.EQ.ZERO ) THEN * * Swap rows and columns * CS = RZERO SN = ONE TEMP = D D = A A = TEMP B = -C C = ZERO GO TO 10 ELSE IF( ( A-D ).EQ.ZERO ) THEN TEMP = SQRT( B*C ) A = A + TEMP D = D - TEMP IF( ( B+C ).EQ.ZERO ) THEN CS = SQRT( HALF ) SN = DCMPLX( RZERO, RONE )*CS ELSE TEMP = SQRT( B+C ) TEMP2 = ZLADIV( SQRT( B ), TEMP ) CS = DBLE( TEMP2 ) SN = ZLADIV( SQRT( C ), TEMP ) END IF B = B - C C = ZERO GO TO 10 ELSE * * Compute eigenvalue closest to D * T = D U = B*C X = HALF*( A-T ) Y = SQRT( X*X+U ) IF( DBLE( X )*DBLE( Y )+DIMAG( X )*DIMAG( Y ).LT.RZERO ) $ Y = -Y T = T - ZLADIV( U, ( X+Y ) ) * * Do one QR step with exact shift T - resulting 2 x 2 in * triangular form. * CALL ZLARTG( A-T, C, CS, SN, AA ) * D = D - T BB = CS*B + SN*D DD = -DCONJG( SN )*B + CS*D * A = AA*CS + BB*DCONJG( SN ) + T B = -AA*SN + BB*CS C = ZERO D = T * END IF * 10 CONTINUE * * Store eigenvalues in RT1 and RT2. * RT1 = A RT2 = D RETURN * * End of ZLANV2 * END scalapack-2.0.2/SRC/zlaref.f000644 000766 000024 00000031603 10363532303 015773 0ustar00juliestaff000000 000000 SUBROUTINE ZLAREF( TYPE, A, LDA, WANTZ, Z, LDZ, BLOCK, IROW1, $ ICOL1, ISTART, ISTOP, ITMP1, ITMP2, LILOZ, $ LIHIZ, VECS, V2, V3, T1, T2, T3 ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * May 28, 1999 * * .. Scalar Arguments .. LOGICAL BLOCK, WANTZ CHARACTER TYPE INTEGER ICOL1, IROW1, ISTART, ISTOP, ITMP1, ITMP2, LDA, $ LDZ, LIHIZ, LILOZ COMPLEX*16 T1, T2, T3, V2, V3 * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), VECS( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * ZLAREF applies one or several Householder reflectors of size 3 * to one or two matrices (if column is specified) on either their * rows or columns. * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * If 'R': Apply reflectors to the rows of the matrix * (apply from left) * Otherwise: Apply reflectors to the columns of the matrix * Unchanged on exit. * * A (global input/output) COMPLEX*16 array, (LDA,*) * On entry, the matrix to receive the reflections. * The updated matrix on exit. * * LDA (local input) INTEGER * On entry, the leading dimension of A. Unchanged on exit. * * WANTZ (global input) LOGICAL * If .TRUE., then apply any column reflections to Z as well. * If .FALSE., then do no additional work on Z. * * Z (global input/output) COMPLEX*16 array, (LDZ,*) * On entry, the second matrix to receive column reflections. * This is changed only if WANTZ is set. * * LDZ (local input) INTEGER * On entry, the leading dimension of Z. Unchanged on exit. * * BLOCK (global input) LOGICAL * If .TRUE., then apply several reflectors at once and read * their data from the VECS array. * If .FALSE., apply the single reflector given by V2, V3, * T1, T2, and T3. * * IROW1 (local input/output) INTEGER * On entry, the local row element of A. * Undefined on output. * * * ICOL1 (local input/output) INTEGER * On entry, the local column element of A. * Undefined on output. * * ISTART (global input) INTEGER * Specifies the "number" of the first reflector. This is * used as an index into VECS if BLOCK is set. * ISTART is ignored if BLOCK is .FALSE.. * * ISTOP (global input) INTEGER * Specifies the "number" of the last reflector. This is * used as an index into VECS if BLOCK is set. * ISTOP is ignored if BLOCK is .FALSE.. * * ITMP1 (local input) INTEGER * Starting range into A. For rows, this is the local * first column. For columns, this is the local first row. * * ITMP2 (local input) INTEGER * Ending range into A. For rows, this is the local last * column. For columns, this is the local last row. * * LILOZ * LIHIZ (local input) INTEGER * These serve the same purpose as ITMP1,ITMP2 but for Z * when WANTZ is set. * * VECS (global input) COMPLEX*16 array of size 3*N (matrix size) * This holds the size 3 reflectors one after another and this * is only accessed when BLOCK is .TRUE. * * V2 * V3 * T1 * T2 * T3 (global input/output) COMPLEX*16 * This holds information on a single size 3 Householder * reflector and is read when BLOCK is .FALSE., and * overwritten when BLOCK is .TRUE. * * Further Details * =============== * * Implemented by: M. Fahey, May 28, 1999 * * ===================================================================== * * .. Local Scalars .. INTEGER J, K COMPLEX*16 A1, A11, A2, A22, A3, A4, A5, B1, B2, B3, B4, $ B5, H11, H22, SUM, SUM1, SUM2, SUM3, T12, T13, $ T22, T23, T32, T33, TMP1, TMP2, TMP3, V22, V23, $ V32, V33 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DCONJG, MOD * .. * .. Executable Statements .. * IF( LSAME( TYPE, 'R' ) ) THEN IF( BLOCK ) THEN DO 30 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 10 J = ITMP1, ITMP2 - MOD( ITMP2-ITMP1+1, 2 ), 2 A1 = A( IROW1, J ) A2 = A( IROW1+1, J ) A3 = A( IROW1+2, J ) A4 = A( IROW1+3, J ) A5 = A( IROW1+4, J ) B1 = A( IROW1, J+1 ) B2 = A( IROW1+1, J+1 ) B3 = A( IROW1+2, J+1 ) B4 = A( IROW1+3, J+1 ) B5 = A( IROW1+4, J+1 ) SUM1 = DCONJG( T1 )*A1 + DCONJG( T2 )*A2 + $ DCONJG( T3 )*A3 A( IROW1, J ) = A1 - SUM1 H11 = A2 - SUM1*V2 H22 = A3 - SUM1*V3 TMP1 = DCONJG( T1 )*B1 + DCONJG( T2 )*B2 + $ DCONJG( T3 )*B3 A( IROW1, J+1 ) = B1 - TMP1 A11 = B2 - TMP1*V2 A22 = B3 - TMP1*V3 SUM2 = DCONJG( T12 )*H11 + DCONJG( T22 )*H22 + $ DCONJG( T32 )*A4 A( IROW1+1, J ) = H11 - SUM2 H11 = H22 - SUM2*V22 H22 = A4 - SUM2*V32 TMP2 = DCONJG( T12 )*A11 + DCONJG( T22 )*A22 + $ DCONJG( T32 )*B4 A( IROW1+1, J+1 ) = A11 - TMP2 A11 = A22 - TMP2*V22 A22 = B4 - TMP2*V32 SUM3 = DCONJG( T13 )*H11 + DCONJG( T23 )*H22 + $ DCONJG( T33 )*A5 A( IROW1+2, J ) = H11 - SUM3 A( IROW1+3, J ) = H22 - SUM3*V23 A( IROW1+4, J ) = A5 - SUM3*V33 TMP3 = DCONJG( T13 )*A11 + DCONJG( T23 )*A22 + $ DCONJG( T33 )*B5 A( IROW1+2, J+1 ) = A11 - TMP3 A( IROW1+3, J+1 ) = A22 - TMP3*V23 A( IROW1+4, J+1 ) = B5 - TMP3*V33 10 CONTINUE DO 20 J = ITMP2 - MOD( ITMP2-ITMP1+1, 2 ) + 1, ITMP2 SUM = DCONJG( T1 )*A( IROW1, J ) + $ DCONJG( T2 )*A( IROW1+1, J ) + $ DCONJG( T3 )*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM H11 = A( IROW1+1, J ) - SUM*V2 H22 = A( IROW1+2, J ) - SUM*V3 SUM = DCONJG( T12 )*H11 + DCONJG( T22 )*H22 + $ DCONJG( T32 )*A( IROW1+3, J ) A( IROW1+1, J ) = H11 - SUM H11 = H22 - SUM*V22 H22 = A( IROW1+3, J ) - SUM*V32 SUM = DCONJG( T13 )*H11 + DCONJG( T23 )*H22 + $ DCONJG( T33 )*A( IROW1+4, J ) A( IROW1+2, J ) = H11 - SUM A( IROW1+3, J ) = H22 - SUM*V23 A( IROW1+4, J ) = A( IROW1+4, J ) - SUM*V33 20 CONTINUE IROW1 = IROW1 + 3 30 CONTINUE DO 50 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 40 J = ITMP1, ITMP2 SUM = DCONJG( T1 )*A( IROW1, J ) + $ DCONJG( T2 )*A( IROW1+1, J ) + $ DCONJG( T3 )*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*V2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*V3 40 CONTINUE IROW1 = IROW1 + 1 50 CONTINUE ELSE DO 60 J = ITMP1, ITMP2 SUM = DCONJG( T1 )*A( IROW1, J ) + $ DCONJG( T2 )*A( IROW1+1, J ) + $ DCONJG( T3 )*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*V2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*V3 60 CONTINUE END IF ELSE * * Do column transforms * IF( BLOCK ) THEN DO 90 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 70 J = ITMP1, ITMP2 SUM = T1*A( J, ICOL1 ) + T2*A( J, ICOL1+1 ) + $ T3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM H11 = A( J, ICOL1+1 ) - SUM*DCONJG( V2 ) H22 = A( J, ICOL1+2 ) - SUM*DCONJG( V3 ) SUM = T12*H11 + T22*H22 + T32*A( J, ICOL1+3 ) A( J, ICOL1+1 ) = H11 - SUM H11 = H22 - SUM*DCONJG( V22 ) H22 = A( J, ICOL1+3 ) - SUM*DCONJG( V32 ) SUM = T13*H11 + T23*H22 + T33*A( J, ICOL1+4 ) A( J, ICOL1+2 ) = H11 - SUM A( J, ICOL1+3 ) = H22 - SUM*DCONJG( V23 ) A( J, ICOL1+4 ) = A( J, ICOL1+4 ) - SUM*DCONJG( V33 ) 70 CONTINUE IF( WANTZ ) THEN DO 80 J = LILOZ, LIHIZ SUM = T1*Z( J, ICOL1 ) + T2*Z( J, ICOL1+1 ) + $ T3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM H11 = Z( J, ICOL1+1 ) - SUM*DCONJG( V2 ) H22 = Z( J, ICOL1+2 ) - SUM*DCONJG( V3 ) SUM = T12*H11 + T22*H22 + T32*Z( J, ICOL1+3 ) Z( J, ICOL1+1 ) = H11 - SUM H11 = H22 - SUM*DCONJG( V22 ) H22 = Z( J, ICOL1+3 ) - SUM*DCONJG( V32 ) SUM = T13*H11 + T23*H22 + T33*Z( J, ICOL1+4 ) Z( J, ICOL1+2 ) = H11 - SUM Z( J, ICOL1+3 ) = H22 - SUM*DCONJG( V23 ) Z( J, ICOL1+4 ) = Z( J, ICOL1+4 ) - $ SUM*DCONJG( V33 ) 80 CONTINUE END IF ICOL1 = ICOL1 + 3 90 CONTINUE DO 120 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 100 J = ITMP1, ITMP2 SUM = T1*A( J, ICOL1 ) + T2*A( J, ICOL1+1 ) + $ T3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*DCONJG( V2 ) A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*DCONJG( V3 ) 100 CONTINUE IF( WANTZ ) THEN DO 110 J = LILOZ, LIHIZ SUM = T1*Z( J, ICOL1 ) + T2*Z( J, ICOL1+1 ) + $ T3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM Z( J, ICOL1+1 ) = Z( J, ICOL1+1 ) - $ SUM*DCONJG( V2 ) Z( J, ICOL1+2 ) = Z( J, ICOL1+2 ) - $ SUM*DCONJG( V3 ) 110 CONTINUE END IF ICOL1 = ICOL1 + 1 120 CONTINUE ELSE DO 130 J = ITMP1, ITMP2 SUM = T1*A( J, ICOL1 ) + T2*A( J, ICOL1+1 ) + $ T3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*DCONJG( V2 ) A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*DCONJG( V3 ) 130 CONTINUE END IF END IF RETURN * * End of ZLAREF * END scalapack-2.0.2/SRC/zpttrsv.f000644 000766 000024 00000011415 11657111056 016250 0ustar00juliestaff000000 000000 SUBROUTINE ZPTTRSV( UPLO, TRANS, N, NRHS, D, E, B, LDB, $ INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * * Written by Andrew J. Cleary, University of Tennessee. * November, 1996. * Modified from ZPTTRS: * -- LAPACK routine (preliminary version) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * * .. Scalar Arguments .. CHARACTER UPLO, TRANS INTEGER INFO, LDB, N, NRHS * .. * .. Array Arguments .. DOUBLE PRECISION D( * ) COMPLEX*16 B( LDB, * ), E( * ) * .. * * Purpose * ======= * * ZPTTRSV solves one of the triangular systems * L * X = B, or L**H * X = B, * U * X = B, or U**H * X = B, * where L or U is the Cholesky factor of a Hermitian positive * definite tridiagonal matrix A such that * A = U**H*D*U or A = L*D*L**H (computed by ZPTTRF). * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the superdiagonal or the subdiagonal * of the tridiagonal matrix A is stored and the form of the * factorization: * = 'U': E is the superdiagonal of U, and A = U'*D*U; * = 'L': E is the subdiagonal of L, and A = L*D*L'. * (The two forms are equivalent if A is real.) * * TRANS (input) CHARACTER * Specifies the form of the system of equations: * = 'N': L * X = B (No transpose) * = 'N': L * X = B (No transpose) * = 'C': U**H * X = B (Conjugate transpose) * = 'C': L**H * X = B (Conjugate transpose) * * N (input) INTEGER * The order of the tridiagonal matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * D (input) REAL array, dimension (N) * The n diagonal elements of the diagonal matrix D from the * factorization computed by ZPTTRF. * * E (input) COMPLEX array, dimension (N-1) * The (n-1) off-diagonal elements of the unit bidiagonal * factor U or L from the factorization computed by ZPTTRF * (see UPLO). * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Local Scalars .. LOGICAL NOTRAN, UPPER INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX * .. * .. Executable Statements .. * * Test the input arguments. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( NRHS.LT.0 ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZPTTRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * IF( .NOT.NOTRAN ) THEN * DO 30 J = 1, NRHS * * Solve U**T (or H) * x = b. * DO 10 I = 2, N B( I, J ) = B( I, J ) - B( I-1, J )*DCONJG( E( I-1 ) ) 10 CONTINUE 30 CONTINUE * ELSE * DO 35 J = 1, NRHS * * Solve U * x = b. * DO 20 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - B( I+1, J )*E( I ) 20 CONTINUE 35 CONTINUE ENDIF * ELSE * IF( NOTRAN ) THEN * DO 60 J = 1, NRHS * * Solve L * x = b. * DO 40 I = 2, N B( I, J ) = B( I, J ) - B( I-1, J )*E( I-1 ) 40 CONTINUE 60 CONTINUE * ELSE * DO 65 J = 1, NRHS * * Solve L**H * x = b. * DO 50 I = N - 1, 1, -1 B( I, J ) = B( I, J ) - $ B( I+1, J )*DCONJG( E( I ) ) 50 CONTINUE 65 CONTINUE ENDIF * END IF * RETURN * * End of ZPTTRS * END scalapack-2.0.2/SRC/zsteqr2.f000644 000766 000024 00000044577 10363532303 016140 0ustar00juliestaff000000 000000 SUBROUTINE ZSTEQR2( COMPZ, N, D, E, Z, LDZ, NR, WORK, INFO ) * * -- ScaLAPACK routine (version 1.7) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * November 15, 1997 * * .. Scalar Arguments .. CHARACTER COMPZ INTEGER INFO, LDZ, N, NR * .. * .. Array Arguments .. DOUBLE PRECISION D( * ), E( * ), WORK( * ) COMPLEX*16 Z( LDZ, * ) * .. * * Purpose * ======= * * ZSTEQR2 is a modified version of LAPACK routine ZSTEQR. * ZSTEQR2 computes all eigenvalues and, optionally, eigenvectors of a * symmetric tridiagonal matrix using the implicit QL or QR method. * ZSTEQR2 is modified from ZSTEQR to allow each ScaLAPACK process * running ZSTEQR2 to perform updates on a distributed matrix Q. * Proper usage of ZSTEQR2 can be gleaned from * examination of ScaLAPACK's * PZHEEV. * ZSTEQR2 incorporates changes attributed to Greg Henry. * * Arguments * ========= * * COMPZ (input) CHARACTER*1 * = 'N': Compute eigenvalues only. * = 'I': Compute eigenvalues and eigenvectors of the * tridiagonal matrix. Z must be initialized to the * identity matrix by PZLASET or ZLASET prior * to entering this subroutine. * * N (input) INTEGER * The order of the matrix. N >= 0. * * D (input/output) DOUBLE PRECISION array, dimension (N) * On entry, the diagonal elements of the tridiagonal matrix. * On exit, if INFO = 0, the eigenvalues in ascending order. * * E (input/output) DOUBLE PRECISION array, dimension (N-1) * On entry, the (n-1) subdiagonal elements of the tridiagonal * matrix. * On exit, E has been destroyed. * * Z (local input/local output) COMPLEX*16 array, global * dimension (N, N), local dimension (LDZ, NR). * On entry, if COMPZ = 'V', then Z contains the orthogonal * matrix used in the reduction to tridiagonal form. * On exit, if INFO = 0, then if COMPZ = 'V', Z contains the * orthonormal eigenvectors of the original symmetric matrix, * and if COMPZ = 'I', Z contains the orthonormal eigenvectors * of the symmetric tridiagonal matrix. * If COMPZ = 'N', then Z is not referenced. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= 1, and if * eigenvectors are desired, then LDZ >= max(1,N). * * NR (input) INTEGER * NR = MAX(1, NUMROC( N, NB, MYPROW, 0, NPROCS ) ). * If COMPZ = 'N', then NR is not referenced. * * WORK (workspace) DOUBLE PRECISION array, dimension (max(1,2*N-2)) * If COMPZ = 'N', then WORK is not referenced. * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: the algorithm has failed to find all the eigenvalues in * a total of 30*N iterations; if INFO = i, then i * elements of E have not converged to zero; on exit, D * and E contain the elements of a symmetric tridiagonal * matrix which is orthogonally similar to the original * matrix. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, TWO, THREE, HALF PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, TWO = 2.0D0, $ THREE = 3.0D0, HALF = 0.5D0 ) COMPLEX*16 CONE PARAMETER ( CONE = ( 1.0D0, 1.0D0 ) ) INTEGER MAXIT, NMAXLOOK PARAMETER ( MAXIT = 30, NMAXLOOK = 15 ) * .. * .. Local Scalars .. INTEGER I, ICOMPZ, II, ILAST, ISCALE, J, JTOT, K, L, $ L1, LEND, LENDM1, LENDP1, LENDSV, LM1, LSV, M, $ MM, MM1, NLOOK, NM1, NMAXIT DOUBLE PRECISION ANORM, B, C, EPS, EPS2, F, G, GP, OLDEL, OLDGP, $ OLDRP, P, R, RP, RT1, RT2, S, SAFMAX, SAFMIN, $ SSFMAX, SSFMIN, TST, TST1 * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANST, DLAPY2 EXTERNAL LSAME, DLAMCH, DLANST, DLAPY2 * .. * .. External Subroutines .. EXTERNAL DLAEV2, DLARTG, DLASCL, DSTERF, XERBLA, ZLASR, $ ZSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SIGN, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * ILAST = 0 INFO = 0 * IF( LSAME( COMPZ, 'N' ) ) THEN ICOMPZ = 0 ELSEIF( LSAME( COMPZ, 'I' ) ) THEN ICOMPZ = 1 ELSE ICOMPZ = -1 ENDIF IF( ICOMPZ.LT.0 ) THEN INFO = -1 ELSEIF( N.LT.0 ) THEN INFO = -2 ELSEIF( ICOMPZ.GT.0 .AND. LDZ.LT.MAX( 1, NR ) ) THEN INFO = -6 ENDIF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZSTEQR2', -INFO ) RETURN ENDIF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * If eigenvectors aren't not desired, this is faster * IF( ICOMPZ.EQ.0 ) THEN CALL DSTERF( N, D, E, INFO ) RETURN ENDIF * IF( N.EQ.1 ) THEN Z( 1, 1 ) = CONE RETURN ENDIF * * Determine the unit roundoff and over/underflow thresholds. * EPS = DLAMCH( 'E' ) EPS2 = EPS**2 SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN SSFMAX = SQRT( SAFMAX ) / THREE SSFMIN = SQRT( SAFMIN ) / EPS2 * * Compute the eigenvalues and eigenvectors of the tridiagonal * matrix. * NMAXIT = N*MAXIT JTOT = 0 * * Determine where the matrix splits and choose QL or QR iteration * for each block, according to whether top or bottom diagonal * element is smaller. * L1 = 1 NM1 = N - 1 * 10 CONTINUE IF( L1.GT.N ) $ GOTO 220 IF( L1.GT.1 ) $ E( L1-1 ) = ZERO IF( L1.LE.NM1 ) THEN DO 20 M = L1, NM1 TST = ABS( E( M ) ) IF( TST.EQ.ZERO ) $ GOTO 30 IF( TST.LE.( SQRT( ABS( D( M ) ) )*SQRT( ABS( D( M+ $ 1 ) ) ) )*EPS ) THEN E( M ) = ZERO GOTO 30 ENDIF 20 CONTINUE ENDIF M = N * 30 CONTINUE L = L1 LSV = L LEND = M LENDSV = LEND L1 = M + 1 IF( LEND.EQ.L ) $ GOTO 10 * * Scale submatrix in rows and columns L to LEND * ANORM = DLANST( 'I', LEND-L+1, D( L ), E( L ) ) ISCALE = 0 IF( ANORM.EQ.ZERO ) $ GOTO 10 IF( ANORM.GT.SSFMAX ) THEN ISCALE = 1 CALL DLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L+1, 1, D( L ), N, $ INFO ) CALL DLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L, 1, E( L ), N, $ INFO ) ELSEIF( ANORM.LT.SSFMIN ) THEN ISCALE = 2 CALL DLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L+1, 1, D( L ), N, $ INFO ) CALL DLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L, 1, E( L ), N, $ INFO ) ENDIF * * Choose between QL and QR iteration * IF( ABS( D( LEND ) ).LT.ABS( D( L ) ) ) THEN LEND = LSV L = LENDSV ENDIF * IF( LEND.GT.L ) THEN * * QL Iteration * * Look for small subdiagonal element. * 40 CONTINUE IF( L.NE.LEND ) THEN LENDM1 = LEND - 1 DO 50 M = L, LENDM1 TST = ABS( E( M ) )**2 IF( TST.LE.( EPS2*ABS( D( M ) ) )*ABS( D( M+1 ) )+ $ SAFMIN )GOTO 60 50 CONTINUE ENDIF * M = LEND * 60 CONTINUE IF( M.LT.LEND ) $ E( M ) = ZERO P = D( L ) IF( M.EQ.L ) $ GOTO 110 * * If remaining matrix is 2-by-2, use DLAE2 or DLAEV2 * to compute its eigensystem. * IF( M.EQ.L+1 ) THEN CALL DLAEV2( D( L ), E( L ), D( L+1 ), RT1, RT2, C, S ) WORK( L ) = C WORK( N-1+L ) = S CALL ZLASR( 'R', 'V', 'B', NR, 2, WORK( L ), WORK( N-1+L ), $ Z( 1, L ), LDZ ) D( L ) = RT1 D( L+1 ) = RT2 E( L ) = ZERO L = L + 2 IF( L.LE.LEND ) $ GOTO 40 GOTO 200 ENDIF * IF( JTOT.EQ.NMAXIT ) $ GOTO 200 JTOT = JTOT + 1 * * Form shift. * G = ( D( L+1 )-P ) / ( TWO*E( L ) ) R = DLAPY2( G, ONE ) G = D( M ) - P + ( E( L ) / ( G+SIGN( R, G ) ) ) * IF( ICOMPZ.EQ.0 ) THEN * Do not do a lookahead! GOTO 90 ENDIF * OLDEL = ABS( E( L ) ) GP = G RP = R TST = ABS( E( L ) )**2 TST = TST / ( ( EPS2*ABS( D( L ) ) )*ABS( D( L+1 ) )+SAFMIN ) * NLOOK = 1 IF( ( TST.GT.ONE ) .AND. ( NLOOK.LE.NMAXLOOK ) ) THEN 70 CONTINUE * * This is the lookahead loop, going until we have * convergence or too many steps have been taken. * S = ONE C = ONE P = ZERO MM1 = M - 1 DO 80 I = MM1, L, -1 F = S*E( I ) B = C*E( I ) CALL DLARTG( GP, F, C, S, RP ) GP = D( I+1 ) - P RP = ( D( I )-GP )*S + TWO*C*B P = S*RP IF( I.NE.L ) $ GP = C*RP - B 80 CONTINUE OLDGP = GP OLDRP = RP * Find GP & RP for the next iteration IF( ABS( C*OLDRP-B ).GT.SAFMIN ) THEN GP = ( ( OLDGP+P )-( D( L )-P ) ) / ( TWO*( C*OLDRP-B ) ) ELSE * * Goto put in by G. Henry to fix ALPHA problem * GOTO 90 * GP = ( ( OLDGP+P )-( D( L )-P ) ) / * $ ( TWO*( C*OLDRP-B )+SAFMIN ) ENDIF RP = DLAPY2( GP, ONE ) GP = D( M ) - ( D( L )-P ) + $ ( ( C*OLDRP-B ) / ( GP+SIGN( RP, GP ) ) ) TST1 = TST TST = ABS( C*OLDRP-B )**2 TST = TST / ( ( EPS2*ABS( D( L )-P ) )*ABS( OLDGP+P )+ $ SAFMIN ) * Make sure that we are making progress IF( ABS( C*OLDRP-B ).GT.0.9D0*OLDEL ) THEN IF( ABS( C*OLDRP-B ).GT.OLDEL ) THEN GP = G RP = R ENDIF TST = HALF ELSE OLDEL = ABS( C*OLDRP-B ) ENDIF NLOOK = NLOOK + 1 IF( ( TST.GT.ONE ) .AND. ( NLOOK.LE.NMAXLOOK ) ) $ GOTO 70 ENDIF * IF( ( TST.LE.ONE ) .AND. ( TST.NE.HALF ) .AND. $ ( ABS( P ).LT.EPS*ABS( D( L ) ) ) .AND. $ ( ILAST.EQ.L ) .AND. ( ABS( E( L ) )**2.LE.10000.0D0* $ ( ( EPS2*ABS( D( L ) ) )*ABS( D( L+1 ) )+SAFMIN ) ) ) THEN * * Skip the current step: the subdiagonal info is just noise. * M = L E( M ) = ZERO P = D( L ) JTOT = JTOT - 1 GOTO 110 ENDIF G = GP R = RP * * Lookahead over * 90 CONTINUE * S = ONE C = ONE P = ZERO * * Inner loop * MM1 = M - 1 DO 100 I = MM1, L, -1 F = S*E( I ) B = C*E( I ) CALL DLARTG( G, F, C, S, R ) IF( I.NE.M-1 ) $ E( I+1 ) = R G = D( I+1 ) - P R = ( D( I )-G )*S + TWO*C*B P = S*R D( I+1 ) = G + P G = C*R - B * * If eigenvectors are desired, then save rotations. * WORK( I ) = C WORK( N-1+I ) = -S * 100 CONTINUE * * If eigenvectors are desired, then apply saved rotations. * MM = M - L + 1 CALL ZLASR( 'R', 'V', 'B', NR, MM, WORK( L ), WORK( N-1+L ), $ Z( 1, L ), LDZ ) * D( L ) = D( L ) - P E( L ) = G ILAST = L GOTO 40 * * Eigenvalue found. * 110 CONTINUE D( L ) = P * L = L + 1 IF( L.LE.LEND ) $ GOTO 40 GOTO 200 * ELSE * * QR Iteration * * Look for small superdiagonal element. * 120 CONTINUE IF( L.NE.LEND ) THEN LENDP1 = LEND + 1 DO 130 M = L, LENDP1, -1 TST = ABS( E( M-1 ) )**2 IF( TST.LE.( EPS2*ABS( D( M ) ) )*ABS( D( M-1 ) )+ $ SAFMIN )GOTO 140 130 CONTINUE ENDIF * M = LEND * 140 CONTINUE IF( M.GT.LEND ) $ E( M-1 ) = ZERO P = D( L ) IF( M.EQ.L ) $ GOTO 190 * * If remaining matrix is 2-by-2, use DLAE2 or DLAEV2 * to compute its eigensystem. * IF( M.EQ.L-1 ) THEN CALL DLAEV2( D( L-1 ), E( L-1 ), D( L ), RT1, RT2, C, S ) WORK( M ) = C WORK( N-1+M ) = S CALL ZLASR( 'R', 'V', 'F', NR, 2, WORK( M ), WORK( N-1+M ), $ Z( 1, L-1 ), LDZ ) D( L-1 ) = RT1 D( L ) = RT2 E( L-1 ) = ZERO L = L - 2 IF( L.GE.LEND ) $ GOTO 120 GOTO 200 ENDIF * IF( JTOT.EQ.NMAXIT ) $ GOTO 200 JTOT = JTOT + 1 * * Form shift. * G = ( D( L-1 )-P ) / ( TWO*E( L-1 ) ) R = DLAPY2( G, ONE ) G = D( M ) - P + ( E( L-1 ) / ( G+SIGN( R, G ) ) ) * IF( ICOMPZ.EQ.0 ) THEN * Do not do a lookahead! GOTO 170 ENDIF * OLDEL = ABS( E( L-1 ) ) GP = G RP = R TST = ABS( E( L-1 ) )**2 TST = TST / ( ( EPS2*ABS( D( L ) ) )*ABS( D( L-1 ) )+SAFMIN ) NLOOK = 1 IF( ( TST.GT.ONE ) .AND. ( NLOOK.LE.NMAXLOOK ) ) THEN 150 CONTINUE * * This is the lookahead loop, going until we have * convergence or too many steps have been taken. * S = ONE C = ONE P = ZERO * * Inner loop * LM1 = L - 1 DO 160 I = M, LM1 F = S*E( I ) B = C*E( I ) CALL DLARTG( GP, F, C, S, RP ) GP = D( I ) - P RP = ( D( I+1 )-GP )*S + TWO*C*B P = S*RP IF( I.LT.LM1 ) $ GP = C*RP - B 160 CONTINUE OLDGP = GP OLDRP = RP * Find GP & RP for the next iteration IF( ABS( C*OLDRP-B ).GT.SAFMIN ) THEN GP = ( ( OLDGP+P )-( D( L )-P ) ) / ( TWO*( C*OLDRP-B ) ) ELSE * * Goto put in by G. Henry to fix ALPHA problem * GOTO 170 * GP = ( ( OLDGP+P )-( D( L )-P ) ) / * $ ( TWO*( C*OLDRP-B )+SAFMIN ) ENDIF RP = DLAPY2( GP, ONE ) GP = D( M ) - ( D( L )-P ) + $ ( ( C*OLDRP-B ) / ( GP+SIGN( RP, GP ) ) ) TST1 = TST TST = ABS( ( C*OLDRP-B ) )**2 TST = TST / ( ( EPS2*ABS( D( L )-P ) )*ABS( OLDGP+P )+ $ SAFMIN ) * Make sure that we are making progress IF( ABS( C*OLDRP-B ).GT.0.9D0*OLDEL ) THEN IF( ABS( C*OLDRP-B ).GT.OLDEL ) THEN GP = G RP = R ENDIF TST = HALF ELSE OLDEL = ABS( C*OLDRP-B ) ENDIF NLOOK = NLOOK + 1 IF( ( TST.GT.ONE ) .AND. ( NLOOK.LE.NMAXLOOK ) ) $ GOTO 150 ENDIF IF( ( TST.LE.ONE ) .AND. ( TST.NE.HALF ) .AND. $ ( ABS( P ).LT.EPS*ABS( D( L ) ) ) .AND. $ ( ILAST.EQ.L ) .AND. ( ABS( E( L-1 ) )**2.LE.10000.0D0* $ ( ( EPS2*ABS( D( L-1 ) ) )*ABS( D( L ) )+SAFMIN ) ) ) THEN * * Skip the current step: the subdiagonal info is just noise. * M = L E( M-1 ) = ZERO P = D( L ) JTOT = JTOT - 1 GOTO 190 ENDIF * G = GP R = RP * * Lookahead over * 170 CONTINUE * S = ONE C = ONE P = ZERO DO 180 I = M, LM1 F = S*E( I ) B = C*E( I ) CALL DLARTG( G, F, C, S, R ) IF( I.NE.M ) $ E( I-1 ) = R G = D( I ) - P R = ( D( I+1 )-G )*S + TWO*C*B P = S*R D( I ) = G + P G = C*R - B * * If eigenvectors are desired, then save rotations. * WORK( I ) = C WORK( N-1+I ) = S * 180 CONTINUE * * If eigenvectors are desired, then apply saved rotations. * MM = L - M + 1 CALL ZLASR( 'R', 'V', 'F', NR, MM, WORK( M ), WORK( N-1+M ), $ Z( 1, M ), LDZ ) * D( L ) = D( L ) - P E( LM1 ) = G ILAST = L GOTO 120 * * Eigenvalue found. * 190 CONTINUE D( L ) = P * L = L - 1 IF( L.GE.LEND ) $ GOTO 120 GOTO 200 * ENDIF * * Undo scaling if necessary * 200 CONTINUE IF( ISCALE.EQ.1 ) THEN CALL DLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV+1, 1, $ D( LSV ), N, INFO ) CALL DLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV, 1, E( LSV ), $ N, INFO ) ELSEIF( ISCALE.EQ.2 ) THEN CALL DLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV+1, 1, $ D( LSV ), N, INFO ) CALL DLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV, 1, E( LSV ), $ N, INFO ) ENDIF * * Check for no convergence to an eigenvalue after a total * of N*MAXIT iterations. * IF( JTOT.LT.NMAXIT ) $ GOTO 10 DO 210 I = 1, N - 1 IF( E( I ).NE.ZERO ) $ INFO = INFO + 1 210 CONTINUE GOTO 250 * * Order eigenvalues and eigenvectors. * 220 CONTINUE * * Use Selection Sort to minimize swaps of eigenvectors * DO 240 II = 2, N I = II - 1 K = I P = D( I ) DO 230 J = II, N IF( D( J ).LT.P ) THEN K = J P = D( J ) ENDIF 230 CONTINUE IF( K.NE.I ) THEN D( K ) = D( I ) D( I ) = P CALL ZSWAP( NR, Z( 1, I ), 1, Z( 1, K ), 1 ) ENDIF 240 CONTINUE * 250 CONTINUE * WRITE( *, FMT = * )'JTOT', JTOT RETURN * * End of DSTEQR2 * END scalapack-2.0.2/SRC/ztrmvt.f000644 000766 000024 00000011172 10363532303 016055 0ustar00juliestaff000000 000000 SUBROUTINE ZTRMVT( UPLO, N, T, LDT, X, INCX, Y, INCY, W, INCW, Z, $ INCZ ) * * -- ScaLAPACK routine (version 1.7) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 13, 2000 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCW, INCX, INCY, INCZ, LDT, N * .. * .. Array Arguments .. COMPLEX*16 T( LDT, * ), W( * ), X( * ), Y( * ), Z( * ) * .. * * Purpose * ======= * * ZTRMVT performs the matrix-vector operations * * x := conjg( T' ) *y, and w := T *z, * * where x is an n element vector and T is an n by n * upper or lower triangular matrix. * * Arguments * ========= * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * T - COMPLEX*16 array of DIMENSION ( LDT, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array T must contain the upper * triangular matrix and the strictly lower triangular part of * T is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array T must contain the lower * triangular matrix and the strictly upper triangular part of * T is not referenced. * * LDT - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * On exit, X = T' * y * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * W - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCW ) ). * On exit, W = T * z * * INCW - INTEGER. * On entry, INCW specifies the increment for the elements of * W. INCW must not be zero. * Unchanged on exit. * * Z - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCZ ) ). * Before entry, the incremented array Z must contain the n * element vector z. Unchanged on exit. * * INCY - INTEGER. * On entrz, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * * .. Local Scalars .. INTEGER INFO * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA, ZCOPY, ZTRMV * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN INFO = 4 ELSE IF( INCW.EQ.0 ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 10 ELSE IF( INCZ.EQ.0 ) THEN INFO = 12 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZTRMVT', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * * IF( INCX.NE.1 .OR. INCY.NE.1 .OR. INCW.NE.1 .OR. INCZ.NE.1 .OR. $ .TRUE. ) THEN CALL ZCOPY( N, Y, INCY, X, INCX ) CALL ZTRMV( UPLO, 'C', 'N', N, T, LDT, X, INCX ) CALL ZCOPY( N, Z, INCZ, W, INCW ) CALL ZTRMV( UPLO, 'N', 'N', N, T, LDT, W, INCW ) RETURN END IF * RETURN * * End of ZTRMVT. * END scalapack-2.0.2/REDIST/CMakeLists.txt000644 000766 000024 00000000060 11656312637 017451 0ustar00juliestaff000000 000000 add_subdirectory(SRC) add_subdirectory(TESTING) scalapack-2.0.2/REDIST/SRC/000755 000766 000024 00000000000 11750301601 015324 5ustar00juliestaff000000 000000 scalapack-2.0.2/REDIST/TESTING/000755 000766 000024 00000000000 11750301602 016013 5ustar00juliestaff000000 000000 scalapack-2.0.2/REDIST/TESTING/CMakeLists.txt000644 000766 000024 00000003135 11656312637 020574 0ustar00juliestaff000000 000000 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/scalapack/REDIST/TESTING) file(COPY GEMR2D.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY TRMR2D.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) add_executable(xigemr pigemrdrv.c) add_executable(xsgemr psgemrdrv.c) add_executable(xdgemr pdgemrdrv.c) add_executable(xcgemr pcgemrdrv.c) add_executable(xzgemr pzgemrdrv.c) add_executable(xitrmr pitrmrdrv.c) add_executable(xstrmr pstrmrdrv.c) add_executable(xdtrmr pdtrmrdrv.c) add_executable(xctrmr pctrmrdrv.c) add_executable(xztrmr pztrmrdrv.c) target_link_libraries(xigemr scalapack ) target_link_libraries(xsgemr scalapack ) target_link_libraries(xdgemr scalapack ) target_link_libraries(xcgemr scalapack ) target_link_libraries(xzgemr scalapack ) target_link_libraries(xitrmr scalapack ) target_link_libraries(xstrmr scalapack ) target_link_libraries(xdtrmr scalapack ) target_link_libraries(xctrmr scalapack ) target_link_libraries(xztrmr scalapack ) #add_test(xigemr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xigemr) #add_test(xsgemr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xsgemr) #add_test(xdgemr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdgemr) #add_test(xcgemr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xcgemr) #add_test(xzgemr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xzgemr) #add_test(xitrmr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xitrmr) #add_test(xstrmr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xstrmr) #add_test(xdtrmr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xdtrmr) #add_test(xctrmr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xctrmr) #add_test(xztrmr ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xztrmr) scalapack-2.0.2/REDIST/TESTING/GEMR2D.dat000644 000766 000024 00000017611 10363532303 017436 0ustar00juliestaff000000 000000 90 200 200 200 200 0 0 1 1 1 4 3 6 200 200 0 0 1 1 1 4 5 3 200 200 200 200 0 0 1 1 4 1 7 9 200 200 0 0 1 1 4 1 4 9 200 200 200 200 0 0 1 1 4 1 2 9 200 200 0 0 1 1 2 2 7 7 200 200 200 200 0 0 1 1 2 2 6 3 200 200 0 0 1 1 2 2 5 9 200 200 200 200 0 0 1 1 2 2 9 10 200 200 0 0 1 1 1 4 3 4 200 200 200 200 0 0 1 1 2 2 1 10 200 200 0 0 1 1 1 3 1 4 200 200 200 200 0 0 1 1 1 4 14 6 200 200 0 0 1 1 1 3 8 5 200 200 200 200 0 0 1 1 1 3 6 4 200 200 0 0 1 1 4 1 2 2 200 200 200 200 0 0 1 1 4 1 3 10 200 200 0 0 1 1 1 4 11 5 200 200 200 200 0 0 1 1 2 2 10 3 200 200 0 0 1 1 1 3 11 7 300 300 300 300 0 0 1 1 1 4 10 2 300 300 0 0 1 1 2 2 4 5 300 300 300 300 0 0 1 1 2 2 1 12 300 300 0 0 1 1 3 1 10 6 300 300 300 300 0 0 1 1 1 4 13 8 300 300 0 0 1 1 2 2 12 4 300 300 300 300 0 0 1 1 2 2 12 9 300 300 0 0 1 1 1 3 16 10 300 300 300 300 0 0 1 1 1 4 15 2 300 300 0 0 1 1 2 2 12 1 300 300 300 300 0 0 1 1 2 2 4 11 300 300 0 0 1 1 1 3 9 3 300 300 300 300 0 0 1 1 2 2 4 12 300 300 0 0 1 1 4 1 6 1 300 300 300 300 0 0 1 1 3 1 7 7 300 300 0 0 1 1 4 1 5 11 300 300 300 300 0 0 1 1 3 1 10 11 300 300 0 0 1 1 1 3 10 3 300 300 300 300 0 0 1 1 2 2 1 1 300 300 0 0 1 1 1 3 9 2 400 400 400 400 0 0 1 1 4 1 2 4 400 400 0 0 1 1 1 3 5 10 400 400 400 400 0 0 1 1 1 3 5 1 400 400 0 0 1 1 2 2 14 4 400 400 400 400 0 0 1 1 4 1 3 16 400 400 0 0 1 1 1 3 1 7 400 400 400 400 0 0 1 1 1 3 8 5 400 400 0 0 1 1 3 1 9 2 400 400 400 400 0 0 1 1 1 4 9 3 400 400 0 0 1 1 1 3 11 3 400 400 400 400 0 0 1 1 2 2 12 11 400 400 0 0 1 1 4 1 5 9 400 400 400 400 0 0 1 1 3 1 6 12 400 400 0 0 1 1 1 4 15 6 400 400 400 400 0 0 1 1 4 1 5 5 400 400 0 0 1 1 4 1 7 15 400 400 400 400 0 0 1 1 3 1 4 6 400 400 0 0 1 1 4 1 2 4 400 400 400 400 0 0 1 1 3 1 6 3 400 400 0 0 1 1 1 4 18 10 500 500 500 500 0 0 1 1 3 1 9 21 500 500 0 0 1 1 1 3 13 4 500 500 500 500 0 0 1 1 2 2 5 8 500 500 0 0 1 1 4 1 4 19 500 500 500 500 0 0 1 1 1 4 14 9 500 500 0 0 1 1 1 4 17 11 500 500 500 500 0 0 1 1 1 4 4 2 500 500 0 0 1 1 2 2 4 1 500 500 500 500 0 0 1 1 1 3 7 4 500 500 0 0 1 1 2 2 3 12 500 500 500 500 0 0 1 1 2 2 8 8 500 500 0 0 1 1 1 4 16 4 500 500 500 500 0 0 1 1 1 4 20 10 500 500 0 0 1 1 2 2 8 7 500 500 500 500 0 0 1 1 2 2 1 14 500 500 0 0 1 1 3 1 6 20 500 500 500 500 0 0 1 1 4 1 5 15 500 500 0 0 1 1 1 3 2 8 500 500 500 500 0 0 1 1 1 3 6 3 500 500 0 0 1 1 3 1 12 4 600 600 600 600 0 0 1 1 1 4 4 8 600 600 0 0 1 1 2 2 2 9 600 600 600 600 0 0 1 1 2 2 13 7 600 600 0 0 1 1 3 1 2 6 600 600 600 600 0 0 1 1 1 3 2 2 600 600 0 0 1 1 1 4 13 4 600 600 600 600 0 0 1 1 3 1 7 1 600 600 0 0 1 1 2 2 1 9 600 600 600 600 0 0 1 1 1 3 17 9 600 600 0 0 1 1 2 2 7 17 600 600 600 600 0 0 1 1 2 2 13 3 600 600 0 0 1 1 1 3 15 4 600 600 600 600 0 0 1 1 1 3 3 13 600 600 0 0 1 1 1 3 22 3 600 600 600 600 0 0 1 1 1 4 2 8 600 600 0 0 1 1 2 2 9 14 600 600 600 600 0 0 1 1 2 2 8 5 600 600 0 0 1 1 1 3 5 3 600 600 600 600 0 0 1 1 3 1 3 3 600 600 0 0 1 1 2 2 13 14 700 700 700 700 0 0 1 1 1 3 24 2 700 700 0 0 1 1 1 4 11 11 700 700 700 700 0 0 1 1 3 1 10 6 700 700 0 0 1 1 1 4 11 7 700 700 700 700 0 0 1 1 1 3 24 10 700 700 0 0 1 1 1 3 25 4 700 700 700 700 0 0 1 1 3 1 8 3 700 700 0 0 1 1 4 1 5 9 700 700 700 700 0 0 1 1 1 3 18 6 700 700 0 0 1 1 1 4 22 11 700 700 700 700 0 0 1 1 2 2 17 14 700 700 0 0 1 1 3 1 8 21 700 700 700 700 0 0 1 1 3 1 11 20 700 700 0 0 1 1 4 1 8 5 700 700 700 700 0 0 1 1 2 2 13 16 700 700 0 0 1 1 2 2 8 11 700 700 700 700 0 0 1 1 4 1 5 11 700 700 0 0 1 1 3 1 5 11 700 700 700 700 0 0 1 1 2 2 11 17 700 700 0 0 1 1 1 3 1 6 800 800 800 800 0 0 1 1 1 4 23 13 800 800 0 0 1 1 3 1 15 17 800 800 800 800 0 0 1 1 1 3 27 8 800 800 0 0 1 1 3 1 8 3 800 800 800 800 0 0 1 1 3 1 12 6 800 800 0 0 1 1 4 1 4 7 800 800 800 800 0 0 1 1 3 1 16 14 800 800 0 0 1 1 4 1 14 6 800 800 800 800 0 0 1 1 4 1 9 4 800 800 0 0 1 1 3 1 3 15 800 800 800 800 0 0 1 1 4 1 5 12 800 800 0 0 1 1 1 3 22 9 800 800 800 800 0 0 1 1 3 1 9 11 800 800 0 0 1 1 2 2 2 19 800 800 800 800 0 0 1 1 1 4 16 11 800 800 0 0 1 1 1 3 13 5 800 800 800 800 0 0 1 1 1 3 18 4 800 800 0 0 1 1 3 1 9 8 800 800 800 800 0 0 1 1 3 1 1 7 800 800 0 0 1 1 3 1 7 4 900 900 900 900 0 0 1 1 1 3 19 11 900 900 0 0 1 1 4 1 12 25 900 900 900 900 0 0 1 1 1 3 18 15 900 900 0 0 1 1 4 1 9 6 900 900 900 900 0 0 1 1 3 1 11 10 900 900 0 0 1 1 4 1 15 18 900 900 900 900 0 0 1 1 3 1 2 30 900 900 0 0 1 1 1 3 3 5 900 900 900 900 0 0 1 1 1 4 22 12 900 900 0 0 1 1 3 1 1 20 900 900 900 900 0 0 1 1 3 1 10 20 900 900 0 0 1 1 1 3 2 5 900 900 900 900 0 0 1 1 2 2 11 21 900 900 0 0 1 1 1 3 22 12 900 900 900 900 0 0 1 1 3 1 4 5 900 900 0 0 1 1 3 1 5 11 900 900 900 900 0 0 1 1 1 4 29 12 900 900 0 0 1 1 2 2 3 15 900 900 900 900 0 0 1 1 2 2 15 7 900 900 0 0 1 1 3 1 17 7 1000 1000 1000 1000 0 0 1 1 1 3 1 15 1000 1000 0 0 1 1 2 2 7 18 1000 1000 1000 1000 0 0 1 1 1 3 6 5 1000 1000 0 0 1 1 3 1 18 15 1000 1000 1000 1000 0 0 1 1 1 3 11 8 1000 1000 0 0 1 1 3 1 10 29 1000 1000 1000 1000 0 0 1 1 4 1 8 31 1000 1000 0 0 1 1 2 2 17 22 1000 1000 1000 1000 0 0 1 1 3 1 9 26 1000 1000 0 0 1 1 2 2 22 20 1000 1000 1000 1000 0 0 1 1 2 2 2 17 1000 1000 0 0 1 1 1 3 29 6 1000 1000 1000 1000 0 0 1 1 3 1 8 17 1000 1000 0 0 1 1 3 1 4 20 1000 1000 1000 1000 0 0 1 1 1 3 21 12 1000 1000 0 0 1 1 4 1 15 26 1000 1000 1000 1000 0 0 1 1 1 3 8 3 1000 1000 0 0 1 1 2 2 11 12 1000 1000 1000 1000 0 0 1 1 2 2 10 20 1000 1000 0 0 1 1 2 2 11 20 scalapack-2.0.2/REDIST/TESTING/Makefile000644 000766 000024 00000004531 11654025546 017473 0ustar00juliestaff000000 000000 ############################################################################ # # Program: ScaLAPACK # # Module: Makefile # # Purpose: Redistribution Testing Makefile # # Creation date: March 20, 1995 # # Modified: February 15, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../SLmake.inc igemrexe = xigemr sgemrexe = xsgemr dgemrexe = xdgemr cgemrexe = xcgemr zgemrexe = xzgemr itrmrexe = xitrmr strmrexe = xstrmr dtrmrexe = xdtrmr ctrmrexe = xctrmr ztrmrexe = xztrmr igemr = pigemrdrv.o sgemr = psgemrdrv.o dgemr = pdgemrdrv.o cgemr = pcgemrdrv.o zgemr = pzgemrdrv.o itrmr = pitrmrdrv.o strmr = pstrmrdrv.o dtrmr = pdtrmrdrv.o ctrmr = pctrmrdrv.o ztrmr = pztrmrdrv.o all: exe exe: integer single double complex complex16 integer: $(igemrexe) $(itrmrexe) single: $(sgemrexe) $(strmrexe) double: $(dgemrexe) $(dtrmrexe) complex: $(cgemrexe) $(ctrmrexe) complex16: $(zgemrexe) $(ztrmrexe) $(igemrexe): ../../$(SCALAPACKLIB) $(igemr) $(CCLOADER) $(CCLOADFLAGS) -o $(igemrexe) $(igemr) ../../$(SCALAPACKLIB) $(LIBS) $(sgemrexe): ../../$(SCALAPACKLIB) $(sgemr) $(CCLOADER) $(CCLOADFLAGS) -o $(sgemrexe) $(sgemr) ../../$(SCALAPACKLIB) $(LIBS) $(dgemrexe): ../../$(SCALAPACKLIB) $(dgemr) $(CCLOADER) $(CCLOADFLAGS) -o $(dgemrexe) $(dgemr) ../../$(SCALAPACKLIB) $(LIBS) $(cgemrexe): ../../$(SCALAPACKLIB) $(cgemr) $(CCLOADER) $(CCLOADFLAGS) -o $(cgemrexe) $(cgemr) ../../$(SCALAPACKLIB) $(LIBS) $(zgemrexe): ../../$(SCALAPACKLIB) $(zgemr) $(CCLOADER) $(CCLOADFLAGS) -o $(zgemrexe) $(zgemr) ../../$(SCALAPACKLIB) $(LIBS) $(itrmrexe): ../../$(SCALAPACKLIB) $(itrmr) $(CCLOADER) $(CCLOADFLAGS) -o $(itrmrexe) $(itrmr) ../../$(SCALAPACKLIB) $(LIBS) $(strmrexe): ../../$(SCALAPACKLIB) $(strmr) $(CCLOADER) $(CCLOADFLAGS) -o $(strmrexe) $(strmr) ../../$(SCALAPACKLIB) $(LIBS) $(dtrmrexe): ../../$(SCALAPACKLIB) $(dtrmr) $(CCLOADER) $(CCLOADFLAGS) -o $(dtrmrexe) $(dtrmr) ../../$(SCALAPACKLIB) $(LIBS) $(ctrmrexe): ../../$(SCALAPACKLIB) $(ctrmr) $(CCLOADER) $(CCLOADFLAGS) -o $(ctrmrexe) $(ctrmr) ../../$(SCALAPACKLIB) $(LIBS) $(ztrmrexe): ../../$(SCALAPACKLIB) $(ztrmr) $(CCLOADER) $(CCLOADFLAGS) -o $(ztrmrexe) $(ztrmr) ../../$(SCALAPACKLIB) $(LIBS) clean : rm -f *.o x* .c.o : ; $(CC) -c $(CDEFS) $(CCFLAGS) $*.c scalapack-2.0.2/REDIST/TESTING/pcgemrdrv.c000644 000766 000024 00000034215 10363532303 020160 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pcgemrdrv.c,v 1.1.1.1 2000/02/15 18:04:11 susan Exp $ * * pcgemrdrv.c : * * * PURPOSE: * * this driver is testing the PCGEMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed COMPLEX (block * scattered) matrix. Then it calls PCGEMR2D for the inverse redistribution * and checks the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See pcgemr.c file for detailed info on the PCGEMR2D function. * * * The testing parameters are read from the file GEMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are COMPLEX * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(complex) * bytes, * * * - use the procedures of the files: * * pcgemr.o pcgemr2.o pcgemraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pcgemr2do_ #define fortran_mr2dnew pcgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PCGEMR2D #define fortran_mr2d PCGEMR2DO #define ccopy_ CCOPY #define clacpy_ CLACPY #else #define fortran_mr2d pcgemr2do #define fortran_mr2dnew pcgemr2d #define ccopy_ ccopy #define clacpy_ clacpy #endif #define Clacpy Ccgelacpy void Clacpy(); typedef struct { float r, i; } complex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Ccgesd2d(); extern void Ccgerv2d(); /* lapack */ void clacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 cgescanD0 #define dispmat cgedispmat #define setmemory cgesetmemory #define freememory cgefreememory #define scan_intervals cgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpcgemr2do(); extern void Cpcgemr2d(); /* some defines for Cpcgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) complex *block; int m, n; { complex *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata).r = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xcgemr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; complex *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("GEMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open GEMR2D.dat\n"); exit(1); }; printf("\n// CGEMR2D TESTER for COMPLEX //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, NULL); printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(complex)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i].r = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpcgemr2d(m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpcgemr2d(m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; if (!in) { ptrmyblockcopy[i].r = -1; } if (ptrmyblockvide[i].r != ptrmyblockcopy[i].r) { nberrors++; printf("Proc %d : Error element number %d, value = %f , initvalue =%f \n" ,mypnum, i, ptrmyblockvide[i].r, ptrmyblockcopy[i].r); }; }; if (nberrors > 0) { printf("Processor %d, has tested %d COMPLEX elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/pctrmrdrv.c000644 000766 000024 00000035323 10363532303 020213 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pctrmrdrv.c,v 1.1.1.1 2000/02/15 18:04:11 susan Exp $ * * pctrmrdrv.c : * * * PURPOSE: * * this driver is testing the PCTRMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed COMPLEX (block * scattered) matrix. Then it calls PCTRMR2D for the inverse redistribution * and checks the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See pctrmr.c file for detailed info on the PCTRMR2D function. * * * The testing parameters are read from the file TRMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are COMPLEX * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(complex) * bytes, * * * - use the procedures of the files: * * pctrmr.o pctrmr2.o pctrmraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pctrmr2do_ #define fortran_mr2dnew pctrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PCTRMR2D #define fortran_mr2d PCTRMR2DO #define ccopy_ CCOPY #define clacpy_ CLACPY #else #define fortran_mr2d pctrmr2do #define fortran_mr2dnew pctrmr2d #define ccopy_ ccopy #define clacpy_ clacpy #endif #define Clacpy Cctrlacpy void Clacpy(); typedef struct { float r, i; } complex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Ccgesd2d(); extern void Ccgerv2d(); /* lapack */ void clacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 ctrscanD0 #define dispmat ctrdispmat #define setmemory ctrsetmemory #define freememory ctrfreememory #define scan_intervals ctrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpctrmr2do(); extern void Cpctrmr2d(); /* some defines for Cpctrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) complex *block; int m, n; { complex *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata).r = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xctrmr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; char *uplo, *diag; complex *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("TRMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open TRMR2D.dat\n"); exit(1); }; printf("\n// CTRMR2D TESTER for COMPLEX //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, &u, &d, NULL); uplo = u ? "UPPER" : "LOWER"; diag = d ? "UNIT" : "NONUNIT"; printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf(" %s %s", toupper(*uplo) == 'U' ? "up" : "low", toupper(*diag) == 'U' ? "unit" : "nonunit"); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; parameters[22] = *uplo == 'U'; parameters[23] = *diag == 'U'; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; uplo = parameters[22] ? "UPPER" : "LOWER"; diag = parameters[23] ? "UNIT" : "NONUNIT"; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(complex)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i].r = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpctrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpctrmr2d(uplo, diag, m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; else if (toupper(*uplo) == 'U') in = (gi <= gj + max(0, m - n) - (toupper(*diag) == 'U')); else in = (gi >= gj - max(0, n - m) + (toupper(*diag) == 'U')); if (!in) { ptrmyblockcopy[i].r = -1; } if (ptrmyblockvide[i].r != ptrmyblockcopy[i].r) { nberrors++; printf("Proc %d : Error element number %d, value = %f , initvalue =%f \n" ,mypnum, i, ptrmyblockvide[i].r, ptrmyblockcopy[i].r); }; }; if (nberrors > 0) { printf("Processor %d, has tested %d COMPLEX elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/pdgemrdrv.c000644 000766 000024 00000034160 10363532303 020160 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pdgemrdrv.c,v 1.1.1.1 2000/02/15 18:04:11 susan Exp $ * * pdgemrdrv.c : * * * PURPOSE: * * this driver is testing the PDGEMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed DOUBLE PRECISION * (block scattered) matrix. Then it calls PDGEMR2D for the inverse * redistribution and checks the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See pdgemr.c file for detailed info on the PDGEMR2D function. * * * The testing parameters are read from the file GEMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are DOUBLE PRECISION * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(double) * bytes, * * * - use the procedures of the files: * * pdgemr.o pdgemr2.o pdgemraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pdgemr2do_ #define fortran_mr2dnew pdgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PDGEMR2D #define fortran_mr2d PDGEMR2DO #define dcopy_ DCOPY #define dlacpy_ DLACPY #else #define fortran_mr2d pdgemr2do #define fortran_mr2dnew pdgemr2d #define dcopy_ dcopy #define dlacpy_ dlacpy #endif #define Clacpy Cdgelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cdgesd2d(); extern void Cdgerv2d(); /* lapack */ void dlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 dgescanD0 #define dispmat dgedispmat #define setmemory dgesetmemory #define freememory dgefreememory #define scan_intervals dgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpdgemr2do(); extern void Cpdgemr2d(); /* some defines for Cpdgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) double *block; int m, n; { double *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata) = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xdgemr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; double *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("GEMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open GEMR2D.dat\n"); exit(1); }; printf("\n// DGEMR2D TESTER for DOUBLE PRECISION //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, NULL); printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(double)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i] = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpdgemr2d(m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpdgemr2d(m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; if (!in) { ptrmyblockcopy[i] = -1; } if (ptrmyblockvide[i] != ptrmyblockcopy[i]) { nberrors++; printf("Proc %d : Error element number %d, value = %f , initvalue =%f \n" ,mypnum, i, ptrmyblockvide[i], ptrmyblockcopy[i]); }; }; if (nberrors > 0) { printf("Processor %d, has tested %d DOUBLE PRECISION elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/pdtrmrdrv.c000644 000766 000024 00000035266 10363532303 020222 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pdtrmrdrv.c,v 1.1.1.1 2000/02/15 18:04:11 susan Exp $ * * pdtrmrdrv.c : * * * PURPOSE: * * this driver is testing the PDTRMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed DOUBLE PRECISION * (block scattered) matrix. Then it calls PDTRMR2D for the inverse * redistribution and checks the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See pdtrmr.c file for detailed info on the PDTRMR2D function. * * * The testing parameters are read from the file TRMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are DOUBLE PRECISION * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(double) * bytes, * * * - use the procedures of the files: * * pdtrmr.o pdtrmr2.o pdtrmraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pdtrmr2do_ #define fortran_mr2dnew pdtrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PDTRMR2D #define fortran_mr2d PDTRMR2DO #define dcopy_ DCOPY #define dlacpy_ DLACPY #else #define fortran_mr2d pdtrmr2do #define fortran_mr2dnew pdtrmr2d #define dcopy_ dcopy #define dlacpy_ dlacpy #endif #define Clacpy Cdtrlacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cdgesd2d(); extern void Cdgerv2d(); /* lapack */ void dlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 dtrscanD0 #define dispmat dtrdispmat #define setmemory dtrsetmemory #define freememory dtrfreememory #define scan_intervals dtrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpdtrmr2do(); extern void Cpdtrmr2d(); /* some defines for Cpdtrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) double *block; int m, n; { double *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata) = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xdtrmr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; char *uplo, *diag; double *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("TRMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open TRMR2D.dat\n"); exit(1); }; printf("\n// DTRMR2D TESTER for DOUBLE PRECISION //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, &u, &d, NULL); uplo = u ? "UPPER" : "LOWER"; diag = d ? "UNIT" : "NONUNIT"; printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf(" %s %s", toupper(*uplo) == 'U' ? "up" : "low", toupper(*diag) == 'U' ? "unit" : "nonunit"); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; parameters[22] = *uplo == 'U'; parameters[23] = *diag == 'U'; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; uplo = parameters[22] ? "UPPER" : "LOWER"; diag = parameters[23] ? "UNIT" : "NONUNIT"; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(double)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i] = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpdtrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpdtrmr2d(uplo, diag, m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; else if (toupper(*uplo) == 'U') in = (gi <= gj + max(0, m - n) - (toupper(*diag) == 'U')); else in = (gi >= gj - max(0, n - m) + (toupper(*diag) == 'U')); if (!in) { ptrmyblockcopy[i] = -1; } if (ptrmyblockvide[i] != ptrmyblockcopy[i]) { nberrors++; printf("Proc %d : Error element number %d, value = %f , initvalue =%f \n" ,mypnum, i, ptrmyblockvide[i], ptrmyblockcopy[i]); }; }; if (nberrors > 0) { printf("Processor %d, has tested %d DOUBLE PRECISION elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/pigemrdrv.c000644 000766 000024 00000034072 10363532303 020167 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pigemrdrv.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $ * * pigemrdrv.c : * * * PURPOSE: * * this driver is testing the PIGEMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed INTEGER (block * scattered) matrix. Then it calls PIGEMR2D for the inverse redistribution * and checks the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See pigemr.c file for detailed info on the PIGEMR2D function. * * * The testing parameters are read from the file GEMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are INTEGER * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(int) bytes, * * * - use the procedures of the files: * * pigemr.o pigemr2.o pigemraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pigemr2do_ #define fortran_mr2dnew pigemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PIGEMR2D #define fortran_mr2d PIGEMR2DO #define icopy_ ICOPY #define ilacpy_ ILACPY #else #define fortran_mr2d pigemr2do #define fortran_mr2dnew pigemr2d #define icopy_ icopy #define ilacpy_ ilacpy #endif #define Clacpy Cigelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cigesd2d(); extern void Cigerv2d(); /* lapack */ void ilacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 igescanD0 #define dispmat igedispmat #define setmemory igesetmemory #define freememory igefreememory #define scan_intervals igescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpigemr2do(); extern void Cpigemr2d(); /* some defines for Cpigemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) int *block; int m, n; { int *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata) = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xigemr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; int *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("GEMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open GEMR2D.dat\n"); exit(1); }; printf("\n// IGEMR2D TESTER for INTEGER //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, NULL); printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(int)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i] = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpigemr2d(m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpigemr2d(m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; if (!in) { ptrmyblockcopy[i] = -1; } if (ptrmyblockvide[i] != ptrmyblockcopy[i]) { nberrors++; printf("Proc %d : Error element number %d, value = %d , initvalue =%d \n" ,mypnum, i, ptrmyblockvide[i], ptrmyblockcopy[i]); }; }; if (nberrors > 0) { printf("Processor %d, has tested %d INTEGER elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/pitrmrdrv.c000644 000766 000024 00000035200 10363532303 020213 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pitrmrdrv.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $ * * pitrmrdrv.c : * * * PURPOSE: * * this driver is testing the PITRMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed INTEGER (block * scattered) matrix. Then it calls PITRMR2D for the inverse redistribution * and checks the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See pitrmr.c file for detailed info on the PITRMR2D function. * * * The testing parameters are read from the file TRMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are INTEGER * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(int) bytes, * * * - use the procedures of the files: * * pitrmr.o pitrmr2.o pitrmraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pitrmr2do_ #define fortran_mr2dnew pitrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PITRMR2D #define fortran_mr2d PITRMR2DO #define icopy_ ICOPY #define ilacpy_ ILACPY #else #define fortran_mr2d pitrmr2do #define fortran_mr2dnew pitrmr2d #define icopy_ icopy #define ilacpy_ ilacpy #endif #define Clacpy Citrlacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cigesd2d(); extern void Cigerv2d(); /* lapack */ void ilacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 itrscanD0 #define dispmat itrdispmat #define setmemory itrsetmemory #define freememory itrfreememory #define scan_intervals itrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpitrmr2do(); extern void Cpitrmr2d(); /* some defines for Cpitrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) int *block; int m, n; { int *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata) = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xitrmr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; char *uplo, *diag; int *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("TRMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open TRMR2D.dat\n"); exit(1); }; printf("\n// ITRMR2D TESTER for INTEGER //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, &u, &d, NULL); uplo = u ? "UPPER" : "LOWER"; diag = d ? "UNIT" : "NONUNIT"; printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf(" %s %s", toupper(*uplo) == 'U' ? "up" : "low", toupper(*diag) == 'U' ? "unit" : "nonunit"); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; parameters[22] = *uplo == 'U'; parameters[23] = *diag == 'U'; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; uplo = parameters[22] ? "UPPER" : "LOWER"; diag = parameters[23] ? "UNIT" : "NONUNIT"; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(int)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i] = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpitrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpitrmr2d(uplo, diag, m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; else if (toupper(*uplo) == 'U') in = (gi <= gj + max(0, m - n) - (toupper(*diag) == 'U')); else in = (gi >= gj - max(0, n - m) + (toupper(*diag) == 'U')); if (!in) { ptrmyblockcopy[i] = -1; } if (ptrmyblockvide[i] != ptrmyblockcopy[i]) { nberrors++; printf("Proc %d : Error element number %d, value = %d , initvalue =%d \n" ,mypnum, i, ptrmyblockvide[i], ptrmyblockcopy[i]); }; }; if (nberrors > 0) { printf("Processor %d, has tested %d INTEGER elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/psgemrdrv.c000644 000766 000024 00000033662 10363532303 020205 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: psgemrdrv.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $ * * psgemrdrv.c : * * * PURPOSE: * * this driver is testing the PSGEMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed REAL (block scattered) * matrix. Then it calls PSGEMR2D for the inverse redistribution and checks * the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See psgemr.c file for detailed info on the PSGEMR2D function. * * * The testing parameters are read from the file GEMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are REAL * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(float) bytes, * * * - use the procedures of the files: * * psgemr.o psgemr2.o psgemraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d psgemr2do_ #define fortran_mr2dnew psgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PSGEMR2D #define fortran_mr2d PSGEMR2DO #define scopy_ SCOPY #define slacpy_ SLACPY #else #define fortran_mr2d psgemr2do #define fortran_mr2dnew psgemr2d #define scopy_ scopy #define slacpy_ slacpy #endif #define Clacpy Csgelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Csgesd2d(); extern void Csgerv2d(); /* lapack */ void slacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 sgescanD0 #define dispmat sgedispmat #define setmemory sgesetmemory #define freememory sgefreememory #define scan_intervals sgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpsgemr2do(); extern void Cpsgemr2d(); /* some defines for Cpsgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) float *block; int m, n; { float *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata) = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xsgemr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; float *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("GEMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open GEMR2D.dat\n"); exit(1); }; printf("\n// SGEMR2D TESTER for REAL //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, NULL); printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(float)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i] = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpsgemr2d(m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpsgemr2d(m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; if (!in) { ptrmyblockcopy[i] = -1; } if (ptrmyblockvide[i] != ptrmyblockcopy[i]) { nberrors++; }; }; if (nberrors > 0) { printf("Processor %d, has tested %d REAL elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/pstrmrdrv.c000644 000766 000024 00000034770 10363532303 020240 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pstrmrdrv.c,v 1.1.1.1 2000/02/15 18:04:11 susan Exp $ * * pstrmrdrv.c : * * * PURPOSE: * * this driver is testing the PSTRMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed REAL (block scattered) * matrix. Then it calls PSTRMR2D for the inverse redistribution and checks * the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See pstrmr.c file for detailed info on the PSTRMR2D function. * * * The testing parameters are read from the file TRMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are REAL * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(float) bytes, * * * - use the procedures of the files: * * pstrmr.o pstrmr2.o pstrmraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pstrmr2do_ #define fortran_mr2dnew pstrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PSTRMR2D #define fortran_mr2d PSTRMR2DO #define scopy_ SCOPY #define slacpy_ SLACPY #else #define fortran_mr2d pstrmr2do #define fortran_mr2dnew pstrmr2d #define scopy_ scopy #define slacpy_ slacpy #endif #define Clacpy Cstrlacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Csgesd2d(); extern void Csgerv2d(); /* lapack */ void slacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 strscanD0 #define dispmat strdispmat #define setmemory strsetmemory #define freememory strfreememory #define scan_intervals strscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpstrmr2do(); extern void Cpstrmr2d(); /* some defines for Cpstrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) float *block; int m, n; { float *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata) = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xstrmr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; char *uplo, *diag; float *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("TRMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open TRMR2D.dat\n"); exit(1); }; printf("\n// STRMR2D TESTER for REAL //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, &u, &d, NULL); uplo = u ? "UPPER" : "LOWER"; diag = d ? "UNIT" : "NONUNIT"; printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf(" %s %s", toupper(*uplo) == 'U' ? "up" : "low", toupper(*diag) == 'U' ? "unit" : "nonunit"); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; parameters[22] = *uplo == 'U'; parameters[23] = *diag == 'U'; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; uplo = parameters[22] ? "UPPER" : "LOWER"; diag = parameters[23] ? "UNIT" : "NONUNIT"; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(float)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i] = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpstrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpstrmr2d(uplo, diag, m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; else if (toupper(*uplo) == 'U') in = (gi <= gj + max(0, m - n) - (toupper(*diag) == 'U')); else in = (gi >= gj - max(0, n - m) + (toupper(*diag) == 'U')); if (!in) { ptrmyblockcopy[i] = -1; } if (ptrmyblockvide[i] != ptrmyblockcopy[i]) { nberrors++; }; }; if (nberrors > 0) { printf("Processor %d, has tested %d REAL elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/pzgemrdrv.c000644 000766 000024 00000034240 10363532303 020205 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pzgemrdrv.c,v 1.1.1.1 2000/02/15 18:04:11 susan Exp $ * * pzgemrdrv.c : * * * PURPOSE: * * this driver is testing the PZGEMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed COMPLEX*16 (block * scattered) matrix. Then it calls PZGEMR2D for the inverse redistribution * and checks the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See pzgemr.c file for detailed info on the PZGEMR2D function. * * * The testing parameters are read from the file GEMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are COMPLEX*16 * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(dcomplex) * bytes, * * * - use the procedures of the files: * * pzgemr.o pzgemr2.o pzgemraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pzgemr2do_ #define fortran_mr2dnew pzgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PZGEMR2D #define fortran_mr2d PZGEMR2DO #define zcopy_ ZCOPY #define zlacpy_ ZLACPY #else #define fortran_mr2d pzgemr2do #define fortran_mr2dnew pzgemr2d #define zcopy_ zcopy #define zlacpy_ zlacpy #endif #define Clacpy Czgelacpy void Clacpy(); typedef struct { double r, i; } dcomplex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Czgesd2d(); extern void Czgerv2d(); /* lapack */ void zlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 zgescanD0 #define dispmat zgedispmat #define setmemory zgesetmemory #define freememory zgefreememory #define scan_intervals zgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpzgemr2do(); extern void Cpzgemr2d(); /* some defines for Cpzgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) dcomplex *block; int m, n; { dcomplex *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata).r = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xzgemr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; dcomplex *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("GEMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open GEMR2D.dat\n"); exit(1); }; printf("\n// ZGEMR2D TESTER for COMPLEX*16 //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, NULL); printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(dcomplex)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i].r = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpzgemr2d(m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpzgemr2d(m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; if (!in) { ptrmyblockcopy[i].r = -1; } if (ptrmyblockvide[i].r != ptrmyblockcopy[i].r) { nberrors++; printf("Proc %d : Error element number %d, value = %f , initvalue =%f \n" ,mypnum, i, ptrmyblockvide[i].r, ptrmyblockcopy[i].r); }; }; if (nberrors > 0) { printf("Processor %d, has tested %d COMPLEX*16 elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/pztrmrdrv.c000644 000766 000024 00000035346 10363532303 020247 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pztrmrdrv.c,v 1.1.1.1 2000/02/15 18:04:11 susan Exp $ * * pztrmrdrv.c : * * * PURPOSE: * * this driver is testing the PZTRMR2D routine. It calls it to obtain a new * scattered block data decomposition of a distributed COMPLEX*16 (block * scattered) matrix. Then it calls PZTRMR2D for the inverse redistribution * and checks the results with the initial data. * * Data are going from a Block Scattered nbrow0 x nbcol0 decomposition on the * processor grid p0 x q0, to data distributed in a BS nbrow1 x nbcol1 on the * processor grid p1 x q1, then back to the BS nbrow0 x nbcol0 decomposition * on the processor grid p0 x q0. * * See pztrmr.c file for detailed info on the PZTRMR2D function. * * * The testing parameters are read from the file TRMR2D.dat, see the file in the * distribution to have an example. * * created by Bernard Tourancheau in April 1994. * * modifications : see sccs history * * =================================== * * * NOTE : * * - the matrix elements are COMPLEX*16 * * - memory requirements : this procedure requires approximately 3 times the * memory space of the initial data block in grid 0 (initial block, copy for * test and second redistribution result) and 1 time the memory space of the * result data block in grid 1. with the element size = sizeof(dcomplex) * bytes, * * * - use the procedures of the files: * * pztrmr.o pztrmr2.o pztrmraux.o * * * ====================================== * * WARNING ASSUMPTIONS : * * * ======================================== * * * Planned changes: * * * * ========================================= */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pztrmr2do_ #define fortran_mr2dnew pztrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PZTRMR2D #define fortran_mr2d PZTRMR2DO #define zcopy_ ZCOPY #define zlacpy_ ZLACPY #else #define fortran_mr2d pztrmr2do #define fortran_mr2dnew pztrmr2d #define zcopy_ zcopy #define zlacpy_ zlacpy #endif #define Clacpy Cztrlacpy void Clacpy(); typedef struct { double r, i; } dcomplex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Czgesd2d(); extern void Czgerv2d(); /* lapack */ void zlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 ztrscanD0 #define dispmat ztrdispmat #define setmemory ztrsetmemory #define freememory ztrfreememory #define scan_intervals ztrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpztrmr2do(); extern void Cpztrmr2d(); /* some defines for Cpztrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* initblock: intialize the local part of a matrix with random data (well, * not very random) */ static2 void initblock(block, m, n) dcomplex *block; int m, n; { dcomplex *pdata; int i; pdata = block; for (i = 0; i < m * n; i++, pdata++) { (*pdata).r = i; }; } /* getparam:read from a file a list of integer parameters, the end of the * parameters to read is given by a NULL at the end of the args list */ #ifdef __STDC__ #include static void getparam(FILE * f,...) { #else #include static void getparam(va_alist) va_dcl { FILE *f; #endif va_list ap; int i; static int nbline; char *ptr, *next; int *var; static char buffer[200]; #ifdef __STDC__ va_start(ap, f); #else va_start(ap); f = va_arg(ap, FILE *); #endif do { next = fgets(buffer, 200, f); if (next == NULL) { fprintf(stderr, "bad configuration driver file:after line %d\n", nbline); exit(1); } nbline += 1; } while (buffer[0] == '#'); ptr = buffer; var = va_arg(ap, int *); while (var != NULL) { *var = strtol(ptr, &next, 10); if (ptr == next) { fprintf(stderr, "bad configuration driver file:error line %d\n", nbline); exit(1); } ptr = next; var = va_arg(ap, int *); } va_end(ap); } void initforpvm(argc, argv) int argc; char *argv[]; { int pnum, nproc; Cblacs_pinfo(&pnum, &nproc); if (nproc < 1) { /* we are with PVM */ if (pnum == 0) { if (argc < 2) { fprintf(stderr, "usage with PVM:xztrmr nbproc\n\ \t where nbproc is the number of nodes to initialize\n"); exit(1); } nproc = atoi(argv[1]); } Cblacs_setup(&pnum, &nproc); } } int main(argc, argv) int argc; char *argv[]; { /* We initialize the data-block on the current processor, then redistribute * it, and perform the inverse redistribution to compare the local memory * with the initial one. */ /* Data file */ FILE *fp; int nbre, nbremax; /* Data distribution 0 parameters */ int p0, /* # of rows in the processor grid */ q0; /* # of columns in the processor grid */ /* Data distribution 1 parameters */ int p1, q1; /* # of parameter to be read on the keyboard */ #define nbparameter 24 /* General variables */ int blocksize0; int mypnum, nprocs; int parameters[nbparameter], nberrors; int i; int ia, ja, ib, jb, m, n; int gcontext, context0, context1; int myprow1, myprow0, mypcol0, mypcol1; int dummy; MDESC ma, mb; char *uplo, *diag; dcomplex *ptrmyblock, *ptrsavemyblock, *ptrmyblockcopy, *ptrmyblockvide; #ifdef UsingMpiBlacs MPI_Init(&argc, &argv); #endif setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #ifdef T3D free(malloc(14000000)); #endif initforpvm(argc, argv); /* Read physical parameters */ Cblacs_pinfo(&mypnum, &nprocs); /* initialize BLACS for the parameter communication */ Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", nprocs, 1); Cblacs_gridinfo(gcontext, &dummy, &dummy, &mypnum, &dummy); if (mypnum == 0) { if ((fp = fopen("TRMR2D.dat", "r")) == NULL) { fprintf(stderr, "Can't open TRMR2D.dat\n"); exit(1); }; printf("\n// ZTRMR2D TESTER for COMPLEX*16 //\n"); getparam(fp, &nbre, NULL); printf("////////// %d tests \n\n", nbre); parameters[0] = nbre; Cigebs2d(gcontext, "All", "H", 1, 1, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, 1, parameters, 1, 0, 0); nbre = parameters[0]; }; if (mypnum == 0) { printf("\n m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 \ m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1\n\n"); }; /****** TEST LOOP *****/ /* Here we are in grip 1xnprocs */ nbremax = nbre; #ifdef DEBUG fprintf(stderr, "bonjour,je suis le noeud %d\n", mypnum); #endif while (nbre-- != 0) { /* Loop on the serie of tests */ /* All the processors read the parameters so we have to be in a 1xnprocs * grid at each iteration */ /* Read processors grid and matrices parameters */ if (mypnum == 0) { int u, d; getparam(fp, &m, &n, &ma.m, &ma.n, &ma.sprow, &ma.spcol, &ia, &ja, &p0, &q0, &ma.nbrow, &ma.nbcol, &mb.m, &mb.n, &mb.sprow, &mb.spcol, &ib, &jb, &p1, &q1, &mb.nbrow, &mb.nbcol, &u, &d, NULL); uplo = u ? "UPPER" : "LOWER"; diag = d ? "UNIT" : "NONUNIT"; printf("\t\t************* TEST # %d **********\n", nbremax - nbre); printf(" %3d %3d %3d %3d %3d %3d %3d %3d \ %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d", m, n, ma.m, ma.n, ma.sprow, ma.spcol, ia, ja, p0, q0, ma.nbrow, ma.nbcol, mb.m, mb.n, mb.sprow, mb.spcol, ib, jb, p1, q1, mb.nbrow, mb.nbcol); printf(" %s %s", toupper(*uplo) == 'U' ? "up" : "low", toupper(*diag) == 'U' ? "unit" : "nonunit"); printf("\n"); if (p0 * q0 > nprocs || p1 * q1 > nprocs) { fprintf(stderr, "not enough nodes:%d processors required\n", max(p0 * q0, p1 * q1)); exit(1); } parameters[0] = p0; parameters[1] = q0; parameters[2] = ma.nbrow; parameters[3] = ma.nbcol; parameters[4] = p1; parameters[5] = q1; parameters[6] = mb.nbrow; parameters[7] = mb.nbcol; parameters[8] = ma.m; parameters[9] = ma.n; parameters[10] = ma.sprow; parameters[11] = ma.spcol; parameters[12] = mb.sprow; parameters[13] = mb.spcol; parameters[14] = ia; parameters[15] = ja; parameters[16] = ib; parameters[17] = jb; parameters[18] = m; parameters[19] = n; parameters[20] = mb.m; parameters[21] = mb.n; parameters[22] = *uplo == 'U'; parameters[23] = *diag == 'U'; Cigebs2d(gcontext, "All", "H", 1, nbparameter, parameters, 1); } else { Cigebr2d(gcontext, "All", "H", 1, nbparameter, parameters, 1, 0, 0); p0 = parameters[0]; q0 = parameters[1]; ma.nbrow = parameters[2]; ma.nbcol = parameters[3]; p1 = parameters[4]; q1 = parameters[5]; mb.nbrow = parameters[6]; mb.nbcol = parameters[7]; ma.m = parameters[8]; ma.n = parameters[9]; ma.sprow = parameters[10]; ma.spcol = parameters[11]; mb.sprow = parameters[12]; mb.spcol = parameters[13]; ia = parameters[14]; ja = parameters[15]; ib = parameters[16]; jb = parameters[17]; m = parameters[18]; n = parameters[19]; mb.m = parameters[20]; mb.n = parameters[21]; ma.desctype = BLOCK_CYCLIC_2D; mb.desctype = BLOCK_CYCLIC_2D; uplo = parameters[22] ? "UPPER" : "LOWER"; diag = parameters[23] ? "UNIT" : "NONUNIT"; }; Cblacs_get(0, 0, &context0); Cblacs_gridinit(&context0, "R", p0, q0); Cblacs_get(0, 0, &context1); Cblacs_gridinit(&context1, "R", p1, q1); Cblacs_gridinfo(context0, &dummy, &dummy, &myprow0, &mypcol0); if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; Cblacs_gridinfo(context1, &dummy, &dummy, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); ma.ctxt = context0; mb.ctxt = context1; /* From here, we are not assuming that only the processors working in the * redistribution are calling xxMR2D, but the ones not concerned will do * nothing. */ /* We compute the exact size of the local memory block for the memory * allocations */ if (myprow0 >= 0 && mypcol0 >= 0) { blocksize0 = memoryblocksize(&ma); ma.lda = localsize(SHIFT(myprow0, ma.sprow, p0), p0, ma.nbrow, ma.m); setmemory(&ptrmyblock, blocksize0); initblock(ptrmyblock, 1, blocksize0); setmemory(&ptrmyblockcopy, blocksize0); memcpy((char *) ptrmyblockcopy, (char *) ptrmyblock, blocksize0 * sizeof(dcomplex)); setmemory(&ptrmyblockvide, blocksize0); for (i = 0; i < blocksize0; i++) ptrmyblockvide[i].r = -1; }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { setmemory(&ptrsavemyblock, memoryblocksize(&mb)); mb.lda = localsize(SHIFT(myprow1, mb.sprow, p1), p1, mb.nbrow, mb.m); }; /* if (mypnum < p1 * q1) */ /* Redistribute the matrix from grid 0 to grid 1 (memory location * ptrmyblock to ptrsavemyblock) */ Cpztrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, &ma, ptrsavemyblock, ib, jb, &mb, gcontext); /* Perform the inverse redistribution of the matrix from grid 1 to grid 0 * (memory location ptrsavemyblock to ptrmyblockvide) */ Cpztrmr2d(uplo, diag, m, n, ptrsavemyblock, ib, jb, &mb, ptrmyblockvide, ia, ja, &ma, gcontext); /* Check the differences */ nberrors = 0; if (myprow0 >= 0 && mypcol0 >= 0) { /* only for the processors that do have data at the begining */ for (i = 0; i < blocksize0; i++) { int li, lj, gi, gj; int in; in = 1; li = i % ma.lda; lj = i / ma.lda; gi = (li / ma.nbrow) * p0 * ma.nbrow + SHIFT(myprow0, ma.sprow, p0) * ma.nbrow + li % ma.nbrow; gj = (lj / ma.nbcol) * q0 * ma.nbcol + SHIFT(mypcol0, ma.spcol, q0) * ma.nbcol + lj % ma.nbcol; assert(gi < ma.m && gj < ma.n); gi -= (ia - 1); gj -= (ja - 1); if (gi < 0 || gj < 0 || gi >= m || gj >= n) in = 0; else if (toupper(*uplo) == 'U') in = (gi <= gj + max(0, m - n) - (toupper(*diag) == 'U')); else in = (gi >= gj - max(0, n - m) + (toupper(*diag) == 'U')); if (!in) { ptrmyblockcopy[i].r = -1; } if (ptrmyblockvide[i].r != ptrmyblockcopy[i].r) { nberrors++; printf("Proc %d : Error element number %d, value = %f , initvalue =%f \n" ,mypnum, i, ptrmyblockvide[i].r, ptrmyblockcopy[i].r); }; }; if (nberrors > 0) { printf("Processor %d, has tested %d COMPLEX*16 elements,\ Number of redistribution errors = %d \n", mypnum, blocksize0, nberrors); } } /* Look at the errors on all the processors at this point. */ Cigsum2d(gcontext, "All", "H", 1, 1, &nberrors, 1, 0, 0); if (mypnum == 0) if (nberrors) printf(" => Total number of redistribution errors = %d \n", nberrors); else printf("TEST PASSED OK\n"); /* release memory for the next iteration */ if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrmyblock); freememory((char *) ptrmyblockvide); freememory((char *) ptrmyblockcopy); }; /* if (mypnum < p0 * q0) */ /* release memory for the next iteration */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrsavemyblock); }; if (myprow0 >= 0) Cblacs_gridexit(context0); if (myprow1 >= 0) Cblacs_gridexit(context1); }; /* while nbre != 0 */ if (mypnum == 0) { fclose(fp); }; Cblacs_exit(0); return 0; }/* main */ scalapack-2.0.2/REDIST/TESTING/redist.h000644 000766 000024 00000000166 10363532303 017464 0ustar00juliestaff000000 000000 #ifdef T3D #define float double #endif #ifdef T3E #define float double #endif #ifdef CRAY #define float double #endif scalapack-2.0.2/REDIST/TESTING/TRMR2D.dat000644 000766 000024 00000002314 10363532303 017462 0ustar00juliestaff000000 000000 # test file for SCALAPACK routine TRMR2D 10 # number of tests # m n m0 n0 sr0 sc0 i0 j0 p0 q0 nbr0 nbc0 m1 n1 sr1 sc1 i1 j1 p1 q1 nbr1 nbc1 UP UNIT 12 20 81 79 0 0 18 52 1 1 6 8 56 103 1 1 42 34 2 2 8 8 1 0 59 79 98 100 0 1 13 22 1 2 8 1 62 173 0 0 4 25 1 2 9 8 1 0 22 25 87 121 0 1 17 15 2 2 12 51 90 157 1 1 19 11 2 2 36 91 1 0 1 13 1 109 0 0 1 51 2 2 8 9 134 123 0 1 15 74 1 2 16 8 0 0 129 11 187 74 0 0 1 11 2 2 7 8 185 94 0 0 27 20 2 1 8 21 1 1 43 10 45 27 1 0 2 4 2 1 10 2 149 96 0 0 78 70 1 1 9 9 1 0 1 27 80 29 0 1 72 1 2 2 19 8 41 43 1 0 18 10 2 1 8 9 0 0 29 5 37 40 0 1 1 29 1 2 17 9 46 86 0 0 5 81 2 1 7 9 0 1 59 25 91 151 0 1 20 62 2 2 5 150 81 89 0 0 5 9 1 1 9 12 0 0 31 76 114 95 0 1 79 17 1 2 8 49 169 169 0 1 50 16 1 2 9 8 1 0 12 127 22 191 1 1 11 21 2 2 8 1 28 193 1 0 11 46 2 1 9 8 1 1 scalapack-2.0.2/REDIST/SRC/CMakeLists.txt000644 000766 000024 00000000607 11656312637 020107 0ustar00juliestaff000000 000000 set (ALLAUX pgemraux.c) set (IMRSRC pigemr.c pigemr2.c pitrmr.c pitrmr2.c) set (SMRSRC psgemr.c psgemr2.c pstrmr.c pstrmr2.c) set (CMRSRC pcgemr.c pcgemr2.c pctrmr.c pctrmr2.c) set (DMRSRC pdgemr.c pdgemr2.c pdtrmr.c pdtrmr2.c) set (ZMRSRC pzgemr.c pzgemr2.c pztrmr.c pztrmr2.c) set(redist ${ALLAUX} ${IMRSRC} ${SMRSRC} ${CMRSRC} ${DMRSRC} ${ZMRSRC}) scalapack-2.0.2/REDIST/SRC/Makefile000644 000766 000024 00000005412 11654025546 017004 0ustar00juliestaff000000 000000 ############################################################################ # # Program: ScaLAPACK Redistrib # # Module: Makefile # # Purpose: Redistribution Sources Makefile # # Creation date: March 20, 1995 # # Modified: February 15, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../SLmake.inc ####################################################################### # This is the makefile to create a library for redistribution. # The files are organized as follows: # ALLAUX -- Auxiliary routines called from all precisions # IMRSRC -- Integer REDIST routines # SMRSRC -- Single precision real REDIST routines # CMRSRC -- Single precision complex REDIST routines # DMRSRC -- Double precision real REDIST routines # ZMRSRC -- Double precision complex REDIST routines # # The library can be set up to include routines for any combination # of the four precisions. First, modify the ARCH, ARCHFLAGS, RANLIB, # CC and CFLAGS definitions in ../../SLmake.inc to match your library # archiver, compiler and the options to be used. # Then to create or add to the library, enter make followed by one or # more of the precisions desired. Some examples: # make single # make single complex # make single double complex complex16 # Alternatively, the command # make # without any arguments creates a library of all four precisions. # The name of the library is defined by ../../$(SCALAPACKLIB) in # ../../SLmake.inc. # # To remove the object files after the library is created, enter # make clean # ####################################################################### ALLAUX = pgemraux.o IMRSRC = pigemr.o pigemr2.o pitrmr.o pitrmr2.o SMRSRC = psgemr.o psgemr2.o pstrmr.o pstrmr2.o CMRSRC = pcgemr.o pcgemr2.o pctrmr.o pctrmr2.o DMRSRC = pdgemr.o pdgemr2.o pdtrmr.o pdtrmr2.o ZMRSRC = pzgemr.o pzgemr2.o pztrmr.o pztrmr2.o all: integer single complex double complex16 lib: all integer: $(IMRSRC) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(IMRSRC) $(ALLAUX) $(RANLIB) ../../$(SCALAPACKLIB) single: $(SMRSRC) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(SMRSRC) $(ALLAUX) $(RANLIB) ../../$(SCALAPACKLIB) complex: $(CMRSRC) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(CMRSRC) $(ALLAUX) $(RANLIB) ../../$(SCALAPACKLIB) double: $(DMRSRC) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(DMRSRC) $(ALLAUX) $(RANLIB) ../../$(SCALAPACKLIB) complex16: $(ZMRSRC) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(ZMRSRC) $(ALLAUX) $(RANLIB) ../../$(SCALAPACKLIB) clean : rm -f *.o .c.o : ; $(CC) -c $(CDEFS) $(CCFLAGS) $*.c scalapack-2.0.2/REDIST/SRC/pcgemr.c000644 000766 000024 00000054421 10363532303 016757 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: pcgemr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PCGEMR2D( M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PCGEMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) COMPLEX On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) COMPLEX On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pcgemr2do_ #define fortran_mr2dnew pcgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PCGEMR2D #define fortran_mr2d PCGEMR2DO #define ccopy_ CCOPY #define clacpy_ CLACPY #else #define fortran_mr2d pcgemr2do #define fortran_mr2dnew pcgemr2d #define ccopy_ ccopy #define clacpy_ clacpy #endif #define Clacpy Ccgelacpy void Clacpy(); typedef struct { float r, i; } complex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Ccgesd2d(); extern void Ccgerv2d(); /* lapack */ void clacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 cgescanD0 #define dispmat cgedispmat #define setmemory cgesetmemory #define freememory cgefreememory #define scan_intervals cgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpcgemr2do(); extern void Cpcgemr2d(); /* some defines for Cpcgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; complex *A, *B; { Cpcgemr2do(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; complex *A, *B; int *gcontext; { Cpcgemr2d(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpcgemr2do(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) complex *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpcgemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpcgemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) complex *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { complex *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; complex *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); sendsize = block2buff(v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock, ma, ptrsendbuff); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Ccgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); recvsize = inter_len(hinter_nb, h_inter, vinter_nb, v_inter); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Ccgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { buff2block(v_inter, vinter_nb, h_inter, hinter_nb, recvptr, ptrmynewblock, mb); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } #define Mlacpy(mo,no,ao,ldao,bo,ldbo) \ { \ complex *_a,*_b; \ int _m,_n,_lda,_ldb; \ int _i,_j; \ _m = (mo);_n = (no); \ _a = (ao);_b = (bo); \ _lda = (ldao) - _m; \ _ldb = (ldbo) - _m; \ assert(_lda >= 0 && _ldb >= 0); \ for (_j=0;_j<_n;_j++) { \ for (_i=0;_i<_m;_i++) \ *_b++ = *_a++; \ _b += _ldb; \ _a += _lda; \ } \ } static2 int block2buff(vi, vinb, hi, hinb, ptra, ma, buff) int hinb, vinb; IDESC *hi, *vi; MDESC *ma; complex *buff, *ptra; { int h, v, sizebuff; complex *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptra + hi[h].lstart * ma->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, ptr2 + vi[v].lstart, ma->lda, buff + sizebuff, vi[v].len); sizebuff += hi[h].len * vi[v].len; } } return sizebuff; } static2 void buff2block(vi, vinb, hi, hinb, buff, ptrb, mb) int hinb, vinb; IDESC *hi, *vi; MDESC *mb; complex *buff, *ptrb; { int h, v, sizebuff; complex *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptrb + hi[h].lstart * mb->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, buff + sizebuff, vi[v].len, ptr2 + vi[v].lstart, mb->lda); sizebuff += hi[h].len * vi[v].len; } } } static2 int inter_len(hinb, hi, vinb, vi) int hinb, vinb; IDESC *hi, *vi; { int hlen, vlen, h, v; hlen = 0; for (h = 0; h < hinb; h++) hlen += hi[h].len; vlen = 0; for (v = 0; v < vinb; v++) vlen += vi[v].len; return hlen * vlen; } void Clacpy(m, n, a, lda, b, ldb) complex *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/pcgemr2.c000644 000766 000024 00000013427 10363532303 017042 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pcgemr2.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ * * some functions used by the pcgemr2d routine see file pcgemr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pcgemr2do_ #define fortran_mr2dnew pcgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PCGEMR2D #define fortran_mr2d PCGEMR2DO #define ccopy_ CCOPY #define clacpy_ CLACPY #else #define fortran_mr2d pcgemr2do #define fortran_mr2dnew pcgemr2d #define ccopy_ ccopy #define clacpy_ clacpy #endif #define Clacpy Ccgelacpy void Clacpy(); typedef struct { float r, i; } complex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Ccgesd2d(); extern void Ccgerv2d(); /* lapack */ void clacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 cgescanD0 #define dispmat cgedispmat #define setmemory cgesetmemory #define freememory cgefreememory #define scan_intervals cgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpcgemr2do(); extern void Cpcgemr2d(); /* some defines for Cpcgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) complex **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (complex *) mr2d_malloc( blocksize * sizeof(complex)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) complex *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern ccopy_(); */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpcgemr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].lstart = l + start - j0; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } scalapack-2.0.2/REDIST/SRC/pctrmr.c000644 000766 000024 00000053562 10363532303 017016 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: pctrmr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PCTRMR2D(UPLO, DIAG, M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PCTRMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. The submatrix to be copied is assumed to be trapezoidal. So only the upper or the lower part will be copied. The other part is unchanged. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== UPLO (input) CHARACTER*1. On entry, UPLO specifies whether we should copy the upper part of the lower part of the defined submatrix: UPLO = 'U' or 'u' copy the upper triangular part. UPLO = 'L' or 'l' copy the lower triangular part. Unchanged on exit. DIAG (input) CHARACTER*1. On entry, DIAG specifies whether we should copy the diagonal. DIAG = 'U' or 'u' do NOT copy the diagonal of the submatrix. DIAG = 'N' or 'n' DO copy the diagonal of the submatrix. Unchanged on exit. M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) COMPLEX On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) COMPLEX On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pctrmr2do_ #define fortran_mr2dnew pctrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PCTRMR2D #define fortran_mr2d PCTRMR2DO #define ccopy_ CCOPY #define clacpy_ CLACPY #else #define fortran_mr2d pctrmr2do #define fortran_mr2dnew pctrmr2d #define ccopy_ ccopy #define clacpy_ clacpy #endif #define Clacpy Cctrlacpy void Clacpy(); typedef struct { float r, i; } complex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Ccgesd2d(); extern void Ccgerv2d(); /* lapack */ void clacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 ctrscanD0 #define dispmat ctrdispmat #define setmemory ctrsetmemory #define freememory ctrfreememory #define scan_intervals ctrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpctrmr2do(); extern void Cpctrmr2d(); /* some defines for Cpctrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; complex *A, *B; { Cpctrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; complex *A, *B; int *gcontext; { Cpctrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpctrmr2do(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) char *uplo, *diag; complex *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpctrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpctrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) char *uplo, *diag; complex *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { complex *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; complex *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Ccgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Ccgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } void Clacpy(m, n, a, lda, b, ldb) complex *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/pctrmr2.c000644 000766 000024 00000026332 10363532303 017073 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pctrmr2.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $ * * some functions used by the pctrmr2d routine see file pctrmr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pctrmr2do_ #define fortran_mr2dnew pctrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PCTRMR2D #define fortran_mr2d PCTRMR2DO #define ccopy_ CCOPY #define clacpy_ CLACPY #else #define fortran_mr2d pctrmr2do #define fortran_mr2dnew pctrmr2d #define ccopy_ ccopy #define clacpy_ clacpy #endif #define Clacpy Cctrlacpy void Clacpy(); typedef struct { float r, i; } complex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Ccgesd2d(); extern void Ccgerv2d(); /* lapack */ void clacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 ctrscanD0 #define dispmat ctrdispmat #define setmemory ctrsetmemory #define freememory ctrfreememory #define scan_intervals ctrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpctrmr2do(); extern void Cpctrmr2d(); /* some defines for Cpctrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) complex **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (complex *) mr2d_malloc( blocksize * sizeof(complex)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) complex *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern ccopy_(); */ /**************************************************************/ /* return the number of elements int the column after i and the distance of * the first one from i, i,j can be negative out of borns, the number of * elements returned can be negative (means 0) */ static2 int insidemat(uplo, diag, i, j, m, n, offset) int m, n, i, j; /* coordonnees de depart, taille de la sous-matrice */ char *uplo, *diag; int *offset; { /* tests outside mxn */ assert(j >= 0 && j < n); assert(i >= 0); if (toupper(*uplo) == 'U') { int nbline; /* number of lines in the j_th column */ int virtualnbline; /* number of line if we were not limited by m */ *offset = 0; virtualnbline = max(m - n, 0) + j + (toupper(*diag) == 'N'); nbline = min(virtualnbline, m); return nbline - i; } else { int firstline; /* first line in the j_th column */ int diagcol; /* column where the diag begin */ int virtualline; /* virtual first line if the matrix was extended with * negative indices */ int off; diagcol = max(n - m, 0);; virtualline = j - diagcol + (toupper(*diag) == 'U'); firstline = max(0, virtualline); off = max(firstline - i, 0); *offset = off; i += off; return m - i; } }/* insidemat() */ /********************************************************************/ /* Execute an action on the local memories when an intersection occurs (the * action can be the filling of the memory buffer, the count of the memory * buffer size or the setting of the memory with the element received) */ static2 void intersect(uplo, diag, j, start, end, action, ptrsizebuff, pptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1) int action, *ptrsizebuff; int j, start, end; complex **pptrbuff, *ptrblock; int templateheight0, templatewidth0; int templateheight1, templatewidth1; MDESC *ma, *mb; int ia, ja, ib, jb, m, n; char *uplo, *diag; /* Execute the action on the local memory for the current interval and * increment pptrbuff and ptrsizebuff of the intervalsize */ /* Notice that if the interval is contigous in the virtual matrice, it is * also contigous in the real one ! */ { /* int un = 1; only when we use dcopy instead of memcpy */ complex *ptrstart; int offset, nbline; int intervalsize; assert(start < end); assert(j >= 0 && j < n); nbline = insidemat(uplo, diag, start, j, m, n, &offset); if (nbline <= 0) return; start += offset; if (start >= end) return; intervalsize = min(end - start, nbline); (*ptrsizebuff) += intervalsize; switch (action) { case SENDBUFF: /* fill buff with local elements to be sent */ ptrstart = ptrblock + localindice(start + ia, j + ja, templateheight0, templatewidth0, ma); memcpy((char *) (*pptrbuff), (char *) ptrstart, intervalsize * sizeof(complex)); /* ccopy_(&intervalsize, (char *) (ptrstart), &un, (char *) (*pptrbuff), * &un); */ (*pptrbuff) += intervalsize; break; case RECVBUFF: /* fill local memory with the values received */ ptrstart = ptrblock + localindice(start + ib, j + jb, templateheight1, templatewidth1, mb); memcpy((char *) ptrstart, (char *) (*pptrbuff), intervalsize * sizeof(complex)); /* ccopy_(&intervalsize, (char *) (*pptrbuff), &un, (char *) (ptrstart), * &un); */ (*pptrbuff) += intervalsize; break; case SIZEBUFF: /* computation of sizebuff */ break; default: printf("action is %d outside the scope of the case [0..2] !! \n ", action); exit(0); break; }; /* switch (action) */ }/* intersect() */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpctrmr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].gstart = start; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } /*********************************************************************/ /* Do the scanning of intervals and the requested action */ void scanD0(uplo, diag, action, ptrbuff, ptrsizebuff, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrblock) int action, /* # of the action done on the intersected intervals */ *ptrsizebuff; /* size of the communication ptrbuffer (chosen to be * an output parameter in every cases) */ complex *ptrbuff /* address of the communication ptrbuffer (a * suffisant memory space is supposed to be allocated before the call) */ , *ptrblock; int p0, q0, p1, q1; IDESC *v_inter, *h_inter; int vinter_nb, hinter_nb; int m, n; int ia, ja, ib, jb; MDESC *ma, *mb; char *uplo, *diag; {/* Rmk: the a+au type addresses are strict bounds as a+au does not belong to * the [a..a+au-1] interval of length au */ int templateheight1, templatewidth1; int templateheight0, templatewidth0; int h, v; /* for scanning the intervals */ /* initializations */ templateheight1 = p1 * mb->nbrow; templateheight0 = p0 * ma->nbrow; templatewidth1 = q1 * mb->nbcol; templatewidth0 = q0 * ma->nbcol; /* we now will deal will logical grids, that's to say we change our * numbering of processors so that (0,0) begin on logical processor (0,0) */ /* in case we will not enter the while loop */ (*ptrsizebuff) = 0; for (h = 0; h < hinter_nb; h++) for (v = 0; v < vinter_nb; v++) { int j; for (j = 0; j < h_inter[h].len; j++) intersect(uplo, diag, j + h_inter[h].gstart, v_inter[v].gstart, v_inter[v].gstart + v_inter[v].len, action, ptrsizebuff, &ptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1); } }/* scanD0() */ scalapack-2.0.2/REDIST/SRC/pdgemr.c000644 000766 000024 00000054351 10363532303 016762 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: pdgemr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PDGEMR2D( M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PDGEMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) DOUBLE PRECISION On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) DOUBLE PRECISION On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pdgemr2do_ #define fortran_mr2dnew pdgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PDGEMR2D #define fortran_mr2d PDGEMR2DO #define dcopy_ DCOPY #define dlacpy_ DLACPY #else #define fortran_mr2d pdgemr2do #define fortran_mr2dnew pdgemr2d #define dcopy_ dcopy #define dlacpy_ dlacpy #endif #define Clacpy Cdgelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cdgesd2d(); extern void Cdgerv2d(); /* lapack */ void dlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 dgescanD0 #define dispmat dgedispmat #define setmemory dgesetmemory #define freememory dgefreememory #define scan_intervals dgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpdgemr2do(); extern void Cpdgemr2d(); /* some defines for Cpdgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; double *A, *B; { Cpdgemr2do(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; double *A, *B; int *gcontext; { Cpdgemr2d(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpdgemr2do(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) double *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpdgemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpdgemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) double *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { double *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; double *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); sendsize = block2buff(v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock, ma, ptrsendbuff); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Cdgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); recvsize = inter_len(hinter_nb, h_inter, vinter_nb, v_inter); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Cdgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { buff2block(v_inter, vinter_nb, h_inter, hinter_nb, recvptr, ptrmynewblock, mb); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } #define Mlacpy(mo,no,ao,ldao,bo,ldbo) \ { \ double *_a,*_b; \ int _m,_n,_lda,_ldb; \ int _i,_j; \ _m = (mo);_n = (no); \ _a = (ao);_b = (bo); \ _lda = (ldao) - _m; \ _ldb = (ldbo) - _m; \ assert(_lda >= 0 && _ldb >= 0); \ for (_j=0;_j<_n;_j++) { \ for (_i=0;_i<_m;_i++) \ *_b++ = *_a++; \ _b += _ldb; \ _a += _lda; \ } \ } static2 int block2buff(vi, vinb, hi, hinb, ptra, ma, buff) int hinb, vinb; IDESC *hi, *vi; MDESC *ma; double *buff, *ptra; { int h, v, sizebuff; double *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptra + hi[h].lstart * ma->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, ptr2 + vi[v].lstart, ma->lda, buff + sizebuff, vi[v].len); sizebuff += hi[h].len * vi[v].len; } } return sizebuff; } static2 void buff2block(vi, vinb, hi, hinb, buff, ptrb, mb) int hinb, vinb; IDESC *hi, *vi; MDESC *mb; double *buff, *ptrb; { int h, v, sizebuff; double *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptrb + hi[h].lstart * mb->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, buff + sizebuff, vi[v].len, ptr2 + vi[v].lstart, mb->lda); sizebuff += hi[h].len * vi[v].len; } } } static2 int inter_len(hinb, hi, vinb, vi) int hinb, vinb; IDESC *hi, *vi; { int hlen, vlen, h, v; hlen = 0; for (h = 0; h < hinb; h++) hlen += hi[h].len; vlen = 0; for (v = 0; v < vinb; v++) vlen += vi[v].len; return hlen * vlen; } void Clacpy(m, n, a, lda, b, ldb) double *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/pdgemr2.c000644 000766 000024 00000013344 10363532303 017041 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pdgemr2.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ * * some functions used by the pdgemr2d routine see file pdgemr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pdgemr2do_ #define fortran_mr2dnew pdgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PDGEMR2D #define fortran_mr2d PDGEMR2DO #define dcopy_ DCOPY #define dlacpy_ DLACPY #else #define fortran_mr2d pdgemr2do #define fortran_mr2dnew pdgemr2d #define dcopy_ dcopy #define dlacpy_ dlacpy #endif #define Clacpy Cdgelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cdgesd2d(); extern void Cdgerv2d(); /* lapack */ void dlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 dgescanD0 #define dispmat dgedispmat #define setmemory dgesetmemory #define freememory dgefreememory #define scan_intervals dgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpdgemr2do(); extern void Cpdgemr2d(); /* some defines for Cpdgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) double **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (double *) mr2d_malloc( blocksize * sizeof(double)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) double *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern dcopy_(); */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpdgemr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].lstart = l + start - j0; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } scalapack-2.0.2/REDIST/SRC/pdtrmr.c000644 000766 000024 00000053517 10363532303 017017 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: pdtrmr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PDTRMR2D(UPLO, DIAG, M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PDTRMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. The submatrix to be copied is assumed to be trapezoidal. So only the upper or the lower part will be copied. The other part is unchanged. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== UPLO (input) CHARACTER*1. On entry, UPLO specifies whether we should copy the upper part of the lower part of the defined submatrix: UPLO = 'U' or 'u' copy the upper triangular part. UPLO = 'L' or 'l' copy the lower triangular part. Unchanged on exit. DIAG (input) CHARACTER*1. On entry, DIAG specifies whether we should copy the diagonal. DIAG = 'U' or 'u' do NOT copy the diagonal of the submatrix. DIAG = 'N' or 'n' DO copy the diagonal of the submatrix. Unchanged on exit. M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) DOUBLE PRECISION On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) DOUBLE PRECISION On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pdtrmr2do_ #define fortran_mr2dnew pdtrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PDTRMR2D #define fortran_mr2d PDTRMR2DO #define dcopy_ DCOPY #define dlacpy_ DLACPY #else #define fortran_mr2d pdtrmr2do #define fortran_mr2dnew pdtrmr2d #define dcopy_ dcopy #define dlacpy_ dlacpy #endif #define Clacpy Cdtrlacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cdgesd2d(); extern void Cdgerv2d(); /* lapack */ void dlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 dtrscanD0 #define dispmat dtrdispmat #define setmemory dtrsetmemory #define freememory dtrfreememory #define scan_intervals dtrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpdtrmr2do(); extern void Cpdtrmr2d(); /* some defines for Cpdtrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; double *A, *B; { Cpdtrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; double *A, *B; int *gcontext; { Cpdtrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpdtrmr2do(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) char *uplo, *diag; double *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpdtrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpdtrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) char *uplo, *diag; double *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { double *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; double *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Cdgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Cdgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } void Clacpy(m, n, a, lda, b, ldb) double *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/pdtrmr2.c000644 000766 000024 00000026242 10363532303 017074 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pdtrmr2.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ * * some functions used by the pdtrmr2d routine see file pdtrmr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pdtrmr2do_ #define fortran_mr2dnew pdtrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PDTRMR2D #define fortran_mr2d PDTRMR2DO #define dcopy_ DCOPY #define dlacpy_ DLACPY #else #define fortran_mr2d pdtrmr2do #define fortran_mr2dnew pdtrmr2d #define dcopy_ dcopy #define dlacpy_ dlacpy #endif #define Clacpy Cdtrlacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cdgesd2d(); extern void Cdgerv2d(); /* lapack */ void dlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 dtrscanD0 #define dispmat dtrdispmat #define setmemory dtrsetmemory #define freememory dtrfreememory #define scan_intervals dtrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpdtrmr2do(); extern void Cpdtrmr2d(); /* some defines for Cpdtrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) double **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (double *) mr2d_malloc( blocksize * sizeof(double)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) double *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern dcopy_(); */ /**************************************************************/ /* return the number of elements int the column after i and the distance of * the first one from i, i,j can be negative out of borns, the number of * elements returned can be negative (means 0) */ static2 int insidemat(uplo, diag, i, j, m, n, offset) int m, n, i, j; /* coordonnees de depart, taille de la sous-matrice */ char *uplo, *diag; int *offset; { /* tests outside mxn */ assert(j >= 0 && j < n); assert(i >= 0); if (toupper(*uplo) == 'U') { int nbline; /* number of lines in the j_th column */ int virtualnbline; /* number of line if we were not limited by m */ *offset = 0; virtualnbline = max(m - n, 0) + j + (toupper(*diag) == 'N'); nbline = min(virtualnbline, m); return nbline - i; } else { int firstline; /* first line in the j_th column */ int diagcol; /* column where the diag begin */ int virtualline; /* virtual first line if the matrix was extended with * negative indices */ int off; diagcol = max(n - m, 0);; virtualline = j - diagcol + (toupper(*diag) == 'U'); firstline = max(0, virtualline); off = max(firstline - i, 0); *offset = off; i += off; return m - i; } }/* insidemat() */ /********************************************************************/ /* Execute an action on the local memories when an intersection occurs (the * action can be the filling of the memory buffer, the count of the memory * buffer size or the setting of the memory with the element received) */ static2 void intersect(uplo, diag, j, start, end, action, ptrsizebuff, pptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1) int action, *ptrsizebuff; int j, start, end; double **pptrbuff, *ptrblock; int templateheight0, templatewidth0; int templateheight1, templatewidth1; MDESC *ma, *mb; int ia, ja, ib, jb, m, n; char *uplo, *diag; /* Execute the action on the local memory for the current interval and * increment pptrbuff and ptrsizebuff of the intervalsize */ /* Notice that if the interval is contigous in the virtual matrice, it is * also contigous in the real one ! */ { /* int un = 1; only when we use dcopy instead of memcpy */ double *ptrstart; int offset, nbline; int intervalsize; assert(start < end); assert(j >= 0 && j < n); nbline = insidemat(uplo, diag, start, j, m, n, &offset); if (nbline <= 0) return; start += offset; if (start >= end) return; intervalsize = min(end - start, nbline); (*ptrsizebuff) += intervalsize; switch (action) { case SENDBUFF: /* fill buff with local elements to be sent */ ptrstart = ptrblock + localindice(start + ia, j + ja, templateheight0, templatewidth0, ma); memcpy((char *) (*pptrbuff), (char *) ptrstart, intervalsize * sizeof(double)); /* dcopy_(&intervalsize, (char *) (ptrstart), &un, (char *) (*pptrbuff), * &un); */ (*pptrbuff) += intervalsize; break; case RECVBUFF: /* fill local memory with the values received */ ptrstart = ptrblock + localindice(start + ib, j + jb, templateheight1, templatewidth1, mb); memcpy((char *) ptrstart, (char *) (*pptrbuff), intervalsize * sizeof(double)); /* dcopy_(&intervalsize, (char *) (*pptrbuff), &un, (char *) (ptrstart), * &un); */ (*pptrbuff) += intervalsize; break; case SIZEBUFF: /* computation of sizebuff */ break; default: printf("action is %d outside the scope of the case [0..2] !! \n ", action); exit(0); break; }; /* switch (action) */ }/* intersect() */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpdtrmr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].gstart = start; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } /*********************************************************************/ /* Do the scanning of intervals and the requested action */ void scanD0(uplo, diag, action, ptrbuff, ptrsizebuff, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrblock) int action, /* # of the action done on the intersected intervals */ *ptrsizebuff; /* size of the communication ptrbuffer (chosen to be * an output parameter in every cases) */ double *ptrbuff /* address of the communication ptrbuffer (a * suffisant memory space is supposed to be allocated before the call) */ , *ptrblock; int p0, q0, p1, q1; IDESC *v_inter, *h_inter; int vinter_nb, hinter_nb; int m, n; int ia, ja, ib, jb; MDESC *ma, *mb; char *uplo, *diag; {/* Rmk: the a+au type addresses are strict bounds as a+au does not belong to * the [a..a+au-1] interval of length au */ int templateheight1, templatewidth1; int templateheight0, templatewidth0; int h, v; /* for scanning the intervals */ /* initializations */ templateheight1 = p1 * mb->nbrow; templateheight0 = p0 * ma->nbrow; templatewidth1 = q1 * mb->nbcol; templatewidth0 = q0 * ma->nbcol; /* we now will deal will logical grids, that's to say we change our * numbering of processors so that (0,0) begin on logical processor (0,0) */ /* in case we will not enter the while loop */ (*ptrsizebuff) = 0; for (h = 0; h < hinter_nb; h++) for (v = 0; v < vinter_nb; v++) { int j; for (j = 0; j < h_inter[h].len; j++) intersect(uplo, diag, j + h_inter[h].gstart, v_inter[v].gstart, v_inter[v].gstart + v_inter[v].len, action, ptrsizebuff, &ptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1); } }/* scanD0() */ scalapack-2.0.2/REDIST/SRC/pgemraux.c000644 000766 000024 00000021336 10363532303 017331 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pgemraux.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $ * * some functions used by the pigemr2d routine see file pigemr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pigemr2do_ #define fortran_mr2dnew pigemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PIGEMR2D #define fortran_mr2d PIGEMR2DO #define icopy_ ICOPY #define ilacpy_ ILACPY #else #define fortran_mr2d pigemr2do #define fortran_mr2dnew pigemr2d #define icopy_ icopy #define ilacpy_ ilacpy #endif #define Clacpy Cigelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cigesd2d(); extern void Cigerv2d(); /* lapack */ void ilacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 igescanD0 #define dispmat igedispmat #define setmemory igesetmemory #define freememory igefreememory #define scan_intervals igescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpigemr2do(); extern void Cpigemr2d(); /* some defines for Cpigemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include void * mr2d_malloc(n) int n; { void *ptr; assert(n > 0); ptr = (void *) malloc(n); if (ptr == NULL) { fprintf(stderr, "xxmr2d:out of memory\n"); exit(2); } return ptr; } int pgcd(a, b) int a, b; { int aux; if (a < b) return pgcd(b, a); else { aux = a % b; if (aux == 0) return b; else return pgcd(b, aux); } } int ppcm(a, b) int a, b; { int pg; pg = pgcd(a, b); return a * (b / pg); } /* localsize:return the number of rows on the local processor given by its * row number myprow, of a distributed matrix with m rows distributed of on a * grid of processors with p rows with blocksize nbrow : this procedure can * also be used to compute the number of cols by replacing rows by cols */ int localsize(myprow, p, nbrow, m) int myprow, p, nbrow, m; { int templateheight, blockheight; templateheight = p * nbrow; if (m % templateheight != 0) { /* not an exact boundary */ if ((m % templateheight) > (nbrow * myprow)) { /* processor * (myprow,mypcol) has * some elements in that * incomplete template */ if ((m % templateheight) >= (nbrow * (myprow + 1))) { /* processor * (myprow,mypcol)'s * part is complete */ blockheight = (m / templateheight) * nbrow + nbrow; } else { /* processor (myprow,mypcol)'s part is not complete */ blockheight = (m / templateheight) * nbrow + (m % nbrow); }; /* if ((m%templateheight) > (nbrow*(myprow+1))) */ } else { /* processor (myprow,mypcol) has no element in that * incomplete template */ blockheight = (m / templateheight) * nbrow; }; /* if ((m%templateheight) > (nbrow*myprow)) */ } else { /* exact boundary */ blockheight = m / p; /* (m/templateheight) * nbrow */ }; /* if (m%templateheight !=0) */ return blockheight; } /****************************************************************/ /* Returns the exact memory block size corresponding to the parameters */ int memoryblocksize(a) MDESC *a; { int myprow, mypcol, p, q; /* Compute the (myprow,mypcol) indices of processor mypnum in P0xQ0 We * assume the row-major ordering of the BLACS */ Cblacs_gridinfo(a->ctxt, &p, &q, &myprow, &mypcol); myprow = SHIFT(myprow, a->sprow, p); mypcol = SHIFT(mypcol, a->spcol, q); assert(myprow >= 0 && mypcol >= 0); return localsize(myprow, p, a->nbrow, a->m) * localsize(mypcol, q, a->nbcol, a->n); } void checkequal(ctxt, a) int a, ctxt; { int np, dummy, nbrow, myp, b; Cblacs_gridinfo(ctxt, &nbrow, &np, &dummy, &myp); assert(nbrow == 1); if (np == 1) return; if (myp == 0) { Cigesd2d(ctxt, 1, 1, &a, 1, 0, 1); Cigerv2d(ctxt, 1, 1, &b, 1, 0, np - 1); assert(a == b); } else { Cigerv2d(ctxt, 1, 1, &b, 1, 0, myp - 1); assert(a == b); Cigesd2d(ctxt, 1, 1, &a, 1, 0, (myp + 1) % np); } } void paramcheck(a, i, j, m, n, p, q, gcontext) MDESC *a; int i, j, m, n, p, q; { int p2, q2, myprow, mypcol; #ifndef NDEBUG checkequal(gcontext, p); checkequal(gcontext, q); checkequal(gcontext, a->sprow); checkequal(gcontext, a->spcol); checkequal(gcontext, a->m); checkequal(gcontext, a->n); checkequal(gcontext, i); checkequal(gcontext, j); checkequal(gcontext, a->nbrow); checkequal(gcontext, a->nbcol); #endif Cblacs_gridinfo(a->ctxt, &p2, &q2, &myprow, &mypcol); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow >= p2 || mypcol >= q2) myprow = mypcol = -1; if ((myprow >= 0 || mypcol >= 0) && (p2 != p && q2 != q)) { fprintf(stderr, "??MR2D:incoherent p,q parameters\n"); exit(1); } assert(myprow < p && mypcol < q); if (a->sprow < 0 || a->sprow >= p || a->spcol < 0 || a->spcol >= q) { fprintf(stderr, "??MR2D:Bad first processor coordinates\n"); exit(1); } if (i < 0 || j < 0 || i + m > a->m || j + n > a->n) { fprintf(stderr, "??MR2D:Bad submatrix:i=%d,j=%d,\ m=%d,n=%d,M=%d,N=%d\n", i, j, m, n, a->m, a->n); exit(1); } if ((myprow >= 0 || mypcol >= 0) && localsize(SHIFT(myprow, a->sprow, p), p, a->nbrow, a->m) > a->lda) { fprintf(stderr, "??MR2D:bad lda arg:row=%d,m=%d,p=%d,\ nbrow=%d,lda=%d,sprow=%d\n", myprow, a->m, p, a->nbrow, a->lda, a->sprow); exit(1); } } /* to change from the submatrix beginning at line i to one beginning at line * i' with i'< blocksize return the line number on the local process where * the new matrix begin, the new process number, and i' */ int changeorigin(myp, sp, p, bs, i, decal, newsp) int myp, sp, p, bs, i; int *decal, *newsp; { int tempheight, firstblock, firsttemp; /* we begin by changing the parameters so that ia < templatewidth,... */ tempheight = bs * p; firsttemp = i / tempheight; firstblock = (i / bs) % p; *newsp = (sp + firstblock) % p; if (myp >= 0) *decal = firsttemp * bs + (SHIFT(myp, sp, p) < firstblock ? bs : 0); else *decal = 0; return i % bs; } /******************************************************************/ /* Return the indice in local memory of element of indice a in the matrix */ int localindice(ig, jg, templateheight, templatewidth, a) int templateheight, templatewidth, ig, jg; MDESC *a; /* Return the indice in local memory (scattered distribution) of the element * of indice a in global matrix */ { int vtemp, htemp, vsubtemp, hsubtemp, il, jl; assert(ig >= 0 && ig < a->m && jg >= 0 && jg < a->n); /* coordinates in global matrix with the tests in intersect, ig MUST BE in * [0..m] and jg in [0..n] */ /* coordinates of the template that "owns" the element */ vtemp = ig / templateheight; htemp = jg / templatewidth; /* coordinates of the element in the subblock of the (vtemp, htemp) * template */ vsubtemp = ig % a->nbrow; hsubtemp = jg % a->nbcol; /* coordinates of the element in the local block of the processor */ il = a->nbrow * vtemp + vsubtemp; jl = a->nbcol * htemp + hsubtemp; assert(il < a->lda); #ifndef NDEBUG { int pr, pc, p, q, lp, lq; Cblacs_gridinfo(a->ctxt, &p, &q, &pr, &pc); p = templateheight / a->nbrow; q = templatewidth / a->nbcol; lp = ig % templateheight / a->nbrow; lq = jg % templatewidth / a->nbcol; assert(lp == SHIFT(pr, a->sprow, p)); assert(lq == SHIFT(pc, a->spcol, q)); } #endif return (jl * a->lda + il); } scalapack-2.0.2/REDIST/SRC/pigemr.c000644 000766 000024 00000054276 10363532303 016775 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: pigemr.c,v 1.1.1.1 2000/02/15 18:04:08 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PIGEMR2D( M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PIGEMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) INTEGER On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) INTEGER On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pigemr2do_ #define fortran_mr2dnew pigemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PIGEMR2D #define fortran_mr2d PIGEMR2DO #define icopy_ ICOPY #define ilacpy_ ILACPY #else #define fortran_mr2d pigemr2do #define fortran_mr2dnew pigemr2d #define icopy_ icopy #define ilacpy_ ilacpy #endif #define Clacpy Cigelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cigesd2d(); extern void Cigerv2d(); /* lapack */ void ilacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 igescanD0 #define dispmat igedispmat #define setmemory igesetmemory #define freememory igefreememory #define scan_intervals igescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpigemr2do(); extern void Cpigemr2d(); /* some defines for Cpigemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; int *A, *B; { Cpigemr2do(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; int *A, *B; int *gcontext; { Cpigemr2d(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpigemr2do(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) int *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpigemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpigemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) int *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { int *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; int *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); sendsize = block2buff(v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock, ma, ptrsendbuff); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Cigesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); recvsize = inter_len(hinter_nb, h_inter, vinter_nb, v_inter); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Cigerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { buff2block(v_inter, vinter_nb, h_inter, hinter_nb, recvptr, ptrmynewblock, mb); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } #define Mlacpy(mo,no,ao,ldao,bo,ldbo) \ { \ int *_a,*_b; \ int _m,_n,_lda,_ldb; \ int _i,_j; \ _m = (mo);_n = (no); \ _a = (ao);_b = (bo); \ _lda = (ldao) - _m; \ _ldb = (ldbo) - _m; \ assert(_lda >= 0 && _ldb >= 0); \ for (_j=0;_j<_n;_j++) { \ for (_i=0;_i<_m;_i++) \ *_b++ = *_a++; \ _b += _ldb; \ _a += _lda; \ } \ } static2 int block2buff(vi, vinb, hi, hinb, ptra, ma, buff) int hinb, vinb; IDESC *hi, *vi; MDESC *ma; int *buff, *ptra; { int h, v, sizebuff; int *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptra + hi[h].lstart * ma->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, ptr2 + vi[v].lstart, ma->lda, buff + sizebuff, vi[v].len); sizebuff += hi[h].len * vi[v].len; } } return sizebuff; } static2 void buff2block(vi, vinb, hi, hinb, buff, ptrb, mb) int hinb, vinb; IDESC *hi, *vi; MDESC *mb; int *buff, *ptrb; { int h, v, sizebuff; int *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptrb + hi[h].lstart * mb->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, buff + sizebuff, vi[v].len, ptr2 + vi[v].lstart, mb->lda); sizebuff += hi[h].len * vi[v].len; } } } static2 int inter_len(hinb, hi, vinb, vi) int hinb, vinb; IDESC *hi, *vi; { int hlen, vlen, h, v; hlen = 0; for (h = 0; h < hinb; h++) hlen += hi[h].len; vlen = 0; for (v = 0; v < vinb; v++) vlen += vi[v].len; return hlen * vlen; } void Clacpy(m, n, a, lda, b, ldb) int *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/pigemr2.c000644 000766 000024 00000013326 10363532303 017046 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pigemr2.c,v 1.1.1.1 2000/02/15 18:04:08 susan Exp $ * * some functions used by the pigemr2d routine see file pigemr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pigemr2do_ #define fortran_mr2dnew pigemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PIGEMR2D #define fortran_mr2d PIGEMR2DO #define icopy_ ICOPY #define ilacpy_ ILACPY #else #define fortran_mr2d pigemr2do #define fortran_mr2dnew pigemr2d #define icopy_ icopy #define ilacpy_ ilacpy #endif #define Clacpy Cigelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cigesd2d(); extern void Cigerv2d(); /* lapack */ void ilacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 igescanD0 #define dispmat igedispmat #define setmemory igesetmemory #define freememory igefreememory #define scan_intervals igescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpigemr2do(); extern void Cpigemr2d(); /* some defines for Cpigemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) int **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (int *) mr2d_malloc( blocksize * sizeof(int)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) int *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern icopy_(); */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpigemr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].lstart = l + start - j0; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } scalapack-2.0.2/REDIST/SRC/pitrmr.c000644 000766 000024 00000053457 10363532303 017027 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: pitrmr.c,v 1.1.1.1 2000/02/15 18:04:08 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PITRMR2D(UPLO, DIAG, M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PITRMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. The submatrix to be copied is assumed to be trapezoidal. So only the upper or the lower part will be copied. The other part is unchanged. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== UPLO (input) CHARACTER*1. On entry, UPLO specifies whether we should copy the upper part of the lower part of the defined submatrix: UPLO = 'U' or 'u' copy the upper triangular part. UPLO = 'L' or 'l' copy the lower triangular part. Unchanged on exit. DIAG (input) CHARACTER*1. On entry, DIAG specifies whether we should copy the diagonal. DIAG = 'U' or 'u' do NOT copy the diagonal of the submatrix. DIAG = 'N' or 'n' DO copy the diagonal of the submatrix. Unchanged on exit. M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) INTEGER On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) INTEGER On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pitrmr2do_ #define fortran_mr2dnew pitrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PITRMR2D #define fortran_mr2d PITRMR2DO #define icopy_ ICOPY #define ilacpy_ ILACPY #else #define fortran_mr2d pitrmr2do #define fortran_mr2dnew pitrmr2d #define icopy_ icopy #define ilacpy_ ilacpy #endif #define Clacpy Citrlacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cigesd2d(); extern void Cigerv2d(); /* lapack */ void ilacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 itrscanD0 #define dispmat itrdispmat #define setmemory itrsetmemory #define freememory itrfreememory #define scan_intervals itrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpitrmr2do(); extern void Cpitrmr2d(); /* some defines for Cpitrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; int *A, *B; { Cpitrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; int *A, *B; int *gcontext; { Cpitrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpitrmr2do(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) char *uplo, *diag; int *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpitrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpitrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) char *uplo, *diag; int *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { int *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; int *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Cigesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Cigerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } void Clacpy(m, n, a, lda, b, ldb) int *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/pitrmr2.c000644 000766 000024 00000026201 10363532303 017074 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pitrmr2.c,v 1.1.1.1 2000/02/15 18:04:08 susan Exp $ * * some functions used by the pitrmr2d routine see file pitrmr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pitrmr2do_ #define fortran_mr2dnew pitrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PITRMR2D #define fortran_mr2d PITRMR2DO #define icopy_ ICOPY #define ilacpy_ ILACPY #else #define fortran_mr2d pitrmr2do #define fortran_mr2dnew pitrmr2d #define icopy_ icopy #define ilacpy_ ilacpy #endif #define Clacpy Citrlacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cigesd2d(); extern void Cigerv2d(); /* lapack */ void ilacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 itrscanD0 #define dispmat itrdispmat #define setmemory itrsetmemory #define freememory itrfreememory #define scan_intervals itrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpitrmr2do(); extern void Cpitrmr2d(); /* some defines for Cpitrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) int **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (int *) mr2d_malloc( blocksize * sizeof(int)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) int *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern icopy_(); */ /**************************************************************/ /* return the number of elements int the column after i and the distance of * the first one from i, i,j can be negative out of borns, the number of * elements returned can be negative (means 0) */ static2 int insidemat(uplo, diag, i, j, m, n, offset) int m, n, i, j; /* coordonnees de depart, taille de la sous-matrice */ char *uplo, *diag; int *offset; { /* tests outside mxn */ assert(j >= 0 && j < n); assert(i >= 0); if (toupper(*uplo) == 'U') { int nbline; /* number of lines in the j_th column */ int virtualnbline; /* number of line if we were not limited by m */ *offset = 0; virtualnbline = max(m - n, 0) + j + (toupper(*diag) == 'N'); nbline = min(virtualnbline, m); return nbline - i; } else { int firstline; /* first line in the j_th column */ int diagcol; /* column where the diag begin */ int virtualline; /* virtual first line if the matrix was extended with * negative indices */ int off; diagcol = max(n - m, 0);; virtualline = j - diagcol + (toupper(*diag) == 'U'); firstline = max(0, virtualline); off = max(firstline - i, 0); *offset = off; i += off; return m - i; } }/* insidemat() */ /********************************************************************/ /* Execute an action on the local memories when an intersection occurs (the * action can be the filling of the memory buffer, the count of the memory * buffer size or the setting of the memory with the element received) */ static2 void intersect(uplo, diag, j, start, end, action, ptrsizebuff, pptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1) int action, *ptrsizebuff; int j, start, end; int **pptrbuff, *ptrblock; int templateheight0, templatewidth0; int templateheight1, templatewidth1; MDESC *ma, *mb; int ia, ja, ib, jb, m, n; char *uplo, *diag; /* Execute the action on the local memory for the current interval and * increment pptrbuff and ptrsizebuff of the intervalsize */ /* Notice that if the interval is contigous in the virtual matrice, it is * also contigous in the real one ! */ { /* int un = 1; only when we use dcopy instead of memcpy */ int *ptrstart; int offset, nbline; int intervalsize; assert(start < end); assert(j >= 0 && j < n); nbline = insidemat(uplo, diag, start, j, m, n, &offset); if (nbline <= 0) return; start += offset; if (start >= end) return; intervalsize = min(end - start, nbline); (*ptrsizebuff) += intervalsize; switch (action) { case SENDBUFF: /* fill buff with local elements to be sent */ ptrstart = ptrblock + localindice(start + ia, j + ja, templateheight0, templatewidth0, ma); memcpy((char *) (*pptrbuff), (char *) ptrstart, intervalsize * sizeof(int)); /* icopy_(&intervalsize, (char *) (ptrstart), &un, (char *) (*pptrbuff), * &un); */ (*pptrbuff) += intervalsize; break; case RECVBUFF: /* fill local memory with the values received */ ptrstart = ptrblock + localindice(start + ib, j + jb, templateheight1, templatewidth1, mb); memcpy((char *) ptrstart, (char *) (*pptrbuff), intervalsize * sizeof(int)); /* icopy_(&intervalsize, (char *) (*pptrbuff), &un, (char *) (ptrstart), * &un); */ (*pptrbuff) += intervalsize; break; case SIZEBUFF: /* computation of sizebuff */ break; default: printf("action is %d outside the scope of the case [0..2] !! \n ", action); exit(0); break; }; /* switch (action) */ }/* intersect() */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpitrmr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].gstart = start; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } /*********************************************************************/ /* Do the scanning of intervals and the requested action */ void scanD0(uplo, diag, action, ptrbuff, ptrsizebuff, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrblock) int action, /* # of the action done on the intersected intervals */ *ptrsizebuff; /* size of the communication ptrbuffer (chosen to be * an output parameter in every cases) */ int *ptrbuff /* address of the communication ptrbuffer (a suffisant memory space is supposed to be allocated before the call) */ , *ptrblock; int p0, q0, p1, q1; IDESC *v_inter, *h_inter; int vinter_nb, hinter_nb; int m, n; int ia, ja, ib, jb; MDESC *ma, *mb; char *uplo, *diag; {/* Rmk: the a+au type addresses are strict bounds as a+au does not belong to * the [a..a+au-1] interval of length au */ int templateheight1, templatewidth1; int templateheight0, templatewidth0; int h, v; /* for scanning the intervals */ /* initializations */ templateheight1 = p1 * mb->nbrow; templateheight0 = p0 * ma->nbrow; templatewidth1 = q1 * mb->nbcol; templatewidth0 = q0 * ma->nbcol; /* we now will deal will logical grids, that's to say we change our * numbering of processors so that (0,0) begin on logical processor (0,0) */ /* in case we will not enter the while loop */ (*ptrsizebuff) = 0; for (h = 0; h < hinter_nb; h++) for (v = 0; v < vinter_nb; v++) { int j; for (j = 0; j < h_inter[h].len; j++) intersect(uplo, diag, j + h_inter[h].gstart, v_inter[v].gstart, v_inter[v].gstart + v_inter[v].len, action, ptrsizebuff, &ptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1); } }/* scanD0() */ scalapack-2.0.2/REDIST/SRC/psgemr.c000644 000766 000024 00000054305 10363532303 017000 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: psgemr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PSGEMR2D( M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PSGEMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) REAL On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) REAL On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d psgemr2do_ #define fortran_mr2dnew psgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PSGEMR2D #define fortran_mr2d PSGEMR2DO #define scopy_ SCOPY #define slacpy_ SLACPY #else #define fortran_mr2d psgemr2do #define fortran_mr2dnew psgemr2d #define scopy_ scopy #define slacpy_ slacpy #endif #define Clacpy Csgelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Csgesd2d(); extern void Csgerv2d(); /* lapack */ void slacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 sgescanD0 #define dispmat sgedispmat #define setmemory sgesetmemory #define freememory sgefreememory #define scan_intervals sgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpsgemr2do(); extern void Cpsgemr2d(); /* some defines for Cpsgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; float *A, *B; { Cpsgemr2do(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; float *A, *B; int *gcontext; { Cpsgemr2d(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpsgemr2do(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) float *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpsgemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpsgemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) float *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { float *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; float *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); sendsize = block2buff(v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock, ma, ptrsendbuff); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Csgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); recvsize = inter_len(hinter_nb, h_inter, vinter_nb, v_inter); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Csgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { buff2block(v_inter, vinter_nb, h_inter, hinter_nb, recvptr, ptrmynewblock, mb); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } #define Mlacpy(mo,no,ao,ldao,bo,ldbo) \ { \ float *_a,*_b; \ int _m,_n,_lda,_ldb; \ int _i,_j; \ _m = (mo);_n = (no); \ _a = (ao);_b = (bo); \ _lda = (ldao) - _m; \ _ldb = (ldbo) - _m; \ assert(_lda >= 0 && _ldb >= 0); \ for (_j=0;_j<_n;_j++) { \ for (_i=0;_i<_m;_i++) \ *_b++ = *_a++; \ _b += _ldb; \ _a += _lda; \ } \ } static2 int block2buff(vi, vinb, hi, hinb, ptra, ma, buff) int hinb, vinb; IDESC *hi, *vi; MDESC *ma; float *buff, *ptra; { int h, v, sizebuff; float *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptra + hi[h].lstart * ma->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, ptr2 + vi[v].lstart, ma->lda, buff + sizebuff, vi[v].len); sizebuff += hi[h].len * vi[v].len; } } return sizebuff; } static2 void buff2block(vi, vinb, hi, hinb, buff, ptrb, mb) int hinb, vinb; IDESC *hi, *vi; MDESC *mb; float *buff, *ptrb; { int h, v, sizebuff; float *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptrb + hi[h].lstart * mb->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, buff + sizebuff, vi[v].len, ptr2 + vi[v].lstart, mb->lda); sizebuff += hi[h].len * vi[v].len; } } } static2 int inter_len(hinb, hi, vinb, vi) int hinb, vinb; IDESC *hi, *vi; { int hlen, vlen, h, v; hlen = 0; for (h = 0; h < hinb; h++) hlen += hi[h].len; vlen = 0; for (v = 0; v < vinb; v++) vlen += vi[v].len; return hlen * vlen; } void Clacpy(m, n, a, lda, b, ldb) float *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/psgemr2.c000644 000766 000024 00000013337 10363532303 017062 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: psgemr2.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ * * some functions used by the psgemr2d routine see file psgemr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d psgemr2do_ #define fortran_mr2dnew psgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PSGEMR2D #define fortran_mr2d PSGEMR2DO #define scopy_ SCOPY #define slacpy_ SLACPY #else #define fortran_mr2d psgemr2do #define fortran_mr2dnew psgemr2d #define scopy_ scopy #define slacpy_ slacpy #endif #define Clacpy Csgelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Csgesd2d(); extern void Csgerv2d(); /* lapack */ void slacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 sgescanD0 #define dispmat sgedispmat #define setmemory sgesetmemory #define freememory sgefreememory #define scan_intervals sgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpsgemr2do(); extern void Cpsgemr2d(); /* some defines for Cpsgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) float **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (float *) mr2d_malloc( blocksize * sizeof(float)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) float *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern scopy_(); */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpsgemr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].lstart = l + start - j0; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } scalapack-2.0.2/REDIST/SRC/pstrmr.c000644 000766 000024 00000053460 10363532303 017033 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: pstrmr.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PSTRMR2D(UPLO, DIAG, M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PSTRMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. The submatrix to be copied is assumed to be trapezoidal. So only the upper or the lower part will be copied. The other part is unchanged. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== UPLO (input) CHARACTER*1. On entry, UPLO specifies whether we should copy the upper part of the lower part of the defined submatrix: UPLO = 'U' or 'u' copy the upper triangular part. UPLO = 'L' or 'l' copy the lower triangular part. Unchanged on exit. DIAG (input) CHARACTER*1. On entry, DIAG specifies whether we should copy the diagonal. DIAG = 'U' or 'u' do NOT copy the diagonal of the submatrix. DIAG = 'N' or 'n' DO copy the diagonal of the submatrix. Unchanged on exit. M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) REAL On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) REAL On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pstrmr2do_ #define fortran_mr2dnew pstrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PSTRMR2D #define fortran_mr2d PSTRMR2DO #define scopy_ SCOPY #define slacpy_ SLACPY #else #define fortran_mr2d pstrmr2do #define fortran_mr2dnew pstrmr2d #define scopy_ scopy #define slacpy_ slacpy #endif #define Clacpy Cstrlacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Csgesd2d(); extern void Csgerv2d(); /* lapack */ void slacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 strscanD0 #define dispmat strdispmat #define setmemory strsetmemory #define freememory strfreememory #define scan_intervals strscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpstrmr2do(); extern void Cpstrmr2d(); /* some defines for Cpstrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; float *A, *B; { Cpstrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; float *A, *B; int *gcontext; { Cpstrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpstrmr2do(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) char *uplo, *diag; float *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpstrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpstrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) char *uplo, *diag; float *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { float *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; float *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Csgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Csgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } void Clacpy(m, n, a, lda, b, ldb) float *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/pstrmr2.c000644 000766 000024 00000026230 10363532303 017110 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pstrmr2.c,v 1.1.1.1 2000/02/15 18:04:09 susan Exp $ * * some functions used by the pstrmr2d routine see file pstrmr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pstrmr2do_ #define fortran_mr2dnew pstrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PSTRMR2D #define fortran_mr2d PSTRMR2DO #define scopy_ SCOPY #define slacpy_ SLACPY #else #define fortran_mr2d pstrmr2do #define fortran_mr2dnew pstrmr2d #define scopy_ scopy #define slacpy_ slacpy #endif #define Clacpy Cstrlacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Csgesd2d(); extern void Csgerv2d(); /* lapack */ void slacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 strscanD0 #define dispmat strdispmat #define setmemory strsetmemory #define freememory strfreememory #define scan_intervals strscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpstrmr2do(); extern void Cpstrmr2d(); /* some defines for Cpstrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) float **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (float *) mr2d_malloc( blocksize * sizeof(float)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) float *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern scopy_(); */ /**************************************************************/ /* return the number of elements int the column after i and the distance of * the first one from i, i,j can be negative out of borns, the number of * elements returned can be negative (means 0) */ static2 int insidemat(uplo, diag, i, j, m, n, offset) int m, n, i, j; /* coordonnees de depart, taille de la sous-matrice */ char *uplo, *diag; int *offset; { /* tests outside mxn */ assert(j >= 0 && j < n); assert(i >= 0); if (toupper(*uplo) == 'U') { int nbline; /* number of lines in the j_th column */ int virtualnbline; /* number of line if we were not limited by m */ *offset = 0; virtualnbline = max(m - n, 0) + j + (toupper(*diag) == 'N'); nbline = min(virtualnbline, m); return nbline - i; } else { int firstline; /* first line in the j_th column */ int diagcol; /* column where the diag begin */ int virtualline; /* virtual first line if the matrix was extended with * negative indices */ int off; diagcol = max(n - m, 0);; virtualline = j - diagcol + (toupper(*diag) == 'U'); firstline = max(0, virtualline); off = max(firstline - i, 0); *offset = off; i += off; return m - i; } }/* insidemat() */ /********************************************************************/ /* Execute an action on the local memories when an intersection occurs (the * action can be the filling of the memory buffer, the count of the memory * buffer size or the setting of the memory with the element received) */ static2 void intersect(uplo, diag, j, start, end, action, ptrsizebuff, pptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1) int action, *ptrsizebuff; int j, start, end; float **pptrbuff, *ptrblock; int templateheight0, templatewidth0; int templateheight1, templatewidth1; MDESC *ma, *mb; int ia, ja, ib, jb, m, n; char *uplo, *diag; /* Execute the action on the local memory for the current interval and * increment pptrbuff and ptrsizebuff of the intervalsize */ /* Notice that if the interval is contigous in the virtual matrice, it is * also contigous in the real one ! */ { /* int un = 1; only when we use dcopy instead of memcpy */ float *ptrstart; int offset, nbline; int intervalsize; assert(start < end); assert(j >= 0 && j < n); nbline = insidemat(uplo, diag, start, j, m, n, &offset); if (nbline <= 0) return; start += offset; if (start >= end) return; intervalsize = min(end - start, nbline); (*ptrsizebuff) += intervalsize; switch (action) { case SENDBUFF: /* fill buff with local elements to be sent */ ptrstart = ptrblock + localindice(start + ia, j + ja, templateheight0, templatewidth0, ma); memcpy((char *) (*pptrbuff), (char *) ptrstart, intervalsize * sizeof(float)); /* scopy_(&intervalsize, (char *) (ptrstart), &un, (char *) (*pptrbuff), * &un); */ (*pptrbuff) += intervalsize; break; case RECVBUFF: /* fill local memory with the values received */ ptrstart = ptrblock + localindice(start + ib, j + jb, templateheight1, templatewidth1, mb); memcpy((char *) ptrstart, (char *) (*pptrbuff), intervalsize * sizeof(float)); /* scopy_(&intervalsize, (char *) (*pptrbuff), &un, (char *) (ptrstart), * &un); */ (*pptrbuff) += intervalsize; break; case SIZEBUFF: /* computation of sizebuff */ break; default: printf("action is %d outside the scope of the case [0..2] !! \n ", action); exit(0); break; }; /* switch (action) */ }/* intersect() */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpstrmr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].gstart = start; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } /*********************************************************************/ /* Do the scanning of intervals and the requested action */ void scanD0(uplo, diag, action, ptrbuff, ptrsizebuff, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrblock) int action, /* # of the action done on the intersected intervals */ *ptrsizebuff; /* size of the communication ptrbuffer (chosen to be * an output parameter in every cases) */ float *ptrbuff /* address of the communication ptrbuffer (a * suffisant memory space is supposed to be allocated before the call) */ , *ptrblock; int p0, q0, p1, q1; IDESC *v_inter, *h_inter; int vinter_nb, hinter_nb; int m, n; int ia, ja, ib, jb; MDESC *ma, *mb; char *uplo, *diag; {/* Rmk: the a+au type addresses are strict bounds as a+au does not belong to * the [a..a+au-1] interval of length au */ int templateheight1, templatewidth1; int templateheight0, templatewidth0; int h, v; /* for scanning the intervals */ /* initializations */ templateheight1 = p1 * mb->nbrow; templateheight0 = p0 * ma->nbrow; templatewidth1 = q1 * mb->nbcol; templatewidth0 = q0 * ma->nbcol; /* we now will deal will logical grids, that's to say we change our * numbering of processors so that (0,0) begin on logical processor (0,0) */ /* in case we will not enter the while loop */ (*ptrsizebuff) = 0; for (h = 0; h < hinter_nb; h++) for (v = 0; v < vinter_nb; v++) { int j; for (j = 0; j < h_inter[h].len; j++) intersect(uplo, diag, j + h_inter[h].gstart, v_inter[v].gstart, v_inter[v].gstart + v_inter[v].len, action, ptrsizebuff, &ptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1); } }/* scanD0() */ scalapack-2.0.2/REDIST/SRC/pzgemr.c000644 000766 000024 00000054445 10363532303 017014 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: pzgemr.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PZGEMR2D( M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PZGEMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) COMPLEX*16 On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) COMPLEX*16 On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pzgemr2do_ #define fortran_mr2dnew pzgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PZGEMR2D #define fortran_mr2d PZGEMR2DO #define zcopy_ ZCOPY #define zlacpy_ ZLACPY #else #define fortran_mr2d pzgemr2do #define fortran_mr2dnew pzgemr2d #define zcopy_ zcopy #define zlacpy_ zlacpy #endif #define Clacpy Czgelacpy void Clacpy(); typedef struct { double r, i; } dcomplex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Czgesd2d(); extern void Czgerv2d(); /* lapack */ void zlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 zgescanD0 #define dispmat zgedispmat #define setmemory zgesetmemory #define freememory zgefreememory #define scan_intervals zgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpzgemr2do(); extern void Cpzgemr2d(); /* some defines for Cpzgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; dcomplex *A, *B; { Cpzgemr2do(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; dcomplex *A, *B; int *gcontext; { Cpzgemr2d(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpzgemr2do(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) dcomplex *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpzgemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpzgemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) dcomplex *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { dcomplex *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; dcomplex *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); sendsize = block2buff(v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock, ma, ptrsendbuff); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Czgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); recvsize = inter_len(hinter_nb, h_inter, vinter_nb, v_inter); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Czgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { buff2block(v_inter, vinter_nb, h_inter, hinter_nb, recvptr, ptrmynewblock, mb); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } #define Mlacpy(mo,no,ao,ldao,bo,ldbo) \ { \ dcomplex *_a,*_b; \ int _m,_n,_lda,_ldb; \ int _i,_j; \ _m = (mo);_n = (no); \ _a = (ao);_b = (bo); \ _lda = (ldao) - _m; \ _ldb = (ldbo) - _m; \ assert(_lda >= 0 && _ldb >= 0); \ for (_j=0;_j<_n;_j++) { \ for (_i=0;_i<_m;_i++) \ *_b++ = *_a++; \ _b += _ldb; \ _a += _lda; \ } \ } static2 int block2buff(vi, vinb, hi, hinb, ptra, ma, buff) int hinb, vinb; IDESC *hi, *vi; MDESC *ma; dcomplex *buff, *ptra; { int h, v, sizebuff; dcomplex *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptra + hi[h].lstart * ma->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, ptr2 + vi[v].lstart, ma->lda, buff + sizebuff, vi[v].len); sizebuff += hi[h].len * vi[v].len; } } return sizebuff; } static2 void buff2block(vi, vinb, hi, hinb, buff, ptrb, mb) int hinb, vinb; IDESC *hi, *vi; MDESC *mb; dcomplex *buff, *ptrb; { int h, v, sizebuff; dcomplex *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptrb + hi[h].lstart * mb->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, buff + sizebuff, vi[v].len, ptr2 + vi[v].lstart, mb->lda); sizebuff += hi[h].len * vi[v].len; } } } static2 int inter_len(hinb, hi, vinb, vi) int hinb, vinb; IDESC *hi, *vi; { int hlen, vlen, h, v; hlen = 0; for (h = 0; h < hinb; h++) hlen += hi[h].len; vlen = 0; for (v = 0; v < vinb; v++) vlen += vi[v].len; return hlen * vlen; } void Clacpy(m, n, a, lda, b, ldb) dcomplex *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/pzgemr2.c000644 000766 000024 00000013427 10363532303 017071 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pzgemr2.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $ * * some functions used by the pzgemr2d routine see file pzgemr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pzgemr2do_ #define fortran_mr2dnew pzgemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PZGEMR2D #define fortran_mr2d PZGEMR2DO #define zcopy_ ZCOPY #define zlacpy_ ZLACPY #else #define fortran_mr2d pzgemr2do #define fortran_mr2dnew pzgemr2d #define zcopy_ zcopy #define zlacpy_ zlacpy #endif #define Clacpy Czgelacpy void Clacpy(); typedef struct { double r, i; } dcomplex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Czgesd2d(); extern void Czgerv2d(); /* lapack */ void zlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 zgescanD0 #define dispmat zgedispmat #define setmemory zgesetmemory #define freememory zgefreememory #define scan_intervals zgescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpzgemr2do(); extern void Cpzgemr2d(); /* some defines for Cpzgemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) dcomplex **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (dcomplex *) mr2d_malloc( blocksize * sizeof(dcomplex)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) dcomplex *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern zcopy_(); */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpzgemr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].lstart = l + start - j0; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } scalapack-2.0.2/REDIST/SRC/pztrmr.c000644 000766 000024 00000053601 10363532303 017037 0ustar00juliestaff000000 000000 #include "redist.h" /** $Id: pztrmr.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PZTRMR2D(UPLO, DIAG, M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PZTRMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. The submatrix to be copied is assumed to be trapezoidal. So only the upper or the lower part will be copied. The other part is unchanged. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== UPLO (input) CHARACTER*1. On entry, UPLO specifies whether we should copy the upper part of the lower part of the defined submatrix: UPLO = 'U' or 'u' copy the upper triangular part. UPLO = 'L' or 'l' copy the lower triangular part. Unchanged on exit. DIAG (input) CHARACTER*1. On entry, DIAG specifies whether we should copy the diagonal. DIAG = 'U' or 'u' do NOT copy the diagonal of the submatrix. DIAG = 'N' or 'n' DO copy the diagonal of the submatrix. Unchanged on exit. M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) COMPLEX*16 On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) COMPLEX*16 On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pztrmr2do_ #define fortran_mr2dnew pztrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PZTRMR2D #define fortran_mr2d PZTRMR2DO #define zcopy_ ZCOPY #define zlacpy_ ZLACPY #else #define fortran_mr2d pztrmr2do #define fortran_mr2dnew pztrmr2d #define zcopy_ zcopy #define zlacpy_ zlacpy #endif #define Clacpy Cztrlacpy void Clacpy(); typedef struct { double r, i; } dcomplex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Czgesd2d(); extern void Czgerv2d(); /* lapack */ void zlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 ztrscanD0 #define dispmat ztrdispmat #define setmemory ztrsetmemory #define freememory ztrfreememory #define scan_intervals ztrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpztrmr2do(); extern void Cpztrmr2d(); /* some defines for Cpztrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; dcomplex *A, *B; { Cpztrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) char *uplo, *diag; int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; dcomplex *A, *B; int *gcontext; { Cpztrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpztrmr2do(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) char *uplo, *diag; dcomplex *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpztrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpztrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) char *uplo, *diag; dcomplex *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { dcomplex *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; dcomplex *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Czgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Czgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } void Clacpy(m, n, a, lda, b, ldb) dcomplex *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); } scalapack-2.0.2/REDIST/SRC/pztrmr2.c000644 000766 000024 00000026337 10363532303 017127 0ustar00juliestaff000000 000000 #include "redist.h" /* $Id: pztrmr2.c,v 1.1.1.1 2000/02/15 18:04:10 susan Exp $ * * some functions used by the pztrmr2d routine see file pztrmr.c for more * documentation. * * Created March 1993 by B. Tourancheau (See sccs for modifications). */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pztrmr2do_ #define fortran_mr2dnew pztrmr2d_ #elif defined(UpCase) #define fortran_mr2dnew PZTRMR2D #define fortran_mr2d PZTRMR2DO #define zcopy_ ZCOPY #define zlacpy_ ZLACPY #else #define fortran_mr2d pztrmr2do #define fortran_mr2dnew pztrmr2d #define zcopy_ zcopy #define zlacpy_ zlacpy #endif #define Clacpy Cztrlacpy void Clacpy(); typedef struct { double r, i; } dcomplex; typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int gstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Czgesd2d(); extern void Czgerv2d(); /* lapack */ void zlacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 ztrscanD0 #define dispmat ztrdispmat #define setmemory ztrsetmemory #define freememory ztrfreememory #define scan_intervals ztrscan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpztrmr2do(); extern void Cpztrmr2d(); /* some defines for Cpztrmr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #include #include /* Created March 1993 by B. Tourancheau (See sccs for modifications). */ /************************************************************************/ /* Set the memory space with the malloc function */ void setmemory(adpointer, blocksize) dcomplex **adpointer; int blocksize; { assert(blocksize >= 0); if (blocksize == 0) { *adpointer = NULL; return; } *adpointer = (dcomplex *) mr2d_malloc( blocksize * sizeof(dcomplex)); } /******************************************************************/ /* Free the memory space after the malloc */ void freememory(ptrtobefreed) dcomplex *ptrtobefreed; { if (ptrtobefreed == NULL) return; free((char *) ptrtobefreed); } /* extern functions for intersect() extern zcopy_(); */ /**************************************************************/ /* return the number of elements int the column after i and the distance of * the first one from i, i,j can be negative out of borns, the number of * elements returned can be negative (means 0) */ static2 int insidemat(uplo, diag, i, j, m, n, offset) int m, n, i, j; /* coordonnees de depart, taille de la sous-matrice */ char *uplo, *diag; int *offset; { /* tests outside mxn */ assert(j >= 0 && j < n); assert(i >= 0); if (toupper(*uplo) == 'U') { int nbline; /* number of lines in the j_th column */ int virtualnbline; /* number of line if we were not limited by m */ *offset = 0; virtualnbline = max(m - n, 0) + j + (toupper(*diag) == 'N'); nbline = min(virtualnbline, m); return nbline - i; } else { int firstline; /* first line in the j_th column */ int diagcol; /* column where the diag begin */ int virtualline; /* virtual first line if the matrix was extended with * negative indices */ int off; diagcol = max(n - m, 0);; virtualline = j - diagcol + (toupper(*diag) == 'U'); firstline = max(0, virtualline); off = max(firstline - i, 0); *offset = off; i += off; return m - i; } }/* insidemat() */ /********************************************************************/ /* Execute an action on the local memories when an intersection occurs (the * action can be the filling of the memory buffer, the count of the memory * buffer size or the setting of the memory with the element received) */ static2 void intersect(uplo, diag, j, start, end, action, ptrsizebuff, pptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1) int action, *ptrsizebuff; int j, start, end; dcomplex **pptrbuff, *ptrblock; int templateheight0, templatewidth0; int templateheight1, templatewidth1; MDESC *ma, *mb; int ia, ja, ib, jb, m, n; char *uplo, *diag; /* Execute the action on the local memory for the current interval and * increment pptrbuff and ptrsizebuff of the intervalsize */ /* Notice that if the interval is contigous in the virtual matrice, it is * also contigous in the real one ! */ { /* int un = 1; only when we use dcopy instead of memcpy */ dcomplex *ptrstart; int offset, nbline; int intervalsize; assert(start < end); assert(j >= 0 && j < n); nbline = insidemat(uplo, diag, start, j, m, n, &offset); if (nbline <= 0) return; start += offset; if (start >= end) return; intervalsize = min(end - start, nbline); (*ptrsizebuff) += intervalsize; switch (action) { case SENDBUFF: /* fill buff with local elements to be sent */ ptrstart = ptrblock + localindice(start + ia, j + ja, templateheight0, templatewidth0, ma); memcpy((char *) (*pptrbuff), (char *) ptrstart, intervalsize * sizeof(dcomplex)); /* zcopy_(&intervalsize, (char *) (ptrstart), &un, (char *) (*pptrbuff), * &un); */ (*pptrbuff) += intervalsize; break; case RECVBUFF: /* fill local memory with the values received */ ptrstart = ptrblock + localindice(start + ib, j + jb, templateheight1, templatewidth1, mb); memcpy((char *) ptrstart, (char *) (*pptrbuff), intervalsize * sizeof(dcomplex)); /* zcopy_(&intervalsize, (char *) (*pptrbuff), &un, (char *) (ptrstart), * &un); */ (*pptrbuff) += intervalsize; break; case SIZEBUFF: /* computation of sizebuff */ break; default: printf("action is %d outside the scope of the case [0..2] !! \n ", action); exit(0); break; }; /* switch (action) */ }/* intersect() */ /* scan_intervals: scans two distributions in one dimension, and compute the * intersections on the local processor. result must be long enough to * contains the result that are stocked in IDESC structure, the function * returns the number of intersections found */ int scan_intervals(type, ja, jb, n, ma, mb, q0, q1, col0, col1, result) char type; int ja, jb, n, q0, q1, col0, col1; MDESC *ma, *mb; IDESC *result; { int offset, j0, j1, templatewidth0, templatewidth1, nbcol0, nbcol1; int l; /* local indice on the beginning of the interval */ assert(type == 'c' || type == 'r'); nbcol0 = (type == 'c' ? ma->nbcol : ma->nbrow); nbcol1 = (type == 'c' ? mb->nbcol : mb->nbrow); templatewidth0 = q0 * nbcol0; templatewidth1 = q1 * nbcol1; { int sp0 = (type == 'c' ? ma->spcol : ma->sprow); int sp1 = (type == 'c' ? mb->spcol : mb->sprow); j0 = SHIFT(col0, sp0, q0) * nbcol0 - ja; j1 = SHIFT(col1, sp1, q1) * nbcol1 - jb; } offset = 0; l = 0; /* a small check to verify that the submatrix begin inside the first block * of the original matrix, this done by a sort of coordinate change at the * beginning of the Cpztrmr2d */ assert(j0 + nbcol0 > 0); assert(j1 + nbcol1 > 0); while ((j0 < n) && (j1 < n)) { int end0, end1; int start, end; end0 = j0 + nbcol0; end1 = j1 + nbcol1; if (end0 <= j1) { j0 += templatewidth0; l += nbcol0; continue; } if (end1 <= j0) { j1 += templatewidth1; continue; } /* compute the raw intersection */ start = max(j0, j1); start = max(start, 0); /* the start is correct now, update the corresponding fields */ result[offset].gstart = start; end = min(end0, end1); if (end0 == end) { j0 += templatewidth0; l += nbcol0; } if (end1 == end) j1 += templatewidth1; /* throw the limit if they go out of the matrix */ end = min(end, n); assert(end > start); /* it is a bit tricky to see why the length is always positive after all * this min and max, first we have the property that every interval * considered is at least partly into the submatrix, second we arrive * here only if the raw intersection is non-void, if we remove a limit * that means the corresponding frontier is in both intervals which * proove the final interval is non-void, clear ?? */ result[offset].len = end - start; offset += 1; } /* while */ return offset; } /*********************************************************************/ /* Do the scanning of intervals and the requested action */ void scanD0(uplo, diag, action, ptrbuff, ptrsizebuff, m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, v_inter, vinter_nb, h_inter, hinter_nb, ptrblock) int action, /* # of the action done on the intersected intervals */ *ptrsizebuff; /* size of the communication ptrbuffer (chosen to be * an output parameter in every cases) */ dcomplex *ptrbuff /* address of the communication ptrbuffer (a * suffisant memory space is supposed to be allocated before the call) */ , *ptrblock; int p0, q0, p1, q1; IDESC *v_inter, *h_inter; int vinter_nb, hinter_nb; int m, n; int ia, ja, ib, jb; MDESC *ma, *mb; char *uplo, *diag; {/* Rmk: the a+au type addresses are strict bounds as a+au does not belong to * the [a..a+au-1] interval of length au */ int templateheight1, templatewidth1; int templateheight0, templatewidth0; int h, v; /* for scanning the intervals */ /* initializations */ templateheight1 = p1 * mb->nbrow; templateheight0 = p0 * ma->nbrow; templatewidth1 = q1 * mb->nbcol; templatewidth0 = q0 * ma->nbcol; /* we now will deal will logical grids, that's to say we change our * numbering of processors so that (0,0) begin on logical processor (0,0) */ /* in case we will not enter the while loop */ (*ptrsizebuff) = 0; for (h = 0; h < hinter_nb; h++) for (v = 0; v < vinter_nb; v++) { int j; for (j = 0; j < h_inter[h].len; j++) intersect(uplo, diag, j + h_inter[h].gstart, v_inter[v].gstart, v_inter[v].gstart + v_inter[v].len, action, ptrsizebuff, &ptrbuff, ptrblock, m, n, ma, ia, ja, templateheight0, templatewidth0, mb, ib, jb, templateheight1, templatewidth1); } }/* scanD0() */ scalapack-2.0.2/REDIST/SRC/redist.h000644 000766 000024 00000000166 10363532303 016776 0ustar00juliestaff000000 000000 #ifdef T3D #define float double #endif #ifdef T3E #define float double #endif #ifdef CRAY #define float double #endif scalapack-2.0.2/PBLAS/CMakeLists.txt000644 000766 000024 00000000111 11656312637 017315 0ustar00juliestaff000000 000000 add_subdirectory(SRC) add_subdirectory(TESTING) add_subdirectory(TIMING) scalapack-2.0.2/PBLAS/SRC/000755 000766 000024 00000000000 11750301612 015175 5ustar00juliestaff000000 000000 scalapack-2.0.2/PBLAS/TESTING/000755 000766 000024 00000000000 11750301616 015667 5ustar00juliestaff000000 000000 scalapack-2.0.2/PBLAS/TIMING/000755 000766 000024 00000000000 11750301613 015536 5ustar00juliestaff000000 000000 scalapack-2.0.2/PBLAS/TIMING/CMakeLists.txt000644 000766 000024 00000007252 11656312637 020321 0ustar00juliestaff000000 000000 file(COPY ../SRC/PTOOLS/PB_Cwarn.c DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}) file(COPY ../SRC/PTOOLS/PB_Cabort.c DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}) set (PblasErrorHandler PB_Cwarn.c PB_Cabort.c) set (pbtcom pblastim.f ${PblasErrorHandler}) set (spbtcom psblastim.f ${pbtcom}) set (dpbtcom pdblastim.f ${pbtcom}) set (cpbtcom pcblastim.f ${pbtcom}) set (zpbtcom pzblastim.f ${pbtcom}) set_property( SOURCE ${PblasErrorHandler} APPEND PROPERTY COMPILE_DEFINITIONS TestingPblas ) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/PBLAS/TIMING) file(COPY PCBLAS1TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PCBLAS2TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PCBLAS3TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PDBLAS1TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PDBLAS2TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PDBLAS3TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PSBLAS1TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PSBLAS2TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PSBLAS3TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PZBLAS1TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PZBLAS2TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PZBLAS3TIM.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) add_executable(spb1tim psblas1tim.f ${spbtcom}) add_executable(dpb1tim pdblas1tim.f ${dpbtcom}) add_executable(cpb1tim pcblas1tim.f ${cpbtcom}) add_executable(zpb1tim pzblas1tim.f ${zpbtcom}) add_executable(spb2tim psblas2tim.f ${spbtcom}) add_executable(dpb2tim pdblas2tim.f ${dpbtcom}) add_executable(cpb2tim pcblas2tim.f ${cpbtcom}) add_executable(zpb2tim pzblas2tim.f ${zpbtcom}) add_executable(spb3tim psblas3tim.f ${spbtcom}) add_executable(dpb3tim pdblas3tim.f ${dpbtcom}) add_executable(cpb3tim pcblas3tim.f ${cpbtcom}) add_executable(zpb3tim pzblas3tim.f ${zpbtcom}) target_link_libraries(spb1tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(dpb1tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(cpb1tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(zpb1tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(spb2tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(dpb2tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(cpb2tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(zpb2tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(spb3tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(dpb3tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(cpb3tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(zpb3tim scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) add_test(spb1tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./spb1tim) add_test(dpb1tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./dpb1tim) add_test(cpb1tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./cpb1tim) add_test(zpb1tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./zpb1tim) add_test(spb2tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./spb2tim) add_test(dpb2tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./dpb2tim) add_test(cpb2tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./cpb2tim) add_test(zpb2tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./zpb2tim) add_test(spb3tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./spb3tim) add_test(dpb3tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./dpb3tim) add_test(cpb3tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./cpb3tim) add_test(zpb3tim ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./zpb3tim) scalapack-2.0.2/PBLAS/TIMING/Makefile000644 000766 000024 00000007626 11707553632 017225 0ustar00juliestaff000000 000000 ############################################################################ # # Program: PBLAS -- (version 2.0) # # Module: Makefile # # Purpose: Timing Makefile # # Creation date: April 1, 1998 # # Modified: February 15, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../SLmake.inc sPBLAS1exe = xspblas1tim dPBLAS1exe = xdpblas1tim cPBLAS1exe = xcpblas1tim zPBLAS1exe = xzpblas1tim sPBLAS2exe = xspblas2tim dPBLAS2exe = xdpblas2tim cPBLAS2exe = xcpblas2tim zPBLAS2exe = xzpblas2tim sPBLAS3exe = xspblas3tim dPBLAS3exe = xdpblas3tim cPBLAS3exe = xcpblas3tim zPBLAS3exe = xzpblas3tim spb1tim = $(sPBLAS1exe) dpb1tim = $(dPBLAS1exe) cpb1tim = $(cPBLAS1exe) zpb1tim = $(zPBLAS1exe) spb2tim = $(sPBLAS2exe) dpb2tim = $(dPBLAS2exe) cpb2tim = $(cPBLAS2exe) zpb2tim = $(zPBLAS2exe) spb3tim = $(sPBLAS3exe) dpb3tim = $(dPBLAS3exe) cpb3tim = $(cPBLAS3exe) zpb3tim = $(zPBLAS3exe) pbtcom = pblastim.o PB_Cwarn.o PB_Cabort.o spb1t = psblas1tim.o psblastim.o $(pbtcom) dpb1t = pdblas1tim.o pdblastim.o $(pbtcom) cpb1t = pcblas1tim.o pcblastim.o $(pbtcom) zpb1t = pzblas1tim.o pzblastim.o $(pbtcom) spb2t = psblas2tim.o psblastim.o $(pbtcom) dpb2t = pdblas2tim.o pdblastim.o $(pbtcom) cpb2t = pcblas2tim.o pcblastim.o $(pbtcom) zpb2t = pzblas2tim.o pzblastim.o $(pbtcom) spb3t = psblas3tim.o psblastim.o $(pbtcom) dpb3t = pdblas3tim.o pdblastim.o $(pbtcom) cpb3t = pcblas3tim.o pcblastim.o $(pbtcom) zpb3t = pzblas3tim.o pzblastim.o $(pbtcom) all : single double complex complex16 single: PblasErrorHandler $(spb1tim) $(spb2tim) $(spb3tim) double: PblasErrorHandler $(dpb1tim) $(dpb2tim) $(dpb3tim) complex: PblasErrorHandler $(cpb1tim) $(cpb2tim) $(cpb3tim) complex16: PblasErrorHandler $(zpb1tim) $(zpb2tim) $(zpb3tim) PB_Cwarn.o: $(CC) -c $(CDEFS) $(CCFLAGS) -DTestingPblas -I../SRC -o PB_Cwarn.o ../SRC/PTOOLS/PB_Cwarn.c PB_Cabort.o: $(CC) -c $(CDEFS) $(CCFLAGS) -DTestingPblas -I../SRC -o PB_Cabort.o ../SRC/PTOOLS/PB_Cabort.c PblasErrorHandler: PB_Cwarn.o PB_Cabort.o $(spb1tim) : ../../$(SCALAPACKLIB) $(spb1t) $(FCLOADER) $(FCLOADFLAGS) -o $(spb1tim) $(spb1t) ../../$(SCALAPACKLIB) $(LIBS) $(dpb1tim) : ../../$(SCALAPACKLIB) $(dpb1t) $(FCLOADER) $(FCLOADFLAGS) -o $(dpb1tim) $(dpb1t) ../../$(SCALAPACKLIB) $(LIBS) $(cpb1tim) : ../../$(SCALAPACKLIB) $(cpb1t) $(FCLOADER) $(FCLOADFLAGS) -o $(cpb1tim) $(cpb1t) ../../$(SCALAPACKLIB) $(LIBS) $(zpb1tim) : ../../$(SCALAPACKLIB) $(zpb1t) $(FCLOADER) $(FCLOADFLAGS) -o $(zpb1tim) $(zpb1t) ../../$(SCALAPACKLIB) $(LIBS) $(spb2tim) : ../../$(SCALAPACKLIB) $(spb2t) $(FCLOADER) $(FCLOADFLAGS) -o $(spb2tim) $(spb2t) ../../$(SCALAPACKLIB) $(LIBS) $(dpb2tim) : ../../$(SCALAPACKLIB) $(dpb2t) $(FCLOADER) $(FCLOADFLAGS) -o $(dpb2tim) $(dpb2t) ../../$(SCALAPACKLIB) $(LIBS) $(cpb2tim) : ../../$(SCALAPACKLIB) $(cpb2t) $(FCLOADER) $(FCLOADFLAGS) -o $(cpb2tim) $(cpb2t) ../../$(SCALAPACKLIB) $(LIBS) $(zpb2tim) : ../../$(SCALAPACKLIB) $(zpb2t) $(FCLOADER) $(FCLOADFLAGS) -o $(zpb2tim) $(zpb2t) ../../$(SCALAPACKLIB) $(LIBS) $(spb3tim) : ../../$(SCALAPACKLIB) $(spb3t) $(FCLOADER) $(FCLOADFLAGS) -o $(spb3tim) $(spb3t) ../../$(SCALAPACKLIB) $(LIBS) $(dpb3tim) : ../../$(SCALAPACKLIB) $(dpb3t) $(FCLOADER) $(FCLOADFLAGS) -o $(dpb3tim) $(dpb3t) ../../$(SCALAPACKLIB) $(LIBS) $(cpb3tim) : ../../$(SCALAPACKLIB) $(cpb3t) $(FCLOADER) $(FCLOADFLAGS) -o $(cpb3tim) $(cpb3t) ../../$(SCALAPACKLIB) $(LIBS) $(zpb3tim) : ../../$(SCALAPACKLIB) $(zpb3t) $(FCLOADER) $(FCLOADFLAGS) -o $(zpb3tim) $(zpb3t) ../../$(SCALAPACKLIB) $(LIBS) clean : rm -f *.o x* .f.o : ; $(FC) -c $(FCFLAGS) $*.f .c.o : ; $(CC) -c $(CDEFS) $(CCFLAGS) -DTestingPblas $*.c scalapack-2.0.2/PBLAS/TIMING/pblastim.f000644 000766 000024 00000571674 10363532303 017545 0ustar00juliestaff000000 000000 SUBROUTINE PVDIMCHK( ICTXT, NOUT, N, MATRIX, IX, JX, DESCX, INCX, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 MATRIX INTEGER ICTXT, INCX, INFO, IX, JX, N, NOUT * .. * .. Array Arguments .. INTEGER DESCX( * ) * .. * * Purpose * ======= * * PVDIMCHK checks the validity of the input test dimensions. In case of * an invalid parameter or discrepancy between the parameters, this rou- * tine displays error messages and returns an non-zero error code in * INFO. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * MATRIX (global input) CHARACTER*1 * On entry, MATRIX specifies the one character matrix identi- * fier. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, when INFO is zero, no error has been detected, * otherwise an error has been detected. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. Executable Statements .. * INFO = 0 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.LT.0 ) THEN INFO = 1 ELSE IF( N.EQ.0 ) THEN IF( DESCX( M_ ).LT.0 ) $ INFO = 1 IF( DESCX( N_ ).LT.0 ) $ INFO = 1 ELSE IF( INCX.EQ.DESCX( M_ ) .AND. $ DESCX( N_ ).LT.( JX+N-1 ) ) THEN INFO = 1 ELSE IF( INCX.EQ.1 .AND. INCX.NE.DESCX( M_ ) .AND. $ DESCX( M_ ).LT.( IX+N-1 ) ) THEN INFO = 1 ELSE IF( IX.GT.DESCX( M_ ) ) THEN INFO = 1 ELSE IF( JX.GT.DESCX( N_ ) ) THEN INFO = 1 END IF END IF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) MATRIX WRITE( NOUT, FMT = 9998 ) N, MATRIX, IX, MATRIX, JX, MATRIX, $ INCX WRITE( NOUT, FMT = 9997 ) MATRIX, DESCX( M_ ), MATRIX, $ DESCX( N_ ) WRITE( NOUT, FMT = * ) END IF END IF * 9999 FORMAT( 'Incompatible arguments for matrix ', A1, ':' ) 9998 FORMAT( 'N = ', I6, ', I', A1, ' = ', I6, ', J', A1, ' = ', $ I6, ',INC', A1, ' = ', I6 ) 9997 FORMAT( 'DESC', A1, '( M_ ) = ', I6, ', DESC', A1, '( N_ ) = ', $ I6, '.' ) * RETURN * * End of PVDIMCHK * END SUBROUTINE PMDIMCHK( ICTXT, NOUT, M, N, MATRIX, IA, JA, DESCA, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 MATRIX INTEGER ICTXT, INFO, IA, JA, M, N, NOUT * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PMDIMCHK checks the validity of the input test dimensions. In case of * an invalid parameter or discrepancy between the parameters, this rou- * tine displays error messages and returns an non-zero error code in * INFO. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * MATRIX (global input) CHARACTER*1 * On entry, MATRIX specifies the one character matrix identi- * fier. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, when INFO is zero, no error has been detected, * otherwise an error has been detected. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. Executable Statements .. * INFO = 0 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( ( M.LT.0 ).OR.( N.LT.0 ) ) THEN INFO = 1 ELSE IF( ( M.EQ.0 ).OR.( N.EQ.0 ) )THEN IF( DESCA( M_ ).LT.0 ) $ INFO = 1 IF( DESCA( N_ ).LT.0 ) $ INFO = 1 ELSE IF( DESCA( M_ ).LT.( IA+M-1 ) ) $ INFO = 1 IF( DESCA( N_ ).LT.( JA+N-1 ) ) $ INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) MATRIX WRITE( NOUT, FMT = 9998 ) M, N, MATRIX, IA, MATRIX, JA WRITE( NOUT, FMT = 9997 ) MATRIX, DESCA( M_ ), MATRIX, $ DESCA( N_ ) WRITE( NOUT, FMT = * ) END IF END IF * 9999 FORMAT( 'Incompatible arguments for matrix ', A1, ':' ) 9998 FORMAT( 'M = ', I6, ', N = ', I6, ', I', A1, ' = ', I6, $ ', J', A1, ' = ', I6 ) 9997 FORMAT( 'DESC', A1, '( M_ ) = ', I6, ', DESC', A1, '( N_ ) = ', $ I6, '.' ) * RETURN * * End of PMDIMCHK * END SUBROUTINE PVDESCCHK( ICTXT, NOUT, MATRIX, DESCX, DTX, MX, NX, $ IMBX, INBX, MBX, NBX, RSRCX, CSRCX, INCX, $ MPX, NQX, IPREX, IMIDX, IPOSTX, IGAP, $ GAPMUL, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 MATRIX INTEGER CSRCX, DTX, GAPMUL, ICTXT, IGAP, IMBX, IMIDX, $ INBX, INCX, INFO, IPOSTX, IPREX, MBX, MPX, MX, $ NBX, NOUT, NQX, NX, RSRCX * .. * .. Array Arguments .. INTEGER DESCX( * ) * .. * * Purpose * ======= * * PVDESCCHK checks the validity of the input test parameters and ini- * tializes the descriptor DESCX and the scalar variables MPX, NQX. In * case of an invalid parameter, this routine displays error messages * and return an non-zero error code in INFO. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * MATRIX (global input) CHARACTER*1 * On entry, MATRIX specifies the one character matrix identi- * fier. * * DESCX (global output) INTEGER array * On entry, DESCX is an array of dimension DLEN_. DESCX is the * array descriptor to be set. * * DTYPEX (global input) INTEGER * On entry, DTYPEX specifies the descriptor type. In this ver- * sion, DTYPEX must be BLOCK_CYCLIC_INB_2D. * * MX (global input) INTEGER * On entry, MX specifies the number of rows in the matrix. MX * must be at least zero. * * NX (global input) INTEGER * On entry, NX specifies the number of columns in the matrix. * NX must be at least zero. * * IMBX (global input) INTEGER * On entry, IMBX specifies the row blocking factor used to dis- * tribute the first IMBX rows of the matrix. IMBX must be at * least one. * * INBX (global input) INTEGER * On entry, INBX specifies the column blocking factor used to * distribute the first INBX columns of the matrix. INBX must * be at least one. * * MBX (global input) INTEGER * On entry, MBX specifies the row blocking factor used to dis- * tribute the rows of the matrix. MBX must be at least one. * * NBX (global input) INTEGER * On entry, NBX specifies the column blocking factor used to * distribute the columns of the matrix. NBX must be at least * one. * * RSRCX (global input) INTEGER * On entry, RSRCX specifies the process row in which the first * row of the matrix resides. When RSRCX is -1, the matrix is * row replicated, otherwise RSCRX must be at least zero and * strictly less than NPROW. * * CSRCX (global input) INTEGER * On entry, CSRCX specifies the process column in which the * first column of the matrix resides. When CSRCX is -1, the * matrix is column replicated, otherwise CSCRX must be at least * zero and strictly less than NPCOL. * * INCX (global input) INTEGER * On entry, INCX specifies the global vector increment. INCX * must be one or MX. * * MPX (local output) INTEGER * On exit, MPX is Lr( 1, MX ). * * NQX (local output) INTEGER * On exit, NQX is Lc( 1, NX ). * * IPREX (local output) INTEGER * On exit, IPREX specifies the size of the guard zone to put * before the start of the local padded array. * * IMIDX (local output) INTEGER * On exit, IMIDX specifies the ldx-gap of the guard zone to * put after each column of the local padded array. * * IPOSTX (local output) INTEGER * On exit, IPOSTX specifies the size of the guard zone to put * after the local padded array. * * IGAP (global input) INTEGER * On entry, IGAP specifies the size of the ldx-gap. * * GAPMUL (global input) INTEGER * On entry, GAPMUL is a constant factor controlling the size * of the pre- and post guardzone. * * INFO (global output) INTEGER * On exit, when INFO is zero, no error has been detected, * otherwise an error has been detected. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER LLDX, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_DESCINIT2 * .. * .. External Functions .. INTEGER PB_NUMROC EXTERNAL PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * INFO = 0 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Verify descriptor type DTYPE_ * IF( DTX.NE.BLOCK_CYCLIC_2D_INB ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) MATRIX, 'DTYPE', MATRIX, DTX, $ BLOCK_CYCLIC_2D_INB INFO = 1 END IF * * Verify global matrix dimensions (M_,N_) are correct * IF( MX.LT.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) MATRIX, 'M', MATRIX, MX INFO = 1 ELSE IF( NX.LT.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) MATRIX, 'N', MATRIX, NX INFO = 1 END IF * * Verify if blocking factors (IMB_, INB_) are correct * IF( IMBX.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) MATRIX, 'IMB', MATRIX, IMBX INFO = 1 ELSE IF( INBX.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) MATRIX, 'INB', MATRIX, INBX INFO = 1 END IF * * Verify if blocking factors (MB_, NB_) are correct * IF( MBX.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) MATRIX, 'MB', MATRIX, MBX INFO = 1 ELSE IF( NBX.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) MATRIX, 'NB', MATRIX, NBX INFO = 1 END IF * * Verify if origin process coordinates (RSRC_, CSRC_) are valid * IF( RSRCX.LT.-1 .OR. RSRCX.GE.NPROW ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) MATRIX WRITE( NOUT, FMT = 9990 ) 'RSRC', MATRIX, RSRCX, NPROW END IF INFO = 1 ELSE IF( CSRCX.LT.-1 .OR. CSRCX.GE.NPCOL ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9991 ) MATRIX WRITE( NOUT, FMT = 9990 ) 'CSRC', MATRIX, CSRCX, NPCOL END IF INFO = 1 END IF * * Check input increment value * IF( INCX.NE.1 .AND. INCX.NE.MX ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) MATRIX WRITE( NOUT, FMT = 9988 ) 'INC', MATRIX, INCX, MATRIX, MX END IF INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9987 ) MATRIX WRITE( NOUT, FMT = * ) END IF * ELSE * * Compute local testing leading dimension * MPX = PB_NUMROC( MX, 1, IMBX, MBX, MYROW, RSRCX, NPROW ) NQX = PB_NUMROC( NX, 1, INBX, NBX, MYCOL, CSRCX, NPCOL ) IPREX = MAX( GAPMUL*NBX, MPX ) IMIDX = IGAP IPOSTX = MAX( GAPMUL*NBX, NQX ) LLDX = MAX( 1, MPX ) + IMIDX * CALL PB_DESCINIT2( DESCX, MX, NX, IMBX, INBX, MBX, NBX, RSRCX, $ CSRCX, ICTXT, LLDX, INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9987 ) MATRIX WRITE( NOUT, FMT = * ) END IF END IF * END IF * 9999 FORMAT( 2X, '>> Invalid matrix ', A1, ' descriptor type ', A5, A1, $ ': ', I6, ' should be ', I3, '.' ) 9998 FORMAT( 2X, '>> Invalid matrix ', A1, ' row dimension ', A1, A1, $ ': ', I6, ' should be at least 1.' ) 9997 FORMAT( 2X, '>> Invalid matrix ', A1, ' column dimension ', A1, $ A1, ': ', I6, ' should be at least 1.' ) 9996 FORMAT( 2X, '>> Invalid matrix ', A1, ' first row block size ', $ A3, A1, ': ', I6, ' should be at least 1.' ) 9995 FORMAT( 2X, '>> Invalid matrix ', A1, ' first column block size ', $ A3, A1,': ', I6, ' should be at least 1.' ) 9994 FORMAT( 2X, '>> Invalid matrix ', A1, ' row block size ', A2, A1, $ ': ', I6, ' should be at least 1.' ) 9993 FORMAT( 2X, '>> Invalid matrix ', A1, ' column block size ', A2, $ A1,': ', I6, ' should be at least 1.' ) 9992 FORMAT( 2X, '>> Invalid matrix ', A1, ' row process source:' ) 9991 FORMAT( 2X, '>> Invalid matrix ', A1, ' column process source:' ) 9990 FORMAT( 2X, '>> ', A4, A1, '= ', I6, ' should be >= -1 and < ', $ I6, '.' ) 9989 FORMAT( 2X, '>> Invalid vector ', A1, ' increment:' ) 9988 FORMAT( 2X, '>> ', A3, A1, '= ', I6, ' should be 1 or M', A1, $ ' = ', I6, '.' ) 9987 FORMAT( 2X, '>> Invalid matrix ', A1, ' descriptor: going on to ', $ 'next test case.' ) * RETURN * * End of PVDESCCHK * END SUBROUTINE PMDESCCHK( ICTXT, NOUT, MATRIX, DESCA, DTA, MA, NA, $ IMBA, INBA, MBA, NBA, RSRCA, CSRCA, MPA, $ NQA, IPREA, IMIDA, IPOSTA, IGAP, GAPMUL, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 MATRIX INTEGER CSRCA, DTA, GAPMUL, ICTXT, IGAP, IMBA, IMIDA, $ INBA, INFO, IPOSTA, IPREA, MA, MBA, MPA, NA, $ NBA, NOUT, NQA, RSRCA * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PMDESCCHK checks the validity of the input test parameters and ini- * tializes the descriptor DESCA and the scalar variables MPA, NQA. In * case of an invalid parameter, this routine displays error messages * and return an non-zero error code in INFO. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * MATRIX (global input) CHARACTER*1 * On entry, MATRIX specifies the one character matrix identi- * fier. * * DESCA (global output) INTEGER array * On entry, DESCA is an array of dimension DLEN_. DESCA is the * array descriptor to be set. * * DTYPEA (global input) INTEGER * On entry, DTYPEA specifies the descriptor type. In this ver- * sion, DTYPEA must be BLOCK_CYCLIC_INB_2D. * * MA (global input) INTEGER * On entry, MA specifies the number of rows in the matrix. MA * must be at least zero. * * NA (global input) INTEGER * On entry, NA specifies the number of columns in the matrix. * NA must be at least zero. * * IMBA (global input) INTEGER * On entry, IMBA specifies the row blocking factor used to dis- * tribute the first IMBA rows of the matrix. IMBA must be at * least one. * * INBA (global input) INTEGER * On entry, INBA specifies the column blocking factor used to * distribute the first INBA columns of the matrix. INBA must * be at least one. * * MBA (global input) INTEGER * On entry, MBA specifies the row blocking factor used to dis- * tribute the rows of the matrix. MBA must be at least one. * * NBA (global input) INTEGER * On entry, NBA specifies the column blocking factor used to * distribute the columns of the matrix. NBA must be at least * one. * * RSRCA (global input) INTEGER * On entry, RSRCA specifies the process row in which the first * row of the matrix resides. When RSRCA is -1, the matrix is * row replicated, otherwise RSCRA must be at least zero and * strictly less than NPROW. * * CSRCA (global input) INTEGER * On entry, CSRCA specifies the process column in which the * first column of the matrix resides. When CSRCA is -1, the * matrix is column replicated, otherwise CSCRA must be at least * zero and strictly less than NPCOL. * * MPA (local output) INTEGER * On exit, MPA is Lr( 1, MA ). * * NQA (local output) INTEGER * On exit, NQA is Lc( 1, NA ). * * IPREA (local output) INTEGER * On exit, IPREA specifies the size of the guard zone to put * before the start of the local padded array. * * IMIDA (local output) INTEGER * On exit, IMIDA specifies the lda-gap of the guard zone to * put after each column of the local padded array. * * IPOSTA (local output) INTEGER * On exit, IPOSTA specifies the size of the guard zone to put * after the local padded array. * * IGAP (global input) INTEGER * On entry, IGAP specifies the size of the lda-gap. * * GAPMUL (global input) INTEGER * On entry, GAPMUL is a constant factor controlling the size * of the pre- and post guardzone. * * INFO (global output) INTEGER * On exit, when INFO is zero, no error has been detected, * otherwise an error has been detected. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER LLDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_DESCINIT2 * .. * .. External Functions .. INTEGER PB_NUMROC EXTERNAL PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * INFO = 0 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Verify descriptor type DTYPE_ * IF( DTA.NE.BLOCK_CYCLIC_2D_INB ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) MATRIX, 'DTYPE', MATRIX, DTA, $ BLOCK_CYCLIC_2D_INB INFO = 1 END IF * * Verify global matrix dimensions (M_,N_) are correct * IF( MA.LT.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) MATRIX, 'M', MATRIX, MA INFO = 1 ELSE IF( NA.LT.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) MATRIX, 'N', MATRIX, NA INFO = 1 END IF * * Verify if blocking factors (IMB_, INB_) are correct * IF( IMBA.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) MATRIX, 'IMB', MATRIX, IMBA INFO = 1 ELSE IF( INBA.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) MATRIX, 'INB', MATRIX, INBA INFO = 1 END IF * * Verify if blocking factors (MB_, NB_) are correct * IF( MBA.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) MATRIX, 'MB', MATRIX, MBA INFO = 1 ELSE IF( NBA.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) MATRIX, 'NB', MATRIX, NBA INFO = 1 END IF * * Verify if origin process coordinates (RSRC_, CSRC_) are valid * IF( RSRCA.LT.-1 .OR. RSRCA.GE.NPROW ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) MATRIX WRITE( NOUT, FMT = 9990 ) 'RSRC', MATRIX, RSRCA, NPROW END IF INFO = 1 ELSE IF( CSRCA.LT.-1 .OR. CSRCA.GE.NPCOL ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9991 ) MATRIX WRITE( NOUT, FMT = 9990 ) 'CSRC', MATRIX, CSRCA, NPCOL END IF INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) MATRIX WRITE( NOUT, FMT = * ) END IF * ELSE * * Compute local testing leading dimension * MPA = PB_NUMROC( MA, 1, IMBA, MBA, MYROW, RSRCA, NPROW ) NQA = PB_NUMROC( NA, 1, INBA, NBA, MYCOL, CSRCA, NPCOL ) IPREA = MAX( GAPMUL*NBA, MPA ) IMIDA = IGAP IPOSTA = MAX( GAPMUL*NBA, NQA ) LLDA = MAX( 1, MPA ) + IMIDA * CALL PB_DESCINIT2( DESCA, MA, NA, IMBA, INBA, MBA, NBA, RSRCA, $ CSRCA, ICTXT, LLDA, INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) MATRIX WRITE( NOUT, FMT = * ) END IF END IF * END IF * 9999 FORMAT( 2X, '>> Invalid matrix ', A1, ' descriptor type ', A5, A1, $ ': ', I6, ' should be ', I3, '.' ) 9998 FORMAT( 2X, '>> Invalid matrix ', A1, ' row dimension ', A1, A1, $ ': ', I6, ' should be at least 1.' ) 9997 FORMAT( 2X, '>> Invalid matrix ', A1, ' column dimension ', A1, $ A1, ': ', I6, ' should be at least 1.' ) 9996 FORMAT( 2X, '>> Invalid matrix ', A1, ' first row block size ', $ A3, A1, ': ', I6, ' should be at least 1.' ) 9995 FORMAT( 2X, '>> Invalid matrix ', A1, ' first column block size ', $ A3, A1,': ', I6, ' should be at least 1.' ) 9994 FORMAT( 2X, '>> Invalid matrix ', A1, ' row block size ', A2, A1, $ ': ', I6, ' should be at least 1.' ) 9993 FORMAT( 2X, '>> Invalid matrix ', A1, ' column block size ', A2, $ A1,': ', I6, ' should be at least 1.' ) 9992 FORMAT( 2X, '>> Invalid matrix ', A1, ' row process source:' ) 9991 FORMAT( 2X, '>> Invalid matrix ', A1, ' column process source:' ) 9990 FORMAT( 2X, '>> ', A4, A1, '= ', I6, ' should be >= -1 and < ', $ I6, '.' ) 9989 FORMAT( 2X, '>> Invalid matrix ', A1, ' descriptor: going on to ', $ 'next test case.' ) * RETURN * * End of PMDESCCHK * END DOUBLE PRECISION FUNCTION PDOPBL2( SUBNAM, M, N, KKL, KKU ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*7 SUBNAM INTEGER KKL, KKU, M, N * .. * * Purpose * ======= * * PDOPBL2 computes an approximation of the number of floating point * operations performed by a subroutine SUBNAM with the given values of * the parameters M, N, KL, and KU. * * This version counts operations for the Level 2 PBLAS. * * Arguments * ========= * * SUBNAM (input) CHARACTER*7 * On entry, SUBNAM specifies the name of the subroutine. * * M (input) INTEGER * On entry, M specifies the number of rows of the coefficient * matrix. M must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the coeffi- * cient matrix. If the matrix is square (such as in a solve * routine) then N is the number of right hand sides. N must be * at least zero. * * KKL (input) INTEGER * On entry, KKL specifies the lower band width of the coeffi- * cient matrix. KL is set to max( 0, min( M-1, KKL ) ). * * KKU (input) INTEGER * On entry, KKU specifies the upper band width of the coeffi- * cient matrix. KU is set to max( 0, min( N-1, KKU ) ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, SIX, TWO, ZERO PARAMETER ( ONE = 1.0D+0, SIX = 6.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER*1 C1 CHARACTER*2 C2 CHARACTER*3 C3 DOUBLE PRECISION ADDS, EK, EM, EN, KL, KU, MULTS * .. * .. External Functions .. LOGICAL LSAME, LSAMEN EXTERNAL LSAME, LSAMEN * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. .NOT.( LSAMEN( 2, SUBNAM, 'PS' ) .OR. $ LSAMEN( 2, SUBNAM, 'PD' ) .OR. $ LSAMEN( 2, SUBNAM, 'PC' ) .OR. LSAMEN( 2, SUBNAM, 'PZ' ) ) ) $ THEN PDOPBL2 = ZERO RETURN END IF * C1 = SUBNAM( 2: 2 ) C2 = SUBNAM( 3: 4 ) C3 = SUBNAM( 5: 7 ) MULTS = ZERO ADDS = ZERO KL = MAX( 0, MIN( M-1, KKL ) ) KU = MAX( 0, MIN( N-1, KKU ) ) EM = DBLE( M ) EN = DBLE( N ) EK = DBLE( KL ) * * ------------------------------- * Matrix-vector multiply routines * ------------------------------- * IF( LSAMEN( 3, C3, 'MV ' ) ) THEN * IF( LSAMEN( 2, C2, 'GE' ) ) THEN * MULTS = EM * ( EN + ONE ) ADDS = EM * EN * * Assume M <= N + KL and KL < M * N <= M + KU and KU < N * so that the zero sections are triangles. * ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN * MULTS = EM * ( EN + ONE ) - $ ( EM - ONE - KL ) * ( EM - KL ) / TWO - $ ( EN - ONE - KU ) * ( EN - KU ) / TWO ADDS = EM * ( EN + ONE ) - $ ( EM - ONE - KL ) * ( EM - KL ) / TWO - $ ( EN - ONE - KU ) * ( EN - KU ) / TWO * ELSE IF( LSAMEN( 2, C2, 'SY' ) .OR. LSAMEN( 2, C2, 'SP' ) .OR. $ LSAMEN( 2, C2, 'HE' ) .OR. LSAMEN( 2, C2, 'HP' ) ) $ THEN * MULTS = EM * ( EM + ONE ) ADDS = EM * EM * ELSE IF( LSAMEN( 2, C2, 'SB' ) .OR. $ LSAMEN( 2, C2, 'HB' ) ) THEN * MULTS = EM * ( EM + ONE ) - ( EM - ONE - EK ) * ( EM - EK ) ADDS = EM * EM - ( EM - ONE - EK ) * ( EM - EK ) * ELSE IF( LSAMEN( 2, C2, 'TR' ) .OR. LSAMEN( 2, C2, 'TP' ) ) $ THEN * MULTS = EM * ( EM + ONE ) / TWO ADDS = ( EM - ONE ) * EM / TWO * ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN * MULTS = EM * ( EM + ONE ) / TWO - $ ( EM - EK - ONE ) * ( EM - EK ) / TWO ADDS = ( EM - ONE ) * EM / TWO - $ ( EM - EK - ONE ) * ( EM - EK ) / TWO * END IF * * --------------------- * Matrix solve routines * --------------------- * ELSE IF( LSAMEN( 3, C3, 'SV ' ) ) THEN * IF( LSAMEN( 2, C2, 'TR' ) .OR. LSAMEN( 2, C2, 'TP' ) ) THEN * MULTS = EM * ( EM + ONE ) / TWO ADDS = ( EM - ONE ) * EM / TWO * ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN * MULTS = EM * ( EM + ONE ) / TWO - $ ( EM - EK - ONE ) * ( EM - EK ) / TWO ADDS = ( EM - ONE ) * EM / TWO - $ ( EM - EK - ONE ) * ( EM - EK ) / TWO * END IF * * ---------------- * Rank-one updates * ---------------- * ELSE IF( LSAMEN( 3, C3, 'R ' ) ) THEN * IF( LSAMEN( 2, C2, 'GE' ) ) THEN * MULTS = EM * EN + MIN( EM, EN ) ADDS = EM * EN * ELSE IF( LSAMEN( 2, C2, 'SY' ) .OR. LSAMEN( 2, C2, 'SP' ) .OR. $ LSAMEN( 2, C2, 'HE' ) .OR. LSAMEN( 2, C2, 'HP' ) ) $ THEN * MULTS = EM * ( EM + ONE ) / TWO + EM ADDS = EM * ( EM + ONE ) / TWO * END IF * ELSE IF( LSAMEN( 3, C3, 'RC ' ) .OR. LSAMEN( 3, C3, 'RU ' ) ) THEN * IF( LSAMEN( 2, C2, 'GE' ) ) THEN * MULTS = EM * EN + MIN( EM, EN ) ADDS = EM * EN * END IF * * ---------------- * Rank-two updates * ---------------- * ELSE IF( LSAMEN( 3, C3, 'R2 ' ) ) THEN IF( LSAMEN( 2, C2, 'SY' ) .OR. LSAMEN( 2, C2, 'SP' ) .OR. $ LSAMEN( 2, C2, 'HE' ) .OR. LSAMEN( 2, C2, 'HP' ) ) THEN * MULTS = EM * ( EM + ONE ) + TWO * EM ADDS = EM * ( EM + ONE ) * END IF END IF * * ------------------------------------------------ * Compute the total number of operations. * For real and double precision routines, count * 1 for each multiply and 1 for each add. * For complex and complex*16 routines, count * 6 for each multiply and 2 for each add. * ------------------------------------------------ * IF( LSAME( C1, 'S' ) .OR. LSAME( C1, 'D' ) ) THEN * PDOPBL2 = MULTS + ADDS * ELSE * PDOPBL2 = SIX * MULTS + TWO * ADDS * END IF * RETURN * * End of PDOPBL2 * END DOUBLE PRECISION FUNCTION PDOPBL3( SUBNAM, M, N, K ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*7 SUBNAM INTEGER K, M, N * .. * * Purpose * ======= * * PDOPBL3 computes an approximation of the number of floating point * operations performed by a subroutine SUBNAM with the given values of * the parameters M, N and K. * * This version counts operations for the Level 3 PBLAS. * * Arguments * ========= * * SUBNAM (input) CHARACTER*7 * On entry, SUBNAM specifies the name of the subroutine. * * M (input) INTEGER * N (input) INTEGER * K (input) INTEGER * On entry, M, N, and K contain parameter values used by the * Level 3 PBLAS. The output matrix is always M x N or N x N if * symmetric, but K has different uses in different contexts. * For example, in the matrix-matrix multiply routine, we have * C = A * B where C is M x N, A is M x K, and B is K x N. In * PxSYMM, PxHEMM, PxTRMM, and PxTRSM, K indicates whether the * matrix A is applied on the left or right. If K <= 0, the ma- * trix is applied on the left, and if K > 0, on the right. In * PxTRADD, K indicates whether the matrix C is upper or lower * triangular. If K <= 0, the matrix C is upper triangular, and * lower triangular otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, SIX, TWO, ZERO PARAMETER ( ONE = 1.0D+0, SIX = 6.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. CHARACTER*1 C1 CHARACTER*2 C2 CHARACTER*3 C3 DOUBLE PRECISION ADDS, EK, EM, EN, MULTS * .. * .. External Functions .. LOGICAL LSAME, LSAMEN EXTERNAL LSAME, LSAMEN * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. .NOT.( LSAMEN( 2, SUBNAM, 'PS' ) .OR. $ LSAMEN( 2, SUBNAM, 'PD' ) .OR. LSAMEN( 2, SUBNAM, 'PC' ) $ .OR. LSAMEN( 2, SUBNAM, 'PZ' ) ) ) $ THEN PDOPBL3 = ZERO RETURN END IF * C1 = SUBNAM( 2: 2 ) C2 = SUBNAM( 3: 4 ) C3 = SUBNAM( 5: 7 ) MULTS = ZERO ADDS = ZERO EM = DBLE( M ) EN = DBLE( N ) EK = DBLE( K ) * * ---------------------- * Matrix-matrix products * assume beta = 1 * ---------------------- * IF( LSAMEN( 3, C3, 'MM ' ) ) THEN * IF( LSAMEN( 2, C2, 'GE' ) ) THEN * MULTS = EM * EK * EN ADDS = EM * EK * EN * ELSE IF( LSAMEN( 2, C2, 'SY' ) .OR. $ LSAMEN( 2, C2, 'HE' ) ) THEN * * IF K <= 0, assume A multiplies B on the left. * IF( K.LE.0 ) THEN MULTS = EM * EM * EN ADDS = EM * EM * EN ELSE MULTS = EM * EN * EN ADDS = EM * EN * EN END IF * ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN * * IF K <= 0, assume A multiplies B on the left. * IF( K.LE.0 ) THEN MULTS = EN * EM * ( EM + ONE ) / TWO ADDS = EN * EM * ( EM - ONE ) / TWO ELSE MULTS = EM * EN * ( EN + ONE ) / TWO ADDS = EM * EN * ( EN - ONE ) / TWO END IF * END IF * * ------------------------------------------------ * Rank-K update of a symmetric or Hermitian matrix * ------------------------------------------------ * ELSE IF( LSAMEN( 3, C3, 'RK ' ) ) THEN * IF( LSAMEN( 2, C2, 'SY' ) .OR. $ LSAMEN( 2, C2, 'HE' ) ) THEN * MULTS = EK * EM *( EM + ONE ) / TWO ADDS = EK * EM *( EM + ONE ) / TWO END IF * * ------------------------------------------------- * Rank-2K update of a symmetric or Hermitian matrix * ------------------------------------------------- * ELSE IF( LSAMEN( 3, C3, 'R2K' ) ) THEN * IF( LSAMEN( 2, C2, 'SY' ) .OR. $ LSAMEN( 3, C2, 'HE' ) ) THEN * MULTS = EK * EM * EM ADDS = EK * EM * EM + EM END IF * * ----------------------------------------- * Solving system with many right hand sides * ----------------------------------------- * ELSE IF( LSAMEN( 4, SUBNAM( 3:6 ), 'TRSM' ) ) THEN * IF( K.LE.0 ) THEN MULTS = EN * EM * ( EM + ONE ) / TWO ADDS = EN * EM * ( EM - ONE ) / TWO ELSE MULTS = EM * EN * ( EN + ONE ) / TWO ADDS = EM * EN * ( EN - ONE ) / TWO END IF * * -------------------------- * Matrix (tranpose) Addition * -------------------------- * ELSE IF( LSAMEN( 3, C3, 'ADD' ) ) THEN * IF( LSAMEN( 2, C2, 'GE' ) ) THEN * MULTS = 2 * EM * EN ADDS = EM * EN * ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN * * IF K <= 0, assume C is upper triangular. * IF( K.LE.0 ) THEN IF( M.LE.N ) THEN MULTS = EM * ( TWO * EN - EM + ONE ) ADDS = EM * ( EM + ONE ) / TWO + EM * ( EN - EM ) ELSE MULTS = EN * ( EN + ONE ) ADDS = EN * ( EN + ONE ) / TWO END IF ELSE IF( M.GE.N ) THEN MULTS = EN * ( TWO * EM - EN + ONE ) ADDS = EN * ( EN + ONE ) / TWO + EN * ( EM - EN ) ELSE MULTS = EM * ( EM + ONE ) ADDS = EM * ( EM + ONE ) / TWO END IF END IF * END IF * END IF * * ------------------------------------------------ * Compute the total number of operations. * For real and double precision routines, count * 1 for each multiply and 1 for each add. * For complex and complex*16 routines, count * 6 for each multiply and 2 for each add. * ------------------------------------------------ * IF( LSAME( C1, 'S' ) .OR. LSAME( C1, 'D' ) ) THEN * PDOPBL3 = MULTS + ADDS * ELSE * PDOPBL3 = SIX * MULTS + TWO * ADDS * END IF * RETURN * * End of PDOPBL3 * END SUBROUTINE PXERBLA( ICTXT, SRNAME, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INFO * .. * .. Array Arguments .. CHARACTER*(*) SRNAME * .. * * Purpose * ======= * * PXERBLA is an error handler for the ScaLAPACK routines. It is called * by a ScaLAPACK routine if an input parameter has an invalid value. A * message is printed. Installers may consider modifying this routine in * order to call system-specific exception-handling facilities. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * SRNAME (global input) CHARACTER*(*) * On entry, SRNAME specifies the name of the routine which cal- * ling PXERBLA. * * INFO (global input) INTEGER * On entry, INFO specifies the position of the invalid parame- * ter in the parameter list of the calling routine. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * WRITE( *, FMT = 9999 ) MYROW, MYCOL, SRNAME, INFO * 9999 FORMAT( '{', I5, ',', I5, '}: On entry to ', A, $ ' parameter number ', I4, ' had an illegal value' ) * RETURN * * End of PXERBLA * END LOGICAL FUNCTION LSAME( CA, CB ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER CA, CB * .. * * Purpose * ======= * * LSAME returns .TRUE. if CA is the same letter as CB regardless of * case. * * Arguments * ========= * * CA (input) CHARACTER*1 * CB (input) CHARACTER*1 * CA and CB specify the single characters to be compared. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ICHAR * .. * .. Local Scalars .. INTEGER INTA, INTB, ZCODE * .. * .. Executable Statements .. * * Test if the characters are equal * LSAME = CA.EQ.CB IF( LSAME ) $ RETURN * * Now test for equivalence if both characters are alphabetic. * ZCODE = ICHAR( 'Z' ) * * Use 'Z' rather than 'A' so that ASCII can be detected on Prime * machines, on which ICHAR returns a value with bit 8 set. * ICHAR('A') on Prime machines returns 193 which is the same as * ICHAR('A') on an EBCDIC machine. * INTA = ICHAR( CA ) INTB = ICHAR( CB ) * IF( ZCODE.EQ.90 .OR. ZCODE.EQ.122 ) THEN * * ASCII is assumed - ZCODE is the ASCII code of either lower or * upper case 'Z'. * IF( INTA.GE.97 .AND. INTA.LE.122 ) INTA = INTA - 32 IF( INTB.GE.97 .AND. INTB.LE.122 ) INTB = INTB - 32 * ELSE IF( ZCODE.EQ.233 .OR. ZCODE.EQ.169 ) THEN * * EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or * upper case 'Z'. * IF( INTA.GE.129 .AND. INTA.LE.137 .OR. $ INTA.GE.145 .AND. INTA.LE.153 .OR. $ INTA.GE.162 .AND. INTA.LE.169 ) INTA = INTA + 64 IF( INTB.GE.129 .AND. INTB.LE.137 .OR. $ INTB.GE.145 .AND. INTB.LE.153 .OR. $ INTB.GE.162 .AND. INTB.LE.169 ) INTB = INTB + 64 * ELSE IF( ZCODE.EQ.218 .OR. ZCODE.EQ.250 ) THEN * * ASCII is assumed, on Prime machines - ZCODE is the ASCII code * plus 128 of either lower or upper case 'Z'. * IF( INTA.GE.225 .AND. INTA.LE.250 ) INTA = INTA - 32 IF( INTB.GE.225 .AND. INTB.LE.250 ) INTB = INTB - 32 END IF LSAME = INTA.EQ.INTB * * RETURN * * End of LSAME * END LOGICAL FUNCTION LSAMEN( N, CA, CB ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER*( * ) CA, CB INTEGER N * .. * * Purpose * ======= * * LSAMEN tests if the first N letters of CA are the same as the * first N letters of CB, regardless of case. * LSAMEN returns .TRUE. if CA and CB are equivalent except for case * and .FALSE. otherwise. LSAMEN also returns .FALSE. if LEN( CA ) * or LEN( CB ) is less than N. * * Arguments * ========= * * N (input) INTEGER * The number of characters in CA and CB to be compared. * * CA (input) CHARACTER*(*) * CB (input) CHARACTER*(*) * CA and CB specify two character strings of length at least N. * Only the first N characters of each string will be accessed. * * ===================================================================== * * .. Local Scalars .. INTEGER I * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC LEN * .. * .. Executable Statements .. * LSAMEN = .FALSE. IF( LEN( CA ).LT.N .OR. LEN( CB ).LT.N ) $ GO TO 20 * * Do for each character in the two strings. * DO 10 I = 1, N * * Test if the characters are equal using LSAME. * IF( .NOT.LSAME( CA( I: I ), CB( I: I ) ) ) $ GO TO 20 * 10 CONTINUE LSAMEN = .TRUE. * 20 CONTINUE RETURN * * End of LSAMEN * END SUBROUTINE ICOPY( N, SX, INCX, SY, INCY ) * * -- LAPACK auxiliary test routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * February 29, 1992 * * .. Scalar Arguments .. INTEGER INCX, INCY, N * .. * .. Array Arguments .. INTEGER SX( * ), SY( * ) * .. * * Purpose * ======= * * ICOPY copies an integer vector x to an integer vector y. * Uses unrolled loops for increments equal to 1. * * Arguments * ========= * * N (input) INTEGER * The length of the vectors SX and SY. * * SX (input) INTEGER array, dimension (1+(N-1)*abs(INCX)) * The vector X. * * INCX (input) INTEGER * The spacing between consecutive elements of SX. * * SY (output) INTEGER array, dimension (1+(N-1)*abs(INCY)) * The vector Y. * * INCY (input) INTEGER * The spacing between consecutive elements of SY. * * ===================================================================== * * .. Local Scalars .. INTEGER I, IX, IY, M, MP1 * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * IF( N.LE.0 ) $ RETURN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) $ GO TO 20 * * Code for unequal increments or equal increments not equal to 1 * IX = 1 IY = 1 IF( INCX.LT.0 ) $ IX = ( -N+1 )*INCX + 1 IF( INCY.LT.0 ) $ IY = ( -N+1 )*INCY + 1 DO 10 I = 1, N SY( IY ) = SX( IX ) IX = IX + INCX IY = IY + INCY 10 CONTINUE RETURN * * Code for both increments equal to 1 * * Clean-up loop * 20 CONTINUE M = MOD( N, 7 ) IF( M.EQ.0 ) $ GO TO 40 DO 30 I = 1, M SY( I ) = SX( I ) 30 CONTINUE IF( N.LT.7 ) $ RETURN 40 CONTINUE MP1 = M + 1 DO 50 I = MP1, N, 7 SY( I ) = SX( I ) SY( I+1 ) = SX( I+1 ) SY( I+2 ) = SX( I+2 ) SY( I+3 ) = SX( I+3 ) SY( I+4 ) = SX( I+4 ) SY( I+5 ) = SX( I+5 ) SY( I+6 ) = SX( I+6 ) 50 CONTINUE RETURN * * End of ICOPY * END INTEGER FUNCTION PB_NOABORT( CINFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER CINFO * .. * * Purpose * ======= * * PB_NOABORT transmits the info parameter of a PBLAS routine to the * tester and tells the PBLAS error handler to avoid aborting on erro- * neous input arguments. * * Notes * ===== * * This routine is necessary because of the CRAY C fortran interface * and the fact that the usual PBLAS error handler routine has been * initially written in C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Common Blocks .. INTEGER INFO, NBLOG, NOUT LOGICAL ABRTFLG COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Executable Statements .. * INFO = CINFO IF( ABRTFLG ) THEN PB_NOABORT = 0 ELSE PB_NOABORT = 1 END IF * RETURN * * End of PB_NOABORT * END SUBROUTINE PB_INFOG2L( I, J, DESC, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, PROW, PCOL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER I, II, J, JJ, MYCOL, MYROW, NPCOL, NPROW, PCOL, $ PROW * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * PB_INFOG2L computes the starting local index II, JJ corresponding to * the submatrix starting globally at the entry pointed by I, J. This * routine returns the coordinates in the grid of the process owning the * matrix entry of global indexes I, J, namely PROW and PCOL. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * I (global input) INTEGER * On entry, I specifies the global starting row index of the * submatrix. I must at least one. * * J (global input) INTEGER * On entry, J specifies the global starting column index of * the submatrix. J must at least one. * * DESC (global and local input) INTEGER array * On entry, DESC is an integer array of dimension DLEN_. This * is the array descriptor of the underlying matrix. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process columns * over which the matrix is distributed. NPCOL must be at least * one. * * MYROW (local input) INTEGER * On entry, MYROW specifies the row coordinate of the process * whose local index II is determined. MYROW must be at least * zero and strictly less than NPROW. * * MYCOL (local input) INTEGER * On entry, MYCOL specifies the column coordinate of the pro- * cess whose local index JJ is determined. MYCOL must be at * least zero and strictly less than NPCOL. * * II (local output) INTEGER * On exit, II specifies the local starting row index of the * submatrix. On exit, II is at least one. * * JJ (local output) INTEGER * On exit, JJ specifies the local starting column index of the * submatrix. On exit, JJ is at least one. * * PROW (global output) INTEGER * On exit, PROW specifies the row coordinate of the process * that possesses the first row of the submatrix. On exit, PROW * is -1 if DESC( RSRC_ ) is -1 on input, and, at least zero * and strictly less than NPROW otherwise. * * PCOL (global output) INTEGER * On exit, PCOL specifies the column coordinate of the process * that possesses the first column of the submatrix. On exit, * PCOL is -1 if DESC( CSRC_ ) is -1 on input, and, at least * zero and strictly less than NPCOL otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER CSRC, I1, ILOCBLK, IMB, INB, J1, MB, MYDIST, $ NB, NBLOCKS, RSRC * .. * .. Local Arrays .. INTEGER DESC2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL PB_DESCTRANS * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESC, DESC2 ) * IMB = DESC2( IMB_ ) PROW = DESC2( RSRC_ ) * * Has every process row I ? * IF( ( PROW.EQ.-1 ).OR.( NPROW.EQ.1 ) ) THEN * II = I * ELSE IF( I.LE.IMB ) THEN * * I is in range of first block * IF( MYROW.EQ.PROW ) THEN II = I ELSE II = 1 END IF * ELSE * * I is not in first block of matrix, figure out who has it. * RSRC = PROW MB = DESC2( MB_ ) * IF( MYROW.EQ.RSRC ) THEN * NBLOCKS = ( I - IMB - 1 ) / MB + 1 PROW = PROW + NBLOCKS PROW = PROW - ( PROW / NPROW ) * NPROW * ILOCBLK = NBLOCKS / NPROW * IF( ILOCBLK.GT.0 ) THEN IF( ( ILOCBLK*NPROW ).GE.NBLOCKS ) THEN IF( MYROW.EQ.PROW ) THEN II = I + ( ILOCBLK - NBLOCKS ) * MB ELSE II = IMB + ( ILOCBLK - 1 ) * MB + 1 END IF ELSE II = IMB + ILOCBLK * MB + 1 END IF ELSE II = IMB + 1 END IF * ELSE * I1 = I - IMB NBLOCKS = ( I1 - 1 ) / MB + 1 PROW = PROW + NBLOCKS PROW = PROW - ( PROW / NPROW ) * NPROW * MYDIST = MYROW - RSRC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROW * ILOCBLK = NBLOCKS / NPROW * IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPROW IF( MYDIST.LT.0 ) THEN II = MB + ILOCBLK * MB + 1 ELSE IF( MYROW.EQ.PROW ) THEN II = I1 + ( ILOCBLK - NBLOCKS + 1 ) * MB ELSE II = ILOCBLK * MB + 1 END IF END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN II = MB + 1 ELSE IF( MYROW.EQ.PROW ) THEN II = I1 + ( 1 - NBLOCKS ) * MB ELSE II = 1 END IF END IF END IF * END IF * INB = DESC2( INB_ ) PCOL = DESC2( CSRC_ ) * * Has every process column J ? * IF( ( PCOL.EQ.-1 ).OR.( NPCOL.EQ.1 ) ) THEN * JJ = J * ELSE IF( J.LE.INB ) THEN * * J is in range of first block * IF( MYCOL.EQ.PCOL ) THEN JJ = J ELSE JJ = 1 END IF * ELSE * * J is not in first block of matrix, figure out who has it. * CSRC = PCOL NB = DESC2( NB_ ) * IF( MYCOL.EQ.CSRC ) THEN * NBLOCKS = ( J - INB - 1 ) / NB + 1 PCOL = PCOL + NBLOCKS PCOL = PCOL - ( PCOL / NPCOL ) * NPCOL * ILOCBLK = NBLOCKS / NPCOL * IF( ILOCBLK.GT.0 ) THEN IF( ( ILOCBLK*NPCOL ).GE.NBLOCKS ) THEN IF( MYCOL.EQ.PCOL ) THEN JJ = J + ( ILOCBLK - NBLOCKS ) * NB ELSE JJ = INB + ( ILOCBLK - 1 ) * NB + 1 END IF ELSE JJ = INB + ILOCBLK * NB + 1 END IF ELSE JJ = INB + 1 END IF * ELSE * J1 = J - INB NBLOCKS = ( J1 - 1 ) / NB + 1 PCOL = PCOL + NBLOCKS PCOL = PCOL - ( PCOL / NPCOL ) * NPCOL * MYDIST = MYCOL - CSRC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPCOL * ILOCBLK = NBLOCKS / NPCOL * IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPCOL IF( MYDIST.LT.0 ) THEN JJ = NB + ILOCBLK * NB + 1 ELSE IF( MYCOL.EQ.PCOL ) THEN JJ = J1 + ( ILOCBLK - NBLOCKS + 1 ) * NB ELSE JJ = ILOCBLK * NB + 1 END IF END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN JJ = NB + 1 ELSE IF( MYCOL.EQ.PCOL ) THEN JJ = J1 + ( 1 - NBLOCKS ) * NB ELSE JJ = 1 END IF END IF END IF * END IF * RETURN * * End of PB_INFOG2L * END SUBROUTINE PB_AINFOG2L( M, N, I, J, DESC, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, II, JJ, PROW, $ PCOL, RPROW, RPCOL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER I, II, IMB1, INB1, J, JJ, M, MP, MYCOL, MYROW, $ N, NPCOL, NPROW, NQ, PCOL, PROW, RPCOL, RPROW * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * PB_AINFOG2L computes the starting local row and column indexes II, * JJ corresponding to the submatrix starting globally at the entry * pointed by I, J. This routine returns the coordinates in the grid of * the process owning the matrix entry of global indexes I, J, namely * PROW and PCOL. In addition, this routine computes the quantities MP * and NQ, which are respectively the local number of rows and columns * owned by the process of coordinate MYROW, MYCOL corresponding to the * global submatrix A(I:I+M-1,J:J+N-1). Finally, the size of the first * partial block and the relative process coordinates are also returned * respectively in IMB, INB and RPROW, RPCOL. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the global number of rows of the subma- * trix. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of columns of the * submatrix. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global starting row index of the * submatrix. I must at least one. * * J (global input) INTEGER * On entry, J specifies the global starting column index of * the submatrix. J must at least one. * * DESC (global and local input) INTEGER array * On entry, DESC is an integer array of dimension DLEN_. This * is the array descriptor of the underlying matrix. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process columns * over which the matrix is distributed. NPCOL must be at least * one. * * MYROW (local input) INTEGER * On entry, MYROW specifies the row coordinate of the process * whose local index II is determined. MYROW must be at least * zero and strictly less than NPROW. * * MYCOL (local input) INTEGER * On entry, MYCOL specifies the column coordinate of the pro- * cess whose local index JJ is determined. MYCOL must be at * least zero and strictly less than NPCOL. * * IMB1 (global output) INTEGER * On exit, IMB1 specifies the number of rows of the upper left * block of the submatrix. On exit, IMB1 is less or equal than * M and greater or equal than MIN( 1, M ). * * INB1 (global output) INTEGER * On exit, INB1 specifies the number of columns of the upper * left block of the submatrix. On exit, INB1 is less or equal * than N and greater or equal than MIN( 1, N ). * * MP (local output) INTEGER * On exit, MP specifies the local number of rows of the subma- * trix, that the processes of row coordinate MYROW own. MP is * at least zero. * * NQ (local output) INTEGER * On exit, NQ specifies the local number of columns of the * submatrix, that the processes of column coordinate MYCOL * own. NQ is at least zero. * * II (local output) INTEGER * On exit, II specifies the local starting row index of the * submatrix. On exit, II is at least one. * * JJ (local output) INTEGER * On exit, JJ specifies the local starting column index of * the submatrix. On exit, II is at least one. * * PROW (global output) INTEGER * On exit, PROW specifies the row coordinate of the process * that possesses the first row of the submatrix. On exit, PROW * is -1 if DESC(RSRC_) is -1 on input, and, at least zero and * strictly less than NPROW otherwise. * * PCOL (global output) INTEGER * On exit, PCOL specifies the column coordinate of the process * that possesses the first column of the submatrix. On exit, * PCOL is -1 if DESC(CSRC_) is -1 on input, and, at least zero * and strictly less than NPCOL otherwise. * * RPROW (global output) INTEGER * On exit, RPROW specifies the relative row coordinate of the * process that possesses the first row I of the submatrix. On * exit, RPROW is -1 if DESC(RSRC_) is -1 on input, and, at * least zero and strictly less than NPROW otherwise. * * RPCOL (global output) INTEGER * On exit, RPCOL specifies the relative column coordinate of * the process that possesses the first column J of the subma- * trix. On exit, RPCOL is -1 if DESC(CSRC_) is -1 on input, * and, at least zero and strictly less than NPCOL otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER CSRC, I1, ILOCBLK, J1, M1, MB, MYDIST, N1, NB, $ NBLOCKS, RSRC * .. * .. Local Arrays .. INTEGER DESC2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESC, DESC2 ) * MB = DESC2( MB_ ) IMB1 = DESC2( IMB_ ) RSRC = DESC2( RSRC_ ) * IF( ( RSRC.EQ.-1 ).OR.( NPROW.EQ.1 ) ) THEN * II = I IMB1 = IMB1 - I + 1 IF( IMB1.LE.0 ) $ IMB1 = ( ( -IMB1 ) / MB + 1 ) * MB + IMB1 IMB1 = MIN( IMB1, M ) MP = M PROW = RSRC RPROW = 0 * ELSE * * Figure out PROW, II and IMB1 first * IF( I.LE.IMB1 ) THEN * PROW = RSRC * IF( MYROW.EQ.PROW ) THEN II = I ELSE II = 1 END IF * IMB1 = IMB1 - I + 1 * ELSE * I1 = I - IMB1 - 1 NBLOCKS = I1 / MB + 1 PROW = RSRC + NBLOCKS PROW = PROW - ( PROW / NPROW ) * NPROW * IF( MYROW.EQ.RSRC ) THEN * ILOCBLK = NBLOCKS / NPROW * IF( ILOCBLK.GT.0 ) THEN IF( ( ILOCBLK*NPROW ).GE.NBLOCKS ) THEN IF( MYROW.EQ.PROW ) THEN II = I + ( ILOCBLK - NBLOCKS ) * MB ELSE II = IMB1 + ( ILOCBLK - 1 ) * MB + 1 END IF ELSE II = IMB1 + ILOCBLK * MB + 1 END IF ELSE II = IMB1 + 1 END IF * ELSE * MYDIST = MYROW - RSRC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROW * ILOCBLK = NBLOCKS / NPROW * IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPROW IF( MYDIST.LT.0 ) THEN II = ( ILOCBLK + 1 ) * MB + 1 ELSE IF( MYROW.EQ.PROW ) THEN II = I1 + ( ILOCBLK - NBLOCKS + 1 ) * MB + 1 ELSE II = ILOCBLK * MB + 1 END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN II = MB + 1 ELSE IF( MYROW.EQ.PROW ) THEN II = I1 + ( 1 - NBLOCKS ) * MB + 1 ELSE II = 1 END IF END IF END IF * IMB1 = NBLOCKS * MB - I1 * END IF * * Figure out MP * IF( M.LE.IMB1 ) THEN * IF( MYROW.EQ.PROW ) THEN MP = M ELSE MP = 0 END IF * ELSE * M1 = M - IMB1 NBLOCKS = M1 / MB + 1 * IF( MYROW.EQ.PROW ) THEN ILOCBLK = NBLOCKS / NPROW IF( ILOCBLK.GT.0 ) THEN IF( ( NBLOCKS - ILOCBLK * NPROW ).GT.0 ) THEN MP = IMB1 + ILOCBLK * MB ELSE MP = M + MB * ( ILOCBLK - NBLOCKS ) END IF ELSE MP = IMB1 END IF ELSE MYDIST = MYROW - PROW IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROW ILOCBLK = NBLOCKS / NPROW IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPROW IF( MYDIST.LT.0 ) THEN MP = ( ILOCBLK + 1 ) * MB ELSE IF( MYDIST.GT.0 ) THEN MP = ILOCBLK * MB ELSE MP = M1 + MB * ( ILOCBLK - NBLOCKS + 1 ) END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN MP = MB ELSE IF( MYDIST.GT.0 ) THEN MP = 0 ELSE MP = M1 + MB * ( 1 - NBLOCKS ) END IF END IF END IF * END IF * IMB1 = MIN( IMB1, M ) RPROW = MYROW - PROW IF( RPROW.LT.0 ) $ RPROW = RPROW + NPROW * END IF * NB = DESC2( NB_ ) INB1 = DESC2( INB_ ) CSRC = DESC2( CSRC_ ) * IF( ( CSRC.EQ.-1 ).OR.( NPCOL.EQ.1 ) ) THEN * JJ = J INB1 = INB1 - I + 1 IF( INB1.LE.0 ) $ INB1 = ( ( -INB1 ) / NB + 1 ) * NB + INB1 INB1 = MIN( INB1, N ) NQ = N PCOL = CSRC RPCOL = 0 * ELSE * * Figure out PCOL, JJ and INB1 first * IF( J.LE.INB1 ) THEN * PCOL = CSRC * IF( MYCOL.EQ.PCOL ) THEN JJ = J ELSE JJ = 1 END IF * INB1 = INB1 - J + 1 * ELSE * J1 = J - INB1 - 1 NBLOCKS = J1 / NB + 1 PCOL = CSRC + NBLOCKS PCOL = PCOL - ( PCOL / NPCOL ) * NPCOL * IF( MYCOL.EQ.CSRC ) THEN * ILOCBLK = NBLOCKS / NPCOL * IF( ILOCBLK.GT.0 ) THEN IF( ( ILOCBLK*NPCOL ).GE.NBLOCKS ) THEN IF( MYCOL.EQ.PCOL ) THEN JJ = J + ( ILOCBLK - NBLOCKS ) * NB ELSE JJ = INB1 + ( ILOCBLK - 1 ) * NB + 1 END IF ELSE JJ = INB1 + ILOCBLK * NB + 1 END IF ELSE JJ = INB1 + 1 END IF * ELSE * MYDIST = MYCOL - CSRC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPCOL * ILOCBLK = NBLOCKS / NPCOL * IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPCOL IF( MYDIST.LT.0 ) THEN JJ = ( ILOCBLK + 1 ) * NB + 1 ELSE IF( MYCOL.EQ.PCOL ) THEN JJ = J1 + ( ILOCBLK - NBLOCKS + 1 ) * NB + 1 ELSE JJ = ILOCBLK * NB + 1 END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN JJ = NB + 1 ELSE IF( MYCOL.EQ.PCOL ) THEN JJ = J1 + ( 1 - NBLOCKS ) * NB + 1 ELSE JJ = 1 END IF END IF END IF * INB1 = NBLOCKS * NB - J1 * END IF * * Figure out NQ * IF( N.LE.INB1 ) THEN * IF( MYCOL.EQ.PCOL ) THEN NQ = N ELSE NQ = 0 END IF * ELSE * N1 = N - INB1 NBLOCKS = N1 / NB + 1 * IF( MYCOL.EQ.PCOL ) THEN ILOCBLK = NBLOCKS / NPCOL IF( ILOCBLK.GT.0 ) THEN IF( ( NBLOCKS - ILOCBLK * NPCOL ).GT.0 ) THEN NQ = INB1 + ILOCBLK * NB ELSE NQ = N + NB * ( ILOCBLK - NBLOCKS ) END IF ELSE NQ = INB1 END IF ELSE MYDIST = MYCOL - PCOL IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPCOL ILOCBLK = NBLOCKS / NPCOL IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPCOL IF( MYDIST.LT.0 ) THEN NQ = ( ILOCBLK + 1 ) * NB ELSE IF( MYDIST.GT.0 ) THEN NQ = ILOCBLK * NB ELSE NQ = N1 + NB * ( ILOCBLK - NBLOCKS + 1 ) END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN NQ = NB ELSE IF( MYDIST.GT.0 ) THEN NQ = 0 ELSE NQ = N1 + NB * ( 1 - NBLOCKS ) END IF END IF END IF * END IF * INB1 = MIN( INB1, N ) RPCOL = MYCOL - PCOL IF( RPCOL.LT.0 ) $ RPCOL = RPCOL + NPCOL * END IF * RETURN * * End of PB_AINFOG2L * END INTEGER FUNCTION PB_NUMROC( N, I, INB, NB, PROC, SRCPROC, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER I, INB, N, NB, NPROCS, PROC, SRCPROC * .. * * Purpose * ======= * * PB_NUMROC returns the local number of matrix rows/columns process * PROC will get if we give out N rows/columns starting from global in- * dex I. * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the number of rows/columns being dealt * out. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global index of the matrix entry. * I must be at least one. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * PROC (local input) INTEGER * On entry, PROC specifies the coordinate of the process whose * local portion is determined. PROC must be at least zero and * strictly less than NPROCS. * * SRCPROC (global input) INTEGER * On entry, SRCPROC specifies the coordinate of the process * that possesses the first row or column of the matrix. When * SRCPROC = -1, the data is not distributed but replicated, * otherwise SRCPROC must be at least zero and strictly less * than NPROCS. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the matrix is distributed. NPROCS must * be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I1, ILOCBLK, INB1, MYDIST, N1, NBLOCKS, $ SRCPROC1 * .. * .. Executable Statements .. * IF( ( SRCPROC.EQ.-1 ).OR.( NPROCS.EQ.1 ) ) THEN PB_NUMROC = N RETURN END IF * * Compute coordinate of process owning I and corresponding INB * IF( I.LE.INB ) THEN * * I is in range of first block, i.e SRCPROC owns I. * SRCPROC1 = SRCPROC INB1 = INB - I + 1 * ELSE * * I is not in first block of matrix, figure out who has it * I1 = I - 1 - INB NBLOCKS = I1 / NB + 1 SRCPROC1 = SRCPROC + NBLOCKS SRCPROC1 = SRCPROC1 - ( SRCPROC1 / NPROCS ) * NPROCS INB1 = NBLOCKS*NB - I1 * END IF * * Now everything is just like I=1. Search now who has N-1, Is N-1 * in the first block ? * IF( N.LE.INB1 ) THEN IF( PROC.EQ.SRCPROC1 ) THEN PB_NUMROC = N ELSE PB_NUMROC = 0 END IF RETURN END IF * N1 = N - INB1 NBLOCKS = N1 / NB + 1 * IF( PROC.EQ.SRCPROC1 ) THEN ILOCBLK = NBLOCKS / NPROCS IF( ILOCBLK.GT.0 ) THEN IF( ( NBLOCKS - ILOCBLK * NPROCS ).GT.0 ) THEN PB_NUMROC = INB1 + ILOCBLK * NB ELSE PB_NUMROC = N + NB * ( ILOCBLK - NBLOCKS ) END IF ELSE PB_NUMROC = INB1 END IF ELSE MYDIST = PROC - SRCPROC1 IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROCS ILOCBLK = NBLOCKS / NPROCS IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPROCS IF( MYDIST.LT.0 ) THEN PB_NUMROC = ( ILOCBLK + 1 ) * NB ELSE IF( MYDIST.GT.0 ) THEN PB_NUMROC = ILOCBLK * NB ELSE PB_NUMROC = N1 + NB * ( ILOCBLK - NBLOCKS + 1 ) END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN PB_NUMROC = NB ELSE IF( MYDIST.GT.0 ) THEN PB_NUMROC = 0 ELSE PB_NUMROC = N1 + NB * ( 1 - NBLOCKS ) END IF END IF END IF * RETURN * * End of PB_NUMROC * END SUBROUTINE PB_BOOT() * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * * Purpose * ======= * * PB_BOOT (re)sets all timers to 0, and enables PB_TIMER. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) DOUBLE PRECISION STARTFLAG, ZERO PARAMETER ( STARTFLAG = -5.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * DISABLED = .FALSE. DO 10 I = 1, NTIMER CPUSEC( I ) = ZERO WALLSEC( I ) = ZERO CPUSTART( I ) = STARTFLAG WALLSTART( I ) = STARTFLAG 10 CONTINUE * RETURN * * End of PB_BOOT * END * SUBROUTINE PB_TIMER( I ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER I * .. * * Purpose * ======= * * PB_TIMER provides a "stopwatch" functionality cpu/wall timer in se- * conds. Up to 64 separate timers can be functioning at once. The first * call starts the timer, and the second stops it. This routine can be * disenabled, so that calls to the timer are ignored. This feature can * be used to make sure certain sections of code do not affect timings, * even if they call routines which have PB_TIMER calls in them. * * Arguments * ========= * * I (global input) INTEGER * On entry, I specifies the timer to stop/start. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) DOUBLE PRECISION STARTFLAG PARAMETER ( STARTFLAG = -5.0D+0 ) * .. * .. External Functions .. DOUBLE PRECISION DCPUTIME00, DWALLTIME00 EXTERNAL DCPUTIME00, DWALLTIME00 * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * * If timing disabled, return * IF( DISABLED ) $ RETURN * IF( WALLSTART( I ).EQ.STARTFLAG ) THEN * * If timer has not been started, start it * WALLSTART( I ) = DWALLTIME00() CPUSTART( I ) = DCPUTIME00() * ELSE * * Stop timer and add interval to count * CPUSEC( I ) = CPUSEC( I ) + DCPUTIME00() - CPUSTART( I ) WALLSEC( I ) = WALLSEC( I ) + DWALLTIME00() - WALLSTART( I ) WALLSTART( I ) = STARTFLAG * END IF * RETURN * * End of PB_TIMER * END * SUBROUTINE PB_ENABLE() * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * * Purpose * ======= * * PB_ENABLE sets it so calls to PB_TIMER are not ignored. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * DISABLED = .FALSE. * RETURN * * End of PB_ENABLE * END * SUBROUTINE PB_DISABLE() * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * Purpose * ======= * * PB_DISABLE sets it so calls to PB_TIMER are ignored. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * DISABLED = .TRUE. * RETURN * * End of PB_DISABLE * END * DOUBLE PRECISION FUNCTION PB_INQUIRE( TMTYPE, I ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TMTYPE INTEGER I * .. * * Purpose * ======= * * PB_INQUIRE returns wall or cpu time that has accumulated in timer I. * * Arguments * ========= * * TMTYPE (global input) CHARACTER * On entry, TMTYPE specifies what time will be returned as fol- * lows * = 'W': wall clock time is returned, * = 'C': CPU time is returned (default). * * I (global input) INTEGER * On entry, I specifies the timer to return. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) DOUBLE PRECISION ERRFLAG PARAMETER ( ERRFLAG = -1.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION TIME * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DCPUTIME00, DWALLTIME00 EXTERNAL DCPUTIME00, DWALLTIME00, LSAME * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * IF( LSAME( TMTYPE, 'W' ) ) THEN * * If walltime not available on this machine, return -1 flag * IF( DWALLTIME00().EQ.ERRFLAG ) THEN TIME = ERRFLAG ELSE TIME = WALLSEC( I ) END IF ELSE IF( DCPUTIME00().EQ.ERRFLAG ) THEN TIME = ERRFLAG ELSE TIME = CPUSEC( I ) END IF END IF * PB_INQUIRE = TIME * RETURN * * End of PB_INQUIRE * END * SUBROUTINE PB_COMBINE( ICTXT, SCOPE, OP, TMTYPE, N, IBEG, $ TIMES ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 OP, SCOPE, TMTYPE INTEGER IBEG, ICTXT, N * .. * .. Array Arguments .. DOUBLE PRECISION TIMES( N ) * .. * * Purpose * ======= * * PB_COMBINE returns wall or cpu time that has accumulated in timer I. * * Arguments * ========= * * TMTYPE (global input) CHARACTER * On entry, TMTYPE specifies what time will be returned as fol- * lows * = 'W': wall clock time is returned, * = 'C': CPU time is returned (default). * * I (global input) INTEGER * On entry, I specifies the timer to return. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NTIMER PARAMETER ( NTIMER = 64 ) DOUBLE PRECISION ERRFLAG PARAMETER ( ERRFLAG = -1.0D+0 ) * .. * .. Local Scalars .. CHARACTER*1 TOP LOGICAL TMPDIS INTEGER I * .. * .. External Subroutines .. EXTERNAL DGAMX2D, DGAMN2D, DGSUM2D, PB_TOPGET * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DCPUTIME00, DWALLTIME00 EXTERNAL DCPUTIME00, DWALLTIME00, LSAME * .. * .. Common Blocks .. LOGICAL DISABLED DOUBLE PRECISION CPUSEC( NTIMER ), CPUSTART( NTIMER ), $ WALLSEC( NTIMER ), WALLSTART( NTIMER ) COMMON /SLTIMER00/ CPUSEC, WALLSEC, CPUSTART, WALLSTART, DISABLED * .. * .. Executable Statements .. * * Disable timer for combine operation * TMPDIS = DISABLED DISABLED = .TRUE. * * Copy timer information into user's times array * IF( LSAME( TMTYPE, 'W' ) ) THEN * * If walltime not available on this machine, fill in times * with -1 flag, and return * IF( DWALLTIME00().EQ.ERRFLAG ) THEN DO 10 I = 1, N TIMES( I ) = ERRFLAG 10 CONTINUE RETURN ELSE DO 20 I = 1, N TIMES( I ) = WALLSEC( IBEG + I - 1 ) 20 CONTINUE END IF ELSE IF( DCPUTIME00().EQ.ERRFLAG ) THEN DO 30 I = 1, N TIMES( I ) = ERRFLAG 30 CONTINUE RETURN ELSE DO 40 I = 1, N TIMES( I ) = CPUSEC( IBEG + I - 1 ) 40 CONTINUE END IF ENDIF * * Combine all nodes' information, restore disabled, and return * IF( OP.EQ.'>' ) THEN CALL PB_TOPGET( ICTXT, 'Combine', SCOPE, TOP ) CALL DGAMX2D( ICTXT, SCOPE, TOP, N, 1, TIMES, N, -1, -1, $ -1, -1, 0 ) ELSE IF( OP.EQ.'<' ) THEN CALL PB_TOPGET( ICTXT, 'Combine', SCOPE, TOP ) CALL DGAMN2D( ICTXT, SCOPE, TOP, N, 1, TIMES, N, -1, -1, $ -1, -1, 0 ) ELSE IF( OP.EQ.'+' ) THEN CALL PB_TOPGET( ICTXT, 'Combine', SCOPE, TOP ) CALL DGSUM2D( ICTXT, SCOPE, TOP, N, 1, TIMES, N, -1, 0 ) ELSE CALL PB_TOPGET( ICTXT, 'Combine', SCOPE, TOP ) CALL DGAMX2D( ICTXT, SCOPE, TOP, N, 1, TIMES, N, -1, -1, $ -1, -1, 0 ) END IF * DISABLED = TMPDIS * RETURN * * End of PB_COMBINE * END SUBROUTINE PB_CHKMAT( ICTXT, M, MPOS0, N, NPOS0, IA, JA, DESCA, $ DPOS0, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER DPOS0, IA, ICTXT, INFO, JA, M, MPOS0, N, NPOS0 * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PB_CHKMAT checks the validity of a descriptor vector DESCA, the re- * lated global indexes IA, JA from a local view point. If an inconsis- * tency is found among its parameters IA, JA and DESCA, the routine re- * turns an error code in INFO. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * M (global input) INTEGER * On entry, M specifies the number of rows the submatrix * sub( A ). * * MPOS0 (global input) INTEGER * On entry, MPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter M appears. * * N (global input) INTEGER * On entry, N specifies the number of columns the submatrix * sub( A ). * * NPOS0 (global input) INTEGER * On entry, NPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter N appears. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * DPOS0 (global input) INTEGER * On entry, DPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter DESCA ap- * pears. Note that it is assumed that IA and JA are respecti- * vely 2 and 1 entries behind DESCA. * * INFO (local input/local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had an * illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) * .. * .. Local Scalars .. INTEGER DPOS, IAPOS, JAPOS, MP, MPOS, MYCOL, MYROW, $ NPCOL, NPOS, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_DESCTRANS * .. * .. External Functions .. INTEGER PB_NUMROC EXTERNAL PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MAX * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the des- * criptor multiplier * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Figure where in parameter list each parameter was, factoring in * descriptor multiplier * MPOS = MPOS0 * DESCMULT NPOS = NPOS0 * DESCMULT IAPOS = ( DPOS0 - 2 ) * DESCMULT JAPOS = ( DPOS0 - 1 ) * DESCMULT DPOS = DPOS0 * DESCMULT * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check that matrix values make sense from local viewpoint * IF( M.LT.0 ) $ INFO = MIN( INFO, MPOS ) IF( N.LT.0 ) $ INFO = MIN( INFO, NPOS ) IF( IA.LT.1 ) $ INFO = MIN( INFO, IAPOS ) IF( JA.LT.1 ) $ INFO = MIN( INFO, JAPOS ) IF( DESCA2( DTYPE_ ).NE.BLOCK_CYCLIC_2D_INB ) $ INFO = MIN( INFO, DPOS + DTYPE_ ) IF( DESCA2( IMB_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + IMB_ ) IF( DESCA2( INB_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + INB_ ) IF( DESCA2( MB_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + MB_ ) IF( DESCA2( NB_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + NB_ ) IF( DESCA2( RSRC_ ).LT.-1 .OR. DESCA2( RSRC_ ).GE.NPROW ) $ INFO = MIN( INFO, DPOS + RSRC_ ) IF( DESCA2( CSRC_ ).LT.-1 .OR. DESCA2( CSRC_ ).GE.NPCOL ) $ INFO = MIN( INFO, DPOS + CSRC_ ) IF( DESCA2( CTXT_ ).NE.ICTXT ) $ INFO = MIN( INFO, DPOS + CTXT_ ) * IF( M.EQ.0 .OR. N.EQ.0 ) THEN * * NULL matrix, relax some checks * IF( DESCA2( M_ ).LT.0 ) $ INFO = MIN( INFO, DPOS + M_ ) IF( DESCA2( N_ ).LT.0 ) $ INFO = MIN( INFO, DPOS + N_ ) IF( DESCA2( LLD_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + LLD_ ) * ELSE * * more rigorous checks for non-degenerate matrices * MP = PB_NUMROC( DESCA2( M_ ), 1, DESCA2( IMB_ ), DESCA2( MB_ ), $ MYROW, DESCA2( RSRC_ ), NPROW ) * IF( DESCA2( M_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + M_ ) IF( DESCA2( N_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + N_ ) IF( IA.GT.DESCA2( M_ ) ) $ INFO = MIN( INFO, IAPOS ) IF( JA.GT.DESCA2( N_ ) ) $ INFO = MIN( INFO, JAPOS ) IF( IA+M-1.GT.DESCA2( M_ ) ) $ INFO = MIN( INFO, MPOS ) IF( JA+N-1.GT.DESCA2( N_ ) ) $ INFO = MIN( INFO, NPOS ) * IF( DESCA2( LLD_ ).LT.MAX( 1, MP ) ) THEN NQ = PB_NUMROC( DESCA2( N_ ), 1, DESCA2( INB_ ), $ DESCA2( NB_ ), MYCOL, DESCA2( CSRC_ ), $ NPCOL ) IF( DESCA2( LLD_ ).LT.1 ) THEN INFO = MIN( INFO, DPOS + LLD_ ) ELSE IF( NQ.GT.0 ) THEN INFO = MIN( INFO, DPOS + LLD_ ) END IF END IF * END IF * * Prepare output: set info = 0 if no error, and divide by * DESCMULT if error is not in a descriptor entry * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -( INFO / DESCMULT ) ELSE INFO = -INFO END IF * RETURN * * End of PB_CHKMAT * END SUBROUTINE PB_DESCTRANS( DESCIN, DESCOUT ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER DESCIN( * ), DESCOUT( * ) * .. * * Purpose * ======= * * PB_DESCTRANS converts a descriptor DESCIN of type BLOCK_CYCLIC_2D * or BLOCK_CYCLIC_INB_2D into a descriptor DESCOUT of type * BLOCK_CYCLIC_INB_2D. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ----------------------------------- * DTYPE_A (global) DESCA( DTYPE1_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT1_ ) The BLACS context handle indicating * the NPROW x NPCOL BLACS process * grid A is distributed over. The * context itself is global, but the * handle (the integer value) may * vary. * M_A (global) DESCA( M1_ ) The number of rows in the distri- * buted matrix A, M_A >= 0. * N_A (global) DESCA( N1_ ) The number of columns in the dis- * tributed matrix A, N_A >= 0. * MB_A (global) DESCA( MB1_ ) The blocking factor used to distri- * bute the rows of A, MB_A > 0. * NB_A (global) DESCA( NB1_ ) The blocking factor used to distri- * bute the columns of A, NB_A > 0. * RSRC_A (global) DESCA( RSRC1_ ) The process row over which the * first row of the matrix A is dis- * tributed, NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC1_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD1_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, MB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, NB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * DESCIN (global and local input) INTEGER array * On entry, DESCIN is an array of dimension DLEN1_ or DLEN_ as * specified by its first entry DESCIN( DTYPE_ ). DESCIN is the * source array descriptor of type BLOCK_CYCLIC_2D or of type * BLOCK_CYCLIC_2D_INB. * * DESCOUT (global and local output) INTEGER array * On entry, DESCOUT is an array of dimension DLEN_. DESCOUT is * the target array descriptor of type BLOCK_CYCLIC_2D_INB. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC1_, CTXT1_, DLEN1_, $ DTYPE1_, LLD1_, M1_, MB1_, N1_, NB1_, RSRC1_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN1_ = 9, DTYPE1_ = 1, $ CTXT1_ = 2, M1_ = 3, N1_ = 4, MB1_ = 5, $ NB1_ = 6, RSRC1_ = 7, CSRC1_ = 8, LLD1_ = 9 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I * .. * .. Executable Statements .. * IF( DESCIN( DTYPE_ ).EQ.BLOCK_CYCLIC_2D ) THEN DESCOUT( DTYPE_ ) = BLOCK_CYCLIC_2D_INB DESCOUT( CTXT_ ) = DESCIN( CTXT1_ ) DESCOUT( M_ ) = DESCIN( M1_ ) DESCOUT( N_ ) = DESCIN( N1_ ) DESCOUT( IMB_ ) = DESCIN( MB1_ ) DESCOUT( INB_ ) = DESCIN( NB1_ ) DESCOUT( MB_ ) = DESCIN( MB1_ ) DESCOUT( NB_ ) = DESCIN( NB1_ ) DESCOUT( RSRC_ ) = DESCIN( RSRC1_ ) DESCOUT( CSRC_ ) = DESCIN( CSRC1_ ) DESCOUT( LLD_ ) = DESCIN( LLD1_ ) ELSE IF( DESCIN( DTYPE_ ).EQ.BLOCK_CYCLIC_2D_INB ) THEN DO 10 I = 1, DLEN_ DESCOUT( I ) = DESCIN( I ) 10 CONTINUE ELSE DESCOUT( DTYPE_ ) = DESCIN( 1 ) DESCOUT( CTXT_ ) = DESCIN( 2 ) DESCOUT( M_ ) = 0 DESCOUT( N_ ) = 0 DESCOUT( IMB_ ) = 1 DESCOUT( INB_ ) = 1 DESCOUT( MB_ ) = 1 DESCOUT( NB_ ) = 1 DESCOUT( RSRC_ ) = 0 DESCOUT( CSRC_ ) = 0 DESCOUT( LLD_ ) = 1 END IF * RETURN * * End of PB_DESCTRANS * END SUBROUTINE PB_DESCSET2( DESC, M, N, IMB, INB, MB, NB, RSRC, CSRC, $ CTXT, LLD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER CSRC, CTXT, IMB, INB, LLD, M, MB, N, NB, RSRC * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * PB_DESCSET2 uses its 10 input arguments M, N, IMB, INB, MB, NB, * RSRC, CSRC, CTXT and LLD to initialize a descriptor vector of type * BLOCK_CYCLIC_2D_INB. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ----------------------------------- * DTYPE_A (global) DESCA( DTYPE1_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT1_ ) The BLACS context handle indicating * the NPROW x NPCOL BLACS process * grid A is distributed over. The * context itself is global, but the * handle (the integer value) may * vary. * M_A (global) DESCA( M1_ ) The number of rows in the distri- * buted matrix A, M_A >= 0. * N_A (global) DESCA( N1_ ) The number of columns in the dis- * tributed matrix A, N_A >= 0. * MB_A (global) DESCA( MB1_ ) The blocking factor used to distri- * bute the rows of A, MB_A > 0. * NB_A (global) DESCA( NB1_ ) The blocking factor used to distri- * bute the columns of A, NB_A > 0. * RSRC_A (global) DESCA( RSRC1_ ) The process row over which the * first row of the matrix A is dis- * tributed, NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC1_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD1_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, MB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, NB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * DESC (global and local output) INTEGER array * On entry, DESC is an array of dimension DLEN_. DESC is the * array descriptor to be set. * * M (global input) INTEGER * On entry, M specifies the number of rows of the matrix. * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the matrix. * N must be at least zero. * * IMB (global input) INTEGER * On entry, IMB specifies the row size of the first block of * the global matrix distribution. IMB must be at least one. * * INB (global input) INTEGER * On entry, INB specifies the column size of the first block * of the global matrix distribution. INB must be at least one. * * MB (global input) INTEGER * On entry, MB specifies the row size of the blocks used to * partition the matrix. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the column size of the blocks used to * partition the matrix. NB must be at least one. * * RSRC (global input) INTEGER * On entry, RSRC specifies the row coordinate of the process * that possesses the first row of the matrix. When RSRC = -1, * the data is not distributed but replicated, otherwise RSRC * must be at least zero and strictly less than NPROW. * * CSRC (global input) INTEGER * On entry, CSRC specifies the column coordinate of the pro- * cess that possesses the first column of the matrix. When * CSRC = -1, the data is not distributed but replicated, other- * wise CSRC must be at least zero and strictly less than NPCOL. * * CTXT (local input) INTEGER * On entry, CTXT specifies the BLACS context handle, indicating * the global communication context. The value of the context * itself is local. * * LLD (local input) INTEGER * On entry, LLD specifies the leading dimension of the local * array storing the local entries of the matrix. LLD must be at * least MAX( 1, Lr(1,M) ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Executable Statements .. * DESC( DTYPE_ ) = BLOCK_CYCLIC_2D_INB DESC( CTXT_ ) = CTXT DESC( M_ ) = M DESC( N_ ) = N DESC( IMB_ ) = IMB DESC( INB_ ) = INB DESC( MB_ ) = MB DESC( NB_ ) = NB DESC( RSRC_ ) = RSRC DESC( CSRC_ ) = CSRC DESC( LLD_ ) = LLD * RETURN * * End of PB_DESCSET2 * END SUBROUTINE PB_DESCINIT2( DESC, M, N, IMB, INB, MB, NB, RSRC, CSRC, $ CTXT, LLD, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER CSRC, CTXT, IMB, INB, INFO, LLD, M, MB, N, NB, $ RSRC * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * PB_DESCINIT2 uses its 10 input arguments M, N, IMB, INB, MB, NB, * RSRC, CSRC, CTXT and LLD to initialize a descriptor vector of type * BLOCK_CYCLIC_2D_INB. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * DESC (global and local output) INTEGER array * On entry, DESC is an array of dimension DLEN_. DESC is the * array descriptor to be set. * * M (global input) INTEGER * On entry, M specifies the number of rows of the matrix. * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the matrix. * N must be at least zero. * * IMB (global input) INTEGER * On entry, IMB specifies the row size of the first block of * the global matrix distribution. IMB must be at least one. * * INB (global input) INTEGER * On entry, INB specifies the column size of the first block * of the global matrix distribution. INB must be at least one. * * MB (global input) INTEGER * On entry, MB specifies the row size of the blocks used to * partition the matrix. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the column size of the blocks used to * partition the matrix. NB must be at least one. * * RSRC (global input) INTEGER * On entry, RSRC specifies the row coordinate of the process * that possesses the first row of the matrix. When RSRC = -1, * the data is not distributed but replicated, otherwise RSRC * must be at least zero and strictly less than NPROW. * * CSRC (global input) INTEGER * On entry, CSRC specifies the column coordinate of the pro- * cess that possesses the first column of the matrix. When * CSRC = -1, the data is not distributed but replicated, other- * wise CSRC must be at least zero and strictly less than NPCOL. * * CTXT (local input) INTEGER * On entry, CTXT specifies the BLACS context handle, indicating * the global communication context. The value of the context * itself is local. * * LLD (local input) INTEGER * On entry, LLD specifies the leading dimension of the local * array storing the local entries of the matrix. LLD must be at * least MAX( 1, Lr(1,M) ). * * INFO (local output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value. * * Notes * ===== * * If the routine can recover from an erroneous input argument, it will * return an acceptable descriptor vector. For example, if LLD = 0 on * input, DESC( LLD_ ) will contain the smallest leading dimension re- * quired to store the specified m by n matrix, INFO will however be set * to -11 on exit in that case. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER LLDMIN, MP, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA * .. * .. External Functions .. INTEGER PB_NUMROC EXTERNAL PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( CTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 IF( M.LT.0 ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IMB.LT.1 ) THEN INFO = -4 ELSE IF( INB.LT.1 ) THEN INFO = -5 ELSE IF( MB.LT.1 ) THEN INFO = -6 ELSE IF( NB.LT.1 ) THEN INFO = -7 ELSE IF( RSRC.LT.-1 .OR. RSRC.GE.NPROW ) THEN INFO = -8 ELSE IF( CSRC.LT.-1 .OR. CSRC.GE.NPCOL ) THEN INFO = -9 ELSE IF( NPROW.EQ.-1 ) THEN INFO = -10 END IF * * Compute minimum LLD if safe (to avoid division by 0) * IF( INFO.EQ.0 ) THEN MP = PB_NUMROC( M, 1, IMB, MB, MYROW, RSRC, NPROW ) IF( PB_NUMROC( N, 1, INB, NB, MYCOL, CSRC, NPCOL ).GT.0 ) THEN LLDMIN = MAX( 1, MP ) ELSE LLDMIN = 1 END IF IF( LLD.LT.LLDMIN ) $ INFO = -11 END IF * IF( INFO.NE.0 ) $ CALL PXERBLA( CTXT, 'PB_DESCINIT2', -INFO ) * DESC( DTYPE_ ) = BLOCK_CYCLIC_2D_INB DESC( CTXT_ ) = CTXT DESC( M_ ) = MAX( 0, M ) DESC( N_ ) = MAX( 0, N ) DESC( IMB_ ) = MAX( 1, IMB ) DESC( INB_ ) = MAX( 1, INB ) DESC( MB_ ) = MAX( 1, MB ) DESC( NB_ ) = MAX( 1, NB ) DESC( RSRC_ ) = MAX( -1, MIN( RSRC, NPROW-1 ) ) DESC( CSRC_ ) = MAX( -1, MIN( CSRC, NPCOL-1 ) ) DESC( LLD_ ) = MAX( LLD, LLDMIN ) * RETURN * * End of PB_DESCINIT2 * END SUBROUTINE PB_BINFO( OFFD, M, N, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ILOW, IMB1, IMBLOC, INB1, INBLOC, IUPP, LCMT00, $ LMBLOC, LNBLOC, LOW, M, MB, MBLKS, MRCOL, $ MRROW, N, NB, NBLKS, OFFD, UPP * .. * * Purpose * ======= * * PB_BINFO initializes the local information of an m by n local array * owned by the process of relative coordinates ( MRROW, MRCOL ). Note * that if m or n is less or equal than zero, there is no data, in which * case this process does not need the local information computed by * this routine to proceed. * * Arguments * ========= * * OFFD (global input) INTEGER * On entry, OFFD specifies the off-diagonal of the underlying * matrix of interest as follows: * OFFD = 0 specifies the main diagonal, * OFFD > 0 specifies lower subdiagonals, and * OFFD < 0 specifies upper superdiagonals. * * M (local input) INTEGER * On entry, M specifies the local number of rows of the under- * lying matrix owned by the process of relative coordinates * ( MRROW, MRCOL ). M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the local number of columns of the un- * derlying matrix owned by the process of relative coordinates * ( MRROW, MRCOL ). N must be at least zero. * * IMB1 (global input) INTEGER * On input, IMB1 specifies the global true size of the first * block of rows of the underlying global submatrix. IMB1 must * be at least MIN( 1, M ). * * INB1 (global input) INTEGER * On input, INB1 specifies the global true size of the first * block of columns of the underlying global submatrix. INB1 * must be at least MIN( 1, N ). * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * MRROW (local input) INTEGER * On entry, MRROW specifies the relative row coordinate of the * process that possesses these M rows. MRROW must be least zero * and strictly less than NPROW. * * MRCOL (local input) INTEGER * On entry, MRCOL specifies the relative column coordinate of * the process that possesses these N columns. MRCOL must be * least zero and strictly less than NPCOL. * * LCMT00 (local output) INTEGER * On exit, LCMT00 is the LCM value of the left upper block of * this m by n local block owned by the process of relative co- * ordinates ( MRROW, MRCOL ). * * MBLKS (local output) INTEGER * On exit, MBLKS specifies the local number of blocks of rows * corresponding to M. MBLKS must be at least zero. * * NBLKS (local output) INTEGER * On exit, NBLKS specifies the local number of blocks of co- * lumns corresponding to N. NBLKS must be at least zero. * * IMBLOC (local output) INTEGER * On exit, IMBLOC specifies the number of rows (size) of the * uppest blocks of this m by n local array owned by the process * of relative coordinates ( MRROW, MRCOL ). IMBLOC is at least * MIN( 1, M ). * * INBLOC (local output) INTEGER * On exit, INBLOC specifies the number of columns (size) of * the leftmost blocks of this m by n local array owned by the * process of relative coordinates ( MRROW, MRCOL ). INBLOC is * at least MIN( 1, N ). * * LMBLOC (local output) INTEGER * On exit, LMBLOC specifies the number of rows (size) of the * lowest blocks of this m by n local array owned by the process * of relative coordinates ( MRROW, MRCOL ). LMBLOC is at least * MIN( 1, M ). * * LNBLOC (local output) INTEGER * On exit, LNBLOC specifies the number of columns (size) of the * rightmost blocks of this m by n local array owned by the * process of relative coordinates ( MRROW, MRCOL ). LNBLOC is * at least MIN( 1, N ). * * ILOW (local output) INTEGER * On exit, ILOW is the lower bound characterizing the first co- * lumn block owning offdiagonals of this m by n array. ILOW * must be less or equal than zero. * * LOW (global output) INTEGER * On exit, LOW is the lower bound characterizing the column * blocks with te exception of the first one (see ILOW) owning * offdiagonals of this m by n array. LOW must be less or equal * than zero. * * IUPP (local output) INTEGER * On exit, IUPP is the upper bound characterizing the first row * block owning offdiagonals of this m by n array. IUPP must be * greater or equal than zero. * * UPP (global output) INTEGER * On exit, UPP is the upper bound characterizing the row * blocks with te exception of the first one (see IUPP) owning * offdiagonals of this m by n array. UPP must be greater or * equal than zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER TMP1 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Initialize LOW, ILOW, UPP, IUPP, LMBLOC, LNBLOC, IMBLOC, INBLOC, * MBLKS, NBLKS and LCMT00. * LOW = 1 - NB UPP = MB - 1 * LCMT00 = OFFD * IF( M.LE.0 .OR. N.LE.0 ) THEN * IF( MRROW.GT.0 ) THEN IUPP = MB - 1 ELSE IUPP = MAX( 0, IMB1 - 1 ) END IF IMBLOC = 0 MBLKS = 0 LMBLOC = 0 * IF( MRCOL.GT.0 ) THEN ILOW = 1 - NB ELSE ILOW = MIN( 0, 1 - INB1 ) END IF INBLOC = 0 NBLKS = 0 LNBLOC = 0 * LCMT00 = LCMT00 + ( LOW - ILOW + MRCOL * NB ) - $ ( IUPP - UPP + MRROW * MB ) * RETURN * END IF * IF( MRROW.GT.0 ) THEN * IMBLOC = MIN( M, MB ) IUPP = MB - 1 LCMT00 = LCMT00 - ( IMB1 - MB + MRROW * MB ) MBLKS = ( M - 1 ) / MB + 1 LMBLOC = M - ( M / MB ) * MB IF( LMBLOC.EQ.0 ) $ LMBLOC = MB * IF( MRCOL.GT.0 ) THEN * INBLOC = MIN( N, NB ) ILOW = 1 - NB LCMT00 = LCMT00 + INB1 - NB + MRCOL * NB NBLKS = ( N - 1 ) / NB + 1 LNBLOC = N - ( N / NB ) * NB IF( LNBLOC.EQ.0 ) $ LNBLOC = NB * ELSE * INBLOC = INB1 ILOW = 1 - INB1 TMP1 = N - INB1 IF( TMP1.GT.0 ) THEN * * more than one block * NBLKS = ( TMP1 - 1 ) / NB + 2 LNBLOC = TMP1 - ( TMP1 / NB ) * NB IF( LNBLOC.EQ.0 ) $ LNBLOC = NB * ELSE * NBLKS = 1 LNBLOC = INB1 * END IF * END IF * ELSE * IMBLOC = IMB1 IUPP = IMB1 - 1 TMP1 = M - IMB1 IF( TMP1.GT.0 ) THEN * * more than one block * MBLKS = ( TMP1 - 1 ) / MB + 2 LMBLOC = TMP1 - ( TMP1 / MB ) * MB IF( LMBLOC.EQ.0 ) $ LMBLOC = MB * ELSE * MBLKS = 1 LMBLOC = IMB1 * END IF * IF( MRCOL.GT.0 ) THEN * INBLOC = MIN( N, NB ) ILOW = 1 - NB LCMT00 = LCMT00 + INB1 - NB + MRCOL * NB NBLKS = ( N - 1 ) / NB + 1 LNBLOC = N - ( N / NB ) * NB IF( LNBLOC.EQ.0 ) $ LNBLOC = NB * ELSE * INBLOC = INB1 ILOW = 1 - INB1 TMP1 = N - INB1 IF( TMP1.GT.0 ) THEN * * more than one block * NBLKS = ( TMP1 - 1 ) / NB + 2 LNBLOC = TMP1 - ( TMP1 / NB ) * NB IF( LNBLOC.EQ.0 ) $ LNBLOC = NB * ELSE * NBLKS = 1 LNBLOC = INB1 * END IF * END IF * END IF * RETURN * * End of PB_BINFO * END INTEGER FUNCTION PILAENV( ICTXT, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT CHARACTER*1 PREC * .. * * Purpose * ======= * * PILAENV returns the logical computational block size to be used by * the PBLAS routines during testing and timing. This is a special ver- * sion to be used only as part of the testing or timing PBLAS programs * for testing different values of logical computational block sizes for * the PBLAS routines. It is called by the PBLAS routines to retrieve a * logical computational block size value. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * PREC (dummy input) CHARACTER*1 * On entry, PREC is a dummy argument. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Common Blocks .. INTEGER INFO, NBLOG COMMON /INFOC/INFO, NBLOG * .. * .. Executable Statements .. * PILAENV = NBLOG * RETURN * * End of PILAENV * END SUBROUTINE PB_LOCINFO( I, INB, NB, MYROC, SRCPROC, NPROCS, $ ILOCBLK, ILOCOFF, MYDIST ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER I, ILOCBLK, ILOCOFF, INB, MYDIST, MYROC, NB, $ NPROCS, SRCPROC * .. * * Purpose * ======= * * PB_LOCINFO computes local information about the beginning of a sub- * matrix starting at the global index I. * * Arguments * ========= * * I (global input) INTEGER * On entry, I specifies the global starting index in the ma- * trix. I must be at least one. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of rows * or columns of the matrix. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks of rows or co- * lumns of the matrix is partitioned into. NB must be at least * one. * * MYROC (local input) INTEGER * On entry, MYROC is the coordinate of the process whose local * information is determined. MYROC is at least zero and * strictly less than NPROCS. * * SRCPROC (global input) INTEGER * On entry, SRCPROC specifies the coordinate of the process * that possesses the first row or column of the matrix. When * SRCPROC = -1, the data is not distributed but replicated, * otherwise SRCPROC must be at least zero and strictly less * than NPROCS. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the submatrix is distributed. NPROCS * must be at least one. * * ILOCBLK (local output) INTEGER * On exit, ILOCBLK specifies the local row or column block * coordinate corresponding to the row or column I of the ma- * trix. ILOCBLK must be at least zero. * * ILOCOFF (local output) INTEGER * On exit, ILOCOFF specifies the local row offset in the block * of local coordinate ILOCBLK corresponding to the row or co- * lumn I of the matrix. ILOCOFF must at least zero. * * MYDIST (local output) INTEGER * On exit, MYDIST specifies the relative process coordinate of * the process specified by MYROC to the process owning the row * or column I. MYDIST is at least zero and strictly less than * NPROCS. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER ITMP, NBLOCKS, PROC * .. * .. Executable Statements .. * ILOCOFF = 0 * IF( SRCPROC.LT.0 ) THEN * MYDIST = 0 * IF( I.LE.INB ) THEN * ILOCBLK = 0 ILOCOFF = I - 1 * ELSE * ITMP = I - INB NBLOCKS = ( ITMP - 1 ) / NB + 1 ILOCBLK = NBLOCKS ILOCOFF = ITMP - 1 - ( NBLOCKS - 1 ) * NB * END IF * ELSE * PROC = SRCPROC MYDIST = MYROC - PROC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROCS * IF( I.LE.INB ) THEN * ILOCBLK = 0 IF( MYROC.EQ.PROC ) $ ILOCOFF = I - 1 * ELSE * ITMP = I - INB NBLOCKS = ( ITMP - 1 ) / NB + 1 PROC = PROC + NBLOCKS PROC = PROC - ( PROC / NPROCS ) * NPROCS ILOCBLK = NBLOCKS / NPROCS * IF( ( ILOCBLK*NPROCS ).LT.( MYDIST-NBLOCKS ) ) $ ILOCBLK = ILOCBLK + 1 * IF( MYROC.EQ.PROC ) $ ILOCOFF = ITMP - 1 - ( NBLOCKS - 1 ) * NB * END IF * END IF * RETURN * * End of PB_LOCINFO * END SUBROUTINE PB_INITJMP( COLMAJ, NVIR, IMBVIR, INBVIR, IMBLOC, $ INBLOC, MB, NB, RSRC, CSRC, NPROW, NPCOL, $ STRIDE, JMP ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL COLMAJ INTEGER CSRC, IMBLOC, IMBVIR, INBLOC, INBVIR, MB, NB, $ NPCOL, NPROW, NVIR, RSRC, STRIDE * .. * .. Array Arguments .. INTEGER JMP( * ) * .. * * Purpose * ======= * * PB_INITJMP initializes the jump values JMP used by the random matrix * generator. * * Arguments * ========= * * COLMAJ (global input) LOGICAL * On entry, COLMAJ specifies the ordering of the random sequen- * ce. When COLMAJ is .TRUE., the random sequence will be used * for a column major ordering, and otherwise a row-major orde- * ring. This impacts on the computation of the jump values. * * NVIR (global input) INTEGER * On entry, NVIR specifies the size of the underlying virtual * matrix. NVIR must be at least zero. * * IMBVIR (local input) INTEGER * On entry, IMBVIR specifies the number of virtual rows of the * upper left block of the underlying virtual submatrix. IMBVIR * must be at least IMBLOC. * * INBVIR (local input) INTEGER * On entry, INBVIR specifies the number of virtual columns of * the upper left block of the underlying virtual submatrix. * INBVIR must be at least INBLOC. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the size of the blocks used to parti- * tion the matrix rows. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix columns. NB must be at least one. * * RSRC (global input) INTEGER * On entry, RSRC specifies the row coordinate of the process * that possesses the first row of the matrix. When RSRC = -1, * the rows are not distributed but replicated, otherwise RSRC * must be at least zero and strictly less than NPROW. * * CSRC (global input) INTEGER * On entry, CSRC specifies the column coordinate of the pro- * cess that possesses the first column of the matrix. When CSRC * is equal to -1, the columns are not distributed but replica- * ted, otherwise CSRC must be at least zero and strictly less * than NPCOL. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process co- * lumns over which the matrix is distributed. NPCOL must be at * least one. * * STRIDE (global input) INTEGER * On entry, STRIDE specifies the number of random numbers to be * generated to compute one matrix entry. In the real case, * STRIDE is usually 1, where as in the complex case STRIDE is * usually 2 in order to generate the real and imaginary parts. * * JMP (local output) INTEGER array * On entry, JMP is an array of dimension JMP_LEN. On exit, this * array contains the different jump values used by the random * matrix generator. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. INTEGER NPMB, NQNB * .. * .. Executable Statements .. * IF( RSRC.LT.0 ) THEN NPMB = MB ELSE NPMB = NPROW * MB END IF IF( CSRC.LT.0 ) THEN NQNB = NB ELSE NQNB = NPCOL * NB END IF * JMP( JMP_1 ) = 1 * JMP( JMP_MB ) = MB JMP( JMP_IMBV ) = IMBVIR JMP( JMP_NPMB ) = NPMB JMP( JMP_NPIMBLOC ) = IMBLOC + NPMB - MB * JMP( JMP_NB ) = NB JMP( JMP_INBV ) = INBVIR JMP( JMP_NQNB ) = NQNB JMP( JMP_NQINBLOC ) = INBLOC + NQNB - NB * IF( COLMAJ ) THEN JMP( JMP_ROW ) = STRIDE JMP( JMP_COL ) = STRIDE * NVIR ELSE JMP( JMP_ROW ) = STRIDE * NVIR JMP( JMP_COL ) = STRIDE END IF * RETURN * * End of PB_INITJMP * END SUBROUTINE PB_INITMULADD( MULADD0, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER IMULADD( 4, * ), JMP( * ), MULADD0( * ) * .. * * Purpose * ======= * * PB_INITMULADD initializes the constants a's and c's corresponding to * the jump values (JMP) used by the matrix generator. * * Arguments * ========= * * MULADD0 (local input) INTEGER array * On entry, MULADD0 is an array of dimension 4 containing the * encoded initial constants a and c to jump from X( n ) to * X( n+1 ) = a*X( n ) + c in the random sequence. MULADD0(1:2) * contains respectively the 16-lower and 16-higher bits of the * constant a, and MULADD0(3:4) contains the 16-lower and * 16-higher bits of the constant c. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the matrix generator. * * IMULADD (local output) INTEGER array * On entry, IMULADD is an array of dimension ( 4, JMP_LEN ). On * exit, the jth column of this array contains the encoded ini- * tial constants a_j and c_j to jump from X( n ) to X(n+JMP(j)) * (= a_j*X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * * .. Local Arrays .. INTEGER ITMP1( 2 ), ITMP2( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMP * .. * .. Executable Statements .. * ITMP2( 1 ) = 100 ITMP2( 2 ) = 0 * * Compute IMULADD for all JMP values * CALL PB_JUMP( JMP( JMP_1 ), MULADD0, ITMP2, ITMP1, $ IMULADD( 1, JMP_1 ) ) * CALL PB_JUMP( JMP( JMP_ROW ), MULADD0, ITMP1, ITMP2, $ IMULADD( 1, JMP_ROW ) ) CALL PB_JUMP( JMP( JMP_COL ), MULADD0, ITMP1, ITMP2, $ IMULADD( 1, JMP_COL ) ) * * Compute constants a and c to jump JMP( * ) numbers in the * sequence for column- or row-major ordering of the sequence. * CALL PB_JUMP( JMP( JMP_IMBV ), IMULADD( 1, JMP_ROW ), ITMP1, $ ITMP2, IMULADD( 1, JMP_IMBV ) ) CALL PB_JUMP( JMP( JMP_MB ), IMULADD( 1, JMP_ROW ), ITMP1, $ ITMP2, IMULADD( 1, JMP_MB ) ) CALL PB_JUMP( JMP( JMP_NPMB ), IMULADD( 1, JMP_ROW ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NPMB ) ) CALL PB_JUMP( JMP( JMP_NPIMBLOC ), IMULADD( 1, JMP_ROW ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NPIMBLOC ) ) * CALL PB_JUMP( JMP( JMP_INBV ), IMULADD( 1, JMP_COL ), ITMP1, $ ITMP2, IMULADD( 1, JMP_INBV ) ) CALL PB_JUMP( JMP( JMP_NB ), IMULADD( 1, JMP_COL ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NB ) ) CALL PB_JUMP( JMP( JMP_NQNB ), IMULADD( 1, JMP_COL ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NQNB ) ) CALL PB_JUMP( JMP( JMP_NQINBLOC ), IMULADD( 1, JMP_COL ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NQINBLOC ) ) * RETURN * * End of PB_INITMULADD * END SUBROUTINE PB_SETLOCRAN( SEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ILOCBLK, ILOCOFF, JLOCBLK, JLOCOFF, MYCDIST, $ MYRDIST, NPCOL, NPROW, SEED * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) * .. * * Purpose * ======= * * PB_SETLOCRAN locally initializes the random number generator. * * Arguments * ========= * * SEED (global input) INTEGER * On entry, SEED specifies a positive integer used to initiali- * ze the first number in the random sequence used by the matrix * generator. SEED must be at least zero. * * ILOCBLK (local input) INTEGER * On entry, ILOCBLK specifies the local row block coordinate * corresponding to the first row of the submatrix of interest. * ILOCBLK must be at least zero. * * ILOCOFF (local input) INTEGER * On entry, ILOCOFF specifies the local row offset in the block * of local coordinate ILOCBLK corresponding to the first row of * the submatrix of interest. ILOCOFF must at least zero. * * JLOCBLK (local input) INTEGER * On entry, JLOCBLK specifies the local column block coordinate * corresponding to the first column of the submatrix of inte- * rest. JLOCBLK must be at least zero. * * JLOCOFF (local input) INTEGER * On entry, JLOCOFF specifies the local column offset in the * block of local coordinate JLOCBLK corresponding to the first * column of the submatrix of interest. JLOCOFF must be at least * zero. * * MYRDIST (local input) INTEGER * On entry, MYRDIST specifies the relative row process coordi- * nate to the process owning the first row of the submatrix of * interest. MYRDIST must be at least zero and stricly less than * NPROW (see the subroutine PB_LOCINFO). * * MYCDIST (local input) INTEGER * On entry, MYCDIST specifies the relative column process coor- * dinate to the process owning the first column of the subma- * trix of interest. MYCDIST must be at least zero and stricly * less than NPCOL (see the subroutine PB_LOCINFO). * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process co- * lumns over which the matrix is distributed. NPCOL must be at * least one. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * IRAN (local output) INTEGER array * On entry, IRAN is an array of dimension 2. On exit, IRAN con- * tains respectively the 16-lower and 32-higher bits of the en- * coding of the entry of the random sequence corresponding lo- * cally to the first local array entry to generate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Arrays .. INTEGER IMULADDTMP( 4 ), ITMP( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMP, PB_SETRAN * .. * .. Executable Statements .. * * Compute and set the value of IRAN corresponding to A( IA, JA ) * ITMP( 1 ) = SEED ITMP( 2 ) = 0 * CALL PB_JUMP( JMP( JMP_1 ), IMULADD( 1, JMP_1 ), ITMP, IRAN, $ IMULADDTMP ) * * Jump ILOCBLK blocks of rows + ILOCOFF rows * CALL PB_JUMP( ILOCOFF, IMULADD( 1, JMP_ROW ), IRAN, ITMP, $ IMULADDTMP ) IF( MYRDIST.GT.0 ) THEN CALL PB_JUMP( JMP( JMP_IMBV ), IMULADD( 1, JMP_ROW ), ITMP, $ IRAN, IMULADDTMP ) CALL PB_JUMP( MYRDIST - 1, IMULADD( 1, JMP_MB ), IRAN, $ ITMP, IMULADDTMP ) CALL PB_JUMP( ILOCBLK, IMULADD( 1, JMP_NPMB ), ITMP, $ IRAN, IMULADDTMP ) ELSE IF( ILOCBLK.GT.0 ) THEN CALL PB_JUMP( JMP( JMP_IMBV ), IMULADD( 1, JMP_ROW ), ITMP, $ IRAN, IMULADDTMP ) CALL PB_JUMP( NPROW - 1, IMULADD( 1, JMP_MB ), IRAN, $ ITMP, IMULADDTMP ) CALL PB_JUMP( ILOCBLK - 1, IMULADD( 1, JMP_NPMB ), ITMP, $ IRAN, IMULADDTMP ) ELSE CALL PB_JUMP( 0, IMULADD( 1, JMP_1 ), ITMP, $ IRAN, IMULADDTMP ) END IF END IF * * Jump JLOCBLK blocks of columns + JLOCOFF columns * CALL PB_JUMP( JLOCOFF, IMULADD( 1, JMP_COL ), IRAN, ITMP, $ IMULADDTMP ) IF( MYCDIST.GT.0 ) THEN CALL PB_JUMP( JMP( JMP_INBV ), IMULADD( 1, JMP_COL ), ITMP, $ IRAN, IMULADDTMP ) CALL PB_JUMP( MYCDIST - 1, IMULADD( 1, JMP_NB ), IRAN, $ ITMP, IMULADDTMP ) CALL PB_JUMP( JLOCBLK, IMULADD( 1, JMP_NQNB ), ITMP, $ IRAN, IMULADDTMP ) ELSE IF( JLOCBLK.GT.0 ) THEN CALL PB_JUMP( JMP( JMP_INBV ), IMULADD( 1, JMP_COL ), ITMP, $ IRAN, IMULADDTMP ) CALL PB_JUMP( NPCOL - 1, IMULADD( 1, JMP_NB ), IRAN, $ ITMP, IMULADDTMP ) CALL PB_JUMP( JLOCBLK - 1, IMULADD( 1, JMP_NQNB ), ITMP, $ IRAN, IMULADDTMP ) ELSE CALL PB_JUMP( 0, IMULADD( 1, JMP_1 ), ITMP, $ IRAN, IMULADDTMP ) END IF END IF * CALL PB_SETRAN( IRAN, IMULADD( 1, JMP_1 ) ) * RETURN * * End of PB_SETLOCRAN * END SUBROUTINE PB_LADD( J, K, I ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER I( 2 ), J( 2 ), K( 2 ) * .. * * Purpose * ======= * * PB_LADD adds without carry two long positive integers K and J and put * the result into I. The long integers I, J, K are encoded on 31 bits * using an array of 2 integers. The 16-lower bits are stored in the * first entry of each array, the 15-higher bits in the second entry. * For efficiency purposes, the intrisic modulo function is inlined. * * Arguments * ========= * * J (local input) INTEGER array * On entry, J is an array of dimension 2 containing the encoded * long integer J. * * K (local input) INTEGER array * On entry, K is an array of dimension 2 containing the encoded * long integer K. * * I (local output) INTEGER array * On entry, I is an array of dimension 2. On exit, this array * contains the encoded long integer I. * * Further Details * =============== * * K( 2 ) K( 1 ) * 0XXXXXXX XXXXXXXX K I( 1 ) = MOD( K( 1 ) + J( 1 ), 2**16 ) * + carry = ( K( 1 ) + J( 1 ) ) / 2**16 * J( 2 ) J( 1 ) * 0XXXXXXX XXXXXXXX J I( 2 ) = K( 2 ) + J( 2 ) + carry * ---------------------- I( 2 ) = MOD( I( 2 ), 2**15 ) * I( 2 ) I( 1 ) * 0XXXXXXX XXXXXXXX I * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER IPOW15, IPOW16 PARAMETER ( IPOW15 = 2**15, IPOW16 = 2**16 ) * .. * .. Local Scalars .. INTEGER ITMP1, ITMP2 * .. * .. Executable Statements .. * * I( 1 ) = MOD( K( 1 ) + J( 1 ), IPOW16 ) * ITMP1 = K( 1 ) + J( 1 ) ITMP2 = ITMP1 / IPOW16 I( 1 ) = ITMP1 - ITMP2 * IPOW16 * * I( 2 ) = MOD( ( K( 1 ) + J( 1 ) ) / IPOW16 + K( 2 ) + J( 2 ), * IPOW15 ) * ITMP1 = ITMP2 + K( 2 ) + J( 2 ) ITMP2 = ITMP1 / IPOW15 I( 2 ) = ITMP1 - ITMP2 * IPOW15 * RETURN * * End of PB_LADD * END SUBROUTINE PB_LMUL( K, J, I ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER I( 2 ), J( 2 ), K( 2 ) * .. * * Purpose * ======= * * PB_LMUL multiplies without carry two long positive integers K and J * and put the result into I. The long integers I, J, K are encoded on * 31 bits using an array of 2 integers. The 16-lower bits are stored in * the first entry of each array, the 15-higher bits in the second entry * of each array. For efficiency purposes, the intrisic modulo function * is inlined. * * Arguments * ========= * * K (local input) INTEGER array * On entry, K is an array of dimension 2 containing the encoded * long integer K. * * J (local input) INTEGER array * On entry, J is an array of dimension 2 containing the encoded * long integer J. * * I (local output) INTEGER array * On entry, I is an array of dimension 2. On exit, this array * contains the encoded long integer I. * * Further Details * =============== * * K( 2 ) K( 1 ) * 0XXXXXXX XXXXXXXX K I( 1 ) = MOD( K( 1 ) + J( 1 ), 2**16 ) * * carry = ( K( 1 ) + J( 1 ) ) / 2**16 * J( 2 ) J( 1 ) * 0XXXXXXX XXXXXXXX J I( 2 ) = K( 2 ) + J( 2 ) + carry * ---------------------- I( 2 ) = MOD( I( 2 ), 2**15 ) * I( 2 ) I( 1 ) * 0XXXXXXX XXXXXXXX I * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER IPOW15, IPOW16, IPOW30 PARAMETER ( IPOW15 = 2**15, IPOW16 = 2**16, $ IPOW30 = 2**30 ) * .. * .. Local Scalars .. INTEGER ITMP1, ITMP2 * .. * .. Executable Statements .. * ITMP1 = K( 1 ) * J( 1 ) IF( ITMP1.LT.0 ) $ ITMP1 = ( ITMP1 + IPOW30 ) + IPOW30 * * I( 1 ) = MOD( ITMP1, IPOW16 ) * ITMP2 = ITMP1 / IPOW16 I( 1 ) = ITMP1 - ITMP2 * IPOW16 * ITMP1 = K( 1 ) * J( 2 ) + K( 2 ) * J( 1 ) IF( ITMP1.LT.0 ) $ ITMP1 = ( ITMP1 + IPOW30 ) + IPOW30 * ITMP1 = ITMP2 + ITMP1 IF( ITMP1.LT.0 ) $ ITMP1 = ( ITMP1 + IPOW30 ) + IPOW30 * * I( 2 ) = MOD( ITMP1, IPOW15 ) * I( 2 ) = ITMP1 - ( ITMP1 / IPOW15 ) * IPOW15 * RETURN * * End of PB_LMUL * END SUBROUTINE PB_JUMP( K, MULADD, IRANN, IRANM, IMA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER K * .. * .. Array Arguments .. INTEGER IMA( 4 ), IRANM( 2 ), IRANN( 2 ), MULADD( 4 ) * .. * * Purpose * ======= * * PB_JUMP computes the constants A and C to jump K numbers in the ran- * dom sequence: * * X( n+K ) = A * X( n ) + C. * * The constants encoded in MULADD specify how to jump from entry in the * sequence to the next. * * Arguments * ========= * * K (local input) INTEGER * On entry, K specifies the number of entries of the sequence * to jump over. When K is less or equal than zero, A and C are * not computed, and IRANM is set to IRANN corresponding to a * jump of size zero. * * MULADD (local input) INTEGER array * On entry, MULADD is an array of dimension 4 containing the * encoded constants a and c to jump from X( n ) to X( n+1 ) * ( = a*X( n )+c) in the random sequence. MULADD(1:2) contains * respectively the 16-lower and 16-higher bits of the constant * a, and MULADD(3:4) contains the 16-lower and 16-higher bits * of the constant c. * * IRANN (local input) INTEGER array * On entry, IRANN is an array of dimension 2. This array con- * tains respectively the 16-lower and 16-higher bits of the en- * coding of X( n ). * * IRANM (local output) INTEGER array * On entry, IRANM is an array of dimension 2. On exit, this * array contains respectively the 16-lower and 16-higher bits * of the encoding of X( n+K ). * * IMA (local output) INTEGER array * On entry, IMA is an array of dimension 4. On exit, when K is * greater than zero, this array contains the encoded constants * A and C to jump from X( n ) to X( n+K ) in the random se- * quence. IMA(1:2) contains respectively the 16-lower and * 16-higher bits of the constant A, and IMA(3:4) contains the * 16-lower and 16-higher bits of the constant C. When K is * less or equal than zero, this array is not referenced. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Executable Statements .. * IF( K.GT.0 ) THEN * IMA( 1 ) = MULADD( 1 ) IMA( 2 ) = MULADD( 2 ) IMA( 3 ) = MULADD( 3 ) IMA( 4 ) = MULADD( 4 ) * DO 10 I = 1, K - 1 * CALL PB_LMUL( IMA, MULADD, J ) * IMA( 1 ) = J( 1 ) IMA( 2 ) = J( 2 ) * CALL PB_LMUL( IMA( 3 ), MULADD, J ) CALL PB_LADD( MULADD( 3 ), J, IMA( 3 ) ) * 10 CONTINUE * CALL PB_LMUL( IRANN, IMA, J ) CALL PB_LADD( J, IMA( 3 ), IRANM ) * ELSE * IRANM( 1 ) = IRANN( 1 ) IRANM( 2 ) = IRANN( 2 ) * END IF * RETURN * * End of PB_JUMP * END SUBROUTINE PB_SETRAN( IRAN, IAC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER IAC( 4 ), IRAN( 2 ) * .. * * Purpose * ======= * * PB_SETRAN initializes the random generator with the encoding of the * first number X( 1 ) in the sequence, and the constants a and c used * to compute the next element in the sequence: * * X( n+1 ) = a * X( n ) + c. * * X( 1 ), a and c are stored in the common block RANCOM for later use * (see the routines PB_SRAN or PB_DRAN). * * Arguments * ========= * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2. This array con- * tains respectively the 16-lower and 16-higher bits of the en- * coding of X( 1 ). * * IAC (local input) INTEGER array * On entry, IAC is an array of dimension 4. IAC(1:2) contain * respectively the 16-lower and 16-higher bits of the constant * a, and IAC(3:4) contain the 16-lower and 16-higher bits of * the constant c. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * IRAND( 1 ) = IRAN( 1 ) IRAND( 2 ) = IRAN( 2 ) IACS( 1 ) = IAC( 1 ) IACS( 2 ) = IAC( 2 ) IACS( 3 ) = IAC( 3 ) IACS( 4 ) = IAC( 4 ) * RETURN * * End of PB_SETRAN * END SUBROUTINE PB_JUMPIT( MULADD, IRANN, IRANM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER IRANM( 2 ), IRANN( 2 ), MULADD( 4 ) * .. * * Purpose * ======= * * PB_JUMPIT jumps in the random sequence from the number X( n ) enco- * ded in IRANN to the number X( m ) encoded in IRANM using the cons- * tants A and C encoded in MULADD: * * X( m ) = A * X( n ) + C. * * The constants A and C obviously depend on m and n, see the subroutine * PB_JUMP in order to set them up. * * Arguments * ========= * * MULADD (local input) INTEGER array * On netry, MULADD is an array of dimension 4. MULADD(1:2) con- * tains respectively the 16-lower and 16-higher bits of the * constant A, and MULADD(3:4) contains the 16-lower and * 16-higher bits of the constant C. * * IRANN (local input) INTEGER array * On entry, IRANN is an array of dimension 2. This array con- * tains respectively the 16-lower and 16-higher bits of the en- * coding of X( n ). * * IRANM (local output) INTEGER array * On entry, IRANM is an array of dimension 2. On exit, this * array contains respectively the 16-lower and 16-higher bits * of the encoding of X( m ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * CALL PB_LMUL( IRANN, MULADD, J ) CALL PB_LADD( J, MULADD( 3 ), IRANM ) * IRAND( 1 ) = IRANM( 1 ) IRAND( 2 ) = IRANM( 2 ) * RETURN * * End of PB_JUMPIT * END scalapack-2.0.2/PBLAS/TIMING/PCBLAS1TIM.dat000644 000766 000024 00000002476 10363532303 017641 0ustar00juliestaff000000 000000 'Level 1 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PCBLAS1TIM.SUMM' output file name (if any) 6 device out 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q (2.0E0, -3.0E0) value of ALPHA 2 number of tests problems 1000 1000 values of N 1000 1 values of M_X 1 1500 values of N_X 32 32 values of IMB_X 32 32 values of INB_X 32 32 values of MB_X 32 32 values of NB_X 0 0 values of RSRC_X 0 0 values of CSRC_X 1 1 values of IX 1 1 values of JX 1 1 values of INCX 1 1 values of M_Y 1000 1500 values of N_Y 32 32 values of IMB_Y 32 32 values of INB_Y 32 32 values of MB_Y 32 32 values of NB_Y 0 0 values of RSRC_Y 0 0 values of CSRC_Y 1 1 values of IY 1 1 values of JY 1 1 values of INCY PCSWAP T put F for no test in the same column PCSCAL T put F for no test in the same column PCSSCAL T put F for no test in the same column PCCOPY T put F for no test in the same column PCAXPY T put F for no test in the same column PCDOTU T put F for no test in the same column PCDOTC T put F for no test in the same column PSCNRM2 T put F for no test in the same column PSCASUM T put F for no test in the same column PCAMAX T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/pcblas1tim.f000644 000766 000024 00000124725 11750130340 017754 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 10) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PCSWAP ', 'PCSCAL ', $ 'PCSSCAL', 'PCCOPY', 'PCAXPY ', $ 'PCDOTU ', 'PCDOTC' , 'PSCNRM2', $ 'PSCASUM', 'PCAMAX '/ END BLOCK DATA PROGRAM PCBLA1TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PCBLA1TIM is the main timing program for the Level 1 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 42 lines: * 'Level 1 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PCBLAS1TIM.SUMM' output file name (if any) * 6 device out * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0E0, 0.0E0) value of ALPHA * 2 number of tests problems * 3 4 values of N * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PCSWAP T put F for no test in the same column * PCSCAL T put F for no test in the same column * PCSSCAL T put F for no test in the same column * PCCOPY T put F for no test in the same column * PCAXPY T put F for no test in the same column * PCDOTU T put F for no test in the same column * PCDOTC T put F for no test in the same column * PSCNRM2 T put F for no test in the same column * PSCASUM T put F for no test in the same column * PCAMAX T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * CPLXSZ INTEGER * REALSZ and CPLXSZ indicate the length in bytes on the given * platform for a single precision real and a single precision * complex. By default, REALSZ is set to four and CPLXSZ is set * to eight. * * MEM COMPLEX array * MEM is an array of dimension TOTMEM / CPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, CPLXSZ, TOTMEM, MEMSIZ, $ NSUBS PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, CPLXSZ = 8, $ TOTMEM = 2000000, NSUBS = 10, $ MEMSIZ = TOTMEM / CPLXSZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER CSRCX, CSRCY, I, IAM, ICTXT, IMBX, IMBY, IMIDX, $ IMIDY, INBX, INBY, INCX, INCY, IPOSTX, IPOSTY, $ IPREX, IPREY, IPX, IPY, IX, IXSEED, IY, IYSEED, $ J, JX, JY, K, MBX, MBY, MEMREQD, MPX, MPY, MX, $ MY, MYCOL, MYROW, N, NBX, NBY, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQX, NQY, NTESTS, NX, NY, $ PISCLR, RSRCX, RSRCY REAL PUSCLR DOUBLE PRECISION ADDS, CFLOPS, MULTS, NOPS, WFLOPS COMPLEX ALPHA, PSCLR * .. * .. Local Arrays .. CHARACTER*80 OUTFILE LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) INTEGER CSCXVAL( MAXTESTS ), CSCYVAL( MAXTESTS ), $ DESCX( DLEN_ ), DESCY( DLEN_ ), IERR( 2 ), $ IMBXVAL( MAXTESTS ), IMBYVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JXVAL( MAXTESTS ), JYVAL( MAXTESTS ), $ MBXVAL( MAXTESTS ), MBYVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCXVAL( MAXTESTS ), $ RSCYVAL( MAXTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PCAMAX, PCAXPY, PCBLA1TIMINFO, $ PCCOPY, PCDOTC, PCDOTU, PCLAGEN, PCSCAL, $ PCSSCAL, PCSWAP, PSCASUM, PSCNRM2, PVDESCCHK, $ PVDIMCHK * .. * .. Intrinsic Functions .. INTRINSIC DBLE, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .FALSE., .FALSE., .TRUE., $ .TRUE., .TRUE., .TRUE., .FALSE., .FALSE., $ .FALSE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * Seeds for random matrix generations. * IXSEED = 100 IYSEED = 200 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PCBLA1TIMINFO( OUTFILE, NOUT, NTESTS, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, RSCXVAL, $ CSCXVAL, IXVAL, JXVAL, INCXVAL, MYVAL, $ NYVAL, IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, INCYVAL, $ MAXTESTS, NGRIDS, PVAL, MAXGRIDS, QVAL, $ MAXGRIDS, LTEST, IAM, NPROCS, ALPHA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * N = NVAL( J ) MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9983 ) END IF * * Check the validity of the input and initialize DESC_ * CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, 0, 0, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, 0, 0, IERR( 2 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 ) $ GO TO 40 * * Assign pointers into MEM for matrices corresponding to * vectors X and Y. Ex: IPX starts at position MEM( 1 ). * IPX = 1 IPY = IPX + DESCX( LLD_ ) * NQX * * Check if sufficient memory. * MEMREQD = IPY + DESCY( LLD_ ) * NQY - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) MEMREQD*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) GO TO 40 END IF * * Loop over all PBLAS 1 routines * DO 30 K = 1, NSUBS * * Continue only if this sub has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * * Check the validity of the operand sizes * CALL PVDIMCHK( ICTXT, NOUT, N, 'X', IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, N, 'Y', IY, JY, DESCY, INCY, $ IERR( 2 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 ) $ GO TO 30 * * Generate distributed matrices X and Y * CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the PBLAS routine * IF( K.EQ.1 ) THEN * * Test PCSWAP * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PCSWAP( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.2 ) THEN * * Test PCSCAL * ADDS = 0.0D+0 MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PCSCAL( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.3 ) THEN * * Test PCSSCAL * ADDS = 0.0D+0 MULTS = DBLE( 2*N ) CALL PB_TIMER( 1 ) CALL PCSSCAL( N, REAL( ALPHA ), MEM( IPX ), IX, JX, $ DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.4 ) THEN * * Test PCCOPY * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PCCOPY( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.5 ) THEN * * Test PCAXPY * ADDS = DBLE( 2*N ) MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PCAXPY( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.6 ) THEN * * Test PCDOTU * ADDS = DBLE( 2 * ( N - 1 ) ) MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PCDOTU( N, PSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.7 ) THEN * * Test PCDOTC * ADDS = DBLE( 2 * ( N - 1 ) ) MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PCDOTC( N, PSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.8 ) THEN * * Test PSCNRM2 * ADDS = DBLE( 2 * ( N - 1 ) ) MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PSCNRM2( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.9 ) THEN * * Test PSCASUM * ADDS = DBLE( 2 * ( N - 1 ) ) MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PSCASUM( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.10 ) THEN * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PCAMAX( N, PSCLR, PISCLR, MEM( IPX ), IX, JX, $ DESCX, INCX ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Calculate total flops * NOPS = ADDS + MULTS * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9984 ) SNAMES( K ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9988 ) J END IF * 50 CONTINUE * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, '---------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' N IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9993 FORMAT( 2X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I5,1X,I5,1X,I5,1X,I5,1X, $ I5,1X,I5,1X,I6 ) 9992 FORMAT( 2X, ' N IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9991 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9990 FORMAT( 'Not enough memory. Need: ', I12 ) 9988 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9987 FORMAT( 2X, 'End of Tests.' ) 9986 FORMAT( 2X, 'Tests started.' ) 9985 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9984 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9983 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PCBLA1TIM * END SUBROUTINE PCBLA1TIMINFO( SUMMRY, NOUT, NMAT, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, $ RSCXVAL, CSCXVAL, IXVAL, JXVAL, $ INCXVAL, MYVAL, NYVAL, IMBYVAL, MBYVAL, $ INBYVAL, NBYVAL, RSCYVAL, CSCYVAL, $ IYVAL, JYVAL, INCYVAL, LDVAL, NGRIDS, $ PVAL, LDPVAL, QVAL, LDQVAL, LTEST, IAM, $ NPROCS, ALPHA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NGRIDS, NMAT, NOUT, $ NPROCS COMPLEX ALPHA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY LOGICAL LTEST( * ) INTEGER CSCXVAL( LDVAL ), CSCYVAL( LDVAL ), $ IMBXVAL( LDVAL ), IMBYVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JXVAL( LDVAL ), $ JYVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCXVAL( LDVAL ), RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PCBLA1TIMINFO get the needed startup information for timing various * Level 1 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DESCX(:), IX, JX, INCX, DESCY(:), * IY, JY and INCY. This is also the maximum number of test * cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least ten. On * exit, if LTEST( i ) is .TRUE., the i-th Level 1 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) COMPLEX * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 2, 2*NGRIDS+23*NMAT+NSUBS ) with NSUBS = 10. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 10 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, CGEBR2D, CGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PCBLAS1TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 100 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 100 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA * READ( NIN, FMT = * ) ALPHA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 100 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 100 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, WORK, 2 ) * I = 1 CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 70 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 70 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 1 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Timing of the complex single precision '// $ 'Level 1 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9989 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9989 ) SNAMES( 1 ), ' ... No ' END IF DO 80 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( I ), ' ... No ' END IF 80 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, WORK, 2, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) * I = 2*NGRIDS + 23*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 90 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 90 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 100 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9989 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9988 FORMAT( 2X, ' ', A, A8 ) * * End of PCBLA1TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/PCBLAS2TIM.dat000644 000766 000024 00000006077 10363532303 017643 0ustar00juliestaff000000 000000 'Level 2 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PCBLAS2TIM.SUMM' output file name (if any) 6 device out 10 value of the logical computational blocksize NB 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q (2.0E0, -4.0E0) value of ALPHA (3.0E0, -2.0E0) value of BETA 16 number of tests problems 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' values of UPLO 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANS 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' values of DIAG 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 values of M 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 values of N 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_A 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_A 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_A 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IA 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JA 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_X 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_X 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_X 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IX 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JX 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of INCX 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_Y 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_Y 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IY 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JY 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of INCY PCGEMV T put F for no test in the same column PCHEMV T put F for no test in the same column PCTRMV T put F for no test in the same column PCTRSV T put F for no test in the same column PCGERU T put F for no test in the same column PCGERC T put F for no test in the same column PCHER T put F for no test in the same column PCHER2 T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/pcblas2tim.f000644 000766 000024 00000155032 11750130340 017750 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( 8 ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PCGEMV ', 'PCHEMV ', 'PCTRMV ', $ 'PCTRSV ', 'PCGERU ', 'PCGERC ', $ 'PCHER ', 'PCHER2 '/ END BLOCK DATA PROGRAM PCBLA2TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PCBLA2TIM is the main timing program for the Level 2 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 56 lines: * 'Level 2 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PCBLAS2TIM.SUMM' output file name (if any) * 6 device out * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0E0, 0.0E0) value of ALPHA * (1.0E0, 0.0E0) value of BETA * 2 number of tests problems * 'U' 'L' values of UPLO * 'N' 'T' values of TRANS * 'N' 'U' values of DIAG * 3 4 values of M * 3 4 values of N * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PCGEMV T put F for no test in the same column * PCHEMV T put F for no test in the same column * PCTRMV T put F for no test in the same column * PCTRSV T put F for no test in the same column * PCGERU T put F for no test in the same column * PCGERC T put F for no test in the same column * PCHER T put F for no test in the same column * PCHER2 T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * CPLXSZ INTEGER * REALSZ and CPLXSZ indicate the length in bytes on the given * platform for a single precision real and a single precision * complex. By default, REALSZ is set to four and CPLXSZ is set * to eight. * * MEM COMPLEX array * MEM is an array of dimension TOTMEM / CPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, CPLXSZ, TOTMEM, MEMSIZ, $ NSUBS COMPLEX ONE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, CPLXSZ = 8, $ ONE = ( 1.0E+0, 0.0E+0 ), TOTMEM = 2000000, $ NSUBS = 8, MEMSIZ = TOTMEM / CPLXSZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 AFORM, DIAG, DIAGDO, TRANS, UPLO INTEGER CSRCA, CSRCX, CSRCY, I, IA, IAM, IASEED, ICTXT, $ IMBA, IMBX, IMBY, IMIDA, IMIDX, IMIDY, INBA, $ INBX, INBY, INCX, INCY, IPA, IPOSTA, IPOSTX, $ IPOSTY, IPREA, IPREX, IPREY, IPX, IPY, IX, $ IXSEED, IY, IYSEED, J, JA, JX, JY, K, M, MA, $ MBA, MBX, MBY, MEMREQD, MPA, MPX, MPY, MX, MY, $ MYCOL, MYROW, N, NA, NBA, NBX, NBY, NCOLA, $ NGRIDS, NLX, NLY, NOUT, NPCOL, NPROCS, NPROW, $ NQA, NQX, NQY, NROWA, NTESTS, NX, NY, OFFD, $ RSRCA, RSRCX, RSRCY DOUBLE PRECISION CFLOPS, NOPS, WFLOPS COMPLEX ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), TRANVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCXVAL( MAXTESTS ), $ CSCYVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ), $ IAVAL( MAXTESTS ), IERR( 3 ), $ IMBAVAL( MAXTESTS ), IMBXVAL( MAXTESTS ), $ IMBYVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JXVAL( MAXTESTS ), $ JYVAL( MAXTESTS ), MAVAL( MAXTESTS ), $ MBAVAL( MAXTESTS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NAVAL( MAXTESTS ), NBAVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCAVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PCBLA2TIMINFO, PCGEMV, PCGERC, $ PCGERU, PCHEMV, PCHER, PCHER2, PCLAGEN, $ PCLASCAL, PCTRMV, PCTRSV, PMDESCCHK, PMDIMCHK, $ PVDESCCHK, PVDIMCHK * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDOPBL2 EXTERNAL LSAME, PDOPBL2 * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, DBLE, MAX, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .TRUE., .FALSE., .FALSE., $ .TRUE., .TRUE., .FALSE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .TRUE. * * Seeds for random matrix generations. * IASEED = 100 IXSEED = 200 IYSEED = 300 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PCBLA2TIMINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, $ IAVAL, JAVAL, MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, $ CSCYVAL, IYVAL, JYVAL, INCYVAL, MAXTESTS, $ NGRIDS, PVAL, MAXGRIDS, QVAL, MAXGRIDS, $ NBLOG, LTEST, IAM, NPROCS, ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) TRANS = TRANVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) * MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, UPLO, TRANS, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9980 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' GO TO 40 END IF * IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANS' GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) TRANS WRITE( NOUT, FMT = 9997 ) 'DIAG' GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, 0, 0, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, 0, 0, IERR( 2 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, 0, 0, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN GO TO 40 END IF * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = 1 IPX = IPA + DESCA( LLD_ ) * NQA IPY = IPX + DESCX( LLD_ ) * NQX * * Check if sufficient memory. * MEMREQD = IPY + DESCY( LLD_ ) * NQY - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) MEMREQD*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) GO TO 40 END IF * * Loop over all PBLAS 2 routines * DO 30 K = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * * Define the size of the operands * IF( K.EQ.1 ) THEN NROWA = M NCOLA = N IF( LSAME( TRANS, 'N' ) ) THEN NLX = N NLY = M ELSE NLX = M NLY = N END IF ELSE IF( K.EQ.5 .OR. K.EQ.6 ) THEN NROWA = M NCOLA = N NLX = M NLY = N ELSE NROWA = N NCOLA = N NLX = N NLY = N END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLX, 'X', IX, JX, DESCX, $ INCX, IERR( 2 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLY, 'Y', IY, JY, DESCY, $ INCY, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN GO TO 30 END IF * * Generate distributed matrices A, X and Y * IF( K.EQ.2 .OR. K.EQ.7 .OR. K.EQ.8 ) THEN AFORM = 'H' DIAGDO = 'N' OFFD = IA - JA ELSE IF( ( K.EQ.4 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN AFORM = 'N' DIAGDO = 'D' OFFD = IA - JA ELSE AFORM = 'N' DIAGDO = 'N' OFFD = 0 END IF * CALL PCLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, $ 1, 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MY, $ NY, 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * IF( ( K.EQ.4 ).AND.( .NOT.( LSAME( DIAG, 'N' ) ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / CMPLX( REAL( MAX( NROWA, NCOLA ) ) ) IF( LSAME( UPLO, 'L' ) ) THEN CALL PCLASCAL( 'Lower', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA+1, JA, DESCA ) ELSE CALL PCLASCAL( 'Upper', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA, JA+1, DESCA ) END IF END IF * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the Level 2 PBLAS routine * IF( K.EQ.1 ) THEN * * Test PCGEMV * CALL PB_TIMER( 1 ) CALL PCGEMV( TRANS, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.2 ) THEN * * Test PCHEMV * CALL PB_TIMER( 1 ) CALL PCHEMV( UPLO, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.3 ) THEN * * Test PCTRMV * CALL PB_TIMER( 1 ) CALL PCTRMV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.4 ) THEN * * Test PCTRSV * CALL PB_TIMER( 1 ) CALL PCTRSV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.5 ) THEN * * Test PCGERU * CALL PB_TIMER( 1 ) CALL PCGERU( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.6 ) THEN * * Test PCGERC * CALL PB_TIMER( 1 ) CALL PCGERC( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.7 ) THEN * * Test PCHER * CALL PB_TIMER( 1 ) CALL PCHER( UPLO, N, REAL( ALPHA ), MEM( IPX ), IX, $ JX, DESCX, INCX, MEM( IPA ), IA, JA, $ DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.8 ) THEN * * Test PCHER2 * CALL PB_TIMER( 1 ) CALL PCHER2( UPLO, N, ALPHA, MEM( IPX ), IX, JX, $ DESCX, INCX, MEM( IPY ), IY, JY, DESCY, $ INCY, MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Calculate total flops * NOPS = PDOPBL2( SNAMES( K ), NROWA, NCOLA, 0, 0 ) * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9981 ) SNAMES( K ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' M N UPLO TRANS DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,9X,A1,11X,A1,10X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9989 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5,1X,I6 ) 9988 FORMAT( 2X, ' IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9987 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9986 FORMAT( 'Not enough memory. Need: ', I12 ) 9985 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9984 FORMAT( 2X, 'End of Tests.' ) 9983 FORMAT( 2X, 'Tests started.' ) 9982 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9981 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9980 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PCBLA2TIM * END SUBROUTINE PCBLA2TIMINFO( SUMMRY, NOUT, NMAT, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, $ IMBAVAL, MBAVAL, INBAVAL, NBAVAL, $ RSCAVAL, CSCAVAL, IAVAL, JAVAL, $ MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, $ IXVAL, JXVAL, INCXVAL, MYVAL, NYVAL, $ IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, $ INCYVAL, LDVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, NBLOG, LTEST, IAM, NPROCS, $ ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NBLOG, NGRIDS, $ NMAT, NOUT, NPROCS COMPLEX ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), TRANVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCXVAL( LDVAL ), $ CSCYVAL( LDVAL ), IAVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBXVAL( LDVAL ), $ IMBYVAL( LDVAL ), INBAVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JAVAL( LDVAL ), $ JXVAL( LDVAL ), JYVAL( LDVAL ), MAVAL( LDVAL ), $ MBAVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NAVAL( LDVAL ), $ NBAVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCXVAL( LDVAL ), $ RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PCBLA2TIMINFO get the needed startup information for timing various * Level 2 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * TRANVAL (global output) CHARACTER array * On entry, TRANVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANS to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, TRANS, UPLO, M, N, DESCA(:), * IA, JA, DESCX(:), IX, JX, INCX, DESCY(:), IY, JY and INCY. * This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 2 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) COMPLEX * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) COMPLEX * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+37*NMAT+NSUBS ) with NSUBS = 8. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, CGEBR2D, CGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PCBLAS2TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRANVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( TRANVAL( J ) ) WORK( I+2 ) = ICHAR( UPLOVAL( J ) ) I = I + 3 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 2 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex single precision '// $ 'Level 2 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) NMAT WRITE( NOUT, FMT = 9986 ) NBLOG WRITE( NOUT, FMT = 9991 ) NGRIDS WRITE( NOUT, FMT = 9989 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9989 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA WRITE( NOUT, FMT = 9993 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 1, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 37*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) TRANVAL( J ) = CHAR( WORK( I+1 ) ) UPLOVAL( J ) = CHAR( WORK( I+2 ) ) I = I + 3 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9993 FORMAT( 2X, 'Beta : (', G16.6, $ ',', G16.6, ')' ) 9992 FORMAT( 2X, 'Number of Tests : ', I6 ) 9991 FORMAT( 2X, 'Number of process grids : ', I6 ) 9990 FORMAT( 2X, ' : ', 5I6 ) 9989 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9987 FORMAT( 2X, ' ', A, A8 ) 9986 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PCBLA2TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/PCBLAS3TIM.dat000644 000766 000024 00000004563 10363532303 017642 0ustar00juliestaff000000 000000 'Level 3 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PCBLAS3TIM.SUMM' output file name (if any) 6 device out 10 value of the logical computational blocksize NB 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q (2.0E0, -4.0E0) value of ALPHA (3.0E0, -2.0E0) value of BETA 8 number of tests problems 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' values of DIAG 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' values of SIDE 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANSA 'N' 'N' 'T' 'T' 'N' 'N' 'T' 'T' values of TRANSB 'U' 'L' 'U' 'L' 'U' 'L' 'U' 'L' values of UPLO 3 4 3 4 3 4 3 4 values of M 3 4 3 4 3 4 3 4 values of N 3 4 3 4 3 4 3 4 values of K 6 10 6 10 6 10 6 10 values of M_A 6 10 6 10 6 10 6 10 values of N_A 2 5 2 5 2 5 2 5 values of IMB_A 2 5 2 5 2 5 2 5 values of INB_A 2 5 2 5 2 5 2 5 values of MB_A 2 5 2 5 2 5 2 5 values of NB_A 0 1 0 1 0 1 0 1 values of RSRC_A 0 0 0 0 0 0 0 0 values of CSRC_A 1 1 1 1 1 1 1 1 values of IA 1 1 1 1 1 1 1 1 values of JA 6 10 6 10 6 10 6 10 values of M_B 6 10 6 10 6 10 6 10 values of N_B 2 5 2 5 2 5 2 5 values of IMB_B 2 5 2 5 2 5 2 5 values of INB_B 2 5 2 5 2 5 2 5 values of MB_B 2 5 2 5 2 5 2 5 values of NB_B 0 1 0 1 0 1 0 1 values of RSRC_B 0 0 0 0 0 0 0 0 values of CSRC_B 1 1 1 1 1 1 1 1 values of IB 1 1 1 1 1 1 1 1 values of JB 6 10 6 10 6 10 6 10 values of M_C 6 10 6 10 6 10 6 10 values of N_C 2 5 2 5 2 5 2 5 values of IMB_C 2 5 2 5 2 5 2 5 values of INB_C 2 5 2 5 2 5 2 5 values of MB_C 2 5 2 5 2 5 2 5 values of NB_C 0 1 0 1 0 1 0 1 values of RSRC_C 0 0 0 0 0 0 0 0 values of CSRC_C 1 1 1 1 1 1 1 1 values of IC 1 1 1 1 1 1 1 1 values of JC PCGEMM T put F for no test in the same column PCSYMM T put F for no test in the same column PCHEMM T put F for no test in the same column PCSYRK T put F for no test in the same column PCHERK T put F for no test in the same column PCSYR2K T put F for no test in the same column PCHER2K T put F for no test in the same column PCTRMM T put F for no test in the same column PCTRSM T put F for no test in the same column PCGEADD T put F for no test in the same column PCTRADD T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/pcblas3tim.f000644 000766 000024 00000174561 11750130340 017761 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 11) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PCGEMM ', 'PCSYMM ', 'PCHEMM ', $ 'PCSYRK ', 'PCHERK ', 'PCSYR2K', $ 'PCHER2K', 'PCTRMM ', 'PCTRSM ', $ 'PCGEADD', 'PCTRADD'/ END BLOCK DATA PROGRAM PCBLA3TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PCBLA3TIM is the main timing program for the Level 3 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 59 lines: * 'Level 3 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PCBLAS3TIM.SUMM' output file name (if any) * 6 device out * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0E0, 0.0E0) value of ALPHA * (1.0E0, 0.0E0) value of BETA * 2 number of tests problems * 'N' 'U' values of DIAG * 'L' 'R' values of SIDE * 'N' 'T' values of TRANSA * 'N' 'T' values of TRANSB * 'U' 'L' values of UPLO * 3 4 values of M * 3 4 values of N * 3 4 values of K * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_B * 6 10 values of N_B * 2 5 values of IMB_B * 2 5 values of INB_B * 2 5 values of MB_B * 2 5 values of NB_B * 0 1 values of RSRC_B * 0 0 values of CSRC_B * 1 1 values of IB * 1 1 values of JB * 6 10 values of M_C * 6 10 values of N_C * 2 5 values of IMB_C * 2 5 values of INB_C * 2 5 values of MB_C * 2 5 values of NB_C * 0 1 values of RSRC_C * 0 0 values of CSRC_C * 1 1 values of IC * 1 1 values of JC * PCGEMM T put F for no test in the same column * PCSYMM T put F for no test in the same column * PCHEMM T put F for no test in the same column * PCSYRK T put F for no test in the same column * PCHERK T put F for no test in the same column * PCSYR2K T put F for no test in the same column * PCHER2K T put F for no test in the same column * PCTRMM T put F for no test in the same column * PCTRSM T put F for no test in the same column * PCGEADD T put F for no test in the same column * PCTRADD T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * CPLXSZ INTEGER * REALSZ and CPLXSZ indicate the length in bytes on the given * platform for a single precision real and a single precision * complex. By default, REALSZ is set to four and CPLXSZ is set * to eight. * * MEM COMPLEX array * MEM is an array of dimension TOTMEM / CPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, CPLXSZ, TOTMEM, MEMSIZ, $ NSUBS COMPLEX ONE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, CPLXSZ = 8, $ ONE = ( 1.0E+0, 0.0E+0 ), TOTMEM = 2000000, $ NSUBS = 11, MEMSIZ = TOTMEM / CPLXSZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 ADIAGDO, AFORM, CFORM, DIAG, SIDE, TRANSA, $ TRANSB, UPLO INTEGER CSRCA, CSRCB, CSRCC, I, IA, IAM, IASEED, IB, $ IBSEED, IC, ICSEED, ICTXT, IMBA, IMBB, IMBC, $ IMIDA, IMIDB, IMIDC, INBA, INBB, INBC, IPA, $ IPB, IPC, IPOSTA, IPOSTB, IPOSTC, IPREA, IPREB, $ IPREC, J, JA, JB, JC, K, L, M, MA, MB, MBA, $ MBB, MBC, MC, MEMREQD, MPA, MPB, MPC, MYCOL, $ MYROW, N, NA, NB, NBA, NBB, NBC, NC, NCOLA, $ NCOLB, NCOLC, NGRIDS, NOUT, NPCOL, NPROCS, $ NPROW, NQA, NQB, NQC, NROWA, NROWB, NROWC, $ NTESTS, OFFDA, OFFDC, RSRCA, RSRCB, RSRCC DOUBLE PRECISION CFLOPS, NOPS, WFLOPS COMPLEX ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), BCHECK( NSUBS ), $ CCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), SIDEVAL( MAXTESTS ), $ TRNAVAL( MAXTESTS ), TRNBVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCBVAL( MAXTESTS ), $ CSCCVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCB( DLEN_ ), DESCC( DLEN_ ), $ IAVAL( MAXTESTS ), IBVAL( MAXTESTS ), $ ICVAL( MAXTESTS ), IERR( 3 ), $ IMBAVAL( MAXTESTS ), IMBBVAL( MAXTESTS ), $ IMBCVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBBVAL( MAXTESTS ), INBCVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JBVAL( MAXTESTS ), $ JCVAL( MAXTESTS ), KVAL( MAXTESTS ), $ MAVAL( MAXTESTS ), MBAVAL( MAXTESTS ), $ MBBVAL( MAXTESTS ), MBCVAL( MAXTESTS ), $ MBVAL( MAXTESTS ), MCVAL( MAXTESTS ), $ MVAL( MAXTESTS ), NAVAL( MAXTESTS ), $ NBAVAL( MAXTESTS ), NBBVAL( MAXTESTS ), $ NBCVAL( MAXTESTS ), NBVAL( MAXTESTS ), $ NCVAL( MAXTESTS ), NVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCAVAL( MAXTESTS ), RSCBVAL( MAXTESTS ), $ RSCCVAL( MAXTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PCBLA3TIMINFO, PCGEADD, PCGEMM, $ PCHEMM, PCHER2K, PCHERK, PCLAGEN, PCLASCAL, $ PCSYMM, PCSYR2K, PCSYRK, PCTRADD, PCTRMM, $ PCTRSM, PMDESCCHK, PMDIMCHK * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDOPBL3 EXTERNAL LSAME, PDOPBL3 * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, DBLE, MAX, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA BCHECK/.TRUE., .TRUE., .TRUE., .FALSE., $ .FALSE., .TRUE., .TRUE., .TRUE., .TRUE., $ .FALSE., .FALSE./ DATA CCHECK/.TRUE., .TRUE., .TRUE., .TRUE., .TRUE., $ .TRUE., .TRUE., .FALSE., .FALSE., .TRUE., $ .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * Seeds for random matrix generations. * IASEED = 100 IBSEED = 200 ICSEED = 300 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PCBLA3TIMINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, NVAL, $ KVAL, MAVAL, NAVAL, IMBAVAL, MBAVAL, $ INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, IAVAL, $ JAVAL, MBVAL, NBVAL, IMBBVAL, MBBVAL, $ INBBVAL, NBBVAL, RSCBVAL, CSCBVAL, IBVAL, $ JBVAL, MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, ICVAL, $ JCVAL, MAXTESTS, NGRIDS, PVAL, MAXGRIDS, $ QVAL, MAXGRIDS, NBLOG, LTEST, IAM, NPROCS, $ ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) SIDE = SIDEVAL( J ) TRANSA = TRNAVAL( J ) TRANSB = TRNBVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) K = KVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MB = MBVAL( J ) NB = NBVAL( J ) IMBB = IMBBVAL( J ) MBB = MBBVAL( J ) INBB = INBBVAL( J ) NBB = NBBVAL( J ) RSRCB = RSCBVAL( J ) CSRCB = CSCBVAL( J ) IB = IBVAL( J ) JB = JBVAL( J ) * MC = MCVAL( J ) NC = NCVAL( J ) IMBC = IMBCVAL( J ) MBC = MBCVAL( J ) INBC = INBCVAL( J ) NBC = NBCVAL( J ) RSRCC = RSCCVAL( J ) CSRCC = CSCCVAL( J ) IC = ICVAL( J ) JC = JCVAL( J ) * IF( IAM.EQ.0 ) THEN * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, K, SIDE, UPLO, TRANSA, $ TRANSB, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IB, JB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IC, JC, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9980 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( SIDE, 'L' ).AND. $ .NOT.LSAME( SIDE, 'R' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'SIDE' GO TO 40 END IF * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' GO TO 40 END IF * IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSA' GO TO 40 END IF * IF( .NOT.LSAME( TRANSB, 'N' ).AND. $ .NOT.LSAME( TRANSB, 'T' ).AND. $ .NOT.LSAME( TRANSB, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSB' GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'DIAG' GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, 0, 0, IERR( 1 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'B', DESCB, $ BLOCK_CYCLIC_2D_INB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB, MPB, NQB, IPREB, $ IMIDB, IPOSTB, 0, 0, IERR( 2 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'C', DESCC, $ BLOCK_CYCLIC_2D_INB, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC, MPC, NQC, IPREC, $ IMIDC, IPOSTC, 0, 0, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN GO TO 40 END IF * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPB = IPA + DESCA( LLD_ )*NQA IPC = IPB + DESCB( LLD_ )*NQB * * Check if sufficient memory. * MEMREQD = IPC + DESCC( LLD_ )*NQC - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) MEMREQD*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) GO TO 40 END IF * * Loop over all PBLAS 3 routines * DO 30 L = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( L ) ) $ GO TO 30 * * Define the size of the operands * IF( L.EQ.1 ) THEN * * PCGEMM * NROWC = M NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF( LSAME( TRANSB, 'N' ) ) THEN NROWB = K NCOLB = N ELSE NROWB = N NCOLB = K END IF ELSE IF( L.EQ.2 .OR. L.EQ.3 ) THEN * * PCSYMM, PCHEMM * NROWC = M NCOLC = N NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF ELSE IF( L.EQ.4 .OR. L.EQ.5 ) THEN * * PCSYRK, PCHERK * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K ELSE NROWA = K NCOLA = N END IF NROWB = 0 NCOLB = 0 ELSE IF( L.EQ.6 .OR. L.EQ.7 ) THEN * * PCSYR2K, PCHER2K * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K NROWB = N NCOLB = K ELSE NROWA = K NCOLA = N NROWB = K NCOLB = N END IF ELSE IF( L.EQ.8 .OR. L.EQ.9 ) THEN * * PCTRMM, PCTRSM * NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF NROWC = 0 NCOLC = 0 ELSE IF( L.EQ.10 .OR. L.EQ.11 ) THEN * * PCGEADD, PCTRADD * IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = N ELSE NROWA = N NCOLA = M END IF NROWC = M NCOLC = N NROWB = 0 NCOLB = 0 * END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWB, NCOLB, 'B', IB, JB, $ DESCB, IERR( 2 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWC, NCOLC, 'C', IC, JC, $ DESCC, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN GO TO 30 END IF * * Check special values of TRANSA for symmetric and * hermitian rank-k and rank-2k updates. * IF( L.EQ.4 .OR. L.EQ.6 ) THEN IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) SNAMES( L ), 'TRANSA' GO TO 30 END IF ELSE IF( L.EQ.5 .OR. L.EQ.7 ) THEN IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) SNAMES( L ), 'TRANSA' GO TO 30 END IF END IF * * Generate distributed matrices A, B and C * IF( L.EQ.2 ) THEN * * PCSYMM * AFORM = 'S' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.3 ) THEN * * PCHEMM * AFORM = 'H' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.4 .OR. L.EQ.6 ) THEN * * PCSYRK, PCSYR2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'S' OFFDC = IC - JC * ELSE IF( L.EQ.5 .OR. L.EQ.7 ) THEN * * PCHERK, PCHER2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'H' OFFDC = IC - JC * ELSE IF( ( L.EQ.9 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN * * PCTRSM * AFORM = 'N' ADIAGDO = 'D' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE * * Default values * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'N' OFFDC = 0 * END IF * CALL PCLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) IF( ( L.EQ.9 ).AND.( .NOT.( LSAME( DIAG, 'N' ) ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / CMPLX( REAL( MAX( NROWA, NCOLA ) ) ) IF( LSAME( UPLO, 'L' ) ) THEN CALL PCLASCAL( 'Lower', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA+1, JA, DESCA ) ELSE CALL PCLASCAL( 'Upper', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA, JA+1, DESCA ) END IF * END IF * IF( BCHECK( L ) ) $ CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCB, IBSEED, MEM( IPB ), $ DESCB( LLD_ ) ) * IF( CCHECK( L ) ) $ CALL PCLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCC, ICSEED, MEM( IPC ), $ DESCC( LLD_ ) ) * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the Level 3 PBLAS routine * IF( L.EQ.1 ) THEN * * Test PCGEMM * NOPS = PDOPBL3( SNAMES( L ), M, N, K ) * CALL PB_TIMER( 1 ) CALL PCGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.2 ) THEN * * Test PCSYMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PCSYMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.3 ) THEN * * Test PCHEMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PCHEMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.4 ) THEN * * Test PCSYRK * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PCSYRK( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.5 ) THEN * * Test PCHERK * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PCHERK( UPLO, TRANSA, N, K, REAL( ALPHA ), $ MEM( IPA ), IA, JA, DESCA, REAL( BETA ), $ MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.6 ) THEN * * Test PCSYR2K * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PCSYR2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.7 ) THEN * * Test PCHER2K * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PCHER2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, REAL( BETA ), MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.8 ) THEN * * Test PCTRMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PCTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.9 ) THEN * * Test PCTRSM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PCTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.10 ) THEN * * Test PCGEADD * NOPS = PDOPBL3( SNAMES( L ), M, N, M ) * CALL PB_TIMER( 1 ) CALL PCGEADD( TRANSA, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.11 ) THEN * * Test PCTRADD * IF( LSAME( UPLO, 'U' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PCTRADD( UPLO, TRANSA, M, N, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9981 ) SNAMES( L ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9986 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '-------------------' ) 9994 FORMAT( 2X, ' M N K SIDE UPLO TRANSA ', $ 'TRANSB DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,1X,I6,6X,A1,5X,A1,7X,A1,7X,A1,5X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IB JB MB NB IMBB INBB', $ ' MBB NBB RSRCB CSRCB' ) 9989 FORMAT( 2X, ' IC JC MC NC IMBC INBC', $ ' MBC NBC RSRCC CSRCC' ) 9988 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9987 FORMAT( 'Not enough memory. Need: ', I12 ) 9986 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9985 FORMAT( 2X, 'End of Tests.' ) 9984 FORMAT( 2X, 'Tests started.' ) 9983 FORMAT( 5X, A, ' ***** ', A, ' has an incorrect value: ', $ ' BYPASS *****' ) 9982 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9981 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9980 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PCBLA3TIM * END SUBROUTINE PCBLA3TIMINFO( SUMMRY, NOUT, NMAT, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, $ NVAL, KVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, $ CSCAVAL, IAVAL, JAVAL, MBVAL, NBVAL, $ IMBBVAL, MBBVAL, INBBVAL, NBBVAL, $ RSCBVAL, CSCBVAL, IBVAL, JBVAL, $ MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, $ ICVAL, JCVAL, LDVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, NBLOG, LTEST, $ IAM, NPROCS, ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NBLOG, NGRIDS, $ NMAT, NOUT, NPROCS COMPLEX ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), SIDEVAL( LDVAL ), $ TRNAVAL( LDVAL ), TRNBVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCBVAL( LDVAL ), $ CSCCVAL( LDVAL ), IAVAL( LDVAL ), $ IBVAL( LDVAL ), ICVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBBVAL( LDVAL ), $ IMBCVAL( LDVAL ), INBAVAL( LDVAL ), $ INBBVAL( LDVAL ), INBCVAL( LDVAL ), $ JAVAL( LDVAL ), JBVAL( LDVAL ), JCVAL( LDVAL ), $ KVAL( LDVAL ), MAVAL( LDVAL ), MBAVAL( LDVAL ), $ MBBVAL( LDVAL ), MBCVAL( LDVAL ), $ MBVAL( LDVAL ), MCVAL( LDVAL ), MVAL( LDVAL ), $ NAVAL( LDVAL ), NBAVAL( LDVAL ), $ NBBVAL( LDVAL ), NBCVAL( LDVAL ), $ NBVAL( LDVAL ), NCVAL( LDVAL ), NVAL( LDVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCBVAL( LDVAL ), $ RSCCVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PCBLA3TIMINFO get the needed startup information for timing various * Level 3 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * SIDEVAL (global output) CHARACTER array * On entry, SIDEVAL is an array of dimension LDVAL. On exit, * this array contains the values of SIDE to run the code with. * * TRNAVAL (global output) CHARACTER array * On entry, TRNAVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSA to run the code * with. * * TRNBVAL (global output) CHARACTER array * On entry, TRNBVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSB to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * KVAL (global output) INTEGER array * On entry, KVAL is an array of dimension LDVAL. On exit, this * array contains the values of K to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MBVAL (global output) INTEGER array * On entry, MBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( M_ ) to run the code * with. * * NBVAL (global output) INTEGER array * On entry, NBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( N_ ) to run the code * with. * * IMBBVAL (global output) INTEGER array * On entry, IMBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( IMB_ ) to run the * code with. * * MBBVAL (global output) INTEGER array * On entry, MBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( MB_ ) to run the * code with. * * INBBVAL (global output) INTEGER array * On entry, INBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( INB_ ) to run the * code with. * * NBBVAL (global output) INTEGER array * On entry, NBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( NB_ ) to run the * code with. * * RSCBVAL (global output) INTEGER array * On entry, RSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( RSRC_ ) to run the * code with. * * CSCBVAL (global output) INTEGER array * On entry, CSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( CSRC_ ) to run the * code with. * * IBVAL (global output) INTEGER array * On entry, IBVAL is an array of dimension LDVAL. On exit, this * array contains the values of IB to run the code with. * * JBVAL (global output) INTEGER array * On entry, JBVAL is an array of dimension LDVAL. On exit, this * array contains the values of JB to run the code with. * * MCVAL (global output) INTEGER array * On entry, MCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( M_ ) to run the code * with. * * NCVAL (global output) INTEGER array * On entry, NCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( N_ ) to run the code * with. * * IMBCVAL (global output) INTEGER array * On entry, IMBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( IMB_ ) to run the * code with. * * MBCVAL (global output) INTEGER array * On entry, MBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( MB_ ) to run the * code with. * * INBCVAL (global output) INTEGER array * On entry, INBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( INB_ ) to run the * code with. * * NBCVAL (global output) INTEGER array * On entry, NBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( NB_ ) to run the * code with. * * RSCCVAL (global output) INTEGER array * On entry, RSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( RSRC_ ) to run the * code with. * * CSCCVAL (global output) INTEGER array * On entry, CSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( CSRC_ ) to run the * code with. * * ICVAL (global output) INTEGER array * On entry, ICVAL is an array of dimension LDVAL. On exit, this * array contains the values of IC to run the code with. * * JCVAL (global output) INTEGER array * On entry, JCVAL is an array of dimension LDVAL. On exit, this * array contains the values of JC to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, SIDE, TRANSA, TRANSB, UPLO, * M, N, K, DESCA(:), IA, JA, DESCB(:), IB, JB, DESCC(:), IC, * JC. This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eleven. On * exit, if LTEST( i ) is .TRUE., the i-th Level 3 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) COMPLEX * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) COMPLEX * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+38*NMAT+NSUBS ) with NSUBS = 11. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 11 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, CGEBR2D, CGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PCBLAS3TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( SIDEVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( KVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( ICVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JCVAL ( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( SIDEVAL( J ) ) WORK( I+2 ) = ICHAR( TRNAVAL( J ) ) WORK( I+3 ) = ICHAR( TRNBVAL( J ) ) WORK( I+4 ) = ICHAR( UPLOVAL( J ) ) I = I + 5 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, KVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, ICVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JCVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 3 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex single precision '// $ 'Level 3 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) NMAT WRITE( NOUT, FMT = 9986 ) NBLOG WRITE( NOUT, FMT = 9991 ) NGRIDS WRITE( NOUT, FMT = 9989 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9989 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA WRITE( NOUT, FMT = 9993 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 38*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) SIDEVAL( J ) = CHAR( WORK( I+1 ) ) TRNAVAL( J ) = CHAR( WORK( I+2 ) ) TRNBVAL( J ) = CHAR( WORK( I+3 ) ) UPLOVAL( J ) = CHAR( WORK( I+4 ) ) I = I + 5 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, KVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, ICVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JCVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9993 FORMAT( 2X, 'Beta : (', G16.6, $ ',', G16.6, ')' ) 9992 FORMAT( 2X, 'Number of Tests : ', I6 ) 9991 FORMAT( 2X, 'Number of process grids : ', I6 ) 9990 FORMAT( 2X, ' : ', 5I6 ) 9989 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9987 FORMAT( 2X, ' ', A, A8 ) 9986 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PCBLA3TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/pcblastim.f000644 000766 000024 00000263135 10363532303 017676 0ustar00juliestaff000000 000000 SUBROUTINE PCLASCAL( TYPE, M, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TYPE INTEGER IA, JA, M, N COMPLEX ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLASCAL scales the m by n submatrix A(IA:IA+M-1,JA:JA+N-1) denoted * by sub( A ) by the scalar alpha. TYPE specifies if sub( A ) is full, * upper triangular, lower triangular or upper Hessenberg. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * On entry, TYPE specifies the type of the input submatrix as * follows: * = 'L' or 'l': sub( A ) is a lower triangular matrix, * = 'U' or 'u': sub( A ) is an upper triangular matrix, * = 'H' or 'h': sub( A ) is an upper Hessenberg matrix, * otherwise sub( A ) is a full matrix. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( A ) are overwritten by * the local entries of the m by n scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 UPLO LOGICAL GODOWN, GOLEFT, LOWER, UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, ITYPE, $ IUPP, JJA, JJMAX, JOFFA, JOFFD, LCMT, LCMT00, $ LDA, LMBLOC, LNBLOC, LOW, M1, MB, MBLKD, MBLKS, $ MBLOC, MP, MRCOL, MRROW, MYCOL, MYROW, N1, NB, $ NBLKD, NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, $ QNB, TMP1, UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CLASCAL, PB_DESCTRANS, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_NUMROC EXTERNAL LSAME, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 UPLO = TYPE UPPER = .FALSE. LOWER = .TRUE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 UPLO = TYPE UPPER = .TRUE. LOWER = .FALSE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 UPLO = 'U' UPPER = .TRUE. LOWER = .FALSE. IOFFD = 1 ELSE ITYPE = 0 UPLO = 'A' UPPER = .TRUE. LOWER = .TRUE. IOFFD = 0 END IF * * Compute local indexes * IF( ITYPE.EQ.0 ) THEN * * Full matrix * CALL PB_INFOG2L( IA, JA, DESCA2, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = PB_NUMROC( M, IA, DESCA2( IMB_ ), DESCA2( MB_ ), MYROW, $ DESCA2( RSRC_ ), NPROW ) NQ = PB_NUMROC( N, JA, DESCA2( INB_ ), DESCA2( NB_ ), MYCOL, $ DESCA2( CSRC_ ), NPCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * LDA = DESCA2( LLD_ ) IOFFA = IIA + ( JJA - 1 ) * LDA * CALL PB_CLASCAL( 'All', MP, NQ, 0, ALPHA, A( IOFFA ), LDA ) * ELSE * * Trapezoidal matrix * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, * LNBLOC, ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) LDA = DESCA2( LLD_ ) * CALL PB_BINFO( IOFFD, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * M1 = MP N1 = NQ IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and * update LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_CLASCAL( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_CLASCAL( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_CLASCAL( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_CLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_CLASCAL( UPLO, MBLOC, INBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_CLASCAL( 'All', TMP1, INBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_CLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_CLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_CLASCAL( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_CLASCAL( 'All', IMBLOC, TMP1, 0, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_CLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_CLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_CLASCAL( UPLO, MBLOC, NBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_CLASCAL( 'All', TMP1, NBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_CLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * END IF * RETURN * * End of PCLASCAL * END SUBROUTINE PCLAGEN( INPLACE, AFORM, DIAG, OFFA, M, N, IA, JA, $ DESCA, IASEED, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE CHARACTER*1 AFORM, DIAG INTEGER IA, IASEED, JA, LDA, M, N, OFFA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PCLAGEN generates (or regenerates) a submatrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the generated submatrix is diago- * nally dominant or not as follows: * DIAG = 'D' : sub( A ) is diagonally dominant, * DIAG = 'N' : sub( A ) is not diagonally dominant. * * OFFA (global input) INTEGER * On entry, OFFA specifies the offdiagonal of the underlying * matrix A(1:DESCA(M_),1:DESCA(N_)) of interest when the subma- * trix is symmetric, Hermitian or diagonally dominant. OFFA = 0 * specifies the main diagonal, OFFA > 0 specifies a subdiago- * nal, and OFFA < 0 specifies a superdiagonal (see further de- * tails). * * M (global input) INTEGER * On entry, M specifies the global number of matrix rows of the * submatrix sub( A ) to be generated. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of matrix columns of * the submatrix sub( A ) to be generated. N must be at least * zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IASEED (global input) INTEGER * On entry, IASEED specifies the seed number to generate the * matrix A. IASEED must be at least zero. * * A (local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). On exit, this array contains the * local entries of the randomly generated submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. When INPLACE is .FALSE., LDA is usually DESCA(LLD_). * This restriction is however not enforced, and this subroutine * requires only that LDA >= MAX( 1, Mp ) where * * Mp = PB_NUMROC( M, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ). * * PB_NUMROC is a ScaLAPACK tool function; MYROW, MYCOL, NPROW * and NPCOL can be determined by calling the BLACS subroutine * BLACS_GRIDINFO. * * Further Details * =============== * * OFFD is tied to the matrix described by DESCA, as opposed to the * piece that is currently (re)generated. This is a global information * independent from the distribution parameters. Below are examples of * the meaning of OFFD for a global 7 by 5 matrix: * * --------------------------------------------------------------------- * OFFD | 0 -1 -2 -3 -4 0 -1 -2 -3 -4 0 -1 -2 -3 -4 * -------|------------------------------------------------------------- * | | OFFD=-1 | OFFD=0 OFFD=2 * | V V * 0 | . d . . . -> d . . . . . . . . . * 1 | . . d . . . d . . . . . . . . * 2 | . . . d . . . d . . -> d . . . . * 3 | . . . . d . . . d . . d . . . * 4 | . . . . . . . . . d . . d . . * 5 | . . . . . . . . . . . . . d . * 6 | . . . . . . . . . . . . . . d * --------------------------------------------------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL DIAGDO, SYMM, HERM, NOTRAN INTEGER CSRC, I, IACOL, IAROW, ICTXT, IIA, ILOCBLK, $ ILOCOFF, ILOW, IMB, IMB1, IMBLOC, IMBVIR, INB, $ INB1, INBLOC, INBVIR, INFO, IOFFDA, ITMP, IUPP, $ IVIR, JJA, JLOCBLK, JLOCOFF, JVIR, LCMT00, $ LMBLOC, LNBLOC, LOW, MAXMN, MB, MBLKS, MP, $ MRCOL, MRROW, MYCDIST, MYCOL, MYRDIST, MYROW, $ NB, NBLKS, NPCOL, NPROW, NQ, NVIR, RSRC, UPP COMPLEX ALPHA * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ), IMULADD( 4, JMP_LEN ), $ IRAN( 2 ), JMP( JMP_LEN ), MULADD0( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CHKMAT, PB_CLAGEN, PB_DESCTRANS, PB_INITJMP, $ PB_INITMULADD, PB_JUMP, PB_JUMPIT, PB_LOCINFO, $ PB_SETLOCRAN, PB_SETRAN, PCLADOM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, REAL * .. * .. Data Statements .. DATA ( MULADD0( I ), I = 1, 4 ) / 20077, 16838, $ 12345, 0 / * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Test the input arguments * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1000 + CTXT_ ) ELSE SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) NOTRAN = LSAME( AFORM, 'N' ) DIAGDO = LSAME( DIAG, 'D' ) IF( .NOT.( SYMM.OR.HERM.OR.NOTRAN ) .AND. $ .NOT.( LSAME( AFORM, 'T' ) ) .AND. $ .NOT.( LSAME( AFORM, 'C' ) ) ) THEN INFO = -2 ELSE IF( ( .NOT.DIAGDO ) .AND. $ ( .NOT.LSAME( DIAG, 'N' ) ) ) THEN INFO = -3 END IF CALL PB_CHKMAT( ICTXT, M, 5, N, 6, IA, JA, DESCA2, 10, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCLAGEN', -INFO ) RETURN END IF * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) IMB = DESCA2( IMB_ ) INB = DESCA2( INB_ ) RSRC = DESCA2( RSRC_ ) CSRC = DESCA2( CSRC_ ) * * Figure out local information about the distributed matrix operand * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * IOFFDA = JA + OFFA - IA CALL PB_BINFO( IOFFDA, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * * Initialize ILOCBLK, ILOCOFF, MYRDIST, JLOCBLK, JLOCOFF, MYCDIST * This values correspond to the square virtual underlying matrix * of size MAX( M_ + MAX( 0, -OFFA ), N_ + MAX( 0, OFFA ) ) used * to set up the random sequence. For practical purposes, the size * of this virtual matrix is upper bounded by M_ + N_ - 1. * ITMP = MAX( 0, -OFFA ) IVIR = IA + ITMP IMBVIR = IMB + ITMP NVIR = DESCA2( M_ ) + ITMP * CALL PB_LOCINFO( IVIR, IMBVIR, MB, MYROW, RSRC, NPROW, ILOCBLK, $ ILOCOFF, MYRDIST ) * ITMP = MAX( 0, OFFA ) JVIR = JA + ITMP INBVIR = INB + ITMP NVIR = MAX( MAX( NVIR, DESCA2( N_ ) + ITMP ), $ DESCA2( M_ ) + DESCA2( N_ ) - 1 ) * CALL PB_LOCINFO( JVIR, INBVIR, NB, MYCOL, CSRC, NPCOL, JLOCBLK, $ JLOCOFF, MYCDIST ) * IF( SYMM .OR. HERM .OR. NOTRAN ) THEN * CALL PB_INITJMP( .TRUE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 2, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_CLAGEN( 'Lower', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( SYMM .OR. HERM .OR. ( .NOT. NOTRAN ) ) THEN * CALL PB_INITJMP( .FALSE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 2, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_CLAGEN( 'Upper', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( DIAGDO ) THEN * MAXMN = MAX( DESCA2( M_ ), DESCA2( N_ ) ) IF( HERM ) THEN ALPHA = CMPLX( REAL( 2 * MAXMN ), ZERO ) ELSE ALPHA = CMPLX( REAL( MAXMN ), REAL( MAXMN ) ) END IF * IF( IOFFDA.GE.0 ) THEN CALL PCLADOM( INPLACE, MIN( MAX( 0, M-IOFFDA ), N ), ALPHA, $ A, MIN( IA+IOFFDA, IA+M-1 ), JA, DESCA ) ELSE CALL PCLADOM( INPLACE, MIN( M, MAX( 0, N+IOFFDA ) ), ALPHA, $ A, IA, MIN( JA-IOFFDA, JA+N-1 ), DESCA ) END IF * END IF * RETURN * * End of PCLAGEN * END SUBROUTINE PCLADOM( INPLACE, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE INTEGER IA, JA, N COMPLEX ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLADOM adds alpha to the diagonal entries of an n by n submatrix * sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * N (global input) INTEGER * On entry, N specifies the global order of the submatrix * sub( A ) to be modified. N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the local entries * of this array corresponding to the main diagonal of sub( A ) * have been updated. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP COMPLEX ATMP * .. * .. Local Scalars .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, MAX, MIN, REAL * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.EQ.0 ) $ RETURN * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) * CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA+LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PCLADOM * END SUBROUTINE PB_CLASCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX ALPHA * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PB_CLASCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (input/output) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA * A( I, J ) 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = ALPHA * A( I, J ) 50 CONTINUE 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA * A( I, J ) 100 CONTINUE 110 CONTINUE * END IF * RETURN * * End of PB_CLASCAL * END SUBROUTINE PB_CLAGEN( UPLO, AFORM, A, LDA, LCMT00, IRAN, MBLKS, $ IMBLOC, MB, LMBLOC, NBLKS, INBLOC, NB, $ LNBLOC, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO, AFORM INTEGER IMBLOC, INBLOC, LCMT00, LDA, LMBLOC, LNBLOC, $ MB, MBLKS, NB, NBLKS * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PB_CLAGEN locally initializes an array A. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the lower (UPLO='L') trape- * zoidal part or the upper (UPLO='U') trapezoidal part is to be * generated when the matrix to be generated is symmetric or * Hermitian. For all the other values of AFORM, the value of * this input argument is ignored. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * A (local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, *). On exit, * this array contains the local entries of the randomly genera- * ted submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. LDA must be at least one. * * LCMT00 (global input) INTEGER * On entry, LCMT00 is the LCM value specifying the off-diagonal * of the underlying matrix of interest. LCMT00=0 specifies the * main diagonal, LCMT00 > 0 specifies a subdiagonal, LCMT00 < 0 * specifies superdiagonals. * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2 containing respec- * tively the 16-lower and 16-higher bits of the encoding of the * entry of the random sequence corresponding locally to the * first local array entry to generate. Usually, this array is * computed by PB_SETLOCRAN. * * MBLKS (local input) INTEGER * On entry, MBLKS specifies the local number of blocks of rows. * MBLKS is at least zero. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * LMBLOC (local input) INTEGER * On entry, LMBLOC specifies the number of rows (size) of the * local lowest blocks. LMBLOC is at least zero. * * NBLKS (local input) INTEGER * On entry, NBLKS specifies the local number of blocks of co- * lumns. NBLKS is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * LNBLOC (local input) INTEGER * On entry, LNBLOC specifies the number of columns (size) of * the local rightmost blocks. LNBLOC is at least zero. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the random matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IB, IBLK, II, IK, ITMP, JB, JBLK, JJ, JK, $ JTMP, LCMTC, LCMTR, LOW, MNB, UPP COMPLEX DUMMY * .. * .. Local Arrays .. INTEGER IB0( 2 ), IB1( 2 ), IB2( 2 ), IB3( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMPIT * .. * .. External Functions .. LOGICAL LSAME REAL PB_SRAND EXTERNAL LSAME, PB_SRAND * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, REAL * .. * .. Executable Statements .. * DO 10 I = 1, 2 IB1( I ) = IRAN( I ) IB2( I ) = IRAN( I ) IB3( I ) = IRAN( I ) 10 CONTINUE * IF( LSAME( AFORM, 'N' ) ) THEN * * Generate random matrix * JJ = 1 * DO 50 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * DO 40 JK = JJ, JJ + JB - 1 * II = 1 * DO 30 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * * Blocks are IB by JB * DO 20 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), PB_SRAND( 0 ) ) 20 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 30 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 40 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 50 CONTINUE * ELSE IF( LSAME( AFORM, 'T' ) ) THEN * * Generate the transpose of the matrix that would be normally * generated. * II = 1 * DO 90 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 80 IK = II, II + IB - 1 * JJ = 1 * DO 70 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 60 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), PB_SRAND( 0 ) ) 60 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 70 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 80 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 90 CONTINUE * ELSE IF( LSAME( AFORM, 'S' ) ) THEN * * Generate a symmetric matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 170 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 160 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 150 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 100 IK = II, II + IB - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 100 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 110 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 110 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 120 IK = II, ITMP - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 120 CONTINUE * DO 130 IK = ITMP, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 130 CONTINUE * END IF * ELSE * DO 140 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 140 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 150 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 160 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 170 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 250 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 240 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 230 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 180 JK = JJ, JJ + JB - 1 DUMMY = CMPLX( PB_SRAND( 0 ), PB_SRAND( 0 ) ) 180 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 190 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 190 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 200 JK = JJ, JTMP - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 200 CONTINUE * DO 210 JK = JTMP, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 210 CONTINUE * END IF * ELSE * DO 220 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 220 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 230 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 240 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 250 CONTINUE * END IF * ELSE IF( LSAME( AFORM, 'C' ) ) THEN * * Generate the conjugate transpose of the matrix that would be * normally generated. * II = 1 * DO 290 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 280 IK = II, II + IB - 1 * JJ = 1 * DO 270 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 260 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 260 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 270 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 280 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 290 CONTINUE * ELSE IF( LSAME( AFORM, 'H' ) ) THEN * * Generate a Hermitian matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 370 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 360 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 350 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 300 IK = II, II + IB - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 300 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 310 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 310 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 320 IK = II, ITMP - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 320 CONTINUE * IF( ITMP.LE.( II + IB - 1 ) ) THEN DUMMY = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) A( ITMP, JK ) = CMPLX( REAL( DUMMY ), $ ZERO ) END IF * DO 330 IK = ITMP + 1, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 330 CONTINUE * END IF * ELSE * DO 340 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 340 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 350 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 360 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 370 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 450 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 440 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 430 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 380 JK = JJ, JJ + JB - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 380 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 390 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 390 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 400 JK = JJ, JTMP - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 400 CONTINUE * IF( JTMP.LE.( JJ + JB - 1 ) ) THEN DUMMY = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) A( IK, JTMP ) = CMPLX( REAL( DUMMY ), $ ZERO ) END IF * DO 410 JK = JTMP + 1, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 410 CONTINUE * END IF * ELSE * DO 420 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 420 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 430 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 440 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 450 CONTINUE * END IF * END IF * RETURN * * End of PB_CLAGEN * END REAL FUNCTION PB_SRAND( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_SRAND generates the next number in the random sequence. This func- * tion ensures that this number will be in the interval ( -1.0, 1.0 ). * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. External Functions .. REAL PB_SRAN EXTERNAL PB_SRAN * .. * .. Executable Statements .. * PB_SRAND = ONE - TWO * PB_SRAN( IDUMM ) * RETURN * * End of PB_SRAND * END REAL FUNCTION PB_SRAN( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_SRAN generates the next number in the random sequence. * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL DIVFAC, POW16 PARAMETER ( DIVFAC = 2.147483648E+9, $ POW16 = 6.5536E+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Intrinsic Functions .. INTRINSIC REAL * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * PB_SRAN = ( REAL( IRAND( 1 ) ) + POW16 * REAL( IRAND( 2 ) ) ) / $ DIVFAC * CALL PB_LMUL( IRAND, IACS, J ) CALL PB_LADD( J, IACS( 3 ), IRAND ) * RETURN * * End of PB_SRAN * END scalapack-2.0.2/PBLAS/TIMING/PDBLAS1TIM.dat000644 000766 000024 00000002327 10363532303 017635 0ustar00juliestaff000000 000000 'Level 1 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PDBLAS1TIM.SUMM' output file name (if any) 6 device out 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q 2.0D0 value of ALPHA 2 number of tests problems 1000 1000 values of N 1000 1 values of M_X 1 1500 values of N_X 32 32 values of IMB_X 32 32 values of INB_X 32 32 values of MB_X 32 32 values of NB_X 0 0 values of RSRC_X 0 0 values of CSRC_X 1 1 values of IX 1 1 values of JX 1 1 values of INCX 1 1 values of M_Y 1000 1500 values of N_Y 32 32 values of IMB_Y 32 32 values of INB_Y 32 32 values of MB_Y 32 32 values of NB_Y 0 0 values of RSRC_Y 0 0 values of CSRC_Y 1 1 values of IY 1 1 values of JY 1 1 values of INCY PDSWAP T put F for no test in the same column PDSCAL T put F for no test in the same column PDCOPY T put F for no test in the same column PDAXPY T put F for no test in the same column PDDOT T put F for no test in the same column PDNRM2 T put F for no test in the same column PDASUM T put F for no test in the same column PDAMAX T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/pdblas1tim.f000644 000766 000024 00000122471 11750130340 017751 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PDSWAP ', 'PDSCAL ', 'PDCOPY ', $ 'PDAXPY ', 'PDDOT ', 'PDNRM2 ', $ 'PDASUM ', 'PDAMAX '/ END BLOCK DATA PROGRAM PDBLA1TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PDBLA1TIM is the main timing program for the Level 1 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 40 lines: * 'Level 1 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PDBLAS1TIM.SUMM' output file name (if any) * 6 device out * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0D0 value of ALPHA * 2 number of tests problems * 3 4 values of N * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PDSWAP T put F for no test in the same column * PDSCAL T put F for no test in the same column * PDCOPY T put F for no test in the same column * PDAXPY T put F for no test in the same column * PDDOT T put F for no test in the same column * PDNRM2 T put F for no test in the same column * PDASUM T put F for no test in the same column * PDAMAX T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * DBLESZ indicates the length in bytes on the given platform * for a double precision real. By default, DBLESZ is set to * eight. * * MEM DOUBLE PRECISION array * MEM is an array of dimension TOTMEM / DBLESZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, DBLESZ, TOTMEM, MEMSIZ, $ NSUBS PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, DBLESZ = 8, $ TOTMEM = 2000000, NSUBS = 8, $ MEMSIZ = TOTMEM / DBLESZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER CSRCX, CSRCY, I, IAM, ICTXT, IMBX, IMBY, IMIDX, $ IMIDY, INBX, INBY, INCX, INCY, IPOSTX, IPOSTY, $ IPREX, IPREY, IPX, IPY, IX, IXSEED, IY, IYSEED, $ J, JX, JY, K, MBX, MBY, MEMREQD, MPX, MPY, MX, $ MY, MYCOL, MYROW, N, NBX, NBY, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQX, NQY, NTESTS, NX, NY, $ PISCLR, RSRCX, RSRCY DOUBLE PRECISION ADDS, ALPHA, CFLOPS, MULTS, NOPS, PSCLR, $ PUSCLR, WFLOPS * .. * .. Local Arrays .. CHARACTER*80 OUTFILE LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) INTEGER CSCXVAL( MAXTESTS ), CSCYVAL( MAXTESTS ), $ DESCX( DLEN_ ), DESCY( DLEN_ ), IERR( 2 ), $ IMBXVAL( MAXTESTS ), IMBYVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JXVAL( MAXTESTS ), JYVAL( MAXTESTS ), $ MBXVAL( MAXTESTS ), MBYVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCXVAL( MAXTESTS ), $ RSCYVAL( MAXTESTS ) DOUBLE PRECISION CTIME( 1 ), MEM( MEMSIZ ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PDAMAX, PDASUM, PDAXPY, $ PDBLA1TIMINFO, PDCOPY, PDDOT, PDLAGEN, PDNRM2, $ PDSCAL, PDSWAP, PVDESCCHK, PVDIMCHK * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .FALSE., .TRUE., .TRUE., .TRUE., $ .FALSE., .FALSE., .FALSE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * Seeds for random matrix generations. * IXSEED = 100 IYSEED = 200 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDBLA1TIMINFO( OUTFILE, NOUT, NTESTS, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, RSCXVAL, $ CSCXVAL, IXVAL, JXVAL, INCXVAL, MYVAL, $ NYVAL, IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, INCYVAL, $ MAXTESTS, NGRIDS, PVAL, MAXGRIDS, QVAL, $ MAXGRIDS, LTEST, IAM, NPROCS, ALPHA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * N = NVAL( J ) MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9983 ) END IF * * Check the validity of the input and initialize DESC_ * CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, 0, 0, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, 0, 0, IERR( 2 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 ) $ GO TO 40 * * Assign pointers into MEM for matrices corresponding to * vectors X and Y. Ex: IPX starts at position MEM( 1 ). * IPX = 1 IPY = IPX + DESCX( LLD_ ) * NQX * * Check if sufficient memory. * MEMREQD = IPY + DESCY( LLD_ ) * NQY - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) MEMREQD*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) GO TO 40 END IF * * Loop over all PBLAS 1 routines * DO 30 K = 1, NSUBS * * Continue only if this sub has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * * Check the validity of the operand sizes * CALL PVDIMCHK( ICTXT, NOUT, N, 'X', IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, N, 'Y', IY, JY, DESCY, INCY, $ IERR( 2 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 ) $ GO TO 30 * * Generate distributed matrices X and Y * CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the PBLAS routine * IF( K.EQ.1 ) THEN * * Test PDSWAP * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PDSWAP( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.2 ) THEN * * Test PDSCAL * ADDS = 0.0D+0 MULTS = DBLE( N ) CALL PB_TIMER( 1 ) CALL PDSCAL( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.3 ) THEN * * Test PDCOPY * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PDCOPY( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.4 ) THEN * * Test PDAXPY * ADDS = DBLE( N ) MULTS = DBLE( N ) CALL PB_TIMER( 1 ) CALL PDAXPY( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.5 ) THEN * * Test PDDOT * ADDS = DBLE( N-1 ) MULTS = DBLE( N ) CALL PB_TIMER( 1 ) CALL PDDOT( N, PSCLR, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.6 ) THEN * * Test PDNRM2 * ADDS = DBLE( N-1 ) MULTS = DBLE( N ) CALL PB_TIMER( 1 ) CALL PDNRM2( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.7 ) THEN * * Test PDASUM * ADDS = DBLE( N - 1 ) MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PDASUM( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.8 ) THEN * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PDAMAX( N, PSCLR, PISCLR, MEM( IPX ), IX, JX, $ DESCX, INCX ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Calculate total flops * NOPS = ADDS + MULTS * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9984 ) SNAMES( K ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9988 ) J END IF * 50 CONTINUE * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, '---------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' N IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9993 FORMAT( 2X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I5,1X,I5,1X,I5,1X,I5,1X, $ I5,1X,I5,1X,I6 ) 9992 FORMAT( 2X, ' N IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9991 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9990 FORMAT( 'Not enough memory. Need: ', I12 ) 9988 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9987 FORMAT( 2X, 'End of Tests.' ) 9986 FORMAT( 2X, 'Tests started.' ) 9985 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9984 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9983 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PDBLA1TIM * END SUBROUTINE PDBLA1TIMINFO( SUMMRY, NOUT, NMAT, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, $ RSCXVAL, CSCXVAL, IXVAL, JXVAL, $ INCXVAL, MYVAL, NYVAL, IMBYVAL, MBYVAL, $ INBYVAL, NBYVAL, RSCYVAL, CSCYVAL, $ IYVAL, JYVAL, INCYVAL, LDVAL, NGRIDS, $ PVAL, LDPVAL, QVAL, LDQVAL, LTEST, IAM, $ NPROCS, ALPHA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NGRIDS, NMAT, NOUT, $ NPROCS DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY LOGICAL LTEST( * ) INTEGER CSCXVAL( LDVAL ), CSCYVAL( LDVAL ), $ IMBXVAL( LDVAL ), IMBYVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JXVAL( LDVAL ), $ JYVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCXVAL( LDVAL ), RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PDBLA1TIMINFO get the needed startup information for timing various * Level 1 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DESCX(:), IX, JX, INCX, DESCY(:), * IY, JY and INCY. This is also the maximum number of test * cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 1 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) DOUBLE PRECISION * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 2, 2*NGRIDS+23*NMAT+NSUBS ) with NSUBS = 8. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, DGEBR2D, DGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PDBLAS1TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 100 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 100 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA * READ( NIN, FMT = * ) ALPHA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 100 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 100 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, WORK, 2 ) * I = 1 CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 70 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 70 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 1 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Timing of the real double precision '// $ 'Level 1 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9989 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9989 ) SNAMES( 1 ), ' ... No ' END IF DO 80 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( I ), ' ... No ' END IF 80 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, WORK, 2, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) * I = 2*NGRIDS + 23*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 90 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 90 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 100 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : ', G16.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9989 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9988 FORMAT( 2X, ' ', A, A8 ) * * End of PDBLA1TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/PDBLAS2TIM.dat000644 000766 000024 00000005776 10363532303 017651 0ustar00juliestaff000000 000000 'Level 2 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PDBLAS2TIM.SUMM' output file name (if any) 6 device out 10 value of the logical computational blocksize NB 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q 2.0D0 value of ALPHA 3.0D0 value of BETA 16 number of tests problems 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' values of UPLO 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANS 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' values of DIAG 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 values of M 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 values of N 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_A 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_A 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_A 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IA 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JA 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_X 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_X 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_X 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IX 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JX 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of INCX 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_Y 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_Y 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IY 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JY 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of INCY PDGEMV T put F for no test in the same column PDSYMV T put F for no test in the same column PDTRMV T put F for no test in the same column PDTRSV T put F for no test in the same column PDGER T put F for no test in the same column PDSYR T put F for no test in the same column PDSYR2 T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/pdblas2tim.f000644 000766 000024 00000153533 11750130340 017755 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 7) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PDGEMV ', 'PDSYMV ', 'PDTRMV ', $ 'PDTRSV ', 'PDGER ', 'PDSYR ', $ 'PDSYR2 '/ END BLOCK DATA PROGRAM PDBLA2TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PDBLA2TIM is the main timing program for the Level 2 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 55 lines: * 'Level 2 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PDBLAS2TIM.SUMM' output file name (if any) * 6 device out * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0D0 value of ALPHA * 1.0D0 value of BETA * 2 number of tests problems * 'U' 'L' values of UPLO * 'N' 'T' values of TRANS * 'N' 'U' values of DIAG * 3 4 values of M * 3 4 values of N * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PDGEMV T put F for no test in the same column * PDSYMV T put F for no test in the same column * PDTRMV T put F for no test in the same column * PDTRSV T put F for no test in the same column * PDGER T put F for no test in the same column * PDSYR T put F for no test in the same column * PDSYR2 T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * DBLESZ indicates the length in bytes on the given platform * for a double precision real. By default, DBLESZ is set to * eight. * * MEM DOUBLE PRECISION array * MEM is an array of dimension TOTMEM / DBLESZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, DBLESZ, TOTMEM, MEMSIZ, $ NSUBS DOUBLE PRECISION ONE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, DBLESZ = 8, $ ONE = 1.0D+0, TOTMEM = 2000000, NSUBS = 7, $ MEMSIZ = TOTMEM / DBLESZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 AFORM, DIAG, DIAGDO, TRANS, UPLO INTEGER CSRCA, CSRCX, CSRCY, I, IA, IAM, IASEED, ICTXT, $ IMBA, IMBX, IMBY, IMIDA, IMIDX, IMIDY, INBA, $ INBX, INBY, INCX, INCY, IPA, IPOSTA, IPOSTX, $ IPOSTY, IPREA, IPREX, IPREY, IPX, IPY, IX, $ IXSEED, IY, IYSEED, J, JA, JX, JY, K, M, MA, $ MBA, MBX, MBY, MEMREQD, MPA, MPX, MPY, MX, MY, $ MYCOL, MYROW, N, NA, NBA, NBX, NBY, NCOLA, $ NGRIDS, NLX, NLY, NOUT, NPCOL, NPROCS, NPROW, $ NQA, NQX, NQY, NROWA, NTESTS, NX, NY, OFFD, $ RSRCA, RSRCX, RSRCY DOUBLE PRECISION ALPHA, BETA, CFLOPS, NOPS, SCALE, WFLOPS * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), TRANVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCXVAL( MAXTESTS ), $ CSCYVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ), $ IAVAL( MAXTESTS ), IERR( 3 ), $ IMBAVAL( MAXTESTS ), IMBXVAL( MAXTESTS ), $ IMBYVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JXVAL( MAXTESTS ), $ JYVAL( MAXTESTS ), MAVAL( MAXTESTS ), $ MBAVAL( MAXTESTS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NAVAL( MAXTESTS ), NBAVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCAVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) DOUBLE PRECISION CTIME( 1 ), MEM( MEMSIZ ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PDBLA2TIMINFO, PDGEMV, PDGER, $ PDLAGEN, PDLASCAL, PDSYMV, PDSYR, PDSYR2, $ PDTRMV, PDTRSV, PMDESCCHK, PMDIMCHK, PVDESCCHK, $ PVDIMCHK * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDOPBL2 EXTERNAL LSAME, PDOPBL2 * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .TRUE., .FALSE., .FALSE., $ .TRUE., .FALSE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .TRUE. * * Seeds for random matrix generations. * IASEED = 100 IXSEED = 200 IYSEED = 300 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDBLA2TIMINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, $ IAVAL, JAVAL, MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, $ CSCYVAL, IYVAL, JYVAL, INCYVAL, MAXTESTS, $ NGRIDS, PVAL, MAXGRIDS, QVAL, MAXGRIDS, $ NBLOG, LTEST, IAM, NPROCS, ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) TRANS = TRANVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) * MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, UPLO, TRANS, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9980 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' GO TO 40 END IF * IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANS' GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) TRANS WRITE( NOUT, FMT = 9997 ) 'DIAG' GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, 0, 0, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, 0, 0, IERR( 2 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, 0, 0, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN GO TO 40 END IF * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = 1 IPX = IPA + DESCA( LLD_ ) * NQA IPY = IPX + DESCX( LLD_ ) * NQX * * Check if sufficient memory. * MEMREQD = IPY + DESCY( LLD_ ) * NQY - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) MEMREQD*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) GO TO 40 END IF * * Loop over all PBLAS 2 routines * DO 30 K = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * * Define the size of the operands * IF( K.EQ.1 ) THEN NROWA = M NCOLA = N IF( LSAME( TRANS, 'N' ) ) THEN NLX = N NLY = M ELSE NLX = M NLY = N END IF ELSE IF( K.EQ.5 ) THEN NROWA = M NCOLA = N NLX = M NLY = N ELSE NROWA = N NCOLA = N NLX = N NLY = N END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLX, 'X', IX, JX, DESCX, $ INCX, IERR( 2 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLY, 'Y', IY, JY, DESCY, $ INCY, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN GO TO 30 END IF * * Generate distributed matrices A, X and Y * IF( K.EQ.2 .OR. K.EQ.6 .OR. K.EQ.7 ) THEN AFORM = 'S' DIAGDO = 'N' OFFD = IA - JA ELSE IF( ( K.EQ.4 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN AFORM = 'N' DIAGDO = 'D' OFFD = IA - JA ELSE AFORM = 'N' DIAGDO = 'N' OFFD = 0 END IF * CALL PDLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, $ 1, 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MY, $ NY, 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * IF( ( K.EQ.4 ).AND.( .NOT.( LSAME( DIAG, 'N' ) ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / DBLE( MAX( NROWA, NCOLA ) ) IF( LSAME( UPLO, 'L' ) ) THEN CALL PDLASCAL( 'Lower', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA+1, JA, DESCA ) ELSE CALL PDLASCAL( 'Upper', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA, JA+1, DESCA ) END IF END IF * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the Level 2 PBLAS routine * IF( K.EQ.1 ) THEN * * Test PDGEMV * CALL PB_TIMER( 1 ) CALL PDGEMV( TRANS, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.2 ) THEN * * Test PDSYMV * CALL PB_TIMER( 1 ) CALL PDSYMV( UPLO, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.3 ) THEN * * Test PDTRMV * CALL PB_TIMER( 1 ) CALL PDTRMV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.4 ) THEN * * Test PDTRSV * CALL PB_TIMER( 1 ) CALL PDTRSV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.5 ) THEN * * Test PDGER * CALL PB_TIMER( 1 ) CALL PDGER( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.6 ) THEN * * Test PDSYR * CALL PB_TIMER( 1 ) CALL PDSYR( UPLO, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.7 ) THEN * * Test PDSYR2 * CALL PB_TIMER( 1 ) CALL PDSYR2( UPLO, N, ALPHA, MEM( IPX ), IX, JX, $ DESCX, INCX, MEM( IPY ), IY, JY, DESCY, $ INCY, MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Calculate total flops * NOPS = PDOPBL2( SNAMES( K ), NROWA, NCOLA, 0, 0 ) * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9981 ) SNAMES( K ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' M N UPLO TRANS DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,9X,A1,11X,A1,10X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9989 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5,1X,I6 ) 9988 FORMAT( 2X, ' IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9987 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9986 FORMAT( 'Not enough memory. Need: ', I12 ) 9985 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9984 FORMAT( 2X, 'End of Tests.' ) 9983 FORMAT( 2X, 'Tests started.' ) 9982 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9981 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9980 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PDBLA2TIM * END SUBROUTINE PDBLA2TIMINFO( SUMMRY, NOUT, NMAT, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, $ IMBAVAL, MBAVAL, INBAVAL, NBAVAL, $ RSCAVAL, CSCAVAL, IAVAL, JAVAL, $ MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, $ IXVAL, JXVAL, INCXVAL, MYVAL, NYVAL, $ IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, $ INCYVAL, LDVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, NBLOG, LTEST, IAM, NPROCS, $ ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NBLOG, NGRIDS, $ NMAT, NOUT, NPROCS DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), TRANVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCXVAL( LDVAL ), $ CSCYVAL( LDVAL ), IAVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBXVAL( LDVAL ), $ IMBYVAL( LDVAL ), INBAVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JAVAL( LDVAL ), $ JXVAL( LDVAL ), JYVAL( LDVAL ), MAVAL( LDVAL ), $ MBAVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NAVAL( LDVAL ), $ NBAVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCXVAL( LDVAL ), $ RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PDBLA2TIMINFO get the needed startup information for timing various * Level 2 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * TRANVAL (global output) CHARACTER array * On entry, TRANVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANS to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, TRANS, UPLO, M, N, DESCA(:), * IA, JA, DESCX(:), IX, JX, INCX, DESCY(:), IY, JY and INCY. * This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least seven. On * exit, if LTEST( i ) is .TRUE., the i-th Level 2 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) DOUBLE PRECISION * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) DOUBLE PRECISION * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+37*NMAT+NSUBS ) with NSUBS = 7. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 7 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, DGEBR2D, DGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PDBLAS2TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRANVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( TRANVAL( J ) ) WORK( I+2 ) = ICHAR( UPLOVAL( J ) ) I = I + 3 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 2 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real double precision '// $ 'Level 2 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) NMAT WRITE( NOUT, FMT = 9986 ) NBLOG WRITE( NOUT, FMT = 9991 ) NGRIDS WRITE( NOUT, FMT = 9989 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9989 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA WRITE( NOUT, FMT = 9993 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 1, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 37*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) TRANVAL( J ) = CHAR( WORK( I+1 ) ) UPLOVAL( J ) = CHAR( WORK( I+2 ) ) I = I + 3 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : ', G16.6 ) 9993 FORMAT( 2X, 'Beta : ', G16.6 ) 9992 FORMAT( 2X, 'Number of Tests : ', I6 ) 9991 FORMAT( 2X, 'Number of process grids : ', I6 ) 9990 FORMAT( 2X, ' : ', 5I6 ) 9989 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9987 FORMAT( 2X, ' ', A, A8 ) 9986 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PDBLA2TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/PDBLAS3TIM.dat000644 000766 000024 00000004324 10363532303 017636 0ustar00juliestaff000000 000000 'Level 3 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PDBLAS3TIM.SUMM' output file name (if any) 6 device out 10 value of the logical computational blocksize NB 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q 2.0D0 value of ALPHA 3.0D0 value of BETA 8 number of tests problems 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' values of DIAG 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' values of SIDE 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANSA 'N' 'N' 'T' 'T' 'N' 'N' 'T' 'T' values of TRANSB 'U' 'L' 'U' 'L' 'U' 'L' 'U' 'L' values of UPLO 3 4 3 4 3 4 3 4 values of M 3 4 3 4 3 4 3 4 values of N 3 4 3 4 3 4 3 4 values of K 6 10 6 10 6 10 6 10 values of M_A 6 10 6 10 6 10 6 10 values of N_A 2 5 2 5 2 5 2 5 values of IMB_A 2 5 2 5 2 5 2 5 values of INB_A 2 5 2 5 2 5 2 5 values of MB_A 2 5 2 5 2 5 2 5 values of NB_A 0 1 0 1 0 1 0 1 values of RSRC_A 0 0 0 0 0 0 0 0 values of CSRC_A 1 1 1 1 1 1 1 1 values of IA 1 1 1 1 1 1 1 1 values of JA 6 10 6 10 6 10 6 10 values of M_B 6 10 6 10 6 10 6 10 values of N_B 2 5 2 5 2 5 2 5 values of IMB_B 2 5 2 5 2 5 2 5 values of INB_B 2 5 2 5 2 5 2 5 values of MB_B 2 5 2 5 2 5 2 5 values of NB_B 0 1 0 1 0 1 0 1 values of RSRC_B 0 0 0 0 0 0 0 0 values of CSRC_B 1 1 1 1 1 1 1 1 values of IB 1 1 1 1 1 1 1 1 values of JB 6 10 6 10 6 10 6 10 values of M_C 6 10 6 10 6 10 6 10 values of N_C 2 5 2 5 2 5 2 5 values of IMB_C 2 5 2 5 2 5 2 5 values of INB_C 2 5 2 5 2 5 2 5 values of MB_C 2 5 2 5 2 5 2 5 values of NB_C 0 1 0 1 0 1 0 1 values of RSRC_C 0 0 0 0 0 0 0 0 values of CSRC_C 1 1 1 1 1 1 1 1 values of IC 1 1 1 1 1 1 1 1 values of JC PDGEMM T put F for no test in the same column PDSYMM T put F for no test in the same column PDSYRK T put F for no test in the same column PDSYR2K T put F for no test in the same column PDTRMM T put F for no test in the same column PDTRSM T put F for no test in the same column PDGEADD T put F for no test in the same column PDTRADD T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/pdblas3tim.f000644 000766 000024 00000166132 11750130340 017755 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PDGEMM ', 'PDSYMM ', 'PDSYRK ', $ 'PDSYR2K', 'PDTRMM ', 'PDTRSM ', $ 'PDGEADD', 'PDTRADD'/ END BLOCK DATA PROGRAM PDBLA3TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PDBLA3TIM is the main timing program for the Level 3 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 56 lines: * 'Level 3 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PDBLAS3TIM.SUMM' output file name (if any) * 6 device out * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0D0 value of ALPHA * 1.0D0 value of BETA * 2 number of tests problems * 'N' 'U' values of DIAG * 'L' 'R' values of SIDE * 'N' 'T' values of TRANSA * 'N' 'T' values of TRANSB * 'U' 'L' values of UPLO * 3 4 values of M * 3 4 values of N * 3 4 values of K * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_B * 6 10 values of N_B * 2 5 values of IMB_B * 2 5 values of INB_B * 2 5 values of MB_B * 2 5 values of NB_B * 0 1 values of RSRC_B * 0 0 values of CSRC_B * 1 1 values of IB * 1 1 values of JB * 6 10 values of M_C * 6 10 values of N_C * 2 5 values of IMB_C * 2 5 values of INB_C * 2 5 values of MB_C * 2 5 values of NB_C * 0 1 values of RSRC_C * 0 0 values of CSRC_C * 1 1 values of IC * 1 1 values of JC * PDGEMM T put F for no test in the same column * PDSYMM T put F for no test in the same column * PDSYRK T put F for no test in the same column * PDSYR2K T put F for no test in the same column * PDTRMM T put F for no test in the same column * PDTRSM T put F for no test in the same column * PDGEADD T put F for no test in the same column * PDTRADD T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * DBLESZ indicates the length in bytes on the given platform * for a double precision real. By default, DBLESZ is set to * eight. * * MEM DOUBLE PRECISION array * MEM is an array of dimension TOTMEM / DBLESZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, DBLESZ, TOTMEM, MEMSIZ, $ NSUBS DOUBLE PRECISION ONE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, DBLESZ = 8, $ ONE = 1.0D+0, TOTMEM = 2000000, NSUBS = 8, $ MEMSIZ = TOTMEM / DBLESZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 ADIAGDO, AFORM, CFORM, DIAG, SIDE, TRANSA, $ TRANSB, UPLO INTEGER CSRCA, CSRCB, CSRCC, I, IA, IAM, IASEED, IB, $ IBSEED, IC, ICSEED, ICTXT, IMBA, IMBB, IMBC, $ IMIDA, IMIDB, IMIDC, INBA, INBB, INBC, IPA, $ IPB, IPC, IPOSTA, IPOSTB, IPOSTC, IPREA, IPREB, $ IPREC, J, JA, JB, JC, K, L, M, MA, MB, MBA, $ MBB, MBC, MC, MEMREQD, MPA, MPB, MPC, MYCOL, $ MYROW, N, NA, NB, NBA, NBB, NBC, NC, NCOLA, $ NCOLB, NCOLC, NGRIDS, NOUT, NPCOL, NPROCS, $ NPROW, NQA, NQB, NQC, NROWA, NROWB, NROWC, $ NTESTS, OFFDA, OFFDC, RSRCA, RSRCB, RSRCC DOUBLE PRECISION ALPHA, BETA, CFLOPS, NOPS, SCALE, WFLOPS * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), BCHECK( NSUBS ), $ CCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), SIDEVAL( MAXTESTS ), $ TRNAVAL( MAXTESTS ), TRNBVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCBVAL( MAXTESTS ), $ CSCCVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCB( DLEN_ ), DESCC( DLEN_ ), $ IAVAL( MAXTESTS ), IBVAL( MAXTESTS ), $ ICVAL( MAXTESTS ), IERR( 3 ), $ IMBAVAL( MAXTESTS ), IMBBVAL( MAXTESTS ), $ IMBCVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBBVAL( MAXTESTS ), INBCVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JBVAL( MAXTESTS ), $ JCVAL( MAXTESTS ), KVAL( MAXTESTS ), $ MAVAL( MAXTESTS ), MBAVAL( MAXTESTS ), $ MBBVAL( MAXTESTS ), MBCVAL( MAXTESTS ), $ MBVAL( MAXTESTS ), MCVAL( MAXTESTS ), $ MVAL( MAXTESTS ), NAVAL( MAXTESTS ), $ NBAVAL( MAXTESTS ), NBBVAL( MAXTESTS ), $ NBCVAL( MAXTESTS ), NBVAL( MAXTESTS ), $ NCVAL( MAXTESTS ), NVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCAVAL( MAXTESTS ), RSCBVAL( MAXTESTS ), $ RSCCVAL( MAXTESTS ) DOUBLE PRECISION CTIME( 1 ), MEM( MEMSIZ ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PDBLA3TIMINFO, PDGEADD, PDGEMM, $ PDLAGEN, PDLASCAL, PDSYMM, PDSYR2K, PDSYRK, $ PDTRADD, PDTRMM, PDTRSM, PMDESCCHK, PMDIMCHK * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDOPBL3 EXTERNAL LSAME, PDOPBL3 * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA BCHECK/.TRUE., .TRUE., .FALSE., .TRUE., .TRUE., $ .TRUE., .FALSE., .FALSE./ DATA CCHECK/.TRUE., .TRUE., .TRUE., .TRUE., .FALSE., $ .FALSE., .TRUE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * Seeds for random matrix generations. * IASEED = 100 IBSEED = 200 ICSEED = 300 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDBLA3TIMINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, NVAL, $ KVAL, MAVAL, NAVAL, IMBAVAL, MBAVAL, $ INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, IAVAL, $ JAVAL, MBVAL, NBVAL, IMBBVAL, MBBVAL, $ INBBVAL, NBBVAL, RSCBVAL, CSCBVAL, IBVAL, $ JBVAL, MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, ICVAL, $ JCVAL, MAXTESTS, NGRIDS, PVAL, MAXGRIDS, $ QVAL, MAXGRIDS, NBLOG, LTEST, IAM, NPROCS, $ ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) SIDE = SIDEVAL( J ) TRANSA = TRNAVAL( J ) TRANSB = TRNBVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) K = KVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MB = MBVAL( J ) NB = NBVAL( J ) IMBB = IMBBVAL( J ) MBB = MBBVAL( J ) INBB = INBBVAL( J ) NBB = NBBVAL( J ) RSRCB = RSCBVAL( J ) CSRCB = CSCBVAL( J ) IB = IBVAL( J ) JB = JBVAL( J ) * MC = MCVAL( J ) NC = NCVAL( J ) IMBC = IMBCVAL( J ) MBC = MBCVAL( J ) INBC = INBCVAL( J ) NBC = NBCVAL( J ) RSRCC = RSCCVAL( J ) CSRCC = CSCCVAL( J ) IC = ICVAL( J ) JC = JCVAL( J ) * IF( IAM.EQ.0 ) THEN * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, K, SIDE, UPLO, TRANSA, $ TRANSB, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IB, JB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IC, JC, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9980 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( SIDE, 'L' ).AND. $ .NOT.LSAME( SIDE, 'R' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'SIDE' GO TO 40 END IF * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' GO TO 40 END IF * IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSA' GO TO 40 END IF * IF( .NOT.LSAME( TRANSB, 'N' ).AND. $ .NOT.LSAME( TRANSB, 'T' ).AND. $ .NOT.LSAME( TRANSB, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSB' GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'DIAG' GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, 0, 0, IERR( 1 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'B', DESCB, $ BLOCK_CYCLIC_2D_INB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB, MPB, NQB, IPREB, $ IMIDB, IPOSTB, 0, 0, IERR( 2 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'C', DESCC, $ BLOCK_CYCLIC_2D_INB, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC, MPC, NQC, IPREC, $ IMIDC, IPOSTC, 0, 0, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN GO TO 40 END IF * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPB = IPA + DESCA( LLD_ )*NQA IPC = IPB + DESCB( LLD_ )*NQB * * Check if sufficient memory. * MEMREQD = IPC + DESCC( LLD_ )*NQC - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) MEMREQD*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) GO TO 40 END IF * * Loop over all PBLAS 3 routines * DO 30 L = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( L ) ) $ GO TO 30 * * Define the size of the operands * IF( L.EQ.1 ) THEN * * PDGEMM * NROWC = M NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF( LSAME( TRANSB, 'N' ) ) THEN NROWB = K NCOLB = N ELSE NROWB = N NCOLB = K END IF ELSE IF( L.EQ.2 ) THEN * * PDSYMM * NROWC = M NCOLC = N NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF ELSE IF( L.EQ.3 ) THEN * * PDSYRK * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K ELSE NROWA = K NCOLA = N END IF NROWB = 0 NCOLB = 0 ELSE IF( L.EQ.4 ) THEN * * PDSYR2K * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K NROWB = N NCOLB = K ELSE NROWA = K NCOLA = N NROWB = K NCOLB = N END IF ELSE IF( L.EQ.5 .OR. L.EQ.6 ) THEN * * PDTRMM, PDTRSM * NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF NROWC = 0 NCOLC = 0 ELSE IF( L.EQ.7 .OR. L.EQ.8 ) THEN * * PDGEADD, PDTRADD * IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = N ELSE NROWA = N NCOLA = M END IF NROWC = M NCOLC = N NROWB = 0 NCOLB = 0 * END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWB, NCOLB, 'B', IB, JB, $ DESCB, IERR( 2 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWC, NCOLC, 'C', IC, JC, $ DESCC, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN GO TO 30 END IF * * Generate distributed matrices A, B and C * IF( L.EQ.2 ) THEN * * PDSYMM * AFORM = 'S' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.3 .OR. L.EQ.4 ) THEN * * PDSYRK, PDSYR2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'S' OFFDC = IC - JC * ELSE IF( ( L.EQ.6 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN * * PDTRSM * AFORM = 'N' ADIAGDO = 'D' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE * * Default values * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'N' OFFDC = 0 * END IF * CALL PDLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) IF( ( L.EQ.6 ).AND.( .NOT.( LSAME( DIAG, 'N' ) ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / DBLE( MAX( NROWA, NCOLA ) ) IF( LSAME( UPLO, 'L' ) ) THEN CALL PDLASCAL( 'Lower', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA+1, JA, DESCA ) ELSE CALL PDLASCAL( 'Upper', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA, JA+1, DESCA ) END IF * END IF * IF( BCHECK( L ) ) $ CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCB, IBSEED, MEM( IPB ), $ DESCB( LLD_ ) ) * IF( CCHECK( L ) ) $ CALL PDLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCC, ICSEED, MEM( IPC ), $ DESCC( LLD_ ) ) * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the Level 3 PBLAS routine * IF( L.EQ.1 ) THEN * * Test PDGEMM * NOPS = PDOPBL3( SNAMES( L ), M, N, K ) * CALL PB_TIMER( 1 ) CALL PDGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.2 ) THEN * * Test PDSYMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PDSYMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.3 ) THEN * * Test PDSYRK * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PDSYRK( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.4 ) THEN * * Test PDSYR2K * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PDSYR2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.5 ) THEN * * Test PDTRMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PDTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.6 ) THEN * * Test PDTRSM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PDTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.7 ) THEN * * Test PDGEADD * NOPS = PDOPBL3( SNAMES( L ), M, N, M ) * CALL PB_TIMER( 1 ) CALL PDGEADD( TRANSA, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.8 ) THEN * * Test PDTRADD * IF( LSAME( UPLO, 'U' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PDTRADD( UPLO, TRANSA, M, N, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9981 ) SNAMES( L ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9986 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '-------------------' ) 9994 FORMAT( 2X, ' M N K SIDE UPLO TRANSA ', $ 'TRANSB DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,1X,I6,6X,A1,5X,A1,7X,A1,7X,A1,5X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IB JB MB NB IMBB INBB', $ ' MBB NBB RSRCB CSRCB' ) 9989 FORMAT( 2X, ' IC JC MC NC IMBC INBC', $ ' MBC NBC RSRCC CSRCC' ) 9988 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9987 FORMAT( 'Not enough memory. Need: ', I12 ) 9986 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9985 FORMAT( 2X, 'End of Tests.' ) 9984 FORMAT( 2X, 'Tests started.' ) 9983 FORMAT( 5X, A, ' ***** ', A, ' has an incorrect value: ', $ ' BYPASS *****' ) 9982 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9981 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9980 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PDBLA3TIM * END SUBROUTINE PDBLA3TIMINFO( SUMMRY, NOUT, NMAT, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, $ NVAL, KVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, $ CSCAVAL, IAVAL, JAVAL, MBVAL, NBVAL, $ IMBBVAL, MBBVAL, INBBVAL, NBBVAL, $ RSCBVAL, CSCBVAL, IBVAL, JBVAL, $ MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, $ ICVAL, JCVAL, LDVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, NBLOG, LTEST, $ IAM, NPROCS, ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NBLOG, NGRIDS, $ NMAT, NOUT, NPROCS DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), SIDEVAL( LDVAL ), $ TRNAVAL( LDVAL ), TRNBVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCBVAL( LDVAL ), $ CSCCVAL( LDVAL ), IAVAL( LDVAL ), $ IBVAL( LDVAL ), ICVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBBVAL( LDVAL ), $ IMBCVAL( LDVAL ), INBAVAL( LDVAL ), $ INBBVAL( LDVAL ), INBCVAL( LDVAL ), $ JAVAL( LDVAL ), JBVAL( LDVAL ), JCVAL( LDVAL ), $ KVAL( LDVAL ), MAVAL( LDVAL ), MBAVAL( LDVAL ), $ MBBVAL( LDVAL ), MBCVAL( LDVAL ), $ MBVAL( LDVAL ), MCVAL( LDVAL ), MVAL( LDVAL ), $ NAVAL( LDVAL ), NBAVAL( LDVAL ), $ NBBVAL( LDVAL ), NBCVAL( LDVAL ), $ NBVAL( LDVAL ), NCVAL( LDVAL ), NVAL( LDVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCBVAL( LDVAL ), $ RSCCVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PDBLA3TIMINFO get the needed startup information for timing various * Level 3 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * SIDEVAL (global output) CHARACTER array * On entry, SIDEVAL is an array of dimension LDVAL. On exit, * this array contains the values of SIDE to run the code with. * * TRNAVAL (global output) CHARACTER array * On entry, TRNAVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSA to run the code * with. * * TRNBVAL (global output) CHARACTER array * On entry, TRNBVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSB to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * KVAL (global output) INTEGER array * On entry, KVAL is an array of dimension LDVAL. On exit, this * array contains the values of K to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MBVAL (global output) INTEGER array * On entry, MBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( M_ ) to run the code * with. * * NBVAL (global output) INTEGER array * On entry, NBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( N_ ) to run the code * with. * * IMBBVAL (global output) INTEGER array * On entry, IMBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( IMB_ ) to run the * code with. * * MBBVAL (global output) INTEGER array * On entry, MBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( MB_ ) to run the * code with. * * INBBVAL (global output) INTEGER array * On entry, INBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( INB_ ) to run the * code with. * * NBBVAL (global output) INTEGER array * On entry, NBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( NB_ ) to run the * code with. * * RSCBVAL (global output) INTEGER array * On entry, RSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( RSRC_ ) to run the * code with. * * CSCBVAL (global output) INTEGER array * On entry, CSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( CSRC_ ) to run the * code with. * * IBVAL (global output) INTEGER array * On entry, IBVAL is an array of dimension LDVAL. On exit, this * array contains the values of IB to run the code with. * * JBVAL (global output) INTEGER array * On entry, JBVAL is an array of dimension LDVAL. On exit, this * array contains the values of JB to run the code with. * * MCVAL (global output) INTEGER array * On entry, MCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( M_ ) to run the code * with. * * NCVAL (global output) INTEGER array * On entry, NCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( N_ ) to run the code * with. * * IMBCVAL (global output) INTEGER array * On entry, IMBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( IMB_ ) to run the * code with. * * MBCVAL (global output) INTEGER array * On entry, MBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( MB_ ) to run the * code with. * * INBCVAL (global output) INTEGER array * On entry, INBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( INB_ ) to run the * code with. * * NBCVAL (global output) INTEGER array * On entry, NBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( NB_ ) to run the * code with. * * RSCCVAL (global output) INTEGER array * On entry, RSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( RSRC_ ) to run the * code with. * * CSCCVAL (global output) INTEGER array * On entry, CSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( CSRC_ ) to run the * code with. * * ICVAL (global output) INTEGER array * On entry, ICVAL is an array of dimension LDVAL. On exit, this * array contains the values of IC to run the code with. * * JCVAL (global output) INTEGER array * On entry, JCVAL is an array of dimension LDVAL. On exit, this * array contains the values of JC to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, SIDE, TRANSA, TRANSB, UPLO, * M, N, K, DESCA(:), IA, JA, DESCB(:), IB, JB, DESCC(:), IC, * JC. This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 3 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) DOUBLE PRECISION * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) DOUBLE PRECISION * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+38*NMAT+NSUBS ) with NSUBS = 8. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, DGEBR2D, DGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PDBLAS3TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( SIDEVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( KVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( ICVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JCVAL ( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( SIDEVAL( J ) ) WORK( I+2 ) = ICHAR( TRNAVAL( J ) ) WORK( I+3 ) = ICHAR( TRNBVAL( J ) ) WORK( I+4 ) = ICHAR( UPLOVAL( J ) ) I = I + 5 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, KVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, ICVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JCVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 3 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real double precision '// $ 'Level 3 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) NMAT WRITE( NOUT, FMT = 9986 ) NBLOG WRITE( NOUT, FMT = 9991 ) NGRIDS WRITE( NOUT, FMT = 9989 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9989 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA WRITE( NOUT, FMT = 9993 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 38*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) SIDEVAL( J ) = CHAR( WORK( I+1 ) ) TRNAVAL( J ) = CHAR( WORK( I+2 ) ) TRNBVAL( J ) = CHAR( WORK( I+3 ) ) UPLOVAL( J ) = CHAR( WORK( I+4 ) ) I = I + 5 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, KVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, ICVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JCVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : ', G16.6 ) 9993 FORMAT( 2X, 'Beta : ', G16.6 ) 9992 FORMAT( 2X, 'Number of Tests : ', I6 ) 9991 FORMAT( 2X, 'Number of process grids : ', I6 ) 9990 FORMAT( 2X, ' : ', 5I6 ) 9989 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9987 FORMAT( 2X, ' ', A, A8 ) 9986 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PDBLA3TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/pdblastim.f000644 000766 000024 00000230154 10363532303 017672 0ustar00juliestaff000000 000000 SUBROUTINE PDLASCAL( TYPE, M, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TYPE INTEGER IA, JA, M, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLASCAL scales the m by n submatrix A(IA:IA+M-1,JA:JA+N-1) denoted * by sub( A ) by the scalar alpha. TYPE specifies if sub( A ) is full, * upper triangular, lower triangular or upper Hessenberg. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * On entry, TYPE specifies the type of the input submatrix as * follows: * = 'L' or 'l': sub( A ) is a lower triangular matrix, * = 'U' or 'u': sub( A ) is an upper triangular matrix, * = 'H' or 'h': sub( A ) is an upper Hessenberg matrix, * otherwise sub( A ) is a full matrix. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( A ) are overwritten by * the local entries of the m by n scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 UPLO LOGICAL GODOWN, GOLEFT, LOWER, UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, ITYPE, $ IUPP, JJA, JJMAX, JOFFA, JOFFD, LCMT, LCMT00, $ LDA, LMBLOC, LNBLOC, LOW, M1, MB, MBLKD, MBLKS, $ MBLOC, MP, MRCOL, MRROW, MYCOL, MYROW, N1, NB, $ NBLKD, NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, $ QNB, TMP1, UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS, PB_DLASCAL, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_NUMROC EXTERNAL LSAME, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 UPLO = TYPE UPPER = .FALSE. LOWER = .TRUE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 UPLO = TYPE UPPER = .TRUE. LOWER = .FALSE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 UPLO = 'U' UPPER = .TRUE. LOWER = .FALSE. IOFFD = 1 ELSE ITYPE = 0 UPLO = 'A' UPPER = .TRUE. LOWER = .TRUE. IOFFD = 0 END IF * * Compute local indexes * IF( ITYPE.EQ.0 ) THEN * * Full matrix * CALL PB_INFOG2L( IA, JA, DESCA2, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = PB_NUMROC( M, IA, DESCA2( IMB_ ), DESCA2( MB_ ), MYROW, $ DESCA2( RSRC_ ), NPROW ) NQ = PB_NUMROC( N, JA, DESCA2( INB_ ), DESCA2( NB_ ), MYCOL, $ DESCA2( CSRC_ ), NPCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * LDA = DESCA2( LLD_ ) IOFFA = IIA + ( JJA - 1 ) * LDA * CALL PB_DLASCAL( 'All', MP, NQ, 0, ALPHA, A( IOFFA ), LDA ) * ELSE * * Trapezoidal matrix * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, * LNBLOC, ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) LDA = DESCA2( LLD_ ) * CALL PB_BINFO( IOFFD, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * M1 = MP N1 = NQ IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and * update LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_DLASCAL( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_DLASCAL( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_DLASCAL( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_DLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_DLASCAL( UPLO, MBLOC, INBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_DLASCAL( 'All', TMP1, INBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_DLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_DLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_DLASCAL( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_DLASCAL( 'All', IMBLOC, TMP1, 0, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_DLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_DLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_DLASCAL( UPLO, MBLOC, NBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_DLASCAL( 'All', TMP1, NBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_DLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * END IF * RETURN * * End of PDLASCAL * END SUBROUTINE PDLAGEN( INPLACE, AFORM, DIAG, OFFA, M, N, IA, JA, $ DESCA, IASEED, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE CHARACTER*1 AFORM, DIAG INTEGER IA, IASEED, JA, LDA, M, N, OFFA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PDLAGEN generates (or regenerates) a submatrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the generated submatrix is diago- * nally dominant or not as follows: * DIAG = 'D' : sub( A ) is diagonally dominant, * DIAG = 'N' : sub( A ) is not diagonally dominant. * * OFFA (global input) INTEGER * On entry, OFFA specifies the offdiagonal of the underlying * matrix A(1:DESCA(M_),1:DESCA(N_)) of interest when the subma- * trix is symmetric, Hermitian or diagonally dominant. OFFA = 0 * specifies the main diagonal, OFFA > 0 specifies a subdiago- * nal, and OFFA < 0 specifies a superdiagonal (see further de- * tails). * * M (global input) INTEGER * On entry, M specifies the global number of matrix rows of the * submatrix sub( A ) to be generated. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of matrix columns of * the submatrix sub( A ) to be generated. N must be at least * zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IASEED (global input) INTEGER * On entry, IASEED specifies the seed number to generate the * matrix A. IASEED must be at least zero. * * A (local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). On exit, this array contains the * local entries of the randomly generated submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. When INPLACE is .FALSE., LDA is usually DESCA(LLD_). * This restriction is however not enforced, and this subroutine * requires only that LDA >= MAX( 1, Mp ) where * * Mp = PB_NUMROC( M, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ). * * PB_NUMROC is a ScaLAPACK tool function; MYROW, MYCOL, NPROW * and NPCOL can be determined by calling the BLACS subroutine * BLACS_GRIDINFO. * * Further Details * =============== * * OFFD is tied to the matrix described by DESCA, as opposed to the * piece that is currently (re)generated. This is a global information * independent from the distribution parameters. Below are examples of * the meaning of OFFD for a global 7 by 5 matrix: * * --------------------------------------------------------------------- * OFFD | 0 -1 -2 -3 -4 0 -1 -2 -3 -4 0 -1 -2 -3 -4 * -------|------------------------------------------------------------- * | | OFFD=-1 | OFFD=0 OFFD=2 * | V V * 0 | . d . . . -> d . . . . . . . . . * 1 | . . d . . . d . . . . . . . . * 2 | . . . d . . . d . . -> d . . . . * 3 | . . . . d . . . d . . d . . . * 4 | . . . . . . . . . d . . d . . * 5 | . . . . . . . . . . . . . d . * 6 | . . . . . . . . . . . . . . d * --------------------------------------------------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. LOGICAL DIAGDO, SYMM, HERM, NOTRAN INTEGER CSRC, I, IACOL, IAROW, ICTXT, IIA, ILOCBLK, $ ILOCOFF, ILOW, IMB, IMB1, IMBLOC, IMBVIR, INB, $ INB1, INBLOC, INBVIR, INFO, IOFFDA, ITMP, IUPP, $ IVIR, JJA, JLOCBLK, JLOCOFF, JVIR, LCMT00, $ LMBLOC, LNBLOC, LOW, MAXMN, MB, MBLKS, MP, $ MRCOL, MRROW, MYCDIST, MYCOL, MYRDIST, MYROW, $ NB, NBLKS, NPCOL, NPROW, NQ, NVIR, RSRC, UPP DOUBLE PRECISION ALPHA * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ), IMULADD( 4, JMP_LEN ), $ IRAN( 2 ), JMP( JMP_LEN ), MULADD0( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CHKMAT, PB_DESCTRANS, PB_DLAGEN, PB_INITJMP, $ PB_INITMULADD, PB_JUMP, PB_JUMPIT, PB_LOCINFO, $ PB_SETLOCRAN, PB_SETRAN, PDLADOM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA ( MULADD0( I ), I = 1, 4 ) / 20077, 16838, $ 12345, 0 / * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Test the input arguments * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1000 + CTXT_ ) ELSE SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) NOTRAN = LSAME( AFORM, 'N' ) DIAGDO = LSAME( DIAG, 'D' ) IF( .NOT.( SYMM.OR.HERM.OR.NOTRAN ) .AND. $ .NOT.( LSAME( AFORM, 'T' ) ) .AND. $ .NOT.( LSAME( AFORM, 'C' ) ) ) THEN INFO = -2 ELSE IF( ( .NOT.DIAGDO ) .AND. $ ( .NOT.LSAME( DIAG, 'N' ) ) ) THEN INFO = -3 END IF CALL PB_CHKMAT( ICTXT, M, 5, N, 6, IA, JA, DESCA2, 10, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDLAGEN', -INFO ) RETURN END IF * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) IMB = DESCA2( IMB_ ) INB = DESCA2( INB_ ) RSRC = DESCA2( RSRC_ ) CSRC = DESCA2( CSRC_ ) * * Figure out local information about the distributed matrix operand * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * IOFFDA = JA + OFFA - IA CALL PB_BINFO( IOFFDA, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * * Initialize ILOCBLK, ILOCOFF, MYRDIST, JLOCBLK, JLOCOFF, MYCDIST * This values correspond to the square virtual underlying matrix * of size MAX( M_ + MAX( 0, -OFFA ), N_ + MAX( 0, OFFA ) ) used * to set up the random sequence. For practical purposes, the size * of this virtual matrix is upper bounded by M_ + N_ - 1. * ITMP = MAX( 0, -OFFA ) IVIR = IA + ITMP IMBVIR = IMB + ITMP NVIR = DESCA2( M_ ) + ITMP * CALL PB_LOCINFO( IVIR, IMBVIR, MB, MYROW, RSRC, NPROW, ILOCBLK, $ ILOCOFF, MYRDIST ) * ITMP = MAX( 0, OFFA ) JVIR = JA + ITMP INBVIR = INB + ITMP NVIR = MAX( MAX( NVIR, DESCA2( N_ ) + ITMP ), $ DESCA2( M_ ) + DESCA2( N_ ) - 1 ) * CALL PB_LOCINFO( JVIR, INBVIR, NB, MYCOL, CSRC, NPCOL, JLOCBLK, $ JLOCOFF, MYCDIST ) * IF( SYMM .OR. HERM .OR. NOTRAN ) THEN * CALL PB_INITJMP( .TRUE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 1, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_DLAGEN( 'Lower', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( SYMM .OR. HERM .OR. ( .NOT. NOTRAN ) ) THEN * CALL PB_INITJMP( .FALSE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 1, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_DLAGEN( 'Upper', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( DIAGDO ) THEN * MAXMN = MAX( DESCA2( M_ ), DESCA2( N_ ) ) ALPHA = DBLE( MAXMN ) * IF( IOFFDA.GE.0 ) THEN CALL PDLADOM( INPLACE, MIN( MAX( 0, M-IOFFDA ), N ), ALPHA, $ A, MIN( IA+IOFFDA, IA+M-1 ), JA, DESCA ) ELSE CALL PDLADOM( INPLACE, MIN( M, MAX( 0, N+IOFFDA ) ), ALPHA, $ A, IA, MIN( JA-IOFFDA, JA+N-1 ), DESCA ) END IF * END IF * RETURN * * End of PDLAGEN * END SUBROUTINE PDLADOM( INPLACE, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE INTEGER IA, JA, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLADOM adds alpha to the diagonal entries of an n by n submatrix * sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * N (global input) INTEGER * On entry, N specifies the global order of the submatrix * sub( A ) to be modified. N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the local entries * of this array corresponding to the main diagonal of sub( A ) * have been updated. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP DOUBLE PRECISION ATMP * .. * .. Local Scalars .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.EQ.0 ) $ RETURN * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) * CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA+LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PDLADOM * END SUBROUTINE PB_DLASCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PB_DLASCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (input/output) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA * A( I, J ) 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = ALPHA * A( I, J ) 50 CONTINUE 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA * A( I, J ) 100 CONTINUE 110 CONTINUE * END IF * RETURN * * End of PB_DLASCAL * END SUBROUTINE PB_DLAGEN( UPLO, AFORM, A, LDA, LCMT00, IRAN, MBLKS, $ IMBLOC, MB, LMBLOC, NBLKS, INBLOC, NB, $ LNBLOC, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO, AFORM INTEGER IMBLOC, INBLOC, LCMT00, LDA, LMBLOC, LNBLOC, $ MB, MBLKS, NB, NBLKS * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PB_DLAGEN locally initializes an array A. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the lower (UPLO='L') trape- * zoidal part or the upper (UPLO='U') trapezoidal part is to be * generated when the matrix to be generated is symmetric or * Hermitian. For all the other values of AFORM, the value of * this input argument is ignored. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * A (local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, *). On exit, * this array contains the local entries of the randomly genera- * ted submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. LDA must be at least one. * * LCMT00 (global input) INTEGER * On entry, LCMT00 is the LCM value specifying the off-diagonal * of the underlying matrix of interest. LCMT00=0 specifies the * main diagonal, LCMT00 > 0 specifies a subdiagonal, LCMT00 < 0 * specifies superdiagonals. * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2 containing respec- * tively the 16-lower and 16-higher bits of the encoding of the * entry of the random sequence corresponding locally to the * first local array entry to generate. Usually, this array is * computed by PB_SETLOCRAN. * * MBLKS (local input) INTEGER * On entry, MBLKS specifies the local number of blocks of rows. * MBLKS is at least zero. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * LMBLOC (local input) INTEGER * On entry, LMBLOC specifies the number of rows (size) of the * local lowest blocks. LMBLOC is at least zero. * * NBLKS (local input) INTEGER * On entry, NBLKS specifies the local number of blocks of co- * lumns. NBLKS is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * LNBLOC (local input) INTEGER * On entry, LNBLOC specifies the number of columns (size) of * the local rightmost blocks. LNBLOC is at least zero. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the random matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. INTEGER I, IB, IBLK, II, IK, ITMP, JB, JBLK, JJ, JK, $ JTMP, LCMTC, LCMTR, LOW, MNB, UPP DOUBLE PRECISION DUMMY * .. * .. Local Arrays .. INTEGER IB0( 2 ), IB1( 2 ), IB2( 2 ), IB3( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMPIT * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PB_DRAND EXTERNAL LSAME, PB_DRAND * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * DO 10 I = 1, 2 IB1( I ) = IRAN( I ) IB2( I ) = IRAN( I ) IB3( I ) = IRAN( I ) 10 CONTINUE * IF( LSAME( AFORM, 'N' ) ) THEN * * Generate random matrix * JJ = 1 * DO 50 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * DO 40 JK = JJ, JJ + JB - 1 * II = 1 * DO 30 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * * Blocks are IB by JB * DO 20 IK = II, II + IB - 1 A( IK, JK ) = PB_DRAND( 0 ) 20 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 30 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 40 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 50 CONTINUE * ELSE IF( LSAME( AFORM, 'T' ) .OR. LSAME( AFORM, 'C' ) ) THEN * * Generate the transpose of the matrix that would be normally * generated. * II = 1 * DO 90 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 80 IK = II, II + IB - 1 * JJ = 1 * DO 70 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 60 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_DRAND( 0 ) 60 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 70 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 80 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 90 CONTINUE * ELSE IF( ( LSAME( AFORM, 'S' ) ).OR.( LSAME( AFORM, 'H' ) ) ) THEN * * Generate a symmetric matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 170 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 160 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 150 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 100 IK = II, II + IB - 1 DUMMY = PB_DRAND( 0 ) 100 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 110 IK = II, II + IB - 1 A( IK, JK ) = PB_DRAND( 0 ) 110 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 120 IK = II, ITMP - 1 DUMMY = PB_DRAND( 0 ) 120 CONTINUE * DO 130 IK = ITMP, II + IB - 1 A( IK, JK ) = PB_DRAND( 0 ) 130 CONTINUE * END IF * ELSE * DO 140 IK = II, II + IB - 1 A( IK, JK ) = PB_DRAND( 0 ) 140 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 150 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 160 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 170 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 250 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 240 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 230 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 180 JK = JJ, JJ + JB - 1 DUMMY = PB_DRAND( 0 ) 180 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 190 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_DRAND( 0 ) 190 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 200 JK = JJ, JTMP - 1 DUMMY = PB_DRAND( 0 ) 200 CONTINUE * DO 210 JK = JTMP, JJ + JB - 1 A( IK, JK ) = PB_DRAND( 0 ) 210 CONTINUE * END IF * ELSE * DO 220 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_DRAND( 0 ) 220 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 230 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 240 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 250 CONTINUE * END IF * END IF * RETURN * * End of PB_DLAGEN * END DOUBLE PRECISION FUNCTION PB_DRAND( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_DRAND generates the next number in the random sequence. This func- * tion ensures that this number will be in the interval ( -1.0, 1.0 ). * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0 ) * .. * .. External Functions .. DOUBLE PRECISION PB_DRAN EXTERNAL PB_DRAN * .. * .. Executable Statements .. * PB_DRAND = ONE - TWO * PB_DRAN( IDUMM ) * RETURN * * End of PB_DRAND * END DOUBLE PRECISION FUNCTION PB_DRAN( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_DRAN generates the next number in the random sequence. * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION DIVFAC, POW16 PARAMETER ( DIVFAC = 2.147483648D+9, $ POW16 = 6.5536D+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * PB_DRAN = ( DBLE( IRAND( 1 ) ) + POW16 * DBLE( IRAND( 2 ) ) ) / $ DIVFAC * CALL PB_LMUL( IRAND, IACS, J ) CALL PB_LADD( J, IACS( 3 ), IRAND ) * RETURN * * End of PB_DRAN * END scalapack-2.0.2/PBLAS/TIMING/PSBLAS1TIM.dat000644 000766 000024 00000002327 10363532303 017654 0ustar00juliestaff000000 000000 'Level 1 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PSBLAS1TIM.SUMM' output file name (if any) 6 device out 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q 2.0E0 value of ALPHA 2 number of tests problems 1000 1000 values of N 1000 1 values of M_X 1 1500 values of N_X 32 32 values of IMB_X 32 32 values of INB_X 32 32 values of MB_X 32 32 values of NB_X 0 0 values of RSRC_X 0 0 values of CSRC_X 1 1 values of IX 1 1 values of JX 1 1 values of INCX 1 1 values of M_Y 1000 1500 values of N_Y 32 32 values of IMB_Y 32 32 values of INB_Y 32 32 values of MB_Y 32 32 values of NB_Y 0 0 values of RSRC_Y 0 0 values of CSRC_Y 1 1 values of IY 1 1 values of JY 1 1 values of INCY PSSWAP T put F for no test in the same column PSSCAL T put F for no test in the same column PSCOPY T put F for no test in the same column PSAXPY T put F for no test in the same column PSDOT T put F for no test in the same column PSNRM2 T put F for no test in the same column PSASUM T put F for no test in the same column PSAMAX T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/psblas1tim.f000644 000766 000024 00000122437 11750130340 017772 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PSSWAP ', 'PSSCAL ', 'PSCOPY ', $ 'PSAXPY ', 'PSDOT ', 'PSNRM2 ', $ 'PSASUM ', 'PSAMAX '/ END BLOCK DATA PROGRAM PSBLA1TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PSBLA1TIM is the main timing program for the Level 1 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 40 lines: * 'Level 1 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PSBLAS1TIM.SUMM' output file name (if any) * 6 device out * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0E0 value of ALPHA * 2 number of tests problems * 3 4 values of N * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PSSWAP T put F for no test in the same column * PSSCAL T put F for no test in the same column * PSCOPY T put F for no test in the same column * PSAXPY T put F for no test in the same column * PSDOT T put F for no test in the same column * PSNRM2 T put F for no test in the same column * PSASUM T put F for no test in the same column * PSAMAX T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * REALSZ indicates the length in bytes on the given platform * for a single precision real. By default, REALSZ is set to * four. * * MEM REAL array * MEM is an array of dimension TOTMEM / REALSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, REALSZ, TOTMEM, MEMSIZ, $ NSUBS PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, REALSZ = 4, $ TOTMEM = 2000000, NSUBS = 8, $ MEMSIZ = TOTMEM / REALSZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER CSRCX, CSRCY, I, IAM, ICTXT, IMBX, IMBY, IMIDX, $ IMIDY, INBX, INBY, INCX, INCY, IPOSTX, IPOSTY, $ IPREX, IPREY, IPX, IPY, IX, IXSEED, IY, IYSEED, $ J, JX, JY, K, MBX, MBY, MEMREQD, MPX, MPY, MX, $ MY, MYCOL, MYROW, N, NBX, NBY, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQX, NQY, NTESTS, NX, NY, $ PISCLR, RSRCX, RSRCY REAL ALPHA, PSCLR, PUSCLR DOUBLE PRECISION ADDS, CFLOPS, MULTS, NOPS, WFLOPS * .. * .. Local Arrays .. CHARACTER*80 OUTFILE LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) INTEGER CSCXVAL( MAXTESTS ), CSCYVAL( MAXTESTS ), $ DESCX( DLEN_ ), DESCY( DLEN_ ), IERR( 2 ), $ IMBXVAL( MAXTESTS ), IMBYVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JXVAL( MAXTESTS ), JYVAL( MAXTESTS ), $ MBXVAL( MAXTESTS ), MBYVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCXVAL( MAXTESTS ), $ RSCYVAL( MAXTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PSAMAX, PSASUM, PSAXPY, $ PSBLA1TIMINFO, PSCOPY, PSDOT, PSLAGEN, PSNRM2, $ PSSCAL, PSSWAP, PVDESCCHK, PVDIMCHK * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .FALSE., .TRUE., .TRUE., .TRUE., $ .FALSE., .FALSE., .FALSE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * Seeds for random matrix generations. * IXSEED = 100 IYSEED = 200 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PSBLA1TIMINFO( OUTFILE, NOUT, NTESTS, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, RSCXVAL, $ CSCXVAL, IXVAL, JXVAL, INCXVAL, MYVAL, $ NYVAL, IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, INCYVAL, $ MAXTESTS, NGRIDS, PVAL, MAXGRIDS, QVAL, $ MAXGRIDS, LTEST, IAM, NPROCS, ALPHA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * N = NVAL( J ) MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9983 ) END IF * * Check the validity of the input and initialize DESC_ * CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, 0, 0, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, 0, 0, IERR( 2 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 ) $ GO TO 40 * * Assign pointers into MEM for matrices corresponding to * vectors X and Y. Ex: IPX starts at position MEM( 1 ). * IPX = 1 IPY = IPX + DESCX( LLD_ ) * NQX * * Check if sufficient memory. * MEMREQD = IPY + DESCY( LLD_ ) * NQY - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) MEMREQD*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) GO TO 40 END IF * * Loop over all PBLAS 1 routines * DO 30 K = 1, NSUBS * * Continue only if this sub has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * * Check the validity of the operand sizes * CALL PVDIMCHK( ICTXT, NOUT, N, 'X', IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, N, 'Y', IY, JY, DESCY, INCY, $ IERR( 2 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 ) $ GO TO 30 * * Generate distributed matrices X and Y * CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the PBLAS routine * IF( K.EQ.1 ) THEN * * Test PSSWAP * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PSSWAP( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.2 ) THEN * * Test PSSCAL * ADDS = 0.0D+0 MULTS = DBLE( N ) CALL PB_TIMER( 1 ) CALL PSSCAL( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.3 ) THEN * * Test PSCOPY * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PSCOPY( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.4 ) THEN * * Test PSAXPY * ADDS = DBLE( N ) MULTS = DBLE( N ) CALL PB_TIMER( 1 ) CALL PSAXPY( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.5 ) THEN * * Test PSDOT * ADDS = DBLE( N-1 ) MULTS = DBLE( N ) CALL PB_TIMER( 1 ) CALL PSDOT( N, PSCLR, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.6 ) THEN * * Test PSNRM2 * ADDS = DBLE( N-1 ) MULTS = DBLE( N ) CALL PB_TIMER( 1 ) CALL PSNRM2( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.7 ) THEN * * Test PSASUM * ADDS = DBLE( N - 1 ) MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PSASUM( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.8 ) THEN * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PSAMAX( N, PSCLR, PISCLR, MEM( IPX ), IX, JX, $ DESCX, INCX ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Calculate total flops * NOPS = ADDS + MULTS * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9984 ) SNAMES( K ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9988 ) J END IF * 50 CONTINUE * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, '---------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' N IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9993 FORMAT( 2X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I5,1X,I5,1X,I5,1X,I5,1X, $ I5,1X,I5,1X,I6 ) 9992 FORMAT( 2X, ' N IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9991 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9990 FORMAT( 'Not enough memory. Need: ', I12 ) 9988 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9987 FORMAT( 2X, 'End of Tests.' ) 9986 FORMAT( 2X, 'Tests started.' ) 9985 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9984 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9983 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PSBLA1TIM * END SUBROUTINE PSBLA1TIMINFO( SUMMRY, NOUT, NMAT, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, $ RSCXVAL, CSCXVAL, IXVAL, JXVAL, $ INCXVAL, MYVAL, NYVAL, IMBYVAL, MBYVAL, $ INBYVAL, NBYVAL, RSCYVAL, CSCYVAL, $ IYVAL, JYVAL, INCYVAL, LDVAL, NGRIDS, $ PVAL, LDPVAL, QVAL, LDQVAL, LTEST, IAM, $ NPROCS, ALPHA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NGRIDS, NMAT, NOUT, $ NPROCS REAL ALPHA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY LOGICAL LTEST( * ) INTEGER CSCXVAL( LDVAL ), CSCYVAL( LDVAL ), $ IMBXVAL( LDVAL ), IMBYVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JXVAL( LDVAL ), $ JYVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCXVAL( LDVAL ), RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PSBLA1TIMINFO get the needed startup information for timing various * Level 1 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DESCX(:), IX, JX, INCX, DESCY(:), * IY, JY and INCY. This is also the maximum number of test * cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 1 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) REAL * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 2, 2*NGRIDS+23*NMAT+NSUBS ) with NSUBS = 8. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PSBLAS1TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 100 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 100 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA * READ( NIN, FMT = * ) ALPHA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 100 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 100 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, WORK, 2 ) * I = 1 CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 70 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 70 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 1 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Timing of the real single precision '// $ 'Level 1 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9989 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9989 ) SNAMES( 1 ), ' ... No ' END IF DO 80 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( I ), ' ... No ' END IF 80 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, WORK, 2, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) * I = 2*NGRIDS + 23*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 90 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 90 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 100 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : ', G16.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9989 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9988 FORMAT( 2X, ' ', A, A8 ) * * End of PSBLA1TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/PSBLAS2TIM.dat000644 000766 000024 00000005776 10363532303 017670 0ustar00juliestaff000000 000000 'Level 2 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PSBLAS2TIM.SUMM' output file name (if any) 6 device out 10 value of the logical computational blocksize NB 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q 2.0E0 value of ALPHA 3.0E0 value of BETA 16 number of tests problems 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' values of UPLO 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANS 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' values of DIAG 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 values of M 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 values of N 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_A 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_A 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_A 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IA 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JA 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_X 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_X 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_X 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IX 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JX 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of INCX 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_Y 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_Y 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IY 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JY 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of INCY PSGEMV T put F for no test in the same column PSSYMV T put F for no test in the same column PSTRMV T put F for no test in the same column PSTRSV T put F for no test in the same column PSGER T put F for no test in the same column PSSYR T put F for no test in the same column PSSYR2 T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/psblas2tim.f000644 000766 000024 00000153524 11750130340 017774 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 7) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PSGEMV ', 'PSSYMV ', 'PSTRMV ', $ 'PSTRSV ', 'PSGER ', 'PSSYR ', $ 'PSSYR2 '/ END BLOCK DATA PROGRAM PSBLA2TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PSBLA2TIM is the main timing program for the Level 2 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 55 lines: * 'Level 2 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PSBLAS2TIM.SUMM' output file name (if any) * 6 device out * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0E0 value of ALPHA * 1.0E0 value of BETA * 2 number of tests problems * 'U' 'L' values of UPLO * 'N' 'T' values of TRANS * 'N' 'U' values of DIAG * 3 4 values of M * 3 4 values of N * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PSGEMV T put F for no test in the same column * PSSYMV T put F for no test in the same column * PSTRMV T put F for no test in the same column * PSTRSV T put F for no test in the same column * PSGER T put F for no test in the same column * PSSYR T put F for no test in the same column * PSSYR2 T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * REALSZ indicates the length in bytes on the given platform * for a single precision real. By default, REALSZ is set to * four. * * MEM REAL array * MEM is an array of dimension TOTMEM / REALSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, REALSZ, TOTMEM, MEMSIZ, $ NSUBS REAL ONE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, REALSZ = 4, $ ONE = 1.0E+0, TOTMEM = 2000000, NSUBS = 7, $ MEMSIZ = TOTMEM / REALSZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 AFORM, DIAG, DIAGDO, TRANS, UPLO INTEGER CSRCA, CSRCX, CSRCY, I, IA, IAM, IASEED, ICTXT, $ IMBA, IMBX, IMBY, IMIDA, IMIDX, IMIDY, INBA, $ INBX, INBY, INCX, INCY, IPA, IPOSTA, IPOSTX, $ IPOSTY, IPREA, IPREX, IPREY, IPX, IPY, IX, $ IXSEED, IY, IYSEED, J, JA, JX, JY, K, M, MA, $ MBA, MBX, MBY, MEMREQD, MPA, MPX, MPY, MX, MY, $ MYCOL, MYROW, N, NA, NBA, NBX, NBY, NCOLA, $ NGRIDS, NLX, NLY, NOUT, NPCOL, NPROCS, NPROW, $ NQA, NQX, NQY, NROWA, NTESTS, NX, NY, OFFD, $ RSRCA, RSRCX, RSRCY REAL ALPHA, BETA, SCALE DOUBLE PRECISION CFLOPS, NOPS, WFLOPS * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), TRANVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCXVAL( MAXTESTS ), $ CSCYVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ), $ IAVAL( MAXTESTS ), IERR( 3 ), $ IMBAVAL( MAXTESTS ), IMBXVAL( MAXTESTS ), $ IMBYVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JXVAL( MAXTESTS ), $ JYVAL( MAXTESTS ), MAVAL( MAXTESTS ), $ MBAVAL( MAXTESTS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NAVAL( MAXTESTS ), NBAVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCAVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PMDESCCHK, PMDIMCHK, PSBLA2TIMINFO, $ PSGEMV, PSGER, PSLAGEN, PSLASCAL, PSSYMV, $ PSSYR, PSSYR2, PSTRMV, PSTRSV, PVDESCCHK, $ PVDIMCHK * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDOPBL2 EXTERNAL LSAME, PDOPBL2 * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .TRUE., .FALSE., .FALSE., $ .TRUE., .FALSE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .TRUE. * * Seeds for random matrix generations. * IASEED = 100 IXSEED = 200 IYSEED = 300 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PSBLA2TIMINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, $ IAVAL, JAVAL, MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, $ CSCYVAL, IYVAL, JYVAL, INCYVAL, MAXTESTS, $ NGRIDS, PVAL, MAXGRIDS, QVAL, MAXGRIDS, $ NBLOG, LTEST, IAM, NPROCS, ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) TRANS = TRANVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) * MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, UPLO, TRANS, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9980 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' GO TO 40 END IF * IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANS' GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) TRANS WRITE( NOUT, FMT = 9997 ) 'DIAG' GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, 0, 0, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, 0, 0, IERR( 2 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, 0, 0, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN GO TO 40 END IF * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = 1 IPX = IPA + DESCA( LLD_ ) * NQA IPY = IPX + DESCX( LLD_ ) * NQX * * Check if sufficient memory. * MEMREQD = IPY + DESCY( LLD_ ) * NQY - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) MEMREQD*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) GO TO 40 END IF * * Loop over all PBLAS 2 routines * DO 30 K = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * * Define the size of the operands * IF( K.EQ.1 ) THEN NROWA = M NCOLA = N IF( LSAME( TRANS, 'N' ) ) THEN NLX = N NLY = M ELSE NLX = M NLY = N END IF ELSE IF( K.EQ.5 ) THEN NROWA = M NCOLA = N NLX = M NLY = N ELSE NROWA = N NCOLA = N NLX = N NLY = N END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLX, 'X', IX, JX, DESCX, $ INCX, IERR( 2 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLY, 'Y', IY, JY, DESCY, $ INCY, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN GO TO 30 END IF * * Generate distributed matrices A, X and Y * IF( K.EQ.2 .OR. K.EQ.6 .OR. K.EQ.7 ) THEN AFORM = 'S' DIAGDO = 'N' OFFD = IA - JA ELSE IF( ( K.EQ.4 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN AFORM = 'N' DIAGDO = 'D' OFFD = IA - JA ELSE AFORM = 'N' DIAGDO = 'N' OFFD = 0 END IF * CALL PSLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, $ 1, 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MY, $ NY, 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * IF( ( K.EQ.4 ).AND.( .NOT.( LSAME( DIAG, 'N' ) ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / REAL( MAX( NROWA, NCOLA ) ) IF( LSAME( UPLO, 'L' ) ) THEN CALL PSLASCAL( 'Lower', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA+1, JA, DESCA ) ELSE CALL PSLASCAL( 'Upper', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA, JA+1, DESCA ) END IF END IF * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the Level 2 PBLAS routine * IF( K.EQ.1 ) THEN * * Test PSGEMV * CALL PB_TIMER( 1 ) CALL PSGEMV( TRANS, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.2 ) THEN * * Test PSSYMV * CALL PB_TIMER( 1 ) CALL PSSYMV( UPLO, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.3 ) THEN * * Test PSTRMV * CALL PB_TIMER( 1 ) CALL PSTRMV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.4 ) THEN * * Test PSTRSV * CALL PB_TIMER( 1 ) CALL PSTRSV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.5 ) THEN * * Test PSGER * CALL PB_TIMER( 1 ) CALL PSGER( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.6 ) THEN * * Test PSSYR * CALL PB_TIMER( 1 ) CALL PSSYR( UPLO, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.7 ) THEN * * Test PSSYR2 * CALL PB_TIMER( 1 ) CALL PSSYR2( UPLO, N, ALPHA, MEM( IPX ), IX, JX, $ DESCX, INCX, MEM( IPY ), IY, JY, DESCY, $ INCY, MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Calculate total flops * NOPS = PDOPBL2( SNAMES( K ), NROWA, NCOLA, 0, 0 ) * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9981 ) SNAMES( K ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' M N UPLO TRANS DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,9X,A1,11X,A1,10X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9989 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5,1X,I6 ) 9988 FORMAT( 2X, ' IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9987 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9986 FORMAT( 'Not enough memory. Need: ', I12 ) 9985 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9984 FORMAT( 2X, 'End of Tests.' ) 9983 FORMAT( 2X, 'Tests started.' ) 9982 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9981 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9980 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PSBLA2TIM * END SUBROUTINE PSBLA2TIMINFO( SUMMRY, NOUT, NMAT, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, $ IMBAVAL, MBAVAL, INBAVAL, NBAVAL, $ RSCAVAL, CSCAVAL, IAVAL, JAVAL, $ MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, $ IXVAL, JXVAL, INCXVAL, MYVAL, NYVAL, $ IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, $ INCYVAL, LDVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, NBLOG, LTEST, IAM, NPROCS, $ ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NBLOG, NGRIDS, $ NMAT, NOUT, NPROCS REAL ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), TRANVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCXVAL( LDVAL ), $ CSCYVAL( LDVAL ), IAVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBXVAL( LDVAL ), $ IMBYVAL( LDVAL ), INBAVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JAVAL( LDVAL ), $ JXVAL( LDVAL ), JYVAL( LDVAL ), MAVAL( LDVAL ), $ MBAVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NAVAL( LDVAL ), $ NBAVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCXVAL( LDVAL ), $ RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PSBLA2TIMINFO get the needed startup information for timing various * Level 2 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * TRANVAL (global output) CHARACTER array * On entry, TRANVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANS to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, TRANS, UPLO, M, N, DESCA(:), * IA, JA, DESCX(:), IX, JX, INCX, DESCY(:), IY, JY and INCY. * This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least seven. On * exit, if LTEST( i ) is .TRUE., the i-th Level 2 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) REAL * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) REAL * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+37*NMAT+NSUBS ) with NSUBS = 7. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 7 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PSBLAS2TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRANVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( TRANVAL( J ) ) WORK( I+2 ) = ICHAR( UPLOVAL( J ) ) I = I + 3 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 2 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real single precision '// $ 'Level 2 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) NMAT WRITE( NOUT, FMT = 9986 ) NBLOG WRITE( NOUT, FMT = 9991 ) NGRIDS WRITE( NOUT, FMT = 9989 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9989 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA WRITE( NOUT, FMT = 9993 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 1, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 37*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) TRANVAL( J ) = CHAR( WORK( I+1 ) ) UPLOVAL( J ) = CHAR( WORK( I+2 ) ) I = I + 3 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : ', G16.6 ) 9993 FORMAT( 2X, 'Beta : ', G16.6 ) 9992 FORMAT( 2X, 'Number of Tests : ', I6 ) 9991 FORMAT( 2X, 'Number of process grids : ', I6 ) 9990 FORMAT( 2X, ' : ', 5I6 ) 9989 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9987 FORMAT( 2X, ' ', A, A8 ) 9986 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PSBLA2TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/PSBLAS3TIM.dat000644 000766 000024 00000004324 10363532303 017655 0ustar00juliestaff000000 000000 'Level 3 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PSBLAS3TIM.SUMM' output file name (if any) 6 device out 10 value of the logical computational blocksize NB 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q 2.0E0 value of ALPHA 3.0E0 value of BETA 8 number of tests problems 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' values of DIAG 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' values of SIDE 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANSA 'N' 'N' 'T' 'T' 'N' 'N' 'T' 'T' values of TRANSB 'U' 'L' 'U' 'L' 'U' 'L' 'U' 'L' values of UPLO 3 4 3 4 3 4 3 4 values of M 3 4 3 4 3 4 3 4 values of N 3 4 3 4 3 4 3 4 values of K 6 10 6 10 6 10 6 10 values of M_A 6 10 6 10 6 10 6 10 values of N_A 2 5 2 5 2 5 2 5 values of IMB_A 2 5 2 5 2 5 2 5 values of INB_A 2 5 2 5 2 5 2 5 values of MB_A 2 5 2 5 2 5 2 5 values of NB_A 0 1 0 1 0 1 0 1 values of RSRC_A 0 0 0 0 0 0 0 0 values of CSRC_A 1 1 1 1 1 1 1 1 values of IA 1 1 1 1 1 1 1 1 values of JA 6 10 6 10 6 10 6 10 values of M_B 6 10 6 10 6 10 6 10 values of N_B 2 5 2 5 2 5 2 5 values of IMB_B 2 5 2 5 2 5 2 5 values of INB_B 2 5 2 5 2 5 2 5 values of MB_B 2 5 2 5 2 5 2 5 values of NB_B 0 1 0 1 0 1 0 1 values of RSRC_B 0 0 0 0 0 0 0 0 values of CSRC_B 1 1 1 1 1 1 1 1 values of IB 1 1 1 1 1 1 1 1 values of JB 6 10 6 10 6 10 6 10 values of M_C 6 10 6 10 6 10 6 10 values of N_C 2 5 2 5 2 5 2 5 values of IMB_C 2 5 2 5 2 5 2 5 values of INB_C 2 5 2 5 2 5 2 5 values of MB_C 2 5 2 5 2 5 2 5 values of NB_C 0 1 0 1 0 1 0 1 values of RSRC_C 0 0 0 0 0 0 0 0 values of CSRC_C 1 1 1 1 1 1 1 1 values of IC 1 1 1 1 1 1 1 1 values of JC PSGEMM T put F for no test in the same column PSSYMM T put F for no test in the same column PSSYRK T put F for no test in the same column PSSYR2K T put F for no test in the same column PSTRMM T put F for no test in the same column PSTRSM T put F for no test in the same column PSGEADD T put F for no test in the same column PSTRADD T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/psblas3tim.f000644 000766 000024 00000166131 11750130340 017773 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PSGEMM ', 'PSSYMM ', 'PSSYRK ', $ 'PSSYR2K', 'PSTRMM ', 'PSTRSM ', $ 'PSGEADD', 'PSTRADD'/ END BLOCK DATA PROGRAM PSBLA3TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PSBLA3TIM is the main timing program for the Level 3 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 56 lines: * 'Level 3 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PSBLAS3TIM.SUMM' output file name (if any) * 6 device out * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0E0 value of ALPHA * 1.0E0 value of BETA * 2 number of tests problems * 'N' 'U' values of DIAG * 'L' 'R' values of SIDE * 'N' 'T' values of TRANSA * 'N' 'T' values of TRANSB * 'U' 'L' values of UPLO * 3 4 values of M * 3 4 values of N * 3 4 values of K * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_B * 6 10 values of N_B * 2 5 values of IMB_B * 2 5 values of INB_B * 2 5 values of MB_B * 2 5 values of NB_B * 0 1 values of RSRC_B * 0 0 values of CSRC_B * 1 1 values of IB * 1 1 values of JB * 6 10 values of M_C * 6 10 values of N_C * 2 5 values of IMB_C * 2 5 values of INB_C * 2 5 values of MB_C * 2 5 values of NB_C * 0 1 values of RSRC_C * 0 0 values of CSRC_C * 1 1 values of IC * 1 1 values of JC * PSGEMM T put F for no test in the same column * PSSYMM T put F for no test in the same column * PSSYRK T put F for no test in the same column * PSSYR2K T put F for no test in the same column * PSTRMM T put F for no test in the same column * PSTRSM T put F for no test in the same column * PSGEADD T put F for no test in the same column * PSTRADD T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * REALSZ indicates the length in bytes on the given platform * for a single precision real. By default, REALSZ is set to * four. * * MEM REAL array * MEM is an array of dimension TOTMEM / REALSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, REALSZ, TOTMEM, MEMSIZ, $ NSUBS REAL ONE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, REALSZ = 4, $ ONE = 1.0E+0, TOTMEM = 2000000, NSUBS = 8, $ MEMSIZ = TOTMEM / REALSZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 ADIAGDO, AFORM, CFORM, DIAG, SIDE, TRANSA, $ TRANSB, UPLO INTEGER CSRCA, CSRCB, CSRCC, I, IA, IAM, IASEED, IB, $ IBSEED, IC, ICSEED, ICTXT, IMBA, IMBB, IMBC, $ IMIDA, IMIDB, IMIDC, INBA, INBB, INBC, IPA, $ IPB, IPC, IPOSTA, IPOSTB, IPOSTC, IPREA, IPREB, $ IPREC, J, JA, JB, JC, K, L, M, MA, MB, MBA, $ MBB, MBC, MC, MEMREQD, MPA, MPB, MPC, MYCOL, $ MYROW, N, NA, NB, NBA, NBB, NBC, NC, NCOLA, $ NCOLB, NCOLC, NGRIDS, NOUT, NPCOL, NPROCS, $ NPROW, NQA, NQB, NQC, NROWA, NROWB, NROWC, $ NTESTS, OFFDA, OFFDC, RSRCA, RSRCB, RSRCC REAL ALPHA, BETA, SCALE DOUBLE PRECISION CFLOPS, NOPS, WFLOPS * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), BCHECK( NSUBS ), $ CCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), SIDEVAL( MAXTESTS ), $ TRNAVAL( MAXTESTS ), TRNBVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCBVAL( MAXTESTS ), $ CSCCVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCB( DLEN_ ), DESCC( DLEN_ ), $ IAVAL( MAXTESTS ), IBVAL( MAXTESTS ), $ ICVAL( MAXTESTS ), IERR( 3 ), $ IMBAVAL( MAXTESTS ), IMBBVAL( MAXTESTS ), $ IMBCVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBBVAL( MAXTESTS ), INBCVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JBVAL( MAXTESTS ), $ JCVAL( MAXTESTS ), KVAL( MAXTESTS ), $ MAVAL( MAXTESTS ), MBAVAL( MAXTESTS ), $ MBBVAL( MAXTESTS ), MBCVAL( MAXTESTS ), $ MBVAL( MAXTESTS ), MCVAL( MAXTESTS ), $ MVAL( MAXTESTS ), NAVAL( MAXTESTS ), $ NBAVAL( MAXTESTS ), NBBVAL( MAXTESTS ), $ NBCVAL( MAXTESTS ), NBVAL( MAXTESTS ), $ NCVAL( MAXTESTS ), NVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCAVAL( MAXTESTS ), RSCBVAL( MAXTESTS ), $ RSCCVAL( MAXTESTS ) REAL MEM( MEMSIZ ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PMDESCCHK, PMDIMCHK, PSBLA3TIMINFO, $ PSGEADD, PSGEMM, PSLAGEN, PSLASCAL, PSSYMM, $ PSSYR2K, PSSYRK, PSTRADD, PSTRMM, PSTRSM * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDOPBL3 EXTERNAL LSAME, PDOPBL3 * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA BCHECK/.TRUE., .TRUE., .FALSE., .TRUE., .TRUE., $ .TRUE., .FALSE., .FALSE./ DATA CCHECK/.TRUE., .TRUE., .TRUE., .TRUE., .FALSE., $ .FALSE., .TRUE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * Seeds for random matrix generations. * IASEED = 100 IBSEED = 200 ICSEED = 300 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PSBLA3TIMINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, NVAL, $ KVAL, MAVAL, NAVAL, IMBAVAL, MBAVAL, $ INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, IAVAL, $ JAVAL, MBVAL, NBVAL, IMBBVAL, MBBVAL, $ INBBVAL, NBBVAL, RSCBVAL, CSCBVAL, IBVAL, $ JBVAL, MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, ICVAL, $ JCVAL, MAXTESTS, NGRIDS, PVAL, MAXGRIDS, $ QVAL, MAXGRIDS, NBLOG, LTEST, IAM, NPROCS, $ ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) SIDE = SIDEVAL( J ) TRANSA = TRNAVAL( J ) TRANSB = TRNBVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) K = KVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MB = MBVAL( J ) NB = NBVAL( J ) IMBB = IMBBVAL( J ) MBB = MBBVAL( J ) INBB = INBBVAL( J ) NBB = NBBVAL( J ) RSRCB = RSCBVAL( J ) CSRCB = CSCBVAL( J ) IB = IBVAL( J ) JB = JBVAL( J ) * MC = MCVAL( J ) NC = NCVAL( J ) IMBC = IMBCVAL( J ) MBC = MBCVAL( J ) INBC = INBCVAL( J ) NBC = NBCVAL( J ) RSRCC = RSCCVAL( J ) CSRCC = CSCCVAL( J ) IC = ICVAL( J ) JC = JCVAL( J ) * IF( IAM.EQ.0 ) THEN * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, K, SIDE, UPLO, TRANSA, $ TRANSB, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IB, JB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IC, JC, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9980 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( SIDE, 'L' ).AND. $ .NOT.LSAME( SIDE, 'R' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'SIDE' GO TO 40 END IF * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' GO TO 40 END IF * IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSA' GO TO 40 END IF * IF( .NOT.LSAME( TRANSB, 'N' ).AND. $ .NOT.LSAME( TRANSB, 'T' ).AND. $ .NOT.LSAME( TRANSB, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSB' GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'DIAG' GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, 0, 0, IERR( 1 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'B', DESCB, $ BLOCK_CYCLIC_2D_INB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB, MPB, NQB, IPREB, $ IMIDB, IPOSTB, 0, 0, IERR( 2 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'C', DESCC, $ BLOCK_CYCLIC_2D_INB, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC, MPC, NQC, IPREC, $ IMIDC, IPOSTC, 0, 0, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN GO TO 40 END IF * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPB = IPA + DESCA( LLD_ )*NQA IPC = IPB + DESCB( LLD_ )*NQB * * Check if sufficient memory. * MEMREQD = IPC + DESCC( LLD_ )*NQC - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) MEMREQD*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) GO TO 40 END IF * * Loop over all PBLAS 3 routines * DO 30 L = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( L ) ) $ GO TO 30 * * Define the size of the operands * IF( L.EQ.1 ) THEN * * PSGEMM * NROWC = M NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF( LSAME( TRANSB, 'N' ) ) THEN NROWB = K NCOLB = N ELSE NROWB = N NCOLB = K END IF ELSE IF( L.EQ.2 ) THEN * * PSSYMM * NROWC = M NCOLC = N NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF ELSE IF( L.EQ.3 ) THEN * * PSSYRK * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K ELSE NROWA = K NCOLA = N END IF NROWB = 0 NCOLB = 0 ELSE IF( L.EQ.4 ) THEN * * PSSYR2K * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K NROWB = N NCOLB = K ELSE NROWA = K NCOLA = N NROWB = K NCOLB = N END IF ELSE IF( L.EQ.5 .OR. L.EQ.6 ) THEN * * PSTRMM, PSTRSM * NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF NROWC = 0 NCOLC = 0 ELSE IF( L.EQ.7 .OR. L.EQ.8 ) THEN * * PSGEADD, PSTRADD * IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = N ELSE NROWA = N NCOLA = M END IF NROWC = M NCOLC = N NROWB = 0 NCOLB = 0 * END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWB, NCOLB, 'B', IB, JB, $ DESCB, IERR( 2 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWC, NCOLC, 'C', IC, JC, $ DESCC, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN GO TO 30 END IF * * Generate distributed matrices A, B and C * IF( L.EQ.2 ) THEN * * PSSYMM * AFORM = 'S' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.3 .OR. L.EQ.4 ) THEN * * PSSYRK, PSSYR2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'S' OFFDC = IC - JC * ELSE IF( ( L.EQ.6 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN * * PSTRSM * AFORM = 'N' ADIAGDO = 'D' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE * * Default values * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'N' OFFDC = 0 * END IF * CALL PSLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) IF( ( L.EQ.6 ).AND.( .NOT.( LSAME( DIAG, 'N' ) ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / REAL( MAX( NROWA, NCOLA ) ) IF( LSAME( UPLO, 'L' ) ) THEN CALL PSLASCAL( 'Lower', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA+1, JA, DESCA ) ELSE CALL PSLASCAL( 'Upper', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA, JA+1, DESCA ) END IF * END IF * IF( BCHECK( L ) ) $ CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCB, IBSEED, MEM( IPB ), $ DESCB( LLD_ ) ) * IF( CCHECK( L ) ) $ CALL PSLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCC, ICSEED, MEM( IPC ), $ DESCC( LLD_ ) ) * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the Level 3 PBLAS routine * IF( L.EQ.1 ) THEN * * Test PSGEMM * NOPS = PDOPBL3( SNAMES( L ), M, N, K ) * CALL PB_TIMER( 1 ) CALL PSGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.2 ) THEN * * Test PSSYMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PSSYMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.3 ) THEN * * Test PSSYRK * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PSSYRK( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.4 ) THEN * * Test PSSYR2K * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PSSYR2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.5 ) THEN * * Test PSTRMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PSTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.6 ) THEN * * Test PSTRSM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PSTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.7 ) THEN * * Test PSGEADD * NOPS = PDOPBL3( SNAMES( L ), M, N, M ) * CALL PB_TIMER( 1 ) CALL PSGEADD( TRANSA, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.8 ) THEN * * Test PSTRADD * IF( LSAME( UPLO, 'U' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PSTRADD( UPLO, TRANSA, M, N, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9981 ) SNAMES( L ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9986 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '-------------------' ) 9994 FORMAT( 2X, ' M N K SIDE UPLO TRANSA ', $ 'TRANSB DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,1X,I6,6X,A1,5X,A1,7X,A1,7X,A1,5X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IB JB MB NB IMBB INBB', $ ' MBB NBB RSRCB CSRCB' ) 9989 FORMAT( 2X, ' IC JC MC NC IMBC INBC', $ ' MBC NBC RSRCC CSRCC' ) 9988 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9987 FORMAT( 'Not enough memory. Need: ', I12 ) 9986 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9985 FORMAT( 2X, 'End of Tests.' ) 9984 FORMAT( 2X, 'Tests started.' ) 9983 FORMAT( 5X, A, ' ***** ', A, ' has an incorrect value: ', $ ' BYPASS *****' ) 9982 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9981 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9980 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PSBLA3TIM * END SUBROUTINE PSBLA3TIMINFO( SUMMRY, NOUT, NMAT, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, $ NVAL, KVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, $ CSCAVAL, IAVAL, JAVAL, MBVAL, NBVAL, $ IMBBVAL, MBBVAL, INBBVAL, NBBVAL, $ RSCBVAL, CSCBVAL, IBVAL, JBVAL, $ MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, $ ICVAL, JCVAL, LDVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, NBLOG, LTEST, $ IAM, NPROCS, ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NBLOG, NGRIDS, $ NMAT, NOUT, NPROCS REAL ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), SIDEVAL( LDVAL ), $ TRNAVAL( LDVAL ), TRNBVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCBVAL( LDVAL ), $ CSCCVAL( LDVAL ), IAVAL( LDVAL ), $ IBVAL( LDVAL ), ICVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBBVAL( LDVAL ), $ IMBCVAL( LDVAL ), INBAVAL( LDVAL ), $ INBBVAL( LDVAL ), INBCVAL( LDVAL ), $ JAVAL( LDVAL ), JBVAL( LDVAL ), JCVAL( LDVAL ), $ KVAL( LDVAL ), MAVAL( LDVAL ), MBAVAL( LDVAL ), $ MBBVAL( LDVAL ), MBCVAL( LDVAL ), $ MBVAL( LDVAL ), MCVAL( LDVAL ), MVAL( LDVAL ), $ NAVAL( LDVAL ), NBAVAL( LDVAL ), $ NBBVAL( LDVAL ), NBCVAL( LDVAL ), $ NBVAL( LDVAL ), NCVAL( LDVAL ), NVAL( LDVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCBVAL( LDVAL ), $ RSCCVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PSBLA3TIMINFO get the needed startup information for timing various * Level 3 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * SIDEVAL (global output) CHARACTER array * On entry, SIDEVAL is an array of dimension LDVAL. On exit, * this array contains the values of SIDE to run the code with. * * TRNAVAL (global output) CHARACTER array * On entry, TRNAVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSA to run the code * with. * * TRNBVAL (global output) CHARACTER array * On entry, TRNBVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSB to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * KVAL (global output) INTEGER array * On entry, KVAL is an array of dimension LDVAL. On exit, this * array contains the values of K to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MBVAL (global output) INTEGER array * On entry, MBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( M_ ) to run the code * with. * * NBVAL (global output) INTEGER array * On entry, NBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( N_ ) to run the code * with. * * IMBBVAL (global output) INTEGER array * On entry, IMBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( IMB_ ) to run the * code with. * * MBBVAL (global output) INTEGER array * On entry, MBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( MB_ ) to run the * code with. * * INBBVAL (global output) INTEGER array * On entry, INBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( INB_ ) to run the * code with. * * NBBVAL (global output) INTEGER array * On entry, NBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( NB_ ) to run the * code with. * * RSCBVAL (global output) INTEGER array * On entry, RSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( RSRC_ ) to run the * code with. * * CSCBVAL (global output) INTEGER array * On entry, CSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( CSRC_ ) to run the * code with. * * IBVAL (global output) INTEGER array * On entry, IBVAL is an array of dimension LDVAL. On exit, this * array contains the values of IB to run the code with. * * JBVAL (global output) INTEGER array * On entry, JBVAL is an array of dimension LDVAL. On exit, this * array contains the values of JB to run the code with. * * MCVAL (global output) INTEGER array * On entry, MCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( M_ ) to run the code * with. * * NCVAL (global output) INTEGER array * On entry, NCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( N_ ) to run the code * with. * * IMBCVAL (global output) INTEGER array * On entry, IMBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( IMB_ ) to run the * code with. * * MBCVAL (global output) INTEGER array * On entry, MBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( MB_ ) to run the * code with. * * INBCVAL (global output) INTEGER array * On entry, INBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( INB_ ) to run the * code with. * * NBCVAL (global output) INTEGER array * On entry, NBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( NB_ ) to run the * code with. * * RSCCVAL (global output) INTEGER array * On entry, RSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( RSRC_ ) to run the * code with. * * CSCCVAL (global output) INTEGER array * On entry, CSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( CSRC_ ) to run the * code with. * * ICVAL (global output) INTEGER array * On entry, ICVAL is an array of dimension LDVAL. On exit, this * array contains the values of IC to run the code with. * * JCVAL (global output) INTEGER array * On entry, JCVAL is an array of dimension LDVAL. On exit, this * array contains the values of JC to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, SIDE, TRANSA, TRANSB, UPLO, * M, N, K, DESCA(:), IA, JA, DESCB(:), IB, JB, DESCC(:), IC, * JC. This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 3 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) REAL * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) REAL * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+38*NMAT+NSUBS ) with NSUBS = 8. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PSBLAS3TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( SIDEVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( KVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( ICVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JCVAL ( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( SIDEVAL( J ) ) WORK( I+2 ) = ICHAR( TRNAVAL( J ) ) WORK( I+3 ) = ICHAR( TRNBVAL( J ) ) WORK( I+4 ) = ICHAR( UPLOVAL( J ) ) I = I + 5 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, KVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, ICVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JCVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 3 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real single precision '// $ 'Level 3 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) NMAT WRITE( NOUT, FMT = 9986 ) NBLOG WRITE( NOUT, FMT = 9991 ) NGRIDS WRITE( NOUT, FMT = 9989 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9989 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA WRITE( NOUT, FMT = 9993 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 38*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) SIDEVAL( J ) = CHAR( WORK( I+1 ) ) TRNAVAL( J ) = CHAR( WORK( I+2 ) ) TRNBVAL( J ) = CHAR( WORK( I+3 ) ) UPLOVAL( J ) = CHAR( WORK( I+4 ) ) I = I + 5 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, KVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, ICVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JCVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : ', G16.6 ) 9993 FORMAT( 2X, 'Beta : ', G16.6 ) 9992 FORMAT( 2X, 'Number of Tests : ', I6 ) 9991 FORMAT( 2X, 'Number of process grids : ', I6 ) 9990 FORMAT( 2X, ' : ', 5I6 ) 9989 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9987 FORMAT( 2X, ' ', A, A8 ) 9986 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PSBLA3TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/psblastim.f000644 000766 000024 00000230045 10363532303 017710 0ustar00juliestaff000000 000000 SUBROUTINE PSLASCAL( TYPE, M, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TYPE INTEGER IA, JA, M, N REAL ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLASCAL scales the m by n submatrix A(IA:IA+M-1,JA:JA+N-1) denoted * by sub( A ) by the scalar alpha. TYPE specifies if sub( A ) is full, * upper triangular, lower triangular or upper Hessenberg. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * On entry, TYPE specifies the type of the input submatrix as * follows: * = 'L' or 'l': sub( A ) is a lower triangular matrix, * = 'U' or 'u': sub( A ) is an upper triangular matrix, * = 'H' or 'h': sub( A ) is an upper Hessenberg matrix, * otherwise sub( A ) is a full matrix. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( A ) are overwritten by * the local entries of the m by n scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 UPLO LOGICAL GODOWN, GOLEFT, LOWER, UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, ITYPE, $ IUPP, JJA, JJMAX, JOFFA, JOFFD, LCMT, LCMT00, $ LDA, LMBLOC, LNBLOC, LOW, M1, MB, MBLKD, MBLKS, $ MBLOC, MP, MRCOL, MRROW, MYCOL, MYROW, N1, NB, $ NBLKD, NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, $ QNB, TMP1, UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS, PB_INFOG2L, PB_SLASCAL * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_NUMROC EXTERNAL LSAME, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 UPLO = TYPE UPPER = .FALSE. LOWER = .TRUE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 UPLO = TYPE UPPER = .TRUE. LOWER = .FALSE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 UPLO = 'U' UPPER = .TRUE. LOWER = .FALSE. IOFFD = 1 ELSE ITYPE = 0 UPLO = 'A' UPPER = .TRUE. LOWER = .TRUE. IOFFD = 0 END IF * * Compute local indexes * IF( ITYPE.EQ.0 ) THEN * * Full matrix * CALL PB_INFOG2L( IA, JA, DESCA2, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = PB_NUMROC( M, IA, DESCA2( IMB_ ), DESCA2( MB_ ), MYROW, $ DESCA2( RSRC_ ), NPROW ) NQ = PB_NUMROC( N, JA, DESCA2( INB_ ), DESCA2( NB_ ), MYCOL, $ DESCA2( CSRC_ ), NPCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * LDA = DESCA2( LLD_ ) IOFFA = IIA + ( JJA - 1 ) * LDA * CALL PB_SLASCAL( 'All', MP, NQ, 0, ALPHA, A( IOFFA ), LDA ) * ELSE * * Trapezoidal matrix * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, * LNBLOC, ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) LDA = DESCA2( LLD_ ) * CALL PB_BINFO( IOFFD, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * M1 = MP N1 = NQ IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and * update LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_SLASCAL( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_SLASCAL( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_SLASCAL( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_SLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_SLASCAL( UPLO, MBLOC, INBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_SLASCAL( 'All', TMP1, INBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_SLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_SLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_SLASCAL( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_SLASCAL( 'All', IMBLOC, TMP1, 0, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_SLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_SLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_SLASCAL( UPLO, MBLOC, NBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_SLASCAL( 'All', TMP1, NBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_SLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * END IF * RETURN * * End of PSLASCAL * END SUBROUTINE PSLAGEN( INPLACE, AFORM, DIAG, OFFA, M, N, IA, JA, $ DESCA, IASEED, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE CHARACTER*1 AFORM, DIAG INTEGER IA, IASEED, JA, LDA, M, N, OFFA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( LDA, * ) * .. * * Purpose * ======= * * PSLAGEN generates (or regenerates) a submatrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the generated submatrix is diago- * nally dominant or not as follows: * DIAG = 'D' : sub( A ) is diagonally dominant, * DIAG = 'N' : sub( A ) is not diagonally dominant. * * OFFA (global input) INTEGER * On entry, OFFA specifies the offdiagonal of the underlying * matrix A(1:DESCA(M_),1:DESCA(N_)) of interest when the subma- * trix is symmetric, Hermitian or diagonally dominant. OFFA = 0 * specifies the main diagonal, OFFA > 0 specifies a subdiago- * nal, and OFFA < 0 specifies a superdiagonal (see further de- * tails). * * M (global input) INTEGER * On entry, M specifies the global number of matrix rows of the * submatrix sub( A ) to be generated. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of matrix columns of * the submatrix sub( A ) to be generated. N must be at least * zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IASEED (global input) INTEGER * On entry, IASEED specifies the seed number to generate the * matrix A. IASEED must be at least zero. * * A (local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). On exit, this array contains the * local entries of the randomly generated submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. When INPLACE is .FALSE., LDA is usually DESCA(LLD_). * This restriction is however not enforced, and this subroutine * requires only that LDA >= MAX( 1, Mp ) where * * Mp = PB_NUMROC( M, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ). * * PB_NUMROC is a ScaLAPACK tool function; MYROW, MYCOL, NPROW * and NPCOL can be determined by calling the BLACS subroutine * BLACS_GRIDINFO. * * Further Details * =============== * * OFFD is tied to the matrix described by DESCA, as opposed to the * piece that is currently (re)generated. This is a global information * independent from the distribution parameters. Below are examples of * the meaning of OFFD for a global 7 by 5 matrix: * * --------------------------------------------------------------------- * OFFD | 0 -1 -2 -3 -4 0 -1 -2 -3 -4 0 -1 -2 -3 -4 * -------|------------------------------------------------------------- * | | OFFD=-1 | OFFD=0 OFFD=2 * | V V * 0 | . d . . . -> d . . . . . . . . . * 1 | . . d . . . d . . . . . . . . * 2 | . . . d . . . d . . -> d . . . . * 3 | . . . . d . . . d . . d . . . * 4 | . . . . . . . . . d . . d . . * 5 | . . . . . . . . . . . . . d . * 6 | . . . . . . . . . . . . . . d * --------------------------------------------------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. LOGICAL DIAGDO, SYMM, HERM, NOTRAN INTEGER CSRC, I, IACOL, IAROW, ICTXT, IIA, ILOCBLK, $ ILOCOFF, ILOW, IMB, IMB1, IMBLOC, IMBVIR, INB, $ INB1, INBLOC, INBVIR, INFO, IOFFDA, ITMP, IUPP, $ IVIR, JJA, JLOCBLK, JLOCOFF, JVIR, LCMT00, $ LMBLOC, LNBLOC, LOW, MAXMN, MB, MBLKS, MP, $ MRCOL, MRROW, MYCDIST, MYCOL, MYRDIST, MYROW, $ NB, NBLKS, NPCOL, NPROW, NQ, NVIR, RSRC, UPP REAL ALPHA * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ), IMULADD( 4, JMP_LEN ), $ IRAN( 2 ), JMP( JMP_LEN ), MULADD0( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CHKMAT, PB_DESCTRANS, PB_INITJMP, $ PB_INITMULADD, PB_JUMP, PB_JUMPIT, PB_LOCINFO, $ PB_SETLOCRAN, PB_SETRAN, PB_SLAGEN, PSLADOM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, REAL * .. * .. Data Statements .. DATA ( MULADD0( I ), I = 1, 4 ) / 20077, 16838, $ 12345, 0 / * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Test the input arguments * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1000 + CTXT_ ) ELSE SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) NOTRAN = LSAME( AFORM, 'N' ) DIAGDO = LSAME( DIAG, 'D' ) IF( .NOT.( SYMM.OR.HERM.OR.NOTRAN ) .AND. $ .NOT.( LSAME( AFORM, 'T' ) ) .AND. $ .NOT.( LSAME( AFORM, 'C' ) ) ) THEN INFO = -2 ELSE IF( ( .NOT.DIAGDO ) .AND. $ ( .NOT.LSAME( DIAG, 'N' ) ) ) THEN INFO = -3 END IF CALL PB_CHKMAT( ICTXT, M, 5, N, 6, IA, JA, DESCA2, 10, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSLAGEN', -INFO ) RETURN END IF * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) IMB = DESCA2( IMB_ ) INB = DESCA2( INB_ ) RSRC = DESCA2( RSRC_ ) CSRC = DESCA2( CSRC_ ) * * Figure out local information about the distributed matrix operand * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * IOFFDA = JA + OFFA - IA CALL PB_BINFO( IOFFDA, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * * Initialize ILOCBLK, ILOCOFF, MYRDIST, JLOCBLK, JLOCOFF, MYCDIST * This values correspond to the square virtual underlying matrix * of size MAX( M_ + MAX( 0, -OFFA ), N_ + MAX( 0, OFFA ) ) used * to set up the random sequence. For practical purposes, the size * of this virtual matrix is upper bounded by M_ + N_ - 1. * ITMP = MAX( 0, -OFFA ) IVIR = IA + ITMP IMBVIR = IMB + ITMP NVIR = DESCA2( M_ ) + ITMP * CALL PB_LOCINFO( IVIR, IMBVIR, MB, MYROW, RSRC, NPROW, ILOCBLK, $ ILOCOFF, MYRDIST ) * ITMP = MAX( 0, OFFA ) JVIR = JA + ITMP INBVIR = INB + ITMP NVIR = MAX( MAX( NVIR, DESCA2( N_ ) + ITMP ), $ DESCA2( M_ ) + DESCA2( N_ ) - 1 ) * CALL PB_LOCINFO( JVIR, INBVIR, NB, MYCOL, CSRC, NPCOL, JLOCBLK, $ JLOCOFF, MYCDIST ) * IF( SYMM .OR. HERM .OR. NOTRAN ) THEN * CALL PB_INITJMP( .TRUE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 1, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_SLAGEN( 'Lower', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( SYMM .OR. HERM .OR. ( .NOT. NOTRAN ) ) THEN * CALL PB_INITJMP( .FALSE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 1, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_SLAGEN( 'Upper', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( DIAGDO ) THEN * MAXMN = MAX( DESCA2( M_ ), DESCA2( N_ ) ) ALPHA = REAL( MAXMN ) * IF( IOFFDA.GE.0 ) THEN CALL PSLADOM( INPLACE, MIN( MAX( 0, M-IOFFDA ), N ), ALPHA, $ A, MIN( IA+IOFFDA, IA+M-1 ), JA, DESCA ) ELSE CALL PSLADOM( INPLACE, MIN( M, MAX( 0, N+IOFFDA ) ), ALPHA, $ A, IA, MIN( JA-IOFFDA, JA+N-1 ), DESCA ) END IF * END IF * RETURN * * End of PSLAGEN * END SUBROUTINE PSLADOM( INPLACE, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE INTEGER IA, JA, N REAL ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLADOM adds alpha to the diagonal entries of an n by n submatrix * sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * N (global input) INTEGER * On entry, N specifies the global order of the submatrix * sub( A ) to be modified. N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the local entries * of this array corresponding to the main diagonal of sub( A ) * have been updated. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP REAL ATMP * .. * .. Local Scalars .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.EQ.0 ) $ RETURN * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) * CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA+LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PSLADOM * END SUBROUTINE PB_SLASCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N REAL ALPHA * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * PB_SLASCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (input/output) REAL array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA * A( I, J ) 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = ALPHA * A( I, J ) 50 CONTINUE 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA * A( I, J ) 100 CONTINUE 110 CONTINUE * END IF * RETURN * * End of PB_SLASCAL * END SUBROUTINE PB_SLAGEN( UPLO, AFORM, A, LDA, LCMT00, IRAN, MBLKS, $ IMBLOC, MB, LMBLOC, NBLKS, INBLOC, NB, $ LNBLOC, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO, AFORM INTEGER IMBLOC, INBLOC, LCMT00, LDA, LMBLOC, LNBLOC, $ MB, MBLKS, NB, NBLKS * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) REAL A( LDA, * ) * .. * * Purpose * ======= * * PB_SLAGEN locally initializes an array A. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the lower (UPLO='L') trape- * zoidal part or the upper (UPLO='U') trapezoidal part is to be * generated when the matrix to be generated is symmetric or * Hermitian. For all the other values of AFORM, the value of * this input argument is ignored. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * A (local output) REAL array * On entry, A is an array of dimension (LLD_A, *). On exit, * this array contains the local entries of the randomly genera- * ted submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. LDA must be at least one. * * LCMT00 (global input) INTEGER * On entry, LCMT00 is the LCM value specifying the off-diagonal * of the underlying matrix of interest. LCMT00=0 specifies the * main diagonal, LCMT00 > 0 specifies a subdiagonal, LCMT00 < 0 * specifies superdiagonals. * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2 containing respec- * tively the 16-lower and 16-higher bits of the encoding of the * entry of the random sequence corresponding locally to the * first local array entry to generate. Usually, this array is * computed by PB_SETLOCRAN. * * MBLKS (local input) INTEGER * On entry, MBLKS specifies the local number of blocks of rows. * MBLKS is at least zero. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * LMBLOC (local input) INTEGER * On entry, LMBLOC specifies the number of rows (size) of the * local lowest blocks. LMBLOC is at least zero. * * NBLKS (local input) INTEGER * On entry, NBLKS specifies the local number of blocks of co- * lumns. NBLKS is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * LNBLOC (local input) INTEGER * On entry, LNBLOC specifies the number of columns (size) of * the local rightmost blocks. LNBLOC is at least zero. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the random matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. INTEGER I, IB, IBLK, II, IK, ITMP, JB, JBLK, JJ, JK, $ JTMP, LCMTC, LCMTR, LOW, MNB, UPP REAL DUMMY * .. * .. Local Arrays .. INTEGER IB0( 2 ), IB1( 2 ), IB2( 2 ), IB3( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMPIT * .. * .. External Functions .. LOGICAL LSAME REAL PB_SRAND EXTERNAL LSAME, PB_SRAND * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * DO 10 I = 1, 2 IB1( I ) = IRAN( I ) IB2( I ) = IRAN( I ) IB3( I ) = IRAN( I ) 10 CONTINUE * IF( LSAME( AFORM, 'N' ) ) THEN * * Generate random matrix * JJ = 1 * DO 50 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * DO 40 JK = JJ, JJ + JB - 1 * II = 1 * DO 30 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * * Blocks are IB by JB * DO 20 IK = II, II + IB - 1 A( IK, JK ) = PB_SRAND( 0 ) 20 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 30 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 40 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 50 CONTINUE * ELSE IF( LSAME( AFORM, 'T' ) .OR. LSAME( AFORM, 'C' ) ) THEN * * Generate the transpose of the matrix that would be normally * generated. * II = 1 * DO 90 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 80 IK = II, II + IB - 1 * JJ = 1 * DO 70 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 60 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_SRAND( 0 ) 60 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 70 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 80 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 90 CONTINUE * ELSE IF( ( LSAME( AFORM, 'S' ) ).OR.( LSAME( AFORM, 'H' ) ) ) THEN * * Generate a symmetric matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 170 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 160 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 150 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 100 IK = II, II + IB - 1 DUMMY = PB_SRAND( 0 ) 100 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 110 IK = II, II + IB - 1 A( IK, JK ) = PB_SRAND( 0 ) 110 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 120 IK = II, ITMP - 1 DUMMY = PB_SRAND( 0 ) 120 CONTINUE * DO 130 IK = ITMP, II + IB - 1 A( IK, JK ) = PB_SRAND( 0 ) 130 CONTINUE * END IF * ELSE * DO 140 IK = II, II + IB - 1 A( IK, JK ) = PB_SRAND( 0 ) 140 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 150 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 160 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 170 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 250 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 240 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 230 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 180 JK = JJ, JJ + JB - 1 DUMMY = PB_SRAND( 0 ) 180 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 190 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_SRAND( 0 ) 190 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 200 JK = JJ, JTMP - 1 DUMMY = PB_SRAND( 0 ) 200 CONTINUE * DO 210 JK = JTMP, JJ + JB - 1 A( IK, JK ) = PB_SRAND( 0 ) 210 CONTINUE * END IF * ELSE * DO 220 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_SRAND( 0 ) 220 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 230 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 240 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 250 CONTINUE * END IF * END IF * RETURN * * End of PB_SLAGEN * END REAL FUNCTION PB_SRAND( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_SRAND generates the next number in the random sequence. This func- * tion ensures that this number will be in the interval ( -1.0, 1.0 ). * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. External Functions .. REAL PB_SRAN EXTERNAL PB_SRAN * .. * .. Executable Statements .. * PB_SRAND = ONE - TWO * PB_SRAN( IDUMM ) * RETURN * * End of PB_SRAND * END REAL FUNCTION PB_SRAN( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_SRAN generates the next number in the random sequence. * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL DIVFAC, POW16 PARAMETER ( DIVFAC = 2.147483648E+9, $ POW16 = 6.5536E+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Intrinsic Functions .. INTRINSIC REAL * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * PB_SRAN = ( REAL( IRAND( 1 ) ) + POW16 * REAL( IRAND( 2 ) ) ) / $ DIVFAC * CALL PB_LMUL( IRAND, IACS, J ) CALL PB_LADD( J, IACS( 3 ), IRAND ) * RETURN * * End of PB_SRAN * END scalapack-2.0.2/PBLAS/TIMING/PZBLAS1TIM.dat000644 000766 000024 00000002476 10363532303 017670 0ustar00juliestaff000000 000000 'Level 1 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PZBLAS1TIM.SUMM' output file name (if any) 6 device out 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q (2.0D0, -3.0D0) value of ALPHA 2 number of tests problems 1000 1000 values of N 1000 1 values of M_X 1 1500 values of N_X 32 32 values of IMB_X 32 32 values of INB_X 32 32 values of MB_X 32 32 values of NB_X 0 0 values of RSRC_X 0 0 values of CSRC_X 1 1 values of IX 1 1 values of JX 1 1 values of INCX 1 1 values of M_Y 1000 1500 values of N_Y 32 32 values of IMB_Y 32 32 values of INB_Y 32 32 values of MB_Y 32 32 values of NB_Y 0 0 values of RSRC_Y 0 0 values of CSRC_Y 1 1 values of IY 1 1 values of JY 1 1 values of INCY PZSWAP T put F for no test in the same column PZSCAL T put F for no test in the same column PZDSCAL T put F for no test in the same column PZCOPY T put F for no test in the same column PZAXPY T put F for no test in the same column PZDOTU T put F for no test in the same column PZDOTC T put F for no test in the same column PDZNRM2 T put F for no test in the same column PDZASUM T put F for no test in the same column PZAMAX T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/pzblas1tim.f000644 000766 000024 00000124700 11750130340 017774 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 10) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PZSWAP ', 'PZSCAL ', $ 'PZDSCAL', 'PZCOPY', 'PZAXPY ', $ 'PZDOTU ', 'PZDOTC' , 'PDZNRM2', $ 'PDZASUM', 'PZAMAX '/ END BLOCK DATA PROGRAM PZBLA1TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PZBLA1TIM is the main timing program for the Level 1 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 42 lines: * 'Level 1 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PZBLAS1TIM.SUMM' output file name (if any) * 6 device out * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0D0, 0.0D0) value of ALPHA * 2 number of tests problems * 3 4 values of N * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PZSWAP T put F for no test in the same column * PZSCAL T put F for no test in the same column * PZDSCAL T put F for no test in the same column * PZCOPY T put F for no test in the same column * PZAXPY T put F for no test in the same column * PZDOTU T put F for no test in the same column * PZDOTC T put F for no test in the same column * PDZNRM2 T put F for no test in the same column * PDZASUM T put F for no test in the same column * PZAMAX T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * ZPLXSZ INTEGER * DBLESZ and ZPLXSZ indicate the length in bytes on the given * platform for a double precision real and a double precision * complex. By default, DBLESZ is set to eight and ZPLXSZ is set * to sixteen. * * MEM COMPLEX*16 array * MEM is an array of dimension TOTMEM / ZPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, ZPLXSZ, TOTMEM, MEMSIZ, $ NSUBS PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, ZPLXSZ = 16, $ TOTMEM = 2000000, NSUBS = 10, $ MEMSIZ = TOTMEM / ZPLXSZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER CSRCX, CSRCY, I, IAM, ICTXT, IMBX, IMBY, IMIDX, $ IMIDY, INBX, INBY, INCX, INCY, IPOSTX, IPOSTY, $ IPREX, IPREY, IPX, IPY, IX, IXSEED, IY, IYSEED, $ J, JX, JY, K, MBX, MBY, MEMREQD, MPX, MPY, MX, $ MY, MYCOL, MYROW, N, NBX, NBY, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQX, NQY, NTESTS, NX, NY, $ PISCLR, RSRCX, RSRCY DOUBLE PRECISION ADDS, CFLOPS, MULTS, NOPS, PUSCLR, WFLOPS COMPLEX*16 ALPHA, PSCLR * .. * .. Local Arrays .. CHARACTER*80 OUTFILE LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) INTEGER CSCXVAL( MAXTESTS ), CSCYVAL( MAXTESTS ), $ DESCX( DLEN_ ), DESCY( DLEN_ ), IERR( 2 ), $ IMBXVAL( MAXTESTS ), IMBYVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JXVAL( MAXTESTS ), JYVAL( MAXTESTS ), $ MBXVAL( MAXTESTS ), MBYVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCXVAL( MAXTESTS ), $ RSCYVAL( MAXTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PDZASUM, PDZNRM2, PVDESCCHK, $ PVDIMCHK, PZAMAX, PZAXPY, PZBLA1TIMINFO, $ PZCOPY, PZDOTC, PZDOTU, PZDSCAL, PZLAGEN, $ PZSCAL, PZSWAP * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .FALSE., .FALSE., .TRUE., $ .TRUE., .TRUE., .TRUE., .FALSE., .FALSE., $ .FALSE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * Seeds for random matrix generations. * IXSEED = 100 IYSEED = 200 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PZBLA1TIMINFO( OUTFILE, NOUT, NTESTS, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, RSCXVAL, $ CSCXVAL, IXVAL, JXVAL, INCXVAL, MYVAL, $ NYVAL, IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, INCYVAL, $ MAXTESTS, NGRIDS, PVAL, MAXGRIDS, QVAL, $ MAXGRIDS, LTEST, IAM, NPROCS, ALPHA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * N = NVAL( J ) MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9983 ) END IF * * Check the validity of the input and initialize DESC_ * CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, 0, 0, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, 0, 0, IERR( 2 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 ) $ GO TO 40 * * Assign pointers into MEM for matrices corresponding to * vectors X and Y. Ex: IPX starts at position MEM( 1 ). * IPX = 1 IPY = IPX + DESCX( LLD_ ) * NQX * * Check if sufficient memory. * MEMREQD = IPY + DESCY( LLD_ ) * NQY - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) MEMREQD*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) GO TO 40 END IF * * Loop over all PBLAS 1 routines * DO 30 K = 1, NSUBS * * Continue only if this sub has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * * Check the validity of the operand sizes * CALL PVDIMCHK( ICTXT, NOUT, N, 'X', IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, N, 'Y', IY, JY, DESCY, INCY, $ IERR( 2 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 ) $ GO TO 30 * * Generate distributed matrices X and Y * CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the PBLAS routine * IF( K.EQ.1 ) THEN * * Test PZSWAP * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PZSWAP( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.2 ) THEN * * Test PZSCAL * ADDS = 0.0D+0 MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PZSCAL( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.3 ) THEN * * Test PZDSCAL * ADDS = 0.0D+0 MULTS = DBLE( 2*N ) CALL PB_TIMER( 1 ) CALL PZDSCAL( N, DBLE( ALPHA ), MEM( IPX ), IX, JX, $ DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.4 ) THEN * * Test PZCOPY * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PZCOPY( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.5 ) THEN * * Test PZAXPY * ADDS = DBLE( 2*N ) MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PZAXPY( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.6 ) THEN * * Test PZDOTU * ADDS = DBLE( 2 * ( N - 1 ) ) MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PZDOTU( N, PSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.7 ) THEN * * Test PZDOTC * ADDS = DBLE( 2 * ( N - 1 ) ) MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PZDOTC( N, PSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.8 ) THEN * * Test PDZNRM2 * ADDS = DBLE( 2 * ( N - 1 ) ) MULTS = DBLE( 6*N ) CALL PB_TIMER( 1 ) CALL PDZNRM2( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.9 ) THEN * * Test PDZASUM * ADDS = DBLE( 2 * ( N - 1 ) ) MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PDZASUM( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.10 ) THEN * ADDS = 0.0D+0 MULTS = 0.0D+0 CALL PB_TIMER( 1 ) CALL PZAMAX( N, PSCLR, PISCLR, MEM( IPX ), IX, JX, $ DESCX, INCX ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Calculate total flops * NOPS = ADDS + MULTS * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9984 ) SNAMES( K ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9988 ) J END IF * 50 CONTINUE * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, '---------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' N IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9993 FORMAT( 2X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I5,1X,I5,1X,I5,1X,I5,1X, $ I5,1X,I5,1X,I6 ) 9992 FORMAT( 2X, ' N IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9991 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9990 FORMAT( 'Not enough memory. Need: ', I12 ) 9988 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9987 FORMAT( 2X, 'End of Tests.' ) 9986 FORMAT( 2X, 'Tests started.' ) 9985 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9984 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9983 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PZBLA1TIM * END SUBROUTINE PZBLA1TIMINFO( SUMMRY, NOUT, NMAT, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, $ RSCXVAL, CSCXVAL, IXVAL, JXVAL, $ INCXVAL, MYVAL, NYVAL, IMBYVAL, MBYVAL, $ INBYVAL, NBYVAL, RSCYVAL, CSCYVAL, $ IYVAL, JYVAL, INCYVAL, LDVAL, NGRIDS, $ PVAL, LDPVAL, QVAL, LDQVAL, LTEST, IAM, $ NPROCS, ALPHA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NGRIDS, NMAT, NOUT, $ NPROCS COMPLEX*16 ALPHA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY LOGICAL LTEST( * ) INTEGER CSCXVAL( LDVAL ), CSCYVAL( LDVAL ), $ IMBXVAL( LDVAL ), IMBYVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JXVAL( LDVAL ), $ JYVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCXVAL( LDVAL ), RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PZBLA1TIMINFO get the needed startup information for timing various * Level 1 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DESCX(:), IX, JX, INCX, DESCY(:), * IY, JY and INCY. This is also the maximum number of test * cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least ten. On * exit, if LTEST( i ) is .TRUE., the i-th Level 1 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) COMPLEX*16 * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 2, 2*NGRIDS+23*NMAT+NSUBS ) with NSUBS = 10. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 10 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D, ZGEBR2D, ZGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PZBLAS1TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 100 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 100 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA * READ( NIN, FMT = * ) ALPHA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 100 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 100 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, WORK, 2 ) * I = 1 CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 70 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 70 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 1 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Timing of the complex double precision '// $ 'Level 1 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9989 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9989 ) SNAMES( 1 ), ' ... No ' END IF DO 80 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( I ), ' ... No ' END IF 80 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, WORK, 2, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) * I = 2*NGRIDS + 23*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 90 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 90 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 100 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9989 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9988 FORMAT( 2X, ' ', A, A8 ) * * End of PZBLA1TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/PZBLAS2TIM.dat000644 000766 000024 00000006077 10363532303 017672 0ustar00juliestaff000000 000000 'Level 2 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PZBLAS2TIM.SUMM' output file name (if any) 6 device out 10 value of the logical computational blocksize NB 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q (2.0D0, -4.0D0) value of ALPHA (3.0D0, -2.0D0) value of BETA 16 number of tests problems 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' values of UPLO 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANS 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' values of DIAG 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 values of M 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 values of N 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_A 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_A 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_A 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_A 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IA 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JA 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_X 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_X 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_X 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_X 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IX 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JX 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of INCX 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of M_Y 6 10 6 10 6 10 6 10 6 10 6 10 6 10 6 10 values of N_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of IMB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of INB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of MB_Y 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 values of NB_Y 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 values of RSRC_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IY 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JY 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 values of INCY PZGEMV T put F for no test in the same column PZHEMV T put F for no test in the same column PZTRMV T put F for no test in the same column PZTRSV T put F for no test in the same column PZGERU T put F for no test in the same column PZGERC T put F for no test in the same column PZHER T put F for no test in the same column PZHER2 T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/pzblas2tim.f000644 000766 000024 00000155040 11750130340 017776 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PZGEMV ', 'PZHEMV ', 'PZTRMV ', $ 'PZTRSV ', 'PZGERU ', 'PZGERC ', $ 'PZHER ', 'PZHER2 '/ END BLOCK DATA PROGRAM PZBLA2TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PZBLA2TIM is the main timing program for the Level 2 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 56 lines: * 'Level 2 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PZBLAS2TIM.SUMM' output file name (if any) * 6 device out * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0D0, 0.0D0) value of ALPHA * (1.0D0, 0.0D0) value of BETA * 2 number of tests problems * 'U' 'L' values of UPLO * 'N' 'T' values of TRANS * 'N' 'U' values of DIAG * 3 4 values of M * 3 4 values of N * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PZGEMV T put F for no test in the same column * PZHEMV T put F for no test in the same column * PZTRMV T put F for no test in the same column * PZTRSV T put F for no test in the same column * PZGERU T put F for no test in the same column * PZGERC T put F for no test in the same column * PZHER T put F for no test in the same column * PZHER2 T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * ZPLXSZ INTEGER * DBLESZ and ZPLXSZ indicate the length in bytes on the given * platform for a double precision real and a double precision * complex. By default, DBLESZ is set to eight and ZPLXSZ is set * to sixteen. * * MEM COMPLEX*16 array * MEM is an array of dimension TOTMEM / ZPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, ZPLXSZ, TOTMEM, MEMSIZ, $ NSUBS COMPLEX*16 ONE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, ZPLXSZ = 16, $ ONE = ( 1.0D+0, 0.0D+0 ), TOTMEM = 2000000, $ NSUBS = 8, MEMSIZ = TOTMEM / ZPLXSZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 AFORM, DIAG, DIAGDO, TRANS, UPLO INTEGER CSRCA, CSRCX, CSRCY, I, IA, IAM, IASEED, ICTXT, $ IMBA, IMBX, IMBY, IMIDA, IMIDX, IMIDY, INBA, $ INBX, INBY, INCX, INCY, IPA, IPOSTA, IPOSTX, $ IPOSTY, IPREA, IPREX, IPREY, IPX, IPY, IX, $ IXSEED, IY, IYSEED, J, JA, JX, JY, K, M, MA, $ MBA, MBX, MBY, MEMREQD, MPA, MPX, MPY, MX, MY, $ MYCOL, MYROW, N, NA, NBA, NBX, NBY, NCOLA, $ NGRIDS, NLX, NLY, NOUT, NPCOL, NPROCS, NPROW, $ NQA, NQX, NQY, NROWA, NTESTS, NX, NY, OFFD, $ RSRCA, RSRCX, RSRCY DOUBLE PRECISION CFLOPS, NOPS, WFLOPS COMPLEX*16 ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), TRANVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCXVAL( MAXTESTS ), $ CSCYVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ), $ IAVAL( MAXTESTS ), IERR( 3 ), $ IMBAVAL( MAXTESTS ), IMBXVAL( MAXTESTS ), $ IMBYVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JXVAL( MAXTESTS ), $ JYVAL( MAXTESTS ), MAVAL( MAXTESTS ), $ MBAVAL( MAXTESTS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NAVAL( MAXTESTS ), NBAVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCAVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PMDESCCHK, PMDIMCHK, PVDESCCHK, $ PVDIMCHK, PZBLA2TIMINFO, PZGEMV, PZGERC, $ PZGERU, PZHEMV, PZHER, PZHER2, PZLAGEN, $ PZLASCAL, PZTRMV, PZTRSV * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDOPBL2 EXTERNAL LSAME, PDOPBL2 * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .TRUE., .FALSE., .FALSE., $ .TRUE., .TRUE., .FALSE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .TRUE. * * Seeds for random matrix generations. * IASEED = 100 IXSEED = 200 IYSEED = 300 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PZBLA2TIMINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, $ IAVAL, JAVAL, MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, $ CSCYVAL, IYVAL, JYVAL, INCYVAL, MAXTESTS, $ NGRIDS, PVAL, MAXGRIDS, QVAL, MAXGRIDS, $ NBLOG, LTEST, IAM, NPROCS, ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) TRANS = TRANVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) * MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, UPLO, TRANS, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9980 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' GO TO 40 END IF * IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANS' GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) TRANS WRITE( NOUT, FMT = 9997 ) 'DIAG' GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, 0, 0, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, 0, 0, IERR( 2 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, 0, 0, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN GO TO 40 END IF * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = 1 IPX = IPA + DESCA( LLD_ ) * NQA IPY = IPX + DESCX( LLD_ ) * NQX * * Check if sufficient memory. * MEMREQD = IPY + DESCY( LLD_ ) * NQY - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) MEMREQD*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) GO TO 40 END IF * * Loop over all PBLAS 2 routines * DO 30 K = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * * Define the size of the operands * IF( K.EQ.1 ) THEN NROWA = M NCOLA = N IF( LSAME( TRANS, 'N' ) ) THEN NLX = N NLY = M ELSE NLX = M NLY = N END IF ELSE IF( K.EQ.5 .OR. K.EQ.6 ) THEN NROWA = M NCOLA = N NLX = M NLY = N ELSE NROWA = N NCOLA = N NLX = N NLY = N END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLX, 'X', IX, JX, DESCX, $ INCX, IERR( 2 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLY, 'Y', IY, JY, DESCY, $ INCY, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN GO TO 30 END IF * * Generate distributed matrices A, X and Y * IF( K.EQ.2 .OR. K.EQ.7 .OR. K.EQ.8 ) THEN AFORM = 'H' DIAGDO = 'N' OFFD = IA - JA ELSE IF( ( K.EQ.4 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN AFORM = 'N' DIAGDO = 'D' OFFD = IA - JA ELSE AFORM = 'N' DIAGDO = 'N' OFFD = 0 END IF * CALL PZLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, $ 1, 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MY, $ NY, 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * IF( ( K.EQ.4 ).AND.( .NOT.( LSAME( DIAG, 'N' ) ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / DCMPLX( DBLE( MAX( NROWA, NCOLA ) ) ) IF( LSAME( UPLO, 'L' ) ) THEN CALL PZLASCAL( 'Lower', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA+1, JA, DESCA ) ELSE CALL PZLASCAL( 'Upper', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA, JA+1, DESCA ) END IF END IF * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the Level 2 PBLAS routine * IF( K.EQ.1 ) THEN * * Test PZGEMV * CALL PB_TIMER( 1 ) CALL PZGEMV( TRANS, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.2 ) THEN * * Test PZHEMV * CALL PB_TIMER( 1 ) CALL PZHEMV( UPLO, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.3 ) THEN * * Test PZTRMV * CALL PB_TIMER( 1 ) CALL PZTRMV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.4 ) THEN * * Test PZTRSV * CALL PB_TIMER( 1 ) CALL PZTRSV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.5 ) THEN * * Test PZGERU * CALL PB_TIMER( 1 ) CALL PZGERU( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.6 ) THEN * * Test PZGERC * CALL PB_TIMER( 1 ) CALL PZGERC( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.7 ) THEN * * Test PZHER * CALL PB_TIMER( 1 ) CALL PZHER( UPLO, N, DBLE( ALPHA ), MEM( IPX ), IX, $ JX, DESCX, INCX, MEM( IPA ), IA, JA, $ DESCA ) CALL PB_TIMER( 1 ) * ELSE IF( K.EQ.8 ) THEN * * Test PZHER2 * CALL PB_TIMER( 1 ) CALL PZHER2( UPLO, N, ALPHA, MEM( IPX ), IX, JX, $ DESCX, INCX, MEM( IPY ), IY, JY, DESCY, $ INCY, MEM( IPA ), IA, JA, DESCA ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Calculate total flops * NOPS = PDOPBL2( SNAMES( K ), NROWA, NCOLA, 0, 0 ) * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9981 ) SNAMES( K ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' M N UPLO TRANS DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,9X,A1,11X,A1,10X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9989 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5,1X,I6 ) 9988 FORMAT( 2X, ' IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9987 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9986 FORMAT( 'Not enough memory. Need: ', I12 ) 9985 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9984 FORMAT( 2X, 'End of Tests.' ) 9983 FORMAT( 2X, 'Tests started.' ) 9982 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9981 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9980 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PZBLA2TIM * END SUBROUTINE PZBLA2TIMINFO( SUMMRY, NOUT, NMAT, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, $ IMBAVAL, MBAVAL, INBAVAL, NBAVAL, $ RSCAVAL, CSCAVAL, IAVAL, JAVAL, $ MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, $ IXVAL, JXVAL, INCXVAL, MYVAL, NYVAL, $ IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, $ INCYVAL, LDVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, NBLOG, LTEST, IAM, NPROCS, $ ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NBLOG, NGRIDS, $ NMAT, NOUT, NPROCS COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), TRANVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCXVAL( LDVAL ), $ CSCYVAL( LDVAL ), IAVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBXVAL( LDVAL ), $ IMBYVAL( LDVAL ), INBAVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JAVAL( LDVAL ), $ JXVAL( LDVAL ), JYVAL( LDVAL ), MAVAL( LDVAL ), $ MBAVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NAVAL( LDVAL ), $ NBAVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCXVAL( LDVAL ), $ RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PZBLA2TIMINFO get the needed startup information for timing various * Level 2 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * TRANVAL (global output) CHARACTER array * On entry, TRANVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANS to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, TRANS, UPLO, M, N, DESCA(:), * IA, JA, DESCX(:), IX, JX, INCX, DESCY(:), IY, JY and INCY. * This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 2 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) COMPLEX*16 * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) COMPLEX*16 * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+37*NMAT+NSUBS ) with NSUBS = 8. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D, ZGEBR2D, ZGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PZBLAS2TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRANVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( TRANVAL( J ) ) WORK( I+2 ) = ICHAR( UPLOVAL( J ) ) I = I + 3 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 2 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex double precision '// $ 'Level 2 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) NMAT WRITE( NOUT, FMT = 9986 ) NBLOG WRITE( NOUT, FMT = 9991 ) NGRIDS WRITE( NOUT, FMT = 9989 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9989 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA WRITE( NOUT, FMT = 9993 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 1, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 37*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) TRANVAL( J ) = CHAR( WORK( I+1 ) ) UPLOVAL( J ) = CHAR( WORK( I+2 ) ) I = I + 3 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9993 FORMAT( 2X, 'Beta : (', G16.6, $ ',', G16.6, ')' ) 9992 FORMAT( 2X, 'Number of Tests : ', I6 ) 9991 FORMAT( 2X, 'Number of process grids : ', I6 ) 9990 FORMAT( 2X, ' : ', 5I6 ) 9989 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9987 FORMAT( 2X, ' ', A, A8 ) 9986 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PZBLA2TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/PZBLAS3TIM.dat000644 000766 000024 00000004563 10363532303 017671 0ustar00juliestaff000000 000000 'Level 3 PBLAS, Timing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PZBLAS3TIM.SUMM' output file name (if any) 6 device out 10 value of the logical computational blocksize NB 1 number of process grids (ordered pairs of P & Q) 2 2 1 4 2 3 8 values of P 2 2 4 1 3 2 1 values of Q (2.0D0, -4.0D0) value of ALPHA (3.0D0, -2.0D0) value of BETA 8 number of tests problems 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' values of DIAG 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' values of SIDE 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANSA 'N' 'N' 'T' 'T' 'N' 'N' 'T' 'T' values of TRANSB 'U' 'L' 'U' 'L' 'U' 'L' 'U' 'L' values of UPLO 3 4 3 4 3 4 3 4 values of M 3 4 3 4 3 4 3 4 values of N 3 4 3 4 3 4 3 4 values of K 6 10 6 10 6 10 6 10 values of M_A 6 10 6 10 6 10 6 10 values of N_A 2 5 2 5 2 5 2 5 values of IMB_A 2 5 2 5 2 5 2 5 values of INB_A 2 5 2 5 2 5 2 5 values of MB_A 2 5 2 5 2 5 2 5 values of NB_A 0 1 0 1 0 1 0 1 values of RSRC_A 0 0 0 0 0 0 0 0 values of CSRC_A 1 1 1 1 1 1 1 1 values of IA 1 1 1 1 1 1 1 1 values of JA 6 10 6 10 6 10 6 10 values of M_B 6 10 6 10 6 10 6 10 values of N_B 2 5 2 5 2 5 2 5 values of IMB_B 2 5 2 5 2 5 2 5 values of INB_B 2 5 2 5 2 5 2 5 values of MB_B 2 5 2 5 2 5 2 5 values of NB_B 0 1 0 1 0 1 0 1 values of RSRC_B 0 0 0 0 0 0 0 0 values of CSRC_B 1 1 1 1 1 1 1 1 values of IB 1 1 1 1 1 1 1 1 values of JB 6 10 6 10 6 10 6 10 values of M_C 6 10 6 10 6 10 6 10 values of N_C 2 5 2 5 2 5 2 5 values of IMB_C 2 5 2 5 2 5 2 5 values of INB_C 2 5 2 5 2 5 2 5 values of MB_C 2 5 2 5 2 5 2 5 values of NB_C 0 1 0 1 0 1 0 1 values of RSRC_C 0 0 0 0 0 0 0 0 values of CSRC_C 1 1 1 1 1 1 1 1 values of IC 1 1 1 1 1 1 1 1 values of JC PZGEMM T put F for no test in the same column PZSYMM T put F for no test in the same column PZHEMM T put F for no test in the same column PZSYRK T put F for no test in the same column PZHERK T put F for no test in the same column PZSYR2K T put F for no test in the same column PZHER2K T put F for no test in the same column PZTRMM T put F for no test in the same column PZTRSM T put F for no test in the same column PZGEADD T put F for no test in the same column PZTRADD T put F for no test in the same column scalapack-2.0.2/PBLAS/TIMING/pzblas3tim.f000644 000766 000024 00000174571 11750130340 020011 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 11) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PZGEMM ', 'PZSYMM ', 'PZHEMM ', $ 'PZSYRK ', 'PZHERK ', 'PZSYR2K', $ 'PZHER2K', 'PZTRMM ', 'PZTRSM ', $ 'PZGEADD', 'PZTRADD'/ END BLOCK DATA PROGRAM PZBLA3TIM * * -- PBLAS timing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PZBLA3TIM is the main timing program for the Level 3 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 59 lines: * 'Level 3 PBLAS, Timing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PZBLAS3TIM.SUMM' output file name (if any) * 6 device out * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0D0, 0.0D0) value of ALPHA * (1.0D0, 0.0D0) value of BETA * 2 number of tests problems * 'N' 'U' values of DIAG * 'L' 'R' values of SIDE * 'N' 'T' values of TRANSA * 'N' 'T' values of TRANSB * 'U' 'L' values of UPLO * 3 4 values of M * 3 4 values of N * 3 4 values of K * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_B * 6 10 values of N_B * 2 5 values of IMB_B * 2 5 values of INB_B * 2 5 values of MB_B * 2 5 values of NB_B * 0 1 values of RSRC_B * 0 0 values of CSRC_B * 1 1 values of IB * 1 1 values of JB * 6 10 values of M_C * 6 10 values of N_C * 2 5 values of IMB_C * 2 5 values of INB_C * 2 5 values of MB_C * 2 5 values of NB_C * 0 1 values of RSRC_C * 0 0 values of CSRC_C * 1 1 values of IC * 1 1 values of JC * PZGEMM T put F for no test in the same column * PZSYMM T put F for no test in the same column * PZHEMM T put F for no test in the same column * PZSYRK T put F for no test in the same column * PZHERK T put F for no test in the same column * PZSYR2K T put F for no test in the same column * PZHER2K T put F for no test in the same column * PZTRMM T put F for no test in the same column * PZTRSM T put F for no test in the same column * PZGEADD T put F for no test in the same column * PZTRADD T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * ZPLXSZ INTEGER * DBLESZ and ZPLXSZ indicate the length in bytes on the given * platform for a double precision real and a double precision * complex. By default, DBLESZ is set to eight and ZPLXSZ is set * to sixteen. * * MEM COMPLEX*16 array * MEM is an array of dimension TOTMEM / ZPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, ZPLXSZ, TOTMEM, MEMSIZ, $ NSUBS COMPLEX*16 ONE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, ZPLXSZ = 16, $ ONE = ( 1.0D+0, 0.0D+0 ), TOTMEM = 2000000, $ NSUBS = 11, MEMSIZ = TOTMEM / ZPLXSZ ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 ADIAGDO, AFORM, CFORM, DIAG, SIDE, TRANSA, $ TRANSB, UPLO INTEGER CSRCA, CSRCB, CSRCC, I, IA, IAM, IASEED, IB, $ IBSEED, IC, ICSEED, ICTXT, IMBA, IMBB, IMBC, $ IMIDA, IMIDB, IMIDC, INBA, INBB, INBC, IPA, $ IPB, IPC, IPOSTA, IPOSTB, IPOSTC, IPREA, IPREB, $ IPREC, J, JA, JB, JC, K, L, M, MA, MB, MBA, $ MBB, MBC, MC, MEMREQD, MPA, MPB, MPC, MYCOL, $ MYROW, N, NA, NB, NBA, NBB, NBC, NC, NCOLA, $ NCOLB, NCOLC, NGRIDS, NOUT, NPCOL, NPROCS, $ NPROW, NQA, NQB, NQC, NROWA, NROWB, NROWC, $ NTESTS, OFFDA, OFFDC, RSRCA, RSRCB, RSRCC DOUBLE PRECISION CFLOPS, NOPS, WFLOPS COMPLEX*16 ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), BCHECK( NSUBS ), $ CCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), SIDEVAL( MAXTESTS ), $ TRNAVAL( MAXTESTS ), TRNBVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCBVAL( MAXTESTS ), $ CSCCVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCB( DLEN_ ), DESCC( DLEN_ ), $ IAVAL( MAXTESTS ), IBVAL( MAXTESTS ), $ ICVAL( MAXTESTS ), IERR( 3 ), $ IMBAVAL( MAXTESTS ), IMBBVAL( MAXTESTS ), $ IMBCVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBBVAL( MAXTESTS ), INBCVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JBVAL( MAXTESTS ), $ JCVAL( MAXTESTS ), KVAL( MAXTESTS ), $ MAVAL( MAXTESTS ), MBAVAL( MAXTESTS ), $ MBBVAL( MAXTESTS ), MBCVAL( MAXTESTS ), $ MBVAL( MAXTESTS ), MCVAL( MAXTESTS ), $ MVAL( MAXTESTS ), NAVAL( MAXTESTS ), $ NBAVAL( MAXTESTS ), NBBVAL( MAXTESTS ), $ NBCVAL( MAXTESTS ), NBVAL( MAXTESTS ), $ NCVAL( MAXTESTS ), NVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCAVAL( MAXTESTS ), RSCBVAL( MAXTESTS ), $ RSCCVAL( MAXTESTS ) DOUBLE PRECISION CTIME( 1 ), WTIME( 1 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, IGSUM2D, PB_BOOT, PB_COMBINE, $ PB_TIMER, PMDESCCHK, PMDIMCHK, PZBLA3TIMINFO, $ PZGEADD, PZGEMM, PZHEMM, PZHER2K, PZHERK, $ PZLAGEN, PZLASCAL, PZSYMM, PZSYR2K, PZSYRK, $ PZTRADD, PZTRMM, PZTRSM * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDOPBL3 EXTERNAL LSAME, PDOPBL3 * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA BCHECK/.TRUE., .TRUE., .TRUE., .FALSE., $ .FALSE., .TRUE., .TRUE., .TRUE., .TRUE., $ .FALSE., .FALSE./ DATA CCHECK/.TRUE., .TRUE., .TRUE., .TRUE., .TRUE., $ .TRUE., .TRUE., .FALSE., .FALSE., .TRUE., $ .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * Seeds for random matrix generations. * IASEED = 100 IBSEED = 200 ICSEED = 300 * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PZBLA3TIMINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, NVAL, $ KVAL, MAVAL, NAVAL, IMBAVAL, MBAVAL, $ INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, IAVAL, $ JAVAL, MBVAL, NBVAL, IMBBVAL, MBBVAL, $ INBBVAL, NBBVAL, RSCBVAL, CSCBVAL, IBVAL, $ JBVAL, MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, ICVAL, $ JCVAL, MAXTESTS, NGRIDS, PVAL, MAXGRIDS, $ QVAL, MAXGRIDS, NBLOG, LTEST, IAM, NPROCS, $ ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) SIDE = SIDEVAL( J ) TRANSA = TRNAVAL( J ) TRANSB = TRNBVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) K = KVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MB = MBVAL( J ) NB = NBVAL( J ) IMBB = IMBBVAL( J ) MBB = MBBVAL( J ) INBB = INBBVAL( J ) NBB = NBBVAL( J ) RSRCB = RSCBVAL( J ) CSRCB = CSCBVAL( J ) IB = IBVAL( J ) JB = JBVAL( J ) * MC = MCVAL( J ) NC = NCVAL( J ) IMBC = IMBCVAL( J ) MBC = MBCVAL( J ) INBC = INBCVAL( J ) NBC = NBCVAL( J ) RSRCC = RSCCVAL( J ) CSRCC = CSCCVAL( J ) IC = ICVAL( J ) JC = JCVAL( J ) * IF( IAM.EQ.0 ) THEN * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) J, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, K, SIDE, UPLO, TRANSA, $ TRANSB, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IB, JB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IC, JC, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9980 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( SIDE, 'L' ).AND. $ .NOT.LSAME( SIDE, 'R' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'SIDE' GO TO 40 END IF * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' GO TO 40 END IF * IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSA' GO TO 40 END IF * IF( .NOT.LSAME( TRANSB, 'N' ).AND. $ .NOT.LSAME( TRANSB, 'T' ).AND. $ .NOT.LSAME( TRANSB, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSB' GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'DIAG' GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, 0, 0, IERR( 1 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'B', DESCB, $ BLOCK_CYCLIC_2D_INB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB, MPB, NQB, IPREB, $ IMIDB, IPOSTB, 0, 0, IERR( 2 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'C', DESCC, $ BLOCK_CYCLIC_2D_INB, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC, MPC, NQC, IPREC, $ IMIDC, IPOSTC, 0, 0, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN GO TO 40 END IF * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPB = IPA + DESCA( LLD_ )*NQA IPC = IPB + DESCB( LLD_ )*NQB * * Check if sufficient memory. * MEMREQD = IPC + DESCC( LLD_ )*NQC - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) MEMREQD*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) GO TO 40 END IF * * Loop over all PBLAS 3 routines * DO 30 L = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( L ) ) $ GO TO 30 * * Define the size of the operands * IF( L.EQ.1 ) THEN * * PZGEMM * NROWC = M NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF( LSAME( TRANSB, 'N' ) ) THEN NROWB = K NCOLB = N ELSE NROWB = N NCOLB = K END IF ELSE IF( L.EQ.2 .OR. L.EQ.3 ) THEN * * PZSYMM, PZHEMM * NROWC = M NCOLC = N NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF ELSE IF( L.EQ.4 .OR. L.EQ.5 ) THEN * * PZSYRK, PZHERK * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K ELSE NROWA = K NCOLA = N END IF NROWB = 0 NCOLB = 0 ELSE IF( L.EQ.6 .OR. L.EQ.7 ) THEN * * PZSYR2K, PZHER2K * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K NROWB = N NCOLB = K ELSE NROWA = K NCOLA = N NROWB = K NCOLB = N END IF ELSE IF( L.EQ.8 .OR. L.EQ.9 ) THEN * * PZTRMM, PZTRSM * NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF NROWC = 0 NCOLC = 0 ELSE IF( L.EQ.10 .OR. L.EQ.11 ) THEN * * PZGEADD, PZTRADD * IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = N ELSE NROWA = N NCOLA = M END IF NROWC = M NCOLC = N NROWB = 0 NCOLB = 0 * END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWB, NCOLB, 'B', IB, JB, $ DESCB, IERR( 2 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWC, NCOLC, 'C', IC, JC, $ DESCC, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN GO TO 30 END IF * * Check special values of TRANSA for symmetric and * hermitian rank-k and rank-2k updates. * IF( L.EQ.4 .OR. L.EQ.6 ) THEN IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) SNAMES( L ), 'TRANSA' GO TO 30 END IF ELSE IF( L.EQ.5 .OR. L.EQ.7 ) THEN IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) SNAMES( L ), 'TRANSA' GO TO 30 END IF END IF * * Generate distributed matrices A, B and C * IF( L.EQ.2 ) THEN * * PZSYMM * AFORM = 'S' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.3 ) THEN * * PZHEMM * AFORM = 'H' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.4 .OR. L.EQ.6 ) THEN * * PZSYRK, PZSYR2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'S' OFFDC = IC - JC * ELSE IF( L.EQ.5 .OR. L.EQ.7 ) THEN * * PZHERK, PZHER2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'H' OFFDC = IC - JC * ELSE IF( ( L.EQ.9 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN * * PZTRSM * AFORM = 'N' ADIAGDO = 'D' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE * * Default values * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'N' OFFDC = 0 * END IF * CALL PZLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) IF( ( L.EQ.9 ).AND.( .NOT.( LSAME( DIAG, 'N' ) ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / DCMPLX( DBLE( MAX( NROWA, NCOLA ) ) ) IF( LSAME( UPLO, 'L' ) ) THEN CALL PZLASCAL( 'Lower', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA+1, JA, DESCA ) ELSE CALL PZLASCAL( 'Upper', NROWA-1, NCOLA-1, SCALE, $ MEM( IPA ), IA, JA+1, DESCA ) END IF * END IF * IF( BCHECK( L ) ) $ CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCB, IBSEED, MEM( IPB ), $ DESCB( LLD_ ) ) * IF( CCHECK( L ) ) $ CALL PZLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCC, ICSEED, MEM( IPC ), $ DESCC( LLD_ ) ) * INFO = 0 CALL PB_BOOT() CALL BLACS_BARRIER( ICTXT, 'All' ) * * Call the Level 3 PBLAS routine * IF( L.EQ.1 ) THEN * * Test PZGEMM * NOPS = PDOPBL3( SNAMES( L ), M, N, K ) * CALL PB_TIMER( 1 ) CALL PZGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.2 ) THEN * * Test PZSYMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PZSYMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.3 ) THEN * * Test PZHEMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PZHEMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.4 ) THEN * * Test PZSYRK * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PZSYRK( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.5 ) THEN * * Test PZHERK * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PZHERK( UPLO, TRANSA, N, K, DBLE( ALPHA ), $ MEM( IPA ), IA, JA, DESCA, DBLE( BETA ), $ MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.6 ) THEN * * Test PZSYR2K * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PZSYR2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.7 ) THEN * * Test PZHER2K * NOPS = PDOPBL3( SNAMES( L ), N, N, K ) * CALL PB_TIMER( 1 ) CALL PZHER2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, DBLE( BETA ), MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.8 ) THEN * * Test PZTRMM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PZTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.9 ) THEN * * Test PZTRSM * IF( LSAME( SIDE, 'L' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PZTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.10 ) THEN * * Test PZGEADD * NOPS = PDOPBL3( SNAMES( L ), M, N, M ) * CALL PB_TIMER( 1 ) CALL PZGEADD( TRANSA, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, BETA, MEM( IPC ), IC, JC, DESCC ) CALL PB_TIMER( 1 ) * ELSE IF( L.EQ.11 ) THEN * * Test PZTRADD * IF( LSAME( UPLO, 'U' ) ) THEN NOPS = PDOPBL3( SNAMES( L ), M, N, 0 ) ELSE NOPS = PDOPBL3( SNAMES( L ), M, N, 1 ) END IF * CALL PB_TIMER( 1 ) CALL PZTRADD( UPLO, TRANSA, M, N, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) CALL PB_TIMER( 1 ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) INFO GO TO 30 END IF * CALL PB_COMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL PB_COMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Only node 0 prints timing test result * IF( IAM.EQ.0 ) THEN * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN WFLOPS = NOPS / ( WTIME( 1 ) * 1.0D+6 ) ELSE WFLOPS = 0.0D+0 END IF * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN CFLOPS = NOPS / ( CTIME( 1 ) * 1.0D+6 ) ELSE CFLOPS = 0.0D+0 END IF * WRITE( NOUT, FMT = 9981 ) SNAMES( L ), WTIME( 1 ), $ WFLOPS, CTIME( 1 ), CFLOPS * END IF * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9986 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) WRITE( NOUT, FMT = * ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I2 , ' started on a ', I4, ' x ', $ I4, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '-------------------' ) 9994 FORMAT( 2X, ' M N K SIDE UPLO TRANSA ', $ 'TRANSB DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,1X,I6,6X,A1,5X,A1,7X,A1,7X,A1,5X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IB JB MB NB IMBB INBB', $ ' MBB NBB RSRCB CSRCB' ) 9989 FORMAT( 2X, ' IC JC MC NC IMBC INBC', $ ' MBC NBC RSRCC CSRCC' ) 9988 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9987 FORMAT( 'Not enough memory. Need: ', I12 ) 9986 FORMAT( 2X, 'Test number ', I2, ' completed.' ) 9985 FORMAT( 2X, 'End of Tests.' ) 9984 FORMAT( 2X, 'Tests started.' ) 9983 FORMAT( 5X, A, ' ***** ', A, ' has an incorrect value: ', $ ' BYPASS *****' ) 9982 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) 9981 FORMAT( 2X, '| ', A, 2X, F13.3, 2X, F13.3, 2X, F13.3, 2X, F13.3 ) 9980 FORMAT( 2X, ' WALL time (s) WALL Mflops ', $ ' CPU time (s) CPU Mflops' ) * STOP * * End of PZBLA3TIM * END SUBROUTINE PZBLA3TIMINFO( SUMMRY, NOUT, NMAT, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, $ NVAL, KVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, $ CSCAVAL, IAVAL, JAVAL, MBVAL, NBVAL, $ IMBBVAL, MBBVAL, INBBVAL, NBBVAL, $ RSCBVAL, CSCBVAL, IBVAL, JBVAL, $ MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, $ ICVAL, JCVAL, LDVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, NBLOG, LTEST, $ IAM, NPROCS, ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IAM, LDPVAL, LDQVAL, LDVAL, NBLOG, NGRIDS, $ NMAT, NOUT, NPROCS COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), SIDEVAL( LDVAL ), $ TRNAVAL( LDVAL ), TRNBVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCBVAL( LDVAL ), $ CSCCVAL( LDVAL ), IAVAL( LDVAL ), $ IBVAL( LDVAL ), ICVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBBVAL( LDVAL ), $ IMBCVAL( LDVAL ), INBAVAL( LDVAL ), $ INBBVAL( LDVAL ), INBCVAL( LDVAL ), $ JAVAL( LDVAL ), JBVAL( LDVAL ), JCVAL( LDVAL ), $ KVAL( LDVAL ), MAVAL( LDVAL ), MBAVAL( LDVAL ), $ MBBVAL( LDVAL ), MBCVAL( LDVAL ), $ MBVAL( LDVAL ), MCVAL( LDVAL ), MVAL( LDVAL ), $ NAVAL( LDVAL ), NBAVAL( LDVAL ), $ NBBVAL( LDVAL ), NBCVAL( LDVAL ), $ NBVAL( LDVAL ), NCVAL( LDVAL ), NVAL( LDVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCBVAL( LDVAL ), $ RSCCVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PZBLA3TIMINFO get the needed startup information for timing various * Level 3 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * SIDEVAL (global output) CHARACTER array * On entry, SIDEVAL is an array of dimension LDVAL. On exit, * this array contains the values of SIDE to run the code with. * * TRNAVAL (global output) CHARACTER array * On entry, TRNAVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSA to run the code * with. * * TRNBVAL (global output) CHARACTER array * On entry, TRNBVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSB to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * KVAL (global output) INTEGER array * On entry, KVAL is an array of dimension LDVAL. On exit, this * array contains the values of K to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MBVAL (global output) INTEGER array * On entry, MBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( M_ ) to run the code * with. * * NBVAL (global output) INTEGER array * On entry, NBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( N_ ) to run the code * with. * * IMBBVAL (global output) INTEGER array * On entry, IMBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( IMB_ ) to run the * code with. * * MBBVAL (global output) INTEGER array * On entry, MBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( MB_ ) to run the * code with. * * INBBVAL (global output) INTEGER array * On entry, INBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( INB_ ) to run the * code with. * * NBBVAL (global output) INTEGER array * On entry, NBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( NB_ ) to run the * code with. * * RSCBVAL (global output) INTEGER array * On entry, RSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( RSRC_ ) to run the * code with. * * CSCBVAL (global output) INTEGER array * On entry, CSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( CSRC_ ) to run the * code with. * * IBVAL (global output) INTEGER array * On entry, IBVAL is an array of dimension LDVAL. On exit, this * array contains the values of IB to run the code with. * * JBVAL (global output) INTEGER array * On entry, JBVAL is an array of dimension LDVAL. On exit, this * array contains the values of JB to run the code with. * * MCVAL (global output) INTEGER array * On entry, MCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( M_ ) to run the code * with. * * NCVAL (global output) INTEGER array * On entry, NCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( N_ ) to run the code * with. * * IMBCVAL (global output) INTEGER array * On entry, IMBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( IMB_ ) to run the * code with. * * MBCVAL (global output) INTEGER array * On entry, MBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( MB_ ) to run the * code with. * * INBCVAL (global output) INTEGER array * On entry, INBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( INB_ ) to run the * code with. * * NBCVAL (global output) INTEGER array * On entry, NBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( NB_ ) to run the * code with. * * RSCCVAL (global output) INTEGER array * On entry, RSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( RSRC_ ) to run the * code with. * * CSCCVAL (global output) INTEGER array * On entry, CSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( CSRC_ ) to run the * code with. * * ICVAL (global output) INTEGER array * On entry, ICVAL is an array of dimension LDVAL. On exit, this * array contains the values of IC to run the code with. * * JCVAL (global output) INTEGER array * On entry, JCVAL is an array of dimension LDVAL. On exit, this * array contains the values of JC to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, SIDE, TRANSA, TRANSB, UPLO, * M, N, K, DESCA(:), IA, JA, DESCB(:), IB, JB, DESCC(:), IC, * JC. This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eleven. On * exit, if LTEST( i ) is .TRUE., the i-th Level 3 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) COMPLEX*16 * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) COMPLEX*16 * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+38*NMAT+NSUBS ) with NSUBS = 11. This array * is used to pack all output arrays in order to send info in * one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 11 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D, ZGEBR2D, ZGEBS2D * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PZBLAS3TIM.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( SIDEVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( KVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( ICVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JCVAL ( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( SIDEVAL( J ) ) WORK( I+2 ) = ICHAR( TRNAVAL( J ) ) WORK( I+3 ) = ICHAR( TRNBVAL( J ) ) WORK( I+4 ) = ICHAR( UPLOVAL( J ) ) I = I + 5 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, KVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, ICVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JCVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'Level 3 PBLAS timing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex double precision '// $ 'Level 3 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 ) NMAT WRITE( NOUT, FMT = 9986 ) NBLOG WRITE( NOUT, FMT = 9991 ) NGRIDS WRITE( NOUT, FMT = 9989 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9989 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9990 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9994 ) ALPHA WRITE( NOUT, FMT = 9993 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9988 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9987 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 38*NMAT + NSUBS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) SIDEVAL( J ) = CHAR( WORK( I+1 ) ) TRNAVAL( J ) = CHAR( WORK( I+2 ) ) TRNBVAL( J ) = CHAR( WORK( I+3 ) ) UPLOVAL( J ) = CHAR( WORK( I+4 ) ) I = I + 5 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, KVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, ICVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JCVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9993 FORMAT( 2X, 'Beta : (', G16.6, $ ',', G16.6, ')' ) 9992 FORMAT( 2X, 'Number of Tests : ', I6 ) 9991 FORMAT( 2X, 'Number of process grids : ', I6 ) 9990 FORMAT( 2X, ' : ', 5I6 ) 9989 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9987 FORMAT( 2X, ' ', A, A8 ) 9986 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PZBLA3TIMINFO * END scalapack-2.0.2/PBLAS/TIMING/pzblastim.f000644 000766 000024 00000263526 10363532303 017731 0ustar00juliestaff000000 000000 SUBROUTINE PZLASCAL( TYPE, M, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TYPE INTEGER IA, JA, M, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLASCAL scales the m by n submatrix A(IA:IA+M-1,JA:JA+N-1) denoted * by sub( A ) by the scalar alpha. TYPE specifies if sub( A ) is full, * upper triangular, lower triangular or upper Hessenberg. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * On entry, TYPE specifies the type of the input submatrix as * follows: * = 'L' or 'l': sub( A ) is a lower triangular matrix, * = 'U' or 'u': sub( A ) is an upper triangular matrix, * = 'H' or 'h': sub( A ) is an upper Hessenberg matrix, * otherwise sub( A ) is a full matrix. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( A ) are overwritten by * the local entries of the m by n scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 UPLO LOGICAL GODOWN, GOLEFT, LOWER, UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, ITYPE, $ IUPP, JJA, JJMAX, JOFFA, JOFFD, LCMT, LCMT00, $ LDA, LMBLOC, LNBLOC, LOW, M1, MB, MBLKD, MBLKS, $ MBLOC, MP, MRCOL, MRROW, MYCOL, MYROW, N1, NB, $ NBLKD, NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, $ QNB, TMP1, UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS, PB_INFOG2L, PB_ZLASCAL * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_NUMROC EXTERNAL LSAME, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 UPLO = TYPE UPPER = .FALSE. LOWER = .TRUE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 UPLO = TYPE UPPER = .TRUE. LOWER = .FALSE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 UPLO = 'U' UPPER = .TRUE. LOWER = .FALSE. IOFFD = 1 ELSE ITYPE = 0 UPLO = 'A' UPPER = .TRUE. LOWER = .TRUE. IOFFD = 0 END IF * * Compute local indexes * IF( ITYPE.EQ.0 ) THEN * * Full matrix * CALL PB_INFOG2L( IA, JA, DESCA2, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = PB_NUMROC( M, IA, DESCA2( IMB_ ), DESCA2( MB_ ), MYROW, $ DESCA2( RSRC_ ), NPROW ) NQ = PB_NUMROC( N, JA, DESCA2( INB_ ), DESCA2( NB_ ), MYCOL, $ DESCA2( CSRC_ ), NPCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * LDA = DESCA2( LLD_ ) IOFFA = IIA + ( JJA - 1 ) * LDA * CALL PB_ZLASCAL( 'All', MP, NQ, 0, ALPHA, A( IOFFA ), LDA ) * ELSE * * Trapezoidal matrix * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, * LNBLOC, ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) LDA = DESCA2( LLD_ ) * CALL PB_BINFO( IOFFD, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * M1 = MP N1 = NQ IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and * update LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_ZLASCAL( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_ZLASCAL( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_ZLASCAL( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_ZLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_ZLASCAL( UPLO, MBLOC, INBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_ZLASCAL( 'All', TMP1, INBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_ZLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_ZLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_ZLASCAL( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_ZLASCAL( 'All', IMBLOC, TMP1, 0, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_ZLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_ZLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_ZLASCAL( UPLO, MBLOC, NBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_ZLASCAL( 'All', TMP1, NBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_ZLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * END IF * RETURN * * End of PZLASCAL * END SUBROUTINE PZLAGEN( INPLACE, AFORM, DIAG, OFFA, M, N, IA, JA, $ DESCA, IASEED, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE CHARACTER*1 AFORM, DIAG INTEGER IA, IASEED, JA, LDA, M, N, OFFA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PZLAGEN generates (or regenerates) a submatrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the generated submatrix is diago- * nally dominant or not as follows: * DIAG = 'D' : sub( A ) is diagonally dominant, * DIAG = 'N' : sub( A ) is not diagonally dominant. * * OFFA (global input) INTEGER * On entry, OFFA specifies the offdiagonal of the underlying * matrix A(1:DESCA(M_),1:DESCA(N_)) of interest when the subma- * trix is symmetric, Hermitian or diagonally dominant. OFFA = 0 * specifies the main diagonal, OFFA > 0 specifies a subdiago- * nal, and OFFA < 0 specifies a superdiagonal (see further de- * tails). * * M (global input) INTEGER * On entry, M specifies the global number of matrix rows of the * submatrix sub( A ) to be generated. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of matrix columns of * the submatrix sub( A ) to be generated. N must be at least * zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IASEED (global input) INTEGER * On entry, IASEED specifies the seed number to generate the * matrix A. IASEED must be at least zero. * * A (local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). On exit, this array contains the * local entries of the randomly generated submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. When INPLACE is .FALSE., LDA is usually DESCA(LLD_). * This restriction is however not enforced, and this subroutine * requires only that LDA >= MAX( 1, Mp ) where * * Mp = PB_NUMROC( M, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ). * * PB_NUMROC is a ScaLAPACK tool function; MYROW, MYCOL, NPROW * and NPCOL can be determined by calling the BLACS subroutine * BLACS_GRIDINFO. * * Further Details * =============== * * OFFD is tied to the matrix described by DESCA, as opposed to the * piece that is currently (re)generated. This is a global information * independent from the distribution parameters. Below are examples of * the meaning of OFFD for a global 7 by 5 matrix: * * --------------------------------------------------------------------- * OFFD | 0 -1 -2 -3 -4 0 -1 -2 -3 -4 0 -1 -2 -3 -4 * -------|------------------------------------------------------------- * | | OFFD=-1 | OFFD=0 OFFD=2 * | V V * 0 | . d . . . -> d . . . . . . . . . * 1 | . . d . . . d . . . . . . . . * 2 | . . . d . . . d . . -> d . . . . * 3 | . . . . d . . . d . . d . . . * 4 | . . . . . . . . . d . . d . . * 5 | . . . . . . . . . . . . . d . * 6 | . . . . . . . . . . . . . . d * --------------------------------------------------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL DIAGDO, SYMM, HERM, NOTRAN INTEGER CSRC, I, IACOL, IAROW, ICTXT, IIA, ILOCBLK, $ ILOCOFF, ILOW, IMB, IMB1, IMBLOC, IMBVIR, INB, $ INB1, INBLOC, INBVIR, INFO, IOFFDA, ITMP, IUPP, $ IVIR, JJA, JLOCBLK, JLOCOFF, JVIR, LCMT00, $ LMBLOC, LNBLOC, LOW, MAXMN, MB, MBLKS, MP, $ MRCOL, MRROW, MYCDIST, MYCOL, MYRDIST, MYROW, $ NB, NBLKS, NPCOL, NPROW, NQ, NVIR, RSRC, UPP COMPLEX*16 ALPHA * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ), IMULADD( 4, JMP_LEN ), $ IRAN( 2 ), JMP( JMP_LEN ), MULADD0( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CHKMAT, PB_DESCTRANS, PB_INITJMP, $ PB_INITMULADD, PB_JUMP, PB_JUMPIT, PB_LOCINFO, $ PB_SETLOCRAN, PB_SETRAN, PB_ZLAGEN, PXERBLA, $ PZLADOM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN * .. * .. Data Statements .. DATA ( MULADD0( I ), I = 1, 4 ) / 20077, 16838, $ 12345, 0 / * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Test the input arguments * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1000 + CTXT_ ) ELSE SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) NOTRAN = LSAME( AFORM, 'N' ) DIAGDO = LSAME( DIAG, 'D' ) IF( .NOT.( SYMM.OR.HERM.OR.NOTRAN ) .AND. $ .NOT.( LSAME( AFORM, 'T' ) ) .AND. $ .NOT.( LSAME( AFORM, 'C' ) ) ) THEN INFO = -2 ELSE IF( ( .NOT.DIAGDO ) .AND. $ ( .NOT.LSAME( DIAG, 'N' ) ) ) THEN INFO = -3 END IF CALL PB_CHKMAT( ICTXT, M, 5, N, 6, IA, JA, DESCA2, 10, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZLAGEN', -INFO ) RETURN END IF * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) IMB = DESCA2( IMB_ ) INB = DESCA2( INB_ ) RSRC = DESCA2( RSRC_ ) CSRC = DESCA2( CSRC_ ) * * Figure out local information about the distributed matrix operand * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * IOFFDA = JA + OFFA - IA CALL PB_BINFO( IOFFDA, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * * Initialize ILOCBLK, ILOCOFF, MYRDIST, JLOCBLK, JLOCOFF, MYCDIST * This values correspond to the square virtual underlying matrix * of size MAX( M_ + MAX( 0, -OFFA ), N_ + MAX( 0, OFFA ) ) used * to set up the random sequence. For practical purposes, the size * of this virtual matrix is upper bounded by M_ + N_ - 1. * ITMP = MAX( 0, -OFFA ) IVIR = IA + ITMP IMBVIR = IMB + ITMP NVIR = DESCA2( M_ ) + ITMP * CALL PB_LOCINFO( IVIR, IMBVIR, MB, MYROW, RSRC, NPROW, ILOCBLK, $ ILOCOFF, MYRDIST ) * ITMP = MAX( 0, OFFA ) JVIR = JA + ITMP INBVIR = INB + ITMP NVIR = MAX( MAX( NVIR, DESCA2( N_ ) + ITMP ), $ DESCA2( M_ ) + DESCA2( N_ ) - 1 ) * CALL PB_LOCINFO( JVIR, INBVIR, NB, MYCOL, CSRC, NPCOL, JLOCBLK, $ JLOCOFF, MYCDIST ) * IF( SYMM .OR. HERM .OR. NOTRAN ) THEN * CALL PB_INITJMP( .TRUE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 2, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_ZLAGEN( 'Lower', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( SYMM .OR. HERM .OR. ( .NOT. NOTRAN ) ) THEN * CALL PB_INITJMP( .FALSE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 2, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_ZLAGEN( 'Upper', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( DIAGDO ) THEN * MAXMN = MAX( DESCA2( M_ ), DESCA2( N_ ) ) IF( HERM ) THEN ALPHA = DCMPLX( DBLE( 2 * MAXMN ), ZERO ) ELSE ALPHA = DCMPLX( DBLE( NVIR ), DBLE( MAXMN ) ) END IF * IF( IOFFDA.GE.0 ) THEN CALL PZLADOM( INPLACE, MIN( MAX( 0, M-IOFFDA ), N ), ALPHA, $ A, MIN( IA+IOFFDA, IA+M-1 ), JA, DESCA ) ELSE CALL PZLADOM( INPLACE, MIN( M, MAX( 0, N+IOFFDA ) ), ALPHA, $ A, IA, MIN( JA-IOFFDA, JA+N-1 ), DESCA ) END IF * END IF * RETURN * * End of PZLAGEN * END SUBROUTINE PZLADOM( INPLACE, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE INTEGER IA, JA, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLADOM adds alpha to the diagonal entries of an n by n submatrix * sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * N (global input) INTEGER * On entry, N specifies the global order of the submatrix * sub( A ) to be modified. N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the local entries * of this array corresponding to the main diagonal of sub( A ) * have been updated. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP COMPLEX*16 ATMP * .. * .. Local Scalars .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DIMAG, MAX, MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.EQ.0 ) $ RETURN * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) * CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA+LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PZLADOM * END SUBROUTINE PB_ZLASCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PB_ZLASCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (input/output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA * A( I, J ) 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = ALPHA * A( I, J ) 50 CONTINUE 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA * A( I, J ) 100 CONTINUE 110 CONTINUE * END IF * RETURN * * End of PB_ZLASCAL * END SUBROUTINE PB_ZLAGEN( UPLO, AFORM, A, LDA, LCMT00, IRAN, MBLKS, $ IMBLOC, MB, LMBLOC, NBLKS, INBLOC, NB, $ LNBLOC, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO, AFORM INTEGER IMBLOC, INBLOC, LCMT00, LDA, LMBLOC, LNBLOC, $ MB, MBLKS, NB, NBLKS * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PB_ZLAGEN locally initializes an array A. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the lower (UPLO='L') trape- * zoidal part or the upper (UPLO='U') trapezoidal part is to be * generated when the matrix to be generated is symmetric or * Hermitian. For all the other values of AFORM, the value of * this input argument is ignored. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * A (local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, *). On exit, * this array contains the local entries of the randomly genera- * ted submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. LDA must be at least one. * * LCMT00 (global input) INTEGER * On entry, LCMT00 is the LCM value specifying the off-diagonal * of the underlying matrix of interest. LCMT00=0 specifies the * main diagonal, LCMT00 > 0 specifies a subdiagonal, LCMT00 < 0 * specifies superdiagonals. * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2 containing respec- * tively the 16-lower and 16-higher bits of the encoding of the * entry of the random sequence corresponding locally to the * first local array entry to generate. Usually, this array is * computed by PB_SETLOCRAN. * * MBLKS (local input) INTEGER * On entry, MBLKS specifies the local number of blocks of rows. * MBLKS is at least zero. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * LMBLOC (local input) INTEGER * On entry, LMBLOC specifies the number of rows (size) of the * local lowest blocks. LMBLOC is at least zero. * * NBLKS (local input) INTEGER * On entry, NBLKS specifies the local number of blocks of co- * lumns. NBLKS is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * LNBLOC (local input) INTEGER * On entry, LNBLOC specifies the number of columns (size) of * the local rightmost blocks. LNBLOC is at least zero. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the random matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IB, IBLK, II, IK, ITMP, JB, JBLK, JJ, JK, $ JTMP, LCMTC, LCMTR, LOW, MNB, UPP COMPLEX*16 DUMMY * .. * .. Local Arrays .. INTEGER IB0( 2 ), IB1( 2 ), IB2( 2 ), IB3( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMPIT * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PB_DRAND EXTERNAL LSAME, PB_DRAND * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN * .. * .. Executable Statements .. * DO 10 I = 1, 2 IB1( I ) = IRAN( I ) IB2( I ) = IRAN( I ) IB3( I ) = IRAN( I ) 10 CONTINUE * IF( LSAME( AFORM, 'N' ) ) THEN * * Generate random matrix * JJ = 1 * DO 50 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * DO 40 JK = JJ, JJ + JB - 1 * II = 1 * DO 30 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * * Blocks are IB by JB * DO 20 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 20 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 30 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 40 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 50 CONTINUE * ELSE IF( LSAME( AFORM, 'T' ) ) THEN * * Generate the transpose of the matrix that would be normally * generated. * II = 1 * DO 90 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 80 IK = II, II + IB - 1 * JJ = 1 * DO 70 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 60 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 60 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 70 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 80 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 90 CONTINUE * ELSE IF( LSAME( AFORM, 'S' ) ) THEN * * Generate a symmetric matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 170 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 160 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 150 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 100 IK = II, II + IB - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 100 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 110 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 110 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 120 IK = II, ITMP - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 120 CONTINUE * DO 130 IK = ITMP, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 130 CONTINUE * END IF * ELSE * DO 140 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 140 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 150 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 160 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 170 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 250 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 240 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 230 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 180 JK = JJ, JJ + JB - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 180 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 190 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 190 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 200 JK = JJ, JTMP - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 200 CONTINUE * DO 210 JK = JTMP, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 210 CONTINUE * END IF * ELSE * DO 220 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 220 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 230 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 240 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 250 CONTINUE * END IF * ELSE IF( LSAME( AFORM, 'C' ) ) THEN * * Generate the conjugate transpose of the matrix that would be * normally generated. * II = 1 * DO 290 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 280 IK = II, II + IB - 1 * JJ = 1 * DO 270 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 260 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 260 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 270 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 280 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 290 CONTINUE * ELSE IF( LSAME( AFORM, 'H' ) ) THEN * * Generate a Hermitian matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 370 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 360 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 350 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 300 IK = II, II + IB - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 300 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 310 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 310 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 320 IK = II, ITMP - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 320 CONTINUE * IF( ITMP.LE.( II + IB - 1 ) ) THEN DUMMY = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) A( ITMP, JK ) = DCMPLX( DBLE( DUMMY ), $ ZERO ) END IF * DO 330 IK = ITMP + 1, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 330 CONTINUE * END IF * ELSE * DO 340 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 340 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 350 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 360 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 370 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 450 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 440 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 430 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 380 JK = JJ, JJ + JB - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 380 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 390 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 390 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 400 JK = JJ, JTMP - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 400 CONTINUE * IF( JTMP.LE.( JJ + JB - 1 ) ) THEN DUMMY = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) A( IK, JTMP ) = DCMPLX( DBLE( DUMMY ), $ ZERO ) END IF * DO 410 JK = JTMP + 1, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 410 CONTINUE * END IF * ELSE * DO 420 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 420 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 430 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 440 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 450 CONTINUE * END IF * END IF * RETURN * * End of PB_ZLAGEN * END DOUBLE PRECISION FUNCTION PB_DRAND( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_DRAND generates the next number in the random sequence. This func- * tion ensures that this number will be in the interval ( -1.0, 1.0 ). * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0 ) * .. * .. External Functions .. DOUBLE PRECISION PB_DRAN EXTERNAL PB_DRAN * .. * .. Executable Statements .. * PB_DRAND = ONE - TWO * PB_DRAN( IDUMM ) * RETURN * * End of PB_DRAND * END DOUBLE PRECISION FUNCTION PB_DRAN( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_DRAN generates the next number in the random sequence. * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION DIVFAC, POW16 PARAMETER ( DIVFAC = 2.147483648D+9, $ POW16 = 6.5536D+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * PB_DRAN = ( DBLE( IRAND( 1 ) ) + POW16 * DBLE( IRAND( 2 ) ) ) / $ DIVFAC * CALL PB_LMUL( IRAND, IACS, J ) CALL PB_LADD( J, IACS( 3 ), IRAND ) * RETURN * * End of PB_DRAN * END scalapack-2.0.2/PBLAS/TESTING/CMakeLists.txt000644 000766 000024 00000007314 11656312637 020446 0ustar00juliestaff000000 000000 file(COPY ../SRC/PTOOLS/PB_Cwarn.c DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}) file(COPY ../SRC/PTOOLS/PB_Cabort.c DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}) set (PblasErrorHandler PB_Cwarn.c PB_Cabort.c) set (pbtcom pblastst.f ${PblasErrorHandler}) set (spbtcom psblastst.f slamch.f ${pbtcom}) set (dpbtcom pdblastst.f dlamch.f ${pbtcom}) set (cpbtcom pcblastst.f slamch.f ${pbtcom}) set (zpbtcom pzblastst.f dlamch.f ${pbtcom}) set_property( SOURCE ${PblasErrorHandler} APPEND PROPERTY COMPILE_DEFINITIONS TestingPblas ) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/PBLAS/TESTING) file(COPY PCBLAS1TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PCBLAS2TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PCBLAS3TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PDBLAS1TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PDBLAS2TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PDBLAS3TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PSBLAS1TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PSBLAS2TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PSBLAS3TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PZBLAS1TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PZBLAS2TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) file(COPY PZBLAS3TST.dat DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) add_executable(spb1tst psblas1tst.f ${spbtcom}) add_executable(dpb1tst pdblas1tst.f ${dpbtcom}) add_executable(cpb1tst pcblas1tst.f ${cpbtcom}) add_executable(zpb1tst pzblas1tst.f ${zpbtcom}) add_executable(spb2tst psblas2tst.f ${spbtcom}) add_executable(dpb2tst pdblas2tst.f ${dpbtcom}) add_executable(cpb2tst pcblas2tst.f ${cpbtcom}) add_executable(zpb2tst pzblas2tst.f ${zpbtcom}) add_executable(spb3tst psblas3tst.f ${spbtcom}) add_executable(dpb3tst pdblas3tst.f ${dpbtcom}) add_executable(cpb3tst pcblas3tst.f ${cpbtcom}) add_executable(zpb3tst pzblas3tst.f ${zpbtcom}) target_link_libraries(spb1tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(dpb1tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(cpb1tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(zpb1tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(spb2tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(dpb2tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(cpb2tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(zpb2tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(spb3tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(dpb3tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(cpb3tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) target_link_libraries(zpb3tst scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) add_test(spb1tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./spb1tst) add_test(dpb1tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./dpb1tst) add_test(cpb1tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./cpb1tst) add_test(zpb1tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./zpb1tst) add_test(spb2tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./spb2tst) add_test(dpb2tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./dpb2tst) add_test(cpb2tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./cpb2tst) add_test(zpb2tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./zpb2tst) add_test(spb3tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./spb3tst) add_test(dpb3tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./dpb3tst) add_test(cpb3tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./cpb3tst) add_test(zpb3tst ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./zpb3tst) scalapack-2.0.2/PBLAS/TESTING/dlamch.f000644 000766 000024 00000060503 10363532303 017270 0ustar00juliestaff000000 000000 DOUBLE PRECISION FUNCTION DLAMCH( CMACH ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. CHARACTER CMACH * .. * * Purpose * ======= * * DLAMCH determines double precision machine parameters. * * Arguments * ========= * * CMACH (input) CHARACTER*1 * Specifies the value to be returned by DLAMCH: * = 'E' or 'e', DLAMCH := eps * = 'S' or 's , DLAMCH := sfmin * = 'B' or 'b', DLAMCH := base * = 'P' or 'p', DLAMCH := eps*base * = 'N' or 'n', DLAMCH := t * = 'R' or 'r', DLAMCH := rnd * = 'M' or 'm', DLAMCH := emin * = 'U' or 'u', DLAMCH := rmin * = 'L' or 'l', DLAMCH := emax * = 'O' or 'o', DLAMCH := rmax * * where * * eps = relative machine precision * sfmin = safe minimum, such that 1/sfmin does not overflow * base = base of the machine * prec = eps*base * t = number of (base) digits in the mantissa * rnd = 1.0 when rounding occurs in addition, 0.0 otherwise * emin = minimum exponent before (gradual) underflow * rmin = underflow threshold - base**(emin-1) * emax = largest exponent before overflow * rmax = overflow threshold - (base**emax)*(1-eps) * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL FIRST, LRND INTEGER BETA, IMAX, IMIN, IT DOUBLE PRECISION BASE, EMAX, EMIN, EPS, PREC, RMACH, RMAX, RMIN, $ RND, SFMIN, SMALL, T * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DLAMC2 * .. * .. Save statement .. SAVE FIRST, EPS, SFMIN, BASE, T, RND, EMIN, RMIN, $ EMAX, RMAX, PREC * .. * .. Data statements .. DATA FIRST / .TRUE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. CALL DLAMC2( BETA, IT, LRND, EPS, IMIN, RMIN, IMAX, RMAX ) BASE = BETA T = IT IF( LRND ) THEN RND = ONE EPS = ( BASE**( 1-IT ) ) / 2 ELSE RND = ZERO EPS = BASE**( 1-IT ) END IF PREC = EPS*BASE EMIN = IMIN EMAX = IMAX SFMIN = RMIN SMALL = ONE / RMAX IF( SMALL.GE.SFMIN ) THEN * * Use SMALL plus a bit, to avoid the possibility of rounding * causing overflow when computing 1/sfmin. * SFMIN = SMALL*( ONE+EPS ) END IF END IF * IF( LSAME( CMACH, 'E' ) ) THEN RMACH = EPS ELSE IF( LSAME( CMACH, 'S' ) ) THEN RMACH = SFMIN ELSE IF( LSAME( CMACH, 'B' ) ) THEN RMACH = BASE ELSE IF( LSAME( CMACH, 'P' ) ) THEN RMACH = PREC ELSE IF( LSAME( CMACH, 'N' ) ) THEN RMACH = T ELSE IF( LSAME( CMACH, 'R' ) ) THEN RMACH = RND ELSE IF( LSAME( CMACH, 'M' ) ) THEN RMACH = EMIN ELSE IF( LSAME( CMACH, 'U' ) ) THEN RMACH = RMIN ELSE IF( LSAME( CMACH, 'L' ) ) THEN RMACH = EMAX ELSE IF( LSAME( CMACH, 'O' ) ) THEN RMACH = RMAX END IF * DLAMCH = RMACH RETURN * * End of DLAMCH * END * ************************************************************************ * SUBROUTINE DLAMC1( BETA, T, RND, IEEE1 ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL IEEE1, RND INTEGER BETA, T * .. * * Purpose * ======= * * DLAMC1 determines the machine parameters given by BETA, T, RND, and * IEEE1. * * Arguments * ========= * * BETA (output) INTEGER * The base of the machine. * * T (output) INTEGER * The number of ( BETA ) digits in the mantissa. * * RND (output) LOGICAL * Specifies whether proper rounding ( RND = .TRUE. ) or * chopping ( RND = .FALSE. ) occurs in addition. This may not * be a reliable guide to the way in which the machine performs * its arithmetic. * * IEEE1 (output) LOGICAL * Specifies whether rounding appears to be done in the IEEE * 'round to nearest' style. * * Further Details * =============== * * The routine is based on the routine ENVRON by Malcolm and * incorporates suggestions by Gentleman and Marovich. See * * Malcolm M. A. (1972) Algorithms to reveal properties of * floating-point arithmetic. Comms. of the ACM, 15, 949-951. * * Gentleman W. M. and Marovich S. B. (1974) More on algorithms * that reveal properties of floating point arithmetic units. * Comms. of the ACM, 17, 276-277. * * ===================================================================== * * .. Local Scalars .. LOGICAL FIRST, LIEEE1, LRND INTEGER LBETA, LT DOUBLE PRECISION A, B, C, F, ONE, QTR, SAVEC, T1, T2 * .. * .. External Functions .. DOUBLE PRECISION DLAMC3 EXTERNAL DLAMC3 * .. * .. Save statement .. SAVE FIRST, LIEEE1, LBETA, LRND, LT * .. * .. Data statements .. DATA FIRST / .TRUE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. ONE = 1 * * LBETA, LIEEE1, LT and LRND are the local values of BETA, * IEEE1, T and RND. * * Throughout this routine we use the function DLAMC3 to ensure * that relevant values are stored and not held in registers, or * are not affected by optimizers. * * Compute a = 2.0**m with the smallest positive integer m such * that * * fl( a + 1.0 ) = a. * A = 1 C = 1 * *+ WHILE( C.EQ.ONE )LOOP 10 CONTINUE IF( C.EQ.ONE ) THEN A = 2*A C = DLAMC3( A, ONE ) C = DLAMC3( C, -A ) GO TO 10 END IF *+ END WHILE * * Now compute b = 2.0**m with the smallest positive integer m * such that * * fl( a + b ) .gt. a. * B = 1 C = DLAMC3( A, B ) * *+ WHILE( C.EQ.A )LOOP 20 CONTINUE IF( C.EQ.A ) THEN B = 2*B C = DLAMC3( A, B ) GO TO 20 END IF *+ END WHILE * * Now compute the base. a and c are neighbouring floating point * numbers in the interval ( beta**t, beta**( t + 1 ) ) and so * their difference is beta. Adding 0.25 to c is to ensure that it * is truncated to beta and not ( beta - 1 ). * QTR = ONE / 4 SAVEC = C C = DLAMC3( C, -A ) LBETA = C + QTR * * Now determine whether rounding or chopping occurs, by adding a * bit less than beta/2 and a bit more than beta/2 to a. * B = LBETA F = DLAMC3( B / 2, -B / 100 ) C = DLAMC3( F, A ) IF( C.EQ.A ) THEN LRND = .TRUE. ELSE LRND = .FALSE. END IF F = DLAMC3( B / 2, B / 100 ) C = DLAMC3( F, A ) IF( ( LRND ) .AND. ( C.EQ.A ) ) $ LRND = .FALSE. * * Try and decide whether rounding is done in the IEEE 'round to * nearest' style. B/2 is half a unit in the last place of the two * numbers A and SAVEC. Furthermore, A is even, i.e. has last bit * zero, and SAVEC is odd. Thus adding B/2 to A should not change * A, but adding B/2 to SAVEC should change SAVEC. * T1 = DLAMC3( B / 2, A ) T2 = DLAMC3( B / 2, SAVEC ) LIEEE1 = ( T1.EQ.A ) .AND. ( T2.GT.SAVEC ) .AND. LRND * * Now find the mantissa, t. It should be the integer part of * log to the base beta of a, however it is safer to determine t * by powering. So we find t as the smallest positive integer for * which * * fl( beta**t + 1.0 ) = 1.0. * LT = 0 A = 1 C = 1 * *+ WHILE( C.EQ.ONE )LOOP 30 CONTINUE IF( C.EQ.ONE ) THEN LT = LT + 1 A = A*LBETA C = DLAMC3( A, ONE ) C = DLAMC3( C, -A ) GO TO 30 END IF *+ END WHILE * END IF * BETA = LBETA T = LT RND = LRND IEEE1 = LIEEE1 RETURN * * End of DLAMC1 * END * ************************************************************************ * SUBROUTINE DLAMC2( BETA, T, RND, EPS, EMIN, RMIN, EMAX, RMAX ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL RND INTEGER BETA, EMAX, EMIN, T DOUBLE PRECISION EPS, RMAX, RMIN * .. * * Purpose * ======= * * DLAMC2 determines the machine parameters specified in its argument * list. * * Arguments * ========= * * BETA (output) INTEGER * The base of the machine. * * T (output) INTEGER * The number of ( BETA ) digits in the mantissa. * * RND (output) LOGICAL * Specifies whether proper rounding ( RND = .TRUE. ) or * chopping ( RND = .FALSE. ) occurs in addition. This may not * be a reliable guide to the way in which the machine performs * its arithmetic. * * EPS (output) DOUBLE PRECISION * The smallest positive number such that * * fl( 1.0 - EPS ) .LT. 1.0, * * where fl denotes the computed value. * * EMIN (output) INTEGER * The minimum exponent before (gradual) underflow occurs. * * RMIN (output) DOUBLE PRECISION * The smallest normalized number for the machine, given by * BASE**( EMIN - 1 ), where BASE is the floating point value * of BETA. * * EMAX (output) INTEGER * The maximum exponent before overflow occurs. * * RMAX (output) DOUBLE PRECISION * The largest positive number for the machine, given by * BASE**EMAX * ( 1 - EPS ), where BASE is the floating point * value of BETA. * * Further Details * =============== * * The computation of EPS is based on a routine PARANOIA by * W. Kahan of the University of California at Berkeley. * * ===================================================================== * * .. Local Scalars .. LOGICAL FIRST, IEEE, IWARN, LIEEE1, LRND INTEGER GNMIN, GPMIN, I, LBETA, LEMAX, LEMIN, LT, $ NGNMIN, NGPMIN DOUBLE PRECISION A, B, C, HALF, LEPS, LRMAX, LRMIN, ONE, RBASE, $ SIXTH, SMALL, THIRD, TWO, ZERO * .. * .. External Functions .. DOUBLE PRECISION DLAMC3 EXTERNAL DLAMC3 * .. * .. External Subroutines .. EXTERNAL DLAMC1, DLAMC4, DLAMC5 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Save statement .. SAVE FIRST, IWARN, LBETA, LEMAX, LEMIN, LEPS, LRMAX, $ LRMIN, LT * .. * .. Data statements .. DATA FIRST / .TRUE. / , IWARN / .FALSE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. ZERO = 0 ONE = 1 TWO = 2 * * LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values of * BETA, T, RND, EPS, EMIN and RMIN. * * Throughout this routine we use the function DLAMC3 to ensure * that relevant values are stored and not held in registers, or * are not affected by optimizers. * * DLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. * CALL DLAMC1( LBETA, LT, LRND, LIEEE1 ) * * Start to find EPS. * B = LBETA A = B**( -LT ) LEPS = A * * Try some tricks to see whether or not this is the correct EPS. * B = TWO / 3 HALF = ONE / 2 SIXTH = DLAMC3( B, -HALF ) THIRD = DLAMC3( SIXTH, SIXTH ) B = DLAMC3( THIRD, -HALF ) B = DLAMC3( B, SIXTH ) B = ABS( B ) IF( B.LT.LEPS ) $ B = LEPS * LEPS = 1 * *+ WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP 10 CONTINUE IF( ( LEPS.GT.B ) .AND. ( B.GT.ZERO ) ) THEN LEPS = B C = DLAMC3( HALF*LEPS, ( TWO**5 )*( LEPS**2 ) ) C = DLAMC3( HALF, -C ) B = DLAMC3( HALF, C ) C = DLAMC3( HALF, -B ) B = DLAMC3( HALF, C ) GO TO 10 END IF *+ END WHILE * IF( A.LT.LEPS ) $ LEPS = A * * Computation of EPS complete. * * Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3)). * Keep dividing A by BETA until (gradual) underflow occurs. This * is detected when we cannot recover the previous A. * RBASE = ONE / LBETA SMALL = ONE DO 20 I = 1, 3 SMALL = DLAMC3( SMALL*RBASE, ZERO ) 20 CONTINUE A = DLAMC3( ONE, SMALL ) CALL DLAMC4( NGPMIN, ONE, LBETA ) CALL DLAMC4( NGNMIN, -ONE, LBETA ) CALL DLAMC4( GPMIN, A, LBETA ) CALL DLAMC4( GNMIN, -A, LBETA ) IEEE = .FALSE. * IF( ( NGPMIN.EQ.NGNMIN ) .AND. ( GPMIN.EQ.GNMIN ) ) THEN IF( NGPMIN.EQ.GPMIN ) THEN LEMIN = NGPMIN * ( Non twos-complement machines, no gradual underflow; * e.g., VAX ) ELSE IF( ( GPMIN-NGPMIN ).EQ.3 ) THEN LEMIN = NGPMIN - 1 + LT IEEE = .TRUE. * ( Non twos-complement machines, with gradual underflow; * e.g., IEEE standard followers ) ELSE LEMIN = MIN( NGPMIN, GPMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE IF( ( NGPMIN.EQ.GPMIN ) .AND. ( NGNMIN.EQ.GNMIN ) ) THEN IF( ABS( NGPMIN-NGNMIN ).EQ.1 ) THEN LEMIN = MAX( NGPMIN, NGNMIN ) * ( Twos-complement machines, no gradual underflow; * e.g., CYBER 205 ) ELSE LEMIN = MIN( NGPMIN, NGNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE IF( ( ABS( NGPMIN-NGNMIN ).EQ.1 ) .AND. $ ( GPMIN.EQ.GNMIN ) ) THEN IF( ( GPMIN-MIN( NGPMIN, NGNMIN ) ).EQ.3 ) THEN LEMIN = MAX( NGPMIN, NGNMIN ) - 1 + LT * ( Twos-complement machines with gradual underflow; * no known machine ) ELSE LEMIN = MIN( NGPMIN, NGNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE LEMIN = MIN( NGPMIN, NGNMIN, GPMIN, GNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF *** * Comment out this if block if EMIN is ok IF( IWARN ) THEN FIRST = .TRUE. WRITE( 6, FMT = 9999 )LEMIN END IF *** * * Assume IEEE arithmetic if we found denormalised numbers above, * or if arithmetic seems to round in the IEEE style, determined * in routine DLAMC1. A true IEEE machine should have both things * true; however, faulty machines may have one or the other. * IEEE = IEEE .OR. LIEEE1 * * Compute RMIN by successive division by BETA. We could compute * RMIN as BASE**( EMIN - 1 ), but some machines underflow during * this computation. * LRMIN = 1 DO 30 I = 1, 1 - LEMIN LRMIN = DLAMC3( LRMIN*RBASE, ZERO ) 30 CONTINUE * * Finally, call DLAMC5 to compute EMAX and RMAX. * CALL DLAMC5( LBETA, LT, LEMIN, IEEE, LEMAX, LRMAX ) END IF * BETA = LBETA T = LT RND = LRND EPS = LEPS EMIN = LEMIN RMIN = LRMIN EMAX = LEMAX RMAX = LRMAX * RETURN * 9999 FORMAT( / / ' WARNING. The value EMIN may be incorrect:-', $ ' EMIN = ', I8, / $ ' If, after inspection, the value EMIN looks', $ ' acceptable please comment out ', $ / ' the IF block as marked within the code of routine', $ ' DLAMC2,', / ' otherwise supply EMIN explicitly.', / ) * * End of DLAMC2 * END * ************************************************************************ * DOUBLE PRECISION FUNCTION DLAMC3( A, B ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. DOUBLE PRECISION A, B * .. * * Purpose * ======= * * DLAMC3 is intended to force A and B to be stored prior to doing * the addition of A and B , for use in situations where optimizers * might hold one of these in a register. * * Arguments * ========= * * A, B (input) DOUBLE PRECISION * The values A and B. * * ===================================================================== * * .. Executable Statements .. * DLAMC3 = A + B * RETURN * * End of DLAMC3 * END * ************************************************************************ * SUBROUTINE DLAMC4( EMIN, START, BASE ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. INTEGER BASE, EMIN DOUBLE PRECISION START * .. * * Purpose * ======= * * DLAMC4 is a service routine for DLAMC2. * * Arguments * ========= * * EMIN (output) EMIN * The minimum exponent before (gradual) underflow, computed by * setting A = START and dividing by BASE until the previous A * can not be recovered. * * START (input) DOUBLE PRECISION * The starting point for determining EMIN. * * BASE (input) INTEGER * The base of the machine. * * ===================================================================== * * .. Local Scalars .. INTEGER I DOUBLE PRECISION A, B1, B2, C1, C2, D1, D2, ONE, RBASE, ZERO * .. * .. External Functions .. DOUBLE PRECISION DLAMC3 EXTERNAL DLAMC3 * .. * .. Executable Statements .. * A = START ONE = 1 RBASE = ONE / BASE ZERO = 0 EMIN = 1 B1 = DLAMC3( A*RBASE, ZERO ) C1 = A C2 = A D1 = A D2 = A *+ WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. * $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP 10 CONTINUE IF( ( C1.EQ.A ) .AND. ( C2.EQ.A ) .AND. ( D1.EQ.A ) .AND. $ ( D2.EQ.A ) ) THEN EMIN = EMIN - 1 A = B1 B1 = DLAMC3( A / BASE, ZERO ) C1 = DLAMC3( B1*BASE, ZERO ) D1 = ZERO DO 20 I = 1, BASE D1 = D1 + B1 20 CONTINUE B2 = DLAMC3( A*RBASE, ZERO ) C2 = DLAMC3( B2 / RBASE, ZERO ) D2 = ZERO DO 30 I = 1, BASE D2 = D2 + B2 30 CONTINUE GO TO 10 END IF *+ END WHILE * RETURN * * End of DLAMC4 * END * ************************************************************************ * SUBROUTINE DLAMC5( BETA, P, EMIN, IEEE, EMAX, RMAX ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL IEEE INTEGER BETA, EMAX, EMIN, P DOUBLE PRECISION RMAX * .. * * Purpose * ======= * * DLAMC5 attempts to compute RMAX, the largest machine floating-point * number, without overflow. It assumes that EMAX + abs(EMIN) sum * approximately to a power of 2. It will fail on machines where this * assumption does not hold, for example, the Cyber 205 (EMIN = -28625, * EMAX = 28718). It will also fail if the value supplied for EMIN is * too large (i.e. too close to zero), probably with overflow. * * Arguments * ========= * * BETA (input) INTEGER * The base of floating-point arithmetic. * * P (input) INTEGER * The number of base BETA digits in the mantissa of a * floating-point value. * * EMIN (input) INTEGER * The minimum exponent before (gradual) underflow. * * IEEE (input) LOGICAL * A logical flag specifying whether or not the arithmetic * system is thought to comply with the IEEE standard. * * EMAX (output) INTEGER * The largest exponent before overflow * * RMAX (output) DOUBLE PRECISION * The largest machine floating-point number. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. INTEGER EXBITS, EXPSUM, I, LEXP, NBITS, TRY, UEXP DOUBLE PRECISION OLDY, RECBAS, Y, Z * .. * .. External Functions .. DOUBLE PRECISION DLAMC3 EXTERNAL DLAMC3 * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * First compute LEXP and UEXP, two powers of 2 that bound * abs(EMIN). We then assume that EMAX + abs(EMIN) will sum * approximately to the bound that is closest to abs(EMIN). * (EMAX is the exponent of the required number RMAX). * LEXP = 1 EXBITS = 1 10 CONTINUE TRY = LEXP*2 IF( TRY.LE.( -EMIN ) ) THEN LEXP = TRY EXBITS = EXBITS + 1 GO TO 10 END IF IF( LEXP.EQ.-EMIN ) THEN UEXP = LEXP ELSE UEXP = TRY EXBITS = EXBITS + 1 END IF * * Now -LEXP is less than or equal to EMIN, and -UEXP is greater * than or equal to EMIN. EXBITS is the number of bits needed to * store the exponent. * IF( ( UEXP+EMIN ).GT.( -LEXP-EMIN ) ) THEN EXPSUM = 2*LEXP ELSE EXPSUM = 2*UEXP END IF * * EXPSUM is the exponent range, approximately equal to * EMAX - EMIN + 1 . * EMAX = EXPSUM + EMIN - 1 NBITS = 1 + EXBITS + P * * NBITS is the total number of bits needed to store a * floating-point number. * IF( ( MOD( NBITS, 2 ).EQ.1 ) .AND. ( BETA.EQ.2 ) ) THEN * * Either there are an odd number of bits used to store a * floating-point number, which is unlikely, or some bits are * not used in the representation of numbers, which is possible, * (e.g. Cray machines) or the mantissa has an implicit bit, * (e.g. IEEE machines, Dec Vax machines), which is perhaps the * most likely. We have to assume the last alternative. * If this is true, then we need to reduce EMAX by one because * there must be some way of representing zero in an implicit-bit * system. On machines like Cray, we are reducing EMAX by one * unnecessarily. * EMAX = EMAX - 1 END IF * IF( IEEE ) THEN * * Assume we are on an IEEE machine which reserves one exponent * for infinity and NaN. * EMAX = EMAX - 1 END IF * * Now create RMAX, the largest machine number, which should * be equal to (1.0 - BETA**(-P)) * BETA**EMAX . * * First compute 1.0 - BETA**(-P), being careful that the * result is less than 1.0 . * RECBAS = ONE / BETA Z = BETA - ONE Y = ZERO DO 20 I = 1, P Z = Z*RECBAS IF( Y.LT.ONE ) $ OLDY = Y Y = DLAMC3( Y, Z ) 20 CONTINUE IF( Y.GE.ONE ) $ Y = OLDY * * Now multiply by BETA**EMAX to get RMAX. * DO 30 I = 1, EMAX Y = DLAMC3( Y*BETA, ZERO ) 30 CONTINUE * RMAX = Y RETURN * * End of DLAMC5 * END scalapack-2.0.2/PBLAS/TESTING/Makefile000644 000766 000024 00000010036 11707553632 017340 0ustar00juliestaff000000 000000 ############################################################################ # # Program: PBLAS -- (version 2.0) # # Module: Makefile # # Purpose: Testing Makefile # # Creation date: April 1, 1998 # # Modified: February 15, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../SLmake.inc sPBLAS1exe = xspblas1tst dPBLAS1exe = xdpblas1tst cPBLAS1exe = xcpblas1tst zPBLAS1exe = xzpblas1tst sPBLAS2exe = xspblas2tst dPBLAS2exe = xdpblas2tst cPBLAS2exe = xcpblas2tst zPBLAS2exe = xzpblas2tst sPBLAS3exe = xspblas3tst dPBLAS3exe = xdpblas3tst cPBLAS3exe = xcpblas3tst zPBLAS3exe = xzpblas3tst spb1tst = $(sPBLAS1exe) dpb1tst = $(dPBLAS1exe) cpb1tst = $(cPBLAS1exe) zpb1tst = $(zPBLAS1exe) spb2tst = $(sPBLAS2exe) dpb2tst = $(dPBLAS2exe) cpb2tst = $(cPBLAS2exe) zpb2tst = $(zPBLAS2exe) spb3tst = $(sPBLAS3exe) dpb3tst = $(dPBLAS3exe) cpb3tst = $(cPBLAS3exe) zpb3tst = $(zPBLAS3exe) pbtcom = pblastst.o PB_Cwarn.o PB_Cabort.o spbtcom = psblastst.o slamch.o $(pbtcom) dpbtcom = pdblastst.o dlamch.o $(pbtcom) cpbtcom = pcblastst.o slamch.o $(pbtcom) zpbtcom = pzblastst.o dlamch.o $(pbtcom) spb1t = psblas1tst.o $(spbtcom) dpb1t = pdblas1tst.o $(dpbtcom) cpb1t = pcblas1tst.o $(cpbtcom) zpb1t = pzblas1tst.o $(zpbtcom) spb2t = psblas2tst.o $(spbtcom) dpb2t = pdblas2tst.o $(dpbtcom) cpb2t = pcblas2tst.o $(cpbtcom) zpb2t = pzblas2tst.o $(zpbtcom) spb3t = psblas3tst.o $(spbtcom) dpb3t = pdblas3tst.o $(dpbtcom) cpb3t = pcblas3tst.o $(cpbtcom) zpb3t = pzblas3tst.o $(zpbtcom) all : single double complex complex16 single: PblasErrorHandler $(spb1tst) $(spb2tst) $(spb3tst) double: PblasErrorHandler $(dpb1tst) $(dpb2tst) $(dpb3tst) complex: PblasErrorHandler $(cpb1tst) $(cpb2tst) $(cpb3tst) complex16: PblasErrorHandler $(zpb1tst) $(zpb2tst) $(zpb3tst) PB_Cwarn.o: $(CC) -c $(CDEFS) $(CCFLAGS) -I../SRC -o PB_Cwarn.o -DTestingPblas ../SRC/PTOOLS/PB_Cwarn.c PB_Cabort.o: $(CC) -c $(CDEFS) $(CCFLAGS) -I../SRC -o PB_Cabort.o -DTestingPblas ../SRC/PTOOLS/PB_Cabort.c PblasErrorHandler: PB_Cwarn.o PB_Cwarn.o $(spb1tst) : ../../$(SCALAPACKLIB) $(spb1t) $(FCLOADER) $(FCLOADFLAGS) -o $(spb1tst) $(spb1t) ../../$(SCALAPACKLIB) $(LIBS) $(dpb1tst) : ../../$(SCALAPACKLIB) $(dpb1t) $(FCLOADER) $(FCLOADFLAGS) -o $(dpb1tst) $(dpb1t) ../../$(SCALAPACKLIB) $(LIBS) $(cpb1tst) : ../../$(SCALAPACKLIB) $(cpb1t) $(FCLOADER) $(FCLOADFLAGS) -o $(cpb1tst) $(cpb1t) ../../$(SCALAPACKLIB) $(LIBS) $(zpb1tst) : ../../$(SCALAPACKLIB) $(zpb1t) $(FCLOADER) $(FCLOADFLAGS) -o $(zpb1tst) $(zpb1t) ../../$(SCALAPACKLIB) $(LIBS) $(spb2tst) : ../../$(SCALAPACKLIB) $(spb2t) $(FCLOADER) $(FCLOADFLAGS) -o $(spb2tst) $(spb2t) ../../$(SCALAPACKLIB) $(LIBS) $(dpb2tst) : ../../$(SCALAPACKLIB) $(dpb2t) $(FCLOADER) $(FCLOADFLAGS) -o $(dpb2tst) $(dpb2t) ../../$(SCALAPACKLIB) $(LIBS) $(cpb2tst) : ../../$(SCALAPACKLIB) $(cpb2t) $(FCLOADER) $(FCLOADFLAGS) -o $(cpb2tst) $(cpb2t) ../../$(SCALAPACKLIB) $(LIBS) $(zpb2tst) : ../../$(SCALAPACKLIB) $(zpb2t) $(FCLOADER) $(FCLOADFLAGS) -o $(zpb2tst) $(zpb2t) ../../$(SCALAPACKLIB) $(LIBS) $(spb3tst) : ../../$(SCALAPACKLIB) $(spb3t) $(FCLOADER) $(FCLOADFLAGS) -o $(spb3tst) $(spb3t) ../../$(SCALAPACKLIB) $(LIBS) $(dpb3tst) : ../../$(SCALAPACKLIB) $(dpb3t) $(FCLOADER) $(FCLOADFLAGS) -o $(dpb3tst) $(dpb3t) ../../$(SCALAPACKLIB) $(LIBS) $(cpb3tst) : ../../$(SCALAPACKLIB) $(cpb3t) $(FCLOADER) $(FCLOADFLAGS) -o $(cpb3tst) $(cpb3t) ../../$(SCALAPACKLIB) $(LIBS) $(zpb3tst) : ../../$(SCALAPACKLIB) $(zpb3t) $(FCLOADER) $(FCLOADFLAGS) -o $(zpb3tst) $(zpb3t) ../../$(SCALAPACKLIB) $(LIBS) clean : rm -f *.o x* slamch.o: $(FC) -c $(NOOPT) slamch.f dlamch.o: $(FC) -c $(NOOPT) dlamch.f .f.o : ; $(FC) -c $(FCFLAGS) $*.f .c.o : ; $(CC) -c $(CDEFS) $(CCFLAGS) -DTestingPblas $*.c scalapack-2.0.2/PBLAS/TESTING/pblastst.f000644 000766 000024 00000533171 10363532303 017702 0ustar00juliestaff000000 000000 SUBROUTINE PVDIMCHK( ICTXT, NOUT, N, MATRIX, IX, JX, DESCX, INCX, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 MATRIX INTEGER ICTXT, INCX, INFO, IX, JX, N, NOUT * .. * .. Array Arguments .. INTEGER DESCX( * ) * .. * * Purpose * ======= * * PVDIMCHK checks the validity of the input test dimensions. In case of * an invalid parameter or discrepancy between the parameters, this rou- * tine displays error messages and returns an non-zero error code in * INFO. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * MATRIX (global input) CHARACTER*1 * On entry, MATRIX specifies the one character matrix identi- * fier. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, when INFO is zero, no error has been detected, * otherwise an error has been detected. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. Executable Statements .. * INFO = 0 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.LT.0 ) THEN INFO = 1 ELSE IF( N.EQ.0 ) THEN IF( DESCX( M_ ).LT.0 ) $ INFO = 1 IF( DESCX( N_ ).LT.0 ) $ INFO = 1 ELSE IF( INCX.EQ.DESCX( M_ ) .AND. $ DESCX( N_ ).LT.( JX+N-1 ) ) THEN INFO = 1 ELSE IF( INCX.EQ.1 .AND. INCX.NE.DESCX( M_ ) .AND. $ DESCX( M_ ).LT.( IX+N-1 ) ) THEN INFO = 1 ELSE IF( IX.GT.DESCX( M_ ) ) THEN INFO = 1 ELSE IF( JX.GT.DESCX( N_ ) ) THEN INFO = 1 END IF END IF END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) MATRIX WRITE( NOUT, FMT = 9998 ) N, MATRIX, IX, MATRIX, JX, MATRIX, $ INCX WRITE( NOUT, FMT = 9997 ) MATRIX, DESCX( M_ ), MATRIX, $ DESCX( N_ ) WRITE( NOUT, FMT = * ) END IF END IF * 9999 FORMAT( 'Incompatible arguments for matrix ', A1, ':' ) 9998 FORMAT( 'N = ', I6, ', I', A1, ' = ', I6, ', J', A1, ' = ', $ I6, ',INC', A1, ' = ', I6 ) 9997 FORMAT( 'DESC', A1, '( M_ ) = ', I6, ', DESC', A1, '( N_ ) = ', $ I6, '.' ) * RETURN * * End of PVDIMCHK * END SUBROUTINE PMDIMCHK( ICTXT, NOUT, M, N, MATRIX, IA, JA, DESCA, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 MATRIX INTEGER ICTXT, INFO, IA, JA, M, N, NOUT * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PMDIMCHK checks the validity of the input test dimensions. In case of * an invalid parameter or discrepancy between the parameters, this rou- * tine displays error messages and returns an non-zero error code in * INFO. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * MATRIX (global input) CHARACTER*1 * On entry, MATRIX specifies the one character matrix identi- * fier. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, when INFO is zero, no error has been detected, * otherwise an error has been detected. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. Executable Statements .. * INFO = 0 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( ( M.LT.0 ).OR.( N.LT.0 ) ) THEN INFO = 1 ELSE IF( ( M.EQ.0 ).OR.( N.EQ.0 ) )THEN IF( DESCA( M_ ).LT.0 ) $ INFO = 1 IF( DESCA( N_ ).LT.0 ) $ INFO = 1 ELSE IF( DESCA( M_ ).LT.( IA+M-1 ) ) $ INFO = 1 IF( DESCA( N_ ).LT.( JA+N-1 ) ) $ INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9999 ) MATRIX WRITE( NOUT, FMT = 9998 ) M, N, MATRIX, IA, MATRIX, JA WRITE( NOUT, FMT = 9997 ) MATRIX, DESCA( M_ ), MATRIX, $ DESCA( N_ ) WRITE( NOUT, FMT = * ) END IF END IF * 9999 FORMAT( 'Incompatible arguments for matrix ', A1, ':' ) 9998 FORMAT( 'M = ', I6, ', N = ', I6, ', I', A1, ' = ', I6, $ ', J', A1, ' = ', I6 ) 9997 FORMAT( 'DESC', A1, '( M_ ) = ', I6, ', DESC', A1, '( N_ ) = ', $ I6, '.' ) * RETURN * * End of PMDIMCHK * END SUBROUTINE PVDESCCHK( ICTXT, NOUT, MATRIX, DESCX, DTX, MX, NX, $ IMBX, INBX, MBX, NBX, RSRCX, CSRCX, INCX, $ MPX, NQX, IPREX, IMIDX, IPOSTX, IGAP, $ GAPMUL, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 MATRIX INTEGER CSRCX, DTX, GAPMUL, ICTXT, IGAP, IMBX, IMIDX, $ INBX, INCX, INFO, IPOSTX, IPREX, MBX, MPX, MX, $ NBX, NOUT, NQX, NX, RSRCX * .. * .. Array Arguments .. INTEGER DESCX( * ) * .. * * Purpose * ======= * * PVDESCCHK checks the validity of the input test parameters and ini- * tializes the descriptor DESCX and the scalar variables MPX, NQX. In * case of an invalid parameter, this routine displays error messages * and return an non-zero error code in INFO. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * MATRIX (global input) CHARACTER*1 * On entry, MATRIX specifies the one character matrix identi- * fier. * * DESCX (global output) INTEGER array * On entry, DESCX is an array of dimension DLEN_. DESCX is the * array descriptor to be set. * * DTYPEX (global input) INTEGER * On entry, DTYPEX specifies the descriptor type. In this ver- * sion, DTYPEX must be BLOCK_CYCLIC_INB_2D. * * MX (global input) INTEGER * On entry, MX specifies the number of rows in the matrix. MX * must be at least zero. * * NX (global input) INTEGER * On entry, NX specifies the number of columns in the matrix. * NX must be at least zero. * * IMBX (global input) INTEGER * On entry, IMBX specifies the row blocking factor used to dis- * tribute the first IMBX rows of the matrix. IMBX must be at * least one. * * INBX (global input) INTEGER * On entry, INBX specifies the column blocking factor used to * distribute the first INBX columns of the matrix. INBX must * be at least one. * * MBX (global input) INTEGER * On entry, MBX specifies the row blocking factor used to dis- * tribute the rows of the matrix. MBX must be at least one. * * NBX (global input) INTEGER * On entry, NBX specifies the column blocking factor used to * distribute the columns of the matrix. NBX must be at least * one. * * RSRCX (global input) INTEGER * On entry, RSRCX specifies the process row in which the first * row of the matrix resides. When RSRCX is -1, the matrix is * row replicated, otherwise RSCRX must be at least zero and * strictly less than NPROW. * * CSRCX (global input) INTEGER * On entry, CSRCX specifies the process column in which the * first column of the matrix resides. When CSRCX is -1, the * matrix is column replicated, otherwise CSCRX must be at least * zero and strictly less than NPCOL. * * INCX (global input) INTEGER * On entry, INCX specifies the global vector increment. INCX * must be one or MX. * * MPX (local output) INTEGER * On exit, MPX is Lr( 1, MX ). * * NQX (local output) INTEGER * On exit, NQX is Lc( 1, NX ). * * IPREX (local output) INTEGER * On exit, IPREX specifies the size of the guard zone to put * before the start of the local padded array. * * IMIDX (local output) INTEGER * On exit, IMIDX specifies the ldx-gap of the guard zone to * put after each column of the local padded array. * * IPOSTX (local output) INTEGER * On exit, IPOSTX specifies the size of the guard zone to put * after the local padded array. * * IGAP (global input) INTEGER * On entry, IGAP specifies the size of the ldx-gap. * * GAPMUL (global input) INTEGER * On entry, GAPMUL is a constant factor controlling the size * of the pre- and post guardzone. * * INFO (global output) INTEGER * On exit, when INFO is zero, no error has been detected, * otherwise an error has been detected. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER LLDX, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_DESCINIT2 * .. * .. External Functions .. INTEGER PB_NUMROC EXTERNAL PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * INFO = 0 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Verify descriptor type DTYPE_ * IF( DTX.NE.BLOCK_CYCLIC_2D_INB ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) MATRIX, 'DTYPE', MATRIX, DTX, $ BLOCK_CYCLIC_2D_INB INFO = 1 END IF * * Verify global matrix dimensions (M_,N_) are correct * IF( MX.LT.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) MATRIX, 'M', MATRIX, MX INFO = 1 ELSE IF( NX.LT.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) MATRIX, 'N', MATRIX, NX INFO = 1 END IF * * Verify if blocking factors (IMB_, INB_) are correct * IF( IMBX.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) MATRIX, 'IMB', MATRIX, IMBX INFO = 1 ELSE IF( INBX.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) MATRIX, 'INB', MATRIX, INBX INFO = 1 END IF * * Verify if blocking factors (MB_, NB_) are correct * IF( MBX.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) MATRIX, 'MB', MATRIX, MBX INFO = 1 ELSE IF( NBX.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) MATRIX, 'NB', MATRIX, NBX INFO = 1 END IF * * Verify if origin process coordinates (RSRC_, CSRC_) are valid * IF( RSRCX.LT.-1 .OR. RSRCX.GE.NPROW ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) MATRIX WRITE( NOUT, FMT = 9990 ) 'RSRC', MATRIX, RSRCX, NPROW END IF INFO = 1 ELSE IF( CSRCX.LT.-1 .OR. CSRCX.GE.NPCOL ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9991 ) MATRIX WRITE( NOUT, FMT = 9990 ) 'CSRC', MATRIX, CSRCX, NPCOL END IF INFO = 1 END IF * * Check input increment value * IF( INCX.NE.1 .AND. INCX.NE.MX ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) MATRIX WRITE( NOUT, FMT = 9988 ) 'INC', MATRIX, INCX, MATRIX, MX END IF INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9987 ) MATRIX WRITE( NOUT, FMT = * ) END IF * ELSE * * Compute local testing leading dimension * MPX = PB_NUMROC( MX, 1, IMBX, MBX, MYROW, RSRCX, NPROW ) NQX = PB_NUMROC( NX, 1, INBX, NBX, MYCOL, CSRCX, NPCOL ) IPREX = MAX( GAPMUL*NBX, MPX ) IMIDX = IGAP IPOSTX = MAX( GAPMUL*NBX, NQX ) LLDX = MAX( 1, MPX ) + IMIDX * CALL PB_DESCINIT2( DESCX, MX, NX, IMBX, INBX, MBX, NBX, RSRCX, $ CSRCX, ICTXT, LLDX, INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9987 ) MATRIX WRITE( NOUT, FMT = * ) END IF END IF * END IF * 9999 FORMAT( 2X, '>> Invalid matrix ', A1, ' descriptor type ', A5, A1, $ ': ', I6, ' should be ', I3, '.' ) 9998 FORMAT( 2X, '>> Invalid matrix ', A1, ' row dimension ', A1, A1, $ ': ', I6, ' should be at least 1.' ) 9997 FORMAT( 2X, '>> Invalid matrix ', A1, ' column dimension ', A1, $ A1, ': ', I6, ' should be at least 1.' ) 9996 FORMAT( 2X, '>> Invalid matrix ', A1, ' first row block size ', $ A3, A1, ': ', I6, ' should be at least 1.' ) 9995 FORMAT( 2X, '>> Invalid matrix ', A1, ' first column block size ', $ A3, A1,': ', I6, ' should be at least 1.' ) 9994 FORMAT( 2X, '>> Invalid matrix ', A1, ' row block size ', A2, A1, $ ': ', I6, ' should be at least 1.' ) 9993 FORMAT( 2X, '>> Invalid matrix ', A1, ' column block size ', A2, $ A1,': ', I6, ' should be at least 1.' ) 9992 FORMAT( 2X, '>> Invalid matrix ', A1, ' row process source:' ) 9991 FORMAT( 2X, '>> Invalid matrix ', A1, ' column process source:' ) 9990 FORMAT( 2X, '>> ', A4, A1, '= ', I6, ' should be >= -1 and < ', $ I6, '.' ) 9989 FORMAT( 2X, '>> Invalid vector ', A1, ' increment:' ) 9988 FORMAT( 2X, '>> ', A3, A1, '= ', I6, ' should be 1 or M', A1, $ ' = ', I6, '.' ) 9987 FORMAT( 2X, '>> Invalid matrix ', A1, ' descriptor: going on to ', $ 'next test case.' ) * RETURN * * End of PVDESCCHK * END SUBROUTINE PMDESCCHK( ICTXT, NOUT, MATRIX, DESCA, DTA, MA, NA, $ IMBA, INBA, MBA, NBA, RSRCA, CSRCA, MPA, $ NQA, IPREA, IMIDA, IPOSTA, IGAP, GAPMUL, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 MATRIX INTEGER CSRCA, DTA, GAPMUL, ICTXT, IGAP, IMBA, IMIDA, $ INBA, INFO, IPOSTA, IPREA, MA, MBA, MPA, NA, $ NBA, NOUT, NQA, RSRCA * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PMDESCCHK checks the validity of the input test parameters and ini- * tializes the descriptor DESCA and the scalar variables MPA, NQA. In * case of an invalid parameter, this routine displays error messages * and return an non-zero error code in INFO. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * MATRIX (global input) CHARACTER*1 * On entry, MATRIX specifies the one character matrix identi- * fier. * * DESCA (global output) INTEGER array * On entry, DESCA is an array of dimension DLEN_. DESCA is the * array descriptor to be set. * * DTYPEA (global input) INTEGER * On entry, DTYPEA specifies the descriptor type. In this ver- * sion, DTYPEA must be BLOCK_CYCLIC_INB_2D. * * MA (global input) INTEGER * On entry, MA specifies the number of rows in the matrix. MA * must be at least zero. * * NA (global input) INTEGER * On entry, NA specifies the number of columns in the matrix. * NA must be at least zero. * * IMBA (global input) INTEGER * On entry, IMBA specifies the row blocking factor used to dis- * tribute the first IMBA rows of the matrix. IMBA must be at * least one. * * INBA (global input) INTEGER * On entry, INBA specifies the column blocking factor used to * distribute the first INBA columns of the matrix. INBA must * be at least one. * * MBA (global input) INTEGER * On entry, MBA specifies the row blocking factor used to dis- * tribute the rows of the matrix. MBA must be at least one. * * NBA (global input) INTEGER * On entry, NBA specifies the column blocking factor used to * distribute the columns of the matrix. NBA must be at least * one. * * RSRCA (global input) INTEGER * On entry, RSRCA specifies the process row in which the first * row of the matrix resides. When RSRCA is -1, the matrix is * row replicated, otherwise RSCRA must be at least zero and * strictly less than NPROW. * * CSRCA (global input) INTEGER * On entry, CSRCA specifies the process column in which the * first column of the matrix resides. When CSRCA is -1, the * matrix is column replicated, otherwise CSCRA must be at least * zero and strictly less than NPCOL. * * MPA (local output) INTEGER * On exit, MPA is Lr( 1, MA ). * * NQA (local output) INTEGER * On exit, NQA is Lc( 1, NA ). * * IPREA (local output) INTEGER * On exit, IPREA specifies the size of the guard zone to put * before the start of the local padded array. * * IMIDA (local output) INTEGER * On exit, IMIDA specifies the lda-gap of the guard zone to * put after each column of the local padded array. * * IPOSTA (local output) INTEGER * On exit, IPOSTA specifies the size of the guard zone to put * after the local padded array. * * IGAP (global input) INTEGER * On entry, IGAP specifies the size of the lda-gap. * * GAPMUL (global input) INTEGER * On entry, GAPMUL is a constant factor controlling the size * of the pre- and post guardzone. * * INFO (global output) INTEGER * On exit, when INFO is zero, no error has been detected, * otherwise an error has been detected. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER LLDA, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_DESCINIT2 * .. * .. External Functions .. INTEGER PB_NUMROC EXTERNAL PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * INFO = 0 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Verify descriptor type DTYPE_ * IF( DTA.NE.BLOCK_CYCLIC_2D_INB ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) MATRIX, 'DTYPE', MATRIX, DTA, $ BLOCK_CYCLIC_2D_INB INFO = 1 END IF * * Verify global matrix dimensions (M_,N_) are correct * IF( MA.LT.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) MATRIX, 'M', MATRIX, MA INFO = 1 ELSE IF( NA.LT.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) MATRIX, 'N', MATRIX, NA INFO = 1 END IF * * Verify if blocking factors (IMB_, INB_) are correct * IF( IMBA.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) MATRIX, 'IMB', MATRIX, IMBA INFO = 1 ELSE IF( INBA.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9995 ) MATRIX, 'INB', MATRIX, INBA INFO = 1 END IF * * Verify if blocking factors (MB_, NB_) are correct * IF( MBA.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9994 ) MATRIX, 'MB', MATRIX, MBA INFO = 1 ELSE IF( NBA.LT.1 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9993 ) MATRIX, 'NB', MATRIX, NBA INFO = 1 END IF * * Verify if origin process coordinates (RSRC_, CSRC_) are valid * IF( RSRCA.LT.-1 .OR. RSRCA.GE.NPROW ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9992 ) MATRIX WRITE( NOUT, FMT = 9990 ) 'RSRC', MATRIX, RSRCA, NPROW END IF INFO = 1 ELSE IF( CSRCA.LT.-1 .OR. CSRCA.GE.NPCOL ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9991 ) MATRIX WRITE( NOUT, FMT = 9990 ) 'CSRC', MATRIX, CSRCA, NPCOL END IF INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) MATRIX WRITE( NOUT, FMT = * ) END IF * ELSE * * Compute local testing leading dimension * MPA = PB_NUMROC( MA, 1, IMBA, MBA, MYROW, RSRCA, NPROW ) NQA = PB_NUMROC( NA, 1, INBA, NBA, MYCOL, CSRCA, NPCOL ) IPREA = MAX( GAPMUL*NBA, MPA ) IMIDA = IGAP IPOSTA = MAX( GAPMUL*NBA, NQA ) LLDA = MAX( 1, MPA ) + IMIDA * CALL PB_DESCINIT2( DESCA, MA, NA, IMBA, INBA, MBA, NBA, RSRCA, $ CSRCA, ICTXT, LLDA, INFO ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( INFO.NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9989 ) MATRIX WRITE( NOUT, FMT = * ) END IF END IF * END IF * 9999 FORMAT( 2X, '>> Invalid matrix ', A1, ' descriptor type ', A5, A1, $ ': ', I6, ' should be ', I3, '.' ) 9998 FORMAT( 2X, '>> Invalid matrix ', A1, ' row dimension ', A1, A1, $ ': ', I6, ' should be at least 1.' ) 9997 FORMAT( 2X, '>> Invalid matrix ', A1, ' column dimension ', A1, $ A1, ': ', I6, ' should be at least 1.' ) 9996 FORMAT( 2X, '>> Invalid matrix ', A1, ' first row block size ', $ A3, A1, ': ', I6, ' should be at least 1.' ) 9995 FORMAT( 2X, '>> Invalid matrix ', A1, ' first column block size ', $ A3, A1,': ', I6, ' should be at least 1.' ) 9994 FORMAT( 2X, '>> Invalid matrix ', A1, ' row block size ', A2, A1, $ ': ', I6, ' should be at least 1.' ) 9993 FORMAT( 2X, '>> Invalid matrix ', A1, ' column block size ', A2, $ A1,': ', I6, ' should be at least 1.' ) 9992 FORMAT( 2X, '>> Invalid matrix ', A1, ' row process source:' ) 9991 FORMAT( 2X, '>> Invalid matrix ', A1, ' column process source:' ) 9990 FORMAT( 2X, '>> ', A4, A1, '= ', I6, ' should be >= -1 and < ', $ I6, '.' ) 9989 FORMAT( 2X, '>> Invalid matrix ', A1, ' descriptor: going on to ', $ 'next test case.' ) * RETURN * * End of PMDESCCHK * END SUBROUTINE PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INFOT, NOUT CHARACTER*(*) SNAME * .. * * Purpose * ======= * * PCHKPBE tests whether a PBLAS routine has detected an error when it * should. This routine does a global operation to ensure all processes * have detected this error. If an error has been detected an error * message is displayed. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * INFOT (global input) INTEGER * On entry, INFOT specifies the position of the wrong argument. * If the PBLAS error handler is called, INFO will be set to * -INFOT. This routine verifies if the error was reported by * all processes by doing a global sum, and assert the result to * be NPROW * NPCOL. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER GERR, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. Common Blocks .. INTEGER INFO, NBLOG COMMON /INFOC/INFO, NBLOG * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * GERR = 0 IF( INFO.NE.-INFOT ) $ GERR = 1 * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, GERR, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN IF( GERR.EQ.( NPROW * NPCOL ) ) THEN WRITE( NOUT, FMT = 9999 ) SNAME, INFO, -INFOT END IF END IF * 9999 FORMAT( 1X, A7, ': *** ERROR *** ERROR CODE RETURNED = ', I6, $ ' SHOULD HAVE BEEN ', I6 ) * RETURN * * End of PCHKPBE * END REAL FUNCTION PSDIFF( X, Y ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL X, Y * .. * * Purpose * ======= * * PSDIFF returns the scalar difference X - Y. Similarly to the * BLAS tester, this routine allows for the possibility of computing a * more accurate difference if necessary. * * Arguments * ========= * * X (input) REAL * The real scalar X. * * Y (input) REAL * The real scalar Y. * * ===================================================================== * * .. Executable Statements .. * PSDIFF = X - Y * RETURN * * End of PSDIFF * END * DOUBLE PRECISION FUNCTION PDDIFF( X, Y ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. * * Purpose * ======= * * PDDIFF returns the scalar difference X - Y. Similarly to the * BLAS tester, this routine allows for the possibility of computing a * more accurate difference if necessary. * * Arguments * ========= * * X (input) DOUBLE PRECISION * The real scalar X. * * Y (input) DOUBLE PRECISION * The real scalar Y. * * ===================================================================== * * .. Executable Statements .. * PDDIFF = X - Y * RETURN * * End of PDDIFF * END SUBROUTINE PXERBLA( ICTXT, SRNAME, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INFO * .. * .. Array Arguments .. CHARACTER*(*) SRNAME * .. * * Purpose * ======= * * PXERBLA is an error handler for the ScaLAPACK routines. It is called * by a ScaLAPACK routine if an input parameter has an invalid value. A * message is printed. Installers may consider modifying this routine in * order to call system-specific exception-handling facilities. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * SRNAME (global input) CHARACTER*(*) * On entry, SRNAME specifies the name of the routine which cal- * ling PXERBLA. * * INFO (global input) INTEGER * On entry, INFO specifies the position of the invalid parame- * ter in the parameter list of the calling routine. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * WRITE( *, FMT = 9999 ) MYROW, MYCOL, SRNAME, INFO * 9999 FORMAT( '{', I5, ',', I5, '}: On entry to ', A, $ ' parameter number ', I4, ' had an illegal value' ) * RETURN * * End of PXERBLA * END LOGICAL FUNCTION LSAME( CA, CB ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER CA, CB * .. * * Purpose * ======= * * LSAME returns .TRUE. if CA is the same letter as CB regardless of * case. * * Arguments * ========= * * CA (input) CHARACTER*1 * CB (input) CHARACTER*1 * CA and CB specify the single characters to be compared. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ICHAR * .. * .. Local Scalars .. INTEGER INTA, INTB, ZCODE * .. * .. Executable Statements .. * * Test if the characters are equal * LSAME = CA.EQ.CB IF( LSAME ) $ RETURN * * Now test for equivalence if both characters are alphabetic. * ZCODE = ICHAR( 'Z' ) * * Use 'Z' rather than 'A' so that ASCII can be detected on Prime * machines, on which ICHAR returns a value with bit 8 set. * ICHAR('A') on Prime machines returns 193 which is the same as * ICHAR('A') on an EBCDIC machine. * INTA = ICHAR( CA ) INTB = ICHAR( CB ) * IF( ZCODE.EQ.90 .OR. ZCODE.EQ.122 ) THEN * * ASCII is assumed - ZCODE is the ASCII code of either lower or * upper case 'Z'. * IF( INTA.GE.97 .AND. INTA.LE.122 ) INTA = INTA - 32 IF( INTB.GE.97 .AND. INTB.LE.122 ) INTB = INTB - 32 * ELSE IF( ZCODE.EQ.233 .OR. ZCODE.EQ.169 ) THEN * * EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or * upper case 'Z'. * IF( INTA.GE.129 .AND. INTA.LE.137 .OR. $ INTA.GE.145 .AND. INTA.LE.153 .OR. $ INTA.GE.162 .AND. INTA.LE.169 ) INTA = INTA + 64 IF( INTB.GE.129 .AND. INTB.LE.137 .OR. $ INTB.GE.145 .AND. INTB.LE.153 .OR. $ INTB.GE.162 .AND. INTB.LE.169 ) INTB = INTB + 64 * ELSE IF( ZCODE.EQ.218 .OR. ZCODE.EQ.250 ) THEN * * ASCII is assumed, on Prime machines - ZCODE is the ASCII code * plus 128 of either lower or upper case 'Z'. * IF( INTA.GE.225 .AND. INTA.LE.250 ) INTA = INTA - 32 IF( INTB.GE.225 .AND. INTB.LE.250 ) INTB = INTB - 32 END IF LSAME = INTA.EQ.INTB * * RETURN * * End of LSAME * END LOGICAL FUNCTION LSAMEN( N, CA, CB ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER*( * ) CA, CB INTEGER N * .. * * Purpose * ======= * * LSAMEN tests if the first N letters of CA are the same as the * first N letters of CB, regardless of case. * LSAMEN returns .TRUE. if CA and CB are equivalent except for case * and .FALSE. otherwise. LSAMEN also returns .FALSE. if LEN( CA ) * or LEN( CB ) is less than N. * * Arguments * ========= * * N (input) INTEGER * The number of characters in CA and CB to be compared. * * CA (input) CHARACTER*(*) * CB (input) CHARACTER*(*) * CA and CB specify two character strings of length at least N. * Only the first N characters of each string will be accessed. * * ===================================================================== * * .. Local Scalars .. INTEGER I * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC LEN * .. * .. Executable Statements .. * LSAMEN = .FALSE. IF( LEN( CA ).LT.N .OR. LEN( CB ).LT.N ) $ GO TO 20 * * Do for each character in the two strings. * DO 10 I = 1, N * * Test if the characters are equal using LSAME. * IF( .NOT.LSAME( CA( I: I ), CB( I: I ) ) ) $ GO TO 20 * 10 CONTINUE LSAMEN = .TRUE. * 20 CONTINUE RETURN * * End of LSAMEN * END SUBROUTINE ICOPY( N, SX, INCX, SY, INCY ) * * -- LAPACK auxiliary test routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * February 29, 1992 * * .. Scalar Arguments .. INTEGER INCX, INCY, N * .. * .. Array Arguments .. INTEGER SX( * ), SY( * ) * .. * * Purpose * ======= * * ICOPY copies an integer vector x to an integer vector y. * Uses unrolled loops for increments equal to 1. * * Arguments * ========= * * N (input) INTEGER * The length of the vectors SX and SY. * * SX (input) INTEGER array, dimension (1+(N-1)*abs(INCX)) * The vector X. * * INCX (input) INTEGER * The spacing between consecutive elements of SX. * * SY (output) INTEGER array, dimension (1+(N-1)*abs(INCY)) * The vector Y. * * INCY (input) INTEGER * The spacing between consecutive elements of SY. * * ===================================================================== * * .. Local Scalars .. INTEGER I, IX, IY, M, MP1 * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * IF( N.LE.0 ) $ RETURN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) $ GO TO 20 * * Code for unequal increments or equal increments not equal to 1 * IX = 1 IY = 1 IF( INCX.LT.0 ) $ IX = ( -N+1 )*INCX + 1 IF( INCY.LT.0 ) $ IY = ( -N+1 )*INCY + 1 DO 10 I = 1, N SY( IY ) = SX( IX ) IX = IX + INCX IY = IY + INCY 10 CONTINUE RETURN * * Code for both increments equal to 1 * * Clean-up loop * 20 CONTINUE M = MOD( N, 7 ) IF( M.EQ.0 ) $ GO TO 40 DO 30 I = 1, M SY( I ) = SX( I ) 30 CONTINUE IF( N.LT.7 ) $ RETURN 40 CONTINUE MP1 = M + 1 DO 50 I = MP1, N, 7 SY( I ) = SX( I ) SY( I+1 ) = SX( I+1 ) SY( I+2 ) = SX( I+2 ) SY( I+3 ) = SX( I+3 ) SY( I+4 ) = SX( I+4 ) SY( I+5 ) = SX( I+5 ) SY( I+6 ) = SX( I+6 ) 50 CONTINUE RETURN * * End of ICOPY * END INTEGER FUNCTION PB_NOABORT( CINFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER CINFO * .. * * Purpose * ======= * * PB_NOABORT transmits the info parameter of a PBLAS routine to the * tester and tells the PBLAS error handler to avoid aborting on erro- * neous input arguments. * * Notes * ===== * * This routine is necessary because of the CRAY C fortran interface * and the fact that the usual PBLAS error handler routine has been * initially written in C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Common Blocks .. INTEGER INFO, NBLOG, NOUT LOGICAL ABRTFLG COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Executable Statements .. * INFO = CINFO IF( ABRTFLG ) THEN PB_NOABORT = 0 ELSE PB_NOABORT = 1 END IF * RETURN * * End of PB_NOABORT * END SUBROUTINE PB_INFOG2L( I, J, DESC, NPROW, NPCOL, MYROW, MYCOL, II, $ JJ, PROW, PCOL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER I, II, J, JJ, MYCOL, MYROW, NPCOL, NPROW, PCOL, $ PROW * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * PB_INFOG2L computes the starting local index II, JJ corresponding to * the submatrix starting globally at the entry pointed by I, J. This * routine returns the coordinates in the grid of the process owning the * matrix entry of global indexes I, J, namely PROW and PCOL. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * I (global input) INTEGER * On entry, I specifies the global starting row index of the * submatrix. I must at least one. * * J (global input) INTEGER * On entry, J specifies the global starting column index of * the submatrix. J must at least one. * * DESC (global and local input) INTEGER array * On entry, DESC is an integer array of dimension DLEN_. This * is the array descriptor of the underlying matrix. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process columns * over which the matrix is distributed. NPCOL must be at least * one. * * MYROW (local input) INTEGER * On entry, MYROW specifies the row coordinate of the process * whose local index II is determined. MYROW must be at least * zero and strictly less than NPROW. * * MYCOL (local input) INTEGER * On entry, MYCOL specifies the column coordinate of the pro- * cess whose local index JJ is determined. MYCOL must be at * least zero and strictly less than NPCOL. * * II (local output) INTEGER * On exit, II specifies the local starting row index of the * submatrix. On exit, II is at least one. * * JJ (local output) INTEGER * On exit, JJ specifies the local starting column index of the * submatrix. On exit, JJ is at least one. * * PROW (global output) INTEGER * On exit, PROW specifies the row coordinate of the process * that possesses the first row of the submatrix. On exit, PROW * is -1 if DESC( RSRC_ ) is -1 on input, and, at least zero * and strictly less than NPROW otherwise. * * PCOL (global output) INTEGER * On exit, PCOL specifies the column coordinate of the process * that possesses the first column of the submatrix. On exit, * PCOL is -1 if DESC( CSRC_ ) is -1 on input, and, at least * zero and strictly less than NPCOL otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER CSRC, I1, ILOCBLK, IMB, INB, J1, MB, MYDIST, $ NB, NBLOCKS, RSRC * .. * .. Local Arrays .. INTEGER DESC2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL PB_DESCTRANS * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESC, DESC2 ) * IMB = DESC2( IMB_ ) PROW = DESC2( RSRC_ ) * * Has every process row I ? * IF( ( PROW.EQ.-1 ).OR.( NPROW.EQ.1 ) ) THEN * II = I * ELSE IF( I.LE.IMB ) THEN * * I is in range of first block * IF( MYROW.EQ.PROW ) THEN II = I ELSE II = 1 END IF * ELSE * * I is not in first block of matrix, figure out who has it. * RSRC = PROW MB = DESC2( MB_ ) * IF( MYROW.EQ.RSRC ) THEN * NBLOCKS = ( I - IMB - 1 ) / MB + 1 PROW = PROW + NBLOCKS PROW = PROW - ( PROW / NPROW ) * NPROW * ILOCBLK = NBLOCKS / NPROW * IF( ILOCBLK.GT.0 ) THEN IF( ( ILOCBLK*NPROW ).GE.NBLOCKS ) THEN IF( MYROW.EQ.PROW ) THEN II = I + ( ILOCBLK - NBLOCKS ) * MB ELSE II = IMB + ( ILOCBLK - 1 ) * MB + 1 END IF ELSE II = IMB + ILOCBLK * MB + 1 END IF ELSE II = IMB + 1 END IF * ELSE * I1 = I - IMB NBLOCKS = ( I1 - 1 ) / MB + 1 PROW = PROW + NBLOCKS PROW = PROW - ( PROW / NPROW ) * NPROW * MYDIST = MYROW - RSRC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROW * ILOCBLK = NBLOCKS / NPROW * IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPROW IF( MYDIST.LT.0 ) THEN II = MB + ILOCBLK * MB + 1 ELSE IF( MYROW.EQ.PROW ) THEN II = I1 + ( ILOCBLK - NBLOCKS + 1 ) * MB ELSE II = ILOCBLK * MB + 1 END IF END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN II = MB + 1 ELSE IF( MYROW.EQ.PROW ) THEN II = I1 + ( 1 - NBLOCKS ) * MB ELSE II = 1 END IF END IF END IF * END IF * INB = DESC2( INB_ ) PCOL = DESC2( CSRC_ ) * * Has every process column J ? * IF( ( PCOL.EQ.-1 ).OR.( NPCOL.EQ.1 ) ) THEN * JJ = J * ELSE IF( J.LE.INB ) THEN * * J is in range of first block * IF( MYCOL.EQ.PCOL ) THEN JJ = J ELSE JJ = 1 END IF * ELSE * * J is not in first block of matrix, figure out who has it. * CSRC = PCOL NB = DESC2( NB_ ) * IF( MYCOL.EQ.CSRC ) THEN * NBLOCKS = ( J - INB - 1 ) / NB + 1 PCOL = PCOL + NBLOCKS PCOL = PCOL - ( PCOL / NPCOL ) * NPCOL * ILOCBLK = NBLOCKS / NPCOL * IF( ILOCBLK.GT.0 ) THEN IF( ( ILOCBLK*NPCOL ).GE.NBLOCKS ) THEN IF( MYCOL.EQ.PCOL ) THEN JJ = J + ( ILOCBLK - NBLOCKS ) * NB ELSE JJ = INB + ( ILOCBLK - 1 ) * NB + 1 END IF ELSE JJ = INB + ILOCBLK * NB + 1 END IF ELSE JJ = INB + 1 END IF * ELSE * J1 = J - INB NBLOCKS = ( J1 - 1 ) / NB + 1 PCOL = PCOL + NBLOCKS PCOL = PCOL - ( PCOL / NPCOL ) * NPCOL * MYDIST = MYCOL - CSRC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPCOL * ILOCBLK = NBLOCKS / NPCOL * IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPCOL IF( MYDIST.LT.0 ) THEN JJ = NB + ILOCBLK * NB + 1 ELSE IF( MYCOL.EQ.PCOL ) THEN JJ = J1 + ( ILOCBLK - NBLOCKS + 1 ) * NB ELSE JJ = ILOCBLK * NB + 1 END IF END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN JJ = NB + 1 ELSE IF( MYCOL.EQ.PCOL ) THEN JJ = J1 + ( 1 - NBLOCKS ) * NB ELSE JJ = 1 END IF END IF END IF * END IF * RETURN * * End of PB_INFOG2L * END SUBROUTINE PB_AINFOG2L( M, N, I, J, DESC, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, II, JJ, PROW, $ PCOL, RPROW, RPCOL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER I, II, IMB1, INB1, J, JJ, M, MP, MYCOL, MYROW, $ N, NPCOL, NPROW, NQ, PCOL, PROW, RPCOL, RPROW * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * PB_AINFOG2L computes the starting local row and column indexes II, * JJ corresponding to the submatrix starting globally at the entry * pointed by I, J. This routine returns the coordinates in the grid of * the process owning the matrix entry of global indexes I, J, namely * PROW and PCOL. In addition, this routine computes the quantities MP * and NQ, which are respectively the local number of rows and columns * owned by the process of coordinate MYROW, MYCOL corresponding to the * global submatrix A(I:I+M-1,J:J+N-1). Finally, the size of the first * partial block and the relative process coordinates are also returned * respectively in IMB, INB and RPROW, RPCOL. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the global number of rows of the subma- * trix. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of columns of the * submatrix. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global starting row index of the * submatrix. I must at least one. * * J (global input) INTEGER * On entry, J specifies the global starting column index of * the submatrix. J must at least one. * * DESC (global and local input) INTEGER array * On entry, DESC is an integer array of dimension DLEN_. This * is the array descriptor of the underlying matrix. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process columns * over which the matrix is distributed. NPCOL must be at least * one. * * MYROW (local input) INTEGER * On entry, MYROW specifies the row coordinate of the process * whose local index II is determined. MYROW must be at least * zero and strictly less than NPROW. * * MYCOL (local input) INTEGER * On entry, MYCOL specifies the column coordinate of the pro- * cess whose local index JJ is determined. MYCOL must be at * least zero and strictly less than NPCOL. * * IMB1 (global output) INTEGER * On exit, IMB1 specifies the number of rows of the upper left * block of the submatrix. On exit, IMB1 is less or equal than * M and greater or equal than MIN( 1, M ). * * INB1 (global output) INTEGER * On exit, INB1 specifies the number of columns of the upper * left block of the submatrix. On exit, INB1 is less or equal * than N and greater or equal than MIN( 1, N ). * * MP (local output) INTEGER * On exit, MP specifies the local number of rows of the subma- * trix, that the processes of row coordinate MYROW own. MP is * at least zero. * * NQ (local output) INTEGER * On exit, NQ specifies the local number of columns of the * submatrix, that the processes of column coordinate MYCOL * own. NQ is at least zero. * * II (local output) INTEGER * On exit, II specifies the local starting row index of the * submatrix. On exit, II is at least one. * * JJ (local output) INTEGER * On exit, JJ specifies the local starting column index of * the submatrix. On exit, II is at least one. * * PROW (global output) INTEGER * On exit, PROW specifies the row coordinate of the process * that possesses the first row of the submatrix. On exit, PROW * is -1 if DESC(RSRC_) is -1 on input, and, at least zero and * strictly less than NPROW otherwise. * * PCOL (global output) INTEGER * On exit, PCOL specifies the column coordinate of the process * that possesses the first column of the submatrix. On exit, * PCOL is -1 if DESC(CSRC_) is -1 on input, and, at least zero * and strictly less than NPCOL otherwise. * * RPROW (global output) INTEGER * On exit, RPROW specifies the relative row coordinate of the * process that possesses the first row I of the submatrix. On * exit, RPROW is -1 if DESC(RSRC_) is -1 on input, and, at * least zero and strictly less than NPROW otherwise. * * RPCOL (global output) INTEGER * On exit, RPCOL specifies the relative column coordinate of * the process that possesses the first column J of the subma- * trix. On exit, RPCOL is -1 if DESC(CSRC_) is -1 on input, * and, at least zero and strictly less than NPCOL otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER CSRC, I1, ILOCBLK, J1, M1, MB, MYDIST, N1, NB, $ NBLOCKS, RSRC * .. * .. Local Arrays .. INTEGER DESC2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESC, DESC2 ) * MB = DESC2( MB_ ) IMB1 = DESC2( IMB_ ) RSRC = DESC2( RSRC_ ) * IF( ( RSRC.EQ.-1 ).OR.( NPROW.EQ.1 ) ) THEN * II = I IMB1 = IMB1 - I + 1 IF( IMB1.LE.0 ) $ IMB1 = ( ( -IMB1 ) / MB + 1 ) * MB + IMB1 IMB1 = MIN( IMB1, M ) MP = M PROW = RSRC RPROW = 0 * ELSE * * Figure out PROW, II and IMB1 first * IF( I.LE.IMB1 ) THEN * PROW = RSRC * IF( MYROW.EQ.PROW ) THEN II = I ELSE II = 1 END IF * IMB1 = IMB1 - I + 1 * ELSE * I1 = I - IMB1 - 1 NBLOCKS = I1 / MB + 1 PROW = RSRC + NBLOCKS PROW = PROW - ( PROW / NPROW ) * NPROW * IF( MYROW.EQ.RSRC ) THEN * ILOCBLK = NBLOCKS / NPROW * IF( ILOCBLK.GT.0 ) THEN IF( ( ILOCBLK*NPROW ).GE.NBLOCKS ) THEN IF( MYROW.EQ.PROW ) THEN II = I + ( ILOCBLK - NBLOCKS ) * MB ELSE II = IMB1 + ( ILOCBLK - 1 ) * MB + 1 END IF ELSE II = IMB1 + ILOCBLK * MB + 1 END IF ELSE II = IMB1 + 1 END IF * ELSE * MYDIST = MYROW - RSRC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROW * ILOCBLK = NBLOCKS / NPROW * IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPROW IF( MYDIST.LT.0 ) THEN II = ( ILOCBLK + 1 ) * MB + 1 ELSE IF( MYROW.EQ.PROW ) THEN II = I1 + ( ILOCBLK - NBLOCKS + 1 ) * MB + 1 ELSE II = ILOCBLK * MB + 1 END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN II = MB + 1 ELSE IF( MYROW.EQ.PROW ) THEN II = I1 + ( 1 - NBLOCKS ) * MB + 1 ELSE II = 1 END IF END IF END IF * IMB1 = NBLOCKS * MB - I1 * END IF * * Figure out MP * IF( M.LE.IMB1 ) THEN * IF( MYROW.EQ.PROW ) THEN MP = M ELSE MP = 0 END IF * ELSE * M1 = M - IMB1 NBLOCKS = M1 / MB + 1 * IF( MYROW.EQ.PROW ) THEN ILOCBLK = NBLOCKS / NPROW IF( ILOCBLK.GT.0 ) THEN IF( ( NBLOCKS - ILOCBLK * NPROW ).GT.0 ) THEN MP = IMB1 + ILOCBLK * MB ELSE MP = M + MB * ( ILOCBLK - NBLOCKS ) END IF ELSE MP = IMB1 END IF ELSE MYDIST = MYROW - PROW IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROW ILOCBLK = NBLOCKS / NPROW IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPROW IF( MYDIST.LT.0 ) THEN MP = ( ILOCBLK + 1 ) * MB ELSE IF( MYDIST.GT.0 ) THEN MP = ILOCBLK * MB ELSE MP = M1 + MB * ( ILOCBLK - NBLOCKS + 1 ) END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN MP = MB ELSE IF( MYDIST.GT.0 ) THEN MP = 0 ELSE MP = M1 + MB * ( 1 - NBLOCKS ) END IF END IF END IF * END IF * IMB1 = MIN( IMB1, M ) RPROW = MYROW - PROW IF( RPROW.LT.0 ) $ RPROW = RPROW + NPROW * END IF * NB = DESC2( NB_ ) INB1 = DESC2( INB_ ) CSRC = DESC2( CSRC_ ) * IF( ( CSRC.EQ.-1 ).OR.( NPCOL.EQ.1 ) ) THEN * JJ = J INB1 = INB1 - I + 1 IF( INB1.LE.0 ) $ INB1 = ( ( -INB1 ) / NB + 1 ) * NB + INB1 INB1 = MIN( INB1, N ) NQ = N PCOL = CSRC RPCOL = 0 * ELSE * * Figure out PCOL, JJ and INB1 first * IF( J.LE.INB1 ) THEN * PCOL = CSRC * IF( MYCOL.EQ.PCOL ) THEN JJ = J ELSE JJ = 1 END IF * INB1 = INB1 - J + 1 * ELSE * J1 = J - INB1 - 1 NBLOCKS = J1 / NB + 1 PCOL = CSRC + NBLOCKS PCOL = PCOL - ( PCOL / NPCOL ) * NPCOL * IF( MYCOL.EQ.CSRC ) THEN * ILOCBLK = NBLOCKS / NPCOL * IF( ILOCBLK.GT.0 ) THEN IF( ( ILOCBLK*NPCOL ).GE.NBLOCKS ) THEN IF( MYCOL.EQ.PCOL ) THEN JJ = J + ( ILOCBLK - NBLOCKS ) * NB ELSE JJ = INB1 + ( ILOCBLK - 1 ) * NB + 1 END IF ELSE JJ = INB1 + ILOCBLK * NB + 1 END IF ELSE JJ = INB1 + 1 END IF * ELSE * MYDIST = MYCOL - CSRC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPCOL * ILOCBLK = NBLOCKS / NPCOL * IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPCOL IF( MYDIST.LT.0 ) THEN JJ = ( ILOCBLK + 1 ) * NB + 1 ELSE IF( MYCOL.EQ.PCOL ) THEN JJ = J1 + ( ILOCBLK - NBLOCKS + 1 ) * NB + 1 ELSE JJ = ILOCBLK * NB + 1 END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN JJ = NB + 1 ELSE IF( MYCOL.EQ.PCOL ) THEN JJ = J1 + ( 1 - NBLOCKS ) * NB + 1 ELSE JJ = 1 END IF END IF END IF * INB1 = NBLOCKS * NB - J1 * END IF * * Figure out NQ * IF( N.LE.INB1 ) THEN * IF( MYCOL.EQ.PCOL ) THEN NQ = N ELSE NQ = 0 END IF * ELSE * N1 = N - INB1 NBLOCKS = N1 / NB + 1 * IF( MYCOL.EQ.PCOL ) THEN ILOCBLK = NBLOCKS / NPCOL IF( ILOCBLK.GT.0 ) THEN IF( ( NBLOCKS - ILOCBLK * NPCOL ).GT.0 ) THEN NQ = INB1 + ILOCBLK * NB ELSE NQ = N + NB * ( ILOCBLK - NBLOCKS ) END IF ELSE NQ = INB1 END IF ELSE MYDIST = MYCOL - PCOL IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPCOL ILOCBLK = NBLOCKS / NPCOL IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPCOL IF( MYDIST.LT.0 ) THEN NQ = ( ILOCBLK + 1 ) * NB ELSE IF( MYDIST.GT.0 ) THEN NQ = ILOCBLK * NB ELSE NQ = N1 + NB * ( ILOCBLK - NBLOCKS + 1 ) END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN NQ = NB ELSE IF( MYDIST.GT.0 ) THEN NQ = 0 ELSE NQ = N1 + NB * ( 1 - NBLOCKS ) END IF END IF END IF * END IF * INB1 = MIN( INB1, N ) RPCOL = MYCOL - PCOL IF( RPCOL.LT.0 ) $ RPCOL = RPCOL + NPCOL * END IF * RETURN * * End of PB_AINFOG2L * END INTEGER FUNCTION PB_NUMROC( N, I, INB, NB, PROC, SRCPROC, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER I, INB, N, NB, NPROCS, PROC, SRCPROC * .. * * Purpose * ======= * * PB_NUMROC returns the local number of matrix rows/columns process * PROC will get if we give out N rows/columns starting from global in- * dex I. * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the number of rows/columns being dealt * out. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global index of the matrix entry. * I must be at least one. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * PROC (local input) INTEGER * On entry, PROC specifies the coordinate of the process whose * local portion is determined. PROC must be at least zero and * strictly less than NPROCS. * * SRCPROC (global input) INTEGER * On entry, SRCPROC specifies the coordinate of the process * that possesses the first row or column of the matrix. When * SRCPROC = -1, the data is not distributed but replicated, * otherwise SRCPROC must be at least zero and strictly less * than NPROCS. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the matrix is distributed. NPROCS must * be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I1, ILOCBLK, INB1, MYDIST, N1, NBLOCKS, $ SRCPROC1 * .. * .. Executable Statements .. * IF( ( SRCPROC.EQ.-1 ).OR.( NPROCS.EQ.1 ) ) THEN PB_NUMROC = N RETURN END IF * * Compute coordinate of process owning I and corresponding INB * IF( I.LE.INB ) THEN * * I is in range of first block, i.e SRCPROC owns I. * SRCPROC1 = SRCPROC INB1 = INB - I + 1 * ELSE * * I is not in first block of matrix, figure out who has it * I1 = I - 1 - INB NBLOCKS = I1 / NB + 1 SRCPROC1 = SRCPROC + NBLOCKS SRCPROC1 = SRCPROC1 - ( SRCPROC1 / NPROCS ) * NPROCS INB1 = NBLOCKS*NB - I1 * END IF * * Now everything is just like I=1. Search now who has N-1, Is N-1 * in the first block ? * IF( N.LE.INB1 ) THEN IF( PROC.EQ.SRCPROC1 ) THEN PB_NUMROC = N ELSE PB_NUMROC = 0 END IF RETURN END IF * N1 = N - INB1 NBLOCKS = N1 / NB + 1 * IF( PROC.EQ.SRCPROC1 ) THEN ILOCBLK = NBLOCKS / NPROCS IF( ILOCBLK.GT.0 ) THEN IF( ( NBLOCKS - ILOCBLK * NPROCS ).GT.0 ) THEN PB_NUMROC = INB1 + ILOCBLK * NB ELSE PB_NUMROC = N + NB * ( ILOCBLK - NBLOCKS ) END IF ELSE PB_NUMROC = INB1 END IF ELSE MYDIST = PROC - SRCPROC1 IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROCS ILOCBLK = NBLOCKS / NPROCS IF( ILOCBLK.GT.0 ) THEN MYDIST = MYDIST - NBLOCKS + ILOCBLK * NPROCS IF( MYDIST.LT.0 ) THEN PB_NUMROC = ( ILOCBLK + 1 ) * NB ELSE IF( MYDIST.GT.0 ) THEN PB_NUMROC = ILOCBLK * NB ELSE PB_NUMROC = N1 + NB * ( ILOCBLK - NBLOCKS + 1 ) END IF ELSE MYDIST = MYDIST - NBLOCKS IF( MYDIST.LT.0 ) THEN PB_NUMROC = NB ELSE IF( MYDIST.GT.0 ) THEN PB_NUMROC = 0 ELSE PB_NUMROC = N1 + NB * ( 1 - NBLOCKS ) END IF END IF END IF * RETURN * * End of PB_NUMROC * END INTEGER FUNCTION PB_FCEIL( NUM, DENOM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL DENOM, NUM * .. * * Purpose * ======= * * PB_FCEIL returns the ceiling of the division of two integers. The * integer operands are passed as real to avoid integer overflow. * * Arguments * ========= * * NUM (local input) REAL * On entry, NUM specifies the numerator of the fraction to be * evaluated. * * DENOM (local input) REAL * On entry, DENOM specifies the denominator of the fraction to * be evaluated. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC NINT * .. * .. Executable Statements .. * PB_FCEIL = NINT( ( ( NUM + DENOM - 1.0E+0 ) / DENOM ) - 0.5E+0 ) * RETURN * * End of PB_FCEIL * END SUBROUTINE PB_CHKMAT( ICTXT, M, MPOS0, N, NPOS0, IA, JA, DESCA, $ DPOS0, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER DPOS0, IA, ICTXT, INFO, JA, M, MPOS0, N, NPOS0 * .. * .. Array Arguments .. INTEGER DESCA( * ) * .. * * Purpose * ======= * * PB_CHKMAT checks the validity of a descriptor vector DESCA, the re- * lated global indexes IA, JA from a local view point. If an inconsis- * tency is found among its parameters IA, JA and DESCA, the routine re- * turns an error code in INFO. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * M (global input) INTEGER * On entry, M specifies the number of rows the submatrix * sub( A ). * * MPOS0 (global input) INTEGER * On entry, MPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter M appears. * * N (global input) INTEGER * On entry, N specifies the number of columns the submatrix * sub( A ). * * NPOS0 (global input) INTEGER * On entry, NPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter N appears. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * DPOS0 (global input) INTEGER * On entry, DPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter DESCA ap- * pears. Note that it is assumed that IA and JA are respecti- * vely 2 and 1 entries behind DESCA. * * INFO (local input/local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had an * illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER DESCMULT, BIGNUM PARAMETER ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT ) * .. * .. Local Scalars .. INTEGER DPOS, IAPOS, JAPOS, MP, MPOS, MYCOL, MYROW, $ NPCOL, NPOS, NPROW, NQ * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_DESCTRANS * .. * .. External Functions .. INTEGER PB_NUMROC EXTERNAL PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN, MAX * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Want to find errors with MIN( ), so if no error, set it to a big * number. If there already is an error, multiply by the the des- * criptor multiplier * IF( INFO.GE.0 ) THEN INFO = BIGNUM ELSE IF( INFO.LT.-DESCMULT ) THEN INFO = -INFO ELSE INFO = -INFO * DESCMULT END IF * * Figure where in parameter list each parameter was, factoring in * descriptor multiplier * MPOS = MPOS0 * DESCMULT NPOS = NPOS0 * DESCMULT IAPOS = ( DPOS0 - 2 ) * DESCMULT JAPOS = ( DPOS0 - 1 ) * DESCMULT DPOS = DPOS0 * DESCMULT * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check that matrix values make sense from local viewpoint * IF( M.LT.0 ) $ INFO = MIN( INFO, MPOS ) IF( N.LT.0 ) $ INFO = MIN( INFO, NPOS ) IF( IA.LT.1 ) $ INFO = MIN( INFO, IAPOS ) IF( JA.LT.1 ) $ INFO = MIN( INFO, JAPOS ) IF( DESCA2( DTYPE_ ).NE.BLOCK_CYCLIC_2D_INB ) $ INFO = MIN( INFO, DPOS + DTYPE_ ) IF( DESCA2( IMB_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + IMB_ ) IF( DESCA2( INB_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + INB_ ) IF( DESCA2( MB_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + MB_ ) IF( DESCA2( NB_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + NB_ ) IF( DESCA2( RSRC_ ).LT.-1 .OR. DESCA2( RSRC_ ).GE.NPROW ) $ INFO = MIN( INFO, DPOS + RSRC_ ) IF( DESCA2( CSRC_ ).LT.-1 .OR. DESCA2( CSRC_ ).GE.NPCOL ) $ INFO = MIN( INFO, DPOS + CSRC_ ) IF( DESCA2( CTXT_ ).NE.ICTXT ) $ INFO = MIN( INFO, DPOS + CTXT_ ) * IF( M.EQ.0 .OR. N.EQ.0 ) THEN * * NULL matrix, relax some checks * IF( DESCA2( M_ ).LT.0 ) $ INFO = MIN( INFO, DPOS + M_ ) IF( DESCA2( N_ ).LT.0 ) $ INFO = MIN( INFO, DPOS + N_ ) IF( DESCA2( LLD_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + LLD_ ) * ELSE * * more rigorous checks for non-degenerate matrices * MP = PB_NUMROC( DESCA2( M_ ), 1, DESCA2( IMB_ ), DESCA2( MB_ ), $ MYROW, DESCA2( RSRC_ ), NPROW ) * IF( DESCA2( M_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + M_ ) IF( DESCA2( N_ ).LT.1 ) $ INFO = MIN( INFO, DPOS + N_ ) IF( IA.GT.DESCA2( M_ ) ) $ INFO = MIN( INFO, IAPOS ) IF( JA.GT.DESCA2( N_ ) ) $ INFO = MIN( INFO, JAPOS ) IF( IA+M-1.GT.DESCA2( M_ ) ) $ INFO = MIN( INFO, MPOS ) IF( JA+N-1.GT.DESCA2( N_ ) ) $ INFO = MIN( INFO, NPOS ) * IF( DESCA2( LLD_ ).LT.MAX( 1, MP ) ) THEN NQ = PB_NUMROC( DESCA2( N_ ), 1, DESCA2( INB_ ), $ DESCA2( NB_ ), MYCOL, DESCA2( CSRC_ ), $ NPCOL ) IF( DESCA2( LLD_ ).LT.1 ) THEN INFO = MIN( INFO, DPOS + LLD_ ) ELSE IF( NQ.GT.0 ) THEN INFO = MIN( INFO, DPOS + LLD_ ) END IF END IF * END IF * * Prepare output: set info = 0 if no error, and divide by * DESCMULT if error is not in a descriptor entry * IF( INFO.EQ.BIGNUM ) THEN INFO = 0 ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN INFO = -( INFO / DESCMULT ) ELSE INFO = -INFO END IF * RETURN * * End of PB_CHKMAT * END SUBROUTINE PB_DESCTRANS( DESCIN, DESCOUT ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER DESCIN( * ), DESCOUT( * ) * .. * * Purpose * ======= * * PB_DESCTRANS converts a descriptor DESCIN of type BLOCK_CYCLIC_2D * or BLOCK_CYCLIC_INB_2D into a descriptor DESCOUT of type * BLOCK_CYCLIC_INB_2D. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ----------------------------------- * DTYPE_A (global) DESCA( DTYPE1_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT1_ ) The BLACS context handle indicating * the NPROW x NPCOL BLACS process * grid A is distributed over. The * context itself is global, but the * handle (the integer value) may * vary. * M_A (global) DESCA( M1_ ) The number of rows in the distri- * buted matrix A, M_A >= 0. * N_A (global) DESCA( N1_ ) The number of columns in the dis- * tributed matrix A, N_A >= 0. * MB_A (global) DESCA( MB1_ ) The blocking factor used to distri- * bute the rows of A, MB_A > 0. * NB_A (global) DESCA( NB1_ ) The blocking factor used to distri- * bute the columns of A, NB_A > 0. * RSRC_A (global) DESCA( RSRC1_ ) The process row over which the * first row of the matrix A is dis- * tributed, NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC1_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD1_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, MB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, NB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * DESCIN (global and local input) INTEGER array * On entry, DESCIN is an array of dimension DLEN1_ or DLEN_ as * specified by its first entry DESCIN( DTYPE_ ). DESCIN is the * source array descriptor of type BLOCK_CYCLIC_2D or of type * BLOCK_CYCLIC_2D_INB. * * DESCOUT (global and local output) INTEGER array * On entry, DESCOUT is an array of dimension DLEN_. DESCOUT is * the target array descriptor of type BLOCK_CYCLIC_2D_INB. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC1_, CTXT1_, DLEN1_, $ DTYPE1_, LLD1_, M1_, MB1_, N1_, NB1_, RSRC1_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN1_ = 9, DTYPE1_ = 1, $ CTXT1_ = 2, M1_ = 3, N1_ = 4, MB1_ = 5, $ NB1_ = 6, RSRC1_ = 7, CSRC1_ = 8, LLD1_ = 9 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I * .. * .. Executable Statements .. * IF( DESCIN( DTYPE_ ).EQ.BLOCK_CYCLIC_2D ) THEN DESCOUT( DTYPE_ ) = BLOCK_CYCLIC_2D_INB DESCOUT( CTXT_ ) = DESCIN( CTXT1_ ) DESCOUT( M_ ) = DESCIN( M1_ ) DESCOUT( N_ ) = DESCIN( N1_ ) DESCOUT( IMB_ ) = DESCIN( MB1_ ) DESCOUT( INB_ ) = DESCIN( NB1_ ) DESCOUT( MB_ ) = DESCIN( MB1_ ) DESCOUT( NB_ ) = DESCIN( NB1_ ) DESCOUT( RSRC_ ) = DESCIN( RSRC1_ ) DESCOUT( CSRC_ ) = DESCIN( CSRC1_ ) DESCOUT( LLD_ ) = DESCIN( LLD1_ ) ELSE IF( DESCIN( DTYPE_ ).EQ.BLOCK_CYCLIC_2D_INB ) THEN DO 10 I = 1, DLEN_ DESCOUT( I ) = DESCIN( I ) 10 CONTINUE ELSE DESCOUT( DTYPE_ ) = DESCIN( 1 ) DESCOUT( CTXT_ ) = DESCIN( 2 ) DESCOUT( M_ ) = 0 DESCOUT( N_ ) = 0 DESCOUT( IMB_ ) = 1 DESCOUT( INB_ ) = 1 DESCOUT( MB_ ) = 1 DESCOUT( NB_ ) = 1 DESCOUT( RSRC_ ) = 0 DESCOUT( CSRC_ ) = 0 DESCOUT( LLD_ ) = 1 END IF * RETURN * * End of PB_DESCTRANS * END SUBROUTINE PB_DESCSET2( DESC, M, N, IMB, INB, MB, NB, RSRC, CSRC, $ CTXT, LLD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER CSRC, CTXT, IMB, INB, LLD, M, MB, N, NB, RSRC * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * PB_DESCSET2 uses its 10 input arguments M, N, IMB, INB, MB, NB, * RSRC, CSRC, CTXT and LLD to initialize a descriptor vector of type * BLOCK_CYCLIC_2D_INB. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ----------------------------------- * DTYPE_A (global) DESCA( DTYPE1_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT1_ ) The BLACS context handle indicating * the NPROW x NPCOL BLACS process * grid A is distributed over. The * context itself is global, but the * handle (the integer value) may * vary. * M_A (global) DESCA( M1_ ) The number of rows in the distri- * buted matrix A, M_A >= 0. * N_A (global) DESCA( N1_ ) The number of columns in the dis- * tributed matrix A, N_A >= 0. * MB_A (global) DESCA( MB1_ ) The blocking factor used to distri- * bute the rows of A, MB_A > 0. * NB_A (global) DESCA( NB1_ ) The blocking factor used to distri- * bute the columns of A, NB_A > 0. * RSRC_A (global) DESCA( RSRC1_ ) The process row over which the * first row of the matrix A is dis- * tributed, NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC1_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD1_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, MB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, NB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * DESC (global and local output) INTEGER array * On entry, DESC is an array of dimension DLEN_. DESC is the * array descriptor to be set. * * M (global input) INTEGER * On entry, M specifies the number of rows of the matrix. * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the matrix. * N must be at least zero. * * IMB (global input) INTEGER * On entry, IMB specifies the row size of the first block of * the global matrix distribution. IMB must be at least one. * * INB (global input) INTEGER * On entry, INB specifies the column size of the first block * of the global matrix distribution. INB must be at least one. * * MB (global input) INTEGER * On entry, MB specifies the row size of the blocks used to * partition the matrix. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the column size of the blocks used to * partition the matrix. NB must be at least one. * * RSRC (global input) INTEGER * On entry, RSRC specifies the row coordinate of the process * that possesses the first row of the matrix. When RSRC = -1, * the data is not distributed but replicated, otherwise RSRC * must be at least zero and strictly less than NPROW. * * CSRC (global input) INTEGER * On entry, CSRC specifies the column coordinate of the pro- * cess that possesses the first column of the matrix. When * CSRC = -1, the data is not distributed but replicated, other- * wise CSRC must be at least zero and strictly less than NPCOL. * * CTXT (local input) INTEGER * On entry, CTXT specifies the BLACS context handle, indicating * the global communication context. The value of the context * itself is local. * * LLD (local input) INTEGER * On entry, LLD specifies the leading dimension of the local * array storing the local entries of the matrix. LLD must be at * least MAX( 1, Lr(1,M) ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Executable Statements .. * DESC( DTYPE_ ) = BLOCK_CYCLIC_2D_INB DESC( CTXT_ ) = CTXT DESC( M_ ) = M DESC( N_ ) = N DESC( IMB_ ) = IMB DESC( INB_ ) = INB DESC( MB_ ) = MB DESC( NB_ ) = NB DESC( RSRC_ ) = RSRC DESC( CSRC_ ) = CSRC DESC( LLD_ ) = LLD * RETURN * * End of PB_DESCSET2 * END SUBROUTINE PB_DESCINIT2( DESC, M, N, IMB, INB, MB, NB, RSRC, CSRC, $ CTXT, LLD, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER CSRC, CTXT, IMB, INB, INFO, LLD, M, MB, N, NB, $ RSRC * .. * .. Array Arguments .. INTEGER DESC( * ) * .. * * Purpose * ======= * * PB_DESCINIT2 uses its 10 input arguments M, N, IMB, INB, MB, NB, * RSRC, CSRC, CTXT and LLD to initialize a descriptor vector of type * BLOCK_CYCLIC_2D_INB. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * DESC (global and local output) INTEGER array * On entry, DESC is an array of dimension DLEN_. DESC is the * array descriptor to be set. * * M (global input) INTEGER * On entry, M specifies the number of rows of the matrix. * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the matrix. * N must be at least zero. * * IMB (global input) INTEGER * On entry, IMB specifies the row size of the first block of * the global matrix distribution. IMB must be at least one. * * INB (global input) INTEGER * On entry, INB specifies the column size of the first block * of the global matrix distribution. INB must be at least one. * * MB (global input) INTEGER * On entry, MB specifies the row size of the blocks used to * partition the matrix. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the column size of the blocks used to * partition the matrix. NB must be at least one. * * RSRC (global input) INTEGER * On entry, RSRC specifies the row coordinate of the process * that possesses the first row of the matrix. When RSRC = -1, * the data is not distributed but replicated, otherwise RSRC * must be at least zero and strictly less than NPROW. * * CSRC (global input) INTEGER * On entry, CSRC specifies the column coordinate of the pro- * cess that possesses the first column of the matrix. When * CSRC = -1, the data is not distributed but replicated, other- * wise CSRC must be at least zero and strictly less than NPCOL. * * CTXT (local input) INTEGER * On entry, CTXT specifies the BLACS context handle, indicating * the global communication context. The value of the context * itself is local. * * LLD (local input) INTEGER * On entry, LLD specifies the leading dimension of the local * array storing the local entries of the matrix. LLD must be at * least MAX( 1, Lr(1,M) ). * * INFO (local output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value. * * Notes * ===== * * If the routine can recover from an erroneous input argument, it will * return an acceptable descriptor vector. For example, if LLD = 0 on * input, DESC( LLD_ ) will contain the smallest leading dimension re- * quired to store the specified m by n matrix, INFO will however be set * to -11 on exit in that case. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER LLDMIN, MP, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PXERBLA * .. * .. External Functions .. INTEGER PB_NUMROC EXTERNAL PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( CTXT, NPROW, NPCOL, MYROW, MYCOL ) * INFO = 0 IF( M.LT.0 ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( IMB.LT.1 ) THEN INFO = -4 ELSE IF( INB.LT.1 ) THEN INFO = -5 ELSE IF( MB.LT.1 ) THEN INFO = -6 ELSE IF( NB.LT.1 ) THEN INFO = -7 ELSE IF( RSRC.LT.-1 .OR. RSRC.GE.NPROW ) THEN INFO = -8 ELSE IF( CSRC.LT.-1 .OR. CSRC.GE.NPCOL ) THEN INFO = -9 ELSE IF( NPROW.EQ.-1 ) THEN INFO = -10 END IF * * Compute minimum LLD if safe (to avoid division by 0) * IF( INFO.EQ.0 ) THEN MP = PB_NUMROC( M, 1, IMB, MB, MYROW, RSRC, NPROW ) IF( PB_NUMROC( N, 1, INB, NB, MYCOL, CSRC, NPCOL ).GT.0 ) THEN LLDMIN = MAX( 1, MP ) ELSE LLDMIN = 1 END IF IF( LLD.LT.LLDMIN ) $ INFO = -11 END IF * IF( INFO.NE.0 ) $ CALL PXERBLA( CTXT, 'PB_DESCINIT2', -INFO ) * DESC( DTYPE_ ) = BLOCK_CYCLIC_2D_INB DESC( CTXT_ ) = CTXT DESC( M_ ) = MAX( 0, M ) DESC( N_ ) = MAX( 0, N ) DESC( IMB_ ) = MAX( 1, IMB ) DESC( INB_ ) = MAX( 1, INB ) DESC( MB_ ) = MAX( 1, MB ) DESC( NB_ ) = MAX( 1, NB ) DESC( RSRC_ ) = MAX( -1, MIN( RSRC, NPROW-1 ) ) DESC( CSRC_ ) = MAX( -1, MIN( CSRC, NPCOL-1 ) ) DESC( LLD_ ) = MAX( LLD, LLDMIN ) * RETURN * * End of PB_DESCINIT2 * END SUBROUTINE PB_BINFO( OFFD, M, N, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ILOW, IMB1, IMBLOC, INB1, INBLOC, IUPP, LCMT00, $ LMBLOC, LNBLOC, LOW, M, MB, MBLKS, MRCOL, $ MRROW, N, NB, NBLKS, OFFD, UPP * .. * * Purpose * ======= * * PB_BINFO initializes the local information of an m by n local array * owned by the process of relative coordinates ( MRROW, MRCOL ). Note * that if m or n is less or equal than zero, there is no data, in which * case this process does not need the local information computed by * this routine to proceed. * * Arguments * ========= * * OFFD (global input) INTEGER * On entry, OFFD specifies the off-diagonal of the underlying * matrix of interest as follows: * OFFD = 0 specifies the main diagonal, * OFFD > 0 specifies lower subdiagonals, and * OFFD < 0 specifies upper superdiagonals. * * M (local input) INTEGER * On entry, M specifies the local number of rows of the under- * lying matrix owned by the process of relative coordinates * ( MRROW, MRCOL ). M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the local number of columns of the un- * derlying matrix owned by the process of relative coordinates * ( MRROW, MRCOL ). N must be at least zero. * * IMB1 (global input) INTEGER * On input, IMB1 specifies the global true size of the first * block of rows of the underlying global submatrix. IMB1 must * be at least MIN( 1, M ). * * INB1 (global input) INTEGER * On input, INB1 specifies the global true size of the first * block of columns of the underlying global submatrix. INB1 * must be at least MIN( 1, N ). * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * MRROW (local input) INTEGER * On entry, MRROW specifies the relative row coordinate of the * process that possesses these M rows. MRROW must be least zero * and strictly less than NPROW. * * MRCOL (local input) INTEGER * On entry, MRCOL specifies the relative column coordinate of * the process that possesses these N columns. MRCOL must be * least zero and strictly less than NPCOL. * * LCMT00 (local output) INTEGER * On exit, LCMT00 is the LCM value of the left upper block of * this m by n local block owned by the process of relative co- * ordinates ( MRROW, MRCOL ). * * MBLKS (local output) INTEGER * On exit, MBLKS specifies the local number of blocks of rows * corresponding to M. MBLKS must be at least zero. * * NBLKS (local output) INTEGER * On exit, NBLKS specifies the local number of blocks of co- * lumns corresponding to N. NBLKS must be at least zero. * * IMBLOC (local output) INTEGER * On exit, IMBLOC specifies the number of rows (size) of the * uppest blocks of this m by n local array owned by the process * of relative coordinates ( MRROW, MRCOL ). IMBLOC is at least * MIN( 1, M ). * * INBLOC (local output) INTEGER * On exit, INBLOC specifies the number of columns (size) of * the leftmost blocks of this m by n local array owned by the * process of relative coordinates ( MRROW, MRCOL ). INBLOC is * at least MIN( 1, N ). * * LMBLOC (local output) INTEGER * On exit, LMBLOC specifies the number of rows (size) of the * lowest blocks of this m by n local array owned by the process * of relative coordinates ( MRROW, MRCOL ). LMBLOC is at least * MIN( 1, M ). * * LNBLOC (local output) INTEGER * On exit, LNBLOC specifies the number of columns (size) of the * rightmost blocks of this m by n local array owned by the * process of relative coordinates ( MRROW, MRCOL ). LNBLOC is * at least MIN( 1, N ). * * ILOW (local output) INTEGER * On exit, ILOW is the lower bound characterizing the first co- * lumn block owning offdiagonals of this m by n array. ILOW * must be less or equal than zero. * * LOW (global output) INTEGER * On exit, LOW is the lower bound characterizing the column * blocks with te exception of the first one (see ILOW) owning * offdiagonals of this m by n array. LOW must be less or equal * than zero. * * IUPP (local output) INTEGER * On exit, IUPP is the upper bound characterizing the first row * block owning offdiagonals of this m by n array. IUPP must be * greater or equal than zero. * * UPP (global output) INTEGER * On exit, UPP is the upper bound characterizing the row * blocks with te exception of the first one (see IUPP) owning * offdiagonals of this m by n array. UPP must be greater or * equal than zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER TMP1 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Initialize LOW, ILOW, UPP, IUPP, LMBLOC, LNBLOC, IMBLOC, INBLOC, * MBLKS, NBLKS and LCMT00. * LOW = 1 - NB UPP = MB - 1 * LCMT00 = OFFD * IF( M.LE.0 .OR. N.LE.0 ) THEN * IF( MRROW.GT.0 ) THEN IUPP = MB - 1 ELSE IUPP = MAX( 0, IMB1 - 1 ) END IF IMBLOC = 0 MBLKS = 0 LMBLOC = 0 * IF( MRCOL.GT.0 ) THEN ILOW = 1 - NB ELSE ILOW = MIN( 0, 1 - INB1 ) END IF INBLOC = 0 NBLKS = 0 LNBLOC = 0 * LCMT00 = LCMT00 + ( LOW - ILOW + MRCOL * NB ) - $ ( IUPP - UPP + MRROW * MB ) * RETURN * END IF * IF( MRROW.GT.0 ) THEN * IMBLOC = MIN( M, MB ) IUPP = MB - 1 LCMT00 = LCMT00 - ( IMB1 - MB + MRROW * MB ) MBLKS = ( M - 1 ) / MB + 1 LMBLOC = M - ( M / MB ) * MB IF( LMBLOC.EQ.0 ) $ LMBLOC = MB * IF( MRCOL.GT.0 ) THEN * INBLOC = MIN( N, NB ) ILOW = 1 - NB LCMT00 = LCMT00 + INB1 - NB + MRCOL * NB NBLKS = ( N - 1 ) / NB + 1 LNBLOC = N - ( N / NB ) * NB IF( LNBLOC.EQ.0 ) $ LNBLOC = NB * ELSE * INBLOC = INB1 ILOW = 1 - INB1 TMP1 = N - INB1 IF( TMP1.GT.0 ) THEN * * more than one block * NBLKS = ( TMP1 - 1 ) / NB + 2 LNBLOC = TMP1 - ( TMP1 / NB ) * NB IF( LNBLOC.EQ.0 ) $ LNBLOC = NB * ELSE * NBLKS = 1 LNBLOC = INB1 * END IF * END IF * ELSE * IMBLOC = IMB1 IUPP = IMB1 - 1 TMP1 = M - IMB1 IF( TMP1.GT.0 ) THEN * * more than one block * MBLKS = ( TMP1 - 1 ) / MB + 2 LMBLOC = TMP1 - ( TMP1 / MB ) * MB IF( LMBLOC.EQ.0 ) $ LMBLOC = MB * ELSE * MBLKS = 1 LMBLOC = IMB1 * END IF * IF( MRCOL.GT.0 ) THEN * INBLOC = MIN( N, NB ) ILOW = 1 - NB LCMT00 = LCMT00 + INB1 - NB + MRCOL * NB NBLKS = ( N - 1 ) / NB + 1 LNBLOC = N - ( N / NB ) * NB IF( LNBLOC.EQ.0 ) $ LNBLOC = NB * ELSE * INBLOC = INB1 ILOW = 1 - INB1 TMP1 = N - INB1 IF( TMP1.GT.0 ) THEN * * more than one block * NBLKS = ( TMP1 - 1 ) / NB + 2 LNBLOC = TMP1 - ( TMP1 / NB ) * NB IF( LNBLOC.EQ.0 ) $ LNBLOC = NB * ELSE * NBLKS = 1 LNBLOC = INB1 * END IF * END IF * END IF * RETURN * * End of PB_BINFO * END INTEGER FUNCTION PILAENV( ICTXT, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT CHARACTER*1 PREC * .. * * Purpose * ======= * * PILAENV returns the logical computational block size to be used by * the PBLAS routines during testing and timing. This is a special ver- * sion to be used only as part of the testing or timing PBLAS programs * for testing different values of logical computational block sizes for * the PBLAS routines. It is called by the PBLAS routines to retrieve a * logical computational block size value. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * PREC (dummy input) CHARACTER*1 * On entry, PREC is a dummy argument. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Common Blocks .. INTEGER INFO, NBLOG COMMON /INFOC/INFO, NBLOG * .. * .. Executable Statements .. * PILAENV = NBLOG * RETURN * * End of PILAENV * END SUBROUTINE PB_LOCINFO( I, INB, NB, MYROC, SRCPROC, NPROCS, $ ILOCBLK, ILOCOFF, MYDIST ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER I, ILOCBLK, ILOCOFF, INB, MYDIST, MYROC, NB, $ NPROCS, SRCPROC * .. * * Purpose * ======= * * PB_LOCINFO computes local information about the beginning of a sub- * matrix starting at the global index I. * * Arguments * ========= * * I (global input) INTEGER * On entry, I specifies the global starting index in the ma- * trix. I must be at least one. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of rows * or columns of the matrix. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks of rows or co- * lumns of the matrix is partitioned into. NB must be at least * one. * * MYROC (local input) INTEGER * On entry, MYROC is the coordinate of the process whose local * information is determined. MYROC is at least zero and * strictly less than NPROCS. * * SRCPROC (global input) INTEGER * On entry, SRCPROC specifies the coordinate of the process * that possesses the first row or column of the matrix. When * SRCPROC = -1, the data is not distributed but replicated, * otherwise SRCPROC must be at least zero and strictly less * than NPROCS. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the submatrix is distributed. NPROCS * must be at least one. * * ILOCBLK (local output) INTEGER * On exit, ILOCBLK specifies the local row or column block * coordinate corresponding to the row or column I of the ma- * trix. ILOCBLK must be at least zero. * * ILOCOFF (local output) INTEGER * On exit, ILOCOFF specifies the local row offset in the block * of local coordinate ILOCBLK corresponding to the row or co- * lumn I of the matrix. ILOCOFF must at least zero. * * MYDIST (local output) INTEGER * On exit, MYDIST specifies the relative process coordinate of * the process specified by MYROC to the process owning the row * or column I. MYDIST is at least zero and strictly less than * NPROCS. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER ITMP, NBLOCKS, PROC * .. * .. Executable Statements .. * ILOCOFF = 0 * IF( SRCPROC.LT.0 ) THEN * MYDIST = 0 * IF( I.LE.INB ) THEN * ILOCBLK = 0 ILOCOFF = I - 1 * ELSE * ITMP = I - INB NBLOCKS = ( ITMP - 1 ) / NB + 1 ILOCBLK = NBLOCKS ILOCOFF = ITMP - 1 - ( NBLOCKS - 1 ) * NB * END IF * ELSE * PROC = SRCPROC MYDIST = MYROC - PROC IF( MYDIST.LT.0 ) $ MYDIST = MYDIST + NPROCS * IF( I.LE.INB ) THEN * ILOCBLK = 0 IF( MYROC.EQ.PROC ) $ ILOCOFF = I - 1 * ELSE * ITMP = I - INB NBLOCKS = ( ITMP - 1 ) / NB + 1 PROC = PROC + NBLOCKS PROC = PROC - ( PROC / NPROCS ) * NPROCS ILOCBLK = NBLOCKS / NPROCS * IF( ( ILOCBLK*NPROCS ).LT.( MYDIST-NBLOCKS ) ) $ ILOCBLK = ILOCBLK + 1 * IF( MYROC.EQ.PROC ) $ ILOCOFF = ITMP - 1 - ( NBLOCKS - 1 ) * NB * END IF * END IF * RETURN * * End of PB_LOCINFO * END SUBROUTINE PB_INITJMP( COLMAJ, NVIR, IMBVIR, INBVIR, IMBLOC, $ INBLOC, MB, NB, RSRC, CSRC, NPROW, NPCOL, $ STRIDE, JMP ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL COLMAJ INTEGER CSRC, IMBLOC, IMBVIR, INBLOC, INBVIR, MB, NB, $ NPCOL, NPROW, NVIR, RSRC, STRIDE * .. * .. Array Arguments .. INTEGER JMP( * ) * .. * * Purpose * ======= * * PB_INITJMP initializes the jump values JMP used by the random matrix * generator. * * Arguments * ========= * * COLMAJ (global input) LOGICAL * On entry, COLMAJ specifies the ordering of the random sequen- * ce. When COLMAJ is .TRUE., the random sequence will be used * for a column major ordering, and otherwise a row-major orde- * ring. This impacts on the computation of the jump values. * * NVIR (global input) INTEGER * On entry, NVIR specifies the size of the underlying virtual * matrix. NVIR must be at least zero. * * IMBVIR (local input) INTEGER * On entry, IMBVIR specifies the number of virtual rows of the * upper left block of the underlying virtual submatrix. IMBVIR * must be at least IMBLOC. * * INBVIR (local input) INTEGER * On entry, INBVIR specifies the number of virtual columns of * the upper left block of the underlying virtual submatrix. * INBVIR must be at least INBLOC. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the size of the blocks used to parti- * tion the matrix rows. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix columns. NB must be at least one. * * RSRC (global input) INTEGER * On entry, RSRC specifies the row coordinate of the process * that possesses the first row of the matrix. When RSRC = -1, * the rows are not distributed but replicated, otherwise RSRC * must be at least zero and strictly less than NPROW. * * CSRC (global input) INTEGER * On entry, CSRC specifies the column coordinate of the pro- * cess that possesses the first column of the matrix. When CSRC * is equal to -1, the columns are not distributed but replica- * ted, otherwise CSRC must be at least zero and strictly less * than NPCOL. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process co- * lumns over which the matrix is distributed. NPCOL must be at * least one. * * STRIDE (global input) INTEGER * On entry, STRIDE specifies the number of random numbers to be * generated to compute one matrix entry. In the real case, * STRIDE is usually 1, where as in the complex case STRIDE is * usually 2 in order to generate the real and imaginary parts. * * JMP (local output) INTEGER array * On entry, JMP is an array of dimension JMP_LEN. On exit, this * array contains the different jump values used by the random * matrix generator. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. INTEGER NPMB, NQNB * .. * .. Executable Statements .. * IF( RSRC.LT.0 ) THEN NPMB = MB ELSE NPMB = NPROW * MB END IF IF( CSRC.LT.0 ) THEN NQNB = NB ELSE NQNB = NPCOL * NB END IF * JMP( JMP_1 ) = 1 * JMP( JMP_MB ) = MB JMP( JMP_IMBV ) = IMBVIR JMP( JMP_NPMB ) = NPMB JMP( JMP_NPIMBLOC ) = IMBLOC + NPMB - MB * JMP( JMP_NB ) = NB JMP( JMP_INBV ) = INBVIR JMP( JMP_NQNB ) = NQNB JMP( JMP_NQINBLOC ) = INBLOC + NQNB - NB * IF( COLMAJ ) THEN JMP( JMP_ROW ) = STRIDE JMP( JMP_COL ) = STRIDE * NVIR ELSE JMP( JMP_ROW ) = STRIDE * NVIR JMP( JMP_COL ) = STRIDE END IF * RETURN * * End of PB_INITJMP * END SUBROUTINE PB_INITMULADD( MULADD0, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER IMULADD( 4, * ), JMP( * ), MULADD0( * ) * .. * * Purpose * ======= * * PB_INITMULADD initializes the constants a's and c's corresponding to * the jump values (JMP) used by the matrix generator. * * Arguments * ========= * * MULADD0 (local input) INTEGER array * On entry, MULADD0 is an array of dimension 4 containing the * encoded initial constants a and c to jump from X( n ) to * X( n+1 ) = a*X( n ) + c in the random sequence. MULADD0(1:2) * contains respectively the 16-lower and 16-higher bits of the * constant a, and MULADD0(3:4) contains the 16-lower and * 16-higher bits of the constant c. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the matrix generator. * * IMULADD (local output) INTEGER array * On entry, IMULADD is an array of dimension ( 4, JMP_LEN ). On * exit, the jth column of this array contains the encoded ini- * tial constants a_j and c_j to jump from X( n ) to X(n+JMP(j)) * (= a_j*X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * * .. Local Arrays .. INTEGER ITMP1( 2 ), ITMP2( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMP * .. * .. Executable Statements .. * ITMP2( 1 ) = 100 ITMP2( 2 ) = 0 * * Compute IMULADD for all JMP values * CALL PB_JUMP( JMP( JMP_1 ), MULADD0, ITMP2, ITMP1, $ IMULADD( 1, JMP_1 ) ) * CALL PB_JUMP( JMP( JMP_ROW ), MULADD0, ITMP1, ITMP2, $ IMULADD( 1, JMP_ROW ) ) CALL PB_JUMP( JMP( JMP_COL ), MULADD0, ITMP1, ITMP2, $ IMULADD( 1, JMP_COL ) ) * * Compute constants a and c to jump JMP( * ) numbers in the * sequence for column- or row-major ordering of the sequence. * CALL PB_JUMP( JMP( JMP_IMBV ), IMULADD( 1, JMP_ROW ), ITMP1, $ ITMP2, IMULADD( 1, JMP_IMBV ) ) CALL PB_JUMP( JMP( JMP_MB ), IMULADD( 1, JMP_ROW ), ITMP1, $ ITMP2, IMULADD( 1, JMP_MB ) ) CALL PB_JUMP( JMP( JMP_NPMB ), IMULADD( 1, JMP_ROW ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NPMB ) ) CALL PB_JUMP( JMP( JMP_NPIMBLOC ), IMULADD( 1, JMP_ROW ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NPIMBLOC ) ) * CALL PB_JUMP( JMP( JMP_INBV ), IMULADD( 1, JMP_COL ), ITMP1, $ ITMP2, IMULADD( 1, JMP_INBV ) ) CALL PB_JUMP( JMP( JMP_NB ), IMULADD( 1, JMP_COL ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NB ) ) CALL PB_JUMP( JMP( JMP_NQNB ), IMULADD( 1, JMP_COL ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NQNB ) ) CALL PB_JUMP( JMP( JMP_NQINBLOC ), IMULADD( 1, JMP_COL ), ITMP1, $ ITMP2, IMULADD( 1, JMP_NQINBLOC ) ) * RETURN * * End of PB_INITMULADD * END SUBROUTINE PB_SETLOCRAN( SEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ILOCBLK, ILOCOFF, JLOCBLK, JLOCOFF, MYCDIST, $ MYRDIST, NPCOL, NPROW, SEED * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) * .. * * Purpose * ======= * * PB_SETLOCRAN locally initializes the random number generator. * * Arguments * ========= * * SEED (global input) INTEGER * On entry, SEED specifies a positive integer used to initiali- * ze the first number in the random sequence used by the matrix * generator. SEED must be at least zero. * * ILOCBLK (local input) INTEGER * On entry, ILOCBLK specifies the local row block coordinate * corresponding to the first row of the submatrix of interest. * ILOCBLK must be at least zero. * * ILOCOFF (local input) INTEGER * On entry, ILOCOFF specifies the local row offset in the block * of local coordinate ILOCBLK corresponding to the first row of * the submatrix of interest. ILOCOFF must at least zero. * * JLOCBLK (local input) INTEGER * On entry, JLOCBLK specifies the local column block coordinate * corresponding to the first column of the submatrix of inte- * rest. JLOCBLK must be at least zero. * * JLOCOFF (local input) INTEGER * On entry, JLOCOFF specifies the local column offset in the * block of local coordinate JLOCBLK corresponding to the first * column of the submatrix of interest. JLOCOFF must be at least * zero. * * MYRDIST (local input) INTEGER * On entry, MYRDIST specifies the relative row process coordi- * nate to the process owning the first row of the submatrix of * interest. MYRDIST must be at least zero and stricly less than * NPROW (see the subroutine PB_LOCINFO). * * MYCDIST (local input) INTEGER * On entry, MYCDIST specifies the relative column process coor- * dinate to the process owning the first column of the subma- * trix of interest. MYCDIST must be at least zero and stricly * less than NPCOL (see the subroutine PB_LOCINFO). * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process co- * lumns over which the matrix is distributed. NPCOL must be at * least one. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * IRAN (local output) INTEGER array * On entry, IRAN is an array of dimension 2. On exit, IRAN con- * tains respectively the 16-lower and 32-higher bits of the en- * coding of the entry of the random sequence corresponding lo- * cally to the first local array entry to generate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Arrays .. INTEGER IMULADDTMP( 4 ), ITMP( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMP, PB_SETRAN * .. * .. Executable Statements .. * * Compute and set the value of IRAN corresponding to A( IA, JA ) * ITMP( 1 ) = SEED ITMP( 2 ) = 0 * CALL PB_JUMP( JMP( JMP_1 ), IMULADD( 1, JMP_1 ), ITMP, IRAN, $ IMULADDTMP ) * * Jump ILOCBLK blocks of rows + ILOCOFF rows * CALL PB_JUMP( ILOCOFF, IMULADD( 1, JMP_ROW ), IRAN, ITMP, $ IMULADDTMP ) IF( MYRDIST.GT.0 ) THEN CALL PB_JUMP( JMP( JMP_IMBV ), IMULADD( 1, JMP_ROW ), ITMP, $ IRAN, IMULADDTMP ) CALL PB_JUMP( MYRDIST - 1, IMULADD( 1, JMP_MB ), IRAN, $ ITMP, IMULADDTMP ) CALL PB_JUMP( ILOCBLK, IMULADD( 1, JMP_NPMB ), ITMP, $ IRAN, IMULADDTMP ) ELSE IF( ILOCBLK.GT.0 ) THEN CALL PB_JUMP( JMP( JMP_IMBV ), IMULADD( 1, JMP_ROW ), ITMP, $ IRAN, IMULADDTMP ) CALL PB_JUMP( NPROW - 1, IMULADD( 1, JMP_MB ), IRAN, $ ITMP, IMULADDTMP ) CALL PB_JUMP( ILOCBLK - 1, IMULADD( 1, JMP_NPMB ), ITMP, $ IRAN, IMULADDTMP ) ELSE CALL PB_JUMP( 0, IMULADD( 1, JMP_1 ), ITMP, $ IRAN, IMULADDTMP ) END IF END IF * * Jump JLOCBLK blocks of columns + JLOCOFF columns * CALL PB_JUMP( JLOCOFF, IMULADD( 1, JMP_COL ), IRAN, ITMP, $ IMULADDTMP ) IF( MYCDIST.GT.0 ) THEN CALL PB_JUMP( JMP( JMP_INBV ), IMULADD( 1, JMP_COL ), ITMP, $ IRAN, IMULADDTMP ) CALL PB_JUMP( MYCDIST - 1, IMULADD( 1, JMP_NB ), IRAN, $ ITMP, IMULADDTMP ) CALL PB_JUMP( JLOCBLK, IMULADD( 1, JMP_NQNB ), ITMP, $ IRAN, IMULADDTMP ) ELSE IF( JLOCBLK.GT.0 ) THEN CALL PB_JUMP( JMP( JMP_INBV ), IMULADD( 1, JMP_COL ), ITMP, $ IRAN, IMULADDTMP ) CALL PB_JUMP( NPCOL - 1, IMULADD( 1, JMP_NB ), IRAN, $ ITMP, IMULADDTMP ) CALL PB_JUMP( JLOCBLK - 1, IMULADD( 1, JMP_NQNB ), ITMP, $ IRAN, IMULADDTMP ) ELSE CALL PB_JUMP( 0, IMULADD( 1, JMP_1 ), ITMP, $ IRAN, IMULADDTMP ) END IF END IF * CALL PB_SETRAN( IRAN, IMULADD( 1, JMP_1 ) ) * RETURN * * End of PB_SETLOCRAN * END SUBROUTINE PB_LADD( J, K, I ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER I( 2 ), J( 2 ), K( 2 ) * .. * * Purpose * ======= * * PB_LADD adds without carry two long positive integers K and J and put * the result into I. The long integers I, J, K are encoded on 31 bits * using an array of 2 integers. The 16-lower bits are stored in the * first entry of each array, the 15-higher bits in the second entry. * For efficiency purposes, the intrisic modulo function is inlined. * * Arguments * ========= * * J (local input) INTEGER array * On entry, J is an array of dimension 2 containing the encoded * long integer J. * * K (local input) INTEGER array * On entry, K is an array of dimension 2 containing the encoded * long integer K. * * I (local output) INTEGER array * On entry, I is an array of dimension 2. On exit, this array * contains the encoded long integer I. * * Further Details * =============== * * K( 2 ) K( 1 ) * 0XXXXXXX XXXXXXXX K I( 1 ) = MOD( K( 1 ) + J( 1 ), 2**16 ) * + carry = ( K( 1 ) + J( 1 ) ) / 2**16 * J( 2 ) J( 1 ) * 0XXXXXXX XXXXXXXX J I( 2 ) = K( 2 ) + J( 2 ) + carry * ---------------------- I( 2 ) = MOD( I( 2 ), 2**15 ) * I( 2 ) I( 1 ) * 0XXXXXXX XXXXXXXX I * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER IPOW15, IPOW16 PARAMETER ( IPOW15 = 2**15, IPOW16 = 2**16 ) * .. * .. Local Scalars .. INTEGER ITMP1, ITMP2 * .. * .. Executable Statements .. * * I( 1 ) = MOD( K( 1 ) + J( 1 ), IPOW16 ) * ITMP1 = K( 1 ) + J( 1 ) ITMP2 = ITMP1 / IPOW16 I( 1 ) = ITMP1 - ITMP2 * IPOW16 * * I( 2 ) = MOD( ( K( 1 ) + J( 1 ) ) / IPOW16 + K( 2 ) + J( 2 ), * IPOW15 ) * ITMP1 = ITMP2 + K( 2 ) + J( 2 ) ITMP2 = ITMP1 / IPOW15 I( 2 ) = ITMP1 - ITMP2 * IPOW15 * RETURN * * End of PB_LADD * END SUBROUTINE PB_LMUL( K, J, I ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER I( 2 ), J( 2 ), K( 2 ) * .. * * Purpose * ======= * * PB_LMUL multiplies without carry two long positive integers K and J * and put the result into I. The long integers I, J, K are encoded on * 31 bits using an array of 2 integers. The 16-lower bits are stored in * the first entry of each array, the 15-higher bits in the second entry * of each array. For efficiency purposes, the intrisic modulo function * is inlined. * * Arguments * ========= * * K (local input) INTEGER array * On entry, K is an array of dimension 2 containing the encoded * long integer K. * * J (local input) INTEGER array * On entry, J is an array of dimension 2 containing the encoded * long integer J. * * I (local output) INTEGER array * On entry, I is an array of dimension 2. On exit, this array * contains the encoded long integer I. * * Further Details * =============== * * K( 2 ) K( 1 ) * 0XXXXXXX XXXXXXXX K I( 1 ) = MOD( K( 1 ) + J( 1 ), 2**16 ) * * carry = ( K( 1 ) + J( 1 ) ) / 2**16 * J( 2 ) J( 1 ) * 0XXXXXXX XXXXXXXX J I( 2 ) = K( 2 ) + J( 2 ) + carry * ---------------------- I( 2 ) = MOD( I( 2 ), 2**15 ) * I( 2 ) I( 1 ) * 0XXXXXXX XXXXXXXX I * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER IPOW15, IPOW16, IPOW30 PARAMETER ( IPOW15 = 2**15, IPOW16 = 2**16, $ IPOW30 = 2**30 ) * .. * .. Local Scalars .. INTEGER ITMP1, ITMP2 * .. * .. Executable Statements .. * ITMP1 = K( 1 ) * J( 1 ) IF( ITMP1.LT.0 ) $ ITMP1 = ( ITMP1 + IPOW30 ) + IPOW30 * * I( 1 ) = MOD( ITMP1, IPOW16 ) * ITMP2 = ITMP1 / IPOW16 I( 1 ) = ITMP1 - ITMP2 * IPOW16 * ITMP1 = K( 1 ) * J( 2 ) + K( 2 ) * J( 1 ) IF( ITMP1.LT.0 ) $ ITMP1 = ( ITMP1 + IPOW30 ) + IPOW30 * ITMP1 = ITMP2 + ITMP1 IF( ITMP1.LT.0 ) $ ITMP1 = ( ITMP1 + IPOW30 ) + IPOW30 * * I( 2 ) = MOD( ITMP1, IPOW15 ) * I( 2 ) = ITMP1 - ( ITMP1 / IPOW15 ) * IPOW15 * RETURN * * End of PB_LMUL * END SUBROUTINE PB_JUMP( K, MULADD, IRANN, IRANM, IMA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER K * .. * .. Array Arguments .. INTEGER IMA( 4 ), IRANM( 2 ), IRANN( 2 ), MULADD( 4 ) * .. * * Purpose * ======= * * PB_JUMP computes the constants A and C to jump K numbers in the ran- * dom sequence: * * X( n+K ) = A * X( n ) + C. * * The constants encoded in MULADD specify how to jump from entry in the * sequence to the next. * * Arguments * ========= * * K (local input) INTEGER * On entry, K specifies the number of entries of the sequence * to jump over. When K is less or equal than zero, A and C are * not computed, and IRANM is set to IRANN corresponding to a * jump of size zero. * * MULADD (local input) INTEGER array * On entry, MULADD is an array of dimension 4 containing the * encoded constants a and c to jump from X( n ) to X( n+1 ) * ( = a*X( n )+c) in the random sequence. MULADD(1:2) contains * respectively the 16-lower and 16-higher bits of the constant * a, and MULADD(3:4) contains the 16-lower and 16-higher bits * of the constant c. * * IRANN (local input) INTEGER array * On entry, IRANN is an array of dimension 2. This array con- * tains respectively the 16-lower and 16-higher bits of the en- * coding of X( n ). * * IRANM (local output) INTEGER array * On entry, IRANM is an array of dimension 2. On exit, this * array contains respectively the 16-lower and 16-higher bits * of the encoding of X( n+K ). * * IMA (local output) INTEGER array * On entry, IMA is an array of dimension 4. On exit, when K is * greater than zero, this array contains the encoded constants * A and C to jump from X( n ) to X( n+K ) in the random se- * quence. IMA(1:2) contains respectively the 16-lower and * 16-higher bits of the constant A, and IMA(3:4) contains the * 16-lower and 16-higher bits of the constant C. When K is * less or equal than zero, this array is not referenced. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Executable Statements .. * IF( K.GT.0 ) THEN * IMA( 1 ) = MULADD( 1 ) IMA( 2 ) = MULADD( 2 ) IMA( 3 ) = MULADD( 3 ) IMA( 4 ) = MULADD( 4 ) * DO 10 I = 1, K - 1 * CALL PB_LMUL( IMA, MULADD, J ) * IMA( 1 ) = J( 1 ) IMA( 2 ) = J( 2 ) * CALL PB_LMUL( IMA( 3 ), MULADD, J ) CALL PB_LADD( MULADD( 3 ), J, IMA( 3 ) ) * 10 CONTINUE * CALL PB_LMUL( IRANN, IMA, J ) CALL PB_LADD( J, IMA( 3 ), IRANM ) * ELSE * IRANM( 1 ) = IRANN( 1 ) IRANM( 2 ) = IRANN( 2 ) * END IF * RETURN * * End of PB_JUMP * END SUBROUTINE PB_SETRAN( IRAN, IAC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER IAC( 4 ), IRAN( 2 ) * .. * * Purpose * ======= * * PB_SETRAN initializes the random generator with the encoding of the * first number X( 1 ) in the sequence, and the constants a and c used * to compute the next element in the sequence: * * X( n+1 ) = a * X( n ) + c. * * X( 1 ), a and c are stored in the common block RANCOM for later use * (see the routines PB_SRAN or PB_DRAN). * * Arguments * ========= * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2. This array con- * tains respectively the 16-lower and 16-higher bits of the en- * coding of X( 1 ). * * IAC (local input) INTEGER array * On entry, IAC is an array of dimension 4. IAC(1:2) contain * respectively the 16-lower and 16-higher bits of the constant * a, and IAC(3:4) contain the 16-lower and 16-higher bits of * the constant c. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * IRAND( 1 ) = IRAN( 1 ) IRAND( 2 ) = IRAN( 2 ) IACS( 1 ) = IAC( 1 ) IACS( 2 ) = IAC( 2 ) IACS( 3 ) = IAC( 3 ) IACS( 4 ) = IAC( 4 ) * RETURN * * End of PB_SETRAN * END SUBROUTINE PB_JUMPIT( MULADD, IRANN, IRANM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Array Arguments .. INTEGER IRANM( 2 ), IRANN( 2 ), MULADD( 4 ) * .. * * Purpose * ======= * * PB_JUMPIT jumps in the random sequence from the number X( n ) enco- * ded in IRANN to the number X( m ) encoded in IRANM using the cons- * tants A and C encoded in MULADD: * * X( m ) = A * X( n ) + C. * * The constants A and C obviously depend on m and n, see the subroutine * PB_JUMP in order to set them up. * * Arguments * ========= * * MULADD (local input) INTEGER array * On netry, MULADD is an array of dimension 4. MULADD(1:2) con- * tains respectively the 16-lower and 16-higher bits of the * constant A, and MULADD(3:4) contains the 16-lower and * 16-higher bits of the constant C. * * IRANN (local input) INTEGER array * On entry, IRANN is an array of dimension 2. This array con- * tains respectively the 16-lower and 16-higher bits of the en- * coding of X( n ). * * IRANM (local output) INTEGER array * On entry, IRANM is an array of dimension 2. On exit, this * array contains respectively the 16-lower and 16-higher bits * of the encoding of X( m ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * CALL PB_LMUL( IRANN, MULADD, J ) CALL PB_LADD( J, MULADD( 3 ), IRANM ) * IRAND( 1 ) = IRANM( 1 ) IRAND( 2 ) = IRANM( 2 ) * RETURN * * End of PB_JUMPIT * END scalapack-2.0.2/PBLAS/TESTING/PCBLAS1TST.dat000644 000766 000024 00000003106 10363532303 017777 0ustar00juliestaff000000 000000 'Level 1 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PCBLAS1TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q (2.0E0, -3.0E0) value of ALPHA 4 number of tests problems 14 44 28 7 values of N 36 9 39 53 values of M_X 24 67 3 12 values of N_X 2 5 2 5 values of IMB_X 2 5 2 5 values of INB_X 2 5 2 5 values of MB_X 2 5 2 5 values of NB_X 0 0 0 0 values of RSRC_X 0 0 0 0 values of CSRC_X 5 3 1 1 values of IX 2 6 1 1 values of JX 1 9 1 53 values of INCX 2 6 35 14 values of M_Y 27 55 43 12 values of N_Y 2 5 2 5 values of IMB_Y 2 5 2 5 values of INB_Y 2 5 2 5 values of MB_Y 2 5 2 5 values of NB_Y 0 0 0 0 values of RSRC_Y 0 0 0 0 values of CSRC_Y 1 4 1 1 values of IY 7 6 1 1 values of JY 2 6 35 1 values of INCY PCSWAP T put F for no test in the same column PCSCAL T put F for no test in the same column PCSSCAL T put F for no test in the same column PCCOPY T put F for no test in the same column PCAXPY T put F for no test in the same column PCDOTU T put F for no test in the same column PCDOTC T put F for no test in the same column PSCNRM2 T put F for no test in the same column PSCASUM T put F for no test in the same column PCAMAX T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/pcblas1tst.f000644 000766 000024 00000420524 11750130340 020117 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 10) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PCSWAP ', 'PCSCAL ', $ 'PCSSCAL', 'PCCOPY ', 'PCAXPY ', $ 'PCDOTU ', 'PCDOTC ', 'PSCNRM2', $ 'PSCASUM', 'PCAMAX'/ END BLOCK DATA PROGRAM PCBLA1TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PCBLA1TST is the main testing program for the PBLAS Level 1 routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 46 lines: * 'Level 1 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PCBLAS1TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0E0, 0.0E0) value of ALPHA * 2 number of tests problems * 3 4 values of N * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PCSWAP T put F for no test in the same column * PCSCAL T put F for no test in the same column * PCSSCAL T put F for no test in the same column * PCCOPY T put F for no test in the same column * PCAXPY T put F for no test in the same column * PCDOTU T put F for no test in the same column * PCDOTC T put F for no test in the same column * PSCNRM2 T put F for no test in the same column * PSCASUM T put F for no test in the same column * PCAMAX T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * CPLXSZ INTEGER * REALSZ and CPLXSZ indicate the length in bytes on the given * platform for a single precision real and a single precision * complex. By default, REALSZ is set to four and CPLXSZ is set * to eight. * * MEM COMPLEX array * MEM is an array of dimension TOTMEM / CPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, CPLXSZ, TOTMEM, $ MEMSIZ, NSUBS REAL RZERO COMPLEX PADVAL, ZERO PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ CPLXSZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / CPLXSZ, $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ RZERO = 0.0E+0, ZERO = ( 0.0E+0, 0.0E+0 ), $ NSUBS = 10 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE INTEGER CSRCX, CSRCY, I, IAM, ICTXT, IGAP, IMBX, IMBY, $ IMIDX, IMIDY, INBX, INBY, INCX, INCY, IPMATX, $ IPMATY, IPOSTX, IPOSTY, IPREX, IPREY, IPW, IPX, $ IPY, IVERB, IX, IXSEED, IY, IYSEED, J, JX, JY, $ K, LDX, LDY, MBX, MBY, MEMREQD, MPX, MPY, MX, $ MY, MYCOL, MYROW, N, NBX, NBY, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQX, NQY, NTESTS, NX, NY, $ PISCLR, RSRCX, RSRCY, TSKIP, TSTCNT REAL PUSCLR COMPLEX ALPHA, PSCLR * .. * .. Local Arrays .. CHARACTER*80 OUTFILE LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) INTEGER CSCXVAL( MAXTESTS ), CSCYVAL( MAXTESTS ), $ DESCX( DLEN_ ), DESCXR( DLEN_ ), $ DESCY( DLEN_ ), DESCYR( DLEN_ ), IERR( 4 ), $ IMBXVAL( MAXTESTS ), IMBYVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JXVAL( MAXTESTS ), JYVAL( MAXTESTS ), $ KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MXVAL( MAXTESTS ), $ MYVAL( MAXTESTS ), NBXVAL( MAXTESTS ), $ NBYVAL( MAXTESTS ), NVAL( MAXTESTS ), $ NXVAL( MAXTESTS ), NYVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_CCHEKPAD, PB_CFILLPAD, PB_DESCSET2, $ PB_PCLAPRNT, PCAMAX, PCAXPY, PCBLA1TSTINFO, $ PCBLAS1TSTCHK, PCBLAS1TSTCHKE, PCCHKARG1, $ PCCHKVOUT, PCCOPY, PCDOTC, PCDOTU, PCLAGEN, $ PCMPRNT, PCSCAL, PCSSCAL, PCSWAP, PCVPRNT, $ PSCASUM, PSCNRM2, PVDESCCHK, PVDIMCHK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .FALSE., .FALSE., .TRUE., $ .TRUE., .TRUE., .TRUE., .FALSE., .FALSE., $ .FALSE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler will abort on errors. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IXSEED = 100 IYSEED = 200 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PCBLA1TSTINFO( OUTFILE, NOUT, NTESTS, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, RSCXVAL, $ CSCXVAL, IXVAL, JXVAL, INCXVAL, MYVAL, $ NYVAL, IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, INCYVAL, $ MAXTESTS, NGRIDS, PVAL, MAXGRIDS, QVAL, $ MAXGRIDS, LTEST, SOF, TEE, IAM, IGAP, IVERB, $ NPROCS, ALPHA, MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PCBLAS1TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * N = NVAL( J ) MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN TSTCNT = TSTCNT + 1 WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY WRITE( NOUT, FMT = 9995 ) END IF * * Check the validity of the input and initialize DESC_ * CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, IGAP, GAPMUL, $ IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, IGAP, GAPMUL, $ IERR( 2 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDX = MAX( 1, MX ) LDY = MAX( 1, MY ) * * Assign pointers into MEM for matrices corresponding to * vectors X and Y. Ex: IPX starts at position MEM( IPREX+1 ). * IPX = IPREX + 1 IPY = IPX + DESCX( LLD_ ) * NQX + IPOSTX + IPREY IPMATX = IPY + DESCY( LLD_ ) * NQY + IPOSTY IPMATY = IPMATX + MX * NX IPW = IPMATY + MY * NY * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * MEMREQD = IPW - 1 + $ MAX( MAX( IMBX, MBX ), MAX( IMBY, MBY ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) MEMREQD*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 1 routines * DO 30 K = 1, NSUBS * * Continue only if this sub has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9989 ) SNAMES( K ) END IF * * Check the validity of the operand sizes * CALL PVDIMCHK( ICTXT, NOUT, N, 'X', IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, N, 'Y', IY, JY, DESCY, INCY, $ IERR( 2 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 GO TO 30 END IF * * Generate distributed matrices X and Y * CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCXR, MX, NX, IMBX, INBX, MBX, NBX, $ -1, -1, ICTXT, MAX( 1, MX ) ) CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCXR, IXSEED, MEM( IPMATX ), $ DESCXR( LLD_ ) ) IF( YCHECK( K ) ) THEN CALL PB_DESCSET2( DESCYR, MY, NY, IMBY, INBY, MBY, $ NBY, -1, -1, ICTXT, MAX( 1, MY ) ) CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCYR, IYSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) END IF * * Pad the guard zones of X, and Y * CALL PB_CFILLPAD( ICTXT, MPX, NQX, MEM( IPX-IPREX ), $ DESCX( LLD_ ), IPREX, IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_CFILLPAD( ICTXT, MPY, NQY, MEM( IPY-IPREY ), $ DESCY( LLD_ ), IPREY, IPOSTY, $ PADVAL ) END IF * * Initialize the check for INPUT only args. * INFO = 0 CALL PCCHKARG1( ICTXT, NOUT, SNAMES( K ), N, ALPHA, IX, $ JX, DESCX, INCX, IY, JY, DESCY, INCY, $ INFO ) * INFO = 0 PSCLR = ZERO PUSCLR = RZERO PISCLR = 0 * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PCLAPRNT( 1, N, MEM( IPX ), IX, JX, DESCX, $ 0, 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) ELSE CALL PB_PCLAPRNT( N, 1, MEM( IPX ), IX, JX, DESCX, $ 0, 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) END IF IF( YCHECK( K ) ) THEN IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PCLAPRNT( 1, N, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) ELSE CALL PB_PCLAPRNT( N, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) END IF END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PCLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, 0, $ 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) IF( YCHECK( K ) ) $ CALL PB_PCLAPRNT( MY, NY, MEM( IPY ), 1, 1, DESCY, $ 0, 0, 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) END IF * * Call the PBLAS routine * IF( K.EQ.1 ) THEN * * Test PCSWAP * CALL PCSWAP( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.2 ) THEN * * Test PCSCAL * PSCLR = ALPHA CALL PCSCAL( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.3 ) THEN * * Test PCSSCAL * PUSCLR = REAL( ALPHA ) CALL PCSSCAL( N, REAL( ALPHA ), MEM( IPX ), IX, JX, $ DESCX, INCX ) * ELSE IF( K.EQ.4 ) THEN * * Test PCCOPY * CALL PCCOPY( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.5 ) THEN * * Test PCAXPY * PSCLR = ALPHA CALL PCAXPY( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.6 ) THEN * * Test PCDOTU * CALL PCDOTU( N, PSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.7 ) THEN * * Test PCDOTC * CALL PCDOTC( N, PSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.8 ) THEN * * Test PSCNRM2 * CALL PSCNRM2( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.9 ) THEN * * Test PSCASUM * CALL PSCASUM( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.10 ) THEN * CALL PCAMAX( N, PSCLR, PISCLR, MEM( IPX ), IX, JX, $ DESCX, INCX ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9978 ) INFO GO TO 30 END IF * * Check the computations * CALL PCBLAS1TSTCHK( ICTXT, NOUT, K, N, PSCLR, PUSCLR, $ PISCLR, MEM( IPMATX ), MEM( IPX ), $ IX, JX, DESCX, INCX, MEM( IPMATY ), $ MEM( IPY ), IY, JY, DESCY, INCY, $ INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 END IF * * Check padding * CALL PB_CCHEKPAD( ICTXT, SNAMES( K ), MPX, NQX, $ MEM( IPX-IPREX ), DESCX( LLD_ ), $ IPREX, IPOSTX, PADVAL ) IF( YCHECK( K ) ) THEN CALL PB_CCHEKPAD( ICTXT, SNAMES( K ), MPY, NQY, $ MEM( IPY-IPREY ), DESCY( LLD_ ), $ IPREY, IPOSTY, PADVAL ) END IF * * Check input-only scalar arguments * INFO = 1 CALL PCCHKARG1( ICTXT, NOUT, SNAMES( K ), N, ALPHA, IX, $ JX, DESCX, INCX, IY, JY, DESCY, INCY, $ INFO ) * * Check input-only array arguments * CALL PCCHKVOUT( N, MEM( IPMATX ), MEM( IPX ), IX, JX, $ DESCX, INCX, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) 'PARALLEL_X', SNAMES( K ) END IF * IF( YCHECK( K ) ) THEN CALL PCCHKVOUT( N, MEM( IPMATY ), MEM( IPY ), IY, JY, $ DESCY, INCY, IERR( 4 ) ) IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) 'PARALLEL_Y', $ SNAMES( K ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE. 0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) SNAMES( K ) KFAIL( K ) = KFAIL( K ) + 1 ERRFLG = .TRUE. ELSE IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) SNAMES( K ) KPASS( K ) = KPASS( K ) + 1 END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 3 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PCMPRNT( ICTXT, NOUT, MX, NX, MEM( IPMATX ), $ LDX, 0, 0, 'SERIAL_X' ) CALL PB_PCLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, $ 0, 0, 'PARALLEL_X', NOUT, $ MEM( IPMATX ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( N.GT.0 ) $ CALL PCVPRNT( ICTXT, NOUT, N, $ MEM( IPMATX+IX-1+(JX-1)*LDX ), $ INCX, 0, 0, 'SERIAL_X' ) IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PCLAPRNT( 1, N, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PCLAPRNT( N, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) END IF END IF IF( YCHECK( K ) ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PCMPRNT( ICTXT, NOUT, MY, NY, $ MEM( IPMATY ), LDY, 0, 0, $ 'SERIAL_Y' ) CALL PB_PCLAPRNT( MY, NY, MEM( IPY ), 1, 1, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( N.GT.0 ) $ CALL PCVPRNT( ICTXT, NOUT, N, $ MEM( IPMATY+IY-1+(JY-1)*LDY ), $ INCY, 0, 0, 'SERIAL_Y' ) IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PCLAPRNT( 1, N, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PCLAPRNT( N, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) END IF END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9984 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, '---------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' N IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9993 FORMAT( 2X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I5,1X,I5,1X,I5,1X,I5,1X, $ I5,1X,I5,1X,I6 ) 9992 FORMAT( 2X, ' N IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9991 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9990 FORMAT( 'Not enough memory. Need: ', I12 ) 9989 FORMAT( 2X, ' Tested Subroutine: ', A ) 9988 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9987 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9986 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9985 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9984 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9983 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9982 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9981 FORMAT( 2X, 'Testing Summary') 9980 FORMAT( 2X, 'End of Tests.' ) 9979 FORMAT( 2X, 'Tests started.' ) 9978 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PCBLA1TST * END SUBROUTINE PCBLA1TSTINFO( SUMMRY, NOUT, NMAT, NVAL, MXVAL, $ NXVAL, IMBXVAL, MBXVAL, INBXVAL, $ NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, $ CSCYVAL, IYVAL, JYVAL, INCYVAL, $ LDVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, LTEST, SOF, TEE, IAM, IGAP, $ IVERB, NPROCS, ALPHA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, $ NGRIDS, NMAT, NOUT, NPROCS COMPLEX ALPHA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY LOGICAL LTEST( * ) INTEGER CSCXVAL( LDVAL ), CSCYVAL( LDVAL ), $ IMBXVAL( LDVAL ), IMBYVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JXVAL( LDVAL ), $ JYVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCXVAL( LDVAL ), RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PCBLA1TSTINFO get the needed startup information for testing various * Level 1 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DESCX(:), IX, JX, INCX, DESCY(:), * IY, JY and INCY. This is also the maximum number of test * cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least ten. On * exit, if LTEST( i ) is .TRUE., the i-th Level 1 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) COMPLEX * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 2, 2*NGRIDS+23*NMAT+NSUBS+4 ) with NSUBS equal to 10. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 10 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J REAL EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, CGEBR2D, CGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PCBLAS1TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 100 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 100 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA * READ( NIN, FMT = * ) ALPHA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 100 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 100 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, WORK, 2 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 70 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 70 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 1 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex single precision '// $ 'Level 1 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9982 ) ALPHA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 80 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 80 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, WORK, 2, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) * I = 2*NGRIDS + 23*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 90 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 90 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 100 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) * * End of PCBLA1TSTINFO * END SUBROUTINE PCBLAS1TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PCBLAS1TSTCHKE tests the error exits of the Level 1 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 10 (NSUBS). * If LTEST( 1 ) is .TRUE., PCSWAP will be tested; * If LTEST( 2 ) is .TRUE., PCSCAL will be tested; * If LTEST( 3 ) is .TRUE., PCSSCAL will be tested; * If LTEST( 4 ) is .TRUE., PCCOPY will be tested; * If LTEST( 5 ) is .TRUE., PCAXPY will be tested; * If LTEST( 6 ) is .TRUE., PCDOTU will be tested; * If LTEST( 7 ) is .TRUE., PCDOTC will be tested; * If LTEST( 8 ) is .TRUE., PSCNRM2 will be tested; * If LTEST( 9 ) is .TRUE., PSCASUM will be tested; * If LTEST( 10 ) is .TRUE., PCAMAX will be tested. * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 10 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PCAMAX, PCAXPY, PCCOPY, $ PCDIMEE, PCDOTC, PCDOTU, PCSCAL, PCSSCAL, $ PCSWAP, PCVECEE, PSCASUM, PSCNRM2 * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/11, 12, 12, 11, 13, 13, 13, 15, 15, 14/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PCSWAP * I = 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCSWAP, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCSWAP, SCODE( I ), SNAMES( I ) ) END IF * * Test PCSCAL * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCSCAL, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCSCAL, SCODE( I ), SNAMES( I ) ) END IF * * Test PCSSCAL * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCSSCAL, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCSSCAL, SCODE( I ), SNAMES( I ) ) END IF * * Test PCCOPY * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCCOPY, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCCOPY, SCODE( I ), SNAMES( I ) ) END IF * * Test PCAXPY * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCAXPY, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCAXPY, SCODE( I ), SNAMES( I ) ) END IF * * Test PCDOTU * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCDOTU, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCDOTU, SCODE( I ), SNAMES( I ) ) END IF * * Test PCDOTC * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCDOTC, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCDOTC, SCODE( I ), SNAMES( I ) ) END IF * * PSCNRM2 * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PSCNRM2, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PSCNRM2, SCODE( I ), SNAMES( I ) ) END IF * * Test PSCASUM * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PSCASUM, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PSCASUM, SCODE( I ), SNAMES( I ) ) END IF * * Test PCAMAX * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCAMAX, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCAMAX, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PCBLAS1TSTCHKE * END SUBROUTINE PCCHKARG1( ICTXT, NOUT, SNAME, N, ALPHA, IX, JX, $ DESCX, INCX, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, INCY, INFO, IX, IY, JX, JY, N, $ NOUT COMPLEX ALPHA * .. * .. Array Arguments .. CHARACTER*(*) SNAME INTEGER DESCX( * ), DESCY( * ) * .. * * Purpose * ======= * * PCCHKARG1 checks the input-only arguments of the Level 1 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * N (global input) INTEGER * On entry, N specifies the length of the subvector operands. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, INCXREF, INCYREF, IXREF, IYREF, JXREF, $ JYREF, MYCOL, MYROW, NPCOL, NPROW, NREF COMPLEX ALPHAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCXREF( DLEN_ ), DESCYREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * NREF = N IXREF = IX JXREF = JX DO 10 I = 1, DLEN_ DESCXREF( I ) = DESCX( I ) 10 CONTINUE INCXREF = INCX IYREF = IY JYREF = JY DO 20 I = 1, DLEN_ DESCYREF( I ) = DESCY( I ) 20 CONTINUE INCYREF = INCY ALPHAREF = ALPHA * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( IX.NE.IXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IX' ELSE IF( JX.NE.JXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JX' ELSE IF( DESCX( DTYPE_ ).NE.DESCXREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( DTYPE_ )' ELSE IF( DESCX( M_ ).NE.DESCXREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( M_ )' ELSE IF( DESCX( N_ ).NE.DESCXREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( N_ )' ELSE IF( DESCX( IMB_ ).NE.DESCXREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( IMB_ )' ELSE IF( DESCX( INB_ ).NE.DESCXREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( INB_ )' ELSE IF( DESCX( MB_ ).NE.DESCXREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( MB_ )' ELSE IF( DESCX( NB_ ).NE.DESCXREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( NB_ )' ELSE IF( DESCX( RSRC_ ).NE.DESCXREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( RSRC_ )' ELSE IF( DESCX( CSRC_ ).NE.DESCXREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CSRC_ )' ELSE IF( DESCX( CTXT_ ).NE.DESCXREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CTXT_ )' ELSE IF( DESCX( LLD_ ).NE.DESCXREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( LLD_ )' ELSE IF( INCX.NE.INCXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCX' ELSE IF( IY.NE.IYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IY' ELSE IF( JY.NE.JYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JY' ELSE IF( DESCY( DTYPE_ ).NE.DESCYREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( DTYPE_ )' ELSE IF( DESCY( M_ ).NE.DESCYREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( M_ )' ELSE IF( DESCY( N_ ).NE.DESCYREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( N_ )' ELSE IF( DESCY( IMB_ ).NE.DESCYREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( IMB_ )' ELSE IF( DESCY( INB_ ).NE.DESCYREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( INB_ )' ELSE IF( DESCY( MB_ ).NE.DESCYREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( MB_ )' ELSE IF( DESCY( NB_ ).NE.DESCYREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( NB_ )' ELSE IF( DESCY( RSRC_ ).NE.DESCYREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( RSRC_ )' ELSE IF( DESCY( CSRC_ ).NE.DESCYREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CSRC_ )' ELSE IF( DESCY( CTXT_ ).NE.DESCYREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CTXT_ )' ELSE IF( DESCY( LLD_ ).NE.DESCYREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( LLD_ )' ELSE IF( INCY.NE.INCYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCY' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.GT.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PCCHKARG1 * END LOGICAL FUNCTION PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) * .. * * Purpose * ======= * * PISINSCOPE returns .TRUE. if the calling process is in the scope of * sub( X ) = X( IX+(JX-1)*DESCX(M_)+(i-1)*INCX ) and .FALSE. if it is * not. This routine is used to determine which processes should check * the answer returned by some Level 1 PBLAS routines. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * N (global input) INTEGER * The length of the subvector sub( X ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER IIX, IXCOL, IXROW, JJX, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_INFOG2L * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN * * This is the special case, find process owner of IX, JX, and * only this process is the scope. * PISINSCOPE = ( ( IXROW.EQ.MYROW .OR. ROWREP ) .AND. $ ( IXCOL.EQ.MYCOL .OR. COLREP ) ) * ELSE * IF( INCX.EQ.DESCX( M_ ) ) THEN * * row vector * PISINSCOPE = ( MYROW.EQ.IXROW .OR. ROWREP ) * ELSE * * column vector * PISINSCOPE = ( MYCOL.EQ.IXCOL .OR. COLREP ) * END IF * END IF * RETURN * * End of PISINSCOPE * END SUBROUTINE PCBLAS1TSTCHK( ICTXT, NOUT, NROUT, N, PSCLR, PUSCLR, $ PISCLR, X, PX, IX, JX, DESCX, INCX, Y, $ PY, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, INCY, INFO, IX, IY, JX, JY, N, $ NOUT, NROUT, PISCLR REAL PUSCLR COMPLEX PSCLR * .. * .. Array Arguments .. INTEGER DESCX( * ), DESCY( * ) COMPLEX PX( * ), PY( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PCBLAS1TSTCHK performs the computational tests of the Level 1 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PCSWAP will be tested; * else if NROUT = 2, PCSCAL will be tested; * else if NROUT = 3, PCSSCAL will be tested; * else if NROUT = 4, PCCOPY will be tested; * else if NROUT = 5, PCAXPY will be tested; * else if NROUT = 6, PCDOTU will be tested; * else if NROUT = 7, PCDOTC will be tested; * else if NROUT = 8, PSCNRM2 will be tested; * else if NROUT = 9, PSCASUM will be tested; * else if NROUT = 10, PCAMAX will be tested. * * N (global input) INTEGER * On entry, N specifies the length of the subvector operands. * * PSCLR (global input) COMPLEX * On entry, depending on the value of NROUT, PSCLR specifies * the scalar ALPHA, or the output scalar returned by the PBLAS, * i.e., the dot product, the 2-norm, the absolute sum or the * value of AMAX. * * PUSCLR (global input) REAL * On entry, PUSCLR specifies the real part of the scalar ALPHA * used by the real scaling, the 2-norm, or the absolute sum * routines. PUSCLR is not used in the real versions of this * routine. * * PISCLR (global input) REAL * On entry, PISCLR specifies the value of the global index re- * turned by PCAMAX, otherwise PISCLR is not used. * * X (local input/local output) COMPLEX array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) COMPLEX array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) COMPLEX array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) COMPLEX array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on X has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on Y has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL RZERO COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ RZERO = 0.0E+0 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL COLREP, INXSCOPE, INYSCOPE, ROWREP INTEGER I, IB, ICURCOL, ICURROW, IDUMM, IIX, IIY, IN, $ IOFFX, IOFFY, ISCLR, IXCOL, IXROW, IYCOL, $ IYROW, J, JB, JJX, JJY, JN, KK, LDX, LDY, $ MYCOL, MYROW, NPCOL, NPROW REAL ERR, ERRMAX, PREC, USCLR COMPLEX SCLR * .. * .. Local Arrays .. INTEGER IERR( 6 ) CHARACTER*5 ARGIN1, ARGIN2, ARGOUT1, ARGOUT2 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CCOPY, CSWAP, IGAMX2D, $ PB_INFOG2L, PCCHKVIN, PCERRASUM, PCERRAXPY, $ PCERRDOTC, PCERRDOTU, PCERRNRM2, PCERRSCAL, $ PCSERRSCAL * .. * .. External Functions .. LOGICAL PISINSCOPE INTEGER ICAMAX REAL PSLAMCH EXTERNAL ICAMAX, PISINSCOPE, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * ARGIN1 = ' ' ARGIN2 = ' ' ARGOUT1 = ' ' ARGOUT2 = ' ' DO 10 I = 1, 6 IERR( I ) = 0 10 CONTINUE * PREC = PSLAMCH( ICTXT, 'precision' ) * IF( NROUT.EQ.1 ) THEN * * Test PCSWAP * IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL CSWAP( N, X( IOFFX ), INCX, Y( IOFFY ), INCY ) CALL PCCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PCCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) * ELSE IF( NROUT.EQ.2 ) THEN * * Test PCSCAL * LDX = DESCX( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ICURROW = IXROW ICURCOL = IXCOL ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * DO 20 J = JX, JN * CALL PCERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 20 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 40 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * DO 30 KK = 0, JB-1 * CALL PCERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 30 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 40 CONTINUE * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * DO 50 I = IX, IN * CALL PCERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 70 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * DO 60 KK = 0, IB-1 * CALL PCERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX 60 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 70 CONTINUE * END IF * ELSE IF( NROUT.EQ.3 ) THEN * * Test PCSSCAL * LDX = DESCX( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ICURROW = IXROW ICURCOL = IXCOL ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * DO 80 J = JX, JN * CALL PCSERRSCAL( ERR, PUSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 80 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 100 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * DO 90 KK = 0, JB-1 * CALL PCSERRSCAL( ERR, PUSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 90 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 100 CONTINUE * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * DO 110 I = IX, IN * CALL PCSERRSCAL( ERR, PUSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * DO 120 KK = 0, IB-1 * CALL PCSERRSCAL( ERR, PUSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * ELSE IF( NROUT.EQ.4 ) THEN * * Test PCCOPY * IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL CCOPY( N, X( IOFFX ), INCX, Y( IOFFY ), INCY ) CALL PCCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PCCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) * ELSE IF( NROUT.EQ.5 ) THEN * * Test PCAXPY * CALL PCCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) LDY = DESCY( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PB_INFOG2L( IY, JY, DESCY, NPROW, NPCOL, MYROW, MYCOL, $ IIY, JJY, IYROW, IYCOL ) ICURROW = IYROW ICURCOL = IYCOL ROWREP = ( IYROW.EQ.-1 ) COLREP = ( IYCOL.EQ.-1 ) * IF( INCY.EQ.DESCY( M_ ) ) THEN * * sub( Y ) is a row vector * JB = DESCY( INB_ ) - JY + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCY( NB_ ) + 1 ) * DESCY( NB_ ) + JB JB = MIN( JB, N ) JN = JY + JB - 1 * DO 140 J = JY, JN * CALL PCERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF JJY = JJY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 140 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 160 J = JN+1, JY+N-1, DESCY( NB_ ) JB = MIN( JY+N-J, DESCY( NB_ ) ) * DO 150 KK = 0, JB-1 * CALL PCERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF JJY = JJY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 150 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 160 CONTINUE * ELSE * * sub( Y ) is a column vector * IB = DESCY( IMB_ ) - IY + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCY( MB_ ) + 1 ) * DESCY( MB_ ) + IB IB = MIN( IB, N ) IN = IY + IB - 1 * DO 170 I = IY, IN * CALL PCERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF IIY = IIY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 170 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 190 I = IN+1, IY+N-1, DESCY( MB_ ) IB = MIN( IY+N-I, DESCY( MB_ ) ) * DO 180 KK = 0, IB-1 * CALL PCERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF IIY = IIY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 180 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 190 CONTINUE * END IF * ELSE IF( NROUT.EQ.6 ) THEN * * Test PCDOTU * CALL PCCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PCCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PCERRDOTU( ERR, N, SCLR, X( IOFFX ), INCX, Y( IOFFY ), $ INCY, PREC ) INXSCOPE = PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) INYSCOPE = PISINSCOPE( ICTXT, N, IY, JY, DESCY, INCY ) IF( INXSCOPE.OR.INYSCOPE ) THEN IF( ABS( PSCLR - SCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'DOTU' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF ELSE SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'DOTU' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.7 ) THEN * * Test PCDOTC * CALL PCCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PCCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PCERRDOTC( ERR, N, SCLR, X( IOFFX ), INCX, Y( IOFFY ), $ INCY, PREC ) INXSCOPE = PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) INYSCOPE = PISINSCOPE( ICTXT, N, IY, JY, DESCY, INCY ) IF( INXSCOPE.OR.INYSCOPE ) THEN IF( ABS( PSCLR - SCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'DOTC' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF ELSE SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'DOTC' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.8 ) THEN * * Test PSCNRM2 * CALL PCCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PCERRNRM2( ERR, N, USCLR, X( IOFFX ), INCX, PREC ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN IF( ABS( PUSCLR - USCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'NRM2' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9994 ) USCLR, PUSCLR END IF END IF ELSE USCLR = RZERO IF( PUSCLR.NE.USCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'NRM2' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9994 ) USCLR, PUSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.9 ) THEN * * Test PSCASUM * CALL PCCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PCERRASUM( ERR, N, USCLR, X( IOFFX ), INCX, PREC ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN IF( ABS( PUSCLR - USCLR ) .GT. ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'ASUM' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9994 ) USCLR, PUSCLR END IF END IF ELSE USCLR = RZERO IF( PUSCLR.NE.USCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'ASUM' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9994 ) USCLR, PUSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.10 ) THEN * * Test PCAMAX * CALL PCCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN ISCLR = ICAMAX( N, X( IOFFX ), INCX ) IF( N.LT.1 ) THEN SCLR = ZERO ELSE IF( ( INCX.EQ.1 ).AND.( DESCX( M_ ).EQ.1 ).AND. $ ( N.EQ.1 ) ) THEN ISCLR = JX SCLR = X( IOFFX ) ELSE IF( INCX.EQ.DESCX( M_ ) ) THEN ISCLR = JX + ISCLR - 1 SCLR = X( IX + ( ISCLR - 1 ) * DESCX( M_ ) ) ELSE ISCLR = IX + ISCLR - 1 SCLR = X( ISCLR + ( JX - 1 ) * DESCX( M_ ) ) END IF * IF( PSCLR.NE.SCLR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'AMAX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF * IF( PISCLR.NE.ISCLR ) THEN IERR( 5 ) = 1 WRITE( ARGIN2, FMT = '(A)' ) 'INDX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN2 WRITE( NOUT, FMT = 9995 ) ISCLR, PISCLR END IF END IF ELSE ISCLR = 0 SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'AMAX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF IF( PISCLR.NE.ISCLR ) THEN IERR( 6 ) = 1 WRITE( ARGOUT2, FMT = '(A)' ) 'INDX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT2 WRITE( NOUT, FMT = 9995 ) ISCLR, PISCLR END IF END IF END IF * END IF * * Find IERR across all processes * CALL IGAMX2D( ICTXT, 'All', ' ', 6, 1, IERR, 6, IDUMM, IDUMM, -1, $ -1, 0 ) * * Encode the errors found in INFO * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'X' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Y' END IF * IF( IERR( 3 ).NE.0 ) $ INFO = INFO + 4 * IF( IERR( 4 ).NE.0 ) $ INFO = INFO + 8 * IF( IERR( 5 ).NE.0 ) $ INFO = INFO + 16 * IF( IERR( 6 ).NE.0 ) $ INFO = INFO + 32 * 9999 FORMAT( 2X, ' ***** ERROR: Vector operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** ERROR: Output scalar result ', A, $ ' in scope is incorrect.' ) 9997 FORMAT( 2X, ' ***** ERROR: Output scalar result ', A, $ ' out of scope is incorrect.' ) 9996 FORMAT( 2X, ' ***** Expected value is: ', E16.8, '+i*(', $ E16.8, '),', /2X, ' Obtained value is: ', $ E16.8, '+i*(', E16.8, ')' ) 9995 FORMAT( 2X, ' ***** Expected value is: ', I6, /2X, $ ' Obtained value is: ', I6 ) 9994 FORMAT( 2X, ' ***** Expected value is: ', E16.8, /2X, $ ' Obtained value is: ', E16.8 ) * RETURN * * End of PCBLAS1TSTCHK * END SUBROUTINE PCERRDOTU( ERRBND, N, SCLR, X, INCX, Y, INCY, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N REAL ERRBND, PREC COMPLEX SCLR * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * .. * * Purpose * ======= * * PCERRDOTU serially computes the dot product X**T * Y and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If dot1 = SCLR and dot2 are two different computed results, and dot1 * is being assumed to be correct, we require * * abs( dot1 - dot2 ) <= ERRBND = ERRFACT * abs( dot1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operands. * * SCLR (global output) COMPLEX * On exit, SCLR specifies the dot product of the two vectors * X and Y. * * X (global input) COMPLEX array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (global input) COMPLEX array * On entry, Y is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremen- * ted array Y must contain the vector y. * * INCY (global input) INTEGER. * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IX, IY REAL ADDBND, FACT, SUMINEG, SUMIPOS, SUMRNEG, $ SUMRPOS, TMP * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL * .. * .. Executable Statements .. * IX = 1 IY = 1 SCLR = ZERO SUMIPOS = ZERO SUMINEG = ZERO SUMRPOS = ZERO SUMRNEG = ZERO FACT = TWO * ( ONE + PREC ) ADDBND = TWO * TWO * TWO * PREC * DO 10 I = 1, N * SCLR = SCLR + X( IX ) * Y( IY ) * TMP = REAL( X( IX ) ) * REAL( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMRPOS = SUMRPOS + TMP * FACT ELSE SUMRNEG = SUMRNEG - TMP * FACT END IF * TMP = - AIMAG( X( IX ) ) * AIMAG( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMRPOS = SUMRPOS + TMP * FACT ELSE SUMRNEG = SUMRNEG - TMP * FACT END IF * TMP = AIMAG( X( IX ) ) * REAL( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMIPOS = SUMIPOS + TMP * FACT ELSE SUMINEG = SUMINEG - TMP * FACT END IF * TMP = REAL( X( IX ) ) * AIMAG( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMIPOS = SUMIPOS + TMP * FACT ELSE SUMINEG = SUMINEG - TMP * FACT END IF * IX = IX + INCX IY = IY + INCY * 10 CONTINUE * ERRBND = ADDBND * MAX( MAX( SUMRPOS, SUMRNEG ), $ MAX( SUMIPOS, SUMINEG ) ) * RETURN * * End of PCERRDOTU * END SUBROUTINE PCERRDOTC( ERRBND, N, SCLR, X, INCX, Y, INCY, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N REAL ERRBND, PREC COMPLEX SCLR * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * .. * * Purpose * ======= * * PCERRDOTC serially computes the dot product X**H * Y and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If dot1 = SCLR and dot2 are two different computed results, and dot1 * is being assumed to be correct, we require * * abs( dot1 - dot2 ) <= ERRBND = ERRFACT * abs( dot1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operands. * * SCLR (global output) COMPLEX * On exit, SCLR specifies the dot product of the two vectors * X and Y. * * X (global input) COMPLEX array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (global input) COMPLEX array * On entry, Y is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremen- * ted array Y must contain the vector y. * * INCY (global input) INTEGER. * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IX, IY REAL ADDBND, FACT, SUMINEG, SUMIPOS, SUMRNEG, $ SUMRPOS, TMP * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, REAL * .. * .. Executable Statements .. * IX = 1 IY = 1 SCLR = ZERO SUMIPOS = ZERO SUMINEG = ZERO SUMRPOS = ZERO SUMRNEG = ZERO FACT = TWO * ( ONE + PREC ) ADDBND = TWO * TWO * TWO * PREC * DO 10 I = 1, N * SCLR = SCLR + CONJG( X( IX ) ) * Y( IY ) * TMP = REAL( X( IX ) ) * REAL( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMRPOS = SUMRPOS + TMP * FACT ELSE SUMRNEG = SUMRNEG - TMP * FACT END IF * TMP = AIMAG( X( IX ) ) * AIMAG( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMRPOS = SUMRPOS + TMP * FACT ELSE SUMRNEG = SUMRNEG - TMP * FACT END IF * TMP = - AIMAG( X( IX ) ) * REAL( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMIPOS = SUMIPOS + TMP * FACT ELSE SUMINEG = SUMINEG - TMP * FACT END IF * TMP = REAL( X( IX ) ) * AIMAG( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMIPOS = SUMIPOS + TMP * FACT ELSE SUMINEG = SUMINEG - TMP * FACT END IF * IX = IX + INCX IY = IY + INCY * 10 CONTINUE * ERRBND = ADDBND * MAX( MAX( SUMRPOS, SUMRNEG ), $ MAX( SUMIPOS, SUMINEG ) ) * RETURN * * End of PCERRDOTC * END SUBROUTINE PCERRNRM2( ERRBND, N, USCLR, X, INCX, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N REAL ERRBND, PREC, USCLR * .. * .. Array Arguments .. COMPLEX X( * ) * .. * * Purpose * ======= * * PCERRNRM2 serially computes the 2-norm the vector X and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If norm1 = SCLR and norm2 are two different computed results, and * norm1 being assumed to be correct, we require * * abs( norm1 - norm2 ) <= ERRBND = ERRFACT * abs( norm1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operand. * * USCLR (global output) REAL * On exit, USCLR specifies the 2-norm of the vector X. * * X (global input) COMPLEX array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER IX REAL ABSXI, ADDBND, FACT, SCALE, SSQ, SUMSCA, SUMSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, REAL * .. * .. Executable Statements .. * USCLR = ZERO SUMSSQ = ONE SUMSCA = ZERO ADDBND = TWO * TWO * TWO * PREC FACT = ONE + TWO * ( ( ONE + PREC )**3 - ONE ) * SCALE = ZERO SSQ = ONE DO 10 IX = 1, 1 + ( N - 1 )*INCX, INCX IF( REAL( X( IX ) ).NE.ZERO ) THEN ABSXI = ABS( REAL( X( IX ) ) ) IF( SCALE.LT.ABSXI )THEN SUMSSQ = ONE + ( SSQ*( SCALE/ABSXI )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = ONE + SSQ*( SCALE/ABSXI )**2 SUMSCA = ABSXI SCALE = ABSXI ELSE SUMSSQ = SSQ + ( ( ABSXI/SCALE )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = SSQ + ( ABSXI/SCALE )**2 END IF END IF IF( AIMAG( X( IX ) ).NE.ZERO ) THEN ABSXI = ABS( AIMAG( X( IX ) ) ) IF( SCALE.LT.ABSXI )THEN SUMSSQ = ONE + ( SSQ*( SCALE/ABSXI )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = ONE + SSQ*( SCALE/ABSXI )**2 SUMSCA = ABSXI SCALE = ABSXI ELSE SUMSSQ = SSQ + ( ( ABSXI/SCALE )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = SSQ + ( ABSXI/SCALE )**2 END IF END IF 10 CONTINUE * USCLR = SCALE * SQRT( SSQ ) * * Error on square root * ERRBND = SQRT( SUMSSQ ) * ( ONE + TWO * ( 1.00001E+0 * PREC ) ) * ERRBND = ( SUMSCA * ERRBND ) - USCLR * RETURN * * End of PCERRNRM2 * END SUBROUTINE PCERRASUM( ERRBND, N, USCLR, X, INCX, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N REAL ERRBND, PREC, USCLR * .. * .. Array Arguments .. COMPLEX X( * ) * .. * * Purpose * ======= * * PCERRASUM serially computes the sum of absolute values of the vector * X and returns a scaled relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies a scaled relative acceptable error * bound. In this case the error bound is just the absolute sum * multiplied by a constant proportional to the machine preci- * sion. * * N (global input) INTEGER * On entry, N specifies the length of the vector operand. * * USCLR (global output) REAL * On exit, USCLR specifies the sum of absolute values of the * vector X. * * X (global input) COMPLEX array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL TWO, ZERO PARAMETER ( TWO = 2.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER IX REAL ADDBND * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, REAL * .. * .. Executable Statements .. * IX = 1 USCLR = ZERO ADDBND = TWO * TWO * TWO * PREC * DO 10 IX = 1, 1 + ( N - 1 )*INCX, INCX USCLR = USCLR + ABS( REAL( X( IX ) ) ) + $ ABS( AIMAG( X( IX ) ) ) 10 CONTINUE * ERRBND = ADDBND * USCLR * RETURN * * End of PCERRASUM * END SUBROUTINE PCERRSCAL( ERRBND, PSCLR, X, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL ERRBND, PREC COMPLEX PSCLR, X * .. * * Purpose * ======= * * PCERRSCAL serially computes the product PSCLR * X and returns a sca- * led relative acceptable error bound on the result. * * Notes * ===== * * If s1 = PSCLR*X and s2 are two different computed results, and s1 is * being assumed to be correct, we require * * abs( s1 - s2 ) <= ERRBND = ERRFACT * abs( s1 ), * * where ERRFACT is computed as two times the machine precision. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PSCLR (global input) COMPLEX * On entry, PSCLR specifies the scale factor. * * X (global input/global output) COMPLEX * On entry, X specifies the scalar to be scaled. On exit, X is * the scaled entry. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL TWO PARAMETER ( TWO = 2.0E+0 ) * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * X = PSCLR * X * ERRBND = ( TWO * PREC ) * ABS( X ) * RETURN * * End of PCERRSCAL * END SUBROUTINE PCSERRSCAL( ERRBND, PUSCLR, X, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL ERRBND, PREC, PUSCLR COMPLEX X * .. * * Purpose * ======= * * PCSERRSCAL serially computes the product PUSCLR * X and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If s1 = PUSCLR*X and s2 are two different computed results, and s1 is * being assumed to be correct, we require * * abs( s1 - s2 ) <= ERRBND = ERRFACT * abs( s1 ), * * where ERRFACT is computed as two times the machine precision. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PUSCLR (global input) REAL * On entry, PUSCLR specifies the real scale factor. * * X (global input/global output) COMPLEX * On entry, X specifies the scalar to be scaled. On exit, X is * the scaled entry. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL TWO PARAMETER ( TWO = 2.0E+0 ) * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, REAL * .. * .. Executable Statements .. * X = CMPLX( PUSCLR * REAL( X ), PUSCLR * AIMAG( X ) ) * ERRBND = ( TWO * PREC ) * ABS( X ) * RETURN * * End of PCSERRSCAL * END SUBROUTINE PCERRAXPY( ERRBND, PSCLR, X, Y, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL ERRBND, PREC COMPLEX PSCLR, X, Y * .. * * Purpose * ======= * * PCERRAXPY serially computes Y := Y + PSCLR * X and returns a scaled * relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PSCLR (global input) COMPLEX * On entry, PSCLR specifies the scale factor. * * X (global input) COMPLEX * On entry, X specifies the scalar to be scaled. * * Y (global input/global output) COMPLEX * On entry, Y specifies the scalar to be added. On exit, Y con- * tains the resulting scalar. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. REAL ADDBND, FACT, SUMINEG, SUMIPOS, SUMRNEG, $ SUMRPOS COMPLEX TMP * .. * .. Intrinsic Functions .. INTRINSIC AIMAG, MAX, REAL * .. * .. Executable Statements .. * SUMIPOS = ZERO SUMINEG = ZERO SUMRPOS = ZERO SUMRNEG = ZERO FACT = ONE + TWO * PREC ADDBND = TWO * TWO * TWO * PREC * TMP = PSCLR * X IF( REAL( TMP ).GE.ZERO ) THEN SUMRPOS = SUMRPOS + REAL( TMP ) * FACT ELSE SUMRNEG = SUMRNEG - REAL( TMP ) * FACT END IF IF( AIMAG( TMP ).GE.ZERO ) THEN SUMIPOS = SUMIPOS + AIMAG( TMP ) * FACT ELSE SUMINEG = SUMINEG - AIMAG( TMP ) * FACT END IF * TMP = Y IF( REAL( TMP ).GE.ZERO ) THEN SUMRPOS = SUMRPOS + REAL( TMP ) ELSE SUMRNEG = SUMRNEG - REAL( TMP ) END IF IF( AIMAG( TMP ).GE.ZERO ) THEN SUMIPOS = SUMIPOS + AIMAG( TMP ) ELSE SUMINEG = SUMINEG - AIMAG( TMP ) END IF * Y = Y + ( PSCLR * X ) * ERRBND = ADDBND * MAX( MAX( SUMRPOS, SUMRNEG ), $ MAX( SUMIPOS, SUMINEG ) ) * RETURN * * End of PCERRAXPY * END scalapack-2.0.2/PBLAS/TESTING/PCBLAS2TST.dat000644 000766 000024 00000006455 10363532303 020012 0ustar00juliestaff000000 000000 'Level 2 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PCBLAS2TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 16.0 threshold value of test ratio 10 value of the logical computational blocksize NB 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q (2.0E0, -4.0E0) value of ALPHA (3.0E0, -2.0E0) value of BETA 4 number of tests problems 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' values of UPLO 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANS 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' values of DIAG 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 values of M 8 10 12 14 16 18 20 22 24 9 11 13 15 17 19 21 values of N 31 33 35 36 34 32 30 31 33 35 36 34 32 30 38 40 values of M_A 31 33 35 36 34 32 30 31 33 35 36 34 32 30 38 40 values of N_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_A 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IA 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JA 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 values of M_X 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 values of N_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_X 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IX 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JX 30 1 30 1 30 1 30 1 30 1 30 1 30 1 30 1 values of INCX 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 values of M_Y 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 values of N_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_Y 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IY 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JY 40 1 40 1 40 1 40 1 40 1 40 1 40 1 40 1 values of INCY PCGEMV T put F for no test in the same column PCHEMV T put F for no test in the same column PCTRMV T put F for no test in the same column PCTRSV T put F for no test in the same column PCGERU T put F for no test in the same column PCGERC T put F for no test in the same column PCHER T put F for no test in the same column PCHER2 T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/pcblas2tst.f000644 000766 000024 00000354130 11750130340 020117 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PCGEMV ', 'PCHEMV ', 'PCTRMV ', $ 'PCTRSV ', 'PCGERU ', 'PCGERC ', $ 'PCHER ', 'PCHER2 '/ END BLOCK DATA PROGRAM PCBLA2TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PCBLA2TST is the main testing program for the PBLAS Level 2 routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 61 lines: * 'Level 2 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PCBLAS2TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 16.0 threshold value of test ratio * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0E0, 0.0E0) value of ALPHA * (1.0E0, 0.0E0) value of BETA * 2 number of tests problems * 'U' 'L' values of UPLO * 'N' 'T' values of TRANS * 'N' 'U' values of DIAG * 3 4 values of M * 3 4 values of N * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PCGEMV T put F for no test in the same column * PCHEMV T put F for no test in the same column * PCTRMV T put F for no test in the same column * PCTRSV T put F for no test in the same column * PCGERU T put F for no test in the same column * PCGERC T put F for no test in the same column * PCHER T put F for no test in the same column * PCHER2 T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * CPLXSZ INTEGER * REALSZ and CPLXSZ indicate the length in bytes on the given * platform for a single precision real and a single precision * complex. By default, REALSZ is set to four and CPLXSZ is set * to eight. * * MEM COMPLEX array * MEM is an array of dimension TOTMEM / CPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, CPLXSZ, TOTMEM, $ MEMSIZ, NSUBS, REALSZ COMPLEX ONE, PADVAL, ZERO, ROGUE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ CPLXSZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / CPLXSZ, REALSZ = 4, $ ONE = ( 1.0E+0, 0.0E+0 ), $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ROGUE = ( -1.0E+10, 1.0E+10 ), $ ZERO = ( 0.0E+0, 0.0E+0 ), NSUBS = 8 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE CHARACTER*1 AFORM, DIAG, DIAGDO, TRANS, UPLO INTEGER CSRCA, CSRCX, CSRCY, I, IA, IAM, IASEED, ICTXT, $ IGAP, IMBA, IMBX, IMBY, IMIDA, IMIDX, IMIDY, $ INBA, INBX, INBY, INCX, INCY, IPA, IPG, IPMATA, $ IPMATX, IPMATY, IPOSTA, IPOSTX, IPOSTY, IPREA, $ IPREX, IPREY, IPX, IPY, IVERB, IX, IXSEED, IY, $ IYSEED, J, JA, JX, JY, K, LDA, LDX, LDY, M, MA, $ MBA, MBX, MBY, MEMREQD, MPA, MPX, MPY, MX, MY, $ MYCOL, MYROW, N, NA, NBA, NBX, NBY, NCOLA, $ NGRIDS, NLX, NLY, NOUT, NPCOL, NPROCS, NPROW, $ NQA, NQX, NQY, NROWA, NTESTS, NX, NY, OFFD, $ RSRCA, RSRCX, RSRCY, TSKIP, TSTCNT REAL THRESH COMPLEX ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), TRANVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCXVAL( MAXTESTS ), $ CSCYVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCAR( DLEN_ ), DESCX( DLEN_ ), $ DESCXR( DLEN_ ), DESCY( DLEN_ ), $ DESCYR( DLEN_ ), IAVAL( MAXTESTS ), IERR( 6 ), $ IMBAVAL( MAXTESTS ), IMBXVAL( MAXTESTS ), $ IMBYVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JXVAL( MAXTESTS ), $ JYVAL( MAXTESTS ) INTEGER KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), MAVAL( MAXTESTS ), $ MBAVAL( MAXTESTS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NAVAL( MAXTESTS ), NBAVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCAVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_CCHEKPAD, PB_CFILLPAD, PB_CLASCAL, $ PB_CLASET, PB_DESCSET2, PB_PCLAPRNT, $ PCBLA2TSTINFO, PCBLAS2TSTCHK, PCBLAS2TSTCHKE, $ PCCHKARG2, PCCHKVOUT, PCGEMV, PCGERC, PCGERU, $ PCHEMV, PCHER, PCHER2, PCIPSET, PCLAGEN, $ PCLASCAL, PCLASET, PCMPRNT, PCTRMV, PCTRSV, $ PCVPRNT, PMDESCCHK, PMDIMCHK, PVDESCCHK, $ PVDIMCHK * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_FCEIL EXTERNAL PB_FCEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, MAX, MOD, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .TRUE., .FALSE., .FALSE., $ .TRUE., .TRUE., .FALSE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IASEED = 100 IXSEED = 200 IYSEED = 300 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PCBLA2TSTINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, $ IAVAL, JAVAL, MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, CSCYVAL, $ IYVAL, JYVAL, INCYVAL, MAXTESTS, NGRIDS, $ PVAL, MAXGRIDS, QVAL, MAXGRIDS, NBLOG, LTEST, $ SOF, TEE, IAM, IGAP, IVERB, NPROCS, THRESH, $ ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PCBLAS2TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) TRANS = TRANVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) INBA = INBAVAL( J ) MBA = MBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) INBX = INBXVAL( J ) MBX = MBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) * MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) INBY = INBYVAL( J ) MBY = MBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN TSTCNT = TSTCNT + 1 WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, UPLO, TRANS, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY * WRITE( NOUT, FMT = 9995 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANS' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) TRANS WRITE( NOUT, FMT = 9997 ) 'DIAG' TSKIP = TSKIP + 1 GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, IGAP, GAPMUL, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, IGAP, GAPMUL, $ IERR( 2 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, IGAP, GAPMUL, $ IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDA = MAX( 1, MA ) LDX = MAX( 1, MX ) LDY = MAX( 1, MY ) * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPX = IPA + DESCA( LLD_ )*NQA + IPOSTA + IPREX IPY = IPX + DESCX( LLD_ )*NQX + IPOSTX + IPREY IPMATA = IPY + DESCY( LLD_ )*NQY + IPOSTY IPMATX = IPMATA + MA*NA IPMATY = IPMATX + MX*NX IPG = IPMATY + MAX( MX*NX, MY*NY ) * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * MEMREQD = IPG + PB_FCEIL( REAL( MAX( M, N ) ) * $ REAL( REALSZ ), REAL( CPLXSZ ) ) - 1 + $ MAX( MAX( IMBA, MBA ), $ MAX( MAX( IMBX, MBX ), $ MAX( IMBY, MBY ) ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) MEMREQD*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 2 routines * DO 30 K = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) SNAMES( K ) END IF * * Define the size of the operands * IF( K.EQ.1 ) THEN NROWA = M NCOLA = N IF( LSAME( TRANS, 'N' ) ) THEN NLX = N NLY = M ELSE NLX = M NLY = N END IF ELSE IF( K.EQ.5 .OR. K.EQ.6 ) THEN NROWA = M NCOLA = N NLX = M NLY = N ELSE NROWA = N NCOLA = N NLX = N NLY = N END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLX, 'X', IX, JX, DESCX, $ INCX, IERR( 2 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLY, 'Y', IY, JY, DESCY, $ INCY, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 GO TO 30 END IF * * Generate distributed matrices A, X and Y * IF( K.EQ.2 .OR. K.EQ.7 .OR. K.EQ.8 ) THEN AFORM = 'H' DIAGDO = 'N' OFFD = IA - JA ELSE IF( ( K.EQ.4 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN AFORM = 'N' DIAGDO = 'D' OFFD = IA - JA ELSE AFORM = 'N' DIAGDO = 'N' OFFD = 0 END IF * CALL PCLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCAR, MA, NA, IMBA, INBA, MBA, NBA, $ -1, -1, ICTXT, MAX( 1, MA ) ) CALL PCLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCAR, IASEED, MEM( IPMATA ), $ DESCAR( LLD_ ) ) CALL PB_DESCSET2( DESCXR, MX, NX, IMBX, INBX, MBX, NBX, $ -1, -1, ICTXT, MAX( 1, MX ) ) CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCXR, IXSEED, MEM( IPMATX ), $ DESCXR( LLD_ ) ) IF( YCHECK( K ) ) THEN * CALL PB_DESCSET2( DESCYR, MY, NY, IMBY, INBY, MBY, $ NBY, -1, -1, ICTXT, MAX( 1, MY ) ) CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCYR, IYSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) * ELSE * * If Y is not needed, generate a copy of X instead * CALL PB_DESCSET2( DESCYR, MX, NX, IMBX, INBX, MBX, $ NBX, -1, -1, ICTXT, MAX( 1, MX ) ) CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, $ 1, 1, DESCYR, IXSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) * END IF * * Zero non referenced part of the matrices A * IF( ( K.EQ.2 .OR. K.EQ.7 .OR. K.EQ.8 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN * * The distributed matrix A is Hermitian * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of A. * CALL PCLASET( 'Upper', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA, JA+1, DESCA ) IF( K.NE.2 ) THEN CALL PB_CLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of A. * CALL PCLASET( 'Lower', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA+1, JA, DESCA ) IF( K.NE.2 ) THEN CALL PB_CLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * END IF * ELSE IF( K.EQ.3 .OR. K.EQ.4 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix A is lower triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PCLASET( 'Upper', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA, $ JA+1, DESCA ) CALL PB_CLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA-1+JA*LDA ), $ LDA ) END IF * ELSE * CALL PCLASET( 'Upper', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_CLASET( 'Upper', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( K.EQ.4 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / $ CMPLX( REAL( MAX( NROWA, NCOLA ) ) ) CALL PCLASCAL( 'Lower', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA+1, JA, $ DESCA ) CALL PB_CLASCAL( 'Lower', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix A is upper triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PCLASET( 'Lower', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA+1, $ JA, DESCA ) CALL PB_CLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * ELSE * CALL PCLASET( 'Lower', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_CLASET( 'Lower', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( K.EQ.4 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / $ CMPLX( REAL( MAX( NROWA, NCOLA ) ) ) CALL PCLASCAL( 'Upper', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA, JA+1, $ DESCA ) CALL PB_CLASCAL( 'Upper', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * END IF * END IF * END IF * * Pad the guard zones of A, X and Y * CALL PB_CFILLPAD( ICTXT, MPA, NQA, MEM( IPA-IPREA ), $ DESCA( LLD_ ), IPREA, IPOSTA, PADVAL ) * CALL PB_CFILLPAD( ICTXT, MPX, NQX, MEM( IPX-IPREX ), $ DESCX( LLD_ ), IPREX, IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_CFILLPAD( ICTXT, MPY, NQY, MEM( IPY-IPREY ), $ DESCY( LLD_ ), IPREY, IPOSTY, $ PADVAL ) END IF * * Initialize the check for INPUT-only arguments. * INFO = 0 CALL PCCHKARG2( ICTXT, NOUT, SNAMES( K ), UPLO, TRANS, $ DIAG, M, N, ALPHA, IA, JA, DESCA, IX, $ JX, DESCX, INCX, BETA, IY, JY, DESCY, $ INCY, INFO ) * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN CALL PB_PCLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_INITIAL_A', $ NOUT, MEM( IPG ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PCLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, 0, $ 0, 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPG ) ) END IF * IF( IVERB.EQ.2 ) THEN IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PCLAPRNT( 1, NLX, MEM( IPX ), IX, JX, $ DESCX, 0, 0, $ 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) ELSE CALL PB_PCLAPRNT( NLX, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, $ 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PCLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, 0, $ 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) END IF * IF( YCHECK( K ) ) THEN IF( IVERB.EQ.2 ) THEN IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PCLAPRNT( 1, NLY, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) ELSE CALL PB_PCLAPRNT( NLY, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PCLAPRNT( MY, NY, MEM( IPY ), 1, 1, DESCY, $ 0, 0, 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) END IF END IF * * Call the Level 2 PBLAS routine * INFO = 0 IF( K.EQ.1 ) THEN * * Test PCGEMV * CALL PCGEMV( TRANS, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.2 ) THEN * * Test PCHEMV * CALL PCIPSET( 'Bignum', N, MEM( IPA ), IA, JA, DESCA ) * CALL PCHEMV( UPLO, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) * CALL PCIPSET( 'Zero', N, MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.3 ) THEN * * Test PCTRMV * CALL PCTRMV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) * ELSE IF( K.EQ.4 ) THEN * * Test PCTRSV * CALL PCTRSV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) * ELSE IF( K.EQ.5 ) THEN * * Test PCGERU * CALL PCGERU( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.6 ) THEN * * Test PCGERC * CALL PCGERC( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.7 ) THEN * * Test PCHER * IF( CMPLX( REAL( ALPHA ) ).NE.ZERO ) $ CALL PCIPSET( 'Bignum', N, MEM( IPA ), IA, JA, $ DESCA ) * CALL PCHER( UPLO, N, REAL( ALPHA ), MEM( IPX ), IX, $ JX, DESCX, INCX, MEM( IPA ), IA, JA, $ DESCA ) * ELSE IF( K.EQ.8 ) THEN * * Test PCHER2 * IF( ALPHA.NE.ZERO ) $ CALL PCIPSET( 'Bignum', N, MEM( IPA ), IA, JA, $ DESCA ) * CALL PCHER2( UPLO, N, ALPHA, MEM( IPX ), IX, JX, $ DESCX, INCX, MEM( IPY ), IY, JY, DESCY, $ INCY, MEM( IPA ), IA, JA, DESCA ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9974 ) INFO GO TO 30 END IF * * Check padding * CALL PB_CCHEKPAD( ICTXT, SNAMES( K ), MPA, NQA, $ MEM( IPA-IPREA ), DESCA( LLD_ ), IPREA, $ IPOSTA, PADVAL ) * CALL PB_CCHEKPAD( ICTXT, SNAMES( K ), MPX, NQX, $ MEM( IPX-IPREX ), DESCX( LLD_ ), IPREX, $ IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_CCHEKPAD( ICTXT, SNAMES( K ), MPY, NQY, $ MEM( IPY-IPREY ), DESCY( LLD_ ), $ IPREY, IPOSTY, PADVAL ) END IF * * Check the computations * CALL PCBLAS2TSTCHK( ICTXT, NOUT, K, UPLO, TRANS, DIAG, M, $ N, ALPHA, MEM( IPMATA ), MEM( IPA ), $ IA, JA, DESCA, MEM( IPMATX ), $ MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPMATY ), MEM( IPY ), IY, $ JY, DESCY, INCY, THRESH, ROGUE, $ MEM( IPG ), INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 ) THEN IERR( 3 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 IERR( 3 ) = 1 END IF * * Check input-only scalar arguments * INFO = 1 CALL PCCHKARG2( ICTXT, NOUT, SNAMES( K ), UPLO, TRANS, $ DIAG, M, N, ALPHA, IA, JA, DESCA, IX, $ JX, DESCX, INCX, BETA, IY, JY, DESCY, $ INCY, INFO ) * * Check input-only array arguments * CALL PCCHKMOUT( NROWA, NCOLA, MEM( IPMATA ), MEM( IPA ), $ IA, JA, DESCA, IERR( 4 ) ) CALL PCCHKVOUT( NLX, MEM( IPMATX ), MEM( IPX ), IX, JX, $ DESCX, INCX, IERR( 5 ) ) * IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_A', $ SNAMES( K ) END IF * IF( IERR( 5 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_X', $ SNAMES( K ) END IF * IF( YCHECK( K ) ) THEN CALL PCCHKVOUT( NLY, MEM( IPMATY ), MEM( IPY ), IY, $ JY, DESCY, INCY, IERR( 6 ) ) IF( IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_Y', $ SNAMES( K ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE.0 .OR. IERR( 5 ).NE.0 .OR. $ IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) SNAMES( K ) KFAIL( K ) = KFAIL( K ) + 1 ERRFLG = .TRUE. ELSE IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) SNAMES( K ) KPASS( K ) = KPASS( K ) + 1 END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PCMPRNT( ICTXT, NOUT, MA, NA, MEM( IPMATA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PCLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( ( NROWA.GT.0 ).AND.( NCOLA.GT.0 ) ) $ CALL PCMPRNT( ICTXT, NOUT, NROWA, NCOLA, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PCLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_A', $ NOUT, MEM( IPMATA ) ) END IF IF( IERR( 5 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PCMPRNT( ICTXT, NOUT, MX, NX, MEM( IPMATX ), $ LDX, 0, 0, 'SERIAL_X' ) CALL PB_PCLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, $ 0, 0, 'PARALLEL_X', NOUT, $ MEM( IPMATX ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( NLX.GT.0 ) $ CALL PCVPRNT( ICTXT, NOUT, NLX, $ MEM( IPMATX+IX-1+(JX-1)*LDX ), $ INCX, 0, 0, 'SERIAL_X' ) IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PCLAPRNT( 1, NLX, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PCLAPRNT( NLX, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) END IF END IF IF( YCHECK( K ) ) THEN IF( IERR( 6 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PCMPRNT( ICTXT, NOUT, MY, NY, $ MEM( IPMATY ), LDY, 0, 0, $ 'SERIAL_Y' ) CALL PB_PCLAPRNT( MY, NY, MEM( IPY ), 1, 1, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE IF( IERR( 3 ).NE.0 ) THEN IF( NLY.GT.0 ) $ CALL PCVPRNT( ICTXT, NOUT, NLY, $ MEM( IPMATY+IY-1+(JY-1)*LDY ), $ INCY, 0, 0, 'SERIAL_Y' ) IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PCLAPRNT( 1, NLY, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PCLAPRNT( NLY, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) END IF END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9981 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9980 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' M N UPLO TRANS DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,9X,A1,11X,A1,10X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9989 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5,1X,I6 ) 9988 FORMAT( 2X, ' IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9987 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9986 FORMAT( 'Not enough memory. Need: ', I12 ) 9985 FORMAT( 2X, ' Tested Subroutine: ', A ) 9984 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9983 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9982 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9981 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9980 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9979 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9978 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9977 FORMAT( 2X, 'Testing Summary') 9976 FORMAT( 2X, 'End of Tests.' ) 9975 FORMAT( 2X, 'Tests started.' ) 9974 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PCBLA2TST * END SUBROUTINE PCBLA2TSTINFO( SUMMRY, NOUT, NMAT, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, $ IMBAVAL, MBAVAL, INBAVAL, NBAVAL, $ RSCAVAL, CSCAVAL, IAVAL, JAVAL, $ MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, $ IXVAL, JXVAL, INCXVAL, MYVAL, NYVAL, $ IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, $ INCYVAL, LDVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, NBLOG, LTEST, SOF, TEE, $ IAM, IGAP, IVERB, NPROCS, THRESH, ALPHA, $ BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, NBLOG, $ NGRIDS, NMAT, NOUT, NPROCS REAL THRESH COMPLEX ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), TRANVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCXVAL( LDVAL ), $ CSCYVAL( LDVAL ), IAVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBXVAL( LDVAL ), $ IMBYVAL( LDVAL ), INBAVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JAVAL( LDVAL ), $ JXVAL( LDVAL ), JYVAL( LDVAL ), MAVAL( LDVAL ), $ MBAVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NAVAL( LDVAL ), $ NBAVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCXVAL( LDVAL ), $ RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PCBLA2TSTINFO get the needed startup information for testing various * Level 2 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * TRANVAL (global output) CHARACTER array * On entry, TRANVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANS to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, TRANS, UPLO, M, N, DESCA(:), * IA, JA, DESCX(:), IX, JX, INCX, DESCY(:), IY, JY and INCY. * This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 2 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * THRESH (global output) REAL * On exit, THRESH specifies the threshhold value for the test * ratio. * * ALPHA (global output) COMPLEX * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) COMPLEX * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+37*NMAT+NSUBS+4 ) with NSUBS equal to 8. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J REAL EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, CGEBR2D, CGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D *ype real dble cplx zplx * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PCBLAS2TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Read the threshold value for test ratio * READ( NIN, FMT = * ) THRESH IF( THRESH.LT.0.0 ) $ THRESH = 16.0 * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRANVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( TRANVAL( J ) ) WORK( I+2 ) = ICHAR( UPLOVAL( J ) ) I = I + 3 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 2 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex single precision '// $ 'Level 2 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9979 ) NBLOG WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9980 ) THRESH WRITE( NOUT, FMT = 9982 ) ALPHA WRITE( NOUT, FMT = 9981 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 37*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) TRANVAL( J ) = CHAR( WORK( I+1 ) ) UPLOVAL( J ) = CHAR( WORK( I+2 ) ) I = I + 3 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9981 FORMAT( 2X, 'Beta : (', G16.6, $ ',', G16.6, ')' ) 9980 FORMAT( 2X, 'Threshold value : ', G16.6 ) 9979 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PCBLA2TSTINFO * END SUBROUTINE PCBLAS2TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PCBLAS2TSTCHKE tests the error exits of the Level 2 PBLAS. * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 8 (NSUBS). * If LTEST( 1 ) is .TRUE., PCGEMV will be tested; * If LTEST( 2 ) is .TRUE., PCHEMV will be tested; * If LTEST( 3 ) is .TRUE., PCTRMV will be tested; * If LTEST( 4 ) is .TRUE., PCTRSV will be tested; * If LTEST( 5 ) is .TRUE., PCGERU will be tested; * If LTEST( 6 ) is .TRUE., PCGERC will be tested; * If LTEST( 7 ) is .TRUE., PCHER will be tested; * If LTEST( 8 ) is .TRUE., PCHER2 will be tested; * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PCDIMEE, PCGEMV, PCGERC, $ PCGERU, PCHEMV, PCHER, PCHER2, PCMATEE, $ PCOPTEE, PCTRMV, PCTRSV, PCVECEE * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/21, 22, 23, 23, 24, 24, 26, 27/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PCGEMV * I = 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCGEMV, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCGEMV, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCGEMV, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCGEMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PCHEMV * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCHEMV, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCHEMV, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCHEMV, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCHEMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PCTRMV * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCTRMV, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCTRMV, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCTRMV, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCTRMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PCTRSV * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCTRSV, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCTRSV, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCTRSV, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCTRSV, SCODE( I ), SNAMES( I ) ) END IF * * Test PCGERU * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCGERU, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCGERU, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCGERU, SCODE( I ), SNAMES( I ) ) END IF * * Test PCGERC * I = I + 1 IF( LTEST( I ) ) THEN CALL PCDIMEE( ICTXT, NOUT, PCGERC, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCGERC, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCGERC, SCODE( I ), SNAMES( I ) ) END IF * * Test PCHER * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCHER, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCHER, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCHER, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCHER, SCODE( I ), SNAMES( I ) ) END IF * * Test PCHER2 * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCHER2, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCHER2, SCODE( I ), SNAMES( I ) ) CALL PCVECEE( ICTXT, NOUT, PCHER2, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCHER2, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PCBLAS2TSTCHKE * END SUBROUTINE PCCHKARG2( ICTXT, NOUT, SNAME, UPLO, TRANS, DIAG, M, $ N, ALPHA, IA, JA, DESCA, IX, JX, DESCX, $ INCX, BETA, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N, NOUT COMPLEX ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*(*) SNAME INTEGER DESCA( * ), DESCX( * ), DESCY( * ) * .. * * Purpose * ======= * * PCCHKARG2 checks the input-only arguments of the Level 2 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the UPLO option in the Level 2 PBLAS * operation. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the TRANS option in the Level 2 * PBLAS operation. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies the DIAG option in the Level 2 PBLAS * operation. * * M (global input) INTEGER * On entry, M specifies the dimension of the submatrix ope- * rands. * * N (global input) INTEGER * On entry, N specifies the dimension of the submatrix ope- * rands. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 DIAGREF, TRANSREF, UPLOREF INTEGER I, IAREF, INCXREF, INCYREF, IXREF, IYREF, $ JAREF, JXREF, JYREF, MREF, MYCOL, MYROW, NPCOL, $ NPROW, NREF COMPLEX ALPHAREF, BETAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCAREF( DLEN_ ), DESCXREF( DLEN_ ), $ DESCYREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * DIAGREF = DIAG TRANSREF = TRANS UPLOREF = UPLO MREF = M NREF = N ALPHAREF = ALPHA IAREF = IA JAREF = JA DO 10 I = 1, DLEN_ DESCAREF( I ) = DESCA( I ) 10 CONTINUE IXREF = IX JXREF = JX DO 20 I = 1, DLEN_ DESCXREF( I ) = DESCX( I ) 20 CONTINUE INCXREF = INCX BETAREF = BETA IYREF = IY JYREF = JY DO 30 I = 1, DLEN_ DESCYREF( I ) = DESCY( I ) 30 CONTINUE INCYREF = INCY * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( .NOT. LSAME( DIAG, DIAGREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DIAG' ELSE IF( .NOT. LSAME( TRANS, TRANSREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANS' ELSE IF( .NOT. LSAME( UPLO, UPLOREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'UPLO' ELSE IF( M.NE.MREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'M' ELSE IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE IF( IA.NE.IAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IA' ELSE IF( JA.NE.JAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JA' ELSE IF( DESCA( DTYPE_ ).NE.DESCAREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( DTYPE_ )' ELSE IF( DESCA( M_ ).NE.DESCAREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( M_ )' ELSE IF( DESCA( N_ ).NE.DESCAREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( N_ )' ELSE IF( DESCA( IMB_ ).NE.DESCAREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( IMB_ )' ELSE IF( DESCA( INB_ ).NE.DESCAREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( INB_ )' ELSE IF( DESCA( MB_ ).NE.DESCAREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( MB_ )' ELSE IF( DESCA( NB_ ).NE.DESCAREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( NB_ )' ELSE IF( DESCA( RSRC_ ).NE.DESCAREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( RSRC_ )' ELSE IF( DESCA( CSRC_ ).NE.DESCAREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CSRC_ )' ELSE IF( DESCA( CTXT_ ).NE.DESCAREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CTXT_ )' ELSE IF( DESCA( LLD_ ).NE.DESCAREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( LLD_ )' ELSE IF( IX.NE.IXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IX' ELSE IF( JX.NE.JXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JX' ELSE IF( DESCX( DTYPE_ ).NE.DESCXREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( DTYPE_ )' ELSE IF( DESCX( M_ ).NE.DESCXREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( M_ )' ELSE IF( DESCX( N_ ).NE.DESCXREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( N_ )' ELSE IF( DESCX( IMB_ ).NE.DESCXREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( IMB_ )' ELSE IF( DESCX( INB_ ).NE.DESCXREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( INB_ )' ELSE IF( DESCX( MB_ ).NE.DESCXREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( MB_ )' ELSE IF( DESCX( NB_ ).NE.DESCXREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( NB_ )' ELSE IF( DESCX( RSRC_ ).NE.DESCXREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( RSRC_ )' ELSE IF( DESCX( CSRC_ ).NE.DESCXREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CSRC_ )' ELSE IF( DESCX( CTXT_ ).NE.DESCXREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CTXT_ )' ELSE IF( DESCX( LLD_ ).NE.DESCXREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( LLD_ )' ELSE IF( INCX.NE.INCXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCX' ELSE IF( BETA.NE.BETAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'BETA' ELSE IF( IY.NE.IYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IY' ELSE IF( JY.NE.JYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JY' ELSE IF( DESCY( DTYPE_ ).NE.DESCYREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( DTYPE_ )' ELSE IF( DESCY( M_ ).NE.DESCYREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( M_ )' ELSE IF( DESCY( N_ ).NE.DESCYREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( N_ )' ELSE IF( DESCY( IMB_ ).NE.DESCYREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( IMB_ )' ELSE IF( DESCY( INB_ ).NE.DESCYREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( INB_ )' ELSE IF( DESCY( MB_ ).NE.DESCYREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( MB_ )' ELSE IF( DESCY( NB_ ).NE.DESCYREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( NB_ )' ELSE IF( DESCY( RSRC_ ).NE.DESCYREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( RSRC_ )' ELSE IF( DESCY( CSRC_ ).NE.DESCYREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CSRC_ )' ELSE IF( DESCY( CTXT_ ).NE.DESCYREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CTXT_ )' ELSE IF( DESCY( LLD_ ).NE.DESCYREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( LLD_ )' ELSE IF( INCY.NE.INCYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCY' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.NE.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PCCHKARG2 * END SUBROUTINE PCBLAS2TSTCHK( ICTXT, NOUT, NROUT, UPLO, TRANS, DIAG, $ M, N, ALPHA, A, PA, IA, JA, DESCA, X, $ PX, IX, JX, DESCX, INCX, BETA, Y, PY, $ IY, JY, DESCY, INCY, THRESH, ROGUE, $ WORK, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N, NOUT, NROUT REAL THRESH COMPLEX ALPHA, BETA, ROGUE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL WORK( * ) COMPLEX A( * ), PA( * ), PX( * ), PY( * ), X( * ), $ Y( * ) * .. * * Purpose * ======= * * PCBLAS2TSTCHK performs the computational tests of the Level 2 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PCGEMV will be tested; * else if NROUT = 2, PCHEMV will be tested; * else if NROUT = 3, PCTRMV will be tested; * else if NROUT = 4, PCTRSV will be tested; * else if NROUT = 5, PCGERU will be tested; * else if NROUT = 6, PCGERC will be tested; * else if NROUT = 7, PCHER will be tested; * else if NROUT = 8, PCHER2 will be tested; * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies if the upper or lower part of the * matrix operand is to be referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies if the matrix operand A is to be * transposed. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the triangular matrix operand is * unit or non-unit. * * M (global input) INTEGER * On entry, M specifies the number of rows of A. * * N (global input) INTEGER * On entry, N specifies the number of columns of A. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) COMPLEX array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) COMPLEX array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * Y (local input/local output) COMPLEX array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) COMPLEX array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * THRESH (global input) REAL * On entry, THRESH is the threshold value for the test ratio. * * ROGUE (global input) COMPLEX * On entry, ROGUE specifies the constant used to pad the * non-referenced part of triangular, symmetric or Hermitian ma- * trices. * * WORK (workspace) REAL array * On entry, WORK is an array of dimension LWORK where LWORK is * at least MAX( M, N ). This array is used to store the compu- * ted gauges (see PCMVCH). * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on A has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on X has been found, * if( MOD( INFO/4, 2 ) = 1 ) then an error on Y has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL RZERO PARAMETER ( RZERO = 0.0E+0 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW REAL ERR COMPLEX ALPHA1 * .. * .. Local Arrays .. INTEGER IERR( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CTRSV, PB_CLASET, PCCHKMIN, $ PCCHKVIN, PCMVCH, PCTRMV, PCVMCH, PCVMCH2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MIN, REAL * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 10 I = 1, 3 IERR( I ) = 0 10 CONTINUE * IF( NROUT.EQ.1 ) THEN * * Test PCGEMV * * Check the resulting vector Y * CALL PCMVCH( ICTXT, TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, $ IX, JX, DESCX, INCX, BETA, Y, PY, IY, JY, DESCY, $ INCY, WORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PCCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) IF( LSAME( TRANS, 'N' ) ) THEN CALL PCCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 2 ) ) ELSE CALL PCCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, $ IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.2 ) THEN * * Test PCHEMV * * Check the resulting vector Y * CALL PCMVCH( ICTXT, 'No transpose', N, N, ALPHA, A, IA, JA, $ DESCA, X, IX, JX, DESCX, INCX, BETA, Y, PY, IY, $ JY, DESCY, INCY, WORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN CALL PB_CLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF CALL PCCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PCCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) * ELSE IF( NROUT.EQ.3 ) THEN * * Test PCTRMV * * Check the resulting vector X * CALL PCMVCH( ICTXT, TRANS, N, N, ONE, A, IA, JA, DESCA, Y, IX, $ JX, DESCX, INCX, ZERO, X, PX, IX, JX, DESCX, INCX, $ WORK, ERR, IERR( 2 ) ) * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF END IF CALL PCCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) * ELSE IF( NROUT.EQ.4 ) THEN * * Test PCTRSV * * Check the resulting vector X * CALL CTRSV( UPLO, TRANS, DIAG, N, A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ), X( IX+(JX-1)*DESCX( M_ ) ), INCX ) CALL PCTRMV( UPLO, TRANS, DIAG, N, PA, IA, JA, DESCA, PX, IX, $ JX, DESCX, INCX ) CALL PCMVCH( ICTXT, TRANS, N, N, ONE, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX, ZERO, Y, PX, IX, JX, DESCX, INCX, $ WORK, ERR, IERR( 2 ) ) * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF END IF CALL PCCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) * ELSE IF( NROUT.EQ.5 ) THEN * * Test PCGERU * * Check the resulting matrix A * CALL PCVMCH( ICTXT, 'No transpose', 'Ge', M, N, ALPHA, X, IX, $ JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, PA, $ IA, JA, DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PCCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PCCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * ELSE IF( NROUT.EQ.6 ) THEN * * Test PCGERC * * Check the resulting matrix A * CALL PCVMCH( ICTXT, 'Conjugate transpose', 'Ge', M, N, ALPHA, $ X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, $ A, PA, IA, JA, DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PCCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PCCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * ELSE IF( NROUT.EQ.7 ) THEN * * Test PCHER * * Check the resulting matrix A * ALPHA1 = CMPLX( REAL( ALPHA ), RZERO ) CALL PCVMCH( ICTXT, 'Conjugate transpose', UPLO, N, N, ALPHA1, $ X, IX, JX, DESCX, INCX, X, IX, JX, DESCX, INCX, A, $ PA, IA, JA, DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PCCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) * ELSE IF( NROUT.EQ.8 ) THEN * * Test PCHER2 * * Check the resulting matrix A * CALL PCVMCH2( ICTXT, UPLO, N, N, ALPHA, X, IX, JX, DESCX, INCX, $ Y, IY, JY, DESCY, INCY, A, PA, IA, JA, DESCA, $ WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PCCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PCCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * END IF * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'A' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'X' END IF * IF( IERR( 3 ).NE.0 ) THEN INFO = INFO + 4 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'Y' END IF * 9999 FORMAT( 2X, ' ***** ERROR: Matrix operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** ERROR: Vector operand ', A, $ ' is incorrect.' ) 9997 FORMAT( 2X, ' ***** FATAL ERROR - Computed result is less ', $ 'than half accurate *****' ) 9996 FORMAT( 2X, ' ***** Test completed with maximum test ratio: ', $ F11.5, ' SUSPECT *****' ) * RETURN * * End of PCBLAS2TSTCHK * END scalapack-2.0.2/PBLAS/TESTING/PCBLAS3TST.dat000644 000766 000024 00000005157 10363532303 020011 0ustar00juliestaff000000 000000 'Level 3 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PCBLAS3TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 16.0 threshold value of test ratio 10 value of the logical computational blocksize NB 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q (2.0E0, -4.0E0) value of ALPHA (3.0E0, -2.0E0) value of BETA 4 number of tests problems 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' values of DIAG 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' values of SIDE 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANSA 'N' 'N' 'T' 'T' 'N' 'N' 'T' 'T' values of TRANSB 'U' 'L' 'U' 'L' 'U' 'L' 'U' 'L' values of UPLO 22 30 8 20 3 5 11 8 values of M 23 29 7 21 3 14 11 8 values of N 24 28 9 22 3 9 11 8 values of K 31 31 31 31 31 31 31 31 values of M_A 31 31 31 31 31 31 31 31 values of N_A 4 5 6 3 2 1 4 5 values of IMB_A 4 5 6 3 2 1 4 5 values of INB_A 4 5 6 3 2 1 4 5 values of MB_A 4 5 6 3 2 1 4 5 values of NB_A 0 0 0 0 0 0 0 0 values of RSRC_A 0 0 0 0 0 0 0 0 values of CSRC_A 5 1 7 7 1 12 1 11 values of IA 5 1 7 7 1 12 1 11 values of JA 32 32 32 32 32 32 32 32 values of M_B 32 32 32 32 32 32 32 32 values of N_B 4 5 6 3 2 1 4 5 values of IMB_B 4 5 6 3 2 1 4 5 values of INB_B 4 5 6 3 2 1 4 5 values of MB_B 4 5 6 3 2 1 4 5 values of NB_B 0 0 0 0 0 0 0 0 values of RSRC_B 0 0 0 0 0 0 0 0 values of CSRC_B 5 1 7 7 1 12 1 11 values of IB 5 1 7 7 1 12 1 11 values of JB 35 35 35 35 35 35 35 35 values of M_C 35 35 35 35 35 35 35 35 values of N_C 4 5 6 3 2 1 4 5 values of IMB_C 4 5 6 3 2 1 4 5 values of INB_C 4 5 6 3 2 1 4 5 values of MB_C 4 5 6 3 2 1 4 5 values of NB_C 0 0 0 0 0 0 0 0 values of RSRC_C 0 0 0 0 0 0 0 0 values of CSRC_C 5 1 7 7 1 12 1 11 values of IC 5 1 7 7 1 12 1 11 values of JC PCGEMM T put F for no test in the same column PCSYMM T put F for no test in the same column PCHEMM T put F for no test in the same column PCSYRK T put F for no test in the same column PCHERK T put F for no test in the same column PCSYR2K T put F for no test in the same column PCHER2K T put F for no test in the same column PCTRMM T put F for no test in the same column PCTRSM T put F for no test in the same column PCGEADD T put F for no test in the same column PCTRADD T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/pcblas3tst.f000644 000766 000024 00000421171 11750130340 020120 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 11) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PCGEMM ', 'PCSYMM ', 'PCHEMM ', $ 'PCSYRK ', 'PCHERK ', 'PCSYR2K', $ 'PCHER2K', 'PCTRMM ', 'PCTRSM ', $ 'PCGEADD', 'PCTRADD'/ END BLOCK DATA PROGRAM PCBLA3TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PCBLA3TST is the main testing program for the Level 3 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * * from the following 64 lines: * 'Level 3 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PCBLAS3TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 16.0 threshold value of test ratio * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0E0, 0.0E0) value of ALPHA * (1.0E0, 0.0E0) value of BETA * 2 number of tests problems * 'N' 'U' values of DIAG * 'L' 'R' values of SIDE * 'N' 'T' values of TRANSA * 'N' 'T' values of TRANSB * 'U' 'L' values of UPLO * 3 4 values of M * 3 4 values of N * 3 4 values of K * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_B * 6 10 values of N_B * 2 5 values of IMB_B * 2 5 values of INB_B * 2 5 values of MB_B * 2 5 values of NB_B * 0 1 values of RSRC_B * 0 0 values of CSRC_B * 1 1 values of IB * 1 1 values of JB * 6 10 values of M_C * 6 10 values of N_C * 2 5 values of IMB_C * 2 5 values of INB_C * 2 5 values of MB_C * 2 5 values of NB_C * 0 1 values of RSRC_C * 0 0 values of CSRC_C * 1 1 values of IC * 1 1 values of JC * PCGEMM T put F for no test in the same column * PCSYMM T put F for no test in the same column * PCHEMM T put F for no test in the same column * PCSYRK T put F for no test in the same column * PCHERK T put F for no test in the same column * PCSYR2K T put F for no test in the same column * PCHER2K T put F for no test in the same column * PCTRMM T put F for no test in the same column * PCTRSM T put F for no test in the same column * PCGEADD T put F for no test in the same column * PCTRADD T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * CPLXSZ INTEGER * REALSZ and CPLXSZ indicate the length in bytes on the given * platform for a single precision real and a single precision * complex. By default, REALSZ is set to four and CPLXSZ is set * to eight. * * MEM COMPLEX array * MEM is an array of dimension TOTMEM / CPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, CPLXSZ, TOTMEM, $ MEMSIZ, NSUBS, REALSZ COMPLEX ONE, PADVAL, ZERO, ROGUE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ CPLXSZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / CPLXSZ, REALSZ = 4, $ ONE = ( 1.0E+0, 0.0E+0 ), $ PADVAL = ( -9923.0E+0, -9923.0E+0 ), $ ROGUE = ( -1.0E+10, 1.0E+10 ), $ ZERO = ( 0.0E+0, 0.0E+0 ), NSUBS = 11 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE CHARACTER*1 ADIAGDO, AFORM, CFORM, DIAG, SIDE, TRANSA, $ TRANSB, UPLO INTEGER CSRCA, CSRCB, CSRCC, I, IA, IAM, IASEED, IB, $ IBSEED, IC, ICSEED, ICTXT, IGAP, IMBA, IMBB, $ IMBC, IMIDA, IMIDB, IMIDC, INBA, INBB, INBC, $ IPA, IPB, IPC, IPG, IPMATA, IPMATB, IPMATC, $ IPOSTA, IPOSTB, IPOSTC, IPREA, IPREB, IPREC, $ IPW, IVERB, J, JA, JB, JC, K, L, LDA, LDB, LDC, $ M, MA, MB, MBA, MBB, MBC, MC, MEMREQD, MPA, $ MPB, MPC, MYCOL, MYROW, N, NA, NB, NBA, NBB, $ NBC, NC, NCOLA, NCOLB, NCOLC, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQA, NQB, NQC, NROWA, $ NROWB, NROWC, NTESTS, OFFDA, OFFDC, RSRCA, $ RSRCB, RSRCC, TSKIP, TSTCNT REAL THRESH COMPLEX ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL BCHECK( NSUBS ), CCHECK( NSUBS ), $ LTEST( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), SIDEVAL( MAXTESTS ), $ TRNAVAL( MAXTESTS ), TRNBVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCBVAL( MAXTESTS ), $ CSCCVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCAR( DLEN_ ), DESCB( DLEN_ ), $ DESCBR( DLEN_ ), DESCC( DLEN_ ), $ DESCCR( DLEN_ ), IAVAL( MAXTESTS ), $ IBVAL( MAXTESTS ), ICVAL( MAXTESTS ), $ IERR( 6 ), IMBAVAL( MAXTESTS ), $ IMBBVAL( MAXTESTS ), IMBCVAL( MAXTESTS ), $ INBAVAL( MAXTESTS ), INBBVAL( MAXTESTS ), $ INBCVAL( MAXTESTS ), JAVAL( MAXTESTS ), $ JBVAL( MAXTESTS ), JCVAL( MAXTESTS ) INTEGER KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), KVAL( MAXTESTS ), $ MAVAL( MAXTESTS ), MBAVAL( MAXTESTS ), $ MBBVAL( MAXTESTS ), MBCVAL( MAXTESTS ), $ MBVAL( MAXTESTS ), MCVAL( MAXTESTS ), $ MVAL( MAXTESTS ), NAVAL( MAXTESTS ), $ NBAVAL( MAXTESTS ), NBBVAL( MAXTESTS ), $ NBCVAL( MAXTESTS ), NBVAL( MAXTESTS ), $ NCVAL( MAXTESTS ), NVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCAVAL( MAXTESTS ), RSCBVAL( MAXTESTS ), $ RSCCVAL( MAXTESTS ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_CCHEKPAD, PB_CFILLPAD, PB_CLASCAL, $ PB_CLASET, PB_DESCSET2, PB_PCLAPRNT, $ PCBLA3TSTINFO, PCBLAS3TSTCHK, PCBLAS3TSTCHKE, $ PCCHKARG3, PCCHKMOUT, PCGEADD, PCGEMM, PCHEMM, $ PCHER2K, PCHERK, PCIPSET, PCLAGEN, PCLASCAL, $ PCLASET, PCMPRNT, PCSYMM, PCSYR2K, PCSYRK, $ PCTRADD, PCTRMM, PCTRSM, PMDESCCHK, PMDIMCHK * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_FCEIL EXTERNAL PB_FCEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, MAX, MOD, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA BCHECK/.TRUE., .TRUE., .TRUE., .FALSE., $ .FALSE., .TRUE., .TRUE., .TRUE., .TRUE., $ .FALSE., .FALSE./ DATA CCHECK/.TRUE., .TRUE., .TRUE., .TRUE., .TRUE., $ .TRUE., .TRUE., .FALSE., .FALSE., .TRUE., $ .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, * so that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IASEED = 100 IBSEED = 200 ICSEED = 300 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PCBLA3TSTINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, NVAL, $ KVAL, MAVAL, NAVAL, IMBAVAL, MBAVAL, $ INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, IAVAL, $ JAVAL, MBVAL, NBVAL, IMBBVAL, MBBVAL, $ INBBVAL, NBBVAL, RSCBVAL, CSCBVAL, IBVAL, $ JBVAL, MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, ICVAL, $ JCVAL, MAXTESTS, NGRIDS, PVAL, MAXGRIDS, $ QVAL, MAXGRIDS, NBLOG, LTEST, SOF, TEE, IAM, $ IGAP, IVERB, NPROCS, THRESH, ALPHA, BETA, $ MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PCBLAS3TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) SIDE = SIDEVAL( J ) TRANSA = TRNAVAL( J ) TRANSB = TRNBVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) K = KVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MB = MBVAL( J ) NB = NBVAL( J ) IMBB = IMBBVAL( J ) MBB = MBBVAL( J ) INBB = INBBVAL( J ) NBB = NBBVAL( J ) RSRCB = RSCBVAL( J ) CSRCB = CSCBVAL( J ) IB = IBVAL( J ) JB = JBVAL( J ) * MC = MCVAL( J ) NC = NCVAL( J ) IMBC = IMBCVAL( J ) MBC = MBCVAL( J ) INBC = INBCVAL( J ) NBC = NBCVAL( J ) RSRCC = RSCCVAL( J ) CSRCC = CSCCVAL( J ) IC = ICVAL( J ) JC = JCVAL( J ) * IF( IAM.EQ.0 ) THEN * TSTCNT = TSTCNT + 1 * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, K, SIDE, UPLO, TRANSA, $ TRANSB, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IB, JB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IC, JC, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC * WRITE( NOUT, FMT = 9995 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( SIDE, 'L' ).AND. $ .NOT.LSAME( SIDE, 'R' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'SIDE' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSA' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANSB, 'N' ).AND. $ .NOT.LSAME( TRANSB, 'T' ).AND. $ .NOT.LSAME( TRANSB, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSB' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'DIAG' TSKIP = TSKIP + 1 GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, IGAP, GAPMUL, IERR( 1 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'B', DESCB, $ BLOCK_CYCLIC_2D_INB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB, MPB, NQB, IPREB, $ IMIDB, IPOSTB, IGAP, GAPMUL, IERR( 2 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'C', DESCC, $ BLOCK_CYCLIC_2D_INB, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC, MPC, NQC, IPREC, $ IMIDC, IPOSTC, IGAP, GAPMUL, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDA = MAX( 1, MA ) LDB = MAX( 1, MB ) LDC = MAX( 1, MC ) * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPB = IPA + DESCA( LLD_ )*NQA + IPOSTA + IPREB IPC = IPB + DESCB( LLD_ )*NQB + IPOSTB + IPREC IPMATA = IPC + DESCC( LLD_ )*NQC + IPOSTC IPMATB = IPMATA + MA*NA IPMATC = IPMATB + MB*NB IPG = IPMATC + MAX( MB*NB, MC*NC ) * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * IPW = IPG + MAX( MAX( MAX( IMBA, MBA ), $ MAX( IMBB, MBB ) ), $ MAX( IMBC, MBC ) ) + MAX( M, MAX( N, K ) ) MEMREQD = IPW + PB_FCEIL( REAL( MAX( M, MAX( N, K ) ) ) * $ REAL( REALSZ ), REAL( CPLXSZ ) ) - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) MEMREQD*CPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 3 routines * DO 30 L = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( L ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9986 ) SNAMES( L ) END IF * * Define the size of the operands * IF( L.EQ.1 ) THEN * * PCGEMM * NROWC = M NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF( LSAME( TRANSB, 'N' ) ) THEN NROWB = K NCOLB = N ELSE NROWB = N NCOLB = K END IF * ELSE IF( L.EQ.2 .OR. L.EQ.3 ) THEN * * PCSYMM, PCHEMM * NROWC = M NCOLC = N NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF * ELSE IF( L.EQ.4 .OR. L.EQ.5 ) THEN * * PCSYRK, PCHERK * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K ELSE NROWA = K NCOLA = N END IF NROWB = 0 NCOLB = 0 * ELSE IF( L.EQ.6 .OR. L.EQ.7 ) THEN * * PCSYR2K, PCHER2K * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K NROWB = N NCOLB = K ELSE NROWA = K NCOLA = N NROWB = K NCOLB = N END IF * ELSE IF( L.EQ.8 .OR. L.EQ.9 ) THEN NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF NROWC = 0 NCOLC = 0 * ELSE IF( L.EQ.10 .OR. L.EQ.11 ) THEN * * PCGEADD, PCTRADD * IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = N ELSE NROWA = N NCOLA = M END IF NROWC = M NCOLC = N NROWB = 0 NCOLB = 0 * END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWB, NCOLB, 'B', IB, JB, $ DESCB, IERR( 2 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWC, NCOLC, 'C', IC, JC, $ DESCC, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN KSKIP( L ) = KSKIP( L ) + 1 GO TO 30 END IF * * Check special values of TRANSA for symmetric and * hermitian rank-k and rank-2k updates. * IF( L.EQ.4 .OR. L.EQ.6 ) THEN IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9975 ) 'TRANSA' KSKIP( L ) = KSKIP( L ) + 1 GO TO 30 END IF ELSE IF( L.EQ.5 .OR. L.EQ.7 ) THEN IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9975 ) 'TRANSA' KSKIP( L ) = KSKIP( L ) + 1 GO TO 30 END IF END IF * * Generate distributed matrices A, B and C * IF( L.EQ.2 ) THEN * * PCSYMM * AFORM = 'S' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.3 ) THEN * * PCHEMM * AFORM = 'H' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.4 .OR. L.EQ.6 ) THEN * * PCSYRK, PCSYR2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'S' OFFDC = IC - JC * ELSE IF( L.EQ.5 .OR. L.EQ.7 ) THEN * * PCHERK, PCHER2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'H' OFFDC = IC - JC * ELSE IF( ( L.EQ.9 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN * * PCTRSM * AFORM = 'N' ADIAGDO = 'D' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE * * Default values * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'N' OFFDC = 0 * END IF * CALL PCLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) * IF( BCHECK( L ) ) $ CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCB, IBSEED, MEM( IPB ), $ DESCB( LLD_ ) ) * IF( CCHECK( L ) ) $ CALL PCLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCC, ICSEED, MEM( IPC ), $ DESCC( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCAR, MA, NA, IMBA, INBA, MBA, NBA, $ -1, -1, ICTXT, MAX( 1, MA ) ) CALL PCLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCAR, IASEED, MEM( IPMATA ), $ DESCAR( LLD_ ) ) * IF( BCHECK( L ) ) THEN CALL PB_DESCSET2( DESCBR, MB, NB, IMBB, INBB, MBB, $ NBB, -1, -1, ICTXT, MAX( 1, MB ) ) CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCBR, IBSEED, MEM( IPMATB ), $ DESCBR( LLD_ ) ) END IF * IF( CCHECK( L ) ) THEN * CALL PB_DESCSET2( DESCCR, MC, NC, IMBC, INBC, MBC, $ NBC, -1, -1, ICTXT, MAX( 1, MC ) ) CALL PCLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCCR, ICSEED, MEM( IPMATC ), $ DESCCR( LLD_ ) ) * ELSE * * If C is not needed, generate a copy of B instead * CALL PB_DESCSET2( DESCCR, MB, NB, IMBB, INBB, MBB, $ NBB, -1, -1, ICTXT, MAX( 1, MB ) ) CALL PCLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCCR, IBSEED, MEM( IPMATC ), $ DESCCR( LLD_ ) ) * END IF * * Zero non referenced part of the matrices A, B, C * IF( ( ( L.EQ.2 ).OR. ( L.EQ.3 ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN * * The distributed matrix A is symmetric or Hermitian * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of A. * CALL PCLASET( 'Upper', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA, JA+1, DESCA ) * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of A. * CALL PCLASET( 'Lower', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA+1, JA, DESCA ) * END IF * ELSE IF( ( ( L.EQ.4 ).OR.( L.EQ.5 ).OR.( L.EQ.6 ).OR. $ ( L.EQ.7 ) ).AND. $ ( MAX( NROWC, NCOLC ).GT.1 ) ) THEN * * The distributed matrix C is symmetric or Hermitian * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of C. * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PCLASET( 'Upper', NROWC-1, NCOLC-1, ROGUE, $ ROGUE, MEM( IPC ), IC, JC+1, $ DESCC ) CALL PB_CLASET( 'Upper', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC-1+JC*LDC ), LDC ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of C. * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PCLASET( 'Lower', NROWC-1, NCOLC-1, ROGUE, $ ROGUE, MEM( IPC ), IC+1, JC, $ DESCC ) CALL PB_CLASET( 'Lower', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC+(JC-1)*LDC ), $ LDC ) END IF * END IF * ELSE IF( L.EQ.8 .OR. L.EQ.9 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix A is lower triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PCLASET( 'Upper', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA, $ JA+1, DESCA ) CALL PB_CLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA-1+JA*LDA ), $ LDA ) END IF * ELSE * CALL PCLASET( 'Upper', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_CLASET( 'Upper', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( L.EQ.9 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / $ CMPLX( REAL( MAX( NROWA, NCOLA ) ) ) CALL PCLASCAL( 'Lower', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA+1, JA, $ DESCA ) CALL PB_CLASCAL( 'Lower', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix A is upper triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PCLASET( 'Lower', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA+1, $ JA, DESCA ) CALL PB_CLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * ELSE * CALL PCLASET( 'Lower', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_CLASET( 'Lower', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( L.EQ.9 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / $ CMPLX( REAL( MAX( NROWA, NCOLA ) ) ) CALL PCLASCAL( 'Upper', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA, JA+1, $ DESCA ) CALL PB_CLASCAL( 'Upper', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * END IF * END IF * ELSE IF( L.EQ.11 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix C is lower triangular * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PCLASET( 'Upper', NROWC-1, NCOLC-1, $ ROGUE, ROGUE, MEM( IPC ), IC, $ JC+1, DESCC ) CALL PB_CLASET( 'Upper', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC-1+JC*LDC ), LDC ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix C is upper triangular * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PCLASET( 'Lower', NROWC-1, NCOLC-1, $ ROGUE, ROGUE, MEM( IPC ), IC+1, $ JC, DESCC ) CALL PB_CLASET( 'Lower', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC+(JC-1)*LDC ), $ LDC ) END IF * END IF * END IF * * Pad the guard zones of A, B and C * CALL PB_CFILLPAD( ICTXT, MPA, NQA, MEM( IPA-IPREA ), $ DESCA( LLD_ ), IPREA, IPOSTA, PADVAL ) * IF( BCHECK( L ) ) THEN CALL PB_CFILLPAD( ICTXT, MPB, NQB, MEM( IPB-IPREB ), $ DESCB( LLD_ ), IPREB, IPOSTB, $ PADVAL ) END IF * IF( CCHECK( L ) ) THEN CALL PB_CFILLPAD( ICTXT, MPC, NQC, MEM( IPC-IPREC ), $ DESCC( LLD_ ), IPREC, IPOSTC, $ PADVAL ) END IF * * Initialize the check for INPUT-only arguments. * INFO = 0 CALL PCCHKARG3( ICTXT, NOUT, SNAMES( L ), SIDE, UPLO, $ TRANSA, TRANSB, DIAG, M, N, K, ALPHA, IA, $ JA, DESCA, IB, JB, DESCB, BETA, IC, JC, $ DESCC, INFO ) * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN CALL PB_PCLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, $ 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PCLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPW ) ) END IF * IF( BCHECK( L ) ) THEN IF( IVERB.EQ.2 ) THEN CALL PB_PCLAPRNT( NROWB, NCOLB, MEM( IPB ), IB, JB, $ DESCB, 0, 0, $ 'PARALLEL_INITIAL_B', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PCLAPRNT( MB, NB, MEM( IPB ), 1, 1, DESCB, $ 0, 0, 'PARALLEL_INITIAL_B', NOUT, $ MEM( IPW ) ) END IF END IF * IF( CCHECK( L ) ) THEN IF( IVERB.EQ.2 ) THEN CALL PB_PCLAPRNT( NROWC, NCOLC, MEM( IPC ), IC, JC, $ DESCC, 0, 0, $ 'PARALLEL_INITIAL_C', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PCLAPRNT( MC, NC, MEM( IPC ), 1, 1, DESCC, $ 0, 0, 'PARALLEL_INITIAL_C', NOUT, $ MEM( IPW ) ) END IF END IF * * Call the Level 3 PBLAS routine * INFO = 0 IF( L.EQ.1 ) THEN * * Test PCGEMM * CALL PCGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.2 ) THEN * * Test PCSYMM * CALL PCSYMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.3 ) THEN * * Test PCHEMM * CALL PCIPSET( 'Bignum', NROWA, MEM( IPA ), IA, JA, $ DESCA ) * CALL PCHEMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) * CALL PCIPSET( 'Zero', NROWA, MEM( IPA ), IA, JA, $ DESCA ) * ELSE IF( L.EQ.4 ) THEN * * Test PCSYRK * CALL PCSYRK( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.5 ) THEN * * Test PCHERK * IF( ( ( CMPLX( REAL( ALPHA ) ).NE.ZERO ).AND. $ ( K.NE.0 ) ).OR. $ ( CMPLX( REAL( BETA ) ).NE.ONE ) ) $ CALL PCIPSET( 'Bignum', N, MEM( IPC ), IC, JC, $ DESCC ) * CALL PCHERK( UPLO, TRANSA, N, K, REAL( ALPHA ), $ MEM( IPA ), IA, JA, DESCA, REAL( BETA ), $ MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.6 ) THEN * * Test PCSYR2K * CALL PCSYR2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.7 ) THEN * * Test PCHER2K * IF( ( ( ALPHA.NE.ZERO ).AND.( K.NE.0 ) ).OR. $ ( CMPLX( REAL( BETA ) ).NE.ONE ) ) $ CALL PCIPSET( 'Bignum', N, MEM( IPC ), IC, JC, $ DESCC ) * CALL PCHER2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, REAL( BETA ), MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.8 ) THEN * * Test PCTRMM * CALL PCTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) * ELSE IF( L.EQ.9 ) THEN * * Test PCTRSM * CALL PCTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) * * ELSE IF( L.EQ.10 ) THEN * * Test PCGEADD * CALL PCGEADD( TRANSA, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, BETA, MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.11 ) THEN * * Test PCTRADD * CALL PCTRADD( UPLO, TRANSA, M, N, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( L ) = KSKIP( L ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9974 ) INFO GO TO 30 END IF * * Check padding * CALL PB_CCHEKPAD( ICTXT, SNAMES( L ), MPA, NQA, $ MEM( IPA-IPREA ), DESCA( LLD_ ), $ IPREA, IPOSTA, PADVAL ) * IF( BCHECK( L ) ) THEN CALL PB_CCHEKPAD( ICTXT, SNAMES( L ), MPB, NQB, $ MEM( IPB-IPREB ), DESCB( LLD_ ), $ IPREB, IPOSTB, PADVAL ) END IF * IF( CCHECK( L ) ) THEN CALL PB_CCHEKPAD( ICTXT, SNAMES( L ), MPC, NQC, $ MEM( IPC-IPREC ), DESCC( LLD_ ), $ IPREC, IPOSTC, PADVAL ) END IF * * Check the computations * CALL PCBLAS3TSTCHK( ICTXT, NOUT, L, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, $ MEM( IPMATA ), MEM( IPA ), IA, JA, $ DESCA, MEM( IPMATB ), MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPMATC ), $ MEM( IPC ), IC, JC, DESCC, THRESH, $ ROGUE, MEM( IPG ), MEM( IPW ), INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 ) THEN IERR( 3 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 IERR( 3 ) = 1 END IF * * Check input-only scalar arguments * INFO = 1 CALL PCCHKARG3( ICTXT, NOUT, SNAMES( L ), SIDE, UPLO, $ TRANSA, TRANSB, DIAG, M, N, K, ALPHA, IA, $ JA, DESCA, IB, JB, DESCB, BETA, IC, JC, $ DESCC, INFO ) * * Check input-only array arguments * CALL PCCHKMOUT( NROWA, NCOLA, MEM( IPMATA ), $ MEM( IPA ), IA, JA, DESCA, IERR( 4 ) ) IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_A', $ SNAMES( L ) END IF * IF( BCHECK( L ) ) THEN CALL PCCHKMOUT( NROWB, NCOLB, MEM( IPMATB ), $ MEM( IPB ), IB, JB, DESCB, IERR( 5 ) ) IF( IERR( 5 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_B', $ SNAMES( L ) END IF END IF * IF( CCHECK( L ) ) THEN CALL PCCHKMOUT( NROWC, NCOLC, MEM( IPMATC ), $ MEM( IPC ), IC, JC, DESCC, IERR( 6 ) ) IF( IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_C', $ SNAMES( L ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE.0 .OR. IERR( 5 ).NE.0 .OR. $ IERR( 6 ).NE.0 ) THEN KFAIL( L ) = KFAIL( L ) + 1 ERRFLG = .TRUE. IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) SNAMES( L ) ELSE KPASS( L ) = KPASS( L ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) SNAMES( L ) END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PCMPRNT( ICTXT, NOUT, MA, NA, MEM( IPMATA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PCLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( ( NROWA.GT.0 ).AND.( NCOLA.GT.0 ) ) $ CALL PCMPRNT( ICTXT, NOUT, NROWA, NCOLA, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PCLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) END IF IF( BCHECK( L ) ) THEN IF( IERR( 5 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PCMPRNT( ICTXT, NOUT, MB, NB, $ MEM( IPMATB ), LDB, 0, 0, $ 'SERIAL_B' ) CALL PB_PCLAPRNT( MB, NB, MEM( IPB ), 1, 1, $ DESCB, 0, 0, 'PARALLEL_B', $ NOUT, MEM( IPMATB ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( ( NROWB.GT.0 ).AND.( NCOLB.GT.0 ) ) $ CALL PCMPRNT( ICTXT, NOUT, NROWB, NCOLB, $ MEM( IPMATB+IB-1+(JB-1)*LDB ), $ LDB, 0, 0, 'SERIAL_B' ) CALL PB_PCLAPRNT( NROWB, NCOLB, MEM( IPB ), IB, $ JB, DESCB, 0, 0, 'PARALLEL_B', $ NOUT, MEM( IPMATB ) ) END IF END IF IF( CCHECK( L ) ) THEN IF( IERR( 6 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PCMPRNT( ICTXT, NOUT, MC, NC, $ MEM( IPMATC ), LDC, 0, 0, $ 'SERIAL_C' ) CALL PB_PCLAPRNT( MC, NC, MEM( IPC ), 1, 1, $ DESCC, 0, 0, 'PARALLEL_C', $ NOUT, MEM( IPMATC ) ) ELSE IF( IERR( 3 ).NE.0 ) THEN IF( ( NROWB.GT.0 ).AND.( NCOLB.GT.0 ) ) $ CALL PCMPRNT( ICTXT, NOUT, NROWC, NCOLC, $ MEM( IPMATC+IC-1+(JC-1)*LDC ), $ LDC, 0, 0, 'SERIAL_C' ) CALL PB_PCLAPRNT( NROWC, NCOLC, MEM( IPC ), IC, $ JC, DESCC, 0, 0, 'PARALLEL_C', $ NOUT, MEM( IPMATC ) ) END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9982 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9981 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '-------------------' ) 9994 FORMAT( 2X, ' M N K SIDE UPLO TRANSA ', $ 'TRANSB DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,1X,I6,6X,A1,5X,A1,7X,A1,7X,A1,5X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IB JB MB NB IMBB INBB', $ ' MBB NBB RSRCB CSRCB' ) 9989 FORMAT( 2X, ' IC JC MC NC IMBC INBC', $ ' MBC NBC RSRCC CSRCC' ) 9988 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9987 FORMAT( 'Not enough memory. Need: ', I12 ) 9986 FORMAT( 2X, ' Tested Subroutine: ', A ) 9985 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9984 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9983 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9982 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9981 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9980 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9979 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9978 FORMAT( 2X, 'Testing Summary') 9977 FORMAT( 2X, 'End of Tests.' ) 9976 FORMAT( 2X, 'Tests started.' ) 9975 FORMAT( 2X, ' ***** ', A, ' has an incorrect value: ', $ ' BYPASS *****' ) 9974 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PCBLA3TST * END SUBROUTINE PCBLA3TSTINFO( SUMMRY, NOUT, NMAT, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, $ NVAL, KVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, $ CSCAVAL, IAVAL, JAVAL, MBVAL, NBVAL, $ IMBBVAL, MBBVAL, INBBVAL, NBBVAL, $ RSCBVAL, CSCBVAL, IBVAL, JBVAL, $ MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, $ ICVAL, JCVAL, LDVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, NBLOG, LTEST, SOF, $ TEE, IAM, IGAP, IVERB, NPROCS, THRESH, $ ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, NBLOG, $ NGRIDS, NMAT, NOUT, NPROCS REAL THRESH COMPLEX ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), SIDEVAL( LDVAL ), $ TRNAVAL( LDVAL ), TRNBVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCBVAL( LDVAL ), $ CSCCVAL( LDVAL ), IAVAL( LDVAL ), $ IBVAL( LDVAL ), ICVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBBVAL( LDVAL ), $ IMBCVAL( LDVAL ), INBAVAL( LDVAL ), $ INBBVAL( LDVAL ), INBCVAL( LDVAL ), $ JAVAL( LDVAL ), JBVAL( LDVAL ), JCVAL( LDVAL ), $ KVAL( LDVAL ), MAVAL( LDVAL ), MBAVAL( LDVAL ), $ MBBVAL( LDVAL ), MBCVAL( LDVAL ), $ MBVAL( LDVAL ), MCVAL( LDVAL ), MVAL( LDVAL ), $ NAVAL( LDVAL ), NBAVAL( LDVAL ), $ NBBVAL( LDVAL ), NBCVAL( LDVAL ), $ NBVAL( LDVAL ), NCVAL( LDVAL ), NVAL( LDVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCBVAL( LDVAL ), $ RSCCVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PCBLA3TSTINFO get the needed startup information for testing various * Level 3 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * SIDEVAL (global output) CHARACTER array * On entry, SIDEVAL is an array of dimension LDVAL. On exit, * this array contains the values of SIDE to run the code with. * * TRNAVAL (global output) CHARACTER array * On entry, TRNAVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSA to run the code * with. * * TRNBVAL (global output) CHARACTER array * On entry, TRNBVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSB to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * KVAL (global output) INTEGER array * On entry, KVAL is an array of dimension LDVAL. On exit, this * array contains the values of K to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MBVAL (global output) INTEGER array * On entry, MBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( M_ ) to run the code * with. * * NBVAL (global output) INTEGER array * On entry, NBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( N_ ) to run the code * with. * * IMBBVAL (global output) INTEGER array * On entry, IMBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( IMB_ ) to run the * code with. * * MBBVAL (global output) INTEGER array * On entry, MBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( MB_ ) to run the * code with. * * INBBVAL (global output) INTEGER array * On entry, INBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( INB_ ) to run the * code with. * * NBBVAL (global output) INTEGER array * On entry, NBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( NB_ ) to run the * code with. * * RSCBVAL (global output) INTEGER array * On entry, RSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( RSRC_ ) to run the * code with. * * CSCBVAL (global output) INTEGER array * On entry, CSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( CSRC_ ) to run the * code with. * * IBVAL (global output) INTEGER array * On entry, IBVAL is an array of dimension LDVAL. On exit, this * array contains the values of IB to run the code with. * * JBVAL (global output) INTEGER array * On entry, JBVAL is an array of dimension LDVAL. On exit, this * array contains the values of JB to run the code with. * * MCVAL (global output) INTEGER array * On entry, MCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( M_ ) to run the code * with. * * NCVAL (global output) INTEGER array * On entry, NCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( N_ ) to run the code * with. * * IMBCVAL (global output) INTEGER array * On entry, IMBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( IMB_ ) to run the * code with. * * MBCVAL (global output) INTEGER array * On entry, MBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( MB_ ) to run the * code with. * * INBCVAL (global output) INTEGER array * On entry, INBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( INB_ ) to run the * code with. * * NBCVAL (global output) INTEGER array * On entry, NBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( NB_ ) to run the * code with. * * RSCCVAL (global output) INTEGER array * On entry, RSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( RSRC_ ) to run the * code with. * * CSCCVAL (global output) INTEGER array * On entry, CSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( CSRC_ ) to run the * code with. * * ICVAL (global output) INTEGER array * On entry, ICVAL is an array of dimension LDVAL. On exit, this * array contains the values of IC to run the code with. * * JCVAL (global output) INTEGER array * On entry, JCVAL is an array of dimension LDVAL. On exit, this * array contains the values of JC to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, SIDE, TRANSA, TRANSB, UPLO, * M, N, K, DESCA(:), IA, JA, DESCB(:), IB, JB, DESCC(:), IC, * JC. This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eleven. On * exit, if LTEST( i ) is .TRUE., the i-th Level 3 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * THRESH (global output) REAL * On exit, THRESH specifies the threshhold value for the test * ratio. * * ALPHA (global output) COMPLEX * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) COMPLEX * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+38*NMAT+NSUBS+4 ) with NSUBS equal to 11. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 11 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J REAL EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, CGEBR2D, CGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PCBLAS3TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Read the threshold value for test ratio * READ( NIN, FMT = * ) THRESH IF( THRESH.LT.0.0 ) $ THRESH = 16.0 * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( SIDEVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( KVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( ICVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JCVAL ( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL CGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( SIDEVAL( J ) ) WORK( I+2 ) = ICHAR( TRNAVAL( J ) ) WORK( I+3 ) = ICHAR( TRNBVAL( J ) ) WORK( I+4 ) = ICHAR( UPLOVAL( J ) ) I = I + 5 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, KVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, ICVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JCVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 3 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex single precision '// $ 'Level 3 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9979 ) NBLOG WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9980 ) THRESH WRITE( NOUT, FMT = 9982 ) ALPHA WRITE( NOUT, FMT = 9981 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL CGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 38*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) SIDEVAL( J ) = CHAR( WORK( I+1 ) ) TRNAVAL( J ) = CHAR( WORK( I+2 ) ) TRNBVAL( J ) = CHAR( WORK( I+3 ) ) UPLOVAL( J ) = CHAR( WORK( I+4 ) ) I = I + 5 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, KVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, ICVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JCVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9981 FORMAT( 2X, 'Beta : (', G16.6, $ ',', G16.6, ')' ) 9980 FORMAT( 2X, 'Threshold value : ', G16.6 ) 9979 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PCBLA3TSTINFO * END SUBROUTINE PCBLAS3TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PCBLAS3TSTCHKE tests the error exits of the Level 3 PBLAS. * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 11 (NSUBS). * If LTEST( 1 ) is .TRUE., PCGEMM will be tested; * If LTEST( 2 ) is .TRUE., PCSYMM will be tested; * If LTEST( 3 ) is .TRUE., PCHEMM will be tested; * If LTEST( 4 ) is .TRUE., PCSYRK will be tested; * If LTEST( 5 ) is .TRUE., PCHERK will be tested; * If LTEST( 6 ) is .TRUE., PCSYR2K will be tested; * If LTEST( 7 ) is .TRUE., PCHER2K will be tested; * If LTEST( 8 ) is .TRUE., PCTRMM will be tested; * If LTEST( 9 ) is .TRUE., PCTRSM will be tested; * If LTEST( 10 ) is .TRUE., PCGEADD will be tested; * If LTEST( 11 ) is .TRUE., PCTRADD will be tested; * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 11 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PCDIMEE, PCGEADD, PCGEMM, $ PCHEMM, PCHER2K, PCHERK, PCMATEE, PCOPTEE, $ PCSYMM, PCSYR2K, PCSYRK, PCTRADD, PCTRMM, $ PCTRSM * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/31, 32, 32, 33, 34, 35, 36, 38, 38, 39, $ 40/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PCGEMM * I = 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCGEMM, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCGEMM, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCGEMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PCSYMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCSYMM, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCSYMM, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCSYMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PCHEMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCHEMM, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCHEMM, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCHEMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PCSYRK * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCSYRK, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCSYRK, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCSYRK, SCODE( I ), SNAMES( I ) ) END IF * * Test PCHERK * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCHERK, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCHERK, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCHERK, SCODE( I ), SNAMES( I ) ) END IF * * Test PCSYR2K * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCSYR2K, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCSYR2K, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCSYR2K, SCODE( I ), SNAMES( I ) ) END IF * * Test PCHER2K * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCHER2K, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCHER2K, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCHER2K, SCODE( I ), SNAMES( I ) ) END IF * * Test PCTRMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCTRMM, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCTRMM, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCTRMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PCTRSM * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCTRSM, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCTRSM, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCTRSM, SCODE( I ), SNAMES( I ) ) END IF * * Test PCGEADD * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCGEADD, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCGEADD, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCGEADD, SCODE( I ), SNAMES( I ) ) END IF * * Test PCTRADD * I = I + 1 IF( LTEST( I ) ) THEN CALL PCOPTEE( ICTXT, NOUT, PCTRADD, SCODE( I ), SNAMES( I ) ) CALL PCDIMEE( ICTXT, NOUT, PCTRADD, SCODE( I ), SNAMES( I ) ) CALL PCMATEE( ICTXT, NOUT, PCTRADD, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PCBLAS3TSTCHKE * END SUBROUTINE PCCHKARG3( ICTXT, NOUT, SNAME, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, IA, JA, $ DESCA, IB, JB, DESCB, BETA, IC, JC, DESCC, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N, $ NOUT COMPLEX ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*7 SNAME INTEGER DESCA( * ), DESCB( * ), DESCC( * ) * .. * * Purpose * ======= * * PCCHKARG3 checks the input-only arguments of the Level 3 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies the SIDE option in the Level 3 PBLAS * operation. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the UPLO option in the Level 3 PBLAS * operation. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the TRANSA option in the Level 3 * PBLAS operation. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies the TRANSB option in the Level 3 * PBLAS operation. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies the DIAG option in the Level 3 PBLAS * operation. * * M (global input) INTEGER * On entry, M specifies the dimension of the submatrix ope- * rands. * * N (global input) INTEGER * On entry, N specifies the dimension of the submatrix ope- * rands. * * K (global input) INTEGER * On entry, K specifies the dimension of the submatrix ope- * rands. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 DIAGREF, SIDEREF, TRANSAREF, TRANSBREF, UPLOREF INTEGER I, IAREF, IBREF, ICREF, JAREF, JBREF, JCREF, $ KREF, MREF, MYCOL, MYROW, NPCOL, NPROW, NREF COMPLEX ALPHAREF, BETAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCAREF( DLEN_ ), DESCBREF( DLEN_ ), $ DESCCREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * DIAGREF = DIAG SIDEREF = SIDE TRANSAREF = TRANSA TRANSBREF = TRANSB UPLOREF = UPLO MREF = M NREF = N KREF = K ALPHAREF = ALPHA IAREF = IA JAREF = JA DO 10 I = 1, DLEN_ DESCAREF( I ) = DESCA( I ) 10 CONTINUE IBREF = IB JBREF = JB DO 20 I = 1, DLEN_ DESCBREF( I ) = DESCB( I ) 20 CONTINUE BETAREF = BETA ICREF = IC JCREF = JC DO 30 I = 1, DLEN_ DESCCREF( I ) = DESCC( I ) 30 CONTINUE * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( .NOT. LSAME( DIAG, DIAGREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DIAG' ELSE IF( .NOT. LSAME( SIDE, SIDEREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'SIDE' ELSE IF( .NOT. LSAME( TRANSA, TRANSAREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANSA' ELSE IF( .NOT. LSAME( TRANSB, TRANSBREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANSB' ELSE IF( .NOT. LSAME( UPLO, UPLOREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'UPLO' ELSE IF( M.NE.MREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'M' ELSE IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( K.NE.KREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'K' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE IF( IA.NE.IAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IA' ELSE IF( JA.NE.JAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JA' ELSE IF( DESCA( DTYPE_ ).NE.DESCAREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( DTYPE_ )' ELSE IF( DESCA( M_ ).NE.DESCAREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( M_ )' ELSE IF( DESCA( N_ ).NE.DESCAREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( N_ )' ELSE IF( DESCA( IMB_ ).NE.DESCAREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( IMB_ )' ELSE IF( DESCA( INB_ ).NE.DESCAREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( INB_ )' ELSE IF( DESCA( MB_ ).NE.DESCAREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( MB_ )' ELSE IF( DESCA( NB_ ).NE.DESCAREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( NB_ )' ELSE IF( DESCA( RSRC_ ).NE.DESCAREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( RSRC_ )' ELSE IF( DESCA( CSRC_ ).NE.DESCAREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CSRC_ )' ELSE IF( DESCA( CTXT_ ).NE.DESCAREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CTXT_ )' ELSE IF( DESCA( LLD_ ).NE.DESCAREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( LLD_ )' ELSE IF( IB.NE.IBREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IB' ELSE IF( JB.NE.JBREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JB' ELSE IF( DESCB( DTYPE_ ).NE.DESCBREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( DTYPE_ )' ELSE IF( DESCB( M_ ).NE.DESCBREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( M_ )' ELSE IF( DESCB( N_ ).NE.DESCBREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( N_ )' ELSE IF( DESCB( IMB_ ).NE.DESCBREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( IMB_ )' ELSE IF( DESCB( INB_ ).NE.DESCBREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( INB_ )' ELSE IF( DESCB( MB_ ).NE.DESCBREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( MB_ )' ELSE IF( DESCB( NB_ ).NE.DESCBREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( NB_ )' ELSE IF( DESCB( RSRC_ ).NE.DESCBREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( RSRC_ )' ELSE IF( DESCB( CSRC_ ).NE.DESCBREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( CSRC_ )' ELSE IF( DESCB( CTXT_ ).NE.DESCBREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( CTXT_ )' ELSE IF( DESCB( LLD_ ).NE.DESCBREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( LLD_ )' ELSE IF( BETA.NE.BETAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'BETA' ELSE IF( IC.NE.ICREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IC' ELSE IF( JC.NE.JCREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JC' ELSE IF( DESCC( DTYPE_ ).NE.DESCCREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( DTYPE_ )' ELSE IF( DESCC( M_ ).NE.DESCCREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( M_ )' ELSE IF( DESCC( N_ ).NE.DESCCREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( N_ )' ELSE IF( DESCC( IMB_ ).NE.DESCCREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( IMB_ )' ELSE IF( DESCC( INB_ ).NE.DESCCREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( INB_ )' ELSE IF( DESCC( MB_ ).NE.DESCCREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( MB_ )' ELSE IF( DESCC( NB_ ).NE.DESCCREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( NB_ )' ELSE IF( DESCC( RSRC_ ).NE.DESCCREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( RSRC_ )' ELSE IF( DESCC( CSRC_ ).NE.DESCCREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( CSRC_ )' ELSE IF( DESCC( CTXT_ ).NE.DESCCREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( CTXT_ )' ELSE IF( DESCC( LLD_ ).NE.DESCCREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( LLD_ )' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.NE.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PCCHKARG3 * END SUBROUTINE PCBLAS3TSTCHK( ICTXT, NOUT, NROUT, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, A, PA, IA, $ JA, DESCA, B, PB, IB, JB, DESCB, BETA, $ C, PC, IC, JC, DESCC, THRESH, ROGUE, $ WORK, RWORK, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N, $ NOUT, NROUT REAL THRESH COMPLEX ALPHA, BETA, ROGUE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) REAL RWORK( * ) COMPLEX A( * ), B( * ), C( * ), PA( * ), PB( * ), $ PC( * ), WORK( * ) * .. * * Purpose * ======= * * PCBLAS3TSTCHK performs the computational tests of the Level 3 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PCGEMM will be tested; * else if NROUT = 2, PCSYMM will be tested; * else if NROUT = 3, PCHEMM will be tested; * else if NROUT = 4, PCSYRK will be tested; * else if NROUT = 5, PCHERK will be tested; * else if NROUT = 6, PCSYR2K will be tested; * else if NROUT = 7, PCHER2K will be tested; * else if NROUT = 8, PCTRMM will be tested; * else if NROUT = 9, PCTRSM will be tested; * else if NROUT = 10, PCGEADD will be tested; * else if NROUT = 11, PCTRADD will be tested; * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies if the multiplication should be per- * formed from the left or the right. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies if the upper or lower part of the * matrix operand is to be referenced. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies if the matrix operand A is to be * transposed. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies if the matrix operand B is to be * transposed. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the triangular matrix operand is * unit or non-unit. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANSA = 'N' (resp. TRANSA <> 'N') in PxGEMM, PxSYRK, * PxSYR2K, PxHERK and PxHER2K. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) COMPLEX array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * PB (local input) COMPLEX array * On entry, PB is an array of dimension (DESCB( LLD_ ),*). This * array contains the local entries of the matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * THRESH (global input) REAL * On entry, THRESH is the threshold value for the test ratio. * * ROGUE (global input) COMPLEX * On entry, ROGUE specifies the constant used to pad the * non-referenced part of triangular, symmetric or Hermitian ma- * trices. * * WORK (workspace) COMPLEX array * On entry, WORK is an array of dimension LWORK where LWORK is * at least MAX( M, MAX( N, K ) ). This array is used to store * a copy of a column of C (see PCMMCH). * * RWORK (workspace) REAL array * On entry, RWORK is an array of dimension LRWORK where LRWORK * is at least MAX( M, MAX( N, K ) ). This array is used to sto- * re the computed gauges (see PCMMCH). * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on A has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on B has been found, * if( MOD( INFO/4, 2 ) = 1 ) then an error on C has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL RZERO PARAMETER ( RZERO = 0.0E+0 ) COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW REAL ERR COMPLEX ALPHA1, BETA1 * .. * .. Local Arrays .. INTEGER IERR( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CTRSM, PB_CLASET, PCCHKMIN, $ PCMMCH, PCMMCH1, PCMMCH2, PCMMCH3, PCTRMM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, REAL * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 10 I = 1, 3 IERR( I ) = 0 10 CONTINUE * IF( NROUT.EQ.1 ) THEN * * Test PCGEMM * * Check the resulting matrix C * CALL PCMMCH( ICTXT, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, $ DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, JC, $ DESCC, WORK, RWORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCCHKMIN( ERR, M, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PCCHKMIN( ERR, K, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF IF( LSAME( TRANSB, 'N' ) ) THEN CALL PCCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PCCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.2 ) THEN * * Test PCSYMM * * Check the resulting matrix C * IF( LSAME( SIDE, 'L' ) ) THEN CALL PCMMCH( ICTXT, 'No transpose', 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, $ BETA, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) ELSE CALL PCMMCH( ICTXT, 'No transpose', 'No transpose', M, N, N, $ ALPHA, B, IB, JB, DESCB, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_CLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_CLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF * IF( LSAME( SIDE, 'L' ) ) THEN CALL PCCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PCCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF CALL PCCHKMIN( ERR, M, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) * ELSE IF( NROUT.EQ.3 ) THEN * * Test PCHEMM * * Check the resulting matrix C * IF( LSAME( SIDE, 'L' ) ) THEN CALL PCMMCH( ICTXT, 'No transpose', 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, $ BETA, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) ELSE CALL PCMMCH( ICTXT, 'No transpose', 'No transpose', M, N, N, $ ALPHA, B, IB, JB, DESCB, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_CLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_CLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF * IF( LSAME( SIDE, 'L' ) ) THEN CALL PCCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PCCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF CALL PCCHKMIN( ERR, M, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) * ELSE IF( NROUT.EQ.4 ) THEN * * Test PCSYRK * * Check the resulting matrix C * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCMMCH1( ICTXT, UPLO, 'No transpose', N, K, ALPHA, A, $ IA, JA, DESCA, BETA, C, PC, IC, JC, DESCC, $ WORK, RWORK, ERR, IERR( 3 ) ) ELSE CALL PCMMCH1( ICTXT, UPLO, 'Transpose', N, K, ALPHA, A, IA, $ JA, DESCA, BETA, C, PC, IC, JC, DESCC, WORK, $ RWORK, ERR, IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PCCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.5 ) THEN * * Test PCHERK * * Check the resulting matrix C * BETA1 = CMPLX( REAL( BETA ), RZERO ) ALPHA1 = CMPLX( REAL( ALPHA ), RZERO ) IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCMMCH1( ICTXT, UPLO, 'Hermitian', N, K, ALPHA1, A, IA, $ JA, DESCA, BETA1, C, PC, IC, JC, DESCC, WORK, $ RWORK, ERR, IERR( 3 ) ) ELSE CALL PCMMCH1( ICTXT, UPLO, 'Conjugate transpose', N, K, $ ALPHA1, A, IA, JA, DESCA, BETA1, C, PC, IC, $ JC, DESCC, WORK, RWORK, ERR, IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PCCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.6 ) THEN * * Test PCSYR2K * * Check the resulting matrix C * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCMMCH2( ICTXT, UPLO, 'No transpose', N, K, ALPHA, A, $ IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, $ IC, JC, DESCC, WORK, RWORK, ERR, IERR( 3 ) ) ELSE CALL PCMMCH2( ICTXT, UPLO, 'Transpose', N, K, ALPHA, A, $ IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, $ IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PCCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PCCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PCCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.7 ) THEN * * Test PCHER2K * * Check the resulting matrix C * BETA1 = CMPLX( REAL( BETA ), RZERO ) IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCMMCH2( ICTXT, UPLO, 'Hermitian', N, K, ALPHA, A, IA, $ JA, DESCA, B, IB, JB, DESCB, BETA1, C, PC, IC, $ JC, DESCC, WORK, RWORK, ERR, IERR( 3 ) ) ELSE CALL PCMMCH2( ICTXT, UPLO, 'Conjugate transpose', N, K, $ ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, $ BETA1, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PCCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PCCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PCCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.8 ) THEN * * Test PCTRMM * * Check the resulting matrix B * IF( LSAME( SIDE, 'L' ) ) THEN CALL PCMMCH( ICTXT, TRANSA, 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, C, IB, JB, DESCB, $ ZERO, B, PB, IB, JB, DESCB, WORK, RWORK, ERR, $ IERR( 2 ) ) ELSE CALL PCMMCH( ICTXT, 'No transpose', TRANSA, M, N, N, $ ALPHA, C, IB, JB, DESCB, A, IA, JA, DESCA, $ ZERO, B, PB, IB, JB, DESCB, WORK, RWORK, ERR, $ IERR( 2 ) ) END IF * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( SIDE, 'L' ) ) THEN IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Upper', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Lower', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PCCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PCCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.9 ) THEN * * Test PCTRSM * * Check the resulting matrix B * CALL CTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ), $ B( IB+(JB-1)*DESCB( M_ ) ), DESCB( M_ ) ) CALL PCTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, PA, IA, JA, $ DESCA, PB, IB, JB, DESCB ) IF( LSAME( SIDE, 'L' ) ) THEN CALL PCMMCH( ICTXT, TRANSA, 'No transpose', M, N, M, ALPHA, $ A, IA, JA, DESCA, B, IB, JB, DESCB, ZERO, C, $ PB, IB, JB, DESCB, WORK, RWORK, ERR, $ IERR( 2 ) ) ELSE CALL PCMMCH( ICTXT, 'No transpose', TRANSA, M, N, N, ALPHA, $ B, IB, JB, DESCB, A, IA, JA, DESCA, ZERO, C, $ PB, IB, JB, DESCB, WORK, RWORK, ERR, $ IERR( 2 ) ) END IF * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( SIDE, 'L' ) ) THEN IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Upper', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Lower', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PCCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_CLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_CLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PCCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF ELSE IF( NROUT.EQ.10 ) THEN * * Test PCGEADD * * Check the resulting matrix C * CALL PCMMCH3( 'All', TRANSA, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, IERR( 3 ) ) * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PCCHKMIN( ERR, N, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.11 ) THEN * * Test PCTRADD * * Check the resulting matrix C * CALL PCMMCH3( UPLO, TRANSA, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, IERR( 3 ) ) * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PCCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PCCHKMIN( ERR, N, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * END IF * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'A' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'B' END IF * IF( IERR( 3 ).NE.0 ) THEN INFO = INFO + 4 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'C' END IF * 9999 FORMAT( 2X, ' ***** ERROR: Matrix operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** FATAL ERROR - Computed result is less ', $ 'than half accurate *****' ) 9997 FORMAT( 2X, ' ***** Test completed with maximum test ratio: ', $ F11.5, ' SUSPECT *****' ) * RETURN * * End of PCBLAS3TSTCHK * END scalapack-2.0.2/PBLAS/TESTING/pcblastst.f000644 000766 000024 00001470627 11622500733 020055 0ustar00juliestaff000000 000000 SUBROUTINE PCOPTEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PCOPTEE tests whether the PBLAS respond correctly to a bad option * argument. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PCCHKOPT * .. * .. Executable Statements .. * * Level 2 PBLAS * IF( SCODE.EQ.21 ) THEN * * Check 1st (and only) option * APOS = 1 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.22 .OR. SCODE.EQ.25 .OR. SCODE.EQ.26 .OR. $ SCODE.EQ.27 ) THEN * * Check 1st (and only) option * APOS = 1 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st option * APOS = 1 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 2nd option * APOS = 2 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 3rd option * APOS = 3 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'D', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st option * APOS = 1 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2'nd option * APOS = 2 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * ELSE IF( SCODE.EQ.32 ) THEN * * Check 1st option * APOS = 1 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'S', APOS ) * * Check 2nd option * APOS = 2 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 .OR. SCODE.EQ.35 .OR. $ SCODE.EQ.36 .OR. SCODE.EQ.40 ) THEN * * Check 1st option * APOS = 1 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 2'nd option * APOS = 2 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st option * APOS = 1 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'S', APOS ) * * Check 2nd option * APOS = 2 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 3rd option * APOS = 3 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 4th option * APOS = 4 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'D', APOS ) * * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st option * APOS = 1 CALL PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * END IF * RETURN * * End of PCOPTEE * END SUBROUTINE PCCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PCCHKOPT tests the option ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the option to be * checked. ARGNAM can either be 'D', 'S', 'A', 'B', or 'U'. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the option ARGNAM * to be tested. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER INFOT * .. * .. External Subroutines .. EXTERNAL PCCALLSUB, PCHKPBE, PCSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. CHARACTER DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO * .. * .. Executable Statements .. * * Reiniatilize the dummy arguments to correct values * CALL PCSETPBLAS( ICTXT ) * IF( LSAME( ARGNAM, 'D' ) ) THEN * * Generate bad DIAG option * DIAG = '/' * ELSE IF( LSAME( ARGNAM, 'S' ) ) THEN * * Generate bad SIDE option * SIDE = '/' * ELSE IF( LSAME( ARGNAM, 'A' ) ) THEN * * Generate bad TRANSA option * TRANSA = '/' * ELSE IF( LSAME( ARGNAM, 'B' ) ) THEN * * Generate bad TRANSB option * TRANSB = '/' * ELSE IF( LSAME( ARGNAM, 'U' ) ) THEN * * Generate bad UPLO option * UPLO = '/' * END IF * * Set INFOT to the position of the bad dimension argument * INFOT = ARGPOS * * Call the PBLAS routine * CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * RETURN * * End of PCCHKOPT * END SUBROUTINE PCDIMEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PCDIMEE tests whether the PBLAS respond correctly to a bad dimension * argument. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PCCHKDIM * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 .OR. SCODE.EQ.12 .OR. SCODE.EQ.13 .OR. $ SCODE.EQ.14 .OR. SCODE.EQ.15 ) THEN * * Check 1st (and only) dimension * APOS = 1 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * * Check 1st dimension * APOS = 2 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 3 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.22 .OR. SCODE.EQ.25 .OR. SCODE.EQ.26 .OR. $ SCODE.EQ.27 ) THEN * * Check 1st (and only) dimension * APOS = 2 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st (and only) dimension * APOS = 4 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.24 ) THEN * * Check 1st dimension * APOS = 1 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 2 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st dimension * APOS = 3 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Check 3rd dimension * APOS = 5 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'K', APOS ) * ELSE IF( SCODE.EQ.32 ) THEN * * Check 1st dimension * APOS = 3 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 .OR. SCODE.EQ.35 .OR. $ SCODE.EQ.36 ) THEN * * Check 1st dimension * APOS = 3 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Check 2nd dimension * APOS = 4 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'K', APOS ) * ELSE IF( SCODE.EQ.37 ) THEN * * Check 1st dimension * APOS = 1 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 2 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st dimension * APOS = 5 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 6 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st dimension * APOS = 2 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 3 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.40 ) THEN * * Check 1st dimension * APOS = 3 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * END IF * RETURN * * End of PCDIMEE * END SUBROUTINE PCCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PCCHKDIM tests the dimension ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the dimension to be * checked. ARGNAM can either be 'M', 'N' or 'K'. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the option ARGNAM * to be tested. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER INFOT * .. * .. External Subroutines .. EXTERNAL PCCALLSUB, PCHKPBE, PCSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. INTEGER KDIM, MDIM, NDIM COMMON /PBLASN/KDIM, MDIM, NDIM * .. * .. Executable Statements .. * * Reiniatilize the dummy arguments to correct values * CALL PCSETPBLAS( ICTXT ) * IF( LSAME( ARGNAM, 'M' ) ) THEN * * Generate bad MDIM * MDIM = -1 * ELSE IF( LSAME( ARGNAM, 'N' ) ) THEN * * Generate bad NDIM * NDIM = -1 * ELSE * * Generate bad KDIM * KDIM = -1 * END IF * * Set INFOT to the position of the bad dimension argument * INFOT = ARGPOS * * Call the PBLAS routine * CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * RETURN * * End of PCCHKDIM * END SUBROUTINE PCVECEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*7 SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PCVECEE tests whether the PBLAS respond correctly to a bad vector * argument. Each vector is described by: , I, J, * DESC, INC. Out of all these, only I, J, * DESC, and INC can be tested. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PCCHKMAT * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 ) THEN * * Check 1st vector * APOS = 2 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 7 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.12 .OR. SCODE.EQ.15 ) THEN * * Check 1st (and only) vector * APOS = 3 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * ELSE IF( SCODE.EQ.13 ) THEN * * Check 1st vector * APOS = 3 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 8 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.14 ) THEN * * Check 1st (and only) vector * APOS = 4 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * * Check 1st vector * APOS = 9 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 15 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.22 ) THEN * * Check 1st vector * APOS = 8 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 14 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st (and only) vector * APOS = 9 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * ELSE IF( SCODE.EQ.24 .OR. SCODE.EQ.27 ) THEN * * Check 1st vector * APOS = 4 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 9 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.26 .OR. SCODE.EQ.27 ) THEN * * Check 1'st (and only) vector * APOS = 4 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * END IF * RETURN * * End of PCVECEE * END SUBROUTINE PCMATEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*7 SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PCMATEE tests whether the PBLAS respond correctly to a bad matrix * argument. Each matrix is described by: , I, J, * and DESC. Out of all these, only I, J and DESC * can be tested. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PCCHKMAT * .. * .. Executable Statements .. * * Level 2 PBLAS * IF( SCODE.EQ.21 .OR. SCODE.EQ.23 ) THEN * * Check 1st (and only) matrix * APOS = 5 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.22 ) THEN * * Check 1st (and only) matrix * APOS = 4 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.24 .OR. SCODE.EQ.27 ) THEN * * Check 1st (and only) matrix * APOS = 14 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.25 .OR. SCODE.EQ.26 ) THEN * * Check 1st (and only) matrix * APOS = 9 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st matrix * APOS = 7 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * * Check 3nd matrix * APOS = 16 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.32 .OR. SCODE.EQ.35 .OR. SCODE.EQ.36 ) THEN * * Check 1st matrix * APOS = 6 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 10 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * * Check 3nd matrix * APOS = 15 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 ) THEN * * Check 1st matrix * APOS = 6 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.37 ) THEN * * Check 1st matrix * APOS = 4 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 9 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st matrix * APOS = 8 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 12 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st matrix * APOS = 5 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 10 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.40 ) THEN * * Check 1st matrix * APOS = 6 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * END IF * RETURN * * End of PCMATEE * END SUBROUTINE PCSETPBLAS( ICTXT ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT * .. * * Purpose * ======= * * PCSETPBLAS initializes *all* the dummy arguments to correct values. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL RONE COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ RONE = 1.0E+0 ) * .. * .. External Subroutines .. EXTERNAL PB_DESCSET2 * .. * .. Common Blocks .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY, KDIM, MDIM, NDIM REAL USCLR COMPLEX SCLR INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) COMPLEX A( 2, 2 ), B( 2, 2 ), C( 2, 2 ), X( 2 ), Y( 2 ) COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY COMMON /PBLASM/A, B, C COMMON /PBLASN/KDIM, MDIM, NDIM COMMON /PBLASS/SCLR, USCLR COMMON /PBLASV/X, Y * .. * .. Executable Statements .. * * Set default values for options * DIAG = 'N' SIDE = 'L' TRANSA = 'N' TRANSB = 'N' UPLO = 'U' * * Set default values for scalars * KDIM = 1 MDIM = 1 NDIM = 1 ISCLR = 1 SCLR = ONE USCLR = RONE * * Set default values for distributed matrix A * A( 1, 1 ) = ONE A( 2, 1 ) = ONE A( 1, 2 ) = ONE A( 2, 2 ) = ONE IA = 1 JA = 1 CALL PB_DESCSET2( DESCA, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix B * B( 1, 1 ) = ONE B( 2, 1 ) = ONE B( 1, 2 ) = ONE B( 2, 2 ) = ONE IB = 1 JB = 1 CALL PB_DESCSET2( DESCB, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix C * C( 1, 1 ) = ONE C( 2, 1 ) = ONE C( 1, 2 ) = ONE C( 2, 2 ) = ONE IC = 1 JC = 1 CALL PB_DESCSET2( DESCC, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix X * X( 1 ) = ONE X( 2 ) = ONE IX = 1 JX = 1 CALL PB_DESCSET2( DESCX, 2, 1, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) INCX = 1 * * Set default values for distributed matrix Y * Y( 1 ) = ONE Y( 2 ) = ONE IY = 1 JY = 1 CALL PB_DESCSET2( DESCY, 2, 1, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) INCY = 1 * RETURN * * End of PCSETPBLAS * END SUBROUTINE PCCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PCCHKMAT tests the matrix (or vector) ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the matrix or vector * to be checked. ARGNAM can either be 'A', 'B' or 'C' when one * wants to check a matrix, and 'X' or 'Y' for a vector. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the first argument * of the matrix (or vector) ARGNAM. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER DESCMULT PARAMETER ( DESCMULT = 100 ) * .. * .. Local Scalars .. INTEGER I, INFOT, NPROW, NPCOL, MYROW, MYCOL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PCCALLSUB, PCHKPBE, PCSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( ARGNAM, 'A' ) ) THEN * * Check IA. Set all other OK, bad IA * CALL PCSETPBLAS( ICTXT ) IA = -1 INFOT = ARGPOS + 1 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JA. Set all other OK, bad JA * CALL PCSETPBLAS( ICTXT ) JA = -1 INFOT = ARGPOS + 2 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCA. Set all other OK, bad DESCA * DO 10 I = 1, DLEN_ * * Set I'th entry of DESCA to incorrect value, rest ok. * CALL PCSETPBLAS( ICTXT ) DESCA( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCA, CSRCA, LDA * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PCSETPBLAS( ICTXT ) * * Test RSRCA >= NPROW * IF( I.EQ.RSRC_ ) $ DESCA( I ) = NPROW * * Test CSRCA >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCA( I ) = NPCOL * * Test LDA >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCA( I ) = 1 ELSE DESCA( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 10 CONTINUE * ELSE IF( LSAME( ARGNAM, 'B' ) ) THEN * * Check IB. Set all other OK, bad IB * CALL PCSETPBLAS( ICTXT ) IB = -1 INFOT = ARGPOS + 1 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JB. Set all other OK, bad JB * CALL PCSETPBLAS( ICTXT ) JB = -1 INFOT = ARGPOS + 2 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCB. Set all other OK, bad DESCB * DO 20 I = 1, DLEN_ * * Set I'th entry of DESCB to incorrect value, rest ok. * CALL PCSETPBLAS( ICTXT ) DESCB( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCB, CSRCB, LDB * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PCSETPBLAS( ICTXT ) * * Test RSRCB >= NPROW * IF( I.EQ.RSRC_ ) $ DESCB( I ) = NPROW * * Test CSRCB >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCB( I ) = NPCOL * * Test LDB >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCB( I ) = 1 ELSE DESCB( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 20 CONTINUE * ELSE IF( LSAME( ARGNAM, 'C' ) ) THEN * * Check IC. Set all other OK, bad IC * CALL PCSETPBLAS( ICTXT ) IC = -1 INFOT = ARGPOS + 1 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JC. Set all other OK, bad JC * CALL PCSETPBLAS( ICTXT ) JC = -1 INFOT = ARGPOS + 2 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCC. Set all other OK, bad DESCC * DO 30 I = 1, DLEN_ * * Set I'th entry of DESCC to incorrect value, rest ok. * CALL PCSETPBLAS( ICTXT ) DESCC( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCC, CSRCC, LDC * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PCSETPBLAS( ICTXT ) * * Test RSRCC >= NPROW * IF( I.EQ.RSRC_ ) $ DESCC( I ) = NPROW * * Test CSRCC >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCC( I ) = NPCOL * * Test LDC >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCC( I ) = 1 ELSE DESCC( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 30 CONTINUE * ELSE IF( LSAME( ARGNAM, 'X' ) ) THEN * * Check IX. Set all other OK, bad IX * CALL PCSETPBLAS( ICTXT ) IX = -1 INFOT = ARGPOS + 1 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JX. Set all other OK, bad JX * CALL PCSETPBLAS( ICTXT ) JX = -1 INFOT = ARGPOS + 2 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCX. Set all other OK, bad DESCX * DO 40 I = 1, DLEN_ * * Set I'th entry of DESCX to incorrect value, rest ok. * CALL PCSETPBLAS( ICTXT ) DESCX( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCX, CSRCX, LDX * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PCSETPBLAS( ICTXT ) * * Test RSRCX >= NPROW * IF( I.EQ.RSRC_ ) $ DESCX( I ) = NPROW * * Test CSRCX >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCX( I ) = NPCOL * * Test LDX >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCX( I ) = 1 ELSE DESCX( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 40 CONTINUE * * Check INCX. Set all other OK, bad INCX * CALL PCSETPBLAS( ICTXT ) INCX = -1 INFOT = ARGPOS + 4 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * ELSE * * Check IY. Set all other OK, bad IY * CALL PCSETPBLAS( ICTXT ) IY = -1 INFOT = ARGPOS + 1 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JY. Set all other OK, bad JY * CALL PCSETPBLAS( ICTXT ) JY = -1 INFOT = ARGPOS + 2 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCY. Set all other OK, bad DESCY * DO 50 I = 1, DLEN_ * * Set I'th entry of DESCY to incorrect value, rest ok. * CALL PCSETPBLAS( ICTXT ) DESCY( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCY, CSRCY, LDY * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PCSETPBLAS( ICTXT ) * * Test RSRCY >= NPROW * IF( I.EQ.RSRC_ ) $ DESCY( I ) = NPROW * * Test CSRCY >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCY( I ) = NPCOL * * Test LDY >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCY( I ) = 1 ELSE DESCY( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 50 CONTINUE * * Check INCY. Set all other OK, bad INCY * CALL PCSETPBLAS( ICTXT ) INCY = -1 INFOT = ARGPOS + 4 CALL PCCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * RETURN * * End of PCCHKMAT * END SUBROUTINE PCCALLSUB( SUBPTR, SCODE ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER SCODE * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PCCALLSUB calls the subroutine SUBPTR with the calling sequence iden- * tified by SCODE. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Common Blocks .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY, KDIM, MDIM, NDIM REAL USCLR COMPLEX SCLR INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) COMPLEX A( 2, 2 ), B( 2, 2 ), C( 2, 2 ), X( 2 ), Y( 2 ) COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY COMMON /PBLASM/A, B, C COMMON /PBLASN/KDIM, MDIM, NDIM COMMON /PBLASS/SCLR, USCLR COMMON /PBLASV/X, Y * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 ) THEN * CALL SUBPTR( NDIM, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, $ INCY ) * ELSE IF( SCODE.EQ.12 ) THEN * CALL SUBPTR( NDIM, SCLR, X, IX, JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.13 ) THEN * CALL SUBPTR( NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, JY, $ DESCY, INCY ) * ELSE IF( SCODE.EQ.14 ) THEN * CALL SUBPTR( NDIM, SCLR, ISCLR, X, IX, JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.15 ) THEN * CALL SUBPTR( NDIM, USCLR, X, IX, JX, DESCX, INCX ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * CALL SUBPTR( TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX, SCLR, Y, IY, JY, DESCY, INCY ) * ELSE IF( SCODE.EQ.22 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, A, IA, JA, DESCA, X, IX, JX, $ DESCX, INCX, SCLR, Y, IY, JY, DESCY, INCY ) * ELSE IF( SCODE.EQ.23 ) THEN * CALL SUBPTR( UPLO, TRANSA, DIAG, NDIM, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.24 ) THEN * CALL SUBPTR( MDIM, NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, $ JY, DESCY, INCY, A, IA, JA, DESCA ) * ELSE IF( SCODE.EQ.25 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, X, IX, JX, DESCX, INCX, A, IA, $ JA, DESCA ) * ELSE IF( SCODE.EQ.26 ) THEN * CALL SUBPTR( UPLO, NDIM, USCLR, X, IX, JX, DESCX, INCX, A, IA, $ JA, DESCA ) * ELSE IF( SCODE.EQ.27 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, $ JY, DESCY, INCY, A, IA, JA, DESCA ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * CALL SUBPTR( TRANSA, TRANSB, MDIM, NDIM, KDIM, SCLR, A, IA, JA, $ DESCA, B, IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.32 ) THEN * CALL SUBPTR( SIDE, UPLO, MDIM, NDIM, SCLR, A, IA, JA, DESCA, B, $ IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.33 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.34 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, USCLR, A, IA, JA, DESCA, $ USCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.35 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ B, IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.36 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ B, IB, JB, DESCB, USCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.37 ) THEN * CALL SUBPTR( MDIM, NDIM, SCLR, A, IA, JA, DESCA, SCLR, C, IC, $ JC, DESCC ) * ELSE IF( SCODE.EQ.38 ) THEN * CALL SUBPTR( SIDE, UPLO, TRANSA, DIAG, MDIM, NDIM, SCLR, A, IA, $ JA, DESCA, B, IB, JB, DESCB ) * ELSE IF( SCODE.EQ.39 ) THEN * CALL SUBPTR( TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, SCLR, $ C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.40 ) THEN * CALL SUBPTR( UPLO, TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, $ SCLR, C, IC, JC, DESCC ) * END IF * RETURN * * End of PCCALLSUB * END SUBROUTINE PCERRSET( ERR, ERRMAX, XTRUE, X ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL ERR, ERRMAX COMPLEX X, XTRUE * .. * * Purpose * ======= * * PCERRSET computes the absolute difference ERR = |XTRUE - X| and com- * pares it with zero. ERRMAX accumulates the absolute error difference. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERR (local output) REAL * On exit, ERR specifies the absolute difference |XTRUE - X|. * * ERRMAX (local input/local output) REAL * On entry, ERRMAX specifies a previously computed error. On * exit ERRMAX is the accumulated error MAX( ERRMAX, ERR ). * * XTRUE (local input) COMPLEX * On entry, XTRUE specifies the true value. * * X (local input) COMPLEX * On entry, X specifies the value to be compared to XTRUE. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. REAL PSDIFF EXTERNAL PSDIFF * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL * .. * .. Executable Statements .. * ERR = ABS( PSDIFF( REAL( XTRUE ), REAL( X ) ) ) ERR = MAX( ERR, ABS( PSDIFF( AIMAG( XTRUE ), AIMAG( X ) ) ) ) * ERRMAX = MAX( ERRMAX, ERR ) * RETURN * * End of PCERRSET * END SUBROUTINE PCCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INFO, IX, JX, N REAL ERRMAX * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX PX( * ), X( * ) * .. * * Purpose * ======= * * PCCHKVIN checks that the submatrix sub( PX ) remained unchanged. The * local array entries are compared element by element, and their dif- * ference is tested against 0.0 as well as the epsilon machine. Notice * that this difference should be numerically exactly the zero machine, * but because of the possible fluctuation of some of the data we flag- * ged differently a difference less than twice the epsilon machine. The * largest error is also returned. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERRMAX (global output) REAL * On exit, ERRMAX specifies the largest absolute element-wise * difference between sub( X ) and sub( PX ). * * N (global input) INTEGER * On entry, N specifies the length of the subvector operand * sub( X ). N must be at least zero. * * X (local input) COMPLEX array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) COMPLEX array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, ICURROW, IIX, IN, IXCOL, $ IXROW, J, JB, JJX, JN, KK, LDPX, LDX, LL, $ MYCOL, MYROW, NPCOL, NPROW REAL ERR, EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_INFOG2L, PCERRSET, SGAMX2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) * LDX = DESCX( M_ ) LDPX = DESCX( LLD_ ) ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( N.EQ.1 ) THEN * IF( ( MYROW.EQ.IXROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.IXCOL .OR. COLREP ) ) $ CALL PCERRSET( ERR, ERRMAX, X( IX+(JX-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) * ELSE IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * IF( MYROW.EQ.IXROW .OR. ROWREP ) THEN * ICURCOL = IXCOL IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN DO 10 J = JX, JN CALL PCERRSET( ERR, ERRMAX, X( IX+(J-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) JJX = JJX + 1 10 CONTINUE END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 30 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * DO 20 KK = 0, JB-1 CALL PCERRSET( ERR, ERRMAX, X( IX+(J+KK-1)*LDX ), $ PX( IIX+(JJX+KK-1)*LDPX ) ) 20 CONTINUE * JJX = JJX + JB * END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 30 CONTINUE * END IF * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * IF( MYCOL.EQ.IXCOL .OR. COLREP ) THEN * ICURROW = IXROW IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 40 I = IX, IN CALL PCERRSET( ERR, ERRMAX, X( I+(JX-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) IIX = IIX + 1 40 CONTINUE END IF ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * DO 50 KK = 0, IB-1 CALL PCERRSET( ERR, ERRMAX, X( I+KK+(JX-1)*LDX ), $ PX( IIX+KK+(JJX-1)*LDPX ) ) 50 CONTINUE * IIX = IIX + IB * END IF * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PCCHKVIN * END SUBROUTINE PCCHKVOUT( N, X, PX, IX, JX, DESCX, INCX, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INFO, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX PX( * ), X( * ) * .. * * Purpose * ======= * * PCCHKVOUT checks that the matrix PX \ sub( PX ) remained unchanged. * The local array entries are compared element by element, and their * difference is tested against 0.0 as well as the epsilon machine. No- * tice that this difference should be numerically exactly the zero ma- * chine, but because of the possible movement of some of the data we * flagged differently a difference less than twice the epsilon machine. * The largest error is reported. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector operand * sub( X ). N must be at least zero. * * X (local input) COMPLEX array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) COMPLEX array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, ICURROW, II, IMBX, INBX, $ J, JB, JJ, KK, LDPX, LDX, LL, MBX, MPALL, $ MYCOL, MYCOLDIST, MYROW, MYROWDIST, NBX, NPCOL, $ NPROW, NQALL REAL EPS, ERR, ERRMAX * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PCERRSET, SGAMX2D * .. * .. External Functions .. INTEGER PB_NUMROC REAL PSLAMCH EXTERNAL PSLAMCH, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( ( DESCX( M_ ).LE.0 ).OR.( DESCX( N_ ).LE.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * MPALL = PB_NUMROC( DESCX( M_ ), 1, DESCX( IMB_ ), DESCX( MB_ ), $ MYROW, DESCX( RSRC_ ), NPROW ) NQALL = PB_NUMROC( DESCX( N_ ), 1, DESCX( INB_ ), DESCX( NB_ ), $ MYCOL, DESCX( CSRC_ ), NPCOL ) * MBX = DESCX( MB_ ) NBX = DESCX( NB_ ) LDX = DESCX( M_ ) LDPX = DESCX( LLD_ ) ICURROW = DESCX( RSRC_ ) ICURCOL = DESCX( CSRC_ ) ROWREP = ( ICURROW.EQ.-1 ) COLREP = ( ICURCOL.EQ.-1 ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN IMBX = DESCX( IMB_ ) ELSE IMBX = MBX END IF IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN INBX = DESCX( INB_ ) ELSE INBX = NBX END IF IF( ROWREP ) THEN MYROWDIST = 0 ELSE MYROWDIST = MOD( MYROW - ICURROW + NPROW, NPROW ) END IF IF( COLREP ) THEN MYCOLDIST = 0 ELSE MYCOLDIST = MOD( MYCOL - ICURCOL + NPCOL, NPCOL ) END IF II = 1 JJ = 1 * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * I = 1 IF( MYCOLDIST.EQ.0 ) THEN J = 1 ELSE J = DESCX( INB_ ) + ( MYCOLDIST - 1 ) * NBX + 1 END IF JB = MIN( MAX( 0, DESCX( N_ ) - J + 1 ), INBX ) IB = MIN( DESCX( M_ ), DESCX( IMB_ ) ) * DO 20 KK = 0, JB-1 DO 10 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. J+KK.GT.JX+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 10 CONTINUE 20 CONTINUE IF( COLREP ) THEN J = J + INBX ELSE J = J + INBX + ( NPCOL - 1 ) * NBX END IF * DO 50 JJ = INBX+1, NQALL, NBX JB = MIN( NQALL-JJ+1, NBX ) * DO 40 KK = 0, JB-1 DO 30 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 30 CONTINUE 40 CONTINUE * IF( COLREP ) THEN J = J + NBX ELSE J = J + NPCOL * NBX END IF * 50 CONTINUE * II = II + IB * END IF * ICURROW = MOD( ICURROW + 1, NPROW ) * DO 110 I = DESCX( IMB_ ) + 1, DESCX( M_ ), MBX IB = MIN( DESCX( M_ ) - I + 1, MBX ) * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * IF( MYCOLDIST.EQ.0 ) THEN J = 1 ELSE J = DESCX( INB_ ) + ( MYCOLDIST - 1 ) * NBX + 1 END IF * JJ = 1 JB = MIN( MAX( 0, DESCX( N_ ) - J + 1 ), INBX ) DO 70 KK = 0, JB-1 DO 60 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 60 CONTINUE 70 CONTINUE IF( COLREP ) THEN J = J + INBX ELSE J = J + INBX + ( NPCOL - 1 ) * NBX END IF * DO 100 JJ = INBX+1, NQALL, NBX JB = MIN( NQALL-JJ+1, NBX ) * DO 90 KK = 0, JB-1 DO 80 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 80 CONTINUE 90 CONTINUE * IF( COLREP ) THEN J = J + NBX ELSE J = J + NPCOL * NBX END IF * 100 CONTINUE * II = II + IB * END IF * ICURROW = MOD( ICURROW + 1, NPROW ) * 110 CONTINUE * ELSE * * sub( X ) is a column vector * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * J = 1 IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCX( IMB_ ) + ( MYROWDIST - 1 ) * MBX + 1 END IF IB = MIN( MAX( 0, DESCX( M_ ) - I + 1 ), IMBX ) JB = MIN( DESCX( N_ ), DESCX( INB_ ) ) * DO 130 KK = 0, JB-1 DO 120 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. I+LL.GT.IX+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 120 CONTINUE 130 CONTINUE IF( ROWREP ) THEN I = I + IMBX ELSE I = I + IMBX + ( NPROW - 1 ) * MBX END IF * DO 160 II = IMBX+1, MPALL, MBX IB = MIN( MPALL-II+1, MBX ) * DO 150 KK = 0, JB-1 DO 140 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 140 CONTINUE 150 CONTINUE * IF( ROWREP ) THEN I = I + MBX ELSE I = I + NPROW * MBX END IF * 160 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * DO 220 J = DESCX( INB_ ) + 1, DESCX( N_ ), NBX JB = MIN( DESCX( N_ ) - J + 1, NBX ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCX( IMB_ ) + ( MYROWDIST - 1 ) * MBX + 1 END IF * II = 1 IB = MIN( MAX( 0, DESCX( M_ ) - I + 1 ), IMBX ) DO 180 KK = 0, JB-1 DO 170 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 170 CONTINUE 180 CONTINUE IF( ROWREP ) THEN I = I + IMBX ELSE I = I + IMBX + ( NPROW - 1 ) * MBX END IF * DO 210 II = IMBX+1, MPALL, MBX IB = MIN( MPALL-II+1, MBX ) * DO 200 KK = 0, JB-1 DO 190 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 190 CONTINUE 200 CONTINUE * IF( ROWREP ) THEN I = I + MBX ELSE I = I + NPROW * MBX END IF * 210 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * 220 CONTINUE * END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PCCHKVOUT * END SUBROUTINE PCCHKMIN( ERRMAX, M, N, A, PA, IA, JA, DESCA, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N REAL ERRMAX * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX PA( * ), A( * ) * .. * * Purpose * ======= * * PCCHKMIN checks that the submatrix sub( PA ) remained unchanged. The * local array entries are compared element by element, and their dif- * ference is tested against 0.0 as well as the epsilon machine. Notice * that this difference should be numerically exactly the zero machine, * but because of the possible fluctuation of some of the data we flag- * ged differently a difference less than twice the epsilon machine. The * largest error is also returned. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERRMAX (global output) REAL * On exit, ERRMAX specifies the largest absolute element-wise * difference between sub( A ) and sub( PA ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * operand sub( A ). N must be at least zero. * * A (local input) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ KK, LDA, LDPA, LL, MYCOL, MYROW, NPCOL, NPROW REAL ERR, EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_INFOG2L, PCERRSET, SGAMX2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, MIN, MOD, REAL * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if posssible * IF( ( M.EQ.0 ).OR.( N.EQ.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL PB_INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) * II = IIA JJ = JJA LDA = DESCA( M_ ) LDPA = DESCA( LLD_ ) ICURROW = IAROW ICURCOL = IACOL ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * * Handle the first block of column separately * JB = DESCA( INB_ ) - JA + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCA( NB_ ) + 1 ) * DESCA( NB_ ) + JB JB = MIN( JB, N ) JN = JA + JB - 1 * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * DO 40 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 10 K = 0, IB-1 CALL PCERRSET( ERR, ERRMAX, A( IA+K+(JA+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 10 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) * * Loop over remaining block of rows * DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 20 K = 0, IB-1 CALL PCERRSET( ERR, ERRMAX, A( I+K+(JA+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 20 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) 30 CONTINUE * II = IIA ICURROW = IAROW 40 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining column blocks * DO 90 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN DO 80 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 )*DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 50 K = 0, IB-1 CALL PCERRSET( ERR, ERRMAX, A( IA+K+(J+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 50 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) * * Loop over remaining block of rows * DO 70 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 60 K = 0, IB-1 CALL PCERRSET( ERR, ERRMAX, $ A( I+K+(J+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 60 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) 70 CONTINUE * II = IIA ICURROW = IAROW 80 CONTINUE * JJ = JJ + JB END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 90 CONTINUE * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PCCHKMIN * END SUBROUTINE PCCHKMOUT( M, N, A, PA, IA, JA, DESCA, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ), PA( * ) * .. * * Purpose * ======= * * PCCHKMOUT checks that the matrix PA \ sub( PA ) remained unchanged. * The local array entries are compared element by element, and their * difference is tested against 0.0 as well as the epsilon machine. No- * tice that this difference should be numerically exactly the zero ma- * chine, but because of the possible movement of some of the data we * flagged differently a difference less than twice the epsilon machine. * The largest error is reported. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( PA ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( PA ). N must be at least zero. * * A (local input) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, II, IMBA, J, JB, JJ, KK, $ LDA, LDPA, LL, MPALL, MYCOL, MYROW, MYROWDIST, $ NPCOL, NPROW REAL EPS, ERR, ERRMAX * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PCERRSET, SGAMX2D * .. * .. External Functions .. INTEGER PB_NUMROC REAL PSLAMCH EXTERNAL PSLAMCH, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( ( DESCA( M_ ).LE.0 ).OR.( DESCA( N_ ).LE.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * MPALL = PB_NUMROC( DESCA( M_ ), 1, DESCA( IMB_ ), DESCA( MB_ ), $ MYROW, DESCA( RSRC_ ), NPROW ) * LDA = DESCA( M_ ) LDPA = DESCA( LLD_ ) * II = 1 JJ = 1 ROWREP = ( DESCA( RSRC_ ).EQ.-1 ) COLREP = ( DESCA( CSRC_ ).EQ.-1 ) ICURCOL = DESCA( CSRC_ ) IF( MYROW.EQ.DESCA( RSRC_ ) .OR. ROWREP ) THEN IMBA = DESCA( IMB_ ) ELSE IMBA = DESCA( MB_ ) END IF IF( ROWREP ) THEN MYROWDIST = 0 ELSE MYROWDIST = MOD( MYROW - DESCA( RSRC_ ) + NPROW, NPROW ) END IF * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * J = 1 IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCA( IMB_ ) + ( MYROWDIST - 1 ) * DESCA( MB_ ) + 1 END IF IB = MIN( MAX( 0, DESCA( M_ ) - I + 1 ), IMBA ) JB = MIN( DESCA( N_ ), DESCA( INB_ ) ) * DO 20 KK = 0, JB-1 DO 10 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 10 CONTINUE 20 CONTINUE IF( ROWREP ) THEN I = I + IMBA ELSE I = I + IMBA + ( NPROW - 1 ) * DESCA( MB_ ) END IF * DO 50 II = IMBA + 1, MPALL, DESCA( MB_ ) IB = MIN( MPALL-II+1, DESCA( MB_ ) ) * DO 40 KK = 0, JB-1 DO 30 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 30 CONTINUE 40 CONTINUE * IF( ROWREP ) THEN I = I + DESCA( MB_ ) ELSE I = I + NPROW * DESCA( MB_ ) END IF * 50 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * DO 110 J = DESCA( INB_ ) + 1, DESCA( N_ ), DESCA( NB_ ) JB = MIN( DESCA( N_ ) - J + 1, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCA( IMB_ ) + ( MYROWDIST - 1 ) * DESCA( MB_ ) + 1 END IF * II = 1 IB = MIN( MAX( 0, DESCA( M_ ) - I + 1 ), IMBA ) DO 70 KK = 0, JB-1 DO 60 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 60 CONTINUE 70 CONTINUE IF( ROWREP ) THEN I = I + IMBA ELSE I = I + IMBA + ( NPROW - 1 ) * DESCA( MB_ ) END IF * DO 100 II = IMBA+1, MPALL, DESCA( MB_ ) IB = MIN( MPALL-II+1, DESCA( MB_ ) ) * DO 90 KK = 0, JB-1 DO 80 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PCERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 80 CONTINUE 90 CONTINUE * IF( ROWREP ) THEN I = I + DESCA( MB_ ) ELSE I = I + NPROW * DESCA( MB_ ) END IF * 100 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * INSERT MODE 110 CONTINUE * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PCCHKMOUT * END SUBROUTINE PCMPRNT( ICTXT, NOUT, M, N, A, LDA, IRPRNT, ICPRNT, $ CMATNM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICPRNT, ICTXT, IRPRNT, LDA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PCMPRNT prints to the standard output an array A of size m by n. Only * the process of coordinates ( IRPRNT, ICPRNT ) is printing. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * M (global input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LDA,N). The leading m * by n part of this array is printed. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array A to be printed. LDA must be at least MAX( 1, M ). * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the process row coordinate of the * printing process. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the process column coordinate of * the printing process. * * CMATNM (global input) CHARACTER*(*) * On entry, CMATNM specifies the identifier of the matrix to be * printed. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC AIMAG, REAL * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN * WRITE( NOUT, FMT = * ) DO 20 J = 1, N * DO 10 I = 1, M * WRITE( NOUT, FMT = 9999 ) CMATNM, I, J, $ REAL( A( I, J ) ), AIMAG( A( I, J ) ) * 10 CONTINUE * 20 CONTINUE * END IF * 9999 FORMAT( 1X, A, '(', I6, ',', I6, ')=', E16.8, '+i*(', $ E16.8, ')' ) * RETURN * * End of PCMPRNT * END SUBROUTINE PCVPRNT( ICTXT, NOUT, N, X, INCX, IRPRNT, ICPRNT, $ CVECNM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICPRNT, ICTXT, INCX, IRPRNT, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CVECNM COMPLEX X( * ) * .. * * Purpose * ======= * * PCVPRNT prints to the standard output an vector x of length n. Only * the process of coordinates ( IRPRNT, ICPRNT ) is printing. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * N (global input) INTEGER * On entry, N specifies the length of the vector X. N must be * at least zero. * * X (global input) COMPLEX array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the process row coordinate of the * printing process. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the process column coordinate of * the printing process. * * CVECNM (global input) CHARACTER*(*) * On entry, CVECNM specifies the identifier of the vector to be * printed. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC AIMAG, REAL * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN * WRITE( NOUT, FMT = * ) DO 10 I = 1, 1 + ( N-1 )*INCX, INCX * WRITE( NOUT, FMT = 9999 ) CVECNM, I, REAL( X( I ) ), $ AIMAG( X( I ) ) * 10 CONTINUE * END IF * 9999 FORMAT( 1X, A, '(', I6, ')=', E16.8, '+i*(', E16.8, ')' ) * RETURN * * End of PCVPRNT * END SUBROUTINE PCMVCH( ICTXT, TRANS, M, N, ALPHA, A, IA, JA, DESCA, $ X, IX, JX, DESCX, INCX, BETA, Y, PY, IY, JY, $ DESCY, INCY, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N REAL ERR COMPLEX ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL G( * ) COMPLEX A( * ), PY( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PCMVCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies which matrix-vector product is to * be computed as follows: * If TRANS = 'T', * sub( Y ) = BETA * sub( Y ) + sub( A )**T * sub( X ), * else if TRANS = 'C', * sub( Y ) = BETA * sub( Y ) + sub( A )**H * sub( X ), * otherwise * sub( Y ) = BETA * sub( Y ) + sub( A ) * sub( X ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * Y (local input/local output) COMPLEX array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) COMPLEX array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0E+0, RONE = 1.0E+0 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL COLREP, CTRAN, ROWREP, TRAN INTEGER I, IB, ICURCOL, ICURROW, IIY, IN, IOFFA, IOFFX, $ IOFFY, IYCOL, IYROW, J, JB, JJY, JN, KK, LDA, $ LDPY, LDX, LDY, ML, MYCOL, MYROW, NL, NPCOL, $ NPROW REAL EPS, ERRI, GTMP COMPLEX C, TBETA, YTMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, MIN, MOD, REAL, SQRT * .. * .. Statement Functions .. REAL ABS1 ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * IF( M.EQ.0 .OR. N.EQ.0 ) THEN TBETA = ONE ELSE TBETA = BETA END IF * TRAN = LSAME( TRANS, 'T' ) CTRAN = LSAME( TRANS, 'C' ) IF( TRAN.OR.CTRAN ) THEN ML = N NL = M ELSE ML = M NL = N END IF * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in Y using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * IOFFY = IY + ( JY - 1 ) * LDY DO 40 I = 1, ML YTMP = ZERO GTMP = RZERO IOFFX = IX + ( JX - 1 ) * LDX IF( TRAN )THEN IOFFA = IA + ( JA + I - 2 ) * LDA DO 10 J = 1, NL YTMP = YTMP + A( IOFFA ) * X( IOFFX ) GTMP = GTMP + ABS1( A( IOFFA ) ) * ABS1( X( IOFFX ) ) IOFFA = IOFFA + 1 IOFFX = IOFFX + INCX 10 CONTINUE ELSE IF( CTRAN )THEN IOFFA = IA + ( JA + I - 2 ) * LDA DO 20 J = 1, NL YTMP = YTMP + CONJG( A( IOFFA ) ) * X( IOFFX ) GTMP = GTMP + ABS1( A( IOFFA ) ) * ABS1( X( IOFFX ) ) IOFFA = IOFFA + 1 IOFFX = IOFFX + INCX 20 CONTINUE ELSE IOFFA = IA + I - 1 + ( JA - 1 ) * LDA DO 30 J = 1, NL YTMP = YTMP + A( IOFFA ) * X( IOFFX ) GTMP = GTMP + ABS1( A( IOFFA ) ) * ABS1( X( IOFFX ) ) IOFFA = IOFFA + LDA IOFFX = IOFFX + INCX 30 CONTINUE END IF G( I ) = ABS1( ALPHA )*GTMP + ABS1( TBETA )*ABS1( Y( IOFFY ) ) Y( IOFFY ) = ALPHA * YTMP + TBETA * Y( IOFFY ) IOFFY = IOFFY + INCY 40 CONTINUE * * Compute the error ratio for this result. * ERR = RZERO INFO = 0 LDPY = DESCY( LLD_ ) IOFFY = IY + ( JY - 1 ) * LDY CALL PB_INFOG2L( IY, JY, DESCY, NPROW, NPCOL, MYROW, MYCOL, IIY, $ JJY, IYROW, IYCOL ) ICURROW = IYROW ICURCOL = IYCOL ROWREP = ( IYROW.EQ.-1 ) COLREP = ( IYCOL.EQ.-1 ) * IF( INCY.EQ.DESCY( M_ ) ) THEN * * sub( Y ) is a row vector * JB = DESCY( INB_ ) - JY + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCY( NB_ ) + 1 ) * DESCY( NB_ ) + JB JB = MIN( JB, ML ) JN = JY + JB - 1 * DO 50 J = JY, JN * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) ) / EPS IF( G( J-JY+1 ).NE.RZERO ) $ ERRI = ERRI / G( J-JY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 JJY = JJY + 1 END IF * IOFFY = IOFFY + INCY * 50 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 70 J = JN+1, JY+ML-1, DESCY( NB_ ) JB = MIN( JY+ML-J, DESCY( NB_ ) ) * DO 60 KK = 0, JB-1 * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) )/EPS IF( G( J+KK-JY+1 ).NE.RZERO ) $ ERRI = ERRI / G( J+KK-JY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 JJY = JJY + 1 END IF * IOFFY = IOFFY + INCY * 60 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 70 CONTINUE * ELSE * * sub( Y ) is a column vector * IB = DESCY( IMB_ ) - IY + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCY( MB_ ) + 1 ) * DESCY( MB_ ) + IB IB = MIN( IB, ML ) IN = IY + IB - 1 * DO 80 I = IY, IN * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) ) / EPS IF( G( I-IY+1 ).NE.RZERO ) $ ERRI = ERRI / G( I-IY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIY = IIY + 1 END IF * IOFFY = IOFFY + INCY * 80 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 100 I = IN+1, IY+ML-1, DESCY( MB_ ) IB = MIN( IY+ML-I, DESCY( MB_ ) ) * DO 90 KK = 0, IB-1 * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) )/EPS IF( G( I+KK-IY+1 ).NE.RZERO ) $ ERRI = ERRI / G( I+KK-IY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIY = IIY + 1 END IF * IOFFY = IOFFY + INCY * 90 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 100 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) * RETURN * * End of PCMVCH * END SUBROUTINE PCVMCH( ICTXT, TRANS, UPLO, M, N, ALPHA, X, IX, JX, $ DESCX, INCX, Y, IY, JY, DESCY, INCY, A, PA, $ IA, JA, DESCA, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N REAL ERR COMPLEX ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL G( * ) COMPLEX A( * ), PA( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PCVMCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed in * the complex cases: * if TRANS = 'C', * sub( A ) := sub( A ) + alpha * sub( X ) * sub( Y )**H, * otherwise * sub( A ) := sub( A ) + alpha * sub( X ) * sub( Y )**T. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of the submatrix sub( A ) * is to be referenced as follows: * If UPLO = 'L', only the lower triangular part, * If UPLO = 'U', only the upper triangular part, * else the entire matrix is to be referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * X (local input) COMPLEX array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, CTRAN, LOWER, ROWREP, UPPER INTEGER I, IACOL, IAROW, IB, IBEG, ICURROW, IEND, IIA, $ IN, IOFFA, IOFFX, IOFFY, J, JJA, KK, LDA, LDPA, $ LDX, LDY, MYCOL, MYROW, NPCOL, NPROW REAL EPS, ERRI, GTMP COMPLEX ATMP, C * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, MIN, MOD, REAL, SQRT * .. * .. Statement Functions .. REAL ABS1 ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * CTRAN = LSAME( TRANS, 'C' ) UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in A using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 70 J = 1, N * IOFFY = IY + ( JY - 1 ) * LDY + ( J - 1 ) * INCY * IF( LOWER ) THEN IBEG = J IEND = M DO 10 I = 1, J-1 G( I ) = ZERO 10 CONTINUE ELSE IF( UPPER ) THEN IBEG = 1 IEND = J DO 20 I = J+1, M G( I ) = ZERO 20 CONTINUE ELSE IBEG = 1 IEND = M END IF * DO 30 I = IBEG, IEND * IOFFX = IX + ( JX - 1 ) * LDX + ( I - 1 ) * INCX IOFFA = IA + I - 1 + ( JA + J - 2 ) * LDA IF( CTRAN ) THEN ATMP = X( IOFFX ) * CONJG( Y( IOFFY ) ) ELSE ATMP = X( IOFFX ) * Y( IOFFY ) END IF GTMP = ABS1( X( IOFFX ) ) * ABS1( Y( IOFFY ) ) G( I ) = ABS1( ALPHA ) * GTMP + ABS1( A( IOFFA ) ) A( IOFFA ) = ALPHA * ATMP + A( IOFFA ) * 30 CONTINUE * * Compute the error ratio for this result. * INFO = 0 ERR = ZERO LDPA = DESCA( LLD_ ) IOFFA = IA + ( JA + J - 2 ) * LDA CALL PB_INFOG2L( IA, JA+J-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * IF( MYCOL.EQ.IACOL .OR. COLREP ) THEN * ICURROW = IAROW IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 * DO 40 I = IA, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA ) - A( IOFFA ) )/EPS IF( G( I-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 40 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * DO 50 KK = 0, IB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA )-A( IOFFA ) )/EPS IF( G( I+KK-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 80 * 70 CONTINUE * 80 CONTINUE * RETURN * * End of PCVMCH * END SUBROUTINE PCVMCH2( ICTXT, UPLO, M, N, ALPHA, X, IX, JX, DESCX, $ INCX, Y, IY, JY, DESCY, INCY, A, PA, IA, $ JA, DESCA, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N REAL ERR COMPLEX ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL G( * ) COMPLEX A( * ), PA( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PCVMCH2 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of the submatrix sub( A ) * is to be referenced as follows: * If UPLO = 'L', only the lower triangular part, * If UPLO = 'U', only the upper triangular part, * else the entire matrix is to be referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * X (local input) COMPLEX array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, LOWER, ROWREP, UPPER INTEGER I, IACOL, IAROW, IB, IBEG, ICURROW, IEND, IIA, $ IN, IOFFA, IOFFXI, IOFFXJ, IOFFYI, IOFFYJ, J, $ JJA, KK, LDA, LDPA, LDX, LDY, MYCOL, MYROW, $ NPCOL, NPROW REAL EPS, ERRI, GTMP COMPLEX C, ATMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, MIN, MOD, REAL, SQRT * .. * .. Statement Functions .. REAL ABS1 ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in A using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 70 J = 1, N * IOFFXJ = IX + ( JX - 1 ) * LDX + ( J - 1 ) * INCX IOFFYJ = IY + ( JY - 1 ) * LDY + ( J - 1 ) * INCY * IF( LOWER ) THEN IBEG = J IEND = M DO 10 I = 1, J-1 G( I ) = ZERO 10 CONTINUE ELSE IF( UPPER ) THEN IBEG = 1 IEND = J DO 20 I = J+1, M G( I ) = ZERO 20 CONTINUE ELSE IBEG = 1 IEND = M END IF * DO 30 I = IBEG, IEND IOFFA = IA + I - 1 + ( JA + J - 2 ) * LDA IOFFXI = IX + ( JX - 1 ) * LDX + ( I - 1 ) * INCX IOFFYI = IY + ( JY - 1 ) * LDY + ( I - 1 ) * INCY ATMP = ALPHA * X( IOFFXI ) * CONJG( Y( IOFFYJ ) ) ATMP = ATMP + Y( IOFFYI ) * CONJG( ALPHA * X( IOFFXJ ) ) GTMP = ABS1( ALPHA * X( IOFFXI ) ) * ABS1( Y( IOFFYJ ) ) GTMP = GTMP + ABS1( Y( IOFFYI ) ) * $ ABS1( CONJG( ALPHA * X( IOFFXJ ) ) ) G( I ) = GTMP + ABS1( A( IOFFA ) ) A( IOFFA ) = A( IOFFA ) + ATMP * 30 CONTINUE * * Compute the error ratio for this result. * INFO = 0 ERR = ZERO LDPA = DESCA( LLD_ ) IOFFA = IA + ( JA + J - 2 ) * LDA CALL PB_INFOG2L( IA, JA+J-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * IF( MYCOL.EQ.IACOL .OR. COLREP ) THEN * ICURROW = IAROW IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 * DO 40 I = IA, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA ) - A( IOFFA ) )/EPS IF( G( I-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 40 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * DO 50 KK = 0, IB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA )-A( IOFFA ) )/EPS IF( G( I+KK-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 80 * 70 CONTINUE * 80 CONTINUE * RETURN * * End of PCVMCH2 * END SUBROUTINE PCMMCH( ICTXT, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, $ JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, $ JC, DESCC, CT, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANSA, TRANSB INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N REAL ERR COMPLEX ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) REAL G( * ) COMPLEX A( * ), B( * ), C( * ), CT( * ), PC( * ) * .. * * Purpose * ======= * * PCMMCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies if the matrix operand A is to be * transposed. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies if the matrix operand B is to be * transposed. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANSA = 'N' (resp. TRANSA <> 'N') in PxGEMM, PxSYRK, * PxSYR2K, PxHERK and PxHER2K. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) COMPLEX array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0E+0, RONE = 1.0E+0 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL COLREP, CTRANA, CTRANB, ROWREP, TRANA, TRANB INTEGER I, IBB, ICCOL, ICROW, ICURROW, IIC, IN, IOFFA, $ IOFFB, IOFFC, J, JJC, KK, LDA, LDB, LDC, LDPC, $ MYCOL, MYROW, NPCOL, NPROW REAL EPS, ERRI COMPLEX Z * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, MIN, MOD, REAL, SQRT * .. * .. Statement Functions .. REAL ABS1 ABS1( Z ) = ABS( REAL( Z ) ) + ABS( AIMAG( Z ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * TRANA = LSAME( TRANSA, 'T' ).OR.LSAME( TRANSA, 'C' ) TRANB = LSAME( TRANSB, 'T' ).OR.LSAME( TRANSB, 'C' ) CTRANA = LSAME( TRANSA, 'C' ) CTRANB = LSAME( TRANSB, 'C' ) * LDA = MAX( 1, DESCA( M_ ) ) LDB = MAX( 1, DESCB( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 240 J = 1, N * IOFFC = IC + ( JC + J - 2 ) * LDC DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE * IF( .NOT.TRANA .AND. .NOT.TRANB ) THEN DO 30 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 20 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA .AND. .NOT.TRANB ) THEN IF( CTRANA ) THEN DO 50 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 40 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + CONJG( A( IOFFA ) ) * $ B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 60 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA .AND. TRANB ) THEN IF( CTRANB ) THEN DO 90 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 80 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * $ CONJG( B( IOFFB ) ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 100 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA .AND. TRANB ) THEN IF( CTRANA ) THEN IF( CTRANB ) THEN DO 130 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 120 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + CONJG( A( IOFFA ) ) * $ CONJG( B( IOFFB ) ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 140 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + CONJG( A( IOFFA ) ) * $ B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB ) THEN DO 170 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 160 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * $ CONJG( B( IOFFB ) ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 180 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF * DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA * C( IOFFC ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 200 CONTINUE * * Compute the error ratio for this result. * ERR = RZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, M ) IN = IC + IBB - 1 * DO 210 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 210 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 230 I = IN+1, IC+M-1, DESCC( MB_ ) IBB = MIN( IC+M-I, DESCC( MB_ ) ) * DO 220 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 220 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 230 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 250 * 240 CONTINUE * 250 CONTINUE * RETURN * * End of PCMMCH * END SUBROUTINE PCMMCH1( ICTXT, UPLO, TRANS, N, K, ALPHA, A, IA, JA, $ DESCA, BETA, C, PC, IC, JC, DESCC, CT, G, $ ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IC, ICTXT, INFO, JA, JC, K, N REAL ERR COMPLEX ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL G( * ) COMPLEX A( * ), C( * ), CT( * ), PC( * ) * .. * * Purpose * ======= * * PCMMCH1 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrix A has to be * transposed or not before computing the matrix-matrix product. * * N (global input) INTEGER * On entry, N specifies the order the submatrix operand C. N * must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANS = 'N' (resp. TRANS <> 'N'). K must be at least * zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) COMPLEX array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0E+0, RONE = 1.0E+0 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL COLREP, HTRAN, NOTRAN, ROWREP, TRAN, UPPER INTEGER I, IBB, IBEG, ICCOL, ICROW, ICURROW, IEND, IIC, $ IN, IOFFAK, IOFFAN, IOFFC, J, JJC, KK, LDA, $ LDC, LDPC, MYCOL, MYROW, NPCOL, NPROW REAL EPS, ERRI COMPLEX Z * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, MIN, MOD, REAL, SQRT * .. * .. Statement Functions .. REAL ABS1 ABS1( Z ) = ABS( REAL( Z ) ) + ABS( AIMAG( Z ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) HTRAN = LSAME( TRANS, 'H' ) * LDA = MAX( 1, DESCA( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 140 J = 1, N * IF( UPPER ) THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF * DO 10 I = 1, N CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE * IF( NOTRAN ) THEN DO 30 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA DO 20 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAK ) * A( IOFFAN ) G( I ) = G( I ) + ABS1( A( IOFFAK ) ) * $ ABS1( A( IOFFAN ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRAN ) THEN DO 50 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA DO 40 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAK ) * A( IOFFAN ) G( I ) = G( I ) + ABS1( A( IOFFAK ) ) * $ ABS1( A( IOFFAN ) ) 40 CONTINUE 50 CONTINUE ELSE IF( HTRAN ) THEN DO 70 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA DO 60 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAN ) * $ CONJG( A( IOFFAK ) ) G( I ) = G( I ) + ABS1( A( IOFFAK ) ) * $ ABS1( A( IOFFAN ) ) 60 CONTINUE 70 CONTINUE ELSE DO 90 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA DO 80 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + CONJG( A( IOFFAN ) ) * A( IOFFAK ) G( I ) = G( I ) + ABS1( CONJG( A( IOFFAN ) ) ) * $ ABS1( A( IOFFAK ) ) 80 CONTINUE 90 CONTINUE END IF * IOFFC = IC + IBEG - 1 + ( JC + J - 2 ) * LDC * DO 100 I = IBEG, IEND CT( I ) = ALPHA*CT( I ) + BETA * C( IOFFC ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 100 CONTINUE * * Compute the error ratio for this result. * ERR = RZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, N ) IN = IC + IBB - 1 * DO 110 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IC+N-1, DESCC( MB_ ) IBB = MIN( IC+N-I, DESCC( MB_ ) ) * DO 120 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 150 * 140 CONTINUE * 150 CONTINUE * RETURN * * End of PCMMCH1 * END SUBROUTINE PCMMCH2( ICTXT, UPLO, TRANS, N, K, ALPHA, A, IA, JA, $ DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, $ JC, DESCC, CT, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, N REAL ERR COMPLEX ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) REAL G( * ) COMPLEX A( * ), B( * ), C( * ), CT( * ), $ PC( * ) * .. * * Purpose * ======= * * PCMMCH2 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrices A and B have * to be transposed or not before computing the matrix-matrix * product. * * N (global input) INTEGER * On entry, N specifies the order the submatrix operand C. N * must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * and B when TRANS = 'N' (resp. TRANS <> 'N'). K must be at * least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) COMPLEX array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0E+0, RONE = 1.0E+0 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL COLREP, HTRAN, NOTRAN, ROWREP, TRAN, UPPER INTEGER I, IBB, IBEG, ICCOL, ICROW, ICURROW, IEND, IIC, $ IN, IOFFAK, IOFFAN, IOFFBK, IOFFBN, IOFFC, J, $ JJC, KK, LDA, LDB, LDC, LDPC, MYCOL, MYROW, $ NPCOL, NPROW REAL EPS, ERRI COMPLEX Z * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, MIN, MOD, REAL, SQRT * .. * .. Statement Functions .. REAL ABS1 ABS1( Z ) = ABS( REAL( Z ) ) + ABS( AIMAG( Z ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) HTRAN = LSAME( TRANS, 'H' ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) * LDA = MAX( 1, DESCA( M_ ) ) LDB = MAX( 1, DESCB( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 140 J = 1, N * IF( UPPER ) THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF * DO 10 I = 1, N CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE * IF( NOTRAN ) THEN DO 30 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA IOFFBK = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 20 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA IOFFBN = IB + I - 1 + ( JB + KK - 2 ) * LDB CT( I ) = CT( I ) + ALPHA * ( $ A( IOFFAN ) * B( IOFFBK ) + $ B( IOFFBN ) * A( IOFFAK ) ) G( I ) = G( I ) + ABS( ALPHA ) * ( $ ABS1( A( IOFFAN ) ) * ABS1( B( IOFFBK ) ) + $ ABS1( B( IOFFBN ) ) * ABS1( A( IOFFAK ) ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRAN ) THEN DO 50 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA IOFFBK = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 40 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA IOFFBN = IB + KK - 1 + ( JB + I - 2 ) * LDB CT( I ) = CT( I ) + ALPHA * ( $ A( IOFFAN ) * B( IOFFBK ) + $ B( IOFFBN ) * A( IOFFAK ) ) G( I ) = G( I ) + ABS( ALPHA ) * ( $ ABS1( A( IOFFAN ) ) * ABS1( B( IOFFBK ) ) + $ ABS1( B( IOFFBN ) ) * ABS1( A( IOFFAK ) ) ) 40 CONTINUE 50 CONTINUE ELSE IF( HTRAN ) THEN DO 70 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA IOFFBK = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 60 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA IOFFBN = IB + I - 1 + ( JB + KK - 2 ) * LDB CT( I ) = CT( I ) + $ ALPHA * A( IOFFAN ) * CONJG( B( IOFFBK ) ) + $ B( IOFFBN ) * CONJG( ALPHA * A( IOFFAK ) ) G( I ) = G( I ) + ABS1( ALPHA ) * ( $ ABS1( A( IOFFAN ) ) * ABS1( B( IOFFBK ) ) + $ ABS1( B( IOFFBN ) ) * ABS1( A( IOFFAK ) ) ) 60 CONTINUE 70 CONTINUE ELSE DO 90 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA IOFFBK = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 80 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA IOFFBN = IB + KK - 1 + ( JB + I - 2 ) * LDB CT( I ) = CT( I ) + $ ALPHA * CONJG( A( IOFFAN ) ) * B( IOFFBK ) + $ CONJG( ALPHA * B( IOFFBN ) ) * A( IOFFAK ) G( I ) = G( I ) + ABS1( ALPHA ) * ( $ ABS1( CONJG( A( IOFFAN ) ) * B( IOFFBK ) ) + $ ABS1( CONJG( B( IOFFBN ) ) * A( IOFFAK ) ) ) 80 CONTINUE 90 CONTINUE END IF * IOFFC = IC + IBEG - 1 + ( JC + J - 2 ) * LDC * DO 100 I = IBEG, IEND CT( I ) = CT( I ) + BETA * C( IOFFC ) G( I ) = G( I ) + ABS1( BETA )*ABS1( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 100 CONTINUE * * Compute the error ratio for this result. * ERR = RZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, N ) IN = IC + IBB - 1 * DO 110 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IC+N-1, DESCC( MB_ ) IBB = MIN( IC+N-I, DESCC( MB_ ) ) * DO 120 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 150 * 140 CONTINUE * 150 CONTINUE * RETURN * * End of PCMMCH2 * END SUBROUTINE PCMMCH3( UPLO, TRANS, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IC, INFO, JA, JC, M, N REAL ERR COMPLEX ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX A( * ), C( * ), PC( * ) * .. * * Purpose * ======= * * PCMMCH3 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrix A has to be * transposed or not before computing the matrix-matrix addi- * tion. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, CTRAN, LOWER, NOTRAN, ROWREP, UPPER INTEGER I, ICCOL, ICROW, ICTXT, IIC, IOFFA, IOFFC, J, $ JJC, LDA, LDC, LDPC, MYCOL, MYROW, NPCOL, $ NPROW REAL ERR0, ERRI, PREC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, $ PCERRAXPBY, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, CONJG, MAX * .. * .. Executable Statements .. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * PREC = PSLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) CTRAN = LSAME( TRANS, 'C' ) * * Compute expected result in C using data in A and C. This part of * the computation is performed by every process in the grid. * INFO = 0 ERR = ZERO * LDA = MAX( 1, DESCA( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) LDPC = MAX( 1, DESCC( LLD_ ) ) ROWREP = ( DESCC( RSRC_ ).EQ.-1 ) COLREP = ( DESCC( CSRC_ ).EQ.-1 ) * IF( NOTRAN ) THEN * DO 20 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( JA - 1 + J - JC ) * LDA * DO 10 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PCERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PCERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PCERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFA = IOFFA + 1 IOFFC = IOFFC + 1 * 10 CONTINUE * 20 CONTINUE * ELSE IF( CTRAN ) THEN * DO 40 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( J - JC ) + ( JA - 1 ) * LDA * DO 30 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PCERRAXPBY( ERRI, ALPHA, CONJG( A( IOFFA ) ), $ BETA, C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PCERRAXPBY( ERRI, ALPHA, CONJG( A( IOFFA ) ), $ BETA, C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PCERRAXPBY( ERRI, ALPHA, CONJG( A( IOFFA ) ), $ BETA, C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFC = IOFFC + 1 IOFFA = IOFFA + LDA * 30 CONTINUE * 40 CONTINUE * ELSE * DO 60 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( J - JC ) + ( JA - 1 ) * LDA * DO 50 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PCERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PCERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PCERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFC = IOFFC + 1 IOFFA = IOFFA + LDA * 50 CONTINUE * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) * RETURN * * End of PCMMCH3 * END SUBROUTINE PCERRAXPBY( ERRBND, ALPHA, X, BETA, Y, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL ERRBND, PREC COMPLEX ALPHA, BETA, X, Y * .. * * Purpose * ======= * * PCERRAXPBY serially computes y := beta*y + alpha * x and returns a * scaled relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * X (global input) COMPLEX * On entry, X specifies the scalar x to be scaled. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. * * Y (global input/global output) COMPLEX * On entry, Y specifies the scalar y to be added. On exit, Y * contains the resulting scalar y. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. REAL ADDBND, FACT, SUMINEG, SUMIPOS, SUMRNEG, $ SUMRPOS COMPLEX TMP * .. * .. Intrinsic Functions .. * .. * .. Executable Statements .. * SUMIPOS = ZERO SUMINEG = ZERO SUMRPOS = ZERO SUMRNEG = ZERO FACT = ONE + TWO * PREC ADDBND = TWO * TWO * TWO * PREC * TMP = ALPHA * X IF( REAL( TMP ).GE.ZERO ) THEN SUMRPOS = SUMRPOS + REAL( TMP ) * FACT ELSE SUMRNEG = SUMRNEG - REAL( TMP ) * FACT END IF IF( AIMAG( TMP ).GE.ZERO ) THEN SUMIPOS = SUMIPOS + AIMAG( TMP ) * FACT ELSE SUMINEG = SUMINEG - AIMAG( TMP ) * FACT END IF * TMP = BETA * Y IF( REAL( TMP ).GE.ZERO ) THEN SUMRPOS = SUMRPOS + REAL( TMP ) * FACT ELSE SUMRNEG = SUMRNEG - REAL( TMP ) * FACT END IF IF( AIMAG( TMP ).GE.ZERO ) THEN SUMIPOS = SUMIPOS + AIMAG( TMP ) * FACT ELSE SUMINEG = SUMINEG - AIMAG( TMP ) * FACT END IF * Y = ( BETA * Y ) + ( ALPHA * X ) * ERRBND = ADDBND * MAX( MAX( SUMRPOS, SUMRNEG ), $ MAX( SUMIPOS, SUMINEG ) ) * RETURN * * End of PCERRAXPBY * END SUBROUTINE PCIPSET( TOGGLE, N, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TOGGLE INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCIPSET sets the imaginary part of the diagonal entries of an n by n * matrix sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). This is used to * test the PBLAS routines for complex Hermitian matrices, which are * either not supposed to access or use the imaginary parts of the dia- * gonals, or supposed to set them to zero. The value used to set the * imaginary part of the diagonals depends on the value of TOGGLE. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TOGGLE (global input) CHARACTER*1 * On entry, TOGGLE specifies the set-value to be used as fol- * lows: * If TOGGLE = 'Z' or 'z', the imaginary part of the diago- * nals are set to zero, * If TOGGLE = 'B' or 'b', the imaginary part of the diago- * nals are set to a large value. * * N (global input) INTEGER * On entry, N specifies the order of sub( A ). N must be at * least zero. * * A (local input/local output) pointer to COMPLEX * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the diagonals of * sub( A ) have been updated as specified by TOGGLE. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, GODOWN, GOLEFT, ROWREP INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP REAL ALPHA, ATMP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, REAL * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.LE.0 ) $ RETURN * IF( LSAME( TOGGLE, 'Z' ) ) THEN ALPHA = ZERO ELSE IF( LSAME( TOGGLE, 'B' ) ) THEN ALPHA = PSLAMCH( ICTXT, 'Epsilon' ) ALPHA = ALPHA / PSLAMCH( ICTXT, 'Safe minimum' ) END IF * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( NP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 ROWREP = ( DESCA2( RSRC_ ).EQ.-1 ) COLREP = ( DESCA2( CSRC_ ).EQ.-1 ) LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( ROWREP ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( COLREP ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA + LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = REAL( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = CMPLX( ATMP, ALPHA ) 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = REAL( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = CMPLX( ATMP, ALPHA ) 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = REAL( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = CMPLX( ATMP, ALPHA ) 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = REAL( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = CMPLX( ATMP, ALPHA ) 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = REAL( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = CMPLX( ATMP, ALPHA ) 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = REAL( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = CMPLX( ATMP, ALPHA ) 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = REAL( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = CMPLX( ATMP, ALPHA ) 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = REAL( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = CMPLX( ATMP, ALPHA ) 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PCIPSET * END REAL FUNCTION PSLAMCH( ICTXT, CMACH ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 CMACH INTEGER ICTXT * .. * * Purpose * ======= * * * .. Local Scalars .. CHARACTER*1 TOP INTEGER IDUMM REAL TEMP * .. * .. External Subroutines .. EXTERNAL PB_TOPGET, SGAMN2D, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH EXTERNAL LSAME, SLAMCH * .. * .. Executable Statements .. * TEMP = SLAMCH( CMACH ) * IF( LSAME( CMACH, 'E' ).OR.LSAME( CMACH, 'S' ).OR. $ LSAME( CMACH, 'M' ).OR.LSAME( CMACH, 'U' ) ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) IDUMM = 0 CALL SGAMX2D( ICTXT, 'All', TOP, 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) ELSE IF( LSAME( CMACH, 'L' ).OR.LSAME( CMACH, 'O' ) ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) IDUMM = 0 CALL SGAMN2D( ICTXT, 'All', TOP, 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) END IF * PSLAMCH = TEMP * RETURN * * End of PSLAMCH * END SUBROUTINE PCLASET( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, JA, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLASET initializes an m by n submatrix A(IA:IA+M-1,JA:JA+N-1) deno- * ted by sub( A ) to beta on the diagonal and alpha on the offdiago- * nals. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the part of the submatrix sub( A ) * to be set: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of sub( A ) is not changed; * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha, i.e., the cons- * tant to which the offdiagonal elements are to be set. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta, i.e., the constant * to which the diagonal elements are to be set. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A to be set. On exit, the * leading m by n submatrix sub( A ) is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * and IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT, ISCOLREP, ISROWREP, LOWER, $ UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, JJA, $ JJMAX, JOFFA, JOFFD, LCMT, LCMT00, LDA, LMBLOC, $ LNBLOC, LOW, M1, MB, MBLKD, MBLKS, MBLOC, MP, $ MRCOL, MRROW, MYCOL, MYROW, N1, NB, NBLKD, $ NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, QNB, TMP1, $ UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CLASET, PB_DESCTRANS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * ISROWREP = ( DESCA2( RSRC_ ).LT.0 ) ISCOLREP = ( DESCA2( CSRC_ ).LT.0 ) LDA = DESCA2( LLD_ ) * UPPER = .NOT.( LSAME( UPLO, 'L' ) ) LOWER = .NOT.( LSAME( UPLO, 'U' ) ) * IF( ( ( LOWER.AND.UPPER ).AND.( ALPHA.EQ.BETA ) ).OR. $ ( ISROWREP .AND. ISCOLREP ) ) THEN IF( ( MP.GT.0 ).AND.( NQ.GT.0 ) ) $ CALL PB_CLASET( UPLO, MP, NQ, 0, ALPHA, BETA, $ A( IIA + ( JJA - 1 ) * LDA ), LDA ) RETURN END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) CALL PB_BINFO( 0, MP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( ISROWREP ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( ISCOLREP ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * M1 = MP N1 = NQ * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_CLASET( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, BETA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_CLASET( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ ALPHA, A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_CLASET( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ ALPHA, A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_CLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_CLASET( UPLO, MBLOC, INBLOC, LCMT, ALPHA, BETA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_CLASET( 'ALL', TMP1, INBLOC, 0, ALPHA, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_CLASET( 'ALL', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_CLASET( 'All', M1, TMP1, 0, ALPHA, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_CLASET( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, BETA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_CLASET( 'All', IMBLOC, TMP1, 0, ALPHA, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_CLASET( 'All', M1, TMP1, 0, ALPHA, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_CLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_CLASET( UPLO, MBLOC, NBLOC, LCMT, ALPHA, BETA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_CLASET( 'All', TMP1, NBLOC, 0, ALPHA, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_CLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * RETURN * * End of PCLASET * END SUBROUTINE PCLASCAL( TYPE, M, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TYPE INTEGER IA, JA, M, N COMPLEX ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLASCAL scales the m by n submatrix A(IA:IA+M-1,JA:JA+N-1) denoted * by sub( A ) by the scalar alpha. TYPE specifies if sub( A ) is full, * upper triangular, lower triangular or upper Hessenberg. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * On entry, TYPE specifies the type of the input submatrix as * follows: * = 'L' or 'l': sub( A ) is a lower triangular matrix, * = 'U' or 'u': sub( A ) is an upper triangular matrix, * = 'H' or 'h': sub( A ) is an upper Hessenberg matrix, * otherwise sub( A ) is a full matrix. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( A ) are overwritten by * the local entries of the m by n scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 UPLO LOGICAL GODOWN, GOLEFT, LOWER, UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, ITYPE, $ IUPP, JJA, JJMAX, JOFFA, JOFFD, LCMT, LCMT00, $ LDA, LMBLOC, LNBLOC, LOW, M1, MB, MBLKD, MBLKS, $ MBLOC, MP, MRCOL, MRROW, MYCOL, MYROW, N1, NB, $ NBLKD, NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, $ QNB, TMP1, UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CLASCAL, PB_DESCTRANS, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_NUMROC EXTERNAL LSAME, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 UPLO = TYPE UPPER = .FALSE. LOWER = .TRUE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 UPLO = TYPE UPPER = .TRUE. LOWER = .FALSE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 UPLO = 'U' UPPER = .TRUE. LOWER = .FALSE. IOFFD = 1 ELSE ITYPE = 0 UPLO = 'A' UPPER = .TRUE. LOWER = .TRUE. IOFFD = 0 END IF * * Compute local indexes * IF( ITYPE.EQ.0 ) THEN * * Full matrix * CALL PB_INFOG2L( IA, JA, DESCA2, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = PB_NUMROC( M, IA, DESCA2( IMB_ ), DESCA2( MB_ ), MYROW, $ DESCA2( RSRC_ ), NPROW ) NQ = PB_NUMROC( N, JA, DESCA2( INB_ ), DESCA2( NB_ ), MYCOL, $ DESCA2( CSRC_ ), NPCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * LDA = DESCA2( LLD_ ) IOFFA = IIA + ( JJA - 1 ) * LDA * CALL PB_CLASCAL( 'All', MP, NQ, 0, ALPHA, A( IOFFA ), LDA ) * ELSE * * Trapezoidal matrix * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, * LNBLOC, ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) LDA = DESCA2( LLD_ ) * CALL PB_BINFO( IOFFD, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * M1 = MP N1 = NQ IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and * update LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_CLASCAL( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_CLASCAL( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_CLASCAL( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_CLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_CLASCAL( UPLO, MBLOC, INBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_CLASCAL( 'All', TMP1, INBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_CLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_CLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_CLASCAL( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_CLASCAL( 'All', IMBLOC, TMP1, 0, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_CLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_CLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_CLASCAL( UPLO, MBLOC, NBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_CLASCAL( 'All', TMP1, NBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_CLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * END IF * RETURN * * End of PCLASCAL * END SUBROUTINE PCLAGEN( INPLACE, AFORM, DIAG, OFFA, M, N, IA, JA, $ DESCA, IASEED, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE CHARACTER*1 AFORM, DIAG INTEGER IA, IASEED, JA, LDA, M, N, OFFA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PCLAGEN generates (or regenerates) a submatrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the generated submatrix is diago- * nally dominant or not as follows: * DIAG = 'D' : sub( A ) is diagonally dominant, * DIAG = 'N' : sub( A ) is not diagonally dominant. * * OFFA (global input) INTEGER * On entry, OFFA specifies the offdiagonal of the underlying * matrix A(1:DESCA(M_),1:DESCA(N_)) of interest when the subma- * trix is symmetric, Hermitian or diagonally dominant. OFFA = 0 * specifies the main diagonal, OFFA > 0 specifies a subdiago- * nal, and OFFA < 0 specifies a superdiagonal (see further de- * tails). * * M (global input) INTEGER * On entry, M specifies the global number of matrix rows of the * submatrix sub( A ) to be generated. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of matrix columns of * the submatrix sub( A ) to be generated. N must be at least * zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IASEED (global input) INTEGER * On entry, IASEED specifies the seed number to generate the * matrix A. IASEED must be at least zero. * * A (local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). On exit, this array contains the * local entries of the randomly generated submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. When INPLACE is .FALSE., LDA is usually DESCA(LLD_). * This restriction is however not enforced, and this subroutine * requires only that LDA >= MAX( 1, Mp ) where * * Mp = PB_NUMROC( M, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ). * * PB_NUMROC is a ScaLAPACK tool function; MYROW, MYCOL, NPROW * and NPCOL can be determined by calling the BLACS subroutine * BLACS_GRIDINFO. * * Further Details * =============== * * OFFD is tied to the matrix described by DESCA, as opposed to the * piece that is currently (re)generated. This is a global information * independent from the distribution parameters. Below are examples of * the meaning of OFFD for a global 7 by 5 matrix: * * --------------------------------------------------------------------- * OFFD | 0 -1 -2 -3 -4 0 -1 -2 -3 -4 0 -1 -2 -3 -4 * -------|------------------------------------------------------------- * | | OFFD=-1 | OFFD=0 OFFD=2 * | V V * 0 | . d . . . -> d . . . . . . . . . * 1 | . . d . . . d . . . . . . . . * 2 | . . . d . . . d . . -> d . . . . * 3 | . . . . d . . . d . . d . . . * 4 | . . . . . . . . . d . . d . . * 5 | . . . . . . . . . . . . . d . * 6 | . . . . . . . . . . . . . . d * --------------------------------------------------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL DIAGDO, SYMM, HERM, NOTRAN INTEGER CSRC, I, IACOL, IAROW, ICTXT, IIA, ILOCBLK, $ ILOCOFF, ILOW, IMB, IMB1, IMBLOC, IMBVIR, INB, $ INB1, INBLOC, INBVIR, INFO, IOFFDA, ITMP, IUPP, $ IVIR, JJA, JLOCBLK, JLOCOFF, JVIR, LCMT00, $ LMBLOC, LNBLOC, LOW, MAXMN, MB, MBLKS, MP, $ MRCOL, MRROW, MYCDIST, MYCOL, MYRDIST, MYROW, $ NB, NBLKS, NPCOL, NPROW, NQ, NVIR, RSRC, UPP COMPLEX ALPHA * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ), IMULADD( 4, JMP_LEN ), $ IRAN( 2 ), JMP( JMP_LEN ), MULADD0( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CHKMAT, PB_CLAGEN, PB_DESCTRANS, PB_INITJMP, $ PB_INITMULADD, PB_JUMP, PB_JUMPIT, PB_LOCINFO, $ PB_SETLOCRAN, PB_SETRAN, PCLADOM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, REAL * .. * .. Data Statements .. DATA ( MULADD0( I ), I = 1, 4 ) / 20077, 16838, $ 12345, 0 / * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Test the input arguments * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1000 + CTXT_ ) ELSE SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) NOTRAN = LSAME( AFORM, 'N' ) DIAGDO = LSAME( DIAG, 'D' ) IF( .NOT.( SYMM.OR.HERM.OR.NOTRAN ) .AND. $ .NOT.( LSAME( AFORM, 'T' ) ) .AND. $ .NOT.( LSAME( AFORM, 'C' ) ) ) THEN INFO = -2 ELSE IF( ( .NOT.DIAGDO ) .AND. $ ( .NOT.LSAME( DIAG, 'N' ) ) ) THEN INFO = -3 END IF CALL PB_CHKMAT( ICTXT, M, 5, N, 6, IA, JA, DESCA2, 10, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PCLAGEN', -INFO ) RETURN END IF * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) IMB = DESCA2( IMB_ ) INB = DESCA2( INB_ ) RSRC = DESCA2( RSRC_ ) CSRC = DESCA2( CSRC_ ) * * Figure out local information about the distributed matrix operand * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * IOFFDA = JA + OFFA - IA CALL PB_BINFO( IOFFDA, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * * Initialize ILOCBLK, ILOCOFF, MYRDIST, JLOCBLK, JLOCOFF, MYCDIST * This values correspond to the square virtual underlying matrix * of size MAX( M_ + MAX( 0, -OFFA ), N_ + MAX( 0, OFFA ) ) used * to set up the random sequence. For practical purposes, the size * of this virtual matrix is upper bounded by M_ + N_ - 1. * ITMP = MAX( 0, -OFFA ) IVIR = IA + ITMP IMBVIR = IMB + ITMP NVIR = DESCA2( M_ ) + ITMP * CALL PB_LOCINFO( IVIR, IMBVIR, MB, MYROW, RSRC, NPROW, ILOCBLK, $ ILOCOFF, MYRDIST ) * ITMP = MAX( 0, OFFA ) JVIR = JA + ITMP INBVIR = INB + ITMP NVIR = MAX( MAX( NVIR, DESCA2( N_ ) + ITMP ), $ DESCA2( M_ ) + DESCA2( N_ ) - 1 ) * CALL PB_LOCINFO( JVIR, INBVIR, NB, MYCOL, CSRC, NPCOL, JLOCBLK, $ JLOCOFF, MYCDIST ) * IF( SYMM .OR. HERM .OR. NOTRAN ) THEN * CALL PB_INITJMP( .TRUE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 2, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_CLAGEN( 'Lower', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( SYMM .OR. HERM .OR. ( .NOT. NOTRAN ) ) THEN * CALL PB_INITJMP( .FALSE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 2, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_CLAGEN( 'Upper', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( DIAGDO ) THEN * MAXMN = MAX( DESCA2( M_ ), DESCA2( N_ ) ) IF( HERM ) THEN ALPHA = CMPLX( REAL( 2 * MAXMN ), ZERO ) ELSE ALPHA = CMPLX( REAL( MAXMN ), REAL( MAXMN ) ) END IF * IF( IOFFDA.GE.0 ) THEN CALL PCLADOM( INPLACE, MIN( MAX( 0, M-IOFFDA ), N ), ALPHA, $ A, MIN( IA+IOFFDA, IA+M-1 ), JA, DESCA ) ELSE CALL PCLADOM( INPLACE, MIN( M, MAX( 0, N+IOFFDA ) ), ALPHA, $ A, IA, MIN( JA-IOFFDA, JA+N-1 ), DESCA ) END IF * END IF * RETURN * * End of PCLAGEN * END SUBROUTINE PCLADOM( INPLACE, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE INTEGER IA, JA, N COMPLEX ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX A( * ) * .. * * Purpose * ======= * * PCLADOM adds alpha to the diagonal entries of an n by n submatrix * sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * N (global input) INTEGER * On entry, N specifies the global order of the submatrix * sub( A ) to be modified. N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the local entries * of this array corresponding to the main diagonal of sub( A ) * have been updated. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP COMPLEX ATMP * .. * .. Local Scalars .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, MAX, MIN, REAL * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.EQ.0 ) $ RETURN * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) * CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA+LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ CMPLX( ABS( REAL( ATMP ) ), $ ABS( AIMAG( ATMP ) ) ) 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PCLADOM * END SUBROUTINE PB_PCLAPRNT( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * Purpose * ======= * * PB_PCLAPRNT prints to the standard output a submatrix sub( A ) deno- * ting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and printed by * the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the row index of the printing pro- * cess. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the column index of the printing * process. * * CMATNM (global input) CHARACTER*(*) * On entry, CMATNM is the name of the matrix to be printed. * * NOUT (global input) INTEGER * On entry, NOUT specifies the output unit number. When NOUT is * equal to 6, the submatrix is printed on the screen. * * WORK (local workspace) COMPLEX array * On entry, WORK is a work array of dimension at least equal to * MAX( IMB_A, MB_A ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW, PCOL, PROW * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_DESCTRANS, PB_PCLAPRN2 * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * CALL BLACS_GRIDINFO( DESCA2( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( DESCA2( RSRC_ ).GE.0 ) THEN IF( DESCA2( CSRC_ ).GE.0 ) THEN CALL PB_PCLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, ICPRNT, $ CMATNM, NOUT, DESCA2( RSRC_ ), $ DESCA2( CSRC_ ), WORK ) ELSE DO 10 PCOL = 0, NPCOL - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Colum-replicated array -- ' , $ 'copy in process column: ', PCOL CALL PB_PCLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, DESCA2( RSRC_ ), $ PCOL, WORK ) 10 CONTINUE END IF ELSE IF( DESCA2( CSRC_ ).GE.0 ) THEN DO 20 PROW = 0, NPROW - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Row-replicated array -- ' , $ 'copy in process row: ', PROW CALL PB_PCLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, PROW, $ DESCA2( CSRC_ ), WORK ) 20 CONTINUE ELSE DO 40 PROW = 0, NPROW - 1 DO 30 PCOL = 0, NPCOL - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Replicated array -- ' , $ 'copy in process (', PROW, ',', PCOL, ')' CALL PB_PCLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, PROW, PCOL, $ WORK ) 30 CONTINUE 40 CONTINUE END IF END IF * RETURN * * End of PB_PCLAPRNT * END SUBROUTINE PB_PCLAPRN2( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, PROW, PCOL, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT, PCOL, PROW * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) COMPLEX A( * ), WORK( * ) * .. * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL AISCOLREP, AISROWREP INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, LDW, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, CGERV2D, $ CGESD2D, PB_INFOG2L * .. * .. Intrinsic Functions .. INTRINSIC AIMAG, MIN, REAL * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL PB_INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) II = IIA JJ = JJA IF( DESCA( RSRC_ ).LT.0 ) THEN AISROWREP = .TRUE. IAROW = PROW ICURROW = PROW ELSE AISROWREP = .FALSE. ICURROW = IAROW END IF IF( DESCA( CSRC_ ).LT.0 ) THEN AISCOLREP = .TRUE. IACOL = PCOL ICURCOL = PCOL ELSE AISCOLREP = .FALSE. ICURCOL = IACOL END IF LDA = DESCA( LLD_ ) LDW = MAX( DESCA( IMB_ ), DESCA( MB_ ) ) * * Handle the first block of column separately * JB = DESCA( INB_ ) - JA + 1 IF( JB.LE.0 ) $ JB = ( (-JB) / DESCA( NB_ ) + 1 ) * DESCA( NB_ ) + JB JB = MIN( JB, N ) JN = JA+JB-1 DO 60 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( (-IB) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA+IB-1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, JA+H, $ REAL( A(II+K+(JJ+H-1)*LDA) ), $ AIMAG( A(II+K+(JJ+H-1)*LDA) ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, JA+H, REAL( WORK( K ) ), $ AIMAG( WORK( K ) ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, JA+H, $ REAL( A( II+K+(JJ+H-1)*LDA ) ), $ AIMAG( A( II+K+(JJ+H-1)*LDA ) ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, JA+H, REAL( WORK( K ) ), $ AIMAG( WORK( K ) ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB IF( .NOT.AISCOLREP ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IB = DESCA( IMB_ )-IA+1 IF( IB.LE.0 ) $ IB = ( (-IB) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA+IB-1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, J+H, $ REAL( A( II+K+(JJ+H-1)*LDA ) ), $ AIMAG( A( II+K+(JJ+H-1)*LDA ) ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, J+H, REAL( WORK( K ) ), $ AIMAG( WORK( K ) ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, J+H, $ REAL( A( II+K+(JJ+H-1)*LDA ) ), $ AIMAG( A( II+K+(JJ+H-1)*LDA ) ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL CGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL CGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, J+H, REAL( WORK( K ) ), $ AIMAG( WORK( K ) ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB IF( .NOT.AISCOLREP ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * 9999 FORMAT( 1X, A, '(', I6, ',', I6, ')=', E16.8, '+i*(', $ E16.8, ')' ) * RETURN * * End of PB_PCLAPRN2 * END SUBROUTINE PB_CFILLPAD( ICTXT, M, N, A, LDA, IPRE, IPOST, CHKVAL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N COMPLEX CHKVAL * .. * .. Array Arguments .. COMPLEX A( * ) * .. * * Purpose * ======= * * PB_CFILLPAD surrounds a two dimensional local array with a guard-zone * initialized to the value CHKVAL. The user may later call the routine * PB_CCHEKPAD to discover if the guardzone has been violated. There are * three guardzones. The first is a buffer of size IPRE that is before * the start of the array. The second is the buffer of size IPOST which * is after the end of the array to be padded. Finally, there is a guard * zone inside every column of the array to be padded, in the elements * of A(M+1:LDA, J). * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * M (local input) INTEGER * On entry, M specifies the number of rows in the local array * A. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns in the local ar- * ray A. N must be at least zero. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LDA,N). On exit, this * array is the padded array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array to be padded. LDA must be at least MAX( 1, M ). * * IPRE (local input) INTEGER * On entry, IPRE specifies the size of the guard zone to put * before the start of the padded array. * * IPOST (local input) INTEGER * On entry, IPOST specifies the size of the guard zone to put * after the end of the padded array. * * CHKVAL (local input) COMPLEX * On entry, CHKVAL specifies the value to pad the array with. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE A( I ) = CHKVAL 10 CONTINUE ELSE WRITE( *, FMT = '(A)' ) $ 'WARNING no pre-guardzone in PB_CFILLPAD' END IF * * Put check buffer in back of A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 A( I ) = CHKVAL 20 CONTINUE ELSE WRITE( *, FMT = '(A)' ) $ 'WARNING no post-guardzone in PB_CFILLPAD' END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + ( LDA - M ) - 1 A( I ) = CHKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * RETURN * * End of PB_CFILLPAD * END SUBROUTINE PB_CCHEKPAD( ICTXT, MESS, M, N, A, LDA, IPRE, IPOST, $ CHKVAL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N COMPLEX CHKVAL * .. * .. Array Arguments .. CHARACTER*(*) MESS COMPLEX A( * ) * .. * * Purpose * ======= * * PB_CCHEKPAD checks that the padding around a local array has not been * overwritten since the call to PB_CFILLPAD. Three types of errors are * reported: * * 1) Overwrite in pre-guardzone. This indicates a memory overwrite has * occurred in the first IPRE elements which form a buffer before the * beginning of A. Therefore, the error message: * 'Overwrite in pre-guardzone: loc( 5) = 18.00000' * tells that the 5th element of the IPRE long buffer has been overwrit- * ten with the value 18, where it should still have the value CHKVAL. * * 2) Overwrite in post-guardzone. This indicates a memory overwrite has * occurred in the last IPOST elements which form a buffer after the end * of A. Error reports are refered from the end of A. Therefore, * 'Overwrite in post-guardzone: loc( 19) = 24.00000' * tells that the 19th element after the end of A was overwritten with * the value 24, where it should still have the value of CHKVAL. * * 3) Overwrite in lda-m gap. Tells you elements between M and LDA were * overwritten. So, * 'Overwrite in lda-m gap: A( 12, 3) = 22.00000' * tells that the element at the 12th row and 3rd column of A was over- * written with the value of 22, where it should still have the value of * CHKVAL. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * MESS (local input) CHARACTER*(*) * On entry, MESS is a ttring containing a user-defined message. * * M (local input) INTEGER * On entry, M specifies the number of rows in the local array * A. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns in the local ar- * ray A. N must be at least zero. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LDA,N). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array to be padded. LDA must be at least MAX( 1, M ). * * IPRE (local input) INTEGER * On entry, IPRE specifies the size of the guard zone to put * before the start of the padded array. * * IPOST (local input) INTEGER * On entry, IPOST specifies the size of the guard zone to put * after the end of the padded array. * * CHKVAL (local input) COMPLEX * On entry, CHKVAL specifies the value to pad the array with. * * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. CHARACTER*1 TOP INTEGER I, IAM, IDUMM, INFO, J, K, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, PB_TOPGET * .. * .. Intrinsic Functions .. INTRINSIC AIMAG, REAL * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW*NPCOL + MYCOL INFO = -1 * * Check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, ' pre', I, $ REAL( A( I ) ), AIMAG( A( I ) ) INFO = IAM END IF 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PB_CCHEKPAD' END IF * * Check buffer after A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, 'post', $ I-J+1, REAL( A( I ) ), $ AIMAG( A( I ) ) INFO = IAM END IF 20 CONTINUE ELSE WRITE( *, FMT = * ) $ 'WARNING no post-guardzone buffer in PB_CCHEKPAD' END IF * * Check all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9997 ) MYROW, MYCOL, MESS, $ I-IPRE-LDA*(J-1), J, REAL( A( I ) ), $ AIMAG( A( I ) ) INFO = IAM END IF 30 CONTINUE K = K + LDA 40 CONTINUE END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, IDUMM, IDUMM, -1, $ 0, 0 ) IF( IAM.EQ.0 .AND. INFO.GE.0 ) THEN WRITE( *, FMT = 9999 ) INFO / NPCOL, MOD( INFO, NPCOL ), MESS END IF * 9999 FORMAT( '{', I5, ',', I5, '}: Memory overwrite in ', A ) 9998 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ A4, '-guardzone: loc(', I3, ') = ', G11.4, '+ i*', $ G11.4 ) 9997 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ 'lda-m gap: loc(', I3, ',', I3, ') = ', G11.4, $ '+ i*', G11.4 ) * RETURN * * End of PB_CCHEKPAD * END SUBROUTINE PB_CLASET( UPLO, M, N, IOFFD, ALPHA, BETA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PB_CLASET initializes a two-dimensional array A to beta on the diago- * nal specified by IOFFD and alpha on the offdiagonals. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be set as follows: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of A is not changed, * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of A is not changed, * = 'D' or 'd' Only the diagonal of A is set, * Otherwise: All of the array A is set. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the value to which the offdiagonal * array elements are set to. * * BETA (input) COMPLEX * On entry, BETA specifies the value to which the diagonal ar- * ray elements are set to. * * A (input/output) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be set, and the strictly lower trape- * zoidal part of A is not referenced; When IUPLO = 'L' or 'l', * the leading m by n part of the array A must contain the * lower trapezoidal part of the matrix as specified by IOFFD to * be set, and the strictly upper trapezoidal part of A is * not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Set the diagonal to BETA and the strictly lower triangular * part of the array to ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = BETA DO 30 I = JTMP + 1, M A( I, J ) = ALPHA 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Set the diagonal to BETA and the strictly upper triangular * part of the array to ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 50 I = 1, JTMP - 1 A( I, J ) = ALPHA 50 CONTINUE A( JTMP, J ) = BETA 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Set the array to BETA on the diagonal. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 90 CONTINUE * ELSE * * Set the array to BETA on the diagonal and ALPHA on the * offdiagonal. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA 100 CONTINUE 110 CONTINUE IF( ALPHA.NE.BETA .AND. IOFFD.LT.M .AND. IOFFD.GT.-N ) THEN DO 120 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 120 CONTINUE END IF * END IF * RETURN * * End of PB_CLASET * END SUBROUTINE PB_CLASCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX ALPHA * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PB_CLASCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * A (input/output) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA * A( I, J ) 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = ALPHA * A( I, J ) 50 CONTINUE 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA * A( I, J ) 100 CONTINUE 110 CONTINUE * END IF * RETURN * * End of PB_CLASCAL * END SUBROUTINE PB_CLAGEN( UPLO, AFORM, A, LDA, LCMT00, IRAN, MBLKS, $ IMBLOC, MB, LMBLOC, NBLKS, INBLOC, NB, $ LNBLOC, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO, AFORM INTEGER IMBLOC, INBLOC, LCMT00, LDA, LMBLOC, LNBLOC, $ MB, MBLKS, NB, NBLKS * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PB_CLAGEN locally initializes an array A. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the lower (UPLO='L') trape- * zoidal part or the upper (UPLO='U') trapezoidal part is to be * generated when the matrix to be generated is symmetric or * Hermitian. For all the other values of AFORM, the value of * this input argument is ignored. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * A (local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, *). On exit, * this array contains the local entries of the randomly genera- * ted submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. LDA must be at least one. * * LCMT00 (global input) INTEGER * On entry, LCMT00 is the LCM value specifying the off-diagonal * of the underlying matrix of interest. LCMT00=0 specifies the * main diagonal, LCMT00 > 0 specifies a subdiagonal, LCMT00 < 0 * specifies superdiagonals. * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2 containing respec- * tively the 16-lower and 16-higher bits of the encoding of the * entry of the random sequence corresponding locally to the * first local array entry to generate. Usually, this array is * computed by PB_SETLOCRAN. * * MBLKS (local input) INTEGER * On entry, MBLKS specifies the local number of blocks of rows. * MBLKS is at least zero. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * LMBLOC (local input) INTEGER * On entry, LMBLOC specifies the number of rows (size) of the * local lowest blocks. LMBLOC is at least zero. * * NBLKS (local input) INTEGER * On entry, NBLKS specifies the local number of blocks of co- * lumns. NBLKS is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * LNBLOC (local input) INTEGER * On entry, LNBLOC specifies the number of columns (size) of * the local rightmost blocks. LNBLOC is at least zero. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the random matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IB, IBLK, II, IK, ITMP, JB, JBLK, JJ, JK, $ JTMP, LCMTC, LCMTR, LOW, MNB, UPP COMPLEX DUMMY * .. * .. Local Arrays .. INTEGER IB0( 2 ), IB1( 2 ), IB2( 2 ), IB3( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMPIT * .. * .. External Functions .. LOGICAL LSAME REAL PB_SRAND EXTERNAL LSAME, PB_SRAND * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, REAL * .. * .. Executable Statements .. * DO 10 I = 1, 2 IB1( I ) = IRAN( I ) IB2( I ) = IRAN( I ) IB3( I ) = IRAN( I ) 10 CONTINUE * IF( LSAME( AFORM, 'N' ) ) THEN * * Generate random matrix * JJ = 1 * DO 50 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * DO 40 JK = JJ, JJ + JB - 1 * II = 1 * DO 30 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * * Blocks are IB by JB * DO 20 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), PB_SRAND( 0 ) ) 20 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 30 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 40 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 50 CONTINUE * ELSE IF( LSAME( AFORM, 'T' ) ) THEN * * Generate the transpose of the matrix that would be normally * generated. * II = 1 * DO 90 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 80 IK = II, II + IB - 1 * JJ = 1 * DO 70 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 60 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), PB_SRAND( 0 ) ) 60 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 70 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 80 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 90 CONTINUE * ELSE IF( LSAME( AFORM, 'S' ) ) THEN * * Generate a symmetric matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 170 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 160 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 150 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 100 IK = II, II + IB - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 100 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 110 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 110 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 120 IK = II, ITMP - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 120 CONTINUE * DO 130 IK = ITMP, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 130 CONTINUE * END IF * ELSE * DO 140 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 140 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 150 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 160 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 170 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 250 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 240 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 230 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 180 JK = JJ, JJ + JB - 1 DUMMY = CMPLX( PB_SRAND( 0 ), PB_SRAND( 0 ) ) 180 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 190 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 190 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 200 JK = JJ, JTMP - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 200 CONTINUE * DO 210 JK = JTMP, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 210 CONTINUE * END IF * ELSE * DO 220 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 220 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 230 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 240 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 250 CONTINUE * END IF * ELSE IF( LSAME( AFORM, 'C' ) ) THEN * * Generate the conjugate transpose of the matrix that would be * normally generated. * II = 1 * DO 290 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 280 IK = II, II + IB - 1 * JJ = 1 * DO 270 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 260 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 260 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 270 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 280 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 290 CONTINUE * ELSE IF( LSAME( AFORM, 'H' ) ) THEN * * Generate a Hermitian matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 370 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 360 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 350 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 300 IK = II, II + IB - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 300 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 310 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 310 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 320 IK = II, ITMP - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 320 CONTINUE * IF( ITMP.LE.( II + IB - 1 ) ) THEN DUMMY = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) A( ITMP, JK ) = CMPLX( REAL( DUMMY ), $ ZERO ) END IF * DO 330 IK = ITMP + 1, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 330 CONTINUE * END IF * ELSE * DO 340 IK = II, II + IB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ PB_SRAND( 0 ) ) 340 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 350 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 360 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 370 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 450 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 440 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 430 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 380 JK = JJ, JJ + JB - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 380 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 390 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 390 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 400 JK = JJ, JTMP - 1 DUMMY = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 400 CONTINUE * IF( JTMP.LE.( JJ + JB - 1 ) ) THEN DUMMY = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) A( IK, JTMP ) = CMPLX( REAL( DUMMY ), $ ZERO ) END IF * DO 410 JK = JTMP + 1, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 410 CONTINUE * END IF * ELSE * DO 420 JK = JJ, JJ + JB - 1 A( IK, JK ) = CMPLX( PB_SRAND( 0 ), $ -PB_SRAND( 0 ) ) 420 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 430 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 440 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 450 CONTINUE * END IF * END IF * RETURN * * End of PB_CLAGEN * END REAL FUNCTION PB_SRAND( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_SRAND generates the next number in the random sequence. This func- * tion ensures that this number will be in the interval ( -1.0, 1.0 ). * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. External Functions .. REAL PB_SRAN EXTERNAL PB_SRAN * .. * .. Executable Statements .. * PB_SRAND = ONE - TWO * PB_SRAN( IDUMM ) * RETURN * * End of PB_SRAND * END REAL FUNCTION PB_SRAN( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_SRAN generates the next number in the random sequence. * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL DIVFAC, POW16 PARAMETER ( DIVFAC = 2.147483648E+9, $ POW16 = 6.5536E+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Intrinsic Functions .. INTRINSIC REAL * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * PB_SRAN = ( REAL( IRAND( 1 ) ) + POW16 * REAL( IRAND( 2 ) ) ) / $ DIVFAC * CALL PB_LMUL( IRAND, IACS, J ) CALL PB_LADD( J, IACS( 3 ), IRAND ) * RETURN * * End of PB_SRAN * END scalapack-2.0.2/PBLAS/TESTING/PDBLAS1TST.dat000644 000766 000024 00000002737 10363532303 020011 0ustar00juliestaff000000 000000 'Level 1 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PDBLAS1TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q 2.0D0 value of ALPHA 4 number of tests problems 14 44 28 7 values of N 36 9 39 53 values of M_X 24 67 3 12 values of N_X 2 5 2 5 values of IMB_X 2 5 2 5 values of INB_X 2 5 2 5 values of MB_X 2 5 2 5 values of NB_X 0 0 0 0 values of RSRC_X 0 0 0 0 values of CSRC_X 5 3 1 1 values of IX 2 6 1 1 values of JX 1 9 1 53 values of INCX 2 6 35 14 values of M_Y 27 55 43 12 values of N_Y 2 5 2 5 values of IMB_Y 2 5 2 5 values of INB_Y 2 5 2 5 values of MB_Y 2 5 2 5 values of NB_Y 0 0 0 0 values of RSRC_Y 0 0 0 0 values of CSRC_Y 1 4 1 1 values of IY 7 6 1 1 values of JY 2 6 35 1 values of INCY PDSWAP T put F for no test in the same column PDSCAL T put F for no test in the same column PDCOPY T put F for no test in the same column PDAXPY T put F for no test in the same column PDDOT T put F for no test in the same column PDNRM2 T put F for no test in the same column PDASUM T put F for no test in the same column PDAMAX T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/pdblas1tst.f000644 000766 000024 00000364110 11750130340 020116 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PDSWAP ', 'PDSCAL ', 'PDCOPY ', $ 'PDAXPY ', 'PDDOT ', 'PDNRM2 ', $ 'PDASUM ', 'PDAMAX '/ END BLOCK DATA PROGRAM PDBLA1TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PDBLA1TST is the main testing program for the PBLAS Level 1 routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 44 lines: * 'Level 1 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PDBLAS1TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0D0 value of ALPHA * 2 number of tests problems * 3 4 values of N * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PDSWAP T put F for no test in the same column * PDSCAL T put F for no test in the same column * PDCOPY T put F for no test in the same column * PDAXPY T put F for no test in the same column * PDDOT T put F for no test in the same column * PDNRM2 T put F for no test in the same column * PDASUM T put F for no test in the same column * PDAMAX T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * DBLESZ indicates the length in bytes on the given platform * for a double precision real. By default, DBLESZ is set to * eight. * * MEM DOUBLE PRECISION array * MEM is an array of dimension TOTMEM / DBLESZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, DBLESZ, TOTMEM, $ MEMSIZ, NSUBS DOUBLE PRECISION PADVAL, ZERO PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ DBLESZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, ZERO = 0.0D+0, $ PADVAL = -9923.0D+0, NSUBS = 8 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE INTEGER CSRCX, CSRCY, I, IAM, ICTXT, IGAP, IMBX, IMBY, $ IMIDX, IMIDY, INBX, INBY, INCX, INCY, IPMATX, $ IPMATY, IPOSTX, IPOSTY, IPREX, IPREY, IPW, IPX, $ IPY, IVERB, IX, IXSEED, IY, IYSEED, J, JX, JY, $ K, LDX, LDY, MBX, MBY, MEMREQD, MPX, MPY, MX, $ MY, MYCOL, MYROW, N, NBX, NBY, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQX, NQY, NTESTS, NX, NY, $ PISCLR, RSRCX, RSRCY, TSKIP, TSTCNT DOUBLE PRECISION ALPHA, PSCLR, PUSCLR * .. * .. Local Arrays .. CHARACTER*80 OUTFILE LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) INTEGER CSCXVAL( MAXTESTS ), CSCYVAL( MAXTESTS ), $ DESCX( DLEN_ ), DESCXR( DLEN_ ), $ DESCY( DLEN_ ), DESCYR( DLEN_ ), IERR( 4 ), $ IMBXVAL( MAXTESTS ), IMBYVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JXVAL( MAXTESTS ), JYVAL( MAXTESTS ), $ KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MXVAL( MAXTESTS ), $ MYVAL( MAXTESTS ), NBXVAL( MAXTESTS ), $ NBYVAL( MAXTESTS ), NVAL( MAXTESTS ), $ NXVAL( MAXTESTS ), NYVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) DOUBLE PRECISION MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_DCHEKPAD, PB_DESCSET2, PB_DFILLPAD, $ PB_PDLAPRNT, PDAMAX, PDASUM, PDAXPY, $ PDBLA1TSTINFO, PDBLAS1TSTCHK, PDBLAS1TSTCHKE, $ PDCHKARG1, PDCHKVOUT, PDCOPY, PDDOT, PDLAGEN, $ PDMPRNT, PDNRM2, PDSCAL, PDSWAP, PDVPRNT, $ PVDESCCHK, PVDIMCHK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .FALSE., .TRUE., .TRUE., .TRUE., $ .FALSE., .FALSE., .FALSE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler will abort on errors. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IXSEED = 100 IYSEED = 200 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDBLA1TSTINFO( OUTFILE, NOUT, NTESTS, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, RSCXVAL, $ CSCXVAL, IXVAL, JXVAL, INCXVAL, MYVAL, $ NYVAL, IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, INCYVAL, $ MAXTESTS, NGRIDS, PVAL, MAXGRIDS, QVAL, $ MAXGRIDS, LTEST, SOF, TEE, IAM, IGAP, IVERB, $ NPROCS, ALPHA, MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PDBLAS1TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * N = NVAL( J ) MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN TSTCNT = TSTCNT + 1 WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY WRITE( NOUT, FMT = 9995 ) END IF * * Check the validity of the input and initialize DESC_ * CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, IGAP, GAPMUL, $ IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, IGAP, GAPMUL, $ IERR( 2 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDX = MAX( 1, MX ) LDY = MAX( 1, MY ) * * Assign pointers into MEM for matrices corresponding to * vectors X and Y. Ex: IPX starts at position MEM( IPREX+1 ). * IPX = IPREX + 1 IPY = IPX + DESCX( LLD_ ) * NQX + IPOSTX + IPREY IPMATX = IPY + DESCY( LLD_ ) * NQY + IPOSTY IPMATY = IPMATX + MX * NX IPW = IPMATY + MY * NY * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * MEMREQD = IPW - 1 + $ MAX( MAX( IMBX, MBX ), MAX( IMBY, MBY ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) MEMREQD*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 1 routines * DO 30 K = 1, NSUBS * * Continue only if this sub has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9989 ) SNAMES( K ) END IF * * Check the validity of the operand sizes * CALL PVDIMCHK( ICTXT, NOUT, N, 'X', IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, N, 'Y', IY, JY, DESCY, INCY, $ IERR( 2 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 GO TO 30 END IF * * Generate distributed matrices X and Y * CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCXR, MX, NX, IMBX, INBX, MBX, NBX, $ -1, -1, ICTXT, MAX( 1, MX ) ) CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCXR, IXSEED, MEM( IPMATX ), $ DESCXR( LLD_ ) ) IF( YCHECK( K ) ) THEN CALL PB_DESCSET2( DESCYR, MY, NY, IMBY, INBY, MBY, $ NBY, -1, -1, ICTXT, MAX( 1, MY ) ) CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCYR, IYSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) END IF * * Pad the guard zones of X, and Y * CALL PB_DFILLPAD( ICTXT, MPX, NQX, MEM( IPX-IPREX ), $ DESCX( LLD_ ), IPREX, IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_DFILLPAD( ICTXT, MPY, NQY, MEM( IPY-IPREY ), $ DESCY( LLD_ ), IPREY, IPOSTY, $ PADVAL ) END IF * * Initialize the check for INPUT only args. * INFO = 0 CALL PDCHKARG1( ICTXT, NOUT, SNAMES( K ), N, ALPHA, IX, $ JX, DESCX, INCX, IY, JY, DESCY, INCY, $ INFO ) * INFO = 0 PSCLR = ZERO PUSCLR = ZERO PISCLR = 0 * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PDLAPRNT( 1, N, MEM( IPX ), IX, JX, DESCX, $ 0, 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) ELSE CALL PB_PDLAPRNT( N, 1, MEM( IPX ), IX, JX, DESCX, $ 0, 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) END IF IF( YCHECK( K ) ) THEN IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PDLAPRNT( 1, N, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) ELSE CALL PB_PDLAPRNT( N, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) END IF END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PDLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, 0, $ 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) IF( YCHECK( K ) ) $ CALL PB_PDLAPRNT( MY, NY, MEM( IPY ), 1, 1, DESCY, $ 0, 0, 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) END IF * * Call the PBLAS routine * IF( K.EQ.1 ) THEN * * Test PDSWAP * CALL PDSWAP( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.2 ) THEN * * Test PDSCAL * PSCLR = ALPHA CALL PDSCAL( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.3 ) THEN * * Test PDCOPY * CALL PDCOPY( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.4 ) THEN * * Test PDAXPY * PSCLR = ALPHA CALL PDAXPY( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.5 ) THEN * * Test PDDOT * CALL PDDOT( N, PSCLR, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.6 ) THEN * * Test PDNRM2 * CALL PDNRM2( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.7 ) THEN * * Test PDASUM * CALL PDASUM( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.8 ) THEN * CALL PDAMAX( N, PSCLR, PISCLR, MEM( IPX ), IX, JX, $ DESCX, INCX ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9978 ) INFO GO TO 30 END IF * * Check the computations * CALL PDBLAS1TSTCHK( ICTXT, NOUT, K, N, PSCLR, PUSCLR, $ PISCLR, MEM( IPMATX ), MEM( IPX ), $ IX, JX, DESCX, INCX, MEM( IPMATY ), $ MEM( IPY ), IY, JY, DESCY, INCY, $ INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 END IF * * Check padding * CALL PB_DCHEKPAD( ICTXT, SNAMES( K ), MPX, NQX, $ MEM( IPX-IPREX ), DESCX( LLD_ ), $ IPREX, IPOSTX, PADVAL ) IF( YCHECK( K ) ) THEN CALL PB_DCHEKPAD( ICTXT, SNAMES( K ), MPY, NQY, $ MEM( IPY-IPREY ), DESCY( LLD_ ), $ IPREY, IPOSTY, PADVAL ) END IF * * Check input-only scalar arguments * INFO = 1 CALL PDCHKARG1( ICTXT, NOUT, SNAMES( K ), N, ALPHA, IX, $ JX, DESCX, INCX, IY, JY, DESCY, INCY, $ INFO ) * * Check input-only array arguments * CALL PDCHKVOUT( N, MEM( IPMATX ), MEM( IPX ), IX, JX, $ DESCX, INCX, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) 'PARALLEL_X', SNAMES( K ) END IF * IF( YCHECK( K ) ) THEN CALL PDCHKVOUT( N, MEM( IPMATY ), MEM( IPY ), IY, JY, $ DESCY, INCY, IERR( 4 ) ) IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) 'PARALLEL_Y', $ SNAMES( K ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE. 0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) SNAMES( K ) KFAIL( K ) = KFAIL( K ) + 1 ERRFLG = .TRUE. ELSE IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) SNAMES( K ) KPASS( K ) = KPASS( K ) + 1 END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 3 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PDMPRNT( ICTXT, NOUT, MX, NX, MEM( IPMATX ), $ LDX, 0, 0, 'SERIAL_X' ) CALL PB_PDLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, $ 0, 0, 'PARALLEL_X', NOUT, $ MEM( IPMATX ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( N.GT.0 ) $ CALL PDVPRNT( ICTXT, NOUT, N, $ MEM( IPMATX+IX-1+(JX-1)*LDX ), $ INCX, 0, 0, 'SERIAL_X' ) IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PDLAPRNT( 1, N, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PDLAPRNT( N, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) END IF END IF IF( YCHECK( K ) ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PDMPRNT( ICTXT, NOUT, MY, NY, $ MEM( IPMATY ), LDY, 0, 0, $ 'SERIAL_Y' ) CALL PB_PDLAPRNT( MY, NY, MEM( IPY ), 1, 1, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( N.GT.0 ) $ CALL PDVPRNT( ICTXT, NOUT, N, $ MEM( IPMATY+IY-1+(JY-1)*LDY ), $ INCY, 0, 0, 'SERIAL_Y' ) IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PDLAPRNT( 1, N, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PDLAPRNT( N, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) END IF END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9984 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, '---------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' N IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9993 FORMAT( 2X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I5,1X,I5,1X,I5,1X,I5,1X, $ I5,1X,I5,1X,I6 ) 9992 FORMAT( 2X, ' N IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9991 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9990 FORMAT( 'Not enough memory. Need: ', I12 ) 9989 FORMAT( 2X, ' Tested Subroutine: ', A ) 9988 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9987 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9986 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9985 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9984 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9983 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9982 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9981 FORMAT( 2X, 'Testing Summary') 9980 FORMAT( 2X, 'End of Tests.' ) 9979 FORMAT( 2X, 'Tests started.' ) 9978 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PDBLA1TST * END SUBROUTINE PDBLA1TSTINFO( SUMMRY, NOUT, NMAT, NVAL, MXVAL, $ NXVAL, IMBXVAL, MBXVAL, INBXVAL, $ NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, $ CSCYVAL, IYVAL, JYVAL, INCYVAL, $ LDVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, LTEST, SOF, TEE, IAM, IGAP, $ IVERB, NPROCS, ALPHA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, $ NGRIDS, NMAT, NOUT, NPROCS DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY LOGICAL LTEST( * ) INTEGER CSCXVAL( LDVAL ), CSCYVAL( LDVAL ), $ IMBXVAL( LDVAL ), IMBYVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JXVAL( LDVAL ), $ JYVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCXVAL( LDVAL ), RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PDBLA1TSTINFO get the needed startup information for testing various * Level 1 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DESCX(:), IX, JX, INCX, DESCY(:), * IY, JY and INCY. This is also the maximum number of test * cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 1 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) DOUBLE PRECISION * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 2, 2*NGRIDS+23*NMAT+NSUBS+4 ) with NSUBS equal to 8. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J DOUBLE PRECISION EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, DGEBR2D, DGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PDBLAS1TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 100 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 100 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA * READ( NIN, FMT = * ) ALPHA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 100 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 100 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, WORK, 2 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 70 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 70 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 1 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real double precision '// $ 'Level 1 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9982 ) ALPHA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 80 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 80 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, WORK, 2, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) * I = 2*NGRIDS + 23*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 90 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 90 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 100 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : ', G16.6 ) * * End of PDBLA1TSTINFO * END SUBROUTINE PDBLAS1TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PDBLAS1TSTCHKE tests the error exits of the Level 1 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 8 (NSUBS). * If LTEST( 1 ) is .TRUE., PDSWAP will be tested; * If LTEST( 2 ) is .TRUE., PDSCAL will be tested; * If LTEST( 3 ) is .TRUE., PDCOPY will be tested; * If LTEST( 4 ) is .TRUE., PDAXPY will be tested; * If LTEST( 5 ) is .TRUE., PDDOT will be tested; * If LTEST( 6 ) is .TRUE., PDNRM2 will be tested; * If LTEST( 7 ) is .TRUE., PDASUM will be tested; * If LTEST( 8 ) is .TRUE., PDAMAX will be tested. * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PDAMAX, PDASUM, PDAXPY, PDCOPY, $ PDDIMEE, PDDOT, PDNRM2, PDSCAL, PDSWAP, $ PDVECEE * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/11, 12, 11, 13, 13, 15, 15, 14/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PDSWAP * I = 1 IF( LTEST( I ) ) THEN CALL PDDIMEE( ICTXT, NOUT, PDSWAP, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDSWAP, SCODE( I ), SNAMES( I ) ) END IF * * Test PDSCAL * I = I + 1 IF( LTEST( I ) ) THEN CALL PDDIMEE( ICTXT, NOUT, PDSCAL, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDSCAL, SCODE( I ), SNAMES( I ) ) END IF * * Test PDCOPY * I = I + 1 IF( LTEST( I ) ) THEN CALL PDDIMEE( ICTXT, NOUT, PDCOPY, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDCOPY, SCODE( I ), SNAMES( I ) ) END IF * * Test PDAXPY * I = I + 1 IF( LTEST( I ) ) THEN CALL PDDIMEE( ICTXT, NOUT, PDAXPY, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDAXPY, SCODE( I ), SNAMES( I ) ) END IF * * Test PDDOT * I = I + 1 IF( LTEST( I ) ) THEN CALL PDDIMEE( ICTXT, NOUT, PDDOT, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDDOT, SCODE( I ), SNAMES( I ) ) END IF * * Test PDNRM2 * I = I + 1 IF( LTEST( I ) ) THEN CALL PDDIMEE( ICTXT, NOUT, PDNRM2, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDNRM2, SCODE( I ), SNAMES( I ) ) END IF * * Test PDASUM * I = I + 1 IF( LTEST( I ) ) THEN CALL PDDIMEE( ICTXT, NOUT, PDASUM, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDASUM, SCODE( I ), SNAMES( I ) ) END IF * * Test PDAMAX * I = I + 1 IF( LTEST( I ) ) THEN CALL PDDIMEE( ICTXT, NOUT, PDAMAX, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDAMAX, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PDBLAS1TSTCHKE * END SUBROUTINE PDCHKARG1( ICTXT, NOUT, SNAME, N, ALPHA, IX, JX, $ DESCX, INCX, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, INCY, INFO, IX, IY, JX, JY, N, $ NOUT DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. CHARACTER*(*) SNAME INTEGER DESCX( * ), DESCY( * ) * .. * * Purpose * ======= * * PDCHKARG1 checks the input-only arguments of the Level 1 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * N (global input) INTEGER * On entry, N specifies the length of the subvector operands. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, INCXREF, INCYREF, IXREF, IYREF, JXREF, $ JYREF, MYCOL, MYROW, NPCOL, NPROW, NREF DOUBLE PRECISION ALPHAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCXREF( DLEN_ ), DESCYREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * NREF = N IXREF = IX JXREF = JX DO 10 I = 1, DLEN_ DESCXREF( I ) = DESCX( I ) 10 CONTINUE INCXREF = INCX IYREF = IY JYREF = JY DO 20 I = 1, DLEN_ DESCYREF( I ) = DESCY( I ) 20 CONTINUE INCYREF = INCY ALPHAREF = ALPHA * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( IX.NE.IXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IX' ELSE IF( JX.NE.JXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JX' ELSE IF( DESCX( DTYPE_ ).NE.DESCXREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( DTYPE_ )' ELSE IF( DESCX( M_ ).NE.DESCXREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( M_ )' ELSE IF( DESCX( N_ ).NE.DESCXREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( N_ )' ELSE IF( DESCX( IMB_ ).NE.DESCXREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( IMB_ )' ELSE IF( DESCX( INB_ ).NE.DESCXREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( INB_ )' ELSE IF( DESCX( MB_ ).NE.DESCXREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( MB_ )' ELSE IF( DESCX( NB_ ).NE.DESCXREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( NB_ )' ELSE IF( DESCX( RSRC_ ).NE.DESCXREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( RSRC_ )' ELSE IF( DESCX( CSRC_ ).NE.DESCXREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CSRC_ )' ELSE IF( DESCX( CTXT_ ).NE.DESCXREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CTXT_ )' ELSE IF( DESCX( LLD_ ).NE.DESCXREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( LLD_ )' ELSE IF( INCX.NE.INCXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCX' ELSE IF( IY.NE.IYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IY' ELSE IF( JY.NE.JYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JY' ELSE IF( DESCY( DTYPE_ ).NE.DESCYREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( DTYPE_ )' ELSE IF( DESCY( M_ ).NE.DESCYREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( M_ )' ELSE IF( DESCY( N_ ).NE.DESCYREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( N_ )' ELSE IF( DESCY( IMB_ ).NE.DESCYREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( IMB_ )' ELSE IF( DESCY( INB_ ).NE.DESCYREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( INB_ )' ELSE IF( DESCY( MB_ ).NE.DESCYREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( MB_ )' ELSE IF( DESCY( NB_ ).NE.DESCYREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( NB_ )' ELSE IF( DESCY( RSRC_ ).NE.DESCYREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( RSRC_ )' ELSE IF( DESCY( CSRC_ ).NE.DESCYREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CSRC_ )' ELSE IF( DESCY( CTXT_ ).NE.DESCYREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CTXT_ )' ELSE IF( DESCY( LLD_ ).NE.DESCYREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( LLD_ )' ELSE IF( INCY.NE.INCYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCY' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.GT.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PDCHKARG1 * END LOGICAL FUNCTION PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) * .. * * Purpose * ======= * * PISINSCOPE returns .TRUE. if the calling process is in the scope of * sub( X ) = X( IX+(JX-1)*DESCX(M_)+(i-1)*INCX ) and .FALSE. if it is * not. This routine is used to determine which processes should check * the answer returned by some Level 1 PBLAS routines. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * N (global input) INTEGER * The length of the subvector sub( X ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER IIX, IXCOL, IXROW, JJX, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_INFOG2L * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN * * This is the special case, find process owner of IX, JX, and * only this process is the scope. * PISINSCOPE = ( ( IXROW.EQ.MYROW .OR. ROWREP ) .AND. $ ( IXCOL.EQ.MYCOL .OR. COLREP ) ) * ELSE * IF( INCX.EQ.DESCX( M_ ) ) THEN * * row vector * PISINSCOPE = ( MYROW.EQ.IXROW .OR. ROWREP ) * ELSE * * column vector * PISINSCOPE = ( MYCOL.EQ.IXCOL .OR. COLREP ) * END IF * END IF * RETURN * * End of PISINSCOPE * END SUBROUTINE PDBLAS1TSTCHK( ICTXT, NOUT, NROUT, N, PSCLR, PUSCLR, $ PISCLR, X, PX, IX, JX, DESCX, INCX, Y, $ PY, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, INCY, INFO, IX, IY, JX, JY, N, $ NOUT, NROUT, PISCLR DOUBLE PRECISION PSCLR, PUSCLR * .. * .. Array Arguments .. INTEGER DESCX( * ), DESCY( * ) DOUBLE PRECISION PX( * ), PY( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PDBLAS1TSTCHK performs the computational tests of the Level 1 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PDSWAP will be tested; * else if NROUT = 2, PDSCAL will be tested; * else if NROUT = 3, PDCOPY will be tested; * else if NROUT = 4, PDAXPY will be tested; * else if NROUT = 5, PDDOT will be tested; * else if NROUT = 6, PDNRM2 will be tested; * else if NROUT = 7, PDASUM will be tested; * else if NROUT = 8, PDAMAX will be tested. * * N (global input) INTEGER * On entry, N specifies the length of the subvector operands. * * PSCLR (global input) DOUBLE PRECISION * On entry, depending on the value of NROUT, PSCLR specifies * the scalar ALPHA, or the output scalar returned by the PBLAS, * i.e., the dot product, the 2-norm, the absolute sum or the * value of AMAX. * * PUSCLR (global input) DOUBLE PRECISION * On entry, PUSCLR specifies the real part of the scalar ALPHA * used by the real scaling, the 2-norm, or the absolute sum * routines. PUSCLR is not used in the real versions of this * routine. * * PISCLR (global input) DOUBLE PRECISION * On entry, PISCLR specifies the value of the global index re- * turned by PDAMAX, otherwise PISCLR is not used. * * X (local input/local output) DOUBLE PRECISION array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) DOUBLE PRECISION array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) DOUBLE PRECISION array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on X has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on Y has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL COLREP, INXSCOPE, INYSCOPE, ROWREP INTEGER I, IB, ICURCOL, ICURROW, IDUMM, IIX, IIY, IN, $ IOFFX, IOFFY, ISCLR, IXCOL, IXROW, IYCOL, $ IYROW, J, JB, JJX, JJY, JN, KK, LDX, LDY, $ MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR, ERRMAX, PREC, SCLR, USCLR * .. * .. Local Arrays .. INTEGER IERR( 6 ) CHARACTER*5 ARGIN1, ARGIN2, ARGOUT1, ARGOUT2 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DCOPY, DSWAP, IGAMX2D, $ PB_INFOG2L, PDCHKVIN, PDERRASUM, PDERRAXPY, $ PDERRDOT, PDERRNRM2, PDERRSCAL * .. * .. External Functions .. LOGICAL PISINSCOPE INTEGER IDAMAX DOUBLE PRECISION PDLAMCH EXTERNAL IDAMAX, PDLAMCH, PISINSCOPE * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * ARGIN1 = ' ' ARGIN2 = ' ' ARGOUT1 = ' ' ARGOUT2 = ' ' DO 10 I = 1, 6 IERR( I ) = 0 10 CONTINUE * PREC = PDLAMCH( ICTXT, 'precision' ) * IF( NROUT.EQ.1 ) THEN * * Test PDSWAP * IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL DSWAP( N, X( IOFFX ), INCX, Y( IOFFY ), INCY ) CALL PDCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PDCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) * ELSE IF( NROUT.EQ.2 ) THEN * * Test PDSCAL * LDX = DESCX( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ICURROW = IXROW ICURCOL = IXCOL ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * DO 20 J = JX, JN * CALL PDERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 20 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 40 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * DO 30 KK = 0, JB-1 * CALL PDERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 30 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 40 CONTINUE * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * DO 50 I = IX, IN * CALL PDERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 70 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * DO 60 KK = 0, IB-1 * CALL PDERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX 60 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 70 CONTINUE * END IF * ELSE IF( NROUT.EQ.3 ) THEN * * Test PDCOPY * IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL DCOPY( N, X( IOFFX ), INCX, Y( IOFFY ), INCY ) CALL PDCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PDCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) * ELSE IF( NROUT.EQ.4 ) THEN * * Test PDAXPY * CALL PDCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) LDY = DESCY( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PB_INFOG2L( IY, JY, DESCY, NPROW, NPCOL, MYROW, MYCOL, $ IIY, JJY, IYROW, IYCOL ) ICURROW = IYROW ICURCOL = IYCOL ROWREP = ( IYROW.EQ.-1 ) COLREP = ( IYCOL.EQ.-1 ) * IF( INCY.EQ.DESCY( M_ ) ) THEN * * sub( Y ) is a row vector * JB = DESCY( INB_ ) - JY + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCY( NB_ ) + 1 ) * DESCY( NB_ ) + JB JB = MIN( JB, N ) JN = JY + JB - 1 * DO 140 J = JY, JN * CALL PDERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF JJY = JJY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 140 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 160 J = JN+1, JY+N-1, DESCY( NB_ ) JB = MIN( JY+N-J, DESCY( NB_ ) ) * DO 150 KK = 0, JB-1 * CALL PDERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF JJY = JJY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 150 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 160 CONTINUE * ELSE * * sub( Y ) is a column vector * IB = DESCY( IMB_ ) - IY + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCY( MB_ ) + 1 ) * DESCY( MB_ ) + IB IB = MIN( IB, N ) IN = IY + IB - 1 * DO 170 I = IY, IN * CALL PDERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF IIY = IIY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 170 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 190 I = IN+1, IY+N-1, DESCY( MB_ ) IB = MIN( IY+N-I, DESCY( MB_ ) ) * DO 180 KK = 0, IB-1 * CALL PDERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF IIY = IIY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 180 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 190 CONTINUE * END IF * ELSE IF( NROUT.EQ.5 ) THEN * * Test PDDOT * CALL PDCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PDCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PDERRDOT( ERR, N, SCLR, X( IOFFX ), INCX, Y( IOFFY ), $ INCY, PREC ) INXSCOPE = PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) INYSCOPE = PISINSCOPE( ICTXT, N, IY, JY, DESCY, INCY ) IF( INXSCOPE.OR.INYSCOPE ) THEN IF( ABS( PSCLR - SCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'DOT' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF ELSE SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'DOT' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.6 ) THEN * * Test PDNRM2 * CALL PDCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PDERRNRM2( ERR, N, USCLR, X( IOFFX ), INCX, PREC ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN IF( ABS( PUSCLR - USCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'NRM2' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) USCLR, PUSCLR END IF END IF ELSE USCLR = ZERO IF( PUSCLR.NE.USCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'NRM2' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) USCLR, PUSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.7 ) THEN * * Test PDASUM * CALL PDCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PDERRASUM( ERR, N, USCLR, X( IOFFX ), INCX, PREC ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN IF( ABS( PUSCLR - USCLR ) .GT. ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'ASUM' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) USCLR, PUSCLR END IF END IF ELSE USCLR = ZERO IF( PUSCLR.NE.USCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'ASUM' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) USCLR, PUSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.8 ) THEN * * Test PDAMAX * CALL PDCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN ISCLR = IDAMAX( N, X( IOFFX ), INCX ) IF( N.LT.1 ) THEN SCLR = ZERO ELSE IF( ( INCX.EQ.1 ).AND.( DESCX( M_ ).EQ.1 ).AND. $ ( N.EQ.1 ) ) THEN ISCLR = JX SCLR = X( IOFFX ) ELSE IF( INCX.EQ.DESCX( M_ ) ) THEN ISCLR = JX + ISCLR - 1 SCLR = X( IX + ( ISCLR - 1 ) * DESCX( M_ ) ) ELSE ISCLR = IX + ISCLR - 1 SCLR = X( ISCLR + ( JX - 1 ) * DESCX( M_ ) ) END IF * IF( PSCLR.NE.SCLR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'AMAX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF * IF( PISCLR.NE.ISCLR ) THEN IERR( 5 ) = 1 WRITE( ARGIN2, FMT = '(A)' ) 'INDX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN2 WRITE( NOUT, FMT = 9995 ) ISCLR, PISCLR END IF END IF ELSE ISCLR = 0 SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'AMAX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF IF( PISCLR.NE.ISCLR ) THEN IERR( 6 ) = 1 WRITE( ARGOUT2, FMT = '(A)' ) 'INDX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT2 WRITE( NOUT, FMT = 9995 ) ISCLR, PISCLR END IF END IF END IF * END IF * * Find IERR across all processes * CALL IGAMX2D( ICTXT, 'All', ' ', 6, 1, IERR, 6, IDUMM, IDUMM, -1, $ -1, 0 ) * * Encode the errors found in INFO * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'X' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Y' END IF * IF( IERR( 3 ).NE.0 ) $ INFO = INFO + 4 * IF( IERR( 4 ).NE.0 ) $ INFO = INFO + 8 * IF( IERR( 5 ).NE.0 ) $ INFO = INFO + 16 * IF( IERR( 6 ).NE.0 ) $ INFO = INFO + 32 * 9999 FORMAT( 2X, ' ***** ERROR: Vector operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** ERROR: Output scalar result ', A, $ ' in scope is incorrect.' ) 9997 FORMAT( 2X, ' ***** ERROR: Output scalar result ', A, $ ' out of scope is incorrect.' ) 9996 FORMAT( 2X, ' ***** Expected value is: ', D30.18, /2X, $ ' Obtained value is: ', D30.18 ) 9995 FORMAT( 2X, ' ***** Expected value is: ', I6, /2X, $ ' Obtained value is: ', I6 ) * RETURN * * End of PDBLAS1TSTCHK * END SUBROUTINE PDERRDOT( ERRBND, N, SCLR, X, INCX, Y, INCY, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N DOUBLE PRECISION ERRBND, PREC, SCLR * .. * .. Array Arguments .. DOUBLE PRECISION X( * ), Y( * ) * .. * * Purpose * ======= * * PDERRDOT serially computes the dot product X**T * Y and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If dot1 = SCLR and dot2 are two different computed results, and dot1 * is being assumed to be correct, we require * * abs( dot1 - dot2 ) <= ERRBND = ERRFACT * abs( dot1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operands. * * SCLR (global output) DOUBLE PRECISION * On exit, SCLR specifies the dot product of the two vectors * X and Y. * * X (global input) DOUBLE PRECISION array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (global input) DOUBLE PRECISION array * On entry, Y is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremen- * ted array Y must contain the vector y. * * INCY (global input) INTEGER. * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IX, IY DOUBLE PRECISION ADDBND, FACT, SUMNEG, SUMPOS, TMP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * IX = 1 IY = 1 SCLR = ZERO SUMPOS = ZERO SUMNEG = ZERO FACT = TWO * ( ONE + PREC ) ADDBND = TWO * TWO * TWO * PREC * DO 10 I = 1, N TMP = X( IX ) * Y( IY ) SCLR = SCLR + TMP IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP * FACT ELSE SUMNEG = SUMNEG - TMP * FACT END IF IX = IX + INCX IY = IY + INCY 10 CONTINUE * ERRBND = ADDBND * MAX( SUMPOS, SUMNEG ) * RETURN * * End of PDERRDOT * END SUBROUTINE PDERRNRM2( ERRBND, N, USCLR, X, INCX, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N DOUBLE PRECISION ERRBND, PREC, USCLR * .. * .. Array Arguments .. DOUBLE PRECISION X( * ) * .. * * Purpose * ======= * * PDERRNRM2 serially computes the 2-norm the vector X and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If norm1 = SCLR and norm2 are two different computed results, and * norm1 being assumed to be correct, we require * * abs( norm1 - norm2 ) <= ERRBND = ERRFACT * abs( norm1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operand. * * USCLR (global output) DOUBLE PRECISION * On exit, USCLR specifies the 2-norm of the vector X. * * X (global input) DOUBLE PRECISION array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER IX DOUBLE PRECISION ABSXI, ADDBND, FACT, SCALE, SSQ, SUMSCA, SUMSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * USCLR = ZERO SUMSSQ = ONE SUMSCA = ZERO ADDBND = TWO * TWO * TWO * PREC FACT = ONE + TWO * ( ( ONE + PREC )**3 - ONE ) * SCALE = ZERO SSQ = ONE DO 10 IX = 1, 1 + ( N - 1 )*INCX, INCX IF( X( IX ).NE.ZERO ) THEN ABSXI = ABS( X( IX ) ) IF( SCALE.LT.ABSXI )THEN SUMSSQ = ONE + ( SSQ*( SCALE/ABSXI )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = ONE + SSQ*( SCALE/ABSXI )**2 SUMSCA = ABSXI SCALE = ABSXI ELSE SUMSSQ = SSQ + ( ( ABSXI/SCALE )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = SSQ + ( ABSXI/SCALE )**2 END IF END IF 10 CONTINUE * USCLR = SCALE * SQRT( SSQ ) * * Error on square root * ERRBND = SQRT( SUMSSQ ) * ( ONE + TWO * ( 1.00001D+0 * PREC ) ) * ERRBND = ( SUMSCA * ERRBND ) - USCLR * RETURN * * End of PDERRNRM2 * END SUBROUTINE PDERRASUM( ERRBND, N, USCLR, X, INCX, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N DOUBLE PRECISION ERRBND, PREC, USCLR * .. * .. Array Arguments .. DOUBLE PRECISION X( * ) * .. * * Purpose * ======= * * PDERRASUM serially computes the sum of absolute values of the vector * X and returns a scaled relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies a scaled relative acceptable error * bound. In this case the error bound is just the absolute sum * multiplied by a constant proportional to the machine preci- * sion. * * N (global input) INTEGER * On entry, N specifies the length of the vector operand. * * USCLR (global output) DOUBLE PRECISION * On exit, USCLR specifies the sum of absolute values of the * vector X. * * X (global input) DOUBLE PRECISION array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION TWO, ZERO PARAMETER ( TWO = 2.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER IX DOUBLE PRECISION ADDBND * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * IX = 1 USCLR = ZERO ADDBND = TWO * TWO * TWO * PREC * DO 10 IX = 1, 1 + ( N - 1 )*INCX, INCX USCLR = USCLR + ABS( X( IX ) ) 10 CONTINUE * ERRBND = ADDBND * USCLR * RETURN * * End of PDERRASUM * END SUBROUTINE PDERRSCAL( ERRBND, PSCLR, X, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION ERRBND, PREC, PSCLR, X * .. * * Purpose * ======= * * PDERRSCAL serially computes the product PSCLR * X and returns a sca- * led relative acceptable error bound on the result. * * Notes * ===== * * If s1 = PSCLR*X and s2 are two different computed results, and s1 is * being assumed to be correct, we require * * abs( s1 - s2 ) <= ERRBND = ERRFACT * abs( s1 ), * * where ERRFACT is computed as two times the machine precision. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PSCLR (global input) DOUBLE PRECISION * On entry, PSCLR specifies the scale factor. * * X (global input/global output) DOUBLE PRECISION * On entry, X specifies the scalar to be scaled. On exit, X is * the scaled entry. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION TWO PARAMETER ( TWO = 2.0D+0 ) * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * X = PSCLR * X * ERRBND = ( TWO * PREC ) * ABS( X ) * RETURN * * End of PDERRSCAL * END SUBROUTINE PDERRAXPY( ERRBND, PSCLR, X, Y, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION ERRBND, PREC, PSCLR, X, Y * .. * * Purpose * ======= * * PDERRAXPY serially computes Y := Y + PSCLR * X and returns a scaled * relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PSCLR (global input) DOUBLE PRECISION * On entry, PSCLR specifies the scale factor. * * X (global input) DOUBLE PRECISION * On entry, X specifies the scalar to be scaled. * * Y (global input/global output) DOUBLE PRECISION * On entry, Y specifies the scalar to be added. On exit, Y con- * tains the resulting scalar. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION ADDBND, FACT, SUMPOS, SUMNEG, TMP * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * SUMPOS = ZERO SUMNEG = ZERO FACT = ONE + TWO * PREC ADDBND = TWO * TWO * TWO * PREC * TMP = PSCLR * X IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP * FACT ELSE SUMNEG = SUMNEG - TMP * FACT END IF * TMP = Y IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP ELSE SUMNEG = SUMNEG - TMP END IF * Y = Y + ( PSCLR * X ) * ERRBND = ADDBND * MAX( SUMPOS, SUMNEG ) * RETURN * * End of PDERRAXPY * END scalapack-2.0.2/PBLAS/TESTING/PDBLAS2TST.dat000644 000766 000024 00000006354 10363532303 020011 0ustar00juliestaff000000 000000 'Level 2 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PDBLAS2TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 16.0 threshold value of test ratio 10 value of the logical computational blocksize NB 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q 2.0D0 value of ALPHA 3.0D0 value of BETA 4 number of tests problems 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' values of UPLO 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANS 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' values of DIAG 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 values of M 8 10 12 14 16 18 20 22 24 9 11 13 15 17 19 21 values of N 31 33 35 36 34 32 30 31 33 35 36 34 32 30 38 40 values of M_A 31 33 35 36 34 32 30 31 33 35 36 34 32 30 38 40 values of N_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_A 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IA 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JA 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 values of M_X 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 values of N_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_X 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IX 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JX 30 1 30 1 30 1 30 1 30 1 30 1 30 1 30 1 values of INCX 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 values of M_Y 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 values of N_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_Y 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IY 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JY 40 1 40 1 40 1 40 1 40 1 40 1 40 1 40 1 values of INCY PDGEMV T put F for no test in the same column PDSYMV T put F for no test in the same column PDTRMV T put F for no test in the same column PDTRSV T put F for no test in the same column PDGER T put F for no test in the same column PDSYR T put F for no test in the same column PDSYR2 T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/pdblas2tst.f000644 000766 000024 00000346700 11750130340 020124 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 7) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PDGEMV ', 'PDSYMV ', 'PDTRMV ', $ 'PDTRSV ', 'PDGER ', 'PDSYR ', $ 'PDSYR2 '/ END BLOCK DATA PROGRAM PDBLA2TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PDBLA2TST is the main testing program for the PBLAS Level 2 routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 60 lines: * 'Level 2 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PDBLAS2TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 16.0 threshold value of test ratio * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0D0 value of ALPHA * 1.0D0 value of BETA * 2 number of tests problems * 'U' 'L' values of UPLO * 'N' 'T' values of TRANS * 'N' 'U' values of DIAG * 3 4 values of M * 3 4 values of N * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PDGEMV T put F for no test in the same column * PDSYMV T put F for no test in the same column * PDTRMV T put F for no test in the same column * PDTRSV T put F for no test in the same column * PDGER T put F for no test in the same column * PDSYR T put F for no test in the same column * PDSYR2 T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * DBLESZ indicates the length in bytes on the given platform * for a double precision real. By default, DBLESZ is set to * eight. * * MEM DOUBLE PRECISION array * MEM is an array of dimension TOTMEM / DBLESZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, DBLESZ, TOTMEM, $ MEMSIZ, NSUBS DOUBLE PRECISION ONE, PADVAL, ZERO, ROGUE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ DBLESZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, ZERO = 0.0D+0, $ ONE = 1.0D+0, PADVAL = -9923.0D+0, $ NSUBS = 7, ROGUE = -1.0D+10 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE CHARACTER*1 AFORM, DIAG, DIAGDO, TRANS, UPLO INTEGER CSRCA, CSRCX, CSRCY, I, IA, IAM, IASEED, ICTXT, $ IGAP, IMBA, IMBX, IMBY, IMIDA, IMIDX, IMIDY, $ INBA, INBX, INBY, INCX, INCY, IPA, IPG, IPMATA, $ IPMATX, IPMATY, IPOSTA, IPOSTX, IPOSTY, IPREA, $ IPREX, IPREY, IPX, IPY, IVERB, IX, IXSEED, IY, $ IYSEED, J, JA, JX, JY, K, LDA, LDX, LDY, M, MA, $ MBA, MBX, MBY, MEMREQD, MPA, MPX, MPY, MX, MY, $ MYCOL, MYROW, N, NA, NBA, NBX, NBY, NCOLA, $ NGRIDS, NLX, NLY, NOUT, NPCOL, NPROCS, NPROW, $ NQA, NQX, NQY, NROWA, NTESTS, NX, NY, OFFD, $ RSRCA, RSRCX, RSRCY, TSKIP, TSTCNT REAL THRESH DOUBLE PRECISION ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), TRANVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCXVAL( MAXTESTS ), $ CSCYVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCAR( DLEN_ ), DESCX( DLEN_ ), $ DESCXR( DLEN_ ), DESCY( DLEN_ ), $ DESCYR( DLEN_ ), IAVAL( MAXTESTS ), IERR( 6 ), $ IMBAVAL( MAXTESTS ), IMBXVAL( MAXTESTS ), $ IMBYVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JXVAL( MAXTESTS ), $ JYVAL( MAXTESTS ) INTEGER KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), MAVAL( MAXTESTS ), $ MBAVAL( MAXTESTS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NAVAL( MAXTESTS ), NBAVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCAVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) DOUBLE PRECISION MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_DCHEKPAD, PB_DESCSET2, PB_DFILLPAD, $ PB_DLASCAL, PB_DLASET, PB_PDLAPRNT, $ PDBLA2TSTINFO, PDBLAS2TSTCHK, PDBLAS2TSTCHKE, $ PDCHKARG2, PDCHKVOUT, PDGEMV, PDGER, PDLAGEN, $ PDLASCAL, PDLASET, PDMPRNT, PDSYMV, PDSYR, $ PDSYR2, PDTRMV, PDTRSV, PDVPRNT, PMDESCCHK, $ PMDIMCHK, PVDESCCHK, PVDIMCHK * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MOD * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .TRUE., .FALSE., .FALSE., $ .TRUE., .FALSE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IASEED = 100 IXSEED = 200 IYSEED = 300 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDBLA2TSTINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, $ IAVAL, JAVAL, MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, CSCYVAL, $ IYVAL, JYVAL, INCYVAL, MAXTESTS, NGRIDS, $ PVAL, MAXGRIDS, QVAL, MAXGRIDS, NBLOG, LTEST, $ SOF, TEE, IAM, IGAP, IVERB, NPROCS, THRESH, $ ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PDBLAS2TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) TRANS = TRANVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) INBA = INBAVAL( J ) MBA = MBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) INBX = INBXVAL( J ) MBX = MBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) * MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) INBY = INBYVAL( J ) MBY = MBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN TSTCNT = TSTCNT + 1 WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, UPLO, TRANS, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY * WRITE( NOUT, FMT = 9995 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANS' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) TRANS WRITE( NOUT, FMT = 9997 ) 'DIAG' TSKIP = TSKIP + 1 GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, IGAP, GAPMUL, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, IGAP, GAPMUL, $ IERR( 2 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, IGAP, GAPMUL, $ IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDA = MAX( 1, MA ) LDX = MAX( 1, MX ) LDY = MAX( 1, MY ) * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPX = IPA + DESCA( LLD_ )*NQA + IPOSTA + IPREX IPY = IPX + DESCX( LLD_ )*NQX + IPOSTX + IPREY IPMATA = IPY + DESCY( LLD_ )*NQY + IPOSTY IPMATX = IPMATA + MA*NA IPMATY = IPMATX + MX*NX IPG = IPMATY + MAX( MX*NX, MY*NY ) * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * MEMREQD = IPG + MAX( M, N ) - 1 + $ MAX( MAX( IMBA, MBA ), $ MAX( MAX( IMBX, MBX ), $ MAX( IMBY, MBY ) ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) MEMREQD*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 2 routines * DO 30 K = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) SNAMES( K ) END IF * * Define the size of the operands * IF( K.EQ.1 ) THEN NROWA = M NCOLA = N IF( LSAME( TRANS, 'N' ) ) THEN NLX = N NLY = M ELSE NLX = M NLY = N END IF ELSE IF( K.EQ.5 ) THEN NROWA = M NCOLA = N NLX = M NLY = N ELSE NROWA = N NCOLA = N NLX = N NLY = N END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLX, 'X', IX, JX, DESCX, $ INCX, IERR( 2 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLY, 'Y', IY, JY, DESCY, $ INCY, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 GO TO 30 END IF * * Generate distributed matrices A, X and Y * IF( K.EQ.2 .OR. K.EQ.6 .OR. K.EQ.7 ) THEN AFORM = 'S' DIAGDO = 'N' OFFD = IA - JA ELSE IF( ( K.EQ.4 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN AFORM = 'N' DIAGDO = 'D' OFFD = IA - JA ELSE AFORM = 'N' DIAGDO = 'N' OFFD = 0 END IF * CALL PDLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCAR, MA, NA, IMBA, INBA, MBA, NBA, $ -1, -1, ICTXT, MAX( 1, MA ) ) CALL PDLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCAR, IASEED, MEM( IPMATA ), $ DESCAR( LLD_ ) ) CALL PB_DESCSET2( DESCXR, MX, NX, IMBX, INBX, MBX, NBX, $ -1, -1, ICTXT, MAX( 1, MX ) ) CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCXR, IXSEED, MEM( IPMATX ), $ DESCXR( LLD_ ) ) IF( YCHECK( K ) ) THEN * CALL PB_DESCSET2( DESCYR, MY, NY, IMBY, INBY, MBY, $ NBY, -1, -1, ICTXT, MAX( 1, MY ) ) CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCYR, IYSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) * ELSE * * If Y is not needed, generate a copy of X instead * CALL PB_DESCSET2( DESCYR, MX, NX, IMBX, INBX, MBX, $ NBX, -1, -1, ICTXT, MAX( 1, MX ) ) CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, $ 1, 1, DESCYR, IXSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) * END IF * * Zero non referenced part of the matrices A * IF( ( K.EQ.2 .OR. K.EQ.6 .OR. K.EQ.7 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN * * The distributed matrix A is symmetric * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of A. * CALL PDLASET( 'Upper', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA, JA+1, DESCA ) IF( K.NE.2 ) THEN CALL PB_DLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of A. * CALL PDLASET( 'Lower', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA+1, JA, DESCA ) IF( K.NE.2 ) THEN CALL PB_DLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * END IF * ELSE IF( K.EQ.3 .OR. K.EQ.4 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix A is lower triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PDLASET( 'Upper', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA, $ JA+1, DESCA ) CALL PB_DLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA-1+JA*LDA ), $ LDA ) END IF * ELSE * CALL PDLASET( 'Upper', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_DLASET( 'Upper', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( K.EQ.4 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / DBLE( MAX( NROWA, NCOLA ) ) CALL PDLASCAL( 'Lower', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA+1, JA, $ DESCA ) CALL PB_DLASCAL( 'Lower', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix A is upper triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PDLASET( 'Lower', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA+1, $ JA, DESCA ) CALL PB_DLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * ELSE * CALL PDLASET( 'Lower', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_DLASET( 'Lower', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( K.EQ.4 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / DBLE( MAX( NROWA, NCOLA ) ) CALL PDLASCAL( 'Upper', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA, JA+1, $ DESCA ) CALL PB_DLASCAL( 'Upper', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * END IF * END IF * END IF * * Pad the guard zones of A, X and Y * CALL PB_DFILLPAD( ICTXT, MPA, NQA, MEM( IPA-IPREA ), $ DESCA( LLD_ ), IPREA, IPOSTA, PADVAL ) * CALL PB_DFILLPAD( ICTXT, MPX, NQX, MEM( IPX-IPREX ), $ DESCX( LLD_ ), IPREX, IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_DFILLPAD( ICTXT, MPY, NQY, MEM( IPY-IPREY ), $ DESCY( LLD_ ), IPREY, IPOSTY, $ PADVAL ) END IF * * Initialize the check for INPUT-only arguments. * INFO = 0 CALL PDCHKARG2( ICTXT, NOUT, SNAMES( K ), UPLO, TRANS, $ DIAG, M, N, ALPHA, IA, JA, DESCA, IX, $ JX, DESCX, INCX, BETA, IY, JY, DESCY, $ INCY, INFO ) * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN CALL PB_PDLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_INITIAL_A', $ NOUT, MEM( IPG ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PDLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, 0, $ 0, 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPG ) ) END IF * IF( IVERB.EQ.2 ) THEN IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PDLAPRNT( 1, NLX, MEM( IPX ), IX, JX, $ DESCX, 0, 0, $ 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) ELSE CALL PB_PDLAPRNT( NLX, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, $ 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PDLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, 0, $ 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) END IF * IF( YCHECK( K ) ) THEN IF( IVERB.EQ.2 ) THEN IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PDLAPRNT( 1, NLY, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) ELSE CALL PB_PDLAPRNT( NLY, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PDLAPRNT( MY, NY, MEM( IPY ), 1, 1, DESCY, $ 0, 0, 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) END IF END IF * * Call the Level 2 PBLAS routine * INFO = 0 IF( K.EQ.1 ) THEN * * Test PDGEMV * CALL PDGEMV( TRANS, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.2 ) THEN * * Test PDSYMV * CALL PDSYMV( UPLO, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.3 ) THEN * * Test PDTRMV * CALL PDTRMV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) * ELSE IF( K.EQ.4 ) THEN * * Test PDTRSV * CALL PDTRSV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) * ELSE IF( K.EQ.5 ) THEN * * Test PDGER * CALL PDGER( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.6 ) THEN * * Test PDSYR * CALL PDSYR( UPLO, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.7 ) THEN * * Test PDSYR2 * CALL PDSYR2( UPLO, N, ALPHA, MEM( IPX ), IX, JX, $ DESCX, INCX, MEM( IPY ), IY, JY, DESCY, $ INCY, MEM( IPA ), IA, JA, DESCA ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9974 ) INFO GO TO 30 END IF * * Check padding * CALL PB_DCHEKPAD( ICTXT, SNAMES( K ), MPA, NQA, $ MEM( IPA-IPREA ), DESCA( LLD_ ), IPREA, $ IPOSTA, PADVAL ) * CALL PB_DCHEKPAD( ICTXT, SNAMES( K ), MPX, NQX, $ MEM( IPX-IPREX ), DESCX( LLD_ ), IPREX, $ IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_DCHEKPAD( ICTXT, SNAMES( K ), MPY, NQY, $ MEM( IPY-IPREY ), DESCY( LLD_ ), $ IPREY, IPOSTY, PADVAL ) END IF * * Check the computations * CALL PDBLAS2TSTCHK( ICTXT, NOUT, K, UPLO, TRANS, DIAG, M, $ N, ALPHA, MEM( IPMATA ), MEM( IPA ), $ IA, JA, DESCA, MEM( IPMATX ), $ MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPMATY ), MEM( IPY ), IY, $ JY, DESCY, INCY, THRESH, ROGUE, $ MEM( IPG ), INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 ) THEN IERR( 3 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 IERR( 3 ) = 1 END IF * * Check input-only scalar arguments * INFO = 1 CALL PDCHKARG2( ICTXT, NOUT, SNAMES( K ), UPLO, TRANS, $ DIAG, M, N, ALPHA, IA, JA, DESCA, IX, $ JX, DESCX, INCX, BETA, IY, JY, DESCY, $ INCY, INFO ) * * Check input-only array arguments * CALL PDCHKMOUT( NROWA, NCOLA, MEM( IPMATA ), MEM( IPA ), $ IA, JA, DESCA, IERR( 4 ) ) CALL PDCHKVOUT( NLX, MEM( IPMATX ), MEM( IPX ), IX, JX, $ DESCX, INCX, IERR( 5 ) ) * IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_A', $ SNAMES( K ) END IF * IF( IERR( 5 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_X', $ SNAMES( K ) END IF * IF( YCHECK( K ) ) THEN CALL PDCHKVOUT( NLY, MEM( IPMATY ), MEM( IPY ), IY, $ JY, DESCY, INCY, IERR( 6 ) ) IF( IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_Y', $ SNAMES( K ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE.0 .OR. IERR( 5 ).NE.0 .OR. $ IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) SNAMES( K ) KFAIL( K ) = KFAIL( K ) + 1 ERRFLG = .TRUE. ELSE IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) SNAMES( K ) KPASS( K ) = KPASS( K ) + 1 END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PDMPRNT( ICTXT, NOUT, MA, NA, MEM( IPMATA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PDLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( ( NROWA.GT.0 ).AND.( NCOLA.GT.0 ) ) $ CALL PDMPRNT( ICTXT, NOUT, NROWA, NCOLA, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PDLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_A', $ NOUT, MEM( IPMATA ) ) END IF IF( IERR( 5 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PDMPRNT( ICTXT, NOUT, MX, NX, MEM( IPMATX ), $ LDX, 0, 0, 'SERIAL_X' ) CALL PB_PDLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, $ 0, 0, 'PARALLEL_X', NOUT, $ MEM( IPMATX ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( NLX.GT.0 ) $ CALL PDVPRNT( ICTXT, NOUT, NLX, $ MEM( IPMATX+IX-1+(JX-1)*LDX ), $ INCX, 0, 0, 'SERIAL_X' ) IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PDLAPRNT( 1, NLX, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PDLAPRNT( NLX, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) END IF END IF IF( YCHECK( K ) ) THEN IF( IERR( 6 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PDMPRNT( ICTXT, NOUT, MY, NY, $ MEM( IPMATY ), LDY, 0, 0, $ 'SERIAL_Y' ) CALL PB_PDLAPRNT( MY, NY, MEM( IPY ), 1, 1, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE IF( IERR( 3 ).NE.0 ) THEN IF( NLY.GT.0 ) $ CALL PDVPRNT( ICTXT, NOUT, NLY, $ MEM( IPMATY+IY-1+(JY-1)*LDY ), $ INCY, 0, 0, 'SERIAL_Y' ) IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PDLAPRNT( 1, NLY, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PDLAPRNT( NLY, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) END IF END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9981 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9980 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' M N UPLO TRANS DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,9X,A1,11X,A1,10X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9989 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5,1X,I6 ) 9988 FORMAT( 2X, ' IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9987 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9986 FORMAT( 'Not enough memory. Need: ', I12 ) 9985 FORMAT( 2X, ' Tested Subroutine: ', A ) 9984 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9983 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9982 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9981 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9980 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9979 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9978 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9977 FORMAT( 2X, 'Testing Summary') 9976 FORMAT( 2X, 'End of Tests.' ) 9975 FORMAT( 2X, 'Tests started.' ) 9974 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PDBLA2TST * END SUBROUTINE PDBLA2TSTINFO( SUMMRY, NOUT, NMAT, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, $ IMBAVAL, MBAVAL, INBAVAL, NBAVAL, $ RSCAVAL, CSCAVAL, IAVAL, JAVAL, $ MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, $ IXVAL, JXVAL, INCXVAL, MYVAL, NYVAL, $ IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, $ INCYVAL, LDVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, NBLOG, LTEST, SOF, TEE, $ IAM, IGAP, IVERB, NPROCS, THRESH, ALPHA, $ BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, NBLOG, $ NGRIDS, NMAT, NOUT, NPROCS REAL THRESH DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), TRANVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCXVAL( LDVAL ), $ CSCYVAL( LDVAL ), IAVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBXVAL( LDVAL ), $ IMBYVAL( LDVAL ), INBAVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JAVAL( LDVAL ), $ JXVAL( LDVAL ), JYVAL( LDVAL ), MAVAL( LDVAL ), $ MBAVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NAVAL( LDVAL ), $ NBAVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCXVAL( LDVAL ), $ RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PDBLA2TSTINFO get the needed startup information for testing various * Level 2 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * TRANVAL (global output) CHARACTER array * On entry, TRANVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANS to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, TRANS, UPLO, M, N, DESCA(:), * IA, JA, DESCX(:), IX, JX, INCX, DESCY(:), IY, JY and INCY. * This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least seven. On * exit, if LTEST( i ) is .TRUE., the i-th Level 2 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * THRESH (global output) REAL * On exit, THRESH specifies the threshhold value for the test * ratio. * * ALPHA (global output) DOUBLE PRECISION * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) DOUBLE PRECISION * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+37*NMAT+NSUBS+4 ) with NSUBS equal to 7. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 7 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J DOUBLE PRECISION EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, DGEBR2D, DGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D *ype real dble cplx zplx * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PDBLAS2TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Read the threshold value for test ratio * READ( NIN, FMT = * ) THRESH IF( THRESH.LT.0.0 ) $ THRESH = 16.0 * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRANVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( TRANVAL( J ) ) WORK( I+2 ) = ICHAR( UPLOVAL( J ) ) I = I + 3 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 2 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real double precision '// $ 'Level 2 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9979 ) NBLOG WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9980 ) THRESH WRITE( NOUT, FMT = 9982 ) ALPHA WRITE( NOUT, FMT = 9981 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 37*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) TRANVAL( J ) = CHAR( WORK( I+1 ) ) UPLOVAL( J ) = CHAR( WORK( I+2 ) ) I = I + 3 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : ', G16.6 ) 9981 FORMAT( 2X, 'Beta : ', G16.6 ) 9980 FORMAT( 2X, 'Threshold value : ', G16.6 ) 9979 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PDBLA2TSTINFO * END SUBROUTINE PDBLAS2TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PDBLAS2TSTCHKE tests the error exits of the Level 2 PBLAS. * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 7 (NSUBS). * If LTEST( 1 ) is .TRUE., PDGEMV will be tested; * If LTEST( 2 ) is .TRUE., PDSYMV will be tested; * If LTEST( 3 ) is .TRUE., PDTRMV will be tested; * If LTEST( 4 ) is .TRUE., PDTRSV will be tested; * If LTEST( 5 ) is .TRUE., PDGER will be tested; * If LTEST( 6 ) is .TRUE., PDSYR will be tested; * If LTEST( 7 ) is .TRUE., PDSYR2 will be tested; * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 7 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PDDIMEE, PDGEMV, PDGER, $ PDMATEE, PDOPTEE, PDSYMV, PDSYR, PDSYR2, $ PDTRMV, PDTRSV, PDVECEE * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/21, 22, 23, 23, 24, 25, 27/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PDGEMV * I = 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDGEMV, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDGEMV, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDGEMV, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDGEMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PDSYMV * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDSYMV, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDSYMV, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDSYMV, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDSYMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PDTRMV * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDTRMV, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDTRMV, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDTRMV, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDTRMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PDTRSV * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDTRSV, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDTRSV, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDTRSV, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDTRSV, SCODE( I ), SNAMES( I ) ) END IF * * Test PDGER * I = I + 1 IF( LTEST( I ) ) THEN CALL PDDIMEE( ICTXT, NOUT, PDGER, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDGER, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDGER, SCODE( I ), SNAMES( I ) ) END IF * * Test PDSYR * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDSYR, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDSYR, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDSYR, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDSYR, SCODE( I ), SNAMES( I ) ) END IF * * Test PDSYR2 * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDSYR2, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDSYR2, SCODE( I ), SNAMES( I ) ) CALL PDVECEE( ICTXT, NOUT, PDSYR2, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDSYR2, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PDBLAS2TSTCHKE * END SUBROUTINE PDCHKARG2( ICTXT, NOUT, SNAME, UPLO, TRANS, DIAG, M, $ N, ALPHA, IA, JA, DESCA, IX, JX, DESCX, $ INCX, BETA, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N, NOUT DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*(*) SNAME INTEGER DESCA( * ), DESCX( * ), DESCY( * ) * .. * * Purpose * ======= * * PDCHKARG2 checks the input-only arguments of the Level 2 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the UPLO option in the Level 2 PBLAS * operation. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the TRANS option in the Level 2 * PBLAS operation. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies the DIAG option in the Level 2 PBLAS * operation. * * M (global input) INTEGER * On entry, M specifies the dimension of the submatrix ope- * rands. * * N (global input) INTEGER * On entry, N specifies the dimension of the submatrix ope- * rands. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 DIAGREF, TRANSREF, UPLOREF INTEGER I, IAREF, INCXREF, INCYREF, IXREF, IYREF, $ JAREF, JXREF, JYREF, MREF, MYCOL, MYROW, NPCOL, $ NPROW, NREF DOUBLE PRECISION ALPHAREF, BETAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCAREF( DLEN_ ), DESCXREF( DLEN_ ), $ DESCYREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * DIAGREF = DIAG TRANSREF = TRANS UPLOREF = UPLO MREF = M NREF = N ALPHAREF = ALPHA IAREF = IA JAREF = JA DO 10 I = 1, DLEN_ DESCAREF( I ) = DESCA( I ) 10 CONTINUE IXREF = IX JXREF = JX DO 20 I = 1, DLEN_ DESCXREF( I ) = DESCX( I ) 20 CONTINUE INCXREF = INCX BETAREF = BETA IYREF = IY JYREF = JY DO 30 I = 1, DLEN_ DESCYREF( I ) = DESCY( I ) 30 CONTINUE INCYREF = INCY * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( .NOT. LSAME( DIAG, DIAGREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DIAG' ELSE IF( .NOT. LSAME( TRANS, TRANSREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANS' ELSE IF( .NOT. LSAME( UPLO, UPLOREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'UPLO' ELSE IF( M.NE.MREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'M' ELSE IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE IF( IA.NE.IAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IA' ELSE IF( JA.NE.JAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JA' ELSE IF( DESCA( DTYPE_ ).NE.DESCAREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( DTYPE_ )' ELSE IF( DESCA( M_ ).NE.DESCAREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( M_ )' ELSE IF( DESCA( N_ ).NE.DESCAREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( N_ )' ELSE IF( DESCA( IMB_ ).NE.DESCAREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( IMB_ )' ELSE IF( DESCA( INB_ ).NE.DESCAREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( INB_ )' ELSE IF( DESCA( MB_ ).NE.DESCAREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( MB_ )' ELSE IF( DESCA( NB_ ).NE.DESCAREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( NB_ )' ELSE IF( DESCA( RSRC_ ).NE.DESCAREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( RSRC_ )' ELSE IF( DESCA( CSRC_ ).NE.DESCAREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CSRC_ )' ELSE IF( DESCA( CTXT_ ).NE.DESCAREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CTXT_ )' ELSE IF( DESCA( LLD_ ).NE.DESCAREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( LLD_ )' ELSE IF( IX.NE.IXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IX' ELSE IF( JX.NE.JXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JX' ELSE IF( DESCX( DTYPE_ ).NE.DESCXREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( DTYPE_ )' ELSE IF( DESCX( M_ ).NE.DESCXREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( M_ )' ELSE IF( DESCX( N_ ).NE.DESCXREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( N_ )' ELSE IF( DESCX( IMB_ ).NE.DESCXREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( IMB_ )' ELSE IF( DESCX( INB_ ).NE.DESCXREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( INB_ )' ELSE IF( DESCX( MB_ ).NE.DESCXREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( MB_ )' ELSE IF( DESCX( NB_ ).NE.DESCXREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( NB_ )' ELSE IF( DESCX( RSRC_ ).NE.DESCXREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( RSRC_ )' ELSE IF( DESCX( CSRC_ ).NE.DESCXREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CSRC_ )' ELSE IF( DESCX( CTXT_ ).NE.DESCXREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CTXT_ )' ELSE IF( DESCX( LLD_ ).NE.DESCXREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( LLD_ )' ELSE IF( INCX.NE.INCXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCX' ELSE IF( BETA.NE.BETAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'BETA' ELSE IF( IY.NE.IYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IY' ELSE IF( JY.NE.JYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JY' ELSE IF( DESCY( DTYPE_ ).NE.DESCYREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( DTYPE_ )' ELSE IF( DESCY( M_ ).NE.DESCYREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( M_ )' ELSE IF( DESCY( N_ ).NE.DESCYREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( N_ )' ELSE IF( DESCY( IMB_ ).NE.DESCYREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( IMB_ )' ELSE IF( DESCY( INB_ ).NE.DESCYREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( INB_ )' ELSE IF( DESCY( MB_ ).NE.DESCYREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( MB_ )' ELSE IF( DESCY( NB_ ).NE.DESCYREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( NB_ )' ELSE IF( DESCY( RSRC_ ).NE.DESCYREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( RSRC_ )' ELSE IF( DESCY( CSRC_ ).NE.DESCYREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CSRC_ )' ELSE IF( DESCY( CTXT_ ).NE.DESCYREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CTXT_ )' ELSE IF( DESCY( LLD_ ).NE.DESCYREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( LLD_ )' ELSE IF( INCY.NE.INCYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCY' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.NE.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PDCHKARG2 * END SUBROUTINE PDBLAS2TSTCHK( ICTXT, NOUT, NROUT, UPLO, TRANS, DIAG, $ M, N, ALPHA, A, PA, IA, JA, DESCA, X, $ PX, IX, JX, DESCX, INCX, BETA, Y, PY, $ IY, JY, DESCY, INCY, THRESH, ROGUE, $ WORK, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N, NOUT, NROUT REAL THRESH DOUBLE PRECISION ALPHA, BETA, ROGUE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION A( * ), PA( * ), PX( * ), PY( * ), WORK( * ), $ X( * ), Y( * ) * .. * * Purpose * ======= * * PDBLAS2TSTCHK performs the computational tests of the Level 2 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PDGEMV will be tested; * else if NROUT = 2, PDSYMV will be tested; * else if NROUT = 3, PDTRMV will be tested; * else if NROUT = 4, PDTRSV will be tested; * else if NROUT = 5, PDGER will be tested; * else if NROUT = 6, PDSYR will be tested; * else if NROUT = 7, PDSYR2 will be tested; * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies if the upper or lower part of the * matrix operand is to be referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies if the matrix operand A is to be * transposed. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the triangular matrix operand is * unit or non-unit. * * M (global input) INTEGER * On entry, M specifies the number of rows of A. * * N (global input) INTEGER * On entry, N specifies the number of columns of A. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) DOUBLE PRECISION array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) DOUBLE PRECISION array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) DOUBLE PRECISION array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) DOUBLE PRECISION array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * THRESH (global input) REAL * On entry, THRESH is the threshold value for the test ratio. * * ROGUE (global input) DOUBLE PRECISION * On entry, ROGUE specifies the constant used to pad the * non-referenced part of triangular or symmetric matrices. * * WORK (workspace) DOUBLE PRECISION array * On entry, WORK is an array of dimension LWORK where LWORK is * at least MAX( M, N ). This array is used to store the compu- * ted gauges (see PDMVCH). * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on A has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on X has been found, * if( MOD( INFO/4, 2 ) = 1 ) then an error on Y has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR * .. * .. Local Arrays .. INTEGER IERR( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DTRSV, PB_DLASET, PDCHKMIN, $ PDCHKVIN, PDMVCH, PDTRMV, PDVMCH, PDVMCH2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 10 I = 1, 3 IERR( I ) = 0 10 CONTINUE * IF( NROUT.EQ.1 ) THEN * * Test PDGEMV * * Check the resulting vector Y * CALL PDMVCH( ICTXT, TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, $ IX, JX, DESCX, INCX, BETA, Y, PY, IY, JY, DESCY, $ INCY, WORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PDCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) IF( LSAME( TRANS, 'N' ) ) THEN CALL PDCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 2 ) ) ELSE CALL PDCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, $ IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.2 ) THEN * * Test PDSYMV * * Check the resulting vector Y * CALL PDMVCH( ICTXT, 'No transpose', N, N, ALPHA, A, IA, JA, $ DESCA, X, IX, JX, DESCX, INCX, BETA, Y, PY, IY, $ JY, DESCY, INCY, WORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN CALL PB_DLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF CALL PDCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PDCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) * ELSE IF( NROUT.EQ.3 ) THEN * * Test PDTRMV * * Check the resulting vector X * CALL PDMVCH( ICTXT, TRANS, N, N, ONE, A, IA, JA, DESCA, Y, IX, $ JX, DESCX, INCX, ZERO, X, PX, IX, JX, DESCX, INCX, $ WORK, ERR, IERR( 2 ) ) * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF END IF CALL PDCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) * ELSE IF( NROUT.EQ.4 ) THEN * * Test PDTRSV * * Check the resulting vector X * CALL DTRSV( UPLO, TRANS, DIAG, N, A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ), X( IX+(JX-1)*DESCX( M_ ) ), INCX ) CALL PDTRMV( UPLO, TRANS, DIAG, N, PA, IA, JA, DESCA, PX, IX, $ JX, DESCX, INCX ) CALL PDMVCH( ICTXT, TRANS, N, N, ONE, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX, ZERO, Y, PX, IX, JX, DESCX, INCX, $ WORK, ERR, IERR( 2 ) ) * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF END IF CALL PDCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) * ELSE IF( NROUT.EQ.5 ) THEN * * Test PDGER * * Check the resulting matrix A * CALL PDVMCH( ICTXT, 'Ge', M, N, ALPHA, X, IX, JX, DESCX, $ INCX, Y, IY, JY, DESCY, INCY, A, PA, IA, JA, $ DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PDCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PDCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * ELSE IF( NROUT.EQ.6 ) THEN * * Test PDSYR * * Check the resulting matrix A * CALL PDVMCH( ICTXT, UPLO, N, N, ALPHA, X, IX, JX, DESCX, $ INCX, X, IX, JX, DESCX, INCX, A, PA, IA, JA, $ DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PDCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) * ELSE IF( NROUT.EQ.7 ) THEN * * Test PDSYR2 * * Check the resulting matrix A * CALL PDVMCH2( ICTXT, UPLO, N, N, ALPHA, X, IX, JX, DESCX, INCX, $ Y, IY, JY, DESCY, INCY, A, PA, IA, JA, DESCA, $ WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PDCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PDCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * END IF * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'A' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'X' END IF * IF( IERR( 3 ).NE.0 ) THEN INFO = INFO + 4 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'Y' END IF * 9999 FORMAT( 2X, ' ***** ERROR: Matrix operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** ERROR: Vector operand ', A, $ ' is incorrect.' ) 9997 FORMAT( 2X, ' ***** FATAL ERROR - Computed result is less ', $ 'than half accurate *****' ) 9996 FORMAT( 2X, ' ***** Test completed with maximum test ratio: ', $ F11.5, ' SUSPECT *****' ) * RETURN * * End of PDBLAS2TSTCHK * END scalapack-2.0.2/PBLAS/TESTING/PDBLAS3TST.dat000644 000766 000024 00000004720 10363532303 020005 0ustar00juliestaff000000 000000 'Level 3 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PDBLAS3TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 16.0 threshold value of test ratio 10 value of the logical computational blocksize NB 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q 2.0D0 value of ALPHA 3.0D0 value of BETA 4 number of tests problems 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' values of DIAG 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' values of SIDE 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANSA 'N' 'N' 'T' 'T' 'N' 'N' 'T' 'T' values of TRANSB 'U' 'L' 'U' 'L' 'U' 'L' 'U' 'L' values of UPLO 22 30 8 20 3 5 11 8 values of M 23 29 7 21 3 14 11 8 values of N 24 28 9 22 3 9 11 8 values of K 31 31 31 31 31 31 31 31 values of M_A 31 31 31 31 31 31 31 31 values of N_A 4 5 6 3 2 1 4 5 values of IMB_A 4 5 6 3 2 1 4 5 values of INB_A 4 5 6 3 2 1 4 5 values of MB_A 4 5 6 3 2 1 4 5 values of NB_A 0 0 0 0 0 0 0 0 values of RSRC_A 0 0 0 0 0 0 0 0 values of CSRC_A 5 1 7 7 1 12 1 11 values of IA 5 1 7 7 1 12 1 11 values of JA 32 32 32 32 32 32 32 32 values of M_B 32 32 32 32 32 32 32 32 values of N_B 4 5 6 3 2 1 4 5 values of IMB_B 4 5 6 3 2 1 4 5 values of INB_B 4 5 6 3 2 1 4 5 values of MB_B 4 5 6 3 2 1 4 5 values of NB_B 0 0 0 0 0 0 0 0 values of RSRC_B 0 0 0 0 0 0 0 0 values of CSRC_B 5 1 7 7 1 12 1 11 values of IB 5 1 7 7 1 12 1 11 values of JB 35 35 35 35 35 35 35 35 values of M_C 35 35 35 35 35 35 35 35 values of N_C 4 5 6 3 2 1 4 5 values of IMB_C 4 5 6 3 2 1 4 5 values of INB_C 4 5 6 3 2 1 4 5 values of MB_C 4 5 6 3 2 1 4 5 values of NB_C 0 0 0 0 0 0 0 0 values of RSRC_C 0 0 0 0 0 0 0 0 values of CSRC_C 5 1 7 7 1 12 1 11 values of IC 5 1 7 7 1 12 1 11 values of JC PDGEMM T put F for no test in the same column PDSYMM T put F for no test in the same column PDSYRK T put F for no test in the same column PDSYR2K T put F for no test in the same column PDTRMM T put F for no test in the same column PDTRSM T put F for no test in the same column PDGEADD T put F for no test in the same column PDTRADD T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/pdblas3tst.f000644 000766 000024 00000375306 11750130340 020131 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PDGEMM ', 'PDSYMM ', 'PDSYRK ', $ 'PDSYR2K', 'PDTRMM ', 'PDTRSM ', $ 'PDGEADD', 'PDTRADD'/ END BLOCK DATA PROGRAM PDBA3TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PDBLA3TST is the main testing program for the Level 3 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * * from the following 61 lines: * 'Level 3 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PDBLAS3TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 16.0 threshold value of test ratio * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0D0 value of ALPHA * 1.0D0 value of BETA * 2 number of tests problems * 'N' 'U' values of DIAG * 'L' 'R' values of SIDE * 'N' 'T' values of TRANSA * 'N' 'T' values of TRANSB * 'U' 'L' values of UPLO * 3 4 values of M * 3 4 values of N * 3 4 values of K * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_B * 6 10 values of N_B * 2 5 values of IMB_B * 2 5 values of INB_B * 2 5 values of MB_B * 2 5 values of NB_B * 0 1 values of RSRC_B * 0 0 values of CSRC_B * 1 1 values of IB * 1 1 values of JB * 6 10 values of M_C * 6 10 values of N_C * 2 5 values of IMB_C * 2 5 values of INB_C * 2 5 values of MB_C * 2 5 values of NB_C * 0 1 values of RSRC_C * 0 0 values of CSRC_C * 1 1 values of IC * 1 1 values of JC * PDGEMM T put F for no test in the same column * PDSYMM T put F for no test in the same column * PDSYRK T put F for no test in the same column * PDSYR2K T put F for no test in the same column * PDTRMM T put F for no test in the same column * PDTRSM T put F for no test in the same column * PDGEADD T put F for no test in the same column * PDTRADD T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * DBLESZ indicates the length in bytes on the given platform * for a double precision real. By default, DBLESZ is set to * eight. * * MEM DOUBLE PRECISION array * MEM is an array of dimension TOTMEM / DBLESZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, DBLESZ, TOTMEM, $ MEMSIZ, NSUBS DOUBLE PRECISION ONE, PADVAL, ZERO, ROGUE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ DBLESZ = 8, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ, ZERO = 0.0D+0, $ ONE = 1.0D+0, PADVAL = -9923.0D+0, $ NSUBS = 8, ROGUE = -1.0D+10 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE CHARACTER*1 ADIAGDO, AFORM, CFORM, DIAG, SIDE, TRANSA, $ TRANSB, UPLO INTEGER CSRCA, CSRCB, CSRCC, I, IA, IAM, IASEED, IB, $ IBSEED, IC, ICSEED, ICTXT, IGAP, IMBA, IMBB, $ IMBC, IMIDA, IMIDB, IMIDC, INBA, INBB, INBC, $ IPA, IPB, IPC, IPG, IPMATA, IPMATB, IPMATC, $ IPOSTA, IPOSTB, IPOSTC, IPREA, IPREB, IPREC, $ IPW, IVERB, J, JA, JB, JC, K, L, LDA, LDB, LDC, $ M, MA, MB, MBA, MBB, MBC, MC, MEMREQD, MPA, $ MPB, MPC, MYCOL, MYROW, N, NA, NB, NBA, NBB, $ NBC, NC, NCOLA, NCOLB, NCOLC, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQA, NQB, NQC, NROWA, $ NROWB, NROWC, NTESTS, OFFDA, OFFDC, RSRCA, $ RSRCB, RSRCC, TSKIP, TSTCNT REAL THRESH DOUBLE PRECISION ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL BCHECK( NSUBS ), CCHECK( NSUBS ), $ LTEST( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), SIDEVAL( MAXTESTS ), $ TRNAVAL( MAXTESTS ), TRNBVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCBVAL( MAXTESTS ), $ CSCCVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCAR( DLEN_ ), DESCB( DLEN_ ), $ DESCBR( DLEN_ ), DESCC( DLEN_ ), $ DESCCR( DLEN_ ), IAVAL( MAXTESTS ), $ IBVAL( MAXTESTS ), ICVAL( MAXTESTS ), $ IERR( 6 ), IMBAVAL( MAXTESTS ), $ IMBBVAL( MAXTESTS ), IMBCVAL( MAXTESTS ), $ INBAVAL( MAXTESTS ), INBBVAL( MAXTESTS ), $ INBCVAL( MAXTESTS ), JAVAL( MAXTESTS ), $ JBVAL( MAXTESTS ), JCVAL( MAXTESTS ) INTEGER KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), KVAL( MAXTESTS ), $ MAVAL( MAXTESTS ), MBAVAL( MAXTESTS ), $ MBBVAL( MAXTESTS ), MBCVAL( MAXTESTS ), $ MBVAL( MAXTESTS ), MCVAL( MAXTESTS ), $ MVAL( MAXTESTS ), NAVAL( MAXTESTS ), $ NBAVAL( MAXTESTS ), NBBVAL( MAXTESTS ), $ NBCVAL( MAXTESTS ), NBVAL( MAXTESTS ), $ NCVAL( MAXTESTS ), NVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCAVAL( MAXTESTS ), RSCBVAL( MAXTESTS ), $ RSCCVAL( MAXTESTS ) DOUBLE PRECISION MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_DCHEKPAD, PB_DESCSET2, PB_DFILLPAD, $ PB_DLASCAL, PB_DLASET, PB_PDLAPRNT, $ PDBLA3TSTINFO, PDBLAS3TSTCHK, PDBLAS3TSTCHKE, $ PDCHKARG3, PDCHKMOUT, PDGEADD, PDGEMM, PDLAGEN, $ PDLASCAL, PDLASET, PDMPRNT, PDSYMM, PDSYR2K, $ PDSYRK, PDTRADD, PDTRMM, PDTRSM, PMDESCCHK, $ PMDIMCHK * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MOD * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA BCHECK/.TRUE., .TRUE., .FALSE., .TRUE., .TRUE., $ .TRUE., .FALSE., .FALSE./ DATA CCHECK/.TRUE., .TRUE., .TRUE., .TRUE., .FALSE., $ .FALSE., .TRUE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, * so that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IASEED = 100 IBSEED = 200 ICSEED = 300 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDBLA3TSTINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, NVAL, $ KVAL, MAVAL, NAVAL, IMBAVAL, MBAVAL, $ INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, IAVAL, $ JAVAL, MBVAL, NBVAL, IMBBVAL, MBBVAL, $ INBBVAL, NBBVAL, RSCBVAL, CSCBVAL, IBVAL, $ JBVAL, MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, ICVAL, $ JCVAL, MAXTESTS, NGRIDS, PVAL, MAXGRIDS, $ QVAL, MAXGRIDS, NBLOG, LTEST, SOF, TEE, IAM, $ IGAP, IVERB, NPROCS, THRESH, ALPHA, BETA, $ MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PDBLAS3TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) SIDE = SIDEVAL( J ) TRANSA = TRNAVAL( J ) TRANSB = TRNBVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) K = KVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MB = MBVAL( J ) NB = NBVAL( J ) IMBB = IMBBVAL( J ) MBB = MBBVAL( J ) INBB = INBBVAL( J ) NBB = NBBVAL( J ) RSRCB = RSCBVAL( J ) CSRCB = CSCBVAL( J ) IB = IBVAL( J ) JB = JBVAL( J ) * MC = MCVAL( J ) NC = NCVAL( J ) IMBC = IMBCVAL( J ) MBC = MBCVAL( J ) INBC = INBCVAL( J ) NBC = NBCVAL( J ) RSRCC = RSCCVAL( J ) CSRCC = CSCCVAL( J ) IC = ICVAL( J ) JC = JCVAL( J ) * IF( IAM.EQ.0 ) THEN * TSTCNT = TSTCNT + 1 * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, K, SIDE, UPLO, TRANSA, $ TRANSB, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IB, JB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IC, JC, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC * WRITE( NOUT, FMT = 9995 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( SIDE, 'L' ).AND. $ .NOT.LSAME( SIDE, 'R' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'SIDE' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSA' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANSB, 'N' ).AND. $ .NOT.LSAME( TRANSB, 'T' ).AND. $ .NOT.LSAME( TRANSB, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSB' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'DIAG' TSKIP = TSKIP + 1 GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, IGAP, GAPMUL, IERR( 1 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'B', DESCB, $ BLOCK_CYCLIC_2D_INB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB, MPB, NQB, IPREB, $ IMIDB, IPOSTB, IGAP, GAPMUL, IERR( 2 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'C', DESCC, $ BLOCK_CYCLIC_2D_INB, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC, MPC, NQC, IPREC, $ IMIDC, IPOSTC, IGAP, GAPMUL, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDA = MAX( 1, MA ) LDB = MAX( 1, MB ) LDC = MAX( 1, MC ) * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPB = IPA + DESCA( LLD_ )*NQA + IPOSTA + IPREB IPC = IPB + DESCB( LLD_ )*NQB + IPOSTB + IPREC IPMATA = IPC + DESCC( LLD_ )*NQC + IPOSTC IPMATB = IPMATA + MA*NA IPMATC = IPMATB + MB*NB IPG = IPMATC + MAX( MB*NB, MC*NC ) * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * IPW = IPG + 2*MAX( M, MAX( N, K ) ) MEMREQD = IPW - 1 + MAX( MAX( MAX( IMBA, MBA ), $ MAX( IMBB, MBB ) ), $ MAX( IMBC, MBC ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) MEMREQD*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 3 routines * DO 30 L = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( L ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9986 ) SNAMES( L ) END IF * * Define the size of the operands * IF( L.EQ.1 ) THEN * * PDGEMM * NROWC = M NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF( LSAME( TRANSB, 'N' ) ) THEN NROWB = K NCOLB = N ELSE NROWB = N NCOLB = K END IF * ELSE IF( L.EQ.2 ) THEN * * PDSYMM * NROWC = M NCOLC = N NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF * ELSE IF( L.EQ.3 ) THEN * * PDSYRK * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K ELSE NROWA = K NCOLA = N END IF NROWB = 0 NCOLB = 0 * ELSE IF( L.EQ.4 ) THEN * * PDSYR2K * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K NROWB = N NCOLB = K ELSE NROWA = K NCOLA = N NROWB = K NCOLB = N END IF * ELSE IF( L.EQ.5 .OR. L.EQ.6 ) THEN NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF NROWC = 0 NCOLC = 0 * ELSE IF( L.EQ.7 .OR. L.EQ.8 ) THEN * * PDGEADD, PDTRADD * IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = N ELSE NROWA = N NCOLA = M END IF NROWC = M NCOLC = N NROWB = 0 NCOLB = 0 * END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWB, NCOLB, 'B', IB, JB, $ DESCB, IERR( 2 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWC, NCOLC, 'C', IC, JC, $ DESCC, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN KSKIP( L ) = KSKIP( L ) + 1 GO TO 30 END IF * * Generate distributed matrices A, B and C * IF( L.EQ.2 ) THEN * * PDSYMM * AFORM = 'S' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.3 .OR. L.EQ.4 ) THEN * * PDSYRK, PDSYR2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'S' OFFDC = IC - JC * ELSE IF( ( L.EQ.6 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN * * PDTRSM * AFORM = 'N' ADIAGDO = 'D' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE * * Default values * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'N' OFFDC = 0 * END IF * CALL PDLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) * IF( BCHECK( L ) ) $ CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCB, IBSEED, MEM( IPB ), $ DESCB( LLD_ ) ) * IF( CCHECK( L ) ) $ CALL PDLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCC, ICSEED, MEM( IPC ), $ DESCC( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCAR, MA, NA, IMBA, INBA, MBA, NBA, $ -1, -1, ICTXT, MAX( 1, MA ) ) CALL PDLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCAR, IASEED, MEM( IPMATA ), $ DESCAR( LLD_ ) ) * IF( BCHECK( L ) ) THEN CALL PB_DESCSET2( DESCBR, MB, NB, IMBB, INBB, MBB, $ NBB, -1, -1, ICTXT, MAX( 1, MB ) ) CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCBR, IBSEED, MEM( IPMATB ), $ DESCBR( LLD_ ) ) END IF * IF( CCHECK( L ) ) THEN * CALL PB_DESCSET2( DESCCR, MC, NC, IMBC, INBC, MBC, $ NBC, -1, -1, ICTXT, MAX( 1, MC ) ) CALL PDLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCCR, ICSEED, MEM( IPMATC ), $ DESCCR( LLD_ ) ) * ELSE * * If C is not needed, generate a copy of B instead * CALL PB_DESCSET2( DESCCR, MB, NB, IMBB, INBB, MBB, $ NBB, -1, -1, ICTXT, MAX( 1, MB ) ) CALL PDLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCCR, IBSEED, MEM( IPMATC ), $ DESCCR( LLD_ ) ) * END IF * * Zero non referenced part of the matrices A, B, C * IF( ( L.EQ.2 ).AND.( MAX( NROWA, NCOLA ).GT.1 ) ) THEN * * The distributed matrix A is symmetric * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of A. * CALL PDLASET( 'Upper', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA, JA+1, DESCA ) * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of A. * CALL PDLASET( 'Lower', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA+1, JA, DESCA ) * END IF * ELSE IF( ( ( L.EQ.3 ).OR.( L.EQ.4 ) ).AND. $ ( MAX( NROWC, NCOLC ).GT.1 ) ) THEN * * The distributed matrix C is symmetric * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of C. * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PDLASET( 'Upper', NROWC-1, NCOLC-1, ROGUE, $ ROGUE, MEM( IPC ), IC, JC+1, $ DESCC ) CALL PB_DLASET( 'Upper', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC-1+JC*LDC ), LDC ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of C. * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PDLASET( 'Lower', NROWC-1, NCOLC-1, ROGUE, $ ROGUE, MEM( IPC ), IC+1, JC, $ DESCC ) CALL PB_DLASET( 'Lower', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC+(JC-1)*LDC ), $ LDC ) END IF * END IF * ELSE IF( L.EQ.5 .OR. L.EQ.6 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix A is lower triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PDLASET( 'Upper', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA, $ JA+1, DESCA ) CALL PB_DLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA-1+JA*LDA ), $ LDA ) END IF * ELSE * CALL PDLASET( 'Upper', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_DLASET( 'Upper', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( L.EQ.6 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / DBLE( MAX( NROWA, NCOLA ) ) CALL PDLASCAL( 'Lower', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA+1, JA, $ DESCA ) CALL PB_DLASCAL( 'Lower', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix A is upper triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PDLASET( 'Lower', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA+1, $ JA, DESCA ) CALL PB_DLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * ELSE * CALL PDLASET( 'Lower', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_DLASET( 'Lower', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( L.EQ.6 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / DBLE( MAX( NROWA, NCOLA ) ) CALL PDLASCAL( 'Upper', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA, JA+1, $ DESCA ) CALL PB_DLASCAL( 'Upper', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * END IF * END IF * ELSE IF( L.EQ.8 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix C is lower triangular * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PDLASET( 'Upper', NROWC-1, NCOLC-1, $ ROGUE, ROGUE, MEM( IPC ), IC, $ JC+1, DESCC ) CALL PB_DLASET( 'Upper', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC-1+JC*LDC ), LDC ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix C is upper triangular * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PDLASET( 'Lower', NROWC-1, NCOLC-1, $ ROGUE, ROGUE, MEM( IPC ), IC+1, $ JC, DESCC ) CALL PB_DLASET( 'Lower', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC+(JC-1)*LDC ), $ LDC ) END IF * END IF * END IF * * Pad the guard zones of A, B and C * CALL PB_DFILLPAD( ICTXT, MPA, NQA, MEM( IPA-IPREA ), $ DESCA( LLD_ ), IPREA, IPOSTA, PADVAL ) * IF( BCHECK( L ) ) THEN CALL PB_DFILLPAD( ICTXT, MPB, NQB, MEM( IPB-IPREB ), $ DESCB( LLD_ ), IPREB, IPOSTB, $ PADVAL ) END IF * IF( CCHECK( L ) ) THEN CALL PB_DFILLPAD( ICTXT, MPC, NQC, MEM( IPC-IPREC ), $ DESCC( LLD_ ), IPREC, IPOSTC, $ PADVAL ) END IF * * Initialize the check for INPUT-only arguments. * INFO = 0 CALL PDCHKARG3( ICTXT, NOUT, SNAMES( L ), SIDE, UPLO, $ TRANSA, TRANSB, DIAG, M, N, K, ALPHA, IA, $ JA, DESCA, IB, JB, DESCB, BETA, IC, JC, $ DESCC, INFO ) * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN CALL PB_PDLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, $ 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PDLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPW ) ) END IF * IF( BCHECK( L ) ) THEN IF( IVERB.EQ.2 ) THEN CALL PB_PDLAPRNT( NROWB, NCOLB, MEM( IPB ), IB, JB, $ DESCB, 0, 0, $ 'PARALLEL_INITIAL_B', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PDLAPRNT( MB, NB, MEM( IPB ), 1, 1, DESCB, $ 0, 0, 'PARALLEL_INITIAL_B', NOUT, $ MEM( IPW ) ) END IF END IF * IF( CCHECK( L ) ) THEN IF( IVERB.EQ.2 ) THEN CALL PB_PDLAPRNT( NROWC, NCOLC, MEM( IPC ), IC, JC, $ DESCC, 0, 0, $ 'PARALLEL_INITIAL_C', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PDLAPRNT( MC, NC, MEM( IPC ), 1, 1, DESCC, $ 0, 0, 'PARALLEL_INITIAL_C', NOUT, $ MEM( IPW ) ) END IF END IF * * Call the Level 3 PBLAS routine * INFO = 0 IF( L.EQ.1 ) THEN * * Test PDGEMM * CALL PDGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.2 ) THEN * * Test PDSYMM * CALL PDSYMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.3 ) THEN * * Test PDSYRK * CALL PDSYRK( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.4 ) THEN * * Test PDSYR2K * CALL PDSYR2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.5 ) THEN * * Test PDTRMM * CALL PDTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) * ELSE IF( L.EQ.6 ) THEN * * Test PDTRSM * CALL PDTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) * * ELSE IF( L.EQ.7 ) THEN * * Test PDGEADD * CALL PDGEADD( TRANSA, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, BETA, MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.8 ) THEN * * Test PDTRADD * CALL PDTRADD( UPLO, TRANSA, M, N, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( L ) = KSKIP( L ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9974 ) INFO GO TO 30 END IF * * Check padding * CALL PB_DCHEKPAD( ICTXT, SNAMES( L ), MPA, NQA, $ MEM( IPA-IPREA ), DESCA( LLD_ ), $ IPREA, IPOSTA, PADVAL ) * IF( BCHECK( L ) ) THEN CALL PB_DCHEKPAD( ICTXT, SNAMES( L ), MPB, NQB, $ MEM( IPB-IPREB ), DESCB( LLD_ ), $ IPREB, IPOSTB, PADVAL ) END IF * IF( CCHECK( L ) ) THEN CALL PB_DCHEKPAD( ICTXT, SNAMES( L ), MPC, NQC, $ MEM( IPC-IPREC ), DESCC( LLD_ ), $ IPREC, IPOSTC, PADVAL ) END IF * * Check the computations * CALL PDBLAS3TSTCHK( ICTXT, NOUT, L, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, $ MEM( IPMATA ), MEM( IPA ), IA, JA, $ DESCA, MEM( IPMATB ), MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPMATC ), $ MEM( IPC ), IC, JC, DESCC, THRESH, $ ROGUE, MEM( IPG ), INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 ) THEN IERR( 3 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 IERR( 3 ) = 1 END IF * * Check input-only scalar arguments * INFO = 1 CALL PDCHKARG3( ICTXT, NOUT, SNAMES( L ), SIDE, UPLO, $ TRANSA, TRANSB, DIAG, M, N, K, ALPHA, IA, $ JA, DESCA, IB, JB, DESCB, BETA, IC, JC, $ DESCC, INFO ) * * Check input-only array arguments * CALL PDCHKMOUT( NROWA, NCOLA, MEM( IPMATA ), $ MEM( IPA ), IA, JA, DESCA, IERR( 4 ) ) IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_A', $ SNAMES( L ) END IF * IF( BCHECK( L ) ) THEN CALL PDCHKMOUT( NROWB, NCOLB, MEM( IPMATB ), $ MEM( IPB ), IB, JB, DESCB, IERR( 5 ) ) IF( IERR( 5 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_B', $ SNAMES( L ) END IF END IF * IF( CCHECK( L ) ) THEN CALL PDCHKMOUT( NROWC, NCOLC, MEM( IPMATC ), $ MEM( IPC ), IC, JC, DESCC, IERR( 6 ) ) IF( IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_C', $ SNAMES( L ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE.0 .OR. IERR( 5 ).NE.0 .OR. $ IERR( 6 ).NE.0 ) THEN KFAIL( L ) = KFAIL( L ) + 1 ERRFLG = .TRUE. IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) SNAMES( L ) ELSE KPASS( L ) = KPASS( L ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) SNAMES( L ) END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PDMPRNT( ICTXT, NOUT, MA, NA, MEM( IPMATA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PDLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( ( NROWA.GT.0 ).AND.( NCOLA.GT.0 ) ) $ CALL PDMPRNT( ICTXT, NOUT, NROWA, NCOLA, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PDLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) END IF IF( BCHECK( L ) ) THEN IF( IERR( 5 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PDMPRNT( ICTXT, NOUT, MB, NB, $ MEM( IPMATB ), LDB, 0, 0, $ 'SERIAL_B' ) CALL PB_PDLAPRNT( MB, NB, MEM( IPB ), 1, 1, $ DESCB, 0, 0, 'PARALLEL_B', $ NOUT, MEM( IPMATB ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( ( NROWB.GT.0 ).AND.( NCOLB.GT.0 ) ) $ CALL PDMPRNT( ICTXT, NOUT, NROWB, NCOLB, $ MEM( IPMATB+IB-1+(JB-1)*LDB ), $ LDB, 0, 0, 'SERIAL_B' ) CALL PB_PDLAPRNT( NROWB, NCOLB, MEM( IPB ), IB, $ JB, DESCB, 0, 0, 'PARALLEL_B', $ NOUT, MEM( IPMATB ) ) END IF END IF IF( CCHECK( L ) ) THEN IF( IERR( 6 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PDMPRNT( ICTXT, NOUT, MC, NC, $ MEM( IPMATC ), LDC, 0, 0, $ 'SERIAL_C' ) CALL PB_PDLAPRNT( MC, NC, MEM( IPC ), 1, 1, $ DESCC, 0, 0, 'PARALLEL_C', $ NOUT, MEM( IPMATC ) ) ELSE IF( IERR( 3 ).NE.0 ) THEN IF( ( NROWB.GT.0 ).AND.( NCOLB.GT.0 ) ) $ CALL PDMPRNT( ICTXT, NOUT, NROWC, NCOLC, $ MEM( IPMATC+IC-1+(JC-1)*LDC ), $ LDC, 0, 0, 'SERIAL_C' ) CALL PB_PDLAPRNT( NROWC, NCOLC, MEM( IPC ), IC, $ JC, DESCC, 0, 0, 'PARALLEL_C', $ NOUT, MEM( IPMATC ) ) END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9982 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9981 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '-------------------' ) 9994 FORMAT( 2X, ' M N K SIDE UPLO TRANSA ', $ 'TRANSB DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,1X,I6,6X,A1,5X,A1,7X,A1,7X,A1,5X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IB JB MB NB IMBB INBB', $ ' MBB NBB RSRCB CSRCB' ) 9989 FORMAT( 2X, ' IC JC MC NC IMBC INBC', $ ' MBC NBC RSRCC CSRCC' ) 9988 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9987 FORMAT( 'Not enough memory. Need: ', I12 ) 9986 FORMAT( 2X, ' Tested Subroutine: ', A ) 9985 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9984 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9983 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9982 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9981 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9980 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9979 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9978 FORMAT( 2X, 'Testing Summary') 9977 FORMAT( 2X, 'End of Tests.' ) 9976 FORMAT( 2X, 'Tests started.' ) 9975 FORMAT( 2X, ' ***** ', A, ' has an incorrect value: ', $ ' BYPASS *****' ) 9974 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PDBLA3TST * END SUBROUTINE PDBLA3TSTINFO( SUMMRY, NOUT, NMAT, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, $ NVAL, KVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, $ CSCAVAL, IAVAL, JAVAL, MBVAL, NBVAL, $ IMBBVAL, MBBVAL, INBBVAL, NBBVAL, $ RSCBVAL, CSCBVAL, IBVAL, JBVAL, $ MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, $ ICVAL, JCVAL, LDVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, NBLOG, LTEST, SOF, $ TEE, IAM, IGAP, IVERB, NPROCS, THRESH, $ ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, NBLOG, $ NGRIDS, NMAT, NOUT, NPROCS REAL THRESH DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), SIDEVAL( LDVAL ), $ TRNAVAL( LDVAL ), TRNBVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCBVAL( LDVAL ), $ CSCCVAL( LDVAL ), IAVAL( LDVAL ), $ IBVAL( LDVAL ), ICVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBBVAL( LDVAL ), $ IMBCVAL( LDVAL ), INBAVAL( LDVAL ), $ INBBVAL( LDVAL ), INBCVAL( LDVAL ), $ JAVAL( LDVAL ), JBVAL( LDVAL ), JCVAL( LDVAL ), $ KVAL( LDVAL ), MAVAL( LDVAL ), MBAVAL( LDVAL ), $ MBBVAL( LDVAL ), MBCVAL( LDVAL ), $ MBVAL( LDVAL ), MCVAL( LDVAL ), MVAL( LDVAL ), $ NAVAL( LDVAL ), NBAVAL( LDVAL ), $ NBBVAL( LDVAL ), NBCVAL( LDVAL ), $ NBVAL( LDVAL ), NCVAL( LDVAL ), NVAL( LDVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCBVAL( LDVAL ), $ RSCCVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PDBLA3TSTINFO get the needed startup information for testing various * Level 3 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * SIDEVAL (global output) CHARACTER array * On entry, SIDEVAL is an array of dimension LDVAL. On exit, * this array contains the values of SIDE to run the code with. * * TRNAVAL (global output) CHARACTER array * On entry, TRNAVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSA to run the code * with. * * TRNBVAL (global output) CHARACTER array * On entry, TRNBVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSB to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * KVAL (global output) INTEGER array * On entry, KVAL is an array of dimension LDVAL. On exit, this * array contains the values of K to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MBVAL (global output) INTEGER array * On entry, MBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( M_ ) to run the code * with. * * NBVAL (global output) INTEGER array * On entry, NBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( N_ ) to run the code * with. * * IMBBVAL (global output) INTEGER array * On entry, IMBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( IMB_ ) to run the * code with. * * MBBVAL (global output) INTEGER array * On entry, MBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( MB_ ) to run the * code with. * * INBBVAL (global output) INTEGER array * On entry, INBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( INB_ ) to run the * code with. * * NBBVAL (global output) INTEGER array * On entry, NBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( NB_ ) to run the * code with. * * RSCBVAL (global output) INTEGER array * On entry, RSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( RSRC_ ) to run the * code with. * * CSCBVAL (global output) INTEGER array * On entry, CSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( CSRC_ ) to run the * code with. * * IBVAL (global output) INTEGER array * On entry, IBVAL is an array of dimension LDVAL. On exit, this * array contains the values of IB to run the code with. * * JBVAL (global output) INTEGER array * On entry, JBVAL is an array of dimension LDVAL. On exit, this * array contains the values of JB to run the code with. * * MCVAL (global output) INTEGER array * On entry, MCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( M_ ) to run the code * with. * * NCVAL (global output) INTEGER array * On entry, NCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( N_ ) to run the code * with. * * IMBCVAL (global output) INTEGER array * On entry, IMBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( IMB_ ) to run the * code with. * * MBCVAL (global output) INTEGER array * On entry, MBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( MB_ ) to run the * code with. * * INBCVAL (global output) INTEGER array * On entry, INBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( INB_ ) to run the * code with. * * NBCVAL (global output) INTEGER array * On entry, NBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( NB_ ) to run the * code with. * * RSCCVAL (global output) INTEGER array * On entry, RSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( RSRC_ ) to run the * code with. * * CSCCVAL (global output) INTEGER array * On entry, CSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( CSRC_ ) to run the * code with. * * ICVAL (global output) INTEGER array * On entry, ICVAL is an array of dimension LDVAL. On exit, this * array contains the values of IC to run the code with. * * JCVAL (global output) INTEGER array * On entry, JCVAL is an array of dimension LDVAL. On exit, this * array contains the values of JC to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, SIDE, TRANSA, TRANSB, UPLO, * M, N, K, DESCA(:), IA, JA, DESCB(:), IB, JB, DESCC(:), IC, * JC. This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 3 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * THRESH (global output) REAL * On exit, THRESH specifies the threshhold value for the test * ratio. * * ALPHA (global output) DOUBLE PRECISION * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) DOUBLE PRECISION * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+38*NMAT+NSUBS+4 ) with NSUBS equal to 8. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J DOUBLE PRECISION EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, DGEBR2D, DGEBS2D, $ ICOPY, IGEBR2D, IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PDBLAS3TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Read the threshold value for test ratio * READ( NIN, FMT = * ) THRESH IF( THRESH.LT.0.0 ) $ THRESH = 16.0 * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( SIDEVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( KVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( ICVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JCVAL ( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL DGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( SIDEVAL( J ) ) WORK( I+2 ) = ICHAR( TRNAVAL( J ) ) WORK( I+3 ) = ICHAR( TRNBVAL( J ) ) WORK( I+4 ) = ICHAR( UPLOVAL( J ) ) I = I + 5 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, KVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, ICVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JCVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 3 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real double precision '// $ 'Level 3 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9979 ) NBLOG WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9980 ) THRESH WRITE( NOUT, FMT = 9982 ) ALPHA WRITE( NOUT, FMT = 9981 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL DGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 38*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) SIDEVAL( J ) = CHAR( WORK( I+1 ) ) TRNAVAL( J ) = CHAR( WORK( I+2 ) ) TRNBVAL( J ) = CHAR( WORK( I+3 ) ) UPLOVAL( J ) = CHAR( WORK( I+4 ) ) I = I + 5 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, KVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, ICVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JCVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : ', G16.6 ) 9981 FORMAT( 2X, 'Beta : ', G16.6 ) 9980 FORMAT( 2X, 'Threshold value : ', G16.6 ) 9979 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PDBLA3TSTINFO * END SUBROUTINE PDBLAS3TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PDBLAS3TSTCHKE tests the error exits of the Level 3 PBLAS. * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 7 (NSUBS). * If LTEST( 1 ) is .TRUE., PDGEMM will be tested; * If LTEST( 2 ) is .TRUE., PDSYMM will be tested; * If LTEST( 3 ) is .TRUE., PDSYRK will be tested; * If LTEST( 4 ) is .TRUE., PDSYR2K will be tested; * If LTEST( 5 ) is .TRUE., PDTRMM will be tested; * If LTEST( 6 ) is .TRUE., PDTRSM will be tested; * If LTEST( 7 ) is .TRUE., PDGEADD will be tested; * If LTEST( 8 ) is .TRUE., PDTRADD will be tested; * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PDDIMEE, PDGEADD, PDGEMM, $ PDMATEE, PDOPTEE, PDSYMM, PDSYR2K, PDSYRK, $ PDTRADD, PDTRMM, PDTRSM * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/31, 32, 33, 35, 38, 38, 39, 40/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PDGEMM * I = 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDGEMM, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDGEMM, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDGEMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PDSYMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDSYMM, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDSYMM, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDSYMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PDSYRK * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDSYRK, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDSYRK, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDSYRK, SCODE( I ), SNAMES( I ) ) END IF * * Test PDSYR2K * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDSYR2K, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDSYR2K, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDSYR2K, SCODE( I ), SNAMES( I ) ) END IF * * Test PDTRMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDTRMM, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDTRMM, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDTRMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PDTRSM * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDTRSM, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDTRSM, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDTRSM, SCODE( I ), SNAMES( I ) ) END IF * * Test PDGEADD * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDGEADD, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDGEADD, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDGEADD, SCODE( I ), SNAMES( I ) ) END IF * * Test PDTRADD * I = I + 1 IF( LTEST( I ) ) THEN CALL PDOPTEE( ICTXT, NOUT, PDTRADD, SCODE( I ), SNAMES( I ) ) CALL PDDIMEE( ICTXT, NOUT, PDTRADD, SCODE( I ), SNAMES( I ) ) CALL PDMATEE( ICTXT, NOUT, PDTRADD, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PDBLAS3TSTCHKE * END SUBROUTINE PDCHKARG3( ICTXT, NOUT, SNAME, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, IA, JA, $ DESCA, IB, JB, DESCB, BETA, IC, JC, DESCC, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N, $ NOUT DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*7 SNAME INTEGER DESCA( * ), DESCB( * ), DESCC( * ) * .. * * Purpose * ======= * * PDCHKARG3 checks the input-only arguments of the Level 3 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies the SIDE option in the Level 3 PBLAS * operation. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the UPLO option in the Level 3 PBLAS * operation. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the TRANSA option in the Level 3 * PBLAS operation. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies the TRANSB option in the Level 3 * PBLAS operation. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies the DIAG option in the Level 3 PBLAS * operation. * * M (global input) INTEGER * On entry, M specifies the dimension of the submatrix ope- * rands. * * N (global input) INTEGER * On entry, N specifies the dimension of the submatrix ope- * rands. * * K (global input) INTEGER * On entry, K specifies the dimension of the submatrix ope- * rands. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 DIAGREF, SIDEREF, TRANSAREF, TRANSBREF, UPLOREF INTEGER I, IAREF, IBREF, ICREF, JAREF, JBREF, JCREF, $ KREF, MREF, MYCOL, MYROW, NPCOL, NPROW, NREF DOUBLE PRECISION ALPHAREF, BETAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCAREF( DLEN_ ), DESCBREF( DLEN_ ), $ DESCCREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * DIAGREF = DIAG SIDEREF = SIDE TRANSAREF = TRANSA TRANSBREF = TRANSB UPLOREF = UPLO MREF = M NREF = N KREF = K ALPHAREF = ALPHA IAREF = IA JAREF = JA DO 10 I = 1, DLEN_ DESCAREF( I ) = DESCA( I ) 10 CONTINUE IBREF = IB JBREF = JB DO 20 I = 1, DLEN_ DESCBREF( I ) = DESCB( I ) 20 CONTINUE BETAREF = BETA ICREF = IC JCREF = JC DO 30 I = 1, DLEN_ DESCCREF( I ) = DESCC( I ) 30 CONTINUE * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( .NOT. LSAME( DIAG, DIAGREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DIAG' ELSE IF( .NOT. LSAME( SIDE, SIDEREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'SIDE' ELSE IF( .NOT. LSAME( TRANSA, TRANSAREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANSA' ELSE IF( .NOT. LSAME( TRANSB, TRANSBREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANSB' ELSE IF( .NOT. LSAME( UPLO, UPLOREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'UPLO' ELSE IF( M.NE.MREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'M' ELSE IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( K.NE.KREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'K' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE IF( IA.NE.IAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IA' ELSE IF( JA.NE.JAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JA' ELSE IF( DESCA( DTYPE_ ).NE.DESCAREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( DTYPE_ )' ELSE IF( DESCA( M_ ).NE.DESCAREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( M_ )' ELSE IF( DESCA( N_ ).NE.DESCAREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( N_ )' ELSE IF( DESCA( IMB_ ).NE.DESCAREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( IMB_ )' ELSE IF( DESCA( INB_ ).NE.DESCAREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( INB_ )' ELSE IF( DESCA( MB_ ).NE.DESCAREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( MB_ )' ELSE IF( DESCA( NB_ ).NE.DESCAREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( NB_ )' ELSE IF( DESCA( RSRC_ ).NE.DESCAREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( RSRC_ )' ELSE IF( DESCA( CSRC_ ).NE.DESCAREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CSRC_ )' ELSE IF( DESCA( CTXT_ ).NE.DESCAREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CTXT_ )' ELSE IF( DESCA( LLD_ ).NE.DESCAREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( LLD_ )' ELSE IF( IB.NE.IBREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IB' ELSE IF( JB.NE.JBREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JB' ELSE IF( DESCB( DTYPE_ ).NE.DESCBREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( DTYPE_ )' ELSE IF( DESCB( M_ ).NE.DESCBREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( M_ )' ELSE IF( DESCB( N_ ).NE.DESCBREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( N_ )' ELSE IF( DESCB( IMB_ ).NE.DESCBREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( IMB_ )' ELSE IF( DESCB( INB_ ).NE.DESCBREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( INB_ )' ELSE IF( DESCB( MB_ ).NE.DESCBREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( MB_ )' ELSE IF( DESCB( NB_ ).NE.DESCBREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( NB_ )' ELSE IF( DESCB( RSRC_ ).NE.DESCBREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( RSRC_ )' ELSE IF( DESCB( CSRC_ ).NE.DESCBREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( CSRC_ )' ELSE IF( DESCB( CTXT_ ).NE.DESCBREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( CTXT_ )' ELSE IF( DESCB( LLD_ ).NE.DESCBREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( LLD_ )' ELSE IF( BETA.NE.BETAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'BETA' ELSE IF( IC.NE.ICREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IC' ELSE IF( JC.NE.JCREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JC' ELSE IF( DESCC( DTYPE_ ).NE.DESCCREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( DTYPE_ )' ELSE IF( DESCC( M_ ).NE.DESCCREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( M_ )' ELSE IF( DESCC( N_ ).NE.DESCCREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( N_ )' ELSE IF( DESCC( IMB_ ).NE.DESCCREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( IMB_ )' ELSE IF( DESCC( INB_ ).NE.DESCCREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( INB_ )' ELSE IF( DESCC( MB_ ).NE.DESCCREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( MB_ )' ELSE IF( DESCC( NB_ ).NE.DESCCREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( NB_ )' ELSE IF( DESCC( RSRC_ ).NE.DESCCREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( RSRC_ )' ELSE IF( DESCC( CSRC_ ).NE.DESCCREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( CSRC_ )' ELSE IF( DESCC( CTXT_ ).NE.DESCCREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( CTXT_ )' ELSE IF( DESCC( LLD_ ).NE.DESCCREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( LLD_ )' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.NE.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PDCHKARG3 * END SUBROUTINE PDBLAS3TSTCHK( ICTXT, NOUT, NROUT, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, A, PA, IA, $ JA, DESCA, B, PB, IB, JB, DESCB, BETA, $ C, PC, IC, JC, DESCC, THRESH, ROGUE, $ WORK, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N, $ NOUT, NROUT REAL THRESH DOUBLE PRECISION ALPHA, BETA, ROGUE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) DOUBLE PRECISION A( * ), B( * ), C( * ), PA( * ), PB( * ), $ PC( * ), WORK( * ) * .. * * Purpose * ======= * * PDBLAS3TSTCHK performs the computational tests of the Level 3 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PDGEMM will be tested; * else if NROUT = 2, PDSYMM will be tested; * else if NROUT = 3, PDSYRK will be tested; * else if NROUT = 4, PDSYR2K will be tested; * else if NROUT = 5, PDTRMM will be tested; * else if NROUT = 6, PDTRSM will be tested; * else if NROUT = 7, PDGEADD will be tested; * else if NROUT = 8, PDTRADD will be tested; * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies if the multiplication should be per- * formed from the left or the right. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies if the upper or lower part of the * matrix operand is to be referenced. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies if the matrix operand A is to be * transposed. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies if the matrix operand B is to be * transposed. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the triangular matrix operand is * unit or non-unit. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANSA = 'N' (resp. TRANSA <> 'N') in PxGEMM, PxSYRK, * PxSYR2K, PxHERK and PxHER2K. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) DOUBLE PRECISION array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) DOUBLE PRECISION array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * PB (local input) DOUBLE PRECISION array * On entry, PB is an array of dimension (DESCB( LLD_ ),*). This * array contains the local entries of the matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) DOUBLE PRECISION array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * THRESH (global input) REAL * On entry, THRESH is the threshold value for the test ratio. * * ROGUE (global input) DOUBLE PRECISION * On entry, ROGUE specifies the constant used to pad the * non-referenced part of triangular or symmetric matrices. * * WORK (workspace) DOUBLE PRECISION array * On entry, WORK is an array of dimension LWORK where LWORK is * at least 2*MAX( M, MAX( N, K ) ). This array is used to store * a copy of a column of C and the computed gauges (see PDMMCH). * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on A has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on B has been found, * if( MOD( INFO/4, 2 ) = 1 ) then an error on C has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, IPG, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR * .. * .. Local Arrays .. INTEGER IERR( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DTRSM, PB_DLASET, PDCHKMIN, $ PDMMCH, PDMMCH1, PDMMCH2, PDMMCH3, PDTRMM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 10 I = 1, 3 IERR( I ) = 0 10 CONTINUE IPG = MAX( M, MAX( N, K ) ) + 1 * IF( NROUT.EQ.1 ) THEN * * Test PDGEMM * * Check the resulting matrix C * CALL PDMMCH( ICTXT, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, $ DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, JC, $ DESCC, WORK, WORK( IPG ), ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PDCHKMIN( ERR, M, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PDCHKMIN( ERR, K, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF IF( LSAME( TRANSB, 'N' ) ) THEN CALL PDCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PDCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.2 ) THEN * * Test PDSYMM * * Check the resulting matrix C * IF( LSAME( SIDE, 'L' ) ) THEN CALL PDMMCH( ICTXT, 'No transpose', 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, $ BETA, C, PC, IC, JC, DESCC, WORK, WORK( IPG ), $ ERR, IERR( 3 ) ) ELSE CALL PDMMCH( ICTXT, 'No transpose', 'No transpose', M, N, N, $ ALPHA, B, IB, JB, DESCB, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, WORK, WORK( IPG ), $ ERR, IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_DLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_DLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF * IF( LSAME( SIDE, 'L' ) ) THEN CALL PDCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PDCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF CALL PDCHKMIN( ERR, M, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) * ELSE IF( NROUT.EQ.3 ) THEN * * Test PDSYRK * * Check the resulting matrix C * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PDMMCH1( ICTXT, UPLO, 'No transpose', N, K, ALPHA, A, $ IA, JA, DESCA, BETA, C, PC, IC, JC, DESCC, $ WORK, WORK( IPG ), ERR, IERR( 3 ) ) ELSE CALL PDMMCH1( ICTXT, UPLO, 'Transpose', N, K, ALPHA, A, IA, $ JA, DESCA, BETA, C, PC, IC, JC, DESCC, WORK, $ WORK( IPG ), ERR, IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PDCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PDCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.4 ) THEN * * Test PDSYR2K * * Check the resulting matrix C * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PDMMCH2( ICTXT, UPLO, 'No transpose', N, K, ALPHA, A, $ IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, $ IC, JC, DESCC, WORK, WORK( IPG ), ERR, $ IERR( 3 ) ) ELSE CALL PDMMCH2( ICTXT, UPLO, 'Transpose', N, K, ALPHA, A, $ IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, $ IC, JC, DESCC, WORK, WORK( IPG ), ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PDCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PDCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PDCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PDCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.5 ) THEN * * Test PDTRMM * * Check the resulting matrix B * IF( LSAME( SIDE, 'L' ) ) THEN CALL PDMMCH( ICTXT, TRANSA, 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, C, IB, JB, DESCB, $ ZERO, B, PB, IB, JB, DESCB, WORK, $ WORK( IPG ), ERR, IERR( 2 ) ) ELSE CALL PDMMCH( ICTXT, 'No transpose', TRANSA, M, N, N, $ ALPHA, C, IB, JB, DESCB, A, IA, JA, DESCA, $ ZERO, B, PB, IB, JB, DESCB, WORK, $ WORK( IPG ), ERR, IERR( 2 ) ) END IF * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( SIDE, 'L' ) ) THEN IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Upper', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Lower', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PDCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PDCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.6 ) THEN * * Test PDTRSM * * Check the resulting matrix B * CALL DTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ), $ B( IB+(JB-1)*DESCB( M_ ) ), DESCB( M_ ) ) CALL PDTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, PA, IA, JA, $ DESCA, PB, IB, JB, DESCB ) IF( LSAME( SIDE, 'L' ) ) THEN CALL PDMMCH( ICTXT, TRANSA, 'No transpose', M, N, M, ALPHA, $ A, IA, JA, DESCA, B, IB, JB, DESCB, ZERO, C, $ PB, IB, JB, DESCB, WORK, WORK( IPG ), ERR, $ IERR( 2 ) ) ELSE CALL PDMMCH( ICTXT, 'No transpose', TRANSA, M, N, N, ALPHA, $ B, IB, JB, DESCB, A, IA, JA, DESCA, ZERO, C, $ PB, IB, JB, DESCB, WORK, WORK( IPG ), ERR, $ IERR( 2 ) ) END IF * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( SIDE, 'L' ) ) THEN IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Upper', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Lower', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PDCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_DLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_DLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PDCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF ELSE IF( NROUT.EQ.7 ) THEN * * Test PDGEADD * * Check the resulting matrix C * CALL PDMMCH3( 'All', TRANSA, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, IERR( 3 ) ) * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PDCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PDCHKMIN( ERR, N, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.8 ) THEN * * Test PDTRADD * * Check the resulting matrix C * CALL PDMMCH3( UPLO, TRANSA, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, IERR( 3 ) ) * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PDCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PDCHKMIN( ERR, N, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * END IF * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'A' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'B' END IF * IF( IERR( 3 ).NE.0 ) THEN INFO = INFO + 4 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'C' END IF * 9999 FORMAT( 2X, ' ***** ERROR: Matrix operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** FATAL ERROR - Computed result is less ', $ 'than half accurate *****' ) 9997 FORMAT( 2X, ' ***** Test completed with maximum test ratio: ', $ F11.5, ' SUSPECT *****' ) * RETURN * * End of PDBLAS3TSTCHK * END scalapack-2.0.2/PBLAS/TESTING/pdblastst.f000644 000766 000024 00001356240 11622500733 020050 0ustar00juliestaff000000 000000 SUBROUTINE PDOPTEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PDOPTEE tests whether the PBLAS respond correctly to a bad option * argument. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PDCHKOPT * .. * .. Executable Statements .. * * Level 2 PBLAS * IF( SCODE.EQ.21 ) THEN * * Check 1st (and only) option * APOS = 1 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.22 .OR. SCODE.EQ.25 .OR. SCODE.EQ.26 .OR. $ SCODE.EQ.27 ) THEN * * Check 1st (and only) option * APOS = 1 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st option * APOS = 1 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 2nd option * APOS = 2 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 3rd option * APOS = 3 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'D', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st option * APOS = 1 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2'nd option * APOS = 2 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * ELSE IF( SCODE.EQ.32 ) THEN * * Check 1st option * APOS = 1 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'S', APOS ) * * Check 2nd option * APOS = 2 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 .OR. SCODE.EQ.35 .OR. $ SCODE.EQ.36 .OR. SCODE.EQ.40 ) THEN * * Check 1st option * APOS = 1 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 2'nd option * APOS = 2 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st option * APOS = 1 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'S', APOS ) * * Check 2nd option * APOS = 2 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 3rd option * APOS = 3 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 4th option * APOS = 4 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'D', APOS ) * * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st option * APOS = 1 CALL PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * END IF * RETURN * * End of PDOPTEE * END SUBROUTINE PDCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PDCHKOPT tests the option ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the option to be * checked. ARGNAM can either be 'D', 'S', 'A', 'B', or 'U'. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the option ARGNAM * to be tested. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER INFOT * .. * .. External Subroutines .. EXTERNAL PCHKPBE, PDCALLSUB, PDSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. CHARACTER DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO * .. * .. Executable Statements .. * * Reiniatilize the dummy arguments to correct values * CALL PDSETPBLAS( ICTXT ) * IF( LSAME( ARGNAM, 'D' ) ) THEN * * Generate bad DIAG option * DIAG = '/' * ELSE IF( LSAME( ARGNAM, 'S' ) ) THEN * * Generate bad SIDE option * SIDE = '/' * ELSE IF( LSAME( ARGNAM, 'A' ) ) THEN * * Generate bad TRANSA option * TRANSA = '/' * ELSE IF( LSAME( ARGNAM, 'B' ) ) THEN * * Generate bad TRANSB option * TRANSB = '/' * ELSE IF( LSAME( ARGNAM, 'U' ) ) THEN * * Generate bad UPLO option * UPLO = '/' * END IF * * Set INFOT to the position of the bad dimension argument * INFOT = ARGPOS * * Call the PBLAS routine * CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * RETURN * * End of PDCHKOPT * END SUBROUTINE PDDIMEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PDDIMEE tests whether the PBLAS respond correctly to a bad dimension * argument. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PDCHKDIM * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 .OR. SCODE.EQ.12 .OR. SCODE.EQ.13 .OR. $ SCODE.EQ.14 .OR. SCODE.EQ.15 ) THEN * * Check 1st (and only) dimension * APOS = 1 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * * Check 1st dimension * APOS = 2 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 3 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.22 .OR. SCODE.EQ.25 .OR. SCODE.EQ.26 .OR. $ SCODE.EQ.27 ) THEN * * Check 1st (and only) dimension * APOS = 2 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st (and only) dimension * APOS = 4 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.24 ) THEN * * Check 1st dimension * APOS = 1 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 2 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st dimension * APOS = 3 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Check 3rd dimension * APOS = 5 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'K', APOS ) * ELSE IF( SCODE.EQ.32 ) THEN * * Check 1st dimension * APOS = 3 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 .OR. SCODE.EQ.35 .OR. $ SCODE.EQ.36 ) THEN * * Check 1st dimension * APOS = 3 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Check 2nd dimension * APOS = 4 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'K', APOS ) * ELSE IF( SCODE.EQ.37 ) THEN * * Check 1st dimension * APOS = 1 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 2 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st dimension * APOS = 5 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 6 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st dimension * APOS = 2 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 3 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.40 ) THEN * * Check 1st dimension * APOS = 3 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * END IF * RETURN * * End of PDDIMEE * END SUBROUTINE PDCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PDCHKDIM tests the dimension ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the dimension to be * checked. ARGNAM can either be 'M', 'N' or 'K'. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the option ARGNAM * to be tested. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER INFOT * .. * .. External Subroutines .. EXTERNAL PCHKPBE, PDCALLSUB, PDSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. INTEGER KDIM, MDIM, NDIM COMMON /PBLASN/KDIM, MDIM, NDIM * .. * .. Executable Statements .. * * Reiniatilize the dummy arguments to correct values * CALL PDSETPBLAS( ICTXT ) * IF( LSAME( ARGNAM, 'M' ) ) THEN * * Generate bad MDIM * MDIM = -1 * ELSE IF( LSAME( ARGNAM, 'N' ) ) THEN * * Generate bad NDIM * NDIM = -1 * ELSE * * Generate bad KDIM * KDIM = -1 * END IF * * Set INFOT to the position of the bad dimension argument * INFOT = ARGPOS * * Call the PBLAS routine * CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * RETURN * * End of PDCHKDIM * END SUBROUTINE PDVECEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*7 SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PDVECEE tests whether the PBLAS respond correctly to a bad vector * argument. Each vector is described by: , I, J, * DESC, INC. Out of all these, only I, J, * DESC, and INC can be tested. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PDCHKMAT * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 ) THEN * * Check 1st vector * APOS = 2 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 7 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.12 .OR. SCODE.EQ.15 ) THEN * * Check 1st (and only) vector * APOS = 3 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * ELSE IF( SCODE.EQ.13 ) THEN * * Check 1st vector * APOS = 3 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 8 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.14 ) THEN * * Check 1st (and only) vector * APOS = 4 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * * Check 1st vector * APOS = 9 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 15 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.22 ) THEN * * Check 1st vector * APOS = 8 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 14 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st (and only) vector * APOS = 9 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * ELSE IF( SCODE.EQ.24 .OR. SCODE.EQ.27 ) THEN * * Check 1st vector * APOS = 4 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 9 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.26 .OR. SCODE.EQ.27 ) THEN * * Check 1'st (and only) vector * APOS = 4 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * END IF * RETURN * * End of PDVECEE * END SUBROUTINE PDMATEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*7 SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PDMATEE tests whether the PBLAS respond correctly to a bad matrix * argument. Each matrix is described by: , I, J, * and DESC. Out of all these, only I, J and DESC * can be tested. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PDCHKMAT * .. * .. Executable Statements .. * * Level 2 PBLAS * IF( SCODE.EQ.21 .OR. SCODE.EQ.23 ) THEN * * Check 1st (and only) matrix * APOS = 5 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.22 ) THEN * * Check 1st (and only) matrix * APOS = 4 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.24 .OR. SCODE.EQ.27 ) THEN * * Check 1st (and only) matrix * APOS = 14 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.25 .OR. SCODE.EQ.26 ) THEN * * Check 1st (and only) matrix * APOS = 9 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st matrix * APOS = 7 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * * Check 3nd matrix * APOS = 16 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.32 .OR. SCODE.EQ.35 .OR. SCODE.EQ.36 ) THEN * * Check 1st matrix * APOS = 6 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 10 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * * Check 3nd matrix * APOS = 15 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 ) THEN * * Check 1st matrix * APOS = 6 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.37 ) THEN * * Check 1st matrix * APOS = 4 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 9 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st matrix * APOS = 8 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 12 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st matrix * APOS = 5 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 10 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.40 ) THEN * * Check 1st matrix * APOS = 6 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * END IF * RETURN * * End of PDMATEE * END SUBROUTINE PDSETPBLAS( ICTXT ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT * .. * * Purpose * ======= * * PDSETPBLAS initializes *all* the dummy arguments to correct values. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. External Subroutines .. EXTERNAL PB_DESCSET2 * .. * .. Common Blocks .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY, KDIM, MDIM, NDIM DOUBLE PRECISION USCLR, SCLR INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) DOUBLE PRECISION A( 2, 2 ), B( 2, 2 ), C( 2, 2 ), X( 2 ), Y( 2 ) COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY COMMON /PBLASM/A, B, C COMMON /PBLASN/KDIM, MDIM, NDIM COMMON /PBLASS/SCLR, USCLR COMMON /PBLASV/X, Y * .. * .. Executable Statements .. * * Set default values for options * DIAG = 'N' SIDE = 'L' TRANSA = 'N' TRANSB = 'N' UPLO = 'U' * * Set default values for scalars * KDIM = 1 MDIM = 1 NDIM = 1 ISCLR = 1 SCLR = ONE USCLR = ONE * * Set default values for distributed matrix A * A( 1, 1 ) = ONE A( 2, 1 ) = ONE A( 1, 2 ) = ONE A( 2, 2 ) = ONE IA = 1 JA = 1 CALL PB_DESCSET2( DESCA, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix B * B( 1, 1 ) = ONE B( 2, 1 ) = ONE B( 1, 2 ) = ONE B( 2, 2 ) = ONE IB = 1 JB = 1 CALL PB_DESCSET2( DESCB, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix C * C( 1, 1 ) = ONE C( 2, 1 ) = ONE C( 1, 2 ) = ONE C( 2, 2 ) = ONE IC = 1 JC = 1 CALL PB_DESCSET2( DESCC, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix X * X( 1 ) = ONE X( 2 ) = ONE IX = 1 JX = 1 CALL PB_DESCSET2( DESCX, 2, 1, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) INCX = 1 * * Set default values for distributed matrix Y * Y( 1 ) = ONE Y( 2 ) = ONE IY = 1 JY = 1 CALL PB_DESCSET2( DESCY, 2, 1, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) INCY = 1 * RETURN * * End of PDSETPBLAS * END SUBROUTINE PDCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PDCHKMAT tests the matrix (or vector) ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the matrix or vector * to be checked. ARGNAM can either be 'A', 'B' or 'C' when one * wants to check a matrix, and 'X' or 'Y' for a vector. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the first argument * of the matrix (or vector) ARGNAM. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER DESCMULT PARAMETER ( DESCMULT = 100 ) * .. * .. Local Scalars .. INTEGER I, INFOT, NPROW, NPCOL, MYROW, MYCOL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PCHKPBE, PDCALLSUB, PDSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( ARGNAM, 'A' ) ) THEN * * Check IA. Set all other OK, bad IA * CALL PDSETPBLAS( ICTXT ) IA = -1 INFOT = ARGPOS + 1 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JA. Set all other OK, bad JA * CALL PDSETPBLAS( ICTXT ) JA = -1 INFOT = ARGPOS + 2 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCA. Set all other OK, bad DESCA * DO 10 I = 1, DLEN_ * * Set I'th entry of DESCA to incorrect value, rest ok. * CALL PDSETPBLAS( ICTXT ) DESCA( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCA, CSRCA, LDA * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PDSETPBLAS( ICTXT ) * * Test RSRCA >= NPROW * IF( I.EQ.RSRC_ ) $ DESCA( I ) = NPROW * * Test CSRCA >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCA( I ) = NPCOL * * Test LDA >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCA( I ) = 1 ELSE DESCA( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 10 CONTINUE * ELSE IF( LSAME( ARGNAM, 'B' ) ) THEN * * Check IB. Set all other OK, bad IB * CALL PDSETPBLAS( ICTXT ) IB = -1 INFOT = ARGPOS + 1 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JB. Set all other OK, bad JB * CALL PDSETPBLAS( ICTXT ) JB = -1 INFOT = ARGPOS + 2 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCB. Set all other OK, bad DESCB * DO 20 I = 1, DLEN_ * * Set I'th entry of DESCB to incorrect value, rest ok. * CALL PDSETPBLAS( ICTXT ) DESCB( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCB, CSRCB, LDB * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PDSETPBLAS( ICTXT ) * * Test RSRCB >= NPROW * IF( I.EQ.RSRC_ ) $ DESCB( I ) = NPROW * * Test CSRCB >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCB( I ) = NPCOL * * Test LDB >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCB( I ) = 1 ELSE DESCB( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 20 CONTINUE * ELSE IF( LSAME( ARGNAM, 'C' ) ) THEN * * Check IC. Set all other OK, bad IC * CALL PDSETPBLAS( ICTXT ) IC = -1 INFOT = ARGPOS + 1 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JC. Set all other OK, bad JC * CALL PDSETPBLAS( ICTXT ) JC = -1 INFOT = ARGPOS + 2 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCC. Set all other OK, bad DESCC * DO 30 I = 1, DLEN_ * * Set I'th entry of DESCC to incorrect value, rest ok. * CALL PDSETPBLAS( ICTXT ) DESCC( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCC, CSRCC, LDC * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PDSETPBLAS( ICTXT ) * * Test RSRCC >= NPROW * IF( I.EQ.RSRC_ ) $ DESCC( I ) = NPROW * * Test CSRCC >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCC( I ) = NPCOL * * Test LDC >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCC( I ) = 1 ELSE DESCC( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 30 CONTINUE * ELSE IF( LSAME( ARGNAM, 'X' ) ) THEN * * Check IX. Set all other OK, bad IX * CALL PDSETPBLAS( ICTXT ) IX = -1 INFOT = ARGPOS + 1 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JX. Set all other OK, bad JX * CALL PDSETPBLAS( ICTXT ) JX = -1 INFOT = ARGPOS + 2 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCX. Set all other OK, bad DESCX * DO 40 I = 1, DLEN_ * * Set I'th entry of DESCX to incorrect value, rest ok. * CALL PDSETPBLAS( ICTXT ) DESCX( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCX, CSRCX, LDX * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PDSETPBLAS( ICTXT ) * * Test RSRCX >= NPROW * IF( I.EQ.RSRC_ ) $ DESCX( I ) = NPROW * * Test CSRCX >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCX( I ) = NPCOL * * Test LDX >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCX( I ) = 1 ELSE DESCX( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 40 CONTINUE * * Check INCX. Set all other OK, bad INCX * CALL PDSETPBLAS( ICTXT ) INCX = -1 INFOT = ARGPOS + 4 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * ELSE * * Check IY. Set all other OK, bad IY * CALL PDSETPBLAS( ICTXT ) IY = -1 INFOT = ARGPOS + 1 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JY. Set all other OK, bad JY * CALL PDSETPBLAS( ICTXT ) JY = -1 INFOT = ARGPOS + 2 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCY. Set all other OK, bad DESCY * DO 50 I = 1, DLEN_ * * Set I'th entry of DESCY to incorrect value, rest ok. * CALL PDSETPBLAS( ICTXT ) DESCY( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCY, CSRCY, LDY * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PDSETPBLAS( ICTXT ) * * Test RSRCY >= NPROW * IF( I.EQ.RSRC_ ) $ DESCY( I ) = NPROW * * Test CSRCY >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCY( I ) = NPCOL * * Test LDY >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCY( I ) = 1 ELSE DESCY( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 50 CONTINUE * * Check INCY. Set all other OK, bad INCY * CALL PDSETPBLAS( ICTXT ) INCY = -1 INFOT = ARGPOS + 4 CALL PDCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * RETURN * * End of PDCHKMAT * END SUBROUTINE PDCALLSUB( SUBPTR, SCODE ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER SCODE * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PDCALLSUB calls the subroutine SUBPTR with the calling sequence iden- * tified by SCODE. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Common Blocks .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY, KDIM, MDIM, NDIM DOUBLE PRECISION USCLR, SCLR INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) DOUBLE PRECISION A( 2, 2 ), B( 2, 2 ), C( 2, 2 ), X( 2 ), Y( 2 ) COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY COMMON /PBLASM/A, B, C COMMON /PBLASN/KDIM, MDIM, NDIM COMMON /PBLASS/SCLR, USCLR COMMON /PBLASV/X, Y * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 ) THEN * CALL SUBPTR( NDIM, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, $ INCY ) * ELSE IF( SCODE.EQ.12 ) THEN * CALL SUBPTR( NDIM, SCLR, X, IX, JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.13 ) THEN * CALL SUBPTR( NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, JY, $ DESCY, INCY ) * ELSE IF( SCODE.EQ.14 ) THEN * CALL SUBPTR( NDIM, SCLR, ISCLR, X, IX, JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.15 ) THEN * CALL SUBPTR( NDIM, USCLR, X, IX, JX, DESCX, INCX ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * CALL SUBPTR( TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX, SCLR, Y, IY, JY, DESCY, INCY ) * ELSE IF( SCODE.EQ.22 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, A, IA, JA, DESCA, X, IX, JX, $ DESCX, INCX, SCLR, Y, IY, JY, DESCY, INCY ) * ELSE IF( SCODE.EQ.23 ) THEN * CALL SUBPTR( UPLO, TRANSA, DIAG, NDIM, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.24 ) THEN * CALL SUBPTR( MDIM, NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, $ JY, DESCY, INCY, A, IA, JA, DESCA ) * ELSE IF( SCODE.EQ.25 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, X, IX, JX, DESCX, INCX, A, IA, $ JA, DESCA ) * ELSE IF( SCODE.EQ.26 ) THEN * CALL SUBPTR( UPLO, NDIM, USCLR, X, IX, JX, DESCX, INCX, A, IA, $ JA, DESCA ) * ELSE IF( SCODE.EQ.27 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, $ JY, DESCY, INCY, A, IA, JA, DESCA ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * CALL SUBPTR( TRANSA, TRANSB, MDIM, NDIM, KDIM, SCLR, A, IA, JA, $ DESCA, B, IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.32 ) THEN * CALL SUBPTR( SIDE, UPLO, MDIM, NDIM, SCLR, A, IA, JA, DESCA, B, $ IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.33 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.34 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, USCLR, A, IA, JA, DESCA, $ USCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.35 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ B, IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.36 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ B, IB, JB, DESCB, USCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.37 ) THEN * CALL SUBPTR( MDIM, NDIM, SCLR, A, IA, JA, DESCA, SCLR, C, IC, $ JC, DESCC ) * ELSE IF( SCODE.EQ.38 ) THEN * CALL SUBPTR( SIDE, UPLO, TRANSA, DIAG, MDIM, NDIM, SCLR, A, IA, $ JA, DESCA, B, IB, JB, DESCB ) * ELSE IF( SCODE.EQ.39 ) THEN * CALL SUBPTR( TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, SCLR, $ C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.40 ) THEN * CALL SUBPTR( UPLO, TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, $ SCLR, C, IC, JC, DESCC ) * END IF * RETURN * * End of PDCALLSUB * END SUBROUTINE PDERRSET( ERR, ERRMAX, XTRUE, X ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION ERR, ERRMAX, X, XTRUE * .. * * Purpose * ======= * * PDERRSET computes the absolute difference ERR = |XTRUE - X| and com- * pares it with zero. ERRMAX accumulates the absolute error difference. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERR (local output) DOUBLE PRECISION * On exit, ERR specifies the absolute difference |XTRUE - X|. * * ERRMAX (local input/local output) DOUBLE PRECISION * On entry, ERRMAX specifies a previously computed error. On * exit ERRMAX is the accumulated error MAX( ERRMAX, ERR ). * * XTRUE (local input) DOUBLE PRECISION * On entry, XTRUE specifies the true value. * * X (local input) DOUBLE PRECISION * On entry, X specifies the value to be compared to XTRUE. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. DOUBLE PRECISION PDDIFF EXTERNAL PDDIFF * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * ERR = ABS( PDDIFF( XTRUE, X ) ) * ERRMAX = MAX( ERRMAX, ERR ) * RETURN * * End of PDERRSET * END SUBROUTINE PDCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INFO, IX, JX, N DOUBLE PRECISION ERRMAX * .. * .. Array Arguments .. INTEGER DESCX( * ) DOUBLE PRECISION PX( * ), X( * ) * .. * * Purpose * ======= * * PDCHKVIN checks that the submatrix sub( PX ) remained unchanged. The * local array entries are compared element by element, and their dif- * ference is tested against 0.0 as well as the epsilon machine. Notice * that this difference should be numerically exactly the zero machine, * but because of the possible fluctuation of some of the data we flag- * ged differently a difference less than twice the epsilon machine. The * largest error is also returned. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERRMAX (global output) DOUBLE PRECISION * On exit, ERRMAX specifies the largest absolute element-wise * difference between sub( X ) and sub( PX ). * * N (global input) INTEGER * On entry, N specifies the length of the subvector operand * sub( X ). N must be at least zero. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) DOUBLE PRECISION array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, ICURROW, IIX, IN, IXCOL, $ IXROW, J, JB, JJX, JN, KK, LDPX, LDX, LL, $ MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR, EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PB_INFOG2L, PDERRSET * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) * LDX = DESCX( M_ ) LDPX = DESCX( LLD_ ) ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( N.EQ.1 ) THEN * IF( ( MYROW.EQ.IXROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.IXCOL .OR. COLREP ) ) $ CALL PDERRSET( ERR, ERRMAX, X( IX+(JX-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) * ELSE IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * IF( MYROW.EQ.IXROW .OR. ROWREP ) THEN * ICURCOL = IXCOL IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN DO 10 J = JX, JN CALL PDERRSET( ERR, ERRMAX, X( IX+(J-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) JJX = JJX + 1 10 CONTINUE END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 30 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * DO 20 KK = 0, JB-1 CALL PDERRSET( ERR, ERRMAX, X( IX+(J+KK-1)*LDX ), $ PX( IIX+(JJX+KK-1)*LDPX ) ) 20 CONTINUE * JJX = JJX + JB * END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 30 CONTINUE * END IF * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * IF( MYCOL.EQ.IXCOL .OR. COLREP ) THEN * ICURROW = IXROW IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 40 I = IX, IN CALL PDERRSET( ERR, ERRMAX, X( I+(JX-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) IIX = IIX + 1 40 CONTINUE END IF ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * DO 50 KK = 0, IB-1 CALL PDERRSET( ERR, ERRMAX, X( I+KK+(JX-1)*LDX ), $ PX( IIX+KK+(JJX-1)*LDPX ) ) 50 CONTINUE * IIX = IIX + IB * END IF * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PDCHKVIN * END SUBROUTINE PDCHKVOUT( N, X, PX, IX, JX, DESCX, INCX, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INFO, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) DOUBLE PRECISION PX( * ), X( * ) * .. * * Purpose * ======= * * PDCHKVOUT checks that the matrix PX \ sub( PX ) remained unchanged. * The local array entries are compared element by element, and their * difference is tested against 0.0 as well as the epsilon machine. No- * tice that this difference should be numerically exactly the zero ma- * chine, but because of the possible movement of some of the data we * flagged differently a difference less than twice the epsilon machine. * The largest error is reported. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector operand * sub( X ). N must be at least zero. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) DOUBLE PRECISION array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, ICURROW, II, IMBX, INBX, $ J, JB, JJ, KK, LDPX, LDX, LL, MBX, MPALL, $ MYCOL, MYCOLDIST, MYROW, MYROWDIST, NBX, NPCOL, $ NPROW, NQALL DOUBLE PRECISION EPS, ERR, ERRMAX * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PDERRSET * .. * .. External Functions .. INTEGER PB_NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( ( DESCX( M_ ).LE.0 ).OR.( DESCX( N_ ).LE.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * MPALL = PB_NUMROC( DESCX( M_ ), 1, DESCX( IMB_ ), DESCX( MB_ ), $ MYROW, DESCX( RSRC_ ), NPROW ) NQALL = PB_NUMROC( DESCX( N_ ), 1, DESCX( INB_ ), DESCX( NB_ ), $ MYCOL, DESCX( CSRC_ ), NPCOL ) * MBX = DESCX( MB_ ) NBX = DESCX( NB_ ) LDX = DESCX( M_ ) LDPX = DESCX( LLD_ ) ICURROW = DESCX( RSRC_ ) ICURCOL = DESCX( CSRC_ ) ROWREP = ( ICURROW.EQ.-1 ) COLREP = ( ICURCOL.EQ.-1 ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN IMBX = DESCX( IMB_ ) ELSE IMBX = MBX END IF IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN INBX = DESCX( INB_ ) ELSE INBX = NBX END IF IF( ROWREP ) THEN MYROWDIST = 0 ELSE MYROWDIST = MOD( MYROW - ICURROW + NPROW, NPROW ) END IF IF( COLREP ) THEN MYCOLDIST = 0 ELSE MYCOLDIST = MOD( MYCOL - ICURCOL + NPCOL, NPCOL ) END IF II = 1 JJ = 1 * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * I = 1 IF( MYCOLDIST.EQ.0 ) THEN J = 1 ELSE J = DESCX( INB_ ) + ( MYCOLDIST - 1 ) * NBX + 1 END IF JB = MIN( MAX( 0, DESCX( N_ ) - J + 1 ), INBX ) IB = MIN( DESCX( M_ ), DESCX( IMB_ ) ) * DO 20 KK = 0, JB-1 DO 10 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. J+KK.GT.JX+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 10 CONTINUE 20 CONTINUE IF( COLREP ) THEN J = J + INBX ELSE J = J + INBX + ( NPCOL - 1 ) * NBX END IF * DO 50 JJ = INBX+1, NQALL, NBX JB = MIN( NQALL-JJ+1, NBX ) * DO 40 KK = 0, JB-1 DO 30 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 30 CONTINUE 40 CONTINUE * IF( COLREP ) THEN J = J + NBX ELSE J = J + NPCOL * NBX END IF * 50 CONTINUE * II = II + IB * END IF * ICURROW = MOD( ICURROW + 1, NPROW ) * DO 110 I = DESCX( IMB_ ) + 1, DESCX( M_ ), MBX IB = MIN( DESCX( M_ ) - I + 1, MBX ) * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * IF( MYCOLDIST.EQ.0 ) THEN J = 1 ELSE J = DESCX( INB_ ) + ( MYCOLDIST - 1 ) * NBX + 1 END IF * JJ = 1 JB = MIN( MAX( 0, DESCX( N_ ) - J + 1 ), INBX ) DO 70 KK = 0, JB-1 DO 60 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 60 CONTINUE 70 CONTINUE IF( COLREP ) THEN J = J + INBX ELSE J = J + INBX + ( NPCOL - 1 ) * NBX END IF * DO 100 JJ = INBX+1, NQALL, NBX JB = MIN( NQALL-JJ+1, NBX ) * DO 90 KK = 0, JB-1 DO 80 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 80 CONTINUE 90 CONTINUE * IF( COLREP ) THEN J = J + NBX ELSE J = J + NPCOL * NBX END IF * 100 CONTINUE * II = II + IB * END IF * ICURROW = MOD( ICURROW + 1, NPROW ) * 110 CONTINUE * ELSE * * sub( X ) is a column vector * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * J = 1 IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCX( IMB_ ) + ( MYROWDIST - 1 ) * MBX + 1 END IF IB = MIN( MAX( 0, DESCX( M_ ) - I + 1 ), IMBX ) JB = MIN( DESCX( N_ ), DESCX( INB_ ) ) * DO 130 KK = 0, JB-1 DO 120 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. I+LL.GT.IX+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 120 CONTINUE 130 CONTINUE IF( ROWREP ) THEN I = I + IMBX ELSE I = I + IMBX + ( NPROW - 1 ) * MBX END IF * DO 160 II = IMBX+1, MPALL, MBX IB = MIN( MPALL-II+1, MBX ) * DO 150 KK = 0, JB-1 DO 140 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 140 CONTINUE 150 CONTINUE * IF( ROWREP ) THEN I = I + MBX ELSE I = I + NPROW * MBX END IF * 160 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * DO 220 J = DESCX( INB_ ) + 1, DESCX( N_ ), NBX JB = MIN( DESCX( N_ ) - J + 1, NBX ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCX( IMB_ ) + ( MYROWDIST - 1 ) * MBX + 1 END IF * II = 1 IB = MIN( MAX( 0, DESCX( M_ ) - I + 1 ), IMBX ) DO 180 KK = 0, JB-1 DO 170 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 170 CONTINUE 180 CONTINUE IF( ROWREP ) THEN I = I + IMBX ELSE I = I + IMBX + ( NPROW - 1 ) * MBX END IF * DO 210 II = IMBX+1, MPALL, MBX IB = MIN( MPALL-II+1, MBX ) * DO 200 KK = 0, JB-1 DO 190 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 190 CONTINUE 200 CONTINUE * IF( ROWREP ) THEN I = I + MBX ELSE I = I + NPROW * MBX END IF * 210 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * 220 CONTINUE * END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PDCHKVOUT * END SUBROUTINE PDCHKMIN( ERRMAX, M, N, A, PA, IA, JA, DESCA, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N DOUBLE PRECISION ERRMAX * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION PA( * ), A( * ) * .. * * Purpose * ======= * * PDCHKMIN checks that the submatrix sub( PA ) remained unchanged. The * local array entries are compared element by element, and their dif- * ference is tested against 0.0 as well as the epsilon machine. Notice * that this difference should be numerically exactly the zero machine, * but because of the possible fluctuation of some of the data we flag- * ged differently a difference less than twice the epsilon machine. The * largest error is also returned. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERRMAX (global output) DOUBLE PRECISION * On exit, ERRMAX specifies the largest absolute element-wise * difference between sub( A ) and sub( PA ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * operand sub( A ). N must be at least zero. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) DOUBLE PRECISION array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ KK, LDA, LDPA, LL, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR, EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PB_INFOG2L, PDERRSET * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if posssible * IF( ( M.EQ.0 ).OR.( N.EQ.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL PB_INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) * II = IIA JJ = JJA LDA = DESCA( M_ ) LDPA = DESCA( LLD_ ) ICURROW = IAROW ICURCOL = IACOL ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * * Handle the first block of column separately * JB = DESCA( INB_ ) - JA + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCA( NB_ ) + 1 ) * DESCA( NB_ ) + JB JB = MIN( JB, N ) JN = JA + JB - 1 * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * DO 40 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 10 K = 0, IB-1 CALL PDERRSET( ERR, ERRMAX, A( IA+K+(JA+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 10 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) * * Loop over remaining block of rows * DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 20 K = 0, IB-1 CALL PDERRSET( ERR, ERRMAX, A( I+K+(JA+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 20 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) 30 CONTINUE * II = IIA ICURROW = IAROW 40 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining column blocks * DO 90 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN DO 80 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 )*DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 50 K = 0, IB-1 CALL PDERRSET( ERR, ERRMAX, A( IA+K+(J+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 50 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) * * Loop over remaining block of rows * DO 70 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 60 K = 0, IB-1 CALL PDERRSET( ERR, ERRMAX, $ A( I+K+(J+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 60 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) 70 CONTINUE * II = IIA ICURROW = IAROW 80 CONTINUE * JJ = JJ + JB END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 90 CONTINUE * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PDCHKMIN * END SUBROUTINE PDCHKMOUT( M, N, A, PA, IA, JA, DESCA, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ), PA( * ) * .. * * Purpose * ======= * * PDCHKMOUT checks that the matrix PA \ sub( PA ) remained unchanged. * The local array entries are compared element by element, and their * difference is tested against 0.0 as well as the epsilon machine. No- * tice that this difference should be numerically exactly the zero ma- * chine, but because of the possible movement of some of the data we * flagged differently a difference less than twice the epsilon machine. * The largest error is reported. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( PA ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( PA ). N must be at least zero. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) DOUBLE PRECISION array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, II, IMBA, J, JB, JJ, KK, $ LDA, LDPA, LL, MPALL, MYCOL, MYROW, MYROWDIST, $ NPCOL, NPROW DOUBLE PRECISION EPS, ERR, ERRMAX * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PDERRSET * .. * .. External Functions .. INTEGER PB_NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( ( DESCA( M_ ).LE.0 ).OR.( DESCA( N_ ).LE.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * MPALL = PB_NUMROC( DESCA( M_ ), 1, DESCA( IMB_ ), DESCA( MB_ ), $ MYROW, DESCA( RSRC_ ), NPROW ) * LDA = DESCA( M_ ) LDPA = DESCA( LLD_ ) * II = 1 JJ = 1 ROWREP = ( DESCA( RSRC_ ).EQ.-1 ) COLREP = ( DESCA( CSRC_ ).EQ.-1 ) ICURCOL = DESCA( CSRC_ ) IF( MYROW.EQ.DESCA( RSRC_ ) .OR. ROWREP ) THEN IMBA = DESCA( IMB_ ) ELSE IMBA = DESCA( MB_ ) END IF IF( ROWREP ) THEN MYROWDIST = 0 ELSE MYROWDIST = MOD( MYROW - DESCA( RSRC_ ) + NPROW, NPROW ) END IF * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * J = 1 IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCA( IMB_ ) + ( MYROWDIST - 1 ) * DESCA( MB_ ) + 1 END IF IB = MIN( MAX( 0, DESCA( M_ ) - I + 1 ), IMBA ) JB = MIN( DESCA( N_ ), DESCA( INB_ ) ) * DO 20 KK = 0, JB-1 DO 10 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 10 CONTINUE 20 CONTINUE IF( ROWREP ) THEN I = I + IMBA ELSE I = I + IMBA + ( NPROW - 1 ) * DESCA( MB_ ) END IF * DO 50 II = IMBA + 1, MPALL, DESCA( MB_ ) IB = MIN( MPALL-II+1, DESCA( MB_ ) ) * DO 40 KK = 0, JB-1 DO 30 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 30 CONTINUE 40 CONTINUE * IF( ROWREP ) THEN I = I + DESCA( MB_ ) ELSE I = I + NPROW * DESCA( MB_ ) END IF * 50 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * DO 110 J = DESCA( INB_ ) + 1, DESCA( N_ ), DESCA( NB_ ) JB = MIN( DESCA( N_ ) - J + 1, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCA( IMB_ ) + ( MYROWDIST - 1 ) * DESCA( MB_ ) + 1 END IF * II = 1 IB = MIN( MAX( 0, DESCA( M_ ) - I + 1 ), IMBA ) DO 70 KK = 0, JB-1 DO 60 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 60 CONTINUE 70 CONTINUE IF( ROWREP ) THEN I = I + IMBA ELSE I = I + IMBA + ( NPROW - 1 ) * DESCA( MB_ ) END IF * DO 100 II = IMBA+1, MPALL, DESCA( MB_ ) IB = MIN( MPALL-II+1, DESCA( MB_ ) ) * DO 90 KK = 0, JB-1 DO 80 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PDERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 80 CONTINUE 90 CONTINUE * IF( ROWREP ) THEN I = I + DESCA( MB_ ) ELSE I = I + NPROW * DESCA( MB_ ) END IF * 100 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * INSERT MODE 110 CONTINUE * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PDCHKMOUT * END SUBROUTINE PDMPRNT( ICTXT, NOUT, M, N, A, LDA, IRPRNT, ICPRNT, $ CMATNM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICPRNT, ICTXT, IRPRNT, LDA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PDMPRNT prints to the standard output an array A of size m by n. Only * the process of coordinates ( IRPRNT, ICPRNT ) is printing. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * M (global input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). The leading m * by n part of this array is printed. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array A to be printed. LDA must be at least MAX( 1, M ). * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the process row coordinate of the * printing process. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the process column coordinate of * the printing process. * * CMATNM (global input) CHARACTER*(*) * On entry, CMATNM specifies the identifier of the matrix to be * printed. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN * WRITE( NOUT, FMT = * ) DO 20 J = 1, N * DO 10 I = 1, M * WRITE( NOUT, FMT = 9999 ) CMATNM, I, J, A( I, J ) * 10 CONTINUE * 20 CONTINUE * END IF * 9999 FORMAT( 1X, A, '(', I6, ',', I6, ')=', D30.18 ) * RETURN * * End of PDMPRNT * END SUBROUTINE PDVPRNT( ICTXT, NOUT, N, X, INCX, IRPRNT, ICPRNT, $ CVECNM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICPRNT, ICTXT, INCX, IRPRNT, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CVECNM DOUBLE PRECISION X( * ) * .. * * Purpose * ======= * * PDVPRNT prints to the standard output an vector x of length n. Only * the process of coordinates ( IRPRNT, ICPRNT ) is printing. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * N (global input) INTEGER * On entry, N specifies the length of the vector X. N must be * at least zero. * * X (global input) DOUBLE PRECISION array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the process row coordinate of the * printing process. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the process column coordinate of * the printing process. * * CVECNM (global input) CHARACTER*(*) * On entry, CVECNM specifies the identifier of the vector to be * printed. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN * WRITE( NOUT, FMT = * ) DO 10 I = 1, 1 + ( N-1 )*INCX, INCX * WRITE( NOUT, FMT = 9999 ) CVECNM, I, X( I ) * 10 CONTINUE * END IF * 9999 FORMAT( 1X, A, '(', I6, ')=', D30.18 ) * RETURN * * End of PDVPRNT * END SUBROUTINE PDMVCH( ICTXT, TRANS, M, N, ALPHA, A, IA, JA, DESCA, $ X, IX, JX, DESCX, INCX, BETA, Y, PY, IY, JY, $ DESCY, INCY, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N DOUBLE PRECISION ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION A( * ), G( * ), PY( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PDMVCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies which matrix-vector product is to * be computed as follows: * If TRANS = 'N', * sub( Y ) = BETA * sub( Y ) + sub( A ) * sub( X ), * otherwise * sub( Y ) = BETA * sub( Y ) + sub( A )' * sub( X ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) DOUBLE PRECISION array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP, TRAN INTEGER I, IB, ICURCOL, ICURROW, IIY, IN, IOFFA, IOFFX, $ IOFFY, IYCOL, IYROW, J, JB, JJY, JN, KK, LDA, $ LDPY, LDX, LDY, ML, MYCOL, MYROW, NL, NPCOL, $ NPROW DOUBLE PRECISION EPS, ERRI, GTMP, TBETA, YTMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * IF( M.EQ.0 .OR. N.EQ.0 ) THEN TBETA = ONE ELSE TBETA = BETA END IF * TRAN = LSAME( TRANS, 'T' ).OR.LSAME( TRANS, 'C' ) IF( TRAN ) THEN ML = N NL = M ELSE ML = M NL = N END IF * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in Y using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * IOFFY = IY + ( JY - 1 ) * LDY DO 30 I = 1, ML YTMP = ZERO GTMP = ZERO IOFFX = IX + ( JX - 1 ) * LDX IF( TRAN )THEN IOFFA = IA + ( JA + I - 2 ) * LDA DO 10 J = 1, NL YTMP = YTMP + A( IOFFA ) * X( IOFFX ) GTMP = GTMP + ABS( A( IOFFA ) * X( IOFFX ) ) IOFFA = IOFFA + 1 IOFFX = IOFFX + INCX 10 CONTINUE ELSE IOFFA = IA + I - 1 + ( JA - 1 ) * LDA DO 20 J = 1, NL YTMP = YTMP + A( IOFFA ) * X( IOFFX ) GTMP = GTMP + ABS( A( IOFFA ) * X( IOFFX ) ) IOFFA = IOFFA + LDA IOFFX = IOFFX + INCX 20 CONTINUE END IF G( I ) = ABS( ALPHA ) * GTMP + ABS( TBETA * Y( IOFFY ) ) Y( IOFFY ) = ALPHA * YTMP + TBETA * Y( IOFFY ) IOFFY = IOFFY + INCY 30 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO INFO = 0 LDPY = DESCY( LLD_ ) IOFFY = IY + ( JY - 1 ) * LDY CALL PB_INFOG2L( IY, JY, DESCY, NPROW, NPCOL, MYROW, MYCOL, IIY, $ JJY, IYROW, IYCOL ) ICURROW = IYROW ICURCOL = IYCOL ROWREP = ( IYROW.EQ.-1 ) COLREP = ( IYCOL.EQ.-1 ) * IF( INCY.EQ.DESCY( M_ ) ) THEN * * sub( Y ) is a row vector * JB = DESCY( INB_ ) - JY + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCY( NB_ ) + 1 ) * DESCY( NB_ ) + JB JB = MIN( JB, ML ) JN = JY + JB - 1 * DO 50 J = JY, JN * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) ) / EPS IF( G( J-JY+1 ).NE.ZERO ) $ ERRI = ERRI / G( J-JY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 JJY = JJY + 1 END IF * IOFFY = IOFFY + INCY * 50 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 70 J = JN+1, JY+ML-1, DESCY( NB_ ) JB = MIN( JY+ML-J, DESCY( NB_ ) ) * DO 60 KK = 0, JB-1 * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) )/EPS IF( G( J+KK-JY+1 ).NE.ZERO ) $ ERRI = ERRI / G( J+KK-JY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 JJY = JJY + 1 END IF * IOFFY = IOFFY + INCY * 60 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 70 CONTINUE * ELSE * * sub( Y ) is a column vector * IB = DESCY( IMB_ ) - IY + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCY( MB_ ) + 1 ) * DESCY( MB_ ) + IB IB = MIN( IB, ML ) IN = IY + IB - 1 * DO 80 I = IY, IN * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) ) / EPS IF( G( I-IY+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIY = IIY + 1 END IF * IOFFY = IOFFY + INCY * 80 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 100 I = IN+1, IY+ML-1, DESCY( MB_ ) IB = MIN( IY+ML-I, DESCY( MB_ ) ) * DO 90 KK = 0, IB-1 * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) )/EPS IF( G( I+KK-IY+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIY = IIY + 1 END IF * IOFFY = IOFFY + INCY * 90 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 100 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) * RETURN * * End of PDMVCH * END SUBROUTINE PDVMCH( ICTXT, UPLO, M, N, ALPHA, X, IX, JX, DESCX, $ INCX, Y, IY, JY, DESCY, INCY, A, PA, IA, JA, $ DESCA, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N DOUBLE PRECISION ALPHA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION A( * ), G( * ), PA( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PDVMCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of the submatrix sub( A ) * is to be referenced as follows: * If UPLO = 'L', only the lower triangular part, * If UPLO = 'U', only the upper triangular part, * else the entire matrix is to be referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) DOUBLE PRECISION array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) DOUBLE PRECISION array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, LOWER, ROWREP, UPPER INTEGER I, IACOL, IAROW, IB, IBEG, ICURROW, IEND, IIA, $ IN, IOFFA, IOFFX, IOFFY, J, JJA, KK, LDA, LDPA, $ LDX, LDY, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ATMP, EPS, ERRI, GTMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in A using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 70 J = 1, N * IOFFY = IY + ( JY - 1 ) * LDY + ( J - 1 ) * INCY * IF( LOWER ) THEN IBEG = J IEND = M DO 10 I = 1, J-1 G( I ) = ZERO 10 CONTINUE ELSE IF( UPPER ) THEN IBEG = 1 IEND = J DO 20 I = J+1, M G( I ) = ZERO 20 CONTINUE ELSE IBEG = 1 IEND = M END IF * DO 30 I = IBEG, IEND * IOFFX = IX + ( JX - 1 ) * LDX + ( I - 1 ) * INCX IOFFA = IA + I - 1 + ( JA + J - 2 ) * LDA ATMP = X( IOFFX ) * Y( IOFFY ) GTMP = ABS( X( IOFFX ) * Y( IOFFY ) ) G( I ) = ABS( ALPHA ) * GTMP + ABS( A( IOFFA ) ) A( IOFFA ) = ALPHA * ATMP + A( IOFFA ) * 30 CONTINUE * * Compute the error ratio for this result. * INFO = 0 ERR = ZERO LDPA = DESCA( LLD_ ) IOFFA = IA + ( JA + J - 2 ) * LDA CALL PB_INFOG2L( IA, JA+J-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * IF( MYCOL.EQ.IACOL .OR. COLREP ) THEN * ICURROW = IAROW IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 * DO 40 I = IA, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA ) - A( IOFFA ) )/EPS IF( G( I-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 40 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * DO 50 KK = 0, IB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA )-A( IOFFA ) )/EPS IF( G( I+KK-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 80 * 70 CONTINUE * 80 CONTINUE * RETURN * * End of PDVMCH * END SUBROUTINE PDVMCH2( ICTXT, UPLO, M, N, ALPHA, X, IX, JX, DESCX, $ INCX, Y, IY, JY, DESCY, INCY, A, PA, IA, $ JA, DESCA, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N DOUBLE PRECISION ALPHA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION A( * ), G( * ), PA( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PDVMCH2 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of the submatrix sub( A ) * is to be referenced as follows: * If UPLO = 'L', only the lower triangular part, * If UPLO = 'U', only the upper triangular part, * else the entire matrix is to be referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) DOUBLE PRECISION array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) DOUBLE PRECISION array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, LOWER, ROWREP, UPPER INTEGER I, IACOL, IAROW, IB, IBEG, ICURROW, IEND, IIA, $ IN, IOFFA, IOFFXI, IOFFXJ, IOFFYI, IOFFYJ, J, $ JJA, KK, LDA, LDPA, LDX, LDY, MYCOL, MYROW, $ NPCOL, NPROW DOUBLE PRECISION EPS, ERRI, GTMP, ATMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in A using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 70 J = 1, N * IOFFXJ = IX + ( JX - 1 ) * LDX + ( J - 1 ) * INCX IOFFYJ = IY + ( JY - 1 ) * LDY + ( J - 1 ) * INCY * IF( LOWER ) THEN IBEG = J IEND = M DO 10 I = 1, J-1 G( I ) = ZERO 10 CONTINUE ELSE IF( UPPER ) THEN IBEG = 1 IEND = J DO 20 I = J+1, M G( I ) = ZERO 20 CONTINUE ELSE IBEG = 1 IEND = M END IF * DO 30 I = IBEG, IEND IOFFA = IA + I - 1 + ( JA + J - 2 ) * LDA IOFFXI = IX + ( JX - 1 ) * LDX + ( I - 1 ) * INCX IOFFYI = IY + ( JY - 1 ) * LDY + ( I - 1 ) * INCY ATMP = X( IOFFXI ) * Y( IOFFYJ ) ATMP = ATMP + Y( IOFFYI ) * X( IOFFXJ ) GTMP = ABS( X( IOFFXI ) * Y( IOFFYJ ) ) GTMP = GTMP + ABS( Y( IOFFYI ) * X( IOFFXJ ) ) G( I ) = ABS( ALPHA ) * GTMP + ABS( A( IOFFA ) ) A( IOFFA ) = ALPHA*ATMP + A( IOFFA ) * 30 CONTINUE * * Compute the error ratio for this result. * INFO = 0 ERR = ZERO LDPA = DESCA( LLD_ ) IOFFA = IA + ( JA + J - 2 ) * LDA CALL PB_INFOG2L( IA, JA+J-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * IF( MYCOL.EQ.IACOL .OR. COLREP ) THEN * ICURROW = IAROW IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 * DO 40 I = IA, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA ) - A( IOFFA ) )/EPS IF( G( I-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 40 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * DO 50 KK = 0, IB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA )-A( IOFFA ) )/EPS IF( G( I+KK-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 80 * 70 CONTINUE * 80 CONTINUE * RETURN * * End of PDVMCH2 * END SUBROUTINE PDMMCH( ICTXT, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, $ JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, $ JC, DESCC, CT, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANSA, TRANSB INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N DOUBLE PRECISION ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) DOUBLE PRECISION A( * ), B( * ), C( * ), CT( * ), G( * ), $ PC( * ) * .. * * Purpose * ======= * * PDMMCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies if the matrix operand A is to be * transposed. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies if the matrix operand B is to be * transposed. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANSA = 'N' (resp. TRANSA <> 'N') in PxGEMM, PxSYRK, * PxSYR2K, PxHERK and PxHER2K. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) DOUBLE PRECISION array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) DOUBLE PRECISION array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) DOUBLE PRECISION array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP, TRANA, TRANB INTEGER I, IBB, ICCOL, ICROW, ICURROW, IIC, IN, IOFFA, $ IOFFB, IOFFC, J, JJC, KK, LDA, LDB, LDC, LDPC, $ MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION EPS, ERRI * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * TRANA = LSAME( TRANSA, 'T' ).OR.LSAME( TRANSA, 'C' ) TRANB = LSAME( TRANSB, 'T' ).OR.LSAME( TRANSB, 'C' ) * LDA = MAX( 1, DESCA( M_ ) ) LDB = MAX( 1, DESCB( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 240 J = 1, N * IOFFC = IC + ( JC + J - 2 ) * LDC DO 10 I = 1, M CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE * IF( .NOT.TRANA .AND. .NOT.TRANB ) THEN DO 30 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 20 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA .AND. .NOT.TRANB ) THEN DO 50 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 40 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 40 CONTINUE 50 CONTINUE ELSE IF( .NOT.TRANA .AND. TRANB ) THEN DO 70 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 60 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 60 CONTINUE 70 CONTINUE ELSE IF( TRANA .AND. TRANB ) THEN DO 90 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 80 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 80 CONTINUE 90 CONTINUE END IF * DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA * C( IOFFC ) G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, M ) IN = IC + IBB - 1 * DO 210 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 210 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 230 I = IN+1, IC+M-1, DESCC( MB_ ) IBB = MIN( IC+M-I, DESCC( MB_ ) ) * DO 220 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 220 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 230 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 250 * 240 CONTINUE * 250 CONTINUE * RETURN * * End of PDMMCH * END SUBROUTINE PDMMCH1( ICTXT, UPLO, TRANS, N, K, ALPHA, A, IA, JA, $ DESCA, BETA, C, PC, IC, JC, DESCC, CT, G, $ ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IC, ICTXT, INFO, JA, JC, K, N DOUBLE PRECISION ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), CT( * ), G( * ), PC( * ) * .. * * Purpose * ======= * * PDMMCH1 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrix A has to be * transposed or not before computing the matrix-matrix product. * * N (global input) INTEGER * On entry, N specifies the order the submatrix operand C. N * must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANS = 'N' (resp. TRANS <> 'N'). K must be at least * zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) DOUBLE PRECISION array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) DOUBLE PRECISION array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, NOTRAN, ROWREP, TRAN, UPPER INTEGER I, IBB, IBEG, ICCOL, ICROW, ICURROW, IEND, IIC, $ IN, IOFFAK, IOFFAN, IOFFC, J, JJC, KK, LDA, $ LDC, LDPC, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION EPS, ERRI * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) * LDA = MAX( 1, DESCA( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 140 J = 1, N * IF( UPPER ) THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF * DO 10 I = 1, N CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE * IF( NOTRAN ) THEN DO 30 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA DO 20 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAK ) * A( IOFFAN ) G( I ) = G( I ) + ABS( A( IOFFAK ) ) * $ ABS( A( IOFFAN ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRAN ) THEN DO 50 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA DO 40 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAK ) * A( IOFFAN ) G( I ) = G( I ) + ABS( A( IOFFAK ) ) * $ ABS( A( IOFFAN ) ) 40 CONTINUE 50 CONTINUE END IF * IOFFC = IC + IBEG - 1 + ( JC + J - 2 ) * LDC * DO 100 I = IBEG, IEND CT( I ) = ALPHA*CT( I ) + BETA * C( IOFFC ) G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 100 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, N ) IN = IC + IBB - 1 * DO 110 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IC+N-1, DESCC( MB_ ) IBB = MIN( IC+N-I, DESCC( MB_ ) ) * DO 120 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 150 * 140 CONTINUE * 150 CONTINUE * RETURN * * End of PDMMCH1 * END SUBROUTINE PDMMCH2( ICTXT, UPLO, TRANS, N, K, ALPHA, A, IA, JA, $ DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, $ JC, DESCC, CT, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, N DOUBLE PRECISION ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) DOUBLE PRECISION A( * ), B( * ), C( * ), CT( * ), G( * ), $ PC( * ) * .. * * Purpose * ======= * * PDMMCH2 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrices A and B have * to be transposed or not before computing the matrix-matrix * product. * * N (global input) INTEGER * On entry, N specifies the order the submatrix operand C. N * must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * and B when TRANS = 'N' (resp. TRANS <> 'N'). K must be at * least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) DOUBLE PRECISION array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) DOUBLE PRECISION array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) DOUBLE PRECISION array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, NOTRAN, ROWREP, TRAN, UPPER INTEGER I, IBB, IBEG, ICCOL, ICROW, ICURROW, IEND, IIC, $ IN, IOFFAK, IOFFAN, IOFFBK, IOFFBN, IOFFC, J, $ JJC, KK, LDA, LDB, LDC, LDPC, MYCOL, MYROW, $ NPCOL, NPROW DOUBLE PRECISION EPS, ERRI * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) * LDA = MAX( 1, DESCA( M_ ) ) LDB = MAX( 1, DESCB( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 140 J = 1, N * IF( UPPER ) THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF * DO 10 I = 1, N CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE * IF( NOTRAN ) THEN DO 30 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA IOFFBK = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 20 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA IOFFBN = IB + I - 1 + ( JB + KK - 2 ) * LDB CT( I ) = CT( I ) + ALPHA * ( $ A( IOFFAN ) * B( IOFFBK ) + $ B( IOFFBN ) * A( IOFFAK ) ) G( I ) = G( I ) + ABS( ALPHA ) * ( $ ABS( A( IOFFAN ) ) * ABS( B( IOFFBK ) ) + $ ABS( B( IOFFBN ) ) * ABS( A( IOFFAK ) ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRAN ) THEN DO 50 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA IOFFBK = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 40 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA IOFFBN = IB + KK - 1 + ( JB + I - 2 ) * LDB CT( I ) = CT( I ) + ALPHA * ( $ A( IOFFAN ) * B( IOFFBK ) + $ B( IOFFBN ) * A( IOFFAK ) ) G( I ) = G( I ) + ABS( ALPHA ) * ( $ ABS( A( IOFFAN ) ) * ABS( B( IOFFBK ) ) + $ ABS( B( IOFFBN ) ) * ABS( A( IOFFAK ) ) ) 40 CONTINUE 50 CONTINUE END IF * IOFFC = IC + IBEG - 1 + ( JC + J - 2 ) * LDC * DO 100 I = IBEG, IEND CT( I ) = CT( I ) + BETA * C( IOFFC ) G( I ) = G( I ) + ABS( BETA )*ABS( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 100 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, N ) IN = IC + IBB - 1 * DO 110 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IC+N-1, DESCC( MB_ ) IBB = MIN( IC+N-I, DESCC( MB_ ) ) * DO 120 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 150 * 140 CONTINUE * 150 CONTINUE * RETURN * * End of PDMMCH2 * END SUBROUTINE PDMMCH3( UPLO, TRANS, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IC, INFO, JA, JC, M, N DOUBLE PRECISION ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION A( * ), C( * ), PC( * ) * .. * * Purpose * ======= * * PDMMCH3 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrix A has to be * transposed or not before computing the matrix-matrix addi- * tion. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) DOUBLE PRECISION array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, LOWER, NOTRAN, ROWREP, UPPER INTEGER I, ICCOL, ICROW, ICTXT, IIC, IOFFA, IOFFC, J, $ JJC, LDA, LDC, LDPC, MYCOL, MYROW, NPCOL, $ NPROW DOUBLE PRECISION ERR0, ERRI, PREC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L, $ PDERRAXPBY * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * PREC = PDLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * Compute expected result in C using data in A and C. This part of * the computation is performed by every process in the grid. * INFO = 0 ERR = ZERO * LDA = MAX( 1, DESCA( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) LDPC = MAX( 1, DESCC( LLD_ ) ) ROWREP = ( DESCC( RSRC_ ).EQ.-1 ) COLREP = ( DESCC( CSRC_ ).EQ.-1 ) * IF( NOTRAN ) THEN * DO 20 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( JA - 1 + J - JC ) * LDA * DO 10 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PDERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PDERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PDERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFA = IOFFA + 1 IOFFC = IOFFC + 1 * 10 CONTINUE * 20 CONTINUE * ELSE * DO 40 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( J - JC ) + ( JA - 1 ) * LDA * DO 30 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PDERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PDERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PDERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFC = IOFFC + 1 IOFFA = IOFFA + LDA * 30 CONTINUE * 40 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) * RETURN * * End of PDMMCH3 * END SUBROUTINE PDERRAXPBY( ERRBND, ALPHA, X, BETA, Y, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA, ERRBND, PREC, X, Y * .. * * Purpose * ======= * * PDERRAXPBY serially computes y := beta*y + alpha * x and returns a * scaled relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * X (global input) DOUBLE PRECISION * On entry, X specifies the scalar x to be scaled. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. * * Y (global input/global output) DOUBLE PRECISION * On entry, Y specifies the scalar y to be added. On exit, Y * contains the resulting scalar y. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION ADDBND, FACT, SUMPOS, SUMNEG, TMP * .. * .. Intrinsic Functions .. * .. * .. Executable Statements .. * SUMPOS = ZERO SUMNEG = ZERO FACT = ONE + TWO * PREC ADDBND = TWO * TWO * TWO * PREC * TMP = ALPHA * X IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP * FACT ELSE SUMNEG = SUMNEG - TMP * FACT END IF * TMP = BETA * Y IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP * FACT ELSE SUMNEG = SUMNEG - TMP * FACT END IF * Y = ( BETA * Y ) + ( ALPHA * X ) * ERRBND = ADDBND * MAX( SUMPOS, SUMNEG ) * RETURN * * End of PDERRAXPBY * END DOUBLE PRECISION FUNCTION PDLAMCH( ICTXT, CMACH ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 CMACH INTEGER ICTXT * .. * * Purpose * ======= * * PDLAMCH determines double precision machine parameters. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * CMACH (global input) CHARACTER*1 * On entry, CMACH specifies the value to be returned by PDLAMCH * as follows: * = 'E' or 'e', PDLAMCH := eps, * = 'S' or 's , PDLAMCH := sfmin, * = 'B' or 'b', PDLAMCH := base, * = 'P' or 'p', PDLAMCH := eps*base, * = 'N' or 'n', PDLAMCH := t, * = 'R' or 'r', PDLAMCH := rnd, * = 'M' or 'm', PDLAMCH := emin, * = 'U' or 'u', PDLAMCH := rmin, * = 'L' or 'l', PDLAMCH := emax, * = 'O' or 'o', PDLAMCH := rmax, * * where * * eps = relative machine precision, * sfmin = safe minimum, such that 1/sfmin does not overflow, * base = base of the machine, * prec = eps*base, * t = number of (base) digits in the mantissa, * rnd = 1.0 when rounding occurs in addition, 0.0 otherwise, * emin = minimum exponent before (gradual) underflow, * rmin = underflow threshold - base**(emin-1), * emax = largest exponent before overflow, * rmax = overflow threshold - (base**emax)*(1-eps). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. CHARACTER*1 TOP INTEGER IDUMM DOUBLE PRECISION TEMP * .. * .. External Subroutines .. EXTERNAL DGAMN2D, DGAMX2D, PB_TOPGET * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH, LSAME * .. * .. Executable Statements .. * TEMP = DLAMCH( CMACH ) IDUMM = 0 * IF( LSAME( CMACH, 'E' ).OR.LSAME( CMACH, 'S' ).OR. $ LSAME( CMACH, 'M' ).OR.LSAME( CMACH, 'U' ) ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) CALL DGAMX2D( ICTXT, 'All', TOP, 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) ELSE IF( LSAME( CMACH, 'L' ).OR.LSAME( CMACH, 'O' ) ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) CALL DGAMN2D( ICTXT, 'All', TOP, 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) END IF * PDLAMCH = TEMP * RETURN * * End of PDLAMCH * END SUBROUTINE PDLASET( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, JA, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLASET initializes an m by n submatrix A(IA:IA+M-1,JA:JA+N-1) deno- * ted by sub( A ) to beta on the diagonal and alpha on the offdiago- * nals. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the part of the submatrix sub( A ) * to be set: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of sub( A ) is not changed; * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha, i.e., the cons- * tant to which the offdiagonal elements are to be set. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta, i.e., the constant * to which the diagonal elements are to be set. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A to be set. On exit, the * leading m by n submatrix sub( A ) is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * and IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT, ISCOLREP, ISROWREP, LOWER, $ UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, JJA, $ JJMAX, JOFFA, JOFFD, LCMT, LCMT00, LDA, LMBLOC, $ LNBLOC, LOW, M1, MB, MBLKD, MBLKS, MBLOC, MP, $ MRCOL, MRROW, MYCOL, MYROW, N1, NB, NBLKD, $ NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, QNB, TMP1, $ UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS, PB_DLASET * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * ISROWREP = ( DESCA2( RSRC_ ).LT.0 ) ISCOLREP = ( DESCA2( CSRC_ ).LT.0 ) LDA = DESCA2( LLD_ ) * UPPER = .NOT.( LSAME( UPLO, 'L' ) ) LOWER = .NOT.( LSAME( UPLO, 'U' ) ) * IF( ( ( LOWER.AND.UPPER ).AND.( ALPHA.EQ.BETA ) ).OR. $ ( ISROWREP .AND. ISCOLREP ) ) THEN IF( ( MP.GT.0 ).AND.( NQ.GT.0 ) ) $ CALL PB_DLASET( UPLO, MP, NQ, 0, ALPHA, BETA, $ A( IIA + ( JJA - 1 ) * LDA ), LDA ) RETURN END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) CALL PB_BINFO( 0, MP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( ISROWREP ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( ISCOLREP ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * M1 = MP N1 = NQ * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_DLASET( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, BETA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_DLASET( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ ALPHA, A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_DLASET( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ ALPHA, A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_DLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_DLASET( UPLO, MBLOC, INBLOC, LCMT, ALPHA, BETA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_DLASET( 'ALL', TMP1, INBLOC, 0, ALPHA, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_DLASET( 'ALL', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_DLASET( 'All', M1, TMP1, 0, ALPHA, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_DLASET( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, BETA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_DLASET( 'All', IMBLOC, TMP1, 0, ALPHA, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_DLASET( 'All', M1, TMP1, 0, ALPHA, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_DLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_DLASET( UPLO, MBLOC, NBLOC, LCMT, ALPHA, BETA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_DLASET( 'All', TMP1, NBLOC, 0, ALPHA, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_DLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * RETURN * * End of PDLASET * END SUBROUTINE PDLASCAL( TYPE, M, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TYPE INTEGER IA, JA, M, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLASCAL scales the m by n submatrix A(IA:IA+M-1,JA:JA+N-1) denoted * by sub( A ) by the scalar alpha. TYPE specifies if sub( A ) is full, * upper triangular, lower triangular or upper Hessenberg. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * On entry, TYPE specifies the type of the input submatrix as * follows: * = 'L' or 'l': sub( A ) is a lower triangular matrix, * = 'U' or 'u': sub( A ) is an upper triangular matrix, * = 'H' or 'h': sub( A ) is an upper Hessenberg matrix, * otherwise sub( A ) is a full matrix. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( A ) are overwritten by * the local entries of the m by n scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 UPLO LOGICAL GODOWN, GOLEFT, LOWER, UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, ITYPE, $ IUPP, JJA, JJMAX, JOFFA, JOFFD, LCMT, LCMT00, $ LDA, LMBLOC, LNBLOC, LOW, M1, MB, MBLKD, MBLKS, $ MBLOC, MP, MRCOL, MRROW, MYCOL, MYROW, N1, NB, $ NBLKD, NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, $ QNB, TMP1, UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS, PB_DLASCAL, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_NUMROC EXTERNAL LSAME, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 UPLO = TYPE UPPER = .FALSE. LOWER = .TRUE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 UPLO = TYPE UPPER = .TRUE. LOWER = .FALSE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 UPLO = 'U' UPPER = .TRUE. LOWER = .FALSE. IOFFD = 1 ELSE ITYPE = 0 UPLO = 'A' UPPER = .TRUE. LOWER = .TRUE. IOFFD = 0 END IF * * Compute local indexes * IF( ITYPE.EQ.0 ) THEN * * Full matrix * CALL PB_INFOG2L( IA, JA, DESCA2, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = PB_NUMROC( M, IA, DESCA2( IMB_ ), DESCA2( MB_ ), MYROW, $ DESCA2( RSRC_ ), NPROW ) NQ = PB_NUMROC( N, JA, DESCA2( INB_ ), DESCA2( NB_ ), MYCOL, $ DESCA2( CSRC_ ), NPCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * LDA = DESCA2( LLD_ ) IOFFA = IIA + ( JJA - 1 ) * LDA * CALL PB_DLASCAL( 'All', MP, NQ, 0, ALPHA, A( IOFFA ), LDA ) * ELSE * * Trapezoidal matrix * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, * LNBLOC, ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) LDA = DESCA2( LLD_ ) * CALL PB_BINFO( IOFFD, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * M1 = MP N1 = NQ IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and * update LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_DLASCAL( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_DLASCAL( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_DLASCAL( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_DLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_DLASCAL( UPLO, MBLOC, INBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_DLASCAL( 'All', TMP1, INBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_DLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_DLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_DLASCAL( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_DLASCAL( 'All', IMBLOC, TMP1, 0, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_DLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_DLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_DLASCAL( UPLO, MBLOC, NBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_DLASCAL( 'All', TMP1, NBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_DLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * END IF * RETURN * * End of PDLASCAL * END SUBROUTINE PDLAGEN( INPLACE, AFORM, DIAG, OFFA, M, N, IA, JA, $ DESCA, IASEED, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE CHARACTER*1 AFORM, DIAG INTEGER IA, IASEED, JA, LDA, M, N, OFFA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PDLAGEN generates (or regenerates) a submatrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the generated submatrix is diago- * nally dominant or not as follows: * DIAG = 'D' : sub( A ) is diagonally dominant, * DIAG = 'N' : sub( A ) is not diagonally dominant. * * OFFA (global input) INTEGER * On entry, OFFA specifies the offdiagonal of the underlying * matrix A(1:DESCA(M_),1:DESCA(N_)) of interest when the subma- * trix is symmetric, Hermitian or diagonally dominant. OFFA = 0 * specifies the main diagonal, OFFA > 0 specifies a subdiago- * nal, and OFFA < 0 specifies a superdiagonal (see further de- * tails). * * M (global input) INTEGER * On entry, M specifies the global number of matrix rows of the * submatrix sub( A ) to be generated. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of matrix columns of * the submatrix sub( A ) to be generated. N must be at least * zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IASEED (global input) INTEGER * On entry, IASEED specifies the seed number to generate the * matrix A. IASEED must be at least zero. * * A (local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). On exit, this array contains the * local entries of the randomly generated submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. When INPLACE is .FALSE., LDA is usually DESCA(LLD_). * This restriction is however not enforced, and this subroutine * requires only that LDA >= MAX( 1, Mp ) where * * Mp = PB_NUMROC( M, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ). * * PB_NUMROC is a ScaLAPACK tool function; MYROW, MYCOL, NPROW * and NPCOL can be determined by calling the BLACS subroutine * BLACS_GRIDINFO. * * Further Details * =============== * * OFFD is tied to the matrix described by DESCA, as opposed to the * piece that is currently (re)generated. This is a global information * independent from the distribution parameters. Below are examples of * the meaning of OFFD for a global 7 by 5 matrix: * * --------------------------------------------------------------------- * OFFD | 0 -1 -2 -3 -4 0 -1 -2 -3 -4 0 -1 -2 -3 -4 * -------|------------------------------------------------------------- * | | OFFD=-1 | OFFD=0 OFFD=2 * | V V * 0 | . d . . . -> d . . . . . . . . . * 1 | . . d . . . d . . . . . . . . * 2 | . . . d . . . d . . -> d . . . . * 3 | . . . . d . . . d . . d . . . * 4 | . . . . . . . . . d . . d . . * 5 | . . . . . . . . . . . . . d . * 6 | . . . . . . . . . . . . . . d * --------------------------------------------------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. LOGICAL DIAGDO, SYMM, HERM, NOTRAN INTEGER CSRC, I, IACOL, IAROW, ICTXT, IIA, ILOCBLK, $ ILOCOFF, ILOW, IMB, IMB1, IMBLOC, IMBVIR, INB, $ INB1, INBLOC, INBVIR, INFO, IOFFDA, ITMP, IUPP, $ IVIR, JJA, JLOCBLK, JLOCOFF, JVIR, LCMT00, $ LMBLOC, LNBLOC, LOW, MAXMN, MB, MBLKS, MP, $ MRCOL, MRROW, MYCDIST, MYCOL, MYRDIST, MYROW, $ NB, NBLKS, NPCOL, NPROW, NQ, NVIR, RSRC, UPP DOUBLE PRECISION ALPHA * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ), IMULADD( 4, JMP_LEN ), $ IRAN( 2 ), JMP( JMP_LEN ), MULADD0( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CHKMAT, PB_DESCTRANS, PB_DLAGEN, PB_INITJMP, $ PB_INITMULADD, PB_JUMP, PB_JUMPIT, PB_LOCINFO, $ PB_SETLOCRAN, PB_SETRAN, PDLADOM, PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data Statements .. DATA ( MULADD0( I ), I = 1, 4 ) / 20077, 16838, $ 12345, 0 / * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Test the input arguments * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1000 + CTXT_ ) ELSE SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) NOTRAN = LSAME( AFORM, 'N' ) DIAGDO = LSAME( DIAG, 'D' ) IF( .NOT.( SYMM.OR.HERM.OR.NOTRAN ) .AND. $ .NOT.( LSAME( AFORM, 'T' ) ) .AND. $ .NOT.( LSAME( AFORM, 'C' ) ) ) THEN INFO = -2 ELSE IF( ( .NOT.DIAGDO ) .AND. $ ( .NOT.LSAME( DIAG, 'N' ) ) ) THEN INFO = -3 END IF CALL PB_CHKMAT( ICTXT, M, 5, N, 6, IA, JA, DESCA2, 10, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PDLAGEN', -INFO ) RETURN END IF * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) IMB = DESCA2( IMB_ ) INB = DESCA2( INB_ ) RSRC = DESCA2( RSRC_ ) CSRC = DESCA2( CSRC_ ) * * Figure out local information about the distributed matrix operand * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * IOFFDA = JA + OFFA - IA CALL PB_BINFO( IOFFDA, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * * Initialize ILOCBLK, ILOCOFF, MYRDIST, JLOCBLK, JLOCOFF, MYCDIST * This values correspond to the square virtual underlying matrix * of size MAX( M_ + MAX( 0, -OFFA ), N_ + MAX( 0, OFFA ) ) used * to set up the random sequence. For practical purposes, the size * of this virtual matrix is upper bounded by M_ + N_ - 1. * ITMP = MAX( 0, -OFFA ) IVIR = IA + ITMP IMBVIR = IMB + ITMP NVIR = DESCA2( M_ ) + ITMP * CALL PB_LOCINFO( IVIR, IMBVIR, MB, MYROW, RSRC, NPROW, ILOCBLK, $ ILOCOFF, MYRDIST ) * ITMP = MAX( 0, OFFA ) JVIR = JA + ITMP INBVIR = INB + ITMP NVIR = MAX( MAX( NVIR, DESCA2( N_ ) + ITMP ), $ DESCA2( M_ ) + DESCA2( N_ ) - 1 ) * CALL PB_LOCINFO( JVIR, INBVIR, NB, MYCOL, CSRC, NPCOL, JLOCBLK, $ JLOCOFF, MYCDIST ) * IF( SYMM .OR. HERM .OR. NOTRAN ) THEN * CALL PB_INITJMP( .TRUE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 1, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_DLAGEN( 'Lower', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( SYMM .OR. HERM .OR. ( .NOT. NOTRAN ) ) THEN * CALL PB_INITJMP( .FALSE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 1, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_DLAGEN( 'Upper', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( DIAGDO ) THEN * MAXMN = MAX( DESCA2( M_ ), DESCA2( N_ ) ) ALPHA = DBLE( MAXMN ) * IF( IOFFDA.GE.0 ) THEN CALL PDLADOM( INPLACE, MIN( MAX( 0, M-IOFFDA ), N ), ALPHA, $ A, MIN( IA+IOFFDA, IA+M-1 ), JA, DESCA ) ELSE CALL PDLADOM( INPLACE, MIN( M, MAX( 0, N+IOFFDA ) ), ALPHA, $ A, IA, MIN( JA-IOFFDA, JA+N-1 ), DESCA ) END IF * END IF * RETURN * * End of PDLAGEN * END SUBROUTINE PDLADOM( INPLACE, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE INTEGER IA, JA, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PDLADOM adds alpha to the diagonal entries of an n by n submatrix * sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * N (global input) INTEGER * On entry, N specifies the global order of the submatrix * sub( A ) to be modified. N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the local entries * of this array corresponding to the main diagonal of sub( A ) * have been updated. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP DOUBLE PRECISION ATMP * .. * .. Local Scalars .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.EQ.0 ) $ RETURN * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) * CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA+LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PDLADOM * END SUBROUTINE PB_PDLAPRNT( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * Purpose * ======= * * PB_PDLAPRNT prints to the standard output a submatrix sub( A ) deno- * ting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and printed by * the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the row index of the printing pro- * cess. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the column index of the printing * process. * * CMATNM (global input) CHARACTER*(*) * On entry, CMATNM is the name of the matrix to be printed. * * NOUT (global input) INTEGER * On entry, NOUT specifies the output unit number. When NOUT is * equal to 6, the submatrix is printed on the screen. * * WORK (local workspace) DOUBLE PRECISION array * On entry, WORK is a work array of dimension at least equal to * MAX( IMB_A, MB_A ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW, PCOL, PROW * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_DESCTRANS, PB_PDLAPRN2 * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * CALL BLACS_GRIDINFO( DESCA2( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( DESCA2( RSRC_ ).GE.0 ) THEN IF( DESCA2( CSRC_ ).GE.0 ) THEN CALL PB_PDLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, ICPRNT, $ CMATNM, NOUT, DESCA2( RSRC_ ), $ DESCA2( CSRC_ ), WORK ) ELSE DO 10 PCOL = 0, NPCOL - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Colum-replicated array -- ' , $ 'copy in process column: ', PCOL CALL PB_PDLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, DESCA2( RSRC_ ), $ PCOL, WORK ) 10 CONTINUE END IF ELSE IF( DESCA2( CSRC_ ).GE.0 ) THEN DO 20 PROW = 0, NPROW - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Row-replicated array -- ' , $ 'copy in process row: ', PROW CALL PB_PDLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, PROW, $ DESCA2( CSRC_ ), WORK ) 20 CONTINUE ELSE DO 40 PROW = 0, NPROW - 1 DO 30 PCOL = 0, NPCOL - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Replicated array -- ' , $ 'copy in process (', PROW, ',', PCOL, ')' CALL PB_PDLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, PROW, PCOL, $ WORK ) 30 CONTINUE 40 CONTINUE END IF END IF * RETURN * * End of PB_PDLAPRNT * END SUBROUTINE PB_PDLAPRN2( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, PROW, PCOL, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT, PCOL, PROW * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) DOUBLE PRECISION A( * ), WORK( * ) * .. * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL AISCOLREP, AISROWREP INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, LDW, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, DGERV2D, $ DGESD2D, PB_INFOG2L * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL PB_INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) II = IIA JJ = JJA IF( DESCA( RSRC_ ).LT.0 ) THEN AISROWREP = .TRUE. IAROW = PROW ICURROW = PROW ELSE AISROWREP = .FALSE. ICURROW = IAROW END IF IF( DESCA( CSRC_ ).LT.0 ) THEN AISCOLREP = .TRUE. IACOL = PCOL ICURCOL = PCOL ELSE AISCOLREP = .FALSE. ICURCOL = IACOL END IF LDA = DESCA( LLD_ ) LDW = MAX( DESCA( IMB_ ), DESCA( MB_ ) ) * * Handle the first block of column separately * JB = DESCA( INB_ ) - JA + 1 IF( JB.LE.0 ) $ JB = ( (-JB) / DESCA( NB_ ) + 1 ) * DESCA( NB_ ) + JB JB = MIN( JB, N ) JN = JA+JB-1 DO 60 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( (-IB) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA+IB-1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, JA+H, WORK( K ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, JA+H, WORK( K ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB IF( .NOT.AISCOLREP ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IB = DESCA( IMB_ )-IA+1 IF( IB.LE.0 ) $ IB = ( (-IB) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA+IB-1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, J+H, A( II+K+(JJ+H-1)*LDA ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, J+H, WORK( K ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, J+H, A( II+K+(JJ+H-1)*LDA ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL DGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, J+H, WORK( K ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB IF( .NOT.AISCOLREP ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * 9999 FORMAT( 1X, A, '(', I6, ',', I6, ')=', D30.18 ) * RETURN * * End of PB_PDLAPRN2 * END SUBROUTINE PB_DFILLPAD( ICTXT, M, N, A, LDA, IPRE, IPOST, CHKVAL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N DOUBLE PRECISION CHKVAL * .. * .. Array Arguments .. DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PB_DFILLPAD surrounds a two dimensional local array with a guard-zone * initialized to the value CHKVAL. The user may later call the routine * PB_DCHEKPAD to discover if the guardzone has been violated. There are * three guardzones. The first is a buffer of size IPRE that is before * the start of the array. The second is the buffer of size IPOST which * is after the end of the array to be padded. Finally, there is a guard * zone inside every column of the array to be padded, in the elements * of A(M+1:LDA, J). * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * M (local input) INTEGER * On entry, M specifies the number of rows in the local array * A. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns in the local ar- * ray A. N must be at least zero. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). On exit, this * array is the padded array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array to be padded. LDA must be at least MAX( 1, M ). * * IPRE (local input) INTEGER * On entry, IPRE specifies the size of the guard zone to put * before the start of the padded array. * * IPOST (local input) INTEGER * On entry, IPOST specifies the size of the guard zone to put * after the end of the padded array. * * CHKVAL (local input) DOUBLE PRECISION * On entry, CHKVAL specifies the value to pad the array with. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE A( I ) = CHKVAL 10 CONTINUE ELSE WRITE( *, FMT = '(A)' ) $ 'WARNING no pre-guardzone in PB_DFILLPAD' END IF * * Put check buffer in back of A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 A( I ) = CHKVAL 20 CONTINUE ELSE WRITE( *, FMT = '(A)' ) $ 'WARNING no post-guardzone in PB_DFILLPAD' END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + ( LDA - M ) - 1 A( I ) = CHKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * RETURN * * End of PB_DFILLPAD * END SUBROUTINE PB_DCHEKPAD( ICTXT, MESS, M, N, A, LDA, IPRE, IPOST, $ CHKVAL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N DOUBLE PRECISION CHKVAL * .. * .. Array Arguments .. CHARACTER*(*) MESS DOUBLE PRECISION A( * ) * .. * * Purpose * ======= * * PB_DCHEKPAD checks that the padding around a local array has not been * overwritten since the call to PB_DFILLPAD. Three types of errors are * reported: * * 1) Overwrite in pre-guardzone. This indicates a memory overwrite has * occurred in the first IPRE elements which form a buffer before the * beginning of A. Therefore, the error message: * 'Overwrite in pre-guardzone: loc( 5) = 18.00000' * tells that the 5th element of the IPRE long buffer has been overwrit- * ten with the value 18, where it should still have the value CHKVAL. * * 2) Overwrite in post-guardzone. This indicates a memory overwrite has * occurred in the last IPOST elements which form a buffer after the end * of A. Error reports are refered from the end of A. Therefore, * 'Overwrite in post-guardzone: loc( 19) = 24.00000' * tells that the 19th element after the end of A was overwritten with * the value 24, where it should still have the value of CHKVAL. * * 3) Overwrite in lda-m gap. Tells you elements between M and LDA were * overwritten. So, * 'Overwrite in lda-m gap: A( 12, 3) = 22.00000' * tells that the element at the 12th row and 3rd column of A was over- * written with the value of 22, where it should still have the value of * CHKVAL. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * MESS (local input) CHARACTER*(*) * On entry, MESS is a ttring containing a user-defined message. * * M (local input) INTEGER * On entry, M specifies the number of rows in the local array * A. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns in the local ar- * ray A. N must be at least zero. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array to be padded. LDA must be at least MAX( 1, M ). * * IPRE (local input) INTEGER * On entry, IPRE specifies the size of the guard zone to put * before the start of the padded array. * * IPOST (local input) INTEGER * On entry, IPOST specifies the size of the guard zone to put * after the end of the padded array. * * CHKVAL (local input) DOUBLE PRECISION * On entry, CHKVAL specifies the value to pad the array with. * * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. CHARACTER*1 TOP INTEGER I, IAM, IDUMM, INFO, J, K, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, PB_TOPGET * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW*NPCOL + MYCOL INFO = -1 * * Check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, ' pre', I, $ A( I ) INFO = IAM END IF 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PB_DCHEKPAD' END IF * * Check buffer after A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, 'post', $ I-J+1, A( I ) INFO = IAM END IF 20 CONTINUE ELSE WRITE( *, FMT = * ) $ 'WARNING no post-guardzone buffer in PB_DCHEKPAD' END IF * * Check all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9997 ) MYROW, MYCOL, MESS, $ I-IPRE-LDA*(J-1), J, A( I ) INFO = IAM END IF 30 CONTINUE K = K + LDA 40 CONTINUE END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, IDUMM, IDUMM, -1, $ 0, 0 ) IF( IAM.EQ.0 .AND. INFO.GE.0 ) THEN WRITE( *, FMT = 9999 ) INFO / NPCOL, MOD( INFO, NPCOL ), MESS END IF * 9999 FORMAT( '{', I5, ',', I5, '}: Memory overwrite in ', A ) 9998 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ A4, '-guardzone: loc(', I3, ') = ', G20.7 ) 9997 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ 'lda-m gap: loc(', I3, ',', I3, ') = ', G20.7 ) * RETURN * * End of PB_DCHEKPAD * END SUBROUTINE PB_DLASET( UPLO, M, N, IOFFD, ALPHA, BETA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PB_DLASET initializes a two-dimensional array A to beta on the diago- * nal specified by IOFFD and alpha on the offdiagonals. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be set as follows: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of A is not changed, * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of A is not changed, * = 'D' or 'd' Only the diagonal of A is set, * Otherwise: All of the array A is set. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the value to which the offdiagonal * array elements are set to. * * BETA (input) DOUBLE PRECISION * On entry, BETA specifies the value to which the diagonal ar- * ray elements are set to. * * A (input/output) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be set, and the strictly lower trape- * zoidal part of A is not referenced; When IUPLO = 'L' or 'l', * the leading m by n part of the array A must contain the * lower trapezoidal part of the matrix as specified by IOFFD to * be set, and the strictly upper trapezoidal part of A is * not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Set the diagonal to BETA and the strictly lower triangular * part of the array to ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = BETA DO 30 I = JTMP + 1, M A( I, J ) = ALPHA 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Set the diagonal to BETA and the strictly upper triangular * part of the array to ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 50 I = 1, JTMP - 1 A( I, J ) = ALPHA 50 CONTINUE A( JTMP, J ) = BETA 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Set the array to BETA on the diagonal. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 90 CONTINUE * ELSE * * Set the array to BETA on the diagonal and ALPHA on the * offdiagonal. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA 100 CONTINUE 110 CONTINUE IF( ALPHA.NE.BETA .AND. IOFFD.LT.M .AND. IOFFD.GT.-N ) THEN DO 120 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 120 CONTINUE END IF * END IF * RETURN * * End of PB_DLASET * END SUBROUTINE PB_DLASCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PB_DLASCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * A (input/output) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA * A( I, J ) 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = ALPHA * A( I, J ) 50 CONTINUE 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA * A( I, J ) 100 CONTINUE 110 CONTINUE * END IF * RETURN * * End of PB_DLASCAL * END SUBROUTINE PB_DLAGEN( UPLO, AFORM, A, LDA, LCMT00, IRAN, MBLKS, $ IMBLOC, MB, LMBLOC, NBLKS, INBLOC, NB, $ LNBLOC, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO, AFORM INTEGER IMBLOC, INBLOC, LCMT00, LDA, LMBLOC, LNBLOC, $ MB, MBLKS, NB, NBLKS * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PB_DLAGEN locally initializes an array A. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the lower (UPLO='L') trape- * zoidal part or the upper (UPLO='U') trapezoidal part is to be * generated when the matrix to be generated is symmetric or * Hermitian. For all the other values of AFORM, the value of * this input argument is ignored. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * A (local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, *). On exit, * this array contains the local entries of the randomly genera- * ted submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. LDA must be at least one. * * LCMT00 (global input) INTEGER * On entry, LCMT00 is the LCM value specifying the off-diagonal * of the underlying matrix of interest. LCMT00=0 specifies the * main diagonal, LCMT00 > 0 specifies a subdiagonal, LCMT00 < 0 * specifies superdiagonals. * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2 containing respec- * tively the 16-lower and 16-higher bits of the encoding of the * entry of the random sequence corresponding locally to the * first local array entry to generate. Usually, this array is * computed by PB_SETLOCRAN. * * MBLKS (local input) INTEGER * On entry, MBLKS specifies the local number of blocks of rows. * MBLKS is at least zero. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * LMBLOC (local input) INTEGER * On entry, LMBLOC specifies the number of rows (size) of the * local lowest blocks. LMBLOC is at least zero. * * NBLKS (local input) INTEGER * On entry, NBLKS specifies the local number of blocks of co- * lumns. NBLKS is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * LNBLOC (local input) INTEGER * On entry, LNBLOC specifies the number of columns (size) of * the local rightmost blocks. LNBLOC is at least zero. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the random matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. INTEGER I, IB, IBLK, II, IK, ITMP, JB, JBLK, JJ, JK, $ JTMP, LCMTC, LCMTR, LOW, MNB, UPP DOUBLE PRECISION DUMMY * .. * .. Local Arrays .. INTEGER IB0( 2 ), IB1( 2 ), IB2( 2 ), IB3( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMPIT * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PB_DRAND EXTERNAL LSAME, PB_DRAND * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * DO 10 I = 1, 2 IB1( I ) = IRAN( I ) IB2( I ) = IRAN( I ) IB3( I ) = IRAN( I ) 10 CONTINUE * IF( LSAME( AFORM, 'N' ) ) THEN * * Generate random matrix * JJ = 1 * DO 50 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * DO 40 JK = JJ, JJ + JB - 1 * II = 1 * DO 30 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * * Blocks are IB by JB * DO 20 IK = II, II + IB - 1 A( IK, JK ) = PB_DRAND( 0 ) 20 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 30 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 40 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 50 CONTINUE * ELSE IF( LSAME( AFORM, 'T' ) .OR. LSAME( AFORM, 'C' ) ) THEN * * Generate the transpose of the matrix that would be normally * generated. * II = 1 * DO 90 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 80 IK = II, II + IB - 1 * JJ = 1 * DO 70 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 60 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_DRAND( 0 ) 60 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 70 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 80 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 90 CONTINUE * ELSE IF( ( LSAME( AFORM, 'S' ) ).OR.( LSAME( AFORM, 'H' ) ) ) THEN * * Generate a symmetric matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 170 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 160 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 150 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 100 IK = II, II + IB - 1 DUMMY = PB_DRAND( 0 ) 100 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 110 IK = II, II + IB - 1 A( IK, JK ) = PB_DRAND( 0 ) 110 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 120 IK = II, ITMP - 1 DUMMY = PB_DRAND( 0 ) 120 CONTINUE * DO 130 IK = ITMP, II + IB - 1 A( IK, JK ) = PB_DRAND( 0 ) 130 CONTINUE * END IF * ELSE * DO 140 IK = II, II + IB - 1 A( IK, JK ) = PB_DRAND( 0 ) 140 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 150 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 160 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 170 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 250 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 240 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 230 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 180 JK = JJ, JJ + JB - 1 DUMMY = PB_DRAND( 0 ) 180 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 190 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_DRAND( 0 ) 190 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 200 JK = JJ, JTMP - 1 DUMMY = PB_DRAND( 0 ) 200 CONTINUE * DO 210 JK = JTMP, JJ + JB - 1 A( IK, JK ) = PB_DRAND( 0 ) 210 CONTINUE * END IF * ELSE * DO 220 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_DRAND( 0 ) 220 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 230 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 240 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 250 CONTINUE * END IF * END IF * RETURN * * End of PB_DLAGEN * END DOUBLE PRECISION FUNCTION PB_DRAND( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_DRAND generates the next number in the random sequence. This func- * tion ensures that this number will be in the interval ( -1.0, 1.0 ). * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0 ) * .. * .. External Functions .. DOUBLE PRECISION PB_DRAN EXTERNAL PB_DRAN * .. * .. Executable Statements .. * PB_DRAND = ONE - TWO * PB_DRAN( IDUMM ) * RETURN * * End of PB_DRAND * END DOUBLE PRECISION FUNCTION PB_DRAN( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_DRAN generates the next number in the random sequence. * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION DIVFAC, POW16 PARAMETER ( DIVFAC = 2.147483648D+9, $ POW16 = 6.5536D+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * PB_DRAN = ( DBLE( IRAND( 1 ) ) + POW16 * DBLE( IRAND( 2 ) ) ) / $ DIVFAC * CALL PB_LMUL( IRAND, IACS, J ) CALL PB_LADD( J, IACS( 3 ), IRAND ) * RETURN * * End of PB_DRAN * END scalapack-2.0.2/PBLAS/TESTING/PSBLAS1TST.dat000644 000766 000024 00000002737 10363532303 020030 0ustar00juliestaff000000 000000 'Level 1 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PSBLAS1TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q 2.0E0 value of ALPHA 4 number of tests problems 14 44 28 7 values of N 36 9 39 53 values of M_X 24 67 3 12 values of N_X 2 5 2 5 values of IMB_X 2 5 2 5 values of INB_X 2 5 2 5 values of MB_X 2 5 2 5 values of NB_X 0 0 0 0 values of RSRC_X 0 0 0 0 values of CSRC_X 5 3 1 1 values of IX 2 6 1 1 values of JX 1 9 1 53 values of INCX 2 6 35 14 values of M_Y 27 55 43 12 values of N_Y 2 5 2 5 values of IMB_Y 2 5 2 5 values of INB_Y 2 5 2 5 values of MB_Y 2 5 2 5 values of NB_Y 0 0 0 0 values of RSRC_Y 0 0 0 0 values of CSRC_Y 1 4 1 1 values of IY 7 6 1 1 values of JY 2 6 35 1 values of INCY PSSWAP T put F for no test in the same column PSSCAL T put F for no test in the same column PSCOPY T put F for no test in the same column PSAXPY T put F for no test in the same column PSDOT T put F for no test in the same column PSNRM2 T put F for no test in the same column PSASUM T put F for no test in the same column PSAMAX T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/psblas1tst.f000644 000766 000024 00000363227 11750130340 020145 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PSSWAP ', 'PSSCAL ', 'PSCOPY ', $ 'PSAXPY ', 'PSDOT ', 'PSNRM2 ', $ 'PSASUM ', 'PSAMAX '/ END BLOCK DATA PROGRAM PSBLA1TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PSBLA1TST is the main testing program for the PBLAS Level 1 routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 44 lines: * 'Level 1 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PSBLAS1TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0E0 value of ALPHA * 2 number of tests problems * 3 4 values of N * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PSSWAP T put F for no test in the same column * PSSCAL T put F for no test in the same column * PSCOPY T put F for no test in the same column * PSAXPY T put F for no test in the same column * PSDOT T put F for no test in the same column * PSNRM2 T put F for no test in the same column * PSASUM T put F for no test in the same column * PSAMAX T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * REALSZ indicates the length in bytes on the given platform * for a single precision real. By default, REALSZ is set to * four. * * MEM REAL array * MEM is an array of dimension TOTMEM / REALSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, REALSZ, TOTMEM, $ MEMSIZ, NSUBS REAL PADVAL, ZERO PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, ZERO = 0.0E+0, $ PADVAL = -9923.0E+0, NSUBS = 8 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE INTEGER CSRCX, CSRCY, I, IAM, ICTXT, IGAP, IMBX, IMBY, $ IMIDX, IMIDY, INBX, INBY, INCX, INCY, IPMATX, $ IPMATY, IPOSTX, IPOSTY, IPREX, IPREY, IPW, IPX, $ IPY, IVERB, IX, IXSEED, IY, IYSEED, J, JX, JY, $ K, LDX, LDY, MBX, MBY, MEMREQD, MPX, MPY, MX, $ MY, MYCOL, MYROW, N, NBX, NBY, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQX, NQY, NTESTS, NX, NY, $ PISCLR, RSRCX, RSRCY, TSKIP, TSTCNT REAL ALPHA, PSCLR, PUSCLR * .. * .. Local Arrays .. CHARACTER*80 OUTFILE LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) INTEGER CSCXVAL( MAXTESTS ), CSCYVAL( MAXTESTS ), $ DESCX( DLEN_ ), DESCXR( DLEN_ ), $ DESCY( DLEN_ ), DESCYR( DLEN_ ), IERR( 4 ), $ IMBXVAL( MAXTESTS ), IMBYVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JXVAL( MAXTESTS ), JYVAL( MAXTESTS ), $ KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MXVAL( MAXTESTS ), $ MYVAL( MAXTESTS ), NBXVAL( MAXTESTS ), $ NBYVAL( MAXTESTS ), NVAL( MAXTESTS ), $ NXVAL( MAXTESTS ), NYVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) REAL MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_DESCSET2, PB_PSLAPRNT, PB_SCHEKPAD, $ PB_SFILLPAD, PSAMAX, PSASUM, PSAXPY, $ PSBLA1TSTINFO, PSBLAS1TSTCHK, PSBLAS1TSTCHKE, $ PSCHKARG1, PSCHKVOUT, PSCOPY, PSDOT, PSLAGEN, $ PSMPRNT, PSNRM2, PSSCAL, PSSWAP, PSVPRNT, $ PVDESCCHK, PVDIMCHK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .FALSE., .TRUE., .TRUE., .TRUE., $ .FALSE., .FALSE., .FALSE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler will abort on errors. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IXSEED = 100 IYSEED = 200 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PSBLA1TSTINFO( OUTFILE, NOUT, NTESTS, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, RSCXVAL, $ CSCXVAL, IXVAL, JXVAL, INCXVAL, MYVAL, $ NYVAL, IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, INCYVAL, $ MAXTESTS, NGRIDS, PVAL, MAXGRIDS, QVAL, $ MAXGRIDS, LTEST, SOF, TEE, IAM, IGAP, IVERB, $ NPROCS, ALPHA, MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PSBLAS1TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * N = NVAL( J ) MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN TSTCNT = TSTCNT + 1 WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY WRITE( NOUT, FMT = 9995 ) END IF * * Check the validity of the input and initialize DESC_ * CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, IGAP, GAPMUL, $ IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, IGAP, GAPMUL, $ IERR( 2 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDX = MAX( 1, MX ) LDY = MAX( 1, MY ) * * Assign pointers into MEM for matrices corresponding to * vectors X and Y. Ex: IPX starts at position MEM( IPREX+1 ). * IPX = IPREX + 1 IPY = IPX + DESCX( LLD_ ) * NQX + IPOSTX + IPREY IPMATX = IPY + DESCY( LLD_ ) * NQY + IPOSTY IPMATY = IPMATX + MX * NX IPW = IPMATY + MY * NY * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * MEMREQD = IPW - 1 + $ MAX( MAX( IMBX, MBX ), MAX( IMBY, MBY ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) MEMREQD*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 1 routines * DO 30 K = 1, NSUBS * * Continue only if this sub has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9989 ) SNAMES( K ) END IF * * Check the validity of the operand sizes * CALL PVDIMCHK( ICTXT, NOUT, N, 'X', IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, N, 'Y', IY, JY, DESCY, INCY, $ IERR( 2 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 GO TO 30 END IF * * Generate distributed matrices X and Y * CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCXR, MX, NX, IMBX, INBX, MBX, NBX, $ -1, -1, ICTXT, MAX( 1, MX ) ) CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCXR, IXSEED, MEM( IPMATX ), $ DESCXR( LLD_ ) ) IF( YCHECK( K ) ) THEN CALL PB_DESCSET2( DESCYR, MY, NY, IMBY, INBY, MBY, $ NBY, -1, -1, ICTXT, MAX( 1, MY ) ) CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCYR, IYSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) END IF * * Pad the guard zones of X, and Y * CALL PB_SFILLPAD( ICTXT, MPX, NQX, MEM( IPX-IPREX ), $ DESCX( LLD_ ), IPREX, IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_SFILLPAD( ICTXT, MPY, NQY, MEM( IPY-IPREY ), $ DESCY( LLD_ ), IPREY, IPOSTY, $ PADVAL ) END IF * * Initialize the check for INPUT only args. * INFO = 0 CALL PSCHKARG1( ICTXT, NOUT, SNAMES( K ), N, ALPHA, IX, $ JX, DESCX, INCX, IY, JY, DESCY, INCY, $ INFO ) * INFO = 0 PSCLR = ZERO PUSCLR = ZERO PISCLR = 0 * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PSLAPRNT( 1, N, MEM( IPX ), IX, JX, DESCX, $ 0, 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) ELSE CALL PB_PSLAPRNT( N, 1, MEM( IPX ), IX, JX, DESCX, $ 0, 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) END IF IF( YCHECK( K ) ) THEN IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PSLAPRNT( 1, N, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) ELSE CALL PB_PSLAPRNT( N, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) END IF END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PSLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, 0, $ 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) IF( YCHECK( K ) ) $ CALL PB_PSLAPRNT( MY, NY, MEM( IPY ), 1, 1, DESCY, $ 0, 0, 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) END IF * * Call the PBLAS routine * IF( K.EQ.1 ) THEN * * Test PSSWAP * CALL PSSWAP( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.2 ) THEN * * Test PSSCAL * PSCLR = ALPHA CALL PSSCAL( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.3 ) THEN * * Test PSCOPY * CALL PSCOPY( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.4 ) THEN * * Test PSAXPY * PSCLR = ALPHA CALL PSAXPY( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.5 ) THEN * * Test PSDOT * CALL PSDOT( N, PSCLR, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.6 ) THEN * * Test PSNRM2 * CALL PSNRM2( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.7 ) THEN * * Test PSASUM * CALL PSASUM( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.8 ) THEN * CALL PSAMAX( N, PSCLR, PISCLR, MEM( IPX ), IX, JX, $ DESCX, INCX ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9978 ) INFO GO TO 30 END IF * * Check the computations * CALL PSBLAS1TSTCHK( ICTXT, NOUT, K, N, PSCLR, PUSCLR, $ PISCLR, MEM( IPMATX ), MEM( IPX ), $ IX, JX, DESCX, INCX, MEM( IPMATY ), $ MEM( IPY ), IY, JY, DESCY, INCY, $ INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 END IF * * Check padding * CALL PB_SCHEKPAD( ICTXT, SNAMES( K ), MPX, NQX, $ MEM( IPX-IPREX ), DESCX( LLD_ ), $ IPREX, IPOSTX, PADVAL ) IF( YCHECK( K ) ) THEN CALL PB_SCHEKPAD( ICTXT, SNAMES( K ), MPY, NQY, $ MEM( IPY-IPREY ), DESCY( LLD_ ), $ IPREY, IPOSTY, PADVAL ) END IF * * Check input-only scalar arguments * INFO = 1 CALL PSCHKARG1( ICTXT, NOUT, SNAMES( K ), N, ALPHA, IX, $ JX, DESCX, INCX, IY, JY, DESCY, INCY, $ INFO ) * * Check input-only array arguments * CALL PSCHKVOUT( N, MEM( IPMATX ), MEM( IPX ), IX, JX, $ DESCX, INCX, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) 'PARALLEL_X', SNAMES( K ) END IF * IF( YCHECK( K ) ) THEN CALL PSCHKVOUT( N, MEM( IPMATY ), MEM( IPY ), IY, JY, $ DESCY, INCY, IERR( 4 ) ) IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) 'PARALLEL_Y', $ SNAMES( K ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE. 0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) SNAMES( K ) KFAIL( K ) = KFAIL( K ) + 1 ERRFLG = .TRUE. ELSE IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) SNAMES( K ) KPASS( K ) = KPASS( K ) + 1 END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 3 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PSMPRNT( ICTXT, NOUT, MX, NX, MEM( IPMATX ), $ LDX, 0, 0, 'SERIAL_X' ) CALL PB_PSLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, $ 0, 0, 'PARALLEL_X', NOUT, $ MEM( IPMATX ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( N.GT.0 ) $ CALL PSVPRNT( ICTXT, NOUT, N, $ MEM( IPMATX+IX-1+(JX-1)*LDX ), $ INCX, 0, 0, 'SERIAL_X' ) IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PSLAPRNT( 1, N, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PSLAPRNT( N, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) END IF END IF IF( YCHECK( K ) ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PSMPRNT( ICTXT, NOUT, MY, NY, $ MEM( IPMATY ), LDY, 0, 0, $ 'SERIAL_Y' ) CALL PB_PSLAPRNT( MY, NY, MEM( IPY ), 1, 1, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( N.GT.0 ) $ CALL PSVPRNT( ICTXT, NOUT, N, $ MEM( IPMATY+IY-1+(JY-1)*LDY ), $ INCY, 0, 0, 'SERIAL_Y' ) IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PSLAPRNT( 1, N, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PSLAPRNT( N, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) END IF END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9984 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, '---------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' N IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9993 FORMAT( 2X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I5,1X,I5,1X,I5,1X,I5,1X, $ I5,1X,I5,1X,I6 ) 9992 FORMAT( 2X, ' N IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9991 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9990 FORMAT( 'Not enough memory. Need: ', I12 ) 9989 FORMAT( 2X, ' Tested Subroutine: ', A ) 9988 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9987 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9986 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9985 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9984 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9983 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9982 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9981 FORMAT( 2X, 'Testing Summary') 9980 FORMAT( 2X, 'End of Tests.' ) 9979 FORMAT( 2X, 'Tests started.' ) 9978 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PSBLA1TST * END SUBROUTINE PSBLA1TSTINFO( SUMMRY, NOUT, NMAT, NVAL, MXVAL, $ NXVAL, IMBXVAL, MBXVAL, INBXVAL, $ NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, $ CSCYVAL, IYVAL, JYVAL, INCYVAL, $ LDVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, LTEST, SOF, TEE, IAM, IGAP, $ IVERB, NPROCS, ALPHA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, $ NGRIDS, NMAT, NOUT, NPROCS REAL ALPHA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY LOGICAL LTEST( * ) INTEGER CSCXVAL( LDVAL ), CSCYVAL( LDVAL ), $ IMBXVAL( LDVAL ), IMBYVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JXVAL( LDVAL ), $ JYVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCXVAL( LDVAL ), RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PSBLA1TSTINFO get the needed startup information for testing various * Level 1 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DESCX(:), IX, JX, INCX, DESCY(:), * IY, JY and INCY. This is also the maximum number of test * cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 1 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) REAL * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 2, 2*NGRIDS+23*NMAT+NSUBS+4 ) with NSUBS equal to 8. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J REAL EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PSBLAS1TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 100 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 100 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA * READ( NIN, FMT = * ) ALPHA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 100 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 100 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, WORK, 2 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 70 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 70 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 1 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real single precision '// $ 'Level 1 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9982 ) ALPHA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 80 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 80 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, WORK, 2, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) * I = 2*NGRIDS + 23*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 90 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 90 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 100 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : ', G16.6 ) * * End of PSBLA1TSTINFO * END SUBROUTINE PSBLAS1TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PSBLAS1TSTCHKE tests the error exits of the Level 1 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 8 (NSUBS). * If LTEST( 1 ) is .TRUE., PSSWAP will be tested; * If LTEST( 2 ) is .TRUE., PSSCAL will be tested; * If LTEST( 3 ) is .TRUE., PSCOPY will be tested; * If LTEST( 4 ) is .TRUE., PSAXPY will be tested; * If LTEST( 5 ) is .TRUE., PSDOT will be tested; * If LTEST( 6 ) is .TRUE., PSNRM2 will be tested; * If LTEST( 7 ) is .TRUE., PSASUM will be tested; * If LTEST( 8 ) is .TRUE., PSAMAX will be tested. * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PSAMAX, PSASUM, PSAXPY, PSCOPY, $ PSDIMEE, PSDOT, PSNRM2, PSSCAL, PSSWAP, $ PSVECEE * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/11, 12, 11, 13, 13, 15, 15, 14/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PSSWAP * I = 1 IF( LTEST( I ) ) THEN CALL PSDIMEE( ICTXT, NOUT, PSSWAP, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSSWAP, SCODE( I ), SNAMES( I ) ) END IF * * Test PSSCAL * I = I + 1 IF( LTEST( I ) ) THEN CALL PSDIMEE( ICTXT, NOUT, PSSCAL, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSSCAL, SCODE( I ), SNAMES( I ) ) END IF * * Test PSCOPY * I = I + 1 IF( LTEST( I ) ) THEN CALL PSDIMEE( ICTXT, NOUT, PSCOPY, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSCOPY, SCODE( I ), SNAMES( I ) ) END IF * * Test PSAXPY * I = I + 1 IF( LTEST( I ) ) THEN CALL PSDIMEE( ICTXT, NOUT, PSAXPY, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSAXPY, SCODE( I ), SNAMES( I ) ) END IF * * Test PSDOT * I = I + 1 IF( LTEST( I ) ) THEN CALL PSDIMEE( ICTXT, NOUT, PSDOT, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSDOT, SCODE( I ), SNAMES( I ) ) END IF * * Test PSNRM2 * I = I + 1 IF( LTEST( I ) ) THEN CALL PSDIMEE( ICTXT, NOUT, PSNRM2, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSNRM2, SCODE( I ), SNAMES( I ) ) END IF * * Test PSASUM * I = I + 1 IF( LTEST( I ) ) THEN CALL PSDIMEE( ICTXT, NOUT, PSASUM, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSASUM, SCODE( I ), SNAMES( I ) ) END IF * * Test PSAMAX * I = I + 1 IF( LTEST( I ) ) THEN CALL PSDIMEE( ICTXT, NOUT, PSAMAX, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSAMAX, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PSBLAS1TSTCHKE * END SUBROUTINE PSCHKARG1( ICTXT, NOUT, SNAME, N, ALPHA, IX, JX, $ DESCX, INCX, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, INCY, INFO, IX, IY, JX, JY, N, $ NOUT REAL ALPHA * .. * .. Array Arguments .. CHARACTER*(*) SNAME INTEGER DESCX( * ), DESCY( * ) * .. * * Purpose * ======= * * PSCHKARG1 checks the input-only arguments of the Level 1 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * N (global input) INTEGER * On entry, N specifies the length of the subvector operands. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, INCXREF, INCYREF, IXREF, IYREF, JXREF, $ JYREF, MYCOL, MYROW, NPCOL, NPROW, NREF REAL ALPHAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCXREF( DLEN_ ), DESCYREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * NREF = N IXREF = IX JXREF = JX DO 10 I = 1, DLEN_ DESCXREF( I ) = DESCX( I ) 10 CONTINUE INCXREF = INCX IYREF = IY JYREF = JY DO 20 I = 1, DLEN_ DESCYREF( I ) = DESCY( I ) 20 CONTINUE INCYREF = INCY ALPHAREF = ALPHA * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( IX.NE.IXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IX' ELSE IF( JX.NE.JXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JX' ELSE IF( DESCX( DTYPE_ ).NE.DESCXREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( DTYPE_ )' ELSE IF( DESCX( M_ ).NE.DESCXREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( M_ )' ELSE IF( DESCX( N_ ).NE.DESCXREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( N_ )' ELSE IF( DESCX( IMB_ ).NE.DESCXREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( IMB_ )' ELSE IF( DESCX( INB_ ).NE.DESCXREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( INB_ )' ELSE IF( DESCX( MB_ ).NE.DESCXREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( MB_ )' ELSE IF( DESCX( NB_ ).NE.DESCXREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( NB_ )' ELSE IF( DESCX( RSRC_ ).NE.DESCXREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( RSRC_ )' ELSE IF( DESCX( CSRC_ ).NE.DESCXREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CSRC_ )' ELSE IF( DESCX( CTXT_ ).NE.DESCXREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CTXT_ )' ELSE IF( DESCX( LLD_ ).NE.DESCXREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( LLD_ )' ELSE IF( INCX.NE.INCXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCX' ELSE IF( IY.NE.IYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IY' ELSE IF( JY.NE.JYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JY' ELSE IF( DESCY( DTYPE_ ).NE.DESCYREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( DTYPE_ )' ELSE IF( DESCY( M_ ).NE.DESCYREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( M_ )' ELSE IF( DESCY( N_ ).NE.DESCYREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( N_ )' ELSE IF( DESCY( IMB_ ).NE.DESCYREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( IMB_ )' ELSE IF( DESCY( INB_ ).NE.DESCYREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( INB_ )' ELSE IF( DESCY( MB_ ).NE.DESCYREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( MB_ )' ELSE IF( DESCY( NB_ ).NE.DESCYREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( NB_ )' ELSE IF( DESCY( RSRC_ ).NE.DESCYREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( RSRC_ )' ELSE IF( DESCY( CSRC_ ).NE.DESCYREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CSRC_ )' ELSE IF( DESCY( CTXT_ ).NE.DESCYREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CTXT_ )' ELSE IF( DESCY( LLD_ ).NE.DESCYREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( LLD_ )' ELSE IF( INCY.NE.INCYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCY' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.GT.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PSCHKARG1 * END LOGICAL FUNCTION PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) * .. * * Purpose * ======= * * PISINSCOPE returns .TRUE. if the calling process is in the scope of * sub( X ) = X( IX+(JX-1)*DESCX(M_)+(i-1)*INCX ) and .FALSE. if it is * not. This routine is used to determine which processes should check * the answer returned by some Level 1 PBLAS routines. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * N (global input) INTEGER * The length of the subvector sub( X ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER IIX, IXCOL, IXROW, JJX, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_INFOG2L * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN * * This is the special case, find process owner of IX, JX, and * only this process is the scope. * PISINSCOPE = ( ( IXROW.EQ.MYROW .OR. ROWREP ) .AND. $ ( IXCOL.EQ.MYCOL .OR. COLREP ) ) * ELSE * IF( INCX.EQ.DESCX( M_ ) ) THEN * * row vector * PISINSCOPE = ( MYROW.EQ.IXROW .OR. ROWREP ) * ELSE * * column vector * PISINSCOPE = ( MYCOL.EQ.IXCOL .OR. COLREP ) * END IF * END IF * RETURN * * End of PISINSCOPE * END SUBROUTINE PSBLAS1TSTCHK( ICTXT, NOUT, NROUT, N, PSCLR, PUSCLR, $ PISCLR, X, PX, IX, JX, DESCX, INCX, Y, $ PY, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, INCY, INFO, IX, IY, JX, JY, N, $ NOUT, NROUT, PISCLR REAL PSCLR, PUSCLR * .. * .. Array Arguments .. INTEGER DESCX( * ), DESCY( * ) REAL PX( * ), PY( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PSBLAS1TSTCHK performs the computational tests of the Level 1 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PSSWAP will be tested; * else if NROUT = 2, PSSCAL will be tested; * else if NROUT = 3, PSCOPY will be tested; * else if NROUT = 4, PSAXPY will be tested; * else if NROUT = 5, PSDOT will be tested; * else if NROUT = 6, PSNRM2 will be tested; * else if NROUT = 7, PSASUM will be tested; * else if NROUT = 8, PSAMAX will be tested. * * N (global input) INTEGER * On entry, N specifies the length of the subvector operands. * * PSCLR (global input) REAL * On entry, depending on the value of NROUT, PSCLR specifies * the scalar ALPHA, or the output scalar returned by the PBLAS, * i.e., the dot product, the 2-norm, the absolute sum or the * value of AMAX. * * PUSCLR (global input) REAL * On entry, PUSCLR specifies the real part of the scalar ALPHA * used by the real scaling, the 2-norm, or the absolute sum * routines. PUSCLR is not used in the real versions of this * routine. * * PISCLR (global input) REAL * On entry, PISCLR specifies the value of the global index re- * turned by PSAMAX, otherwise PISCLR is not used. * * X (local input/local output) REAL array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) REAL array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) REAL array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on X has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on Y has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL COLREP, INXSCOPE, INYSCOPE, ROWREP INTEGER I, IB, ICURCOL, ICURROW, IDUMM, IIX, IIY, IN, $ IOFFX, IOFFY, ISCLR, IXCOL, IXROW, IYCOL, $ IYROW, J, JB, JJX, JJY, JN, KK, LDX, LDY, $ MYCOL, MYROW, NPCOL, NPROW REAL ERR, ERRMAX, PREC, SCLR, USCLR * .. * .. Local Arrays .. INTEGER IERR( 6 ) CHARACTER*5 ARGIN1, ARGIN2, ARGOUT1, ARGOUT2 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, PB_INFOG2L, PSCHKVIN, $ PSERRASUM, PSERRAXPY, PSERRDOT, PSERRNRM2, $ PSERRSCAL, SCOPY, SSWAP * .. * .. External Functions .. LOGICAL PISINSCOPE INTEGER ISAMAX REAL PSLAMCH EXTERNAL ISAMAX, PISINSCOPE, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * ARGIN1 = ' ' ARGIN2 = ' ' ARGOUT1 = ' ' ARGOUT2 = ' ' DO 10 I = 1, 6 IERR( I ) = 0 10 CONTINUE * PREC = PSLAMCH( ICTXT, 'precision' ) * IF( NROUT.EQ.1 ) THEN * * Test PSSWAP * IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL SSWAP( N, X( IOFFX ), INCX, Y( IOFFY ), INCY ) CALL PSCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PSCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) * ELSE IF( NROUT.EQ.2 ) THEN * * Test PSSCAL * LDX = DESCX( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ICURROW = IXROW ICURCOL = IXCOL ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * DO 20 J = JX, JN * CALL PSERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 20 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 40 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * DO 30 KK = 0, JB-1 * CALL PSERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 30 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 40 CONTINUE * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * DO 50 I = IX, IN * CALL PSERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 70 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * DO 60 KK = 0, IB-1 * CALL PSERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX 60 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 70 CONTINUE * END IF * ELSE IF( NROUT.EQ.3 ) THEN * * Test PSCOPY * IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL SCOPY( N, X( IOFFX ), INCX, Y( IOFFY ), INCY ) CALL PSCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PSCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) * ELSE IF( NROUT.EQ.4 ) THEN * * Test PSAXPY * CALL PSCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) LDY = DESCY( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PB_INFOG2L( IY, JY, DESCY, NPROW, NPCOL, MYROW, MYCOL, $ IIY, JJY, IYROW, IYCOL ) ICURROW = IYROW ICURCOL = IYCOL ROWREP = ( IYROW.EQ.-1 ) COLREP = ( IYCOL.EQ.-1 ) * IF( INCY.EQ.DESCY( M_ ) ) THEN * * sub( Y ) is a row vector * JB = DESCY( INB_ ) - JY + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCY( NB_ ) + 1 ) * DESCY( NB_ ) + JB JB = MIN( JB, N ) JN = JY + JB - 1 * DO 140 J = JY, JN * CALL PSERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF JJY = JJY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 140 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 160 J = JN+1, JY+N-1, DESCY( NB_ ) JB = MIN( JY+N-J, DESCY( NB_ ) ) * DO 150 KK = 0, JB-1 * CALL PSERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF JJY = JJY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 150 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 160 CONTINUE * ELSE * * sub( Y ) is a column vector * IB = DESCY( IMB_ ) - IY + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCY( MB_ ) + 1 ) * DESCY( MB_ ) + IB IB = MIN( IB, N ) IN = IY + IB - 1 * DO 170 I = IY, IN * CALL PSERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF IIY = IIY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 170 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 190 I = IN+1, IY+N-1, DESCY( MB_ ) IB = MIN( IY+N-I, DESCY( MB_ ) ) * DO 180 KK = 0, IB-1 * CALL PSERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF IIY = IIY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 180 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 190 CONTINUE * END IF * ELSE IF( NROUT.EQ.5 ) THEN * * Test PSDOT * CALL PSCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PSCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PSERRDOT( ERR, N, SCLR, X( IOFFX ), INCX, Y( IOFFY ), $ INCY, PREC ) INXSCOPE = PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) INYSCOPE = PISINSCOPE( ICTXT, N, IY, JY, DESCY, INCY ) IF( INXSCOPE.OR.INYSCOPE ) THEN IF( ABS( PSCLR - SCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'DOT' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF ELSE SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'DOT' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.6 ) THEN * * Test PSNRM2 * CALL PSCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PSERRNRM2( ERR, N, USCLR, X( IOFFX ), INCX, PREC ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN IF( ABS( PUSCLR - USCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'NRM2' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) USCLR, PUSCLR END IF END IF ELSE USCLR = ZERO IF( PUSCLR.NE.USCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'NRM2' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) USCLR, PUSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.7 ) THEN * * Test PSASUM * CALL PSCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PSERRASUM( ERR, N, USCLR, X( IOFFX ), INCX, PREC ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN IF( ABS( PUSCLR - USCLR ) .GT. ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'ASUM' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) USCLR, PUSCLR END IF END IF ELSE USCLR = ZERO IF( PUSCLR.NE.USCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'ASUM' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) USCLR, PUSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.8 ) THEN * * Test PSAMAX * CALL PSCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN ISCLR = ISAMAX( N, X( IOFFX ), INCX ) IF( N.LT.1 ) THEN SCLR = ZERO ELSE IF( ( INCX.EQ.1 ).AND.( DESCX( M_ ).EQ.1 ).AND. $ ( N.EQ.1 ) ) THEN ISCLR = JX SCLR = X( IOFFX ) ELSE IF( INCX.EQ.DESCX( M_ ) ) THEN ISCLR = JX + ISCLR - 1 SCLR = X( IX + ( ISCLR - 1 ) * DESCX( M_ ) ) ELSE ISCLR = IX + ISCLR - 1 SCLR = X( ISCLR + ( JX - 1 ) * DESCX( M_ ) ) END IF * IF( PSCLR.NE.SCLR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'AMAX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF * IF( PISCLR.NE.ISCLR ) THEN IERR( 5 ) = 1 WRITE( ARGIN2, FMT = '(A)' ) 'INDX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN2 WRITE( NOUT, FMT = 9995 ) ISCLR, PISCLR END IF END IF ELSE ISCLR = 0 SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'AMAX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF IF( PISCLR.NE.ISCLR ) THEN IERR( 6 ) = 1 WRITE( ARGOUT2, FMT = '(A)' ) 'INDX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT2 WRITE( NOUT, FMT = 9995 ) ISCLR, PISCLR END IF END IF END IF * END IF * * Find IERR across all processes * CALL IGAMX2D( ICTXT, 'All', ' ', 6, 1, IERR, 6, IDUMM, IDUMM, -1, $ -1, 0 ) * * Encode the errors found in INFO * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'X' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Y' END IF * IF( IERR( 3 ).NE.0 ) $ INFO = INFO + 4 * IF( IERR( 4 ).NE.0 ) $ INFO = INFO + 8 * IF( IERR( 5 ).NE.0 ) $ INFO = INFO + 16 * IF( IERR( 6 ).NE.0 ) $ INFO = INFO + 32 * 9999 FORMAT( 2X, ' ***** ERROR: Vector operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** ERROR: Output scalar result ', A, $ ' in scope is incorrect.' ) 9997 FORMAT( 2X, ' ***** ERROR: Output scalar result ', A, $ ' out of scope is incorrect.' ) 9996 FORMAT( 2X, ' ***** Expected value is: ', E16.8, /2X, $ ' Obtained value is: ', E16.8 ) 9995 FORMAT( 2X, ' ***** Expected value is: ', I6, /2X, $ ' Obtained value is: ', I6 ) * RETURN * * End of PSBLAS1TSTCHK * END SUBROUTINE PSERRDOT( ERRBND, N, SCLR, X, INCX, Y, INCY, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N REAL ERRBND, PREC, SCLR * .. * .. Array Arguments .. REAL X( * ), Y( * ) * .. * * Purpose * ======= * * PSERRDOT serially computes the dot product X**T * Y and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If dot1 = SCLR and dot2 are two different computed results, and dot1 * is being assumed to be correct, we require * * abs( dot1 - dot2 ) <= ERRBND = ERRFACT * abs( dot1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operands. * * SCLR (global output) REAL * On exit, SCLR specifies the dot product of the two vectors * X and Y. * * X (global input) REAL array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (global input) REAL array * On entry, Y is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremen- * ted array Y must contain the vector y. * * INCY (global input) INTEGER. * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IX, IY REAL ADDBND, FACT, SUMNEG, SUMPOS, TMP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * IX = 1 IY = 1 SCLR = ZERO SUMPOS = ZERO SUMNEG = ZERO FACT = TWO * ( ONE + PREC ) ADDBND = TWO * TWO * TWO * PREC * DO 10 I = 1, N TMP = X( IX ) * Y( IY ) SCLR = SCLR + TMP IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP * FACT ELSE SUMNEG = SUMNEG - TMP * FACT END IF IX = IX + INCX IY = IY + INCY 10 CONTINUE * ERRBND = ADDBND * MAX( SUMPOS, SUMNEG ) * RETURN * * End of PSERRDOT * END SUBROUTINE PSERRNRM2( ERRBND, N, USCLR, X, INCX, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N REAL ERRBND, PREC, USCLR * .. * .. Array Arguments .. REAL X( * ) * .. * * Purpose * ======= * * PSERRNRM2 serially computes the 2-norm the vector X and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If norm1 = SCLR and norm2 are two different computed results, and * norm1 being assumed to be correct, we require * * abs( norm1 - norm2 ) <= ERRBND = ERRFACT * abs( norm1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operand. * * USCLR (global output) REAL * On exit, USCLR specifies the 2-norm of the vector X. * * X (global input) REAL array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER IX REAL ABSXI, ADDBND, FACT, SCALE, SSQ, SUMSCA, SUMSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * USCLR = ZERO SUMSSQ = ONE SUMSCA = ZERO ADDBND = TWO * TWO * TWO * PREC FACT = ONE + TWO * ( ( ONE + PREC )**3 - ONE ) * SCALE = ZERO SSQ = ONE DO 10 IX = 1, 1 + ( N - 1 )*INCX, INCX IF( X( IX ).NE.ZERO ) THEN ABSXI = ABS( X( IX ) ) IF( SCALE.LT.ABSXI )THEN SUMSSQ = ONE + ( SSQ*( SCALE/ABSXI )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = ONE + SSQ*( SCALE/ABSXI )**2 SUMSCA = ABSXI SCALE = ABSXI ELSE SUMSSQ = SSQ + ( ( ABSXI/SCALE )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = SSQ + ( ABSXI/SCALE )**2 END IF END IF 10 CONTINUE * USCLR = SCALE * SQRT( SSQ ) * * Error on square root * ERRBND = SQRT( SUMSSQ ) * ( ONE + TWO * ( 1.00001E+0 * PREC ) ) * ERRBND = ( SUMSCA * ERRBND ) - USCLR * RETURN * * End of PSERRNRM2 * END SUBROUTINE PSERRASUM( ERRBND, N, USCLR, X, INCX, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N REAL ERRBND, PREC, USCLR * .. * .. Array Arguments .. REAL X( * ) * .. * * Purpose * ======= * * PSERRASUM serially computes the sum of absolute values of the vector * X and returns a scaled relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies a scaled relative acceptable error * bound. In this case the error bound is just the absolute sum * multiplied by a constant proportional to the machine preci- * sion. * * N (global input) INTEGER * On entry, N specifies the length of the vector operand. * * USCLR (global output) REAL * On exit, USCLR specifies the sum of absolute values of the * vector X. * * X (global input) REAL array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL TWO, ZERO PARAMETER ( TWO = 2.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER IX REAL ADDBND * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * IX = 1 USCLR = ZERO ADDBND = TWO * TWO * TWO * PREC * DO 10 IX = 1, 1 + ( N - 1 )*INCX, INCX USCLR = USCLR + ABS( X( IX ) ) 10 CONTINUE * ERRBND = ADDBND * USCLR * RETURN * * End of PSERRASUM * END SUBROUTINE PSERRSCAL( ERRBND, PSCLR, X, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL ERRBND, PREC, PSCLR, X * .. * * Purpose * ======= * * PSERRSCAL serially computes the product PSCLR * X and returns a sca- * led relative acceptable error bound on the result. * * Notes * ===== * * If s1 = PSCLR*X and s2 are two different computed results, and s1 is * being assumed to be correct, we require * * abs( s1 - s2 ) <= ERRBND = ERRFACT * abs( s1 ), * * where ERRFACT is computed as two times the machine precision. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PSCLR (global input) REAL * On entry, PSCLR specifies the scale factor. * * X (global input/global output) REAL * On entry, X specifies the scalar to be scaled. On exit, X is * the scaled entry. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL TWO PARAMETER ( TWO = 2.0E+0 ) * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * X = PSCLR * X * ERRBND = ( TWO * PREC ) * ABS( X ) * RETURN * * End of PSERRSCAL * END SUBROUTINE PSERRAXPY( ERRBND, PSCLR, X, Y, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL ERRBND, PREC, PSCLR, X, Y * .. * * Purpose * ======= * * PSERRAXPY serially computes Y := Y + PSCLR * X and returns a scaled * relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PSCLR (global input) REAL * On entry, PSCLR specifies the scale factor. * * X (global input) REAL * On entry, X specifies the scalar to be scaled. * * Y (global input/global output) REAL * On entry, Y specifies the scalar to be added. On exit, Y con- * tains the resulting scalar. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. REAL ADDBND, FACT, SUMPOS, SUMNEG, TMP * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * SUMPOS = ZERO SUMNEG = ZERO FACT = ONE + TWO * PREC ADDBND = TWO * TWO * TWO * PREC * TMP = PSCLR * X IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP * FACT ELSE SUMNEG = SUMNEG - TMP * FACT END IF * TMP = Y IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP ELSE SUMNEG = SUMNEG - TMP END IF * Y = Y + ( PSCLR * X ) * ERRBND = ADDBND * MAX( SUMPOS, SUMNEG ) * RETURN * * End of PSERRAXPY * END scalapack-2.0.2/PBLAS/TESTING/PSBLAS2TST.dat000644 000766 000024 00000006354 10363532303 020030 0ustar00juliestaff000000 000000 'Level 2 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PSBLAS2TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 16.0 threshold value of test ratio 10 value of the logical computational blocksize NB 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q 2.0E0 value of ALPHA 3.0E0 value of BETA 4 number of tests problems 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' values of UPLO 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANS 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' values of DIAG 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 values of M 8 10 12 14 16 18 20 22 24 9 11 13 15 17 19 21 values of N 31 33 35 36 34 32 30 31 33 35 36 34 32 30 38 40 values of M_A 31 33 35 36 34 32 30 31 33 35 36 34 32 30 38 40 values of N_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_A 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IA 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JA 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 values of M_X 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 values of N_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_X 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IX 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JX 30 1 30 1 30 1 30 1 30 1 30 1 30 1 30 1 values of INCX 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 values of M_Y 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 values of N_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_Y 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IY 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JY 40 1 40 1 40 1 40 1 40 1 40 1 40 1 40 1 values of INCY PSGEMV T put F for no test in the same column PSSYMV T put F for no test in the same column PSTRMV T put F for no test in the same column PSTRSV T put F for no test in the same column PSGER T put F for no test in the same column PSSYR T put F for no test in the same column PSSYR2 T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/psblas2tst.f000644 000766 000024 00000346072 11750130340 020145 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 7) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PSGEMV ', 'PSSYMV ', 'PSTRMV ', $ 'PSTRSV ', 'PSGER ', 'PSSYR ', $ 'PSSYR2 '/ END BLOCK DATA PROGRAM PSBLA2TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PSBLA2TST is the main testing program for the PBLAS Level 2 routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 60 lines: * 'Level 2 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PSBLAS2TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 16.0 threshold value of test ratio * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0E0 value of ALPHA * 1.0E0 value of BETA * 2 number of tests problems * 'U' 'L' values of UPLO * 'N' 'T' values of TRANS * 'N' 'U' values of DIAG * 3 4 values of M * 3 4 values of N * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PSGEMV T put F for no test in the same column * PSSYMV T put F for no test in the same column * PSTRMV T put F for no test in the same column * PSTRSV T put F for no test in the same column * PSGER T put F for no test in the same column * PSSYR T put F for no test in the same column * PSSYR2 T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * REALSZ indicates the length in bytes on the given platform * for a single precision real. By default, REALSZ is set to * four. * * MEM REAL array * MEM is an array of dimension TOTMEM / REALSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, REALSZ, TOTMEM, $ MEMSIZ, NSUBS REAL ONE, PADVAL, ZERO, ROGUE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, ZERO = 0.0E+0, $ ONE = 1.0E+0, PADVAL = -9923.0E+0, $ NSUBS = 7, ROGUE = -1.0E+10 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE CHARACTER*1 AFORM, DIAG, DIAGDO, TRANS, UPLO INTEGER CSRCA, CSRCX, CSRCY, I, IA, IAM, IASEED, ICTXT, $ IGAP, IMBA, IMBX, IMBY, IMIDA, IMIDX, IMIDY, $ INBA, INBX, INBY, INCX, INCY, IPA, IPG, IPMATA, $ IPMATX, IPMATY, IPOSTA, IPOSTX, IPOSTY, IPREA, $ IPREX, IPREY, IPX, IPY, IVERB, IX, IXSEED, IY, $ IYSEED, J, JA, JX, JY, K, LDA, LDX, LDY, M, MA, $ MBA, MBX, MBY, MEMREQD, MPA, MPX, MPY, MX, MY, $ MYCOL, MYROW, N, NA, NBA, NBX, NBY, NCOLA, $ NGRIDS, NLX, NLY, NOUT, NPCOL, NPROCS, NPROW, $ NQA, NQX, NQY, NROWA, NTESTS, NX, NY, OFFD, $ RSRCA, RSRCX, RSRCY, TSKIP, TSTCNT REAL ALPHA, BETA, SCALE, THRESH * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), TRANVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCXVAL( MAXTESTS ), $ CSCYVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCAR( DLEN_ ), DESCX( DLEN_ ), $ DESCXR( DLEN_ ), DESCY( DLEN_ ), $ DESCYR( DLEN_ ), IAVAL( MAXTESTS ), IERR( 6 ), $ IMBAVAL( MAXTESTS ), IMBXVAL( MAXTESTS ), $ IMBYVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JXVAL( MAXTESTS ), $ JYVAL( MAXTESTS ) INTEGER KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), MAVAL( MAXTESTS ), $ MBAVAL( MAXTESTS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NAVAL( MAXTESTS ), NBAVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCAVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) REAL MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_DESCSET2, PB_PSLAPRNT, PB_SCHEKPAD, $ PB_SFILLPAD, PB_SLASCAL, PB_SLASET, PMDESCCHK, $ PMDIMCHK, PSBLA2TSTINFO, PSBLAS2TSTCHK, $ PSBLAS2TSTCHKE, PSCHKARG2, PSCHKVOUT, PSGEMV, $ PSGER, PSLAGEN, PSLASCAL, PSLASET, PSMPRNT, $ PSSYMV, PSSYR, PSSYR2, PSTRMV, PSTRSV, PSVPRNT, $ PVDESCCHK, PVDIMCHK * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .TRUE., .FALSE., .FALSE., $ .TRUE., .FALSE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IASEED = 100 IXSEED = 200 IYSEED = 300 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PSBLA2TSTINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, $ IAVAL, JAVAL, MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, CSCYVAL, $ IYVAL, JYVAL, INCYVAL, MAXTESTS, NGRIDS, $ PVAL, MAXGRIDS, QVAL, MAXGRIDS, NBLOG, LTEST, $ SOF, TEE, IAM, IGAP, IVERB, NPROCS, THRESH, $ ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PSBLAS2TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) TRANS = TRANVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) INBA = INBAVAL( J ) MBA = MBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) INBX = INBXVAL( J ) MBX = MBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) * MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) INBY = INBYVAL( J ) MBY = MBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN TSTCNT = TSTCNT + 1 WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, UPLO, TRANS, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY * WRITE( NOUT, FMT = 9995 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANS' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) TRANS WRITE( NOUT, FMT = 9997 ) 'DIAG' TSKIP = TSKIP + 1 GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, IGAP, GAPMUL, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, IGAP, GAPMUL, $ IERR( 2 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, IGAP, GAPMUL, $ IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDA = MAX( 1, MA ) LDX = MAX( 1, MX ) LDY = MAX( 1, MY ) * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPX = IPA + DESCA( LLD_ )*NQA + IPOSTA + IPREX IPY = IPX + DESCX( LLD_ )*NQX + IPOSTX + IPREY IPMATA = IPY + DESCY( LLD_ )*NQY + IPOSTY IPMATX = IPMATA + MA*NA IPMATY = IPMATX + MX*NX IPG = IPMATY + MAX( MX*NX, MY*NY ) * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * MEMREQD = IPG + MAX( M, N ) - 1 + $ MAX( MAX( IMBA, MBA ), $ MAX( MAX( IMBX, MBX ), $ MAX( IMBY, MBY ) ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) MEMREQD*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 2 routines * DO 30 K = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) SNAMES( K ) END IF * * Define the size of the operands * IF( K.EQ.1 ) THEN NROWA = M NCOLA = N IF( LSAME( TRANS, 'N' ) ) THEN NLX = N NLY = M ELSE NLX = M NLY = N END IF ELSE IF( K.EQ.5 ) THEN NROWA = M NCOLA = N NLX = M NLY = N ELSE NROWA = N NCOLA = N NLX = N NLY = N END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLX, 'X', IX, JX, DESCX, $ INCX, IERR( 2 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLY, 'Y', IY, JY, DESCY, $ INCY, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 GO TO 30 END IF * * Generate distributed matrices A, X and Y * IF( K.EQ.2 .OR. K.EQ.6 .OR. K.EQ.7 ) THEN AFORM = 'S' DIAGDO = 'N' OFFD = IA - JA ELSE IF( ( K.EQ.4 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN AFORM = 'N' DIAGDO = 'D' OFFD = IA - JA ELSE AFORM = 'N' DIAGDO = 'N' OFFD = 0 END IF * CALL PSLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCAR, MA, NA, IMBA, INBA, MBA, NBA, $ -1, -1, ICTXT, MAX( 1, MA ) ) CALL PSLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCAR, IASEED, MEM( IPMATA ), $ DESCAR( LLD_ ) ) CALL PB_DESCSET2( DESCXR, MX, NX, IMBX, INBX, MBX, NBX, $ -1, -1, ICTXT, MAX( 1, MX ) ) CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCXR, IXSEED, MEM( IPMATX ), $ DESCXR( LLD_ ) ) IF( YCHECK( K ) ) THEN * CALL PB_DESCSET2( DESCYR, MY, NY, IMBY, INBY, MBY, $ NBY, -1, -1, ICTXT, MAX( 1, MY ) ) CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCYR, IYSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) * ELSE * * If Y is not needed, generate a copy of X instead * CALL PB_DESCSET2( DESCYR, MX, NX, IMBX, INBX, MBX, $ NBX, -1, -1, ICTXT, MAX( 1, MX ) ) CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, $ 1, 1, DESCYR, IXSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) * END IF * * Zero non referenced part of the matrices A * IF( ( K.EQ.2 .OR. K.EQ.6 .OR. K.EQ.7 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN * * The distributed matrix A is symmetric * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of A. * CALL PSLASET( 'Upper', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA, JA+1, DESCA ) IF( K.NE.2 ) THEN CALL PB_SLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of A. * CALL PSLASET( 'Lower', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA+1, JA, DESCA ) IF( K.NE.2 ) THEN CALL PB_SLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * END IF * ELSE IF( K.EQ.3 .OR. K.EQ.4 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix A is lower triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PSLASET( 'Upper', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA, $ JA+1, DESCA ) CALL PB_SLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA-1+JA*LDA ), $ LDA ) END IF * ELSE * CALL PSLASET( 'Upper', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_SLASET( 'Upper', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( K.EQ.4 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / REAL( MAX( NROWA, NCOLA ) ) CALL PSLASCAL( 'Lower', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA+1, JA, $ DESCA ) CALL PB_SLASCAL( 'Lower', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix A is upper triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PSLASET( 'Lower', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA+1, $ JA, DESCA ) CALL PB_SLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * ELSE * CALL PSLASET( 'Lower', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_SLASET( 'Lower', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( K.EQ.4 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / REAL( MAX( NROWA, NCOLA ) ) CALL PSLASCAL( 'Upper', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA, JA+1, $ DESCA ) CALL PB_SLASCAL( 'Upper', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * END IF * END IF * END IF * * Pad the guard zones of A, X and Y * CALL PB_SFILLPAD( ICTXT, MPA, NQA, MEM( IPA-IPREA ), $ DESCA( LLD_ ), IPREA, IPOSTA, PADVAL ) * CALL PB_SFILLPAD( ICTXT, MPX, NQX, MEM( IPX-IPREX ), $ DESCX( LLD_ ), IPREX, IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_SFILLPAD( ICTXT, MPY, NQY, MEM( IPY-IPREY ), $ DESCY( LLD_ ), IPREY, IPOSTY, $ PADVAL ) END IF * * Initialize the check for INPUT-only arguments. * INFO = 0 CALL PSCHKARG2( ICTXT, NOUT, SNAMES( K ), UPLO, TRANS, $ DIAG, M, N, ALPHA, IA, JA, DESCA, IX, $ JX, DESCX, INCX, BETA, IY, JY, DESCY, $ INCY, INFO ) * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN CALL PB_PSLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_INITIAL_A', $ NOUT, MEM( IPG ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PSLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, 0, $ 0, 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPG ) ) END IF * IF( IVERB.EQ.2 ) THEN IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PSLAPRNT( 1, NLX, MEM( IPX ), IX, JX, $ DESCX, 0, 0, $ 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) ELSE CALL PB_PSLAPRNT( NLX, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, $ 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PSLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, 0, $ 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) END IF * IF( YCHECK( K ) ) THEN IF( IVERB.EQ.2 ) THEN IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PSLAPRNT( 1, NLY, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) ELSE CALL PB_PSLAPRNT( NLY, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PSLAPRNT( MY, NY, MEM( IPY ), 1, 1, DESCY, $ 0, 0, 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) END IF END IF * * Call the Level 2 PBLAS routine * INFO = 0 IF( K.EQ.1 ) THEN * * Test PSGEMV * CALL PSGEMV( TRANS, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.2 ) THEN * * Test PSSYMV * CALL PSSYMV( UPLO, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.3 ) THEN * * Test PSTRMV * CALL PSTRMV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) * ELSE IF( K.EQ.4 ) THEN * * Test PSTRSV * CALL PSTRSV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) * ELSE IF( K.EQ.5 ) THEN * * Test PSGER * CALL PSGER( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.6 ) THEN * * Test PSSYR * CALL PSSYR( UPLO, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.7 ) THEN * * Test PSSYR2 * CALL PSSYR2( UPLO, N, ALPHA, MEM( IPX ), IX, JX, $ DESCX, INCX, MEM( IPY ), IY, JY, DESCY, $ INCY, MEM( IPA ), IA, JA, DESCA ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9974 ) INFO GO TO 30 END IF * * Check padding * CALL PB_SCHEKPAD( ICTXT, SNAMES( K ), MPA, NQA, $ MEM( IPA-IPREA ), DESCA( LLD_ ), IPREA, $ IPOSTA, PADVAL ) * CALL PB_SCHEKPAD( ICTXT, SNAMES( K ), MPX, NQX, $ MEM( IPX-IPREX ), DESCX( LLD_ ), IPREX, $ IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_SCHEKPAD( ICTXT, SNAMES( K ), MPY, NQY, $ MEM( IPY-IPREY ), DESCY( LLD_ ), $ IPREY, IPOSTY, PADVAL ) END IF * * Check the computations * CALL PSBLAS2TSTCHK( ICTXT, NOUT, K, UPLO, TRANS, DIAG, M, $ N, ALPHA, MEM( IPMATA ), MEM( IPA ), $ IA, JA, DESCA, MEM( IPMATX ), $ MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPMATY ), MEM( IPY ), IY, $ JY, DESCY, INCY, THRESH, ROGUE, $ MEM( IPG ), INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 ) THEN IERR( 3 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 IERR( 3 ) = 1 END IF * * Check input-only scalar arguments * INFO = 1 CALL PSCHKARG2( ICTXT, NOUT, SNAMES( K ), UPLO, TRANS, $ DIAG, M, N, ALPHA, IA, JA, DESCA, IX, $ JX, DESCX, INCX, BETA, IY, JY, DESCY, $ INCY, INFO ) * * Check input-only array arguments * CALL PSCHKMOUT( NROWA, NCOLA, MEM( IPMATA ), MEM( IPA ), $ IA, JA, DESCA, IERR( 4 ) ) CALL PSCHKVOUT( NLX, MEM( IPMATX ), MEM( IPX ), IX, JX, $ DESCX, INCX, IERR( 5 ) ) * IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_A', $ SNAMES( K ) END IF * IF( IERR( 5 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_X', $ SNAMES( K ) END IF * IF( YCHECK( K ) ) THEN CALL PSCHKVOUT( NLY, MEM( IPMATY ), MEM( IPY ), IY, $ JY, DESCY, INCY, IERR( 6 ) ) IF( IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_Y', $ SNAMES( K ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE.0 .OR. IERR( 5 ).NE.0 .OR. $ IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) SNAMES( K ) KFAIL( K ) = KFAIL( K ) + 1 ERRFLG = .TRUE. ELSE IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) SNAMES( K ) KPASS( K ) = KPASS( K ) + 1 END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PSMPRNT( ICTXT, NOUT, MA, NA, MEM( IPMATA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PSLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( ( NROWA.GT.0 ).AND.( NCOLA.GT.0 ) ) $ CALL PSMPRNT( ICTXT, NOUT, NROWA, NCOLA, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PSLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_A', $ NOUT, MEM( IPMATA ) ) END IF IF( IERR( 5 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PSMPRNT( ICTXT, NOUT, MX, NX, MEM( IPMATX ), $ LDX, 0, 0, 'SERIAL_X' ) CALL PB_PSLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, $ 0, 0, 'PARALLEL_X', NOUT, $ MEM( IPMATX ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( NLX.GT.0 ) $ CALL PSVPRNT( ICTXT, NOUT, NLX, $ MEM( IPMATX+IX-1+(JX-1)*LDX ), $ INCX, 0, 0, 'SERIAL_X' ) IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PSLAPRNT( 1, NLX, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PSLAPRNT( NLX, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) END IF END IF IF( YCHECK( K ) ) THEN IF( IERR( 6 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PSMPRNT( ICTXT, NOUT, MY, NY, $ MEM( IPMATY ), LDY, 0, 0, $ 'SERIAL_Y' ) CALL PB_PSLAPRNT( MY, NY, MEM( IPY ), 1, 1, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE IF( IERR( 3 ).NE.0 ) THEN IF( NLY.GT.0 ) $ CALL PSVPRNT( ICTXT, NOUT, NLY, $ MEM( IPMATY+IY-1+(JY-1)*LDY ), $ INCY, 0, 0, 'SERIAL_Y' ) IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PSLAPRNT( 1, NLY, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PSLAPRNT( NLY, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) END IF END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9981 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9980 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' M N UPLO TRANS DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,9X,A1,11X,A1,10X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9989 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5,1X,I6 ) 9988 FORMAT( 2X, ' IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9987 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9986 FORMAT( 'Not enough memory. Need: ', I12 ) 9985 FORMAT( 2X, ' Tested Subroutine: ', A ) 9984 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9983 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9982 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9981 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9980 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9979 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9978 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9977 FORMAT( 2X, 'Testing Summary') 9976 FORMAT( 2X, 'End of Tests.' ) 9975 FORMAT( 2X, 'Tests started.' ) 9974 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PSBLA2TST * END SUBROUTINE PSBLA2TSTINFO( SUMMRY, NOUT, NMAT, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, $ IMBAVAL, MBAVAL, INBAVAL, NBAVAL, $ RSCAVAL, CSCAVAL, IAVAL, JAVAL, $ MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, $ IXVAL, JXVAL, INCXVAL, MYVAL, NYVAL, $ IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, $ INCYVAL, LDVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, NBLOG, LTEST, SOF, TEE, $ IAM, IGAP, IVERB, NPROCS, THRESH, ALPHA, $ BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, NBLOG, $ NGRIDS, NMAT, NOUT, NPROCS REAL ALPHA, BETA, THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), TRANVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCXVAL( LDVAL ), $ CSCYVAL( LDVAL ), IAVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBXVAL( LDVAL ), $ IMBYVAL( LDVAL ), INBAVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JAVAL( LDVAL ), $ JXVAL( LDVAL ), JYVAL( LDVAL ), MAVAL( LDVAL ), $ MBAVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NAVAL( LDVAL ), $ NBAVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCXVAL( LDVAL ), $ RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PSBLA2TSTINFO get the needed startup information for testing various * Level 2 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * TRANVAL (global output) CHARACTER array * On entry, TRANVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANS to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, TRANS, UPLO, M, N, DESCA(:), * IA, JA, DESCX(:), IX, JX, INCX, DESCY(:), IY, JY and INCY. * This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least seven. On * exit, if LTEST( i ) is .TRUE., the i-th Level 2 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * THRESH (global output) REAL * On exit, THRESH specifies the threshhold value for the test * ratio. * * ALPHA (global output) REAL * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) REAL * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+37*NMAT+NSUBS+4 ) with NSUBS equal to 7. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 7 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J REAL EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D *ype real dble cplx zplx * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PSBLAS2TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Read the threshold value for test ratio * READ( NIN, FMT = * ) THRESH IF( THRESH.LT.0.0 ) $ THRESH = 16.0 * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRANVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( TRANVAL( J ) ) WORK( I+2 ) = ICHAR( UPLOVAL( J ) ) I = I + 3 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 2 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real single precision '// $ 'Level 2 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9979 ) NBLOG WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9980 ) THRESH WRITE( NOUT, FMT = 9982 ) ALPHA WRITE( NOUT, FMT = 9981 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 37*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) TRANVAL( J ) = CHAR( WORK( I+1 ) ) UPLOVAL( J ) = CHAR( WORK( I+2 ) ) I = I + 3 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : ', G16.6 ) 9981 FORMAT( 2X, 'Beta : ', G16.6 ) 9980 FORMAT( 2X, 'Threshold value : ', G16.6 ) 9979 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PSBLA2TSTINFO * END SUBROUTINE PSBLAS2TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PSBLAS2TSTCHKE tests the error exits of the Level 2 PBLAS. * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 7 (NSUBS). * If LTEST( 1 ) is .TRUE., PSGEMV will be tested; * If LTEST( 2 ) is .TRUE., PSSYMV will be tested; * If LTEST( 3 ) is .TRUE., PSTRMV will be tested; * If LTEST( 4 ) is .TRUE., PSTRSV will be tested; * If LTEST( 5 ) is .TRUE., PSGER will be tested; * If LTEST( 6 ) is .TRUE., PSSYR will be tested; * If LTEST( 7 ) is .TRUE., PSSYR2 will be tested; * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 7 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PSDIMEE, PSGEMV, PSGER, $ PSMATEE, PSOPTEE, PSSYMV, PSSYR, PSSYR2, $ PSTRMV, PSTRSV, PSVECEE * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/21, 22, 23, 23, 24, 25, 27/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PSGEMV * I = 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSGEMV, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSGEMV, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSGEMV, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSGEMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PSSYMV * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSSYMV, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSSYMV, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSSYMV, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSSYMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PSTRMV * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSTRMV, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSTRMV, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSTRMV, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSTRMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PSTRSV * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSTRSV, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSTRSV, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSTRSV, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSTRSV, SCODE( I ), SNAMES( I ) ) END IF * * Test PSGER * I = I + 1 IF( LTEST( I ) ) THEN CALL PSDIMEE( ICTXT, NOUT, PSGER, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSGER, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSGER, SCODE( I ), SNAMES( I ) ) END IF * * Test PSSYR * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSSYR, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSSYR, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSSYR, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSSYR, SCODE( I ), SNAMES( I ) ) END IF * * Test PSSYR2 * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSSYR2, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSSYR2, SCODE( I ), SNAMES( I ) ) CALL PSVECEE( ICTXT, NOUT, PSSYR2, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSSYR2, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PSBLAS2TSTCHKE * END SUBROUTINE PSCHKARG2( ICTXT, NOUT, SNAME, UPLO, TRANS, DIAG, M, $ N, ALPHA, IA, JA, DESCA, IX, JX, DESCX, $ INCX, BETA, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N, NOUT REAL ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*(*) SNAME INTEGER DESCA( * ), DESCX( * ), DESCY( * ) * .. * * Purpose * ======= * * PSCHKARG2 checks the input-only arguments of the Level 2 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the UPLO option in the Level 2 PBLAS * operation. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the TRANS option in the Level 2 * PBLAS operation. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies the DIAG option in the Level 2 PBLAS * operation. * * M (global input) INTEGER * On entry, M specifies the dimension of the submatrix ope- * rands. * * N (global input) INTEGER * On entry, N specifies the dimension of the submatrix ope- * rands. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 DIAGREF, TRANSREF, UPLOREF INTEGER I, IAREF, INCXREF, INCYREF, IXREF, IYREF, $ JAREF, JXREF, JYREF, MREF, MYCOL, MYROW, NPCOL, $ NPROW, NREF REAL ALPHAREF, BETAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCAREF( DLEN_ ), DESCXREF( DLEN_ ), $ DESCYREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * DIAGREF = DIAG TRANSREF = TRANS UPLOREF = UPLO MREF = M NREF = N ALPHAREF = ALPHA IAREF = IA JAREF = JA DO 10 I = 1, DLEN_ DESCAREF( I ) = DESCA( I ) 10 CONTINUE IXREF = IX JXREF = JX DO 20 I = 1, DLEN_ DESCXREF( I ) = DESCX( I ) 20 CONTINUE INCXREF = INCX BETAREF = BETA IYREF = IY JYREF = JY DO 30 I = 1, DLEN_ DESCYREF( I ) = DESCY( I ) 30 CONTINUE INCYREF = INCY * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( .NOT. LSAME( DIAG, DIAGREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DIAG' ELSE IF( .NOT. LSAME( TRANS, TRANSREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANS' ELSE IF( .NOT. LSAME( UPLO, UPLOREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'UPLO' ELSE IF( M.NE.MREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'M' ELSE IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE IF( IA.NE.IAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IA' ELSE IF( JA.NE.JAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JA' ELSE IF( DESCA( DTYPE_ ).NE.DESCAREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( DTYPE_ )' ELSE IF( DESCA( M_ ).NE.DESCAREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( M_ )' ELSE IF( DESCA( N_ ).NE.DESCAREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( N_ )' ELSE IF( DESCA( IMB_ ).NE.DESCAREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( IMB_ )' ELSE IF( DESCA( INB_ ).NE.DESCAREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( INB_ )' ELSE IF( DESCA( MB_ ).NE.DESCAREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( MB_ )' ELSE IF( DESCA( NB_ ).NE.DESCAREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( NB_ )' ELSE IF( DESCA( RSRC_ ).NE.DESCAREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( RSRC_ )' ELSE IF( DESCA( CSRC_ ).NE.DESCAREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CSRC_ )' ELSE IF( DESCA( CTXT_ ).NE.DESCAREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CTXT_ )' ELSE IF( DESCA( LLD_ ).NE.DESCAREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( LLD_ )' ELSE IF( IX.NE.IXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IX' ELSE IF( JX.NE.JXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JX' ELSE IF( DESCX( DTYPE_ ).NE.DESCXREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( DTYPE_ )' ELSE IF( DESCX( M_ ).NE.DESCXREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( M_ )' ELSE IF( DESCX( N_ ).NE.DESCXREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( N_ )' ELSE IF( DESCX( IMB_ ).NE.DESCXREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( IMB_ )' ELSE IF( DESCX( INB_ ).NE.DESCXREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( INB_ )' ELSE IF( DESCX( MB_ ).NE.DESCXREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( MB_ )' ELSE IF( DESCX( NB_ ).NE.DESCXREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( NB_ )' ELSE IF( DESCX( RSRC_ ).NE.DESCXREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( RSRC_ )' ELSE IF( DESCX( CSRC_ ).NE.DESCXREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CSRC_ )' ELSE IF( DESCX( CTXT_ ).NE.DESCXREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CTXT_ )' ELSE IF( DESCX( LLD_ ).NE.DESCXREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( LLD_ )' ELSE IF( INCX.NE.INCXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCX' ELSE IF( BETA.NE.BETAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'BETA' ELSE IF( IY.NE.IYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IY' ELSE IF( JY.NE.JYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JY' ELSE IF( DESCY( DTYPE_ ).NE.DESCYREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( DTYPE_ )' ELSE IF( DESCY( M_ ).NE.DESCYREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( M_ )' ELSE IF( DESCY( N_ ).NE.DESCYREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( N_ )' ELSE IF( DESCY( IMB_ ).NE.DESCYREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( IMB_ )' ELSE IF( DESCY( INB_ ).NE.DESCYREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( INB_ )' ELSE IF( DESCY( MB_ ).NE.DESCYREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( MB_ )' ELSE IF( DESCY( NB_ ).NE.DESCYREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( NB_ )' ELSE IF( DESCY( RSRC_ ).NE.DESCYREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( RSRC_ )' ELSE IF( DESCY( CSRC_ ).NE.DESCYREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CSRC_ )' ELSE IF( DESCY( CTXT_ ).NE.DESCYREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CTXT_ )' ELSE IF( DESCY( LLD_ ).NE.DESCYREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( LLD_ )' ELSE IF( INCY.NE.INCYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCY' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.NE.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PSCHKARG2 * END SUBROUTINE PSBLAS2TSTCHK( ICTXT, NOUT, NROUT, UPLO, TRANS, DIAG, $ M, N, ALPHA, A, PA, IA, JA, DESCA, X, $ PX, IX, JX, DESCX, INCX, BETA, Y, PY, $ IY, JY, DESCY, INCY, THRESH, ROGUE, $ WORK, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N, NOUT, NROUT REAL ALPHA, BETA, ROGUE, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL A( * ), PA( * ), PX( * ), PY( * ), WORK( * ), $ X( * ), Y( * ) * .. * * Purpose * ======= * * PSBLAS2TSTCHK performs the computational tests of the Level 2 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PSGEMV will be tested; * else if NROUT = 2, PSSYMV will be tested; * else if NROUT = 3, PSTRMV will be tested; * else if NROUT = 4, PSTRSV will be tested; * else if NROUT = 5, PSGER will be tested; * else if NROUT = 6, PSSYR will be tested; * else if NROUT = 7, PSSYR2 will be tested; * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies if the upper or lower part of the * matrix operand is to be referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies if the matrix operand A is to be * transposed. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the triangular matrix operand is * unit or non-unit. * * M (global input) INTEGER * On entry, M specifies the number of rows of A. * * N (global input) INTEGER * On entry, N specifies the number of columns of A. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) REAL array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) REAL array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) REAL array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) REAL array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * THRESH (global input) REAL * On entry, THRESH is the threshold value for the test ratio. * * ROGUE (global input) REAL * On entry, ROGUE specifies the constant used to pad the * non-referenced part of triangular or symmetric matrices. * * WORK (workspace) REAL array * On entry, WORK is an array of dimension LWORK where LWORK is * at least MAX( M, N ). This array is used to store the compu- * ted gauges (see PSMVCH). * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on A has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on X has been found, * if( MOD( INFO/4, 2 ) = 1 ) then an error on Y has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW REAL ERR * .. * .. Local Arrays .. INTEGER IERR( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_SLASET, PSCHKMIN, PSCHKVIN, $ PSMVCH, PSTRMV, PSVMCH, PSVMCH2, STRSV * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 10 I = 1, 3 IERR( I ) = 0 10 CONTINUE * IF( NROUT.EQ.1 ) THEN * * Test PSGEMV * * Check the resulting vector Y * CALL PSMVCH( ICTXT, TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, $ IX, JX, DESCX, INCX, BETA, Y, PY, IY, JY, DESCY, $ INCY, WORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PSCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) IF( LSAME( TRANS, 'N' ) ) THEN CALL PSCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 2 ) ) ELSE CALL PSCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, $ IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.2 ) THEN * * Test PSSYMV * * Check the resulting vector Y * CALL PSMVCH( ICTXT, 'No transpose', N, N, ALPHA, A, IA, JA, $ DESCA, X, IX, JX, DESCX, INCX, BETA, Y, PY, IY, $ JY, DESCY, INCY, WORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN CALL PB_SLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF CALL PSCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PSCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) * ELSE IF( NROUT.EQ.3 ) THEN * * Test PSTRMV * * Check the resulting vector X * CALL PSMVCH( ICTXT, TRANS, N, N, ONE, A, IA, JA, DESCA, Y, IX, $ JX, DESCX, INCX, ZERO, X, PX, IX, JX, DESCX, INCX, $ WORK, ERR, IERR( 2 ) ) * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF END IF CALL PSCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) * ELSE IF( NROUT.EQ.4 ) THEN * * Test PSTRSV * * Check the resulting vector X * CALL STRSV( UPLO, TRANS, DIAG, N, A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ), X( IX+(JX-1)*DESCX( M_ ) ), INCX ) CALL PSTRMV( UPLO, TRANS, DIAG, N, PA, IA, JA, DESCA, PX, IX, $ JX, DESCX, INCX ) CALL PSMVCH( ICTXT, TRANS, N, N, ONE, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX, ZERO, Y, PX, IX, JX, DESCX, INCX, $ WORK, ERR, IERR( 2 ) ) * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF END IF CALL PSCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) * ELSE IF( NROUT.EQ.5 ) THEN * * Test PSGER * * Check the resulting matrix A * CALL PSVMCH( ICTXT, 'Ge', M, N, ALPHA, X, IX, JX, DESCX, $ INCX, Y, IY, JY, DESCY, INCY, A, PA, IA, JA, $ DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PSCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PSCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * ELSE IF( NROUT.EQ.6 ) THEN * * Test PSSYR * * Check the resulting matrix A * CALL PSVMCH( ICTXT, UPLO, N, N, ALPHA, X, IX, JX, DESCX, $ INCX, X, IX, JX, DESCX, INCX, A, PA, IA, JA, $ DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PSCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) * ELSE IF( NROUT.EQ.7 ) THEN * * Test PSSYR2 * * Check the resulting matrix A * CALL PSVMCH2( ICTXT, UPLO, N, N, ALPHA, X, IX, JX, DESCX, INCX, $ Y, IY, JY, DESCY, INCY, A, PA, IA, JA, DESCA, $ WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PSCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PSCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * END IF * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'A' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'X' END IF * IF( IERR( 3 ).NE.0 ) THEN INFO = INFO + 4 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'Y' END IF * 9999 FORMAT( 2X, ' ***** ERROR: Matrix operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** ERROR: Vector operand ', A, $ ' is incorrect.' ) 9997 FORMAT( 2X, ' ***** FATAL ERROR - Computed result is less ', $ 'than half accurate *****' ) 9996 FORMAT( 2X, ' ***** Test completed with maximum test ratio: ', $ F11.5, ' SUSPECT *****' ) * RETURN * * End of PSBLAS2TSTCHK * END scalapack-2.0.2/PBLAS/TESTING/PSBLAS3TST.dat000644 000766 000024 00000004720 10363532303 020024 0ustar00juliestaff000000 000000 'Level 3 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PSBLAS3TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 16.0 threshold value of test ratio 10 value of the logical computational blocksize NB 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q 2.0E0 value of ALPHA 3.0E0 value of BETA 4 number of tests problems 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' values of DIAG 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' values of SIDE 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANSA 'N' 'N' 'T' 'T' 'N' 'N' 'T' 'T' values of TRANSB 'U' 'L' 'U' 'L' 'U' 'L' 'U' 'L' values of UPLO 22 30 8 20 3 5 11 8 values of M 23 29 7 21 3 14 11 8 values of N 24 28 9 22 3 9 11 8 values of K 31 31 31 31 31 31 31 31 values of M_A 31 31 31 31 31 31 31 31 values of N_A 4 5 6 3 2 1 4 5 values of IMB_A 4 5 6 3 2 1 4 5 values of INB_A 4 5 6 3 2 1 4 5 values of MB_A 4 5 6 3 2 1 4 5 values of NB_A 0 0 0 0 0 0 0 0 values of RSRC_A 0 0 0 0 0 0 0 0 values of CSRC_A 5 1 7 7 1 12 1 11 values of IA 5 1 7 7 1 12 1 11 values of JA 32 32 32 32 32 32 32 32 values of M_B 32 32 32 32 32 32 32 32 values of N_B 4 5 6 3 2 1 4 5 values of IMB_B 4 5 6 3 2 1 4 5 values of INB_B 4 5 6 3 2 1 4 5 values of MB_B 4 5 6 3 2 1 4 5 values of NB_B 0 0 0 0 0 0 0 0 values of RSRC_B 0 0 0 0 0 0 0 0 values of CSRC_B 5 1 7 7 1 12 1 11 values of IB 5 1 7 7 1 12 1 11 values of JB 35 35 35 35 35 35 35 35 values of M_C 35 35 35 35 35 35 35 35 values of N_C 4 5 6 3 2 1 4 5 values of IMB_C 4 5 6 3 2 1 4 5 values of INB_C 4 5 6 3 2 1 4 5 values of MB_C 4 5 6 3 2 1 4 5 values of NB_C 0 0 0 0 0 0 0 0 values of RSRC_C 0 0 0 0 0 0 0 0 values of CSRC_C 5 1 7 7 1 12 1 11 values of IC 5 1 7 7 1 12 1 11 values of JC PSGEMM T put F for no test in the same column PSSYMM T put F for no test in the same column PSSYRK T put F for no test in the same column PSSYR2K T put F for no test in the same column PSTRMM T put F for no test in the same column PSTRSM T put F for no test in the same column PSGEADD T put F for no test in the same column PSTRADD T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/psblas3tst.f000644 000766 000024 00000374553 11750130340 020153 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PSGEMM ', 'PSSYMM ', 'PSSYRK ', $ 'PSSYR2K', 'PSTRMM ', 'PSTRSM ', $ 'PSGEADD', 'PSTRADD'/ END BLOCK DATA PROGRAM PSBLA3TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PSBLA3TST is the main testing program for the Level 3 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * * from the following 61 lines: * 'Level 3 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PSBLAS3TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 16.0 threshold value of test ratio * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * 1.0E0 value of ALPHA * 1.0E0 value of BETA * 2 number of tests problems * 'N' 'U' values of DIAG * 'L' 'R' values of SIDE * 'N' 'T' values of TRANSA * 'N' 'T' values of TRANSB * 'U' 'L' values of UPLO * 3 4 values of M * 3 4 values of N * 3 4 values of K * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_B * 6 10 values of N_B * 2 5 values of IMB_B * 2 5 values of INB_B * 2 5 values of MB_B * 2 5 values of NB_B * 0 1 values of RSRC_B * 0 0 values of CSRC_B * 1 1 values of IB * 1 1 values of JB * 6 10 values of M_C * 6 10 values of N_C * 2 5 values of IMB_C * 2 5 values of INB_C * 2 5 values of MB_C * 2 5 values of NB_C * 0 1 values of RSRC_C * 0 0 values of CSRC_C * 1 1 values of IC * 1 1 values of JC * PSGEMM T put F for no test in the same column * PSSYMM T put F for no test in the same column * PSSYRK T put F for no test in the same column * PSSYR2K T put F for no test in the same column * PSTRMM T put F for no test in the same column * PSTRSM T put F for no test in the same column * PSGEADD T put F for no test in the same column * PSTRADD T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * REALSZ INTEGER * REALSZ indicates the length in bytes on the given platform * for a single precision real. By default, REALSZ is set to * four. * * MEM REAL array * MEM is an array of dimension TOTMEM / REALSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, REALSZ, TOTMEM, $ MEMSIZ, NSUBS REAL ONE, PADVAL, ZERO, ROGUE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ REALSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ, ZERO = 0.0E+0, $ ONE = 1.0E+0, PADVAL = -9923.0E+0, $ NSUBS = 8, ROGUE = -1.0E+10 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE CHARACTER*1 ADIAGDO, AFORM, CFORM, DIAG, SIDE, TRANSA, $ TRANSB, UPLO INTEGER CSRCA, CSRCB, CSRCC, I, IA, IAM, IASEED, IB, $ IBSEED, IC, ICSEED, ICTXT, IGAP, IMBA, IMBB, $ IMBC, IMIDA, IMIDB, IMIDC, INBA, INBB, INBC, $ IPA, IPB, IPC, IPG, IPMATA, IPMATB, IPMATC, $ IPOSTA, IPOSTB, IPOSTC, IPREA, IPREB, IPREC, $ IPW, IVERB, J, JA, JB, JC, K, L, LDA, LDB, LDC, $ M, MA, MB, MBA, MBB, MBC, MC, MEMREQD, MPA, $ MPB, MPC, MYCOL, MYROW, N, NA, NB, NBA, NBB, $ NBC, NC, NCOLA, NCOLB, NCOLC, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQA, NQB, NQC, NROWA, $ NROWB, NROWC, NTESTS, OFFDA, OFFDC, RSRCA, $ RSRCB, RSRCC, TSKIP, TSTCNT REAL ALPHA, BETA, SCALE, THRESH * .. * .. Local Arrays .. LOGICAL BCHECK( NSUBS ), CCHECK( NSUBS ), $ LTEST( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), SIDEVAL( MAXTESTS ), $ TRNAVAL( MAXTESTS ), TRNBVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCBVAL( MAXTESTS ), $ CSCCVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCAR( DLEN_ ), DESCB( DLEN_ ), $ DESCBR( DLEN_ ), DESCC( DLEN_ ), $ DESCCR( DLEN_ ), IAVAL( MAXTESTS ), $ IBVAL( MAXTESTS ), ICVAL( MAXTESTS ), $ IERR( 6 ), IMBAVAL( MAXTESTS ), $ IMBBVAL( MAXTESTS ), IMBCVAL( MAXTESTS ), $ INBAVAL( MAXTESTS ), INBBVAL( MAXTESTS ), $ INBCVAL( MAXTESTS ), JAVAL( MAXTESTS ), $ JBVAL( MAXTESTS ), JCVAL( MAXTESTS ) INTEGER KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), KVAL( MAXTESTS ), $ MAVAL( MAXTESTS ), MBAVAL( MAXTESTS ), $ MBBVAL( MAXTESTS ), MBCVAL( MAXTESTS ), $ MBVAL( MAXTESTS ), MCVAL( MAXTESTS ), $ MVAL( MAXTESTS ), NAVAL( MAXTESTS ), $ NBAVAL( MAXTESTS ), NBBVAL( MAXTESTS ), $ NBCVAL( MAXTESTS ), NBVAL( MAXTESTS ), $ NCVAL( MAXTESTS ), NVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCAVAL( MAXTESTS ), RSCBVAL( MAXTESTS ), $ RSCCVAL( MAXTESTS ) REAL MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_DESCSET2, PB_PSLAPRNT, PB_SCHEKPAD, $ PB_SFILLPAD, PB_SLASCAL, PB_SLASET, PMDESCCHK, $ PMDIMCHK, PSBLA3TSTINFO, PSBLAS3TSTCHK, $ PSBLAS3TSTCHKE, PSCHKARG3, PSCHKMOUT, PSGEADD, $ PSGEMM, PSLAGEN, PSLASCAL, PSLASET, PSMPRNT, $ PSSYMM, PSSYR2K, PSSYRK, PSTRADD, PSTRMM, $ PSTRSM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MOD, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA BCHECK/.TRUE., .TRUE., .FALSE., .TRUE., .TRUE., $ .TRUE., .FALSE., .FALSE./ DATA CCHECK/.TRUE., .TRUE., .TRUE., .TRUE., .FALSE., $ .FALSE., .TRUE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, * so that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IASEED = 100 IBSEED = 200 ICSEED = 300 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PSBLA3TSTINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, NVAL, $ KVAL, MAVAL, NAVAL, IMBAVAL, MBAVAL, $ INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, IAVAL, $ JAVAL, MBVAL, NBVAL, IMBBVAL, MBBVAL, $ INBBVAL, NBBVAL, RSCBVAL, CSCBVAL, IBVAL, $ JBVAL, MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, ICVAL, $ JCVAL, MAXTESTS, NGRIDS, PVAL, MAXGRIDS, $ QVAL, MAXGRIDS, NBLOG, LTEST, SOF, TEE, IAM, $ IGAP, IVERB, NPROCS, THRESH, ALPHA, BETA, $ MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PSBLAS3TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) SIDE = SIDEVAL( J ) TRANSA = TRNAVAL( J ) TRANSB = TRNBVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) K = KVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MB = MBVAL( J ) NB = NBVAL( J ) IMBB = IMBBVAL( J ) MBB = MBBVAL( J ) INBB = INBBVAL( J ) NBB = NBBVAL( J ) RSRCB = RSCBVAL( J ) CSRCB = CSCBVAL( J ) IB = IBVAL( J ) JB = JBVAL( J ) * MC = MCVAL( J ) NC = NCVAL( J ) IMBC = IMBCVAL( J ) MBC = MBCVAL( J ) INBC = INBCVAL( J ) NBC = NBCVAL( J ) RSRCC = RSCCVAL( J ) CSRCC = CSCCVAL( J ) IC = ICVAL( J ) JC = JCVAL( J ) * IF( IAM.EQ.0 ) THEN * TSTCNT = TSTCNT + 1 * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, K, SIDE, UPLO, TRANSA, $ TRANSB, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IB, JB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IC, JC, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC * WRITE( NOUT, FMT = 9995 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( SIDE, 'L' ).AND. $ .NOT.LSAME( SIDE, 'R' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'SIDE' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSA' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANSB, 'N' ).AND. $ .NOT.LSAME( TRANSB, 'T' ).AND. $ .NOT.LSAME( TRANSB, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSB' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'DIAG' TSKIP = TSKIP + 1 GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, IGAP, GAPMUL, IERR( 1 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'B', DESCB, $ BLOCK_CYCLIC_2D_INB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB, MPB, NQB, IPREB, $ IMIDB, IPOSTB, IGAP, GAPMUL, IERR( 2 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'C', DESCC, $ BLOCK_CYCLIC_2D_INB, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC, MPC, NQC, IPREC, $ IMIDC, IPOSTC, IGAP, GAPMUL, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDA = MAX( 1, MA ) LDB = MAX( 1, MB ) LDC = MAX( 1, MC ) * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPB = IPA + DESCA( LLD_ )*NQA + IPOSTA + IPREB IPC = IPB + DESCB( LLD_ )*NQB + IPOSTB + IPREC IPMATA = IPC + DESCC( LLD_ )*NQC + IPOSTC IPMATB = IPMATA + MA*NA IPMATC = IPMATB + MB*NB IPG = IPMATC + MAX( MB*NB, MC*NC ) * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * IPW = IPG + 2*MAX( M, MAX( N, K ) ) MEMREQD = IPW - 1 + MAX( MAX( MAX( IMBA, MBA ), $ MAX( IMBB, MBB ) ), $ MAX( IMBC, MBC ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) MEMREQD*REALSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 3 routines * DO 30 L = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( L ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9986 ) SNAMES( L ) END IF * * Define the size of the operands * IF( L.EQ.1 ) THEN * * PSGEMM * NROWC = M NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF( LSAME( TRANSB, 'N' ) ) THEN NROWB = K NCOLB = N ELSE NROWB = N NCOLB = K END IF * ELSE IF( L.EQ.2 ) THEN * * PSSYMM * NROWC = M NCOLC = N NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF * ELSE IF( L.EQ.3 ) THEN * * PSSYRK * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K ELSE NROWA = K NCOLA = N END IF NROWB = 0 NCOLB = 0 * ELSE IF( L.EQ.4 ) THEN * * PSSYR2K * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K NROWB = N NCOLB = K ELSE NROWA = K NCOLA = N NROWB = K NCOLB = N END IF * ELSE IF( L.EQ.5 .OR. L.EQ.6 ) THEN NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF NROWC = 0 NCOLC = 0 * ELSE IF( L.EQ.7 .OR. L.EQ.8 ) THEN * * PSGEADD, PSTRADD * IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = N ELSE NROWA = N NCOLA = M END IF NROWC = M NCOLC = N NROWB = 0 NCOLB = 0 * END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWB, NCOLB, 'B', IB, JB, $ DESCB, IERR( 2 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWC, NCOLC, 'C', IC, JC, $ DESCC, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN KSKIP( L ) = KSKIP( L ) + 1 GO TO 30 END IF * * Generate distributed matrices A, B and C * IF( L.EQ.2 ) THEN * * PSSYMM * AFORM = 'S' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.3 .OR. L.EQ.4 ) THEN * * PSSYRK, PSSYR2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'S' OFFDC = IC - JC * ELSE IF( ( L.EQ.6 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN * * PSTRSM * AFORM = 'N' ADIAGDO = 'D' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE * * Default values * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'N' OFFDC = 0 * END IF * CALL PSLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) * IF( BCHECK( L ) ) $ CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCB, IBSEED, MEM( IPB ), $ DESCB( LLD_ ) ) * IF( CCHECK( L ) ) $ CALL PSLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCC, ICSEED, MEM( IPC ), $ DESCC( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCAR, MA, NA, IMBA, INBA, MBA, NBA, $ -1, -1, ICTXT, MAX( 1, MA ) ) CALL PSLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCAR, IASEED, MEM( IPMATA ), $ DESCAR( LLD_ ) ) * IF( BCHECK( L ) ) THEN CALL PB_DESCSET2( DESCBR, MB, NB, IMBB, INBB, MBB, $ NBB, -1, -1, ICTXT, MAX( 1, MB ) ) CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCBR, IBSEED, MEM( IPMATB ), $ DESCBR( LLD_ ) ) END IF * IF( CCHECK( L ) ) THEN * CALL PB_DESCSET2( DESCCR, MC, NC, IMBC, INBC, MBC, $ NBC, -1, -1, ICTXT, MAX( 1, MC ) ) CALL PSLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCCR, ICSEED, MEM( IPMATC ), $ DESCCR( LLD_ ) ) * ELSE * * If C is not needed, generate a copy of B instead * CALL PB_DESCSET2( DESCCR, MB, NB, IMBB, INBB, MBB, $ NBB, -1, -1, ICTXT, MAX( 1, MB ) ) CALL PSLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCCR, IBSEED, MEM( IPMATC ), $ DESCCR( LLD_ ) ) * END IF * * Zero non referenced part of the matrices A, B, C * IF( ( L.EQ.2 ).AND.( MAX( NROWA, NCOLA ).GT.1 ) ) THEN * * The distributed matrix A is symmetric * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of A. * CALL PSLASET( 'Upper', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA, JA+1, DESCA ) * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of A. * CALL PSLASET( 'Lower', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA+1, JA, DESCA ) * END IF * ELSE IF( ( ( L.EQ.3 ).OR.( L.EQ.4 ) ).AND. $ ( MAX( NROWC, NCOLC ).GT.1 ) ) THEN * * The distributed matrix C is symmetric * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of C. * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PSLASET( 'Upper', NROWC-1, NCOLC-1, ROGUE, $ ROGUE, MEM( IPC ), IC, JC+1, $ DESCC ) CALL PB_SLASET( 'Upper', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC-1+JC*LDC ), LDC ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of C. * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PSLASET( 'Lower', NROWC-1, NCOLC-1, ROGUE, $ ROGUE, MEM( IPC ), IC+1, JC, $ DESCC ) CALL PB_SLASET( 'Lower', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC+(JC-1)*LDC ), $ LDC ) END IF * END IF * ELSE IF( L.EQ.5 .OR. L.EQ.6 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix A is lower triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PSLASET( 'Upper', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA, $ JA+1, DESCA ) CALL PB_SLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA-1+JA*LDA ), $ LDA ) END IF * ELSE * CALL PSLASET( 'Upper', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_SLASET( 'Upper', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( L.EQ.6 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / REAL( MAX( NROWA, NCOLA ) ) CALL PSLASCAL( 'Lower', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA+1, JA, $ DESCA ) CALL PB_SLASCAL( 'Lower', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix A is upper triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PSLASET( 'Lower', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA+1, $ JA, DESCA ) CALL PB_SLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * ELSE * CALL PSLASET( 'Lower', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_SLASET( 'Lower', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( L.EQ.6 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / REAL( MAX( NROWA, NCOLA ) ) CALL PSLASCAL( 'Upper', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA, JA+1, $ DESCA ) CALL PB_SLASCAL( 'Upper', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * END IF * END IF * ELSE IF( L.EQ.8 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix C is lower triangular * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PSLASET( 'Upper', NROWC-1, NCOLC-1, $ ROGUE, ROGUE, MEM( IPC ), IC, $ JC+1, DESCC ) CALL PB_SLASET( 'Upper', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC-1+JC*LDC ), LDC ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix C is upper triangular * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PSLASET( 'Lower', NROWC-1, NCOLC-1, $ ROGUE, ROGUE, MEM( IPC ), IC+1, $ JC, DESCC ) CALL PB_SLASET( 'Lower', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC+(JC-1)*LDC ), $ LDC ) END IF * END IF * END IF * * Pad the guard zones of A, B and C * CALL PB_SFILLPAD( ICTXT, MPA, NQA, MEM( IPA-IPREA ), $ DESCA( LLD_ ), IPREA, IPOSTA, PADVAL ) * IF( BCHECK( L ) ) THEN CALL PB_SFILLPAD( ICTXT, MPB, NQB, MEM( IPB-IPREB ), $ DESCB( LLD_ ), IPREB, IPOSTB, $ PADVAL ) END IF * IF( CCHECK( L ) ) THEN CALL PB_SFILLPAD( ICTXT, MPC, NQC, MEM( IPC-IPREC ), $ DESCC( LLD_ ), IPREC, IPOSTC, $ PADVAL ) END IF * * Initialize the check for INPUT-only arguments. * INFO = 0 CALL PSCHKARG3( ICTXT, NOUT, SNAMES( L ), SIDE, UPLO, $ TRANSA, TRANSB, DIAG, M, N, K, ALPHA, IA, $ JA, DESCA, IB, JB, DESCB, BETA, IC, JC, $ DESCC, INFO ) * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN CALL PB_PSLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, $ 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PSLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPW ) ) END IF * IF( BCHECK( L ) ) THEN IF( IVERB.EQ.2 ) THEN CALL PB_PSLAPRNT( NROWB, NCOLB, MEM( IPB ), IB, JB, $ DESCB, 0, 0, $ 'PARALLEL_INITIAL_B', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PSLAPRNT( MB, NB, MEM( IPB ), 1, 1, DESCB, $ 0, 0, 'PARALLEL_INITIAL_B', NOUT, $ MEM( IPW ) ) END IF END IF * IF( CCHECK( L ) ) THEN IF( IVERB.EQ.2 ) THEN CALL PB_PSLAPRNT( NROWC, NCOLC, MEM( IPC ), IC, JC, $ DESCC, 0, 0, $ 'PARALLEL_INITIAL_C', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PSLAPRNT( MC, NC, MEM( IPC ), 1, 1, DESCC, $ 0, 0, 'PARALLEL_INITIAL_C', NOUT, $ MEM( IPW ) ) END IF END IF * * Call the Level 3 PBLAS routine * INFO = 0 IF( L.EQ.1 ) THEN * * Test PSGEMM * CALL PSGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.2 ) THEN * * Test PSSYMM * CALL PSSYMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.3 ) THEN * * Test PSSYRK * CALL PSSYRK( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.4 ) THEN * * Test PSSYR2K * CALL PSSYR2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.5 ) THEN * * Test PSTRMM * CALL PSTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) * ELSE IF( L.EQ.6 ) THEN * * Test PSTRSM * CALL PSTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) * * ELSE IF( L.EQ.7 ) THEN * * Test PSGEADD * CALL PSGEADD( TRANSA, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, BETA, MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.8 ) THEN * * Test PSTRADD * CALL PSTRADD( UPLO, TRANSA, M, N, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( L ) = KSKIP( L ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9974 ) INFO GO TO 30 END IF * * Check padding * CALL PB_SCHEKPAD( ICTXT, SNAMES( L ), MPA, NQA, $ MEM( IPA-IPREA ), DESCA( LLD_ ), $ IPREA, IPOSTA, PADVAL ) * IF( BCHECK( L ) ) THEN CALL PB_SCHEKPAD( ICTXT, SNAMES( L ), MPB, NQB, $ MEM( IPB-IPREB ), DESCB( LLD_ ), $ IPREB, IPOSTB, PADVAL ) END IF * IF( CCHECK( L ) ) THEN CALL PB_SCHEKPAD( ICTXT, SNAMES( L ), MPC, NQC, $ MEM( IPC-IPREC ), DESCC( LLD_ ), $ IPREC, IPOSTC, PADVAL ) END IF * * Check the computations * CALL PSBLAS3TSTCHK( ICTXT, NOUT, L, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, $ MEM( IPMATA ), MEM( IPA ), IA, JA, $ DESCA, MEM( IPMATB ), MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPMATC ), $ MEM( IPC ), IC, JC, DESCC, THRESH, $ ROGUE, MEM( IPG ), INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 ) THEN IERR( 3 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 IERR( 3 ) = 1 END IF * * Check input-only scalar arguments * INFO = 1 CALL PSCHKARG3( ICTXT, NOUT, SNAMES( L ), SIDE, UPLO, $ TRANSA, TRANSB, DIAG, M, N, K, ALPHA, IA, $ JA, DESCA, IB, JB, DESCB, BETA, IC, JC, $ DESCC, INFO ) * * Check input-only array arguments * CALL PSCHKMOUT( NROWA, NCOLA, MEM( IPMATA ), $ MEM( IPA ), IA, JA, DESCA, IERR( 4 ) ) IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_A', $ SNAMES( L ) END IF * IF( BCHECK( L ) ) THEN CALL PSCHKMOUT( NROWB, NCOLB, MEM( IPMATB ), $ MEM( IPB ), IB, JB, DESCB, IERR( 5 ) ) IF( IERR( 5 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_B', $ SNAMES( L ) END IF END IF * IF( CCHECK( L ) ) THEN CALL PSCHKMOUT( NROWC, NCOLC, MEM( IPMATC ), $ MEM( IPC ), IC, JC, DESCC, IERR( 6 ) ) IF( IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_C', $ SNAMES( L ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE.0 .OR. IERR( 5 ).NE.0 .OR. $ IERR( 6 ).NE.0 ) THEN KFAIL( L ) = KFAIL( L ) + 1 ERRFLG = .TRUE. IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) SNAMES( L ) ELSE KPASS( L ) = KPASS( L ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) SNAMES( L ) END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PSMPRNT( ICTXT, NOUT, MA, NA, MEM( IPMATA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PSLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( ( NROWA.GT.0 ).AND.( NCOLA.GT.0 ) ) $ CALL PSMPRNT( ICTXT, NOUT, NROWA, NCOLA, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PSLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) END IF IF( BCHECK( L ) ) THEN IF( IERR( 5 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PSMPRNT( ICTXT, NOUT, MB, NB, $ MEM( IPMATB ), LDB, 0, 0, $ 'SERIAL_B' ) CALL PB_PSLAPRNT( MB, NB, MEM( IPB ), 1, 1, $ DESCB, 0, 0, 'PARALLEL_B', $ NOUT, MEM( IPMATB ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( ( NROWB.GT.0 ).AND.( NCOLB.GT.0 ) ) $ CALL PSMPRNT( ICTXT, NOUT, NROWB, NCOLB, $ MEM( IPMATB+IB-1+(JB-1)*LDB ), $ LDB, 0, 0, 'SERIAL_B' ) CALL PB_PSLAPRNT( NROWB, NCOLB, MEM( IPB ), IB, $ JB, DESCB, 0, 0, 'PARALLEL_B', $ NOUT, MEM( IPMATB ) ) END IF END IF IF( CCHECK( L ) ) THEN IF( IERR( 6 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PSMPRNT( ICTXT, NOUT, MC, NC, $ MEM( IPMATC ), LDC, 0, 0, $ 'SERIAL_C' ) CALL PB_PSLAPRNT( MC, NC, MEM( IPC ), 1, 1, $ DESCC, 0, 0, 'PARALLEL_C', $ NOUT, MEM( IPMATC ) ) ELSE IF( IERR( 3 ).NE.0 ) THEN IF( ( NROWB.GT.0 ).AND.( NCOLB.GT.0 ) ) $ CALL PSMPRNT( ICTXT, NOUT, NROWC, NCOLC, $ MEM( IPMATC+IC-1+(JC-1)*LDC ), $ LDC, 0, 0, 'SERIAL_C' ) CALL PB_PSLAPRNT( NROWC, NCOLC, MEM( IPC ), IC, $ JC, DESCC, 0, 0, 'PARALLEL_C', $ NOUT, MEM( IPMATC ) ) END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9982 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9981 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '-------------------' ) 9994 FORMAT( 2X, ' M N K SIDE UPLO TRANSA ', $ 'TRANSB DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,1X,I6,6X,A1,5X,A1,7X,A1,7X,A1,5X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IB JB MB NB IMBB INBB', $ ' MBB NBB RSRCB CSRCB' ) 9989 FORMAT( 2X, ' IC JC MC NC IMBC INBC', $ ' MBC NBC RSRCC CSRCC' ) 9988 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9987 FORMAT( 'Not enough memory. Need: ', I12 ) 9986 FORMAT( 2X, ' Tested Subroutine: ', A ) 9985 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9984 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9983 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9982 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9981 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9980 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9979 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9978 FORMAT( 2X, 'Testing Summary') 9977 FORMAT( 2X, 'End of Tests.' ) 9976 FORMAT( 2X, 'Tests started.' ) 9975 FORMAT( 2X, ' ***** ', A, ' has an incorrect value: ', $ ' BYPASS *****' ) 9974 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PSBLA3TST * END SUBROUTINE PSBLA3TSTINFO( SUMMRY, NOUT, NMAT, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, $ NVAL, KVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, $ CSCAVAL, IAVAL, JAVAL, MBVAL, NBVAL, $ IMBBVAL, MBBVAL, INBBVAL, NBBVAL, $ RSCBVAL, CSCBVAL, IBVAL, JBVAL, $ MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, $ ICVAL, JCVAL, LDVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, NBLOG, LTEST, SOF, $ TEE, IAM, IGAP, IVERB, NPROCS, THRESH, $ ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, NBLOG, $ NGRIDS, NMAT, NOUT, NPROCS REAL ALPHA, BETA, THRESH * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), SIDEVAL( LDVAL ), $ TRNAVAL( LDVAL ), TRNBVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCBVAL( LDVAL ), $ CSCCVAL( LDVAL ), IAVAL( LDVAL ), $ IBVAL( LDVAL ), ICVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBBVAL( LDVAL ), $ IMBCVAL( LDVAL ), INBAVAL( LDVAL ), $ INBBVAL( LDVAL ), INBCVAL( LDVAL ), $ JAVAL( LDVAL ), JBVAL( LDVAL ), JCVAL( LDVAL ), $ KVAL( LDVAL ), MAVAL( LDVAL ), MBAVAL( LDVAL ), $ MBBVAL( LDVAL ), MBCVAL( LDVAL ), $ MBVAL( LDVAL ), MCVAL( LDVAL ), MVAL( LDVAL ), $ NAVAL( LDVAL ), NBAVAL( LDVAL ), $ NBBVAL( LDVAL ), NBCVAL( LDVAL ), $ NBVAL( LDVAL ), NCVAL( LDVAL ), NVAL( LDVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCBVAL( LDVAL ), $ RSCCVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PSBLA3TSTINFO get the needed startup information for testing various * Level 3 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * SIDEVAL (global output) CHARACTER array * On entry, SIDEVAL is an array of dimension LDVAL. On exit, * this array contains the values of SIDE to run the code with. * * TRNAVAL (global output) CHARACTER array * On entry, TRNAVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSA to run the code * with. * * TRNBVAL (global output) CHARACTER array * On entry, TRNBVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSB to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * KVAL (global output) INTEGER array * On entry, KVAL is an array of dimension LDVAL. On exit, this * array contains the values of K to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MBVAL (global output) INTEGER array * On entry, MBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( M_ ) to run the code * with. * * NBVAL (global output) INTEGER array * On entry, NBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( N_ ) to run the code * with. * * IMBBVAL (global output) INTEGER array * On entry, IMBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( IMB_ ) to run the * code with. * * MBBVAL (global output) INTEGER array * On entry, MBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( MB_ ) to run the * code with. * * INBBVAL (global output) INTEGER array * On entry, INBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( INB_ ) to run the * code with. * * NBBVAL (global output) INTEGER array * On entry, NBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( NB_ ) to run the * code with. * * RSCBVAL (global output) INTEGER array * On entry, RSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( RSRC_ ) to run the * code with. * * CSCBVAL (global output) INTEGER array * On entry, CSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( CSRC_ ) to run the * code with. * * IBVAL (global output) INTEGER array * On entry, IBVAL is an array of dimension LDVAL. On exit, this * array contains the values of IB to run the code with. * * JBVAL (global output) INTEGER array * On entry, JBVAL is an array of dimension LDVAL. On exit, this * array contains the values of JB to run the code with. * * MCVAL (global output) INTEGER array * On entry, MCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( M_ ) to run the code * with. * * NCVAL (global output) INTEGER array * On entry, NCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( N_ ) to run the code * with. * * IMBCVAL (global output) INTEGER array * On entry, IMBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( IMB_ ) to run the * code with. * * MBCVAL (global output) INTEGER array * On entry, MBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( MB_ ) to run the * code with. * * INBCVAL (global output) INTEGER array * On entry, INBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( INB_ ) to run the * code with. * * NBCVAL (global output) INTEGER array * On entry, NBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( NB_ ) to run the * code with. * * RSCCVAL (global output) INTEGER array * On entry, RSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( RSRC_ ) to run the * code with. * * CSCCVAL (global output) INTEGER array * On entry, CSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( CSRC_ ) to run the * code with. * * ICVAL (global output) INTEGER array * On entry, ICVAL is an array of dimension LDVAL. On exit, this * array contains the values of IC to run the code with. * * JCVAL (global output) INTEGER array * On entry, JCVAL is an array of dimension LDVAL. On exit, this * array contains the values of JC to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, SIDE, TRANSA, TRANSB, UPLO, * M, N, K, DESCA(:), IA, JA, DESCB(:), IB, JB, DESCC(:), IC, * JC. This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 3 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * THRESH (global output) REAL * On exit, THRESH specifies the threshhold value for the test * ratio. * * ALPHA (global output) REAL * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) REAL * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+38*NMAT+NSUBS+4 ) with NSUBS equal to 8. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J REAL EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PSBLAS3TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Read the threshold value for test ratio * READ( NIN, FMT = * ) THRESH IF( THRESH.LT.0.0 ) $ THRESH = 16.0 * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( SIDEVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( KVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( ICVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JCVAL ( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( SIDEVAL( J ) ) WORK( I+2 ) = ICHAR( TRNAVAL( J ) ) WORK( I+3 ) = ICHAR( TRNBVAL( J ) ) WORK( I+4 ) = ICHAR( UPLOVAL( J ) ) I = I + 5 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, KVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, ICVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JCVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 3 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the real single precision '// $ 'Level 3 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9979 ) NBLOG WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9980 ) THRESH WRITE( NOUT, FMT = 9982 ) ALPHA WRITE( NOUT, FMT = 9981 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 38*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) SIDEVAL( J ) = CHAR( WORK( I+1 ) ) TRNAVAL( J ) = CHAR( WORK( I+2 ) ) TRNBVAL( J ) = CHAR( WORK( I+3 ) ) UPLOVAL( J ) = CHAR( WORK( I+4 ) ) I = I + 5 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, KVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, ICVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JCVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : ', G16.6 ) 9981 FORMAT( 2X, 'Beta : ', G16.6 ) 9980 FORMAT( 2X, 'Threshold value : ', G16.6 ) 9979 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PSBLA3TSTINFO * END SUBROUTINE PSBLAS3TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PSBLAS3TSTCHKE tests the error exits of the Level 3 PBLAS. * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 7 (NSUBS). * If LTEST( 1 ) is .TRUE., PSGEMM will be tested; * If LTEST( 2 ) is .TRUE., PSSYMM will be tested; * If LTEST( 3 ) is .TRUE., PSSYRK will be tested; * If LTEST( 4 ) is .TRUE., PSSYR2K will be tested; * If LTEST( 5 ) is .TRUE., PSTRMM will be tested; * If LTEST( 6 ) is .TRUE., PSTRSM will be tested; * If LTEST( 7 ) is .TRUE., PSGEADD will be tested; * If LTEST( 8 ) is .TRUE., PSTRADD will be tested; * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PSDIMEE, PSGEADD, PSGEMM, $ PSMATEE, PSOPTEE, PSSYMM, PSSYR2K, PSSYRK, $ PSTRADD, PSTRMM, PSTRSM * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/31, 32, 33, 35, 38, 38, 39, 40/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PSGEMM * I = 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSGEMM, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSGEMM, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSGEMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PSSYMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSSYMM, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSSYMM, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSSYMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PSSYRK * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSSYRK, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSSYRK, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSSYRK, SCODE( I ), SNAMES( I ) ) END IF * * Test PSSYR2K * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSSYR2K, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSSYR2K, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSSYR2K, SCODE( I ), SNAMES( I ) ) END IF * * Test PSTRMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSTRMM, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSTRMM, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSTRMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PSTRSM * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSTRSM, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSTRSM, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSTRSM, SCODE( I ), SNAMES( I ) ) END IF * * Test PSGEADD * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSGEADD, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSGEADD, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSGEADD, SCODE( I ), SNAMES( I ) ) END IF * * Test PSTRADD * I = I + 1 IF( LTEST( I ) ) THEN CALL PSOPTEE( ICTXT, NOUT, PSTRADD, SCODE( I ), SNAMES( I ) ) CALL PSDIMEE( ICTXT, NOUT, PSTRADD, SCODE( I ), SNAMES( I ) ) CALL PSMATEE( ICTXT, NOUT, PSTRADD, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PSBLAS3TSTCHKE * END SUBROUTINE PSCHKARG3( ICTXT, NOUT, SNAME, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, IA, JA, $ DESCA, IB, JB, DESCB, BETA, IC, JC, DESCC, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N, $ NOUT REAL ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*7 SNAME INTEGER DESCA( * ), DESCB( * ), DESCC( * ) * .. * * Purpose * ======= * * PSCHKARG3 checks the input-only arguments of the Level 3 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies the SIDE option in the Level 3 PBLAS * operation. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the UPLO option in the Level 3 PBLAS * operation. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the TRANSA option in the Level 3 * PBLAS operation. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies the TRANSB option in the Level 3 * PBLAS operation. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies the DIAG option in the Level 3 PBLAS * operation. * * M (global input) INTEGER * On entry, M specifies the dimension of the submatrix ope- * rands. * * N (global input) INTEGER * On entry, N specifies the dimension of the submatrix ope- * rands. * * K (global input) INTEGER * On entry, K specifies the dimension of the submatrix ope- * rands. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 DIAGREF, SIDEREF, TRANSAREF, TRANSBREF, UPLOREF INTEGER I, IAREF, IBREF, ICREF, JAREF, JBREF, JCREF, $ KREF, MREF, MYCOL, MYROW, NPCOL, NPROW, NREF REAL ALPHAREF, BETAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCAREF( DLEN_ ), DESCBREF( DLEN_ ), $ DESCCREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * DIAGREF = DIAG SIDEREF = SIDE TRANSAREF = TRANSA TRANSBREF = TRANSB UPLOREF = UPLO MREF = M NREF = N KREF = K ALPHAREF = ALPHA IAREF = IA JAREF = JA DO 10 I = 1, DLEN_ DESCAREF( I ) = DESCA( I ) 10 CONTINUE IBREF = IB JBREF = JB DO 20 I = 1, DLEN_ DESCBREF( I ) = DESCB( I ) 20 CONTINUE BETAREF = BETA ICREF = IC JCREF = JC DO 30 I = 1, DLEN_ DESCCREF( I ) = DESCC( I ) 30 CONTINUE * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( .NOT. LSAME( DIAG, DIAGREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DIAG' ELSE IF( .NOT. LSAME( SIDE, SIDEREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'SIDE' ELSE IF( .NOT. LSAME( TRANSA, TRANSAREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANSA' ELSE IF( .NOT. LSAME( TRANSB, TRANSBREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANSB' ELSE IF( .NOT. LSAME( UPLO, UPLOREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'UPLO' ELSE IF( M.NE.MREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'M' ELSE IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( K.NE.KREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'K' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE IF( IA.NE.IAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IA' ELSE IF( JA.NE.JAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JA' ELSE IF( DESCA( DTYPE_ ).NE.DESCAREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( DTYPE_ )' ELSE IF( DESCA( M_ ).NE.DESCAREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( M_ )' ELSE IF( DESCA( N_ ).NE.DESCAREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( N_ )' ELSE IF( DESCA( IMB_ ).NE.DESCAREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( IMB_ )' ELSE IF( DESCA( INB_ ).NE.DESCAREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( INB_ )' ELSE IF( DESCA( MB_ ).NE.DESCAREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( MB_ )' ELSE IF( DESCA( NB_ ).NE.DESCAREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( NB_ )' ELSE IF( DESCA( RSRC_ ).NE.DESCAREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( RSRC_ )' ELSE IF( DESCA( CSRC_ ).NE.DESCAREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CSRC_ )' ELSE IF( DESCA( CTXT_ ).NE.DESCAREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CTXT_ )' ELSE IF( DESCA( LLD_ ).NE.DESCAREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( LLD_ )' ELSE IF( IB.NE.IBREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IB' ELSE IF( JB.NE.JBREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JB' ELSE IF( DESCB( DTYPE_ ).NE.DESCBREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( DTYPE_ )' ELSE IF( DESCB( M_ ).NE.DESCBREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( M_ )' ELSE IF( DESCB( N_ ).NE.DESCBREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( N_ )' ELSE IF( DESCB( IMB_ ).NE.DESCBREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( IMB_ )' ELSE IF( DESCB( INB_ ).NE.DESCBREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( INB_ )' ELSE IF( DESCB( MB_ ).NE.DESCBREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( MB_ )' ELSE IF( DESCB( NB_ ).NE.DESCBREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( NB_ )' ELSE IF( DESCB( RSRC_ ).NE.DESCBREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( RSRC_ )' ELSE IF( DESCB( CSRC_ ).NE.DESCBREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( CSRC_ )' ELSE IF( DESCB( CTXT_ ).NE.DESCBREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( CTXT_ )' ELSE IF( DESCB( LLD_ ).NE.DESCBREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( LLD_ )' ELSE IF( BETA.NE.BETAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'BETA' ELSE IF( IC.NE.ICREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IC' ELSE IF( JC.NE.JCREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JC' ELSE IF( DESCC( DTYPE_ ).NE.DESCCREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( DTYPE_ )' ELSE IF( DESCC( M_ ).NE.DESCCREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( M_ )' ELSE IF( DESCC( N_ ).NE.DESCCREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( N_ )' ELSE IF( DESCC( IMB_ ).NE.DESCCREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( IMB_ )' ELSE IF( DESCC( INB_ ).NE.DESCCREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( INB_ )' ELSE IF( DESCC( MB_ ).NE.DESCCREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( MB_ )' ELSE IF( DESCC( NB_ ).NE.DESCCREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( NB_ )' ELSE IF( DESCC( RSRC_ ).NE.DESCCREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( RSRC_ )' ELSE IF( DESCC( CSRC_ ).NE.DESCCREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( CSRC_ )' ELSE IF( DESCC( CTXT_ ).NE.DESCCREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( CTXT_ )' ELSE IF( DESCC( LLD_ ).NE.DESCCREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( LLD_ )' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.NE.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PSCHKARG3 * END SUBROUTINE PSBLAS3TSTCHK( ICTXT, NOUT, NROUT, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, A, PA, IA, $ JA, DESCA, B, PB, IB, JB, DESCB, BETA, $ C, PC, IC, JC, DESCC, THRESH, ROGUE, $ WORK, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N, $ NOUT, NROUT REAL ALPHA, BETA, ROGUE, THRESH * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) REAL A( * ), B( * ), C( * ), PA( * ), PB( * ), $ PC( * ), WORK( * ) * .. * * Purpose * ======= * * PSBLAS3TSTCHK performs the computational tests of the Level 3 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PSGEMM will be tested; * else if NROUT = 2, PSSYMM will be tested; * else if NROUT = 3, PSSYRK will be tested; * else if NROUT = 4, PSSYR2K will be tested; * else if NROUT = 5, PSTRMM will be tested; * else if NROUT = 6, PSTRSM will be tested; * else if NROUT = 7, PSGEADD will be tested; * else if NROUT = 8, PSTRADD will be tested; * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies if the multiplication should be per- * formed from the left or the right. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies if the upper or lower part of the * matrix operand is to be referenced. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies if the matrix operand A is to be * transposed. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies if the matrix operand B is to be * transposed. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the triangular matrix operand is * unit or non-unit. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANSA = 'N' (resp. TRANSA <> 'N') in PxGEMM, PxSYRK, * PxSYR2K, PxHERK and PxHER2K. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) REAL array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) REAL array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * PB (local input) REAL array * On entry, PB is an array of dimension (DESCB( LLD_ ),*). This * array contains the local entries of the matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * C (local input/local output) REAL array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) REAL array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * THRESH (global input) REAL * On entry, THRESH is the threshold value for the test ratio. * * ROGUE (global input) REAL * On entry, ROGUE specifies the constant used to pad the * non-referenced part of triangular or symmetric matrices. * * WORK (workspace) REAL array * On entry, WORK is an array of dimension LWORK where LWORK is * at least 2*MAX( M, MAX( N, K ) ). This array is used to store * a copy of a column of C and the computed gauges (see PSMMCH). * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on A has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on B has been found, * if( MOD( INFO/4, 2 ) = 1 ) then an error on C has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, IPG, MYCOL, MYROW, NPCOL, NPROW REAL ERR * .. * .. Local Arrays .. INTEGER IERR( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_SLASET, PSCHKMIN, PSMMCH, $ PSMMCH1, PSMMCH2, PSMMCH3, PSTRMM, STRSM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 10 I = 1, 3 IERR( I ) = 0 10 CONTINUE IPG = MAX( M, MAX( N, K ) ) + 1 * IF( NROUT.EQ.1 ) THEN * * Test PSGEMM * * Check the resulting matrix C * CALL PSMMCH( ICTXT, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, $ DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, JC, $ DESCC, WORK, WORK( IPG ), ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PSCHKMIN( ERR, M, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PSCHKMIN( ERR, K, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF IF( LSAME( TRANSB, 'N' ) ) THEN CALL PSCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PSCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.2 ) THEN * * Test PSSYMM * * Check the resulting matrix C * IF( LSAME( SIDE, 'L' ) ) THEN CALL PSMMCH( ICTXT, 'No transpose', 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, $ BETA, C, PC, IC, JC, DESCC, WORK, WORK( IPG ), $ ERR, IERR( 3 ) ) ELSE CALL PSMMCH( ICTXT, 'No transpose', 'No transpose', M, N, N, $ ALPHA, B, IB, JB, DESCB, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, WORK, WORK( IPG ), $ ERR, IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_SLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_SLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF * IF( LSAME( SIDE, 'L' ) ) THEN CALL PSCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PSCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF CALL PSCHKMIN( ERR, M, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) * ELSE IF( NROUT.EQ.3 ) THEN * * Test PSSYRK * * Check the resulting matrix C * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PSMMCH1( ICTXT, UPLO, 'No transpose', N, K, ALPHA, A, $ IA, JA, DESCA, BETA, C, PC, IC, JC, DESCC, $ WORK, WORK( IPG ), ERR, IERR( 3 ) ) ELSE CALL PSMMCH1( ICTXT, UPLO, 'Transpose', N, K, ALPHA, A, IA, $ JA, DESCA, BETA, C, PC, IC, JC, DESCC, WORK, $ WORK( IPG ), ERR, IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PSCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PSCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.4 ) THEN * * Test PSSYR2K * * Check the resulting matrix C * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PSMMCH2( ICTXT, UPLO, 'No transpose', N, K, ALPHA, A, $ IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, $ IC, JC, DESCC, WORK, WORK( IPG ), ERR, $ IERR( 3 ) ) ELSE CALL PSMMCH2( ICTXT, UPLO, 'Transpose', N, K, ALPHA, A, $ IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, $ IC, JC, DESCC, WORK, WORK( IPG ), ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PSCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PSCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PSCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PSCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.5 ) THEN * * Test PSTRMM * * Check the resulting matrix B * IF( LSAME( SIDE, 'L' ) ) THEN CALL PSMMCH( ICTXT, TRANSA, 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, C, IB, JB, DESCB, $ ZERO, B, PB, IB, JB, DESCB, WORK, $ WORK( IPG ), ERR, IERR( 2 ) ) ELSE CALL PSMMCH( ICTXT, 'No transpose', TRANSA, M, N, N, $ ALPHA, C, IB, JB, DESCB, A, IA, JA, DESCA, $ ZERO, B, PB, IB, JB, DESCB, WORK, $ WORK( IPG ), ERR, IERR( 2 ) ) END IF * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( SIDE, 'L' ) ) THEN IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Upper', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Lower', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PSCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PSCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.6 ) THEN * * Test PSTRSM * * Check the resulting matrix B * CALL STRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ), $ B( IB+(JB-1)*DESCB( M_ ) ), DESCB( M_ ) ) CALL PSTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, PA, IA, JA, $ DESCA, PB, IB, JB, DESCB ) IF( LSAME( SIDE, 'L' ) ) THEN CALL PSMMCH( ICTXT, TRANSA, 'No transpose', M, N, M, ALPHA, $ A, IA, JA, DESCA, B, IB, JB, DESCB, ZERO, C, $ PB, IB, JB, DESCB, WORK, WORK( IPG ), ERR, $ IERR( 2 ) ) ELSE CALL PSMMCH( ICTXT, 'No transpose', TRANSA, M, N, N, ALPHA, $ B, IB, JB, DESCB, A, IA, JA, DESCA, ZERO, C, $ PB, IB, JB, DESCB, WORK, WORK( IPG ), ERR, $ IERR( 2 ) ) END IF * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.THRESH ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( SIDE, 'L' ) ) THEN IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Upper', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Lower', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PSCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_SLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_SLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PSCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF ELSE IF( NROUT.EQ.7 ) THEN * * Test PSGEADD * * Check the resulting matrix C * CALL PSMMCH3( 'All', TRANSA, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, IERR( 3 ) ) * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PSCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PSCHKMIN( ERR, N, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.8 ) THEN * * Test PSTRADD * * Check the resulting matrix C * CALL PSMMCH3( UPLO, TRANSA, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, IERR( 3 ) ) * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PSCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PSCHKMIN( ERR, N, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * END IF * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'A' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'B' END IF * IF( IERR( 3 ).NE.0 ) THEN INFO = INFO + 4 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'C' END IF * 9999 FORMAT( 2X, ' ***** ERROR: Matrix operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** FATAL ERROR - Computed result is less ', $ 'than half accurate *****' ) 9997 FORMAT( 2X, ' ***** Test completed with maximum test ratio: ', $ F11.5, ' SUSPECT *****' ) * RETURN * * End of PSBLAS3TSTCHK * END scalapack-2.0.2/PBLAS/TESTING/psblastst.f000644 000766 000024 00001354114 11622500733 020065 0ustar00juliestaff000000 000000 SUBROUTINE PSOPTEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PSOPTEE tests whether the PBLAS respond correctly to a bad option * argument. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PSCHKOPT * .. * .. Executable Statements .. * * Level 2 PBLAS * IF( SCODE.EQ.21 ) THEN * * Check 1st (and only) option * APOS = 1 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.22 .OR. SCODE.EQ.25 .OR. SCODE.EQ.26 .OR. $ SCODE.EQ.27 ) THEN * * Check 1st (and only) option * APOS = 1 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st option * APOS = 1 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 2nd option * APOS = 2 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 3rd option * APOS = 3 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'D', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st option * APOS = 1 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2'nd option * APOS = 2 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * ELSE IF( SCODE.EQ.32 ) THEN * * Check 1st option * APOS = 1 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'S', APOS ) * * Check 2nd option * APOS = 2 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 .OR. SCODE.EQ.35 .OR. $ SCODE.EQ.36 .OR. SCODE.EQ.40 ) THEN * * Check 1st option * APOS = 1 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 2'nd option * APOS = 2 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st option * APOS = 1 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'S', APOS ) * * Check 2nd option * APOS = 2 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 3rd option * APOS = 3 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 4th option * APOS = 4 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'D', APOS ) * * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st option * APOS = 1 CALL PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * END IF * RETURN * * End of PSOPTEE * END SUBROUTINE PSCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PSCHKOPT tests the option ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the option to be * checked. ARGNAM can either be 'D', 'S', 'A', 'B', or 'U'. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the option ARGNAM * to be tested. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER INFOT * .. * .. External Subroutines .. EXTERNAL PCHKPBE, PSCALLSUB, PSSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. CHARACTER DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO * .. * .. Executable Statements .. * * Reiniatilize the dummy arguments to correct values * CALL PSSETPBLAS( ICTXT ) * IF( LSAME( ARGNAM, 'D' ) ) THEN * * Generate bad DIAG option * DIAG = '/' * ELSE IF( LSAME( ARGNAM, 'S' ) ) THEN * * Generate bad SIDE option * SIDE = '/' * ELSE IF( LSAME( ARGNAM, 'A' ) ) THEN * * Generate bad TRANSA option * TRANSA = '/' * ELSE IF( LSAME( ARGNAM, 'B' ) ) THEN * * Generate bad TRANSB option * TRANSB = '/' * ELSE IF( LSAME( ARGNAM, 'U' ) ) THEN * * Generate bad UPLO option * UPLO = '/' * END IF * * Set INFOT to the position of the bad dimension argument * INFOT = ARGPOS * * Call the PBLAS routine * CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * RETURN * * End of PSCHKOPT * END SUBROUTINE PSDIMEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PSDIMEE tests whether the PBLAS respond correctly to a bad dimension * argument. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PSCHKDIM * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 .OR. SCODE.EQ.12 .OR. SCODE.EQ.13 .OR. $ SCODE.EQ.14 .OR. SCODE.EQ.15 ) THEN * * Check 1st (and only) dimension * APOS = 1 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * * Check 1st dimension * APOS = 2 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 3 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.22 .OR. SCODE.EQ.25 .OR. SCODE.EQ.26 .OR. $ SCODE.EQ.27 ) THEN * * Check 1st (and only) dimension * APOS = 2 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st (and only) dimension * APOS = 4 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.24 ) THEN * * Check 1st dimension * APOS = 1 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 2 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st dimension * APOS = 3 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Check 3rd dimension * APOS = 5 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'K', APOS ) * ELSE IF( SCODE.EQ.32 ) THEN * * Check 1st dimension * APOS = 3 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 .OR. SCODE.EQ.35 .OR. $ SCODE.EQ.36 ) THEN * * Check 1st dimension * APOS = 3 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Check 2nd dimension * APOS = 4 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'K', APOS ) * ELSE IF( SCODE.EQ.37 ) THEN * * Check 1st dimension * APOS = 1 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 2 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st dimension * APOS = 5 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 6 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st dimension * APOS = 2 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 3 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.40 ) THEN * * Check 1st dimension * APOS = 3 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * END IF * RETURN * * End of PSDIMEE * END SUBROUTINE PSCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PSCHKDIM tests the dimension ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the dimension to be * checked. ARGNAM can either be 'M', 'N' or 'K'. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the option ARGNAM * to be tested. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER INFOT * .. * .. External Subroutines .. EXTERNAL PCHKPBE, PSCALLSUB, PSSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. INTEGER KDIM, MDIM, NDIM COMMON /PBLASN/KDIM, MDIM, NDIM * .. * .. Executable Statements .. * * Reiniatilize the dummy arguments to correct values * CALL PSSETPBLAS( ICTXT ) * IF( LSAME( ARGNAM, 'M' ) ) THEN * * Generate bad MDIM * MDIM = -1 * ELSE IF( LSAME( ARGNAM, 'N' ) ) THEN * * Generate bad NDIM * NDIM = -1 * ELSE * * Generate bad KDIM * KDIM = -1 * END IF * * Set INFOT to the position of the bad dimension argument * INFOT = ARGPOS * * Call the PBLAS routine * CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * RETURN * * End of PSCHKDIM * END SUBROUTINE PSVECEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*7 SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PSVECEE tests whether the PBLAS respond correctly to a bad vector * argument. Each vector is described by: , I, J, * DESC, INC. Out of all these, only I, J, * DESC, and INC can be tested. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PSCHKMAT * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 ) THEN * * Check 1st vector * APOS = 2 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 7 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.12 .OR. SCODE.EQ.15 ) THEN * * Check 1st (and only) vector * APOS = 3 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * ELSE IF( SCODE.EQ.13 ) THEN * * Check 1st vector * APOS = 3 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 8 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.14 ) THEN * * Check 1st (and only) vector * APOS = 4 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * * Check 1st vector * APOS = 9 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 15 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.22 ) THEN * * Check 1st vector * APOS = 8 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 14 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st (and only) vector * APOS = 9 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * ELSE IF( SCODE.EQ.24 .OR. SCODE.EQ.27 ) THEN * * Check 1st vector * APOS = 4 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 9 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.26 .OR. SCODE.EQ.27 ) THEN * * Check 1'st (and only) vector * APOS = 4 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * END IF * RETURN * * End of PSVECEE * END SUBROUTINE PSMATEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*7 SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PSMATEE tests whether the PBLAS respond correctly to a bad matrix * argument. Each matrix is described by: , I, J, * and DESC. Out of all these, only I, J and DESC * can be tested. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PSCHKMAT * .. * .. Executable Statements .. * * Level 2 PBLAS * IF( SCODE.EQ.21 .OR. SCODE.EQ.23 ) THEN * * Check 1st (and only) matrix * APOS = 5 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.22 ) THEN * * Check 1st (and only) matrix * APOS = 4 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.24 .OR. SCODE.EQ.27 ) THEN * * Check 1st (and only) matrix * APOS = 14 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.25 .OR. SCODE.EQ.26 ) THEN * * Check 1st (and only) matrix * APOS = 9 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st matrix * APOS = 7 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * * Check 3nd matrix * APOS = 16 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.32 .OR. SCODE.EQ.35 .OR. SCODE.EQ.36 ) THEN * * Check 1st matrix * APOS = 6 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 10 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * * Check 3nd matrix * APOS = 15 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 ) THEN * * Check 1st matrix * APOS = 6 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.37 ) THEN * * Check 1st matrix * APOS = 4 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 9 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st matrix * APOS = 8 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 12 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st matrix * APOS = 5 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 10 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.40 ) THEN * * Check 1st matrix * APOS = 6 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * END IF * RETURN * * End of PSMATEE * END SUBROUTINE PSSETPBLAS( ICTXT ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT * .. * * Purpose * ======= * * PSSETPBLAS initializes *all* the dummy arguments to correct values. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. External Subroutines .. EXTERNAL PB_DESCSET2 * .. * .. Common Blocks .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY, KDIM, MDIM, NDIM REAL USCLR, SCLR INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) REAL A( 2, 2 ), B( 2, 2 ), C( 2, 2 ), X( 2 ), Y( 2 ) COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY COMMON /PBLASM/A, B, C COMMON /PBLASN/KDIM, MDIM, NDIM COMMON /PBLASS/SCLR, USCLR COMMON /PBLASV/X, Y * .. * .. Executable Statements .. * * Set default values for options * DIAG = 'N' SIDE = 'L' TRANSA = 'N' TRANSB = 'N' UPLO = 'U' * * Set default values for scalars * KDIM = 1 MDIM = 1 NDIM = 1 ISCLR = 1 SCLR = ONE USCLR = ONE * * Set default values for distributed matrix A * A( 1, 1 ) = ONE A( 2, 1 ) = ONE A( 1, 2 ) = ONE A( 2, 2 ) = ONE IA = 1 JA = 1 CALL PB_DESCSET2( DESCA, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix B * B( 1, 1 ) = ONE B( 2, 1 ) = ONE B( 1, 2 ) = ONE B( 2, 2 ) = ONE IB = 1 JB = 1 CALL PB_DESCSET2( DESCB, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix C * C( 1, 1 ) = ONE C( 2, 1 ) = ONE C( 1, 2 ) = ONE C( 2, 2 ) = ONE IC = 1 JC = 1 CALL PB_DESCSET2( DESCC, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix X * X( 1 ) = ONE X( 2 ) = ONE IX = 1 JX = 1 CALL PB_DESCSET2( DESCX, 2, 1, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) INCX = 1 * * Set default values for distributed matrix Y * Y( 1 ) = ONE Y( 2 ) = ONE IY = 1 JY = 1 CALL PB_DESCSET2( DESCY, 2, 1, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) INCY = 1 * RETURN * * End of PSSETPBLAS * END SUBROUTINE PSCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PSCHKMAT tests the matrix (or vector) ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the matrix or vector * to be checked. ARGNAM can either be 'A', 'B' or 'C' when one * wants to check a matrix, and 'X' or 'Y' for a vector. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the first argument * of the matrix (or vector) ARGNAM. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER DESCMULT PARAMETER ( DESCMULT = 100 ) * .. * .. Local Scalars .. INTEGER I, INFOT, NPROW, NPCOL, MYROW, MYCOL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PCHKPBE, PSCALLSUB, PSSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( ARGNAM, 'A' ) ) THEN * * Check IA. Set all other OK, bad IA * CALL PSSETPBLAS( ICTXT ) IA = -1 INFOT = ARGPOS + 1 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JA. Set all other OK, bad JA * CALL PSSETPBLAS( ICTXT ) JA = -1 INFOT = ARGPOS + 2 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCA. Set all other OK, bad DESCA * DO 10 I = 1, DLEN_ * * Set I'th entry of DESCA to incorrect value, rest ok. * CALL PSSETPBLAS( ICTXT ) DESCA( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCA, CSRCA, LDA * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PSSETPBLAS( ICTXT ) * * Test RSRCA >= NPROW * IF( I.EQ.RSRC_ ) $ DESCA( I ) = NPROW * * Test CSRCA >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCA( I ) = NPCOL * * Test LDA >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCA( I ) = 1 ELSE DESCA( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 10 CONTINUE * ELSE IF( LSAME( ARGNAM, 'B' ) ) THEN * * Check IB. Set all other OK, bad IB * CALL PSSETPBLAS( ICTXT ) IB = -1 INFOT = ARGPOS + 1 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JB. Set all other OK, bad JB * CALL PSSETPBLAS( ICTXT ) JB = -1 INFOT = ARGPOS + 2 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCB. Set all other OK, bad DESCB * DO 20 I = 1, DLEN_ * * Set I'th entry of DESCB to incorrect value, rest ok. * CALL PSSETPBLAS( ICTXT ) DESCB( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCB, CSRCB, LDB * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PSSETPBLAS( ICTXT ) * * Test RSRCB >= NPROW * IF( I.EQ.RSRC_ ) $ DESCB( I ) = NPROW * * Test CSRCB >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCB( I ) = NPCOL * * Test LDB >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCB( I ) = 1 ELSE DESCB( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 20 CONTINUE * ELSE IF( LSAME( ARGNAM, 'C' ) ) THEN * * Check IC. Set all other OK, bad IC * CALL PSSETPBLAS( ICTXT ) IC = -1 INFOT = ARGPOS + 1 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JC. Set all other OK, bad JC * CALL PSSETPBLAS( ICTXT ) JC = -1 INFOT = ARGPOS + 2 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCC. Set all other OK, bad DESCC * DO 30 I = 1, DLEN_ * * Set I'th entry of DESCC to incorrect value, rest ok. * CALL PSSETPBLAS( ICTXT ) DESCC( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCC, CSRCC, LDC * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PSSETPBLAS( ICTXT ) * * Test RSRCC >= NPROW * IF( I.EQ.RSRC_ ) $ DESCC( I ) = NPROW * * Test CSRCC >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCC( I ) = NPCOL * * Test LDC >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCC( I ) = 1 ELSE DESCC( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 30 CONTINUE * ELSE IF( LSAME( ARGNAM, 'X' ) ) THEN * * Check IX. Set all other OK, bad IX * CALL PSSETPBLAS( ICTXT ) IX = -1 INFOT = ARGPOS + 1 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JX. Set all other OK, bad JX * CALL PSSETPBLAS( ICTXT ) JX = -1 INFOT = ARGPOS + 2 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCX. Set all other OK, bad DESCX * DO 40 I = 1, DLEN_ * * Set I'th entry of DESCX to incorrect value, rest ok. * CALL PSSETPBLAS( ICTXT ) DESCX( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCX, CSRCX, LDX * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PSSETPBLAS( ICTXT ) * * Test RSRCX >= NPROW * IF( I.EQ.RSRC_ ) $ DESCX( I ) = NPROW * * Test CSRCX >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCX( I ) = NPCOL * * Test LDX >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCX( I ) = 1 ELSE DESCX( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 40 CONTINUE * * Check INCX. Set all other OK, bad INCX * CALL PSSETPBLAS( ICTXT ) INCX = -1 INFOT = ARGPOS + 4 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * ELSE * * Check IY. Set all other OK, bad IY * CALL PSSETPBLAS( ICTXT ) IY = -1 INFOT = ARGPOS + 1 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JY. Set all other OK, bad JY * CALL PSSETPBLAS( ICTXT ) JY = -1 INFOT = ARGPOS + 2 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCY. Set all other OK, bad DESCY * DO 50 I = 1, DLEN_ * * Set I'th entry of DESCY to incorrect value, rest ok. * CALL PSSETPBLAS( ICTXT ) DESCY( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCY, CSRCY, LDY * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PSSETPBLAS( ICTXT ) * * Test RSRCY >= NPROW * IF( I.EQ.RSRC_ ) $ DESCY( I ) = NPROW * * Test CSRCY >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCY( I ) = NPCOL * * Test LDY >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCY( I ) = 1 ELSE DESCY( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 50 CONTINUE * * Check INCY. Set all other OK, bad INCY * CALL PSSETPBLAS( ICTXT ) INCY = -1 INFOT = ARGPOS + 4 CALL PSCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * RETURN * * End of PSCHKMAT * END SUBROUTINE PSCALLSUB( SUBPTR, SCODE ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER SCODE * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PSCALLSUB calls the subroutine SUBPTR with the calling sequence iden- * tified by SCODE. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Common Blocks .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY, KDIM, MDIM, NDIM REAL USCLR, SCLR INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) REAL A( 2, 2 ), B( 2, 2 ), C( 2, 2 ), X( 2 ), Y( 2 ) COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY COMMON /PBLASM/A, B, C COMMON /PBLASN/KDIM, MDIM, NDIM COMMON /PBLASS/SCLR, USCLR COMMON /PBLASV/X, Y * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 ) THEN * CALL SUBPTR( NDIM, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, $ INCY ) * ELSE IF( SCODE.EQ.12 ) THEN * CALL SUBPTR( NDIM, SCLR, X, IX, JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.13 ) THEN * CALL SUBPTR( NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, JY, $ DESCY, INCY ) * ELSE IF( SCODE.EQ.14 ) THEN * CALL SUBPTR( NDIM, SCLR, ISCLR, X, IX, JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.15 ) THEN * CALL SUBPTR( NDIM, USCLR, X, IX, JX, DESCX, INCX ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * CALL SUBPTR( TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX, SCLR, Y, IY, JY, DESCY, INCY ) * ELSE IF( SCODE.EQ.22 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, A, IA, JA, DESCA, X, IX, JX, $ DESCX, INCX, SCLR, Y, IY, JY, DESCY, INCY ) * ELSE IF( SCODE.EQ.23 ) THEN * CALL SUBPTR( UPLO, TRANSA, DIAG, NDIM, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.24 ) THEN * CALL SUBPTR( MDIM, NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, $ JY, DESCY, INCY, A, IA, JA, DESCA ) * ELSE IF( SCODE.EQ.25 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, X, IX, JX, DESCX, INCX, A, IA, $ JA, DESCA ) * ELSE IF( SCODE.EQ.26 ) THEN * CALL SUBPTR( UPLO, NDIM, USCLR, X, IX, JX, DESCX, INCX, A, IA, $ JA, DESCA ) * ELSE IF( SCODE.EQ.27 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, $ JY, DESCY, INCY, A, IA, JA, DESCA ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * CALL SUBPTR( TRANSA, TRANSB, MDIM, NDIM, KDIM, SCLR, A, IA, JA, $ DESCA, B, IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.32 ) THEN * CALL SUBPTR( SIDE, UPLO, MDIM, NDIM, SCLR, A, IA, JA, DESCA, B, $ IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.33 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.34 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, USCLR, A, IA, JA, DESCA, $ USCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.35 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ B, IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.36 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ B, IB, JB, DESCB, USCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.37 ) THEN * CALL SUBPTR( MDIM, NDIM, SCLR, A, IA, JA, DESCA, SCLR, C, IC, $ JC, DESCC ) * ELSE IF( SCODE.EQ.38 ) THEN * CALL SUBPTR( SIDE, UPLO, TRANSA, DIAG, MDIM, NDIM, SCLR, A, IA, $ JA, DESCA, B, IB, JB, DESCB ) * ELSE IF( SCODE.EQ.39 ) THEN * CALL SUBPTR( TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, SCLR, $ C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.40 ) THEN * CALL SUBPTR( UPLO, TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, $ SCLR, C, IC, JC, DESCC ) * END IF * RETURN * * End of PSCALLSUB * END SUBROUTINE PSERRSET( ERR, ERRMAX, XTRUE, X ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL ERR, ERRMAX, X, XTRUE * .. * * Purpose * ======= * * PSERRSET computes the absolute difference ERR = |XTRUE - X| and com- * pares it with zero. ERRMAX accumulates the absolute error difference. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERR (local output) REAL * On exit, ERR specifies the absolute difference |XTRUE - X|. * * ERRMAX (local input/local output) REAL * On entry, ERRMAX specifies a previously computed error. On * exit ERRMAX is the accumulated error MAX( ERRMAX, ERR ). * * XTRUE (local input) REAL * On entry, XTRUE specifies the true value. * * X (local input) REAL * On entry, X specifies the value to be compared to XTRUE. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. REAL PSDIFF EXTERNAL PSDIFF * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * ERR = ABS( PSDIFF( XTRUE, X ) ) * ERRMAX = MAX( ERRMAX, ERR ) * RETURN * * End of PSERRSET * END SUBROUTINE PSCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INFO, IX, JX, N REAL ERRMAX * .. * .. Array Arguments .. INTEGER DESCX( * ) REAL PX( * ), X( * ) * .. * * Purpose * ======= * * PSCHKVIN checks that the submatrix sub( PX ) remained unchanged. The * local array entries are compared element by element, and their dif- * ference is tested against 0.0 as well as the epsilon machine. Notice * that this difference should be numerically exactly the zero machine, * but because of the possible fluctuation of some of the data we flag- * ged differently a difference less than twice the epsilon machine. The * largest error is also returned. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERRMAX (global output) REAL * On exit, ERRMAX specifies the largest absolute element-wise * difference between sub( X ) and sub( PX ). * * N (global input) INTEGER * On entry, N specifies the length of the subvector operand * sub( X ). N must be at least zero. * * X (local input) REAL array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) REAL array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, ICURROW, IIX, IN, IXCOL, $ IXROW, J, JB, JJX, JN, KK, LDPX, LDX, LL, $ MYCOL, MYROW, NPCOL, NPROW REAL ERR, EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_INFOG2L, PSERRSET, SGAMX2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) * LDX = DESCX( M_ ) LDPX = DESCX( LLD_ ) ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( N.EQ.1 ) THEN * IF( ( MYROW.EQ.IXROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.IXCOL .OR. COLREP ) ) $ CALL PSERRSET( ERR, ERRMAX, X( IX+(JX-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) * ELSE IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * IF( MYROW.EQ.IXROW .OR. ROWREP ) THEN * ICURCOL = IXCOL IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN DO 10 J = JX, JN CALL PSERRSET( ERR, ERRMAX, X( IX+(J-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) JJX = JJX + 1 10 CONTINUE END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 30 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * DO 20 KK = 0, JB-1 CALL PSERRSET( ERR, ERRMAX, X( IX+(J+KK-1)*LDX ), $ PX( IIX+(JJX+KK-1)*LDPX ) ) 20 CONTINUE * JJX = JJX + JB * END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 30 CONTINUE * END IF * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * IF( MYCOL.EQ.IXCOL .OR. COLREP ) THEN * ICURROW = IXROW IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 40 I = IX, IN CALL PSERRSET( ERR, ERRMAX, X( I+(JX-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) IIX = IIX + 1 40 CONTINUE END IF ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * DO 50 KK = 0, IB-1 CALL PSERRSET( ERR, ERRMAX, X( I+KK+(JX-1)*LDX ), $ PX( IIX+KK+(JJX-1)*LDPX ) ) 50 CONTINUE * IIX = IIX + IB * END IF * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PSCHKVIN * END SUBROUTINE PSCHKVOUT( N, X, PX, IX, JX, DESCX, INCX, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INFO, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) REAL PX( * ), X( * ) * .. * * Purpose * ======= * * PSCHKVOUT checks that the matrix PX \ sub( PX ) remained unchanged. * The local array entries are compared element by element, and their * difference is tested against 0.0 as well as the epsilon machine. No- * tice that this difference should be numerically exactly the zero ma- * chine, but because of the possible movement of some of the data we * flagged differently a difference less than twice the epsilon machine. * The largest error is reported. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector operand * sub( X ). N must be at least zero. * * X (local input) REAL array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) REAL array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, ICURROW, II, IMBX, INBX, $ J, JB, JJ, KK, LDPX, LDX, LL, MBX, MPALL, $ MYCOL, MYCOLDIST, MYROW, MYROWDIST, NBX, NPCOL, $ NPROW, NQALL REAL EPS, ERR, ERRMAX * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PSERRSET, SGAMX2D * .. * .. External Functions .. INTEGER PB_NUMROC REAL PSLAMCH EXTERNAL PSLAMCH, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( ( DESCX( M_ ).LE.0 ).OR.( DESCX( N_ ).LE.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * MPALL = PB_NUMROC( DESCX( M_ ), 1, DESCX( IMB_ ), DESCX( MB_ ), $ MYROW, DESCX( RSRC_ ), NPROW ) NQALL = PB_NUMROC( DESCX( N_ ), 1, DESCX( INB_ ), DESCX( NB_ ), $ MYCOL, DESCX( CSRC_ ), NPCOL ) * MBX = DESCX( MB_ ) NBX = DESCX( NB_ ) LDX = DESCX( M_ ) LDPX = DESCX( LLD_ ) ICURROW = DESCX( RSRC_ ) ICURCOL = DESCX( CSRC_ ) ROWREP = ( ICURROW.EQ.-1 ) COLREP = ( ICURCOL.EQ.-1 ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN IMBX = DESCX( IMB_ ) ELSE IMBX = MBX END IF IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN INBX = DESCX( INB_ ) ELSE INBX = NBX END IF IF( ROWREP ) THEN MYROWDIST = 0 ELSE MYROWDIST = MOD( MYROW - ICURROW + NPROW, NPROW ) END IF IF( COLREP ) THEN MYCOLDIST = 0 ELSE MYCOLDIST = MOD( MYCOL - ICURCOL + NPCOL, NPCOL ) END IF II = 1 JJ = 1 * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * I = 1 IF( MYCOLDIST.EQ.0 ) THEN J = 1 ELSE J = DESCX( INB_ ) + ( MYCOLDIST - 1 ) * NBX + 1 END IF JB = MIN( MAX( 0, DESCX( N_ ) - J + 1 ), INBX ) IB = MIN( DESCX( M_ ), DESCX( IMB_ ) ) * DO 20 KK = 0, JB-1 DO 10 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. J+KK.GT.JX+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 10 CONTINUE 20 CONTINUE IF( COLREP ) THEN J = J + INBX ELSE J = J + INBX + ( NPCOL - 1 ) * NBX END IF * DO 50 JJ = INBX+1, NQALL, NBX JB = MIN( NQALL-JJ+1, NBX ) * DO 40 KK = 0, JB-1 DO 30 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 30 CONTINUE 40 CONTINUE * IF( COLREP ) THEN J = J + NBX ELSE J = J + NPCOL * NBX END IF * 50 CONTINUE * II = II + IB * END IF * ICURROW = MOD( ICURROW + 1, NPROW ) * DO 110 I = DESCX( IMB_ ) + 1, DESCX( M_ ), MBX IB = MIN( DESCX( M_ ) - I + 1, MBX ) * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * IF( MYCOLDIST.EQ.0 ) THEN J = 1 ELSE J = DESCX( INB_ ) + ( MYCOLDIST - 1 ) * NBX + 1 END IF * JJ = 1 JB = MIN( MAX( 0, DESCX( N_ ) - J + 1 ), INBX ) DO 70 KK = 0, JB-1 DO 60 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 60 CONTINUE 70 CONTINUE IF( COLREP ) THEN J = J + INBX ELSE J = J + INBX + ( NPCOL - 1 ) * NBX END IF * DO 100 JJ = INBX+1, NQALL, NBX JB = MIN( NQALL-JJ+1, NBX ) * DO 90 KK = 0, JB-1 DO 80 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 80 CONTINUE 90 CONTINUE * IF( COLREP ) THEN J = J + NBX ELSE J = J + NPCOL * NBX END IF * 100 CONTINUE * II = II + IB * END IF * ICURROW = MOD( ICURROW + 1, NPROW ) * 110 CONTINUE * ELSE * * sub( X ) is a column vector * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * J = 1 IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCX( IMB_ ) + ( MYROWDIST - 1 ) * MBX + 1 END IF IB = MIN( MAX( 0, DESCX( M_ ) - I + 1 ), IMBX ) JB = MIN( DESCX( N_ ), DESCX( INB_ ) ) * DO 130 KK = 0, JB-1 DO 120 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. I+LL.GT.IX+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 120 CONTINUE 130 CONTINUE IF( ROWREP ) THEN I = I + IMBX ELSE I = I + IMBX + ( NPROW - 1 ) * MBX END IF * DO 160 II = IMBX+1, MPALL, MBX IB = MIN( MPALL-II+1, MBX ) * DO 150 KK = 0, JB-1 DO 140 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 140 CONTINUE 150 CONTINUE * IF( ROWREP ) THEN I = I + MBX ELSE I = I + NPROW * MBX END IF * 160 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * DO 220 J = DESCX( INB_ ) + 1, DESCX( N_ ), NBX JB = MIN( DESCX( N_ ) - J + 1, NBX ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCX( IMB_ ) + ( MYROWDIST - 1 ) * MBX + 1 END IF * II = 1 IB = MIN( MAX( 0, DESCX( M_ ) - I + 1 ), IMBX ) DO 180 KK = 0, JB-1 DO 170 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 170 CONTINUE 180 CONTINUE IF( ROWREP ) THEN I = I + IMBX ELSE I = I + IMBX + ( NPROW - 1 ) * MBX END IF * DO 210 II = IMBX+1, MPALL, MBX IB = MIN( MPALL-II+1, MBX ) * DO 200 KK = 0, JB-1 DO 190 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 190 CONTINUE 200 CONTINUE * IF( ROWREP ) THEN I = I + MBX ELSE I = I + NPROW * MBX END IF * 210 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * 220 CONTINUE * END IF * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PSCHKVOUT * END SUBROUTINE PSCHKMIN( ERRMAX, M, N, A, PA, IA, JA, DESCA, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N REAL ERRMAX * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL PA( * ), A( * ) * .. * * Purpose * ======= * * PSCHKMIN checks that the submatrix sub( PA ) remained unchanged. The * local array entries are compared element by element, and their dif- * ference is tested against 0.0 as well as the epsilon machine. Notice * that this difference should be numerically exactly the zero machine, * but because of the possible fluctuation of some of the data we flag- * ged differently a difference less than twice the epsilon machine. The * largest error is also returned. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERRMAX (global output) REAL * On exit, ERRMAX specifies the largest absolute element-wise * difference between sub( A ) and sub( PA ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * operand sub( A ). N must be at least zero. * * A (local input) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) REAL array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ KK, LDA, LDPA, LL, MYCOL, MYROW, NPCOL, NPROW REAL ERR, EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_INFOG2L, PSERRSET, SGAMX2D * .. * .. External Functions .. REAL PSLAMCH EXTERNAL PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if posssible * IF( ( M.EQ.0 ).OR.( N.EQ.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * CALL PB_INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) * II = IIA JJ = JJA LDA = DESCA( M_ ) LDPA = DESCA( LLD_ ) ICURROW = IAROW ICURCOL = IACOL ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * * Handle the first block of column separately * JB = DESCA( INB_ ) - JA + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCA( NB_ ) + 1 ) * DESCA( NB_ ) + JB JB = MIN( JB, N ) JN = JA + JB - 1 * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * DO 40 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 10 K = 0, IB-1 CALL PSERRSET( ERR, ERRMAX, A( IA+K+(JA+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 10 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) * * Loop over remaining block of rows * DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 20 K = 0, IB-1 CALL PSERRSET( ERR, ERRMAX, A( I+K+(JA+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 20 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) 30 CONTINUE * II = IIA ICURROW = IAROW 40 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining column blocks * DO 90 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN DO 80 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 )*DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 50 K = 0, IB-1 CALL PSERRSET( ERR, ERRMAX, A( IA+K+(J+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 50 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) * * Loop over remaining block of rows * DO 70 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 60 K = 0, IB-1 CALL PSERRSET( ERR, ERRMAX, $ A( I+K+(J+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 60 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) 70 CONTINUE * II = IIA ICURROW = IAROW 80 CONTINUE * JJ = JJ + JB END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 90 CONTINUE * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PSCHKMIN * END SUBROUTINE PSCHKMOUT( M, N, A, PA, IA, JA, DESCA, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ), PA( * ) * .. * * Purpose * ======= * * PSCHKMOUT checks that the matrix PA \ sub( PA ) remained unchanged. * The local array entries are compared element by element, and their * difference is tested against 0.0 as well as the epsilon machine. No- * tice that this difference should be numerically exactly the zero ma- * chine, but because of the possible movement of some of the data we * flagged differently a difference less than twice the epsilon machine. * The largest error is reported. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( PA ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( PA ). N must be at least zero. * * A (local input) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) REAL array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, II, IMBA, J, JB, JJ, KK, $ LDA, LDPA, LL, MPALL, MYCOL, MYROW, MYROWDIST, $ NPCOL, NPROW REAL EPS, ERR, ERRMAX * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PSERRSET, SGAMX2D * .. * .. External Functions .. INTEGER PB_NUMROC REAL PSLAMCH EXTERNAL PSLAMCH, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( ( DESCA( M_ ).LE.0 ).OR.( DESCA( N_ ).LE.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * MPALL = PB_NUMROC( DESCA( M_ ), 1, DESCA( IMB_ ), DESCA( MB_ ), $ MYROW, DESCA( RSRC_ ), NPROW ) * LDA = DESCA( M_ ) LDPA = DESCA( LLD_ ) * II = 1 JJ = 1 ROWREP = ( DESCA( RSRC_ ).EQ.-1 ) COLREP = ( DESCA( CSRC_ ).EQ.-1 ) ICURCOL = DESCA( CSRC_ ) IF( MYROW.EQ.DESCA( RSRC_ ) .OR. ROWREP ) THEN IMBA = DESCA( IMB_ ) ELSE IMBA = DESCA( MB_ ) END IF IF( ROWREP ) THEN MYROWDIST = 0 ELSE MYROWDIST = MOD( MYROW - DESCA( RSRC_ ) + NPROW, NPROW ) END IF * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * J = 1 IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCA( IMB_ ) + ( MYROWDIST - 1 ) * DESCA( MB_ ) + 1 END IF IB = MIN( MAX( 0, DESCA( M_ ) - I + 1 ), IMBA ) JB = MIN( DESCA( N_ ), DESCA( INB_ ) ) * DO 20 KK = 0, JB-1 DO 10 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 10 CONTINUE 20 CONTINUE IF( ROWREP ) THEN I = I + IMBA ELSE I = I + IMBA + ( NPROW - 1 ) * DESCA( MB_ ) END IF * DO 50 II = IMBA + 1, MPALL, DESCA( MB_ ) IB = MIN( MPALL-II+1, DESCA( MB_ ) ) * DO 40 KK = 0, JB-1 DO 30 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 30 CONTINUE 40 CONTINUE * IF( ROWREP ) THEN I = I + DESCA( MB_ ) ELSE I = I + NPROW * DESCA( MB_ ) END IF * 50 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * DO 110 J = DESCA( INB_ ) + 1, DESCA( N_ ), DESCA( NB_ ) JB = MIN( DESCA( N_ ) - J + 1, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCA( IMB_ ) + ( MYROWDIST - 1 ) * DESCA( MB_ ) + 1 END IF * II = 1 IB = MIN( MAX( 0, DESCA( M_ ) - I + 1 ), IMBA ) DO 70 KK = 0, JB-1 DO 60 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 60 CONTINUE 70 CONTINUE IF( ROWREP ) THEN I = I + IMBA ELSE I = I + IMBA + ( NPROW - 1 ) * DESCA( MB_ ) END IF * DO 100 II = IMBA+1, MPALL, DESCA( MB_ ) IB = MIN( MPALL-II+1, DESCA( MB_ ) ) * DO 90 KK = 0, JB-1 DO 80 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PSERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 80 CONTINUE 90 CONTINUE * IF( ROWREP ) THEN I = I + DESCA( MB_ ) ELSE I = I + NPROW * DESCA( MB_ ) END IF * 100 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * INSERT MODE 110 CONTINUE * CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PSCHKMOUT * END SUBROUTINE PSMPRNT( ICTXT, NOUT, M, N, A, LDA, IRPRNT, ICPRNT, $ CMATNM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICPRNT, ICTXT, IRPRNT, LDA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM REAL A( LDA, * ) * .. * * Purpose * ======= * * PSMPRNT prints to the standard output an array A of size m by n. Only * the process of coordinates ( IRPRNT, ICPRNT ) is printing. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * M (global input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * A (local input) REAL array * On entry, A is an array of dimension (LDA,N). The leading m * by n part of this array is printed. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array A to be printed. LDA must be at least MAX( 1, M ). * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the process row coordinate of the * printing process. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the process column coordinate of * the printing process. * * CMATNM (global input) CHARACTER*(*) * On entry, CMATNM specifies the identifier of the matrix to be * printed. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN * WRITE( NOUT, FMT = * ) DO 20 J = 1, N * DO 10 I = 1, M * WRITE( NOUT, FMT = 9999 ) CMATNM, I, J, A( I, J ) * 10 CONTINUE * 20 CONTINUE * END IF * 9999 FORMAT( 1X, A, '(', I6, ',', I6, ')=', E16.8 ) * RETURN * * End of PSMPRNT * END SUBROUTINE PSVPRNT( ICTXT, NOUT, N, X, INCX, IRPRNT, ICPRNT, $ CVECNM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICPRNT, ICTXT, INCX, IRPRNT, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CVECNM REAL X( * ) * .. * * Purpose * ======= * * PSVPRNT prints to the standard output an vector x of length n. Only * the process of coordinates ( IRPRNT, ICPRNT ) is printing. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * N (global input) INTEGER * On entry, N specifies the length of the vector X. N must be * at least zero. * * X (global input) REAL array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the process row coordinate of the * printing process. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the process column coordinate of * the printing process. * * CVECNM (global input) CHARACTER*(*) * On entry, CVECNM specifies the identifier of the vector to be * printed. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN * WRITE( NOUT, FMT = * ) DO 10 I = 1, 1 + ( N-1 )*INCX, INCX * WRITE( NOUT, FMT = 9999 ) CVECNM, I, X( I ) * 10 CONTINUE * END IF * 9999 FORMAT( 1X, A, '(', I6, ')=', E16.8 ) * RETURN * * End of PSVPRNT * END SUBROUTINE PSMVCH( ICTXT, TRANS, M, N, ALPHA, A, IA, JA, DESCA, $ X, IX, JX, DESCX, INCX, BETA, Y, PY, IY, JY, $ DESCY, INCY, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N REAL ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL A( * ), G( * ), PY( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PSMVCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies which matrix-vector product is to * be computed as follows: * If TRANS = 'N', * sub( Y ) = BETA * sub( Y ) + sub( A ) * sub( X ), * otherwise * sub( Y ) = BETA * sub( Y ) + sub( A )' * sub( X ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) REAL array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) REAL array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP, TRAN INTEGER I, IB, ICURCOL, ICURROW, IIY, IN, IOFFA, IOFFX, $ IOFFY, IYCOL, IYROW, J, JB, JJY, JN, KK, LDA, $ LDPY, LDX, LDY, ML, MYCOL, MYROW, NL, NPCOL, $ NPROW REAL EPS, ERRI, GTMP, TBETA, YTMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * IF( M.EQ.0 .OR. N.EQ.0 ) THEN TBETA = ONE ELSE TBETA = BETA END IF * TRAN = LSAME( TRANS, 'T' ).OR.LSAME( TRANS, 'C' ) IF( TRAN ) THEN ML = N NL = M ELSE ML = M NL = N END IF * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in Y using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * IOFFY = IY + ( JY - 1 ) * LDY DO 30 I = 1, ML YTMP = ZERO GTMP = ZERO IOFFX = IX + ( JX - 1 ) * LDX IF( TRAN )THEN IOFFA = IA + ( JA + I - 2 ) * LDA DO 10 J = 1, NL YTMP = YTMP + A( IOFFA ) * X( IOFFX ) GTMP = GTMP + ABS( A( IOFFA ) * X( IOFFX ) ) IOFFA = IOFFA + 1 IOFFX = IOFFX + INCX 10 CONTINUE ELSE IOFFA = IA + I - 1 + ( JA - 1 ) * LDA DO 20 J = 1, NL YTMP = YTMP + A( IOFFA ) * X( IOFFX ) GTMP = GTMP + ABS( A( IOFFA ) * X( IOFFX ) ) IOFFA = IOFFA + LDA IOFFX = IOFFX + INCX 20 CONTINUE END IF G( I ) = ABS( ALPHA ) * GTMP + ABS( TBETA * Y( IOFFY ) ) Y( IOFFY ) = ALPHA * YTMP + TBETA * Y( IOFFY ) IOFFY = IOFFY + INCY 30 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO INFO = 0 LDPY = DESCY( LLD_ ) IOFFY = IY + ( JY - 1 ) * LDY CALL PB_INFOG2L( IY, JY, DESCY, NPROW, NPCOL, MYROW, MYCOL, IIY, $ JJY, IYROW, IYCOL ) ICURROW = IYROW ICURCOL = IYCOL ROWREP = ( IYROW.EQ.-1 ) COLREP = ( IYCOL.EQ.-1 ) * IF( INCY.EQ.DESCY( M_ ) ) THEN * * sub( Y ) is a row vector * JB = DESCY( INB_ ) - JY + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCY( NB_ ) + 1 ) * DESCY( NB_ ) + JB JB = MIN( JB, ML ) JN = JY + JB - 1 * DO 50 J = JY, JN * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) ) / EPS IF( G( J-JY+1 ).NE.ZERO ) $ ERRI = ERRI / G( J-JY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 JJY = JJY + 1 END IF * IOFFY = IOFFY + INCY * 50 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 70 J = JN+1, JY+ML-1, DESCY( NB_ ) JB = MIN( JY+ML-J, DESCY( NB_ ) ) * DO 60 KK = 0, JB-1 * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) )/EPS IF( G( J+KK-JY+1 ).NE.ZERO ) $ ERRI = ERRI / G( J+KK-JY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 JJY = JJY + 1 END IF * IOFFY = IOFFY + INCY * 60 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 70 CONTINUE * ELSE * * sub( Y ) is a column vector * IB = DESCY( IMB_ ) - IY + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCY( MB_ ) + 1 ) * DESCY( MB_ ) + IB IB = MIN( IB, ML ) IN = IY + IB - 1 * DO 80 I = IY, IN * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) ) / EPS IF( G( I-IY+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIY = IIY + 1 END IF * IOFFY = IOFFY + INCY * 80 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 100 I = IN+1, IY+ML-1, DESCY( MB_ ) IB = MIN( IY+ML-I, DESCY( MB_ ) ) * DO 90 KK = 0, IB-1 * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) )/EPS IF( G( I+KK-IY+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIY = IIY + 1 END IF * IOFFY = IOFFY + INCY * 90 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 100 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) * RETURN * * End of PSMVCH * END SUBROUTINE PSVMCH( ICTXT, UPLO, M, N, ALPHA, X, IX, JX, DESCX, $ INCX, Y, IY, JY, DESCY, INCY, A, PA, IA, JA, $ DESCA, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N REAL ALPHA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL A( * ), G( * ), PA( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PSVMCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of the submatrix sub( A ) * is to be referenced as follows: * If UPLO = 'L', only the lower triangular part, * If UPLO = 'U', only the upper triangular part, * else the entire matrix is to be referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * X (local input) REAL array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) REAL array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) REAL array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, LOWER, ROWREP, UPPER INTEGER I, IACOL, IAROW, IB, IBEG, ICURROW, IEND, IIA, $ IN, IOFFA, IOFFX, IOFFY, J, JJA, KK, LDA, LDPA, $ LDX, LDY, MYCOL, MYROW, NPCOL, NPROW REAL ATMP, EPS, ERRI, GTMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in A using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 70 J = 1, N * IOFFY = IY + ( JY - 1 ) * LDY + ( J - 1 ) * INCY * IF( LOWER ) THEN IBEG = J IEND = M DO 10 I = 1, J-1 G( I ) = ZERO 10 CONTINUE ELSE IF( UPPER ) THEN IBEG = 1 IEND = J DO 20 I = J+1, M G( I ) = ZERO 20 CONTINUE ELSE IBEG = 1 IEND = M END IF * DO 30 I = IBEG, IEND * IOFFX = IX + ( JX - 1 ) * LDX + ( I - 1 ) * INCX IOFFA = IA + I - 1 + ( JA + J - 2 ) * LDA ATMP = X( IOFFX ) * Y( IOFFY ) GTMP = ABS( X( IOFFX ) * Y( IOFFY ) ) G( I ) = ABS( ALPHA ) * GTMP + ABS( A( IOFFA ) ) A( IOFFA ) = ALPHA * ATMP + A( IOFFA ) * 30 CONTINUE * * Compute the error ratio for this result. * INFO = 0 ERR = ZERO LDPA = DESCA( LLD_ ) IOFFA = IA + ( JA + J - 2 ) * LDA CALL PB_INFOG2L( IA, JA+J-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * IF( MYCOL.EQ.IACOL .OR. COLREP ) THEN * ICURROW = IAROW IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 * DO 40 I = IA, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA ) - A( IOFFA ) )/EPS IF( G( I-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 40 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * DO 50 KK = 0, IB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA )-A( IOFFA ) )/EPS IF( G( I+KK-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 80 * 70 CONTINUE * 80 CONTINUE * RETURN * * End of PSVMCH * END SUBROUTINE PSVMCH2( ICTXT, UPLO, M, N, ALPHA, X, IX, JX, DESCX, $ INCX, Y, IY, JY, DESCY, INCY, A, PA, IA, $ JA, DESCA, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N REAL ALPHA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) REAL A( * ), G( * ), PA( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PSVMCH2 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of the submatrix sub( A ) * is to be referenced as follows: * If UPLO = 'L', only the lower triangular part, * If UPLO = 'U', only the upper triangular part, * else the entire matrix is to be referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * X (local input) REAL array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) REAL array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) REAL array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, LOWER, ROWREP, UPPER INTEGER I, IACOL, IAROW, IB, IBEG, ICURROW, IEND, IIA, $ IN, IOFFA, IOFFXI, IOFFXJ, IOFFYI, IOFFYJ, J, $ JJA, KK, LDA, LDPA, LDX, LDY, MYCOL, MYROW, $ NPCOL, NPROW REAL EPS, ERRI, GTMP, ATMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in A using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 70 J = 1, N * IOFFXJ = IX + ( JX - 1 ) * LDX + ( J - 1 ) * INCX IOFFYJ = IY + ( JY - 1 ) * LDY + ( J - 1 ) * INCY * IF( LOWER ) THEN IBEG = J IEND = M DO 10 I = 1, J-1 G( I ) = ZERO 10 CONTINUE ELSE IF( UPPER ) THEN IBEG = 1 IEND = J DO 20 I = J+1, M G( I ) = ZERO 20 CONTINUE ELSE IBEG = 1 IEND = M END IF * DO 30 I = IBEG, IEND IOFFA = IA + I - 1 + ( JA + J - 2 ) * LDA IOFFXI = IX + ( JX - 1 ) * LDX + ( I - 1 ) * INCX IOFFYI = IY + ( JY - 1 ) * LDY + ( I - 1 ) * INCY ATMP = X( IOFFXI ) * Y( IOFFYJ ) ATMP = ATMP + Y( IOFFYI ) * X( IOFFXJ ) GTMP = ABS( X( IOFFXI ) * Y( IOFFYJ ) ) GTMP = GTMP + ABS( Y( IOFFYI ) * X( IOFFXJ ) ) G( I ) = ABS( ALPHA ) * GTMP + ABS( A( IOFFA ) ) A( IOFFA ) = ALPHA*ATMP + A( IOFFA ) * 30 CONTINUE * * Compute the error ratio for this result. * INFO = 0 ERR = ZERO LDPA = DESCA( LLD_ ) IOFFA = IA + ( JA + J - 2 ) * LDA CALL PB_INFOG2L( IA, JA+J-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * IF( MYCOL.EQ.IACOL .OR. COLREP ) THEN * ICURROW = IAROW IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 * DO 40 I = IA, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA ) - A( IOFFA ) )/EPS IF( G( I-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 40 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * DO 50 KK = 0, IB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA )-A( IOFFA ) )/EPS IF( G( I+KK-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 80 * 70 CONTINUE * 80 CONTINUE * RETURN * * End of PSVMCH2 * END SUBROUTINE PSMMCH( ICTXT, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, $ JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, $ JC, DESCC, CT, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANSA, TRANSB INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N REAL ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) REAL A( * ), B( * ), C( * ), CT( * ), G( * ), $ PC( * ) * .. * * Purpose * ======= * * PSMMCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies if the matrix operand A is to be * transposed. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies if the matrix operand B is to be * transposed. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANSA = 'N' (resp. TRANSA <> 'N') in PxGEMM, PxSYRK, * PxSYR2K, PxHERK and PxHER2K. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) REAL array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * C (local input/local output) REAL array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) REAL array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) REAL array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP, TRANA, TRANB INTEGER I, IBB, ICCOL, ICROW, ICURROW, IIC, IN, IOFFA, $ IOFFB, IOFFC, J, JJC, KK, LDA, LDB, LDC, LDPC, $ MYCOL, MYROW, NPCOL, NPROW REAL EPS, ERRI * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * TRANA = LSAME( TRANSA, 'T' ).OR.LSAME( TRANSA, 'C' ) TRANB = LSAME( TRANSB, 'T' ).OR.LSAME( TRANSB, 'C' ) * LDA = MAX( 1, DESCA( M_ ) ) LDB = MAX( 1, DESCB( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 240 J = 1, N * IOFFC = IC + ( JC + J - 2 ) * LDC DO 10 I = 1, M CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE * IF( .NOT.TRANA .AND. .NOT.TRANB ) THEN DO 30 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 20 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA .AND. .NOT.TRANB ) THEN DO 50 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 40 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 40 CONTINUE 50 CONTINUE ELSE IF( .NOT.TRANA .AND. TRANB ) THEN DO 70 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 60 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 60 CONTINUE 70 CONTINUE ELSE IF( TRANA .AND. TRANB ) THEN DO 90 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 80 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 80 CONTINUE 90 CONTINUE END IF * DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA * C( IOFFC ) G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, M ) IN = IC + IBB - 1 * DO 210 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 210 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 230 I = IN+1, IC+M-1, DESCC( MB_ ) IBB = MIN( IC+M-I, DESCC( MB_ ) ) * DO 220 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 220 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 230 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 250 * 240 CONTINUE * 250 CONTINUE * RETURN * * End of PSMMCH * END SUBROUTINE PSMMCH1( ICTXT, UPLO, TRANS, N, K, ALPHA, A, IA, JA, $ DESCA, BETA, C, PC, IC, JC, DESCC, CT, G, $ ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IC, ICTXT, INFO, JA, JC, K, N REAL ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), CT( * ), G( * ), PC( * ) * .. * * Purpose * ======= * * PSMMCH1 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrix A has to be * transposed or not before computing the matrix-matrix product. * * N (global input) INTEGER * On entry, N specifies the order the submatrix operand C. N * must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANS = 'N' (resp. TRANS <> 'N'). K must be at least * zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * C (local input/local output) REAL array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) REAL array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) REAL array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, NOTRAN, ROWREP, TRAN, UPPER INTEGER I, IBB, IBEG, ICCOL, ICROW, ICURROW, IEND, IIC, $ IN, IOFFAK, IOFFAN, IOFFC, J, JJC, KK, LDA, $ LDC, LDPC, MYCOL, MYROW, NPCOL, NPROW REAL EPS, ERRI * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) * LDA = MAX( 1, DESCA( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 140 J = 1, N * IF( UPPER ) THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF * DO 10 I = 1, N CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE * IF( NOTRAN ) THEN DO 30 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA DO 20 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAK ) * A( IOFFAN ) G( I ) = G( I ) + ABS( A( IOFFAK ) ) * $ ABS( A( IOFFAN ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRAN ) THEN DO 50 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA DO 40 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAK ) * A( IOFFAN ) G( I ) = G( I ) + ABS( A( IOFFAK ) ) * $ ABS( A( IOFFAN ) ) 40 CONTINUE 50 CONTINUE END IF * IOFFC = IC + IBEG - 1 + ( JC + J - 2 ) * LDC * DO 100 I = IBEG, IEND CT( I ) = ALPHA*CT( I ) + BETA * C( IOFFC ) G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 100 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, N ) IN = IC + IBB - 1 * DO 110 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IC+N-1, DESCC( MB_ ) IBB = MIN( IC+N-I, DESCC( MB_ ) ) * DO 120 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 150 * 140 CONTINUE * 150 CONTINUE * RETURN * * End of PSMMCH1 * END SUBROUTINE PSMMCH2( ICTXT, UPLO, TRANS, N, K, ALPHA, A, IA, JA, $ DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, $ JC, DESCC, CT, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, N REAL ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) REAL A( * ), B( * ), C( * ), CT( * ), G( * ), $ PC( * ) * .. * * Purpose * ======= * * PSMMCH2 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrices A and B have * to be transposed or not before computing the matrix-matrix * product. * * N (global input) INTEGER * On entry, N specifies the order the submatrix operand C. N * must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * and B when TRANS = 'N' (resp. TRANS <> 'N'). K must be at * least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) REAL array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * C (local input/local output) REAL array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) REAL array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) REAL array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) REAL array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, NOTRAN, ROWREP, TRAN, UPPER INTEGER I, IBB, IBEG, ICCOL, ICROW, ICURROW, IEND, IIC, $ IN, IOFFAK, IOFFAN, IOFFBK, IOFFBN, IOFFC, J, $ JJC, KK, LDA, LDB, LDC, LDPC, MYCOL, MYROW, $ NPCOL, NPROW REAL EPS, ERRI * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, MOD, SQRT * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PSLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) * LDA = MAX( 1, DESCA( M_ ) ) LDB = MAX( 1, DESCB( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 140 J = 1, N * IF( UPPER ) THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF * DO 10 I = 1, N CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE * IF( NOTRAN ) THEN DO 30 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA IOFFBK = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 20 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA IOFFBN = IB + I - 1 + ( JB + KK - 2 ) * LDB CT( I ) = CT( I ) + ALPHA * ( $ A( IOFFAN ) * B( IOFFBK ) + $ B( IOFFBN ) * A( IOFFAK ) ) G( I ) = G( I ) + ABS( ALPHA ) * ( $ ABS( A( IOFFAN ) ) * ABS( B( IOFFBK ) ) + $ ABS( B( IOFFBN ) ) * ABS( A( IOFFAK ) ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRAN ) THEN DO 50 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA IOFFBK = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 40 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA IOFFBN = IB + KK - 1 + ( JB + I - 2 ) * LDB CT( I ) = CT( I ) + ALPHA * ( $ A( IOFFAN ) * B( IOFFBK ) + $ B( IOFFBN ) * A( IOFFAK ) ) G( I ) = G( I ) + ABS( ALPHA ) * ( $ ABS( A( IOFFAN ) ) * ABS( B( IOFFBK ) ) + $ ABS( B( IOFFBN ) ) * ABS( A( IOFFAK ) ) ) 40 CONTINUE 50 CONTINUE END IF * IOFFC = IC + IBEG - 1 + ( JC + J - 2 ) * LDC * DO 100 I = IBEG, IEND CT( I ) = CT( I ) + BETA * C( IOFFC ) G( I ) = G( I ) + ABS( BETA )*ABS( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 100 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, N ) IN = IC + IBB - 1 * DO 110 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IC+N-1, DESCC( MB_ ) IBB = MIN( IC+N-I, DESCC( MB_ ) ) * DO 120 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 150 * 140 CONTINUE * 150 CONTINUE * RETURN * * End of PSMMCH2 * END SUBROUTINE PSMMCH3( UPLO, TRANS, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IC, INFO, JA, JC, M, N REAL ALPHA, BETA, ERR * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) REAL A( * ), C( * ), PC( * ) * .. * * Purpose * ======= * * PSMMCH3 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrix A has to be * transposed or not before computing the matrix-matrix addi- * tion. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input) REAL array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * C (local input/local output) REAL array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) REAL array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * ERR (global output) REAL * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, LOWER, NOTRAN, ROWREP, UPPER INTEGER I, ICCOL, ICROW, ICTXT, IIC, IOFFA, IOFFC, J, $ JJC, LDA, LDC, LDPC, MYCOL, MYROW, NPCOL, $ NPROW REAL ERR0, ERRI, PREC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D, PB_INFOG2L, $ PSERRAXPBY, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL PSLAMCH EXTERNAL LSAME, PSLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * PREC = PSLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) * * Compute expected result in C using data in A and C. This part of * the computation is performed by every process in the grid. * INFO = 0 ERR = ZERO * LDA = MAX( 1, DESCA( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) LDPC = MAX( 1, DESCC( LLD_ ) ) ROWREP = ( DESCC( RSRC_ ).EQ.-1 ) COLREP = ( DESCC( CSRC_ ).EQ.-1 ) * IF( NOTRAN ) THEN * DO 20 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( JA - 1 + J - JC ) * LDA * DO 10 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PSERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PSERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PSERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFA = IOFFA + 1 IOFFC = IOFFC + 1 * 10 CONTINUE * 20 CONTINUE * ELSE * DO 40 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( J - JC ) + ( JA - 1 ) * LDA * DO 30 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PSERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PSERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PSERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFC = IOFFC + 1 IOFFA = IOFFA + LDA * 30 CONTINUE * 40 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL SGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) * RETURN * * End of PSMMCH3 * END SUBROUTINE PSERRAXPBY( ERRBND, ALPHA, X, BETA, Y, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL ALPHA, BETA, ERRBND, PREC, X, Y * .. * * Purpose * ======= * * PSERRAXPBY serially computes y := beta*y + alpha * x and returns a * scaled relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) REAL * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * X (global input) REAL * On entry, X specifies the scalar x to be scaled. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. * * Y (global input/global output) REAL * On entry, Y specifies the scalar y to be added. On exit, Y * contains the resulting scalar y. * * PREC (global input) REAL * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO, ZERO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0, $ ZERO = 0.0E+0 ) * .. * .. Local Scalars .. REAL ADDBND, FACT, SUMPOS, SUMNEG, TMP * .. * .. Intrinsic Functions .. * .. * .. Executable Statements .. * SUMPOS = ZERO SUMNEG = ZERO FACT = ONE + TWO * PREC ADDBND = TWO * TWO * TWO * PREC * TMP = ALPHA * X IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP * FACT ELSE SUMNEG = SUMNEG - TMP * FACT END IF * TMP = BETA * Y IF( TMP.GE.ZERO ) THEN SUMPOS = SUMPOS + TMP * FACT ELSE SUMNEG = SUMNEG - TMP * FACT END IF * Y = ( BETA * Y ) + ( ALPHA * X ) * ERRBND = ADDBND * MAX( SUMPOS, SUMNEG ) * RETURN * * End of PSERRAXPBY * END REAL FUNCTION PSLAMCH( ICTXT, CMACH ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 CMACH INTEGER ICTXT * .. * * Purpose * ======= * * PSLAMCH determines single precision machine parameters. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * CMACH (global input) CHARACTER*1 * On entry, CMACH specifies the value to be returned by PSLAMCH * as follows: * = 'E' or 'e', PSLAMCH := eps, * = 'S' or 's , PSLAMCH := sfmin, * = 'B' or 'b', PSLAMCH := base, * = 'P' or 'p', PSLAMCH := eps*base, * = 'N' or 'n', PSLAMCH := t, * = 'R' or 'r', PSLAMCH := rnd, * = 'M' or 'm', PSLAMCH := emin, * = 'U' or 'u', PSLAMCH := rmin, * = 'L' or 'l', PSLAMCH := emax, * = 'O' or 'o', PSLAMCH := rmax, * * where * * eps = relative machine precision, * sfmin = safe minimum, such that 1/sfmin does not overflow, * base = base of the machine, * prec = eps*base, * t = number of (base) digits in the mantissa, * rnd = 1.0 when rounding occurs in addition, 0.0 otherwise, * emin = minimum exponent before (gradual) underflow, * rmin = underflow threshold - base**(emin-1), * emax = largest exponent before overflow, * rmax = overflow threshold - (base**emax)*(1-eps). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. CHARACTER*1 TOP INTEGER IDUMM REAL TEMP * .. * .. External Subroutines .. EXTERNAL PB_TOPGET, SGAMN2D, SGAMX2D * .. * .. External Functions .. LOGICAL LSAME REAL SLAMCH EXTERNAL LSAME, SLAMCH * .. * .. Executable Statements .. * TEMP = SLAMCH( CMACH ) * IF( LSAME( CMACH, 'E' ).OR.LSAME( CMACH, 'S' ).OR. $ LSAME( CMACH, 'M' ).OR.LSAME( CMACH, 'U' ) ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) IDUMM = 0 CALL SGAMX2D( ICTXT, 'All', TOP, 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) ELSE IF( LSAME( CMACH, 'L' ).OR.LSAME( CMACH, 'O' ) ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) IDUMM = 0 CALL SGAMN2D( ICTXT, 'All', TOP, 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) END IF * PSLAMCH = TEMP * RETURN * * End of PSLAMCH * END SUBROUTINE PSLASET( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, JA, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLASET initializes an m by n submatrix A(IA:IA+M-1,JA:JA+N-1) deno- * ted by sub( A ) to beta on the diagonal and alpha on the offdiago- * nals. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the part of the submatrix sub( A ) * to be set: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of sub( A ) is not changed; * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha, i.e., the cons- * tant to which the offdiagonal elements are to be set. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta, i.e., the constant * to which the diagonal elements are to be set. * * A (local input/local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A to be set. On exit, the * leading m by n submatrix sub( A ) is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * and IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT, ISCOLREP, ISROWREP, LOWER, $ UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, JJA, $ JJMAX, JOFFA, JOFFD, LCMT, LCMT00, LDA, LMBLOC, $ LNBLOC, LOW, M1, MB, MBLKD, MBLKS, MBLOC, MP, $ MRCOL, MRROW, MYCOL, MYROW, N1, NB, NBLKD, $ NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, QNB, TMP1, $ UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS, PB_SLASET * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * ISROWREP = ( DESCA2( RSRC_ ).LT.0 ) ISCOLREP = ( DESCA2( CSRC_ ).LT.0 ) LDA = DESCA2( LLD_ ) * UPPER = .NOT.( LSAME( UPLO, 'L' ) ) LOWER = .NOT.( LSAME( UPLO, 'U' ) ) * IF( ( ( LOWER.AND.UPPER ).AND.( ALPHA.EQ.BETA ) ).OR. $ ( ISROWREP .AND. ISCOLREP ) ) THEN IF( ( MP.GT.0 ).AND.( NQ.GT.0 ) ) $ CALL PB_SLASET( UPLO, MP, NQ, 0, ALPHA, BETA, $ A( IIA + ( JJA - 1 ) * LDA ), LDA ) RETURN END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) CALL PB_BINFO( 0, MP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( ISROWREP ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( ISCOLREP ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * M1 = MP N1 = NQ * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_SLASET( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, BETA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_SLASET( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ ALPHA, A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_SLASET( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ ALPHA, A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_SLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_SLASET( UPLO, MBLOC, INBLOC, LCMT, ALPHA, BETA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_SLASET( 'ALL', TMP1, INBLOC, 0, ALPHA, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_SLASET( 'ALL', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_SLASET( 'All', M1, TMP1, 0, ALPHA, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_SLASET( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, BETA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_SLASET( 'All', IMBLOC, TMP1, 0, ALPHA, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_SLASET( 'All', M1, TMP1, 0, ALPHA, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_SLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_SLASET( UPLO, MBLOC, NBLOC, LCMT, ALPHA, BETA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_SLASET( 'All', TMP1, NBLOC, 0, ALPHA, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_SLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * RETURN * * End of PSLASET * END SUBROUTINE PSLASCAL( TYPE, M, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TYPE INTEGER IA, JA, M, N REAL ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLASCAL scales the m by n submatrix A(IA:IA+M-1,JA:JA+N-1) denoted * by sub( A ) by the scalar alpha. TYPE specifies if sub( A ) is full, * upper triangular, lower triangular or upper Hessenberg. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * On entry, TYPE specifies the type of the input submatrix as * follows: * = 'L' or 'l': sub( A ) is a lower triangular matrix, * = 'U' or 'u': sub( A ) is an upper triangular matrix, * = 'H' or 'h': sub( A ) is an upper Hessenberg matrix, * otherwise sub( A ) is a full matrix. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( A ) are overwritten by * the local entries of the m by n scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 UPLO LOGICAL GODOWN, GOLEFT, LOWER, UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, ITYPE, $ IUPP, JJA, JJMAX, JOFFA, JOFFD, LCMT, LCMT00, $ LDA, LMBLOC, LNBLOC, LOW, M1, MB, MBLKD, MBLKS, $ MBLOC, MP, MRCOL, MRROW, MYCOL, MYROW, N1, NB, $ NBLKD, NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, $ QNB, TMP1, UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS, PB_INFOG2L, PB_SLASCAL * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_NUMROC EXTERNAL LSAME, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 UPLO = TYPE UPPER = .FALSE. LOWER = .TRUE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 UPLO = TYPE UPPER = .TRUE. LOWER = .FALSE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 UPLO = 'U' UPPER = .TRUE. LOWER = .FALSE. IOFFD = 1 ELSE ITYPE = 0 UPLO = 'A' UPPER = .TRUE. LOWER = .TRUE. IOFFD = 0 END IF * * Compute local indexes * IF( ITYPE.EQ.0 ) THEN * * Full matrix * CALL PB_INFOG2L( IA, JA, DESCA2, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = PB_NUMROC( M, IA, DESCA2( IMB_ ), DESCA2( MB_ ), MYROW, $ DESCA2( RSRC_ ), NPROW ) NQ = PB_NUMROC( N, JA, DESCA2( INB_ ), DESCA2( NB_ ), MYCOL, $ DESCA2( CSRC_ ), NPCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * LDA = DESCA2( LLD_ ) IOFFA = IIA + ( JJA - 1 ) * LDA * CALL PB_SLASCAL( 'All', MP, NQ, 0, ALPHA, A( IOFFA ), LDA ) * ELSE * * Trapezoidal matrix * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, * LNBLOC, ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) LDA = DESCA2( LLD_ ) * CALL PB_BINFO( IOFFD, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * M1 = MP N1 = NQ IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and * update LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_SLASCAL( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_SLASCAL( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_SLASCAL( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_SLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_SLASCAL( UPLO, MBLOC, INBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_SLASCAL( 'All', TMP1, INBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_SLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_SLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_SLASCAL( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_SLASCAL( 'All', IMBLOC, TMP1, 0, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_SLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_SLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_SLASCAL( UPLO, MBLOC, NBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_SLASCAL( 'All', TMP1, NBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_SLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * END IF * RETURN * * End of PSLASCAL * END SUBROUTINE PSLAGEN( INPLACE, AFORM, DIAG, OFFA, M, N, IA, JA, $ DESCA, IASEED, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE CHARACTER*1 AFORM, DIAG INTEGER IA, IASEED, JA, LDA, M, N, OFFA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( LDA, * ) * .. * * Purpose * ======= * * PSLAGEN generates (or regenerates) a submatrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the generated submatrix is diago- * nally dominant or not as follows: * DIAG = 'D' : sub( A ) is diagonally dominant, * DIAG = 'N' : sub( A ) is not diagonally dominant. * * OFFA (global input) INTEGER * On entry, OFFA specifies the offdiagonal of the underlying * matrix A(1:DESCA(M_),1:DESCA(N_)) of interest when the subma- * trix is symmetric, Hermitian or diagonally dominant. OFFA = 0 * specifies the main diagonal, OFFA > 0 specifies a subdiago- * nal, and OFFA < 0 specifies a superdiagonal (see further de- * tails). * * M (global input) INTEGER * On entry, M specifies the global number of matrix rows of the * submatrix sub( A ) to be generated. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of matrix columns of * the submatrix sub( A ) to be generated. N must be at least * zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IASEED (global input) INTEGER * On entry, IASEED specifies the seed number to generate the * matrix A. IASEED must be at least zero. * * A (local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). On exit, this array contains the * local entries of the randomly generated submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. When INPLACE is .FALSE., LDA is usually DESCA(LLD_). * This restriction is however not enforced, and this subroutine * requires only that LDA >= MAX( 1, Mp ) where * * Mp = PB_NUMROC( M, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ). * * PB_NUMROC is a ScaLAPACK tool function; MYROW, MYCOL, NPROW * and NPCOL can be determined by calling the BLACS subroutine * BLACS_GRIDINFO. * * Further Details * =============== * * OFFD is tied to the matrix described by DESCA, as opposed to the * piece that is currently (re)generated. This is a global information * independent from the distribution parameters. Below are examples of * the meaning of OFFD for a global 7 by 5 matrix: * * --------------------------------------------------------------------- * OFFD | 0 -1 -2 -3 -4 0 -1 -2 -3 -4 0 -1 -2 -3 -4 * -------|------------------------------------------------------------- * | | OFFD=-1 | OFFD=0 OFFD=2 * | V V * 0 | . d . . . -> d . . . . . . . . . * 1 | . . d . . . d . . . . . . . . * 2 | . . . d . . . d . . -> d . . . . * 3 | . . . . d . . . d . . d . . . * 4 | . . . . . . . . . d . . d . . * 5 | . . . . . . . . . . . . . d . * 6 | . . . . . . . . . . . . . . d * --------------------------------------------------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. LOGICAL DIAGDO, SYMM, HERM, NOTRAN INTEGER CSRC, I, IACOL, IAROW, ICTXT, IIA, ILOCBLK, $ ILOCOFF, ILOW, IMB, IMB1, IMBLOC, IMBVIR, INB, $ INB1, INBLOC, INBVIR, INFO, IOFFDA, ITMP, IUPP, $ IVIR, JJA, JLOCBLK, JLOCOFF, JVIR, LCMT00, $ LMBLOC, LNBLOC, LOW, MAXMN, MB, MBLKS, MP, $ MRCOL, MRROW, MYCDIST, MYCOL, MYRDIST, MYROW, $ NB, NBLKS, NPCOL, NPROW, NQ, NVIR, RSRC, UPP REAL ALPHA * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ), IMULADD( 4, JMP_LEN ), $ IRAN( 2 ), JMP( JMP_LEN ), MULADD0( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CHKMAT, PB_DESCTRANS, PB_INITJMP, $ PB_INITMULADD, PB_JUMP, PB_JUMPIT, PB_LOCINFO, $ PB_SETLOCRAN, PB_SETRAN, PB_SLAGEN, PSLADOM, $ PXERBLA * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, REAL * .. * .. Data Statements .. DATA ( MULADD0( I ), I = 1, 4 ) / 20077, 16838, $ 12345, 0 / * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Test the input arguments * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1000 + CTXT_ ) ELSE SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) NOTRAN = LSAME( AFORM, 'N' ) DIAGDO = LSAME( DIAG, 'D' ) IF( .NOT.( SYMM.OR.HERM.OR.NOTRAN ) .AND. $ .NOT.( LSAME( AFORM, 'T' ) ) .AND. $ .NOT.( LSAME( AFORM, 'C' ) ) ) THEN INFO = -2 ELSE IF( ( .NOT.DIAGDO ) .AND. $ ( .NOT.LSAME( DIAG, 'N' ) ) ) THEN INFO = -3 END IF CALL PB_CHKMAT( ICTXT, M, 5, N, 6, IA, JA, DESCA2, 10, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PSLAGEN', -INFO ) RETURN END IF * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) IMB = DESCA2( IMB_ ) INB = DESCA2( INB_ ) RSRC = DESCA2( RSRC_ ) CSRC = DESCA2( CSRC_ ) * * Figure out local information about the distributed matrix operand * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * IOFFDA = JA + OFFA - IA CALL PB_BINFO( IOFFDA, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * * Initialize ILOCBLK, ILOCOFF, MYRDIST, JLOCBLK, JLOCOFF, MYCDIST * This values correspond to the square virtual underlying matrix * of size MAX( M_ + MAX( 0, -OFFA ), N_ + MAX( 0, OFFA ) ) used * to set up the random sequence. For practical purposes, the size * of this virtual matrix is upper bounded by M_ + N_ - 1. * ITMP = MAX( 0, -OFFA ) IVIR = IA + ITMP IMBVIR = IMB + ITMP NVIR = DESCA2( M_ ) + ITMP * CALL PB_LOCINFO( IVIR, IMBVIR, MB, MYROW, RSRC, NPROW, ILOCBLK, $ ILOCOFF, MYRDIST ) * ITMP = MAX( 0, OFFA ) JVIR = JA + ITMP INBVIR = INB + ITMP NVIR = MAX( MAX( NVIR, DESCA2( N_ ) + ITMP ), $ DESCA2( M_ ) + DESCA2( N_ ) - 1 ) * CALL PB_LOCINFO( JVIR, INBVIR, NB, MYCOL, CSRC, NPCOL, JLOCBLK, $ JLOCOFF, MYCDIST ) * IF( SYMM .OR. HERM .OR. NOTRAN ) THEN * CALL PB_INITJMP( .TRUE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 1, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_SLAGEN( 'Lower', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( SYMM .OR. HERM .OR. ( .NOT. NOTRAN ) ) THEN * CALL PB_INITJMP( .FALSE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 1, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_SLAGEN( 'Upper', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( DIAGDO ) THEN * MAXMN = MAX( DESCA2( M_ ), DESCA2( N_ ) ) ALPHA = REAL( MAXMN ) * IF( IOFFDA.GE.0 ) THEN CALL PSLADOM( INPLACE, MIN( MAX( 0, M-IOFFDA ), N ), ALPHA, $ A, MIN( IA+IOFFDA, IA+M-1 ), JA, DESCA ) ELSE CALL PSLADOM( INPLACE, MIN( M, MAX( 0, N+IOFFDA ) ), ALPHA, $ A, IA, MIN( JA-IOFFDA, JA+N-1 ), DESCA ) END IF * END IF * RETURN * * End of PSLAGEN * END SUBROUTINE PSLADOM( INPLACE, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE INTEGER IA, JA, N REAL ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) REAL A( * ) * .. * * Purpose * ======= * * PSLADOM adds alpha to the diagonal entries of an n by n submatrix * sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * N (global input) INTEGER * On entry, N specifies the global order of the submatrix * sub( A ) to be modified. N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the local entries * of this array corresponding to the main diagonal of sub( A ) * have been updated. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP REAL ATMP * .. * .. Local Scalars .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.EQ.0 ) $ RETURN * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) * CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA+LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ABS( ATMP ) + ALPHA 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PSLADOM * END SUBROUTINE PB_PSLAPRNT( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * Purpose * ======= * * PB_PSLAPRNT prints to the standard output a submatrix sub( A ) deno- * ting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and printed by * the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the row index of the printing pro- * cess. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the column index of the printing * process. * * CMATNM (global input) CHARACTER*(*) * On entry, CMATNM is the name of the matrix to be printed. * * NOUT (global input) INTEGER * On entry, NOUT specifies the output unit number. When NOUT is * equal to 6, the submatrix is printed on the screen. * * WORK (local workspace) REAL array * On entry, WORK is a work array of dimension at least equal to * MAX( IMB_A, MB_A ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW, PCOL, PROW * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_DESCTRANS, PB_PSLAPRN2 * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * CALL BLACS_GRIDINFO( DESCA2( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( DESCA2( RSRC_ ).GE.0 ) THEN IF( DESCA2( CSRC_ ).GE.0 ) THEN CALL PB_PSLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, ICPRNT, $ CMATNM, NOUT, DESCA2( RSRC_ ), $ DESCA2( CSRC_ ), WORK ) ELSE DO 10 PCOL = 0, NPCOL - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Colum-replicated array -- ' , $ 'copy in process column: ', PCOL CALL PB_PSLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, DESCA2( RSRC_ ), $ PCOL, WORK ) 10 CONTINUE END IF ELSE IF( DESCA2( CSRC_ ).GE.0 ) THEN DO 20 PROW = 0, NPROW - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Row-replicated array -- ' , $ 'copy in process row: ', PROW CALL PB_PSLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, PROW, $ DESCA2( CSRC_ ), WORK ) 20 CONTINUE ELSE DO 40 PROW = 0, NPROW - 1 DO 30 PCOL = 0, NPCOL - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Replicated array -- ' , $ 'copy in process (', PROW, ',', PCOL, ')' CALL PB_PSLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, PROW, PCOL, $ WORK ) 30 CONTINUE 40 CONTINUE END IF END IF * RETURN * * End of PB_PSLAPRNT * END SUBROUTINE PB_PSLAPRN2( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, PROW, PCOL, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT, PCOL, PROW * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) REAL A( * ), WORK( * ) * .. * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL AISCOLREP, AISROWREP INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, LDW, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, PB_INFOG2L, $ SGERV2D, SGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL PB_INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) II = IIA JJ = JJA IF( DESCA( RSRC_ ).LT.0 ) THEN AISROWREP = .TRUE. IAROW = PROW ICURROW = PROW ELSE AISROWREP = .FALSE. ICURROW = IAROW END IF IF( DESCA( CSRC_ ).LT.0 ) THEN AISCOLREP = .TRUE. IACOL = PCOL ICURCOL = PCOL ELSE AISCOLREP = .FALSE. ICURCOL = IACOL END IF LDA = DESCA( LLD_ ) LDW = MAX( DESCA( IMB_ ), DESCA( MB_ ) ) * * Handle the first block of column separately * JB = DESCA( INB_ ) - JA + 1 IF( JB.LE.0 ) $ JB = ( (-JB) / DESCA( NB_ ) + 1 ) * DESCA( NB_ ) + JB JB = MIN( JB, N ) JN = JA+JB-1 DO 60 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( (-IB) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA+IB-1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, JA+H, WORK( K ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, JA+H, A( II+K+(JJ+H-1)*LDA ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, JA+H, WORK( K ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB IF( .NOT.AISCOLREP ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IB = DESCA( IMB_ )-IA+1 IF( IB.LE.0 ) $ IB = ( (-IB) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA+IB-1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, J+H, A( II+K+(JJ+H-1)*LDA ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, J+H, WORK( K ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, J+H, A( II+K+(JJ+H-1)*LDA ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL SGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL SGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, J+H, WORK( K ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB IF( .NOT.AISCOLREP ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * 9999 FORMAT( 1X, A, '(', I6, ',', I6, ')=', E16.8 ) * RETURN * * End of PB_PSLAPRN2 * END SUBROUTINE PB_SFILLPAD( ICTXT, M, N, A, LDA, IPRE, IPOST, CHKVAL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N REAL CHKVAL * .. * .. Array Arguments .. REAL A( * ) * .. * * Purpose * ======= * * PB_SFILLPAD surrounds a two dimensional local array with a guard-zone * initialized to the value CHKVAL. The user may later call the routine * PB_SCHEKPAD to discover if the guardzone has been violated. There are * three guardzones. The first is a buffer of size IPRE that is before * the start of the array. The second is the buffer of size IPOST which * is after the end of the array to be padded. Finally, there is a guard * zone inside every column of the array to be padded, in the elements * of A(M+1:LDA, J). * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * M (local input) INTEGER * On entry, M specifies the number of rows in the local array * A. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns in the local ar- * ray A. N must be at least zero. * * A (local input/local output) REAL array * On entry, A is an array of dimension (LDA,N). On exit, this * array is the padded array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array to be padded. LDA must be at least MAX( 1, M ). * * IPRE (local input) INTEGER * On entry, IPRE specifies the size of the guard zone to put * before the start of the padded array. * * IPOST (local input) INTEGER * On entry, IPOST specifies the size of the guard zone to put * after the end of the padded array. * * CHKVAL (local input) REAL * On entry, CHKVAL specifies the value to pad the array with. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE A( I ) = CHKVAL 10 CONTINUE ELSE WRITE( *, FMT = '(A)' ) $ 'WARNING no pre-guardzone in PB_SFILLPAD' END IF * * Put check buffer in back of A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 A( I ) = CHKVAL 20 CONTINUE ELSE WRITE( *, FMT = '(A)' ) $ 'WARNING no post-guardzone in PB_SFILLPAD' END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + ( LDA - M ) - 1 A( I ) = CHKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * RETURN * * End of PB_SFILLPAD * END SUBROUTINE PB_SCHEKPAD( ICTXT, MESS, M, N, A, LDA, IPRE, IPOST, $ CHKVAL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N REAL CHKVAL * .. * .. Array Arguments .. CHARACTER*(*) MESS REAL A( * ) * .. * * Purpose * ======= * * PB_SCHEKPAD checks that the padding around a local array has not been * overwritten since the call to PB_SFILLPAD. Three types of errors are * reported: * * 1) Overwrite in pre-guardzone. This indicates a memory overwrite has * occurred in the first IPRE elements which form a buffer before the * beginning of A. Therefore, the error message: * 'Overwrite in pre-guardzone: loc( 5) = 18.00000' * tells that the 5th element of the IPRE long buffer has been overwrit- * ten with the value 18, where it should still have the value CHKVAL. * * 2) Overwrite in post-guardzone. This indicates a memory overwrite has * occurred in the last IPOST elements which form a buffer after the end * of A. Error reports are refered from the end of A. Therefore, * 'Overwrite in post-guardzone: loc( 19) = 24.00000' * tells that the 19th element after the end of A was overwritten with * the value 24, where it should still have the value of CHKVAL. * * 3) Overwrite in lda-m gap. Tells you elements between M and LDA were * overwritten. So, * 'Overwrite in lda-m gap: A( 12, 3) = 22.00000' * tells that the element at the 12th row and 3rd column of A was over- * written with the value of 22, where it should still have the value of * CHKVAL. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * MESS (local input) CHARACTER*(*) * On entry, MESS is a ttring containing a user-defined message. * * M (local input) INTEGER * On entry, M specifies the number of rows in the local array * A. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns in the local ar- * ray A. N must be at least zero. * * A (local input) REAL array * On entry, A is an array of dimension (LDA,N). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array to be padded. LDA must be at least MAX( 1, M ). * * IPRE (local input) INTEGER * On entry, IPRE specifies the size of the guard zone to put * before the start of the padded array. * * IPOST (local input) INTEGER * On entry, IPOST specifies the size of the guard zone to put * after the end of the padded array. * * CHKVAL (local input) REAL * On entry, CHKVAL specifies the value to pad the array with. * * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. CHARACTER*1 TOP INTEGER I, IAM, IDUMM, INFO, J, K, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, PB_TOPGET * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW*NPCOL + MYCOL INFO = -1 * * Check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, ' pre', I, $ A( I ) INFO = IAM END IF 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PB_SCHEKPAD' END IF * * Check buffer after A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, 'post', $ I-J+1, A( I ) INFO = IAM END IF 20 CONTINUE ELSE WRITE( *, FMT = * ) $ 'WARNING no post-guardzone buffer in PB_SCHEKPAD' END IF * * Check all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9997 ) MYROW, MYCOL, MESS, $ I-IPRE-LDA*(J-1), J, A( I ) INFO = IAM END IF 30 CONTINUE K = K + LDA 40 CONTINUE END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, IDUMM, IDUMM, -1, $ 0, 0 ) IF( IAM.EQ.0 .AND. INFO.GE.0 ) THEN WRITE( *, FMT = 9999 ) INFO / NPCOL, MOD( INFO, NPCOL ), MESS END IF * 9999 FORMAT( '{', I5, ',', I5, '}: Memory overwrite in ', A ) 9998 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ A4, '-guardzone: loc(', I3, ') = ', G11.4 ) 9997 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ 'lda-m gap: loc(', I3, ',', I3, ') = ', G11.4 ) * RETURN * * End of PB_SCHEKPAD * END SUBROUTINE PB_SLASET( UPLO, M, N, IOFFD, ALPHA, BETA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * PB_SLASET initializes a two-dimensional array A to beta on the diago- * nal specified by IOFFD and alpha on the offdiagonals. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be set as follows: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of A is not changed, * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of A is not changed, * = 'D' or 'd' Only the diagonal of A is set, * Otherwise: All of the array A is set. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) REAL * On entry, ALPHA specifies the value to which the offdiagonal * array elements are set to. * * BETA (input) REAL * On entry, BETA specifies the value to which the diagonal ar- * ray elements are set to. * * A (input/output) REAL array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be set, and the strictly lower trape- * zoidal part of A is not referenced; When IUPLO = 'L' or 'l', * the leading m by n part of the array A must contain the * lower trapezoidal part of the matrix as specified by IOFFD to * be set, and the strictly upper trapezoidal part of A is * not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Set the diagonal to BETA and the strictly lower triangular * part of the array to ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = BETA DO 30 I = JTMP + 1, M A( I, J ) = ALPHA 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Set the diagonal to BETA and the strictly upper triangular * part of the array to ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 50 I = 1, JTMP - 1 A( I, J ) = ALPHA 50 CONTINUE A( JTMP, J ) = BETA 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Set the array to BETA on the diagonal. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 90 CONTINUE * ELSE * * Set the array to BETA on the diagonal and ALPHA on the * offdiagonal. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA 100 CONTINUE 110 CONTINUE IF( ALPHA.NE.BETA .AND. IOFFD.LT.M .AND. IOFFD.GT.-N ) THEN DO 120 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 120 CONTINUE END IF * END IF * RETURN * * End of PB_SLASET * END SUBROUTINE PB_SLASCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N REAL ALPHA * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * PB_SLASCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) REAL * On entry, ALPHA specifies the scalar alpha. * * A (input/output) REAL array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA * A( I, J ) 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = ALPHA * A( I, J ) 50 CONTINUE 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA * A( I, J ) 100 CONTINUE 110 CONTINUE * END IF * RETURN * * End of PB_SLASCAL * END SUBROUTINE PB_SLAGEN( UPLO, AFORM, A, LDA, LCMT00, IRAN, MBLKS, $ IMBLOC, MB, LMBLOC, NBLKS, INBLOC, NB, $ LNBLOC, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO, AFORM INTEGER IMBLOC, INBLOC, LCMT00, LDA, LMBLOC, LNBLOC, $ MB, MBLKS, NB, NBLKS * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) REAL A( LDA, * ) * .. * * Purpose * ======= * * PB_SLAGEN locally initializes an array A. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the lower (UPLO='L') trape- * zoidal part or the upper (UPLO='U') trapezoidal part is to be * generated when the matrix to be generated is symmetric or * Hermitian. For all the other values of AFORM, the value of * this input argument is ignored. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * A (local output) REAL array * On entry, A is an array of dimension (LLD_A, *). On exit, * this array contains the local entries of the randomly genera- * ted submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. LDA must be at least one. * * LCMT00 (global input) INTEGER * On entry, LCMT00 is the LCM value specifying the off-diagonal * of the underlying matrix of interest. LCMT00=0 specifies the * main diagonal, LCMT00 > 0 specifies a subdiagonal, LCMT00 < 0 * specifies superdiagonals. * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2 containing respec- * tively the 16-lower and 16-higher bits of the encoding of the * entry of the random sequence corresponding locally to the * first local array entry to generate. Usually, this array is * computed by PB_SETLOCRAN. * * MBLKS (local input) INTEGER * On entry, MBLKS specifies the local number of blocks of rows. * MBLKS is at least zero. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * LMBLOC (local input) INTEGER * On entry, LMBLOC specifies the number of rows (size) of the * local lowest blocks. LMBLOC is at least zero. * * NBLKS (local input) INTEGER * On entry, NBLKS specifies the local number of blocks of co- * lumns. NBLKS is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * LNBLOC (local input) INTEGER * On entry, LNBLOC specifies the number of columns (size) of * the local rightmost blocks. LNBLOC is at least zero. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the random matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) * .. * .. Local Scalars .. INTEGER I, IB, IBLK, II, IK, ITMP, JB, JBLK, JJ, JK, $ JTMP, LCMTC, LCMTR, LOW, MNB, UPP REAL DUMMY * .. * .. Local Arrays .. INTEGER IB0( 2 ), IB1( 2 ), IB2( 2 ), IB3( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMPIT * .. * .. External Functions .. LOGICAL LSAME REAL PB_SRAND EXTERNAL LSAME, PB_SRAND * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * DO 10 I = 1, 2 IB1( I ) = IRAN( I ) IB2( I ) = IRAN( I ) IB3( I ) = IRAN( I ) 10 CONTINUE * IF( LSAME( AFORM, 'N' ) ) THEN * * Generate random matrix * JJ = 1 * DO 50 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * DO 40 JK = JJ, JJ + JB - 1 * II = 1 * DO 30 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * * Blocks are IB by JB * DO 20 IK = II, II + IB - 1 A( IK, JK ) = PB_SRAND( 0 ) 20 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 30 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 40 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 50 CONTINUE * ELSE IF( LSAME( AFORM, 'T' ) .OR. LSAME( AFORM, 'C' ) ) THEN * * Generate the transpose of the matrix that would be normally * generated. * II = 1 * DO 90 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 80 IK = II, II + IB - 1 * JJ = 1 * DO 70 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 60 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_SRAND( 0 ) 60 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 70 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 80 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 90 CONTINUE * ELSE IF( ( LSAME( AFORM, 'S' ) ).OR.( LSAME( AFORM, 'H' ) ) ) THEN * * Generate a symmetric matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 170 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 160 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 150 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 100 IK = II, II + IB - 1 DUMMY = PB_SRAND( 0 ) 100 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 110 IK = II, II + IB - 1 A( IK, JK ) = PB_SRAND( 0 ) 110 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 120 IK = II, ITMP - 1 DUMMY = PB_SRAND( 0 ) 120 CONTINUE * DO 130 IK = ITMP, II + IB - 1 A( IK, JK ) = PB_SRAND( 0 ) 130 CONTINUE * END IF * ELSE * DO 140 IK = II, II + IB - 1 A( IK, JK ) = PB_SRAND( 0 ) 140 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 150 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 160 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 170 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 250 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 240 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 230 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 180 JK = JJ, JJ + JB - 1 DUMMY = PB_SRAND( 0 ) 180 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 190 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_SRAND( 0 ) 190 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 200 JK = JJ, JTMP - 1 DUMMY = PB_SRAND( 0 ) 200 CONTINUE * DO 210 JK = JTMP, JJ + JB - 1 A( IK, JK ) = PB_SRAND( 0 ) 210 CONTINUE * END IF * ELSE * DO 220 JK = JJ, JJ + JB - 1 A( IK, JK ) = PB_SRAND( 0 ) 220 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 230 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 240 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 250 CONTINUE * END IF * END IF * RETURN * * End of PB_SLAGEN * END REAL FUNCTION PB_SRAND( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_SRAND generates the next number in the random sequence. This func- * tion ensures that this number will be in the interval ( -1.0, 1.0 ). * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. External Functions .. REAL PB_SRAN EXTERNAL PB_SRAN * .. * .. Executable Statements .. * PB_SRAND = ONE - TWO * PB_SRAN( IDUMM ) * RETURN * * End of PB_SRAND * END REAL FUNCTION PB_SRAN( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_SRAN generates the next number in the random sequence. * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL DIVFAC, POW16 PARAMETER ( DIVFAC = 2.147483648E+9, $ POW16 = 6.5536E+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Intrinsic Functions .. INTRINSIC REAL * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * PB_SRAN = ( REAL( IRAND( 1 ) ) + POW16 * REAL( IRAND( 2 ) ) ) / $ DIVFAC * CALL PB_LMUL( IRAND, IACS, J ) CALL PB_LADD( J, IACS( 3 ), IRAND ) * RETURN * * End of PB_SRAN * END scalapack-2.0.2/PBLAS/TESTING/PZBLAS1TST.dat000644 000766 000024 00000003106 10363532303 020026 0ustar00juliestaff000000 000000 'Level 1 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PZBLAS1TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q (2.0D0, -3.0D0) value of ALPHA 4 number of tests problems 14 44 28 7 values of N 36 9 39 53 values of M_X 24 67 3 12 values of N_X 2 5 2 5 values of IMB_X 2 5 2 5 values of INB_X 2 5 2 5 values of MB_X 2 5 2 5 values of NB_X 0 0 0 0 values of RSRC_X 0 0 0 0 values of CSRC_X 5 3 1 1 values of IX 2 6 1 1 values of JX 1 9 1 53 values of INCX 2 6 35 14 values of M_Y 27 55 43 12 values of N_Y 2 5 2 5 values of IMB_Y 2 5 2 5 values of INB_Y 2 5 2 5 values of MB_Y 2 5 2 5 values of NB_Y 0 0 0 0 values of RSRC_Y 0 0 0 0 values of CSRC_Y 1 4 1 1 values of IY 7 6 1 1 values of JY 2 6 35 1 values of INCY PZSWAP T put F for no test in the same column PZSCAL T put F for no test in the same column PZDSCAL T put F for no test in the same column PZCOPY T put F for no test in the same column PZAXPY T put F for no test in the same column PZDOTU T put F for no test in the same column PZDOTC T put F for no test in the same column PDZNRM2 T put F for no test in the same column PDZASUM T put F for no test in the same column PZAMAX T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/pzblas1tst.f000644 000766 000024 00000421150 11750130340 020142 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 10) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PZSWAP ', 'PZSCAL ', $ 'PZDSCAL', 'PZCOPY ', 'PZAXPY ', $ 'PZDOTU ', 'PZDOTC ', 'PDZNRM2', $ 'PDZASUM', 'PZAMAX'/ END BLOCK DATA PROGRAM PZBLA1TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PZBLA1TST is the main testing program for the PBLAS Level 1 routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 46 lines: * 'Level 1 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PZBLAS1TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0D0, 0.0D0) value of ALPHA * 2 number of tests problems * 3 4 values of N * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PZSWAP T put F for no test in the same column * PZSCAL T put F for no test in the same column * PZDSCAL T put F for no test in the same column * PZCOPY T put F for no test in the same column * PZAXPY T put F for no test in the same column * PZDOTU T put F for no test in the same column * PZDOTC T put F for no test in the same column * PDZNRM2 T put F for no test in the same column * PDZASUM T put F for no test in the same column * PZAMAX T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * ZPLXSZ INTEGER * DBLESZ and ZPLXSZ indicate the length in bytes on the given * platform for a double precision real and a double precision * complex. By default, DBLESZ is set to eight and ZPLXSZ is set * to sixteen. * * MEM COMPLEX*16 array * MEM is an array of dimension TOTMEM / ZPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, ZPLXSZ, TOTMEM, $ MEMSIZ, NSUBS DOUBLE PRECISION RZERO COMPLEX*16 PADVAL, ZERO PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ ZPLXSZ = 16, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / ZPLXSZ, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ RZERO = 0.0D+0, ZERO = ( 0.0D+0, 0.0D+0 ), $ NSUBS = 10 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE INTEGER CSRCX, CSRCY, I, IAM, ICTXT, IGAP, IMBX, IMBY, $ IMIDX, IMIDY, INBX, INBY, INCX, INCY, IPMATX, $ IPMATY, IPOSTX, IPOSTY, IPREX, IPREY, IPW, IPX, $ IPY, IVERB, IX, IXSEED, IY, IYSEED, J, JX, JY, $ K, LDX, LDY, MBX, MBY, MEMREQD, MPX, MPY, MX, $ MY, MYCOL, MYROW, N, NBX, NBY, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQX, NQY, NTESTS, NX, NY, $ PISCLR, RSRCX, RSRCY, TSKIP, TSTCNT DOUBLE PRECISION PUSCLR COMPLEX*16 ALPHA, PSCLR * .. * .. Local Arrays .. CHARACTER*80 OUTFILE LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) INTEGER CSCXVAL( MAXTESTS ), CSCYVAL( MAXTESTS ), $ DESCX( DLEN_ ), DESCXR( DLEN_ ), $ DESCY( DLEN_ ), DESCYR( DLEN_ ), IERR( 4 ), $ IMBXVAL( MAXTESTS ), IMBYVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JXVAL( MAXTESTS ), JYVAL( MAXTESTS ), $ KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MXVAL( MAXTESTS ), $ MYVAL( MAXTESTS ), NBXVAL( MAXTESTS ), $ NBYVAL( MAXTESTS ), NVAL( MAXTESTS ), $ NXVAL( MAXTESTS ), NYVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_DESCSET2, PB_PZLAPRNT, PB_ZCHEKPAD, $ PB_ZFILLPAD, PDZASUM, PDZNRM2, PVDESCCHK, $ PVDIMCHK, PZAMAX, PZAXPY, PZBLA1TSTINFO, $ PZBLAS1TSTCHK, PZBLAS1TSTCHKE, PZCHKARG1, $ PZCHKVOUT, PZCOPY, PZDOTC, PZDOTU, PZDSCAL, $ PZLAGEN, PZMPRNT, PZSCAL, PZSWAP, PZVPRNT * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MOD * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .FALSE., .FALSE., .TRUE., $ .TRUE., .TRUE., .TRUE., .FALSE., .FALSE., $ .FALSE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler will abort on errors. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IXSEED = 100 IYSEED = 200 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PZBLA1TSTINFO( OUTFILE, NOUT, NTESTS, NVAL, MXVAL, NXVAL, $ IMBXVAL, MBXVAL, INBXVAL, NBXVAL, RSCXVAL, $ CSCXVAL, IXVAL, JXVAL, INCXVAL, MYVAL, $ NYVAL, IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, INCYVAL, $ MAXTESTS, NGRIDS, PVAL, MAXGRIDS, QVAL, $ MAXGRIDS, LTEST, SOF, TEE, IAM, IGAP, IVERB, $ NPROCS, ALPHA, MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PZBLAS1TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * N = NVAL( J ) MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) MBX = MBXVAL( J ) INBX = INBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) MBY = MBYVAL( J ) INBY = INBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN TSTCNT = TSTCNT + 1 WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) N, IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY WRITE( NOUT, FMT = 9995 ) END IF * * Check the validity of the input and initialize DESC_ * CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, IGAP, GAPMUL, $ IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, IGAP, GAPMUL, $ IERR( 2 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDX = MAX( 1, MX ) LDY = MAX( 1, MY ) * * Assign pointers into MEM for matrices corresponding to * vectors X and Y. Ex: IPX starts at position MEM( IPREX+1 ). * IPX = IPREX + 1 IPY = IPX + DESCX( LLD_ ) * NQX + IPOSTX + IPREY IPMATX = IPY + DESCY( LLD_ ) * NQY + IPOSTY IPMATY = IPMATX + MX * NX IPW = IPMATY + MY * NY * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * MEMREQD = IPW - 1 + $ MAX( MAX( IMBX, MBX ), MAX( IMBY, MBY ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9990 ) MEMREQD*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9991 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 1 routines * DO 30 K = 1, NSUBS * * Continue only if this sub has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9989 ) SNAMES( K ) END IF * * Check the validity of the operand sizes * CALL PVDIMCHK( ICTXT, NOUT, N, 'X', IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, N, 'Y', IY, JY, DESCY, INCY, $ IERR( 2 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 GO TO 30 END IF * * Generate distributed matrices X and Y * CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCXR, MX, NX, IMBX, INBX, MBX, NBX, $ -1, -1, ICTXT, MAX( 1, MX ) ) CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCXR, IXSEED, MEM( IPMATX ), $ DESCXR( LLD_ ) ) IF( YCHECK( K ) ) THEN CALL PB_DESCSET2( DESCYR, MY, NY, IMBY, INBY, MBY, $ NBY, -1, -1, ICTXT, MAX( 1, MY ) ) CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCYR, IYSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) END IF * * Pad the guard zones of X, and Y * CALL PB_ZFILLPAD( ICTXT, MPX, NQX, MEM( IPX-IPREX ), $ DESCX( LLD_ ), IPREX, IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_ZFILLPAD( ICTXT, MPY, NQY, MEM( IPY-IPREY ), $ DESCY( LLD_ ), IPREY, IPOSTY, $ PADVAL ) END IF * * Initialize the check for INPUT only args. * INFO = 0 CALL PZCHKARG1( ICTXT, NOUT, SNAMES( K ), N, ALPHA, IX, $ JX, DESCX, INCX, IY, JY, DESCY, INCY, $ INFO ) * INFO = 0 PSCLR = ZERO PUSCLR = RZERO PISCLR = 0 * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PZLAPRNT( 1, N, MEM( IPX ), IX, JX, DESCX, $ 0, 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) ELSE CALL PB_PZLAPRNT( N, 1, MEM( IPX ), IX, JX, DESCX, $ 0, 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) END IF IF( YCHECK( K ) ) THEN IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PZLAPRNT( 1, N, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) ELSE CALL PB_PZLAPRNT( N, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) END IF END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PZLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, 0, $ 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPW ) ) IF( YCHECK( K ) ) $ CALL PB_PZLAPRNT( MY, NY, MEM( IPY ), 1, 1, DESCY, $ 0, 0, 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPW ) ) END IF * * Call the PBLAS routine * IF( K.EQ.1 ) THEN * * Test PZSWAP * CALL PZSWAP( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.2 ) THEN * * Test PZSCAL * PSCLR = ALPHA CALL PZSCAL( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.3 ) THEN * * Test PZDSCAL * PUSCLR = DBLE( ALPHA ) CALL PZDSCAL( N, DBLE( ALPHA ), MEM( IPX ), IX, JX, $ DESCX, INCX ) * ELSE IF( K.EQ.4 ) THEN * * Test PZCOPY * CALL PZCOPY( N, MEM( IPX ), IX, JX, DESCX, INCX, $ MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.5 ) THEN * * Test PZAXPY * PSCLR = ALPHA CALL PZAXPY( N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.6 ) THEN * * Test PZDOTU * CALL PZDOTU( N, PSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.7 ) THEN * * Test PZDOTC * CALL PZDOTC( N, PSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.8 ) THEN * * Test PDZNRM2 * CALL PDZNRM2( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.9 ) THEN * * Test PDZASUM * CALL PDZASUM( N, PUSCLR, MEM( IPX ), IX, JX, DESCX, $ INCX ) * ELSE IF( K.EQ.10 ) THEN * CALL PZAMAX( N, PSCLR, PISCLR, MEM( IPX ), IX, JX, $ DESCX, INCX ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9978 ) INFO GO TO 30 END IF * * Check the computations * CALL PZBLAS1TSTCHK( ICTXT, NOUT, K, N, PSCLR, PUSCLR, $ PISCLR, MEM( IPMATX ), MEM( IPX ), $ IX, JX, DESCX, INCX, MEM( IPMATY ), $ MEM( IPY ), IY, JY, DESCY, INCY, $ INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 END IF * * Check padding * CALL PB_ZCHEKPAD( ICTXT, SNAMES( K ), MPX, NQX, $ MEM( IPX-IPREX ), DESCX( LLD_ ), $ IPREX, IPOSTX, PADVAL ) IF( YCHECK( K ) ) THEN CALL PB_ZCHEKPAD( ICTXT, SNAMES( K ), MPY, NQY, $ MEM( IPY-IPREY ), DESCY( LLD_ ), $ IPREY, IPOSTY, PADVAL ) END IF * * Check input-only scalar arguments * INFO = 1 CALL PZCHKARG1( ICTXT, NOUT, SNAMES( K ), N, ALPHA, IX, $ JX, DESCX, INCX, IY, JY, DESCY, INCY, $ INFO ) * * Check input-only array arguments * CALL PZCHKVOUT( N, MEM( IPMATX ), MEM( IPX ), IX, JX, $ DESCX, INCX, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) 'PARALLEL_X', SNAMES( K ) END IF * IF( YCHECK( K ) ) THEN CALL PZCHKVOUT( N, MEM( IPMATY ), MEM( IPY ), IY, JY, $ DESCY, INCY, IERR( 4 ) ) IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) 'PARALLEL_Y', $ SNAMES( K ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE. 0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) SNAMES( K ) KFAIL( K ) = KFAIL( K ) + 1 ERRFLG = .TRUE. ELSE IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) SNAMES( K ) KPASS( K ) = KPASS( K ) + 1 END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 3 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PZMPRNT( ICTXT, NOUT, MX, NX, MEM( IPMATX ), $ LDX, 0, 0, 'SERIAL_X' ) CALL PB_PZLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, $ 0, 0, 'PARALLEL_X', NOUT, $ MEM( IPMATX ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( N.GT.0 ) $ CALL PZVPRNT( ICTXT, NOUT, N, $ MEM( IPMATX+IX-1+(JX-1)*LDX ), $ INCX, 0, 0, 'SERIAL_X' ) IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PZLAPRNT( 1, N, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PZLAPRNT( N, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) END IF END IF IF( YCHECK( K ) ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PZMPRNT( ICTXT, NOUT, MY, NY, $ MEM( IPMATY ), LDY, 0, 0, $ 'SERIAL_Y' ) CALL PB_PZLAPRNT( MY, NY, MEM( IPY ), 1, 1, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( N.GT.0 ) $ CALL PZVPRNT( ICTXT, NOUT, N, $ MEM( IPMATY+IY-1+(JY-1)*LDY ), $ INCY, 0, 0, 'SERIAL_Y' ) IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PZLAPRNT( 1, N, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PZLAPRNT( N, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) END IF END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9981 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9983 ) WRITE( NOUT, FMT = 9982 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9984 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, '---------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' N IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9993 FORMAT( 2X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I5,1X,I5,1X,I5,1X,I5,1X, $ I5,1X,I5,1X,I6 ) 9992 FORMAT( 2X, ' N IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9991 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9990 FORMAT( 'Not enough memory. Need: ', I12 ) 9989 FORMAT( 2X, ' Tested Subroutine: ', A ) 9988 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9987 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9986 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9985 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9984 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9983 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9982 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9981 FORMAT( 2X, 'Testing Summary') 9980 FORMAT( 2X, 'End of Tests.' ) 9979 FORMAT( 2X, 'Tests started.' ) 9978 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PZBLA1TST * END SUBROUTINE PZBLA1TSTINFO( SUMMRY, NOUT, NMAT, NVAL, MXVAL, $ NXVAL, IMBXVAL, MBXVAL, INBXVAL, $ NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, $ CSCYVAL, IYVAL, JYVAL, INCYVAL, $ LDVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, LTEST, SOF, TEE, IAM, IGAP, $ IVERB, NPROCS, ALPHA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, $ NGRIDS, NMAT, NOUT, NPROCS COMPLEX*16 ALPHA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY LOGICAL LTEST( * ) INTEGER CSCXVAL( LDVAL ), CSCYVAL( LDVAL ), $ IMBXVAL( LDVAL ), IMBYVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JXVAL( LDVAL ), $ JYVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCXVAL( LDVAL ), RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PZBLA1TSTINFO get the needed startup information for testing various * Level 1 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DESCX(:), IX, JX, INCX, DESCY(:), * IY, JY and INCY. This is also the maximum number of test * cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least ten. On * exit, if LTEST( i ) is .TRUE., the i-th Level 1 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * ALPHA (global output) COMPLEX*16 * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 2, 2*NGRIDS+23*NMAT+NSUBS+4 ) with NSUBS equal to 10. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 10 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J DOUBLE PRECISION EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D, ZGEBR2D, ZGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PZBLAS1TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 100 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 100 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA * READ( NIN, FMT = * ) ALPHA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 100 END IF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 100 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT CALL IGEBS2D( ICTXT, 'All', ' ', 2, 1, WORK, 2 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 70 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 70 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 1 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex double precision '// $ 'Level 1 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9982 ) ALPHA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 80 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 80 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 2, 1, WORK, 2, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) * I = 2*NGRIDS + 23*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 90 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 90 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 100 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) * * End of PZBLA1TSTINFO * END SUBROUTINE PZBLAS1TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PZBLAS1TSTCHKE tests the error exits of the Level 1 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 10 (NSUBS). * If LTEST( 1 ) is .TRUE., PZSWAP will be tested; * If LTEST( 2 ) is .TRUE., PZSCAL will be tested; * If LTEST( 3 ) is .TRUE., PZDSCAL will be tested; * If LTEST( 4 ) is .TRUE., PZCOPY will be tested; * If LTEST( 5 ) is .TRUE., PZAXPY will be tested; * If LTEST( 6 ) is .TRUE., PZDOTU will be tested; * If LTEST( 7 ) is .TRUE., PZDOTC will be tested; * If LTEST( 8 ) is .TRUE., PDZNRM2 will be tested; * If LTEST( 9 ) is .TRUE., PDZASUM will be tested; * If LTEST( 10 ) is .TRUE., PZAMAX will be tested. * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 10 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PDZASUM, PDZNRM2, PZAMAX, $ PZAXPY, PZCOPY, PZDIMEE, PZDOTC, PZDOTU, $ PZDSCAL, PZSCAL, PZSWAP, PZVECEE * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/11, 12, 12, 11, 13, 13, 13, 15, 15, 14/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PZSWAP * I = 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZSWAP, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZSWAP, SCODE( I ), SNAMES( I ) ) END IF * * Test PZSCAL * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZSCAL, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZSCAL, SCODE( I ), SNAMES( I ) ) END IF * * Test PZDSCAL * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZDSCAL, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZDSCAL, SCODE( I ), SNAMES( I ) ) END IF * * Test PZCOPY * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZCOPY, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZCOPY, SCODE( I ), SNAMES( I ) ) END IF * * Test PZAXPY * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZAXPY, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZAXPY, SCODE( I ), SNAMES( I ) ) END IF * * Test PZDOTU * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZDOTU, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZDOTU, SCODE( I ), SNAMES( I ) ) END IF * * Test PZDOTC * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZDOTC, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZDOTC, SCODE( I ), SNAMES( I ) ) END IF * * PDZNRM2 * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PDZNRM2, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PDZNRM2, SCODE( I ), SNAMES( I ) ) END IF * * Test PDZASUM * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PDZASUM, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PDZASUM, SCODE( I ), SNAMES( I ) ) END IF * * Test PZAMAX * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZAMAX, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZAMAX, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PZBLAS1TSTCHKE * END SUBROUTINE PZCHKARG1( ICTXT, NOUT, SNAME, N, ALPHA, IX, JX, $ DESCX, INCX, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, INCY, INFO, IX, IY, JX, JY, N, $ NOUT COMPLEX*16 ALPHA * .. * .. Array Arguments .. CHARACTER*(*) SNAME INTEGER DESCX( * ), DESCY( * ) * .. * * Purpose * ======= * * PZCHKARG1 checks the input-only arguments of the Level 1 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * N (global input) INTEGER * On entry, N specifies the length of the subvector operands. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, INCXREF, INCYREF, IXREF, IYREF, JXREF, $ JYREF, MYCOL, MYROW, NPCOL, NPROW, NREF COMPLEX*16 ALPHAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCXREF( DLEN_ ), DESCYREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * NREF = N IXREF = IX JXREF = JX DO 10 I = 1, DLEN_ DESCXREF( I ) = DESCX( I ) 10 CONTINUE INCXREF = INCX IYREF = IY JYREF = JY DO 20 I = 1, DLEN_ DESCYREF( I ) = DESCY( I ) 20 CONTINUE INCYREF = INCY ALPHAREF = ALPHA * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( IX.NE.IXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IX' ELSE IF( JX.NE.JXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JX' ELSE IF( DESCX( DTYPE_ ).NE.DESCXREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( DTYPE_ )' ELSE IF( DESCX( M_ ).NE.DESCXREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( M_ )' ELSE IF( DESCX( N_ ).NE.DESCXREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( N_ )' ELSE IF( DESCX( IMB_ ).NE.DESCXREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( IMB_ )' ELSE IF( DESCX( INB_ ).NE.DESCXREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( INB_ )' ELSE IF( DESCX( MB_ ).NE.DESCXREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( MB_ )' ELSE IF( DESCX( NB_ ).NE.DESCXREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( NB_ )' ELSE IF( DESCX( RSRC_ ).NE.DESCXREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( RSRC_ )' ELSE IF( DESCX( CSRC_ ).NE.DESCXREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CSRC_ )' ELSE IF( DESCX( CTXT_ ).NE.DESCXREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CTXT_ )' ELSE IF( DESCX( LLD_ ).NE.DESCXREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( LLD_ )' ELSE IF( INCX.NE.INCXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCX' ELSE IF( IY.NE.IYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IY' ELSE IF( JY.NE.JYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JY' ELSE IF( DESCY( DTYPE_ ).NE.DESCYREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( DTYPE_ )' ELSE IF( DESCY( M_ ).NE.DESCYREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( M_ )' ELSE IF( DESCY( N_ ).NE.DESCYREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( N_ )' ELSE IF( DESCY( IMB_ ).NE.DESCYREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( IMB_ )' ELSE IF( DESCY( INB_ ).NE.DESCYREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( INB_ )' ELSE IF( DESCY( MB_ ).NE.DESCYREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( MB_ )' ELSE IF( DESCY( NB_ ).NE.DESCYREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( NB_ )' ELSE IF( DESCY( RSRC_ ).NE.DESCYREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( RSRC_ )' ELSE IF( DESCY( CSRC_ ).NE.DESCYREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CSRC_ )' ELSE IF( DESCY( CTXT_ ).NE.DESCYREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CTXT_ )' ELSE IF( DESCY( LLD_ ).NE.DESCYREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( LLD_ )' ELSE IF( INCY.NE.INCYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCY' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.GT.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PZCHKARG1 * END LOGICAL FUNCTION PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) * .. * * Purpose * ======= * * PISINSCOPE returns .TRUE. if the calling process is in the scope of * sub( X ) = X( IX+(JX-1)*DESCX(M_)+(i-1)*INCX ) and .FALSE. if it is * not. This routine is used to determine which processes should check * the answer returned by some Level 1 PBLAS routines. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * N (global input) INTEGER * The length of the subvector sub( X ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER IIX, IXCOL, IXROW, JJX, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_INFOG2L * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( DESCX( M_ ).EQ.1 .AND. N.EQ.1 ) THEN * * This is the special case, find process owner of IX, JX, and * only this process is the scope. * PISINSCOPE = ( ( IXROW.EQ.MYROW .OR. ROWREP ) .AND. $ ( IXCOL.EQ.MYCOL .OR. COLREP ) ) * ELSE * IF( INCX.EQ.DESCX( M_ ) ) THEN * * row vector * PISINSCOPE = ( MYROW.EQ.IXROW .OR. ROWREP ) * ELSE * * column vector * PISINSCOPE = ( MYCOL.EQ.IXCOL .OR. COLREP ) * END IF * END IF * RETURN * * End of PISINSCOPE * END SUBROUTINE PZBLAS1TSTCHK( ICTXT, NOUT, NROUT, N, PSCLR, PUSCLR, $ PISCLR, X, PX, IX, JX, DESCX, INCX, Y, $ PY, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INCX, INCY, INFO, IX, IY, JX, JY, N, $ NOUT, NROUT, PISCLR DOUBLE PRECISION PUSCLR COMPLEX*16 PSCLR * .. * .. Array Arguments .. INTEGER DESCX( * ), DESCY( * ) COMPLEX*16 PX( * ), PY( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PZBLAS1TSTCHK performs the computational tests of the Level 1 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PZSWAP will be tested; * else if NROUT = 2, PZSCAL will be tested; * else if NROUT = 3, PZDSCAL will be tested; * else if NROUT = 4, PZCOPY will be tested; * else if NROUT = 5, PZAXPY will be tested; * else if NROUT = 6, PZDOTU will be tested; * else if NROUT = 7, PZDOTC will be tested; * else if NROUT = 8, PDZNRM2 will be tested; * else if NROUT = 9, PDZASUM will be tested; * else if NROUT = 10, PZAMAX will be tested. * * N (global input) INTEGER * On entry, N specifies the length of the subvector operands. * * PSCLR (global input) COMPLEX*16 * On entry, depending on the value of NROUT, PSCLR specifies * the scalar ALPHA, or the output scalar returned by the PBLAS, * i.e., the dot product, the 2-norm, the absolute sum or the * value of AMAX. * * PUSCLR (global input) DOUBLE PRECISION * On entry, PUSCLR specifies the real part of the scalar ALPHA * used by the real scaling, the 2-norm, or the absolute sum * routines. PUSCLR is not used in the real versions of this * routine. * * PISCLR (global input) DOUBLE PRECISION * On entry, PISCLR specifies the value of the global index re- * turned by PZAMAX, otherwise PISCLR is not used. * * X (local input/local output) COMPLEX*16 array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) COMPLEX*16 array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) COMPLEX*16 array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) COMPLEX*16 array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on X has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on Y has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION RZERO COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ RZERO = 0.0D+0 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL COLREP, INXSCOPE, INYSCOPE, ROWREP INTEGER I, IB, ICURCOL, ICURROW, IDUMM, IIX, IIY, IN, $ IOFFX, IOFFY, ISCLR, IXCOL, IXROW, IYCOL, $ IYROW, J, JB, JJX, JJY, JN, KK, LDX, LDY, $ MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR, ERRMAX, PREC, USCLR COMPLEX*16 SCLR * .. * .. Local Arrays .. INTEGER IERR( 6 ) CHARACTER*5 ARGIN1, ARGIN2, ARGOUT1, ARGOUT2 * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, PB_INFOG2L, PZCHKVIN, $ PZDERRSCAL, PZERRASUM, PZERRAXPY, PZERRDOTC, $ PZERRDOTU, PZERRNRM2, PZERRSCAL, ZCOPY, ZSWAP * .. * .. External Functions .. LOGICAL PISINSCOPE INTEGER IZAMAX DOUBLE PRECISION PDLAMCH EXTERNAL IZAMAX, PDLAMCH, PISINSCOPE * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( N.LE.0 ) $ RETURN * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * ARGIN1 = ' ' ARGIN2 = ' ' ARGOUT1 = ' ' ARGOUT2 = ' ' DO 10 I = 1, 6 IERR( I ) = 0 10 CONTINUE * PREC = PDLAMCH( ICTXT, 'precision' ) * IF( NROUT.EQ.1 ) THEN * * Test PZSWAP * IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL ZSWAP( N, X( IOFFX ), INCX, Y( IOFFY ), INCY ) CALL PZCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PZCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) * ELSE IF( NROUT.EQ.2 ) THEN * * Test PZSCAL * LDX = DESCX( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ICURROW = IXROW ICURCOL = IXCOL ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * DO 20 J = JX, JN * CALL PZERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 20 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 40 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * DO 30 KK = 0, JB-1 * CALL PZERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 30 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 40 CONTINUE * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * DO 50 I = IX, IN * CALL PZERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 70 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * DO 60 KK = 0, IB-1 * CALL PZERRSCAL( ERR, PSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX 60 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 70 CONTINUE * END IF * ELSE IF( NROUT.EQ.3 ) THEN * * Test PZDSCAL * LDX = DESCX( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, $ IIX, JJX, IXROW, IXCOL ) ICURROW = IXROW ICURCOL = IXCOL ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * DO 80 J = JX, JN * CALL PZDERRSCAL( ERR, PUSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 80 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 100 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * DO 90 KK = 0, JB-1 * CALL PZDERRSCAL( ERR, PUSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 JJX = JJX + 1 END IF * IOFFX = IOFFX + INCX * 90 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 100 CONTINUE * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * DO 110 I = IX, IN * CALL PZDERRSCAL( ERR, PUSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * DO 120 KK = 0, IB-1 * CALL PZDERRSCAL( ERR, PUSCLR, X( IOFFX ), PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PX( IIX+(JJX-1)*LDX ) - X( IOFFX ) ).GT. $ ERR ) $ IERR( 1 ) = 1 IIX = IIX + 1 END IF * IOFFX = IOFFX + INCX 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * ELSE IF( NROUT.EQ.4 ) THEN * * Test PZCOPY * IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL ZCOPY( N, X( IOFFX ), INCX, Y( IOFFY ), INCY ) CALL PZCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PZCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) * ELSE IF( NROUT.EQ.5 ) THEN * * Test PZAXPY * CALL PZCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) LDY = DESCY( LLD_ ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PB_INFOG2L( IY, JY, DESCY, NPROW, NPCOL, MYROW, MYCOL, $ IIY, JJY, IYROW, IYCOL ) ICURROW = IYROW ICURCOL = IYCOL ROWREP = ( IYROW.EQ.-1 ) COLREP = ( IYCOL.EQ.-1 ) * IF( INCY.EQ.DESCY( M_ ) ) THEN * * sub( Y ) is a row vector * JB = DESCY( INB_ ) - JY + 1 IF( JB.LE.0 ) $ JB = ( (-JB ) / DESCY( NB_ ) + 1 ) * DESCY( NB_ ) + JB JB = MIN( JB, N ) JN = JY + JB - 1 * DO 140 J = JY, JN * CALL PZERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF JJY = JJY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 140 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 160 J = JN+1, JY+N-1, DESCY( NB_ ) JB = MIN( JY+N-J, DESCY( NB_ ) ) * DO 150 KK = 0, JB-1 * CALL PZERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF JJY = JJY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 150 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 160 CONTINUE * ELSE * * sub( Y ) is a column vector * IB = DESCY( IMB_ ) - IY + 1 IF( IB.LE.0 ) $ IB = ( (-IB ) / DESCY( MB_ ) + 1 ) * DESCY( MB_ ) + IB IB = MIN( IB, N ) IN = IY + IB - 1 * DO 170 I = IY, IN * CALL PZERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF IIY = IIY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 170 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 190 I = IN+1, IY+N-1, DESCY( MB_ ) IB = MIN( IY+N-I, DESCY( MB_ ) ) * DO 180 KK = 0, IB-1 * CALL PZERRAXPY( ERR, PSCLR, X( IOFFX ), Y( IOFFY ), $ PREC ) * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN IF( ABS( PY( IIY+(JJY-1)*LDY ) - Y( IOFFY ) ).GT. $ ERR ) THEN IERR( 2 ) = 1 END IF IIY = IIY + 1 END IF * IOFFX = IOFFX + INCX IOFFY = IOFFY + INCY * 180 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 190 CONTINUE * END IF * ELSE IF( NROUT.EQ.6 ) THEN * * Test PZDOTU * CALL PZCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PZCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PZERRDOTU( ERR, N, SCLR, X( IOFFX ), INCX, Y( IOFFY ), $ INCY, PREC ) INXSCOPE = PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) INYSCOPE = PISINSCOPE( ICTXT, N, IY, JY, DESCY, INCY ) IF( INXSCOPE.OR.INYSCOPE ) THEN IF( ABS( PSCLR - SCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'DOTU' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF ELSE SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'DOTU' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.7 ) THEN * * Test PZDOTC * CALL PZCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) CALL PZCHKVIN( ERRMAX, N, Y, PY, IY, JY, DESCY, INCY, $ IERR( 2 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IOFFY = IY + ( JY - 1 ) * DESCY( M_ ) CALL PZERRDOTC( ERR, N, SCLR, X( IOFFX ), INCX, Y( IOFFY ), $ INCY, PREC ) INXSCOPE = PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) INYSCOPE = PISINSCOPE( ICTXT, N, IY, JY, DESCY, INCY ) IF( INXSCOPE.OR.INYSCOPE ) THEN IF( ABS( PSCLR - SCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'DOTC' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF ELSE SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'DOTC' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.8 ) THEN * * Test PDZNRM2 * CALL PZCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PZERRNRM2( ERR, N, USCLR, X( IOFFX ), INCX, PREC ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN IF( ABS( PUSCLR - USCLR ).GT.ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'NRM2' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9994 ) USCLR, PUSCLR END IF END IF ELSE USCLR = RZERO IF( PUSCLR.NE.USCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'NRM2' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9994 ) USCLR, PUSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.9 ) THEN * * Test PDZASUM * CALL PZCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) CALL PZERRASUM( ERR, N, USCLR, X( IOFFX ), INCX, PREC ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN IF( ABS( PUSCLR - USCLR ) .GT. ERR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'ASUM' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9994 ) USCLR, PUSCLR END IF END IF ELSE USCLR = RZERO IF( PUSCLR.NE.USCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'ASUM' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9994 ) USCLR, PUSCLR END IF END IF END IF * ELSE IF( NROUT.EQ.10 ) THEN * * Test PZAMAX * CALL PZCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 1 ) ) IOFFX = IX + ( JX - 1 ) * DESCX( M_ ) IF( PISINSCOPE( ICTXT, N, IX, JX, DESCX, INCX ) ) THEN ISCLR = IZAMAX( N, X( IOFFX ), INCX ) IF( N.LT.1 ) THEN SCLR = ZERO ELSE IF( ( INCX.EQ.1 ).AND.( DESCX( M_ ).EQ.1 ).AND. $ ( N.EQ.1 ) ) THEN ISCLR = JX SCLR = X( IOFFX ) ELSE IF( INCX.EQ.DESCX( M_ ) ) THEN ISCLR = JX + ISCLR - 1 SCLR = X( IX + ( ISCLR - 1 ) * DESCX( M_ ) ) ELSE ISCLR = IX + ISCLR - 1 SCLR = X( ISCLR + ( JX - 1 ) * DESCX( M_ ) ) END IF * IF( PSCLR.NE.SCLR ) THEN IERR( 3 ) = 1 WRITE( ARGIN1, FMT = '(A)' ) 'AMAX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF * IF( PISCLR.NE.ISCLR ) THEN IERR( 5 ) = 1 WRITE( ARGIN2, FMT = '(A)' ) 'INDX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9998 ) ARGIN2 WRITE( NOUT, FMT = 9995 ) ISCLR, PISCLR END IF END IF ELSE ISCLR = 0 SCLR = ZERO IF( PSCLR.NE.SCLR ) THEN IERR( 4 ) = 1 WRITE( ARGOUT1, FMT = '(A)' ) 'AMAX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT1 WRITE( NOUT, FMT = 9996 ) SCLR, PSCLR END IF END IF IF( PISCLR.NE.ISCLR ) THEN IERR( 6 ) = 1 WRITE( ARGOUT2, FMT = '(A)' ) 'INDX' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = 9997 ) ARGOUT2 WRITE( NOUT, FMT = 9995 ) ISCLR, PISCLR END IF END IF END IF * END IF * * Find IERR across all processes * CALL IGAMX2D( ICTXT, 'All', ' ', 6, 1, IERR, 6, IDUMM, IDUMM, -1, $ -1, 0 ) * * Encode the errors found in INFO * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'X' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'Y' END IF * IF( IERR( 3 ).NE.0 ) $ INFO = INFO + 4 * IF( IERR( 4 ).NE.0 ) $ INFO = INFO + 8 * IF( IERR( 5 ).NE.0 ) $ INFO = INFO + 16 * IF( IERR( 6 ).NE.0 ) $ INFO = INFO + 32 * 9999 FORMAT( 2X, ' ***** ERROR: Vector operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** ERROR: Output scalar result ', A, $ ' in scope is incorrect.' ) 9997 FORMAT( 2X, ' ***** ERROR: Output scalar result ', A, $ ' out of scope is incorrect.' ) 9996 FORMAT( 2X, ' ***** Expected value is: ', D30.18, '+i*(', $ D30.18, '),', /2X, ' Obtained value is: ', $ D30.18, '+i*(', D30.18, ')' ) 9995 FORMAT( 2X, ' ***** Expected value is: ', I6, /2X, $ ' Obtained value is: ', I6 ) 9994 FORMAT( 2X, ' ***** Expected value is: ', D30.18, /2X, $ ' Obtained value is: ', D30.18 ) * RETURN * * End of PZBLAS1TSTCHK * END SUBROUTINE PZERRDOTU( ERRBND, N, SCLR, X, INCX, Y, INCY, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N DOUBLE PRECISION ERRBND, PREC COMPLEX*16 SCLR * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * .. * * Purpose * ======= * * PZERRDOTU serially computes the dot product X**T * Y and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If dot1 = SCLR and dot2 are two different computed results, and dot1 * is being assumed to be correct, we require * * abs( dot1 - dot2 ) <= ERRBND = ERRFACT * abs( dot1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operands. * * SCLR (global output) COMPLEX*16 * On exit, SCLR specifies the dot product of the two vectors * X and Y. * * X (global input) COMPLEX*16 array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (global input) COMPLEX*16 array * On entry, Y is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremen- * ted array Y must contain the vector y. * * INCY (global input) INTEGER. * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IX, IY DOUBLE PRECISION ADDBND, FACT, SUMINEG, SUMIPOS, SUMRNEG, $ SUMRPOS, TMP * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX * .. * .. Executable Statements .. * IX = 1 IY = 1 SCLR = ZERO SUMIPOS = ZERO SUMINEG = ZERO SUMRPOS = ZERO SUMRNEG = ZERO FACT = TWO * ( ONE + PREC ) ADDBND = TWO * TWO * TWO * PREC * DO 10 I = 1, N * SCLR = SCLR + X( IX ) * Y( IY ) * TMP = DBLE( X( IX ) ) * DBLE( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMRPOS = SUMRPOS + TMP * FACT ELSE SUMRNEG = SUMRNEG - TMP * FACT END IF * TMP = - DIMAG( X( IX ) ) * DIMAG( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMRPOS = SUMRPOS + TMP * FACT ELSE SUMRNEG = SUMRNEG - TMP * FACT END IF * TMP = DIMAG( X( IX ) ) * DBLE( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMIPOS = SUMIPOS + TMP * FACT ELSE SUMINEG = SUMINEG - TMP * FACT END IF * TMP = DBLE( X( IX ) ) * DIMAG( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMIPOS = SUMIPOS + TMP * FACT ELSE SUMINEG = SUMINEG - TMP * FACT END IF * IX = IX + INCX IY = IY + INCY * 10 CONTINUE * ERRBND = ADDBND * MAX( MAX( SUMRPOS, SUMRNEG ), $ MAX( SUMIPOS, SUMINEG ) ) * RETURN * * End of PZERRDOTU * END SUBROUTINE PZERRDOTC( ERRBND, N, SCLR, X, INCX, Y, INCY, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N DOUBLE PRECISION ERRBND, PREC COMPLEX*16 SCLR * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * .. * * Purpose * ======= * * PZERRDOTC serially computes the dot product X**H * Y and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If dot1 = SCLR and dot2 are two different computed results, and dot1 * is being assumed to be correct, we require * * abs( dot1 - dot2 ) <= ERRBND = ERRFACT * abs( dot1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operands. * * SCLR (global output) COMPLEX*16 * On exit, SCLR specifies the dot product of the two vectors * X and Y. * * X (global input) COMPLEX*16 array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (global input) COMPLEX*16 array * On entry, Y is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremen- * ted array Y must contain the vector y. * * INCY (global input) INTEGER. * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IX, IY DOUBLE PRECISION ADDBND, FACT, SUMINEG, SUMIPOS, SUMRNEG, $ SUMRPOS, TMP * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX * .. * .. Executable Statements .. * IX = 1 IY = 1 SCLR = ZERO SUMIPOS = ZERO SUMINEG = ZERO SUMRPOS = ZERO SUMRNEG = ZERO FACT = TWO * ( ONE + PREC ) ADDBND = TWO * TWO * TWO * PREC * DO 10 I = 1, N * SCLR = SCLR + DCONJG( X( IX ) ) * Y( IY ) * TMP = DBLE( X( IX ) ) * DBLE( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMRPOS = SUMRPOS + TMP * FACT ELSE SUMRNEG = SUMRNEG - TMP * FACT END IF * TMP = DIMAG( X( IX ) ) * DIMAG( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMRPOS = SUMRPOS + TMP * FACT ELSE SUMRNEG = SUMRNEG - TMP * FACT END IF * TMP = - DIMAG( X( IX ) ) * DBLE( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMIPOS = SUMIPOS + TMP * FACT ELSE SUMINEG = SUMINEG - TMP * FACT END IF * TMP = DBLE( X( IX ) ) * DIMAG( Y ( IY ) ) IF( TMP.GE.ZERO ) THEN SUMIPOS = SUMIPOS + TMP * FACT ELSE SUMINEG = SUMINEG - TMP * FACT END IF * IX = IX + INCX IY = IY + INCY * 10 CONTINUE * ERRBND = ADDBND * MAX( MAX( SUMRPOS, SUMRNEG ), $ MAX( SUMIPOS, SUMINEG ) ) * RETURN * * End of PZERRDOTC * END SUBROUTINE PZERRNRM2( ERRBND, N, USCLR, X, INCX, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N DOUBLE PRECISION ERRBND, PREC, USCLR * .. * .. Array Arguments .. COMPLEX*16 X( * ) * .. * * Purpose * ======= * * PZERRNRM2 serially computes the 2-norm the vector X and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If norm1 = SCLR and norm2 are two different computed results, and * norm1 being assumed to be correct, we require * * abs( norm1 - norm2 ) <= ERRBND = ERRFACT * abs( norm1 ), * * where ERRFACT is computed as the maximum of the positive and negative * partial sums multiplied by a constant proportional to the machine * precision. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * N (global input) INTEGER * On entry, N specifies the length of the vector operand. * * USCLR (global output) DOUBLE PRECISION * On exit, USCLR specifies the 2-norm of the vector X. * * X (global input) COMPLEX*16 array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER IX DOUBLE PRECISION ABSXI, ADDBND, FACT, SCALE, SSQ, SUMSCA, SUMSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG * .. * .. Executable Statements .. * USCLR = ZERO SUMSSQ = ONE SUMSCA = ZERO ADDBND = TWO * TWO * TWO * PREC FACT = ONE + TWO * ( ( ONE + PREC )**3 - ONE ) * SCALE = ZERO SSQ = ONE DO 10 IX = 1, 1 + ( N - 1 )*INCX, INCX IF( DBLE( X( IX ) ).NE.ZERO ) THEN ABSXI = ABS( DBLE( X( IX ) ) ) IF( SCALE.LT.ABSXI )THEN SUMSSQ = ONE + ( SSQ*( SCALE/ABSXI )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = ONE + SSQ*( SCALE/ABSXI )**2 SUMSCA = ABSXI SCALE = ABSXI ELSE SUMSSQ = SSQ + ( ( ABSXI/SCALE )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = SSQ + ( ABSXI/SCALE )**2 END IF END IF IF( DIMAG( X( IX ) ).NE.ZERO ) THEN ABSXI = ABS( DIMAG( X( IX ) ) ) IF( SCALE.LT.ABSXI )THEN SUMSSQ = ONE + ( SSQ*( SCALE/ABSXI )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = ONE + SSQ*( SCALE/ABSXI )**2 SUMSCA = ABSXI SCALE = ABSXI ELSE SUMSSQ = SSQ + ( ( ABSXI/SCALE )**2 ) * FACT ERRBND = ADDBND * SUMSSQ SUMSSQ = SUMSSQ + ERRBND SSQ = SSQ + ( ABSXI/SCALE )**2 END IF END IF 10 CONTINUE * USCLR = SCALE * SQRT( SSQ ) * * Error on square root * ERRBND = SQRT( SUMSSQ ) * ( ONE + TWO * ( 1.00001D+0 * PREC ) ) * ERRBND = ( SUMSCA * ERRBND ) - USCLR * RETURN * * End of PZERRNRM2 * END SUBROUTINE PZERRASUM( ERRBND, N, USCLR, X, INCX, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N DOUBLE PRECISION ERRBND, PREC, USCLR * .. * .. Array Arguments .. COMPLEX*16 X( * ) * .. * * Purpose * ======= * * PZERRASUM serially computes the sum of absolute values of the vector * X and returns a scaled relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies a scaled relative acceptable error * bound. In this case the error bound is just the absolute sum * multiplied by a constant proportional to the machine preci- * sion. * * N (global input) INTEGER * On entry, N specifies the length of the vector operand. * * USCLR (global output) DOUBLE PRECISION * On exit, USCLR specifies the sum of absolute values of the * vector X. * * X (global input) COMPLEX*16 array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION TWO, ZERO PARAMETER ( TWO = 2.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER IX DOUBLE PRECISION ADDBND * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG * .. * .. Executable Statements .. * IX = 1 USCLR = ZERO ADDBND = TWO * TWO * TWO * PREC * DO 10 IX = 1, 1 + ( N - 1 )*INCX, INCX USCLR = USCLR + ABS( DBLE( X( IX ) ) ) + $ ABS( DIMAG( X( IX ) ) ) 10 CONTINUE * ERRBND = ADDBND * USCLR * RETURN * * End of PZERRASUM * END SUBROUTINE PZERRSCAL( ERRBND, PSCLR, X, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION ERRBND, PREC COMPLEX*16 PSCLR, X * .. * * Purpose * ======= * * PZERRSCAL serially computes the product PSCLR * X and returns a sca- * led relative acceptable error bound on the result. * * Notes * ===== * * If s1 = PSCLR*X and s2 are two different computed results, and s1 is * being assumed to be correct, we require * * abs( s1 - s2 ) <= ERRBND = ERRFACT * abs( s1 ), * * where ERRFACT is computed as two times the machine precision. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PSCLR (global input) COMPLEX*16 * On entry, PSCLR specifies the scale factor. * * X (global input/global output) COMPLEX*16 * On entry, X specifies the scalar to be scaled. On exit, X is * the scaled entry. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION TWO PARAMETER ( TWO = 2.0D+0 ) * .. * .. Intrinsic Functions .. INTRINSIC ABS * .. * .. Executable Statements .. * X = PSCLR * X * ERRBND = ( TWO * PREC ) * ABS( X ) * RETURN * * End of PZERRSCAL * END SUBROUTINE PZDERRSCAL( ERRBND, PUSCLR, X, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION ERRBND, PREC, PUSCLR COMPLEX*16 X * .. * * Purpose * ======= * * PZDERRSCAL serially computes the product PUSCLR * X and returns a * scaled relative acceptable error bound on the result. * * Notes * ===== * * If s1 = PUSCLR*X and s2 are two different computed results, and s1 is * being assumed to be correct, we require * * abs( s1 - s2 ) <= ERRBND = ERRFACT * abs( s1 ), * * where ERRFACT is computed as two times the machine precision. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PUSCLR (global input) DOUBLE PRECISION * On entry, PUSCLR specifies the real scale factor. * * X (global input/global output) COMPLEX*16 * On entry, X specifies the scalar to be scaled. On exit, X is * the scaled entry. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION TWO PARAMETER ( TWO = 2.0D+0 ) * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DIMAG * .. * .. Executable Statements .. * X = DCMPLX( PUSCLR * DBLE( X ), PUSCLR * DIMAG( X ) ) * ERRBND = ( TWO * PREC ) * ABS( X ) * RETURN * * End of PZDERRSCAL * END SUBROUTINE PZERRAXPY( ERRBND, PSCLR, X, Y, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION ERRBND, PREC COMPLEX*16 PSCLR, X, Y * .. * * Purpose * ======= * * PZERRAXPY serially computes Y := Y + PSCLR * X and returns a scaled * relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * PSCLR (global input) COMPLEX*16 * On entry, PSCLR specifies the scale factor. * * X (global input) COMPLEX*16 * On entry, X specifies the scalar to be scaled. * * Y (global input/global output) COMPLEX*16 * On entry, Y specifies the scalar to be added. On exit, Y con- * tains the resulting scalar. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION ADDBND, FACT, SUMINEG, SUMIPOS, SUMRNEG, $ SUMRPOS COMPLEX*16 TMP * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DIMAG, MAX * .. * .. Executable Statements .. * SUMIPOS = ZERO SUMINEG = ZERO SUMRPOS = ZERO SUMRNEG = ZERO FACT = ONE + TWO * PREC ADDBND = TWO * TWO * TWO * PREC * TMP = PSCLR * X IF( DBLE( TMP ).GE.ZERO ) THEN SUMRPOS = SUMRPOS + DBLE( TMP ) * FACT ELSE SUMRNEG = SUMRNEG - DBLE( TMP ) * FACT END IF IF( DIMAG( TMP ).GE.ZERO ) THEN SUMIPOS = SUMIPOS + DIMAG( TMP ) * FACT ELSE SUMINEG = SUMINEG - DIMAG( TMP ) * FACT END IF * TMP = Y IF( DBLE( TMP ).GE.ZERO ) THEN SUMRPOS = SUMRPOS + DBLE( TMP ) ELSE SUMRNEG = SUMRNEG - DBLE( TMP ) END IF IF( DIMAG( TMP ).GE.ZERO ) THEN SUMIPOS = SUMIPOS + DIMAG( TMP ) ELSE SUMINEG = SUMINEG - DIMAG( TMP ) END IF * Y = Y + ( PSCLR * X ) * ERRBND = ADDBND * MAX( MAX( SUMRPOS, SUMRNEG ), $ MAX( SUMIPOS, SUMINEG ) ) * RETURN * * End of PZERRAXPY * END scalapack-2.0.2/PBLAS/TESTING/PZBLAS2TST.dat000644 000766 000024 00000006455 10363532303 020041 0ustar00juliestaff000000 000000 'Level 2 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PZBLAS2TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 16.0 threshold value of test ratio 10 value of the logical computational blocksize NB 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q (2.0D0, -4.0D0) value of ALPHA (3.0D0, -2.0D0) value of BETA 4 number of tests problems 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' values of UPLO 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANS 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' 'N' 'U' values of DIAG 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 values of M 8 10 12 14 16 18 20 22 24 9 11 13 15 17 19 21 values of N 31 33 35 36 34 32 30 31 33 35 36 34 32 30 38 40 values of M_A 31 33 35 36 34 32 30 31 33 35 36 34 32 30 38 40 values of N_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_A 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_A 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IA 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JA 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 values of M_X 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 values of N_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_X 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_X 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IX 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JX 30 1 30 1 30 1 30 1 30 1 30 1 30 1 30 1 values of INCX 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 values of M_Y 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 values of N_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of IMB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of INB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of MB_Y 1 2 3 4 5 6 7 8 9 8 7 5 4 3 5 6 values of NB_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of RSRC_Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 values of CSRC_Y 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of IY 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 values of JY 40 1 40 1 40 1 40 1 40 1 40 1 40 1 40 1 values of INCY PZGEMV T put F for no test in the same column PZHEMV T put F for no test in the same column PZTRMV T put F for no test in the same column PZTRSV T put F for no test in the same column PZGERU T put F for no test in the same column PZGERC T put F for no test in the same column PZHER T put F for no test in the same column PZHER2 T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/pzblas2tst.f000644 000766 000024 00000354277 11750130340 020162 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 8) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PZGEMV ', 'PZHEMV ', 'PZTRMV ', $ 'PZTRSV ', 'PZGERU ', 'PZGERC ', $ 'PZHER ', 'PZHER2 '/ END BLOCK DATA PROGRAM PZBLA2TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PZBLA2TST is the main testing program for the PBLAS Level 2 routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * from the following 61 lines: * 'Level 2 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PZBLAS2TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 16.0 threshold value of test ratio * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0D0, 0.0D0) value of ALPHA * (1.0D0, 0.0D0) value of BETA * 2 number of tests problems * 'U' 'L' values of UPLO * 'N' 'T' values of TRANS * 'N' 'U' values of DIAG * 3 4 values of M * 3 4 values of N * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_X * 6 10 values of N_X * 2 5 values of IMB_X * 2 5 values of INB_X * 2 5 values of MB_X * 2 5 values of NB_X * 0 1 values of RSRC_X * 0 0 values of CSRC_X * 1 1 values of IX * 1 1 values of JX * 1 1 values of INCX * 6 10 values of M_Y * 6 10 values of N_Y * 2 5 values of IMB_Y * 2 5 values of INB_Y * 2 5 values of MB_Y * 2 5 values of NB_Y * 0 1 values of RSRC_Y * 0 0 values of CSRC_Y * 1 1 values of IY * 1 1 values of JY * 6 1 values of INCY * PZGEMV T put F for no test in the same column * PZHEMV T put F for no test in the same column * PZTRMV T put F for no test in the same column * PZTRSV T put F for no test in the same column * PZGERU T put F for no test in the same column * PZGERC T put F for no test in the same column * PZHER T put F for no test in the same column * PZHER2 T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * ZPLXSZ INTEGER * DBLESZ and ZPLXSZ indicate the length in bytes on the given * platform for a double precision real and a double precision * complex. By default, DBLESZ is set to eight and ZPLXSZ is set * to sixteen. * * MEM COMPLEX*16 array * MEM is an array of dimension TOTMEM / ZPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, ZPLXSZ, TOTMEM, $ MEMSIZ, NSUBS, DBLESZ COMPLEX*16 ONE, PADVAL, ZERO, ROGUE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ ZPLXSZ = 16, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / ZPLXSZ, DBLESZ = 8, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ), $ ROGUE = ( -1.0D+10, 1.0D+10 ), $ ONE = ( 1.0D+0, 0.0D+0 ), NSUBS = 8 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE CHARACTER*1 AFORM, DIAG, DIAGDO, TRANS, UPLO INTEGER CSRCA, CSRCX, CSRCY, I, IA, IAM, IASEED, ICTXT, $ IGAP, IMBA, IMBX, IMBY, IMIDA, IMIDX, IMIDY, $ INBA, INBX, INBY, INCX, INCY, IPA, IPG, IPMATA, $ IPMATX, IPMATY, IPOSTA, IPOSTX, IPOSTY, IPREA, $ IPREX, IPREY, IPX, IPY, IVERB, IX, IXSEED, IY, $ IYSEED, J, JA, JX, JY, K, LDA, LDX, LDY, M, MA, $ MBA, MBX, MBY, MEMREQD, MPA, MPX, MPY, MX, MY, $ MYCOL, MYROW, N, NA, NBA, NBX, NBY, NCOLA, $ NGRIDS, NLX, NLY, NOUT, NPCOL, NPROCS, NPROW, $ NQA, NQX, NQY, NROWA, NTESTS, NX, NY, OFFD, $ RSRCA, RSRCX, RSRCY, TSKIP, TSTCNT REAL THRESH COMPLEX*16 ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL LTEST( NSUBS ), YCHECK( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), TRANVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCXVAL( MAXTESTS ), $ CSCYVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCAR( DLEN_ ), DESCX( DLEN_ ), $ DESCXR( DLEN_ ), DESCY( DLEN_ ), $ DESCYR( DLEN_ ), IAVAL( MAXTESTS ), IERR( 6 ), $ IMBAVAL( MAXTESTS ), IMBXVAL( MAXTESTS ), $ IMBYVAL( MAXTESTS ), INBAVAL( MAXTESTS ), $ INBXVAL( MAXTESTS ), INBYVAL( MAXTESTS ), $ INCXVAL( MAXTESTS ), INCYVAL( MAXTESTS ), $ IXVAL( MAXTESTS ), IYVAL( MAXTESTS ), $ JAVAL( MAXTESTS ), JXVAL( MAXTESTS ), $ JYVAL( MAXTESTS ) INTEGER KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), MAVAL( MAXTESTS ), $ MBAVAL( MAXTESTS ), MBXVAL( MAXTESTS ), $ MBYVAL( MAXTESTS ), MVAL( MAXTESTS ), $ MXVAL( MAXTESTS ), MYVAL( MAXTESTS ), $ NAVAL( MAXTESTS ), NBAVAL( MAXTESTS ), $ NBXVAL( MAXTESTS ), NBYVAL( MAXTESTS ), $ NVAL( MAXTESTS ), NXVAL( MAXTESTS ), $ NYVAL( MAXTESTS ), PVAL( MAXTESTS ), $ QVAL( MAXTESTS ), RSCAVAL( MAXTESTS ), $ RSCXVAL( MAXTESTS ), RSCYVAL( MAXTESTS ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_DESCSET2, PB_PZLAPRNT, PB_ZCHEKPAD, $ PB_ZFILLPAD, PB_ZLASCAL, PB_ZLASET, PMDESCCHK, $ PMDIMCHK, PVDESCCHK, PVDIMCHK, PZBLA2TSTINFO, $ PZBLAS2TSTCHK, PZBLAS2TSTCHKE, PZCHKARG2, $ PZCHKVOUT, PZGEMV, PZGERC, PZGERU, PZHEMV, $ PZHER, PZHER2, PZIPSET, PZLAGEN, PZLASCAL, $ PZLASET, PZMPRNT, PZTRMV, PZTRSV, PZVPRNT * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_FCEIL EXTERNAL PB_FCEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, MAX, MOD, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA YCHECK/.TRUE., .TRUE., .FALSE., .FALSE., $ .TRUE., .TRUE., .FALSE., .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, so * that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IASEED = 100 IXSEED = 200 IYSEED = 300 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PZBLA2TSTINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, $ IAVAL, JAVAL, MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, IXVAL, $ JXVAL, INCXVAL, MYVAL, NYVAL, IMBYVAL, $ MBYVAL, INBYVAL, NBYVAL, RSCYVAL, CSCYVAL, $ IYVAL, JYVAL, INCYVAL, MAXTESTS, NGRIDS, $ PVAL, MAXGRIDS, QVAL, MAXGRIDS, NBLOG, LTEST, $ SOF, TEE, IAM, IGAP, IVERB, NPROCS, THRESH, $ ALPHA, BETA, MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9975 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PZBLAS2TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) TRANS = TRANVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) INBA = INBAVAL( J ) MBA = MBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MX = MXVAL( J ) NX = NXVAL( J ) IMBX = IMBXVAL( J ) INBX = INBXVAL( J ) MBX = MBXVAL( J ) NBX = NBXVAL( J ) RSRCX = RSCXVAL( J ) CSRCX = CSCXVAL( J ) IX = IXVAL( J ) JX = JXVAL( J ) INCX = INCXVAL( J ) * MY = MYVAL( J ) NY = NYVAL( J ) IMBY = IMBYVAL( J ) INBY = INBYVAL( J ) MBY = MBYVAL( J ) NBY = NBYVAL( J ) RSRCY = RSCYVAL( J ) CSRCY = CSCYVAL( J ) IY = IYVAL( J ) JY = JYVAL( J ) INCY = INCYVAL( J ) * IF( IAM.EQ.0 ) THEN TSTCNT = TSTCNT + 1 WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, UPLO, TRANS, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IX, JX, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9988 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) IY, JY, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY * WRITE( NOUT, FMT = 9995 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANS' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) TRANS WRITE( NOUT, FMT = 9997 ) 'DIAG' TSKIP = TSKIP + 1 GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, IGAP, GAPMUL, IERR( 1 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'X', DESCX, $ BLOCK_CYCLIC_2D_INB, MX, NX, IMBX, INBX, $ MBX, NBX, RSRCX, CSRCX, INCX, MPX, NQX, $ IPREX, IMIDX, IPOSTX, IGAP, GAPMUL, $ IERR( 2 ) ) CALL PVDESCCHK( ICTXT, NOUT, 'Y', DESCY, $ BLOCK_CYCLIC_2D_INB, MY, NY, IMBY, INBY, $ MBY, NBY, RSRCY, CSRCY, INCY, MPY, NQY, $ IPREY, IMIDY, IPOSTY, IGAP, GAPMUL, $ IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDA = MAX( 1, MA ) LDX = MAX( 1, MX ) LDY = MAX( 1, MY ) * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPX = IPA + DESCA( LLD_ )*NQA + IPOSTA + IPREX IPY = IPX + DESCX( LLD_ )*NQX + IPOSTX + IPREY IPMATA = IPY + DESCY( LLD_ )*NQY + IPOSTY IPMATX = IPMATA + MA*NA IPMATY = IPMATX + MX*NX IPG = IPMATY + MAX( MX*NX, MY*NY ) * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * MEMREQD = IPG + PB_FCEIL( REAL( MAX( M, N ) ) * $ REAL( DBLESZ ), REAL( ZPLXSZ ) ) - 1 + $ MAX( MAX( IMBA, MBA ), $ MAX( MAX( IMBX, MBX ), $ MAX( IMBY, MBY ) ) ) IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9986 ) MEMREQD*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 2 routines * DO 30 K = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( K ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9985 ) SNAMES( K ) END IF * * Define the size of the operands * IF( K.EQ.1 ) THEN NROWA = M NCOLA = N IF( LSAME( TRANS, 'N' ) ) THEN NLX = N NLY = M ELSE NLX = M NLY = N END IF ELSE IF( K.EQ.5 .OR. K.EQ.6 ) THEN NROWA = M NCOLA = N NLX = M NLY = N ELSE NROWA = N NCOLA = N NLX = N NLY = N END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLX, 'X', IX, JX, DESCX, $ INCX, IERR( 2 ) ) CALL PVDIMCHK( ICTXT, NOUT, NLY, 'Y', IY, JY, DESCY, $ INCY, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 GO TO 30 END IF * * Generate distributed matrices A, X and Y * IF( K.EQ.2 .OR. K.EQ.7 .OR. K.EQ.8 ) THEN AFORM = 'H' DIAGDO = 'N' OFFD = IA - JA ELSE IF( ( K.EQ.4 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN AFORM = 'N' DIAGDO = 'D' OFFD = IA - JA ELSE AFORM = 'N' DIAGDO = 'N' OFFD = 0 END IF * CALL PZLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCX, IXSEED, MEM( IPX ), $ DESCX( LLD_ ) ) IF( YCHECK( K ) ) $ CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCY, IYSEED, MEM( IPY ), $ DESCY( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCAR, MA, NA, IMBA, INBA, MBA, NBA, $ -1, -1, ICTXT, MAX( 1, MA ) ) CALL PZLAGEN( .FALSE., AFORM, DIAGDO, OFFD, MA, NA, $ 1, 1, DESCAR, IASEED, MEM( IPMATA ), $ DESCAR( LLD_ ) ) CALL PB_DESCSET2( DESCXR, MX, NX, IMBX, INBX, MBX, NBX, $ -1, -1, ICTXT, MAX( 1, MX ) ) CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, 1, $ 1, DESCXR, IXSEED, MEM( IPMATX ), $ DESCXR( LLD_ ) ) IF( YCHECK( K ) ) THEN * CALL PB_DESCSET2( DESCYR, MY, NY, IMBY, INBY, MBY, $ NBY, -1, -1, ICTXT, MAX( 1, MY ) ) CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MY, NY, $ 1, 1, DESCYR, IYSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) * ELSE * * If Y is not needed, generate a copy of X instead * CALL PB_DESCSET2( DESCYR, MX, NX, IMBX, INBX, MBX, $ NBX, -1, -1, ICTXT, MAX( 1, MX ) ) CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MX, NX, $ 1, 1, DESCYR, IXSEED, MEM( IPMATY ), $ DESCYR( LLD_ ) ) * END IF * * Zero non referenced part of the matrices A * IF( ( K.EQ.2 .OR. K.EQ.7 .OR. K.EQ.8 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN * * The distributed matrix A is Hermitian * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of A. * CALL PZLASET( 'Upper', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA, JA+1, DESCA ) IF( K.NE.2 ) THEN CALL PB_ZLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of A. * CALL PZLASET( 'Lower', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA+1, JA, DESCA ) IF( K.NE.2 ) THEN CALL PB_ZLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * END IF * ELSE IF( K.EQ.3 .OR. K.EQ.4 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix A is lower triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PZLASET( 'Upper', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA, $ JA+1, DESCA ) CALL PB_ZLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA-1+JA*LDA ), $ LDA ) END IF * ELSE * CALL PZLASET( 'Upper', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_ZLASET( 'Upper', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( K.EQ.4 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / $ DCMPLX( DBLE( MAX( NROWA, NCOLA ) ) ) CALL PZLASCAL( 'Lower', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA+1, JA, $ DESCA ) CALL PB_ZLASCAL( 'Lower', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix A is upper triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PZLASET( 'Lower', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA+1, $ JA, DESCA ) CALL PB_ZLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * ELSE * CALL PZLASET( 'Lower', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_ZLASET( 'Lower', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( K.EQ.4 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / $ DCMPLX( DBLE( MAX( NROWA, NCOLA ) ) ) CALL PZLASCAL( 'Upper', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA, JA+1, $ DESCA ) CALL PB_ZLASCAL( 'Upper', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * END IF * END IF * END IF * * Pad the guard zones of A, X and Y * CALL PB_ZFILLPAD( ICTXT, MPA, NQA, MEM( IPA-IPREA ), $ DESCA( LLD_ ), IPREA, IPOSTA, PADVAL ) * CALL PB_ZFILLPAD( ICTXT, MPX, NQX, MEM( IPX-IPREX ), $ DESCX( LLD_ ), IPREX, IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_ZFILLPAD( ICTXT, MPY, NQY, MEM( IPY-IPREY ), $ DESCY( LLD_ ), IPREY, IPOSTY, $ PADVAL ) END IF * * Initialize the check for INPUT-only arguments. * INFO = 0 CALL PZCHKARG2( ICTXT, NOUT, SNAMES( K ), UPLO, TRANS, $ DIAG, M, N, ALPHA, IA, JA, DESCA, IX, $ JX, DESCX, INCX, BETA, IY, JY, DESCY, $ INCY, INFO ) * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN CALL PB_PZLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_INITIAL_A', $ NOUT, MEM( IPG ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PZLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, 0, $ 0, 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPG ) ) END IF * IF( IVERB.EQ.2 ) THEN IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PZLAPRNT( 1, NLX, MEM( IPX ), IX, JX, $ DESCX, 0, 0, $ 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) ELSE CALL PB_PZLAPRNT( NLX, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, $ 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PZLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, 0, $ 0, 'PARALLEL_INITIAL_X', NOUT, $ MEM( IPG ) ) END IF * IF( YCHECK( K ) ) THEN IF( IVERB.EQ.2 ) THEN IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PZLAPRNT( 1, NLY, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) ELSE CALL PB_PZLAPRNT( NLY, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, $ 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) END IF ELSE IF( IVERB.GE.3 ) THEN CALL PB_PZLAPRNT( MY, NY, MEM( IPY ), 1, 1, DESCY, $ 0, 0, 'PARALLEL_INITIAL_Y', NOUT, $ MEM( IPG ) ) END IF END IF * * Call the Level 2 PBLAS routine * INFO = 0 IF( K.EQ.1 ) THEN * * Test PZGEMV * CALL PZGEMV( TRANS, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) * ELSE IF( K.EQ.2 ) THEN * * Test PZHEMV * CALL PZIPSET( 'Bignum', N, MEM( IPA ), IA, JA, DESCA ) * CALL PZHEMV( UPLO, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPY ), IY, JY, DESCY, INCY ) * CALL PZIPSET( 'Zero', N, MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.3 ) THEN * * Test PZTRMV * CALL PZTRMV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) * ELSE IF( K.EQ.4 ) THEN * * Test PZTRSV * CALL PZTRSV( UPLO, TRANS, DIAG, N, MEM( IPA ), IA, JA, $ DESCA, MEM( IPX ), IX, JX, DESCX, INCX ) * ELSE IF( K.EQ.5 ) THEN * * Test PZGERU * CALL PZGERU( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.6 ) THEN * * Test PZGERC * CALL PZGERC( M, N, ALPHA, MEM( IPX ), IX, JX, DESCX, $ INCX, MEM( IPY ), IY, JY, DESCY, INCY, $ MEM( IPA ), IA, JA, DESCA ) * ELSE IF( K.EQ.7 ) THEN * * Test PZHER * IF( DCMPLX( DBLE( ALPHA ) ).NE.ZERO ) $ CALL PZIPSET( 'Bignum', N, MEM( IPA ), IA, JA, $ DESCA ) * CALL PZHER( UPLO, N, DBLE( ALPHA ), MEM( IPX ), IX, $ JX, DESCX, INCX, MEM( IPA ), IA, JA, $ DESCA ) * ELSE IF( K.EQ.8 ) THEN * * Test PZHER2 * IF( ALPHA.NE.ZERO ) $ CALL PZIPSET( 'Bignum', N, MEM( IPA ), IA, JA, $ DESCA ) * CALL PZHER2( UPLO, N, ALPHA, MEM( IPX ), IX, JX, $ DESCX, INCX, MEM( IPY ), IY, JY, DESCY, $ INCY, MEM( IPA ), IA, JA, DESCA ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( K ) = KSKIP( K ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9974 ) INFO GO TO 30 END IF * * Check padding * CALL PB_ZCHEKPAD( ICTXT, SNAMES( K ), MPA, NQA, $ MEM( IPA-IPREA ), DESCA( LLD_ ), IPREA, $ IPOSTA, PADVAL ) * CALL PB_ZCHEKPAD( ICTXT, SNAMES( K ), MPX, NQX, $ MEM( IPX-IPREX ), DESCX( LLD_ ), IPREX, $ IPOSTX, PADVAL ) * IF( YCHECK( K ) ) THEN CALL PB_ZCHEKPAD( ICTXT, SNAMES( K ), MPY, NQY, $ MEM( IPY-IPREY ), DESCY( LLD_ ), $ IPREY, IPOSTY, PADVAL ) END IF * * Check the computations * CALL PZBLAS2TSTCHK( ICTXT, NOUT, K, UPLO, TRANS, DIAG, M, $ N, ALPHA, MEM( IPMATA ), MEM( IPA ), $ IA, JA, DESCA, MEM( IPMATX ), $ MEM( IPX ), IX, JX, DESCX, INCX, $ BETA, MEM( IPMATY ), MEM( IPY ), IY, $ JY, DESCY, INCY, THRESH, ROGUE, $ MEM( IPG ), INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 ) THEN IERR( 3 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 IERR( 3 ) = 1 END IF * * Check input-only scalar arguments * INFO = 1 CALL PZCHKARG2( ICTXT, NOUT, SNAMES( K ), UPLO, TRANS, $ DIAG, M, N, ALPHA, IA, JA, DESCA, IX, $ JX, DESCX, INCX, BETA, IY, JY, DESCY, $ INCY, INFO ) * * Check input-only array arguments * CALL PZCHKMOUT( NROWA, NCOLA, MEM( IPMATA ), MEM( IPA ), $ IA, JA, DESCA, IERR( 4 ) ) CALL PZCHKVOUT( NLX, MEM( IPMATX ), MEM( IPX ), IX, JX, $ DESCX, INCX, IERR( 5 ) ) * IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_A', $ SNAMES( K ) END IF * IF( IERR( 5 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_X', $ SNAMES( K ) END IF * IF( YCHECK( K ) ) THEN CALL PZCHKVOUT( NLY, MEM( IPMATY ), MEM( IPY ), IY, $ JY, DESCY, INCY, IERR( 6 ) ) IF( IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9982 ) 'PARALLEL_Y', $ SNAMES( K ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE.0 .OR. IERR( 5 ).NE.0 .OR. $ IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) SNAMES( K ) KFAIL( K ) = KFAIL( K ) + 1 ERRFLG = .TRUE. ELSE IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) SNAMES( K ) KPASS( K ) = KPASS( K ) + 1 END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PZMPRNT( ICTXT, NOUT, MA, NA, MEM( IPMATA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PZLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( ( NROWA.GT.0 ).AND.( NCOLA.GT.0 ) ) $ CALL PZMPRNT( ICTXT, NOUT, NROWA, NCOLA, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PZLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_A', $ NOUT, MEM( IPMATA ) ) END IF IF( IERR( 5 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PZMPRNT( ICTXT, NOUT, MX, NX, MEM( IPMATX ), $ LDX, 0, 0, 'SERIAL_X' ) CALL PB_PZLAPRNT( MX, NX, MEM( IPX ), 1, 1, DESCX, $ 0, 0, 'PARALLEL_X', NOUT, $ MEM( IPMATX ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( NLX.GT.0 ) $ CALL PZVPRNT( ICTXT, NOUT, NLX, $ MEM( IPMATX+IX-1+(JX-1)*LDX ), $ INCX, 0, 0, 'SERIAL_X' ) IF( INCX.EQ.DESCX( M_ ) ) THEN CALL PB_PZLAPRNT( 1, NLX, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PZLAPRNT( NLX, 1, MEM( IPX ), IX, JX, $ DESCX, 0, 0, 'PARALLEL_X', $ NOUT, MEM( IPMATX ) ) END IF END IF IF( YCHECK( K ) ) THEN IF( IERR( 6 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PZMPRNT( ICTXT, NOUT, MY, NY, $ MEM( IPMATY ), LDY, 0, 0, $ 'SERIAL_Y' ) CALL PB_PZLAPRNT( MY, NY, MEM( IPY ), 1, 1, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE IF( IERR( 3 ).NE.0 ) THEN IF( NLY.GT.0 ) $ CALL PZVPRNT( ICTXT, NOUT, NLY, $ MEM( IPMATY+IY-1+(JY-1)*LDY ), $ INCY, 0, 0, 'SERIAL_Y' ) IF( INCY.EQ.DESCY( M_ ) ) THEN CALL PB_PZLAPRNT( 1, NLY, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) ELSE CALL PB_PZLAPRNT( NLY, 1, MEM( IPY ), IY, JY, $ DESCY, 0, 0, 'PARALLEL_Y', $ NOUT, MEM( IPMATX ) ) END IF END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9981 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9979 ) WRITE( NOUT, FMT = 9978 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9980 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '--------------------------' ) 9994 FORMAT( 2X, ' M N UPLO TRANS DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,9X,A1,11X,A1,10X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IX JX MX NX IMBX INBX', $ ' MBX NBX RSRCX CSRCX INCX' ) 9989 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5,1X,I6 ) 9988 FORMAT( 2X, ' IY JY MY NY IMBY INBY', $ ' MBY NBY RSRCY CSRCY INCY' ) 9987 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9986 FORMAT( 'Not enough memory. Need: ', I12 ) 9985 FORMAT( 2X, ' Tested Subroutine: ', A ) 9984 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9983 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9982 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9981 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9980 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9979 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9978 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9977 FORMAT( 2X, 'Testing Summary') 9976 FORMAT( 2X, 'End of Tests.' ) 9975 FORMAT( 2X, 'Tests started.' ) 9974 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PZBLA2TST * END SUBROUTINE PZBLA2TSTINFO( SUMMRY, NOUT, NMAT, DIAGVAL, TRANVAL, $ UPLOVAL, MVAL, NVAL, MAVAL, NAVAL, $ IMBAVAL, MBAVAL, INBAVAL, NBAVAL, $ RSCAVAL, CSCAVAL, IAVAL, JAVAL, $ MXVAL, NXVAL, IMBXVAL, MBXVAL, $ INBXVAL, NBXVAL, RSCXVAL, CSCXVAL, $ IXVAL, JXVAL, INCXVAL, MYVAL, NYVAL, $ IMBYVAL, MBYVAL, INBYVAL, NBYVAL, $ RSCYVAL, CSCYVAL, IYVAL, JYVAL, $ INCYVAL, LDVAL, NGRIDS, PVAL, LDPVAL, $ QVAL, LDQVAL, NBLOG, LTEST, SOF, TEE, $ IAM, IGAP, IVERB, NPROCS, THRESH, ALPHA, $ BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, NBLOG, $ NGRIDS, NMAT, NOUT, NPROCS REAL THRESH COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), TRANVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCXVAL( LDVAL ), $ CSCYVAL( LDVAL ), IAVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBXVAL( LDVAL ), $ IMBYVAL( LDVAL ), INBAVAL( LDVAL ), $ INBXVAL( LDVAL ), INBYVAL( LDVAL ), $ INCXVAL( LDVAL ), INCYVAL( LDVAL ), $ IXVAL( LDVAL ), IYVAL( LDVAL ), JAVAL( LDVAL ), $ JXVAL( LDVAL ), JYVAL( LDVAL ), MAVAL( LDVAL ), $ MBAVAL( LDVAL ), MBXVAL( LDVAL ), $ MBYVAL( LDVAL ), MVAL( LDVAL ), MXVAL( LDVAL ), $ MYVAL( LDVAL ), NAVAL( LDVAL ), $ NBAVAL( LDVAL ), NBXVAL( LDVAL ), $ NBYVAL( LDVAL ), NVAL( LDVAL ), NXVAL( LDVAL ), $ NYVAL( LDVAL ), PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCXVAL( LDVAL ), $ RSCYVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PZBLA2TSTINFO get the needed startup information for testing various * Level 2 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * TRANVAL (global output) CHARACTER array * On entry, TRANVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANS to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MXVAL (global output) INTEGER array * On entry, MXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( M_ ) to run the code * with. * * NXVAL (global output) INTEGER array * On entry, NXVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCX( N_ ) to run the code * with. * * IMBXVAL (global output) INTEGER array * On entry, IMBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( IMB_ ) to run the * code with. * * MBXVAL (global output) INTEGER array * On entry, MBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( MB_ ) to run the * code with. * * INBXVAL (global output) INTEGER array * On entry, INBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( INB_ ) to run the * code with. * * NBXVAL (global output) INTEGER array * On entry, NBXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( NB_ ) to run the * code with. * * RSCXVAL (global output) INTEGER array * On entry, RSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( RSRC_ ) to run the * code with. * * CSCXVAL (global output) INTEGER array * On entry, CSCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCX( CSRC_ ) to run the * code with. * * IXVAL (global output) INTEGER array * On entry, IXVAL is an array of dimension LDVAL. On exit, this * array contains the values of IX to run the code with. * * JXVAL (global output) INTEGER array * On entry, JXVAL is an array of dimension LDVAL. On exit, this * array contains the values of JX to run the code with. * * INCXVAL (global output) INTEGER array * On entry, INCXVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCX to run the code with. * * MYVAL (global output) INTEGER array * On entry, MYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( M_ ) to run the code * with. * * NYVAL (global output) INTEGER array * On entry, NYVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCY( N_ ) to run the code * with. * * IMBYVAL (global output) INTEGER array * On entry, IMBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( IMB_ ) to run the * code with. * * MBYVAL (global output) INTEGER array * On entry, MBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( MB_ ) to run the * code with. * * INBYVAL (global output) INTEGER array * On entry, INBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( INB_ ) to run the * code with. * * NBYVAL (global output) INTEGER array * On entry, NBYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( NB_ ) to run the * code with. * * RSCYVAL (global output) INTEGER array * On entry, RSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( RSRC_ ) to run the * code with. * * CSCYVAL (global output) INTEGER array * On entry, CSCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCY( CSRC_ ) to run the * code with. * * IYVAL (global output) INTEGER array * On entry, IYVAL is an array of dimension LDVAL. On exit, this * array contains the values of IY to run the code with. * * JYVAL (global output) INTEGER array * On entry, JYVAL is an array of dimension LDVAL. On exit, this * array contains the values of JY to run the code with. * * INCYVAL (global output) INTEGER array * On entry, INCYVAL is an array of dimension LDVAL. On exit, * this array contains the values of INCY to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, TRANS, UPLO, M, N, DESCA(:), * IA, JA, DESCX(:), IX, JX, INCX, DESCY(:), IY, JY and INCY. * This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eight. On * exit, if LTEST( i ) is .TRUE., the i-th Level 2 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * THRESH (global output) REAL * On exit, THRESH specifies the threshhold value for the test * ratio. * * ALPHA (global output) COMPLEX*16 * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) COMPLEX*16 * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+37*NMAT+NSUBS+4 ) with NSUBS equal to 8. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J DOUBLE PRECISION EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D, ZGEBR2D, ZGEBS2D *ype real dble cplx zplx * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PZBLAS2TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Read the threshold value for test ratio * READ( NIN, FMT = * ) THRESH IF( THRESH.LT.0.0 ) $ THRESH = 16.0 * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRANVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCXVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JYVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INCYVAL( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( TRANVAL( J ) ) WORK( I+2 ) = ICHAR( UPLOVAL( J ) ) I = I + 3 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCXVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JYVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INCYVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 2 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex double precision '// $ 'Level 2 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9979 ) NBLOG WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9980 ) THRESH WRITE( NOUT, FMT = 9982 ) ALPHA WRITE( NOUT, FMT = 9981 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 37*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) TRANVAL( J ) = CHAR( WORK( I+1 ) ) UPLOVAL( J ) = CHAR( WORK( I+2 ) ) I = I + 3 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCXVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JYVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INCYVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9981 FORMAT( 2X, 'Beta : (', G16.6, $ ',', G16.6, ')' ) 9980 FORMAT( 2X, 'Threshold value : ', G16.6 ) 9979 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PZBLA2TSTINFO * END SUBROUTINE PZBLAS2TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PZBLAS2TSTCHKE tests the error exits of the Level 2 PBLAS. * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 8 (NSUBS). * If LTEST( 1 ) is .TRUE., PZGEMV will be tested; * If LTEST( 2 ) is .TRUE., PZHEMV will be tested; * If LTEST( 3 ) is .TRUE., PZTRMV will be tested; * If LTEST( 4 ) is .TRUE., PZTRSV will be tested; * If LTEST( 5 ) is .TRUE., PZGERU will be tested; * If LTEST( 6 ) is .TRUE., PZGERC will be tested; * If LTEST( 7 ) is .TRUE., PZHER will be tested; * If LTEST( 8 ) is .TRUE., PZHER2 will be tested; * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 8 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PZDIMEE, PZGEMV, PZGERC, $ PZGERU, PZHEMV, PZHER, PZHER2, PZMATEE, $ PZOPTEE, PZTRMV, PZTRSV, PZVECEE * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/21, 22, 23, 23, 24, 24, 26, 27/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PZGEMV * I = 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZGEMV, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZGEMV, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZGEMV, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZGEMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PZHEMV * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZHEMV, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZHEMV, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZHEMV, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZHEMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PZTRMV * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZTRMV, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZTRMV, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZTRMV, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZTRMV, SCODE( I ), SNAMES( I ) ) END IF * * Test PZTRSV * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZTRSV, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZTRSV, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZTRSV, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZTRSV, SCODE( I ), SNAMES( I ) ) END IF * * Test PZGERU * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZGERU, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZGERU, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZGERU, SCODE( I ), SNAMES( I ) ) END IF * * Test PZGERC * I = I + 1 IF( LTEST( I ) ) THEN CALL PZDIMEE( ICTXT, NOUT, PZGERC, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZGERC, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZGERC, SCODE( I ), SNAMES( I ) ) END IF * * Test PZHER * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZHER, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZHER, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZHER, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZHER, SCODE( I ), SNAMES( I ) ) END IF * * Test PZHER2 * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZHER2, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZHER2, SCODE( I ), SNAMES( I ) ) CALL PZVECEE( ICTXT, NOUT, PZHER2, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZHER2, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PZBLAS2TSTCHKE * END SUBROUTINE PZCHKARG2( ICTXT, NOUT, SNAME, UPLO, TRANS, DIAG, M, $ N, ALPHA, IA, JA, DESCA, IX, JX, DESCX, $ INCX, BETA, IY, JY, DESCY, INCY, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N, NOUT COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*(*) SNAME INTEGER DESCA( * ), DESCX( * ), DESCY( * ) * .. * * Purpose * ======= * * PZCHKARG2 checks the input-only arguments of the Level 2 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the UPLO option in the Level 2 PBLAS * operation. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the TRANS option in the Level 2 * PBLAS operation. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies the DIAG option in the Level 2 PBLAS * operation. * * M (global input) INTEGER * On entry, M specifies the dimension of the submatrix ope- * rands. * * N (global input) INTEGER * On entry, N specifies the dimension of the submatrix ope- * rands. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 DIAGREF, TRANSREF, UPLOREF INTEGER I, IAREF, INCXREF, INCYREF, IXREF, IYREF, $ JAREF, JXREF, JYREF, MREF, MYCOL, MYROW, NPCOL, $ NPROW, NREF COMPLEX*16 ALPHAREF, BETAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCAREF( DLEN_ ), DESCXREF( DLEN_ ), $ DESCYREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * DIAGREF = DIAG TRANSREF = TRANS UPLOREF = UPLO MREF = M NREF = N ALPHAREF = ALPHA IAREF = IA JAREF = JA DO 10 I = 1, DLEN_ DESCAREF( I ) = DESCA( I ) 10 CONTINUE IXREF = IX JXREF = JX DO 20 I = 1, DLEN_ DESCXREF( I ) = DESCX( I ) 20 CONTINUE INCXREF = INCX BETAREF = BETA IYREF = IY JYREF = JY DO 30 I = 1, DLEN_ DESCYREF( I ) = DESCY( I ) 30 CONTINUE INCYREF = INCY * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( .NOT. LSAME( DIAG, DIAGREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DIAG' ELSE IF( .NOT. LSAME( TRANS, TRANSREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANS' ELSE IF( .NOT. LSAME( UPLO, UPLOREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'UPLO' ELSE IF( M.NE.MREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'M' ELSE IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE IF( IA.NE.IAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IA' ELSE IF( JA.NE.JAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JA' ELSE IF( DESCA( DTYPE_ ).NE.DESCAREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( DTYPE_ )' ELSE IF( DESCA( M_ ).NE.DESCAREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( M_ )' ELSE IF( DESCA( N_ ).NE.DESCAREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( N_ )' ELSE IF( DESCA( IMB_ ).NE.DESCAREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( IMB_ )' ELSE IF( DESCA( INB_ ).NE.DESCAREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( INB_ )' ELSE IF( DESCA( MB_ ).NE.DESCAREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( MB_ )' ELSE IF( DESCA( NB_ ).NE.DESCAREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( NB_ )' ELSE IF( DESCA( RSRC_ ).NE.DESCAREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( RSRC_ )' ELSE IF( DESCA( CSRC_ ).NE.DESCAREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CSRC_ )' ELSE IF( DESCA( CTXT_ ).NE.DESCAREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CTXT_ )' ELSE IF( DESCA( LLD_ ).NE.DESCAREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( LLD_ )' ELSE IF( IX.NE.IXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IX' ELSE IF( JX.NE.JXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JX' ELSE IF( DESCX( DTYPE_ ).NE.DESCXREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( DTYPE_ )' ELSE IF( DESCX( M_ ).NE.DESCXREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( M_ )' ELSE IF( DESCX( N_ ).NE.DESCXREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( N_ )' ELSE IF( DESCX( IMB_ ).NE.DESCXREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( IMB_ )' ELSE IF( DESCX( INB_ ).NE.DESCXREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( INB_ )' ELSE IF( DESCX( MB_ ).NE.DESCXREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( MB_ )' ELSE IF( DESCX( NB_ ).NE.DESCXREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( NB_ )' ELSE IF( DESCX( RSRC_ ).NE.DESCXREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( RSRC_ )' ELSE IF( DESCX( CSRC_ ).NE.DESCXREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CSRC_ )' ELSE IF( DESCX( CTXT_ ).NE.DESCXREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( CTXT_ )' ELSE IF( DESCX( LLD_ ).NE.DESCXREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCX( LLD_ )' ELSE IF( INCX.NE.INCXREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCX' ELSE IF( BETA.NE.BETAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'BETA' ELSE IF( IY.NE.IYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IY' ELSE IF( JY.NE.JYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JY' ELSE IF( DESCY( DTYPE_ ).NE.DESCYREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( DTYPE_ )' ELSE IF( DESCY( M_ ).NE.DESCYREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( M_ )' ELSE IF( DESCY( N_ ).NE.DESCYREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( N_ )' ELSE IF( DESCY( IMB_ ).NE.DESCYREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( IMB_ )' ELSE IF( DESCY( INB_ ).NE.DESCYREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( INB_ )' ELSE IF( DESCY( MB_ ).NE.DESCYREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( MB_ )' ELSE IF( DESCY( NB_ ).NE.DESCYREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( NB_ )' ELSE IF( DESCY( RSRC_ ).NE.DESCYREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( RSRC_ )' ELSE IF( DESCY( CSRC_ ).NE.DESCYREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CSRC_ )' ELSE IF( DESCY( CTXT_ ).NE.DESCYREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( CTXT_ )' ELSE IF( DESCY( LLD_ ).NE.DESCYREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCY( LLD_ )' ELSE IF( INCY.NE.INCYREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'INCY' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.NE.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PZCHKARG2 * END SUBROUTINE PZBLAS2TSTCHK( ICTXT, NOUT, NROUT, UPLO, TRANS, DIAG, $ M, N, ALPHA, A, PA, IA, JA, DESCA, X, $ PX, IX, JX, DESCX, INCX, BETA, Y, PY, $ IY, JY, DESCY, INCY, THRESH, ROGUE, $ WORK, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N, NOUT, NROUT REAL THRESH COMPLEX*16 ALPHA, BETA, ROGUE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION WORK( * ) COMPLEX*16 A( * ), PA( * ), PX( * ), PY( * ), X( * ), $ Y( * ) * .. * * Purpose * ======= * * PZBLAS2TSTCHK performs the computational tests of the Level 2 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PZGEMV will be tested; * else if NROUT = 2, PZHEMV will be tested; * else if NROUT = 3, PZTRMV will be tested; * else if NROUT = 4, PZTRSV will be tested; * else if NROUT = 5, PZGERU will be tested; * else if NROUT = 6, PZGERC will be tested; * else if NROUT = 7, PZHER will be tested; * else if NROUT = 8, PZHER2 will be tested; * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies if the upper or lower part of the * matrix operand is to be referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies if the matrix operand A is to be * transposed. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the triangular matrix operand is * unit or non-unit. * * M (global input) INTEGER * On entry, M specifies the number of rows of A. * * N (global input) INTEGER * On entry, N specifies the number of columns of A. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX*16 array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) COMPLEX*16 array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) COMPLEX*16 array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * Y (local input/local output) COMPLEX*16 array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) COMPLEX*16 array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * THRESH (global input) REAL * On entry, THRESH is the threshold value for the test ratio. * * ROGUE (global input) COMPLEX*16 * On entry, ROGUE specifies the constant used to pad the * non-referenced part of triangular, symmetric or Hermitian ma- * trices. * * WORK (workspace) DOUBLE PRECISION array * On entry, WORK is an array of dimension LWORK where LWORK is * at least MAX( M, N ). This array is used to store the compu- * ted gauges (see PZMVCH). * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on A has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on X has been found, * if( MOD( INFO/4, 2 ) = 1 ) then an error on Y has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D+0 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR COMPLEX*16 ALPHA1 * .. * .. Local Arrays .. INTEGER IERR( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_ZLASET, PZCHKMIN, PZCHKVIN, $ PZMVCH, PZTRMV, PZVMCH, PZVMCH2, ZTRSV * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, DBLE * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 10 I = 1, 3 IERR( I ) = 0 10 CONTINUE * IF( NROUT.EQ.1 ) THEN * * Test PZGEMV * * Check the resulting vector Y * CALL PZMVCH( ICTXT, TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, $ IX, JX, DESCX, INCX, BETA, Y, PY, IY, JY, DESCY, $ INCY, WORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PZCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) IF( LSAME( TRANS, 'N' ) ) THEN CALL PZCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, $ IERR( 2 ) ) ELSE CALL PZCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, $ IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.2 ) THEN * * Test PZHEMV * * Check the resulting vector Y * CALL PZMVCH( ICTXT, 'No transpose', N, N, ALPHA, A, IA, JA, $ DESCA, X, IX, JX, DESCX, INCX, BETA, Y, PY, IY, $ JY, DESCY, INCY, WORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN CALL PB_ZLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF CALL PZCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PZCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) * ELSE IF( NROUT.EQ.3 ) THEN * * Test PZTRMV * * Check the resulting vector X * CALL PZMVCH( ICTXT, TRANS, N, N, ONE, A, IA, JA, DESCA, Y, IX, $ JX, DESCX, INCX, ZERO, X, PX, IX, JX, DESCX, INCX, $ WORK, ERR, IERR( 2 ) ) * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF END IF CALL PZCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) * ELSE IF( NROUT.EQ.4 ) THEN * * Test PZTRSV * * Check the resulting vector X * CALL ZTRSV( UPLO, TRANS, DIAG, N, A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ), X( IX+(JX-1)*DESCX( M_ ) ), INCX ) CALL PZTRMV( UPLO, TRANS, DIAG, N, PA, IA, JA, DESCA, PX, IX, $ JX, DESCX, INCX ) CALL PZMVCH( ICTXT, TRANS, N, N, ONE, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX, ZERO, Y, PX, IX, JX, DESCX, INCX, $ WORK, ERR, IERR( 2 ) ) * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ) ) END IF END IF CALL PZCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) * ELSE IF( NROUT.EQ.5 ) THEN * * Test PZGERU * * Check the resulting matrix A * CALL PZVMCH( ICTXT, 'No transpose', 'Ge', M, N, ALPHA, X, IX, $ JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, PA, $ IA, JA, DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PZCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PZCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * ELSE IF( NROUT.EQ.6 ) THEN * * Test PZGERC * * Check the resulting matrix A * CALL PZVMCH( ICTXT, 'Conjugate transpose', 'Ge', M, N, ALPHA, $ X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, $ A, PA, IA, JA, DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PZCHKVIN( ERR, M, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PZCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * ELSE IF( NROUT.EQ.7 ) THEN * * Test PZHER * * Check the resulting matrix A * ALPHA1 = DCMPLX( DBLE( ALPHA ), RZERO ) CALL PZVMCH( ICTXT, 'Conjugate transpose', UPLO, N, N, ALPHA1, $ X, IX, JX, DESCX, INCX, X, IX, JX, DESCX, INCX, A, $ PA, IA, JA, DESCA, WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PZCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) * ELSE IF( NROUT.EQ.8 ) THEN * * Test PZHER2 * * Check the resulting matrix A * CALL PZVMCH2( ICTXT, UPLO, N, N, ALPHA, X, IX, JX, DESCX, INCX, $ Y, IY, JY, DESCY, INCY, A, PA, IA, JA, DESCA, $ WORK, ERR, IERR( 1 ) ) IF( IERR( 1 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9996 ) ERR END IF * * Check the input-only arguments * CALL PZCHKVIN( ERR, N, X, PX, IX, JX, DESCX, INCX, IERR( 2 ) ) CALL PZCHKVIN( ERR, N, Y, PY, IY, JY, DESCY, INCY, IERR( 3 ) ) * END IF * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'A' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'X' END IF * IF( IERR( 3 ).NE.0 ) THEN INFO = INFO + 4 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'Y' END IF * 9999 FORMAT( 2X, ' ***** ERROR: Matrix operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** ERROR: Vector operand ', A, $ ' is incorrect.' ) 9997 FORMAT( 2X, ' ***** FATAL ERROR - Computed result is less ', $ 'than half accurate *****' ) 9996 FORMAT( 2X, ' ***** Test completed with maximum test ratio: ', $ F11.5, ' SUSPECT *****' ) * RETURN * * End of PZBLAS2TSTCHK * END scalapack-2.0.2/PBLAS/TESTING/PZBLAS3TST.dat000644 000766 000024 00000005157 10363532303 020040 0ustar00juliestaff000000 000000 'Level 3 PBLAS, Testing input file' 'Intel iPSC/860 hypercube, gamma model.' 'PZBLAS3TST.SUMM' output file name (if any) 6 device out F logical flag, T to stop on failures T logical flag, T to test error exits 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors 10 the leading dimension gap 16.0 threshold value of test ratio 10 value of the logical computational blocksize NB 4 number of process grids (ordered pairs of P & Q) 2 1 2 1 4 2 3 8 values of P 2 2 1 4 1 3 2 1 values of Q (2.0D0, -4.0D0) value of ALPHA (3.0D0, -2.0D0) value of BETA 4 number of tests problems 'N' 'N' 'N' 'N' 'N' 'N' 'N' 'N' values of DIAG 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' values of SIDE 'N' 'N' 'N' 'N' 'T' 'T' 'T' 'T' values of TRANSA 'N' 'N' 'T' 'T' 'N' 'N' 'T' 'T' values of TRANSB 'U' 'L' 'U' 'L' 'U' 'L' 'U' 'L' values of UPLO 22 30 8 20 3 5 11 8 values of M 23 29 7 21 3 14 11 8 values of N 24 28 9 22 3 9 11 8 values of K 31 31 31 31 31 31 31 31 values of M_A 31 31 31 31 31 31 31 31 values of N_A 4 5 6 3 2 1 4 5 values of IMB_A 4 5 6 3 2 1 4 5 values of INB_A 4 5 6 3 2 1 4 5 values of MB_A 4 5 6 3 2 1 4 5 values of NB_A 0 0 0 0 0 0 0 0 values of RSRC_A 0 0 0 0 0 0 0 0 values of CSRC_A 5 1 7 7 1 12 1 11 values of IA 5 1 7 7 1 12 1 11 values of JA 32 32 32 32 32 32 32 32 values of M_B 32 32 32 32 32 32 32 32 values of N_B 4 5 6 3 2 1 4 5 values of IMB_B 4 5 6 3 2 1 4 5 values of INB_B 4 5 6 3 2 1 4 5 values of MB_B 4 5 6 3 2 1 4 5 values of NB_B 0 0 0 0 0 0 0 0 values of RSRC_B 0 0 0 0 0 0 0 0 values of CSRC_B 5 1 7 7 1 12 1 11 values of IB 5 1 7 7 1 12 1 11 values of JB 35 35 35 35 35 35 35 35 values of M_C 35 35 35 35 35 35 35 35 values of N_C 4 5 6 3 2 1 4 5 values of IMB_C 4 5 6 3 2 1 4 5 values of INB_C 4 5 6 3 2 1 4 5 values of MB_C 4 5 6 3 2 1 4 5 values of NB_C 0 0 0 0 0 0 0 0 values of RSRC_C 0 0 0 0 0 0 0 0 values of CSRC_C 5 1 7 7 1 12 1 11 values of IC 5 1 7 7 1 12 1 11 values of JC PZGEMM T put F for no test in the same column PZSYMM T put F for no test in the same column PZHEMM T put F for no test in the same column PZSYRK T put F for no test in the same column PZHERK T put F for no test in the same column PZSYR2K T put F for no test in the same column PZHER2K T put F for no test in the same column PZTRMM T put F for no test in the same column PZTRSM T put F for no test in the same column PZGEADD T put F for no test in the same column PZTRADD T put F for no test in the same column scalapack-2.0.2/PBLAS/TESTING/pzblas3tst.f000644 000766 000024 00000421432 11750130340 020147 0ustar00juliestaff000000 000000 BLOCK DATA INTEGER NSUBS PARAMETER (NSUBS = 11) CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES DATA SNAMES/'PZGEMM ', 'PZSYMM ', 'PZHEMM ', $ 'PZSYRK ', 'PZHERK ', 'PZSYR2K', $ 'PZHER2K', 'PZTRMM ', 'PZTRSM ', $ 'PZGEADD', 'PZTRADD'/ END BLOCK DATA PROGRAM PZBLA3TST * * -- PBLAS testing driver (version 2.0.2) -- * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver * May 1 2012 * * Purpose * ======= * * PZBLA3TST is the main testing program for the Level 3 PBLAS routines. * * The program must be driven by a short data file. An annotated exam- * ple of a data file can be obtained by deleting the first 3 characters * * from the following 64 lines: * 'Level 3 PBLAS, Testing input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'PZBLAS3TST.SUMM' output file name (if any) * 6 device out * F logical flag, T to stop on failures * F logical flag, T to test error exits * 0 verbosity, 0 for pass/fail, 1-3 for matrix dump on errors * 10 the leading dimension gap * 16.0 threshold value of test ratio * 10 value of the logical computational blocksize NB * 1 number of process grids (ordered pairs of P & Q) * 2 2 1 4 2 3 8 values of P * 2 2 4 1 3 2 1 values of Q * (1.0D0, 0.0D0) value of ALPHA * (1.0D0, 0.0D0) value of BETA * 2 number of tests problems * 'N' 'U' values of DIAG * 'L' 'R' values of SIDE * 'N' 'T' values of TRANSA * 'N' 'T' values of TRANSB * 'U' 'L' values of UPLO * 3 4 values of M * 3 4 values of N * 3 4 values of K * 6 10 values of M_A * 6 10 values of N_A * 2 5 values of IMB_A * 2 5 values of INB_A * 2 5 values of MB_A * 2 5 values of NB_A * 0 1 values of RSRC_A * 0 0 values of CSRC_A * 1 1 values of IA * 1 1 values of JA * 6 10 values of M_B * 6 10 values of N_B * 2 5 values of IMB_B * 2 5 values of INB_B * 2 5 values of MB_B * 2 5 values of NB_B * 0 1 values of RSRC_B * 0 0 values of CSRC_B * 1 1 values of IB * 1 1 values of JB * 6 10 values of M_C * 6 10 values of N_C * 2 5 values of IMB_C * 2 5 values of INB_C * 2 5 values of MB_C * 2 5 values of NB_C * 0 1 values of RSRC_C * 0 0 values of CSRC_C * 1 1 values of IC * 1 1 values of JC * PZGEMM T put F for no test in the same column * PZSYMM T put F for no test in the same column * PZHEMM T put F for no test in the same column * PZSYRK T put F for no test in the same column * PZHERK T put F for no test in the same column * PZSYR2K T put F for no test in the same column * PZHER2K T put F for no test in the same column * PZTRMM T put F for no test in the same column * PZTRSM T put F for no test in the same column * PZGEADD T put F for no test in the same column * PZTRADD T put F for no test in the same column * * Internal Parameters * =================== * * TOTMEM INTEGER * TOTMEM is a machine-specific parameter indicating the maxi- * mum amount of available memory per process in bytes. The * user should customize TOTMEM to his platform. Remember to * leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. By default, TOTMEM is 2000000. * * DBLESZ INTEGER * ZPLXSZ INTEGER * DBLESZ and ZPLXSZ indicate the length in bytes on the given * platform for a double precision real and a double precision * complex. By default, DBLESZ is set to eight and ZPLXSZ is set * to sixteen. * * MEM COMPLEX*16 array * MEM is an array of dimension TOTMEM / ZPLXSZ. * All arrays used by SCALAPACK routines are allocated from this * array MEM and referenced by pointers. The integer IPA, for * example, is a pointer to the starting element of MEM for the * matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER MAXTESTS, MAXGRIDS, GAPMUL, ZPLXSZ, TOTMEM, $ MEMSIZ, NSUBS, DBLESZ COMPLEX*16 ONE, PADVAL, ZERO, ROGUE PARAMETER ( MAXTESTS = 20, MAXGRIDS = 20, GAPMUL = 10, $ ZPLXSZ = 16, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / ZPLXSZ, DBLESZ = 8, $ PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ), $ ROGUE = ( -1.0D+10, 1.0D+10 ), $ ONE = ( 1.0D+0, 0.0D+0 ), NSUBS = 11 ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL ERRFLG, SOF, TEE CHARACTER*1 ADIAGDO, AFORM, CFORM, DIAG, SIDE, TRANSA, $ TRANSB, UPLO INTEGER CSRCA, CSRCB, CSRCC, I, IA, IAM, IASEED, IB, $ IBSEED, IC, ICSEED, ICTXT, IGAP, IMBA, IMBB, $ IMBC, IMIDA, IMIDB, IMIDC, INBA, INBB, INBC, $ IPA, IPB, IPC, IPG, IPMATA, IPMATB, IPMATC, $ IPOSTA, IPOSTB, IPOSTC, IPREA, IPREB, IPREC, $ IPW, IVERB, J, JA, JB, JC, K, L, LDA, LDB, LDC, $ M, MA, MB, MBA, MBB, MBC, MC, MEMREQD, MPA, $ MPB, MPC, MYCOL, MYROW, N, NA, NB, NBA, NBB, $ NBC, NC, NCOLA, NCOLB, NCOLC, NGRIDS, NOUT, $ NPCOL, NPROCS, NPROW, NQA, NQB, NQC, NROWA, $ NROWB, NROWC, NTESTS, OFFDA, OFFDC, RSRCA, $ RSRCB, RSRCC, TSKIP, TSTCNT REAL THRESH COMPLEX*16 ALPHA, BETA, SCALE * .. * .. Local Arrays .. LOGICAL BCHECK( NSUBS ), CCHECK( NSUBS ), $ LTEST( NSUBS ) CHARACTER*1 DIAGVAL( MAXTESTS ), SIDEVAL( MAXTESTS ), $ TRNAVAL( MAXTESTS ), TRNBVAL( MAXTESTS ), $ UPLOVAL( MAXTESTS ) CHARACTER*80 OUTFILE INTEGER CSCAVAL( MAXTESTS ), CSCBVAL( MAXTESTS ), $ CSCCVAL( MAXTESTS ), DESCA( DLEN_ ), $ DESCAR( DLEN_ ), DESCB( DLEN_ ), $ DESCBR( DLEN_ ), DESCC( DLEN_ ), $ DESCCR( DLEN_ ), IAVAL( MAXTESTS ), $ IBVAL( MAXTESTS ), ICVAL( MAXTESTS ), $ IERR( 6 ), IMBAVAL( MAXTESTS ), $ IMBBVAL( MAXTESTS ), IMBCVAL( MAXTESTS ), $ INBAVAL( MAXTESTS ), INBBVAL( MAXTESTS ), $ INBCVAL( MAXTESTS ), JAVAL( MAXTESTS ), $ JBVAL( MAXTESTS ), JCVAL( MAXTESTS ) INTEGER KFAIL( NSUBS ), KPASS( NSUBS ), KSKIP( NSUBS ), $ KTESTS( NSUBS ), KVAL( MAXTESTS ), $ MAVAL( MAXTESTS ), MBAVAL( MAXTESTS ), $ MBBVAL( MAXTESTS ), MBCVAL( MAXTESTS ), $ MBVAL( MAXTESTS ), MCVAL( MAXTESTS ), $ MVAL( MAXTESTS ), NAVAL( MAXTESTS ), $ NBAVAL( MAXTESTS ), NBBVAL( MAXTESTS ), $ NBCVAL( MAXTESTS ), NBVAL( MAXTESTS ), $ NCVAL( MAXTESTS ), NVAL( MAXTESTS ), $ PVAL( MAXTESTS ), QVAL( MAXTESTS ), $ RSCAVAL( MAXTESTS ), RSCBVAL( MAXTESTS ), $ RSCCVAL( MAXTESTS ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ IGSUM2D, PB_DESCSET2, PB_PZLAPRNT, PB_ZCHEKPAD, $ PB_ZFILLPAD, PB_ZLASCAL, PB_ZLASET, PMDESCCHK, $ PMDIMCHK, PZBLA3TSTINFO, PZBLAS3TSTCHK, $ PZBLAS3TSTCHKE, PZCHKARG3, PZCHKMOUT, PZGEADD, $ PZGEMM, PZHEMM, PZHER2K, PZHERK, PZIPSET, $ PZLAGEN, PZLASCAL, PZLASET, PZMPRNT, PZSYMM, $ PZSYR2K, PZSYRK, PZTRADD, PZTRMM, PZTRSM * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_FCEIL EXTERNAL PB_FCEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, MAX, MOD, REAL * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) LOGICAL ABRTFLG INTEGER INFO, NBLOG COMMON /SNAMEC/SNAMES COMMON /INFOC/INFO, NBLOG COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA BCHECK/.TRUE., .TRUE., .TRUE., .FALSE., $ .FALSE., .TRUE., .TRUE., .TRUE., .TRUE., $ .FALSE., .FALSE./ DATA CCHECK/.TRUE., .TRUE., .TRUE., .TRUE., .TRUE., $ .TRUE., .TRUE., .FALSE., .FALSE., .TRUE., $ .TRUE./ * .. * .. Executable Statements .. * * Initialization * * Set flag so that the PBLAS error handler won't abort on errors, * so that the tester will detect unsupported operations. * ABRTFLG = .FALSE. * * So far no error, will become true as soon as one error is found. * ERRFLG = .FALSE. * * Test counters * TSKIP = 0 TSTCNT = 0 * * Seeds for random matrix generations. * IASEED = 100 IBSEED = 200 ICSEED = 300 * * So far no tests have been performed. * DO 10 I = 1, NSUBS KPASS( I ) = 0 KSKIP( I ) = 0 KFAIL( I ) = 0 KTESTS( I ) = 0 10 CONTINUE * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PZBLA3TSTINFO( OUTFILE, NOUT, NTESTS, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, NVAL, $ KVAL, MAVAL, NAVAL, IMBAVAL, MBAVAL, $ INBAVAL, NBAVAL, RSCAVAL, CSCAVAL, IAVAL, $ JAVAL, MBVAL, NBVAL, IMBBVAL, MBBVAL, $ INBBVAL, NBBVAL, RSCBVAL, CSCBVAL, IBVAL, $ JBVAL, MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, ICVAL, $ JCVAL, MAXTESTS, NGRIDS, PVAL, MAXGRIDS, $ QVAL, MAXGRIDS, NBLOG, LTEST, SOF, TEE, IAM, $ IGAP, IVERB, NPROCS, THRESH, ALPHA, BETA, $ MEM ) * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9976 ) WRITE( NOUT, FMT = * ) END IF * * If TEE is set then Test Error Exits of routines. * IF( TEE ) $ CALL PZBLAS3TSTCHKE( LTEST, NOUT, NPROCS ) * * Loop over different process grids * DO 60 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPROW', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'GRID SIZE', 'NPCOL', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'GRID' TSKIP = TSKIP + 1 GO TO 60 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 60 * * Loop over number of tests * DO 50 J = 1, NTESTS * * Get the test parameters * DIAG = DIAGVAL( J ) SIDE = SIDEVAL( J ) TRANSA = TRNAVAL( J ) TRANSB = TRNBVAL( J ) UPLO = UPLOVAL( J ) * M = MVAL( J ) N = NVAL( J ) K = KVAL( J ) * MA = MAVAL( J ) NA = NAVAL( J ) IMBA = IMBAVAL( J ) MBA = MBAVAL( J ) INBA = INBAVAL( J ) NBA = NBAVAL( J ) RSRCA = RSCAVAL( J ) CSRCA = CSCAVAL( J ) IA = IAVAL( J ) JA = JAVAL( J ) * MB = MBVAL( J ) NB = NBVAL( J ) IMBB = IMBBVAL( J ) MBB = MBBVAL( J ) INBB = INBBVAL( J ) NBB = NBBVAL( J ) RSRCB = RSCBVAL( J ) CSRCB = CSCBVAL( J ) IB = IBVAL( J ) JB = JBVAL( J ) * MC = MCVAL( J ) NC = NCVAL( J ) IMBC = IMBCVAL( J ) MBC = MBCVAL( J ) INBC = INBCVAL( J ) NBC = NBCVAL( J ) RSRCC = RSCCVAL( J ) CSRCC = CSCCVAL( J ) IC = ICVAL( J ) JC = JCVAL( J ) * IF( IAM.EQ.0 ) THEN * TSTCNT = TSTCNT + 1 * WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9996 ) TSTCNT, NPROW, NPCOL WRITE( NOUT, FMT = * ) * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9993 ) M, N, K, SIDE, UPLO, TRANSA, $ TRANSB, DIAG * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9992 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IA, JA, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9990 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IB, JB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9989 ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9991 ) IC, JC, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC * WRITE( NOUT, FMT = 9995 ) * END IF * * Check the validity of the input test parameters * IF( .NOT.LSAME( SIDE, 'L' ).AND. $ .NOT.LSAME( SIDE, 'R' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'SIDE' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'UPLO' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSA' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( TRANSB, 'N' ).AND. $ .NOT.LSAME( TRANSB, 'T' ).AND. $ .NOT.LSAME( TRANSB, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'TRANSB' TSKIP = TSKIP + 1 GO TO 40 END IF * IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) 'DIAG' TSKIP = TSKIP + 1 GO TO 40 END IF * * Check and initialize the matrix descriptors * CALL PMDESCCHK( ICTXT, NOUT, 'A', DESCA, $ BLOCK_CYCLIC_2D_INB, MA, NA, IMBA, INBA, $ MBA, NBA, RSRCA, CSRCA, MPA, NQA, IPREA, $ IMIDA, IPOSTA, IGAP, GAPMUL, IERR( 1 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'B', DESCB, $ BLOCK_CYCLIC_2D_INB, MB, NB, IMBB, INBB, $ MBB, NBB, RSRCB, CSRCB, MPB, NQB, IPREB, $ IMIDB, IPOSTB, IGAP, GAPMUL, IERR( 2 ) ) * CALL PMDESCCHK( ICTXT, NOUT, 'C', DESCC, $ BLOCK_CYCLIC_2D_INB, MC, NC, IMBC, INBC, $ MBC, NBC, RSRCC, CSRCC, MPC, NQC, IPREC, $ IMIDC, IPOSTC, IGAP, GAPMUL, IERR( 3 ) ) * IF( IERR( 1 ).GT.0 .OR. IERR( 2 ).GT.0 .OR. $ IERR( 3 ).GT.0 ) THEN TSKIP = TSKIP + 1 GO TO 40 END IF * LDA = MAX( 1, MA ) LDB = MAX( 1, MB ) LDC = MAX( 1, MC ) * * Assign pointers into MEM for matrices corresponding to * the distributed matrices A, X and Y. * IPA = IPREA + 1 IPB = IPA + DESCA( LLD_ )*NQA + IPOSTA + IPREB IPC = IPB + DESCB( LLD_ )*NQB + IPOSTB + IPREC IPMATA = IPC + DESCC( LLD_ )*NQC + IPOSTC IPMATB = IPMATA + MA*NA IPMATC = IPMATB + MB*NB IPG = IPMATC + MAX( MB*NB, MC*NC ) * * Check if sufficient memory. * Requirement = mem for local part of parallel matrices + * mem for whole matrices for comp. check + * mem for recving comp. check error vals. * IPW = IPG + MAX( MAX( MAX( IMBA, MBA ), $ MAX( IMBB, MBB ) ), $ MAX( IMBC, MBC ) ) + MAX( M, MAX( N, K ) ) MEMREQD = IPW + PB_FCEIL( REAL( MAX( M, MAX( N, K ) ) ) * $ REAL( DBLESZ ), REAL( ZPLXSZ ) ) - 1 IERR( 1 ) = 0 IF( MEMREQD.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9987 ) MEMREQD*ZPLXSZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9988 ) TSKIP = TSKIP + 1 GO TO 40 END IF * * Loop over all PBLAS 3 routines * DO 30 L = 1, NSUBS * * Continue only if this subroutine has to be tested. * IF( .NOT.LTEST( L ) ) $ GO TO 30 * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9986 ) SNAMES( L ) END IF * * Define the size of the operands * IF( L.EQ.1 ) THEN * * PZGEMM * NROWC = M NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF( LSAME( TRANSB, 'N' ) ) THEN NROWB = K NCOLB = N ELSE NROWB = N NCOLB = K END IF * ELSE IF( L.EQ.2 .OR. L.EQ.3 ) THEN * * PZSYMM, PZHEMM * NROWC = M NCOLC = N NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF * ELSE IF( L.EQ.4 .OR. L.EQ.5 ) THEN * * PZSYRK, PZHERK * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K ELSE NROWA = K NCOLA = N END IF NROWB = 0 NCOLB = 0 * ELSE IF( L.EQ.6 .OR. L.EQ.7 ) THEN * * PZSYR2K, PZHER2K * NROWC = N NCOLC = N IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = N NCOLA = K NROWB = N NCOLB = K ELSE NROWA = K NCOLA = N NROWB = K NCOLB = N END IF * ELSE IF( L.EQ.8 .OR. L.EQ.9 ) THEN NROWB = M NCOLB = N IF( LSAME( SIDE, 'L' ) ) THEN NROWA = M NCOLA = M ELSE NROWA = N NCOLA = N END IF NROWC = 0 NCOLC = 0 * ELSE IF( L.EQ.10 .OR. L.EQ.11 ) THEN * * PZGEADD, PZTRADD * IF( LSAME( TRANSA, 'N' ) ) THEN NROWA = M NCOLA = N ELSE NROWA = N NCOLA = M END IF NROWC = M NCOLC = N NROWB = 0 NCOLB = 0 * END IF * * Check the validity of the operand sizes * CALL PMDIMCHK( ICTXT, NOUT, NROWA, NCOLA, 'A', IA, JA, $ DESCA, IERR( 1 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWB, NCOLB, 'B', IB, JB, $ DESCB, IERR( 2 ) ) CALL PMDIMCHK( ICTXT, NOUT, NROWC, NCOLC, 'C', IC, JC, $ DESCC, IERR( 3 ) ) * IF( IERR( 1 ).NE.0 .OR. IERR( 2 ).NE.0 .OR. $ IERR( 3 ).NE.0 ) THEN KSKIP( L ) = KSKIP( L ) + 1 GO TO 30 END IF * * Check special values of TRANSA for symmetric and * hermitian rank-k and rank-2k updates. * IF( L.EQ.4 .OR. L.EQ.6 ) THEN IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'T' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9975 ) 'TRANSA' KSKIP( L ) = KSKIP( L ) + 1 GO TO 30 END IF ELSE IF( L.EQ.5 .OR. L.EQ.7 ) THEN IF( .NOT.LSAME( TRANSA, 'N' ).AND. $ .NOT.LSAME( TRANSA, 'C' ) ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9975 ) 'TRANSA' KSKIP( L ) = KSKIP( L ) + 1 GO TO 30 END IF END IF * * Generate distributed matrices A, B and C * IF( L.EQ.2 ) THEN * * PZSYMM * AFORM = 'S' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.3 ) THEN * * PZHEMM * AFORM = 'H' ADIAGDO = 'N' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE IF( L.EQ.4 .OR. L.EQ.6 ) THEN * * PZSYRK, PZSYR2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'S' OFFDC = IC - JC * ELSE IF( L.EQ.5 .OR. L.EQ.7 ) THEN * * PZHERK, PZHER2K * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'H' OFFDC = IC - JC * ELSE IF( ( L.EQ.9 ).AND.( LSAME( DIAG, 'N' ) ) ) THEN * * PZTRSM * AFORM = 'N' ADIAGDO = 'D' OFFDA = IA - JA CFORM = 'N' OFFDC = 0 * ELSE * * Default values * AFORM = 'N' ADIAGDO = 'N' OFFDA = 0 CFORM = 'N' OFFDC = 0 * END IF * CALL PZLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCA, IASEED, MEM( IPA ), $ DESCA( LLD_ ) ) * IF( BCHECK( L ) ) $ CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCB, IBSEED, MEM( IPB ), $ DESCB( LLD_ ) ) * IF( CCHECK( L ) ) $ CALL PZLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCC, ICSEED, MEM( IPC ), $ DESCC( LLD_ ) ) * * Generate entire matrices on each process. * CALL PB_DESCSET2( DESCAR, MA, NA, IMBA, INBA, MBA, NBA, $ -1, -1, ICTXT, MAX( 1, MA ) ) CALL PZLAGEN( .FALSE., AFORM, ADIAGDO, OFFDA, MA, NA, $ 1, 1, DESCAR, IASEED, MEM( IPMATA ), $ DESCAR( LLD_ ) ) * IF( BCHECK( L ) ) THEN CALL PB_DESCSET2( DESCBR, MB, NB, IMBB, INBB, MBB, $ NBB, -1, -1, ICTXT, MAX( 1, MB ) ) CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCBR, IBSEED, MEM( IPMATB ), $ DESCBR( LLD_ ) ) END IF * IF( CCHECK( L ) ) THEN * CALL PB_DESCSET2( DESCCR, MC, NC, IMBC, INBC, MBC, $ NBC, -1, -1, ICTXT, MAX( 1, MC ) ) CALL PZLAGEN( .FALSE., CFORM, 'No diag', OFFDC, MC, $ NC, 1, 1, DESCCR, ICSEED, MEM( IPMATC ), $ DESCCR( LLD_ ) ) * ELSE * * If C is not needed, generate a copy of B instead * CALL PB_DESCSET2( DESCCR, MB, NB, IMBB, INBB, MBB, $ NBB, -1, -1, ICTXT, MAX( 1, MB ) ) CALL PZLAGEN( .FALSE., 'None', 'No diag', 0, MB, NB, $ 1, 1, DESCCR, IBSEED, MEM( IPMATC ), $ DESCCR( LLD_ ) ) * END IF * * Zero non referenced part of the matrices A, B, C * IF( ( ( L.EQ.2 ).OR. ( L.EQ.3 ) ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN * * The distributed matrix A is symmetric or Hermitian * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of A. * CALL PZLASET( 'Upper', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA, JA+1, DESCA ) * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of A. * CALL PZLASET( 'Lower', NROWA-1, NCOLA-1, ROGUE, $ ROGUE, MEM( IPA ), IA+1, JA, DESCA ) * END IF * ELSE IF( ( ( L.EQ.4 ).OR.( L.EQ.5 ).OR.( L.EQ.6 ).OR. $ ( L.EQ.7 ) ).AND. $ ( MAX( NROWC, NCOLC ).GT.1 ) ) THEN * * The distributed matrix C is symmetric or Hermitian * IF( LSAME( UPLO, 'L' ) ) THEN * * Zeros the strict upper triangular part of C. * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PZLASET( 'Upper', NROWC-1, NCOLC-1, ROGUE, $ ROGUE, MEM( IPC ), IC, JC+1, $ DESCC ) CALL PB_ZLASET( 'Upper', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC-1+JC*LDC ), LDC ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Zeros the strict lower triangular part of C. * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PZLASET( 'Lower', NROWC-1, NCOLC-1, ROGUE, $ ROGUE, MEM( IPC ), IC+1, JC, $ DESCC ) CALL PB_ZLASET( 'Lower', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC+(JC-1)*LDC ), $ LDC ) END IF * END IF * ELSE IF( L.EQ.8 .OR. L.EQ.9 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix A is lower triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PZLASET( 'Upper', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA, $ JA+1, DESCA ) CALL PB_ZLASET( 'Upper', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA-1+JA*LDA ), $ LDA ) END IF * ELSE * CALL PZLASET( 'Upper', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_ZLASET( 'Upper', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( L.EQ.9 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / $ DCMPLX( DBLE( MAX( NROWA, NCOLA ) ) ) CALL PZLASCAL( 'Lower', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA+1, JA, $ DESCA ) CALL PB_ZLASCAL( 'Lower', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix A is upper triangular * IF( LSAME( DIAG, 'N' ) ) THEN * IF( MAX( NROWA, NCOLA ).GT.1 ) THEN CALL PZLASET( 'Lower', NROWA-1, NCOLA-1, $ ROGUE, ROGUE, MEM( IPA ), IA+1, $ JA, DESCA ) CALL PB_ZLASET( 'Lower', NROWA-1, NCOLA-1, 0, $ ZERO, ZERO, $ MEM( IPMATA+IA+(JA-1)*LDA ), $ LDA ) END IF * ELSE * CALL PZLASET( 'Lower', NROWA, NCOLA, ROGUE, ONE, $ MEM( IPA ), IA, JA, DESCA ) CALL PB_ZLASET( 'Lower', NROWA, NCOLA, 0, ZERO, $ ONE, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA ) IF( ( L.EQ.9 ).AND. $ ( MAX( NROWA, NCOLA ).GT.1 ) ) THEN SCALE = ONE / $ DCMPLX( DBLE( MAX( NROWA, NCOLA ) ) ) CALL PZLASCAL( 'Upper', NROWA-1, NCOLA-1, $ SCALE, MEM( IPA ), IA, JA+1, $ DESCA ) CALL PB_ZLASCAL( 'Upper', NROWA-1, NCOLA-1, $ 0, SCALE, $ MEM( IPMATA+IA-1+JA*LDA ), LDA ) END IF * END IF * END IF * ELSE IF( L.EQ.11 ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * * The distributed matrix C is lower triangular * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PZLASET( 'Upper', NROWC-1, NCOLC-1, $ ROGUE, ROGUE, MEM( IPC ), IC, $ JC+1, DESCC ) CALL PB_ZLASET( 'Upper', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC-1+JC*LDC ), LDC ) END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * The distributed matrix C is upper triangular * IF( MAX( NROWC, NCOLC ).GT.1 ) THEN CALL PZLASET( 'Lower', NROWC-1, NCOLC-1, $ ROGUE, ROGUE, MEM( IPC ), IC+1, $ JC, DESCC ) CALL PB_ZLASET( 'Lower', NROWC-1, NCOLC-1, 0, $ ROGUE, ROGUE, $ MEM( IPMATC+IC+(JC-1)*LDC ), $ LDC ) END IF * END IF * END IF * * Pad the guard zones of A, B and C * CALL PB_ZFILLPAD( ICTXT, MPA, NQA, MEM( IPA-IPREA ), $ DESCA( LLD_ ), IPREA, IPOSTA, PADVAL ) * IF( BCHECK( L ) ) THEN CALL PB_ZFILLPAD( ICTXT, MPB, NQB, MEM( IPB-IPREB ), $ DESCB( LLD_ ), IPREB, IPOSTB, $ PADVAL ) END IF * IF( CCHECK( L ) ) THEN CALL PB_ZFILLPAD( ICTXT, MPC, NQC, MEM( IPC-IPREC ), $ DESCC( LLD_ ), IPREC, IPOSTC, $ PADVAL ) END IF * * Initialize the check for INPUT-only arguments. * INFO = 0 CALL PZCHKARG3( ICTXT, NOUT, SNAMES( L ), SIDE, UPLO, $ TRANSA, TRANSB, DIAG, M, N, K, ALPHA, IA, $ JA, DESCA, IB, JB, DESCB, BETA, IC, JC, $ DESCC, INFO ) * * Print initial parallel data if IVERB >= 2. * IF( IVERB.EQ.2 ) THEN CALL PB_PZLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, $ 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PZLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_INITIAL_A', NOUT, $ MEM( IPW ) ) END IF * IF( BCHECK( L ) ) THEN IF( IVERB.EQ.2 ) THEN CALL PB_PZLAPRNT( NROWB, NCOLB, MEM( IPB ), IB, JB, $ DESCB, 0, 0, $ 'PARALLEL_INITIAL_B', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PZLAPRNT( MB, NB, MEM( IPB ), 1, 1, DESCB, $ 0, 0, 'PARALLEL_INITIAL_B', NOUT, $ MEM( IPW ) ) END IF END IF * IF( CCHECK( L ) ) THEN IF( IVERB.EQ.2 ) THEN CALL PB_PZLAPRNT( NROWC, NCOLC, MEM( IPC ), IC, JC, $ DESCC, 0, 0, $ 'PARALLEL_INITIAL_C', NOUT, $ MEM( IPW ) ) ELSE IF( IVERB.GE.3 ) THEN CALL PB_PZLAPRNT( MC, NC, MEM( IPC ), 1, 1, DESCC, $ 0, 0, 'PARALLEL_INITIAL_C', NOUT, $ MEM( IPW ) ) END IF END IF * * Call the Level 3 PBLAS routine * INFO = 0 IF( L.EQ.1 ) THEN * * Test PZGEMM * CALL PZGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.2 ) THEN * * Test PZSYMM * CALL PZSYMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.3 ) THEN * * Test PZHEMM * CALL PZIPSET( 'Bignum', NROWA, MEM( IPA ), IA, JA, $ DESCA ) * CALL PZHEMM( SIDE, UPLO, M, N, ALPHA, MEM( IPA ), IA, $ JA, DESCA, MEM( IPB ), IB, JB, DESCB, $ BETA, MEM( IPC ), IC, JC, DESCC ) * CALL PZIPSET( 'Zero', NROWA, MEM( IPA ), IA, JA, $ DESCA ) * ELSE IF( L.EQ.4 ) THEN * * Test PZSYRK * CALL PZSYRK( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.5 ) THEN * * Test PZHERK * IF( ( ( DCMPLX( DBLE( ALPHA ) ).NE.ZERO ).AND. $ ( K.NE.0 ) ).OR. $ ( DCMPLX( DBLE( BETA ) ).NE.ONE ) ) $ CALL PZIPSET( 'Bignum', N, MEM( IPC ), IC, JC, $ DESCC ) * CALL PZHERK( UPLO, TRANSA, N, K, DBLE( ALPHA ), $ MEM( IPA ), IA, JA, DESCA, DBLE( BETA ), $ MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.6 ) THEN * * Test PZSYR2K * CALL PZSYR2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, BETA, MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.7 ) THEN * * Test PZHER2K * IF( ( ( ALPHA.NE.ZERO ).AND.( K.NE.0 ) ).OR. $ ( DCMPLX( DBLE( BETA ) ).NE.ONE ) ) $ CALL PZIPSET( 'Bignum', N, MEM( IPC ), IC, JC, $ DESCC ) * CALL PZHER2K( UPLO, TRANSA, N, K, ALPHA, MEM( IPA ), $ IA, JA, DESCA, MEM( IPB ), IB, JB, $ DESCB, DBLE( BETA ), MEM( IPC ), IC, JC, $ DESCC ) * ELSE IF( L.EQ.8 ) THEN * * Test PZTRMM * CALL PZTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) * ELSE IF( L.EQ.9 ) THEN * * Test PZTRSM * CALL PZTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ MEM( IPA ), IA, JA, DESCA, MEM( IPB ), $ IB, JB, DESCB ) * * ELSE IF( L.EQ.10 ) THEN * * Test PZGEADD * CALL PZGEADD( TRANSA, M, N, ALPHA, MEM( IPA ), IA, JA, $ DESCA, BETA, MEM( IPC ), IC, JC, DESCC ) * ELSE IF( L.EQ.11 ) THEN * * Test PZTRADD * CALL PZTRADD( UPLO, TRANSA, M, N, ALPHA, MEM( IPA ), $ IA, JA, DESCA, BETA, MEM( IPC ), IC, JC, $ DESCC ) * END IF * * Check if the operation has been performed. * IF( INFO.NE.0 ) THEN KSKIP( L ) = KSKIP( L ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9974 ) INFO GO TO 30 END IF * * Check padding * CALL PB_ZCHEKPAD( ICTXT, SNAMES( L ), MPA, NQA, $ MEM( IPA-IPREA ), DESCA( LLD_ ), $ IPREA, IPOSTA, PADVAL ) * IF( BCHECK( L ) ) THEN CALL PB_ZCHEKPAD( ICTXT, SNAMES( L ), MPB, NQB, $ MEM( IPB-IPREB ), DESCB( LLD_ ), $ IPREB, IPOSTB, PADVAL ) END IF * IF( CCHECK( L ) ) THEN CALL PB_ZCHEKPAD( ICTXT, SNAMES( L ), MPC, NQC, $ MEM( IPC-IPREC ), DESCC( LLD_ ), $ IPREC, IPOSTC, PADVAL ) END IF * * Check the computations * CALL PZBLAS3TSTCHK( ICTXT, NOUT, L, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, $ MEM( IPMATA ), MEM( IPA ), IA, JA, $ DESCA, MEM( IPMATB ), MEM( IPB ), $ IB, JB, DESCB, BETA, MEM( IPMATC ), $ MEM( IPC ), IC, JC, DESCC, THRESH, $ ROGUE, MEM( IPG ), MEM( IPW ), INFO ) IF( MOD( INFO, 2 ).EQ.1 ) THEN IERR( 1 ) = 1 ELSE IF( MOD( INFO / 2, 2 ).EQ.1 ) THEN IERR( 2 ) = 1 ELSE IF( MOD( INFO / 4, 2 ).EQ.1 ) THEN IERR( 3 ) = 1 ELSE IF( INFO.NE.0 ) THEN IERR( 1 ) = 1 IERR( 2 ) = 1 IERR( 3 ) = 1 END IF * * Check input-only scalar arguments * INFO = 1 CALL PZCHKARG3( ICTXT, NOUT, SNAMES( L ), SIDE, UPLO, $ TRANSA, TRANSB, DIAG, M, N, K, ALPHA, IA, $ JA, DESCA, IB, JB, DESCB, BETA, IC, JC, $ DESCC, INFO ) * * Check input-only array arguments * CALL PZCHKMOUT( NROWA, NCOLA, MEM( IPMATA ), $ MEM( IPA ), IA, JA, DESCA, IERR( 4 ) ) IF( IERR( 4 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_A', $ SNAMES( L ) END IF * IF( BCHECK( L ) ) THEN CALL PZCHKMOUT( NROWB, NCOLB, MEM( IPMATB ), $ MEM( IPB ), IB, JB, DESCB, IERR( 5 ) ) IF( IERR( 5 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_B', $ SNAMES( L ) END IF END IF * IF( CCHECK( L ) ) THEN CALL PZCHKMOUT( NROWC, NCOLC, MEM( IPMATC ), $ MEM( IPC ), IC, JC, DESCC, IERR( 6 ) ) IF( IERR( 6 ).NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9983 ) 'PARALLEL_C', $ SNAMES( L ) END IF END IF * * Only node 0 prints computational test result * IF( INFO.NE.0 .OR. IERR( 1 ).NE.0 .OR. $ IERR( 2 ).NE.0 .OR. IERR( 3 ).NE.0 .OR. $ IERR( 4 ).NE.0 .OR. IERR( 5 ).NE.0 .OR. $ IERR( 6 ).NE.0 ) THEN KFAIL( L ) = KFAIL( L ) + 1 ERRFLG = .TRUE. IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9985 ) SNAMES( L ) ELSE KPASS( L ) = KPASS( L ) + 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9984 ) SNAMES( L ) END IF * * Dump matrix if IVERB >= 1 and error. * IF( IVERB.GE.1 .AND. ERRFLG ) THEN IF( IERR( 4 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PZMPRNT( ICTXT, NOUT, MA, NA, MEM( IPMATA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PZLAPRNT( MA, NA, MEM( IPA ), 1, 1, DESCA, $ 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) ELSE IF( IERR( 1 ).NE.0 ) THEN IF( ( NROWA.GT.0 ).AND.( NCOLA.GT.0 ) ) $ CALL PZMPRNT( ICTXT, NOUT, NROWA, NCOLA, $ MEM( IPMATA+IA-1+(JA-1)*LDA ), $ LDA, 0, 0, 'SERIAL_A' ) CALL PB_PZLAPRNT( NROWA, NCOLA, MEM( IPA ), IA, JA, $ DESCA, 0, 0, 'PARALLEL_A', NOUT, $ MEM( IPMATA ) ) END IF IF( BCHECK( L ) ) THEN IF( IERR( 5 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PZMPRNT( ICTXT, NOUT, MB, NB, $ MEM( IPMATB ), LDB, 0, 0, $ 'SERIAL_B' ) CALL PB_PZLAPRNT( MB, NB, MEM( IPB ), 1, 1, $ DESCB, 0, 0, 'PARALLEL_B', $ NOUT, MEM( IPMATB ) ) ELSE IF( IERR( 2 ).NE.0 ) THEN IF( ( NROWB.GT.0 ).AND.( NCOLB.GT.0 ) ) $ CALL PZMPRNT( ICTXT, NOUT, NROWB, NCOLB, $ MEM( IPMATB+IB-1+(JB-1)*LDB ), $ LDB, 0, 0, 'SERIAL_B' ) CALL PB_PZLAPRNT( NROWB, NCOLB, MEM( IPB ), IB, $ JB, DESCB, 0, 0, 'PARALLEL_B', $ NOUT, MEM( IPMATB ) ) END IF END IF IF( CCHECK( L ) ) THEN IF( IERR( 6 ).NE.0 .OR. IVERB.GE.3 ) THEN CALL PZMPRNT( ICTXT, NOUT, MC, NC, $ MEM( IPMATC ), LDC, 0, 0, $ 'SERIAL_C' ) CALL PB_PZLAPRNT( MC, NC, MEM( IPC ), 1, 1, $ DESCC, 0, 0, 'PARALLEL_C', $ NOUT, MEM( IPMATC ) ) ELSE IF( IERR( 3 ).NE.0 ) THEN IF( ( NROWB.GT.0 ).AND.( NCOLB.GT.0 ) ) $ CALL PZMPRNT( ICTXT, NOUT, NROWC, NCOLC, $ MEM( IPMATC+IC-1+(JC-1)*LDC ), $ LDC, 0, 0, 'SERIAL_C' ) CALL PB_PZLAPRNT( NROWC, NCOLC, MEM( IPC ), IC, $ JC, DESCC, 0, 0, 'PARALLEL_C', $ NOUT, MEM( IPMATC ) ) END IF END IF END IF * * Leave if error and "Stop On Failure" * IF( SOF.AND.ERRFLG ) $ GO TO 70 * 30 CONTINUE * 40 IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9982 ) J END IF * 50 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 60 CONTINUE * * Come here, if error and "Stop On Failure" * 70 CONTINUE * * Before printing out final stats, add TSKIP to all skips * DO 80 I = 1, NSUBS IF( LTEST( I ) ) THEN KSKIP( I ) = KSKIP( I ) + TSKIP KTESTS( I ) = KSKIP( I ) + KFAIL( I ) + KPASS( I ) END IF 80 CONTINUE * * Print results * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9978 ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) WRITE( NOUT, FMT = 9979 ) * DO 90 I = 1, NSUBS WRITE( NOUT, FMT = 9981 ) '|', SNAMES( I ), KTESTS( I ), $ KPASS( I ), KFAIL( I ), KSKIP( I ) 90 CONTINUE WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9977 ) WRITE( NOUT, FMT = * ) * END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A, ': ', A, ' = ', I10, $ ' should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: NPROW*NPCOL = ', I4, $ '. It can be at most', I4 ) 9997 FORMAT( 'Bad ', A, ' parameters: going on to next test case.' ) 9996 FORMAT( 2X, 'Test number ', I4 , ' started on a ', I6, ' x ', $ I6, ' process grid.' ) 9995 FORMAT( 2X, ' ------------------------------------------------', $ '-------------------' ) 9994 FORMAT( 2X, ' M N K SIDE UPLO TRANSA ', $ 'TRANSB DIAG' ) 9993 FORMAT( 5X,I6,1X,I6,1X,I6,6X,A1,5X,A1,7X,A1,7X,A1,5X,A1 ) 9992 FORMAT( 2X, ' IA JA MA NA IMBA INBA', $ ' MBA NBA RSRCA CSRCA' ) 9991 FORMAT( 5X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6,1X,I6, $ 1X,I5,1X,I5 ) 9990 FORMAT( 2X, ' IB JB MB NB IMBB INBB', $ ' MBB NBB RSRCB CSRCB' ) 9989 FORMAT( 2X, ' IC JC MC NC IMBC INBC', $ ' MBC NBC RSRCC CSRCC' ) 9988 FORMAT( 'Not enough memory for this test: going on to', $ ' next test case.' ) 9987 FORMAT( 'Not enough memory. Need: ', I12 ) 9986 FORMAT( 2X, ' Tested Subroutine: ', A ) 9985 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' FAILED ',' *****' ) 9984 FORMAT( 2X, ' ***** Computational check: ', A, ' ', $ ' PASSED ',' *****' ) 9983 FORMAT( 2X, ' ***** ERROR ***** Matrix operand ', A, $ ' modified by ', A, ' *****' ) 9982 FORMAT( 2X, 'Test number ', I4, ' completed.' ) 9981 FORMAT( 2X,A1,2X,A7,8X,I4,6X,I4,5X,I4,4X,I4 ) 9980 FORMAT( 2X, ' SUBROUTINE TOTAL TESTS PASSED FAILED ', $ 'SKIPPED' ) 9979 FORMAT( 2X, ' ---------- ----------- ------ ------ ', $ '-------' ) 9978 FORMAT( 2X, 'Testing Summary') 9977 FORMAT( 2X, 'End of Tests.' ) 9976 FORMAT( 2X, 'Tests started.' ) 9975 FORMAT( 2X, ' ***** ', A, ' has an incorrect value: ', $ ' BYPASS *****' ) 9974 FORMAT( 2X, ' ***** Operation not supported, error code: ', $ I5, ' *****' ) * STOP * * End of PZBLA3TST * END SUBROUTINE PZBLA3TSTINFO( SUMMRY, NOUT, NMAT, DIAGVAL, SIDEVAL, $ TRNAVAL, TRNBVAL, UPLOVAL, MVAL, $ NVAL, KVAL, MAVAL, NAVAL, IMBAVAL, $ MBAVAL, INBAVAL, NBAVAL, RSCAVAL, $ CSCAVAL, IAVAL, JAVAL, MBVAL, NBVAL, $ IMBBVAL, MBBVAL, INBBVAL, NBBVAL, $ RSCBVAL, CSCBVAL, IBVAL, JBVAL, $ MCVAL, NCVAL, IMBCVAL, MBCVAL, $ INBCVAL, NBCVAL, RSCCVAL, CSCCVAL, $ ICVAL, JCVAL, LDVAL, NGRIDS, PVAL, $ LDPVAL, QVAL, LDQVAL, NBLOG, LTEST, SOF, $ TEE, IAM, IGAP, IVERB, NPROCS, THRESH, $ ALPHA, BETA, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL SOF, TEE INTEGER IAM, IGAP, IVERB, LDPVAL, LDQVAL, LDVAL, NBLOG, $ NGRIDS, NMAT, NOUT, NPROCS REAL THRESH COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*( * ) SUMMRY CHARACTER*1 DIAGVAL( LDVAL ), SIDEVAL( LDVAL ), $ TRNAVAL( LDVAL ), TRNBVAL( LDVAL ), $ UPLOVAL( LDVAL ) LOGICAL LTEST( * ) INTEGER CSCAVAL( LDVAL ), CSCBVAL( LDVAL ), $ CSCCVAL( LDVAL ), IAVAL( LDVAL ), $ IBVAL( LDVAL ), ICVAL( LDVAL ), $ IMBAVAL( LDVAL ), IMBBVAL( LDVAL ), $ IMBCVAL( LDVAL ), INBAVAL( LDVAL ), $ INBBVAL( LDVAL ), INBCVAL( LDVAL ), $ JAVAL( LDVAL ), JBVAL( LDVAL ), JCVAL( LDVAL ), $ KVAL( LDVAL ), MAVAL( LDVAL ), MBAVAL( LDVAL ), $ MBBVAL( LDVAL ), MBCVAL( LDVAL ), $ MBVAL( LDVAL ), MCVAL( LDVAL ), MVAL( LDVAL ), $ NAVAL( LDVAL ), NBAVAL( LDVAL ), $ NBBVAL( LDVAL ), NBCVAL( LDVAL ), $ NBVAL( LDVAL ), NCVAL( LDVAL ), NVAL( LDVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), $ RSCAVAL( LDVAL ), RSCBVAL( LDVAL ), $ RSCCVAL( LDVAL ), WORK( * ) * .. * * Purpose * ======= * * PZBLA3TSTINFO get the needed startup information for testing various * Level 3 PBLAS routines, and transmits it to all processes. * * Notes * ===== * * For packing the information we assumed that the length in bytes of an * integer is equal to the length in bytes of a real single precision. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * On exit, SUMMRY is the name of output (summary) file (if * any). SUMMRY is only defined for process 0. * * NOUT (global output) INTEGER * On exit, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NMAT (global output) INTEGER * On exit, NMAT specifies the number of different test cases. * * DIAGVAL (global output) CHARACTER array * On entry, DIAGVAL is an array of dimension LDVAL. On exit, * this array contains the values of DIAG to run the code with. * * SIDEVAL (global output) CHARACTER array * On entry, SIDEVAL is an array of dimension LDVAL. On exit, * this array contains the values of SIDE to run the code with. * * TRNAVAL (global output) CHARACTER array * On entry, TRNAVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSA to run the code * with. * * TRNBVAL (global output) CHARACTER array * On entry, TRNBVAL is an array of dimension LDVAL. On exit, * this array contains the values of TRANSB to run the code * with. * * UPLOVAL (global output) CHARACTER array * On entry, UPLOVAL is an array of dimension LDVAL. On exit, * this array contains the values of UPLO to run the code with. * * MVAL (global output) INTEGER array * On entry, MVAL is an array of dimension LDVAL. On exit, this * array contains the values of M to run the code with. * * NVAL (global output) INTEGER array * On entry, NVAL is an array of dimension LDVAL. On exit, this * array contains the values of N to run the code with. * * KVAL (global output) INTEGER array * On entry, KVAL is an array of dimension LDVAL. On exit, this * array contains the values of K to run the code with. * * MAVAL (global output) INTEGER array * On entry, MAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( M_ ) to run the code * with. * * NAVAL (global output) INTEGER array * On entry, NAVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCA( N_ ) to run the code * with. * * IMBAVAL (global output) INTEGER array * On entry, IMBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( IMB_ ) to run the * code with. * * MBAVAL (global output) INTEGER array * On entry, MBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( MB_ ) to run the * code with. * * INBAVAL (global output) INTEGER array * On entry, INBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( INB_ ) to run the * code with. * * NBAVAL (global output) INTEGER array * On entry, NBAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( NB_ ) to run the * code with. * * RSCAVAL (global output) INTEGER array * On entry, RSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( RSRC_ ) to run the * code with. * * CSCAVAL (global output) INTEGER array * On entry, CSCAVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCA( CSRC_ ) to run the * code with. * * IAVAL (global output) INTEGER array * On entry, IAVAL is an array of dimension LDVAL. On exit, this * array contains the values of IA to run the code with. * * JAVAL (global output) INTEGER array * On entry, JAVAL is an array of dimension LDVAL. On exit, this * array contains the values of JA to run the code with. * * MBVAL (global output) INTEGER array * On entry, MBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( M_ ) to run the code * with. * * NBVAL (global output) INTEGER array * On entry, NBVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCB( N_ ) to run the code * with. * * IMBBVAL (global output) INTEGER array * On entry, IMBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( IMB_ ) to run the * code with. * * MBBVAL (global output) INTEGER array * On entry, MBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( MB_ ) to run the * code with. * * INBBVAL (global output) INTEGER array * On entry, INBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( INB_ ) to run the * code with. * * NBBVAL (global output) INTEGER array * On entry, NBBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( NB_ ) to run the * code with. * * RSCBVAL (global output) INTEGER array * On entry, RSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( RSRC_ ) to run the * code with. * * CSCBVAL (global output) INTEGER array * On entry, CSCBVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCB( CSRC_ ) to run the * code with. * * IBVAL (global output) INTEGER array * On entry, IBVAL is an array of dimension LDVAL. On exit, this * array contains the values of IB to run the code with. * * JBVAL (global output) INTEGER array * On entry, JBVAL is an array of dimension LDVAL. On exit, this * array contains the values of JB to run the code with. * * MCVAL (global output) INTEGER array * On entry, MCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( M_ ) to run the code * with. * * NCVAL (global output) INTEGER array * On entry, NCVAL is an array of dimension LDVAL. On exit, this * array contains the values of DESCC( N_ ) to run the code * with. * * IMBCVAL (global output) INTEGER array * On entry, IMBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( IMB_ ) to run the * code with. * * MBCVAL (global output) INTEGER array * On entry, MBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( MB_ ) to run the * code with. * * INBCVAL (global output) INTEGER array * On entry, INBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( INB_ ) to run the * code with. * * NBCVAL (global output) INTEGER array * On entry, NBCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( NB_ ) to run the * code with. * * RSCCVAL (global output) INTEGER array * On entry, RSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( RSRC_ ) to run the * code with. * * CSCCVAL (global output) INTEGER array * On entry, CSCCVAL is an array of dimension LDVAL. On exit, * this array contains the values of DESCC( CSRC_ ) to run the * code with. * * ICVAL (global output) INTEGER array * On entry, ICVAL is an array of dimension LDVAL. On exit, this * array contains the values of IC to run the code with. * * JCVAL (global output) INTEGER array * On entry, JCVAL is an array of dimension LDVAL. On exit, this * array contains the values of JC to run the code with. * * LDVAL (global input) INTEGER * On entry, LDVAL specifies the maximum number of different va- * lues that can be used for DIAG, SIDE, TRANSA, TRANSB, UPLO, * M, N, K, DESCA(:), IA, JA, DESCB(:), IB, JB, DESCC(:), IC, * JC. This is also the maximum number of test cases. * * NGRIDS (global output) INTEGER * On exit, NGRIDS specifies the number of different values that * can be used for P and Q. * * PVAL (global output) INTEGER array * On entry, PVAL is an array of dimension LDPVAL. On exit, this * array contains the values of P to run the code with. * * LDPVAL (global input) INTEGER * On entry, LDPVAL specifies the maximum number of different * values that can be used for P. * * QVAL (global output) INTEGER array * On entry, QVAL is an array of dimension LDQVAL. On exit, this * array contains the values of Q to run the code with. * * LDQVAL (global input) INTEGER * On entry, LDQVAL specifies the maximum number of different * values that can be used for Q. * * NBLOG (global output) INTEGER * On exit, NBLOG specifies the logical computational block size * to run the tests with. NBLOG must be at least one. * * LTEST (global output) LOGICAL array * On entry, LTEST is an array of dimension at least eleven. On * exit, if LTEST( i ) is .TRUE., the i-th Level 3 PBLAS routine * will be tested. See the input file for the ordering of the * routines. * * SOF (global output) LOGICAL * On exit, if SOF is .TRUE., the tester will stop on the first * detected failure. Otherwise, it won't. * * TEE (global output) LOGICAL * On exit, if TEE is .TRUE., the tester will perform the error * exit tests. These tests won't be performed otherwise. * * IAM (local input) INTEGER * On entry, IAM specifies the number of the process executing * this routine. * * IGAP (global output) INTEGER * On exit, IGAP specifies the user-specified gap used for pad- * ding. IGAP must be at least zero. * * IVERB (global output) INTEGER * On exit, IVERB specifies the output verbosity level: 0 for * pass/fail, 1, 2 or 3 for matrix dump on errors. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes. * * THRESH (global output) REAL * On exit, THRESH specifies the threshhold value for the test * ratio. * * ALPHA (global output) COMPLEX*16 * On exit, ALPHA specifies the value of alpha to be used in all * the test cases. * * BETA (global output) COMPLEX*16 * On exit, BETA specifies the value of beta to be used in all * the test cases. * * WORK (local workspace) INTEGER array * On entry, WORK is an array of dimension at least * MAX( 3, 2*NGRIDS+38*NMAT+NSUBS+4 ) with NSUBS equal to 11. * This array is used to pack all output arrays in order to send * the information in one message. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NIN, NSUBS PARAMETER ( NIN = 11, NSUBS = 11 ) * .. * .. Local Scalars .. LOGICAL LTESTT INTEGER I, ICTXT, J DOUBLE PRECISION EPS * .. * .. Local Arrays .. CHARACTER*7 SNAMET CHARACTER*79 USRINFO * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D, ZGEBR2D, ZGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC CHAR, ICHAR, MAX, MIN * .. * .. Common Blocks .. CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='PZBLAS3TST.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Read the flag that indicates if Stop on Failure * READ( NIN, FMT = * ) SOF * * Read the flag that indicates if Test Error Exits * READ( NIN, FMT = * ) TEE * * Read the verbosity level * READ( NIN, FMT = * ) IVERB IF( IVERB.LT.0 .OR. IVERB.GT.3 ) $ IVERB = 0 * * Read the leading dimension gap * READ( NIN, FMT = * ) IGAP IF( IGAP.LT.0 ) $ IGAP = 0 * * Read the threshold value for test ratio * READ( NIN, FMT = * ) THRESH IF( THRESH.LT.0.0 ) $ THRESH = 16.0 * * Get logical computational block size * READ( NIN, FMT = * ) NBLOG IF( NBLOG.LT.1 ) $ NBLOG = 32 * * Get number of grids * READ( NIN, FMT = * ) NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDPVAL GO TO 120 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Grids', LDQVAL GO TO 120 END IF * * Get values of P and Q * READ( NIN, FMT = * ) ( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * ) ( QVAL( I ), I = 1, NGRIDS ) * * Read ALPHA, BETA * READ( NIN, FMT = * ) ALPHA READ( NIN, FMT = * ) BETA * * Read number of tests. * READ( NIN, FMT = * ) NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDVAL ) THEN WRITE( NOUT, FMT = 9998 ) 'Tests', LDVAL GO TO 120 ENDIF * * Read in input data into arrays. * READ( NIN, FMT = * ) ( DIAGVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( SIDEVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( TRNBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( UPLOVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( KVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCAVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JAVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCBVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JBVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( IMBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( INBCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( MBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( NBCVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( RSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( CSCCVAL( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( ICVAL ( I ), I = 1, NMAT ) READ( NIN, FMT = * ) ( JCVAL ( I ), I = 1, NMAT ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 10 I = 1, NSUBS LTEST( I ) = .FALSE. 10 CONTINUE 20 CONTINUE READ( NIN, FMT = 9996, END = 50 ) SNAMET, LTESTT DO 30 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 40 30 CONTINUE * WRITE( NOUT, FMT = 9995 )SNAMET GO TO 120 * 40 CONTINUE LTEST( I ) = LTESTT GO TO 20 * 50 CONTINUE * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 60 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 60 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1 ) CALL ZGEBS2D( ICTXT, 'All', ' ', 1, 1, BETA, 1 ) * WORK( 1 ) = NGRIDS WORK( 2 ) = NMAT WORK( 3 ) = NBLOG CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 IF( SOF ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 IF( TEE ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 WORK( I ) = IVERB I = I + 1 WORK( I ) = IGAP I = I + 1 DO 70 J = 1, NMAT WORK( I ) = ICHAR( DIAGVAL( J ) ) WORK( I+1 ) = ICHAR( SIDEVAL( J ) ) WORK( I+2 ) = ICHAR( TRNAVAL( J ) ) WORK( I+3 ) = ICHAR( TRNBVAL( J ) ) WORK( I+4 ) = ICHAR( UPLOVAL( J ) ) I = I + 5 70 CONTINUE CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NMAT, MVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, KVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JAVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JBVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, IMBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, INBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, MBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, NBCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, RSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, CSCCVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, ICVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NMAT, JCVAL, 1, WORK( I ), 1 ) I = I + NMAT * DO 80 J = 1, NSUBS IF( LTEST( J ) ) THEN WORK( I ) = 1 ELSE WORK( I ) = 0 END IF I = I + 1 80 CONTINUE I = I - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) 'Level 3 PBLAS testing program.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'Tests of the complex double precision '// $ 'Level 3 PBLAS' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9993 ) NMAT WRITE( NOUT, FMT = 9979 ) NBLOG WRITE( NOUT, FMT = 9992 ) NGRIDS WRITE( NOUT, FMT = 9990 ) $ 'P', ( PVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( PVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9990 ) $ 'Q', ( QVAL(I), I = 1, MIN(NGRIDS, 5) ) IF( NGRIDS.GT.5 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 6, $ MIN( 10, NGRIDS ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 11, $ MIN( 15, NGRIDS ) ) IF( NGRIDS.GT.15 ) $ WRITE( NOUT, FMT = 9991 ) ( QVAL(I), I = 16, NGRIDS ) WRITE( NOUT, FMT = 9988 ) SOF WRITE( NOUT, FMT = 9987 ) TEE WRITE( NOUT, FMT = 9983 ) IGAP WRITE( NOUT, FMT = 9986 ) IVERB WRITE( NOUT, FMT = 9980 ) THRESH WRITE( NOUT, FMT = 9982 ) ALPHA WRITE( NOUT, FMT = 9981 ) BETA IF( LTEST( 1 ) ) THEN WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9985 ) SNAMES( 1 ), ' ... No ' END IF DO 90 I = 2, NSUBS IF( LTEST( I ) ) THEN WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... Yes' ELSE WRITE( NOUT, FMT = 9984 ) SNAMES( I ), ' ... No ' END IF 90 CONTINUE WRITE( NOUT, FMT = 9994 ) EPS WRITE( NOUT, FMT = * ) * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, ALPHA, 1, 0, 0 ) CALL ZGEBR2D( ICTXT, 'All', ' ', 1, 1, BETA, 1, 0, 0 ) * CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NGRIDS = WORK( 1 ) NMAT = WORK( 2 ) NBLOG = WORK( 3 ) * I = 2*NGRIDS + 38*NMAT + NSUBS + 4 CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) * I = 1 IF( WORK( I ).EQ.1 ) THEN SOF = .TRUE. ELSE SOF = .FALSE. END IF I = I + 1 IF( WORK( I ).EQ.1 ) THEN TEE = .TRUE. ELSE TEE = .FALSE. END IF I = I + 1 IVERB = WORK( I ) I = I + 1 IGAP = WORK( I ) I = I + 1 DO 100 J = 1, NMAT DIAGVAL( J ) = CHAR( WORK( I ) ) SIDEVAL( J ) = CHAR( WORK( I+1 ) ) TRNAVAL( J ) = CHAR( WORK( I+2 ) ) TRNBVAL( J ) = CHAR( WORK( I+3 ) ) UPLOVAL( J ) = CHAR( WORK( I+4 ) ) I = I + 5 100 CONTINUE CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) I = I + NGRIDS CALL ICOPY( NMAT, WORK( I ), 1, MVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, KVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JAVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JBVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, IMBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, INBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, MBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, NBCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, RSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, CSCCVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, ICVAL, 1 ) I = I + NMAT CALL ICOPY( NMAT, WORK( I ), 1, JCVAL, 1 ) I = I + NMAT * DO 110 J = 1, NSUBS IF( WORK( I ).EQ.1 ) THEN LTEST( J ) = .TRUE. ELSE LTEST( J ) = .FALSE. END IF I = I + 1 110 CONTINUE * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 120 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( ' Number of values of ',5A, ' is less than 1 or greater ', $ 'than ', I2 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) 9996 FORMAT( A7, L2 ) 9995 FORMAT( ' Subprogram name ', A7, ' not recognized', $ /' ******* TESTS ABANDONED *******' ) 9994 FORMAT( 2X, 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9993 FORMAT( 2X, 'Number of Tests : ', I6 ) 9992 FORMAT( 2X, 'Number of process grids : ', I6 ) 9991 FORMAT( 2X, ' : ', 5I6 ) 9990 FORMAT( 2X, A1, ' : ', 5I6 ) 9988 FORMAT( 2X, 'Stop on failure flag : ', L6 ) 9987 FORMAT( 2X, 'Test for error exits flag : ', L6 ) 9986 FORMAT( 2X, 'Verbosity level : ', I6 ) 9985 FORMAT( 2X, 'Routines to be tested : ', A, A8 ) 9984 FORMAT( 2X, ' ', A, A8 ) 9983 FORMAT( 2X, 'Leading dimension gap : ', I6 ) 9982 FORMAT( 2X, 'Alpha : (', G16.6, $ ',', G16.6, ')' ) 9981 FORMAT( 2X, 'Beta : (', G16.6, $ ',', G16.6, ')' ) 9980 FORMAT( 2X, 'Threshold value : ', G16.6 ) 9979 FORMAT( 2X, 'Logical block size : ', I6 ) * * End of PZBLA3TSTINFO * END SUBROUTINE PZBLAS3TSTCHKE( LTEST, INOUT, NPROCS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INOUT, NPROCS * .. * .. Array Arguments .. LOGICAL LTEST( * ) * .. * * Purpose * ======= * * PZBLAS3TSTCHKE tests the error exits of the Level 3 PBLAS. * * Arguments * ========= * * LTEST (global input) LOGICAL array * On entry, LTEST is an array of dimension at least 11 (NSUBS). * If LTEST( 1 ) is .TRUE., PZGEMM will be tested; * If LTEST( 2 ) is .TRUE., PZSYMM will be tested; * If LTEST( 3 ) is .TRUE., PZHEMM will be tested; * If LTEST( 4 ) is .TRUE., PZSYRK will be tested; * If LTEST( 5 ) is .TRUE., PZHERK will be tested; * If LTEST( 6 ) is .TRUE., PZSYR2K will be tested; * If LTEST( 7 ) is .TRUE., PZHER2K will be tested; * If LTEST( 8 ) is .TRUE., PZTRMM will be tested; * If LTEST( 9 ) is .TRUE., PZTRSM will be tested; * If LTEST( 10 ) is .TRUE., PZGEADD will be tested; * If LTEST( 11 ) is .TRUE., PZTRADD will be tested; * * INOUT (global input) INTEGER * On entry, INOUT specifies the unit number for output file. * When INOUT is 6, output to screen, when INOUT = 0, output to * stderr. INOUT is only defined in process 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of processes cal- * ling this routine. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER NSUBS PARAMETER ( NSUBS = 11 ) * .. * .. Local Scalars .. LOGICAL ABRTSAV INTEGER I, ICTXT, MYCOL, MYROW, NPCOL, NPROW * .. * .. Local Arrays .. INTEGER SCODE( NSUBS ) * .. * .. External Subroutines .. EXTERNAL BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO, $ BLACS_GRIDINIT, PZDIMEE, PZGEADD, PZGEMM, $ PZHEMM, PZHER2K, PZHERK, PZMATEE, PZOPTEE, $ PZSYMM, PZSYR2K, PZSYRK, PZTRADD, PZTRMM, $ PZTRSM * .. * .. Common Blocks .. LOGICAL ABRTFLG INTEGER NOUT CHARACTER*7 SNAMES( NSUBS ) COMMON /SNAMEC/SNAMES COMMON /PBERRORC/NOUT, ABRTFLG * .. * .. Data Statements .. DATA SCODE/31, 32, 32, 33, 34, 35, 36, 38, 38, 39, $ 40/ * .. * .. Executable Statements .. * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes. * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Set ABRTFLG to FALSE so that the PBLAS error handler won't abort * on errors during these tests and set the output device unit for * it. * ABRTSAV = ABRTFLG ABRTFLG = .FALSE. NOUT = INOUT * * Test PZGEMM * I = 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZGEMM, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZGEMM, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZGEMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PZSYMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZSYMM, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZSYMM, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZSYMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PZHEMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZHEMM, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZHEMM, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZHEMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PZSYRK * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZSYRK, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZSYRK, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZSYRK, SCODE( I ), SNAMES( I ) ) END IF * * Test PZHERK * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZHERK, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZHERK, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZHERK, SCODE( I ), SNAMES( I ) ) END IF * * Test PZSYR2K * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZSYR2K, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZSYR2K, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZSYR2K, SCODE( I ), SNAMES( I ) ) END IF * * Test PZHER2K * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZHER2K, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZHER2K, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZHER2K, SCODE( I ), SNAMES( I ) ) END IF * * Test PZTRMM * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZTRMM, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZTRMM, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZTRMM, SCODE( I ), SNAMES( I ) ) END IF * * Test PZTRSM * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZTRSM, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZTRSM, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZTRSM, SCODE( I ), SNAMES( I ) ) END IF * * Test PZGEADD * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZGEADD, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZGEADD, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZGEADD, SCODE( I ), SNAMES( I ) ) END IF * * Test PZTRADD * I = I + 1 IF( LTEST( I ) ) THEN CALL PZOPTEE( ICTXT, NOUT, PZTRADD, SCODE( I ), SNAMES( I ) ) CALL PZDIMEE( ICTXT, NOUT, PZTRADD, SCODE( I ), SNAMES( I ) ) CALL PZMATEE( ICTXT, NOUT, PZTRADD, SCODE( I ), SNAMES( I ) ) END IF * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) * CALL BLACS_GRIDEXIT( ICTXT ) * * Reset ABRTFLG to the value it had before calling this routine * ABRTFLG = ABRTSAV * 9999 FORMAT( 2X, 'Error-exit tests completed.' ) * RETURN * * End of PZBLAS3TSTCHKE * END SUBROUTINE PZCHKARG3( ICTXT, NOUT, SNAME, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, IA, JA, $ DESCA, IB, JB, DESCB, BETA, IC, JC, DESCC, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N, $ NOUT COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. CHARACTER*7 SNAME INTEGER DESCA( * ), DESCB( * ), DESCC( * ) * .. * * Purpose * ======= * * PZCHKARG3 checks the input-only arguments of the Level 3 PBLAS. When * INFO = 0, this routine makes a copy of its arguments (which are INPUT * only arguments to PBLAS routines). Otherwise, it verifies the values * of these arguments against the saved copies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies the SIDE option in the Level 3 PBLAS * operation. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the UPLO option in the Level 3 PBLAS * operation. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the TRANSA option in the Level 3 * PBLAS operation. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies the TRANSB option in the Level 3 * PBLAS operation. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies the DIAG option in the Level 3 PBLAS * operation. * * M (global input) INTEGER * On entry, M specifies the dimension of the submatrix ope- * rands. * * N (global input) INTEGER * On entry, N specifies the dimension of the submatrix ope- * rands. * * K (global input) INTEGER * On entry, K specifies the dimension of the submatrix ope- * rands. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * INFO (global input/global output) INTEGER * When INFO = 0 on entry, the values of the arguments which are * INPUT only arguments to a PBLAS routine are copied into sta- * tic variables and INFO is unchanged on exit. Otherwise, the * values of the arguments are compared against the saved co- * pies. In case no error has been found INFO is zero on return, * otherwise it is non zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 DIAGREF, SIDEREF, TRANSAREF, TRANSBREF, UPLOREF INTEGER I, IAREF, IBREF, ICREF, JAREF, JBREF, JCREF, $ KREF, MREF, MYCOL, MYROW, NPCOL, NPROW, NREF COMPLEX*16 ALPHAREF, BETAREF * .. * .. Local Arrays .. CHARACTER*15 ARGNAME INTEGER DESCAREF( DLEN_ ), DESCBREF( DLEN_ ), $ DESCCREF( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Save Statements .. SAVE * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Check if first call. If yes, then save. * IF( INFO.EQ.0 ) THEN * DIAGREF = DIAG SIDEREF = SIDE TRANSAREF = TRANSA TRANSBREF = TRANSB UPLOREF = UPLO MREF = M NREF = N KREF = K ALPHAREF = ALPHA IAREF = IA JAREF = JA DO 10 I = 1, DLEN_ DESCAREF( I ) = DESCA( I ) 10 CONTINUE IBREF = IB JBREF = JB DO 20 I = 1, DLEN_ DESCBREF( I ) = DESCB( I ) 20 CONTINUE BETAREF = BETA ICREF = IC JCREF = JC DO 30 I = 1, DLEN_ DESCCREF( I ) = DESCC( I ) 30 CONTINUE * ELSE * * Test saved args. Return with first mismatch. * ARGNAME = ' ' IF( .NOT. LSAME( DIAG, DIAGREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DIAG' ELSE IF( .NOT. LSAME( SIDE, SIDEREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'SIDE' ELSE IF( .NOT. LSAME( TRANSA, TRANSAREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANSA' ELSE IF( .NOT. LSAME( TRANSB, TRANSBREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'TRANSB' ELSE IF( .NOT. LSAME( UPLO, UPLOREF ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'UPLO' ELSE IF( M.NE.MREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'M' ELSE IF( N.NE.NREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'N' ELSE IF( K.NE.KREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'K' ELSE IF( ALPHA.NE.ALPHAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'ALPHA' ELSE IF( IA.NE.IAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IA' ELSE IF( JA.NE.JAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JA' ELSE IF( DESCA( DTYPE_ ).NE.DESCAREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( DTYPE_ )' ELSE IF( DESCA( M_ ).NE.DESCAREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( M_ )' ELSE IF( DESCA( N_ ).NE.DESCAREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( N_ )' ELSE IF( DESCA( IMB_ ).NE.DESCAREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( IMB_ )' ELSE IF( DESCA( INB_ ).NE.DESCAREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( INB_ )' ELSE IF( DESCA( MB_ ).NE.DESCAREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( MB_ )' ELSE IF( DESCA( NB_ ).NE.DESCAREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( NB_ )' ELSE IF( DESCA( RSRC_ ).NE.DESCAREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( RSRC_ )' ELSE IF( DESCA( CSRC_ ).NE.DESCAREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CSRC_ )' ELSE IF( DESCA( CTXT_ ).NE.DESCAREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( CTXT_ )' ELSE IF( DESCA( LLD_ ).NE.DESCAREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCA( LLD_ )' ELSE IF( IB.NE.IBREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IB' ELSE IF( JB.NE.JBREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JB' ELSE IF( DESCB( DTYPE_ ).NE.DESCBREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( DTYPE_ )' ELSE IF( DESCB( M_ ).NE.DESCBREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( M_ )' ELSE IF( DESCB( N_ ).NE.DESCBREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( N_ )' ELSE IF( DESCB( IMB_ ).NE.DESCBREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( IMB_ )' ELSE IF( DESCB( INB_ ).NE.DESCBREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( INB_ )' ELSE IF( DESCB( MB_ ).NE.DESCBREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( MB_ )' ELSE IF( DESCB( NB_ ).NE.DESCBREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( NB_ )' ELSE IF( DESCB( RSRC_ ).NE.DESCBREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( RSRC_ )' ELSE IF( DESCB( CSRC_ ).NE.DESCBREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( CSRC_ )' ELSE IF( DESCB( CTXT_ ).NE.DESCBREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( CTXT_ )' ELSE IF( DESCB( LLD_ ).NE.DESCBREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCB( LLD_ )' ELSE IF( BETA.NE.BETAREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'BETA' ELSE IF( IC.NE.ICREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'IC' ELSE IF( JC.NE.JCREF ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'JC' ELSE IF( DESCC( DTYPE_ ).NE.DESCCREF( DTYPE_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( DTYPE_ )' ELSE IF( DESCC( M_ ).NE.DESCCREF( M_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( M_ )' ELSE IF( DESCC( N_ ).NE.DESCCREF( N_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( N_ )' ELSE IF( DESCC( IMB_ ).NE.DESCCREF( IMB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( IMB_ )' ELSE IF( DESCC( INB_ ).NE.DESCCREF( INB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( INB_ )' ELSE IF( DESCC( MB_ ).NE.DESCCREF( MB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( MB_ )' ELSE IF( DESCC( NB_ ).NE.DESCCREF( NB_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( NB_ )' ELSE IF( DESCC( RSRC_ ).NE.DESCCREF( RSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( RSRC_ )' ELSE IF( DESCC( CSRC_ ).NE.DESCCREF( CSRC_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( CSRC_ )' ELSE IF( DESCC( CTXT_ ).NE.DESCCREF( CTXT_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( CTXT_ )' ELSE IF( DESCC( LLD_ ).NE.DESCCREF( LLD_ ) ) THEN WRITE( ARGNAME, FMT = '(A)' ) 'DESCC( LLD_ )' ELSE INFO = 0 END IF * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * IF( INFO.NE.0 ) THEN WRITE( NOUT, FMT = 9999 ) ARGNAME, SNAME ELSE WRITE( NOUT, FMT = 9998 ) SNAME END IF * END IF * END IF * 9999 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' FAILED changed ', A, ' *****' ) 9998 FORMAT( 2X, ' ***** Input-only parameter check: ', A, $ ' PASSED *****' ) * RETURN * * End of PZCHKARG3 * END SUBROUTINE PZBLAS3TSTCHK( ICTXT, NOUT, NROUT, SIDE, UPLO, TRANSA, $ TRANSB, DIAG, M, N, K, ALPHA, A, PA, IA, $ JA, DESCA, B, PB, IB, JB, DESCB, BETA, $ C, PC, IC, JC, DESCC, THRESH, ROGUE, $ WORK, RWORK, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N, $ NOUT, NROUT REAL THRESH COMPLEX*16 ALPHA, BETA, ROGUE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) DOUBLE PRECISION RWORK( * ) COMPLEX*16 A( * ), B( * ), C( * ), PA( * ), PB( * ), $ PC( * ), WORK( * ) * .. * * Purpose * ======= * * PZBLAS3TSTCHK performs the computational tests of the Level 3 PBLAS. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * NROUT (global input) INTEGER * On entry, NROUT specifies which routine will be tested as * follows: * If NROUT = 1, PZGEMM will be tested; * else if NROUT = 2, PZSYMM will be tested; * else if NROUT = 3, PZHEMM will be tested; * else if NROUT = 4, PZSYRK will be tested; * else if NROUT = 5, PZHERK will be tested; * else if NROUT = 6, PZSYR2K will be tested; * else if NROUT = 7, PZHER2K will be tested; * else if NROUT = 8, PZTRMM will be tested; * else if NROUT = 9, PZTRSM will be tested; * else if NROUT = 10, PZGEADD will be tested; * else if NROUT = 11, PZTRADD will be tested; * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies if the multiplication should be per- * formed from the left or the right. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies if the upper or lower part of the * matrix operand is to be referenced. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies if the matrix operand A is to be * transposed. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies if the matrix operand B is to be * transposed. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the triangular matrix operand is * unit or non-unit. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANSA = 'N' (resp. TRANSA <> 'N') in PxGEMM, PxSYRK, * PxSYR2K, PxHERK and PxHER2K. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX*16 array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) COMPLEX*16 array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * PB (local input) COMPLEX*16 array * On entry, PB is an array of dimension (DESCB( LLD_ ),*). This * array contains the local entries of the matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX*16 array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * THRESH (global input) REAL * On entry, THRESH is the threshold value for the test ratio. * * ROGUE (global input) COMPLEX*16 * On entry, ROGUE specifies the constant used to pad the * non-referenced part of triangular, symmetric or Hermitian ma- * trices. * * WORK (workspace) COMPLEX*16 array * On entry, WORK is an array of dimension LWORK where LWORK is * at least MAX( M, MAX( N, K ) ). This array is used to store * a copy of a column of C (see PZMMCH). * * RWORK (workspace) DOUBLE PRECISION array * On entry, RWORK is an array of dimension LRWORK where LRWORK * is at least MAX( M, MAX( N, K ) ). This array is used to sto- * re the computed gauges (see PZMMCH). * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, otherwise * if( MOD( INFO, 2 ) = 1 ) then an error on A has been found, * if( MOD( INFO/2, 2 ) = 1 ) then an error on B has been found, * if( MOD( INFO/4, 2 ) = 1 ) then an error on C has been found. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D+0 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR COMPLEX*16 ALPHA1, BETA1 * .. * .. Local Arrays .. INTEGER IERR( 3 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_ZLASET, PZCHKMIN, PZMMCH, $ PZMMCH1, PZMMCH2, PZMMCH3, PZTRMM, ZTRSM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 10 I = 1, 3 IERR( I ) = 0 10 CONTINUE * IF( NROUT.EQ.1 ) THEN * * Test PZGEMM * * Check the resulting matrix C * CALL PZMMCH( ICTXT, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, $ DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, JC, $ DESCC, WORK, RWORK, ERR, IERR( 3 ) ) * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZCHKMIN( ERR, M, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PZCHKMIN( ERR, K, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF IF( LSAME( TRANSB, 'N' ) ) THEN CALL PZCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PZCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.2 ) THEN * * Test PZSYMM * * Check the resulting matrix C * IF( LSAME( SIDE, 'L' ) ) THEN CALL PZMMCH( ICTXT, 'No transpose', 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, $ BETA, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) ELSE CALL PZMMCH( ICTXT, 'No transpose', 'No transpose', M, N, N, $ ALPHA, B, IB, JB, DESCB, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_ZLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_ZLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF * IF( LSAME( SIDE, 'L' ) ) THEN CALL PZCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PZCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF CALL PZCHKMIN( ERR, M, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) * ELSE IF( NROUT.EQ.3 ) THEN * * Test PZHEMM * * Check the resulting matrix C * IF( LSAME( SIDE, 'L' ) ) THEN CALL PZMMCH( ICTXT, 'No transpose', 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, $ BETA, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) ELSE CALL PZMMCH( ICTXT, 'No transpose', 'No transpose', M, N, N, $ ALPHA, B, IB, JB, DESCB, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_ZLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) END IF ELSE IF( LSAME( SIDE, 'L' ) ) THEN CALL PB_ZLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF * IF( LSAME( SIDE, 'L' ) ) THEN CALL PZCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PZCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF CALL PZCHKMIN( ERR, M, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) * ELSE IF( NROUT.EQ.4 ) THEN * * Test PZSYRK * * Check the resulting matrix C * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZMMCH1( ICTXT, UPLO, 'No transpose', N, K, ALPHA, A, $ IA, JA, DESCA, BETA, C, PC, IC, JC, DESCC, $ WORK, RWORK, ERR, IERR( 3 ) ) ELSE CALL PZMMCH1( ICTXT, UPLO, 'Transpose', N, K, ALPHA, A, IA, $ JA, DESCA, BETA, C, PC, IC, JC, DESCC, WORK, $ RWORK, ERR, IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PZCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.5 ) THEN * * Test PZHERK * * Check the resulting matrix C * BETA1 = DCMPLX( DBLE( BETA ), RZERO ) ALPHA1 = DCMPLX( DBLE( ALPHA ), RZERO ) IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZMMCH1( ICTXT, UPLO, 'Hermitian', N, K, ALPHA1, A, IA, $ JA, DESCA, BETA1, C, PC, IC, JC, DESCC, WORK, $ RWORK, ERR, IERR( 3 ) ) ELSE CALL PZMMCH1( ICTXT, UPLO, 'Conjugate transpose', N, K, $ ALPHA1, A, IA, JA, DESCA, BETA1, C, PC, IC, $ JC, DESCC, WORK, RWORK, ERR, IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PZCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.6 ) THEN * * Test PZSYR2K * * Check the resulting matrix C * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZMMCH2( ICTXT, UPLO, 'No transpose', N, K, ALPHA, A, $ IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, $ IC, JC, DESCC, WORK, RWORK, ERR, IERR( 3 ) ) ELSE CALL PZMMCH2( ICTXT, UPLO, 'Transpose', N, K, ALPHA, A, $ IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, $ IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PZCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PZCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PZCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.7 ) THEN * * Test PZHER2K * * Check the resulting matrix C * BETA1 = DCMPLX( DBLE( BETA ), RZERO ) IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZMMCH2( ICTXT, UPLO, 'Hermitian', N, K, ALPHA, A, IA, $ JA, DESCA, B, IB, JB, DESCB, BETA1, C, PC, IC, $ JC, DESCC, WORK, RWORK, ERR, IERR( 3 ) ) ELSE CALL PZMMCH2( ICTXT, UPLO, 'Conjugate transpose', N, K, $ ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, $ BETA1, C, PC, IC, JC, DESCC, WORK, RWORK, ERR, $ IERR( 3 ) ) END IF * IF( IERR( 3 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZCHKMIN( ERR, N, K, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PZCHKMIN( ERR, N, K, B, PB, IB, JB, DESCB, IERR( 2 ) ) ELSE CALL PZCHKMIN( ERR, K, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) CALL PZCHKMIN( ERR, K, N, B, PB, IB, JB, DESCB, IERR( 2 ) ) END IF * ELSE IF( NROUT.EQ.8 ) THEN * * Test PZTRMM * * Check the resulting matrix B * IF( LSAME( SIDE, 'L' ) ) THEN CALL PZMMCH( ICTXT, TRANSA, 'No transpose', M, N, M, $ ALPHA, A, IA, JA, DESCA, C, IB, JB, DESCB, $ ZERO, B, PB, IB, JB, DESCB, WORK, RWORK, ERR, $ IERR( 2 ) ) ELSE CALL PZMMCH( ICTXT, 'No transpose', TRANSA, M, N, N, $ ALPHA, C, IB, JB, DESCB, A, IA, JA, DESCA, $ ZERO, B, PB, IB, JB, DESCB, WORK, RWORK, ERR, $ IERR( 2 ) ) END IF * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( SIDE, 'L' ) ) THEN IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Upper', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Lower', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PZCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PZCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.9 ) THEN * * Test PZTRSM * * Check the resulting matrix B * CALL ZTRSM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ A( IA+(JA-1)*DESCA( M_ ) ), DESCA( M_ ), $ B( IB+(JB-1)*DESCB( M_ ) ), DESCB( M_ ) ) CALL PZTRMM( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, PA, IA, JA, $ DESCA, PB, IB, JB, DESCB ) IF( LSAME( SIDE, 'L' ) ) THEN CALL PZMMCH( ICTXT, TRANSA, 'No transpose', M, N, M, ALPHA, $ A, IA, JA, DESCA, B, IB, JB, DESCB, ZERO, C, $ PB, IB, JB, DESCB, WORK, RWORK, ERR, $ IERR( 2 ) ) ELSE CALL PZMMCH( ICTXT, 'No transpose', TRANSA, M, N, N, ALPHA, $ B, IB, JB, DESCB, A, IA, JA, DESCA, ZERO, C, $ PB, IB, JB, DESCB, WORK, RWORK, ERR, $ IERR( 2 ) ) END IF * IF( IERR( 2 ).NE.0 ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) ELSE IF( ERR.GT.DBLE( THRESH ) ) THEN IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9997 ) ERR END IF * * Check the input-only arguments * IF( LSAME( SIDE, 'L' ) ) THEN IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Upper', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Upper', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Lower', M-1, M-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Lower', M, M, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PZCHKMIN( ERR, M, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE IF( LSAME( UPLO, 'L' ) ) THEN IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Upper', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+JA*DESCA( M_ ) ), DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Upper', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF ELSE IF( LSAME( DIAG, 'N' ) ) THEN CALL PB_ZLASET( 'Lower', N-1, N-1, 0, ROGUE, ROGUE, $ A( IA+1+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) ELSE CALL PB_ZLASET( 'Lower', N, N, 0, ROGUE, ONE, $ A( IA+(JA-1)*DESCA( M_ ) ), $ DESCA( M_ ) ) END IF END IF CALL PZCHKMIN( ERR, N, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF ELSE IF( NROUT.EQ.10 ) THEN * * Test PZGEADD * * Check the resulting matrix C * CALL PZMMCH3( 'All', TRANSA, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, IERR( 3 ) ) * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PZCHKMIN( ERR, N, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * ELSE IF( NROUT.EQ.11 ) THEN * * Test PZTRADD * * Check the resulting matrix C * CALL PZMMCH3( UPLO, TRANSA, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, IERR( 3 ) ) * * Check the input-only arguments * IF( LSAME( TRANSA, 'N' ) ) THEN CALL PZCHKMIN( ERR, M, N, A, PA, IA, JA, DESCA, IERR( 1 ) ) ELSE CALL PZCHKMIN( ERR, N, M, A, PA, IA, JA, DESCA, IERR( 1 ) ) END IF * END IF * IF( IERR( 1 ).NE.0 ) THEN INFO = INFO + 1 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'A' END IF * IF( IERR( 2 ).NE.0 ) THEN INFO = INFO + 2 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'B' END IF * IF( IERR( 3 ).NE.0 ) THEN INFO = INFO + 4 IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'C' END IF * 9999 FORMAT( 2X, ' ***** ERROR: Matrix operand ', A, $ ' is incorrect.' ) 9998 FORMAT( 2X, ' ***** FATAL ERROR - Computed result is less ', $ 'than half accurate *****' ) 9997 FORMAT( 2X, ' ***** Test completed with maximum test ratio: ', $ F11.5, ' SUSPECT *****' ) * RETURN * * End of PZBLAS3TSTCHK * END scalapack-2.0.2/PBLAS/TESTING/pzblastst.f000644 000766 000024 00001472232 11622500733 020076 0ustar00juliestaff000000 000000 SUBROUTINE PZOPTEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PZOPTEE tests whether the PBLAS respond correctly to a bad option * argument. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PZCHKOPT * .. * .. Executable Statements .. * * Level 2 PBLAS * IF( SCODE.EQ.21 ) THEN * * Check 1st (and only) option * APOS = 1 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.22 .OR. SCODE.EQ.25 .OR. SCODE.EQ.26 .OR. $ SCODE.EQ.27 ) THEN * * Check 1st (and only) option * APOS = 1 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st option * APOS = 1 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 2nd option * APOS = 2 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 3rd option * APOS = 3 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'D', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st option * APOS = 1 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2'nd option * APOS = 2 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * ELSE IF( SCODE.EQ.32 ) THEN * * Check 1st option * APOS = 1 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'S', APOS ) * * Check 2nd option * APOS = 2 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 .OR. SCODE.EQ.35 .OR. $ SCODE.EQ.36 .OR. SCODE.EQ.40 ) THEN * * Check 1st option * APOS = 1 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 2'nd option * APOS = 2 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st option * APOS = 1 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'S', APOS ) * * Check 2nd option * APOS = 2 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'U', APOS ) * * Check 3rd option * APOS = 3 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 4th option * APOS = 4 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'D', APOS ) * * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st option * APOS = 1 CALL PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * END IF * RETURN * * End of PZOPTEE * END SUBROUTINE PZCHKOPT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PZCHKOPT tests the option ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the option to be * checked. ARGNAM can either be 'D', 'S', 'A', 'B', or 'U'. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the option ARGNAM * to be tested. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER INFOT * .. * .. External Subroutines .. EXTERNAL PCHKPBE, PZCALLSUB, PZSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. CHARACTER DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO * .. * .. Executable Statements .. * * Reiniatilize the dummy arguments to correct values * CALL PZSETPBLAS( ICTXT ) * IF( LSAME( ARGNAM, 'D' ) ) THEN * * Generate bad DIAG option * DIAG = '/' * ELSE IF( LSAME( ARGNAM, 'S' ) ) THEN * * Generate bad SIDE option * SIDE = '/' * ELSE IF( LSAME( ARGNAM, 'A' ) ) THEN * * Generate bad TRANSA option * TRANSA = '/' * ELSE IF( LSAME( ARGNAM, 'B' ) ) THEN * * Generate bad TRANSB option * TRANSB = '/' * ELSE IF( LSAME( ARGNAM, 'U' ) ) THEN * * Generate bad UPLO option * UPLO = '/' * END IF * * Set INFOT to the position of the bad dimension argument * INFOT = ARGPOS * * Call the PBLAS routine * CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * RETURN * * End of PZCHKOPT * END SUBROUTINE PZDIMEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PZDIMEE tests whether the PBLAS respond correctly to a bad dimension * argument. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PZCHKDIM * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 .OR. SCODE.EQ.12 .OR. SCODE.EQ.13 .OR. $ SCODE.EQ.14 .OR. SCODE.EQ.15 ) THEN * * Check 1st (and only) dimension * APOS = 1 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * * Check 1st dimension * APOS = 2 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 3 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.22 .OR. SCODE.EQ.25 .OR. SCODE.EQ.26 .OR. $ SCODE.EQ.27 ) THEN * * Check 1st (and only) dimension * APOS = 2 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st (and only) dimension * APOS = 4 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.24 ) THEN * * Check 1st dimension * APOS = 1 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 2 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st dimension * APOS = 3 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Check 3rd dimension * APOS = 5 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'K', APOS ) * ELSE IF( SCODE.EQ.32 ) THEN * * Check 1st dimension * APOS = 3 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 .OR. SCODE.EQ.35 .OR. $ SCODE.EQ.36 ) THEN * * Check 1st dimension * APOS = 3 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * * Check 2nd dimension * APOS = 4 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'K', APOS ) * ELSE IF( SCODE.EQ.37 ) THEN * * Check 1st dimension * APOS = 1 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 2 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st dimension * APOS = 5 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 6 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st dimension * APOS = 2 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 3 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * ELSE IF( SCODE.EQ.40 ) THEN * * Check 1st dimension * APOS = 3 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'M', APOS ) * * Check 2nd dimension * APOS = 4 CALL PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'N', APOS ) * END IF * RETURN * * End of PZDIMEE * END SUBROUTINE PZCHKDIM( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PZCHKDIM tests the dimension ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the dimension to be * checked. ARGNAM can either be 'M', 'N' or 'K'. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the option ARGNAM * to be tested. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER INFOT * .. * .. External Subroutines .. EXTERNAL PCHKPBE, PZCALLSUB, PZSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. INTEGER KDIM, MDIM, NDIM COMMON /PBLASN/KDIM, MDIM, NDIM * .. * .. Executable Statements .. * * Reiniatilize the dummy arguments to correct values * CALL PZSETPBLAS( ICTXT ) * IF( LSAME( ARGNAM, 'M' ) ) THEN * * Generate bad MDIM * MDIM = -1 * ELSE IF( LSAME( ARGNAM, 'N' ) ) THEN * * Generate bad NDIM * NDIM = -1 * ELSE * * Generate bad KDIM * KDIM = -1 * END IF * * Set INFOT to the position of the bad dimension argument * INFOT = ARGPOS * * Call the PBLAS routine * CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * RETURN * * End of PZCHKDIM * END SUBROUTINE PZVECEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*7 SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PZVECEE tests whether the PBLAS respond correctly to a bad vector * argument. Each vector is described by: , I, J, * DESC, INC. Out of all these, only I, J, * DESC, and INC can be tested. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PZCHKMAT * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 ) THEN * * Check 1st vector * APOS = 2 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 7 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.12 .OR. SCODE.EQ.15 ) THEN * * Check 1st (and only) vector * APOS = 3 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * ELSE IF( SCODE.EQ.13 ) THEN * * Check 1st vector * APOS = 3 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 8 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.14 ) THEN * * Check 1st (and only) vector * APOS = 4 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * * Check 1st vector * APOS = 9 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 15 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.22 ) THEN * * Check 1st vector * APOS = 8 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 14 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.23 ) THEN * * Check 1st (and only) vector * APOS = 9 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * ELSE IF( SCODE.EQ.24 .OR. SCODE.EQ.27 ) THEN * * Check 1st vector * APOS = 4 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * * Check 2nd vector * APOS = 9 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'Y', APOS ) * ELSE IF( SCODE.EQ.26 .OR. SCODE.EQ.27 ) THEN * * Check 1'st (and only) vector * APOS = 4 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'X', APOS ) * END IF * RETURN * * End of PZVECEE * END SUBROUTINE PZMATEE( ICTXT, NOUT, SUBPTR, SCODE, SNAME ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*7 SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PZMATEE tests whether the PBLAS respond correctly to a bad matrix * argument. Each matrix is described by: , I, J, * and DESC. Out of all these, only I, J and DESC * can be tested. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER APOS * .. * .. External Subroutines .. EXTERNAL PZCHKMAT * .. * .. Executable Statements .. * * Level 2 PBLAS * IF( SCODE.EQ.21 .OR. SCODE.EQ.23 ) THEN * * Check 1st (and only) matrix * APOS = 5 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.22 ) THEN * * Check 1st (and only) matrix * APOS = 4 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.24 .OR. SCODE.EQ.27 ) THEN * * Check 1st (and only) matrix * APOS = 14 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * ELSE IF( SCODE.EQ.25 .OR. SCODE.EQ.26 ) THEN * * Check 1st (and only) matrix * APOS = 9 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * * Check 1st matrix * APOS = 7 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * * Check 3nd matrix * APOS = 16 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.32 .OR. SCODE.EQ.35 .OR. SCODE.EQ.36 ) THEN * * Check 1st matrix * APOS = 6 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 10 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * * Check 3nd matrix * APOS = 15 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.33 .OR. SCODE.EQ.34 ) THEN * * Check 1st matrix * APOS = 6 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.37 ) THEN * * Check 1st matrix * APOS = 4 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 9 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.38 ) THEN * * Check 1st matrix * APOS = 8 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 12 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'B', APOS ) * ELSE IF( SCODE.EQ.39 ) THEN * * Check 1st matrix * APOS = 5 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 10 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * ELSE IF( SCODE.EQ.40 ) THEN * * Check 1st matrix * APOS = 6 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'A', APOS ) * * Check 2nd matrix * APOS = 11 CALL PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, 'C', APOS ) * END IF * RETURN * * End of PZMATEE * END SUBROUTINE PZSETPBLAS( ICTXT ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT * .. * * Purpose * ======= * * PZSETPBLAS initializes *all* the dummy arguments to correct values. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION RONE COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ RONE = 1.0D+0 ) * .. * .. External Subroutines .. EXTERNAL PB_DESCSET2 * .. * .. Common Blocks .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY, KDIM, MDIM, NDIM DOUBLE PRECISION USCLR COMPLEX*16 SCLR INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) COMPLEX*16 A( 2, 2 ), B( 2, 2 ), C( 2, 2 ), X( 2 ), Y( 2 ) COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY COMMON /PBLASM/A, B, C COMMON /PBLASN/KDIM, MDIM, NDIM COMMON /PBLASS/SCLR, USCLR COMMON /PBLASV/X, Y * .. * .. Executable Statements .. * * Set default values for options * DIAG = 'N' SIDE = 'L' TRANSA = 'N' TRANSB = 'N' UPLO = 'U' * * Set default values for scalars * KDIM = 1 MDIM = 1 NDIM = 1 ISCLR = 1 SCLR = ONE USCLR = RONE * * Set default values for distributed matrix A * A( 1, 1 ) = ONE A( 2, 1 ) = ONE A( 1, 2 ) = ONE A( 2, 2 ) = ONE IA = 1 JA = 1 CALL PB_DESCSET2( DESCA, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix B * B( 1, 1 ) = ONE B( 2, 1 ) = ONE B( 1, 2 ) = ONE B( 2, 2 ) = ONE IB = 1 JB = 1 CALL PB_DESCSET2( DESCB, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix C * C( 1, 1 ) = ONE C( 2, 1 ) = ONE C( 1, 2 ) = ONE C( 2, 2 ) = ONE IC = 1 JC = 1 CALL PB_DESCSET2( DESCC, 2, 2, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) * * Set default values for distributed matrix X * X( 1 ) = ONE X( 2 ) = ONE IX = 1 JX = 1 CALL PB_DESCSET2( DESCX, 2, 1, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) INCX = 1 * * Set default values for distributed matrix Y * Y( 1 ) = ONE Y( 2 ) = ONE IY = 1 JY = 1 CALL PB_DESCSET2( DESCY, 2, 1, 1, 1, 1, 1, 0, 0, ICTXT, 2 ) INCY = 1 * RETURN * * End of PZSETPBLAS * END SUBROUTINE PZCHKMAT( ICTXT, NOUT, SUBPTR, SCODE, SNAME, ARGNAM, $ ARGPOS ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 ARGNAM INTEGER ARGPOS, ICTXT, NOUT, SCODE * .. * .. Array Arguments .. CHARACTER*(*) SNAME * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PZCHKMAT tests the matrix (or vector) ARGNAM in any PBLAS routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * SNAME (global input) CHARACTER*(*) * On entry, SNAME specifies the subroutine name calling this * subprogram. * * ARGNAM (global input) CHARACTER*(*) * On entry, ARGNAM specifies the name of the matrix or vector * to be checked. ARGNAM can either be 'A', 'B' or 'C' when one * wants to check a matrix, and 'X' or 'Y' for a vector. * * ARGPOS (global input) INTEGER * On entry, ARGPOS indicates the position of the first argument * of the matrix (or vector) ARGNAM. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER DESCMULT PARAMETER ( DESCMULT = 100 ) * .. * .. Local Scalars .. INTEGER I, INFOT, NPROW, NPCOL, MYROW, MYCOL * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PCHKPBE, PZCALLSUB, PZSETPBLAS * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Common Blocks .. INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( LSAME( ARGNAM, 'A' ) ) THEN * * Check IA. Set all other OK, bad IA * CALL PZSETPBLAS( ICTXT ) IA = -1 INFOT = ARGPOS + 1 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JA. Set all other OK, bad JA * CALL PZSETPBLAS( ICTXT ) JA = -1 INFOT = ARGPOS + 2 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCA. Set all other OK, bad DESCA * DO 10 I = 1, DLEN_ * * Set I'th entry of DESCA to incorrect value, rest ok. * CALL PZSETPBLAS( ICTXT ) DESCA( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCA, CSRCA, LDA * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PZSETPBLAS( ICTXT ) * * Test RSRCA >= NPROW * IF( I.EQ.RSRC_ ) $ DESCA( I ) = NPROW * * Test CSRCA >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCA( I ) = NPCOL * * Test LDA >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCA( I ) = 1 ELSE DESCA( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 10 CONTINUE * ELSE IF( LSAME( ARGNAM, 'B' ) ) THEN * * Check IB. Set all other OK, bad IB * CALL PZSETPBLAS( ICTXT ) IB = -1 INFOT = ARGPOS + 1 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JB. Set all other OK, bad JB * CALL PZSETPBLAS( ICTXT ) JB = -1 INFOT = ARGPOS + 2 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCB. Set all other OK, bad DESCB * DO 20 I = 1, DLEN_ * * Set I'th entry of DESCB to incorrect value, rest ok. * CALL PZSETPBLAS( ICTXT ) DESCB( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCB, CSRCB, LDB * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PZSETPBLAS( ICTXT ) * * Test RSRCB >= NPROW * IF( I.EQ.RSRC_ ) $ DESCB( I ) = NPROW * * Test CSRCB >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCB( I ) = NPCOL * * Test LDB >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCB( I ) = 1 ELSE DESCB( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 20 CONTINUE * ELSE IF( LSAME( ARGNAM, 'C' ) ) THEN * * Check IC. Set all other OK, bad IC * CALL PZSETPBLAS( ICTXT ) IC = -1 INFOT = ARGPOS + 1 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JC. Set all other OK, bad JC * CALL PZSETPBLAS( ICTXT ) JC = -1 INFOT = ARGPOS + 2 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCC. Set all other OK, bad DESCC * DO 30 I = 1, DLEN_ * * Set I'th entry of DESCC to incorrect value, rest ok. * CALL PZSETPBLAS( ICTXT ) DESCC( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCC, CSRCC, LDC * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PZSETPBLAS( ICTXT ) * * Test RSRCC >= NPROW * IF( I.EQ.RSRC_ ) $ DESCC( I ) = NPROW * * Test CSRCC >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCC( I ) = NPCOL * * Test LDC >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCC( I ) = 1 ELSE DESCC( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 30 CONTINUE * ELSE IF( LSAME( ARGNAM, 'X' ) ) THEN * * Check IX. Set all other OK, bad IX * CALL PZSETPBLAS( ICTXT ) IX = -1 INFOT = ARGPOS + 1 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JX. Set all other OK, bad JX * CALL PZSETPBLAS( ICTXT ) JX = -1 INFOT = ARGPOS + 2 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCX. Set all other OK, bad DESCX * DO 40 I = 1, DLEN_ * * Set I'th entry of DESCX to incorrect value, rest ok. * CALL PZSETPBLAS( ICTXT ) DESCX( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCX, CSRCX, LDX * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PZSETPBLAS( ICTXT ) * * Test RSRCX >= NPROW * IF( I.EQ.RSRC_ ) $ DESCX( I ) = NPROW * * Test CSRCX >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCX( I ) = NPCOL * * Test LDX >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCX( I ) = 1 ELSE DESCX( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 40 CONTINUE * * Check INCX. Set all other OK, bad INCX * CALL PZSETPBLAS( ICTXT ) INCX = -1 INFOT = ARGPOS + 4 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * ELSE * * Check IY. Set all other OK, bad IY * CALL PZSETPBLAS( ICTXT ) IY = -1 INFOT = ARGPOS + 1 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check JY. Set all other OK, bad JY * CALL PZSETPBLAS( ICTXT ) JY = -1 INFOT = ARGPOS + 2 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Check DESCY. Set all other OK, bad DESCY * DO 50 I = 1, DLEN_ * * Set I'th entry of DESCY to incorrect value, rest ok. * CALL PZSETPBLAS( ICTXT ) DESCY( I ) = -2 INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * * Extra tests for RSRCY, CSRCY, LDY * IF( ( I.EQ.RSRC_ ) .OR. ( I.EQ.CSRC_ ) .OR. $ ( I.EQ.LLD_ ) ) THEN * CALL PZSETPBLAS( ICTXT ) * * Test RSRCY >= NPROW * IF( I.EQ.RSRC_ ) $ DESCY( I ) = NPROW * * Test CSRCY >= NPCOL * IF( I.EQ.CSRC_ ) $ DESCY( I ) = NPCOL * * Test LDY >= MAX(1, PB_NUMROC(...)). Set to 1 as mat 2x2. * IF( I.EQ.LLD_ ) THEN IF( MYROW.EQ.0 .AND.MYCOL.EQ.0 ) THEN DESCY( I ) = 1 ELSE DESCY( I ) = 0 END IF END IF * INFOT = ( ( ARGPOS + 3 ) * DESCMULT ) + I CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * 50 CONTINUE * * Check INCY. Set all other OK, bad INCY * CALL PZSETPBLAS( ICTXT ) INCY = -1 INFOT = ARGPOS + 4 CALL PZCALLSUB( SUBPTR, SCODE ) CALL PCHKPBE( ICTXT, NOUT, SNAME, INFOT ) * END IF * RETURN * * End of PZCHKMAT * END SUBROUTINE PZCALLSUB( SUBPTR, SCODE ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER SCODE * .. * .. Subroutine Arguments .. EXTERNAL SUBPTR * .. * * Purpose * ======= * * PZCALLSUB calls the subroutine SUBPTR with the calling sequence iden- * tified by SCODE. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SUBPTR (global input) SUBROUTINE * On entry, SUBPTR is a subroutine. SUBPTR must be declared * EXTERNAL in the calling subroutine. * * SCODE (global input) INTEGER * On entry, SCODE specifies the calling sequence code. * * Calling sequence encodings * ========================== * * code Formal argument list Examples * * 11 (n, v1,v2) _SWAP, _COPY * 12 (n,s1, v1 ) _SCAL, _SCAL * 13 (n,s1, v1,v2) _AXPY, _DOT_ * 14 (n,s1,i1,v1 ) _AMAX * 15 (n,u1, v1 ) _ASUM, _NRM2 * * 21 ( trans, m,n,s1,m1,v1,s2,v2) _GEMV * 22 (uplo, n,s1,m1,v1,s2,v2) _SYMV, _HEMV * 23 (uplo,trans,diag, n, m1,v1 ) _TRMV, _TRSV * 24 ( m,n,s1,v1,v2,m1) _GER_ * 25 (uplo, n,s1,v1, m1) _SYR * 26 (uplo, n,u1,v1, m1) _HER * 27 (uplo, n,s1,v1,v2,m1) _SYR2, _HER2 * * 31 ( transa,transb, m,n,k,s1,m1,m2,s2,m3) _GEMM * 32 (side,uplo, m,n, s1,m1,m2,s2,m3) _SYMM, _HEMM * 33 ( uplo,trans, n,k,s1,m1, s2,m3) _SYRK * 34 ( uplo,trans, n,k,u1,m1, u2,m3) _HERK * 35 ( uplo,trans, n,k,s1,m1,m2,s2,m3) _SYR2K * 36 ( uplo,trans, n,k,s1,m1,m2,u2,m3) _HER2K * 37 ( m,n, s1,m1, s2,m3) _TRAN_ * 38 (side,uplo,transa, diag,m,n, s1,m1,m2 ) _TRMM, _TRSM * 39 ( trans, m,n, s1,m1, s2,m3) _GEADD * 40 ( uplo,trans, m,n, s1,m1, s2,m3) _TRADD * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Common Blocks .. CHARACTER*1 DIAG, SIDE, TRANSA, TRANSB, UPLO INTEGER IA, IB, IC, INCX, INCY, ISCLR, IX, IY, JA, JB, $ JC, JX, JY, KDIM, MDIM, NDIM DOUBLE PRECISION USCLR COMPLEX*16 SCLR INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCC( DLEN_ ), $ DESCX( DLEN_ ), DESCY( DLEN_ ) COMPLEX*16 A( 2, 2 ), B( 2, 2 ), C( 2, 2 ), X( 2 ), Y( 2 ) COMMON /PBLASC/DIAG, SIDE, TRANSA, TRANSB, UPLO COMMON /PBLASD/DESCA, DESCB, DESCC, DESCX, DESCY COMMON /PBLASI/IA, IB, IC, INCX, INCY, ISCLR, IX, IY, $ JA, JB, JC, JX, JY COMMON /PBLASM/A, B, C COMMON /PBLASN/KDIM, MDIM, NDIM COMMON /PBLASS/SCLR, USCLR COMMON /PBLASV/X, Y * .. * .. Executable Statements .. * * Level 1 PBLAS * IF( SCODE.EQ.11 ) THEN * CALL SUBPTR( NDIM, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, $ INCY ) * ELSE IF( SCODE.EQ.12 ) THEN * CALL SUBPTR( NDIM, SCLR, X, IX, JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.13 ) THEN * CALL SUBPTR( NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, JY, $ DESCY, INCY ) * ELSE IF( SCODE.EQ.14 ) THEN * CALL SUBPTR( NDIM, SCLR, ISCLR, X, IX, JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.15 ) THEN * CALL SUBPTR( NDIM, USCLR, X, IX, JX, DESCX, INCX ) * * Level 2 PBLAS * ELSE IF( SCODE.EQ.21 ) THEN * CALL SUBPTR( TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX, SCLR, Y, IY, JY, DESCY, INCY ) * ELSE IF( SCODE.EQ.22 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, A, IA, JA, DESCA, X, IX, JX, $ DESCX, INCX, SCLR, Y, IY, JY, DESCY, INCY ) * ELSE IF( SCODE.EQ.23 ) THEN * CALL SUBPTR( UPLO, TRANSA, DIAG, NDIM, A, IA, JA, DESCA, X, IX, $ JX, DESCX, INCX ) * ELSE IF( SCODE.EQ.24 ) THEN * CALL SUBPTR( MDIM, NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, $ JY, DESCY, INCY, A, IA, JA, DESCA ) * ELSE IF( SCODE.EQ.25 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, X, IX, JX, DESCX, INCX, A, IA, $ JA, DESCA ) * ELSE IF( SCODE.EQ.26 ) THEN * CALL SUBPTR( UPLO, NDIM, USCLR, X, IX, JX, DESCX, INCX, A, IA, $ JA, DESCA ) * ELSE IF( SCODE.EQ.27 ) THEN * CALL SUBPTR( UPLO, NDIM, SCLR, X, IX, JX, DESCX, INCX, Y, IY, $ JY, DESCY, INCY, A, IA, JA, DESCA ) * * Level 3 PBLAS * ELSE IF( SCODE.EQ.31 ) THEN * CALL SUBPTR( TRANSA, TRANSB, MDIM, NDIM, KDIM, SCLR, A, IA, JA, $ DESCA, B, IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.32 ) THEN * CALL SUBPTR( SIDE, UPLO, MDIM, NDIM, SCLR, A, IA, JA, DESCA, B, $ IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.33 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.34 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, USCLR, A, IA, JA, DESCA, $ USCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.35 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ B, IB, JB, DESCB, SCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.36 ) THEN * CALL SUBPTR( UPLO, TRANSA, NDIM, KDIM, SCLR, A, IA, JA, DESCA, $ B, IB, JB, DESCB, USCLR, C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.37 ) THEN * CALL SUBPTR( MDIM, NDIM, SCLR, A, IA, JA, DESCA, SCLR, C, IC, $ JC, DESCC ) * ELSE IF( SCODE.EQ.38 ) THEN * CALL SUBPTR( SIDE, UPLO, TRANSA, DIAG, MDIM, NDIM, SCLR, A, IA, $ JA, DESCA, B, IB, JB, DESCB ) * ELSE IF( SCODE.EQ.39 ) THEN * CALL SUBPTR( TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, SCLR, $ C, IC, JC, DESCC ) * ELSE IF( SCODE.EQ.40 ) THEN * CALL SUBPTR( UPLO, TRANSA, MDIM, NDIM, SCLR, A, IA, JA, DESCA, $ SCLR, C, IC, JC, DESCC ) * END IF * RETURN * * End of PZCALLSUB * END SUBROUTINE PZERRSET( ERR, ERRMAX, XTRUE, X ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION ERR, ERRMAX COMPLEX*16 X, XTRUE * .. * * Purpose * ======= * * PZERRSET computes the absolute difference ERR = |XTRUE - X| and com- * pares it with zero. ERRMAX accumulates the absolute error difference. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERR (local output) DOUBLE PRECISION * On exit, ERR specifies the absolute difference |XTRUE - X|. * * ERRMAX (local input/local output) DOUBLE PRECISION * On entry, ERRMAX specifies a previously computed error. On * exit ERRMAX is the accumulated error MAX( ERRMAX, ERR ). * * XTRUE (local input) COMPLEX*16 * On entry, XTRUE specifies the true value. * * X (local input) COMPLEX*16 * On entry, X specifies the value to be compared to XTRUE. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. DOUBLE PRECISION PDDIFF EXTERNAL PDDIFF * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX * .. * .. Executable Statements .. * ERR = ABS( PDDIFF( DBLE( XTRUE ), DBLE( X ) ) ) ERR = MAX( ERR, ABS( PDDIFF( DIMAG( XTRUE ), DIMAG( X ) ) ) ) * ERRMAX = MAX( ERRMAX, ERR ) * RETURN * * End of PZERRSET * END SUBROUTINE PZCHKVIN( ERRMAX, N, X, PX, IX, JX, DESCX, INCX, $ INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INFO, IX, JX, N DOUBLE PRECISION ERRMAX * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX*16 PX( * ), X( * ) * .. * * Purpose * ======= * * PZCHKVIN checks that the submatrix sub( PX ) remained unchanged. The * local array entries are compared element by element, and their dif- * ference is tested against 0.0 as well as the epsilon machine. Notice * that this difference should be numerically exactly the zero machine, * but because of the possible fluctuation of some of the data we flag- * ged differently a difference less than twice the epsilon machine. The * largest error is also returned. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERRMAX (global output) DOUBLE PRECISION * On exit, ERRMAX specifies the largest absolute element-wise * difference between sub( X ) and sub( PX ). * * N (global input) INTEGER * On entry, N specifies the length of the subvector operand * sub( X ). N must be at least zero. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) COMPLEX*16 array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, ICURROW, IIX, IN, IXCOL, $ IXROW, J, JB, JJX, JN, KK, LDPX, LDX, LL, $ MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR, EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PB_INFOG2L, PZERRSET * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( N.LE.0 ) $ RETURN * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL PB_INFOG2L( IX, JX, DESCX, NPROW, NPCOL, MYROW, MYCOL, IIX, $ JJX, IXROW, IXCOL ) * LDX = DESCX( M_ ) LDPX = DESCX( LLD_ ) ROWREP = ( IXROW.EQ.-1 ) COLREP = ( IXCOL.EQ.-1 ) * IF( N.EQ.1 ) THEN * IF( ( MYROW.EQ.IXROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.IXCOL .OR. COLREP ) ) $ CALL PZERRSET( ERR, ERRMAX, X( IX+(JX-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) * ELSE IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * JB = DESCX( INB_ ) - JX + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCX( NB_ ) + 1 ) * DESCX( NB_ ) + JB JB = MIN( JB, N ) JN = JX + JB - 1 * IF( MYROW.EQ.IXROW .OR. ROWREP ) THEN * ICURCOL = IXCOL IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN DO 10 J = JX, JN CALL PZERRSET( ERR, ERRMAX, X( IX+(J-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) JJX = JJX + 1 10 CONTINUE END IF ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 30 J = JN+1, JX+N-1, DESCX( NB_ ) JB = MIN( JX+N-J, DESCX( NB_ ) ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * DO 20 KK = 0, JB-1 CALL PZERRSET( ERR, ERRMAX, X( IX+(J+KK-1)*LDX ), $ PX( IIX+(JJX+KK-1)*LDPX ) ) 20 CONTINUE * JJX = JJX + JB * END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 30 CONTINUE * END IF * ELSE * * sub( X ) is a column vector * IB = DESCX( IMB_ ) - IX + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCX( MB_ ) + 1 ) * DESCX( MB_ ) + IB IB = MIN( IB, N ) IN = IX + IB - 1 * IF( MYCOL.EQ.IXCOL .OR. COLREP ) THEN * ICURROW = IXROW IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 40 I = IX, IN CALL PZERRSET( ERR, ERRMAX, X( I+(JX-1)*LDX ), $ PX( IIX+(JJX-1)*LDPX ) ) IIX = IIX + 1 40 CONTINUE END IF ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IX+N-1, DESCX( MB_ ) IB = MIN( IX+N-I, DESCX( MB_ ) ) * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * DO 50 KK = 0, IB-1 CALL PZERRSET( ERR, ERRMAX, X( I+KK+(JX-1)*LDX ), $ PX( IIX+KK+(JJX-1)*LDPX ) ) 50 CONTINUE * IIX = IIX + IB * END IF * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PZCHKVIN * END SUBROUTINE PZCHKVOUT( N, X, PX, IX, JX, DESCX, INCX, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INFO, IX, JX, N * .. * .. Array Arguments .. INTEGER DESCX( * ) COMPLEX*16 PX( * ), X( * ) * .. * * Purpose * ======= * * PZCHKVOUT checks that the matrix PX \ sub( PX ) remained unchanged. * The local array entries are compared element by element, and their * difference is tested against 0.0 as well as the epsilon machine. No- * tice that this difference should be numerically exactly the zero ma- * chine, but because of the possible movement of some of the data we * flagged differently a difference less than twice the epsilon machine. * The largest error is reported. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector operand * sub( X ). N must be at least zero. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * PX (local input) COMPLEX*16 array * On entry, PX is an array of dimension (DESCX( LLD_ ),*). This * array contains the local entries of the matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, ICURROW, II, IMBX, INBX, $ J, JB, JJ, KK, LDPX, LDX, LL, MBX, MPALL, $ MYCOL, MYCOLDIST, MYROW, MYROWDIST, NBX, NPCOL, $ NPROW, NQALL DOUBLE PRECISION EPS, ERR, ERRMAX * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PZERRSET * .. * .. External Functions .. INTEGER PB_NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( ( DESCX( M_ ).LE.0 ).OR.( DESCX( N_ ).LE.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCX( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * MPALL = PB_NUMROC( DESCX( M_ ), 1, DESCX( IMB_ ), DESCX( MB_ ), $ MYROW, DESCX( RSRC_ ), NPROW ) NQALL = PB_NUMROC( DESCX( N_ ), 1, DESCX( INB_ ), DESCX( NB_ ), $ MYCOL, DESCX( CSRC_ ), NPCOL ) * MBX = DESCX( MB_ ) NBX = DESCX( NB_ ) LDX = DESCX( M_ ) LDPX = DESCX( LLD_ ) ICURROW = DESCX( RSRC_ ) ICURCOL = DESCX( CSRC_ ) ROWREP = ( ICURROW.EQ.-1 ) COLREP = ( ICURCOL.EQ.-1 ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN IMBX = DESCX( IMB_ ) ELSE IMBX = MBX END IF IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN INBX = DESCX( INB_ ) ELSE INBX = NBX END IF IF( ROWREP ) THEN MYROWDIST = 0 ELSE MYROWDIST = MOD( MYROW - ICURROW + NPROW, NPROW ) END IF IF( COLREP ) THEN MYCOLDIST = 0 ELSE MYCOLDIST = MOD( MYCOL - ICURCOL + NPCOL, NPCOL ) END IF II = 1 JJ = 1 * IF( INCX.EQ.DESCX( M_ ) ) THEN * * sub( X ) is a row vector * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * I = 1 IF( MYCOLDIST.EQ.0 ) THEN J = 1 ELSE J = DESCX( INB_ ) + ( MYCOLDIST - 1 ) * NBX + 1 END IF JB = MIN( MAX( 0, DESCX( N_ ) - J + 1 ), INBX ) IB = MIN( DESCX( M_ ), DESCX( IMB_ ) ) * DO 20 KK = 0, JB-1 DO 10 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. J+KK.GT.JX+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 10 CONTINUE 20 CONTINUE IF( COLREP ) THEN J = J + INBX ELSE J = J + INBX + ( NPCOL - 1 ) * NBX END IF * DO 50 JJ = INBX+1, NQALL, NBX JB = MIN( NQALL-JJ+1, NBX ) * DO 40 KK = 0, JB-1 DO 30 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 30 CONTINUE 40 CONTINUE * IF( COLREP ) THEN J = J + NBX ELSE J = J + NPCOL * NBX END IF * 50 CONTINUE * II = II + IB * END IF * ICURROW = MOD( ICURROW + 1, NPROW ) * DO 110 I = DESCX( IMB_ ) + 1, DESCX( M_ ), MBX IB = MIN( DESCX( M_ ) - I + 1, MBX ) * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN * IF( MYCOLDIST.EQ.0 ) THEN J = 1 ELSE J = DESCX( INB_ ) + ( MYCOLDIST - 1 ) * NBX + 1 END IF * JJ = 1 JB = MIN( MAX( 0, DESCX( N_ ) - J + 1 ), INBX ) DO 70 KK = 0, JB-1 DO 60 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 60 CONTINUE 70 CONTINUE IF( COLREP ) THEN J = J + INBX ELSE J = J + INBX + ( NPCOL - 1 ) * NBX END IF * DO 100 JJ = INBX+1, NQALL, NBX JB = MIN( NQALL-JJ+1, NBX ) * DO 90 KK = 0, JB-1 DO 80 LL = 0, IB-1 IF( I+LL.NE.IX .OR. J+KK.LT.JX .OR. $ J+KK.GT.JX+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 80 CONTINUE 90 CONTINUE * IF( COLREP ) THEN J = J + NBX ELSE J = J + NPCOL * NBX END IF * 100 CONTINUE * II = II + IB * END IF * ICURROW = MOD( ICURROW + 1, NPROW ) * 110 CONTINUE * ELSE * * sub( X ) is a column vector * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * J = 1 IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCX( IMB_ ) + ( MYROWDIST - 1 ) * MBX + 1 END IF IB = MIN( MAX( 0, DESCX( M_ ) - I + 1 ), IMBX ) JB = MIN( DESCX( N_ ), DESCX( INB_ ) ) * DO 130 KK = 0, JB-1 DO 120 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. I+LL.GT.IX+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 120 CONTINUE 130 CONTINUE IF( ROWREP ) THEN I = I + IMBX ELSE I = I + IMBX + ( NPROW - 1 ) * MBX END IF * DO 160 II = IMBX+1, MPALL, MBX IB = MIN( MPALL-II+1, MBX ) * DO 150 KK = 0, JB-1 DO 140 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 140 CONTINUE 150 CONTINUE * IF( ROWREP ) THEN I = I + MBX ELSE I = I + NPROW * MBX END IF * 160 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * DO 220 J = DESCX( INB_ ) + 1, DESCX( N_ ), NBX JB = MIN( DESCX( N_ ) - J + 1, NBX ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCX( IMB_ ) + ( MYROWDIST - 1 ) * MBX + 1 END IF * II = 1 IB = MIN( MAX( 0, DESCX( M_ ) - I + 1 ), IMBX ) DO 180 KK = 0, JB-1 DO 170 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 170 CONTINUE 180 CONTINUE IF( ROWREP ) THEN I = I + IMBX ELSE I = I + IMBX + ( NPROW - 1 ) * MBX END IF * DO 210 II = IMBX+1, MPALL, MBX IB = MIN( MPALL-II+1, MBX ) * DO 200 KK = 0, JB-1 DO 190 LL = 0, IB-1 IF( J+KK.NE.JX .OR. I+LL.LT.IX .OR. $ I+LL.GT.IX+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ X( I+LL+(J+KK-1)*LDX ), $ PX( II+LL+(JJ+KK-1)*LDPX ) ) 190 CONTINUE 200 CONTINUE * IF( ROWREP ) THEN I = I + MBX ELSE I = I + NPROW * MBX END IF * 210 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * 220 CONTINUE * END IF * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PZCHKVOUT * END SUBROUTINE PZCHKMIN( ERRMAX, M, N, A, PA, IA, JA, DESCA, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N DOUBLE PRECISION ERRMAX * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 PA( * ), A( * ) * .. * * Purpose * ======= * * PZCHKMIN checks that the submatrix sub( PA ) remained unchanged. The * local array entries are compared element by element, and their dif- * ference is tested against 0.0 as well as the epsilon machine. Notice * that this difference should be numerically exactly the zero machine, * but because of the possible fluctuation of some of the data we flag- * ged differently a difference less than twice the epsilon machine. The * largest error is also returned. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ERRMAX (global output) DOUBLE PRECISION * On exit, ERRMAX specifies the largest absolute element-wise * difference between sub( A ) and sub( PA ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * operand sub( A ). N must be at least zero. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX*16 array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ KK, LDA, LDPA, LL, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION ERR, EPS * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PB_INFOG2L, PZERRSET * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if posssible * IF( ( M.EQ.0 ).OR.( N.EQ.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL PB_INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, $ JJA, IAROW, IACOL ) * II = IIA JJ = JJA LDA = DESCA( M_ ) LDPA = DESCA( LLD_ ) ICURROW = IAROW ICURCOL = IACOL ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * * Handle the first block of column separately * JB = DESCA( INB_ ) - JA + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCA( NB_ ) + 1 ) * DESCA( NB_ ) + JB JB = MIN( JB, N ) JN = JA + JB - 1 * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * DO 40 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 10 K = 0, IB-1 CALL PZERRSET( ERR, ERRMAX, A( IA+K+(JA+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 10 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) * * Loop over remaining block of rows * DO 30 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 20 K = 0, IB-1 CALL PZERRSET( ERR, ERRMAX, A( I+K+(JA+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 20 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) 30 CONTINUE * II = IIA ICURROW = IAROW 40 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * * Loop over remaining column blocks * DO 90 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN DO 80 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 )*DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 50 K = 0, IB-1 CALL PZERRSET( ERR, ERRMAX, A( IA+K+(J+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 50 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) * * Loop over remaining block of rows * DO 70 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN DO 60 K = 0, IB-1 CALL PZERRSET( ERR, ERRMAX, $ A( I+K+(J+H-1)*LDA ), $ PA( II+K+(JJ+H-1)*LDPA ) ) 60 CONTINUE II = II + IB END IF ICURROW = MOD( ICURROW+1, NPROW ) 70 CONTINUE * II = IIA ICURROW = IAROW 80 CONTINUE * JJ = JJ + JB END IF * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 90 CONTINUE * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PZCHKMIN * END SUBROUTINE PZCHKMOUT( M, N, A, PA, IA, JA, DESCA, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, INFO, JA, M, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), PA( * ) * .. * * Purpose * ======= * * PZCHKMOUT checks that the matrix PA \ sub( PA ) remained unchanged. * The local array entries are compared element by element, and their * difference is tested against 0.0 as well as the epsilon machine. No- * tice that this difference should be numerically exactly the zero ma- * chine, but because of the possible movement of some of the data we * flagged differently a difference less than twice the epsilon machine. * The largest error is reported. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( PA ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( PA ). N must be at least zero. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX*16 array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * INFO (global output) INTEGER * On exit, if INFO = 0, no error has been found, * If INFO > 0, the maximum abolute error found is in (0,eps], * If INFO < 0, the maximum abolute error found is in (eps,+oo). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, ROWREP INTEGER I, IB, ICTXT, ICURCOL, II, IMBA, J, JB, JJ, KK, $ LDA, LDPA, LL, MPALL, MYCOL, MYROW, MYROWDIST, $ NPCOL, NPROW DOUBLE PRECISION EPS, ERR, ERRMAX * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, PZERRSET * .. * .. External Functions .. INTEGER PB_NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * INFO = 0 ERRMAX = ZERO * * Quick return if possible * IF( ( DESCA( M_ ).LE.0 ).OR.( DESCA( N_ ).LE.0 ) ) $ RETURN * * Start the operations * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * MPALL = PB_NUMROC( DESCA( M_ ), 1, DESCA( IMB_ ), DESCA( MB_ ), $ MYROW, DESCA( RSRC_ ), NPROW ) * LDA = DESCA( M_ ) LDPA = DESCA( LLD_ ) * II = 1 JJ = 1 ROWREP = ( DESCA( RSRC_ ).EQ.-1 ) COLREP = ( DESCA( CSRC_ ).EQ.-1 ) ICURCOL = DESCA( CSRC_ ) IF( MYROW.EQ.DESCA( RSRC_ ) .OR. ROWREP ) THEN IMBA = DESCA( IMB_ ) ELSE IMBA = DESCA( MB_ ) END IF IF( ROWREP ) THEN MYROWDIST = 0 ELSE MYROWDIST = MOD( MYROW - DESCA( RSRC_ ) + NPROW, NPROW ) END IF * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * J = 1 IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCA( IMB_ ) + ( MYROWDIST - 1 ) * DESCA( MB_ ) + 1 END IF IB = MIN( MAX( 0, DESCA( M_ ) - I + 1 ), IMBA ) JB = MIN( DESCA( N_ ), DESCA( INB_ ) ) * DO 20 KK = 0, JB-1 DO 10 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 10 CONTINUE 20 CONTINUE IF( ROWREP ) THEN I = I + IMBA ELSE I = I + IMBA + ( NPROW - 1 ) * DESCA( MB_ ) END IF * DO 50 II = IMBA + 1, MPALL, DESCA( MB_ ) IB = MIN( MPALL-II+1, DESCA( MB_ ) ) * DO 40 KK = 0, JB-1 DO 30 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 30 CONTINUE 40 CONTINUE * IF( ROWREP ) THEN I = I + DESCA( MB_ ) ELSE I = I + NPROW * DESCA( MB_ ) END IF * 50 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * DO 110 J = DESCA( INB_ ) + 1, DESCA( N_ ), DESCA( NB_ ) JB = MIN( DESCA( N_ ) - J + 1, DESCA( NB_ ) ) * IF( MYCOL.EQ.ICURCOL .OR. COLREP ) THEN * IF( MYROWDIST.EQ.0 ) THEN I = 1 ELSE I = DESCA( IMB_ ) + ( MYROWDIST - 1 ) * DESCA( MB_ ) + 1 END IF * II = 1 IB = MIN( MAX( 0, DESCA( M_ ) - I + 1 ), IMBA ) DO 70 KK = 0, JB-1 DO 60 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 60 CONTINUE 70 CONTINUE IF( ROWREP ) THEN I = I + IMBA ELSE I = I + IMBA + ( NPROW - 1 ) * DESCA( MB_ ) END IF * DO 100 II = IMBA+1, MPALL, DESCA( MB_ ) IB = MIN( MPALL-II+1, DESCA( MB_ ) ) * DO 90 KK = 0, JB-1 DO 80 LL = 0, IB-1 IF( I+LL.LT.IA .OR. I+LL.GT.IA+M-1 .OR. $ J+KK.LT.JA .OR. J+KK.GT.JA+N-1 ) $ CALL PZERRSET( ERR, ERRMAX, $ A( I+LL+(J+KK-1)*LDA ), $ PA( II+LL+(JJ+KK-1)*LDPA ) ) 80 CONTINUE 90 CONTINUE * IF( ROWREP ) THEN I = I + DESCA( MB_ ) ELSE I = I + NPROW * DESCA( MB_ ) END IF * 100 CONTINUE * JJ = JJ + JB * END IF * ICURCOL = MOD( ICURCOL + 1, NPCOL ) * INSERT MODE 110 CONTINUE * CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERRMAX, 1, KK, LL, -1, $ -1, -1 ) * IF( ERRMAX.GT.ZERO .AND. ERRMAX.LE.EPS ) THEN INFO = 1 ELSE IF( ERRMAX.GT.EPS ) THEN INFO = -1 END IF * RETURN * * End of PZCHKMOUT * END SUBROUTINE PZMPRNT( ICTXT, NOUT, M, N, A, LDA, IRPRNT, ICPRNT, $ CMATNM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICPRNT, ICTXT, IRPRNT, LDA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PZMPRNT prints to the standard output an array A of size m by n. Only * the process of coordinates ( IRPRNT, ICPRNT ) is printing. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * M (global input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). The leading m * by n part of this array is printed. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array A to be printed. LDA must be at least MAX( 1, M ). * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the process row coordinate of the * printing process. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the process column coordinate of * the printing process. * * CMATNM (global input) CHARACTER*(*) * On entry, CMATNM specifies the identifier of the matrix to be * printed. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DIMAG * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN * WRITE( NOUT, FMT = * ) DO 20 J = 1, N * DO 10 I = 1, M * WRITE( NOUT, FMT = 9999 ) CMATNM, I, J, $ DBLE( A( I, J ) ), DIMAG( A( I, J ) ) * 10 CONTINUE * 20 CONTINUE * END IF * 9999 FORMAT( 1X, A, '(', I6, ',', I6, ')=', D30.18, '+i*(', $ D30.18, ')' ) * RETURN * * End of PZMPRNT * END SUBROUTINE PZVPRNT( ICTXT, NOUT, N, X, INCX, IRPRNT, ICPRNT, $ CVECNM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICPRNT, ICTXT, INCX, IRPRNT, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CVECNM COMPLEX*16 X( * ) * .. * * Purpose * ======= * * PZVPRNT prints to the standard output an vector x of length n. Only * the process of coordinates ( IRPRNT, ICPRNT ) is printing. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * NOUT (global input) INTEGER * On entry, NOUT specifies the unit number for the output file. * When NOUT is 6, output to screen, when NOUT is 0, output to * stderr. NOUT is only defined for process 0. * * N (global input) INTEGER * On entry, N specifies the length of the vector X. N must be * at least zero. * * X (global input) COMPLEX*16 array * On entry, X is an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremen- * ted array X must contain the vector x. * * INCX (global input) INTEGER. * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the process row coordinate of the * printing process. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the process column coordinate of * the printing process. * * CVECNM (global input) CHARACTER*(*) * On entry, CVECNM specifies the identifier of the vector to be * printed. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DIMAG * .. * .. Executable Statements .. * * Quick return if possible * IF( N.LE.0 ) $ RETURN * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN * WRITE( NOUT, FMT = * ) DO 10 I = 1, 1 + ( N-1 )*INCX, INCX * WRITE( NOUT, FMT = 9999 ) CVECNM, I, DBLE( X( I ) ), $ DIMAG( X( I ) ) * 10 CONTINUE * END IF * 9999 FORMAT( 1X, A, '(', I6, ')=', D30.18, '+i*(', D30.18, ')' ) * RETURN * * End of PZVPRNT * END SUBROUTINE PZMVCH( ICTXT, TRANS, M, N, ALPHA, A, IA, JA, DESCA, $ X, IX, JX, DESCX, INCX, BETA, Y, PY, IY, JY, $ DESCY, INCY, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N DOUBLE PRECISION ERR COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION G( * ) COMPLEX*16 A( * ), PY( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PZMVCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies which matrix-vector product is to * be computed as follows: * If TRANS = 'T', * sub( Y ) = BETA * sub( Y ) + sub( A )**T * sub( X ), * else if TRANS = 'C', * sub( Y ) = BETA * sub( Y ) + sub( A )**H * sub( X ), * otherwise * sub( Y ) = BETA * sub( Y ) + sub( A ) * sub( X ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * Y (local input/local output) COMPLEX*16 array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * PY (local input) COMPLEX*16 array * On entry, PY is an array of dimension (DESCY( LLD_ ),*). This * array contains the local entries of the matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D+0, RONE = 1.0D+0 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL COLREP, CTRAN, ROWREP, TRAN INTEGER I, IB, ICURCOL, ICURROW, IIY, IN, IOFFA, IOFFX, $ IOFFY, IYCOL, IYROW, J, JB, JJY, JN, KK, LDA, $ LDPY, LDX, LDY, ML, MYCOL, MYROW, NL, NPCOL, $ NPROW DOUBLE PRECISION EPS, ERRI, GTMP COMPLEX*16 C, TBETA, YTMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, MOD, SQRT * .. * .. Statement Functions .. DOUBLE PRECISION ABS1 ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * IF( M.EQ.0 .OR. N.EQ.0 ) THEN TBETA = ONE ELSE TBETA = BETA END IF * TRAN = LSAME( TRANS, 'T' ) CTRAN = LSAME( TRANS, 'C' ) IF( TRAN.OR.CTRAN ) THEN ML = N NL = M ELSE ML = M NL = N END IF * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in Y using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * IOFFY = IY + ( JY - 1 ) * LDY DO 40 I = 1, ML YTMP = ZERO GTMP = RZERO IOFFX = IX + ( JX - 1 ) * LDX IF( TRAN )THEN IOFFA = IA + ( JA + I - 2 ) * LDA DO 10 J = 1, NL YTMP = YTMP + A( IOFFA ) * X( IOFFX ) GTMP = GTMP + ABS1( A( IOFFA ) ) * ABS1( X( IOFFX ) ) IOFFA = IOFFA + 1 IOFFX = IOFFX + INCX 10 CONTINUE ELSE IF( CTRAN )THEN IOFFA = IA + ( JA + I - 2 ) * LDA DO 20 J = 1, NL YTMP = YTMP + DCONJG( A( IOFFA ) ) * X( IOFFX ) GTMP = GTMP + ABS1( A( IOFFA ) ) * ABS1( X( IOFFX ) ) IOFFA = IOFFA + 1 IOFFX = IOFFX + INCX 20 CONTINUE ELSE IOFFA = IA + I - 1 + ( JA - 1 ) * LDA DO 30 J = 1, NL YTMP = YTMP + A( IOFFA ) * X( IOFFX ) GTMP = GTMP + ABS1( A( IOFFA ) ) * ABS1( X( IOFFX ) ) IOFFA = IOFFA + LDA IOFFX = IOFFX + INCX 30 CONTINUE END IF G( I ) = ABS1( ALPHA )*GTMP + ABS1( TBETA )*ABS1( Y( IOFFY ) ) Y( IOFFY ) = ALPHA * YTMP + TBETA * Y( IOFFY ) IOFFY = IOFFY + INCY 40 CONTINUE * * Compute the error ratio for this result. * ERR = RZERO INFO = 0 LDPY = DESCY( LLD_ ) IOFFY = IY + ( JY - 1 ) * LDY CALL PB_INFOG2L( IY, JY, DESCY, NPROW, NPCOL, MYROW, MYCOL, IIY, $ JJY, IYROW, IYCOL ) ICURROW = IYROW ICURCOL = IYCOL ROWREP = ( IYROW.EQ.-1 ) COLREP = ( IYCOL.EQ.-1 ) * IF( INCY.EQ.DESCY( M_ ) ) THEN * * sub( Y ) is a row vector * JB = DESCY( INB_ ) - JY + 1 IF( JB.LE.0 ) $ JB = ( ( -JB ) / DESCY( NB_ ) + 1 ) * DESCY( NB_ ) + JB JB = MIN( JB, ML ) JN = JY + JB - 1 * DO 50 J = JY, JN * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) ) / EPS IF( G( J-JY+1 ).NE.RZERO ) $ ERRI = ERRI / G( J-JY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 JJY = JJY + 1 END IF * IOFFY = IOFFY + INCY * 50 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * DO 70 J = JN+1, JY+ML-1, DESCY( NB_ ) JB = MIN( JY+ML-J, DESCY( NB_ ) ) * DO 60 KK = 0, JB-1 * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) )/EPS IF( G( J+KK-JY+1 ).NE.RZERO ) $ ERRI = ERRI / G( J+KK-JY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 JJY = JJY + 1 END IF * IOFFY = IOFFY + INCY * 60 CONTINUE * ICURCOL = MOD( ICURCOL+1, NPCOL ) * 70 CONTINUE * ELSE * * sub( Y ) is a column vector * IB = DESCY( IMB_ ) - IY + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCY( MB_ ) + 1 ) * DESCY( MB_ ) + IB IB = MIN( IB, ML ) IN = IY + IB - 1 * DO 80 I = IY, IN * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) ) / EPS IF( G( I-IY+1 ).NE.RZERO ) $ ERRI = ERRI / G( I-IY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIY = IIY + 1 END IF * IOFFY = IOFFY + INCY * 80 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 100 I = IN+1, IY+ML-1, DESCY( MB_ ) IB = MIN( IY+ML-I, DESCY( MB_ ) ) * DO 90 KK = 0, IB-1 * IF( ( MYROW.EQ.ICURROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICURCOL .OR. COLREP ) ) THEN ERRI = ABS( PY( IIY+(JJY-1)*LDPY ) - Y( IOFFY ) )/EPS IF( G( I+KK-IY+1 ).NE.RZERO ) $ ERRI = ERRI / G( I+KK-IY+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIY = IIY + 1 END IF * IOFFY = IOFFY + INCY * 90 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 100 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) * RETURN * * End of PZMVCH * END SUBROUTINE PZVMCH( ICTXT, TRANS, UPLO, M, N, ALPHA, X, IX, JX, $ DESCX, INCX, Y, IY, JY, DESCY, INCY, A, PA, $ IA, JA, DESCA, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N DOUBLE PRECISION ERR COMPLEX*16 ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION G( * ) COMPLEX*16 A( * ), PA( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PZVMCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed in * the complex cases: * if TRANS = 'C', * sub( A ) := sub( A ) + alpha * sub( X ) * sub( Y )**H, * otherwise * sub( A ) := sub( A ) + alpha * sub( X ) * sub( Y )**T. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of the submatrix sub( A ) * is to be referenced as follows: * If UPLO = 'L', only the lower triangular part, * If UPLO = 'U', only the upper triangular part, * else the entire matrix is to be referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX*16 array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX*16 array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, CTRAN, LOWER, ROWREP, UPPER INTEGER I, IACOL, IAROW, IB, IBEG, ICURROW, IEND, IIA, $ IN, IOFFA, IOFFX, IOFFY, J, JJA, KK, LDA, LDPA, $ LDX, LDY, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION EPS, ERRI, GTMP COMPLEX*16 ATMP, C * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, MOD, SQRT * .. * .. Statement Functions .. DOUBLE PRECISION ABS1 ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * CTRAN = LSAME( TRANS, 'C' ) UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in A using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 70 J = 1, N * IOFFY = IY + ( JY - 1 ) * LDY + ( J - 1 ) * INCY * IF( LOWER ) THEN IBEG = J IEND = M DO 10 I = 1, J-1 G( I ) = ZERO 10 CONTINUE ELSE IF( UPPER ) THEN IBEG = 1 IEND = J DO 20 I = J+1, M G( I ) = ZERO 20 CONTINUE ELSE IBEG = 1 IEND = M END IF * DO 30 I = IBEG, IEND * IOFFX = IX + ( JX - 1 ) * LDX + ( I - 1 ) * INCX IOFFA = IA + I - 1 + ( JA + J - 2 ) * LDA IF( CTRAN ) THEN ATMP = X( IOFFX ) * DCONJG( Y( IOFFY ) ) ELSE ATMP = X( IOFFX ) * Y( IOFFY ) END IF GTMP = ABS1( X( IOFFX ) ) * ABS1( Y( IOFFY ) ) G( I ) = ABS1( ALPHA ) * GTMP + ABS1( A( IOFFA ) ) A( IOFFA ) = ALPHA * ATMP + A( IOFFA ) * 30 CONTINUE * * Compute the error ratio for this result. * INFO = 0 ERR = ZERO LDPA = DESCA( LLD_ ) IOFFA = IA + ( JA + J - 2 ) * LDA CALL PB_INFOG2L( IA, JA+J-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * IF( MYCOL.EQ.IACOL .OR. COLREP ) THEN * ICURROW = IAROW IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 * DO 40 I = IA, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA ) - A( IOFFA ) )/EPS IF( G( I-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 40 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * DO 50 KK = 0, IB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA )-A( IOFFA ) )/EPS IF( G( I+KK-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 80 * 70 CONTINUE * 80 CONTINUE * RETURN * * End of PZVMCH * END SUBROUTINE PZVMCH2( ICTXT, UPLO, M, N, ALPHA, X, IX, JX, DESCX, $ INCX, Y, IY, JY, DESCY, INCY, A, PA, IA, $ JA, DESCA, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, ICTXT, INCX, INCY, INFO, IX, IY, JA, JX, $ JY, M, N DOUBLE PRECISION ERR COMPLEX*16 ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ), DESCY( * ) DOUBLE PRECISION G( * ) COMPLEX*16 A( * ), PA( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PZVMCH2 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of the submatrix sub( A ) * is to be referenced as follows: * If UPLO = 'L', only the lower triangular part, * If UPLO = 'U', only the upper triangular part, * else the entire matrix is to be referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * operand matrix A. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the subma- * trix operand matrix A. N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (DESCX( M_ ),*). This * array contains a local copy of the initial entire matrix PX. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX*16 array * On entry, Y is an array of dimension (DESCY( M_ ),*). This * array contains a local copy of the initial entire matrix PY. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * PA (local input) COMPLEX*16 array * On entry, PA is an array of dimension (DESCA( LLD_ ),*). This * array contains the local entries of the matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX( M, N ). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, LOWER, ROWREP, UPPER INTEGER I, IACOL, IAROW, IB, IBEG, ICURROW, IEND, IIA, $ IN, IOFFA, IOFFXI, IOFFXJ, IOFFYI, IOFFYJ, J, $ JJA, KK, LDA, LDPA, LDX, LDY, MYCOL, MYROW, $ NPCOL, NPROW DOUBLE PRECISION EPS, ERRI, GTMP COMPLEX*16 C, ATMP * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, MOD, SQRT * .. * .. Statement Functions .. DOUBLE PRECISION ABS1 ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) * LDA = MAX( 1, DESCA( M_ ) ) LDX = MAX( 1, DESCX( M_ ) ) LDY = MAX( 1, DESCY( M_ ) ) * * Compute expected result in A using data in A, X and Y. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 70 J = 1, N * IOFFXJ = IX + ( JX - 1 ) * LDX + ( J - 1 ) * INCX IOFFYJ = IY + ( JY - 1 ) * LDY + ( J - 1 ) * INCY * IF( LOWER ) THEN IBEG = J IEND = M DO 10 I = 1, J-1 G( I ) = ZERO 10 CONTINUE ELSE IF( UPPER ) THEN IBEG = 1 IEND = J DO 20 I = J+1, M G( I ) = ZERO 20 CONTINUE ELSE IBEG = 1 IEND = M END IF * DO 30 I = IBEG, IEND IOFFA = IA + I - 1 + ( JA + J - 2 ) * LDA IOFFXI = IX + ( JX - 1 ) * LDX + ( I - 1 ) * INCX IOFFYI = IY + ( JY - 1 ) * LDY + ( I - 1 ) * INCY ATMP = ALPHA * X( IOFFXI ) * DCONJG( Y( IOFFYJ ) ) ATMP = ATMP + Y( IOFFYI ) * DCONJG( ALPHA * X( IOFFXJ ) ) GTMP = ABS1( ALPHA * X( IOFFXI ) ) * ABS1( Y( IOFFYJ ) ) GTMP = GTMP + ABS1( Y( IOFFYI ) ) * $ ABS1( DCONJG( ALPHA * X( IOFFXJ ) ) ) G( I ) = GTMP + ABS1( A( IOFFA ) ) A( IOFFA ) = A( IOFFA ) + ATMP * 30 CONTINUE * * Compute the error ratio for this result. * INFO = 0 ERR = ZERO LDPA = DESCA( LLD_ ) IOFFA = IA + ( JA + J - 2 ) * LDA CALL PB_INFOG2L( IA, JA+J-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) ROWREP = ( IAROW.EQ.-1 ) COLREP = ( IACOL.EQ.-1 ) * IF( MYCOL.EQ.IACOL .OR. COLREP ) THEN * ICURROW = IAROW IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( ( -IB ) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA + IB - 1 * DO 40 I = IA, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA ) - A( IOFFA ) )/EPS IF( G( I-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 40 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 60 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( IA+M-I, DESCA( MB_ ) ) * DO 50 KK = 0, IB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PA( IIA+(JJA-1)*LDPA )-A( IOFFA ) )/EPS IF( G( I+KK-IA+1 ).NE.ZERO ) $ ERRI = ERRI / G( I+KK-IA+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ INFO = 1 IIA = IIA + 1 END IF * IOFFA = IOFFA + 1 * 50 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 80 * 70 CONTINUE * 80 CONTINUE * RETURN * * End of PZVMCH2 * END SUBROUTINE PZMMCH( ICTXT, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, $ JA, DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, $ JC, DESCC, CT, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANSA, TRANSB INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, M, N DOUBLE PRECISION ERR COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) DOUBLE PRECISION G( * ) COMPLEX*16 A( * ), B( * ), C( * ), CT( * ), PC( * ) * .. * * Purpose * ======= * * PZMMCH checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies if the matrix operand A is to be * transposed. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies if the matrix operand B is to be * transposed. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANSA = 'N' (resp. TRANSA <> 'N') in PxGEMM, PxSYRK, * PxSYR2K, PxHERK and PxHER2K. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX*16 array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) COMPLEX*16 array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D+0, RONE = 1.0D+0 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL COLREP, CTRANA, CTRANB, ROWREP, TRANA, TRANB INTEGER I, IBB, ICCOL, ICROW, ICURROW, IIC, IN, IOFFA, $ IOFFB, IOFFC, J, JJC, KK, LDA, LDB, LDC, LDPC, $ MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION EPS, ERRI COMPLEX*16 Z * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, MOD, SQRT * .. * .. Statement Functions .. DOUBLE PRECISION ABS1 ABS1( Z ) = ABS( DBLE( Z ) ) + ABS( DIMAG( Z ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * TRANA = LSAME( TRANSA, 'T' ).OR.LSAME( TRANSA, 'C' ) TRANB = LSAME( TRANSB, 'T' ).OR.LSAME( TRANSB, 'C' ) CTRANA = LSAME( TRANSA, 'C' ) CTRANB = LSAME( TRANSB, 'C' ) * LDA = MAX( 1, DESCA( M_ ) ) LDB = MAX( 1, DESCB( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 240 J = 1, N * IOFFC = IC + ( JC + J - 2 ) * LDC DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE * IF( .NOT.TRANA .AND. .NOT.TRANB ) THEN DO 30 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 20 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS( A( IOFFA ) ) * $ ABS( B( IOFFB ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA .AND. .NOT.TRANB ) THEN IF( CTRANA ) THEN DO 50 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 40 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + DCONJG( A( IOFFA ) ) * $ B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 KK = 1, K IOFFB = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 60 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA .AND. TRANB ) THEN IF( CTRANB ) THEN DO 90 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 80 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * $ DCONJG( B( IOFFB ) ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 100 I = 1, M IOFFA = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA .AND. TRANB ) THEN IF( CTRANA ) THEN IF( CTRANB ) THEN DO 130 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 120 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + DCONJG( A( IOFFA ) ) * $ DCONJG( B( IOFFB ) ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 140 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + DCONJG( A( IOFFA ) ) * $ B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB ) THEN DO 170 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 160 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * $ DCONJG( B( IOFFB ) ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 KK = 1, K IOFFB = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 180 I = 1, M IOFFA = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFA ) * B( IOFFB ) G( I ) = G( I ) + ABS1( A( IOFFA ) ) * $ ABS1( B( IOFFB ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF * DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA * C( IOFFC ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 200 CONTINUE * * Compute the error ratio for this result. * ERR = RZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, M ) IN = IC + IBB - 1 * DO 210 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 210 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 230 I = IN+1, IC+M-1, DESCC( MB_ ) IBB = MIN( IC+M-I, DESCC( MB_ ) ) * DO 220 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 220 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 230 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 250 * 240 CONTINUE * 250 CONTINUE * RETURN * * End of PZMMCH * END SUBROUTINE PZMMCH1( ICTXT, UPLO, TRANS, N, K, ALPHA, A, IA, JA, $ DESCA, BETA, C, PC, IC, JC, DESCC, CT, G, $ ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IC, ICTXT, INFO, JA, JC, K, N DOUBLE PRECISION ERR COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) DOUBLE PRECISION G( * ) COMPLEX*16 A( * ), C( * ), CT( * ), PC( * ) * .. * * Purpose * ======= * * PZMMCH1 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrix A has to be * transposed or not before computing the matrix-matrix product. * * N (global input) INTEGER * On entry, N specifies the order the submatrix operand C. N * must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * when TRANS = 'N' (resp. TRANS <> 'N'). K must be at least * zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX*16 array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) COMPLEX*16 array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D+0, RONE = 1.0D+0 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL COLREP, HTRAN, NOTRAN, ROWREP, TRAN, UPPER INTEGER I, IBB, IBEG, ICCOL, ICROW, ICURROW, IEND, IIC, $ IN, IOFFAK, IOFFAN, IOFFC, J, JJC, KK, LDA, $ LDC, LDPC, MYCOL, MYROW, NPCOL, NPROW DOUBLE PRECISION EPS, ERRI COMPLEX*16 Z * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, MOD, SQRT * .. * .. Statement Functions .. DOUBLE PRECISION ABS1 ABS1( Z ) = ABS( DBLE( Z ) ) + ABS( DIMAG( Z ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) HTRAN = LSAME( TRANS, 'H' ) * LDA = MAX( 1, DESCA( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 140 J = 1, N * IF( UPPER ) THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF * DO 10 I = 1, N CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE * IF( NOTRAN ) THEN DO 30 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA DO 20 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAK ) * A( IOFFAN ) G( I ) = G( I ) + ABS1( A( IOFFAK ) ) * $ ABS1( A( IOFFAN ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRAN ) THEN DO 50 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA DO 40 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAK ) * A( IOFFAN ) G( I ) = G( I ) + ABS1( A( IOFFAK ) ) * $ ABS1( A( IOFFAN ) ) 40 CONTINUE 50 CONTINUE ELSE IF( HTRAN ) THEN DO 70 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA DO 60 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA CT( I ) = CT( I ) + A( IOFFAN ) * $ DCONJG( A( IOFFAK ) ) G( I ) = G( I ) + ABS1( A( IOFFAK ) ) * $ ABS1( A( IOFFAN ) ) 60 CONTINUE 70 CONTINUE ELSE DO 90 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA DO 80 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA CT( I ) = CT( I ) + DCONJG( A( IOFFAN ) ) * $ A( IOFFAK ) G( I ) = G( I ) + ABS1( DCONJG( A( IOFFAN ) ) ) * $ ABS1( A( IOFFAK ) ) 80 CONTINUE 90 CONTINUE END IF * IOFFC = IC + IBEG - 1 + ( JC + J - 2 ) * LDC * DO 100 I = IBEG, IEND CT( I ) = ALPHA*CT( I ) + BETA * C( IOFFC ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 100 CONTINUE * * Compute the error ratio for this result. * ERR = RZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, N ) IN = IC + IBB - 1 * DO 110 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IC+N-1, DESCC( MB_ ) IBB = MIN( IC+N-I, DESCC( MB_ ) ) * DO 120 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 150 * 140 CONTINUE * 150 CONTINUE * RETURN * * End of PZMMCH1 * END SUBROUTINE PZMMCH2( ICTXT, UPLO, TRANS, N, K, ALPHA, A, IA, JA, $ DESCA, B, IB, JB, DESCB, BETA, C, PC, IC, $ JC, DESCC, CT, G, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IB, IC, ICTXT, INFO, JA, JB, JC, K, N DOUBLE PRECISION ERR COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCB( * ), DESCC( * ) DOUBLE PRECISION G( * ) COMPLEX*16 A( * ), B( * ), C( * ), CT( * ), $ PC( * ) * .. * * Purpose * ======= * * PZMMCH2 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrices A and B have * to be transposed or not before computing the matrix-matrix * product. * * N (global input) INTEGER * On entry, N specifies the order the submatrix operand C. N * must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns (resp. rows) of A * and B when TRANS = 'N' (resp. TRANS <> 'N'). K must be at * least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension (DESCB( M_ ),*). This * array contains a local copy of the initial entire matrix PB. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX*16 array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * CT (workspace) COMPLEX*16 array * On entry, CT is an array of dimension at least MAX(M,N,K). CT * holds a copy of the current column of C. * * G (workspace) DOUBLE PRECISION array * On entry, G is an array of dimension at least MAX(M,N,K). G * is used to compute the gauges. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D+0, RONE = 1.0D+0 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL COLREP, HTRAN, NOTRAN, ROWREP, TRAN, UPPER INTEGER I, IBB, IBEG, ICCOL, ICROW, ICURROW, IEND, IIC, $ IN, IOFFAK, IOFFAN, IOFFBK, IOFFBN, IOFFC, J, $ JJC, KK, LDA, LDB, LDC, LDPC, MYCOL, MYROW, $ NPCOL, NPROW DOUBLE PRECISION EPS, ERRI COMPLEX*16 Z * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, MOD, SQRT * .. * .. Statement Functions .. DOUBLE PRECISION ABS1 ABS1( Z ) = ABS( DBLE( Z ) ) + ABS( DIMAG( Z ) ) * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * EPS = PDLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) HTRAN = LSAME( TRANS, 'H' ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) * LDA = MAX( 1, DESCA( M_ ) ) LDB = MAX( 1, DESCB( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) * * Compute expected result in C using data in A, B and C. * Compute gauges in G. This part of the computation is performed * by every process in the grid. * DO 140 J = 1, N * IF( UPPER ) THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF * DO 10 I = 1, N CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE * IF( NOTRAN ) THEN DO 30 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA IOFFBK = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 20 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA IOFFBN = IB + I - 1 + ( JB + KK - 2 ) * LDB CT( I ) = CT( I ) + ALPHA * ( $ A( IOFFAN ) * B( IOFFBK ) + $ B( IOFFBN ) * A( IOFFAK ) ) G( I ) = G( I ) + ABS( ALPHA ) * ( $ ABS1( A( IOFFAN ) ) * ABS1( B( IOFFBK ) ) + $ ABS1( B( IOFFBN ) ) * ABS1( A( IOFFAK ) ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRAN ) THEN DO 50 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA IOFFBK = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 40 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA IOFFBN = IB + KK - 1 + ( JB + I - 2 ) * LDB CT( I ) = CT( I ) + ALPHA * ( $ A( IOFFAN ) * B( IOFFBK ) + $ B( IOFFBN ) * A( IOFFAK ) ) G( I ) = G( I ) + ABS( ALPHA ) * ( $ ABS1( A( IOFFAN ) ) * ABS1( B( IOFFBK ) ) + $ ABS1( B( IOFFBN ) ) * ABS1( A( IOFFAK ) ) ) 40 CONTINUE 50 CONTINUE ELSE IF( HTRAN ) THEN DO 70 KK = 1, K IOFFAK = IA + J - 1 + ( JA + KK - 2 ) * LDA IOFFBK = IB + J - 1 + ( JB + KK - 2 ) * LDB DO 60 I = IBEG, IEND IOFFAN = IA + I - 1 + ( JA + KK - 2 ) * LDA IOFFBN = IB + I - 1 + ( JB + KK - 2 ) * LDB CT( I ) = CT( I ) + $ ALPHA * A( IOFFAN ) * DCONJG( B( IOFFBK ) ) + $ B( IOFFBN ) * DCONJG( ALPHA * A( IOFFAK ) ) G( I ) = G( I ) + ABS1( ALPHA ) * ( $ ABS1( A( IOFFAN ) ) * ABS1( B( IOFFBK ) ) + $ ABS1( B( IOFFBN ) ) * ABS1( A( IOFFAK ) ) ) 60 CONTINUE 70 CONTINUE ELSE DO 90 KK = 1, K IOFFAK = IA + KK - 1 + ( JA + J - 2 ) * LDA IOFFBK = IB + KK - 1 + ( JB + J - 2 ) * LDB DO 80 I = IBEG, IEND IOFFAN = IA + KK - 1 + ( JA + I - 2 ) * LDA IOFFBN = IB + KK - 1 + ( JB + I - 2 ) * LDB CT( I ) = CT( I ) + $ ALPHA * DCONJG( A( IOFFAN ) ) * B( IOFFBK ) + $ DCONJG( ALPHA * B( IOFFBN ) ) * A( IOFFAK ) G( I ) = G( I ) + ABS1( ALPHA ) * ( $ ABS1( DCONJG( A( IOFFAN ) ) * B( IOFFBK ) ) + $ ABS1( DCONJG( B( IOFFBN ) ) * A( IOFFAK ) ) ) 80 CONTINUE 90 CONTINUE END IF * IOFFC = IC + IBEG - 1 + ( JC + J - 2 ) * LDC * DO 100 I = IBEG, IEND CT( I ) = CT( I ) + BETA * C( IOFFC ) G( I ) = G( I ) + ABS1( BETA )*ABS1( C( IOFFC ) ) C( IOFFC ) = CT( I ) IOFFC = IOFFC + 1 100 CONTINUE * * Compute the error ratio for this result. * ERR = RZERO INFO = 0 LDPC = DESCC( LLD_ ) IOFFC = IC + ( JC + J - 2 ) * LDC CALL PB_INFOG2L( IC, JC+J-1, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) ICURROW = ICROW ROWREP = ( ICROW.EQ.-1 ) COLREP = ( ICCOL.EQ.-1 ) * IF( MYCOL.EQ.ICCOL .OR. COLREP ) THEN * IBB = DESCC( IMB_ ) - IC + 1 IF( IBB.LE.0 ) $ IBB = ( ( -IBB ) / DESCC( MB_ ) + 1 )*DESCC( MB_ ) + IBB IBB = MIN( IBB, N ) IN = IC + IBB - 1 * DO 110 I = IC, IN * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) ) / EPS IF( G( I-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 110 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * DO 130 I = IN+1, IC+N-1, DESCC( MB_ ) IBB = MIN( IC+N-I, DESCC( MB_ ) ) * DO 120 KK = 0, IBB-1 * IF( MYROW.EQ.ICURROW .OR. ROWREP ) THEN ERRI = ABS( PC( IIC+(JJC-1)*LDPC ) - $ C( IOFFC ) )/EPS IF( G( I+KK-IC+1 ).NE.RZERO ) $ ERRI = ERRI / G( I+KK-IC+1 ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ INFO = 1 IIC = IIC + 1 END IF * IOFFC = IOFFC + 1 * 120 CONTINUE * ICURROW = MOD( ICURROW+1, NPROW ) * 130 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) IF( INFO.NE.0 ) $ GO TO 150 * 140 CONTINUE * 150 CONTINUE * RETURN * * End of PZMMCH2 * END SUBROUTINE PZMMCH3( UPLO, TRANS, M, N, ALPHA, A, IA, JA, DESCA, $ BETA, C, PC, IC, JC, DESCC, ERR, INFO ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS, UPLO INTEGER IA, IC, INFO, JA, JC, M, N DOUBLE PRECISION ERR COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCC( * ) COMPLEX*16 A( * ), C( * ), PC( * ) * .. * * Purpose * ======= * * PZMMCH3 checks the results of the computational tests. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which part of C should contain the * result. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies whether the matrix A has to be * transposed or not before computing the matrix-matrix addi- * tion. * * M (global input) INTEGER * On entry, M specifies the number of rows of C. * * N (global input) INTEGER * On entry, N specifies the number of columns of C. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (DESCA( M_ ),*). This * array contains a local copy of the initial entire matrix PA. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (DESCC( M_ ),*). This * array contains a local copy of the initial entire matrix PC. * * PC (local input) COMPLEX*16 array * On entry, PC is an array of dimension (DESCC( LLD_ ),*). This * array contains the local pieces of the matrix PC. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * ERR (global output) DOUBLE PRECISION * On exit, ERR specifies the largest error in absolute value. * * INFO (global output) INTEGER * On exit, if INFO <> 0, the result is less than half accurate. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, CTRAN, LOWER, NOTRAN, ROWREP, UPPER INTEGER I, ICCOL, ICROW, ICTXT, IIC, IOFFA, IOFFC, J, $ JJC, LDA, LDC, LDPC, MYCOL, MYROW, NPCOL, $ NPROW DOUBLE PRECISION ERR0, ERRI, PREC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D, IGSUM2D, PB_INFOG2L, $ PZERRAXPBY * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC ABS, DCONJG, MAX * .. * .. Executable Statements .. * ICTXT = DESCC( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * PREC = PDLAMCH( ICTXT, 'eps' ) * UPPER = LSAME( UPLO, 'U' ) LOWER = LSAME( UPLO, 'L' ) NOTRAN = LSAME( TRANS, 'N' ) CTRAN = LSAME( TRANS, 'C' ) * * Compute expected result in C using data in A and C. This part of * the computation is performed by every process in the grid. * INFO = 0 ERR = ZERO * LDA = MAX( 1, DESCA( M_ ) ) LDC = MAX( 1, DESCC( M_ ) ) LDPC = MAX( 1, DESCC( LLD_ ) ) ROWREP = ( DESCC( RSRC_ ).EQ.-1 ) COLREP = ( DESCC( CSRC_ ).EQ.-1 ) * IF( NOTRAN ) THEN * DO 20 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( JA - 1 + J - JC ) * LDA * DO 10 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PZERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PZERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PZERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFA = IOFFA + 1 IOFFC = IOFFC + 1 * 10 CONTINUE * 20 CONTINUE * ELSE IF( CTRAN ) THEN * DO 40 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( J - JC ) + ( JA - 1 ) * LDA * DO 30 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PZERRAXPBY( ERRI, ALPHA, DCONJG( A( IOFFA ) ), $ BETA, C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PZERRAXPBY( ERRI, ALPHA, DCONJG( A( IOFFA ) ), $ BETA, C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PZERRAXPBY( ERRI, ALPHA, DCONJG( A( IOFFA ) ), $ BETA, C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFC = IOFFC + 1 IOFFA = IOFFA + LDA * 30 CONTINUE * 40 CONTINUE * ELSE * DO 60 J = JC, JC + N - 1 * IOFFC = IC + ( J - 1 ) * LDC IOFFA = IA + ( J - JC ) + ( JA - 1 ) * LDA * DO 50 I = IC, IC + M - 1 * IF( UPPER ) THEN IF( ( J - JC ).GE.( I - IC ) ) THEN CALL PZERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE IF( LOWER ) THEN IF( ( J - JC ).LE.( I - IC ) ) THEN CALL PZERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) ELSE ERRI = ZERO END IF ELSE CALL PZERRAXPBY( ERRI, ALPHA, A( IOFFA ), BETA, $ C( IOFFC ), PREC ) END IF * CALL PB_INFOG2L( I, J, DESCC, NPROW, NPCOL, MYROW, MYCOL, $ IIC, JJC, ICROW, ICCOL ) IF( ( MYROW.EQ.ICROW .OR. ROWREP ) .AND. $ ( MYCOL.EQ.ICCOL .OR. COLREP ) ) THEN ERR0 = ABS( PC( IIC+(JJC-1)*LDPC )-C( IOFFC ) ) IF( ERR0.GT.ERRI ) $ INFO = 1 ERR = MAX( ERR, ERR0 ) END IF * IOFFC = IOFFC + 1 IOFFA = IOFFA + LDA * 50 CONTINUE * 60 CONTINUE * END IF * * If INFO = 0, all results are at least half accurate. * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, MYCOL ) CALL DGAMX2D( ICTXT, 'All', ' ', 1, 1, ERR, 1, I, J, -1, -1, $ MYCOL ) * RETURN * * End of PZMMCH3 * END SUBROUTINE PZERRAXPBY( ERRBND, ALPHA, X, BETA, Y, PREC ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION ERRBND, PREC COMPLEX*16 ALPHA, BETA, X, Y * .. * * Purpose * ======= * * PZERRAXPBY serially computes y := beta*y + alpha * x and returns a * scaled relative acceptable error bound on the result. * * Arguments * ========= * * ERRBND (global output) DOUBLE PRECISION * On exit, ERRBND specifies the scaled relative acceptable er- * ror bound. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * X (global input) COMPLEX*16 * On entry, X specifies the scalar x to be scaled. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. * * Y (global input/global output) COMPLEX*16 * On entry, Y specifies the scalar y to be added. On exit, Y * contains the resulting scalar y. * * PREC (global input) DOUBLE PRECISION * On entry, PREC specifies the machine precision. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO, ZERO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0, $ ZERO = 0.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION ADDBND, FACT, SUMINEG, SUMIPOS, SUMRNEG, $ SUMRPOS COMPLEX*16 TMP * .. * .. Intrinsic Functions .. * .. * .. Executable Statements .. * SUMIPOS = ZERO SUMINEG = ZERO SUMRPOS = ZERO SUMRNEG = ZERO FACT = ONE + TWO * PREC ADDBND = TWO * TWO * TWO * PREC * TMP = ALPHA * X IF( DBLE( TMP ).GE.ZERO ) THEN SUMRPOS = SUMRPOS + DBLE( TMP ) * FACT ELSE SUMRNEG = SUMRNEG - DBLE( TMP ) * FACT END IF IF( DIMAG( TMP ).GE.ZERO ) THEN SUMIPOS = SUMIPOS + DIMAG( TMP ) * FACT ELSE SUMINEG = SUMINEG - DIMAG( TMP ) * FACT END IF * TMP = BETA * Y IF( DBLE( TMP ).GE.ZERO ) THEN SUMRPOS = SUMRPOS + DBLE( TMP ) * FACT ELSE SUMRNEG = SUMRNEG - DBLE( TMP ) * FACT END IF IF( DIMAG( TMP ).GE.ZERO ) THEN SUMIPOS = SUMIPOS + DIMAG( TMP ) * FACT ELSE SUMINEG = SUMINEG - DIMAG( TMP ) * FACT END IF * Y = ( BETA * Y ) + ( ALPHA * X ) * ERRBND = ADDBND * MAX( MAX( SUMRPOS, SUMRNEG ), $ MAX( SUMIPOS, SUMINEG ) ) * RETURN * * End of PZERRAXPBY * END SUBROUTINE PZIPSET( TOGGLE, N, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TOGGLE INTEGER IA, JA, N * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZIPSET sets the imaginary part of the diagonal entries of an n by n * matrix sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). This is used to * test the PBLAS routines for complex Hermitian matrices, which are * either not supposed to access or use the imaginary parts of the dia- * gonals, or supposed to set them to zero. The value used to set the * imaginary part of the diagonals depends on the value of TOGGLE. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TOGGLE (global input) CHARACTER*1 * On entry, TOGGLE specifies the set-value to be used as fol- * lows: * If TOGGLE = 'Z' or 'z', the imaginary part of the diago- * nals are set to zero, * If TOGGLE = 'B' or 'b', the imaginary part of the diago- * nals are set to a large value. * * N (global input) INTEGER * On entry, N specifies the order of sub( A ). N must be at * least zero. * * A (local input/local output) pointer to COMPLEX*16 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the diagonals of * sub( A ) have been updated as specified by TOGGLE. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLREP, GODOWN, GOLEFT, ROWREP INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP DOUBLE PRECISION ALPHA, ATMP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH EXTERNAL LSAME, PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.LE.0 ) $ RETURN * IF( LSAME( TOGGLE, 'Z' ) ) THEN ALPHA = ZERO ELSE IF( LSAME( TOGGLE, 'B' ) ) THEN ALPHA = PDLAMCH( ICTXT, 'Epsilon' ) ALPHA = ALPHA / PDLAMCH( ICTXT, 'Safe minimum' ) END IF * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( NP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 ROWREP = ( DESCA2( RSRC_ ).EQ.-1 ) COLREP = ( DESCA2( CSRC_ ).EQ.-1 ) LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( ROWREP ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( COLREP ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA + LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = DBLE( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = DCMPLX( ATMP, ALPHA ) 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = DBLE( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = DCMPLX( ATMP, ALPHA ) 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = DBLE( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = DCMPLX( ATMP, ALPHA ) 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = DBLE( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = DCMPLX( ATMP, ALPHA ) 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = DBLE( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = DCMPLX( ATMP, ALPHA ) 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = DBLE( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = DCMPLX( ATMP, ALPHA ) 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = DBLE( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = DCMPLX( ATMP, ALPHA ) 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = DBLE( A( IJOFFA + I*LDAP1 ) ) A( IJOFFA + I*LDAP1 ) = DCMPLX( ATMP, ALPHA ) 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PZIPSET * END DOUBLE PRECISION FUNCTION PDLAMCH( ICTXT, CMACH ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 CMACH INTEGER ICTXT * .. * * Purpose * ======= * * * .. Local Scalars .. CHARACTER*1 TOP INTEGER IDUMM DOUBLE PRECISION TEMP * .. * .. External Subroutines .. EXTERNAL DGAMN2D, DGAMX2D, PB_TOPGET * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH, LSAME * .. * .. Executable Statements .. * TEMP = DLAMCH( CMACH ) * IF( LSAME( CMACH, 'E' ).OR.LSAME( CMACH, 'S' ).OR. $ LSAME( CMACH, 'M' ).OR.LSAME( CMACH, 'U' ) ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) IDUMM = 0 CALL DGAMX2D( ICTXT, 'All', TOP, 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) ELSE IF( LSAME( CMACH, 'L' ).OR.LSAME( CMACH, 'O' ) ) THEN CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) IDUMM = 0 CALL DGAMN2D( ICTXT, 'All', TOP, 1, 1, TEMP, 1, IDUMM, $ IDUMM, -1, -1, IDUMM ) END IF * PDLAMCH = TEMP * RETURN * * End of PDLAMCH * END SUBROUTINE PZLASET( UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IA, JA, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLASET initializes an m by n submatrix A(IA:IA+M-1,JA:JA+N-1) deno- * ted by sub( A ) to beta on the diagonal and alpha on the offdiago- * nals. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies the part of the submatrix sub( A ) * to be set: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of sub( A ) is not changed; * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha, i.e., the cons- * tant to which the offdiagonal elements are to be set. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta, i.e., the constant * to which the diagonal elements are to be set. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A to be set. On exit, the * leading m by n submatrix sub( A ) is set as follows: * * if UPLO = 'U', A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=j-1, 1<=j<=N, * if UPLO = 'L', A(IA+i-1,JA+j-1) = ALPHA, j+1<=i<=M, 1<=j<=N, * otherwise, A(IA+i-1,JA+j-1) = ALPHA, 1<=i<=M, 1<=j<=N, * and IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT, ISCOLREP, ISROWREP, LOWER, $ UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, JJA, $ JJMAX, JOFFA, JOFFD, LCMT, LCMT00, LDA, LMBLOC, $ LNBLOC, LOW, M1, MB, MBLKD, MBLKS, MBLOC, MP, $ MRCOL, MRROW, MYCOL, MYROW, N1, NB, NBLKD, $ NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, QNB, TMP1, $ UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS, PB_ZLASET * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * ISROWREP = ( DESCA2( RSRC_ ).LT.0 ) ISCOLREP = ( DESCA2( CSRC_ ).LT.0 ) LDA = DESCA2( LLD_ ) * UPPER = .NOT.( LSAME( UPLO, 'L' ) ) LOWER = .NOT.( LSAME( UPLO, 'U' ) ) * IF( ( ( LOWER.AND.UPPER ).AND.( ALPHA.EQ.BETA ) ).OR. $ ( ISROWREP .AND. ISCOLREP ) ) THEN IF( ( MP.GT.0 ).AND.( NQ.GT.0 ) ) $ CALL PB_ZLASET( UPLO, MP, NQ, 0, ALPHA, BETA, $ A( IIA + ( JJA - 1 ) * LDA ), LDA ) RETURN END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) CALL PB_BINFO( 0, MP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( ISROWREP ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( ISCOLREP ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * M1 = MP N1 = NQ * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_ZLASET( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, BETA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_ZLASET( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ ALPHA, A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_ZLASET( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ ALPHA, A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_ZLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_ZLASET( UPLO, MBLOC, INBLOC, LCMT, ALPHA, BETA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_ZLASET( 'ALL', TMP1, INBLOC, 0, ALPHA, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_ZLASET( 'ALL', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_ZLASET( 'All', M1, TMP1, 0, ALPHA, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_ZLASET( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, BETA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_ZLASET( 'All', IMBLOC, TMP1, 0, ALPHA, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_ZLASET( 'All', M1, TMP1, 0, ALPHA, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_ZLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_ZLASET( UPLO, MBLOC, NBLOC, LCMT, ALPHA, BETA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_ZLASET( 'All', TMP1, NBLOC, 0, ALPHA, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_ZLASET( 'All', TMP1, N1, 0, ALPHA, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * RETURN * * End of PZLASET * END SUBROUTINE PZLASCAL( TYPE, M, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TYPE INTEGER IA, JA, M, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLASCAL scales the m by n submatrix A(IA:IA+M-1,JA:JA+N-1) denoted * by sub( A ) by the scalar alpha. TYPE specifies if sub( A ) is full, * upper triangular, lower triangular or upper Hessenberg. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * On entry, TYPE specifies the type of the input submatrix as * follows: * = 'L' or 'l': sub( A ) is a lower triangular matrix, * = 'U' or 'u': sub( A ) is an upper triangular matrix, * = 'H' or 'h': sub( A ) is an upper Hessenberg matrix, * otherwise sub( A ) is a full matrix. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( A ) are overwritten by * the local entries of the m by n scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. CHARACTER*1 UPLO LOGICAL GODOWN, GOLEFT, LOWER, UPPER INTEGER IACOL, IAROW, ICTXT, IIA, IIMAX, ILOW, IMB1, $ IMBLOC, INB1, INBLOC, IOFFA, IOFFD, ITYPE, $ IUPP, JJA, JJMAX, JOFFA, JOFFD, LCMT, LCMT00, $ LDA, LMBLOC, LNBLOC, LOW, M1, MB, MBLKD, MBLKS, $ MBLOC, MP, MRCOL, MRROW, MYCOL, MYROW, N1, NB, $ NBLKD, NBLKS, NBLOC, NPCOL, NPROW, NQ, PMB, $ QNB, TMP1, UPP * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS, PB_INFOG2L, PB_ZLASCAL * .. * .. External Functions .. LOGICAL LSAME INTEGER PB_NUMROC EXTERNAL LSAME, PB_NUMROC * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * IF( LSAME( TYPE, 'L' ) ) THEN ITYPE = 1 UPLO = TYPE UPPER = .FALSE. LOWER = .TRUE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'U' ) ) THEN ITYPE = 2 UPLO = TYPE UPPER = .TRUE. LOWER = .FALSE. IOFFD = 0 ELSE IF( LSAME( TYPE, 'H' ) ) THEN ITYPE = 3 UPLO = 'U' UPPER = .TRUE. LOWER = .FALSE. IOFFD = 1 ELSE ITYPE = 0 UPLO = 'A' UPPER = .TRUE. LOWER = .TRUE. IOFFD = 0 END IF * * Compute local indexes * IF( ITYPE.EQ.0 ) THEN * * Full matrix * CALL PB_INFOG2L( IA, JA, DESCA2, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) MP = PB_NUMROC( M, IA, DESCA2( IMB_ ), DESCA2( MB_ ), MYROW, $ DESCA2( RSRC_ ), NPROW ) NQ = PB_NUMROC( N, JA, DESCA2( INB_ ), DESCA2( NB_ ), MYCOL, $ DESCA2( CSRC_ ), NPCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * LDA = DESCA2( LLD_ ) IOFFA = IIA + ( JJA - 1 ) * LDA * CALL PB_ZLASCAL( 'All', MP, NQ, 0, ALPHA, A( IOFFA ), LDA ) * ELSE * * Trapezoidal matrix * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * IF( MP.LE.0 .OR. NQ.LE.0 ) $ RETURN * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, * LNBLOC, ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) LDA = DESCA2( LLD_ ) * CALL PB_BINFO( IOFFD, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * M1 = MP N1 = NQ IOFFA = IIA - 1 JOFFA = JJA - 1 IIMAX = IOFFA + MP JJMAX = JOFFA + NQ * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and * update LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * CALL PB_ZLASCAL( UPLO, IMBLOC, INBLOC, LCMT00, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IF( GODOWN ) THEN IF( UPPER .AND. NQ.GT.INBLOC ) $ CALL PB_ZLASCAL( 'All', IMBLOC, NQ-INBLOC, 0, ALPHA, $ A( IIA+(JOFFA+INBLOC)*LDA ), LDA ) IIA = IIA + IMBLOC M1 = M1 - IMBLOC ELSE IF( LOWER .AND. MP.GT.IMBLOC ) $ CALL PB_ZLASCAL( 'All', MP-IMBLOC, INBLOC, 0, ALPHA, $ A( IIA+IMBLOC+JOFFA*LDA ), LDA ) JJA = JJA + INBLOC N1 = N1 - INBLOC END IF * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 10 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 10 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_ZLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 20 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_ZLASCAL( UPLO, MBLOC, INBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 20 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_ZLASCAL( 'All', TMP1, INBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = IOFFA - IIA + 1 M1 = M1 - TMP1 N1 = N1 - INBLOC LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_ZLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 30 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 30 END IF * TMP1 = MIN( JOFFA, JJMAX ) - JJA + 1 IF( LOWER .AND. TMP1.GT.0 ) THEN CALL PB_ZLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IIA+(JJA-1)*LDA ), LDA ) JJA = JJA + TMP1 N1 = N1 - TMP1 END IF * IF( NBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 40 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC CALL PB_ZLASCAL( UPLO, IMBLOC, NBLOC, LCMT, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 40 END IF * TMP1 = N1 - JOFFD + JJA - 1 IF( UPPER .AND. TMP1.GT.0 ) $ CALL PB_ZLASCAL( 'All', IMBLOC, TMP1, 0, ALPHA, $ A( IIA+JOFFD*LDA ), LDA ) * TMP1 = JOFFA - JJA + 1 M1 = M1 - IMBLOC N1 = N1 - TMP1 LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * IF( LOWER .AND. M1.GT.0 .AND. TMP1.GT.0 ) $ CALL PB_ZLASCAL( 'All', M1, TMP1, 0, ALPHA, $ A( IOFFA+1+(JJA-1)*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * END IF * NBLOC = NB 50 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 60 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 60 END IF * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 IF( UPPER .AND. TMP1.GT.0 ) THEN CALL PB_ZLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) IIA = IIA + TMP1 M1 = M1 - TMP1 END IF * IF( MBLKS.LE.0 ) $ RETURN * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 70 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC CALL PB_ZLASCAL( UPLO, MBLOC, NBLOC, LCMT, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 70 END IF * TMP1 = M1 - IOFFD + IIA - 1 IF( LOWER .AND. TMP1.GT.0 ) $ CALL PB_ZLASCAL( 'All', TMP1, NBLOC, 0, ALPHA, $ A( IOFFD+1+JOFFA*LDA ), LDA ) * TMP1 = MIN( IOFFA, IIMAX ) - IIA + 1 M1 = M1 - TMP1 N1 = N1 - NBLOC LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC * IF( UPPER .AND. TMP1.GT.0 .AND. N1.GT.0 ) $ CALL PB_ZLASCAL( 'All', TMP1, N1, 0, ALPHA, $ A( IIA+JOFFA*LDA ), LDA ) * IIA = IOFFA + 1 JJA = JOFFA + 1 * GO TO 50 * END IF * END IF * RETURN * * End of PZLASCAL * END SUBROUTINE PZLAGEN( INPLACE, AFORM, DIAG, OFFA, M, N, IA, JA, $ DESCA, IASEED, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE CHARACTER*1 AFORM, DIAG INTEGER IA, IASEED, JA, LDA, M, N, OFFA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PZLAGEN generates (or regenerates) a submatrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies if the generated submatrix is diago- * nally dominant or not as follows: * DIAG = 'D' : sub( A ) is diagonally dominant, * DIAG = 'N' : sub( A ) is not diagonally dominant. * * OFFA (global input) INTEGER * On entry, OFFA specifies the offdiagonal of the underlying * matrix A(1:DESCA(M_),1:DESCA(N_)) of interest when the subma- * trix is symmetric, Hermitian or diagonally dominant. OFFA = 0 * specifies the main diagonal, OFFA > 0 specifies a subdiago- * nal, and OFFA < 0 specifies a superdiagonal (see further de- * tails). * * M (global input) INTEGER * On entry, M specifies the global number of matrix rows of the * submatrix sub( A ) to be generated. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of matrix columns of * the submatrix sub( A ) to be generated. N must be at least * zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IASEED (global input) INTEGER * On entry, IASEED specifies the seed number to generate the * matrix A. IASEED must be at least zero. * * A (local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). On exit, this array contains the * local entries of the randomly generated submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. When INPLACE is .FALSE., LDA is usually DESCA(LLD_). * This restriction is however not enforced, and this subroutine * requires only that LDA >= MAX( 1, Mp ) where * * Mp = PB_NUMROC( M, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ). * * PB_NUMROC is a ScaLAPACK tool function; MYROW, MYCOL, NPROW * and NPCOL can be determined by calling the BLACS subroutine * BLACS_GRIDINFO. * * Further Details * =============== * * OFFD is tied to the matrix described by DESCA, as opposed to the * piece that is currently (re)generated. This is a global information * independent from the distribution parameters. Below are examples of * the meaning of OFFD for a global 7 by 5 matrix: * * --------------------------------------------------------------------- * OFFD | 0 -1 -2 -3 -4 0 -1 -2 -3 -4 0 -1 -2 -3 -4 * -------|------------------------------------------------------------- * | | OFFD=-1 | OFFD=0 OFFD=2 * | V V * 0 | . d . . . -> d . . . . . . . . . * 1 | . . d . . . d . . . . . . . . * 2 | . . . d . . . d . . -> d . . . . * 3 | . . . . d . . . d . . d . . . * 4 | . . . . . . . . . d . . d . . * 5 | . . . . . . . . . . . . . d . * 6 | . . . . . . . . . . . . . . d * --------------------------------------------------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL DIAGDO, SYMM, HERM, NOTRAN INTEGER CSRC, I, IACOL, IAROW, ICTXT, IIA, ILOCBLK, $ ILOCOFF, ILOW, IMB, IMB1, IMBLOC, IMBVIR, INB, $ INB1, INBLOC, INBVIR, INFO, IOFFDA, ITMP, IUPP, $ IVIR, JJA, JLOCBLK, JLOCOFF, JVIR, LCMT00, $ LMBLOC, LNBLOC, LOW, MAXMN, MB, MBLKS, MP, $ MRCOL, MRROW, MYCDIST, MYCOL, MYRDIST, MYROW, $ NB, NBLKS, NPCOL, NPROW, NQ, NVIR, RSRC, UPP COMPLEX*16 ALPHA * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ), IMULADD( 4, JMP_LEN ), $ IRAN( 2 ), JMP( JMP_LEN ), MULADD0( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_CHKMAT, PB_DESCTRANS, PB_INITJMP, $ PB_INITMULADD, PB_JUMP, PB_JUMPIT, PB_LOCINFO, $ PB_SETLOCRAN, PB_SETRAN, PB_ZLAGEN, PXERBLA, $ PZLADOM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN * .. * .. Data Statements .. DATA ( MULADD0( I ), I = 1, 4 ) / 20077, 16838, $ 12345, 0 / * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Test the input arguments * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test the input parameters * INFO = 0 IF( NPROW.EQ.-1 ) THEN INFO = -( 1000 + CTXT_ ) ELSE SYMM = LSAME( AFORM, 'S' ) HERM = LSAME( AFORM, 'H' ) NOTRAN = LSAME( AFORM, 'N' ) DIAGDO = LSAME( DIAG, 'D' ) IF( .NOT.( SYMM.OR.HERM.OR.NOTRAN ) .AND. $ .NOT.( LSAME( AFORM, 'T' ) ) .AND. $ .NOT.( LSAME( AFORM, 'C' ) ) ) THEN INFO = -2 ELSE IF( ( .NOT.DIAGDO ) .AND. $ ( .NOT.LSAME( DIAG, 'N' ) ) ) THEN INFO = -3 END IF CALL PB_CHKMAT( ICTXT, M, 5, N, 6, IA, JA, DESCA2, 10, INFO ) END IF * IF( INFO.NE.0 ) THEN CALL PXERBLA( ICTXT, 'PZLAGEN', -INFO ) RETURN END IF * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) IMB = DESCA2( IMB_ ) INB = DESCA2( INB_ ) RSRC = DESCA2( RSRC_ ) CSRC = DESCA2( CSRC_ ) * * Figure out local information about the distributed matrix operand * CALL PB_AINFOG2L( M, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, MP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * IOFFDA = JA + OFFA - IA CALL PB_BINFO( IOFFDA, MP, NQ, IMB1, INB1, MB, NB, MRROW, $ MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, $ LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) * * Initialize ILOCBLK, ILOCOFF, MYRDIST, JLOCBLK, JLOCOFF, MYCDIST * This values correspond to the square virtual underlying matrix * of size MAX( M_ + MAX( 0, -OFFA ), N_ + MAX( 0, OFFA ) ) used * to set up the random sequence. For practical purposes, the size * of this virtual matrix is upper bounded by M_ + N_ - 1. * ITMP = MAX( 0, -OFFA ) IVIR = IA + ITMP IMBVIR = IMB + ITMP NVIR = DESCA2( M_ ) + ITMP * CALL PB_LOCINFO( IVIR, IMBVIR, MB, MYROW, RSRC, NPROW, ILOCBLK, $ ILOCOFF, MYRDIST ) * ITMP = MAX( 0, OFFA ) JVIR = JA + ITMP INBVIR = INB + ITMP NVIR = MAX( MAX( NVIR, DESCA2( N_ ) + ITMP ), $ DESCA2( M_ ) + DESCA2( N_ ) - 1 ) * CALL PB_LOCINFO( JVIR, INBVIR, NB, MYCOL, CSRC, NPCOL, JLOCBLK, $ JLOCOFF, MYCDIST ) * IF( SYMM .OR. HERM .OR. NOTRAN ) THEN * CALL PB_INITJMP( .TRUE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 2, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_ZLAGEN( 'Lower', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( SYMM .OR. HERM .OR. ( .NOT. NOTRAN ) ) THEN * CALL PB_INITJMP( .FALSE., NVIR, IMBVIR, INBVIR, IMBLOC, INBLOC, $ MB, NB, RSRC, CSRC, NPROW, NPCOL, 2, JMP ) * * Compute constants to jump JMP( * ) numbers in the sequence * CALL PB_INITMULADD( MULADD0, JMP, IMULADD ) * * Compute and set the random value corresponding to A( IA, JA ) * CALL PB_SETLOCRAN( IASEED, ILOCBLK, JLOCBLK, ILOCOFF, JLOCOFF, $ MYRDIST, MYCDIST, NPROW, NPCOL, JMP, $ IMULADD, IRAN ) * CALL PB_ZLAGEN( 'Upper', AFORM, A( IIA, JJA ), LDA, LCMT00, $ IRAN, MBLKS, IMBLOC, MB, LMBLOC, NBLKS, INBLOC, $ NB, LNBLOC, JMP, IMULADD ) * END IF * IF( DIAGDO ) THEN * MAXMN = MAX( DESCA2( M_ ), DESCA2( N_ ) ) IF( HERM ) THEN ALPHA = DCMPLX( DBLE( 2 * MAXMN ), ZERO ) ELSE ALPHA = DCMPLX( DBLE( NVIR ), DBLE( MAXMN ) ) END IF * IF( IOFFDA.GE.0 ) THEN CALL PZLADOM( INPLACE, MIN( MAX( 0, M-IOFFDA ), N ), ALPHA, $ A, MIN( IA+IOFFDA, IA+M-1 ), JA, DESCA ) ELSE CALL PZLADOM( INPLACE, MIN( M, MAX( 0, N+IOFFDA ) ), ALPHA, $ A, IA, MIN( JA-IOFFDA, JA+N-1 ), DESCA ) END IF * END IF * RETURN * * End of PZLAGEN * END SUBROUTINE PZLADOM( INPLACE, N, ALPHA, A, IA, JA, DESCA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. LOGICAL INPLACE INTEGER IA, JA, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PZLADOM adds alpha to the diagonal entries of an n by n submatrix * sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * INPLACE (global input) LOGICAL * On entry, INPLACE specifies if the matrix should be generated * in place or not. If INPLACE is .TRUE., the local random array * to be generated will start in memory at the local memory lo- * cation A( 1, 1 ), otherwise it will start at the local posi- * tion induced by IA and JA. * * N (global input) INTEGER * On entry, N specifies the global order of the submatrix * sub( A ) to be modified. N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. On exit, the local entries * of this array corresponding to the main diagonal of sub( A ) * have been updated. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL GODOWN, GOLEFT INTEGER I, IACOL, IAROW, ICTXT, IIA, IJOFFA, ILOW, $ IMB1, IMBLOC, INB1, INBLOC, IOFFA, IOFFD, IUPP, $ JJA, JOFFA, JOFFD, LCMT, LCMT00, LDA, LDAP1, $ LMBLOC, LNBLOC, LOW, MB, MBLKD, MBLKS, MBLOC, $ MRCOL, MRROW, MYCOL, MYROW, NB, NBLKD, NBLKS, $ NBLOC, NP, NPCOL, NPROW, NQ, PMB, QNB, UPP COMPLEX*16 ATMP * .. * .. Local Scalars .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_AINFOG2L, PB_BINFO, $ PB_DESCTRANS * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DIMAG, MAX, MIN * .. * .. Executable Statements .. * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * * Get grid parameters * ICTXT = DESCA2( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( N.EQ.0 ) $ RETURN * CALL PB_AINFOG2L( N, N, IA, JA, DESCA2, NPROW, NPCOL, MYROW, $ MYCOL, IMB1, INB1, NP, NQ, IIA, JJA, IAROW, $ IACOL, MRROW, MRCOL ) * * Decide where the entries shall be stored in memory * IF( INPLACE ) THEN IIA = 1 JJA = 1 END IF * * Initialize LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, * ILOW, LOW, IUPP, and UPP. * MB = DESCA2( MB_ ) NB = DESCA2( NB_ ) * CALL PB_BINFO( 0, NP, NQ, IMB1, INB1, MB, NB, MRROW, MRCOL, $ LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, $ LNBLOC, ILOW, LOW, IUPP, UPP ) * IOFFA = IIA - 1 JOFFA = JJA - 1 LDA = DESCA2( LLD_ ) LDAP1 = LDA + 1 * IF( DESCA2( RSRC_ ).LT.0 ) THEN PMB = MB ELSE PMB = NPROW * MB END IF IF( DESCA2( CSRC_ ).LT.0 ) THEN QNB = NB ELSE QNB = NPCOL * NB END IF * * Handle the first block of rows or columns separately, and update * LCMT00, MBLKS and NBLKS. * GODOWN = ( LCMT00.GT.IUPP ) GOLEFT = ( LCMT00.LT.ILOW ) * IF( .NOT.GODOWN .AND. .NOT.GOLEFT ) THEN * * LCMT00 >= ILOW && LCMT00 <= IUPP * IF( LCMT00.GE.0 ) THEN IJOFFA = IOFFA+LCMT00 + ( JOFFA - 1 ) * LDA DO 10 I = 1, MIN( INBLOC, MAX( 0, IMBLOC - LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 10 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFA - LCMT00 - 1 ) * LDA DO 20 I = 1, MIN( IMBLOC, MAX( 0, INBLOC + LCMT00 ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 20 CONTINUE END IF GOLEFT = ( ( LCMT00 - ( IUPP - UPP + PMB ) ).LT.ILOW ) GODOWN = .NOT.GOLEFT * END IF * IF( GODOWN ) THEN * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * 30 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 30 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 40 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.ILOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 50 I = 1, MIN( INBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 50 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 60 I = 1, MIN( MBLOC, MAX( 0, INBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 60 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 40 END IF * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * ELSE IF( GOLEFT ) THEN * LCMT00 = LCMT00 + LOW - ILOW + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + INBLOC * 70 CONTINUE IF( NBLKS.GT.0 .AND. LCMT00.LT.LOW ) THEN LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NB GO TO 70 END IF * LCMT = LCMT00 NBLKD = NBLKS JOFFD = JOFFA * NBLOC = NB 80 CONTINUE IF( NBLKD.GT.0 .AND. LCMT.LE.IUPP ) THEN IF( NBLKD.EQ.1 ) $ NBLOC = LNBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFA + LCMT + ( JOFFD - 1 ) * LDA DO 90 I = 1, MIN( NBLOC, MAX( 0, IMBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 90 CONTINUE ELSE IJOFFA = IOFFA + ( JOFFD - LCMT - 1 ) * LDA DO 100 I = 1, MIN( IMBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 100 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT + QNB NBLKS = NBLKD NBLKD = NBLKD - 1 JOFFA = JOFFD JOFFD = JOFFD + NBLOC GO TO 80 END IF * LCMT00 = LCMT00 - ( IUPP - UPP + PMB ) MBLKS = MBLKS - 1 IOFFA = IOFFA + IMBLOC * END IF * NBLOC = NB 110 CONTINUE IF( NBLKS.GT.0 ) THEN IF( NBLKS.EQ.1 ) $ NBLOC = LNBLOC 120 CONTINUE IF( MBLKS.GT.0 .AND. LCMT00.GT.UPP ) THEN LCMT00 = LCMT00 - PMB MBLKS = MBLKS - 1 IOFFA = IOFFA + MB GO TO 120 END IF * LCMT = LCMT00 MBLKD = MBLKS IOFFD = IOFFA * MBLOC = MB 130 CONTINUE IF( MBLKD.GT.0 .AND. LCMT.GE.LOW ) THEN IF( MBLKD.EQ.1 ) $ MBLOC = LMBLOC IF( LCMT.GE.0 ) THEN IJOFFA = IOFFD + LCMT + ( JOFFA - 1 ) * LDA DO 140 I = 1, MIN( NBLOC, MAX( 0, MBLOC - LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 140 CONTINUE ELSE IJOFFA = IOFFD + ( JOFFA - LCMT - 1 ) * LDA DO 150 I = 1, MIN( MBLOC, MAX( 0, NBLOC + LCMT ) ) ATMP = A( IJOFFA + I*LDAP1 ) A( IJOFFA + I*LDAP1 ) = ALPHA + $ DCMPLX( ABS( DBLE( ATMP ) ), $ ABS( DIMAG( ATMP ) ) ) 150 CONTINUE END IF LCMT00 = LCMT LCMT = LCMT - PMB MBLKS = MBLKD MBLKD = MBLKD - 1 IOFFA = IOFFD IOFFD = IOFFD + MBLOC GO TO 130 END IF * LCMT00 = LCMT00 + QNB NBLKS = NBLKS - 1 JOFFA = JOFFA + NBLOC GO TO 110 * END IF * RETURN * * End of PZLADOM * END SUBROUTINE PB_PZLAPRNT( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * Purpose * ======= * * PB_PZLAPRNT prints to the standard output a submatrix sub( A ) deno- * ting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and printed by * the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA( N_ ) The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA( IMB_ ) The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA( INB_ ) The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA( MB_ ) The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA( NB_ ) The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_NUMROC: * Lr( IA, K ) = PB_NUMROC( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_NUMROC( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the row index of the printing pro- * cess. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the column index of the printing * process. * * CMATNM (global input) CHARACTER*(*) * On entry, CMATNM is the name of the matrix to be printed. * * NOUT (global input) INTEGER * On entry, NOUT specifies the output unit number. When NOUT is * equal to 6, the submatrix is printed on the screen. * * WORK (local workspace) COMPLEX*16 array * On entry, WORK is a work array of dimension at least equal to * MAX( IMB_A, MB_A ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW, PCOL, PROW * .. * .. Local Arrays .. INTEGER DESCA2( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PB_DESCTRANS, PB_PZLAPRN2 * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Convert descriptor * CALL PB_DESCTRANS( DESCA, DESCA2 ) * CALL BLACS_GRIDINFO( DESCA2( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL ) * IF( DESCA2( RSRC_ ).GE.0 ) THEN IF( DESCA2( CSRC_ ).GE.0 ) THEN CALL PB_PZLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, ICPRNT, $ CMATNM, NOUT, DESCA2( RSRC_ ), $ DESCA2( CSRC_ ), WORK ) ELSE DO 10 PCOL = 0, NPCOL - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Colum-replicated array -- ' , $ 'copy in process column: ', PCOL CALL PB_PZLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, DESCA2( RSRC_ ), $ PCOL, WORK ) 10 CONTINUE END IF ELSE IF( DESCA2( CSRC_ ).GE.0 ) THEN DO 20 PROW = 0, NPROW - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Row-replicated array -- ' , $ 'copy in process row: ', PROW CALL PB_PZLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, PROW, $ DESCA2( CSRC_ ), WORK ) 20 CONTINUE ELSE DO 40 PROW = 0, NPROW - 1 DO 30 PCOL = 0, NPCOL - 1 IF( ( MYROW.EQ.IRPRNT ).AND.( MYCOL.EQ.ICPRNT ) ) $ WRITE( NOUT, * ) 'Replicated array -- ' , $ 'copy in process (', PROW, ',', PCOL, ')' CALL PB_PZLAPRN2( M, N, A, IA, JA, DESCA2, IRPRNT, $ ICPRNT, CMATNM, NOUT, PROW, PCOL, $ WORK ) 30 CONTINUE 40 CONTINUE END IF END IF * RETURN * * End of PB_PZLAPRNT * END SUBROUTINE PB_PZLAPRN2( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, $ CMATNM, NOUT, PROW, PCOL, WORK ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT, PCOL, PROW * .. * .. Array Arguments .. CHARACTER*(*) CMATNM INTEGER DESCA( * ) COMPLEX*16 A( * ), WORK( * ) * .. * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D_INB, CSRC_, CTXT_, DLEN_, $ DTYPE_, IMB_, INB_, LLD_, MB_, M_, NB_, N_, $ RSRC_ PARAMETER ( BLOCK_CYCLIC_2D_INB = 2, DLEN_ = 11, $ DTYPE_ = 1, CTXT_ = 2, M_ = 3, N_ = 4, $ IMB_ = 5, INB_ = 6, MB_ = 7, NB_ = 8, $ RSRC_ = 9, CSRC_ = 10, LLD_ = 11 ) * .. * .. Local Scalars .. LOGICAL AISCOLREP, AISROWREP INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL, $ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K, $ LDA, LDW, MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, PB_INFOG2L, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DIMAG, MIN * .. * .. Executable Statements .. * * Get grid parameters * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) CALL PB_INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IIA, JJA, IAROW, IACOL ) II = IIA JJ = JJA IF( DESCA( RSRC_ ).LT.0 ) THEN AISROWREP = .TRUE. IAROW = PROW ICURROW = PROW ELSE AISROWREP = .FALSE. ICURROW = IAROW END IF IF( DESCA( CSRC_ ).LT.0 ) THEN AISCOLREP = .TRUE. IACOL = PCOL ICURCOL = PCOL ELSE AISCOLREP = .FALSE. ICURCOL = IACOL END IF LDA = DESCA( LLD_ ) LDW = MAX( DESCA( IMB_ ), DESCA( MB_ ) ) * * Handle the first block of column separately * JB = DESCA( INB_ ) - JA + 1 IF( JB.LE.0 ) $ JB = ( (-JB) / DESCA( NB_ ) + 1 ) * DESCA( NB_ ) + JB JB = MIN( JB, N ) JN = JA+JB-1 DO 60 H = 0, JB-1 IB = DESCA( IMB_ ) - IA + 1 IF( IB.LE.0 ) $ IB = ( (-IB) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA+IB-1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 10 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, JA+H, $ DBLE( A( II+K+(JJ+H-1)*LDA ) ), $ DIMAG( A( II+K+(JJ+H-1)*LDA ) ) 10 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA, $ IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, ICURCOL ) DO 20 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, JA+H, DBLE( WORK( K ) ), $ DIMAG( WORK( K ) ) 20 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 50 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 30 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, JA+H, $ DBLE( A( II+K+(JJ+H-1)*LDA ) ), $ DIMAG( A( II+K+(JJ+H-1)*LDA ) ) 30 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 40 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, JA+H, DBLE( WORK( K ) ), $ DIMAG( WORK( K ) ) 40 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 50 CONTINUE * II = IIA ICURROW = IAROW 60 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB IF( .NOT.AISCOLREP ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining column blocks * DO 130 J = JN+1, JA+N-1, DESCA( NB_ ) JB = MIN( DESCA( NB_ ), JA+N-J ) DO 120 H = 0, JB-1 IB = DESCA( IMB_ )-IA+1 IF( IB.LE.0 ) $ IB = ( (-IB) / DESCA( MB_ ) + 1 ) * DESCA( MB_ ) + IB IB = MIN( IB, M ) IN = IA+IB-1 IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 70 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K, J+H, $ DBLE( A( II+K+(JJ+H-1)*LDA ) ), $ DIMAG( A( II+K+(JJ+H-1)*LDA ) ) 70 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 80 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, IA+K-1, J+H, DBLE( WORK( K ) ), $ DIMAG( WORK( K ) ) 80 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) * * Loop over remaining block of rows * DO 110 I = IN+1, IA+M-1, DESCA( MB_ ) IB = MIN( DESCA( MB_ ), IA+M-I ) IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN DO 90 K = 0, IB-1 WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K, J+H, $ DBLE( A( II+K+(JJ+H-1)*LDA ) ), $ DIMAG( A( II+K+(JJ+H-1)*LDA ) ) 90 CONTINUE END IF ELSE IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN CALL ZGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), $ LDA, IRPRNT, ICPRNT ) ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN CALL ZGERV2D( ICTXT, IB, 1, WORK, LDW, ICURROW, $ ICURCOL ) DO 100 K = 1, IB WRITE( NOUT, FMT = 9999 ) $ CMATNM, I+K-1, J+H, DBLE( WORK( K ) ), $ DIMAG( WORK( K ) ) 100 CONTINUE END IF END IF IF( MYROW.EQ.ICURROW ) $ II = II + IB IF( .NOT.AISROWREP ) $ ICURROW = MOD( ICURROW+1, NPROW ) CALL BLACS_BARRIER( ICTXT, 'All' ) 110 CONTINUE * II = IIA ICURROW = IAROW 120 CONTINUE * IF( MYCOL.EQ.ICURCOL ) $ JJ = JJ + JB IF( .NOT.AISCOLREP ) $ ICURCOL = MOD( ICURCOL+1, NPCOL ) CALL BLACS_BARRIER( ICTXT, 'All' ) * 130 CONTINUE * 9999 FORMAT( 1X, A, '(', I6, ',', I6, ')=', D30.18, '+i*(', $ D30.18, ')' ) * RETURN * * End of PB_PZLAPRN2 * END SUBROUTINE PB_ZFILLPAD( ICTXT, M, N, A, LDA, IPRE, IPOST, CHKVAL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N COMPLEX*16 CHKVAL * .. * .. Array Arguments .. COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PB_ZFILLPAD surrounds a two dimensional local array with a guard-zone * initialized to the value CHKVAL. The user may later call the routine * PB_ZCHEKPAD to discover if the guardzone has been violated. There are * three guardzones. The first is a buffer of size IPRE that is before * the start of the array. The second is the buffer of size IPOST which * is after the end of the array to be padded. Finally, there is a guard * zone inside every column of the array to be padded, in the elements * of A(M+1:LDA, J). * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * M (local input) INTEGER * On entry, M specifies the number of rows in the local array * A. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns in the local ar- * ray A. N must be at least zero. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). On exit, this * array is the padded array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array to be padded. LDA must be at least MAX( 1, M ). * * IPRE (local input) INTEGER * On entry, IPRE specifies the size of the guard zone to put * before the start of the padded array. * * IPOST (local input) INTEGER * On entry, IPOST specifies the size of the guard zone to put * after the end of the padded array. * * CHKVAL (local input) COMPLEX*16 * On entry, CHKVAL specifies the value to pad the array with. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE A( I ) = CHKVAL 10 CONTINUE ELSE WRITE( *, FMT = '(A)' ) $ 'WARNING no pre-guardzone in PB_ZFILLPAD' END IF * * Put check buffer in back of A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 A( I ) = CHKVAL 20 CONTINUE ELSE WRITE( *, FMT = '(A)' ) $ 'WARNING no post-guardzone in PB_ZFILLPAD' END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + ( LDA - M ) - 1 A( I ) = CHKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * RETURN * * End of PB_ZFILLPAD * END SUBROUTINE PB_ZCHEKPAD( ICTXT, MESS, M, N, A, LDA, IPRE, IPOST, $ CHKVAL ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, IPOST, IPRE, LDA, M, N COMPLEX*16 CHKVAL * .. * .. Array Arguments .. CHARACTER*(*) MESS COMPLEX*16 A( * ) * .. * * Purpose * ======= * * PB_ZCHEKPAD checks that the padding around a local array has not been * overwritten since the call to PB_ZFILLPAD. Three types of errors are * reported: * * 1) Overwrite in pre-guardzone. This indicates a memory overwrite has * occurred in the first IPRE elements which form a buffer before the * beginning of A. Therefore, the error message: * 'Overwrite in pre-guardzone: loc( 5) = 18.00000' * tells that the 5th element of the IPRE long buffer has been overwrit- * ten with the value 18, where it should still have the value CHKVAL. * * 2) Overwrite in post-guardzone. This indicates a memory overwrite has * occurred in the last IPOST elements which form a buffer after the end * of A. Error reports are refered from the end of A. Therefore, * 'Overwrite in post-guardzone: loc( 19) = 24.00000' * tells that the 19th element after the end of A was overwritten with * the value 24, where it should still have the value of CHKVAL. * * 3) Overwrite in lda-m gap. Tells you elements between M and LDA were * overwritten. So, * 'Overwrite in lda-m gap: A( 12, 3) = 22.00000' * tells that the element at the 12th row and 3rd column of A was over- * written with the value of 22, where it should still have the value of * CHKVAL. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * MESS (local input) CHARACTER*(*) * On entry, MESS is a ttring containing a user-defined message. * * M (local input) INTEGER * On entry, M specifies the number of rows in the local array * A. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns in the local ar- * ray A. N must be at least zero. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the local * array to be padded. LDA must be at least MAX( 1, M ). * * IPRE (local input) INTEGER * On entry, IPRE specifies the size of the guard zone to put * before the start of the padded array. * * IPOST (local input) INTEGER * On entry, IPOST specifies the size of the guard zone to put * after the end of the padded array. * * CHKVAL (local input) COMPLEX*16 * On entry, CHKVAL specifies the value to pad the array with. * * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. CHARACTER*1 TOP INTEGER I, IAM, IDUMM, INFO, J, K, MYCOL, MYROW, NPCOL, $ NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, PB_TOPGET * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DIMAG * .. * .. Executable Statements .. * * Get grid parameters * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) IAM = MYROW*NPCOL + MYCOL INFO = -1 * * Check buffer in front of A * IF( IPRE.GT.0 ) THEN DO 10 I = 1, IPRE IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, ' pre', I, $ DBLE( A( I ) ), DIMAG( A( I ) ) INFO = IAM END IF 10 CONTINUE ELSE WRITE( *, FMT = * ) 'WARNING no pre-guardzone in PB_ZCHEKPAD' END IF * * Check buffer after A * IF( IPOST.GT.0 ) THEN J = IPRE+LDA*N+1 DO 20 I = J, J+IPOST-1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9998 ) MYROW, MYCOL, MESS, 'post', $ I-J+1, DBLE( A( I ) ), $ DIMAG( A( I ) ) INFO = IAM END IF 20 CONTINUE ELSE WRITE( *, FMT = * ) $ 'WARNING no post-guardzone buffer in PB_ZCHEKPAD' END IF * * Check all (LDA-M) gaps * IF( LDA.GT.M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K + (LDA-M) - 1 IF( A( I ).NE.CHKVAL ) THEN WRITE( *, FMT = 9997 ) MYROW, MYCOL, MESS, $ I-IPRE-LDA*(J-1), J, DBLE( A( I ) ), $ DIMAG( A( I ) ) INFO = IAM END IF 30 CONTINUE K = K + LDA 40 CONTINUE END IF * CALL PB_TOPGET( ICTXT, 'Combine', 'All', TOP ) CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, IDUMM, IDUMM, -1, $ 0, 0 ) IF( IAM.EQ.0 .AND. INFO.GE.0 ) THEN WRITE( *, FMT = 9999 ) INFO / NPCOL, MOD( INFO, NPCOL ), MESS END IF * 9999 FORMAT( '{', I5, ',', I5, '}: Memory overwrite in ', A ) 9998 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ A4, '-guardzone: loc(', I3, ') = ', G20.7, '+ i*', $ G20.7 ) 9997 FORMAT( '{', I5, ',', I5, '}: ', A, ' memory overwrite in ', $ 'lda-m gap: loc(', I3, ',', I3, ') = ', G20.7, $ '+ i*', G20.7 ) * RETURN * * End of PB_ZCHEKPAD * END SUBROUTINE PB_ZLASET( UPLO, M, N, IOFFD, ALPHA, BETA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PB_ZLASET initializes a two-dimensional array A to beta on the diago- * nal specified by IOFFD and alpha on the offdiagonals. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be set as follows: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of A is not changed, * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of A is not changed, * = 'D' or 'd' Only the diagonal of A is set, * Otherwise: All of the array A is set. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the value to which the offdiagonal * array elements are set to. * * BETA (input) COMPLEX*16 * On entry, BETA specifies the value to which the diagonal ar- * ray elements are set to. * * A (input/output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be set, and the strictly lower trape- * zoidal part of A is not referenced; When IUPLO = 'L' or 'l', * the leading m by n part of the array A must contain the * lower trapezoidal part of the matrix as specified by IOFFD to * be set, and the strictly upper trapezoidal part of A is * not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Set the diagonal to BETA and the strictly lower triangular * part of the array to ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = BETA DO 30 I = JTMP + 1, M A( I, J ) = ALPHA 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Set the diagonal to BETA and the strictly upper triangular * part of the array to ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 50 I = 1, JTMP - 1 A( I, J ) = ALPHA 50 CONTINUE A( JTMP, J ) = BETA 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Set the array to BETA on the diagonal. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 90 CONTINUE * ELSE * * Set the array to BETA on the diagonal and ALPHA on the * offdiagonal. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA 100 CONTINUE 110 CONTINUE IF( ALPHA.NE.BETA .AND. IOFFD.LT.M .AND. IOFFD.GT.-N ) THEN DO 120 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 120 CONTINUE END IF * END IF * RETURN * * End of PB_ZLASET * END SUBROUTINE PB_ZLASCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PB_ZLASCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * A (input/output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA * A( I, J ) 10 CONTINUE 20 CONTINUE DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = ALPHA * A( I, J ) 50 CONTINUE 60 CONTINUE DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = ALPHA * A( I, J ) 100 CONTINUE 110 CONTINUE * END IF * RETURN * * End of PB_ZLASCAL * END SUBROUTINE PB_ZLAGEN( UPLO, AFORM, A, LDA, LCMT00, IRAN, MBLKS, $ IMBLOC, MB, LMBLOC, NBLKS, INBLOC, NB, $ LNBLOC, JMP, IMULADD ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO, AFORM INTEGER IMBLOC, INBLOC, LCMT00, LDA, LMBLOC, LNBLOC, $ MB, MBLKS, NB, NBLKS * .. * .. Array Arguments .. INTEGER IMULADD( 4, * ), IRAN( * ), JMP( * ) COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PB_ZLAGEN locally initializes an array A. * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the lower (UPLO='L') trape- * zoidal part or the upper (UPLO='U') trapezoidal part is to be * generated when the matrix to be generated is symmetric or * Hermitian. For all the other values of AFORM, the value of * this input argument is ignored. * * AFORM (global input) CHARACTER*1 * On entry, AFORM specifies the type of submatrix to be genera- * ted as follows: * AFORM = 'S', sub( A ) is a symmetric matrix, * AFORM = 'H', sub( A ) is a Hermitian matrix, * AFORM = 'T', sub( A ) is overrwritten with the transpose * of what would normally be generated, * AFORM = 'C', sub( A ) is overwritten with the conjugate * transpose of what would normally be genera- * ted. * AFORM = 'N', a random submatrix is generated. * * A (local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, *). On exit, * this array contains the local entries of the randomly genera- * ted submatrix sub( A ). * * LDA (local input) INTEGER * On entry, LDA specifies the local leading dimension of the * array A. LDA must be at least one. * * LCMT00 (global input) INTEGER * On entry, LCMT00 is the LCM value specifying the off-diagonal * of the underlying matrix of interest. LCMT00=0 specifies the * main diagonal, LCMT00 > 0 specifies a subdiagonal, LCMT00 < 0 * specifies superdiagonals. * * IRAN (local input) INTEGER array * On entry, IRAN is an array of dimension 2 containing respec- * tively the 16-lower and 16-higher bits of the encoding of the * entry of the random sequence corresponding locally to the * first local array entry to generate. Usually, this array is * computed by PB_SETLOCRAN. * * MBLKS (local input) INTEGER * On entry, MBLKS specifies the local number of blocks of rows. * MBLKS is at least zero. * * IMBLOC (local input) INTEGER * On entry, IMBLOC specifies the number of rows (size) of the * local uppest blocks. IMBLOC is at least zero. * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * LMBLOC (local input) INTEGER * On entry, LMBLOC specifies the number of rows (size) of the * local lowest blocks. LMBLOC is at least zero. * * NBLKS (local input) INTEGER * On entry, NBLKS specifies the local number of blocks of co- * lumns. NBLKS is at least zero. * * INBLOC (local input) INTEGER * On entry, INBLOC specifies the number of columns (size) of * the local leftmost blocks. INBLOC is at least zero. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * LNBLOC (local input) INTEGER * On entry, LNBLOC specifies the number of columns (size) of * the local rightmost blocks. LNBLOC is at least zero. * * JMP (local input) INTEGER array * On entry, JMP is an array of dimension JMP_LEN containing the * different jump values used by the random matrix generator. * * IMULADD (local input) INTEGER array * On entry, IMULADD is an array of dimension (4, JMP_LEN). The * jth column of this array contains the encoded initial cons- * tants a_j and c_j to jump from X( n ) to X( n + JMP( j ) ) * (= a_j * X( n ) + c_j) in the random sequence. IMULADD(1:2,j) * contains respectively the 16-lower and 16-higher bits of the * constant a_j, and IMULADD(3:4,j) contains the 16-lower and * 16-higher bits of the constant c_j. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER JMP_1, JMP_COL, JMP_IMBV, JMP_INBV, JMP_LEN, $ JMP_MB, JMP_NB, JMP_NPIMBLOC, JMP_NPMB, $ JMP_NQINBLOC, JMP_NQNB, JMP_ROW PARAMETER ( JMP_1 = 1, JMP_ROW = 2, JMP_COL = 3, $ JMP_MB = 4, JMP_IMBV = 5, JMP_NPMB = 6, $ JMP_NPIMBLOC = 7, JMP_NB = 8, JMP_INBV = 9, $ JMP_NQNB = 10, JMP_NQINBLOC = 11, $ JMP_LEN = 11 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IB, IBLK, II, IK, ITMP, JB, JBLK, JJ, JK, $ JTMP, LCMTC, LCMTR, LOW, MNB, UPP COMPLEX*16 DUMMY * .. * .. Local Arrays .. INTEGER IB0( 2 ), IB1( 2 ), IB2( 2 ), IB3( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_JUMPIT * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PB_DRAND EXTERNAL LSAME, PB_DRAND * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN * .. * .. Executable Statements .. * DO 10 I = 1, 2 IB1( I ) = IRAN( I ) IB2( I ) = IRAN( I ) IB3( I ) = IRAN( I ) 10 CONTINUE * IF( LSAME( AFORM, 'N' ) ) THEN * * Generate random matrix * JJ = 1 * DO 50 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * DO 40 JK = JJ, JJ + JB - 1 * II = 1 * DO 30 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * * Blocks are IB by JB * DO 20 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 20 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 30 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 40 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 50 CONTINUE * ELSE IF( LSAME( AFORM, 'T' ) ) THEN * * Generate the transpose of the matrix that would be normally * generated. * II = 1 * DO 90 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 80 IK = II, II + IB - 1 * JJ = 1 * DO 70 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 60 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 60 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 70 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 80 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 90 CONTINUE * ELSE IF( LSAME( AFORM, 'S' ) ) THEN * * Generate a symmetric matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 170 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 160 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 150 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 100 IK = II, II + IB - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 100 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 110 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 110 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 120 IK = II, ITMP - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 120 CONTINUE * DO 130 IK = ITMP, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 130 CONTINUE * END IF * ELSE * DO 140 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 140 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 150 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 160 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 170 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 250 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 240 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 230 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 180 JK = JJ, JJ + JB - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 180 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 190 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 190 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 200 JK = JJ, JTMP - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 200 CONTINUE * DO 210 JK = JTMP, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 210 CONTINUE * END IF * ELSE * DO 220 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 220 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 230 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 240 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 250 CONTINUE * END IF * ELSE IF( LSAME( AFORM, 'C' ) ) THEN * * Generate the conjugate transpose of the matrix that would be * normally generated. * II = 1 * DO 290 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC ELSE IB = MB END IF * DO 280 IK = II, II + IB - 1 * JJ = 1 * DO 270 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC ELSE JB = NB END IF * * Blocks are IB by JB * DO 260 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 260 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 270 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 280 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 290 CONTINUE * ELSE IF( LSAME( AFORM, 'H' ) ) THEN * * Generate a Hermitian matrix * IF( LSAME( UPLO, 'L' ) ) THEN * * generate lower trapezoidal part * JJ = 1 LCMTC = LCMT00 * DO 370 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * DO 360 JK = JJ, JJ + JB - 1 * II = 1 LCMTR = LCMTC * DO 350 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * * Blocks are IB by JB * IF( LCMTR.GT.UPP ) THEN * DO 300 IK = II, II + IB - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 300 CONTINUE * ELSE IF( LCMTR.GE.LOW ) THEN * JTMP = JK - JJ + 1 MNB = MAX( 0, -LCMTR ) * IF( JTMP.LE.MIN( MNB, JB ) ) THEN * DO 310 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 310 CONTINUE * ELSE IF( ( JTMP.GE.( MNB + 1 ) ) .AND. $ ( JTMP.LE.MIN( IB-LCMTR, JB ) ) ) THEN * ITMP = II + JTMP + LCMTR - 1 * DO 320 IK = II, ITMP - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 320 CONTINUE * IF( ITMP.LE.( II + IB - 1 ) ) THEN DUMMY = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) A( ITMP, JK ) = DCMPLX( DBLE( DUMMY ), $ ZERO ) END IF * DO 330 IK = ITMP + 1, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 330 CONTINUE * END IF * ELSE * DO 340 IK = II, II + IB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ PB_DRAND( 0 ) ) 340 CONTINUE * END IF * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 350 CONTINUE * * Jump one column * CALL PB_JUMPIT( IMULADD( 1, JMP_COL ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 360 CONTINUE * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB3, IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 370 CONTINUE * ELSE * * generate upper trapezoidal part * II = 1 LCMTR = LCMT00 * DO 450 IBLK = 1, MBLKS * IF( IBLK.EQ.1 ) THEN IB = IMBLOC UPP = IMBLOC - 1 ELSE IF( IBLK.EQ.MBLKS ) THEN IB = LMBLOC UPP = MB - 1 ELSE IB = MB UPP = MB - 1 END IF * DO 440 IK = II, II + IB - 1 * JJ = 1 LCMTC = LCMTR * DO 430 JBLK = 1, NBLKS * IF( JBLK.EQ.1 ) THEN JB = INBLOC LOW = 1 - INBLOC ELSE IF( JBLK.EQ.NBLKS ) THEN JB = LNBLOC LOW = 1 - NB ELSE JB = NB LOW = 1 - NB END IF * * Blocks are IB by JB * IF( LCMTC.LT.LOW ) THEN * DO 380 JK = JJ, JJ + JB - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 380 CONTINUE * ELSE IF( LCMTC.LE.UPP ) THEN * ITMP = IK - II + 1 MNB = MAX( 0, LCMTC ) * IF( ITMP.LE.MIN( MNB, IB ) ) THEN * DO 390 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 390 CONTINUE * ELSE IF( ( ITMP.GE.( MNB + 1 ) ) .AND. $ ( ITMP.LE.MIN( JB+LCMTC, IB ) ) ) THEN * JTMP = JJ + ITMP - LCMTC - 1 * DO 400 JK = JJ, JTMP - 1 DUMMY = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 400 CONTINUE * IF( JTMP.LE.( JJ + JB - 1 ) ) THEN DUMMY = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) A( IK, JTMP ) = DCMPLX( DBLE( DUMMY ), $ ZERO ) END IF * DO 410 JK = JTMP + 1, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 410 CONTINUE * END IF * ELSE * DO 420 JK = JJ, JJ + JB - 1 A( IK, JK ) = DCMPLX( PB_DRAND( 0 ), $ -PB_DRAND( 0 ) ) 420 CONTINUE * END IF * JJ = JJ + JB * IF( JBLK.EQ.1 ) THEN * * Jump INBLOC + ( NPCOL - 1 ) * NB columns * LCMTC = LCMTC + JMP( JMP_NQINBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQINBLOC ), IB1, $ IB0 ) * ELSE * * Jump NPCOL * NB columns * LCMTC = LCMTC + JMP( JMP_NQNB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NQNB ), IB1, $ IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) * 430 CONTINUE * * Jump one row * CALL PB_JUMPIT( IMULADD( 1, JMP_ROW ), IB2, IB0 ) * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) * 440 CONTINUE * II = II + IB * IF( IBLK.EQ.1 ) THEN * * Jump IMBLOC + ( NPROW - 1 ) * MB rows * LCMTR = LCMTR - JMP( JMP_NPIMBLOC ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPIMBLOC ), IB3, IB0 ) * ELSE * * Jump NPROW * MB rows * LCMTR = LCMTR - JMP( JMP_NPMB ) CALL PB_JUMPIT( IMULADD( 1, JMP_NPMB ), IB3, IB0 ) * END IF * IB1( 1 ) = IB0( 1 ) IB1( 2 ) = IB0( 2 ) IB2( 1 ) = IB0( 1 ) IB2( 2 ) = IB0( 2 ) IB3( 1 ) = IB0( 1 ) IB3( 2 ) = IB0( 2 ) * 450 CONTINUE * END IF * END IF * RETURN * * End of PB_ZLAGEN * END DOUBLE PRECISION FUNCTION PB_DRAND( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_DRAND generates the next number in the random sequence. This func- * tion ensures that this number will be in the interval ( -1.0, 1.0 ). * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0 ) * .. * .. External Functions .. DOUBLE PRECISION PB_DRAN EXTERNAL PB_DRAN * .. * .. Executable Statements .. * PB_DRAND = ONE - TWO * PB_DRAN( IDUMM ) * RETURN * * End of PB_DRAND * END DOUBLE PRECISION FUNCTION PB_DRAN( IDUMM ) * * -- PBLAS test routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER IDUMM * .. * * Purpose * ======= * * PB_DRAN generates the next number in the random sequence. * * Arguments * ========= * * IDUMM (local input) INTEGER * This argument is ignored, but necessary to a FORTRAN 77 func- * tion. * * Further Details * =============== * * On entry, the array IRAND stored in the common block RANCOM contains * the information (2 integers) required to generate the next number in * the sequence X( n ). This number is computed as * * X( n ) = ( 2^16 * IRAND( 2 ) + IRAND( 1 ) ) / d, * * where the constant d is the largest 32 bit positive integer. The * array IRAND is then updated for the generation of the next number * X( n+1 ) in the random sequence as follows X( n+1 ) = a * X( n ) + c. * The constants a and c should have been preliminarily stored in the * array IACS as 2 pairs of integers. The initial set up of IRAND and * IACS is performed by the routine PB_SETRAN. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION DIVFAC, POW16 PARAMETER ( DIVFAC = 2.147483648D+9, $ POW16 = 6.5536D+4 ) * .. * .. Local Arrays .. INTEGER J( 2 ) * .. * .. External Subroutines .. EXTERNAL PB_LADD, PB_LMUL * .. * .. Intrinsic Functions .. INTRINSIC DBLE * .. * .. Common Blocks .. INTEGER IACS( 4 ), IRAND( 2 ) COMMON /RANCOM/ IRAND, IACS * .. * .. Save Statements .. SAVE /RANCOM/ * .. * .. Executable Statements .. * PB_DRAN = ( DBLE( IRAND( 1 ) ) + POW16 * DBLE( IRAND( 2 ) ) ) / $ DIVFAC * CALL PB_LMUL( IRAND, IACS, J ) CALL PB_LADD( J, IACS( 3 ), IRAND ) * RETURN * * End of PB_DRAN * END scalapack-2.0.2/PBLAS/TESTING/slamch.f000644 000766 000024 00000060374 10363532303 017315 0ustar00juliestaff000000 000000 REAL FUNCTION SLAMCH( CMACH ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. CHARACTER CMACH * .. * * Purpose * ======= * * SLAMCH determines single precision machine parameters. * * Arguments * ========= * * CMACH (input) CHARACTER*1 * Specifies the value to be returned by SLAMCH: * = 'E' or 'e', SLAMCH := eps * = 'S' or 's , SLAMCH := sfmin * = 'B' or 'b', SLAMCH := base * = 'P' or 'p', SLAMCH := eps*base * = 'N' or 'n', SLAMCH := t * = 'R' or 'r', SLAMCH := rnd * = 'M' or 'm', SLAMCH := emin * = 'U' or 'u', SLAMCH := rmin * = 'L' or 'l', SLAMCH := emax * = 'O' or 'o', SLAMCH := rmax * * where * * eps = relative machine precision * sfmin = safe minimum, such that 1/sfmin does not overflow * base = base of the machine * prec = eps*base * t = number of (base) digits in the mantissa * rnd = 1.0 when rounding occurs in addition, 0.0 otherwise * emin = minimum exponent before (gradual) underflow * rmin = underflow threshold - base**(emin-1) * emax = largest exponent before overflow * rmax = overflow threshold - (base**emax)*(1-eps) * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL FIRST, LRND INTEGER BETA, IMAX, IMIN, IT REAL BASE, EMAX, EMIN, EPS, PREC, RMACH, RMAX, RMIN, $ RND, SFMIN, SMALL, T * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SLAMC2 * .. * .. Save statement .. SAVE FIRST, EPS, SFMIN, BASE, T, RND, EMIN, RMIN, $ EMAX, RMAX, PREC * .. * .. Data statements .. DATA FIRST / .TRUE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. CALL SLAMC2( BETA, IT, LRND, EPS, IMIN, RMIN, IMAX, RMAX ) BASE = BETA T = IT IF( LRND ) THEN RND = ONE EPS = ( BASE**( 1-IT ) ) / 2 ELSE RND = ZERO EPS = BASE**( 1-IT ) END IF PREC = EPS*BASE EMIN = IMIN EMAX = IMAX SFMIN = RMIN SMALL = ONE / RMAX IF( SMALL.GE.SFMIN ) THEN * * Use SMALL plus a bit, to avoid the possibility of rounding * causing overflow when computing 1/sfmin. * SFMIN = SMALL*( ONE+EPS ) END IF END IF * IF( LSAME( CMACH, 'E' ) ) THEN RMACH = EPS ELSE IF( LSAME( CMACH, 'S' ) ) THEN RMACH = SFMIN ELSE IF( LSAME( CMACH, 'B' ) ) THEN RMACH = BASE ELSE IF( LSAME( CMACH, 'P' ) ) THEN RMACH = PREC ELSE IF( LSAME( CMACH, 'N' ) ) THEN RMACH = T ELSE IF( LSAME( CMACH, 'R' ) ) THEN RMACH = RND ELSE IF( LSAME( CMACH, 'M' ) ) THEN RMACH = EMIN ELSE IF( LSAME( CMACH, 'U' ) ) THEN RMACH = RMIN ELSE IF( LSAME( CMACH, 'L' ) ) THEN RMACH = EMAX ELSE IF( LSAME( CMACH, 'O' ) ) THEN RMACH = RMAX END IF * SLAMCH = RMACH RETURN * * End of SLAMCH * END * ************************************************************************ * SUBROUTINE SLAMC1( BETA, T, RND, IEEE1 ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL IEEE1, RND INTEGER BETA, T * .. * * Purpose * ======= * * SLAMC1 determines the machine parameters given by BETA, T, RND, and * IEEE1. * * Arguments * ========= * * BETA (output) INTEGER * The base of the machine. * * T (output) INTEGER * The number of ( BETA ) digits in the mantissa. * * RND (output) LOGICAL * Specifies whether proper rounding ( RND = .TRUE. ) or * chopping ( RND = .FALSE. ) occurs in addition. This may not * be a reliable guide to the way in which the machine performs * its arithmetic. * * IEEE1 (output) LOGICAL * Specifies whether rounding appears to be done in the IEEE * 'round to nearest' style. * * Further Details * =============== * * The routine is based on the routine ENVRON by Malcolm and * incorporates suggestions by Gentleman and Marovich. See * * Malcolm M. A. (1972) Algorithms to reveal properties of * floating-point arithmetic. Comms. of the ACM, 15, 949-951. * * Gentleman W. M. and Marovich S. B. (1974) More on algorithms * that reveal properties of floating point arithmetic units. * Comms. of the ACM, 17, 276-277. * * ===================================================================== * * .. Local Scalars .. LOGICAL FIRST, LIEEE1, LRND INTEGER LBETA, LT REAL A, B, C, F, ONE, QTR, SAVEC, T1, T2 * .. * .. External Functions .. REAL SLAMC3 EXTERNAL SLAMC3 * .. * .. Save statement .. SAVE FIRST, LIEEE1, LBETA, LRND, LT * .. * .. Data statements .. DATA FIRST / .TRUE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. ONE = 1 * * LBETA, LIEEE1, LT and LRND are the local values of BETA, * IEEE1, T and RND. * * Throughout this routine we use the function SLAMC3 to ensure * that relevant values are stored and not held in registers, or * are not affected by optimizers. * * Compute a = 2.0**m with the smallest positive integer m such * that * * fl( a + 1.0 ) = a. * A = 1 C = 1 * *+ WHILE( C.EQ.ONE )LOOP 10 CONTINUE IF( C.EQ.ONE ) THEN A = 2*A C = SLAMC3( A, ONE ) C = SLAMC3( C, -A ) GO TO 10 END IF *+ END WHILE * * Now compute b = 2.0**m with the smallest positive integer m * such that * * fl( a + b ) .gt. a. * B = 1 C = SLAMC3( A, B ) * *+ WHILE( C.EQ.A )LOOP 20 CONTINUE IF( C.EQ.A ) THEN B = 2*B C = SLAMC3( A, B ) GO TO 20 END IF *+ END WHILE * * Now compute the base. a and c are neighbouring floating point * numbers in the interval ( beta**t, beta**( t + 1 ) ) and so * their difference is beta. Adding 0.25 to c is to ensure that it * is truncated to beta and not ( beta - 1 ). * QTR = ONE / 4 SAVEC = C C = SLAMC3( C, -A ) LBETA = C + QTR * * Now determine whether rounding or chopping occurs, by adding a * bit less than beta/2 and a bit more than beta/2 to a. * B = LBETA F = SLAMC3( B / 2, -B / 100 ) C = SLAMC3( F, A ) IF( C.EQ.A ) THEN LRND = .TRUE. ELSE LRND = .FALSE. END IF F = SLAMC3( B / 2, B / 100 ) C = SLAMC3( F, A ) IF( ( LRND ) .AND. ( C.EQ.A ) ) $ LRND = .FALSE. * * Try and decide whether rounding is done in the IEEE 'round to * nearest' style. B/2 is half a unit in the last place of the two * numbers A and SAVEC. Furthermore, A is even, i.e. has last bit * zero, and SAVEC is odd. Thus adding B/2 to A should not change * A, but adding B/2 to SAVEC should change SAVEC. * T1 = SLAMC3( B / 2, A ) T2 = SLAMC3( B / 2, SAVEC ) LIEEE1 = ( T1.EQ.A ) .AND. ( T2.GT.SAVEC ) .AND. LRND * * Now find the mantissa, t. It should be the integer part of * log to the base beta of a, however it is safer to determine t * by powering. So we find t as the smallest positive integer for * which * * fl( beta**t + 1.0 ) = 1.0. * LT = 0 A = 1 C = 1 * *+ WHILE( C.EQ.ONE )LOOP 30 CONTINUE IF( C.EQ.ONE ) THEN LT = LT + 1 A = A*LBETA C = SLAMC3( A, ONE ) C = SLAMC3( C, -A ) GO TO 30 END IF *+ END WHILE * END IF * BETA = LBETA T = LT RND = LRND IEEE1 = LIEEE1 RETURN * * End of SLAMC1 * END * ************************************************************************ * SUBROUTINE SLAMC2( BETA, T, RND, EPS, EMIN, RMIN, EMAX, RMAX ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL RND INTEGER BETA, EMAX, EMIN, T REAL EPS, RMAX, RMIN * .. * * Purpose * ======= * * SLAMC2 determines the machine parameters specified in its argument * list. * * Arguments * ========= * * BETA (output) INTEGER * The base of the machine. * * T (output) INTEGER * The number of ( BETA ) digits in the mantissa. * * RND (output) LOGICAL * Specifies whether proper rounding ( RND = .TRUE. ) or * chopping ( RND = .FALSE. ) occurs in addition. This may not * be a reliable guide to the way in which the machine performs * its arithmetic. * * EPS (output) REAL * The smallest positive number such that * * fl( 1.0 - EPS ) .LT. 1.0, * * where fl denotes the computed value. * * EMIN (output) INTEGER * The minimum exponent before (gradual) underflow occurs. * * RMIN (output) REAL * The smallest normalized number for the machine, given by * BASE**( EMIN - 1 ), where BASE is the floating point value * of BETA. * * EMAX (output) INTEGER * The maximum exponent before overflow occurs. * * RMAX (output) REAL * The largest positive number for the machine, given by * BASE**EMAX * ( 1 - EPS ), where BASE is the floating point * value of BETA. * * Further Details * =============== * * The computation of EPS is based on a routine PARANOIA by * W. Kahan of the University of California at Berkeley. * * ===================================================================== * * .. Local Scalars .. LOGICAL FIRST, IEEE, IWARN, LIEEE1, LRND INTEGER GNMIN, GPMIN, I, LBETA, LEMAX, LEMIN, LT, $ NGNMIN, NGPMIN REAL A, B, C, HALF, LEPS, LRMAX, LRMIN, ONE, RBASE, $ SIXTH, SMALL, THIRD, TWO, ZERO * .. * .. External Functions .. REAL SLAMC3 EXTERNAL SLAMC3 * .. * .. External Subroutines .. EXTERNAL SLAMC1, SLAMC4, SLAMC5 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Save statement .. SAVE FIRST, IWARN, LBETA, LEMAX, LEMIN, LEPS, LRMAX, $ LRMIN, LT * .. * .. Data statements .. DATA FIRST / .TRUE. / , IWARN / .FALSE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. ZERO = 0 ONE = 1 TWO = 2 * * LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values of * BETA, T, RND, EPS, EMIN and RMIN. * * Throughout this routine we use the function SLAMC3 to ensure * that relevant values are stored and not held in registers, or * are not affected by optimizers. * * SLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. * CALL SLAMC1( LBETA, LT, LRND, LIEEE1 ) * * Start to find EPS. * B = LBETA A = B**( -LT ) LEPS = A * * Try some tricks to see whether or not this is the correct EPS. * B = TWO / 3 HALF = ONE / 2 SIXTH = SLAMC3( B, -HALF ) THIRD = SLAMC3( SIXTH, SIXTH ) B = SLAMC3( THIRD, -HALF ) B = SLAMC3( B, SIXTH ) B = ABS( B ) IF( B.LT.LEPS ) $ B = LEPS * LEPS = 1 * *+ WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP 10 CONTINUE IF( ( LEPS.GT.B ) .AND. ( B.GT.ZERO ) ) THEN LEPS = B C = SLAMC3( HALF*LEPS, ( TWO**5 )*( LEPS**2 ) ) C = SLAMC3( HALF, -C ) B = SLAMC3( HALF, C ) C = SLAMC3( HALF, -B ) B = SLAMC3( HALF, C ) GO TO 10 END IF *+ END WHILE * IF( A.LT.LEPS ) $ LEPS = A * * Computation of EPS complete. * * Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3)). * Keep dividing A by BETA until (gradual) underflow occurs. This * is detected when we cannot recover the previous A. * RBASE = ONE / LBETA SMALL = ONE DO 20 I = 1, 3 SMALL = SLAMC3( SMALL*RBASE, ZERO ) 20 CONTINUE A = SLAMC3( ONE, SMALL ) CALL SLAMC4( NGPMIN, ONE, LBETA ) CALL SLAMC4( NGNMIN, -ONE, LBETA ) CALL SLAMC4( GPMIN, A, LBETA ) CALL SLAMC4( GNMIN, -A, LBETA ) IEEE = .FALSE. * IF( ( NGPMIN.EQ.NGNMIN ) .AND. ( GPMIN.EQ.GNMIN ) ) THEN IF( NGPMIN.EQ.GPMIN ) THEN LEMIN = NGPMIN * ( Non twos-complement machines, no gradual underflow; * e.g., VAX ) ELSE IF( ( GPMIN-NGPMIN ).EQ.3 ) THEN LEMIN = NGPMIN - 1 + LT IEEE = .TRUE. * ( Non twos-complement machines, with gradual underflow; * e.g., IEEE standard followers ) ELSE LEMIN = MIN( NGPMIN, GPMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE IF( ( NGPMIN.EQ.GPMIN ) .AND. ( NGNMIN.EQ.GNMIN ) ) THEN IF( ABS( NGPMIN-NGNMIN ).EQ.1 ) THEN LEMIN = MAX( NGPMIN, NGNMIN ) * ( Twos-complement machines, no gradual underflow; * e.g., CYBER 205 ) ELSE LEMIN = MIN( NGPMIN, NGNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE IF( ( ABS( NGPMIN-NGNMIN ).EQ.1 ) .AND. $ ( GPMIN.EQ.GNMIN ) ) THEN IF( ( GPMIN-MIN( NGPMIN, NGNMIN ) ).EQ.3 ) THEN LEMIN = MAX( NGPMIN, NGNMIN ) - 1 + LT * ( Twos-complement machines with gradual underflow; * no known machine ) ELSE LEMIN = MIN( NGPMIN, NGNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE LEMIN = MIN( NGPMIN, NGNMIN, GPMIN, GNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF *** * Comment out this if block if EMIN is ok IF( IWARN ) THEN FIRST = .TRUE. WRITE( 6, FMT = 9999 )LEMIN END IF *** * * Assume IEEE arithmetic if we found denormalised numbers above, * or if arithmetic seems to round in the IEEE style, determined * in routine SLAMC1. A true IEEE machine should have both things * true; however, faulty machines may have one or the other. * IEEE = IEEE .OR. LIEEE1 * * Compute RMIN by successive division by BETA. We could compute * RMIN as BASE**( EMIN - 1 ), but some machines underflow during * this computation. * LRMIN = 1 DO 30 I = 1, 1 - LEMIN LRMIN = SLAMC3( LRMIN*RBASE, ZERO ) 30 CONTINUE * * Finally, call SLAMC5 to compute EMAX and RMAX. * CALL SLAMC5( LBETA, LT, LEMIN, IEEE, LEMAX, LRMAX ) END IF * BETA = LBETA T = LT RND = LRND EPS = LEPS EMIN = LEMIN RMIN = LRMIN EMAX = LEMAX RMAX = LRMAX * RETURN * 9999 FORMAT( / / ' WARNING. The value EMIN may be incorrect:-', $ ' EMIN = ', I8, / $ ' If, after inspection, the value EMIN looks', $ ' acceptable please comment out ', $ / ' the IF block as marked within the code of routine', $ ' SLAMC2,', / ' otherwise supply EMIN explicitly.', / ) * * End of SLAMC2 * END * ************************************************************************ * REAL FUNCTION SLAMC3( A, B ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. REAL A, B * .. * * Purpose * ======= * * SLAMC3 is intended to force A and B to be stored prior to doing * the addition of A and B , for use in situations where optimizers * might hold one of these in a register. * * Arguments * ========= * * A, B (input) REAL * The values A and B. * * ===================================================================== * * .. Executable Statements .. * SLAMC3 = A + B * RETURN * * End of SLAMC3 * END * ************************************************************************ * SUBROUTINE SLAMC4( EMIN, START, BASE ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. INTEGER BASE, EMIN REAL START * .. * * Purpose * ======= * * SLAMC4 is a service routine for SLAMC2. * * Arguments * ========= * * EMIN (output) EMIN * The minimum exponent before (gradual) underflow, computed by * setting A = START and dividing by BASE until the previous A * can not be recovered. * * START (input) REAL * The starting point for determining EMIN. * * BASE (input) INTEGER * The base of the machine. * * ===================================================================== * * .. Local Scalars .. INTEGER I REAL A, B1, B2, C1, C2, D1, D2, ONE, RBASE, ZERO * .. * .. External Functions .. REAL SLAMC3 EXTERNAL SLAMC3 * .. * .. Executable Statements .. * A = START ONE = 1 RBASE = ONE / BASE ZERO = 0 EMIN = 1 B1 = SLAMC3( A*RBASE, ZERO ) C1 = A C2 = A D1 = A D2 = A *+ WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. * $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP 10 CONTINUE IF( ( C1.EQ.A ) .AND. ( C2.EQ.A ) .AND. ( D1.EQ.A ) .AND. $ ( D2.EQ.A ) ) THEN EMIN = EMIN - 1 A = B1 B1 = SLAMC3( A / BASE, ZERO ) C1 = SLAMC3( B1*BASE, ZERO ) D1 = ZERO DO 20 I = 1, BASE D1 = D1 + B1 20 CONTINUE B2 = SLAMC3( A*RBASE, ZERO ) C2 = SLAMC3( B2 / RBASE, ZERO ) D2 = ZERO DO 30 I = 1, BASE D2 = D2 + B2 30 CONTINUE GO TO 10 END IF *+ END WHILE * RETURN * * End of SLAMC4 * END * ************************************************************************ * SUBROUTINE SLAMC5( BETA, P, EMIN, IEEE, EMAX, RMAX ) * * -- LAPACK auxiliary routine (version 2.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL IEEE INTEGER BETA, EMAX, EMIN, P REAL RMAX * .. * * Purpose * ======= * * SLAMC5 attempts to compute RMAX, the largest machine floating-point * number, without overflow. It assumes that EMAX + abs(EMIN) sum * approximately to a power of 2. It will fail on machines where this * assumption does not hold, for example, the Cyber 205 (EMIN = -28625, * EMAX = 28718). It will also fail if the value supplied for EMIN is * too large (i.e. too close to zero), probably with overflow. * * Arguments * ========= * * BETA (input) INTEGER * The base of floating-point arithmetic. * * P (input) INTEGER * The number of base BETA digits in the mantissa of a * floating-point value. * * EMIN (input) INTEGER * The minimum exponent before (gradual) underflow. * * IEEE (input) LOGICAL * A logical flag specifying whether or not the arithmetic * system is thought to comply with the IEEE standard. * * EMAX (output) INTEGER * The largest exponent before overflow * * RMAX (output) REAL * The largest machine floating-point number. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) * .. * .. Local Scalars .. INTEGER EXBITS, EXPSUM, I, LEXP, NBITS, TRY, UEXP REAL OLDY, RECBAS, Y, Z * .. * .. External Functions .. REAL SLAMC3 EXTERNAL SLAMC3 * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * First compute LEXP and UEXP, two powers of 2 that bound * abs(EMIN). We then assume that EMAX + abs(EMIN) will sum * approximately to the bound that is closest to abs(EMIN). * (EMAX is the exponent of the required number RMAX). * LEXP = 1 EXBITS = 1 10 CONTINUE TRY = LEXP*2 IF( TRY.LE.( -EMIN ) ) THEN LEXP = TRY EXBITS = EXBITS + 1 GO TO 10 END IF IF( LEXP.EQ.-EMIN ) THEN UEXP = LEXP ELSE UEXP = TRY EXBITS = EXBITS + 1 END IF * * Now -LEXP is less than or equal to EMIN, and -UEXP is greater * than or equal to EMIN. EXBITS is the number of bits needed to * store the exponent. * IF( ( UEXP+EMIN ).GT.( -LEXP-EMIN ) ) THEN EXPSUM = 2*LEXP ELSE EXPSUM = 2*UEXP END IF * * EXPSUM is the exponent range, approximately equal to * EMAX - EMIN + 1 . * EMAX = EXPSUM + EMIN - 1 NBITS = 1 + EXBITS + P * * NBITS is the total number of bits needed to store a * floating-point number. * IF( ( MOD( NBITS, 2 ).EQ.1 ) .AND. ( BETA.EQ.2 ) ) THEN * * Either there are an odd number of bits used to store a * floating-point number, which is unlikely, or some bits are * not used in the representation of numbers, which is possible, * (e.g. Cray machines) or the mantissa has an implicit bit, * (e.g. IEEE machines, Dec Vax machines), which is perhaps the * most likely. We have to assume the last alternative. * If this is true, then we need to reduce EMAX by one because * there must be some way of representing zero in an implicit-bit * system. On machines like Cray, we are reducing EMAX by one * unnecessarily. * EMAX = EMAX - 1 END IF * IF( IEEE ) THEN * * Assume we are on an IEEE machine which reserves one exponent * for infinity and NaN. * EMAX = EMAX - 1 END IF * * Now create RMAX, the largest machine number, which should * be equal to (1.0 - BETA**(-P)) * BETA**EMAX . * * First compute 1.0 - BETA**(-P), being careful that the * result is less than 1.0 . * RECBAS = ONE / BETA Z = BETA - ONE Y = ZERO DO 20 I = 1, P Z = Z*RECBAS IF( Y.LT.ONE ) $ OLDY = Y Y = SLAMC3( Y, Z ) 20 CONTINUE IF( Y.GE.ONE ) $ Y = OLDY * * Now multiply by BETA**EMAX to get RMAX. * DO 30 I = 1, EMAX Y = SLAMC3( Y*BETA, ZERO ) 30 CONTINUE * RMAX = Y RETURN * * End of SLAMC5 * END scalapack-2.0.2/PBLAS/SRC/CMakeLists.txt000644 000766 000024 00000005024 11661300033 017733 0ustar00juliestaff000000 000000 add_subdirectory(PBBLAS) add_subdirectory(PTZBLAS) add_subdirectory(PTOOLS) set (APPBLAS pilaenv.f) #--------------------------------------------------------------------------- # Level 1 PBLAS. #--------------------------------------------------------------------------- set (PIBLAS1 picopy_.c) set (PSBLAS1 psswap_.c psscal_.c pscopy_.c psaxpy_.c psdot_.c psnrm2_.c psasum_.c psamax_.c) set (PCBLAS1 pcswap_.c pcscal_.c pcsscal_.c pccopy_.c pcaxpy_.c pcdotu_.c pcdotc_.c pscnrm2_.c pscasum_.c pcamax_.c) set (PDBLAS1 pdswap_.c pdscal_.c pdcopy_.c pdaxpy_.c pddot_.c pdnrm2_.c pdasum_.c pdamax_.c) set (PZBLAS1 pzswap_.c pzscal_.c pzdscal_.c pzcopy_.c pzaxpy_.c pzdotu_.c pzdotc_.c pdznrm2_.c pdzasum_.c pzamax_.c) #--------------------------------------------------------------------------- # Level 2 PBLAS. #--------------------------------------------------------------------------- set (PSBLAS2 psgemv_.c psger_.c pssymv_.c pssyr_.c pssyr2_.c pstrmv_.c pstrsv_.c psagemv_.c psasymv_.c psatrmv_.c) set (PCBLAS2 pcgemv_.c pcgerc_.c pcgeru_.c pchemv_.c pcher_.c pcher2_.c pctrmv_.c pctrsv_.c pcagemv_.c pcahemv_.c pcatrmv_.c) set (PDBLAS2 pdgemv_.c pdger_.c pdsymv_.c pdsyr_.c pdsyr2_.c pdtrmv_.c pdtrsv_.c pdagemv_.c pdasymv_.c pdatrmv_.c) set (PZBLAS2 pzgemv_.c pzgerc_.c pzgeru_.c pzhemv_.c pzher_.c pzher2_.c pztrmv_.c pztrsv_.c pzagemv_.c pzahemv_.c pzatrmv_.c) #--------------------------------------------------------------------------- # Level 3 PBLAS. #--------------------------------------------------------------------------- set (PSBLAS3 psgeadd_.c psgemm_.c pssymm_.c pssyr2k_.c pssyrk_.c pstradd_.c pstran_.c pstrmm_.c pstrsm_.c) set (PCBLAS3 pcgeadd_.c pcgemm_.c pchemm_.c pcher2k_.c pcherk_.c pcsymm_.c pcsyr2k_.c pcsyrk_.c pctradd_.c pctranc_.c pctranu_.c pctrmm_.c pctrsm_.c) set (PDBLAS3 pdgeadd_.c pdgemm_.c pdsymm_.c pdsyr2k_.c pdsyrk_.c pdtradd_.c pdtran_.c pdtrmm_.c pdtrsm_.c) set (PZBLAS3 pzgeadd_.c pzgemm_.c pzhemm_.c pzher2k_.c pzherk_.c pzsymm_.c pzsyr2k_.c pzsyrk_.c pztradd_.c pztranc_.c pztranu_.c pztrmm_.c pztrsm_.c) #--------------------------------------------------------------------------- set (pblas ${PIBLAS1} ${PSBLAS1} ${PCBLAS1} ${PDBLAS1} ${PZBLAS1} ${PSBLAS2} ${PCBLAS2} ${PDBLAS2} ${PZBLAS2} ${PSBLAS3} ${PCBLAS3} ${PDBLAS3} ${PZBLAS3}) set (pblas-F ${APPBLAS} )scalapack-2.0.2/PBLAS/SRC/Makefile000644 000766 000024 00000014123 11654025546 016652 0ustar00juliestaff000000 000000 ############################################################################ # # Program: PBLAS -- (version 2.0) # # Module: Makefile # # Purpose: SRC Makefile # # Creation date: April 1, 1998 # # Modified: February 15, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../SLmake.inc ############################################################################ # # This is the makefile to create a library for the PBLAS. # The files are grouped as follows: # # PSBLAS1 -- Single precision real PBLAS1 routines # PCBLAS1 -- Single precision complex PBLAS1 routines # PDBLAS1 -- Double precision real PBLAS1 routines # PZBLAS1 -- Double precision complex PBLAS1 routines # # PSBLAS2 -- Single precision real PBLAS2 routines # PCBLAS2 -- Single precision complex PBLAS2 routines # PDBLAS2 -- Double precision real PBLAS2 routines # PZBLAS2 -- Double precision complex PBLAS2 routines # # PSBLAS3 -- Single precision real PBLAS3 routines # PCBLAS3 -- Single precision complex PBLAS3 routines # PDBLAS3 -- Double precision real PBLAS3 routines # PZBLAS3 -- Double precision complex PBLAS3 routines # # The library can be set up to include routines for any combination # of the four precisions. First, modify the ../../SLmake.inc file # definitions to match your compiler and the options to be used. # Then to create or add to the library, enter make followed by one or # more of the precisions desired. Some examples: # make single # make single complex # make single double complex complex16 # Alternatively, the command # make # without any arguments creates a library of all four precisions. # # To remove the object files after the library is created, enter # make clean # ############################################################################ all: single double complex complex16 APPBLAS = pilaenv.o #--------------------------------------------------------------------------- # Comment out the next 5 definitions if you already have the Level 1 PBLAS. #--------------------------------------------------------------------------- PIBLAS1 = picopy_.o PSBLAS1 = psswap_.o psscal_.o pscopy_.o psaxpy_.o psdot_.o psnrm2_.o \ psasum_.o psamax_.o PCBLAS1 = pcswap_.o pcscal_.o pcsscal_.o pccopy_.o pcaxpy_.o pcdotu_.o \ pcdotc_.o pscnrm2_.o pscasum_.o pcamax_.o PDBLAS1 = pdswap_.o pdscal_.o pdcopy_.o pdaxpy_.o pddot_.o pdnrm2_.o \ pdasum_.o pdamax_.o PZBLAS1 = pzswap_.o pzscal_.o pzdscal_.o pzcopy_.o pzaxpy_.o pzdotu_.o \ pzdotc_.o pdznrm2_.o pdzasum_.o pzamax_.o #--------------------------------------------------------------------------- # Comment out the next 4 definitions if you already have the Level 2 PBLAS. #--------------------------------------------------------------------------- PSBLAS2 = psgemv_.o psger_.o pssymv_.o pssyr_.o pssyr2_.o pstrmv_.o \ pstrsv_.o psagemv_.o psasymv_.o psatrmv_.o PCBLAS2 = pcgemv_.o pcgerc_.o pcgeru_.o pchemv_.o pcher_.o pcher2_.o \ pctrmv_.o pctrsv_.o pcagemv_.o pcahemv_.o pcatrmv_.o PDBLAS2 = pdgemv_.o pdger_.o pdsymv_.o pdsyr_.o pdsyr2_.o pdtrmv_.o \ pdtrsv_.o pdagemv_.o pdasymv_.o pdatrmv_.o PZBLAS2 = pzgemv_.o pzgerc_.o pzgeru_.o pzhemv_.o pzher_.o pzher2_.o \ pztrmv_.o pztrsv_.o pzagemv_.o pzahemv_.o pzatrmv_.o #--------------------------------------------------------------------------- # Comment out the next 4 definitions if you already have the Level 3 PBLAS. #--------------------------------------------------------------------------- PSBLAS3 = psgeadd_.o psgemm_.o pssymm_.o pssyr2k_.o pssyrk_.o pstradd_.o \ pstran_.o pstrmm_.o pstrsm_.o PCBLAS3 = pcgeadd_.o pcgemm_.o pchemm_.o pcher2k_.o pcherk_.o pcsymm_.o \ pcsyr2k_.o pcsyrk_.o pctradd_.o pctranc_.o pctranu_.o pctrmm_.o \ pctrsm_.o PDBLAS3 = pdgeadd_.o pdgemm_.o pdsymm_.o pdsyr2k_.o pdsyrk_.o pdtradd_.o \ pdtran_.o pdtrmm_.o pdtrsm_.o PZBLAS3 = pzgeadd_.o pzgemm_.o pzhemm_.o pzher2k_.o pzherk_.o pzsymm_.o \ pzsyr2k_.o pzsyrk_.o pztradd_.o pztranc_.o pztranu_.o pztrmm_.o \ pztrsm_.o #--------------------------------------------------------------------------- PIBLAS = $(PIBLAS1) $(APPBLAS) PSBLAS = $(PSBLAS1) $(PSBLAS2) $(PSBLAS3) $(APPBLAS) PCBLAS = $(PCBLAS1) $(PCBLAS2) $(PCBLAS3) $(APPBLAS) PDBLAS = $(PDBLAS1) $(PDBLAS2) $(PDBLAS3) $(APPBLAS) PZBLAS = $(PZBLAS1) $(PZBLAS2) $(PZBLAS3) $(APPBLAS) integer: $(PIBLAS) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(PIBLAS) $(RANLIB) ../../$(SCALAPACKLIB) single: integer $(PSBLAS) ( cd PBBLAS; $(MAKE) single ) ( cd PTZBLAS; $(MAKE) single ) ( cd PTOOLS; $(MAKE) single ) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(PSBLAS) $(RANLIB) ../../$(SCALAPACKLIB) double: integer $(PDBLAS) ( cd PBBLAS; $(MAKE) double ) ( cd PTZBLAS; $(MAKE) double ) ( cd PTOOLS; $(MAKE) double ) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(PDBLAS) $(RANLIB) ../../$(SCALAPACKLIB) complex: integer $(PCBLAS) ( cd PBBLAS; $(MAKE) complex ) ( cd PTZBLAS; $(MAKE) complex ) ( cd PTOOLS; $(MAKE) complex ) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(PCBLAS) $(RANLIB) ../../$(SCALAPACKLIB) complex16: integer $(PZBLAS) ( cd PBBLAS; $(MAKE) complex16 ) ( cd PTZBLAS; $(MAKE) complex16 ) ( cd PTOOLS; $(MAKE) complex16 ) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(PZBLAS) $(RANLIB) ../../$(SCALAPACKLIB) clean : rm -f *.o ( cd PBBLAS; $(MAKE) clean ) ( cd PTZBLAS; $(MAKE) clean ) ( cd PTOOLS; $(MAKE) clean ) .f.o : ; $(FC) -c $(FCFLAGS) $*.f .c.o : ; $(CC) -c $(CDEFS) $(CCFLAGS) $*.c # # To compile without input argument checking replace previous line by # .c.o : ; $(CC) -c $(CCFLAGS) -DNO_ARGCHK $*.c # # Note that when the PBLAS routines have been compiled without input ar- # gument checking, it is impossible to test PBLAS error exits. Trying to # test PBLAS error exits will then cause the tester executable to hang. # scalapack-2.0.2/PBLAS/SRC/PBblacs.h000644 000766 000024 00000042422 10363532303 016662 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * This file includes BLACS function type definitions, define macros, * and function prototypes. All PBLAS routines include this file. * * ---------------------------------------------------------------------- * #define macro constants * --------------------------------------------------------------------- */ /* BLACS scopes and topologies */ /* #define CALL 'A' (already defined) */ #define CCOLUMN 'C' #define CROW 'R' #define CBCAST 'B' #define CCOMBINE 'C' #define CTOP_GET '!' #define CTOP_DEFAULT ' ' #define CTOP_IRING 'I' #define CTOP_DRING 'D' #define CTOP_SRING 'S' #define CTOP_HYPER 'H' #define CTOP_FULL 'F' #define CTOP_MRING 'M' #define CTOP_TTREE 'T' #define CTOP_TREE1 '1' #define CTOP_TREE2 '2' #define CTOP_TREE3 '3' #define CTOP_TREE4 '4' #define CTOP_TREE5 '5' #define CTOP_TREE6 '6' #define CTOP_TREE7 '7' #define CTOP_TREE8 '8' #define CTOP_TREE9 '9' /* #define ALL "A" (already defined) */ #define COLUMN "C" #define ROW "R" #define BCAST "B" #define COMBINE "C" #define TOP_GET "!" #define TOP_DEFAULT " " #define TOP_IRING "I" #define TOP_DRING "D" #define TOP_SRING "S" #define TOP_HYPER "H" #define TOP_FULL "F" #define TOP_MRING "M" #define TOP_TTREE "T" #define TOP_TREE1 "1" #define TOP_TREE2 "2" #define TOP_TREE3 "3" #define TOP_TREE4 "4" #define TOP_TREE5 "5" #define TOP_TREE6 "6" #define TOP_TREE7 "7" #define TOP_TREE8 "8" #define TOP_TREE9 "9" /* * --------------------------------------------------------------------- * Function prototypes * --------------------------------------------------------------------- */ #ifdef __STDC__ /* BLACS Initialization */ void Cblacs_pinfo ( int *, int * ); void Cblacs_setup ( int *, int * ); void Cblacs_get ( int, int, int * ); void Cblacs_set ( int, int, int * ); void Cblacs_gridinit ( int *, char *, int, int ); void Cblacs_gridmap ( int *, int *, int, int, int ); /* BLACS Destruction */ void Cblacs_freebuff ( int, int ); void Cblacs_gridexit ( int ); void Cblacs_abort ( int, int ); void Cblacs_exit ( int ); /* BLACS Informational and Miscellaneous */ void Cblacs_gridinfo ( int, int *, int *, int *, int * ); int Cblacs_pnum ( int, int, int ); void Cblacs_pcoord ( int, int, int *, int * ); void Cblacs_barrier ( int, char * ); /* BLACS Sending */ void Cigesd2d ( int, int, int, char *, int, int, int ); void Csgesd2d ( int, int, int, char *, int, int, int ); void Cdgesd2d ( int, int, int, char *, int, int, int ); void Ccgesd2d ( int, int, int, char *, int, int, int ); void Czgesd2d ( int, int, int, char *, int, int, int ); void Citrsd2d ( int, char *, char *, int, int, char *, int, int, int ); void Cstrsd2d ( int, char *, char *, int, int, char *, int, int, int ); void Cdtrsd2d ( int, char *, char *, int, int, char *, int, int, int ); void Cctrsd2d ( int, char *, char *, int, int, char *, int, int, int ); void Cztrsd2d ( int, char *, char *, int, int, char *, int, int, int ); void Cigebs2d ( int, char *, char *, int, int, char *, int ); void Csgebs2d ( int, char *, char *, int, int, char *, int ); void Cdgebs2d ( int, char *, char *, int, int, char *, int ); void Ccgebs2d ( int, char *, char *, int, int, char *, int ); void Czgebs2d ( int, char *, char *, int, int, char *, int ); void Citrbs2d ( int, char *, char *, char *, char *, int, int, char *, int ); void Cstrbs2d ( int, char *, char *, char *, char *, int, int, char *, int ); void Cdtrbs2d ( int, char *, char *, char *, char *, int, int, char *, int ); void Cctrbs2d ( int, char *, char *, char *, char *, int, int, char *, int ); void Cztrbs2d ( int, char *, char *, char *, char *, int, int, char *, int ); /* BLACS Receiving */ void Cigerv2d ( int, int, int, char *, int, int, int ); void Csgerv2d ( int, int, int, char *, int, int, int ); void Cdgerv2d ( int, int, int, char *, int, int, int ); void Ccgerv2d ( int, int, int, char *, int, int, int ); void Czgerv2d ( int, int, int, char *, int, int, int ); void Citrrv2d ( int, char *, char *, int, int, char *, int, int, int ); void Cstrrv2d ( int, char *, char *, int, int, char *, int, int, int ); void Cdtrrv2d ( int, char *, char *, int, int, char *, int, int, int ); void Cctrrv2d ( int, char *, char *, int, int, char *, int, int, int ); void Cztrrv2d ( int, char *, char *, int, int, char *, int, int, int ); void Cigebr2d ( int, char *, char *, int, int, char *, int, int, int ); void Csgebr2d ( int, char *, char *, int, int, char *, int, int, int ); void Cdgebr2d ( int, char *, char *, int, int, char *, int, int, int ); void Ccgebr2d ( int, char *, char *, int, int, char *, int, int, int ); void Czgebr2d ( int, char *, char *, int, int, char *, int, int, int ); void Citrbr2d ( int, char *, char *, char *, char *, int, int, char *, int, int, int ); void Cstrbr2d ( int, char *, char *, char *, char *, int, int, char *, int, int, int ); void Cdtrbr2d ( int, char *, char *, char *, char *, int, int, char *, int, int, int ); void Cctrbr2d ( int, char *, char *, char *, char *, int, int, char *, int, int, int ); void Cztrbr2d ( int, char *, char *, char *, char *, int, int, char *, int, int, int ); /* BLACS Combine Operations */ void Cigamx2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Csgamx2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Cdgamx2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Ccgamx2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Czgamx2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Cigamn2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Csgamn2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Cdgamn2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Ccgamn2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Czgamn2d ( int, char *, char *, int, int, char *, int, int *, int *, int, int, int ); void Cigsum2d ( int, char *, char *, int, int, char *, int, int, int ); void Csgsum2d ( int, char *, char *, int, int, char *, int, int, int ); void Cdgsum2d ( int, char *, char *, int, int, char *, int, int, int ); void Ccgsum2d ( int, char *, char *, int, int, char *, int, int, int ); void Czgsum2d ( int, char *, char *, int, int, char *, int, int, int ); #else /* BLACS Initialization */ void Cblacs_pinfo (); void Cblacs_setup (); void Cblacs_get (); void Cblacs_set (); void Cblacs_gridinit (); void Cblacs_gridmap (); /* BLACS Destruction */ void Cblacs_freebuff (); void Cblacs_gridexit (); void Cblacs_abort (); void Cblacs_exit (); /* BLACS Informational and Miscellaneous */ void Cblacs_gridinfo (); int Cblacs_pnum (); void Cblacs_pcoord (); void Cblacs_barrier (); /* BLACS Sending */ void Cigesd2d (); void Csgesd2d (); void Cdgesd2d (); void Ccgesd2d (); void Czgesd2d (); void Citrsd2d (); void Cstrsd2d (); void Cdtrsd2d (); void Cctrsd2d (); void Cztrsd2d (); void Cigebs2d (); void Csgebs2d (); void Cdgebs2d (); void Ccgebs2d (); void Czgebs2d (); void Citrbs2d (); void Cstrbs2d (); void Cdtrbs2d (); void Cctrbs2d (); void Cztrbs2d (); /* BLACS Receiving */ void Cigerv2d (); void Csgerv2d (); void Cdgerv2d (); void Ccgerv2d (); void Czgerv2d (); void Citrrv2d (); void Cstrrv2d (); void Cdtrrv2d (); void Cctrrv2d (); void Cztrrv2d (); void Cigebr2d (); void Csgebr2d (); void Cdgebr2d (); void Ccgebr2d (); void Czgebr2d (); void Citrbr2d (); void Cstrbr2d (); void Cdtrbr2d (); void Cctrbr2d (); void Cztrbr2d (); /* BLACS Combine Operations */ void Cigamx2d (); void Csgamx2d (); void Cdgamx2d (); void Ccgamx2d (); void Czgamx2d (); void Cigamn2d (); void Csgamn2d (); void Cdgamn2d (); void Ccgamn2d (); void Czgamn2d (); void Cigsum2d (); void Csgsum2d (); void Cdgsum2d (); void Ccgsum2d (); void Czgsum2d (); #endif scalapack-2.0.2/PBLAS/SRC/PBBLAS/000755 000766 000024 00000000000 11750301607 016144 5ustar00juliestaff000000 000000 scalapack-2.0.2/PBLAS/SRC/PBblas.h000644 000766 000024 00000070603 10363532303 016521 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * This file includes F77 BLAS definitions. All PBLAS routines include * this file. * * --------------------------------------------------------------------- * #define macro constants * --------------------------------------------------------------------- */ #define CNOTRAN 'N' #define CNOCONJG 'N' #define CTRAN 'T' #define CCONJG 'Z' #define CCOTRAN 'C' #define CALL 'A' #define CLOWER 'L' #define CUPPER 'U' #define CDIAGONAL 'D' #define CLEFT 'L' #define CRIGHT 'R' #define CUNIT 'U' #define CNOUNIT 'N' #define CINIT 'I' #define CNOINIT 'N' #define CFORWARD 'F' #define CBACKWARD 'B' #define CREUSE 'R' #define CALLOCATE 'A' #define NOTRAN "N" #define NOCONJG "N" #define TRAN "T" #define CONJG "Z" #define COTRAN "C" #define ALL "A" #define LOWER "L" #define UPPER "U" #define DIAGONAL "D" #define LEFT "L" #define RIGHT "R" #define UNIT "U" #define NOUNIT "N" #define INIT "I" #define NOINIT "N" #define FORWARD "F" #define BACKWARD "B" #define REUSE "R" #define ALLOCATE "A" #if( _F2C_CALL_ == _F2C_ADD_ ) /* * These defines set up the naming scheme required to have a FORTRAN * routine called by a C routine. No redefinition is necessary to have * the following FORTRAN to C interface: * * FORTRAN DECLARATION C CALL * SUBROUTINE DGEMM(...) dgemm_(...) * * This is the PBLAS default. */ #endif #if( _F2C_CALL_ == _F2C_UPCASE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine called by a C routine with the following FORTRAN to C inter- * face: * * FORTRAN DECLARATION C CALL * SUBROUTINE DGEMM(...) DGEMM(...) */ #define srot_ SROT #define drot_ DROT #define sswap_ SSWAP #define dswap_ DSWAP #define cswap_ CSWAP #define zswap_ ZSWAP #define scopy_ SCOPY #define dcopy_ DCOPY #define ccopy_ CCOPY #define zcopy_ ZCOPY #define saxpy_ SAXPY #define daxpy_ DAXPY #define caxpy_ CAXPY #define zaxpy_ ZAXPY #define sscal_ SSCAL #define dscal_ DSCAL #define cscal_ CSCAL #define zscal_ ZSCAL #define csscal_ CSSCAL #define zdscal_ ZDSCAL #define sasum_ SASUM #define dasum_ DASUM #define scasum_ SCASUM #define dzasum_ DZASUM #define snrm2_ SNRM2 #define dnrm2_ DNRM2 #define scnrm2_ SCNRM2 #define dznrm2_ DZNRM2 #define sdot_ SDOT #define ddot_ DDOT #define cdotu_ CDOTU #define zdotu_ ZDOTU #define cdotc_ CDOTC #define zdotc_ ZDOTC #define isamax_ ISAMAX #define idamax_ IDAMAX #define icamax_ ICAMAX #define izamax_ IZAMAX #define sgemv_ SGEMV #define dgemv_ DGEMV #define cgemv_ CGEMV #define zgemv_ ZGEMV #define ssymv_ SSYMV #define dsymv_ DSYMV #define chemv_ CHEMV #define zhemv_ ZHEMV #define strmv_ STRMV #define dtrmv_ DTRMV #define ctrmv_ CTRMV #define ztrmv_ ZTRMV #define strsv_ STRSV #define dtrsv_ DTRSV #define ctrsv_ CTRSV #define ztrsv_ ZTRSV #define sger_ SGER #define dger_ DGER #define cgeru_ CGERU #define zgeru_ ZGERU #define cgerc_ CGERC #define zgerc_ ZGERC #define ssyr_ SSYR #define dsyr_ DSYR #define cher_ CHER #define zher_ ZHER #define ssyr2_ SSYR2 #define dsyr2_ DSYR2 #define cher2_ CHER2 #define zher2_ ZHER2 #define sgemm_ SGEMM #define dgemm_ DGEMM #define cgemm_ CGEMM #define zgemm_ ZGEMM #define ssymm_ SSYMM #define dsymm_ DSYMM #define csymm_ CSYMM #define chemm_ CHEMM #define zsymm_ ZSYMM #define zhemm_ ZHEMM #define strmm_ STRMM #define dtrmm_ DTRMM #define ctrmm_ CTRMM #define ztrmm_ ZTRMM #define strsm_ STRSM #define dtrsm_ DTRSM #define ctrsm_ CTRSM #define ztrsm_ ZTRSM #define ssyrk_ SSYRK #define dsyrk_ DSYRK #define csyrk_ CSYRK #define cherk_ CHERK #define zsyrk_ ZSYRK #define zherk_ ZHERK #define ssyr2k_ SSYR2K #define dsyr2k_ DSYR2K #define csyr2k_ CSYR2K #define cher2k_ CHER2K #define zsyr2k_ ZSYR2K #define zher2k_ ZHER2K #endif #if( _F2C_CALL_ == _F2C_NOCHANGE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine called by a C routine with the following FORTRAN to C inter- * face: * * FORTRAN DECLARATION C CALL * SUBROUTINE DGEMM(...) dgemm(...) */ #define srot_ srot #define drot_ drot #define sswap_ sswap #define dswap_ dswap #define cswap_ cswap #define zswap_ zswap #define scopy_ scopy #define dcopy_ dcopy #define ccopy_ ccopy #define zcopy_ zcopy #define saxpy_ saxpy #define daxpy_ daxpy #define caxpy_ caxpy #define zaxpy_ zaxpy #define sscal_ sscal #define dscal_ dscal #define cscal_ cscal #define zscal_ zscal #define csscal_ csscal #define zdscal_ zdscal #define sasum_ sasum #define dasum_ dasum #define scasum_ scasum #define dzasum_ dzasum #define snrm2_ snrm2 #define dnrm2_ dnrm2 #define scnrm2_ scnrm2 #define dznrm2_ dznrm2 #define sdot_ sdot #define ddot_ ddot #define cdotu_ cdotu #define zdotu_ zdotu #define cdotc_ cdotc #define zdotc_ zdotc #define isamax_ isamax #define idamax_ idamax #define icamax_ icamax #define izamax_ izamax #define sgemv_ sgemv #define dgemv_ dgemv #define cgemv_ cgemv #define zgemv_ zgemv #define ssymv_ ssymv #define dsymv_ dsymv #define chemv_ chemv #define zhemv_ zhemv #define strmv_ strmv #define dtrmv_ dtrmv #define ctrmv_ ctrmv #define ztrmv_ ztrmv #define strsv_ strsv #define dtrsv_ dtrsv #define ctrsv_ ctrsv #define ztrsv_ ztrsv #define sger_ sger #define dger_ dger #define cgeru_ cgeru #define zgeru_ zgeru #define cgerc_ cgerc #define zgerc_ zgerc #define ssyr_ ssyr #define dsyr_ dsyr #define cher_ cher #define zher_ zher #define ssyr2_ ssyr2 #define dsyr2_ dsyr2 #define cher2_ cher2 #define zher2_ zher2 #define sgemm_ sgemm #define dgemm_ dgemm #define cgemm_ cgemm #define zgemm_ zgemm #define ssymm_ ssymm #define dsymm_ dsymm #define csymm_ csymm #define chemm_ chemm #define zsymm_ zsymm #define zhemm_ zhemm #define strmm_ strmm #define dtrmm_ dtrmm #define ctrmm_ ctrmm #define ztrmm_ ztrmm #define strsm_ strsm #define dtrsm_ dtrsm #define ctrsm_ ctrsm #define ztrsm_ ztrsm #define ssyrk_ ssyrk #define dsyrk_ dsyrk #define csyrk_ csyrk #define cherk_ cherk #define zsyrk_ zsyrk #define zherk_ zherk #define ssyr2k_ ssyr2k #define dsyr2k_ dsyr2k #define csyr2k_ csyr2k #define cher2k_ cher2k #define zsyr2k_ zsyr2k #define zher2k_ zher2k #endif /* * --------------------------------------------------------------------- * Function prototypes * --------------------------------------------------------------------- */ #ifdef __STDC__ int isamax_ ( int *, char *, int * ); int idamax_ ( int *, char *, int * ); int icamax_ ( int *, char *, int * ); int izamax_ ( int *, char *, int * ); F_VOID_FCT saxpy_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT daxpy_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT caxpy_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT zaxpy_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT scopy_ ( int *, char *, int *, char *, int * ); F_VOID_FCT dcopy_ ( int *, char *, int *, char *, int * ); F_VOID_FCT ccopy_ ( int *, char *, int *, char *, int * ); F_VOID_FCT zcopy_ ( int *, char *, int *, char *, int * ); F_VOID_FCT sscal_ ( int *, char *, char *, int * ); F_VOID_FCT dscal_ ( int *, char *, char *, int * ); F_VOID_FCT cscal_ ( int *, char *, char *, int * ); F_VOID_FCT csscal_ ( int *, char *, char *, int * ); F_VOID_FCT zdscal_ ( int *, char *, char *, int * ); F_VOID_FCT zscal_ ( int *, char *, char *, int * ); F_VOID_FCT sswap_ ( int *, char *, int *, char *, int * ); F_VOID_FCT dswap_ ( int *, char *, int *, char *, int * ); F_VOID_FCT cswap_ ( int *, char *, int *, char *, int * ); F_VOID_FCT zswap_ ( int *, char *, int *, char *, int * ); F_VOID_FCT sgemv_ ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT dgemv_ ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT cgemv_ ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zgemv_ ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT ssymv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT dsymv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT chemv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zhemv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT strmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); F_VOID_FCT dtrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); F_VOID_FCT ctrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); F_VOID_FCT ztrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); F_VOID_FCT strsv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); F_VOID_FCT dtrsv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); F_VOID_FCT ctrsv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); F_VOID_FCT ztrsv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); F_VOID_FCT sger_ ( int *, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT dger_ ( int *, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT cgerc_ ( int *, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT cgeru_ ( int *, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT zgerc_ ( int *, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT zgeru_ ( int *, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT ssyr_ ( F_CHAR_T, int *, char *, char *, int *, char *, int * ); F_VOID_FCT dsyr_ ( F_CHAR_T, int *, char *, char *, int *, char *, int * ); F_VOID_FCT cher_ ( F_CHAR_T, int *, char *, char *, int *, char *, int * ); F_VOID_FCT zher_ ( F_CHAR_T, int *, char *, char *, int *, char *, int * ); F_VOID_FCT ssyr2_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT dsyr2_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT cher2_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT zher2_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT sgemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT dgemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT cgemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zgemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT ssymm_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT dsymm_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT csymm_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zsymm_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT chemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zhemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT ssyrk_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT dsyrk_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT csyrk_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zsyrk_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT cherk_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zherk_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT ssyr2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT dsyr2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT csyr2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zsyr2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT cher2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zher2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT strmm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); F_VOID_FCT dtrmm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); F_VOID_FCT ctrmm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); F_VOID_FCT ztrmm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); F_VOID_FCT strsm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); F_VOID_FCT dtrsm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); F_VOID_FCT ctrsm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); F_VOID_FCT ztrsm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); #else int isamax_ (); int idamax_ (); int icamax_ (); int izamax_ (); F_VOID_FCT saxpy_ (); F_VOID_FCT daxpy_ (); F_VOID_FCT caxpy_ (); F_VOID_FCT zaxpy_ (); F_VOID_FCT scopy_ (); F_VOID_FCT dcopy_ (); F_VOID_FCT ccopy_ (); F_VOID_FCT zcopy_ (); F_VOID_FCT sscal_ (); F_VOID_FCT dscal_ (); F_VOID_FCT cscal_ (); F_VOID_FCT csscal_ (); F_VOID_FCT zscal_ (); F_VOID_FCT zdscal_ (); F_VOID_FCT sswap_ (); F_VOID_FCT dswap_ (); F_VOID_FCT cswap_ (); F_VOID_FCT zswap_ (); F_VOID_FCT sgemv_ (); F_VOID_FCT dgemv_ (); F_VOID_FCT cgemv_ (); F_VOID_FCT zgemv_ (); F_VOID_FCT ssymv_ (); F_VOID_FCT dsymv_ (); F_VOID_FCT chemv_ (); F_VOID_FCT zhemv_ (); F_VOID_FCT strmv_ (); F_VOID_FCT dtrmv_ (); F_VOID_FCT ctrmv_ (); F_VOID_FCT ztrmv_ (); F_VOID_FCT strsv_ (); F_VOID_FCT dtrsv_ (); F_VOID_FCT ctrsv_ (); F_VOID_FCT ztrsv_ (); F_VOID_FCT sger_ (); F_VOID_FCT dger_ (); F_VOID_FCT cgerc_ (); F_VOID_FCT cgeru_ (); F_VOID_FCT zgerc_ (); F_VOID_FCT zgeru_ (); F_VOID_FCT ssyr_ (); F_VOID_FCT dsyr_ (); F_VOID_FCT cher_ (); F_VOID_FCT zher_ (); F_VOID_FCT ssyr2_ (); F_VOID_FCT dsyr2_ (); F_VOID_FCT cher2_ (); F_VOID_FCT zher2_ (); F_VOID_FCT sgemm_ (); F_VOID_FCT dgemm_ (); F_VOID_FCT cgemm_ (); F_VOID_FCT zgemm_ (); F_VOID_FCT ssymm_ (); F_VOID_FCT dsymm_ (); F_VOID_FCT csymm_ (); F_VOID_FCT zsymm_ (); F_VOID_FCT chemm_ (); F_VOID_FCT zhemm_ (); F_VOID_FCT ssyrk_ (); F_VOID_FCT dsyrk_ (); F_VOID_FCT csyrk_ (); F_VOID_FCT zsyrk_ (); F_VOID_FCT cherk_ (); F_VOID_FCT zherk_ (); F_VOID_FCT ssyr2k_ (); F_VOID_FCT dsyr2k_ (); F_VOID_FCT csyr2k_ (); F_VOID_FCT zsyr2k_ (); F_VOID_FCT cher2k_ (); F_VOID_FCT zher2k_ (); F_VOID_FCT strmm_ (); F_VOID_FCT dtrmm_ (); F_VOID_FCT ctrmm_ (); F_VOID_FCT ztrmm_ (); F_VOID_FCT strsm_ (); F_VOID_FCT dtrsm_ (); F_VOID_FCT ctrsm_ (); F_VOID_FCT ztrsm_ (); #endif scalapack-2.0.2/PBLAS/SRC/pblas.h000644 000766 000024 00000050212 10363532303 016451 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * This file includes the standard C libraries, as well as system depen- * dent include files. All PBLAS routines include this file. * * --------------------------------------------------------------------- * Machine Specific PBLAS macros * --------------------------------------------------------------------- */ #define _HAL_ 0 #define _T3D_ 1 #define _T3E_ 2 #ifdef T3D #define _MACH_ _T3D_ #endif #ifdef T3E #define _MACH_ _T3E_ #endif #ifndef _MACH_ #define _MACH_ _HAL_ #endif /* * CBRATIO is the ratio of the transfer cost per element for the combine * sum to one process and the broadcast operation. This value is used * within the Level 3 PBLAS routines to decide on which algorithm to se- * lect. */ #define CBRATIO 1.3 /* * --------------------------------------------------------------------- * Include files * --------------------------------------------------------------------- */ #include #include #ifdef __STDC__ #include #else #include #endif #if( ( _MACH_ == _T3D_ ) || ( _MACH_ == _T3E_ ) ) #include #endif /* * --------------------------------------------------------------------- * FORTRAN <-> C interface * --------------------------------------------------------------------- * * These macros identifies how the PBLAS will be called as follows: * * _F2C_ADD_: the FORTRAN compiler expects the name of C functions to be * in all lower case and to have an underscore postfixed it (Suns, Intel * compilers expect this). * * _F2C_NOCHANGE: the FORTRAN compiler expects the name of C functions * to be in all lower case (IBM RS6K compilers do this). * * _F2C_UPCASE: the FORTRAN compiler expects the name of C functions * to be in all upcase. (Cray compilers expect this). * * _F2C_F77ISF2C: the FORTRAN compiler in use is f2c, a FORTRAN to C * converter. */ #define _F2C_ADD_ 0 #define _F2C_NOCHANGE 1 #define _F2C_UPCASE 2 #define _F2C_F77ISF2C 3 #ifdef UpCase #define _F2C_CALL_ _F2C_UPCASE #endif #ifdef NoChange #define _F2C_CALL_ _F2C_NOCHANGE #endif #ifdef Add_ #define _F2C_CALL_ _F2C_ADD_ #endif #ifdef f77IsF2C #define _F2C_CALL_ _F2C_F77ISF2C #endif #ifndef _F2C_CALL_ #define _F2C_CALL_ _F2C_ADD_ #endif /* * --------------------------------------------------------------------- * TYPE DEFINITIONS AND CONVERSION UTILITIES * --------------------------------------------------------------------- */ #if( ( _MACH_ == _T3D_ ) || ( _MACH_ == _T3E_ ) ) #define float double /* Type of character argument in a FORTRAN call */ #define F_CHAR_T _fcd /* Character conversion utilities */ #define F2C_CHAR(a) ( _fcdtocp( (a) ) ) #define C2F_CHAR(a) ( _cptofcd( (a), 1 ) ) /* Type of FORTRAN functions */ #define F_VOID_FCT void fortran /* Subroutine */ #define F_INTG_FCT int fortran /* INTEGER function */ #else /* Type of character argument in a FORTRAN call */ typedef char * F_CHAR_T; /* Character conversion utilities */ #define F2C_CHAR(a) (a) #define C2F_CHAR(a) (a) /* Type of FORTRAN functions */ #define F_VOID_FCT void /* Subroutine */ #define F_INTG_FCT int /* INTEGER function */ #endif /* * ---------------------------------------------------------------------- * #typedef definitions * --------------------------------------------------------------------- */ typedef float cmplx [2]; typedef double cmplx16[2]; #define REAL_PART 0 #define IMAG_PART 1 #ifdef __STDC__ typedef void (*GESD2D_T) ( int, int, int, char *, int, int, int ); typedef void (*GERV2D_T) ( int, int, int, char *, int, int, int ); typedef void (*GEBS2D_T) ( int, char *, char *, int, int, char *, int ); typedef void (*GEBR2D_T) ( int, char *, char *, int, int, char *, int, int, int ); typedef void (*GSUM2D_T) ( int, char *, char *, int, int, char *, int, int, int ); typedef F_VOID_FCT (*MMADD_T) ( int *, int *, char *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*MMSHFT_T) ( int *, int *, int *, char *, int * ); typedef F_VOID_FCT (*VVDOT_T) ( int *, char *, char *, int *, char *, int * ); typedef F_VOID_FCT (*VVSET_T) ( int *, char *, char *, int * ); typedef F_VOID_FCT (*TZPAD_T) ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, char *, int * ); typedef F_VOID_FCT (*TZPADCPY_T) ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, int *, char *, int * ); typedef F_VOID_FCT (*TZSET_T) ( F_CHAR_T, int *, int *, int *, char *, char *, char *, int * ); typedef F_VOID_FCT (*TZSCAL_T) ( F_CHAR_T, int *, int *, int *, char *, char *, int * ); typedef F_VOID_FCT (*AXPY_T) ( int *, char *, char *, int *, char *, int * ); typedef F_VOID_FCT (*COPY_T) ( int *, char *, int *, char *, int * ); typedef F_VOID_FCT (*SWAP_T) ( int *, char *, int *, char *, int * ); typedef F_VOID_FCT (*GEMV_T) ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*AGEMV_T) ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*SYMV_T) ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*ASYMV_T) ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*HEMV_T) ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*AHEMV_T) ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*TRMV_T) ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); typedef F_VOID_FCT (*ATRMV_T) ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*TRSV_T) ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, int *, char *, int * ); typedef F_VOID_FCT (*GERC_T) ( int *, int *, char *, char *, int *, char *, int *, char *, int * ); typedef F_VOID_FCT (*GERU_T) ( int *, int *, char *, char *, int *, char *, int *, char *, int * ); typedef F_VOID_FCT (*SYR_T) ( F_CHAR_T, int *, char *, char *, int *, char *, int * ); typedef F_VOID_FCT (*HER_T) ( F_CHAR_T, int *, char *, char *, int *, char *, int * ); typedef F_VOID_FCT (*SYR2_T) ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, int * ); typedef F_VOID_FCT (*HER2_T) ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, int * ); typedef F_VOID_FCT (*GEMM_T) ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*SYMM_T) ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*HEMM_T) ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*SYRK_T) ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*HERK_T) ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*SYR2K_T) ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*HER2K_T) ( F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); typedef F_VOID_FCT (*TRMM_T) ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); typedef F_VOID_FCT (*TRSM_T) ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, char *, char *, int *, char *, int * ); #else typedef void (*GESD2D_T) (); typedef void (*GERV2D_T) (); typedef void (*GEBS2D_T) (); typedef void (*GEBR2D_T) (); typedef void (*GSUM2D_T) (); typedef F_VOID_FCT (*MMADD_T) (); typedef F_VOID_FCT (*MMSHFT_T) (); typedef F_VOID_FCT (*VVDOT_T) (); typedef F_VOID_FCT (*VVSET_T) (); typedef F_VOID_FCT (*TZPAD_T) (); typedef F_VOID_FCT (*TZPADCPY_T) (); typedef F_VOID_FCT (*TZSET_T) (); typedef F_VOID_FCT (*TZSCAL_T) (); typedef F_VOID_FCT (*AXPY_T) (); typedef F_VOID_FCT (*COPY_T) (); typedef F_VOID_FCT (*SWAP_T) (); typedef F_VOID_FCT (*GEMV_T) (); typedef F_VOID_FCT (*AGEMV_T) (); typedef F_VOID_FCT (*SYMV_T) (); typedef F_VOID_FCT (*ASYMV_T) (); typedef F_VOID_FCT (*HEMV_T) (); typedef F_VOID_FCT (*AHEMV_T) (); typedef F_VOID_FCT (*TRMV_T) (); typedef F_VOID_FCT (*ATRMV_T) (); typedef F_VOID_FCT (*TRSV_T) (); typedef F_VOID_FCT (*GERC_T) (); typedef F_VOID_FCT (*GERU_T) (); typedef F_VOID_FCT (*SYR_T) (); typedef F_VOID_FCT (*HER_T) (); typedef F_VOID_FCT (*SYR2_T) (); typedef F_VOID_FCT (*HER2_T) (); typedef F_VOID_FCT (*GEMM_T) (); typedef F_VOID_FCT (*SYMM_T) (); typedef F_VOID_FCT (*HEMM_T) (); typedef F_VOID_FCT (*SYRK_T) (); typedef F_VOID_FCT (*HERK_T) (); typedef F_VOID_FCT (*SYR2K_T) (); typedef F_VOID_FCT (*HER2K_T) (); typedef F_VOID_FCT (*TRMM_T) (); typedef F_VOID_FCT (*TRSM_T) (); #endif typedef struct { char type; /* Encoding of the data type */ int usiz; /* length in bytes of elementary data type */ int size; /* length in bytes of data type */ char * zero, * one, * negone; /* pointers to contants of correct type */ GESD2D_T Cgesd2d; /* BLACS functions */ GERV2D_T Cgerv2d; GEBS2D_T Cgebs2d; GEBR2D_T Cgebr2d; GSUM2D_T Cgsum2d; MMADD_T Fmmadd; /* Addition functions */ MMADD_T Fmmcadd; MMADD_T Fmmtadd; MMADD_T Fmmtcadd; MMADD_T Fmmdda; MMADD_T Fmmddac; MMADD_T Fmmddat; MMADD_T Fmmddact; MMSHFT_T Fcshft; /* Shift functions */ MMSHFT_T Frshft; VVDOT_T Fvvdotu; /* Dot functions */ VVDOT_T Fvvdotc; TZPAD_T Ftzpad; /* Array pad function */ TZPADCPY_T Ftzpadcpy; VVSET_T Fset; TZSCAL_T Ftzscal; /* Scaling functions */ TZSCAL_T Fhescal; TZSCAL_T Ftzcnjg; AXPY_T Faxpy; /* Level 1 BLAS */ COPY_T Fcopy; SWAP_T Fswap; GEMV_T Fgemv; /* Level 2 BLAS */ SYMV_T Fsymv; HEMV_T Fhemv; TRMV_T Ftrmv; TRSV_T Ftrsv; AGEMV_T Fagemv; ASYMV_T Fasymv; AHEMV_T Fahemv; ATRMV_T Fatrmv; GERC_T Fgerc; GERU_T Fgeru; SYR_T Fsyr; HER_T Fher; SYR2_T Fsyr2; HER2_T Fher2; GEMM_T Fgemm; /* Level 3 BLAS */ SYMM_T Fsymm; HEMM_T Fhemm; SYRK_T Fsyrk; HERK_T Fherk; SYR2K_T Fsyr2k; HER2K_T Fher2k; TRMM_T Ftrmm; TRSM_T Ftrsm; } PBTYP_T; #ifdef __STDC__ typedef void (*TZSYR_T) ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int ); typedef void (*TZSYR2_T) ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); typedef void (*TZTRM_T) ( PBTYP_T *, char *, char *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int ); typedef void (*TZSYM_T) ( PBTYP_T *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); #else typedef void (*TZSYR_T) (); typedef void (*TZSYR2_T) (); typedef void (*TZTRM_T) (); typedef void (*TZSYM_T) (); #endif typedef struct { int offd; /* Global diagonal offset */ int lcmt00; /* LCM value of first block */ int mp; /* Local number of rows */ int imb1; /* Size of first row block (global) */ int imbloc; /* Size of first local row block */ int mb; /* Row block size */ int lmbloc; /* Size of last local row block */ int mblks; /* Number of local row blocks */ int iupp; /* LCM row bound for first diagonal block */ int upp; /* LCM row bound for diagonal block */ int prow; /* Relative row process coordinate */ int nprow; /* Number of process rows */ int nq; /* Local number of columns */ int inb1; /* Size of first column block (global) */ int inbloc; /* Size of first local column block */ int nb; /* Column block size */ int lnbloc; /* Size of last local column block */ int nblks; /* Number of local column blocks */ int ilow; /* LCM column bound for first diagonal block */ int low; /* LCM column bound for diagonal block */ int pcol; /* Relative column process coordinate */ int npcol; /* Number of process columns */ int lcmb; /* Least common multiple of nprow * mb and npcol * nb */ } PB_VM_T; /* * --------------------------------------------------------------------- * #define macro constants * --------------------------------------------------------------------- */ #define INT 'I' /* type identifiers */ #define SREAL 'S' #define DREAL 'D' #define SCPLX 'C' #define DCPLX 'Z' #define crot_ CROT scalapack-2.0.2/PBLAS/SRC/PBpblas.h000644 000766 000024 00000151562 10363532303 016705 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * This file includes PBLAS definitions. All PBLAS routines include this * file. * * --------------------------------------------------------------------- * #define macro constants * --------------------------------------------------------------------- */ #if( _F2C_CALL_ == _F2C_ADD_ ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine. No redefinition is necessary to have the * following FORTRAN to C interface: * * FORTRAN CALL C DECLARATION * CALL PDGEMM(...) void pdgemm_(...) * * This is the PBLAS default. */ #define PB_freebuf_ PB_freebuf_ #define PB_topget_ pb_topget_ #define PB_topset_ pb_topset_ #endif #if( _F2C_CALL_ == _F2C_UPCASE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine. No redefinition is necessary to have the * following FORTRAN to C interface: * * FORTRAN CALL C DECLARATION * CALL PDGEMM(...) void PDGEMM(...) */ #define pilaenv_ PILAENV #define PB_freebuf_ PB_FREEBUF #define PB_topget_ PB_TOPGET #define PB_topset_ PB_TOPSET /* Level-1 PBLAS */ #define picopy_ PICOPY #define pscopy_ PSCOPY #define pdcopy_ PDCOPY #define pccopy_ PCCOPY #define pzcopy_ PZCOPY #define psswap_ PSSWAP #define pdswap_ PDSWAP #define pcswap_ PCSWAP #define pzswap_ PZSWAP #define psaxpy_ PSAXPY #define pdaxpy_ PDAXPY #define pcaxpy_ PCAXPY #define pzaxpy_ PZAXPY #define psscal_ PSSCAL #define pdscal_ PDSCAL #define pcscal_ PCSCAL #define pzscal_ PZSCAL #define pcsscal_ PCSSCAL #define pzdscal_ PZDSCAL #define psasum_ PSASUM #define pdasum_ PDASUM #define pscasum_ PSCASUM #define pdzasum_ PDZASUM #define psnrm2_ PSNRM2 #define pdnrm2_ PDNRM2 #define pscnrm2_ PSCNRM2 #define pdznrm2_ PDZNRM2 #define psdot_ PSDOT #define pddot_ PDDOT #define pcdotu_ PCDOTU #define pzdotu_ PZDOTU #define pcdotc_ PCDOTC #define pzdotc_ PZDOTC #define psamax_ PSAMAX #define pdamax_ PDAMAX #define pcamax_ PCAMAX #define pzamax_ PZAMAX #define psgemv_ PSGEMV #define pdgemv_ PDGEMV #define pcgemv_ PCGEMV #define pzgemv_ PZGEMV #define psagemv_ PSAGEMV #define pdagemv_ PDAGEMV #define pcagemv_ PCAGEMV #define pzagemv_ PZAGEMV #define pssymv_ PSSYMV #define pdsymv_ PDSYMV #define pchemv_ PCHEMV #define pzhemv_ PZHEMV #define psasymv_ PSASYMV #define pdasymv_ PDASYMV #define pcahemv_ PCAHEMV #define pzahemv_ PZAHEMV #define pstrmv_ PSTRMV #define pdtrmv_ PDTRMV #define pctrmv_ PCTRMV #define pztrmv_ PZTRMV #define psatrmv_ PSATRMV #define pdatrmv_ PDATRMV #define pcatrmv_ PCATRMV #define pzatrmv_ PZATRMV #define pstrsv_ PSTRSV #define pdtrsv_ PDTRSV #define pctrsv_ PCTRSV #define pztrsv_ PZTRSV #define psger_ PSGER #define pdger_ PDGER #define pcgeru_ PCGERU #define pzgeru_ PZGERU #define pcgerc_ PCGERC #define pzgerc_ PZGERC #define pssyr_ PSSYR #define pdsyr_ PDSYR #define pcher_ PCHER #define pzher_ PZHER #define pssyr2_ PSSYR2 #define pdsyr2_ PDSYR2 #define pcher2_ PCHER2 #define pzher2_ PZHER2 #define psgemm_ PSGEMM #define pdgemm_ PDGEMM #define pcgemm_ PCGEMM #define pzgemm_ PZGEMM #define psgeadd_ PSGEADD #define pdgeadd_ PDGEADD #define pcgeadd_ PCGEADD #define pzgeadd_ PZGEADD #define pssymm_ PSSYMM #define pdsymm_ PDSYMM #define pcsymm_ PCSYMM #define pchemm_ PCHEMM #define pzsymm_ PZSYMM #define pzhemm_ PZHEMM #define pstrmm_ PSTRMM #define pdtrmm_ PDTRMM #define pctrmm_ PCTRMM #define pztrmm_ PZTRMM #define pstrsm_ PSTRSM #define pdtrsm_ PDTRSM #define pctrsm_ PCTRSM #define pztrsm_ PZTRSM #define pssyrk_ PSSYRK #define pdsyrk_ PDSYRK #define pcsyrk_ PCSYRK #define pcherk_ PCHERK #define pzsyrk_ PZSYRK #define pzherk_ PZHERK #define pssyr2k_ PSSYR2K #define pdsyr2k_ PDSYR2K #define pcsyr2k_ PCSYR2K #define pcher2k_ PCHER2K #define pzsyr2k_ PZSYR2K #define pzher2k_ PZHER2K #define pstradd_ PSTRADD #define pdtradd_ PDTRADD #define pctradd_ PCTRADD #define pztradd_ PZTRADD #define pstran_ PSTRAN #define pdtran_ PDTRAN #define pctranu_ PCTRANU #define pztranu_ PZTRANU #define pctranc_ PCTRANC #define pztranc_ PZTRANC #endif #if( _F2C_CALL_ == _F2C_NOCHANGE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine with the following FORTRAN to C interface: * * FORTRAN CALL C DECLARATION * CALLL PDGEMM(...) void pdgemm(...) */ #define pilaenv_ pilaenv #define PB_freebuf_ PB_freebuf #define PB_topget_ pb_topget #define PB_topset_ pb_topset #define picopy_ picopy #define pscopy_ pscopy #define pdcopy_ pdcopy #define pccopy_ pccopy #define pzcopy_ pzcopy #define psswap_ psswap #define pdswap_ pdswap #define pcswap_ pcswap #define pzswap_ pzswap #define psaxpy_ psaxpy #define pdaxpy_ pdaxpy #define pcaxpy_ pcaxpy #define pzaxpy_ pzaxpy #define psscal_ psscal #define pdscal_ pdscal #define pcscal_ pcscal #define pzscal_ pzscal #define pcsscal_ pcsscal #define pzdscal_ pzdscal #define psasum_ psasum #define pdasum_ pdasum #define pscasum_ pscasum #define pdzasum_ pdzasum #define psnrm2_ psnrm2 #define pdnrm2_ pdnrm2 #define pscnrm2_ pscnrm2 #define pdznrm2_ pdznrm2 #define psdot_ psdot #define pddot_ pddot #define pcdotu_ pcdotu #define pzdotu_ pzdotu #define pcdotc_ pcdotc #define pzdotc_ pzdotc #define psamax_ psamax #define pdamax_ pdamax #define pcamax_ pcamax #define pzamax_ pzamax #define psgemv_ psgemv #define pdgemv_ pdgemv #define pcgemv_ pcgemv #define pzgemv_ pzgemv #define psagemv_ psagemv #define pdagemv_ pdagemv #define pcagemv_ pcagemv #define pzagemv_ pzagemv #define pssymv_ pssymv #define pdsymv_ pdsymv #define pchemv_ pchemv #define pzhemv_ pzhemv #define psasymv_ psasymv #define pdasymv_ pdasymv #define pcahemv_ pcahemv #define pzahemv_ pzahemv #define pstrmv_ pstrmv #define pdtrmv_ pdtrmv #define pctrmv_ pctrmv #define pztrmv_ pztrmv #define psatrmv_ psatrmv #define pdatrmv_ pdatrmv #define pcatrmv_ pcatrmv #define pzatrmv_ pzatrmv #define pstrsv_ pstrsv #define pdtrsv_ pdtrsv #define pctrsv_ pctrsv #define pztrsv_ pztrsv #define psger_ psger #define pdger_ pdger #define pcgeru_ pcgeru #define pzgeru_ pzgeru #define pcgerc_ pcgerc #define pzgerc_ pzgerc #define pssyr_ pssyr #define pdsyr_ pdsyr #define pcher_ pcher #define pzher_ pzher #define pssyr2_ pssyr2 #define pdsyr2_ pdsyr2 #define pcher2_ pcher2 #define pzher2_ pzher2 #define psgeadd_ psgeadd #define pdgeadd_ pdgeadd #define pcgeadd_ pcgeadd #define pzgeadd_ pzgeadd #define psgemm_ psgemm #define pdgemm_ pdgemm #define pcgemm_ pcgemm #define pzgemm_ pzgemm #define pssymm_ pssymm #define pdsymm_ pdsymm #define pcsymm_ pcsymm #define pchemm_ pchemm #define pzsymm_ pzsymm #define pzhemm_ pzhemm #define pstrmm_ pstrmm #define pdtrmm_ pdtrmm #define pctrmm_ pctrmm #define pztrmm_ pztrmm #define pstrsm_ pstrsm #define pdtrsm_ pdtrsm #define pctrsm_ pctrsm #define pztrsm_ pztrsm #define pssyrk_ pssyrk #define pdsyrk_ pdsyrk #define pcsyrk_ pcsyrk #define pcherk_ pcherk #define pzsyrk_ pzsyrk #define pzherk_ pzherk #define pssyr2k_ pssyr2k #define pdsyr2k_ pdsyr2k #define pcsyr2k_ pcsyr2k #define pcher2k_ pcher2k #define pzsyr2k_ pzsyr2k #define pzher2k_ pzher2k #define pstradd_ pstradd #define pdtradd_ pdtradd #define pctradd_ pctradd #define pztradd_ pztradd #define pstran_ pstran #define pdtran_ pdtran #define pctranu_ pctranu #define pztranu_ pztranu #define pctranc_ pctranc #define pztranc_ pztranc #endif #if( _F2C_CALL_ == _F2C_F77ISF2C ) #define PB_freebuf_ PB_freebuf__ #define PB_topget_ pb_topget__ #define PB_topset_ pb_topset__ #endif /* * --------------------------------------------------------------------- * Function prototypes * --------------------------------------------------------------------- */ #ifdef __STDC__ void PB_freebuf_ ( void ); void PB_topget_ ( int *, F_CHAR_T, F_CHAR_T, F_CHAR_T ); void PB_topset_ ( int *, F_CHAR_T, F_CHAR_T, F_CHAR_T ); void picopy_ ( int *, int *, int *, int *, int *, int *, int *, int *, int *, int *, int * ); void pscopy_ ( int *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int * ); void pdcopy_ ( int *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int * ); void pccopy_ ( int *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int * ); void pzcopy_ ( int *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int * ); void psswap_ ( int *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int * ); void pdswap_ ( int *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int * ); void pcswap_ ( int *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int * ); void pzswap_ ( int *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int * ); void psaxpy_ ( int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int * ); void pdaxpy_ ( int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int * ); void pcaxpy_ ( int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int * ); void pzaxpy_ ( int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int * ); void psscal_ ( int *, float *, float *, int *, int *, int *, int * ); void pdscal_ ( int *, double *, double *, int *, int *, int *, int * ); void pcscal_ ( int *, float *, float *, int *, int *, int *, int * ); void pcsscal_ ( int *, float *, float *, int *, int *, int *, int * ); void pzscal_ ( int *, double *, double *, int *, int *, int *, int * ); void pzdscal_ ( int *, double *, double *, int *, int *, int *, int * ); void psasum_ ( int *, float *, float *, int *, int *, int *, int * ); void pdasum_ ( int *, double *, double *, int *, int *, int *, int * ); void pscasum_ ( int *, float *, float *, int *, int *, int *, int * ); void pdzasum_ ( int *, double *, double *, int *, int *, int *, int * ); void psnrm2_ ( int *, float *, float *, int *, int *, int *, int * ); void pdnrm2_ ( int *, double *, double *, int *, int *, int *, int * ); void pscnrm2_ ( int *, float *, float *, int *, int *, int *, int * ); void pdznrm2_ ( int *, double *, double *, int *, int *, int *, int * ); void psdot_ ( int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int * ); void pddot_ ( int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int * ); void pcdotc_ ( int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int * ); void pcdotu_ ( int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int * ); void pzdotc_ ( int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int * ); void pzdotu_ ( int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int * ); void psamax_ ( int *, float *, int *, float *, int *, int *, int *, int * ); void pdamax_ ( int *, double *, int *, double *, int *, int *, int *, int * ); void pcamax_ ( int *, float *, int *, float *, int *, int *, int *, int * ); void pzamax_ ( int *, double *, int *, double *, int *, int *, int *, int * ); void psgemv_ ( F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pdgemv_ ( F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void pcgemv_ ( F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pzgemv_ ( F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void psagemv_ ( F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pdagemv_ ( F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void pcagemv_ ( F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pzagemv_ ( F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void psger_ ( int *, int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int *, float *, int *, int *, int * ); void pdger_ ( int *, int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int *, double *, int *, int *, int * ); void pcgerc_ ( int *, int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int *, float *, int *, int *, int * ); void pcgeru_ ( int *, int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int *, float *, int *, int *, int * ); void pzgerc_ ( int *, int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int *, double *, int *, int *, int * ); void pzgeru_ ( int *, int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int *, double *, int *, int *, int * ); void pssymv_ ( F_CHAR_T, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pdsymv_ ( F_CHAR_T, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void pchemv_ ( F_CHAR_T, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pzhemv_ ( F_CHAR_T, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void psasymv_ ( F_CHAR_T, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pdasymv_ ( F_CHAR_T, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void pcahemv_ ( F_CHAR_T, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pzahemv_ ( F_CHAR_T, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void pssyr_ ( F_CHAR_T, int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int * ); void pdsyr_ ( F_CHAR_T, int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int * ); void pcher_ ( F_CHAR_T, int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int * ); void pzher_ ( F_CHAR_T, int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int * ); void pssyr2_ ( F_CHAR_T, int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int *, float *, int *, int *, int * ); void pdsyr2_ ( F_CHAR_T, int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int *, double *, int *, int *, int * ); void pcher2_ ( F_CHAR_T, int *, float *, float *, int *, int *, int *, int *, float *, int *, int *, int *, int *, float *, int *, int *, int * ); void pzher2_ ( F_CHAR_T, int *, double *, double *, int *, int *, int *, int *, double *, int *, int *, int *, int *, double *, int *, int *, int * ); void pstrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, float *, int *, int *, int *, float *, int *, int *, int *, int * ); void pdtrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, double *, int *, int *, int *, double *, int *, int *, int *, int * ); void pctrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, float *, int *, int *, int *, float *, int *, int *, int *, int * ); void pztrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, double *, int *, int *, int *, double *, int *, int *, int *, int * ); void psatrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pdatrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void pcatrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, int *, float *, float *, int *, int *, int *, int * ); void pzatrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, int *, double *, double *, int *, int *, int *, int * ); void pstrsv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, float *, int *, int *, int *, float *, int *, int *, int *, int * ); void pdtrsv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, double *, int *, int *, int *, double *, int *, int *, int *, int * ); void pctrsv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, float *, int *, int *, int *, float *, int *, int *, int *, int * ); void pztrsv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, double *, int *, int *, int *, double *, int *, int *, int *, int * ); void psgeadd_ ( F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pdgeadd_ ( F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pcgeadd_ ( F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pzgeadd_ ( F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void psgemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pdgemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pcgemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pzgemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pssymm_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pdsymm_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pcsymm_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pzsymm_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pchemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pzhemm_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pssyr2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pdsyr2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pcsyr2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pzsyr2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pcher2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pzher2k_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pssyrk_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pdsyrk_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pcsyrk_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pzsyrk_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pcherk_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pzherk_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pstradd_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pdtradd_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pctradd_ ( F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pztradd_ ( F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pstran_ ( int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pdtran_ ( int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pctranc_ ( int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pztranc_ ( int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pctranu_ ( int *, int *, float *, float *, int *, int *, int *, float *, float *, int *, int *, int * ); void pztranu_ ( int *, int *, double *, double *, int *, int *, int *, double *, double *, int *, int *, int * ); void pstrmm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int * ); void pdtrmm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int * ); void pctrmm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int * ); void pztrmm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int * ); void pstrsm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int * ); void pdtrsm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int * ); void pctrsm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, float *, float *, int *, int *, int *, float *, int *, int *, int * ); void pztrsm_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, int *, double *, double *, int *, int *, int *, double *, int *, int *, int * ); #else void PB_freebuf_ (); void PB_topget_ (); void PB_topset_ (); void picopy_ (); void pscopy_ (); void pdcopy_ (); void pccopy_ (); void pzcopy_ (); void psswap_ (); void pdswap_ (); void pcswap_ (); void pzswap_ (); void psaxpy_ (); void pdaxpy_ (); void pcaxpy_ (); void pzaxpy_ (); void psscal_ (); void pdscal_ (); void pcscal_ (); void pcsscal_ (); void pzscal_ (); void pzdscal_ (); void psasum_ (); void pdasum_ (); void pscasum_ (); void pdzasum_ (); void psnrm2_ (); void pdnrm2_ (); void pscnrm2_ (); void pdznrm2_ (); void psdot_ (); void pddot_ (); void pcdotc_ (); void pcdotu_ (); void pzdotc_ (); void pzdotu_ (); void psamax_ (); void pdamax_ (); void pcamax_ (); void pzamax_ (); void psgemv_ (); void pdgemv_ (); void pcgemv_ (); void pzgemv_ (); void psagemv_ (); void pdagemv_ (); void pcagemv_ (); void pzagemv_ (); void psger_ (); void pdger_ (); void pcgerc_ (); void pcgeru_ (); void pzgerc_ (); void pzgeru_ (); void pssymv_ (); void pdsymv_ (); void pchemv_ (); void pzhemv_ (); void psasymv_ (); void pdasymv_ (); void pcahemv_ (); void pzahemv_ (); void pssyr_ (); void pdsyr_ (); void pcher_ (); void pzher_ (); void pssyr2_ (); void pdsyr2_ (); void pcher2_ (); void pzher2_ (); void pstrmv_ (); void pdtrmv_ (); void pctrmv_ (); void pztrmv_ (); void psatrmv_ (); void pdatrmv_ (); void pcatrmv_ (); void pzatrmv_ (); void pstrsv_ (); void pdtrsv_ (); void pctrsv_ (); void pztrsv_ (); void psgeadd_ (); void pdgeadd_ (); void pcgeadd_ (); void pzgeadd_ (); void psgemm_ (); void pdgemm_ (); void pcgemm_ (); void pzgemm_ (); void pssymm_ (); void pdsymm_ (); void pcsymm_ (); void pchemm_ (); void pzsymm_ (); void pzhemm_ (); void pssyr2k_ (); void pdsyr2k_ (); void pcsyr2k_ (); void pcher2k_ (); void pzsyr2k_ (); void pzher2k_ (); void pssyrk_ (); void pdsyrk_ (); void pcsyrk_ (); void pcherk_ (); void pzsyrk_ (); void pzherk_ (); void pstradd_ (); void pdtradd_ (); void pctradd_ (); void pztradd_ (); void pstran_ (); void pdtran_ (); void pctranc_ (); void pctranu_ (); void pztranc_ (); void pztranu_ (); void pstrmm_ (); void pdtrmm_ (); void pctrmm_ (); void pztrmm_ (); void pstrsm_ (); void pdtrsm_ (); void pctrsm_ (); void pztrsm_ (); #endif scalapack-2.0.2/PBLAS/SRC/PBtools.h000644 000766 000024 00000245355 11556766441 016771 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * March 12, 2002 * * --------------------------------------------------------------------- */ /* * This file includes PBLAS tools definitions. All PBLAS routines include * this file. * * ---------------------------------------------------------------------- * #define macro constants * --------------------------------------------------------------------- * * Descriptor entries for type 1 */ #define BLOCK_CYCLIC_2D 1 #define DTYPE1_ 0 /* Descriptor Type */ #define CTXT1_ 1 /* BLACS context */ #define M1_ 2 /* Global Number of Rows */ #define N1_ 3 /* Global Number of Columns */ #define MB1_ 4 /* Row Blocking Size */ #define NB1_ 5 /* Column Blocking Size */ #define RSRC1_ 6 /* Starting Processor Row */ #define CSRC1_ 7 /* Starting Processor Column */ #define LLD1_ 8 /* Local Leading Dimension */ #define DLEN1_ 9 /* Descriptor Length */ /* * Descriptor entries for type 2 */ #define BLOCK_CYCLIC_2D_INB 2 #define DTYPE_ 0 /* Descriptor Type */ #define CTXT_ 1 /* BLACS context */ #define M_ 2 /* Global Number of Rows */ #define N_ 3 /* Global Number of Columns */ #define IMB_ 4 /* Initial Row Blocking Size */ #define INB_ 5 /* Initial Column Blocking Size */ #define MB_ 6 /* Row Blocking Size */ #define NB_ 7 /* Column Blocking Size */ #define RSRC_ 8 /* Starting Process Row */ #define CSRC_ 9 /* Starting Process Column */ #define LLD_ 10 /* Local Leading Dimension */ #define DLEN_ 11 /* Descriptor Length */ #define CPACKING 'P' #define CUNPACKING 'U' #define PACKING "P" #define UNPACKING "U" #define CGENERAL 'G' /* #define CSYMM 'S' */ #define CHERM 'H' #define GENERAL "G" #define SYMM "S" #define HERM "H" #define ONE 1.0 #define TWO 2.0 #define ZERO 0.0 /* Input error checking related constants */ #define DESCMULT 100 #define BIGNUM 10000 /* * --------------------------------------------------------------------- * #define macro functions * --------------------------------------------------------------------- */ #define ABS( a_ ) ( ( (a_) < 0 ) ? -(a_) : (a_) ) #define MIN( a_, b_ ) ( ( (a_) < (b_) ) ? (a_) : (b_) ) #define MAX( a_, b_ ) ( ( (a_) > (b_) ) ? (a_) : (b_) ) #define FLOOR(a,b) (((a)>0) ? (((a)/(b))) : (-(((-(a))+(b)-1)/(b)))) #define CEIL(a,b) ( ( (a)+(b)-1 ) / (b) ) #define ICEIL(a,b) (((a)>0) ? ((((a)+(b)-1)/(b))) : (-((-(a))/(b)))) #define Mupcase(C) (((C)>96 && (C)<123) ? (C) & 0xDF : (C)) #define Mlowcase(C) (((C)>64 && (C)< 91) ? (C) | 32 : (C)) /* * The following macros perform common modulo operations; All functions * except MPosMod assume arguments are < d (i.e., arguments are themsel- * ves within modulo range). */ /* increment with mod */ #define MModInc(I, d) if(++(I) == (d)) (I) = 0 /* decrement with mod */ #define MModDec(I, d) if(--(I) == -1) (I) = (d)-1 /* positive modulo */ #define MPosMod(I, d) ( (I) - ((I)/(d))*(d) ) /* add two numbers */ #define MModAdd(I1, I2, d) \ ( ( (I1) + (I2) < (d) ) ? (I1) + (I2) : (I1) + (I2) - (d) ) /* add 1 to # */ #define MModAdd1(I, d) ( ((I) != (d)-1) ? (I) + 1 : 0 ) /* subtract two numbers */ #define MModSub(I1, I2, d) \ ( ( (I1) < (I2) ) ? (d) + (I1) - (I2) : (I1) - (I2) ) /* sub 1 from # */ #define MModSub1(I, d) ( ((I)!=0) ? (I)-1 : (d)-1 ) /* * DNROC computes maximum number of local rows or columns. This macro is * only used to compute the time estimates in the Level 3 PBLAS routines. */ #define DNROC( n_, nb_, p_ ) \ ((double)(((((n_)+(nb_)-1)/(nb_))+(p_)-1)/(p_))*(double)((nb_))) /* * Mptr returns a pointer to a_( i_, j_ ) for readability reasons and * also less silly errors ... * * There was some problems with the previous code which read: * * #define Mptr( a_, i_, j_, lda_, siz_ ) \ * ( (a_) + ( ( (i_)+(j_)*(lda_) )*(siz_) ) ) * * since it can overflow the 32-bit integer "easily". * The following code should fix the problem. * It uses the "off_t" command. * * Change made by Julien Langou on Sat. September 12, 2009. * Fix provided by John Moyard from CNES. * * JL :April 2011: Change off_t by long long * off_t is not supported under Windows */ #define Mptr( a_, i_, j_, lda_, siz_ ) \ ( (a_) + ( (long long) ( (long long)(i_)+ \ (long long)(j_)*(long long)(lda_))*(long long)(siz_) ) ) /* * Mfirstnb and Mlastnb compute the global size of the first and last * block corresponding to the interval i_:i_+n_-1 of global indexes. */ #define Mfirstnb( inbt_, n_, i_, inb_, nb_ ) \ inbt_ = (inb_) - (i_); \ if( inbt_ <= 0 ) \ inbt_ = ( (-inbt_) / (nb_) + 1 ) * (nb_) + inbt_; \ inbt_ = MIN( inbt_, (n_) ); #define Mlastnb( inbt_, n_, i_, inb_, nb_ ) \ inbt_ = (i_) + (n_) - (inb_); \ if( inbt_ > 0 ) \ { \ inbt_ = -( ( (nb_)+inbt_-1 )/(nb_)-1 )*(nb_) + inbt_; \ inbt_ = MIN( inbt_, (n_) ); \ } \ else { inbt_ = (n_); }; /* * Does the index interval i_:i_+n_-1 spans more than one process rows * or columns ? * * Mspan returns 0 (false) when the data is replicated (srcproc_ < 0) or * when there is only one process row or column in the process grid. */ #define Mspan( n_, i_, inb_, nb_, srcproc_, nprocs_ ) \ ( ( (srcproc_) >= 0 ) && ( ( (nprocs_) > 1 ) && \ ( ( (i_) < (inb_) ) ? \ ( (i_) + (n_) > (inb_) ) : \ ( (i_) + (n_) > (inb_) + \ ( ( (i_) - (inb_) ) / (nb_) + 1 ) * (nb_) ) ) ) ) /* * Mindxl2g computes the global index ig_ corresponding to the local * index il_ in process proc_. */ #define Mindxl2g( ig_, il_, inb_, nb_, proc_, srcproc_, nprocs_ ) \ { \ if( ( (srcproc_) >= 0 ) && ( (nprocs_) > 1 ) ) \ { \ if( (proc_) == (srcproc_) ) \ { \ if( (il_) < (inb_) ) ig_ = (il_); \ else ig_ = (il_) + \ (nb_)*((nprocs_)-1)*( ((il_)-(inb_)) / (nb_) + 1 ); \ } \ else if( (proc_) < (srcproc_) ) \ { \ ig_ = (il_) + (inb_) + \ (nb_)*( ((nprocs_)-1)*((il_)/(nb_)) + \ (proc_)-(srcproc_)-1+(nprocs_) ); \ } \ else \ { \ ig_ = (il_) + (inb_) + \ (nb_)*( ((nprocs_)-1)*((il_)/(nb_)) + \ (proc_)-(srcproc_)-1 ); \ } \ } \ else \ { \ ig_ = (il_); \ } \ } /* * Mindxg2p returns the process coodinate owning the entry globally * indexed by ig_. */ #define Mindxg2p( ig_, inb_, nb_, proc_, srcproc_, nprocs_ ) \ { \ if( ( (ig_) >= (inb_) ) && ( (srcproc_) >= 0 ) && \ ( (nprocs_) > 1 ) ) \ { \ proc_ = (srcproc_) + 1 + ( (ig_)-(inb_) ) / (nb_); \ proc_ -= ( proc_ / (nprocs_) ) * (nprocs_); \ } \ else \ { \ proc_ = (srcproc_); \ } \ } /* * Mnumroc computes the # of local indexes np_ residing in the process * of coordinate proc_ corresponding to the interval of global indexes * i_:i_+n_-1 assuming that the global index 0 resides in the process * srcproc_, and that the indexes are distributed from srcproc_ using * the parameters inb_, nb_ and nprocs_. */ #define Mnumroc( np_, n_, i_, inb_, nb_, proc_, srcproc_, nprocs_ ) \ { \ if( ( (srcproc_) >= 0 ) && ( (nprocs_) > 1 ) ) \ { \ int inb__, mydist__, n__, nblk__, quot__, src__; \ if( ( inb__ = (inb_) - (i_) ) <= 0 ) \ { \ src__ = (srcproc_) + ( nblk__ = (-inb__) / (nb_) + 1 ); \ src__ -= ( src__ / (nprocs_) ) * (nprocs_); \ inb__ += nblk__*(nb_); \ if( ( n__ = (n_) - inb__ ) <= 0 ) \ { if( (proc_) == src__ ) np_ = (n_); else np_ = 0; } \ else \ { \ if( ( mydist__ = (proc_) - src__ ) < 0 ) \ mydist__ += (nprocs_); \ nblk__ = n__ / (nb_) + 1; \ mydist__ -= nblk__ - \ ( quot__ = ( nblk__ / (nprocs_) ) ) * (nprocs_); \ if( mydist__ < 0 ) \ { \ if( (proc_) != src__ ) \ np_ = (nb_) + (nb_) * quot__; \ else \ np_ = inb__ + (nb_) * quot__; \ } \ else if( mydist__ > 0 ) \ { \ np_ = (nb_) * quot__; \ } \ else \ { \ if( (proc_) != src__ ) \ np_ = n__ + (nb_) + (nb_) * ( quot__ - nblk__ ); \ else \ np_ = (n_) + (nb_) * ( quot__ - nblk__ ); \ } \ } \ } \ else \ { \ if( ( n__ = (n_) - inb__ ) <= 0 ) \ { if( (proc_) == (srcproc_) ) np_ = (n_); else np_ = 0; } \ else \ { \ if( ( mydist__ = (proc_) - (srcproc_) ) < 0 ) \ mydist__ += (nprocs_); \ nblk__ = n__ / (nb_) + 1; \ mydist__ -= nblk__ - \ ( quot__ = ( nblk__ / (nprocs_) ) ) * (nprocs_); \ if( mydist__ < 0 ) \ { \ if( (proc_) != (srcproc_) ) \ np_ = (nb_) + (nb_) * quot__; \ else \ np_ = inb__ + (nb_) * quot__; \ } \ else if( mydist__ > 0 ) \ { \ np_ = (nb_) * quot__; \ } \ else \ { \ if( (proc_) != (srcproc_) ) \ np_ = n__ + (nb_) + (nb_) * ( quot__ - nblk__ ); \ else \ np_ = (n_) + (nb_) * ( quot__ - nblk__ ); \ } \ } \ } \ } \ else \ { \ np_ = (n_); \ } \ } #define Mnpreroc( np_, n_, i_, inb_, nb_, proc_, srcproc_, nprocs_ ) \ { \ if( ( (srcproc_) >= 0 ) && ( (nprocs_) > 1 ) ) \ { \ int inb__, mydist__, n__, nblk__, quot__, rem__, src__; \ if( ( inb__ = (inb_) - (i_) ) <= 0 ) \ { \ src__ = (srcproc_) + ( nblk__ = (-inb__) / (nb_) + 1 ); \ src__ -= ( src__ / (nprocs_) ) * (nprocs_); \ if( (proc_) != src__ ) \ { \ inb__ += nblk__*(nb_); \ if( ( n__ = (n_) - inb__ ) <= 0 ) { np_ = (n_); } \ else \ { \ if( ( mydist__ = (proc_) - src__ ) < 0 ) \ mydist__ += (nprocs_); \ nblk__ = n__ / (nb_) + 1; \ rem__ = nblk__ - \ ( quot__ = ( nblk__ / (nprocs_) ) ) * (nprocs_); \ if( mydist__ <= rem__ ) \ { \ np_ = inb__ - (nb_) + \ ( quot__ + 1 ) * mydist__ * (nb_); \ } \ else \ { \ np_ = (n_) + \ ( mydist__ - (nprocs_) ) * quot__ * (nb_); \ } \ } \ } \ else \ { \ np_ = 0; \ } \ } \ else \ { \ if( (proc_) != (srcproc_) ) \ { \ if( ( n__ = (n_) - inb__ ) <= 0 ) { np_ = (n_); } \ else \ { \ if( ( mydist__ = (proc_) - (srcproc_) ) < 0 ) \ mydist__ += (nprocs_); \ nblk__ = n__ / (nb_) + 1; \ rem__ = nblk__ - \ ( quot__ = ( nblk__ / (nprocs_) ) ) * (nprocs_); \ if( mydist__ <= rem__ ) \ { \ np_ = inb__ - (nb_) + \ ( quot__ + 1 ) * mydist__ * (nb_); \ } \ else \ { \ np_ = (n_) + \ ( mydist__ - (nprocs_) ) * quot__ * (nb_); \ } \ } \ } \ else \ { \ np_ = 0; \ } \ } \ } \ else \ { \ np_ = 0; \ } \ } #define Mnnxtroc( np_, n_, i_, inb_, nb_, proc_, srcproc_, nprocs_ ) \ { \ if( ( (srcproc_) >= 0 ) && ( (nprocs_) > 1 ) ) \ { \ int inb__, mydist__, n__, nblk__, quot__, rem__, src__; \ if( ( inb__ = (inb_) - (i_) ) <= 0 ) \ { \ src__ = (srcproc_) + ( nblk__ = (-inb__) / (nb_) + 1 ); \ src__ -= ( src__ / (nprocs_) ) * (nprocs_); \ inb__ += nblk__*(nb_); \ if( ( n__ = (n_) - inb__ ) <= 0 ) { np_ = 0; } \ else \ { \ if( ( mydist__ = (proc_) - src__ ) < 0 ) \ mydist__ += (nprocs_); \ nblk__ = n__ / (nb_) + 1; \ rem__ = nblk__ - \ ( quot__ = ( nblk__ / (nprocs_) ) ) * (nprocs_); \ if( mydist__ < rem__ ) \ { \ np_ = n__ - ( quot__ * mydist__ + \ quot__ + mydist__ ) * (nb_); \ } \ else \ { \ np_ = ( (nprocs_) - 1 - mydist__ ) * quot__ * (nb_); \ } \ } \ } \ else \ { \ if( ( n__ = (n_) - inb__ ) <= 0 ) { np_ = 0; } \ else \ { \ if( ( mydist__ = (proc_) - (srcproc_) ) < 0 ) \ mydist__ += (nprocs_); \ nblk__ = n__ / (nb_) + 1; \ rem__ = nblk__ - \ ( quot__ = ( nblk__ / (nprocs_) ) ) * (nprocs_); \ if( mydist__ < rem__ ) \ { \ np_ = n__ - ( quot__ * mydist__ + \ quot__ + mydist__ ) * (nb_); \ } \ else \ { \ np_ = ( (nprocs_) - 1 - mydist__ ) * quot__ * (nb_); \ } \ } \ } \ } \ else \ { np_ = 0; } \ } #define Minfog2l( i_, j_, desc_, nr_, nc_, r_, c_, ii_, jj_, pr_, pc_ ) \ { \ int quot__, i__, imb__, inb__, j__, mb__, mydist__, \ nb__, nblk__, src__; \ imb__ = desc_[IMB_]; mb__ = desc_[MB_]; pr_ = desc_[RSRC_]; \ if( ( pr_ >= 0 ) && ( nr_ > 1 ) ) \ { \ if( ( i__ = (i_) - imb__ ) < 0 ) \ { ii_ = ( r_ == pr_ ? (i_) : 0 ); } \ else \ { \ src__ = pr_; \ pr_ += ( nblk__ = i__ / mb__ + 1 ); \ pr_ -= ( pr_ / nr_ ) * nr_; \ if( ( mydist__ = r_ - src__ ) < 0 ) mydist__ += nr_; \ if( mydist__ >= nblk__ - ( quot__ = nblk__ / nr_ ) * nr_ ) \ { \ if( r_ != src__ ) ii_ = mb__; \ else ii_ = imb__; \ if( r_ != pr_ ) \ ii_ += ( quot__ - 1 ) * mb__; \ else \ ii_ += i__ + ( quot__ - nblk__ ) * mb__; \ } \ else \ { \ if( r_ != src__ ) ii_ = mb__ + quot__ * mb__; \ else ii_ = imb__ + quot__ * mb__; \ } \ } \ } \ else \ { \ ii_ = (i_); \ } \ inb__ = desc_[INB_]; nb__ = desc_[NB_]; pc_ = desc_[CSRC_]; \ if( ( pc_ >= 0 ) && ( nc_ > 1 ) ) \ { \ if( ( j__ = (j_) - inb__ ) < 0 ) \ { jj_ = ( c_ == pc_ ? (j_) : 0 ); } \ else \ { \ src__ = pc_; \ pc_ += ( nblk__ = j__ / nb__ + 1 ); \ pc_ -= ( pc_ / nc_ ) * nc_; \ if( ( mydist__ = c_ - src__ ) < 0 ) mydist__ += nc_; \ if( mydist__ >= nblk__ - ( quot__ = nblk__ / nc_ ) * nc_ ) \ { \ if( c_ != src__ ) jj_ = nb__; \ else jj_ = inb__; \ if( c_ != pc_ ) \ jj_ += ( quot__ - 1 ) * nb__; \ else \ jj_ += j__ + ( quot__ - nblk__ ) * nb__; \ } \ else \ { \ if( c_ != src__ ) jj_ = nb__ + quot__ * nb__; \ else jj_ = inb__ + quot__ * nb__; \ } \ } \ } \ else \ { \ jj_ = (j_); \ } \ } /* * The following macros initialize or translate descriptors. */ #define MDescSet( desc, m, n, imb, inb, mb, nb, rsrc, csrc, ictxt, lld ) \ { \ (desc)[DTYPE_] = BLOCK_CYCLIC_2D_INB; \ (desc)[CTXT_ ] = (ictxt); \ (desc)[M_ ] = (m); \ (desc)[N_ ] = (n); \ (desc)[IMB_ ] = (imb); \ (desc)[INB_ ] = (inb); \ (desc)[MB_ ] = (mb); \ (desc)[NB_ ] = (nb); \ (desc)[RSRC_ ] = (rsrc); \ (desc)[CSRC_ ] = (csrc); \ (desc)[LLD_ ] = (lld); \ } #define MDescCopy(DescIn, DescOut) \ { \ (DescOut)[DTYPE_] = (DescIn)[DTYPE_]; \ (DescOut)[M_ ] = (DescIn)[M_ ]; \ (DescOut)[N_ ] = (DescIn)[N_ ]; \ (DescOut)[IMB_ ] = (DescIn)[IMB_ ]; \ (DescOut)[INB_ ] = (DescIn)[INB_ ]; \ (DescOut)[MB_ ] = (DescIn)[MB_ ]; \ (DescOut)[NB_ ] = (DescIn)[NB_ ]; \ (DescOut)[RSRC_ ] = (DescIn)[RSRC_ ]; \ (DescOut)[CSRC_ ] = (DescIn)[CSRC_ ]; \ (DescOut)[CTXT_ ] = (DescIn)[CTXT_ ]; \ (DescOut)[LLD_ ] = (DescIn)[LLD_ ]; \ } #define MDescTrans(DescIn, DescOut) \ { \ if ( (DescIn)[DTYPE_] == BLOCK_CYCLIC_2D ) \ { \ (DescOut)[DTYPE_] = BLOCK_CYCLIC_2D_INB; \ (DescOut)[M_ ] = (DescIn)[M1_ ]; \ (DescOut)[N_ ] = (DescIn)[N1_ ]; \ (DescOut)[IMB_ ] = (DescIn)[MB1_ ]; \ (DescOut)[INB_ ] = (DescIn)[NB1_ ]; \ (DescOut)[MB_ ] = (DescIn)[MB1_ ]; \ (DescOut)[NB_ ] = (DescIn)[NB1_ ]; \ (DescOut)[RSRC_ ] = (DescIn)[RSRC1_ ]; \ (DescOut)[CSRC_ ] = (DescIn)[CSRC1_ ]; \ (DescOut)[CTXT_ ] = (DescIn)[CTXT1_ ]; \ (DescOut)[LLD_ ] = (DescIn)[LLD1_ ]; \ } \ else if ( (DescIn)[DTYPE_] == BLOCK_CYCLIC_2D_INB ) \ { \ (DescOut)[DTYPE_] = BLOCK_CYCLIC_2D_INB; \ (DescOut)[M_ ] = (DescIn)[M_ ]; \ (DescOut)[N_ ] = (DescIn)[N_ ]; \ (DescOut)[IMB_ ] = (DescIn)[IMB_ ]; \ (DescOut)[INB_ ] = (DescIn)[INB_ ]; \ (DescOut)[MB_ ] = (DescIn)[MB_ ]; \ (DescOut)[NB_ ] = (DescIn)[NB_ ]; \ (DescOut)[RSRC_ ] = (DescIn)[RSRC_ ]; \ (DescOut)[CSRC_ ] = (DescIn)[CSRC_ ]; \ (DescOut)[CTXT_ ] = (DescIn)[CTXT_ ]; \ (DescOut)[LLD_ ] = (DescIn)[LLD_ ]; \ } \ else \ { \ (DescOut)[DTYPE_] = (DescIn)[0]; \ (DescOut)[CTXT_ ] = (DescIn)[1]; \ (DescOut)[M_ ] = 0; \ (DescOut)[N_ ] = 0; \ (DescOut)[IMB_ ] = 1; \ (DescOut)[INB_ ] = 1; \ (DescOut)[MB_ ] = 1; \ (DescOut)[NB_ ] = 1; \ (DescOut)[RSRC_ ] = 0; \ (DescOut)[CSRC_ ] = 0; \ (DescOut)[LLD_ ] = 1; \ } \ } #define MIndxTrans( I, J, i, j ) \ { \ i = *I - 1; \ j = *J - 1; \ } #if( _F2C_CALL_ == _F2C_ADD_ ) /* * These defines set up the naming scheme required to have a FORTRAN * routine called by a C routine. No redefinition is necessary to have * the following FORTRAN to C interface: * * FORTRAN DECLARATION C CALL * SUBROUTINE PDFOO(...) pdfoo_(...) * * This is the PBLAS default. */ #endif #if( _F2C_CALL_ == _F2C_F77ISF2C ) /* * These defines set up the naming scheme required to have a FORTRAN * routine called by a C routine for systems where the FORTRAN compiler * is actually f2c (a FORTRAN to C conversion utility). * * FORTRAN DECLARATION C CALL * SUBROUTINE PDFOO(...) pdfoo__(...) */ #endif #if( _F2C_CALL_ == _F2C_UPCASE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine called by a C routine with the following FORTRAN to C inter- * face: * * FORTRAN DECLARATION C CALL * SUBROUTINE PDFOO(...) PDFOO(...) */ #define immadd_ IMMADD #define smmadd_ SMMADD #define dmmadd_ DMMADD #define cmmadd_ CMMADD #define zmmadd_ ZMMADD #define immtadd_ IMMTADD #define smmtadd_ SMMTADD #define dmmtadd_ DMMTADD #define cmmtadd_ CMMTADD #define zmmtadd_ ZMMTADD #define smmcadd_ SMMCADD #define dmmcadd_ DMMCADD #define cmmcadd_ CMMCADD #define zmmcadd_ ZMMCADD #define smmtcadd_ SMMTCADD #define dmmtcadd_ DMMTCADD #define cmmtcadd_ CMMTCADD #define zmmtcadd_ ZMMTCADD #define immdda_ IMMDDA #define smmdda_ SMMDDA #define dmmdda_ DMMDDA #define cmmdda_ CMMDDA #define zmmdda_ ZMMDDA #define smmddac_ SMMDDAC #define dmmddac_ DMMDDAC #define cmmddac_ CMMDDAC #define zmmddac_ ZMMDDAC #define immddat_ IMMDDAT #define smmddat_ SMMDDAT #define dmmddat_ DMMDDAT #define cmmddat_ CMMDDAT #define zmmddat_ ZMMDDAT #define smmddact_ SMMDDACT #define dmmddact_ DMMDDACT #define cmmddact_ CMMDDACT #define zmmddact_ ZMMDDACT #define sasqrtb_ SASQRTB #define dasqrtb_ DASQRTB #define sset_ SSET #define dset_ DSET #define cset_ CSET #define zset_ ZSET #define svasum_ SVASUM #define dvasum_ DVASUM #define scvasum_ SCVASUM #define dzvasum_ DZVASUM #define sascal_ SASCAL #define dascal_ DASCAL #define scshft_ SCSHFT #define dcshft_ DCSHFT #define ccshft_ CCSHFT #define zcshft_ ZCSHFT #define srshft_ SRSHFT #define drshft_ DRSHFT #define crshft_ CRSHFT #define zrshft_ ZRSHFT #define svvdot_ SVVDOT #define dvvdot_ DVVDOT #define cvvdotc_ CVVDOTC #define cvvdotu_ CVVDOTU #define zvvdotc_ ZVVDOTC #define zvvdotu_ ZVVDOTU #define stzpad_ STZPAD #define dtzpad_ DTZPAD #define ctzpad_ CTZPAD #define ztzpad_ ZTZPAD #define stzpadcpy_ STZPADCPY #define dtzpadcpy_ DTZPADCPY #define ctzpadcpy_ CTZPADCPY #define ztzpadcpy_ ZTZPADCPY #define stzscal_ STZSCAL #define dtzscal_ DTZSCAL #define ctzscal_ CTZSCAL #define ztzscal_ ZTZSCAL #define chescal_ CHESCAL #define zhescal_ ZHESCAL #define ctzcnjg_ CTZCNJG #define ztzcnjg_ ZTZCNJG #define sagemv_ SAGEMV #define dagemv_ DAGEMV #define cagemv_ CAGEMV #define zagemv_ ZAGEMV #define sasymv_ SASYMV #define dasymv_ DASYMV #define casymv_ CASYMV #define zasymv_ ZASYMV #define cahemv_ CAHEMV #define zahemv_ ZAHEMV #define satrmv_ SATRMV #define datrmv_ DATRMV #define catrmv_ CATRMV #define zatrmv_ ZATRMV #define csymv_ CSYMV #define zsymv_ ZSYMV #define csyr_ CSYR #define zsyr_ ZSYR #define csyr2_ CSYR2 #define zsyr2_ ZSYR2 #endif #if( _F2C_CALL_ == _F2C_NOCHANGE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine called by a C routine with the following FORTRAN to C inter- * face: * * FORTRAN DECLARATION C CALL * SUBROUTINE PDFOO(...) pdfoo(...) */ #define immadd_ immadd #define smmadd_ smmadd #define dmmadd_ dmmadd #define cmmadd_ cmmadd #define zmmadd_ zmmadd #define immtadd_ immtadd #define smmtadd_ smmtadd #define dmmtadd_ dmmtadd #define cmmtadd_ cmmtadd #define zmmtadd_ zmmtadd #define smmcadd_ smmcadd #define dmmcadd_ dmmcadd #define cmmcadd_ cmmcadd #define zmmcadd_ zmmcadd #define smmtcadd_ smmtcadd #define dmmtcadd_ dmmtcadd #define cmmtcadd_ cmmtcadd #define zmmtcadd_ zmmtcadd #define immdda_ immdda #define smmdda_ smmdda #define dmmdda_ dmmdda #define cmmdda_ cmmdda #define zmmdda_ zmmdda #define smmddac_ smmddac #define dmmddac_ dmmddac #define cmmddac_ cmmddac #define zmmddac_ zmmddac #define immddat_ immddat #define smmddat_ smmddat #define dmmddat_ dmmddat #define cmmddat_ cmmddat #define zmmddat_ zmmddat #define smmddact_ smmddact #define dmmddact_ dmmddact #define cmmddact_ cmmddact #define zmmddact_ zmmddact #define sasqrtb_ sasqrtb #define dasqrtb_ dasqrtb #define sset_ sset #define dset_ dset #define cset_ cset #define zset_ zset #define svasum_ svasum #define dvasum_ dvasum #define scvasum_ scvasum #define dzvasum_ dzvasum #define sascal_ sascal #define dascal_ dascal #define scshft_ scshft #define dcshft_ dcshft #define ccshft_ ccshft #define zcshft_ zcshft #define srshft_ srshft #define drshft_ drshft #define crshft_ crshft #define zrshft_ zrshft #define svvdot_ svvdot #define dvvdot_ dvvdot #define cvvdotc_ cvvdotc #define cvvdotu_ cvvdotu #define zvvdotc_ zvvdotc #define zvvdotu_ zvvdotu #define stzpad_ stzpad #define dtzpad_ dtzpad #define ctzpad_ ctzpad #define ztzpad_ ztzpad #define stzpadcpy_ stzpadcpy #define dtzpadcpy_ dtzpadcpy #define ctzpadcpy_ ctzpadcpy #define ztzpadcpy_ ztzpadcpy #define stzscal_ stzscal #define dtzscal_ dtzscal #define ctzscal_ ctzscal #define ztzscal_ ztzscal #define chescal_ chescal #define zhescal_ zhescal #define ctzcnjg_ ctzcnjg #define ztzcnjg_ ztzcnjg #define sagemv_ sagemv #define dagemv_ dagemv #define cagemv_ cagemv #define zagemv_ zagemv #define sasymv_ sasymv #define dasymv_ dasymv #define casymv_ casymv #define zasymv_ zasymv #define cahemv_ cahemv #define zahemv_ zahemv #define satrmv_ satrmv #define datrmv_ datrmv #define catrmv_ catrmv #define zatrmv_ zatrmv #define csymv_ csymv #define zsymv_ zsymv #define csyr_ csyr #define zsyr_ zsyr #define csyr2_ csyr2 #define zsyr2_ zsyr2 #endif /* * --------------------------------------------------------------------- * Function prototypes * --------------------------------------------------------------------- */ #ifdef __STDC__ F_VOID_FCT immadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT smmadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT dmmadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT cmmadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zmmadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT smmcadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT dmmcadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT cmmcadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zmmcadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT immtadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT smmtadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT dmmtadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT cmmtadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zmmtadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT smmtcadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT dmmtcadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT cmmtcadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zmmtcadd_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT immdda_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT smmdda_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT dmmdda_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT cmmdda_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zmmdda_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT smmddac_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT dmmddac_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT cmmddac_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zmmddac_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT immddat_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT smmddat_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT dmmddat_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT cmmddat_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zmmddat_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT smmddact_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT dmmddact_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT cmmddact_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT zmmddact_ ( int *, int *, char *, char *, int *, char *, char *, int * ); F_VOID_FCT sasqrtb_ ( float *, float *, float * ); F_VOID_FCT dasqrtb_ ( double *, double *, double * ); F_VOID_FCT sset_ ( int *, char *, char *, int * ); F_VOID_FCT dset_ ( int *, char *, char *, int * ); F_VOID_FCT cset_ ( int *, char *, char *, int * ); F_VOID_FCT zset_ ( int *, char *, char *, int * ); F_VOID_FCT svasum_ ( int *, char *, char *, int * ); F_VOID_FCT dvasum_ ( int *, char *, char *, int * ); F_VOID_FCT scvasum_ ( int *, char *, char *, int * ); F_VOID_FCT dzvasum_ ( int *, char *, char *, int * ); F_VOID_FCT sascal_ ( int *, char *, char *, int * ); F_VOID_FCT dascal_ ( int *, char *, char *, int * ); F_VOID_FCT scshft_ ( int *, int *, int *, char *, int * ); F_VOID_FCT dcshft_ ( int *, int *, int *, char *, int * ); F_VOID_FCT ccshft_ ( int *, int *, int *, char *, int * ); F_VOID_FCT zcshft_ ( int *, int *, int *, char *, int * ); F_VOID_FCT srshft_ ( int *, int *, int *, char *, int * ); F_VOID_FCT drshft_ ( int *, int *, int *, char *, int * ); F_VOID_FCT crshft_ ( int *, int *, int *, char *, int * ); F_VOID_FCT zrshft_ ( int *, int *, int *, char *, int * ); F_VOID_FCT svvdot_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT dvvdot_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT cvvdotu_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT cvvdotc_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT zvvdotu_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT zvvdotc_ ( int *, char *, char *, int *, char *, int * ); F_VOID_FCT stzpad_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, char *, int * ); F_VOID_FCT dtzpad_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, char *, int * ); F_VOID_FCT ctzpad_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, char *, int * ); F_VOID_FCT ztzpad_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, char *, char *, int * ); F_VOID_FCT stzpadcpy_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, int *, char *, int * ); F_VOID_FCT dtzpadcpy_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, int *, char *, int * ); F_VOID_FCT ctzpadcpy_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, int *, char *, int * ); F_VOID_FCT ztzpadcpy_ ( F_CHAR_T, F_CHAR_T, int *, int *, int *, char *, int *, char *, int * ); F_VOID_FCT stzscal_ ( F_CHAR_T, int *, int *, int *, char *, char *, int * ); F_VOID_FCT dtzscal_ ( F_CHAR_T, int *, int *, int *, char *, char *, int * ); F_VOID_FCT ctzscal_ ( F_CHAR_T, int *, int *, int *, char *, char *, int * ); F_VOID_FCT ztzscal_ ( F_CHAR_T, int *, int *, int *, char *, char *, int * ); F_VOID_FCT chescal_ ( F_CHAR_T, int *, int *, int *, char *, char *, int * ); F_VOID_FCT zhescal_ ( F_CHAR_T, int *, int *, int *, char *, char *, int * ); F_VOID_FCT ctzcnjg_ ( F_CHAR_T, int *, int *, int *, char *, char *, int * ); F_VOID_FCT ztzcnjg_ ( F_CHAR_T, int *, int *, int *, char *, char *, int * ); F_VOID_FCT sagemv_ ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT dagemv_ ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT cagemv_ ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zagemv_ ( F_CHAR_T, int *, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT sasymv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT dasymv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT casymv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zasymv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT cahemv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zahemv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT satrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT datrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT catrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zatrmv_ ( F_CHAR_T, F_CHAR_T, F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT csymv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT zsymv_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, char *, int * ); F_VOID_FCT csyr_ ( F_CHAR_T, int *, char *, char *, int *, char *, int * ); F_VOID_FCT zsyr_ ( F_CHAR_T, int *, char *, char *, int *, char *, int * ); F_VOID_FCT csyr2_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, int * ); F_VOID_FCT zsyr2_ ( F_CHAR_T, int *, char *, char *, int *, char *, int *, char *, int * ); void PB_Ctzsyr ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int ); void PB_Ctzher ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int ); void PB_Ctzsyr2 ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_Ctzher2 ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_Ctztrmv ( PBTYP_T *, char *, char *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int ); void PB_Ctzatrmv ( PBTYP_T *, char *, char *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int ); void PB_Ctzsymv ( PBTYP_T *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_Ctzhemv ( PBTYP_T *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_Ctzasymv ( PBTYP_T *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_Ctzahemv ( PBTYP_T *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_Ctzsyrk ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int ); void PB_Ctzherk ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int ); void PB_Ctzsyr2k ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_Ctzher2k ( PBTYP_T *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_Ctztrmm ( PBTYP_T *, char *, char *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int ); void PB_Ctzsymm ( PBTYP_T *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_Ctzhemm ( PBTYP_T *, char *, char *, int, int, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int ); void PB_CpswapNN ( PBTYP_T *, int, char *, int, int, int *, int, char *, int, int, int *, int ); void PB_CpswapND ( PBTYP_T *, int, char *, int, int, int *, int, char *, int, int, int *, int ); void PB_Cpdot11 ( PBTYP_T *, int, char *, char *, int, int, int *, int, char *, int, int, int *, int, VVDOT_T ); void PB_CpdotNN ( PBTYP_T *, int, char *, char *, int, int, int *, int, char *, int, int, int *, int, VVDOT_T ); void PB_CpdotND ( PBTYP_T *, int, char *, char *, int, int, int *, int, char *, int, int, int *, int, VVDOT_T ); void PB_CpaxpbyNN ( PBTYP_T *, char *, int, int, char *, char *, int, int, int *, char *, char *, char *, int, int, int *, char * ); void PB_CpaxpbyND ( PBTYP_T *, char *, int, int, char *, char *, int, int, int *, char *, char *, char *, int, int, int *, char * ); void PB_CpaxpbyDN ( PBTYP_T *, char *, int, int, char *, char *, int, int, int *, char *, char *, char *, int, int, int *, char * ); void PB_Cpaxpby ( PBTYP_T *, char *, int, int, char *, char *, int, int, int *, char *, char *, char *, int, int, int *, char * ); void PB_Cpsyr ( PBTYP_T *, char *, int, int, char *, char *, int, char *, int, char *, int, int, int *, TZSYR_T ); void PB_Cpsyr2 ( PBTYP_T *, char *, int, int, char *, char *, int, char *, int, char *, int, char *, int, char *, int, int, int *, TZSYR2_T ); void PB_Cptrm ( PBTYP_T *, PBTYP_T *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, char *, int, TZTRM_T ); void PB_Cpsym ( PBTYP_T *, PBTYP_T *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, char *, int, char *, int, char *, int, TZSYM_T ); void PB_Cpgeadd ( PBTYP_T *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_Cptradd ( PBTYP_T *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_Cptran ( PBTYP_T *, char *, int, int, char *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_Cptrsv ( PBTYP_T *, int, char *, char *, char *, int, char *, int, int, int *, char *, int, char *, int ); void PB_Cptrsm ( PBTYP_T *, int, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, char *, int ); void PB_CpgemmAB ( PBTYP_T *, char *, char *, char *, char *, int, int, int, char *, char *, int, int, int *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_CpgemmAC ( PBTYP_T *, char *, char *, char *, char *, int, int, int, char *, char *, int, int, int *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_CpgemmBC ( PBTYP_T *, char *, char *, char *, char *, int, int, int, char *, char *, int, int, int *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_CpsymmAB ( PBTYP_T *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_CpsymmBC ( PBTYP_T *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_CpsyrkA ( PBTYP_T *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_CpsyrkAC ( PBTYP_T *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_Cpsyr2kA ( PBTYP_T *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_Cpsyr2kAC ( PBTYP_T *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int *, char *, char *, int, int, int * ); void PB_CptrmmAB ( PBTYP_T *, char *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int * ); void PB_CptrmmB ( PBTYP_T *, char *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int * ); void PB_CptrsmAB ( PBTYP_T *, char *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int * ); void PB_CptrsmAB0 ( PBTYP_T *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int *, char * *, int *, int * ); void PB_CptrsmAB1 ( PBTYP_T *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int *, char *, int * ); void PB_CptrsmB ( PBTYP_T *, char *, char *, char *, char *, char *, int, int, char *, char *, int, int, int *, char *, int, int, int * ); #else F_VOID_FCT immadd_ (); F_VOID_FCT smmadd_ (); F_VOID_FCT dmmadd_ (); F_VOID_FCT cmmadd_ (); F_VOID_FCT zmmadd_ (); F_VOID_FCT smmcadd_ (); F_VOID_FCT dmmcadd_ (); F_VOID_FCT cmmcadd_ (); F_VOID_FCT zmmcadd_ (); F_VOID_FCT immtadd_ (); F_VOID_FCT smmtadd_ (); F_VOID_FCT dmmtadd_ (); F_VOID_FCT cmmtadd_ (); F_VOID_FCT zmmtadd_ (); F_VOID_FCT smmtcadd_ (); F_VOID_FCT dmmtcadd_ (); F_VOID_FCT cmmtcadd_ (); F_VOID_FCT zmmtcadd_ (); F_VOID_FCT immdda_ (); F_VOID_FCT smmdda_ (); F_VOID_FCT dmmdda_ (); F_VOID_FCT cmmdda_ (); F_VOID_FCT zmmdda_ (); F_VOID_FCT smmddac_ (); F_VOID_FCT dmmddac_ (); F_VOID_FCT cmmddac_ (); F_VOID_FCT zmmddac_ (); F_VOID_FCT immddat_ (); F_VOID_FCT smmddat_ (); F_VOID_FCT dmmddat_ (); F_VOID_FCT cmmddat_ (); F_VOID_FCT zmmddat_ (); F_VOID_FCT smmddact_ (); F_VOID_FCT dmmddact_ (); F_VOID_FCT cmmddact_ (); F_VOID_FCT zmmddact_ (); F_VOID_FCT sasqrtb_ (); F_VOID_FCT dasqrtb_ (); F_VOID_FCT sset_ (); F_VOID_FCT dset_ (); F_VOID_FCT cset_ (); F_VOID_FCT zset_ (); F_VOID_FCT svasum_ (); F_VOID_FCT dvasum_ (); F_VOID_FCT scvasum_ (); F_VOID_FCT dzvasum_ (); F_VOID_FCT sascal_ (); F_VOID_FCT dascal_ (); F_VOID_FCT scshft_ (); F_VOID_FCT dcshft_ (); F_VOID_FCT ccshft_ (); F_VOID_FCT zcshft_ (); F_VOID_FCT srshft_ (); F_VOID_FCT drshft_ (); F_VOID_FCT crshft_ (); F_VOID_FCT zrshft_ (); F_VOID_FCT svvdot_ (); F_VOID_FCT dvvdot_ (); F_VOID_FCT cvvdotc_ (); F_VOID_FCT cvvdotu_ (); F_VOID_FCT zvvdotc_ (); F_VOID_FCT zvvdotu_ (); F_VOID_FCT stzpad_ (); F_VOID_FCT dtzpad_ (); F_VOID_FCT ctzpad_ (); F_VOID_FCT ztzpad_ (); F_VOID_FCT stzpadcpy_ (); F_VOID_FCT dtzpadcpy_ (); F_VOID_FCT ctzpadcpy_ (); F_VOID_FCT ztzpadcpy_ (); F_VOID_FCT stzscal_ (); F_VOID_FCT dtzscal_ (); F_VOID_FCT ctzscal_ (); F_VOID_FCT ztzscal_ (); F_VOID_FCT chescal_ (); F_VOID_FCT zhescal_ (); F_VOID_FCT ctzcnjg_ (); F_VOID_FCT ztzcnjg_ (); F_VOID_FCT sagemv_ (); F_VOID_FCT dagemv_ (); F_VOID_FCT cagemv_ (); F_VOID_FCT zagemv_ (); F_VOID_FCT sasymv_ (); F_VOID_FCT dasymv_ (); F_VOID_FCT casymv_ (); F_VOID_FCT zasymv_ (); F_VOID_FCT cahemv_ (); F_VOID_FCT zahemv_ (); F_VOID_FCT satrmv_ (); F_VOID_FCT datrmv_ (); F_VOID_FCT catrmv_ (); F_VOID_FCT zatrmv_ (); F_VOID_FCT csymv_ (); F_VOID_FCT zsymv_ (); F_VOID_FCT csyr_ (); F_VOID_FCT zsyr_ (); F_VOID_FCT csyr2_ (); F_VOID_FCT zsyr2_ (); void PB_Ctzsyr (); void PB_Ctzher (); void PB_Ctzsyr2 (); void PB_Ctzher2 (); void PB_Ctztrmv (); void PB_Ctzatrmv (); void PB_Ctzsymv (); void PB_Ctzhemv (); void PB_Ctzasymv (); void PB_Ctzahemv (); void PB_Ctzsyrk (); void PB_Ctzherk (); void PB_Ctzsyr2k (); void PB_Ctzher2k (); void PB_Ctztrmm (); void PB_Ctzsymm (); void PB_Ctzhemm (); void PB_CpswapNN (); void PB_CpswapND (); void PB_Cpdot11 (); void PB_CpdotNN (); void PB_CpdotND (); void PB_CpaxpbyNN (); void PB_CpaxpbyND (); void PB_CpaxpbyDN (); void PB_Cpaxpby (); void PB_Cpsyr (); void PB_Cpsyr2 (); void PB_Cptrm (); void PB_Cpsym (); void PB_Cpgeadd (); void PB_Cptradd (); void PB_Cptran (); void PB_Cptrsv (); void PB_Cptrsm (); void PB_CpgemmAB (); void PB_CpgemmAC (); void PB_CpgemmBC (); void PB_CpsymmAB (); void PB_CpsymmBC (); void PB_CpsyrkA (); void PB_CpsyrkAC (); void PB_Cpsyr2kA (); void PB_Cpsyr2kAC (); void PB_CptrmmAB (); void PB_CptrmmB (); void PB_CptrsmAB (); void PB_CptrsmAB0 (); void PB_CptrsmAB1 (); void PB_CptrsmB (); #endif /* TOOLS */ #ifdef __STDC__ int PB_Cgcd ( int, int ); int PB_Clcm ( int, int ); void PB_Cdescset ( int *, int, int, int, int, int, int, int, int, int, int ); void PB_Cdescribe ( int, int, int, int, int *, int, int, int, int, int *, int *, int *, int *, int *, int *, int *, int *, int *, int * ); void PB_CargFtoC ( int, int, int *, int *, int *, int * ); int PB_Cfirstnb ( int, int, int, int ); int PB_Clastnb ( int, int, int, int ); int PB_Cspan ( int, int, int, int, int, int ); void PB_Cainfog2l ( int, int, int, int, int *, int, int, int, int, int *, int *, int *, int *, int *, int *, int *, int *, int *, int * ); void PB_Cinfog2l ( int, int, int *, int, int, int, int, int *, int *, int *, int * ); int PB_Cg2lrem ( int, int, int, int, int, int ); int PB_Cindxg2p ( int, int, int, int, int, int ); int PB_Cnumroc ( int, int, int, int, int, int, int ); int PB_Cnpreroc ( int, int, int, int, int, int, int ); int PB_Cnnxtroc ( int, int, int, int, int, int, int ); void PB_Cconjg ( PBTYP_T *, char *, char * ); void PB_Cwarn ( int, int, char *, char *, ... ); void PB_Cabort ( int, char *, int ); void PB_Cchkmat ( int, char *, char *, int, int, int, int, int, int, int *, int, int * ); void PB_Cchkvec ( int, char *, char *, int, int, int, int, int *, int, int, int * ); char * PB_Cmalloc ( int ); char * PB_Cgetbuf ( char *, int ); PBTYP_T * PB_Citypeset ( void ); PBTYP_T * PB_Cstypeset ( void ); PBTYP_T * PB_Cdtypeset ( void ); PBTYP_T * PB_Cctypeset ( void ); PBTYP_T * PB_Cztypeset ( void ); int pilaenv_ ( int *, F_CHAR_T ); char * PB_Ctop ( int *, char *, char *, char * ); void PB_CVMinit ( PB_VM_T *, int, int, int, int, int, int, int, int, int, int, int, int ); int PB_CVMnpq ( PB_VM_T * ); void PB_CVMcontig ( PB_VM_T *, int *, int *, int *, int * ); int PB_CVMloc ( PBTYP_T *, PB_VM_T *, char *, char *, char *, char *, int, int, char *, char *, int, char *, char *, int ); int PB_CVMswp ( PBTYP_T *, PB_VM_T *, char *, char *, char *, int, char *, int, char *, int ); int PB_CVMpack ( PBTYP_T *, PB_VM_T *, char *, char *, char *, char *, int, int, char *, char *, int, char *, char *, int ); void PB_CVMupdate ( PB_VM_T *, int, int *, int * ); void PB_Cbinfo ( int, int, int, int, int, int, int, int, int, int *, int *, int *, int *, int *, int *, int *, int *, int *, int *, int * ); void PB_Cplaprnt ( PBTYP_T *, int, int, char *, int, int, int *, int, int, char * ); void PB_Cplaprn2 ( PBTYP_T *, int, int, char *, int, int, int *, int, int, char *, int, int ); void PB_Cprnt ( char, int, int, int, char *, int, int, char * ); void PB_Cplapad ( PBTYP_T *, char *, char *, int, int, char *, char *, char *, int, int, int * ); void PB_Cplapd2 ( PBTYP_T *, char *, char *, int, int, char *, char *, char *, int, int, int * ); void PB_Cplascal ( PBTYP_T *, char *, char *, int, int, char *, char *, int, int, int * ); void PB_Cplasca2 ( PBTYP_T *, char *, char *, int, int, char *, char *, int, int, int * ); void PB_Cplacnjg ( PBTYP_T *, int, int, char *, char *, int, int, int * ); void PB_CInV ( PBTYP_T *, char *, char *, int, int, int *, int, char *, int, int, int *, char *, char * *, int *, int * ); void PB_CInV2 ( PBTYP_T *, char *, char *, int, int, int *, int, char *, int, int, int *, char *, char *, int, int * ); void PB_CInOutV ( PBTYP_T *, char *, int, int, int *, int, char *, char *, int, int, int *, char *, char * *, char * *, int *, int *, int *, int * ); void PB_CInOutV2 ( PBTYP_T *, char *, char *, int, int, int, int *, int, char *, int, int, int *, char *, char * *, int *, int *, int *, int * ); void PB_COutV ( PBTYP_T *, char *, char *, int, int, int *, int, char * *, int *, int *, int * ); void PB_CGatherV ( PBTYP_T *, char *, char *, int, int, char *, int, int, int *, char *, char * *, int *, int * ); void PB_CScatterV ( PBTYP_T *, char *, int, int, char *, int, int, int *, char *, char *, char *, int, int, int *, char * ); #else int PB_Cgcd (); int PB_Clcm (); void PB_Cdescset (); void PB_Cdescribe (); void PB_CargFtoC (); int PB_Cfirstnb (); int PB_Clastnb (); int PB_Cspan (); void PB_Cainfog2l (); void PB_Cinfog2l (); int PB_Cg2lrem (); int PB_Cindxg2p (); int PB_Cnumroc (); int PB_Cnpreroc (); int PB_Cnnxtroc (); void PB_Cconjg (); void PB_Cwarn (); void PB_Cabort (); void PB_Cchkmat (); void PB_Cchkvec (); char * PB_Cmalloc (); char * PB_Cgetbuf (); PBTYP_T * PB_Citypeset (); PBTYP_T * PB_Cstypeset (); PBTYP_T * PB_Cdtypeset (); PBTYP_T * PB_Cctypeset (); PBTYP_T * PB_Cztypeset (); int pilaenv_ (); char * PB_Ctop (); void PB_CVMinit (); int PB_CVMnpq (); void PB_CVMcontig (); int PB_CVMloc (); int PB_CVMswp (); int PB_CVMpack (); void PB_CVMupdate (); void PB_Cbinfo (); void PB_Cplaprnt (); void PB_Cplaprn2 (); void PB_Cprnt (); void PB_Cplapad (); void PB_Cplapd2 (); void PB_Cplascal (); void PB_Cplasca2 (); void PB_Cplacnjg (); void PB_CInV (); void PB_CInV2 (); void PB_CInOutV (); void PB_CInOutV2 (); void PB_COutV (); void PB_CGatherV (); void PB_CScatterV (); #endif scalapack-2.0.2/PBLAS/SRC/pcagemv_.c000644 000766 000024 00000045330 10363532303 017131 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcagemv_( F_CHAR_T TRANS, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pcagemv_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PCAGEMV performs one of the matrix-vector operations * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( sub( A )' )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( conjg( sub( A )' ) )*abs( sub( X ) ) * + abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1). * * When TRANS = 'N', * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+M-1), if INCY = M_Y, * Y(IY:IY+M-1,JY:JY), if INCY = 1 and INCY <> M_Y, * and, otherwise * * sub( X ) denotes X(IX:IX,JX:JX+M-1), if INCX = M_X, * X(IX:IX+M-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a real subvector, * sub( X ) is a subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := |alpha|*|sub( A ) | * |sub( X )| + * |beta*sub( Y )|, * * TRANS = 'T' or 't', * sub( Y ) := |alpha|*|sub( A )'| * |sub( X )| + * |beta*sub( Y )|, * * TRANS = 'C' or 'c', * sub( Y ) := |alpha|*|conjg( sub( A )' )|*|sub( X )| + * |beta*sub( Y )|. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char TrA, Yroc, * one, * tbeta, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, mycol, myrow, nota, npcol, nprow; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCAGEMV", "Illegal TRANS=%c\n", TrA ); info = -1; } PB_Cchkmat( ctxt, "PCAGEMV", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); if( nota ) { PB_Cchkvec( ctxt, "PCAGEMV", "X", *N, 3, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PCAGEMV", "Y", *M, 2, Yi, Yj, Yd, *INCY, 18, &info ); } else { PB_Cchkvec( ctxt, "PCAGEMV", "X", *M, 2, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PCAGEMV", "Y", *N, 3, Yi, Yj, Yd, *INCY, 18, &info ); } } if( info ) { PB_Cabort( ctxt, "PCAGEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); utyp = PB_Cstypeset(); /* * When alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, utyp->size ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], utyp->size ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( nota ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( utyp, COLUMN, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { cagemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *) A), Aii, Ajj, Ald, type->size), &Ald, XA, &XAd[LLD_], tbeta, YA, &ione ); } if( XAfr ) free( XA ); /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( utyp, ROW, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { cagemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *) A), Aii, Ajj, Ald, type->size ), &Ald, XA, &ione, tbeta, YA, &YAd[LLD_] ); } if( XAfr ) free( XA ); /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, utyp->size ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], utyp->size ), INCY ); } } } one = utyp->one; if( nota ) { PB_Cpaxpby( utyp, NOCONJG, *M, 1, one, YA, 0, 0, YAd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } else { PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PCAGEMV */ } scalapack-2.0.2/PBLAS/SRC/pcahemv_.c000644 000766 000024 00000056235 10363532303 017140 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcahemv_( F_CHAR_T UPLO, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pcahemv_( UPLO, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PCAHEMV performs the matrix-vector operation * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a n element real subvec- * tor, sub( X ) is an n element subvector and sub( A ) is an n by n * Hermitian submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries of the upper triangular part of the * Hermitian submatrix sub( A ), and the local entries of the * strictly lower triangular of sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries of the lower triangular part of the * Hermitian submatrix sub( A ), and the local entries of the * strictly upper triangular of sub( A ) are not referenced. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set and assumed to be zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten by the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCpbY, YCsum, YCld, YRfr, YRpbY, YRsum, YRld, Ycol, Yi, Yii, YisRow, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper, usiz; float * tbeta; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XCd[DLEN_], XRd[DLEN_], Xd[DLEN_], YCd[DLEN_], YRd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, __FILE__, "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkmat( ctxt, "PCAHEMV", "A", *N, 2, *N, 2, Ai, Aj, Ad, 7, &info ); PB_Cchkvec( ctxt, "PCAHEMV", "X", *N, 2, Xi, Xj, Xd, *INCX, 11, &info ); PB_Cchkvec( ctxt, "PCAHEMV", "Y", *N, 2, Yi, Yj, Yd, *INCY, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PCAHEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); size = type->size; utyp = PB_Cstypeset(); usiz = type->usiz; /* * and when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Reuse sub( Y ) and/or create vectors YR in process rows and YC in process * columns spanned by sub( A ) */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) { PB_CInOutV( utyp, ROW, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW, ((char**)(&tbeta)), &YR, YRd, &YRfr, &YRsum, &YRpbY ); PB_COutV( utyp, COLUMN, INIT, *N, *N, Ad0, 1, &YC, YCd, &YCfr, &YCsum ); } else { PB_CInOutV( utyp, COLUMN, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN, ((char**)(&tbeta)), &YC, YCd, &YCfr, &YCsum, &YCpbY ); PB_COutV( utyp, ROW, INIT, *N, *N, Ad0, 1, &YR, YRd, &YRfr, &YRsum ); } /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd, ROW, &XC, XCd, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd, COLUMN, &XR, XRd, &XRfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd[LLD_]; XRld = XRd[LLD_]; YCld = YCd[LLD_]; YRld = YRd[LLD_]; /* * Scale YR or YC in the case sub( Y ) has been reused */ if( YisRow ) { /* * YR resides in (a) process row(s) */ if( !YRpbY ) { if( ( myrow == YRd[RSRC_] ) || ( YRd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YR */ if( Anq > 0 ) sascal_( &Anq, ((char *) tbeta), YR, &YRld ); } } } else { /* * YC resides in (a) process column(s) */ if( !YCpbY ) { if( ( mycol == YCd[CSRC_] ) || ( YCd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YC */ if( Amp > 0 ) sascal_( &Amp, ((char *) tbeta), YC, &ione ); } } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &utyp->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { cagemv_( C2F_CHAR( NOTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, YC, &ione ); cagemv_( C2F_CHAR( COTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XC, &ione, one, Mptr( YR, 0, Akq, YRld, usiz ), &YRld ); } PB_Cpsym( type, utyp, LEFT, UPPER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, usiz ), YCld, Mptr( YR, 0, Akq, YRld, usiz ), YRld, PB_Ctzahemv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsym( type, utyp, LEFT, LOWER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, usiz ), YCld, Mptr( YR, 0, Akq, YRld, usiz ), YRld, PB_Ctzahemv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { cagemv_( C2F_CHAR( NOTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, Mptr( YC, Akp, 0, YCld, usiz ), &ione ); cagemv_( C2F_CHAR( COTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XC, Akp, 0, XCld, size ), &ione, one, Mptr( YR, 0, Akq, YRld, usiz ), &YRld ); } } } } if( XCfr ) free( XC ); if( XRfr ) free( XR ); if( YisRow ) { /* * Combine the partial column results into YC */ if( YCsum ) { YCd[CSRC_] = 0; if( Amp > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, 0 ); } } /* * Combine the partial row results into YR */ if( YRsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], YRd[RSRC_], mycol ); } /* * YR := YR + YC */ PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, YR, 0, 0, YRd, ROW ); /* * sub( Y ) := beta * sub( Y ) + YR (if necessary) */ if( YRpbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || Yrow < 0 ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, ((char *) Y), Yi, Yj, Yd, ROW ); } } else { /* * Combine the partial row results into YR */ if( YRsum ) { YRd[RSRC_] = 0; if( Anq > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], 0, mycol ); } } /* * Combine the partial column results into YC */ if( YCsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, YCd[CSRC_] ); } /* * YC := YR + YC */ PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, YC, 0, 0, YCd, COLUMN ); /* * sub( Y ) := beta * sub( Y ) + YC (if necessary) */ if( YCpbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || Ycol < 0 ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, COLUMN ); } } if( YCfr ) free( YC ); if( YRfr ) free( YR ); /* * End of PCAHEMV */ } scalapack-2.0.2/PBLAS/SRC/pcamax_.c000644 000766 000024 00000053172 10363532303 016763 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcamax_( int * N, float * AMAX, int * INDX, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pcamax_( N, AMAX, INDX, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * INDX, * IX, * JX, * N; float * AMAX; /* * .. Array Arguments .. */ int * DESCX; float * X; #endif { /* * Purpose * ======= * * PCAMAX computes the global index of the maximum element in absolute * value of a subvector sub( X ). The global index is returned in INDX * and the value of that element is returned in AMAX, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * AMAX (global output) COMPLEX array * On exit, AMAX specifies the largest entry in absolute value * of the subvector sub( X ) only in its scope (See below for * further details). * * INDX (global output) INTEGER * On exit, INDX specifies the global index of the maximum ele- * ment in absolute value of the subvector sub( X ) only in its * scope (See below for further details). * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char cbtop, cctop, rbtop, rctop; int Xcol, Xgindx, Xi, Xii, Ximb, Xinb, Xj, Xjj, Xlindx, Xld, Xmb, Xnb, Xnp, Xnq, Xrow, Xsrc, ctxt, dist, dst, idumm, info, k, maxpos, mycol, mydist, myrow, npcol, nprow, src, size; PBTYP_T * type; /* * .. Local Arrays .. */ char * Xptr; int Xd[DLEN_]; cmplx work[4]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PCAMAX", "X", *N, 1, Xi, Xj, Xd, *INCX, 7, &info ); if( info ) { PB_Cabort( ctxt, "PCAMAX", info ); return; } #endif /* * Initialize INDX and AMAX */ *INDX = 0; AMAX[REAL_PART] = ZERO; AMAX[IMAG_PART] = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *INCX == 1 ) && ( Xd[M_] == 1 ) && ( *N == 1 ) ) { /* * Make sure I own some data and compute INDX and AMAX */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { *INDX = *JX; type = PB_Cctypeset(); Xptr = Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ); AMAX[REAL_PART] = ((float*)(Xptr))[REAL_PART]; AMAX[IMAG_PART] = ((float*)(Xptr))[IMAG_PART]; } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); if( ( rctop == CTOP_DEFAULT ) || ( rctop == CTOP_TREE1 ) ) { /* * Inline the 1-tree combine for communication savings */ Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_]; Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cctypeset(); size = type->size; Xlindx = Xjj - 1 + icamax_( &Xnq, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xld ); Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol ); Xptr = Mptr( ((char *) X), Xii, Xlindx, Xld, size ); work[0][REAL_PART] = ((float*)(Xptr))[REAL_PART]; work[0][IMAG_PART] = ((float*)(Xptr))[IMAG_PART]; work[1][REAL_PART] = ((float )( Xgindx+1 )); work[1][IMAG_PART] = ZERO; } else { work[0][REAL_PART] = ZERO; work[0][IMAG_PART] = ZERO; work[1][REAL_PART] = ZERO; work[1][IMAG_PART] = ZERO; } /* * Combine the local results using a 1-tree topology within process column 0 * if npcol > 1 or Xcol >= 0, i.e sub( X ) is distributed. */ if( ( npcol >= 2 ) && ( Xcol >= 0 ) ) { mydist = mycol; k = 1; l_10: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, npcol ); Ccgesd2d( ctxt, 2, 1, ((char*)work), 2, myrow, dst ); goto l_20; } else { dist = mycol + k; src = MPosMod( dist, npcol ); if( mycol < src ) { Ccgerv2d( ctxt, 2, 1, ((char*) work[2]), 2, myrow, src ); if( ( ABS( work[0][REAL_PART] ) + ABS( work[0][IMAG_PART] ) ) < ( ABS( work[2][REAL_PART] ) + ABS( work[2][IMAG_PART] ) ) ) { work[0][REAL_PART] = work[2][REAL_PART]; work[0][IMAG_PART] = work[2][IMAG_PART]; work[1][REAL_PART] = work[3][REAL_PART]; } } mydist >>= 1; } k <<= 1; if( k < npcol ) goto l_10; l_20: /* * Process column 0 broadcasts the combined values of INDX and AMAX within * their process row. */ rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == 0 ) { Ccgebs2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2 ); } else { Ccgebr2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2, myrow, 0 ); } } /* * Set INDX and AMAX to the replicated answers contained in work. If AMAX is * zero, then select a coherent INDX. */ AMAX[REAL_PART] = work[0][REAL_PART]; AMAX[IMAG_PART] = work[0][IMAG_PART]; *INDX = ( ( ( AMAX[REAL_PART] == ZERO ) && ( AMAX[IMAG_PART] == ZERO ) ) ? ( *JX ) : ( (int)(work[1][REAL_PART]) ) ); } else { /* * Otherwise use the current topology settings to combine the results */ Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_]; Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnq > 0 ) { /* * Compute the local maximum and its corresponding local index */ Xld = Xd[LLD_]; type = PB_Cctypeset(); size = type->size; Xlindx = Xjj - 1 + icamax_( &Xnq, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xld ); Xptr = Mptr( ((char *) X), Xii, Xlindx, Xld, size ); AMAX[REAL_PART] = ((float*)(Xptr))[REAL_PART]; AMAX[IMAG_PART] = ((float*)(Xptr))[IMAG_PART]; } else { AMAX[REAL_PART] = ZERO; AMAX[IMAG_PART] = ZERO; } if( Xcol >= 0 ) { /* * Combine leave on all the local maximum if Xcol >= 0, i.e sub( X ) is * distributed */ Ccgamx2d( ctxt, ROW, &rctop, 1, 1, ((char*)AMAX), 1, &idumm, &maxpos, 1, -1, mycol ); /* * Broadcast the corresponding global index */ if( ( AMAX[REAL_PART] != ZERO ) || ( AMAX[IMAG_PART] != ZERO ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == maxpos ) { Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol ); *INDX = Xgindx + 1; Cigebs2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1 ); } else { Cigebr2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1, myrow, maxpos ); } } else { /* * If AMAX is zero, then select a coherent INDX. */ *INDX = *JX; } } else { /* * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX. */ *INDX = ( ( ( AMAX[REAL_PART] == ZERO ) && ( AMAX[IMAG_PART] == ZERO ) ) ? ( *JX ) : Xlindx + 1 ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ( cctop == CTOP_DEFAULT ) || ( cctop == CTOP_TREE1 ) ) { /* * Inline the 1-tree combine for communication savings */ Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_]; Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnp > 0 ) { Xld = Xd[LLD_]; type = PB_Cctypeset(); size = type->size; Xlindx = Xii - 1 + icamax_( &Xnp, Mptr( ((char *)X), Xii, Xjj, Xld, size ), INCX ); Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow ); Xptr = Mptr( ((char *) X), Xlindx, Xjj, Xld, size ); work[0][REAL_PART] = ((float*)(Xptr))[REAL_PART]; work[0][IMAG_PART] = ((float*)(Xptr))[IMAG_PART]; work[1][REAL_PART] = ((float )( Xgindx+1 )); work[1][IMAG_PART] = ZERO; } else { work[0][REAL_PART] = ZERO; work[0][IMAG_PART] = ZERO; work[1][REAL_PART] = ZERO; work[1][IMAG_PART] = ZERO; } /* * Combine the local results using a 1-tree topology within process row 0 * if nprow > 1 or Xrow >= 0, i.e sub( X ) is distributed. */ if( ( nprow >= 2 ) && ( Xrow >= 0 ) ) { mydist = myrow; k = 1; l_30: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, nprow ); Ccgesd2d( ctxt, 2, 1, ((char*)work), 2, dst, mycol ); goto l_40; } else { dist = myrow + k; src = MPosMod( dist, nprow ); if( myrow < src ) { Ccgerv2d( ctxt, 2, 1, ((char*) work[2]), 2, src, mycol ); if( ( ABS( work[0][REAL_PART] ) + ABS( work[0][IMAG_PART] ) ) < ( ABS( work[2][REAL_PART] ) + ABS( work[2][IMAG_PART] ) ) ) { work[0][REAL_PART] = work[2][REAL_PART]; work[0][IMAG_PART] = work[2][IMAG_PART]; work[1][REAL_PART] = work[3][REAL_PART]; } } mydist >>= 1; } k <<= 1; if( k < nprow ) goto l_30; l_40: /* * Process row 0 broadcasts the combined values of INDX and AMAX within their * process column. */ cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == 0 ) { Ccgebs2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2 ); } else { Ccgebr2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2, 0, mycol ); } } /* * Set INDX and AMAX to the replicated answers contained in work. If AMAX is * zero, then select a coherent INDX. */ AMAX[REAL_PART] = work[0][REAL_PART]; AMAX[IMAG_PART] = work[0][IMAG_PART]; *INDX = ( ( ( AMAX[REAL_PART] == ZERO ) && ( AMAX[IMAG_PART] == ZERO ) ) ? ( *IX ) : ( (int)(work[1][REAL_PART]) ) ); } else { /* * Otherwise use the current topology settings to combine the results */ Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_]; Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnp > 0 ) { /* * Compute the local maximum and its corresponding local index */ Xld = Xd[LLD_]; type = PB_Cctypeset(); size = type->size; Xlindx = Xii - 1 + icamax_( &Xnp, Mptr( ((char *) X), Xii, Xjj, Xld, size ), INCX ); Xptr = Mptr( ((char *) X), Xlindx, Xjj, Xld, size ); AMAX[REAL_PART] = ((float*)(Xptr))[REAL_PART]; AMAX[IMAG_PART] = ((float*)(Xptr))[IMAG_PART]; } else { AMAX[REAL_PART] = ZERO; AMAX[IMAG_PART] = ZERO; } if( Xrow >= 0 ) { /* * Combine leave on all the local maximum if Xrow >= 0, i.e sub( X ) is * distributed. */ Ccgamx2d( ctxt, COLUMN, &cctop, 1, 1, ((char*)AMAX), 1, &maxpos, &idumm, 1, -1, mycol ); /* * Broadcast the corresponding global index */ if( ( AMAX[REAL_PART] != ZERO ) || ( AMAX[IMAG_PART] != ZERO ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == maxpos ) { Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow ); *INDX = Xgindx + 1; Cigebs2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1 ); } else { Cigebr2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1, maxpos, mycol ); } } else { /* * If AMAX is zero, then select a coherent INDX. */ *INDX = *IX; } } else { /* * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX. */ *INDX = ( ( ( AMAX[REAL_PART] == ZERO ) && ( AMAX[IMAG_PART] == ZERO ) ) ? ( *IX ) : Xlindx + 1 ); } } } return; } /* * End of PCAMAX */ } scalapack-2.0.2/PBLAS/SRC/pcatrmv_.c000644 000766 000024 00000061701 10363532303 017163 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcatrmv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pcatrmv_( UPLO, TRANS, DIAG, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PCATRMV performs one of the matrix-vector operations * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( sub( A )' )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( conjg( sub( A )' ) )*abs( sub( X ) ) * + abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a real subvector, * sub( X ) is a subvector and sub( A ) is an n by n triangular subma- * trix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := |alpha|*|sub( A )|*|sub( X )| + * |beta*sub( Y )|. * * TRANS = 'T' or 't' * sub( Y ) := |alpha|*|sub( A )'|*|sub( X )| + * |beta*sub( Y )|. * * TRANS = 'C' or 'c' * sub( Y ) := |alpha|*|conjg( sub( A )' )|*|sub( X )| + * |beta*sub( Y )|. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Yroc, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XAfr, XAld, Xi, Xj, YAfr, YAld, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper, usiz; float * tbeta; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCATRMV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCATRMV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PCATRMV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PCATRMV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkvec( ctxt, "PCATRMV", "X", *N, 4, Xi, Xj, Xd, *INCX, 13, &info ); PB_Cchkvec( ctxt, "PCATRMV", "Y", *N, 4, Yi, Yj, Yd, *INCY, 19, &info ); } if( info ) { PB_Cabort( ctxt, "PCATRMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA [REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); size = type->size; utyp = PB_Cstypeset(); usiz = type->usiz; /* * and when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( notran ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( utyp, COLUMN, *N, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, ((char**)(&tbeta)), &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( utyp, ROW, *N, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, ((char**)(&tbeta)), &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XAld = XAd[LLD_]; YAld = YAd[LLD_]; /* * Scale YA in the case sub( Y ) has been reused */ if( notran && !( YApbY ) ) { /* * YA resides in (a) process column(s) */ if( ( mycol == YAd[CSRC_] ) || ( YAd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YA */ if( Amp > 0 ) sascal_( &Amp, ((char *) tbeta), YA, &ione ); } } else if( !( notran ) && !( YApbY ) ) { /* * YA resides in (a) process row(s) */ if( ( myrow == YAd[RSRC_] ) || ( YAd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YA */ if( Anq > 0 ) sascal_( &Anq, ((char *) tbeta), YA, &YAld ); } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &utyp->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { cagemv_( TRANS, &Akp, &Anq0, ((char *) ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, YA, &ione ); } PB_Cptrm( type, utyp, LEFT, UPPER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, usiz ), YAld, PB_Ctzatrmv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { cagemv_( TRANS, &Akp, &Anq0, ((char *) ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XA, &ione, one, Mptr( YA, 0, Akq, YAld, usiz ), &YAld ); } PB_Cptrm( type, utyp, LEFT, UPPER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, usiz ), YAld, PB_Ctzatrmv ); } } } else { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, utyp, LEFT, LOWER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, usiz ), YAld, PB_Ctzatrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { cagemv_( TRANS, &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, Mptr( YA, Akp, 0, YAld, usiz ), &ione ); } } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, utyp, LEFT, LOWER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, usiz ), YAld, PB_Ctzatrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { cagemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, Akp, 0, XAld, size ), &ione, one, Mptr( YA, 0, Akq, YAld, usiz ), &YAld ); } } } } } if( XAfr ) free( XA ); if( notran ) { /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } } else { /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } if( notran ) { PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YA, 0, 0, YAd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } else { PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PCATRMV */ } scalapack-2.0.2/PBLAS/SRC/pcaxpy_.c000644 000766 000024 00000022565 10363532303 017020 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcaxpy_( int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pcaxpy_( N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCX, * DESCY; float * X, * Y; #endif { /* * Purpose * ======= * * PCAXPY adds one subvector to another, * * sub( Y ) := sub( Y ) + alpha * sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER. * On entry, N specifies the length of the subvectors to be * added. N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xi, Xj, Yi, Yj, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PCAXPY", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PCAXPY", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PCAXPY", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * Start the operations */ if( *INCX == Xd[M_] ) { PB_Cpaxpby( type, NOCONJG, 1, *N, ((char *) ALPHA), ((char *) X), Xi, Xj, Xd, ROW, type->one, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, ((char *) ALPHA), ((char *) X), Xi, Xj, Xd, COLUMN, type->one, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } /* * End of PCAXPY */ } scalapack-2.0.2/PBLAS/SRC/pccopy_.c000644 000766 000024 00000021605 10363532303 017003 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pccopy_( int * N, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pccopy_( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; /* * .. Array Arguments .. */ int * DESCX, * DESCY; float * X, * Y; #endif { /* * Purpose * ======= * * PCCOPY copies one subvector into another, * * sub( Y ) := sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * copied. N must be at least zero. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local output) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xi, Xj, Yi, Yj, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 501 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PCCOPY", "X", *N, 1, Xi, Xj, Xd, *INCX, 5, &info ); PB_Cchkvec( ctxt, "PCCOPY", "Y", *N, 1, Yi, Yj, Yd, *INCY, 10, &info ); } if( info ) { PB_Cabort( ctxt, "PCCOPY", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * Start the operations */ if( *INCX == Xd[M_] ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) X), Xi, Xj, Xd, ROW, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) X), Xi, Xj, Xd, COLUMN, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } /* * End of PCCOPY */ } scalapack-2.0.2/PBLAS/SRC/pcdotc_.c000644 000766 000024 00000067107 10363532303 016771 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcdotc_( int * N, float * DOT, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pcdotc_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; float * DOT; /* * .. Array Arguments .. */ int * DESCX, * DESCY; float * X, * Y; #endif { /* * Purpose * ======= * * PCDOTC forms the dot product of two subvectors, * * DOT := sub( X )**H * sub( Y ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * multiplied. N must be at least zero. * * DOT (local output) COMPLEX array * On exit, DOT specifies the dot product of the two subvectors * sub( X ) and sub( Y ) only in their scope (See below for fur- * ther details). * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char scope, * top; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD, Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info, ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src; PBTYP_T * type; VVDOT_T dot; /* * .. Local Arrays .. */ char * buf = NULL; int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PCDOTC", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PCDOTC", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PCDOTC", info ); return; } #endif DOT[REAL_PART] = ZERO; DOT[IMAG_PART] = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Handle degenerate case */ if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) ) { type = PB_Cctypeset(); PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_]; XnbD = Xd[NB_]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_]; XnbD = Xd[MB_]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_]; YnbD = Yd[NB_]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_]; YnbD = Yd[MB_]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector used * to perform the dot product computation. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cctypeset(); size = type->size; dot = type->Fvvdotc; /* * A purely operation occurs iff the operands start in the same process and if * either the grid is mono-dimensional or there is a single local block to be * operated with or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } } /* * Combine the local results in sub( X )'s scope */ if( ( XisR && YisR ) || ( XmyprocR == XprocR ) ) { scope = ( XisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); Ccgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 ); } if( RRorCC && XisR && YisR ) return; } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to compute the desired dot-product in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( ( YmyprocR == YprocR ) ) { /* * The processes owning a piece of sub( Y ) send it to the corresponding * process owning s piece of sub ( X ). */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( XisRow ) { rdst = XprocR; cdst = dst; } else { rdst = dst; cdst = XprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( YisRow ) Ccgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); else Ccgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding local * piece of sub( Y ), compute the local dot product and combine the results * within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); src = MPosMod( src, YnprocsD ); if( YisRow ) { rsrc = YprocR; csrc = src; } else { rsrc = src; csrc = YprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Ccgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc ); else Ccgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc ); dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } else { /* * General case, copy sub( Y ) within sub( X )'s scope, compute the local * results and combine them within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XisRow ) { PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt, 1 ); } else { PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt, MAX( 1, XnpD ) ); } if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) ) buf = PB_Cmalloc( XnpD * size ); if( YisRow ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj, Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj, Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } if( XmyprocR == XprocR ) { if( XnpD > 0 ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } /* * Send the DOT product result within sub( Y )'s scope */ if( XisR || YisR ) { /* * Either sub( X ) or sub( Y ) are replicated, so that every process should have * the result -> broadcast it orthogonally from sub( X )'s direction. */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD, XprocR ); } } else { /* * Neither sub( X ) nor sub( Y ) are replicated */ if( RRorCC ) { /* * Both sub( X ) are distributed in the same direction -> the process row or * column XprocR sends the result to the process row or column YprocR. */ if( XprocR != YprocR ) { if( XmyprocR == XprocR ) { if( XisRow ) Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR, YmyprocD ); else Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD, YprocR ); } else if( YmyprocR == YprocR ) { if( XisRow ) Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR, XmyprocD ); else Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD, XprocR ); } } } else { /* * Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s * scope, broadcast the result within sub( Y )'s scope. */ if( YmyprocR == YprocR ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocD == XprocR ) Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, YprocR, XprocR ); } else { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocD == XprocR ) Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, YprocR ); } } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ type = PB_Cctypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ type = PB_Cctypeset(); /* * Compute DOT := sub( Y )**H * sub( X ) */ PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotc ); /* * Conjugate the result */ DOT[IMAG_PART] = -DOT[IMAG_PART]; } else { /* * Neither sub( X ) nor sub( Y ) are distributed */ type = PB_Cctypeset(); PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc ); } /* * End of PCDOTC */ } scalapack-2.0.2/PBLAS/SRC/pcdotu_.c000644 000766 000024 00000066714 10363532303 017016 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcdotu_( int * N, float * DOT, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pcdotu_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; float * DOT; /* * .. Array Arguments .. */ int * DESCX, * DESCY; float * X, * Y; #endif { /* * Purpose * ======= * * PCDOTU forms the dot product of two subvectors, * * DOT := sub( X )**T * sub( Y ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * multiplied. N must be at least zero. * * DOT (local output) COMPLEX array * On exit, DOT specifies the dot product of the two subvectors * sub( X ) and sub( Y ) only in their scope (See below for fur- * ther details). * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char scope, * top; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD, Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info, ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src; PBTYP_T * type; VVDOT_T dot; /* * .. Local Arrays .. */ char * buf = NULL; int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PCDOTU", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PCDOTU", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PCDOTU", info ); return; } #endif DOT[REAL_PART] = ZERO; DOT[IMAG_PART] = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Handle degenerate case */ if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) ) { type = PB_Cctypeset(); PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_]; XnbD = Xd[NB_]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_]; XnbD = Xd[MB_]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_]; YnbD = Yd[NB_]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_]; YnbD = Yd[MB_]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector used * to perform the dot product computation. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cctypeset(); size = type->size; dot = type->Fvvdotu; /* * A purely operation occurs iff the operands start in the same process and if * either the grid is mono-dimensional or there is a single local block to be * operated with or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } } /* * Combine the local results in sub( X )'s scope */ if( ( XisR && YisR ) || ( XmyprocR == XprocR ) ) { scope = ( XisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); Ccgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 ); } if( RRorCC && XisR && YisR ) return; } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to compute the desired dot-product in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( ( YmyprocR == YprocR ) ) { /* * The processes owning a piece of sub( Y ) send it to the corresponding * process owning s piece of sub ( X ). */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( XisRow ) { rdst = XprocR; cdst = dst; } else { rdst = dst; cdst = XprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( YisRow ) Ccgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); else Ccgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding local * piece of sub( Y ), compute the local dot product and combine the results * within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); src = MPosMod( src, YnprocsD ); if( YisRow ) { rsrc = YprocR; csrc = src; } else { rsrc = src; csrc = YprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Ccgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc ); else Ccgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc ); dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } else { /* * General case, copy sub( Y ) within sub( X )'s scope, compute the local * results and combine them within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XisRow ) { PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt, 1 ); } else { PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt, MAX( 1, XnpD ) ); } if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) ) buf = PB_Cmalloc( XnpD * size ); if( YisRow ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj, Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj, Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } if( XmyprocR == XprocR ) { if( XnpD > 0 ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } /* * Send the DOT product result within sub( Y )'s scope */ if( XisR || YisR ) { /* * Either sub( X ) or sub( Y ) are replicated, so that every process should have * the result -> broadcast it orthogonally from sub( X )'s direction. */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD, XprocR ); } } else { /* * Neither sub( X ) nor sub( Y ) are replicated */ if( RRorCC ) { /* * Both sub( X ) are distributed in the same direction -> the process row or * column XprocR sends the result to the process row or column YprocR. */ if( XprocR != YprocR ) { if( XmyprocR == XprocR ) { if( XisRow ) Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR, YmyprocD ); else Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD, YprocR ); } else if( YmyprocR == YprocR ) { if( XisRow ) Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR, XmyprocD ); else Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD, XprocR ); } } } else { /* * Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s * scope, broadcast the result within sub( Y )'s scope. */ if( YmyprocR == YprocR ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocD == XprocR ) Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, YprocR, XprocR ); } else { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocD == XprocR ) Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, YprocR ); } } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ type = PB_Cctypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ type = PB_Cctypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotu ); } else { /* * Neither sub( X ) nor sub( Y ) are distributed */ type = PB_Cctypeset(); PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); } /* * End of PCDOTU */ } scalapack-2.0.2/PBLAS/SRC/pcgeadd_.c000644 000766 000024 00000027250 10363532303 017077 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcgeadd_( F_CHAR_T TRANS, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pcgeadd_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * IC, * JA, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PCGEADD adds a matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if TRANS = 'N', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'T', * conjg(A(IA:IA+N-1,JA:JA+M-1)') if TRANS = 'C'. * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the form of op( sub( A ) ) to be * used in the matrix addition as follows: * * TRANS = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANS = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANS = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, DirC, ctop, rtop; int Ai, Aj, Ci, Cj, TrA, ctxt, info, mycol, myrow, npcol, nprow, notran; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !notran ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCGEADD", "Illegal TRANS = %c\n", TrA ); info = -1; } if( notran ) PB_Cchkmat( ctxt, "PCGEADD", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); else PB_Cchkmat( ctxt, "PCGEADD", "A", *N, 3, *M, 2, Ai, Aj, Ad, 8, &info ); PB_Cchkmat( ctxt, "PCGEADD", "C", *M, 2, *N, 3, Ci, Cj, Cd, 13, &info ); } if( info ) { PB_Cabort( ctxt, "PCGEADD", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( PB_Cctypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cctypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ /* * This operation mainly involves point-to-point send and receive communication. * There is therefore no particular BLACS topology to recommend. Still, one can * choose the main loop direction in which the operands will be added. This * selection is based on the current setting for the BLACS broadcast operations. */ if( notran ) { rtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( *M <= *N ) { DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirC = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpgeadd( PB_Cctypeset(), &DirA, &DirC, NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } else if( TrA == CTRAN ) { PB_Cptran( PB_Cctypeset(), NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cptran( PB_Cctypeset(), CONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } /* * End of PCGEADD */ } scalapack-2.0.2/PBLAS/SRC/pcgemm_.c000644 000766 000024 00000050667 10363532303 016770 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcgemm_( F_CHAR_T TRANSA, F_CHAR_T TRANSB, int * M, int * N, int * K, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pcgemm_( TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANSA, TRANSB; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; float * A, * B, * C; #endif { /* * Purpose * ======= * * PCGEMM performs one of the matrix-matrix operations * * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N', * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T', * conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C', * * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N', * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T', * conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'. * * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) ) * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and * sub( C ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies the form of op( sub( B ) ) to be * used in the matrix multiplication as follows: * * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ), * * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )', * * TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * op( sub( A ) ) and of the submatrix sub( C ). M must be at * least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * op( sub( B ) ) and the number of columns of the submatrix * sub( C ). N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns of the submatrix * op( sub( A ) ) and the number of rows of the submatrix * op( sub( B ) ). K must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, DirBC, OpC, OpR, TrA, TrB, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ChooseBC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, nota, notb, npcol, nprow; double ABest, ACest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANSA )[0] ) ) == CNOTRAN ); notb = ( ( TrB = Mupcase( F2C_CHAR( TRANSB )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1001 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCGEMM", "Illegal TRANSA = %c\n", TrA ); info = -1; } else if( ( !notb ) && ( TrB != CTRAN ) && ( TrB != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCGEMM", "Illegal TRANSB = %c\n", TrB ); info = -2; } if( nota ) PB_Cchkmat( ctxt, "PCGEMM", "A", *M, 3, *K, 5, Ai, Aj, Ad, 10, &info ); else PB_Cchkmat( ctxt, "PCGEMM", "A", *K, 5, *M, 3, Ai, Aj, Ad, 10, &info ); if( notb ) PB_Cchkmat( ctxt, "PCGEMM", "B", *K, 5, *N, 4, Bi, Bj, Bd, 14, &info ); else PB_Cchkmat( ctxt, "PCGEMM", "B", *N, 4, *K, 5, Bi, Bj, Bd, 14, &info ); PB_Cchkmat( ctxt, "PCGEMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 19, &info ); } if( info ) { PB_Cabort( ctxt, "PCGEMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) || ( *K == 0 ) ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * If alpha or K is zero, sub( C ) := beta * sub( C ). */ if( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char * ) C), Ci, Cj, Cd ); } else if( !( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char * ) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABest: both operands sub( A ) and sub( B ) are communicated (M, N >> K) * ACest: both operands sub( A ) and sub( C ) are communicated (K, N >> M) * BCest: both operands sub( B ) and sub( C ) are communicated (M, K >> N) */ ABest = (double)(*K); ACest = (double)(*M); BCest = (double)(*N); if( notb ) { if( nota ) { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); ABest *= ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *K, Bd[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[MB_], nprow ); BCest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } else { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *M, Ad[NB_], npcol ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ); tmp1 = DNROC( *K, Bd[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ACest *= ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *K, Ad[MB_], nprow ); tmp2 = DNROC( *M, Bd[NB_], npcol ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest *= ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } } else { if( nota ) { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *N, Bd[MB_], nprow ); tmp2 = DNROC( *K, Bd[NB_], npcol ); tmp3 = DNROC( *N, Cd[NB_], npcol ); ACest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *K, Ad[NB_], npcol ); BCest *= CBRATIO * ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); } else { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *N, Bd[MB_], nprow ); tmp2 = DNROC( *K, Bd[NB_], npcol ); tmp3 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *K, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *K, Bd[NB_], npcol ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } } ChooseAB = ( ( ABest <= ( 1.3 * BCest ) ) && ( ABest <= ( 1.3 * ACest ) ) ); ChooseBC = ( ( BCest <= ACest ) && ( ( 1.3 * BCest ) <= ABest ) ); /* * BLACS topologies are enforced iff M, N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) && ( *K > nb ) ); if( ChooseAB ) { OpR = CBCAST; OpC = CBCAST; } else if( ChooseBC ) { if( nota ) { OpR = CCOMBINE; OpC = CBCAST; } else { OpR = CBCAST; OpC = CCOMBINE; } } else { if( notb ) { OpR = CBCAST; OpC = CCOMBINE; } else { OpR = CCOMBINE; OpC = CBCAST; } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_IRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_IRING ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirAB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirBC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); if( ChooseAB ) { PB_CpgemmAB( type, &DirAB, &DirBC, ( nota ? NOTRAN : ( ( TrA == CCOTRAN ) ? COTRAN : TRAN ) ), ( notb ? NOTRAN : ( ( TrB == CCOTRAN ) ? COTRAN : TRAN ) ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else if( ChooseBC ) { PB_CpgemmBC( type, &DirAB, &DirBC, ( nota ? NOTRAN : ( ( TrA == CCOTRAN ) ? COTRAN : TRAN ) ), ( notb ? NOTRAN : ( ( TrB == CCOTRAN ) ? COTRAN : TRAN ) ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { PB_CpgemmAC( type, &DirAB, &DirBC, ( nota ? NOTRAN : ( ( TrA == CCOTRAN ) ? COTRAN : TRAN ) ), ( notb ? NOTRAN : ( ( TrB == CCOTRAN ) ? COTRAN : TRAN ) ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PCGEMM */ } scalapack-2.0.2/PBLAS/SRC/pcgemv_.c000644 000766 000024 00000043566 10363532303 017001 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcgemv_( F_CHAR_T TRANS, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pcgemv_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PCGEMV performs one of the matrix-vector operations * * sub( Y ) := alpha*sub( A ) *sub( X ) + beta*sub( Y ), or * sub( Y ) := alpha*sub( A )'*sub( X ) + beta*sub( Y ), or * sub( Y ) := alpha*conjg( sub( A )' )*sub( X ) + beta*sub( Y ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1). * * When TRANS = 'N', * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+M-1), if INCY = M_Y, * Y(IY:IY+M-1,JY:JY), if INCY = 1 and INCY <> M_Y, * and, otherwise * * sub( X ) denotes X(IX:IX,JX:JX+M-1), if INCX = M_X, * X(IX:IX+M-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are scalars, and sub( X ) and sub( Y ) are subvectors * and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := alpha*sub( A ) * sub( X ) + beta*sub( Y ), * * TRANS = 'T' or 't', * sub( Y ) := alpha*sub( A )' * sub( X ) + beta*sub( Y ), * * TRANS = 'C' or 'c', * sub( Y ) := alpha*conjg( sub( A )' )*sub( X ) + * beta*sub( Y ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char TrA, Yroc, * tbeta, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, mycol, myrow, nota, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCGEMV", "Illegal TRANS=%c\n", TrA ); info = -1; } PB_Cchkmat( ctxt, "PCGEMV", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); if( nota ) { PB_Cchkvec( ctxt, "PCGEMV", "X", *N, 3, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PCGEMV", "Y", *M, 2, Yi, Yj, Yd, *INCY, 18, &info ); } else { PB_Cchkvec( ctxt, "PCGEMV", "X", *M, 2, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PCGEMV", "Y", *N, 3, Yi, Yj, Yd, *INCY, 18, &info ); } } if( info ) { PB_Cabort( ctxt, "PCGEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) && ( ( BETA [REAL_PART] == ONE ) && ( BETA [IMAG_PART] == ZERO ) ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); /* * When alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { cset_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } else { cscal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { cset_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } else { cscal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( nota ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( type, COLUMN, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { cgemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *)A), Aii, Ajj, Ald, type->size ), &Ald, XA, &XAd[LLD_], tbeta, YA, &ione ); } if( XAfr ) free( XA ); /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Ccgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, *M, 1, type->one, YA, 0, 0, YAd, COLUMN, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc ); } } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( type, ROW, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { cgemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *)A), Aii, Ajj, Ald, type->size ), &Ald, XA, &ione, tbeta, YA, &YAd[LLD_] ); } if( XAfr ) free( XA ); /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Ccgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, YA, 0, 0, YAd, ROW, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PCGEMV */ } scalapack-2.0.2/PBLAS/SRC/pcgerc_.c000644 000766 000024 00000027433 10363532303 016756 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcgerc_( int * M, int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY, float * A, int * IA, int * JA, int * DESCA ) #else void pcgerc_( M, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PCGERC performs the rank 1 operation * * sub( A ) := alpha*sub( X )*conjg( sub( Y )' ) + sub( A ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) is an m element subvector, sub( Y ) is * an n element subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+M-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+M-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( A ) are overwritten by the * local entries of the m by n updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, Yi, Yj, ctxt, info, ione=1, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd[DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PCGERC", "X", *M, 1, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PCGERC", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PCGERC", "A", *M, 1, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PCGERC", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Replicate sub( Y ) in process rows spanned by sub( A ) -> YA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ), &YA, YAd, &YAfr ); /* * Local rank-1 update iff I own some data */ Amp = PB_Cnumroc( *M, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { cgerc_( &Amp, &Anq, ((char *) ALPHA), XA, &ione, YA, &YAd[LLD_], Mptr( ((char *) A), Aii, Ajj, Ald, type->size ), &Ald ); } if( XAfr ) free( XA ); if( YAfr ) free( YA ); /* * End of PCGERC */ } scalapack-2.0.2/PBLAS/SRC/pcgeru_.c000644 000766 000024 00000027422 10363532303 016776 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcgeru_( int * M, int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY, float * A, int * IA, int * JA, int * DESCA ) #else void pcgeru_( M, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PCGERU performs the rank 1 operation * * sub( A ) := alpha*sub( X )*sub( Y )' + sub( A ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) is an m element subvector, sub( Y ) is * an n element subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+M-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+M-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( A ) are overwritten by the * local entries of the m by n updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, Yi, Yj, ctxt, info, ione=1, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd[DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PCGERU", "X", *M, 1, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PCGERU", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PCGERU", "A", *M, 1, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PCGERU", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Replicate sub( Y ) in process rows spanned by sub( A ) -> YA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ), &YA, YAd, &YAfr ); /* * Local rank-1 update iff I own some data */ Amp = PB_Cnumroc( *M, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { cgeru_( &Amp, &Anq, ((char *) ALPHA), XA, &ione, YA, &YAd[LLD_], Mptr( ((char *) A), Aii, Ajj, Ald, type->size ), &Ald ); } if( XAfr ) free( XA ); if( YAfr ) free( YA ); /* * End of PCGERU */ } scalapack-2.0.2/PBLAS/SRC/pchemm_.c000644 000766 000024 00000053530 10363532303 016761 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pchemm_( F_CHAR_T SIDE, F_CHAR_T UPLO, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pchemm_( SIDE, UPLO, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T SIDE, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; float * A, * B, * C; #endif { /* * Purpose * ======= * * PCHEMM performs one of the matrix-matrix operations * * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * or * * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha and beta are scalars, sub( A ) is a Hermitian submatrix and * sub( B ) and sub( C ) are m by n submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether the Hermitian submatrix * sub( A ) appears on the left or right in the operation as * follows: * * SIDE = 'L' or 'l' * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * SIDE = 'R' or 'r' * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with SIDE = 'L' or 'l', this array contains * the local entries corresponding to the entries of the m by m * Hermitian submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the Hermitian submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the Hermitian submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * Before entry with SIDE = 'R' or 'r', this array contains * the local entries corresponding to the entries of the n by n * Hermitian submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the Hermitian submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the Hermitian submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set and assumed to be zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, SideOp, UploA, cbtop, cbtopsave, cctop, cctopsave, rbtop, rbtopsave, rctop, rctopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, lside, mycol, myrow, nb, npcol, nprow, upper; double ABCest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PCHEMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCHEMM", "Illegal UPLO = %c\n", UploA ); info = -2; } if( lside ) { PB_Cchkmat( ctxt, "PCHEMM", "A", *M, 3, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCHEMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PCHEMM", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCHEMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PCHEMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PCHEMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) && ( ( BETA [REAL_PART] == ONE ) && ( BETA [IMAG_PART] == ZERO ) ) ) ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * If alpha is zero, sub( C ) := beta * sub( C ). */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else if( !( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (N >> M) * BCest : Both operands sub( B ) and sub( C ) are communicated (M >> N) */ if( lside ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ABCest = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp1 = DNROC( *N, Ad[NB_], npcol ); tmp2 = DNROC( *M, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); tmp4 = DNROC( *N, Cd[NB_], npcol ); BCest = (double)(*M) * ( ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.5 * ABCest ) <= BCest ); /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ChooseABC ) { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } } if( lside ) DirAB = ( rbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAB = ( cbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CpsymmAB( type, &DirAB, CONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_DEFAULT ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); } } if( lside ) DirAB = ( ( rbtop == CTOP_DRING || rctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); else DirAB = ( ( cbtop == CTOP_DRING || cctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); PB_CpsymmBC( type, &DirAB, CONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rbtopsave = *PB_Ctop( &ctxt, BCAST, ROW, &rbtopsave ); rctopsave = *PB_Ctop( &ctxt, COMBINE, ROW, &rctopsave ); cbtopsave = *PB_Ctop( &ctxt, BCAST, COLUMN, &cbtopsave ); cctopsave = *PB_Ctop( &ctxt, COMBINE, COLUMN, &cctopsave ); } /* * End of PCHEMM */ } scalapack-2.0.2/PBLAS/SRC/pchemv_.c000644 000766 000024 00000056165 10363532303 017001 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pchemv_( F_CHAR_T UPLO, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pchemv_( UPLO, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PCHEMV performs the matrix-vector operation * * sub( Y ) := alpha*sub( A )*sub( X ) + beta*sub( Y ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha and beta are scalars, sub( X ) and sub( Y ) are n element sub- * vectors and sub( A ) is an n by n Hermitian submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries of the upper triangular part of the * Hermitian submatrix sub( A ), and the local entries of the * strictly lower triangular of sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries of the lower triangular part of the * Hermitian submatrix sub( A ), and the local entries of the * strictly upper triangular of sub( A ) are not referenced. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set and assumed to be zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten by the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCpbY, YCsum, YCld, YRfr, YRpbY, YRsum, YRld, Ycol, Yi, Yii, YisRow, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; float * tbeta; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XCd[DLEN_], XRd[DLEN_], Xd[DLEN_], YCd[DLEN_], YRd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, __FILE__, "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkmat( ctxt, "PCHEMV", "A", *N, 2, *N, 2, Ai, Aj, Ad, 7, &info ); PB_Cchkvec( ctxt, "PCHEMV", "X", *N, 2, Xi, Xj, Xd, *INCX, 11, &info ); PB_Cchkvec( ctxt, "PCHEMV", "Y", *N, 2, Yi, Yj, Yd, *INCY, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PCHEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) && ( ( BETA [REAL_PART] == ONE ) && ( BETA [IMAG_PART] == ZERO ) ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); /* * When alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { cset_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } else { cscal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { cset_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } else { cscal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Reuse sub( Y ) and/or create vectors YR in process rows and YC in process * columns spanned by sub( A ) */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) { PB_CInOutV( type, ROW, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW, ((char**)(&tbeta)), &YR, YRd, &YRfr, &YRsum, &YRpbY ); PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &YC, YCd, &YCfr, &YCsum ); } else { PB_CInOutV( type, COLUMN, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN, ((char**)(&tbeta)), &YC, YCd, &YCfr, &YCsum, &YCpbY ); PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &YR, YRd, &YRfr, &YRsum ); } /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd, ROW, &XC, XCd, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd, COLUMN, &XR, XRd, &XRfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd[LLD_]; XRld = XRd[LLD_]; YCld = YCd[LLD_]; YRld = YRd[LLD_]; /* * Scale YR or YC in the case sub( Y ) has been reused */ if( YisRow ) { /* * YR resides in (a) process row(s) */ if( !YRpbY ) { if( ( myrow == YRd[RSRC_] ) || ( YRd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YR */ if( Anq > 0 ) { if( ( tbeta[REAL_PART] == ZERO ) && ( tbeta[IMAG_PART] == ZERO ) ) { cset_( &Anq, ((char *) tbeta), YR, &YRld ); } else { cscal_( &Anq, ((char *) tbeta), YR, &YRld ); } } } } } else { /* * YC resides in (a) process column(s) */ if( !YCpbY ) { if( ( mycol == YCd[CSRC_] ) || ( YCd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YC */ if( Amp > 0 ) { if( ( tbeta[REAL_PART] == ZERO ) && ( tbeta[IMAG_PART] == ZERO ) ) { cset_( &Amp, ((char *) tbeta), YC, &ione ); } else { cscal_( &Amp, ((char *) tbeta), YC, &ione ); } } } } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { cgemv_( C2F_CHAR( NOTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, YC, &ione ); cgemv_( C2F_CHAR( COTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XC, &ione, one, Mptr( YR, 0, Akq, YRld, size ), &YRld ); } PB_Cpsym( type, type, LEFT, UPPER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, PB_Ctzhemv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsym( type, type, LEFT, LOWER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, PB_Ctzhemv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { cgemv_( C2F_CHAR( NOTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, Mptr( YC, Akp, 0, YCld, size ), &ione ); cgemv_( C2F_CHAR( COTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XC, Akp, 0, XCld, size ), &ione, one, Mptr( YR, 0, Akq, YRld, size ), &YRld ); } } } } if( XCfr ) free( XC ); if( XRfr ) free( XR ); if( YisRow ) { /* * Combine the partial column results into YC */ if( YCsum ) { YCd[CSRC_] = 0; if( Amp > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Ccgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, 0 ); } } /* * Combine the partial row results into YR */ if( YRsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Ccgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], YRd[RSRC_], mycol ); } /* * YR := YR + YC */ PB_Cpaxpby( type, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, YR, 0, 0, YRd, ROW ); /* * sub( Y ) := beta * sub( Y ) + YR (if necessary) */ if( YRpbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW ); } } else { /* * Combine the partial row results into YR */ if( YRsum ) { YRd[RSRC_] = 0; if( Anq > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Ccgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], 0, mycol ); } } /* * Combine the partial column results into YC */ if( YCsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Ccgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, YCd[CSRC_] ); } /* * YC := YR + YC */ PB_Cpaxpby( type, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, YC, 0, 0, YCd, COLUMN ); /* * sub( Y ) := beta * sub( Y ) + YC (if necessary) */ if( YCpbY ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN ); } } if( YCfr ) free( YC ); if( YRfr ) free( YR ); /* * End of PCHEMV */ } scalapack-2.0.2/PBLAS/SRC/pcher2_.c000644 000766 000024 00000043322 10363532303 016671 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcher2_( F_CHAR_T UPLO, int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY, float * A, int * IA, int * JA, int * DESCA ) #else void pcher2_( UPLO, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PCHER2 performs the Hermitian rank 2 operation * * sub( A ) := alpha*sub( X )*conjg( sub( Y )' ) + * conjg( alpha )*sub( Y )*conjg( sub( X )' ) + sub( A ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) and sub( Y ) are n element subvectors and * sub( A ) is an n by n Hermitian submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the Hermitian submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the Hermitian submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCld, YRfr, YRld, Yi, Yj, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; cmplx Calpha; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0 [DLEN_], XCd0[DLEN_], XRd0[DLEN_], Xd[DLEN_], YCd0[DLEN_], YRd0[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCHER2", "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkvec( ctxt, "PCHER2", "X", *N, 2, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PCHER2", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PCHER2", "A", *N, 2, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PCHER2", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd0, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd0, ROW, &XC, XCd0, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd0, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd0, COLUMN, &XR, XRd0, &XRfr ); } /* * Replicate sub( Y ) in process rows (YR) and process columns (YC) spanned by * sub( A ) */ if( *INCY == Yd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ROW, &YR, YRd0, &YRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, YR, 0, 0, YRd0, ROW, &YC, YCd0, &YCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, COLUMN, &YC, YCd0, &YCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, YC, 0, 0, YCd0, COLUMN, &YR, YRd0, &YRfr ); } /* * Local rank-2 update if I own some data */ Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd0[LLD_]; YCld = YCd0[LLD_]; XRld = XRd0[LLD_]; YRld = YRd0[LLD_]; Calpha[REAL_PART] = ALPHA[REAL_PART]; Calpha[IMAG_PART] = -ALPHA[IMAG_PART]; /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { cgerc_( &Akp, &Anq0, ((char *) ALPHA), XC, &ione, Mptr( YR, 0, Akq, YRld, size ), &YRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); cgerc_( &Akp, &Anq0, ((char *) Calpha), YC, &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); } PB_Cpsyr2( type, UPPER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, Aptr, k, k, Ad0, PB_Ctzher2 ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsyr2( type, LOWER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, Aptr, k, k, Ad0, PB_Ctzher2 ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { cgerc_( &Amp0, &Anq0, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), &ione, Mptr( YR, 0, Akq, YRld, size ), &YRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); cgerc_( &Amp0, &Anq0, ((char *) Calpha), Mptr( YC, Akp, 0, YCld, size ), &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } } } if( XRfr ) free( XR ); if( XCfr ) free( XC ); if( YRfr ) free( YR ); if( YCfr ) free( YC ); /* * End of PCHER2 */ } scalapack-2.0.2/PBLAS/SRC/pcher2k_.c000644 000766 000024 00000052764 10363532303 017056 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcher2k_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pcher2k_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; float * A, * B, * C; #endif { /* * Purpose * ======= * * PCHER2K performs one of the Hermitian rank 2k operations * * sub( C ) := alpha*sub( A )*conjg( sub( B ) )' + * conjg( alpha )*sub( B )*conjg( sub( A ) )' + * beta*sub( C ), * * or * * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) + * conjg( alpha )*conjg( sub( B )' )*sub( A ) + * beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise, and, * * sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N', * B(IB:IB+K-1,JB:JB+N-1) otherwise. * * Alpha and beta are scalars with beta real, sub( C ) is an n by n * Hermitian submatrix and sub( A ) and sub( B ) are n by k submatrices * in the first case and k by n submatrices in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the Hermitian submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*conjg( sub( B )' ) + * conjg( alpha )*sub( B )*conjg( sub( A )' ) + * beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) + * conjg( alpha )*conjg( sub( B )' )*sub( A ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrices sub( A ) and sub( B ), and on * entry with TRANS = 'C' or 'c', K specifies the number of rows * of the submatrices sub( A ) and sub( B ). K must be at least * zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and B corresponding to the entries of the submatrices * sub( A ) and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JB+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( B ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( B ). * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the Hermitian submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the Hermitian submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( C ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double ABCest, ABest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCHER2K", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCHER2K", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) { PB_Cchkmat( ctxt, "PCHER2K", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCHER2K", "B", *N, 3, *K, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PCHER2K", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCHER2K", "B", *K, 4, *N, 3, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PCHER2K", "C", *N, 3, *N, 3, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PCHER2K", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) && ( ( BETA[REAL_PART] == ONE ) ) ) ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * And when alpha or K is zero */ if( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, CONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (K >> N) * ABest : only sub( A ) and sub( B ) are communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[NB_], npcol ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[CSRC_] == -1 ) && ( Bd[CSRC_] == -1 ) ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + TWO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *K, Ad[MB_], nprow ); tmp4 = DNROC( *K, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[RSRC_] == -1 ) && ( Bd[RSRC_] == -1 ) ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + TWO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.4 * ABCest ) <= ABest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseABC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kAC( type, &DirA, CONJG, &UploC, ( notran ? NOTRAN : COTRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kA( type, &DirA, CONJG, &UploC, ( notran ? NOTRAN : COTRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PCHER2K */ } scalapack-2.0.2/PBLAS/SRC/pcher_.c000644 000766 000024 00000034464 10363532303 016616 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcher_( F_CHAR_T UPLO, int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * A, int * IA, int * JA, int * DESCA ) #else void pcher_( UPLO, N, ALPHA, X, IX, JX, DESCX, INCX, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX; float * A, * X; #endif { /* * Purpose * ======= * * PCHER performs the Hermitian rank 1 operation * * sub( A ) := alpha*sub( X )*conjg( sub( X )' ) + sub( A ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Alpha is a real scalar, sub( X ) is an n element subvector and * sub( A ) is an n by n Hermitian submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X * corresponding to the entries of the subvector sub( X ) need * not be set on input. * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the Hermitian submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the Hermitian submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; cmplx Calpha; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XCd0[DLEN_], XRd0[DLEN_], Xd[DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCHER", "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkvec( ctxt, "PCHER", "X", *N, 2, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkmat( ctxt, "PCHER", "A", *N, 2, *N, 2, Ai, Aj, Ad, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PCHER", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd0, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd0, ROW, &XC, XCd0, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd0, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd0, COLUMN, &XR, XRd0, &XRfr ); } /* * Local rank-1 update if I own some data */ Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); XCld = XCd0[LLD_]; XRld = XRd0[LLD_]; Calpha[REAL_PART] = ALPHA[REAL_PART]; Calpha[IMAG_PART] = ZERO; if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) cgerc_( &Akp, &Anq0, ((char *) Calpha), XC, &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); PB_Cpsyr( type, UPPER, kb, 1, ((char *) Calpha), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Aptr, k, k, Ad0, PB_Ctzher ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsyr( type, LOWER, kb, 1, ((char *) Calpha), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Aptr, k, k, Ad0, PB_Ctzher ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) cgerc_( &Amp0, &Anq0, ((char *) Calpha), Mptr( XC, Akp, 0, XCld, size ), &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } } if( XRfr ) free( XR ); if( XCfr ) free( XC ); /* * End of PCHER */ } scalapack-2.0.2/PBLAS/SRC/pcherk_.c000644 000766 000024 00000045467 10363532303 016776 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcherk_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pcherk_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * K, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PCHERK performs one of the Hermitian rank k operations * * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + beta*sub( C ), * * or * * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and, * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise. * * Alpha and beta are real scalars, sub( C ) is an n by n Hermitian * submatrix and sub( A ) is an n by k submatrix in the first case and a * k by n submatrix in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the Hermitian submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + * beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry, with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrix sub( A ), and with TRANS = 'C' or * 'c', K specifies the number of rows of the submatrix * sub( A ). K must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the Hermitian submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the Hermitian submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( C ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, ChooseAC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double Aest, ACest, tmp1, tmp2, tmp3, tmp4; cmplx Calph; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCHERK", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCHERK", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PCHERK", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PCHERK", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCHERK", "C", *N, 3, *N, 3, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PCHERK", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * And when alpha or K is zero */ if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, CONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif Calph[REAL_PART] = ALPHA[REAL_PART]; Calph[IMAG_PART] = ZERO; /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ACest: both operands sub( A ) and sub( C ) are communicated (K >> N) * Aest : only sub( A ) is communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Ad[MB_], nprow ); Aest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); Aest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseAC = ( ( 1.3 * ACest ) <= Aest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseAC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkAC( type, &DirA, CONJG, &UploC, ( notran ? NOTRAN : COTRAN ), *N, *K, ((char *)Calph), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkA( type, &DirA, CONJG, &UploC, ( notran ? NOTRAN : COTRAN ), *N, *K, ((char *)Calph), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PCHERK */ } scalapack-2.0.2/PBLAS/SRC/pcscal_.c000644 000766 000024 00000022401 10602576752 016762 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcscal_( int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pcscal_( N, ALPHA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCX; float * X; #endif { /* * Purpose * ======= * * PCSCAL multiplies an n element subvector sub( X ) by the scalar * alpha, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input/local output) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the scaled * subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PCSCAL", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PCSCAL", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( Xd[CTXT_], &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Start the operations */ if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { if ( ( ALPHA[REAL_PART] == ONE ) && ( ALPHA[IMAG_PART] == ZERO ) ) return; /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cctypeset(); if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { cset_( &Xnq, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } else { cscal_( &Xnq, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { if ( ( ALPHA[REAL_PART] == ONE ) && ( ALPHA[IMAG_PART] == ZERO ) ) return; /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { type = PB_Cctypeset(); if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { cset_( &Xnp, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } else { cscal_( &Xnp, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } } } return; } /* * End of PCSCAL */ } scalapack-2.0.2/PBLAS/SRC/pcsscal_.c000644 000766 000024 00000022075 10363532303 017140 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcsscal_( int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pcsscal_( N, ALPHA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCX; float * X; #endif { /* * Purpose * ======= * * PCSSCAL multiplies an n element subvector sub( X ) by the real scalar * alpha, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input/local output) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the scaled * subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PCSSCAL", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PCSSCAL", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ALPHA[REAL_PART] == ONE ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( Xd[CTXT_], &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Start the operations */ if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cctypeset(); if( ALPHA[REAL_PART] == ZERO ) { cset_( &Xnq, type->zero, Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } else { csscal_( &Xnq, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { type = PB_Cctypeset(); if( ALPHA[REAL_PART] == ZERO ) { cset_( &Xnp, type->zero, Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } else { csscal_( &Xnp, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } } } return; } /* * End of PCSSCAL */ } scalapack-2.0.2/PBLAS/SRC/pcswap_.c000644 000766 000024 00000076205 10363532303 017011 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcswap_( int * N, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pcswap_( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; /* * .. Array Arguments .. */ int * DESCX, * DESCY; float * X, * Y; #endif { /* * Purpose * ======= * * PCSWAP swaps two subvectors, * * sub( Y ) := sub( X ) and sub( X ) := sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * swapped. N must be at least zero. * * X (local input/local output) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with sub( Y ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) COMPLEX array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Xscope, Yscope, * one, * top, tran, * zero; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, XisD, XisR, Xinb1D, XinbD, XisRow, Xii, Xj, Xjj, Xld, Xlinc, Xm, XmyprocD, XmyprocR, Xn, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xroc, Xrow, Ycol, Yi, Yii, Yinb1D, YinbD, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, Ym, YmyprocD, YmyprocR, Yn, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yroc, Yrow, cdst, csrc, ctxt, dst, gcdPQ, info, ione=1, k, l, lcmPQ, lcmb, mycol, myrow, npcol, npq, nprow, p, q, rdst, rsrc, src, size; PBTYP_T * type; PB_VM_T VM; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; char * buf = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 501 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PCSWAP", "X", *N, 1, Xi, Xj, Xd, *INCX, 5, &info ); PB_Cchkvec( ctxt, "PCSWAP", "Y", *N, 1, Yi, Yj, Yd, *INCY, 10, &info ); } if( info ) { PB_Cabort( ctxt, "PCSWAP", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_ ]; XnbD = Xd[NB_ ]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_ ]; XnbD = Xd[MB_ ]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_ ]; YnbD = Yd[NB_ ]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_ ]; YnbD = Yd[MB_ ]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is distributed but not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector to * swap sub( X ) with. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cctypeset(); size = type->size; /* * A purely local operation occurs iff the operands start in the same process * and, if either the grid is mono-dimensional or there is a single local block * to be swapped or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { cswap_( &XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } if( RRorCC && XisR && YisR ) return; } } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to swap the distributed vectors in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( RRorCC && ( XprocR != YprocR ) ) { /* * Both operands are distributed in the same direction, but reside in different * process rows or columns. */ if( XmyprocR == XprocR ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { dst = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); dst = MPosMod( dst, YnprocsD ); if( XisRow ) { Ccgesd2d( ctxt, 1, XnpD, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, YprocR, dst ); Ccgerv2d( ctxt, 1, XnpD, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, YprocR, dst ); } else { Ccgesd2d( ctxt, XnpD, 1, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, dst, YprocR ); Ccgerv2d( ctxt, XnpD, 1, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, dst, YprocR ); } } } if( YmyprocR == YprocR ) { YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( YisRow ) { Ccgesd2d( ctxt, 1, YnpD, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, XprocR, dst ); Ccgerv2d( ctxt, 1, YnpD, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, XprocR, dst ); } else { Ccgesd2d( ctxt, YnpD, 1, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, dst, XprocR ); Ccgerv2d( ctxt, YnpD, 1, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, dst, XprocR ); } } } } else { /* * General case when just one message needs to be exchanged */ if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) send it to the corresponding * process owning s piece of sub ( Y ). */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { dst = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); dst = MPosMod( dst, YnprocsD ); if( YisRow ) { rdst = YprocR; cdst = dst; } else { rdst = dst; cdst = YprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { cswap_( &XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( XisRow ) Ccgesd2d( ctxt, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, rdst, cdst ); else Ccgesd2d( ctxt, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, rdst, cdst ); } } } if( YmyprocR == YprocR ) { /* * The processes owning a piece of sub( Y ) receive the corresponding piece * of sub( X ) and send the piece of sub( Y ) they own to the same process. */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { src = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); src = MPosMod( src, XnprocsD ); if( XisRow ) { rsrc = XprocR; csrc = src; } else { rsrc = src; csrc = XprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( YnpD * size ); if( XisRow ) Ccgerv2d( ctxt, 1, YnpD, buf, 1, rsrc, csrc ); else Ccgerv2d( ctxt, YnpD, 1, buf, YnpD, rsrc, csrc ); if( YisRow ) Ccgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rsrc, csrc ); else Ccgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rsrc, csrc ); ccopy_( &YnpD, buf, &ione, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); if( buf ) free( buf ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding piece * of sub( Y ). */ if( XnpD > 0 ) { if( ( myrow != rdst ) || ( mycol != cdst ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Ccgerv2d( ctxt, 1, XnpD, buf, 1, rdst, cdst ); else Ccgerv2d( ctxt, XnpD, 1, buf, XnpD, rdst, cdst ); ccopy_( &XnpD, buf, &ione, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc ); if( buf ) free( buf ); } } } } } else if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) ) { /* * General case */ tran = ( RRorCC ? CNOTRAN : CTRAN ); if( XisRow ) { Xscope = CCOLUMN; Xm = 1; rsrc = XprocR; } else { Xscope = CROW; Xn = 1; csrc = XprocR; } if( YisRow ) { Yscope = CCOLUMN; Ym = 1; rdst = YprocR; } else { Yscope = CROW; Yn = 1; cdst = YprocR; } lcmb = PB_Clcm( XnprocsD * XnbD, YnprocsD * YnbD ); one = type->one; zero = type->zero; gcdPQ = PB_Cgcd( XnprocsD, YnprocsD ); lcmPQ = ( XnprocsD / gcdPQ ) * YnprocsD; for( k = 0; k < gcdPQ; k++ ) { p = 0; q = k; for( l = 0; l < lcmPQ; l++ ) { Xroc = MModAdd( XprocD, p, XnprocsD ); Yroc = MModAdd( YprocD, q, YnprocsD ); if( ( XmyprocD == Xroc ) || ( YmyprocD == Yroc ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, Xroc, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, Yroc, YprocD, YnprocsD ); PB_CVMinit( &VM, 0, XnpD, YnpD, Xinb1D, Yinb1D, XnbD, YnbD, p, q, XnprocsD, YnprocsD, lcmb ); if( npq = PB_CVMnpq( &VM ) ) { if( ( RRorCC && ( Xroc == Yroc ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( Xroc == YprocR ) && ( XprocR == Yroc ) ) ) { /* * If I am at the intersection of the process cross, or simply common to the * process rows or columns owning sub( X ) and sub( Y ) */ if( ( YmyprocD == Yroc ) && ( YmyprocR == YprocR ) ) { PB_CVMswp( type, &VM, ROW, &Xscope, &tran, npq, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Ylinc ); } } else { /* * Perform the message exchange: pack the data I own, send it, receive the * remote data, and unpack it. */ if( ( XmyprocR == XprocR ) && ( XmyprocD == Xroc ) ) { if( XisRow ) { Xn = npq; } else { Xm = npq; } if( YisRow ) { Yn = npq; cdst = Yroc; } else { Ym = npq; rdst = Yroc; } buf = PB_Cmalloc( npq * size ); PB_CVMpack( type, &VM, ROW, &Xscope, PACKING, NOTRAN, npq, 1, one, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, zero, buf, Xm ); Ccgesd2d( ctxt, Xm, Xn, buf, Xm, rdst, cdst ); Ccgerv2d( ctxt, Ym, Yn, buf, Ym, rdst, cdst ); PB_CVMpack( type, &VM, ROW, &Xscope, UNPACKING, &tran, npq, 1, zero, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, one, buf, Ym ); if( buf ) free ( buf ); } if( ( YmyprocR == YprocR ) && ( YmyprocD == Yroc ) ) { if( XisRow ) { Xn = npq; csrc = Xroc; } else { Xm = npq; rsrc = Xroc; } if( YisRow ) { Yn = npq; } else { Ym = npq; } buf = PB_Cmalloc( npq * size ); PB_CVMpack( type, &VM, COLUMN, &Yscope, PACKING, NOTRAN, npq, 1, one, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, zero, buf, Ym ); Ccgesd2d( ctxt, Ym, Yn, buf, Ym, rsrc, csrc ); Ccgerv2d( ctxt, Xm, Xn, buf, Xm, rsrc, csrc ); PB_CVMpack( type, &VM, COLUMN, &Yscope, UNPACKING, &tran, npq, 1, zero, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, one, buf, Xm ); if( buf ) free ( buf ); } } } } p = MModAdd1( p, XnprocsD ); q = MModAdd1( q, YnprocsD ); } } } if( XisR ) { /* * Replicate sub( X ) when necessary */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Ccgebs2d( ctxt, COLUMN, top, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld ); else Ccgebr2d( ctxt, COLUMN, top, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Ccgebs2d( ctxt, ROW, top, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld ); else Ccgebr2d( ctxt, ROW, top, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, XmyprocD, XprocR ); } } } if( YisR ) { /* * Replicate sub( Y ) when necessary */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocR == YprocR ) Ccgebs2d( ctxt, COLUMN, top, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld ); else Ccgebr2d( ctxt, COLUMN, top, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, YprocR, YmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocR == YprocR ) Ccgebs2d( ctxt, ROW, top, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld ); else Ccgebr2d( ctxt, ROW, top, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, YmyprocD, YprocR ); } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ PB_CpswapND( PB_Cctypeset(), *N, ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ PB_CpswapND( PB_Cctypeset(), *N, ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX ); } else { /* * Neither sub( X ) nor sub( Y ) are distributed. */ PB_CpswapNN( PB_Cctypeset(), *N, ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY ); } /* * End of PCSWAP */ } scalapack-2.0.2/PBLAS/SRC/pcsymm_.c000644 000766 000024 00000053243 10363532303 017021 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcsymm_( F_CHAR_T SIDE, F_CHAR_T UPLO, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pcsymm_( SIDE, UPLO, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T SIDE, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; float * A, * B, * C; #endif { /* * Purpose * ======= * * PCSYMM performs one of the matrix-matrix operations * * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * or * * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha and beta are scalars, sub( A ) is a symmetric submatrix and * sub( B ) and sub( C ) are m by n submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether the symmetric submatrix * sub( A ) appears on the left or right in the operation as * follows: * * SIDE = 'L' or 'l' * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * SIDE = 'R' or 'r' * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with SIDE = 'L' or 'l', this array contains * the local entries corresponding to the entries of the m by m * symmetric submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the symmetric submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * Before entry with SIDE = 'R' or 'r', this array contains * the local entries corresponding to the entries of the n by n * symmetric submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the symmetric submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, SideOp, UploA, cbtop, cbtopsave, cctop, cctopsave, rbtop, rbtopsave, rctop, rctopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, lside, mycol, myrow, nb, npcol, nprow, upper; double ABCest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PCSYMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCSYMM", "Illegal UPLO = %c\n", UploA ); info = -2; } if( lside ) { PB_Cchkmat( ctxt, "PCSYMM", "A", *M, 3, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCSYMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PCSYMM", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCSYMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PCSYMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PCSYMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) && ( ( BETA [REAL_PART] == ONE ) && ( BETA [IMAG_PART] == ZERO ) ) ) ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * If alpha is zero, sub( C ) := beta * sub( C ). */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else if( !( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (N >> M) * BCest : Both operands sub( B ) and sub( C ) are communicated (M >> N) */ if( lside ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ABCest = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp1 = DNROC( *N, Ad[NB_], npcol ); tmp2 = DNROC( *M, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); tmp4 = DNROC( *N, Cd[NB_], npcol ); BCest = (double)(*M) * ( ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.5 * ABCest ) <= BCest ); /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ChooseABC ) { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } } if( lside ) DirAB = ( rbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAB = ( cbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CpsymmAB( type, &DirAB, NOCONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_DEFAULT ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); } } if( lside ) DirAB = ( ( rbtop == CTOP_DRING || rctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); else DirAB = ( ( cbtop == CTOP_DRING || cctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); PB_CpsymmBC( type, &DirAB, NOCONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rbtopsave = *PB_Ctop( &ctxt, BCAST, ROW, &rbtopsave ); rctopsave = *PB_Ctop( &ctxt, COMBINE, ROW, &rctopsave ); cbtopsave = *PB_Ctop( &ctxt, BCAST, COLUMN, &cbtopsave ); cctopsave = *PB_Ctop( &ctxt, COMBINE, COLUMN, &cctopsave ); } /* * End of PCSYMM */ } scalapack-2.0.2/PBLAS/SRC/pcsyr2k_.c000644 000766 000024 00000052265 10363532303 017111 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcsyr2k_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pcsyr2k_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; float * A, * B, * C; #endif { /* * Purpose * ======= * * PCSYR2K performs one of the symmetric rank 2k operations * * sub( C ) := alpha*sub( A )*sub( B )' + alpha*sub( B )*sub( A )' + * beta*sub( C ), * * or * * sub( C ) := alpha*sub( A )'*sub( B ) + alpha*sub( B )'*sub( A ) + * beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise, and, * * sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N', * B(IB:IB+K-1,JB:JB+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * submatrix and sub( A ) and sub( B ) are n by k submatrices in the * first case and k by n submatrices in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the symmetric submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( B )' + * alpha*sub( B )*sub( A )' + * beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrices sub( A ) and sub( B ), and on * entry with TRANS = 'T' or 't', K specifies the number of rows * of the submatrices sub( A ) and sub( B ). K must be at least * zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and B corresponding to the entries of the submatrices * sub( A ) and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JB+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( B ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( B ). * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double ABCest, ABest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCSYR2K", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCSYR2K", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) { PB_Cchkmat( ctxt, "PCSYR2K", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCSYR2K", "B", *N, 3, *K, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PCSYR2K", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCSYR2K", "B", *K, 4, *N, 3, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PCSYR2K", "C", *N, 3, *N, 3, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PCSYR2K", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) && ( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * And when alpha or K is zero */ if( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (K >> N) * ABest : only sub( A ) and sub( B ) are communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[NB_], npcol ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[CSRC_] == -1 ) && ( Bd[CSRC_] == -1 ) ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + TWO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *K, Ad[MB_], nprow ); tmp4 = DNROC( *K, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[RSRC_] == -1 ) && ( Bd[RSRC_] == -1 ) ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + TWO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.4 * ABCest ) <= ABest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseABC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PCSYR2K */ } scalapack-2.0.2/PBLAS/SRC/pcsyrk_.c000644 000766 000024 00000045113 10363532303 017021 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pcsyrk_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pcsyrk_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * K, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PCSYRK performs one of the symmetric rank k operations * * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * * or * * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and, * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * submatrix and sub( A ) is an n by k submatrix in the first case and a * k by n submatrix in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the symmetric submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry, with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrix sub( A ), and with TRANS = 'T' or * 't', K specifies the number of rows of the submatrix * sub( A ). K must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, ChooseAC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double Aest, ACest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCSYRK", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCSYRK", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PCSYRK", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PCSYRK", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCSYRK", "C", *N, 3, *N, 3, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PCSYRK", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) && ( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * And when alpha or K is zero */ if( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ACest: both operands sub( A ) and sub( C ) are communicated (K >> N) * Aest : only sub( A ) is communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Ad[MB_], nprow ); Aest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); Aest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseAC = ( ( 1.3 * ACest ) <= Aest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseAC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PCSYRK */ } scalapack-2.0.2/PBLAS/SRC/pctradd_.c000644 000766 000024 00000033714 10363532303 017133 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pctradd_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pctradd_( UPLO, TRANS, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PCTRADD adds a trapezoidal matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if TRANS = 'N', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'T', * conjg(A(IA:IA+N-1,JA:JA+M-1)') if TRANS = 'C', * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * upper or lower trapezoidal submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of the * array C containing the upper or lower triangular part of the * triangular submatrix sub( C ) is to be referenced as follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * triangular submatrix sub( C ) is to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * triangular submatrix sub( C ) is to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the form of op( sub( A ) ) to be * used in the matrix addition as follows: * * TRANS = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANS = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANS = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ) when TRANS = 'N' or 'n' and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u' and TRANS = 'N' or 'n' or * UPLO = 'L' or 'l' and TRANS = 'T', 'C', 't' or 'c', this ar- * ray contains the local entries corresponding to the entries * of the upper triangular submatrix sub( A ), and the local en- * tries corresponding to the entries of the strictly lower tri- * angular part of the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l' and TRANS = 'N' or 'n' or * UPLO = 'U' or 'u' and TRANS = 'T', 'C', 't' or 'c', this ar- * ray contains the local entries corresponding to the entries * of the lower triangular submatrix sub( A ), and the local en- * tries corresponding to the entries of the strictly upper tri- * angular part of the submatrix sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the triangular submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the triangular submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAC, TranOp, UploC, ctop, rtop; int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, notran, npcol, nprow, upper; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRADD", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRADD", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PCTRADD", "A", *M, 3, *N, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PCTRADD", "A", *N, 4, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PCTRADD", "C", *M, 3, *N, 4, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PCTRADD", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( PB_Cctypeset(), &UploC, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cctypeset(), &UploC, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ /* * This operation mainly involves point-to-point send and receive communication. * There is therefore no particular BLACS topology to recommend. Still, one can * choose the main loop direction in which the operands will be added, but not * transposed. This selection is based on the current setting for the BLACS * broadcast operations. */ rtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( *M <= *N ) DirAC = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_Cptradd( PB_Cctypeset(), &DirAC, &UploC, ( notran ? NOTRAN : ( ( TranOp == CCOTRAN ) ? COTRAN : TRAN ) ), *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PCTRADD */ } scalapack-2.0.2/PBLAS/SRC/pctranc_.c000644 000766 000024 00000022716 10363532303 017144 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pctranc_( int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pctranc_( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ int * IA, * IC, * JA, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PCTRANC transposes a matrix * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+M-1), and, * * op( X ) = conjg( X )'. * * Thus, op( sub( A ) ) denotes conjg( A(IA:IA+N-1,JA:JA+M-1)' ). * * Beta is a scalar, sub( C ) is an m by n submatrix, and sub( A ) is an * n by m submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkmat( ctxt, "PCTRANC", "A", *N, 2, *M, 1, Ai, Aj, Ad, 7, &info ); PB_Cchkmat( ctxt, "PCTRANC", "C", *M, 1, *N, 2, Ci, Cj, Cd, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PCTRANC", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( PB_Cctypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cctypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ PB_Cptran( PB_Cctypeset(), CONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PCTRANC */ } scalapack-2.0.2/PBLAS/SRC/pctranu_.c000644 000766 000024 00000022664 10363532303 017170 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pctranu_( int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pctranu_( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ int * IA, * IC, * JA, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PCTRANU transposes a matrix * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+M-1), and, op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+N-1,JA:JA+M-1)'. * * Beta is a scalar, sub( C ) is an m by n submatrix, and sub( A ) is an * n by m submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkmat( ctxt, "PCTRANU", "A", *N, 2, *M, 1, Ai, Aj, Ad, 7, &info ); PB_Cchkmat( ctxt, "PCTRANU", "C", *M, 1, *N, 2, Ci, Cj, Cd, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PCTRANU", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( PB_Cctypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cctypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ PB_Cptran( PB_Cctypeset(), NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PCTRANU */ } scalapack-2.0.2/PBLAS/SRC/pctrmm_.c000644 000766 000024 00000052427 10363532303 017016 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pctrmm_( F_CHAR_T SIDE, F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB ) #else void pctrmm_( SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, SIDE, TRANS, UPLO; int * IA, * IB, * JA, * JB, * M, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCB; float * A, * B; #endif { /* * Purpose * ======= * * PCTRMM performs one of the matrix-matrix operations * * sub( B ) := alpha * op( sub( A ) ) * sub( B ), * * or * * sub( B ) := alpha * sub( B ) * op( sub( A ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, sub( B ) is an m by n submatrix, sub( A ) is a * unit, or non-unit, upper or lower triangular submatrix and op( X ) is * one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether op( sub( A ) ) multiplies * sub( B ) from the left or right as follows: * * SIDE = 'L' or 'l' sub( B ) := alpha*op( sub( A ) )*sub( B ), * * SIDE = 'R' or 'r' sub( B ) := alpha*sub( B )*op( sub( A ) ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) COMPLEX array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n transformed submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, DirB, OpC, OpR, SideOp, TopC, TopR, TranOp, UploA, Var, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ForceTop, ctxt, info, lside, mycol, myrow, nb, notran, nounit, npcol, nprow, upper; double ABestL, ABestR, Best, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1101 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRMM", "Illegal UPLO = %c\n", UploA ); info = -2; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRMM", "Illegal TRANS = %c\n", TranOp ); info = -3; } if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRMM", "Illegal DIAG = %c\n", DiagA ); info = -4; } if( lside ) PB_Cchkmat( ctxt, "PCTRMM", "A", *M, 5, *M, 5, Ai, Aj, Ad, 11, &info ); else PB_Cchkmat( ctxt, "PCTRMM", "A", *N, 6, *N, 6, Ai, Aj, Ad, 11, &info ); PB_Cchkmat( ctxt, "PCTRMM", "B", *M, 5, *N, 6, Bi, Bj, Bd, 15, &info ); } if( info ) { PB_Cabort( ctxt, "PCTRMM", info ); return; } #endif /* * Quick return if possible */ if( *M == 0 || *N == 0 ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) B), Bi, Bj, Bd ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABestR, ABestL : both operands sub( A ) and sub( B ) are communicated * ( N >> M when SIDE is left and M >> N otherwise ) * Best : only sub( B ) is communicated * ( M >> N when SIDE is left and N >> M otherwise ) */ if( lside ) { if( notran ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); ChooseAB = ( ( 1.1 * ABestR ) <= Best ); } else { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestL = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + MAX( tmp2, tmp1 ) / TWO ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); ChooseAB = ( ( ( 1.1 * ABestL ) <= Best ) || ( ( 1.1 * ABestR ) <= Best ) ); } } else { if( notran ) { tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); ChooseAB = ( ( 1.1 * ABestR ) <= Best ); } else { tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestL = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + CBRATIO * ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + MAX( tmp2, tmp1 ) / TWO ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); ChooseAB = ( ( ( 1.1 * ABestL ) <= Best ) || ( ( 1.1 * ABestR ) <= Best ) ); } } /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); if( ChooseAB ) { if( lside ) { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { if( ABestL <= ABestR ) { OpR = CBCAST; OpC = CCOMBINE; Var = CLEFT; if( upper ) { TopR = CTOP_DRING; TopC = CTOP_IRING; } else { TopR = CTOP_IRING; TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } } } else { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { if( ABestL <= ABestR ) { OpR = CCOMBINE; OpC = CBCAST; Var = CLEFT; if( upper ) { TopR = CTOP_DRING; TopC = CTOP_IRING; } else { TopR = CTOP_IRING; TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } PB_CptrmmAB( type, &Var, &SideOp, &UploA, &TranOp, &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); } else { if( ( lside && notran ) || ( !( lside ) && !( notran ) ) ) { OpR = CCOMBINE; OpC = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } } else { OpR = CBCAST; OpC = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } } if( lside ) DirB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirB = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CptrmmB( type, &DirB, &SideOp, &UploA, &TranOp, &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PCTRMM */ } scalapack-2.0.2/PBLAS/SRC/pctrmv_.c000644 000766 000024 00000047627 10363532303 017035 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pctrmv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pctrmv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; /* * .. Array Arguments .. */ int * DESCA, * DESCX; float * A, * X; #endif { /* * Purpose * ======= * * PCTRMV performs one of the matrix-vector operations * * sub( X ) := sub( A )*sub( X ) or sub( X ) := sub( A )'*sub( X ) * * or * * sub( X ) := conjg( sub( A )' )*sub( X ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * sub( X ) is an n element subvector and sub( A ) is an n by n unit, * or non-unit, upper or lower triangular submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( X ) := sub( A ) * sub( X ). * * TRANS = 'T' or 't' sub( X ) := sub( A )' * sub( X ). * * TRANS = 'C' or 'c' * sub( X ) := conjg( sub( A )' ) * sub( X ). * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the transfor- * med subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Xroc, * one, * tbeta, top, *zero; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XAfr, XAld, Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, YAfr, YAld, YApbY, YAsum, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_]; char * Aptr = NULL, * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRMV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRMV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRMV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PCTRMV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info ); PB_Cchkvec( ctxt, "PCTRMV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PCTRMV", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); size = type->size; one = type->one; zero = type->zero; /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN ); if( notran ) { /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XA, XAd, &XAfr ); /* * Reuse sub( X ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( type, COLUMN, *N, *N, Ad0, 1, one, ((char *) X), Xi, Xj, Xd, &Xroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * If sub( X ) is distributed in (a) process column(s), then zero it. */ if( Xroc == CCOLUMN ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * sub( X ) resides in (a) process columns(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { cset_( &Xnp, zero, Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], size ), &ione ); } } } } else { /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XA, XAd, &XAfr ); /* * Reuse sub( X ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( type, ROW, *N, *N, Ad0, 1, one, ((char *) X), Xi, Xj, Xd, &Xroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * If sub( X ) is distributed in (a) process row(s), then zero it. */ if( Xroc == CROW ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; cset_( &Xnq, zero, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xld ); } } } } /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XAld = XAd[LLD_]; YAld = YAd[LLD_]; /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { cgemv_( TRANS, &Akp, &Anq0, one, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, YA, &ione ); } PB_Cptrm( type, type, LEFT, UPPER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, size ), YAld, PB_Ctztrmv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { cgemv_( TRANS, &Akp, &Anq0, one, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XA, &ione, one, Mptr( YA, 0, Akq, YAld, size ), &YAld ); } PB_Cptrm( type, type, LEFT, UPPER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, size ), YAld, PB_Ctztrmv ); } } } else { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, type, LEFT, LOWER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, size ), YAld, PB_Ctztrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { cgemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, Mptr( YA, Akp, 0, YAld, size ), &ione ); } } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, type, LEFT, LOWER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, size ), YAld, PB_Ctztrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { cgemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, Akp, 0, XAld, size ), &ione, one, Mptr( YA, 0, Akq, YAld, size ), &YAld ); } } } } } if( XAfr ) free( XA ); if( notran ) { /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Ccgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } /* * sub( X ) := YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, YA, 0, 0, YAd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } } else { /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Ccgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } /* * sub( X ) := YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } } if( YAfr ) free( YA ); /* * End of PCTRMV */ } scalapack-2.0.2/PBLAS/SRC/pctrsm_.c000644 000766 000024 00000052373 11622500733 017025 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pctrsm_( F_CHAR_T SIDE, F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB ) #else void pctrsm_( SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, SIDE, TRANS, UPLO; int * IA, * IB, * JA, * JB, * M, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCB; float * A, * B; #endif { /* * Purpose * ======= * * PCTRSM solves one of the matrix equations * * op( sub( A ) )*X = alpha*sub( B ), or * * X*op( sub( A ) ) = alpha*sub( B ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is * a unit, or non-unit, upper or lower triangular submatrix and op( Y ) * is one of * * op( Y ) = Y or op( Y ) = Y' or op( Y ) = conjg( Y' ). * * The submatrix X is overwritten on sub( B ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether op( sub( A ) ) appears on * the left or right of X as follows: * * SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ), * * SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) COMPLEX array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n solution submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, DirB, OpC, OpR, SideOp, TopC, TopR, TranOp, UploA, Var, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ForceTop, ctxt, info, itmp, lside, mycol, myrow, nb, notran, nounit, npcol, nprow, upper; double ABestL, ABestR, Best, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1101 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRSM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRSM", "Illegal UPLO = %c\n", UploA ); info = -2; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRSM", "Illegal TRANS = %c\n", TranOp ); info = -3; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRSM", "Illegal DIAG = %c\n", DiagA ); info = -4; } if( lside ) PB_Cchkmat( ctxt, "PCTRSM", "A", *M, 5, *M, 5, Ai, Aj, Ad, 11, &info ); else PB_Cchkmat( ctxt, "PCTRSM", "A", *N, 6, *N, 6, Ai, Aj, Ad, 11, &info ); PB_Cchkmat( ctxt, "PCTRSM", "B", *M, 5, *N, 6, Bi, Bj, Bd, 15, &info ); } if( info ) { PB_Cabort( ctxt, "PCTRSM", info ); return; } #endif /* * Quick return if possible */ if( *M == 0 || *N == 0 ) return; /* * Get type structure */ type = PB_Cctypeset(); /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) B), Bi, Bj, Bd ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); /* * ABestR, ABestL : both operands sub( A ) and sub( B ) are communicated * ( N >> M when SIDE is left and M >> N otherwise ) * Best : only sub( B ) is communicated * ( M >> N when SIDE is left and N >> M otherwise ) */ if( lside ) { if( notran ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); itmp = MIN( Ad[MB_], Ad[NB_] ); Best = (double)(*N) * ( (double)(CEIL( *M, itmp )) * (double)(itmp) * ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ); ChooseAB = ( ABestR <= ( 2.0 * Best ) ); } else { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestL = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + MAX( tmp2, tmp1 ) / TWO ); itmp = MIN( Ad[MB_], Ad[NB_] ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( (double)(CEIL( *M, itmp )) * (double)(itmp) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ) + MAX( tmp2, tmp3 ) ); ChooseAB = ( ( ABestL <= ( 2.0 * Best ) ) || ( ABestR <= ( 2.0 * Best ) ) ); } } else { if( notran ) { tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); itmp = MIN( Ad[MB_], Ad[NB_] ); Best = (double)(*M) * ( (double)(CEIL( *N, itmp )) * (double)(itmp) * ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ); ChooseAB = ( ABestR <= ( 2.0 * Best ) ); } else { tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestL = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + CBRATIO * ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + MAX( tmp2, tmp1 ) / TWO ); itmp = MIN( Ad[MB_], Ad[NB_] ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( (double)(CEIL( *N, itmp )) * (double)(itmp) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ) + MAX( tmp1, tmp4 ) ); ChooseAB = ( ( ABestL <= ( 2.0 * Best ) ) || ( ABestR <= ( 2.0 * Best ) ) ); } } /* * Var can remain uninitialized but is nevertheless used in PB_CptrsmAB.c * provide a default here. TODO: does this make sense ? *==19891== at 0x44F81B: PB_CptrsmAB (PB_CptrsmAB.c:538) *==19891== by 0x427BE7: pdtrsm_ (pdtrsm_.c:488) *==19891== by 0x405E46: MAIN_ (pdblas3tim.f:727) */ Var = CRIGHT; if( ChooseAB ) { /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ ForceTop = ( ( *M > nb ) && ( *N > nb ) ); if( ForceTop ) { if( lside ) { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { if( ABestL <= ABestR ) { OpR = CBCAST; OpC = CCOMBINE; Var = CLEFT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } } } else { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { if( ABestL <= ABestR ) { OpR = CCOMBINE; OpC = CBCAST; Var = CLEFT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } PB_CptrsmAB( type, &Var, &SideOp, &UploA, &TranOp, &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } } else { /* * BLACS topologies are always enforced. */ if( ( lside && notran ) || ( !lside && !notran ) ) { OpR = CCOMBINE; OpC = CBCAST; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } /* * Remove the next line when the BLACS combine operations support ring * topologies */ TopR = CTOP_DEFAULT; } else { OpR = CBCAST; OpC = CCOMBINE; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } /* * Remove the next line when the BLACS combine operations support ring * topologies */ TopC = CTOP_DEFAULT; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); if( lside ) DirB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirB = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CptrsmB( type, &DirB, &SideOp, &UploA, &TranOp, &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); /* * Restore the BLACS topologies. */ rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PCTRSM */ } scalapack-2.0.2/PBLAS/SRC/pctrsv_.c000644 000766 000024 00000076140 10363532303 017033 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pctrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pctrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; /* * .. Array Arguments .. */ int * DESCA, * DESCX; float * A, * X; #endif { /* * Purpose * ======= * * PCTRSV solves one of the systems of equations * * sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b, or * * conjg( sub( A )' )*sub( X ) = b, * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * b and sub( X ) are n element subvectors and sub( A ) is an n by n * unit, or non-unit, upper or lower triangular submatrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( A ) * sub( X ) = b. * * TRANS = 'T' or 't' sub( A )' * sub( X ) = b. * * TRANS = 'C' or 'c' conjg( sub( A )' ) * sub( X ) = b. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * A (local input) COMPLEX array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On entry, sub( X ) is the n element right-hand side * b. On exit, sub( X ) is overwritten with the solution subvec- * tor. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one, * zero; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb, Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld, XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info, ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_]; char * Aptr = NULL, * XAC = NULL, * XAR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRSV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRSV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PCTRSV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PCTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info ); PB_Cchkvec( ctxt, "PCTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PCTRSV", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cctypeset(); size = type->size; one = type->one; zero = type->zero; negone = type->negone; /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN ); if( notran ) { if( upper ) { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING ); (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process column owning the last * column of sub( A ) */ PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, &XACfr, &XACsum, &XACapbX ); /* * Create vector XAR in process rows spanned by sub( A ) */ PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, &XARsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAC contains the solution scattered in multiple * process columns and XAR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ if( Akp > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( XACsum ) { kbprev = MIN( k, nb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow, Arow, nprow ); Akp -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) cgemv_( TRANS, &ktmp, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol ); Ccgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 0, XACld, size ), XACld, myrow, Asrc ); if( mycol != Asrc ) cset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } if( Akp > 0 && Anq0 > 0 ) cgemv_( TRANS, &Akp, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, XAC, &ione ); } else { if( Anq0 > 0 ) cgemv_( TRANS, &Akp, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, XAC, &ione ); } } } /* * Combine the scattered resulting vector XAC */ if( XACsum && ( Anp > 0 ) ) { Ccgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, XACd[CSRC_] ); } /* * sub( X ) := XAC (if necessary) */ if( XACapbX ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); } else { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process column owning the first * column of sub( A ) */ PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, &XACfr, &XACsum, &XACapbX ); /* * Create vector XAR in process rows spanned by sub( A ) */ PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, &XARsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = 0; k < *N; k += nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAC contains the solution scattered in multiple * process columns and XAR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow ); if( ( Anp0 = Anp - Akp ) > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( XACsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow, nprow ); Anp0 -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) cgemv_( TRANS, &ktmp, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol ); Ccgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 0, XACld, size ), XACld, myrow, Asrc ); if( mycol != Asrc ) cset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } if( Anp0 > 0 && Anq0 > 0 ) cgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione ); } else { if( Anq0 > 0 ) cgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } } } /* * Combine the scattered resulting vector XAC */ if( XACsum && ( Anp > 0 ) ) { Ccgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, XACd[CSRC_] ); } /* * sub( X ) := XAC (if necessary) */ if( XACapbX ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); } } else { if( upper ) { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAR in process row owning the first row * of sub( A ) */ PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, &XARfr, &XARsum, &XARapbX ); /* * Create vector XAC in process columns spanned by sub( A ) */ PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, &XACsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = 0; k < *N; k += nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAR contains the solution scattered in multiple * process rows and XAC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anq0 = Anq - Akq ) > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( XARsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol, npcol ); Anq0 -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) cgemv_( TRANS, &Anp0, &ktmp, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow ); Ccgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, Akq, XARld, size ), XARld, Asrc, mycol ); if( myrow != Asrc ) cset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } if( Anp0 > 0 && Anq0 > 0 ) cgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld ); } else { if( Anp0 > 0 ) cgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } } } /* * Combine the scattered resulting vector XAR */ if( XARsum && ( Anq > 0 ) ) { Ccgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], mycol ); } /* * sub( X ) := XAR (if necessary) */ if( XARapbX ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); } else { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process row owning the last row * of sub( A ) */ PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, &XARfr, &XARsum, &XARapbX ); /* * Create vector XAC in process columns spanned by sub( A ) */ PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, &XACsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAR contains the solution scattered in multiple * process rows and XAC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later * is only locally updated. */ if( Akq > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( XARsum ) { kbprev = MIN( k, nb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol, Acol, npcol ); Akq -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) cgemv_( TRANS, &Anp0, &ktmp, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow ); Ccgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, Akq, XARld, size ), XARld, Asrc, mycol ); if( myrow != Asrc ) cset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } if( Anp0 > 0 && Akq > 0 ) cgemv_( TRANS, &Anp0, &Akq, negone, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, XAR, &XARld ); } else { if( Anp0 > 0 ) cgemv_( TRANS, &Anp0, &Akq, negone, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, XAR, &XARld ); } } } /* * Combine the scattered resulting vector XAR */ if( XARsum && ( Anq > 0 ) ) { Ccgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], mycol ); } /* * sub( X ) := XAR (if necessary) */ if( XARapbX ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); } } if( XACfr ) free( XAC ); if( XARfr ) free( XAR ); /* * End of PCTRSV */ } scalapack-2.0.2/PBLAS/SRC/pdagemv_.c000644 000766 000024 00000045173 10363532303 017137 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdagemv_( F_CHAR_T TRANS, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pdagemv_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PDAGEMV performs one of the matrix-vector operations * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( sub( A )' )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1). * * When TRANS = 'N', * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+M-1), if INCY = M_Y, * Y(IY:IY+M-1,JY:JY), if INCY = 1 and INCY <> M_Y, * and, otherwise * * sub( X ) denotes X(IX:IX,JX:JX+M-1), if INCX = M_X, * X(IX:IX+M-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a real subvector, * sub( X ) is a subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := |alpha|*|sub( A ) | * |sub( X )| + * |beta*sub( Y )|, * * TRANS = 'T' or 't', * sub( Y ) := |alpha|*|sub( A )'| * |sub( X )| + * |beta*sub( Y )|, * * TRANS = 'C' or 'c', * sub( Y ) := |alpha|*|sub( A )'| * |sub( X )| + * |beta*sub( Y )|. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char TrA, Yroc, * one, * tbeta, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, mycol, myrow, nota, npcol, nprow; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDAGEMV", "Illegal TRANS=%c\n", TrA ); info = -1; } PB_Cchkmat( ctxt, "PDAGEMV", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); if( nota ) { PB_Cchkvec( ctxt, "PDAGEMV", "X", *N, 3, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PDAGEMV", "Y", *M, 2, Yi, Yj, Yd, *INCY, 18, &info ); } else { PB_Cchkvec( ctxt, "PDAGEMV", "X", *M, 2, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PDAGEMV", "Y", *N, 3, Yi, Yj, Yd, *INCY, 18, &info ); } } if( info ) { PB_Cabort( ctxt, "PDAGEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = utyp = PB_Cdtypeset(); /* * When alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, utyp->size ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], utyp->size ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( nota ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( utyp, COLUMN, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { dagemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *) A), Aii, Ajj, Ald, type->size), &Ald, XA, &XAd[LLD_], tbeta, YA, &ione ); } if( XAfr ) free( XA ); /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( utyp, ROW, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { dagemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *) A), Aii, Ajj, Ald, type->size ), &Ald, XA, &ione, tbeta, YA, &YAd[LLD_] ); } if( XAfr ) free( XA ); /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, utyp->size ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], utyp->size ), INCY ); } } } one = utyp->one; if( nota ) { PB_Cpaxpby( utyp, NOCONJG, *M, 1, one, YA, 0, 0, YAd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } else { PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PDAGEMV */ } scalapack-2.0.2/PBLAS/SRC/pdamax_.c000644 000766 000024 00000045272 10363532303 016766 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdamax_( int * N, double * AMAX, int * INDX, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pdamax_( N, AMAX, INDX, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * INDX, * IX, * JX, * N; double * AMAX; /* * .. Array Arguments .. */ int * DESCX; double * X; #endif { /* * Purpose * ======= * * PDAMAX computes the global index of the maximum element in absolute * value of a subvector sub( X ). The global index is returned in INDX * and the value of that element is returned in AMAX, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * AMAX (global output) DOUBLE PRECISION array * On exit, AMAX specifies the largest entry in absolute value * of the subvector sub( X ) only in its scope (See below for * further details). * * INDX (global output) INTEGER * On exit, INDX specifies the global index of the maximum ele- * ment in absolute value of the subvector sub( X ) only in its * scope (See below for further details). * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char cbtop, cctop, rbtop, rctop; int Xcol, Xgindx, Xi, Xii, Ximb, Xinb, Xj, Xjj, Xlindx, Xld, Xmb, Xnb, Xnp, Xnq, Xrow, Xsrc, ctxt, dist, dst, idumm, info, k, maxpos, mycol, mydist, myrow, npcol, nprow, src; /* * .. Local Arrays .. */ int Xd[DLEN_]; double work[4]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PDAMAX", "X", *N, 1, Xi, Xj, Xd, *INCX, 7, &info ); if( info ) { PB_Cabort( ctxt, "PDAMAX", info ); return; } #endif /* * Initialize INDX and AMAX */ *INDX = 0; *AMAX = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *INCX == 1 ) && ( Xd[M_] == 1 ) && ( *N == 1 ) ) { /* * Make sure I own some data and compute INDX and AMAX */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { *INDX = *JX; *AMAX = X[Xii+Xjj*Xd[LLD_]]; } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); if( ( rctop == CTOP_DEFAULT ) || ( rctop == CTOP_TREE1 ) ) { /* * Inline the 1-tree combine for communication savings */ Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_]; Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnq > 0 ) { Xld = Xd[LLD_]; Xlindx = Xjj - 1 + idamax_( &Xnq, ((char*)(X+(Xii+Xjj*Xld))), &Xld ); Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol ); work[0] = X[Xii+Xlindx*Xld]; work[1] = ((double)( Xgindx+1 )); } else { work[0] = ZERO; work[1] = ZERO; } /* * Combine the local results using a 1-tree topology within process column 0 * if npcol > 1 or Xcol >= 0, i.e sub( X ) is distributed. */ if( ( npcol >= 2 ) && ( Xcol >= 0 ) ) { mydist = mycol; k = 1; l_10: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, npcol ); Cdgesd2d( ctxt, 2, 1, ((char*)work), 2, myrow, dst ); goto l_20; } else { dist = mycol + k; src = MPosMod( dist, npcol ); if( mycol < src ) { Cdgerv2d( ctxt, 2, 1, ((char*) &work[2]), 2, myrow, src ); if( ABS( work[0] ) < ABS( work[2] ) ) { work[0] = work[2]; work[1] = work[3]; } } mydist >>= 1; } k <<= 1; if( k < npcol ) goto l_10; l_20: /* * Process column 0 broadcasts the combined values of INDX and AMAX within * their process row. */ rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == 0 ) { Cdgebs2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2 ); } else { Cdgebr2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2, myrow, 0 ); } } /* * Set INDX and AMAX to the replicated answers contained in work. If AMAX is * zero, then select a coherent INDX. */ *AMAX = work[0]; *INDX = ( ( *AMAX == ZERO ) ? ( *JX ) : ( (int)(work[1]) ) ); } else { /* * Otherwise use the current topology settings to combine the results */ Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_]; Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnq > 0 ) { /* * Compute the local maximum and its corresponding local index */ Xld = Xd[LLD_]; Xlindx = Xjj - 1 + idamax_( &Xnq, ((char*)(X+(Xii+Xjj*Xld))), &Xld ); *AMAX = X[Xii+Xlindx*Xld]; } else { *AMAX = ZERO; } if( Xcol >= 0 ) { /* * Combine leave on all the local maximum if Xcol >= 0, i.e sub( X ) is * distributed */ Cdgamx2d( ctxt, ROW, &rctop, 1, 1, ((char*)AMAX), 1, &idumm, &maxpos, 1, -1, mycol ); /* * Broadcast the corresponding global index */ if( *AMAX != ZERO ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == maxpos ) { Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol ); *INDX = Xgindx + 1; Cigebs2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1 ); } else { Cigebr2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1, myrow, maxpos ); } } else { /* * If AMAX is zero, then select a coherent INDX. */ *INDX = *JX; } } else { /* * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX. */ *INDX = ( ( *AMAX == ZERO ) ? ( *JX ) : Xlindx + 1 ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ( cctop == CTOP_DEFAULT ) || ( cctop == CTOP_TREE1 ) ) { /* * Inline the 1-tree combine for communication savings */ Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_]; Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnp > 0 ) { Xld = Xd[LLD_]; Xlindx = Xii - 1 + idamax_( &Xnp, ((char*)(X+(Xii+Xjj*Xld))), INCX ); Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow ); work[0] = X[Xlindx+Xjj*Xld]; work[1] = ((double)( Xgindx+1 )); } else { work[0] = ZERO; work[1] = ZERO; } /* * Combine the local results using a 1-tree topology within process row 0 * if nprow > 1 or Xrow >= 0, i.e sub( X ) is distributed. */ if( ( nprow >= 2 ) && ( Xrow >= 0 ) ) { mydist = myrow; k = 1; l_30: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, nprow ); Cdgesd2d( ctxt, 2, 1, ((char*)work), 2, dst, mycol ); goto l_40; } else { dist = myrow + k; src = MPosMod( dist, nprow ); if( myrow < src ) { Cdgerv2d( ctxt, 2, 1, ((char*) &work[2]), 2, src, mycol ); if( ABS( work[0] ) < ABS( work[2] ) ) { work[0] = work[2]; work[1] = work[3]; } } mydist >>= 1; } k <<= 1; if( k < nprow ) goto l_30; l_40: /* * Process row 0 broadcasts the combined values of INDX and AMAX within their * process column. */ cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == 0 ) { Cdgebs2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2 ); } else { Cdgebr2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2, 0, mycol ); } } /* * Set INDX and AMAX to the replicated answers contained in work. If AMAX is * zero, then select a coherent INDX. */ *AMAX = work[0]; *INDX = ( ( *AMAX == ZERO ) ? ( *IX ) : ( (int)(work[1]) ) ); } else { /* * Otherwise use the current topology settings to combine the results */ Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_]; Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnp > 0 ) { /* * Compute the local maximum and its corresponding local index */ Xld = Xd[LLD_]; Xlindx = Xii - 1 + idamax_( &Xnp, ((char*)(X+(Xii+Xjj*Xld))), INCX ); *AMAX = X[Xlindx+Xjj*Xld]; } else { *AMAX = ZERO; } if( Xrow >= 0 ) { /* * Combine leave on all the local maximum if Xrow >= 0, i.e sub( X ) is * distributed. */ Cdgamx2d( ctxt, COLUMN, &cctop, 1, 1, ((char*)AMAX), 1, &maxpos, &idumm, 1, -1, mycol ); /* * Broadcast the corresponding global index */ if( *AMAX != ZERO ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == maxpos ) { Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow ); *INDX = Xgindx + 1; Cigebs2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1 ); } else { Cigebr2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1, maxpos, mycol ); } } else { /* * If AMAX is zero, then select a coherent INDX. */ *INDX = *IX; } } else { /* * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX. */ *INDX = ( ( *AMAX == ZERO ) ? ( *IX ) : Xlindx + 1 ); } } } return; } /* * End of PDAMAX */ } scalapack-2.0.2/PBLAS/SRC/pdasum_.c000644 000766 000024 00000024645 10363532303 017006 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdasum_( int * N, double * ASUM, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pdasum_( N, ASUM, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; double * ASUM; /* * .. Array Arguments .. */ int * DESCX; double * X; #endif { /* * Purpose * ======= * * PDASUM returns the sum of absolute values of the entries of a subvec- * tor sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ASUM (local output) DOUBLE PRECISION * On exit, ASUM specifies the sum of absolute values of the * subvector sub( X ) only in its scope (See below for further * details). * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char top; int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PDASUM", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PDASUM", info ); return; } #endif /* * Initialize ASUM */ *ASUM = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *N == 1 ) && ( *INCX == 1 ) && ( Xd[M_] == 1 ) ) { /* * Make sure I own some data and compute ASUM */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { *ASUM = ABS( X[Xii+Xjj*Xd[LLD_]] ); } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and compute the local sum */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; dvasum_( &Xnq, ((char *) ASUM), ((char *)( X+(Xii+Xjj*Xld) )), &Xld ); } /* * If Xnq <= 0, ASUM is zero (see initialization above) */ if( ( npcol > 1 ) && ( Xcol >= 0 ) ) { /* * Combine the local results if npcol > 1 and Xcol >= 0, i.e sub( X ) is * distributed. */ top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, 1, 1, ((char *)ASUM), 1, -1, mycol ); } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and compute the local sum */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { dvasum_( &Xnp, ((char *) ASUM), ((char *)( X+(Xii+Xjj*Xd[LLD_]) )), INCX ); } /* * If Xnp <= 0, ASUM is zero (see initialization above) */ if( ( nprow > 1 ) && ( Xrow >= 0 ) ) { /* * Combine the local results if nprow > 1 and Xrow >= 0, i.e sub( X ) is * distributed. */ top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, 1, ((char *)ASUM), 1, -1, mycol ); } } return; } /* * End of PDASUM */ } scalapack-2.0.2/PBLAS/SRC/pdasymv_.c000644 000766 000024 00000056003 10363532303 017171 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdasymv_( F_CHAR_T UPLO, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pdasymv_( UPLO, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PDASYMV performs the matrix-vector operation * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a n element real subvec- * tor, sub( X ) is an n element subvector and sub( A ) is an n by n * symmetric submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries of the upper triangular part of the * symmetric submatrix sub( A ), and the local entries of the * strictly lower triangular of sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries of the lower triangular part of the * symmetric submatrix sub( A ), and the local entries of the * strictly upper triangular of sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten by the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCpbY, YCsum, YCld, YRfr, YRpbY, YRsum, YRld, Ycol, Yi, Yii, YisRow, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper, usiz; double * tbeta; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XCd[DLEN_], XRd[DLEN_], Xd[DLEN_], YCd[DLEN_], YRd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, __FILE__, "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkmat( ctxt, "PDASYMV", "A", *N, 2, *N, 2, Ai, Aj, Ad, 7, &info ); PB_Cchkvec( ctxt, "PDASYMV", "X", *N, 2, Xi, Xj, Xd, *INCX, 11, &info ); PB_Cchkvec( ctxt, "PDASYMV", "Y", *N, 2, Yi, Yj, Yd, *INCY, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PDASYMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = utyp = PB_Cdtypeset(); size = usiz = type->size; /* * and when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Reuse sub( Y ) and/or create vectors YR in process rows and YC in process * columns spanned by sub( A ) */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) { PB_CInOutV( utyp, ROW, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW, ((char**)(&tbeta)), &YR, YRd, &YRfr, &YRsum, &YRpbY ); PB_COutV( utyp, COLUMN, INIT, *N, *N, Ad0, 1, &YC, YCd, &YCfr, &YCsum ); } else { PB_CInOutV( utyp, COLUMN, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN, ((char**)(&tbeta)), &YC, YCd, &YCfr, &YCsum, &YCpbY ); PB_COutV( utyp, ROW, INIT, *N, *N, Ad0, 1, &YR, YRd, &YRfr, &YRsum ); } /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd, ROW, &XC, XCd, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd, COLUMN, &XR, XRd, &XRfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd[LLD_]; XRld = XRd[LLD_]; YCld = YCd[LLD_]; YRld = YRd[LLD_]; /* * Scale YR or YC in the case sub( Y ) has been reused */ if( YisRow ) { /* * YR resides in (a) process row(s) */ if( !YRpbY ) { if( ( myrow == YRd[RSRC_] ) || ( YRd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YR */ if( Anq > 0 ) dascal_( &Anq, ((char *) tbeta), YR, &YRld ); } } } else { /* * YC resides in (a) process column(s) */ if( !YCpbY ) { if( ( mycol == YCd[CSRC_] ) || ( YCd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YC */ if( Amp > 0 ) dascal_( &Amp, ((char *) tbeta), YC, &ione ); } } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &utyp->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { dagemv_( C2F_CHAR( NOTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, YC, &ione ); dagemv_( C2F_CHAR( TRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XC, &ione, one, Mptr( YR, 0, Akq, YRld, usiz ), &YRld ); } PB_Cpsym( type, utyp, LEFT, UPPER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, usiz ), YCld, Mptr( YR, 0, Akq, YRld, usiz ), YRld, PB_Ctzasymv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsym( type, utyp, LEFT, LOWER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, usiz ), YCld, Mptr( YR, 0, Akq, YRld, usiz ), YRld, PB_Ctzasymv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { dagemv_( C2F_CHAR( NOTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, Mptr( YC, Akp, 0, YCld, usiz ), &ione ); dagemv_( C2F_CHAR( TRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XC, Akp, 0, XCld, size ), &ione, one, Mptr( YR, 0, Akq, YRld, usiz ), &YRld ); } } } } if( XCfr ) free( XC ); if( XRfr ) free( XR ); if( YisRow ) { /* * Combine the partial column results into YC */ if( YCsum ) { YCd[CSRC_] = 0; if( Amp > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, 0 ); } } /* * Combine the partial row results into YR */ if( YRsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], YRd[RSRC_], mycol ); } /* * YR := YR + YC */ PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, YR, 0, 0, YRd, ROW ); /* * sub( Y ) := beta * sub( Y ) + YR (if necessary) */ if( YRpbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || Yrow < 0 ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, ((char *) Y), Yi, Yj, Yd, ROW ); } } else { /* * Combine the partial row results into YR */ if( YRsum ) { YRd[RSRC_] = 0; if( Anq > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], 0, mycol ); } } /* * Combine the partial column results into YC */ if( YCsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, YCd[CSRC_] ); } /* * YC := YR + YC */ PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, YC, 0, 0, YCd, COLUMN ); /* * sub( Y ) := beta * sub( Y ) + YC (if necessary) */ if( YCpbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || Ycol < 0 ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, COLUMN ); } } if( YCfr ) free( YC ); if( YRfr ) free( YR ); /* * End of PDASYMV */ } scalapack-2.0.2/PBLAS/SRC/pdatrmv_.c000644 000766 000024 00000061531 10363532303 017165 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdatrmv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pdatrmv_( UPLO, TRANS, DIAG, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PDATRMV performs one of the matrix-vector operations * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( sub( A )' )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a real subvector, * sub( X ) is a subvector and sub( A ) is an n by n triangular subma- * trix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := |alpha|*|sub( A )|*|sub( X )| + * |beta*sub( Y )|. * * TRANS = 'T' or 't' * sub( Y ) := |alpha|*|sub( A )'|*|sub( X )| + * |beta*sub( Y )|. * * TRANS = 'C' or 'c' * sub( Y ) := |alpha|*|sub( A )'|*|sub( X )| + * |beta*sub( Y )|. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Yroc, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XAfr, XAld, Xi, Xj, YAfr, YAld, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper, usiz; double * tbeta; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDATRMV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDATRMV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PDATRMV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PDATRMV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkvec( ctxt, "PDATRMV", "X", *N, 4, Xi, Xj, Xd, *INCX, 13, &info ); PB_Cchkvec( ctxt, "PDATRMV", "Y", *N, 4, Yi, Yj, Yd, *INCY, 19, &info ); } if( info ) { PB_Cabort( ctxt, "PDATRMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA [REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = utyp = PB_Cdtypeset(); size = usiz = type->size; /* * and when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( notran ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( utyp, COLUMN, *N, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, ((char**)(&tbeta)), &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( utyp, ROW, *N, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, ((char**)(&tbeta)), &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XAld = XAd[LLD_]; YAld = YAd[LLD_]; /* * Scale YA in the case sub( Y ) has been reused */ if( notran && !( YApbY ) ) { /* * YA resides in (a) process column(s) */ if( ( mycol == YAd[CSRC_] ) || ( YAd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YA */ if( Amp > 0 ) dascal_( &Amp, ((char *) tbeta), YA, &ione ); } } else if( !( notran ) && !( YApbY ) ) { /* * YA resides in (a) process row(s) */ if( ( myrow == YAd[RSRC_] ) || ( YAd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YA */ if( Anq > 0 ) dascal_( &Anq, ((char *) tbeta), YA, &YAld ); } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &utyp->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { dagemv_( TRANS, &Akp, &Anq0, ((char *) ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, YA, &ione ); } PB_Cptrm( type, utyp, LEFT, UPPER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, usiz ), YAld, PB_Ctzatrmv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { dagemv_( TRANS, &Akp, &Anq0, ((char *) ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XA, &ione, one, Mptr( YA, 0, Akq, YAld, usiz ), &YAld ); } PB_Cptrm( type, utyp, LEFT, UPPER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, usiz ), YAld, PB_Ctzatrmv ); } } } else { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, utyp, LEFT, LOWER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, usiz ), YAld, PB_Ctzatrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { dagemv_( TRANS, &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, Mptr( YA, Akp, 0, YAld, usiz ), &ione ); } } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, utyp, LEFT, LOWER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, usiz ), YAld, PB_Ctzatrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { dagemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, Akp, 0, XAld, size ), &ione, one, Mptr( YA, 0, Akq, YAld, usiz ), &YAld ); } } } } } if( XAfr ) free( XA ); if( notran ) { /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } } else { /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } if( notran ) { PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YA, 0, 0, YAd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } else { PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PDATRMV */ } scalapack-2.0.2/PBLAS/SRC/pdaxpy_.c000644 000766 000024 00000022542 10363532303 017014 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdaxpy_( int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pdaxpy_( N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCX, * DESCY; double * X, * Y; #endif { /* * Purpose * ======= * * PDAXPY adds one subvector to another, * * sub( Y ) := sub( Y ) + alpha * sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER. * On entry, N specifies the length of the subvectors to be * added. N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xi, Xj, Yi, Yj, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PDAXPY", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PDAXPY", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PDAXPY", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Get type structure */ type = PB_Cdtypeset(); /* * Start the operations */ if( *INCX == Xd[M_] ) { PB_Cpaxpby( type, NOCONJG, 1, *N, ((char *) ALPHA), ((char *) X), Xi, Xj, Xd, ROW, type->one, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, ((char *) ALPHA), ((char *) X), Xi, Xj, Xd, COLUMN, type->one, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } /* * End of PDAXPY */ } scalapack-2.0.2/PBLAS/SRC/pdcopy_.c000644 000766 000024 00000021631 10363532303 017003 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdcopy_( int * N, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pdcopy_( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; /* * .. Array Arguments .. */ int * DESCX, * DESCY; double * X, * Y; #endif { /* * Purpose * ======= * * PDCOPY copies one subvector into another, * * sub( Y ) := sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * copied. N must be at least zero. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xi, Xj, Yi, Yj, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 501 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PDCOPY", "X", *N, 1, Xi, Xj, Xd, *INCX, 5, &info ); PB_Cchkvec( ctxt, "PDCOPY", "Y", *N, 1, Yi, Yj, Yd, *INCY, 10, &info ); } if( info ) { PB_Cabort( ctxt, "PDCOPY", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Get type structure */ type = PB_Cdtypeset(); /* * Start the operations */ if( *INCX == Xd[M_] ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) X), Xi, Xj, Xd, ROW, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) X), Xi, Xj, Xd, COLUMN, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } /* * End of PDCOPY */ } scalapack-2.0.2/PBLAS/SRC/pddot_.c000644 000766 000024 00000066706 10363532303 016633 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pddot_( int * N, double * DOT, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pddot_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; double * DOT; /* * .. Array Arguments .. */ int * DESCX, * DESCY; double * X, * Y; #endif { /* * Purpose * ======= * * PDDOT forms the dot product of two subvectors, * * DOT := sub( X )**T * sub( Y ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * multiplied. N must be at least zero. * * DOT (local output) DOUBLE PRECISION array * On exit, DOT specifies the dot product of the two subvectors * sub( X ) and sub( Y ) only in their scope (See below for fur- * ther details). * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char scope, * top; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD, Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info, ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src; PBTYP_T * type; VVDOT_T dot; /* * .. Local Arrays .. */ char * buf = NULL; int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PDDOT", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PDDOT", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PDDOT", info ); return; } #endif DOT[REAL_PART] = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Handle degenerate case */ if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) ) { type = PB_Cdtypeset(); PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_]; XnbD = Xd[NB_]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_]; XnbD = Xd[MB_]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_]; YnbD = Yd[NB_]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_]; YnbD = Yd[MB_]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector used * to perform the dot product computation. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cdtypeset(); size = type->size; dot = type->Fvvdotu; /* * A purely operation occurs iff the operands start in the same process and if * either the grid is mono-dimensional or there is a single local block to be * operated with or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } } /* * Combine the local results in sub( X )'s scope */ if( ( XisR && YisR ) || ( XmyprocR == XprocR ) ) { scope = ( XisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); Cdgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 ); } if( RRorCC && XisR && YisR ) return; } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to compute the desired dot-product in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( ( YmyprocR == YprocR ) ) { /* * The processes owning a piece of sub( Y ) send it to the corresponding * process owning s piece of sub ( X ). */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( XisRow ) { rdst = XprocR; cdst = dst; } else { rdst = dst; cdst = XprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( YisRow ) Cdgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); else Cdgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding local * piece of sub( Y ), compute the local dot product and combine the results * within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); src = MPosMod( src, YnprocsD ); if( YisRow ) { rsrc = YprocR; csrc = src; } else { rsrc = src; csrc = YprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Cdgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc ); else Cdgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc ); dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } else { /* * General case, copy sub( Y ) within sub( X )'s scope, compute the local * results and combine them within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XisRow ) { PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt, 1 ); } else { PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt, MAX( 1, XnpD ) ); } if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) ) buf = PB_Cmalloc( XnpD * size ); if( YisRow ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj, Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj, Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } if( XmyprocR == XprocR ) { if( XnpD > 0 ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } /* * Send the DOT product result within sub( Y )'s scope */ if( XisR || YisR ) { /* * Either sub( X ) or sub( Y ) are replicated, so that every process should have * the result -> broadcast it orthogonally from sub( X )'s direction. */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Cdgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Cdgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Cdgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Cdgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD, XprocR ); } } else { /* * Neither sub( X ) nor sub( Y ) are replicated */ if( RRorCC ) { /* * Both sub( X ) are distributed in the same direction -> the process row or * column XprocR sends the result to the process row or column YprocR. */ if( XprocR != YprocR ) { if( XmyprocR == XprocR ) { if( XisRow ) Cdgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR, YmyprocD ); else Cdgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD, YprocR ); } else if( YmyprocR == YprocR ) { if( XisRow ) Cdgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR, XmyprocD ); else Cdgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD, XprocR ); } } } else { /* * Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s * scope, broadcast the result within sub( Y )'s scope. */ if( YmyprocR == YprocR ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocD == XprocR ) Cdgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Cdgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, YprocR, XprocR ); } else { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocD == XprocR ) Cdgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Cdgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, YprocR ); } } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ type = PB_Cdtypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ type = PB_Cdtypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotu ); } else { /* * Neither sub( X ) nor sub( Y ) are distributed */ type = PB_Cdtypeset(); PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); } /* * End of PDDOT */ } scalapack-2.0.2/PBLAS/SRC/pdgeadd_.c000644 000766 000024 00000026515 10363532303 017103 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdgeadd_( F_CHAR_T TRANS, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pdgeadd_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * IC, * JA, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PDGEADD adds a matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if TRANS = 'N', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'T', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'C'. * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the form of op( sub( A ) ) to be * used in the matrix addition as follows: * * TRANS = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANS = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANS = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, DirC, ctop, rtop; int Ai, Aj, Ci, Cj, TrA, ctxt, info, mycol, myrow, npcol, nprow, notran; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !notran ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDGEADD", "Illegal TRANS = %c\n", TrA ); info = -1; } if( notran ) PB_Cchkmat( ctxt, "PDGEADD", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); else PB_Cchkmat( ctxt, "PDGEADD", "A", *N, 3, *M, 2, Ai, Aj, Ad, 8, &info ); PB_Cchkmat( ctxt, "PDGEADD", "C", *M, 2, *N, 3, Ci, Cj, Cd, 13, &info ); } if( info ) { PB_Cabort( ctxt, "PDGEADD", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( PB_Cdtypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cdtypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ /* * This operation mainly involves point-to-point send and receive communication. * There is therefore no particular BLACS topology to recommend. Still, one can * choose the main loop direction in which the operands will be added. This * selection is based on the current setting for the BLACS broadcast operations. */ if( notran ) { rtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( *M <= *N ) { DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirC = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpgeadd( PB_Cdtypeset(), &DirA, &DirC, NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cptran( PB_Cdtypeset(), NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } /* * End of PDGEADD */ } scalapack-2.0.2/PBLAS/SRC/pdgemm_.c000644 000766 000024 00000047700 10363532303 016763 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdgemm_( F_CHAR_T TRANSA, F_CHAR_T TRANSB, int * M, int * N, int * K, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pdgemm_( TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANSA, TRANSB; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; double * A, * B, * C; #endif { /* * Purpose * ======= * * PDGEMM performs one of the matrix-matrix operations * * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * op( X ) = X or op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N', * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T', * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'C', * * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N', * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T', * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'C', * * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) ) * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and * sub( C ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies the form of op( sub( B ) ) to be * used in the matrix multiplication as follows: * * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ), * * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )', * * TRANSB = 'C' or 'c' op( sub( B ) ) = sub( B )'. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * op( sub( A ) ) and of the submatrix sub( C ). M must be at * least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * op( sub( B ) ) and the number of columns of the submatrix * sub( C ). N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns of the submatrix * op( sub( A ) ) and the number of rows of the submatrix * op( sub( B ) ). K must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) DOUBLE PRECISION array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, DirBC, OpC, OpR, TrA, TrB, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ChooseBC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, nota, notb, npcol, nprow; double ABest, ACest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANSA )[0] ) ) == CNOTRAN ); notb = ( ( TrB = Mupcase( F2C_CHAR( TRANSB )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1001 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDGEMM", "Illegal TRANSA = %c\n", TrA ); info = -1; } else if( ( !notb ) && ( TrB != CTRAN ) && ( TrB != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDGEMM", "Illegal TRANSB = %c\n", TrB ); info = -2; } if( nota ) PB_Cchkmat( ctxt, "PDGEMM", "A", *M, 3, *K, 5, Ai, Aj, Ad, 10, &info ); else PB_Cchkmat( ctxt, "PDGEMM", "A", *K, 5, *M, 3, Ai, Aj, Ad, 10, &info ); if( notb ) PB_Cchkmat( ctxt, "PDGEMM", "B", *K, 5, *N, 4, Bi, Bj, Bd, 14, &info ); else PB_Cchkmat( ctxt, "PDGEMM", "B", *N, 4, *K, 5, Bi, Bj, Bd, 14, &info ); PB_Cchkmat( ctxt, "PDGEMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 19, &info ); } if( info ) { PB_Cabort( ctxt, "PDGEMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO || *K == 0 ) && ( BETA [REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cdtypeset(); /* * If alpha or K is zero, sub( C ) := beta * sub( C ). */ if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char * ) C), Ci, Cj, Cd ); } else if( !( BETA[REAL_PART] == ONE ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char * ) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABest: both operands sub( A ) and sub( B ) are communicated (M, N >> K) * ACest: both operands sub( A ) and sub( C ) are communicated (K, N >> M) * BCest: both operands sub( B ) and sub( C ) are communicated (M, K >> N) */ ABest = (double)(*K); ACest = (double)(*M); BCest = (double)(*N); if( notb ) { if( nota ) { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); ABest *= ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *K, Bd[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[MB_], nprow ); BCest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } else { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *M, Ad[NB_], npcol ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ); tmp1 = DNROC( *K, Bd[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ACest *= ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *K, Ad[MB_], nprow ); tmp2 = DNROC( *M, Bd[NB_], npcol ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest *= ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } } else { if( nota ) { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *N, Bd[MB_], nprow ); tmp2 = DNROC( *K, Bd[NB_], npcol ); tmp3 = DNROC( *N, Cd[NB_], npcol ); ACest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *K, Ad[NB_], npcol ); BCest *= CBRATIO * ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); } else { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *N, Bd[MB_], nprow ); tmp2 = DNROC( *K, Bd[NB_], npcol ); tmp3 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *K, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *K, Bd[NB_], npcol ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } } ChooseAB = ( ( ABest <= ( 1.3 * BCest ) ) && ( ABest <= ( 1.3 * ACest ) ) ); ChooseBC = ( ( BCest <= ACest ) && ( ( 1.3 * BCest ) <= ABest ) ); /* * BLACS topologies are enforced iff M, N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) && ( *K > nb ) ); if( ChooseAB ) { OpR = CBCAST; OpC = CBCAST; } else if( ChooseBC ) { if( nota ) { OpR = CCOMBINE; OpC = CBCAST; } else { OpR = CBCAST; OpC = CCOMBINE; } } else { if( notb ) { OpR = CBCAST; OpC = CCOMBINE; } else { OpR = CCOMBINE; OpC = CBCAST; } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_IRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_IRING ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirAB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirBC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); if( ChooseAB ) { PB_CpgemmAB( type, &DirAB, &DirBC, ( nota ? NOTRAN : TRAN ), ( notb ? NOTRAN : TRAN ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else if( ChooseBC ) { PB_CpgemmBC( type, &DirAB, &DirBC, ( nota ? NOTRAN : TRAN ), ( notb ? NOTRAN : TRAN ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { PB_CpgemmAC( type, &DirAB, &DirBC, ( nota ? NOTRAN : TRAN ), ( notb ? NOTRAN : TRAN ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PDGEMM */ } scalapack-2.0.2/PBLAS/SRC/pdgemv_.c000644 000766 000024 00000043176 10363532303 016777 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdgemv_( F_CHAR_T TRANS, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pdgemv_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PDGEMV performs one of the matrix-vector operations * * sub( Y ) := alpha*sub( A ) *sub( X ) + beta*sub( Y ), or * sub( Y ) := alpha*sub( A )'*sub( X ) + beta*sub( Y ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1). * * When TRANS = 'N', * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+M-1), if INCY = M_Y, * Y(IY:IY+M-1,JY:JY), if INCY = 1 and INCY <> M_Y, * and, otherwise * * sub( X ) denotes X(IX:IX,JX:JX+M-1), if INCX = M_X, * X(IX:IX+M-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are scalars, and sub( X ) and sub( Y ) are subvectors * and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := alpha*sub( A ) * sub( X ) + beta*sub( Y ), * * TRANS = 'T' or 't', * sub( Y ) := alpha*sub( A )' * sub( X ) + beta*sub( Y ), * * TRANS = 'C' or 'c', * sub( Y ) := alpha*sub( A )' * sub( X ) + beta*sub( Y ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char TrA, Yroc, * tbeta, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, mycol, myrow, nota, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDGEMV", "Illegal TRANS=%c\n", TrA ); info = -1; } PB_Cchkmat( ctxt, "PDGEMV", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); if( nota ) { PB_Cchkvec( ctxt, "PDGEMV", "X", *N, 3, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PDGEMV", "Y", *M, 2, Yi, Yj, Yd, *INCY, 18, &info ); } else { PB_Cchkvec( ctxt, "PDGEMV", "X", *M, 2, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PDGEMV", "Y", *N, 3, Yi, Yj, Yd, *INCY, 18, &info ); } } if( info ) { PB_Cabort( ctxt, "PDGEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cdtypeset(); /* * When alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; if( BETA[REAL_PART] == ZERO ) { dset_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } else { dscal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { if( BETA[REAL_PART] == ZERO ) { dset_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } else { dscal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( nota ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( type, COLUMN, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { dgemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *)A), Aii, Ajj, Ald, type->size ), &Ald, XA, &XAd[LLD_], tbeta, YA, &ione ); } if( XAfr ) free( XA ); /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, *M, 1, type->one, YA, 0, 0, YAd, COLUMN, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc ); } } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( type, ROW, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { dgemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *)A), Aii, Ajj, Ald, type->size ), &Ald, XA, &ione, tbeta, YA, &YAd[LLD_] ); } if( XAfr ) free( XA ); /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, YA, 0, 0, YAd, ROW, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PDGEMV */ } scalapack-2.0.2/PBLAS/SRC/pdger_.c000644 000766 000024 00000027356 10363532303 016620 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdger_( int * M, int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY, double * A, int * IA, int * JA, int * DESCA ) #else void pdger_( M, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PDGER performs the rank 1 operation * * sub( A ) := alpha*sub( X )*sub( Y )' + sub( A ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) is an m element subvector, sub( Y ) is * an n element subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+M-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+M-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( A ) are overwritten by the * local entries of the m by n updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, Yi, Yj, ctxt, info, ione=1, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd[DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PDGER", "X", *M, 1, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PDGER", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PDGER", "A", *M, 1, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PDGER", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cdtypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Replicate sub( Y ) in process rows spanned by sub( A ) -> YA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ), &YA, YAd, &YAfr ); /* * Local rank-1 update iff I own some data */ Amp = PB_Cnumroc( *M, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { dger_( &Amp, &Anq, ((char *) ALPHA), XA, &ione, YA, &YAd[LLD_], ((char *) (A+(Aii+Ajj*Ald))), &Ald ); } if( XAfr ) free( XA ); if( YAfr ) free( YA ); /* * End of PDGER */ } scalapack-2.0.2/PBLAS/SRC/pdnrm2_.c000644 000766 000024 00000036427 10363532303 016720 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdnrm2_( int * N, double * NORM2, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pdnrm2_( N, NORM2, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; double * NORM2; /* * .. Array Arguments .. */ int * DESCX; double * X; #endif { /* * Purpose * ======= * * PDNRM2 computes the 2-norm of a subvector sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * NORM2 (local output) DOUBLE PRECISION * On exit, NORM2 specifies the 2-norm of the subvector sub( X ) * only in its scope (See below for further details). * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char top; int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, dst, dist, info, k, mycol, mydist, myrow, npcol, nprow, src; double scale, ssq, temp1, temp2; /* * .. Local Arrays .. */ int Xd[DLEN_]; double * Xptr = NULL, work[4]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PDNRM2", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PDNRM2", info ); return; } #endif /* * Initialize NORM2 */ *NORM2 = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *N == 1 ) && ( *INCX == 1 ) && ( Xd[M_] == 1 ) ) { /* * Make sure I own some data and compute NORM2 */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) *NORM2 = ABS( X[Xii+Xjj*Xd[LLD_]] ); return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Initialize SCALE and SSQ */ scale = ZERO; ssq = ONE; /* * Make sure I own some data and compute local sum of squares */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; Xptr = X+(Xii+Xjj*Xld); for( k = 0; k < Xnq; k++ ) { if( *Xptr != ZERO ) { temp1 = ABS( *Xptr ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xptr += Xld; } } /* * If Xnq <= 0, SCALE is zero and SSQ is one (see initialization above) */ if( ( npcol >= 2 ) && ( Xcol >= 0 ) ) { /* * Combine the local sum of squares using a 1-tree topology within process row * 0 if npcol > 1 and Xcol >= 0, i.e sub( X ) is distributed. */ work[0] = scale; work[1] = ssq; mydist = mycol; k = 1; l_10: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, npcol ); Cdgesd2d( ctxt, 2, 1, ((char*) work), 2, myrow, dst ); goto l_20; } else { dist = mycol + k; src = MPosMod( dist, npcol ); if( mycol < src ) { Cdgerv2d( ctxt, 2, 1, ((char*)&work[2]), 2, myrow, src ); if( work[0] >= work[2] ) { if( work[0] != ZERO ) { temp1 = work[2] / work[0]; work[1] = work[1] + ( temp1 * temp1 ) * work[3]; } } else { temp1 = work[0] / work[2]; work[1] = work[3] + ( temp1 * temp1 ) * work[1]; work[0] = work[2]; } } mydist >>= 1; } k <<= 1; if( k < npcol ) goto l_10; l_20: /* * Process column 0 broadcasts the combined values of SCALE and SSQ within their * process row. */ top = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == 0 ) { Cdgebs2d( ctxt, ROW, &top, 2, 1, ((char*)work), 2 ); } else { Cdgebr2d( ctxt, ROW, &top, 2, 1, ((char*)work), 2, myrow, 0 ); } /* * Compute NORM2 redundantly NORM2 = WORK( 1 ) * SQRT( WORK( 2 ) ) */ dasqrtb_( &work[0], &work[1], NORM2 ); } else { /* * Compute NORM2 redundantly ( sub( X ) is not distributed ) */ dasqrtb_( &scale, &ssq, NORM2 ); } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Initialize SCALE and SSQ */ scale = ZERO; ssq = ONE; /* * Make sure I own some data and compute local sum of squares */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { Xptr = X+(Xii+Xjj*Xd[LLD_]); for( k = 0; k < Xnp; k++ ) { if( *Xptr != ZERO ) { temp1 = ABS( *Xptr ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xptr++; } } /* * If Xnp <= 0, SCALE is zero and SSQ is one (see initialization above) */ if( ( nprow >= 2 ) && ( Xrow >= 0 ) ) { /* * Combine the local sum of squares using a 1-tree topology within process * column 0 if nprow > 1 and Xrow >= 0, i.e sub( X ) is distributed. */ work[0] = scale; work[1] = ssq; mydist = myrow; k = 1; l_30: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, nprow ); Cdgesd2d( ctxt, 2, 1, ((char*)work), 2, dst, mycol ); goto l_40; } else { dist = myrow + k; src = MPosMod( dist, nprow ); if( myrow < src ) { Cdgerv2d( ctxt, 2, 1, ((char*)&work[2]), 2, src, mycol ); if( work[0] >= work[2] ) { if( work[0] != ZERO ) { temp1 = work[2] / work[0]; work[1] = work[1] + ( temp1 * temp1 ) * work[3]; } } else { temp1 = work[0] / work[2]; work[1] = work[3] + ( temp1 * temp1 ) * work[1]; work[0] = work[2]; } } mydist >>= 1; } k <<= 1; if( k < nprow ) goto l_30; l_40: /* * Process column 0 broadcasts the combined values of SCALE and SSQ within their * process column */ top = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == 0 ) { Cdgebs2d( ctxt, COLUMN, &top, 2, 1, ((char*)work), 2 ); } else { Cdgebr2d( ctxt, COLUMN, &top, 2, 1, ((char*)work), 2, 0, mycol ); } /* * Compute NORM2 redundantly NORM2 = WORK[0] * SQRT( WORK[1] ) */ dasqrtb_( &work[0], &work[1], NORM2 ); } else { /* * Compute NORM2 redundantly ( sub( X ) is not distributed ) */ dasqrtb_( &scale, &ssq, NORM2 ); } } return; } /* * End of PDNRM2 */ } scalapack-2.0.2/PBLAS/SRC/pdscal_.c000644 000766 000024 00000021726 10602576752 016774 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdscal_( int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pdscal_( N, ALPHA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCX; double * X; #endif { /* * Purpose * ======= * * PDSCAL multiplies an n element subvector sub( X ) by the scalar * alpha, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input/local output) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the scaled * subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PDSCAL", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PDSCAL", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( Xd[CTXT_], &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Start the operations */ if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { if( ALPHA[REAL_PART] == ONE ) return; /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; if( ALPHA[REAL_PART] == ZERO ) { dset_( &Xnq, ((char *) ALPHA), ((char *)(X+(Xii+Xjj*Xld))), &Xld ); } else { dscal_( &Xnq, ((char *) ALPHA), ((char *)(X+(Xii+Xjj*Xld))), &Xld ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { if( ALPHA[REAL_PART] == ONE ) return; /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { if( ALPHA[REAL_PART] == ZERO ) { dset_( &Xnp, ((char *) ALPHA), ((char *)( X+(Xii+Xjj*Xd[LLD_]) )), INCX ); } else { dscal_( &Xnp, ((char *) ALPHA), ((char *)( X+(Xii+Xjj*Xd[LLD_]) )), INCX ); } } } return; } /* * End of PDSCAL */ } scalapack-2.0.2/PBLAS/SRC/pdswap_.c000644 000766 000024 00000076231 10363532303 017011 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdswap_( int * N, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pdswap_( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; /* * .. Array Arguments .. */ int * DESCX, * DESCY; double * X, * Y; #endif { /* * Purpose * ======= * * PDSWAP swaps two subvectors, * * sub( Y ) := sub( X ) and sub( X ) := sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * swapped. N must be at least zero. * * X (local input/local output) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with sub( Y ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Xscope, Yscope, * one, * top, tran, * zero; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, XisD, XisR, Xinb1D, XinbD, XisRow, Xii, Xj, Xjj, Xld, Xlinc, Xm, XmyprocD, XmyprocR, Xn, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xroc, Xrow, Ycol, Yi, Yii, Yinb1D, YinbD, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, Ym, YmyprocD, YmyprocR, Yn, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yroc, Yrow, cdst, csrc, ctxt, dst, gcdPQ, info, ione=1, k, l, lcmPQ, lcmb, mycol, myrow, npcol, npq, nprow, p, q, rdst, rsrc, src, size; PBTYP_T * type; PB_VM_T VM; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; char * buf = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 501 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PDSWAP", "X", *N, 1, Xi, Xj, Xd, *INCX, 5, &info ); PB_Cchkvec( ctxt, "PDSWAP", "Y", *N, 1, Yi, Yj, Yd, *INCY, 10, &info ); } if( info ) { PB_Cabort( ctxt, "PDSWAP", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_ ]; XnbD = Xd[NB_ ]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_ ]; XnbD = Xd[MB_ ]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_ ]; YnbD = Yd[NB_ ]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_ ]; YnbD = Yd[MB_ ]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is distributed but not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector to * swap sub( X ) with. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cdtypeset(); size = type->size; /* * A purely local operation occurs iff the operands start in the same process * and, if either the grid is mono-dimensional or there is a single local block * to be swapped or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { dswap_( &XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } if( RRorCC && XisR && YisR ) return; } } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to swap the distributed vectors in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( RRorCC && ( XprocR != YprocR ) ) { /* * Both operands are distributed in the same direction, but reside in different * process rows or columns. */ if( XmyprocR == XprocR ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { dst = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); dst = MPosMod( dst, YnprocsD ); if( XisRow ) { Cdgesd2d( ctxt, 1, XnpD, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, YprocR, dst ); Cdgerv2d( ctxt, 1, XnpD, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, YprocR, dst ); } else { Cdgesd2d( ctxt, XnpD, 1, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, dst, YprocR ); Cdgerv2d( ctxt, XnpD, 1, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, dst, YprocR ); } } } if( YmyprocR == YprocR ) { YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( YisRow ) { Cdgesd2d( ctxt, 1, YnpD, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, XprocR, dst ); Cdgerv2d( ctxt, 1, YnpD, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, XprocR, dst ); } else { Cdgesd2d( ctxt, YnpD, 1, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, dst, XprocR ); Cdgerv2d( ctxt, YnpD, 1, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, dst, XprocR ); } } } } else { /* * General case when just one message needs to be exchanged */ if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) send it to the corresponding * process owning s piece of sub ( Y ). */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { dst = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); dst = MPosMod( dst, YnprocsD ); if( YisRow ) { rdst = YprocR; cdst = dst; } else { rdst = dst; cdst = YprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { dswap_( &XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( XisRow ) Cdgesd2d( ctxt, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, rdst, cdst ); else Cdgesd2d( ctxt, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, rdst, cdst ); } } } if( YmyprocR == YprocR ) { /* * The processes owning a piece of sub( Y ) receive the corresponding piece * of sub( X ) and send the piece of sub( Y ) they own to the same process. */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { src = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); src = MPosMod( src, XnprocsD ); if( XisRow ) { rsrc = XprocR; csrc = src; } else { rsrc = src; csrc = XprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( YnpD * size ); if( XisRow ) Cdgerv2d( ctxt, 1, YnpD, buf, 1, rsrc, csrc ); else Cdgerv2d( ctxt, YnpD, 1, buf, YnpD, rsrc, csrc ); if( YisRow ) Cdgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rsrc, csrc ); else Cdgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rsrc, csrc ); dcopy_( &YnpD, buf, &ione, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); if( buf ) free( buf ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding piece * of sub( Y ). */ if( XnpD > 0 ) { if( ( myrow != rdst ) || ( mycol != cdst ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Cdgerv2d( ctxt, 1, XnpD, buf, 1, rdst, cdst ); else Cdgerv2d( ctxt, XnpD, 1, buf, XnpD, rdst, cdst ); dcopy_( &XnpD, buf, &ione, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc ); if( buf ) free( buf ); } } } } } else if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) ) { /* * General case */ tran = ( RRorCC ? CNOTRAN : CTRAN ); if( XisRow ) { Xscope = CCOLUMN; Xm = 1; rsrc = XprocR; } else { Xscope = CROW; Xn = 1; csrc = XprocR; } if( YisRow ) { Yscope = CCOLUMN; Ym = 1; rdst = YprocR; } else { Yscope = CROW; Yn = 1; cdst = YprocR; } lcmb = PB_Clcm( XnprocsD * XnbD, YnprocsD * YnbD ); one = type->one; zero = type->zero; gcdPQ = PB_Cgcd( XnprocsD, YnprocsD ); lcmPQ = ( XnprocsD / gcdPQ ) * YnprocsD; for( k = 0; k < gcdPQ; k++ ) { p = 0; q = k; for( l = 0; l < lcmPQ; l++ ) { Xroc = MModAdd( XprocD, p, XnprocsD ); Yroc = MModAdd( YprocD, q, YnprocsD ); if( ( XmyprocD == Xroc ) || ( YmyprocD == Yroc ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, Xroc, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, Yroc, YprocD, YnprocsD ); PB_CVMinit( &VM, 0, XnpD, YnpD, Xinb1D, Yinb1D, XnbD, YnbD, p, q, XnprocsD, YnprocsD, lcmb ); if( npq = PB_CVMnpq( &VM ) ) { if( ( RRorCC && ( Xroc == Yroc ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( Xroc == YprocR ) && ( XprocR == Yroc ) ) ) { /* * If I am at the intersection of the process cross, or simply common to the * process rows or columns owning sub( X ) and sub( Y ) */ if( ( YmyprocD == Yroc ) && ( YmyprocR == YprocR ) ) { PB_CVMswp( type, &VM, ROW, &Xscope, &tran, npq, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Ylinc ); } } else { /* * Perform the message exchange: pack the data I own, send it, receive the * remote data, and unpack it. */ if( ( XmyprocR == XprocR ) && ( XmyprocD == Xroc ) ) { if( XisRow ) { Xn = npq; } else { Xm = npq; } if( YisRow ) { Yn = npq; cdst = Yroc; } else { Ym = npq; rdst = Yroc; } buf = PB_Cmalloc( npq * size ); PB_CVMpack( type, &VM, ROW, &Xscope, PACKING, NOTRAN, npq, 1, one, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, zero, buf, Xm ); Cdgesd2d( ctxt, Xm, Xn, buf, Xm, rdst, cdst ); Cdgerv2d( ctxt, Ym, Yn, buf, Ym, rdst, cdst ); PB_CVMpack( type, &VM, ROW, &Xscope, UNPACKING, &tran, npq, 1, zero, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, one, buf, Ym ); if( buf ) free ( buf ); } if( ( YmyprocR == YprocR ) && ( YmyprocD == Yroc ) ) { if( XisRow ) { Xn = npq; csrc = Xroc; } else { Xm = npq; rsrc = Xroc; } if( YisRow ) { Yn = npq; } else { Ym = npq; } buf = PB_Cmalloc( npq * size ); PB_CVMpack( type, &VM, COLUMN, &Yscope, PACKING, NOTRAN, npq, 1, one, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, zero, buf, Ym ); Cdgesd2d( ctxt, Ym, Yn, buf, Ym, rsrc, csrc ); Cdgerv2d( ctxt, Xm, Xn, buf, Xm, rsrc, csrc ); PB_CVMpack( type, &VM, COLUMN, &Yscope, UNPACKING, &tran, npq, 1, zero, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, one, buf, Xm ); if( buf ) free ( buf ); } } } } p = MModAdd1( p, XnprocsD ); q = MModAdd1( q, YnprocsD ); } } } if( XisR ) { /* * Replicate sub( X ) when necessary */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Cdgebs2d( ctxt, COLUMN, top, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld ); else Cdgebr2d( ctxt, COLUMN, top, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Cdgebs2d( ctxt, ROW, top, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld ); else Cdgebr2d( ctxt, ROW, top, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, XmyprocD, XprocR ); } } } if( YisR ) { /* * Replicate sub( Y ) when necessary */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocR == YprocR ) Cdgebs2d( ctxt, COLUMN, top, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld ); else Cdgebr2d( ctxt, COLUMN, top, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, YprocR, YmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocR == YprocR ) Cdgebs2d( ctxt, ROW, top, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld ); else Cdgebr2d( ctxt, ROW, top, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, YmyprocD, YprocR ); } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ PB_CpswapND( PB_Cdtypeset(), *N, ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ PB_CpswapND( PB_Cdtypeset(), *N, ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX ); } else { /* * Neither sub( X ) nor sub( Y ) are distributed. */ PB_CpswapNN( PB_Cdtypeset(), *N, ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY ); } /* * End of PDSWAP */ } scalapack-2.0.2/PBLAS/SRC/pdsymm_.c000644 000766 000024 00000053030 10363532303 017014 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdsymm_( F_CHAR_T SIDE, F_CHAR_T UPLO, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pdsymm_( SIDE, UPLO, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T SIDE, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; double * A, * B, * C; #endif { /* * Purpose * ======= * * PDSYMM performs one of the matrix-matrix operations * * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * or * * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha and beta are scalars, sub( A ) is a symmetric submatrix and * sub( B ) and sub( C ) are m by n submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether the symmetric submatrix * sub( A ) appears on the left or right in the operation as * follows: * * SIDE = 'L' or 'l' * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * SIDE = 'R' or 'r' * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with SIDE = 'L' or 'l', this array contains * the local entries corresponding to the entries of the m by m * symmetric submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the symmetric submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * Before entry with SIDE = 'R' or 'r', this array contains * the local entries corresponding to the entries of the n by n * symmetric submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the symmetric submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) DOUBLE PRECISION array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, SideOp, UploA, cbtop, cbtopsave, cctop, cctopsave, rbtop, rbtopsave, rctop, rctopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, lside, mycol, myrow, nb, npcol, nprow, upper; double ABCest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PDSYMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDSYMM", "Illegal UPLO = %c\n", UploA ); info = -2; } if( lside ) { PB_Cchkmat( ctxt, "PDSYMM", "A", *M, 3, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PDSYMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PDSYMM", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PDSYMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PDSYMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PDSYMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cdtypeset(); /* * If alpha is zero, sub( C ) := beta * sub( C ). */ if( ALPHA[REAL_PART] == ZERO ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char * ) C), Ci, Cj, Cd ); } else if( !( BETA[REAL_PART] == ONE ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (N >> M) * BCest : Both operands sub( B ) and sub( C ) are communicated (M >> N) */ if( lside ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ABCest = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp1 = DNROC( *N, Ad[NB_], npcol ); tmp2 = DNROC( *M, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); tmp4 = DNROC( *N, Cd[NB_], npcol ); BCest = (double)(*M) * ( ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.5 * ABCest ) <= BCest ); /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ChooseABC ) { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } } if( lside ) DirAB = ( rbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAB = ( cbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CpsymmAB( type, &DirAB, NOCONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_DEFAULT ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); } } if( lside ) DirAB = ( ( rbtop == CTOP_DRING || rctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); else DirAB = ( ( cbtop == CTOP_DRING || cctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); PB_CpsymmBC( type, &DirAB, NOCONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rbtopsave = *PB_Ctop( &ctxt, BCAST, ROW, &rbtopsave ); rctopsave = *PB_Ctop( &ctxt, COMBINE, ROW, &rctopsave ); cbtopsave = *PB_Ctop( &ctxt, BCAST, COLUMN, &cbtopsave ); cctopsave = *PB_Ctop( &ctxt, COMBINE, COLUMN, &cctopsave ); } /* * End of PDSYMM */ } scalapack-2.0.2/PBLAS/SRC/pdsymv_.c000644 000766 000024 00000055275 10363532303 017042 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdsymv_( F_CHAR_T UPLO, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pdsymv_( UPLO, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PDSYMV performs the matrix-vector operation * * sub( Y ) := alpha*sub( A )*sub( X ) + beta*sub( Y ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha and beta are scalars, sub( X ) and sub( Y ) are n element sub- * vectors and sub( A ) is an n by n symmetric submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries of the upper triangular part of the * symmetric submatrix sub( A ), and the local entries of the * strictly lower triangular of sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries of the lower triangular part of the * symmetric submatrix sub( A ), and the local entries of the * strictly upper triangular of sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten by the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCpbY, YCsum, YCld, YRfr, YRpbY, YRsum, YRld, Ycol, Yi, Yii, YisRow, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; double * tbeta; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XCd[DLEN_], XRd[DLEN_], Xd[DLEN_], YCd[DLEN_], YRd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, __FILE__, "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkmat( ctxt, "PDSYMV", "A", *N, 2, *N, 2, Ai, Aj, Ad, 7, &info ); PB_Cchkvec( ctxt, "PDSYMV", "X", *N, 2, Xi, Xj, Xd, *INCX, 11, &info ); PB_Cchkvec( ctxt, "PDSYMV", "Y", *N, 2, Yi, Yj, Yd, *INCY, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PDSYMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cdtypeset(); /* * When alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; if( BETA[REAL_PART] == ZERO ) { dset_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } else { dscal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { if( BETA[REAL_PART] == ZERO ) { dset_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } else { dscal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Reuse sub( Y ) and/or create vectors YR in process rows and YC in process * columns spanned by sub( A ) */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) { PB_CInOutV( type, ROW, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW, ((char**)(&tbeta)), &YR, YRd, &YRfr, &YRsum, &YRpbY ); PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &YC, YCd, &YCfr, &YCsum ); } else { PB_CInOutV( type, COLUMN, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN, ((char**)(&tbeta)), &YC, YCd, &YCfr, &YCsum, &YCpbY ); PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &YR, YRd, &YRfr, &YRsum ); } /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd, ROW, &XC, XCd, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd, COLUMN, &XR, XRd, &XRfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd[LLD_]; XRld = XRd[LLD_]; YCld = YCd[LLD_]; YRld = YRd[LLD_]; /* * Scale YR or YC in the case sub( Y ) has been reused */ if( YisRow ) { /* * YR resides in (a) process row(s) */ if( !YRpbY ) { if( ( myrow == YRd[RSRC_] ) || ( YRd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YR */ if( Anq > 0 ) { if( tbeta[REAL_PART] == ZERO ) { dset_( &Anq, ((char *) tbeta), YR, &YRld ); } else { dscal_( &Anq, ((char *) tbeta), YR, &YRld ); } } } } } else { /* * YC resides in (a) process column(s) */ if( !YCpbY ) { if( ( mycol == YCd[CSRC_] ) || ( YCd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YC */ if( Amp > 0 ) { if( tbeta[REAL_PART] == ZERO ) { dset_( &Amp, ((char *) tbeta), YC, &ione ); } else { dscal_( &Amp, ((char *) tbeta), YC, &ione ); } } } } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { dgemv_( C2F_CHAR( NOTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, YC, &ione ); dgemv_( C2F_CHAR( TRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XC, &ione, one, Mptr( YR, 0, Akq, YRld, size ), &YRld ); } PB_Cpsym( type, type, LEFT, UPPER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, PB_Ctzsymv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsym( type, type, LEFT, LOWER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, PB_Ctzsymv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { dgemv_( C2F_CHAR( NOTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, Mptr( YC, Akp, 0, YCld, size ), &ione ); dgemv_( C2F_CHAR( TRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XC, Akp, 0, XCld, size ), &ione, one, Mptr( YR, 0, Akq, YRld, size ), &YRld ); } } } } if( XCfr ) free( XC ); if( XRfr ) free( XR ); if( YisRow ) { /* * Combine the partial column results into YC */ if( YCsum ) { YCd[CSRC_] = 0; if( Amp > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, 0 ); } } /* * Combine the partial row results into YR */ if( YRsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], YRd[RSRC_], mycol ); } /* * YR := YR + YC */ PB_Cpaxpby( type, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, YR, 0, 0, YRd, ROW ); /* * sub( Y ) := beta * sub( Y ) + YR (if necessary) */ if( YRpbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW ); } } else { /* * Combine the partial row results into YR */ if( YRsum ) { YRd[RSRC_] = 0; if( Anq > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], 0, mycol ); } } /* * Combine the partial column results into YC */ if( YCsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, YCd[CSRC_] ); } /* * YC := YR + YC */ PB_Cpaxpby( type, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, YC, 0, 0, YCd, COLUMN ); /* * sub( Y ) := beta * sub( Y ) + YC (if necessary) */ if( YCpbY ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN ); } } if( YCfr ) free( YC ); if( YRfr ) free( YR ); /* * End of PDSYMV */ } scalapack-2.0.2/PBLAS/SRC/pdsyr2_.c000644 000766 000024 00000042472 10363532303 016736 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdsyr2_( F_CHAR_T UPLO, int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY, double * A, int * IA, int * JA, int * DESCA ) #else void pdsyr2_( UPLO, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PDSYR2 performs the symmetric rank 2 operation * * sub( A ) := alpha*sub( X )*sub( Y )' + * alpha*sub( Y )*sub( X )' + sub( A ) , * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) and sub( Y ) are n element subvectors and * sub( A ) is an n by n symmetric submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCld, YRfr, YRld, Yi, Yj, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0 [DLEN_], XCd0[DLEN_], XRd0[DLEN_], Xd[DLEN_], YCd0[DLEN_], YRd0[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDSYR2", "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkvec( ctxt, "PDSYR2", "X", *N, 2, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PDSYR2", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PDSYR2", "A", *N, 2, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PDSYR2", info ); return; } #endif /* * Quick return if possible */ if( (*N == 0) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cdtypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd0, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd0, ROW, &XC, XCd0, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd0, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd0, COLUMN, &XR, XRd0, &XRfr ); } /* * Replicate sub( Y ) in process rows (YR) and process columns (YC) spanned by * sub( A ) */ if( *INCY == Yd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ROW, &YR, YRd0, &YRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, YR, 0, 0, YRd0, ROW, &YC, YCd0, &YCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, COLUMN, &YC, YCd0, &YCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, YC, 0, 0, YCd0, COLUMN, &YR, YRd0, &YRfr ); } /* * Local rank-2 update if I own some data */ Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd0[LLD_]; YCld = YCd0[LLD_]; XRld = XRd0[LLD_]; YRld = YRd0[LLD_]; /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { dger_( &Akp, &Anq0, ((char *) ALPHA), XC, &ione, Mptr( YR, 0, Akq, YRld, size ), &YRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); dger_( &Akp, &Anq0, ((char *) ALPHA), YC, &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); } PB_Cpsyr2( type, UPPER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, Aptr, k, k, Ad0, PB_Ctzsyr2 ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsyr2( type, LOWER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, Aptr, k, k, Ad0, PB_Ctzsyr2 ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { dger_( &Amp0, &Anq0, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), &ione, Mptr( YR, 0, Akq, YRld, size ), &YRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); dger_( &Amp0, &Anq0, ((char *) ALPHA), Mptr( YC, Akp, 0, YCld, size ), &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } } } if( XRfr ) free( XR ); if( XCfr ) free( XC ); if( YRfr ) free( YR ); if( YCfr ) free( YC ); /* * End of PDSYR2 */ } scalapack-2.0.2/PBLAS/SRC/pdsyr2k_.c000644 000766 000024 00000052420 10363532303 017103 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdsyr2k_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pdsyr2k_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; double * A, * B, * C; #endif { /* * Purpose * ======= * * PDSYR2K performs one of the symmetric rank 2k operations * * sub( C ) := alpha*sub( A )*sub( B )' + alpha*sub( B )*sub( A )' + * beta*sub( C ), * * or * * sub( C ) := alpha*sub( A )'*sub( B ) + alpha*sub( B )'*sub( A ) + * beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise, and, * * sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N', * B(IB:IB+K-1,JB:JB+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * submatrix and sub( A ) and sub( B ) are n by k submatrices in the * first case and k by n submatrices in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the symmetric submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( B )' + * alpha*sub( B )*sub( A )' + * beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrices sub( A ) and sub( B ), and on * entry with TRANS = 'T' or 't' or 'C' or 'c', K specifies the * number of rows of the submatrices sub( A ) and sub( B ). * K must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and B corresponding to the entries of the submatrices * sub( A ) and sub( B ) respectively need not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) DOUBLE PRECISION array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JB+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( B ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( B ). * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double ABCest, ABest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDSYR2K", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDSYR2K", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) { PB_Cchkmat( ctxt, "PDSYR2K", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PDSYR2K", "B", *N, 3, *K, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PDSYR2K", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PDSYR2K", "B", *K, 4, *N, 3, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PDSYR2K", "C", *N, 3, *N, 3, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PDSYR2K", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cdtypeset(); /* * And when alpha or K is zero */ if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (K >> N) * ABest : only sub( A ) and sub( B ) are communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[NB_], npcol ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[CSRC_] == -1 ) && ( Bd[CSRC_] == -1 ) ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + TWO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *K, Ad[MB_], nprow ); tmp4 = DNROC( *K, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[RSRC_] == -1 ) && ( Bd[RSRC_] == -1 ) ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + TWO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.4 * ABCest ) <= ABest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseABC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PDSYR2K */ } scalapack-2.0.2/PBLAS/SRC/pdsyr_.c000644 000766 000024 00000033741 10363532303 016653 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdsyr_( F_CHAR_T UPLO, int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * A, int * IA, int * JA, int * DESCA ) #else void pdsyr_( UPLO, N, ALPHA, X, IX, JX, DESCX, INCX, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX; double * A, * X; #endif { /* * Purpose * ======= * * PDSYR performs the symmetric rank 1 operation * * sub( A ) := alpha*sub( X )*sub( X )' + sub( A ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Alpha is a scalar, sub( X ) is an n element subvector and sub( A ) is * an n by n symmetric submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X * corresponding to the entries of the subvector sub( X ) need * not be set on input. * * X (local input) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XCd0[DLEN_], XRd0[DLEN_], Xd[DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDSYR", "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkvec( ctxt, "PDSYR", "X", *N, 2, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkmat( ctxt, "PDSYR", "A", *N, 2, *N, 2, Ai, Aj, Ad, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PDSYR", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cdtypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd0, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd0, ROW, &XC, XCd0, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd0, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd0, COLUMN, &XR, XRd0, &XRfr ); } /* * Local rank-1 update if I own some data */ Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); XCld = XCd0[LLD_]; XRld = XRd0[LLD_]; if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) dger_( &Akp, &Anq0, ((char *) ALPHA), XC, &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); PB_Cpsyr( type, UPPER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Aptr, k, k, Ad0, PB_Ctzsyr ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsyr( type, LOWER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Aptr, k, k, Ad0, PB_Ctzsyr ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) dger_( &Amp0, &Anq0, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } } if( XRfr ) free( XR ); if( XCfr ) free( XC ); /* * End of PDSYR */ } scalapack-2.0.2/PBLAS/SRC/pdsyrk_.c000644 000766 000024 00000045077 10363532303 017033 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdsyrk_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pdsyrk_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * K, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PDSYRK performs one of the symmetric rank k operations * * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * * or * * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and, * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * submatrix and sub( A ) is an n by k submatrix in the first case and a * k by n submatrix in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the symmetric submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ). * * TRANS = 'C' or 'c' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry, with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrix sub( A ), and with TRANS = 'T' or * 't' or 'C' or 'c', K specifies the number of rows of the sub- * matrix sub( A ). K must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, ChooseAC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double Aest, ACest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDSYRK", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDSYRK", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PDSYRK", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PDSYRK", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PDSYRK", "C", *N, 3, *N, 3, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PDSYRK", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cdtypeset(); /* * And when alpha or K is zero */ if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ACest: both operands sub( A ) and sub( C ) are communicated (K >> N) * Aest : only sub( A ) is communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Ad[MB_], nprow ); Aest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); Aest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseAC = ( ( 1.3 * ACest ) <= Aest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseAC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PDSYRK */ } scalapack-2.0.2/PBLAS/SRC/pdtradd_.c000644 000766 000024 00000033420 10363532303 017126 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdtradd_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pdtradd_( UPLO, TRANS, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PDTRADD adds a trapezoidal matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if TRANS = 'N', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'T', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'C', * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * upper or lower trapezoidal submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of the * array C containing the upper or lower triangular part of the * triangular submatrix sub( C ) is to be referenced as follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * triangular submatrix sub( C ) is to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * triangular submatrix sub( C ) is to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the form of op( sub( A ) ) to be * used in the matrix addition as follows: * * TRANS = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANS = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANS = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ) when TRANS = 'N' or 'n' and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u' and TRANS = 'N' or 'n' or * UPLO = 'L' or 'l' and TRANS = 'T', 'C', 't' or 'c', this ar- * ray contains the local entries corresponding to the entries * of the upper triangular submatrix sub( A ), and the local en- * tries corresponding to the entries of the strictly lower tri- * angular part of the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l' and TRANS = 'N' or 'n' or * UPLO = 'U' or 'u' and TRANS = 'T', 'C', 't' or 'c', this ar- * ray contains the local entries corresponding to the entries * of the lower triangular submatrix sub( A ), and the local en- * tries corresponding to the entries of the strictly upper tri- * angular part of the submatrix sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the triangular submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the triangular submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAC, TranOp, UploC, ctop, rtop; int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, notran, npcol, nprow, upper; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRADD", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRADD", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PDTRADD", "A", *M, 3, *N, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PDTRADD", "A", *N, 4, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PDTRADD", "C", *M, 3, *N, 4, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PDTRADD", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( PB_Cdtypeset(), &UploC, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cdtypeset(), &UploC, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ /* * This operation mainly involves point-to-point send and receive communication. * There is therefore no particular BLACS topology to recommend. Still, one can * choose the main loop direction in which the operands will be added, but not * transposed. This selection is based on the current setting for the BLACS * broadcast operations. */ rtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( *M <= *N ) DirAC = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_Cptradd( PB_Cdtypeset(), &DirAC, &UploC, ( notran ? NOTRAN : TRAN ), *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PDTRADD */ } scalapack-2.0.2/PBLAS/SRC/pdtran_.c000644 000766 000024 00000022507 10363532303 017000 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdtran_( int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pdtran_( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ int * IA, * IC, * JA, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PDTRAN transposes a matrix * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+M-1), and, op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+N-1,JA:JA+M-1)'. * * Beta is a scalar, sub( C ) is an m by n submatrix, and sub( A ) is an * n by m submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) DOUBLE PRECISION array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkmat( ctxt, "PDTRAN", "A", *N, 2, *M, 1, Ai, Aj, Ad, 7, &info ); PB_Cchkmat( ctxt, "PDTRAN", "C", *M, 1, *N, 2, Ci, Cj, Cd, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PDTRAN", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( PB_Cdtypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cdtypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ PB_Cptran( PB_Cdtypeset(), NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PDTRAN */ } scalapack-2.0.2/PBLAS/SRC/pdtrmm_.c000644 000766 000024 00000052421 10363532303 017011 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdtrmm_( F_CHAR_T SIDE, F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB ) #else void pdtrmm_( SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, SIDE, TRANS, UPLO; int * IA, * IB, * JA, * JB, * M, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCB; double * A, * B; #endif { /* * Purpose * ======= * * PDTRMM performs one of the matrix-matrix operations * * sub( B ) := alpha * op( sub( A ) ) * sub( B ), * * or * * sub( B ) := alpha * sub( B ) * op( sub( A ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, sub( B ) is an m by n submatrix, sub( A ) is a * unit, or non-unit, upper or lower triangular submatrix and op( X ) is * one of * * op( X ) = X or op( X ) = X'. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether op( sub( A ) ) multiplies * sub( B ) from the left or right as follows: * * SIDE = 'L' or 'l' sub( B ) := alpha*op( sub( A ) )*sub( B ), * * SIDE = 'R' or 'r' sub( B ) := alpha*sub( B )*op( sub( A ) ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) DOUBLE PRECISION array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n transformed submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, DirB, OpC, OpR, SideOp, TopC, TopR, TranOp, UploA, Var, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ForceTop, ctxt, info, lside, mycol, myrow, nb, notran, nounit, npcol, nprow, upper; double ABestL, ABestR, Best, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1101 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRMM", "Illegal UPLO = %c\n", UploA ); info = -2; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRMM", "Illegal TRANS = %c\n", TranOp ); info = -3; } if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRMM", "Illegal DIAG = %c\n", DiagA ); info = -4; } if( lside ) PB_Cchkmat( ctxt, "PDTRMM", "A", *M, 5, *M, 5, Ai, Aj, Ad, 11, &info ); else PB_Cchkmat( ctxt, "PDTRMM", "A", *N, 6, *N, 6, Ai, Aj, Ad, 11, &info ); PB_Cchkmat( ctxt, "PDTRMM", "B", *M, 5, *N, 6, Bi, Bj, Bd, 15, &info ); } if( info ) { PB_Cabort( ctxt, "PDTRMM", info ); return; } #endif /* * Quick return if possible */ if( *M == 0 || *N == 0 ) return; /* * Get type structure */ type = PB_Cdtypeset(); /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) B), Bi, Bj, Bd ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABestR, ABestL : both operands sub( A ) and sub( B ) are communicated * ( N >> M when SIDE is left and M >> N otherwise ) * Best : only sub( B ) is communicated * ( M >> N when SIDE is left and N >> M otherwise ) */ if( lside ) { if( notran ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); ChooseAB = ( ( 1.1 * ABestR ) <= Best ); } else { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestL = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + MAX( tmp2, tmp1 ) / TWO ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); ChooseAB = ( ( ( 1.1 * ABestL ) <= Best ) || ( ( 1.1 * ABestR ) <= Best ) ); } } else { if( notran ) { tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); ChooseAB = ( ( 1.1 * ABestR ) <= Best ); } else { tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestL = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + CBRATIO * ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + MAX( tmp2, tmp1 ) / TWO ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); ChooseAB = ( ( ( 1.1 * ABestL ) <= Best ) || ( ( 1.1 * ABestR ) <= Best ) ); } } /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); if( ChooseAB ) { if( lside ) { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { if( ABestL <= ABestR ) { OpR = CBCAST; OpC = CCOMBINE; Var = CLEFT; if( upper ) { TopR = CTOP_DRING; TopC = CTOP_IRING; } else { TopR = CTOP_IRING; TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } } } else { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { if( ABestL <= ABestR ) { OpR = CCOMBINE; OpC = CBCAST; Var = CLEFT; if( upper ) { TopR = CTOP_DRING; TopC = CTOP_IRING; } else { TopR = CTOP_IRING; TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } PB_CptrmmAB( type, &Var, &SideOp, &UploA, ( notran ? NOTRAN : TRAN ), &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); } else { if( ( lside && notran ) || ( !( lside ) && !( notran ) ) ) { OpR = CCOMBINE; OpC = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } } else { OpR = CBCAST; OpC = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } } if( lside ) DirB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirB = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CptrmmB( type, &DirB, &SideOp, &UploA, ( notran ? NOTRAN : TRAN ), &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PDTRMM */ } scalapack-2.0.2/PBLAS/SRC/pdtrmv_.c000644 000766 000024 00000047525 10363532303 017033 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdtrmv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pdtrmv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; /* * .. Array Arguments .. */ int * DESCA, * DESCX; double * A, * X; #endif { /* * Purpose * ======= * * PDTRMV performs one of the matrix-vector operations * * sub( X ) := sub( A )*sub( X ) or sub( X ) := sub( A )'*sub( X ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * sub( X ) is an n element subvector and sub( A ) is an n by n unit, * or non-unit, upper or lower triangular submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( X ) := sub( A ) * sub( X ). * * TRANS = 'T' or 't' sub( X ) := sub( A )' * sub( X ). * * TRANS = 'C' or 'c' sub( X ) := sub( A )' * sub( X ). * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the transfor- * med subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Xroc, * one, * tbeta, top, *zero; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XAfr, XAld, Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, YAfr, YAld, YApbY, YAsum, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_]; char * Aptr = NULL, * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRMV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRMV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRMV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PDTRMV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info ); PB_Cchkvec( ctxt, "PDTRMV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PDTRMV", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cdtypeset(); size = type->size; one = type->one; zero = type->zero; /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN ); if( notran ) { /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XA, XAd, &XAfr ); /* * Reuse sub( X ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( type, COLUMN, *N, *N, Ad0, 1, one, ((char *) X), Xi, Xj, Xd, &Xroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * If sub( X ) is distributed in (a) process column(s), then zero it. */ if( Xroc == CCOLUMN ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * sub( X ) resides in (a) process columns(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { dset_( &Xnp, zero, Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], size ), &ione ); } } } } else { /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XA, XAd, &XAfr ); /* * Reuse sub( X ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( type, ROW, *N, *N, Ad0, 1, one, ((char *) X), Xi, Xj, Xd, &Xroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * If sub( X ) is distributed in (a) process row(s), then zero it. */ if( Xroc == CROW ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; dset_( &Xnq, zero, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xld ); } } } } /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XAld = XAd[LLD_]; YAld = YAd[LLD_]; /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { dgemv_( TRANS, &Akp, &Anq0, one, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, YA, &ione ); } PB_Cptrm( type, type, LEFT, UPPER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, size ), YAld, PB_Ctztrmv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { dgemv_( TRANS, &Akp, &Anq0, one, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XA, &ione, one, Mptr( YA, 0, Akq, YAld, size ), &YAld ); } PB_Cptrm( type, type, LEFT, UPPER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, size ), YAld, PB_Ctztrmv ); } } } else { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, type, LEFT, LOWER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, size ), YAld, PB_Ctztrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { dgemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, Mptr( YA, Akp, 0, YAld, size ), &ione ); } } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, type, LEFT, LOWER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, size ), YAld, PB_Ctztrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { dgemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, Akp, 0, XAld, size ), &ione, one, Mptr( YA, 0, Akq, YAld, size ), &YAld ); } } } } } if( XAfr ) free( XA ); if( notran ) { /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } /* * sub( X ) := YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, YA, 0, 0, YAd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } } else { /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } /* * sub( X ) := YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } } if( YAfr ) free( YA ); /* * End of PDTRMV */ } scalapack-2.0.2/PBLAS/SRC/pdtrsm_.c000644 000766 000024 00000052365 11622500733 017027 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdtrsm_( F_CHAR_T SIDE, F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB ) #else void pdtrsm_( SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, SIDE, TRANS, UPLO; int * IA, * IB, * JA, * JB, * M, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCB; double * A, * B; #endif { /* * Purpose * ======= * * PDTRSM solves one of the matrix equations * * op( sub( A ) )*X = alpha*sub( B ), or * * X*op( sub( A ) ) = alpha*sub( B ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is * a unit, or non-unit, upper or lower triangular submatrix and op( Y ) * is one of * * op( Y ) = Y or op( Y ) = Y'. * * The submatrix X is overwritten on sub( B ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether op( sub( A ) ) appears on * the left or right of X as follows: * * SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ), * * SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) DOUBLE PRECISION array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n solution submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, DirB, OpC, OpR, SideOp, TopC, TopR, TranOp, UploA, Var, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ForceTop, ctxt, info, itmp, lside, mycol, myrow, nb, notran, nounit, npcol, nprow, upper; double ABestL, ABestR, Best, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1101 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRSM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRSM", "Illegal UPLO = %c\n", UploA ); info = -2; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRSM", "Illegal TRANS = %c\n", TranOp ); info = -3; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRSM", "Illegal DIAG = %c\n", DiagA ); info = -4; } if( lside ) PB_Cchkmat( ctxt, "PDTRSM", "A", *M, 5, *M, 5, Ai, Aj, Ad, 11, &info ); else PB_Cchkmat( ctxt, "PDTRSM", "A", *N, 6, *N, 6, Ai, Aj, Ad, 11, &info ); PB_Cchkmat( ctxt, "PDTRSM", "B", *M, 5, *N, 6, Bi, Bj, Bd, 15, &info ); } if( info ) { PB_Cabort( ctxt, "PDTRSM", info ); return; } #endif /* * Quick return if possible */ if( *M == 0 || *N == 0 ) return; /* * Get type structure */ type = PB_Cdtypeset(); /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) B), Bi, Bj, Bd ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); /* * ABestR, ABestL : both operands sub( A ) and sub( B ) are communicated * ( N >> M when SIDE is left and M >> N otherwise ) * Best : only sub( B ) is communicated * ( M >> N when SIDE is left and N >> M otherwise ) */ if( lside ) { if( notran ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); itmp = MIN( Ad[MB_], Ad[NB_] ); Best = (double)(*N) * ( (double)(CEIL( *M, itmp )) * (double)(itmp) * ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ); ChooseAB = ( ABestR <= ( 2.0 * Best ) ); } else { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestL = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + MAX( tmp2, tmp1 ) / TWO ); itmp = MIN( Ad[MB_], Ad[NB_] ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( (double)(CEIL( *M, itmp )) * (double)(itmp) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ) + MAX( tmp2, tmp3 ) ); ChooseAB = ( ( ABestL <= ( 2.0 * Best ) ) || ( ABestR <= ( 2.0 * Best ) ) ); } } else { if( notran ) { tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); itmp = MIN( Ad[MB_], Ad[NB_] ); Best = (double)(*M) * ( (double)(CEIL( *N, itmp )) * (double)(itmp) * ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ); ChooseAB = ( ABestR <= ( 2.0 * Best ) ); } else { tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestL = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + CBRATIO * ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + MAX( tmp2, tmp1 ) / TWO ); itmp = MIN( Ad[MB_], Ad[NB_] ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( (double)(CEIL( *N, itmp )) * (double)(itmp) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ) + MAX( tmp1, tmp4 ) ); ChooseAB = ( ( ABestL <= ( 2.0 * Best ) ) || ( ABestR <= ( 2.0 * Best ) ) ); } } /* * Var can remain uninitialized but is nevertheless used in PB_CptrsmAB.c * provide a default here. TODO: does this make sense ? *==19891== at 0x44F81B: PB_CptrsmAB (PB_CptrsmAB.c:538) *==19891== by 0x427BE7: pdtrsm_ (pdtrsm_.c:488) *==19891== by 0x405E46: MAIN_ (pdblas3tim.f:727) */ Var = CRIGHT; if( ChooseAB ) { /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ ForceTop = ( ( *M > nb ) && ( *N > nb ) ); if( ForceTop ) { if( lside ) { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { if( ABestL <= ABestR ) { OpR = CBCAST; OpC = CCOMBINE; Var = CLEFT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } } } else { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { if( ABestL <= ABestR ) { OpR = CCOMBINE; OpC = CBCAST; Var = CLEFT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } PB_CptrsmAB( type, &Var, &SideOp, &UploA, ( notran ? NOTRAN : TRAN ), &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } } else { /* * BLACS topologies are always enforced. */ if( ( lside && notran ) || ( !lside && !notran ) ) { OpR = CCOMBINE; OpC = CBCAST; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } /* * Remove the next line when the BLACS combine operations support ring * topologies */ TopR = CTOP_DEFAULT; } else { OpR = CBCAST; OpC = CCOMBINE; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } /* * Remove the next line when the BLACS combine operations support ring * topologies */ TopC = CTOP_DEFAULT; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); if( lside ) DirB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirB = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CptrsmB( type, &DirB, &SideOp, &UploA, ( notran ? NOTRAN : TRAN ), &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); /* * Restore the BLACS topologies. */ rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PDTRSM */ } scalapack-2.0.2/PBLAS/SRC/pdtrsv_.c000644 000766 000024 00000076076 10363532303 017044 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdtrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pdtrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; /* * .. Array Arguments .. */ int * DESCA, * DESCX; double * A, * X; #endif { /* * Purpose * ======= * * PDTRSV solves one of the systems of equations * * sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b, * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * b and sub( X ) are n element subvectors and sub( A ) is an n by n * unit, or non-unit, upper or lower triangular submatrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( A ) * sub( X ) = b. * * TRANS = 'T' or 't' sub( A )' * sub( X ) = b. * * TRANS = 'C' or 'c' sub( A )' * sub( X ) = b. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) DOUBLE PRECISION array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On entry, sub( X ) is the n element right-hand side * b. On exit, sub( X ) is overwritten with the solution subvec- * tor. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one, * zero; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb, Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld, XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info, ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_]; char * Aptr = NULL, * XAC = NULL, * XAR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRSV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRSV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PDTRSV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PDTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info ); PB_Cchkvec( ctxt, "PDTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PDTRSV", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cdtypeset(); size = type->size; one = type->one; zero = type->zero; negone = type->negone; /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN ); if( notran ) { if( upper ) { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING ); (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process column owning the last * column of sub( A ) */ PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, &XACfr, &XACsum, &XACapbX ); /* * Create vector XAR in process rows spanned by sub( A ) */ PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, &XARsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAC contains the solution scattered in multiple * process columns and XAR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ if( Akp > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( XACsum ) { kbprev = MIN( k, nb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow, Arow, nprow ); Akp -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) dgemv_( TRANS, &ktmp, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol ); Cdgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 0, XACld, size ), XACld, myrow, Asrc ); if( mycol != Asrc ) dset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } if( Akp > 0 && Anq0 > 0 ) dgemv_( TRANS, &Akp, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, XAC, &ione ); } else { if( Anq0 > 0 ) dgemv_( TRANS, &Akp, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, XAC, &ione ); } } } /* * Combine the scattered resulting vector XAC */ if( XACsum && ( Anp > 0 ) ) { Cdgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, XACd[CSRC_] ); } /* * sub( X ) := XAC (if necessary) */ if( XACapbX ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); } else { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process column owning the first * column of sub( A ) */ PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, &XACfr, &XACsum, &XACapbX ); /* * Create vector XAR in process rows spanned by sub( A ) */ PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, &XARsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = 0; k < *N; k += nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAC contains the solution scattered in multiple * process columns and XAR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow ); if( ( Anp0 = Anp - Akp ) > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( XACsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow, nprow ); Anp0 -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) dgemv_( TRANS, &ktmp, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol ); Cdgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 0, XACld, size ), XACld, myrow, Asrc ); if( mycol != Asrc ) dset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } if( Anp0 > 0 && Anq0 > 0 ) dgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione ); } else { if( Anq0 > 0 ) dgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } } } /* * Combine the scattered resulting vector XAC */ if( XACsum && ( Anp > 0 ) ) { Cdgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, XACd[CSRC_] ); } /* * sub( X ) := XAC (if necessary) */ if( XACapbX ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); } } else { if( upper ) { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAR in process row owning the first row * of sub( A ) */ PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, &XARfr, &XARsum, &XARapbX ); /* * Create vector XAC in process columns spanned by sub( A ) */ PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, &XACsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = 0; k < *N; k += nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAR contains the solution scattered in multiple * process rows and XAC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anq0 = Anq - Akq ) > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( XARsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol, npcol ); Anq0 -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) dgemv_( TRANS, &Anp0, &ktmp, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow ); Cdgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, Akq, XARld, size ), XARld, Asrc, mycol ); if( myrow != Asrc ) dset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } if( Anp0 > 0 && Anq0 > 0 ) dgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld ); } else { if( Anp0 > 0 ) dgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } } } /* * Combine the scattered resulting vector XAR */ if( XARsum && ( Anq > 0 ) ) { Cdgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], mycol ); } /* * sub( X ) := XAR (if necessary) */ if( XARapbX ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); } else { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process row owning the last row * of sub( A ) */ PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, &XARfr, &XARsum, &XARapbX ); /* * Create vector XAC in process columns spanned by sub( A ) */ PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, &XACsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAR contains the solution scattered in multiple * process rows and XAC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later * is only locally updated. */ if( Akq > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( XARsum ) { kbprev = MIN( k, nb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol, Acol, npcol ); Akq -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) dgemv_( TRANS, &Anp0, &ktmp, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow ); Cdgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, Akq, XARld, size ), XARld, Asrc, mycol ); if( myrow != Asrc ) dset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } if( Anp0 > 0 && Akq > 0 ) dgemv_( TRANS, &Anp0, &Akq, negone, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, XAR, &XARld ); } else { if( Anp0 > 0 ) dgemv_( TRANS, &Anp0, &Akq, negone, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, XAR, &XARld ); } } } /* * Combine the scattered resulting vector XAR */ if( XARsum && ( Anq > 0 ) ) { Cdgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], mycol ); } /* * sub( X ) := XAR (if necessary) */ if( XARapbX ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); } } if( XACfr ) free( XAC ); if( XARfr ) free( XAR ); /* * End of PDTRSV */ } scalapack-2.0.2/PBLAS/SRC/pdzasum_.c000644 000766 000024 00000025347 10363532303 017200 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdzasum_( int * N, double * ASUM, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pdzasum_( N, ASUM, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; double * ASUM; /* * .. Array Arguments .. */ int * DESCX; double * X; #endif { /* * Purpose * ======= * * PDZASUM returns the sum of absolute values of the entries of a sub- * vector sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ASUM (local output) DOUBLE PRECISION * On exit, ASUM specifies the sum of absolute values of the * subvector sub( X ) only in its scope (See below for further * details). * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * Xptr = NULL, top; int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PDZASUM", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PDZASUM", info ); return; } #endif /* * Initialize ASUM */ *ASUM = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *N == 1 ) && ( *INCX == 1 ) && ( Xd[M_] == 1 ) ) { /* * Make sure I own some data and compute ASUM */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { type = PB_Cztypeset(); Xptr = Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ); *ASUM = ABS( ((double *) Xptr)[REAL_PART] ) + ABS( ((double *) Xptr)[IMAG_PART] ); } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and compute the local sum */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cztypeset(); dzvasum_( &Xnq, ((char *) ASUM), Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } /* * If Xnq <= 0, ASUM is zero (see initialization above) */ if( ( npcol > 1 ) && ( Xcol >= 0 ) ) { /* * Combine the local results if npcol > 1 and Xcol >= 0, i.e sub( X ) is * distributed. */ top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, 1, 1, ((char *)ASUM), 1, -1, mycol ); } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and compute the local sum */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { type = PB_Cztypeset(); dzvasum_( &Xnp, ((char *) ASUM), Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } /* * If Xnp <= 0, ASUM is zero (see initialization above) */ if( ( nprow > 1 ) && ( Xrow >= 0 ) ) { /* * Combine the local results if nprow > 1 and Xrow >= 0, i.e sub( X ) is * distributed. */ top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, 1, ((char *)ASUM), 1, -1, mycol ); } } return; } /* * End of PDZASUM */ } scalapack-2.0.2/PBLAS/SRC/pdznrm2_.c000644 000766 000024 00000043337 10363532303 017110 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pdznrm2_( int * N, double * NORM2, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pdznrm2_( N, NORM2, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; double * NORM2; /* * .. Array Arguments .. */ int * DESCX; double * X; #endif { /* * Purpose * ======= * * PDZNRM2 computes the 2-norm of a subvector sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * NORM2 (local output) DOUBLE PRECISION * On exit, NORM2 specifies the 2-norm of the subvector sub( X ) * only in its scope (See below for further details). * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * Xptr = NULL, top; int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, dst, dist, info, k, mycol, mydist, myrow, npcol, nprow, src, size; double Xtmp, scale, ssq, temp1, temp2; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_]; double work[4]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PDZNRM2", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PDZNRM2", info ); return; } #endif /* * Initialize NORM2 */ *NORM2 = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *N == 1 ) && ( *INCX == 1 ) && ( Xd[M_] == 1 ) ) { /* * Make sure I own some data and compute NORM2 */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { scale = ZERO; ssq = ONE; type = PB_Cztypeset(); Xptr = Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ); Xtmp = ((double *) Xptr)[REAL_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xtmp = ((double *) Xptr)[IMAG_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } /* * Compute NORM2 = SCALE * SQRT( SSQ ) */ dasqrtb_( &scale, &ssq, NORM2 ); } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Initialize SCALE and SSQ */ scale = ZERO; ssq = ONE; /* * Make sure I own some data and compute local sum of squares */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cztypeset(); size = type->size; Xptr = Mptr( ((char *) X), Xii, Xjj, Xld, size ); for( k = 0; k < Xnq; k++ ) { Xtmp = ((double *) Xptr)[REAL_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xtmp = ((double *) Xptr)[IMAG_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xptr += Xld * size; } } /* * If Xnq <= 0, SCALE is zero and SSQ is one (see initialization above) */ if( ( npcol >= 2 ) && ( Xcol >= 0 ) ) { /* * Combine the local sum of squares using a 1-tree topology within process row * 0 if npcol > 1 and Xcol >= 0, i.e sub( X ) is distributed. */ work[0] = scale; work[1] = ssq; mydist = mycol; k = 1; l_10: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, npcol ); Cdgesd2d( ctxt, 2, 1, ((char*) work), 2, myrow, dst ); goto l_20; } else { dist = mycol + k; src = MPosMod( dist, npcol ); if( mycol < src ) { Cdgerv2d( ctxt, 2, 1, ((char*)&work[2]), 2, myrow, src ); if( work[0] >= work[2] ) { if( work[0] != ZERO ) { temp1 = work[2] / work[0]; work[1] = work[1] + ( temp1 * temp1 ) * work[3]; } } else { temp1 = work[0] / work[2]; work[1] = work[3] + ( temp1 * temp1 ) * work[1]; work[0] = work[2]; } } mydist >>= 1; } k <<= 1; if( k < npcol ) goto l_10; l_20: /* * Process column 0 broadcasts the combined values of SCALE and SSQ within their * process row. */ top = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == 0 ) { Cdgebs2d( ctxt, ROW, &top, 2, 1, ((char*)work), 2 ); } else { Cdgebr2d( ctxt, ROW, &top, 2, 1, ((char*)work), 2, myrow, 0 ); } /* * Compute NORM2 redundantly NORM2 = WORK( 1 ) * SQRT( WORK( 2 ) ) */ dasqrtb_( &work[0], &work[1], NORM2 ); } else { /* * Compute NORM2 redundantly ( sub( X ) is not distributed ) */ dasqrtb_( &scale, &ssq, NORM2 ); } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Initialize SCALE and SSQ */ scale = ZERO; ssq = ONE; /* * Make sure I own some data and compute local sum of squares */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { type = PB_Cztypeset(); size = type->size; Xptr = Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], size ); for( k = 0; k < Xnp; k++ ) { Xtmp = ((double *) Xptr)[REAL_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xtmp = ((double *) Xptr)[IMAG_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xptr += size; } } /* * If Xnp <= 0, SCALE is zero and SSQ is one (see initialization above) */ if( ( nprow >= 2 ) && ( Xrow >= 0 ) ) { /* * Combine the local sum of squares using a 1-tree topology within process * column 0 if nprow > 1 and Xrow >= 0, i.e sub( X ) is distributed. */ work[0] = scale; work[1] = ssq; mydist = myrow; k = 1; l_30: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, nprow ); Cdgesd2d( ctxt, 2, 1, ((char*)work), 2, dst, mycol ); goto l_40; } else { dist = myrow + k; src = MPosMod( dist, nprow ); if( myrow < src ) { Cdgerv2d( ctxt, 2, 1, ((char*)&work[2]), 2, src, mycol ); if( work[0] >= work[2] ) { if( work[0] != ZERO ) { temp1 = work[2] / work[0]; work[1] = work[1] + ( temp1 * temp1 ) * work[3]; } } else { temp1 = work[0] / work[2]; work[1] = work[3] + ( temp1 * temp1 ) * work[1]; work[0] = work[2]; } } mydist >>= 1; } k <<= 1; if( k < nprow ) goto l_30; l_40: /* * Process column 0 broadcasts the combined values of SCALE and SSQ within their * process column */ top = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == 0 ) { Cdgebs2d( ctxt, COLUMN, &top, 2, 1, ((char*)work), 2 ); } else { Cdgebr2d( ctxt, COLUMN, &top, 2, 1, ((char*)work), 2, 0, mycol ); } /* * Compute NORM2 redundantly NORM2 = WORK[0] * SQRT( WORK[1] ) */ dasqrtb_( &work[0], &work[1], NORM2 ); } else { /* * Compute NORM2 redundantly ( sub( X ) is not distributed ) */ dasqrtb_( &scale, &ssq, NORM2 ); } } return; } /* * End of PDZNRM2 */ } scalapack-2.0.2/PBLAS/SRC/picopy_.c000644 000766 000024 00000021601 10363532303 017005 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void picopy_( int * N, int * X, int * IX, int * JX, int * DESCX, int * INCX, int * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void picopy_( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; /* * .. Array Arguments .. */ int * DESCX, * DESCY; int * X, * Y; #endif { /* * Purpose * ======= * * PICOPY copies one subvector into another, * * sub( Y ) := sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * copied. N must be at least zero. * * X (local input) INTEGER array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local output) INTEGER array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xi, Xj, Yi, Yj, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 501 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PICOPY", "X", *N, 1, Xi, Xj, Xd, *INCX, 5, &info ); PB_Cchkvec( ctxt, "PICOPY", "Y", *N, 1, Yi, Yj, Yd, *INCY, 10, &info ); } if( info ) { PB_Cabort( ctxt, "PICOPY", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Get type structure */ type = PB_Citypeset(); /* * Start the operations */ if( *INCX == Xd[M_] ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) X), Xi, Xj, Xd, ROW, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) X), Xi, Xj, Xd, COLUMN, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } /* * End of PICOPY */ } scalapack-2.0.2/PBLAS/SRC/pilaenv.f000644 000766 000024 00000005671 10363532303 017015 0ustar00juliestaff000000 000000 INTEGER FUNCTION PILAENV( ICTXT, PREC ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT CHARACTER*1 PREC * .. * * Purpose * ======= * * PILAENV returns the positive integer value of the logical blocking * size. This value is machine and precision specific. This version pro- * vides a logical blocking size which should give good but not optimal * performance on many of the currently available distributed-memory * concurrent computers. Users are encouraged to modify this subroutine * to set this tuning parameter for their particular machine. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * PREC (global input) CHARACTER*1 * On input, PREC specifies the precision for which the logical * block size should be returned as follows: * PREC = 'S' or 's' single precision real, * PREC = 'D' or 'd' double precision real, * PREC = 'C' or 'c' single precision complex, * PREC = 'Z' or 'z' double precision complex, * PREC = 'I' or 'i' integer. * * Notes * ===== * * Before modifying this routine to tune the library performance on your * system, be aware of the following: * * 1) The value this function returns must be STRICTLY LARGER THAN ZERO, * * 2) If you are planning to link your program with different instances * of the library, (for example on a heterogeneous machine), you MUST * compile each instance of the library with the EXACT SAME version of * this routine for obvious inter-operability reasons. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements .. * IF( LSAME( PREC, 'S' ) ) THEN * * Single precision real logical block size * PILAENV = 32 * ELSE IF( LSAME( PREC, 'D' ) ) THEN * * Double precision real logical block size * PILAENV = 32 * ELSE IF( LSAME( PREC, 'C' ) ) THEN * * Single precision complex logical block size * PILAENV = 32 * ELSE IF( LSAME( PREC, 'Z' ) ) THEN * * Double precision complex logical block size * PILAENV = 32 * ELSE IF( LSAME( PREC, 'I' ) ) THEN * * Integer logical block size * PILAENV = 32 * ELSE * * Probably unused * PILAENV = 32 * END IF * RETURN * * End of PILAENV * END scalapack-2.0.2/PBLAS/SRC/psagemv_.c000644 000766 000024 00000045072 10363532303 017154 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psagemv_( F_CHAR_T TRANS, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void psagemv_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PSAGEMV performs one of the matrix-vector operations * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( sub( A )' )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1). * * When TRANS = 'N', * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+M-1), if INCY = M_Y, * Y(IY:IY+M-1,JY:JY), if INCY = 1 and INCY <> M_Y, * and, otherwise * * sub( X ) denotes X(IX:IX,JX:JX+M-1), if INCX = M_X, * X(IX:IX+M-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a real subvector, * sub( X ) is a subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := |alpha|*|sub( A ) | * |sub( X )| + * |beta*sub( Y )|, * * TRANS = 'T' or 't', * sub( Y ) := |alpha|*|sub( A )'| * |sub( X )| + * |beta*sub( Y )|, * * TRANS = 'C' or 'c', * sub( Y ) := |alpha|*|sub( A )'| * |sub( X )| + * |beta*sub( Y )|. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char TrA, Yroc, * one, * tbeta, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, mycol, myrow, nota, npcol, nprow; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSAGEMV", "Illegal TRANS=%c\n", TrA ); info = -1; } PB_Cchkmat( ctxt, "PSAGEMV", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); if( nota ) { PB_Cchkvec( ctxt, "PSAGEMV", "X", *N, 3, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PSAGEMV", "Y", *M, 2, Yi, Yj, Yd, *INCY, 18, &info ); } else { PB_Cchkvec( ctxt, "PSAGEMV", "X", *M, 2, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PSAGEMV", "Y", *N, 3, Yi, Yj, Yd, *INCY, 18, &info ); } } if( info ) { PB_Cabort( ctxt, "PSAGEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = utyp = PB_Cstypeset(); /* * When alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, utyp->size ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], utyp->size ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( nota ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( utyp, COLUMN, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { sagemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *) A), Aii, Ajj, Ald, type->size), &Ald, XA, &XAd[LLD_], tbeta, YA, &ione ); } if( XAfr ) free( XA ); /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( utyp, ROW, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { sagemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *) A), Aii, Ajj, Ald, type->size ), &Ald, XA, &ione, tbeta, YA, &YAd[LLD_] ); } if( XAfr ) free( XA ); /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, utyp->size ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], utyp->size ), INCY ); } } } one = utyp->one; if( nota ) { PB_Cpaxpby( utyp, NOCONJG, *M, 1, one, YA, 0, 0, YAd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } else { PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PSAGEMV */ } scalapack-2.0.2/PBLAS/SRC/psamax_.c000644 000766 000024 00000045240 10363532303 017000 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psamax_( int * N, float * AMAX, int * INDX, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void psamax_( N, AMAX, INDX, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * INDX, * IX, * JX, * N; float * AMAX; /* * .. Array Arguments .. */ int * DESCX; float * X; #endif { /* * Purpose * ======= * * PSAMAX computes the global index of the maximum element in absolute * value of a subvector sub( X ). The global index is returned in INDX * and the value of that element is returned in AMAX, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * AMAX (global output) REAL array * On exit, AMAX specifies the largest entry in absolute value * of the subvector sub( X ) only in its scope (See below for * further details). * * INDX (global output) INTEGER * On exit, INDX specifies the global index of the maximum ele- * ment in absolute value of the subvector sub( X ) only in its * scope (See below for further details). * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char cbtop, cctop, rbtop, rctop; int Xcol, Xgindx, Xi, Xii, Ximb, Xinb, Xj, Xjj, Xlindx, Xld, Xmb, Xnb, Xnp, Xnq, Xrow, Xsrc, ctxt, dist, dst, idumm, info, k, maxpos, mycol, mydist, myrow, npcol, nprow, src; /* * .. Local Arrays .. */ int Xd[DLEN_]; float work[4]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PSAMAX", "X", *N, 1, Xi, Xj, Xd, *INCX, 7, &info ); if( info ) { PB_Cabort( ctxt, "PSAMAX", info ); return; } #endif /* * Initialize INDX and AMAX */ *INDX = 0; *AMAX = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *INCX == 1 ) && ( Xd[M_] == 1 ) && ( *N == 1 ) ) { /* * Make sure I own some data and compute INDX and AMAX */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { *INDX = *JX; *AMAX = X[Xii+Xjj*Xd[LLD_]]; } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); if( ( rctop == CTOP_DEFAULT ) || ( rctop == CTOP_TREE1 ) ) { /* * Inline the 1-tree combine for communication savings */ Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_]; Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnq > 0 ) { Xld = Xd[LLD_]; Xlindx = Xjj - 1 + isamax_( &Xnq, ((char*)(X+(Xii+Xjj*Xld))), &Xld ); Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol ); work[0] = X[Xii+Xlindx*Xld]; work[1] = ((float )( Xgindx+1 )); } else { work[0] = ZERO; work[1] = ZERO; } /* * Combine the local results using a 1-tree topology within process column 0 * if npcol > 1 or Xcol >= 0, i.e sub( X ) is distributed. */ if( ( npcol >= 2 ) && ( Xcol >= 0 ) ) { mydist = mycol; k = 1; l_10: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, npcol ); Csgesd2d( ctxt, 2, 1, ((char*)work), 2, myrow, dst ); goto l_20; } else { dist = mycol + k; src = MPosMod( dist, npcol ); if( mycol < src ) { Csgerv2d( ctxt, 2, 1, ((char*) &work[2]), 2, myrow, src ); if( ABS( work[0] ) < ABS( work[2] ) ) { work[0] = work[2]; work[1] = work[3]; } } mydist >>= 1; } k <<= 1; if( k < npcol ) goto l_10; l_20: /* * Process column 0 broadcasts the combined values of INDX and AMAX within * their process row. */ rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == 0 ) { Csgebs2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2 ); } else { Csgebr2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2, myrow, 0 ); } } /* * Set INDX and AMAX to the replicated answers contained in work. If AMAX is * zero, then select a coherent INDX. */ *AMAX = work[0]; *INDX = ( ( *AMAX == ZERO ) ? ( *JX ) : ( (int)(work[1]) ) ); } else { /* * Otherwise use the current topology settings to combine the results */ Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_]; Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnq > 0 ) { /* * Compute the local maximum and its corresponding local index */ Xld = Xd[LLD_]; Xlindx = Xjj - 1 + isamax_( &Xnq, ((char*)(X+(Xii+Xjj*Xld))), &Xld ); *AMAX = X[Xii+Xlindx*Xld]; } else { *AMAX = ZERO; } if( Xcol >= 0 ) { /* * Combine leave on all the local maximum if Xcol >= 0, i.e sub( X ) is * distributed */ Csgamx2d( ctxt, ROW, &rctop, 1, 1, ((char*)AMAX), 1, &idumm, &maxpos, 1, -1, mycol ); /* * Broadcast the corresponding global index */ if( *AMAX != ZERO ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == maxpos ) { Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol ); *INDX = Xgindx + 1; Cigebs2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1 ); } else { Cigebr2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1, myrow, maxpos ); } } else { /* * If AMAX is zero, then select a coherent INDX. */ *INDX = *JX; } } else { /* * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX. */ *INDX = ( ( *AMAX == ZERO ) ? ( *JX ) : Xlindx + 1 ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ( cctop == CTOP_DEFAULT ) || ( cctop == CTOP_TREE1 ) ) { /* * Inline the 1-tree combine for communication savings */ Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_]; Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnp > 0 ) { Xld = Xd[LLD_]; Xlindx = Xii - 1 + isamax_( &Xnp, ((char*)(X+(Xii+Xjj*Xld))), INCX ); Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow ); work[0] = X[Xlindx+Xjj*Xld]; work[1] = ((float )( Xgindx+1 )); } else { work[0] = ZERO; work[1] = ZERO; } /* * Combine the local results using a 1-tree topology within process row 0 * if nprow > 1 or Xrow >= 0, i.e sub( X ) is distributed. */ if( ( nprow >= 2 ) && ( Xrow >= 0 ) ) { mydist = myrow; k = 1; l_30: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, nprow ); Csgesd2d( ctxt, 2, 1, ((char*)work), 2, dst, mycol ); goto l_40; } else { dist = myrow + k; src = MPosMod( dist, nprow ); if( myrow < src ) { Csgerv2d( ctxt, 2, 1, ((char*) &work[2]), 2, src, mycol ); if( ABS( work[0] ) < ABS( work[2] ) ) { work[0] = work[2]; work[1] = work[3]; } } mydist >>= 1; } k <<= 1; if( k < nprow ) goto l_30; l_40: /* * Process row 0 broadcasts the combined values of INDX and AMAX within their * process column. */ cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == 0 ) { Csgebs2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2 ); } else { Csgebr2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2, 0, mycol ); } } /* * Set INDX and AMAX to the replicated answers contained in work. If AMAX is * zero, then select a coherent INDX. */ *AMAX = work[0]; *INDX = ( ( *AMAX == ZERO ) ? ( *IX ) : ( (int)(work[1]) ) ); } else { /* * Otherwise use the current topology settings to combine the results */ Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_]; Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnp > 0 ) { /* * Compute the local maximum and its corresponding local index */ Xld = Xd[LLD_]; Xlindx = Xii - 1 + isamax_( &Xnp, ((char*)(X+(Xii+Xjj*Xld))), INCX ); *AMAX = X[Xlindx+Xjj*Xld]; } else { *AMAX = ZERO; } if( Xrow >= 0 ) { /* * Combine leave on all the local maximum if Xrow >= 0, i.e sub( X ) is * distributed. */ Csgamx2d( ctxt, COLUMN, &cctop, 1, 1, ((char*)AMAX), 1, &maxpos, &idumm, 1, -1, mycol ); /* * Broadcast the corresponding global index */ if( *AMAX != ZERO ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == maxpos ) { Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow ); *INDX = Xgindx + 1; Cigebs2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1 ); } else { Cigebr2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1, maxpos, mycol ); } } else { /* * If AMAX is zero, then select a coherent INDX. */ *INDX = *IX; } } else { /* * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX. */ *INDX = ( ( *AMAX == ZERO ) ? ( *IX ) : Xlindx + 1 ); } } } return; } /* * End of PSAMAX */ } scalapack-2.0.2/PBLAS/SRC/psasum_.c000644 000766 000024 00000024613 10363532303 017020 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psasum_( int * N, float * ASUM, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void psasum_( N, ASUM, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; float * ASUM; /* * .. Array Arguments .. */ int * DESCX; float * X; #endif { /* * Purpose * ======= * * PSASUM returns the sum of absolute values of the entries of a subvec- * tor sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ASUM (local output) REAL * On exit, ASUM specifies the sum of absolute values of the * subvector sub( X ) only in its scope (See below for further * details). * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char top; int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PSASUM", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PSASUM", info ); return; } #endif /* * Initialize ASUM */ *ASUM = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *N == 1 ) && ( *INCX == 1 ) && ( Xd[M_] == 1 ) ) { /* * Make sure I own some data and compute ASUM */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { *ASUM = ABS( X[Xii+Xjj*Xd[LLD_]] ); } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and compute the local sum */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; svasum_( &Xnq, ((char *) ASUM), ((char *)( X+(Xii+Xjj*Xld) )), &Xld ); } /* * If Xnq <= 0, ASUM is zero (see initialization above) */ if( ( npcol > 1 ) && ( Xcol >= 0 ) ) { /* * Combine the local results if npcol > 1 and Xcol >= 0, i.e sub( X ) is * distributed. */ top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, 1, 1, ((char *)ASUM), 1, -1, mycol ); } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and compute the local sum */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { svasum_( &Xnp, ((char *) ASUM), ((char *)( X+(Xii+Xjj*Xd[LLD_]) )), INCX ); } /* * If Xnp <= 0, ASUM is zero (see initialization above) */ if( ( nprow > 1 ) && ( Xrow >= 0 ) ) { /* * Combine the local results if nprow > 1 and Xrow >= 0, i.e sub( X ) is * distributed. */ top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, 1, ((char *)ASUM), 1, -1, mycol ); } } return; } /* * End of PSASUM */ } scalapack-2.0.2/PBLAS/SRC/psasymv_.c000644 000766 000024 00000055702 10363532303 017215 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psasymv_( F_CHAR_T UPLO, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void psasymv_( UPLO, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PSASYMV performs the matrix-vector operation * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a n element real subvec- * tor, sub( X ) is an n element subvector and sub( A ) is an n by n * symmetric submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries of the upper triangular part of the * symmetric submatrix sub( A ), and the local entries of the * strictly lower triangular of sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries of the lower triangular part of the * symmetric submatrix sub( A ), and the local entries of the * strictly upper triangular of sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten by the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCpbY, YCsum, YCld, YRfr, YRpbY, YRsum, YRld, Ycol, Yi, Yii, YisRow, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper, usiz; float * tbeta; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XCd[DLEN_], XRd[DLEN_], Xd[DLEN_], YCd[DLEN_], YRd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, __FILE__, "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkmat( ctxt, "PSASYMV", "A", *N, 2, *N, 2, Ai, Aj, Ad, 7, &info ); PB_Cchkvec( ctxt, "PSASYMV", "X", *N, 2, Xi, Xj, Xd, *INCX, 11, &info ); PB_Cchkvec( ctxt, "PSASYMV", "Y", *N, 2, Yi, Yj, Yd, *INCY, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PSASYMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = utyp = PB_Cstypeset(); size = usiz = type->size; /* * and when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Reuse sub( Y ) and/or create vectors YR in process rows and YC in process * columns spanned by sub( A ) */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) { PB_CInOutV( utyp, ROW, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW, ((char**)(&tbeta)), &YR, YRd, &YRfr, &YRsum, &YRpbY ); PB_COutV( utyp, COLUMN, INIT, *N, *N, Ad0, 1, &YC, YCd, &YCfr, &YCsum ); } else { PB_CInOutV( utyp, COLUMN, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN, ((char**)(&tbeta)), &YC, YCd, &YCfr, &YCsum, &YCpbY ); PB_COutV( utyp, ROW, INIT, *N, *N, Ad0, 1, &YR, YRd, &YRfr, &YRsum ); } /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd, ROW, &XC, XCd, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd, COLUMN, &XR, XRd, &XRfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd[LLD_]; XRld = XRd[LLD_]; YCld = YCd[LLD_]; YRld = YRd[LLD_]; /* * Scale YR or YC in the case sub( Y ) has been reused */ if( YisRow ) { /* * YR resides in (a) process row(s) */ if( !YRpbY ) { if( ( myrow == YRd[RSRC_] ) || ( YRd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YR */ if( Anq > 0 ) sascal_( &Anq, ((char *) tbeta), YR, &YRld ); } } } else { /* * YC resides in (a) process column(s) */ if( !YCpbY ) { if( ( mycol == YCd[CSRC_] ) || ( YCd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YC */ if( Amp > 0 ) sascal_( &Amp, ((char *) tbeta), YC, &ione ); } } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &utyp->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { sagemv_( C2F_CHAR( NOTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, YC, &ione ); sagemv_( C2F_CHAR( TRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XC, &ione, one, Mptr( YR, 0, Akq, YRld, usiz ), &YRld ); } PB_Cpsym( type, utyp, LEFT, UPPER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, usiz ), YCld, Mptr( YR, 0, Akq, YRld, usiz ), YRld, PB_Ctzasymv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsym( type, utyp, LEFT, LOWER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, usiz ), YCld, Mptr( YR, 0, Akq, YRld, usiz ), YRld, PB_Ctzasymv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { sagemv_( C2F_CHAR( NOTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, Mptr( YC, Akp, 0, YCld, usiz ), &ione ); sagemv_( C2F_CHAR( TRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XC, Akp, 0, XCld, size ), &ione, one, Mptr( YR, 0, Akq, YRld, usiz ), &YRld ); } } } } if( XCfr ) free( XC ); if( XRfr ) free( XR ); if( YisRow ) { /* * Combine the partial column results into YC */ if( YCsum ) { YCd[CSRC_] = 0; if( Amp > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, 0 ); } } /* * Combine the partial row results into YR */ if( YRsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], YRd[RSRC_], mycol ); } /* * YR := YR + YC */ PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, YR, 0, 0, YRd, ROW ); /* * sub( Y ) := beta * sub( Y ) + YR (if necessary) */ if( YRpbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || Yrow < 0 ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, ((char *) Y), Yi, Yj, Yd, ROW ); } } else { /* * Combine the partial row results into YR */ if( YRsum ) { YRd[RSRC_] = 0; if( Anq > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], 0, mycol ); } } /* * Combine the partial column results into YC */ if( YCsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, YCd[CSRC_] ); } /* * YC := YR + YC */ PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, YC, 0, 0, YCd, COLUMN ); /* * sub( Y ) := beta * sub( Y ) + YC (if necessary) */ if( YCpbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || Ycol < 0 ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, COLUMN ); } } if( YCfr ) free( YC ); if( YRfr ) free( YR ); /* * End of PSASYMV */ } scalapack-2.0.2/PBLAS/SRC/psatrmv_.c000644 000766 000024 00000061430 10363532303 017202 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psatrmv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void psatrmv_( UPLO, TRANS, DIAG, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PSATRMV performs one of the matrix-vector operations * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( sub( A )' )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a real subvector, * sub( X ) is a subvector and sub( A ) is an n by n triangular subma- * trix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := |alpha|*|sub( A )|*|sub( X )| + * |beta*sub( Y )|. * * TRANS = 'T' or 't' * sub( Y ) := |alpha|*|sub( A )'|*|sub( X )| + * |beta*sub( Y )|. * * TRANS = 'C' or 'c' * sub( Y ) := |alpha|*|sub( A )'|*|sub( X )| + * |beta*sub( Y )|. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Yroc, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XAfr, XAld, Xi, Xj, YAfr, YAld, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper, usiz; float * tbeta; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSATRMV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSATRMV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PSATRMV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PSATRMV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkvec( ctxt, "PSATRMV", "X", *N, 4, Xi, Xj, Xd, *INCX, 13, &info ); PB_Cchkvec( ctxt, "PSATRMV", "Y", *N, 4, Yi, Yj, Yd, *INCY, 19, &info ); } if( info ) { PB_Cabort( ctxt, "PSATRMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA [REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = utyp = PB_Cstypeset(); size = usiz = type->size; /* * and when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( notran ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( utyp, COLUMN, *N, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, ((char**)(&tbeta)), &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( utyp, ROW, *N, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, ((char**)(&tbeta)), &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XAld = XAd[LLD_]; YAld = YAd[LLD_]; /* * Scale YA in the case sub( Y ) has been reused */ if( notran && !( YApbY ) ) { /* * YA resides in (a) process column(s) */ if( ( mycol == YAd[CSRC_] ) || ( YAd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YA */ if( Amp > 0 ) sascal_( &Amp, ((char *) tbeta), YA, &ione ); } } else if( !( notran ) && !( YApbY ) ) { /* * YA resides in (a) process row(s) */ if( ( myrow == YAd[RSRC_] ) || ( YAd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YA */ if( Anq > 0 ) sascal_( &Anq, ((char *) tbeta), YA, &YAld ); } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &utyp->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { sagemv_( TRANS, &Akp, &Anq0, ((char *) ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, YA, &ione ); } PB_Cptrm( type, utyp, LEFT, UPPER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, usiz ), YAld, PB_Ctzatrmv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { sagemv_( TRANS, &Akp, &Anq0, ((char *) ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XA, &ione, one, Mptr( YA, 0, Akq, YAld, usiz ), &YAld ); } PB_Cptrm( type, utyp, LEFT, UPPER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, usiz ), YAld, PB_Ctzatrmv ); } } } else { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, utyp, LEFT, LOWER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, usiz ), YAld, PB_Ctzatrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { sagemv_( TRANS, &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, Mptr( YA, Akp, 0, YAld, usiz ), &ione ); } } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, utyp, LEFT, LOWER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, usiz ), YAld, PB_Ctzatrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { sagemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, Akp, 0, XAld, size ), &ione, one, Mptr( YA, 0, Akq, YAld, usiz ), &YAld ); } } } } } if( XAfr ) free( XA ); if( notran ) { /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } } else { /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; sascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { sascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } if( notran ) { PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YA, 0, 0, YAd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } else { PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PSATRMV */ } scalapack-2.0.2/PBLAS/SRC/psaxpy_.c000644 000766 000024 00000022473 10363532303 017036 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psaxpy_( int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void psaxpy_( N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCX, * DESCY; float * X, * Y; #endif { /* * Purpose * ======= * * PSAXPY adds one subvector to another, * * sub( Y ) := sub( Y ) + alpha * sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER. * On entry, N specifies the length of the subvectors to be * added. N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xi, Xj, Yi, Yj, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PSAXPY", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PSAXPY", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PSAXPY", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Get type structure */ type = PB_Cstypeset(); /* * Start the operations */ if( *INCX == Xd[M_] ) { PB_Cpaxpby( type, NOCONJG, 1, *N, ((char *) ALPHA), ((char *) X), Xi, Xj, Xd, ROW, type->one, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, ((char *) ALPHA), ((char *) X), Xi, Xj, Xd, COLUMN, type->one, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } /* * End of PSAXPY */ } scalapack-2.0.2/PBLAS/SRC/pscasum_.c000644 000766 000024 00000025324 10363532303 017163 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pscasum_( int * N, float * ASUM, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pscasum_( N, ASUM, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; float * ASUM; /* * .. Array Arguments .. */ int * DESCX; float * X; #endif { /* * Purpose * ======= * * PSCASUM returns the sum of absolute values of the entries of a sub- * vector sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ASUM (local output) REAL * On exit, ASUM specifies the sum of absolute values of the * subvector sub( X ) only in its scope (See below for further * details). * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * Xptr = NULL, top; int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PSCASUM", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PSCASUM", info ); return; } #endif /* * Initialize ASUM */ *ASUM = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *N == 1 ) && ( *INCX == 1 ) && ( Xd[M_] == 1 ) ) { /* * Make sure I own some data and compute ASUM */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { type = PB_Cctypeset(); Xptr = Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ); *ASUM = ABS( ((float *) Xptr)[REAL_PART] ) + ABS( ((float *) Xptr)[IMAG_PART] ); } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and compute the local sum */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cctypeset(); scvasum_( &Xnq, ((char *) ASUM), Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } /* * If Xnq <= 0, ASUM is zero (see initialization above) */ if( ( npcol > 1 ) && ( Xcol >= 0 ) ) { /* * Combine the local results if npcol > 1 and Xcol >= 0, i.e sub( X ) is * distributed. */ top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, 1, 1, ((char *)ASUM), 1, -1, mycol ); } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and compute the local sum */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { type = PB_Cctypeset(); scvasum_( &Xnp, ((char *) ASUM), Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } /* * If Xnp <= 0, ASUM is zero (see initialization above) */ if( ( nprow > 1 ) && ( Xrow >= 0 ) ) { /* * Combine the local results if nprow > 1 and Xrow >= 0, i.e sub( X ) is * distributed. */ top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, 1, ((char *)ASUM), 1, -1, mycol ); } } return; } /* * End of PSCASUM */ } scalapack-2.0.2/PBLAS/SRC/pscnrm2_.c000644 000766 000024 00000043310 10363532303 017067 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pscnrm2_( int * N, float * NORM2, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pscnrm2_( N, NORM2, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; float * NORM2; /* * .. Array Arguments .. */ int * DESCX; float * X; #endif { /* * Purpose * ======= * * PSCNRM2 computes the 2-norm of a subvector sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * NORM2 (local output) REAL * On exit, NORM2 specifies the 2-norm of the subvector sub( X ) * only in its scope (See below for further details). * * X (local input) COMPLEX array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * Xptr = NULL, top; int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, dst, dist, info, k, mycol, mydist, myrow, npcol, nprow, src, size; float Xtmp, scale, ssq, temp1, temp2; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_]; float work[4]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PSCNRM2", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PSCNRM2", info ); return; } #endif /* * Initialize NORM2 */ *NORM2 = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *N == 1 ) && ( *INCX == 1 ) && ( Xd[M_] == 1 ) ) { /* * Make sure I own some data and compute NORM2 */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { scale = ZERO; ssq = ONE; type = PB_Cctypeset(); Xptr = Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ); Xtmp = ((float *) Xptr)[REAL_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xtmp = ((float *) Xptr)[IMAG_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } /* * Compute NORM2 = SCALE * SQRT( SSQ ) */ sasqrtb_( &scale, &ssq, NORM2 ); } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Initialize SCALE and SSQ */ scale = ZERO; ssq = ONE; /* * Make sure I own some data and compute local sum of squares */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cctypeset(); size = type->size; Xptr = Mptr( ((char *) X), Xii, Xjj, Xld, size ); for( k = 0; k < Xnq; k++ ) { Xtmp = ((float *) Xptr)[REAL_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xtmp = ((float *) Xptr)[IMAG_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xptr += Xld * size; } } /* * If Xnq <= 0, SCALE is zero and SSQ is one (see initialization above) */ if( ( npcol >= 2 ) && ( Xcol >= 0 ) ) { /* * Combine the local sum of squares using a 1-tree topology within process row * 0 if npcol > 1 and Xcol >= 0, i.e sub( X ) is distributed. */ work[0] = scale; work[1] = ssq; mydist = mycol; k = 1; l_10: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, npcol ); Csgesd2d( ctxt, 2, 1, ((char*) work), 2, myrow, dst ); goto l_20; } else { dist = mycol + k; src = MPosMod( dist, npcol ); if( mycol < src ) { Csgerv2d( ctxt, 2, 1, ((char*)&work[2]), 2, myrow, src ); if( work[0] >= work[2] ) { if( work[0] != ZERO ) { temp1 = work[2] / work[0]; work[1] = work[1] + ( temp1 * temp1 ) * work[3]; } } else { temp1 = work[0] / work[2]; work[1] = work[3] + ( temp1 * temp1 ) * work[1]; work[0] = work[2]; } } mydist >>= 1; } k <<= 1; if( k < npcol ) goto l_10; l_20: /* * Process column 0 broadcasts the combined values of SCALE and SSQ within their * process row. */ top = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == 0 ) { Csgebs2d( ctxt, ROW, &top, 2, 1, ((char*)work), 2 ); } else { Csgebr2d( ctxt, ROW, &top, 2, 1, ((char*)work), 2, myrow, 0 ); } /* * Compute NORM2 redundantly NORM2 = WORK( 1 ) * SQRT( WORK( 2 ) ) */ sasqrtb_( &work[0], &work[1], NORM2 ); } else { /* * Compute NORM2 redundantly ( sub( X ) is not distributed ) */ sasqrtb_( &scale, &ssq, NORM2 ); } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Initialize SCALE and SSQ */ scale = ZERO; ssq = ONE; /* * Make sure I own some data and compute local sum of squares */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { type = PB_Cctypeset(); size = type->size; Xptr = Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], size ); for( k = 0; k < Xnp; k++ ) { Xtmp = ((float *) Xptr)[REAL_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xtmp = ((float *) Xptr)[IMAG_PART]; if( Xtmp != ZERO ) { temp1 = ABS( Xtmp ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xptr += size; } } /* * If Xnp <= 0, SCALE is zero and SSQ is one (see initialization above) */ if( ( nprow >= 2 ) && ( Xrow >= 0 ) ) { /* * Combine the local sum of squares using a 1-tree topology within process * column 0 if nprow > 1 and Xrow >= 0, i.e sub( X ) is distributed. */ work[0] = scale; work[1] = ssq; mydist = myrow; k = 1; l_30: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, nprow ); Csgesd2d( ctxt, 2, 1, ((char*)work), 2, dst, mycol ); goto l_40; } else { dist = myrow + k; src = MPosMod( dist, nprow ); if( myrow < src ) { Csgerv2d( ctxt, 2, 1, ((char*)&work[2]), 2, src, mycol ); if( work[0] >= work[2] ) { if( work[0] != ZERO ) { temp1 = work[2] / work[0]; work[1] = work[1] + ( temp1 * temp1 ) * work[3]; } } else { temp1 = work[0] / work[2]; work[1] = work[3] + ( temp1 * temp1 ) * work[1]; work[0] = work[2]; } } mydist >>= 1; } k <<= 1; if( k < nprow ) goto l_30; l_40: /* * Process column 0 broadcasts the combined values of SCALE and SSQ within their * process column */ top = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == 0 ) { Csgebs2d( ctxt, COLUMN, &top, 2, 1, ((char*)work), 2 ); } else { Csgebr2d( ctxt, COLUMN, &top, 2, 1, ((char*)work), 2, 0, mycol ); } /* * Compute NORM2 redundantly NORM2 = WORK[0] * SQRT( WORK[1] ) */ sasqrtb_( &work[0], &work[1], NORM2 ); } else { /* * Compute NORM2 redundantly ( sub( X ) is not distributed ) */ sasqrtb_( &scale, &ssq, NORM2 ); } } return; } /* * End of PSCNRM2 */ } scalapack-2.0.2/PBLAS/SRC/pscopy_.c000644 000766 000024 00000021577 10363532303 017033 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pscopy_( int * N, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pscopy_( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; /* * .. Array Arguments .. */ int * DESCX, * DESCY; float * X, * Y; #endif { /* * Purpose * ======= * * PSCOPY copies one subvector into another, * * sub( Y ) := sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * copied. N must be at least zero. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xi, Xj, Yi, Yj, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 501 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PSCOPY", "X", *N, 1, Xi, Xj, Xd, *INCX, 5, &info ); PB_Cchkvec( ctxt, "PSCOPY", "Y", *N, 1, Yi, Yj, Yd, *INCY, 10, &info ); } if( info ) { PB_Cabort( ctxt, "PSCOPY", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Get type structure */ type = PB_Cstypeset(); /* * Start the operations */ if( *INCX == Xd[M_] ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) X), Xi, Xj, Xd, ROW, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) X), Xi, Xj, Xd, COLUMN, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } /* * End of PSCOPY */ } scalapack-2.0.2/PBLAS/SRC/psdot_.c000644 000766 000024 00000066637 10363532303 016655 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psdot_( int * N, float * DOT, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void psdot_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; float * DOT; /* * .. Array Arguments .. */ int * DESCX, * DESCY; float * X, * Y; #endif { /* * Purpose * ======= * * PSDOT forms the dot product of two subvectors, * * DOT := sub( X )**T * sub( Y ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * multiplied. N must be at least zero. * * DOT (local output) REAL array * On exit, DOT specifies the dot product of the two subvectors * sub( X ) and sub( Y ) only in their scope (See below for fur- * ther details). * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char scope, * top; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD, Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info, ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src; PBTYP_T * type; VVDOT_T dot; /* * .. Local Arrays .. */ char * buf = NULL; int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PSDOT", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PSDOT", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PSDOT", info ); return; } #endif DOT[REAL_PART] = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Handle degenerate case */ if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) ) { type = PB_Cstypeset(); PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_]; XnbD = Xd[NB_]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_]; XnbD = Xd[MB_]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_]; YnbD = Yd[NB_]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_]; YnbD = Yd[MB_]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector used * to perform the dot product computation. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cstypeset(); size = type->size; dot = type->Fvvdotu; /* * A purely operation occurs iff the operands start in the same process and if * either the grid is mono-dimensional or there is a single local block to be * operated with or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } } /* * Combine the local results in sub( X )'s scope */ if( ( XisR && YisR ) || ( XmyprocR == XprocR ) ) { scope = ( XisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); Csgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 ); } if( RRorCC && XisR && YisR ) return; } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to compute the desired dot-product in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( ( YmyprocR == YprocR ) ) { /* * The processes owning a piece of sub( Y ) send it to the corresponding * process owning s piece of sub ( X ). */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( XisRow ) { rdst = XprocR; cdst = dst; } else { rdst = dst; cdst = XprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( YisRow ) Csgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); else Csgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding local * piece of sub( Y ), compute the local dot product and combine the results * within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); src = MPosMod( src, YnprocsD ); if( YisRow ) { rsrc = YprocR; csrc = src; } else { rsrc = src; csrc = YprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Csgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc ); else Csgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc ); dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } else { /* * General case, copy sub( Y ) within sub( X )'s scope, compute the local * results and combine them within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XisRow ) { PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt, 1 ); } else { PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt, MAX( 1, XnpD ) ); } if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) ) buf = PB_Cmalloc( XnpD * size ); if( YisRow ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj, Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj, Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } if( XmyprocR == XprocR ) { if( XnpD > 0 ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } /* * Send the DOT product result within sub( Y )'s scope */ if( XisR || YisR ) { /* * Either sub( X ) or sub( Y ) are replicated, so that every process should have * the result -> broadcast it orthogonally from sub( X )'s direction. */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Csgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Csgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Csgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Csgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD, XprocR ); } } else { /* * Neither sub( X ) nor sub( Y ) are replicated */ if( RRorCC ) { /* * Both sub( X ) are distributed in the same direction -> the process row or * column XprocR sends the result to the process row or column YprocR. */ if( XprocR != YprocR ) { if( XmyprocR == XprocR ) { if( XisRow ) Csgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR, YmyprocD ); else Csgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD, YprocR ); } else if( YmyprocR == YprocR ) { if( XisRow ) Csgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR, XmyprocD ); else Csgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD, XprocR ); } } } else { /* * Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s * scope, broadcast the result within sub( Y )'s scope. */ if( YmyprocR == YprocR ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocD == XprocR ) Csgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Csgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, YprocR, XprocR ); } else { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocD == XprocR ) Csgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Csgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, YprocR ); } } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ type = PB_Cstypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ type = PB_Cstypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotu ); } else { /* * Neither sub( X ) nor sub( Y ) are distributed */ type = PB_Cstypeset(); PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); } /* * End of PSDOT */ } scalapack-2.0.2/PBLAS/SRC/psgeadd_.c000644 000766 000024 00000026431 10363532303 017117 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psgeadd_( F_CHAR_T TRANS, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void psgeadd_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * IC, * JA, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PSGEADD adds a matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if TRANS = 'N', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'T', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'C'. * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the form of op( sub( A ) ) to be * used in the matrix addition as follows: * * TRANS = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANS = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANS = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) REAL array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, DirC, ctop, rtop; int Ai, Aj, Ci, Cj, TrA, ctxt, info, mycol, myrow, npcol, nprow, notran; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !notran ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSGEADD", "Illegal TRANS = %c\n", TrA ); info = -1; } if( notran ) PB_Cchkmat( ctxt, "PSGEADD", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); else PB_Cchkmat( ctxt, "PSGEADD", "A", *N, 3, *M, 2, Ai, Aj, Ad, 8, &info ); PB_Cchkmat( ctxt, "PSGEADD", "C", *M, 2, *N, 3, Ci, Cj, Cd, 13, &info ); } if( info ) { PB_Cabort( ctxt, "PSGEADD", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( PB_Cstypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cstypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ /* * This operation mainly involves point-to-point send and receive communication. * There is therefore no particular BLACS topology to recommend. Still, one can * choose the main loop direction in which the operands will be added. This * selection is based on the current setting for the BLACS broadcast operations. */ if( notran ) { rtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( *M <= *N ) { DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirC = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpgeadd( PB_Cstypeset(), &DirA, &DirC, NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cptran( PB_Cstypeset(), NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } /* * End of PSGEADD */ } scalapack-2.0.2/PBLAS/SRC/psgemm_.c000644 000766 000024 00000047577 10363532303 017016 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psgemm_( F_CHAR_T TRANSA, F_CHAR_T TRANSB, int * M, int * N, int * K, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void psgemm_( TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANSA, TRANSB; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; float * A, * B, * C; #endif { /* * Purpose * ======= * * PSGEMM performs one of the matrix-matrix operations * * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * op( X ) = X or op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N', * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T', * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'C', * * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N', * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T', * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'C', * * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) ) * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and * sub( C ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies the form of op( sub( B ) ) to be * used in the matrix multiplication as follows: * * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ), * * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )', * * TRANSB = 'C' or 'c' op( sub( B ) ) = sub( B )'. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * op( sub( A ) ) and of the submatrix sub( C ). M must be at * least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * op( sub( B ) ) and the number of columns of the submatrix * sub( C ). N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns of the submatrix * op( sub( A ) ) and the number of rows of the submatrix * op( sub( B ) ). K must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) REAL array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) REAL array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, DirBC, OpC, OpR, TrA, TrB, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ChooseBC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, nota, notb, npcol, nprow; double ABest, ACest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANSA )[0] ) ) == CNOTRAN ); notb = ( ( TrB = Mupcase( F2C_CHAR( TRANSB )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1001 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSGEMM", "Illegal TRANSA = %c\n", TrA ); info = -1; } else if( ( !notb ) && ( TrB != CTRAN ) && ( TrB != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSGEMM", "Illegal TRANSB = %c\n", TrB ); info = -2; } if( nota ) PB_Cchkmat( ctxt, "PSGEMM", "A", *M, 3, *K, 5, Ai, Aj, Ad, 10, &info ); else PB_Cchkmat( ctxt, "PSGEMM", "A", *K, 5, *M, 3, Ai, Aj, Ad, 10, &info ); if( notb ) PB_Cchkmat( ctxt, "PSGEMM", "B", *K, 5, *N, 4, Bi, Bj, Bd, 14, &info ); else PB_Cchkmat( ctxt, "PSGEMM", "B", *N, 4, *K, 5, Bi, Bj, Bd, 14, &info ); PB_Cchkmat( ctxt, "PSGEMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 19, &info ); } if( info ) { PB_Cabort( ctxt, "PSGEMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO || *K == 0 ) && ( BETA [REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cstypeset(); /* * If alpha or K is zero, sub( C ) := beta * sub( C ). */ if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char * ) C), Ci, Cj, Cd ); } else if( !( BETA[REAL_PART] == ONE ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char * ) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABest: both operands sub( A ) and sub( B ) are communicated (M, N >> K) * ACest: both operands sub( A ) and sub( C ) are communicated (K, N >> M) * BCest: both operands sub( B ) and sub( C ) are communicated (M, K >> N) */ ABest = (double)(*K); ACest = (double)(*M); BCest = (double)(*N); if( notb ) { if( nota ) { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); ABest *= ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *K, Bd[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[MB_], nprow ); BCest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } else { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *M, Ad[NB_], npcol ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ); tmp1 = DNROC( *K, Bd[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ACest *= ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *K, Ad[MB_], nprow ); tmp2 = DNROC( *M, Bd[NB_], npcol ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest *= ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } } else { if( nota ) { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *N, Bd[MB_], nprow ); tmp2 = DNROC( *K, Bd[NB_], npcol ); tmp3 = DNROC( *N, Cd[NB_], npcol ); ACest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *K, Ad[NB_], npcol ); BCest *= CBRATIO * ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); } else { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *N, Bd[MB_], nprow ); tmp2 = DNROC( *K, Bd[NB_], npcol ); tmp3 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *K, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *K, Bd[NB_], npcol ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } } ChooseAB = ( ( ABest <= ( 1.3 * BCest ) ) && ( ABest <= ( 1.3 * ACest ) ) ); ChooseBC = ( ( BCest <= ACest ) && ( ( 1.3 * BCest ) <= ABest ) ); /* * BLACS topologies are enforced iff M, N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) && ( *K > nb ) ); if( ChooseAB ) { OpR = CBCAST; OpC = CBCAST; } else if( ChooseBC ) { if( nota ) { OpR = CCOMBINE; OpC = CBCAST; } else { OpR = CBCAST; OpC = CCOMBINE; } } else { if( notb ) { OpR = CBCAST; OpC = CCOMBINE; } else { OpR = CCOMBINE; OpC = CBCAST; } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_IRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_IRING ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirAB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirBC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); if( ChooseAB ) { PB_CpgemmAB( type, &DirAB, &DirBC, ( nota ? NOTRAN : TRAN ), ( notb ? NOTRAN : TRAN ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else if( ChooseBC ) { PB_CpgemmBC( type, &DirAB, &DirBC, ( nota ? NOTRAN : TRAN ), ( notb ? NOTRAN : TRAN ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { PB_CpgemmAC( type, &DirAB, &DirBC, ( nota ? NOTRAN : TRAN ), ( notb ? NOTRAN : TRAN ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PSGEMM */ } scalapack-2.0.2/PBLAS/SRC/psgemv_.c000644 000766 000024 00000043075 10363532303 017014 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psgemv_( F_CHAR_T TRANS, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void psgemv_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PSGEMV performs one of the matrix-vector operations * * sub( Y ) := alpha*sub( A ) *sub( X ) + beta*sub( Y ), or * sub( Y ) := alpha*sub( A )'*sub( X ) + beta*sub( Y ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1). * * When TRANS = 'N', * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+M-1), if INCY = M_Y, * Y(IY:IY+M-1,JY:JY), if INCY = 1 and INCY <> M_Y, * and, otherwise * * sub( X ) denotes X(IX:IX,JX:JX+M-1), if INCX = M_X, * X(IX:IX+M-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are scalars, and sub( X ) and sub( Y ) are subvectors * and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := alpha*sub( A ) * sub( X ) + beta*sub( Y ), * * TRANS = 'T' or 't', * sub( Y ) := alpha*sub( A )' * sub( X ) + beta*sub( Y ), * * TRANS = 'C' or 'c', * sub( Y ) := alpha*sub( A )' * sub( X ) + beta*sub( Y ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char TrA, Yroc, * tbeta, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, mycol, myrow, nota, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSGEMV", "Illegal TRANS=%c\n", TrA ); info = -1; } PB_Cchkmat( ctxt, "PSGEMV", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); if( nota ) { PB_Cchkvec( ctxt, "PSGEMV", "X", *N, 3, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PSGEMV", "Y", *M, 2, Yi, Yj, Yd, *INCY, 18, &info ); } else { PB_Cchkvec( ctxt, "PSGEMV", "X", *M, 2, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PSGEMV", "Y", *N, 3, Yi, Yj, Yd, *INCY, 18, &info ); } } if( info ) { PB_Cabort( ctxt, "PSGEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cstypeset(); /* * When alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; if( BETA[REAL_PART] == ZERO ) { sset_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } else { sscal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { if( BETA[REAL_PART] == ZERO ) { sset_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } else { sscal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( nota ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( type, COLUMN, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { sgemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *)A), Aii, Ajj, Ald, type->size ), &Ald, XA, &XAd[LLD_], tbeta, YA, &ione ); } if( XAfr ) free( XA ); /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, *M, 1, type->one, YA, 0, 0, YAd, COLUMN, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc ); } } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( type, ROW, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { sgemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *)A), Aii, Ajj, Ald, type->size ), &Ald, XA, &ione, tbeta, YA, &YAd[LLD_] ); } if( XAfr ) free( XA ); /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, YA, 0, 0, YAd, ROW, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PSGEMV */ } scalapack-2.0.2/PBLAS/SRC/psger_.c000644 000766 000024 00000027272 10363532303 016634 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psger_( int * M, int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY, float * A, int * IA, int * JA, int * DESCA ) #else void psger_( M, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PSGER performs the rank 1 operation * * sub( A ) := alpha*sub( X )*sub( Y )' + sub( A ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) is an m element subvector, sub( Y ) is * an n element subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+M-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+M-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( A ) are overwritten by the * local entries of the m by n updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, Yi, Yj, ctxt, info, ione=1, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd[DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PSGER", "X", *M, 1, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PSGER", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PSGER", "A", *M, 1, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PSGER", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cstypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Replicate sub( Y ) in process rows spanned by sub( A ) -> YA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ), &YA, YAd, &YAfr ); /* * Local rank-1 update iff I own some data */ Amp = PB_Cnumroc( *M, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { sger_( &Amp, &Anq, ((char *) ALPHA), XA, &ione, YA, &YAd[LLD_], ((char *) (A+(Aii+Ajj*Ald))), &Ald ); } if( XAfr ) free( XA ); if( YAfr ) free( YA ); /* * End of PSGER */ } scalapack-2.0.2/PBLAS/SRC/psnrm2_.c000644 000766 000024 00000036375 10363532303 016741 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psnrm2_( int * N, float * NORM2, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void psnrm2_( N, NORM2, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; float * NORM2; /* * .. Array Arguments .. */ int * DESCX; float * X; #endif { /* * Purpose * ======= * * PSNRM2 computes the 2-norm of a subvector sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * NORM2 (local output) REAL * On exit, NORM2 specifies the 2-norm of the subvector sub( X ) * only in its scope (See below for further details). * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char top; int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, dst, dist, info, k, mycol, mydist, myrow, npcol, nprow, src; float scale, ssq, temp1, temp2; /* * .. Local Arrays .. */ int Xd[DLEN_]; float * Xptr = NULL, work[4]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PSNRM2", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PSNRM2", info ); return; } #endif /* * Initialize NORM2 */ *NORM2 = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *N == 1 ) && ( *INCX == 1 ) && ( Xd[M_] == 1 ) ) { /* * Make sure I own some data and compute NORM2 */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) *NORM2 = ABS( X[Xii+Xjj*Xd[LLD_]] ); return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Initialize SCALE and SSQ */ scale = ZERO; ssq = ONE; /* * Make sure I own some data and compute local sum of squares */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; Xptr = X+(Xii+Xjj*Xld); for( k = 0; k < Xnq; k++ ) { if( *Xptr != ZERO ) { temp1 = ABS( *Xptr ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xptr += Xld; } } /* * If Xnq <= 0, SCALE is zero and SSQ is one (see initialization above) */ if( ( npcol >= 2 ) && ( Xcol >= 0 ) ) { /* * Combine the local sum of squares using a 1-tree topology within process row * 0 if npcol > 1 and Xcol >= 0, i.e sub( X ) is distributed. */ work[0] = scale; work[1] = ssq; mydist = mycol; k = 1; l_10: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, npcol ); Csgesd2d( ctxt, 2, 1, ((char*) work), 2, myrow, dst ); goto l_20; } else { dist = mycol + k; src = MPosMod( dist, npcol ); if( mycol < src ) { Csgerv2d( ctxt, 2, 1, ((char*)&work[2]), 2, myrow, src ); if( work[0] >= work[2] ) { if( work[0] != ZERO ) { temp1 = work[2] / work[0]; work[1] = work[1] + ( temp1 * temp1 ) * work[3]; } } else { temp1 = work[0] / work[2]; work[1] = work[3] + ( temp1 * temp1 ) * work[1]; work[0] = work[2]; } } mydist >>= 1; } k <<= 1; if( k < npcol ) goto l_10; l_20: /* * Process column 0 broadcasts the combined values of SCALE and SSQ within their * process row. */ top = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == 0 ) { Csgebs2d( ctxt, ROW, &top, 2, 1, ((char*)work), 2 ); } else { Csgebr2d( ctxt, ROW, &top, 2, 1, ((char*)work), 2, myrow, 0 ); } /* * Compute NORM2 redundantly NORM2 = WORK( 1 ) * SQRT( WORK( 2 ) ) */ sasqrtb_( &work[0], &work[1], NORM2 ); } else { /* * Compute NORM2 redundantly ( sub( X ) is not distributed ) */ sasqrtb_( &scale, &ssq, NORM2 ); } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Initialize SCALE and SSQ */ scale = ZERO; ssq = ONE; /* * Make sure I own some data and compute local sum of squares */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { Xptr = X+(Xii+Xjj*Xd[LLD_]); for( k = 0; k < Xnp; k++ ) { if( *Xptr != ZERO ) { temp1 = ABS( *Xptr ); if( scale < temp1 ) { temp2 = scale / temp1; ssq = ONE + ssq * ( temp2 * temp2 ); scale = temp1; } else { temp2 = temp1 / scale; ssq = ssq + ( temp2 * temp2 ); } } Xptr++; } } /* * If Xnp <= 0, SCALE is zero and SSQ is one (see initialization above) */ if( ( nprow >= 2 ) && ( Xrow >= 0 ) ) { /* * Combine the local sum of squares using a 1-tree topology within process * column 0 if nprow > 1 and Xrow >= 0, i.e sub( X ) is distributed. */ work[0] = scale; work[1] = ssq; mydist = myrow; k = 1; l_30: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, nprow ); Csgesd2d( ctxt, 2, 1, ((char*)work), 2, dst, mycol ); goto l_40; } else { dist = myrow + k; src = MPosMod( dist, nprow ); if( myrow < src ) { Csgerv2d( ctxt, 2, 1, ((char*)&work[2]), 2, src, mycol ); if( work[0] >= work[2] ) { if( work[0] != ZERO ) { temp1 = work[2] / work[0]; work[1] = work[1] + ( temp1 * temp1 ) * work[3]; } } else { temp1 = work[0] / work[2]; work[1] = work[3] + ( temp1 * temp1 ) * work[1]; work[0] = work[2]; } } mydist >>= 1; } k <<= 1; if( k < nprow ) goto l_30; l_40: /* * Process column 0 broadcasts the combined values of SCALE and SSQ within their * process column */ top = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == 0 ) { Csgebs2d( ctxt, COLUMN, &top, 2, 1, ((char*)work), 2 ); } else { Csgebr2d( ctxt, COLUMN, &top, 2, 1, ((char*)work), 2, 0, mycol ); } /* * Compute NORM2 redundantly NORM2 = WORK[0] * SQRT( WORK[1] ) */ sasqrtb_( &work[0], &work[1], NORM2 ); } else { /* * Compute NORM2 redundantly ( sub( X ) is not distributed ) */ sasqrtb_( &scale, &ssq, NORM2 ); } } return; } /* * End of PSNRM2 */ } scalapack-2.0.2/PBLAS/SRC/psscal_.c000644 000766 000024 00000021674 10602576752 017015 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psscal_( int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void psscal_( N, ALPHA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCX; float * X; #endif { /* * Purpose * ======= * * PSSCAL multiplies an n element subvector sub( X ) by the scalar * alpha, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input/local output) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the scaled * subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PSSCAL", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PSSCAL", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( Xd[CTXT_], &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Start the operations */ if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { if( ALPHA[REAL_PART] == ONE ) return; /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; if( ALPHA[REAL_PART] == ZERO ) { sset_( &Xnq, ((char *) ALPHA), ((char *)(X+(Xii+Xjj*Xld))), &Xld ); } else { sscal_( &Xnq, ((char *) ALPHA), ((char *)(X+(Xii+Xjj*Xld))), &Xld ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { if( ALPHA[REAL_PART] == ONE ) return; /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { if( ALPHA[REAL_PART] == ZERO ) { sset_( &Xnp, ((char *) ALPHA), ((char *)( X+(Xii+Xjj*Xd[LLD_]) )), INCX ); } else { sscal_( &Xnp, ((char *) ALPHA), ((char *)( X+(Xii+Xjj*Xd[LLD_]) )), INCX ); } } } return; } /* * End of PSSCAL */ } scalapack-2.0.2/PBLAS/SRC/psswap_.c000644 000766 000024 00000076177 10363532303 017041 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void psswap_( int * N, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void psswap_( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; /* * .. Array Arguments .. */ int * DESCX, * DESCY; float * X, * Y; #endif { /* * Purpose * ======= * * PSSWAP swaps two subvectors, * * sub( Y ) := sub( X ) and sub( X ) := sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * swapped. N must be at least zero. * * X (local input/local output) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with sub( Y ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Xscope, Yscope, * one, * top, tran, * zero; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, XisD, XisR, Xinb1D, XinbD, XisRow, Xii, Xj, Xjj, Xld, Xlinc, Xm, XmyprocD, XmyprocR, Xn, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xroc, Xrow, Ycol, Yi, Yii, Yinb1D, YinbD, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, Ym, YmyprocD, YmyprocR, Yn, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yroc, Yrow, cdst, csrc, ctxt, dst, gcdPQ, info, ione=1, k, l, lcmPQ, lcmb, mycol, myrow, npcol, npq, nprow, p, q, rdst, rsrc, src, size; PBTYP_T * type; PB_VM_T VM; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; char * buf = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 501 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PSSWAP", "X", *N, 1, Xi, Xj, Xd, *INCX, 5, &info ); PB_Cchkvec( ctxt, "PSSWAP", "Y", *N, 1, Yi, Yj, Yd, *INCY, 10, &info ); } if( info ) { PB_Cabort( ctxt, "PSSWAP", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_ ]; XnbD = Xd[NB_ ]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_ ]; XnbD = Xd[MB_ ]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_ ]; YnbD = Yd[NB_ ]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_ ]; YnbD = Yd[MB_ ]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is distributed but not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector to * swap sub( X ) with. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cstypeset(); size = type->size; /* * A purely local operation occurs iff the operands start in the same process * and, if either the grid is mono-dimensional or there is a single local block * to be swapped or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { sswap_( &XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } if( RRorCC && XisR && YisR ) return; } } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to swap the distributed vectors in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( RRorCC && ( XprocR != YprocR ) ) { /* * Both operands are distributed in the same direction, but reside in different * process rows or columns. */ if( XmyprocR == XprocR ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { dst = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); dst = MPosMod( dst, YnprocsD ); if( XisRow ) { Csgesd2d( ctxt, 1, XnpD, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, YprocR, dst ); Csgerv2d( ctxt, 1, XnpD, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, YprocR, dst ); } else { Csgesd2d( ctxt, XnpD, 1, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, dst, YprocR ); Csgerv2d( ctxt, XnpD, 1, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, dst, YprocR ); } } } if( YmyprocR == YprocR ) { YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( YisRow ) { Csgesd2d( ctxt, 1, YnpD, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, XprocR, dst ); Csgerv2d( ctxt, 1, YnpD, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, XprocR, dst ); } else { Csgesd2d( ctxt, YnpD, 1, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, dst, XprocR ); Csgerv2d( ctxt, YnpD, 1, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, dst, XprocR ); } } } } else { /* * General case when just one message needs to be exchanged */ if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) send it to the corresponding * process owning s piece of sub ( Y ). */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { dst = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); dst = MPosMod( dst, YnprocsD ); if( YisRow ) { rdst = YprocR; cdst = dst; } else { rdst = dst; cdst = YprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { sswap_( &XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( XisRow ) Csgesd2d( ctxt, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, rdst, cdst ); else Csgesd2d( ctxt, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, rdst, cdst ); } } } if( YmyprocR == YprocR ) { /* * The processes owning a piece of sub( Y ) receive the corresponding piece * of sub( X ) and send the piece of sub( Y ) they own to the same process. */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { src = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); src = MPosMod( src, XnprocsD ); if( XisRow ) { rsrc = XprocR; csrc = src; } else { rsrc = src; csrc = XprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( YnpD * size ); if( XisRow ) Csgerv2d( ctxt, 1, YnpD, buf, 1, rsrc, csrc ); else Csgerv2d( ctxt, YnpD, 1, buf, YnpD, rsrc, csrc ); if( YisRow ) Csgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rsrc, csrc ); else Csgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rsrc, csrc ); scopy_( &YnpD, buf, &ione, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); if( buf ) free( buf ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding piece * of sub( Y ). */ if( XnpD > 0 ) { if( ( myrow != rdst ) || ( mycol != cdst ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Csgerv2d( ctxt, 1, XnpD, buf, 1, rdst, cdst ); else Csgerv2d( ctxt, XnpD, 1, buf, XnpD, rdst, cdst ); scopy_( &XnpD, buf, &ione, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc ); if( buf ) free( buf ); } } } } } else if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) ) { /* * General case */ tran = ( RRorCC ? CNOTRAN : CTRAN ); if( XisRow ) { Xscope = CCOLUMN; Xm = 1; rsrc = XprocR; } else { Xscope = CROW; Xn = 1; csrc = XprocR; } if( YisRow ) { Yscope = CCOLUMN; Ym = 1; rdst = YprocR; } else { Yscope = CROW; Yn = 1; cdst = YprocR; } lcmb = PB_Clcm( XnprocsD * XnbD, YnprocsD * YnbD ); one = type->one; zero = type->zero; gcdPQ = PB_Cgcd( XnprocsD, YnprocsD ); lcmPQ = ( XnprocsD / gcdPQ ) * YnprocsD; for( k = 0; k < gcdPQ; k++ ) { p = 0; q = k; for( l = 0; l < lcmPQ; l++ ) { Xroc = MModAdd( XprocD, p, XnprocsD ); Yroc = MModAdd( YprocD, q, YnprocsD ); if( ( XmyprocD == Xroc ) || ( YmyprocD == Yroc ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, Xroc, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, Yroc, YprocD, YnprocsD ); PB_CVMinit( &VM, 0, XnpD, YnpD, Xinb1D, Yinb1D, XnbD, YnbD, p, q, XnprocsD, YnprocsD, lcmb ); if( npq = PB_CVMnpq( &VM ) ) { if( ( RRorCC && ( Xroc == Yroc ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( Xroc == YprocR ) && ( XprocR == Yroc ) ) ) { /* * If I am at the intersection of the process cross, or simply common to the * process rows or columns owning sub( X ) and sub( Y ) */ if( ( YmyprocD == Yroc ) && ( YmyprocR == YprocR ) ) { PB_CVMswp( type, &VM, ROW, &Xscope, &tran, npq, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Ylinc ); } } else { /* * Perform the message exchange: pack the data I own, send it, receive the * remote data, and unpack it. */ if( ( XmyprocR == XprocR ) && ( XmyprocD == Xroc ) ) { if( XisRow ) { Xn = npq; } else { Xm = npq; } if( YisRow ) { Yn = npq; cdst = Yroc; } else { Ym = npq; rdst = Yroc; } buf = PB_Cmalloc( npq * size ); PB_CVMpack( type, &VM, ROW, &Xscope, PACKING, NOTRAN, npq, 1, one, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, zero, buf, Xm ); Csgesd2d( ctxt, Xm, Xn, buf, Xm, rdst, cdst ); Csgerv2d( ctxt, Ym, Yn, buf, Ym, rdst, cdst ); PB_CVMpack( type, &VM, ROW, &Xscope, UNPACKING, &tran, npq, 1, zero, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, one, buf, Ym ); if( buf ) free ( buf ); } if( ( YmyprocR == YprocR ) && ( YmyprocD == Yroc ) ) { if( XisRow ) { Xn = npq; csrc = Xroc; } else { Xm = npq; rsrc = Xroc; } if( YisRow ) { Yn = npq; } else { Ym = npq; } buf = PB_Cmalloc( npq * size ); PB_CVMpack( type, &VM, COLUMN, &Yscope, PACKING, NOTRAN, npq, 1, one, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, zero, buf, Ym ); Csgesd2d( ctxt, Ym, Yn, buf, Ym, rsrc, csrc ); Csgerv2d( ctxt, Xm, Xn, buf, Xm, rsrc, csrc ); PB_CVMpack( type, &VM, COLUMN, &Yscope, UNPACKING, &tran, npq, 1, zero, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, one, buf, Xm ); if( buf ) free ( buf ); } } } } p = MModAdd1( p, XnprocsD ); q = MModAdd1( q, YnprocsD ); } } } if( XisR ) { /* * Replicate sub( X ) when necessary */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Csgebs2d( ctxt, COLUMN, top, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld ); else Csgebr2d( ctxt, COLUMN, top, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Csgebs2d( ctxt, ROW, top, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld ); else Csgebr2d( ctxt, ROW, top, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, XmyprocD, XprocR ); } } } if( YisR ) { /* * Replicate sub( Y ) when necessary */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocR == YprocR ) Csgebs2d( ctxt, COLUMN, top, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld ); else Csgebr2d( ctxt, COLUMN, top, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, YprocR, YmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocR == YprocR ) Csgebs2d( ctxt, ROW, top, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld ); else Csgebr2d( ctxt, ROW, top, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, YmyprocD, YprocR ); } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ PB_CpswapND( PB_Cstypeset(), *N, ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ PB_CpswapND( PB_Cstypeset(), *N, ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX ); } else { /* * Neither sub( X ) nor sub( Y ) are distributed. */ PB_CpswapNN( PB_Cstypeset(), *N, ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY ); } /* * End of PSSWAP */ } scalapack-2.0.2/PBLAS/SRC/pssymm_.c000644 000766 000024 00000052727 10363532303 017047 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pssymm_( F_CHAR_T SIDE, F_CHAR_T UPLO, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pssymm_( SIDE, UPLO, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T SIDE, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; float * A, * B, * C; #endif { /* * Purpose * ======= * * PSSYMM performs one of the matrix-matrix operations * * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * or * * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha and beta are scalars, sub( A ) is a symmetric submatrix and * sub( B ) and sub( C ) are m by n submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether the symmetric submatrix * sub( A ) appears on the left or right in the operation as * follows: * * SIDE = 'L' or 'l' * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * SIDE = 'R' or 'r' * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with SIDE = 'L' or 'l', this array contains * the local entries corresponding to the entries of the m by m * symmetric submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the symmetric submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * Before entry with SIDE = 'R' or 'r', this array contains * the local entries corresponding to the entries of the n by n * symmetric submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the symmetric submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) REAL array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) REAL array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, SideOp, UploA, cbtop, cbtopsave, cctop, cctopsave, rbtop, rbtopsave, rctop, rctopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, lside, mycol, myrow, nb, npcol, nprow, upper; double ABCest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PSSYMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSSYMM", "Illegal UPLO = %c\n", UploA ); info = -2; } if( lside ) { PB_Cchkmat( ctxt, "PSSYMM", "A", *M, 3, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PSSYMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PSSYMM", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PSSYMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PSSYMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PSSYMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cstypeset(); /* * If alpha is zero, sub( C ) := beta * sub( C ). */ if( ALPHA[REAL_PART] == ZERO ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char * ) C), Ci, Cj, Cd ); } else if( !( BETA[REAL_PART] == ONE ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (N >> M) * BCest : Both operands sub( B ) and sub( C ) are communicated (M >> N) */ if( lside ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ABCest = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp1 = DNROC( *N, Ad[NB_], npcol ); tmp2 = DNROC( *M, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); tmp4 = DNROC( *N, Cd[NB_], npcol ); BCest = (double)(*M) * ( ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.5 * ABCest ) <= BCest ); /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ChooseABC ) { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } } if( lside ) DirAB = ( rbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAB = ( cbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CpsymmAB( type, &DirAB, NOCONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_DEFAULT ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); } } if( lside ) DirAB = ( ( rbtop == CTOP_DRING || rctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); else DirAB = ( ( cbtop == CTOP_DRING || cctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); PB_CpsymmBC( type, &DirAB, NOCONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rbtopsave = *PB_Ctop( &ctxt, BCAST, ROW, &rbtopsave ); rctopsave = *PB_Ctop( &ctxt, COMBINE, ROW, &rctopsave ); cbtopsave = *PB_Ctop( &ctxt, BCAST, COLUMN, &cbtopsave ); cctopsave = *PB_Ctop( &ctxt, COMBINE, COLUMN, &cctopsave ); } /* * End of PSSYMM */ } scalapack-2.0.2/PBLAS/SRC/pssymv_.c000644 000766 000024 00000055174 10363532303 017057 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pssymv_( F_CHAR_T UPLO, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * BETA, float * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pssymv_( UPLO, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PSSYMV performs the matrix-vector operation * * sub( Y ) := alpha*sub( A )*sub( X ) + beta*sub( Y ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha and beta are scalars, sub( X ) and sub( Y ) are n element sub- * vectors and sub( A ) is an n by n symmetric submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries of the upper triangular part of the * symmetric submatrix sub( A ), and the local entries of the * strictly lower triangular of sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries of the lower triangular part of the * symmetric submatrix sub( A ), and the local entries of the * strictly upper triangular of sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten by the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCpbY, YCsum, YCld, YRfr, YRpbY, YRsum, YRld, Ycol, Yi, Yii, YisRow, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; float * tbeta; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XCd[DLEN_], XRd[DLEN_], Xd[DLEN_], YCd[DLEN_], YRd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, __FILE__, "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkmat( ctxt, "PSSYMV", "A", *N, 2, *N, 2, Ai, Aj, Ad, 7, &info ); PB_Cchkvec( ctxt, "PSSYMV", "X", *N, 2, Xi, Xj, Xd, *INCX, 11, &info ); PB_Cchkvec( ctxt, "PSSYMV", "Y", *N, 2, Yi, Yj, Yd, *INCY, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PSSYMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cstypeset(); /* * When alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; if( BETA[REAL_PART] == ZERO ) { sset_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } else { sscal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { if( BETA[REAL_PART] == ZERO ) { sset_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } else { sscal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Reuse sub( Y ) and/or create vectors YR in process rows and YC in process * columns spanned by sub( A ) */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) { PB_CInOutV( type, ROW, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW, ((char**)(&tbeta)), &YR, YRd, &YRfr, &YRsum, &YRpbY ); PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &YC, YCd, &YCfr, &YCsum ); } else { PB_CInOutV( type, COLUMN, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN, ((char**)(&tbeta)), &YC, YCd, &YCfr, &YCsum, &YCpbY ); PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &YR, YRd, &YRfr, &YRsum ); } /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd, ROW, &XC, XCd, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd, COLUMN, &XR, XRd, &XRfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd[LLD_]; XRld = XRd[LLD_]; YCld = YCd[LLD_]; YRld = YRd[LLD_]; /* * Scale YR or YC in the case sub( Y ) has been reused */ if( YisRow ) { /* * YR resides in (a) process row(s) */ if( !YRpbY ) { if( ( myrow == YRd[RSRC_] ) || ( YRd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YR */ if( Anq > 0 ) { if( tbeta[REAL_PART] == ZERO ) { sset_( &Anq, ((char *) tbeta), YR, &YRld ); } else { sscal_( &Anq, ((char *) tbeta), YR, &YRld ); } } } } } else { /* * YC resides in (a) process column(s) */ if( !YCpbY ) { if( ( mycol == YCd[CSRC_] ) || ( YCd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YC */ if( Amp > 0 ) { if( tbeta[REAL_PART] == ZERO ) { sset_( &Amp, ((char *) tbeta), YC, &ione ); } else { sscal_( &Amp, ((char *) tbeta), YC, &ione ); } } } } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { sgemv_( C2F_CHAR( NOTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, YC, &ione ); sgemv_( C2F_CHAR( TRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XC, &ione, one, Mptr( YR, 0, Akq, YRld, size ), &YRld ); } PB_Cpsym( type, type, LEFT, UPPER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, PB_Ctzsymv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsym( type, type, LEFT, LOWER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, PB_Ctzsymv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { sgemv_( C2F_CHAR( NOTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, Mptr( YC, Akp, 0, YCld, size ), &ione ); sgemv_( C2F_CHAR( TRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XC, Akp, 0, XCld, size ), &ione, one, Mptr( YR, 0, Akq, YRld, size ), &YRld ); } } } } if( XCfr ) free( XC ); if( XRfr ) free( XR ); if( YisRow ) { /* * Combine the partial column results into YC */ if( YCsum ) { YCd[CSRC_] = 0; if( Amp > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, 0 ); } } /* * Combine the partial row results into YR */ if( YRsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], YRd[RSRC_], mycol ); } /* * YR := YR + YC */ PB_Cpaxpby( type, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, YR, 0, 0, YRd, ROW ); /* * sub( Y ) := beta * sub( Y ) + YR (if necessary) */ if( YRpbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW ); } } else { /* * Combine the partial row results into YR */ if( YRsum ) { YRd[RSRC_] = 0; if( Anq > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], 0, mycol ); } } /* * Combine the partial column results into YC */ if( YCsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, YCd[CSRC_] ); } /* * YC := YR + YC */ PB_Cpaxpby( type, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, YC, 0, 0, YCd, COLUMN ); /* * sub( Y ) := beta * sub( Y ) + YC (if necessary) */ if( YCpbY ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN ); } } if( YCfr ) free( YC ); if( YRfr ) free( YR ); /* * End of PSSYMV */ } scalapack-2.0.2/PBLAS/SRC/pssyr2_.c000644 000766 000024 00000042406 10363532303 016752 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pssyr2_( F_CHAR_T UPLO, int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * Y, int * IY, int * JY, int * DESCY, int * INCY, float * A, int * IA, int * JA, int * DESCA ) #else void pssyr2_( UPLO, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; float * A, * X, * Y; #endif { /* * Purpose * ======= * * PSSYR2 performs the symmetric rank 2 operation * * sub( A ) := alpha*sub( X )*sub( Y )' + * alpha*sub( Y )*sub( X )' + sub( A ) , * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) and sub( Y ) are n element subvectors and * sub( A ) is an n by n symmetric submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) REAL array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCld, YRfr, YRld, Yi, Yj, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0 [DLEN_], XCd0[DLEN_], XRd0[DLEN_], Xd[DLEN_], YCd0[DLEN_], YRd0[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSSYR2", "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkvec( ctxt, "PSSYR2", "X", *N, 2, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PSSYR2", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PSSYR2", "A", *N, 2, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PSSYR2", info ); return; } #endif /* * Quick return if possible */ if( (*N == 0) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cstypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd0, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd0, ROW, &XC, XCd0, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd0, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd0, COLUMN, &XR, XRd0, &XRfr ); } /* * Replicate sub( Y ) in process rows (YR) and process columns (YC) spanned by * sub( A ) */ if( *INCY == Yd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ROW, &YR, YRd0, &YRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, YR, 0, 0, YRd0, ROW, &YC, YCd0, &YCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, COLUMN, &YC, YCd0, &YCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, YC, 0, 0, YCd0, COLUMN, &YR, YRd0, &YRfr ); } /* * Local rank-2 update if I own some data */ Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd0[LLD_]; YCld = YCd0[LLD_]; XRld = XRd0[LLD_]; YRld = YRd0[LLD_]; /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { sger_( &Akp, &Anq0, ((char *) ALPHA), XC, &ione, Mptr( YR, 0, Akq, YRld, size ), &YRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); sger_( &Akp, &Anq0, ((char *) ALPHA), YC, &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); } PB_Cpsyr2( type, UPPER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, Aptr, k, k, Ad0, PB_Ctzsyr2 ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsyr2( type, LOWER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, Aptr, k, k, Ad0, PB_Ctzsyr2 ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { sger_( &Amp0, &Anq0, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), &ione, Mptr( YR, 0, Akq, YRld, size ), &YRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); sger_( &Amp0, &Anq0, ((char *) ALPHA), Mptr( YC, Akp, 0, YCld, size ), &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } } } if( XRfr ) free( XR ); if( XCfr ) free( XC ); if( YRfr ) free( YR ); if( YCfr ) free( YC ); /* * End of PSSYR2 */ } scalapack-2.0.2/PBLAS/SRC/pssyr2k_.c000644 000766 000024 00000052317 10363532303 017127 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pssyr2k_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pssyr2k_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; float * A, * B, * C; #endif { /* * Purpose * ======= * * PSSYR2K performs one of the symmetric rank 2k operations * * sub( C ) := alpha*sub( A )*sub( B )' + alpha*sub( B )*sub( A )' + * beta*sub( C ), * * or * * sub( C ) := alpha*sub( A )'*sub( B ) + alpha*sub( B )'*sub( A ) + * beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise, and, * * sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N', * B(IB:IB+K-1,JB:JB+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * submatrix and sub( A ) and sub( B ) are n by k submatrices in the * first case and k by n submatrices in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the symmetric submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( B )' + * alpha*sub( B )*sub( A )' + * beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrices sub( A ) and sub( B ), and on * entry with TRANS = 'T' or 't' or 'C' or 'c', K specifies the * number of rows of the submatrices sub( A ) and sub( B ). * K must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and B corresponding to the entries of the submatrices * sub( A ) and sub( B ) respectively need not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) REAL array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JB+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( B ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( B ). * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) REAL array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double ABCest, ABest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSSYR2K", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSSYR2K", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) { PB_Cchkmat( ctxt, "PSSYR2K", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PSSYR2K", "B", *N, 3, *K, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PSSYR2K", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PSSYR2K", "B", *K, 4, *N, 3, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PSSYR2K", "C", *N, 3, *N, 3, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PSSYR2K", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cstypeset(); /* * And when alpha or K is zero */ if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (K >> N) * ABest : only sub( A ) and sub( B ) are communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[NB_], npcol ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[CSRC_] == -1 ) && ( Bd[CSRC_] == -1 ) ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + TWO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *K, Ad[MB_], nprow ); tmp4 = DNROC( *K, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[RSRC_] == -1 ) && ( Bd[RSRC_] == -1 ) ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + TWO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.4 * ABCest ) <= ABest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseABC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PSSYR2K */ } scalapack-2.0.2/PBLAS/SRC/pssyr_.c000644 000766 000024 00000033672 10363532303 016675 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pssyr_( F_CHAR_T UPLO, int * N, float * ALPHA, float * X, int * IX, int * JX, int * DESCX, int * INCX, float * A, int * IA, int * JA, int * DESCA ) #else void pssyr_( UPLO, N, ALPHA, X, IX, JX, DESCX, INCX, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX; float * A, * X; #endif { /* * Purpose * ======= * * PSSYR performs the symmetric rank 1 operation * * sub( A ) := alpha*sub( X )*sub( X )' + sub( A ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Alpha is a scalar, sub( X ) is an n element subvector and sub( A ) is * an n by n symmetric submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X * corresponding to the entries of the subvector sub( X ) need * not be set on input. * * X (local input) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * A (local input/local output) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XCd0[DLEN_], XRd0[DLEN_], Xd[DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSSYR", "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkvec( ctxt, "PSSYR", "X", *N, 2, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkmat( ctxt, "PSSYR", "A", *N, 2, *N, 2, Ai, Aj, Ad, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PSSYR", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cstypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd0, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd0, ROW, &XC, XCd0, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd0, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd0, COLUMN, &XR, XRd0, &XRfr ); } /* * Local rank-1 update if I own some data */ Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); XCld = XCd0[LLD_]; XRld = XRd0[LLD_]; if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) sger_( &Akp, &Anq0, ((char *) ALPHA), XC, &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); PB_Cpsyr( type, UPPER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Aptr, k, k, Ad0, PB_Ctzsyr ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsyr( type, LOWER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Aptr, k, k, Ad0, PB_Ctzsyr ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) sger_( &Amp0, &Anq0, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } } if( XRfr ) free( XR ); if( XCfr ) free( XC ); /* * End of PSSYR */ } scalapack-2.0.2/PBLAS/SRC/pssyrk_.c000644 000766 000024 00000045013 10363532303 017040 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pssyrk_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pssyrk_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * K, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PSSYRK performs one of the symmetric rank k operations * * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * * or * * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and, * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * submatrix and sub( A ) is an n by k submatrix in the first case and a * k by n submatrix in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the symmetric submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ). * * TRANS = 'C' or 'c' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry, with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrix sub( A ), and with TRANS = 'T' or * 't' or 'C' or 'c', K specifies the number of rows of the sub- * matrix sub( A ). K must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) REAL array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, ChooseAC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double Aest, ACest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSSYRK", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSSYRK", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PSSYRK", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PSSYRK", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PSSYRK", "C", *N, 3, *N, 3, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PSSYRK", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cstypeset(); /* * And when alpha or K is zero */ if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ACest: both operands sub( A ) and sub( C ) are communicated (K >> N) * Aest : only sub( A ) is communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Ad[MB_], nprow ); Aest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); Aest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseAC = ( ( 1.3 * ACest ) <= Aest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseAC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PSSYRK */ } scalapack-2.0.2/PBLAS/SRC/pstradd_.c000644 000766 000024 00000033334 10363532303 017151 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pstradd_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pstradd_( UPLO, TRANS, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PSTRADD adds a trapezoidal matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if TRANS = 'N', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'T', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'C', * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * upper or lower trapezoidal submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of the * array C containing the upper or lower triangular part of the * triangular submatrix sub( C ) is to be referenced as follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * triangular submatrix sub( C ) is to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * triangular submatrix sub( C ) is to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the form of op( sub( A ) ) to be * used in the matrix addition as follows: * * TRANS = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANS = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANS = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ) when TRANS = 'N' or 'n' and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u' and TRANS = 'N' or 'n' or * UPLO = 'L' or 'l' and TRANS = 'T', 'C', 't' or 'c', this ar- * ray contains the local entries corresponding to the entries * of the upper triangular submatrix sub( A ), and the local en- * tries corresponding to the entries of the strictly lower tri- * angular part of the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l' and TRANS = 'N' or 'n' or * UPLO = 'U' or 'u' and TRANS = 'T', 'C', 't' or 'c', this ar- * ray contains the local entries corresponding to the entries * of the lower triangular submatrix sub( A ), and the local en- * tries corresponding to the entries of the strictly upper tri- * angular part of the submatrix sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) REAL array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the triangular submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the triangular submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAC, TranOp, UploC, ctop, rtop; int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, notran, npcol, nprow, upper; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRADD", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRADD", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PSTRADD", "A", *M, 3, *N, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PSTRADD", "A", *N, 4, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PSTRADD", "C", *M, 3, *N, 4, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PSTRADD", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( PB_Cstypeset(), &UploC, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cstypeset(), &UploC, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ /* * This operation mainly involves point-to-point send and receive communication. * There is therefore no particular BLACS topology to recommend. Still, one can * choose the main loop direction in which the operands will be added, but not * transposed. This selection is based on the current setting for the BLACS * broadcast operations. */ rtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( *M <= *N ) DirAC = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_Cptradd( PB_Cstypeset(), &DirAC, &UploC, ( notran ? NOTRAN : TRAN ), *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PSTRADD */ } scalapack-2.0.2/PBLAS/SRC/pstran_.c000644 000766 000024 00000022423 10363532303 017014 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pstran_( int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * BETA, float * C, int * IC, int * JC, int * DESCC ) #else void pstran_( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ int * IA, * IC, * JA, * JC, * M, * N; float * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; float * A, * C; #endif { /* * Purpose * ======= * * PSTRAN transposes a matrix * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+M-1), and, op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+N-1,JA:JA+M-1)'. * * Beta is a scalar, sub( C ) is an m by n submatrix, and sub( A ) is an * n by m submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) REAL * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) REAL array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkmat( ctxt, "PSTRAN", "A", *N, 2, *M, 1, Ai, Aj, Ad, 7, &info ); PB_Cchkmat( ctxt, "PSTRAN", "C", *M, 1, *N, 2, Ci, Cj, Cd, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PSTRAN", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( PB_Cstypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cstypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ PB_Cptran( PB_Cstypeset(), NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PSTRAN */ } scalapack-2.0.2/PBLAS/SRC/pstrmm_.c000644 000766 000024 00000052352 10363532303 017033 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pstrmm_( F_CHAR_T SIDE, F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB ) #else void pstrmm_( SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, SIDE, TRANS, UPLO; int * IA, * IB, * JA, * JB, * M, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCB; float * A, * B; #endif { /* * Purpose * ======= * * PSTRMM performs one of the matrix-matrix operations * * sub( B ) := alpha * op( sub( A ) ) * sub( B ), * * or * * sub( B ) := alpha * sub( B ) * op( sub( A ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, sub( B ) is an m by n submatrix, sub( A ) is a * unit, or non-unit, upper or lower triangular submatrix and op( X ) is * one of * * op( X ) = X or op( X ) = X'. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether op( sub( A ) ) multiplies * sub( B ) from the left or right as follows: * * SIDE = 'L' or 'l' sub( B ) := alpha*op( sub( A ) )*sub( B ), * * SIDE = 'R' or 'r' sub( B ) := alpha*sub( B )*op( sub( A ) ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) REAL array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n transformed submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, DirB, OpC, OpR, SideOp, TopC, TopR, TranOp, UploA, Var, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ForceTop, ctxt, info, lside, mycol, myrow, nb, notran, nounit, npcol, nprow, upper; double ABestL, ABestR, Best, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1101 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRMM", "Illegal UPLO = %c\n", UploA ); info = -2; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRMM", "Illegal TRANS = %c\n", TranOp ); info = -3; } if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRMM", "Illegal DIAG = %c\n", DiagA ); info = -4; } if( lside ) PB_Cchkmat( ctxt, "PSTRMM", "A", *M, 5, *M, 5, Ai, Aj, Ad, 11, &info ); else PB_Cchkmat( ctxt, "PSTRMM", "A", *N, 6, *N, 6, Ai, Aj, Ad, 11, &info ); PB_Cchkmat( ctxt, "PSTRMM", "B", *M, 5, *N, 6, Bi, Bj, Bd, 15, &info ); } if( info ) { PB_Cabort( ctxt, "PSTRMM", info ); return; } #endif /* * Quick return if possible */ if( *M == 0 || *N == 0 ) return; /* * Get type structure */ type = PB_Cstypeset(); /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) B), Bi, Bj, Bd ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABestR, ABestL : both operands sub( A ) and sub( B ) are communicated * ( N >> M when SIDE is left and M >> N otherwise ) * Best : only sub( B ) is communicated * ( M >> N when SIDE is left and N >> M otherwise ) */ if( lside ) { if( notran ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); ChooseAB = ( ( 1.1 * ABestR ) <= Best ); } else { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestL = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + MAX( tmp2, tmp1 ) / TWO ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); ChooseAB = ( ( ( 1.1 * ABestL ) <= Best ) || ( ( 1.1 * ABestR ) <= Best ) ); } } else { if( notran ) { tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); ChooseAB = ( ( 1.1 * ABestR ) <= Best ); } else { tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestL = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + CBRATIO * ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + MAX( tmp2, tmp1 ) / TWO ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); ChooseAB = ( ( ( 1.1 * ABestL ) <= Best ) || ( ( 1.1 * ABestR ) <= Best ) ); } } /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); if( ChooseAB ) { if( lside ) { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { if( ABestL <= ABestR ) { OpR = CBCAST; OpC = CCOMBINE; Var = CLEFT; if( upper ) { TopR = CTOP_DRING; TopC = CTOP_IRING; } else { TopR = CTOP_IRING; TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } } } else { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { if( ABestL <= ABestR ) { OpR = CCOMBINE; OpC = CBCAST; Var = CLEFT; if( upper ) { TopR = CTOP_DRING; TopC = CTOP_IRING; } else { TopR = CTOP_IRING; TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } PB_CptrmmAB( type, &Var, &SideOp, &UploA, ( notran ? NOTRAN : TRAN ), &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); } else { if( ( lside && notran ) || ( !( lside ) && !( notran ) ) ) { OpR = CCOMBINE; OpC = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } } else { OpR = CBCAST; OpC = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } } if( lside ) DirB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirB = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CptrmmB( type, &DirB, &SideOp, &UploA, ( notran ? NOTRAN : TRAN ), &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PSTRMM */ } scalapack-2.0.2/PBLAS/SRC/pstrmv_.c000644 000766 000024 00000047473 10363532303 017054 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pstrmv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pstrmv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; /* * .. Array Arguments .. */ int * DESCA, * DESCX; float * A, * X; #endif { /* * Purpose * ======= * * PSTRMV performs one of the matrix-vector operations * * sub( X ) := sub( A )*sub( X ) or sub( X ) := sub( A )'*sub( X ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * sub( X ) is an n element subvector and sub( A ) is an n by n unit, * or non-unit, upper or lower triangular submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( X ) := sub( A ) * sub( X ). * * TRANS = 'T' or 't' sub( X ) := sub( A )' * sub( X ). * * TRANS = 'C' or 'c' sub( X ) := sub( A )' * sub( X ). * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the transfor- * med subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Xroc, * one, * tbeta, top, *zero; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XAfr, XAld, Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, YAfr, YAld, YApbY, YAsum, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_]; char * Aptr = NULL, * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRMV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRMV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRMV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PSTRMV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info ); PB_Cchkvec( ctxt, "PSTRMV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PSTRMV", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cstypeset(); size = type->size; one = type->one; zero = type->zero; /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN ); if( notran ) { /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XA, XAd, &XAfr ); /* * Reuse sub( X ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( type, COLUMN, *N, *N, Ad0, 1, one, ((char *) X), Xi, Xj, Xd, &Xroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * If sub( X ) is distributed in (a) process column(s), then zero it. */ if( Xroc == CCOLUMN ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * sub( X ) resides in (a) process columns(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { sset_( &Xnp, zero, Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], size ), &ione ); } } } } else { /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XA, XAd, &XAfr ); /* * Reuse sub( X ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( type, ROW, *N, *N, Ad0, 1, one, ((char *) X), Xi, Xj, Xd, &Xroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * If sub( X ) is distributed in (a) process row(s), then zero it. */ if( Xroc == CROW ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; sset_( &Xnq, zero, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xld ); } } } } /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XAld = XAd[LLD_]; YAld = YAd[LLD_]; /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { sgemv_( TRANS, &Akp, &Anq0, one, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, YA, &ione ); } PB_Cptrm( type, type, LEFT, UPPER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, size ), YAld, PB_Ctztrmv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { sgemv_( TRANS, &Akp, &Anq0, one, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XA, &ione, one, Mptr( YA, 0, Akq, YAld, size ), &YAld ); } PB_Cptrm( type, type, LEFT, UPPER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, size ), YAld, PB_Ctztrmv ); } } } else { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, type, LEFT, LOWER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, size ), YAld, PB_Ctztrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { sgemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, Mptr( YA, Akp, 0, YAld, size ), &ione ); } } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, type, LEFT, LOWER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, size ), YAld, PB_Ctztrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { sgemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, Akp, 0, XAld, size ), &ione, one, Mptr( YA, 0, Akq, YAld, size ), &YAld ); } } } } } if( XAfr ) free( XA ); if( notran ) { /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Csgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } /* * sub( X ) := YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, YA, 0, 0, YAd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } } else { /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Csgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } /* * sub( X ) := YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } } if( YAfr ) free( YA ); /* * End of PSTRMV */ } scalapack-2.0.2/PBLAS/SRC/pstrsm_.c000644 000766 000024 00000052317 11622500733 017043 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pstrsm_( F_CHAR_T SIDE, F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * M, int * N, float * ALPHA, float * A, int * IA, int * JA, int * DESCA, float * B, int * IB, int * JB, int * DESCB ) #else void pstrsm_( SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, SIDE, TRANS, UPLO; int * IA, * IB, * JA, * JB, * M, * N; float * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCB; float * A, * B; #endif { /* * Purpose * ======= * * PSTRSM solves one of the matrix equations * * op( sub( A ) )*X = alpha*sub( B ), or * * X*op( sub( A ) ) = alpha*sub( B ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is * a unit, or non-unit, upper or lower triangular submatrix and op( Y ) * is one of * * op( Y ) = Y or op( Y ) = Y'. * * The submatrix X is overwritten on sub( B ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether op( sub( A ) ) appears on * the left or right of X as follows: * * SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ), * * SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = sub( A )'. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) REAL array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n solution submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, DirB, OpC, OpR, SideOp, TopC, TopR, TranOp, UploA, Var, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ForceTop, ctxt, info, itmp, lside, mycol, myrow, nb, notran, nounit, npcol, nprow, upper; double ABestL, ABestR, Best, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1101 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRSM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRSM", "Illegal UPLO = %c\n", UploA ); info = -2; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRSM", "Illegal TRANS = %c\n", TranOp ); info = -3; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRSM", "Illegal DIAG = %c\n", DiagA ); info = -4; } if( lside ) PB_Cchkmat( ctxt, "PSTRSM", "A", *M, 5, *M, 5, Ai, Aj, Ad, 11, &info ); else PB_Cchkmat( ctxt, "PSTRSM", "A", *N, 6, *N, 6, Ai, Aj, Ad, 11, &info ); PB_Cchkmat( ctxt, "PSTRSM", "B", *M, 5, *N, 6, Bi, Bj, Bd, 15, &info ); } if( info ) { PB_Cabort( ctxt, "PSTRSM", info ); return; } #endif /* * Quick return if possible */ if( *M == 0 || *N == 0 ) return; /* * Get type structure */ type = PB_Cstypeset(); /* * And when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) B), Bi, Bj, Bd ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); /* * ABestR, ABestL : both operands sub( A ) and sub( B ) are communicated * ( N >> M when SIDE is left and M >> N otherwise ) * Best : only sub( B ) is communicated * ( M >> N when SIDE is left and N >> M otherwise ) */ if( lside ) { if( notran ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); itmp = MIN( Ad[MB_], Ad[NB_] ); Best = (double)(*N) * ( (double)(CEIL( *M, itmp )) * (double)(itmp) * ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ); ChooseAB = ( ABestR <= ( 2.0 * Best ) ); } else { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestL = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + MAX( tmp2, tmp1 ) / TWO ); itmp = MIN( Ad[MB_], Ad[NB_] ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( (double)(CEIL( *M, itmp )) * (double)(itmp) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ) + MAX( tmp2, tmp3 ) ); ChooseAB = ( ( ABestL <= ( 2.0 * Best ) ) || ( ABestR <= ( 2.0 * Best ) ) ); } } else { if( notran ) { tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); itmp = MIN( Ad[MB_], Ad[NB_] ); Best = (double)(*M) * ( (double)(CEIL( *N, itmp )) * (double)(itmp) * ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ); ChooseAB = ( ABestR <= ( 2.0 * Best ) ); } else { tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestL = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + CBRATIO * ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + MAX( tmp2, tmp1 ) / TWO ); itmp = MIN( Ad[MB_], Ad[NB_] ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( (double)(CEIL( *N, itmp )) * (double)(itmp) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ) + MAX( tmp1, tmp4 ) ); ChooseAB = ( ( ABestL <= ( 2.0 * Best ) ) || ( ABestR <= ( 2.0 * Best ) ) ); } } /* * Var can remain uninitialized but is nevertheless used in PB_CptrsmAB.c * provide a default here. TODO: does this make sense ? *==19891== at 0x44F81B: PB_CptrsmAB (PB_CptrsmAB.c:538) *==19891== by 0x427BE7: pdtrsm_ (pdtrsm_.c:488) *==19891== by 0x405E46: MAIN_ (pdblas3tim.f:727) */ Var = CRIGHT; if( ChooseAB ) { /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ ForceTop = ( ( *M > nb ) && ( *N > nb ) ); if( ForceTop ) { if( lside ) { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { if( ABestL <= ABestR ) { OpR = CBCAST; OpC = CCOMBINE; Var = CLEFT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } } } else { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { if( ABestL <= ABestR ) { OpR = CCOMBINE; OpC = CBCAST; Var = CLEFT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } PB_CptrsmAB( type, &Var, &SideOp, &UploA, ( notran ? NOTRAN : TRAN ), &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } } else { /* * BLACS topologies are always enforced. */ if( ( lside && notran ) || ( !lside && !notran ) ) { OpR = CCOMBINE; OpC = CBCAST; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } /* * Remove the next line when the BLACS combine operations support ring * topologies */ TopR = CTOP_DEFAULT; } else { OpR = CBCAST; OpC = CCOMBINE; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } /* * Remove the next line when the BLACS combine operations support ring * topologies */ TopC = CTOP_DEFAULT; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); if( lside ) DirB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirB = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CptrsmB( type, &DirB, &SideOp, &UploA, ( notran ? NOTRAN : TRAN ), &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); /* * Restore the BLACS topologies. */ rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PSTRSM */ } scalapack-2.0.2/PBLAS/SRC/pstrsv_.c000644 000766 000024 00000076044 10363532303 017056 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pstrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, float * A, int * IA, int * JA, int * DESCA, float * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pstrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; /* * .. Array Arguments .. */ int * DESCA, * DESCX; float * A, * X; #endif { /* * Purpose * ======= * * PSTRSV solves one of the systems of equations * * sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b, * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * b and sub( X ) are n element subvectors and sub( A ) is an n by n * unit, or non-unit, upper or lower triangular submatrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( A ) * sub( X ) = b. * * TRANS = 'T' or 't' sub( A )' * sub( X ) = b. * * TRANS = 'C' or 'c' sub( A )' * sub( X ) = b. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * A (local input) REAL array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) REAL array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On entry, sub( X ) is the n element right-hand side * b. On exit, sub( X ) is overwritten with the solution subvec- * tor. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one, * zero; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb, Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld, XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info, ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_]; char * Aptr = NULL, * XAC = NULL, * XAR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRSV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRSV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PSTRSV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PSTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info ); PB_Cchkvec( ctxt, "PSTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PSTRSV", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cstypeset(); size = type->size; one = type->one; zero = type->zero; negone = type->negone; /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN ); if( notran ) { if( upper ) { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING ); (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process column owning the last * column of sub( A ) */ PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, &XACfr, &XACsum, &XACapbX ); /* * Create vector XAR in process rows spanned by sub( A ) */ PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, &XARsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAC contains the solution scattered in multiple * process columns and XAR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ if( Akp > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( XACsum ) { kbprev = MIN( k, nb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow, Arow, nprow ); Akp -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) sgemv_( TRANS, &ktmp, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol ); Csgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 0, XACld, size ), XACld, myrow, Asrc ); if( mycol != Asrc ) sset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } if( Akp > 0 && Anq0 > 0 ) sgemv_( TRANS, &Akp, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, XAC, &ione ); } else { if( Anq0 > 0 ) sgemv_( TRANS, &Akp, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, XAC, &ione ); } } } /* * Combine the scattered resulting vector XAC */ if( XACsum && ( Anp > 0 ) ) { Csgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, XACd[CSRC_] ); } /* * sub( X ) := XAC (if necessary) */ if( XACapbX ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); } else { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process column owning the first * column of sub( A ) */ PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, &XACfr, &XACsum, &XACapbX ); /* * Create vector XAR in process rows spanned by sub( A ) */ PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, &XARsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = 0; k < *N; k += nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAC contains the solution scattered in multiple * process columns and XAR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow ); if( ( Anp0 = Anp - Akp ) > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( XACsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow, nprow ); Anp0 -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) sgemv_( TRANS, &ktmp, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol ); Csgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 0, XACld, size ), XACld, myrow, Asrc ); if( mycol != Asrc ) sset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } if( Anp0 > 0 && Anq0 > 0 ) sgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione ); } else { if( Anq0 > 0 ) sgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } } } /* * Combine the scattered resulting vector XAC */ if( XACsum && ( Anp > 0 ) ) { Csgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, XACd[CSRC_] ); } /* * sub( X ) := XAC (if necessary) */ if( XACapbX ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); } } else { if( upper ) { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAR in process row owning the first row * of sub( A ) */ PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, &XARfr, &XARsum, &XARapbX ); /* * Create vector XAC in process columns spanned by sub( A ) */ PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, &XACsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = 0; k < *N; k += nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAR contains the solution scattered in multiple * process rows and XAC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anq0 = Anq - Akq ) > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( XARsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol, npcol ); Anq0 -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) sgemv_( TRANS, &Anp0, &ktmp, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow ); Csgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, Akq, XARld, size ), XARld, Asrc, mycol ); if( myrow != Asrc ) sset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } if( Anp0 > 0 && Anq0 > 0 ) sgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld ); } else { if( Anp0 > 0 ) sgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } } } /* * Combine the scattered resulting vector XAR */ if( XARsum && ( Anq > 0 ) ) { Csgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], mycol ); } /* * sub( X ) := XAR (if necessary) */ if( XARapbX ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); } else { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process row owning the last row * of sub( A ) */ PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, &XARfr, &XARsum, &XARapbX ); /* * Create vector XAC in process columns spanned by sub( A ) */ PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, &XACsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAR contains the solution scattered in multiple * process rows and XAC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later * is only locally updated. */ if( Akq > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( XARsum ) { kbprev = MIN( k, nb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol, Acol, npcol ); Akq -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) sgemv_( TRANS, &Anp0, &ktmp, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow ); Csgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, Akq, XARld, size ), XARld, Asrc, mycol ); if( myrow != Asrc ) sset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } if( Anp0 > 0 && Akq > 0 ) sgemv_( TRANS, &Anp0, &Akq, negone, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, XAR, &XARld ); } else { if( Anp0 > 0 ) sgemv_( TRANS, &Anp0, &Akq, negone, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, XAR, &XARld ); } } } /* * Combine the scattered resulting vector XAR */ if( XARsum && ( Anq > 0 ) ) { Csgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], mycol ); } /* * sub( X ) := XAR (if necessary) */ if( XARapbX ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); } } if( XACfr ) free( XAC ); if( XARfr ) free( XAR ); /* * End of PSTRSV */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/000755 000766 000024 00000000000 11750301611 016214 5ustar00juliestaff000000 000000 scalapack-2.0.2/PBLAS/SRC/PTZBLAS/000755 000766 000024 00000000000 11750301606 016317 5ustar00juliestaff000000 000000 scalapack-2.0.2/PBLAS/SRC/pzagemv_.c000644 000766 000024 00000045407 10363532303 017165 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzagemv_( F_CHAR_T TRANS, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzagemv_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PZAGEMV performs one of the matrix-vector operations * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( sub( A )' )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( conjg( sub( A )' ) )*abs( sub( X ) ) * + abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1). * * When TRANS = 'N', * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+M-1), if INCY = M_Y, * Y(IY:IY+M-1,JY:JY), if INCY = 1 and INCY <> M_Y, * and, otherwise * * sub( X ) denotes X(IX:IX,JX:JX+M-1), if INCX = M_X, * X(IX:IX+M-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a real subvector, * sub( X ) is a subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := |alpha|*|sub( A ) | * |sub( X )| + * |beta*sub( Y )|, * * TRANS = 'T' or 't', * sub( Y ) := |alpha|*|sub( A )'| * |sub( X )| + * |beta*sub( Y )|, * * TRANS = 'C' or 'c', * sub( Y ) := |alpha|*|conjg( sub( A )' )|*|sub( X )| + * |beta*sub( Y )|. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char TrA, Yroc, * one, * tbeta, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, mycol, myrow, nota, npcol, nprow; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZAGEMV", "Illegal TRANS=%c\n", TrA ); info = -1; } PB_Cchkmat( ctxt, "PZAGEMV", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); if( nota ) { PB_Cchkvec( ctxt, "PZAGEMV", "X", *N, 3, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PZAGEMV", "Y", *M, 2, Yi, Yj, Yd, *INCY, 18, &info ); } else { PB_Cchkvec( ctxt, "PZAGEMV", "X", *M, 2, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PZAGEMV", "Y", *N, 3, Yi, Yj, Yd, *INCY, 18, &info ); } } if( info ) { PB_Cabort( ctxt, "PZAGEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); utyp = PB_Cdtypeset(); /* * When alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, utyp->size ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], utyp->size ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( nota ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( utyp, COLUMN, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { zagemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *) A), Aii, Ajj, Ald, type->size), &Ald, XA, &XAd[LLD_], tbeta, YA, &ione ); } if( XAfr ) free( XA ); /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( utyp, ROW, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { zagemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *) A), Aii, Ajj, Ald, type->size ), &Ald, XA, &ione, tbeta, YA, &YAd[LLD_] ); } if( XAfr ) free( XA ); /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, utyp->size ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], utyp->size ), INCY ); } } } one = utyp->one; if( nota ) { PB_Cpaxpby( utyp, NOCONJG, *M, 1, one, YA, 0, 0, YAd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } else { PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PZAGEMV */ } scalapack-2.0.2/PBLAS/SRC/pzahemv_.c000644 000766 000024 00000056314 10363532303 017165 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzahemv_( F_CHAR_T UPLO, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzahemv_( UPLO, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PZAHEMV performs the matrix-vector operation * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a n element real subvec- * tor, sub( X ) is an n element subvector and sub( A ) is an n by n * Hermitian submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries of the upper triangular part of the * Hermitian submatrix sub( A ), and the local entries of the * strictly lower triangular of sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries of the lower triangular part of the * Hermitian submatrix sub( A ), and the local entries of the * strictly upper triangular of sub( A ) are not referenced. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set and assumed to be zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten by the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCpbY, YCsum, YCld, YRfr, YRpbY, YRsum, YRld, Ycol, Yi, Yii, YisRow, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper, usiz; double * tbeta; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XCd[DLEN_], XRd[DLEN_], Xd[DLEN_], YCd[DLEN_], YRd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, __FILE__, "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkmat( ctxt, "PZAHEMV", "A", *N, 2, *N, 2, Ai, Aj, Ad, 7, &info ); PB_Cchkvec( ctxt, "PZAHEMV", "X", *N, 2, Xi, Xj, Xd, *INCX, 11, &info ); PB_Cchkvec( ctxt, "PZAHEMV", "Y", *N, 2, Yi, Yj, Yd, *INCY, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PZAHEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); size = type->size; utyp = PB_Cdtypeset(); usiz = type->usiz; /* * and when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Reuse sub( Y ) and/or create vectors YR in process rows and YC in process * columns spanned by sub( A ) */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) { PB_CInOutV( utyp, ROW, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW, ((char**)(&tbeta)), &YR, YRd, &YRfr, &YRsum, &YRpbY ); PB_COutV( utyp, COLUMN, INIT, *N, *N, Ad0, 1, &YC, YCd, &YCfr, &YCsum ); } else { PB_CInOutV( utyp, COLUMN, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN, ((char**)(&tbeta)), &YC, YCd, &YCfr, &YCsum, &YCpbY ); PB_COutV( utyp, ROW, INIT, *N, *N, Ad0, 1, &YR, YRd, &YRfr, &YRsum ); } /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd, ROW, &XC, XCd, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd, COLUMN, &XR, XRd, &XRfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd[LLD_]; XRld = XRd[LLD_]; YCld = YCd[LLD_]; YRld = YRd[LLD_]; /* * Scale YR or YC in the case sub( Y ) has been reused */ if( YisRow ) { /* * YR resides in (a) process row(s) */ if( !YRpbY ) { if( ( myrow == YRd[RSRC_] ) || ( YRd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YR */ if( Anq > 0 ) dascal_( &Anq, ((char *) tbeta), YR, &YRld ); } } } else { /* * YC resides in (a) process column(s) */ if( !YCpbY ) { if( ( mycol == YCd[CSRC_] ) || ( YCd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YC */ if( Amp > 0 ) dascal_( &Amp, ((char *) tbeta), YC, &ione ); } } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &utyp->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { zagemv_( C2F_CHAR( NOTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, YC, &ione ); zagemv_( C2F_CHAR( COTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XC, &ione, one, Mptr( YR, 0, Akq, YRld, usiz ), &YRld ); } PB_Cpsym( type, utyp, LEFT, UPPER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, usiz ), YCld, Mptr( YR, 0, Akq, YRld, usiz ), YRld, PB_Ctzahemv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsym( type, utyp, LEFT, LOWER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, usiz ), YCld, Mptr( YR, 0, Akq, YRld, usiz ), YRld, PB_Ctzahemv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { zagemv_( C2F_CHAR( NOTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, Mptr( YC, Akp, 0, YCld, usiz ), &ione ); zagemv_( C2F_CHAR( COTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XC, Akp, 0, XCld, size ), &ione, one, Mptr( YR, 0, Akq, YRld, usiz ), &YRld ); } } } } if( XCfr ) free( XC ); if( XRfr ) free( XR ); if( YisRow ) { /* * Combine the partial column results into YC */ if( YCsum ) { YCd[CSRC_] = 0; if( Amp > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, 0 ); } } /* * Combine the partial row results into YR */ if( YRsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], YRd[RSRC_], mycol ); } /* * YR := YR + YC */ PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, YR, 0, 0, YRd, ROW ); /* * sub( Y ) := beta * sub( Y ) + YR (if necessary) */ if( YRpbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || Yrow < 0 ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, ((char *) Y), Yi, Yj, Yd, ROW ); } } else { /* * Combine the partial row results into YR */ if( YRsum ) { YRd[RSRC_] = 0; if( Anq > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], 0, mycol ); } } /* * Combine the partial column results into YC */ if( YCsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, YCd[CSRC_] ); } /* * YC := YR + YC */ PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, YC, 0, 0, YCd, COLUMN ); /* * sub( Y ) := beta * sub( Y ) + YC (if necessary) */ if( YCpbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || Ycol < 0 ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, COLUMN ); } } if( YCfr ) free( YC ); if( YRfr ) free( YR ); /* * End of PZAHEMV */ } scalapack-2.0.2/PBLAS/SRC/pzamax_.c000644 000766 000024 00000053214 10363532303 017007 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzamax_( int * N, double * AMAX, int * INDX, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pzamax_( N, AMAX, INDX, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * INDX, * IX, * JX, * N; double * AMAX; /* * .. Array Arguments .. */ int * DESCX; double * X; #endif { /* * Purpose * ======= * * PZAMAX computes the global index of the maximum element in absolute * value of a subvector sub( X ). The global index is returned in INDX * and the value of that element is returned in AMAX, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * AMAX (global output) COMPLEX*16 array * On exit, AMAX specifies the largest entry in absolute value * of the subvector sub( X ) only in its scope (See below for * further details). * * INDX (global output) INTEGER * On exit, INDX specifies the global index of the maximum ele- * ment in absolute value of the subvector sub( X ) only in its * scope (See below for further details). * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char cbtop, cctop, rbtop, rctop; int Xcol, Xgindx, Xi, Xii, Ximb, Xinb, Xj, Xjj, Xlindx, Xld, Xmb, Xnb, Xnp, Xnq, Xrow, Xsrc, ctxt, dist, dst, idumm, info, k, maxpos, mycol, mydist, myrow, npcol, nprow, src, size; PBTYP_T * type; /* * .. Local Arrays .. */ char * Xptr; int Xd[DLEN_]; cmplx16 work[4]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PZAMAX", "X", *N, 1, Xi, Xj, Xd, *INCX, 7, &info ); if( info ) { PB_Cabort( ctxt, "PZAMAX", info ); return; } #endif /* * Initialize INDX and AMAX */ *INDX = 0; AMAX[REAL_PART] = ZERO; AMAX[IMAG_PART] = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Handle degenerate case separately, sub( X )'s scope is just one process */ if( ( *INCX == 1 ) && ( Xd[M_] == 1 ) && ( *N == 1 ) ) { /* * Make sure I own some data and compute INDX and AMAX */ if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) && ( ( mycol == Xcol ) || ( Xcol < 0 ) ) ) { *INDX = *JX; type = PB_Cztypeset(); Xptr = Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ); AMAX[REAL_PART] = ((double*)(Xptr))[REAL_PART]; AMAX[IMAG_PART] = ((double*)(Xptr))[IMAG_PART]; } return; } else if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); if( ( rctop == CTOP_DEFAULT ) || ( rctop == CTOP_TREE1 ) ) { /* * Inline the 1-tree combine for communication savings */ Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_]; Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cztypeset(); size = type->size; Xlindx = Xjj - 1 + izamax_( &Xnq, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xld ); Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol ); Xptr = Mptr( ((char *) X), Xii, Xlindx, Xld, size ); work[0][REAL_PART] = ((double*)(Xptr))[REAL_PART]; work[0][IMAG_PART] = ((double*)(Xptr))[IMAG_PART]; work[1][REAL_PART] = ((double)( Xgindx+1 )); work[1][IMAG_PART] = ZERO; } else { work[0][REAL_PART] = ZERO; work[0][IMAG_PART] = ZERO; work[1][REAL_PART] = ZERO; work[1][IMAG_PART] = ZERO; } /* * Combine the local results using a 1-tree topology within process column 0 * if npcol > 1 or Xcol >= 0, i.e sub( X ) is distributed. */ if( ( npcol >= 2 ) && ( Xcol >= 0 ) ) { mydist = mycol; k = 1; l_10: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, npcol ); Czgesd2d( ctxt, 2, 1, ((char*)work), 2, myrow, dst ); goto l_20; } else { dist = mycol + k; src = MPosMod( dist, npcol ); if( mycol < src ) { Czgerv2d( ctxt, 2, 1, ((char*) work[2]), 2, myrow, src ); if( ( ABS( work[0][REAL_PART] ) + ABS( work[0][IMAG_PART] ) ) < ( ABS( work[2][REAL_PART] ) + ABS( work[2][IMAG_PART] ) ) ) { work[0][REAL_PART] = work[2][REAL_PART]; work[0][IMAG_PART] = work[2][IMAG_PART]; work[1][REAL_PART] = work[3][REAL_PART]; } } mydist >>= 1; } k <<= 1; if( k < npcol ) goto l_10; l_20: /* * Process column 0 broadcasts the combined values of INDX and AMAX within * their process row. */ rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == 0 ) { Czgebs2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2 ); } else { Czgebr2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2, myrow, 0 ); } } /* * Set INDX and AMAX to the replicated answers contained in work. If AMAX is * zero, then select a coherent INDX. */ AMAX[REAL_PART] = work[0][REAL_PART]; AMAX[IMAG_PART] = work[0][IMAG_PART]; *INDX = ( ( ( AMAX[REAL_PART] == ZERO ) && ( AMAX[IMAG_PART] == ZERO ) ) ? ( *JX ) : ( (int)(work[1][REAL_PART]) ) ); } else { /* * Otherwise use the current topology settings to combine the results */ Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_]; Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnq > 0 ) { /* * Compute the local maximum and its corresponding local index */ Xld = Xd[LLD_]; type = PB_Cztypeset(); size = type->size; Xlindx = Xjj - 1 + izamax_( &Xnq, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xld ); Xptr = Mptr( ((char *) X), Xii, Xlindx, Xld, size ); AMAX[REAL_PART] = ((double*)(Xptr))[REAL_PART]; AMAX[IMAG_PART] = ((double*)(Xptr))[IMAG_PART]; } else { AMAX[REAL_PART] = ZERO; AMAX[IMAG_PART] = ZERO; } if( Xcol >= 0 ) { /* * Combine leave on all the local maximum if Xcol >= 0, i.e sub( X ) is * distributed */ Czgamx2d( ctxt, ROW, &rctop, 1, 1, ((char*)AMAX), 1, &idumm, &maxpos, 1, -1, mycol ); /* * Broadcast the corresponding global index */ if( ( AMAX[REAL_PART] != ZERO ) || ( AMAX[IMAG_PART] != ZERO ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == maxpos ) { Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol ); *INDX = Xgindx + 1; Cigebs2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1 ); } else { Cigebr2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1, myrow, maxpos ); } } else { /* * If AMAX is zero, then select a coherent INDX. */ *INDX = *JX; } } else { /* * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX. */ *INDX = ( ( ( AMAX[REAL_PART] == ZERO ) && ( AMAX[IMAG_PART] == ZERO ) ) ? ( *JX ) : Xlindx + 1 ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ( cctop == CTOP_DEFAULT ) || ( cctop == CTOP_TREE1 ) ) { /* * Inline the 1-tree combine for communication savings */ Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_]; Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnp > 0 ) { Xld = Xd[LLD_]; type = PB_Cztypeset(); size = type->size; Xlindx = Xii - 1 + izamax_( &Xnp, Mptr( ((char *)X), Xii, Xjj, Xld, size ), INCX ); Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow ); Xptr = Mptr( ((char *) X), Xlindx, Xjj, Xld, size ); work[0][REAL_PART] = ((double*)(Xptr))[REAL_PART]; work[0][IMAG_PART] = ((double*)(Xptr))[IMAG_PART]; work[1][REAL_PART] = ((double)( Xgindx+1 )); work[1][IMAG_PART] = ZERO; } else { work[0][REAL_PART] = ZERO; work[0][IMAG_PART] = ZERO; work[1][REAL_PART] = ZERO; work[1][IMAG_PART] = ZERO; } /* * Combine the local results using a 1-tree topology within process row 0 * if nprow > 1 or Xrow >= 0, i.e sub( X ) is distributed. */ if( ( nprow >= 2 ) && ( Xrow >= 0 ) ) { mydist = myrow; k = 1; l_30: if( mydist & 1 ) { dist = k * ( mydist - 1 ); dst = MPosMod( dist, nprow ); Czgesd2d( ctxt, 2, 1, ((char*)work), 2, dst, mycol ); goto l_40; } else { dist = myrow + k; src = MPosMod( dist, nprow ); if( myrow < src ) { Czgerv2d( ctxt, 2, 1, ((char*) work[2]), 2, src, mycol ); if( ( ABS( work[0][REAL_PART] ) + ABS( work[0][IMAG_PART] ) ) < ( ABS( work[2][REAL_PART] ) + ABS( work[2][IMAG_PART] ) ) ) { work[0][REAL_PART] = work[2][REAL_PART]; work[0][IMAG_PART] = work[2][IMAG_PART]; work[1][REAL_PART] = work[3][REAL_PART]; } } mydist >>= 1; } k <<= 1; if( k < nprow ) goto l_30; l_40: /* * Process row 0 broadcasts the combined values of INDX and AMAX within their * process column. */ cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == 0 ) { Czgebs2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2 ); } else { Czgebr2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2, 0, mycol ); } } /* * Set INDX and AMAX to the replicated answers contained in work. If AMAX is * zero, then select a coherent INDX. */ AMAX[REAL_PART] = work[0][REAL_PART]; AMAX[IMAG_PART] = work[0][IMAG_PART]; *INDX = ( ( ( AMAX[REAL_PART] == ZERO ) && ( AMAX[IMAG_PART] == ZERO ) ) ? ( *IX ) : ( (int)(work[1][REAL_PART]) ) ); } else { /* * Otherwise use the current topology settings to combine the results */ Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_]; Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow ); /* * Make sure I own some data and compute local INDX and AMAX */ if( Xnp > 0 ) { /* * Compute the local maximum and its corresponding local index */ Xld = Xd[LLD_]; type = PB_Cztypeset(); size = type->size; Xlindx = Xii - 1 + izamax_( &Xnp, Mptr( ((char *) X), Xii, Xjj, Xld, size ), INCX ); Xptr = Mptr( ((char *) X), Xlindx, Xjj, Xld, size ); AMAX[REAL_PART] = ((double*)(Xptr))[REAL_PART]; AMAX[IMAG_PART] = ((double*)(Xptr))[IMAG_PART]; } else { AMAX[REAL_PART] = ZERO; AMAX[IMAG_PART] = ZERO; } if( Xrow >= 0 ) { /* * Combine leave on all the local maximum if Xrow >= 0, i.e sub( X ) is * distributed. */ Czgamx2d( ctxt, COLUMN, &cctop, 1, 1, ((char*)AMAX), 1, &maxpos, &idumm, 1, -1, mycol ); /* * Broadcast the corresponding global index */ if( ( AMAX[REAL_PART] != ZERO ) || ( AMAX[IMAG_PART] != ZERO ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == maxpos ) { Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow ); *INDX = Xgindx + 1; Cigebs2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1 ); } else { Cigebr2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1, maxpos, mycol ); } } else { /* * If AMAX is zero, then select a coherent INDX. */ *INDX = *IX; } } else { /* * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX. */ *INDX = ( ( ( AMAX[REAL_PART] == ZERO ) && ( AMAX[IMAG_PART] == ZERO ) ) ? ( *IX ) : Xlindx + 1 ); } } } return; } /* * End of PZAMAX */ } scalapack-2.0.2/PBLAS/SRC/pzatrmv_.c000644 000766 000024 00000061760 10363532303 017217 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzatrmv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzatrmv_( UPLO, TRANS, DIAG, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PZATRMV performs one of the matrix-vector operations * * sub( Y ) := abs( alpha )*abs( sub( A ) )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( sub( A )' )*abs( sub( X ) ) + * abs( beta*sub( Y ) ), * or * * sub( Y ) := abs( alpha )*abs( conjg( sub( A )' ) )*abs( sub( X ) ) * + abs( beta*sub( Y ) ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are real scalars, sub( Y ) is a real subvector, * sub( X ) is a subvector and sub( A ) is an n by n triangular subma- * trix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := |alpha|*|sub( A )|*|sub( X )| + * |beta*sub( Y )|. * * TRANS = 'T' or 't' * sub( Y ) := |alpha|*|sub( A )'|*|sub( X )| + * |beta*sub( Y )|. * * TRANS = 'C' or 'c' * sub( Y ) := |alpha|*|conjg( sub( A )' )|*|sub( X )| + * |beta*sub( Y )|. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) DOUBLE PRECISION array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Yroc, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XAfr, XAld, Xi, Xj, YAfr, YAld, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper, usiz; double * tbeta; PBTYP_T * type, * utyp; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZATRMV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZATRMV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PZATRMV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PZATRMV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkvec( ctxt, "PZATRMV", "X", *N, 4, Xi, Xj, Xd, *INCX, 13, &info ); PB_Cchkvec( ctxt, "PZATRMV", "Y", *N, 4, Yi, Yj, Yd, *INCY, 19, &info ); } if( info ) { PB_Cabort( ctxt, "PZATRMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( BETA [REAL_PART] == ONE ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); size = type->size; utyp = PB_Cdtypeset(); usiz = type->usiz; /* * and when alpha is zero */ if( ALPHA[REAL_PART] == ZERO ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( notran ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( utyp, COLUMN, *N, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, ((char**)(&tbeta)), &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( utyp, ROW, *N, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, ((char**)(&tbeta)), &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XAld = XAd[LLD_]; YAld = YAd[LLD_]; /* * Scale YA in the case sub( Y ) has been reused */ if( notran && !( YApbY ) ) { /* * YA resides in (a) process column(s) */ if( ( mycol == YAd[CSRC_] ) || ( YAd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YA */ if( Amp > 0 ) dascal_( &Amp, ((char *) tbeta), YA, &ione ); } } else if( !( notran ) && !( YApbY ) ) { /* * YA resides in (a) process row(s) */ if( ( myrow == YAd[RSRC_] ) || ( YAd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YA */ if( Anq > 0 ) dascal_( &Anq, ((char *) tbeta), YA, &YAld ); } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &utyp->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { zagemv_( TRANS, &Akp, &Anq0, ((char *) ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, YA, &ione ); } PB_Cptrm( type, utyp, LEFT, UPPER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, usiz ), YAld, PB_Ctzatrmv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { zagemv_( TRANS, &Akp, &Anq0, ((char *) ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XA, &ione, one, Mptr( YA, 0, Akq, YAld, usiz ), &YAld ); } PB_Cptrm( type, utyp, LEFT, UPPER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, usiz ), YAld, PB_Ctzatrmv ); } } } else { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, utyp, LEFT, LOWER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, usiz ), YAld, PB_Ctzatrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { zagemv_( TRANS, &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, Mptr( YA, Akp, 0, YAld, usiz ), &ione ); } } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, utyp, LEFT, LOWER, &TranOp, &DiagA, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, usiz ), YAld, PB_Ctzatrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { zagemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, Akp, 0, XAld, size ), &ione, one, Mptr( YA, 0, Akq, YAld, usiz ), &YAld ); } } } } } if( XAfr ) free( XA ); if( notran ) { /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cdgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } } else { /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cdgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; dascal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, usiz ), &Yld ); } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { dascal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], usiz ), INCY ); } } } if( notran ) { PB_Cpaxpby( utyp, NOCONJG, *N, 1, one, YA, 0, 0, YAd, COLUMN, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } else { PB_Cpaxpby( utyp, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, one, ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PZATRMV */ } scalapack-2.0.2/PBLAS/SRC/pzaxpy_.c000644 000766 000024 00000022601 10363532303 017036 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzaxpy_( int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzaxpy_( N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCX, * DESCY; double * X, * Y; #endif { /* * Purpose * ======= * * PZAXPY adds one subvector to another, * * sub( Y ) := sub( Y ) + alpha * sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER. * On entry, N specifies the length of the subvectors to be * added. N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xi, Xj, Yi, Yj, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PZAXPY", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PZAXPY", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PZAXPY", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * Start the operations */ if( *INCX == Xd[M_] ) { PB_Cpaxpby( type, NOCONJG, 1, *N, ((char *) ALPHA), ((char *) X), Xi, Xj, Xd, ROW, type->one, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, ((char *) ALPHA), ((char *) X), Xi, Xj, Xd, COLUMN, type->one, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } /* * End of PZAXPY */ } scalapack-2.0.2/PBLAS/SRC/pzcopy_.c000644 000766 000024 00000021615 10363532303 017033 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzcopy_( int * N, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzcopy_( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; /* * .. Array Arguments .. */ int * DESCX, * DESCY; double * X, * Y; #endif { /* * Purpose * ======= * * PZCOPY copies one subvector into another, * * sub( Y ) := sub( X ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * copied. N must be at least zero. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local output) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xi, Xj, Yi, Yj, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 501 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PZCOPY", "X", *N, 1, Xi, Xj, Xd, *INCX, 5, &info ); PB_Cchkvec( ctxt, "PZCOPY", "Y", *N, 1, Yi, Yj, Yd, *INCY, 10, &info ); } if( info ) { PB_Cabort( ctxt, "PZCOPY", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * Start the operations */ if( *INCX == Xd[M_] ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) X), Xi, Xj, Xd, ROW, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) X), Xi, Xj, Xd, COLUMN, type->zero, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ) ); } /* * End of PZCOPY */ } scalapack-2.0.2/PBLAS/SRC/pzdotc_.c000644 000766 000024 00000067123 10363532303 017016 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzdotc_( int * N, double * DOT, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzdotc_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; double * DOT; /* * .. Array Arguments .. */ int * DESCX, * DESCY; double * X, * Y; #endif { /* * Purpose * ======= * * PZDOTC forms the dot product of two subvectors, * * DOT := sub( X )**H * sub( Y ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * multiplied. N must be at least zero. * * DOT (local output) COMPLEX*16 array * On exit, DOT specifies the dot product of the two subvectors * sub( X ) and sub( Y ) only in their scope (See below for fur- * ther details). * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char scope, * top; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD, Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info, ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src; PBTYP_T * type; VVDOT_T dot; /* * .. Local Arrays .. */ char * buf = NULL; int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PZDOTC", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PZDOTC", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PZDOTC", info ); return; } #endif DOT[REAL_PART] = ZERO; DOT[IMAG_PART] = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Handle degenerate case */ if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) ) { type = PB_Cztypeset(); PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_]; XnbD = Xd[NB_]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_]; XnbD = Xd[MB_]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_]; YnbD = Yd[NB_]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_]; YnbD = Yd[MB_]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector used * to perform the dot product computation. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cztypeset(); size = type->size; dot = type->Fvvdotc; /* * A purely operation occurs iff the operands start in the same process and if * either the grid is mono-dimensional or there is a single local block to be * operated with or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } } /* * Combine the local results in sub( X )'s scope */ if( ( XisR && YisR ) || ( XmyprocR == XprocR ) ) { scope = ( XisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); Czgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 ); } if( RRorCC && XisR && YisR ) return; } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to compute the desired dot-product in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( ( YmyprocR == YprocR ) ) { /* * The processes owning a piece of sub( Y ) send it to the corresponding * process owning s piece of sub ( X ). */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( XisRow ) { rdst = XprocR; cdst = dst; } else { rdst = dst; cdst = XprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( YisRow ) Czgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); else Czgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding local * piece of sub( Y ), compute the local dot product and combine the results * within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); src = MPosMod( src, YnprocsD ); if( YisRow ) { rsrc = YprocR; csrc = src; } else { rsrc = src; csrc = YprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Czgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc ); else Czgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc ); dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Czgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Czgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } else { /* * General case, copy sub( Y ) within sub( X )'s scope, compute the local * results and combine them within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XisRow ) { PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt, 1 ); } else { PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt, MAX( 1, XnpD ) ); } if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) ) buf = PB_Cmalloc( XnpD * size ); if( YisRow ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj, Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj, Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } if( XmyprocR == XprocR ) { if( XnpD > 0 ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Czgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Czgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } /* * Send the DOT product result within sub( Y )'s scope */ if( XisR || YisR ) { /* * Either sub( X ) or sub( Y ) are replicated, so that every process should have * the result -> broadcast it orthogonally from sub( X )'s direction. */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Czgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Czgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Czgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Czgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD, XprocR ); } } else { /* * Neither sub( X ) nor sub( Y ) are replicated */ if( RRorCC ) { /* * Both sub( X ) are distributed in the same direction -> the process row or * column XprocR sends the result to the process row or column YprocR. */ if( XprocR != YprocR ) { if( XmyprocR == XprocR ) { if( XisRow ) Czgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR, YmyprocD ); else Czgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD, YprocR ); } else if( YmyprocR == YprocR ) { if( XisRow ) Czgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR, XmyprocD ); else Czgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD, XprocR ); } } } else { /* * Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s * scope, broadcast the result within sub( Y )'s scope. */ if( YmyprocR == YprocR ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocD == XprocR ) Czgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Czgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, YprocR, XprocR ); } else { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocD == XprocR ) Czgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Czgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, YprocR ); } } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ type = PB_Cztypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ type = PB_Cztypeset(); /* * Compute DOT := sub( Y )**H * sub( X ) */ PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotc ); /* * Conjugate the result */ DOT[IMAG_PART] = -DOT[IMAG_PART]; } else { /* * Neither sub( X ) nor sub( Y ) are distributed */ type = PB_Cztypeset(); PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc ); } /* * End of PZDOTC */ } scalapack-2.0.2/PBLAS/SRC/pzdotu_.c000644 000766 000024 00000066730 10363532303 017043 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzdotu_( int * N, double * DOT, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzdotu_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; double * DOT; /* * .. Array Arguments .. */ int * DESCX, * DESCY; double * X, * Y; #endif { /* * Purpose * ======= * * PZDOTU forms the dot product of two subvectors, * * DOT := sub( X )**T * sub( Y ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * multiplied. N must be at least zero. * * DOT (local output) COMPLEX*16 array * On exit, DOT specifies the dot product of the two subvectors * sub( X ) and sub( Y ) only in their scope (See below for fur- * ther details). * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char scope, * top; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD, Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info, ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src; PBTYP_T * type; VVDOT_T dot; /* * .. Local Arrays .. */ char * buf = NULL; int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PZDOTU", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); PB_Cchkvec( ctxt, "PZDOTU", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); } if( info ) { PB_Cabort( ctxt, "PZDOTU", info ); return; } #endif DOT[REAL_PART] = ZERO; DOT[IMAG_PART] = ZERO; /* * Quick return if possible */ if( *N == 0 ) return; /* * Handle degenerate case */ if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) ) { type = PB_Cztypeset(); PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_]; XnbD = Xd[NB_]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_]; XnbD = Xd[MB_]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_]; YnbD = Yd[NB_]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_]; YnbD = Yd[MB_]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector used * to perform the dot product computation. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cztypeset(); size = type->size; dot = type->Fvvdotu; /* * A purely operation occurs iff the operands start in the same process and if * either the grid is mono-dimensional or there is a single local block to be * operated with or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } } /* * Combine the local results in sub( X )'s scope */ if( ( XisR && YisR ) || ( XmyprocR == XprocR ) ) { scope = ( XisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); Czgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 ); } if( RRorCC && XisR && YisR ) return; } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to compute the desired dot-product in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( ( YmyprocR == YprocR ) ) { /* * The processes owning a piece of sub( Y ) send it to the corresponding * process owning s piece of sub ( X ). */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( XisRow ) { rdst = XprocR; cdst = dst; } else { rdst = dst; cdst = XprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( YisRow ) Czgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); else Czgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rdst, cdst ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding local * piece of sub( Y ), compute the local dot product and combine the results * within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); src = MPosMod( src, YnprocsD ); if( YisRow ) { rsrc = YprocR; csrc = src; } else { rsrc = src; csrc = YprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Czgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc ); else Czgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc ); dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Czgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Czgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } else { /* * General case, copy sub( Y ) within sub( X )'s scope, compute the local * results and combine them within sub( X )'s scope. */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XisRow ) { PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt, 1 ); } else { PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt, MAX( 1, XnpD ) ); } if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) ) buf = PB_Cmalloc( XnpD * size ); if( YisRow ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj, Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } else { PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj, Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); } if( XmyprocR == XprocR ) { if( XnpD > 0 ) { dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Czgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Czgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); } } } /* * Send the DOT product result within sub( Y )'s scope */ if( XisR || YisR ) { /* * Either sub( X ) or sub( Y ) are replicated, so that every process should have * the result -> broadcast it orthogonally from sub( X )'s direction. */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Czgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Czgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Czgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Czgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD, XprocR ); } } else { /* * Neither sub( X ) nor sub( Y ) are replicated */ if( RRorCC ) { /* * Both sub( X ) are distributed in the same direction -> the process row or * column XprocR sends the result to the process row or column YprocR. */ if( XprocR != YprocR ) { if( XmyprocR == XprocR ) { if( XisRow ) Czgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR, YmyprocD ); else Czgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD, YprocR ); } else if( YmyprocR == YprocR ) { if( XisRow ) Czgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR, XmyprocD ); else Czgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD, XprocR ); } } } else { /* * Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s * scope, broadcast the result within sub( Y )'s scope. */ if( YmyprocR == YprocR ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocD == XprocR ) Czgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); else Czgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, YprocR, XprocR ); } else { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocD == XprocR ) Czgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); else Czgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, YprocR ); } } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ type = PB_Cztypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ type = PB_Cztypeset(); PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotu ); } else { /* * Neither sub( X ) nor sub( Y ) are distributed */ type = PB_Cztypeset(); PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu ); } /* * End of PZDOTU */ } scalapack-2.0.2/PBLAS/SRC/pzdscal_.c000644 000766 000024 00000022116 10363532303 017144 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzdscal_( int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pzdscal_( N, ALPHA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCX; double * X; #endif { /* * Purpose * ======= * * PZDSCAL multiplies an n element subvector sub( X ) by the real scalar * alpha, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input/local output) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the scaled * subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PZDSCAL", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PZDSCAL", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ALPHA[REAL_PART] == ONE ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( Xd[CTXT_], &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Start the operations */ if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cztypeset(); if( ALPHA[REAL_PART] == ZERO ) { zset_( &Xnq, type->zero, Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } else { zdscal_( &Xnq, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { type = PB_Cztypeset(); if( ALPHA[REAL_PART] == ZERO ) { zset_( &Xnp, type->zero, Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } else { zdscal_( &Xnp, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } } } return; } /* * End of PZDSCAL */ } scalapack-2.0.2/PBLAS/SRC/pzgeadd_.c000644 000766 000024 00000027270 10363532303 017130 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzgeadd_( F_CHAR_T TRANS, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pzgeadd_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * IC, * JA, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PZGEADD adds a matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if TRANS = 'N', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'T', * conjg(A(IA:IA+N-1,JA:JA+M-1)') if TRANS = 'C'. * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the form of op( sub( A ) ) to be * used in the matrix addition as follows: * * TRANS = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANS = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANS = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, DirC, ctop, rtop; int Ai, Aj, Ci, Cj, TrA, ctxt, info, mycol, myrow, npcol, nprow, notran; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !notran ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZGEADD", "Illegal TRANS = %c\n", TrA ); info = -1; } if( notran ) PB_Cchkmat( ctxt, "PZGEADD", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); else PB_Cchkmat( ctxt, "PZGEADD", "A", *N, 3, *M, 2, Ai, Aj, Ad, 8, &info ); PB_Cchkmat( ctxt, "PZGEADD", "C", *M, 2, *N, 3, Ci, Cj, Cd, 13, &info ); } if( info ) { PB_Cabort( ctxt, "PZGEADD", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( PB_Cztypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cztypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ /* * This operation mainly involves point-to-point send and receive communication. * There is therefore no particular BLACS topology to recommend. Still, one can * choose the main loop direction in which the operands will be added. This * selection is based on the current setting for the BLACS broadcast operations. */ if( notran ) { rtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( *M <= *N ) { DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirC = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpgeadd( PB_Cztypeset(), &DirA, &DirC, NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } else if( TrA == CTRAN ) { PB_Cptran( PB_Cztypeset(), NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cptran( PB_Cztypeset(), CONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } /* * End of PZGEADD */ } scalapack-2.0.2/PBLAS/SRC/pzgemm_.c000644 000766 000024 00000050713 10363532303 017007 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzgemm_( F_CHAR_T TRANSA, F_CHAR_T TRANSB, int * M, int * N, int * K, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pzgemm_( TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANSA, TRANSB; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; double * A, * B, * C; #endif { /* * Purpose * ======= * * PZGEMM performs one of the matrix-matrix operations * * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N', * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T', * conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C', * * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N', * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T', * conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'. * * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) ) * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and * sub( C ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * TRANSB (global input) CHARACTER*1 * On entry, TRANSB specifies the form of op( sub( B ) ) to be * used in the matrix multiplication as follows: * * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ), * * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )', * * TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * op( sub( A ) ) and of the submatrix sub( C ). M must be at * least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * op( sub( B ) ) and the number of columns of the submatrix * sub( C ). N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns of the submatrix * op( sub( A ) ) and the number of rows of the submatrix * op( sub( B ) ). K must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, DirBC, OpC, OpR, TrA, TrB, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ChooseBC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, nota, notb, npcol, nprow; double ABest, ACest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANSA )[0] ) ) == CNOTRAN ); notb = ( ( TrB = Mupcase( F2C_CHAR( TRANSB )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1001 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZGEMM", "Illegal TRANSA = %c\n", TrA ); info = -1; } else if( ( !notb ) && ( TrB != CTRAN ) && ( TrB != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZGEMM", "Illegal TRANSB = %c\n", TrB ); info = -2; } if( nota ) PB_Cchkmat( ctxt, "PZGEMM", "A", *M, 3, *K, 5, Ai, Aj, Ad, 10, &info ); else PB_Cchkmat( ctxt, "PZGEMM", "A", *K, 5, *M, 3, Ai, Aj, Ad, 10, &info ); if( notb ) PB_Cchkmat( ctxt, "PZGEMM", "B", *K, 5, *N, 4, Bi, Bj, Bd, 14, &info ); else PB_Cchkmat( ctxt, "PZGEMM", "B", *N, 4, *K, 5, Bi, Bj, Bd, 14, &info ); PB_Cchkmat( ctxt, "PZGEMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 19, &info ); } if( info ) { PB_Cabort( ctxt, "PZGEMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) || ( *K == 0 ) ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * If alpha or K is zero, sub( C ) := beta * sub( C ). */ if( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char * ) C), Ci, Cj, Cd ); } else if( !( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char * ) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABest: both operands sub( A ) and sub( B ) are communicated (M, N >> K) * ACest: both operands sub( A ) and sub( C ) are communicated (K, N >> M) * BCest: both operands sub( B ) and sub( C ) are communicated (M, K >> N) */ ABest = (double)(*K); ACest = (double)(*M); BCest = (double)(*N); if( notb ) { if( nota ) { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); ABest *= ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *K, Bd[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[MB_], nprow ); BCest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } else { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *M, Ad[NB_], npcol ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ); tmp1 = DNROC( *K, Bd[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ACest *= ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *K, Ad[MB_], nprow ); tmp2 = DNROC( *M, Bd[NB_], npcol ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest *= ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } } else { if( nota ) { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *N, Bd[MB_], nprow ); tmp2 = DNROC( *K, Bd[NB_], npcol ); tmp3 = DNROC( *N, Cd[NB_], npcol ); ACest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *K, Ad[NB_], npcol ); BCest *= CBRATIO * ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ); } else { tmp1 = DNROC( *M, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *N, Bd[MB_], nprow ); tmp2 = DNROC( *K, Bd[NB_], npcol ); tmp3 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest *= CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); tmp1 = DNROC( *K, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *K, Bd[NB_], npcol ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest *= ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ); } } ChooseAB = ( ( ABest <= ( 1.3 * BCest ) ) && ( ABest <= ( 1.3 * ACest ) ) ); ChooseBC = ( ( BCest <= ACest ) && ( ( 1.3 * BCest ) <= ABest ) ); /* * BLACS topologies are enforced iff M, N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) && ( *K > nb ) ); if( ChooseAB ) { OpR = CBCAST; OpC = CBCAST; } else if( ChooseBC ) { if( nota ) { OpR = CCOMBINE; OpC = CBCAST; } else { OpR = CBCAST; OpC = CCOMBINE; } } else { if( notb ) { OpR = CBCAST; OpC = CCOMBINE; } else { OpR = CCOMBINE; OpC = CBCAST; } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_IRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_IRING ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirAB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); DirBC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); if( ChooseAB ) { PB_CpgemmAB( type, &DirAB, &DirBC, ( nota ? NOTRAN : ( ( TrA == CCOTRAN ) ? COTRAN : TRAN ) ), ( notb ? NOTRAN : ( ( TrB == CCOTRAN ) ? COTRAN : TRAN ) ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else if( ChooseBC ) { PB_CpgemmBC( type, &DirAB, &DirBC, ( nota ? NOTRAN : ( ( TrA == CCOTRAN ) ? COTRAN : TRAN ) ), ( notb ? NOTRAN : ( ( TrB == CCOTRAN ) ? COTRAN : TRAN ) ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { PB_CpgemmAC( type, &DirAB, &DirBC, ( nota ? NOTRAN : ( ( TrA == CCOTRAN ) ? COTRAN : TRAN ) ), ( notb ? NOTRAN : ( ( TrB == CCOTRAN ) ? COTRAN : TRAN ) ), *M, *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PZGEMM */ } scalapack-2.0.2/PBLAS/SRC/pzgemv_.c000644 000766 000024 00000043612 10363532303 017020 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzgemv_( F_CHAR_T TRANS, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzgemv_( TRANS, M, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PZGEMV performs one of the matrix-vector operations * * sub( Y ) := alpha*sub( A ) *sub( X ) + beta*sub( Y ), or * sub( Y ) := alpha*sub( A )'*sub( X ) + beta*sub( Y ), or * sub( Y ) := alpha*conjg( sub( A )' )*sub( X ) + beta*sub( Y ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1). * * When TRANS = 'N', * * sub( X ) denotes X(IX:IX,JX:JX+N-1), if INCX = M_X, * X(IX:IX+N-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+M-1), if INCY = M_Y, * Y(IY:IY+M-1,JY:JY), if INCY = 1 and INCY <> M_Y, * and, otherwise * * sub( X ) denotes X(IX:IX,JX:JX+M-1), if INCX = M_X, * X(IX:IX+M-1,JX:JX), if INCX = 1 and INCX <> M_X, * and, * * sub( Y ) denotes Y(IY:IY,JY:JY+N-1), if INCY = M_Y, * Y(IY:IY+N-1,JY:JY), if INCY = 1 and INCY <> M_Y. * * Alpha and beta are scalars, and sub( X ) and sub( Y ) are subvectors * and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( Y ) := alpha*sub( A ) * sub( X ) + beta*sub( Y ), * * TRANS = 'T' or 't', * sub( Y ) := alpha*sub( A )' * sub( X ) + beta*sub( Y ), * * TRANS = 'C' or 'c', * sub( Y ) := alpha*conjg( sub( A )' )*sub( X ) + * beta*sub( Y ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Lx is N when TRANS = 'N' or 'n' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+Ly-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Ly is M when TRANS = 'N' or 'n' and N otherwise. Before en- * try, this array contains the local entries of the matrix Y. * On exit, sub( Y ) is overwritten by the updated subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char TrA, Yroc, * tbeta, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, YApbY, YAsum, Ycol, Yi, Yii, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, mycol, myrow, nota, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd [DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ nota = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !nota ) && ( TrA != CTRAN ) && ( TrA != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZGEMV", "Illegal TRANS=%c\n", TrA ); info = -1; } PB_Cchkmat( ctxt, "PZGEMV", "A", *M, 2, *N, 3, Ai, Aj, Ad, 8, &info ); if( nota ) { PB_Cchkvec( ctxt, "PZGEMV", "X", *N, 3, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PZGEMV", "Y", *M, 2, Yi, Yj, Yd, *INCY, 18, &info ); } else { PB_Cchkvec( ctxt, "PZGEMV", "X", *M, 2, Xi, Xj, Xd, *INCX, 12, &info ); PB_Cchkvec( ctxt, "PZGEMV", "Y", *N, 3, Yi, Yj, Yd, *INCY, 18, &info ); } } if( info ) { PB_Cabort( ctxt, "PZGEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) && ( ( BETA [REAL_PART] == ONE ) && ( BETA [IMAG_PART] == ZERO ) ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); /* * When alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( ( nota ? *M : *N ), Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { zset_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } else { zscal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( ( nota ? *M : *N ), Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { zset_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } else { zscal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Yroc = ( *INCY == Yd[M_] ? CROW : CCOLUMN ); if( nota ) { /* * Reuse sub( Y ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( type, COLUMN, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { zgemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *)A), Aii, Ajj, Ald, type->size ), &Ald, XA, &XAd[LLD_], tbeta, YA, &ione ); } if( XAfr ) free( XA ); /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Czgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, *M, 1, type->one, YA, 0, 0, YAd, COLUMN, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc ); } } else { /* * Reuse sub( Y ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( type, ROW, *M, *N, Ad0, 1, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Local matrix-vector multiply iff I own some data */ Amp = PB_Cnumroc( *M, 0, Ad0[IMB_], Ad0[MB_], myrow, Ad0[RSRC_], nprow ); Anq = PB_Cnumroc( *N, 0, Ad0[INB_], Ad0[NB_], mycol, Ad0[CSRC_], npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { zgemv_( TRANS, &Amp, &Anq, ((char *) ALPHA), Mptr( ((char *)A), Aii, Ajj, Ald, type->size ), &Ald, XA, &ione, tbeta, YA, &YAd[LLD_] ); } if( XAfr ) free( XA ); /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Czgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } /* * sub( Y ) := beta * sub( Y ) + YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, YA, 0, 0, YAd, ROW, ((char *) BETA), ((char *) Y), Yi, Yj, Yd, &Yroc ); } } if( YAfr ) free( YA ); /* * End of PZGEMV */ } scalapack-2.0.2/PBLAS/SRC/pzgerc_.c000644 000766 000024 00000027453 10363532303 017007 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzgerc_( int * M, int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY, double * A, int * IA, int * JA, int * DESCA ) #else void pzgerc_( M, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PZGERC performs the rank 1 operation * * sub( A ) := alpha*sub( X )*conjg( sub( Y )' ) + sub( A ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) is an m element subvector, sub( Y ) is * an n element subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+M-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+M-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( A ) are overwritten by the * local entries of the m by n updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, Yi, Yj, ctxt, info, ione=1, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd[DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PZGERC", "X", *M, 1, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PZGERC", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PZGERC", "A", *M, 1, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PZGERC", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Replicate sub( Y ) in process rows spanned by sub( A ) -> YA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ), &YA, YAd, &YAfr ); /* * Local rank-1 update iff I own some data */ Amp = PB_Cnumroc( *M, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { zgerc_( &Amp, &Anq, ((char *) ALPHA), XA, &ione, YA, &YAd[LLD_], Mptr( ((char *) A), Aii, Ajj, Ald, type->size ), &Ald ); } if( XAfr ) free( XA ); if( YAfr ) free( YA ); /* * End of PZGERC */ } scalapack-2.0.2/PBLAS/SRC/pzgeru_.c000644 000766 000024 00000027442 10363532303 017027 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzgeru_( int * M, int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY, double * A, int * IA, int * JA, int * DESCA ) #else void pzgeru_( M, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * M, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PZGERU performs the rank 1 operation * * sub( A ) := alpha*sub( X )*sub( Y )' + sub( A ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) is an m element subvector, sub( Y ) is * an n element subvector and sub( A ) is an m by n submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+M-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+M-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( A ) are overwritten by the * local entries of the m by n updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Ald, Amb, Amp, Anb, Anq, Arow, XAfr, Xi, Xj, YAfr, Yi, Yj, ctxt, info, ione=1, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_], Yd[DLEN_]; char * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PZGERU", "X", *M, 1, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PZGERU", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PZGERU", "A", *M, 1, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PZGERU", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *M, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *M, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ( *INCX == Xd[M_] ? ROW : COLUMN ), &XA, XAd, &XAfr ); /* * Replicate sub( Y ) in process rows spanned by sub( A ) -> YA */ PB_CInV( type, NOCONJG, ROW, *M, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ( *INCY == Yd[M_] ? ROW : COLUMN ), &YA, YAd, &YAfr ); /* * Local rank-1 update iff I own some data */ Amp = PB_Cnumroc( *M, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { zgeru_( &Amp, &Anq, ((char *) ALPHA), XA, &ione, YA, &YAd[LLD_], Mptr( ((char *) A), Aii, Ajj, Ald, type->size ), &Ald ); } if( XAfr ) free( XA ); if( YAfr ) free( YA ); /* * End of PZGERU */ } scalapack-2.0.2/PBLAS/SRC/pzhemm_.c000644 000766 000024 00000053554 10363532303 017016 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzhemm_( F_CHAR_T SIDE, F_CHAR_T UPLO, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pzhemm_( SIDE, UPLO, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T SIDE, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; double * A, * B, * C; #endif { /* * Purpose * ======= * * PZHEMM performs one of the matrix-matrix operations * * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * or * * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha and beta are scalars, sub( A ) is a Hermitian submatrix and * sub( B ) and sub( C ) are m by n submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether the Hermitian submatrix * sub( A ) appears on the left or right in the operation as * follows: * * SIDE = 'L' or 'l' * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * SIDE = 'R' or 'r' * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with SIDE = 'L' or 'l', this array contains * the local entries corresponding to the entries of the m by m * Hermitian submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the Hermitian submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the Hermitian submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * Before entry with SIDE = 'R' or 'r', this array contains * the local entries corresponding to the entries of the n by n * Hermitian submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the Hermitian submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the Hermitian submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set and assumed to be zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, SideOp, UploA, cbtop, cbtopsave, cctop, cctopsave, rbtop, rbtopsave, rctop, rctopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, lside, mycol, myrow, nb, npcol, nprow, upper; double ABCest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PZHEMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZHEMM", "Illegal UPLO = %c\n", UploA ); info = -2; } if( lside ) { PB_Cchkmat( ctxt, "PZHEMM", "A", *M, 3, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZHEMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PZHEMM", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZHEMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PZHEMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PZHEMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) && ( ( BETA [REAL_PART] == ONE ) && ( BETA [IMAG_PART] == ZERO ) ) ) ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * If alpha is zero, sub( C ) := beta * sub( C ). */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else if( !( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (N >> M) * BCest : Both operands sub( B ) and sub( C ) are communicated (M >> N) */ if( lside ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ABCest = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp1 = DNROC( *N, Ad[NB_], npcol ); tmp2 = DNROC( *M, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); tmp4 = DNROC( *N, Cd[NB_], npcol ); BCest = (double)(*M) * ( ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.5 * ABCest ) <= BCest ); /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ChooseABC ) { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } } if( lside ) DirAB = ( rbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAB = ( cbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CpsymmAB( type, &DirAB, CONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_DEFAULT ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); } } if( lside ) DirAB = ( ( rbtop == CTOP_DRING || rctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); else DirAB = ( ( cbtop == CTOP_DRING || cctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); PB_CpsymmBC( type, &DirAB, CONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rbtopsave = *PB_Ctop( &ctxt, BCAST, ROW, &rbtopsave ); rctopsave = *PB_Ctop( &ctxt, COMBINE, ROW, &rctopsave ); cbtopsave = *PB_Ctop( &ctxt, BCAST, COLUMN, &cbtopsave ); cctopsave = *PB_Ctop( &ctxt, COMBINE, COLUMN, &cctopsave ); } /* * End of PZHEMM */ } scalapack-2.0.2/PBLAS/SRC/pzhemv_.c000644 000766 000024 00000056211 10363532303 017020 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzhemv_( F_CHAR_T UPLO, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * BETA, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzhemv_( UPLO, N, ALPHA, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX, BETA, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PZHEMV performs the matrix-vector operation * * sub( Y ) := alpha*sub( A )*sub( X ) + beta*sub( Y ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha and beta are scalars, sub( X ) and sub( Y ) are n element sub- * vectors and sub( A ) is an n by n Hermitian submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and X corresponding to the entries of the submatrix sub( A ) * and the subvector sub( X ) need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries of the upper triangular part of the * Hermitian submatrix sub( A ), and the local entries of the * strictly lower triangular of sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries of the lower triangular part of the * Hermitian submatrix sub( A ), and the local entries of the * strictly upper triangular of sub( A ) are not referenced. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set and assumed to be zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array Y * corresponding to the entries of the subvector sub( Y ) need * not be set on input. * * Y (local input/local output) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten by the updated * subvector. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA, * one, top; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCpbY, YCsum, YCld, YRfr, YRpbY, YRsum, YRld, Ycol, Yi, Yii, YisRow, Yj, Yjj, Yld, Ynp, Ynq, Yrow, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; double * tbeta; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XCd[DLEN_], XRd[DLEN_], Xd[DLEN_], YCd[DLEN_], YRd[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, __FILE__, "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkmat( ctxt, "PZHEMV", "A", *N, 2, *N, 2, Ai, Aj, Ad, 7, &info ); PB_Cchkvec( ctxt, "PZHEMV", "X", *N, 2, Xi, Xj, Xd, *INCX, 11, &info ); PB_Cchkvec( ctxt, "PZHEMV", "Y", *N, 2, Yi, Yj, Yd, *INCY, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PZHEMV", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) && ( ( BETA [REAL_PART] == ONE ) && ( BETA [IMAG_PART] == ZERO ) ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); /* * When alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( *INCY == Yd[M_] ) { /* * sub( Y ) resides in (a) process row(s) */ if( ( myrow == Yrow ) || ( Yrow < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynq = PB_Cnumroc( *N, Yj, Yd[INB_], Yd[NB_], mycol, Yd[CSRC_], npcol ); if( Ynq > 0 ) { Yld = Yd[LLD_]; if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { zset_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } else { zscal_( &Ynq, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yld, type->size ), &Yld ); } } } } else { /* * sub( Y ) resides in (a) process column(s) */ if( ( mycol == Ycol ) || ( Ycol < 0 ) ) { /* * Make sure I own some data and scale sub( Y ) */ Ynp = PB_Cnumroc( *N, Yi, Yd[IMB_], Yd[MB_], myrow, Yd[RSRC_], nprow ); if( Ynp > 0 ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { zset_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } else { zscal_( &Ynp, ((char *) BETA), Mptr( ((char *) Y), Yii, Yjj, Yd[LLD_], type->size ), INCY ); } } } } return; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Reuse sub( Y ) and/or create vectors YR in process rows and YC in process * columns spanned by sub( A ) */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) { PB_CInOutV( type, ROW, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW, ((char**)(&tbeta)), &YR, YRd, &YRfr, &YRsum, &YRpbY ); PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &YC, YCd, &YCfr, &YCsum ); } else { PB_CInOutV( type, COLUMN, *N, *N, Ad0, 1, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN, ((char**)(&tbeta)), &YC, YCd, &YCfr, &YCsum, &YCpbY ); PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &YR, YRd, &YRfr, &YRsum ); } /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd, ROW, &XC, XCd, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd, COLUMN, &XR, XRd, &XRfr ); } one = type->one; /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd[LLD_]; XRld = XRd[LLD_]; YCld = YCd[LLD_]; YRld = YRd[LLD_]; /* * Scale YR or YC in the case sub( Y ) has been reused */ if( YisRow ) { /* * YR resides in (a) process row(s) */ if( !YRpbY ) { if( ( myrow == YRd[RSRC_] ) || ( YRd[RSRC_] < 0 ) ) { /* * Make sure I own some data and scale YR */ if( Anq > 0 ) { if( ( tbeta[REAL_PART] == ZERO ) && ( tbeta[IMAG_PART] == ZERO ) ) { zset_( &Anq, ((char *) tbeta), YR, &YRld ); } else { zscal_( &Anq, ((char *) tbeta), YR, &YRld ); } } } } } else { /* * YC resides in (a) process column(s) */ if( !YCpbY ) { if( ( mycol == YCd[CSRC_] ) || ( YCd[CSRC_] < 0 ) ) { /* * Make sure I own some data and scale YC */ if( Amp > 0 ) { if( ( tbeta[REAL_PART] == ZERO ) && ( tbeta[IMAG_PART] == ZERO ) ) { zset_( &Amp, ((char *) tbeta), YC, &ione ); } else { zscal_( &Amp, ((char *) tbeta), YC, &ione ); } } } } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { zgemv_( C2F_CHAR( NOTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, YC, &ione ); zgemv_( C2F_CHAR( COTRAN ), &Akp, &Anq0, ((char *)ALPHA), Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XC, &ione, one, Mptr( YR, 0, Akq, YRld, size ), &YRld ); } PB_Cpsym( type, type, LEFT, UPPER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, PB_Ctzhemv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsym( type, type, LEFT, LOWER, kb, 1, ((char *) ALPHA), Aptr, k, k, Ad0, Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, PB_Ctzhemv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { zgemv_( C2F_CHAR( NOTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XR, 0, Akq, XRld, size ), &XRld, one, Mptr( YC, Akp, 0, YCld, size ), &ione ); zgemv_( C2F_CHAR( COTRAN ), &Amp0, &Anq0, ((char *) ALPHA), Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XC, Akp, 0, XCld, size ), &ione, one, Mptr( YR, 0, Akq, YRld, size ), &YRld ); } } } } if( XCfr ) free( XC ); if( XRfr ) free( XR ); if( YisRow ) { /* * Combine the partial column results into YC */ if( YCsum ) { YCd[CSRC_] = 0; if( Amp > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Czgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, 0 ); } } /* * Combine the partial row results into YR */ if( YRsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Czgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], YRd[RSRC_], mycol ); } /* * YR := YR + YC */ PB_Cpaxpby( type, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, one, YR, 0, 0, YRd, ROW ); /* * sub( Y ) := beta * sub( Y ) + YR (if necessary) */ if( YRpbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, ROW ); } } else { /* * Combine the partial row results into YR */ if( YRsum ) { YRd[RSRC_] = 0; if( Anq > 0 ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Czgsum2d( ctxt, COLUMN, &top, 1, Anq, YR, YRd[LLD_], 0, mycol ); } } /* * Combine the partial column results into YC */ if( YCsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Czgsum2d( ctxt, ROW, &top, Amp, 1, YC, YCd[LLD_], myrow, YCd[CSRC_] ); } /* * YC := YR + YC */ PB_Cpaxpby( type, NOCONJG, 1, *N, one, YR, 0, 0, YRd, ROW, one, YC, 0, 0, YCd, COLUMN ); /* * sub( Y ) := beta * sub( Y ) + YC (if necessary) */ if( YCpbY ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, YC, 0, 0, YCd, COLUMN, ((char *)BETA), ((char *) Y), Yi, Yj, Yd, COLUMN ); } } if( YCfr ) free( YC ); if( YRfr ) free( YR ); /* * End of PZHEMV */ } scalapack-2.0.2/PBLAS/SRC/pzher2_.c000644 000766 000024 00000043342 10363532303 016722 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzher2_( F_CHAR_T UPLO, int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY, double * A, int * IA, int * JA, int * DESCA ) #else void pzher2_( UPLO, N, ALPHA, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * INCY, * IX, * IY, * JA, * JX, * JY, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DESCY; double * A, * X, * Y; #endif { /* * Purpose * ======= * * PZHER2 performs the Hermitian rank 2 operation * * sub( A ) := alpha*sub( X )*conjg( sub( Y )' ) + * conjg( alpha )*sub( Y )*conjg( sub( X )' ) + sub( A ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Alpha is a scalar, sub( X ) and sub( Y ) are n element subvectors and * sub( A ) is an n by n Hermitian submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays X * and Y corresponding to the entries of the subvectors sub( X ) * and sub( Y ) respectively need not be set on input. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the Hermitian submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the Hermitian submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, YCfr, YCld, YRfr, YRld, Yi, Yj, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; cmplx16 Calpha; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0 [DLEN_], XCd0[DLEN_], XRd0[DLEN_], Xd[DLEN_], YCd0[DLEN_], YRd0[DLEN_], Yd [DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL, * YC = NULL, * YR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZHER2", "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkvec( ctxt, "PZHER2", "X", *N, 2, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkvec( ctxt, "PZHER2", "Y", *N, 2, Yi, Yj, Yd, *INCY, 12, &info ); PB_Cchkmat( ctxt, "PZHER2", "A", *N, 2, *N, 2, Ai, Aj, Ad, 17, &info ); } if( info ) { PB_Cabort( ctxt, "PZHER2", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd0, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd0, ROW, &XC, XCd0, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd0, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd0, COLUMN, &XR, XRd0, &XRfr ); } /* * Replicate sub( Y ) in process rows (YR) and process columns (YC) spanned by * sub( A ) */ if( *INCY == Yd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, ROW, &YR, YRd0, &YRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, YR, 0, 0, YRd0, ROW, &YC, YCd0, &YCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) Y), Yi, Yj, Yd, COLUMN, &YC, YCd0, &YCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, YC, 0, 0, YCd0, COLUMN, &YR, YRd0, &YRfr ); } /* * Local rank-2 update if I own some data */ Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XCld = XCd0[LLD_]; YCld = YCd0[LLD_]; XRld = XRd0[LLD_]; YRld = YRd0[LLD_]; Calpha[REAL_PART] = ALPHA[REAL_PART]; Calpha[IMAG_PART] = -ALPHA[IMAG_PART]; /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { zgerc_( &Akp, &Anq0, ((char *) ALPHA), XC, &ione, Mptr( YR, 0, Akq, YRld, size ), &YRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); zgerc_( &Akp, &Anq0, ((char *) Calpha), YC, &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); } PB_Cpsyr2( type, UPPER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, Aptr, k, k, Ad0, PB_Ctzher2 ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsyr2( type, LOWER, kb, 1, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Mptr( YC, Akp, 0, YCld, size ), YCld, Mptr( YR, 0, Akq, YRld, size ), YRld, Aptr, k, k, Ad0, PB_Ctzher2 ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { zgerc_( &Amp0, &Anq0, ((char *) ALPHA), Mptr( XC, Akp, 0, XCld, size ), &ione, Mptr( YR, 0, Akq, YRld, size ), &YRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); zgerc_( &Amp0, &Anq0, ((char *) Calpha), Mptr( YC, Akp, 0, YCld, size ), &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } } } if( XRfr ) free( XR ); if( XCfr ) free( XC ); if( YRfr ) free( YR ); if( YCfr ) free( YC ); /* * End of PZHER2 */ } scalapack-2.0.2/PBLAS/SRC/pzher2k_.c000644 000766 000024 00000053021 10363532303 017070 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzher2k_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pzher2k_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; double * A, * B, * C; #endif { /* * Purpose * ======= * * PZHER2K performs one of the Hermitian rank 2k operations * * sub( C ) := alpha*sub( A )*conjg( sub( B ) )' + * conjg( alpha )*sub( B )*conjg( sub( A ) )' + * beta*sub( C ), * * or * * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) + * conjg( alpha )*conjg( sub( B )' )*sub( A ) + * beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise, and, * * sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N', * B(IB:IB+K-1,JB:JB+N-1) otherwise. * * Alpha and beta are scalars with beta real, sub( C ) is an n by n * Hermitian submatrix and sub( A ) and sub( B ) are n by k submatrices * in the first case and k by n submatrices in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the Hermitian submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*conjg( sub( B )' ) + * conjg( alpha )*sub( B )*conjg( sub( A )' ) + * beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) + * conjg( alpha )*conjg( sub( B )' )*sub( A ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrices sub( A ) and sub( B ), and on * entry with TRANS = 'C' or 'c', K specifies the number of rows * of the submatrices sub( A ) and sub( B ). K must be at least * zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and B corresponding to the entries of the submatrices * sub( A ) and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JB+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( B ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( B ). * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the Hermitian submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the Hermitian submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( C ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double ABCest, ABest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZHER2K", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZHER2K", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) { PB_Cchkmat( ctxt, "PZHER2K", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZHER2K", "B", *N, 3, *K, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PZHER2K", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZHER2K", "B", *K, 4, *N, 3, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PZHER2K", "C", *N, 3, *N, 3, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PZHER2K", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) && ( ( BETA[REAL_PART] == ONE ) ) ) ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * And when alpha or K is zero */ if( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, CONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (K >> N) * ABest : only sub( A ) and sub( B ) are communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[NB_], npcol ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[CSRC_] == -1 ) && ( Bd[CSRC_] == -1 ) ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + TWO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *K, Ad[MB_], nprow ); tmp4 = DNROC( *K, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[RSRC_] == -1 ) && ( Bd[RSRC_] == -1 ) ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + TWO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.4 * ABCest ) <= ABest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseABC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kAC( type, &DirA, CONJG, &UploC, ( notran ? NOTRAN : COTRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kA( type, &DirA, CONJG, &UploC, ( notran ? NOTRAN : COTRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PZHER2K */ } scalapack-2.0.2/PBLAS/SRC/pzher_.c000644 000766 000024 00000034511 10363532303 016636 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzher_( F_CHAR_T UPLO, int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * A, int * IA, int * JA, int * DESCA ) #else void pzher_( UPLO, N, ALPHA, X, IX, JX, DESCX, INCX, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ F_CHAR_T UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCX; double * A, * X; #endif { /* * Purpose * ======= * * PZHER performs the Hermitian rank 1 operation * * sub( A ) := alpha*sub( X )*conjg( sub( X )' ) + sub( A ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Alpha is a real scalar, sub( X ) is an n element subvector and * sub( A ) is an n by n Hermitian submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the Hermitian submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( A ) are to be * referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X * corresponding to the entries of the subvector sub( X ) need * not be set on input. * * X (local input) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the Hermitian submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the Hermitian submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XCfr, XCld, XRfr, XRld, Xi, Xj, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, npcol, nprow, size, upper; cmplx16 Calpha; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XCd0[DLEN_], XRd0[DLEN_], Xd[DLEN_]; char * Aptr = NULL, * XC = NULL, * XR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZHER", "Illegal UPLO = %c\n", UploA ); info = -1; } PB_Cchkvec( ctxt, "PZHER", "X", *N, 2, Xi, Xj, Xd, *INCX, 7, &info ); PB_Cchkmat( ctxt, "PZHER", "A", *N, 2, *N, 2, Ai, Aj, Ad, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PZHER", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ALPHA[REAL_PART] == ZERO ) ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Replicate sub( X ) in process rows (XR) and process columns (XC) spanned by * sub( A ) */ if( *INCX == Xd[M_] ) { PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, ROW, &XR, XRd0, &XRfr ); PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, XR, 0, 0, XRd0, ROW, &XC, XCd0, &XCfr ); } else { PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, COLUMN, &XC, XCd0, &XCfr ); PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, XC, 0, 0, XCd0, COLUMN, &XR, XRd0, &XRfr ); } /* * Local rank-1 update if I own some data */ Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { size = type->size; Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); XCld = XCd0[LLD_]; XRld = XRd0[LLD_]; Calpha[REAL_PART] = ALPHA[REAL_PART]; Calpha[IMAG_PART] = ZERO; if( upper ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) zgerc_( &Akp, &Anq0, ((char *) Calpha), XC, &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); PB_Cpsyr( type, UPPER, kb, 1, ((char *) Calpha), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Aptr, k, k, Ad0, PB_Ctzher ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsyr( type, LOWER, kb, 1, ((char *) Calpha), Mptr( XC, Akp, 0, XCld, size ), XCld, Mptr( XR, 0, Akq, XRld, size ), XRld, Aptr, k, k, Ad0, PB_Ctzher ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) zgerc_( &Amp0, &Anq0, ((char *) Calpha), Mptr( XC, Akp, 0, XCld, size ), &ione, Mptr( XR, 0, Akq, XRld, size ), &XRld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } } if( XRfr ) free( XR ); if( XCfr ) free( XC ); /* * End of PZHER */ } scalapack-2.0.2/PBLAS/SRC/pzherk_.c000644 000766 000024 00000045531 10363532303 017015 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzherk_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pzherk_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * K, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PZHERK performs one of the Hermitian rank k operations * * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + beta*sub( C ), * * or * * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and, * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise. * * Alpha and beta are real scalars, sub( C ) is an n by n Hermitian * submatrix and sub( A ) is an n by k submatrix in the first case and a * k by n submatrix in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the Hermitian submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * Hermitian submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * Hermitian submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + * beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry, with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrix sub( A ), and with TRANS = 'C' or * 'c', K specifies the number of rows of the submatrix * sub( A ). K must be at least zero. * * ALPHA (global input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the Hermitian submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the Hermitian submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( C ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, ChooseAC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double Aest, ACest, tmp1, tmp2, tmp3, tmp4; cmplx16 Calph; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZHERK", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZHERK", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PZHERK", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PZHERK", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZHERK", "C", *N, 3, *N, 3, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PZHERK", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) && ( BETA[REAL_PART] == ONE ) ) ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * And when alpha or K is zero */ if( ( ALPHA[REAL_PART] == ZERO ) || ( *K == 0 ) ) { if( BETA[REAL_PART] == ZERO ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, CONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif Calph[REAL_PART] = ALPHA[REAL_PART]; Calph[IMAG_PART] = ZERO; /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ACest: both operands sub( A ) and sub( C ) are communicated (K >> N) * Aest : only sub( A ) is communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Ad[MB_], nprow ); Aest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); Aest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseAC = ( ( 1.3 * ACest ) <= Aest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseAC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkAC( type, &DirA, CONJG, &UploC, ( notran ? NOTRAN : COTRAN ), *N, *K, ((char *)Calph), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkA( type, &DirA, CONJG, &UploC, ( notran ? NOTRAN : COTRAN ), *N, *K, ((char *)Calph), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PZHERK */ } scalapack-2.0.2/PBLAS/SRC/pzscal_.c000644 000766 000024 00000022406 10602576752 017016 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzscal_( int * N, double * ALPHA, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pzscal_( N, ALPHA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ int * INCX, * IX, * JX, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCX; double * X; #endif { /* * Purpose * ======= * * PZSCAL multiplies an n element subvector sub( X ) by the scalar * alpha, * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array X cor- * responding to the entries of the subvector sub( X ) need not * be set on input. * * X (local input/local output) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the scaled * subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, ctxt, info, mycol, myrow, npcol, nprow; PBTYP_T * type; /* * .. Local Arrays .. */ int Xd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) PB_Cchkvec( ctxt, "PZSCAL", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); if( info ) { PB_Cabort( ctxt, "PZSCAL", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( Xd[CTXT_], &nprow, &npcol, &myrow, &mycol ); #endif /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * Start the operations */ if( *INCX == Xd[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { if( ( ALPHA[REAL_PART] == ONE ) && ( ALPHA[IMAG_PART] == ZERO ) ) return; /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; type = PB_Cztypeset(); if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { zset_( &Xnq, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } else { zscal_( &Xnq, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xld, type->size ), &Xld ); } } } return; } else { /* * sub( X ) resides in (a) process column(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { if( ( ALPHA[REAL_PART] == ONE ) && ( ALPHA[IMAG_PART] == ZERO ) ) return; /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { type = PB_Cztypeset(); if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { zset_( &Xnp, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } else { zscal_( &Xnp, ((char *) ALPHA), Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], type->size ), INCX ); } } } return; } /* * End of PZSCAL */ } scalapack-2.0.2/PBLAS/SRC/pzswap_.c000644 000766 000024 00000076215 10363532303 017041 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzswap_( int * N, double * X, int * IX, int * JX, int * DESCX, int * INCX, double * Y, int * IY, int * JY, int * DESCY, int * INCY ) #else void pzswap_( N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; /* * .. Array Arguments .. */ int * DESCX, * DESCY; double * X, * Y; #endif { /* * Purpose * ======= * * PZSWAP swaps two subvectors, * * sub( Y ) := sub( X ) and sub( X ) := sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * swapped. N must be at least zero. * * X (local input/local output) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with sub( Y ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) COMPLEX*16 array * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Xscope, Yscope, * one, * top, tran, * zero; int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, XisD, XisR, Xinb1D, XinbD, XisRow, Xii, Xj, Xjj, Xld, Xlinc, Xm, XmyprocD, XmyprocR, Xn, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, Xroc, Xrow, Ycol, Yi, Yii, Yinb1D, YinbD, YisD, YisR, YisRow, Yj, Yjj, Yld, Ylinc, Ym, YmyprocD, YmyprocR, Yn, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yroc, Yrow, cdst, csrc, ctxt, dst, gcdPQ, info, ione=1, k, l, lcmPQ, lcmb, mycol, myrow, npcol, npq, nprow, p, q, rdst, rsrc, src, size; PBTYP_T * type; PB_VM_T VM; /* * .. Local Arrays .. */ int Xd[DLEN_], Yd[DLEN_]; char * buf = NULL; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 501 + CTXT_ ) : 0 ) ) ) { PB_Cchkvec( ctxt, "PZSWAP", "X", *N, 1, Xi, Xj, Xd, *INCX, 5, &info ); PB_Cchkvec( ctxt, "PZSWAP", "Y", *N, 1, Yi, Yj, Yd, *INCY, 10, &info ); } if( info ) { PB_Cabort( ctxt, "PZSWAP", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Determine if sub( X ) is distributed or not */ if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); else XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); /* * Determine if sub( Y ) is distributed or not */ if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); else YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * XisD && YisD <=> both vector operands are indeed distributed */ if( XisD && YisD ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( XisRow ) { XinbD = Xd[INB_ ]; XnbD = Xd[NB_ ]; Xld = Xd[LLD_]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); } else { XinbD = Xd[IMB_ ]; XnbD = Xd[MB_ ]; Xld = Xd[LLD_]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... */ PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( YisRow ) { YinbD = Yd[INB_ ]; YnbD = Yd[NB_ ]; Yld = Yd[LLD_]; Ylinc = Yld; YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); } else { YinbD = Yd[IMB_ ]; YnbD = Yd[MB_ ]; Yld = Yd[LLD_]; Ylinc = 1; YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); } /* * Do sub( X ) and sub( Y ) span more than one process ? */ OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); /* * Are sub( X ) and sub( Y ) distributed in the same manner ? */ Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && ( XnprocsD == YnprocsD ) ); if( !( XisR ) ) { /* * sub( X ) is distributed but not replicated */ if( YisR ) { /* * If sub( X ) is not replicated, but sub( Y ) is, a process row or column * YprocR need to be selected. It will contain the non-replicated vector to * swap sub( X ) with. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing YprocR to be equal to XprocR. */ YprocR = XprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for YprocR to maximize the number of links, i.e reduce contention. */ YprocR = MModAdd1( XprocR, XnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ YprocR = XprocD; } } else { /* * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or * column XprocR and not in process row or column YprocR, then quick return. */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) return; } } else { /* * sub( X ) is distributed and replicated (so no quick return possible) */ if( YisR ) { /* * sub( Y ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR and YprocR to be equal * to zero. */ XprocR = YprocR = 0; } else { /* * Otherwise, communication has to occur, so select YprocR to be zero and the * next process row or column for XprocR in order to maximize the number of * used links, i.e reduce contention. */ YprocR = 0; XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the * origin processes. */ XprocR = YprocD; YprocR = XprocD; } } else { /* * sub( Y ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) { /* * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. * Enforce a purely local operation by choosing XprocR to be equal to YprocR. */ XprocR = YprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for XprocR to maximize the number of links, i.e reduce contention. */ XprocR = MModAdd1( YprocR, YnprocsR ); } } else { /* * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is * chosen for XprocR does not really matter. Select the origin process. */ XprocR = YprocD; } } } /* * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or * column are active, namely XprocR and YprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ type = PB_Cztypeset(); size = type->size; /* * A purely local operation occurs iff the operands start in the same process * and, if either the grid is mono-dimensional or there is a single local block * to be swapped or if both operands are aligned. */ if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !XisR && ( XmyprocR == XprocR ) && !YisR && ( YmyprocR == YprocR ) ) || ( !XisR && YisR && ( YmyprocR == YprocR ) ) || ( !YisR && XisR && ( XmyprocR == XprocR ) ) || ( XisR && YisR ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( ( XnpD > 0 ) && ( YnpD > 0 ) ) { zswap_( &XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } if( RRorCC && XisR && YisR ) return; } } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to swap the distributed vectors in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( RRorCC && ( XprocR != YprocR ) ) { /* * Both operands are distributed in the same direction, but reside in different * process rows or columns. */ if( XmyprocR == XprocR ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { dst = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); dst = MPosMod( dst, YnprocsD ); if( XisRow ) { Czgesd2d( ctxt, 1, XnpD, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, YprocR, dst ); Czgerv2d( ctxt, 1, XnpD, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, YprocR, dst ); } else { Czgesd2d( ctxt, XnpD, 1, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, dst, YprocR ); Czgerv2d( ctxt, XnpD, 1, Mptr( ((char*) X), Xii, Xjj, Xld, size ), Xld, dst, YprocR ); } } } if( YmyprocR == YprocR ) { YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); dst = MPosMod( dst, XnprocsD ); if( YisRow ) { Czgesd2d( ctxt, 1, YnpD, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, XprocR, dst ); Czgerv2d( ctxt, 1, YnpD, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, XprocR, dst ); } else { Czgesd2d( ctxt, YnpD, 1, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, dst, XprocR ); Czgerv2d( ctxt, YnpD, 1, Mptr( ((char*) Y), Yii, Yjj, Yld, size ), Yld, dst, XprocR ); } } } } else { /* * General case when just one message needs to be exchanged */ if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) send it to the corresponding * process owning s piece of sub ( Y ). */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { dst = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); dst = MPosMod( dst, YnprocsD ); if( YisRow ) { rdst = YprocR; cdst = dst; } else { rdst = dst; cdst = YprocR; } if( ( myrow == rdst ) && ( mycol == cdst ) ) { zswap_( &XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); } else { if( XisRow ) Czgesd2d( ctxt, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, rdst, cdst ); else Czgesd2d( ctxt, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, rdst, cdst ); } } } if( YmyprocR == YprocR ) { /* * The processes owning a piece of sub( Y ) receive the corresponding piece * of sub( X ) and send the piece of sub( Y ) they own to the same process. */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { src = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); src = MPosMod( src, XnprocsD ); if( XisRow ) { rsrc = XprocR; csrc = src; } else { rsrc = src; csrc = XprocR; } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( YnpD * size ); if( XisRow ) Czgerv2d( ctxt, 1, YnpD, buf, 1, rsrc, csrc ); else Czgerv2d( ctxt, YnpD, 1, buf, YnpD, rsrc, csrc ); if( YisRow ) Czgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rsrc, csrc ); else Czgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, rsrc, csrc ); zcopy_( &YnpD, buf, &ione, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), &Ylinc ); if( buf ) free( buf ); } } } if( XmyprocR == XprocR ) { /* * The processes owning a piece of sub( X ) receive the corresponding piece * of sub( Y ). */ if( XnpD > 0 ) { if( ( myrow != rdst ) || ( mycol != cdst ) ) { buf = PB_Cmalloc( XnpD * size ); if( YisRow ) Czgerv2d( ctxt, 1, XnpD, buf, 1, rdst, cdst ); else Czgerv2d( ctxt, XnpD, 1, buf, XnpD, rdst, cdst ); zcopy_( &XnpD, buf, &ione, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xlinc ); if( buf ) free( buf ); } } } } } else if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) ) { /* * General case */ tran = ( RRorCC ? CNOTRAN : CTRAN ); if( XisRow ) { Xscope = CCOLUMN; Xm = 1; rsrc = XprocR; } else { Xscope = CROW; Xn = 1; csrc = XprocR; } if( YisRow ) { Yscope = CCOLUMN; Ym = 1; rdst = YprocR; } else { Yscope = CROW; Yn = 1; cdst = YprocR; } lcmb = PB_Clcm( XnprocsD * XnbD, YnprocsD * YnbD ); one = type->one; zero = type->zero; gcdPQ = PB_Cgcd( XnprocsD, YnprocsD ); lcmPQ = ( XnprocsD / gcdPQ ) * YnprocsD; for( k = 0; k < gcdPQ; k++ ) { p = 0; q = k; for( l = 0; l < lcmPQ; l++ ) { Xroc = MModAdd( XprocD, p, XnprocsD ); Yroc = MModAdd( YprocD, q, YnprocsD ); if( ( XmyprocD == Xroc ) || ( YmyprocD == Yroc ) ) { XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, Xroc, XprocD, XnprocsD ); YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, Yroc, YprocD, YnprocsD ); PB_CVMinit( &VM, 0, XnpD, YnpD, Xinb1D, Yinb1D, XnbD, YnbD, p, q, XnprocsD, YnprocsD, lcmb ); if( npq = PB_CVMnpq( &VM ) ) { if( ( RRorCC && ( Xroc == Yroc ) && ( XprocR == YprocR ) ) || ( !( RRorCC ) && ( Xroc == YprocR ) && ( XprocR == Yroc ) ) ) { /* * If I am at the intersection of the process cross, or simply common to the * process rows or columns owning sub( X ) and sub( Y ) */ if( ( YmyprocD == Yroc ) && ( YmyprocR == YprocR ) ) { PB_CVMswp( type, &VM, ROW, &Xscope, &tran, npq, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Ylinc ); } } else { /* * Perform the message exchange: pack the data I own, send it, receive the * remote data, and unpack it. */ if( ( XmyprocR == XprocR ) && ( XmyprocD == Xroc ) ) { if( XisRow ) { Xn = npq; } else { Xm = npq; } if( YisRow ) { Yn = npq; cdst = Yroc; } else { Ym = npq; rdst = Yroc; } buf = PB_Cmalloc( npq * size ); PB_CVMpack( type, &VM, ROW, &Xscope, PACKING, NOTRAN, npq, 1, one, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, zero, buf, Xm ); Czgesd2d( ctxt, Xm, Xn, buf, Xm, rdst, cdst ); Czgerv2d( ctxt, Ym, Yn, buf, Ym, rdst, cdst ); PB_CVMpack( type, &VM, ROW, &Xscope, UNPACKING, &tran, npq, 1, zero, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, one, buf, Ym ); if( buf ) free ( buf ); } if( ( YmyprocR == YprocR ) && ( YmyprocD == Yroc ) ) { if( XisRow ) { Xn = npq; csrc = Xroc; } else { Xm = npq; rsrc = Xroc; } if( YisRow ) { Yn = npq; } else { Ym = npq; } buf = PB_Cmalloc( npq * size ); PB_CVMpack( type, &VM, COLUMN, &Yscope, PACKING, NOTRAN, npq, 1, one, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, zero, buf, Ym ); Czgesd2d( ctxt, Ym, Yn, buf, Ym, rsrc, csrc ); Czgerv2d( ctxt, Xm, Xn, buf, Xm, rsrc, csrc ); PB_CVMpack( type, &VM, COLUMN, &Yscope, UNPACKING, &tran, npq, 1, zero, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, one, buf, Xm ); if( buf ) free ( buf ); } } } } p = MModAdd1( p, XnprocsD ); q = MModAdd1( q, YnprocsD ); } } } if( XisR ) { /* * Replicate sub( X ) when necessary */ XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); if( XnpD > 0 ) { if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) Czgebs2d( ctxt, COLUMN, top, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld ); else Czgebr2d( ctxt, COLUMN, top, 1, XnpD, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) Czgebs2d( ctxt, ROW, top, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld ); else Czgebr2d( ctxt, ROW, top, XnpD, 1, Mptr( ((char *) X), Xii, Xjj, Xld, size ), Xld, XmyprocD, XprocR ); } } } if( YisR ) { /* * Replicate sub( Y ) when necessary */ YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocR == YprocR ) Czgebs2d( ctxt, COLUMN, top, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld ); else Czgebr2d( ctxt, COLUMN, top, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, YprocR, YmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocR == YprocR ) Czgebs2d( ctxt, ROW, top, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld ); else Czgebr2d( ctxt, ROW, top, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), Yld, YmyprocD, YprocR ); } } } } else if( !( XisD ) && YisD ) { /* * sub( X ) is not distributed and sub( Y ) is distributed. */ PB_CpswapND( PB_Cztypeset(), *N, ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY ); } else if( XisD && !( YisD ) ) { /* * sub( X ) is distributed and sub( Y ) is not distributed. */ PB_CpswapND( PB_Cztypeset(), *N, ((char *) Y), Yi, Yj, Yd, *INCY, ((char *) X), Xi, Xj, Xd, *INCX ); } else { /* * Neither sub( X ) nor sub( Y ) are distributed. */ PB_CpswapNN( PB_Cztypeset(), *N, ((char *) X), Xi, Xj, Xd, *INCX, ((char *) Y), Yi, Yj, Yd, *INCY ); } /* * End of PZSWAP */ } scalapack-2.0.2/PBLAS/SRC/pzsymm_.c000644 000766 000024 00000053267 10363532303 017056 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzsymm_( F_CHAR_T SIDE, F_CHAR_T UPLO, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pzsymm_( SIDE, UPLO, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T SIDE, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; double * A, * B, * C; #endif { /* * Purpose * ======= * * PZSYMM performs one of the matrix-matrix operations * * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * or * * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha and beta are scalars, sub( A ) is a symmetric submatrix and * sub( B ) and sub( C ) are m by n submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether the symmetric submatrix * sub( A ) appears on the left or right in the operation as * follows: * * SIDE = 'L' or 'l' * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * SIDE = 'R' or 'r' * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric submatrix sub( A ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( A ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( A ) are to be * referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with SIDE = 'L' or 'l', this array contains * the local entries corresponding to the entries of the m by m * symmetric submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the symmetric submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * Before entry with SIDE = 'R' or 'r', this array contains * the local entries corresponding to the entries of the n by n * symmetric submatrix sub( A ), such that when UPLO = 'U' or * 'u', this array contains the local entries of the upper tri- * angular part of the symmetric submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric submatrix sub( A ), and the local entries of * the strictly upper triangular of sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAB, SideOp, UploA, cbtop, cbtopsave, cctop, cctopsave, rbtop, rbtopsave, rctop, rctopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, lside, mycol, myrow, nb, npcol, nprow, upper; double ABCest, BCest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PZSYMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZSYMM", "Illegal UPLO = %c\n", UploA ); info = -2; } if( lside ) { PB_Cchkmat( ctxt, "PZSYMM", "A", *M, 3, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZSYMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PZSYMM", "A", *N, 4, *N, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZSYMM", "B", *M, 3, *N, 4, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PZSYMM", "C", *M, 3, *N, 4, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PZSYMM", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) && ( ( BETA [REAL_PART] == ONE ) && ( BETA [IMAG_PART] == ZERO ) ) ) ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * If alpha is zero, sub( C ) := beta * sub( C ). */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else if( !( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) { PB_Cplascal( type, ALL, NOCONJG, *M, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (N >> M) * BCest : Both operands sub( B ) and sub( C ) are communicated (M >> N) */ if( lside ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *N, Bd[NB_], npcol ); ABCest = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); tmp4 = DNROC( *M, Cd[MB_], nprow ); BCest = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp1 = DNROC( *N, Ad[NB_], npcol ); tmp2 = DNROC( *M, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp2 + tmp2 * CBRATIO ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); tmp4 = DNROC( *N, Cd[NB_], npcol ); BCest = (double)(*M) * ( ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.5 * ABCest ) <= BCest ); /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( ChooseABC ) { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } } } if( lside ) DirAB = ( rbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAB = ( cbtop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CpsymmAB( type, &DirAB, NOCONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( ForceTop ) { rbtopsave = rbtop; rctopsave = rctop; cbtopsave = cbtop; cctopsave = cctop; if( lside ) { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( rbtop != CTOP_DRING ) && ( rbtop != CTOP_IRING ) && ( rbtop != CTOP_SRING ) ) || ( rbtop != rctop ) ) { rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } else { /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ( cbtop != CTOP_DRING ) && ( cbtop != CTOP_IRING ) && ( cbtop != CTOP_SRING ) ) || ( cbtop != cctop ) ) { cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove the next 2 lines when the BLACS combine operations support ring * topologies */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); } rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_DEFAULT ); rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); } } if( lside ) DirAB = ( ( rbtop == CTOP_DRING || rctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); else DirAB = ( ( cbtop == CTOP_DRING || cctop == CTOP_DRING ) ? CBACKWARD : CFORWARD ); PB_CpsymmBC( type, &DirAB, NOCONJG, &SideOp, &UploA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rbtopsave = *PB_Ctop( &ctxt, BCAST, ROW, &rbtopsave ); rctopsave = *PB_Ctop( &ctxt, COMBINE, ROW, &rctopsave ); cbtopsave = *PB_Ctop( &ctxt, BCAST, COLUMN, &cbtopsave ); cctopsave = *PB_Ctop( &ctxt, COMBINE, COLUMN, &cctopsave ); } /* * End of PZSYMM */ } scalapack-2.0.2/PBLAS/SRC/pzsyr2k_.c000644 000766 000024 00000052311 10363532303 017130 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzsyr2k_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pzsyr2k_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IB, * IC, * JA, * JB, * JC, * K, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; double * A, * B, * C; #endif { /* * Purpose * ======= * * PZSYR2K performs one of the symmetric rank 2k operations * * sub( C ) := alpha*sub( A )*sub( B )' + alpha*sub( B )*sub( A )' + * beta*sub( C ), * * or * * sub( C ) := alpha*sub( A )'*sub( B ) + alpha*sub( B )'*sub( A ) + * beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise, and, * * sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N', * B(IB:IB+K-1,JB:JB+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * submatrix and sub( A ) and sub( B ) are n by k submatrices in the * first case and k by n submatrices in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the symmetric submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( B )' + * alpha*sub( B )*sub( A )' + * beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrices sub( A ) and sub( B ), and on * entry with TRANS = 'T' or 't', K specifies the number of rows * of the submatrices sub( A ) and sub( B ). K must be at least * zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and B corresponding to the entries of the submatrices * sub( A ) and sub( B ) respectively need not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JB+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( B ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( B ). * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseABC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double ABCest, ABest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZSYR2K", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZSYR2K", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) { PB_Cchkmat( ctxt, "PZSYR2K", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZSYR2K", "B", *N, 3, *K, 4, Bi, Bj, Bd, 13, &info ); } else { PB_Cchkmat( ctxt, "PZSYR2K", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZSYR2K", "B", *K, 4, *N, 3, Bi, Bj, Bd, 13, &info ); } PB_Cchkmat( ctxt, "PZSYR2K", "C", *N, 3, *N, 3, Ci, Cj, Cd, 18, &info ); } if( info ) { PB_Cabort( ctxt, "PZSYR2K", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) && ( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * And when alpha or K is zero */ if( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABCest: operands sub( A ), sub( B ) and sub( C ) are communicated (K >> N) * ABest : only sub( A ) and sub( B ) are communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); tmp4 = DNROC( *K, Bd[NB_], npcol ); ABCest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[CSRC_] == -1 ) && ( Bd[CSRC_] == -1 ) ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[MB_], nprow ); ABest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + TWO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp3 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *K, Ad[MB_], nprow ); tmp4 = DNROC( *K, Bd[MB_], nprow ); ABCest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( ( Ad[RSRC_] == -1 ) && ( Bd[RSRC_] == -1 ) ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + TWO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) + MAX( tmp1, tmp4 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseABC = ( ( 1.4 * ABCest ) <= ABest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseABC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_Cpsyr2kA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PZSYR2K */ } scalapack-2.0.2/PBLAS/SRC/pzsyrk_.c000644 000766 000024 00000045133 10363532303 017052 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pzsyrk_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * N, int * K, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pzsyrk_( UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * K, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PZSYRK performs one of the symmetric rank k operations * * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * * or * * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and, * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * submatrix and sub( A ) is an n by k submatrix in the first case and a * k by n submatrix in the second case. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the symmetric submatrix sub( C ) are to be referenced as * follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * symmetric submatrix sub( C ) are to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * symmetric submatrix sub( C ) are to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry, with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrix sub( A ), and with TRANS = 'T' or * 't', K specifies the number of rows of the submatrix * sub( A ). K must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirA, OpC, OpR, TopC, TopR, TranOp, UploC, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, ChooseAC, Ci, Cj, ForceTop, ctxt, info, mycol, myrow, nb, notran, npcol, nprow, upper; double Aest, ACest, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZSYRK", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZSYRK", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PZSYRK", "A", *N, 3, *K, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PZSYRK", "A", *K, 4, *N, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZSYRK", "C", *N, 3, *N, 3, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PZSYRK", info ); return; } #endif /* * Quick return if possible */ if( ( *N == 0 ) || ( ( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) && ( ( BETA[REAL_PART] == ONE ) && ( BETA[IMAG_PART] == ZERO ) ) ) ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * And when alpha or K is zero */ if( ( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) || ( *K == 0 ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, &UploC, NOCONJG, *N, *N, type->zero, type->zero, ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( type, &UploC, NOCONJG, *N, *N, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); } return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ACest: both operands sub( A ) and sub( C ) are communicated (K >> N) * Aest : only sub( A ) is communicated (N >> K) */ if( notran ) { tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp3 = DNROC( *K, Ad[NB_], npcol ); ACest = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp3 ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : CBRATIO * tmp1 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *N, Ad[MB_], nprow ); Aest = (double)(*K) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); } else { tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp4 = DNROC( *K, Ad[MB_], nprow ); ACest = (double)(*N) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp4 ) + ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : CBRATIO * tmp2 / TWO ) ); tmp1 = DNROC( *N, Cd[MB_], nprow ); tmp2 = DNROC( *N, Cd[NB_], npcol ); tmp3 = DNROC( *N, Ad[NB_], npcol ); Aest = (double)(*K) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); } /* * Shift a little the cross-over point between both algorithms. */ ChooseAC = ( ( 1.3 * ACest ) <= Aest ); /* * BLACS topologies are enforced iff N and K are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *N > nb ) && ( *K > nb ) ); if( ChooseAC ) { if( notran ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CCOMBINE; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; if( upper ) { TopR = CTOP_IRING; TopC = CTOP_DRING; } else { TopR = CTOP_DRING; TopC = CTOP_IRING; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkAC( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } else { if( notran ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); if( ForceTop ) { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); } else { OpC = CBCAST; ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { OpR = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); } DirA = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); } PB_CpsyrkA( type, &DirA, NOCONJG, &UploC, ( notran ? NOTRAN : TRAN ), *N, *K, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)BETA), ((char *)C), Ci, Cj, Cd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PZSYRK */ } scalapack-2.0.2/PBLAS/SRC/pztradd_.c000644 000766 000024 00000033734 10363532303 017164 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pztradd_( F_CHAR_T UPLO, F_CHAR_T TRANS, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pztradd_( UPLO, TRANS, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ F_CHAR_T TRANS, UPLO; int * IA, * IC, * JA, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PZTRADD adds a trapezoidal matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if TRANS = 'N', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'T', * conjg(A(IA:IA+N-1,JA:JA+M-1)') if TRANS = 'C', * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * upper or lower trapezoidal submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the local pieces of the * array C containing the upper or lower triangular part of the * triangular submatrix sub( C ) is to be referenced as follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * triangular submatrix sub( C ) is to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * triangular submatrix sub( C ) is to be * referenced. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the form of op( sub( A ) ) to be * used in the matrix addition as follows: * * TRANS = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANS = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANS = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ) when TRANS = 'N' or 'n' and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u' and TRANS = 'N' or 'n' or * UPLO = 'L' or 'l' and TRANS = 'T', 'C', 't' or 'c', this ar- * ray contains the local entries corresponding to the entries * of the upper triangular submatrix sub( A ), and the local en- * tries corresponding to the entries of the strictly lower tri- * angular part of the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l' and TRANS = 'N' or 'n' or * UPLO = 'U' or 'u' and TRANS = 'T', 'C', 't' or 'c', this ar- * ray contains the local entries corresponding to the entries * of the lower triangular submatrix sub( A ), and the local en- * tries corresponding to the entries of the strictly upper tri- * angular part of the submatrix sub( A ) are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the triangular submatrix sub( C ), and the local entries * corresponding to the strictly lower triangular of sub( C ) * are not referenced. On exit, the upper triangular part of * sub( C ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the triangular submatrix sub( C ), and the local entries * corresponding to the strictly upper triangular of sub( C ) * are not referenced. On exit, the lower triangular part of * sub( C ) is overwritten by the lower triangular part of the * updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DirAC, TranOp, UploC, ctop, rtop; int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, notran, npcol, nprow, upper; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ upper = ( ( UploC = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 901 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploC != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRADD", "Illegal UPLO = %c\n", UploC ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRADD", "Illegal TRANS = %c\n", TranOp ); info = -2; } if( notran ) PB_Cchkmat( ctxt, "PZTRADD", "A", *M, 3, *N, 4, Ai, Aj, Ad, 9, &info ); else PB_Cchkmat( ctxt, "PZTRADD", "A", *N, 4, *M, 3, Ai, Aj, Ad, 9, &info ); PB_Cchkmat( ctxt, "PZTRADD", "C", *M, 3, *N, 4, Ci, Cj, Cd, 14, &info ); } if( info ) { PB_Cabort( ctxt, "PZTRADD", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( PB_Cztypeset(), &UploC, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cztypeset(), &UploC, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ /* * This operation mainly involves point-to-point send and receive communication. * There is therefore no particular BLACS topology to recommend. Still, one can * choose the main loop direction in which the operands will be added, but not * transposed. This selection is based on the current setting for the BLACS * broadcast operations. */ rtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( *M <= *N ) DirAC = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirAC = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_Cptradd( PB_Cztypeset(), &DirAC, &UploC, ( notran ? NOTRAN : ( ( TranOp == CCOTRAN ) ? COTRAN : TRAN ) ), *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PZTRADD */ } scalapack-2.0.2/PBLAS/SRC/pztranc_.c000644 000766 000024 00000022736 10363532303 017175 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pztranc_( int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pztranc_( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ int * IA, * IC, * JA, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PZTRANC transposes a matrix * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+M-1), and, * * op( X ) = conjg( X )'. * * Thus, op( sub( A ) ) denotes conjg( A(IA:IA+N-1,JA:JA+M-1)' ). * * Beta is a scalar, sub( C ) is an m by n submatrix, and sub( A ) is an * n by m submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkmat( ctxt, "PZTRANC", "A", *N, 2, *M, 1, Ai, Aj, Ad, 7, &info ); PB_Cchkmat( ctxt, "PZTRANC", "C", *M, 1, *N, 2, Ci, Cj, Cd, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PZTRANC", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( PB_Cztypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cztypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ PB_Cptran( PB_Cztypeset(), CONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PZTRANC */ } scalapack-2.0.2/PBLAS/SRC/pztranu_.c000644 000766 000024 00000022704 10363532303 017212 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pztranu_( int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * BETA, double * C, int * IC, int * JC, int * DESCC ) #else void pztranu_( M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ int * IA, * IC, * JA, * JC, * M, * N; double * ALPHA, * BETA; /* * .. Array Arguments .. */ int * DESCA, * DESCC; double * A, * C; #endif { /* * Purpose * ======= * * PZTRANU transposes a matrix * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+M-1), and, op( X ) = X'. * * Thus, op( sub( A ) ) denotes A(IA:IA+N-1,JA:JA+M-1)'. * * Beta is a scalar, sub( C ) is an m by n submatrix, and sub( A ) is an * n by m submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) COMPLEX*16 array * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Ai, Aj, Ci, Cj, ctxt, info, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Ad[DLEN_], Cd[DLEN_]; /* .. * .. Executable Statements .. * */ PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IC, *JC, DESCC, &Ci, &Cj, Cd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) ) { PB_Cchkmat( ctxt, "PZTRANU", "A", *N, 2, *M, 1, Ai, Aj, Ad, 7, &info ); PB_Cchkmat( ctxt, "PZTRANU", "C", *M, 1, *N, 2, Ci, Cj, Cd, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PZTRANU", info ); return; } #endif /* * Quick return if possible */ if( ( *M == 0 ) || ( *N == 0 ) || ( ( ALPHA[REAL_PART] == ZERO && ALPHA[IMAG_PART] == ZERO ) && ( BETA [REAL_PART] == ONE && BETA [IMAG_PART] == ZERO ) ) ) return; /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { if( ( BETA[REAL_PART] == ZERO ) && ( BETA[IMAG_PART] == ZERO ) ) { PB_Cplapad( PB_Cztypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char *)BETA), ((char *) C), Ci, Cj, Cd ); } else { PB_Cplascal( PB_Cztypeset(), ALL, NOCONJG, *M, *N, ((char *)BETA), ((char * )C), Ci, Cj, Cd ); } return; } /* * Start the operations */ PB_Cptran( PB_Cztypeset(), NOCONJG, *M, *N, ((char *) ALPHA), ((char *) A), Ai, Aj, Ad, ((char *) BETA), ((char *) C), Ci, Cj, Cd ); /* * End of PZTRANU */ } scalapack-2.0.2/PBLAS/SRC/pztrmm_.c000644 000766 000024 00000052443 10363532303 017043 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pztrmm_( F_CHAR_T SIDE, F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB ) #else void pztrmm_( SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, SIDE, TRANS, UPLO; int * IA, * IB, * JA, * JB, * M, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCB; double * A, * B; #endif { /* * Purpose * ======= * * PZTRMM performs one of the matrix-matrix operations * * sub( B ) := alpha * op( sub( A ) ) * sub( B ), * * or * * sub( B ) := alpha * sub( B ) * op( sub( A ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, sub( B ) is an m by n submatrix, sub( A ) is a * unit, or non-unit, upper or lower triangular submatrix and op( X ) is * one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether op( sub( A ) ) multiplies * sub( B ) from the left or right as follows: * * SIDE = 'L' or 'l' sub( B ) := alpha*op( sub( A ) )*sub( B ), * * SIDE = 'R' or 'r' sub( B ) := alpha*sub( B )*op( sub( A ) ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) COMPLEX*16 array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n transformed submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, DirB, OpC, OpR, SideOp, TopC, TopR, TranOp, UploA, Var, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ForceTop, ctxt, info, lside, mycol, myrow, nb, notran, nounit, npcol, nprow, upper; double ABestL, ABestR, Best, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1101 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRMM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRMM", "Illegal UPLO = %c\n", UploA ); info = -2; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRMM", "Illegal TRANS = %c\n", TranOp ); info = -3; } if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRMM", "Illegal DIAG = %c\n", DiagA ); info = -4; } if( lside ) PB_Cchkmat( ctxt, "PZTRMM", "A", *M, 5, *M, 5, Ai, Aj, Ad, 11, &info ); else PB_Cchkmat( ctxt, "PZTRMM", "A", *N, 6, *N, 6, Ai, Aj, Ad, 11, &info ); PB_Cchkmat( ctxt, "PZTRMM", "B", *M, 5, *N, 6, Bi, Bj, Bd, 15, &info ); } if( info ) { PB_Cabort( ctxt, "PZTRMM", info ); return; } #endif /* * Quick return if possible */ if( *M == 0 || *N == 0 ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) B), Bi, Bj, Bd ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. * * ABestR, ABestL : both operands sub( A ) and sub( B ) are communicated * ( N >> M when SIDE is left and M >> N otherwise ) * Best : only sub( B ) is communicated * ( M >> N when SIDE is left and N >> M otherwise ) */ if( lside ) { if( notran ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); ChooseAB = ( ( 1.1 * ABestR ) <= Best ); } else { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestL = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + MAX( tmp2, tmp1 ) / TWO ); tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 ) + CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + MAX( tmp2, tmp4 ) ); ChooseAB = ( ( ( 1.1 * ABestL ) <= Best ) || ( ( 1.1 * ABestR ) <= Best ) ); } } else { if( notran ) { tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( CBRATIO * ( nprow == 1 ? ZERO : tmp2 ) + ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); ChooseAB = ( ( 1.1 * ABestR ) <= Best ); } else { tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestL = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + CBRATIO * ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + MAX( tmp2, tmp1 ) / TWO ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 ) + CBRATIO * ( npcol == 1 ? ZERO : tmp1 ) + MAX( tmp1, tmp3 ) ); ChooseAB = ( ( ( 1.1 * ABestL ) <= Best ) || ( ( 1.1 * ABestR ) <= Best ) ); } } /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); ForceTop = ( ( *M > nb ) && ( *N > nb ) ); if( ChooseAB ) { if( lside ) { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { if( ABestL <= ABestR ) { OpR = CBCAST; OpC = CCOMBINE; Var = CLEFT; if( upper ) { TopR = CTOP_DRING; TopC = CTOP_IRING; } else { TopR = CTOP_IRING; TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } } } else { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { if( ABestL <= ABestR ) { OpR = CCOMBINE; OpC = CBCAST; Var = CLEFT; if( upper ) { TopR = CTOP_DRING; TopC = CTOP_IRING; } else { TopR = CTOP_IRING; TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } PB_CptrmmAB( type, &Var, &SideOp, &UploA, &TranOp, &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); } else { if( ( lside && notran ) || ( !( lside ) && !( notran ) ) ) { OpR = CCOMBINE; OpC = CBCAST; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( rtop != CTOP_DRING ) && ( rtop != CTOP_IRING ) && ( rtop != CTOP_SRING ) ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_SRING ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); } } else { OpR = CBCAST; OpC = CCOMBINE; rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ForceTop ) { rtopsave = rtop; ctopsave = ctop; /* * No clear winner for the ring topologies, so that if a ring topology is * already selected, keep it. */ if( ( ctop != CTOP_DRING ) && ( ctop != CTOP_IRING ) && ( ctop != CTOP_SRING ) ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_SRING ); rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); /* * Remove the next line when the BLACS combine operations support ring * topologies */ ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } } if( lside ) DirB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirB = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CptrmmB( type, &DirB, &SideOp, &UploA, &TranOp, &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); } /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PZTRMM */ } scalapack-2.0.2/PBLAS/SRC/pztrmv_.c000644 000766 000024 00000047637 10363532303 017065 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pztrmv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pztrmv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; /* * .. Array Arguments .. */ int * DESCA, * DESCX; double * A, * X; #endif { /* * Purpose * ======= * * PZTRMV performs one of the matrix-vector operations * * sub( X ) := sub( A )*sub( X ) or sub( X ) := sub( A )'*sub( X ) * * or * * sub( X ) := conjg( sub( A )' )*sub( X ), * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * sub( X ) is an n element subvector and sub( A ) is an n by n unit, * or non-unit, upper or lower triangular submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( X ) := sub( A ) * sub( X ). * * TRANS = 'T' or 't' sub( X ) := sub( A )' * sub( X ). * * TRANS = 'C' or 'c' * sub( X ) := conjg( sub( A )' ) * sub( X ). * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with the transfor- * med subvector. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Xroc, * one, * tbeta, top, *zero; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, XAfr, XAld, Xcol, Xi, Xii, Xj, Xjj, Xld, Xnp, Xnq, Xrow, YAfr, YAld, YApbY, YAsum, ctxt, info, ione=1, k, kb, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad [DLEN_], Ad0[DLEN_], XAd[DLEN_], Xd[DLEN_], YAd[DLEN_]; char * Aptr = NULL, * XA = NULL, * YA = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRMV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRMV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRMV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PZTRMV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info ); PB_Cchkvec( ctxt, "PZTRMV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PZTRMV", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); size = type->size; one = type->one; zero = type->zero; /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN ); if( notran ) { /* * Replicate sub( X ) in process rows spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, ROW, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XA, XAd, &XAfr ); /* * Reuse sub( X ) and/or create vector YA in process columns spanned by sub( A ) */ PB_CInOutV( type, COLUMN, *N, *N, Ad0, 1, one, ((char *) X), Xi, Xj, Xd, &Xroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * If sub( X ) is distributed in (a) process column(s), then zero it. */ if( Xroc == CCOLUMN ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * sub( X ) resides in (a) process columns(s) */ if( ( mycol == Xcol ) || ( Xcol < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnp = PB_Cnumroc( *N, Xi, Xd[IMB_], Xd[MB_], myrow, Xd[RSRC_], nprow ); if( Xnp > 0 ) { zset_( &Xnp, zero, Mptr( ((char *) X), Xii, Xjj, Xd[LLD_], size ), &ione ); } } } } else { /* * Replicate sub( X ) in process columns spanned by sub( A ) -> XA */ PB_CInV( type, NOCONJG, COLUMN, *N, *N, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XA, XAd, &XAfr ); /* * Reuse sub( X ) and/or create vector YA in process rows spanned by sub( A ) */ PB_CInOutV( type, ROW, *N, *N, Ad0, 1, one, ((char *) X), Xi, Xj, Xd, &Xroc, &tbeta, &YA, YAd, &YAfr, &YAsum, &YApbY ); /* * If sub( X ) is distributed in (a) process row(s), then zero it. */ if( Xroc == CROW ) { /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol */ PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); /* * sub( X ) resides in (a) process row(s) */ if( ( myrow == Xrow ) || ( Xrow < 0 ) ) { /* * Make sure I own some data and scale sub( X ) */ Xnq = PB_Cnumroc( *N, Xj, Xd[INB_], Xd[NB_], mycol, Xd[CSRC_], npcol ); if( Xnq > 0 ) { Xld = Xd[LLD_]; zset_( &Xnq, zero, Mptr( ((char *) X), Xii, Xjj, Xld, size ), &Xld ); } } } } /* * Local matrix-vector multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Acol = Ad0[CSRC_]; Arow = Ad0[RSRC_]; Amp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XAld = XAd[LLD_]; YAld = YAd[LLD_]; /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); if( upper ) { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { zgemv_( TRANS, &Akp, &Anq0, one, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, YA, &ione ); } PB_Cptrm( type, type, LEFT, UPPER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, size ), YAld, PB_Ctztrmv ); } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 && Anq0 > 0 ) { zgemv_( TRANS, &Akp, &Anq0, one, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, XA, &ione, one, Mptr( YA, 0, Akq, YAld, size ), &YAld ); } PB_Cptrm( type, type, LEFT, UPPER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, size ), YAld, PB_Ctztrmv ); } } } else { if( notran ) { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, type, LEFT, LOWER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, 0, Akq, XAld, size ), XAld, Mptr( YA, Akp, 0, YAld, size ), YAld, PB_Ctztrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { zgemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, 0, Akq, XAld, size ), &XAld, one, Mptr( YA, Akp, 0, YAld, size ), &ione ); } } } else { for( k = 0; k < *N; k += nb ) { kb = *N - k; ktmp = k + ( kb = MIN( kb, nb ) ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( type, type, LEFT, LOWER, &TranOp, &DiagA, kb, 1, one, Aptr, k, k, Ad0, Mptr( XA, Akp, 0, XAld, size ), XAld, Mptr( YA, 0, Akq, YAld, size ), YAld, PB_Ctztrmv ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Amp0 = Amp - Akp; Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Amp0 > 0 && Anq0 > 0 ) { zgemv_( TRANS, &Amp0, &Anq0, one, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XA, Akp, 0, XAld, size ), &ione, one, Mptr( YA, 0, Akq, YAld, size ), &YAld ); } } } } } if( XAfr ) free( XA ); if( notran ) { /* * Combine the partial column results into YA */ if( YAsum && ( Amp > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Czgsum2d( ctxt, ROW, &top, Amp, 1, YA, YAd[LLD_], myrow, YAd[CSRC_] ); } /* * sub( X ) := YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, YA, 0, 0, YAd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } } else { /* * Combine the partial row results into YA */ if( YAsum && ( Anq > 0 ) ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Czgsum2d( ctxt, COLUMN, &top, 1, Anq, YA, YAd[LLD_], YAd[RSRC_], mycol ); } /* * sub( X ) := YA (if necessary) */ if( YApbY ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, YA, 0, 0, YAd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } } if( YAfr ) free( YA ); /* * End of PZTRMV */ } scalapack-2.0.2/PBLAS/SRC/pztrsm_.c000644 000766 000024 00000052410 11622500733 017044 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pztrsm_( F_CHAR_T SIDE, F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * M, int * N, double * ALPHA, double * A, int * IA, int * JA, int * DESCA, double * B, int * IB, int * JB, int * DESCB ) #else void pztrsm_( SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, SIDE, TRANS, UPLO; int * IA, * IB, * JA, * JB, * M, * N; double * ALPHA; /* * .. Array Arguments .. */ int * DESCA, * DESCB; double * A, * B; #endif { /* * Purpose * ======= * * PZTRSM solves one of the matrix equations * * op( sub( A ) )*X = alpha*sub( B ), or * * X*op( sub( A ) ) = alpha*sub( B ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is * a unit, or non-unit, upper or lower triangular submatrix and op( Y ) * is one of * * op( Y ) = Y or op( Y ) = Y' or op( Y ) = conjg( Y' ). * * The submatrix X is overwritten on sub( B ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * On entry, SIDE specifies whether op( sub( A ) ) appears on * the left or right of X as follows: * * SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ), * * SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ). * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) CHARACTER*1 * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) COMPLEX*16 array * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n solution submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, DirB, OpC, OpR, SideOp, TopC, TopR, TranOp, UploA, Var, ctop, ctopsave, rtop, rtopsave; int Ai, Aj, Bi, Bj, ChooseAB, ForceTop, ctxt, info, itmp, lside, mycol, myrow, nb, notran, nounit, npcol, nprow, upper; double ABestL, ABestR, Best, tmp1, tmp2, tmp3, tmp4; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Bd[DLEN_]; /* .. * .. Executable Statements .. * */ lside = ( ( SideOp = Mupcase( F2C_CHAR( SIDE )[0] ) ) == CLEFT ); upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IB, *JB, DESCB, &Bi, &Bj, Bd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 1101 + CTXT_ ) : 0 ) ) ) { if( ( !lside ) && ( SideOp != CRIGHT ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRSM", "Illegal SIDE = %c\n", SideOp ); info = -1; } else if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRSM", "Illegal UPLO = %c\n", UploA ); info = -2; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRSM", "Illegal TRANS = %c\n", TranOp ); info = -3; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRSM", "Illegal DIAG = %c\n", DiagA ); info = -4; } if( lside ) PB_Cchkmat( ctxt, "PZTRSM", "A", *M, 5, *M, 5, Ai, Aj, Ad, 11, &info ); else PB_Cchkmat( ctxt, "PZTRSM", "A", *N, 6, *N, 6, Ai, Aj, Ad, 11, &info ); PB_Cchkmat( ctxt, "PZTRSM", "B", *M, 5, *N, 6, Bi, Bj, Bd, 15, &info ); } if( info ) { PB_Cabort( ctxt, "PZTRSM", info ); return; } #endif /* * Quick return if possible */ if( *M == 0 || *N == 0 ) return; /* * Get type structure */ type = PB_Cztypeset(); /* * And when alpha is zero */ if( ( ALPHA[REAL_PART] == ZERO ) && ( ALPHA[IMAG_PART] == ZERO ) ) { PB_Cplapad( type, ALL, NOCONJG, *M, *N, type->zero, type->zero, ((char *) B), Bi, Bj, Bd ); return; } /* * Start the operations */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Algorithm selection is based on approximation of the communication volume * for distributed and aligned operands. */ nb = pilaenv_( &ctxt, C2F_CHAR( &type->type ) ); /* * ABestR, ABestL : both operands sub( A ) and sub( B ) are communicated * ( N >> M when SIDE is left and M >> N otherwise ) * Best : only sub( B ) is communicated * ( M >> N when SIDE is left and N >> M otherwise ) */ if( lside ) { if( notran ) { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); itmp = MIN( Ad[MB_], Ad[NB_] ); Best = (double)(*N) * ( (double)(CEIL( *M, itmp )) * (double)(itmp) * ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ); ChooseAB = ( ABestR <= ( 2.0 * Best ) ); } else { tmp1 = DNROC( *M, Ad[MB_], nprow ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp4 = DNROC( *N, Bd[NB_], npcol ); ABestL = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + CBRATIO * ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) ); ABestR = (double)(*M) * ( ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp1 / TWO ) + ( ( ( Bd[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp4 ) + MAX( tmp2, tmp1 ) / TWO ); itmp = MIN( Ad[MB_], Ad[NB_] ); tmp2 = DNROC( *M, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); Best = (double)(*N) * ( (double)(CEIL( *M, itmp )) * (double)(itmp) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ) + MAX( tmp2, tmp3 ) ); ChooseAB = ( ( ABestL <= ( 2.0 * Best ) ) || ( ABestR <= ( 2.0 * Best ) ) ); } } else { if( notran ) { tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); itmp = MIN( Ad[MB_], Ad[NB_] ); Best = (double)(*M) * ( (double)(CEIL( *N, itmp )) * (double)(itmp) * ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ); ChooseAB = ( ABestR <= ( 2.0 * Best ) ); } else { tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp2 = DNROC( *N, Ad[NB_], npcol ); tmp3 = DNROC( *M, Bd[MB_], nprow ); ABestL = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + CBRATIO * ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) ); ABestR = (double)(*N) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : tmp2 / TWO ) + ( ( ( Bd[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : tmp3 ) + MAX( tmp2, tmp1 ) / TWO ); itmp = MIN( Ad[MB_], Ad[NB_] ); tmp1 = DNROC( *N, Ad[MB_], nprow ); tmp4 = DNROC( *N, Bd[NB_], npcol ); Best = (double)(*M) * ( (double)(CEIL( *N, itmp )) * (double)(itmp) * ( ( ( ( Ad[RSRC_] == -1 ) || ( nprow == 1 ) ) ? ZERO : ONE ) + ( ( ( Ad[CSRC_] == -1 ) || ( npcol == 1 ) ) ? ZERO : ONE ) ) + MAX( tmp1, tmp4 ) ); ChooseAB = ( ( ABestL <= ( 2.0 * Best ) ) || ( ABestR <= ( 2.0 * Best ) ) ); } } /* * Var can remain uninitialized but is nevertheless used in PB_CptrsmAB.c * provide a default here. TODO: does this make sense ? *==19891== at 0x44F81B: PB_CptrsmAB (PB_CptrsmAB.c:538) *==19891== by 0x427BE7: pdtrsm_ (pdtrsm_.c:488) *==19891== by 0x405E46: MAIN_ (pdblas3tim.f:727) */ Var = CRIGHT; if( ChooseAB ) { /* * BLACS topologies are enforced iff M and N are strictly greater than the * logical block size returned by pilaenv_. Otherwise, it is assumed that the * routine calling this routine has already selected an adequate topology. */ ForceTop = ( ( *M > nb ) && ( *N > nb ) ); if( ForceTop ) { if( lside ) { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { if( ABestL <= ABestR ) { OpR = CBCAST; OpC = CCOMBINE; Var = CLEFT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } } } else { if( notran ) { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } } else { if( ABestL <= ABestR ) { OpR = CCOMBINE; OpC = CBCAST; Var = CLEFT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } else { OpR = CBCAST; OpC = CBCAST; Var = CRIGHT; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } } } } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); /* * Remove the next 4 lines when the BLACS combine operations support ring * topologies */ if( OpR == CCOMBINE ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_DEFAULT ); if( OpC == CCOMBINE ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_DEFAULT ); } PB_CptrsmAB( type, &Var, &SideOp, &UploA, &TranOp, &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); /* * Restore the BLACS topologies when necessary. */ if( ForceTop ) { rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } } else { /* * BLACS topologies are always enforced. */ if( ( lside && notran ) || ( !lside && !notran ) ) { OpR = CCOMBINE; OpC = CBCAST; if( upper ) { TopR = TopC = CTOP_DRING; } else { TopR = TopC = CTOP_IRING; } /* * Remove the next line when the BLACS combine operations support ring * topologies */ TopR = CTOP_DEFAULT; } else { OpR = CBCAST; OpC = CCOMBINE; if( upper ) { TopR = TopC = CTOP_IRING; } else { TopR = TopC = CTOP_DRING; } /* * Remove the next line when the BLACS combine operations support ring * topologies */ TopC = CTOP_DEFAULT; } rtop = *PB_Ctop( &ctxt, &OpR, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, TOP_GET ); if( ( rtopsave = rtop ) != TopR ) rtop = *PB_Ctop( &ctxt, &OpR, ROW, &TopR ); if( ( ctopsave = ctop ) != TopC ) ctop = *PB_Ctop( &ctxt, &OpC, COLUMN, &TopC ); if( lside ) DirB = ( rtop == CTOP_DRING ? CBACKWARD : CFORWARD ); else DirB = ( ctop == CTOP_DRING ? CBACKWARD : CFORWARD ); PB_CptrsmB( type, &DirB, &SideOp, &UploA, &TranOp, &DiagA, *M, *N, ((char *)ALPHA), ((char *)A), Ai, Aj, Ad, ((char *)B), Bi, Bj, Bd ); /* * Restore the BLACS topologies. */ rtopsave = *PB_Ctop( &ctxt, &OpR, ROW, &rtopsave ); ctopsave = *PB_Ctop( &ctxt, &OpC, COLUMN, &ctopsave ); } /* * End of PZTRSM */ } scalapack-2.0.2/PBLAS/SRC/pztrsv_.c000644 000766 000024 00000076150 10363532303 017063 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" #include "PBpblas.h" #include "PBtools.h" #include "PBblacs.h" #include "PBblas.h" #ifdef __STDC__ void pztrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, double * A, int * IA, int * JA, int * DESCA, double * X, int * IX, int * JX, int * DESCX, int * INCX ) #else void pztrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, DESCX, INCX ) /* * .. Scalar Arguments .. */ F_CHAR_T DIAG, TRANS, UPLO; int * IA, * INCX, * IX, * JA, * JX, * N; /* * .. Array Arguments .. */ int * DESCA, * DESCX; double * A, * X; #endif { /* * Purpose * ======= * * PZTRSV solves one of the systems of equations * * sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b, or * * conjg( sub( A )' )*sub( X ) = b, * * where * * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. * * b and sub( X ) are n element subvectors and sub( A ) is an n by n * unit, or non-unit, upper or lower triangular submatrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( A ) * sub( X ) = b. * * TRANS = 'T' or 't' sub( A )' * sub( X ) = b. * * TRANS = 'C' or 'c' conjg( sub( A )' ) * sub( X ) = b. * * DIAG (global input) CHARACTER*1 * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input/local output) COMPLEX*16 array * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On entry, sub( X ) is the n element right-hand side * b. On exit, sub( X ) is overwritten with the solution subvec- * tor. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one, * zero; int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb, Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld, XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info, ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran, nounit, npcol, nprow, size, upper; PBTYP_T * type; /* * .. Local Arrays .. */ int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_]; char * Aptr = NULL, * XAC = NULL, * XAR = NULL; /* .. * .. Executable Statements .. * */ upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); #ifndef NO_ARGCHK /* * Test the input parameters */ Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) { if( ( !upper ) && ( UploA != CLOWER ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRSV", "Illegal UPLO = %c\n", UploA ); info = -1; } else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRSV", "Illegal TRANS = %c\n", TranOp ); info = -2; } else if( ( !nounit ) && ( DiagA != CUNIT ) ) { PB_Cwarn( ctxt, __LINE__, "PZTRSV", "Illegal DIAG = %c\n", DiagA ); info = -3; } PB_Cchkmat( ctxt, "PZTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info ); PB_Cchkvec( ctxt, "PZTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info ); } if( info ) { PB_Cabort( ctxt, "PZTRSV", info ); return; } #endif /* * Quick return if possible */ if( *N == 0 ) return; /* * Retrieve process grid information */ #ifdef NO_ARGCHK Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); #endif /* * Get type structure */ type = PB_Cztypeset(); size = type->size; one = type->one; zero = type->zero; negone = type->negone; /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN ); if( notran ) { if( upper ) { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING ); (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process column owning the last * column of sub( A ) */ PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, &XACfr, &XACsum, &XACapbX ); /* * Create vector XAR in process rows spanned by sub( A ) */ PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, &XARsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAC contains the solution scattered in multiple * process columns and XAR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ if( Akp > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( XACsum ) { kbprev = MIN( k, nb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow, Arow, nprow ); Akp -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) zgemv_( TRANS, &ktmp, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol ); Czgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 0, XACld, size ), XACld, myrow, Asrc ); if( mycol != Asrc ) zset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } if( Akp > 0 && Anq0 > 0 ) zgemv_( TRANS, &Akp, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, XAC, &ione ); } else { if( Anq0 > 0 ) zgemv_( TRANS, &Akp, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, XAC, &ione ); } } } /* * Combine the scattered resulting vector XAC */ if( XACsum && ( Anp > 0 ) ) { Czgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, XACd[CSRC_] ); } /* * sub( X ) := XAC (if necessary) */ if( XACapbX ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); } else { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process column owning the first * column of sub( A ) */ PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, &XACfr, &XACsum, &XACapbX ); /* * Create vector XAR in process rows spanned by sub( A ) */ PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, &XARsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = 0; k < *N; k += nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAC contains the solution scattered in multiple * process columns and XAR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow ); if( ( Anp0 = Anp - Akp ) > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( XACsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow, nprow ); Anp0 -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) zgemv_( TRANS, &ktmp, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol ); Czgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 0, XACld, size ), XACld, myrow, Asrc ); if( mycol != Asrc ) zset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } if( Anp0 > 0 && Anq0 > 0 ) zgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione ); } else { if( Anq0 > 0 ) zgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, Mptr( XAC, Akp, 0, XACld, size ), &ione ); } } } /* * Combine the scattered resulting vector XAC */ if( XACsum && ( Anp > 0 ) ) { Czgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, XACd[CSRC_] ); } /* * sub( X ) := XAC (if necessary) */ if( XACapbX ) { PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); } } else { if( upper ) { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAR in process row owning the first row * of sub( A ) */ PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, &XARfr, &XARsum, &XARapbX ); /* * Create vector XAC in process columns spanned by sub( A ) */ PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, &XACsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = 0; k < *N; k += nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAR contains the solution scattered in multiple * process rows and XAC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later is * only locally updated. */ Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anq0 = Anq - Akq ) > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( XARsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol, npcol ); Anq0 -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) zgemv_( TRANS, &Anp0, &ktmp, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow ); Czgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, Akq, XARld, size ), XARld, Asrc, mycol ); if( myrow != Asrc ) zset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } if( Anp0 > 0 && Anq0 > 0 ) zgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld ); } else { if( Anp0 > 0 ) zgemv_( TRANS, &Anp0, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } } } /* * Combine the scattered resulting vector XAR */ if( XARsum && ( Anq > 0 ) ) { Czgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], mycol ); } /* * sub( X ) := XAR (if necessary) */ if( XARapbX ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); } else { /* * Save current and enforce ring BLACS topologies */ btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING ); /* * Remove next line when BLACS combine operations support ring topologies. */ (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); /* * Reuse sub( X ) and/or create vector XAC in process row owning the last row * of sub( A ) */ PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1, ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, &XARfr, &XARsum, &XARapbX ); /* * Create vector XAC in process columns spanned by sub( A ) */ PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, &XACsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); XACld = XACd[LLD_]; XARld = XARd[LLD_]; for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) { ktmp = *N - k; kb = MIN( ktmp, nb ); /* * Solve logical diagonal block, XAR contains the solution scattered in multiple * process rows and XAC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, Akq, XARld, size ), XARld ); /* * Update: only the part of sub( X ) to be solved at the next step is locally * updated and combined, the remaining part of the vector to be solved later * is only locally updated. */ if( Akq > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( XARsum ) { kbprev = MIN( k, nb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol, Acol, npcol ); Akq -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) zgemv_( TRANS, &Anp0, &ktmp, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow ); Czgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, Akq, XARld, size ), XARld, Asrc, mycol ); if( myrow != Asrc ) zset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, size ), &XARld ); } if( Anp0 > 0 && Akq > 0 ) zgemv_( TRANS, &Anp0, &Akq, negone, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, XAR, &XARld ); } else { if( Anp0 > 0 ) zgemv_( TRANS, &Anp0, &Akq, negone, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, Mptr( XAC, Akp, 0, XACld, size ), &ione, one, XAR, &XARld ); } } } /* * Combine the scattered resulting vector XAR */ if( XARsum && ( Anq > 0 ) ) { Czgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], mycol ); } /* * sub( X ) := XAR (if necessary) */ if( XARapbX ) { PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); } /* * Restore BLACS topologies */ (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); } } if( XACfr ) free( XAC ); if( XARfr ) free( XAR ); /* * End of PZTRSV */ } scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cagemv.f000644 000766 000024 00000020206 10363532303 017727 0ustar00juliestaff000000 000000 SUBROUTINE CAGEMV( TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER INCX, INCY, LDA, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL Y( * ) COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CAGEMV performs one of the matrix-vector operations * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * or * * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ), * * or * * y := abs( alpha )*abs( conjg( A' ) )*abs( x ) + abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an m by n matrix. * * Arguments * ========= * * TRANS (input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ) * * TRANS = 'T' or 't': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * TRANS = 'C' or 'c': * y := abs( alpha )*abs( conjg( A' ) )*abs( x ) + * abs( beta*y ) * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * ALPHA (input) REAL * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX array of dimension ( LDA, n ). * On entry, A is an array of dimension ( LDA, N ). The leading * m by n part of the array A must contain the matrix of coef- * ficients. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' and at * least ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. Before entry, * the incremented array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) REAL * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) REAL array of dimension at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' and at * least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. Before entry * with BETA non-zero, the incremented array Y must contain the * vector y. On exit, the incremented array Y is overwritten by * the updated vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY REAL ABSX, TALPHA, TEMP COMPLEX ZDUM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL * .. * .. Statement Functions .. REAL CABS1 CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = 1 ELSE IF( M.LT.0 ) THEN INFO = 2 ELSE IF( N.LT.0 ) THEN INFO = 3 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CAGEMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF( LSAME( TRANS, 'N' ) ) THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := abs( beta*y ). * IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 20, I = 1, LENY Y( I ) = ABS( Y( I ) ) 20 CONTINUE ELSE DO 30, I = 1, LENY Y( I ) = ABS( BETA * Y( I ) ) 30 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 40, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 40 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 50, I = 1, LENY Y( IY ) = ABS( Y( IY ) ) IY = IY + INCY 50 CONTINUE ELSE DO 60, I = 1, LENY Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 60 CONTINUE END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( TRANS, 'N' ) ) THEN * * Form y := abs( alpha ) * abs( A ) * abs( x ) + y. * JX = KX IF( INCY.EQ.1 ) THEN DO 80, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX DO 70, I = 1, M Y( I ) = Y( I ) + TEMP * CABS1( A( I, J ) ) 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE ELSE DO 100, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY DO 90, I = 1, M Y( IY ) = Y( IY ) + TEMP * CABS1( A( I, J ) ) IY = IY + INCY 90 CONTINUE END IF JX = JX + INCX 100 CONTINUE END IF * ELSE * * Form y := abs( alpha ) * abs( A' ) * abs( x ) + y. * JY = KY IF( INCX.EQ.1 ) THEN DO 120, J = 1, N TEMP = ZERO DO 110, I = 1, M TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( I ) ) 110 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 120 CONTINUE ELSE DO 140, J = 1, N TEMP = ZERO IX = KX DO 130, I = 1, M TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( IX ) ) IX = IX + INCX 130 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 140 CONTINUE END IF END IF * RETURN * * End of CAGEMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cahemv.f000644 000766 000024 00000021226 10363532303 017733 0ustar00juliestaff000000 000000 SUBROUTINE CAHEMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL Y( * ) COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CAHEMV performs the following matrix-vector operation * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n Hermitian matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower triangu- * lar part of the array A is to be referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A is * to be referenced. * UPLO = 'L' or 'l' Only the lower triangular part of A is * to be referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) REAL * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the Hermitian ma- * trix and the strictly lower triangular part of A is not refe- * renced. When UPLO = 'L' or 'l', the leading n by n part of * the array A must contain the lower triangular part of the * Hermitian matrix and the strictly upper trapezoidal part of A * is not referenced. * Note that the imaginary parts of the local entries corres- * ponding to the offdiagonal elements of A need not be set and * assumed to be zero. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) REAL * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY REAL TALPHA, TEMP0, TEMP1, TEMP2 COMPLEX ZDUM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, REAL * .. * .. Statement Functions .. REAL CABS1 CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CAHEMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := abs( beta * y ). * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 40 CONTINUE END IF END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN DO 60, J = 1, N TEMP1 = TALPHA * CABS1( X( J ) ) TEMP2 = ZERO DO 50, I = 1, J - 1 TEMP0 = CABS1( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( I ) ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1 * ABS( REAL( A( J, J ) ) ) + $ ALPHA * TEMP2 * 60 CONTINUE * ELSE * JX = KX JY = KY * DO 80, J = 1, N TEMP1 = TALPHA * CABS1( X( JX ) ) TEMP2 = ZERO IX = KX IY = KY * DO 70, I = 1, J - 1 TEMP0 = CABS1( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( IX ) ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1 * ABS( REAL( A( J, J ) ) ) + $ ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 80 CONTINUE * END IF * ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN * DO 100, J = 1, N * TEMP1 = TALPHA * CABS1( X( J ) ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1 * ABS( REAL( A( J, J ) ) ) * DO 90, I = J + 1, N TEMP0 = CABS1( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( I ) ) * 90 CONTINUE * Y( J ) = Y( J ) + ALPHA * TEMP2 * 100 CONTINUE * ELSE * JX = KX JY = KY * DO 120, J = 1, N TEMP1 = TALPHA * CABS1( X( JX ) ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1 * ABS( REAL( A( J, J ) ) ) IX = JX IY = JY * DO 110, I = J + 1, N * IX = IX + INCX IY = IY + INCY TEMP0 = CABS1( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( IX ) ) * 110 CONTINUE * Y( JY ) = Y( JY ) + ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 120 CONTINUE * END IF * END IF * RETURN * * End of CAHEMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/casymv.f000644 000766 000024 00000020706 10363532303 017774 0ustar00juliestaff000000 000000 SUBROUTINE CASYMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL Y( * ) COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CASYMV performs the following matrix-vector operation * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n symmetric matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower triangu- * lar part of the array A is to be referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A is * to be referenced. * UPLO = 'L' or 'l' Only the lower triangular part of A is * to be referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) REAL * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. When UPLO = 'L' or 'l', the leading n by n part of * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) REAL * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY REAL TALPHA, TEMP0, TEMP1, TEMP2 COMPLEX ZDUM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL * .. * .. Statement Functions .. REAL CABS1 CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CASYMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := abs( beta * y ). * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 40 CONTINUE END IF END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN DO 60, J = 1, N TEMP1 = TALPHA * CABS1( X( J ) ) TEMP2 = ZERO DO 50, I = 1, J - 1 TEMP0 = CABS1( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( I ) ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1 * CABS1( A( J, J ) ) + $ ALPHA * TEMP2 * 60 CONTINUE * ELSE * JX = KX JY = KY * DO 80, J = 1, N TEMP1 = TALPHA * CABS1( X( JX ) ) TEMP2 = ZERO IX = KX IY = KY * DO 70, I = 1, J - 1 TEMP0 = CABS1( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( IX ) ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1 * CABS1( A( J, J ) ) + $ ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 80 CONTINUE * END IF * ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN * DO 100, J = 1, N * TEMP1 = TALPHA * CABS1( X( J ) ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1 * CABS1( A( J, J ) ) * DO 90, I = J + 1, N TEMP0 = CABS1( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( I ) ) * 90 CONTINUE * Y( J ) = Y( J ) + ALPHA * TEMP2 * 100 CONTINUE * ELSE * JX = KX JY = KY * DO 120, J = 1, N TEMP1 = TALPHA * CABS1( X( JX ) ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1 * CABS1( A( J, J ) ) IX = JX IY = JY * DO 110, I = J + 1, N * IX = IX + INCX IY = IY + INCY TEMP0 = CABS1( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( IX ) ) * 110 CONTINUE * Y( JY ) = Y( JY ) + ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 120 CONTINUE * END IF * END IF * RETURN * * End of CASYMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/catrmv.f000644 000766 000024 00000031040 10363532303 017757 0ustar00juliestaff000000 000000 SUBROUTINE CATRMV( UPLO, TRANS, DIAG, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER INCX, INCY, LDA, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL Y( * ) COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CATRMV performs one of the matrix-vector operations * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * or * * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ), * * or * * y := abs( alpha )*abs( conjg( A' ) )*abs( x ) + abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n unit or non-unit, upper or lower triangular * matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * TRANS (input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ) * * TRANS = 'T' or 't': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * TRANS = 'C' or 'c': * y := abs( alpha )*abs( conjg( A' ) )*abs( x ) + * abs( beta*y ) * * DIAG (input) CHARACTER*1 * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit triangular. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) REAL * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the matrix A and * the strictly lower triangular part of A is not referenced. * When UPLO = 'L' or 'l', the leading n by n part of the array * A must contain the lower triangular part of the matrix A and * the strictly upper trapezoidal part of A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of A * are not referenced either, but are assumed to be unity. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) REAL * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY LOGICAL NOUNIT REAL ABSX, TALPHA, TEMP COMPLEX ZDUM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL * .. * .. Statement Functions .. REAL CABS1 CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 7 ELSE IF( INCX.EQ.0 )THEN INFO = 9 ELSE IF( INCY.EQ.0 ) THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CATRMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := abs( beta*y ). * IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 20, I = 1, N Y( I ) = ABS( Y( I ) ) 20 CONTINUE ELSE DO 30, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 30 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 40, I = 1, N Y( IY ) = ZERO IY = IY + INCY 40 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 50, I = 1, N Y( IY ) = ABS( Y( IY ) ) IY = IY + INCY 50 CONTINUE ELSE DO 60, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 60 CONTINUE END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( TRANS, 'N' ) )THEN * * Form y := abs( alpha ) * abs( A ) * abs( x ) + y. * IF( LSAME( UPLO, 'U' ) )THEN JX = KX IF( INCY.EQ.1 ) THEN DO 80, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX DO 70, I = 1, J - 1 Y( I ) = Y( I ) + TEMP * CABS1( A( I, J ) ) 70 CONTINUE * IF( NOUNIT ) THEN Y( J ) = Y( J ) + TEMP * CABS1( A( J, J ) ) ELSE Y( J ) = Y( J ) + TEMP END IF END IF JX = JX + INCX 80 CONTINUE * ELSE * DO 100, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY DO 90, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP * CABS1( A( I, J ) ) IY = IY + INCY 90 CONTINUE * IF( NOUNIT ) THEN Y( IY ) = Y( IY ) + TEMP * CABS1( A( J, J ) ) ELSE Y( IY ) = Y( IY ) + TEMP END IF END IF JX = JX + INCX 100 CONTINUE * END IF * ELSE * JX = KX IF( INCY.EQ.1 ) THEN DO 120, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN * TEMP = TALPHA * ABSX * IF( NOUNIT ) THEN Y( J ) = Y( J ) + TEMP * CABS1( A( J, J ) ) ELSE Y( J ) = Y( J ) + TEMP END IF * DO 110, I = J + 1, N Y( I ) = Y( I ) + TEMP * CABS1( A( I, J ) ) 110 CONTINUE END IF JX = JX + INCX 120 CONTINUE * ELSE * DO 140, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY + ( J - 1 ) * INCY * IF( NOUNIT ) THEN Y( IY ) = Y( IY ) + TEMP * CABS1( A( J, J ) ) ELSE Y( IY ) = Y( IY ) + TEMP END IF * DO 130, I = J + 1, N IY = IY + INCY Y( IY ) = Y( IY ) + TEMP * CABS1( A( I, J ) ) 130 CONTINUE END IF JX = JX + INCX 140 CONTINUE * END IF * END IF * ELSE * * Form y := abs( alpha ) * abs( A' ) * abs( x ) + y. * IF( LSAME( UPLO, 'U' ) )THEN JY = KY IF( INCX.EQ.1 ) THEN DO 160, J = 1, N * TEMP = ZERO * DO 150, I = 1, J - 1 TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( I ) ) 150 CONTINUE * IF( NOUNIT ) THEN TEMP = TEMP + CABS1( A( J, J ) ) * CABS1( X( J ) ) ELSE TEMP = TEMP + CABS1( X( J ) ) END IF * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 160 CONTINUE * ELSE * DO 180, J = 1, N TEMP = ZERO IX = KX DO 170, I = 1, J - 1 TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( IX ) ) IX = IX + INCX 170 CONTINUE * IF( NOUNIT ) THEN TEMP = TEMP + CABS1( A( J, J ) ) * CABS1( X( IX ) ) ELSE TEMP = TEMP + CABS1( X( IX ) ) END IF * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 180 CONTINUE * END IF * ELSE * JY = KY IF( INCX.EQ.1 ) THEN * DO 200, J = 1, N * IF( NOUNIT ) THEN TEMP = CABS1( A( J, J ) ) * CABS1( X( J ) ) ELSE TEMP = CABS1( X( J ) ) END IF * DO 190, I = J + 1, N TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( I ) ) 190 CONTINUE * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 200 CONTINUE * ELSE * DO 220, J = 1, N * IX = KX + ( J - 1 ) * INCX * IF( NOUNIT ) THEN TEMP = CABS1( A( J, J ) ) * CABS1( X( IX ) ) ELSE TEMP = CABS1( X( IX ) ) END IF * DO 210, I = J + 1, N IX = IX + INCX TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( IX ) ) 210 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 220 CONTINUE END IF END IF * END IF * RETURN * * End of CATRMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/ccshft.f000644 000766 000024 00000005531 10363532303 017743 0ustar00juliestaff000000 000000 SUBROUTINE CCSHFT( M, N, OFFSET, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, M, N, OFFSET * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CCSHFT shifts columns of an m by n array A by OFFSET. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A. M must be at * least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A to be * shifted. N must be at least zero. * * OFFSET (local input) INTEGER * On entry, OFFSET specifies the offset by which the columns of * A should be shifted. OFFSET can be positive or negative (see * below for further details). When OFFSET is positive, the co- * lumns are shifted to the right. When OFFSET is negative, the * columns of A are shifted to the left. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension ( LDA, N+ABS(OFFSET) ). * On exit, A contains the shifted array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Further Details * =============== * * N=3 OFFSET=6 -OFFSET=6 N=3 * ------------------- ------------------- * | 1 2 3 4 5 6 7 8 9 | M | 1 2 3 4 5 6 7 8 9 | * ------------------- ------------------- * V V * ------------------- ------------------- * | 1 2 3 4 5 6 1 2 3 | M | 7 8 9 4 5 6 7 8 9 | * ------------------- ------------------- * OFFSET >= 0 OFFSET <= 0 * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( ( OFFSET.EQ.0 ).OR.( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * IF( OFFSET.GT.0 ) THEN DO 20 J = N, 1, -1 DO 10 I = 1, M A( I, J+OFFSET ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = A( I, J-OFFSET ) 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of CCSHFT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/chescal.f000644 000766 000024 00000016653 10363532303 020102 0ustar00juliestaff000000 000000 SUBROUTINE CHESCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N REAL ALPHA * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CHESCAL scales a two-dimensional array A by the real scalar alpha. * The diagonal entries specified by IOFFD of A are supposed to be real. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) REAL * On entry, ALPHA specifies the scalar alpha, i.e., the value * by which the diagonal and offdiagonal entries of the array A * as specified by UPLO and IOFFD are scaled. * * A (input/output) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the Hermitian * matrix to be scaled as specified by IOFFD, and the strictly * lower trapezoidal part of A is not referenced. When UPLO is * 'L' or 'l', the leading m by n part of the array A must con- * tain the lower trapezoidal part of the Hermitian matrix to be * scaled as specified by IOFFD, and the strictly upper trape- * zoidal part of A is not referenced. On exit, the entries of * the trapezoid part of A determined by UPLO and IOFFD are sca- * led. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL RONE, RZERO PARAMETER ( RONE = 1.0E+0, RZERO = 0.0E+0 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER J, JTMP, MN * .. * .. External Subroutines .. EXTERNAL CSSCAL, CTZPAD * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, REAL * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( ALPHA.EQ.RONE ) THEN * * Zeros the imaginary part of the diagonals * IF( LSAME( UPLO, 'L' ).OR.LSAME( UPLO, 'U' ).OR. $ LSAME( UPLO, 'D' ) ) THEN DO 10 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = CMPLX( REAL( A( JTMP, J ) ), RZERO ) 10 CONTINUE END IF RETURN ELSE IF( ALPHA.EQ.RZERO ) THEN CALL CTZPAD( UPLO, 'N', M, N, IOFFD, ZERO, ZERO, A, LDA ) RETURN END IF * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) CALL CSSCAL( M, ALPHA, A( 1, J ), 1 ) 20 CONTINUE DO 30 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = CMPLX( ALPHA * REAL( A( JTMP, J ) ), RZERO ) IF( M.GT.JTMP ) $ CALL CSSCAL( M-JTMP, ALPHA, A( JTMP + 1, J ), 1 ) 30 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 40 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD CALL CSSCAL( JTMP - 1, ALPHA, A( 1, J ), 1 ) A( JTMP, J ) = CMPLX( ALPHA * REAL( A( JTMP, J ) ), RZERO ) 40 CONTINUE DO 50 J = MAX( 0, MN ) + 1, N CALL CSSCAL( M, ALPHA, A( 1, J ), 1 ) 50 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 60 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = CMPLX( ALPHA * REAL( A( JTMP, J ) ), RZERO ) 60 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 70 J = 1, N CALL CSSCAL( M, ALPHA, A( 1, J ), 1 ) 70 CONTINUE * END IF * RETURN * * End of CHESCAL * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/CMakeLists.txt000644 000766 000024 00000003331 11656312637 021072 0ustar00juliestaff000000 000000 set (APBTZ pxerbla.f) set (IPBTZ immadd.f immdda.f immtadd.f immddat.f) set (SPBTZ svasum.f sset.f scshft.f srshft.f svvdot.f smmadd.f smmcadd.f smmtadd.f smmtcadd.f smmdda.f smmddac.f smmddat.f smmddact.f stzpad.f stzpadcpy.f stzscal.f sagemv.f sasymv.f satrmv.f) set (SCPBTZ sasqrtb.f sascal.f) set (CPBTZ scvasum.f cset.f ccshft.f crshft.f cvvdotu.f cvvdotc.f cmmadd.f cmmcadd.f cmmtadd.f cmmtcadd.f cmmdda.f cmmddac.f cmmddat.f cmmddact.f ctzpad.f ctzpadcpy.f chescal.f ctzscal.f ctzcnjg.f cagemv.f cahemv.f catrmv.f casymv.f csymv.f csyr.f csyr2.f) set (DPBTZ dvasum.f dset.f dcshft.f drshft.f dvvdot.f dmmadd.f dmmcadd.f dmmtadd.f dmmtcadd.f dmmdda.f dmmddac.f dmmddat.f dmmddact.f dtzpad.f dtzpadcpy.f dtzscal.f dagemv.f dasymv.f datrmv.f) set (DZPBTZ dasqrtb.f dascal.f) set (ZPBTZ dzvasum.f zset.f zcshft.f zrshft.f zvvdotu.f zvvdotc.f zmmadd.f zmmcadd.f zmmtadd.f zmmtcadd.f zmmdda.f zmmddac.f zmmddat.f zmmddact.f ztzpad.f ztzpadcpy.f zhescal.f ztzscal.f ztzcnjg.f zagemv.f zahemv.f zatrmv.f zasymv.f zsymv.f zsyr.f zsyr2.f) set(ptzblas ${APBTZ} ${IPBTZ} ${SPBTZ} ${SCPBTZ} ${CPBTZ} ${DPBTZ} ${DZPBTZ} ${ZPBTZ} ) scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cmmadd.f000644 000766 000024 00000010517 10363532303 017716 0ustar00juliestaff000000 000000 SUBROUTINE CMMADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CMMADD performs the following operation: * * B := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) COMPLEX array * On entry, B is an array of dimension ( LDB, N ). On exit, the * leading m by n part of A has been added to the leading m by n * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL CAXPY, CCOPY, CSCAL * .. * .. Executable Statements .. * IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL CCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) * DO 10 I = 1, M * B( I, J ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL CAXPY( M, ONE, A( 1, J ), 1, B( 1, J ), 1 ) * DO 50 I = 1, M * B( I, J ) = A( I, J ) + B( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL CAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) * DO 110 I = 1, M * B( I, J ) = ALPHA * A( I, J ) + B( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, N CALL CSCAL( M, BETA, B( 1, J ), 1 ) * DO 150 I = 1, M * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of CMMADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cmmcadd.f000644 000766 000024 00000010473 10363532303 020062 0ustar00juliestaff000000 000000 SUBROUTINE CMMCADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CMMCADD performs the following operation: * * B := alpha * conjg( A ) + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) COMPLEX array * On entry, B is an array of dimension ( LDB, N ). On exit, the * leading m by n part of A has been conjugated and added to the * leading m by n part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL CSCAL * .. * .. Intrinsic Functions .. INTRINSIC CONJG * .. * .. Executable Statements .. * IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M B( I, J ) = CONJG( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( I, J ) = CONJG( A( I, J ) ) + BETA * B( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( I, J ) = CONJG( A( I, J ) ) + B( I, J ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( I, J ) = ALPHA * CONJG( A( I, J ) ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( I, J ) = ALPHA * CONJG( A( I, J ) ) + $ BETA * B( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M B( I, J ) = ALPHA * CONJG( A( I, J ) ) + B( I, J ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, N CALL CSCAL( M, BETA, B( 1, J ), 1 ) * DO 150 I = 1, M * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of CMMCADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cmmdda.f000644 000766 000024 00000010523 10363532303 017713 0ustar00juliestaff000000 000000 SUBROUTINE CMMDDA( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CMMDDA performs the following operation: * * A := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading m by n part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) COMPLEX array * On entry, B is an array of dimension ( LDB, N ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL CAXPY, CCOPY, CSCAL * .. * .. Executable Statements .. * IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL CCOPY( M, B( 1, J ), 1, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( I, J ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL CAXPY( M, ONE, B( 1, J ), 1, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( I, J ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( I, J ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL CAXPY( M, BETA, B( 1, J ), 1, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( I, J ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL CSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of CMMDDA * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cmmddac.f000644 000766 000024 00000010500 10363532303 020051 0ustar00juliestaff000000 000000 SUBROUTINE CMMDDAC( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CMMDDAC performs the following operation: * * A := alpha * A + beta * conjg( B ), * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading m by n part of B has been conjugated and added to the * leading m by n part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) COMPLEX array * On entry, B is an array of dimension ( LDB, N ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL CSCAL * .. * .. Intrinsic Functions .. INTRINSIC CONJG * .. * .. Executable Statements .. * IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = CONJG( B( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = CONJG( B( I, J ) ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M A( I, J ) = CONJG( B( I, J ) ) + A( I, J ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * CONJG( B( I, J ) ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * CONJG( B( I, J ) ) + $ ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M A( I, J ) = BETA * CONJG( B( I, J ) ) + A( I, J ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL CSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of CMMDDAC * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cmmddact.f000644 000766 000024 00000014744 10363532303 020253 0ustar00juliestaff000000 000000 SUBROUTINE CMMDDACT( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CMMDDACT performs the following operation: * * A := alpha * A + beta * conjg( B' ), * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading n by m part of B has been conjugated and added to the * leading m by n part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) COMPLEX array * On entry, B is an array of dimension ( LDB, M ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL CSCAL * .. * .. Intrinsic Functions .. INTRINSIC CONJG * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = CONJG( B( J, I ) ) 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = CONJG( B( J, I ) ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M A( I, J ) = CONJG( B( J, I ) ) + A( I, J ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * CONJG( B( J, I ) ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * CONJG( B( J, I ) ) + $ ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M A( I, J ) = BETA * CONJG( B( J, I ) ) + A( I, J ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL CSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 180 J = 1, M DO 170 I = 1, N A( J, I ) = CONJG( B( I, J ) ) 170 CONTINUE 180 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N A( J, I ) = CONJG( B( I, J ) ) + ALPHA * A( J, I ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M DO 210 I = 1, N A( J, I ) = CONJG( B( I, J ) ) + A( J, I ) 210 CONTINUE 220 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N A( J, I ) = BETA * CONJG( B( I, J ) ) 230 CONTINUE 240 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N A( J, I ) = BETA * CONJG( B( I, J ) ) + $ ALPHA * A( J, I ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M DO 270 I = 1, N A( J, I ) = BETA * CONJG( B( I, J ) ) + A( J, I ) 270 CONTINUE 280 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 300 J = 1, N DO 290 I = 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 320 J = 1, N CALL CSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 310 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of CMMDDACT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cmmddat.f000644 000766 000024 00000015166 10363532303 020107 0ustar00juliestaff000000 000000 SUBROUTINE CMMDDAT( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CMMDDAT performs the following operation: * * A := alpha * A + beta * B', * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading n by m part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) COMPLEX array * On entry, B is an array of dimension ( LDB, M ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL CAXPY, CCOPY, CSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL CCOPY( M, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( J, I ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( J, I ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL CAXPY( M, ONE, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( J, I ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( J, I ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( J, I ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL CAXPY( M, BETA, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( J, I ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL CSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 180 J = 1, M CALL CCOPY( N, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 170 I = 1, N * A( J, I ) = B( I, J ) * 170 CONTINUE 180 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N A( J, I ) = B( I, J ) + ALPHA * A( J, I ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL CAXPY( N, ONE, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 210 I = 1, N * A( J, I ) = B( I, J ) + A( J, I ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N A( J, I ) = BETA * B( I, J ) 230 CONTINUE 240 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N A( J, I ) = BETA * B( I, J ) + ALPHA * A( J, I ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL CAXPY( N, BETA, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 270 I = 1, N * A( J, I ) = BETA * B( I, J ) + A( J, I ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 300 J = 1, N DO 290 I = 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 320 J = 1, N CALL CSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 310 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of CMMDDAT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cmmtadd.f000644 000766 000024 00000015156 10363532303 020106 0ustar00juliestaff000000 000000 SUBROUTINE CMMTADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CMMTADD performs the following operation: * * B := alpha * A' + beta * B, * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) COMPLEX array * On entry, B is an array of dimension ( LDB, M ). On exit, the * leading m by n part of A has been added to the leading n by m * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL CAXPY, CCOPY, CSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL CCOPY( M, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 10 I = 1, M * B( J, I ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( J, I ) = A( I, J ) + BETA * B( J, I ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL CAXPY( M, ONE, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 50 I = 1, M * B( J, I ) = A( I, J ) + B( J, I ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( J, I ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( J, I ) = ALPHA * A( I, J ) + BETA * B( J, I ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL CAXPY( M, ALPHA, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 110 I = 1, M * B( J, I ) = ALPHA * A( I, J ) + B( J, I ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, M DO 130 I = 1, N B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, M CALL CSCAL( N, BETA, B( 1, J ), 1 ) * DO 150 I = 1, N * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, M CALL CCOPY( N, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 170 I = 1, N * B( I, J ) = A( J, I ) * 170 CONTINUE 180 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N B( I, J ) = A( J, I ) + BETA * B( I, J ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL CAXPY( N, ONE, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 210 I = 1, N * B( I, J ) = A( J, I ) + B( I, J ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N B( I, J ) = ALPHA * A( J, I ) 230 CONTINUE 240 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL CAXPY( N, ALPHA, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 270 I = 1, N * B( I, J ) = ALPHA * A( J, I ) + B( I, J ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 300 J = 1, M DO 290 I = 1, N B( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 320 J = 1, M CALL CSCAL( N, BETA, B( 1, J ), 1 ) * DO 310 I = 1, N * B( I, J ) = BETA * B( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of CMMTADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cmmtcadd.f000644 000766 000024 00000014731 10363532303 020247 0ustar00juliestaff000000 000000 SUBROUTINE CMMTCADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CMMTCADD performs the following operation: * * B := alpha * conjg( A' ) + beta * B, * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) COMPLEX * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) COMPLEX array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) COMPLEX array * On entry, B is an array of dimension ( LDB, M ). On exit, the * leading m by n part of A has been conjugated and added to the * leading n by m part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL CSCAL * .. * .. Intrinsic Functions .. INTRINSIC CONJG * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M B( J, I ) = CONJG( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( J, I ) = CONJG( A( I, J ) ) + BETA * B( J, I ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( J, I ) = CONJG( A( I, J ) ) + B( J, I ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( J, I ) = ALPHA * CONJG( A( I, J ) ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( J, I ) = ALPHA * CONJG( A( I, J ) ) + $ BETA * B( J, I ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M B( J, I ) = ALPHA * CONJG( A( I, J ) ) + B( J, I ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, M DO 130 I = 1, N B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, M CALL CSCAL( N, BETA, B( 1, J ), 1 ) * DO 150 I = 1, N * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, M DO 170 I = 1, N B( I, J ) = CONJG( A( J, I ) ) 170 CONTINUE 180 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N B( I, J ) = CONJG( A( J, I ) ) + BETA * B( I, J ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M DO 210 I = 1, N B( I, J ) = CONJG( A( J, I ) ) + B( I, J ) 210 CONTINUE 220 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N B( I, J ) = ALPHA * CONJG( A( J, I ) ) 230 CONTINUE 240 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N B( I, J ) = ALPHA * CONJG( A( J, I ) ) + $ BETA * B( I, J ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M DO 270 I = 1, N B( I, J ) = ALPHA * CONJG( A( J, I ) ) + B( I, J ) 270 CONTINUE 280 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 300 J = 1, M DO 290 I = 1, N B( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 320 J = 1, M CALL CSCAL( N, BETA, B( 1, J ), 1 ) * DO 310 I = 1, N * B( I, J ) = BETA * B( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of CMMTCADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/crshft.f000644 000766 000024 00000006170 10363532303 017762 0ustar00juliestaff000000 000000 SUBROUTINE CRSHFT( M, N, OFFSET, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, M, N, OFFSET * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CRSHFT shifts rows of an m by n array A by OFFSET. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A to be shifted. * M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A. N must be * at least zero. * * OFFSET (local input) INTEGER * On entry, OFFSET specifies the offset by which the rows of * A should be shifted. OFFSET can be positive or negative (see * below for further details). When OFFSET is positive, the rows * are shifted to the bottom. When OFFSET is negative, the rows * of A are shifted to the top. * * A (local input/local output) COMPLEX array * On entry, A is an array of dimension ( LDA, N ). On exit, A * contains the shifted array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M+ABS(OFFSET) ). * * Further Details * =============== * * N N N N * --- --- --- --- * | 1 | | 1 | | 1 | | 7 | * | 2 | M = 3 | 2 | | 2 | M = 3 | 8 | * | 3 | | 3 | | 3 | | 9 | * | 4 | | 4 | | 4 | | 4 | * | 5 | becomes | 5 | | 5 | becomes | 5 | * | 6 | | 6 | | 6 | | 6 | * | 7 | | 1 | | 7 | | 7 | * | 8 | OFFSET = 6 | 2 | | 8 | OFFSET = -6 | 8 | * | 9 | | 3 | | 9 | | 9 | * --- --- --- --- * OFFSET >= 0 OFFSET <= 0 * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( ( OFFSET.EQ.0 ).OR.( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * IF( OFFSET.GT.0 ) THEN DO 20 J = 1, N DO 10 I = M, 1, -1 A( I+OFFSET, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = A( I-OFFSET, J ) 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of CRSHFT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cset.f000644 000766 000024 00000005173 10363532303 017431 0ustar00juliestaff000000 000000 SUBROUTINE CSET( N, ALPHA, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N COMPLEX ALPHA * .. * .. Array Arguments .. COMPLEX X( * ) * .. * * Purpose * ======= * * CSET sets the entries of an n vector x to the scalar alpha. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * X (input/output) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. On exit, entries of the * incremented array X are set to alpha. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, INFO, IX, M, MP1 * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( N.LT.0 ) THEN INFO = 1 ELSE IF( INCX.EQ.0 ) THEN INFO = 4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CSET', INFO ) RETURN END IF * * Quick return if possible. * IF( N.LE.0 ) $ RETURN * * Form x := alpha * IF( INCX.EQ.1 ) $ GO TO 20 * * code for increments not equal to 1 * * Set up the start point in X. * IF( INCX.GT.0 ) THEN IX = 1 ELSE IX = 1 - ( N - 1 ) * INCX END IF * DO 10 I = 1, N X( IX ) = ALPHA IX = IX + INCX 10 CONTINUE * RETURN * * code for increment equal to 1 * * clean-up loop * 20 M = MOD( N, 4 ) * IF( M.EQ.0 ) $ GO TO 40 * DO 30 I = 1, M X( I ) = ALPHA 30 CONTINUE IF( N.LT.4 ) $ RETURN * 40 MP1 = M + 1 DO 50 I = MP1, N, 4 X( I ) = ALPHA X( I + 1 ) = ALPHA X( I + 2 ) = ALPHA X( I + 3 ) = ALPHA 50 CONTINUE * RETURN * * End of CSET * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/csymv.f000644 000766 000024 00000017251 10363532303 017634 0ustar00juliestaff000000 000000 SUBROUTINE CSYMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CSYMV performs the following matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors, and * A is an n by n symmetric matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower triangu- * lar part of the array A is to be referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A is * to be referenced. * UPLO = 'L' or 'l' Only the lower triangular part of A is * to be referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. When UPLO = 'L' or 'l', the leading n by n part of * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) COMPLEX * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY COMPLEX TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CSYMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * IF( LSAME( UPLO, 'U' ) )THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( J, J ) DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( J, J ) IX = JX IY = JY DO 110, I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of CSYMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/csyr.f000644 000766 000024 00000013035 11654631032 017452 0ustar00juliestaff000000 000000 SUBROUTINE CSYR( UPLO, N, ALPHA, X, INCX, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, LDA, N COMPLEX ALPHA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CSYR performs the symmetric rank 1 operation * * A := alpha*x*x' + A, * * where alpha is a complex scalar, x is an n element vector and A is an * n by n SY matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * A (input/output) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. On exit, the upper triangular part of the array A is * overwritten by the upper triangular part of the updated ma- * trix. When UPLO = 'L' or 'l', the leading n by n part of the * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. On exit, the lower triangular part of the * array A is overwritten by the lower triangular part of the * updated matrix. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * ===================================================================== * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, J, JX, KX COMPLEX TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( INCX.EQ.0 ) THEN INFO = 5 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = 7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CSYR', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * KX = 1 IF( INCX.LE.0 ) THEN KX = 1 - ( N-1 )*INCX ELSE IF( INCX.NE.1 ) THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) ) THEN * * Form A when A is stored in upper triangle. * IF( INCX.EQ.1 ) THEN DO 20 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) DO 10 I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF 20 CONTINUE ELSE JX = KX DO 40 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = KX DO 30 I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JX = JX + INCX 40 CONTINUE END IF ELSE * * Form A when A is stored in lower triangle. * IF( INCX.EQ.1 ) THEN DO 60 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) DO 50 I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = JX DO 70 I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF * RETURN * * End of CSYR * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/csyr2.f000644 000766 000024 00000015754 11654631032 017546 0ustar00juliestaff000000 000000 SUBROUTINE CSYR2( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N COMPLEX ALPHA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CSYR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a complex scalar, x and y are n element vectors and A * is an n by n SY matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented * array Y must contain the vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * A (input/output) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. On exit, the upper triangular part of the array A is * overwritten by the upper triangular part of the updated ma- * trix. When UPLO = 'L' or 'l', the leading n by n part of the * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. On exit, the lower triangular part of the * array A is overwritten by the lower triangular part of the * updated matrix. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY COMPLEX TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CSYR2', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * KX = 1 KY = 1 JX = 1 JY = 1 IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 10, I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 10 CONTINUE END IF 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY 40 CONTINUE END IF ELSE * * Form A when A is stored in the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 50, I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 50 CONTINUE END IF 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF END IF * RETURN * * End of CSYR2 * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/ctzcnjg.f000644 000766 000024 00000020776 10363532303 020143 0ustar00juliestaff000000 000000 SUBROUTINE CTZCNJG( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX ALPHA * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CTZCNJG conjugates a two-dimensional array A and then scales it by * the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be conjugated and scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha, i.e., the value * by which the diagonal and offdiagonal entries of the array A * as specified by UPLO and IOFFD are scaled. * * A (input/output) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are conjugated and scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Subroutines .. EXTERNAL CTZPAD * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * IF( ALPHA.EQ.ZERO ) THEN * CALL CTZPAD( UPLO, 'N', M, N, IOFFD, ZERO, ZERO, A, LDA ) * ELSE IF( ALPHA.EQ.ONE ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = CONJG( A( I, J ) ) 10 CONTINUE 20 CONTINUE * DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = CONJG( A( I, J ) ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = CONJG( A( I, J ) ) 50 CONTINUE 60 CONTINUE * DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = CONJG( A( I, J ) ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = CONJG( A( JTMP, J ) ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = CONJG( A( I, J ) ) 100 CONTINUE 110 CONTINUE * END IF * ELSE * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 130 J = 1, MIN( MN, N ) DO 120 I = 1, M A( I, J ) = ALPHA * CONJG( A( I, J ) ) 120 CONTINUE 130 CONTINUE * DO 150 J = MN + 1, MIN( M - IOFFD, N ) DO 140 I = J + IOFFD, M A( I, J ) = ALPHA * CONJG( A( I, J ) ) 140 CONTINUE 150 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 170 J = MAX( 0, -IOFFD ) + 1, MN DO 160 I = 1, J + IOFFD A( I, J ) = ALPHA * CONJG( A( I, J ) ) 160 CONTINUE 170 CONTINUE * DO 190 J = MAX( 0, MN ) + 1, N DO 180 I = 1, M A( I, J ) = ALPHA * CONJG( A( I, J ) ) 180 CONTINUE 190 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 200 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * CONJG( A( JTMP, J ) ) 200 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 220 J = 1, N DO 210 I = 1, M A( I, J ) = ALPHA * CONJG( A( I, J ) ) 210 CONTINUE 220 CONTINUE * END IF * END IF * RETURN * * End of CTZCNJG * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/ctzpad.f000644 000766 000024 00000022054 10363532303 017755 0ustar00juliestaff000000 000000 SUBROUTINE CTZPAD( UPLO, HERM, M, N, IOFFD, ALPHA, BETA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 HERM, UPLO INTEGER IOFFD, LDA, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CTZPAD initializes a two-dimensional array A to beta on the diagonal * specified by IOFFD or zeros the imaginary part of those diagonals and * set the offdiagonals to alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be set as follows: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of A is not changed, * = 'D' or 'd': diagonal specified by IOFFD is set; the * rest of the array A is unchanged, * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of A is not changed, * Otherwise: All of the array A is set. * * HERM (input) CHARACTER*1 * On entry, HERM specifies what should be done to the diagonals * as follows. When UPLO is 'L', 'l', 'D', 'd', 'U' or 'u' and * HERM is 'Z' or 'z', the imaginary part of the diagonals is * set to zero. Otherwise, the diagonals are set to beta. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha, i.e., the value * to which the offdiagonal entries of the array A determined by * UPLO and IOFFD are set. * * BETA (input) COMPLEX * On entry, BETA specifies the scalar beta, i.e., the value to * which the diagonal entries specified by IOFFD of the array A * are set. BETA is not referenced when UPLO is 'L', 'l', 'U' or * 'u' and HERM is 'Z'. * * A (input/output) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U', the leading m by n part of the array A must * contain the upper trapezoidal part of the matrix to be set as * specified by IOFFD, and the strictly lower trapezoidal part * of A is not referenced; When UPLO = 'L', the leading m by n * part of the array A must contain the lower trapezoidal part * of the matrix to be set as specified by IOFFD, and the * strictly upper trapezoidal part of A is not referenced. On * exit, the entries of the trapezoid part of A determined by * UPLO, HERM and IOFFD are set. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL RZERO PARAMETER ( RZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, REAL * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals and set the strictly lower triangular part of the * array to ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA 10 CONTINUE 20 CONTINUE * IF( LSAME( HERM, 'Z' ) ) THEN DO 40 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = CMPLX( REAL( A( JTMP, J ) ), RZERO ) DO 30 I = JTMP + 1, M A( I, J ) = ALPHA 30 CONTINUE 40 CONTINUE ELSE DO 60 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = BETA DO 50 I = JTMP + 1, M A( I, J ) = ALPHA 50 CONTINUE 60 CONTINUE END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals and set the strictly upper triangular part of the * array to ALPHA. * MN = MIN( M - IOFFD, N ) IF( LSAME( HERM, 'Z' ) ) THEN DO 80 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 70 I = 1, JTMP - 1 A( I, J ) = ALPHA 70 CONTINUE A( JTMP, J ) = CMPLX( REAL( A( JTMP, J ) ), RZERO ) 80 CONTINUE ELSE DO 100 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 90 I = 1, JTMP - 1 A( I, J ) = ALPHA 90 CONTINUE A( JTMP, J ) = BETA 100 CONTINUE END IF DO 120 J = MAX( 0, MN ) + 1, N DO 110 I = 1, M A( I, J ) = ALPHA 110 CONTINUE 120 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals. * IF( LSAME( HERM, 'Z' ) ) THEN IF( ( IOFFD.LT.M ).AND.( IOFFD.GT.-N ) ) THEN DO 130 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = CMPLX( REAL( A( JTMP, J ) ), RZERO ) 130 CONTINUE END IF ELSE IF( ( IOFFD.LT.M ).AND.( IOFFD.GT.-N ) ) THEN DO 140 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 140 CONTINUE END IF END IF * ELSE * * Set the diagonals to BETA and the offdiagonals to ALPHA. * DO 160 J = 1, N DO 150 I = 1, M A( I, J ) = ALPHA 150 CONTINUE 160 CONTINUE IF( ALPHA.NE.BETA .AND. IOFFD.LT.M .AND. IOFFD.GT.-N ) THEN DO 170 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 170 CONTINUE END IF * END IF * RETURN * * End of CTZPAD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/ctzpadcpy.f000644 000766 000024 00000020006 10363532303 020464 0ustar00juliestaff000000 000000 SUBROUTINE CTZPADCPY( UPLO, DIAG, M, N, IOFFD, A, LDA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, UPLO INTEGER IOFFD, LDA, LDB, M, N * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CTZPADCPY copies an array A into an array B. The unchanged part of B * is padded with zeros. The diagonal of B specified by IOFFD may be set * to ones. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be copied as follows: * = 'L' or 'l': Lower triangular part is copied; the * strictly upper triangular part of B is * padded with zeros, * = 'U' or 'u': Upper triangular part is copied; the * strictly lower triangular part of B is * padded with zeros. * * DIAG (input) CHARACTER*1 * On entry, DIAG specifies whether or not the diagonal of B is * to be set to ones or not as follows: * * DIAG = 'N' or 'n': the diagonals of A are copied into the * diagonals of B, otherwise the diagonals of B are set to ones. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * A (input) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U', the leading m by n part of the array A must * contain the upper trapezoidal part of the matrix to be copied * as specified by IOFFD, UPLO and DIAG, and the strictly lower * trapezoidal part of A is not referenced; When UPLO = 'L',the * leading m by n part of the array A must contain the lower * trapezoidal part of the matrix to be copied as specified by * IOFFD, UPLO and DIAG and the strictly upper trapezoidal part * of A is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * B (output) COMPLEX array * On entry, B is an array of dimension (LDB,N). On exit, this * array contains the padded copy of A as specified by IOFFD, * UPLO and DIAG. * * LDB (input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, ITMP, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M B( I, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE * JTMP = MIN( M - IOFFD, N ) * IF( LSAME( DIAG, 'N' ) ) THEN DO 50 J = MN + 1, JTMP ITMP = J + IOFFD DO 30 I = 1, ITMP - 1 B( I, J ) = ZERO 30 CONTINUE DO 40 I = ITMP, M B( I, J ) = A( I, J ) 40 CONTINUE 50 CONTINUE ELSE DO 80 J = MN + 1, JTMP ITMP = J + IOFFD DO 60 I = 1, ITMP - 1 B( I, J ) = ZERO 60 CONTINUE B( ITMP, J ) = ONE DO 70 I = ITMP + 1, M B( I, J ) = A( I, J ) 70 CONTINUE 80 CONTINUE END IF * DO 100 J = JTMP + 1, N DO 90 I = 1, M B( I, J ) = ZERO 90 CONTINUE 100 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * JTMP = MAX( 0, -IOFFD ) * DO 120 J = 1, JTMP DO 110 I = 1, M B( I, J ) = ZERO 110 CONTINUE 120 CONTINUE * MN = MIN( M - IOFFD, N ) * IF( LSAME( DIAG, 'N' ) ) THEN DO 150 J = JTMP + 1, MN ITMP = J + IOFFD DO 130 I = 1, ITMP B( I, J ) = A( I, J ) 130 CONTINUE DO 140 I = ITMP + 1, M B( I, J ) = ZERO 140 CONTINUE 150 CONTINUE ELSE DO 180 J = JTMP + 1, MN ITMP = J + IOFFD DO 160 I = 1, ITMP - 1 B( I, J ) = A( I, J ) 160 CONTINUE B( ITMP, J ) = ONE DO 170 I = ITMP + 1, M B( I, J ) = ZERO 170 CONTINUE 180 CONTINUE END IF * DO 200 J = MAX( 0, MN ) + 1, N DO 190 I = 1, M B( I, J ) = A( I, J ) 190 CONTINUE 200 CONTINUE * ELSE * DO 220 J = 1, N DO 210 I = 1, M B( I, J ) = A( I, J ) 210 CONTINUE 220 CONTINUE * END IF * RETURN * * End of CTZPADCPY * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/ctzscal.f000644 000766 000024 00000015274 10363532303 020141 0ustar00juliestaff000000 000000 SUBROUTINE CTZSCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX ALPHA * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CTZSCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha, i.e., the value * by which the diagonal and offdiagonal entries of the array A * as specified by UPLO and IOFFD are scaled. * * A (input/output) COMPLEX array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER J, JTMP, MN * .. * .. External Subroutines .. EXTERNAL CSCAL, CTZPAD * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ).OR.( ALPHA.EQ.ONE ) ) $ RETURN * * Start the operations * IF( ALPHA.EQ.ZERO ) THEN CALL CTZPAD( UPLO, 'N', M, N, IOFFD, ZERO, ZERO, A, LDA ) RETURN END IF * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 10 J = 1, MIN( MN, N ) CALL CSCAL( M, ALPHA, A( 1, J ), 1 ) 10 CONTINUE DO 20 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD IF( M.GE.JTMP ) $ CALL CSCAL( M-JTMP+1, ALPHA, A( JTMP, J ), 1 ) 20 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 30 J = MAX( 0, -IOFFD ) + 1, MN CALL CSCAL( J + IOFFD, ALPHA, A( 1, J ), 1 ) 30 CONTINUE DO 40 J = MAX( 0, MN ) + 1, N CALL CSCAL( M, ALPHA, A( 1, J ), 1 ) 40 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 50 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 50 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 60 J = 1, N CALL CSCAL( M, ALPHA, A( 1, J ), 1 ) 60 CONTINUE * END IF * RETURN * * End of CTZSCAL * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cvvdotc.f000644 000766 000024 00000003552 10363532303 020142 0ustar00juliestaff000000 000000 SUBROUTINE CVVDOTC( N, DOT, X, INCX, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N COMPLEX DOT * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * .. * * Purpose * ======= * * CVVDOTC computes the following dot product: * * dot = dot + x**H * y, * * where x and y are n vectors. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vectors x and y. N * must be at least zero. * * DOT (input/output) COMPLEX * On exit, DOT is updated with the dot product of the vectors x * and y. * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented * array Y must contain the vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. COMPLEX CDOTC EXTERNAL CDOTC * .. * .. Executable Statements .. * DOT = DOT + CDOTC( N, X, INCX, Y, INCY ) * RETURN * * End of CVVDOTC * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/cvvdotu.f000644 000766 000024 00000003552 10363532303 020164 0ustar00juliestaff000000 000000 SUBROUTINE CVVDOTU( N, DOT, X, INCX, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N COMPLEX DOT * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * .. * * Purpose * ======= * * CVVDOTU computes the following dot product: * * dot = dot + x**T * y, * * where x and y are n vectors. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vectors x and y. N * must be at least zero. * * DOT (input/output) COMPLEX * On exit, DOT is updated with the dot product of the vectors x * and y. * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented * array Y must contain the vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. COMPLEX CDOTU EXTERNAL CDOTU * .. * .. Executable Statements .. * DOT = DOT + CDOTU( N, X, INCX, Y, INCY ) * RETURN * * End of CVVDOTU * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dagemv.f000644 000766 000024 00000017541 10363532303 017740 0ustar00juliestaff000000 000000 SUBROUTINE DAGEMV( TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER INCX, INCY, LDA, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DAGEMV performs one of the matrix-vector operations * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * or * * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an m by n matrix. * * Arguments * ========= * * TRANS (input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ) * * TRANS = 'T' or 't': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * TRANS = 'C' or 'c': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the real scalar alpha. * * A (input) DOUBLE PRECISION array of dimension ( LDA, n ). * On entry, A is an array of dimension ( LDA, N ). The leading * m by n part of the array A must contain the matrix of coef- * ficients. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * X (input) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' and at * least ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. Before entry, * the incremented array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) DOUBLE PRECISION * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' and at * least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. Before entry * with BETA non-zero, the incremented array Y must contain the * vector y. On exit, the incremented array Y is overwritten by * the updated vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY DOUBLE PRECISION ABSX, TALPHA, TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = 1 ELSE IF( M.LT.0 ) THEN INFO = 2 ELSE IF( N.LT.0 ) THEN INFO = 3 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DAGEMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF( LSAME( TRANS, 'N' ) ) THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := abs( beta*y ). * IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 20, I = 1, LENY Y( I ) = ABS( Y( I ) ) 20 CONTINUE ELSE DO 30, I = 1, LENY Y( I ) = ABS( BETA * Y( I ) ) 30 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 40, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 40 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 50, I = 1, LENY Y( IY ) = ABS( Y( IY ) ) IY = IY + INCY 50 CONTINUE ELSE DO 60, I = 1, LENY Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 60 CONTINUE END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( TRANS, 'N' ) ) THEN * * Form y := abs( alpha ) * abs( A ) * abs( x ) + y. * JX = KX IF( INCY.EQ.1 ) THEN DO 80, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX DO 70, I = 1, M Y( I ) = Y( I ) + TEMP * ABS( A( I, J ) ) 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE ELSE DO 100, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY DO 90, I = 1, M Y( IY ) = Y( IY ) + TEMP * ABS( A( I, J ) ) IY = IY + INCY 90 CONTINUE END IF JX = JX + INCX 100 CONTINUE END IF * ELSE * * Form y := abs( alpha ) * abs( A' ) * abs( x ) + y. * JY = KY IF( INCX.EQ.1 ) THEN DO 120, J = 1, N TEMP = ZERO DO 110, I = 1, M TEMP = TEMP + ABS( A( I, J ) * X( I ) ) 110 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 120 CONTINUE ELSE DO 140, J = 1, N TEMP = ZERO IX = KX DO 130, I = 1, M TEMP = TEMP + ABS( A( I, J ) * X( IX ) ) IX = IX + INCX 130 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 140 CONTINUE END IF END IF * RETURN * * End of DAGEMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dascal.f000644 000766 000024 00000010025 10363532303 017712 0ustar00juliestaff000000 000000 SUBROUTINE DASCAL( N, ALPHA, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. DOUBLE PRECISION X( * ) * .. * * Purpose * ======= * * DASCAL performs the following operation: * * x := abs( alpha ) * abs( x ), * * where alpha is a scalar and x is an n vector. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * X (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. On exit, entries of the * incremented array X are mutiplied by alpha in absolute value. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, M, MP1 * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( N.LT.0 ) THEN INFO = 1 ELSE IF( INCX.EQ.0 ) THEN INFO = 4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DASCAL', INFO ) RETURN END IF * * Quick return if possible. * IF( N.LE.0 ) $ RETURN * * Form x := abs( alpha ) * abs( x ) * IF( INCX.EQ.1 ) $ GO TO 40 * * code for increments not equal to 1 * * Set up the start point in X. * IF( INCX.GT.0 ) THEN IX = 1 ELSE IX = 1 - ( N - 1 ) * INCX END IF * IF( ALPHA.EQ.ZERO ) THEN DO 10 I = 1, N X( IX ) = ZERO IX = IX + INCX 10 CONTINUE ELSE IF( ALPHA.EQ.ONE ) THEN DO 20 I = 1, N X( IX ) = ABS( X( IX ) ) IX = IX + INCX 20 CONTINUE ELSE DO 30 I = 1, N X( IX ) = ABS( ALPHA * X( IX ) ) IX = IX + INCX 30 CONTINUE END IF * RETURN * * code for increment equal to 1 * * clean-up loop * 40 M = MOD( N, 4 ) * IF( M.EQ.0 ) $ GO TO 80 * IF( ALPHA.EQ.ZERO ) THEN DO 50 I = 1, M X( I ) = ZERO 50 CONTINUE ELSE IF( ALPHA.EQ.ONE ) THEN DO 60 I = 1, M X( I ) = ABS( X( I ) ) 60 CONTINUE ELSE DO 70 I = 1, M X( I ) = ABS( ALPHA * X( I ) ) 70 CONTINUE END IF * IF( N.LT.4 ) $ RETURN * 80 MP1 = M + 1 * IF( ALPHA.EQ.ZERO ) THEN DO 90 I = MP1, N, 4 X( I ) = ZERO X( I + 1 ) = ZERO X( I + 2 ) = ZERO X( I + 3 ) = ZERO 90 CONTINUE ELSE IF( ALPHA.EQ.ONE ) THEN DO 100 I = MP1, N, 4 X( I ) = ABS( X( I ) ) X( I + 1 ) = ABS( X( I + 1 ) ) X( I + 2 ) = ABS( X( I + 2 ) ) X( I + 3 ) = ABS( X( I + 3 ) ) 100 CONTINUE ELSE DO 110 I = MP1, N, 4 X( I ) = ABS( ALPHA * X( I ) ) X( I + 1 ) = ABS( ALPHA * X( I + 1 ) ) X( I + 2 ) = ABS( ALPHA * X( I + 2 ) ) X( I + 3 ) = ABS( ALPHA * X( I + 3 ) ) 110 CONTINUE END IF * RETURN * * End of DASCAL * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dasqrtb.f000644 000766 000024 00000002145 10363532303 020127 0ustar00juliestaff000000 000000 SUBROUTINE DASQRTB( A, B, C ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. DOUBLE PRECISION A, B, C * .. * * Purpose * ======= * * DASQRTB computes c := a * sqrt( b ) where a, b and c are scalars. * * Arguments * ========= * * A (input) DOUBLE PRECISION * On entry, A specifies the scalar a. * * B (input) DOUBLE PRECISION * On entry, B specifies the scalar b. * * C (output) DOUBLE PRECISION * On entry, C specifies the scalar c. On exit, c is overwritten * by the product of a and the square root of b. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC SQRT * .. * .. Executable Statements .. * C = A * SQRT( B ) * RETURN * * End of DASQRTB * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dasymv.f000644 000766 000024 00000020420 10363532303 017766 0ustar00juliestaff000000 000000 SUBROUTINE DASYMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DASYMV performs the following matrix-vector operation * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n symmetric matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower triangu- * lar part of the array A is to be referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A is * to be referenced. * UPLO = 'L' or 'l' Only the lower triangular part of A is * to be referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the real scalar alpha. * * A (input) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. When UPLO = 'L' or 'l', the leading n by n part of * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) DOUBLE PRECISION * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY DOUBLE PRECISION TALPHA, TEMP0, TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DASYMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := abs( beta * y ). * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 40 CONTINUE END IF END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN DO 60, J = 1, N TEMP1 = TALPHA * ABS( X( J ) ) TEMP2 = ZERO DO 50, I = 1, J - 1 TEMP0 = ABS( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * ABS( X( I ) ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1 * ABS( A( J, J ) ) + $ ALPHA * TEMP2 * 60 CONTINUE * ELSE * JX = KX JY = KY * DO 80, J = 1, N TEMP1 = TALPHA * ABS( X( JX ) ) TEMP2 = ZERO IX = KX IY = KY * DO 70, I = 1, J - 1 TEMP0 = ABS( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * ABS( X( IX ) ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1 * ABS( A( J, J ) ) + $ ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 80 CONTINUE * END IF * ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN * DO 100, J = 1, N * TEMP1 = TALPHA * ABS( X( J ) ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1 * ABS( A( J, J ) ) * DO 90, I = J + 1, N TEMP0 = ABS( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * ABS( X( I ) ) * 90 CONTINUE * Y( J ) = Y( J ) + ALPHA * TEMP2 * 100 CONTINUE * ELSE * JX = KX JY = KY * DO 120, J = 1, N TEMP1 = TALPHA * ABS( X( JX ) ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1 * ABS( A( J, J ) ) IX = JX IY = JY * DO 110, I = J + 1, N * IX = IX + INCX IY = IY + INCY TEMP0 = ABS( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * ABS( X( IX ) ) * 110 CONTINUE * Y( JY ) = Y( JY ) + ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 120 CONTINUE * END IF * END IF * RETURN * * End of DASYMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/datrmv.f000644 000766 000024 00000030241 10363532303 017762 0ustar00juliestaff000000 000000 SUBROUTINE DATRMV( UPLO, TRANS, DIAG, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER INCX, INCY, LDA, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DATRMV performs one of the matrix-vector operations * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * or * * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n unit or non-unit, upper or lower triangular * matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * TRANS (input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ) * * TRANS = 'T' or 't': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * TRANS = 'C' or 'c': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * DIAG (input) CHARACTER*1 * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit triangular. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the real scalar alpha. * * A (input) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the matrix A and * the strictly lower triangular part of A is not referenced. * When UPLO = 'L' or 'l', the leading n by n part of the array * A must contain the lower triangular part of the matrix A and * the strictly upper trapezoidal part of A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of A * are not referenced either, but are assumed to be unity. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) DOUBLE PRECISION * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY LOGICAL NOUNIT DOUBLE PRECISION ABSX, TALPHA, TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 7 ELSE IF( INCX.EQ.0 )THEN INFO = 9 ELSE IF( INCY.EQ.0 ) THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DATRMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := abs( beta*y ). * IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 20, I = 1, N Y( I ) = ABS( Y( I ) ) 20 CONTINUE ELSE DO 30, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 30 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 40, I = 1, N Y( IY ) = ZERO IY = IY + INCY 40 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 50, I = 1, N Y( IY ) = ABS( Y( IY ) ) IY = IY + INCY 50 CONTINUE ELSE DO 60, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 60 CONTINUE END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( TRANS, 'N' ) )THEN * * Form y := abs( alpha ) * abs( A ) * abs( x ) + y. * IF( LSAME( UPLO, 'U' ) )THEN JX = KX IF( INCY.EQ.1 ) THEN DO 80, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX DO 70, I = 1, J - 1 Y( I ) = Y( I ) + TEMP * ABS( A( I, J ) ) 70 CONTINUE * IF( NOUNIT ) THEN Y( J ) = Y( J ) + TEMP * ABS( A( J, J ) ) ELSE Y( J ) = Y( J ) + TEMP END IF END IF JX = JX + INCX 80 CONTINUE * ELSE * DO 100, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY DO 90, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP * ABS( A( I, J ) ) IY = IY + INCY 90 CONTINUE * IF( NOUNIT ) THEN Y( IY ) = Y( IY ) + TEMP * ABS( A( J, J ) ) ELSE Y( IY ) = Y( IY ) + TEMP END IF END IF JX = JX + INCX 100 CONTINUE * END IF * ELSE * JX = KX IF( INCY.EQ.1 ) THEN DO 120, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN * TEMP = TALPHA * ABSX * IF( NOUNIT ) THEN Y( J ) = Y( J ) + TEMP * ABS( A( J, J ) ) ELSE Y( J ) = Y( J ) + TEMP END IF * DO 110, I = J + 1, N Y( I ) = Y( I ) + TEMP * ABS( A( I, J ) ) 110 CONTINUE END IF JX = JX + INCX 120 CONTINUE * ELSE * DO 140, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY + ( J - 1 ) * INCY * IF( NOUNIT ) THEN Y( IY ) = Y( IY ) + TEMP * ABS( A( J, J ) ) ELSE Y( IY ) = Y( IY ) + TEMP END IF * DO 130, I = J + 1, N IY = IY + INCY Y( IY ) = Y( IY ) + TEMP * ABS( A( I, J ) ) 130 CONTINUE END IF JX = JX + INCX 140 CONTINUE * END IF * END IF * ELSE * * Form y := abs( alpha ) * abs( A' ) * abs( x ) + y. * IF( LSAME( UPLO, 'U' ) )THEN JY = KY IF( INCX.EQ.1 ) THEN DO 160, J = 1, N * TEMP = ZERO * DO 150, I = 1, J - 1 TEMP = TEMP + ABS( A( I, J ) * X( I ) ) 150 CONTINUE * IF( NOUNIT ) THEN TEMP = TEMP + ABS( A( J, J ) * X( J ) ) ELSE TEMP = TEMP + ABS( X( J ) ) END IF * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 160 CONTINUE * ELSE * DO 180, J = 1, N TEMP = ZERO IX = KX DO 170, I = 1, J - 1 TEMP = TEMP + ABS( A( I, J ) * X( IX ) ) IX = IX + INCX 170 CONTINUE * IF( NOUNIT ) THEN TEMP = TEMP + ABS( A( J, J ) * X( IX ) ) ELSE TEMP = TEMP + ABS( X( IX ) ) END IF * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 180 CONTINUE * END IF * ELSE * JY = KY IF( INCX.EQ.1 ) THEN * DO 200, J = 1, N * IF( NOUNIT ) THEN TEMP = ABS( A( J, J ) * X( J ) ) ELSE TEMP = ABS( X( J ) ) END IF * DO 190, I = J + 1, N TEMP = TEMP + ABS( A( I, J ) * X( I ) ) 190 CONTINUE * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 200 CONTINUE * ELSE * DO 220, J = 1, N * IX = KX + ( J - 1 ) * INCX * IF( NOUNIT ) THEN TEMP = ABS( A( J, J ) * X( IX ) ) ELSE TEMP = ABS( X( IX ) ) END IF * DO 210, I = J + 1, N IX = IX + INCX TEMP = TEMP + ABS( A( I, J ) * X( IX ) ) 210 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 220 CONTINUE END IF END IF * END IF * RETURN * * End of DATRMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dcshft.f000644 000766 000024 00000005542 10363532303 017746 0ustar00juliestaff000000 000000 SUBROUTINE DCSHFT( M, N, OFFSET, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, M, N, OFFSET * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DCSHFT shifts columns of an m by n array A by OFFSET. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A. M must be at * least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A to be * shifted. N must be at least zero. * * OFFSET (local input) INTEGER * On entry, OFFSET specifies the offset by which the columns of * A should be shifted. OFFSET can be positive or negative (see * below for further details). When OFFSET is positive, the co- * lumns are shifted to the right. When OFFSET is negative, the * columns of A are shifted to the left. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N+ABS(OFFSET) ). * On exit, A contains the shifted array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Further Details * =============== * * N=3 OFFSET=6 -OFFSET=6 N=3 * ------------------- ------------------- * | 1 2 3 4 5 6 7 8 9 | M | 1 2 3 4 5 6 7 8 9 | * ------------------- ------------------- * V V * ------------------- ------------------- * | 1 2 3 4 5 6 1 2 3 | M | 7 8 9 4 5 6 7 8 9 | * ------------------- ------------------- * OFFSET >= 0 OFFSET <= 0 * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( ( OFFSET.EQ.0 ).OR.( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * IF( OFFSET.GT.0 ) THEN DO 20 J = N, 1, -1 DO 10 I = 1, M A( I, J+OFFSET ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = A( I, J-OFFSET ) 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of DCSHFT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dmmadd.f000644 000766 000024 00000010477 10363532303 017724 0ustar00juliestaff000000 000000 SUBROUTINE DMMADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DMMADD performs the following operation: * * B := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) DOUBLE PRECISION array * On entry, B is an array of dimension ( LDB, N ). On exit, the * leading m by n part of A has been added to the leading m by n * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DSCAL * .. * .. Executable Statements .. * IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL DCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) * DO 10 I = 1, M * B( I, J ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL DAXPY( M, ONE, A( 1, J ), 1, B( 1, J ), 1 ) * DO 50 I = 1, M * B( I, J ) = A( I, J ) + B( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL DAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) * DO 110 I = 1, M * B( I, J ) = ALPHA * A( I, J ) + B( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, N CALL DSCAL( M, BETA, B( 1, J ), 1 ) * DO 150 I = 1, M * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of DMMADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dmmcadd.f000644 000766 000024 00000010502 10363532303 020054 0ustar00juliestaff000000 000000 SUBROUTINE DMMCADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DMMCADD performs the following operation: * * B := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) DOUBLE PRECISION array * On entry, B is an array of dimension ( LDB, N ). On exit, the * leading m by n part of A has been added to the leading m by n * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DSCAL * .. * .. Executable Statements .. * IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL DCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) * DO 10 I = 1, M * B( I, J ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL DAXPY( M, ONE, A( 1, J ), 1, B( 1, J ), 1 ) * DO 50 I = 1, M * B( I, J ) = A( I, J ) + B( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL DAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) * DO 110 I = 1, M * B( I, J ) = ALPHA * A( I, J ) + B( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, N CALL DSCAL( M, BETA, B( 1, J ), 1 ) * DO 150 I = 1, M * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of DMMCADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dmmdda.f000644 000766 000024 00000010503 10363532303 017712 0ustar00juliestaff000000 000000 SUBROUTINE DMMDDA( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DMMDDA performs the following operation: * * A := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading m by n part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) DOUBLE PRECISION array * On entry, B is an array of dimension ( LDB, N ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DSCAL * .. * .. Executable Statements .. * IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL DCOPY( M, B( 1, J ), 1, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( I, J ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL DAXPY( M, ONE, B( 1, J ), 1, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( I, J ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( I, J ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL DAXPY( M, BETA, B( 1, J ), 1, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( I, J ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL DSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of DMMDDA * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dmmddac.f000644 000766 000024 00000010506 10363532303 020060 0ustar00juliestaff000000 000000 SUBROUTINE DMMDDAC( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DMMDDAC performs the following operation: * * A := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading m by n part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) DOUBLE PRECISION array * On entry, B is an array of dimension ( LDB, N ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DSCAL * .. * .. Executable Statements .. * IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL DCOPY( M, B( 1, J ), 1, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( I, J ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL DAXPY( M, ONE, B( 1, J ), 1, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( I, J ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( I, J ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL DAXPY( M, BETA, B( 1, J ), 1, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( I, J ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL DSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of DMMDDAC * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dmmddact.f000644 000766 000024 00000015153 10363532303 020247 0ustar00juliestaff000000 000000 SUBROUTINE DMMDDACT( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DMMDDACT performs the following operation: * * A := alpha * A + beta * B', * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading n by m part of B has been added into the leading m by * n part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) DOUBLE PRECISION array * On entry, B is an array of dimension ( LDB, M ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL DCOPY( M, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( J, I ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( J, I ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL DAXPY( M, ONE, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( J, I ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( J, I ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( J, I ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL DAXPY( M, BETA, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( J, I ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL DSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 180 J = 1, M CALL DCOPY( N, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 170 I = 1, N * A( J, I ) = B( I, J ) * 170 CONTINUE 180 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N A( J, I ) = B( I, J ) + ALPHA * A( J, I ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL DAXPY( N, ONE, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 210 I = 1, N * A( J, I ) = B( I, J ) + A( J, I ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N A( J, I ) = BETA * B( I, J ) 230 CONTINUE 240 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N A( J, I ) = BETA * B( I, J ) + ALPHA * A( J, I ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL DAXPY( N, BETA, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 270 I = 1, N * A( J, I ) = BETA * B( I, J ) + A( J, I ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 300 J = 1, N DO 290 I = 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 320 J = 1, N CALL DSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 310 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of DMMDDACT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dmmddat.f000644 000766 000024 00000015146 10363532303 020106 0ustar00juliestaff000000 000000 SUBROUTINE DMMDDAT( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DMMDDAT performs the following operation: * * A := alpha * A + beta * B', * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading n by m part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) DOUBLE PRECISION array * On entry, B is an array of dimension ( LDB, M ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL DCOPY( M, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( J, I ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( J, I ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL DAXPY( M, ONE, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( J, I ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( J, I ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( J, I ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL DAXPY( M, BETA, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( J, I ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL DSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 180 J = 1, M CALL DCOPY( N, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 170 I = 1, N * A( J, I ) = B( I, J ) * 170 CONTINUE 180 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N A( J, I ) = B( I, J ) + ALPHA * A( J, I ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL DAXPY( N, ONE, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 210 I = 1, N * A( J, I ) = B( I, J ) + A( J, I ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N A( J, I ) = BETA * B( I, J ) 230 CONTINUE 240 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N A( J, I ) = BETA * B( I, J ) + ALPHA * A( J, I ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL DAXPY( N, BETA, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 270 I = 1, N * A( J, I ) = BETA * B( I, J ) + A( J, I ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 300 J = 1, N DO 290 I = 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 320 J = 1, N CALL DSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 310 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of DMMDDAT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dmmtadd.f000644 000766 000024 00000015136 10363532303 020105 0ustar00juliestaff000000 000000 SUBROUTINE DMMTADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DMMTADD performs the following operation: * * B := alpha * A' + beta * B, * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) DOUBLE PRECISION array * On entry, B is an array of dimension ( LDB, M ). On exit, the * leading m by n part of A has been added to the leading n by m * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL DCOPY( M, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 10 I = 1, M * B( J, I ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( J, I ) = A( I, J ) + BETA * B( J, I ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL DAXPY( M, ONE, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 50 I = 1, M * B( J, I ) = A( I, J ) + B( J, I ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( J, I ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( J, I ) = ALPHA * A( I, J ) + BETA * B( J, I ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL DAXPY( M, ALPHA, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 110 I = 1, M * B( J, I ) = ALPHA * A( I, J ) + B( J, I ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, M DO 130 I = 1, N B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, M CALL DSCAL( N, BETA, B( 1, J ), 1 ) * DO 150 I = 1, N * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, M CALL DCOPY( N, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 170 I = 1, N * B( I, J ) = A( J, I ) * 170 CONTINUE 180 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N B( I, J ) = A( J, I ) + BETA * B( I, J ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL DAXPY( N, ONE, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 210 I = 1, N * B( I, J ) = A( J, I ) + B( I, J ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N B( I, J ) = ALPHA * A( J, I ) 230 CONTINUE 240 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL DAXPY( N, ALPHA, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 270 I = 1, N * B( I, J ) = ALPHA * A( J, I ) + B( I, J ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 300 J = 1, M DO 290 I = 1, N B( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 320 J = 1, M CALL DSCAL( N, BETA, B( 1, J ), 1 ) * DO 310 I = 1, N * B( I, J ) = BETA * B( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of DMMTADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dmmtcadd.f000644 000766 000024 00000015141 10363532303 020244 0ustar00juliestaff000000 000000 SUBROUTINE DMMTCADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DMMTCADD performs the following operation: * * B := alpha * A' + beta * B, * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) DOUBLE PRECISION array * On entry, B is an array of dimension ( LDB, M ). On exit, the * leading m by n part of A has been added to the leading n by m * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL DCOPY( M, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 10 I = 1, M * B( J, I ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( J, I ) = A( I, J ) + BETA * B( J, I ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL DAXPY( M, ONE, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 50 I = 1, M * B( J, I ) = A( I, J ) + B( J, I ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( J, I ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( J, I ) = ALPHA * A( I, J ) + BETA * B( J, I ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL DAXPY( M, ALPHA, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 110 I = 1, M * B( J, I ) = ALPHA * A( I, J ) + B( J, I ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, M DO 130 I = 1, N B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, M CALL DSCAL( N, BETA, B( 1, J ), 1 ) * DO 150 I = 1, N * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, M CALL DCOPY( N, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 170 I = 1, N * B( I, J ) = A( J, I ) * 170 CONTINUE 180 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N B( I, J ) = A( J, I ) + BETA * B( I, J ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL DAXPY( N, ONE, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 210 I = 1, N * B( I, J ) = A( J, I ) + B( I, J ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N B( I, J ) = ALPHA * A( J, I ) 230 CONTINUE 240 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL DAXPY( N, ALPHA, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 270 I = 1, N * B( I, J ) = ALPHA * A( J, I ) + B( I, J ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 300 J = 1, M DO 290 I = 1, N B( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 320 J = 1, M CALL DSCAL( N, BETA, B( 1, J ), 1 ) * DO 310 I = 1, N * B( I, J ) = BETA * B( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of DMMTCADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/drshft.f000644 000766 000024 00000006201 10363532303 017756 0ustar00juliestaff000000 000000 SUBROUTINE DRSHFT( M, N, OFFSET, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, M, N, OFFSET * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DRSHFT shifts rows of an m by n array A by OFFSET. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A to be shifted. * M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A. N must be * at least zero. * * OFFSET (local input) INTEGER * On entry, OFFSET specifies the offset by which the rows of * A should be shifted. OFFSET can be positive or negative (see * below for further details). When OFFSET is positive, the rows * are shifted to the bottom. When OFFSET is negative, the rows * of A are shifted to the top. * * A (local input/local output) DOUBLE PRECISION array * On entry, A is an array of dimension ( LDA, N ). On exit, A * contains the shifted array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M+ABS(OFFSET) ). * * Further Details * =============== * * N N N N * --- --- --- --- * | 1 | | 1 | | 1 | | 7 | * | 2 | M = 3 | 2 | | 2 | M = 3 | 8 | * | 3 | | 3 | | 3 | | 9 | * | 4 | | 4 | | 4 | | 4 | * | 5 | becomes | 5 | | 5 | becomes | 5 | * | 6 | | 6 | | 6 | | 6 | * | 7 | | 1 | | 7 | | 7 | * | 8 | OFFSET = 6 | 2 | | 8 | OFFSET = -6 | 8 | * | 9 | | 3 | | 9 | | 9 | * --- --- --- --- * OFFSET >= 0 OFFSET <= 0 * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( ( OFFSET.EQ.0 ).OR.( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * IF( OFFSET.GT.0 ) THEN DO 20 J = 1, N DO 10 I = M, 1, -1 A( I+OFFSET, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = A( I-OFFSET, J ) 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of DRSHFT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dset.f000644 000766 000024 00000005215 10363532303 017427 0ustar00juliestaff000000 000000 SUBROUTINE DSET( N, ALPHA, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. DOUBLE PRECISION X( * ) * .. * * Purpose * ======= * * DSET sets the entries of an n vector x to the scalar alpha. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha. * * X (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. On exit, entries of the * incremented array X are set to alpha. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, INFO, IX, M, MP1 * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( N.LT.0 ) THEN INFO = 1 ELSE IF( INCX.EQ.0 ) THEN INFO = 4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSET', INFO ) RETURN END IF * * Quick return if possible. * IF( N.LE.0 ) $ RETURN * * Form x := alpha * IF( INCX.EQ.1 ) $ GO TO 20 * * code for increments not equal to 1 * * Set up the start point in X. * IF( INCX.GT.0 ) THEN IX = 1 ELSE IX = 1 - ( N - 1 ) * INCX END IF * DO 10 I = 1, N X( IX ) = ALPHA IX = IX + INCX 10 CONTINUE * RETURN * * code for increment equal to 1 * * clean-up loop * 20 M = MOD( N, 4 ) * IF( M.EQ.0 ) $ GO TO 40 * DO 30 I = 1, M X( I ) = ALPHA 30 CONTINUE IF( N.LT.4 ) $ RETURN * 40 MP1 = M + 1 DO 50 I = MP1, N, 4 X( I ) = ALPHA X( I + 1 ) = ALPHA X( I + 2 ) = ALPHA X( I + 3 ) = ALPHA 50 CONTINUE * RETURN * * End of DSET * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dtzpad.f000644 000766 000024 00000020774 10363532303 017765 0ustar00juliestaff000000 000000 SUBROUTINE DTZPAD( UPLO, HERM, M, N, IOFFD, ALPHA, BETA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 HERM, UPLO INTEGER IOFFD, LDA, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DTZPAD initializes a two-dimensional array A to beta on the diagonal * specified by IOFFD or zeros the imaginary part of those diagonals and * set the offdiagonals to alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be set as follows: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of A is not changed, * = 'D' or 'd': diagonal specified by IOFFD is set; the * rest of the array A is unchanged, * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of A is not changed, * Otherwise: All of the array A is set. * * HERM (input) CHARACTER*1 * On entry, HERM specifies what should be done to the diagonals * as follows. When UPLO is 'L', 'l', 'D', 'd', 'U' or 'u' and * HERM is 'Z' or 'z', the imaginary part of the diagonals is * set to zero. Otherwise, the diagonals are set to beta. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha, i.e., the value * to which the offdiagonal entries of the array A determined by * UPLO and IOFFD are set. * * BETA (input) DOUBLE PRECISION * On entry, BETA specifies the scalar beta, i.e., the value to * which the diagonal entries specified by IOFFD of the array A * are set. BETA is not referenced when UPLO is 'L', 'l', 'U' or * 'u' and HERM is 'Z'. * * A (input/output) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U', the leading m by n part of the array A must * contain the upper trapezoidal part of the matrix to be set as * specified by IOFFD, and the strictly lower trapezoidal part * of A is not referenced; When UPLO = 'L', the leading m by n * part of the array A must contain the lower trapezoidal part * of the matrix to be set as specified by IOFFD, and the * strictly upper trapezoidal part of A is not referenced. On * exit, the entries of the trapezoid part of A determined by * UPLO, HERM and IOFFD are set. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals and set the strictly lower triangular part of the * array to ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA 10 CONTINUE 20 CONTINUE * IF( LSAME( HERM, 'Z' ) ) THEN DO 40 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD DO 30 I = JTMP + 1, M A( I, J ) = ALPHA 30 CONTINUE 40 CONTINUE ELSE DO 60 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = BETA DO 50 I = JTMP + 1, M A( I, J ) = ALPHA 50 CONTINUE 60 CONTINUE END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals and set the strictly upper triangular part of the * array to ALPHA. * MN = MIN( M - IOFFD, N ) IF( LSAME( HERM, 'Z' ) ) THEN DO 80 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 70 I = 1, JTMP - 1 A( I, J ) = ALPHA 70 CONTINUE 80 CONTINUE ELSE DO 100 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 90 I = 1, JTMP - 1 A( I, J ) = ALPHA 90 CONTINUE A( JTMP, J ) = BETA 100 CONTINUE END IF DO 120 J = MAX( 0, MN ) + 1, N DO 110 I = 1, M A( I, J ) = ALPHA 110 CONTINUE 120 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Set the diagonal to BETA * IF( .NOT.( LSAME( HERM, 'Z' ) ) ) THEN IF( ( IOFFD.LT.M ).AND.( IOFFD.GT.-N ) ) THEN DO 130 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 130 CONTINUE END IF END IF * ELSE * * Set the diagonals to BETA and the offdiagonals to ALPHA. * DO 150 J = 1, N DO 140 I = 1, M A( I, J ) = ALPHA 140 CONTINUE 150 CONTINUE IF( ALPHA.NE.BETA .AND. IOFFD.LT.M .AND. IOFFD.GT.-N ) THEN DO 160 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 160 CONTINUE END IF * END IF * RETURN * * End of DTZPAD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dtzpadcpy.f000644 000766 000024 00000017747 10363532303 020507 0ustar00juliestaff000000 000000 SUBROUTINE DTZPADCPY( UPLO, DIAG, M, N, IOFFD, A, LDA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, UPLO INTEGER IOFFD, LDA, LDB, M, N * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DTZPADCPY copies an array A into an array B. The unchanged part of B * is padded with zeros. The diagonal of B specified by IOFFD may be set * to ones. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be copied as follows: * = 'L' or 'l': Lower triangular part is copied; the * strictly upper triangular part of B is * padded with zeros, * = 'U' or 'u': Upper triangular part is copied; the * strictly lower triangular part of B is * padded with zeros. * * DIAG (input) CHARACTER*1 * On entry, DIAG specifies whether or not the diagonal of B is * to be set to ones or not as follows: * * DIAG = 'N' or 'n': the diagonals of A are copied into the * diagonals of B, otherwise the diagonals of B are set to ones. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * A (input) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U', the leading m by n part of the array A must * contain the upper trapezoidal part of the matrix to be copied * as specified by IOFFD, UPLO and DIAG, and the strictly lower * trapezoidal part of A is not referenced; When UPLO = 'L',the * leading m by n part of the array A must contain the lower * trapezoidal part of the matrix to be copied as specified by * IOFFD, UPLO and DIAG and the strictly upper trapezoidal part * of A is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * B (output) DOUBLE PRECISION array * On entry, B is an array of dimension (LDB,N). On exit, this * array contains the padded copy of A as specified by IOFFD, * UPLO and DIAG. * * LDB (input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, ITMP, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M B( I, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE * JTMP = MIN( M - IOFFD, N ) * IF( LSAME( DIAG, 'N' ) ) THEN DO 50 J = MN + 1, JTMP ITMP = J + IOFFD DO 30 I = 1, ITMP - 1 B( I, J ) = ZERO 30 CONTINUE DO 40 I = ITMP, M B( I, J ) = A( I, J ) 40 CONTINUE 50 CONTINUE ELSE DO 80 J = MN + 1, JTMP ITMP = J + IOFFD DO 60 I = 1, ITMP - 1 B( I, J ) = ZERO 60 CONTINUE B( ITMP, J ) = ONE DO 70 I = ITMP + 1, M B( I, J ) = A( I, J ) 70 CONTINUE 80 CONTINUE END IF * DO 100 J = JTMP + 1, N DO 90 I = 1, M B( I, J ) = ZERO 90 CONTINUE 100 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * JTMP = MAX( 0, -IOFFD ) * DO 120 J = 1, JTMP DO 110 I = 1, M B( I, J ) = ZERO 110 CONTINUE 120 CONTINUE * MN = MIN( M - IOFFD, N ) * IF( LSAME( DIAG, 'N' ) ) THEN DO 150 J = JTMP + 1, MN ITMP = J + IOFFD DO 130 I = 1, ITMP B( I, J ) = A( I, J ) 130 CONTINUE DO 140 I = ITMP + 1, M B( I, J ) = ZERO 140 CONTINUE 150 CONTINUE ELSE DO 180 J = JTMP + 1, MN ITMP = J + IOFFD DO 160 I = 1, ITMP - 1 B( I, J ) = A( I, J ) 160 CONTINUE B( ITMP, J ) = ONE DO 170 I = ITMP + 1, M B( I, J ) = ZERO 170 CONTINUE 180 CONTINUE END IF * DO 200 J = MAX( 0, MN ) + 1, N DO 190 I = 1, M B( I, J ) = A( I, J ) 190 CONTINUE 200 CONTINUE * ELSE * DO 220 J = 1, N DO 210 I = 1, M B( I, J ) = A( I, J ) 210 CONTINUE 220 CONTINUE * END IF * RETURN * * End of DTZPADCPY * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dtzscal.f000644 000766 000024 00000015235 10363532303 020137 0ustar00juliestaff000000 000000 SUBROUTINE DTZSCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DTZSCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha, i.e., the value * by which the diagonal and offdiagonal entries of the array A * as specified by UPLO and IOFFD are scaled. * * A (input/output) DOUBLE PRECISION array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER J, JTMP, MN * .. * .. External Subroutines .. EXTERNAL DSCAL, DTZPAD * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ).OR.( ALPHA.EQ.ONE ) ) $ RETURN * * Start the operations * IF( ALPHA.EQ.ZERO ) THEN CALL DTZPAD( UPLO, 'N', M, N, IOFFD, ZERO, ZERO, A, LDA ) RETURN END IF * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 10 J = 1, MIN( MN, N ) CALL DSCAL( M, ALPHA, A( 1, J ), 1 ) 10 CONTINUE DO 20 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD IF( M.GE.JTMP ) $ CALL DSCAL( M-JTMP+1, ALPHA, A( JTMP, J ), 1 ) 20 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 30 J = MAX( 0, -IOFFD ) + 1, MN CALL DSCAL( J + IOFFD, ALPHA, A( 1, J ), 1 ) 30 CONTINUE DO 40 J = MAX( 0, MN ) + 1, N CALL DSCAL( M, ALPHA, A( 1, J ), 1 ) 40 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 50 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 50 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 60 J = 1, N CALL DSCAL( M, ALPHA, A( 1, J ), 1 ) 60 CONTINUE * END IF * RETURN * * End of DTZSCAL * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dvasum.f000644 000766 000024 00000002725 10363532303 017772 0ustar00juliestaff000000 000000 SUBROUTINE DVASUM( N, ASUM, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N DOUBLE PRECISION ASUM * .. * .. Array Arguments .. DOUBLE PRECISION X( * ) * .. * * Purpose * ======= * * DVASUM returns the sum of absolute values of the entries of a vector * x. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ASUM (output) DOUBLE PRECISION * On exit, ASUM specifies the sum of absolute values. * * X (input) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. DOUBLE PRECISION DASUM EXTERNAL DASUM * .. * .. Executable Statements .. * ASUM = DASUM( N, X, INCX ) * RETURN * * End of DVASUM * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dvvdot.f000644 000766 000024 00000003577 10363532303 020007 0ustar00juliestaff000000 000000 SUBROUTINE DVVDOT( N, DOT, X, INCX, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N DOUBLE PRECISION DOT * .. * .. Array Arguments .. DOUBLE PRECISION X( * ), Y( * ) * .. * * Purpose * ======= * * DVVDOT computes the following dot product: * * dot = dot + x**T * y, * * where x and y are n vectors. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vectors x and y. N * must be at least zero. * * DOT (input/output) DOUBLE PRECISION * On exit, DOT is updated with the dot product of the vectors x * and y. * * X (input) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (input) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented * array Y must contain the vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. DOUBLE PRECISION DDOT EXTERNAL DDOT * .. * .. Executable Statements .. * DOT = DOT + DDOT( N, X, INCX, Y, INCY ) * RETURN * * End of DVVDOT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/dzvasum.f000644 000766 000024 00000002716 10363532303 020164 0ustar00juliestaff000000 000000 SUBROUTINE DZVASUM( N, ASUM, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N DOUBLE PRECISION ASUM * .. * .. Array Arguments .. COMPLEX*16 X( * ) * .. * * Purpose * ======= * * DZVASUM returns the sum of absolute values of the entries of a vector * x. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ASUM (output) COMPLEX*16 * On exit, ASUM specifies the sum of absolute values. * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. DOUBLE PRECISION DZASUM EXTERNAL DZASUM * .. * .. Executable Statements .. * ASUM = DZASUM( N, X, INCX ) * RETURN * * End of DZVASUM * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/immadd.f000644 000766 000024 00000007712 10363532303 017727 0ustar00juliestaff000000 000000 SUBROUTINE IMMADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N INTEGER ALPHA, BETA * .. * .. Array Arguments .. INTEGER A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * IMMADD performs the following operation: * * B := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) INTEGER * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) INTEGER array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) INTEGER * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) INTEGER array * On entry, B is an array of dimension ( LDB, N ). On exit, the * leading m by n part of A has been added to the leading m by n * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER ONE, ZERO PARAMETER ( ONE = 1, ZERO = 0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M B( I, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( I, J ) = A( I, J ) + B( I, J ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, N DO 150 I = 1, M B( I, J ) = BETA * B( I, J ) 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of IMMADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/immdda.f000644 000766 000024 00000007716 10363532303 017733 0ustar00juliestaff000000 000000 SUBROUTINE IMMDDA( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N INTEGER ALPHA, BETA * .. * .. Array Arguments .. INTEGER A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * IMMDDA performs the following operation: * * A := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) INTEGER * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) INTEGER array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading m by n part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) INTEGER * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) INTEGER array * On entry, B is an array of dimension ( LDB, N ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER ONE, ZERO PARAMETER ( ONE = 1, ZERO = 0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = B( I, J ) 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( I, J ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M A( I, J ) = B( I, J ) + A( I, J ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( I, J ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M A( I, J ) = BETA * B( I, J ) + A( I, J ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N DO 150 I = 1, M A( I, J ) = ALPHA * A( I, J ) 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of IMMDDA * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/immddat.f000644 000766 000024 00000013735 10363532303 020115 0ustar00juliestaff000000 000000 SUBROUTINE IMMDDAT( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N INTEGER ALPHA, BETA * .. * .. Array Arguments .. INTEGER A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * IMMDDAT performs the following operation: * * A := alpha * A + beta * B', * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) INTEGER * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) INTEGER array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading n by m part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) INTEGER * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) INTEGER array * On entry, B is an array of dimension ( LDB, M ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER ONE, ZERO PARAMETER ( ONE = 1, ZERO = 0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = B( J, I ) 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( J, I ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M A( I, J ) = B( J, I ) + A( I, J ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( J, I ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( J, I ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M A( I, J ) = BETA * B( J, I ) + A( I, J ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N DO 150 I = 1, M A( I, J ) = ALPHA * A( I, J ) 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 180 J = 1, M DO 170 I = 1, N A( J, I ) = B( I, J ) 170 CONTINUE 180 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N A( J, I ) = B( I, J ) + ALPHA * A( J, I ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M DO 210 I = 1, N A( J, I ) = B( I, J ) + A( J, I ) 210 CONTINUE 220 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N A( J, I ) = BETA * B( I, J ) 230 CONTINUE 240 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N A( J, I ) = BETA * B( I, J ) + ALPHA * A( J, I ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M DO 270 I = 1, N A( J, I ) = BETA * B( I, J ) + A( J, I ) 270 CONTINUE 280 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 300 J = 1, N DO 290 I = 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 320 J = 1, N DO 310 I = 1, M A( I, J ) = ALPHA * A( I, J ) 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of IMMDDAT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/immtadd.f000644 000766 000024 00000013725 10363532303 020114 0ustar00juliestaff000000 000000 SUBROUTINE IMMTADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N INTEGER ALPHA, BETA * .. * .. Array Arguments .. INTEGER A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * IMMTADD performs the following operation: * * B := alpha * A' + beta * B, * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) INTEGER * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) INTEGER array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) INTEGER * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) INTEGER array * On entry, B is an array of dimension ( LDB, M ). On exit, the * leading m by n part of A has been added to the leading n by m * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. INTEGER ONE, ZERO PARAMETER ( ONE = 1, ZERO = 0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M B( J, I ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( J, I ) = A( I, J ) + BETA * B( J, I ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( J, I ) = A( I, J ) + B( J, I ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( J, I ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( J, I ) = ALPHA * A( I, J ) + BETA * B( J, I ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M B( J, I ) = ALPHA * A( I, J ) + B( J, I ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, M DO 130 I = 1, N B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, M DO 150 I = 1, N B( I, J ) = BETA * B( I, J ) 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, M DO 170 I = 1, N B( I, J ) = A( J, I ) 170 CONTINUE 180 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N B( I, J ) = A( J, I ) + BETA * B( I, J ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M DO 210 I = 1, N B( I, J ) = A( J, I ) + B( I, J ) 210 CONTINUE 220 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N B( I, J ) = ALPHA * A( J, I ) 230 CONTINUE 240 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M DO 270 I = 1, N B( I, J ) = ALPHA * A( J, I ) + B( I, J ) 270 CONTINUE 280 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 300 J = 1, M DO 290 I = 1, N B( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 320 J = 1, M DO 310 I = 1, N B( I, J ) = BETA * B( I, J ) 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of IMMTADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/Makefile000644 000766 000024 00000010443 11654025546 017772 0ustar00juliestaff000000 000000 ############################################################################ # # Program: PBLAS -- (version 2.0) # # Module: Makefile # # Purpose: PTZBLAS source Makefile # # Creation date: April 1, 1998 # # Modified: # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../../SLmake.inc ############################################################################ # # This is the makefile to create a library for the PBTZBLAS tools. # The files are grouped as follows: # # ALLPBTZ -- Auxiliary routines for Level 1, 2 and 3 PBLAS # # SPBTZ -- Single precision real PBLAS Level F77 tools routines # CPBTZ -- Single precision complex PBLAS Level F77 tools routines # DPBTZ -- Double precision real PBLAS Level F77 tools routines # ZPBTZ -- Double precision complex PBLAS Level F77 tools routines # # The library can be set up to include routines for any combination # of the four precisions. First, modify the ../../../SLmake.inc file # definitions to match your compiler and the options to be used. # Then to create or add to the library, enter make followed by one or # more of the precisions desired. Some examples: # make single # make single complex # make single double complex complex16 # Alternatively, the command # make # without any arguments creates a library of all four precisions. # The library is called # pblas_$(PLAT).a # (see ../../../SLmake.inc for the definition of $(PLAT)). # # To remove the object files after the library is created, enter # make clean # ############################################################################ all: single double complex complex16 APBTZ = \ pxerbla.o IPBTZ = \ immadd.o immdda.o immtadd.o immddat.o SPBTZ = \ svasum.o sset.o scshft.o srshft.o \ svvdot.o smmadd.o smmcadd.o smmtadd.o \ smmtcadd.o smmdda.o smmddac.o smmddat.o \ smmddact.o stzpad.o stzpadcpy.o stzscal.o \ sagemv.o sasymv.o satrmv.o SCPBTZ = \ sasqrtb.o sascal.o CPBTZ = \ scvasum.o cset.o ccshft.o crshft.o \ cvvdotu.o cvvdotc.o cmmadd.o cmmcadd.o \ cmmtadd.o cmmtcadd.o cmmdda.o cmmddac.o \ cmmddat.o cmmddact.o ctzpad.o ctzpadcpy.o \ chescal.o ctzscal.o ctzcnjg.o cagemv.o \ cahemv.o catrmv.o casymv.o csymv.o \ csyr.o csyr2.o DPBTZ = \ dvasum.o dset.o dcshft.o drshft.o \ dvvdot.o dmmadd.o dmmcadd.o dmmtadd.o \ dmmtcadd.o dmmdda.o dmmddac.o dmmddat.o \ dmmddact.o dtzpad.o dtzpadcpy.o dtzscal.o \ dagemv.o dasymv.o datrmv.o DZPBTZ = \ dasqrtb.o dascal.o ZPBTZ = \ dzvasum.o zset.o zcshft.o zrshft.o \ zvvdotu.o zvvdotc.o zmmadd.o zmmcadd.o \ zmmtadd.o zmmtcadd.o zmmdda.o zmmddac.o \ zmmddat.o zmmddact.o ztzpad.o ztzpadcpy.o \ zhescal.o ztzscal.o ztzcnjg.o zagemv.o \ zahemv.o zatrmv.o zasymv.o zsymv.o \ zsyr.o zsyr2.o #--------------------------------------------------------------------------- single: $(APBTZ) $(IPBTZ) $(SPBTZ) $(SCPBTZ) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(APBTZ) $(IPBTZ) $(SPBTZ) $(SCPBTZ) $(RANLIB) ../../../$(SCALAPACKLIB) complex: $(APBTZ) $(IPBTZ) $(CPBTZ) $(SCPBTZ) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(APBTZ) $(IPBTZ) $(CPBTZ) $(SCPBTZ) $(RANLIB) ../../../$(SCALAPACKLIB) double: $(APBTZ) $(IPBTZ) $(DPBTZ) $(DZPBTZ) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(APBTZ) $(IPBTZ) $(DPBTZ) $(DZPBTZ) $(RANLIB) ../../../$(SCALAPACKLIB) complex16: $(APBTZ) $(IPBTZ) $(ZPBTZ) $(DZPBTZ) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(APBTZ) $(IPBTZ) $(ZPBTZ) $(DZPBTZ) $(RANLIB) ../../../$(SCALAPACKLIB) #--------------------------------------------------------------------------- clean: rm -f *.o .f.o : ; $(FC) -c $(FCFLAGS) $*.f .c.o : ; $(CC) -c $(CDEFS) $(CCFLAGS) $*.c scalapack-2.0.2/PBLAS/SRC/PTZBLAS/pxerbla.f000644 000766 000024 00000003641 10363532303 020126 0ustar00juliestaff000000 000000 SUBROUTINE PXERBLA( ICTXT, SRNAME, INFO ) * * -- ScaLAPACK auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER ICTXT, INFO * .. * .. Array Arguments .. CHARACTER*(*) SRNAME * .. * * Purpose * ======= * * PXERBLA is an error handler for the ScaLAPACK routines. It is called * by a ScaLAPACK routine if an input parameter has an invalid value. A * message is printed. Installers may consider modifying this routine in * order to call system-specific exception-handling facilities. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * SRNAME (global input) CHARACTER*(*) * On entry, SRNAME specifies the name of the routine which cal- * ling PXERBLA. * * INFO (global input) INTEGER * On entry, INFO specifies the position of the invalid parame- * ter in the parameter list of the calling routine. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER MYCOL, MYROW, NPCOL, NPROW * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO * .. * .. Executable Statements .. * CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * WRITE( *, FMT = 9999 ) MYROW, MYCOL, SRNAME, INFO * 9999 FORMAT( '{', I5, ',', I5, '}: On entry to ', A, $ ' parameter number ', I4, ' had an illegal value' ) * RETURN * * End of PXERBLA * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/sagemv.f000644 000766 000024 00000017445 10363532303 017762 0ustar00juliestaff000000 000000 SUBROUTINE SAGEMV( TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER INCX, INCY, LDA, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SAGEMV performs one of the matrix-vector operations * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * or * * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an m by n matrix. * * Arguments * ========= * * TRANS (input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ) * * TRANS = 'T' or 't': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * TRANS = 'C' or 'c': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * ALPHA (input) REAL * On entry, ALPHA specifies the real scalar alpha. * * A (input) REAL array of dimension ( LDA, n ). * On entry, A is an array of dimension ( LDA, N ). The leading * m by n part of the array A must contain the matrix of coef- * ficients. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * X (input) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' and at * least ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. Before entry, * the incremented array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) REAL * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) REAL array of dimension at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' and at * least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. Before entry * with BETA non-zero, the incremented array Y must contain the * vector y. On exit, the incremented array Y is overwritten by * the updated vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY REAL ABSX, TALPHA, TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = 1 ELSE IF( M.LT.0 ) THEN INFO = 2 ELSE IF( N.LT.0 ) THEN INFO = 3 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SAGEMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF( LSAME( TRANS, 'N' ) ) THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := abs( beta*y ). * IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 20, I = 1, LENY Y( I ) = ABS( Y( I ) ) 20 CONTINUE ELSE DO 30, I = 1, LENY Y( I ) = ABS( BETA * Y( I ) ) 30 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 40, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 40 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 50, I = 1, LENY Y( IY ) = ABS( Y( IY ) ) IY = IY + INCY 50 CONTINUE ELSE DO 60, I = 1, LENY Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 60 CONTINUE END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( TRANS, 'N' ) ) THEN * * Form y := abs( alpha ) * abs( A ) * abs( x ) + y. * JX = KX IF( INCY.EQ.1 ) THEN DO 80, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX DO 70, I = 1, M Y( I ) = Y( I ) + TEMP * ABS( A( I, J ) ) 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE ELSE DO 100, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY DO 90, I = 1, M Y( IY ) = Y( IY ) + TEMP * ABS( A( I, J ) ) IY = IY + INCY 90 CONTINUE END IF JX = JX + INCX 100 CONTINUE END IF * ELSE * * Form y := abs( alpha ) * abs( A' ) * abs( x ) + y. * JY = KY IF( INCX.EQ.1 ) THEN DO 120, J = 1, N TEMP = ZERO DO 110, I = 1, M TEMP = TEMP + ABS( A( I, J ) * X( I ) ) 110 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 120 CONTINUE ELSE DO 140, J = 1, N TEMP = ZERO IX = KX DO 130, I = 1, M TEMP = TEMP + ABS( A( I, J ) * X( IX ) ) IX = IX + INCX 130 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 140 CONTINUE END IF END IF * RETURN * * End of SAGEMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/sascal.f000644 000766 000024 00000007775 10363532303 017753 0ustar00juliestaff000000 000000 SUBROUTINE SASCAL( N, ALPHA, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N REAL ALPHA * .. * .. Array Arguments .. REAL X( * ) * .. * * Purpose * ======= * * SASCAL performs the following operation: * * x := abs( alpha ) * abs( x ), * * where alpha is a scalar and x is an n vector. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ALPHA (input) REAL * On entry, ALPHA specifies the scalar alpha. * * X (input/output) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. On exit, entries of the * incremented array X are mutiplied by alpha in absolute value. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, M, MP1 * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MOD * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( N.LT.0 ) THEN INFO = 1 ELSE IF( INCX.EQ.0 ) THEN INFO = 4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SASCAL', INFO ) RETURN END IF * * Quick return if possible. * IF( N.LE.0 ) $ RETURN * * Form x := abs( alpha ) * abs( x ) * IF( INCX.EQ.1 ) $ GO TO 40 * * code for increments not equal to 1 * * Set up the start point in X. * IF( INCX.GT.0 ) THEN IX = 1 ELSE IX = 1 - ( N - 1 ) * INCX END IF * IF( ALPHA.EQ.ZERO ) THEN DO 10 I = 1, N X( IX ) = ZERO IX = IX + INCX 10 CONTINUE ELSE IF( ALPHA.EQ.ONE ) THEN DO 20 I = 1, N X( IX ) = ABS( X( IX ) ) IX = IX + INCX 20 CONTINUE ELSE DO 30 I = 1, N X( IX ) = ABS( ALPHA * X( IX ) ) IX = IX + INCX 30 CONTINUE END IF * RETURN * * code for increment equal to 1 * * clean-up loop * 40 M = MOD( N, 4 ) * IF( M.EQ.0 ) $ GO TO 80 * IF( ALPHA.EQ.ZERO ) THEN DO 50 I = 1, M X( I ) = ZERO 50 CONTINUE ELSE IF( ALPHA.EQ.ONE ) THEN DO 60 I = 1, M X( I ) = ABS( X( I ) ) 60 CONTINUE ELSE DO 70 I = 1, M X( I ) = ABS( ALPHA * X( I ) ) 70 CONTINUE END IF * IF( N.LT.4 ) $ RETURN * 80 MP1 = M + 1 * IF( ALPHA.EQ.ZERO ) THEN DO 90 I = MP1, N, 4 X( I ) = ZERO X( I + 1 ) = ZERO X( I + 2 ) = ZERO X( I + 3 ) = ZERO 90 CONTINUE ELSE IF( ALPHA.EQ.ONE ) THEN DO 100 I = MP1, N, 4 X( I ) = ABS( X( I ) ) X( I + 1 ) = ABS( X( I + 1 ) ) X( I + 2 ) = ABS( X( I + 2 ) ) X( I + 3 ) = ABS( X( I + 3 ) ) 100 CONTINUE ELSE DO 110 I = MP1, N, 4 X( I ) = ABS( ALPHA * X( I ) ) X( I + 1 ) = ABS( ALPHA * X( I + 1 ) ) X( I + 2 ) = ABS( ALPHA * X( I + 2 ) ) X( I + 3 ) = ABS( ALPHA * X( I + 3 ) ) 110 CONTINUE END IF * RETURN * * End of SASCAL * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/sasqrtb.f000644 000766 000024 00000002101 10363532303 020136 0ustar00juliestaff000000 000000 SUBROUTINE SASQRTB( A, B, C ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. REAL A, B, C * .. * * Purpose * ======= * * SASQRTB computes c := a * sqrt( b ) where a, b and c are scalars. * * Arguments * ========= * * A (input) REAL * On entry, A specifies the scalar a. * * B (input) REAL * On entry, B specifies the scalar b. * * C (output) REAL * On entry, C specifies the scalar c. On exit, c is overwritten * by the product of a and the square root of b. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC SQRT * .. * .. Executable Statements .. * C = A * SQRT( B ) * RETURN * * End of SASQRTB * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/sasymv.f000644 000766 000024 00000020324 10363532303 020010 0ustar00juliestaff000000 000000 SUBROUTINE SASYMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SASYMV performs the following matrix-vector operation * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n symmetric matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower triangu- * lar part of the array A is to be referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A is * to be referenced. * UPLO = 'L' or 'l' Only the lower triangular part of A is * to be referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) REAL * On entry, ALPHA specifies the real scalar alpha. * * A (input) REAL array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. When UPLO = 'L' or 'l', the leading n by n part of * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) REAL * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY REAL TALPHA, TEMP0, TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SASYMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := abs( beta * y ). * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 40 CONTINUE END IF END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN DO 60, J = 1, N TEMP1 = TALPHA * ABS( X( J ) ) TEMP2 = ZERO DO 50, I = 1, J - 1 TEMP0 = ABS( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * ABS( X( I ) ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1 * ABS( A( J, J ) ) + $ ALPHA * TEMP2 * 60 CONTINUE * ELSE * JX = KX JY = KY * DO 80, J = 1, N TEMP1 = TALPHA * ABS( X( JX ) ) TEMP2 = ZERO IX = KX IY = KY * DO 70, I = 1, J - 1 TEMP0 = ABS( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * ABS( X( IX ) ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1 * ABS( A( J, J ) ) + $ ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 80 CONTINUE * END IF * ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN * DO 100, J = 1, N * TEMP1 = TALPHA * ABS( X( J ) ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1 * ABS( A( J, J ) ) * DO 90, I = J + 1, N TEMP0 = ABS( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * ABS( X( I ) ) * 90 CONTINUE * Y( J ) = Y( J ) + ALPHA * TEMP2 * 100 CONTINUE * ELSE * JX = KX JY = KY * DO 120, J = 1, N TEMP1 = TALPHA * ABS( X( JX ) ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1 * ABS( A( J, J ) ) IX = JX IY = JY * DO 110, I = J + 1, N * IX = IX + INCX IY = IY + INCY TEMP0 = ABS( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * ABS( X( IX ) ) * 110 CONTINUE * Y( JY ) = Y( JY ) + ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 120 CONTINUE * END IF * END IF * RETURN * * End of SASYMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/satrmv.f000644 000766 000024 00000030145 10363532303 020004 0ustar00juliestaff000000 000000 SUBROUTINE SATRMV( UPLO, TRANS, DIAG, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER INCX, INCY, LDA, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SATRMV performs one of the matrix-vector operations * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * or * * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n unit or non-unit, upper or lower triangular * matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * TRANS (input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ) * * TRANS = 'T' or 't': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * TRANS = 'C' or 'c': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * DIAG (input) CHARACTER*1 * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit triangular. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) REAL * On entry, ALPHA specifies the real scalar alpha. * * A (input) REAL array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the matrix A and * the strictly lower triangular part of A is not referenced. * When UPLO = 'L' or 'l', the leading n by n part of the array * A must contain the lower triangular part of the matrix A and * the strictly upper trapezoidal part of A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of A * are not referenced either, but are assumed to be unity. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) REAL * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY LOGICAL NOUNIT REAL ABSX, TALPHA, TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 7 ELSE IF( INCX.EQ.0 )THEN INFO = 9 ELSE IF( INCY.EQ.0 ) THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SATRMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := abs( beta*y ). * IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 20, I = 1, N Y( I ) = ABS( Y( I ) ) 20 CONTINUE ELSE DO 30, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 30 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 40, I = 1, N Y( IY ) = ZERO IY = IY + INCY 40 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 50, I = 1, N Y( IY ) = ABS( Y( IY ) ) IY = IY + INCY 50 CONTINUE ELSE DO 60, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 60 CONTINUE END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( TRANS, 'N' ) )THEN * * Form y := abs( alpha ) * abs( A ) * abs( x ) + y. * IF( LSAME( UPLO, 'U' ) )THEN JX = KX IF( INCY.EQ.1 ) THEN DO 80, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX DO 70, I = 1, J - 1 Y( I ) = Y( I ) + TEMP * ABS( A( I, J ) ) 70 CONTINUE * IF( NOUNIT ) THEN Y( J ) = Y( J ) + TEMP * ABS( A( J, J ) ) ELSE Y( J ) = Y( J ) + TEMP END IF END IF JX = JX + INCX 80 CONTINUE * ELSE * DO 100, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY DO 90, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP * ABS( A( I, J ) ) IY = IY + INCY 90 CONTINUE * IF( NOUNIT ) THEN Y( IY ) = Y( IY ) + TEMP * ABS( A( J, J ) ) ELSE Y( IY ) = Y( IY ) + TEMP END IF END IF JX = JX + INCX 100 CONTINUE * END IF * ELSE * JX = KX IF( INCY.EQ.1 ) THEN DO 120, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN * TEMP = TALPHA * ABSX * IF( NOUNIT ) THEN Y( J ) = Y( J ) + TEMP * ABS( A( J, J ) ) ELSE Y( J ) = Y( J ) + TEMP END IF * DO 110, I = J + 1, N Y( I ) = Y( I ) + TEMP * ABS( A( I, J ) ) 110 CONTINUE END IF JX = JX + INCX 120 CONTINUE * ELSE * DO 140, J = 1, N ABSX = ABS( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY + ( J - 1 ) * INCY * IF( NOUNIT ) THEN Y( IY ) = Y( IY ) + TEMP * ABS( A( J, J ) ) ELSE Y( IY ) = Y( IY ) + TEMP END IF * DO 130, I = J + 1, N IY = IY + INCY Y( IY ) = Y( IY ) + TEMP * ABS( A( I, J ) ) 130 CONTINUE END IF JX = JX + INCX 140 CONTINUE * END IF * END IF * ELSE * * Form y := abs( alpha ) * abs( A' ) * abs( x ) + y. * IF( LSAME( UPLO, 'U' ) )THEN JY = KY IF( INCX.EQ.1 ) THEN DO 160, J = 1, N * TEMP = ZERO * DO 150, I = 1, J - 1 TEMP = TEMP + ABS( A( I, J ) * X( I ) ) 150 CONTINUE * IF( NOUNIT ) THEN TEMP = TEMP + ABS( A( J, J ) * X( J ) ) ELSE TEMP = TEMP + ABS( X( J ) ) END IF * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 160 CONTINUE * ELSE * DO 180, J = 1, N TEMP = ZERO IX = KX DO 170, I = 1, J - 1 TEMP = TEMP + ABS( A( I, J ) * X( IX ) ) IX = IX + INCX 170 CONTINUE * IF( NOUNIT ) THEN TEMP = TEMP + ABS( A( J, J ) * X( IX ) ) ELSE TEMP = TEMP + ABS( X( IX ) ) END IF * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 180 CONTINUE * END IF * ELSE * JY = KY IF( INCX.EQ.1 ) THEN * DO 200, J = 1, N * IF( NOUNIT ) THEN TEMP = ABS( A( J, J ) * X( J ) ) ELSE TEMP = ABS( X( J ) ) END IF * DO 190, I = J + 1, N TEMP = TEMP + ABS( A( I, J ) * X( I ) ) 190 CONTINUE * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 200 CONTINUE * ELSE * DO 220, J = 1, N * IX = KX + ( J - 1 ) * INCX * IF( NOUNIT ) THEN TEMP = ABS( A( J, J ) * X( IX ) ) ELSE TEMP = ABS( X( IX ) ) END IF * DO 210, I = J + 1, N IX = IX + INCX TEMP = TEMP + ABS( A( I, J ) * X( IX ) ) 210 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 220 CONTINUE END IF END IF * END IF * RETURN * * End of SATRMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/scshft.f000644 000766 000024 00000005526 10363532303 017767 0ustar00juliestaff000000 000000 SUBROUTINE SCSHFT( M, N, OFFSET, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, M, N, OFFSET * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * SCSHFT shifts columns of an m by n array A by OFFSET. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A. M must be at * least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A to be * shifted. N must be at least zero. * * OFFSET (local input) INTEGER * On entry, OFFSET specifies the offset by which the columns of * A should be shifted. OFFSET can be positive or negative (see * below for further details). When OFFSET is positive, the co- * lumns are shifted to the right. When OFFSET is negative, the * columns of A are shifted to the left. * * A (local input/local output) REAL array * On entry, A is an array of dimension ( LDA, N+ABS(OFFSET) ). * On exit, A contains the shifted array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Further Details * =============== * * N=3 OFFSET=6 -OFFSET=6 N=3 * ------------------- ------------------- * | 1 2 3 4 5 6 7 8 9 | M | 1 2 3 4 5 6 7 8 9 | * ------------------- ------------------- * V V * ------------------- ------------------- * | 1 2 3 4 5 6 1 2 3 | M | 7 8 9 4 5 6 7 8 9 | * ------------------- ------------------- * OFFSET >= 0 OFFSET <= 0 * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( ( OFFSET.EQ.0 ).OR.( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * IF( OFFSET.GT.0 ) THEN DO 20 J = N, 1, -1 DO 10 I = 1, M A( I, J+OFFSET ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = A( I, J-OFFSET ) 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of SCSHFT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/scvasum.f000644 000766 000024 00000002710 10363532303 020146 0ustar00juliestaff000000 000000 SUBROUTINE SCVASUM( N, ASUM, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N REAL ASUM * .. * .. Array Arguments .. COMPLEX X( * ) * .. * * Purpose * ======= * * SCVASUM returns the sum of absolute values of the entries of a vector * x. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ASUM (output) COMPLEX * On exit, ASUM specifies the sum of absolute values. * * X (input) COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. REAL SCASUM EXTERNAL SCASUM * .. * .. Executable Statements .. * ASUM = SCASUM( N, X, INCX ) * RETURN * * End of SCVASUM * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/smmadd.f000644 000766 000024 00000010417 10363532303 017735 0ustar00juliestaff000000 000000 SUBROUTINE SMMADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SMMADD performs the following operation: * * B := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) REAL * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) REAL array * On entry, B is an array of dimension ( LDB, N ). On exit, the * leading m by n part of A has been added to the leading m by n * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SSCAL * .. * .. Executable Statements .. * IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL SCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) * DO 10 I = 1, M * B( I, J ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL SAXPY( M, ONE, A( 1, J ), 1, B( 1, J ), 1 ) * DO 50 I = 1, M * B( I, J ) = A( I, J ) + B( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL SAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) * DO 110 I = 1, M * B( I, J ) = ALPHA * A( I, J ) + B( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, N CALL SSCAL( M, BETA, B( 1, J ), 1 ) * DO 150 I = 1, M * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of SMMADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/smmcadd.f000644 000766 000024 00000010422 10363532303 020074 0ustar00juliestaff000000 000000 SUBROUTINE SMMCADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SMMCADD performs the following operation: * * B := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) REAL * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) REAL array * On entry, B is an array of dimension ( LDB, N ). On exit, the * leading m by n part of A has been added to the leading m by n * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SSCAL * .. * .. Executable Statements .. * IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL SCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) * DO 10 I = 1, M * B( I, J ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL SAXPY( M, ONE, A( 1, J ), 1, B( 1, J ), 1 ) * DO 50 I = 1, M * B( I, J ) = A( I, J ) + B( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL SAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) * DO 110 I = 1, M * B( I, J ) = ALPHA * A( I, J ) + B( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, N CALL SSCAL( M, BETA, B( 1, J ), 1 ) * DO 150 I = 1, M * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of SMMCADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/smmdda.f000644 000766 000024 00000010423 10363532303 017732 0ustar00juliestaff000000 000000 SUBROUTINE SMMDDA( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SMMDDA performs the following operation: * * A := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) REAL array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading m by n part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) REAL * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) REAL array * On entry, B is an array of dimension ( LDB, N ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SSCAL * .. * .. Executable Statements .. * IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL SCOPY( M, B( 1, J ), 1, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( I, J ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL SAXPY( M, ONE, B( 1, J ), 1, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( I, J ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( I, J ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL SAXPY( M, BETA, B( 1, J ), 1, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( I, J ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL SSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of SMMDDA * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/smmddac.f000644 000766 000024 00000010426 10363532303 020100 0ustar00juliestaff000000 000000 SUBROUTINE SMMDDAC( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SMMDDAC performs the following operation: * * A := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) REAL array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading m by n part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) REAL * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) REAL array * On entry, B is an array of dimension ( LDB, N ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SSCAL * .. * .. Executable Statements .. * IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL SCOPY( M, B( 1, J ), 1, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( I, J ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL SAXPY( M, ONE, B( 1, J ), 1, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( I, J ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( I, J ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL SAXPY( M, BETA, B( 1, J ), 1, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( I, J ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL SSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of SMMDDAC * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/smmddact.f000644 000766 000024 00000015073 10363532303 020267 0ustar00juliestaff000000 000000 SUBROUTINE SMMDDACT( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SMMDDACT performs the following operation: * * A := alpha * A + beta * B', * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) REAL array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading n by m part of B has been added into the leading m by * n part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) REAL * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) REAL array * On entry, B is an array of dimension ( LDB, M ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL SCOPY( M, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( J, I ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( J, I ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL SAXPY( M, ONE, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( J, I ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( J, I ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( J, I ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL SAXPY( M, BETA, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( J, I ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL SSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 180 J = 1, M CALL SCOPY( N, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 170 I = 1, N * A( J, I ) = B( I, J ) * 170 CONTINUE 180 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N A( J, I ) = B( I, J ) + ALPHA * A( J, I ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL SAXPY( N, ONE, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 210 I = 1, N * A( J, I ) = B( I, J ) + A( J, I ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N A( J, I ) = BETA * B( I, J ) 230 CONTINUE 240 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N A( J, I ) = BETA * B( I, J ) + ALPHA * A( J, I ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL SAXPY( N, BETA, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 270 I = 1, N * A( J, I ) = BETA * B( I, J ) + A( J, I ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 300 J = 1, N DO 290 I = 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 320 J = 1, N CALL SSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 310 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of SMMDDACT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/smmddat.f000644 000766 000024 00000015066 10363532303 020126 0ustar00juliestaff000000 000000 SUBROUTINE SMMDDAT( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SMMDDAT performs the following operation: * * A := alpha * A + beta * B', * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) REAL array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading n by m part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) REAL * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) REAL array * On entry, B is an array of dimension ( LDB, M ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL SCOPY( M, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( J, I ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( J, I ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL SAXPY( M, ONE, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( J, I ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( J, I ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( J, I ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL SAXPY( M, BETA, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( J, I ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL SSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 180 J = 1, M CALL SCOPY( N, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 170 I = 1, N * A( J, I ) = B( I, J ) * 170 CONTINUE 180 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N A( J, I ) = B( I, J ) + ALPHA * A( J, I ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL SAXPY( N, ONE, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 210 I = 1, N * A( J, I ) = B( I, J ) + A( J, I ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N A( J, I ) = BETA * B( I, J ) 230 CONTINUE 240 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N A( J, I ) = BETA * B( I, J ) + ALPHA * A( J, I ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL SAXPY( N, BETA, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 270 I = 1, N * A( J, I ) = BETA * B( I, J ) + A( J, I ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 300 J = 1, N DO 290 I = 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 320 J = 1, N CALL SSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 310 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of SMMDDAT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/smmtadd.f000644 000766 000024 00000015056 10363532303 020125 0ustar00juliestaff000000 000000 SUBROUTINE SMMTADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SMMTADD performs the following operation: * * B := alpha * A' + beta * B, * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) REAL * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) REAL array * On entry, B is an array of dimension ( LDB, M ). On exit, the * leading m by n part of A has been added to the leading n by m * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL SCOPY( M, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 10 I = 1, M * B( J, I ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( J, I ) = A( I, J ) + BETA * B( J, I ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL SAXPY( M, ONE, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 50 I = 1, M * B( J, I ) = A( I, J ) + B( J, I ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( J, I ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( J, I ) = ALPHA * A( I, J ) + BETA * B( J, I ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL SAXPY( M, ALPHA, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 110 I = 1, M * B( J, I ) = ALPHA * A( I, J ) + B( J, I ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, M DO 130 I = 1, N B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, M CALL SSCAL( N, BETA, B( 1, J ), 1 ) * DO 150 I = 1, N * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, M CALL SCOPY( N, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 170 I = 1, N * B( I, J ) = A( J, I ) * 170 CONTINUE 180 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N B( I, J ) = A( J, I ) + BETA * B( I, J ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL SAXPY( N, ONE, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 210 I = 1, N * B( I, J ) = A( J, I ) + B( I, J ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N B( I, J ) = ALPHA * A( J, I ) 230 CONTINUE 240 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL SAXPY( N, ALPHA, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 270 I = 1, N * B( I, J ) = ALPHA * A( J, I ) + B( I, J ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 300 J = 1, M DO 290 I = 1, N B( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 320 J = 1, M CALL SSCAL( N, BETA, B( 1, J ), 1 ) * DO 310 I = 1, N * B( I, J ) = BETA * B( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of SMMTADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/smmtcadd.f000644 000766 000024 00000015061 10363532303 020264 0ustar00juliestaff000000 000000 SUBROUTINE SMMTCADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SMMTCADD performs the following operation: * * B := alpha * A' + beta * B, * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) REAL * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) REAL array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) REAL * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) REAL array * On entry, B is an array of dimension ( LDB, M ). On exit, the * leading m by n part of A has been added to the leading n by m * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL SCOPY( M, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 10 I = 1, M * B( J, I ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( J, I ) = A( I, J ) + BETA * B( J, I ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL SAXPY( M, ONE, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 50 I = 1, M * B( J, I ) = A( I, J ) + B( J, I ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( J, I ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( J, I ) = ALPHA * A( I, J ) + BETA * B( J, I ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL SAXPY( M, ALPHA, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 110 I = 1, M * B( J, I ) = ALPHA * A( I, J ) + B( J, I ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, M DO 130 I = 1, N B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, M CALL SSCAL( N, BETA, B( 1, J ), 1 ) * DO 150 I = 1, N * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, M CALL SCOPY( N, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 170 I = 1, N * B( I, J ) = A( J, I ) * 170 CONTINUE 180 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N B( I, J ) = A( J, I ) + BETA * B( I, J ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL SAXPY( N, ONE, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 210 I = 1, N * B( I, J ) = A( J, I ) + B( I, J ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N B( I, J ) = ALPHA * A( J, I ) 230 CONTINUE 240 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL SAXPY( N, ALPHA, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 270 I = 1, N * B( I, J ) = ALPHA * A( J, I ) + B( I, J ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 300 J = 1, M DO 290 I = 1, N B( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 320 J = 1, M CALL SSCAL( N, BETA, B( 1, J ), 1 ) * DO 310 I = 1, N * B( I, J ) = BETA * B( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of SMMTCADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/srshft.f000644 000766 000024 00000006165 10363532303 020006 0ustar00juliestaff000000 000000 SUBROUTINE SRSHFT( M, N, OFFSET, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, M, N, OFFSET * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * SRSHFT shifts rows of an m by n array A by OFFSET. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A to be shifted. * M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A. N must be * at least zero. * * OFFSET (local input) INTEGER * On entry, OFFSET specifies the offset by which the rows of * A should be shifted. OFFSET can be positive or negative (see * below for further details). When OFFSET is positive, the rows * are shifted to the bottom. When OFFSET is negative, the rows * of A are shifted to the top. * * A (local input/local output) REAL array * On entry, A is an array of dimension ( LDA, N ). On exit, A * contains the shifted array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M+ABS(OFFSET) ). * * Further Details * =============== * * N N N N * --- --- --- --- * | 1 | | 1 | | 1 | | 7 | * | 2 | M = 3 | 2 | | 2 | M = 3 | 8 | * | 3 | | 3 | | 3 | | 9 | * | 4 | | 4 | | 4 | | 4 | * | 5 | becomes | 5 | | 5 | becomes | 5 | * | 6 | | 6 | | 6 | | 6 | * | 7 | | 1 | | 7 | | 7 | * | 8 | OFFSET = 6 | 2 | | 8 | OFFSET = -6 | 8 | * | 9 | | 3 | | 9 | | 9 | * --- --- --- --- * OFFSET >= 0 OFFSET <= 0 * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( ( OFFSET.EQ.0 ).OR.( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * IF( OFFSET.GT.0 ) THEN DO 20 J = 1, N DO 10 I = M, 1, -1 A( I+OFFSET, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = A( I-OFFSET, J ) 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of SRSHFT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/sset.f000644 000766 000024 00000005165 10363532303 017452 0ustar00juliestaff000000 000000 SUBROUTINE SSET( N, ALPHA, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N REAL ALPHA * .. * .. Array Arguments .. REAL X( * ) * .. * * Purpose * ======= * * SSET sets the entries of an n vector x to the scalar alpha. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ALPHA (input) REAL * On entry, ALPHA specifies the scalar alpha. * * X (input/output) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. On exit, entries of the * incremented array X are set to alpha. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, INFO, IX, M, MP1 * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( N.LT.0 ) THEN INFO = 1 ELSE IF( INCX.EQ.0 ) THEN INFO = 4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSET', INFO ) RETURN END IF * * Quick return if possible. * IF( N.LE.0 ) $ RETURN * * Form x := alpha * IF( INCX.EQ.1 ) $ GO TO 20 * * code for increments not equal to 1 * * Set up the start point in X. * IF( INCX.GT.0 ) THEN IX = 1 ELSE IX = 1 - ( N - 1 ) * INCX END IF * DO 10 I = 1, N X( IX ) = ALPHA IX = IX + INCX 10 CONTINUE * RETURN * * code for increment equal to 1 * * clean-up loop * 20 M = MOD( N, 4 ) * IF( M.EQ.0 ) $ GO TO 40 * DO 30 I = 1, M X( I ) = ALPHA 30 CONTINUE IF( N.LT.4 ) $ RETURN * 40 MP1 = M + 1 DO 50 I = MP1, N, 4 X( I ) = ALPHA X( I + 1 ) = ALPHA X( I + 2 ) = ALPHA X( I + 3 ) = ALPHA 50 CONTINUE * RETURN * * End of SSET * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/stzpad.f000644 000766 000024 00000020730 10363532303 017774 0ustar00juliestaff000000 000000 SUBROUTINE STZPAD( UPLO, HERM, M, N, IOFFD, ALPHA, BETA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 HERM, UPLO INTEGER IOFFD, LDA, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * STZPAD initializes a two-dimensional array A to beta on the diagonal * specified by IOFFD or zeros the imaginary part of those diagonals and * set the offdiagonals to alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be set as follows: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of A is not changed, * = 'D' or 'd': diagonal specified by IOFFD is set; the * rest of the array A is unchanged, * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of A is not changed, * Otherwise: All of the array A is set. * * HERM (input) CHARACTER*1 * On entry, HERM specifies what should be done to the diagonals * as follows. When UPLO is 'L', 'l', 'D', 'd', 'U' or 'u' and * HERM is 'Z' or 'z', the imaginary part of the diagonals is * set to zero. Otherwise, the diagonals are set to beta. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) REAL * On entry, ALPHA specifies the scalar alpha, i.e., the value * to which the offdiagonal entries of the array A determined by * UPLO and IOFFD are set. * * BETA (input) REAL * On entry, BETA specifies the scalar beta, i.e., the value to * which the diagonal entries specified by IOFFD of the array A * are set. BETA is not referenced when UPLO is 'L', 'l', 'U' or * 'u' and HERM is 'Z'. * * A (input/output) REAL array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U', the leading m by n part of the array A must * contain the upper trapezoidal part of the matrix to be set as * specified by IOFFD, and the strictly lower trapezoidal part * of A is not referenced; When UPLO = 'L', the leading m by n * part of the array A must contain the lower trapezoidal part * of the matrix to be set as specified by IOFFD, and the * strictly upper trapezoidal part of A is not referenced. On * exit, the entries of the trapezoid part of A determined by * UPLO, HERM and IOFFD are set. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals and set the strictly lower triangular part of the * array to ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA 10 CONTINUE 20 CONTINUE * IF( LSAME( HERM, 'Z' ) ) THEN DO 40 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD DO 30 I = JTMP + 1, M A( I, J ) = ALPHA 30 CONTINUE 40 CONTINUE ELSE DO 60 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = BETA DO 50 I = JTMP + 1, M A( I, J ) = ALPHA 50 CONTINUE 60 CONTINUE END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals and set the strictly upper triangular part of the * array to ALPHA. * MN = MIN( M - IOFFD, N ) IF( LSAME( HERM, 'Z' ) ) THEN DO 80 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 70 I = 1, JTMP - 1 A( I, J ) = ALPHA 70 CONTINUE 80 CONTINUE ELSE DO 100 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 90 I = 1, JTMP - 1 A( I, J ) = ALPHA 90 CONTINUE A( JTMP, J ) = BETA 100 CONTINUE END IF DO 120 J = MAX( 0, MN ) + 1, N DO 110 I = 1, M A( I, J ) = ALPHA 110 CONTINUE 120 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Set the diagonal to BETA * IF( .NOT.( LSAME( HERM, 'Z' ) ) ) THEN IF( ( IOFFD.LT.M ).AND.( IOFFD.GT.-N ) ) THEN DO 130 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 130 CONTINUE END IF END IF * ELSE * * Set the diagonals to BETA and the offdiagonals to ALPHA. * DO 150 J = 1, N DO 140 I = 1, M A( I, J ) = ALPHA 140 CONTINUE 150 CONTINUE IF( ALPHA.NE.BETA .AND. IOFFD.LT.M .AND. IOFFD.GT.-N ) THEN DO 160 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 160 CONTINUE END IF * END IF * RETURN * * End of STZPAD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/stzpadcpy.f000644 000766 000024 00000017717 10363532303 020523 0ustar00juliestaff000000 000000 SUBROUTINE STZPADCPY( UPLO, DIAG, M, N, IOFFD, A, LDA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, UPLO INTEGER IOFFD, LDA, LDB, M, N * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * STZPADCPY copies an array A into an array B. The unchanged part of B * is padded with zeros. The diagonal of B specified by IOFFD may be set * to ones. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be copied as follows: * = 'L' or 'l': Lower triangular part is copied; the * strictly upper triangular part of B is * padded with zeros, * = 'U' or 'u': Upper triangular part is copied; the * strictly lower triangular part of B is * padded with zeros. * * DIAG (input) CHARACTER*1 * On entry, DIAG specifies whether or not the diagonal of B is * to be set to ones or not as follows: * * DIAG = 'N' or 'n': the diagonals of A are copied into the * diagonals of B, otherwise the diagonals of B are set to ones. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * A (input) REAL array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U', the leading m by n part of the array A must * contain the upper trapezoidal part of the matrix to be copied * as specified by IOFFD, UPLO and DIAG, and the strictly lower * trapezoidal part of A is not referenced; When UPLO = 'L',the * leading m by n part of the array A must contain the lower * trapezoidal part of the matrix to be copied as specified by * IOFFD, UPLO and DIAG and the strictly upper trapezoidal part * of A is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * B (output) REAL array * On entry, B is an array of dimension (LDB,N). On exit, this * array contains the padded copy of A as specified by IOFFD, * UPLO and DIAG. * * LDB (input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, ITMP, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M B( I, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE * JTMP = MIN( M - IOFFD, N ) * IF( LSAME( DIAG, 'N' ) ) THEN DO 50 J = MN + 1, JTMP ITMP = J + IOFFD DO 30 I = 1, ITMP - 1 B( I, J ) = ZERO 30 CONTINUE DO 40 I = ITMP, M B( I, J ) = A( I, J ) 40 CONTINUE 50 CONTINUE ELSE DO 80 J = MN + 1, JTMP ITMP = J + IOFFD DO 60 I = 1, ITMP - 1 B( I, J ) = ZERO 60 CONTINUE B( ITMP, J ) = ONE DO 70 I = ITMP + 1, M B( I, J ) = A( I, J ) 70 CONTINUE 80 CONTINUE END IF * DO 100 J = JTMP + 1, N DO 90 I = 1, M B( I, J ) = ZERO 90 CONTINUE 100 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * JTMP = MAX( 0, -IOFFD ) * DO 120 J = 1, JTMP DO 110 I = 1, M B( I, J ) = ZERO 110 CONTINUE 120 CONTINUE * MN = MIN( M - IOFFD, N ) * IF( LSAME( DIAG, 'N' ) ) THEN DO 150 J = JTMP + 1, MN ITMP = J + IOFFD DO 130 I = 1, ITMP B( I, J ) = A( I, J ) 130 CONTINUE DO 140 I = ITMP + 1, M B( I, J ) = ZERO 140 CONTINUE 150 CONTINUE ELSE DO 180 J = JTMP + 1, MN ITMP = J + IOFFD DO 160 I = 1, ITMP - 1 B( I, J ) = A( I, J ) 160 CONTINUE B( ITMP, J ) = ONE DO 170 I = ITMP + 1, M B( I, J ) = ZERO 170 CONTINUE 180 CONTINUE END IF * DO 200 J = MAX( 0, MN ) + 1, N DO 190 I = 1, M B( I, J ) = A( I, J ) 190 CONTINUE 200 CONTINUE * ELSE * DO 220 J = 1, N DO 210 I = 1, M B( I, J ) = A( I, J ) 210 CONTINUE 220 CONTINUE * END IF * RETURN * * End of STZPADCPY * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/stzscal.f000644 000766 000024 00000015205 10363532303 020153 0ustar00juliestaff000000 000000 SUBROUTINE STZSCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N REAL ALPHA * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * STZSCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) REAL * On entry, ALPHA specifies the scalar alpha, i.e., the value * by which the diagonal and offdiagonal entries of the array A * as specified by UPLO and IOFFD are scaled. * * A (input/output) REAL array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER J, JTMP, MN * .. * .. External Subroutines .. EXTERNAL SSCAL, STZPAD * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ).OR.( ALPHA.EQ.ONE ) ) $ RETURN * * Start the operations * IF( ALPHA.EQ.ZERO ) THEN CALL STZPAD( UPLO, 'N', M, N, IOFFD, ZERO, ZERO, A, LDA ) RETURN END IF * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 10 J = 1, MIN( MN, N ) CALL SSCAL( M, ALPHA, A( 1, J ), 1 ) 10 CONTINUE DO 20 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD IF( M.GE.JTMP ) $ CALL SSCAL( M-JTMP+1, ALPHA, A( JTMP, J ), 1 ) 20 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 30 J = MAX( 0, -IOFFD ) + 1, MN CALL SSCAL( J + IOFFD, ALPHA, A( 1, J ), 1 ) 30 CONTINUE DO 40 J = MAX( 0, MN ) + 1, N CALL SSCAL( M, ALPHA, A( 1, J ), 1 ) 40 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 50 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 50 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 60 J = 1, N CALL SSCAL( M, ALPHA, A( 1, J ), 1 ) 60 CONTINUE * END IF * RETURN * * End of STZSCAL * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/svasum.f000644 000766 000024 00000002675 10363532303 020015 0ustar00juliestaff000000 000000 SUBROUTINE SVASUM( N, ASUM, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N REAL ASUM * .. * .. Array Arguments .. REAL X( * ) * .. * * Purpose * ======= * * SVASUM returns the sum of absolute values of the entries of a vector * x. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ASUM (output) REAL * On exit, ASUM specifies the sum of absolute values. * * X (input) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. REAL SASUM EXTERNAL SASUM * .. * .. Executable Statements .. * ASUM = SASUM( N, X, INCX ) * RETURN * * End of SVASUM * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/svvdot.f000644 000766 000024 00000003533 10363532303 020016 0ustar00juliestaff000000 000000 SUBROUTINE SVVDOT( N, DOT, X, INCX, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N REAL DOT * .. * .. Array Arguments .. REAL X( * ), Y( * ) * .. * * Purpose * ======= * * SVVDOT computes the following dot product: * * dot = dot + x**T * y, * * where x and y are n vectors. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vectors x and y. N * must be at least zero. * * DOT (input/output) REAL * On exit, DOT is updated with the dot product of the vectors x * and y. * * X (input) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (input) REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented * array Y must contain the vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. REAL SDOT EXTERNAL SDOT * .. * .. Executable Statements .. * DOT = DOT + SDOT( N, X, INCX, Y, INCY ) * RETURN * * End of SVVDOT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zagemv.f000644 000766 000024 00000020260 10363532303 017756 0ustar00juliestaff000000 000000 SUBROUTINE ZAGEMV( TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER INCX, INCY, LDA, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION Y( * ) COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZAGEMV performs one of the matrix-vector operations * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * or * * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ), * * or * * y := abs( alpha )*abs( conjg( A' ) )*abs( x ) + abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an m by n matrix. * * Arguments * ========= * * TRANS (input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ) * * TRANS = 'T' or 't': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * TRANS = 'C' or 'c': * y := abs( alpha )*abs( conjg( A' ) )*abs( x ) + * abs( beta*y ) * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX*16 array of dimension ( LDA, n ). * On entry, A is an array of dimension ( LDA, N ). The leading * m by n part of the array A must contain the matrix of coef- * ficients. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' and at * least ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. Before entry, * the incremented array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) DOUBLE PRECISION * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' and at * least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. Before entry * with BETA non-zero, the incremented array Y must contain the * vector y. On exit, the incremented array Y is overwritten by * the updated vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY DOUBLE PRECISION ABSX, TALPHA, TEMP COMPLEX*16 ZDUM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( TRANS, 'N' ) .AND. $ .NOT.LSAME( TRANS, 'T' ) .AND. $ .NOT.LSAME( TRANS, 'C' ) ) THEN INFO = 1 ELSE IF( M.LT.0 ) THEN INFO = 2 ELSE IF( N.LT.0 ) THEN INFO = 3 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZAGEMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF( LSAME( TRANS, 'N' ) ) THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := abs( beta*y ). * IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 20, I = 1, LENY Y( I ) = ABS( Y( I ) ) 20 CONTINUE ELSE DO 30, I = 1, LENY Y( I ) = ABS( BETA * Y( I ) ) 30 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 40, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 40 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 50, I = 1, LENY Y( IY ) = ABS( Y( IY ) ) IY = IY + INCY 50 CONTINUE ELSE DO 60, I = 1, LENY Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 60 CONTINUE END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( TRANS, 'N' ) ) THEN * * Form y := abs( alpha ) * abs( A ) * abs( x ) + y. * JX = KX IF( INCY.EQ.1 ) THEN DO 80, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX DO 70, I = 1, M Y( I ) = Y( I ) + TEMP * CABS1( A( I, J ) ) 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE ELSE DO 100, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY DO 90, I = 1, M Y( IY ) = Y( IY ) + TEMP * CABS1( A( I, J ) ) IY = IY + INCY 90 CONTINUE END IF JX = JX + INCX 100 CONTINUE END IF * ELSE * * Form y := abs( alpha ) * abs( A' ) * abs( x ) + y. * JY = KY IF( INCX.EQ.1 ) THEN DO 120, J = 1, N TEMP = ZERO DO 110, I = 1, M TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( I ) ) 110 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 120 CONTINUE ELSE DO 140, J = 1, N TEMP = ZERO IX = KX DO 130, I = 1, M TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( IX ) ) IX = IX + INCX 130 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 140 CONTINUE END IF END IF * RETURN * * End of ZAGEMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zahemv.f000644 000766 000024 00000021301 10363532303 017754 0ustar00juliestaff000000 000000 SUBROUTINE ZAHEMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION Y( * ) COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZAHEMV performs the following matrix-vector operation * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n Hermitian matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower triangu- * lar part of the array A is to be referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A is * to be referenced. * UPLO = 'L' or 'l' Only the lower triangular part of A is * to be referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the Hermitian ma- * trix and the strictly lower triangular part of A is not refe- * renced. When UPLO = 'L' or 'l', the leading n by n part of * the array A must contain the lower triangular part of the * Hermitian matrix and the strictly upper trapezoidal part of A * is not referenced. * Note that the imaginary parts of the local entries corres- * ponding to the offdiagonal elements of A need not be set and * assumed to be zero. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) DOUBLE PRECISION * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY DOUBLE PRECISION TALPHA, TEMP0, TEMP1, TEMP2 COMPLEX*16 ZDUM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZAHEMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := abs( beta * y ). * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 40 CONTINUE END IF END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN DO 60, J = 1, N TEMP1 = TALPHA * CABS1( X( J ) ) TEMP2 = ZERO DO 50, I = 1, J - 1 TEMP0 = CABS1( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( I ) ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1 * ABS( DBLE( A( J, J ) ) ) + $ ALPHA * TEMP2 * 60 CONTINUE * ELSE * JX = KX JY = KY * DO 80, J = 1, N TEMP1 = TALPHA * CABS1( X( JX ) ) TEMP2 = ZERO IX = KX IY = KY * DO 70, I = 1, J - 1 TEMP0 = CABS1( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( IX ) ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1 * ABS( DBLE( A( J, J ) ) ) + $ ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 80 CONTINUE * END IF * ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN * DO 100, J = 1, N * TEMP1 = TALPHA * CABS1( X( J ) ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1 * ABS( DBLE( A( J, J ) ) ) * DO 90, I = J + 1, N TEMP0 = CABS1( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( I ) ) * 90 CONTINUE * Y( J ) = Y( J ) + ALPHA * TEMP2 * 100 CONTINUE * ELSE * JX = KX JY = KY * DO 120, J = 1, N TEMP1 = TALPHA * CABS1( X( JX ) ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1 * ABS( DBLE( A( J, J ) ) ) IX = JX IY = JY * DO 110, I = J + 1, N * IX = IX + INCX IY = IY + INCY TEMP0 = CABS1( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( IX ) ) * 110 CONTINUE * Y( JY ) = Y( JY ) + ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 120 CONTINUE * END IF * END IF * RETURN * * End of ZAHEMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zasymv.f000644 000766 000024 00000020760 10363532303 020023 0ustar00juliestaff000000 000000 SUBROUTINE ZASYMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION Y( * ) COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZASYMV performs the following matrix-vector operation * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n symmetric matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower triangu- * lar part of the array A is to be referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A is * to be referenced. * UPLO = 'L' or 'l' Only the lower triangular part of A is * to be referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. When UPLO = 'L' or 'l', the leading n by n part of * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) DOUBLE PRECISION * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY DOUBLE PRECISION TALPHA, TEMP0, TEMP1, TEMP2 COMPLEX*16 ZDUM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZASYMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := abs( beta * y ). * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 40 CONTINUE END IF END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN DO 60, J = 1, N TEMP1 = TALPHA * CABS1( X( J ) ) TEMP2 = ZERO DO 50, I = 1, J - 1 TEMP0 = CABS1( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( I ) ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1 * CABS1( A( J, J ) ) + $ ALPHA * TEMP2 * 60 CONTINUE * ELSE * JX = KX JY = KY * DO 80, J = 1, N TEMP1 = TALPHA * CABS1( X( JX ) ) TEMP2 = ZERO IX = KX IY = KY * DO 70, I = 1, J - 1 TEMP0 = CABS1( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( IX ) ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1 * CABS1( A( J, J ) ) + $ ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 80 CONTINUE * END IF * ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) ) THEN * DO 100, J = 1, N * TEMP1 = TALPHA * CABS1( X( J ) ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1 * CABS1( A( J, J ) ) * DO 90, I = J + 1, N TEMP0 = CABS1( A( I, J ) ) Y( I ) = Y( I ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( I ) ) * 90 CONTINUE * Y( J ) = Y( J ) + ALPHA * TEMP2 * 100 CONTINUE * ELSE * JX = KX JY = KY * DO 120, J = 1, N TEMP1 = TALPHA * CABS1( X( JX ) ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1 * CABS1( A( J, J ) ) IX = JX IY = JY * DO 110, I = J + 1, N * IX = IX + INCX IY = IY + INCY TEMP0 = CABS1( A( I, J ) ) Y( IY ) = Y( IY ) + TEMP1 * TEMP0 TEMP2 = TEMP2 + TEMP0 * CABS1( X( IX ) ) * 110 CONTINUE * Y( JY ) = Y( JY ) + ALPHA * TEMP2 JX = JX + INCX JY = JY + INCY * 120 CONTINUE * END IF * END IF * RETURN * * End of ZASYMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zatrmv.f000644 000766 000024 00000031112 10363532303 020006 0ustar00juliestaff000000 000000 SUBROUTINE ZATRMV( UPLO, TRANS, DIAG, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, TRANS, UPLO INTEGER INCX, INCY, LDA, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION Y( * ) COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZATRMV performs one of the matrix-vector operations * * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * or * * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ), * * or * * y := abs( alpha )*abs( conjg( A' ) )*abs( x ) + abs( beta*y ), * * where alpha and beta are real scalars, y is a real vector, x is a * vector and A is an n by n unit or non-unit, upper or lower triangular * matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * TRANS (input) CHARACTER*1 * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ) * * TRANS = 'T' or 't': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ) * * TRANS = 'C' or 'c': * y := abs( alpha )*abs( conjg( A' ) )*abs( x ) + * abs( beta*y ) * * DIAG (input) CHARACTER*1 * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit triangular. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the matrix A and * the strictly lower triangular part of A is not referenced. * When UPLO = 'L' or 'l', the leading n by n part of the array * A must contain the lower triangular part of the matrix A and * the strictly upper trapezoidal part of A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of A * are not referenced either, but are assumed to be unity. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) DOUBLE PRECISION * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY LOGICAL NOUNIT DOUBLE PRECISION ABSX, TALPHA, TEMP COMPLEX*16 ZDUM * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 7 ELSE IF( INCX.EQ.0 )THEN INFO = 9 ELSE IF( INCY.EQ.0 ) THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZATRMV', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N - 1 ) * INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N - 1 ) * INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := abs( beta*y ). * IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 20, I = 1, N Y( I ) = ABS( Y( I ) ) 20 CONTINUE ELSE DO 30, I = 1, N Y( I ) = ABS( BETA * Y( I ) ) 30 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 40, I = 1, N Y( IY ) = ZERO IY = IY + INCY 40 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 50, I = 1, N Y( IY ) = ABS( Y( IY ) ) IY = IY + INCY 50 CONTINUE ELSE DO 60, I = 1, N Y( IY ) = ABS( BETA * Y( IY ) ) IY = IY + INCY 60 CONTINUE END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * TALPHA = ABS( ALPHA ) * IF( LSAME( TRANS, 'N' ) )THEN * * Form y := abs( alpha ) * abs( A ) * abs( x ) + y. * IF( LSAME( UPLO, 'U' ) )THEN JX = KX IF( INCY.EQ.1 ) THEN DO 80, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX DO 70, I = 1, J - 1 Y( I ) = Y( I ) + TEMP * CABS1( A( I, J ) ) 70 CONTINUE * IF( NOUNIT ) THEN Y( J ) = Y( J ) + TEMP * CABS1( A( J, J ) ) ELSE Y( J ) = Y( J ) + TEMP END IF END IF JX = JX + INCX 80 CONTINUE * ELSE * DO 100, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY DO 90, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP * CABS1( A( I, J ) ) IY = IY + INCY 90 CONTINUE * IF( NOUNIT ) THEN Y( IY ) = Y( IY ) + TEMP * CABS1( A( J, J ) ) ELSE Y( IY ) = Y( IY ) + TEMP END IF END IF JX = JX + INCX 100 CONTINUE * END IF * ELSE * JX = KX IF( INCY.EQ.1 ) THEN DO 120, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN * TEMP = TALPHA * ABSX * IF( NOUNIT ) THEN Y( J ) = Y( J ) + TEMP * CABS1( A( J, J ) ) ELSE Y( J ) = Y( J ) + TEMP END IF * DO 110, I = J + 1, N Y( I ) = Y( I ) + TEMP * CABS1( A( I, J ) ) 110 CONTINUE END IF JX = JX + INCX 120 CONTINUE * ELSE * DO 140, J = 1, N ABSX = CABS1( X( JX ) ) IF( ABSX.NE.ZERO ) THEN TEMP = TALPHA * ABSX IY = KY + ( J - 1 ) * INCY * IF( NOUNIT ) THEN Y( IY ) = Y( IY ) + TEMP * CABS1( A( J, J ) ) ELSE Y( IY ) = Y( IY ) + TEMP END IF * DO 130, I = J + 1, N IY = IY + INCY Y( IY ) = Y( IY ) + TEMP * CABS1( A( I, J ) ) 130 CONTINUE END IF JX = JX + INCX 140 CONTINUE * END IF * END IF * ELSE * * Form y := abs( alpha ) * abs( A' ) * abs( x ) + y. * IF( LSAME( UPLO, 'U' ) )THEN JY = KY IF( INCX.EQ.1 ) THEN DO 160, J = 1, N * TEMP = ZERO * DO 150, I = 1, J - 1 TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( I ) ) 150 CONTINUE * IF( NOUNIT ) THEN TEMP = TEMP + CABS1( A( J, J ) ) * CABS1( X( J ) ) ELSE TEMP = TEMP + CABS1( X( J ) ) END IF * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 160 CONTINUE * ELSE * DO 180, J = 1, N TEMP = ZERO IX = KX DO 170, I = 1, J - 1 TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( IX ) ) IX = IX + INCX 170 CONTINUE * IF( NOUNIT ) THEN TEMP = TEMP + CABS1( A( J, J ) ) * CABS1( X( IX ) ) ELSE TEMP = TEMP + CABS1( X( IX ) ) END IF * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 180 CONTINUE * END IF * ELSE * JY = KY IF( INCX.EQ.1 ) THEN * DO 200, J = 1, N * IF( NOUNIT ) THEN TEMP = CABS1( A( J, J ) ) * CABS1( X( J ) ) ELSE TEMP = CABS1( X( J ) ) END IF * DO 190, I = J + 1, N TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( I ) ) 190 CONTINUE * Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY * 200 CONTINUE * ELSE * DO 220, J = 1, N * IX = KX + ( J - 1 ) * INCX * IF( NOUNIT ) THEN TEMP = CABS1( A( J, J ) ) * CABS1( X( IX ) ) ELSE TEMP = CABS1( X( IX ) ) END IF * DO 210, I = J + 1, N IX = IX + INCX TEMP = TEMP + CABS1( A( I, J ) ) * CABS1( X( IX ) ) 210 CONTINUE Y( JY ) = Y( JY ) + TALPHA * TEMP JY = JY + INCY 220 CONTINUE END IF END IF * END IF * RETURN * * End of ZATRMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zcshft.f000644 000766 000024 00000005534 10363532303 017775 0ustar00juliestaff000000 000000 SUBROUTINE ZCSHFT( M, N, OFFSET, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, M, N, OFFSET * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZCSHFT shifts columns of an m by n array A by OFFSET. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A. M must be at * least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A to be * shifted. N must be at least zero. * * OFFSET (local input) INTEGER * On entry, OFFSET specifies the offset by which the columns of * A should be shifted. OFFSET can be positive or negative (see * below for further details). When OFFSET is positive, the co- * lumns are shifted to the right. When OFFSET is negative, the * columns of A are shifted to the left. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N+ABS(OFFSET) ). * On exit, A contains the shifted array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Further Details * =============== * * N=3 OFFSET=6 -OFFSET=6 N=3 * ------------------- ------------------- * | 1 2 3 4 5 6 7 8 9 | M | 1 2 3 4 5 6 7 8 9 | * ------------------- ------------------- * V V * ------------------- ------------------- * | 1 2 3 4 5 6 1 2 3 | M | 7 8 9 4 5 6 7 8 9 | * ------------------- ------------------- * OFFSET >= 0 OFFSET <= 0 * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( ( OFFSET.EQ.0 ).OR.( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * IF( OFFSET.GT.0 ) THEN DO 20 J = N, 1, -1 DO 10 I = 1, M A( I, J+OFFSET ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = A( I, J-OFFSET ) 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of ZCSHFT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zhescal.f000644 000766 000024 00000016677 10363532303 020137 0ustar00juliestaff000000 000000 SUBROUTINE ZHESCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N DOUBLE PRECISION ALPHA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZHESCAL scales a two-dimensional array A by the real scalar alpha. * The diagonal entries specified by IOFFD of A are supposed to be real. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) DOUBLE PRECISION * On entry, ALPHA specifies the scalar alpha, i.e., the value * by which the diagonal and offdiagonal entries of the array A * as specified by UPLO and IOFFD are scaled. * * A (input/output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the Hermitian * matrix to be scaled as specified by IOFFD, and the strictly * lower trapezoidal part of A is not referenced. When UPLO is * 'L' or 'l', the leading m by n part of the array A must con- * tain the lower trapezoidal part of the Hermitian matrix to be * scaled as specified by IOFFD, and the strictly upper trape- * zoidal part of A is not referenced. On exit, the entries of * the trapezoid part of A determined by UPLO and IOFFD are sca- * led. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION RONE, RZERO PARAMETER ( RONE = 1.0D+0, RZERO = 0.0D+0 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER J, JTMP, MN * .. * .. External Subroutines .. EXTERNAL ZDSCAL, ZTZPAD * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( ALPHA.EQ.RONE ) THEN * * Zeros the imaginary part of the diagonals * IF( LSAME( UPLO, 'L' ).OR.LSAME( UPLO, 'U' ).OR. $ LSAME( UPLO, 'D' ) ) THEN DO 10 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = DCMPLX( DBLE( A( JTMP, J ) ), RZERO ) 10 CONTINUE END IF RETURN ELSE IF( ALPHA.EQ.RZERO ) THEN CALL ZTZPAD( UPLO, 'N', M, N, IOFFD, ZERO, ZERO, A, LDA ) RETURN END IF * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) CALL ZDSCAL( M, ALPHA, A( 1, J ), 1 ) 20 CONTINUE DO 30 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = DCMPLX( ALPHA * DBLE( A( JTMP, J ) ), RZERO ) IF( M.GT.JTMP ) $ CALL ZDSCAL( M-JTMP, ALPHA, A( JTMP + 1, J ), 1 ) 30 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 40 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD CALL ZDSCAL( JTMP - 1, ALPHA, A( 1, J ), 1 ) A( JTMP, J ) = DCMPLX( ALPHA * DBLE( A( JTMP, J ) ), RZERO ) 40 CONTINUE DO 50 J = MAX( 0, MN ) + 1, N CALL ZDSCAL( M, ALPHA, A( 1, J ), 1 ) 50 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 60 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = DCMPLX( ALPHA * DBLE( A( JTMP, J ) ), RZERO ) 60 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 70 J = 1, N CALL ZDSCAL( M, ALPHA, A( 1, J ), 1 ) 70 CONTINUE * END IF * RETURN * * End of ZHESCAL * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zmmadd.f000644 000766 000024 00000010533 10363532303 017743 0ustar00juliestaff000000 000000 SUBROUTINE ZMMADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZMMADD performs the following operation: * * B := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) COMPLEX*16 array * On entry, B is an array of dimension ( LDB, N ). On exit, the * leading m by n part of A has been added to the leading m by n * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZCOPY, ZSCAL * .. * .. Executable Statements .. * IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL ZCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) * DO 10 I = 1, M * B( I, J ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL ZAXPY( M, ONE, A( 1, J ), 1, B( 1, J ), 1 ) * DO 50 I = 1, M * B( I, J ) = A( I, J ) + B( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( I, J ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL ZAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) * DO 110 I = 1, M * B( I, J ) = ALPHA * A( I, J ) + B( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, N CALL ZSCAL( M, BETA, B( 1, J ), 1 ) * DO 150 I = 1, M * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of ZMMADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zmmcadd.f000644 000766 000024 00000010516 10363532303 020107 0ustar00juliestaff000000 000000 SUBROUTINE ZMMCADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZMMCADD performs the following operation: * * B := alpha * conjg( A ) + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) COMPLEX*16 array * On entry, B is an array of dimension ( LDB, N ). On exit, the * leading m by n part of A has been conjugated and added to the * leading m by n part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL ZSCAL * .. * .. Intrinsic Functions .. INTRINSIC DCONJG * .. * .. Executable Statements .. * IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M B( I, J ) = DCONJG( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( I, J ) = DCONJG( A( I, J ) ) + BETA * B( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( I, J ) = DCONJG( A( I, J ) ) + B( I, J ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( I, J ) = ALPHA * DCONJG( A( I, J ) ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( I, J ) = ALPHA * DCONJG( A( I, J ) ) + $ BETA * B( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M B( I, J ) = ALPHA * DCONJG( A( I, J ) ) + B( I, J ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, N CALL ZSCAL( M, BETA, B( 1, J ), 1 ) * DO 150 I = 1, M * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of ZMMCADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zmmdda.f000644 000766 000024 00000010537 10363532303 017747 0ustar00juliestaff000000 000000 SUBROUTINE ZMMDDA( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZMMDDA performs the following operation: * * A := alpha * A + beta * B, * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading m by n part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension ( LDB, N ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZCOPY, ZSCAL * .. * .. Executable Statements .. * IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL ZCOPY( M, B( 1, J ), 1, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( I, J ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL ZAXPY( M, ONE, B( 1, J ), 1, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( I, J ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( I, J ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL ZAXPY( M, BETA, B( 1, J ), 1, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( I, J ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL ZSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of ZMMDDA * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zmmddac.f000644 000766 000024 00000010523 10363532303 020105 0ustar00juliestaff000000 000000 SUBROUTINE ZMMDDAC( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZMMDDAC performs the following operation: * * A := alpha * A + beta * conjg( B ), * * where alpha, beta are scalars and A and B are m by n matrices. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and B. M must * be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A and B. * N must be at least zero. * * ALPHA (local input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading m by n part of B has been conjugated and added to the * leading m by n part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension ( LDB, N ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL ZSCAL * .. * .. Intrinsic Functions .. INTRINSIC DCONJG * .. * .. Executable Statements .. * IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = DCONJG( B( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = DCONJG( B( I, J ) ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M A( I, J ) = DCONJG( B( I, J ) ) + A( I, J ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * DCONJG( B( I, J ) ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * DCONJG( B( I, J ) ) + $ ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M A( I, J ) = BETA * DCONJG( B( I, J ) ) + A( I, J ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL ZSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF * RETURN * * End of ZMMDDAC * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zmmddact.f000644 000766 000024 00000014775 10363532303 020306 0ustar00juliestaff000000 000000 SUBROUTINE ZMMDDACT( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZMMDDACT performs the following operation: * * A := alpha * A + beta * conjg( B' ), * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading n by m part of B has been conjugated and added to the * leading m by n part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension ( LDB, M ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL ZSCAL * .. * .. Intrinsic Functions .. INTRINSIC DCONJG * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M A( I, J ) = DCONJG( B( J, I ) ) 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = DCONJG( B( J, I ) ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M A( I, J ) = DCONJG( B( J, I ) ) + A( I, J ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * DCONJG( B( J, I ) ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * DCONJG( B( J, I ) ) + $ ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M A( I, J ) = BETA * DCONJG( B( J, I ) ) + A( I, J ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL ZSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 180 J = 1, M DO 170 I = 1, N A( J, I ) = DCONJG( B( I, J ) ) 170 CONTINUE 180 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N A( J, I ) = DCONJG( B( I, J ) ) + ALPHA * A( J, I ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M DO 210 I = 1, N A( J, I ) = DCONJG( B( I, J ) ) + A( J, I ) 210 CONTINUE 220 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N A( J, I ) = BETA * DCONJG( B( I, J ) ) 230 CONTINUE 240 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N A( J, I ) = BETA * DCONJG( B( I, J ) ) + $ ALPHA * A( J, I ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M DO 270 I = 1, N A( J, I ) = BETA * DCONJG( B( I, J ) ) + A( J, I ) 270 CONTINUE 280 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 300 J = 1, N DO 290 I = 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 320 J = 1, N CALL ZSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 310 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of ZMMDDACT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zmmddat.f000644 000766 000024 00000015202 10363532303 020125 0ustar00juliestaff000000 000000 SUBROUTINE ZMMDDAT( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZMMDDAT performs the following operation: * * A := alpha * A + beta * B', * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N ). On exit, the * leading n by m part of B has been added to the leading m by n * part of A. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input) COMPLEX*16 array * On entry, B is an array of dimension ( LDB, M ). * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZCOPY, ZSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 20 J = 1, N CALL ZCOPY( M, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 10 I = 1, M * A( I, J ) = B( J, I ) * 10 CONTINUE 20 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = B( J, I ) + ALPHA * A( I, J ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL ZAXPY( M, ONE, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 50 I = 1, M * A( I, J ) = B( J, I ) + A( I, J ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M A( I, J ) = BETA * B( J, I ) 70 CONTINUE 80 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M A( I, J ) = BETA * B( J, I ) + ALPHA * A( I, J ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL ZAXPY( M, BETA, B( J, 1 ), LDB, A( 1, J ), 1 ) * DO 110 I = 1, M * A( I, J ) = BETA * B( J, I ) + A( I, J ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 140 J = 1, N DO 130 I = 1, M A( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 160 J = 1, N CALL ZSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 150 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( BETA.EQ.ONE ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 180 J = 1, M CALL ZCOPY( N, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 170 I = 1, N * A( J, I ) = B( I, J ) * 170 CONTINUE 180 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N A( J, I ) = B( I, J ) + ALPHA * A( J, I ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL ZAXPY( N, ONE, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 210 I = 1, N * A( J, I ) = B( I, J ) + A( J, I ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( BETA.NE.ZERO ) THEN IF( ALPHA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N A( J, I ) = BETA * B( I, J ) 230 CONTINUE 240 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N A( J, I ) = BETA * B( I, J ) + ALPHA * A( J, I ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL ZAXPY( N, BETA, B( 1, J ), 1, A( J, 1 ), LDA ) * DO 270 I = 1, N * A( J, I ) = BETA * B( I, J ) + A( J, I ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( ALPHA.EQ.ZERO ) THEN DO 300 J = 1, N DO 290 I = 1, M A( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( ALPHA.NE.ONE ) THEN DO 320 J = 1, N CALL ZSCAL( M, ALPHA, A( 1, J ), 1 ) * DO 310 I = 1, M * A( I, J ) = ALPHA * A( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of ZMMDDAT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zmmtadd.f000644 000766 000024 00000015172 10363532303 020133 0ustar00juliestaff000000 000000 SUBROUTINE ZMMTADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZMMTADD performs the following operation: * * B := alpha * A' + beta * B, * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) COMPLEX*16 array * On entry, B is an array of dimension ( LDB, M ). On exit, the * leading m by n part of A has been added to the leading n by m * part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZCOPY, ZSCAL * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N CALL ZCOPY( M, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 10 I = 1, M * B( J, I ) = A( I, J ) * 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( J, I ) = A( I, J ) + BETA * B( J, I ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N CALL ZAXPY( M, ONE, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 50 I = 1, M * B( J, I ) = A( I, J ) + B( J, I ) * 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( J, I ) = ALPHA * A( I, J ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( J, I ) = ALPHA * A( I, J ) + BETA * B( J, I ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N CALL ZAXPY( M, ALPHA, A( 1, J ), 1, B( J, 1 ), LDB ) * DO 110 I = 1, M * B( J, I ) = ALPHA * A( I, J ) + B( J, I ) * 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, M DO 130 I = 1, N B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, M CALL ZSCAL( N, BETA, B( 1, J ), 1 ) * DO 150 I = 1, N * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, M CALL ZCOPY( N, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 170 I = 1, N * B( I, J ) = A( J, I ) * 170 CONTINUE 180 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N B( I, J ) = A( J, I ) + BETA * B( I, J ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M CALL ZAXPY( N, ONE, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 210 I = 1, N * B( I, J ) = A( J, I ) + B( I, J ) * 210 CONTINUE 220 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N B( I, J ) = ALPHA * A( J, I ) 230 CONTINUE 240 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M CALL ZAXPY( N, ALPHA, A( J, 1 ), LDA, B( 1, J ), 1 ) * DO 270 I = 1, N * B( I, J ) = ALPHA * A( J, I ) + B( I, J ) * 270 CONTINUE 280 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 300 J = 1, M DO 290 I = 1, N B( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 320 J = 1, M CALL ZSCAL( N, BETA, B( 1, J ), 1 ) * DO 310 I = 1, N * B( I, J ) = BETA * B( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of ZMMTADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zmmtcadd.f000644 000766 000024 00000014762 10363532303 020302 0ustar00juliestaff000000 000000 SUBROUTINE ZMMTCADD( M, N, ALPHA, A, LDA, BETA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, LDB, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZMMTCADD performs the following operation: * * B := alpha * conjg( A' ) + beta * B, * * where alpha, beta are scalars; A is an m by n matrix and B is an n by * m matrix. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A and the number * of columns of B. M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of rows of B and the number * of columns of A. N must be at least zero. * * ALPHA (local input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A need * not be set on input. * * A (local input) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N ). * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BETA (local input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B need not * be set on input. * * B (local input/local output) COMPLEX*16 array * On entry, B is an array of dimension ( LDB, M ). On exit, the * leading m by n part of A has been conjugated and added to the * leading n by m part of B. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Subroutines .. EXTERNAL ZSCAL * .. * .. Intrinsic Functions .. INTRINSIC DCONJG * .. * .. Executable Statements .. * IF( M.GE.N ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, M B( J, I ) = DCONJG( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 40 J = 1, N DO 30 I = 1, M B( J, I ) = DCONJG( A( I, J ) ) + BETA * B( J, I ) 30 CONTINUE 40 CONTINUE ELSE DO 60 J = 1, N DO 50 I = 1, M B( J, I ) = DCONJG( A( I, J ) ) + B( J, I ) 50 CONTINUE 60 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 80 J = 1, N DO 70 I = 1, M B( J, I ) = ALPHA * DCONJG( A( I, J ) ) 70 CONTINUE 80 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 J = 1, N DO 90 I = 1, M B( J, I ) = ALPHA * DCONJG( A( I, J ) ) + $ BETA * B( J, I ) 90 CONTINUE 100 CONTINUE ELSE DO 120 J = 1, N DO 110 I = 1, M B( J, I ) = ALPHA * DCONJG( A( I, J ) ) + B( J, I ) 110 CONTINUE 120 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 140 J = 1, M DO 130 I = 1, N B( I, J ) = ZERO 130 CONTINUE 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 160 J = 1, M CALL ZSCAL( N, BETA, B( 1, J ), 1 ) * DO 150 I = 1, N * B( I, J ) = BETA * B( I, J ) * 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, M DO 170 I = 1, N B( I, J ) = DCONJG( A( J, I ) ) 170 CONTINUE 180 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 200 J = 1, M DO 190 I = 1, N B( I, J ) = DCONJG( A( J, I ) ) + BETA * B( I, J ) 190 CONTINUE 200 CONTINUE ELSE DO 220 J = 1, M DO 210 I = 1, N B( I, J ) = DCONJG( A( J, I ) ) + B( I, J ) 210 CONTINUE 220 CONTINUE END IF ELSE IF( ALPHA.NE.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 240 J = 1, M DO 230 I = 1, N B( I, J ) = ALPHA * DCONJG( A( J, I ) ) 230 CONTINUE 240 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 260 J = 1, M DO 250 I = 1, N B( I, J ) = ALPHA * DCONJG( A( J, I ) ) + $ BETA * B( I, J ) 250 CONTINUE 260 CONTINUE ELSE DO 280 J = 1, M DO 270 I = 1, N B( I, J ) = ALPHA * DCONJG( A( J, I ) ) + B( I, J ) 270 CONTINUE 280 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 300 J = 1, M DO 290 I = 1, N B( I, J ) = ZERO 290 CONTINUE 300 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 320 J = 1, M CALL ZSCAL( N, BETA, B( 1, J ), 1 ) * DO 310 I = 1, N * B( I, J ) = BETA * B( I, J ) * 310 CONTINUE 320 CONTINUE END IF END IF END IF * RETURN * * End of ZMMTCADD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zrshft.f000644 000766 000024 00000006173 10363532303 020014 0ustar00juliestaff000000 000000 SUBROUTINE ZRSHFT( M, N, OFFSET, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER LDA, M, N, OFFSET * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZRSHFT shifts rows of an m by n array A by OFFSET. * * Arguments * ========= * * M (local input) INTEGER * On entry, M specifies the number of rows of A to be shifted. * M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the number of columns of A. N must be * at least zero. * * OFFSET (local input) INTEGER * On entry, OFFSET specifies the offset by which the rows of * A should be shifted. OFFSET can be positive or negative (see * below for further details). When OFFSET is positive, the rows * are shifted to the bottom. When OFFSET is negative, the rows * of A are shifted to the top. * * A (local input/local output) COMPLEX*16 array * On entry, A is an array of dimension ( LDA, N ). On exit, A * contains the shifted array. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M+ABS(OFFSET) ). * * Further Details * =============== * * N N N N * --- --- --- --- * | 1 | | 1 | | 1 | | 7 | * | 2 | M = 3 | 2 | | 2 | M = 3 | 8 | * | 3 | | 3 | | 3 | | 9 | * | 4 | | 4 | | 4 | | 4 | * | 5 | becomes | 5 | | 5 | becomes | 5 | * | 6 | | 6 | | 6 | | 6 | * | 7 | | 1 | | 7 | | 7 | * | 8 | OFFSET = 6 | 2 | | 8 | OFFSET = -6 | 8 | * | 9 | | 3 | | 9 | | 9 | * --- --- --- --- * OFFSET >= 0 OFFSET <= 0 * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, J * .. * .. Executable Statements .. * IF( ( OFFSET.EQ.0 ).OR.( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * IF( OFFSET.GT.0 ) THEN DO 20 J = 1, N DO 10 I = M, 1, -1 A( I+OFFSET, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, M A( I, J ) = A( I-OFFSET, J ) 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of ZRSHFT * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zset.f000644 000766 000024 00000005201 10363532303 017450 0ustar00juliestaff000000 000000 SUBROUTINE ZSET( N, ALPHA, X, INCX ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. COMPLEX*16 X( * ) * .. * * Purpose * ======= * * ZSET sets the entries of an n vector x to the scalar alpha. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vector x. N must be * at least zero. * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * X (input/output) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. On exit, entries of the * incremented array X are set to alpha. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Local Scalars .. INTEGER I, INFO, IX, M, MP1 * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( N.LT.0 ) THEN INFO = 1 ELSE IF( INCX.EQ.0 ) THEN INFO = 4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZSET', INFO ) RETURN END IF * * Quick return if possible. * IF( N.LE.0 ) $ RETURN * * Form x := alpha * IF( INCX.EQ.1 ) $ GO TO 20 * * code for increments not equal to 1 * * Set up the start point in X. * IF( INCX.GT.0 ) THEN IX = 1 ELSE IX = 1 - ( N - 1 ) * INCX END IF * DO 10 I = 1, N X( IX ) = ALPHA IX = IX + INCX 10 CONTINUE * RETURN * * code for increment equal to 1 * * clean-up loop * 20 M = MOD( N, 4 ) * IF( M.EQ.0 ) $ GO TO 40 * DO 30 I = 1, M X( I ) = ALPHA 30 CONTINUE IF( N.LT.4 ) $ RETURN * 40 MP1 = M + 1 DO 50 I = MP1, N, 4 X( I ) = ALPHA X( I + 1 ) = ALPHA X( I + 2 ) = ALPHA X( I + 3 ) = ALPHA 50 CONTINUE * RETURN * * End of ZSET * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zsymv.f000644 000766 000024 00000017270 10363532303 017664 0ustar00juliestaff000000 000000 SUBROUTINE ZSYMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZSYMV performs the following matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors, and * A is an n by n symmetric matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower triangu- * lar part of the array A is to be referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A is * to be referenced. * UPLO = 'L' or 'l' Only the lower triangular part of A is * to be referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the real scalar alpha. * * A (input) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. When UPLO = 'L' or 'l', the leading n by n part of * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * BETA (input) COMPLEX*16 * On entry, BETA specifies the real scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y (input/output) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry with BETA non- * zero, the incremented array Y must contain the vector y. On * exit, the incremented array Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY COMPLEX*16 TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZSYMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF * IF( ALPHA.EQ.ZERO ) $ RETURN * IF( LSAME( UPLO, 'U' ) )THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( J, J ) DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( J, J ) IX = JX IY = JY DO 110, I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of ZSYMV * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zsyr.f000644 000766 000024 00000013046 11654631032 017503 0ustar00juliestaff000000 000000 SUBROUTINE ZSYR( UPLO, N, ALPHA, X, INCX, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, LDA, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZSYR performs the symmetric rank 1 operation * * A := alpha*x*x' + A, * * where alpha is a complex scalar, x is an n element vector and A is an * n by n SY matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * A (input/output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. On exit, the upper triangular part of the array A is * overwritten by the upper triangular part of the updated ma- * trix. When UPLO = 'L' or 'l', the leading n by n part of the * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. On exit, the lower triangular part of the * array A is overwritten by the lower triangular part of the * updated matrix. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, J, JX, KX COMPLEX*16 TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( INCX.EQ.0 ) THEN INFO = 5 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = 7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZSYR', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * KX = 1 IF( INCX.LE.0 ) THEN KX = 1 - ( N-1 )*INCX ELSE IF( INCX.NE.1 ) THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) ) THEN * * Form A when A is stored in upper triangle. * IF( INCX.EQ.1 ) THEN DO 20 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) DO 10 I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF 20 CONTINUE ELSE JX = KX DO 40 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = KX DO 30 I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JX = JX + INCX 40 CONTINUE END IF ELSE * * Form A when A is stored in lower triangle. * IF( INCX.EQ.1 ) THEN DO 60 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) DO 50 I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = JX DO 70 I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF * RETURN * * End of ZSYR * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zsyr2.f000644 000766 000024 00000015770 11654631032 017573 0ustar00juliestaff000000 000000 SUBROUTINE ZSYR2( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER INCX, INCY, LDA, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZSYR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a complex scalar, x and y are n element vectors and A * is an n by n SY matrix. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. N must be at * least zero. * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented * array Y must contain the vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * A (input/output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading n by n part of the array * A must contain the upper triangular part of the symmetric ma- * trix and the strictly lower triangular part of A is not refe- * renced. On exit, the upper triangular part of the array A is * overwritten by the upper triangular part of the updated ma- * trix. When UPLO = 'L' or 'l', the leading n by n part of the * the array A must contain the lower triangular part of the * symmetric matrix and the strictly upper trapezoidal part of A * is not referenced. On exit, the lower triangular part of the * array A is overwritten by the lower triangular part of the * updated matrix. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, N ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY COMPLEX*16 TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZSYR2', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * KX = 1 KY = 1 JX = 1 JY = 1 IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 10, I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 10 CONTINUE END IF 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY 40 CONTINUE END IF ELSE * * Form A when A is stored in the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 50, I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 50 CONTINUE END IF 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF END IF * RETURN * * End of ZSYR2 * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/ztzcnjg.f000644 000766 000024 00000021021 10363532303 020152 0ustar00juliestaff000000 000000 SUBROUTINE ZTZCNJG( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZTZCNJG conjugates a two-dimensional array A and then scales it by * the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be conjugated and scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha, i.e., the value * by which the diagonal and offdiagonal entries of the array A * as specified by UPLO and IOFFD are scaled. * * A (input/output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are conjugated and scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Subroutines .. EXTERNAL ZTZPAD * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ) ) $ RETURN * * Start the operations * IF( ALPHA.EQ.ZERO ) THEN * CALL ZTZPAD( UPLO, 'N', M, N, IOFFD, ZERO, ZERO, A, LDA ) * ELSE IF( ALPHA.EQ.ONE ) THEN * IF( LSAME( UPLO, 'L' ) ) THEN * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = DCONJG( A( I, J ) ) 10 CONTINUE 20 CONTINUE * DO 40 J = MN + 1, MIN( M - IOFFD, N ) DO 30 I = J + IOFFD, M A( I, J ) = DCONJG( A( I, J ) ) 30 CONTINUE 40 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 60 J = MAX( 0, -IOFFD ) + 1, MN DO 50 I = 1, J + IOFFD A( I, J ) = DCONJG( A( I, J ) ) 50 CONTINUE 60 CONTINUE * DO 80 J = MAX( 0, MN ) + 1, N DO 70 I = 1, M A( I, J ) = DCONJG( A( I, J ) ) 70 CONTINUE 80 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 90 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = DCONJG( A( JTMP, J ) ) 90 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 110 J = 1, N DO 100 I = 1, M A( I, J ) = DCONJG( A( I, J ) ) 100 CONTINUE 110 CONTINUE * END IF * ELSE * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 130 J = 1, MIN( MN, N ) DO 120 I = 1, M A( I, J ) = ALPHA * DCONJG( A( I, J ) ) 120 CONTINUE 130 CONTINUE * DO 150 J = MN + 1, MIN( M - IOFFD, N ) DO 140 I = J + IOFFD, M A( I, J ) = ALPHA * DCONJG( A( I, J ) ) 140 CONTINUE 150 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 170 J = MAX( 0, -IOFFD ) + 1, MN DO 160 I = 1, J + IOFFD A( I, J ) = ALPHA * DCONJG( A( I, J ) ) 160 CONTINUE 170 CONTINUE * DO 190 J = MAX( 0, MN ) + 1, N DO 180 I = 1, M A( I, J ) = ALPHA * DCONJG( A( I, J ) ) 180 CONTINUE 190 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 200 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * DCONJG( A( JTMP, J ) ) 200 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 220 J = 1, N DO 210 I = 1, M A( I, J ) = ALPHA * DCONJG( A( I, J ) ) 210 CONTINUE 220 CONTINUE * END IF * END IF * RETURN * * End of ZTZCNJG * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/ztzpad.f000644 000766 000024 00000022071 10363532303 020003 0ustar00juliestaff000000 000000 SUBROUTINE ZTZPAD( UPLO, HERM, M, N, IOFFD, ALPHA, BETA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 HERM, UPLO INTEGER IOFFD, LDA, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZTZPAD initializes a two-dimensional array A to beta on the diagonal * specified by IOFFD or zeros the imaginary part of those diagonals and * set the offdiagonals to alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be set as follows: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of A is not changed, * = 'D' or 'd': diagonal specified by IOFFD is set; the * rest of the array A is unchanged, * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of A is not changed, * Otherwise: All of the array A is set. * * HERM (input) CHARACTER*1 * On entry, HERM specifies what should be done to the diagonals * as follows. When UPLO is 'L', 'l', 'D', 'd', 'U' or 'u' and * HERM is 'Z' or 'z', the imaginary part of the diagonals is * set to zero. Otherwise, the diagonals are set to beta. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha, i.e., the value * to which the offdiagonal entries of the array A determined by * UPLO and IOFFD are set. * * BETA (input) COMPLEX*16 * On entry, BETA specifies the scalar beta, i.e., the value to * which the diagonal entries specified by IOFFD of the array A * are set. BETA is not referenced when UPLO is 'L', 'l', 'U' or * 'u' and HERM is 'Z'. * * A (input/output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U', the leading m by n part of the array A must * contain the upper trapezoidal part of the matrix to be set as * specified by IOFFD, and the strictly lower trapezoidal part * of A is not referenced; When UPLO = 'L', the leading m by n * part of the array A must contain the lower trapezoidal part * of the matrix to be set as specified by IOFFD, and the * strictly upper trapezoidal part of A is not referenced. On * exit, the entries of the trapezoid part of A determined by * UPLO, HERM and IOFFD are set. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals and set the strictly lower triangular part of the * array to ALPHA. * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M A( I, J ) = ALPHA 10 CONTINUE 20 CONTINUE * IF( LSAME( HERM, 'Z' ) ) THEN DO 40 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = DCMPLX( DBLE( A( JTMP, J ) ), RZERO ) DO 30 I = JTMP + 1, M A( I, J ) = ALPHA 30 CONTINUE 40 CONTINUE ELSE DO 60 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = BETA DO 50 I = JTMP + 1, M A( I, J ) = ALPHA 50 CONTINUE 60 CONTINUE END IF * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals and set the strictly upper triangular part of the * array to ALPHA. * MN = MIN( M - IOFFD, N ) IF( LSAME( HERM, 'Z' ) ) THEN DO 80 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 70 I = 1, JTMP - 1 A( I, J ) = ALPHA 70 CONTINUE A( JTMP, J ) = DCMPLX( DBLE( A( JTMP, J ) ), RZERO ) 80 CONTINUE ELSE DO 100 J = MAX( 0, -IOFFD ) + 1, MN JTMP = J + IOFFD DO 90 I = 1, JTMP - 1 A( I, J ) = ALPHA 90 CONTINUE A( JTMP, J ) = BETA 100 CONTINUE END IF DO 120 J = MAX( 0, MN ) + 1, N DO 110 I = 1, M A( I, J ) = ALPHA 110 CONTINUE 120 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Set the diagonal to BETA or zero the imaginary part of the * diagonals. * IF( LSAME( HERM, 'Z' ) ) THEN IF( ( IOFFD.LT.M ).AND.( IOFFD.GT.-N ) ) THEN DO 130 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = DCMPLX( DBLE( A( JTMP, J ) ), RZERO ) 130 CONTINUE END IF ELSE IF( ( IOFFD.LT.M ).AND.( IOFFD.GT.-N ) ) THEN DO 140 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 140 CONTINUE END IF END IF * ELSE * * Set the diagonals to BETA and the offdiagonals to ALPHA. * DO 160 J = 1, N DO 150 I = 1, M A( I, J ) = ALPHA 150 CONTINUE 160 CONTINUE IF( ALPHA.NE.BETA .AND. IOFFD.LT.M .AND. IOFFD.GT.-N ) THEN DO 170 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) A( J + IOFFD, J ) = BETA 170 CONTINUE END IF * END IF * RETURN * * End of ZTZPAD * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/ztzpadcpy.f000644 000766 000024 00000020014 10363532303 020512 0ustar00juliestaff000000 000000 SUBROUTINE ZTZPADCPY( UPLO, DIAG, M, N, IOFFD, A, LDA, B, LDB ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 DIAG, UPLO INTEGER IOFFD, LDA, LDB, M, N * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZTZPADCPY copies an array A into an array B. The unchanged part of B * is padded with zeros. The diagonal of B specified by IOFFD may be set * to ones. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be copied as follows: * = 'L' or 'l': Lower triangular part is copied; the * strictly upper triangular part of B is * padded with zeros, * = 'U' or 'u': Upper triangular part is copied; the * strictly lower triangular part of B is * padded with zeros. * * DIAG (input) CHARACTER*1 * On entry, DIAG specifies whether or not the diagonal of B is * to be set to ones or not as follows: * * DIAG = 'N' or 'n': the diagonals of A are copied into the * diagonals of B, otherwise the diagonals of B are set to ones. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * A (input) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U', the leading m by n part of the array A must * contain the upper trapezoidal part of the matrix to be copied * as specified by IOFFD, UPLO and DIAG, and the strictly lower * trapezoidal part of A is not referenced; When UPLO = 'L',the * leading m by n part of the array A must contain the lower * trapezoidal part of the matrix to be copied as specified by * IOFFD, UPLO and DIAG and the strictly upper trapezoidal part * of A is not referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * B (output) COMPLEX*16 array * On entry, B is an array of dimension (LDB,N). On exit, this * array contains the padded copy of A as specified by IOFFD, * UPLO and DIAG. * * LDB (input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, ITMP, J, JTMP, MN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Executable Statements .. * * Quick return if possible * IF( M.LE.0 .OR. N.LE.0 ) $ RETURN * * Start the operations * IF( LSAME( UPLO, 'L' ) ) THEN * MN = MAX( 0, -IOFFD ) DO 20 J = 1, MIN( MN, N ) DO 10 I = 1, M B( I, J ) = A( I, J ) 10 CONTINUE 20 CONTINUE * JTMP = MIN( M - IOFFD, N ) * IF( LSAME( DIAG, 'N' ) ) THEN DO 50 J = MN + 1, JTMP ITMP = J + IOFFD DO 30 I = 1, ITMP - 1 B( I, J ) = ZERO 30 CONTINUE DO 40 I = ITMP, M B( I, J ) = A( I, J ) 40 CONTINUE 50 CONTINUE ELSE DO 80 J = MN + 1, JTMP ITMP = J + IOFFD DO 60 I = 1, ITMP - 1 B( I, J ) = ZERO 60 CONTINUE B( ITMP, J ) = ONE DO 70 I = ITMP + 1, M B( I, J ) = A( I, J ) 70 CONTINUE 80 CONTINUE END IF * DO 100 J = JTMP + 1, N DO 90 I = 1, M B( I, J ) = ZERO 90 CONTINUE 100 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * JTMP = MAX( 0, -IOFFD ) * DO 120 J = 1, JTMP DO 110 I = 1, M B( I, J ) = ZERO 110 CONTINUE 120 CONTINUE * MN = MIN( M - IOFFD, N ) * IF( LSAME( DIAG, 'N' ) ) THEN DO 150 J = JTMP + 1, MN ITMP = J + IOFFD DO 130 I = 1, ITMP B( I, J ) = A( I, J ) 130 CONTINUE DO 140 I = ITMP + 1, M B( I, J ) = ZERO 140 CONTINUE 150 CONTINUE ELSE DO 180 J = JTMP + 1, MN ITMP = J + IOFFD DO 160 I = 1, ITMP - 1 B( I, J ) = A( I, J ) 160 CONTINUE B( ITMP, J ) = ONE DO 170 I = ITMP + 1, M B( I, J ) = ZERO 170 CONTINUE 180 CONTINUE END IF * DO 200 J = MAX( 0, MN ) + 1, N DO 190 I = 1, M B( I, J ) = A( I, J ) 190 CONTINUE 200 CONTINUE * ELSE * DO 220 J = 1, N DO 210 I = 1, M B( I, J ) = A( I, J ) 210 CONTINUE 220 CONTINUE * END IF * RETURN * * End of ZTZPADCPY * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/ztzscal.f000644 000766 000024 00000015302 10363532303 020160 0ustar00juliestaff000000 000000 SUBROUTINE ZTZSCAL( UPLO, M, N, IOFFD, ALPHA, A, LDA ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. CHARACTER*1 UPLO INTEGER IOFFD, LDA, M, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZTZSCAL scales a two-dimensional array A by the scalar alpha. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies which trapezoidal part of the ar- * ray A is to be scaled as follows: * = 'L' or 'l': the lower trapezoid of A is scaled, * = 'U' or 'u': the upper trapezoid of A is scaled, * = 'D' or 'd': diagonal specified by IOFFD is scaled, * Otherwise: all of the array A is scaled. * * M (input) INTEGER * On entry, M specifies the number of rows of the array A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the array A. * N must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha, i.e., the value * by which the diagonal and offdiagonal entries of the array A * as specified by UPLO and IOFFD are scaled. * * A (input/output) COMPLEX*16 array * On entry, A is an array of dimension (LDA,N). Before entry * with UPLO = 'U' or 'u', the leading m by n part of the array * A must contain the upper trapezoidal part of the matrix as * specified by IOFFD to be scaled, and the strictly lower tra- * pezoidal part of A is not referenced; When UPLO = 'L' or 'l', * the leading m by n part of the array A must contain the lower * trapezoidal part of the matrix as specified by IOFFD to be * scaled, and the strictly upper trapezoidal part of A is not * referenced. On exit, the entries of the trapezoid part of A * determined by UPLO and IOFFD are scaled. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d 'U' | | 'U' | * | 'L' 'D' | |d | * | d | M | d | * ---------------------------- | 'D' | * | d | * IOFFD < 0 | 'L' d | * | d| * N | | * ----------- ----------- * | d 'U'| * | d | IOFFD > 0 * M | 'D' | * | d| N * | 'L' | ---------------------------- * | | | 'U' | * | | |d | * | | | 'D' | * | | | d | * | | |'L' d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER J, JTMP, MN * .. * .. External Subroutines .. EXTERNAL ZSCAL, ZTZPAD * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Quick return if possible * IF( ( M.LE.0 ).OR.( N.LE.0 ).OR.( ALPHA.EQ.ONE ) ) $ RETURN * * Start the operations * IF( ALPHA.EQ.ZERO ) THEN CALL ZTZPAD( UPLO, 'N', M, N, IOFFD, ZERO, ZERO, A, LDA ) RETURN END IF * IF( LSAME( UPLO, 'L' ) ) THEN * * Scales the lower triangular part of the array by ALPHA. * MN = MAX( 0, -IOFFD ) DO 10 J = 1, MIN( MN, N ) CALL ZSCAL( M, ALPHA, A( 1, J ), 1 ) 10 CONTINUE DO 20 J = MN + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD IF( M.GE.JTMP ) $ CALL ZSCAL( M-JTMP+1, ALPHA, A( JTMP, J ), 1 ) 20 CONTINUE * ELSE IF( LSAME( UPLO, 'U' ) ) THEN * * Scales the upper triangular part of the array by ALPHA. * MN = MIN( M - IOFFD, N ) DO 30 J = MAX( 0, -IOFFD ) + 1, MN CALL ZSCAL( J + IOFFD, ALPHA, A( 1, J ), 1 ) 30 CONTINUE DO 40 J = MAX( 0, MN ) + 1, N CALL ZSCAL( M, ALPHA, A( 1, J ), 1 ) 40 CONTINUE * ELSE IF( LSAME( UPLO, 'D' ) ) THEN * * Scales the diagonal entries by ALPHA. * DO 50 J = MAX( 0, -IOFFD ) + 1, MIN( M - IOFFD, N ) JTMP = J + IOFFD A( JTMP, J ) = ALPHA * A( JTMP, J ) 50 CONTINUE * ELSE * * Scales the entire array by ALPHA. * DO 60 J = 1, N CALL ZSCAL( M, ALPHA, A( 1, J ), 1 ) 60 CONTINUE * END IF * RETURN * * End of ZTZSCAL * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zvvdotc.f000644 000766 000024 00000003563 10363532303 020173 0ustar00juliestaff000000 000000 SUBROUTINE ZVVDOTC( N, DOT, X, INCX, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N COMPLEX*16 DOT * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * .. * * Purpose * ======= * * ZVVDOTC computes the following dot product: * * dot = dot + x**H * y, * * where x and y are n vectors. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vectors x and y. N * must be at least zero. * * DOT (input/output) COMPLEX*16 * On exit, DOT is updated with the dot product of the vectors x * and y. * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented * array Y must contain the vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. COMPLEX*16 ZDOTC EXTERNAL ZDOTC * .. * .. Executable Statements .. * DOT = DOT + ZDOTC( N, X, INCX, Y, INCY ) * RETURN * * End of ZVVDOTC * END scalapack-2.0.2/PBLAS/SRC/PTZBLAS/zvvdotu.f000644 000766 000024 00000003563 10363532303 020215 0ustar00juliestaff000000 000000 SUBROUTINE ZVVDOTU( N, DOT, X, INCX, Y, INCY ) * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * .. Scalar Arguments .. INTEGER INCX, INCY, N COMPLEX*16 DOT * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * .. * * Purpose * ======= * * ZVVDOTU computes the following dot product: * * dot = dot + x**T * y, * * where x and y are n vectors. * * Arguments * ========= * * N (input) INTEGER * On entry, N specifies the length of the vectors x and y. N * must be at least zero. * * DOT (input/output) COMPLEX*16 * On exit, DOT is updated with the dot product of the vectors x * and y. * * X (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented * array X must contain the vector x. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (input) COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented * array Y must contain the vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * ===================================================================== * * .. External Functions .. COMPLEX*16 ZDOTU EXTERNAL ZDOTU * .. * .. Executable Statements .. * DOT = DOT + ZDOTU( N, X, INCX, Y, INCY ) * RETURN * * End of ZVVDOTU * END scalapack-2.0.2/PBLAS/SRC/PTOOLS/CMakeLists.txt000644 000766 000024 00000003710 11656312637 020774 0ustar00juliestaff000000 000000 set( ALLCTOOLS PB_CGatherV.c PB_CInV.c PB_CInV2.c PB_CInOutV.c PB_CInOutV2.c PB_COutV.c PB_CScatterV.c PB_CVMinit.c PB_CVMloc.c PB_CVMnpq.c PB_CVMpack.c PB_CVMswp.c PB_CVMupdate.c PB_CVMcontig.c PB_Cabort.c PB_Cainfog2l.c PB_Cbinfo.c PB_Cchkmat.c PB_Cchkvec.c PB_Cconjg.c PB_Cgetbuf.c PB_Cinfog2l.c PB_Citypeset.c PB_Cgcd.c PB_Clcm.c PB_Cmalloc.c PB_Cnumroc.c PB_Cg2lrem.c PB_Cindxg2p.c PB_Cnnxtroc.c PB_Cnpreroc.c PB_CpswapNN.c PB_CpswapND.c PB_Cpdot11.c PB_CpdotNN.c PB_CpdotND.c PB_CpaxpbyNN.c PB_CpaxpbyND.c PB_CpaxpbyDN.c PB_Cpaxpby.c PB_CpgemmBC.c PB_CpgemmAC.c PB_CpgemmAB.c PB_Cplaprnt.c PB_Cplapad.c PB_Cplapd2.c PB_Cplascal.c PB_Cplasca2.c PB_Cplacnjg.c PB_Cpsym.c PB_CpsymmAB.c PB_CpsymmBC.c PB_Cpsyr.c PB_CpsyrkA.c PB_CpsyrkAC.c PB_Cpsyr2.c PB_Cpsyr2kA.c PB_Cpsyr2kAC.c PB_Cptrm.c PB_Cpgeadd.c PB_Cptradd.c PB_Cptran.c PB_CptrmmAB.c PB_CptrmmB.c PB_Cptrsm.c PB_CptrsmAB.c PB_CptrsmAB0.c PB_CptrsmAB1.c PB_CptrsmB.c PB_Cptrsv.c PB_Ctop.c PB_Ctzahemv.c PB_Ctzasymv.c PB_Ctzatrmv.c PB_Ctzhemm.c PB_Ctzhemv.c PB_Ctzher.c PB_Ctzherk.c PB_Ctzher2.c PB_Ctzher2k.c PB_Ctzsymm.c PB_Ctzsymv.c PB_Ctzsyr.c PB_Ctzsyrk.c PB_Ctzsyr2.c PB_Ctzsyr2k.c PB_Ctztrmm.c PB_Ctztrmv.c PB_Cwarn.c PB_freebuf_.c PB_topget_.c PB_topset_.c PB_Cdescset.c PB_Cdescribe.c PB_CargFtoC.c PB_Cfirstnb.c PB_Clastnb.c PB_Cspan.c) set( SCTOOLS PB_Cstypeset.c) set( DCTOOLS PB_Cdtypeset.c) set( CCTOOLS PB_Cctypeset.c) set( ZCTOOLS PB_Cztypeset.c) set(ptools ${ALLCTOOLS} ${SCTOOLS} ${DCTOOLS} ${CCTOOLS} ${ZCTOOLS}) scalapack-2.0.2/PBLAS/SRC/PTOOLS/Makefile000644 000766 000024 00000011764 11654025546 017702 0ustar00juliestaff000000 000000 ############################################################################ # # Program: PBLAS -- (version 2.0) # # Module: Makefile # # Purpose: PTOOLS makefile # # Creation date: April 1, 1998 # # Modified: February 15, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../../SLmake.inc ############################################################################ # # This is the makefile to create a library for the PBLAS F77 tools. # The files are grouped as follows: # # ALLTOOLS -- Auxiliary routines for Level 1, 2 and 3 PBLAS # # STOOLS -- Single precision real PBLAS Level F77 tools routines # CTOOLS -- Single precision complex PBLAS Level F77 tools routines # DTOOLS -- Double precision real PBLAS Level F77 tools routines # ZTOOLS -- Double precision complex PBLAS Level F77 tools routines # # The library can be set up to include routines for any combination # of the four precisions. First, modify the ../../../SLmake.inc file # definitions to match your compiler and the options to be used. # Then to create or add to the library, enter make followed by one or # more of the precisions desired. Some examples: # make single # make single complex # make single double complex complex16 # Alternatively, the command # make # without any arguments creates a library of all four precisions. # # To remove the object files after the library is created, enter # make clean # ############################################################################ all: single double complex complex16 #--------------------------------------------------------------------------- # The following line defines auxiliary routines needed by the PBLAS tools #--------------------------------------------------------------------------- ALLCTOOLS = \ PB_CGatherV.o PB_CInV.o PB_CInV2.o PB_CInOutV.o \ PB_CInOutV2.o PB_COutV.o PB_CScatterV.o PB_CVMinit.o \ PB_CVMloc.o PB_CVMnpq.o PB_CVMpack.o PB_CVMswp.o \ PB_CVMupdate.o PB_CVMcontig.o PB_Cabort.o PB_Cainfog2l.o \ PB_Cbinfo.o PB_Cchkmat.o PB_Cchkvec.o PB_Cconjg.o \ PB_Cgetbuf.o PB_Cinfog2l.o PB_Citypeset.o PB_Cgcd.o \ PB_Clcm.o PB_Cmalloc.o PB_Cnumroc.o PB_Cg2lrem.o \ PB_Cindxg2p.o PB_Cnnxtroc.o PB_Cnpreroc.o PB_CpswapNN.o \ PB_CpswapND.o PB_Cpdot11.o PB_CpdotNN.o PB_CpdotND.o \ PB_CpaxpbyNN.o PB_CpaxpbyND.o PB_CpaxpbyDN.o PB_Cpaxpby.o \ PB_CpgemmBC.o PB_CpgemmAC.o PB_CpgemmAB.o PB_Cplaprnt.o \ PB_Cplapad.o PB_Cplapd2.o PB_Cplascal.o PB_Cplasca2.o \ PB_Cplacnjg.o PB_Cpsym.o PB_CpsymmAB.o PB_CpsymmBC.o \ PB_Cpsyr.o PB_CpsyrkA.o PB_CpsyrkAC.o PB_Cpsyr2.o \ PB_Cpsyr2kA.o PB_Cpsyr2kAC.o PB_Cptrm.o PB_Cpgeadd.o \ PB_Cptradd.o PB_Cptran.o PB_CptrmmAB.o PB_CptrmmB.o \ PB_Cptrsm.o PB_CptrsmAB.o PB_CptrsmAB0.o PB_CptrsmAB1.o \ PB_CptrsmB.o PB_Cptrsv.o PB_Ctop.o PB_Ctzahemv.o \ PB_Ctzasymv.o PB_Ctzatrmv.o PB_Ctzhemm.o PB_Ctzhemv.o \ PB_Ctzher.o PB_Ctzherk.o PB_Ctzher2.o PB_Ctzher2k.o \ PB_Ctzsymm.o PB_Ctzsymv.o PB_Ctzsyr.o PB_Ctzsyrk.o \ PB_Ctzsyr2.o PB_Ctzsyr2k.o PB_Ctztrmm.o PB_Ctztrmv.o \ PB_Cwarn.o PB_freebuf_.o PB_topget_.o PB_topset_.o \ PB_Cdescset.o PB_Cdescribe.o PB_CargFtoC.o PB_Cfirstnb.o \ PB_Clastnb.o PB_Cspan.o #--------------------------------------------------------------------------- ALLTOOLS = $(ALLCTOOLS) #--------------------------------------------------------------------------- # The C PBLAS tools #--------------------------------------------------------------------------- SCTOOLS = PB_Cstypeset.o DCTOOLS = PB_Cdtypeset.o CCTOOLS = PB_Cctypeset.o ZCTOOLS = PB_Cztypeset.o #--------------------------------------------------------------------------- STOOLS = $(SCTOOLS) CTOOLS = $(CCTOOLS) $(SCTOOLS) DTOOLS = $(DCTOOLS) ZTOOLS = $(ZCTOOLS) $(DCTOOLS) #--------------------------------------------------------------------------- SPTOOLS = $(ALLTOOLS) $(STOOLS) CPTOOLS = $(ALLTOOLS) $(CTOOLS) DPTOOLS = $(ALLTOOLS) $(DTOOLS) ZPTOOLS = $(ALLTOOLS) $(ZTOOLS) single: $(SPTOOLS) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(SPTOOLS) $(RANLIB) ../../../$(SCALAPACKLIB) complex: $(CPTOOLS) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(CPTOOLS) $(RANLIB) ../../../$(SCALAPACKLIB) double: $(DPTOOLS) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(DPTOOLS) $(RANLIB) ../../../$(SCALAPACKLIB) complex16: $(ZPTOOLS) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(ZPTOOLS) $(RANLIB) ../../../$(SCALAPACKLIB) clean: rm -f *.o .f.o : ; $(FC) -c $(FCFLAGS) $*.f .c.o : ; $(CC) -c $(CDEFS) $(CCFLAGS) $*.c scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cabort.c000644 000766 000024 00000010247 10363532303 020162 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #ifdef TestingPblas #include "../SRC/pblas.h" #include "../SRC/PBpblas.h" #include "../SRC/PBtools.h" #include "../SRC/PBblacs.h" #include "../SRC/PBblas.h" #else #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #endif /* * --------------------------------------------------------------------- * FORTRAN <-> C interface * --------------------------------------------------------------------- * * These macros identifies how the PBLAS will be called as follows: * * _F2C_ADD_: the FORTRAN compiler expects the name of C functions to be * in all lower case and to have an underscore postfixed it (Suns, Intel * compilers expect this). * * _F2C_NOCHANGE: the FORTRAN compiler expects the name of C functions * to be in all lower case (IBM RS6K compilers do this). * * _F2C_UPCASE: the FORTRAN compiler expects the name of C functions * to be in all upcase. (Cray compilers expect this). * * _F2C_F77ISF2C: the FORTRAN compiler in use is f2c, a FORTRAN to C * converter. */ #if (_F2C_CALL_ == _F2C_ADD_ ) #define PB_NoAbort pb_noabort_ #endif #if (_F2C_CALL_ == _F2C_UPCASE ) #define PB_NoAbort PB_NOABORT #endif #if (_F2C_CALL_ == _F2C_NOCHANGE ) #define PB_NoAbort pb_noabort #endif #if (_F2C_CALL_ == _F2C_F77ISF2C ) #define PB_NoAbort pb_noabort__ #endif #ifdef __STDC__ void PB_Cabort( int ICTXT, char * ROUT, int INFO ) #else void PB_Cabort( ICTXT, ROUT, INFO ) /* * .. Scalar Arguments .. */ int ICTXT, INFO; /* * .. Array Arguments .. */ char * ROUT; #endif { /* * Purpose * ======= * * PB_Cabort is an error handler for the PBLAS routines. This routine * displays an error message on stderr by calling PB_Cwarn, and halts * execution by calling Cblacs_abort(). * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * ROUT (global input) pointer to CHAR * On entry, ROUT specifies the name of the routine calling this * error handler. * * INFO (local input) INTEGER * The error code computed by the calling PBLAS routine. * = 0: no error found * < 0: If the i-th argument is an array and the j-entry had * an illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int mycol, myrow, npcol, nprow; /* .. * .. External Functions .. */ #ifdef TestingPblas #ifdef __STDC__ int PB_NoAbort( int * ); #else int PB_NoAbort(); #endif #endif /* .. * .. Executable Statements .. * */ Cblacs_gridinfo( ICTXT, &nprow, &npcol, &myrow, &mycol ); #ifdef TestingPblas /* * For testing purpose only, the error is reported, but the program execution * is not terminated */ if( PB_NoAbort( &INFO ) ) return; #endif if( INFO < 0 ) { /* * Display an error message */ if( INFO < DESCMULT ) PB_Cwarn( ICTXT, -1, ROUT, "Parameter number %d had an illegal value", -INFO ); else PB_Cwarn( ICTXT, -1, ROUT, "Parameter number %d, entry number %d had an illegal value", (-INFO) / DESCMULT, (-INFO) % DESCMULT ); } else { /* * Error code is incorrect, it should be negative */ PB_Cwarn( ICTXT, -1, ROUT, "Positive error code %d returned by %s!!!", INFO ); } Cblacs_abort( ICTXT, INFO ); /* * End of PB_Cabort */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cainfog2l.c000644 000766 000024 00000053606 10363532303 020562 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cainfog2l( int M, int N, int I, int J, int * DESC, int NPROW, int NPCOL, int MYROW, int MYCOL, int * IMB1, int * INB1, int * MP, int * NQ, int * II, int * JJ, int * PROW, int * PCOL, int * RPROW, int * RPCOL ) #else void PB_Cainfog2l( M, N, I, J, DESC, NPROW, NPCOL, MYROW, MYCOL, IMB1, INB1, MP, NQ, II, JJ, PROW, PCOL, RPROW, RPCOL ) /* * .. Scalar Arguments .. */ int I, * II, * IMB1, * INB1, J, * JJ, M, * MP, MYCOL, MYROW, N, NPCOL, NPROW, * NQ, * PCOL, * PROW, * RPCOL, * RPROW; /* * .. Array Arguments .. */ int * DESC; #endif { /* * Purpose * ======= * * PB_Cainfog2l computes the starting local row and column indexes II, * JJ corresponding to the submatrix starting globally at the entry * pointed by I, J. This routine returns the coordinates in the grid of * the process owning the matrix entry of global indexes I, J, namely * PROW and PCOL. In addition, this routine computes the quantities MP * and NQ, which are respectively the local number of rows and columns * owned by the process of coordinate MYROW, MYCOL corresponding to the * global submatrix A(I:I+M-1,J:J+N-1). Finally, the size of the first * partial block and the relative process coordinates are also returned * respectively in IMB, INB and RPROW, RPCOL. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the global number of rows of the subma- * trix. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the global number of columns of the * submatrix. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global starting row index of the * submatrix. I must at least zero. * * J (global input) INTEGER * On entry, J specifies the global starting column index of * the submatrix. J must at least zero. * * DESC (global and local input) INTEGER array * On entry, DESC is an integer array of dimension DLEN_. This * is the array descriptor of the underlying matrix. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process columns * over which the matrix is distributed. NPCOL must be at least * one. * * MYROW (local input) INTEGER * On entry, MYROW specifies the row coordinate of the process * whose local index II is determined. MYROW must be at least * zero and strictly less than NPROW. * * MYCOL (local input) INTEGER * On entry, MYCOL specifies the column coordinate of the pro- * cess whose local index JJ is determined. MYCOL must be at * least zero and strictly less than NPCOL. * * IMB1 (global output) INTEGER * On exit, IMB1 specifies the number of rows of the upper left * block of the submatrix. On exit, IMB1 is less or equal than * M and greater or equal than MIN( 1, M ). * * INB1 (global output) INTEGER * On exit, INB1 specifies the number of columns of the upper * left block of the submatrix. On exit, INB1 is less or equal * than N and greater or equal than MIN( 1, N ). * * MP (local output) INTEGER * On exit, MP specifies the local number of rows of the subma- * trix, that the processes of row coordinate MYROW own. MP is * at least zero. * * NQ (local output) INTEGER * On exit, NQ specifies the local number of columns of the * submatrix, that the processes of column coordinate MYCOL * own. NQ is at least zero. * * II (local output) INTEGER * On exit, II specifies the local starting row index of the * submatrix. On exit, II is at least zero. * * JJ (local output) INTEGER * On exit, JJ specifies the local starting column index of * the submatrix. On exit, II is at least zero. * * PROW (global output) INTEGER * On exit, PROW specifies the row coordinate of the process * that possesses the first row of the submatrix. On exit, PROW * is -1 if DESC(RSRC_) is -1 on input, and, at least zero and * strictly less than NPROW otherwise. * * PCOL (global output) INTEGER * On exit, PCOL specifies the column coordinate of the process * that possesses the first column of the submatrix. On exit, * PCOL is -1 if DESC(CSRC_) is -1 on input, and, at least zero * and strictly less than NPCOL otherwise. * * RPROW (global output) INTEGER * On exit, RPROW specifies the relative row coordinate of the * process that possesses the first row I of the submatrix. On * exit, RPROW is -1 if DESC(RSRC_) is -1 on input, and, at * least zero and strictly less than NPROW otherwise. * * RPCOL (global output) INTEGER * On exit, RPCOL specifies the relative column coordinate of * the process that possesses the first column J of the subma- * trix. On exit, RPCOL is -1 if DESC(CSRC_) is -1 on input, * and, at least zero and strictly less than NPCOL otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int i1, ilocblk, j1, mb, mydist, nb, nblocks, csrc, rsrc; /* .. * .. Executable Statements .. * */ /* * Retrieve the row distribution parameters */ mb = DESC[ MB_ ]; rsrc = DESC[ RSRC_ ]; if( ( rsrc == -1 ) || ( NPROW == 1 ) ) { /* * The rows are not distributed, or there is just one process row in the grid. * Therefore, the local and global indexes are the same, as well as the local * and global number of rows. Finally, the relative row process coordinate is * zero, since every process owns all rows. Note that the size of the first * row block can be zero only if M is zero. */ *II = I; if( ( *IMB1 = DESC[IMB_] - I ) <= 0 ) *IMB1 += ( ( -(*IMB1) ) / mb + 1 ) * mb; *IMB1 = MIN( *IMB1, M ); *MP = M; *PROW = rsrc; *RPROW = 0; } else { /* * Figure out PROW, II and IMB1 first. */ *IMB1 = DESC[IMB_]; if( I < *IMB1 ) /* Is I in first block range ? */ { /* * If I is in the first block of rows, then PROW is simply rsrc, II is I in * this process and zero elsewhere, and the size of the first block is the * IMB complement. */ *PROW = rsrc; *II = ( ( MYROW == *PROW ) ? I : 0 ); *IMB1 -= I; } else { /* * The discussion goes as follows: compute my distance from the source process * so that within this process coordinate system, the source row process is the * process such that mydist=0, or equivalently MYROW == rsrc. * * Find out the global coordinate of the block of rows I belongs to (nblocks), * as well as the minimum local number of row blocks that every process has. * * when mydist < nblocks - ilocblk * NPROW, I own ilocblk + 1 full blocks, * when mydist > nblocks - ilocblk * NPROW, I own ilocblk full blocks, * when mydist = nblocks - ilocblk * NPROW, I own ilocblk full blocks * but not I, or I own ilocblk + 1 blocks and the entry I refers to. */ i1 = I - *IMB1; if( MYROW == rsrc ) { /* * I refers to an entry that is not in the first block, find out which process * has it. */ nblocks = i1 / mb + 1; *PROW = rsrc + nblocks; *PROW -= ( *PROW / NPROW ) * NPROW; /* * Since mydist = 0 and nblocks - ilocblk * NPROW >= 0, there are only three * possible cases: * * 1) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I don't own I, in * which case II = IMB + ( ilocblk - 1 ) * MB. Note that this case cannot * happen when ilocblk is zero, since nblocks is at least one. * * 2) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I own I, in which * case I and II can respectively be written as IMB + (nblocks-1)*MB + IL * and IMB+(ilocblk-1) * MB + IL. That is II = I + (ilocblk - nblocks)*MB. * Note that this case cannot happen when ilocblk is zero, since nblocks * is at least one. * * 3) mydist = 0 < nblocks - ilocblk * NPROW, the source process owns * ilocblk+1 full blocks, and therefore II = IMB + ilocblk * MB. Note * that when ilocblk is zero, II is just IMB. */ if( nblocks < NPROW ) { *II = *IMB1; } else { ilocblk = nblocks / NPROW; if( ilocblk * NPROW >= nblocks ) { *II = ( ( MYROW == *PROW ) ? I + ( ilocblk - nblocks ) * mb : *IMB1 + ( ilocblk - 1 ) * mb ); } else { *II = *IMB1 + ilocblk * mb; } } } else { /* * I is not in the first block, find out which process has it. */ nblocks = i1 / mb + 1; *PROW = rsrc + nblocks; *PROW -= ( *PROW / NPROW ) * NPROW; /* * Compute my distance from the source process so that within this process * coordinate system, the source process is the process such that mydist=0. */ if( ( mydist = MYROW - rsrc ) < 0 ) mydist += NPROW; /* * When mydist < nblocks - ilocblk * NPROW, I own ilocblk + 1 full blocks of * size MB since I am not the source process, i.e. II = ( ilocblk + 1 ) * MB. * When mydist >= nblocks - ilocblk * NPROW and I don't own I, I own ilocblk * full blocks of size MB, i.e. II = ilocblk * MB, otherwise I own ilocblk * blocks and I, in which case I can be written as IMB + (nblocks-1)*MB + IL * and II = ilocblk*MB + IL = I - IMB + ( ilocblk - nblocks + 1 )*MB. */ if( nblocks < NPROW ) { mydist -= nblocks; *II = ( ( mydist < 0 ) ? mb : ( ( MYROW == *PROW ) ? i1 + ( 1 - nblocks ) * mb : 0 ) ); } else { ilocblk = nblocks / NPROW; mydist -= nblocks - ilocblk * NPROW; *II = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * mb : ( ( MYROW == *PROW ) ? ( ilocblk - nblocks + 1 ) * mb + i1 : ilocblk * mb ) ); } } /* * Update the size of first block */ *IMB1 = nblocks * mb - i1; } /* * Now everything is just like M, I=0, IMB1, MB, PROW, NPROW. The discussion * goes as follows: compute my distance from the source process PROW so that * within this process coordinate system, the source process is the process * such that mydist = 0. Figure out MP. */ if( M <= *IMB1 ) { /* * M <= IMB1: if I am the source process, i.e. I own I (mydist = 0), MP is M * and 0 otherwise. */ *MP = ( ( MYROW == *PROW ) ? M : 0 ); } else { /* * Find out how many full blocks are globally (nblocks) and locally (ilocblk) * in those M entries */ nblocks = ( M - *IMB1 ) / mb + 1; if( MYROW == *PROW ) { /* * Since mydist = 0 and nblocks - ilocblk * NPROW >= 0, there are only two * possible cases: * * 1) When mydist = nblocks - ilocblk * NPROW = 0, that is NPROW divides * the global number of full blocks, then the source process PROW owns * one more block than the other processes; and M can be rewritten as * M = IMB1 + (nblocks-1) * NB + LNB with LNB >= 0 size of the last block. * Similarly, the local value MP corresponding to M can be written as * MP = IMB1 + (ilocblk-1) * MB + LMB = M + ( ilocblk-1 - (nblocks-1) )*MB. * Note that this case cannot happen when ilocblk is zero, since nblocks * is at least one. * * 2) mydist = 0 < nblocks - ilocblk * NPROW, the source process only owns * full blocks, and therefore MP = IMB1 + ilocblk * MB. Note that when * ilocblk is zero, MP is just IMB1. */ if( nblocks < NPROW ) { *MP = *IMB1; } else { ilocblk = nblocks / NPROW; *MP = ( ( nblocks - ilocblk * NPROW ) ? *IMB1 + ilocblk * mb : M + ( ilocblk - nblocks ) * mb ); } } else { /* * Compute my distance from the source process so that within this process * coordinate system, the source process is the process such that mydist=0. */ if( ( mydist = MYROW - *PROW ) < 0 ) mydist += NPROW; /* * When mydist < nblocks - ilocblk * NPROW, I own ilocblk + 1 full blocks of * size MB since I am not the source process, * * when mydist > nblocks - ilocblk * NPROW, I own ilocblk full blocks of * size MB since I am not the source process, * * when mydist = nblocks - ilocblk * NPROW, * either the last block is not full and I own it, in which case * M = IMB1 + (nblocks - 1)*MB + LMB with LNB the size of the last block * such that MB > LMB > 0; the local value MP corresponding to M is given * by MP = ilocblk * MB + LMB = M - IMB1 + ( ilocblk - nblocks + 1 ) * MB; * or the last block is full and I am the first process owning only ilocblk * full blocks of size MB, that is M = IMB + ( nblocks - 1 ) * MB and * MP = ilocblk * MB = M - IMB + ( ilocblk - nblocks + 1 ) * MB. */ if( nblocks < NPROW ) { mydist -= nblocks; *MP = ( ( mydist < 0 ) ? mb : ( ( mydist > 0 ) ? 0 : M - *IMB1 + mb * ( 1 - nblocks ) ) ); } else { ilocblk = nblocks / NPROW; mydist -= nblocks - ilocblk * NPROW; *MP = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * mb : ( ( mydist > 0 ) ? ilocblk * mb : M - *IMB1 + mb * ( ilocblk - nblocks + 1 ) ) ); } } } /* * Finally figure out IMB1 and RPROW. Note that IMB1 can be zero when M = 0. */ *IMB1 = MIN( *IMB1, M ); if( ( *RPROW = MYROW - *PROW ) < 0 ) *RPROW += NPROW; } /* * Idem for the columns */ nb = DESC[ NB_ ]; csrc = DESC[ CSRC_ ]; if( ( csrc == -1 ) || ( NPCOL == 1 ) ) { *JJ = J; if( ( *INB1 = DESC[INB_] - J ) <= 0 ) *INB1 += ( ( -(*INB1) ) / nb + 1 ) * nb; *INB1 = MIN( *INB1, N ); *NQ = N; *PCOL = csrc; *RPCOL = 0; } else { *INB1 = DESC[INB_]; if( J < *INB1 ) { *PCOL = csrc; *JJ = ( ( MYCOL == *PCOL ) ? J : 0 ); *INB1 -= J; } else { j1 = J - *INB1; if( MYCOL == csrc ) { nblocks = j1 / nb + 1; *PCOL = csrc + nblocks; *PCOL -= ( *PCOL / NPCOL ) * NPCOL; if( nblocks < NPCOL ) { *JJ = *INB1; } else { ilocblk = nblocks / NPCOL; if( ilocblk * NPCOL >= nblocks ) { *JJ = ( ( MYCOL == *PCOL ) ? J + ( ilocblk - nblocks ) * nb : *INB1 + ( ilocblk - 1 ) * nb ); } else { *JJ = *INB1 + ilocblk * nb; } } } else { nblocks = j1 / nb + 1; *PCOL = csrc + nblocks; *PCOL -= ( *PCOL / NPCOL ) * NPCOL; if( ( mydist = MYCOL - csrc ) < 0 ) mydist += NPCOL; if( nblocks < NPCOL ) { mydist -= nblocks; *JJ = ( ( mydist < 0 ) ? nb : ( ( MYCOL == *PCOL ) ? j1 + ( 1 - nblocks ) * nb : 0 ) ); } else { ilocblk = nblocks / NPCOL; mydist -= nblocks - ilocblk * NPCOL; *JJ = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * nb : ( ( MYCOL == *PCOL ) ? ( ilocblk - nblocks + 1 ) * nb + j1 : ilocblk * nb ) ); } } *INB1 = nblocks * nb - j1; } if( N <= *INB1 ) { *NQ = ( ( MYCOL == *PCOL ) ? N : 0 ); } else { nblocks = ( N - *INB1 ) / nb + 1; if( MYCOL == *PCOL ) { if( nblocks < NPCOL ) { *NQ = *INB1; } else { ilocblk = nblocks / NPCOL; *NQ = ( ( nblocks - ilocblk * NPCOL ) ? *INB1 + ilocblk * nb : N + ( ilocblk - nblocks ) * nb ); } } else { if( ( mydist = MYCOL - *PCOL ) < 0 ) mydist += NPCOL; if( nblocks < NPCOL ) { mydist -= nblocks; *NQ = ( ( mydist < 0 ) ? nb : ( ( mydist > 0 ) ? 0 : N - *INB1 + nb * ( 1 - nblocks ) ) ); } else { ilocblk = nblocks / NPCOL; mydist -= nblocks - ilocblk * NPCOL; *NQ = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * nb : ( ( mydist > 0 ) ? ilocblk * nb : N - *INB1 + nb * ( ilocblk - nblocks + 1 ) ) ); } } } *INB1 = MIN( *INB1, N ); if( ( *RPCOL = MYCOL - *PCOL ) < 0 ) *RPCOL += NPCOL; } /* * End of PB_Cainfog2l */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CargFtoC.c000644 000766 000024 00000025300 10363532303 020374 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CargFtoC( int IF, int JF, int * DESCIN, int * IC, int * JC, int * DESCOUT ) #else void PB_CargFtoC( IF, JF, DESCIN, IC, JC, DESCOUT ) /* * .. Scalar Arguments .. */ int IF, JF, * IC, * JC; /* * .. Array Arguments .. */ int * DESCIN, * DESCOUT; #endif { /* * Purpose * ======= * * PB_CargFtoC converts a descriptor DESCIN of type BLOCK_CYCLIC_2D * or BLOCK_CYCLIC_INB_2D into a descriptor DESCOUT of type * BLOCK_CYCLIC_INB_2D. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESCA: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ----------------------------------- * DTYPE_A (global) DESCA( DTYPE1_ ) The descriptor type. * CTXT_A (global) DESCA( CTXT1_ ) The BLACS context handle indicating * the NPROW x NPCOL BLACS process * grid A is distributed over. The * context itself is global, but the * handle (the integer value) may * vary. * M_A (global) DESCA( M1_ ) The number of rows in the distri- * buted matrix A, M_A >= 0. * N_A (global) DESCA( N1_ ) The number of columns in the dis- * tributed matrix A, N_A >= 0. * MB_A (global) DESCA( MB1_ ) The blocking factor used to distri- * bute the rows of A, MB_A > 0. * NB_A (global) DESCA( NB1_ ) The blocking factor used to distri- * bute the columns of A, NB_A > 0. * RSRC_A (global) DESCA( RSRC1_ ) The process row over which the * first row of the matrix A is dis- * tributed, NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA( CSRC1_ ) The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA( LLD1_ ) The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, MB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, NB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * IF (global input) INTEGER * On entry, IF specifies the global row Fortran index of the * distributed subarray described by DESCIN. IF must be at least * one. * * JF (global input) INTEGER * On entry, JF specifies the global column Fortran index of * the distributed subarray described by DESCIN. JF must be at * least one. * * DESCIN (global and local input) INTEGER array * On entry, DESCIN is an array of dimension DLEN1_ or DLEN_ as * specified by its first entry DESCIN( DTYPE_ ). DESCIN is the * source array descriptor of type BLOCK_CYCLIC_2D or of type * BLOCK_CYCLIC_2D_INB. * * IC (global output) INTEGER * On exit, IC specifies the global row C index of the distribu- * ted subarray described by DESCOUT. IC = IF - 1, i.e IC is at * least zero. * * JC (global output) INTEGER * On entry, JC specifies the global column Fortran index of * the distributed subarray described by DESCOUT. JC = JF - 1, * i.e JC is at least zero. * * DESCOUT (global and local output) INTEGER array * On entry, DESCOUT is an array of dimension DLEN_. DESCOUT is * the target array descriptor of type BLOCK_CYCLIC_2D_INB. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ *IC = IF - 1; *JC = JF - 1; if ( DESCIN[DTYPE_] == BLOCK_CYCLIC_2D ) { DESCOUT[DTYPE_] = BLOCK_CYCLIC_2D_INB; DESCOUT[M_ ] = DESCIN[M1_ ]; DESCOUT[N_ ] = DESCIN[N1_ ]; DESCOUT[IMB_ ] = DESCIN[MB1_ ]; DESCOUT[INB_ ] = DESCIN[NB1_ ]; DESCOUT[MB_ ] = DESCIN[MB1_ ]; DESCOUT[NB_ ] = DESCIN[NB1_ ]; DESCOUT[RSRC_ ] = DESCIN[RSRC1_ ]; DESCOUT[CSRC_ ] = DESCIN[CSRC1_ ]; DESCOUT[CTXT_ ] = DESCIN[CTXT1_ ]; DESCOUT[LLD_ ] = DESCIN[LLD1_ ]; } else if ( DESCIN[DTYPE_] == BLOCK_CYCLIC_2D_INB ) { DESCOUT[DTYPE_] = BLOCK_CYCLIC_2D_INB; DESCOUT[M_ ] = DESCIN[M_ ]; DESCOUT[N_ ] = DESCIN[N_ ]; DESCOUT[IMB_ ] = DESCIN[IMB_ ]; DESCOUT[INB_ ] = DESCIN[INB_ ]; DESCOUT[MB_ ] = DESCIN[MB_ ]; DESCOUT[NB_ ] = DESCIN[NB_ ]; DESCOUT[RSRC_ ] = DESCIN[RSRC_ ]; DESCOUT[CSRC_ ] = DESCIN[CSRC_ ]; DESCOUT[CTXT_ ] = DESCIN[CTXT_ ]; DESCOUT[LLD_ ] = DESCIN[LLD_ ]; } else { DESCOUT[DTYPE_] = DESCIN[0]; DESCOUT[CTXT_ ] = DESCIN[1]; DESCOUT[M_ ] = 0; DESCOUT[N_ ] = 0; DESCOUT[IMB_ ] = 1; DESCOUT[INB_ ] = 1; DESCOUT[MB_ ] = 1; DESCOUT[NB_ ] = 1; DESCOUT[RSRC_ ] = 0; DESCOUT[CSRC_ ] = 0; DESCOUT[LLD_ ] = 1; } /* * End of PB_CargFtoC */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cbinfo.c000644 000766 000024 00000024155 10363532303 020153 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cbinfo( int OFFD, int M, int N, int IMB1, int INB1, int MB, int NB, int MRROW, int MRCOL, int * LCMT00, int * MBLKS, int * NBLKS, int * IMBLOC, int * INBLOC, int * LMBLOC, int * LNBLOC, int * ILOW, int * LOW, int * IUPP, int * UPP ) #else void PB_Cbinfo( OFFD, M, N, IMB1, INB1, MB, NB, MRROW, MRCOL, LCMT00, MBLKS, NBLKS, IMBLOC, INBLOC, LMBLOC, LNBLOC, ILOW, LOW, IUPP, UPP ) /* * .. Scalar Arguments .. */ int * ILOW, IMB1, * IMBLOC, INB1, * INBLOC, * IUPP, * LCMT00, * LMBLOC, * LNBLOC, * LOW, M, MB, * MBLKS, MRCOL, MRROW, N, NB, * NBLKS, OFFD, * UPP; #endif { /* * Purpose * ======= * * PB_Cbinfo initializes the local information of an m by n local array * owned by the process of relative coordinates ( MRROW, MRCOL ). Note * that if m or n is less or equal than zero, there is no data, in which * case this process does not need the local information computed by * this routine to proceed. * * Arguments * ========= * * OFFD (global input) INTEGER * On entry, OFFD specifies the off-diagonal of the underlying * matrix of interest as follows: * OFFD = 0 specifies the main diagonal, * OFFD > 0 specifies lower subdiagonals, and * OFFD < 0 specifies upper superdiagonals. * * M (local input) INTEGER * On entry, M specifies the local number of rows of the under- * lying matrix owned by the process of relative coordinates * ( MRROW, MRCOL ). M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the local number of columns of the un- * derlying matrix owned by the process of relative coordinates * ( MRROW, MRCOL ). N must be at least zero. * * IMB1 (global input) INTEGER * On input, IMB1 specifies the global true size of the first * block of rows of the underlying global submatrix. IMB1 must * be at least MIN( 1, M ). * * INB1 (global input) INTEGER * On input, INB1 specifies the global true size of the first * block of columns of the underlying global submatrix. INB1 * must be at least MIN( 1, N ). * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * MRROW (local input) INTEGER * On entry, MRROW specifies the relative row coordinate of the * process that possesses these M rows. MRROW must be least zero * and strictly less than NPROW. * * MRCOL (local input) INTEGER * On entry, MRCOL specifies the relative column coordinate of * the process that possesses these N columns. MRCOL must be * least zero and strictly less than NPCOL. * * LCMT00 (local output) INTEGER * On exit, LCMT00 is the LCM value of the left upper block of * this m by n local block owned by the process of relative co- * ordinates ( MRROW, MRCOL ). * * MBLKS (local output) INTEGER * On exit, MBLKS specifies the local number of blocks of rows * corresponding to M. MBLKS must be at least zero. * * NBLKS (local output) INTEGER * On exit, NBLKS specifies the local number of blocks of co- * lumns corresponding to N. NBLKS must be at least zero. * * IMBLOC (local output) INTEGER * On exit, IMBLOC specifies the number of rows (size) of the * uppest blocks of this m by n local array owned by the process * of relative coordinates ( MRROW, MRCOL ). IMBLOC is at least * MIN( 1, M ). * * INBLOC (local output) INTEGER * On exit, INBLOC specifies the number of columns (size) of * the leftmost blocks of this m by n local array owned by the * process of relative coordinates ( MRROW, MRCOL ). INBLOC is * at least MIN( 1, N ). * * LMBLOC (local output) INTEGER * On exit, LMBLOC specifies the number of rows (size) of the * lowest blocks of this m by n local array owned by the process * of relative coordinates ( MRROW, MRCOL ). LMBLOC is at least * MIN( 1, M ). * * LNBLOC (local output) INTEGER * On exit, LNBLOC specifies the number of columns (size) of the * rightmost blocks of this m by n local array owned by the * process of relative coordinates ( MRROW, MRCOL ). LNBLOC is * at least MIN( 1, N ). * * ILOW (local output) INTEGER * On exit, ILOW is the lower bound characterizing the first co- * lumn block owning offdiagonals of this m by n array. ILOW * must be less or equal than zero. * * LOW (global output) INTEGER * On exit, LOW is the lower bound characterizing the column * blocks with te exception of the first one (see ILOW) owning * offdiagonals of this m by n array. LOW must be less or equal * than zero. * * IUPP (local output) INTEGER * On exit, IUPP is the upper bound characterizing the first row * block owning offdiagonals of this m by n array. IUPP must be * greater or equal than zero. * * UPP (global output) INTEGER * On exit, UPP is the upper bound characterizing the row * blocks with te exception of the first one (see IUPP) owning * offdiagonals of this m by n array. UPP must be greater or * equal than zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int tmp1; /* .. * .. Executable Statements .. * */ /* * Initialize LOW, ILOW, UPP, IUPP, LMBLOC, LNBLOC, IMBLOC, INBLOC, MBLKS, * NBLKS and LCMT00. */ *LOW = 1 - NB; *UPP = MB - 1; *LCMT00 = OFFD; if( ( M <= 0 ) || ( N <= 0 ) ) { /* * If the local virtual array is empty, then simplify the remaining of the * initialization. */ *IUPP = ( MRROW ? MB - 1 : ( IMB1 > 0 ? IMB1 - 1 : 0 ) ); *IMBLOC = 0; *MBLKS = 0; *LMBLOC = 0; *ILOW = ( MRCOL ? 1 - NB : ( INB1 > 0 ? 1 - INB1 : 0 ) ); *INBLOC = 0; *NBLKS = 0; *LNBLOC = 0; *LCMT00 += ( *LOW - *ILOW + MRCOL * NB ) - ( *IUPP - *UPP + MRROW * MB ); return; } if( MRROW ) { /* * I am not in the first relative process row. Use the first local row block * size MB to initialize the VM structure. */ *IMBLOC = MIN( M, MB ); *IUPP = MB - 1; *LCMT00 -= IMB1 - MB + MRROW * MB; *MBLKS = ( M - 1 ) / MB + 1; *LMBLOC = M - ( M / MB ) * MB; if( !( *LMBLOC ) ) *LMBLOC = MB; if( MRCOL ) { /* * I am not in the first relative process column. Use the first local column * block size NB to initialize the VM structure. */ *INBLOC = MIN( N, NB ); *ILOW = 1 - NB; *LCMT00 += INB1 - NB + MRCOL * NB; *NBLKS = ( N - 1 ) / NB + 1; *LNBLOC = N - ( N / NB ) * NB; if( !( *LNBLOC ) ) *LNBLOC = NB; } else { /* * I am in the first relative process column. Use the first column block size * INB1 to initialize the VM structure. */ *INBLOC = INB1; *ILOW = 1 - INB1; tmp1 = N - INB1; if( tmp1 ) { /* * There is more than one column block. Compute the number of local column * blocks and the size of the last one. */ *NBLKS = ( tmp1 - 1 ) / NB + 2; *LNBLOC = tmp1 - ( tmp1 / NB ) * NB; if( !( *LNBLOC ) ) *LNBLOC = NB; } else { /* * There is only one column block. */ *NBLKS = 1; *LNBLOC = INB1; } } } else { /* * I am in the first relative process row. Use the first row block size IMB1 to * initialize the VM structure. */ *IMBLOC = IMB1; *IUPP = IMB1 - 1; tmp1 = M - IMB1; if( tmp1 ) { /* * There is more than one row block. Compute the number of local row blocks and * the size of the last one. */ *MBLKS = ( tmp1 - 1 ) / MB + 2; *LMBLOC = tmp1 - ( tmp1 / MB ) * MB; if( !( *LMBLOC ) ) *LMBLOC = MB; } else { /* * There is only one row block. */ *MBLKS = 1; *LMBLOC = IMB1; } if( MRCOL ) { /* * I am not in the first relative process column. Use the first local column * block size NB to initialize the VM structure. */ *INBLOC = MIN( N, NB ); *ILOW = 1 - NB; *LCMT00 += INB1 - NB + MRCOL * NB; *NBLKS = ( N - 1 ) / NB + 1; *LNBLOC = N - ( N / NB ) * NB; if( !( *LNBLOC ) ) *LNBLOC = NB; } else { /* * I am in the first relative process column. Use the first column block size * INB1 to initialize the VM structure. */ *INBLOC = INB1; *ILOW = 1 - INB1; tmp1 = N - INB1; if( tmp1 ) { /* * There is more than one column block. Compute the number of local column * blocks and the size of the last one. */ *NBLKS = ( tmp1 - 1 ) / NB + 2; *LNBLOC = tmp1 - ( tmp1 / NB ) * NB; if( !( *LNBLOC ) ) *LNBLOC = NB; } else { /* * There is only one column block. */ *NBLKS = 1; *LNBLOC = INB1; } } } /* * End of PB_Cbinfo */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cchkmat.c000644 000766 000024 00000027523 10363532303 020327 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cchkmat( int ICTXT, char * ROUT, char * MNAME, int M, int MPOS0, int N, int NPOS0, int IA, int JA, int * DESCA, int DPOS0, int * INFO ) #else void PB_Cchkmat( ICTXT, ROUT, MNAME, M, MPOS0, N, NPOS0, IA, JA, DESCA, DPOS0, INFO ) /* * .. Scalar Arguments .. */ int DPOS0, IA, ICTXT, * INFO, JA, M, MPOS0, N, NPOS0; /* * .. Array Arguments .. */ char * MNAME, * ROUT; int * DESCA; #endif { /* * Purpose * ======= * * PB_Cchkmat checks the validity of a descriptor vector DESCA, the * related global indexes IA, JA from a local view point. If an incon- * sistency is found among its parameters IA, JA and DESCA, the routine * returns an error code in INFO. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * ROUT (global input) pointer to CHAR * On entry, ROUT specifies the name of the routine calling this * input error checking routine. * * MNAME (global input) pointer to CHAR * On entry, MNAME specifies the name of the formal array argu- * ment in the calling routine. * * M (global input) INTEGER * On entry, M specifies the number of rows the submatrix * sub( A ). * * MPOS0 (global input) INTEGER * On entry, MPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter M appears. * * N (global input) INTEGER * On entry, N specifies the number of columns the submatrix * sub( A ). * * NPOS0 (global input) INTEGER * On entry, NPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter N appears. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * DPOS0 (global input) INTEGER * On entry, DPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter DESCA ap- * pears. Note that it is assumed that IA and JA are respecti- * vely 2 and 1 entries behind DESCA. * * INFO (local input/local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had an * illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int dpos, iapos, japos, mpos, mycol, myrow, np, npcol, nprow, npos, nq; /* .. * .. Executable Statements .. * */ /* * Want to find errors with MIN( ), so if no error, set it to a big number. If * there already is an error, multiply by the the descriptor multiplier. */ if( *INFO >= 0 ) *INFO = BIGNUM; else if( *INFO < -DESCMULT ) *INFO = -(*INFO); else *INFO = -(*INFO) * DESCMULT; /* * Figure where in parameter list each parameter was, factoring in descriptor * multiplier */ mpos = MPOS0 * DESCMULT; npos = NPOS0 * DESCMULT; iapos = ( DPOS0 - 2 ) * DESCMULT; japos = ( DPOS0 - 1 ) * DESCMULT; dpos = DPOS0 * DESCMULT + 1; /* * Get process grid information */ Cblacs_gridinfo( ICTXT, &nprow, &npcol, &myrow, &mycol ); /* * Are M, N, IA, JA and DESCA legal inputs ? */ if( M < 0 ) { /* * M must be at least zero */ *INFO = MIN( *INFO, mpos ); PB_Cwarn( ICTXT, -1, ROUT, "%s sub( %s ) = %d, it must be at least 0", "Illegal number of rows of", MNAME, M ); } if( N < 0 ) { /* * N must be at least zero */ *INFO = MIN( *INFO, npos ); PB_Cwarn( ICTXT, -1, ROUT, "%s sub( %s ) = %d, it must be at least 0", "Illegal number of columns of", MNAME, N ); } if( IA < 0 ) { /* * IA must be at least zero */ *INFO = MIN( *INFO, iapos ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal I%s = %d, I%s must be at least 1", MNAME, IA+1, MNAME ); } if( JA < 0 ) { /* * JA must be at least zero */ *INFO = MIN( *INFO, japos ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal J%s = %d, I%s must be at least 1", MNAME, IA+1, MNAME ); } if( DESCA[DTYPE_] != BLOCK_CYCLIC_2D_INB ) { /* * Internally, only the descriptor type BLOCK_CYCLIC_2D_INB is supported */ *INFO = MIN( *INFO, dpos + DTYPE_ ); PB_Cwarn( ICTXT, -1, ROUT, "%s %d for matrix %s. PBLAS accepts: %d or %d", "Illegal descriptor type", DESCA[DTYPE_], MNAME, BLOCK_CYCLIC_2D, BLOCK_CYCLIC_2D_INB ); if( *INFO % DESCMULT == 0 ) *INFO = -( (*INFO) / DESCMULT ); else *INFO = -(*INFO); /* * No need to go any further ... */ return; } if( DESCA[CTXT_] != ICTXT ) { /* * Check if the context of X match the other contexts. Only intra-context * operations are supported. */ *INFO = MIN( *INFO, dpos + CTXT_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[CTXT_] = %d %s= %d", MNAME, DESCA[CTXT_], "does not match other operand's context ", ICTXT ); if( *INFO % DESCMULT == 0 ) *INFO = -( (*INFO) / DESCMULT ); else *INFO = -(*INFO); /* * No need to go any further ... */ return; } if( DESCA[IMB_] < 1 ) { /* * DESCA[IMB_] must be at least one */ *INFO = MIN( *INFO, dpos + IMB_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[IMB_] = %d, DESC%s[IMB_] %s", MNAME, DESCA[IMB_], MNAME, "must be at least 1" ); } if( DESCA[INB_] < 1 ) { /* * DESCA[INB_] must be at least one */ *INFO = MIN( *INFO, dpos + INB_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[INB_] = %d, DESC%s[INB_] %s", MNAME, DESCA[INB_], MNAME, "must be at least 1" ); } if( DESCA[MB_] < 1 ) { /* * DESCA[MB_] must be at least one */ *INFO = MIN( *INFO, dpos + MB_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[MB_] = %d, DESC%s[MB_] %s", MNAME, DESCA[MB_], MNAME, "must be at least 1" ); } if( DESCA[NB_] < 1 ) { /* * DESCA[NB_] must be at least one */ *INFO = MIN( *INFO, dpos + NB_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[NB_] = %d, DESC%s[NB_] %s", MNAME, DESCA[NB_], MNAME, "must be at least 1" ); } if( ( DESCA[RSRC_] < -1 ) || ( DESCA[RSRC_] >= nprow ) ) { /* * DESCA[RSRC_] must be either -1 (replication) or in the interval [0 .. nprow) */ *INFO = MIN( *INFO, dpos + RSRC_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[RSRC_] = %d, DESC%s[RSRC_] %s%d", MNAME, DESCA[RSRC_], MNAME, "must be either -1, or >= 0 and < ", nprow ); } if( ( DESCA[CSRC_] < -1 ) || ( DESCA[CSRC_] >= npcol ) ) { /* * DESCX[CSRC_] must be either -1 (replication) or in the interval [0 .. npcol) */ *INFO = MIN( *INFO, dpos + CSRC_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[CSRC_] = %d, DESC%s[CSRC_] %s%d", MNAME, DESCA[CSRC_], MNAME, "must be either -1, or >= 0 and < ", npcol ); } if( M == 0 || N == 0 ) { /* * NULL matrix, relax some checks */ if( DESCA[M_] < 0 ) { /* * DESCX[M_] must be at least 0 */ *INFO = MIN( *INFO, dpos + M_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[M_] = %d, it must be at least 0", MNAME, DESCA[M_] ); } if( DESCA[N_] < 0 ) { /* * DESCX[N_] must be at least 0 */ *INFO = MIN( *INFO, dpos + N_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[N_] = %d, it must be at least 0", MNAME, DESCA[N_] ); } if( DESCA[LLD_] < 1 ) { /* * DESCA[LLD_] must be at least 1 */ *INFO = MIN( *INFO, dpos + LLD_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[LLD_] = %d, it must be at least 1", MNAME, DESCA[LLD_] ); } } else { /* * more rigorous checks for non-degenerate matrix */ if( DESCA[M_] < 1 ) { /* * DESCA[M_] must be at least 1 */ *INFO = MIN( *INFO, dpos + M_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[M_] = %d, it must be at least 1", MNAME, DESCA[M_]); } if( DESCA[N_] < 1 ) { /* * DESCA[N_] must be at least 1 */ *INFO = MIN( *INFO, dpos + N_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[N_] = %d, it must be at least 1", MNAME, DESCA[N_]); } if( ( DESCA[M_] >= 1 ) && ( DESCA[N_] >= 1 ) ) { if( IA+M > DESCA[M_] ) { /* * IA + M must be in [ 0 ... DESCA[M_] ] */ *INFO = MIN( *INFO, iapos ); PB_Cwarn( ICTXT, -1, ROUT, "%s M = %d, I%s = %d, DESC%s[M_] = %d", "Operation out of bounds:", M, MNAME, IA+1, MNAME, DESCA[M_]); } if( JA+N > DESCA[N_] ) { /* * JA + N must be in [ 0 ... DESCA[N_] ] */ *INFO = MIN( *INFO, japos ); PB_Cwarn( ICTXT, -1, ROUT, "%s N = %d, J%s = %d, DESC%s[N_] = %d", "Operation out of bounds:", N, MNAME, JA+1, MNAME, DESCA[N_]); } } /* * *INFO == BIGNUM => No errors have been found so far */ if( *INFO == BIGNUM ) { Mnumroc( np, DESCA[M_], 0, DESCA[IMB_], DESCA[MB_], myrow, DESCA[RSRC_], nprow ); if( DESCA[LLD_] < MAX( 1, np ) ) { Mnumroc( nq, DESCA[N_], 0, DESCA[INB_], DESCA[NB_], mycol, DESCA[CSRC_], npcol ); /* * DESCA[LLD_] must be at least 1 in order to be legal and this is enough if no * columns of A reside in this process. */ if( DESCA[LLD_] < 1 ) { *INFO = MIN( *INFO, dpos + LLD_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[LLD_] = %d, it must be at least 1", MNAME, DESCA[LLD_] ); } else if( nq > 0 ) { /* * Some columns of A reside in this process, DESCA[LLD_] must be at least * MAX( 1, np ). */ *INFO = MIN( *INFO, dpos + LLD_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[LLD_] = %d, it must be at least %d", MNAME, DESCA[LLD_], MAX( 1, np ) ); } } } } /* * Prepare output: set info = 0 if no error, and divide by DESCMULT if error is * not in a descriptor entry. */ if( *INFO == BIGNUM ) *INFO = 0; else if( *INFO % DESCMULT == 0 ) *INFO = -( (*INFO) / DESCMULT ); else *INFO = -(*INFO); /* * End of PB_Cchkmat */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cchkvec.c000644 000766 000024 00000031427 10363532303 020321 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cchkvec( int ICTXT, char * ROUT, char * VNAME, int N, int NPOS0, int IX, int JX, int * DESCX, int INCX, int DPOS0, int * INFO ) #else void PB_Cchkvec( ICTXT, ROUT, VNAME, N, NPOS0, IX, JX, DESCX, INCX, DPOS0, INFO ) /* * .. Scalar Arguments .. */ int DPOS0, ICTXT, IX, * INFO, INCX, JX, N, NPOS0; /* * .. Array Arguments .. */ char * ROUT, * VNAME; int * DESCX; #endif { /* * Purpose * ======= * * PB_Cchkvec checks the validity of a descriptor vector DESCX, the * related global indexes IX, JX and the global increment INCX. If an * inconsistency is found among its parameters IX, JX, DESCX and INCX, * the routine returns an error code in INFO. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * ROUT (global input) pointer to CHAR * On entry, ROUT specifies the name of the routine calling this * input error checking routine. * * VNAME (global input) pointer to CHAR * On entry, VNAME specifies the name of the formal array argu- * ment in the calling routine. * * N (global input) INTEGER * On entry, N specifies the length of the subvector sub( X ). * * NPOS0 (global input) INTEGER * On entry, NPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter N appears. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * DPOS0 (global input) INTEGER * On entry, DPOS0 specifies the position in the calling rou- * tine's parameter list where the formal parameter DESCX ap- * pears. Note that it is assumed that IX and JX are respecti- * vely 2 and 1 entries behind DESCX, and INCX is 1 entry after * DESCX. * * INFO (local input/local output) INTEGER * = 0: successful exit * < 0: If the i-th argument is an array and the j-entry had an * illegal value, then INFO = -(i*100+j), if the i-th * argument is a scalar and had an illegal value, then * INFO = -i. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int dpos, icpos, ixpos, jxpos, mycol, myrow, np, npcol, npos, nprow, nq; /* .. * .. Executable Statements .. * */ /* * Want to find errors with MIN(), so if no error, set it to a big number. If * there already is an error, multiply by the the descriptor multiplier. */ if( *INFO >= 0 ) *INFO = BIGNUM; else if( *INFO < -DESCMULT ) *INFO = -(*INFO); else *INFO = -(*INFO) * DESCMULT; /* * Figure where in parameter list each parameter was, factoring in descriptor * multiplier */ npos = NPOS0 * DESCMULT; ixpos = ( DPOS0 - 2 ) * DESCMULT; jxpos = ( DPOS0 - 1 ) * DESCMULT; icpos = ( DPOS0 + 1 ) * DESCMULT; dpos = DPOS0 * DESCMULT + 1; /* * Get process grid information */ Cblacs_gridinfo( ICTXT, &nprow, &npcol, &myrow, &mycol ); /* * Are N, IX, JX, DESCX and INCX legal inputs ? */ if( N < 0 ) { /* * N must be at least zero */ *INFO = MIN( *INFO, npos ); PB_Cwarn( ICTXT, -1, ROUT, "%s sub( %s ) = %d, it must be at least 0", "Illegal length of", VNAME, N ); } if( IX < 0 ) { /* * IX must be at least zero */ *INFO = MIN( *INFO, ixpos ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal I%s = %d, I%s must be at least 1", VNAME, IX+1, VNAME ); } if( JX < 0 ) { /* * JX must be at least zero */ *INFO = MIN( *INFO, jxpos ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal J%s = %d, J%s must be at least 1", VNAME, JX+1, VNAME ); } if( DESCX[DTYPE_] != BLOCK_CYCLIC_2D_INB ) { /* * Internally, only the descriptor type BLOCK_CYCLIC_2D_INB is supported. */ *INFO = MIN( *INFO, dpos + DTYPE_ ); PB_Cwarn( ICTXT, -1, ROUT, "%s %d for matrix %s. PBLAS accepts: %d or %d", "Illegal descriptor type", DESCX[DTYPE_], VNAME, BLOCK_CYCLIC_2D, BLOCK_CYCLIC_2D_INB ); if( *INFO % DESCMULT == 0 ) *INFO = -( (*INFO) / DESCMULT ); else *INFO = -(*INFO); /* * No need to go any further ... */ return; } if( DESCX[CTXT_] != ICTXT ) { /* * Check if the context of X match the other contexts. Only intra-context * operations are supported. */ *INFO = MIN( *INFO, dpos + CTXT_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[CTXT_] = %d %s= %d", VNAME, DESCX[CTXT_], "does not match other operand's context ", ICTXT ); if( *INFO % DESCMULT == 0 ) *INFO = -( (*INFO) / DESCMULT ); else *INFO = -(*INFO); /* * No need to go any further ... */ return; } if( DESCX[IMB_] < 1 ) { /* * DESCX[IMB_] must be at least one */ *INFO = MIN( *INFO, dpos + IMB_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[IMB_] = %d, DESC%s[IMB_] %s", VNAME, DESCX[IMB_], VNAME, "must be at least 1" ); } if( DESCX[INB_] < 1 ) { /* * DESCX[INB_] must be at least one */ *INFO = MIN( *INFO, dpos + INB_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[INB_] = %d, DESC%s[INB_] %s", VNAME, DESCX[INB_], VNAME, "must be at least 1" ); } if( DESCX[MB_] < 1 ) { /* * DESCX[MB_] must be at least one */ *INFO = MIN( *INFO, dpos + MB_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[MB_] = %d, DESC%s[MB_] %s", VNAME, DESCX[MB_], VNAME, "must be at least 1" ); } if( DESCX[NB_] < 1 ) { /* * DESCX[NB_] must be at least one */ *INFO = MIN( *INFO, dpos + NB_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[NB_] = %d, DESC%s[NB_] %s", VNAME, DESCX[NB_], VNAME, "must be at least 1" ); } if( ( DESCX[RSRC_] < -1 ) || ( DESCX[RSRC_] >= nprow ) ) { /* * DESCX[RSRC_] must be either -1 (replication) or in the interval [0 .. nprow) */ *INFO = MIN( *INFO, dpos + RSRC_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[RSRC_] = %d, DESC%s[RSRC_] %s%d", VNAME, DESCX[RSRC_], VNAME, "must be either -1, or >= 0 and < ", nprow ); } if( ( DESCX[CSRC_] < -1 ) || ( DESCX[CSRC_] >= npcol ) ) { /* * DESCX[CSRC_] must be either -1 (replication) or in the interval [0 .. npcol) */ *INFO = MIN( *INFO, dpos + CSRC_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[CSRC_] = %d, DESC%s[CSRC_] %s%d", VNAME, DESCX[CSRC_], VNAME, "must be either -1, or >= 0 and < ", npcol ); } if( INCX != 1 && INCX != DESCX[M_] ) { /* * INCX must be either 1 or DESCX[M_] */ *INFO = MIN( *INFO, icpos ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal INC%s = %d, INC%s should be either 1 or %d", VNAME, DESCX[M_], VNAME ); } if( N == 0 ) { /* * NULL vector, relax some checks */ if( DESCX[M_] < 0 ) { /* * DESCX[M_] must be at least 0 */ *INFO = MIN( *INFO, dpos + M_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[M_] = %d, it must be at least 0", VNAME, DESCX[M_] ); } if( DESCX[N_] < 0 ) { /* * DESCX[N_] must be at least 0 */ *INFO = MIN( *INFO, dpos + N_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[N_] = %d, it must be at least 0", VNAME, DESCX[N_] ); } if( DESCX[LLD_] < 1 ) { /* * DESCX[LLD_] must be at least 1 */ *INFO = MIN( *INFO, dpos + LLD_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[LLD_] = %d, it must be at least 1", VNAME, DESCX[LLD_] ); } } else { /* * more rigorous checks for non-degenerate vector */ if( DESCX[M_] < 1 ) { /* * DESCX[M_] must be at least 1 */ *INFO = MIN( *INFO, dpos + M_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[M_] = %d, it must be at least 1", VNAME, DESCX[M_]); } if( DESCX[N_] < 1 ) { /* * DESCX[N_] must be at least 1 */ *INFO = MIN( *INFO, dpos + N_ ); PB_Cwarn( ICTXT, -1, ROUT, "Illegal DESC%s[N_] = %d, it must be at least 1", VNAME, DESCX[N_]); } if( ( DESCX[M_] >= 1 ) && ( DESCX[N_] >= 1 ) ) { if( INCX == DESCX[M_] ) { /* * sub( X ) resides in (a) process row(s) */ if( IX >= DESCX[M_] ) { /* * IX must be in [ 0 ... DESCX[M_]-1 ] */ *INFO = MIN( *INFO, ixpos ); PB_Cwarn( ICTXT, -1, ROUT, "%s I%s = %d, DESC%s[M_] = %d", "Array subscript out of bounds:", VNAME, IX+1, VNAME, DESCX[M_]); } if( JX+N > DESCX[N_] ) { /* * JX + N must be in [ 0 ... DESCX[N_]-1 ] */ *INFO = MIN( *INFO, jxpos ); PB_Cwarn( ICTXT, -1, ROUT, "%s N = %d, J%s = %d, DESC%s[N_] = %d", "Operation out of bounds:", N, VNAME, JX+1, VNAME, DESCX[N_]); } } else { /* * sub( X ) resides in (a) process column(s) */ if( JX >= DESCX[N_] ) { /* * JX must be in [ 0 ... DESCX[N_] ] */ *INFO = MIN( *INFO, jxpos ); PB_Cwarn( ICTXT, -1, ROUT, "%s J%s = %d, DESC%s[N_] = %d", "Array subscript out of bounds:", VNAME, JX+1, VNAME, DESCX[N_]); } if( IX+N > DESCX[M_] ) { /* * IX + N must be in [ 0 ... DESCX[M_] ] */ *INFO = MIN( *INFO, ixpos ); PB_Cwarn( ICTXT, -1, ROUT, "%s N = %d, I%s = %d, DESC%s[M_] = %d", "Operation out of bounds:", N, VNAME, IX+1, VNAME, DESCX[M_]); } } } /* * *INFO == BIGNUM => No errors have been found so far */ if( *INFO == BIGNUM ) { Mnumroc( np, DESCX[M_], 0, DESCX[IMB_], DESCX[MB_], myrow, DESCX[RSRC_], nprow ); if( DESCX[LLD_] < MAX( 1, np ) ) { Mnumroc( nq, DESCX[N_], 0, DESCX[INB_], DESCX[NB_], mycol, DESCX[CSRC_], npcol ); /* * DESCX[LLD_] must be at least 1 in order to be legal and this is enough if no * columns of X reside in this process */ if( DESCX[LLD_] < 1 ) { *INFO = MIN( *INFO, dpos + LLD_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[LLD_] = %d, it must be at least 1", VNAME, DESCX[LLD_] ); } else if( nq > 0 ) { /* * Some columns of X reside in this process, DESCX[LLD_] must be at least * MAX( 1, np ) */ *INFO = MIN( *INFO, dpos + LLD_ ); PB_Cwarn( ICTXT, -1, ROUT, "DESC%s[LLD_] = %d, it must be at least %d", VNAME, DESCX[LLD_], MAX( 1, np ) ); } } } } /* * Prepare output: set INFO = 0 if no error, and divide by DESCMULT if error is * not in a descriptor entry. */ if( *INFO == BIGNUM ) *INFO = 0; else if( *INFO % DESCMULT == 0 ) *INFO = -( (*INFO) / DESCMULT ); else *INFO = -(*INFO); /* * End of PB_Cchkvec */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cconjg.c000644 000766 000024 00000004160 10363532303 020150 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cconjg( PBTYP_T * TYPE, char * ALPHA, char * CALPHA ) #else void PB_Cconjg( TYPE, ALPHA, CALPHA ) /* * .. Scalar Arguments .. */ char * ALPHA, * CALPHA; PBTYP_T * TYPE; #endif { /* * Purpose * ======= * * PB_Cconjg conjugates of the scalar alpha. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * ALPHA (local input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * CALPHA (local output) pointer to CHAR * On exit, CALPHA contains the conjugate of the scalar alpha. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ switch( TYPE->type ) { case SCPLX: ((float*)(CALPHA))[REAL_PART] = ((float*)(ALPHA))[REAL_PART]; ((float*)(CALPHA))[IMAG_PART] = -((float*)(ALPHA))[IMAG_PART]; break; case DCPLX: ((double*)(CALPHA))[REAL_PART] = ((double*)(ALPHA))[REAL_PART]; ((double*)(CALPHA))[IMAG_PART] = -((double*)(ALPHA))[IMAG_PART]; break; case SREAL: ((float*)(CALPHA))[REAL_PART] = ((float*)(ALPHA))[REAL_PART]; break; case DREAL: ((double*)(CALPHA))[REAL_PART] = ((double*)(ALPHA))[REAL_PART]; break; case INT: *((int*)(CALPHA)) = *((int*)(ALPHA)); break; default: ; } /* * End of PB_Cconjg */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cctypeset.c000644 000766 000024 00000006431 10363532303 020713 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" PBTYP_T * PB_Cctypeset() { /* * Purpose * ======= * * PB_Cctypeset on the first call initializes a static structure contai- * ning typed information and returns a pointer to it. The other calls * to this routine just returns this pointer. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ static int setup=0; static PBTYP_T TypeStruct; static cmplx zero, one, negone; /* .. * .. Executable Statements .. * */ if( setup ) return( &TypeStruct ); setup = 1; TypeStruct.type = SCPLX; TypeStruct.usiz = sizeof( float ); TypeStruct.size = sizeof( cmplx ); zero [REAL_PART] = ZERO; zero [IMAG_PART] = ZERO; one [REAL_PART] = ONE; one [IMAG_PART] = ZERO; negone[REAL_PART] = -ONE; negone[IMAG_PART] = ZERO; TypeStruct.zero = ((char *) zero); TypeStruct.one = ((char *) one); TypeStruct.negone = ((char *) negone); TypeStruct.Cgesd2d = Ccgesd2d; TypeStruct.Cgerv2d = Ccgerv2d; TypeStruct.Cgebs2d = Ccgebs2d; TypeStruct.Cgebr2d = Ccgebr2d; TypeStruct.Cgsum2d = Ccgsum2d; TypeStruct.Fmmadd = cmmadd_; TypeStruct.Fmmcadd = cmmcadd_; TypeStruct.Fmmtadd = cmmtadd_; TypeStruct.Fmmtcadd = cmmtcadd_; TypeStruct.Fmmdda = cmmdda_; TypeStruct.Fmmddac = cmmddac_; TypeStruct.Fmmddat = cmmddat_; TypeStruct.Fmmddact = cmmddact_; TypeStruct.Fcshft = ccshft_; TypeStruct.Frshft = crshft_; TypeStruct.Fvvdotu = cvvdotu_; TypeStruct.Fvvdotc = cvvdotc_; TypeStruct.Fset = cset_; TypeStruct.Ftzpad = ctzpad_; TypeStruct.Ftzpadcpy = ctzpadcpy_; TypeStruct.Ftzscal = ctzscal_; TypeStruct.Fhescal = chescal_; TypeStruct.Ftzcnjg = ctzcnjg_; TypeStruct.Faxpy = caxpy_; TypeStruct.Fcopy = ccopy_; TypeStruct.Fswap = cswap_; TypeStruct.Fgemv = cgemv_; TypeStruct.Fsymv = csymv_; TypeStruct.Fhemv = chemv_; TypeStruct.Ftrmv = ctrmv_; TypeStruct.Ftrsv = ctrsv_; TypeStruct.Fagemv = cagemv_; TypeStruct.Fasymv = casymv_; TypeStruct.Fahemv = cahemv_; TypeStruct.Fatrmv = catrmv_; TypeStruct.Fgerc = cgerc_; TypeStruct.Fgeru = cgeru_; TypeStruct.Fsyr = csyr_; TypeStruct.Fher = cher_; TypeStruct.Fsyr2 = csyr2_; TypeStruct.Fher2 = cher2_; TypeStruct.Fgemm = cgemm_; TypeStruct.Fsymm = csymm_; TypeStruct.Fhemm = chemm_; TypeStruct.Fsyrk = csyrk_; TypeStruct.Fherk = cherk_; TypeStruct.Fsyr2k = csyr2k_; TypeStruct.Fher2k = cher2k_; TypeStruct.Ftrmm = ctrmm_; TypeStruct.Ftrsm = ctrsm_; return( &TypeStruct ); /* * End of PB_Cctypeset */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cdescribe.c000644 000766 000024 00000022701 10363532303 020631 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cdescribe( int M, int N, int IA, int JA, int * DA, int NPROW, int NPCOL, int MYROW, int MYCOL, int * II, int * JJ, int * LDA, int * IMB, int * INB, int * MB, int * NB, int * PROW, int * PCOL, int * DA0 ) #else void PB_Cdescribe( M, N, IA, JA, DA, NPROW, NPCOL, MYROW, MYCOL, II, JJ, LDA, IMB, INB, MB, NB, PROW, PCOL, DA0 ) /* * .. Scalar Arguments .. */ int IA, *II, * IMB, * INB, JA, * JJ, * LDA, M, * MB, MYCOL, MYROW, N, * NB, NPCOL, NPROW, * PCOL, * PROW; /* * .. Array Arguments .. */ int * DA, * DA0; #endif { /* * Purpose * ======= * * PB_Cdescribe returns the global descriptor of a submatrix. This rou- * tine also computes the starting local index II, JJ corresponding to * the submatrix starting globally at the entry pointed by IA, JA. This * routine returns the coordinates in the grid of the process owning the * matrix entry of global indexes I, J, namely PROW and PCOL. The true * global block sizes IMB, INB, MB and NB are also returned. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * M (global input) INTEGER * On entry, M specifies the number of rows being dealt out * starting from global index IA. M is also the number of rows * of the submatrix of interest. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns being dealt out * starting from global index JA. N is also the number of col- * umns of the submatrix of interest. M must be at least zero. * * IA (global input) INTEGER * On entry, IA specifies the global starting row index of the * submatrix. IA must at least zero. * * JA (global input) INTEGER * On entry, JA specifies the global starting column index of * the submatrix. JA must at least zero. * * DA (global and local input) INTEGER array * On entry, DA is an integer array of dimension DLEN_. This is * the array descriptor of the underlying matrix. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process columns * over which the matrix is distributed. NPCOL must be at least * one. * * MYROW (local input) INTEGER * On entry, MYROW specifies the row coordinate of the process * whose local index II is determined. MYROW must be at least * zero and strictly less than NPROW. * * II (local output) INTEGER * On exit, II specifies the local starting row index of the * submatrix. On exit, II is at least zero. * * JJ (local output) INTEGER * On exit, JJ specifies the local starting column index of the * submatrix. On exit, JJ is at least zero. * * LDA (local output) INTEGER * On exit, LDA specifies the local leading dimension of the lo- * cal array containing the distributed matrix entries. LDA must * be at least one. * * IMB (global output) INTEGER * On exit, IMB specifies the true global number of matrix rows * of the first block, if M rows are given out starting from the * global index IA. If M is equal zero, IMB is set to zero. * * INB (global output) INTEGER * On exit, INB specifies the true global number of matrix col- * umns of the first block, if N columns are given out starting * from the global index JA. If N is equal zero, INB is set to * zero. * * MB (global output) INTEGER * On exit, MB specifies the size of the blocks used to parti- * tion the matrix rows. MB is at least one. * * NB (global output) INTEGER * On exit, NB specifies the size of the blocks used to parti- * tion the matrix columns. NB is at least one. * * PROW (global output) INTEGER * On exit, PROW specifies the row coordinate of the process * that possesses the first row of the submatrix. On exit, PROW * is -1 if DESC( RSRC_ ) is -1 on input, and, at least zero * and strictly less than NPROW otherwise. * * PCOL (global output) INTEGER * On exit, PCOL specifies the column coordinate of the process * that possesses the first column of the submatrix. On exit, * PCOL is -1 if DESC( CSRC_ ) is -1 on input, and, at least * zero and strictly less than NPCOL otherwise. * * DA0 (global and local output) INTEGER array * On exit, DA0 is an integer array of dimension DLEN_. This is * the array descriptor of the submatrix A(IA:IA+M-1,JA:JA+N-1). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ *MB = DA[MB_]; Mfirstnb( *IMB, M, IA, DA[IMB_], *MB ); *NB = DA[NB_]; Mfirstnb( *INB, N, JA, DA[INB_], *NB ); *LDA = DA[LLD_]; Minfog2l( IA, JA, DA, NPROW, NPCOL, MYROW, MYCOL, *II, *JJ, *PROW, *PCOL ); MDescSet( DA0, M, N, *IMB, *INB, *MB, *NB, *PROW, *PCOL, DA[CTXT_], *LDA ); /* * End of PB_Cdescribe */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cdescset.c000644 000766 000024 00000016526 10363532303 020513 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cdescset( int * DESC, int M, int N, int IMB, int INB, int MB, int NB, int RSRC, int CSRC, int CTXT, int LLD ) #else void PB_Cdescset( DESC, M, N, IMB, INB, MB, NB, RSRC, CSRC, CTXT, LLD ) /* * .. Scalar Arguments .. */ int CSRC, CTXT, IMB, INB, LLD, M, MB, N, NB, RSRC; /* * .. Array Arguments .. */ int * DESC; #endif { /* * Purpose * ======= * * PB_Cdescset uses its 10 input arguments M, N, IMB, INB, MB, NB, * RSRC, CSRC, CTXT and LLD to initialize a descriptor vector of type * BLOCK_CYCLIC_2D_INB. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * DESC (global and local output) INTEGER array * On entry, DESC is an array of dimension DLEN_. DESC is the * array descriptor to be set. * * M (global input) INTEGER * On entry, M specifies the number of rows of the matrix. * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the matrix. * N must be at least zero. * * IMB (global input) INTEGER * On entry, IMB specifies the row size of the first block of * the global matrix distribution. IMB must be at least one. * * INB (global input) INTEGER * On entry, INB specifies the column size of the first block * of the global matrix distribution. INB must be at least one. * * MB (global input) INTEGER * On entry, MB specifies the row size of the blocks used to * partition the matrix. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the column size of the blocks used to * partition the matrix. NB must be at least one. * * RSRC (global input) INTEGER * On entry, RSRC specifies the row coordinate of the process * that possesses the first row of the matrix. When RSRC = -1, * the data is not distributed but replicated, otherwise RSRC * must be at least zero and strictly less than NPROW. * * CSRC (global input) INTEGER * On entry, CSRC specifies the column coordinate of the pro- * cess that possesses the first column of the matrix. When * CSRC = -1, the data is not distributed but replicated, other- * wise CSRC must be at least zero and strictly less than NPCOL. * * CTXT (local input) INTEGER * On entry, CTXT specifies the BLACS context handle, indicating * the global communication context. The value of the context * itself is local. * * LLD (local input) INTEGER * On entry, LLD specifies the leading dimension of the local * array storing the local entries of the matrix. LLD must be at * least MAX( 1, Lr(1,M) ). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ DESC[DTYPE_] = BLOCK_CYCLIC_2D_INB; DESC[CTXT_ ] = CTXT; DESC[M_ ] = M; DESC[N_ ] = N; DESC[IMB_ ] = IMB; DESC[INB_ ] = INB; DESC[MB_ ] = MB; DESC[NB_ ] = NB; DESC[RSRC_ ] = RSRC; DESC[CSRC_ ] = CSRC; DESC[LLD_ ] = LLD; /* * End of PB_Cdescset */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cdtypeset.c000644 000766 000024 00000006242 10363532303 020714 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" PBTYP_T * PB_Cdtypeset() { /* * Purpose * ======= * * PB_Cdtypeset on the first call initializes a static structure contai- * ning typed information and returns a pointer to it. The other calls * to this routine just returns this pointer. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ static int setup=0; static PBTYP_T TypeStruct; static double zero, one, negone; /* .. * .. Executable Statements .. * */ if( setup ) return( &TypeStruct ); setup = 1; TypeStruct.type = DREAL; TypeStruct.usiz = sizeof( double ); TypeStruct.size = sizeof( double ); zero = ZERO; one = ONE; negone = -ONE; TypeStruct.zero = (char *) (&zero); TypeStruct.one = (char *) (&one); TypeStruct.negone = (char *) (&negone); TypeStruct.Cgesd2d = Cdgesd2d; TypeStruct.Cgerv2d = Cdgerv2d; TypeStruct.Cgebs2d = Cdgebs2d; TypeStruct.Cgebr2d = Cdgebr2d; TypeStruct.Cgsum2d = Cdgsum2d; TypeStruct.Fmmadd = dmmadd_; TypeStruct.Fmmcadd = dmmcadd_; TypeStruct.Fmmtadd = dmmtadd_; TypeStruct.Fmmtcadd = dmmtcadd_; TypeStruct.Fmmdda = dmmdda_; TypeStruct.Fmmddac = dmmddac_; TypeStruct.Fmmddat = dmmddat_; TypeStruct.Fmmddact = dmmddact_; TypeStruct.Fcshft = dcshft_; TypeStruct.Frshft = drshft_; TypeStruct.Fvvdotu = dvvdot_; TypeStruct.Fvvdotc = dvvdot_; TypeStruct.Fset = dset_; TypeStruct.Ftzpad = dtzpad_; TypeStruct.Ftzpadcpy = dtzpadcpy_; TypeStruct.Ftzscal = dtzscal_; TypeStruct.Fhescal = dtzscal_; TypeStruct.Ftzcnjg = dtzscal_; TypeStruct.Faxpy = daxpy_; TypeStruct.Fcopy = dcopy_; TypeStruct.Fswap = dswap_; TypeStruct.Fgemv = dgemv_; TypeStruct.Fsymv = dsymv_; TypeStruct.Fhemv = dsymv_; TypeStruct.Ftrmv = dtrmv_; TypeStruct.Ftrsv = dtrsv_; TypeStruct.Fagemv = dagemv_; TypeStruct.Fasymv = dasymv_; TypeStruct.Fahemv = dasymv_; TypeStruct.Fatrmv = datrmv_; TypeStruct.Fgerc = dger_; TypeStruct.Fgeru = dger_; TypeStruct.Fsyr = dsyr_; TypeStruct.Fher = dsyr_; TypeStruct.Fsyr2 = dsyr2_; TypeStruct.Fher2 = dsyr2_; TypeStruct.Fgemm = dgemm_; TypeStruct.Fsymm = dsymm_; TypeStruct.Fhemm = dsymm_; TypeStruct.Fsyrk = dsyrk_; TypeStruct.Fherk = dsyrk_; TypeStruct.Fsyr2k = dsyr2k_; TypeStruct.Fher2k = dsyr2k_; TypeStruct.Ftrmm = dtrmm_; TypeStruct.Ftrsm = dtrsm_; return( &TypeStruct ); /* * End of PB_Cdtypeset */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cfirstnb.c000644 000766 000024 00000003663 10363532303 020526 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Cfirstnb( int N, int I, int INB, int NB ) #else int PB_Cfirstnb( N, I, INB, NB ) /* * .. Scalar Arguments .. */ int I, INB, N, NB; #endif { /* * Purpose * ======= * * PB_Cfirstnb returns the global number of matrix rows or columns of the * first block, if N rows or columns are given out starting from the * global index I. Note that if N is equal 0, this routine returns 0. * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the number of rows/columns being dealt * out. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global index of the matrix entry. * I must be at least zero. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix distribution. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int inbt; /* .. * .. Executable Statements .. * */ inbt = ( ( INB -= I ) <= 0 ? ( (-INB) / NB + 1 ) * NB + INB : INB ); return( MIN( inbt, N ) ); /* * End of PB_Cfirstnb */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cg2lrem.c000644 000766 000024 00000014470 10363532303 020245 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Cg2lrem( int IG, int INB, int NB, int MYPROC, int SRCPROC, int NPROCS ) #else int PB_Cg2lrem( IG, INB, NB, MYPROC, SRCPROC, NPROCS ) /* * .. Scalar Arguments .. */ int IG, INB, NB, NPROCS, MYPROC, SRCPROC; #endif { /* * Purpose * ======= * * PB_Cg2lrem computes the local index of a matrix entry pointed to by * the global index IG. Note that when MYPROC is not the process owning * this entry, this routine returns the closest larger local index cor- * responding to IG just like the routine PB_Cinfog2l. * * Arguments * ========= * * IG (global input) INTEGER * On entry, IG specifies the global index of the matrix entry. * IG must be at least zero. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * MYPROC (global input) INTEGER * On entry, MYPROC specifies the process number in which the * value of the local index is to be computed. MYPROC must be at * least zero and strictly less than NPROCS. * * SRCPROC (global input) INTEGER * On entry, if SRCPROC = -1, the data is not distributed but * replicated, in which case this routine returns IG in all * processes. Otherwise, the value of SRCPROC is ignored. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the matrix is distributed. NPROCS must * be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int ilocblk, mydist, nblocks, proc; /* .. * .. Executable Statements .. * */ /* * The data is not distributed, or there is just one process in this dimension * of the grid. */ if( ( SRCPROC == -1 ) || ( NPROCS == 1 ) ) return( IG ); /* * IG refers to an entry in the first block */ if( IG < INB ) return( ( MYPROC == SRCPROC ? IG : 0 ) ); /* * The discussion goes as follows: compute my distance from the source process * so that within this process coordinate system, the source process is the * process such that mydist = 0, or equivalently MYROC == SRCPROC. * * Find out the global coordinate of the block IG belongs to (nblocks), as well * as the minimum local number of blocks that every process has. * * when mydist < nblocks - ilocblk * NPROCS, I own ilocblk + 1 full blocks, * when mydist > nblocks - ilocblk * NPROCS, I own ilocblk full blocks, * when mydist = nblocks - ilocblk * NPROCS, I own ilocblk full blocks * but not IG, or I own ilocblk + 1 blocks and the entry IG refers to. */ if( MYPROC == SRCPROC ) { /* * If I am the source process and there are less than NPROCS blocks, then * the local index in that process is INB. */ nblocks = ( IG - INB ) / NB + 1; if( nblocks < NPROCS ) return( INB ); /* * IG refers to an entry that is not in the first block, find out which process * has it. */ proc = SRCPROC + nblocks; proc -= ( proc / NPROCS ) * NPROCS; /* * Since mydist = 0 and nblocks - ilocblk * NPROCS >= 0, there are only three * possible cases: * * 1) When 0 = mydist = nblocks - ilocblk * NPROCS = 0 and I don't own IG, in * which case II = INB + ( ilocblk - 1 ) * NB. Note that this case cannot * happen when ilocblk is zero, since nblocks is at least one. * * 2) When 0 = mydist = nblocks - ilocblk * NPROCS = 0 and I own IG, in which * case IG and II can respectively be written as INB + (nblocks-1)*NB + IL, * INB + (ilocblk-1) * NB + IL. That is II = IG + ( ilocblk - nblocks )*NB. * Note that this case cannot happen when ilocblk is zero, since nblocks * is at least one. * * 3) mydist = 0 < nblocks - ilocblk * NPROCS, the source process owns * ilocblk+1 full blocks, and therefore II = INB + ilocblk * NB. Note * that when ilocblk is zero, II is just INB. */ ilocblk = nblocks / NPROCS; if( ilocblk * NPROCS >= nblocks ) return( ( ( MYPROC == proc ) ? IG + ( ilocblk - nblocks ) * NB : INB + ( ilocblk - 1 ) * NB ) ); else return( INB + ilocblk * NB ); } else { /* * IG refers to an entry that is not in the first block, find out which process * has it. */ nblocks = ( IG -= INB ) / NB + 1; proc = SRCPROC + nblocks; proc -= ( proc / NPROCS ) * NPROCS; /* * Compute my distance from the source process so that within this process * coordinate system, the source process is the process such that mydist=0. */ if( ( mydist = MYPROC - SRCPROC ) < 0 ) mydist += NPROCS; /* * When mydist < nblocks - ilocblk * NPROCS, I own ilocblk + 1 full blocks of * size NB since I am not the source process, i.e. II = ( ilocblk + 1 ) * NB. * When mydist >= nblocks - ilocblk * NPROCS and I don't own IG, I own ilocblk * full blocks of size NB, i.e. II = ilocblk * NB, otherwise I own ilocblk * blocks and IG, in which case IG can be written as INB + (nblocks-1)*NB + IL * and II = ilocblk*NB + IL = IG - INB + ( ilocblk - nblocks + 1 )*NB. */ if( nblocks < NPROCS ) { mydist -= nblocks; return( ( ( mydist < 0 ) ? NB : ( ( MYPROC == proc ) ? IG + ( 1 - nblocks ) * NB : 0 ) ) ); } else { ilocblk = nblocks / NPROCS; mydist -= nblocks - ilocblk * NPROCS; return( ( ( mydist < 0 ) ? ( ilocblk + 1 ) * NB : ( ( MYPROC == proc ) ? ( ilocblk - nblocks + 1 ) * NB + IG : ilocblk * NB ) ) ); } } /* * End of PB_Cg2lrem */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CGatherV.c000644 000766 000024 00000065545 10363532303 020426 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CGatherV( PBTYP_T * TYPE, char * ALLOC, char * DIRECA, int M, int N, char * A, int IA, int JA, int * DESCA, char * AROC, char * * B, int * DESCB, int * BFREE ) #else void PB_CGatherV( TYPE, ALLOC, DIRECA, M, N, A, IA, JA, DESCA, AROC, B, DESCB, BFREE ) /* * .. Scalar Arguments .. */ char * ALLOC, * AROC, * DIRECA; int * BFREE, IA, JA, M, N; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * * B; #endif { /* * Purpose * ======= * * PB_CGatherV aggregates a submatrix sub( A ) = A(IA:IA+M-1, JA:JA+N-1) * into a one-dimensional multivector B. The submatrix sub( A ) is spe- * cified on input to this routine that is reused whenever possible. On * return, the one-dimensional multivector is specified by a pointer to * some data, a descriptor array describing its layout and a logical va- * lue indicating if this local piece of data has been dynamically allo- * cated by this function. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * ALLOC (global input) pointer to CHAR * On entry, ALLOC specifies if data should be allocated even * when unnecessary as follows: * ALLOC = 'A' or 'a' data allocation is enforced, * ALLOC = 'R' or 'r' data is reused when possible. * * DIRECA (global input) pointer to CHAR * On entry, DIRECA specifies the direction in which the rows * or columns of sub( A ) should be aggregated as follows: * DIRECA = 'F' or 'f' forward or increasing, * DIRECA = 'B' or 'b' backward or decreasing. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where LLD_A * is DESCA[LLD_], i.e. at least MAX( 1, Lr( M, IA ) ), and, * Ka is at least Lc( N, JA ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * AROC (global input) pointer to CHAR * On entry, AROC specifies the orientation of the submatrix * sub( A ). When AROC is 'R' or 'r', sub( A ) is a row matrix, * and a column matrix otherwise. * * B (local output) pointer to pointer to CHAR * On exit, * B is an array containing the aggregated submatrix * sub( A ). * * DESCB (global and local output) INTEGER array * On exit, DESCB is a descriptor array of dimension DLEN_ des- * cribing the data layout of the data pointed to by * B. * * BFREE (local output) INTEGER * On exit, BFREE specifies if it has been possible to reuse * the submatrix sub( A ), i.e., if some dynamic memory was al- * located for the data pointed to by *B or not. When BFREE is * zero, no dynamic memory was allocated. Otherwise, some dyna- * mic memory was allocated by this function that one MUST re- * lease as soon as possible. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one, * zero; int Afwd, AggRow, AiiD, AiiR, Ainb1D, Ainb1R, Ald, AmyprocD, AmyprocR, AnR, AnbD, AnbR, AnnxtL, AnnxtR, AnpD, AnpR, AnpreR, AnprocsR, ArocR, AsrcD, AsrcR, Bld, Bsrc_, ctxt, k, kb, kblks, kn, ktmp, mycol, mydist, mydistnb, myrow, nlen, npcol, nprow, offset, size, srcdist; MMADD_T add; MMSHFT_T shft; /* * .. Local Arrays .. */ char * Aptr = NULL, * Bptr = NULL; /* .. * .. Executable Statements .. * */ /* * Initialize the output parameters to a default value */ *BFREE = 0; *B = NULL; /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) ) { PB_Cdescset( DESCB, M, N, DESCA[IMB_], DESCA[INB_], DESCA[MB_], DESCA[NB_], DESCA[RSRC_], DESCA[CSRC_], DESCA[CTXT_], 1 ); return; } /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); if( ( AggRow = ( Mupcase( AROC[0] ) == CROW ) ) != 0 ) { /* * Accumulate rows of sub( A ) */ AnbR = DESCA[MB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_]; PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &AiiR, &AiiD, &AsrcR, &AsrcD ); Ainb1D = PB_Cfirstnb( N, JA, DESCA[INB_], AnbD ); AnpD = PB_Cnumroc( N, 0, Ainb1D, AnbD, mycol, AsrcD, npcol ); /* * If sub( A ) is either replicated or spans only one process row, no data needs * to be exchanged by the processes, the operation is purely local. */ if( !( PB_Cspan( M, IA, DESCA[IMB_], AnbR, AsrcR, nprow ) ) ) { if( Mupcase( ALLOC[0] ) == CREUSE ) { /* * sub( A ) can be reused */ if( ( ( myrow == AsrcR ) || ( AsrcR < 0 ) ) && ( AnpD > 0 ) ) { /* * If I own some entries of sub( A ), set *B */ Bld = Ald; *B = Mptr( A, AiiR, AiiD, Ald, TYPE->size ); } else { Bld = 1; } } else { /* * sub( A ) cannot be reused, make a copy of it. */ if( ( ( myrow == AsrcR ) || ( AsrcR < 0 ) ) && ( AnpD > 0 ) ) { /* * If I own some entries of sub( A ), allocate space for the copy, and copy the * data. */ Bld = M; if( AnpD > 0 ) { size = TYPE->size; *B = PB_Cmalloc( AnpD * M * size ); *BFREE = 1; TYPE->Fmmadd( &M, &AnpD, TYPE->one, Mptr( A, AiiR, AiiD, Ald, size ), &Ald, TYPE->zero, *B, &Bld ); } } else { Bld = 1; } } /* * Describe the resulting operand */ PB_Cdescset( DESCB, M, N, M, Ainb1D, AnbR, AnbD, AsrcR, AsrcD, ctxt, Bld ); return; } AnR = M; Bsrc_ = RSRC_; AmyprocR = myrow; AmyprocD = mycol; AnprocsR = nprow; Ainb1R = PB_Cfirstnb( M, IA, DESCA[IMB_], AnbR ); AnpR = PB_Cnumroc( M, 0, Ainb1R, AnbR, myrow, AsrcR, nprow ); } else { /* * Accumulate columns of sub( A ) */ AnbD = DESCA[MB_ ]; AnbR = DESCA[NB_ ]; Ald = DESCA[LLD_]; PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &AiiD, &AiiR, &AsrcD, &AsrcR ); Ainb1D = PB_Cfirstnb( M, IA, DESCA[IMB_], AnbD ); AnpD = PB_Cnumroc( M, 0, Ainb1D, AnbD, myrow, AsrcD, nprow ); /* * If sub( A ) is either replicated or spans only one process column, no data * needs to be exchanged by the processes, the operation is purely local. */ if( !( PB_Cspan( N, JA, DESCA[INB_], AnbR, AsrcR, npcol ) ) ) { if( Mupcase( ALLOC[0] ) == CREUSE ) { /* * sub( A ) can be reused */ Bld = Ald; if( ( ( mycol == AsrcR ) || ( AsrcR < 0 ) ) && ( AnpD > 0 ) ) /* * If I own some entries of sub( A ), set *B */ *B = Mptr( A, AiiD, AiiR, Ald, TYPE->size ); } else { /* * sub( A ) cannot be reused, make a copy of it. */ Bld = MAX( 1, AnpD ); if( ( ( mycol == AsrcR ) || ( AsrcR < 0 ) ) && ( AnpD > 0 ) ) { /* * If I own some entries of sub( A ), allocate space for the copy, and copy the * data. */ if( AnpD > 0 ) { size = TYPE->size; *B = PB_Cmalloc( AnpD * N * size ); *BFREE = 1; TYPE->Fmmadd( &AnpD, &N, TYPE->one, Mptr( A, AiiD, AiiR, Ald, size ), &Ald, TYPE->zero, *B, &Bld ); } } } /* * Describe the resulting operand */ PB_Cdescset( DESCB, M, N, Ainb1D, N, AnbD, AnbR, AsrcD, AsrcR, ctxt, Bld ); return; } AnR = N; Bsrc_ = CSRC_; AmyprocR = mycol; AmyprocD = myrow; AnprocsR = npcol; Ainb1R = PB_Cfirstnb( N, JA, DESCA[INB_], AnbR ); AnpR = PB_Cnumroc( N, 0, Ainb1R, AnbR, mycol, AsrcR, npcol ); } /* * sub( A ) is not replicated and spans more than one process row or column. * Forward row (resp. column) accumulation will leave the resulting operand in * the process(es) where the global row IA+M-1 (resp. global column JA+N-1) * resides. */ if( ( Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD ) ) != 0 ) { if( ( AnpD > 0 ) && ( AnpR > 0 ) ) { /* * Compute how may rows or columns are before me -> AnpreR */ AnpreR = PB_Cnpreroc( AnR, 0, Ainb1R, AnbR, AmyprocR, AsrcR, AnprocsR ); if( AnpreR == 0 ) { /* * If zero rows or columns are before me, I must be the source, so send my piece * to the process after me in the grid. */ if( AggRow ) { TYPE->Cgesd2d( ctxt, AnpR, AnpD, Mptr( A, AiiR, AiiD, Ald, TYPE->size ), Ald, MModAdd1( AmyprocR, AnprocsR ), AmyprocD ); } else { TYPE->Cgesd2d( ctxt, AnpD, AnpR, Mptr( A, AiiD, AiiR, Ald, TYPE->size ), Ald, AmyprocD, MModAdd1( AmyprocR, AnprocsR ) ); } } else if( AnpreR > 0 ) { /* * Otherwise, allocate some space for the rows or columns I have and the ones * globally preceeding the ones I have, that I am about to receive. */ size = TYPE->size; one = TYPE->one; zero = TYPE->zero; add = TYPE->Fmmadd; *B = Bptr = PB_Cmalloc( ( AnpreR + AnpR ) * AnpD * size ); nlen = AnpreR; mydistnb = MModSub( AmyprocR, AsrcR, AnprocsR ) * AnbR; kblks = ( ( ( ktmp = AnR - Ainb1R - 1 ) >= 0 ) ? ( ( ktmp / AnbR ) + 1 ) / AnprocsR : 0 ); offset = kblks * AnbR; kn = Ainb1R + mydistnb - AnbR; kn = MIN( kn, AnpreR ) + ( MAX( 1, kblks ) - 1 ) * mydistnb; if( AggRow ) { shft = TYPE->Frshft; Aptr = Mptr( A, AiiR, AiiD, Ald, size ); Bld = AnpreR + AnpR; /* * Receive the rows globally preceeding the ones I have */ TYPE->Cgerv2d( ctxt, AnpreR, AnpD, *B, Bld, MModSub1( AmyprocR, AnprocsR ), AmyprocD ); /* * Sort the received buffer and insert at the correct place the rows of sub( A ) * I own (from bottom to top). */ if( ( ( AnpR - 1 ) / AnbR ) == kblks ) { kb = AnpR - offset; add( &kb, &AnpD, one, Mptr( Aptr, offset, 0, Ald, size ), &Ald, zero, Mptr( Bptr, nlen+offset, 0, Bld, size ), &Bld ); } for( k = kblks; k >= 1; k-- ) { kb = nlen - kn; shft( &kb, &AnpD, &offset, Mptr( Bptr, kn, 0, Bld, size ), &Bld ); offset -= AnbR; add( &AnbR, &AnpD, one, Mptr( Aptr, offset, 0, Ald, size ), &Ald, zero, Mptr( Bptr, kn+offset, 0, Bld, size ), &Bld ); kn -= mydistnb; nlen -= kb; } if( AnpreR + AnpR != AnR ) { /* * If I am not the last process, i.e I am not supposed to own all of the AnR * rows by the end of the operation, then send the sorted buffer to the next * process and release the dynamically allocated buffer. */ TYPE->Cgesd2d( ctxt, AnpreR+AnpR, AnpD, *B, Bld, MModAdd1( AmyprocR, AnprocsR ), AmyprocD ); if( *B ) free( *B ); } } else { shft = TYPE->Fcshft; Aptr = Mptr( A, AiiD, AiiR, Ald, size ); Bld = MAX( 1, AnpD ); /* * Receive the columns globally preceeding the ones I have */ TYPE->Cgerv2d( ctxt, AnpD, AnpreR, *B, Bld, AmyprocD, MModSub1( AmyprocR, AnprocsR ) ); /* * Sort the received buffer and insert at the correct place the columns of * sub( A ) I own (from right to left). */ if( ( ( AnpR - 1 ) / AnbR ) == kblks ) { kb = AnpR - offset; add( &AnpD, &kb, one, Mptr( Aptr, 0, offset, Ald, size ), &Ald, zero, Mptr( Bptr, 0, nlen+offset, Bld, size ), &Bld ); } for( k = kblks; k >= 1; k-- ) { kb = nlen - kn; shft( &AnpD, &kb, &offset, Mptr( Bptr, 0, kn, Bld, size ), &Bld ); offset -= AnbR; add( &AnpD, &AnbR, one, Mptr( Aptr, 0, offset, Ald, size ), &Ald, zero, Mptr( Bptr, 0, kn + offset, Bld, size ), &Bld ); kn -= mydistnb; nlen -= kb; } if( AnpreR + AnpR != AnR ) { /* * If I am not the last process, i.e I am not supposed to own all of the AnR * columns by the end of the operation, then send the sorted buffer to the next * process and release the dynamically allocated buffer. */ TYPE->Cgesd2d( ctxt, AnpD, AnpreR+AnpR, *B, Bld, AmyprocD, MModAdd1( AmyprocR, AnprocsR ) ); if( *B ) free( *B ); } } } } } else { /* * Backward accumulation, compute the process row or column coordinate ArocR, * that is going to have the resulting operand. */ ArocR = PB_Cindxg2p( AnR-1, Ainb1R, AnbR, AsrcR, AsrcR, AnprocsR ); if( ( AnpD > 0 ) && ( AnpR > 0 ) ) { /* * Compute how may rows or columns are after me -> AnnxtR */ AnnxtR = PB_Cnnxtroc( AnR, 0, Ainb1R, AnbR, AmyprocR, AsrcR, AnprocsR ); AnnxtL = PB_Cnnxtroc( AnR, 0, Ainb1R, AnbR, ArocR, AsrcR, AnprocsR ); if( ( AnnxtR = MModSub( AnnxtR, AnnxtL, AnR ) ) == 0 ) { /* * If zero rows or columns are after me, I must be the source, so send my piece * to the process before me in the grid. */ if( AggRow ) { TYPE->Cgesd2d( ctxt, AnpR, AnpD, Mptr( A, AiiR, AiiD, Ald, TYPE->size ), Ald, MModSub1( AmyprocR, AnprocsR ), AmyprocD ); } else { TYPE->Cgesd2d( ctxt, AnpD, AnpR, Mptr( A, AiiD, AiiR, Ald, TYPE->size ), Ald, AmyprocD, MModSub1( AmyprocR, AnprocsR ) ); } } else if( AnnxtR > 0 ) { /* * Otherwise, allocate some space for the rows or columns I have and the ones * globally following the ones I have, that I am about to receive. */ size = TYPE->size; one = TYPE->one; zero = TYPE->zero; add = TYPE->Fmmadd; *B = Bptr = PB_Cmalloc( ( AnnxtR + AnpR ) * AnpD * size ); kblks = ( ( ( ktmp = AnR - Ainb1R - 1 ) >= 0 ) ? ( ( ktmp / AnbR ) + 1 ) / AnprocsR : 0 ); mydist = MModSub( ArocR, AmyprocR, AnprocsR ); mydistnb = mydist * AnbR; srcdist = MModSub( ArocR, AsrcR, AnprocsR ); if( AggRow ) { shft = TYPE->Frshft; Aptr = Mptr( A, AiiR, AiiD, Ald, size ); Bld = AnnxtR + AnpR; /* * Receive the rows globally following the ones I have */ TYPE->Cgerv2d( ctxt, AnnxtR, AnpD, Mptr( *B, AnpR, 0, Bld, size ), Bld, MModAdd1( AmyprocR, AnprocsR ), AmyprocD ); /* * Sort the received buffer and insert at the correct place the rows of sub( A ) * I own (from top to bottom). */ if( mydist > srcdist ) { offset = -AnpR; kb = Ainb1R + srcdist*AnbR; } else if( mydist == srcdist ) { add( &Ainb1R, &AnpD, one, Aptr, &Ald, zero, Bptr, &Bld ); Aptr = Mptr( Aptr, Ainb1R, 0, Ald, size ); Bptr = Mptr( Bptr, Ainb1R, 0, Ald, size ); offset = Ainb1R - AnpR; kb = mydistnb; } else { add( &AnbR, &AnpD, one, Aptr, &Ald, zero, Bptr, &Bld ); Aptr = Mptr( Aptr, AnbR, 0, Ald, size ); Bptr = Mptr( Bptr, AnbR, 0, Ald, size ); offset = AnbR - AnpR; kb = mydistnb; } for( k = kblks; k >= 1; k-- ) { shft( &kb, &AnpD, &offset, Bptr, &Bld ); Bptr = Mptr( Bptr, kb, 0, Bld, size ); add( &AnbR, &AnpD, one, Aptr, &Ald, zero, Bptr, &Bld ); Aptr = Mptr( Aptr, AnbR, 0, Ald, size ); Bptr = Mptr( Bptr, AnbR, 0, Ald, size ); offset += AnbR; kb = mydistnb; } if( AnnxtR + AnpR != AnR ) { /* * If I am not the last process, i.e I am not supposed to own all of the AnR * rows by the end of the operation, then send the sorted buffer to the previous * process and release the dynamically allocated buffer. */ TYPE->Cgesd2d( ctxt, AnnxtR+AnpR, AnpD, *B, Bld, MModSub1( AmyprocR, AnprocsR ), AmyprocD ); if( *B ) free( *B ); } } else { shft = TYPE->Fcshft; Aptr = Mptr( A, AiiD, AiiR, Ald, size ); Bld = MAX( 1, AnpD ); /* * Receive the columns globally following the ones I have */ TYPE->Cgerv2d( ctxt, AnpD, AnnxtR, Mptr( *B, 0, AnpR, Bld, size ), Bld, AmyprocD, MModAdd1( AmyprocR, AnprocsR ) ); /* * Sort the received buffer and insert at the correct place the columns of * sub( A ) I own (from left to right). */ if( mydist > srcdist ) { offset = -AnpR; kb = Ainb1R + srcdist*AnbR; } else if( mydist == srcdist ) { add( &AnpD, &Ainb1R, one, Aptr, &Ald, zero, Bptr, &Bld ); Aptr = Mptr( Aptr, 0, Ainb1R, Ald, size ); Bptr = Mptr( Bptr, 0, Ainb1R, Bld, size ); offset = Ainb1R - AnpR; kb = mydistnb; } else { add( &AnpD, &AnbR, one, Aptr, &Ald, zero, Bptr, &Bld ); Aptr = Mptr( Aptr, 0, AnbR, Ald, size ); Bptr = Mptr( Bptr, 0, AnbR, Bld, size ); offset = AnbR - AnpR; kb = mydistnb; } for( k = kblks; k >= 1; k-- ) { shft( &AnpD, &kb, &offset, Bptr, &Bld ); Bptr = Mptr( Bptr, 0, kb, Bld, size ); add( &AnpD, &AnbR, one, Aptr, &Ald, zero, Bptr, &Bld ); Aptr = Mptr( Aptr, 0, AnbR, Ald, size ); Bptr = Mptr( Bptr, 0, AnbR, Bld, size ); offset += AnbR; kb = mydistnb; } if( AnnxtR + AnpR != AnR ) { /* * If I am not the last process, i.e I am not supposed to own all of the AnR * columns by the end of the operation, then send the sorted buffer to the * previous process and release the dynamically allocated buffer. */ TYPE->Cgesd2d( ctxt, AnpD, AnnxtR+AnpR, *B, Bld, AmyprocD, MModSub1( AmyprocR, AnprocsR ) ); if( *B ) free( *B ); } } } } } /* * Describe the resulting operand */ if( AggRow ) { PB_Cdescset( DESCB, M, N, M, Ainb1D, AnbR, AnbD, AsrcR, AsrcD, ctxt, M ); } else { PB_Cdescset( DESCB, M, N, Ainb1D, N, AnbD, AnbR, AsrcD, AsrcR, ctxt, MAX( 1, AnpD ) ); } /* * Compute globally in which process row or column the resulting operand is * residing and set *BFREE accordingly. */ if( Afwd ) { if( AnR + AnbR > Ainb1R + ( AnprocsR - 1 ) * AnbR ) { /* * If sub( A ) is spanning all process rows or columns of the grid, the result * must be in the process row or column preceeding the one owning IA or JA, * don't you think ? */ DESCB[Bsrc_] = MModSub1( AsrcR, AnprocsR ); } else { /* * Otherwise, the result is in the process row or column where the row IA+M-1 * or column JA+N-1 of sub( A ) resides. */ DESCB[Bsrc_] = PB_Cindxg2p( AnR-1, Ainb1R, AnbR, AsrcR, AsrcR, AnprocsR ); } if( ( AnpD > 0 ) && ( AnpR > 0 ) && ( AmyprocR == DESCB[Bsrc_] ) ) *BFREE = 1; } else { if( AnR + AnbR > Ainb1R + ( AnprocsR - 1 ) * AnbR ) { /* * If sub( A ) is spanning all process rows or columns of the grid, the result * must be in the process row or column following the one owning IA+M-1 or * JA+N-1, don't you think ? */ DESCB[Bsrc_] = MModAdd1( ArocR, AnprocsR ); } else { /* * Otherwise, the result is in the process row or column where the row IA or * column JA of sub( A ) resides. */ DESCB[Bsrc_] = AsrcR; } if( ( AnpD > 0 ) && ( AnpR > 0 ) && ( AmyprocR == DESCB[Bsrc_] ) ) *BFREE = 1; } /* * End of PB_CGatherV */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cgcd.c000644 000766 000024 00000004332 10363532303 017606 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Cgcd( int M, int N ) #else int PB_Cgcd( M, N ) /* * .. Scalar Arguments .. */ int M, N; #endif { /* * Purpose * ======= * * PB_Cgcd computes and returns the Greatest Common Divisor (GCD) of two * positive integers M and N using a binary gcd algorithm. * * Arguments * ========= * * M (input) INTEGER * On entry, M must be at least zero. * * N (input) INTEGER * On entry, N must be at least zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int gcd=1, m_val, n_val, t; /* .. * .. Executable Statements .. * */ if( M > N ) { m_val = N; n_val = M; } else { m_val = M; n_val = N; } while( m_val > 0 ) { while( !( m_val & 1 ) ) { /* * m is even */ m_val >>= 1; /* if n is odd, gcd( m, n ) = gcd( m / 2, n ) */ /* * if n is odd, gcd( m, n ) = gcd( m / 2, n ) */ if( !( n_val & 1 ) ) { /* * otherwise gcd( m, n ) = 2 * gcd( m / 2, n / 2 ) */ n_val >>= 1; gcd <<= 1; } } /* * m is odd now. If n is odd, gcd( m, n ) = gcd( m, ( m - n ) / 2 ). * Otherwise, gcd( m, n ) = gcd ( m, n / 2 ). */ n_val -= ( n_val & 1 ) ? m_val : 0; n_val >>= 1; while( n_val >= m_val ) { /* * If n is odd, gcd( m, n ) = gcd( m, ( m - n ) / 2 ). * Otherwise, gcd( m, n ) = gcd ( m, n / 2 ) */ n_val -= ( n_val & 1 ) ? m_val : 0; n_val >>= 1; } /* * n < m, gcd( m, n ) = gcd( n, m ) */ t = n_val; n_val = m_val; m_val = t; } return ( n_val * gcd ); /* * End of PB_Cgcd */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cgetbuf.c000644 000766 000024 00000004725 10363532303 020333 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ char * PB_Cgetbuf( char * MESS, int LENGTH ) #else char * PB_Cgetbuf( MESS, LENGTH ) /* * .. Scalar Arguments .. */ int LENGTH; /* * .. Array Arguments .. */ char * MESS; #endif { /* * Purpose * ======= * * PB_Cgetbuf allocates a dynamic memory buffer. The routine checks the * size of the already allocated buffer against the value of the formal * parameter LENGTH. If the current buffer is large enough, this a poin- * ter to it is returned. Otherwise, this function tries to allocate it. * In case of failure, the program is stopped by calling Cblacs_abort. * When LENGTH is zero, this function returns a NULL pointer. If the va- * lue of LENGTH is strictly less than zero, the buffer is released. * * Arguments * ========= * * MESS (local input) pointer to CHAR * On entry, MESS is a string containing a message to be printed * in case of allocation failure. * * LENGTH (local input) INTEGER * On entry, LENGTH specifies the length in bytes of the buffer * to be allocated. If LENGTH is less or equal than zero, this * function returns NULL. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ static char * pblasbuf = NULL; static int pbbuflen = 0; /* .. * .. Executable Statements .. * */ if( LENGTH >= 0 ) { if( LENGTH > pbbuflen ) { if( pblasbuf ) free( pblasbuf ); pblasbuf = (char *) malloc( (unsigned) LENGTH ); if( !pblasbuf ) { (void) fprintf( stderr, "ERROR: Memory allocation failed\n%s\n", MESS ); Cblacs_abort( -1, -1 ); } pbbuflen = LENGTH; } } else if( pblasbuf ) { free( pblasbuf ); pblasbuf = NULL; pbbuflen = 0; } return( pblasbuf ); /* * End of PB_Cgetbuf */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cindxg2p.c000644 000766 000024 00000005406 10363532303 020427 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Cindxg2p( int IG, int INB, int NB, int PROC, int SRCPROC, int NPROCS ) #else int PB_Cindxg2p( IG, INB, NB, PROC, SRCPROC, NPROCS ) /* * .. Scalar Arguments .. */ int IG, INB, NB, NPROCS, PROC, SRCPROC; #endif { /* * Purpose * ======= * * PB_Cindxg2p computes the process coordinate which posseses the entry * of a matrix specified by a global index IG. * * Arguments * ========= * * IG (global input) INTEGER * On entry, IG specifies the global index of the matrix entry. * IG must be at least zero. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * PROC (local dummy) INTEGER * On entry, PROC is a dummy argument in this case in order to * unify the calling sequence of the tool-routines. * * SRCPROC (global input) INTEGER * On entry, SRCPROC specifies the coordinate of the process * that possesses the first row or column of the matrix. When * SRCPROC = -1, the data is not distributed but replicated, * otherwise SRCPROC must be at least zero and strictly less * than NPROCS. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the matrix is distributed. NPROCS must * be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ if( ( IG < INB ) || ( SRCPROC == -1 ) || ( NPROCS == 1 ) ) /* * IG belongs to the first block, or the data is not distributed, or there is * just one process in this dimension of the grid. */ return( SRCPROC ); /* * Otherwise, IG is in block 1 + ( IG - INB ) / NB. Add this to SRCPROC and * take the NPROCS modulo (definition of the block-cyclic data distribution). */ PROC = SRCPROC + 1 + ( IG - INB ) / NB; return( MPosMod( PROC, NPROCS ) ); /* * End of PB_Cindxg2p */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cinfog2l.c000644 000766 000024 00000032052 10363532303 020411 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cinfog2l( int I, int J, int * DESC, int NPROW, int NPCOL, int MYROW, int MYCOL, int * II, int * JJ, int * PROW, int * PCOL ) #else void PB_Cinfog2l( I, J, DESC, NPROW, NPCOL, MYROW, MYCOL, II, JJ, PROW, PCOL ) int I, * II, J, * JJ, MYCOL, MYROW, NPCOL, NPROW, * PCOL, * PROW; /* * .. Scalar Arguments .. */ /* * .. Array Arguments .. */ int * DESC; #endif { /* * Purpose * ======= * * PB_Cinfog2l computes the starting local index II, JJ corresponding to * the submatrix starting globally at the entry pointed by I, J. This * routine returns the coordinates in the grid of the process owning the * matrix entry of global indexes I, J, namely PROW and PCOL. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * I (global input) INTEGER * On entry, I specifies the global starting row index of the * submatrix. I must at least zero. * * J (global input) INTEGER * On entry, J specifies the global starting column index of * the submatrix. J must at least zero. * * DESC (global and local input) INTEGER array * On entry, DESC is an integer array of dimension DLEN_. This * is the array descriptor of the underlying matrix. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process columns * over which the matrix is distributed. NPCOL must be at least * one. * * MYROW (local input) INTEGER * On entry, MYROW specifies the row coordinate of the process * whose local index II is determined. MYROW must be at least * zero and strictly less than NPROW. * * MYCOL (local input) INTEGER * On entry, MYCOL specifies the column coordinate of the pro- * cess whose local index JJ is determined. MYCOL must be at * least zero and strictly less than NPCOL. * * II (local output) INTEGER * On exit, II specifies the local starting row index of the * submatrix. On exit, II is at least zero. * * JJ (local output) INTEGER * On exit, JJ specifies the local starting column index of the * submatrix. On exit, JJ is at least zero. * * PROW (global output) INTEGER * On exit, PROW specifies the row coordinate of the process * that possesses the first row of the submatrix. On exit, PROW * is -1 if DESC( RSRC_ ) is -1 on input, and, at least zero * and strictly less than NPROW otherwise. * * PCOL (global output) INTEGER * On exit, PCOL specifies the column coordinate of the process * that possesses the first column of the submatrix. On exit, * PCOL is -1 if DESC( CSRC_ ) is -1 on input, and, at least * zero and strictly less than NPCOL otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int ilocblk, imb, inb, mb, mydist, nb, nblocks, csrc, rsrc; /* .. * .. Executable Statements .. * */ /* * Retrieve the row distribution parameters */ imb = DESC[IMB_ ]; *PROW = DESC[RSRC_]; if( ( *PROW == -1 ) || ( NPROW == 1 ) ) { /* * The data is not distributed, or there is just one process row in the grid. */ *II = I; } else if( I < imb ) { /* * I refers to an entry in the first block of rows */ *II = ( MYROW == *PROW ? I : 0 ); } else { mb = DESC[MB_]; rsrc = *PROW; /* * The discussion goes as follows: compute my distance from the source process * so that within this process coordinate system, the source process is the * process such that mydist = 0, or equivalently MYROW == rsrc. * * Find out the global coordinate of the block I belongs to (nblocks), as well * as the minimum local number of blocks that every process has. * * when mydist < nblocks - ilocblk * NPROCS, I own ilocblk + 1 full blocks, * when mydist > nblocks - ilocblk * NPROCS, I own ilocblk full blocks, * when mydist = nblocks - ilocblk * NPROCS, I own ilocblk full blocks * but not I, or I own ilocblk + 1 blocks and the entry I refers to. */ if( MYROW == rsrc ) { /* * I refers to an entry that is not in the first block, find out which process * has it. */ nblocks = ( I - imb ) / mb + 1; *PROW += nblocks; *PROW -= ( *PROW / NPROW ) * NPROW; /* * Since mydist = 0 and nblocks - ilocblk * NPROW >= 0, there are only three * possible cases: * * 1) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I don't own I, in * which case II = IMB + ( ilocblk - 1 ) * MB. Note that this case cannot * happen when ilocblk is zero, since nblocks is at least one. * * 2) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I own I, in which * case I and II can respectively be written as IMB + (nblocks-1)*NB + IL * and IMB + (ilocblk-1) * MB + IL. That is II = I + (ilocblk-nblocks)*MB. * Note that this case cannot happen when ilocblk is zero, since nblocks * is at least one. * * 3) mydist = 0 < nblocks - ilocblk * NPROW, the source process owns * ilocblk+1 full blocks, and therefore II = IMB + ilocblk * MB. Note * that when ilocblk is zero, II is just IMB. */ if( nblocks < NPROW ) { *II = imb; } else { ilocblk = nblocks / NPROW; if( ilocblk * NPROW >= nblocks ) { *II = ( ( MYROW == *PROW ) ? I + ( ilocblk - nblocks ) * mb : imb + ( ilocblk - 1 ) * mb ); } else { *II = imb + ilocblk * mb; } } } else { /* * I refers to an entry that is not in the first block, find out which process * has it. */ nblocks = ( I -= imb ) / mb + 1; *PROW += nblocks; *PROW -= ( *PROW / NPROW ) * NPROW; /* * Compute my distance from the source process so that within this process * coordinate system, the source process is the process such that mydist=0. */ if( ( mydist = MYROW - rsrc ) < 0 ) mydist += NPROW; /* * When mydist < nblocks - ilocblk * NPROW, I own ilocblk + 1 full blocks of * size MB since I am not the source process, i.e. II = ( ilocblk + 1 ) * MB. * When mydist >= nblocks - ilocblk * NPROW and I don't own I, I own ilocblk * full blocks of size MB, i.e. II = ilocblk * MB, otherwise I own ilocblk * blocks and I, in which case I can be written as IMB + (nblocks-1)*MB + IL * and II = ilocblk*MB + IL = I - IMB + ( ilocblk - nblocks + 1 )*MB. */ if( nblocks < NPROW ) { mydist -= nblocks; *II = ( ( mydist < 0 ) ? mb : ( ( MYROW == *PROW ) ? I + ( 1 - nblocks ) * mb : 0 ) ); } else { ilocblk = nblocks / NPROW; mydist -= nblocks - ilocblk * NPROW; *II = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * mb : ( ( MYROW == *PROW ) ? ( ilocblk - nblocks + 1 ) * mb + I : ilocblk * mb ) ); } } } /* * Idem for the columns */ inb = DESC[INB_ ]; *PCOL = DESC[CSRC_]; if( ( *PCOL == -1 ) || ( NPCOL == 1 ) ) { *JJ = J; } else if( J < inb ) { *JJ = ( MYCOL == *PCOL ? J : 0 ); } else { nb = DESC[NB_]; csrc = *PCOL; if( MYCOL == csrc ) { nblocks = ( J - inb ) / nb + 1; *PCOL += nblocks; *PCOL -= ( *PCOL / NPCOL ) * NPCOL; if( nblocks < NPCOL ) { *JJ = inb; } else { ilocblk = nblocks / NPCOL; if( ilocblk * NPCOL >= nblocks ) { *JJ = ( ( MYCOL == *PCOL ) ? J + ( ilocblk - nblocks ) * nb : inb + ( ilocblk - 1 ) * nb ); } else { *JJ = inb + ilocblk * nb; } } } else { nblocks = ( J -= inb ) / nb + 1; *PCOL += nblocks; *PCOL -= ( *PCOL / NPCOL ) * NPCOL; if( ( mydist = MYCOL - csrc ) < 0 ) mydist += NPCOL; if( nblocks < NPCOL ) { mydist -= nblocks; *JJ = ( ( mydist < 0 ) ? nb : ( ( MYCOL == *PCOL ) ? J + ( 1 - nblocks )*nb : 0 ) ); } else { ilocblk = nblocks / NPCOL; mydist -= nblocks - ilocblk * NPCOL; *JJ = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * nb : ( ( MYCOL == *PCOL ) ? ( ilocblk - nblocks + 1 ) * nb + J : ilocblk * nb ) ); } } } /* * End of PB_Cinfog2l */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CInOutV.c000644 000766 000024 00000076471 10363532303 020252 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CInOutV( PBTYP_T * TYPE, char * ROWCOL, int M, int N, int * DESCA, int K, char * BETA, char * Y, int IY, int JY, int * DESCY, char * YROC, char * * TBETA, char * * YAPTR, int * DYA, int * YAFREE, int * YASUM, int * YAPBY ) #else void PB_CInOutV( TYPE, ROWCOL, M, N, DESCA, K, BETA, Y, IY, JY, DESCY, YROC, TBETA, YAPTR, DYA, YAFREE, YASUM, YAPBY ) /* * .. Scalar Arguments .. */ char * BETA, * ROWCOL, * * TBETA, * YROC; int * YAPBY, * YAFREE, IY, JY, K, M, N, * YASUM; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCY, * DYA; char * Y, * * YAPTR; #endif { /* * Purpose * ======= * * PB_CInOutV returns a pointer to an array that contains a one-dimen- * sional input/output subvector which is replicated over the rows or * columns of a submatrix described by DESCA. A subvector is specified * on input to this routine that is reused whenever possible. On return, * the subvector is specified by a pointer to some data, a descriptor * array describing its layout, a logical value indicating if this local * piece of data has been dynamically allocated by this function, a lo- * gical value specifying if sum reduction should occur, and finally a * logical value specifying if it is necessary to copy back the alloca- * ted data to the original data. This routine is specifically designed * for traditional Level 2 like PBLAS operations using an input/output * vector such as PxGEMV, PxSYMV ... * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * ROWCOL (global input) pointer to CHAR * On entry, ROWCOL specifies if this routine should return a * row or column subvector replicated over the underlying subma- * trix as follows: * = 'R' or 'r': A row subvector is returned, * = 'C' or 'c': A column subvector is returned. * * M (global input) INTEGER * On entry, M specifies the number of rows of the underlying * submatrix described by DESCA. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the underlying * submatrix described by DESCA. N must be at least zero. * * DESCA (global and local input/output) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. EXCEPTIONALLY, THIS * INTERNAL ROUTINE MAY MODIFY DESCA IN ORDER TO MINIMIZE THE * AMOUNT OF DATA TO BE MOVED FOR THE VECTOR Y. SEE PxGEMV FOR * AN EXAMPLE. * * K (global input) INTEGER * On entry, K specifies the length of the non-distributed di- * mension of the subvector sub( Y ). K must be at least zero. * * BETA (global input) pointer to CHAR * On entry, BETA is a scalar the input subvector sub( Y ) must * be scaled by. * * Y (local input) pointer to CHAR * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( K, IY ) ) when YROC is 'R' or 'r' * and MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at * least Lc( 1, JY+Ly-1 ) when YROC is 'R' or 'r' and * Lc( K, JY ) otherwise. Ly is N when ROWCOL is 'R' or 'r' and * M otherwise. Before entry, this array contains the local * entries of the matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * YROC (global input) pointer to CHAR * On entry, YROC specifies the orientation of the subvector * sub( Y ). When YROC is 'R' or 'r', sub( Y ) is a row vector, * and a column vector otherwise. * * TBETA (local output) pointer to pointer to CHAR * On exit, * TBETA is a scalar to be used locally to scale the * data pointed to by * YAPTR, in order to obtain the correct * result in the original data sub( Y ). * * YAPTR (local output) pointer to pointer to CHAR * On exit, * YAPTR is an array containing the same data as the * subvector sub( Y ) which is replicated over the rows or co- * lumns of the underlying matrix as specified by ROWCOL and * DESCA. * * DYA (global and local output) INTEGER array * On exit, DYA is a descriptor array of dimension DLEN_ descri- * bing the data layout of the data pointed to by * YAPTR. * * YAFREE (local output) INTEGER * On exit, YAFREE specifies if it was possible to reuse the * subvector sub( Y ), i.e., if some dynamic memory was alloca- * ted for the data pointed to by * YAPTR or not. When YAFREE is * zero, no dynamic memory was allocated. Otherwise, some dyna- * mic memory was allocated by this function that one MUST re- * lease as soon as possible. * * YASUM (global output) INTEGER * On exit, YASUM specifies if a global sum reduction should be * performed to obtain the correct sub( Y ). When YASUM is zero, * no reduction is to be performed, otherwise reduction should * occur. * * YAPBY (global output) INTEGER * On exit, YAPBY specifies if the data pointed to by * YAPTR * must be move back onto sub( Y ) to obtain the correct result. * When YAPBY is zero, no supplementary data movement is neces- * sary, otherwise a data redistribution should occur. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Aimb, Ainb, AisD, AisR, Amb, Amp, Anb, Anq, Arow, Ycol, Yii, Yimb, Yimb1, Yinb, Yinb1, YisD, YisR, YisRow, Yjj, Yld, Ymb, Ymp, Ynb, Ynq, Yrow, ctxt, izero=0, nprow, myrow, npcol, mycol; /* .. * .. Executable Statements .. * */ /* * Initialize the output parameters to a default value */ *YAFREE = 0; *YASUM = 0; *YAPBY = 0; *YAPTR = NULL; *TBETA = BETA; /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) || ( K <= 0 ) ) { if( Mupcase( ROWCOL[0] ) == CROW ) { PB_Cdescset( DYA, K, N, 1, DESCA[INB_], 1, DESCA[NB_], DESCA[RSRC_], DESCA[CSRC_], DESCA[CTXT_], 1 ); } else { PB_Cdescset( DYA, M, K, DESCA[IMB_], 1, DESCA[MB_], 1, DESCA[RSRC_], DESCA[CSRC_], DESCA[CTXT_], DESCA[LLD_] ); } return; } /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCY[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ Minfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, Yii, Yjj, Yrow, Ycol ); /* * Is sub( Y ) distributed or not, replicated or not ? */ if( ( YisRow = ( Mupcase( YROC[0] ) == CROW ) ) != 0 ) { YisD = ( ( Ycol >= 0 ) && ( npcol > 1 ) ); YisR = ( ( Yrow == -1 ) || ( nprow == 1 ) ); } else { YisD = ( ( Yrow >= 0 ) && ( nprow > 1 ) ); YisR = ( ( Ycol == -1 ) || ( npcol == 1 ) ); } Aimb = DESCA[IMB_ ]; Ainb = DESCA[INB_ ]; Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Arow = DESCA[RSRC_]; Acol = DESCA[CSRC_]; if( Mupcase( ROWCOL[0] ) == CROW ) { /* * Want a row vector */ AisR = ( ( Arow < 0 ) || ( nprow == 1 ) ); if( YisRow ) { /* * It is possible to reuse sub( Y ) iff sub( Y ) is already a row vector. */ AisD = ( ( Acol >= 0 ) && ( npcol > 1 ) ); Yinb = DESCY[INB_]; Ynb = DESCY[NB_]; Mfirstnb( Yinb1, N, JY, Yinb, Ynb ); /* * sub( Y ) is aligned with A (reuse condition) iff both operands are not * distributed, or both of them are distributed and start in the same process * column and either N is smaller than the first blocksize of sub( Y ) and A, * or their column blocking factors match. */ if( ( !AisD && !YisD ) || ( ( AisD && YisD ) && ( ( Acol == Ycol ) && ( ( ( Ainb >= N ) && ( Yinb1 >= N ) ) || ( ( Ainb == Yinb1 ) && ( Anb == Ynb ) ) ) ) ) ) { Mnumroc( Ynq, N, 0, Yinb1, Ynb, mycol, Ycol, npcol ); Ymp = ( YisR ? K : ( ( myrow == Yrow ) ? K : 0 ) ); if( YisR ) { /* * If sub( Y ) is replicated, there is no need to move sub( Y ) after the * operation (*YAPBY = 0), and it can be reused. */ *YAPBY = 0; Yld = DESCY[ LLD_ ]; if( Ynq > 0 ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); if( AisR ) { /* * If A is replicated as well, use BETA in every process row, and do not combine * the local results. */ *TBETA = BETA; *YASUM = 0; } else { /* * Otherwise, use BETA in process row Arow and zero elsewhere. Reduce the local * result if there is more than one row in the process grid. */ *TBETA = ( ( myrow == Arow ) ? BETA : TYPE->zero ); *YASUM = ( nprow > 1 ); /* * If some process rows do not own any entries of A, better set sub( Y ) to zero * in those processes. */ Mnumroc( Amp, M, 0, Aimb, Amb, myrow, Arow, nprow ); if( Amp <= 0 ) TYPE->Ftzscal( C2F_CHAR( ALL ), &K, &Ynq, &izero, *TBETA, *YAPTR, &Yld ); } } else { /* * sub( Y ) is not replicated, the descriptor of A may need to be modified ... */ if( AisR ) { /* * If A is replicated, use only the copy in the process row where sub( Y ) * resides -> modify DESCA !!! */ *TBETA = BETA; *YASUM = 0; *YAPBY = 0; Yld = DESCY[ LLD_ ]; DESCA[ IMB_ ] = M; DESCA[ RSRC_ ] = Yrow; if( ( Ynq > 0 ) && ( Ymp > 0 ) ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); } else { if( Mspan( M, 0, Aimb, Amb, Arow, nprow ) ) { /* * Otherwise, A is not replicated, let assume in addition that it spans more * than one process row */ *YASUM = ( nprow > 1 ); *YAPBY = 0; if( myrow == Yrow ) { /* * Reuse sub( Y ). If there is no entries of A in the process row where sub( Y ) * resides, better scale it by BETA immediately. */ *TBETA = BETA; Yld = DESCY[ LLD_ ]; if( Ynq > 0 ) { *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); Mnumroc( Amp, M, 0, Aimb, Amb, myrow, Arow, nprow ); if( Amp <= 0 ) TYPE->Ftzscal( C2F_CHAR( ALL ), &K, &Ynq, &izero, *TBETA, *YAPTR, &Yld ); } } else { /* * Allocate space in the other process rows and initialize to zero. */ *TBETA = TYPE->zero; Yld = MAX( 1, K ); if( Ynq > 0 ) { *YAPTR = PB_Cmalloc( K * Ynq * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Ynq, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } } else { /* * A spans only one process row */ if( Yrow == Arow ) { /* * A and sub( Y ) resides in the same process row */ *TBETA = BETA; *YASUM = 0; *YAPBY = 0; Yld = DESCY[ LLD_ ]; if( ( myrow == Yrow ) && ( Ynq > 0 ) ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); } else { /* * If sub( Y ) resides in another process row, then allocate zero-data in * process row where A resides, and set *YAPBY to 1, so that this data will be * added (moved) after the local operation has been performed. */ *TBETA = TYPE->zero; *YASUM = 0; *YAPBY = 1; Yrow = Arow; Yld = MAX( 1, K ); if( myrow == Arow ) { if( Ynq > 0 ) { *YAPTR = PB_Cmalloc( K * Ynq * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Ynq, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } } } } } /* * Describe the resulting operand. Note that when reduction should occur, Yrow * contains the destination row. Assuming every process row needs the result, * Yrow is then -1. */ MDescSet( DYA, K, N, K, Yinb1, 1, Ynb, Yrow, Ycol, ctxt, Yld ); return; } } /* * sub( Y ) cannot be reused, set TBETA to zero for the local operation, and * force YAPBY to 1 for the later update of sub( Y ). */ *TBETA = TYPE->zero; *YAPBY = 1; Mnumroc( Anq, N, 0, Ainb, Anb, mycol, Acol, npcol ); Yld = MAX( 1, K ); if( YisR ) { /* * If sub( Y ) is replicated, allocate space in every process row owning some * columns of A and initialize it to zero. There may be some wasted space * (suppose A was residing in just one row), however, it is hoped that moving * back this data to sub( Y ) will then be cheaper ... */ *YASUM = ( AisR ? 0 : ( nprow > 1 ) ); Arow = -1; if( Anq > 0 ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Anq, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } else { /* * sub( Y ) resides within only one process row */ if( AisR ) { /* * If A is replicated, then modify sub( A ) so that only one process row will * compute the result before moving it back to sub( Y ). */ *YASUM = 0; DESCA[ IMB_ ] = M; if( YisRow ) { /* * Choose different process row than Yrow for better performance (more links) * of the later move-back phase. */ DESCA[RSRC_] = Arow = MModSub1( Yrow, nprow ); } else { DESCA[RSRC_] = Arow = 0; } if( ( myrow == Arow ) && ( Anq > 0 ) ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Anq, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } else { if( Mspan( M, 0, Aimb, Amb, Arow, nprow ) ) { /* * If A is not replicated, and spans more than just one process row, then * allocate space in every process row and zero it. */ *YASUM = ( nprow > 1 ); if( Anq > 0 ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Anq, &izero, *TBETA, *TBETA, *YAPTR, &K ); } } else { /* * If A is not replicated, and spans only one process row, then allocate space * within that process row and zero it. */ *YASUM = 0; if( ( myrow == Arow ) && ( Anq > 0 ) ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Anq, &izero, *TBETA, *TBETA, *YAPTR, &K ); } } } } /* * Describe the resulting operand. Note that when reduction should occur, Arow * contains the destination row. Assuming every process row needs the result, * Arow is then -1. */ MDescSet( DYA, K, N, K, Ainb, 1, Anb, Arow, Acol, ctxt, Yld ); } else { /* * Want a column vector */ AisR = ( ( Acol < 0 ) || ( npcol == 1 ) ); if( !YisRow ) { /* * It is possible to reuse sub( Y ) iff sub( Y ) is already a column vector. */ AisD = ( ( Arow >= 0 ) && ( nprow > 1 ) ); Yimb = DESCY[IMB_]; Ymb = DESCY[MB_]; Mfirstnb( Yimb1, M, IY, Yimb, Ymb ); /* * sub( Y ) is aligned with A (reuse condition) iff both operands are not * distributed, or both of them are distributed and start in the same process * row and either M is smaller than the first blocksize of sub( Y ) and A, or * their row blocking factors match. */ if( ( !AisD && !YisD ) || ( ( AisD && YisD ) && ( ( Arow == Yrow ) && ( ( ( Aimb >= M ) && ( Yimb1 >= M ) ) || ( ( Aimb == Yimb1 ) && ( Amb == Ymb ) ) ) ) ) ) { Mnumroc( Ymp, M, 0, Yimb1, Ymb, myrow, Yrow, nprow ); Ynq = ( YisR ? K : ( ( mycol == Ycol ) ? K : 0 ) ); if( YisR ) { /* * If sub( Y ) is replicated, there is no need to move sub( Y ) after the * operation (*YAPBY = 0), and it can be reused. */ *YAPBY = 0; Yld = DESCY[ LLD_ ]; if( Ymp > 0 ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); if( AisR ) { /* * If A is replicated as well, use BETA in every process column, and do not * combine the local results. */ *TBETA = BETA; *YASUM = 0; } else { /* * Otherwise, use BETA in process column Acol and zero elsewhere. Reduce the * local result if there is more than one column in the process grid. */ *TBETA = ( ( mycol == Acol ) ? BETA : TYPE->zero ); *YASUM = ( npcol > 1 ); /* * If some process columns do not own any entries of A, better set sub( Y ) to * zero in those processes. */ Mnumroc( Anq, N, 0, Ainb, Anb, mycol, Acol, npcol ); if( Anq <= 0 ) TYPE->Ftzscal( C2F_CHAR( ALL ), &Ymp, &K, &izero, *TBETA, *YAPTR, &Yld ); } } else { /* * sub( Y ) is not replicated, the descriptor of A may need to be modified ... */ if( AisR ) { /* * If A is replicated, use only the copy in the process column where sub( Y ) * resides -> modify DESCA !!! */ *TBETA = BETA; *YASUM = 0; *YAPBY = 0; Yld = DESCY[ LLD_ ]; DESCA[ INB_ ] = N; DESCA[ CSRC_ ] = Ycol; if( ( Ymp > 0 ) && ( Ynq > 0 ) ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); } else { if( Mspan( N, 0, Ainb, Anb, Acol, npcol ) ) { /* * Otherwise, A is not replicated, let assume in addition that it spans more * than one process column */ *YASUM = ( npcol > 1 ); *YAPBY = 0; if( mycol == Ycol ) { /* * Reuse sub( Y ). If there is no entries of A in the process column where * sub( Y ) resides, better scale it by BETA immediately. */ *TBETA = BETA; Yld = DESCY[ LLD_ ]; if( Ymp > 0 ) { *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); Mnumroc( Anq, N, 0, Ainb, Anb, mycol, Acol, npcol ); if( Anq <= 0 ) TYPE->Ftzscal( C2F_CHAR( ALL ), &Ymp, &K, &izero, *TBETA, *YAPTR, &Yld ); } } else { /* * Allocate space in the other process columns and initialize to zero. */ *TBETA = TYPE->zero; Yld = MAX( 1, Ymp ); if( Ymp > 0 ) { *YAPTR = PB_Cmalloc( Ymp * K * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ymp, &K, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } } else { /* * A spans only one process column */ if( Ycol == Acol ) { /* * A and sub( Y ) resides in the same process column */ *TBETA = BETA; *YASUM = 0; *YAPBY = 0; Yld = DESCY[ LLD_ ]; if( ( mycol == Ycol ) && ( Ymp > 0 ) ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); } else { /* * If sub( Y ) resides in another process column, then allocate zero-data in * process column where A resides, and set *YAPBY to 1, so that this data will * be added (moved) after the local operation has been performed. */ *TBETA = TYPE->zero; *YASUM = 0; *YAPBY = 1; Ycol = Acol; Yld = MAX( 1, Ymp ) ; if( mycol == Acol ) { if( Ymp > 0 ) { *YAPTR = PB_Cmalloc( Ymp * K * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ymp, &K, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } } } } } /* * Describe the resulting operand. Note that when reduction should occur, Ycol * contains the destination column. Assuming every process column needs the * result, Ycol is then -1. */ MDescSet( DYA, M, K, Yimb1, K, Ymb, 1, Yrow, Ycol, ctxt, Yld ); return; } } /* * sub( Y ) cannot be reused, set TBETA to zero for the local operation, and * force YAPBY to 1 for the later update of sub( Y ). */ *TBETA = TYPE->zero; *YAPBY = 1; Mnumroc( Amp, M, 0, Aimb, Amb, myrow, Arow, nprow ); Yld = MAX( 1, Amp ); if( YisR ) { /* * If sub( Y ) is replicated, allocate space in every process column owning some * rows of A and initialize it to zero. There may be some wasted space (suppose * A was residing in just one column), however, it is hoped that moving back * this data to sub( Y ) will then be cheaper ... */ *YASUM = ( AisR ? 0 : ( npcol > 1 ) ); Acol = -1; if( Amp > 0 ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &K, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } else { /* * sub( Y ) resides within only one process column */ if( AisR ) { /* * If A is replicated, then modify sub( A ) so that only one process column will * compute the result before moving it back to sub( Y ). */ *YASUM = 0; DESCA[ INB_ ] = N; if( YisRow ) { DESCA[ CSRC_ ] = Acol = 0; } else { /* * Choose different process column than Ycol for better performance (more links) * of the later move-back phase. */ DESCA[ CSRC_ ] = Acol = MModSub1( Ycol, npcol ); } if( ( mycol == Acol ) && ( Amp > 0 ) ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &K, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } else { if( Mspan( N, 0, Ainb, Anb, Acol, npcol ) ) { /* * If A is not replicated, and spans more than just one process column, then * allocate space in every process column and zero it. */ *YASUM = ( npcol > 1 ); if( Amp > 0 ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &K, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } else { /* * If A is not replicated, and spans only one process column, then allocate * space within that process column and zero it. */ *YASUM = 0; if( ( mycol == Acol ) && ( Amp > 0 ) ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &K, &izero, *TBETA, *TBETA, *YAPTR, &Yld ); } } } } /* * Describe the resulting operand. Note that when reduction should occur, Acol * contains the destination column. Assuming every process column needs the * result, Acol is then -1. */ MDescSet( DYA, M, K, Aimb, K, Amb, 1, Arow, Acol, ctxt, Yld ); } /* * End of PB_CInOutV */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CInOutV2.c000644 000766 000024 00000101646 10363532303 020325 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CInOutV2( PBTYP_T * TYPE, char * CONJUG, char * ROWCOL, int M, int N, int KA, int * DESCA, int K, char * Y, int IY, int JY, int * DESCY, char * YROC, char * * YAPTR, int * DYA, int * YAFREE, int * YASUM, int * YAPBY ) #else void PB_CInOutV2( TYPE, CONJUG, ROWCOL, M, N, KA, DESCA, K, Y, IY, JY, DESCY, YROC, YAPTR, DYA, YAFREE, YASUM, YAPBY ) /* * .. Scalar Arguments .. */ char * CONJUG, * ROWCOL, * YROC; int * YAPBY, * YAFREE, IY, JY, K, KA, M, N, * YASUM; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCY, * DYA; char * Y, * * YAPTR; #endif { /* * Purpose * ======= * * PB_CInOutV2 returns a pointer to an array that contains a one-dimen- * sional input/output subvector which is replicated over the rows or * columns of a submatrix described by DESCA. A subvector is specified * on input to this routine that is reused whenever possible. On return, * the subvector is specified by a pointer to some data, a descriptor * array describing its layout, a logical value indicating if this local * piece of data has been dynamically allocated by this function, a lo- * gical value specifying if sum reduction should occur, and finally a * logical value specifying if it is necessary to copy back the alloca- * ted data to the original data. This routine is specifically designed * for traditional Level 2 like PBLAS operations using an input/output * vector such as PxTRSV. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies if this routine should return * the conjugate subvector as follows: * = 'N' or 'n': The initial subvector is returned, * = 'Z' or 'z': The conjugate subvector is returned. * * ROWCOL (global input) pointer to CHAR * On entry, ROWCOL specifies if this routine should return a * row or column subvector replicated over the underlying subma- * trix as follows: * = 'R' or 'r': A row subvector is returned, * = 'C' or 'c': A column subvector is returned. * * M (global input) INTEGER * On entry, M specifies the number of rows of the underlying * submatrix described by DESCA. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the underlying * submatrix described by DESCA. N must be at least zero. * * KA (global input) INTEGER * On entry, KA specifies a global row index when ROWCOL is 'R' * or 'r' and a global column index otherwise. This index deter- * mines a process row or column in which the output subvector * contains a copy of the input subvector. * * DESCA (global and local input/output) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. EXCEPTIONALLY, THIS * INTERNAL ROUTINE MAY MODIFY DESCA IN ORDER TO MINIMIZE THE * AMOUNT OF DATA TO BE MOVED FOR THE VECTOR Y. SEE PxGEMV FOR * AN EXAMPLE. * * K (global input) INTEGER * On entry, K specifies the length of the non-distributed di- * mension of the subvector sub( Y ). K must be at least zero. * * Y (local input) pointer to CHAR * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( K, IY ) ) when YROC is 'R' or 'r' * and MAX( 1, Lr( 1, IY+Ly-1 ) ) otherwise, and, Ky is at * least Lc( 1, JY+Ly-1 ) when YROC is 'R' or 'r' and * Lc( K, JY ) otherwise. Ly is N when ROWCOL is 'R' or 'r' and * M otherwise. Before entry, this array contains the local * entries of the matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * YROC (global input) pointer to CHAR * On entry, YROC specifies the orientation of the subvector * sub( Y ). When YROC is 'R' or 'r', sub( Y ) is a row vector, * and a column vector otherwise. * * YAPTR (local output) pointer to pointer to CHAR * On exit, * YAPTR is an array containing the same data as the * subvector sub( Y ) which is replicated over the rows or co- * lumns of the underlying matrix as specified by ROWCOL and * DESCA. * * DYA (global and local output) INTEGER array * On exit, DYA is a descriptor array of dimension DLEN_ descri- * bing the data layout of the data pointed to by * YAPTR. * * YAFREE (local output) INTEGER * On exit, YAFREE specifies if it was possible to reuse the * subvector sub( Y ), i.e., if some dynamic memory was alloca- * ted for the data pointed to by * YAPTR or not. When YAFREE is * zero, no dynamic memory was allocated. Otherwise, some dyna- * mic memory was allocated by this function that one MUST re- * lease as soon as possible. * * YASUM (global output) INTEGER * On exit, YASUM specifies if a global sum reduction should be * performed to obtain the correct sub( Y ). When YASUM is zero, * no reduction is to be performed, otherwise reduction should * occur. * * YAPBY (global output) INTEGER * On exit, YAPBY specifies if the data pointed to by * YAPTR * must be move back onto sub( Y ) to obtain the correct result. * When YAPBY is zero, no supplementary data movement is neces- * sary, otherwise a data redistribution should occur. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Acoldst, Aimb, Ainb, AisD, AisR, Amb, Amp, Anb, Anq, Arow, Arowdst, Ycol, Yii, Yimb, Yimb1, Yinb, Yinb1, YisD, YisR, YisRow, Yjj, Yld, Ymb, Ymp, Ynb, Ynq, Yrow, ctxt, izero=0, nprow, myrow, npcol, mycol; /* .. * .. Executable Statements .. * */ /* * Initialize the output parameters to a default value */ *YAFREE = 0; *YASUM = 0; *YAPBY = 0; *YAPTR = NULL; /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) || ( K <= 0 ) ) { if( Mupcase( ROWCOL[0] ) == CROW ) { PB_Cdescset( DYA, K, N, 1, DESCA[INB_], 1, DESCA[NB_], DESCA[RSRC_], DESCA[CSRC_], DESCA[CTXT_], 1 ); } else { PB_Cdescset( DYA, M, K, DESCA[IMB_], 1, DESCA[MB_], 1, DESCA[RSRC_], DESCA[CSRC_], DESCA[CTXT_], DESCA[LLD_] ); } return; } /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCY[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol */ Minfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, Yii, Yjj, Yrow, Ycol ); /* * Is sub( Y ) distributed or not, replicated or not ? */ if( ( YisRow = ( Mupcase( YROC[0] ) == CROW ) ) != 0 ) { YisD = ( ( Ycol >= 0 ) && ( npcol > 1 ) ); YisR = ( ( Yrow == -1 ) || ( nprow == 1 ) ); } else { YisD = ( ( Yrow >= 0 ) && ( nprow > 1 ) ); YisR = ( ( Ycol == -1 ) || ( npcol == 1 ) ); } Aimb = DESCA[ IMB_ ]; Ainb = DESCA[ INB_ ]; Amb = DESCA[ MB_ ]; Anb = DESCA[ NB_ ]; Arow = DESCA[ RSRC_ ]; Acol = DESCA[ CSRC_ ]; if( Mupcase( ROWCOL[0] ) == CROW ) { /* * Want a row vector */ AisR = ( ( Arow < 0 ) || ( nprow == 1 ) ); /* * Figure out in which process row sub( Y ) or a copy of it should be found */ Arowdst = PB_Cindxg2p( KA, Aimb, Amb, Arow, Arow, nprow ); if( YisRow && ( Mupcase( CONJUG[0] ) == CNOCONJG ) ) { /* * It is possible to reuse sub( Y ) iff sub( Y ) is already a row vector and * the data does not need to be conjugated. */ AisD = ( ( Acol >= 0 ) && ( npcol > 1 ) ); Yinb = DESCY[INB_]; Ynb = DESCY[NB_]; Yinb1 = PB_Cfirstnb( N, JY, Yinb, Ynb ); /* * sub( Y ) is aligned with A (reuse condition) iff both operands are not * distributed, or both of them are distributed and start in the same process * column and either N is smaller than the first blocksize of sub( Y ) and A, * or their column blocking factors match. */ if( ( !AisD && !YisD ) || ( ( AisD && YisD ) && ( ( Acol == Ycol ) && ( ( ( Ainb >= N ) && ( Yinb1 >= N ) ) || ( ( Ainb == Yinb1 ) && ( Anb == Ynb ) ) ) ) ) ) { Ynq = PB_Cnumroc( N, 0, Yinb1, Ynb, mycol, Ycol, npcol ); Ymp = ( YisR ? K : ( ( myrow == Yrow ) ? K : 0 ) ); Yld = MAX( 1, K ); if( YisR ) { /* * If sub( Y ) is replicated, there is no need to move sub( Y ) after the * operation (*YAPBY = 0), and it can be reused where needed and zeroed out * elsewhere. */ *YASUM = ( AisR ? 0 : ( nprow > 1 ) ); *YAPBY = 0; Yld = DESCY[ LLD_ ]; if( Ynq > 0 ) { *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); if( !AisR && ( myrow != Arowdst ) ) TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Ynq, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } else { /* * sub( Y ) is not replicated, the descriptor of A may need to be modified ... */ if( AisR ) { /* * If A is replicated, use only the copy in the process row where sub( Y ) * resides -> modify DESCA !!! */ *YASUM = 0; *YAPBY = 0; Yld = DESCY[ LLD_ ]; DESCA[ IMB_ ] = M; DESCA[ RSRC_ ] = Yrow; if( ( Ynq > 0 ) && ( Ymp > 0 ) ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); } else { if( PB_Cspan( M, 0, Aimb, Amb, Arow, nprow ) ) { /* * Otherwise, A is not replicated, let assume in addition that it spans more * than one process row. */ *YASUM = ( nprow > 1 ); *YAPBY = 0; if( myrow == Yrow ) { /* * If sub( Y ) is not in the desired process row, send it there and zero it. * Otherwise, reuse it. */ Yld = DESCY[ LLD_ ]; if( Ynq > 0 ) { *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); if( Yrow != Arowdst ) { TYPE->Cgesd2d( ctxt, K, Ynq, *YAPTR, Yld, Arowdst, mycol ); TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Ynq, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } } else { /* * Allocate space in the other process rows and initialize to zero. If sub( Y ) * was not in the desired process row, receive it. */ Yld = MAX( 1, K ); if( Ynq > 0 ) { *YAPTR = PB_Cmalloc( K * Ynq * TYPE->size ); *YAFREE = 1; if( ( Yrow != Arowdst ) && ( myrow == Arowdst ) ) TYPE->Cgerv2d( ctxt, K, Ynq, *YAPTR, Yld, Yrow, mycol ); else TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Ynq, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } } else { /* * A spans only one process row */ if( Yrow == Arow ) { /* * If A and sub( Y ) resides in the same process row, things are easy. */ *YASUM = 0; *YAPBY = 0; Yld = DESCY[ LLD_ ]; if( ( myrow == Yrow ) && ( Ynq > 0 ) ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); } else { /* * Otherwise, sub( Y ) resides in another process row, thus allocate zero-data * in process row where a copy of sub( Y ) is desired, and receive it. Set * *YAPBY to 1, so that this data will be added (moved) after the local * operation has been performed. */ *YASUM = 0; *YAPBY = 1; if( Ynq > 0 ) { if( myrow == Yrow ) { Yld = DESCY[ LLD_ ]; TYPE->Cgesd2d( ctxt, K, Ynq, Mptr( Y, Yii, Yjj, Yld, TYPE->size ), Yld, Arowdst, mycol ); } else if( myrow == Arowdst ) { Yld = MAX( 1, K ); *YAPTR = PB_Cmalloc( K*Ynq*TYPE->size ); *YAFREE = 1; TYPE->Cgerv2d( ctxt, K, Ynq, *YAPTR, Yld, Yrow, mycol ); } } Yrow = Arowdst; } } } } /* * Describe the resulting operand. Note that when reduction should occur, Yrow * contains the destination row. Assuming every process row needs the result, * Yrow is then -1. */ PB_Cdescset( DYA, K, N, K, Yinb1, 1, Ynb, Yrow, Ycol, ctxt, Yld ); return; } } /* * sub( Y ) cannot be reused, force YAPBY to 1 for the later update of sub( Y ). */ *YAPBY = 1; Anq = PB_Cnumroc( N, 0, Ainb, Anb, mycol, Acol, npcol ); Yld = MAX( 1, K ); if( YisR ) { /* * If sub( Y ) is replicated, allocate space in every process row owning some * columns of A and initialize it to zero only where needed. There may be some * wasted space (suppose A was residing in just one row), however, it is hoped * that moving back this data to sub( Y ) will then be cheaper ... */ *YASUM = ( AisR ? 0 : ( nprow > 1 ) ); if( Anq > 0 ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; if( ( Arowdst >= 0 ) && ( myrow != Arowdst ) ) TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Anq, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } else { /* * sub( Y ) resides in only one process row */ if( AisR ) { /* * If A is replicated, then modify sub( A ) so that only one process row will * compute the result before moving it back to sub( Y ). */ *YASUM = 0; DESCA[ IMB_ ] = M; if( YisRow ) { /* * Choose a different process row than Yrow for better performance (more links) * in the later move-back phase. */ DESCA[RSRC_] = MModSub1( Yrow, nprow ); } else { DESCA[RSRC_] = 0; } if( ( myrow == ( Arowdst = DESCA[RSRC_] ) ) && ( Anq > 0 ) ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; } } else { if( PB_Cspan( M, 0, Aimb, Amb, Arow, nprow ) ) { /* * If A is not replicated, and spans more than just one process row, then * allocate space in every process row and zero it where needed. */ *YASUM = ( nprow > 1 ); if( Anq > 0 ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; if( myrow != Arowdst ) TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Anq, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } else { /* * If A is not replicated, and spans only one process row, then allocate space * within that process row. */ *YASUM = 0; if( ( myrow == Arowdst ) && ( Anq > 0 ) ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; } } } } /* * Describe the resulting operand. Note that when reduction should occur, * Arowdst contains the destination row. Assuming every process row needs the * result, Arowdst is then -1. */ PB_Cdescset( DYA, K, N, K, Ainb, 1, Anb, Arowdst, Acol, ctxt, Yld ); /* * Move sub( Y ) in the desired processes and with the correct layout */ if( YisRow ) { PB_Cpaxpby( TYPE, CONJUG, K, N, TYPE->one, Y, IY, JY, DESCY, ROW, TYPE->zero, *YAPTR, 0, 0, DYA, ROW ); } else { PB_Cpaxpby( TYPE, CONJUG, N, K, TYPE->one, Y, IY, JY, DESCY, COLUMN, TYPE->zero, *YAPTR, 0, 0, DYA, ROW ); } } else { /* * Want a column vector with original data in col KA */ AisR = ( ( Acol < 0 ) || ( npcol == 1 ) ); /* * Figure out in which process column sub( Y ) or a copy of it should be found. */ Acoldst = PB_Cindxg2p( KA, Ainb, Anb, Acol, Acol, npcol ); if( !( YisRow ) && ( Mupcase( CONJUG[0] ) == CNOCONJG ) ) { /* * It is possible to reuse sub( Y ) iff sub( Y ) is already a column vector and * the data does not need to be conjugated. */ AisD = ( ( Arow >= 0 ) && ( nprow > 1 ) ); Yimb = DESCY[IMB_]; Ymb = DESCY[MB_]; Yimb1 = PB_Cfirstnb( M, IY, Yimb, Ymb ); /* * sub( Y ) is aligned with A (reuse condition) iff both operands are not * distributed, or both of them are distributed and start in the same process * row and either M is smaller than the first blocksize of sub( Y ) and A, or * their row blocking factors match. */ if( ( !AisD && !YisD ) || ( ( AisD && YisD ) && ( ( Arow == Yrow ) && ( ( ( Aimb >= M ) && ( Yimb1 >= M ) ) || ( ( Aimb == Yimb1 ) && ( Amb == Ymb ) ) ) ) ) ) { Ymp = PB_Cnumroc( M, 0, Yimb1, Ymb, myrow, Yrow, nprow ); Ynq = ( YisR ? K : ( ( mycol == Ycol ) ? K : 0 ) ); Yld = MAX( 1, Ymp ); if( YisR ) { /* * If sub( Y ) is replicated, there is no need to move sub( Y ) after the * operation (*YAPBY = 0), and it can be reused where needed and zeroed out * elsewhere. */ *YASUM = ( AisR ? 0 : ( npcol > 1 ) ); *YAPBY = 0; Yld = DESCY[ LLD_ ]; if( Ymp > 0 ) { *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); if( !AisR && ( mycol != Acoldst ) ) TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ymp, &K, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } else { /* * sub( Y ) is not replicated, the descriptor of A may need to be modified ... */ if( AisR ) { /* * If A is replicated, use only the copy in the process column where sub( Y ) * resides -> modify DESCA !!! */ *YASUM = 0; *YAPBY = 0; Yld = DESCY[ LLD_ ]; DESCA[ INB_ ] = N; DESCA[ CSRC_ ] = Ycol; if( ( Ymp > 0 ) && ( Ynq > 0 ) ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); } else { if( PB_Cspan( N, 0, Ainb, Anb, Acol, npcol ) ) { /* * Otherwise, A is not replicated, let assume in addition that it spans more * than one process column. */ *YASUM = ( npcol > 1 ); *YAPBY = 0; if( mycol == Ycol ) { /* * If sub( Y ) is not in the desired process column, send it there and zero it. * Otherwise, reuse it. */ Yld = DESCY[ LLD_ ]; if( Ymp > 0 ) { *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); if( Ycol != Acoldst ) { TYPE->Cgesd2d( ctxt, Ymp, K, *YAPTR, Yld, myrow, Acoldst ); TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ymp, &K, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } } else { /* * Allocate space in the other process columns and initialize to zero. If * sub( Y ) was not in the desired process column, receive it. */ Yld = MAX( 1, Ymp ); if( Ymp > 0 ) { *YAPTR = PB_Cmalloc( Ymp * K * TYPE->size ); *YAFREE = 1; if( ( Ycol != Acoldst ) && ( mycol == Acoldst ) ) TYPE->Cgerv2d( ctxt, Ymp, K, *YAPTR, Yld, myrow, Ycol ); else TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ymp, &K, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } } else { /* * A spans only one process column */ if( Ycol == Acol ) { /* * If A and sub( Y ) resides in the same process column, things are easy. */ *YASUM = 0; *YAPBY = 0; Yld = DESCY[ LLD_ ]; if( ( mycol == Ycol ) && ( Ymp > 0 ) ) *YAPTR = Mptr( Y, Yii, Yjj, Yld, TYPE->size ); } else { /* * Otherwise, sub( Y ) resides in another process column, thus allocate * zero-data in process column where a copy of sub( Y ) is desired, and receive * it. Set *YAPBY to 1, so that this data will be added (moved) after the local * operation has been performed. */ *YASUM = 0; *YAPBY = 1; if( Ymp > 0 ) { if( mycol == Ycol ) { Yld = DESCY[ LLD_ ]; TYPE->Cgesd2d( ctxt, Ymp, K, Mptr( Y, Yii, Yjj, Yld, TYPE->size ), Yld, myrow, Acoldst ); } else if( mycol == Acoldst ) { Yld = MAX( 1, Ymp ) ; *YAPTR = PB_Cmalloc( Ymp * K * TYPE->size ); *YAFREE = 1; TYPE->Cgerv2d( ctxt, Ymp, K, *YAPTR, Yld, myrow, Ycol ); } } Ycol = Acoldst; } } } } /* * Describe the resulting operand. Note that when reduction should occur, Ycol * contains the destination column. Assuming every process column needs the * result, Ycol is then -1. */ PB_Cdescset( DYA, M, K, Yimb1, K, Ymb, 1, Yrow, Ycol, ctxt, Yld ); return; } } /* * sub( Y ) cannot be reused, force YAPBY to 1 for the later update of sub( Y ). */ *YAPBY = 1; Amp = PB_Cnumroc( M, 0, Aimb, Amb, myrow, Arow, nprow ); Yld = MAX( 1, Amp ); if( YisR ) { /* * If sub( Y ) is replicated, allocate space in every process column owning some * columns of A and initialize it to zero only where needed. There may be some * wasted space (suppose A was residing in just one column), however, it is * hoped that moving back this data to sub( Y ) will then be cheaper ... */ *YASUM = ( AisR ? 0 : ( npcol > 1 ) ); if( Amp > 0 ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; if( ( Acoldst >= 0 ) && ( mycol != Acoldst ) ) TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &K, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } else { /* * sub( Y ) resides in only one process column */ if( AisR ) { /* * If A is replicated, then modify sub( A ) so that only one process column will * compute the result before moving it back to sub( Y ). */ *YASUM = 0; DESCA[ INB_ ] = N; if( YisRow ) { DESCA[ CSRC_ ] = 0; } else { /* * Choose a different process column than Ycol for better performance (more * links) in the later move-back phase. */ DESCA[ CSRC_ ] = MModSub1( Ycol, npcol ); } if( ( mycol == ( Acoldst = DESCA[CSRC_] ) ) && ( Amp > 0 ) ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; } } else { if( PB_Cspan( N, 0, Ainb, Anb, Acol, npcol ) ) { /* * If A is not replicated, and spans more than just one process column, then * allocate space in every process column and zero it where needed. */ *YASUM = ( npcol > 1 ); if( Amp > 0 ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; if( mycol != Acoldst ) TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &K, &izero, TYPE->zero, TYPE->zero, *YAPTR, &Yld ); } } else { /* * If A is not replicated, and spans only one process column, then allocate * space within that process column. */ *YASUM = 0; if( ( mycol == Acoldst ) && ( Amp > 0 ) ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; } } } } /* * Describe the resulting operand. Note that when reduction should occur, * Acoldst contains the destination column. Assuming every process column needs * the result, Acoldst is then -1. */ PB_Cdescset( DYA, M, K, Aimb, K, Amb, 1, Arow, Acoldst, ctxt, Yld ); /* * Move sub( Y ) in the desired processes and with the correct layout */ if( YisRow ) { PB_Cpaxpby( TYPE, CONJUG, K, M, TYPE->one, Y, IY, JY, DESCY, ROW, TYPE->zero, *YAPTR, 0, 0, DYA, COLUMN ); } else { PB_Cpaxpby( TYPE, CONJUG, M, K, TYPE->one, Y, IY, JY, DESCY, COLUMN, TYPE->zero, *YAPTR, 0, 0, DYA, COLUMN ); } } /* * End of PB_CInOutV2 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CInV.c000644 000766 000024 00000050701 10363532303 017546 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CInV( PBTYP_T * TYPE, char * CONJUG, char * ROWCOL, int M, int N, int * DESCA, int K, char * X, int IX, int JX, int * DESCX, char * XROC, char * * XAPTR, int * DXA, int * XAFREE ) #else void PB_CInV( TYPE, CONJUG, ROWCOL, M, N, DESCA, K, X, IX, JX, DESCX, XROC, XAPTR, DXA, XAFREE ) /* * .. Scalar Arguments .. */ char * CONJUG, * ROWCOL, * XROC; int * XAFREE, IX, JX, K, M, N; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DXA; char * X, * * XAPTR; #endif { /* * Purpose * ======= * * PB_CInV returns a pointer to an array that contains a one-dimensional * input only subvector which is replicated over the rows or columns of * a submatrix described by DESCA. A subvector is specified on input to * this routine that is reused whenever possible. On return, the subvec- * tor is specified by a pointer to some data, a descriptor array des- * cribing its layout and a logical value indicating if this local piece * of data has been dynamically allocated by this function. This routine * is specifically designed for traditional Level 2 like PBLAS opera- * tions using an input only vector such as PxGER, PxSYR ... * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies if this routine should return * the conjugate subvector as follows: * = 'N' or 'n': The initial subvector is returned, * = 'Z' or 'z': The conjugate subvector is returned. * * ROWCOL (global input) pointer to CHAR * On entry, ROWCOL specifies if this routine should return a * row or column subvector replicated over the underlying subma- * trix as follows: * = 'R' or 'r': A row subvector is returned, * = 'C' or 'c': A column subvector is returned. * * M (global input) INTEGER * On entry, M specifies the number of rows of the underlying * submatrix described by DESCA. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the underlying * submatrix described by DESCA. N must be at least zero. * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * K (global input) INTEGER * On entry, K specifies the length of the non-distributed di- * mension of the subvector sub( X ). K must be at least zero. * * X (local input) pointer to CHAR * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( K, IX ) ) when XROC is 'R' or 'r' * and MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( K, JX ) otherwise. * Lx is N when ROWCOL = 'R' or 'r' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * XROC (global input) pointer to CHAR * On entry, XROC specifies the orientation of the subvector * sub( X ). When XROC is 'R' or 'r', sub( X ) is a row vector, * and a column vector otherwise. * * XAPTR (local output) pointer to pointer to CHAR * On exit, * XAPTR is an array containing the same data as the * subvector sub( X ) which is replicated over the rows or co- * lumns of the underlying matrix as specified by ROWCOL and * DESCA. * * DXA (global and local output) INTEGER array * On exit, DXA is a descriptor array of dimension DLEN_ descri- * bing the data layout of the data pointed to by * XAPTR. * * XAFREE (local output) INTEGER * On exit, XAFREE specifies if it has been possible to reuse * the subvector sub( X ), i.e., if some dynamic memory was al- * located for the data pointed to by *XAPTR or not. When XAFREE * is zero, no dynamic memory was allocated. Otherwise, some dy- * namic memory was allocated by this function that one MUST re- * lease as soon as possible. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * top; int AColSpan, ARowSpan, Acol, Aimb, Ainb, AisD, Amb, Amp, Anb, Anq, Arow, Xcol, Xii, Ximb, Ximb1, Xinb, Xinb1, XisD, XisR, XisRow, Xjj, Xld=1, Xmb, Xmp, Xnb, Xnq, Xrow, ctxt, mycol, myrow, npcol, nprow; /* .. * .. Executable Statements .. * */ /* * Initialize the output parameters to a default value */ *XAFREE = 0; *XAPTR = NULL; /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) || ( K <= 0 ) ) { if( Mupcase( ROWCOL[0] ) == CROW ) { PB_Cdescset( DXA, K, N, 1, DESCA[INB_], 1, DESCA[NB_], DESCA[RSRC_], DESCA[CSRC_], DESCA[CTXT_], 1 ); } else { PB_Cdescset( DXA, M, K, DESCA[IMB_], 1, DESCA[MB_], 1, DESCA[RSRC_], DESCA[CSRC_], DESCA[CTXT_], DESCA[LLD_] ); } return; } /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Ycol */ Minfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, Xii, Xjj, Xrow, Xcol ); /* * Is sub( X ) distributed or not, replicated or not ? */ if( ( XisRow = ( Mupcase( XROC[0] ) == CROW ) ) != 0 ) { XisD = ( ( Xcol >= 0 ) && ( npcol > 1 ) ); XisR = ( ( Xrow == -1 ) || ( nprow == 1 ) ); } else { XisD = ( ( Xrow >= 0 ) && ( nprow > 1 ) ); XisR = ( ( Xcol == -1 ) || ( npcol == 1 ) ); } Arow = DESCA[ RSRC_ ]; Acol = DESCA[ CSRC_ ]; if( Mupcase( ROWCOL[0] ) == CROW ) { /* * Want a row vector */ Ainb = DESCA[ INB_ ]; Anb = DESCA[ NB_ ]; Mnumroc( Anq, N, 0, Ainb, Anb, mycol, Acol, npcol ); /* * Does A spans multiples process rows ? It does if Arow < 0. */ ARowSpan = ( Arow < 0 ) || Mspan( M, 0, DESCA[IMB_], DESCA[MB_], Arow, nprow ); if( XisRow && ( Mupcase( CONJUG[0] ) == CNOCONJG ) ) { /* * It is possible to reuse sub( X ) iff sub( X ) is already a row vector and * the data does not need to be conjugated. */ AisD = ( ( Acol >= 0 ) && ( npcol > 1 ) ); Xinb = DESCX[ INB_ ]; Xnb = DESCX[ NB_ ]; Mfirstnb( Xinb1, N, JX, Xinb, Xnb ); /* * sub( X ) is aligned with A (reuse condition) iff both operands are not * distributed, or both of them are distributed and start in the same process * column and either N is smaller than the first blocksize of sub( X ) and A, * or their column blocking factors match. */ if( ( !AisD && !XisD ) || ( ( AisD && XisD ) && ( ( Acol == Xcol ) && ( ( ( Ainb >= N ) && ( Xinb1 >= N ) ) || ( ( Ainb == Xinb1 ) && ( Anb == Xnb ) ) ) ) ) ) { /* * sub( X ) is aligned with A */ Ximb = DESCX[ IMB_ ]; Xmb = DESCX[ MB_ ]; Mfirstnb( Ximb1, K, IX, Ximb, Xmb ); if( XisR || ( !ARowSpan && ( Arow == Xrow ) ) ) { /* * If sub( X ) is replicated, or, A spans only one process row and either * sub( X ) is replicated or resides in the same process row than A, then * sub( X ) is already at the correct place. */ if( Anq > 0 ) { Xld = DESCX[ LLD_ ]; if( ARowSpan || ( myrow == Arow ) ) *XAPTR = Mptr( X, Xii, Xjj, Xld, TYPE->size ); } else { Xld = 1; } MDescSet( DXA, K, N, K, Xinb1, 1, Xnb, ( ARowSpan ? -1 : Arow ), Xcol, ctxt, Xld ); } else if( ARowSpan ) { /* * Otherwise, we know that sub( X ) cannot be replicated, let suppose in * addition that A spans all process rows. sub( X ) need simply to be broadcast * over A. */ if( myrow == Xrow ) { Xld = DESCX[ LLD_ ]; if( Anq > 0 ) { *XAPTR = Mptr( X, Xii, Xjj, Xld, TYPE->size ); top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); TYPE->Cgebs2d( ctxt, COLUMN, top, K, Anq, *XAPTR, Xld ); } } else { Xld = MAX( 1, K ); if( Anq > 0 ) { *XAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *XAFREE = 1; top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); TYPE->Cgebr2d( ctxt, COLUMN, top, K, Anq, *XAPTR, Xld, Xrow, mycol ); } } PB_Cdescset( DXA, K, N, K, Xinb1, 1, Xnb, -1, Xcol, ctxt, Xld ); } else { /* * Finally, sub( X ) is not replicated and A spans only one process row. There * is no need to broadcast, a send/recv is sufficient. */ if( myrow == Xrow ) { Xld = DESCX[ LLD_ ]; if( Anq > 0 ) { *XAPTR = Mptr( X, Xii, Xjj, Xld, TYPE->size ); TYPE->Cgesd2d( ctxt, K, Anq, *XAPTR, Xld, Arow, mycol ); } } else if( myrow == Arow ) { Xld = MAX( 1, K ); if( Anq > 0 ) { *XAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *XAFREE = 1; TYPE->Cgerv2d( ctxt, K, Anq, *XAPTR, Xld, Xrow, mycol ); } } PB_Cdescset( DXA, K, N, K, Xinb1, 1, Xnb, Arow, Xcol, ctxt, Xld ); } return; } } /* * sub( X ) cannot be reused, too bad ... redistribute */ PB_Cdescset( DXA, K, N, K, Ainb, 1, Anb, ( ARowSpan ? -1 : Arow ), Acol, ctxt, K ); Xmp = ( ARowSpan ? K : ( ( myrow == Arow ) ? K : 0 ) ); if( Xmp > 0 && Anq > 0 ) { *XAPTR = PB_Cmalloc( Anq * Xmp * TYPE->size ); *XAFREE = 1; } if( XisRow ) { PB_Cpaxpby( TYPE, CONJUG, K, N, TYPE->one, X, IX, JX, DESCX, XROC, TYPE->zero, *XAPTR, 0, 0, DXA, ROW ); } else { PB_Cpaxpby( TYPE, CONJUG, N, K, TYPE->one, X, IX, JX, DESCX, XROC, TYPE->zero, *XAPTR, 0, 0, DXA, ROW ); } } else { /* * Want a column vector */ Aimb = DESCA[IMB_]; Amb = DESCA[MB_]; Mnumroc( Amp, M, 0, Aimb, Amb, myrow, Arow, nprow ); /* * Does A spans multiples process columns ? It does if Acol < 0. */ AColSpan = ( Acol < 0 ) || Mspan( N, 0, DESCA[INB_], DESCA[NB_], Acol, npcol ); if( !( XisRow ) && ( Mupcase( CONJUG[0] ) == CNOCONJG ) ) { /* * It is possible to reuse sub( X ) iff sub( X ) is already a column vector and * the data does not need to be conjugated */ AisD = ( ( Arow >= 0 ) && ( nprow > 1 ) ); Ximb = DESCX[ IMB_ ]; Xmb = DESCX[ MB_ ]; Mfirstnb( Ximb1, M, IX, Ximb, Xmb ); /* * sub( X ) is aligned with A (reuse condition) iff both operands are not * distributed, or both of them are distributed and start in the same process * row and either M is smaller than the first blocksize of sub( X ) and A, or * their row blocking factors match. */ if( ( !AisD && !XisD ) || ( ( AisD && XisD ) && ( ( Arow == Xrow ) && ( ( ( Aimb >= M ) && ( Ximb1 >= M ) ) || ( ( Aimb == Ximb1 ) && ( Amb == Xmb ) ) ) ) ) ) { /* * sub( X ) is aligned with A */ Xinb = DESCX[ INB_ ]; Xnb = DESCX[ NB_ ]; Mfirstnb( Xinb1, K, JX, Xinb, Xnb ); if( XisR || ( !AColSpan && ( Acol == Xcol ) ) ) { /* * If sub( X ) is replicated, or, A spans only one process column and either * sub( X ) is replicated or resides in the same process columns than A, then * sub( X ) is already at the correct place. */ if( Amp > 0 ) { Xld = DESCX[ LLD_ ]; if( AColSpan || ( mycol == Acol ) ) *XAPTR = Mptr( X, Xii, Xjj, Xld, TYPE->size ); } else { Xld = 1; } MDescSet( DXA, M, K, Ximb1, K, Xmb, 1, Xrow, ( AColSpan ? -1 : Acol ), ctxt, Xld ); } else if( AColSpan ) { /* * Otherwise, we know that sub( X ) is not be replicated, let suppose in * addition that A spans all process columns. sub( X ) need simply to be * broadcast over A. */ if( mycol == Xcol ) { Xld = DESCX[ LLD_ ]; if( Amp > 0 ) { *XAPTR = Mptr( X, Xii, Xjj, Xld, TYPE->size ); top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); TYPE->Cgebs2d( ctxt, ROW, top, Amp, K, *XAPTR, Xld ); } } else { Xld = MAX( 1, Amp ); if( Amp > 0 ) { *XAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *XAFREE = 1; top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); TYPE->Cgebr2d( ctxt, ROW, top, Amp, K, *XAPTR, Xld, myrow, Xcol ); } } PB_Cdescset( DXA, M, K, Ximb1, K, Xmb, 1, Xrow, -1, ctxt, Xld ); } else { /* * Finally, sub( X ) is not replicated and A spans only one process column. * There is no need to broadcast, a send/recv is sufficient. */ if( mycol == Xcol ) { Xld = DESCX[ LLD_ ]; if( Amp > 0 ) { *XAPTR = Mptr( X, Xii, Xjj, Xld, TYPE->size ); TYPE->Cgesd2d( ctxt, Amp, K, *XAPTR, Xld, myrow, Acol ); } } else if( mycol == Acol ) { Xld = MAX( 1, Amp ); if( Amp > 0 ) { *XAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *XAFREE = 1; TYPE->Cgerv2d( ctxt, Amp, K, *XAPTR, Xld, myrow, Xcol ); } } PB_Cdescset( DXA, M, K, Ximb1, K, Xmb, 1, Xrow, Acol, ctxt, Xld ); } return; } } /* * sub( X ) cannot be reused, too bad ... redistribute */ PB_Cdescset( DXA, M, K, Aimb, K, Amb, 1, Arow, ( AColSpan ? -1 : Acol ), ctxt, MAX( 1, Amp ) ); Xnq = ( AColSpan ? K : ( ( mycol == Acol ) ? K : 0 ) ); if( Xnq > 0 && Amp > 0 ) { *XAPTR = PB_Cmalloc( Amp * Xnq * TYPE->size ); *XAFREE = 1; } if( XisRow ) { PB_Cpaxpby( TYPE, CONJUG, K, M, TYPE->one, X, IX, JX, DESCX, XROC, TYPE->zero, *XAPTR, 0, 0, DXA, COLUMN ); } else { PB_Cpaxpby( TYPE, CONJUG, M, K, TYPE->one, X, IX, JX, DESCX, XROC, TYPE->zero, *XAPTR, 0, 0, DXA, COLUMN ); } } /* * End of PB_CInV */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CInV2.c000644 000766 000024 00000044747 10363532303 017645 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CInV2( PBTYP_T * TYPE, char * CONJUG, char * ROWCOL, int M, int N, int * DESCA, int K, char * X, int IX, int JX, int * DESCX, char * XROC, char * XAPTR, int IJXA, int * DXA ) #else void PB_CInV2( TYPE, CONJUG, ROWCOL, M, N, DESCA, K, X, IX, JX, DESCX, XROC, XAPTR, IJXA, DXA ) /* * .. Scalar Arguments .. */ char * CONJUG, * ROWCOL, * XROC; int IJXA, IX, JX, K, M, N; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCX, * DXA; char * X, * XAPTR; #endif { /* * Purpose * ======= * * PB_CInV2 adds data to an array that contains a one-dimensional input * only subvector which is replicated over the rows or columns of a sub- * matrix described by DESCA. A subvector is specified on input to this * routine that is added to the replicated buffer. This routine is spe- * cifically designed for LCM hybrid variants. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies if this routine should conjugate * the subvector as follows: * = 'N' or 'n': The initial subvector is copied, * = 'Z' or 'z': The conjugate subvector is copied. * * ROWCOL (global input) pointer to CHAR * On entry, ROWCOL specifies if the existing buffer pointed to * XAPTR is a row or column subvector replicated over the under- * lying submatrix as follows: * = 'R' or 'r': XAPTR is a row subvector, * = 'C' or 'c': XAPTR is a column subvector. * * M (global input) INTEGER * On entry, M specifies the number of rows of the underlying * submatrix described by DESCA. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the underlying * submatrix described by DESCA. N must be at least zero. * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * K (global input) INTEGER * On entry, K specifies the length of the non-distributed di- * mension of the subvector sub( X ). K must be at least zero. * * X (local input) pointer to CHAR * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( K, IX ) ) when XROC is 'R' or 'r' * and MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( K, JX ) otherwise. * Lx is N when ROWCOL = 'R' or 'r' and M otherwise. Before en- * try, this array contains the local entries of the matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * XROC (global input) pointer to CHAR * On entry, XROC specifies the orientation of the subvector * sub( X ). When XROC is 'R' or 'r', sub( X ) is a row vector, * and a column vector otherwise. * * XAPTR (local input/local output) pointer to CHAR * On entry, XAPTR is an array containing some initial data. On * exit, the subvector sub( X ) is copied into this array which * is replicated over the rows or columns of the underlying ma- * trix as specified by ROWCOL and DESCA. * * IJXA (global input) INTEGER * On entry, IJXA specifies XA global row or column index depen- * ding on ROWCOL in the array pointed to by XAPTR, where the * subvector sub( X ) should copied. * * DXA (global and local input) INTEGER array * On entry, DXA is a descriptor array of dimension DLEN_ des- * cribing the data layout of the data pointed to by XAPTR. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * Xptr = NULL, * top; int AColSpan, ARowSpan, Acol, Aimb, Ainb, AisD, Amb, Amp, Anb, Anq, Arow, XAld, Xcol, Xii, Ximb1, Xinb1, XisD, XisR, XisRow, Xjj, Xld=1, Xmb, Xnb, Xrow, ctxt, mycol, myrow, npcol, nprow, size; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) || ( K <= 0 ) ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Ycol */ Minfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, Xii, Xjj, Xrow, Xcol ); /* * Is sub( X ) distributed or not, replicated or not ? */ if( ( XisRow = ( Mupcase( XROC[0] ) == CROW ) ) != 0 ) { XisD = ( ( Xcol >= 0 ) && ( npcol > 1 ) ); XisR = ( ( Xrow == -1 ) || ( nprow == 1 ) ); } else { XisD = ( ( Xrow >= 0 ) && ( nprow > 1 ) ); XisR = ( ( Xcol == -1 ) || ( npcol == 1 ) ); } Arow = DESCA[ RSRC_ ]; Acol = DESCA[ CSRC_ ]; if( Mupcase( ROWCOL[0] ) == CROW ) { /* * Want a row vector. It is possible to reuse sub( X ) iff sub( X ) is already * a row vector and the data does not need to be conjugated. */ if( XisRow && ( Mupcase( CONJUG[0] ) == CNOCONJG ) ) { AisD = ( ( Acol >= 0 ) && ( npcol > 1 ) ); Ainb = DESCA[INB_]; Anb = DESCA[NB_]; Xnb = DESCX[NB_]; Mfirstnb( Xinb1, N, JX, DESCX[INB_], Xnb ); /* * sub( X ) is aligned with A (reuse condition) iff both operands are not * distributed, or both of them are distributed and start in the same process * column and either N is smaller than the first blocksize of sub( X ) and A, * or their column blocking factors match. */ if( ( !AisD && !XisD ) || ( ( AisD && XisD ) && ( ( Acol == Xcol ) && ( ( ( Ainb >= N ) && ( Xinb1 >= N ) ) || ( ( Ainb == Xinb1 ) && ( Anb == Xnb ) ) ) ) ) ) { /* * sub( X ) is aligned with A. Does A spans multiples process rows ? It does * if Arow < 0. */ ARowSpan = ( Arow < 0 ) || Mspan( M, 0, DESCA[IMB_], DESCA[MB_], Arow, nprow ); Mnumroc( Anq, N, 0, Ainb, Anb, mycol, Acol, npcol ); if( XisR || ( !ARowSpan && ( Arow == Xrow ) ) ) { /* * If sub( X ) is replicated, or, A spans only one process row and either * sub( X ) is replicated or resides in the same process row than A, then * sub( X ) is already at the correct place. */ if( ( Anq > 0 ) && ( ARowSpan || ( myrow == Arow ) ) ) { size = TYPE->size; Xld = DESCX[ LLD_ ]; XAld = DXA[LLD_]; TYPE->Fmmadd( &K, &Anq, TYPE->one, Mptr( X, Xii, Xjj, Xld, size ), &Xld, TYPE->zero, Mptr( XAPTR, IJXA, 0, XAld, size ), &XAld ); } } else if( ARowSpan ) { /* * Otherwise, we know that sub( X ) cannot be replicated, let suppose in * addition that A spans all process rows. sub( X ) need simply to be broadcast * over A. */ if( myrow == Xrow ) { if( Anq > 0 ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); size = TYPE->size; Xld = DESCX[LLD_]; XAld = DXA[LLD_]; Xptr = Mptr( XAPTR, IJXA, 0, XAld, size ); TYPE->Fmmadd( &K, &Anq, TYPE->one, Mptr( X, Xii, Xjj, Xld, size ), &Xld, TYPE->zero, Xptr, &XAld ); TYPE->Cgebs2d( ctxt, COLUMN, top, K, Anq, Xptr, XAld ); } } else { if( Anq > 0 ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); XAld = DXA[LLD_]; TYPE->Cgebr2d( ctxt, COLUMN, top, K, Anq, Mptr( XAPTR, IJXA, 0, XAld, TYPE->size ), XAld, Xrow, mycol ); } } } else { /* * Finally, sub( X ) is not replicated and A spans only one process row. There * is no need to broadcast, a send/recv is sufficient. */ if( myrow == Xrow ) { if( Anq > 0 ) { Xld = DESCX[LLD_]; TYPE->Cgesd2d( ctxt, K, Anq, Mptr( X, Xii, Xjj, Xld, TYPE->size ), Xld, Arow, mycol ); } } else if( myrow == Arow ) { if( Anq > 0 ) { XAld = DXA[LLD_]; TYPE->Cgerv2d( ctxt, K, Anq, Mptr( XAPTR, IJXA, 0, XAld, TYPE->size ), XAld, Xrow, mycol ); } } } return; } } /* * sub( X ) cannot be reused, too bad ... redistribute */ if( XisRow ) { PB_Cpaxpby( TYPE, CONJUG, K, N, TYPE->one, X, IX, JX, DESCX, XROC, TYPE->zero, XAPTR, IJXA, 0, DXA, ROW ); } else { PB_Cpaxpby( TYPE, CONJUG, N, K, TYPE->one, X, IX, JX, DESCX, XROC, TYPE->zero, XAPTR, IJXA, 0, DXA, ROW ); } } else { /* * Want a column vector. It is possible to reuse sub( X ) iff sub( X ) is * already a column vector and the data does not need to be conjugated */ if( !( XisRow ) && ( Mupcase( CONJUG[0] ) == CNOCONJG ) ) { AisD = ( ( Arow >= 0 ) && ( nprow > 1 ) ); Aimb = DESCA[IMB_]; Amb = DESCA[MB_]; Xmb = DESCX[MB_]; Mfirstnb( Ximb1, M, IX, DESCX[IMB_], Xmb ); /* * sub( X ) is aligned with A (reuse condition) iff both operands are not * distributed, or both of them are distributed and start in the same process * row and either M is smaller than the first blocksize of sub( X ) and A, or * their row blocking factors match. */ if( ( !AisD && !XisD ) || ( ( AisD && XisD ) && ( ( Arow == Xrow ) && ( ( ( Aimb >= M ) && ( Ximb1 >= M ) ) || ( ( Aimb == Ximb1 ) && ( Amb == Xmb ) ) ) ) ) ) { /* * sub( X ) is aligned with A. Does A spans multiples process columns ? It * does if Acol < 0. */ AColSpan = ( Acol < 0 ) || Mspan( N, 0, DESCA[INB_], DESCA[NB_], Acol, npcol ); Mnumroc( Amp, M, 0, Aimb, Amb, myrow, Arow, nprow ); if( XisR || ( !AColSpan && ( Acol == Xcol ) ) ) { /* * If sub( X ) is replicated, or, A spans only one process column and either * sub( X ) is replicated or resides in the same process columns than A, then * sub( X ) is already at the correct place. */ if( ( Amp > 0 ) && ( AColSpan || ( mycol == Acol ) ) ) { size = TYPE->size; Xld = DESCX[ LLD_ ]; XAld = DXA[LLD_]; TYPE->Fmmadd( &Amp, &K, TYPE->one, Mptr( X, Xii, Xjj, Xld, size ), &Xld, TYPE->zero, Mptr( XAPTR, 0, IJXA, XAld, size ), &XAld ); } } else if( AColSpan ) { /* * Otherwise, we know that sub( X ) is not be replicated, let suppose in * addition that A spans all process columns. sub( X ) need simply to be * broadcast over A. */ if( mycol == Xcol ) { if( Amp > 0 ) { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); size = TYPE->size; Xld = DESCX[LLD_]; XAld = DXA[LLD_]; Xptr = Mptr( XAPTR, 0, IJXA, XAld, size ); TYPE->Fmmadd( &Amp, &K, TYPE->one, Mptr( X, Xii, Xjj, Xld, size ), &Xld, TYPE->zero, Xptr, &XAld ); TYPE->Cgebs2d( ctxt, ROW, top, Amp, K, Xptr, XAld ); } } else { if( Amp > 0 ) { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); XAld = DXA[LLD_]; TYPE->Cgebr2d( ctxt, ROW, top, Amp, K, Mptr( XAPTR, 0, IJXA, XAld, TYPE->size ), XAld, myrow, Xcol ); } } } else { /* * Finally, sub( X ) is not replicated and A spans only one process column. * There is no need to broadcast, a send/recv is sufficient. */ if( mycol == Xcol ) { if( Amp > 0 ) { Xld = DESCX[LLD_]; TYPE->Cgesd2d( ctxt, Amp, K, Mptr( X, Xii, Xjj, Xld, TYPE->size ), Xld, myrow, Acol ); } } else if( mycol == Acol ) { if( Amp > 0 ) { XAld = DXA[LLD_]; TYPE->Cgerv2d( ctxt, Amp, K, Mptr( XAPTR, 0, IJXA, XAld, TYPE->size ), XAld, myrow, Xcol ); } } } return; } } /* * sub( X ) cannot be reused, too bad ... redistribute */ if( XisRow ) { PB_Cpaxpby( TYPE, CONJUG, K, M, TYPE->one, X, IX, JX, DESCX, XROC, TYPE->zero, XAPTR, 0, IJXA, DXA, COLUMN ); } else { PB_Cpaxpby( TYPE, CONJUG, M, K, TYPE->one, X, IX, JX, DESCX, XROC, TYPE->zero, XAPTR, 0, IJXA, DXA, COLUMN ); } } /* * End of PB_CInV2 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Citypeset.c000644 000766 000024 00000006066 10363532303 020725 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" PBTYP_T * PB_Citypeset() { /* * Purpose * ======= * * PB_Citypeset on the first call initializes a static structure contai- * ning typed information and returns a pointer to it. The other calls * to this routine just returns this pointer. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ static int setup=0; static PBTYP_T TypeStruct; static int zero, one, negone; /* .. * .. Executable Statements .. * */ if( setup ) return( &TypeStruct ); setup = 1; TypeStruct.type = INT; TypeStruct.usiz = sizeof( int ); TypeStruct.size = sizeof( int ); zero = 0; one = 1; negone = -1; TypeStruct.zero = (char *) (&zero); TypeStruct.one = (char *) (&one); TypeStruct.negone = (char *) (&negone); TypeStruct.Cgesd2d = Cigesd2d; TypeStruct.Cgerv2d = Cigerv2d; TypeStruct.Cgebs2d = Cigebs2d; TypeStruct.Cgebr2d = Cigebr2d; TypeStruct.Cgsum2d = Cigsum2d; TypeStruct.Fmmadd = immadd_; TypeStruct.Fmmcadd = immadd_; TypeStruct.Fmmtadd = immtadd_; TypeStruct.Fmmtcadd = immtadd_; TypeStruct.Fmmdda = immdda_; TypeStruct.Fmmddac = immdda_; TypeStruct.Fmmddat = immddat_; TypeStruct.Fmmddact = immddat_; TypeStruct.Fcshft = NULL; TypeStruct.Frshft = NULL; TypeStruct.Fvvdotu = NULL; TypeStruct.Fvvdotc = NULL; TypeStruct.Fset = NULL; TypeStruct.Ftzpad = NULL; TypeStruct.Ftzpadcpy = NULL; TypeStruct.Ftzscal = NULL; TypeStruct.Fhescal = NULL; TypeStruct.Ftzcnjg = NULL; TypeStruct.Faxpy = NULL; TypeStruct.Fcopy = NULL; TypeStruct.Fswap = NULL; TypeStruct.Fgemv = NULL; TypeStruct.Fsymv = NULL; TypeStruct.Fhemv = NULL; TypeStruct.Ftrmv = NULL; TypeStruct.Ftrsv = NULL; TypeStruct.Fagemv = NULL; TypeStruct.Fasymv = NULL; TypeStruct.Fahemv = NULL; TypeStruct.Fatrmv = NULL; TypeStruct.Fgerc = NULL; TypeStruct.Fgeru = NULL; TypeStruct.Fsyr = NULL; TypeStruct.Fher = NULL; TypeStruct.Fsyr2 = NULL; TypeStruct.Fher2 = NULL; TypeStruct.Fgemm = NULL; TypeStruct.Fsymm = NULL; TypeStruct.Fhemm = NULL; TypeStruct.Fsyrk = NULL; TypeStruct.Fherk = NULL; TypeStruct.Fsyr2k = NULL; TypeStruct.Fher2k = NULL; TypeStruct.Ftrmm = NULL; TypeStruct.Ftrsm = NULL; return( &TypeStruct ); /* * End of PB_Citypeset */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Clastnb.c000644 000766 000024 00000003765 10363532303 020345 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Clastnb( int N, int I, int INB, int NB ) #else int PB_Clastnb( N, I, INB, NB ) /* * .. Scalar Arguments .. */ int I, INB, N, NB; #endif { /* * Purpose * ======= * * PB_Clastnb returns the global number of matrix rows or columns of the * last block, if N rows or columns are given out starting from the glo- * bal index I. Note that if N is equal 0, this routine returns 0. * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the number of rows/columns being dealt * out. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global index of the matrix entry. * I must be at least zero. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix distribution. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int lnbt; /* .. * .. Executable Statements .. * */ if( ( lnbt = I + N - INB ) > 0 ) { lnbt = lnbt - NB * ( ( NB + lnbt - 1 ) / NB - 1 ); return( MIN( lnbt, N ) ); } else { return( N ); } /* * End of PB_Clastnb */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Clcm.c000644 000766 000024 00000004406 10363532303 017626 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Clcm( int M, int N ) #else int PB_Clcm( M, N ) /* * .. Scalar Arguments .. */ int M, N; #endif { /* * Purpose * ======= * * PB_Clcm computes and returns the Least Common Multiple (LCM) of two * positive integers M and N. In fact, the routine computes the Greatest * Common Divisor (GCD) and use the property that M*N = GCD*LCM. * * Arguments * ========= * * M (input) INTEGER * On entry, M must be at least zero. * * N (input) INTEGER * On entry, N must be at least zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int gcd=1, m_val, n_val, t; /* .. * .. Executable Statements .. * */ if( M > N ) { m_val = N; n_val = M; } else { m_val = M; n_val = N; } while( m_val > 0 ) { while( !( m_val & 1 ) ) { /* * m is even */ m_val >>= 1; /* * if n is odd, gcd( m, n ) = gcd( m / 2, n ) */ if( !( n_val & 1 ) ) { /* * otherwise gcd( m, n ) = 2 * gcd( m / 2, n / 2 ) */ n_val >>= 1; gcd <<= 1; } } /* * m is odd now. If n is odd, gcd( m, n ) = gcd( m, ( m - n ) / 2 ). * Otherwise, gcd( m, n ) = gcd ( m, n / 2 ). */ n_val -= ( n_val & 1 ) ? m_val : 0; n_val >>= 1; while( n_val >= m_val ) { /* * If n is odd, gcd( m, n ) = gcd( m, ( m - n ) / 2 ). * Otherwise, gcd( m, n ) = gcd ( m, n / 2 ) */ n_val -= ( n_val & 1 ) ? m_val : 0; n_val >>= 1; } /* * n < m, gcd( m, n ) = gcd( n, m ) */ t = n_val; n_val = m_val; m_val = t; } return ( ( M * N ) / ( n_val * gcd ) ); /* * End of PB_Clcm */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cmalloc.c000644 000766 000024 00000003133 10363532303 020316 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ char * PB_Cmalloc( int LENGTH ) #else char * PB_Cmalloc( LENGTH ) /* * .. Scalar Arguments .. */ int LENGTH; #endif { /* * Purpose * ======= * * PB_Cmalloc allocates a dynamic memory buffer. In case of failure, the * program is stopped by calling Cblacs_abort. * * Arguments * ========= * * LENGTH (local input) INTEGER * On entry, LENGTH specifies the length in bytes of the buffer * to be allocated. If LENGTH is less or equal than zero, this * function returns NULL. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * bufptr = NULL; /* .. * .. Executable Statements .. * */ if( LENGTH > 0 ) { if( !( bufptr = (char *) malloc( (unsigned)LENGTH ) ) ) { (void) fprintf( stderr, "Not enough memory on line %d of file %s!!\n", __LINE__, __FILE__ ); Cblacs_abort( -1, -1 ); } } return( bufptr ); /* * End of PB_Cmalloc */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cnnxtroc.c000644 000766 000024 00000016101 10363532303 020541 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Cnnxtroc( int N, int I, int INB, int NB, int PROC, int SRCPROC, int NPROCS ) #else int PB_Cnnxtroc( N, I, INB, NB, PROC, SRCPROC, NPROCS ) /* * .. Scalar Arguments .. */ int I, INB, N, NB, NPROCS, PROC, SRCPROC; #endif { /* * Purpose * ======= * * PB_Cnnxtroc computes the number of next rows or columns of a subma- * trix that are possessed by processes closer to SRCPROC1 than PROC * where SRCPROC1 is the process owning the row or column globally in- * dexed by I. The submatrix is defined by giving out N rows or columns * starting from global index I. Therefore, if SRCPROC=0 and PROC=1, * then PB_Cnnxtroc returns the number of matrix rows or columns owned * by processes 2, 3 ... NPROCS-1. * * In fact, if the same exact parameters N, I, INB, NB, SRCPROC and * NPROCS are passed to PB_Cnpreroc, PB_Cnumroc and PB_Cnnxtroc produ- * cing respectively npre, np and nnxt, then npre + np + nnxt = N in * every process PROC. * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the number of rows/columns being dealt * out. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global index of the matrix entry. * I must be at least zero. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix distribution. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * PROC (local input) INTEGER * On entry, PROC specifies the coordinate of the process whose * local portion is determined. PROC must be at least zero and * strictly less than NPROCS. * * SRCPROC (global input) INTEGER * On entry, SRCPROC specifies the coordinate of the process * that possesses the first row or column of the matrix. When * SRCPROC = -1, the data is not distributed but replicated, * otherwise SRCPROC must be at least zero and strictly less * than NPROCS. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the matrix is distributed. NPROCS must * be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int ilocblk, mydist, nblocks; /* .. * .. Executable Statements .. * */ if( ( SRCPROC == -1 ) || ( NPROCS == 1 ) ) /* * The data is not distributed, or there is just one process in this dimension * of the grid. */ return( 0 ); /* * Compute coordinate of process owning I and corresponding INB */ if( ( INB -= I ) <= 0 ) { /* * I is not in first block, find out which process has it and update size of * first block */ nblocks = ( -INB ) / NB + 1; SRCPROC += nblocks; SRCPROC -= ( SRCPROC / NPROCS ) * NPROCS; INB += nblocks * NB; } /* * Now everything is just like N, I=0, INB, NB, SRCPROC, NPROCS. If the source * process owns the N rows or columns, nothing follows me ... */ if( N <= INB ) return( 0 ); /* * The discussion goes as follows: compute my distance from the source process * so that within this process coordinate system, the source process is the * process such that mydist = 0, or equivalently PROC == SRCPROC. * * Find out how many full blocks are globally (nblocks) and locally (ilocblk) * in those N entries. Then remark that * * when mydist < nblocks - ilocblk * NPROCS, I own ilocblk + 1 full blocks, * when mydist > nblocks - ilocblk * NPROCS, I own ilocblk full blocks, * when mydist = nblocks - ilocblk * NPROCS, either the last block is not full * and I own it, or the last block is full and I am the first process owning * only ilocblk full blocks. */ nblocks = ( N - INB ) / NB + 1; if( PROC == SRCPROC ) { /* * First note that I cannot be the source and the last process because mydist=0 * and NPROCS > 1. Since mydist = 0 and nblocks - ilocblk * NPROCS >= 0, there * are only two possible cases: * * 1) When mydist = nblocks - ilocblk * NPROCS = 0, that is NPROCS divides * the global number of full blocks, then the source process SRCPROC owns * one more block than the other processes; Thus, N can be rewritten as * N = INB + (nblocks-1) * NB + LNB with LNB >= 0 size of the last block. * Similarly, the local value Np corresponding to the local number of rows * and columns owned by the source process is INB + (ilocblk-1)*NB + LNB, * that is N + ( ilocblk-1 - (nblocks-1) )*NB. Therefore, there must be * ( nblocks - ilocblk ) * NB rows or columns following me. Note that this * case cannot happen when ilocblk is zero, since nblocks is at least one. * * 2) mydist = 0 < nblocks - ilocblk * NPROCS, the source process only owns * full blocks, and therefore locally INB + ilocblk * NB rows or columns. * Thus, N - INB - ilocblk * NB rows or columns follow me. Note that when * ilocblk is zero, this becomes simply N - INB. */ if( nblocks < NPROCS ) return( N - INB ); ilocblk = nblocks / NPROCS; return( ( ( nblocks - ilocblk * NPROCS ) ? N - INB - ilocblk * NB : ( nblocks - ilocblk ) * NB ) ); } else { /* * I am not the source process. Compute my distance from the source process. */ if( ( mydist = PROC - SRCPROC ) < 0 ) mydist += NPROCS; /* * If I am the last process i.e. mydist = NPROCS - 1, nothing follows me. */ if( mydist == NPROCS - 1 ) return( 0 ); /* * Otherwise, when mydist >= nblocks - ilocblk * NPROCS, there are exactly * NB * ilocblk * ( NPROCS - mydist ) rows or columns after me including mine, * i.e NB * ilocblk * ( NPROCS - 1 - mydist ) rows or columns following me. * Finally, when 0 < mydist < nblocks - ilocblk * NPROCS, the number of rows * or columns preceeding me is INB + ilocblk * NB + mydist*( ilocblk+1 )*NB * including mine, therefore there are N-INB-NB*( ilocblk+mydist*(ilocblk+1) ) * rows or columns following me. */ if( nblocks < NPROCS ) return( ( ( mydist < nblocks ) ? N - mydist * NB - INB : 0 ) ); ilocblk = nblocks / NPROCS; return( ( ( mydist >= ( nblocks - ilocblk * NPROCS ) ) ? ( NPROCS - 1 - mydist ) * ilocblk * NB : N - INB - ( ilocblk * mydist + ilocblk + mydist )*NB ) ); } /* * End of PB_Cnnxtroc */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cnpreroc.c000644 000766 000024 00000012161 10363532303 020520 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Cnpreroc( int N, int I, int INB, int NB, int PROC, int SRCPROC, int NPROCS ) #else int PB_Cnpreroc( N, I, INB, NB, PROC, SRCPROC, NPROCS ) /* * .. Scalar Arguments .. */ int I, INB, N, NB, NPROCS, PROC, SRCPROC; #endif { /* * Purpose * ======= * * PB_Cnpreroc computes the number of preceeding rows or columns of a * submatrix that are possessed by processes closer to SRCPROC1 than * PROC where SRCPROC1 is the process owning the row or column globally * indexed by I. The submatrix is defined by giving out N rows/columns * starting from global index I. Therefore, if SRCPROC=0 and PROC=4, * then PB_Cnpreroc returns the number of matrix rows or columns owned * by processes 0, 1, 2, and 3. * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the number of rows/columns being dealt * out. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global index of the matrix entry. * I must be at least zero. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix distribution. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * PROC (local input) INTEGER * On entry, PROC specifies the coordinate of the process whose * local portion is determined. PROC must be at least zero and * strictly less than NPROCS. * * SRCPROC (global input) INTEGER * On entry, SRCPROC specifies the coordinate of the process * that possesses the first row or column of the matrix. When * SRCPROC = -1, the data is not distributed but replicated, * otherwise SRCPROC must be at least zero and strictly less * than NPROCS. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the matrix is distributed. NPROCS must * be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int ilocblk, mydist, nblocks; /* .. * .. Executable Statements .. * */ if( ( SRCPROC == -1 ) || ( NPROCS == 1 ) ) /* * The data is not distributed, or there is just one process in this dimension * of the grid. */ return( 0 ); /* * Compute coordinate of process owning I and corresponding INB */ if( ( INB -= I ) <= 0 ) { /* * I is not in first block, find out which process has it and update size of * first block */ nblocks = ( -INB ) / NB + 1; SRCPROC += nblocks; SRCPROC -= ( SRCPROC / NPROCS ) * NPROCS; INB += nblocks * NB; } /* * Now everything is just like N, I=0, INB, NB, SRCPROC, NPROCS. If I am the * source process, nothing preceeds me ... */ if( PROC == SRCPROC ) return( 0 ); /* * If SRCPROC owns the N rows or columns, then return N since I cannot be the * source process anymore. */ if( N <= INB ) return( N ); /* * Find out how many full blocks are globally (nblocks) and locally (ilocblk) * in those N entries. */ nblocks = ( N - INB ) / NB + 1; /* * Compute my distance from the source process so that within this process * coordinate system, the source process is the process such that mydist=0. */ if( ( mydist = PROC - SRCPROC ) < 0 ) mydist += NPROCS; /* * When mydist < nblocks - ilocblk * NPROCS, I own ilocblk + 1 full blocks, * when mydist > nblocks - ilocblk * NPROCS, I own ilocblk full blocks, * when mydist = nblocks - ilocblk * NPROCS, either the last block is not full * and I own it, or the last block is full and I am the first process owning * only ilocblk full blocks. * * Therefore, when 0 < mydist <= nblocks - ilocblk * NPROCS, the number of rows * or columns preceeding me is INB + ilocblk*NB + (mydist-1)*(ilocblk+1)*NB, * i.e. INB - NB + ( ilocblk+1 ) * NB * mydist. Otherwise, there are exactly * NB * ilocblk * ( NPROCS - mydist ) rows or columns after me including mine, * i.e N + NB * ilocblk * ( mydist - NPROCS ) rows or columns preceeding me. */ if( nblocks < NPROCS ) return( ( ( mydist <= nblocks ) ? INB + NB * ( mydist - 1 ) : N ) ); ilocblk = nblocks / NPROCS; return( ( ( mydist <= ( nblocks - ilocblk * NPROCS ) ) ? INB - NB + ( ilocblk + 1 ) * NB * mydist : N + NB * ilocblk * ( mydist - NPROCS ) ) ); /* * End of PB_Cnpreroc */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cnumroc.c000644 000766 000024 00000015632 10363532303 020361 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Cnumroc( int N, int I, int INB, int NB, int PROC, int SRCPROC, int NPROCS ) #else int PB_Cnumroc( N, I, INB, NB, PROC, SRCPROC, NPROCS ) /* * .. Scalar Arguments .. */ int I, INB, N, NB, NPROCS, PROC, SRCPROC; #endif { /* * Purpose * ======= * * PB_Cnumroc returns the local number of matrix rows/columns process * PROC will get if we give out N rows/columns starting from global in- * dex I. * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the number of rows/columns being dealt * out. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global index of the matrix entry. * I must be at least zero. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * PROC (local input) INTEGER * On entry, PROC specifies the coordinate of the process whose * local portion is determined. PROC must be at least zero and * strictly less than NPROCS. * * SRCPROC (global input) INTEGER * On entry, SRCPROC specifies the coordinate of the process * that possesses the first row or column of the matrix. When * SRCPROC = -1, the data is not distributed but replicated, * otherwise SRCPROC must be at least zero and strictly less * than NPROCS. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the matrix is distributed. NPROCS must * be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int ilocblk, mydist, nblocks; /* .. * .. Executable Statements .. * */ if( ( SRCPROC == -1 ) || ( NPROCS == 1 ) ) /* * The data is not distributed, or there is just one process in this dimension * of the grid. */ return( N ); /* * Compute coordinate of process owning I and corresponding INB */ if( ( INB -= I ) <= 0 ) { /* * I is not in the first block, find out which process has it and update the * size of first block */ nblocks = (-INB) / NB + 1; SRCPROC += nblocks; SRCPROC -= ( SRCPROC / NPROCS ) * NPROCS; INB += nblocks * NB; } /* * Now everything is just like N, I=0, INB, NB, SRCPROC, NPROCS. The discussion * goes as follows: compute my distance from the source process so that within * this process coordinate system, the source process is the process such that * mydist = 0, or equivalently PROC == SRCPROC. * * Find out how many full blocks are globally (nblocks) and locally (ilocblk) * in those N entries. Then remark that * * when mydist < nblocks - ilocblk * NPROCS, I own ilocblk + 1 full blocks, * when mydist > nblocks - ilocblk * NPROCS, I own ilocblk full blocks, * when mydist = nblocks - ilocblk * NPROCS, either the last block is not full * and I own it, or the last block is full and I am the first process owning * only ilocblk full blocks. */ if( PROC == SRCPROC ) { /* * I am the source process, i.e. I own I (mydist = 0). When N <= INB, the * answer is simply N. */ if( N <= INB ) return( N ); /* * Find out how many full blocks are globally (nblocks) and locally (ilocblk) * in those N entries. */ nblocks = ( N - INB ) / NB + 1; /* * Since mydist = 0 and nblocks - ilocblk * NPROCS >= 0, there are only two * possible cases: * * 1) When mydist = nblocks - ilocblk * NPROCS = 0, that is NPROCS divides * the global number of full blocks, then the source process SRCPROC owns * one more block than the other processes; and N can be rewritten as * N = INB + (nblocks-1) * NB + LNB with LNB >= 0 size of the last block. * Similarly, the local value Np corresponding to N can be written as * Np = INB + (ilocblk-1) * NB + LNB = N + ( ilocblk-1 - (nblocks-1) )*NB. * Note that this case cannot happen when ilocblk is zero, since nblocks * is at least one. * * 2) mydist = 0 < nblocks - ilocblk * NPROCS, the source process only owns * full blocks, and therefore Np = INB + ilocblk * NB. Note that when * ilocblk is zero, Np is just INB. */ if( nblocks < NPROCS ) return( INB ); ilocblk = nblocks / NPROCS; return( ( nblocks - ilocblk * NPROCS ) ? INB + ilocblk * NB : N + ( ilocblk - nblocks ) * NB ); } else { /* * I am not the source process. When N <= INB, the answer is simply 0. */ if( N <= INB ) return( 0 ); /* * Find out how many full blocks are globally (nblocks) and locally (ilocblk) * in those N entries */ nblocks = ( N - INB ) / NB + 1; /* * Compute my distance from the source process so that within this process * coordinate system, the source process is the process such that mydist=0. */ if( ( mydist = PROC - SRCPROC ) < 0 ) mydist += NPROCS; /* * When mydist < nblocks - ilocblk * NPROCS, I own ilocblk + 1 full blocks of * size NB since I am not the source process, * * when mydist > nblocks - ilocblk * NPROCS, I own ilocblk full blocks of * size NB since I am not the source process, * * when mydist = nblocks - ilocblk * NPROCS, * either the last block is not full and I own it, in which case * N = INB + (nblocks - 1)*NB + LNB with LNB the size of the last block * such that NB > LNB > 0; the local value Np corresponding to N is given * by Np = ilocblk * NB + LNB = N - INB + ( ilocblk - nblocks + 1 ) * NB; * or the last block is full and I am the first process owning only ilocblk * full blocks of size NB, that is N = INB + ( nblocks - 1 ) * NB and * Np = ilocblk * NB = N - INB + ( ilocblk - nblocks + 1 ) * NB. */ if( nblocks < NPROCS ) return( ( mydist < nblocks ) ? NB : ( ( mydist > nblocks ) ? 0 : N - INB + NB * ( 1 - nblocks ) ) ); ilocblk = nblocks / NPROCS; mydist -= nblocks - ilocblk * NPROCS; return( ( mydist < 0 ) ? ( ilocblk + 1 ) * NB : ( ( mydist > 0 ) ? ilocblk * NB : N - INB + NB * ( ilocblk - nblocks + 1 ) ) ); } /* * End of PB_Cnumroc */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_COutV.c000644 000766 000024 00000030726 10363532303 017754 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_COutV( PBTYP_T * TYPE, char * ROWCOL, char * ZEROIT, int M, int N, int * DESCA, int K, char * * YAPTR, int * DYA, int * YAFREE, int * YASUM ) #else void PB_COutV( TYPE, ROWCOL, ZEROIT, M, N, DESCA, K, YAPTR, DYA, YAFREE, YASUM ) /* * .. Scalar Arguments .. */ char * ROWCOL, * ZEROIT; int * YAFREE, K, M, N, * YASUM; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DYA; char * * YAPTR; #endif { /* * Purpose * ======= * * PB_COutV returns a pointer to an array that contains a one-dimensio- * nal ouput zero subvector which is replicated over the rows or columns * of a submatrix described by DESCA. On return, the subvector is speci- * fied by a pointer to some data, a descriptor array describing its * layout, a logical value indicating if this local piece of data has * been dynamically allocated by this function, a logical value speci- * fying if sum reduction should occur. This routine is specifically * designed for traditional Level 2 and 3 PBLAS operations using an out- * put only vector such as PxTRMV, or PxTRMM. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * ROWCOL (global input) pointer to CHAR * On entry, ROWCOL specifies if this routine should return a * row or column subvector replicated over the underlying subma- * trix as follows: * = 'R' or 'r': A row subvector is returned, * = 'C' or 'c': A column subvector is returned. * * M (global input) INTEGER * On entry, M specifies the number of rows of the underlying * submatrix described by DESCA. M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the underlying * submatrix described by DESCA. N must be at least zero. * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * K (global input) INTEGER * On entry, K specifies the length of the non-distributed di- * mension of the subvector sub( Y ). K must be at least zero. * * YAPTR (local output) pointer to pointer to CHAR * On exit, * YAPTR is an array containing the same data as the * subvector sub( Y ) which is replicated over the rows or co- * lumns of the underlying matrix as specified by ROWCOL and * DESCA. * * DYA (global and local output) INTEGER array * On exit, DYA is a descriptor array of dimension DLEN_ descri- * bing the data layout of the data pointed to by * YAPTR. * * YAFREE (local output) INTEGER * On exit, YAFREE specifies if it was possible to reuse the * subvector sub( Y ), i.e., if some dynamic memory was alloca- * ted for the data pointed to by * YAPTR or not. When YAFREE is * zero, no dynamic memory was allocated. Otherwise, some dyna- * mic memory was allocated by this function that one MUST re- * lease as soon as possible. * * YASUM (global output) INTEGER * On exit, YASUM specifies if a global sum reduction should be * performed to obtain the correct sub( Y ). When YASUM is zero, * no reduction is to be performed, otherwise reduction should * occur. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Aimb, Ainb, Amb, Amp, Anb, Anq, Arow, Yld, ctxt, izero=0, nprow, myrow, npcol, mycol; char * zero; /* .. * .. Executable Statements .. * */ /* * Initialize the output parameters to a default value */ *YAFREE = 0; *YASUM = 0; *YAPTR = NULL; /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) || ( K <= 0 ) ) { if( Mupcase( ROWCOL[0] ) == CROW ) { PB_Cdescset( DYA, K, N, 1, DESCA[INB_], 1, DESCA[NB_], DESCA[RSRC_], DESCA[CSRC_], DESCA[CTXT_], 1 ); } else { PB_Cdescset( DYA, M, K, DESCA[IMB_], 1, DESCA[MB_], 1, DESCA[RSRC_], DESCA[CSRC_], DESCA[CTXT_], DESCA[LLD_] ); } return; } /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); Arow = DESCA[RSRC_]; Acol = DESCA[CSRC_]; if( Mupcase( ROWCOL[0] ) == CROW ) { /* * Want a row vector */ Ainb = DESCA[INB_]; Anb = DESCA[NB_]; Anq = PB_Cnumroc( N, 0, Ainb, Anb, mycol, Acol, npcol ); Yld = MAX( 1, K ); if( ( Arow < 0 ) || ( nprow == 1 ) || ( PB_Cspan( M, 0, DESCA[IMB_], DESCA[MB_], Arow, nprow ) ) ) { /* * A spans all process rows. Y should be reduced iff A is not replicated and * there is more than just one process row in the process grid. */ *YASUM = ( ( Arow >= 0 ) && ( nprow > 1 ) ); /* * Allocate the space for Y in the processes owning at least one column of A, * and initialize it to zero if requested. */ if( Anq > 0 ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; if( Mupcase( ZEROIT[0] ) == CINIT ) { zero = TYPE->zero; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Anq, &izero, zero, zero, *YAPTR, &Yld ); } } /* * Describe the newly created operand */ PB_Cdescset( DYA, K, N, K, Ainb, 1, Anb, -1, Acol, ctxt, Yld ); } else { /* * A spans only one process row. There is no need to reduce Y or even to * allocate some space for it outside this process row. */ *YASUM = 0; if( ( myrow == Arow ) && ( Anq > 0 ) ) { *YAPTR = PB_Cmalloc( K * Anq * TYPE->size ); *YAFREE = 1; if( Mupcase( ZEROIT[0] ) == CINIT ) { zero = TYPE->zero; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &K, &Anq, &izero, zero, zero, *YAPTR, &Yld ); } } /* * Describe the newly created operand */ PB_Cdescset( DYA, K, N, K, Ainb, 1, Anb, Arow, Acol, ctxt, Yld ); } } else { /* * Want a column vector */ Aimb = DESCA[ IMB_ ]; Amb = DESCA[ MB_ ]; Amp = PB_Cnumroc( M, 0, Aimb, Amb, myrow, Arow, nprow ); Yld = MAX( 1, Amp ); if( ( Acol < 0 ) || ( npcol == 1 ) || ( PB_Cspan( N, 0, DESCA[INB_], DESCA[NB_], Acol, npcol ) ) ) { /* * A spans all process columns. Y should be reduced iff A is not replicated and * there is more than just one process column in the process grid. */ *YASUM = ( ( Acol >= 0 ) && ( npcol > 1 ) ); /* * Allocate the space for Y in the processes owning at least one row of A, and * initialize it to zero if requested. */ if( Amp > 0 ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; if( Mupcase( ZEROIT[0] ) == CINIT ) { zero = TYPE->zero; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &K, &izero, zero, zero, *YAPTR, &Yld ); } } /* * Describe the newly created operand */ PB_Cdescset( DYA, M, K, Aimb, K, Amb, 1, Arow, -1, ctxt, Yld ); } else { /* * A spans only one process column. There is no need to reduce Y or even to * allocate some space for it outside this process column. */ *YASUM = 0; if( ( mycol == Acol ) && ( Amp > 0 ) ) { *YAPTR = PB_Cmalloc( Amp * K * TYPE->size ); *YAFREE = 1; if( Mupcase( ZEROIT[0] ) == CINIT ) { zero = TYPE->zero; TYPE->Ftzpad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &K, &izero, zero, zero, *YAPTR, &Yld ); } } /* * Describe the newly created operand */ PB_Cdescset( DYA, M, K, Aimb, K, Amb, 1, Arow, Acol, ctxt, Yld ); } } /* * End of PB_COutV */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cpaxpby.c000644 000766 000024 00000071362 10363532303 020363 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cpaxpby( PBTYP_T * TYPE, char * CONJUG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * AROC, char * BETA, char * B, int IB, int JB, int * DESCB, char * BROC ) #else void PB_Cpaxpby( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B, IB, JB, DESCB, BROC ) /* * .. Scalar Arguments .. */ char * AROC, * BROC, * CONJUG; int IA, IB, JA, JB, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * B; #endif { /* * Purpose * ======= * * PB_Cpaxpby adds one submatrix to another, * * sub( B ) := beta * sub( B ) + alpha * sub( A ), or, * * sub( B ) := beta * sub( B ) + alpha * conjg( sub( A ) ), * * where both submatrices are distributed along one dimension; sub( A ) * always denotes A(IA:IA+M-1,JA:JA+N-1). When AROC is 'R' or 'r' * sub( A ) is distributed along a process row, otherwise sub( A ) * is distributed along a process column. When sub( A ) is distributed * along a process row and BROC is 'R' or 'r' or sub( A ) is distributed * along a process column and BROC is 'C' or 'c', then sub( B ) denotes * B(IB:IB+M-1,JB:JB+N-1), and B(IB:IB+N-1,JB:JB+M-1) otherwise. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether conjg( sub( A ) ) or * sub( A ) should be added to sub( B ) as follows: * CONJUG = 'N' or 'n': * sub( B ) := beta*sub( B ) + alpha*sub( A ), * otherwise * sub( B ) := beta*sub( B ) + alpha*conjg( sub( A ) ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A cor- * responding to the entries of the submatrix sub( A ) need not * be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where LLD_A * is at least MAX( 1, Lr( 1, IA+M-1 ) ), and, Ka is at least * Lc( 1, JA+N-1 ). Before entry, this array contains the local * entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * AROC (global input) pointer to CHAR * On entry, AROC specifies the orientation of the subvector * sub( A ). When AROC is 'R' or 'r', sub( A ) is a row vector, * and a column vector otherwise. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B corres- * ponding to the entries of the submatrix sub( B ) need not be * set on input. * * B (local input/local output) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where LLD_B * is at least MAX( 1, Lr( 1, IB+M-1 ) ) when sub( A ) and * sub( B ) are both distributed along a process column or a * process row. In that case, Kb is at least Lc( 1, JB+N-1 ). * Otherwise, LLD_B is at least MAX( 1, Lr( 1, IB+N-1 ) ) and * Kb is at least Lc( 1, JB+M-1 ). Before entry, this array * contains the local entries of the matrix B. On exit, sub( B ) * is overwritten with the updated submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BROC (global input) pointer to CHAR * On entry, BROC specifies the orientation of the subvector * sub( B ). When BROC is 'R' or 'r', sub( B ) is a row vector, * and a column vector otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char ascope, bscope, * buf = NULL, * one, * top, tran, * zero; int Acol, Aii, AinbD, Ainb1D, AisD, AisR, AisRow, AiD, Ajj, Ald, AmyprocD, AmyprocR, AnbD, AnD, AnR, AnpD, AnprocsD, AnprocsR, AprocD, AprocR, Aroc, Arow, Bcol, Bii, BinbD, Binb1D, BisD, BisR, BisRow, BiD, Bjj, Bld, BmyprocD, BmyprocR, BnbD, BnD, BnR, BnpD, BnprocsD, BnprocsR, BprocD, BprocR, Broc, Brow, BsrcD, OneBlock, OneDgrid, RRorCC, Square, cdst, csrc, ctxt, dst, gcdPQ, k, l, lcmPQ, lcmb, ma, mb, mycol, myrow, na, nb, npcol, npq, nprow, p, q, rdst, rsrc, size, src; PB_VM_T VM; MMADD_T add; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol ); /* * Determine if sub( A ) is distributed or not */ if( ( AisRow = ( Mupcase( AROC[0] ) == CROW ) ) != 0 ) AisD = ( ( DESCA[CSRC_] >= 0 ) && ( ( AnprocsD = npcol ) > 1 ) ); else AisD = ( ( DESCA[RSRC_] >= 0 ) && ( ( AnprocsD = nprow ) > 1 ) ); /* * Determine if sub( B ) is distributed or not */ if( ( BisRow = ( Mupcase( BROC[0] ) == CROW ) ) != 0 ) BisD = ( ( DESCB[CSRC_] >= 0 ) && ( ( BnprocsD = npcol ) > 1 ) ); else BisD = ( ( DESCB[RSRC_] >= 0 ) && ( ( BnprocsD = nprow ) > 1 ) ); /* * AisD && BisD <=> both operands are indeed distributed */ if( AisD && BisD ) { /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); if( AisRow ) { AinbD = DESCA[INB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_]; AiD = JA; AnD = N; AnR = M; AprocD = Acol; AmyprocD = mycol; AprocR = Arow; AmyprocR = myrow; AnprocsR = nprow; AisR = ( ( DESCA[ RSRC_ ] == -1 ) || ( AnprocsR == 1 ) ); } else { AinbD = DESCA[IMB_]; AnbD = DESCA[MB_]; Ald = DESCA[LLD_]; AiD = IA; AnD = M; AnR = N; AprocD = Arow; AmyprocD = myrow; AprocR = Acol; AmyprocR = mycol; AnprocsR = npcol; AisR = ( ( DESCA[ CSRC_ ] == -1 ) || ( AnprocsR == 1 ) ); } Ainb1D = PB_Cfirstnb( AnD, AiD, AinbD, AnbD ); /* * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol ... */ PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); if( BisRow ) { BinbD = DESCB[ INB_ ]; BnbD = DESCB[ NB_ ]; BsrcD = DESCB[ CSRC_ ]; Bld = DESCB[ LLD_ ]; BiD = JB; if( AisRow ) { BnD = N; BnR = M; } else { BnD = M; BnR = N; } BprocD = Bcol; BmyprocD = mycol; BprocR = Brow; BmyprocR = myrow; BnprocsR = nprow; BisR = ( ( DESCB[ RSRC_ ] == -1 ) || ( BnprocsR == 1 ) ); } else { BinbD = DESCB[ IMB_ ]; BnbD = DESCB[ MB_ ]; BsrcD = DESCB[ RSRC_ ]; Bld = DESCB[ LLD_ ]; BiD = IB; if( AisRow ) { BnD = N; BnR = M; } else { BnD = M; BnR = N; } BprocD = Brow; BmyprocD = myrow; BprocR = Bcol; BmyprocR = mycol; BnprocsR = npcol; BisR = ( ( DESCB[ CSRC_ ] == -1 ) || ( BnprocsR == 1 ) ); } Binb1D = PB_Cfirstnb( BnD, BiD, BinbD, BnbD ); /* * Are sub( A ) and sub( B ) both row or column vectors ? */ RRorCC = ( ( AisRow && BisRow ) || ( !( AisRow ) && !( BisRow ) ) ); /* * Do sub( A ) and sub( B ) span more than one process ? */ OneDgrid = ( ( AnprocsD == 1 ) && ( BnprocsD == 1 ) ); OneBlock = ( ( Ainb1D >= AnD ) && ( Binb1D >= BnD ) ); /* * Are sub( A ) and sub( B ) distributed in the same manner ? */ Square = ( ( Ainb1D == Binb1D ) && ( AnbD == BnbD ) && ( AnprocsD == BnprocsD ) ); if( !( AisR ) ) { /* * sub( A ) is distributed but not replicated */ if( BisR ) { /* * If sub( A ) is not replicated, but sub( B ) is, a process row or column * BprocR need to be selected. It will contain the non-replicated vector to * add sub( A ) to. */ if( RRorCC ) { /* * sub( A ) and sub( B ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( AprocD == BprocD ) ) { /* * sub( A ) and sub( B ) start in the same process row or column AprocD=BprocD. * Enforce a purely local operation by choosing BprocR to be equal to AprocR. */ BprocR = AprocR; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for BprocR to maximize the number of links, i.e reduce contention. */ BprocR = MModAdd1( AprocR, AnprocsR ); } } else { /* * sub( A ) and sub( B ) are distributed in orthogonal directions, what is * chosen for YprocR does not really matter. Select the process origin. */ BprocR = AprocD; } } else { /* * Neither sub( A ) nor sub( B ) are replicated. If I am not in process row or * column AprocR and not in process row or column BprocR, then quick return. */ if( ( AmyprocR != AprocR ) && ( BmyprocR != BprocR ) ) return; } } else { /* * sub( A ) is distributed and replicated (so no quick return possible) */ if( BisR ) { /* * sub( B ) is distributed and replicated as well */ if( RRorCC ) { /* * sub( A ) and sub( B ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( AprocD == BprocD ) ) { /* * sub( A ) and sub( B ) start in the same process row or column AprocD=BprocD. * Enforce a purely local operation by choosing AprocR and BprocR to be equal * to zero. */ AprocR = BprocR = 0; } else { /* * Otherwise, communication has to occur, so select BprocR to be zero and the * next process row or column for AprocR in order to maximize the number of * used links, i.e reduce contention. */ BprocR = 0; AprocR = MModAdd1( BprocR, BnprocsR ); } } else { /* * sub( A ) and sub( B ) are distributed in orthogonal directions, select the * origin processes. */ AprocR = BprocD; BprocR = AprocD; } } else { /* * sub( B ) is distributed, but not replicated */ if( RRorCC ) { /* * sub( A ) and sub( B ) are both row or column vectors */ if( ( OneDgrid || OneBlock || Square ) && ( AprocD == BprocD ) ) { /* * sub( A ) and sub( B ) start in the same process row or column AprocD=BprocD. * Enforce a purely local operation by choosing AprocR to be equal to BprocR. */ AprocR = BprocR; if( ( AmyprocR != AprocR ) && ( BmyprocR != BprocR ) ) return; } else { /* * Otherwise, communication has to occur, so choose the next process row or * column for AprocR to maximize the number of links, i.e reduce contention. */ AprocR = MModAdd1( BprocR, BnprocsR ); } } else { /* * sub( A ) and sub( B ) are distributed in orthogonal directions, what is * chosen for AprocR does not really matter. Select the origin process. */ AprocR = BprocD; if( ( OneDgrid || OneBlock || Square ) && ( AmyprocR != AprocR ) && ( BmyprocR != BprocR ) ) return; } } } /* * Even if sub( A ) and/or sub( B ) are replicated, only two process row or * column are active, namely AprocR and BprocR. If any of those operands is * replicated, broadcast will occur (unless there is an easy way out). */ size = TYPE->size; /* * A purely local operation occurs iff the operands start in the same process * and, if either the grid is mono-dimensional or there is a single local block * to be added or if both operands are aligned. */ if( ( ( RRorCC && ( AprocD == BprocD ) && ( AisR || BisR || ( AprocR == BprocR ) ) ) || ( !( RRorCC ) && ( BisR || ( AprocD == BprocR ) ) && ( AisR || ( AprocR == BprocD ) ) ) ) && ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) { if( ( !AisR && ( AmyprocR == AprocR ) ) || ( AisR && ( BisR || BmyprocR == BprocR ) ) ) { AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD, AnprocsD ); BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, BmyprocD, BprocD, BnprocsD ); if( ( AnpD > 0 ) && ( BnpD > 0 ) ) { /* * Select the local add routine accordingly to RRorCC */ if( RRorCC ) { if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd; else add = TYPE->Fmmadd; } else { if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd; else add = TYPE->Fmmtadd; } /* * Local addition */ if( AisRow ) add( &AnR, &AnpD, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else add( &AnpD, &AnR, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } } if( RRorCC && AisR && BisR ) return; } else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) { /* * Otherwise, it may be possible to add the distributed vectors in a single * message exchange iff the grid is mono-dimensional and the operands are * distributed in the same direction, or there is just one block to be exchanged * or if both operands are similarly distributed in their respective direction. */ if( RRorCC ) { if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd; else add = TYPE->Fmmadd; } else { if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd; else add = TYPE->Fmmtadd; } if( ( AisR && BisR ) || ( AmyprocR == AprocR ) ) { AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD, AnprocsD ); if( AnpD > 0 ) { dst = BprocD + MModSub( AmyprocD, AprocD, AnprocsD ); dst = MPosMod( dst, BnprocsD ); if( AisRow ) { ma = AnR; na = AnpD; } else { ma = AnpD; na = AnR; } if( !( AisR && BisR ) ) { if( BisRow ) { rdst = BprocR; cdst = dst; } else { rdst = dst; cdst = BprocR; } } else { if( BisRow ) { if( !AisRow ) { rdst = AmyprocR; } else { rdst = MModAdd1( BmyprocR, BnprocsR ); } cdst = dst; } else { rdst = dst; if( AisRow ) { cdst = AmyprocR; } else { cdst = MModAdd1( BmyprocR, BnprocsR ); } } } if( ( myrow == rdst ) && ( mycol == cdst ) ) { add( &ma, &na, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } else { TYPE->Cgesd2d( ctxt, ma, na, Mptr( A, Aii, Ajj, Ald, size ), Ald, rdst, cdst ); } } } if( ( AisR && BisR ) || ( BmyprocR == BprocR ) ) { BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, BmyprocD, BprocD, BnprocsD ); if( BnpD > 0 ) { src = AprocD + MModSub( BmyprocD, BprocD, BnprocsD ); src = MPosMod( src, AnprocsD ); if( AisRow ) { ma = BnR; na = BnpD; } else { ma = BnpD; na = BnR; } if( !( AisR && BisR ) ) { if( AisRow ) { rsrc = AprocR; csrc = src; } else { rsrc = src; csrc = AprocR; } } else { if( AisRow ) { if( !BisRow ) { rsrc = BmyprocR; } else { rsrc = MModSub1( AmyprocR, AnprocsR ); } csrc = src; } else { rsrc = src; if( BisRow ) { csrc = BmyprocR; } else { csrc = MModSub1( AmyprocR, AnprocsR ); } } } if( ( myrow != rsrc ) || ( mycol != csrc ) ) { buf = PB_Cmalloc( BnpD * BnR * size ); TYPE->Cgerv2d( ctxt, ma, na, buf, ma, rsrc, csrc ); add( &ma, &na, ALPHA, buf, &ma, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); if( buf ) free( buf ); } } } if( AisR && BisR ) return; } else { /* * General case */ if( RRorCC ) { if( Mupcase( CONJUG[0] ) != CNOCONJG ) tran = CCONJG; else tran = CNOTRAN; } else { if( Mupcase( CONJUG[0] ) != CNOCONJG ) tran = CCOTRAN; else tran = CTRAN; } if( AisRow ) { ascope = CCOLUMN; ma = AnR; } else { ascope = CROW; na = AnR; } bscope = ( BisRow ? CCOLUMN : CROW ); lcmb = PB_Clcm( AnprocsD * AnbD, BnprocsD * BnbD ); one = TYPE->one; zero = TYPE->zero; gcdPQ = PB_Cgcd( AnprocsD, BnprocsD ); lcmPQ = ( AnprocsD / gcdPQ ) * BnprocsD; for( k = 0; k < gcdPQ; k++ ) { p = 0; q = k; for( l = 0; l < lcmPQ; l++ ) { Aroc = MModAdd( AprocD, p, AnprocsD ); Broc = MModAdd( BprocD, q, BnprocsD ); if( ( AmyprocD == Aroc ) || ( BmyprocD == Broc ) ) { AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, Aroc, AprocD, AnprocsD ); BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, Broc, BprocD, BnprocsD ); PB_CVMinit( &VM, 0, AnpD, BnpD, Ainb1D, Binb1D, AnbD, BnbD, p, q, AnprocsD, BnprocsD, lcmb ); if( npq = PB_CVMnpq( &VM ) ) { if( ( RRorCC && ( Aroc == Broc ) && ( AisR || ( AprocR == BprocR ) ) ) || ( !( RRorCC ) && ( Aroc == BprocR ) && ( AisR || ( AprocR == Broc ) ) ) ) { if( ( BmyprocD == Broc ) && ( BmyprocR == BprocR ) ) { PB_CVMloc( TYPE, &VM, ROW, &ascope, PACKING, &tran, npq, AnR, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), Bld ); } } else { if( ( AmyprocR == AprocR ) && ( AmyprocD == Aroc ) ) { if( AisRow ) { na = npq; } else { ma = npq; } buf = PB_Cmalloc( ma * na * size ); PB_CVMpack( TYPE, &VM, ROW, &ascope, PACKING, NOTRAN, npq, AnR, one, Mptr( A, Aii, Ajj, Ald, size ), Ald, zero, buf, ma ); if( BisRow ) { rdst = BprocR; cdst = Broc; } else { rdst = Broc; cdst = BprocR; } TYPE->Cgesd2d( ctxt, ma, na, buf, ma, rdst, cdst ); if( buf ) free ( buf ); } if( ( BmyprocR == BprocR ) && ( BmyprocD == Broc ) ) { if( AisRow ) { na = npq; rsrc = AprocR; csrc = Aroc; } else { ma = npq; rsrc = Aroc; csrc = AprocR; } buf = PB_Cmalloc( ma * na * size ); TYPE->Cgerv2d( ctxt, ma, na, buf, ma, rsrc, csrc ); PB_CVMpack( TYPE, &VM, COLUMN, &bscope, UNPACKING, &tran, npq, AnR, BETA, Mptr( B, Bii, Bjj, Bld, size ), Bld, ALPHA, buf, ma ); if( buf ) free ( buf ); } } } } p = MModAdd1( p, AnprocsD ); q = MModAdd1( q, BnprocsD ); } if( AisR ) AprocR = MModAdd1( AprocR, AnprocsR ); } } if( BisR ) { /* * Replicate sub( B ) */ BnpD = PB_Cnumroc( BnD, BiD, BinbD, BnbD, BmyprocD, BsrcD, BnprocsD ); if( BnpD > 0 ) { if( BisRow ) { bscope = CCOLUMN; mb = BnR; nb = BnpD; rsrc = BprocR; csrc = BmyprocD; } else { bscope = CROW; mb = BnpD; nb = BnR; rsrc = BmyprocD; csrc = BprocR; } top = PB_Ctop( &ctxt, BCAST, &bscope, TOP_GET ); if( BmyprocR == BprocR ) { TYPE->Cgebs2d( ctxt, &bscope, top, mb, nb, Mptr( B, Bii, Bjj, Bld, size ), Bld ); } else { TYPE->Cgebr2d( ctxt, &bscope, top, mb, nb, Mptr( B, Bii, Bjj, Bld, size ), Bld, rsrc, csrc ); } } } } else if( !( AisD ) && BisD ) { /* * sub( A ) is not distributed and sub( B ) is distributed. */ PB_CpaxpbyND( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B, IB, JB, DESCB, BROC ); } else if( AisD && !( BisD ) ) { /* * sub( A ) is distributed and sub( B ) is not distributed. */ PB_CpaxpbyDN( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B, IB, JB, DESCB, BROC ); } else { /* * Neither sub( A ) nor sub( B ) are distributed. */ PB_CpaxpbyNN( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B, IB, JB, DESCB, BROC ); } /* * End of PB_Cpaxpby */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpaxpbyDN.c000644 000766 000024 00000111465 10363532303 020604 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpaxpbyDN( PBTYP_T * TYPE, char * CONJUG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * AROC, char * BETA, char * B, int IB, int JB, int * DESCB, char * BROC ) #else void PB_CpaxpbyDN( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B, IB, JB, DESCB, BROC ) /* * .. Scalar Arguments .. */ char * AROC, * BROC, * CONJUG; int IA, IB, JA, JB, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * B; #endif { /* * Purpose * ======= * * PB_CpaxpbyDN adds one submatrix to another, * * sub( B ) := beta * sub( B ) + alpha * sub( A ), or, * * sub( B ) := beta * sub( B ) + alpha * conjg( sub( A ) ), * * where sub( A ) is distributed and sub( B ) is not distributed. * * sub( A ) always denotes A(IA:IA+M-1,JA:JA+N-1). When AROC is 'R' or * 'r' sub( A ) resides in a process row, otherwise sub( A ) resides in * a process column. When sub( A ) resides in a process row and BROC is * 'R' or 'r' or sub( A ) resides in a process column and BROC is 'C' or * 'c', then sub( B ) denotes B( IB:IB+M-1, JB:JB+N-1 ), and otherwise * sub( B ) denotes B(IB:IB+N-1,JB:JB+M-1). * otherwise. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether conjg( sub( A ) ) or * sub( A ) should be added to sub( B ) as follows: * CONJUG = 'N' or 'n': * sub( B ) := beta*sub( B ) + alpha*sub( A ), * otherwise * sub( B ) := beta*sub( B ) + alpha*conjg( sub( A ) ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A cor- * responding to the entries of the submatrix sub( A ) need not * be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where LLD_A * is at least MAX( 1, Lr( 1, IA+M-1 ) ), and, Ka is at least * Lc( 1, JA+N-1 ). Before entry, this array contains the local * entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * AROC (global input) pointer to CHAR * On entry, AROC specifies the orientation of the subvector * sub( A ). When AROC is 'R' or 'r', sub( A ) is a row vector, * and a column vector otherwise. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B corres- * ponding to the entries of the submatrix sub( B ) need not be * set on input. * * B (local input/local output) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where LLD_B * is at least MAX( 1, Lr( 1, IB+M-1 ) ) when sub( A ) and * sub( B ) are both distributed along a process column or a * process row. In that case, Kb is at least Lc( 1, JB+N-1 ). * Otherwise, LLD_B is at least MAX( 1, Lr( 1, IB+N-1 ) ) and * Kb is at least Lc( 1, JB+M-1 ). Before entry, this array * contains the local entries of the matrix B. On exit, sub( B ) * is overwritten with the updated submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BROC (global input) pointer to CHAR * On entry, BROC specifies the orientation of the subvector * sub( B ). When BROC is 'R' or 'r', sub( B ) is a row vector, * and a column vector otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char scope, * top, * zero; int Acol, Aii, Ainb1D, AisR, AisRow, Ajj, Ald, AmyprocD, AmyprocR, AnD, AnbD, AnpD, AnprocsD, AprocD, AprocR, Aroc, Arow, Bcol, Bii, BisR, BisRow, Bjj, Bld, Bm, BmyprocD, BmyprocR, Bn, BnprocsD, BprocR, Broc, Brow, RRorCC, ctxt, izero=0, k, kbb, kk, kn, ktmp, mycol, mydist, myproc, myrow, npcol, nprow, p, size; MMADD_T add; TZPAD_T pad; /* * .. Local Arrays .. */ char * buf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); if( ( AisRow = ( Mupcase( AROC[0] ) == CROW ) ) != 0 ) { AnD = N; AnbD = DESCA[NB_]; Ald = DESCA[LLD_]; AprocD = Acol; AprocR = Arow; AmyprocD = mycol; AmyprocR = myrow; AnprocsD = npcol; AisR = ( ( Arow == -1 ) || ( nprow == 1 ) ); Ainb1D = PB_Cfirstnb( AnD, JA, DESCA[INB_], AnbD ); } else { AnD = M; AnbD = DESCA[MB_]; Ald = DESCA[LLD_]; AprocD = Arow; AprocR = Acol; AmyprocD = myrow; AmyprocR = mycol; AnprocsD = nprow; AisR = ( ( Acol == -1 ) || ( npcol == 1 ) ); Ainb1D = PB_Cfirstnb( AnD, IA, DESCA[IMB_], AnbD ); } /* * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol ... */ PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); if( ( BisRow = ( Mupcase( BROC[0] ) == CROW ) ) != 0 ) { Bld = DESCB[LLD_]; BmyprocD = mycol; BnprocsD = npcol; BprocR = Brow; BmyprocR = myrow; BisR = ( ( BprocR == -1 ) || ( nprow == 1 ) ); } else { Bld = DESCB[LLD_]; BmyprocD = myrow; BnprocsD = nprow; BprocR = Bcol; BmyprocR = mycol; BisR = ( ( BprocR == -1 ) || ( npcol == 1 ) ); } /* * Are sub( A ) and sub( B ) both row or column vectors ? */ RRorCC = ( ( AisRow && BisRow ) || ( !( AisRow ) && !( BisRow ) ) ); /* * Select the local add routine accordingly */ size = TYPE->size; /* * sub( A ) is distributed and sub( B ) is not distributed */ if( !( BisR ) ) { /* * sub( B ) is not replicated. Since this operation is local if sub( B ) and * sub( A ) are both row or column vectors, choose AprocR = BprocR when RRorCC, * and AprocR = 0 otherwise. */ if( AisR ) { AprocR = ( ( RRorCC ) ? BprocR : 0 ); } /* * Now, it is just like sub( A ) is not replicated, this information however is * kept in AisR for later use. */ if( ( AmyprocR == AprocR ) || ( BmyprocR == BprocR ) ) { if( RRorCC ) { /* * sub( A ) and sub( B ) are both row or column vectors */ zero = TYPE->zero; if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd; else add = TYPE->Fmmadd; pad = TYPE->Ftzpad; AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD, AnprocsD ); /* * sub( A ) and sub( B ) are in the same process row or column */ if( AprocR == BprocR ) { /* * In each process, the distributed part of sub( A ) is added to sub( B ). In * the other processes, this replicated of sub( B ) is set to zero for later * reduction. */ if( AnpD > 0 ) { Aroc = AprocD; if( BisRow ) { kk = Ajj; ktmp = JB + N; kn = JB + Ainb1D; } else { kk = Aii; ktmp = IB + M; kn = IB + Ainb1D; } if( AmyprocD == Aroc ) { if( BisRow ) add( &M, &Ainb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else add( &Ainb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); kk += Ainb1D; } else { if( BisRow ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &Ainb1D, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ainb1D, &N, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } Aroc = MModAdd1( Aroc, AnprocsD ); for( k = kn; k < ktmp; k += AnbD ) { kbb = ktmp - k; kbb = MIN( kbb, AnbD ); if( AmyprocD == Aroc ) { if( BisRow ) add( &M, &kbb, ALPHA, Mptr( A, Aii, kk, Ald, size ), &Ald, BETA, Mptr( B, Bii, k, Bld, size ), &Bld ); else add( &kbb, &N, ALPHA, Mptr( A, kk, Ajj, Ald, size ), &Ald, BETA, Mptr( B, k, Bjj, Bld, size ), &Bld ); kk += kbb; } else { if( BisRow ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &kbb, &izero, zero, zero, Mptr( B, Bii, k, Bld, size ), &Bld ); else pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kbb, &N, &izero, zero, zero, Mptr( B, k, Bjj, Bld, size ), &Bld ); } Aroc = MModAdd1( Aroc, AnprocsD ); } } else { /* * If I don't own any entries of sub( A ), then zero the entire sub( B ) * residing in this process. */ pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &N, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } /* * Replicate locally scattered sub( B ) by reducing it */ scope = ( BisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); TYPE->Cgsum2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj, Bld, size ), Bld, -1, 0 ); } else { /* * sub( A ) and sub( B ) are in a different process row or column */ if( AmyprocR == AprocR ) { /* * If I own a piece of sub( A ), then send it to the corresponding process row * or column where sub( B ) resides. */ if( AnpD > 0 ) { if( AisRow ) TYPE->Cgesd2d( ctxt, M, AnpD, Mptr( A, Aii, Ajj, Ald, size ), Ald, BprocR, BmyprocD ); else TYPE->Cgesd2d( ctxt, AnpD, N, Mptr( A, Aii, Ajj, Ald, size ), Ald, BmyprocD, BprocR ); } } if( BmyprocR == BprocR ) { /* * If I own sub( B ), then receive and unpack distributed part of sub( A ) that * should be added to sub( B ). Combine the results. */ if( AnpD > 0 ) { if( BisRow ) { ktmp = JB + N; kn = JB + Ainb1D; buf = PB_Cmalloc( M * AnpD * size ); TYPE->Cgerv2d( ctxt, M, AnpD, buf, M, AprocR, AmyprocD ); } else { ktmp = IB + M; kn = IB + Ainb1D; buf = PB_Cmalloc( AnpD * N * size ); TYPE->Cgerv2d( ctxt, AnpD, N, buf, AnpD, AmyprocD, AprocR ); } Aroc = AprocD; kk = 0; if( AmyprocD == Aroc ) { if( BisRow ) add( &M, &Ainb1D, ALPHA, buf, &M, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else add( &Ainb1D, &N, ALPHA, buf, &AnpD, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); kk += Ainb1D; } else { if( BisRow ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &Ainb1D, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ainb1D, &N, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } Aroc = MModAdd1( Aroc, AnprocsD ); for( k = kn; k < ktmp; k += AnbD ) { kbb = ktmp - k; kbb = MIN( kbb, AnbD ); if( AmyprocD == Aroc ) { if( BisRow ) add( &M, &kbb, ALPHA, Mptr( buf, 0, kk, M, size ), &M, BETA, Mptr( B, Bii, k, Bld, size ), &Bld ); else add( &kbb, &N, ALPHA, Mptr( buf, kk, 0, AnpD, size ), &AnpD, BETA, Mptr( B, k, Bjj, Bld, size ), &Bld ); kk += kbb; } else { if( BisRow ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &kbb, &izero, zero, zero, Mptr( B, Bii, k, Bld, size ), &Bld ); else pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kbb, &N, &izero, zero, zero, Mptr( B, k, Bjj, Bld, size ), &Bld ); } Aroc = MModAdd1( Aroc, AnprocsD ); } if( buf ) free( buf ); } else { /* * If I don't own any entries of sub( A ), then zero the entire sub( B ) * residing in this process. */ pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &N, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } /* * Replicate locally scattered sub( B ) by reducing it */ scope = ( BisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); TYPE->Cgsum2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj, Bld, size ), Bld, -1, 0 ); } } } else { /* * sub( A ) and sub( B ) are not both row or column vectors */ zero = TYPE->zero; if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd; else add = TYPE->Fmmtadd; pad = TYPE->Ftzpad; Broc = 0; if( BisRow ) { ktmp = JB + M; kn = JB + Ainb1D; } else { ktmp = IB + N; kn = IB + Ainb1D; } /* * Loop over the processes in which sub( A ) resides, for each process find the * next process Xroc. Exchange and add the data. */ for( p = 0; p < AnprocsD; p++ ) { mydist = MModSub( p, AprocD, AnprocsD ); myproc = MModAdd( AprocD, mydist, AnprocsD ); if( ( BprocR == p ) && ( AprocR == Broc ) ) { if( BmyprocR == p ) { /* * local add at the intersection of the process cross */ AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, p, AprocD, AnprocsD ); if( AnpD > 0 ) { Aroc = AprocD; kk = ( BisRow ? Aii : Ajj ); if( myproc == Aroc ) { if( BmyprocD == Broc ) { if( AisRow ) add( &M, &Ainb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else add( &Ainb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); kk += Ainb1D; } else { if( BisRow ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N, &Ainb1D, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ainb1D, &M, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } } Aroc = MModAdd1( Aroc, AnprocsD ); for( k = kn; k < ktmp; k += AnbD ) { kbb = ktmp - k; kbb = MIN( kbb, AnbD ); if( myproc == Aroc ) { if( BmyprocD == Broc ) { if( AisRow ) add( &M, &kbb, ALPHA, Mptr( A, Aii, kk, Ald, size ), &Ald, BETA, Mptr( B, k, Bjj, Bld, size ), &Bld ); else add( &kbb, &N, ALPHA, Mptr( A, kk, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, k, Bld, size ), &Bld ); kk += kbb; } else { if( BisRow ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N, &kbb, &izero, zero, zero, Mptr( B, Bii, k, Bld, size ), &Bld ); else pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kbb, &M, &izero, zero, zero, Mptr( B, k, Bjj, Bld, size ), &Bld ); } } Aroc = MModAdd1( Aroc, AnprocsD ); } } } } else { /* * Message exchange */ if( ( AmyprocR == AprocR ) && ( AmyprocD == p ) ) { AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, p, AprocD, AnprocsD ); if( AnpD > 0 ) { if( AisRow ) TYPE->Cgesd2d( ctxt, M, AnpD, Mptr( A, Aii, Ajj, Ald, size ), Ald, Broc, BprocR ); else TYPE->Cgesd2d( ctxt, AnpD, N, Mptr( A, Aii, Ajj, Ald, size ), Ald, BprocR, Broc ); } } if( BmyprocR == BprocR ) { AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, p, AprocD, AnprocsD ); if( AnpD > 0 ) { Aroc = AprocD; kk = 0; if( BmyprocD == Broc ) { if( AisRow ) { buf = PB_Cmalloc( M * AnpD * size ); TYPE->Cgerv2d( ctxt, M, AnpD, buf, M, AprocR, p ); } else { buf = PB_Cmalloc( AnpD * N * size ); TYPE->Cgerv2d( ctxt, AnpD, N, buf, AnpD, p, AprocR ); } } if( myproc == Aroc ) { if( BmyprocD == Broc ) { if( AisRow ) add( &M, &Ainb1D, ALPHA, buf, &M, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else add( &Ainb1D, &N, ALPHA, buf, &AnpD, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); kk += Ainb1D; } else { if( BisRow ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N, &Ainb1D, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ainb1D, &M, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } } Aroc = MModAdd1( Aroc, AnprocsD ); for( k = kn; k < ktmp; k += AnbD ) { kbb = ktmp - k; kbb = MIN( kbb, AnbD ); if( myproc == Aroc ) { if( BmyprocD == Broc ) { if( AisRow ) add( &M, &kbb, ALPHA, Mptr( buf, 0, kk, M, size ), &M, BETA, Mptr( B, k, Bjj, Bld, size ), &Bld ); else add( &kbb, &N, ALPHA, Mptr( buf, kk, 0, AnpD, size ), &AnpD, BETA, Mptr( B, Bii, k, Bld, size ), &Bld ); kk += kbb; } else { if( BisRow ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N, &kbb, &izero, zero, zero, Mptr( B, Bii, k, Bld, size ), &Bld ); else pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kbb, &M, &izero, zero, zero, Mptr( B, k, Bjj, Bld, size ), &Bld ); } } Aroc = MModAdd1( Aroc, AnprocsD ); } if( ( BmyprocD == Broc ) && ( buf ) ) free( buf ); } } } Broc = MModAdd1( Broc, BnprocsD ); } if( BmyprocR == BprocR ) { /* * Replicate locally scattered sub( B ) by reducing it */ scope = ( BisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); TYPE->Cgsum2d( ctxt, &scope, top, N, M, Mptr( B, Bii, Bjj, Bld, size ), Bld, -1, 0 ); } } } if( BisR ) { /* * Replicate sub( B ) */ if( BisRow ) { if( AisRow ) { Bm = M; Bn = N; } else { Bm = N; Bn = M; } top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( BmyprocR == BprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld, size ), Bld ); else TYPE->Cgebr2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld, size ), Bld, BprocR, BmyprocD ); } else { if( AisRow ) { Bm = N; Bn = M; } else { Bm = M; Bn = N; } top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( BmyprocR == BprocR ) TYPE->Cgebs2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld, size ), Bld ); else TYPE->Cgebr2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld, size ), Bld, BmyprocD, BprocR ); } } } else { /* * sub( B ) is replicated in every process. Add the data in process row or * column AprocR when sub( A ) is not replicated and in every process otherwise. */ if( AisR || ( AmyprocR == AprocR ) ) { zero = TYPE->zero; if( RRorCC ) { if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd; else add = TYPE->Fmmadd; } else { if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd; else add = TYPE->Fmmtadd; } pad = TYPE->Ftzpad; AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD, AnprocsD ); if( AnpD > 0 ) { Aroc = AprocD; kk = ( AisRow ? Ajj : Aii ); if( BisRow ) { ktmp = JB + ( RRorCC ? N : M ); kn = JB + Ainb1D; } else { ktmp = IB + ( RRorCC ? M : N ); kn = IB + Ainb1D; } if( AmyprocD == Aroc ) { if( AisRow ) add( &M, &Ainb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else add( &Ainb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); kk += Ainb1D; } else { if( RRorCC ) { if( AisRow ) { Bm = M; Bn = Ainb1D; } else { Bm = Ainb1D; Bn = N; } } else { if( AisRow ) { Bm = Ainb1D; Bn = M; } else { Bm = N; Bn = Ainb1D; } } pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Bm, &Bn, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } Aroc = MModAdd1( Aroc, AnprocsD ); for( k = kn; k < ktmp; k += AnbD ) { kbb = ktmp - k; kbb = MIN( kbb, AnbD ); if( BisRow ) { buf = Mptr( B, Bii, k, Bld, size ); } else { buf = Mptr( B, k, Bjj, Bld, size ); } if( AmyprocD == Aroc ) { if( AisRow ) add( &M, &kbb, ALPHA, Mptr( A, Aii, kk, Ald, size ), &Ald, BETA, buf, &Bld ); else add( &kbb, &N, ALPHA, Mptr( A, kk, Ajj, Ald, size ), &Ald, BETA, buf, &Bld ); kk += kbb; } else { if( RRorCC ) { if( AisRow ) { Bm = M; Bn = kbb; } else { Bm = kbb; Bn = N; } } else { if( AisRow ) { Bm = kbb; Bn = M; } else { Bm = N; Bn = kbb; } } pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Bm, &Bn, &izero, zero, zero, buf, &Bld ); } Aroc = MModAdd1( Aroc, AnprocsD ); } } else { if( RRorCC ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &N, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N, &M, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } /* * Replicate locally scattered sub( B ) by reducing it in the process scope of * sub( A ) */ scope = ( AisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); if( RRorCC ) TYPE->Cgsum2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj, Bld, size ), Bld, -1, 0 ); else TYPE->Cgsum2d( ctxt, &scope, top, N, M, Mptr( B, Bii, Bjj, Bld, size ), Bld, -1, 0 ); } if( !AisR ) { /* * If sub( A ) is not replicated, then broadcast the result to the other pro- * cesses that own a piece of sub( B ), but were not involved in the above * addition operation. */ if( RRorCC ) { Bm = M; Bn = N; } else { Bm = N; Bn = M; } if( AisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( AmyprocR == AprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld, size ), Bld ); else TYPE->Cgebr2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld, size ), Bld, AprocR, AmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( AmyprocR == AprocR ) TYPE->Cgebs2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld, size ), Bld ); else TYPE->Cgebr2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld, size ), Bld, AmyprocD, AprocR ); } } } /* * End of PB_CpaxpbyDN */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpaxpbyND.c000644 000766 000024 00000066620 10363532303 020606 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpaxpbyND( PBTYP_T * TYPE, char * CONJUG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * AROC, char * BETA, char * B, int IB, int JB, int * DESCB, char * BROC ) #else void PB_CpaxpbyND( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B, IB, JB, DESCB, BROC ) /* * .. Scalar Arguments .. */ char * AROC, * BROC, * CONJUG; int IA, IB, JA, JB, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * B; #endif { /* * Purpose * ======= * * PB_CpaxpbyND adds one submatrix to another, * * sub( B ) := beta * sub( B ) + alpha * sub( A ), or, * * sub( B ) := beta * sub( B ) + alpha * conjg( sub( A ) ), * * where sub( A ) is not distributed and sub( B ) is distributed. * * sub( A ) always denotes A(IA:IA+M-1,JA:JA+N-1). When AROC is 'R' or * 'r' sub( A ) resides in a process row, otherwise sub( A ) resides in * a process column. When sub( A ) resides in a process row and BROC is * 'R' or 'r' or sub( A ) resides in a process column and BROC is 'C' or * 'c', then sub( B ) denotes B( IB:IB+M-1, JB:JB+N-1 ), and otherwise * sub( B ) denotes B(IB:IB+N-1,JB:JB+M-1). * otherwise. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether conjg( sub( A ) ) or * sub( A ) should be added to sub( B ) as follows: * CONJUG = 'N' or 'n': * sub( B ) := beta*sub( B ) + alpha*sub( A ), * otherwise * sub( B ) := beta*sub( B ) + alpha*conjg( sub( A ) ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A cor- * responding to the entries of the submatrix sub( A ) need not * be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where LLD_A * is at least MAX( 1, Lr( 1, IA+M-1 ) ), and, Ka is at least * Lc( 1, JA+N-1 ). Before entry, this array contains the local * entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * AROC (global input) pointer to CHAR * On entry, AROC specifies the orientation of the subvector * sub( A ). When AROC is 'R' or 'r', sub( A ) is a row vector, * and a column vector otherwise. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B corres- * ponding to the entries of the submatrix sub( B ) need not be * set on input. * * B (local input/local output) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where LLD_B * is at least MAX( 1, Lr( 1, IB+M-1 ) ) when sub( A ) and * sub( B ) are both distributed along a process column or a * process row. In that case, Kb is at least Lc( 1, JB+N-1 ). * Otherwise, LLD_B is at least MAX( 1, Lr( 1, IB+N-1 ) ) and * Kb is at least Lc( 1, JB+M-1 ). Before entry, this array * contains the local entries of the matrix B. On exit, sub( B ) * is overwritten with the updated submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BROC (global input) pointer to CHAR * On entry, BROC specifies the orientation of the subvector * sub( B ). When BROC is 'R' or 'r', sub( B ) is a row vector, * and a column vector otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one, * top, * zero; int Acol, Aii, AisR, AisRow, Ajj, Ald, AmyprocD, AmyprocR, AnprocsD, AprocR, Aroc, Arow, Bcol, Bii, Binb1D, BisR, BisRow, Bjj, Bld, BmyprocD, BmyprocR, BnD, BnbD, BnpD, BnprocsD, BprocD, BprocR, Broc, Brow, RRorCC, ctxt, k, kbb, kk, kn, ktmp, mycol, mydist, myproc, myrow, npcol, nprow, p, size; MMADD_T add; /* * .. Local Arrays .. */ char * buf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); if( ( AisRow = ( Mupcase( AROC[0] ) == CROW ) ) != 0 ) { BnD = N; Ald = DESCA[LLD_]; AmyprocD = mycol; AnprocsD = npcol; AmyprocR = myrow; AprocR = Arow; AisR = ( ( Arow == -1 ) || ( nprow == 1 ) ); } else { BnD = M; Ald = DESCA[LLD_]; AmyprocD = myrow; AnprocsD = nprow; AmyprocR = mycol; AprocR = Acol; AisR = ( ( Acol == -1 ) || ( npcol == 1 ) ); } /* * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol ... */ PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); if( ( BisRow = ( Mupcase( BROC[0] ) == CROW ) ) != 0 ) { BnbD = DESCB[NB_]; Bld = DESCB[LLD_]; BprocD = Bcol; BmyprocD = mycol; BnprocsD = npcol; BprocR = Brow; BmyprocR = myrow; BisR = ( ( BprocR == -1 ) || ( nprow == 1 ) ); Binb1D = PB_Cfirstnb( BnD, JB, DESCB[INB_], BnbD ); } else { BnbD = DESCB[MB_]; Bld = DESCB[LLD_]; BprocD = Brow; BmyprocD = myrow; BnprocsD = nprow; BprocR = Bcol; BmyprocR = mycol; BisR = ( ( BprocR == -1 ) || ( npcol == 1 ) ); Binb1D = PB_Cfirstnb( BnD, IB, DESCB[IMB_], BnbD ); } /* * Are sub( A ) and sub( B ) both row or column vectors ? */ RRorCC = ( ( AisRow && BisRow ) || ( !( AisRow ) && !( BisRow ) ) ); /* * sub( A ) is not distributed and sub( B ) is distributed */ if( !( AisR ) ) { /* * sub( A ) is not replicated. Since this operation is local if sub( A ) and * sub( B ) are both row or column vectors, choose BprocR = AprocR when RRorCC, * and BprocR = 0 otherwise. */ if( BisR ) { BprocR = ( ( RRorCC ) ? AprocR : 0 ); } /* * Now, it is just like sub( B ) is not replicated, this information however is * kept in BisR for later use. */ size = TYPE->size; if( ( AmyprocR == AprocR ) || ( BmyprocR == BprocR ) ) { zero = TYPE->zero; one = TYPE->one; /* * sub( A ) and sub( B ) are both row or column vectors */ if( RRorCC ) { if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd; else add = TYPE->Fmmadd; BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, BmyprocD, BprocD, BnprocsD ); /* * sub( A ) and sub( B ) are in the same process row or column */ if( AprocR == BprocR ) { /* * In each process, the non distributed part of sub( A ) is added to sub( B ). */ if( BnpD > 0 ) { Broc = BprocD; if( AisRow ) { kk = Bjj; ktmp = JA + N; kn = JA + Binb1D; } else { kk = Bii; ktmp = IA + M; kn = IA + Binb1D; } if( BmyprocD == Broc ) { if( AisRow ) add( &M, &Binb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else add( &Binb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); kk += Binb1D; } Broc = MModAdd1( Broc, BnprocsD ); for( k = kn; k < ktmp; k += BnbD ) { kbb = ktmp - k; kbb = MIN( kbb, BnbD ); if( BmyprocD == Broc ) { if( AisRow ) add( &M, &kbb, ALPHA, Mptr( A, Aii, k, Ald, size ), &Ald, BETA, Mptr( B, Bii, kk, Bld, size ), &Bld ); else add( &kbb, &N, ALPHA, Mptr( A, k, Ajj, Ald, size ), &Ald, BETA, Mptr( B, kk, Bjj, Bld, size ), &Bld ); kk += kbb; } Broc = MModAdd1( Broc, BnprocsD ); } } } else { /* * sub( A ) and sub( B ) are in a different process row or column */ if( BmyprocR == BprocR ) { /* * If I own a piece of sub( B ), then receive the relevant piece of sub( A ) * from the corresponding process row or column where it resides. */ if( BnpD > 0 ) { if( BisRow ) { buf = PB_Cmalloc( M * BnpD * size ); TYPE->Cgerv2d( ctxt, M, BnpD, buf, M, AprocR, BmyprocD ); add( &M, &BnpD, ALPHA, buf, &M, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); if( buf ) free( buf ); } else { buf = PB_Cmalloc( BnpD * N * size ); TYPE->Cgerv2d( ctxt, BnpD, N, buf, BnpD, BmyprocD, AprocR ); add( &BnpD, &N, ALPHA, buf, &BnpD, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); if( buf ) free( buf ); } } } if( AmyprocR == AprocR ) { /* * If I own sub( A ), then pack and send the distributed part that should be * added to the distributed part of sub( B ) that resides in my row or column. */ if( BnpD > 0 ) { if( AisRow ) { ktmp = JA + N; kn = JA + Binb1D; buf = PB_Cmalloc( M * BnpD * size ); } else { ktmp = IA + M; kn = IA + Binb1D; buf = PB_Cmalloc( BnpD * N * size ); } Broc = BprocD; kk = 0; if( BmyprocD == Broc ) { if( AisRow ) add( &M, &Binb1D, one, Mptr( A, Aii, Ajj, Ald, size ), &Ald, zero, buf, &M ); else add( &Binb1D, &N, one, Mptr( A, Aii, Ajj, Ald, size ), &Ald, zero, buf, &BnpD ); kk += Binb1D; } Broc = MModAdd1( Broc, BnprocsD ); for( k = kn; k < ktmp; k += BnbD ) { kbb = ktmp - k; kbb = MIN( kbb, BnbD ); if( BmyprocD == Broc ) { if( AisRow ) add( &M, &kbb, one, Mptr( A, Aii, k, Ald, size ), &Ald, zero, Mptr( buf, 0, kk, M, size ), &M ); else add( &kbb, &N, one, Mptr( A, k, Ajj, Ald, size ), &Ald, zero, Mptr( buf, kk, 0, BnpD, size ), &BnpD ); kk += kbb; } Broc = MModAdd1( Broc, BnprocsD ); } if( AisRow ) TYPE->Cgesd2d( ctxt, M, BnpD, buf, M, BprocR, AmyprocD ); else TYPE->Cgesd2d( ctxt, BnpD, N, buf, BnpD, AmyprocD, BprocR ); if( buf ) free( buf ); } } } } else { /* * sub( A ) and sub( B ) are not both row or column vectors */ if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd; else add = TYPE->Fmmtadd; Aroc = 0; if( AisRow ) { ktmp = JA + N; kn = JA + Binb1D; } else { ktmp = IA + M; kn = IA + Binb1D; } /* * Loop over the processes in which sub( B ) resides, for each process find the * next process Xroc. Exchange and add the data. */ for( p = 0; p < BnprocsD; p++ ) { mydist = MModSub( p, BprocD, BnprocsD ); myproc = MModAdd( BprocD, mydist, BnprocsD ); if( ( AprocR == p ) && ( BprocR == Aroc ) ) { if( ( AmyprocR == p ) && ( AmyprocD == Aroc ) ) { /* * local add at the intersection of the process cross */ BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, p, BprocD, BnprocsD ); if( BnpD > 0 ) { Broc = BprocD; kk = ( AisRow ? Bii : Bjj ); if( myproc == Broc ) { if( AisRow ) add( &M, &Binb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else add( &Binb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); kk += Binb1D; } Broc = MModAdd1( Broc, BnprocsD ); for( k = kn; k < ktmp; k += BnbD ) { kbb = ktmp - k; kbb = MIN( kbb, BnbD ); if( myproc == Broc ) { if( AisRow ) add( &M, &kbb, ALPHA, Mptr( A, Aii, k, Ald, size ), &Ald, BETA, Mptr( B, kk, Bjj, Bld, size ), &Bld ); else add( &kbb, &N, ALPHA, Mptr( A, k, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, kk, Bld, size ), &Bld ); kk += kbb; } Broc = MModAdd1( Broc, BnprocsD ); } } } } else { /* * Message exchange */ if( ( BmyprocR == BprocR ) && ( BmyprocD == p ) ) { BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, p, BprocD, BnprocsD ); if( BnpD > 0 ) { if( AisRow ) { buf = PB_Cmalloc( M * BnpD * size ); TYPE->Cgerv2d( ctxt, BnpD, M, buf, BnpD, AprocR, Aroc ); TYPE->Fmmadd( &BnpD, &M, ALPHA, buf, &BnpD, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } else { buf = PB_Cmalloc( BnpD * N * size ); TYPE->Cgerv2d( ctxt, N, BnpD, buf, N, Aroc, AprocR ); TYPE->Fmmadd( &N, &BnpD, ALPHA, buf, &N, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } if( buf ) free( buf ); } } if( ( AmyprocR == AprocR ) && ( AmyprocD == Aroc ) ) { BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, p, BprocD, BnprocsD ); if( BnpD > 0 ) { if( AisRow ) buf = PB_Cmalloc( M * BnpD * size ); else buf = PB_Cmalloc( BnpD * N * size ); Broc = BprocD; kk = 0; if( myproc == Broc ) { if( AisRow ) add( &M, &Binb1D, one, Mptr( A, Aii, Ajj, Ald, size ), &Ald, zero, buf, &BnpD ); else add( &Binb1D, &N, one, Mptr( A, Aii, Ajj, Ald, size ), &Ald, zero, buf, &N ); kk += Binb1D; } Broc = MModAdd1( Broc, BnprocsD ); for( k = kn; k < ktmp; k += BnbD ) { kbb = ktmp - k; kbb = MIN( kbb, BnbD ); if( myproc == Broc ) { if( AisRow ) add( &M, &kbb, one, Mptr( A, Aii, k, Ald, size ), &Ald, zero, Mptr( buf, kk, 0, BnpD, size ), &BnpD ); else add( &kbb, &N, one, Mptr( A, k, Ajj, Ald, size ), &Ald, zero, Mptr( buf, 0, kk, N, size ), &N ); kk += kbb; } Broc = MModAdd1( Broc, BnprocsD ); } if( AisRow ) TYPE->Cgesd2d( ctxt, BnpD, M, buf, BnpD, p, BprocR ); else TYPE->Cgesd2d( ctxt, N, BnpD, buf, N, BprocR, p ); if( buf ) free( buf ); } } } Aroc = MModAdd1( Aroc, AnprocsD ); } } } if( BisR ) { /* * Replicate sub( B ) */ BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, BmyprocD, BprocD, BnprocsD ); if( BnpD > 0 ) { if( BisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( BmyprocR == BprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, ( AisRow ? M : N ), BnpD, Mptr( B, Bii, Bjj, Bld, size ), Bld ); else TYPE->Cgebr2d( ctxt, COLUMN, top, ( AisRow ? M : N ), BnpD, Mptr( B, Bii, Bjj, Bld, size ), Bld, BprocR, BmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( BmyprocR == BprocR ) TYPE->Cgebs2d( ctxt, ROW, top, BnpD, ( AisRow ? M : N ), Mptr( B, Bii, Bjj, Bld, size ), Bld ); else TYPE->Cgebr2d( ctxt, ROW, top, BnpD, ( AisRow ? M : N ), Mptr( B, Bii, Bjj, Bld, size ), Bld, BmyprocD, BprocR ); } } } } else { /* * sub( A ) is replicated in every process. Add the data in process row or * column BprocR when sub( B ) is not replicated and in every process otherwise. */ if( !( BisR ) && ( BmyprocR != BprocR ) ) return; size = TYPE->size; if( RRorCC ) { if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd; else add = TYPE->Fmmadd; } else { if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd; else add = TYPE->Fmmtadd; } Broc = BprocD; kk = ( BisRow ? Bjj : Bii ); if( AisRow ) { ktmp = JA + N; kn = JA + Binb1D; } else { ktmp = IA + M; kn = IA + Binb1D; } if( BmyprocD == Broc ) { if( AisRow ) add( &M, &Binb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else add( &Binb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); kk += Binb1D; } Broc = MModAdd1( Broc, BnprocsD ); for( k = kn; k < ktmp; k += BnbD ) { kbb = ktmp - k; kbb = MIN( kbb, BnbD ); if( BmyprocD == Broc ) { if( BisRow ) { buf = Mptr( B, Bii, kk, Bld, size ); } else { buf = Mptr( B, kk, Bjj, Bld, size ); } if( AisRow ) add( &M, &kbb, ALPHA, Mptr( A, Aii, k, Ald, size ), &Ald, BETA, buf, &Bld ); else add( &kbb, &N, ALPHA, Mptr( A, k, Ajj, Ald, size ), &Ald, BETA, buf, &Bld ); kk += kbb; } Broc = MModAdd1( Broc, BnprocsD ); } } /* * End of PB_CpaxpbyND */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpaxpbyNN.c000644 000766 000024 00000044341 10363532303 020614 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpaxpbyNN( PBTYP_T * TYPE, char * CONJUG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * AROC, char * BETA, char * B, int IB, int JB, int * DESCB, char * BROC ) #else void PB_CpaxpbyNN( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B, IB, JB, DESCB, BROC ) /* * .. Scalar Arguments .. */ char * AROC, * BROC, * CONJUG; int IA, IB, JA, JB, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * B; #endif { /* * Purpose * ======= * * PB_CpaxpbyNN adds one submatrix to another, * * sub( B ) := beta * sub( B ) + alpha * sub( A ), or, * * sub( B ) := beta * sub( B ) + alpha * conjg( sub( A ) ), * * where both submatrices are not distributed; sub( A ) always denotes * A(IA:IA+M-1,JA:JA+N-1). When AROC is 'R' or 'r' sub( A ) resides in * a process row, otherwise sub( A ) resides in a process column. When * sub( A ) resides in a process row and BROC is 'R' or 'r' or * sub( A ) resides in a process column and BROC is 'C' or 'c', then * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1), and B(IB:IB+N-1,JB:JB+M-1) * otherwise. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether conjg( sub( A ) ) or * sub( A ) should be added to sub( B ) as follows: * CONJUG = 'N' or 'n': * sub( B ) := beta*sub( B ) + alpha*sub( A ), * otherwise * sub( B ) := beta*sub( B ) + alpha*conjg( sub( A ) ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A cor- * responding to the entries of the submatrix sub( A ) need not * be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where LLD_A * is at least MAX( 1, Lr( 1, IA+M-1 ) ), and, Ka is at least * Lc( 1, JA+N-1 ). Before entry, this array contains the local * entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * AROC (global input) pointer to CHAR * On entry, AROC specifies the orientation of the subvector * sub( A ). When AROC is 'R' or 'r', sub( A ) is a row vector, * and a column vector otherwise. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is sup- * plied as zero then the local entries of the array B corres- * ponding to the entries of the submatrix sub( B ) need not be * set on input. * * B (local input/local output) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where LLD_B * is at least MAX( 1, Lr( 1, IB+M-1 ) ) when sub( A ) and * sub( B ) are both distributed along a process column or a * process row. In that case, Kb is at least Lc( 1, JB+N-1 ). * Otherwise, LLD_B is at least MAX( 1, Lr( 1, IB+N-1 ) ) and * Kb is at least Lc( 1, JB+M-1 ). Before entry, this array * contains the local entries of the matrix B. On exit, sub( B ) * is overwritten with the updated submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BROC (global input) pointer to CHAR * On entry, BROC specifies the orientation of the subvector * sub( B ). When BROC is 'R' or 'r', sub( B ) is a row vector, * and a column vector otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char scope, * top; int Acol, Aii, AisR, AisRow, Ajj, Ald, AmyprocD, AmyprocR, AnprocsD, AnprocsR, AprocR, Arow, Bcol, Bii, BisR, BisRow, Bjj, Bld, BmyprocD, BmyprocR, BnprocsD, BnprocsR, BprocR, Brow, RRorCC, csrc, ctxt, iroca, mycol, myrow, npcol, nprow, p, rsrc, size; MMADD_T add; /* * .. Local Arrays .. */ char * buf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); if( ( AisRow = ( Mupcase( AROC[0] ) == CROW ) ) != 0 ) { Ald = DESCA[LLD_]; AmyprocD = mycol; AnprocsD = npcol; AprocR = Arow; AmyprocR = myrow; AnprocsR = nprow; AisR = ( ( Arow == -1 ) || ( AnprocsR == 1 ) ); } else { Ald = DESCA[LLD_]; AmyprocD = myrow; AnprocsD = nprow; AprocR = Acol; AmyprocR = mycol; AnprocsR = npcol; AisR = ( ( Acol == -1 ) || ( AnprocsR == 1 ) ); } /* * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol ... */ PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); if( ( BisRow = ( Mupcase( BROC[0] ) == CROW ) ) != 0 ) { Bld = DESCB[LLD_]; BmyprocD = mycol; BnprocsD = npcol; BprocR = Brow; BmyprocR = myrow; BnprocsR = nprow; BisR = ( ( Brow == -1 ) || ( BnprocsR == 1 ) ); } else { Bld = DESCB[LLD_]; BmyprocD = myrow; BnprocsD = nprow; BprocR = Bcol; BmyprocR = mycol; BnprocsR = npcol; BisR = ( ( Bcol == -1 ) || ( BnprocsR == 1 ) ); } /* * Are sub( A ) and sub( B ) both row or column vectors ? */ RRorCC = ( ( AisRow && BisRow ) || ( !( AisRow ) && !( BisRow ) ) ); /* * Neither sub( A ) nor sub( B ) are distributed */ if( !AisR ) { /* * sub( A ) is not replicated */ if( !( BisR ) ) { /* * sub( B ) is not replicated */ if( ( AmyprocR != AprocR ) && ( BmyprocR != BprocR ) ) /* * If I am not in AprocR or BprocR, then return immediately */ return; size = TYPE->size; if( RRorCC ) { /* * sub( A ) and sub( B ) are both row or column vectors */ if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd; else add = TYPE->Fmmadd; if( AprocR == BprocR ) { add( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } else { /* * sub( A ) and sub( B ) are in a different process row or column */ if( AmyprocR == AprocR ) { /* * Send sub( A ) to where sub( B ) resides. */ if( AisRow ) TYPE->Cgesd2d( ctxt, M, N, Mptr( A, Aii, Ajj, Ald, size ), Ald, BprocR, AmyprocD ); else TYPE->Cgesd2d( ctxt, M, N, Mptr( A, Aii, Ajj, Ald, size ), Ald, AmyprocD, BprocR ); } /* * receive sub( A ) and add it to sub( B ) */ if( BmyprocR == BprocR ) { buf = PB_Cmalloc( M * N * size ); if( BisRow ) TYPE->Cgerv2d( ctxt, M, N, buf, M, AprocR, BmyprocD ); else TYPE->Cgerv2d( ctxt, M, N, buf, M, BmyprocD, AprocR ); add( &M, &N, ALPHA, buf, &M, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); if( buf ) free( buf ); } } } else { /* * sub( A ) and sub( B ) are not both row or column vectors */ if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd; else add = TYPE->Fmmtadd; iroca = 0; for( p = 0; p < BnprocsD; p++ ) { if( ( AprocR == p ) && ( BprocR == iroca ) ) { if( ( AmyprocR == p ) && ( AmyprocD == iroca ) ) { add( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } } else { if( ( AmyprocR == AprocR ) && ( AmyprocD == iroca ) ) { if( AisRow ) TYPE->Cgesd2d( ctxt, M, N, Mptr( A, Aii, Ajj, Ald, size ), Ald, p, BprocR ); else TYPE->Cgesd2d( ctxt, M, N, Mptr( A, Aii, Ajj, Ald, size ), Ald, BprocR, p ); } if( ( BmyprocR == BprocR ) && ( BmyprocD == p ) ) { buf = PB_Cmalloc( M * N * size ); if( AisRow ) TYPE->Cgerv2d( ctxt, M, N, buf, M, AprocR, iroca ); else TYPE->Cgerv2d( ctxt, M, N, buf, M, iroca, AprocR ); add( &M, &N, ALPHA, buf, &M, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); if( buf ) free( buf ); } } iroca = MModAdd1( iroca, AnprocsD ); } } } else { /* * sub( B ) is replicated */ size = TYPE->size; if( AmyprocR == AprocR ) { if( RRorCC ) { if( Mupcase( CONJUG[0] ) != CNOCONJG ) TYPE->Fmmcadd( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else TYPE->Fmmadd( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } else { if( Mupcase( CONJUG[0] ) != CNOCONJG ) TYPE->Fmmtcadd( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else TYPE->Fmmtadd( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } if( AisRow ) { scope = CCOLUMN; } else { scope = CROW; } top = PB_Ctop( &ctxt, BCAST, &scope, TOP_GET ); if( RRorCC ) TYPE->Cgebs2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj, Bld, size ), Bld ); else TYPE->Cgebs2d( ctxt, &scope, top, N, M, Mptr( B, Bii, Bjj, Bld, size ), Bld ); } else { if( AisRow ) { scope = CCOLUMN; rsrc = AprocR; csrc = AmyprocD; } else { scope = CROW; rsrc = AmyprocD; csrc = AprocR; } top = PB_Ctop( &ctxt, BCAST, &scope, TOP_GET ); if( RRorCC ) TYPE->Cgebr2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj, Bld, size ), Bld, rsrc, csrc ); else TYPE->Cgebr2d( ctxt, &scope, top, N, M, Mptr( B, Bii, Bjj, Bld, size ), Bld, rsrc, csrc ); } } } else { /* * sub( A ) is replicated */ if( BisR || ( BmyprocR == BprocR ) ) { /* * If I own a piece of sub( B ), then add sub( A ) to it */ size = TYPE->size; if( RRorCC ) { if( Mupcase( CONJUG[0] ) != CNOCONJG ) TYPE->Fmmcadd( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else TYPE->Fmmadd( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } else { if( Mupcase( CONJUG[0] ) != CNOCONJG ) TYPE->Fmmtcadd( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); else TYPE->Fmmtadd( &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld ); } } } /* * End of PB_CpaxpbyNN */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cpdot11.c000644 000766 000024 00000052241 10363532303 020163 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cpdot11( PBTYP_T * TYPE, int N, char * DOT, char * X, int IX, int JX, int * DESCX, int INCX, char * Y, int IY, int JY, int * DESCY, int INCY, VVDOT_T FDOT ) #else void PB_Cpdot11( TYPE, N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, FDOT ) /* * .. Scalar Arguments .. */ int INCX, INCY, IX, IY, JX, JY, N; char * DOT; PBTYP_T * TYPE; VVDOT_T FDOT; /* * .. Array Arguments .. */ int * DESCX, * DESCY; char * X, * Y; #endif { /* * Purpose * ======= * * PB_Cpdot11 forms the dot product of two subvectors, * * DOT := sub( X )**T * sub( Y ) or DOT := sub( X )**H * sub( Y ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * One subvector at least is assumed to be degenerated. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * multiplied. N must be at least zero. * * DOT (local output) pointer to CHAR * On exit, DOT specifies the dot product of the two subvectors * sub( X ) and sub( Y ) only in their scope (See below for fur- * ther details). * * X (local input) pointer to CHAR * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) pointer to CHAR * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * FDOT (local input) pointer to a function of type VVDOT * On entry, FDOT points to a subroutine that computes the local * dot product of two vectors. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Xscope, Yscope, * top; int RRorCC, Xcol, Xii, XisD, XisOne, XisR, XisRow, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XprocD, XprocR, Xrow, Ycol, Yii, YisD, YisOne, YisR, YisRow, Yjj, YmyprocD, YmyprocR, YprocD, YprocR, Yrow, cdst, ctxt, ione=1, mycol, myrow, npcol, nprow, rdst; /* * .. Local Arrays .. */ int dbuf[DLEN_]; char * buf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCX[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... */ PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( ( XisRow = ( INCX == DESCX[ M_ ] ) ) != 0 ) { Xld = DESCX[ LLD_ ]; Xlinc = Xld; XprocD = Xcol; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XisR = ( ( Xrow == -1 ) || ( nprow == 1 ) ); XisD = ( ( Xcol >= 0 ) && ( npcol > 1 ) ); } else { Xld = DESCX[ LLD_ ]; Xlinc = 1; XprocD = Xrow; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XisR = ( ( Xcol == -1 ) || ( npcol == 1 ) ); XisD = ( ( Xrow >= 0 ) && ( nprow > 1 ) ); } XisOne = ( ( N == 1 ) && ( DESCX[ M_ ] == 1 ) ); /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... */ PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( ( YisRow = ( INCY == DESCY[ M_ ] ) ) != 0 ) { YprocD = Ycol; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YisR = ( ( Yrow == -1 ) || ( nprow == 1 ) ); YisD = ( ( Ycol >= 0 ) && ( npcol > 1 ) ); } else { YprocD = Yrow; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YisR = ( ( Ycol == -1 ) || ( npcol == 1 ) ); YisD = ( ( Yrow >= 0 ) && ( nprow > 1 ) ); } YisOne = ( ( N == 1 ) && ( DESCY[ M_ ] == 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * Copy sub( Y ) in sub( X )'s scope */ PB_Cdescset( dbuf, 1, 1, 1, 1, 1, 1, Xrow, Xcol, ctxt, 1 ); buf = PB_Cmalloc( TYPE->size ); PB_Cpaxpby( TYPE, NOCONJG, 1, 1, TYPE->one, Y, IY, JY, DESCY, ( YisRow ? ROW : COLUMN ), TYPE->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : COLUMN ) ); /* * Compute the dot product in sub( X )'s scope */ if( XisR || ( XmyprocR == XprocR ) ) { if( ( XisD && ( XmyprocD == XprocD ) ) || ( !XisD ) ) FDOT( &ione, DOT, Mptr( X, Xii, Xjj, Xld, TYPE->size ), &Xlinc, buf, &ione ); if( XisD && !XisOne ) { Xscope = ( XisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &Xscope, TOP_GET ); TYPE->Cgsum2d( ctxt, &Xscope, top, 1, 1, DOT, 1, -1, 0 ); } } if( buf ) free( buf ); /* * sub( X ) or sub( Y ) is a degenerated vector */ if( XisD && XisOne ) { /* * Since XisOne, sub( X ) must be a row vector */ if( XisR ) { /* * sub( X ) resides in one process column ( *, XprocD ) */ if( RRorCC ) { /* * sub( Y ) is a row vector as well */ if( YisR || YmyprocR == YprocR ) { /* * I am a process row owning sub( Y ) */ if( YisD && YisOne ) { /* * sub( Y ) resides in a process column ( *, YprocD ) */ if( XprocD != YprocD ) { if( XmyprocD == XprocD ) TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, XmyprocR, YprocD ); else if( YmyprocD == YprocD ) TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XmyprocR, XprocD ); } } else { /* * Every process in those rows needs the answer */ top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocD == XprocD ) TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocR, XprocD ); } } } else { /* * sub( Y ) is a column vector */ if( YisR ) { /* * sub( Y ) resides in every process column */ top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocD == XprocD ) TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocR, XprocD ); } else if( XprocD != YprocR ) { /* * sub( Y ) resides in process column YprocR */ if( XmyprocD == XprocD ) TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, XmyprocR, YprocR ); if( YmyprocR == YprocR ) TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XmyprocR, XprocD ); } } } else { /* * sub( X ) resides in one process ( XprocR, XprocD ) */ if( YisD && YisOne ) { /* * sub( Y ) resides in one process ( YprocR, YprocD ) if it is not replicated, * and in one process column ( *, YprocD ) otherwise */ if( ( XprocD != YprocD ) || ( !YisR && ( XprocR != YprocR ) ) ) { /* * ( XprocR, XprocD ) sends DOT to ( YprocR, YprocD ) if sub( Y ) is not repli- * cated, and to ( XprocR, YprocD ) otherwise */ rdst = ( YisR ? XprocR : YprocR ); if( ( XmyprocR == XprocR ) && ( XmyprocD == XprocD ) ) TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, rdst, YprocD ); if( ( YmyprocR == rdst ) && ( YmyprocD == YprocD ) ) TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XprocR, XprocD ); } if( YisR && ( YmyprocD == YprocD ) ) { /* * Broadcast DOT within process column owning sub( Y ) */ top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR, YprocD ); } } else if( !YisR ) { /* * sub( Y ) resides in one process row or column */ if( YisRow ) { Yscope = CROW; rdst = YprocR; cdst = XprocD; } else { Yscope = CCOLUMN; rdst = XprocR; cdst = YprocR; } /* * ( XprocR, XprocD ) sends DOT to ( YprocR, XprocD ) if sub( Y ) is a row * vector and to ( XprocR, YprocR ) otherwise. If RRorCC, then YisRow and the * send occurs iff XprocR != YprocR; Otherwise !YisRow, and the send occurs * iff XprocD is not YprocR. */ if( ( RRorCC && ( XprocR != YprocR ) ) || ( !( RRorCC ) && ( XprocD != YprocR ) ) ) { if( ( XmyprocR == XprocR ) && ( XmyprocD == XprocD ) ) TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, rdst, cdst ); if( ( myrow == rdst ) && ( mycol == cdst ) ) TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XprocR, XprocD ); } /* * Broadcast the result in sub( Y )'s scope */ if( ( myrow == rdst ) && ( mycol == cdst ) ) { top = PB_Ctop( &ctxt, BCAST, &Yscope, TOP_GET ); TYPE->Cgebs2d( ctxt, &Yscope, top, 1, 1, DOT, 1 ); } else if( ( YisRow && ( myrow == rdst ) ) || ( !( YisRow ) && ( mycol == cdst ) ) ) { top = PB_Ctop( &ctxt, BCAST, &Yscope, TOP_GET ); TYPE->Cgebr2d( ctxt, &Yscope, top, 1, 1, DOT, 1, rdst, cdst ); } } else { /* * Every process in the grid needs the answer */ top = PB_Ctop( &ctxt, BCAST, ALL, TOP_GET ); if( ( XmyprocR == XprocR ) && ( XmyprocD == XprocD ) ) { TYPE->Cgebs2d( ctxt, ALL, top, 1, 1, DOT, 1 ); } else { TYPE->Cgebr2d( ctxt, ALL, top, 1, 1, DOT, 1, XprocR, XprocD ); } } } } else { /* * If XisR, then the result has already been sent in every process of the grid */ if( XisR ) return; if( RRorCC ) { /* * If YisD && YisOne => YisRow => XisRow, communication orthogonal to sub( X )'s * direction: only process column YprocD is involved. */ if( YisD && YisOne && ( YmyprocD != YprocD ) ) return; if( YisR ) { /* * YisR and sub( Y ) is // to sub( X ) => bcast orthogonal to sub( X ) direction */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocD, XprocR ); } } else if( XprocR != YprocR ) { /* * Send from one column/row to another if they differ */ if( XisRow ) { if( XmyprocR == XprocR ) TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, YprocR, YmyprocD ); if( YmyprocR == YprocR ) TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XprocR, XmyprocD ); } else { if( XmyprocR == XprocR ) TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, YmyprocD, YprocR ); if( YmyprocR == YprocR ) TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XmyprocD, XprocR ); } } } else { /* * If XisRow then !YisRow and thus bcast result in all rows if YisR or in * process row YprocR otherwise. If !YisD || ( YisD && !YisOne ), then result * should be sent in the same processes because they span a row or a column of * the grid. */ if( XisRow || !( YisD ) || ( YisD && !( YisOne ) ) ) { if( YisR || YmyprocR == YprocR ) { if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocD, XprocR ); } } } else if( XprocR != YprocD ) { /* * YisD && YisOne => YisRow => !XisRow, so the column of processes owning * sub( X ) send the result to the column YprocD. The process rows involved * in the operation depend on YisR. */ if( YisR || YmyprocR == YprocR ) { if( XmyprocR == XprocR ) { TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, YmyprocR, YprocD ); } if( YmyprocD == YprocD ) { TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, YmyprocR, XprocR ); } } } } } /* * End of PB_Cpdot11 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpdotND.c000644 000766 000024 00000067510 10363532303 020250 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpdotND( PBTYP_T * TYPE, int N, char * DOT, char * X, int IX, int JX, int * DESCX, int INCX, char * Y, int IY, int JY, int * DESCY, int INCY, VVDOT_T FDOT ) #else void PB_CpdotND( TYPE, N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, FDOT ) /* * .. Scalar Arguments .. */ int INCX, INCY, IX, IY, JX, JY, N; char * DOT; PBTYP_T * TYPE; VVDOT_T FDOT; /* * .. Array Arguments .. */ int * DESCX, * DESCY; char * X, * Y; #endif { /* * Purpose * ======= * * PB_CpdotND forms the dot product of two subvectors, * * DOT := sub( X )**T * sub( Y ) or DOT := sub( X )**H * sub( Y ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * sub( X ) is assumed to be not distributed, and sub( Y ) is assumed to * be distributed. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * multiplied. N must be at least zero. * * DOT (local output) pointer to CHAR * On exit, DOT specifies the dot product of the two subvectors * sub( X ) and sub( Y ) only in their scope (See below for fur- * ther details). * * X (local input) pointer to CHAR * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) pointer to CHAR * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * FDOT (local input) pointer to a function of type VVDOT * On entry, FDOT points to a subroutine that computes the local * dot product of two vectors. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * top; int RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XnprocsD, XnprocsR, XprocR, Xroc, Xrow, Ycol, Yii, Yinb1D, YisR, YisRow, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yroc, Yrow, ctxt, ione=1, k, kbb, kk, kn, ktmp, mycol, mydist, myproc, myrow, npcol, nprow, p, size; /* * .. Local Arrays .. */ char * Xptr = NULL, * Yptr = NULL, * buf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... */ PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 ) { Xld = DESCX[LLD_]; Xlinc = Xld; XmyprocD = mycol; XnprocsD = npcol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); } else { Xld = DESCX[LLD_]; Xlinc = 1; XmyprocD = myrow; XnprocsD = nprow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... */ PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 ) { YnbD = DESCY[NB_]; Yld = DESCY[LLD_]; Ylinc = Yld; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YprocD = Ycol; YmyprocD = mycol; YnprocsD = npcol; Yinb1D = PB_Cfirstnb( N, JY, DESCY[INB_], YnbD ); } else { YnbD = DESCY[MB_]; Yld = DESCY[LLD_]; Ylinc = 1; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YprocD = Yrow; YmyprocD = myrow; YnprocsD = nprow; Yinb1D = PB_Cfirstnb( N, IY, DESCY[IMB_], YnbD ); } YisR = ( ( YprocR == -1 ) || ( YnprocsR == 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * sub( X ) is not distributed and sub( Y ) is distributed */ if( !( XisR ) ) { /* * sub( X ) is not replicated. Since this operation is local if sub( X ) and * sub( Y ) are both row or column vectors, choose YprocR = XprocR when RRorCC, * and YprocR = 0 otherwise. */ if( YisR ) { YprocR = ( ( RRorCC ) ? XprocR : 0 ); } /* * Now, it is just like sub( Y ) is not replicated, this information however is * kept in YisR for later use. */ if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( XprocR == YprocR ) { /* * sub( X ) and sub( Y ) are in the same process row or column */ if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) ) { size = TYPE->size; YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); /* * In a given process, the dot product is computed with sub( Y ) and the cor- * responding non distributed part of sub( X ). In the other processes, this * part of sub( X ) is simply ignored. */ if( YnpD > 0 ) { Yroc = YprocD; if( XisRow ) { kk = Yjj; ktmp = JX + N; kn = JX + Yinb1D; } else { kk = Yii; ktmp = IX + N; kn = IX + Yinb1D; } if( YmyprocD == Yroc ) { FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); kk += Yinb1D; } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( YmyprocD == Yroc ) { if( XisRow ) FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ), &Xlinc, Mptr( Y, Yii, kk, Yld, size ), &Ylinc ); else FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ), &Xlinc, Mptr( Y, kk, Yjj, Yld, size ), &Ylinc ); kk += kbb; } Yroc = MModAdd1( Yroc, YnprocsD ); } } /* * Replicate locally scattered dot product by reducing it */ if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 ); } } } else { /* * sub( X ) and sub( Y ) are in a different process row or column */ if( YmyprocR == YprocR ) { size = TYPE->size; YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); /* * If I own a piece of sub( Y ), then send it to the process row or column where * sub( X ) resides and receive the dot product when sub( Y ) is not replicated. */ if( YisRow ) { if( YnpD > 0 ) TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, YmyprocD ); TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XprocR, XmyprocD ); } else { if( YnpD > 0 ) TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, YmyprocD, XprocR ); TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XmyprocD, XprocR ); } } if( XmyprocR == XprocR ) { size = TYPE->size; YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); /* * If I own sub( X ), then receive the distributed part of sub( Y ) owned by * the process where sub( Y ) resides in my row or column. Compute the partial * dot product as if sub( Y ) would reside in the same process row or column as * sub( X ). Combine the local results. */ if( YnpD > 0 ) { buf = PB_Cmalloc( YnpD * size ); if( YisRow ) TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR, XmyprocD ); else TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, XmyprocD, YprocR ); Yroc = YprocD; kk = 0; if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; } else { ktmp = IX + N; kn = IX + Yinb1D; } if( YmyprocD == Yroc ) { FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); kk += Yinb1D; } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( YmyprocD == Yroc ) { if( XisRow ) FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ), &Xlinc, buf+kk*size, &ione ); else FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ), &Xlinc, buf+kk*size, &ione ); kk += kbb; } Yroc = MModAdd1( Yroc, YnprocsD ); } if( buf ) free( buf ); } /* * Combine the local results within the process row or column XprocR and * send the result to the process row or column YprocR when sub( Y ) is not * replicated. */ if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 ); if( !YisR ) TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, YprocR, YmyprocD ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 ); if( !YisR ) TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, YmyprocD, YprocR ); } } } if( YisR ) { /* * If sub( Y ) is replicated, then bcast the result */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocD, XprocR ); } } } else { /* * sub( X ) and sub( Y ) are not both row or column vectors */ if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) ) { size = TYPE->size; Xroc = 0; if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; } else { ktmp = IX + N; kn = IX + Yinb1D; } /* * Loop over the processes in which sub( Y ) resides, for each process find the * next process Xroc and compute the dot product. After this, it will be needed * to reduce the local dot produsts as above. */ for( p = 0; p < YnprocsD; p++ ) { mydist = MModSub( p, YprocD, YnprocsD ); myproc = MModAdd( YprocD, mydist, YnprocsD ); if( ( XprocR == p ) && ( YprocR == Xroc ) ) { /* * Compute locally the partial dot product at the intersection of the process * cross. */ if( ( XmyprocR == p ) && ( XmyprocD == Xroc ) ) { YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD, YnprocsD ); if( YnpD > 0 ) { Yroc = YprocD; kk = ( XisRow ? Yii : Yjj ); if( myproc == Yroc ) { FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); kk += Yinb1D; } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( myproc == Yroc ) { if( XisRow ) FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ), &Xlinc, Mptr( Y, kk, Yjj, Yld, size ), &Ylinc ); else FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, kk, Yld, size ), &Ylinc ); kk += kbb; } Yroc = MModAdd1( Yroc, YnprocsD ); } } } } else { /* * Message exchange */ if( ( YmyprocR == YprocR ) && ( YmyprocD == p ) ) { YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD, YnprocsD ); if( YnpD > 0 ) { if( XisRow ) TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, Xroc ); else TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld, size ), Yld, Xroc, XprocR ); } } if( ( XmyprocR == XprocR ) && ( XmyprocD == Xroc ) ) { YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD, YnprocsD ); if( YnpD > 0 ) { buf = PB_Cmalloc( YnpD * size ); Yroc = YprocD; kk = 0; /* * Receive the piece of sub( Y ) that I should handle */ if( XisRow ) TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, p, YprocR ); else TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR, p ); if( myproc == Yroc ) { FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); kk += Yinb1D; } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( myproc == Yroc ) { if( XisRow ) FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ), &Xlinc, buf+kk*size, &ione ); else FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ), &Xlinc, buf+kk*size, &ione ); kk += kbb; } Yroc = MModAdd1( Yroc, YnprocsD ); } if( buf ) free( buf ); } } } Xroc = MModAdd1( Xroc, XnprocsD ); } /* * Combine the local results in sub( X )'s scope */ if( XmyprocR == XprocR ) { if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 ); } } } /* * Broadcast the result in sub( Y )'s scope */ if( YisR || ( YmyprocR == YprocR ) ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( XmyprocR == XprocR ) TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, YmyprocR, XprocR ); } else { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( XmyprocR == XprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR, YmyprocR ); } } } } else { /* * sub( X ) is replicated in every process. Compute the local dot product in * process row or column YprocR when sub( Y ) is not replicated and in every * process otherwise. */ if( YisR || ( YmyprocR == YprocR ) ) { size = TYPE->size; Yroc = YprocD; kk = ( YisRow ? Yjj : Yii ); if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; } else { ktmp = IX + N; kn = IX + Yinb1D; } if( YmyprocD == Yroc ) { FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); kk += Yinb1D; } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( YmyprocD == Yroc ) { if( XisRow ) { Xptr = Mptr( X, Xii, k, Xld, size ); } else { Xptr = Mptr( X, k, Xjj, Xld, size ); } if( YisRow ) { Yptr = Mptr( Y, Yii, kk, Yld, size ); } else { Yptr = Mptr( Y, kk, Yjj, Yld, size ); } FDOT( &kbb, DOT, Xptr, &Xlinc, Yptr, &Ylinc ); kk += kbb; } Yroc = MModAdd1( Yroc, YnprocsD ); } } if( YisR ) { /* * sub( Y ) is replicated, combine the results in each process row or column. */ if( YisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 ); } } else { /* * sub( Y ) is not replicated, combine the results in the entire grid at once. */ top = PB_Ctop( &ctxt, COMBINE, ALL, TOP_GET ); TYPE->Cgsum2d( ctxt, ALL, top, 1, 1, DOT, 1, -1, 0 ); } } /* * End of PB_CpdotND */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpdotNN.c000644 000766 000024 00000043465 10363532303 020265 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpdotNN( PBTYP_T * TYPE, int N, char * DOT, char * X, int IX, int JX, int * DESCX, int INCX, char * Y, int IY, int JY, int * DESCY, int INCY, VVDOT_T FDOT ) #else void PB_CpdotNN( TYPE, N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY, FDOT ) /* * .. Scalar Arguments .. */ int INCX, INCY, IX, IY, JX, JY, N; char * DOT; PBTYP_T * TYPE; VVDOT_T FDOT; /* * .. Array Arguments .. */ int * DESCX, * DESCY; char * X, * Y; #endif { /* * Purpose * ======= * * PB_CpdotNN forms the dot product of two subvectors, * * DOT := sub( X )**T * sub( Y ) or DOT := sub( X )**H * sub( Y ), * * where * * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Both subvectors are assumed to be not distributed. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * multiplied. N must be at least zero. * * DOT (local output) pointer to CHAR * On exit, DOT specifies the dot product of the two subvectors * sub( X ) and sub( Y ) only in their scope (See below for fur- * ther details). * * X (local input) pointer to CHAR * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input) pointer to CHAR * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * FDOT (local input) pointer to a function of type VVDOT * On entry, FDOT points to a subroutine that computes the local * dot product of two vectors. * * Further Details * =============== * * When the result of a vector-oriented PBLAS call is a scalar, this * scalar is set only within the process scope which owns the vector(s) * being operated on. Let sub( X ) be a generic term for the input vec- * tor(s). Then, the processes owning the correct the answer is determi- * ned as follows: if an operation involves more than one vector, the * processes receiving the result will be the union of the following set * of processes for each vector: * * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- * cess row or process column owns the vector operand, therefore only * the process owning sub( X ) receives the correct result; * * If INCX = M_X, then sub( X ) is a vector distributed over a process * row. Each process in this row receives the result; * * If INCX = 1, then sub( X ) is a vector distributed over a process * column. Each process in this column receives the result; * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Xscope, Yscope, * top; int RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XnprocsR, XprocR, Xrow, Ycol, Yii, YisR, YisRow, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnprocsR, YprocR, Yrow, csrc, ctxt, ione=1, mycol, myrow, npcol, nprow, rsrc, size; /* * .. Local Arrays .. */ char * buf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... */ PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 ) { Xld = DESCX[LLD_]; Xlinc = Xld; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); } else { Xld = DESCX[LLD_]; Xlinc = 1; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... */ PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 ) { Yld = DESCY[LLD_]; Ylinc = Yld; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); } else { Yld = DESCY[LLD_]; Ylinc = 1; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); } /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * Neither sub( X ) nor sub( Y ) are distributed */ if( !XisR ) { /* * sub( X ) is not replicated */ if( !( YisR ) ) { /* * sub( Y ) is not replicated */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) /* * If I am not in XprocR or YprocR, then return immediately */ return; size = TYPE->size; if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( XprocR == YprocR ) { /* * sub( X ) and sub( Y ) are in the same process row or column */ FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); } else { /* * sub( X ) and sub( Y ) are in a different process row or column */ if( XmyprocR == XprocR ) { buf = PB_Cmalloc( N * size ); /* * Send sub( X ) to where sub( Y ) resides, and receive sub( Y ) from the same * location. */ if( XisRow ) { TYPE->Cgesd2d( ctxt, 1, N, Mptr( X, Xii, Xjj, Xld, size ), Xld, YprocR, XmyprocD ); TYPE->Cgerv2d( ctxt, 1, N, buf, 1, YprocR, XmyprocD ); } else { TYPE->Cgesd2d( ctxt, N, 1, Mptr( X, Xii, Xjj, Xld, size ), Xld, XmyprocD, YprocR ); TYPE->Cgerv2d( ctxt, N, 1, buf, N, XmyprocD, YprocR ); } FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); if( buf ) free( buf ); } if( YmyprocR == YprocR ) { buf = PB_Cmalloc( N * size ); /* * Send sub( Y ) to where sub( X ) resides, and receive sub( X ) from the same * location. */ if( YisRow ) { TYPE->Cgesd2d( ctxt, 1, N, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, YmyprocD ); TYPE->Cgerv2d( ctxt, 1, N, buf, 1, XprocR, YmyprocD ); } else { TYPE->Cgesd2d( ctxt, N, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, YmyprocD, XprocR ); TYPE->Cgerv2d( ctxt, N, 1, buf, N, YmyprocD, XprocR ); } FDOT( &N, DOT, buf, &ione, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); if( buf ) free( buf ); } } } else { /* * sub( X ) and sub( Y ) are not both row or column vectors */ if( ( XmyprocR == XprocR ) && ( YmyprocR == YprocR ) ) { /* * If I am at the intersection of the process row and column, then compute the * dot product and broadcast it in my process row and column. */ FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); } else if( XmyprocR == XprocR ) { if( XisRow ) { Xscope = CROW; rsrc = XprocR; csrc = YprocR; } else { Xscope = CCOLUMN; rsrc = YprocR; csrc = XprocR; } top = PB_Ctop( &ctxt, BCAST, &Xscope, TOP_GET ); TYPE->Cgebr2d( ctxt, &Xscope, top, 1, 1, DOT, 1, rsrc, csrc ); } else if( YmyprocR == YprocR ) { if( YisRow ) { Yscope = CROW; rsrc = YprocR; csrc = XprocR; } else { Yscope = CCOLUMN; rsrc = XprocR; csrc = YprocR; } top = PB_Ctop( &ctxt, BCAST, &Yscope, TOP_GET ); TYPE->Cgebr2d( ctxt, &Yscope, top, 1, 1, DOT, 1, rsrc, csrc ); } } } else { /* * sub( Y ) is replicated */ if( XmyprocR == XprocR ) { /* * If I am in the process row (resp. column) owning sub( X ), then compute the * dot product and broadcast in my column (resp. row). */ size = TYPE->size; FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); } } else { /* * Otherwise, receive the dot product */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocD, XprocR ); } } } } else { /* * sub( X ) is replicated */ if( YisR || ( YmyprocR == YprocR ) ) { /* * If I own a piece of sub( Y ), then compute the dot product */ size = TYPE->size; FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); } if( !YisR ) { /* * If sub( Y ) is not replicated, then broadcast the result to the other * processes that own a piece of sub( X ), but were not involved in the above * dot-product computation. */ if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocR == YprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, YprocR, YmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocR == YprocR ) TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); else TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, YmyprocD, YprocR ); } } } /* * End of PB_CpdotNN */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cpgeadd.c000644 000766 000024 00000053417 10363532303 020305 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cpgeadd( PBTYP_T * TYPE, char * DIRECA, char * DIRECC, char * CONJUG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_Cpgeadd( TYPE, DIRECA, DIRECC, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * CONJUG, * DIRECA, * DIRECC; int IA, IC, JA, JC, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCC; char * A, * C; #endif { /* * Purpose * ======= * * PB_Cpgeadd adds a matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = conjg( X ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if CONJUG = 'N', * conjg(A(IA:IA+N-1,JA:JA+M-1)) if CONJUG = 'C'. * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECA (global input) pointer to CHAR * On entry, DIRECA specifies the direction in which the rows * or columns of sub( A ) should be looped over as follows: * DIRECA = 'F' or 'f' forward or increasing, * DIRECA = 'B' or 'b' backward or decreasing. * * DIRECC (global input) pointer to CHAR * On entry, DIRECC specifies the direction in which the rows * or columns of sub( C ) should be looped over as follows: * DIRECC = 'F' or 'f' forward or increasing, * DIRECC = 'B' or 'b' backward or decreasing. * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether conjg( sub( A ) ) or * sub( A ) should be added to sub( C ) as follows: * CONJUG = 'N' or 'n': * sub( C ) := beta*sub( C ) + alpha*sub( A )' * otherwise * sub( C ) := beta*sub( C ) + alpha*conjg( sub( A ) )'. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrices * sub( A ) and sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatri- * ces sub( A ) and sub( C ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char ACroc, * one, * talpha, * tbeta, * zero; int ACmyprocD, ACmyprocR, ACnD, ACnR, ACnprocsD, ACnprocsR, Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR, AinbD, AinbR, Ainb1D, Ainb1R, AisR, Akk, Ald, AnbD, AnbR, AnpD, AnpR, Aoff, ArocD, ArocR, AsrcR, Cbufld, CcurrocR, Cfr, Cfwd, CiD, CiR, CiiD, CiiR, CinbD, CinbR, Cinb1D, Cinb1R, CisR, Ckk, Cld, CnbD, CnbR, CnpD, CnpR, Coff, CrocD, CrocR, CsrcR, ctxt, k, kb, kbb, lcmb, maxp, maxpm1, maxpq, maxq, mycol, myrow, npcol, npq, nprow, ncpq, nrpq, p=0, q=0, row2row, size, tmp; PB_VM_T VM; /* * .. Local Arrays .. */ int DBUFA[DLEN_], DBUFC[DLEN_]; char * Abuf = NULL, * Cbuf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Loop over the rows of sub( C ) when M <= N, and the columns of sub( C ) * otherwise. */ row2row = ( ( M <= N ) || ( npcol == 1 ) || ( DESCA[CSRC_] == -1 ) ); if( row2row ) { AinbR = DESCA[IMB_]; AnbR = DESCA[MB_]; AsrcR = DESCA[RSRC_]; CinbR = DESCC[IMB_]; CnbR = DESCC[MB_]; CsrcR = DESCC[RSRC_]; /* * If sub( A ) and sub( C ) span only one process row, then there is no need * to pack the data. */ if( !( PB_Cspan( M, IA, AinbR, AnbR, AsrcR, nprow ) ) && !( PB_Cspan( M, IC, CinbR, CnbR, CsrcR, nprow ) ) ) { PB_Cpaxpby( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, ROW, BETA, C, IC, JC, DESCC, ROW ); return; } /* * Compute local information for sub( A ) and sub( C ) */ ACnR = M; ACnD = N; ACmyprocR = myrow; ACnprocsR = nprow; ACmyprocD = mycol; ACnprocsD = npcol; ACroc = CROW; AiR = IA; AiD = JA; AinbD = DESCA[INB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_]; PB_Cinfog2l( IA, JA, DESCA, ACnprocsR, ACnprocsD, ACmyprocR, ACmyprocD, &AiiR, &AiiD, &ArocR, &ArocD ); CiR = IC; CiD = JC; CinbD = DESCC[INB_]; CnbD = DESCC[NB_]; Cld = DESCC[LLD_]; PB_Cinfog2l( IC, JC, DESCC, ACnprocsR, ACnprocsD, ACmyprocR, ACmyprocD, &CiiR, &CiiD, &CrocR, &CrocD ); } else { AinbR = DESCA[INB_]; AnbR = DESCA[NB_]; AsrcR = DESCA[CSRC_]; CinbR = DESCC[INB_]; CnbR = DESCC[NB_]; CsrcR = DESCC[CSRC_]; /* * If sub( A ) and sub( C ) span only one process column, then there is no need * to pack the data. */ if( !( PB_Cspan( N, JA, AinbR, AnbR, AsrcR, npcol ) ) && !( PB_Cspan( N, JC, CinbR, CnbR, CsrcR, npcol ) ) ) { PB_Cpaxpby( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, COLUMN, BETA, C, IC, JC, DESCC, COLUMN ); return; } /* * Compute local information for sub( A ) and sub( C ) */ ACnR = N; ACnD = M; ACmyprocR = mycol; ACnprocsR = npcol; ACmyprocD = myrow; ACnprocsD = nprow; ACroc = CCOLUMN; AiR = JA; AiD = IA; AinbD = DESCA[IMB_]; AnbD = DESCA[MB_]; Ald = DESCA[LLD_]; PB_Cinfog2l( IA, JA, DESCA, ACnprocsD, ACnprocsR, ACmyprocD, ACmyprocR, &AiiD, &AiiR, &ArocD, &ArocR ); CiR = JC; CiD = IC; CinbD = DESCC[IMB_]; CnbD = DESCC[MB_]; Cld = DESCC[LLD_]; PB_Cinfog2l( IC, JC, DESCC, ACnprocsD, ACnprocsR, ACmyprocD, ACmyprocR, &CiiD, &CiiR, &CrocD, &CrocR ); } size = TYPE->size; one = TYPE->one; zero = TYPE->zero; kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); Ainb1D = PB_Cfirstnb( ACnD, AiD, AinbD, AnbD ); AnpD = PB_Cnumroc( ACnD, 0, Ainb1D, AnbD, ACmyprocD, ArocD, ACnprocsD ); Ainb1R = PB_Cfirstnb( ACnR, AiR, AinbR, AnbR ); AisR = ( ( AsrcR < 0 ) || ( ACnprocsR == 1 ) ); Cinb1D = PB_Cfirstnb( ACnD, CiD, CinbD, CnbD ); CnpD = PB_Cnumroc( ACnD, 0, Cinb1D, CnbD, ACmyprocD, CrocD, ACnprocsD ); Cinb1R = PB_Cfirstnb( ACnR, CiR, CinbR, CnbR ); CisR = ( ( CsrcR < 0 ) || ( ACnprocsR == 1 ) ); lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : ACnprocsR ) ) * CnbR, ( maxq = ( AisR ? 1 : ACnprocsR ) ) * AnbR ); Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD ); Cfwd = ( Mupcase( DIRECC[0] ) == CFORWARD ); /* * When sub( A ) is not replicated and backward pass on sub( A ), find the * virtual process q owning the last row or column of sub( A ). */ if( !( AisR ) && !( Afwd ) ) { tmp = PB_Cindxg2p( ACnR-1, Ainb1R, AnbR, ArocR, ArocR, ACnprocsR ); q = MModSub( tmp, ArocR, ACnprocsR ); } /* * When sub( C ) is not replicated and backward pass on sub( C ), find the * virtual process p owning the last row or column of sub( C ). */ if( !( CisR ) && !( Cfwd ) ) { tmp = PB_Cindxg2p( ACnR-1, Cinb1R, CnbR, CrocR, CrocR, ACnprocsR ); p = MModSub( tmp, CrocR, ACnprocsR ); } /* * Loop over the processes of the virtual grid */ maxpm1 = maxp - 1; maxpq = maxp * maxq; for( k = 0; k < maxpq; k++ ) { AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, ACnprocsR ) ); CcurrocR = ( CisR ? -1 : MModAdd( CrocR, p, ACnprocsR ) ); if( ( AisR || ( ACmyprocR == AcurrocR ) ) || ( CisR || ( ACmyprocR == CcurrocR ) ) ) { Ckk = CiiR; Akk = AiiR; /* * Initialize local virtual matrix in process (p,q) */ AnpR = PB_Cnumroc( ACnR, 0, Ainb1R, AnbR, AcurrocR, ArocR, ACnprocsR ); CnpR = PB_Cnumroc( ACnR, 0, Cinb1R, CnbR, CcurrocR, CrocR, ACnprocsR ); PB_CVMinit( &VM, 0, CnpR, AnpR, Cinb1R, Ainb1R, CnbR, AnbR, p, q, maxp, maxq, lcmb ); /* * Figure out how many diagonal entries in this new virtual process (npq). */ npq = PB_CVMnpq( &VM ); /* * Re-adjust the number of rows or columns to be (un)packed, in order to average * the message sizes. */ if( npq ) kbb = npq / ( ( npq - 1 ) / kb + 1 ); if( row2row ) { while( npq ) { kbb = MIN( kbb, npq ); /* * Find out how many rows of sub( A ) and sub( C ) are contiguous */ PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Aoff ); /* * Compute the descriptor DBUFA for the buffer that will contained the packed * rows of sub( A ). */ if( ( Afr = ( ncpq < kbb ) ) != 0 ) { /* * If rows of sub( A ) are not contiguous, then allocate the buffer and pack * the kbb rows of sub( A ). */ Abufld = kbb; if( AisR || ( ACmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * kbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, &ACroc, PACKING, NOTRAN, kbb, AnpD, one, Mptr( A, Akk, AiiD, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( A ) directly. */ Abufld = Ald; if( AisR || ( ACmyprocR == AcurrocR ) ) Abuf = Mptr( A, Akk+Aoff, AiiD, Ald, size ); } PB_Cdescset( DBUFA, kbb, ACnD, kbb, Ainb1D, kbb, AnbD, AcurrocR, ArocD, ctxt, Abufld ); /* * Compute the descriptor DBUFC for the buffer that will contained the packed * rows of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < kbb ) ) != 0 ) { /* * If rows of sub( C ) are not contiguous, then allocate receiving buffer. */ Cbufld = kbb; talpha = one; tbeta = zero; if( CisR || ( ACmyprocR == CcurrocR ) ) Cbuf = PB_Cmalloc( CnpD * kbb * size ); } else { /* * Otherwise, re-use sub( C ) directly. */ Cbufld = Cld; talpha = ALPHA; tbeta = BETA; if( CisR || ( ACmyprocR == CcurrocR ) ) Cbuf = Mptr( C, Ckk+Coff, CiiD, Cld, size ); } PB_Cdescset( DBUFC, kbb, ACnD, kbb, Cinb1D, kbb, CnbD, CcurrocR, CrocD, ctxt, Cbufld ); /* * Add the one-dimensional buffer Abuf into Cbuf. */ PB_Cpaxpby( TYPE, CONJUG, kbb, ACnD, talpha, Abuf, 0, 0, DBUFA, &ACroc, tbeta, Cbuf, 0, 0, DBUFC, &ACroc ); /* * Release the buffer containing the packed rows of sub( A ) */ if( Afr && ( AisR || ( ACmyprocR == AcurrocR ) ) ) if( Abuf ) free( Abuf ); /* * Unpack the kbb rows of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( ACmyprocR == CcurrocR ) ) ) { PB_CVMpack( TYPE, &VM, ROW, &ACroc, UNPACKING, NOTRAN, kbb, CnpD, BETA, Mptr( C, Ckk, CiiD, Cld, size ), Cld, ALPHA, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } /* * Update the local row indexes of sub( A ) and sub( C ) */ PB_CVMupdate( &VM, kbb, &Ckk, &Akk ); npq -= kbb; } } else { while( npq ) { kbb = MIN( kbb, npq ); /* * Find out how many columns of sub( A ) and sub( C ) are contiguous */ PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Aoff ); /* * Compute the descriptor DBUFA for the buffer that will contained the packed * columns of sub( A ). */ if( ( Afr = ( ncpq < kbb ) ) != 0 ) { /* * If columns of sub( A ) are not contiguous, then allocate the buffer and * pack the kbb columns of sub( A ). */ Abufld = MAX( 1, AnpD ); if( AisR || ( ACmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * kbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, &ACroc, PACKING, NOTRAN, kbb, AnpD, one, Mptr( A, AiiD, Akk, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( A ) directly. */ Abufld = Ald; if( AisR || ( ACmyprocR == AcurrocR ) ) Abuf = Mptr( A, AiiD, Akk+Aoff, Ald, size ); } PB_Cdescset( DBUFA, ACnD, kbb, Ainb1D, kbb, AnbD, kbb, ArocD, AcurrocR, ctxt, Abufld ); /* * Compute the descriptor DBUFC for the buffer that will contained the packed * columns of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < kbb ) ) != 0 ) { /* * If columns of sub( C ) are not contiguous, then allocate receiving buffer. */ Cbufld = MAX( 1, CnpD ); talpha = one; tbeta = zero; if( CisR || ( ACmyprocR == CcurrocR ) ) Cbuf = PB_Cmalloc( CnpD * kbb * size ); } else { Cbufld = Cld; talpha = ALPHA; tbeta = BETA; if( CisR || ( ACmyprocR == CcurrocR ) ) Cbuf = Mptr( C, CiiD, Ckk+Coff, Cld, size ); } PB_Cdescset( DBUFC, ACnD, kbb, Cinb1D, kbb, CnbD, kbb, CrocD, CcurrocR, ctxt, Cbufld ); /* * Add the one-dimensional buffer Abuf into Cbuf. */ PB_Cpaxpby( TYPE, CONJUG, ACnD, kbb, talpha, Abuf, 0, 0, DBUFA, &ACroc, tbeta, Cbuf, 0, 0, DBUFC, &ACroc ); /* * Release the buffer containing the packed columns of sub( A ) */ if( Afr && ( AisR || ( ACmyprocR == AcurrocR ) ) ) if( Abuf ) free( Abuf ); /* * Unpack the kbb columns of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( ACmyprocR == CcurrocR ) ) ) { PB_CVMpack( TYPE, &VM, ROW, &ACroc, UNPACKING, NOTRAN, kbb, CnpD, BETA, Mptr( C, CiiD, Ckk, Cld, size ), Cld, ALPHA, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } /* * Update the local row index of sub( A ) and the local column index of sub( C ) */ PB_CVMupdate( &VM, kbb, &Ckk, &Akk ); npq -= kbb; } } } /* * Go to the next virtual process (p,q) */ if( ( Cfwd && ( p == maxpm1 ) ) || ( !( Cfwd ) && ( p == 0 ) ) ) q = ( Afwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) ); p = ( Cfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) ); } /* * End of PB_Cpgeadd */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpgemmAB.c000644 000766 000024 00000063776 10363532303 020402 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpgemmAB( PBTYP_T * TYPE, char * DIRECA, char * DIRECB, char * TRANSA, char * TRANSB, int M, int N, int K, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_CpgemmAB( TYPE, DIRECA, DIRECB, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * DIRECA, * DIRECB, * TRANSA, * TRANSB; int IA, IB, IC, JA, JB, JC, K, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; char * A, * B, * C; #endif { /* * Purpose * ======= * * PB_CpgemmAB performs one of the matrix-matrix operations * * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N', * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T', * conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C', * * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N', * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T', * conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'. * * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) ) * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and * sub( C ) is an m by n submatrix. * * This is the outer-product algorithm using the logical LCM hybrid * algorithmic blocking technique. The submatrix operand sub( C ) stays * in place. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECA (global input) pointer to CHAR * On entry, DIRECA specifies the direction in which the rows * or columns of sub( A ) should be looped over as follows: * DIRECA = 'F' or 'f' forward or increasing, * DIRECA = 'B' or 'b' backward or decreasing. * * DIRECB (global input) pointer to CHAR * On entry, DIRECB specifies the direction in which the rows * or columns of sub( B ) should be looped over as follows: * DIRECB = 'F' or 'f' forward or increasing, * DIRECB = 'B' or 'b' backward or decreasing. * * TRANSA (global input) pointer to CHAR * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * TRANSB (global input) pointer to CHAR * On entry, TRANSB specifies the form of op( sub( B ) ) to be * used in the matrix multiplication as follows: * * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ), * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )', * TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * op( sub( A ) ) and of the submatrix sub( C ). M must be at * least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * op( sub( B ) ) and the number of columns of the submatrix * sub( C ). N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns of the submatrix * op( sub( A ) ) and the number of rows of the submatrix * op( sub( B ) ). K must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Aroc, Broc, TrA, TrB, * one, * tbeta, * zero; int ABrocs, Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR, AinbD, AinbR, Ainb1D, Ainb1R, AisR, AkkR, Ald, AmyprocD, AmyprocR, AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff, ArocD, ArocR, AsrcR, Bbufld, BcurrocR, Bfr, Bfwd, BiD, BiR, BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR, BkkR, Bld, BmyprocD, BmyprocR, BnbD, BnbR, BnpD, BnpR, BnprocsD, BnprocsR, Boff, BrocD, BrocR, BsrcR, Ccol, Cii, Cimb1, Cinb1, Cjj, Cld, Cmb, Cmp, Cnb, Cnq, Crow, WAfr, WAsum, WBfr, WBsum, Wkbb=0, ctxt, k, kb, kbb, lcmb, maxp, maxpm1, maxq, mycol, myrow, ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp; GEMM_T gemm; /* * .. Local Arrays .. */ PB_VM_T VM; int Cd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd0[DLEN_], WBd0[DLEN_]; char * Abuf = NULL, * Bbuf = NULL, * Cptr = NULL, * WA = NULL, * WB = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN ); notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN ); TrA = ( ( TrA == CCOTRAN ) ? CCONJG : CNOCONJG ); TrB = ( ( TrB == CCOTRAN ) ? CCONJG : CNOCONJG ); size = TYPE->size; /* * Retrieve local information for sub( A ), sub( B ) and sub( C ) */ if( nota ) { AiR = JA; Aroc = CCOLUMN; AnprocsR = npcol; AinbR = DESCA[INB_]; AnbR = DESCA[NB_ ]; AsrcR = DESCA[CSRC_]; } else { AiR = IA; Aroc = CROW; AnprocsR = nprow; AinbR = DESCA[IMB_]; AnbR = DESCA[MB_ ]; AsrcR = DESCA[RSRC_]; } if( notb ) { BiR = IB; Broc = CROW; BnprocsR = nprow; BinbR = DESCB[IMB_]; BnbR = DESCB[MB_ ]; BsrcR = DESCB[RSRC_]; } else { BiR = JB; Broc = CCOLUMN; BnprocsR = npcol; BinbR = DESCB[INB_]; BnbR = DESCB[NB_ ]; BsrcR = DESCB[CSRC_]; } /* * Retrieve sub( C )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cdescribe( M, N, IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj, &Cld, &Cimb1, &Cinb1, &Cmb, &Cnb, &Crow, &Ccol, Cd0 ); Cmp = PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow ); Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol ); /* * When sub( A ) and sub( B ) do not span more than one process row or column, * there is no need to pack the data. */ if( !( PB_Cspan( K, AiR, AinbR, AnbR, AsrcR, AnprocsR ) ) && !( PB_Cspan( K, BiR, BinbR, BnbR, BsrcR, BnprocsR ) ) ) { PB_CInV( TYPE, &TrA, COLUMN, M, N, Cd0, K, A, IA, JA, DESCA, &Aroc, &WA, WAd0, &WAfr ); PB_CInV( TYPE, &TrB, ROW, M, N, Cd0, K, B, IB, JB, DESCB, &Broc, &WB, WBd0, &WBfr ); if( ( Cmp > 0 ) && ( Cnq > 0 ) ) { /* * Perform the local update if I own some of sub( C ) */ TYPE->Fgemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq, &K, ALPHA, WA, &WAd0[LLD_], WB, &WBd0[LLD_], BETA, Mptr( C, Cii, Cjj, Cld, size ), &Cld ); } if( WAfr ) free( WA ); if( WBfr ) free( WB ); return; } /* * sub( A ) and sub( B ) span more than one process row or column. */ Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD ); Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD ); one = TYPE->one; zero = TYPE->zero; tbeta = BETA; gemm = TYPE->Fgemm; kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Compute local information for sub( A ) and sub( B ) */ if( nota ) { AiD = IA; AinbD = DESCA[IMB_]; AnbD = DESCA[MB_]; Ald = DESCA[LLD_]; AmyprocD = myrow; AmyprocR = mycol; AnprocsD = nprow; PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR, &AiiD, &AiiR, &ArocD, &ArocR ); } else { AiD = JA; AinbD = DESCA[INB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_]; AmyprocD = mycol; AmyprocR = myrow; AnprocsD = npcol; PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD, &AiiR, &AiiD, &ArocR, &ArocD ); } Ainb1D = PB_Cfirstnb( M, AiD, AinbD, AnbD ); AnpD = PB_Cnumroc( M, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD ); Ainb1R = PB_Cfirstnb( K, AiR, AinbR, AnbR ); AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) ); if( notb ) { BiD = JB; BinbD = DESCB[INB_]; BnbD = DESCB[NB_]; Bld = DESCB[LLD_]; BmyprocD = mycol; BmyprocR = myrow; BnprocsD = npcol; PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD, &BiiR, &BiiD, &BrocR, &BrocD ); } else { BiD = IB; BinbD = DESCB[IMB_]; BnbD = DESCB[MB_]; Bld = DESCB[LLD_]; BmyprocD = myrow; BmyprocR = mycol; BnprocsD = nprow; PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR, &BiiD, &BiiR, &BrocD, &BrocR ); } Binb1D = PB_Cfirstnb( N, BiD, BinbD, BnbD ); BnpD = PB_Cnumroc( N, 0, Binb1D, BnbD, BmyprocD, BrocD, BnprocsD ); Binb1R = PB_Cfirstnb( K, BiR, BinbR, BnbR ); BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) ); /* * When sub( A ) is not replicated and backward pass on sub( A ), find the * virtual process q owning the last row or column of sub( A ). */ if( !( AisR ) && !( Afwd ) ) { tmp = PB_Cindxg2p( K - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR ); q = MModSub( tmp, ArocR, AnprocsR ); } /* * When sub( B ) is not replicated and backward pass on sub( B ), find the * virtual process p owning the last row or column of sub( B ). */ if( !( BisR ) && !( Bfwd ) ) { tmp = PB_Cindxg2p( K - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR ); p = MModSub( tmp, BrocR, BnprocsR ); } if( Cmp > 0 && Cnq > 0 ) Cptr = Mptr( C, Cii, Cjj, Cld, size ); /* * Allocate work space in process rows and columns spanned by sub( C ) */ PB_COutV( TYPE, COLUMN, NOINIT, M, N, Cd0, kb, &WA, WAd0, &WAfr, &WAsum ); PB_COutV( TYPE, ROW, NOINIT, M, N, Cd0, kb, &WB, WBd0, &WBfr, &WBsum ); /* * Loop over the virtual process grid induced by the sub( A ) and sub( B ) */ lcmb = PB_Clcm( ( maxp = ( BisR ? 1 : BnprocsR ) ) * BnbR, ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR ); maxpm1 = maxp - 1; /* * Find out process coordinates corresponding to first virtual process (p,q) */ AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) ); AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR ); AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR ); BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) ); BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BnprocsR ); BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR ); /* * Find out how many diagonals this virtual process (p,q) has */ PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR, p, q, maxp, maxq, lcmb ); npq = PB_CVMnpq( &VM ); for( k = 0; k < K; k += kb ) { kbb = K - k; kbb = MIN( kbb, kb ); while( Wkbb != kbb ) { /* * Ensure that the current virtual process (p,q) has something to contribute * to the replicated buffers WA and WB. */ while( npq == 0 ) { if( ( Bfwd && ( p == maxpm1 ) ) || ( !( Bfwd ) && ( p == 0 ) ) ) q = ( Afwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) ); p = ( Bfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) ); AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) ); AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR ); AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR ); BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) ); BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BnprocsR ); BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR ); PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR, p, q, maxp, maxq, lcmb ); npq = PB_CVMnpq( &VM ); } /* * Current virtual process (p,q) has something, find out how many rows or * columns could be used: ABrocs. */ if( Wkbb == 0 ) { ABrocs = ( npq < kbb ? npq : kbb ); } else { ABrocs = kbb - Wkbb; ABrocs = MIN( ABrocs, npq ); } /* * Find out how many rows or columns of sub( A ) and sub( B ) are contiguous */ PB_CVMcontig( &VM, &nrpq, &ncpq, &Boff, &Aoff ); if( nota ) { /* * Compute the descriptor DBUFA for the buffer that will contained the packed * columns of sub( A ). */ if( ( Afr = ( ncpq < ABrocs ) ) != 0 ) { /* * If columns of sub( A ) are not contiguous, then allocate the buffer and * pack the ABrocs columns of sub( A ). */ Abufld = MAX( 1, AnpD ); if( AisR || ( AmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * ABrocs * size ); PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, ABrocs, AnpD, one, Mptr( A, AiiD, AkkR, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( A ) directly. */ Abufld = Ald; if( AisR || ( AmyprocR == AcurrocR ) ) Abuf = Mptr( A, AiiD, AkkR + Aoff, Ald, size ); } PB_Cdescset( DBUFA, M, ABrocs, Ainb1D, ABrocs, AnbD, ABrocs, ArocD, AcurrocR, ctxt, Abufld ); } else { /* * Compute the descriptor DBUFA for the buffer that will contained the packed * rows of sub( A ). */ if( ( Afr = ( ncpq < ABrocs ) ) != 0 ) { /* * If rows of sub( A ) are not contiguous, then allocate the buffer and * pack the ABrocs rows of sub( A ). */ Abufld = ABrocs; if( AisR || ( AmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * ABrocs * size ); PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, ABrocs, AnpD, one, Mptr( A, AkkR, AiiD, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( A ) directly. */ Abufld = Ald; if( AisR || ( AmyprocR == AcurrocR ) ) Abuf = Mptr( A, AkkR + Aoff, AiiD, Ald, size ); } PB_Cdescset( DBUFA, ABrocs, M, ABrocs, Ainb1D, ABrocs, AnbD, AcurrocR, ArocD, ctxt, Abufld ); } if( notb ) { /* * Compute the descriptor DBUFB for the buffer that will contained the packed * rows of sub( B ). */ if( ( Bfr = ( nrpq < ABrocs ) ) != 0 ) { /* * If rows of sub( B ) are not contiguous, then allocate the buffer and * pack the ABrocs rows of sub( B ). */ Bbufld = ABrocs; if( BisR || ( BmyprocR == BcurrocR ) ) { Bbuf = PB_Cmalloc( BnpD * ABrocs * size ); PB_CVMpack( TYPE, &VM, ROW, &Broc, PACKING, NOTRAN, ABrocs, BnpD, one, Mptr( B, BkkR, BiiD, Bld, size ), Bld, zero, Bbuf, Bbufld ); } } else { /* * Otherwise, re-use sub( B ) directly. */ Bbufld = Bld; if( BisR || ( BmyprocR == BcurrocR ) ) Bbuf = Mptr( B, BkkR + Boff, BiiD, Bld, size ); } PB_Cdescset( DBUFB, ABrocs, N, ABrocs, Binb1D, ABrocs, BnbD, BcurrocR, BrocD, ctxt, Bbufld ); } else { /* * Compute the descriptor DBUFB for the buffer that will contained the packed * columns of sub( B ). */ if( ( Bfr = ( nrpq < ABrocs ) ) != 0 ) { /* * If columns of sub( B ) are not contiguous, then allocate the buffer and * pack the ABrocs columns of sub( B ). */ Bbufld = MAX( 1, BnpD ); if( BisR || ( BmyprocR == BcurrocR ) ) { Bbuf = PB_Cmalloc( BnpD * ABrocs * size ); PB_CVMpack( TYPE, &VM, ROW, &Broc, PACKING, NOTRAN, ABrocs, BnpD, one, Mptr( B, BiiD, BkkR, Bld, size ), Bld, zero, Bbuf, Bbufld ); } } else { /* * Otherwise, re-use sub( B ) directly. */ Bbufld = Bld; if( BisR || ( BmyprocR == BcurrocR ) ) Bbuf = Mptr( B, BiiD, BkkR + Boff, Bld, size ); } PB_Cdescset( DBUFB, N, ABrocs, Binb1D, ABrocs, BnbD, ABrocs, BrocD, BcurrocR, ctxt, Bbufld ); } /* * Update the local indexes of sub( A ) and sub( B ) */ PB_CVMupdate( &VM, ABrocs, &BkkR, &AkkR ); /* * Replicate panels of rows or columns of sub( A ) and sub( B ) over sub( C ) * -> WA, WB */ PB_CInV2( TYPE, &TrA, COLUMN, M, N, Cd0, ABrocs, Abuf, 0, 0, DBUFA, &Aroc, WA, Wkbb, WAd0 ); PB_CInV2( TYPE, &TrB, ROW, M, N, Cd0, ABrocs, Bbuf, 0, 0, DBUFB, &Broc, WB, Wkbb, WBd0 ); if( Afr & ( AisR || ( AmyprocR == AcurrocR ) ) ) if( Abuf ) free( Abuf ); if( Bfr & ( BisR || ( BmyprocR == BcurrocR ) ) ) if( Bbuf ) free( Bbuf ); /* * ABrocs rows or columns of sub( A ) and sub( B ) have been replicated, * update the number of diagonals in this virtual process as well as the * number of rows or columns of sub( A ) and sub( B ) that are in WA, WB. */ npq -= ABrocs; Wkbb += ABrocs; } /* * Perform local update */ if( Cmp > 0 && Cnq > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq, &kbb, ALPHA, WA, &WAd0[LLD_], WB, &WBd0[LLD_], tbeta, Cptr, &Cld ); tbeta = one; } Wkbb = 0; } if( WAfr ) free( WA ); if( WBfr ) free( WB ); /* * End of PB_CpgemmAB */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpgemmAC.c000644 000766 000024 00000062476 10363532303 020377 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpgemmAC( PBTYP_T * TYPE, char * DIRECA, char * DIRECC, char * TRANSA, char * TRANSB, int M, int N, int K, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_CpgemmAC( TYPE, DIRECA, DIRECC, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * DIRECA, * DIRECC, * TRANSA, * TRANSB; int IA, IB, IC, JA, JB, JC, K, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; char * A, * B, * C; #endif { /* * Purpose * ======= * * PB_CpgemmAC performs one of the matrix-matrix operations * * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N', * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T', * conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C', * * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N', * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T', * conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'. * * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) ) * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and * sub( C ) is an m by n submatrix. * * This is the inner-product algorithm using the logical LCM algorithmic * blocking technique. The submatrix operand sub( B ) stays in place. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECA (global input) pointer to CHAR * On entry, DIRECA specifies the direction in which the rows * or columns of sub( A ) should be looped over as follows: * DIRECA = 'F' or 'f' forward or increasing, * DIRECA = 'B' or 'b' backward or decreasing. * * DIRECC (global input) pointer to CHAR * On entry, DIRECC specifies the direction in which the rows * or columns of sub( C ) should be looped over as follows: * DIRECC = 'F' or 'f' forward or increasing, * DIRECC = 'B' or 'b' backward or decreasing. * * TRANSA (global input) pointer to CHAR * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * TRANSB (global input) pointer to CHAR * On entry, TRANSB specifies the form of op( sub( B ) ) to be * used in the matrix multiplication as follows: * * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ), * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )', * TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * op( sub( A ) ) and of the submatrix sub( C ). M must be at * least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * op( sub( B ) ) and the number of columns of the submatrix * sub( C ). N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns of the submatrix * op( sub( A ) ) and the number of rows of the submatrix * op( sub( B ) ). K must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Aroc, GemmTa, GemmTb, TrA, TrB, * one, * talpha, * tbeta, top, * zero; int Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR, AinbD, AinbR, Ainb1D, Ainb1R, AisR, Akk, Ald, AmyprocD, AmyprocR, AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff, ArocD, ArocR, AsrcR, Asrc_, Bcol, Bii, Bimb1, Binb1, Bjj, Bld, Bm, Bmb, Bmp, Bn, Bnb, Bnq, Brow, Cbufld, Ccol, Ccurrow, Cfr, Cfwd, Cii, Cimb, Cimb1, Cinb, Cinb1, CisR, Cjj, Ckk, Cld, Cmb, Cmp, Cnb, Cnq, Coff, Crow, Csrc, WAfr, WCfr, WCsum, ctxt, lcmb, m, maxp, maxpm1, maxq, mb, mbb, mycol, myrow, ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp; GEMM_T gemm; GSUM2D_T gsum2d; /* * .. Local Arrays .. */ PB_VM_T VM; int Bd0[DLEN_], DBUFA[DLEN_], DBUFC[DLEN_], WAd[DLEN_], WCd[DLEN_]; char * Abuf = NULL, * Bptr = NULL, * Cbuf = NULL, * WA = NULL, * WC = NULL; /* .. * .. Executable Statements .. * */ Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD ); Cfwd = ( Mupcase( DIRECC[0] ) == CFORWARD ); nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN ); notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d; mb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Compute local information for sub( A ), sub( B ) and sub( C ) */ if( nota ) { AiD = JA; AiR = IA; Asrc_ = RSRC_; Aroc = CROW; AinbR = DESCA[IMB_ ]; AinbD = DESCA[INB_]; AnbR = DESCA[MB_ ]; AnbD = DESCA[NB_ ]; AsrcR = DESCA[Asrc_]; Ald = DESCA[LLD_]; AmyprocD = mycol; AnprocsD = npcol; AmyprocR = myrow; AnprocsR = nprow; PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD, &AiiR, &AiiD, &ArocR, &ArocD ); } else { AiD = IA; AiR = JA; Asrc_ = CSRC_; Aroc = CCOLUMN; AinbD = DESCA[IMB_ ]; AinbR = DESCA[INB_]; AnbD = DESCA[MB_ ]; AnbR = DESCA[NB_ ]; AsrcR = DESCA[Asrc_]; Ald = DESCA[LLD_]; AmyprocD = myrow; AnprocsD = nprow; AmyprocR = mycol; AnprocsR = npcol; PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR, &AiiD, &AiiR, &ArocD, &ArocR ); } Ainb1D = PB_Cfirstnb( K, AiD, AinbD, AnbD ); AnpD = PB_Cnumroc( K, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD ); Ainb1R = PB_Cfirstnb( M, AiR, AinbR, AnbR ); Cimb = DESCC[IMB_ ]; Cinb = DESCC[INB_]; Cmb = DESCC[MB_ ]; Cnb = DESCC[NB_ ]; Csrc = DESCC[RSRC_]; Cld = DESCC[LLD_]; PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj, &Crow, &Ccol ); Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb ); Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb ); Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol ); /* * Retrieve the BLACS combine topology, compute conjugate of alpha for the * conjugate transpose case and set the transpose parameters to be passed to * the BLAS matrix multiply routine. */ if( notb ) { Bm = K; Bn = N; top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); talpha = ALPHA; GemmTa = ( nota ? CTRAN : TrA ); GemmTb = CNOTRAN; } else { Bm = N; Bn = K; top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); if( TrB == CCOTRAN ) { talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha ); GemmTb = ( ( TrA == CCOTRAN ) ? CTRAN : CCOTRAN ); } else { talpha = ALPHA; GemmTb = ( ( TrA == CCOTRAN ) ? CCOTRAN : CTRAN ); } GemmTa = CNOTRAN; } /* * Compute descriptor Bd0 for sub( B ) */ PB_Cdescribe( Bm, Bn, IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Bld, &Bimb1, &Binb1, &Bmb, &Bnb, &Brow, &Bcol, Bd0 ); Bmp = PB_Cnumroc( Bm, 0, Bimb1, Bmb, myrow, Brow, nprow ); Bnq = PB_Cnumroc( Bn, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp > 0 ) && ( Bnq > 0 ) ) Bptr = Mptr( B, Bii, Bjj, Bld, size ); /* * When sub( A ) is not replicated and backward pass on sub( A ), find the * virtual process q owning the last row or column of sub( A ). */ if( !( AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) ) ) && !Afwd ) { tmp = PB_Cindxg2p( M - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR ); q = MModSub( tmp, ArocR, AnprocsR ); } /* * When sub( C ) is not replicated and backward pass on sub( C ), find the * virtual process p owning the last row or column of sub( C ). */ if( !( CisR = ( ( Crow < 0 ) || ( nprow == 1 ) ) ) && !Cfwd ) { tmp = PB_Cindxg2p( M - 1, Cimb1, Cmb, Crow, Crow, nprow ); p = MModSub( tmp, Crow, nprow ); } /* * Loop over the virtual process grid induced by the rows or columns of * sub( A ) and sub( C ). */ lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : nprow ) ) * Cmb, ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR ); m = M; maxpm1 = maxp - 1; while( m > 0 ) { /* * Initialize local virtual matrix in process (p,q) */ AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) ); Akk = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR ); AnpR = PB_Cnumroc( M, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR ); Ccurrow = ( CisR ? -1 : MModAdd( Crow, p, nprow ) ); Ckk = PB_Cg2lrem( IC, Cimb, Cmb, Ccurrow, Csrc, nprow ); Cmp = PB_Cnumroc( M, 0, Cimb1, Cmb, Ccurrow, Crow, nprow ); PB_CVMinit( &VM, 0, Cmp, AnpR, Cimb1, Ainb1R, Cmb, AnbR, p, q, maxp, maxq, lcmb ); /* * Find how many diagonals in this virtual process */ npq = PB_CVMnpq( &VM ); m -= npq; /* * Re-adjust the number of rows or columns to be (un)packed, in order to * average the message sizes. */ if( npq ) mbb = npq / ( ( npq - 1 ) / mb + 1 ); while( npq ) { mbb = MIN( mbb, npq ); /* * Find out how many rows or columns of sub( A ) and sub( C ) are contiguous */ PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Aoff ); if( nota ) { /* * Compute the descriptor DBUFA for the buffer that will contained the packed * columns of sub( A ). */ if( ( Afr = ( ncpq < mbb ) ) != 0 ) { /* * If rows of sub( A ) are not contiguous, then allocate the buffer and * pack the mbb rows of sub( A ). */ Abufld = mbb; if( AisR || ( AmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * mbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, mbb, AnpD, one, Mptr( A, Akk, AiiD, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( B ) directly. */ Abufld = Ald; if( AisR || ( AmyprocR == AcurrocR ) ) Abuf = Mptr( A, Akk+Aoff, AiiD, Ald, size ); } PB_Cdescset( DBUFA, mbb, K, mbb, Ainb1D, mbb, AnbD, AcurrocR, ArocD, ctxt, Abufld ); } else { /* * Compute the descriptor DBUFA for the buffer that will contained the packed * columns of sub( A ). */ if( ( Afr = ( ncpq < mbb ) ) != 0 ) { /* * If columns of sub( A ) are not contiguous, then allocate the buffer and pack * the mbb columns of sub( A ). */ Abufld = MAX( 1, AnpD ); if( AisR || ( AmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * mbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, mbb, AnpD, one, Mptr( A, AiiD, Akk, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( A ) directly. */ Abufld = Ald; if( AisR || ( AmyprocR == AcurrocR ) ) Abuf = Mptr( A, AiiD, Akk+Aoff, Ald, size ); } PB_Cdescset( DBUFA, K, mbb, Ainb1D, mbb, AnbD, mbb, ArocD, AcurrocR, ctxt, Abufld ); } if( notb ) { /* * Replicate this panel of rows or columns of sub( A ) over sub( B ) -> WA */ PB_CInV( TYPE, NOCONJG, COLUMN, Bm, Bn, Bd0, mbb, Abuf, 0, 0, DBUFA, &Aroc, &WA, WAd, &WAfr ); /* * Allocate space for temporary results in scope of sub( B ) -> WC */ PB_COutV( TYPE, ROW, INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr, &WCsum ); /* * Local matrix-matrix multiply iff I own some data */ if( Bmp > 0 && Bnq > 0 ) gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &mbb, &Bnq, &Bmp, talpha, WA, &WAd[LLD_], Bptr, &Bld, zero, WC, &WCd[LLD_] ); if( WAfr ) free( WA ); if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) ) if( Abuf ) free( Abuf ); /* * Accumulate the intermediate results in WC */ if( WCsum ) { WCd[RSRC_] = Ccurrow; if( Bnq > 0 ) gsum2d( ctxt, COLUMN, &top, mbb, Bnq, WC, WCd[LLD_], WCd[RSRC_], mycol ); } /* * Compute the descriptor DBUFC for the buffer that will contained the packed * rows of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < mbb ) ) != 0 ) { /* * If rows of sub( C ) are not contiguous, then allocate the buffer */ Cbufld = mbb; tbeta = zero; if( CisR || ( myrow == Ccurrow ) ) Cbuf = PB_Cmalloc( Cnq * mbb * size ); } else { /* * Otherwise re-use sub( C ) */ Cbufld = Cld; tbeta = BETA; if( CisR || ( myrow == Ccurrow ) ) Cbuf = Mptr( C, Ckk+Coff, Cjj, Cld, size ); } PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol, ctxt, Cbufld ); /* * Cbuf := Cbuf + WC */ PB_Cpaxpby( TYPE, NOCONJG, mbb, N, one, WC, 0, 0, WCd, ROW, tbeta, Cbuf, 0, 0, DBUFC, ROW ); /* * Unpack the mbb rows of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( myrow == Ccurrow ) ) ) { PB_CVMpack( TYPE, &VM, ROW, ROW, UNPACKING, NOTRAN, mbb, Cnq, BETA, Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } if( WCfr ) free( WC ); } else { /* * Replicate this panel of rows or columns of sub( A ) over sub( B ) -> WA */ PB_CInV( TYPE, NOCONJG, ROW, Bm, Bn, Bd0, mbb, Abuf, 0, 0, DBUFA, &Aroc, &WA, WAd, &WAfr ); /* * Allocate space for temporary results in scope of sub( A ) -> WC */ PB_COutV( TYPE, COLUMN, INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr, &WCsum ); /* * Local matrix-matrix multiply iff I own some data */ if( Bmp > 0 && Bnq > 0 ) gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &Bmp, &mbb, &Bnq, talpha, Bptr, &Bld, WA, &WAd[LLD_], zero, WC, &WCd[LLD_] ); if( WAfr ) free( WA ); if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) ) if( Abuf ) free( Abuf ); /* * Accumulate the intermediate results in WC */ if( WCsum ) { WCd[CSRC_] = 0; if( Bmp > 0 ) gsum2d( ctxt, ROW, &top, Bmp, mbb, WC, WCd[LLD_], myrow, WCd[CSRC_] ); } /* * Compute the descriptor DBUFC for the buffer that will contained the packed * rows of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < mbb ) ) != 0 ) { /* * If rows of sub( C ) are not contiguous, then allocate the buffer */ Cbufld = mbb; tbeta = zero; if( CisR || ( myrow == Ccurrow ) ) Cbuf = PB_Cmalloc( Cnq * mbb * size ); } else { /* * Otherwise re-use sub( C ) */ Cbufld = Cld; tbeta = BETA; if( CisR || ( myrow == Ccurrow ) ) Cbuf = Mptr( C, Ckk+Coff, Cjj, Cld, size ); } PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol, ctxt, Cbufld ); /* * Cbuf := Cbuf + WC' */ PB_Cpaxpby( TYPE, ( TrB == CCOTRAN ? CONJG : NOCONJG ), N, mbb, one, WC, 0, 0, WCd, COLUMN, tbeta, Cbuf, 0, 0, DBUFC, ROW ); /* * Unpack the mbb rows of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( myrow == Ccurrow ) ) ) { PB_CVMpack( TYPE, &VM, ROW, ROW, UNPACKING, NOTRAN, mbb, Cnq, BETA, Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } if( WCfr ) free( WC ); } /* * Update the local indexes of sub( B ) and sub( C ) */ PB_CVMupdate( &VM, mbb, &Ckk, &Akk ); npq -= mbb; } /* * Go to next or previous virtual process row or column */ if( ( Cfwd && ( p == maxpm1 ) ) || ( !( Cfwd ) && ( p == 0 ) ) ) q = ( Afwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) ); p = ( Cfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) ); } if( TrB == CCOTRAN ) free( talpha ); /* * End of PB_CpgemmAC */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpgemmBC.c000644 000766 000024 00000062665 10363532303 020400 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpgemmBC( PBTYP_T * TYPE, char * DIRECB, char * DIRECC, char * TRANSA, char * TRANSB, int M, int N, int K, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_CpgemmBC( TYPE, DIRECB, DIRECC, TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * DIRECB, * DIRECC, * TRANSA, * TRANSB; int IA, IB, IC, JA, JB, JC, K, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; char * A, * B, * C; #endif { /* * Purpose * ======= * * PB_CpgemmBC performs one of the matrix-matrix operations * * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N', * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T', * conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C', * * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N', * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T', * conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'. * * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) ) * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and * sub( C ) is an m by n submatrix. * * This is the inner-product algorithm using the logical LCM algorithmic * blocking technique. The submatrix operand sub( A ) stays in place. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECB (global input) pointer to CHAR * On entry, DIRECB specifies the direction in which the rows * or columns of sub( B ) should be looped over as follows: * DIRECB = 'F' or 'f' forward or increasing, * DIRECB = 'B' or 'b' backward or decreasing. * * DIRECC (global input) pointer to CHAR * On entry, DIRECC specifies the direction in which the rows * or columns of sub( C ) should be looped over as follows: * DIRECC = 'F' or 'f' forward or increasing, * DIRECC = 'B' or 'b' backward or decreasing. * * TRANSA (global input) pointer to CHAR * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * TRANSB (global input) pointer to CHAR * On entry, TRANSB specifies the form of op( sub( B ) ) to be * used in the matrix multiplication as follows: * * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ), * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )', * TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * op( sub( A ) ) and of the submatrix sub( C ). M must be at * least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * op( sub( B ) ) and the number of columns of the submatrix * sub( C ). N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the number of columns of the submatrix * op( sub( A ) ) and the number of rows of the submatrix * op( sub( B ) ). K must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Broc, GemmTa, GemmTb, TrA, TrB, * one, * talpha, * tbeta, top, * zero; int Acol, Aii, Aimb1, Ainb1, Ajj, Ald, Am, Amb, Amp, An, Anb, Anq, Arow, Bbufld, BcurrocR, Bfr, Bfwd, BiD, BiR, BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR, Bkk, Bld, BmyprocD, BmyprocR, BnbD, BnbR, BnpD, BnpR, BnprocsD, BnprocsR, Boff, BrocD, BrocR, BsrcR, Bsrc_, Cbufld, Ccol, Ccurcol, Cfr, Cfwd, Cii, Cimb, Cimb1, Cinb, Cinb1, CisR, Cjj, Ckk, Cld, Cmb, Cmp, Cnb, Cnq, Coff, Crow, Csrc, WBfr, WCfr, WCsum, ctxt, lcmb, maxp, maxpm1, maxq, mycol, myrow, n, nb, nbb, ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp; GEMM_T gemm; GSUM2D_T gsum2d; /* * .. Local Arrays .. */ int Ad0[DLEN_], DBUFB[DLEN_], DBUFC[DLEN_], WBd[DLEN_], WCd[DLEN_]; PB_VM_T VM; char * Aptr = NULL, * Bbuf = NULL, * Cbuf = NULL, * WB = NULL, * WC = NULL; /* .. * .. Executable Statements .. * */ Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD ); Cfwd = ( Mupcase( DIRECC[0] ) == CFORWARD ); nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN ); notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d; nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Compute local information for sub( A ), sub( B ) and sub( C ) */ if( notb ) { BiD = IB; BiR = JB; Bsrc_ = CSRC_; Broc = CCOLUMN; BinbD = DESCB[IMB_ ]; BinbR = DESCB[INB_]; BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ]; BsrcR = DESCB[Bsrc_]; Bld = DESCB[LLD_]; BmyprocD = myrow; BnprocsD = nprow; BmyprocR = mycol; BnprocsR = npcol; PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR, &BiiD, &BiiR, &BrocD, &BrocR ); } else { BiD = JB; BiR = IB; Bsrc_ = RSRC_; Broc = CROW; BinbR = DESCB[IMB_ ]; BinbD = DESCB[INB_]; BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ]; BsrcR = DESCB[Bsrc_]; Bld = DESCB[LLD_]; BmyprocD = mycol; BnprocsD = npcol; BmyprocR = myrow; BnprocsR = nprow; PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD, &BiiR, &BiiD, &BrocR, &BrocD ); } Binb1D = PB_Cfirstnb( K, BiD, BinbD, BnbD ); BnpD = PB_Cnumroc( K, 0, Binb1D, BnbD, BmyprocD, BrocD, BnprocsD ); Binb1R = PB_Cfirstnb( N, BiR, BinbR, BnbR ); Cimb = DESCC[IMB_ ]; Cinb = DESCC[INB_]; Cmb = DESCC[MB_ ]; Cnb = DESCC[NB_ ]; Csrc = DESCC[CSRC_]; Cld = DESCC[LLD_]; PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj, &Crow, &Ccol ); Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb ); Cmp = PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow ); Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb ); /* * Retrieve the BLACS combine topology, compute conjugate of alpha for the * conjugate transpose case and set the transpose parameters to be passed to * the BLAS matrix multiply routine. */ if( nota ) { Am = M; An = K; top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); talpha = ALPHA; GemmTa = CNOTRAN; GemmTb = ( notb ? CTRAN : TrB ); } else { Am = K; An = M; top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( TrA == CCOTRAN ) { talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha ); GemmTa = ( ( TrB == CCOTRAN ) ? CTRAN : CCOTRAN ); } else { talpha = ALPHA; GemmTa = ( ( TrB == CCOTRAN ) ? CCOTRAN : CTRAN ); } GemmTb = CNOTRAN; } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( Am, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Amp = PB_Cnumroc( Am, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( A, Aii, Ajj, Ald, size ); } /* * When sub( B ) is not replicated and backward pass on sub( B ), find the * virtual process q owning the last row or column of sub( B ). */ if( !( BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) ) ) && !Bfwd ) { tmp = PB_Cindxg2p( N - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR ); q = MModSub( tmp, BrocR, BnprocsR ); } /* * When sub( C ) is not replicated and backward pass on sub( C ), find the * virtual process p owning the last row or column of sub( C ). */ if( !( CisR = ( ( Ccol < 0 ) || ( npcol == 1 ) ) ) && !Cfwd ) { tmp = PB_Cindxg2p( N - 1, Cinb1, Cnb, Ccol, Ccol, npcol ); p = MModSub( tmp, Ccol, npcol ); } /* * Loop over the virtual process grid induced by the rows or columns of * sub( B ) and sub( C ). */ lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : npcol ) ) * Cnb, ( maxq = ( BisR ? 1 : BnprocsR ) ) * BnbR ); n = N; maxpm1 = maxp - 1; while( n > 0 ) { /* * Initialize local virtual matrix in process (p,q) */ BcurrocR = ( BisR ? -1 : MModAdd( BrocR, q, BnprocsR ) ); Bkk = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BnprocsR ); BnpR = PB_Cnumroc( N, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR ); Ccurcol = ( CisR ? -1 : MModAdd( Ccol, p, npcol ) ); Ckk = PB_Cg2lrem( JC, Cinb, Cnb, Ccurcol, Csrc, npcol ); Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, Ccurcol, Ccol, npcol ); PB_CVMinit( &VM, 0, Cnq, BnpR, Cinb1, Binb1R, Cnb, BnbR, p, q, maxp, maxq, lcmb ); /* * Find how many diagonals in this virtual process */ npq = PB_CVMnpq( &VM ); n -= npq; /* * Re-adjust the number of rows or columns to be (un)packed, in order to * average the message sizes. */ if( npq ) nbb = npq / ( ( npq - 1 ) / nb + 1 ); while( npq ) { nbb = MIN( nbb, npq ); /* * Find out how many rows or columns of sub( B ) and sub( C ) are contiguous */ PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Boff ); if( notb ) { /* * Compute the descriptor DBUFB for the buffer that will contained the packed * columns of sub( B ). */ if( ( Bfr = ( ncpq < nbb ) ) != 0 ) { /* * If columns of sub( B ) are not contiguous, then allocate the buffer and * pack the nbb columns of sub( B ). */ Bbufld = MAX( 1, BnpD ); if( BisR || ( BmyprocR == BcurrocR ) ) { Bbuf = PB_Cmalloc( BnpD * nbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, &Broc, PACKING, NOTRAN, nbb, BnpD, one, Mptr( B, BiiD, Bkk, Bld, size ), Bld, zero, Bbuf, Bbufld ); } } else { /* * Otherwise, re-use sub( B ) directly. */ Bbufld = Bld; if( BisR || ( BmyprocR == BcurrocR ) ) Bbuf = Mptr( B, BiiD, Bkk+Boff, Bld, size ); } PB_Cdescset( DBUFB, K, nbb, Binb1D, nbb, BnbD, nbb, BrocD, BcurrocR, ctxt, Bbufld ); } else { /* * Compute the descriptor DBUFB for the buffer that will contained the packed * rows of sub( B ). */ if( ( Bfr = ( ncpq < nbb ) ) != 0 ) { /* * If rows of sub( B ) are not contiguous, then allocate the buffer and pack * the nbb rows of sub( B ). */ Bbufld = nbb; if( BisR || ( BmyprocR == BcurrocR ) ) { Bbuf = PB_Cmalloc( BnpD * nbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, &Broc, PACKING, NOTRAN, nbb, BnpD, one, Mptr( B, Bkk, BiiD, Bld, size ), Bld, zero, Bbuf, Bbufld ); } } else { /* * Otherwise, re-use sub( B ) directly. */ Bbufld = Bld; if( BisR || ( BmyprocR == BcurrocR ) ) Bbuf = Mptr( B, Bkk+Boff, BiiD, Bld, size ); } PB_Cdescset( DBUFB, nbb, K, nbb, Binb1D, nbb, BnbD, BcurrocR, BrocD, ctxt, Bbufld ); } if( nota ) { /* * Replicate this panel of rows or columns of sub( B ) over sub( A ) -> WB */ PB_CInV( TYPE, NOCONJG, ROW, Am, An, Ad0, nbb, Bbuf, 0, 0, DBUFB, &Broc, &WB, WBd, &WBfr ); /* * Allocate space for temporary results in scope of sub( A ) -> WC */ PB_COutV( TYPE, COLUMN, INIT, Am, An, Ad0, nbb, &WC, WCd, &WCfr, &WCsum ); /* * Local matrix-matrix multiply iff I own some data */ if( Amp > 0 && Anq > 0 ) gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &Amp, &nbb, &Anq, talpha, Aptr, &Ald, WB, &WBd[LLD_], zero, WC, &WCd[LLD_] ); if( WBfr ) free( WB ); if( Bfr && ( BisR || ( BmyprocR == BcurrocR ) ) ) if( Bbuf ) free( Bbuf ); /* * Accumulate the intermediate results in WC */ if( WCsum ) { WCd[CSRC_] = Ccurcol; if( Amp > 0 ) gsum2d( ctxt, ROW, &top, Amp, nbb, WC, WCd[LLD_], myrow, WCd[CSRC_] ); } /* * Compute the descriptor DBUFC for the buffer that will contained the packed * columns of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < nbb ) ) != 0 ) { /* * If columns of sub( C ) are not contiguous, then allocate the buffer */ Cbufld = MAX( 1, Cmp ); tbeta = zero; if( CisR || ( mycol == Ccurcol ) ) Cbuf = PB_Cmalloc( Cmp * nbb * size ); } else { /* * Otherwise re-use sub( C ) */ Cbufld = Cld; tbeta = BETA; if( CisR || ( mycol == Ccurcol ) ) Cbuf = Mptr( C, Cii, Ckk+Coff, Cld, size ); } PB_Cdescset( DBUFC, M, nbb, Cimb1, nbb, Cmb, nbb, Crow, Ccurcol, ctxt, Cbufld ); /* * Cbuf := Cbuf + WC */ PB_Cpaxpby( TYPE, NOCONJG, M, nbb, one, WC, 0, 0, WCd, COLUMN, tbeta, Cbuf, 0, 0, DBUFC, COLUMN ); /* * Unpack the nbb columns of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( mycol == Ccurcol ) ) ) { PB_CVMpack( TYPE, &VM, ROW, COLUMN, UNPACKING, NOTRAN, nbb, Cmp, BETA, Mptr( C, Cii, Ckk, Cld, size ), Cld, one, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } if( WCfr ) free( WC ); } else { /* * Replicate this panel of rows or columns of sub( B ) over sub( A ) -> WB */ PB_CInV( TYPE, NOCONJG, COLUMN, Am, An, Ad0, nbb, Bbuf, 0, 0, DBUFB, &Broc, &WB, WBd, &WBfr ); /* * Allocate space for temporary results in scope of sub( A ) -> WC */ PB_COutV( TYPE, ROW, INIT, Am, An, Ad0, nbb, &WC, WCd, &WCfr, &WCsum ); /* * Local matrix-matrix multiply iff I own some data */ if( Amp > 0 && Anq > 0 ) gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &nbb, &Anq, &Amp, talpha, WB, &WBd[LLD_], Aptr, &Ald, zero, WC, &WCd[LLD_] ); if( WBfr ) free( WB ); if( Bfr && ( BisR || ( BmyprocR == BcurrocR ) ) ) if( Bbuf ) free( Bbuf ); /* * Accumulate the intermediate results in WC */ if( WCsum ) { WCd[RSRC_] = 0; if( Anq > 0 ) gsum2d( ctxt, COLUMN, &top, nbb, Anq, WC, WCd[LLD_], WCd[RSRC_], mycol ); } /* * Compute the descriptor DBUFC for the buffer that will contained the packed * columns of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < nbb ) ) != 0 ) { /* * If columns of sub( C ) are not contiguous, then allocate the buffer */ Cbufld = MAX( 1, Cmp ); tbeta = zero; if( CisR || ( mycol == Ccurcol ) ) Cbuf = PB_Cmalloc( Cmp * nbb * size ); } else { /* * Otherwise re-use sub( C ) */ Cbufld = Cld; tbeta = BETA; if( CisR || ( mycol == Ccurcol ) ) Cbuf = Mptr( C, Cii, Ckk+Coff, Cld, size ); } PB_Cdescset( DBUFC, M, nbb, Cimb1, nbb, Cmb, nbb, Crow, Ccurcol, ctxt, Cbufld ); /* * Cbuf := Cbuf + WC' */ PB_Cpaxpby( TYPE, ( TrA == CCOTRAN ? CONJG : NOCONJG ), nbb, M, one, WC, 0, 0, WCd, ROW, tbeta, Cbuf, 0, 0, DBUFC, COLUMN ); /* * Unpack the nbb columns of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( mycol == Ccurcol ) ) ) { PB_CVMpack( TYPE, &VM, ROW, COLUMN, UNPACKING, NOTRAN, nbb, Cmp, BETA, Mptr( C, Cii, Ckk, Cld, size ), Cld, one, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } if( WCfr ) free( WC ); } /* * Update the local indexes of sub( B ) and sub( C ) */ PB_CVMupdate( &VM, nbb, &Ckk, &Bkk ); npq -= nbb; } /* * Go to next or previous virtual process row or column */ if( ( Cfwd && ( p == maxpm1 ) ) || ( !( Cfwd ) && ( p == 0 ) ) ) q = ( Bfwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) ); p = ( Cfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) ); } if( TrA == CCOTRAN ) free( talpha ); /* * End of PB_CpgemmBC */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cplacnjg.c000644 000766 000024 00000016737 10363532303 020503 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cplacnjg( PBTYP_T * TYPE, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA ) #else void PB_Cplacnjg( TYPE, M, N, ALPHA, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ int IA, JA, M, N; char * ALPHA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA; char * A; #endif { /* * Purpose * ======= * * PB_Cplacnjg conjugates and scales by alpha and an m by n submatrix * sub( A ) denoting A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha, i.e., the cons- * tant with which the matrix elements are to be scaled. * * A (local input/local output) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A to be scaled. On exit, the * local entries of this array corresponding to the to the en- * tries of the submatrix sub( A ) are overwritten by the local * entries of the m by n conjugated and scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Arow, Aii, Aimb1, Ainb1, Ajj, Ald, Amb, Amp, Anb, Anq, izero=0, mycol, myrow, npcol, nprow; /* * .. Local Arrays .. */ int Ad0[DLEN_]; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( DESCA[CTXT_], &nprow, &npcol, &myrow, &mycol ); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( M, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Quick return if I don't own any of sub( A ). */ Amp = PB_Cnumroc( M, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return; /* * Conjugate and scale local data */ TYPE->Ftzcnjg( C2F_CHAR( ALL ), &Amp, &Anq, &izero, ALPHA, Mptr( A, Aii, Ajj, Ald, TYPE->size ), &Ald ); /* * End of PB_Cplacnjg */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cplapad.c000644 000766 000024 00000035001 10363532303 020307 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cplapad( PBTYP_T * TYPE, char * UPLO, char * CONJUG, int M, int N, char * ALPHA, char * BETA, char * A, int IA, int JA, int * DESCA ) #else void PB_Cplapad( TYPE, UPLO, CONJUG, M, N, ALPHA, BETA, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ char * CONJUG, * UPLO; int IA, JA, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA; char * A; #endif { /* * Purpose * ======= * * PB_Cplapad initializes an m by n submatrix A(IA:IA+M-1,JA:JA+N-1) * denoted by sub( A ) to beta on the diagonal or zeros the imaginary * part of those diagonals and set the offdiagonals to alpha. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies the part of the submatrix sub( A ) * to be set: * = 'L' or 'l': Lower triangular part is set; the strictly * upper triangular part of sub( A ) is not changed; * = 'U' or 'u': Upper triangular part is set; the strictly * lower triangular part of sub( A ) is not changed; * Otherwise: All of the matrix sub( A ) is set. * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies what should be done to the diago- * nals as follows. When UPLO is 'L', 'l', 'U' or 'u' and CONJUG * is 'Z' or 'z', the imaginary part of the diagonals is set to * zero. Otherwise, the diagonals are set to beta. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha, i.e., the cons- * tant to which the offdiagonal elements are to be set. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta, i.e., the constant * to which the diagonal elements are to be set. BETA is not re- * ferenced when UPLO is 'L', 'l', 'U' or 'u' and CONJUG is 'Z'. * * A (local input/local output) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A to be set. On exit, the * leading m by n submatrix sub( A ) is set as follows: * * UPLO = 'L' or 'l', A(IA+i-1,JA+j-1)=ALPHA, j+1<=i<=M, 1<=j<=N * UPLO = 'U' or 'u', A(IA+i-1,JA+j-1)=ALPHA, 1<=i<=j-1, 1<=j<=N * otherwise, A(IA+i-1,JA+j-1)=ALPHA, 1<=i<=M, 1<=j<=N * and IA+i.NE.JA+j, * and, for all UPLO, A(IA+i-1,JA+i-1) = BETA, 1<=i<=min(M,N). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char type; int Acol, Aii, Aimb1, Ainb1, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, ctxt, izero=0, k, kb, ktmp, mn, mycol, myrow, nb, npcol, nprow, size; TZPAD_T pad; /* * .. Local Arrays .. */ int Ad0[DLEN_]; char * Aptr = NULL; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( M, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Quick return if I don't own any of sub( A ). */ Amp = PB_Cnumroc( M, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return; size = TYPE->size; type = TYPE->type; pad = TYPE->Ftzpad; Aptr = Mptr( A, Aii, Ajj, Ald, size ); /* * When the entire sub( A ) needs to be padded and alpha is equal to beta, or * sub( A ) is replicated in all processes, just call the local routine. */ if( type == SREAL ) { if( ( ( Mupcase( UPLO[0] ) == CALL ) && ( ((float*)(ALPHA))[REAL_PART] == ((float*)(BETA ))[REAL_PART] ) ) || ( ( ( Arow < 0 ) || ( nprow == 1 ) ) && ( ( Acol < 0 ) || ( npcol == 1 ) ) ) ) { pad( C2F_CHAR( UPLO ), C2F_CHAR( CONJUG ), &Amp, &Anq, &izero, ALPHA, BETA, Aptr, &Ald ); return; } } else if( type == DREAL ) { if( ( ( Mupcase( UPLO[0] ) == CALL ) && ( ((double*)(ALPHA))[REAL_PART] == ((double*)(BETA ))[REAL_PART] ) ) || ( ( ( Arow < 0 ) || ( nprow == 1 ) ) && ( ( Acol < 0 ) || ( npcol == 1 ) ) ) ) { pad( C2F_CHAR( UPLO ), C2F_CHAR( CONJUG ), &Amp, &Anq, &izero, ALPHA, BETA, Aptr, &Ald ); return; } } else if( type == SCPLX ) { if( ( ( Mupcase( UPLO[0] ) == CALL ) && ( ((float*)(ALPHA))[REAL_PART] == ((float*)(BETA ))[REAL_PART] ) && ( ((float*)(ALPHA))[IMAG_PART] == ((float*)(BETA ))[IMAG_PART] ) ) || ( ( ( Arow < 0 ) || ( nprow == 1 ) ) && ( ( Acol < 0 ) || ( npcol == 1 ) ) ) ) { pad( C2F_CHAR( UPLO ), C2F_CHAR( CONJUG ), &Amp, &Anq, &izero, ALPHA, BETA, Aptr, &Ald ); return; } } else if( type == DCPLX ) { if( ( ( Mupcase( UPLO[0] ) == CALL ) && ( ((double*)(ALPHA))[REAL_PART] == ((double*)(BETA ))[REAL_PART] ) && ( ((double*)(ALPHA))[IMAG_PART] == ((double*)(BETA ))[IMAG_PART] ) ) || ( ( ( Arow < 0 ) || ( nprow == 1 ) ) && ( ( Acol < 0 ) || ( npcol == 1 ) ) ) ) { pad( C2F_CHAR( UPLO ), C2F_CHAR( CONJUG ), &Amp, &Anq, &izero, ALPHA, BETA, Aptr, &Ald ); return; } } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and two times the least common multiple of nprow * and npcol. */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); mn = MIN( M, N ); if( Mupcase( UPLO[0] ) == CLOWER ) { /* * Lower triangle of sub( A ): proceed by block of columns. For each block of * columns, operate on the logical diagonal block first and then the remaining * rows of that block of columns. */ for( k = 0; k < mn; k += nb ) { kb = mn - k; ktmp = k + ( kb = MIN( kb, nb ) ); PB_Cplapd2( TYPE, UPLO, CONJUG, kb, kb, ALPHA, BETA, Aptr, k, k, Ad0 ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp0 = Amp - Akp ) > 0 ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp0, &Anq0, &izero, ALPHA, ALPHA, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } else if( Mupcase( UPLO[0] ) == CUPPER ) { /* * Upper triangle of sub( A ): proceed by block of columns. For each block of * columns, operate on the trailing rows and then the logical diagonal block * of that block of columns. When M < N, the last columns of sub( A ) are * handled together. */ for( k = 0; k < mn; k += nb ) { kb = mn - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Akp, &Anq0, &izero, ALPHA, ALPHA, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); PB_Cplapd2( TYPE, UPLO, CONJUG, kb, kb, ALPHA, BETA, Aptr, k, k, Ad0 ); } if( ( Anq -= ( Akq += Anq0 ) ) > 0 ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &Anq, &izero, ALPHA, ALPHA, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); } else { /* * All of sub( A ): proceed by block of columns. For each block of columns, * operate on the trailing rows, then the logical diagonal block, and finally * the remaining rows of that block of columns. When M < N, the last columns * of sub( A ) are handled together. */ for( k = 0; k < mn; k += nb ) { kb = mn - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Akp, &Anq0, &izero, ALPHA, ALPHA, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); PB_Cplapd2( TYPE, UPLO, NOCONJG, kb, kb, ALPHA, BETA, Aptr, k, k, Ad0 ); Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow ); if( ( Amp0 = Amp - Akp ) > 0 ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp0, &Anq0, &izero, ALPHA, ALPHA, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } if( ( Anq -= ( Akq += Anq0 ) ) > 0 ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Amp, &Anq, &izero, ALPHA, ALPHA, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); } /* * End of PB_Cplapad */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cplapd2.c000644 000766 000024 00000030131 10363532303 020227 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cplapd2( PBTYP_T * TYPE, char * UPLO, char * CONJUG, int M, int N, char * ALPHA, char * BETA, char * A, int IA, int JA, int * DESCA ) #else void PB_Cplapd2( TYPE, UPLO, CONJUG, M, N, ALPHA, BETA, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ char * CONJUG, * UPLO; int IA, JA, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA; char * A; #endif { /* * .. Local Scalars .. */ char UploA, herm; int Acol, Aii, Aimb1, Ainb1, Aoffi, Ajj, Ald, Amb, Amp, Anb, Anq, Aoffj, Arcol, Arow, Arrow, GoEast, GoSouth, iimax, ilow, imbloc, inbloc, ioffd, iupp, izero=0, jjmax, joffd, lcmt, lcmt00, lmbloc, lnbloc, low, lower, m1, mbloc, mblkd, mblks, mycol, myrow, n1, nbloc, nblkd, nblks, npcol, nprow, pmb, qnb, size, tmp1, upp, upper; TZPAD_T pad; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( DESCA[CTXT_], &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cainfog2l( M, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aimb1, &Ainb1, &Amp, &Anq, &Aii, &Ajj, &Arow, &Acol, &Arrow, &Arcol ); /* * Quick return if I don't own any of sub( A ). */ if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return; /* * Initialize lcmt00, mblks, nblks, imbloc, inbloc, lmbloc, lnbloc, ilow, low, * iupp, and upp. */ Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Ald = DESCA[LLD_]; PB_Cbinfo( 0, Amp, Anq, Aimb1, Ainb1, Amb, Anb, Arrow, Arcol, &lcmt00, &mblks, &nblks, &imbloc, &inbloc, &lmbloc, &lnbloc, &ilow, &low, &iupp, &upp ); iimax = ( Aoffi = Aii - 1 ) + ( m1 = Amp ); jjmax = ( Aoffj = Ajj - 1 ) + ( n1 = Anq ); pmb = ( ( ( Arow < 0 ) || ( nprow == 1 ) ) ? Amb : nprow * Amb ); qnb = ( ( ( Acol < 0 ) || ( npcol == 1 ) ) ? Anb : npcol * Anb ); size = TYPE->size; pad = TYPE->Ftzpad; UploA = Mupcase( UPLO[0] ); herm = ( UploA == CALL ? CNOCONJG : Mupcase( CONJUG[0] ) ); upper = ( UploA != CLOWER ); lower = ( UploA != CUPPER ); /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); /* * Go through the table looking for blocks owning diagonal entries. */ if( ( !( GoSouth ) ) && ( !( GoEast ) ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ pad( C2F_CHAR( UPLO ), C2F_CHAR( &herm ), &imbloc, &inbloc, &lcmt00, ALPHA, BETA, Mptr( A, Aii, Ajj, Ald, size ), &Ald ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); if( GoSouth ) { /* * When the upper triangular part of sub( A ) should be set and one is * planning to go south in the table, it is neccessary to take care of the * remaining columns of these imbloc rows immediately. */ if( upper && ( Anq > inbloc ) ) { tmp1 = Anq - inbloc; pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &imbloc, &tmp1, &izero, ALPHA, ALPHA, Mptr( A, Aii, Ajj+inbloc, Ald, size ), &Ald ); } Aii += imbloc; m1 -= imbloc; } else { /* * When the lower triangular part of sub( A ) should be set and one is * planning to go east in the table, it is neccessary to take care of the * remaining rows of these inbloc columns immediately. */ if( lower && ( Amp > imbloc ) ) { tmp1 = Amp - imbloc; pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &tmp1, &inbloc, &izero, ALPHA, ALPHA, Mptr( A, Aii+imbloc, Ajj, Ald, size ), &Ald ); } Ajj += inbloc; n1 -= inbloc; } } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the local row index in A. */ lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row index in A. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; } /* * Set the upper triangular part of sub( A ) we just skipped when necessary. */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &tmp1, &n1, &izero, ALPHA, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), &Ald ); Aii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; pad( C2F_CHAR( UPLO ), C2F_CHAR( &herm ), &mbloc, &inbloc, &lcmt, ALPHA, BETA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), &Ald ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; ioffd += mbloc; } /* * Set the lower triangular part of sub( A ) when necessary. */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &tmp1, &inbloc, &izero, ALPHA, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), &Ald ); tmp1 = Aoffi - Aii + 1; m1 -= tmp1; n1 -= inbloc; lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; /* * When the upper triangular part of sub( A ) should be set, take care of the * n1 remaining columns of these tmp1 rows immediately. */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &tmp1, &n1, &izero, ALPHA, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), &Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as well as * the local column index in A. */ lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; /* * While there are blocks remaining that own lower entries, keep going east. * Adjust the current LCM value as well as the local column index in A. */ while( ( nblks > 0 ) && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; Aoffj += Anb; } /* * Set the lower triangular part of sub( A ) we just skipped when necessary. */ tmp1 = MIN( Aoffj, jjmax ) - Ajj + 1; if( lower && ( tmp1 > 0 ) ) { pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &m1, &tmp1, &izero, ALPHA, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald ); Ajj += tmp1; n1 -= tmp1; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return; /* * lcmt00 >= low. The current block owns either diagonals or upper entries. * Save the current position in the LCM table. After this row has been * completely taken care of, re-start from this column and the next row of * the LCM table. */ lcmt = lcmt00; nblkd = nblks; joffd = Aoffj; nbloc = Anb; while( ( nblkd > 0 ) && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ if( nblkd == 1 ) nbloc = lnbloc; pad( C2F_CHAR( UPLO ), C2F_CHAR( &herm ), &imbloc, &nbloc, &lcmt, ALPHA, BETA, Mptr( A, Aii, joffd+1, Ald, size ), &Ald ); lcmt00 = lcmt; lcmt += qnb; nblks = nblkd; nblkd--; Aoffj = joffd; joffd += nbloc; } /* * Set the upper triangular part of sub( A ) when necessary. */ tmp1 = n1 - joffd + Ajj - 1; if( upper && ( tmp1 > 0 ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &imbloc, &tmp1, &izero, ALPHA, ALPHA, Mptr( A, Aii, joffd+1, Ald, size ), &Ald ); tmp1 = Aoffj - Ajj + 1; m1 -= imbloc; n1 -= tmp1; lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; /* * When the lower triangular part of sub( A ) should be set, take care of the * m1 remaining rows of these tmp1 columns immediately. */ if( lower && ( m1 > 0 ) && ( tmp1 > 0 ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &m1, &tmp1, &izero, ALPHA, ALPHA, Mptr( A, Aoffi+1, Ajj, Ald, size ), &Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; } /* * Loop over the remaining columns of the LCM table. */ nbloc = Anb; while( nblks > 0 ) { if( nblks == 1 ) nbloc = lnbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row index in A. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; } /* * Set the upper triangular part of sub( A ) we just skipped when necessary. */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &tmp1, &n1, &izero, ALPHA, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), &Ald ); Aii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; pad( C2F_CHAR( UPLO ), C2F_CHAR( &herm ), &mbloc, &nbloc, &lcmt, ALPHA, BETA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), &Ald ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; ioffd += mbloc; } /* * Set the lower triangular part of sub( A ) when necessary. */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &tmp1, &nbloc, &izero, ALPHA, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), &Ald ); tmp1 = MIN( Aoffi, iimax ) - Aii + 1; m1 -= tmp1; n1 -= nbloc; lcmt00 += qnb; nblks--; Aoffj += nbloc; /* * When the upper triangular part of sub( A ) should be set, take care of the * n1 remaining columns of these tmp1 rows immediately. */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( &herm ), &tmp1, &n1, &izero, ALPHA, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), &Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; } /* * End of PB_Cplapd2 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cplaprnt.c000644 000766 000024 00000043144 10363532303 020535 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cplaprnt( PBTYP_T * TYPE, int M, int N, char * A, int IA, int JA, int * DESCA, int IRPRNT, int ICPRNT, char * CMATNM ) #else void PB_Cplaprnt( TYPE, M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, CMATNM ) /* * .. Scalar Arguments .. */ int IA, ICPRNT, IRPRNT, JA, M, N; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA; char * A, * CMATNM; #endif { /* * Purpose * ======= * * PB_Cplaprnt prints to the standard output the submatrix sub( A ) de- * noting A(IA:IA+M-1,JA:JA+N-1). The local pieces of sub( A ) are sent * and printed by the process of coordinates (IRPRNT, ICPRNT). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * IRPRNT (global input) INTEGER * On entry, IRPRNT specifies the row index of the printing pro- * cess. * * ICPRNT (global input) INTEGER * On entry, ICPRNT specifies the column index of the printing * process. * * CMATNM (global input) pointer to CHAR * On entry, CMATNM is the name of the matrix to be printed. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int mycol, myrow, npcol, nprow, pcol, prow; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( DESCA[CTXT_], &nprow, &npcol, &myrow, &mycol ); /* * When sub( A ) is replicated, each copy is printed for debugging purposes. */ if( DESCA[ RSRC_ ] >= 0 ) { /* * sub( A ) is distributed onto the process rows of the grid */ if( DESCA[ CSRC_ ] >= 0 ) { /* * sub( A ) is distributed onto the process columns of the grid */ PB_Cplaprn2( TYPE, M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, CMATNM, DESCA[ RSRC_ ], DESCA[ CSRC_ ] ); } else { /* * sub( A ) is replicated in every process column of the grid */ for( pcol = 0; pcol < npcol; pcol++ ) { if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) (void) fprintf( stdout, "Colum-replicated array -- copy in process column: %d\n", pcol ); PB_Cplaprn2( TYPE, M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, CMATNM, DESCA[ RSRC_ ], pcol ); } } } else { /* * sub( A ) is replicated in every process row of the grid */ if( DESCA[ CSRC_ ] >= 0 ) { /* * sub( A ) is distributed onto the process columns of the grid */ for( prow = 0; prow < nprow; prow++ ) { if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) (void) fprintf( stdout, "Row-replicated array -- copy in process row: %d\n", prow ); PB_Cplaprn2( TYPE, M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, CMATNM, prow, DESCA[ CSRC_ ] ); } } else { /* * sub( A ) is replicated in every process column of the grid */ for( prow = 0; prow < nprow; prow++ ) { for( pcol = 0; pcol < npcol; pcol++ ) { if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) (void) fprintf( stdout, "Replicated array -- copy in process (%d,%d)\n", prow, pcol ); PB_Cplaprn2( TYPE, M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, CMATNM, prow, pcol ); } } } } /* * End of PB_Cplaprnt */ } #ifdef __STDC__ void PB_Cplaprn2( PBTYP_T * TYPE, int M, int N, char * A, int IA, int JA, int * DESCA, int IRPRNT, int ICPRNT, char * CMATNM, int PROW, int PCOL ) #else void PB_Cplaprn2( TYPE, M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT, CMATNM, PROW, PCOL ) /* * .. Scalar Arguments .. */ int IA, ICPRNT, IRPRNT, JA, M, N, PCOL, PROW; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA; char * A, * CMATNM; #endif { /* * .. Local Scalars .. */ char type; int Acol, Aii, AisColRep, AisRowRep, Ajj, Ald, Arow, ctxt, h, i, ib, icurcol, icurrow, ii, in, j, jb, jj, jn, ldw, mycol, myrow, npcol, nprow, size, usiz; /* * .. Local Arrays .. */ char * buf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ Ald = DESCA[LLD_]; PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); /* * Save the local first index of each row and column sub( A ) */ ii = Aii; jj = Ajj; /* * When sub( A ) is row-replicated, print the copy in process row PROW. * Otherwise, print the distributed matrix rows starting in process row Arow. */ if( Arow < 0 ) { AisRowRep = 1; icurrow = Arow = PROW; } else { AisRowRep = 0; icurrow = Arow; } /* * When sub( A ) is column-replicated, print the copy in process column PCOL. * Otherwise, print the distributed matrix columns starting in process column * Acol. */ if( Acol < 0 ) { AisColRep = 1; icurcol = Acol = PCOL; } else { AisColRep = 0; icurcol = Acol; } type = TYPE->type; usiz = TYPE->usiz; size = TYPE->size; /* * Allocate buffer in printing process */ ldw = MAX( DESCA[ IMB_ ], DESCA[ MB_ ] ); if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) buf = PB_Cmalloc( ldw * size ); /* * Handle the first block of column separately */ jb = PB_Cfirstnb( N, JA, DESCA[INB_], DESCA[NB_] ); jn = JA + jb - 1; for( h = 0; h < jb; h++ ) { ib = PB_Cfirstnb( M, IA, DESCA[IMB_], DESCA[MB_] ); in = IA + ib - 1; if( ( icurrow == IRPRNT ) && ( icurcol == ICPRNT ) ) { if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) { PB_Cprnt( type, size, usiz, ib, Mptr( A, ii, jj+h, Ald, size ), IA+1, JA+h+1, CMATNM ); } } else { if( ( myrow == icurrow ) && ( mycol == icurcol ) ) { TYPE->Cgesd2d( ctxt, ib, 1, Mptr( A, ii, jj+h, Ald, size ), Ald, IRPRNT, ICPRNT ); } else if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) { TYPE->Cgerv2d( ctxt, ib, 1, buf, ldw, icurrow, icurcol ); PB_Cprnt( type, size, usiz, ib, buf, IA+1, JA+h+1, CMATNM ); } } /* * Go to next block of rows */ if( myrow == icurrow ) ii += ib; if( !( AisRowRep ) ) icurrow = MModAdd1( icurrow, nprow ); Cblacs_barrier( ctxt, ALL ); /* * Loop over remaining block of rows */ for( i = in+1; i <= IA+M-1; i += DESCA[MB_] ) { ib = MIN( DESCA[MB_], IA+M-i ); if( ( icurrow == IRPRNT ) && ( icurcol == ICPRNT ) ) { if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) { PB_Cprnt( type, size, usiz, ib, Mptr( A, ii, jj+h, Ald, size ), i+1, JA+h+1, CMATNM ); } } else { if( ( myrow == icurrow ) && ( mycol == icurcol ) ) { TYPE->Cgesd2d( ctxt, ib, 1, Mptr( A, ii, jj+h, Ald, size ), Ald, IRPRNT, ICPRNT ); } else if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) { TYPE->Cgerv2d( ctxt, ib, 1, buf, ldw, icurrow, icurcol ); PB_Cprnt( type, size, usiz, ib, buf, i+1, JA+h+1, CMATNM); } } /* * Go to next block of rows */ if( myrow == icurrow ) ii += ib; if( !( AisRowRep ) ) icurrow = MModAdd1( icurrow, nprow ); Cblacs_barrier( ctxt, ALL ); } /* * Restart at the first row to be printed */ ii = Aii; icurrow = Arow; } /* * Go to next block of columns */ if( mycol == icurcol ) jj += jb; if( !( AisColRep ) ) icurcol = MModAdd1( icurcol, npcol ); Cblacs_barrier( ctxt, ALL ); /* * Loop over remaining column blocks */ for( j = jn+1; j <= JA+N-1; j += DESCA[NB_] ) { jb = MIN( DESCA[NB_], JA+N-j ); for( h = 0; h < jb; h++ ) { ib = PB_Cfirstnb( M, IA, DESCA[IMB_], DESCA[MB_] ); in = IA + ib - 1; if( ( icurrow == IRPRNT ) && ( icurcol == ICPRNT ) ) { if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) { PB_Cprnt( type, size, usiz, ib, Mptr( A, ii, jj+h, Ald, size ), IA+1, j+h+1, CMATNM ); } } else { if( ( myrow == icurrow ) && ( mycol == icurcol ) ) { TYPE->Cgesd2d( ctxt, ib, 1, Mptr( A, ii, jj+h, Ald, size ), Ald, IRPRNT, ICPRNT ); } else if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) { TYPE->Cgerv2d( ctxt, ib, 1, buf, ldw, icurrow, icurcol ); PB_Cprnt( type, size, usiz, ib, buf, IA+1, j+h+1, CMATNM ); } } /* * Go to next block of rows */ if( myrow == icurrow ) ii += ib; if( !( AisRowRep ) ) icurrow = MModAdd1( icurrow, nprow ); Cblacs_barrier( ctxt, ALL ); /* * Loop over remaining block of rows */ for( i = in+1; i <= IA+M-1; i += DESCA[MB_] ) { ib = MIN( DESCA[MB_], IA+M-i ); if( ( icurrow == IRPRNT ) && ( icurcol == ICPRNT ) ) { if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) { PB_Cprnt( type, size, usiz, ib, Mptr( A, ii, jj+h, Ald, size ), i+1, j+h+1, CMATNM ); } } else { if( ( myrow == icurrow ) && ( mycol == icurcol ) ) { TYPE->Cgesd2d( ctxt, ib, 1, Mptr( A, ii, jj+h, Ald, size ), Ald, IRPRNT, ICPRNT ); } else if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) ) { TYPE->Cgerv2d( ctxt, ib, 1, buf, ldw, icurrow, icurcol ); PB_Cprnt( type, size, usiz, ib, buf, i+1, j+h+1, CMATNM ); } } /* * Go to next block of rows */ if( myrow == icurrow ) ii += ib; if( !( AisRowRep ) ) icurrow = MModAdd1( icurrow, nprow ); Cblacs_barrier( ctxt, ALL ); } /* * Restart at the first row to be printed */ ii = Aii; icurrow = Arow; } /* * Go to next block of columns */ if( mycol == icurcol ) jj += jb; if( !( AisColRep ) ) icurcol = MModAdd1( icurcol, npcol ); Cblacs_barrier( ctxt, ALL ); } if( ( myrow == IRPRNT ) && ( mycol == ICPRNT ) && ( buf ) ) free( buf ); /* * End of PB_Cplaprn2 */ } #ifdef __STDC__ void PB_Cprnt( char TYPE, int SIZE, int USIZ, int N, char * A, int IA, int JA, char * CMATNM ) #else void PB_Cprnt( TYPE, SIZE, USIZ, N, A, IA, JA, CMATNM ) /* * .. Scalar Arguments .. */ int IA, JA, N, SIZE, TYPE, USIZ; /* * .. Array Arguments .. */ char * A, * CMATNM; #endif { /* * .. Local Scalars .. */ int k; /* .. * .. Executable Statements .. * */ if( TYPE == INT ) for( k = 0; k < N; k++ ) (void) fprintf( stdout, "%s(%6d,%6d)=%8d\n", CMATNM, IA+k, JA, *((int *)(&A[k*SIZE])) ); else if( TYPE == SREAL ) for( k = 0; k < N; k++ ) (void) fprintf( stdout, "%s(%6d,%6d)=%16.8f\n", CMATNM, IA+k, JA, *((float *)(&A[k*SIZE])) ); else if( TYPE == DREAL ) for( k = 0; k < N; k++ ) (void) fprintf( stdout, "%s(%6d,%6d)=%30.18f\n", CMATNM, IA+k, JA, *((double *)(&A[k*SIZE])) ); else if( TYPE == SCPLX ) for( k = 0; k < N; k++ ) (void) fprintf( stdout, "%s(%6d,%6d)=%16.8f+i*(%16.8f)\n", CMATNM, IA+k, JA, *((float *)(&A[k*SIZE])), *((float *)(&A[k*SIZE+USIZ])) ); else if( TYPE == DCPLX ) for( k = 0; k < N; k++ ) (void) fprintf( stdout, "%s(%6d,%6d)=%30.18f+i*(%30.18f)\n", CMATNM, IA+k, JA, *((double *)(&A[k*SIZE])), *((double *)(&A[k*SIZE+USIZ])) ); /* * End of PB_Cprnt */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cplasca2.c000644 000766 000024 00000027423 10363532303 020404 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cplasca2( PBTYP_T * TYPE, char * UPLO, char * CONJUG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA ) #else void PB_Cplasca2( TYPE, UPLO, CONJUG, M, N, ALPHA, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ char * CONJUG, * UPLO; int IA, JA, M, N; char * ALPHA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA; char * A; #endif { /* * .. Local Scalars .. */ char UploA, herm; int Acol, Arow, Aii, iimax, ilow, imbloc, Aimb1, inbloc, Ainb1, Aoffi, GoEast, GoSouth, ioffd, iupp, izero=0, Ajj, jjmax, Aoffj, joffd, lcmt, lcmt00, Ald, lmbloc, lnbloc, low, lower, m1, Amb, mbloc, mblkd, mblks, Amp, Arcol, Arrow, mycol, myrow, n1, Anb, nbloc, nblkd, nblks, npcol, nprow, Anq, pmb, qnb, size, tmp1, upp, upper; TZSCAL_T scal; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( DESCA[CTXT_], &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cainfog2l( M, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aimb1, &Ainb1, &Amp, &Anq, &Aii, &Ajj, &Arow, &Acol, &Arrow, &Arcol ); /* * Quick return if I don't own any of sub( A ). */ if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return; /* * Initialize lcmt00, mblks, nblks, imbloc, inbloc, lmbloc, lnbloc, ilow, low, * iupp, and upp. */ Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Ald = DESCA[LLD_]; PB_Cbinfo( 0, Amp, Anq, Aimb1, Ainb1, Amb, Anb, Arrow, Arcol, &lcmt00, &mblks, &nblks, &imbloc, &inbloc, &lmbloc, &lnbloc, &ilow, &low, &iupp, &upp ); iimax = ( Aoffi = Aii - 1 ) + ( m1 = Amp ); jjmax = ( Aoffj = Ajj - 1 ) + ( n1 = Anq ); pmb = ( ( ( Arow < 0 ) || ( nprow == 1 ) ) ? Amb : nprow * Amb ); qnb = ( ( ( Acol < 0 ) || ( npcol == 1 ) ) ? Anb : npcol * Anb ); UploA = Mupcase( UPLO[0] ); upper = ( UploA != CLOWER ); lower = ( UploA != CUPPER ); herm = ( UploA == CALL ? CNOCONJG : Mupcase( CONJUG[0] ) ); size = TYPE->size; scal = ( herm == CCONJG ? TYPE->Fhescal : TYPE->Ftzscal ); /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); /* * Go through the table looking for blocks owning diagonal entries. */ if( ( !( GoSouth ) ) && ( !( GoEast ) ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ scal( C2F_CHAR( UPLO ), &imbloc, &inbloc, &lcmt00, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); if( GoSouth ) { /* * When the upper triangular part of sub( A ) should be scaled and one is * planning to go south in the table, it is neccessary to take care of the * remaining columns of these imbloc rows immediately. */ if( upper && ( Anq > inbloc ) ) { tmp1 = Anq - inbloc; scal( C2F_CHAR( ALL ), &imbloc, &tmp1, &izero, ALPHA, Mptr( A, Aii, Ajj+inbloc, Ald, size ), &Ald ); } Aii += imbloc; m1 -= imbloc; } else { /* * When the lower triangular part of sub( A ) should be scaled and one is * planning to go east in the table, it is neccessary to take care of the * remaining rows of these inbloc columns immediately. */ if( lower && ( Amp > imbloc ) ) { tmp1 = Amp - imbloc; scal( C2F_CHAR( ALL ), &tmp1, &inbloc, &izero, ALPHA, Mptr( A, Aii+imbloc, Ajj, Ald, size ), &Ald ); } Ajj += inbloc; n1 -= inbloc; } } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the local row index in A. */ lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row index in A. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; } /* * Scale the upper triangular part of sub( A ) we just skipped when necessary. */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { scal( C2F_CHAR( ALL ), &tmp1, &n1, &izero, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), &Ald ); Aii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; scal( C2F_CHAR( UPLO ), &mbloc, &inbloc, &lcmt, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), &Ald ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; ioffd += mbloc; } /* * Scale the lower triangular part of sub( A ) when necessary. */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) scal( C2F_CHAR( ALL ), &tmp1, &inbloc, &izero, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), &Ald ); tmp1 = Aoffi - Aii + 1; m1 -= tmp1; n1 -= inbloc; lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; /* * When the upper triangular part of sub( A ) should be scaled, take care of the * n1 remaining columns of these tmp1 rows immediately. */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) scal( C2F_CHAR( ALL ), &tmp1, &n1, &izero, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), &Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as well as * the local column index in A. */ lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; /* * While there are blocks remaining that own lower entries, keep going east. * Adjust the current LCM value as well as the local column index in A. */ while( ( nblks > 0 ) && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; Aoffj += Anb; } /* * Scale the lower triangular part of sub( A ) we just skipped when necessary. */ tmp1 = MIN( Aoffj, jjmax ) - Ajj + 1; if( lower && ( tmp1 > 0 ) ) { scal( C2F_CHAR( ALL ), &m1, &tmp1, &izero, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald ); Ajj += tmp1; n1 -= tmp1; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return; /* * lcmt00 >= low. The current block owns either diagonals or upper entries. * Save the current position in the LCM table. After this row has been * completely taken care of, re-start from this column and the next row of * the LCM table. */ lcmt = lcmt00; nblkd = nblks; joffd = Aoffj; nbloc = Anb; while( ( nblkd > 0 ) && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ if( nblkd == 1 ) nbloc = lnbloc; scal( C2F_CHAR( UPLO ), &imbloc, &nbloc, &lcmt, ALPHA, Mptr( A, Aii, joffd+1, Ald, size ), &Ald ); lcmt00 = lcmt; lcmt += qnb; nblks = nblkd; nblkd--; Aoffj = joffd; joffd += nbloc; } /* * Scale the upper triangular part of sub( A ) when necessary. */ tmp1 = n1 - joffd + Ajj - 1; if( upper && ( tmp1 > 0 ) ) scal( C2F_CHAR( ALL ), &imbloc, &tmp1, &izero, ALPHA, Mptr( A, Aii, joffd+1, Ald, size ), &Ald ); tmp1 = Aoffj - Ajj + 1; m1 -= imbloc; n1 -= tmp1; lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; /* * When the lower triangular part of sub( A ) should be scaled, take care of the * m1 remaining rows of these tmp1 columns immediately. */ if( lower && ( m1 > 0 ) && ( tmp1 > 0 ) ) scal( C2F_CHAR( ALL ), &m1, &tmp1, &izero, ALPHA, Mptr( A, Aoffi+1, Ajj, Ald, size ), &Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; } /* * Loop over the remaining columns of the LCM table. */ nbloc = Anb; while( nblks > 0 ) { if( nblks == 1 ) nbloc = lnbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row index in A. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; } /* * Scale the upper triangular part of sub( A ) we just skipped when necessary. */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { scal( C2F_CHAR( ALL ), &tmp1, &n1, &izero, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), &Ald ); Aii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; scal( C2F_CHAR( UPLO ), &mbloc, &nbloc, &lcmt, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), &Ald ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; ioffd += mbloc; } /* * Scale the lower triangular part of sub( A ) when necessary. */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) scal( C2F_CHAR( ALL ), &tmp1, &nbloc, &izero, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), &Ald ); tmp1 = MIN( Aoffi, iimax ) - Aii + 1; m1 -= tmp1; n1 -= nbloc; lcmt00 += qnb; nblks--; Aoffj += nbloc; /* * When the upper triangular part of sub( A ) should be scaled, take care of the * n1 remaining columns of these tmp1 rows immediately. */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) scal( C2F_CHAR( ALL ), &tmp1, &n1, &izero, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), &Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; } /* * End of PB_Cplasca2 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cplascal.c000644 000766 000024 00000034521 10363532303 020473 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cplascal( PBTYP_T * TYPE, char * UPLO, char * CONJUG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA ) #else void PB_Cplascal( TYPE, UPLO, CONJUG, M, N, ALPHA, A, IA, JA, DESCA ) /* * .. Scalar Arguments .. */ char * CONJUG, * UPLO; int IA, JA, M, N; char * ALPHA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA; char * A; #endif { /* * Purpose * ======= * * PB_Cplascal scales by alpha an m by n submatrix sub( A ) denoting * A(IA:IA+M-1,JA:JA+N-1). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies the part of the submatrix sub( A ) * to be scaled as follows: * = 'L' or 'l': Lower triangular part is scaled; the * strictly upper triangular part of sub( A ) is not changed; * = 'U' or 'u': Upper triangular part is scaled; the * strictly lower triangular part of sub( A ) is not changed; * Otherwise: All of the submatrix sub( A ) is scaled. * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies what kind of scaling should be * done as follows: when UPLO is 'L', 'l', 'U' or 'u' and CONJUG * is 'Z' or 'z', alpha is assumed to be real and the imaginary * part of the diagonals are set to zero. Otherwise, alpha is of * the same type as the entries of sub( A ) and nothing particu- * lar is done to the diagonals of sub( A ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha, i.e., the cons- * tant with which the matrix elements are to be scaled. * * A (local input/local output) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A to be scaled. On exit, the * local entries of this array corresponding to the to the en- * tries of the submatrix sub( A ) are overwritten by the local * entries of the m by n scaled submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char UploA, herm, type; int Acol, Arow, Aii, Aimb1, Ainb1, Ajj, Akp, Akq, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, ctxt, izero=0, k, kb, ktmp, mn, mycol, myrow, nb, npcol, nprow, size; TZSCAL_T scal; /* * .. Local Arrays .. */ int Ad0[DLEN_]; char * Aptr = NULL; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) ) return; /* * If alpha is zero, then call PB_Cplapad instead. */ type = TYPE->type; UploA = Mupcase( UPLO[0] ); herm = ( UploA == CALL ? CNOCONJG : Mupcase( CONJUG[0] ) ); if( type == SREAL ) { if( ((float*)(ALPHA))[REAL_PART] == ZERO ) { PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A, IA, JA, DESCA ); return; } else if( ((float*)(ALPHA))[REAL_PART] == ONE ) return; } else if( type == DREAL ) { if( ((double*)(ALPHA))[REAL_PART] == ZERO ) { PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A, IA, JA, DESCA ); return; } else if( ((double*)(ALPHA))[REAL_PART] == ONE ) return; } else if( type == SCPLX ) { if( herm == CCONJG ) { if( ((float*)(ALPHA))[REAL_PART] == ZERO ) { PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A, IA, JA, DESCA ); return; } } else { if( ((float*)(ALPHA))[IMAG_PART] == ZERO ) { if( ((float*)(ALPHA))[REAL_PART] == ZERO ) { PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A, IA, JA, DESCA ); return; } else if( ((float*)(ALPHA))[REAL_PART] == ONE ) return; } } } else if( type == DCPLX ) { if( herm == CCONJG ) { if( ((double*)(ALPHA))[REAL_PART] == ZERO ) { PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A, IA, JA, DESCA ); return; } } else { if( ((double*)(ALPHA))[IMAG_PART] == ZERO ) { if( ((double*)(ALPHA))[REAL_PART] == ZERO ) { PB_Cplapad( TYPE, UPLO, NOCONJG, M, N, TYPE->zero, TYPE->zero, A, IA, JA, DESCA ); return; } else if( ((double*)(ALPHA))[REAL_PART] == ONE ) return; } } } /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( M, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Quick return if I don't own any of sub( A ). */ Amp = PB_Cnumroc( M, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( N, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return; size = TYPE->size; scal = ( herm == CCONJG ? TYPE->Fhescal : TYPE->Ftzscal ); Aptr = Mptr( A, Aii, Ajj, Ald, size ); /* * When the entire sub( A ) needs to be scaled or when sub( A ) is replicated in * all processes, just call the local routine. */ if( ( Mupcase( UPLO[0] ) == CALL ) || ( ( ( Arow < 0 ) || ( nprow == 1 ) ) && ( ( Acol < 0 ) || ( npcol == 1 ) ) ) ) { scal( C2F_CHAR( UPLO ), &Amp, &Anq, &izero, ALPHA, Aptr, &Ald ); return; } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and two times the least common multiple of nprow * and npcol. */ nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type ) ) * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); mn = MIN( M, N ); if( Mupcase( UPLO[0] ) == CLOWER ) { /* * Lower triangle of sub( A ): proceed by block of columns. For each block of * columns, operate on the logical diagonal block first and then the remaining * rows of that block of columns. */ for( k = 0; k < mn; k += nb ) { kb = mn - k; ktmp = k + ( kb = MIN( kb, nb ) ); PB_Cplasca2( TYPE, UPLO, CONJUG, kb, kb, ALPHA, Aptr, k, k, Ad0 ); Akp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp0 = Amp - Akp ) > 0 ) scal( C2F_CHAR( ALL ), &Amp0, &Anq0, &izero, ALPHA, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } } else if( Mupcase( UPLO[0] ) == CUPPER ) { /* * Upper triangle of sub( A ): proceed by block of columns. For each block of * columns, operate on the trailing rows and then the logical diagonal block * of that block of columns. When M < N, the last columns of sub( A ) are * handled together. */ for( k = 0; k < mn; k += nb ) { kb = mn - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 ) scal( C2F_CHAR( ALL ), &Akp, &Anq0, &izero, ALPHA, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); PB_Cplasca2( TYPE, UPLO, CONJUG, kb, kb, ALPHA, Aptr, k, k, Ad0 ); } if( ( Anq -= ( Akq += Anq0 ) ) > 0 ) scal( C2F_CHAR( ALL ), &Amp, &Anq, &izero, ALPHA, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); } else { /* * All of sub( A ): proceed by block of columns. For each block of columns, * operate on the trailing rows, then the logical diagonal block, and finally * the remaining rows of that block of columns. When M < N, the last columns * of sub( A ) are handled together. */ for( k = 0; k < mn; k += nb ) { kb = mn - k; kb = MIN( kb, nb ); Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( Akp > 0 ) scal( C2F_CHAR( ALL ), &Akp, &Anq0, &izero, ALPHA, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); PB_Cplasca2( TYPE, UPLO, NOCONJG, kb, kb, ALPHA, Aptr, k, k, Ad0 ); Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow ); if( ( Amp0 = Amp - Akp ) > 0 ) scal( C2F_CHAR( ALL ), &Amp0, &Anq0, &izero, ALPHA, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald ); } if( ( Anq -= ( Akq += Anq0 ) ) > 0 ) scal( C2F_CHAR( ALL ), &Amp, &Anq, &izero, ALPHA, Mptr( Aptr, 0, Akq, Ald, size ), &Ald ); } /* * End of PB_Cplascal */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpswapND.c000644 000766 000024 00000102267 10363532303 020433 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpswapND( PBTYP_T * TYPE, int N, char * X, int IX, int JX, int * DESCX, int INCX, char * Y, int IY, int JY, int * DESCY, int INCY ) #else void PB_CpswapND( TYPE, N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int INCX, INCY, IX, IY, JX, JY, N; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCX, * DESCY; char * X, * Y; #endif { /* * Purpose * ======= * * PB_CpswapND swaps two subvectors, * * sub( Y ) := sub( X ) and sub( X ) := sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * sub( X ) is assumed to be not distributed, and sub( Y ) is assumed to * be distributed. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * swapped. N must be at least zero. * * X (local input/local output) pointer to CHAR * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with sub( Y ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) pointer to CHAR * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char scope, * top, * zero; int RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, Xm, XmyprocD, XmyprocR, Xn, XnprocsD, XnprocsR, XprocR, Xroc, Xrow, Ycol, Yii, Yinb1D, YisR, YisRow, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yroc, Yrow, ctxt, ione=1, k, kbb, kk, kn, ktmp, mycol, mydist, myproc, myrow, npcol, nprow, p, size; /* * .. Local Arrays .. */ char * buf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... */ PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 ) { Xld = DESCX[LLD_]; Xlinc = Xld; XmyprocD = mycol; XnprocsD = npcol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); } else { Xld = DESCX[LLD_]; Xlinc = 1; XmyprocD = myrow; XnprocsD = nprow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... */ PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 ) { YnbD = DESCY[NB_]; Yld = DESCY[LLD_]; Ylinc = Yld; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YprocD = Ycol; YmyprocD = mycol; YnprocsD = npcol; Yinb1D = PB_Cfirstnb( N, JY, DESCY[INB_], YnbD ); } else { YnbD = DESCY[MB_]; Yld = DESCY[LLD_]; Ylinc = 1; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YprocD = Yrow; YmyprocD = myrow; YnprocsD = nprow; Yinb1D = PB_Cfirstnb( N, IY, DESCY[IMB_], YnbD ); } YisR = ( ( YprocR == -1 ) || ( YnprocsR == 1 ) ); /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * sub( X ) is not distributed and sub( Y ) is distributed */ size = TYPE->size; zero = TYPE->zero; if( !( XisR ) ) { /* * sub( X ) is not replicated. Since this operation is local if sub( X ) and * sub( Y ) are both row or column vectors, choose YprocR = XprocR when RRorCC, * and YprocR = 0 otherwise. */ if( YisR ) { YprocR = ( ( RRorCC ) ? XprocR : 0 ); } /* * Now, it is just like sub( Y ) is not replicated, this information however is * kept in YisR for later use. */ if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( RRorCC ) { YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); /* * sub( X ) and sub( Y ) are in the same process row or column */ if( XprocR == YprocR ) { /* * In a given process, sub( Y ) is swapped with the corresponding non distribu- * ted part of sub( X ). In the other processes, this part of sub( X ) is set * to zero for later reduction. */ if( YnpD > 0 ) { Yroc = YprocD; if( XisRow ) { kk = Yjj; ktmp = JX + N; kn = JX + Yinb1D; } else { kk = Yii; ktmp = IX + N; kn = IX + Yinb1D; } if( YmyprocD == Yroc ) { TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); kk += Yinb1D; } else { TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc ); } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( YmyprocD == Yroc ) { if( XisRow ) TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc, Mptr( Y, Yii, kk, Yld, size ), &Ylinc ); else TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc, Mptr( Y, kk, Yjj, Yld, size ), &Ylinc ); kk += kbb; } else { if( XisRow ) TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld, size ), &Xlinc ); else TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld, size ), &Xlinc ); } Yroc = MModAdd1( Yroc, YnprocsD ); } } else { /* * If I don't own any entries of sub( Y ), then zero the entire sub( X ) * residing in this process. */ TYPE->Fset( &N, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc ); } /* * Replicate locally scattered sub( X ) by reducing it */ if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); TYPE->Cgsum2d( ctxt, ROW, top, 1, N, Mptr( X, Xii, Xjj, Xld, size ), Xld, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); TYPE->Cgsum2d( ctxt, COLUMN, top, N, 1, Mptr( X, Xii, Xjj, Xld, size ), Xld, -1, 0 ); } } else { /* * sub( X ) and sub( Y ) are in a different process row or column */ if( YmyprocR == YprocR ) { /* * If I own a piece of sub( Y ), then send it to the process row or column where * sub( X ) resides and receive back the sub( X ) data from the same process. */ if( YnpD > 0 ) { if( YisRow ) { TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, YmyprocD ); TYPE->Cgerv2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, YmyprocD ); } else { TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, YmyprocD, XprocR ); TYPE->Cgerv2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, YmyprocD, XprocR ); } } } if( XmyprocR == XprocR ) { /* * If I own a sub( X ), then receive the distributed part of sub( Y ) owned by * the process where sub( Y ) resides in my row or column. Perform a local swap * as if sub( Y ) would reside in the same process row or column as sub( X ). * Send the result back and finally perform the reduction to replicate sub( X ). */ if( YnpD > 0 ) { buf = PB_Cmalloc( YnpD * size ); if( YisRow ) TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR, XmyprocD ); else TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, XmyprocD, YprocR ); Yroc = YprocD; kk = 0; if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; } else { ktmp = IX + N; kn = IX + Yinb1D; } if( YmyprocD == Yroc ) { TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); kk += Yinb1D; } else { TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc ); } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( YmyprocD == Yroc ) { if( XisRow ) TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc, buf+kk*size, &ione ); else TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc, buf+kk*size, &ione ); kk += kbb; } else { if( XisRow ) TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld, size ), &Xlinc ); else TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld, size ), &Xlinc ); } Yroc = MModAdd1( Yroc, YnprocsD ); } if( YisRow ) TYPE->Cgesd2d( ctxt, 1, YnpD, buf, 1, YprocR, XmyprocD ); else TYPE->Cgesd2d( ctxt, YnpD, 1, buf, YnpD, XmyprocD, YprocR ); if( buf ) free( buf ); } else { TYPE->Fset( &N, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc ); } /* * Replicate locally scattered sub( X ) by reducing it */ if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); TYPE->Cgsum2d( ctxt, ROW, top, 1, N, Mptr( X, Xii, Xjj, Xld, size ), Xld, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); TYPE->Cgsum2d( ctxt, COLUMN, top, N, 1, Mptr( X, Xii, Xjj, Xld, size ), Xld, -1, 0 ); } } } } else { /* * sub( X ) and sub( Y ) are not both row or column vectors */ Xroc = 0; if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; } else { ktmp = IX + N; kn = IX + Yinb1D; } /* * Loop over the processes in which sub( Y ) resides, for each process find the * next process Xroc and swap the data. After this, it will be needed to reduce * sub( X ) as above. */ for( p = 0; p < YnprocsD; p++ ) { mydist = MModSub( p, YprocD, YnprocsD ); myproc = MModAdd( YprocD, mydist, YnprocsD ); if( ( XprocR == p ) && ( YprocR == Xroc ) ) { /* * Swap locally at the intersection of the process cross */ if( XmyprocR == p ) { YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD, YnprocsD ); if( YnpD > 0 ) { Yroc = YprocD; kk = ( XisRow ? Yii : Yjj ); if( myproc == Yroc ) { if( XmyprocD == Xroc ) { TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); kk += Yinb1D; } else { TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc ); } } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( myproc == Yroc ) { if( XmyprocD == Xroc ) { if( XisRow ) TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc, Mptr( Y, kk, Yjj, Yld, size ), &Ylinc ); else TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, kk, Yld, size ), &Ylinc ); kk += kbb; } else { if( XisRow ) TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld, size ), &Xlinc ); else TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld, size ), &Xlinc ); } } Yroc = MModAdd1( Yroc, YnprocsD ); } } } } else { /* * Message exchange */ if( ( YmyprocR == YprocR ) && ( YmyprocD == p ) ) { YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD, YnprocsD ); if( YnpD > 0 ) { if( XisRow ) { TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, Xroc ); TYPE->Cgerv2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, Xroc ); } else { TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld, size ), Yld, Xroc, XprocR ); TYPE->Cgerv2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld, size ), Yld, Xroc, XprocR ); } } } if( XmyprocR == XprocR ) { YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD, YnprocsD ); if( YnpD > 0 ) { Yroc = YprocD; kk = 0; /* * Receive the piece of sub( Y ) that I should handle */ if( XmyprocD == Xroc ) { buf = PB_Cmalloc( YnpD * size ); if( XisRow ) TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, p, YprocR ); else TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR, p ); } if( myproc == Yroc ) { if( XmyprocD == Xroc ) { TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, buf, &ione ); kk += Yinb1D; } else { TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc ); } } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( myproc == Yroc ) { if( XmyprocD == Xroc ) { if( XisRow ) TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc, buf+kk*size, &ione ); else TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc, buf+kk*size, &ione ); kk += kbb; } else { if( XisRow ) TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld, size ), &Xlinc ); else TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld, size ), &Xlinc ); } } Yroc = MModAdd1( Yroc, YnprocsD ); } if( XmyprocD == Xroc ) { if( XisRow ) TYPE->Cgesd2d( ctxt, YnpD, 1, buf, YnpD, p, YprocR ); else TYPE->Cgesd2d( ctxt, 1, YnpD, buf, 1, YprocR, p ); if( buf ) free( buf ); } } } } Xroc = MModAdd1( Xroc, XnprocsD ); } /* * Replicate locally scattered sub( X ) by reducing it */ if( XmyprocR == XprocR ) { if( XisRow ) { top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); TYPE->Cgsum2d( ctxt, ROW, top, 1, N, Mptr( X, Xii, Xjj, Xld, size ), Xld, -1, 0 ); } else { top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); TYPE->Cgsum2d( ctxt, COLUMN, top, N, 1, Mptr( X, Xii, Xjj, Xld, size ), Xld, -1, 0 ); } } } } if( YisR ) { /* * Replicate sub( Y ) */ YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocR == YprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, 1, YnpD, Mptr( Y, Yii, Yjj, Yld, size ), Yld ); else TYPE->Cgebr2d( ctxt, COLUMN, top, 1, YnpD, Mptr( Y, Yii, Yjj, Yld, size ), Yld, YprocR, YmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocR == YprocR ) TYPE->Cgebs2d( ctxt, ROW, top, YnpD, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld ); else TYPE->Cgebr2d( ctxt, ROW, top, YnpD, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, YmyprocD, YprocR ); } } } } else { /* * sub( X ) is replicated in every process. Swap the data in process row or * column YprocR when sub( Y ) is not replicated and in every process otherwise. */ if( YisR || ( YmyprocR == YprocR ) ) { YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD ); if( YnpD > 0 ) { Yroc = YprocD; kk = ( YisRow ? Yjj : Yii ); if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; } else { ktmp = IX + N; kn = IX + Yinb1D; } if( YmyprocD == Yroc ) { TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); kk += Yinb1D; } else { TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc ); } Yroc = MModAdd1( Yroc, YnprocsD ); for( k = kn; k < ktmp; k += YnbD ) { kbb = ktmp - k; kbb = MIN( kbb, YnbD ); if( YmyprocD == Yroc ) { if( YisRow ) { if( XisRow ) TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc, Mptr( Y, Yii, kk, Yld, size ), &Ylinc ); else TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, kk, Yld, size ), &Ylinc ); } else { if( XisRow ) TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc, Mptr( Y, kk, Yjj, Yld, size ), &Ylinc ); else TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc, Mptr( Y, kk, Yjj, Yld, size ), &Ylinc ); } kk += kbb; } else { if( XisRow ) TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld, size ), &Xlinc ); else TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld, size ), &Xlinc ); } Yroc = MModAdd1( Yroc, YnprocsD ); } } else { /* * If I don't own any of sub( Y ), then just zero sub( X ) */ TYPE->Fset( &N, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc ); } /* * Replicate locally scattered sub( X ) by reducing it in the process scope of * sub( Y ) */ scope = ( YisRow ? CROW : CCOLUMN ); top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); if( XisRow ) TYPE->Cgsum2d( ctxt, &scope, top, 1, N, Mptr( X, Xii, Xjj, Xld, size ), Xld, -1, 0 ); else TYPE->Cgsum2d( ctxt, &scope, top, N, 1, Mptr( X, Xii, Xjj, Xld, size ), Xld, -1, 0 ); } if( !YisR ) { /* * If sub( Y ) is not replicated, then broadcast the result to the other pro- * cesses that own a piece of sub( X ), but were not involved in the above swap * operation. */ if( XisRow ) { Xm = 1; Xn = N; } else { Xm = N; Xn = 1; } if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocR == YprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld, size ), Xld ); else TYPE->Cgebr2d( ctxt, COLUMN, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld, size ), Xld, YprocR, YmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocR == YprocR ) TYPE->Cgebs2d( ctxt, ROW, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld, size ), Xld ); else TYPE->Cgebr2d( ctxt, ROW, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld, size ), Xld, YmyprocD, YprocR ); } } } /* * End of PB_CpswapND */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpswapNN.c000644 000766 000024 00000042200 10363532303 020433 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpswapNN( PBTYP_T * TYPE, int N, char * X, int IX, int JX, int * DESCX, int INCX, char * Y, int IY, int JY, int * DESCY, int INCY ) #else void PB_CpswapNN( TYPE, N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) /* * .. Scalar Arguments .. */ int INCX, INCY, IX, IY, JX, JY, N; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCX, * DESCY; char * X, * Y; #endif { /* * Purpose * ======= * * PB_CpswapNN swaps two subvectors, * * sub( Y ) := sub( X ) and sub( X ) := sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Both subvectors are assumed to be not distributed. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * N (global input) INTEGER * On entry, N specifies the length of the subvectors to be * swapped. N must be at least zero. * * X (local input/local output) pointer to CHAR * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. * Before entry, this array contains the local entries of the * matrix X. On exit, sub( X ) is overwritten with sub( Y ). * * IX (global input) INTEGER * On entry, IX specifies X's global row index, which points to * the beginning of the submatrix sub( X ). * * JX (global input) INTEGER * On entry, JX specifies X's global column index, which points * to the beginning of the submatrix sub( X ). * * DESCX (global and local input) INTEGER array * On entry, DESCX is an integer array of dimension DLEN_. This * is the array descriptor for the matrix X. * * INCX (global input) INTEGER * On entry, INCX specifies the global increment for the * elements of X. Only two values of INCX are supported in * this version, namely 1 and M_X. INCX must not be zero. * * Y (local input/local output) pointer to CHAR * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. * Before entry, this array contains the local entries of the * matrix Y. On exit, sub( Y ) is overwritten with sub( X ). * * IY (global input) INTEGER * On entry, IY specifies Y's global row index, which points to * the beginning of the submatrix sub( Y ). * * JY (global input) INTEGER * On entry, JY specifies Y's global column index, which points * to the beginning of the submatrix sub( Y ). * * DESCY (global and local input) INTEGER array * On entry, DESCY is an integer array of dimension DLEN_. This * is the array descriptor for the matrix Y. * * INCY (global input) INTEGER * On entry, INCY specifies the global increment for the * elements of Y. Only two values of INCY are supported in * this version, namely 1 and M_Y. INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Xscope, Yscope, * top; int RRorCC, XYm, XYn, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, XmyprocD, XmyprocR, XnprocsR, XprocR, Xrow, Ycol, Yii, YisR, YisRow, Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnprocsR, YprocR, Yrow, csrc, ctxt, mycol, myrow, npcol, nprow, rsrc, size; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... */ PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj, &Xrow, &Xcol ); if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 ) { Xld = DESCX[LLD_]; Xlinc = Xld; XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); } else { Xld = DESCX[LLD_]; Xlinc = 1; XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); } /* * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... */ PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj, &Yrow, &Ycol ); if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 ) { Yld = DESCY[LLD_]; Ylinc = Yld; YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); } else { Yld = DESCY[LLD_]; Ylinc = 1; YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); } /* * Are sub( X ) and sub( Y ) both row or column vectors ? */ RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); /* * Neither sub( X ) nor sub( Y ) are distributed */ if( !XisR ) { /* * sub( X ) is not replicated */ if( !( YisR ) ) { /* * sub( Y ) is not replicated */ if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) /* * If I am not in XprocR or YprocR, then return immediately */ return; size = TYPE->size; if( RRorCC ) { /* * sub( X ) and sub( Y ) are both row or column vectors */ if( XprocR == YprocR ) { /* * sub( X ) and sub( Y ) are in the same process row or column */ TYPE->Fswap( &N, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); } else { /* * sub( X ) and sub( Y ) are in a different process row or column */ if( XmyprocR == XprocR ) { /* * Send sub( X ) to where sub( Y ) resides, and receive sub( Y ) from the same * location. */ if( XisRow ) { TYPE->Cgesd2d( ctxt, 1, N, Mptr( X, Xii, Xjj, Xld, size ), Xld, YprocR, XmyprocD ); TYPE->Cgerv2d( ctxt, 1, N, Mptr( X, Xii, Xjj, Xld, size ), Xld, YprocR, XmyprocD ); } else { TYPE->Cgesd2d( ctxt, N, 1, Mptr( X, Xii, Xjj, Xld, size ), Xld, XmyprocD, YprocR ); TYPE->Cgerv2d( ctxt, N, 1, Mptr( X, Xii, Xjj, Xld, size ), Xld, XmyprocD, YprocR ); } } if( YmyprocR == YprocR ) { /* * Send sub( Y ) to where sub( X ) resides, and receive sub( X ) from the same * location. */ if( YisRow ) { TYPE->Cgesd2d( ctxt, 1, N, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, YmyprocD ); TYPE->Cgerv2d( ctxt, 1, N, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, YmyprocD ); } else { TYPE->Cgesd2d( ctxt, N, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, YmyprocD, XprocR ); TYPE->Cgerv2d( ctxt, N, 1, Mptr( Y, Yii, Yjj, Yld, size ), Yld, YmyprocD, XprocR ); } } } } else { /* * sub( X ) and sub( Y ) are not both row or column vectors */ if( XisRow ) { XYm = 1; XYn = N; Xscope = CROW; Yscope = CCOLUMN; rsrc = XprocR; csrc = YprocR; } else { XYm = N; XYn = 1; Xscope = CCOLUMN; Yscope = CROW; rsrc = YprocR; csrc = XprocR; } if( ( XmyprocR == XprocR ) && ( YmyprocR == YprocR ) ) { /* * If I am at the intersection of the process row and column, then swap and * broadcast sub( X ) and sub( Y ) in their respective process scope. */ TYPE->Fswap( &N, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); top = PB_Ctop( &ctxt, BCAST, &Xscope, TOP_GET ); TYPE->Cgebs2d( ctxt, &Xscope, top, XYm, XYn, Mptr( X, Xii, Xjj, Xld, size ), Xld ); top = PB_Ctop( &ctxt, BCAST, &Yscope, TOP_GET ); TYPE->Cgebs2d( ctxt, &Yscope, top, XYn, XYm, Mptr( Y, Yii, Yjj, Yld, size ), Yld ); } else if( XmyprocR == XprocR ) { top = PB_Ctop( &ctxt, BCAST, &Xscope, TOP_GET ); TYPE->Cgebr2d( ctxt, &Xscope, top, XYm, XYn, Mptr( X, Xii, Xjj, Xld, size ), Xld, rsrc, csrc ); } else if( YmyprocR == YprocR ) { top = PB_Ctop( &ctxt, BCAST, &Yscope, TOP_GET ); TYPE->Cgebr2d( ctxt, &Yscope, top, XYn, XYm, Mptr( Y, Yii, Yjj, Yld, size ), Yld, rsrc, csrc ); } } } else { /* * sub( Y ) is replicated */ size = TYPE->size; if( YisRow ) { XYm = 1; XYn = N; } else { XYm = N; XYn = 1; } if( XmyprocR == XprocR ) { /* * If I am in the process row (resp. column) owning sub( X ), then swap and * broadcast sub( Y ) in my column (resp. row). */ TYPE->Fswap( &N, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); TYPE->Cgebs2d( ctxt, COLUMN, top, XYm, XYn, Mptr( Y, Yii, Yjj, Yld, size ), Yld ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); TYPE->Cgebs2d( ctxt, ROW, top, XYm, XYn, Mptr( Y, Yii, Yjj, Yld, size ), Yld ); } } else { /* * Otherwise, receive sub( Y ) */ if( XisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); TYPE->Cgebr2d( ctxt, COLUMN, top, XYm, XYn, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XprocR, XmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); TYPE->Cgebr2d( ctxt, ROW, top, XYm, XYn, Mptr( Y, Yii, Yjj, Yld, size ), Yld, XmyprocD, XprocR ); } } } } else { /* * sub( X ) is replicated */ size = TYPE->size; if( YisR || ( YmyprocR == YprocR ) ) { /* * If I own a piece of sub( Y ), then swap */ TYPE->Fswap( &N, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc ); } if( !YisR ) { /* * If sub( Y ) is not replicated, then broadcast the result to the other * processes that own a piece of sub( X ), but were not involved in the * above swap operation. */ if( XisRow ) { XYm = 1; XYn = N; } else { XYm = N; XYn = 1; } if( YisRow ) { top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( YmyprocR == YprocR ) TYPE->Cgebs2d( ctxt, COLUMN, top, XYm, XYn, Mptr( X, Xii, Xjj, Xld, size ), Xld ); else TYPE->Cgebr2d( ctxt, COLUMN, top, XYm, XYn, Mptr( X, Xii, Xjj, Xld, size ), Xld, YprocR, YmyprocD ); } else { top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( YmyprocR == YprocR ) TYPE->Cgebs2d( ctxt, ROW, top, XYm, XYn, Mptr( X, Xii, Xjj, Xld, size ), Xld ); else TYPE->Cgebr2d( ctxt, ROW, top, XYm, XYn, Mptr( X, Xii, Xjj, Xld, size ), Xld, YmyprocD, YprocR ); } } } /* * End of PB_CpswapNN */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cpsym.c000644 000766 000024 00000056503 10363532303 020050 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cpsym( PBTYP_T * TYPE, PBTYP_T * UTYP, char * SIDE, char * UPLO, int N, int K, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * XC, int LDXC, char * XR, int LDXR, char * YC, int LDYC, char * YR, int LDYR, TZSYM_T SYM ) #else void PB_Cpsym( TYPE, UTYP, SIDE, UPLO, N, K, ALPHA, A, IA, JA, DESCA, XC, LDXC, XR, LDXR, YC, LDYC, YR, LDYR, SYM ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO; int IA, JA, K, LDXC, LDXR, LDYC, LDYR, N; char * ALPHA; PBTYP_T * TYPE, * UTYP; TZSYM_T SYM; /* * .. Array Arguments .. */ int * DESCA; char * A, * XC, * XR, * YC, * YR; #endif { /* * Purpose * ======= * * PB_Cpsym performs a symmetric or Hermitian matrix-matrix or matrix- * vector multiplication. In the following, sub( A ) denotes the symme- * tric or Hermitian submatrix operand A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UTYP (local input) pointer to a PBTYP_T structure * On entry, UTYP is a pointer to a structure of type PBTYP_T, * that contains type information for the Y's (See pblas.h). * * SIDE (global input) pointer to CHAR * On entry, SIDE specifies whether op( sub( A ) ) multiplies * its operand X from the left or right as follows: * * SIDE = 'L' or 'l' Y := alpha*op( sub( A ) )*X + Y, * * SIDE = 'R' or 'r' Y := alpha*X*op( sub( A ) ) + Y. * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric or Hermitian submatrix sub( A ) are to be * referenced as follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the sym- * metric or Hermitian submatrix sub( A ) * are to be referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the sym- * metric or Hermitian submatrix sub( A ) * are to be referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the local number of columns of the lo- * cal array XC and the local number of rows of the local array * XR. K mut be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the @(syhec) submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the @(syhec) submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * XC (local input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,K). Before entry, * this array contains the local entries of the matrix XC. * * LDXC (local input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, Lp( IA, N ) ). * * XR (local input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,Kx), where Kx is * at least Lc( JA, N ). Before entry, this array contains the * local entries of the matrix XR. * * LDXR (local input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least max( 1, K ). * * YC (local input/local output) pointer to CHAR * On entry, YC is an array of dimension (LDYC,K). Before entry, * this array contains the local entries of the matrix YC. On * exit, this array contains the updated vector YC. * * LDYC (local input) INTEGER * On entry, LDYC specifies the leading dimension of the array * YC. LDYC must be at least max( 1, Lp( IA, N ) ). * * YR (local input/local output) pointer to CHAR * On entry, YR is an array of dimension (LDYR,Ky), where Ky is * at least Lc( JA, N ). Before entry, this array contains the * local entries of the matrix YR. On exit, this array contains * the updated vector YR. * * LDYR (local input) INTEGER * On entry, LDYR specifies the leading dimension of the array * YR. LDYR must be at least max( 1, K ). * * SYM (local input) pointer to function of type TZSYM_T * On entry, SYM specifies the function performing the symmetric * or Hermitian matrix-vector or matrix-matrix multiply of a * single block. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Arow, Aii, Aimb1, Ainb1, Ajj, Ald, Amp, Amb, Anb, Anq, Aoffi, Aoffj, Arcol, Arrow, GoEast, GoSouth, IsColRepl, IsRowRepl, XCinc, XRinc, Xii=0, Xjj=0, Xoffi=-1, Xoffj=-1, YCinc, YRinc, iimax, ilow, imbloc, inbloc, ioffd, ioffx, iupp, jjmax, joffd, joffx, lcmt, lcmt00, lmbloc, lnbloc, low, lower, m1, mbloc, mblkd, mblks, mycol, myrow, n1, nbloc, nblkd, nblks, npcol, nprow, pmb, qnb, size, tmp1, upp, upper; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( N <= 0 ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( DESCA[CTXT_], &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cainfog2l( N, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aimb1, &Ainb1, &Amp, &Anq, &Aii, &Ajj, &Arow, &Acol, &Arrow, &Arcol ); /* * Quick return if I don't own any of sub( A ) or if sub( A ) is replicated in * all processes. */ if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return; IsRowRepl = ( ( Arow < 0 ) || ( nprow == 1 ) ); IsColRepl = ( ( Acol < 0 ) || ( npcol == 1 ) ); Amb = DESCA[ MB_ ]; Anb = DESCA[ NB_ ]; Ald = DESCA[ LLD_ ]; size = TYPE->size; if( IsRowRepl && IsColRepl ) { SYM( TYPE, SIDE, UPLO, Amp, Anq, K, 0, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), Ald, XC, LDXC, XR, LDXR, YC, LDYC, YR, LDYR ); return; } XCinc = size; XRinc = LDXR * size; YCinc = UTYP->size; YRinc = LDYR * UTYP->size; upper = ( Mupcase( UPLO[0] ) == CUPPER ); lower = ( Mupcase( UPLO[0] ) == CLOWER ); /* * Initialize lcmt00, mblks, nblks, imbloc, inbloc, lmbloc, lnbloc, ilow, low, * iupp, and upp. */ PB_Cbinfo( 0, Amp, Anq, Aimb1, Ainb1, Amb, Anb, Arrow, Arcol, &lcmt00, &mblks, &nblks, &imbloc, &inbloc, &lmbloc, &lnbloc, &ilow, &low, &iupp, &upp ); iimax = ( Aoffi = Aii - 1 ) + ( m1 = Amp ); jjmax = ( Aoffj = Ajj - 1 ) + ( n1 = Anq ); pmb = ( IsRowRepl ? Amb : nprow * Amb ); qnb = ( IsColRepl ? Anb : npcol * Anb ); /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); /* * Go through the table looking for blocks owning diagonal entries. */ if( ( !( GoSouth ) ) && ( !( GoEast ) ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ SYM( TYPE, SIDE, UPLO, imbloc, inbloc, K, lcmt00, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), Ald, XC+Xii*XCinc, LDXC, XR+Xjj*XRinc, LDXR, YC+Xii*YCinc, LDYC, YR+Xjj*YRinc, LDYR ); /* * Decide whether one should go south or east in the table: Go east if the * block below the current one only owns lower entries. If this block, however, * owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); if( GoSouth ) { /* * When the upper triangular part of sub( A ) should be operated with and * one is planning to go south in the table, it is neccessary to take care * of the remaining columns of these imbloc rows immediately. */ if( upper && ( Anq > inbloc ) ) { tmp1 = Anq - inbloc; SYM( TYPE, SIDE, ALL, imbloc, tmp1, K, 0, ALPHA, Mptr( A, Aii, Ajj+inbloc, Ald, size ), Ald, XC+Xii*XCinc, LDXC, XR+(Xjj+inbloc)*XRinc, LDXR, YC+Xii*YCinc, LDYC, YR+(Xjj+inbloc)*YRinc, LDYR ); } Aii += imbloc; Xii += imbloc; m1 -= imbloc; } else { /* * When the lower triangular part of sub( A ) should be operated with and * one is planning to go east in the table, it is neccessary to take care * of the remaining rows of these inbloc columns immediately. */ if( lower && ( Amp > imbloc ) ) { tmp1 = Amp - imbloc; SYM( TYPE, SIDE, ALL, tmp1, inbloc, K, 0, ALPHA, Mptr( A, Aii+imbloc, Ajj, Ald, size ), Ald, XC+(Xii+imbloc)*XCinc, LDXC, XR+Xjj*XRinc, LDXR, YC+(Xii+imbloc)*YCinc, LDYC, YR+Xjj*YRinc, LDYR ); } Ajj += inbloc; Xjj += inbloc; n1 -= inbloc; } } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the local row indexes in A and XC. */ lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; Xoffi += imbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row indexes in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; Xoffi += Amb; } /* * Operate with the upper triangular part of sub( A ) we just skipped when * necessary. */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { SYM( TYPE, SIDE, ALL, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, XC+Xii*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, YC+Xii*YCinc, LDYC, YR+(Xoffj+1)*YRinc, LDYR ); Aii += tmp1; Xii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffx = Xoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; SYM( TYPE, SIDE, UPLO, mbloc, inbloc, K, lcmt, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, XC+(ioffx+1)*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, YC+(ioffx+1)*YCinc, LDYC, YR+(Xoffj+1)*YRinc, LDYR ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; ioffd += mbloc; Xoffi = ioffx; ioffx += mbloc; } /* * Operate with the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) SYM( TYPE, SIDE, ALL, tmp1, inbloc, K, 0, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, XC+(ioffx+1)*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, YC+(ioffx+1)*YCinc, LDYC, YR+(Xoffj+1)*YRinc, LDYR ); tmp1 = Aoffi - Aii + 1; m1 -= tmp1; n1 -= inbloc; lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; Xoffj += inbloc; /* * Operate with the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) SYM( TYPE, SIDE, ALL, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, XC+Xii*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, YC+Xii*YCinc, LDYC, YR+(Xoffj+1)*YRinc, LDYR ); Aii = Aoffi + 1; Ajj = Aoffj + 1; Xii = Xoffi + 1; Xjj = Xoffj + 1; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as well as * the local column index in A and XR. */ lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; Xoffj += inbloc; /* * While there are blocks remaining that own lower entries, keep going east. * Adjust the current LCM value as well as the local column index in A and XR. */ while( ( nblks > 0 ) && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; Aoffj += Anb; Xoffj += Anb; } /* * Operate with the lower triangular part of sub( A ). */ tmp1 = MIN( Aoffj, jjmax ) - Ajj + 1; if( lower && ( tmp1 > 0 ) ) { SYM( TYPE, SIDE, ALL, m1, tmp1, K, 0, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), Ald, XC+Xii*XCinc, LDXC, XR+Xjj*XRinc, LDXR, YC+Xii*YCinc, LDYC, YR+Xjj*YRinc, LDYR ); Ajj += tmp1; Xjj += tmp1; n1 -= tmp1; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return; /* * lcmt00 >= low. The current block owns either diagonals or upper entries. * Save the current position in the LCM table. After this row has been * completely taken care of, re-start from this column and the next row of * the LCM table. */ lcmt = lcmt00; nblkd = nblks; joffd = Aoffj; joffx = Xoffj; nbloc = Anb; while( ( nblkd > 0 ) && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ if( nblkd == 1 ) nbloc = lnbloc; SYM( TYPE, SIDE, UPLO, imbloc, nbloc, K, lcmt, ALPHA, Mptr( A, Aii, joffd+1, Ald, size ), Ald, XC+Xii*XCinc, LDXC, XR+(joffx+1)*XRinc, LDXR, YC+Xii*YCinc, LDYC, YR+(joffx+1)*YRinc, LDYR ); lcmt00 = lcmt; lcmt += qnb; nblks = nblkd; nblkd--; Aoffj = joffd; joffd += nbloc; Xoffj = joffx; joffx += nbloc; } /* * Operate with the upper triangular part of sub( A ). */ tmp1 = n1 - joffd + Ajj - 1; if( upper && ( tmp1 > 0 ) ) SYM( TYPE, SIDE, ALL, imbloc, tmp1, K, 0, ALPHA, Mptr( A, Aii, joffd+1, Ald, size ), Ald, XC+Xii*XCinc, LDXC, XR+(joffx+1)*XRinc, LDXR, YC+Xii*YCinc, LDYC, YR+(joffx+1)*YRinc, LDYR ); tmp1 = Aoffj - Ajj + 1; m1 -= imbloc; n1 -= tmp1; lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; Xoffi += imbloc; /* * Operate with the lower triangular part of sub( A ). */ if( lower && ( m1 > 0 ) && ( tmp1 > 0 ) ) SYM( TYPE, SIDE, ALL, m1, tmp1, K, 0, ALPHA, Mptr( A, Aoffi+1, Ajj, Ald, size ), Ald, XC+(Xoffi+1)*XCinc, LDXC, XR+Xjj*XRinc, LDXR, YC+(Xoffi+1)*YCinc, LDYC, YR+Xjj*YRinc, LDYR ); Aii = Aoffi + 1; Ajj = Aoffj + 1; Xii = Xoffi + 1; Xjj = Xoffj + 1; } /* * Loop over the remaining columns of the LCM table. */ nbloc = Anb; while( nblks > 0 ) { if( nblks == 1 ) nbloc = lnbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row index in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; Xoffi += Amb; } /* * Operate with the upper triangular part of sub( A ). */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { SYM( TYPE, SIDE, ALL, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, XC+Xii*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, YC+Xii*YCinc, LDYC, YR+(Xoffj+1)*YRinc, LDYR ); Aii += tmp1; Xii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffx = Xoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; SYM( TYPE, SIDE, UPLO, mbloc, nbloc, K, lcmt, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, XC+(ioffx+1)*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, YC+(ioffx+1)*YCinc, LDYC, YR+(Xoffj+1)*YRinc, LDYR ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; Xoffi = ioffx; ioffd += mbloc; ioffx += mbloc; } /* * Operate with the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) SYM( TYPE, SIDE, ALL, tmp1, nbloc, K, 0, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, XC+(ioffx+1)*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, YC+(ioffx+1)*YCinc, LDYC, YR+(Xoffj+1)*YRinc, LDYR ); tmp1 = MIN( Aoffi, iimax ) - Aii + 1; m1 -= tmp1; n1 -= nbloc; lcmt00 += qnb; nblks--; Aoffj += nbloc; Xoffj += nbloc; /* * Operate with the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) SYM( TYPE, SIDE, ALL, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, XC+Xii*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, YC+Xii*YCinc, LDYC, YR+(Xoffj+1)*YRinc, LDYR ); Aii = Aoffi + 1; Ajj = Aoffj + 1; Xii = Xoffi + 1; Xjj = Xoffj + 1; } /* * End of PB_Cpsym */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpsymmAB.c000644 000766 000024 00000075150 10363532303 020427 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpsymmAB( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG, char * SIDE, char * UPLO, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_CpsymmAB( TYPE, DIRECAB, CONJUG, SIDE, UPLO, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * CONJUG, * DIRECAB, * SIDE, * UPLO; int IA, IB, IC, JA, JB, JC, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; char * A, * B, * C; #endif { /* * Purpose * ======= * * PB_CpsymmAB performs one of the matrix-matrix operations * * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * or * * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha and beta are scalars, sub( A ) is a symmetric or Hermitian * submatrix and sub( B ) and sub( C ) are m by n submatrices. * * This is the outer-product algorithm using the logical aggregation * blocking technique. The submatrix operand sub( C ) stays in place. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECAB (global input) pointer to CHAR * On entry, DIRECAB specifies the direction in which the rows * or columns of sub( A ) and sub( B ) should be looped over as * follows: * DIRECAB = 'F' or 'f' forward or increasing, * DIRECAB = 'B' or 'b' backward or decreasing. * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether sub( A ) is a symmetric or * Hermitian submatrix operand as follows: * CONJUG = 'N' or 'n' sub( A ) is symmetric, * CONJUG = 'Z' or 'z' sub( A ) is Hermitian. * * SIDE (global input) pointer to CHAR * On entry, SIDE specifies whether the symmetric or Hermitian * submatrix sub( A ) appears on the left or right in the opera- * tion as follows: * * SIDE = 'L' or 'l' * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * SIDE = 'R' or 'r' * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the submatrix sub( A ) are to be referenced as follows: * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * submatrix sub( A ) are referenced, * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * submatrix sub( A ) are referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with SIDE = 'L' or 'l', this array contains * the local entries corresponding to the entries of the m by m * symmetric or Hermitian submatrix sub( A ), such that when * UPLO = 'U' or 'u', this array contains the local entries of * the upper triangular part of the submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric or Hermitian submatrix sub( A ), and the local * entries of the strictly upper triangular of sub( A ) are not * referenced. * Before entry with SIDE = 'R' or 'r', this array contains * the local entries corresponding to the entries of the n by n * symmetric or Hermitian submatrix sub( A ), such that when * UPLO = 'U' or 'u', this array contains the local entries of * the upper triangular part of the submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric or Hermitian submatrix sub( A ), and the local * entries of the strictly upper triangular of sub( A ) are not * referenced. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set and assumed to be zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char GatherDir, ScatterDir, * one, top, tran, * zero; int Afr, An, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii, Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq, Bnq0, Brow, Ccol, Ccurcol, Ccurimb1, Ccurinb1, Ccurrow, Cii, Cimb, Cimb1, Cinb, Cinb1, Cjj, Cld, Cmb, Cmp, Cmp0, Cnb, Cnq, Cnq0, Crow, WABfr, WACfr, WBCfr, WBCsum, conjg, ctxt, fwd, k, kb, kbb, kend, kstart, kstep, ktmp, lside, mycol, myrow, npcol, nprow, size, upper; GEMM_T gemm; GSUM2D_T gsum2d; /* * .. Local Arrays .. */ int Bd0 [DLEN_], Cd0 [DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WABd[DLEN_], WACd[DLEN_], WBCd [DLEN_]; char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * Cptr0 = NULL, * WAB = NULL, * WAC = NULL, * WBC = NULL; /* .. * .. Executable Statements .. * */ /* * sub( C ) = beta * sub( C ) */ PB_Cplascal( TYPE, ALL, NOCONJG, M, N, BETA, C, IC, JC, DESCC ); /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); An = ( ( lside = ( Mupcase( SIDE[0] ) == CLEFT ) ) ? M : N ); upper = ( Mupcase( UPLO[0] ) == CUPPER ); tran = ( ( conjg = ( Mupcase( CONJUG[0] ) == CCONJG ) ) ? CCOTRAN : CTRAN ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d; /* * Figure out the loop bounds accordingly to DIRECAB */ kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); if( ( fwd = ( Mupcase( DIRECAB[0] ) == CFORWARD ) ) != 0 ) { kstart = 0; kend = ( ( An - 1 ) / kb + 1 ) * kb; kstep = kb; GatherDir = ScatterDir = CFORWARD; } else { kstart = ( ( An - 1 ) / kb ) * kb; kend = kstep = -kb; GatherDir = ScatterDir = CBACKWARD; } /* * Compute local information for sub( B ) and sub( C ) */ PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); Bimb = DESCB[IMB_]; Binb = DESCB[INB_]; Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_]; Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb ); Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow ); Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb ); Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size ); PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj, &Crow, &Ccol ); Cimb = DESCC[IMB_]; Cinb = DESCC[INB_]; Cmb = DESCC[MB_ ]; Cnb = DESCC[NB_ ]; Cld = DESCC[LLD_]; Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb ); Cmp0 = PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow ); Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb ); Cnq0 = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol ); if( ( Cmp0 > 0 ) && ( Cnq0 > 0 ) ) Cptr0 = Mptr( C, Cii, Cjj, Cld, size ); if( lside ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( upper ) { for( k = kstart; k != kend; k += kstep ) { kbb = An - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &GatherDir, ktmp, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over * C( IC:IC+k+kbb-1, JC:JC+N-1 ) -> WAC */ PB_Cdescset( Cd0, ktmp, N, Cimb1, Cinb1, Cmb, Cnb, Crow, Ccol, ctxt, Cld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Cd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WAC, WACd, &WACfr ); /* * Zero lower triangle of WAC( k:k+kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, LOWER, CONJUG, kbb, kbb, zero, zero, WAC, k, 0, WACd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC, k+1, 0, WACd ); /* * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, N, B, IB+k, JB, DESCB, ROW, &Bptr, DBUFB, &Bfr ); /* * Replicate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over C( IC:IC+k+kbb-1, JC:JC+N-1 ) */ PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Cd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WBC, WBCd, &WBCfr ); /* * C( IC:IC+k+kbb-1, JC:JC+N-1 ) += ALPHA * WAC * WBC */ Cmp = PB_Cnumroc( ktmp, 0, Cimb1, Cmb, myrow, Crow, nprow ); if( ( Cmp > 0 ) && ( Cnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq0, &kbb, ALPHA, WAC, &WACd[LLD_], WBC, &WBCd[LLD_], one, Cptr0, &Cld ); if( WBCfr ) free( WBC ); if( Bfr ) free( Bptr ); /* * Replicate WAC = A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over * B( IB:IB+k+kbb-1, JB:JB+N-1 ) -> WAB */ PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, WAC, 0, 0, WACd, COLUMN, &WAB, WABd, &WABfr ); /* * Zero lower triangle of WAB( k:k+kbb-1, 0:kbb-1 ) */ PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, zero, WAB, k, 0, WABd ); /* * WBC := ALPHA*A(IA:IA+k+kbb-1, JA+k:JA+k+kbb-1)'*B( IB:IB+k+kbb-1, JB:JB+N-1 ) */ PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WBC, WBCd, &WBCfr, &WBCsum ); Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bnq0 > 0 ) && ( Bmp > 0 ) ) gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp, ALPHA, WAB, &WABd[LLD_], Bptr0, &Bld, zero, WBC, &WBCd[LLD_] ); if( WABfr ) free( WAB ); if( WACfr ) free( WAC ); if( Afr ) free( Aptr ); if( WBCsum ) { WBCd[RSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cimb1, Cmb, Crow, Crow, nprow ); if( Bnq0 > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WBC, WBCd[LLD_], WBCd[RSRC_], mycol ); } /* * C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) := C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) + WBC */ PB_CScatterV( TYPE, &ScatterDir, kbb, N, WBC, 0, 0, WBCd, ROW, one, C, IC+k, JC, DESCC, ROW ); if( WBCfr ) free( WBC ); } } else { for( k = kstart; k != kend; k += kstep ) { ktmp = An - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &GatherDir, ktmp, kbb, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over C( IC+k:IC+M-1, JC:JC+N-1 ) */ Ccurimb1 = PB_Cfirstnb( ktmp, IC+k, Cimb, Cmb ); Ccurrow = PB_Cindxg2p( k, Cimb1, Cmb, Crow, Crow, nprow ); PB_Cdescset( Cd0, ktmp, N, Ccurimb1, Cinb1, Cmb, Cnb, Ccurrow, Ccol, ctxt, Cld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Cd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WAC, WACd, &WACfr ); /* * Zero upper triangle of WAC( 0:kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, UPPER, CONJUG, kbb, kbb, zero, zero, WAC, 0, 0, WACd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC, 0, 1, WACd ); /* * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, N, B, IB+k, JB, DESCB, ROW, &Bptr, DBUFB, &Bfr ); /* * Replicate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over C( IC+k:IC+M-1, JC:JC+N-1 ) */ PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Cd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WBC, WBCd, &WBCfr ); /* * C( IC+k:IC+M-1, JC:JC+N-1 ) += ALPHA * WAC * WBC */ Cmp = PB_Cnumroc( ktmp, k, Cimb1, Cmb, myrow, Crow, nprow ); if( ( Cmp > 0 ) && ( Cnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq0, &kbb, ALPHA, WAC, &WACd[LLD_], WBC, &WBCd[LLD_], one, Mptr( Cptr0, Cmp0-Cmp, 0, Cld, size ), &Cld ); if( WBCfr ) free( WBC ); if( Bfr ) free( Bptr ); /* * Replicate WAC = A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over * B( IB+k:IB+M-1, JB:JB+N-1 ) -> WAB */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, WAC, 0, 0, WACd, COLUMN, &WAB, WABd, &WABfr ); /* * Zero upper triangle of WAB( 0:kbb-1, 0:kbb-1 ) */ PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, zero, WAB, 0, 0, WABd ); /* * WBC := ALPHA*A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )'*B( IB+k:IB+M-1, JB:JB+N-1 ) */ PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WBC, WBCd, &WBCfr, &WBCsum ); Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bnq0 > 0 ) && ( Bmp > 0 ) ) gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp, ALPHA, WAB, &WABd[LLD_], Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld, zero, WBC, &WBCd[LLD_] ); if( WABfr ) free( WAB ); if( WACfr ) free( WAC ); if( Afr ) free( Aptr ); if( WBCsum ) { WBCd[RSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cimb1, Cmb, Crow, Crow, nprow ); if( Bnq0 > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WBC, WBCd[LLD_], WBCd[RSRC_], mycol ); } /* * C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) := C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) + WBC */ PB_CScatterV( TYPE, &ScatterDir, kbb, N, WBC, 0, 0, WBCd, ROW, one, C, IC+k, JC, DESCC, ROW ); if( WBCfr ) free( WBC ); } } } else { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); if( upper ) { for( k = kstart; k != kend; k += kstep ) { ktmp = An - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &GatherDir, kbb, ktmp, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over C( IC:IC+M-1, JC+k:JC+N-1 ) */ Ccurinb1 = PB_Cfirstnb( ktmp, JC+k, Cinb, Cnb ); Ccurcol = PB_Cindxg2p( k, Cinb1, Cnb, Ccol, Ccol, npcol ); PB_Cdescset( Cd0, M, ktmp, Cimb1, Ccurinb1, Cmb, Cnb, Crow, Ccurcol, ctxt, Cld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Cd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WAC, WACd, &WACfr ); /* * Zero lower triangle of WAC( 0:kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, LOWER, CONJUG, kbb, kbb, zero, zero, WAC, 0, 0, WACd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC, 1, 0, WACd ); /* * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, M, kbb, B, IB, JB+k, DESCB, COLUMN, &Bptr, DBUFB, &Bfr ); /* * Replicate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over C( IC:IC+M-1, JC+k:JC+N-1 ) */ PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Cd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WBC, WBCd, &WBCfr ); /* * C( IC:IC+M-1, JC+k:JC+N-1 ) += ALPHA * WBC * WAC */ Cnq = PB_Cnumroc( ktmp, k, Cinb1, Cnb, mycol, Ccol, npcol ); if( ( Cmp0 > 0 ) && ( Cnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq, &kbb, ALPHA, WBC, &WBCd[LLD_], WAC, &WACd[LLD_], one, Mptr( Cptr0, 0, Cnq0-Cnq, Cld, size ), &Cld ); if( WBCfr ) free( WBC ); if( Bfr ) free( Bptr ); /* * Replicate WAC = A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over * B( IB:IB+M-1, JB+k:JB+N-1 ) -> WAB */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, WAC, 0, 0, WACd, ROW, &WAB, WABd, &WABfr ); /* * Zero lower triangle of WAB( 0:kbb-1, 0:kbb-1 ) */ PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, zero, WAB, 0, 0, WABd ); /* * WBC := ALPHA*B( IB:IB+M-1, JB+k:JB+N-1 )*A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )' */ PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WBC, WBCd, &WBCfr, &WBCsum ); Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp0, &kbb, &Bnq, ALPHA, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld, WAB, &WABd[LLD_], zero, WBC, &WBCd[LLD_] ); if( WABfr ) free( WAB ); if( WACfr ) free( WAC ); if( Afr ) free( Aptr ); if( WBCsum ) { WBCd[CSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cinb1, Cnb, Ccol, Ccol, npcol ); if( Bmp0 > 0 ) gsum2d( ctxt, ROW, &top, Bmp0, kbb, WBC, WBCd[LLD_], myrow, WBCd[CSRC_] ); } /* * C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) := C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) + WBC */ PB_CScatterV( TYPE, &ScatterDir, M, kbb, WBC, 0, 0, WBCd, COLUMN, one, C, IC, JC+k, DESCC, COLUMN ); if( WBCfr ) free( WBC ); } } else { for( k = kstart; k != kend; k += kstep ) { kbb = An - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &GatherDir, kbb, ktmp, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over * C( IC:IC+M-1, JC:JC+k+kbb-1 ) -> WAC */ PB_Cdescset( Cd0, M, ktmp, Cimb1, Cinb1, Cmb, Cnb, Crow, Ccol, ctxt, Cld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Cd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WAC, WACd, &WACfr ); /* * Zero upper triangle of WAC( 0:kbb-1, k:k+kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, UPPER, CONJUG, kbb, kbb, zero, zero, WAC, 0, k, WACd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC, 0, k+1, WACd ); /* * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, M, kbb, B, IB, JB+k, DESCB, COLUMN, &Bptr, DBUFB, &Bfr ); /* * Replicate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over C( IC:IC+M-1, JC:JC+k+kbb-1 ) */ PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Cd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WBC, WBCd, &WBCfr ); /* * C( IC:IC+M-1, JC:JC+k+kbb-1 ) += ALPHA * WBC * WAC */ Cnq = PB_Cnumroc( ktmp, 0, Cinb1, Cnb, mycol, Ccol, npcol ); if( ( Cmp0 > 0 ) && ( Cnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq, &kbb, ALPHA, WBC, &WBCd[LLD_], WAC, &WACd[LLD_], one, Cptr0, &Cld ); if( WBCfr ) free( WBC ); if( Bfr ) free( Bptr ); /* * Replicate WAC = A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over * B( IB:IB+M-1, JB:JB+k+kbb-1 ) -> WAB */ PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, WAC, 0, 0, WACd, ROW, &WAB, WABd, &WABfr ); /* * Zero upper triangle of WAB( 0:kbb-1, k:k+kbb-1 ) */ PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, zero, WAB, 0, k, WABd ); /* * WBC := ALPHA*B( IB:IB+M-1, JB:JB+k+kbb-1 )*A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )' */ PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WBC, WBCd, &WBCfr, &WBCsum ); Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp0, &kbb, &Bnq, ALPHA, Bptr0, &Bld, WAB, &WABd[LLD_], zero, WBC, &WBCd[LLD_] ); if( WABfr ) free( WAB ); if( WACfr ) free( WAC ); if( Afr ) free( Aptr ); if( WBCsum ) { WBCd[CSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cinb1, Cnb, Ccol, Ccol, npcol ); if( Bmp0 > 0 ) gsum2d( ctxt, ROW, &top, Bmp0, kbb, WBC, WBCd[LLD_], myrow, WBCd[CSRC_] ); } /* * C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) := C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) + WBC */ PB_CScatterV( TYPE, &ScatterDir, M, kbb, WBC, 0, 0, WBCd, COLUMN, one, C, IC, JC+k, DESCC, COLUMN ); if( WBCfr ) free( WBC ); } } } /* * End of PB_CpsymmAB */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpsymmBC.c000644 000766 000024 00000077011 10363532303 020427 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpsymmBC( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG, char * SIDE, char * UPLO, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_CpsymmBC( TYPE, DIRECAB, CONJUG, SIDE, UPLO, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * CONJUG, * DIRECAB, * SIDE, * UPLO; int IA, IB, IC, JA, JB, JC, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; char * A, * B, * C; #endif { /* * Purpose * ======= * * PB_CpsymmBC performs one of the matrix-matrix operations * * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * or * * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha and beta are scalars, sub( A ) is a symmetric or Hermitian * submatrix and sub( B ) and sub( C ) are m by n submatrices. * * This is the inner-product algorithm using the logical LCM hybrid * and static blocking techniques. The submatrix operand sub( A ) stays * in place. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECAB (global input) pointer to CHAR * On entry, DIRECAB specifies the direction in which the rows * or columns of sub( B ) should be looped over as follows: * DIRECAB = 'F' or 'f' forward or increasing, * DIRECAB = 'B' or 'b' backward or decreasing. * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether sub( A ) is a symmetric or * Hermitian submatrix operand as follows: * CONJUG = 'N' or 'n' sub( A ) is symmetric, * CONJUG = 'Z' or 'z' sub( A ) is Hermitian. * * SIDE (global input) pointer to CHAR * On entry, SIDE specifies whether the symmetric or Hermitian * submatrix sub( A ) appears on the left or right in the opera- * tion as follows: * * SIDE = 'L' or 'l' * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ), * * SIDE = 'R' or 'r' * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the submatrix sub( A ) are to be referenced as follows: * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * submatrix sub( A ) are referenced, * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * submatrix sub( A ) are referenced. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A and * B corresponding to the entries of the submatrices sub( A ) * and sub( B ) respectively need not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with SIDE = 'L' or 'l', this array contains * the local entries corresponding to the entries of the m by m * symmetric or Hermitian submatrix sub( A ), such that when * UPLO = 'U' or 'u', this array contains the local entries of * the upper triangular part of the submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric or Hermitian submatrix sub( A ), and the local * entries of the strictly upper triangular of sub( A ) are not * referenced. * Before entry with SIDE = 'R' or 'r', this array contains * the local entries corresponding to the entries of the n by n * symmetric or Hermitian submatrix sub( A ), such that when * UPLO = 'U' or 'u', this array contains the local entries of * the upper triangular part of the submatrix sub( A ), and the * local entries of the strictly lower triangular of sub( A ) * are not referenced, and when UPLO = 'L' or 'l', this array * contains the local entries of the lower triangular part of * the symmetric or Hermitian submatrix sub( A ), and the local * entries of the strictly upper triangular of sub( A ) are not * referenced. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( A ) need not be * set and assumed to be zero. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char GemmTa, GemmTb, cctop, * one, rctop, * talphaCR, * talphaRC, * tbeta, * zero; int Acol, Aii, Aimb1, Ainb1, Ajj, Alcmb, Ald, Alp, Alp0, Alq, Alq0, Amb, Amp, An, Anb, Anq, Arow, BCfwd, BCmyprocD, BCmyprocR, BCnD, BCnR, BCnprocsD, BCnprocsR, Bbufld, BcurrocR, Bfr, BiD, BiR, BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR, Bkk, Bld, BnbD, BnbR, BnpD, BnpR, Boff, BrocD, BrocR, BsrcR, Cfr, CiD, CiR, CiiD, CiiR, CinbD, CinbR, Cinb1D, Cinb1R, Ckk, CnbD, CnbR, CnpD, CnpR, Coff, CrocD, CrocR, CsrcR, Cbufld, CcurrocR, CisR, Cld, WBCfr, WBCld, WBRfr, WBRld, WCCfr, WCCld, WCCsum, WCRfr, WCRld, WCRsum, conjg, ctxt, l, lb, lcmb, lside, ltmp, maxp, maxpm1, maxq, mycol, myrow, n, nb, nbb, ncpq, npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp, upper; TZSYM_T tzsymm; GEMM_T gemm; GSUM2D_T gsum2d; /* * .. Local Arrays .. */ PB_VM_T VM; int Ad0 [DLEN_], DBUFB[DLEN_], DBUFC[DLEN_], WBCd[DLEN_], WBRd[DLEN_], WCCd [DLEN_], WCRd [DLEN_]; char * Aptr = NULL, * Bbuf = NULL, * Cbuf = NULL, * WBC = NULL, * WBR = NULL, * WCC = NULL, * WCR = NULL; /* .. * .. Executable Statements .. * */ Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); BCfwd = ( Mupcase( DIRECAB[0] ) == CFORWARD ); conjg = ( Mupcase( CONJUG [0] ) == CCONJG ); lside = ( Mupcase( SIDE [0] ) == CLEFT ); upper = ( Mupcase( UPLO [0] ) == CUPPER ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d; nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Compute local information for sub( A ), sub( B ) and sub( C ) */ if( lside ) { BCnD = An = M; BCnR = N; BCmyprocD = myrow; BCnprocsD = nprow; BCmyprocR = mycol; BCnprocsR = npcol; BiD = IB; BiR = JB; BinbD = DESCB[IMB_ ]; BinbR = DESCB[INB_]; BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ]; BsrcR = DESCB[CSRC_]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR, &BiiD, &BiiR, &BrocD, &BrocR ); CiD = IC; CiR = JC; CinbD = DESCC[IMB_ ]; CinbR = DESCC[INB_]; CnbD = DESCC[MB_ ]; CnbR = DESCC[NB_ ]; CsrcR = DESCC[CSRC_]; Cld = DESCC[LLD_]; PB_Cinfog2l( IC, JC, DESCC, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR, &CiiD, &CiiR, &CrocD, &CrocR ); } else { BCnD = An = N; BCnR = M; BCmyprocD = mycol; BCnprocsD = npcol; BCmyprocR = myrow; BCnprocsR = nprow; BiD = JB; BiR = IB; BinbR = DESCB[IMB_ ]; BinbD = DESCB[INB_]; BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ]; BsrcR = DESCB[RSRC_]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD, &BiiR, &BiiD, &BrocR, &BrocD ); CiD = JC; CiR = IC; CinbR = DESCC[IMB_ ]; CinbD = DESCC[INB_]; CnbR = DESCC[MB_ ]; CnbD = DESCC[NB_ ]; CsrcR = DESCC[RSRC_]; Cld = DESCC[LLD_]; PB_Cinfog2l( IC, JC, DESCC, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD, &CiiR, &CiiD, &CrocR, &CrocD ); } Binb1D = PB_Cfirstnb( BCnD, BiD, BinbD, BnbD ); BnpD = PB_Cnumroc( BCnD, 0, Binb1D, BnbD, BCmyprocD, BrocD, BCnprocsD ); Binb1R = PB_Cfirstnb( BCnR, BiR, BinbR, BnbR ); BisR = ( ( BsrcR < 0 ) || ( BCnprocsR == 1 ) ); Cinb1D = PB_Cfirstnb( BCnD, CiD, CinbD, CnbD ); CnpD = PB_Cnumroc( BCnD, 0, Cinb1D, CnbD, BCmyprocD, CrocD, BCnprocsD ); Cinb1R = PB_Cfirstnb( BCnR, CiR, CinbR, CnbR ); CisR = ( ( CsrcR < 0 ) || ( BCnprocsR == 1 ) ); /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Amp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( A, Aii, Ajj, Ald, size ); /* * Retrieve the BLACS combine topologies, compute conjugate of alpha for the * Hermitian case and set the transpose parameters to be passed to the BLAS * matrix multiply routine. */ rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( conjg ) { tzsymm = PB_Ctzhemm; if( lside ) { talphaRC = ALPHA; GemmTa = CCOTRAN; GemmTb = CTRAN; talphaCR = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talphaCR ); } else { talphaCR = ALPHA; GemmTa = CTRAN; GemmTb = CCOTRAN; talphaRC = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talphaRC ); } } else { tzsymm = PB_Ctzsymm; talphaCR = talphaRC = ALPHA; GemmTa = CTRAN; GemmTb = CTRAN; } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ Alcmb = 2 * nb * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); /* * When sub( B ) is not replicated and backward pass on sub( B ), find the * virtual process q owning the last row or column of sub( B ). */ if( !( BisR ) && !( BCfwd ) ) { tmp = PB_Cindxg2p( BCnR - 1, Binb1R, BnbR, BrocR, BrocR, BCnprocsR ); q = MModSub( tmp, BrocR, BCnprocsR ); } /* * When sub( C ) is not replicated and backward pass on sub( C ), find the * virtual process p owning the last row or column of sub( C ). */ if( !( CisR ) && !( BCfwd ) ) { tmp = PB_Cindxg2p( BCnR - 1, Cinb1R, CnbR, CrocR, CrocR, BCnprocsR ); p = MModSub( tmp, CrocR, BCnprocsR ); } /* * Loop over the virtual process grid induced by the rows or columns of * sub( B ) and sub( C ). */ lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : BCnprocsR ) ) * CnbR, ( maxq = ( BisR ? 1 : BCnprocsR ) ) * BnbR ); n = BCnR; maxpm1 = maxp - 1; while( n > 0 ) { /* * Initialize local virtual matrix in process (p,q) */ BcurrocR = ( BisR ? -1 : MModAdd( BrocR, q, BCnprocsR ) ); Bkk = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BCnprocsR ); BnpR = PB_Cnumroc( BCnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BCnprocsR ); CcurrocR = ( CisR ? -1 : MModAdd( CrocR, p, BCnprocsR ) ); Ckk = PB_Cg2lrem( CiR, CinbR, CnbR, CcurrocR, CsrcR, BCnprocsR ); CnpR = PB_Cnumroc( BCnR, 0, Cinb1R, CnbR, CcurrocR, CrocR, BCnprocsR ); PB_CVMinit( &VM, 0, CnpR, BnpR, Cinb1R, Binb1R, CnbR, BnbR, p, q, maxp, maxq, lcmb ); /* * Find how many diagonals in this virtual process */ npq = PB_CVMnpq( &VM ); n -= npq; /* * Re-adjust the number of rows or columns to be (un)packed, in order to * average the message sizes. */ if( npq ) nbb = npq / ( ( npq - 1 ) / nb + 1 ); while( npq ) { nbb = MIN( nbb, npq ); /* * Find out how many rows or columns of sub( B ) and sub( C ) are contiguous */ PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Boff ); if( lside ) { /* * Compute the descriptor DBUFB for the buffer that will contained the packed * columns of sub( B ). */ if( ( Bfr = ( ncpq < nbb ) ) != 0 ) { /* * If columns of sub( B ) are not contiguous, then allocate the buffer and * pack the kbb columns of sub( B ). */ Bbufld = MAX( 1, BnpD ); if( BisR || ( BCmyprocR == BcurrocR ) ) { Bbuf = PB_Cmalloc( BnpD * nbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, COLUMN, PACKING, NOTRAN, nbb, BnpD, one, Mptr( B, BiiD, Bkk, Bld, size ), Bld, zero, Bbuf, Bbufld ); } } else { /* * Otherwise, re-use sub( B ) directly. */ Bbufld = Bld; if( BisR || ( BCmyprocR == BcurrocR ) ) Bbuf = Mptr( B, BiiD, Bkk+Boff, Bld, size ); } PB_Cdescset( DBUFB, BCnD, nbb, Binb1D, nbb, BnbD, nbb, BrocD, BcurrocR, ctxt, Bbufld ); /* * Replicate this panel of columns of sub( B ) as well as its transposed * over sub( A ) -> WBC, WBR */ PB_CInV( TYPE, NOCONJG, COLUMN, An, An, Ad0, nbb, Bbuf, 0, 0, DBUFB, COLUMN, &WBC, WBCd, &WBCfr ); PB_CInV( TYPE, NOCONJG, ROW, An, An, Ad0, nbb, WBC, 0, 0, WBCd, COLUMN, &WBR, WBRd, &WBRfr ); } else { /* * Compute the descriptor DBUFB for the buffer that will contained the packed * rows of sub( B ). */ if( ( Bfr = ( ncpq < nbb ) ) != 0 ) { /* * If rows of sub( B ) are not contiguous, then allocate the buffer and pack * the kbb rows of sub( B ). */ Bbufld = nbb; if( BisR || ( BCmyprocR == BcurrocR ) ) { Bbuf = PB_Cmalloc( BnpD * nbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, ROW, PACKING, NOTRAN, nbb, BnpD, one, Mptr( B, Bkk, BiiD, Bld, size ), Bld, zero, Bbuf, Bbufld ); } } else { /* * Otherwise, re-use sub( B ) directly. */ Bbufld = Bld; if( BisR || ( BCmyprocR == BcurrocR ) ) Bbuf = Mptr( B, Bkk+Boff, BiiD, Bld, size ); } PB_Cdescset( DBUFB, nbb, BCnD, nbb, Binb1D, nbb, BnbD, BcurrocR, BrocD, ctxt, Bbufld ); /* * Replicate this panel of rows of sub( B ) as well as its transposed * over sub( A ) -> WBR, WBC */ PB_CInV( TYPE, NOCONJG, ROW, An, An, Ad0, nbb, Bbuf, 0, 0, DBUFB, ROW, &WBR, WBRd, &WBRfr ); PB_CInV( TYPE, NOCONJG, COLUMN, An, An, Ad0, nbb, WBR, 0, 0, WBRd, ROW, &WBC, WBCd, &WBCfr ); } /* * Allocate space for temporary results in scope of sub( A ) -> WCC, WCR */ PB_COutV( TYPE, COLUMN, INIT, An, An, Ad0, nbb, &WCC, WCCd, &WCCfr, &WCCsum ); PB_COutV( TYPE, ROW, INIT, An, An, Ad0, nbb, &WCR, WCRd, &WCRfr, &WCRsum ); /* * Local matrix-matrix multiply iff I own some data */ WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_]; WCCld = WCCd[LLD_]; WCRld = WCRd[LLD_]; if( ( Amp > 0 ) && ( Anq > 0 ) ) { if( upper ) { /* * sub( A ) is upper triangular */ for( l = 0; l < An; l += Alcmb ) { lb = An - l; lb = MIN( lb, Alcmb ); Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow ); Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol ); Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol ); if( Alp > 0 && Alq0 > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &GemmTb ), &Alp, &nbb, &Alq0, talphaRC, Mptr( Aptr, 0, Alq, Ald, size ), &Ald, Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one, WCC, &WCCld ); gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( NOTRAN ), &nbb, &Alq0, &Alp, talphaCR, WBC, &WBCld, Mptr( Aptr, 0, Alq, Ald, size ), &Ald, one, Mptr( WCR, 0, Alq, WCRld, size ), &WCRld ); } PB_Cpsym( TYPE, TYPE, SIDE, UPPER, lb, nbb, ALPHA, Aptr, l, l, Ad0, Mptr( WBC, Alp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Alq, WBRld, size ), WBRld, Mptr( WCC, Alp, 0, WCCld, size ), WCCld, Mptr( WCR, 0, Alq, WCRld, size ), WCRld, tzsymm ); } } else { /* * sub( A ) is lower triangular */ for( l = 0; l < An; l += Alcmb ) { lb = An - l; ltmp = l + ( lb = MIN( lb, Alcmb ) ); Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow ); Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cpsym( TYPE, TYPE, SIDE, LOWER, lb, nbb, ALPHA, Aptr, l, l, Ad0, Mptr( WBC, Alp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Alq, WBRld, size ), WBRld, Mptr( WCC, Alp, 0, WCCld, size ), WCCld, Mptr( WCR, 0, Alq, WCRld, size ), WCRld, tzsymm ); Alp = PB_Cnumroc( ltmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Alp0 = Amp - Alp; Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol ); if( Alp0 > 0 && Alq0 > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &GemmTb ), &Alp0, &nbb, &Alq0, talphaRC, Mptr( Aptr, Alp, Alq, Ald, size ), &Ald, Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one, Mptr( WCC, Alp, 0, WCCld, size ), &WCCld ); gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( NOTRAN ), &nbb, &Alq0, &Alp0, talphaCR, Mptr( WBC, Alp, 0, WBCld, size ), &WBCld, Mptr( Aptr, Alp, Alq, Ald, size ), &Ald, one, Mptr( WCR, 0, Alq, WCRld, size ), &WCRld ); } } } } if( WBCfr ) free( WBC ); if( WBRfr ) free( WBR ); if( Bfr && ( BisR || ( BCmyprocR == BcurrocR ) ) ) if( Bbuf ) free( Bbuf ); if( lside ) { /* * Accumulate the intermediate results in WCC and WCR */ if( WCCsum ) { WCCd[CSRC_] = CcurrocR; if( Amp > 0 ) gsum2d( ctxt, ROW, &rctop, Amp, nbb, WCC, WCCld, myrow, WCCd[CSRC_] ); } if( WCRsum ) { WCRd[RSRC_] = 0; if( Anq > 0 ) gsum2d( ctxt, COLUMN, &cctop, nbb, Anq, WCR, WCRld, WCRd[RSRC_], mycol ); } /* * WCC := WCC + WCR' */ PB_Cpaxpby( TYPE, CONJUG, nbb, An, one, WCR, 0, 0, WCRd, ROW, one, WCC, 0, 0, WCCd, COLUMN ); if( WCRfr ) free( WCR ); /* * Compute the descriptor DBUFC for the buffer that will contained the packed * columns of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < nbb ) ) != 0 ) { /* * If columns of sub( C ) are not contiguous, then allocate the buffer */ Cbufld = MAX( 1, CnpD ); tbeta = zero; if( CisR || ( BCmyprocR == CcurrocR ) ) Cbuf = PB_Cmalloc( CnpD * nbb * size ); } else { /* * Otherwise re-use sub( C ) */ Cbufld = Cld; tbeta = BETA; if( CisR || ( BCmyprocR == CcurrocR ) ) Cbuf = Mptr( C, CiiD, Ckk+Coff, Cld, size ); } PB_Cdescset( DBUFC, BCnD, nbb, Cinb1D, nbb, CnbD, nbb, CrocD, CcurrocR, ctxt, Cbufld ); /* * sub( C ) := beta * sub( C ) + WCC */ PB_Cpaxpby( TYPE, NOCONJG, An, nbb, one, WCC, 0, 0, WCCd, COLUMN, tbeta, Cbuf, 0, 0, DBUFC, COLUMN ); if( WCCfr ) free( WCC ); /* * Unpack the kbb columns of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) ) { PB_CVMpack( TYPE, &VM, ROW, COLUMN, UNPACKING, NOTRAN, nbb, CnpD, BETA, Mptr( C, CiiD, Ckk, Cld, size ), Cld, one, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } } else { /* * Accumulate the intermediate results in WCC and WCR */ if( WCCsum ) { WCCd[CSRC_] = 0; if( Amp > 0 ) gsum2d( ctxt, ROW, &rctop, Amp, nbb, WCC, WCCld, myrow, WCCd[CSRC_] ); } if( WCRsum ) { WCRd[RSRC_] = CcurrocR; if( Anq > 0 ) gsum2d( ctxt, COLUMN, &cctop, nbb, Anq, WCR, WCRld, WCRd[RSRC_], mycol ); } /* * WCR := WCR + WCC' */ PB_Cpaxpby( TYPE, CONJUG, An, nbb, one, WCC, 0, 0, WCCd, COLUMN, one, WCR, 0, 0, WCRd, ROW ); if( WCCfr ) free( WCC ); /* * Compute the descriptor DBUFC for the buffer that will contained the packed * rows of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < nbb ) ) != 0 ) { /* * If rows of sub( C ) are not contiguous, then allocate receiving buffer. */ Cbufld = nbb; tbeta = zero; if( CisR || ( BCmyprocR == CcurrocR ) ) Cbuf = PB_Cmalloc( CnpD * nbb * size ); } else { /* * Otherwise re-use sub( C ) */ Cbufld = Cld; tbeta = BETA; if( CisR || ( BCmyprocR == CcurrocR ) ) Cbuf = Mptr( C, Ckk+Coff, CiiD, Cld, size ); } PB_Cdescset( DBUFC, nbb, BCnD, nbb, Cinb1D, nbb, CnbD, CcurrocR, CrocD, ctxt, Cbufld ); /* * sub( C ) := beta * sub( C ) + WCR */ PB_Cpaxpby( TYPE, NOCONJG, nbb, An, one, WCR, 0, 0, WCRd, ROW, tbeta, Cbuf, 0, 0, DBUFC, ROW ); if( WCRfr ) free( WCR ); /* * Unpack the kbb rows of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) ) { PB_CVMpack( TYPE, &VM, ROW, ROW, UNPACKING, NOTRAN, nbb, CnpD, BETA, Mptr( C, Ckk, CiiD, Cld, size ), Cld, one, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } } /* * Update the local indexes of sub( B ) and sub( C ) */ PB_CVMupdate( &VM, nbb, &Ckk, &Bkk ); npq -= nbb; } /* * Go to next or previous virtual process row or column */ if( ( BCfwd && ( p == maxpm1 ) ) || ( !( BCfwd ) && ( p == 0 ) ) ) q = ( BCfwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) ); p = ( BCfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) ); } if( conjg ) free( ( lside ? talphaCR : talphaRC ) ); /* * End of PB_CpsymmBC */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cpsyr.c000644 000766 000024 00000051353 10363532303 020053 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cpsyr( PBTYP_T * TYPE, char * UPLO, int N, int K, char * ALPHA, char * XC, int LDXC, char * XR, int LDXR, char * A, int IA, int JA, int * DESCA, TZSYR_T SYR ) #else void PB_Cpsyr( TYPE, UPLO, N, K, ALPHA, XC, LDXC, XR, LDXR, A, IA, JA, DESCA, SYR ) /* * .. Scalar Arguments .. */ char * UPLO; int IA, JA, K, LDXC, LDXR, N; char * ALPHA; PBTYP_T * TYPE; TZSYR_T SYR; /* * .. Array Arguments .. */ int * DESCA; char * A, * XC, * XR; #endif { /* * Purpose * ======= * * PB_Cpsyr performs a symmetric or Hermitian rank-k update of the sub- * matrix sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric or Hermitian submatrix sub( A ) are to be * referenced as follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the sym- * metric or Hermitian submatrix sub( A ) * are to be referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the sym- * metric or Hermitian submatrix sub( A ) * are to be referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the local number of columns of the lo- * cal array XC and the local number of rows of the local array * XR. K mut be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * XC (local input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,K). Before entry, * this array contains the local entries of the matrix XC. * * LDXC (local input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, Lp( IA, N ) ). * * XR (local input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,Kx), where Kx is * at least Lc( JA, N ). Before entry, this array contains the * local entries of the matrix XR. * * LDXR (local input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least max( 1, K ). * * A (local input/local output) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the @(syhec) submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the @(syhec) submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * SYR (local input) pointer to function of type TZSYR_T * On entry, SYR specifies the function performing the update of * a single block. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Arow, Aii, Aimb1, Ainb1, Ajj, Ald, Amp, Amb, Anb, Anq, Aoffi, Aoffj, Arcol, Arrow, GoEast, GoSouth, IsColRepl, IsRowRepl, XCinc, XRinc, Xii=0, Xjj=0, Xoffi=-1, Xoffj=-1, iimax, ilow, imbloc, inbloc, ioffd, ioffx, iupp, jjmax, joffd, joffx, lcmt, lcmt00, lmbloc, lnbloc, low, lower, m1, mbloc, mblkd, mblks, mycol, myrow, n1, nbloc, nblkd, nblks, npcol, nprow, pmb, qnb, size, tmp1, upp, upper; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( N <= 0 ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( DESCA[CTXT_], &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cainfog2l( N, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aimb1, &Ainb1, &Amp, &Anq, &Aii, &Ajj, &Arow, &Acol, &Arrow, &Arcol ); /* * Quick return if I don't own any of sub( A ) or if sub( A ) is replicated in * all processes. */ if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return; Amb = DESCA[ MB_ ]; Anb = DESCA[ NB_ ]; Ald = DESCA[LLD_]; IsRowRepl = ( ( Arow < 0 ) || ( nprow == 1 ) ); IsColRepl = ( ( Acol < 0 ) || ( npcol == 1 ) ); size = TYPE->size; if( IsRowRepl && IsColRepl ) { SYR( TYPE, UPLO, Amp, Anq, K, 0, ALPHA, XC, LDXC, XR, LDXR, Mptr( A, Aii, Ajj, Ald, size ), Ald ); return; } XCinc = size; XRinc = LDXR * size; upper = ( Mupcase( UPLO[0] ) == CUPPER ); lower = ( Mupcase( UPLO[0] ) == CLOWER ); /* * Initialize lcmt00, mblks, nblks, imbloc, inbloc, lmbloc, lnbloc, ilow, low, * iupp, and upp. */ PB_Cbinfo( 0, Amp, Anq, Aimb1, Ainb1, Amb, Anb, Arrow, Arcol, &lcmt00, &mblks, &nblks, &imbloc, &inbloc, &lmbloc, &lnbloc, &ilow, &low, &iupp, &upp ); iimax = ( Aoffi = Aii - 1 ) + ( m1 = Amp ); jjmax = ( Aoffj = Ajj - 1 ) + ( n1 = Anq ); pmb = ( IsRowRepl ? Amb : nprow * Amb ); qnb = ( IsColRepl ? Anb : npcol * Anb ); /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); /* * Go through the table looking for blocks owning diagonal entries. */ if( ( !( GoSouth ) ) && ( !( GoEast ) ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ SYR( TYPE, UPLO, imbloc, inbloc, K, lcmt00, ALPHA, XC+Xii*XCinc, LDXC, XR+Xjj*XRinc, LDXR, Mptr( A, Aii, Ajj, Ald, size ), Ald ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); if( GoSouth ) { /* * When the upper triangular part of sub( A ) should be updated and one is * planning to go south in the table, it is neccessary to take care of the * remaining columns of these imbloc rows immediately. */ if( upper && ( Anq > inbloc ) ) { tmp1 = Anq - inbloc; SYR( TYPE, ALL, imbloc, tmp1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, XR+(Xjj+inbloc)*XRinc, LDXR, Mptr( A, Aii, Ajj+inbloc, Ald, size ), Ald ); } Aii += imbloc; Xii += imbloc; m1 -= imbloc; } else { /* * When the lower triangular part of sub( A ) should be updated and one is * planning to go east in the table, it is neccessary to take care of the * remaining rows of these inbloc columns immediately. */ if( lower && ( Amp > imbloc ) ) { tmp1 = Amp - imbloc; SYR( TYPE, ALL, tmp1, inbloc, K, 0, ALPHA, XC+(Xii+imbloc)*XCinc, LDXC, XR+Xjj*XRinc, LDXR, Mptr( A, Aii+imbloc, Ajj, Ald, size ), Ald ); } Ajj += inbloc; Xjj += inbloc; n1 -= inbloc; } } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the local row indexes in A and XC. */ lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; Xoffi += imbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row indexes in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; Xoffi += Amb; } /* * Update the upper triangular part of sub( A ) we just skipped when necessary. */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { SYR( TYPE, ALL, tmp1, n1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald ); Aii += tmp1; Xii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffx = Xoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; SYR( TYPE, UPLO, mbloc, inbloc, K, lcmt, ALPHA, XC+(ioffx+1)*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; ioffd += mbloc; Xoffi = ioffx; ioffx += mbloc; } /* * Update the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) SYR( TYPE, ALL, tmp1, inbloc, K, 0, ALPHA, XC+(ioffx+1)*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald ); tmp1 = Aoffi - Aii + 1; m1 -= tmp1; n1 -= inbloc; lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; Xoffj += inbloc; /* * Update the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) SYR( TYPE, ALL, tmp1, n1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; Xii = Xoffi + 1; Xjj = Xoffj + 1; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as well as * the local column index in A and XR. */ lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; Xoffj += inbloc; /* * While there are blocks remaining that own lower entries, keep going east. * Adjust the current LCM value as well as the local column index in A and XR. */ while( ( nblks > 0 ) && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; Aoffj += Anb; Xoffj += Anb; } /* * Update the lower triangular part of sub( A ). */ tmp1 = MIN( Aoffj, jjmax ) - Ajj + 1; if( lower && ( tmp1 > 0 ) ) { SYR( TYPE, ALL, m1, tmp1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, XR+Xjj*XRinc, LDXR, Mptr( A, Aii, Ajj, Ald, size ), Ald ); Ajj += tmp1; Xjj += tmp1; n1 -= tmp1; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return; /* * lcmt00 >= low. The current block owns either diagonals or upper entries. * Save the current position in the LCM table. After this row has been * completely taken care of, re-start from this column and the next row of * the LCM table. */ lcmt = lcmt00; nblkd = nblks; joffd = Aoffj; joffx = Xoffj; nbloc = Anb; while( ( nblkd > 0 ) && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ if( nblkd == 1 ) nbloc = lnbloc; SYR( TYPE, UPLO, imbloc, nbloc, K, lcmt, ALPHA, XC+Xii*XCinc, LDXC, XR+(joffx+1)*XRinc, LDXR, Mptr( A, Aii, joffd+1, Ald, size ), Ald ); lcmt00 = lcmt; lcmt += qnb; nblks = nblkd; nblkd--; Aoffj = joffd; joffd += nbloc; Xoffj = joffx; joffx += nbloc; } /* * Update the upper triangular part of sub( A ). */ tmp1 = n1 - joffd + Ajj - 1; if( upper && ( tmp1 > 0 ) ) SYR( TYPE, ALL, imbloc, tmp1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, XR+(joffx+1)*XRinc, LDXR, Mptr( A, Aii, (joffd+1), Ald, size ), Ald ); tmp1 = Aoffj - Ajj + 1; m1 -= imbloc; n1 -= tmp1; lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; Xoffi += imbloc; /* * Update the lower triangular part of sub( A ). */ if( lower && ( m1 > 0 ) && ( tmp1 > 0 ) ) SYR( TYPE, ALL, m1, tmp1, K, 0, ALPHA, XC+(Xoffi+1)*XCinc, LDXC, XR+Xjj*XRinc, LDXR, Mptr( A, Aoffi+1, Ajj, Ald, size ), Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; Xii = Xoffi + 1; Xjj = Xoffj + 1; } /* * Loop over the remaining columns of the LCM table. */ nbloc = Anb; while( nblks > 0 ) { if( nblks == 1 ) nbloc = lnbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row index in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; Xoffi += Amb; } /* * Update the upper triangular part of sub( A ). */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { SYR( TYPE, ALL, tmp1, n1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald ); Aii += tmp1; Xii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffx = Xoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; SYR( TYPE, UPLO, mbloc, nbloc, K, lcmt, ALPHA, XC+(ioffx+1)*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; Xoffi = ioffx; ioffd += mbloc; ioffx += mbloc; } /* * Update the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) SYR( TYPE, ALL, tmp1, nbloc, K, 0, ALPHA, XC+(ioffx+1)*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald ); tmp1 = MIN( Aoffi, iimax ) - Aii + 1; m1 -= tmp1; n1 -= nbloc; lcmt00 += qnb; nblks--; Aoffj += nbloc; Xoffj += nbloc; /* * Update the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) SYR( TYPE, ALL, tmp1, n1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, XR+(Xoffj+1)*XRinc, LDXR, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; Xii = Xoffi + 1; Xjj = Xoffj + 1; } /* * End of PB_Cpsyr */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cpsyr2.c000644 000766 000024 00000055412 10363532303 020135 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cpsyr2( PBTYP_T * TYPE, char * UPLO, int N, int K, char * ALPHA, char * XC, int LDXC, char * XR, int LDXR, char * YC, int LDYC, char * YR, int LDYR, char * A, int IA, int JA, int * DESCA, TZSYR2_T SYR2 ) #else void PB_Cpsyr2( TYPE, UPLO, N, K, ALPHA, XC, LDXC, XR, LDXR, YC, LDYC, YR, LDYR, A, IA, JA, DESCA, SYR2 ) /* * .. Scalar Arguments .. */ char * UPLO; int IA, JA, K, LDXC, LDXR, LDYC, LDYR, N; char * ALPHA; PBTYP_T * TYPE; TZSYR2_T SYR2; /* * .. Array Arguments .. */ int * DESCA; char * A, * XC, * XR, * YC, * YR; #endif { /* * Purpose * ======= * * PB_Cpsyr2 performs a symmetric or Hermitian rank-2 update of the sub- * matrix sub( A ) denoting A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of * the array A containing the upper or lower triangular part * of the symmetric or Hermitian submatrix sub( A ) are to be * referenced as follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the sym- * metric or Hermitian submatrix sub( A ) * are to be referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the sym- * metric or Hermitian submatrix sub( A ) * are to be referenced. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the local number of columns of the lo- * cal array XC and the local number of rows of the local array * XR. K mut be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * XC (local input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,K). Before entry, * this array contains the local entries of the matrix XC. * * LDXC (local input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, Lp( IA, N ) ). * * YC (local input) pointer to CHAR * On entry, YC is an array of dimension (LDYC,K). Before entry, * this array contains the local entries of the matrix YC. * * LDYC (local input) INTEGER * On entry, LDYC specifies the leading dimension of the array * YC. LDYC must be at least max( 1, Lp( IA, N ) ). * * XR (local input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,Kx), where Kx is * at least Lc( JA, N ). Before entry, this array contains the * local entries of the matrix XR. * * LDXR (local input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least max( 1, K ). * * YR (local input) pointer to CHAR * On entry, YR is an array of dimension (LDYR,Ky), where Ky is * at least Lc( JA, N ). Before entry, this array contains the * local entries of the matrix YR. * * LDYR (local input) INTEGER * On entry, LDYR specifies the leading dimension of the array * YR. LDYR must be at least max( 1, K ). * * A (local input/local output) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the @(syhec) submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. On exit, the upper triangular part of * sub( A ) is overwritten by the upper triangular part of the * updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the @(syhec) submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. On exit, the lower triangular part of * sub( A ) is overwritten by the lower triangular part of the * updated submatrix. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * SYR2 (local input) pointer to function of type TZSYR2_T * On entry, SYR2 specifies the function performing the update * of a single block. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Arow, Aii, Aimb1, Ainb1, Ajj, Ald, Amp, Amb, Anb, Anq, Aoffi, Aoffj, Arcol, Arrow, GoEast, GoSouth, IsColRepl, IsRowRepl, XCinc, XRinc, Xii=0, Xjj=0, Xoffi=-1, Xoffj=-1, YCinc, YRinc, iimax, ilow, imbloc, inbloc, ioffd, ioffx, iupp, jjmax, joffd, joffx, lcmt, lcmt00, lmbloc, lnbloc, low, lower, m1, mbloc, mblkd, mblks, mycol, myrow, n1, nbloc, nblkd, nblks, npcol, nprow, pmb, qnb, size, tmp1, upp, upper; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( N <= 0 ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( DESCA[CTXT_], &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cainfog2l( N, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aimb1, &Ainb1, &Amp, &Anq, &Aii, &Ajj, &Arow, &Acol, &Arrow, &Arcol ); /* * Quick return if I don't own any of sub( A ) or if sub( A ) is replicated in * all processes. */ if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return; IsRowRepl = ( ( Arow < 0 ) || ( nprow == 1 ) ); IsColRepl = ( ( Acol < 0 ) || ( npcol == 1 ) ); Amb = DESCA[ MB_ ]; Anb = DESCA[ NB_ ]; Ald = DESCA[LLD_]; size = TYPE->size; if( IsRowRepl && IsColRepl ) { SYR2( TYPE, UPLO, Amp, Anq, K, 0, ALPHA, XC, LDXC, YC, LDYC, XR, LDXR, YR, LDYR, Mptr( A, Aii, Ajj, Ald, size ), Ald ); return; } XCinc = size; XRinc = LDXR * size; YCinc = size; YRinc = LDYR * size; upper = ( Mupcase( UPLO[0] ) == CUPPER ); lower = ( Mupcase( UPLO[0] ) == CLOWER ); /* * Initialize lcmt00, mblks, nblks, imbloc, inbloc, lmbloc, lnbloc, ilow, low, * iupp, and upp. */ PB_Cbinfo( 0, Amp, Anq, Aimb1, Ainb1, Amb, Anb, Arrow, Arcol, &lcmt00, &mblks, &nblks, &imbloc, &inbloc, &lmbloc, &lnbloc, &ilow, &low, &iupp, &upp ); iimax = ( Aoffi = Aii - 1 ) + ( m1 = Amp ); jjmax = ( Aoffj = Ajj - 1 ) + ( n1 = Anq ); pmb = ( IsRowRepl ? Amb : nprow * Amb ); qnb = ( IsColRepl ? Anb : npcol * Anb ); /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); /* * Go through the table looking for blocks owning diagonal entries. */ if( ( !( GoSouth ) ) && ( !( GoEast ) ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ SYR2( TYPE, UPLO, imbloc, inbloc, K, lcmt00, ALPHA, XC+Xii*XCinc, LDXC, YC+Xii*YCinc, LDYC, XR+Xjj*XRinc, LDXR, YR+Xjj*YRinc, LDYR, Mptr( A, Aii, Ajj, Ald, size ), Ald ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); if( GoSouth ) { /* * When the upper triangular part of sub( A ) should be updated and one is * planning to go south in the table, it is neccessary to take care of the * remaining columns of these imbloc rows immediately. */ if( upper && ( Anq > inbloc ) ) { tmp1 = Anq - inbloc; SYR2( TYPE, ALL, imbloc, tmp1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, YC+Xii*YCinc, LDYC, XR+(Xjj+inbloc)*XRinc, LDXR, YR+(Xjj+inbloc)*YRinc, LDYR, Mptr( A, Aii, Ajj+inbloc, Ald, size ), Ald ); } Aii += imbloc; Xii += imbloc; m1 -= imbloc; } else { /* * When the lower triangular part of sub( A ) should be updated and one is * planning to go east in the table, it is neccessary to take care of the * remaining rows of these inbloc columns immediately. */ if( lower && ( Amp > imbloc ) ) { tmp1 = Amp - imbloc; SYR2( TYPE, ALL, tmp1, inbloc, K, 0, ALPHA, XC+(Xii+imbloc)*XCinc, LDXC, YC+(Xii+imbloc)*YCinc, LDYC, XR+Xjj*XRinc, LDXR, YR+Xjj*YRinc, LDYR, Mptr( A, Aii+imbloc, Ajj, Ald, size ), Ald ); } Ajj += inbloc; Xjj += inbloc; n1 -= inbloc; } } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the local row indexes in A and XC. */ lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; Xoffi += imbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row indexes in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; Xoffi += Amb; } /* * Update the upper triangular part of sub( A ) we just skipped when necessary. */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { SYR2( TYPE, ALL, tmp1, n1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, YC+Xii*YCinc, LDYC, XR+(Xoffj+1)*XRinc, LDXR, YR+(Xoffj+1)*YRinc, LDYR, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald ); Aii += tmp1; Xii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffx = Xoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; SYR2( TYPE, UPLO, mbloc, inbloc, K, lcmt, ALPHA, XC+(ioffx+1)*XCinc, LDXC, YC+(ioffx+1)*YCinc, LDYC, XR+(Xoffj+1)*XRinc, LDXR, YR+(Xoffj+1)*YRinc, LDYR, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; ioffd += mbloc; Xoffi = ioffx; ioffx += mbloc; } /* * Update the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) SYR2( TYPE, ALL, tmp1, inbloc, K, 0, ALPHA, XC+(ioffx+1)*XCinc, LDXC, YC+(ioffx+1)*YCinc, LDYC, XR+(Xoffj+1)*XRinc, LDXR, YR+(Xoffj+1)*YRinc, LDYR, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald ); tmp1 = Aoffi - Aii + 1; m1 -= tmp1; n1 -= inbloc; lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; Xoffj += inbloc; /* * Update the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) SYR2( TYPE, ALL, tmp1, n1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, YC+Xii*YCinc, LDYC, XR+(Xoffj+1)*XRinc, LDXR, YR+(Xoffj+1)*YRinc, LDYR, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; Xii = Xoffi + 1; Xjj = Xoffj + 1; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as well as * the local column index in A and XR. */ lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; Xoffj += inbloc; /* * While there are blocks remaining that own lower entries, keep going east. * Adjust the current LCM value as well as the local column index in A and XR. */ while( ( nblks > 0 ) && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; Aoffj += Anb; Xoffj += Anb; } /* * Update the lower triangular part of sub( A ). */ tmp1 = MIN( Aoffj, jjmax ) - Ajj + 1; if( lower && ( tmp1 > 0 ) ) { SYR2( TYPE, ALL, m1, tmp1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, YC+Xii*YCinc, LDYC, XR+Xjj*XRinc, LDXR, YR+Xjj*YRinc, LDYR, Mptr( A, Aii, Ajj, Ald, size ), Ald ); Ajj += tmp1; Xjj += tmp1; n1 -= tmp1; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return; /* * lcmt00 >= low. The current block owns either diagonals or upper entries. * Save the current position in the LCM table. After this row has been * completely taken care of, re-start from this column and the next row of * the LCM table. */ lcmt = lcmt00; nblkd = nblks; joffd = Aoffj; joffx = Xoffj; nbloc = Anb; while( ( nblkd > 0 ) && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ if( nblkd == 1 ) nbloc = lnbloc; SYR2( TYPE, UPLO, imbloc, nbloc, K, lcmt, ALPHA, XC+Xii*XCinc, LDXC, YC+Xii*YCinc, LDYC, XR+(joffx+1)*XRinc, LDXR, YR+(joffx+1)*YRinc, LDYR, Mptr( A, Aii, joffd+1, Ald, size ), Ald ); lcmt00 = lcmt; lcmt += qnb; nblks = nblkd; nblkd--; Aoffj = joffd; joffd += nbloc; Xoffj = joffx; joffx += nbloc; } /* * Update the upper triangular part of sub( A ). */ tmp1 = n1 - joffd + Ajj - 1; if( upper && ( tmp1 > 0 ) ) SYR2( TYPE, ALL, imbloc, tmp1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, YC+Xii*YCinc, LDYC, XR+(joffx+1)*XRinc, LDXR, YR+(joffx+1)*YRinc, LDYR, Mptr( A, Aii, joffd+1, Ald, size ), Ald ); tmp1 = Aoffj - Ajj + 1; m1 -= imbloc; n1 -= tmp1; lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; Xoffi += imbloc; /* * Update the lower triangular part of sub( A ). */ if( lower && ( m1 > 0 ) && ( tmp1 > 0 ) ) SYR2( TYPE, ALL, m1, tmp1, K, 0, ALPHA, XC+(Xoffi+1)*XCinc, LDXC, YC+(Xoffi+1)*YCinc, LDYC, XR+Xjj*XRinc, LDXR, YR+Xjj*YRinc, LDYR, Mptr( A, Aoffi+1, Ajj, Ald, size ), Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; Xii = Xoffi + 1; Xjj = Xoffj + 1; } /* * Loop over the remaining columns of the LCM table. */ nbloc = Anb; while( nblks > 0 ) { if( nblks == 1 ) nbloc = lnbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row index in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; Xoffi += Amb; } /* * Update the upper triangular part of sub( A ). */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { SYR2( TYPE, ALL, tmp1, n1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, YC+Xii*YCinc, LDYC, XR+(Xoffj+1)*XRinc, LDXR, YR+(Xoffj+1)*YRinc, LDYR, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald ); Aii += tmp1; Xii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffx = Xoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; SYR2( TYPE, UPLO, mbloc, nbloc, K, lcmt, ALPHA, XC+(ioffx+1)*XCinc, LDXC, YC+(ioffx+1)*YCinc, LDYC, XR+(Xoffj+1)*XRinc, LDXR, YR+(Xoffj+1)*YRinc, LDYR, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; Xoffi = ioffx; ioffd += mbloc; ioffx += mbloc; } /* * Update the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) SYR2( TYPE, ALL, tmp1, nbloc, K, 0, ALPHA, XC+(ioffx+1)*XCinc, LDXC, YC+(ioffx+1)*YCinc, LDYC, XR+(Xoffj+1)*XRinc, LDXR, YR+(Xoffj+1)*YRinc, LDYR, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald ); tmp1 = MIN( Aoffi, iimax ) - Aii + 1; m1 -= tmp1; n1 -= nbloc; lcmt00 += qnb; nblks--; Aoffj += nbloc; Xoffj += nbloc; /* * Update the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) SYR2( TYPE, ALL, tmp1, n1, K, 0, ALPHA, XC+Xii*XCinc, LDXC, YC+Xii*YCinc, LDYC, XR+(Xoffj+1)*XRinc, LDXR, YR+(Xoffj+1)*YRinc, LDYR, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald ); Aii = Aoffi + 1; Ajj = Aoffj + 1; Xii = Xoffi + 1; Xjj = Xoffj + 1; } /* * End of PB_Cpsyr2 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cpsyr2kA.c000644 000766 000024 00000111745 10363532303 020413 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cpsyr2kA( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG, char * UPLO, char * TRANS, int N, int K, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_Cpsyr2kA( TYPE, DIRECAB, CONJUG, UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * CONJUG, * DIRECAB, * TRANS, * UPLO; int IA, IB, IC, JA, JB, JC, K, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; char * A, * B, * C; #endif { /* * Purpose * ======= * * PB_Cpsyr2kA performs one of the following symmetric or Hermitian rank * 2k operations * * sub( C ) := alpha*sub( A )*sub( B )' + alpha*sub( B )*sub( A )' + * beta*sub( C ), * or * sub( C ) := alpha*sub( A )*conjg( sub( B ) )' + * conjg( alpha )*sub( B )*conjg( sub( A ) )' + * beta*sub( C ), * or * sub( C ) := alpha*sub( A )'*sub( B ) + alpha*sub( B )'*sub( A ) + * beta*sub( C ), * or * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) + * conjg( alpha )*conjg( sub( B )' )*sub( A ) + * beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise, and, * * sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N', * B(IB:IB+K-1,JB:JB+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric or * Hermitian submatrix and sub( A ) and sub( B ) are n by k submatrices * in the first case and k by n submatrices in the second case. * * This is the outer-product algorithm using the logical LCM hybrid * and static blocking technique. The submatrix operand sub( C ) stays * in place. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECAB (global input) pointer to CHAR * On entry, DIRECAB specifies the direction in which the rows * or columns of sub( A ) and sub( B ) should be looped over as * follows: * DIRECAB = 'F' or 'f' forward or increasing, * DIRECAB = 'B' or 'b' backward or decreasing. * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether sub( C ) is a symmetric or * Hermitian submatrix operand as follows: * CONJUG = 'N' or 'n' sub( C ) is symmetric, * CONJUG = 'Z' or 'z' sub( C ) is Hermitian. * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the submatrix sub( C ) are to be referenced as follows: * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * submatrix sub( C ) are referenced, * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * submatrix sub( C ) are referenced. * * TRANS (global input) pointer to CHAR * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( B )' + * alpha*sub( B )*sub( A )' + * beta*sub( C ), * or * sub( C ) := alpha*sub( A )*sub( B )' + * alpha*sub( B )*sub( A )' + * beta*sub( C ), * or * sub( C ) := alpha*sub( A )*conjg( sub( B )' ) + * conjg( alpha )*sub( B )*conjg( sub( A )' ) + * beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ), * or * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ), * or * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) + * conjg( alpha )*conjg( sub( B )' )*sub( A ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrices sub( A ) and sub( B ), and on * entry with TRANS = 'T' or 't' or 'C' or 'c', K specifies the * number of rows of the submatrices sub( A ) and sub( B ). * K must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and B corresponding to the entries of the submatrices * sub( A ) and sub( B ) respectively need not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JB+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( B ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( B ). * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric or Hermitian submatrix sub( C ), and the * local entries corresponding to the strictly lower triangular * of sub( C ) are not referenced. On exit, the upper triangular * part of sub( C ) is overwritten by the upper triangular part * of the updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric or Hermitian submatrix sub( C ), and the * local entries corresponding to the strictly upper triangular * of sub( C ) are not referenced. On exit, the lower triangular * part of sub( C ) is overwritten by the lower triangular part * of the updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( C ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one, * talpha, * zero; int ABfwd, ABmyprocD, ABmyprocR, ABnprocsD, ABnprocsR, ABrocs, Abufld, AcurrocR, Afr, AiD, AiR, AiiD, AiiR, AinbD, AinbR, Ainb1D, Ainb1R, AisR, AkkR, Ald, AnbD, AnbR, AnpD, AnpR, Aoff, ArocD, ArocR, AsrcR, Bbufld, BcurrocR, Bfr, BiD, BiR, BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR, BkkR, Bld, BnbD, BnbR, BnpD, BnpR, Boff, BrocD, BrocR, BsrcR, Ccol, Cii, Cimb1, Cinb1, Cjj, Clcmb, Cld, Clp, Clq, Cnq0, Cmb, Cmp, Cmp0, Cnb, Cnq, Crow, WACfr, WACld, WACsum, WARfr, WARld, WARsum, WBCfr, WBCld, WBCsum, WBRfr, WBRld, WBRsum, Wkbb=0, conjg, ctxt, k, kb, kbb, l, lb, lcmb, ltmp, maxp, maxpm1, maxq, mycol, myrow, ncpq, notran, npcol, npq, nprow, nrpq, p=0, q=0, size, tmp, upper; GEMM_T gemm; TZSYR2_T tzsyr2k; /* * .. Local Arrays .. */ PB_VM_T VM; int Cd0 [DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WACd0[DLEN_], WARd0[DLEN_], WBCd0[DLEN_], WBRd0[DLEN_]; char * Abuf = NULL, * Bbuf = NULL, * Cptr = NULL, * WAC = NULL, * WAR = NULL, * WBC = NULL, * WBR = NULL; /* .. * .. Executable Statements .. * */ /* * sub( C ) := beta * sub( C ) */ PB_Cplascal( TYPE, UPLO, CONJUG, N, N, BETA, C, IC, JC, DESCC ); /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); conjg = ( Mupcase( CONJUG [0] ) == CCONJG ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gemm = TYPE->Fgemm; kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Compute descriptor Cd0 for sub( C ) */ PB_Cdescribe( N, N, IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj, &Cld, &Cimb1, &Cinb1, &Cmb, &Cnb, &Crow, &Ccol, Cd0 ); Cmp = PB_Cnumroc( N, 0, Cimb1, Cmb, myrow, Crow, nprow ); Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol ); if( ( Cmp > 0 ) && ( Cnq > 0 ) ) { Cptr = Mptr( C, Cii, Cjj, Cld, size ); if( conjg ) { talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha ); tzsyr2k = PB_Ctzher2k; } else { talpha = ALPHA; tzsyr2k = PB_Ctzsyr2k; } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ Clcmb = 2 * kb * PB_Clcm( ( Crow >= 0 ? nprow : 1 ), ( Ccol >= 0 ? npcol : 1 ) ); } /* * Retrieve local information for sub( A ) and sub( B ) */ if( ( notran = ( Mupcase( TRANS[0] ) == CNOTRAN ) ) != 0 ) { ABnprocsR = npcol; AiR = JA; AinbR = DESCA[INB_]; AnbR = DESCA[NB_]; AsrcR = DESCA[CSRC_]; BiR = JB; BinbR = DESCB[INB_]; BnbR = DESCB[NB_]; BsrcR = DESCB[CSRC_]; } else { ABnprocsR = nprow; AiR = IA; AinbR = DESCA[IMB_]; AnbR = DESCA[MB_]; AsrcR = DESCA[RSRC_]; BiR = IB; BinbR = DESCB[IMB_]; BnbR = DESCB[MB_]; BsrcR = DESCB[RSRC_]; } /* * If sub( A ) and sub( B ) only spans one process row or column, then there is * no need to pack the data. */ if( !( PB_Cspan( K, AiR, AinbR, AnbR, AsrcR, ABnprocsR ) ) && !( PB_Cspan( K, BiR, BinbR, BnbR, BsrcR, ABnprocsR ) ) ) { /* * Replicate sub( A ) in process rows and columns spanned by sub( C ): WAR, WAC * Replicate sub( B ) in process rows and columns spanned by sub( C ): WBR, WBC */ if( notran ) { PB_CInV( TYPE, NOCONJG, COLUMN, N, N, Cd0, K, A, IA, JA, DESCA, COLUMN, &WAC, WACd0, &WACfr ); PB_CInV( TYPE, CONJUG, ROW, N, N, Cd0, K, WAC, 0, 0, WACd0, COLUMN, &WAR, WARd0, &WARfr ); PB_CInV( TYPE, NOCONJG, COLUMN, N, N, Cd0, K, B, IB, JB, DESCB, COLUMN, &WBC, WBCd0, &WBCfr ); PB_CInV( TYPE, CONJUG, ROW, N, N, Cd0, K, WBC, 0, 0, WBCd0, COLUMN, &WBR, WBRd0, &WBRfr ); } else { PB_CInV( TYPE, NOCONJG, ROW, N, N, Cd0, K, A, IA, JA, DESCA, ROW, &WAR, WARd0, &WARfr ); PB_CInV( TYPE, CONJUG, COLUMN, N, N, Cd0, K, WAR, 0, 0, WARd0, ROW, &WAC, WACd0, &WACfr ); PB_CInV( TYPE, NOCONJG, ROW, N, N, Cd0, K, B, IB, JB, DESCB, ROW, &WBR, WBRd0, &WBRfr ); PB_CInV( TYPE, CONJUG, COLUMN, N, N, Cd0, K, WBR, 0, 0, WBRd0, ROW, &WBC, WBCd0, &WBCfr ); } /* * Perform the local update if I own some data */ if( ( Cmp > 0 ) && ( Cnq > 0 ) ) { WACld = WACd0[LLD_]; WBCld = WBCd0[LLD_]; WARld = WARd0[LLD_]; WBRld = WBRd0[LLD_]; if( Mupcase( UPLO[0] ) == CUPPER ) { for( l = 0; l < N; l += Clcmb ) { lb = N - l; lb = MIN( lb, Clcmb ); Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow ); Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol ); Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol ); if( Clp > 0 && Cnq0 > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0, &K, ALPHA, WAC, &WACld, Mptr( WBR, 0, Clq, WBRld, size ), &WBRld, one, Mptr( Cptr, 0, Clq, Cld, size ), &Cld ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0, &K, talpha, WBC, &WBCld, Mptr( WAR, 0, Clq, WARld, size ), &WARld, one, Mptr( Cptr, 0, Clq, Cld, size ), &Cld ); } PB_Cpsyr2( TYPE, UPPER, lb, K, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld, Mptr( WBC, Clp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Clq, WBRld, size ), WBRld, Cptr, l, l, Cd0, tzsyr2k ); } } else { for( l = 0; l < N; l += Clcmb ) { lb = N - l; ltmp = l + ( lb = MIN( lb, Clcmb ) ); Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow ); Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol ); PB_Cpsyr2( TYPE, LOWER, lb, K, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld, Mptr( WBC, Clp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Clq, WBRld, size ), WBRld, Cptr, l, l, Cd0, tzsyr2k ); Clp = PB_Cnumroc( ltmp, 0, Cimb1, Cmb, myrow, Crow, nprow ); Cmp0 = Cmp - Clp; Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol ); if( Cmp0 > 0 && Cnq0 > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0, &K, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), &WACld, Mptr( WBR, 0, Clq, WBRld, size ), &WBRld, one, Mptr( Cptr, Clp, Clq, Cld, size ), &Cld ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0, &K, talpha, Mptr( WBC, Clp, 0, WBCld, size ), &WBCld, Mptr( WAR, 0, Clq, WARld, size ), &WARld, one, Mptr( Cptr, Clp, Clq, Cld, size ), &Cld ); } } } if( conjg ) free( talpha ); } if( WACfr ) free( WAC ); if( WARfr ) free( WAR ); if( WBCfr ) free( WBC ); if( WBRfr ) free( WBR ); return; } /* * Otherwise sub( A ) and sub( B ) spans more than one process row or columns */ ABfwd = ( Mupcase( DIRECAB[0] ) == CFORWARD ); upper = ( Mupcase( UPLO [0] ) == CUPPER ); if( notran ) { ABmyprocD = myrow; ABmyprocR = mycol; ABnprocsD = nprow; AiD = IA; AinbD = DESCA[IMB_]; AnbD = DESCA[MB_]; Ald = DESCA[LLD_]; BiD = IB; BinbD = DESCB[IMB_]; BnbD = DESCB[MB_]; Bld = DESCB[LLD_]; PB_Cinfog2l( IA, JA, DESCA, ABnprocsD, ABnprocsR, ABmyprocD, ABmyprocR, &AiiD, &AiiR, &ArocD, &ArocR ); PB_Cinfog2l( IB, JB, DESCB, ABnprocsD, ABnprocsR, ABmyprocD, ABmyprocR, &BiiD, &BiiR, &BrocD, &BrocR ); } else { ABmyprocD = mycol; ABmyprocR = myrow; ABnprocsD = npcol; AiD = JA; AinbD = DESCA[INB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_]; BiD = JB; BinbD = DESCB[INB_]; BnbD = DESCB[NB_]; Bld = DESCB[LLD_]; PB_Cinfog2l( IA, JA, DESCA, ABnprocsR, ABnprocsD, ABmyprocR, ABmyprocD, &AiiR, &AiiD, &ArocR, &ArocD ); PB_Cinfog2l( IB, JB, DESCB, ABnprocsR, ABnprocsD, ABmyprocR, ABmyprocD, &BiiR, &BiiD, &BrocR, &BrocD ); } Ainb1D = PB_Cfirstnb( N, AiD, AinbD, AnbD ); AnpD = PB_Cnumroc( N, 0, Ainb1D, AnbD, ABmyprocD, ArocD, ABnprocsD ); Ainb1R = PB_Cfirstnb( K, AiR, AinbR, AnbR ); AisR = ( ( AsrcR < 0 ) || ( ABnprocsR == 1 ) ); Binb1D = PB_Cfirstnb( N, BiD, BinbD, BnbD ); BnpD = PB_Cnumroc( N, 0, Binb1D, BnbD, ABmyprocD, BrocD, ABnprocsD ); Binb1R = PB_Cfirstnb( K, BiR, BinbR, BnbR ); BisR = ( ( BsrcR < 0 ) || ( ABnprocsR == 1 ) ); /* * When sub( A ) is not replicated and backward pass on sub( A ), find the * virtual process q owning the last row or column of sub( A ). */ if( !( AisR ) && !( ABfwd ) ) { tmp = PB_Cindxg2p( K - 1, Ainb1R, AnbR, ArocR, ArocR, ABnprocsR ); q = MModSub( tmp, ArocR, ABnprocsR ); } /* * When sub( B ) is not replicated and backward pass on sub( B ), find the * virtual process p owning the last row or column of sub( B ). */ if( !( BisR ) && !( ABfwd ) ) { tmp = PB_Cindxg2p( K - 1, Binb1R, BnbR, BrocR, BrocR, ABnprocsR ); p = MModSub( tmp, BrocR, ABnprocsR ); } /* * Allocate work space in process rows and columns spanned by sub( C ) */ PB_COutV( TYPE, COLUMN, NOINIT, N, N, Cd0, kb, &WAC, WACd0, &WACfr, &WACsum ); PB_COutV( TYPE, ROW, NOINIT, N, N, Cd0, kb, &WAR, WARd0, &WARfr, &WARsum ); PB_COutV( TYPE, COLUMN, NOINIT, N, N, Cd0, kb, &WBC, WBCd0, &WBCfr, &WBCsum ); PB_COutV( TYPE, ROW, NOINIT, N, N, Cd0, kb, &WBR, WBRd0, &WBRfr, &WBRsum ); /* * Loop over the virtual process grid induced by the rows or columns of * sub( A ) and sub( B ) */ lcmb = PB_Clcm( ( maxp = ( BisR ? 1 : ABnprocsR ) ) * BnbR, ( maxq = ( AisR ? 1 : ABnprocsR ) ) * AnbR ); maxpm1 = maxp - 1; /* * Find out process coordinates corresponding to first virtual process (p,q) */ AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, ABnprocsR ) ); AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, ABnprocsR ); AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, ABnprocsR ); BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, ABnprocsR ) ); BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, ABnprocsR ); BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR, ABnprocsR ); /* * Find out how many diagonals this virtual process (p,q) has */ PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR, p, q, maxp, maxq, lcmb ); npq = PB_CVMnpq( &VM ); for( k = 0; k < K; k += kb ) { kbb = K - k; kbb = MIN( kbb, kb ); while( Wkbb != kbb ) { /* * Ensure that the current virtual process (p,q) has something to contribute * to the replicated buffers WA and WB. */ while( npq == 0 ) { if( ( ABfwd && ( p == maxpm1 ) ) || ( !( ABfwd ) && ( p == 0 ) ) ) q = ( ABfwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) ); p = ( ABfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) ); AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, ABnprocsR ) ); AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, ABnprocsR ); AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, ABnprocsR ); BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, ABnprocsR ) ); BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, ABnprocsR ); BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR, ABnprocsR ); PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR, p, q, maxp, maxq, lcmb ); npq = PB_CVMnpq( &VM ); } /* * Current virtual process (p,q) has something, find out how many rows or * columns could be used: ABrocs. */ if( Wkbb == 0 ) { ABrocs = ( npq < kbb ? npq : kbb ); } else { ABrocs = kbb - Wkbb; ABrocs = MIN( ABrocs, npq ); } /* * Find out how many rows or columns of sub( A ) and sub( B ) are contiguous */ PB_CVMcontig( &VM, &nrpq, &ncpq, &Boff, &Aoff ); if( notran ) { /* * Compute the descriptor DBUFA for the buffer that will contained the packed * columns of sub( A ). */ if( ( Afr = ( ncpq < ABrocs ) ) != 0 ) { /* * If columns of sub( A ) are not contiguous, then allocate the buffer and * pack the ABrocs columns of sub( A ). */ Abufld = MAX( 1, AnpD ); if( AisR || ( ABmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * ABrocs * size ); PB_CVMpack( TYPE, &VM, COLUMN, COLUMN, PACKING, NOTRAN, ABrocs, AnpD, one, Mptr( A, AiiD, AkkR, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( A ) directly. */ Abufld = Ald; if( AisR || ( ABmyprocR == AcurrocR ) ) Abuf = Mptr( A, AiiD, AkkR + Aoff, Ald, size ); } PB_Cdescset( DBUFA, N, ABrocs, Ainb1D, ABrocs, AnbD, ABrocs, ArocD, AcurrocR, ctxt, Abufld ); /* * Replicate panels of columns of sub( A ) over sub( C ) */ PB_CInV2( TYPE, NOCONJG, COLUMN, N, N, Cd0, ABrocs, Abuf, 0, 0, DBUFA, COLUMN, WAC, Wkbb, WACd0 ); if( Afr & ( AisR || ( ABmyprocR == AcurrocR ) ) ) if( Abuf ) free( Abuf ); /* * Compute the descriptor DBUFB for the buffer that will contained the packed * columns of sub( B ). */ if( ( Bfr = ( nrpq < ABrocs ) ) != 0 ) { /* * If columns of sub( B ) are not contiguous, then allocate the buffer and * pack the ABrocs columns of sub( B ). */ Bbufld = MAX( 1, BnpD ); if( BisR || ( ABmyprocR == BcurrocR ) ) { Bbuf = PB_Cmalloc( BnpD * ABrocs * size ); PB_CVMpack( TYPE, &VM, ROW, COLUMN, PACKING, NOTRAN, ABrocs, BnpD, one, Mptr( B, BiiD, BkkR, Bld, size ), Bld, zero, Bbuf, Bbufld ); } } else { /* * Otherwise, re-use sub( B ) directly. */ Bbufld = Bld; if( BisR || ( ABmyprocR == BcurrocR ) ) Bbuf = Mptr( B, BiiD, BkkR + Boff, Bld, size ); } PB_Cdescset( DBUFB, N, ABrocs, Binb1D, ABrocs, BnbD, ABrocs, BrocD, BcurrocR, ctxt, Bbufld ); /* * Replicate panels of columns of sub( A ) over sub( C ) */ PB_CInV2( TYPE, NOCONJG, COLUMN, N, N, Cd0, ABrocs, Bbuf, 0, 0, DBUFB, COLUMN, WBC, Wkbb, WBCd0 ); if( Bfr & ( BisR || ( ABmyprocR == BcurrocR ) ) ) if( Bbuf ) free( Bbuf ); } else { /* * Compute the descriptor DBUFA for the buffer that will contained the packed * rows of sub( A ). */ if( ( Afr = ( ncpq < ABrocs ) ) != 0 ) { /* * If rows of sub( A ) are not contiguous, then allocate the buffer and * pack the ABrocs rows of sub( A ). */ Abufld = ABrocs; if( AisR || ( ABmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * ABrocs * size ); PB_CVMpack( TYPE, &VM, COLUMN, ROW, PACKING, NOTRAN, ABrocs, AnpD, one, Mptr( A, AkkR, AiiD, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( A ) directly. */ Abufld = Ald; if( AisR || ( ABmyprocR == AcurrocR ) ) Abuf = Mptr( A, AkkR + Aoff, AiiD, Ald, size ); } PB_Cdescset( DBUFA, ABrocs, N, ABrocs, Ainb1D, ABrocs, AnbD, AcurrocR, ArocD, ctxt, Abufld ); /* * Replicate panels of rows of sub( A ) over sub( C ) */ PB_CInV2( TYPE, NOCONJG, ROW, N, N, Cd0, ABrocs, Abuf, 0, 0, DBUFA, ROW, WAR, Wkbb, WARd0 ); if( Afr & ( AisR || ( ABmyprocR == AcurrocR ) ) ) if( Abuf ) free( Abuf ); /* * Compute the descriptor DBUFB for the buffer that will contained the packed * rows of sub( B ). */ if( ( Bfr = ( nrpq < ABrocs ) ) != 0 ) { /* * If rows of sub( B ) are not contiguous, then allocate the buffer and * pack the ABrocs rows of sub( B ). */ Bbufld = ABrocs; if( BisR || ( ABmyprocR == BcurrocR ) ) { Bbuf = PB_Cmalloc( BnpD * ABrocs * size ); PB_CVMpack( TYPE, &VM, ROW, ROW, PACKING, NOTRAN, ABrocs, BnpD, one, Mptr( B, BkkR, BiiD, Bld, size ), Bld, zero, Bbuf, Bbufld ); } } else { /* * Otherwise, re-use sub( B ) directly. */ Bbufld = Bld; if( BisR || ( ABmyprocR == BcurrocR ) ) Bbuf = Mptr( B, BkkR + Boff, BiiD, Bld, size ); } PB_Cdescset( DBUFB, ABrocs, N, ABrocs, Binb1D, ABrocs, BnbD, BcurrocR, BrocD, ctxt, Bbufld ); /* * Replicate panels of rows of sub( B ) over sub( C ) */ PB_CInV2( TYPE, NOCONJG, ROW, N, N, Cd0, ABrocs, Bbuf, 0, 0, DBUFB, ROW, WBR, Wkbb, WBRd0 ); if( Bfr & ( BisR || ( ABmyprocR == BcurrocR ) ) ) if( Bbuf ) free( Bbuf ); } /* * Update the local indexes of sub( A ) and sub( B ) */ PB_CVMupdate( &VM, ABrocs, &BkkR, &AkkR ); /* * ABrocs rows or columns of sub( A ) and sub( B ) have been replicated, * update the number of diagonals in this virtual process as well as the * number of rows or columns of sub( A ) and sub( B ) that are in WA, WB. */ npq -= ABrocs; Wkbb += ABrocs; } if( notran ) { /* * WAR := WAC' */ PB_CInV2( TYPE, CONJUG, ROW, N, N, Cd0, kbb, WAC, 0, 0, WACd0, COLUMN, WAR, 0, WARd0 ); /* * WBR := WBC' */ PB_CInV2( TYPE, CONJUG, ROW, N, N, Cd0, kbb, WBC, 0, 0, WBCd0, COLUMN, WBR, 0, WBRd0 ); } else { /* * WAC := WAR' */ PB_CInV2( TYPE, CONJUG, COLUMN, N, N, Cd0, kbb, WAR, 0, 0, WARd0, ROW, WAC, 0, WACd0 ); /* * WBC := WBR' */ PB_CInV2( TYPE, CONJUG, COLUMN, N, N, Cd0, kbb, WBR, 0, 0, WBRd0, ROW, WBC, 0, WBCd0 ); } /* * Perform the local update if I own some data */ if( ( Cmp > 0 ) && ( Cnq > 0 ) ) { WACld = WACd0[LLD_]; WBCld = WBCd0[LLD_]; WARld = WARd0[LLD_]; WBRld = WBRd0[LLD_]; if( upper ) { for( l = 0; l < N; l += Clcmb ) { lb = N - l; lb = MIN( lb, Clcmb ); Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow ); Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol ); Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol ); if( Clp > 0 && Cnq0 > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0, &kbb, ALPHA, WAC, &WACld, Mptr( WBR, 0, Clq, WBRld, size ), &WBRld, one, Mptr( Cptr, 0, Clq, Cld, size ), &Cld ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0, &kbb, talpha, WBC, &WBCld, Mptr( WAR, 0, Clq, WARld, size ), &WARld, one, Mptr( Cptr, 0, Clq, Cld, size ), &Cld ); } PB_Cpsyr2( TYPE, UPPER, lb, kbb, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld, Mptr( WBC, Clp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Clq, WBRld, size ), WBRld, Cptr, l, l, Cd0, tzsyr2k ); } } else { for( l = 0; l < N; l += Clcmb ) { lb = N - l; ltmp = l + ( lb = MIN( lb, Clcmb ) ); Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow ); Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol ); PB_Cpsyr2( TYPE, LOWER, lb, kbb, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld, Mptr( WBC, Clp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Clq, WBRld, size ), WBRld, Cptr, l, l, Cd0, tzsyr2k ); Clp = PB_Cnumroc( ltmp, 0, Cimb1, Cmb, myrow, Crow, nprow ); Cmp0 = Cmp - Clp; Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol ); if( Cmp0 > 0 && Cnq0 > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0, &kbb, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), &WACld, Mptr( WBR, 0, Clq, WBRld, size ), &WBRld, one, Mptr( Cptr, Clp, Clq, Cld, size ), &Cld ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0, &kbb, talpha, Mptr( WBC, Clp, 0, WBCld, size ), &WBCld, Mptr( WAR, 0, Clq, WARld, size ), &WARld, one, Mptr( Cptr, Clp, Clq, Cld, size ), &Cld ); } } } } Wkbb = 0; } if( WACfr ) free( WAC ); if( WARfr ) free( WAR ); if( WBCfr ) free( WBC ); if( WBRfr ) free( WBR ); if( conjg && ( Cmp > 0 ) && ( Cnq > 0 ) ) free( talpha ); /* * End of PB_Cpsyr2kA */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cpsyr2kAC.c000644 000766 000024 00000121372 10363532303 020513 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cpsyr2kAC( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG, char * UPLO, char * TRANS, int N, int K, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_Cpsyr2kAC( TYPE, DIRECAB, CONJUG, UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * CONJUG, * DIRECAB, * TRANS, * UPLO; int IA, IB, IC, JA, JB, JC, K, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; char * A, * B, * C; #endif { /* * Purpose * ======= * * PB_Cpsyr2kAC performs one of the following symmetric or Hermitian * rank 2k operations * * sub( C ) := alpha*sub( A )*sub( B )' + alpha*sub( B )*sub( A )' + * beta*sub( C ), * or * sub( C ) := alpha*sub( A )*conjg( sub( B ) )' + * conjg( alpha )*sub( B )*conjg( sub( A ) )' + * beta*sub( C ), * or * sub( C ) := alpha*sub( A )'*sub( B ) + alpha*sub( B )'*sub( A ) + * beta*sub( C ), * or * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) + * conjg( alpha )*conjg( sub( B )' )*sub( A ) + * beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise, and, * * sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N', * B(IB:IB+K-1,JB:JB+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric or * Hermitian submatrix and sub( A ) and sub( B ) are n by k submatrices * in the first case and k by n submatrices in the second case. * * This is the outer-product algorithm using the logical aggregation * blocking technique. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECAB (global input) pointer to CHAR * On entry, DIRECAB specifies the direction in which the rows * or columns of sub( A ), sub( B ) and sub( C ) should be * looped over as follows: * DIRECAB = 'F' or 'f' forward or increasing, * DIRECAB = 'B' or 'b' backward or decreasing. * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether sub( C ) is a symmetric or * Hermitian submatrix operand as follows: * CONJUG = 'N' or 'n' sub( C ) is symmetric, * CONJUG = 'Z' or 'z' sub( C ) is Hermitian. * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the submatrix sub( C ) are to be referenced as follows: * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * submatrix sub( C ) are referenced, * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * submatrix sub( C ) are referenced. * * TRANS (global input) pointer to CHAR * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( B )' + * alpha*sub( B )*sub( A )' + * beta*sub( C ), * or * sub( C ) := alpha*sub( A )*sub( B )' + * alpha*sub( B )*sub( A )' + * beta*sub( C ), * or * sub( C ) := alpha*sub( A )*conjg( sub( B )' ) + * conjg( alpha )*sub( B )*conjg( sub( A )' ) + * beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ), * or * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*sub( B )'*sub( A ) + * alpha*sub( A )'*sub( B ) + * beta*sub( C ), * or * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) + * conjg( alpha )*conjg( sub( B )' )*sub( A ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrices sub( A ) and sub( B ), and on * entry with TRANS = 'T' or 't' or 'C' or 'c', K specifies the * number of rows of the submatrices sub( A ) and sub( B ). * K must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the arrays A * and B corresponding to the entries of the submatrices * sub( A ) and sub( B ) respectively need not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JB+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix B. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( B ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( B ). * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric or Hermitian submatrix sub( C ), and the * local entries corresponding to the strictly lower triangular * of sub( C ) are not referenced. On exit, the upper triangular * part of sub( C ) is overwritten by the upper triangular part * of the updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric or Hermitian submatrix sub( C ), and the * local entries corresponding to the strictly upper triangular * of sub( C ) are not referenced. On exit, the lower triangular * part of sub( C ) is overwritten by the lower triangular part * of the updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( C ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char GatherDir, ScatterDir, * one, top, * talpha, * tbeta, tran, * zero; int ABm, ABn, Acol, Acurcol, Acurrow, Acurimb1, Acurinb1, Afr, Aii, Aimb, Aimb1, Ainb, Ainb1, AisD, AisR, Ajj, Ald, Amb, Amp, Amp0, Anb, Anq, Anq0, Arow, Aspan, Bcol, Bcurcol, Bcurrow, Bcurimb1, Bcurinb1, Bfr, Bii, Bimb, Bimb1, Binb, Binb1, BisD, BisR, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq, Bnq0, Brow, Bspan, Ccsrc, Cimb, Cinb, Cmb, Cnb, Crsrc, WAfr, WACfr, WACld, WACreuse, WACsum, WBfr, WBCfr, WBCld, WBCsum, conjg, ctxt, fwd, k, kb, kbb, kend, kstart, kstep, ktmp, mycol, myrow, notran, npcol, nprow, size, upper; GEMM_T gemm; GSUM2D_T gsum2d; /* * .. Local Arrays .. */ char * Aptr = NULL, * Aptr0 = NULL, * Bptr = NULL, * Bptr0 = NULL, * WA = NULL, * WB = NULL, * WAC = NULL, *WBC = NULL; int Ad0[DLEN_], Bd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd[DLEN_], WBd[DLEN_], WACd [DLEN_], WBCd [DLEN_]; /* .. * .. Executable Statements .. * */ /* * sub( C ) = beta * sub( C ) */ PB_Cplascal( TYPE, UPLO, CONJUG, N, N, BETA, C, IC, JC, DESCC ); /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); fwd = ( Mupcase( DIRECAB[0] ) == CFORWARD ); conjg = ( Mupcase( CONJUG [0] ) == CCONJG ); upper = ( Mupcase( UPLO [0] ) == CUPPER ); notran = ( Mupcase( TRANS [0] ) == CNOTRAN ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm; /* * Figure out the loop bounds accordingly to DIRECAB */ kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); if( fwd ) { kstart = 0; kend = ( ( N - 1 ) / kb + 1 ) * kb; kstep = kb; GatherDir = CFORWARD; ScatterDir = CBACKWARD; } else { kstart = ( ( N - 1 ) / kb ) * kb; kend = kstep = -kb; GatherDir = CBACKWARD; ScatterDir = CFORWARD; } /* * Compute conjg( ALPHA ) and transpose parameter for Hermitian case */ if( conjg ) { tran = CCOTRAN; talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha ); } else { tran = CTRAN; talpha = ALPHA; } /* * Compute local information for sub( A ) and sub( B ) */ if( notran ) { ABm = N; ABn = K; } else { ABm = K; ABn = N; } PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); Aimb = DESCA[IMB_]; Ainb = DESCA[INB_]; Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Ald = DESCA[LLD_]; Aimb1 = PB_Cfirstnb( ABm, IA, Aimb, Amb ); Amp0 = PB_Cnumroc( ABm, 0, Aimb1, Amb, myrow, Arow, nprow ); Ainb1 = PB_Cfirstnb( ABn, JA, Ainb, Anb ); Anq0 = PB_Cnumroc( ABn, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp0 > 0 ) && ( Anq0 > 0 ) ) Aptr0 = Mptr( A, Aii, Ajj, Ald, size ); PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); Bimb = DESCB[IMB_]; Binb = DESCB[INB_]; Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_]; Bimb1 = PB_Cfirstnb( ABm, IB, Bimb, Bmb ); Bmp0 = PB_Cnumroc( ABm, 0, Bimb1, Bmb, myrow, Brow, nprow ); Binb1 = PB_Cfirstnb( ABn, JB, Binb, Bnb ); Bnq0 = PB_Cnumroc( ABn, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size ); if( notran ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cinb = DESCC[INB_]; Cnb = DESCC[NB_]; Ccsrc = DESCC[CSRC_]; /* * Determine if one can reuse the WAC buffer for the intermediate local products * sub( A ) * sub( B )' and sub( B ) * sub( A )'. */ AisR = ( ( Acol < 0 ) || ( npcol == 1 ) ); BisR = ( ( Bcol < 0 ) || ( npcol == 1 ) ); if( !( AisR ) && !( BisR ) ) { /* * When neither sub( A ) nor sub( B ) are replicated, WAC can be reused if * either sub( A ) spans more than one process column, or, neither sub( A ) * nor sub( B ) span more than one process column, and both operands reside * in the same process column. */ Aspan = PB_Cspan( ABn, 0, Ainb1, Anb, Acol, npcol ); Bspan = PB_Cspan( ABn, 0, Binb1, Bnb, Bcol, npcol ); WACreuse = ( Aspan || ( !( Aspan ) && !( Bspan ) && ( Acol == Bcol ) ) ); } else { /* * Otherwise, WAC can be reused when both operands sub( A ) and sub( B ) are * replicated. */ WACreuse = ( AisR && BisR ); } /* * Furthermore, the ability to reuse WAC requires sub( A ) and sub( B ) to be * either both not row-distributed, or, both row-distributed and aligned. */ AisD = ( ( Arow >= 0 ) && ( nprow > 1 ) ); BisD = ( ( Brow >= 0 ) && ( nprow > 1 ) ); WACreuse = ( WACreuse && ( ( !AisD && !BisD ) || ( ( AisD && BisD ) && ( ( Arow == Brow ) && ( ( ( Aimb1 >= ABm ) && ( Bimb1 >= ABm ) ) || ( ( Aimb1 == Bimb1 ) && ( Amb == Bmb ) ) ) ) ) ) ); tbeta = ( WACreuse ? one : zero ); if( upper ) { for( k = kstart; k != kend; k += kstep ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+K-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, B, IB+k, JB, DESCB, ROW, &Bptr, DBUFB, &Bfr ); /* * Replicate B( IB+k:IB+k+kbb-1, JB:JB+K-1 ) over A( IA:IA+k+kbb-1, JA:JA+K-1 ) */ PB_Cdescset( Ad0, ktmp, ABn, Aimb1, Ainb1, Amb, Anb, Arow, Acol, ctxt, Ald ); PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Ad0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * WAC := A( IA:IA+k+kbb-1, JA:JA+K-1 ) * B( IB+k:IB+k+kbb-1, JB:JB+K-1 )' */ PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Ad0, kbb, &WAC, WACd, &WACfr, &WACsum ); WACld = WACd[LLD_]; Amp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); if( ( Amp > 0 ) && ( Anq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0, ALPHA, Aptr0, &Ald, WB, &WBd[LLD_], zero, WAC, &WACld ); if( WBfr ) free( WB ); if( Bfr ) free( Bptr ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over B( IB:IB+k+kbb-1, JB:JB+K-1 ) */ PB_Cdescset( Bd0, ktmp, ABn, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * WBC := B( IB:IB+k+kbb-1, JB:JB+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )' */ if( WACreuse ) { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; } else { PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Bd0, kbb, &WBC, WBCd, &WBCfr, &WBCsum ); } WBCld = WBCd[LLD_]; Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp, &kbb, &Bnq0, talpha, Bptr0, &Bld, WA, &WAd[LLD_], tbeta, WBC, &WBCld ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * Combine the local copies of WAC when necessary */ if( WACsum ) { WACd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : ktmp - 1 ), Cinb, Cnb, Ccsrc, Ccsrc, npcol ); if( Amp > 0 ) gsum2d( ctxt, ROW, &top, Amp, kbb, WAC, WACld, myrow, WACd[CSRC_] ); } /* * Zero lower triangle of WAC( k:k+kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WAC, k, 0, WACd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC, k+1, 0, WACd ); /* * Combine the local copies of WBC when necessary */ if( !( WACreuse ) ) { if( WBCsum ) { if( WACsum ) { WBCd[CSRC_] = WACd[CSRC_]; } else { WBCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : ktmp - 1 ), Cinb, Cnb, Ccsrc, Ccsrc, npcol ); } if( Bmp > 0 ) gsum2d( ctxt, ROW, &top, Bmp, kbb, WBC, WBCld, myrow, WBCd[CSRC_] ); } /* * Zero lower triangle of WBC( k:k+kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WBC, k, 0, WBCd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WBC, k+1, 0, WBCd ); } /* * Add WAC to C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 ) */ PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WAC, 0, 0, WACd, COLUMN, one, C, IC, JC+k, DESCC, COLUMN ); if( WACfr ) free( WAC ); /* * Add WBC to C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 ) */ if( !( WACreuse ) ) { PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WBC, 0, 0, WBCd, COLUMN, one, C, IC, JC+k, DESCC, COLUMN ); if( WBCfr ) free( WBC ); } } } else { for( k = kstart; k != kend; k += kstep ) { ktmp = N - k; kbb = MIN( ktmp, kb ); /* * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+K-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, B, IB+k, JB, DESCB, ROW, &Bptr, DBUFB, &Bfr ); /* * Replicate B( IB+k:IB+k+kbb-1, JB:JB+K-1 ) over A( IA+k:IA+N-1, JA:JA+K-1 ) */ Acurimb1 = PB_Cfirstnb( ktmp, IA+k, Aimb, Amb ); Acurrow = PB_Cindxg2p( k, Aimb1, Amb, Arow, Arow, nprow ); PB_Cdescset( Ad0, ktmp, ABn, Acurimb1, Ainb1, Amb, Anb, Acurrow, Acol, ctxt, Ald ); PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Ad0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * WAC := A( IA+k:IA+N-1, JA:JA+K-1 ) * B( IB+k:IB+k+kbb-1, JB:JB+K-1 )' */ PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Ad0, kbb, &WAC, WACd, &WACfr, &WACsum ); WACld = WACd[LLD_]; Amp = PB_Cnumroc( ktmp, k, Aimb1, Amb, myrow, Arow, nprow ); if( ( Amp > 0 ) && ( Anq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0, ALPHA, Mptr( Aptr0, Amp0-Amp, 0, Ald, size ), &Ald, WB, &WBd[LLD_], zero, WAC, &WACld ); if( WBfr ) free( WB ); if( Bfr ) free( Bptr ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over B( IB+k:IB+N-1, JB:JB+K-1 ) */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, ABn, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * WBC := B( IB+k:IB+N-1, JB:JB+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )' */ if( WACreuse ) { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; } else { PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Bd0, kbb, &WBC, WBCd, &WBCfr, &WBCsum ); } WBCld = WBCd[LLD_]; Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp, &kbb, &Bnq0, talpha, Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld, WA, &WAd[LLD_], tbeta, WBC, &WBCld ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * Combine the local copies of WAC when necessary */ if( WACsum ) { WACd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : k + kbb - 1 ), Cinb, Cnb, Ccsrc, Ccsrc, npcol ); if( Amp > 0 ) gsum2d( ctxt, ROW, &top, Amp, kbb, WAC, WACld, myrow, WACd[CSRC_] ); } /* * Zero upper triangle of WAC( 0:kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WAC, 0, 0, WACd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC, 0, 1, WACd ); /* * Combine the local copies of WBC when necessary */ if( !( WACreuse ) ) { if( WBCsum ) { if( WACsum ) { WBCd[CSRC_] = WACd[CSRC_]; } else { WBCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : k+kbb-1 ), Cinb, Cnb, Ccsrc, Ccsrc, npcol ); } if( Bmp > 0 ) gsum2d( ctxt, ROW, &top, Bmp, kbb, WBC, WBCld, myrow, WBCd[CSRC_] ); } /* * Zero upper triangle of WBC( 0:kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WBC, 0, 0, WBCd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WBC, 0, 1, WBCd ); } /* * Add WAC to C( IC+k:IC+N-1, JC+k:JC+k+kbb-1 ) */ PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WAC, 0, 0, WACd, COLUMN, one, C, IC+k, JC+k, DESCC, COLUMN ); if( WACfr ) free( WAC ); /* * Add WBC to C( IC+k:IC+N-1, JC+k:JC+k+kbb-1 ) */ if( !( WACreuse ) ) { PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WBC, 0, 0, WBCd, COLUMN, one, C, IC+k, JC+k, DESCC, COLUMN ); if( WBCfr ) free( WBC ); } } } } else { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cimb = DESCC[IMB_]; Cmb = DESCC[MB_]; Crsrc = DESCC[RSRC_]; /* * Determine if one can reuse the WAC buffer for the intermediate local products * sub( A )' * sub( B ) and sub( B )' * sub( A ). */ AisR = ( ( Arow < 0 ) || ( nprow == 1 ) ); BisR = ( ( Brow < 0 ) || ( nprow == 1 ) ); /* * When neither sub( A ) nor sub( B ) are replicated, WAC can be reused if * either sub( A ) spans more than one process row, or, neither sub( A ) nor * sub( B ) span more than one process row, and both operands reside in the * same process row. */ if( !( AisR ) && !( BisR ) ) { Aspan = PB_Cspan( ABm, 0, Aimb1, Amb, Arow, nprow ); Bspan = PB_Cspan( ABm, 0, Bimb1, Bmb, Brow, nprow ); WACreuse = ( Aspan || ( !( Aspan ) && !( Bspan ) && ( Arow == Brow ) ) ); } else { /* * Otherwise, WAC can be reused when both operands sub( A ) and sub( B ) are * replicated. */ WACreuse = ( AisR && BisR ); } /* * Furthermore, the ability to reuse WAC requires sub( A ) and sub( B ) to be * either both not column-distributed, or, both column-distributed and aligned. */ AisD = ( ( Acol >= 0 ) && ( npcol > 1 ) ); BisD = ( ( Bcol >= 0 ) && ( npcol > 1 ) ); WACreuse = ( WACreuse && ( ( !AisD && !BisD ) || ( ( AisD && BisD ) && ( ( Acol == Bcol ) && ( ( ( Ainb1 >= ABn ) && ( Binb1 >= ABn ) ) || ( ( Ainb1 == Binb1 ) && ( Anb == Bnb ) ) ) ) ) ) ); tbeta = ( WACreuse ? one : zero ); if( upper ) { for( k = kstart; k != kend; k += kstep ) { ktmp = N - k; kbb = MIN( ktmp, kb ); /* * Accumulate B( IB:IB+K-1, JB+k:JB+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, B, IB, JB+k, DESCB, COLUMN, &Bptr, DBUFB, &Bfr ); /* * Replicate B( IB:IB+K-1, JB+k:JB+k+kbb-1 ) over A( IA:IA+K-1, JA+k:JA+N-1 ) */ Acurinb1 = PB_Cfirstnb( ktmp, JA+k, Ainb, Anb ); Acurcol = PB_Cindxg2p( k, Ainb1, Anb, Acol, Acol, npcol ); PB_Cdescset( Ad0, ABm, ktmp, Aimb1, Acurinb1, Amb, Anb, Arow, Acurcol, ctxt, Ald ); PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Ad0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * WAC := B( IB:IB+K-1, JB+k:JB+k+kbb-1 )' * A( IA:IA+K-1, JA+k:JA+N-1 ) */ PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Ad0, kbb, &WAC, WACd, &WACfr, &WACsum ); WACld = WACd[LLD_]; Anq = PB_Cnumroc( ktmp, k, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anq > 0 ) && ( Amp0 > 0 ) ) gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0, talpha, WB, &WBd[LLD_], Mptr( Aptr0, 0, Anq0-Anq, Ald, size ), &Ald, zero, WAC, &WACld ); if( WBfr ) free( WB ); if( Bfr ) free( Bptr ); /* * Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over B( IB:IB+K-1, JB+k:JB+N-1 ) */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, ABm, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * WBC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * B( IB:IB+K-1, JB+k:JB+N-1 ) */ if( WACreuse ) { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; } else { PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Bd0, kbb, &WBC, WBCd, &WBCfr, &WBCsum ); } WBCld = WBCd[LLD_]; Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bnq > 0 ) && ( Bmp0 > 0 ) ) gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq, &Bmp0, ALPHA, WA, &WAd[LLD_], Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld, tbeta, WBC, &WBCld ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * Combine the local copies of WAC when necessary */ if( WACsum ) { WACd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : k + kbb - 1 ), Cimb, Cmb, Crsrc, Crsrc, nprow ); if( Anq > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Anq, WAC, WACld, WACd[RSRC_], mycol ); } /* * Zero lower triangle of WBC( 0:kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WAC, 0, 0, WACd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC, 1, 0, WACd ); /* * Combine the local copies of WBC when necessary */ if( !( WACreuse ) ) { if( WBCsum ) { if( WACsum ) { WBCd[RSRC_] = WACd[RSRC_]; } else { WBCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : k + kbb - 1 ), Cimb, Cmb, Crsrc, Crsrc, nprow ); } if( Bnq > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Bnq, WBC, WBCld, WBCd[RSRC_], mycol ); } /* * Zero lower triangle of WBC( 0:kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WBC, 0, 0, WBCd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WBC, 1, 0, WBCd ); } /* * Add WAC to C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 ) */ PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WAC, 0, 0, WACd, ROW, one, C, IC+k, JC+k, DESCC, ROW ); if( WACfr ) free( WAC ); /* * Add WBC to C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 ) */ if( !( WACreuse ) ) { PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WBC, 0, 0, WBCd, ROW, one, C, IC+k, JC+k, DESCC, ROW ); if( WBCfr ) free( WBC ); } } } else { for( k = kstart; k != kend; k += kstep ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate B( IB:IB+K-1, JB+k:JB+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, B, IB, JB+k, DESCB, COLUMN, &Bptr, DBUFB, &Bfr ); /* * Replicate B( IB:IB+K-1, JB+k:JB+k+kbb-1 ) over A( IA:IA+K-1, JA:JA+k+kbb-1 ) */ PB_Cdescset( Ad0, ABm, ktmp, Aimb1, Ainb1, Amb, Anb, Arow, Acol, ctxt, Ald ); PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Ad0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * WAC := B( IB:IB+K-1, JB+k:JB+k+kbb-1 )' * A( IA:IA+K-1, JA:JA+k+kbb-1 ) */ PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Ad0, kbb, &WAC, WACd, &WACfr, &WACsum ); WACld = WACd[LLD_]; Anq = PB_Cnumroc( ktmp, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anq > 0 ) && ( Amp0 > 0 ) ) gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0, talpha, WB, &WBd[LLD_], Aptr0, &Ald, zero, WAC, &WACld ); if( WBfr ) free( WB ); if( Bfr ) free( Bptr ); /* * Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over B( IB:IB+K-1, JB:JB+k+kbb-1 ) */ PB_Cdescset( Bd0, ABm, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * WBC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * B( IB:IB+K-1, JB:JB+k+kbb-1 ) */ if( WACreuse ) { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; } else { PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Bd0, kbb, &WBC, WBCd, &WBCfr, &WBCsum ); } WBCld = WBCd[LLD_]; Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bnq > 0 ) && ( Bmp0 > 0 ) ) gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq, &Bmp0, ALPHA, WA, &WAd[LLD_], Bptr0, &Bld, tbeta, WBC, &WBCld ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * Combine the local copies of WAC when necessary */ if( WACsum ) { WACd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : ktmp - 1 ), Cimb, Cmb, Crsrc, Crsrc, nprow ); if( Anq > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Anq, WAC, WACld, WACd[RSRC_], mycol ); } /* * Zero upper triangle of WBC( 0:kbb-1, k:k+kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WAC, 0, k, WACd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC, 0, k+1, WACd ); /* * Combine the local copies of WBC when necessary */ if( !( WACreuse ) ) { if( WBCsum ) { if( WACsum ) { WBCd[RSRC_] = WACd[RSRC_]; } else { WBCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : ktmp - 1 ), Cimb, Cmb, Crsrc, Crsrc, nprow ); } if( Bnq > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Bnq, WBC, WBCld, WBCd[RSRC_], mycol ); } /* * Zero upper triangle of WBC( 0:kbb-1, k:k+kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WBC, 0, k, WBCd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WBC, 0, k+1, WBCd ); } /* * Add WAC to C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 ) */ PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WAC, 0, 0, WACd, ROW, one, C, IC+k, JC, DESCC, ROW ); if( WACfr ) free( WAC ); /* * Add WBC to C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 ) */ if( !( WACreuse ) ) { PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WBC, 0, 0, WBCd, ROW, one, C, IC+k, JC, DESCC, ROW ); if( WBCfr ) free( WBC ); } } } } if( conjg ) free( talpha ); /* * End of PB_Cpsyr2kAC */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpsyrkA.c000644 000766 000024 00000057461 10363532303 020335 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpsyrkA( PBTYP_T * TYPE, char * DIRECA, char * CONJUG, char * UPLO, char * TRANS, int N, int K, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_CpsyrkA( TYPE, DIRECA, CONJUG, UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * CONJUG, * DIRECA, * TRANS, * UPLO; int IA, IC, JA, JC, K, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCC; char * A, * C; #endif { /* * Purpose * ======= * * PB_CpsyrkA performs one of the following symmetric or Hermitian rank * k operations * * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * or * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + beta*sub( C ), * or * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * or * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and, * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * or Hermitian submatrix and sub( A ) is an n by k submatrix in the * first case and a k by n submatrix in the second case. * * This is the outer-product algorithm using the logical LCM hybrid * and static blocking techniques. The submatrix operand sub( C ) stays * in place. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECA (global input) pointer to CHAR * On entry, DIRECA specifies the direction in which the rows * or columns of sub( A ) should be looped over as follows: * DIRECA = 'F' or 'f' forward or increasing, * DIRECA = 'B' or 'b' backward or decreasing. * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether sub( C ) is a symmetric or * Hermitian submatrix operand as follows: * CONJUG = 'N' or 'n' sub( C ) is symmetric, * CONJUG = 'Z' or 'z' sub( C ) is Hermitian. * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the submatrix sub( C ) are to be referenced as follows: * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * submatrix sub( C ) are referenced, * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * submatrix sub( C ) are referenced. * * TRANS (global input) pointer to CHAR * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * or * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * or * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + * beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * or * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * or * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry, with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrix sub( A ), and with TRANS = 'T' or * 't' or 'C' or 'c', K specifies the number of rows of the sub- * matrix sub( A ). K must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric or Hermitian submatrix sub( C ), and the * local entries corresponding to the strictly lower triangular * of sub( C ) are not referenced. On exit, the upper triangular * part of sub( C ) is overwritten by the upper triangular part * of the updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric or Hermitian submatrix sub( C ), and the * local entries corresponding to the strictly upper triangular * of sub( C ) are not referenced. On exit, the lower triangular * part of sub( C ) is overwritten by the lower triangular part * of the updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( C ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int AcurrocR, Afwd, AiD, AiR, AiiD, AiiR, AinbD, AinbR, Ainb1D, Ainb1R, AisR, Ald, AmyprocD, AmyprocR, AnbD, AnbR, AnpR, AnprocsD, AnprocsR, ArocD, ArocR, Arocs, AsrcR, Ccol, Cii, Cimb1, Cinb1, Cjj, Clcmb, Cld, Clp, Clq, Cnq0, Cmb, Cmp, Cmp0, Cnb, Cnq, Crow, WACfr, WACld, WACsum, WARfr, WARld, WARsum, Wkbb=0, ctxt, k, kb, kbb, l, lb, ltmp, maxp, mycol, myrow, notran, npcol, nprow, p=0, size, tmp, upper; GEMM_T gemm; TZSYR_T tzsyrk; /* * .. Local Arrays .. */ int Cd0[DLEN_], DBUFA[DLEN_], WACd0[DLEN_], WARd0[DLEN_]; char * Aptr = NULL, * Cptr = NULL, * WAC = NULL, * WAR = NULL; /* .. * .. Executable Statements .. * */ /* * sub( C ) := beta * sub( C ) */ PB_Cplascal( TYPE, UPLO, CONJUG, N, N, BETA, C, IC, JC, DESCC ); /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Compute descriptor Cd0 for sub( C ) */ PB_Cdescribe( N, N, IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj, &Cld, &Cimb1, &Cinb1, &Cmb, &Cnb, &Crow, &Ccol, Cd0 ); Cmp = PB_Cnumroc( N, 0, Cimb1, Cmb, myrow, Crow, nprow ); Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol ); if( ( Cmp > 0 ) && ( Cnq > 0 ) ) { Cptr = Mptr( C, Cii, Cjj, Cld, size ); tzsyrk = ( ( Mupcase( CONJUG[0] ) == CNOCONJG ) ? PB_Ctzsyrk : PB_Ctzherk ); /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ Clcmb = 2 * kb * PB_Clcm( ( Crow >= 0 ? nprow : 1 ), ( Ccol >= 0 ? npcol : 1 ) ); } /* * Retrieve local information for sub( A ) */ if( ( notran = ( Mupcase( TRANS[0] ) == CNOTRAN ) ) != 0 ) { AiR = JA; AnprocsR = npcol; AinbR = DESCA[INB_]; AnbR = DESCA[NB_]; AsrcR = DESCA[CSRC_]; } else { AiR = IA; AnprocsR = nprow; AinbR = DESCA[IMB_]; AnbR = DESCA[MB_]; AsrcR = DESCA[RSRC_]; } /* * If sub( A ) only spans one process row or column, then there is no need to * pack the data. */ if( !( PB_Cspan( K, AiR, AinbR, AnbR, AsrcR, AnprocsR ) ) ) { /* * Replicate sub( A ) in process rows and columns spanned by sub( C ): WAC, WAR */ if( notran ) { PB_CInV( TYPE, NOCONJG, COLUMN, N, N, Cd0, K, A, IA, JA, DESCA, COLUMN, &WAC, WACd0, &WACfr ); PB_CInV( TYPE, CONJUG, ROW, N, N, Cd0, K, WAC, 0, 0, WACd0, COLUMN, &WAR, WARd0, &WARfr ); } else { PB_CInV( TYPE, NOCONJG, ROW, N, N, Cd0, K, A, IA, JA, DESCA, ROW, &WAR, WARd0, &WARfr ); PB_CInV( TYPE, CONJUG, COLUMN, N, N, Cd0, K, WAR, 0, 0, WARd0, ROW, &WAC, WACd0, &WACfr ); } /* * Perform the local update if I own some data */ if( ( Cmp > 0 ) && ( Cnq > 0 ) ) { WACld = WACd0[LLD_]; WARld = WARd0[LLD_]; if( Mupcase( UPLO[0] ) == CUPPER ) { for( l = 0; l < N; l += Clcmb ) { lb = N - l; lb = MIN( lb, Clcmb ); Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow ); Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol ); Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol ); if( Clp > 0 && Cnq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0, &K, ALPHA, WAC, &WACld, Mptr( WAR, 0, Clq, WARld, size ), &WARld, one, Mptr( Cptr, 0, Clq, Cld, size ), &Cld ); PB_Cpsyr( TYPE, UPPER, lb, K, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld, Cptr, l, l, Cd0, tzsyrk ); } } else { for( l = 0; l < N; l += Clcmb ) { lb = N - l; ltmp = l + ( lb = MIN( lb, Clcmb ) ); Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow ); Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol ); PB_Cpsyr( TYPE, LOWER, lb, K, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld, Cptr, l, l, Cd0, tzsyrk ); Clp = PB_Cnumroc( ltmp, 0, Cimb1, Cmb, myrow, Crow, nprow ); Cmp0 = Cmp - Clp; Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol ); if( Cmp0 > 0 && Cnq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0, &K, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), &WACld, Mptr( WAR, 0, Clq, WARld, size ), &WARld, one, Mptr( Cptr, Clp, Clq, Cld, size ), &Cld ); } } } if( WACfr ) free( WAC ); if( WARfr ) free( WAR ); return; } /* * Otherwise sub( A ) spans more than one process row or columns -> LCM hybrid */ Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD ); upper = ( Mupcase( UPLO [0] ) == CUPPER ); if( notran ) { AiD = IA; AinbD = DESCA[IMB_]; AnbD = DESCA[MB_]; Ald = DESCA[LLD_]; AmyprocD = myrow; AmyprocR = mycol; AnprocsD = nprow; PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR, &AiiD, &AiiR, &ArocD, &ArocR ); } else { AiD = JA; AinbD = DESCA[INB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_]; AmyprocD = mycol; AmyprocR = myrow; AnprocsD = npcol; PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD, &AiiR, &AiiD, &ArocR, &ArocD ); } Ainb1D = PB_Cfirstnb( N, AiD, AinbD, AnbD ); Ainb1R = PB_Cfirstnb( K, AiR, AinbR, AnbR ); AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) ); /* * When sub( A ) is not replicated and backward pass on sub( A ), find the * virtual process (p,p) owning the last row or column of sub( A ). */ if( !( AisR ) && !( Afwd ) ) { tmp = PB_Cindxg2p( K - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR ); p = MModSub( tmp, ArocR, AnprocsR ); } /* * Allocate work space in process rows and columns spanned by sub( C ) */ PB_COutV( TYPE, COLUMN, NOINIT, N, N, Cd0, kb, &WAC, WACd0, &WACfr, &WACsum ); PB_COutV( TYPE, ROW, NOINIT, N, N, Cd0, kb, &WAR, WARd0, &WARfr, &WARsum ); /* * Loop over the virtual process grid induced by the rows or columns of sub( A ) */ maxp = ( AisR ? 1 : AnprocsR ); AcurrocR = ( AisR ? -1 : MModAdd( ArocR, p, AnprocsR ) ); AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR ); for( k = 0; k < K; k += kb ) { kbb = K - k; kbb = MIN( kbb, kb ); while( Wkbb != kbb ) { /* * Ensure that the current virtual process (p,p) has something to contribute to * the replicated buffers WAC and WAR. */ while( AnpR == 0 ) { p = ( Afwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) ); AcurrocR = ( AisR ? -1 : MModAdd( ArocR, p, AnprocsR ) ); AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR ); } /* * Current virtual process (p,p) has something, find out how many rows or * columns could be used: Arocs. */ if( Wkbb == 0 ) { Arocs = ( AnpR < kbb ? AnpR : kbb ); } else { Arocs = kbb - Wkbb; Arocs = MIN( Arocs, AnpR ); } /* * The current virtual process (p,p) has Arocs rows or columns of sub( A ) * to contribute, replicate the data over sub( C ). */ if( notran ) { if( AisR || ( AmyprocR == AcurrocR ) ) { Aptr = Mptr( A, AiiD, AiiR, Ald, size ); AiiR += Arocs; } PB_Cdescset( DBUFA, N, Arocs, Ainb1D, Arocs, AnbD, Arocs, ArocD, AcurrocR, ctxt, Ald ); /* * Replicate Arocs columns of sub( A ) in process columns spanned by sub( C ) */ PB_CInV2( TYPE, NOCONJG, COLUMN, N, N, Cd0, Arocs, Aptr, 0, 0, DBUFA, COLUMN, WAC, Wkbb, WACd0 ); } else { if( AisR || ( AmyprocR == AcurrocR ) ) { Aptr = Mptr( A, AiiR, AiiD, Ald, size ); AiiR += Arocs; } PB_Cdescset( DBUFA, Arocs, N, Arocs, Ainb1D, Arocs, AnbD, AcurrocR, ArocD, ctxt, Ald ); /* * Replicate Arocs rows of sub( A ) in process rows spanned by sub( C ) */ PB_CInV2( TYPE, NOCONJG, ROW, N, N, Cd0, Arocs, Aptr, 0, 0, DBUFA, ROW, WAR, Wkbb, WARd0 ); } /* * Arocs rows or columns of sub( A ) have been replicated over sub( C ), * update the number of diagonals in this virtual process as well as the * number of rows or columns of sub( A ) that are in WAR or WAC. */ AnpR -= Arocs; Wkbb += Arocs; } if( notran ) { /* * WAR := WAC' */ PB_CInV2( TYPE, CONJUG, ROW, N, N, Cd0, kbb, WAC, 0, 0, WACd0, COLUMN, WAR, 0, WARd0 ); } else { /* * WAC := WAR' */ PB_CInV2( TYPE, CONJUG, COLUMN, N, N, Cd0, kbb, WAR, 0, 0, WARd0, ROW, WAC, 0, WACd0 ); } /* * Perform the local update if I own some data */ if( ( Cmp > 0 ) && ( Cnq > 0 ) ) { WACld = WACd0[LLD_]; WARld = WARd0[LLD_]; if( upper ) { for( l = 0; l < N; l += Clcmb ) { lb = N - l; lb = MIN( lb, Clcmb ); Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow ); Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol ); Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol ); if( Clp > 0 && Cnq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0, &kbb, ALPHA, WAC, &WACld, Mptr( WAR, 0, Clq, WARld, size ), &WARld, one, Mptr( Cptr, 0, Clq, Cld, size ), &Cld ); PB_Cpsyr( TYPE, UPPER, lb, kbb, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld, Cptr, l, l, Cd0, tzsyrk ); } } else { for( l = 0; l < N; l += Clcmb ) { lb = N - l; ltmp = l + ( lb = MIN( lb, Clcmb ) ); Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow ); Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol ); PB_Cpsyr( TYPE, LOWER, lb, kbb, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld, Cptr, l, l, Cd0, tzsyrk ); Clp = PB_Cnumroc( ltmp, 0, Cimb1, Cmb, myrow, Crow, nprow ); Cmp0 = Cmp - Clp; Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol ); if( Cmp0 > 0 && Cnq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0, &kbb, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), &WACld, Mptr( WAR, 0, Clq, WARld, size ), &WARld, one, Mptr( Cptr, Clp, Clq, Cld, size ), &Cld ); } } } Wkbb = 0; } if( WACfr ) free( WAC ); if( WARfr ) free( WAR ); /* * End of PB_CpsyrkA */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CpsyrkAC.c000644 000766 000024 00000056626 10363532303 020442 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CpsyrkAC( PBTYP_T * TYPE, char * DIRECA, char * CONJUG, char * UPLO, char * TRANS, int N, int K, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_CpsyrkAC( TYPE, DIRECA, CONJUG, UPLO, TRANS, N, K, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * CONJUG, * DIRECA, * TRANS, * UPLO; int IA, IC, JA, JC, K, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCC; char * A, * C; #endif { /* * Purpose * ======= * * PB_CpsyrkAC performs one of the following symmetric or Hermitian rank * k operations * * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * or * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + beta*sub( C ), * or * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * or * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + beta*sub( C ), * * where * * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and, * * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N', * A(IA:IA+K-1,JA:JA+N-1) otherwise. * * Alpha and beta are scalars, sub( C ) is an n by n symmetric * or Hermitian submatrix and sub( A ) is an n by k submatrix in the * first case and a k by n submatrix in the second case. * * This is the outer-product algorithm using the logical aggregation * blocking technique. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECA (global input) pointer to CHAR * On entry, DIRECA specifies the direction in which the rows * or columns of sub( A ) and sub( C ) should be looped over as * follows: * DIRECA = 'F' or 'f' forward or increasing, * DIRECA = 'B' or 'b' backward or decreasing. * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether sub( C ) is a symmetric or * Hermitian submatrix operand as follows: * CONJUG = 'N' or 'n' sub( C ) is symmetric, * CONJUG = 'Z' or 'z' sub( C ) is Hermitian. * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of * the array C containing the upper or lower triangular part * of the submatrix sub( C ) are to be referenced as follows: * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * submatrix sub( C ) are referenced, * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * submatrix sub( C ) are referenced. * * TRANS (global input) pointer to CHAR * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * or * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ), * or * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + * beta*sub( C ), * * TRANS = 'T' or 't' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * or * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * * TRANS = 'C' or 'c' * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ), * or * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + * beta*sub( C ). * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( C ). * N must be at least zero. * * K (global input) INTEGER * On entry, with TRANS = 'N' or 'n', K specifies the number of * columns of the submatrix sub( A ), and with TRANS = 'T' or * 't' or 'C' or 'c', K specifies the number of rows of the sub- * matrix sub( A ). K must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with TRANS = 'N' or 'n', this array contains the * local entries corresponding to the entries of the n by k sub- * matrix sub( A ), otherwise the local entries corresponding to * the entries of the k by n submatrix sub( A ). * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the symmetric or Hermitian submatrix sub( C ), and the * local entries corresponding to the strictly lower triangular * of sub( C ) are not referenced. On exit, the upper triangular * part of sub( C ) is overwritten by the upper triangular part * of the updated submatrix. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the symmetric or Hermitian submatrix sub( C ), and the * local entries corresponding to the strictly upper triangular * of sub( C ) are not referenced. On exit, the lower triangular * part of sub( C ) is overwritten by the lower triangular part * of the updated submatrix. * Note that the imaginary parts of the local entries corres- * ponding to the diagonal elements of sub( C ) need not be * set, they are assumed to be zero, and on exit they are set * to zero. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char GatherDir, ScatterDir, * one, top, tran, * zero; int Acol, Acurcol, Acurimb1, Acurinb1, Acurrow, Afr, Aii, Aimb, Aimb1, Ainb, Ainb1, Ajj, Ald, Am, Amb, Amp, Amp0, An, Anb, Anq, Anq0, Arow, Ccsrc, Cimb, Cinb, Cmb, Cnb, Crsrc, WAfr, WCfr, WCsum, conjg, ctxt, fwd, k, kb, kbb, kend, kstart, kstep, ktmp, mycol, myrow, notran, npcol, nprow, size, upper; GEMM_T gemm; GSUM2D_T gsum2d; /* * .. Local Arrays .. */ int Ad0[DLEN_], DBUFA[DLEN_], WAd[DLEN_], WCd[DLEN_]; char * Aptr = NULL, * Aptr0 = NULL, * WA = NULL, * WC = NULL; /* .. * .. Executable Statements .. * */ /* * sub( C ) = beta * sub( C ) */ PB_Cplascal( TYPE, UPLO, CONJUG, N, N, BETA, C, IC, JC, DESCC ); /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); fwd = ( Mupcase( DIRECA[0] ) == CFORWARD ); conjg = ( Mupcase( CONJUG[0] ) == CCONJG ); upper = ( Mupcase( UPLO [0] ) == CUPPER ); notran = ( Mupcase( TRANS [0] ) == CNOTRAN ); tran = ( conjg ? CCOTRAN : CTRAN ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm; /* * Figure out the loop bounds accordingly to DIRECA */ kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); if( fwd ) { kstart = 0; kend = ( ( N - 1 ) / kb + 1 ) * kb; kstep = kb; GatherDir = CFORWARD; ScatterDir = CBACKWARD; } else { kstart = ( ( N - 1 ) / kb ) * kb; kend = kstep = -kb; GatherDir = CBACKWARD; ScatterDir = CFORWARD; } /* * Compute local information for A */ if( notran ) { Am = N; An = K; } else { Am = K; An = N; } PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); Aimb = DESCA[IMB_]; Ainb = DESCA[INB_]; Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Ald = DESCA[LLD_]; Aimb1 = PB_Cfirstnb( Am, IA, Aimb, Amb ); Amp0 = PB_Cnumroc( Am, 0, Aimb1, Amb, myrow, Arow, nprow ); Ainb1 = PB_Cfirstnb( An, JA, Ainb, Anb ); Anq0 = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp0 > 0 ) && ( Anq0 > 0 ) ) Aptr0 = Mptr( A, Aii, Ajj, Ald, size ); if( notran ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Cinb = DESCC[INB_]; Cnb = DESCC[NB_]; Ccsrc = DESCC[CSRC_]; if( upper ) { for( k = kstart; k != kend; k += kstep ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, An, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over A( IA:IA+k+kbb-1, JA:JA+K-1 ) */ PB_Cdescset( Ad0, ktmp, An, Aimb1, Ainb1, Amb, Anb, Arow, Acol, ctxt, Ald ); PB_CInV( TYPE, NOCONJG, ROW, ktmp, An, Ad0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * WC := A( IA:IA+k+kbb-1, JA:JA+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )' */ PB_COutV( TYPE, COLUMN, INIT, ktmp, An, Ad0, kbb, &WC, WCd, &WCfr, &WCsum ); Amp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow ); if( ( Amp > 0 ) && ( Anq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0, ALPHA, Aptr0, &Ald, WA, &WAd[LLD_], zero, WC, &WCd[LLD_] ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); if( WCsum ) { WCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : ktmp - 1 ), Cinb, Cnb, Ccsrc, Ccsrc, npcol ); if( Amp > 0 ) gsum2d( ctxt, ROW, &top, Amp, kbb, WC, WCd[LLD_], myrow, WCd[CSRC_] ); } /* * Zero lower triangle of WC( k:k+kbb-1, 0:kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WC, k, 0, WCd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WC, k+1, 0, WCd ); /* * Add WC to C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 ) */ PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WC, 0, 0, WCd, COLUMN, one, C, IC, JC+k, DESCC, COLUMN ); if( WCfr ) free( WC ); } } else { for( k = kstart; k != kend; k += kstep ) { ktmp = N - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, An, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over A( IA+k:IA+N-1, JA:JA+K-1 ) */ Acurimb1 = PB_Cfirstnb( ktmp, IA+k, Aimb, Amb ); Acurrow = PB_Cindxg2p( k, Aimb1, Amb, Arow, Arow, nprow ); PB_Cdescset( Ad0, ktmp, An, Acurimb1, Ainb1, Amb, Anb, Acurrow, Acol, ctxt, Ald ); PB_CInV( TYPE, NOCONJG, ROW, ktmp, An, Ad0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * WC := A( IA+k:IA+N-1, JA:JA+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )' */ PB_COutV( TYPE, COLUMN, INIT, ktmp, An, Ad0, kbb, &WC, WCd, &WCfr, &WCsum ); Amp = PB_Cnumroc( ktmp, k, Aimb1, Amb, myrow, Arow, nprow ); if( ( Amp > 0 ) && ( Anq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0, ALPHA, Mptr( Aptr0, Amp0-Amp, 0, Ald, size ), &Ald, WA, &WAd[LLD_], zero, WC, &WCd[LLD_] ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); if( WCsum ) { WCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : k + kbb - 1 ), Cinb, Cnb, Ccsrc, Ccsrc, npcol ); if( Amp > 0 ) gsum2d( ctxt, ROW, &top, Amp, kbb, WC, WCd[LLD_], myrow, WCd[CSRC_] ); } /* * Zero upper triangle of WC */ if( conjg ) PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WC, 0, 0, WCd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WC, 0, 1, WCd ); /* * Add WC to C( IC+k:IC+N-1, JC+k:JC+k+kbb-1 ) */ PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WC, 0, 0, WCd, COLUMN, one, C, IC+k, JC+k, DESCC, COLUMN ); if( WCfr ) free( WC ); } } } else { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Cimb = DESCC[IMB_]; Cmb = DESCC[MB_]; Crsrc = DESCC[RSRC_]; if( upper ) { for( k = kstart; k != kend; k += kstep ) { ktmp = N - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, Am, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over A( IA:IA+K-1, JA+k:JA+N-1 ) */ Acurinb1 = PB_Cfirstnb( ktmp, JA+k, Ainb, Anb ); Acurcol = PB_Cindxg2p( k, Ainb1, Anb, Acol, Acol, npcol ); PB_Cdescset( Ad0, Am, ktmp, Aimb1, Acurinb1, Amb, Anb, Arow, Acurcol, ctxt, Ald ); PB_CInV( TYPE, NOCONJG, COLUMN, Am, ktmp, Ad0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * WC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * A( IA:IA+K-1,JA+k:JA+N-1 ) */ PB_COutV( TYPE, ROW, INIT, Am, ktmp, Ad0, kbb, &WC, WCd, &WCfr, &WCsum ); Anq = PB_Cnumroc( ktmp, k, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anq > 0 ) && ( Amp0 > 0 ) ) gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0, ALPHA, WA, &WAd[LLD_], Mptr( Aptr0, 0, Anq0-Anq, Ald, size ), &Ald, zero, WC, &WCd[LLD_] ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); if( WCsum ) { WCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : k + kbb - 1 ), Cimb, Cmb, Crsrc, Crsrc, nprow ); if( Anq > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Anq, WC, WCd[LLD_], WCd[RSRC_], mycol ); } /* * Zero lower triangle of WC */ if( conjg ) PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WC, 0, 0, WCd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WC, 1, 0, WCd ); /* * Add WC to C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 ) */ PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WC, 0, 0, WCd, ROW, one, C, IC+k, JC+k, DESCC, ROW ); if( WCfr ) free( WC ); } } else { for( k = kstart; k != kend; k += kstep ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, &GatherDir, Am, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over A( IA:IA+K-1, JA:JA+k+kbb-1 ) */ PB_Cdescset( Ad0, Am, ktmp, Aimb1, Ainb1, Amb, Anb, Arow, Acol, ctxt, Ald ); PB_CInV( TYPE, NOCONJG, COLUMN, Am, ktmp, Ad0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * WC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * A( IA:IA+K-1, JA:JA+k+kbb-1 ) */ PB_COutV( TYPE, ROW, INIT, Am, ktmp, Ad0, kbb, &WC, WCd, &WCfr, &WCsum ); Anq = PB_Cnumroc( ktmp, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anq > 0 ) && ( Amp0 > 0 ) ) gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0, ALPHA, WA, &WAd[LLD_], Aptr0, &Ald, zero, WC, &WCd[LLD_] ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); if( WCsum ) { WCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : ktmp - 1 ), Cimb, Cmb, Crsrc, Crsrc, nprow ); if( Anq > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Anq, WC, WCd[LLD_], WCd[RSRC_], mycol ); } /* * Zero upper triangle of WC( 0:kbb-1, k:k+kbb-1 ) */ if( conjg ) PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WC, 0, k, WCd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WC, 0, k+1, WCd ); /* * Add WC to C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 ) */ PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WC, 0, 0, WCd, ROW, one, C, IC+k, JC, DESCC, ROW ); if( WCfr ) free( WC ); } } } /* * End of PB_CpsyrkAC */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cptradd.c000644 000766 000024 00000046722 10363532303 020340 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cptradd( PBTYP_T * TYPE, char * DIRECAC, char * UPLO, char * TRANS, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_Cptradd( TYPE, DIRECAC, UPLO, TRANS, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * DIRECAC, * TRANS, * UPLO; int IA, IC, JA, JC, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCC; char * A, * C; #endif { /* * Purpose * ======= * * PB_Cptradd adds a trapezoidal matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+N-1) if TRANS = 'N', * A(IA:IA+N-1,JA:JA+M-1)' if TRANS = 'T', * conjg(A(IA:IA+N-1,JA:JA+M-1)') if TRANS = 'C', * * Alpha and beta are scalars, sub( C ) and op( sub( A ) ) are m by n * upper or lower trapezoidal submatrices. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECAC (global input) pointer to CHAR * On entry, DIRECAC specifies the direction in which the rows * or columns of sub( A ) and sub( C ) should be looped over as * follows: * DIRECA = 'F' or 'f' forward or increasing, * DIRECA = 'B' or 'b' backward or decreasing. * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the local pieces of the * array C containing the upper or lower triangular part of the * triangular submatrix sub( C ) is to be referenced as follows: * * UPLO = 'U' or 'u' Only the local pieces corresponding to * the upper triangular part of the * triangular submatrix sub( C ) is to be * referenced, * * UPLO = 'L' or 'l' Only the local pieces corresponding to * the lower triangular part of the * triangular submatrix sub( C ) is to be * referenced. * * TRANS (global input) pointer to CHAR * On entry, TRANS specifies the form of op( sub( A ) ) to be * used in the matrix addition as follows: * * TRANS = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANS = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANS = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrices * sub( A ) and sub( C ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatri- * ces sub( A ) and sub( C ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Dir, * one, * zero; int Afr, conjg, k, kb, kbb, kend, kstart, kstep, ktmp; /* * .. Local Arrays .. */ int DBUFA[DLEN_]; char * Aptr = NULL; /* .. * .. Executable Statements .. * */ /* * sub( C ) := beta * sub( C ) */ PB_Cplascal( TYPE, UPLO, NOCONJG, M, N, BETA, C, IC, JC, DESCC ); one = TYPE->one; zero = TYPE->zero; kb = pilaenv_( &DESCC[CTXT_], C2F_CHAR( &TYPE->type ) ); if( Mupcase( DIRECAC[0] ) == CFORWARD ) { Dir = CFORWARD; kstart = 0; kend = ( ( MIN( M, N ) - 1 ) / kb + 1 ) * kb; kstep = kb; } else { Dir = CBACKWARD; kstart = ( ( MIN( M, N ) - 1 ) / kb ) * kb; kend = kstep = -kb; } if( Mupcase( TRANS[0] ) == CNOTRAN ) { if( Mupcase( UPLO [0] ) == CUPPER ) { if( M >= N ) { for( k = kstart; k != kend; k += kstep ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &Dir, ktmp, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Scale A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) by ALPHA */ PB_Cplascal( TYPE, ALL, NOCONJG, ktmp, kbb, ALPHA, Aptr, 0, 0, DBUFA ); /* * Zero lower triangle of A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, Aptr, k+1, 0, DBUFA ); /* * C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 ) += A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CScatterV( TYPE, &Dir, ktmp, kbb, Aptr, 0, 0, DBUFA, COLUMN, one, C, IC, JC+k, DESCC, COLUMN ); if( Afr ) free( Aptr ); } } else { for( k = kstart; k != kend; k += kstep ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = N - k; /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &Dir, kbb, ktmp, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Scale A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) by ALPHA */ PB_Cplascal( TYPE, ALL, NOCONJG, kbb, ktmp, ALPHA, Aptr, 0, 0, DBUFA ); /* * Zero lower triangle of A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) */ if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, Aptr, 1, 0, DBUFA ); /* * C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 ) += A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) */ PB_CScatterV( TYPE, &Dir, kbb, ktmp, Aptr, 0, 0, DBUFA, ROW, one, C, IC+k, JC+k, DESCC, ROW ); if( Afr ) free( Aptr ); } } } else { if( M >= N ) { for( k = kstart; k != kend; k += kstep ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = M - k; /* * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &Dir, ktmp, kbb, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Scale A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) by ALPHA */ PB_Cplascal( TYPE, ALL, NOCONJG, ktmp, kbb, ALPHA, Aptr, 0, 0, DBUFA ); /* * Zero upper triangle of A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) */ if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, Aptr, 0, 1, DBUFA ); /* * C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 ) += A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CScatterV( TYPE, &Dir, ktmp, kbb, Aptr, 0, 0, DBUFA, COLUMN, one, C, IC+k, JC+k, DESCC, COLUMN ); if( Afr ) free( Aptr ); } } else { for( k = kstart; k != kend; k += kstep ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &Dir, kbb, ktmp, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Scale A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) by ALPHA */ PB_Cplascal( TYPE, ALL, NOCONJG, kbb, ktmp, ALPHA, Aptr, 0, 0, DBUFA ); /* * Zero upper triangle of A( IA+k:IA+k+kbb-1, JA+k:JA:JA+k+kbb-1 ) */ if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, Aptr, 0, k+1, DBUFA ); /* * C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 ) += A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) */ PB_CScatterV( TYPE, &Dir, kbb, ktmp, Aptr, 0, 0, DBUFA, ROW, one, C, IC+k, JC, DESCC, ROW ); if( Afr ) free( Aptr ); } } } } else { conjg = ( Mupcase( TRANS[0] ) == CCOTRAN ); if( Mupcase( UPLO [0] ) == CUPPER ) { if( M >= N ) { for( k = kstart; k != kend; k += kstep ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &Dir, kbb, ktmp, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Scale A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) by ALPHA */ if( conjg ) PB_Cplacnjg( TYPE, kbb, ktmp, ALPHA, Aptr, 0, 0, DBUFA ); else PB_Cplascal( TYPE, ALL, NOCONJG, kbb, ktmp, ALPHA, Aptr, 0, 0, DBUFA ); /* * Zero upper triangle of A( IA+k:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, Aptr, 0, k+1, DBUFA ); /* * C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 ) += A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )' */ PB_CScatterV( TYPE, &Dir, kbb, ktmp, Aptr, 0, 0, DBUFA, ROW, one, C, IC, JC+k, DESCC, COLUMN ); if( Afr ) free( Aptr ); } } else { for( k = kstart; k != kend; k += kstep ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = N - k; /* * Accumulate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &Dir, ktmp, kbb, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Scale A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 ) by ALPHA */ if( conjg ) PB_Cplacnjg( TYPE, ktmp, kbb, ALPHA, Aptr, 0, 0, DBUFA ); else PB_Cplascal( TYPE, ALL, NOCONJG, ktmp, kbb, ALPHA, Aptr, 0, 0, DBUFA ); /* * Zero upper triangle of A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 ) */ if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, Aptr, 0, 1, DBUFA ); /* * C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 ) += A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )' */ PB_CScatterV( TYPE, &Dir, ktmp, kbb, Aptr, 0, 0, DBUFA, COLUMN, one, C, IC+k, JC+k, DESCC, ROW ); if( Afr ) free( Aptr ); } } } else { if( M >= N ) { for( k = kstart; k != kend; k += kstep ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = M - k; /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &Dir, kbb, ktmp, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Scale A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 ) by ALPHA */ if( conjg ) PB_Cplacnjg( TYPE, kbb, ktmp, ALPHA, Aptr, 0, 0, DBUFA ); else PB_Cplascal( TYPE, ALL, NOCONJG, kbb, ktmp, ALPHA, Aptr, 0, 0, DBUFA ); /* * Zero lower triangle of A( IA+k:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, Aptr, 1, 0, DBUFA ); /* * C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 ) += A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )' */ PB_CScatterV( TYPE, &Dir, kbb, ktmp, Aptr, 0, 0, DBUFA, ROW, one, C, IC+k, JC+k, DESCC, COLUMN ); if( Afr ) free( Aptr ); } } else { for( k = kstart; k != kend; k += kstep ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, &Dir, ktmp, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Scale A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) by ALPHA */ if( conjg ) PB_Cplacnjg( TYPE, ktmp, kbb, ALPHA, Aptr, 0, 0, DBUFA ); else PB_Cplascal( TYPE, ALL, NOCONJG, ktmp, kbb, ALPHA, Aptr, 0, 0, DBUFA ); /* * Zero lower triangle of A( IA+k:IA+k+kbb-1, JA+k:JA:JA+k+kbb-1 ) */ if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, Aptr, k+1, 0, DBUFA ); /* * C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 ) += A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )' */ PB_CScatterV( TYPE, &Dir, ktmp, kbb, Aptr, 0, 0, DBUFA, COLUMN, one, C, IC+k, JC, DESCC, ROW ); if( Afr ) free( Aptr ); } } } } /* * End of PB_Cptradd */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cptran.c000644 000766 000024 00000051454 10363532303 020204 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cptran( PBTYP_T * TYPE, char * CONJUG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * BETA, char * C, int IC, int JC, int * DESCC ) #else void PB_Cptran( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC ) /* * .. Scalar Arguments .. */ char * CONJUG; int IA, IC, JA, JC, M, N; char * ALPHA, * BETA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCC; char * A, * C; #endif { /* * Purpose * ======= * * PB_Cptran transposes a matrix * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * where * * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), * * sub( A ) denotes A(IA:IA+N-1,JA:JA+M-1), and, * * op( X ) = X' or op( X ) = conjg( X )'. * * Beta is a scalar, sub( C ) is an m by n submatrix, and sub( A ) is an * n by m submatrix. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * CONJUG (global input) pointer to CHAR * On entry, CONJUG specifies whether conjg( sub( A ) ) or * sub( A ) should be added to sub( C ) as follows: * CONJUG = 'N' or 'n': * sub( C ) := beta*sub( C ) + alpha*sub( A )' * otherwise * sub( C ) := beta*sub( C ) + alpha*conjg( sub( A ) )'. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( C ) and the number of columns of the submatrix sub( A ). * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( C ) and the number of rows of the submatrix sub( A ). N * must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array A * corresponding to the entries of the submatrix sub( A ) need * not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BETA (global input) pointer to CHAR * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then the local entries of the array C * corresponding to the entries of the submatrix sub( C ) need * not be set on input. * * C (local input/local output) pointer to CHAR * On entry, C is an array of dimension (LLD_C, Kc), where Kc is * at least Lc( 1, JC+N-1 ). Before entry, this array contains * the local entries of the matrix C. * On exit, the entries of this array corresponding to the local * entries of the submatrix sub( C ) are overwritten by the * local entries of the m by n updated submatrix. * * IC (global input) INTEGER * On entry, IC specifies C's global row index, which points to * the beginning of the submatrix sub( C ). * * JC (global input) INTEGER * On entry, JC specifies C's global column index, which points * to the beginning of the submatrix sub( C ). * * DESCC (global and local input) INTEGER array * On entry, DESCC is an integer array of dimension DLEN_. This * is the array descriptor for the matrix C. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Aroc, Croc, * one, * talpha, * tbeta, * zero; int ACnD, ACnR, Abufld, AcurrocR, Afr, AiD, AiR, AiiD, AiiR, AinbD, AinbR, Ainb1D, Ainb1R, AisR, Akk, Ald, AmyprocD, AmyprocR, AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff, ArocD, ArocR, AsrcR, Cbufld, CcurrocR, Cfr, CiD, CiR, CiiD, CiiR, CinbD, CinbR, Cinb1D, Cinb1R, CisR, Ckk, Cld, CmyprocD, CmyprocR, CnbD, CnbR, CnpD, CnpR, CnprocsD, CnprocsR, Coff, CrocD, CrocR, CsrcR, ctxt, col2row, gcdPQ, k, kb, kbb, l, lcmPQ, lcmb, maxp, maxq, mycol, myrow, ncpq, npcol, npq, nprow, nrpq, p, q, size; PB_VM_T VM; /* * .. Local Arrays .. */ int DBUFA[DLEN_], DBUFC[DLEN_]; char * Abuf = NULL, * Cbuf = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Loop over the rows of sub( C ) when M <= N, and the columns of sub( C ) * otherwise. */ col2row = ( ( M <= N ) || ( nprow == 1 ) || ( DESCA[RSRC_] == -1 ) ); if( col2row ) { AinbR = DESCA[INB_]; AnbR = DESCA[NB_]; AsrcR = DESCA[CSRC_]; CinbR = DESCC[IMB_]; CnbR = DESCC[MB_]; CsrcR = DESCC[RSRC_]; /* * If sub( A ) only spans one process column and sub( C ) spans only one process * row, then there is no need to pack the data. */ if( !( PB_Cspan( M, JA, AinbR, AnbR, AsrcR, npcol ) ) && !( PB_Cspan( M, IC, CinbR, CnbR, CsrcR, nprow ) ) ) { PB_Cpaxpby( TYPE, CONJUG, N, M, ALPHA, A, IA, JA, DESCA, COLUMN, BETA, C, IC, JC, DESCC, ROW ); return; } /* * Compute local information for sub( A ) and sub( C ) */ ACnR = M; ACnD = N; AmyprocD = CmyprocR = myrow; AnprocsD = CnprocsR = nprow; AmyprocR = CmyprocD = mycol; CnprocsD = AnprocsR = npcol; AiD = IA; AiR = JA; Aroc = CCOLUMN; AinbD = DESCA[IMB_]; AnbD = DESCA[MB_]; Ald = DESCA[LLD_]; PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR, &AiiD, &AiiR, &ArocD, &ArocR ); CiD = JC; CiR = IC; Croc = CROW; CinbD = DESCC[INB_]; CnbD = DESCC[NB_]; Cld = DESCC[LLD_]; PB_Cinfog2l( IC, JC, DESCC, CnprocsR, CnprocsD, CmyprocR, CmyprocD, &CiiR, &CiiD, &CrocR, &CrocD ); } else { AinbR = DESCA[IMB_]; AnbR = DESCA[MB_]; AsrcR = DESCA[RSRC_]; CinbR = DESCC[INB_]; CnbR = DESCC[NB_]; CsrcR = DESCC[CSRC_]; /* * If sub( A ) only spans one process row and sub( C ) spans only one process * column, then there is no need to pack the data. */ if( !( PB_Cspan( N, IA, AinbR, AnbR, AsrcR, nprow ) ) && !( PB_Cspan( N, JC, CinbR, CnbR, CsrcR, npcol ) ) ) { PB_Cpaxpby( TYPE, CONJUG, N, M, ALPHA, A, IA, JA, DESCA, ROW, BETA, C, IC, JC, DESCC, COLUMN ); return; } /* * Compute local information for sub( A ) and sub( C ) */ ACnD = M; ACnR = N; AmyprocR = CmyprocD = myrow; AnprocsR = CnprocsD = nprow; AmyprocD = CmyprocR = mycol; AnprocsD = CnprocsR = npcol; AiD = JA; AiR = IA; Aroc = CROW; AinbD = DESCA[INB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_]; PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD, &AiiR, &AiiD, &ArocR, &ArocD ); CiD = IC; CiR = JC; Croc = CCOLUMN; CinbD = DESCC[IMB_]; CnbD = DESCC[MB_]; Cld = DESCC[LLD_]; PB_Cinfog2l( IC, JC, DESCC, CnprocsD, CnprocsR, CmyprocD, CmyprocR, &CiiD, &CiiR, &CrocD, &CrocR ); } size = TYPE->size; one = TYPE->one; zero = TYPE->zero; kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); Ainb1D = PB_Cfirstnb( ACnD, AiD, AinbD, AnbD ); AnpD = PB_Cnumroc( ACnD, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD ); Ainb1R = PB_Cfirstnb( ACnR, AiR, AinbR, AnbR ); AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) ); Cinb1D = PB_Cfirstnb( ACnD, CiD, CinbD, CnbD ); CnpD = PB_Cnumroc( ACnD, 0, Cinb1D, CnbD, CmyprocD, CrocD, CnprocsD ); Cinb1R = PB_Cfirstnb( ACnR, CiR, CinbR, CnbR ); CisR = ( ( CsrcR < 0 ) || ( CnprocsR == 1 ) ); lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : CnprocsR ) ) * CnbR, ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR ); gcdPQ = PB_Cgcd( maxp, maxq ); lcmPQ = ( maxp / gcdPQ ) * maxq; /* * Loop over the processes of the virtual grid */ for( k = 0; k < gcdPQ; k++ ) { p = 0; q = k; for( l = 0; l < lcmPQ; l++ ) { AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) ); CcurrocR = ( CisR ? -1 : MModAdd( CrocR, p, CnprocsR ) ); if( ( AisR || ( AmyprocR == AcurrocR ) ) || ( CisR || ( CmyprocR == CcurrocR ) ) ) { Ckk = CiiR; Akk = AiiR; /* * Initialize local virtual matrix in process (p,q) */ CnpR = PB_Cnumroc( ACnR, 0, Cinb1R, CnbR, CcurrocR, CrocR, CnprocsR ); AnpR = PB_Cnumroc( ACnR, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR ); PB_CVMinit( &VM, 0, CnpR, AnpR, Cinb1R, Ainb1R, CnbR, AnbR, p, q, maxp, maxq, lcmb ); /* * Find how many diagonals in this virtual process */ npq = PB_CVMnpq( &VM ); /* * Re-adjust the number of rows or columns to be (un)packed, in order to * average the message sizes. */ if( npq ) kbb = npq / ( ( npq - 1 ) / kb + 1 ); if( col2row ) { while( npq ) { kbb = MIN( kbb, npq ); /* * Find out how many columns of sub( A ) and rows of sub( C ) are contiguous */ PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Aoff ); /* * Compute the descriptor DBUFA for the buffer that will contained the packed * columns of sub( A ). */ if( ( Afr = ( ncpq < kbb ) ) != 0 ) { /* * If columns of sub( A ) are not contiguous, then allocate the buffer and * pack the kbb columns of sub( A ). */ Abufld = MAX( 1, AnpD ); if( AisR || ( AmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * kbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, kbb, AnpD, one, Mptr( A, AiiD, Akk, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( A ) directly. */ Abufld = Ald; if( AisR || ( AmyprocR == AcurrocR ) ) Abuf = Mptr( A, AiiD, Akk+Aoff, Ald, size ); } PB_Cdescset( DBUFA, ACnD, kbb, Ainb1D, kbb, AnbD, kbb, ArocD, AcurrocR, ctxt, Abufld ); /* * Compute the descriptor DBUFC for the buffer that will contained the packed * rows of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < kbb ) ) != 0 ) { /* * If rows of sub( C ) are not contiguous, then allocate receiving buffer. */ Cbufld = kbb; talpha = one; tbeta = zero; if( CisR || ( CmyprocR == CcurrocR ) ) Cbuf = PB_Cmalloc( CnpD * kbb * size ); } else { /* * Otherwise, re-use sub( C ) directly. */ Cbufld = Cld; talpha = ALPHA; tbeta = BETA; if( CisR || ( CmyprocR == CcurrocR ) ) Cbuf = Mptr( C, Ckk+Coff, CiiD, Cld, size ); } PB_Cdescset( DBUFC, kbb, ACnD, kbb, Cinb1D, kbb, CnbD, CcurrocR, CrocD, ctxt, Cbufld ); /* * Transpose the one-dimensional buffer Abuf into Cbuf. */ PB_Cpaxpby( TYPE, CONJUG, ACnD, kbb, talpha, Abuf, 0, 0, DBUFA, &Aroc, tbeta, Cbuf, 0, 0, DBUFC, &Croc ); /* * Release the buffer containing the packed columns of sub( A ) */ if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) ) if( Abuf ) free( Abuf ); /* * Unpack the kbb rows of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( CmyprocR == CcurrocR ) ) ) { PB_CVMpack( TYPE, &VM, ROW, &Croc, UNPACKING, NOTRAN, kbb, CnpD, BETA, Mptr( C, Ckk, CiiD, Cld, size ), Cld, ALPHA, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } /* * Update the local column index of sub( A ) and the local row index of sub( C ) */ PB_CVMupdate( &VM, kbb, &Ckk, &Akk ); npq -= kbb; } } else { while( npq ) { kbb = MIN( kbb, npq ); /* * Find out how many rows of sub( A ) and columns of sub( C ) are contiguous */ PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Aoff ); /* * Compute the descriptor DBUFA for the buffer that will contained the packed * rows of sub( A ). */ if( ( Afr = ( ncpq < kbb ) ) != 0 ) { /* * If rows of sub( A ) are not contiguous, then allocate the buffer and pack * the kbb rows of sub( A ). */ Abufld = kbb; if( AisR || ( AmyprocR == AcurrocR ) ) { Abuf = PB_Cmalloc( AnpD * kbb * size ); PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, kbb, AnpD, one, Mptr( A, Akk, AiiD, Ald, size ), Ald, zero, Abuf, Abufld ); } } else { /* * Otherwise, re-use sub( A ) directly. */ Abufld = Ald; if( AisR || ( AmyprocR == AcurrocR ) ) Abuf = Mptr( A, Akk+Aoff, AiiD, Ald, size ); } PB_Cdescset( DBUFA, kbb, ACnD, kbb, Ainb1D, kbb, AnbD, AcurrocR, ArocD, ctxt, Abufld ); /* * Compute the descriptor DBUFC for the buffer that will contained the packed * columns of sub( C ). Allocate it. */ if( ( Cfr = ( nrpq < kbb ) ) != 0 ) { /* * If columns of sub( C ) are not contiguous, then allocate receiving buffer. */ Cbufld = MAX( 1, CnpD ); talpha = one; tbeta = zero; if( CisR || ( CmyprocR == CcurrocR ) ) Cbuf = PB_Cmalloc( CnpD * kbb * size ); } else { Cbufld = Cld; talpha = ALPHA; tbeta = BETA; if( CisR || ( CmyprocR == CcurrocR ) ) Cbuf = Mptr( C, CiiD, Ckk+Coff, Cld, size ); } PB_Cdescset( DBUFC, ACnD, kbb, Cinb1D, kbb, CnbD, kbb, CrocD, CcurrocR, ctxt, Cbufld ); /* * Transpose the one-dimensional buffer Abuf into Cbuf. */ PB_Cpaxpby( TYPE, CONJUG, kbb, ACnD, talpha, Abuf, 0, 0, DBUFA, &Aroc, tbeta, Cbuf, 0, 0, DBUFC, &Croc ); /* * Release the buffer containing the packed rows of sub( A ) */ if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) ) if( Abuf ) free( Abuf ); /* * Unpack the kbb columns of sub( C ) and release the buffer containing them. */ if( Cfr && ( CisR || ( CmyprocR == CcurrocR ) ) ) { PB_CVMpack( TYPE, &VM, ROW, &Croc, UNPACKING, NOTRAN, kbb, CnpD, BETA, Mptr( C, CiiD, Ckk, Cld, size ), Cld, ALPHA, Cbuf, Cbufld ); if( Cbuf ) free( Cbuf ); } /* * Update the local row index of sub( A ) and the local column index of sub( C ) */ PB_CVMupdate( &VM, kbb, &Ckk, &Akk ); npq -= kbb; } } } p = MModAdd1( p, maxp ); q = MModAdd1( q, maxq ); } } /* * End of PB_Cptran */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cptrm.c000644 000766 000024 00000105041 10363532303 020032 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cptrm( PBTYP_T * TYPE, PBTYP_T * UTYP, char * SIDE, char * UPLO, char * TRANS, char * DIAG, int N, int K, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * X, int LDX, char * Y, int LDY, TZTRM_T TRM ) #else void PB_Cptrm( TYPE, UTYP, SIDE, UPLO, TRANS, DIAG, N, K, ALPHA, A, IA, JA, DESCA, X, LDX, Y, LDY, TRM ) /* * .. Scalar Arguments .. */ char * DIAG, * SIDE, * TRANS, * UPLO; int IA, JA, K, LDX, LDY, N; char * ALPHA; PBTYP_T * TYPE, * UTYP; TZTRM_T TRM; /* * .. Array Arguments .. */ int * DESCA; char * A, * X, * Y; #endif { /* * Purpose * ======= * * PB_Cptrm performs a triangular matrix-matrix or matrix-vector multi- * plication. In the following, sub( A ) denotes the triangular subma- * trix operand A( IA:IA+N-1, JA:JA+N-1 ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UTYP (local input) pointer to a PBTYP_T structure * On entry, UTYP is a pointer to a structure of type PBTYP_T, * that contains type information for the Y's (See pblas.h). * * SIDE (global input) pointer to CHAR * On entry, SIDE specifies whether op( sub( A ) ) multiplies * its operand X from the left or right as follows: * * SIDE = 'L' or 'l' Y := alpha*op( sub( A ) )*X + Y, * * SIDE = 'R' or 'r' Y := alpha*X*op( sub( A ) ) + Y. * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) pointer to CHAR * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' Y := alpha * sub( A ) * X + Y, * * TRANS = 'T' or 't' Y := alpha * sub( A )' * X + Y, * * TRANS = 'C' or 'c' Y := alpha * sub( A )' * X + Y, or * Y := alpha * conjg(sub( A )') * X + Y. * * DIAG (global input) pointer to CHAR * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * K (global input) INTEGER * On entry, K specifies the local number of columns of the lo- * cal array X and the local number of rows of the local array * Y when SIDE is 'L' or 'l' and TRANS is 'N' or 'n', or SIDE is * 'R' or 'r' and TRANS is 'T', 't', 'C' or 'c'. Otherwise, K * specifies the local number of rows of the local array X and * the local number of columns of the local array Y. K mut be at * least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the triangular submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the triangular submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * X (local input) pointer to CHAR * On entry, X is an array of dimension (LDX,Kx), where Kx is * at least Lc( JA, N ) when SIDE is 'L' or 'l' and TRANS is 'N' * or 'n', or SIDE is 'R' or 'r' and TRANS is 'T', 't', 'C' or * 'c', and K otherwise. Before entry, this array contains the * local entries of the matrix X. * * LDX (local input) INTEGER * On entry, LDX specifies the leading dimension of the array X. * LDX must be at least K when SIDE is 'L' or 'l' and TRANS is * 'N' or 'n', or SIDE is 'R' or 'r' and TRANS is 'T', 't', 'C' * or 'c', and max( 1, Lp( IA, N ) ) otherwise. * * Y (local input/local output) pointer to CHAR * On entry, Y is an array of dimension ( LDY, Ky ), where Ky is * at least max( 1, K ) when SIDE is 'L' or 'l' and TRANS is 'N' * or 'n', or SIDE is 'R' or 'r' and TRANS is 'T', 't', 'C' or * 'c', and max( 1, Lc( JA, N ) ) otherwise. Before entry, this * array contains the local entries of the matrix Y. On exit, * this array contains the updated vector Y. * * LDY (local input) INTEGER * On entry, LDY specifies the leading dimension of the array Y. * LDY must be at least max( 1, Lr( IA, N ) ) when SIDE is 'L' * or 'l' and TRANS is 'N' or 'n', or SIDE is 'R' or 'r' and * TRANS is 'T', 't', 'C' or 'c', and max( 1, K ) otherwise. * * TRM (local input) pointer to function of type TZTRM_T * On entry, TRM specifies the function performing the triangu- * lar multiplication of a single block. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int Acol, Arow, Aii, Aimb1, Ainb1, Ajj, Ald, Amp, Amb, Anb, Anq, Aoffi, Aoffj, Arcol, Arrow, GoEast, GoSouth, IsColRepl, IsRowRepl, Xinc, Yinc, XYii=0, XYjj=0, XYoffi=-1, XYoffj=-1, XisRow, iimax, ilow, imbloc, inbloc, ioffd, ioffxy, iupp, jjmax, joffd, joffxy, lcmt, lcmt00, lmbloc, lnbloc, low, lower, m1, mbloc, mblkd, mblks, mycol, myrow, n1, nbloc, nblkd, nblks, npcol, nprow, pmb, qnb, size, tmp1, upp, upper; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( N <= 0 ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( DESCA[CTXT_], &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cainfog2l( N, N, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aimb1, &Ainb1, &Amp, &Anq, &Aii, &Ajj, &Arow, &Acol, &Arrow, &Arcol ); /* * Quick return if I don't own any of sub( A ) or if sub( A ) is replicated in * all processes. */ if( ( Amp <= 0 ) || ( Anq <= 0 ) ) return; IsRowRepl = ( ( Arow < 0 ) || ( nprow == 1 ) ); IsColRepl = ( ( Acol < 0 ) || ( npcol == 1 ) ); Amb = DESCA[ MB_ ]; Anb = DESCA[ NB_ ]; Ald = DESCA[ LLD_ ]; size = TYPE->size; if( IsRowRepl && IsColRepl ) { TRM( TYPE, SIDE, UPLO, TRANS, DIAG, Amp, Anq, K, 0, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), Ald, X, LDX, Y, LDY ); return; } if( Mupcase( SIDE[0] ) == CLEFT ) { if( Mupcase( TRANS[0] ) == CNOTRAN ) { XisRow = 1; Xinc = LDX * size; Yinc = UTYP->size; } else { XisRow = 0; Xinc = size; Yinc = LDY * UTYP->size; } } else { if( Mupcase( TRANS[0] ) == CNOTRAN ) { XisRow = 0; Xinc = size; Yinc = LDY * UTYP->size; } else { XisRow = 1; Xinc = LDX * size; Yinc = UTYP->size; } } upper = ( Mupcase( UPLO[0] ) == CUPPER ); lower = ( Mupcase( UPLO[0] ) == CLOWER ); /* * Initialize lcmt00, mblks, nblks, imbloc, inbloc, lmbloc, lnbloc, ilow, low, * iupp, and upp. */ PB_Cbinfo( 0, Amp, Anq, Aimb1, Ainb1, Amb, Anb, Arrow, Arcol, &lcmt00, &mblks, &nblks, &imbloc, &inbloc, &lmbloc, &lnbloc, &ilow, &low, &iupp, &upp ); iimax = ( Aoffi = Aii - 1 ) + ( m1 = Amp ); jjmax = ( Aoffj = Ajj - 1 ) + ( n1 = Anq ); pmb = ( IsRowRepl ? Amb : nprow * Amb ); qnb = ( IsColRepl ? Anb : npcol * Anb ); /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); if( XisRow ) { /* * Go through the table looking for blocks owning diagonal entries. */ if( ( !( GoSouth ) ) && ( !( GoEast ) ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ TRM( TYPE, SIDE, UPLO, TRANS, DIAG, imbloc, inbloc, K, lcmt00, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), Ald, X+XYjj*Xinc, LDX, Y+XYii*Yinc, LDY ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); if( GoSouth ) { /* * When the upper triangular part of sub( A ) should be operated with and * one is planning to go south in the table, it is neccessary to take care * of the remaining columns of these imbloc rows immediately. */ if( upper && ( Anq > inbloc ) ) { tmp1 = Anq - inbloc; TRM( TYPE, SIDE, ALL, TRANS, DIAG, imbloc, tmp1, K, 0, ALPHA, Mptr( A, Aii, Ajj+inbloc, Ald, size ), Ald, X+(XYjj+inbloc)*Xinc, LDX, Y+XYii*Yinc, LDY ); } Aii += imbloc; XYii += imbloc; m1 -= imbloc; } else { /* * When the lower triangular part of sub( A ) should be operated with and * one is planning to go east in the table, it is neccessary to take care * of the remaining rows of these inbloc columns immediately. */ if( lower && ( Amp > imbloc ) ) { tmp1 = Amp - imbloc; TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, inbloc, K, 0, ALPHA, Mptr( A, Aii+imbloc, Ajj, Ald, size ), Ald, X+XYjj*Xinc, LDX, Y+(XYii+imbloc)*Yinc, LDY ); } Ajj += inbloc; XYjj += inbloc; n1 -= inbloc; } } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the local row indexes in A and XC. */ lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; XYoffi += imbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row indexes in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; XYoffi += Amb; } /* * Operate with the upper triangular part of sub( A ) we just skipped when * necessary. */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, X+(XYoffj+1)*Xinc, LDX, Y+XYii*Yinc, LDY ); Aii += tmp1; XYii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffxy = XYoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; TRM( TYPE, SIDE, UPLO, TRANS, DIAG, mbloc, inbloc, K, lcmt, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, X+(XYoffj+1)*Xinc, LDX, Y+(ioffxy+1)*Yinc, LDY ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; XYoffi = ioffxy; ioffd += mbloc; ioffxy += mbloc; } /* * Operate with the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, inbloc, K, 0, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, X+(XYoffj+1)*Xinc, LDX, Y+(ioffxy+1)*Yinc, LDY ); } tmp1 = Aoffi - Aii + 1; m1 -= tmp1; n1 -= inbloc; lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; XYoffj += inbloc; /* * Operate with the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, X+(XYoffj+1)*Xinc, LDX, Y+XYii*Yinc, LDY ); } Aii = Aoffi + 1; Ajj = Aoffj + 1; XYii = XYoffi + 1; XYjj = XYoffj + 1; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as well as * the local column index in A and XR. */ lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; XYoffj += inbloc; /* * While there are blocks remaining that own lower entries, keep going east. * Adjust the current LCM value as well as the local column index in A and XR. */ while( ( nblks > 0 ) && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; Aoffj += Anb; XYoffj += Anb; } /* * Operate with the lower triangular part of sub( A ). */ tmp1 = MIN( Aoffj, jjmax ) - Ajj + 1; if( lower && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, m1, tmp1, K, 0, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), Ald, X+XYjj*Xinc, LDX, Y+XYii*Yinc, LDY ); Ajj += tmp1; XYjj += tmp1; n1 -= tmp1; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return; /* * lcmt00 >= low. The current block owns either diagonals or upper entries. * Save the current position in the LCM table. After this row has been * completely taken care of, re-start from this column and the next row of * the LCM table. */ lcmt = lcmt00; nblkd = nblks; joffd = Aoffj; joffxy = XYoffj; nbloc = Anb; while( ( nblkd > 0 ) && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ if( nblkd == 1 ) nbloc = lnbloc; TRM( TYPE, SIDE, UPLO, TRANS, DIAG, imbloc, nbloc, K, lcmt, ALPHA, Mptr( A, Aii, joffd+1, Ald, size ), Ald, X+(joffxy+1)*Xinc, LDX, Y+XYii*Yinc, LDY ); lcmt00 = lcmt; lcmt += qnb; nblks = nblkd; nblkd--; Aoffj = joffd; XYoffj = joffxy; joffd += nbloc; joffxy += nbloc; } /* * Operate with the upper triangular part of sub( A ). */ tmp1 = n1 - joffd + Ajj - 1; if( upper && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, imbloc, tmp1, K, 0, ALPHA, Mptr( A, Aii, joffd+1, Ald, size ), Ald, X+(joffxy+1)*Xinc, LDX, Y+XYii*Yinc, LDY ); } tmp1 = Aoffj - Ajj + 1; m1 -= imbloc; n1 -= tmp1; lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; XYoffi += imbloc; /* * Operate with the lower triangular part of sub( A ). */ if( lower && ( m1 > 0 ) && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, m1, tmp1, K, 0, ALPHA, Mptr( A, Aoffi+1, Ajj, Ald, size ), Ald, X+XYjj*Xinc, LDX, Y+(XYoffi+1)*Yinc, LDY ); } Aii = Aoffi + 1; Ajj = Aoffj + 1; XYii = XYoffi + 1; XYjj = XYoffj + 1; } /* * Loop over the remaining columns of the LCM table. */ nbloc = Anb; while( nblks > 0 ) { if( nblks == 1 ) nbloc = lnbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row index in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; XYoffi += Amb; } /* * Operate with the upper triangular part of sub( A ). */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, X+(XYoffj+1)*Xinc, LDX, Y+XYii*Yinc, LDY ); Aii += tmp1; XYii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffxy = XYoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; TRM( TYPE, SIDE, UPLO, TRANS, DIAG, mbloc, nbloc, K, lcmt, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, X+(XYoffj+1)*Xinc, LDX, Y+(ioffxy+1)*Yinc, LDY ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; XYoffi = ioffxy; ioffd += mbloc; ioffxy += mbloc; } /* * Operate with the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, nbloc, K, 0, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, X+(XYoffj+1)*Xinc, LDX, Y+(ioffxy+1)*Yinc, LDY ); } tmp1 = MIN( Aoffi, iimax ) - Aii + 1; m1 -= tmp1; n1 -= nbloc; lcmt00 += qnb; nblks--; Aoffj += nbloc; XYoffj += nbloc; /* * Operate with the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, X+(XYoffj+1)*Xinc, LDX, Y+XYii*Yinc, LDY ); } Aii = Aoffi + 1; Ajj = Aoffj + 1; XYii = XYoffi + 1; XYjj = XYoffj + 1; } } else { /* * Go through the table looking for blocks owning diagonal entries. */ if( ( !( GoSouth ) ) && ( !( GoEast ) ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ TRM( TYPE, SIDE, UPLO, TRANS, DIAG, imbloc, inbloc, K, lcmt00, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), Ald, X+XYii*Xinc, LDX, Y+XYjj*Yinc, LDY ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); if( GoSouth ) { /* * When the upper triangular part of sub( A ) should be operated with and * one is planning to go south in the table, it is neccessary to take care * of the remaining columns of these imbloc rows immediately. */ if( upper && ( Anq > inbloc ) ) { tmp1 = Anq - inbloc; TRM( TYPE, SIDE, ALL, TRANS, DIAG, imbloc, tmp1, K, 0, ALPHA, Mptr( A, Aii, Ajj+inbloc, Ald, size ), Ald, X+XYii*Xinc, LDX, Y+(XYjj+inbloc)*Yinc, LDY ); } Aii += imbloc; XYii += imbloc; m1 -= imbloc; } else { /* * When the lower triangular part of sub( A ) should be operated with and * one is planning to go east in the table, it is neccessary to take care * of the remaining rows of these inbloc columns immediately. */ if( lower && ( Amp > imbloc ) ) { tmp1 = Amp - imbloc; TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, inbloc, K, 0, ALPHA, Mptr( A, Aii+imbloc, Ajj, Ald, size ), Ald, X+(XYii+imbloc)*Xinc, LDX, Y+XYjj*Yinc, LDY ); } Ajj += inbloc; XYjj += inbloc; n1 -= inbloc; } } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the local row indexes in A and XC. */ lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; XYoffi += imbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row indexes in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; XYoffi += Amb; } /* * Operate with the upper triangular part of sub( A ) we just skipped when * necessary. */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, X+XYii*Xinc, LDX, Y+(XYoffj+1)*Yinc, LDY ); Aii += tmp1; XYii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffxy = XYoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; TRM( TYPE, SIDE, UPLO, TRANS, DIAG, mbloc, inbloc, K, lcmt, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, X+(ioffxy+1)*Xinc, LDX, Y+(XYoffj+1)*Yinc, LDY ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; XYoffi = ioffxy; ioffd += mbloc; ioffxy += mbloc; } /* * Operate with the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, inbloc, K, 0, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, X+(ioffxy+1)*Xinc, LDX, Y+(XYoffj+1)*Yinc, LDY ); } tmp1 = Aoffi - Aii + 1; m1 -= tmp1; n1 -= inbloc; lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; XYoffj += inbloc; /* * Operate with the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, X+XYii*Xinc, LDX, Y+(XYoffj+1)*Yinc, LDY ); } Aii = Aoffi + 1; Ajj = Aoffj + 1; XYii = XYoffi + 1; XYjj = XYoffj + 1; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as well as * the local column index in A and XR. */ lcmt00 += low - ilow + qnb; nblks--; Aoffj += inbloc; XYoffj += inbloc; /* * While there are blocks remaining that own lower entries, keep going east. * Adjust the current LCM value as well as the local column index in A and XR. */ while( ( nblks > 0 ) && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; Aoffj += Anb; XYoffj += Anb; } /* * Operate with the lower triangular part of sub( A ). */ tmp1 = MIN( Aoffj, jjmax ) - Ajj + 1; if( lower && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, m1, tmp1, K, 0, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), Ald, X+XYii*Xinc, LDX, Y+XYjj*Yinc, LDY ); Ajj += tmp1; XYjj += tmp1; n1 -= tmp1; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return; /* * lcmt00 >= low. The current block owns either diagonals or upper entries. * Save the current position in the LCM table. After this row has been * completely taken care of, re-start from this column and the next row of * the LCM table. */ lcmt = lcmt00; nblkd = nblks; joffd = Aoffj; joffxy = XYoffj; nbloc = Anb; while( ( nblkd > 0 ) && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ if( nblkd == 1 ) nbloc = lnbloc; TRM( TYPE, SIDE, UPLO, TRANS, DIAG, imbloc, nbloc, K, lcmt, ALPHA, Mptr( A, Aii, joffd+1, Ald, size ), Ald, X+XYii*Xinc, LDX, Y+(joffxy+1)*Yinc, LDY ); lcmt00 = lcmt; lcmt += qnb; nblks = nblkd; nblkd--; Aoffj = joffd; XYoffj = joffxy; joffd += nbloc; joffxy += nbloc; } /* * Operate with the upper triangular part of sub( A ). */ tmp1 = n1 - joffd + Ajj - 1; if( upper && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, imbloc, tmp1, K, 0, ALPHA, Mptr( A, Aii, joffd+1, Ald, size ), Ald, X+XYii*Xinc, LDX, Y+(joffxy+1)*Yinc, LDY ); } tmp1 = Aoffj - Ajj + 1; m1 -= imbloc; n1 -= tmp1; lcmt00 -= ( iupp - upp + pmb ); mblks--; Aoffi += imbloc; XYoffi += imbloc; /* * Operate with the lower triangular part of sub( A ). */ if( lower && ( m1 > 0 ) && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, m1, tmp1, K, 0, ALPHA, Mptr( A, Aoffi+1, Ajj, Ald, size ), Ald, X+(XYoffi+1)*Xinc, LDX, Y+XYjj*Yinc, LDY ); } Aii = Aoffi + 1; Ajj = Aoffj + 1; XYii = XYoffi + 1; XYjj = XYoffj + 1; } /* * Loop over the remaining columns of the LCM table. */ nbloc = Anb; while( nblks > 0 ) { if( nblks == 1 ) nbloc = lnbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the local row index in A and XC. */ while( ( mblks > 0 ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Aoffi += Amb; XYoffi += Amb; } /* * Operate with the upper triangular part of sub( A ). */ tmp1 = MIN( Aoffi, iimax ) - Aii + 1; if( upper && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, X+XYii*Xinc, LDX, Y+(XYoffj+1)*Yinc, LDY ); Aii += tmp1; XYii += tmp1; m1 -= tmp1; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; ioffd = Aoffi; ioffxy = XYoffi; mbloc = Amb; while( ( mblkd > 0 ) && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; TRM( TYPE, SIDE, UPLO, TRANS, DIAG, mbloc, nbloc, K, lcmt, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, X+(ioffxy+1)*Xinc, LDX, Y+(XYoffj+1)*Yinc, LDY ); lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd; mblkd--; Aoffi = ioffd; XYoffi = ioffxy; ioffd += mbloc; ioffxy += mbloc; } /* * Operate with the lower triangular part of sub( A ). */ tmp1 = m1 - ioffd + Aii - 1; if( lower && ( tmp1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, nbloc, K, 0, ALPHA, Mptr( A, ioffd+1, Aoffj+1, Ald, size ), Ald, X+(ioffxy+1)*Xinc, LDX, Y+(XYoffj+1)*Yinc, LDY ); } tmp1 = MIN( Aoffi, iimax ) - Aii + 1; m1 -= tmp1; n1 -= nbloc; lcmt00 += qnb; nblks--; Aoffj += nbloc; XYoffj += nbloc; /* * Operate with the upper triangular part of sub( A ). */ if( upper && ( tmp1 > 0 ) && ( n1 > 0 ) ) { TRM( TYPE, SIDE, ALL, TRANS, DIAG, tmp1, n1, K, 0, ALPHA, Mptr( A, Aii, Aoffj+1, Ald, size ), Ald, X+XYii*Xinc, LDX, Y+(XYoffj+1)*Yinc, LDY ); } Aii = Aoffi + 1; Ajj = Aoffj + 1; XYii = XYoffi + 1; XYjj = XYoffj + 1; } } /* * End of PB_Cptrm */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CptrmmAB.c000644 000766 000024 00000122427 10363532303 020421 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CptrmmAB( PBTYP_T * TYPE, char * VARIANT, char * SIDE, char * UPLO, char * TRANSA, char * DIAG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB ) #else void PB_CptrmmAB( TYPE, VARIANT, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ char * DIAG, * SIDE, * TRANSA, * UPLO, * VARIANT; int IA, IB, JA, JB, M, N; char * ALPHA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * B; #endif { /* * Purpose * ======= * * PB_CptrmmAB performs one of the matrix-matrix operations * * sub( B ) := alpha * op( sub( A ) ) * sub( B ), * * or * * sub( B ) := alpha * sub( B ) * op( sub( A ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, sub( B ) is an m by n submatrix, sub( A ) is a * unit, or non-unit, upper or lower triangular submatrix and op( X ) is * one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * This is the outer-product algorithm using the logical aggregation * blocking technique. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * VARIANT (global input) pointer to CHAR * On entry, VARIANT specifies whether the left- or right-loo- * king variant of the algorithm should be used for the transpo- * se cases only, that is TRANSA is not 'N' or 'n'. When VARIANT * is 'L' or 'l', the left-looking variant is used, otherwise * the right-looking algorithm is selected. * * SIDE (global input) pointer to CHAR * On entry, SIDE specifies whether op( sub( A ) ) multiplies * sub( B ) from the left or right as follows: * * SIDE = 'L' or 'l' sub( B ) := alpha*op( sub( A ) )*sub( B ), * * SIDE = 'R' or 'r' sub( B ) := alpha*sub( B )*op( sub( A ) ). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) pointer to CHAR * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * DIAG (global input) pointer to CHAR * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n transformed submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char conjg, * one, top, * zero; int Afr, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii, Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq, Bnq0, Brow, WAfr, WBfr, WBsum, ctxt, k, kb, kbb, ktmp, lside, mycol, myrow, notran, npcol, nprow, size, unit, upper; /* * .. Local Arrays .. */ int Bd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd[DLEN_], WBd[DLEN_]; char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * WA = NULL, * WB = NULL; GEMM_T gemm; GSUM2D_T gsum2d; /* .. * .. Executable Statements .. * */ lside = ( Mupcase( SIDE [0] ) == CLEFT ); upper = ( Mupcase( UPLO [0] ) == CUPPER ); notran = ( Mupcase( TRANSA[0] ) == CNOTRAN ); unit = ( Mupcase( DIAG [0] ) == CUNIT ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm; kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol, Bld ... */ Bimb = DESCB[IMB_]; Binb = DESCB[INB_]; Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb ); Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow ); Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb ); Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size ); if( notran ) { if( lside ) { if( upper ) { for( k = 0; k < M; k += kb ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, ktmp, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over * B( IB:IB+k+kbb-1, JB:JB+N-1 ) */ PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Zero lower triangle of WA( k:k+kbb-1, 0:kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one, WA, k, 0, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, k+1, 0, WAd ); /* * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, N, B, IB+k, JB, DESCB, ROW, &Bptr, DBUFB, &Bfr ); /* * Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over * B( IB:IB+k+kbb-1, JB:JB+N-1) */ PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k, JB, DESCB ); /* * B( IB:IB+k+kbb-1, JB:JB+N-1 ) := ALPHA * WA * WB */ Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0, &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one, Bptr0, &Bld ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } else { for( k = ( ( M - 1 ) / kb ) * kb; k >= 0; k -= kb ) { ktmp = M - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, ktmp, kbb, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over * B( IB+k:IB+M-1, JB:JB+N-1 ) */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Zero upper triangle of WA( 0:kbb-1, 0:kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one, WA, 0, 0, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, 0, 1, WAd ); /* * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, N, B, IB+k, JB, DESCB, ROW, &Bptr, DBUFB, &Bfr ); /* * Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over * B( IB+k:IB+M-1, JB:JB+N-1 ) */ PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k, JB, DESCB ); /* * B( IB+k:IB+M-1, JB:JB+N-1 ) := ALPHA * WA * WB */ Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0, &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one, Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } } else { if( upper ) { for( k = ( ( N - 1 ) / kb ) * kb; k >= 0; k -= kb ) { ktmp = N - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, ktmp, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over * B( IB:IB+M-1, JB+k:JB+N-1 ) */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Zero lower triangle of WA( 0:kbb-1, 0:kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one, WA, 0, 0, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, 1, 0, WAd ); /* * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, M, kbb, B, IB, JB+k, DESCB, COLUMN, &Bptr, DBUFB, &Bfr ); /* * Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over * B( IB:IB+M-1, JB+k:JB+N-1 ) */ PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB, JB+k, DESCB ); /* * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA */ Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq, &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } else { for( k = 0; k < N; k += kb ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, ktmp, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) over * B( IB:IB+M-1, JB:JB+k+kbb-1 ) */ PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Zero upper triangle of WA( 0:kbb-1, k:k+kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one, WA, 0, k, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, 0, k+1, WAd ); /* * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, M, kbb, B, IB, JB+k, DESCB, COLUMN, &Bptr, DBUFB, &Bfr ); /* * Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over * B( IB:IB+M-1, JB:JB+k+kbb-1 ) */ PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB, JB+k, DESCB ); /* * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA */ Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq, &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one, Bptr0, &Bld ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } } } else { if( Mupcase( VARIANT[0] ) == CRIGHT ) { /* * Right looking variant for the transpose cases */ conjg = ( ( Mupcase( TRANSA[0] ) == CCOTRAN ) ? CCONJG : CNOCONJG ); if( lside ) { if( !upper ) { for( k = 0; k < M; k += kb ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, ktmp, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )' over * B( IB:IB+k+kbb-1, JB:JB+N-1 ) */ PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Zero lower triangle of WA( k:k+kbb-1, 0:kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one, WA, k, 0, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, k+1, 0, WAd ); /* * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, N, B, IB+k, JB, DESCB, ROW, &Bptr, DBUFB, &Bfr ); /* * Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over * B( IB:IB+k+kbb-1, JB:JB+N-1) */ PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k, JB, DESCB ); /* * B( IB:IB+k+kbb-1, JB:JB+N-1 ) := ALPHA * WA * WB */ Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0, &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one, Bptr0, &Bld ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } else { for( k = ( ( M - 1 ) / kb ) * kb; k >= 0; k -= kb ) { ktmp = M - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, ktmp, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )' over * B( IB+k:IB+M-1, JB:JB+N-1 ) */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Zero upper triangle of WA( 0:kbb-1, 0:kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one, WA, 0, 0, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, 0, 1, WAd ); /* * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, N, B, IB+k, JB, DESCB, ROW, &Bptr, DBUFB, &Bfr ); /* * Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over * B( IB+k:IB+M-1, JB:JB+N-1 ) */ PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) */ PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k, JB, DESCB ); /* * B( IB+k:IB+M-1, JB:JB+N-1 ) := ALPHA * WA * WB */ Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0, &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one, Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } } else { if( !upper ) { for( k = ( ( N - 1 ) / kb ) * kb; k >= 0; k -= kb ) { ktmp = N - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, ktmp, kbb, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )' over * B( IB:IB+M-1, JB+k:JB+N-1 ) */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Zero lower triangle of WA( 0:kbb-1, 0:kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one, WA, 0, 0, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, 1, 0, WAd ); /* * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, M, kbb, B, IB, JB+k, DESCB, COLUMN, &Bptr, DBUFB, &Bfr ); /* * Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over * B( IB:IB+M-1, JB+k:JB+N-1 ) */ PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB, JB+k, DESCB ); /* * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA */ Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq, &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } else { for( k = 0; k < N; k += kb ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, ktmp, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )' over * B( IB:IB+M-1, JB:JB+k+kbb-1 ) */ PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Zero upper triangle of WA( 0:kbb-1, k:k+kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one, WA, 0, k, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, 0, k+1, WAd ); /* * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, M, kbb, B, IB, JB+k, DESCB, COLUMN, &Bptr, DBUFB, &Bfr ); /* * Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over * B( IB:IB+M-1, JB:JB+k+kbb-1 ) */ PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) */ PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB, JB+k, DESCB ); /* * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA */ Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq, &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one, Bptr0, &Bld ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } } } else { /* * Left looking variant for the transpose cases */ if( lside ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( upper ) { for( k = ( ( M - 1 ) / kb ) * kb; k >= 0; k -= kb ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, ktmp, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over * B( IB:IB+k+kbb-1, JB:JB+N-1 ) */ PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Zero lower triangle of WA( k:k+kbb-1, 0:kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one, WA, k, 0, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, k+1, 0, WAd ); /* * WB := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )' * B( IB:IB+k+kbb-1, JB:JB+N-1 ) */ PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr, &WBsum ); Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bnq0 > 0 ) && ( Bmp > 0 ) ) gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp, ALPHA, WA, &WAd[LLD_], Bptr0, &Bld, zero, WB, &WBd[LLD_] ); if( WBsum ) { WBd[RSRC_] = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); if( Bnq0 > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_], WBd[RSRC_], mycol ); } if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) := WB */ PB_CScatterV( TYPE, FORWARD, kbb, N, WB, 0, 0, WBd, ROW, zero, B, IB+k, JB, DESCB, ROW ); if( WBfr ) free( WB ); } } else { for( k = 0; k < M; k += kb ) { ktmp = M - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, ktmp, kbb, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over * B( IB+k:IB+M-1, JB:JB+N-1 ) */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Zero upper triangle of WA( 0:kbb-1, 0:kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one, WA, 0, 0, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, 0, 1, WAd ); /* * WB := A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )' * B( IB+k:IB+M-1, JB:JB+N-1 ) */ PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr, &WBsum ); Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bnq0 > 0 ) && ( Bmp > 0 ) ) gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp, ALPHA, WA, &WAd[LLD_], Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld, zero, WB, &WBd[LLD_] ); if( WBsum ) { WBd[RSRC_] = PB_Cindxg2p( k + kbb - 1, Bimb1, Bmb, Brow, Brow, nprow ); if( Bnq0 > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_], WBd[RSRC_], mycol ); } if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) := WB */ PB_CScatterV( TYPE, BACKWARD, kbb, N, WB, 0, 0, WBd, ROW, zero, B, IB+k, JB, DESCB, ROW ); if( WBfr ) free( WB ); } } } else { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); if( upper ) { for( k = 0; k < N; k += kb ) { ktmp = N - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) */ PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, ktmp, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over * B( IB:IB+M-1, JB+k:JB+N-1 ) */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Zero lower triangle of WA( 0:kbb-1, 0:kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one, WA, 0, 0, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, 1, 0, WAd ); /* * WB := B( IB:IB+M-1, JB+k:JB+N-1 ) * A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )' */ PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd, &WBfr, &WBsum ); Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb, &Bnq, ALPHA, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld, WA, &WAd[LLD_], zero, WB, &WBd[LLD_] ); if( WBsum ) { WBd[CSRC_] = PB_Cindxg2p( k + kbb - 1, Binb1, Bnb, Bcol, Bcol, npcol ); if( Bmp0 > 0 ) gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_], myrow, WBd[CSRC_] ); } if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := WB */ PB_CScatterV( TYPE, BACKWARD, M, kbb, WB, 0, 0, WBd, COLUMN, zero, B, IB, JB+k, DESCB, COLUMN ); if( WBfr ) free( WB ); } } else { for( k = ( ( N - 1 ) / kb ) * kb; k >= 0; k -= kb ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, ktmp, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate WA := A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) over * B( IB:IB+M-1, JB:JB+k+kbb-1 ) */ PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Zero upper triangle of WA( 0:kbb-1, k:k+kbb-1 ) */ if( unit ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one, WA, 0, k, WAd ); else if( kbb > 1 ) PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WA, 0, k+1, WAd ); /* * WB := B( IB:IB+M-1, JB:JB+k+kbb-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )' */ PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd, &WBfr, &WBsum ); Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb, &Bnq, ALPHA, Bptr0, &Bld, WA, &WAd[LLD_], zero, WB, &WBd[LLD_] ); if( WBsum ) { WBd[CSRC_] = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); if( Bmp0 > 0 ) gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_], myrow, WBd[CSRC_] ); } if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := WB */ PB_CScatterV( TYPE, FORWARD, M, kbb, WB, 0, 0, WBd, COLUMN, zero, B, IB, JB+k, DESCB, COLUMN ); if( WBfr ) free( WB ); } } } } } /* * End of PB_CptrmmAB */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CptrmmB.c000644 000766 000024 00000057526 10363532303 020327 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CptrmmB( PBTYP_T * TYPE, char * DIRECB, char * SIDE, char * UPLO, char * TRANSA, char * DIAG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB ) #else void PB_CptrmmB( TYPE, DIRECB, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ char * DIAG, * DIRECB, * SIDE, * TRANSA, * UPLO; int IA, IB, JA, JB, M, N; char * ALPHA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * B; #endif { /* * Purpose * ======= * * PB_CptrmmB performs one of the matrix-matrix operations * * sub( B ) := alpha * op( sub( A ) ) * sub( B ), * * or * * sub( B ) := alpha * sub( B ) * op( sub( A ) ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, sub( B ) is an m by n submatrix, sub( A ) is a * unit, or non-unit, upper or lower triangular submatrix and op( X ) is * one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ). * * This is the inner-product algorithm using the logical LCM hybrid * and static blocking techniques. The submatrix operand sub( A ) stays * in place. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECB (global input) pointer to CHAR * On entry, DIRECB specifies the direction in which the rows * or columns of sub( B ) should be looped over as follows: * DIRECB = 'F' or 'f' forward or increasing, * DIRECB = 'B' or 'b' backward or decreasing. * * SIDE (global input) pointer to CHAR * On entry, SIDE specifies whether op( sub( A ) ) multiplies * sub( B ) from the left or right as follows: * * SIDE = 'L' or 'l' sub( B ) := alpha*op( sub( A ) )*sub( B ), * * SIDE = 'R' or 'r' sub( B ) := alpha*sub( B )*op( sub( A ) ). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) pointer to CHAR * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * DIAG (global input) pointer to CHAR * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n transformed submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Broc, GemmTa, GemmTb, TranOp, WBroc, WCroc, conjg, * one, * talpha, * tbeta, top, * zero; int Acol, Aii, Aimb1, Ainb1, Ajj, Alcmb, Ald, Alp, Alp0, Alq, Alq0, Amb, Amp, An, Anb, Anq, Arow, BcurrocR, Bfwd, BiiD, BiiR, Binb1D, Binb1R, BisR, Bld, BmyprocD, BmyprocR, BnD, BnR, BnbD, BnbR, BnpR, BnprocsD, BnprocsR, BrocD, BrocR, BsrcR, LNorRT, WBfr, WBld, WCfr, WCld, WCpbY, WCsum, ctxt, l, lb, lside, ltmp, mycol, myrow, n, nb, nbb, notran, npcol, nprow, p=0, size, tmp, upper; GEMM_T gemm; GSUM2D_T gsum2d; /* * .. Local Arrays .. */ int Ad0[DLEN_], DBUFB[DLEN_], WCd[DLEN_], WBd[DLEN_]; char * Aptr = NULL, * Bptr = NULL, * WB = NULL, * WC = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD ); lside = ( Mupcase( SIDE [0] ) == CLEFT ); upper = ( Mupcase( UPLO [0] ) == CUPPER ); notran = ( ( TranOp = Mupcase( TRANSA[0] ) ) == CNOTRAN ); LNorRT = ( lside && notran ) || ( !( lside ) && !( notran ) ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d; nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Compute local information for sub( A ) and sub( B ) */ if( lside ) { BnD = An = M; BnR = N; Broc = CCOLUMN; BmyprocD = myrow; BnprocsD = nprow; BmyprocR = mycol; BnprocsR = npcol; BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ]; BsrcR = DESCB[CSRC_]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR, &BiiD, &BiiR, &BrocD, &BrocR ); Binb1D = PB_Cfirstnb( BnD, IB, DESCB[IMB_], BnbD ); Binb1R = PB_Cfirstnb( BnR, JB, DESCB[INB_], BnbR ); } else { BnD = An = N; BnR = M; Broc = CROW; BmyprocD = mycol; BnprocsD = npcol; BmyprocR = myrow; BnprocsR = nprow; BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ]; BsrcR = DESCB[RSRC_]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD, &BiiR, &BiiD, &BrocR, &BrocD ); Binb1D = PB_Cfirstnb( BnD, JB, DESCB[INB_], BnbD ); Binb1R = PB_Cfirstnb( BnR, IB, DESCB[IMB_], BnbR ); } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); Amp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( A, Aii, Ajj, Ald, size ); /* * Compute conjugate of alpha for the conjugate transpose cases */ if( TranOp == CCOTRAN ) { conjg = CCONJG; talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha ); } else { conjg = CNOCONJG; talpha = ALPHA; } /* * Retrieve BLACS combine topology, set the transpose parameters to be passed * to the BLAS matrix multiply routine and finally describe the form of the * input and output operands. */ if( LNorRT ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); GemmTa = CNOTRAN; GemmTb = ( lside ? CTRAN : TranOp ); WCroc = CCOLUMN; WBroc = CROW; } else { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); GemmTb = CNOTRAN; GemmTa = ( lside ? TranOp : CTRAN ); WCroc = CROW; WBroc = CCOLUMN; } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ Alcmb = 2 * nb * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); /* * When sub( B ) is not replicated and backward pass on sub( B ), find the * virtual process p owning the last row or column of sub( B ). */ if( !( BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) ) ) && !Bfwd ) { tmp = PB_Cindxg2p( BnR-1, Binb1R, BnbR, BrocR, BrocR, BnprocsR ); p = MModSub( tmp, BrocR, BnprocsR ); } /* * Loop over the processes rows or columns owning the BnR rows or columns of * sub( B ) to be processed. */ n = BnR; while( n > 0 ) { /* * Find out who is the active process row or column as well as the number of * rows or columns of sub( B ) it owns. */ BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) ); BnpR = PB_Cnumroc( BnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR ); n -= BnpR; /* * Re-adjust the number of rows or columns to be handled at each step, in order * to average the message sizes and the computational granularity. */ if( BnpR ) nbb = BnpR / ( ( BnpR - 1 ) / nb + 1 ); while( BnpR ) { nbb = MIN( nbb, BnpR ); /* * Describe the local contiguous panel of sub( B ) */ if( lside ) { PB_Cdescset( DBUFB, BnD, nbb, Binb1D, nbb, BnbD, BnbR, BrocD, BcurrocR, ctxt, Bld ); if( BisR || ( BmyprocR == BcurrocR ) ) Bptr = Mptr( B, BiiD, BiiR, Bld, size ); } else { PB_Cdescset( DBUFB, nbb, BnD, nbb, Binb1D, BnbR, BnbD, BcurrocR, BrocD, ctxt, Bld ); if( BisR || ( BmyprocR == BcurrocR ) ) Bptr = Mptr( B, BiiR, BiiD, Bld, size ); } /* * Replicate this panel in the process rows or columns spanned by sub( A ): WB */ PB_CInV( TYPE, NOCONJG, &WBroc, An, An, Ad0, nbb, Bptr, 0, 0, DBUFB, &Broc, &WB, WBd, &WBfr ); /* * Reuse sub( B ) and/or create vector WC in process columns or rows spanned by * sub( A ) */ PB_CInOutV( TYPE, &WCroc, An, An, Ad0, nbb, one, Bptr, 0, 0, DBUFB, &Broc, &tbeta, &WC, WCd, &WCfr, &WCsum, &WCpbY ); /* * When the input data is first transposed, zero it now for later accumulation */ if( notran ) PB_Cplapad( TYPE, ALL, NOCONJG, DBUFB[M_], DBUFB[N_], zero, zero, Bptr, 0, 0, DBUFB ); /* * Local matrix-matrix multiply iff I own some data */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_]; Anb = Ad0[NB_]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Amp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol ); WCld = WCd[LLD_]; if( ( Amp > 0 ) && ( Anq > 0 ) ) { WBld = WBd[LLD_]; if( upper ) { /* * sub( A ) is upper triangular */ if( LNorRT ) { for( l = 0; l < An; l += Alcmb ) { lb = An - l; lb = MIN( lb, Alcmb ); Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow ); Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol ); if( Alp > 0 ) { Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol ); gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &Alp, &nbb, &Alq0, talpha, Mptr( Aptr, 0, Alq, Ald, size ), &Ald, Mptr( WB, 0, Alq, WBld, size ), &WBld, one, WC, &WCld ); } PB_Cptrm( TYPE, TYPE, SIDE, UPLO, TRANSA, DIAG, lb, nbb, talpha, Aptr, l, l, Ad0, Mptr( WB, 0, Alq, WBld, size ), WBld, Mptr( WC, Alp, 0, WCld, size ), WCld, PB_Ctztrmm ); } } else { for( l = 0; l < An; l += Alcmb ) { lb = An - l; lb = MIN( lb, Alcmb ); Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow ); Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol ); Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol ); if( Alq0 > 0 ) gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &nbb, &Alq0, &Alp, talpha, WB, &WBld, Mptr( Aptr, 0, Alq, Ald, size ), &Ald, one, Mptr( WC, 0, Alq, WCld, size ), &WCld ); PB_Cptrm( TYPE, TYPE, SIDE, UPLO, TRANSA, DIAG, lb, nbb, talpha, Aptr, l, l, Ad0, Mptr( WB, Alp, 0, WBld, size ), WBld, Mptr( WC, 0, Alq, WCld, size ), WCld, PB_Ctztrmm ); } } } else { /* * sub( A ) is lower triangular */ if( LNorRT ) { for( l = 0; l < An; l += Alcmb ) { lb = An - l; ltmp = l + ( lb = MIN( lb, Alcmb ) ); Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow ); Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( TYPE, TYPE, SIDE, UPLO, TRANSA, DIAG, lb, nbb, talpha, Aptr, l, l, Ad0, Mptr( WB, 0, Alq, WBld, size ), WBld, Mptr( WC, Alp, 0, WCld, size ), WCld, PB_Ctztrmm ); Alp = PB_Cnumroc( ltmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Alp0 = Amp - Alp; Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol ); if( Alp0 > 0 ) gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &Alp0, &nbb, &Alq0, talpha, Mptr( Aptr, Alp, Alq, Ald, size ), &Ald, Mptr( WB, 0, Alq, WBld, size ), &WBld, one, Mptr( WC, Alp, 0, WCld, size ), &WCld ); } } else { for( l = 0; l < An; l += Alcmb ) { lb = An - l; ltmp = l + ( lb = MIN( lb, Alcmb ) ); Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow ); Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrm( TYPE, TYPE, SIDE, UPLO, TRANSA, DIAG, lb, nbb, talpha, Aptr, l, l, Ad0, Mptr( WB, Alp, 0, WBld, size ), WBld, Mptr( WC, 0, Alq, WCld, size ), WCld, PB_Ctztrmm ); Alp = PB_Cnumroc( ltmp, 0, Aimb1, Amb, myrow, Arow, nprow ); Alp0 = Amp - Alp; Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol ); if( Alq0 > 0 ) gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &nbb, &Alq0, &Alp0, talpha, Mptr( WB, Alp, 0, WBld, size ), &WBld, Mptr( Aptr, Alp, Alq, Ald, size ), &Ald, one, Mptr( WC, 0, Alq, WCld, size ), &WCld ); } } } } if( WBfr ) free( WB ); if( LNorRT ) { /* * Combine the partial column results into WC */ if( WCsum && ( Amp > 0 ) ) gsum2d( ctxt, ROW, &top, Amp, nbb, WC, WCld, myrow, WCd[CSRC_] ); /* * sub( B ) := WC (if necessary) */ if( WCpbY ) PB_Cpaxpby( TYPE, &conjg, An, nbb, one, WC, 0, 0, WCd, &WCroc, zero, Bptr, 0, 0, DBUFB, &Broc ); } else { /* * Combine the partial row results into WC */ if( WCsum && ( Anq > 0 ) ) gsum2d( ctxt, COLUMN, &top, nbb, Anq, WC, WCld, WCd[RSRC_], mycol ); /* * sub( B ) := WC (if necessary) */ if( WCpbY ) PB_Cpaxpby( TYPE, &conjg, nbb, An, one, WC, 0, 0, WCd, &WCroc, zero, Bptr, 0, 0, DBUFB, &Broc ); } if( WCfr ) free( WC ); /* * Go to the next contiguous panel if any residing in this process row or column */ BnpR -= nbb; if( BisR || ( BmyprocR == BcurrocR ) ) BiiR += nbb; } /* * Go to next or previous process row or column owning some of sub( B ) */ if( !BisR ) p = ( Bfwd ? MModAdd1( p, BnprocsR ) : MModSub1( p, BnprocsR ) ); } if( TranOp == CCOTRAN ) free( talpha ); /* * End of PB_CptrmmB */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cptrsm.c000644 000766 000024 00000106640 10363532303 020223 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cptrsm( PBTYP_T * TYPE, int FBCAST, char * SIDE, char * UPLO, char * TRANS, char * DIAG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * BC, int LDBC, char * BR, int LDBR ) #else void PB_Cptrsm( TYPE, FBCAST, SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, IA, JA, DESCA, BC, LDBC, BR, LDBR ) /* * .. Scalar Arguments .. */ char * ALPHA, * DIAG, * SIDE, * TRANS, * UPLO; int FBCAST, IA, JA, LDBC, LDBR, M, N; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA; char * A, * BC, * BR; #endif { /* * Purpose * ======= * * PB_Cptrsm solves one of the matrix equations * * op( sub( A ) ) * X = B, or X * op( sub( A ) ) = alpha * B, * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R'. * * X and B are m by n submatrices, sub( A ) is a unit, or non-unit, * upper or lower triangular submatrix and op( Y ) is one of * * op( Y ) = Y or op( Y ) = Y' or op( Y ) = conjg( Y' ). * * The submatrix X is overwritten on B. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * FBCAST (global input) INTEGER * On entry, FBCAST specifies whether the transposed of the vec- * tor solution should be broadcast or not when there is a pos- * sible ambiguity, i.e. when sub( A ) is just one block. When * FBCAST is zero, the solution vector is not broadcast, and the * the solution vector is broadcast otherwise. * * SIDE (global input) pointer to CHAR * On entry, SIDE specifies whether op( sub( A ) ) appears on * the left or right of X as follows: * * SIDE = 'L' or 'l' op( sub( A ) ) * X = B, * * SIDE = 'R' or 'r' X * op( sub( A ) ) = B. * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) pointer to CHAR * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( A ) * X = B, * * TRANS = 'T' or 't' sub( A )' * X = B, * * TRANS = 'C' or 'c' conjg( sub( A )' ) * X = B. * * DIAG (global input) pointer to CHAR * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix B. * M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * B. N must be at least zero. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 0, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 0, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * BC (local input/local output) pointer to CHAR * On entry, BC is an array of dimension (LDBC,Kbc), where Kbc * is at least N when SIDE is 'L' or 'l' and at least M other- * wise. Before entry, when SIDE is 'L' or 'l' and TRANS is 'N' * or 'n' or SIDE is 'R' or 'r' and TRANS is not 'N' or 'n', * this array contains the local entries of the right-hand-side * matrix B. Otherwise, the entries of BC should be zero. On * exit, this array contains the partial solution matrix X. * * LDBC (local input) INTEGER * On entry, LDBC specifies the leading dimension of the array * BC. LDBC must be at least MAX( 1, Lr( IA, M ) ) when SIDE * is 'L' or 'l' and at least MAX( 1, Lr( IA, N ) ) otherwise. * * BR (local input/local output) pointer to CHAR * On entry, BR is an array of dimension (LDBR,Kbr), where Kbr * is at least Lc( JA, M ) when SIDE is 'L' or 'l' and at least * Lc( JA, N ) otherwise. Before entry, when SIDE is 'L' or 'l' * and TRANS is 'N' or 'n' or SIDE is 'R' or 'r' and TRANS is * not 'N' or 'n', the entries of BR should be zero. Otherwise, * this array contains the local entries of the right-hand-side * matrix B. On exit, this array contains the partial solution * matrix X. * * LDBR (local input) INTEGER * On entry, LDBR specifies the leading dimension of the array * BR. LDBR must be at least MAX( 1, N ) when SIDE is 'L' or 'l' * and at least MAX( 1, M ) otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char btop, * negone, * one, * talpha1, * talpha2, * zero; int Acol, Aii, Aimb1, Ainb1, Ais1Col, Ais1Row, AisColRep, AisRowRep, Ajj, Alcol, Ald, Alrow, Amb, Anpprev, Anb, Anp, Anq, Arow, Asrc, ChangeRoc=0, LNorRT, Na, Nb, bcst, ctxt, izero=0, k=0, kb, kbprev=0, kbsize, lside, mb1, mycol, myrow, n1, n1last, n1p, n1pprev=0, nb1, nlast, notran, npcol, nprow, rocprev, size, tmp1, tmp2; MMADD_T add, tadd; TZPAD_T pad; GEMM_T gemm; TRSM_T trsm; GESD2D_T send; GERV2D_T recv; GEBS2D_T bsend; GEBR2D_T brecv; /* * .. Local Arrays .. */ char * Aprev = NULL, * Bd = NULL, * Bdprev = NULL, * Bprev = NULL, * work = NULL; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); lside = ( Mupcase( SIDE [0] ) == CLEFT ); notran = ( Mupcase( TRANS[0] ) == CNOTRAN ); LNorRT = ( lside && notran ) || ( !( lside ) && !( notran ) ); if( LNorRT ) { Na = M; Nb = N; } else { Na = N; Nb = M; } /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); /* * Determine if sub( A ) spans more than one process row, and/or more than one * process column. */ Amb = DESCA[MB_]; Anb = DESCA[NB_]; Ald = DESCA[LLD_ ]; Aimb1 = PB_Cfirstnb( Na, IA, DESCA[IMB_], Amb ); Anp = PB_Cnumroc( Na, 0, Aimb1, Amb, myrow, Arow, nprow ); Ais1Row = !( PB_Cspan( Na, 0, Aimb1, Amb, Arow, nprow ) ); Ainb1 = PB_Cfirstnb( Na, JA, DESCA[INB_], Anb ); Anq = PB_Cnumroc( Na, 0, Ainb1, Anb, mycol, Acol, npcol ); Ais1Col = !( PB_Cspan( Na, 0, Ainb1, Anb, Acol, npcol ) ); /* * When sub( A ) spans only one process, solve the system locally and return. */ if( Ais1Row && Ais1Col ) { if( LNorRT ) { if( Anq > 0 ) { if( Anp > 0 ) { TYPE->Ftrsm( C2F_CHAR( ( notran ? SIDE : ( lside ? RIGHT : LEFT ) ) ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, TYPE->size ), &Ald, BC, &LDBC ); TYPE->Fmmtadd( &M, &N, TYPE->one, BC, &LDBC, TYPE->zero, BR, &LDBR ); } if( ( Arow >= 0 ) && FBCAST ) { btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == Arow ) TYPE->Cgebs2d( ctxt, COLUMN, &btop, N, M, BR, LDBR ); else TYPE->Cgebr2d( ctxt, COLUMN, &btop, N, M, BR, LDBR, Arow, mycol ); } } } else { if( Anp > 0 ) { if( Anq > 0 ) { TYPE->Ftrsm( C2F_CHAR( ( notran ? SIDE : ( lside ? RIGHT : LEFT ) ) ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &M, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, TYPE->size ), &Ald, BR, &LDBR ); TYPE->Fmmtadd( &M, &N, TYPE->one, BR, &LDBR, TYPE->zero, BC, &LDBC ); } if( ( Acol >= 0 ) && FBCAST ) { btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == Acol ) TYPE->Cgebs2d( ctxt, ROW, &btop, N, M, BC, LDBC ); else TYPE->Cgebr2d( ctxt, ROW, &btop, N, M, BC, LDBC, myrow, Acol ); } } } return; } /* * Retrieve from TYPE structure useful BLAS and BLACS functions. */ size = TYPE->size; negone = TYPE->negone; one = TYPE->one; zero = TYPE->zero; add = TYPE->Fmmadd; tadd = TYPE->Fmmtadd; pad = TYPE->Ftzpad; gemm = TYPE->Fgemm; trsm = TYPE->Ftrsm; send = TYPE->Cgesd2d; recv = TYPE->Cgerv2d; bsend = TYPE->Cgebs2d; brecv = TYPE->Cgebr2d; if( ( Anp > 0 ) && ( Anq > 0 ) ) A = Mptr( A, Aii, Ajj, Ald, size ); if( LNorRT ) { /* * Left - No tran or Right - (co)Trans */ if( ( Anq <= 0 ) || ( Ais1Row && ( ( Arow >= 0 ) && !( FBCAST ) && ( myrow != Arow ) ) ) ) return; btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); bcst = ( ( !Ais1Row ) || ( Ais1Row && ( Arow >= 0 ) && FBCAST ) ); AisRowRep = ( ( Arow < 0 ) || ( nprow == 1 ) ); if( Mupcase( UPLO[0] ) == CUPPER ) { /* * Initiate lookahead */ nlast = ( npcol - 1 ) * Anb; n1 = MAX( nlast, Anb ); nlast += Ainb1; n1last = n1 - Anb + MAX( Ainb1, Anb ); work = PB_Cmalloc( Nb * MIN( n1last, Anp ) * size ); tmp1 = Na-1; Alrow = PB_Cindxg2p( tmp1, Aimb1, Amb, Arow, Arow, nprow ); Alcol = PB_Cindxg2p( tmp1, Ainb1, Anb, Acol, Acol, npcol ); rocprev = Alcol; Anpprev = Anp; Bprev = BC; Bdprev = BR; Aprev = A = Mptr( A, 0, Anq, Ald, size ); mb1 = PB_Clastnb( Na, 0, Aimb1, Amb ); nb1 = PB_Clastnb( Na, 0, Ainb1, Anb ); tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); n1 = ( ( Ais1Col || ( Na - nb1 < nlast ) ) ? n1last : n1 ); tmp2 = n1 + nb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) ); Asrc = Arow; n1p = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Aimb1, Amb, myrow, Asrc, nprow ); talpha1 = talpha2 = ( ( Ais1Col || ( mycol == Alcol ) ) ? ALPHA : one ); while( Na > 0 ) { kbsize = kb * size; if( Ais1Col || ( mycol == Alcol ) ) { A -= Ald*kbsize; Anq -= kb; Bd = Mptr( BR, 0, Anq, LDBR, size ); } if( ( Arow < 0 ) || ( myrow == Alrow ) ) { Anp -= kb; } /* * Partial update of previous block */ if( n1pprev > 0 ) { if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) ) { tmp1 = ( Anpprev - n1pprev ) * size; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &n1pprev, &Nb, &kbprev, negone, Aprev+tmp1, &Ald, Bdprev, &LDBR, talpha1, Bprev+tmp1, &LDBC ); } /* * Send partial updated result to current column */ if( !( Ais1Col ) && ChangeRoc ) { if( mycol == rocprev ) { send( ctxt, n1pprev, Nb, Mptr( Bprev, Anpprev-n1pprev, 0, LDBC, size ), LDBC, myrow, Alcol ); } else if( mycol == Alcol ) { recv( ctxt, n1pprev, Nb, work, n1pprev, myrow, rocprev ); add( &n1pprev, &Nb, one, work, &n1pprev, one, Mptr( Bprev, Anpprev-n1pprev, 0, LDBC, size ), &LDBC ); } } } /* * Solve current diagonal block */ if( Ais1Col || ( mycol == Alcol ) ) { if( AisRowRep || ( myrow == Alrow ) ) { trsm( C2F_CHAR( LEFT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &kb, &Nb, talpha2, Mptr( A, Anp, 0, Ald, size ), &Ald, Mptr( BC, Anp, 0, LDBC, size ), &LDBC ); tadd( &kb, &Nb, one, Mptr( BC, Anp, 0, LDBC, size ), &LDBC, zero, Mptr( BR, 0, Anq, LDBR, size ), &LDBR ); } if( bcst ) { if( myrow == Alrow ) bsend( ctxt, COLUMN, &btop, Nb, kb, Mptr( BR, 0, Anq, LDBR, size ), LDBR ); else brecv( ctxt, COLUMN, &btop, Nb, kb, Mptr( BR, 0, Anq, LDBR, size ), LDBR, Alrow, mycol ); } talpha2 = one; } else { if( !( Ais1Col ) && ( AisRowRep || ( myrow == Alrow ) ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kb, &Nb, &izero, zero, zero, Mptr( BC, Anp, 0, LDBC, size ), &LDBC ); } /* * Finish previous update */ if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) ) { if( ( tmp1 = Anpprev - n1pprev ) > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &tmp1, &Nb, &kbprev, negone, Aprev, &Ald, Bdprev, &LDBR, talpha1, Bprev, &LDBC ); talpha1 = one; } /* * Save info of current step and update info for the next step */ if( Ais1Col || ( mycol == Alcol ) ) { Bdprev = Bd; Aprev = A; } if( AisRowRep || ( myrow == Alrow ) ) { Anpprev -= kb; } n1pprev = n1p; rocprev = Alcol; kbprev = kb; k += kb; Na -= kb; mb1 -= kb; if( mb1 == 0 ) { if( !( Ais1Row ) && ( Alrow >= 0 ) ) Alrow = MModSub1( Alrow, nprow ); mb1 = ( Na > Aimb1 ? Amb : Aimb1 ); } nb1 -= kb; ChangeRoc = ( nb1 == 0 ); if( ChangeRoc ) { if( !( Ais1Col ) && ( Alcol >= 0 ) ) Alcol = MModSub1( Alcol, npcol ); nb1 = ( Na > Ainb1 ? Anb : Ainb1 ); } tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); n1 = ( ( Ais1Col || ( Na-nb1 < nlast ) ) ? n1last : n1 ); tmp2 = n1 + nb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) ); n1p = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Aimb1, Amb, myrow, Asrc, nprow ); } } else { /* * Initiate lookahead */ n1 = ( MAX( npcol, 2 ) - 1 ) * Anb; work = PB_Cmalloc( Nb*MIN( n1, Anp )*size ); Aprev = A; Bprev = BC, Bdprev = BR; Anpprev = Anp; mb1 = Aimb1; nb1 = Ainb1; rocprev = Acol; tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + nb1 - kb; Asrc = Arow; n1p = PB_Cnumroc( MIN( tmp1, tmp2 ), kb, Aimb1, Amb, myrow, Asrc, nprow ); talpha1 = talpha2 = ( ( Ais1Col || ( mycol == Acol ) ) ? ALPHA : one ); while( kb > 0 ) { kbsize = kb * size; /* * Partial update of previous block */ if( n1pprev > 0 ) { if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &n1pprev, &Nb, &kbprev, negone, Aprev, &Ald, Bdprev, &LDBR, talpha1, Bprev, &LDBC ); /* * Send partial updated result to current column */ if( !( Ais1Col ) && ChangeRoc ) { if( mycol == rocprev ) { send( ctxt, n1pprev, Nb, Bprev, LDBC, myrow, Acol ); } else if( mycol == Acol ) { recv( ctxt, n1pprev, Nb, work, n1pprev, myrow, rocprev ); add( &n1pprev, &Nb, one, work, &n1pprev, one, Bprev, &LDBC ); } } } /* * Solve current diagonal block */ if( Ais1Col || ( mycol == Acol ) ) { if( AisRowRep || ( myrow == Arow ) ) { trsm( C2F_CHAR( LEFT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &kb, &Nb, talpha2, A, &Ald, BC, &LDBC ); tadd( &kb, &Nb, one, BC, &LDBC, zero, BR, &LDBR ); } if( bcst ) { if( myrow == Arow ) bsend( ctxt, COLUMN, &btop, Nb, kb, BR, LDBR ); else brecv( ctxt, COLUMN, &btop, Nb, kb, BR, LDBR, Arow, mycol ); } talpha2 = one; } else { if( !( Ais1Col ) && ( AisRowRep || ( myrow == Arow ) ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kb, &Nb, &izero, zero, zero, BC, &LDBC ); } /* * Finish previous update */ if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) ) { if( ( tmp1 = Anpprev - n1pprev ) > 0 ) { tmp2 = n1pprev * size; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &tmp1, &Nb, &kbprev, negone, Aprev+tmp2, &Ald, Bdprev, &LDBR, talpha1, Bprev+tmp2, &LDBC ); } Aprev += Ald * kbprev * size; talpha1 = one; } /* * Save info of current step and update info for the next step */ if( Ais1Col || ( mycol == Acol ) ) { A += Ald*kbsize; Bdprev = Bd = BR; BR += LDBR*kbsize; } if( AisRowRep || ( myrow == Arow ) ) { Bprev = ( BC += kbsize ); A += kbsize; Aprev += kbsize; Anpprev = ( Anp -= kb ); } n1pprev = n1p; rocprev = Acol; kbprev = kb; k += kb; Na -= kb; mb1 -= kb; if( mb1 == 0 ) { if( !( Ais1Row ) && ( Arow >= 0 ) ) Arow = MModAdd1( Arow, nprow ); mb1 = MIN( Amb, Na ); } nb1 -= kb; ChangeRoc = ( nb1 == 0 ); if( ChangeRoc ) { if( !( Ais1Col ) && ( Acol >= 0 ) ) Acol = MModAdd1( Acol, npcol ); nb1 = MIN( Anb, Na ); } tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + nb1 - kb; n1p = PB_Cnumroc( MIN( tmp2, tmp1 ), k + kb, Aimb1, Amb, myrow, Asrc, nprow ); } } } else { /* * Right - No tran or Left - (co)Trans */ if( ( Anp <= 0 ) || ( Ais1Col && ( ( Acol >= 0 ) && !( FBCAST ) && ( mycol != Acol ) ) ) ) return; btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); bcst = ( ( !Ais1Col ) || ( Ais1Col && ( Acol >= 0 ) && FBCAST ) ); AisColRep = ( ( Acol < 0 ) || ( npcol == 1 ) ); if( Mupcase( UPLO[0] ) == CUPPER ) { /* * Initiate lookahead */ n1 = ( MAX( nprow, 2 ) - 1 ) * Amb; work = PB_Cmalloc( Nb*MIN( n1, Anq )*size ); Aprev = A; Bprev = BR, Bdprev = BC; Anpprev = Anq; mb1 = Aimb1; nb1 = Ainb1; rocprev = Arow; tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + mb1 - kb; Asrc = Acol; n1p = PB_Cnumroc( MIN( tmp1, tmp2 ), kb, Ainb1, Anb, mycol, Asrc, npcol ); talpha1 = talpha2 = ( ( Ais1Row || ( myrow == Arow ) ) ? ALPHA : one ); while( kb > 0 ) { kbsize = kb * size; /* * Partial update of previous block */ if( n1pprev > 0 ) { if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) ) gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &n1pprev, &kbprev, negone, Bdprev, &LDBC, Aprev, &Ald, talpha1, Bprev, &LDBR ); /* * Send partial updated result to current row */ if( !( Ais1Row ) && ChangeRoc ) { if( myrow == rocprev ) { send( ctxt, Nb, n1pprev, Bprev, LDBR, Arow, mycol ); } else if( myrow == Arow ) { recv( ctxt, Nb, n1pprev, work, Nb, rocprev, mycol ); add( &Nb, &n1pprev, one, work, &Nb, one, Bprev, &LDBR ); } } } /* * Solve current diagonal block */ if( Ais1Row || ( myrow == Arow ) ) { if( AisColRep || ( mycol == Acol ) ) { trsm( C2F_CHAR( RIGHT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &Nb, &kb, talpha2, A, &Ald, BR, &LDBR ); tadd( &Nb, &kb, one, BR, &LDBR, zero, BC, &LDBC ); } if( bcst ) { if( mycol == Acol ) bsend( ctxt, ROW, &btop, kb, Nb, BC, LDBC ); else brecv( ctxt, ROW, &btop, kb, Nb, BC, LDBC, myrow, Acol ); } talpha2 = one; } else { if( !( Ais1Row ) && ( AisColRep || ( mycol == Acol ) ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Nb, &kb, &izero, zero, zero, BR, &LDBR ); } /* * Finish previous update */ if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) ) { if( ( tmp1 = Anpprev - n1pprev ) > 0 ) { tmp2 = n1pprev * size; gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &tmp1, &kbprev, negone, Bdprev, &LDBC, Aprev+Ald*tmp2, &Ald, talpha1, Bprev+LDBR*tmp2, &LDBR ); } Aprev += kbprev * size; talpha1 = one; } /* * Save info of current step and update info for the next step */ if( Ais1Row || ( myrow == Arow ) ) { A += kbsize; Bdprev = Bd = BC; BC += kbsize; } if( AisColRep || ( mycol == Acol ) ) { Bprev = ( BR += LDBR * kbsize ); A += Ald * kbsize; Anpprev = ( Anq -= kb ); Aprev += Ald * kbsize; } n1pprev = n1p; rocprev = Arow; kbprev = kb; k += kb; Na -= kb; nb1 -= kb; if( nb1 == 0 ) { if( !( Ais1Col ) && ( Acol >= 0 ) ) Acol = MModAdd1( Acol, npcol ); nb1 = MIN( Anb, Na ); } mb1 -= kb; ChangeRoc = ( mb1 == 0 ); if( ChangeRoc ) { if( !( Ais1Row ) && ( Arow >= 0 ) ) Arow = MModAdd1( Arow, nprow ); mb1 = MIN( Amb, Na ); } tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + mb1 - kb; n1p = PB_Cnumroc( MIN( tmp2, tmp1 ), k + kb, Ainb1, Anb, mycol, Asrc, npcol ); } } else { /* * Initiate lookahead */ nlast = ( nprow - 1 ) * Amb; n1 = MAX( nlast, Amb ); nlast += Aimb1; n1last = n1 - Amb + MAX( Aimb1, Amb ); work = PB_Cmalloc( Nb * MIN( n1last, Anq ) * size ); tmp1 = Na-1; Alrow = PB_Cindxg2p( tmp1, Aimb1, Amb, Arow, Arow, nprow ); Alcol = PB_Cindxg2p( tmp1, Ainb1, Anb, Acol, Acol, npcol ); rocprev = Alrow; Anpprev = Anq; Bprev = BR; Bdprev = BC; Aprev = A = Mptr( A, Anp, 0, Ald, size ); mb1 = PB_Clastnb( Na, 0, Aimb1, Amb ); nb1 = PB_Clastnb( Na, 0, Ainb1, Anb ); tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); n1 = ( ( Ais1Row || ( Na-mb1 < nlast ) ) ? n1last : n1 ); tmp2 = n1 + mb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) ); Asrc = Acol; n1p = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Ainb1, Anb, mycol, Asrc, npcol ); talpha1 = talpha2 = ( ( Ais1Row || ( myrow == Alrow ) ) ? ALPHA : one ); while( Na > 0 ) { kbsize = kb * size; if( Ais1Row || ( myrow == Alrow ) ) { A -= kbsize; Anp -= kb; Bd = Mptr( BC, Anp, 0, LDBC, size ); } if( ( Acol < 0 ) || ( mycol == Alcol ) ) { Anq -= kb; } /* * Partial update of previous block */ if( n1pprev > 0 ) { if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) ) { tmp1 = ( Anpprev - n1pprev ) * size; TYPE->Fgemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &n1pprev, &kbprev, negone, Bdprev, &LDBC, Aprev+Ald*tmp1, &Ald, talpha1, Bprev+LDBR*tmp1, &LDBR ); } /* * Send partial updated result to current row */ if( !( Ais1Row ) && ChangeRoc ) { if( myrow == rocprev ) { send( ctxt, Nb, n1pprev, Mptr( Bprev, 0, Anpprev-n1pprev, LDBR, size ), LDBR, Alrow, mycol ); } else if( myrow == Alrow ) { recv( ctxt, Nb, n1pprev, work, Nb, rocprev, mycol ); add( &Nb, &n1pprev, one, work, &Nb, one, Mptr( Bprev, 0, Anpprev-n1pprev, LDBR, size ), &LDBR ); } } } /* * Solve current diagonal block */ if( Ais1Row || ( myrow == Alrow ) ) { if( AisColRep || ( mycol == Alcol ) ) { trsm( C2F_CHAR( RIGHT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &Nb, &kb, talpha2, Mptr( A, 0, Anq, Ald, size ), &Ald, Mptr( BR, 0, Anq, LDBR, size ), &LDBR ); tadd( &Nb, &kb, one, Mptr( BR, 0, Anq, LDBR, size ), &LDBR, zero, Mptr( BC, Anp, 0, LDBC, size ), &LDBC ); } if( bcst ) { if( mycol == Alcol ) bsend( ctxt, ROW, &btop, kb, Nb, Mptr( BC, Anp, 0, LDBC, size ), LDBC ); else brecv( ctxt, ROW, &btop, kb, Nb, Mptr( BC, Anp, 0, LDBC, size ), LDBC, myrow, Alcol ); } talpha2 = one; } else { if( !( Ais1Row ) && ( AisColRep || ( mycol == Alcol ) ) ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Nb, &kb, &izero, zero, zero, Mptr( BR, 0, Anq, LDBR, size ), &LDBR ); } /* * Finish previous update */ if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) ) { if( ( tmp1 = Anpprev - n1pprev ) > 0 ) gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &tmp1, &kbprev, negone, Bdprev, &LDBC, Aprev, &Ald, talpha1, Bprev, &LDBR ); talpha1 = one; } /* * Save info of current step and update info for the next step */ if( Ais1Row || ( myrow == Alrow ) ) { Bdprev = Bd; Aprev = A; } if( AisColRep || ( mycol == Alcol ) ) { Anpprev -= kb; } n1pprev = n1p; rocprev = Alrow; kbprev = kb; k += kb; Na -= kb; nb1 -= kb; if( nb1 == 0 ) { if( !( Ais1Col ) && ( Alcol >= 0 ) ) Alcol = MModSub1( Alcol, npcol ); nb1 = ( Na > Ainb1 ? Anb : Ainb1 ); } mb1 -= kb; ChangeRoc = ( mb1 == 0 ); if( ChangeRoc ) { if( !( Ais1Row ) && ( Alrow >= 0 ) ) Alrow = MModSub1( Alrow, nprow ); mb1 = ( Na > Aimb1 ? Amb : Aimb1 ); } tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); n1 = ( ( Ais1Row || ( Na-mb1 < nlast ) ) ? n1last : n1 ); tmp2 = n1 + mb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) ); n1p = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Ainb1, Anb, mycol, Asrc, npcol ); } } } if( work ) free( work ); /* * End of PB_Cptrsm */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CptrsmAB.c000644 000766 000024 00000133210 10363532303 020417 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CptrsmAB( PBTYP_T * TYPE, char * VARIANT, char * SIDE, char * UPLO, char * TRANSA, char * DIAG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB ) #else void PB_CptrsmAB( TYPE, VARIANT, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ char * DIAG, * SIDE, * TRANSA, * UPLO, * VARIANT; int IA, IB, JA, JB, M, N; char * ALPHA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * B; #endif { /* * Purpose * ======= * * PB_CptrsmAB solves one of the matrix equations * * op( sub( A ) )*X = alpha*sub( B ), or * * X*op( sub( A ) ) = alpha*sub( B ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is * a unit, or non-unit, upper or lower triangular submatrix and op( Y ) * is one of * * op( Y ) = Y or op( Y ) = Y' or op( Y ) = conjg( Y' ). * * The submatrix X is overwritten on sub( B ). * * This is the outer-product algorithm using the logical aggregation * blocking technique. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * VARIANT (global input) pointer to CHAR * On entry, VARIANT specifies whether the left- or right-loo- * king variant of the algorithm should be used for the transpo- * se cases only, that is TRANSA is not 'N' or 'n'. When VARIANT * is 'L' or 'l', the left-looking variant is used, otherwise * the right-looking algorithm is selected. * * SIDE (global input) pointer to CHAR * On entry, SIDE specifies whether op( sub( A ) ) appears on * the left or right of X as follows: * * SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ), * * SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) pointer to CHAR * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * DIAG (global input) pointer to CHAR * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n solution submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char conjg, * negone, * one, * talph, top, * zero; int Afr, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii, Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq, Bnq0, Brow, WAfr, WAoff, WBfr, WBsum, ctxt, k, kb, kbb, kmax, ktmp, lside, mn, mycol, myrow, notran, npcol, nprow, size, upper; GEMM_T gemm; GSUM2D_T gsum2d; /* * .. Local Arrays .. */ int Bd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd[DLEN_], WBd[DLEN_]; char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * WA = NULL, * WB = NULL; /* .. * .. Executable Statements .. * */ lside = ( Mupcase( SIDE [0] ) == CLEFT ); upper = ( Mupcase( UPLO [0] ) == CUPPER ); notran = ( Mupcase( TRANSA[0] ) == CNOTRAN ); size = TYPE->size; negone = TYPE->negone; one = TYPE->one; zero = TYPE->zero; gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm; talph = ALPHA; kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol, Bld ... */ Bimb = DESCB[IMB_]; Binb = DESCB[INB_]; Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb ); Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow ); Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb ); Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size ); if( notran ) { if( lside ) { if( upper ) { kmax = ( ( M - 1 ) / kb ) * kb; for( k = kmax; k >= 0; k -= kb ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+ktmp-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, BACKWARD, ktmp, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A(IA:IA+ktmp-1, JA+k:JA+k+kbb-1) over B(IB:IB+ktmp-1, JB:JB+N-1) */ PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Solve B( IB+k:IB+ktmp-1, JB:JB+N-1 ) with talph */ PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, kbb, N, talph, WA, k, 0, WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB, &Bfr ); /* * Update B( IB:IB+k-1, JB:JB+N-1 ) */ if( k > 0 ) { /* * Replicate B( IB+k:IB+ktmp-1, JB:JB+N-1 ) over B( IB:IB+k-1, JB:JB+N-1 ) */ PB_Cdescset( Bd0, k, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, k, N, Bd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * Local update */ Bmp = PB_Cnumroc( k, 0, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0, &kbb, negone, WA, &WAd[LLD_], WB, &WBd[LLD_], talph, Bptr0, &Bld ); if( WBfr ) free( WB ); talph = one; } if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } else { for( k = 0; k < M; k += kb ) { ktmp = M - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, FORWARD, ktmp, kbb, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over B( IB+k:IB+M-1, JB:JB+N-1 ) */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Solve B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) with talph */ PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, kbb, N, talph, WA, 0, 0, WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB, &Bfr ); /* * Update B( IB+k+kbb:IB+M-1, JB:JB+N-1 ) */ if( ( ktmp = ktmp - kbb ) > 0 ) { /* * Replicate B(IB+k:IB+k+kbb-1, JB:JB+N-1) over B(IB+k+kbb:IB+M-1, JB:JB+N-1) */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k+kbb, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k+kbb, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * Local update */ Bmp = PB_Cnumroc( ktmp, k+kbb, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) { WAoff = PB_Cnumroc( kbb, 0, WAd[IMB_], WAd[MB_], myrow, WAd[RSRC_], nprow ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0, &kbb, negone, Mptr( WA, WAoff, 0, WAd[LLD_], size ), &WAd[LLD_], WB, &WBd[LLD_], talph, Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld ); } if( WBfr ) free( WB ); talph = one; } if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } } else { if( upper ) { for( k = 0; k < N; k += kb ) { ktmp = N - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) */ PB_CGatherV( TYPE, REUSE, FORWARD, kbb, ktmp, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over B( IB:IB+M-1, JB+k:JB+N-1 ) */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Solve B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) with talph */ PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, M, kbb, talph, WA, 0, 0, WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB, &Bfr ); /* * Update B( IB:IB+M-1, JB+k+kbb:JB+N-1 ) */ if( ( ktmp = ktmp - kbb ) > 0 ) { /* * Replicate B(IB:IB+M-1, JB+k:JB+k+kbb-1) over B(IB:IB+M-1, JB+k+kbb:JB+N-1) */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k+kbb, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k+kbb, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * Local update */ Bnq = PB_Cnumroc( ktmp, k+kbb, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) { WAoff = PB_Cnumroc( kbb, 0, WAd[INB_], WAd[NB_], mycol, WAd[CSRC_], npcol ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq, &kbb, negone, WB, &WBd[LLD_], Mptr( WA, 0, WAoff, WAd[LLD_], size ), &WAd[LLD_], talph, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld ); } if( WBfr ) free( WB ); talph = one; } if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } else { kmax = ( ( N - 1 ) / kb ) * kb; for( k = kmax; k >= 0; k -= kb ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 ) */ PB_CGatherV( TYPE, REUSE, BACKWARD, kbb, ktmp, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 ) over B(IB:IB+M-1, JB:JB+ktmp-1) */ PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Solve B( IB:IB+M-1, JB+k:JB+ktmp-1 ) with talph */ PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, M, kbb, talph, WA, 0, k, WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB, &Bfr ); /* * Update B( IB:IB+M-1, JB:JB+k-1 ) */ if( k > 0 ) { /* * Replicate B( IB:IB+M-1, JB+k:JB+ktmp-1 ) over B( IB:IB+M-1, JB:JB+k-1 ) */ PB_Cdescset( Bd0, M, k, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, M, k, Bd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * Local update */ Bnq = PB_Cnumroc( k, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq, &kbb, negone, WB, &WBd[LLD_], WA, &WAd[LLD_], talph, Bptr0, &Bld ); if( WBfr ) free( WB ); talph = one; } if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } } } else { if( Mupcase( VARIANT[0] ) == CRIGHT ) { /* * Right looking variant for the transpose cases */ conjg = ( ( Mupcase( TRANSA[0] ) == CCOTRAN ) ? CCONJG : CNOCONJG ); if( lside ) { if( !upper ) { /* * Left Lower (Conjugate) Transpose */ kmax = ( ( M - 1 ) / kb ) * kb; for( k = kmax; k >= 0; k -= kb ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 ) */ PB_CGatherV( TYPE, REUSE, BACKWARD, kbb, ktmp, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 )' over B(IB:IB+ktmp-1, JB:JB+N-1) */ PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Solve B( IB+k:IB+ktmp-1, JB:JB+N-1 ) with talph */ PB_CptrsmAB0( TYPE, SIDE, UPPER, DIAG, kbb, N, talph, WA, k, 0, WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB, &Bfr ); /* * Update B( IB:IB+k-1, JB:JB+N-1 ) */ if( k > 0 ) { /* * Replicate B( IB+k:IB+ktmp-1, JB:JB+N-1 ) over B( IB:IB+k-1, JB:JB+N-1 ) */ PB_Cdescset( Bd0, k, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, k, N, Bd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * Local update */ Bmp = PB_Cnumroc( k, 0, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0, &kbb, negone, WA, &WAd[LLD_], WB, &WBd[LLD_], talph, Bptr0, &Bld ); if( WBfr ) free( WB ); talph = one; } if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } else { /* * Left Upper (Conjugate) Transpose */ for( k = 0; k < M; k += kb ) { ktmp = M - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 ) */ PB_CGatherV( TYPE, REUSE, FORWARD, kbb, ktmp, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )' over B( IB+k:IB+M-1, JB:JB+N-1 ) */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Solve B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) with talph */ PB_CptrsmAB0( TYPE, SIDE, LOWER, DIAG, kbb, N, talph, WA, 0, 0, WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB, &Bfr ); /* * Update B( IB+k+kbb:IB+M-1, JB:JB+N-1 ) */ if( ( ktmp = ktmp - kbb ) > 0 ) { /* * Replicate B(IB+k:IB+k+kbb-1, JB:JB+N-1) over B(IB+k+kbb:IB+M-1, JB:JB+N-1) */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k+kbb, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k+kbb, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0, DBUFB, ROW, &WB, WBd, &WBfr ); /* * Local update */ Bmp = PB_Cnumroc( ktmp, k+kbb, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp > 0 ) && ( Bnq0 > 0 ) ) { WAoff = PB_Cnumroc( kbb, 0, WAd[IMB_], WAd[MB_], myrow, WAd[RSRC_], nprow ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0, &kbb, negone, Mptr( WA, WAoff, 0, WAd[LLD_], size ), &WAd[LLD_], WB, &WBd[LLD_], talph, Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld ); } if( WBfr ) free( WB ); talph = one; } if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } } else { if( !upper ) { /* * Right Lower (Conjugate) Transpose */ for( k = 0; k < N; k += kb ) { ktmp = N - k; kbb = MIN( ktmp, kb ); /* * Accumulate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, FORWARD, ktmp, kbb, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )' over B( IB:IB+M-1, JB+k:JB+N-1 ) */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Solve B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) with talph */ PB_CptrsmAB0( TYPE, SIDE, UPPER, DIAG, M, kbb, talph, WA, 0, 0, WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB, &Bfr ); /* * Update B( IB:IB+M-1, JB+k+kbb:JB+N-1 ) */ if( ( ktmp = ktmp - kbb ) > 0 ) { /* * Replicate B(IB:IB+M-1, JB+k:JB+k+kbb-1) over B(IB:IB+M-1, JB+k+kbb:JB+N-1) */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k+kbb, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k+kbb, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * Local update */ Bnq = PB_Cnumroc( ktmp, k+kbb, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) { WAoff = PB_Cnumroc( kbb, 0, WAd[INB_], WAd[NB_], mycol, WAd[CSRC_], npcol ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq, &kbb, negone, WB, &WBd[LLD_], Mptr( WA, 0, WAoff, WAd[LLD_], size ), &WAd[LLD_], talph, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld ); } if( WBfr ) free( WB ); talph = one; } if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } else { /* * Right Upper (Conjugate) Transpose */ kmax = ( ( N - 1 ) / kb ) * kb; for( k = kmax; k >= 0; k -= kb ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+ktmp-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, BACKWARD, ktmp, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA:IA+ktmp-1, JA+k:JA+k+kbb-1 )' over B(IB:IB+M-1, JB:JB+ktmp-1) */ PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Solve B( IB:IB+M-1, JB+k:JB+ktmp-1 ) with talph */ PB_CptrsmAB0( TYPE, SIDE, LOWER, DIAG, M, kbb, talph, WA, 0, k, WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB, &Bfr ); /* * Update B( IB:IB+M-1, JB:JB+k-1 ) */ if( k > 0 ) { /* * Replicate B( IB:IB+M-1, JB+k:JB+ktmp-1 ) over B( IB:IB+M-1, JB:JB+k-1 ) */ PB_Cdescset( Bd0, M, k, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, M, k, Bd0, kbb, Bptr, 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr ); /* * Local update */ Bnq = PB_Cnumroc( k, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq, &kbb, negone, WB, &WBd[LLD_], WA, &WAd[LLD_], talph, Bptr0, &Bld ); if( WBfr ) free( WB ); talph = one; } if( WAfr ) free( WA ); if( Bfr ) free( Bptr ); if( Afr ) free( Aptr ); } } } } else { /* * Left looking variant for the transpose cases */ if( lside ) { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); if( upper ) { /* * Accumulate A( IA:IA+Bimb1-1, JA:JA+Bimb1-1 ) */ PB_CGatherV( TYPE, REUSE, FORWARD, Bimb1, Bimb1, A, IA, JA, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA:IA+Bimb1-1, JA:JA+Bimb1-1 ) over B(IB:IB+Bimb1-1, JB:JB+N-1) */ PB_Cdescset( Bd0, Bimb1, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, Bimb1, N, Bd0, Bimb1, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Solve B( IB:IB+Bimb1-1, JB:JB+N-1 ) */ if( ( ( Brow < 0 ) || ( myrow == Brow ) ) && ( Bnq0 > 0 ) ) TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &Bimb1, &Bnq0, ALPHA, WA, &WAd[LLD_], Bptr0, &Bld ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * Update and solve remaining rows of sub( B ) */ for( k = Bimb1; k < M; k += kb ) { kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA:IA+ktmp-1, JA+k:JA+ktmp-1 ) */ PB_CGatherV( TYPE, REUSE, FORWARD, ktmp, kbb, A, IA, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA:IA+ktmp-1, JA+k:JA+ktmp-1 ) over B(IB:IB+ktmp-1, JB:JB+N-1) */ PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * WB := A( IA:IA+k-1, JA+k:JA+ktmp-1 )' * B( IB:IB+k-1, JB:JB+N-1 ) */ PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr, &WBsum ); Bmp = PB_Cnumroc( k, 0, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bnq0 > 0 ) && ( Bmp > 0 ) ) gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp, one, WA, &WAd[LLD_], Bptr0, &Bld, zero, WB, &WBd[LLD_] ); if( WBsum ) { WBd[RSRC_] = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); if( Bnq0 > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_], WBd[RSRC_], mycol ); } /* * Add WB to B( IB+k:IB+ktmp-1, JB:JB+N-1 ) and solve it with * A( IA+k:IA+ktmp-1, JA+k:JA+ktmp-1 ) */ PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, kbb, N, ALPHA, WA, k, 0, WAd, B, IB+k, JB, DESCB, WB, WBd ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); } } else { /* * Solve last block of rows of sub( B ) */ Bcurimb1 = PB_Clastnb( M, IB, Bimb, Bmb ); k = M - Bcurimb1; /* * Accumulate A( IA+k:IA+M-1, JA+k:JA+M-1 ) */ PB_CGatherV( TYPE, REUSE, BACKWARD, Bcurimb1, Bcurimb1, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+M-1, JA+k:JA+M-1 ) over B( IB+k:IB+M-1, JB:JB+N-1 ) */ Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, Bcurimb1, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, Bcurimb1, N, Bd0, Bcurimb1, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * Solve B( IB+k:IB+M-1, JB:JB+N-1 ) */ if( ( ( Brow < 0 ) || ( myrow == Bcurrow ) ) && ( Bnq0 > 0 ) ) TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &Bcurimb1, &Bnq0, ALPHA, WA, &WAd[LLD_], Mptr( Bptr0, Bmp0-Bcurimb1, 0, Bld, size ), &Bld ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); if( ( mn = M - Bcurimb1 ) <= 0 ) return; /* * Update and solve remaining rows of sub( B ) */ kmax = ( ( mn - 1 ) / kb ) * kb; for( k = kmax; k >= 0; k -= kb ) { ktmp = M - k; kbb = mn - k; kbb = MIN( kbb, kb ); /* * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) */ PB_CGatherV( TYPE, REUSE, BACKWARD, ktmp, kbb, A, IA+k, JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over B( IB+k:IB+M-1, JB:JB+N-1 ) */ Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb ); Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow ); PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr ); /* * WB := A( IA+k+kbb:IA+M-1, JA+k:JA+k+kbb-1 )'* B( IB+k+kbb:IB+M-1, JB:JB+N-1 ) */ PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr, &WBsum ); Bmp = PB_Cnumroc( ktmp-kbb, k+kbb, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bnq0 > 0 ) && ( Bmp > 0 ) ) { WAoff = PB_Cnumroc( kbb, 0, WAd[IMB_], WAd[MB_], myrow, WAd[RSRC_], nprow ); gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp, one, Mptr( WA, WAoff, 0, WAd[LLD_], size ), &WAd[LLD_], Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld, zero, WB, &WBd[LLD_] ); } if( WBsum ) { WBd[RSRC_] = PB_Cindxg2p( k + kbb - 1, Bimb1, Bmb, Brow, Brow, nprow ); if( Bnq0 > 0 ) gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_], WBd[RSRC_], mycol ); } /* * Add WB to B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) and solve it with * A( IA+k:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, kbb, N, ALPHA, WA, 0, 0, WAd, B, IB+k, JB, DESCB, WB, WBd ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); } } } else { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); if( upper ) { /* * Solve last block of columns of sub( B ) */ Bcurinb1 = PB_Clastnb( N, JB, Binb, Bnb ); k = N - Bcurinb1; /* * Accumulate A( IA+k:IA+N-1, JA+k:JA+N-1 ) */ PB_CGatherV( TYPE, REUSE, BACKWARD, Bcurinb1, Bcurinb1, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+N-1, JA+k:JA+N-1 ) over B( IB:IB+M-1, JB+k:JB+N-1 ) */ Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, Bcurinb1, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, Bcurinb1, Bd0, Bcurinb1, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Solve B( IB:IB+M-1, JB+k:JB+N-1 ) */ if( ( ( Bcol < 0 ) || ( mycol == Bcurcol ) ) && ( Bmp0 > 0 ) ) TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &Bmp0, &Bcurinb1, ALPHA, WA, &WAd[LLD_], Mptr( Bptr0, 0, Bnq0-Bcurinb1, Bld, size ), &Bld ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); if( ( mn = N - Bcurinb1 ) <= 0 ) return; /* * Update and solve remaining columns of sub( B ) */ kmax = ( ( mn - 1 ) / kb ) * kb; for( k = kmax; k >= 0; k -= kb ) { ktmp = N - k; kbb = mn - k; kbb = MIN( kbb, kb ); /* * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) */ PB_CGatherV( TYPE, REUSE, BACKWARD, kbb, ktmp, A, IA+k, JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over B( IB:IB+M-1, JB+k:JB+N-1 ) */ Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb ); Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * WB := B( IB:IB+M-1, JB+k+kbb:JB+N-1 ) * A(IA+k:IA+k+kbb-1, JA+k+kbb:JA+N-1)' */ PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd, &WBfr, &WBsum ); Bnq = PB_Cnumroc( ktmp-kbb, k+kbb, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) { WAoff = PB_Cnumroc( kbb, 0, WAd[INB_], WAd[NB_], mycol, WAd[CSRC_], npcol ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb, &Bnq, one, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld, Mptr( WA, 0, WAoff, WAd[LLD_], size ), &WAd[LLD_], zero, WB, &WBd[LLD_] ); } if( WBsum ) { WBd[CSRC_] = PB_Cindxg2p( k + kbb - 1, Binb1, Bnb, Bcol, Bcol, npcol ); if( Bmp0 > 0 ) gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_], myrow, WBd[CSRC_] ); } /* * Add WB to B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) and solve it with * A( IA+k:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) */ PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, M, kbb, ALPHA, WA, 0, 0, WAd, B, IB, JB+k, DESCB, WB, WBd ); if( WBfr ) free( WB ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); } } else { /* * Accumulate A( IA:IA+Binb1-1, JA:JA+Binb1-1 ) */ PB_CGatherV( TYPE, REUSE, FORWARD, Binb1, Binb1, A, IA, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA:IA+Binb1-1, JA:JA+Binb1-1 ) over B(IB:IB+M-1, JB:JB+Binb1-1) */ PB_Cdescset( Bd0, M, Binb1, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, Binb1, Bd0, Binb1, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * Solve B( IB:IB+M-1, JB:JB+Binb1-1 ) */ if( ( ( Bcol < 0 ) || ( mycol == Bcol ) ) && ( Bmp0 > 0 ) ) TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &Bmp0, &Binb1, ALPHA, WA, &WAd[LLD_], Bptr0, &Bld ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); /* * Update and solve remaining columns of sub( B ) */ for( k = Binb1; k < N; k += kb ) { kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb; /* * Accumulate A( IA+k:IA+ktmp-1, JA:JA+ktmp-1 ) */ PB_CGatherV( TYPE, REUSE, FORWARD, kbb, ktmp, A, IA+k, JA, DESCA, ROW, &Aptr, DBUFA, &Afr ); /* * Replicate A( IA+k:IA+ktmp-1, JA:JA+ktmp-1 ) over B( IB:IB+M-1, JB:JB+ktmp-1 ) */ PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0, DBUFA, ROW, &WA, WAd, &WAfr ); /* * WB := B( IB:IB+M-1, JB:JB+k-1 ) * A( IA+k:IA+ktmp-1, JA:JA+k-1 )' */ PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd, &WBfr, &WBsum ); Bnq = PB_Cnumroc( k, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb, &Bnq, one, Bptr0, &Bld, WA, &WAd[LLD_], zero, WB, &WBd[LLD_] ); if( WBsum ) { WBd[CSRC_] = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol ); if( Bmp0 > 0 ) gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_], myrow, WBd[CSRC_] ); } /* * Add WB to B( IB:IB+M-1, JB+k:JB+ktmp-1 ) and solve it with * A( IA+k:IA+ktmp-1, JA+k:JA+ktmp-1 ) */ PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, M, kbb, ALPHA, WA, 0, k, WAd, B, IB, JB+k, DESCB, WB, WBd ); if( WAfr ) free( WA ); if( Afr ) free( Aptr ); if( WBfr ) free( WB ); } } } } } /* * End of PB_CptrsmAB */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CptrsmAB0.c000644 000766 000024 00000046616 10363532303 020514 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CptrsmAB0( PBTYP_T * TYPE, char * SIDE, char * UPLO, char * DIAG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB, char * * C, int * DESCC, int * CFREE ) #else void PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, C, DESCC, CFREE ) /* * .. Scalar Arguments .. */ char * DIAG, * SIDE, * UPLO; int * CFREE, IA, IB, JA, JB, M, N; char * ALPHA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; char * A, * B, * * C; #endif { /* * .. Local Scalars .. */ char btop, * negone, * one, * talpha, * zero; int Acol, Acurcol, Acurrow, Aii, Aimb, Aimb1, Ainb, Ainb1, Ajj, Ald, Almb1, Alnb1, Amb, Amp0, Anq0, An, Anb, Arow, Bcol, Bii, Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp0, Bnb, Bnq0, Brow, Cld, ctxt, k=1, kb, kblks, kbprev, ktmp, lside, mycol, myrow, npcol, nprow, size, upper; char * Aptr = NULL, * Aptr0 = NULL, * Bptr = NULL, * Bptr0 = NULL, * Cptr = NULL; MMADD_T mmadd; GEBR2D_T brecv; GEBS2D_T bsend; GEMM_T gemm; TRSM_T trsm; /* .. * .. Executable Statements .. * */ size = TYPE->size; lside = ( Mupcase( SIDE[0] ) == CLEFT ); /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol, Ald */ Ald = DESCA[LLD_]; PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); /* * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol, Bld ... */ Bimb = DESCB[IMB_]; Binb = DESCB[INB_]; Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); /* * Shorcuts when sub( B ) spans only one process row or column */ if( lside ) { if( !( PB_Cspan( M, IB, Bimb, Bmb, DESCB[RSRC_], nprow ) ) ) { *CFREE = 0; Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb ); PB_Cdescset( DESCC, M, N, M, Binb1, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bnq0 > 0 ) && ( ( ( Brow >= 0 ) && ( myrow == Brow ) ) || ( Brow < 0 ) ) ) { *C = Mptr( B, Bii, Bjj, Bld, size ); TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &M, &Bnq0, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, *C, &Bld ); } return; } } else { if( !( PB_Cspan( N, JB, Binb, Bnb, DESCB[CSRC_], npcol ) ) ) { *CFREE = 0; Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb ); PB_Cdescset( DESCC, M, N, Bimb1, N, Bmb, Bnb, Brow, Bcol, ctxt, Bld ); Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow ); if( ( Bmp0 > 0 ) && ( ( ( Bcol >= 0 ) && ( mycol == Bcol ) ) || ( Bcol < 0 ) ) ) { *C = Mptr( B, Bii, Bjj, Bld, size ); TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &Bmp0, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald, *C, &Bld ); } return; } } /* * Handle the general case now */ An = ( lside ? M : N ); upper = ( Mupcase( UPLO[0] ) == CUPPER ); talpha = ALPHA; negone = TYPE->negone; one = TYPE->one; zero = TYPE->zero; brecv = TYPE->Cgebr2d; bsend = TYPE->Cgebs2d; mmadd = TYPE->Fmmadd; gemm = TYPE->Fgemm; trsm = TYPE->Ftrsm; /* * Compute more local information for sub( A ) and sub( B ) */ Aimb = DESCA[IMB_]; Ainb = DESCA[INB_]; Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Aimb1 = PB_Cfirstnb( An, IA, Aimb, Amb ); Almb1 = PB_Clastnb ( An, IA, Aimb, Amb ); Amp0 = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow ); Ainb1 = PB_Cfirstnb( An, JA, Ainb, Anb ); Alnb1 = PB_Clastnb ( An, JA, Ainb, Anb ); Anq0 = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp0 > 0 ) && ( Anq0 > 0 ) ) Aptr0 = Mptr( A, Aii, Ajj, Ald, size ); Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb ); Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow ); Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb ); Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol ); if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size ); if( lside ) { Cld = M; PB_Cdescset( DESCC, M, N, M, Binb1, Bmb, Bnb, -1, Bcol, ctxt, Cld ); if( Bnq0 > 0 ) { Cptr = *C = PB_Cmalloc( M * Bnq0 * size ); *CFREE = 1; } else { *C = NULL; *CFREE = 0; return; } kblks = ( An > Aimb1 ? ( An - Aimb1 - 1 ) / Amb + 2 : 1 ); btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( upper ) { Acurrow = PB_Cindxg2p( An-1, Aimb1, Amb, Arow, Arow, nprow ); kb = Almb1; Bptr = Mptr( Bptr0, Bmp0 - kb, 0, Bld, size ); Cptr = Mptr( *C, An - kb, 0, Cld, size ); /* * Solve last block of rows of sub( B ) and broadcast it vertically to update * the rest of sub( B ) */ if( myrow == Acurrow ) { trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &kb, &Bnq0, ALPHA, Mptr( Aptr0, Amp0-kb, Anq0-kb, Ald, size ), &Ald, Bptr, &Bld ); bsend( ctxt, COLUMN, &btop, kb, Bnq0, Bptr, Bld ); mmadd( &kb, &Bnq0, one, Bptr, &Bld, zero, Cptr, &Cld ); Amp0 -= kb; Bmp0 -= kb; } else { brecv( ctxt, COLUMN, &btop, kb, Bnq0, Cptr, Cld, Acurrow, mycol ); } Acurrow = MModSub1( Acurrow, nprow ); An -= ( kbprev = kb ); Anq0 -= kb; kblks -= 1; /* * Lookahead */ while( kblks > 0 ) { kb = ( kblks == 1 ? Aimb1 : Amb ); Aptr = Mptr( Aptr0, 0, Anq0, Ald, size ); Bptr = Mptr( Bptr0, Bmp0 - kb, 0, Bld, size ); Cptr = Mptr( *C, An, 0, Cld, size ); if( myrow == Acurrow ) { /* * Update the current block of rows of sub( B ) with block of rows of sub( B ) * of previous step */ gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &kb, &Bnq0, &kbprev, negone, Mptr( Aptr, Amp0-kb, 0, Ald, size ), &Ald, Cptr, &Cld, talpha, Bptr, &Bld ); /* * Solve the current block of rows of sub( B ) */ trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &kb, &Bnq0, one, Mptr( Aptr, Amp0-kb, -kb, Ald, size ), &Ald, Bptr, &Bld ); /* * Broadcast the current block of rows of sub( B ) for next update */ bsend( ctxt, COLUMN, &btop, kb, Bnq0, Bptr, Bld ); mmadd( &kb, &Bnq0, one, Bptr, &Bld, zero, Mptr( Cptr, -kb, 0, Cld, size ), &Cld ); /* * Finish update of the remaining blocks of rows of sub( B ) with block of rows * of sub( B ) of previous step */ if( ( ktmp = Amp0 - kb ) > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &ktmp, &Bnq0, &kbprev, negone, Aptr, &Ald, Cptr, &Cld, talpha, Bptr0, &Bld ); Amp0 -= kb; Bmp0 -= kb; } else { /* * Update the remaining rows of sub( B ) with block of rows of sub( B ) of * previous step */ if( Amp0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Amp0, &Bnq0, &kbprev, negone, Aptr, &Ald, Cptr, &Cld, talpha, Bptr0, &Bld ); /* * Receive the current block of rows of sub( B ) for next update */ brecv( ctxt, COLUMN, &btop, kb, Bnq0, Mptr( Cptr, -kb, 0, Cld, size ), Cld, Acurrow, mycol ); } Acurrow = MModSub1( Acurrow, nprow ); An -= ( kbprev = kb ); Anq0 -= kb; talpha = one; kblks -= 1; } } else { Acurrow = Arow; kb = Aimb1; /* * Solve first block of rows of sub( B ) and broadcast it vertically to update * the rest of sub( B ) */ if( myrow == Acurrow ) { trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &kb, &Bnq0, ALPHA, Aptr0, &Ald, Bptr0, &Bld ); bsend( ctxt, COLUMN, &btop, kb, Bnq0, Bptr0, Bld ); mmadd( &kb, &Bnq0, one, Bptr0, &Bld, zero, Cptr, &Cld ); Amp0 -= kb; Aptr0 = Mptr( Aptr0, kb, 0, Ald, size ); Bptr0 = Mptr( Bptr0, kb, 0, Bld, size ); } else { brecv( ctxt, COLUMN, &btop, kb, Bnq0, Cptr, Cld, Acurrow, mycol ); } Acurrow = MModAdd1( Acurrow, nprow ); kbprev = kb; Cptr = Mptr( Cptr, kb, 0, Cld, size ); Aptr0 = Mptr( Aptr0, 0, kb, Ald, size ); k += 1; /* * Lookahead */ while( k <= kblks ) { kb = ( k == kblks ? Almb1 : Amb ); if( myrow == Acurrow ) { /* * Update the current block of rows of sub( B ) with block of rows of sub( B ) * of previous step */ gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &kb, &Bnq0, &kbprev, negone, Mptr( Aptr0, 0, -kbprev, Ald, size ), &Ald, Mptr( Cptr, -kbprev, 0, Cld, size ), &Cld, talpha, Bptr0, &Bld ); /* * Solve the current block of rows of sub( B ) */ trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &kb, &Bnq0, one, Aptr0, &Ald, Bptr0, &Bld ); /* * Broadcast the current block of rows of sub( B ) for next update */ bsend( ctxt, COLUMN, &btop, kb, Bnq0, Bptr0, Bld ); mmadd( &kb, &Bnq0, one, Bptr0, &Bld, zero, Cptr, &Cld ); /* * Finish update of the remaining blocks of rows of sub( B ) with block of rows * of sub( B ) of previous step */ if( ( ktmp = Amp0 - kb ) > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &ktmp, &Bnq0, &kbprev, negone, Mptr( Aptr0, kb, -kbprev, Ald, size ), &Ald, Mptr( Cptr, -kbprev, 0, Cld, size ), &Cld, talpha, Mptr( Bptr0, kb, 0, Bld, size ), &Bld ); Amp0 -= kb; Aptr0 = Mptr( Aptr0, kb, 0, Ald, size ); Bptr0 = Mptr( Bptr0, kb, 0, Bld, size ); } else { /* * Update the remaining rows of sub( B ) with block of rows of sub( B ) of * previous step */ if( Amp0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Amp0, &Bnq0, &kbprev, negone, Mptr( Aptr0, 0, -kbprev, Ald, size ), &Ald, Mptr( Cptr, -kbprev, 0, Cld, size ), &Cld, talpha, Bptr0, &Bld ); /* * Receive the current block of rows of sub( B ) for next update */ brecv( ctxt, COLUMN, &btop, kb, Bnq0, Cptr, Cld, Acurrow, mycol ); } Acurrow = MModAdd1( Acurrow, nprow ); kbprev = kb; Cptr = Mptr( Cptr, kb, 0, Cld, size ); Aptr0 = Mptr( Aptr0, 0, kb, Ald, size ); talpha = one; k += 1; } } } else { Cld = MAX( 1, Bmp0 ); PB_Cdescset( DESCC, M, N, Bimb1, N, Bmb, Bnb, Brow, -1, ctxt, Cld ); if( Bmp0 > 0 ) { Cptr = *C = PB_Cmalloc( Bmp0 * N * size ); *CFREE = 1; } else { *C = NULL; *CFREE = 0; return; } kblks = ( An > Ainb1 ? ( An - Ainb1 - 1 ) / Anb + 2 : 1 ); btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( upper ) { Acurcol = Acol; kb = Ainb1; /* * Solve first block of columns of sub( B ) and broadcast it horizontally to * update the rest of sub( B ) */ if( mycol == Acurcol ) { trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &Bmp0, &kb, ALPHA, Aptr0, &Ald, Bptr0, &Bld ); bsend( ctxt, ROW, &btop, Bmp0, kb, Bptr0, Bld ); mmadd( &Bmp0, &kb, one, Bptr0, &Bld, zero, Cptr, &Cld ); Anq0 -= kb; Aptr0 = Mptr( Aptr0, 0, kb, Ald, size ); Bptr0 = Mptr( Bptr0, 0, kb, Bld, size ); } else { brecv( ctxt, ROW, &btop, Bmp0, kb, Cptr, Cld, myrow, Acurcol ); } Acurcol = MModAdd1( Acurcol, npcol ); kbprev = kb; k += 1; Cptr = Mptr( Cptr, 0, kb, Cld, size ); Aptr0 = Mptr( Aptr0, kb, 0, Ald, size ); /* * Lookahead */ while( k <= kblks ) { kb = ( k == kblks ? Alnb1 : Anb ); if( mycol == Acurcol ) { /* * Update the current block of columns of sub( B ) with block of columns of * sub( B ) of previous step */ gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &kb, &kbprev, negone, Mptr( Cptr, 0, -kbprev, Cld, size ), &Cld, Mptr( Aptr0, -kbprev, 0, Ald, size ), &Ald, talpha, Bptr0, &Bld ); /* * Solve the current block of columns of sub( B ) */ trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &Bmp0, &kb, one, Aptr0, &Ald, Bptr0, &Bld ); /* * Broadcast the current block of columns of sub( B ) for next update */ bsend( ctxt, ROW, &btop, Bmp0, kb, Bptr0, Bld ); mmadd( &Bmp0, &kb, one, Bptr0, &Bld, zero, Cptr, &Cld ); /* * Finish update of the remaining blocks of columns of sub( B ) with block of * columns of sub( B ) of previous step */ if( ( ktmp = Anq0 - kb ) > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &ktmp, &kbprev, negone, Mptr( Cptr, 0, -kbprev, Cld, size ), &Cld, Mptr( Aptr0, -kbprev, kb, Ald, size ), &Ald, talpha, Mptr( Bptr0, 0, kb, Bld, size ), &Bld ); Anq0 -= kb; Aptr0 = Mptr( Aptr0, 0, kb, Ald, size ); Bptr0 = Mptr( Bptr0, 0, kb, Bld, size ); } else { /* * Update the remaining columns of sub( B ) with block of columns of sub( B ) * of previous step */ if( Anq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Anq0, &kbprev, negone, Mptr( Cptr, 0, -kbprev, Cld, size ), &Cld, Mptr( Aptr0, -kbprev, 0, Ald, size ), &Ald, talpha, Bptr0, &Bld ); /* * Receive the current block of columns of sub( B ) for next update */ brecv( ctxt, ROW, &btop, Bmp0, kb, Cptr, Cld, myrow, Acurcol ); } Acurcol = MModAdd1( Acurcol, npcol ); kbprev = kb; Cptr = Mptr( Cptr, 0, kb, Cld, size ); Aptr0 = Mptr( Aptr0, kb, 0, Ald, size ); talpha = one; k += 1; } } else { Acurcol = PB_Cindxg2p( An-1, Ainb1, Anb, Acol, Acol, npcol ); kb = Alnb1; Bptr = Mptr( Bptr0, 0, Bnq0 - kb, Bld, size ); Cptr = Mptr( *C, 0, An - kb, Cld, size ); /* * Solve last block of columns of sub( B ) and broadcast it horizontally to * update the rest of sub( B ) */ if( mycol == Acurcol ) { trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &Bmp0, &kb, ALPHA, Mptr( Aptr0, Amp0-kb, Anq0-kb, Ald, size ), &Ald, Bptr, &Bld ); bsend( ctxt, ROW, &btop, Bmp0, kb, Bptr, Bld ); mmadd( &Bmp0, &kb, one, Bptr, &Bld, zero, Cptr, &Cld ); Anq0 -= kb; Bnq0 -= kb; } else { brecv( ctxt, ROW, &btop, Bmp0, kb, Cptr, Cld, myrow, Acurcol ); } Acurcol = MModSub1( Acurcol, npcol ); An -= ( kbprev = kb ); Amp0 -= kb; kblks -= 1; /* * Lookahead */ while( kblks > 0 ) { kb = ( kblks == 1 ? Ainb1 : Anb ); Aptr = Mptr( Aptr0, Amp0, 0, Ald, size ); Bptr = Mptr( Bptr0, 0, Bnq0 - kb, Bld, size ); Cptr = Mptr( *C, 0, An, Cld, size ); if( mycol == Acurcol ) { /* * Update the current block of columns of sub( B ) with block of columns of * sub( B ) of previous step */ gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &kb, &kbprev, negone, Cptr, &Cld, Mptr( Aptr, 0, Anq0-kb, Ald, size ), &Ald, talpha, Bptr, &Bld ); /* * Solve the current block of columns of sub( B ) */ trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), C2F_CHAR( DIAG ), &Bmp0, &kb, one, Mptr( Aptr, -kb, Anq0-kb, Ald, size ), &Ald, Bptr, &Bld ); /* * Broadcast the current block of columns of sub( B ) for next update */ bsend( ctxt, ROW, &btop, Bmp0, kb, Bptr, Bld ); mmadd( &Bmp0, &kb, one, Bptr, &Bld, zero, Mptr( Cptr, 0, -kb, Cld, size ), &Cld ); /* * Finish update of the remaining blocks of columns of sub( B ) with block of * columns of sub( B ) of previous step */ if( ( ktmp = Anq0 - kb ) > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &ktmp, &kbprev, negone, Cptr, &Cld, Aptr, &Ald, talpha, Bptr0, &Bld ); Anq0 -= kb; Bnq0 -= kb; } else { /* * Update the remaining columns of sub( B ) with block of columns of sub( B ) * of previous step */ if( Anq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Anq0, &kbprev, negone, Cptr, &Cld, Aptr, &Ald, talpha, Bptr0, &Bld ); /* * Receive the current block of columns of sub( B ) for next update */ brecv( ctxt, ROW, &btop, Bmp0, kb, Mptr( Cptr, 0, -kb, Cld, size ), Cld, myrow, Acurcol ); } Acurcol = MModSub1( Acurcol, npcol ); An -= ( kbprev = kb ); Amp0 -= kb; talpha = one; kblks -= 1; } } } /* * End of PB_CptrsmAB0 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CptrsmAB1.c000644 000766 000024 00000031770 10363532303 020510 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CptrsmAB1( PBTYP_T * TYPE, char * SIDE, char * UPLO, char * TRANSA, char * DIAG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB, char * C, int * DESCC ) #else void PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, C, DESCC ) /* * .. Scalar Arguments .. */ char * DIAG, * SIDE, * TRANSA, * UPLO; int IA, IB, JA, JB, M, N; char * ALPHA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB, * DESCC; char * A, * B, * C; #endif { /* * .. Local Scalars .. */ char * negone, * one; int Acol, Acurcol, Acurrow, Aii, Aimb, Aimb1, Ainb, Ainb1, Ajj, Ald, Almb1, Alnb1, Amb, Amp0, An, Anb, Anq0, Anxtrow, Anxtcol, Arow, Bcol, Bii, Bimb, Binb, Bjj, Bld, Bmb, Bmp0, Bnb, Bnq0, Brow, Cld, ctxt, k=1, kb, kblks, lside, mycol, myrow, npcol, nprow, size, upper; MMADD_T mmadd; GERV2D_T recv; GESD2D_T send; GEMM_T gemm; TRSM_T trsm; /* * .. Local Arrays .. */ char * Aptr = NULL, * Aptr0 = NULL, * Bptr = NULL, * Bptr0 = NULL, * Cptr = NULL; /* .. * .. Executable Statements .. * */ size = TYPE->size; lside = ( Mupcase( SIDE[0] ) == CLEFT ); /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol, Ald */ Ald = DESCA[LLD_]; PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); /* * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol, Bld ... */ Bimb = DESCB[IMB_]; Binb = DESCB[INB_]; Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); /* * Shorcuts when sub( B ) spans only one process row or column */ if( lside ) { Bnq0 = PB_Cnumroc( N, JB, Binb, Bnb, mycol, DESCB[CSRC_], npcol ); if( Bnq0 <= 0 ) return; Bmp0 = PB_Cnumroc( M, IB, Bimb, Bmb, myrow, DESCB[RSRC_], nprow ); if( !( PB_Cspan( M, IB, Bimb, Bmb, DESCB[RSRC_], nprow ) ) ) { if( Bmp0 > 0 ) { Bptr0 = Mptr( B, Bii, Bjj, Bld, size ); TYPE->Fmmadd( &M, &Bnq0, TYPE->negone, C, &DESCC[LLD_], ALPHA, Bptr0, &Bld ); TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &M, &Bnq0, TYPE->one, Mptr( A, Aii, Ajj, Ald, size ), &Ald, Bptr0, &Bld ); } return; } if( Bmp0 > 0 ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size ); } else { Bmp0 = PB_Cnumroc( M, IB, Bimb, Bmb, myrow, DESCB[RSRC_], nprow ); if( Bmp0 <= 0 ) return; Bnq0 = PB_Cnumroc( N, JB, Binb, Bnb, mycol, DESCB[CSRC_], npcol ); if( !( PB_Cspan( N, JB, Binb, Bnb, DESCB[CSRC_], npcol ) ) ) { if( Bnq0 > 0 ) { Bptr0 = Mptr( B, Bii, Bjj, Bld, size ); TYPE->Fmmadd( &Bmp0, &N, TYPE->negone, C, &DESCC[LLD_], ALPHA, Bptr0, &Bld ); TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &Bmp0, &N, TYPE->one, Mptr( A, Aii, Ajj, Ald, size ), &Ald, Bptr0, &Bld ); } return; } if( Bnq0 > 0 ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size ); } /* * Handle the general case now */ An = ( lside ? M : N ); upper = ( Mupcase( UPLO[0] ) == CUPPER ); negone = TYPE->negone; one = TYPE->one; recv = TYPE->Cgerv2d; send = TYPE->Cgesd2d; mmadd = TYPE->Fmmadd; gemm = TYPE->Fgemm; trsm = TYPE->Ftrsm; /* * Compute more local information for sub( A ) */ Aimb = DESCA[IMB_]; Ainb = DESCA[INB_]; Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Aimb1 = PB_Cfirstnb( An, IA, Aimb, Amb ); Almb1 = PB_Clastnb ( An, IA, Aimb, Amb ); Amp0 = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow ); Ainb1 = PB_Cfirstnb( An, JA, Ainb, Anb ); Alnb1 = PB_Clastnb ( An, JA, Ainb, Anb ); Anq0 = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Amp0 > 0 ) && ( Anq0 > 0 ) ) Aptr0 = Mptr( A, Aii, Ajj, Ald, size ); Cld = DESCC[LLD_]; if( lside ) { kblks = ( An > Aimb1 ? ( An - Aimb1 - 1 ) / Amb + 2 : 1 ); if( upper ) { Acurrow = Arow; Anxtrow = MModAdd1( Acurrow, nprow ); Aptr = Aptr0; Bptr = Bptr0; Cptr = C; while( k <= kblks ) { kb = ( k == 1 ? Aimb1 : ( k == kblks ? Almb1 : Amb ) ); An -= kb; if( myrow == Acurrow ) { /* * Add contribution of previous blocks of rows of sub( B ) to part of the * current block of rows of sub( B ) */ mmadd( &kb, &Bnq0, negone, Cptr, &Cld, ALPHA, Bptr, &Bld ); /* * Solve updated and current part of block of rows of sub( B ) */ trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &kb, &Bnq0, one, Aptr, &Ald, Bptr, &Bld ); /* * Add contribution of part of the current block of rows of sub( B ) to the * remaining of the contribution of previous blocks of rows of sub( B ). Send * this remaining part to next process row. */ if( An > 0 ) { gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &An, &Bnq0, &kb, one, Mptr( Aptr, 0, kb, Ald, size ), &Ald, Bptr, &Bld, one, Mptr( Cptr, kb, 0, Cld, size ), &Cld ); send( ctxt, An, Bnq0, Mptr( Cptr, kb, 0, Cld, size ), Cld, Anxtrow, mycol ); } Aptr = Mptr( Aptr, kb, 0, Ald, size ); Bptr = Mptr( Bptr, kb, 0, Bld, size ); Cptr = C; } else if( myrow == Anxtrow ) { /* * Receive contribution of previous blocks of rows of sub( B ) to be added to * next block of rows of sub( B ) */ if( An > 0 ) recv( ctxt, An, Bnq0, Cptr, Cld, Acurrow, mycol ); } Aptr = Mptr( Aptr, 0, kb, Ald, size ); Acurrow = Anxtrow; Anxtrow = MModAdd1( Acurrow, nprow ); k += 1; } } else { k = kblks; Acurrow = PB_Cindxg2p( An-1, Aimb1, Amb, Arow, Arow, nprow ); Anxtrow = MModSub1( Acurrow, nprow ); while( k > 0 ) { kb = ( k == 1 ? Aimb1 : ( k == kblks ? Almb1 : Amb ) ); An -= kb; if( myrow == Acurrow ) { Aptr = Mptr( Aptr0, Amp0 - kb, 0, Ald, size ); Bptr = Mptr( Bptr0, Bmp0 - kb, 0, Bld, size ); Cptr = Mptr( C, An, 0, Cld, size ); /* * Add contribution of previous blocks of rows of sub( B ) to part of the * current block of rows of sub( B ) */ mmadd( &kb, &Bnq0, negone, Cptr, &Cld, ALPHA, Bptr, &Bld ); /* * Solve updated and current part of block of rows of sub( B ) */ trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &kb, &Bnq0, one, Mptr( Aptr, 0, Anq0-kb, Ald, size ), &Ald, Bptr, &Bld ); /* * Add contribution of part of the current block of rows of sub( B ) to the * remaining of the contribution of previous blocks of rows of sub( B ). Send * this remaining part to next process row. */ if( An > 0 ) { gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &An, &Bnq0, &kb, one, Aptr, &Ald, Bptr, &Bld, one, C, &Cld ); send( ctxt, An, Bnq0, C, Cld, Anxtrow, mycol ); } Amp0 -= kb; Bmp0 -= kb; } else if( myrow == Anxtrow ) { /* * Receive contribution of previous blocks of rows of sub( B ) to be added to * next block of rows of sub( B ) */ if( An > 0 ) recv( ctxt, An, Bnq0, C, Cld, Acurrow, mycol ); } Anq0 -= kb; Acurrow = Anxtrow; Anxtrow = MModSub1( Acurrow, nprow ); k -= 1; } } } else { kblks = ( An > Ainb1 ? ( An - Ainb1 - 1 ) / Anb + 2 : 1 ); if( upper ) { k = kblks; Acurcol = PB_Cindxg2p( An-1, Ainb1, Anb, Acol, Acol, npcol ); Anxtcol = MModSub1( Acurcol, npcol ); while( k > 0 ) { kb = ( k == 1 ? Ainb1 : ( k == kblks ? Alnb1 : Anb ) ); An -= kb; if( mycol == Acurcol ) { Aptr = Mptr( Aptr0, 0, Anq0 - kb, Ald, size ); Bptr = Mptr( Bptr0, 0, Bnq0 - kb, Bld, size ); Cptr = Mptr( C, 0, An, Cld, size ); /* * Add contribution of previous blocks of columns of sub( B ) to part of the * current block of columns of sub( B ) */ mmadd( &Bmp0, &kb, negone, Cptr, &Cld, ALPHA, Bptr, &Bld ); /* * Solve updated and current part of block of columns of sub( B ) */ trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &Bmp0, &kb, one, Mptr( Aptr, Amp0-kb, 0, Ald, size ), &Ald, Bptr, &Bld ); /* * Add contribution of part of the current block of columns of sub( B ) to the * remaining of the contribution of previous blocks of columns of sub( B ). * Send this remaining part to next process column. */ if( An > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &An, &kb, one, Bptr, &Bld, Aptr, &Ald, one, C, &Cld ); send( ctxt, Bmp0, An, C, Cld, myrow, Anxtcol ); } Anq0 -= kb; Bnq0 -= kb; } else if( mycol == Anxtcol ) { /* * Receive contribution of previous blocks of columns of sub( B ) to be added * to next block of columns of sub( B ) */ if( An > 0 ) recv( ctxt, Bmp0, An, C, Cld, myrow, Acurcol ); } Amp0 -= kb; Acurcol = Anxtcol; Anxtcol = MModSub1( Acurcol, npcol ); k -= 1; } } else { Acurcol = Acol; Anxtcol = MModAdd1( Acurcol, npcol ); Aptr = Aptr0; Bptr = Bptr0; Cptr = C; while( k <= kblks ) { kb = ( k == 1 ? Ainb1 : ( k == kblks ? Alnb1 : Anb ) ); An -= kb; if( mycol == Acurcol ) { /* * Add contribution of previous blocks of columns of sub( B ) to part of the * current block of columns of sub( B ) */ mmadd( &Bmp0, &kb, negone, Cptr, &Cld, ALPHA, Bptr, &Bld ); /* * Solve updated and current part of block of columns of sub( B ) */ trsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ), &Bmp0, &kb, one, Aptr, &Ald, Bptr, &Bld ); /* * Add contribution of part of the current block of columns of sub( B ) to the * remaining of the contribution of previous blocks of columns of sub( B ). * Send this remaining part to next process column. */ if( An > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &An, &kb, one, Bptr, &Bld, Mptr( Aptr, kb, 0, Ald, size ), &Ald, one, Mptr( Cptr, 0, kb, Cld, size ), &Cld ); send( ctxt, Bmp0, An, Mptr( Cptr, 0, kb, Cld, size ), Cld, myrow, Anxtcol ); } Aptr = Mptr( Aptr, 0, kb, Ald, size ); Bptr = Mptr( Bptr, 0, kb, Bld, size ); Cptr = C; } else if( mycol == Anxtcol ) { /* * Receive contribution of previous blocks of columns of sub( B ) to be added * to next block of columns of sub( B ). */ if( An > 0 ) recv( ctxt, Bmp0, An, Cptr, Cld, myrow, Acurcol ); } Aptr = Mptr( Aptr, kb, 0, Ald, size ); Acurcol = Anxtcol; Anxtcol = MModAdd1( Acurcol, npcol ); k += 1; } } } /* * End of PB_CptrsmAB1 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CptrsmB.c000644 000766 000024 00000102334 10363532303 020321 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CptrsmB( PBTYP_T * TYPE, char * DIRECB, char * SIDE, char * UPLO, char * TRANSA, char * DIAG, int M, int N, char * ALPHA, char * A, int IA, int JA, int * DESCA, char * B, int IB, int JB, int * DESCB ) #else void PB_CptrsmB( TYPE, DIRECB, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB ) /* * .. Scalar Arguments .. */ char * DIAG, * DIRECB, * SIDE, * TRANSA, * UPLO; int IA, IB, JA, JB, M, N; char * ALPHA; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * B; #endif { /* * Purpose * ======= * * PB_CptrsmB solves one of the matrix equations * * op( sub( A ) )*X = alpha*sub( B ), or * * X*op( sub( A ) ) = alpha*sub( B ), * * where * * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L', * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and, * * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1). * * Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is * a unit, or non-unit, upper or lower triangular submatrix and op( Y ) * is one of * * op( Y ) = Y or op( Y ) = Y' or op( Y ) = conjg( Y' ). * * The submatrix X is overwritten on sub( B ). * * This is the inner-product algorithm using the logical LCM hybrid * and static blocking techniques. The submatrix operand sub( A ) stays * in place. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECB (global input) pointer to CHAR * On entry, DIRECB specifies the direction in which the rows * or columns of sub( B ) should be looped over as follows: * DIRECB = 'F' or 'f' forward or increasing, * DIRECB = 'B' or 'b' backward or decreasing. * * SIDE (global input) pointer to CHAR * On entry, SIDE specifies whether op( sub( A ) ) appears on * the left or right of X as follows: * * SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ), * * SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ). * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANSA (global input) pointer to CHAR * On entry, TRANSA specifies the form of op( sub( A ) ) to be * used in the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ), * * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )', * * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ). * * DIAG (global input) pointer to CHAR * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( B ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( B ). N must be at least zero. * * ALPHA (global input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. When ALPHA is * supplied as zero then the local entries of the array B * corresponding to the entries of the submatrix sub( B ) need * not be set on input. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array * contains the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains the * local entries corresponding to the entries of the upper tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly lower triangular part of * the submatrix sub( A ) are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains the * local entries corresponding to the entries of the lower tri- * angular submatrix sub( A ), and the local entries correspon- * ding to the entries of the strictly upper triangular part of * the submatrix sub( A ) are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * B (local input/local output) pointer to CHAR * On entry, B is an array of dimension (LLD_B, Kb), where Kb is * at least Lc( 1, JB+N-1 ). Before entry, this array contains * the local entries of the matrix B. * On exit, the local entries of this array corresponding to the * to the entries of the submatrix sub( B ) are overwritten by * the local entries of the m by n solution submatrix. * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char Broc, TranOp, conjg, * negone, * one, * talpha, * talph0, top, * zero; int Acol, Aii, Aimb1, Ainb1, Ajj, Akp, Akq, Alcmb, Ald, Amb, An, Anb, Anp, Anp0, Anq, Anq0, Arow, Asrc, Astart, BcurrocR, Bfwd, BiiD, BiiR, Binb1D, Binb1R, BisR, Bld, BmyprocD, BmyprocR, BnD, BnR, BnbD, BnbR, BnpR, BnprocsD, BnprocsR, BrocD, BrocR, BsrcR, LNorRT, WBCfr, WBCld, WBCapbX, WBCsum, WBRfr, WBRld, WBRapbX, WBRsum, ctxt, izero=0, k, kb, kbnext, kbprev, ktmp, lside, mycol, myrow, n, nb, nbb, notran, npcol, nprow, p=0, size, tmp, upper; TZPAD_T pad; GEMM_T gemm; GSUM2D_T gsum2d; /* * .. Local Arrays .. */ int Ad0[DLEN_], DBUFB[DLEN_], WBCd[DLEN_], WBRd[DLEN_]; char * Aptr = NULL, * Bptr = NULL, * WBC = NULL, * WBR = NULL; /* .. * .. Executable Statements .. * */ /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD ); lside = ( Mupcase( SIDE [0] ) == CLEFT ); upper = ( Mupcase( UPLO [0] ) == CUPPER ); notran = ( ( TranOp = Mupcase( TRANSA[0] ) ) == CNOTRAN ); LNorRT = ( lside && notran ) || ( !( lside ) && !( notran ) ); size = TYPE->size; one = TYPE->one; zero = TYPE->zero; negone = TYPE->negone; pad = TYPE->Ftzpad; gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d; nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) ); /* * Compute local information for sub( A ) and sub( B ) */ if( lside ) { BnD = An = M; BnR = N; Broc = CCOLUMN; BmyprocD = myrow; BnprocsD = nprow; BmyprocR = mycol; BnprocsR = npcol; BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ]; BsrcR = DESCB[CSRC_]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR, &BiiD, &BiiR, &BrocD, &BrocR ); Binb1D = PB_Cfirstnb( BnD, IB, DESCB[IMB_], BnbD ); Binb1R = PB_Cfirstnb( BnR, JB, DESCB[INB_], BnbR ); } else { BnD = An = N; BnR = M; Broc = CROW; BmyprocD = mycol; BnprocsD = npcol; BmyprocR = myrow; BnprocsR = nprow; BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ]; BsrcR = DESCB[RSRC_]; Bld = DESCB[LLD_]; PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD, &BiiR, &BiiD, &BrocR, &BrocD ); Binb1D = PB_Cfirstnb( BnD, JB, DESCB[INB_], BnbD ); Binb1R = PB_Cfirstnb( BnR, IB, DESCB[IMB_], BnbR ); } /* * Compute descriptor Ad0 for sub( A ) */ PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); /* * Compute conjugate of alpha for the conjugate transpose cases */ if( TranOp == CCOTRAN ) { conjg = CCONJG; talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha ); } else { conjg = CNOCONJG; talpha = ALPHA; } /* * Retrieve BLACS combine topology, select backward ot forward substitution. */ if( LNorRT ) { top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); Astart = ( upper ? An - 1 : 0 ); } else { top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); Astart = ( upper ? 0 : An - 1 ); } /* * Computational partitioning size is computed as the product of the logical * value returned by pilaenv_ and 2 * lcm( nprow, npcol ). */ Alcmb = 2 * nb * PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); /* * When sub( B ) is not replicated and backward pass on sub( B ), find the * virtual process p owning the last row or column of sub( B ). */ if( !( BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) ) ) && !( Bfwd ) ) { tmp = PB_Cindxg2p( BnR - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR ); p = MModSub( tmp, BrocR, BnprocsR ); } /* * Loop over the processes rows or columns owning the BnR rows or columns of * sub( B ) to be processed. */ n = BnR; while( n > 0 ) { /* * Find out who is the active process row or column as well as the number of * rows or columns of sub( B ) it owns. */ BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) ); BnpR = PB_Cnumroc( BnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR ); n -= BnpR; /* * Re-adjust the number of rows or columns to be handled at each step, in order * to average the message sizes and the computational granularity. */ if( BnpR ) nbb = BnpR / ( ( BnpR - 1 ) / nb + 1 ); while( BnpR ) { nbb = MIN( nbb, BnpR ); /* * Describe the local contiguous panel of sub( B ) */ if( lside ) { PB_Cdescset( DBUFB, BnD, nbb, Binb1D, nbb, BnbD, BnbR, BrocD, BcurrocR, ctxt, Bld ); if( BisR || ( BmyprocR == BcurrocR ) ) Bptr = Mptr( B, BiiD, BiiR, Bld, size ); } else { PB_Cdescset( DBUFB, nbb, BnD, nbb, Binb1D, BnbR, BnbD, BcurrocR, BrocD, ctxt, Bld ); if( BisR || ( BmyprocR == BcurrocR ) ) Bptr = Mptr( B, BiiR, BiiD, Bld, size ); } talph0 = talpha; if( LNorRT ) { /* * Reuse sub( B ) and/or create vector WBC in process column owning the first * or last column of sub( A ) */ PB_CInOutV2( TYPE, &conjg, COLUMN, An, An, Astart, Ad0, nbb, Bptr, 0, 0, DBUFB, &Broc, &WBC, WBCd, &WBCfr, &WBCsum, &WBCapbX ); /* * Create WBR in process rows spanned by sub( A ) */ PB_COutV( TYPE, ROW, INIT, An, An, Ad0, nbb, &WBR, WBRd, &WBRfr, &WBRsum ); /* * Retrieve local quantities related to sub( A ) -> Ad0 */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( A, Aii, Ajj, Ald, size ); WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_]; if( upper ) { /* * sub( A ) is upper triangular */ for( k = ( Astart / Alcmb ) * Alcmb; k >= 0; k -= Alcmb ) { ktmp = An - k; kb = MIN( ktmp, Alcmb ); /* * Solve logical diagonal block, WBC contains the solution scattered in multiple * process columns and WBR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsm( TYPE, WBRsum, SIDE, UPLO, TRANSA, DIAG, kb, nbb, talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ), WBRld ); /* * Update: only the part of sub( B ) to be solved at the next step is locally * updated and combined, the remaining part of the matrix to be solved later * is only locally updated. */ if( Akp > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( WBCsum ) { kbprev = MIN( k, Alcmb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow, Arow, nprow ); Akp -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &ktmp, &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld, talph0, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld ); Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol ); gsum2d( ctxt, ROW, &top, ktmp, nbb, Mptr( WBC, Akp, 0, WBCld, size ), WBCld, myrow, Asrc ); if( mycol != Asrc ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &ktmp, &nbb, &izero, zero, zero, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld ); } if( ( Akp > 0 ) && ( Anq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Akp, &nbb, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld, talph0, WBC, &WBCld ); } else { if( Anq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Akp, &nbb, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld, talph0, WBC, &WBCld ); } } talph0 = one; } } else { /* * sub( A ) is lower triangular */ for( k = 0; k < An; k += Alcmb ) { ktmp = An - k; kb = MIN( ktmp, Alcmb ); /* * Solve logical diagonal block, WBC contains the solution scattered in multiple * process columns and WBR contains the solution replicated in the process rows. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsm( TYPE, WBRsum, SIDE, UPLO, TRANSA, DIAG, kb, nbb, talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ), WBRld ); /* * Update: only the part of sub( B ) to be solved at the next step is locally * updated and combined, the remaining part of the matrix to be solved later is * only locally updated. */ Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow ); if( ( Anp0 = Anp - Akp ) > 0 ) { Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); if( WBCsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, Alcmb ); ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow, nprow ); Anp0 -= ktmp; if( ktmp > 0 ) { if( Anq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &ktmp, &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld, talph0, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld ); Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol ); gsum2d( ctxt, ROW, &top, ktmp, nbb, Mptr( WBC, Akp, 0, WBCld, size ), WBCld, myrow, Asrc ); if( mycol != Asrc ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &ktmp, &nbb, &izero, zero, zero, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld ); } if( ( Anp0 > 0 ) && ( Anq0 > 0 ) ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Anp0, &nbb, &Anq0, negone, Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld, talph0, Mptr( WBC, Akp+ktmp, 0, WBCld, size ), &WBCld ); } else { if( Anq0 > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Anp0, &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld, talph0, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld ); } } talph0 = one; } } /* * Combine the scattered resulting matrix WBC */ if( WBCsum && ( Anp > 0 ) ) gsum2d( ctxt, ROW, &top, Anp, nbb, WBC, WBCld, myrow, WBCd[CSRC_] ); /* * sub( B ) := WBC (if necessary) */ if( WBCapbX ) PB_Cpaxpby( TYPE, &conjg, An, nbb, one, WBC, 0, 0, WBCd, COLUMN, zero, Bptr, 0, 0, DBUFB, &Broc ); } else { /* * Reuse sub( B ) and/or create vector WBR in process row owning the first or * last row of sub( A ) */ PB_CInOutV2( TYPE, &conjg, ROW, An, An, Astart, Ad0, nbb, Bptr, 0, 0, DBUFB, &Broc, &WBR, WBRd, &WBRfr, &WBRsum, &WBRapbX ); /* * Create WBC in process columns spanned by sub( A ) */ PB_COutV( TYPE, COLUMN, INIT, An, An, Ad0, nbb, &WBC, WBCd, &WBCfr, &WBCsum ); /* * Retrieve local quantities related to Ad0 -> sub( A ) */ Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; Anp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow ); Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( A, Aii, Ajj, Ald, size ); WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_]; if( upper ) { /* * sub( A ) is upper triangular */ for( k = 0; k < An; k += Alcmb ) { ktmp = An - k; kb = MIN( ktmp, Alcmb ); /* * Solve logical diagonal block, WBR contains the solution scattered in multiple * process rows and WBC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsm( TYPE, WBCsum, SIDE, UPLO, TRANSA, DIAG, nbb, kb, talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ), WBRld ); /* * Update: only the part of sub( B ) to be solved at the next step is locally * updated and combined, the remaining part of the matrix to be solved later is * only locally updated. */ Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol ); if( ( Anq0 = Anq - Akq ) > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( WBRsum ) { kbnext = ktmp - kb; kbnext = MIN( kbnext, Alcmb ); ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol, npcol ); Anq0 -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb, &ktmp, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, talph0, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld ); Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow ); gsum2d( ctxt, COLUMN, &top, nbb, ktmp, Mptr( WBR, 0, Akq, WBRld, size ), WBRld, Asrc, mycol ); if( myrow != Asrc ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &nbb, &ktmp, &izero, zero, zero, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld ); } if( ( Anp0 > 0 ) && ( Anq0 > 0 ) ) gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb, &Anq0, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald, talph0, Mptr( WBR, 0, Akq+ktmp, WBRld, size ), &WBRld ); } else { if( Anp0 > 0 ) gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb, &Anq0, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, talph0, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld ); } } talph0 = one; } } else { /* * sub( A ) is lower triangular */ for( k = ( Astart / Alcmb ) * Alcmb; k >= 0; k -= Alcmb ) { ktmp = An - k; kb = MIN( ktmp, Alcmb ); /* * Solve logical diagonal block, WBR contains the solution scattered in multiple * process rows and WBC contains the solution replicated in the process columns. */ Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); PB_Cptrsm( TYPE, WBCsum, SIDE, UPLO, TRANSA, DIAG, nbb, kb, talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld, size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ), WBRld ); /* * Update: only the part of sub( B ) to be solved at the next step is locally * updated and combined, the remaining part of the matrix to be solved later * is only locally updated. */ if( Akq > 0 ) { Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); if( WBRsum ) { kbprev = MIN( k, Alcmb ); ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol, Acol, npcol ); Akq -= ktmp; if( ktmp > 0 ) { if( Anp0 > 0 ) gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb, &ktmp, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, talph0, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld ); Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow ); gsum2d( ctxt, COLUMN, &top, nbb, ktmp, Mptr( WBR, 0, Akq, WBRld, size ), WBRld, Asrc, mycol ); if( myrow != Asrc ) pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &nbb, &ktmp, &izero, zero, zero, Mptr( WBR, 0, Akq, WBRld, size ), &WBRld ); } if( ( Anp0 > 0 ) && ( Akq > 0 ) ) gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb, &Akq, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, talph0, WBR, &WBRld ); } else { if( Anp0 > 0 ) gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb, &Akq, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld, size ), &WBCld, Mptr( Aptr, Akp, 0, Ald, size ), &Ald, talph0, WBR, &WBRld ); } } talph0 = one; } } /* * Combine the scattered resulting matrix WBR */ if( WBRsum && ( Anq > 0 ) ) gsum2d( ctxt, COLUMN, &top, nbb, Anq, WBR, WBRld, WBRd[RSRC_], mycol ); /* * sub( B ) := WBR (if necessary) */ if( WBRapbX ) PB_Cpaxpby( TYPE, &conjg, nbb, An, one, WBR, 0, 0, WBRd, ROW, zero, Bptr, 0, 0, DBUFB, &Broc ); } if( WBCfr ) free( WBC ); if( WBRfr ) free( WBR ); /* * Go to the next contiguous panel if any residing in this process row or column */ BnpR -= nbb; if( BisR || ( BmyprocR == BcurrocR ) ) BiiR += nbb; } /* * Go to next or previous process row or column owning some of sub( B ) */ if( !( BisR ) ) p = ( Bfwd ? MModAdd1( p, BnprocsR ) : MModSub1( p, BnprocsR ) ); } if( TranOp == CCOTRAN ) free( talpha ); /* * End of PB_CptrsmB */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cptrsv.c000644 000766 000024 00000077603 10363532303 020242 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Cptrsv( PBTYP_T * TYPE, int FBCAST, char * UPLO, char * TRANS, char * DIAG, int N, char * A, int IA, int JA, int * DESCA, char * XC, int INCXC, char * XR, int INCXR ) #else void PB_Cptrsv( TYPE, FBCAST, UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, XC, INCXC, XR, INCXR ) /* * .. Scalar Arguments .. */ char * DIAG, * TRANS, * UPLO; int FBCAST, IA, INCXC, INCXR, JA, N; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA; char * A, * XC, * XR; #endif { /* * Purpose * ======= * * PB_Cptrsv solves one of the systems of equations * * sub( A )*X = b, or sub( A )'*X = b, or conjg( sub( A )' )*X = b, * * where sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1). * * b and X are n element subvectors and sub( A ) is an n by n unit, or * non-unit, upper or lower triangular submatrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * FBCAST (global input) INTEGER * On entry, FBCAST specifies whether the transposed of the vec- * tor solution should be broadcast or not when there is a pos- * sible ambiguity, i.e. when sub( A ) is just one block. When * FBCAST is zero, the solution vector is not broadcast, and the * the solution vector is broadcast otherwise. * * UPLO (global input) pointer to CHAR * On entry, UPLO specifies whether the submatrix sub( A ) is * an upper or lower triangular submatrix as follows: * * UPLO = 'U' or 'u' sub( A ) is an upper triangular * submatrix, * * UPLO = 'L' or 'l' sub( A ) is a lower triangular * submatrix. * * TRANS (global input) pointer to CHAR * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' sub( A ) * X = b, * * TRANS = 'T' or 't' sub( A )' * X = b, * * TRANS = 'C' or 'c' conjg( sub( A )' ) * X = b. * * DIAG (global input) pointer to CHAR * On entry, DIAG specifies whether or not sub( A ) is unit * triangular as follows: * * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- * gular, * * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- * angular. * * N (global input) INTEGER * On entry, N specifies the order of the submatrix sub( A ). * N must be at least zero. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where Ka is * at least Lc( 0, JA+N-1 ). Before entry, this array contains * the local entries of the matrix A. * Before entry with UPLO = 'U' or 'u', this array contains * the local entries corresponding to the upper triangular part * of the triangular submatrix sub( A ), and the local entries * corresponding to the strictly lower triangular of sub( A ) * are not referenced. * Before entry with UPLO = 'L' or 'l', this array contains * the local entries corresponding to the lower triangular part * of the triangular submatrix sub( A ), and the local entries * corresponding to the strictly upper triangular of sub( A ) * are not referenced. * Note that when DIAG = 'U' or 'u', the local entries corres- * ponding to the diagonal elements of the submatrix sub( A ) * are not referenced either, but are assumed to be unity. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * XC (local input/local output) pointer to CHAR * On entry, XC is an array of dimension (LLD_X,Kx), where Kx is * at least 1 and LLD_X is at least Lr( IA, N ). Before entry, * when TRANS is 'N' or 'n' this array contains the local en- * tries of the right-hand-side vector b. When TRANS is not 'N' * or 'n', the entries of XC should be zero. On exit, this array * contains the partial solution vector x. * * INCXC (local input) INTEGER * On entry, INCXC specifies the local increment of the vector * XC. * * XR (local input/local output) pointer to CHAR * On entry, XR is an array of dimension (LLD_X,Kx), where Kx is * least Lc( JA, N ) and LLD_X at least 1. Before entry, when * TRANS is 'N' or 'n' the entries of XR should be zero. Other- * wise this array contains the local entries of the right-hand- * side vector b. On exit, this array contains the partial so- * lution vector x. * * INCXR (local input) INTEGER * On entry, INCXR specifies the local increment of the vector * XR. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char btop, * negone, * one, * zero; int Acol, Aii, Aimb1, Ainb1, Ais1Col, Ais1Row, AisColRep, AisRowRep, Ajj, Alcol, Ald, Alrow, Amb, Anpprev, Anb, Anp, Anq, Arow, Asrc, ChangeRoc=0, bcst, ctxt, ione=1, k=0, kb, kbprev=0, kbsize, mb1, mycol, myrow, n1, n1last, n1p, n1pprev=0, nb1, nlast, npcol, nprow, rocprev, size, tmp1, tmp2; AXPY_T axpy; COPY_T copy; VVSET_T set; GEMV_T gemv; TRSV_T trsv; GESD2D_T send; GERV2D_T recv; GEBS2D_T bsend; GEBR2D_T brecv; /* * .. Local Arrays .. */ char * Aprev = NULL, * Xd = NULL, * Xdprev = NULL, * Xprev = NULL, * work = NULL; /* .. * .. Executable Statements .. * */ if( N <= 0 ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); /* * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ... */ PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow, &Acol ); /* * Determine if sub( A ) spans more than one process row, and/or more than one * process column. */ Amb = DESCA[MB_]; Anb = DESCA[NB_]; Ald = DESCA[LLD_ ]; Aimb1 = PB_Cfirstnb( N, IA, DESCA[IMB_], Amb ); Anp = PB_Cnumroc( N, 0, Aimb1, Amb, myrow, Arow, nprow ); Ais1Row = !( PB_Cspan( N, 0, Aimb1, Amb, Arow, nprow ) ); Ainb1 = PB_Cfirstnb( N, JA, DESCA[INB_], Anb ); Anq = PB_Cnumroc( N, 0, Ainb1, Anb, mycol, Acol, npcol ); Ais1Col = !( PB_Cspan( N, 0, Ainb1, Anb, Acol, npcol ) ); /* * When sub( A ) spans only one process, solve the system locally and return. */ if( Ais1Row && Ais1Col ) { if( Mupcase( TRANS[0] ) == CNOTRAN ) { if( Anq > 0 ) { if( Anp > 0 ) { TYPE->Ftrsv( C2F_CHAR( UPLO ), C2F_CHAR( TRANS ), C2F_CHAR( DIAG ), &N, Mptr( A, Aii, Ajj, Ald, TYPE->size ), &Ald, XC, &INCXC ); TYPE->Fcopy( &Anp, XC, &INCXC, XR, &INCXR ); } if( ( Arow >= 0 ) && FBCAST ) { btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); if( myrow == Arow ) TYPE->Cgebs2d( ctxt, COLUMN, &btop, 1, Anq, XR, INCXR ); else TYPE->Cgebr2d( ctxt, COLUMN, &btop, 1, Anq, XR, INCXR, Arow, mycol ); } } } else { if( Anp > 0 ) { if( Anq > 0 ) { TYPE->Ftrsv( C2F_CHAR( UPLO ), C2F_CHAR( TRANS ), C2F_CHAR( DIAG ), &N, Mptr( A, Aii, Ajj, Ald, TYPE->size ), &Ald, XR, &INCXR ); TYPE->Fcopy( &Anq, XR, &INCXR, XC, &INCXC ); } if( Acol >= 0 && FBCAST ) { btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); if( mycol == Acol ) TYPE->Cgebs2d( ctxt, ROW, &btop, Anp, 1, XC, Anp ); else TYPE->Cgebr2d( ctxt, ROW, &btop, Anp, 1, XC, Anp, myrow, Acol ); } } } return; } /* * Retrieve from TYPE structure useful BLAS and BLACS functions. */ size = TYPE->size; negone = TYPE->negone; one = TYPE->one; zero = TYPE->zero; axpy = TYPE->Faxpy; copy = TYPE->Fcopy; set = TYPE->Fset; gemv = TYPE->Fgemv; trsv = TYPE->Ftrsv; send = TYPE->Cgesd2d; recv = TYPE->Cgerv2d; bsend = TYPE->Cgebs2d; brecv = TYPE->Cgebr2d; if( ( Anp > 0 ) && ( Anq > 0 ) ) A = Mptr( A, Aii, Ajj, Ald, size ); if( Mupcase( TRANS[0] ) == CNOTRAN ) { if( ( Anq <= 0 ) || ( Ais1Row && ( ( Arow >= 0 ) && !( FBCAST ) && ( myrow != Arow ) ) ) ) return; btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); bcst = ( ( !Ais1Row ) || ( Ais1Row && ( Arow >= 0 ) && FBCAST ) ); AisRowRep = ( ( Arow < 0 ) || ( nprow == 1 ) ); if( Mupcase( UPLO[0] ) == CUPPER ) { /* * Initiate lookahead */ nlast = ( npcol - 1 ) * Anb; n1 = MAX( nlast, Anb ); nlast += Ainb1; n1last = n1 - Anb + MAX( Ainb1, Anb ); work = PB_Cmalloc( MIN( n1last, Anp ) * size ); tmp1 = N-1; Alrow = PB_Cindxg2p( tmp1, Aimb1, Amb, Arow, Arow, nprow ); Alcol = PB_Cindxg2p( tmp1, Ainb1, Anb, Acol, Acol, npcol ); rocprev = Alcol; Anpprev = Anp; Xprev = XC; Xdprev = XR; Aprev = A = Mptr( A, 0, Anq, Ald, size ); mb1 = PB_Clastnb( N, 0, Aimb1, Amb ); nb1 = PB_Clastnb( N, 0, Ainb1, Anb ); tmp1 = N - ( kb = MIN( mb1, nb1 ) ); n1 = ( ( Ais1Col || ( N - nb1 < nlast ) ) ? n1last : n1 ); tmp2 = n1 + nb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) ); Asrc = Arow; n1p = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Aimb1, Amb, myrow, Asrc, nprow ); while( N > 0 ) { kbsize = kb * size; if( Ais1Col || ( mycol == Alcol ) ) { A -= Ald * kbsize; Anq -= kb; Xd = Mptr( XR, 0, Anq, INCXR, size ); } if( ( Arow < 0 ) || ( myrow == Alrow ) ) { Anp -= kb; } /* * Partial update of previous block */ if( n1pprev > 0 ) { if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) ) { tmp1 = ( Anpprev - n1pprev ) * size; gemv( C2F_CHAR( TRANS ), &n1pprev, &kbprev, negone, Aprev+tmp1, &Ald, Xdprev, &INCXR, one, Xprev+tmp1, &INCXC ); } /* * Send partial updated result to current column */ if( !( Ais1Col ) && ChangeRoc ) { if( mycol == rocprev ) { send( ctxt, n1pprev, 1, Xprev+(Anpprev-n1pprev)*size, n1pprev, myrow, Alcol ); } else if( mycol == Alcol ) { recv( ctxt, n1pprev, 1, work, n1pprev, myrow, rocprev ); axpy( &n1pprev, one, work, &ione, Mptr( Xprev, Anpprev-n1pprev, 0, INCXC, size ), &INCXC ); } } } /* * Solve current diagonal block */ if( Ais1Col || ( mycol == Alcol ) ) { if( AisRowRep || ( myrow == Alrow ) ) { trsv( C2F_CHAR( UPLO ), C2F_CHAR( TRANS ), C2F_CHAR( DIAG ), &kb, Mptr( A, Anp, 0, Ald, size ), &Ald, Mptr( XC, Anp, 0, INCXC, size ), &INCXC ); copy( &kb, Mptr( XC, Anp, 0, INCXC, size ), &INCXC, Mptr( XR, 0, Anq, INCXR, size ), &INCXR ); } if( bcst ) { if( myrow == Alrow ) bsend( ctxt, COLUMN, &btop, 1, kb, Mptr( XR, 0, Anq, INCXR, size ), INCXR ); else brecv( ctxt, COLUMN, &btop, 1, kb, Mptr( XR, 0, Anq, INCXR, size ), INCXR, Alrow, mycol ); } } else { if( !( Ais1Col ) && ( AisRowRep || ( myrow == Alrow ) ) ) set( &kb, zero, Mptr( XC, Anp, 0, INCXC, size ), &ione ); } /* * Finish previous update */ if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) && ( ( tmp1 = Anpprev - n1pprev ) > 0 ) ) gemv( C2F_CHAR( TRANS ), &tmp1, &kbprev, negone, Aprev, &Ald, Xdprev, &INCXR, one, Xprev, &INCXC ); /* * Save info of current step and update info for the next step */ if( Ais1Col || ( mycol == Alcol ) ) { Xdprev = Xd; Aprev = A; } if( AisRowRep || ( myrow == Alrow ) ) { Anpprev -= kb; } n1pprev = n1p; rocprev = Alcol; kbprev = kb; k += kb; N -= kb; mb1 -= kb; if( mb1 == 0 ) { if( !( Ais1Row ) && ( Alrow >= 0 ) ) Alrow = MModSub1( Alrow, nprow ); mb1 = ( N > Aimb1 ? Amb : Aimb1 ); } nb1 -= kb; ChangeRoc = ( nb1 == 0 ); if( ChangeRoc ) { if( !( Ais1Col ) && ( Alcol >= 0 ) ) Alcol = MModSub1( Alcol, npcol ); nb1 = ( N > Ainb1 ? Anb : Ainb1 ); } tmp1 = N - ( kb = MIN( mb1, nb1 ) ); n1 = ( ( Ais1Col || ( N - nb1 < nlast ) ) ? n1last : n1 ); tmp2 = n1 + nb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) ); n1p = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Aimb1, Amb, myrow, Asrc, nprow ); } } else { /* * Initiate lookahead */ n1 = ( MAX( npcol, 2 ) - 1 ) * Anb; work = PB_Cmalloc( MIN( n1, Anp ) * size ); Aprev = A; Xprev = XC; Xdprev = XR; Anpprev = Anp; mb1 = Aimb1; nb1 = Ainb1; rocprev = Acol; tmp1 = N - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + nb1 - kb; Asrc = Arow; n1p = PB_Cnumroc( MIN( tmp1, tmp2 ), kb, Aimb1, Amb, myrow, Asrc, nprow ); while( kb > 0 ) { kbsize = kb * size; /* * Partial update of previous block */ if( n1pprev > 0 ) { if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) ) gemv( C2F_CHAR( TRANS ), &n1pprev, &kbprev, negone, Aprev, &Ald, Xdprev, &INCXR, one, Xprev, &INCXC ); /* * Send partial updated result to current column */ if( !( Ais1Col ) && ChangeRoc ) { if( mycol == rocprev ) { send( ctxt, n1pprev, 1, Xprev, n1pprev, myrow, Acol ); } else if( mycol == Acol ) { recv( ctxt, n1pprev, 1, work, n1pprev, myrow, rocprev ); axpy( &n1pprev, one, work, &ione, Xprev, &INCXC ); } } } /* * Solve current diagonal block */ if( Ais1Col || ( mycol == Acol ) ) { if( AisRowRep || ( myrow == Arow ) ) { trsv( C2F_CHAR( UPLO ), C2F_CHAR( TRANS ), C2F_CHAR( DIAG ), &kb, A, &Ald, XC, &INCXC ); copy( &kb, XC, &INCXC, XR, &INCXR ); } if( bcst ) { if( myrow == Arow ) bsend( ctxt, COLUMN, &btop, 1, kb, XR, INCXR ); else brecv( ctxt, COLUMN, &btop, 1, kb, XR, INCXR, Arow, mycol ); } } else { if( !( Ais1Col ) && ( AisRowRep || ( myrow == Arow ) ) ) set( &kb, zero, XC, &INCXC ); } /* * Finish previous update */ if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) ) { if( ( tmp1 = Anpprev - n1pprev ) > 0 ) { tmp2 = n1pprev * size; gemv( C2F_CHAR( TRANS ), &tmp1, &kbprev, negone, Aprev+tmp2, &Ald, Xdprev, &INCXR, one, Xprev+tmp2, &INCXC ); } Aprev += Ald * kbprev * size; } /* * Save info of current step and update info for the next step */ if( Ais1Col || ( mycol == Acol ) ) { A += Ald*kbsize; Xdprev = Xd = XR; XR += INCXR*kbsize; } if( AisRowRep || ( myrow == Arow ) ) { Xprev = ( XC += kbsize ); A += kbsize; Aprev += kbsize; Anpprev = ( Anp -= kb ); } n1pprev = n1p; rocprev = Acol; kbprev = kb; k += kb; N -= kb; mb1 -= kb; if( mb1 == 0 ) { if( !( Ais1Row ) && ( Arow >= 0 ) ) Arow = MModAdd1( Arow, nprow ); mb1 = MIN( Amb, N ); } nb1 -= kb; ChangeRoc = ( nb1 == 0 ); if( ChangeRoc ) { if( !( Ais1Col ) && ( Acol >= 0 ) ) Acol = MModAdd1( Acol, npcol ); nb1 = MIN( Anb, N ); } tmp1 = N - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + nb1 - kb; n1p = PB_Cnumroc( MIN( tmp2, tmp1 ), k+kb, Aimb1, Amb, myrow, Asrc, nprow ); } } } else { if( ( Anp <= 0 ) || ( Ais1Col && ( ( Acol >= 0 ) && !( FBCAST ) && ( mycol != Acol ) ) ) ) return; btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); bcst = ( ( !Ais1Col ) || ( Ais1Col && ( Acol >= 0 ) && FBCAST ) ); AisColRep = ( ( Acol < 0 ) || ( npcol == 1 ) ); if( Mupcase( UPLO[0] ) == CUPPER ) { /* * Initiate lookahead */ n1 = ( MAX( nprow, 2 ) - 1 ) * Amb; work = PB_Cmalloc( MIN( n1, Anq ) * size ); Aprev = A; Xprev = XR; Xdprev = XC; Anpprev = Anq; mb1 = Aimb1; nb1 = Ainb1; rocprev = Arow; tmp1 = N - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + mb1 - kb; Asrc = Acol; n1p = PB_Cnumroc( MIN( tmp1, tmp2 ), kb, Ainb1, Anb, mycol, Asrc, npcol ); while( kb > 0 ) { kbsize = kb * size; /* * Partial update of previous block */ if( n1pprev > 0 ) { if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) ) gemv( C2F_CHAR( TRANS ), &kbprev, &n1pprev, negone, Aprev, &Ald, Xdprev, &INCXC, one, Xprev, &INCXR ); /* * Send partial updated result to current row */ if( !( Ais1Row ) && ChangeRoc ) { if( myrow == rocprev ) { send( ctxt, 1, n1pprev, Xprev, INCXR, Arow, mycol ); } else if( myrow == Arow ) { recv( ctxt, 1, n1pprev, work, 1, rocprev, mycol ); axpy( &n1pprev, one, work, &ione, Xprev, &INCXR ); } } } /* * Solve current diagonal block */ if( Ais1Row || ( myrow == Arow ) ) { if( AisColRep || ( mycol == Acol ) ) { trsv( C2F_CHAR( UPLO ), C2F_CHAR( TRANS ), C2F_CHAR( DIAG ), &kb, A, &Ald, XR, &INCXR ); copy( &kb, XR, &INCXR, XC, &INCXC ); } if( bcst ) { if( mycol == Acol ) bsend( ctxt, ROW, &btop, kb, 1, XC, kb ); else brecv( ctxt, ROW, &btop, kb, 1, XC, kb, myrow, Acol ); } } else { if( !( Ais1Row ) && ( AisColRep || ( mycol == Acol ) ) ) set( &kb, zero, XR, &INCXR ); } /* * Finish previous update */ if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) ) { if( ( tmp1 = Anpprev - n1pprev ) > 0 ) { tmp2 = n1pprev * size; gemv( C2F_CHAR( TRANS ), &kbprev, &tmp1, negone, Aprev+Ald*tmp2, &Ald, Xdprev, &INCXC, one, Xprev+INCXR*tmp2, &INCXR ); } Aprev += kbprev * size; } /* * Save info of current step and update info for the next step */ if( Ais1Row || ( myrow == Arow ) ) { A += kbsize; Xdprev = Xd = XC; XC += kbsize; } if( AisColRep || ( mycol == Acol ) ) { Xprev = ( XR += INCXR * kbsize ); A += Ald * kbsize; Anpprev = ( Anq -= kb ); Aprev += Ald * kbsize; } n1pprev = n1p; rocprev = Arow; kbprev = kb; k += kb; N -= kb; nb1 -= kb; if( nb1 == 0 ) { if( !( Ais1Col ) && ( Acol >= 0 ) ) Acol = MModAdd1( Acol, npcol ); nb1 = MIN( Anb, N ); } mb1 -= kb; ChangeRoc = ( mb1 == 0 ); if( ChangeRoc ) { if( !( Ais1Row ) && ( Arow >= 0 ) ) Arow = MModAdd1( Arow, nprow ); mb1 = MIN( Amb, N ); } tmp1 = N - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + mb1 - kb; n1p = PB_Cnumroc( MIN( tmp2, tmp1 ), k+kb, Ainb1, Anb, mycol, Asrc, npcol ); } } else { /* * Initiate lookahead */ nlast = ( nprow - 1 ) * Amb; n1 = MAX( nlast, Amb ); nlast += Aimb1; n1last = n1 - Amb + MAX( Aimb1, Amb ); work = PB_Cmalloc( MIN( n1last, Anq ) * size ); tmp1 = N-1; Alrow = PB_Cindxg2p( tmp1, Aimb1, Amb, Arow, Arow, nprow ); Alcol = PB_Cindxg2p( tmp1, Ainb1, Anb, Acol, Acol, npcol ); rocprev = Alrow; Anpprev = Anq; Xprev = XR; Xdprev = XC; Aprev = A = Mptr( A, Anp, 0, Ald, size ); mb1 = PB_Clastnb( N, 0, Aimb1, Amb ); nb1 = PB_Clastnb( N, 0, Ainb1, Anb ); tmp1 = N - ( kb = MIN( mb1, nb1 ) ); n1 = ( ( Ais1Row || ( N - mb1 < nlast ) ) ? n1last : n1 ); tmp2 = n1 + mb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) ); Asrc = Acol; n1p = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Ainb1, Anb, mycol, Asrc, npcol ); while( N > 0 ) { kbsize = kb * size; if( Ais1Row || ( myrow == Alrow ) ) { A -= kbsize; Anp -= kb; Xd = Mptr( XC, Anp, 0, INCXC, size ); } if( ( Acol < 0 ) || ( mycol == Alcol ) ) { Anq -= kb; } /* * Partial update of previous block */ if( n1pprev > 0 ) { if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) ) { tmp1 = ( Anpprev - n1pprev ) * size; gemv( C2F_CHAR( TRANS ), &kbprev, &n1pprev, negone, Aprev+Ald*tmp1, &Ald, Xdprev, &INCXC, one, Xprev+INCXR*tmp1, &INCXR ); } /* * Send partial updated result to current row */ if( !( Ais1Row ) && ChangeRoc ) { if( myrow == rocprev ) { send( ctxt, 1, n1pprev, Mptr( Xprev, 0, Anpprev-n1pprev, INCXR, size ), INCXR, Alrow, mycol ); } else if( myrow == Alrow ) { recv( ctxt, 1, n1pprev, work, 1, rocprev, mycol ); axpy( &n1pprev, one, work, &ione, Mptr( Xprev, 0, Anpprev-n1pprev, INCXR, size ), &INCXR ); } } } /* * Solve current diagonal block */ if( Ais1Row || ( myrow == Alrow ) ) { if( AisColRep || ( mycol == Alcol ) ) { trsv( C2F_CHAR( UPLO ), C2F_CHAR( TRANS ), C2F_CHAR( DIAG ), &kb, Mptr( A, 0, Anq, Ald, size ), &Ald, Mptr( XR, 0, Anq, INCXR, size ), &INCXR ); copy( &kb, Mptr( XR, 0, Anq, INCXR, size ), &INCXR, Mptr( XC, 0, Anp, INCXC, size ), &INCXC ); } if( bcst ) { if( mycol == Alcol ) bsend( ctxt, ROW, &btop, kb, 1, Mptr( XC, 0, Anp, INCXC, size ), kb ); else brecv( ctxt, ROW, &btop, kb, 1, Mptr( XC, 0, Anp, INCXC, size ), kb, myrow, Alcol ); } } else { if( !( Ais1Row ) && ( AisColRep || ( mycol == Alcol ) ) ) set( &kb, zero, Mptr( XR, 0, Anq, INCXR, size ), &INCXR ); } /* * Finish previous update */ if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) && ( ( tmp1 = Anpprev - n1pprev ) > 0 ) ) gemv( C2F_CHAR( TRANS ), &kbprev, &tmp1, negone, Aprev, &Ald, Xdprev, &INCXC, one, Xprev, &INCXR ); /* * Save info of current step and update info for the next step */ if( Ais1Row || ( myrow == Alrow ) ) { Xdprev = Xd; Aprev = A; } if( AisColRep || ( mycol == Alcol ) ) { Anpprev -= kb; } n1pprev = n1p; rocprev = Alrow; kbprev = kb; k += kb; N -= kb; nb1 -= kb; if( nb1 == 0 ) { if( !( Ais1Col ) && ( Alcol >= 0 ) ) Alcol = MModSub1( Alcol, npcol ); nb1 = ( N > Ainb1 ? Anb : Ainb1 ); } mb1 -= kb; ChangeRoc = ( mb1 == 0 ); if( ChangeRoc ) { if( !( Ais1Row ) && ( Alrow >= 0 ) ) Alrow = MModSub1( Alrow, nprow ); mb1 = ( N > Aimb1 ? Amb : Aimb1 ); } tmp1 = N - ( kb = MIN( mb1, nb1 ) ); n1 = ( ( Ais1Row || ( N - mb1 < nlast ) ) ? n1last : n1 ); tmp2 = n1 + mb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) ); n1p = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Ainb1, Anb, mycol, Asrc, npcol ); } } } if( work ) free( work ); /* * End of PB_Cptrsv */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CScatterV.c000644 000766 000024 00000056647 10363532303 020624 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CScatterV( PBTYP_T * TYPE, char * DIRECA, int M, int N, char * A, int IA, int JA, int * DESCA, char * AROC, char * ALPHA, char * B, int IB, int JB, int * DESCB, char * BROC ) #else void PB_CScatterV( TYPE, DIRECA, M, N, A, IA, JA, DESCA, AROC, ALPHA, B, IB, JB, DESCB, BROC ) /* * .. Scalar Arguments .. */ char * ALPHA, * AROC, * BROC, * DIRECA; int IA, IB, JA, JB, M, N; PBTYP_T * TYPE; /* * .. Array Arguments .. */ int * DESCA, * DESCB; char * A, * B; #endif { /* * Purpose * ======= * * PB_CScatterV disaggregates the one-dimensional submatrix sub( A ) de- * noting A( IA:IA+M-1, JA:JA+N-1 ) into a two-dimensional submatrix * sub( B ) denoting B( IB:IB+M-1, JB:JB+N-1 ) when AROC is equal to * BROC and B( IB:IB+N-1, JB:JB+M-1 ) otherwise: * * sub( B ) := alpha * sub( B ) + sub( A ). * * Notes * ===== * * A description vector is associated with each 2D block-cyclicly dis- * tributed matrix. This vector stores the information required to * establish the mapping between a matrix entry and its corresponding * process and memory location. * * In the following comments, the character _ should be read as * "of the distributed matrix". Let A be a generic term for any 2D * block cyclicly distributed matrix. Its description vector is DESC_A: * * NOTATION STORED IN EXPLANATION * ---------------- --------------- ------------------------------------ * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating * the NPROW x NPCOL BLACS process grid * A is distributed over. The context * itself is global, but the handle * (the integer value) may vary. * M_A (global) DESCA[ M_ ] The number of rows in the distribu- * ted matrix A, M_A >= 0. * N_A (global) DESCA[ N_ ] The number of columns in the distri- * buted matrix A, N_A >= 0. * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left * block of the matrix A, IMB_A > 0. * INB_A (global) DESCA[ INB_ ] The number of columns of the upper * left block of the matrix A, * INB_A > 0. * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- * bute the last M_A-IMB_A rows of A, * MB_A > 0. * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- * bute the last N_A-INB_A columns of * A, NB_A > 0. * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first * row of the matrix A is distributed, * NPROW > RSRC_A >= 0. * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the * first column of A is distributed. * NPCOL > CSRC_A >= 0. * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local * array storing the local blocks of * the distributed matrix A, * IF( Lc( 1, N_A ) > 0 ) * LLD_A >= MAX( 1, Lr( 1, M_A ) ) * ELSE * LLD_A >= 1. * * Let K be the number of rows of a matrix A starting at the global in- * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would * receive if these K rows were distributed over NPROW processes. If K * is the number of columns of a matrix A starting at the global index * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if * these K columns were distributed over NPCOL processes. * * The values of Lr() and Lc() may be determined via a call to the func- * tion PB_Cnumroc: * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * DIRECA (global input) pointer to CHAR * On entry, DIRECA specifies the direction in which the rows * or columns of sub( A ) should be disaggregated as follows: * DIRECA = 'F' or 'f' forward or increasing, * DIRECA = 'B' or 'b' backward or decreasing. * * M (global input) INTEGER * On entry, M specifies the number of rows of the submatrix * sub( A ). M must be at least zero. * * N (global input) INTEGER * On entry, N specifies the number of columns of the submatrix * sub( A ). N must be at least zero. * * A (local input) pointer to CHAR * On entry, A is an array of dimension (LLD_A, Ka), where LLD_A * is DESCA[LLD_], i.e. at least MAX( 1, Lr( M, IA ) ), and, * Ka is at least Lc( N, JA ). Before entry, this array contains * the local entries of the matrix A. * * IA (global input) INTEGER * On entry, IA specifies A's global row index, which points to * the beginning of the submatrix sub( A ). * * JA (global input) INTEGER * On entry, JA specifies A's global column index, which points * to the beginning of the submatrix sub( A ). * * DESCA (global and local input) INTEGER array * On entry, DESCA is an integer array of dimension DLEN_. This * is the array descriptor for the matrix A. * * AROC (global input) pointer to CHAR * On entry, AROC specifies the orientation of the submatrix * sub( A ). When AROC is 'R' or 'r', sub( A ) is a row matrix, * and a column matrix otherwise. * * ALPHA (local input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * B (local output) pointer to CHAR * On entry, A is an array of dimension (LLD_B, Kb), where LLD_B * is DESCB[LLD_], i.e. at least MAX( 1, Lr( M, IB ) ) when AROC * and BROC are equal, and MAX( 1, Lr( N, IB ) ) otherwise, and, * Kb is at least Lc( N, JB ) when AROC and BROC are equal, and * Lc( M, JB ) otherwise. On exit, this array contains the local * entries of the disaggregated submatrix sub( A ). * * IB (global input) INTEGER * On entry, IB specifies B's global row index, which points to * the beginning of the submatrix sub( B ). * * JB (global input) INTEGER * On entry, JB specifies B's global column index, which points * to the beginning of the submatrix sub( B ). * * DESCB (global and local input) INTEGER array * On entry, DESCB is an integer array of dimension DLEN_. This * is the array descriptor for the matrix B. * * BROC (global input) pointer to CHAR * On entry, BROC specifies the orientation of the submatrix * sub( B ). When BROC is 'R' or 'r', sub( B ) is a row matrix, * and a column matrix otherwise. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int Afwd, Bbufld, Bcol, Bcurcol, Bcurrow, Bii, Bimb, Bimb1, Binb, Binb1, BisRow, Bjj, Bld, Bm, Bmb, Bmp, Bn, Bnb, Bnnxt, BnnxtL, Bnpre, Bnq, Brow, WAfr, ctxt, kb, mycol, mydist, mydistnb, myrow, nlen, npcol, nprow, offset, size, srcdist, stride, stridenb, tmp; MMADD_T add; MMSHFT_T shft; /* * .. Local Arrays .. */ int Bd0[DLEN_], WAd[DLEN_]; char * Bptr = NULL, * Bbuf = NULL, * Bbufptr = NULL, * WA = NULL; /* .. * .. Executable Statements .. * */ /* * Quick return if possible */ if( ( M <= 0 ) || ( N <= 0 ) ) return; /* * Retrieve process grid information */ Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD ); BisRow = ( Mupcase( BROC [0] ) == CROW ); if( Mupcase( AROC[0] ) == Mupcase( BROC[0] ) ) { Bm = M; Bn = N; } else { Bm = N; Bn = M; } /* * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol ... */ PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow, &Bcol ); Bimb = DESCB[IMB_]; Binb = DESCB[INB_]; Bmb = DESCB[MB_]; Bnb = DESCB[NB_]; Bimb1 = PB_Cfirstnb( Bm, IB, Bimb, Bmb ); Bmp = PB_Cnumroc( Bm, 0, Bimb1, Bmb, myrow, Brow, nprow ); Binb1 = PB_Cfirstnb( Bn, JB, Binb, Bnb ); Bnq = PB_Cnumroc( Bn, 0, Binb1, Bnb, mycol, Bcol, npcol ); Bld = DESCB[LLD_]; size = TYPE->size; one = TYPE->one; if( ( Bmp > 0 ) && ( Bnq > 0 ) ) Bptr = Mptr( B, Bii, Bjj, Bld, size ); if( BisRow ) { /* * Compute descriptor Bd0 for sub( B ). */ if( Afwd ) { Bcurrow = Brow; } else { Bcurrow = PB_Cindxg2p( Bm-1, Bimb1, Bmb, Brow, Brow, nprow ); } PB_Cdescset( Bd0, Bm, Bn, Bm, Binb1, Bmb, Bnb, Bcurrow, Bcol, ctxt, Bld ); /* * Align sub( A ) with sub( B ) */ PB_CInV( TYPE, NOCONJG, BROC, Bm, Bn, Bd0, Bm, A, IA, JA, DESCA, AROC, &WA, WAd, &WAfr ); /* * Disaggregate WA = sub( A ) */ if( ( Brow == -1 ) || ( nprow == 1 ) ) { /* * sub( B ) is replicated */ if( Bnq > 0 ) TYPE->Fmmadd( &Bm, &Bnq, one, WA, &WAd[LLD_], ALPHA, Bptr, &Bld ); if( WAfr ) free( WA ); return; } if( !( PB_Cspan( Bm, 0, Bimb1, Bmb, Brow, nprow ) ) ) { /* * sub( B ) spans only one process row */ if( ( myrow == Brow ) && ( Bnq > 0 ) ) TYPE->Fmmadd( &Bm, &Bnq, one, WA, &WAd[LLD_], ALPHA, Bptr, &Bld ); if( WAfr ) free( WA ); return; } /* * sub( B ) spans more than one process row */ if( Afwd ) { /* * sub( B ) is not replicated and spans more than one process row. Forward row * dissagregation starts in the process row where the global row IB resides. */ if( ( Bmp > 0 ) && ( Bnq > 0 ) ) { /* * Compute how may rows are before and after me (Bnpre and Bnnxt). */ Bnpre = PB_Cnpreroc( Bm, 0, Bimb1, Bmb, myrow, Brow, nprow ); Bnnxt = PB_Cnnxtroc( Bm, 0, Bimb1, Bmb, myrow, Brow, nprow ); nlen = Bmp + Bnnxt; if( Bnpre > 0 ) { /* * If I don't own the row IB, then allocate and receive a buffer of length * ( Bmp + Bnnxt ) * Bnq from the previous process row. */ Bbufptr = Bbuf = PB_Cmalloc( nlen * Bnq * size ); Bbufld = nlen; TYPE->Cgerv2d( ctxt, nlen, Bnq, Bbuf, Bbufld, MModSub1( myrow, nprow ), mycol ); kb = Bmb; } else { /* * Otherwise, reuse WA. */ Bbufptr = Bbuf = WA; Bbufld = WAd[LLD_]; kb = Bimb1; } /* * Unpack the received data */ if( Bnnxt > 0 ) { /* * If some rows reside in the process row following mine, then unpack my piece, * sort the buffer and send those Bnnxt rows to the next process row. */ add = TYPE->Fmmadd; shft = TYPE->Frshft; mydistnb = ( nprow - MModSub( myrow, Brow, nprow ) - 1 ); stride = ( mydistnb *= Bmb ) * size; do { kb = MIN( kb, nlen ); add( &kb, &Bnq, one, Bbufptr, &Bbufld, ALPHA, Bptr, &Bld ); nlen -= kb; offset = -kb; shft( &nlen, &Bnq, &offset, Bbufptr, &Bbufld ); Bptr += kb*size; Bbufptr += stride; nlen -= mydistnb; kb = Bmb; } while( nlen > 0 ); /* * send buffer of length Bnnxt * Bnq to the next process row. */ TYPE->Cgesd2d( ctxt, Bnnxt, Bnq, Bbuf, Bbufld, MModAdd1( myrow, nprow ), mycol ); } else { /* * Otherwise, I must be the last process involved in the operation, so no * unpacking is necessary. */ TYPE->Fmmadd( &Bmp, &Bnq, one, Bbufptr, &Bbufld, ALPHA, Bptr, &Bld ); } /* * If I don't own the row IB, then release the dynamically allocated buffer. */ if( Bnpre > 0 ) free( Bbuf ); } if( WAfr ) free( WA ); } else { if( ( Bmp > 0 ) && ( Bnq > 0 ) ) { /* * Compute how may rows are before and after me (Bnpre, Bnnxt). */ Bnnxt = PB_Cnnxtroc( Bm, 0, Bimb1, Bmb, myrow, Brow, nprow ); BnnxtL = PB_Cnnxtroc( Bm, 0, Bimb1, Bmb, Bcurrow, Brow, nprow ); Bnnxt = MModSub( Bnnxt, BnnxtL, Bm ); Bnpre = ( nlen = Bm - Bnnxt ) - Bmp; if( Bnnxt > 0 ) { /* * If I don't own the row IB+M-1, then allocate and receive a buffer of length * ( Bm - Bnnxt ) * Bnq from the next process row. */ Bbufptr = Bbuf = PB_Cmalloc( nlen * Bnq * size ); Bbufld = nlen; TYPE->Cgerv2d( ctxt, nlen, Bnq, Bbuf, Bbufld, MModAdd1( myrow, nprow ), mycol ); } else { /* * Otherwise, reuse WA. */ Bbufptr = Bbuf = WA; Bbufld = WAd[LLD_]; } /* * Unpack the received data */ if( Bnpre > 0 ) { /* * If some rows reside in the process row preceeding mine, then unpack my piece, * sort the buffer and send those Bnpre rows to the previous process row. */ add = TYPE->Fmmadd; shft = TYPE->Frshft; mydist = MModSub( Bcurrow, myrow, nprow ); srcdist = MModSub( Bcurrow, Brow, nprow ); stridenb = ( nprow - mydist - 1 ) * Bmb; if( mydist < srcdist ) { tmp = ( Bimb1 + ( srcdist - mydist - 1 ) * Bmb ); Bbufptr += tmp * size; nlen -= tmp; kb = Bmb; } else if( mydist == srcdist ) { kb = Bimb1; } else { Bbufptr += stridenb * size; nlen -= stridenb; kb = Bmb; } do { kb = MIN( kb, nlen ); add( &kb, &Bnq, one, Bbufptr, &Bbufld, ALPHA, Bptr, &Bld ); nlen -= kb; offset = -kb; shft( &nlen, &Bnq, &offset, Bbufptr, &Bbufld ); Bptr += kb*size; Bbufptr += stridenb*size; nlen -= stridenb; kb = Bmb; } while( nlen > 0 ); /* * send buffer of length Bnpre * Bnq to the previous process row. */ TYPE->Cgesd2d( ctxt, Bnpre, Bnq, Bbuf, Bbufld, MModSub1( myrow, nprow ), mycol ); } else { /* * Otherwise, I must be the last process involved in the operation, so no * unpacking is necessary. */ TYPE->Fmmadd( &Bmp, &Bnq, one, Bbufptr, &Bbufld, ALPHA, Bptr, &Bld ); } /* * If I don't own the row IB+M-1, then release the dynamically allocated buffer. */ if( Bnnxt > 0 ) free( Bbuf ); } if( WAfr ) free( WA ); } } else { /* * Compute descriptor Bd0 for sub( B ). */ if( Afwd ) { Bcurcol = Bcol; } else { Bcurcol = PB_Cindxg2p( Bn-1, Binb1, Bnb, Bcol, Bcol, npcol ); } PB_Cdescset( Bd0, Bm, Bn, Bimb1, Bn, Bmb, Bnb, Brow, Bcurcol, ctxt, Bld ); /* * Align sub( A ) with sub( B ) */ PB_CInV( TYPE, NOCONJG, BROC, Bm, Bn, Bd0, Bn, A, IA, JA, DESCA, AROC, &WA, WAd, &WAfr ); /* * Disaggregate WA = sub( A ) */ if( ( Bcol == -1 ) || ( npcol == 1 ) ) { /* * sub( B ) is replicated */ if( Bmp > 0 ) TYPE->Fmmadd( &Bmp, &Bn, one, WA, &WAd[LLD_], ALPHA, Bptr, &Bld ); if( WAfr ) free( WA ); return; } if( !( PB_Cspan( Bn, 0, Binb1, Bnb, Bcol, npcol ) ) ) { /* * sub( B ) spans only one process column */ if( ( mycol == Bcol ) && ( Bmp > 0 ) ) TYPE->Fmmadd( &Bmp, &Bn, one, WA, &WAd[LLD_], ALPHA, Bptr, &Bld ); if( WAfr ) free( WA ); return; } /* * sub( B ) spans more than one process column */ if( Afwd ) { /* * sub( B ) is not replicated and spans more than one process column. Forward * column dissagregation starts in the process column where the global column * JB resides. */ if( ( Bmp > 0 ) && ( Bnq > 0 ) ) { /* * Compute how may columns are before and after me (Bnpre and Bnnxt). */ Bnpre = PB_Cnpreroc( Bn, 0, Binb1, Bnb, mycol, Bcol, npcol ); Bnnxt = PB_Cnnxtroc( Bn, 0, Binb1, Bnb, mycol, Bcol, npcol ); nlen = Bnq + Bnnxt; if( Bnpre > 0 ) { /* * If I don't own the column JB, then allocate and receive a buffer of length * Bmp * ( Bnq + Bnnxt ) from the previous process column. */ Bbufptr = Bbuf = PB_Cmalloc( Bmp * nlen * size ); Bbufld = Bmp; TYPE->Cgerv2d( ctxt, Bmp, nlen, Bbuf, Bbufld, myrow, MModSub1( mycol, npcol ) ); kb = Bnb; } else { /* * Otherwise, reuse WA. */ Bbufptr = Bbuf = WA; Bbufld = WAd[LLD_]; kb = Binb1; } /* * Unpack the received data */ if( Bnnxt > 0 ) { /* * If some columns reside in the process column following mine, then unpack my * piece, sort the buffer and send those Bnnxt columns to the next process * column. */ add = TYPE->Fmmadd; shft = TYPE->Fcshft; mydistnb = ( npcol - MModSub( mycol, Bcol, npcol ) - 1 ); stride = ( mydistnb *= Bnb ) * Bbufld * size; do { kb = MIN( kb, nlen ); add( &Bmp, &kb, one, Bbufptr, &Bbufld, ALPHA, Bptr, &Bld ); nlen -= kb; offset = -kb; shft( &Bmp, &nlen, &offset, Bbufptr, &Bbufld ); Bptr += kb*Bld*size; Bbufptr += stride; nlen -= mydistnb; kb = Bnb; } while( nlen > 0 ); /* * send buffer of length Bmp * Bnnxt to the next process column. */ TYPE->Cgesd2d( ctxt, Bmp, Bnnxt, Bbuf, Bbufld, myrow, MModAdd1( mycol, npcol ) ); } else { /* * Otherwise, I must be the last process involved in the operation, so no * unpacking is necessary. */ TYPE->Fmmadd( &Bmp, &Bnq, one, Bbufptr, &Bbufld, ALPHA, Bptr, &Bld ); } /* * If I don't own the column JB, then release the dynamically allocated buffer. */ if( Bnpre > 0 ) free( Bbuf ); } if( WAfr ) free( WA ); } else { if( ( Bmp > 0 ) && ( Bnq > 0 ) ) { /* * Compute how may rows are before and after me (Bnpre, Bnnxt). */ Bnnxt = PB_Cnnxtroc( Bn, 0, Binb1, Bnb, mycol, Bcol, npcol ); BnnxtL = PB_Cnnxtroc( Bn, 0, Binb1, Bnb, Bcurcol, Bcol, npcol ); Bnnxt = MModSub( Bnnxt, BnnxtL, Bn ); Bnpre = ( nlen = Bn - Bnnxt ) - Bnq; if( Bnnxt > 0 ) { /* * If I don't own the column JB+N-1, then allocate and receive a buffer of * length Bmp * ( Bn - Bnnxt ) from the next process column. */ Bbufptr = Bbuf = PB_Cmalloc( Bmp * nlen * size ); Bbufld = Bmp; TYPE->Cgerv2d( ctxt, Bmp, nlen, Bbuf, Bbufld, myrow, MModAdd1( mycol, npcol ) ); } else { /* * Otherwise, reuse WA. */ Bbufptr = Bbuf = WA; Bbufld = WAd[LLD_]; } /* * Unpack the received data */ if( Bnpre > 0 ) { /* * If some columns reside in the process column preceeding mine, then unpack my * piece, sort the buffer and send those Bnpre columns to the previous process * column. */ add = TYPE->Fmmadd; shft = TYPE->Fcshft; mydist = MModSub( Bcurcol, mycol, npcol ); srcdist = MModSub( Bcurcol, Bcol, npcol ); stridenb = ( npcol - mydist - 1 ) * Bnb; if( mydist < srcdist ) { tmp = ( Binb1 + ( srcdist - mydist - 1 ) * Bnb ); Bbufptr += tmp * Bbufld * size; nlen -= tmp; kb = Bnb; } else if( mydist == srcdist ) { kb = Binb1; } else { Bbufptr += stridenb * Bbufld * size; nlen -= stridenb; kb = Bnb; } do { kb = MIN( kb, nlen ); add( &Bmp, &kb, one, Bbufptr, &Bbufld, ALPHA, Bptr, &Bld ); nlen -= kb; offset = -kb; shft( &Bmp, &nlen, &offset, Bbufptr, &Bbufld ); Bptr += kb * Bld * size; Bbufptr += stridenb * Bbufld * size; nlen -= stridenb; kb = Bnb; } while( nlen > 0 ); /* * send buffer of length Bmp * Bnpre to the previous process column. */ TYPE->Cgesd2d( ctxt, Bmp, Bnpre, Bbuf, Bbufld, myrow, MModSub1( mycol, npcol ) ); } else { /* * Otherwise, I must be the last process involved in the operation, so no * unpacking is necessary. */ TYPE->Fmmadd( &Bmp, &Bnq, one, Bbufptr, &Bbufld, ALPHA, Bptr, &Bld ); } /* * If I don't own the column JB+N-1, then release the dynamically allocated * buffer. */ if( Bnnxt > 0 ) free( Bbuf ); } if( WAfr ) free( WA ); } } /* * End of PB_CScatterV */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cspan.c000644 000766 000024 00000005023 10363532303 020010 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_Cspan( int N, int I, int INB, int NB, int SRCPROC, int NPROCS ) #else int PB_Cspan( N, I, INB, NB, SRCPROC, NPROCS ) /* * .. Scalar Arguments .. */ int I, INB, N, NB, NPROCS, SRCPROC; #endif { /* * Purpose * ======= * * PB_Cspan returns 1 if the rows (resp. columns) I:I+N-1 spans more * than one process row (resp. column) and 0 otherwise. * * Arguments * ========= * * N (global input) INTEGER * On entry, N specifies the number of rows/columns being dealt * out. N must be at least zero. * * I (global input) INTEGER * On entry, I specifies the global index of the matrix entry. * I must be at least zero. * * INB (global input) INTEGER * On entry, INB specifies the size of the first block of the * global matrix distribution. INB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the size of the blocks used to parti- * tion the matrix. NB must be at least one. * * SRCPROC (global input) INTEGER * On entry, if SRCPROC = -1, the data is not distributed but * replicated, in which case this routine return 0. * * NPROCS (global input) INTEGER * On entry, NPROCS specifies the total number of process rows * or columns over which the matrix is distributed. NPROCS must * be at least one. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ /* * If the data is replicated or if there is only one process in this dimension * of the process grid, the data does not span multiple processes. Finally, the * case where I belongs to the first block is handled separately. */ return( ( SRCPROC >= 0 ) && ( ( NPROCS > 1 ) && ( ( I < INB ) ? ( I + N > INB ) : ( I + N > INB + ( ( I - INB ) / NB + 1 ) * NB ) ) ) ); /* * End of PB_Cspan */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cstypeset.c000644 000766 000024 00000006240 10363532303 020731 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" PBTYP_T * PB_Cstypeset() { /* * Purpose * ======= * * PB_Cstypeset on the first call initializes a static structure contai- * ning typed information and returns a pointer to it. The other calls * to this routine just returns this pointer. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ static int setup=0; static PBTYP_T TypeStruct; static float zero, one, negone; /* .. * .. Executable Statements .. * */ if( setup ) return( &TypeStruct ); setup = 1; TypeStruct.type = SREAL; TypeStruct.usiz = sizeof( float ); TypeStruct.size = sizeof( float ); zero = ZERO; one = ONE; negone = -ONE; TypeStruct.zero = (char *) (&zero); TypeStruct.one = (char *) (&one); TypeStruct.negone = (char *) (&negone); TypeStruct.Cgesd2d = Csgesd2d; TypeStruct.Cgerv2d = Csgerv2d; TypeStruct.Cgebs2d = Csgebs2d; TypeStruct.Cgebr2d = Csgebr2d; TypeStruct.Cgsum2d = Csgsum2d; TypeStruct.Fmmadd = smmadd_; TypeStruct.Fmmcadd = smmcadd_; TypeStruct.Fmmtadd = smmtadd_; TypeStruct.Fmmtcadd = smmtcadd_; TypeStruct.Fmmdda = smmdda_; TypeStruct.Fmmddac = smmddac_; TypeStruct.Fmmddat = smmddat_; TypeStruct.Fmmddact = smmddact_; TypeStruct.Fcshft = scshft_; TypeStruct.Frshft = srshft_; TypeStruct.Fvvdotu = svvdot_; TypeStruct.Fvvdotc = svvdot_; TypeStruct.Fset = sset_; TypeStruct.Ftzpad = stzpad_; TypeStruct.Ftzpadcpy = stzpadcpy_; TypeStruct.Ftzscal = stzscal_; TypeStruct.Fhescal = stzscal_; TypeStruct.Ftzcnjg = stzscal_; TypeStruct.Faxpy = saxpy_; TypeStruct.Fcopy = scopy_; TypeStruct.Fswap = sswap_; TypeStruct.Fgemv = sgemv_; TypeStruct.Fsymv = ssymv_; TypeStruct.Fhemv = ssymv_; TypeStruct.Ftrmv = strmv_; TypeStruct.Ftrsv = strsv_; TypeStruct.Fagemv = sagemv_; TypeStruct.Fasymv = sasymv_; TypeStruct.Fahemv = sasymv_; TypeStruct.Fatrmv = satrmv_; TypeStruct.Fgerc = sger_; TypeStruct.Fgeru = sger_; TypeStruct.Fsyr = ssyr_; TypeStruct.Fher = ssyr_; TypeStruct.Fsyr2 = ssyr2_; TypeStruct.Fher2 = ssyr2_; TypeStruct.Fgemm = sgemm_; TypeStruct.Fsymm = ssymm_; TypeStruct.Fhemm = ssymm_; TypeStruct.Fsyrk = ssyrk_; TypeStruct.Fherk = ssyrk_; TypeStruct.Fsyr2k = ssyr2k_; TypeStruct.Fher2k = ssyr2k_; TypeStruct.Ftrmm = strmm_; TypeStruct.Ftrsm = strsm_; return( &TypeStruct ); /* * End of PB_Cstypeset */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctop.c000644 000766 000024 00000007664 10363532303 017666 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ char * PB_Ctop( int * ICTXT, char * OP, char * SCOPE, char * TOP ) #else char * PB_Ctop( ICTXT, OP, SCOPE, TOP ) /* * .. Scalar Arguments .. */ int * ICTXT; /* * .. Array Arguments .. */ char * OP, * SCOPE, * TOP; #endif { /* * Purpose * ======= * * PB_Ctop returns or initializes the row-, column- or all- broadcast * or combine topologies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * OP (global input) pointer to CHAR * On input, OP specifies the BLACS operation defined as fol- * lows: * OP = 'B' or 'b', BLACS broadcast operation, * OP = 'C' or 'c', BLACS combine operation. * * SCOPE (global input) pointer to CHAR * On entry, SCOPE specifies the scope of the BLACS operation as * follows: * SCOPE = 'R' or 'r', rowwise broadcast or combine, * SCOPE = 'C' or 'c', column broadcast or combine, * SCOPE = 'A' or 'a', all broadcast or combine. * * TOP (global input) pointer to CHAR * On entry, TOP is a character string specifying the BLACS to- * pology to be used i.e. to be set for the given operation spe- * cified by OP and SCOPE. If TOP = TOP_GET, the routine instead * returns the current topology in use for the given operation * specified by OP and SCOPE. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ static char rbtop = CTOP_DEFAULT; static char cbtop = CTOP_DEFAULT; static char abtop = CTOP_DEFAULT; static char rctop = CTOP_DEFAULT; static char cctop = CTOP_DEFAULT; static char actop = CTOP_DEFAULT; /* .. * .. Executable Statements .. * */ /* * This BLACS topology information should be cached within a BLACS context. * This will be corrected in the near future. Sorry. */ if( *OP == CBCAST ) { /* * BLACS broadcast operations */ if( *TOP == CTOP_GET ) { /* * retrieve the current topology in SCOPE */ if( *SCOPE == CROW ) { return( &rbtop ); } else if( *SCOPE == CCOLUMN ) { return( &cbtop ); } else { return( &abtop ); } } else { /* * set the topology to be used from now on in SCOPE */ if( *SCOPE == CROW ) { rbtop = *TOP; return( &rbtop ); } else if( *SCOPE == CCOLUMN ) { cbtop = *TOP; return( &cbtop ); } else { abtop = *TOP; return( &abtop ); } } } else { /* * BLACS combine operations */ if( *TOP == CTOP_GET ) { /* * retrieve the current topology in SCOPE */ if( *SCOPE == CROW ) { return( &rctop ); } else if( *SCOPE == CCOLUMN ) { return( &cctop ); } else { return( &actop ); } } else { /* * set the topology to be used from now on in SCOPE */ if( *SCOPE == CROW ) { rctop = *TOP; return( &rctop ); } else if( *SCOPE == CCOLUMN ) { cctop = *TOP; return( &cctop ); } else { actop = *TOP; return( &actop ); } } } /* * End of PB_Ctop */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzahemv.c000644 000766 000024 00000022605 10363532303 020532 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzahemv( PBTYP_T * TYPE, char * SIDE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * A, int LDA, char * XC, int LDXC, char * XR, int LDXR, char * YC, int LDYC, char * YR, int LDYR ) #else void PB_Ctzahemv( TYPE, SIDE, UPLO, M, N, K, IOFFD, ALPHA, A, LDA, XC, LDXC, XR, LDXR, YC, LDYC, YR, LDYR ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO; int IOFFD, K, LDA, LDXC, LDXR, LDYC, LDYR, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * A, * XC, * XR, * YC, * YR; #endif { /* * Purpose * ======= * * PB_Ctzahemv performs the matrix-vector operation * * y := abs( alpha )*abs( A )*abs( x )+ abs( y ), * * where alpha is a real scalar, y is a real vector, x is a vector and A * is an m by n trapezoidal symmetric or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * SIDE (dummy) pointer to CHAR * In this routine, SIDE is a dummy (unused) argument. * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (input) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * XC (input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,1) containing the * m by 1 vector XC. * * LDXC (input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, M ). * * XR (input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,N) containing the * 1 by n vector XR. * * LDXR (input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least 1. * * YC (input/output) pointer to CHAR * On entry, YC is an array of dimension (LDYC,1) containing the * m by 1 vector YC. On exit, YC is overwritten by the partially * updated vector y. * * LDYC (input) INTEGER * On entry, LDYC specifies the leading dimension of the array * YC. LDYC must be at least max( 1, M ). * * YR (input/output) pointer to CHAR * On entry, YR is an array of dimension (LDYR,N) containing the * 1 by n vector YR. On exit, YR is overwritten by the partially * updated vector y. * * LDYR (input) INTEGER * On entry, LDYR specifies the leading dimension of the array * YR. LDYR must be at least 1. * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int i1, ione=1, j1, m1, mn, n1, size, usiz; AGEMV_T agemv; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; usiz = TYPE->usiz; one = TYPE->one; agemv = TYPE->Fagemv; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { agemv( C2F_CHAR( NOTRAN ), &M, &n1, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); agemv( C2F_CHAR( COTRAN ), &M, &n1, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fahemv( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XC, i1, 0, LDXC, size ), &ione, one, Mptr( YC, i1, 0, LDYC, usiz ), &ione ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; agemv( C2F_CHAR( NOTRAN ), &m1, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XR, 0, j1, LDXR, size ), &LDXR, one, Mptr( YC, i1, 0, LDYC, usiz ), &ione ); agemv( C2F_CHAR( COTRAN ), &m1, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XC, i1, 0, LDXC, size ), &ione, one, Mptr( YR, 0, j1, LDYR, usiz ), &LDYR ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; usiz = TYPE->usiz; one = TYPE->one; agemv = TYPE->Fagemv; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { agemv( C2F_CHAR( NOTRAN ), &m1, &n1, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); agemv( C2F_CHAR( COTRAN ), &m1, &n1, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } TYPE->Fahemv( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( A, m1, j1, LDA, size ), &LDA, Mptr( XC, m1, 0, LDXC, size ), &ione, one, Mptr( YC, m1, 0, LDYC, usiz ), &ione ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; agemv( C2F_CHAR( NOTRAN ), &M, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, Mptr( XR, 0, j1, LDXR, size ), &LDXR, one, YC, &ione ); agemv( C2F_CHAR( COTRAN ), &M, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, XC, &ione, one, Mptr( YR, 0, j1, LDYR, usiz ), &LDYR ); } } else { one = TYPE->one; agemv = TYPE->Fagemv; agemv( C2F_CHAR( NOTRAN ), &M, &N, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); agemv( C2F_CHAR( COTRAN ), &M, &N, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } /* * End of PB_Ctzahemv */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzasymv.c000644 000766 000024 00000022552 10363532303 020572 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzasymv( PBTYP_T * TYPE, char * SIDE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * A, int LDA, char * XC, int LDXC, char * XR, int LDXR, char * YC, int LDYC, char * YR, int LDYR ) #else void PB_Ctzasymv( TYPE, SIDE, UPLO, M, N, K, IOFFD, ALPHA, A, LDA, XC, LDXC, XR, LDXR, YC, LDYC, YR, LDYR ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO; int IOFFD, K, LDA, LDXC, LDXR, LDYC, LDYR, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * A, * XC, * XR, * YC, * YR; #endif { /* * Purpose * ======= * * PB_Ctzasymv performs the matrix-vector operation * * y := abs( alpha )*abs( A )*abs( x )+ abs( y ), * * where alpha is a real scalar, y is a real vector, x is a vector and A * is an m by n trapezoidal symmetric or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * SIDE (dummy) pointer to CHAR * In this routine, SIDE is a dummy (unused) argument. * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (input) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * XC (input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,1) containing the * m by 1 vector XC. * * LDXC (input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, M ). * * XR (input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,N) containing the * 1 by n vector XR. * * LDXR (input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least 1. * * YC (input/output) pointer to CHAR * On entry, YC is an array of dimension (LDYC,1) containing the * m by 1 vector YC. On exit, YC is overwritten by the partially * updated vector y. * * LDYC (input) INTEGER * On entry, LDYC specifies the leading dimension of the array * YC. LDYC must be at least max( 1, M ). * * YR (input/output) pointer to CHAR * On entry, YR is an array of dimension (LDYR,N) containing the * 1 by n vector YR. On exit, YR is overwritten by the partially * updated vector y. * * LDYR (input) INTEGER * On entry, LDYR specifies the leading dimension of the array * YR. LDYR must be at least 1. * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int i1, ione=1, j1, m1, mn, n1, size, usiz; AGEMV_T agemv; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; usiz = TYPE->usiz; one = TYPE->one; agemv = TYPE->Fagemv; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { agemv( C2F_CHAR( NOTRAN ), &M, &n1, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); agemv( C2F_CHAR( TRAN ), &M, &n1, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fasymv( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XC, i1, 0, LDXC, size ), &ione, one, Mptr( YC, i1, 0, LDYC, usiz ), &ione ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; agemv( C2F_CHAR( NOTRAN ), &m1, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XR, 0, j1, LDXR, size ), &LDXR, one, Mptr( YC, i1, 0, LDYC, usiz ), &ione ); agemv( C2F_CHAR( TRAN ), &m1, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XC, i1, 0, LDXC, size ), &ione, one, Mptr( YR, 0, j1, LDYR, usiz ), &LDYR ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; usiz = TYPE->usiz; one = TYPE->one; agemv = TYPE->Fagemv; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { agemv( C2F_CHAR( NOTRAN ), &m1, &n1, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); agemv( C2F_CHAR( TRAN ), &m1, &n1, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } TYPE->Fasymv( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( A, m1, j1, LDA, size ), &LDA, Mptr( XC, m1, 0, LDXC, size ), &ione, one, Mptr( YC, m1, 0, LDYC, usiz ), &ione ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; agemv( C2F_CHAR( NOTRAN ), &M, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, Mptr( XR, 0, j1, LDXR, size ), &LDXR, one, YC, &ione ); agemv( C2F_CHAR( TRAN ), &M, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, XC, &ione, one, Mptr( YR, 0, j1, LDYR, usiz ), &LDYR ); } } else { one = TYPE->one; agemv = TYPE->Fagemv; agemv( C2F_CHAR( NOTRAN ), &M, &N, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); agemv( C2F_CHAR( TRAN ), &M, &N, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } /* * End of PB_Ctzasymv */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzatrmv.c000644 000766 000024 00000021455 10363532303 020565 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzatrmv( PBTYP_T * TYPE, char * SIDE, char * UPLO, char * TRANS, char * DIAG, int M, int N, int K, int IOFFD, char * ALPHA, char * A, int LDA, char * X, int LDX, char * Y, int LDY ) #else void PB_Ctzatrmv( TYPE, SIDE, UPLO, TRANS, DIAG, M, N, K, IOFFD, ALPHA, A, LDA, X, LDX, Y, LDY ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO, * TRANS, * DIAG; int IOFFD, K, LDA, LDX, LDY, M, N; char * ALPHA; /* * .. Array Arguments .. */ char * A, * X, * Y; PBTYP_T * TYPE; #endif { /* * Purpose * ======= * * PB_Ctzatrmv performs the matrix-vector operation * * y := abs( alpha )*abs( A )*abs( x )+ abs( y ), * * or * * y := abs( alpha )*abs( A' )*abs( x ) + abs( y ), * * or * * y := abs( alpha )*abs( conjg( A' ) )*abs( x ) + abs( y ), * * where alpha is a real scalar, y is a real vector, x is a vector and A * is an m by n trapezoidal triangular matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * SIDE (dummy) pointer to CHAR * In this routine, SIDE is a dummy (unused) argument. * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * TRANS (input) pointer to CHAR * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': * y := abs( alpha )*abs( A )*abs( x )+ abs( beta*y ), * * TRANS = 'T' or 't': * y := abs( alpha )*abs( A' )*abs( x ) + abs( beta*y ), * * TRANS = 'C' or 'c': * y := abs( alpha )*abs( A' )*abs( x ) + abs( y ) or * y := abs( alpha )*abs( conjg(A') )*abs( x ) + abs( y ). * * DIAG (input) pointer to CHAR * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit triangular. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (input) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is referenced. When DIAG = 'U' or 'u', the * diagonal elements of A are not referenced either, but are * assumed to be unity. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * X (input) pointer to CHAR * On entry, X is an array of dimension (LDX,Kx). Before entry, * with TRANS = 'N' or 'n', the array X must contain the n ele- * ment vector x corresponding to the columns of A. Otherwise, * the array X must contain the m element vector x corresponding * to the rows of A. When TRANS is 'N' or 'n', LDX is at least * 1, and Kx is at least N. Otherwise, LDX is at least max(1,M), * and Kx is at least 1. * * LDX (input) INTEGER * On entry, LDX specifies the leading dimension of the array X. * LDX must be at least 1 when TRANS is 'N' or 'n' and * max( 1, M ) otherwise. * * Y (input/output) pointer to CHAR * On entry, Y is an array of dimension (LDY,Ky). On exit, with * TRANS = 'N' or 'n', the array Y contains the m element vector * y corresponding to the rows of A. Otherwise, the array Y con- * tains the n element vector y corresponding to the columns of * A. When TRANS is 'N' or 'n', LDY is at least max( 1, M ), and * Ky is at least 1. Otherwise, LDY is at least 1, and Ky is at * least N. On exit, Y is overwritten by the partial updated * vector y. * * LDY (input) INTEGER * On entry, LDY specifies the leading dimension of the array Y. * LDY must be at least max( 1, M ) when TRANS is 'N' or * 'n' and 1 otherwise. * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int ione = 1; char * Aptr = NULL; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( ( Mupcase( UPLO[0] ) == CLOWER ) || ( Mupcase( UPLO[0] ) == CUPPER ) ) { Aptr = PB_Cmalloc( M * N * TYPE->size ); TYPE->Ftzpadcpy( C2F_CHAR( UPLO ), C2F_CHAR( DIAG ), &M, &N, &IOFFD, A, &LDA, Aptr, &M ); if( Mupcase( TRANS[0] ) == CNOTRAN ) { TYPE->Fagemv( C2F_CHAR( TRANS ), &M, &N, ALPHA, Aptr, &M, X, &LDX, TYPE->one, Y, &ione ); } else { TYPE->Fagemv( C2F_CHAR( TRANS ), &M, &N, ALPHA, Aptr, &M, X, &ione, TYPE->one, Y, &LDY ); } if( Aptr ) free( Aptr ); } else { if( Mupcase( TRANS[0] ) == CNOTRAN ) { TYPE->Fagemv( C2F_CHAR( TRANS ), &M, &N, ALPHA, A, &LDA, X, &LDX, TYPE->one, Y, &ione ); } else { TYPE->Fagemv( C2F_CHAR( TRANS ), &M, &N, ALPHA, A, &LDA, X, &ione, TYPE->one, Y, &LDY ); } } /* * End of PB_Ctzatrmv */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzhemm.c000644 000766 000024 00000033154 10363532303 020361 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzhemm( PBTYP_T * TYPE, char * SIDE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * A, int LDA, char * BC, int LDBC, char * BR, int LDBR, char * CC, int LDCC, char * CR, int LDCR ) #else void PB_Ctzhemm( TYPE, SIDE, UPLO, M, N, K, IOFFD, ALPHA, A, LDA, BC, LDBC, BR, LDBR, CC, LDCC, CR, LDCR ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO; int IOFFD, K, LDA, LDBC, LDBR, LDCC, LDCR, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * A, * BC, * BR, * CC, * CR; #endif { /* * Purpose * ======= * * PB_Ctzhemm performs the matrix-matrix operation * * C := alpha * A * B + C, * * or * * C := alpha * B * A + C, * * where alpha is a scalar, B and C are m by k and k by n matrices and A * is an m by n trapezoidal symmetric or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * SIDE (input) pointer to CHAR * On entry, SIDE specifies whether op( A ) multiplies B from * the left or right as follows: * * SIDE = 'L' or 'l' C := alpha * A * B + C, * * SIDE = 'R' or 'r' C := alpha * B * A + C. * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (input) INTEGER * On entry, K specifies the number of rows of the matrices BR * and CR and the number of columns of the matrices BC and CC. K * must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (input) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BC (input) pointer to CHAR * On entry, BC is an array of dimension (LDBC,K) containing the * m by k matrix BC. * * LDBC (input) INTEGER * On entry, LDBC specifies the leading dimension of the array * BC. LDBC must be at least max( 1, M ). * * BR (input) pointer to CHAR * On entry, BR is an array of dimension (LDBR,N) containing the * k by n matrix BR. * * LDBR (input) INTEGER * On entry, LDBR specifies the leading dimension of the array * BR. LDBR must be at least K. * * CC (input/output) pointer to CHAR * On entry, CC is an array of dimension (LDCC,K) containing the * m by k matrix CC. On exit, CC is overwritten by the partially * updated matric CC. * * LDCC (input) INTEGER * On entry, LDCC specifies the leading dimension of the array * CC. LDCC must be at least max( 1, M ). * * CR (input/output) pointer to CHAR * On entry, CR is an array of dimension (LDCR,N) containing the * k by n matrix CR. On exit, CR is overwritten by the partially * updated matrix CR. * * LDCR (input) INTEGER * On entry, LDCR specifies the leading dimension of the array * CR. LDCR must be at least K. * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * Calph, * one, type; int i1, j1, m1, mn, n1, size; cmplx Calph8; cmplx16 Calph16; GEMM_T gemm; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; type = TYPE->type; if( type == SCPLX ) { Calph = ( (char *)(Calph8 ) ); PB_Cconjg( TYPE, ALPHA, Calph ); } else if( type == DCPLX ) { Calph = ( (char *)(Calph16) ); PB_Cconjg( TYPE, ALPHA, Calph ); } else { Calph = ALPHA; } if( Mupcase( SIDE[0] ) == CLEFT ) { if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &M, &K, &n1, ALPHA, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( COTRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &M, Calph, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fhemm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), &n1, &K, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( BC, i1, 0, LDBC, size ), &LDBC, one, Mptr( CC, i1, 0, LDCC, size ), &LDCC ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &m1, &K, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( CC, i1, 0, LDCC, size ), &LDCC ); gemm( C2F_CHAR( COTRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &m1, Calph, Mptr( BC, i1, 0, LDBC, size ), &LDBC, Mptr( A, i1, j1, LDA, size ), &LDA, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MIN( M - IOFFD, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &m1, &K, &n1, ALPHA, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( COTRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &m1, Calph, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } TYPE->Fhemm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), &n1, &K, ALPHA, Mptr( A, m1, j1, LDA, size ), &LDA, Mptr( BC, m1, 0, LDBC, size ), &LDBC, one, Mptr( CC, m1, 0, LDCC, size ), &LDCC ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &M, &K, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( COTRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &M, Calph, BC, &LDBC, Mptr( A, 0, j1, LDA, size ), &LDA, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } } else { one = TYPE->one; gemm = TYPE->Fgemm; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &M, &K, &N, ALPHA, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( COTRAN ), C2F_CHAR( NOTRAN ), &K, &N, &M, Calph, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } } else { if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( COTRAN ), &M, &K, &n1, Calph, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &M, ALPHA, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fhemm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), &K, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( COTRAN ), &m1, &K, &n1, Calph, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( CC, i1, 0, LDCC, size ), &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &m1, ALPHA, Mptr( BC, i1, 0, LDBC, size ), &LDBC, Mptr( A, i1, j1, LDA, size ), &LDA, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MIN( M - IOFFD, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( COTRAN ), &m1, &K, &n1, Calph, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &m1, ALPHA, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } TYPE->Fhemm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), &K, &n1, ALPHA, Mptr( A, m1, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( COTRAN ), &M, &K, &n1, Calph, Mptr( A, 0, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &M, ALPHA, BC, &LDBC, Mptr( A, 0, j1, LDA, size ), &LDA, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } } else { one = TYPE->one; gemm = TYPE->Fgemm; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( COTRAN ), &M, &K, &N, Calph, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &N, &M, ALPHA, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } } /* * End of PB_Ctzhemm */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzhemv.c000644 000766 000024 00000022334 10363532303 020370 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzhemv( PBTYP_T * TYPE, char * SIDE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * A, int LDA, char * XC, int LDXC, char * XR, int LDXR, char * YC, int LDYC, char * YR, int LDYR ) #else void PB_Ctzhemv( TYPE, SIDE, UPLO, M, N, K, IOFFD, ALPHA, A, LDA, XC, LDXC, XR, LDXR, YC, LDYC, YR, LDYR ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO; int IOFFD, K, LDA, LDXC, LDXR, LDYC, LDYR, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * A, * XC, * XR, * YC, * YR; #endif { /* * Purpose * ======= * * PB_Ctzhemv performs the matrix-vector operation * * y := alpha * A * x + y, * * where alpha is a scalar, x and y are n element vectors and A is an m * by n trapezoidal symmetric or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * SIDE (dummy) pointer to CHAR * In this routine, SIDE is a dummy (unused) argument. * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (input) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * XC (input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,1) containing the * m by 1 vector XC. * * LDXC (input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, M ). * * XR (input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,N) containing the * 1 by n vector XR. * * LDXR (input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least 1. * * YC (input/output) pointer to CHAR * On entry, YC is an array of dimension (LDYC,1) containing the * m by 1 vector YC. On exit, YC is overwritten by the partially * updated vector y. * * LDYC (input) INTEGER * On entry, LDYC specifies the leading dimension of the array * YC. LDYC must be at least max( 1, M ). * * YR (input/output) pointer to CHAR * On entry, YR is an array of dimension (LDYR,N) containing the * 1 by n vector YR. On exit, YR is overwritten by the partially * updated vector y. * * LDYR (input) INTEGER * On entry, LDYR specifies the leading dimension of the array * YR. LDYR must be at least 1. * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int i1, ione=1, j1, m1, mn, n1, size; GEMV_T gemv; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemv = TYPE->Fgemv; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { gemv( C2F_CHAR( NOTRAN ), &M, &n1, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); gemv( C2F_CHAR( COTRAN ), &M, &n1, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fhemv( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XC, i1, 0, LDXC, size ), &ione, one, Mptr( YC, i1, 0, LDYC, size ), &ione ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemv( C2F_CHAR( NOTRAN ), &m1, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XR, 0, j1, LDXR, size ), &LDXR, one, Mptr( YC, i1, 0, LDYC, size ), &ione ); gemv( C2F_CHAR( COTRAN ), &m1, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XC, i1, 0, LDXC, size ), &ione, one, Mptr( YR, 0, j1, LDYR, size ), &LDYR ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemv = TYPE->Fgemv; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { gemv( C2F_CHAR( NOTRAN ), &m1, &n1, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); gemv( C2F_CHAR( COTRAN ), &m1, &n1, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } TYPE->Fhemv( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( A, m1, j1, LDA, size ), &LDA, Mptr( XC, m1, 0, LDXC, size ), &ione, one, Mptr( YC, m1, 0, LDYC, size ), &ione ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemv( C2F_CHAR( NOTRAN ), &M, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, Mptr( XR, 0, j1, LDXR, size ), &LDXR, one, YC, &ione ); gemv( C2F_CHAR( COTRAN ), &M, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, XC, &ione, one, Mptr( YR, 0, j1, LDYR, size ), &LDYR ); } } else { one = TYPE->one; gemv = TYPE->Fgemv; gemv( C2F_CHAR( NOTRAN ), &M, &N, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); gemv( C2F_CHAR( COTRAN ), &M, &N, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } /* * End of PB_Ctzhemv */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzher.c000644 000766 000024 00000016442 10363532303 020212 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzher( PBTYP_T * TYPE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * XC, int LDXC, char * XR, int LDXR, char * A, int LDA ) #else void PB_Ctzher( TYPE, UPLO, M, N, K, IOFFD, ALPHA, XC, LDXC, XR, LDXR, A, LDA ) /* * .. Scalar Arguments .. */ char * UPLO; int IOFFD, K, LDA, LDXC, LDXR, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * A, * XC, * XR; #endif { /* * Purpose * ======= * * PB_Ctzher performs the trapezoidal symmetric or Hermitian rank 1 ope- * ration: * * A := alpha * XC * XR + A or A := alpha * XC * conjg( XR ) + A, * * where alpha is a scalar, XC is an m element vector, XR is an n ele- * ment vector and A is an m by n trapezoidal symmetric or Hermitian ma- * trix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (see pblas.h). * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * XC (input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,1) containing the * m by 1 vector XC. * * LDXC (input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, M ). * * XR (input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,N) containing the * 1 by n vector XR. * * LDXR (input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least 1. * * A (input/output) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is updated. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int i1, ione=1, j1, m1, mn, n1, size; GERC_T gerc; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; gerc = TYPE->Fgerc; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) gerc( &M, &n1, ALPHA, XC, &ione, XR, &LDXR, A, &LDA ); n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fher( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( XC, i1, 0, LDXC, size ), &ione, Mptr( A, i1, j1, LDA, size ), &LDA ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gerc( &m1, &n1, ALPHA, Mptr( XC, i1, 0, LDXC, size ), &ione, Mptr( XR, 0, j1, LDXR, size ), &LDXR, Mptr( A, i1, j1, LDA, size ), &LDA ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; gerc = TYPE->Fgerc; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) gerc( &m1, &n1, ALPHA, XC, &ione, XR, &LDXR, A, &LDA ); TYPE->Fher( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( XC, m1, 0, LDXC, size ), &ione, Mptr( A, m1, j1, LDA, size ), &LDA ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gerc( &M, &n1, ALPHA, XC, &ione, Mptr( XR, 0, j1, LDXR, size ), &LDXR, Mptr( A, 0, j1, LDA, size ), &LDA ); } } else { TYPE->Fgerc( &M, &N, ALPHA, XC, &ione, XR, &LDXR, A, &LDA ); } /* * End of PB_Ctzher */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzher2.c000644 000766 000024 00000022021 10363532303 020262 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzher2( PBTYP_T * TYPE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * XC, int LDXC, char * YC, int LDYC, char * XR, int LDXR, char * YR, int LDYR, char * A, int LDA ) #else void PB_Ctzher2( TYPE, UPLO, M, N, K, IOFFD, ALPHA, XC, LDXC, YC, LDYC, XR, LDXR, YR, LDYR, A, LDA ) /* * .. Scalar Arguments .. */ char * UPLO; int IOFFD, K, LDA, LDXC, LDXR, LDYC, LDYR, M, N; char * ALPHA; /* * .. Array Arguments .. */ char * A, * XC, * XR, * YC, * YR; PBTYP_T * TYPE; #endif { /* * Purpose * ======= * * PB_Ctzher2 performs the trapezoidal symmetric or Hermitian rank 2 * operation: * * A := alpha * XC * YR + alpha * YC * XR + A, or * * A := alpha*XC*conjg( YR ) + conjg( alpha )*YC*conjg( XR ) + A, * * where alpha is a scalar, XC and YC are m element vectors, XR and YR * are n element vectors and A is an m by n trapezoidal symmetric * or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (see pblas.h). * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * XC (input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,1) containing the * m by 1 vector XC. * * LDXC (input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, M ). * * YC (input) pointer to CHAR * On entry, YC is an array of dimension (LDYC,1) containing the * m by 1 vector YC. * * LDYC (input) INTEGER * On entry, LDYC specifies the leading dimension of the array * YC. LDYC must be at least max( 1, M ). * * XR (input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,N) containing the * 1 by n vector XR. * * LDXR (input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least 1. * * YR (input) pointer to CHAR * On entry, YR is an array of dimension (LDYR,N) containing the * 1 by n vector YR. * * LDYR (input) INTEGER * On entry, LDYR specifies the leading dimension of the array * YR. LDYR must be at least 1. * * A (input/output) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is updated. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int i1, ione=1, j1, m1, mn, n1, size; char * Calph, type; cmplx Calph8; cmplx16 Calph16; GERC_T gerc; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; type = TYPE->type; if( type == SCPLX ) { Calph = ( (char *)(Calph8) ); PB_Cconjg( TYPE, ALPHA, Calph ); } else if( type == DCPLX ) { Calph = ( (char *)(Calph16) ); PB_Cconjg( TYPE, ALPHA, Calph ); } else { Calph = ALPHA; } if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; gerc = TYPE->Fgerc; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { gerc( &M, &n1, ALPHA, XC, &ione, YR, &LDYR, A, &LDA ); gerc( &M, &n1, Calph, YC, &ione, XR, &LDXR, A, &LDA ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fher2( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( XC, i1, 0, LDXC, size ), &ione, Mptr( YR, 0, j1, LDYR, size ), &LDYR, Mptr( A, i1, j1, LDA, size ), &LDA ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gerc( &m1, &n1, ALPHA, Mptr( XC, i1, 0, LDXC, size ), &ione, Mptr( YR, 0, j1, LDYR, size ), &LDYR, Mptr( A, i1, j1, LDA, size ), &LDA ); gerc( &m1, &n1, Calph, Mptr( YC, i1, 0, LDYC, size ), &ione, Mptr( XR, 0, j1, LDXR, size ), &LDXR, Mptr( A, i1, j1, LDA, size ), &LDA ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; gerc = TYPE->Fgerc; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { gerc( &m1, &n1, ALPHA, XC, &ione, YR, &LDYR, A, &LDA ); gerc( &m1, &n1, Calph, YC, &ione, XR, &LDXR, A, &LDA ); } TYPE->Fher2( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( XC, m1, 0, LDXC, size ), &ione, Mptr( YR, 0, j1, LDYR, size ), &LDYR, Mptr( A, m1, j1, LDA, size ), &LDA ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gerc( &M, &n1, ALPHA, XC, &ione, Mptr( YR, 0, j1, LDYR, size ), &LDYR, Mptr( A, 0, j1, LDA, size ), &LDA ); gerc( &M, &n1, Calph, YC, &ione, Mptr( XR, 0, j1, LDXR, size ), &LDXR, Mptr( A, 0, j1, LDA, size ), &LDA ); } } else { gerc = TYPE->Fgerc; gerc( &M, &N, ALPHA, XC, &ione, YR, &LDYR, A, &LDA ); gerc( &M, &N, Calph, YC, &ione, XR, &LDXR, A, &LDA ); } /* * End of PB_Ctzher2 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzher2k.c000644 000766 000024 00000022315 10363532303 020443 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzher2k( PBTYP_T * TYPE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * AC, int LDAC, char * BC, int LDBC, char * AR, int LDAR, char * BR, int LDBR, char * C, int LDC ) #else void PB_Ctzher2k( TYPE, UPLO, M, N, K, IOFFD, ALPHA, AC, LDAC, BC, LDBC, AR, LDAR, BR, LDBR, C, LDC ) /* * .. Scalar Arguments .. */ char * UPLO; int IOFFD, K, LDAC, LDAR, LDBC, LDBR, LDC, M, N; char * ALPHA; /* * .. Array Arguments .. */ char * AC, * AR, * BC, * BR, * C; PBTYP_T * TYPE; #endif { /* * Purpose * ======= * * PB_Ctzher2k performs the trapezoidal symmetric or Hermitian rank k * operation: * * C := alpha * AC * AR + C, * * where alpha is a scalar, AC is an m by k matrix, AR is an k by n ma- * trix and C is an m by n trapezoidal symmetric or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix C is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of C is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of C is referenced, * * otherwise all of the matrix C is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix C. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * * K (input) INTEGER * On entry, K specifies the number of columns of the matrix AC, * and the number of rows of the matrix AR. K must be at least * zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of C as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal C( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal C( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal C( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * AC (input) pointer to CHAR * On entry, AC is an array of dimension (LDAC,K) containing the * m by k matrix AC. * * LDAC (input) INTEGER * On entry, LDAC specifies the leading dimension of the array * AC. LDAC must be at least max( 1, M ). * * AR (input) pointer to CHAR * On entry, AR is an array of dimension (LDAR,N) containing the * k by n matrix AR. * * LDAR (input) INTEGER * On entry, LDAR specifies the leading dimension of the array * AR. LDAR must be at least K. * * C (input/output) pointer to CHAR * On entry, C is an array of dimension (LDC,N) containing the m * by n matrix A. Only the trapezoidal part of C determined by * UPLO and IOFFD is updated. * * LDC (input) INTEGER * On entry, LDC specifies the leading dimension of the array C. * LDC must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * Calph, * one, type; int i1, j1, m1, mn, n1, size; cmplx Calph8; cmplx16 Calph16; GEMM_T gemm; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; type = TYPE->type; if( type == SCPLX ) { Calph = ( (char *)(Calph8 ) ); PB_Cconjg( TYPE, ALPHA, Calph ); } else if( type == DCPLX ) { Calph = ( (char *)(Calph16) ); PB_Cconjg( TYPE, ALPHA, Calph ); } else { Calph = ALPHA; } if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, AC, &LDAC, BR, &LDBR, one, C, &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, Calph, BC, &LDBC, AR, &LDAR, one, C, &LDC ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fher2k( C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), &n1, &K, ALPHA, Mptr( AC, i1, 0, LDAC, size ), &LDAC, Mptr( BC, i1, 0, LDBC, size ), &LDBC, one, Mptr( C, i1, j1, LDC, size ), &LDC ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, Mptr( AC, i1, 0, LDAC, size ), &LDAC, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( C, i1, j1, LDC, size ), &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, Calph, Mptr( BC, i1, 0, LDBC, size ), &LDBC, Mptr( AR, 0, j1, LDAR, size ), &LDAR, one, Mptr( C, i1, j1, LDC, size ), &LDC ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, AC, &LDAC, BR, &LDBR, one, C, &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, Calph, BC, &LDBC, AR, &LDAR, one, C, &LDC ); } TYPE->Fher2k( C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), &n1, &K, ALPHA, Mptr( AC, m1, 0, LDAC, size ), &LDAC, Mptr( BC, m1, 0, LDBC, size ), &LDBC, one, Mptr( C, m1, j1, LDC, size ), &LDC ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, AC, &LDAC, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( C, 0, j1, LDC, size ), &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, Calph, BC, &LDBC, Mptr( AR, 0, j1, LDAR, size ), &LDAR, one, Mptr( C, 0, j1, LDC, size ), &LDC ); } } else { one = TYPE->one; gemm = TYPE->Fgemm; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &N, &K, ALPHA, AC, &LDAC, BR, &LDBR, one, C, &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &N, &K, Calph, BC, &LDBC, AR, &LDAR, one, C, &LDC ); } /* * End of PB_Ctzher2k */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzherk.c000644 000766 000024 00000017442 10363532303 020366 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzherk( PBTYP_T * TYPE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * AC, int LDAC, char * AR, int LDAR, char * C, int LDC ) #else void PB_Ctzherk( TYPE, UPLO, M, N, K, IOFFD, ALPHA, AC, LDAC, AR, LDAR, C, LDC ) /* * .. Scalar Arguments .. */ char * UPLO; int IOFFD, K, LDAC, LDAR, LDC, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * AC, * AR, * C; #endif { /* * Purpose * ======= * * PB_Ctzherk performs the trapezoidal symmetric or Hermitian rank k * operation: * * C := alpha * AC * AR + C, * * where alpha is a scalar, AC is an m by k matrix, AR is an k by n ma- * trix and C is an m by n trapezoidal symmetric or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix C is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of C is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of C is referenced, * * otherwise all of the matrix C is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix C. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * * K (input) INTEGER * On entry, K specifies the number of columns of the matrix AC, * and the number of rows of the matrix AR. K must be at least * zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of C as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal C( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal C( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal C( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * AC (input) pointer to CHAR * On entry, AC is an array of dimension (LDAC,K) containing the * m by k matrix AC. * * LDAC (input) INTEGER * On entry, LDAC specifies the leading dimension of the array * AC. LDAC must be at least max( 1, M ). * * AR (input) pointer to CHAR * On entry, AR is an array of dimension (LDAR,N) containing the * k by n matrix AR. * * LDAR (input) INTEGER * On entry, LDAR specifies the leading dimension of the array * AR. LDAR must be at least K. * * C (input/output) pointer to CHAR * On entry, C is an array of dimension (LDC,N) containing the m * by n matrix A. Only the trapezoidal part of C determined by * UPLO and IOFFD is updated. * * LDC (input) INTEGER * On entry, LDC specifies the leading dimension of the array C. * LDC must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int i1, j1, m1, mn, n1, size; GEMM_T gemm; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, AC, &LDAC, AR, &LDAR, one, C, &LDC ); if( ( n1 = MIN( M-IOFFD, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fherk( C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), &n1, &K, ALPHA, Mptr( AC, i1, 0, LDAC, size ), &LDAC, one, Mptr( C, i1, j1, LDC, size ), &LDC ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, Mptr( AC, i1, 0, LDAC, size ), &LDAC, Mptr( AR, 0, j1, LDAR, size ), &LDAR, one, Mptr( C, i1, j1, LDC, size ), &LDC ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MIN( M - IOFFD, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, AC, &LDAC, AR, &LDAR, one, C, &LDC ); TYPE->Fherk( C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), &n1, &K, ALPHA, Mptr( AC, m1, 0, LDAC, size ), &LDAC, one, Mptr( C, m1, j1, LDC, size ), &LDC ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, AC, &LDAC, Mptr( AR, 0, j1, LDAR, size ), &LDAR, one, Mptr( C, 0, j1, LDC, size ), &LDC ); } } else { TYPE->Fgemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &N, &K, ALPHA, AC, &LDAC, AR, &LDAR, TYPE->one, C, &LDC ); } /* * End of PB_Ctzherk */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzsymm.c000644 000766 000024 00000032436 10363532303 020422 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzsymm( PBTYP_T * TYPE, char * SIDE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * A, int LDA, char * BC, int LDBC, char * BR, int LDBR, char * CC, int LDCC, char * CR, int LDCR ) #else void PB_Ctzsymm( TYPE, SIDE, UPLO, M, N, K, IOFFD, ALPHA, A, LDA, BC, LDBC, BR, LDBR, CC, LDCC, CR, LDCR ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO; int IOFFD, K, LDA, LDBC, LDBR, LDCC, LDCR, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * A, * BC, * BR, * CC, * CR; #endif { /* * Purpose * ======= * * PB_Ctzsymm performs the matrix-matrix operation * * C := alpha * A * B + C, * * or * * C := alpha * B * A + C, * * where alpha is a scalar, B and C are m by k and k by n matrices and A * is an m by n trapezoidal symmetric or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * SIDE (input) pointer to CHAR * On entry, SIDE specifies whether op( A ) multiplies B from * the left or right as follows: * * SIDE = 'L' or 'l' C := alpha * A * B + C, * * SIDE = 'R' or 'r' C := alpha * B * A + C. * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (input) INTEGER * On entry, K specifies the number of rows of the matrices BR * and CR and the number of columns of the matrices BC and CC. K * must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (input) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * BC (input) pointer to CHAR * On entry, BC is an array of dimension (LDBC,K) containing the * m by k matrix BC. * * LDBC (input) INTEGER * On entry, LDBC specifies the leading dimension of the array * BC. LDBC must be at least max( 1, M ). * * BR (input) pointer to CHAR * On entry, BR is an array of dimension (LDBR,N) containing the * k by n matrix BR. * * LDBR (input) INTEGER * On entry, LDBR specifies the leading dimension of the array * BR. LDBR must be at least K. * * CC (input/output) pointer to CHAR * On entry, CC is an array of dimension (LDCC,K) containing the * m by k matrix CC. On exit, CC is overwritten by the partially * updated matric CC. * * LDCC (input) INTEGER * On entry, LDCC specifies the leading dimension of the array * CC. LDCC must be at least max( 1, M ). * * CR (input/output) pointer to CHAR * On entry, CR is an array of dimension (LDCR,N) containing the * k by n matrix CR. On exit, CR is overwritten by the partially * updated matrix CR. * * LDCR (input) INTEGER * On entry, LDCR specifies the leading dimension of the array * CR. LDCR must be at least K. * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int i1, j1, m1, mn, n1, size; GEMM_T gemm; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( SIDE[0] ) == CLEFT ) { if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &M, &K, &n1, ALPHA, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &M, ALPHA, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fsymm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), &n1, &K, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( BC, i1, 0, LDBC, size ), &LDBC, one, Mptr( CC, i1, 0, LDCC, size ), &LDCC ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &m1, &K, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( CC, i1, 0, LDCC, size ), &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &m1, ALPHA, Mptr( BC, i1, 0, LDBC, size ), &LDBC, Mptr( A, i1, j1, LDA, size ), &LDA, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MIN( M - IOFFD, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &m1, &K, &n1, ALPHA, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &m1, ALPHA, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } TYPE->Fsymm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), &n1, &K, ALPHA, Mptr( A, m1, j1, LDA, size ), &LDA, Mptr( BC, m1, 0, LDBC, size ), &LDBC, one, Mptr( CC, m1, 0, LDCC, size ), &LDCC ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &M, &K, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &M, ALPHA, BC, &LDBC, Mptr( A, 0, j1, LDA, size ), &LDA, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } } else { one = TYPE->one; gemm = TYPE->Fgemm; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &M, &K, &N, ALPHA, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &N, &M, ALPHA, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } } else { if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &M, &K, &n1, ALPHA, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &M, ALPHA, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fsymm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), &K, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &m1, &K, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( CC, i1, 0, LDCC, size ), &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &m1, ALPHA, Mptr( BC, i1, 0, LDBC, size ), &LDBC, Mptr( A, i1, j1, LDA, size ), &LDA, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MIN( M - IOFFD, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &m1, &K, &n1, ALPHA, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &m1, ALPHA, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } TYPE->Fsymm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ), &K, &n1, ALPHA, Mptr( A, m1, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &M, &K, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &n1, &M, ALPHA, BC, &LDBC, Mptr( A, 0, j1, LDA, size ), &LDA, one, Mptr( CR, 0, j1, LDCR, size ), &LDCR ); } } else { one = TYPE->one; gemm = TYPE->Fgemm; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &M, &K, &N, ALPHA, A, &LDA, BR, &LDBR, one, CC, &LDCC ); gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &K, &N, &M, ALPHA, BC, &LDBC, A, &LDA, one, CR, &LDCR ); } } /* * End of PB_Ctzsymm */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzsymv.c000644 000766 000024 00000022334 10363532303 020427 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzsymv( PBTYP_T * TYPE, char * SIDE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * A, int LDA, char * XC, int LDXC, char * XR, int LDXR, char * YC, int LDYC, char * YR, int LDYR ) #else void PB_Ctzsymv( TYPE, SIDE, UPLO, M, N, K, IOFFD, ALPHA, A, LDA, XC, LDXC, XR, LDXR, YC, LDYC, YR, LDYR ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO; int IOFFD, K, LDA, LDXC, LDXR, LDYC, LDYR, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * A, * XC, * XR, * YC, * YR; #endif { /* * Purpose * ======= * * PB_Ctzsymv performs the matrix-vector operation * * y := alpha * A * x + y, * * where alpha is a scalar, x and y are n element vectors and A is an m * by n trapezoidal symmetric or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * SIDE (dummy) pointer to CHAR * In this routine, SIDE is a dummy (unused) argument. * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (input) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is referenced. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * XC (input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,1) containing the * m by 1 vector XC. * * LDXC (input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, M ). * * XR (input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,N) containing the * 1 by n vector XR. * * LDXR (input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least 1. * * YC (input/output) pointer to CHAR * On entry, YC is an array of dimension (LDYC,1) containing the * m by 1 vector YC. On exit, YC is overwritten by the partially * updated vector y. * * LDYC (input) INTEGER * On entry, LDYC specifies the leading dimension of the array * YC. LDYC must be at least max( 1, M ). * * YR (input/output) pointer to CHAR * On entry, YR is an array of dimension (LDYR,N) containing the * 1 by n vector YR. On exit, YR is overwritten by the partially * updated vector y. * * LDYR (input) INTEGER * On entry, LDYR specifies the leading dimension of the array * YR. LDYR must be at least 1. * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int i1, ione=1, j1, m1, mn, n1, size; GEMV_T gemv; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemv = TYPE->Fgemv; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { gemv( C2F_CHAR( NOTRAN ), &M, &n1, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); gemv( C2F_CHAR( TRAN ), &M, &n1, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fsymv( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XC, i1, 0, LDXC, size ), &ione, one, Mptr( YC, i1, 0, LDYC, size ), &ione ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemv( C2F_CHAR( NOTRAN ), &m1, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XR, 0, j1, LDXR, size ), &LDXR, one, Mptr( YC, i1, 0, LDYC, size ), &ione ); gemv( C2F_CHAR( TRAN ), &m1, &n1, ALPHA, Mptr( A, i1, j1, LDA, size ), &LDA, Mptr( XC, i1, 0, LDXC, size ), &ione, one, Mptr( YR, 0, j1, LDYR, size ), &LDYR ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemv = TYPE->Fgemv; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { gemv( C2F_CHAR( NOTRAN ), &m1, &n1, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); gemv( C2F_CHAR( TRAN ), &m1, &n1, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } TYPE->Fsymv( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( A, m1, j1, LDA, size ), &LDA, Mptr( XC, m1, 0, LDXC, size ), &ione, one, Mptr( YC, m1, 0, LDYC, size ), &ione ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemv( C2F_CHAR( NOTRAN ), &M, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, Mptr( XR, 0, j1, LDXR, size ), &LDXR, one, YC, &ione ); gemv( C2F_CHAR( TRAN ), &M, &n1, ALPHA, Mptr( A, 0, j1, LDA, size ), &LDA, XC, &ione, one, Mptr( YR, 0, j1, LDYR, size ), &LDYR ); } } else { one = TYPE->one; gemv = TYPE->Fgemv; gemv( C2F_CHAR( NOTRAN ), &M, &N, ALPHA, A, &LDA, XR, &LDXR, one, YC, &ione ); gemv( C2F_CHAR( TRAN ), &M, &N, ALPHA, A, &LDA, XC, &ione, one, YR, &LDYR ); } /* * End of PB_Ctzsymv */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzsyr.c000644 000766 000024 00000016445 10363532303 020254 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzsyr( PBTYP_T * TYPE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * XC, int LDXC, char * XR, int LDXR, char * A, int LDA ) #else void PB_Ctzsyr( TYPE, UPLO, M, N, K, IOFFD, ALPHA, XC, LDXC, XR, LDXR, A, LDA ) /* * .. Scalar Arguments .. */ char * UPLO; int IOFFD, K, LDA, LDXC, LDXR, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * A, * XC, * XR; #endif { /* * Purpose * ======= * * PB_Ctzsyr performs the trapezoidal symmetric or Hermitian rank 1 ope- * ration: * * A := alpha * XC * XR + A or A := alpha * XC * conjg( XR ) + A, * * where alpha is a scalar, XC is an m element vector, XR is an n ele- * ment vector and A is an m by n trapezoidal symmetric or Hermitian ma- * trix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (see pblas.h). * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * XC (input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,1) containing the * m by 1 vector XC. * * LDXC (input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, M ). * * XR (input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,N) containing the * 1 by n vector XR. * * LDXR (input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least 1. * * A (input/output) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is updated. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int i1, ione=1, j1, m1, mn, n1, size; GERU_T geru; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; geru = TYPE->Fgeru; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) geru( &M, &n1, ALPHA, XC, &ione, XR, &LDXR, A, &LDA ); n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fsyr( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( XC, i1, 0, LDXC, size ), &ione, Mptr( A, i1, j1, LDA, size ), &LDA ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; geru( &m1, &n1, ALPHA, Mptr( XC, i1, 0, LDXC, size ), &ione, Mptr( XR, 0, j1, LDXR, size ), &LDXR, Mptr( A, i1, j1, LDA, size ), &LDA ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; geru = TYPE->Fgeru; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) geru( &m1, &n1, ALPHA, XC, &ione, XR, &LDXR, A, &LDA ); TYPE->Fsyr( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( XC, m1, 0, LDXC, size ), &ione, Mptr( A, m1, j1, LDA, size ), &LDA ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; geru( &M, &n1, ALPHA, XC, &ione, Mptr( XR, 0, j1, LDXR, size ), &LDXR, Mptr( A, 0, j1, LDA, size ), &LDA ); } } else { TYPE->Fgeru( &M, &N, ALPHA, XC, &ione, XR, &LDXR, A, &LDA ); } /* * End of PB_Ctzsyr */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzsyr2.c000644 000766 000024 00000021323 10363532303 020325 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzsyr2( PBTYP_T * TYPE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * XC, int LDXC, char * YC, int LDYC, char * XR, int LDXR, char * YR, int LDYR, char * A, int LDA ) #else void PB_Ctzsyr2( TYPE, UPLO, M, N, K, IOFFD, ALPHA, XC, LDXC, YC, LDYC, XR, LDXR, YR, LDYR, A, LDA ) /* * .. Scalar Arguments .. */ char * UPLO; int IOFFD, K, LDA, LDXC, LDXR, LDYC, LDYR, M, N; char * ALPHA; /* * .. Array Arguments .. */ char * A, * XC, * XR, * YC, * YR; PBTYP_T * TYPE; #endif { /* * Purpose * ======= * * PB_Ctzsyr2 performs the trapezoidal symmetric or Hermitian rank 2 * operation: * * A := alpha * XC * YR + alpha * YC * XR + A, or * * A := alpha*XC*conjg( YR ) + conjg( alpha )*YC*conjg( XR ) + A, * * where alpha is a scalar, XC and YC are m element vectors, XR and YR * are n element vectors and A is an m by n trapezoidal symmetric * or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (see pblas.h). * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * XC (input) pointer to CHAR * On entry, XC is an array of dimension (LDXC,1) containing the * m by 1 vector XC. * * LDXC (input) INTEGER * On entry, LDXC specifies the leading dimension of the array * XC. LDXC must be at least max( 1, M ). * * YC (input) pointer to CHAR * On entry, YC is an array of dimension (LDYC,1) containing the * m by 1 vector YC. * * LDYC (input) INTEGER * On entry, LDYC specifies the leading dimension of the array * YC. LDYC must be at least max( 1, M ). * * XR (input) pointer to CHAR * On entry, XR is an array of dimension (LDXR,N) containing the * 1 by n vector XR. * * LDXR (input) INTEGER * On entry, LDXR specifies the leading dimension of the array * XR. LDXR must be at least 1. * * YR (input) pointer to CHAR * On entry, YR is an array of dimension (LDYR,N) containing the * 1 by n vector YR. * * LDYR (input) INTEGER * On entry, LDYR specifies the leading dimension of the array * YR. LDYR must be at least 1. * * A (input/output) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is updated. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int i1, ione=1, j1, m1, mn, n1, size; GERU_T geru; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; geru = TYPE->Fgeru; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { geru( &M, &n1, ALPHA, XC, &ione, YR, &LDYR, A, &LDA ); geru( &M, &n1, ALPHA, YC, &ione, XR, &LDXR, A, &LDA ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fsyr2( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( XC, i1, 0, LDXC, size ), &ione, Mptr( YR, 0, j1, LDYR, size ), &LDYR, Mptr( A, i1, j1, LDA, size ), &LDA ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; geru( &m1, &n1, ALPHA, Mptr( XC, i1, 0, LDXC, size ), &ione, Mptr( YR, 0, j1, LDYR, size ), &LDYR, Mptr( A, i1, j1, LDA, size ), &LDA ); geru( &m1, &n1, ALPHA, Mptr( YC, i1, 0, LDYC, size ), &ione, Mptr( XR, 0, j1, LDXR, size ), &LDXR, Mptr( A, i1, j1, LDA, size ), &LDA ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; geru = TYPE->Fgeru; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { geru( &m1, &n1, ALPHA, XC, &ione, YR, &LDYR, A, &LDA ); geru( &m1, &n1, ALPHA, YC, &ione, XR, &LDXR, A, &LDA ); } TYPE->Fsyr2( C2F_CHAR( UPLO ), &n1, ALPHA, Mptr( XC, m1, 0, LDXC, size ), &ione, Mptr( YR, 0, j1, LDYR, size ), &LDYR, Mptr( A, m1, j1, LDA, size ), &LDA ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; geru( &M, &n1, ALPHA, XC, &ione, Mptr( YR, 0, j1, LDYR, size ), &LDYR, Mptr( A, 0, j1, LDA, size ), &LDA ); geru( &M, &n1, ALPHA, YC, &ione, Mptr( XR, 0, j1, LDXR, size ), &LDXR, Mptr( A, 0, j1, LDA, size ), &LDA ); } } else { geru = TYPE->Fgeru; geru( &M, &N, ALPHA, XC, &ione, YR, &LDYR, A, &LDA ); geru( &M, &N, ALPHA, YC, &ione, XR, &LDXR, A, &LDA ); } /* * End of PB_Ctzsyr2 */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzsyr2k.c000644 000766 000024 00000023116 10363532303 020502 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzsyr2k( PBTYP_T * TYPE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * AC, int LDAC, char * BC, int LDBC, char * AR, int LDAR, char * BR, int LDBR, char * C, int LDC ) #else void PB_Ctzsyr2k( TYPE, UPLO, M, N, K, IOFFD, ALPHA, AC, LDAC, BC, LDBC, AR, LDAR, BR, LDBR, C, LDC ) /* * .. Scalar Arguments .. */ char * UPLO; int IOFFD, K, LDAC, LDAR, LDBC, LDBR, LDC, M, N; char * ALPHA; /* * .. Array Arguments .. */ char * AC, * AR, * BC, * BR, * C; PBTYP_T * TYPE; #endif { /* * Purpose * ======= * * PB_Ctzsyr2k performs the trapezoidal symmetric o Hermitian rank 2k * operation: * * C := alpha * AC * BR + alpha * BC * AR + C, * * where alpha is a scalar, AC and BC are m by k matrices, AR and BR are * n by k matrices and C is an m by n trapezoidal symmetric or Hermitian * matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix C is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of C is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of C is referenced, * * otherwise all of the matrix C is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix C. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * * K (input) INTEGER * On entry, K specifies the number of columns of the matrices * AC and BC, and the number of rows of the matrices AR and BR. * K must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of C as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal C( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal C( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal C( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * AC (input) pointer to CHAR * On entry, AC is an array of dimension (LDAC,K) containing the * m by k matrix AC. * * LDAC (input) INTEGER * On entry, LDAC specifies the leading dimension of the array * AC. LDAC must be at least max( 1, M ). * * BC (input) pointer to CHAR * On entry, BC is an array of dimension (LDBC,K) containing the * m by k matrix BC. * * LDBC (input) INTEGER * On entry, LDBC specifies the leading dimension of the array * BC. LDBC must be at least max( 1, M ). * * AR (input) pointer to CHAR * On entry, AR is an array of dimension (LDAR,N) containing the * k by n matrix AR. * * LDAR (input) INTEGER * On entry, LDAR specifies the leading dimension of the array * AR. LDAR must be at least K. * * BR (input) pointer to CHAR * On entry, BR is an array of dimension (LDBR,N) containing the * k by n matrix BR. * * LDBR (input) INTEGER * On entry, LDBR specifies the leading dimension of the array * BR. LDBR must be at least K. * * C (input/output) pointer to CHAR * On entry, C is an array of dimension (LDC,N) containing the m * by n matrix C. Only the trapezoidal part of C determined by * UPLO and IOFFD is updated. * * LDC (input) INTEGER * On entry, LDC specifies the leading dimension of the array C. * LDC must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int i1, j1, m1, mn, n1, size; GEMM_T gemm; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, AC, &LDAC, BR, &LDBR, one, C, &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, BC, &LDBC, AR, &LDAR, one, C, &LDC ); } n1 = M - IOFFD; if( ( n1 = MIN( n1, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fsyr2k( C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), &n1, &K, ALPHA, Mptr( AC, i1, 0, LDAC, size ), &LDAC, Mptr( BC, i1, 0, LDBC, size ), &LDBC, one, Mptr( C, i1, j1, LDC, size ), &LDC ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, Mptr( AC, i1, 0, LDAC, size ), &LDAC, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( C, i1, j1, LDC, size ), &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, Mptr( BC, i1, 0, LDBC, size ), &LDBC, Mptr( AR, 0, j1, LDAR, size ), &LDAR, one, Mptr( C, i1, j1, LDC, size ), &LDC ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = M - IOFFD; mn = MIN( mn, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) { gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, AC, &LDAC, BR, &LDBR, one, C, &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, BC, &LDBC, AR, &LDAR, one, C, &LDC ); } TYPE->Fsyr2k( C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), &n1, &K, ALPHA, Mptr( AC, m1, 0, LDAC, size ), &LDAC, Mptr( BC, m1, 0, LDBC, size ), &LDBC, one, Mptr( C, m1, j1, LDC, size ), &LDC ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, AC, &LDAC, Mptr( BR, 0, j1, LDBR, size ), &LDBR, one, Mptr( C, 0, j1, LDC, size ), &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, BC, &LDBC, Mptr( AR, 0, j1, LDAR, size ), &LDAR, one, Mptr( C, 0, j1, LDC, size ), &LDC ); } } else { one = TYPE->one; gemm = TYPE->Fgemm; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &N, &K, ALPHA, AC, &LDAC, BR, &LDBR, one, C, &LDC ); gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &N, &K, ALPHA, BC, &LDBC, AR, &LDAR, one, C, &LDC ); } /* * End of PB_Ctzsyr2k */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctzsyrk.c000644 000766 000024 00000017441 10363532303 020424 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctzsyrk( PBTYP_T * TYPE, char * UPLO, int M, int N, int K, int IOFFD, char * ALPHA, char * AC, int LDAC, char * AR, int LDAR, char * C, int LDC ) #else void PB_Ctzsyrk( TYPE, UPLO, M, N, K, IOFFD, ALPHA, AC, LDAC, AR, LDAR, C, LDC ) /* * .. Scalar Arguments .. */ char * UPLO; int IOFFD, K, LDAC, LDAR, LDC, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * AC, * AR, * C; #endif { /* * Purpose * ======= * * PB_Ctzsyrk performs the trapezoidal symmetric or Hermitian rank k * operation: * * C := alpha * AC * AR + C, * * where alpha is a scalar, AC is an m by k matrix, AR is an k by n ma- * trix and C is an m by n trapezoidal symmetric or Hermitian matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix C is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of C is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of C is referenced, * * otherwise all of the matrix C is referenced. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix C. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * * K (input) INTEGER * On entry, K specifies the number of columns of the matrix AC, * and the number of rows of the matrix AR. K must be at least * zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of C as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal C( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal C( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal C( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * AC (input) pointer to CHAR * On entry, AC is an array of dimension (LDAC,K) containing the * m by k matrix AC. * * LDAC (input) INTEGER * On entry, LDAC specifies the leading dimension of the array * AC. LDAC must be at least max( 1, M ). * * AR (input) pointer to CHAR * On entry, AR is an array of dimension (LDAR,N) containing the * k by n matrix AR. * * LDAR (input) INTEGER * On entry, LDAR specifies the leading dimension of the array * AR. LDAR must be at least K. * * C (input/output) pointer to CHAR * On entry, C is an array of dimension (LDC,N) containing the m * by n matrix A. Only the trapezoidal part of C determined by * UPLO and IOFFD is updated. * * LDC (input) INTEGER * On entry, LDC specifies the leading dimension of the array C. * LDC must be at least max( 1, M ). * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * one; int i1, j1, m1, mn, n1, size; GEMM_T gemm; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( Mupcase( UPLO[0] ) == CLOWER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MAX( 0, -IOFFD ); if( ( n1 = MIN( mn, N ) ) > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, AC, &LDAC, AR, &LDAR, one, C, &LDC ); if( ( n1 = MIN( M-IOFFD, N ) - mn ) > 0 ) { i1 = ( j1 = mn ) + IOFFD; TYPE->Fsyrk( C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), &n1, &K, ALPHA, Mptr( AC, i1, 0, LDAC, size ), &LDAC, one, Mptr( C, i1, j1, LDC, size ), &LDC ); if( ( m1 = M - mn - n1 - IOFFD ) > 0 ) { i1 += n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, Mptr( AC, i1, 0, LDAC, size ), &LDAC, Mptr( AR, 0, j1, LDAR, size ), &LDAR, one, Mptr( C, i1, j1, LDC, size ), &LDC ); } } } else if( Mupcase( UPLO[0] ) == CUPPER ) { size = TYPE->size; one = TYPE->one; gemm = TYPE->Fgemm; mn = MIN( M - IOFFD, N ); if( ( n1 = mn - MAX( 0, -IOFFD ) ) > 0 ) { j1 = mn - n1; if( ( m1 = MAX( 0, IOFFD ) ) > 0 ) gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &m1, &n1, &K, ALPHA, AC, &LDAC, AR, &LDAR, one, C, &LDC ); TYPE->Fsyrk( C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ), &n1, &K, ALPHA, Mptr( AC, m1, 0, LDAC, size ), &LDAC, one, Mptr( C, m1, j1, LDC, size ), &LDC ); } if( ( n1 = N - MAX( 0, mn ) ) > 0 ) { j1 = N - n1; gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &n1, &K, ALPHA, AC, &LDAC, Mptr( AR, 0, j1, LDAR, size ), &LDAR, one, Mptr( C, 0, j1, LDC, size ), &LDC ); } } else { TYPE->Fgemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &M, &N, &K, ALPHA, AC, &LDAC, AR, &LDAR, TYPE->one, C, &LDC ); } /* * End of PB_Ctzsyrk */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctztrmm.c000644 000766 000024 00000023670 10363532303 020414 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctztrmm( PBTYP_T * TYPE, char * SIDE, char * UPLO, char * TRANS, char * DIAG, int M, int N, int K, int IOFFD, char * ALPHA, char * A, int LDA, char * B, int LDB, char * C, int LDC ) #else void PB_Ctztrmm( TYPE, SIDE, UPLO, TRANS, DIAG, M, N, K, IOFFD, ALPHA, A, LDA, B, LDB, C, LDC ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO, * TRANS, * DIAG; int IOFFD, K, LDA, LDB, LDC, M, N; char * ALPHA; /* * .. Array Arguments .. */ PBTYP_T * TYPE; char * A, * B, * C; #endif { /* * Purpose * ======= * * PB_Ctztrmm performs the matrix-matrix operation * * C := alpha * op( A ) * B, * * or * * C := alpha * B * op( A ), * * where alpha is a scalar, A is an m by n trapezoidal triangular ma- * trix, B is an k by n matrix when TRANS is 'N' or 'n' and an m by k * matrix otherwise, and op( A ) is one of * * op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * SIDE (input) pointer to CHAR * On entry, SIDE specifies whether op( A ) multiplies B from * the left or right as follows: * * SIDE = 'L' or 'l' C := alpha * op( A ) * B, * * SIDE = 'R' or 'r' C := alpha * B * op( A ). * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * TRANS (input) pointer to CHAR * On entry, TRANS specifies the form of op( A ) to be used as * follows: * * TRANS = 'N' or 'n': op( A ) = A, * * TRANS = 'T' or 't': op( A ) = A', * * TRANS = 'C' or 'c': op( A ) = A' or conjg( A' ). * * DIAG (input) pointer to CHAR * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit triangular. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (input) INTEGER * On entry, K specifies the number of rows of the matrix B when * TRANS is 'N' or 'n', and the number of columns of the matrix * B otherwise. K must be at least zero. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (input) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is referenced. When DIAG = 'U' or 'u', the * diagonal elements of A are not referenced either, but are * assumed to be unity. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * B (input) pointer to CHAR * On entry, B is an array of dimension (LDB,Kb). Before entry, * with TRANS = 'N' or 'n', the array B must contain the k by n * matrix B corresponding to the columns of A. Otherwise, the * array B must contain the m by k matrix B corresponding to the * rows of A. When TRANS is 'N' or 'n', LDB is at least K, and * Kb is at least N. Otherwise, LDB is at least max(1,M), and Kb * is at least K. * * LDB (input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * LDB must be at least K when TRANS is 'N' or 'n' and * max( 1, M ) otherwise. * * C (input/output) pointer to CHAR * On entry, C is an array of dimension (LDC,Kc). On exit, with * TRANS = 'N' or 'n', the array C contains the m by k matrix C * corresponding to the rows of A. Otherwise, the array C con- * tains the k by n matrix C corresponding to the columns of A. * When TRANS is 'N' or 'n', LDC is at least max( 1, M ), and Kc * is at least K. Otherwise, LDC is at least K, and Kc is at * least N. On exit, C is overwritten by the partial updated * matrix C. * * LDC (input) INTEGER * On entry, LDC specifies the leading dimension of the array C. * LDC must be at least max( 1, M ) when TRANS is 'N' or * 'n' and 1 otherwise. * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * Aptr = NULL; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( ( Mupcase( UPLO[0] ) == CLOWER ) || ( Mupcase( UPLO[0] ) == CUPPER ) ) { Aptr = PB_Cmalloc( M * N * TYPE->size ); TYPE->Ftzpadcpy( C2F_CHAR( UPLO ), C2F_CHAR( DIAG ), &M, &N, &IOFFD, A, &LDA, Aptr, &M ); if( Mupcase( SIDE[0] ) == CLEFT ) { if( Mupcase( TRANS[0] ) == CNOTRAN ) { TYPE->Fgemm( C2F_CHAR( TRANS ), C2F_CHAR( TRAN ), &M, &K, &N, ALPHA, Aptr, &M, B, &LDB, TYPE->one, C, &LDC ); } else { TYPE->Fgemm( C2F_CHAR( TRANS ), C2F_CHAR( NOTRAN ), &K, &N, &M, ALPHA, B, &LDB, Aptr, &M, TYPE->one, C, &LDC ); } } else { if( Mupcase( TRANS[0] ) == CNOTRAN ) { TYPE->Fgemm( C2F_CHAR( TRAN ), C2F_CHAR( TRANS ), &K, &N, &M, ALPHA, B, &LDB, Aptr, &M, TYPE->one, C, &LDC ); } else { TYPE->Fgemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANS ), &M, &K, &N, ALPHA, Aptr, &M, B, &LDB, TYPE->one, C, &LDC ); } } if( Aptr ) free( Aptr ); } else { if( Mupcase( SIDE[0] ) == CLEFT ) { if( Mupcase( TRANS[0] ) == CNOTRAN ) { TYPE->Fgemm( C2F_CHAR( TRANS ), C2F_CHAR( TRAN ), &M, &K, &N, ALPHA, A, &LDA, B, &LDB, TYPE->one, C, &LDC ); } else { TYPE->Fgemm( C2F_CHAR( TRANS ), C2F_CHAR( NOTRAN ), &K, &N, &M, ALPHA, B, &LDB, A, &LDA, TYPE->one, C, &LDC ); } } else { if( Mupcase( TRANS[0] ) == CNOTRAN ) { TYPE->Fgemm( C2F_CHAR( TRAN ), C2F_CHAR( TRANS ), &K, &N, &M, ALPHA, B, &LDB, A, &LDA, TYPE->one, C, &LDC ); } else { TYPE->Fgemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANS ), &M, &K, &N, ALPHA, A, &LDA, B, &LDB, TYPE->one, C, &LDC ); } } } /* * End of PB_Ctztrmm */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Ctztrmv.c000644 000766 000024 00000020673 10363532303 020425 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_Ctztrmv( PBTYP_T * TYPE, char * SIDE, char * UPLO, char * TRANS, char * DIAG, int M, int N, int K, int IOFFD, char * ALPHA, char * A, int LDA, char * X, int LDX, char * Y, int LDY ) #else void PB_Ctztrmv( TYPE, SIDE, UPLO, TRANS, DIAG, M, N, K, IOFFD, ALPHA, A, LDA, X, LDX, Y, LDY ) /* * .. Scalar Arguments .. */ char * SIDE, * UPLO, * TRANS, * DIAG; int IOFFD, K, LDA, LDX, LDY, M, N; char * ALPHA; /* * .. Array Arguments .. */ char * A, * X, * Y; PBTYP_T * TYPE; #endif { /* * Purpose * ======= * * PB_Ctztrmv performs the matrix-vector operation * * y := A * x, or y := A' * x, or y := conjg( A' ) * x, * * where alpha and beta are scalars, x and y are vectors, and A is an * m by n trapezoidal triangular matrix. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * SIDE (dummy) pointer to CHAR * In this routine, SIDE is a dummy (unused) argument. * * UPLO (input) pointer to CHAR * On entry, UPLO specifies which part of the matrix A is to be * referenced as follows: * * UPLO = 'L' or 'l' the lower trapezoid of A is referenced, * * UPLO = 'U' or 'u' the upper trapezoid of A is referenced, * * otherwise all of the matrix A is referenced. * * TRANS (input) pointer to CHAR * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n': y := A*x, * * TRANS = 'T' or 't': y := A'*x, * * TRANS = 'C' or 'c': y := A'*x or y := conjg( A' )*x. * * DIAG (input) pointer to CHAR * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit triangular. * * M (input) INTEGER * On entry, M specifies the number of rows of the matrix A. M * must be at least zero. * * N (input) INTEGER * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * * K (dummy) INTEGER * In this routine, K is a dummy (unused) argument. * * IOFFD (input) INTEGER * On entry, IOFFD specifies the position of the offdiagonal de- * limiting the upper and lower trapezoidal part of A as follows * (see the notes below): * * IOFFD = 0 specifies the main diagonal A( i, i ), * with i = 1 ... MIN( M, N ), * IOFFD > 0 specifies the subdiagonal A( i+IOFFD, i ), * with i = 1 ... MIN( M-IOFFD, N ), * IOFFD < 0 specifies the superdiagonal A( i, i-IOFFD ), * with i = 1 ... MIN( M, N+IOFFD ). * * ALPHA (dummy) pointer to CHAR * In this routine, ALPHA is a dummy (unused) argument. * * A (input) pointer to CHAR * On entry, A is an array of dimension (LDA,N) containing the m * by n matrix A. Only the trapezoidal part of A determined by * UPLO and IOFFD is referenced. When DIAG = 'U' or 'u', the * diagonal elements of A are not referenced either, but are * assumed to be unity. * * LDA (input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least max( 1, M ). * * X (input) pointer to CHAR * On entry, X is an array of dimension (LDX,Kx). Before entry, * with TRANS = 'N' or 'n', the array X must contain the n ele- * ment vector x corresponding to the columns of A. Otherwise, * the array X must contain the m element vector x corresponding * to the rows of A. When TRANS is 'N' or 'n', LDX is at least * 1, and Kx is at least N. Otherwise, LDX is at least max(1,M), * and Kx is at least 1. * * LDX (input) INTEGER * On entry, LDX specifies the leading dimension of the array X. * LDX must be at least 1 when TRANS is 'N' or 'n' and * max( 1, M ) otherwise. * * Y (input/output) pointer to CHAR * On entry, Y is an array of dimension (LDY,Ky). On exit, with * TRANS = 'N' or 'n', the array Y contains the m element vector * y corresponding to the rows of A. Otherwise, the array Y con- * tains the n element vector y corresponding to the columns of * A. When TRANS is 'N' or 'n', LDY is at least max( 1, M ), and * Ky is at least 1. Otherwise, LDY is at least 1, and Ky is at * least N. On exit, Y is overwritten by the partial updated * vector y. * * LDY (input) INTEGER * On entry, LDY specifies the leading dimension of the array Y. * LDY must be at least max( 1, M ) when TRANS is 'N' or * 'n' and 1 otherwise. * * Notes * ===== * N N * ---------------------------- ----------- * | d | | | * M | d Upper | | Upper | * | Lower d | |d | * | d | M | d | * ---------------------------- | d | * | d | * IOFFD < 0 | Lower d | * | d| * N | | * ----------- ----------- * | d Upper| * | d | IOFFD > 0 * M | d | * | d| N * | Lower | ---------------------------- * | | | Upper | * | | |d | * | | | d | * | | | d | * | | |Lower d | * ----------- ---------------------------- * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int ione = 1; char * Aptr = NULL; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) ) return; if( ( Mupcase( UPLO[0] ) == CLOWER ) || ( Mupcase( UPLO[0] ) == CUPPER ) ) { Aptr = PB_Cmalloc( M * N * TYPE->size ); TYPE->Ftzpadcpy( C2F_CHAR( UPLO ), C2F_CHAR( DIAG ), &M, &N, &IOFFD, A, &LDA, Aptr, &M ); if( Mupcase( TRANS[0] ) == CNOTRAN ) { TYPE->Fgemv( C2F_CHAR( TRANS ), &M, &N, ALPHA, Aptr, &M, X, &LDX, TYPE->one, Y, &ione ); } else { TYPE->Fgemv( C2F_CHAR( TRANS ), &M, &N, ALPHA, Aptr, &M, X, &ione, TYPE->one, Y, &LDY ); } if( Aptr ) free( Aptr ); } else { if( Mupcase( TRANS[0] ) == CNOTRAN ) { TYPE->Fgemv( C2F_CHAR( TRANS ), &M, &N, ALPHA, A, &LDA, X, &LDX, TYPE->one, Y, &ione ); } else { TYPE->Fgemv( C2F_CHAR( TRANS ), &M, &N, ALPHA, A, &LDA, X, &ione, TYPE->one, Y, &LDY ); } } /* * End of PB_Ctztrmv */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CVMcontig.c000644 000766 000024 00000034561 10363532303 020606 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CVMcontig( PB_VM_T * VM, int * NRPQ, int * NCPQ, int * IOFF, int * JOFF ) #else void PB_CVMcontig( VM, NRPQ, NCPQ, IOFF, JOFF ) /* * .. Scalar Arguments .. */ int * IOFF, * JOFF, * NCPQ, * NRPQ; PB_VM_T * VM; #endif { /* * Purpose * ======= * * PB_CVMcontig computes the maximum number of contiguous rows and * columns corresponding to the first diagonals of the local virtual * matrix VM. This routine also returns the row and column offset of the * first diagonal entry. * * Arguments * ========= * * VM (local input) pointer to a PB_VM_T structure * On entry, VM is a pointer to a structure of type PB_VM_T, * that contains the virtual matrix information (see pblas.h). * * NRPQ (local output) INTEGER * On exit, NRPQ specifies the number of contiguous rows corres- * ponding to the first diagonals of the local virtual matrix * VM. On exit, NRPQ is at least zero. * * NCPQ (local output) INTEGER * On exit, NCPQ specifies the number of contiguous columns cor- * responding to the first diagonals of the local virtual matrix * VM. On exit, NRPQ is at least zero. * * IOFF (local output) INTEGER * On exit, IOFF is the local row offset of the first row cor- * responding to a diagonal entry of the Virtual matrix VM. If * no diagonals are found, the value zero is returned. On exit, * IOFF is at least zero. * * JOFF (local output) INTEGER * On exit, JOFF is the local column offset of the first column * corresponding to a diagonal entry of the Virtual matrix VM. * If no diagonals are found, the value zero is returned. On * exit, JOFF is at least zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int ColCont=1, FirstD=0, GoSouth, GoEast, RowCont=1, ilow, imbloc, inbloc, iupp, lcmt, lcmtnn=0, lcmt00, lmbloc, lnbloc, low, mb, mblks, mbloc, mcur=0, mcurd, md=0, nb, nblks, nbloc, ncur=0, ncurd, nd=0, npq=0, pmb, qnb, tmp1, tmp2, upp; /* .. * .. Executable Statements .. * */ *NRPQ = 0; *NCPQ = 0; *IOFF = 0; *JOFF = 0; mblks = VM->mblks; nblks = VM->nblks; /* * Quick return if I don't own any blocks. */ if( ( mblks == 0 ) || ( nblks == 0 ) ) return; /* * Retrieve the contents of VM structure fields */ lcmt00 = VM->lcmt00; imbloc = VM->imbloc; mb = VM->mb; lmbloc = VM->lmbloc; iupp = VM->iupp; upp = VM->upp; pmb = VM->nprow * mb; inbloc = VM->inbloc; nb = VM->nb; lnbloc = VM->lnbloc; ilow = VM->ilow; low = VM->low; qnb = VM->npcol * nb; /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the coordinates of the * current entry in the LCM table (mcur,ncur). */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); /* * Go through the table looking for blocks owning diagonal entries. */ if( !( GoSouth ) && !( GoEast ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp * Compute the number of diagonals in this block as well as lcm value (lcntnn) * that its neighbor should have to preserve continuity. */ if( lcmt00 >= 0 ) { tmp2 = ( ( tmp1 = imbloc - lcmt00 ) > 0 ? tmp1 : 0 ); if( tmp2 < inbloc ) { npq = tmp2; lcmtnn = -npq; } else if ( tmp2 == inbloc ) { npq = inbloc; lcmtnn = 0; } else { npq = inbloc; lcmtnn = lcmt00 + npq; } *IOFF += lcmt00; } else { tmp2 = ( ( tmp1 = inbloc + lcmt00 ) > 0 ? tmp1 : 0 ); if( tmp2 < imbloc ) { npq = tmp2; lcmtnn = npq; } else if ( tmp2 == imbloc ) { npq = tmp2; lcmtnn = 0; } else { npq = imbloc; lcmtnn = lcmt00 - npq; } *JOFF -= lcmt00; } /* * Save coordinates of last block owning diagonals. Set FirstD to one, since * a block owning diagonals has been found. */ md = 0; nd = 0; FirstD = 1; /* * Those rows and columns are obviously contiguous */ *NRPQ = *NCPQ = npq; /* * Decide whether one should go south or east in the table: Go east if the * block below the current one only owns lower entries. If this block, however, * owns diagonals, then go south. */ GoSouth = !( GoEast = ( lcmt00 - iupp + upp - pmb < ilow ) ); } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value. */ lcmt00 -= iupp - upp + pmb; mcur++; if( !FirstD ) *IOFF += imbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value accordingly. */ while( ( mcur < mblks ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mcur++; if( !FirstD ) *IOFF += mb; } /* * Return if no more row in the LCM table. */ if( mcur >= mblks ) goto l_end; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mbloc = mb; mcurd = mcur; while( ( mcurd < mblks ) && ( lcmt >= ilow ) ) { if( mcurd == mblks-1 ) mbloc = lmbloc; /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. * If this is not the first one, update the booleans telling if the rows * and/or columns are contiguous. */ if( FirstD ) { RowCont = RowCont && ( ( ( mcurd == md+1 ) && ( lcmtnn <= 0 ) && ( lcmt <= 0 ) ) || ( ( mcurd == md ) && ( ncur == nd+1 ) && ( lcmtnn == lcmt ) ) ); ColCont = ColCont && ( ( ( ncur == nd+1 ) && ( lcmtnn >= 0 ) && ( lcmt >= 0 ) ) || ( ( ncur == nd ) && ( mcurd == md+1 ) && ( lcmtnn == lcmt ) ) ); } /* * Compute the number of diagonals in this block as well as lcm value (lcntnn) * that its neighbor should have to preserve continuity. */ if( lcmt >= 0 ) { tmp2 = ( ( tmp1 = mbloc - lcmt ) > 0 ? tmp1 : 0 ); if( tmp2 < inbloc ) { npq = tmp2; lcmtnn = -npq; } else if ( tmp2 == inbloc ) { npq = inbloc; lcmtnn = 0; } else { npq = inbloc; lcmtnn = lcmt + npq; } if( !FirstD ) *IOFF += lcmt; } else { tmp2 = ( ( tmp1 = inbloc + lcmt ) > 0 ? tmp1 : 0 ); if( tmp2 < mbloc ) { npq = tmp2; lcmtnn = npq; } else if ( tmp2 == mbloc ) { npq = tmp2; lcmtnn = 0; } else { npq = mbloc; lcmtnn = lcmt - npq; } if( !FirstD ) *JOFF -= lcmt; } /* * Save coordinates of last block owning diagonals. Set FirstD to one, since * a block owning diagonals has been found. */ md = mcurd; nd = ncur; FirstD = 1; /* * If rows (resp columns) are still contiguous, add those npq rows (resp. * columns). */ if( RowCont ) *NRPQ += npq; if( ColCont ) *NCPQ += npq; /* * Keep going south until there are no more blocks owning diagonals */ lcmt00 = lcmt; lcmt -= pmb; mcur = mcurd++; } /* * I am done with the first column of the LCM table. Go to the next column. */ lcmt00 += low - ilow + qnb; ncur++; if( !FirstD ) *JOFF += inbloc; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value. */ lcmt00 += low - ilow + qnb; ncur++; if( !FirstD ) *JOFF += inbloc; /* * While there are blocks remaining that own lower entries, keep going east * in the LCM table. Adjust the current LCM value. */ while( ( ncur < nblks ) && ( lcmt00 < low ) ) { lcmt00 += qnb; ncur++; if( !FirstD ) *JOFF += nb; } /* * Return if no more column in the LCM table. */ if( ncur >= nblks ) goto l_end; /* * lcmt00 >= low. The current block owns either diagonals or upper entries. Save * the current position in the LCM table. After this row has been completely * taken care of, re-start from this column and the next row of the LCM table. */ lcmt = lcmt00; nbloc = nb; ncurd = ncur; while( ( ncurd < nblks ) && ( lcmt <= iupp ) ) { if( ncurd == nblks-1 ) nbloc = lnbloc; /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. * If this is not the first one, update the booleans telling if the rows * and/or columns are contiguous. */ if( FirstD ) { RowCont = RowCont && ( ( ( mcur == md+1 ) && ( lcmtnn <= 0 ) && ( lcmt <= 0 ) ) || ( ( mcur == md ) && ( ncurd == nd+1 ) && ( lcmtnn == lcmt ) ) ); ColCont = ColCont && ( ( ( ncurd == nd+1 ) && ( lcmtnn >= 0 ) && ( lcmt >= 0 ) ) || ( ( ncurd == nd ) && ( mcur == md+1 ) && ( lcmtnn == lcmt ) ) ); } /* * Compute the number of diagonals in this block as well as lcm value (lcntnn) * that its neighbor should have to preserve continuity. */ if( lcmt >= 0 ) { tmp2 = ( ( tmp1 = imbloc - lcmt ) > 0 ? tmp1 : 0 ); if( tmp2 < nbloc ) { npq = tmp2; lcmtnn = -npq; } else if ( tmp2 == nbloc ) { npq = nbloc; lcmtnn = 0; } else { npq = nbloc; lcmtnn = lcmt + npq; } if( !FirstD ) *IOFF += lcmt; } else { tmp2 = ( ( tmp1 = nbloc + lcmt ) > 0 ? tmp1 : 0 ); if( tmp2 < imbloc ) { npq = tmp2; lcmtnn = npq; } else if ( tmp2 == imbloc ) { npq = tmp2; lcmtnn = 0; } else { npq = imbloc; lcmtnn = lcmt - npq; } if( !FirstD ) *JOFF -= lcmt; } /* * Save coordinates of last block owning diagonals. Set FirstD to one, since * a block owning diagonals has been found. */ md = mcur; nd = ncurd; FirstD = 1; /* * If rows (resp columns) are still contiguous, add those npq rows (resp. * columns). */ if( RowCont ) *NRPQ += npq; if( ColCont ) *NCPQ += npq; /* * Keep going east until there are no more blocks owning diagonals. */ lcmt00 = lcmt; lcmt += qnb; ncur = ncurd++; } /* * I am done with the first row of the LCM table. Go to the next row. */ lcmt00 -= iupp - upp + pmb; mcur++; if( !FirstD ) *IOFF += imbloc; } /* * Loop over the remaining columns of the LCM table. */ nbloc = nb; while( ( RowCont || ColCont ) && ( ncur < nblks ) ) { if( ncur == nblks-1 ) nbloc = lnbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value accordingly. */ while( ( mcur < mblks ) && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mcur++; if( !FirstD ) *IOFF += mb; } /* * Return if no more row in the LCM table. */ if( mcur >= mblks ) goto l_end; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mbloc = mb; mcurd = mcur; while( ( mcurd < mblks ) && ( lcmt >= low ) ) { if( mcurd == mblks-1 ) mbloc = lmbloc; /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. * If this is not the first one, update the booleans telling if the rows * and/or columns are contiguous. */ if( FirstD ) { RowCont = RowCont && ( ( ( mcurd == md+1 ) && ( lcmtnn <= 0 ) && ( lcmt <= 0 ) ) || ( ( mcurd == md ) && ( ncur == nd+1 ) && ( lcmtnn == lcmt ) ) ); ColCont = ColCont && ( ( ( ncur == nd+1 ) && ( lcmtnn >= 0 ) && ( lcmt >= 0 ) ) || ( ( ncur == nd ) && ( mcurd == md+1 ) && ( lcmtnn == lcmt ) ) ); } /* * Compute the number of diagonals in this block as well as lcm value (lcntnn) * that its neighbor should have to preserve continuity. */ if( lcmt >= 0 ) { tmp2 = ( ( tmp1 = mbloc - lcmt ) > 0 ? tmp1 : 0 ); if( tmp2 < nbloc ) { npq = tmp2; lcmtnn = -npq; } else if ( tmp2 == nbloc ) { npq = nbloc; lcmtnn = 0; } else { npq = nbloc; lcmtnn = lcmt + npq; } if( !FirstD ) *IOFF += lcmt; } else { tmp2 = ( ( tmp1 = nbloc + lcmt ) > 0 ? tmp1 : 0 ); if( tmp2 < mbloc ) { npq = tmp2; lcmtnn = npq; } else if ( tmp2 == mbloc ) { npq = tmp2; lcmtnn = 0; } else { npq = mbloc; lcmtnn = lcmt - npq; } if( !FirstD ) *JOFF -= lcmt; } /* * Save coordinates of last block owning diagonals. Set FirstD to one, since * a block owning diagonals has been found. */ md = mcurd; nd = ncur; FirstD = 1; /* * If rows (resp columns) are still contiguous, add those npq rows (resp. * columns). */ if( RowCont ) *NRPQ += npq; if( ColCont ) *NCPQ += npq; /* * Keep going south until there are no more blocks owning diagonals */ lcmt00 = lcmt; lcmt -= pmb; mcur = mcurd++; } /* * I am done with this column of the LCM table. Go to the next column until * there are no more column in the table. */ lcmt00 += qnb; ncur++; if( !FirstD ) *JOFF += nb; } l_end: /* * If no diagonals were found, reset IOFF and JOFF to zero. */ if( !FirstD ) { *IOFF = 0; *JOFF = 0; } /* * End of PB_CVMcontig */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CVMinit.c000644 000766 000024 00000020405 10363532303 020256 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CVMinit( PB_VM_T * VM, int OFFD, int M, int N, int IMB1, int INB1, int MB, int NB, int MRROW, int MRCOL, int NPROW, int NPCOL, int LCMB ) #else void PB_CVMinit( VM, OFFD, M, N, IMB1, INB1, MB, NB, MRROW, MRCOL, NPROW, NPCOL, LCMB ) /* * .. Scalar Arguments .. */ int IMB1, INB1, LCMB, M, MB, MRCOL, MRROW, N, NB, NPCOL, NPROW, OFFD; /* * .. Array Arguments .. */ PB_VM_T * VM; #endif { /* * Purpose * ======= * * PB_CVMinit initializes a virtual matrix with the information of an m * by n local array owned by the process of relative coordinates * ( MRROW, MRCOL ). * * Arguments * ========= * * VM (local output) pointer to a PB_VM_T structure * On entry, VM is a pointer to a structure of type PB_VM_T. * On exit, VM points to the initialized structure containing * the virtual matrix information (see pblas.h). * * OFFD (global input) INTEGER * On entry, OFFD specifies the off-diagonal of the underlying * matrix of interest as follows: * OFFD = 0 specifies the main diagonal, * OFFD > 0 specifies lower subdiagonals, and * OFFD < 0 specifies upper superdiagonals. * * M (local input) INTEGER * On entry, M specifies the local number of rows of the under- * lying matrix owned by the process of relative coordinates * ( MRROW, MRCOL ). M must be at least zero. * * N (local input) INTEGER * On entry, N specifies the local number of columns of the un- * derlying matrix owned by the process of relative coordinates * ( MRROW, MRCOL ). N must be at least zero. * * IMB1 (global input) INTEGER * On input, IMB1 specifies the global true size of the first * block of rows of the underlying global submatrix. IMB1 must * be at least MIN( 1, M ). * * INB1 (global input) INTEGER * On input, INB1 specifies the global true size of the first * block of columns of the underlying global submatrix. INB1 * must be at least MIN( 1, N ). * * MB (global input) INTEGER * On entry, MB specifies the blocking factor used to partition * the rows of the matrix. MB must be at least one. * * NB (global input) INTEGER * On entry, NB specifies the blocking factor used to partition * the the columns of the matrix. NB must be at least one. * * MRROW (local input) INTEGER * On entry, MRROW specifies the relative row coordinate of the * process that possesses these M rows. MRROW must be least zero * and strictly less than NPROW. * * MRCOL (local input) INTEGER * On entry, MRCOL specifies the relative column coordinate of * the process that possesses these N columns. MRCOL must be * least zero and strictly less than NPCOL. * * NPROW (global input) INTEGER * On entry, NPROW specifies the total number of process rows * over which the matrix is distributed. NPROW must be at least * one. * * NPCOL (global input) INTEGER * On entry, NPCOL specifies the total number of process col- * umns over which the matrix is distributed. NPCOL must be at * least one. * * LCMB (global input) INTEGER * On entry, LCMB specifies the least common multiple of * NPROW * MB and NPCOL * NB. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int tmp1; /* .. * .. Executable Statements .. * */ /* * Initialize the fields of the VM structure */ VM->offd = OFFD; VM->lcmt00 = OFFD; VM->mp = M; VM->imb1 = IMB1; VM->mb = MB; VM->upp = MB - 1; VM->prow = MRROW; VM->nprow = NPROW; VM->nq = N; VM->inb1 = INB1; VM->nb = NB; VM->low = 1 - NB; VM->pcol = MRCOL; VM->npcol = NPCOL; VM->lcmb = LCMB; if( ( M <= 0 ) || ( N <= 0 ) ) { /* * If the local virtual array is empty, then simplify the remaining of the * initialization. */ VM->imbloc = 0; VM->lmbloc = 0; VM->mblks = 0; VM->iupp = ( MRROW ? MB - 1 : ( IMB1 > 0 ? IMB1 - 1 : 0 ) ); VM->inbloc = 0; VM->lnbloc = 0; VM->nblks = 0; VM->ilow = ( MRCOL ? 1 - NB : ( INB1 > 0 ? 1 - INB1 : 0 ) ); VM->lcmt00 += ( VM->low - VM->ilow + MRCOL * NB ) - ( VM->iupp - VM->upp + MRROW * MB ); return; } if( MRROW ) { /* * I am not in the first relative process row. Use the first local row block * size MB to initialize the VM structure. */ VM->lcmt00 -= IMB1 - MB + MRROW * MB; VM->imbloc = MIN( M, MB ); VM->mblks = ( M - 1 ) / MB + 1; VM->iupp = MB - 1; VM->lmbloc = M - ( M / MB ) * MB; if( !( VM->lmbloc ) ) VM->lmbloc = MB; if( MRCOL ) { /* * I am not in the first relative process column. Use the first local column * block size NB to initialize the VM structure. */ VM->inbloc = MIN( N, NB ); VM->ilow = 1 - NB; VM->lcmt00 += INB1 - NB + MRCOL * NB; VM->nblks = ( N - 1 ) / NB + 1; VM->lnbloc = N - ( N / NB ) * NB; if( !( VM->lnbloc ) ) VM->lnbloc = NB; } else { /* * I am in the first relative process column. Use the first column block size * INB1 to initialize the VM structure. */ VM->inbloc = INB1; VM->ilow = 1 - INB1; tmp1 = N - INB1; if( tmp1 ) { /* * There is more than one column block. Compute the number of local column * blocks and the size of the last one. */ VM->nblks = ( tmp1 - 1 ) / NB + 2; VM->lnbloc = tmp1 - ( tmp1 / NB ) * NB; if( !( VM->lnbloc ) ) VM->lnbloc = NB; } else { /* * There is only one column block. */ VM->nblks = 1; VM->lnbloc = INB1; } } } else { /* * I am in the first relative process row. Use the first row block size IMB1 to * initialize the VM structure. */ VM->imbloc = IMB1; VM->iupp = IMB1 - 1; tmp1 = M - IMB1; if( tmp1 ) { /* * There is more than one row block. Compute the number of local row blocks and * the size of the last one. */ VM->mblks = ( tmp1 - 1 ) / MB + 2; VM->lmbloc = tmp1 - ( tmp1 / MB ) * MB; if( !( VM->lmbloc ) ) VM->lmbloc = MB; } else { /* * There is only one row block. */ VM->mblks = 1; VM->lmbloc = IMB1; } if( MRCOL ) { /* * I am not in the first relative process column. Use the first local column * block size NB to initialize the VM structure. */ VM->inbloc = MIN( N, NB ); VM->ilow = 1 - NB; VM->lcmt00 += INB1 - NB + MRCOL * NB; VM->nblks = ( N - 1 ) / NB + 1; VM->lnbloc = N - ( N / NB ) * NB; if( !( VM->lnbloc ) ) VM->lnbloc = NB; } else { /* * I am in the first relative process column. Use the first column block size * INB1 to initialize the VM structure. */ VM->inbloc = INB1; VM->ilow = 1 - INB1; tmp1 = N - INB1; if( tmp1 ) { /* * There is more than one column block. Compute the number of local column * blocks and the size of the last one. */ VM->nblks = ( tmp1 - 1 ) / NB + 2; VM->lnbloc = tmp1 - ( tmp1 / NB ) * NB; if( !( VM->lnbloc ) ) VM->lnbloc = NB; } else { /* * There is only one column block. */ VM->nblks = 1; VM->lnbloc = INB1; } } } /* * End of PB_CVMinit */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CVMloc.c000644 000766 000024 00000060536 10363532303 020101 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_CVMloc( PBTYP_T * TYPE, PB_VM_T * VM, char * VROCS, char * ROCS, char * UNPA, char * TRANS, int MN, int K, char * ALPHA, char * A, int LDA, char * BETA, char * B, int LDB ) #else int PB_CVMloc( TYPE, VM, VROCS, ROCS, UNPA, TRANS, MN, K, ALPHA, A, LDA, BETA, B, LDB ) /* * .. Scalar Arguments .. */ int K, LDA, LDB, MN; char * ALPHA, * BETA; /* * .. Array Arguments .. */ char * VROCS, * ROCS, * UNPA, * TRANS; PBTYP_T * TYPE; PB_VM_T * VM; char * A, * B; #endif { /* * Purpose * ======= * * PB_CVMloc packs a one-dimensional distributed array A into another * one-dimensional distributed array B, or unpacks a one-dimensional * distributed array B into a one-dimensional distributed array A. This * operation is triggered by a virtual distributed array. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * VM (local input) pointer to a PB_VM_T structure * On entry, VM is a pointer to a structure of type PB_VM_T, * that contains the virtual matrix information (see pblas.h). * * VROCS (local input) pointer to CHAR * On entry, VROCS specifies if the rows or columns of the vir- * tual distributed array grid should be used for the packing or * unpacking operation as follows: * VROCS = 'R' or 'r', the rows should be used, * VROCS = 'C' or 'c', the columns should be used. * * ROCS (local input) pointer to CHAR * On entry, ROCS specifies if rows or columns should be packed * or unpacked as follows: * ROCS = 'R' or 'r', rows should be (un)packed, * ROCS = 'C' or 'c', columns should be (un)packed. * * UNPA (local input) pointer to CHAR * On entry, UNPA specifies if the data should be packed or un- * packed as follows: * UNPA = 'P' or 'p', packing (A into B), * UNPA = 'U' or 'u', unpacking (B into A). * * TRANS (local input) pointer to CHAR * On entry, TRANS specifies if conjugation, transposition or * conjugate transposition should occur during the (un)packing * operation as follows: * TRANS = 'N' or 'n', natural (un)packing, * TRANS = 'Z' or 'z', conjugated (un)packing, * TRANS = 'T' or 'T', transposed (un)packing, * TRANS = 'C' or 'c', conjugate transposed (un)packing. * * MN (local input) INTEGER * On entry, MN specifies the number of rows or columns to be * (un)packed. MN must be at least zero. * * K (local input) INTEGER * On entry, K specifies the length of the non-distributed di- * mension to be (un)packed. K must be at least zero. * * ALPHA (local input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) pointer to CHAR * On entry, A points to an array of dimension (LDA, Ka), where * Ka is K when ROCS is 'R' or 'r' and when ROCS is 'C' or 'c', * Ka is IMBLOC+(MBLKS-2)*MB+LMB when VROCS is 'R' or 'r' and * when VROCS is 'C' or 'c', Ka is INBLOC+(NBLKS-2)*NB+LNB. This * array contains unpacked data. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least MAX( 1, K ) when ROCS = 'C' or 'c' and * MAX( 1, IMBLOC+(MBLKS-2)*MB+LMB ) when ROCS is 'R' or 'r' and * VROCS is 'R' or 'r', and MAX( 1, INBLOC+(NBLKS-2)*NB+LNB ) * when ROCS is 'R' or 'r' and VROCS is 'C' or 'c'. * * BETA (local input) pointer to CHAR * On entry, BETA specifies the scalar beta. * * B (local input/local output) pointer to CHAR * On entry, B points to an array of dimension (LDB, Kb). When * TRANS is 'N', 'n', 'Z' or 'z', Kb is K when ROCS is 'R'or * 'r', and when ROCS is 'C' or 'c', Kb is IMBLOC+(MBLKS-2)*MB+ * LMB when VROCS is 'C' or 'c' and when VROCS is 'R', Kb is * INBLOC+(NBLKS-2)*NB+LNB. When TRANS is 'T', 't', 'C' or 'c', * Kb is K when ROCS is 'C' or 'c' and when ROCS is 'R' or 'r', * Kb is IMBLOC+(MBLKS-2)*MB+LMB when VROCS is 'C' or 'c' and * when VROCS is 'R' or 'r', Kb is INBLOC+(NBLKS-2)*NB+LNB. This * array contains unpacked data. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int GoEast, GoSouth, ilow, imbloc, inbloc, inca, incb, iupp, kb, lcmt, lcmt00, lmbloc, lnbloc, low, mb, mblkd, mblks, mbloc, * m, * n, nb, nblkd, nblks, nbloc, notran, npcol, npq=0, nprow, pmb, qnb, rows, size, tmp1, tmp2, upp; MMADD_T add; char * aptrd, * bptrd; /* .. * .. Executable Statements .. * */ mblks = VM->mblks; nblks = VM->nblks; /* * Quick return if I don't own any blocks. */ if( ( mblks == 0 ) || ( nblks == 0 ) ) return( 0 ); /* * Retrieve the contents of VM structure fields */ lcmt00 = VM->lcmt00; imbloc = VM->imbloc; mb = VM->mb; lmbloc = VM->lmbloc; upp = VM->upp; iupp = VM->iupp; nprow = VM->nprow; inbloc = VM->inbloc; nb = VM->nb; lnbloc = VM->lnbloc; low = VM->low; ilow = VM->ilow; npcol = VM->npcol; if( Mupcase( UNPA[0] ) == CPACKING ) { /* * B is the distributed target, A is the distributed source */ if( Mupcase( TRANS[0] ) == CNOTRAN ) { /* * Add A to B */ notran = 1; add = TYPE->Fmmadd; } else if( Mupcase( TRANS[0] ) == CCONJG ) { /* * Add the conjugate of A to B */ notran = 1; add = TYPE->Fmmcadd; } else if( Mupcase( TRANS[0] ) == CTRAN ) { /* * Add the tranpose of A to B */ notran = 0; add = TYPE->Fmmtadd; } else { /* * Add the conjugate tranpose of A to B */ notran = 0; add = TYPE->Fmmtcadd; } } else { /* * A is the distributed target, B is the distributed source */ if( Mupcase( TRANS[0] ) == CNOTRAN ) { /* * Add B to A */ notran = 1; add = TYPE->Fmmdda; } else if( Mupcase( TRANS[0] ) == CCONJG ) { /* * Add the conjugate of B to A */ notran = 1; add = TYPE->Fmmddac; } else if( Mupcase( TRANS[0] ) == CTRAN ) { /* * Add the tranpose of B to A */ notran = 0; add = TYPE->Fmmddat; } else { /* * Add the conjugate tranpose of B to A */ notran = 0; add = TYPE->Fmmddact; } } size = TYPE->size; rows = ( Mupcase( ROCS[0] ) == CROW ); if( Mupcase( VROCS[0] ) == CROW ) { /* * (un)packing using rows of virtual matrix */ if( rows ) { /* * (un)packing rows of mn by k array A. */ inca = size; incb = ( notran ? size : LDB * size ); m = &tmp2; n = &K; } else { /* * (un)packing columns of k by mn array A */ inca = LDA * size; incb = ( notran ? LDB * size : size ); m = &K; n = &tmp2; } kb = MN; /* * From the (un)packing point of view the only valuable shortcut is when the * virtual grid and the blocks are square, and the offset is zero or the grid * is 1x1. */ if( ( ( lcmt00 == 0 ) && ( VM->imb1 == VM->inb1 ) && ( mb == nb ) && ( nprow == npcol ) ) || ( ( nprow == 1 ) && ( npcol == 1 ) ) ) { if( VM->prow == VM->pcol ) { npq = ( ( mblks < 2 ) ? imbloc : imbloc + ( mblks - 2 ) * mb + lmbloc ); npq = MIN( npq, kb ); if( rows ) add( &npq, &K, ALPHA, A, &LDA, BETA, B, &LDB ); else add( &K, &npq, ALPHA, A, &LDA, BETA, B, &LDB ); } return( npq ); } pmb = nprow * mb; qnb = npcol * nb; /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); /* * Go through the table looking for blocks owning diagonal entries. */ if( !( GoSouth ) && !( GoEast ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ if( lcmt00 >= 0 ) { tmp1 = imbloc - lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A+lcmt00*inca, &LDA, BETA, B, &LDB ); } else { tmp1 = inbloc + lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, B-lcmt00*incb, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the pointer to A. The pointer to B remains unchanged. */ lcmt00 -= iupp - upp + pmb; mblks--; A += imbloc * inca; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the pointer to A accordingly. */ while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; A += mb * inca; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return( npq ); /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; aptrd = A; while( mblkd && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd+lcmt*inca, &LDA, BETA, B, &LDB ); } else { tmp1 = inbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd, &LDA, BETA, B-lcmt*incb, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; aptrd += mbloc * inca; } /* * I am done with the first column of the LCM table. Go to the next column. */ lcmt00 += low - ilow + qnb; nblks--; B += inbloc * incb; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as * well as the pointer to B. The pointer to A remains unchanged. */ lcmt00 += low - ilow + qnb; nblks--; B += inbloc * incb; /* * While there are blocks remaining that own lower entries, keep going east * in the LCM table. Adjust the current LCM value as well as the pointer to * B accordingly. */ while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; B += nb * incb; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return( npq ); /* * lcmt00 >= low. The current block owns either diagonals or upper entries. Save * the current position in the LCM table. After this row has been completely * taken care of, re-start from this column and the next row of the LCM table. */ lcmt = lcmt00; nblkd = nblks; bptrd = B; while( nblkd && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ nbloc = ( ( nblkd == 1 ) ? lnbloc : nb ); if( lcmt >= 0 ) { tmp1 = imbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A+lcmt*inca, &LDA, BETA, bptrd, &LDB ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, bptrd-lcmt*incb, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going east until there are no more blocks owning diagonals. */ lcmt += qnb; nblkd--; bptrd += nbloc * incb; } /* * I am done with the first row of the LCM table. Go to the next row. */ lcmt00 -= iupp - upp + pmb; mblks--; A += imbloc * inca; } /* * Loop over the remaining columns of the LCM table. */ do { /* * If the current block does not have diagonal elements, find the closest one in * the LCM table having some. */ if( ( lcmt00 < low ) || ( lcmt00 > upp ) ) { while( mblks && nblks ) { while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; A += mb * inca; } if( lcmt00 >= low ) break; while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; B += nb * incb; } if( lcmt00 <= upp ) break; } } if( !( mblks ) || !( nblks ) ) return( npq ); /* * The current block owns diagonals. Save the current position in the LCM table. * After this column has been completely taken care of, re-start from this row * and the next column in the LCM table. */ nbloc = ( ( nblks == 1 ) ? lnbloc : nb ); lcmt = lcmt00; mblkd = mblks; aptrd = A; while( mblkd && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd+lcmt*inca, &LDA, BETA, B, &LDB ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd, &LDA, BETA, B-lcmt*incb, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; aptrd += mbloc * inca; } /* * I am done with this column of the LCM table. Go to the next column ... */ lcmt00 += qnb; nblks--; B += nbloc * incb; /* * ... until there are no more columns. */ } while( nblks > 0 ); /* * Return the number of diagonals found. */ return( npq ); } else { /* * (un)packing using columns of virtual matrix */ if( rows ) { /* * (un)packing rows of mn by k array A */ inca = size; incb = ( notran ? size : LDB * size ); m = &tmp2; n = &K; } else { /* * (un)packing columns of k by mn array A */ inca = LDA * size; incb = ( notran ? LDB * size : size ); m = &K; n = &tmp2; } kb = MN; /* * From the (un)packing point of view the only valuable shortcut is when the * virtual grid and the blocks are square, and the offset is zero or the grid * is 1x1. */ if( ( ( lcmt00 == 0 ) && ( VM->imb1 == VM->inb1 ) && ( mb == nb ) && ( nprow == npcol ) ) || ( ( nprow == 1 ) && ( npcol == 1 ) ) ) { if( VM->prow == VM->pcol ) { npq = ( ( nblks < 2 ) ? inbloc : inbloc + ( nblks - 2 ) * nb + lnbloc ); npq = MIN( npq, kb ); if( rows ) add( &npq, &K, ALPHA, A, &LDA, BETA, B, &LDB ); else add( &K, &npq, ALPHA, A, &LDA, BETA, B, &LDB ); } return( npq ); } pmb = nprow * mb; qnb = npcol * nb; /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); if( !( GoSouth ) && !( GoEast ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ if( lcmt00 >= 0 ) { tmp1 = imbloc - lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, B+lcmt00*incb, &LDB ); } else { tmp1 = inbloc + lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A-lcmt00*inca, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the pointer to B. The pointer to A remains unchanged. */ lcmt00 -= iupp - upp + pmb; mblks--; B += imbloc * incb; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the pointer to B accordingly. */ while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; B += mb * incb; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return( npq ); /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; bptrd = B; while( mblkd && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, bptrd+lcmt*incb, &LDB ); } else { tmp1 = inbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A-lcmt*inca, &LDA, BETA, bptrd, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; bptrd += mbloc * incb; } /* * I am done with the first column of the LCM table. Go to the next column. */ lcmt00 += low - ilow + qnb; nblks--; A += inbloc * inca; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as * well as the pointer to A. The pointer to B remains unchanged. */ lcmt00 += low - ilow + qnb; nblks--; A += inbloc * inca; /* * While there are blocks remaining that own lower entries, keep going east * in the LCM table. Adjust the current LCM value as well as the pointer to * A accordingly. */ while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; A += nb * inca; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return( npq ); /* * lcmt00 >= low. The current block owns either diagonals or upper entries. Save * the current position in the LCM table. After this row has been completely * taken care of, re-start from this column and the next row of the LCM table. */ lcmt = lcmt00; nblkd = nblks; aptrd = A; while( nblkd && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ nbloc = ( ( nblkd == 1 ) ? lnbloc : nb ); if( lcmt >= 0 ) { tmp1 = imbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd, &LDA, BETA, B+lcmt*incb, &LDB ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd-lcmt*inca, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going east until there are no more blocks owning diagonals. */ lcmt += qnb; nblkd--; aptrd += nbloc * inca; } /* * I am done with the first row of the LCM table. Go to the next row. */ lcmt00 -= iupp - upp + pmb; mblks--; B += imbloc * incb; } /* * Loop over the remaining columns of the LCM table. */ do { /* * If the current block does not have diagonal elements, find the closest one in * the LCM table having some. */ if( ( lcmt00 < low ) || ( lcmt00 > upp ) ) { while( mblks && nblks ) { while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; B += mb * incb; } if( lcmt00 >= low ) break; while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; A += nb * inca; } if( lcmt00 <= upp ) break; } } if( !( mblks ) || !( nblks ) ) return( npq ); /* * The current block owns diagonals. Save the current position in the LCM table. * After this column has been completely taken care of, re-start from this row * and the next column in the LCM table. */ nbloc = ( ( nblks == 1 ) ? lnbloc : nb ); lcmt = lcmt00; mblkd = mblks; bptrd = B; while( mblkd && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, bptrd+lcmt*incb, &LDB ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A-lcmt*inca, &LDA, BETA, bptrd, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; bptrd += mbloc * incb; } /* * I am done with this column of the LCM table. Go to the next column ... */ lcmt00 += qnb; nblks--; A += nbloc * inca; /* * ... until there are no more columns. */ } while( nblks > 0 ); /* * Return the number of diagonals found. */ return( npq ); } /* * End of PB_CVMloc */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CVMnpq.c000644 000766 000024 00000025065 10363532303 020120 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_CVMnpq( PB_VM_T * VM ) #else int PB_CVMnpq( VM ) /* * .. Array Arguments .. */ PB_VM_T * VM; #endif { /* * Purpose * ======= * * PB_CVMnpq computes the number of diagonal entries in the virtual ma- * specified by VM. * * Arguments * ========= * * VM (local input) pointer to a PB_VM_T structure * On entry, VM is a pointer to a structure of type PB_VM_T, * that contains the virtual matrix information (see pblas.h). * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int GoEast, GoSouth, Pmb, Qnb, gcdb, ilow, imbloc, inbloc, iupp, kmax, kmin, k1, k2, k3, lcmb, lcmp, lcmq, lcmt, lcmt00, lmbloc, lnbloc, low, l1, l2, l3, m, mb, mblkd, mblks, mbloc, n, nb, nblkd, nblks, nbloc, nlcmblks, npcol, npq=0, nprow, tmp1, tmp2, upp; /* .. * .. Executable Statements .. * */ m = VM->mp; n = VM->nq; /* * Quick return if I don't own any data. */ if( ( m == 0 ) || ( n == 0 ) ) return( 0 ); /* * The only valuable shortcut is when the virtual grid and the blocks are * square, and the offset is zero or the grid is 1x1. */ mb = VM->mb; nprow = VM->nprow; nb = VM->nb; npcol = VM->npcol; if( ( ( VM->offd == 0 ) && ( VM->imb1 == VM->inb1 ) && ( mb == nb ) && ( nprow == npcol ) ) || ( ( nprow == 1 ) && ( npcol == 1 ) ) ) { if( VM->prow == VM->pcol ) return( MIN( m, n ) ); else return( 0 ); } /* * Retrieve the contents of VM structure fields */ lcmt00 = VM->lcmt00; mblks = VM->mblks; imbloc = VM->imbloc; lmbloc = VM->lmbloc; iupp = VM->iupp; upp = VM->upp; nblks = VM->nblks; inbloc = VM->inbloc; lnbloc = VM->lnbloc; ilow = VM->ilow; low = VM->low; lcmb = VM->lcmb; Pmb = nprow * mb; Qnb = npcol * nb; /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); /* * Go through the table looking for blocks owning diagonal entries. */ if( !( GoSouth ) && !( GoEast ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ npq += ( lcmt00 >= 0 ? ( ( tmp2 = ( tmp1 = imbloc - lcmt00 ) > 0 ? tmp1 : 0 ) < inbloc ? tmp2 : inbloc ) : ( ( tmp2 = ( tmp1 = inbloc + lcmt00 ) > 0 ? tmp1 : 0 ) > imbloc ? imbloc : tmp2 ) ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + Pmb ) ) < ilow ) ); } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value. */ lcmt00 -= iupp - upp + Pmb; mblks--; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value accordingly. */ while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= Pmb; mblks--; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return( npq ); /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; mbloc = mb; while( mblkd && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; npq += ( lcmt >= 0 ? ( ( tmp2 = ( tmp1 = mbloc - lcmt ) > 0 ? tmp1 : 0 ) < inbloc ? tmp2 : inbloc ) : ( ( tmp2 = ( tmp1 = inbloc + lcmt ) > 0 ? tmp1 : 0 ) > mbloc ? mbloc : tmp2 ) ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt00 = lcmt; lcmt -= Pmb; mblks = mblkd--; } /* * I am done with the first column of the LCM table. Go to the next column. */ lcmt00 += low - ilow + Qnb; nblks--; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value. */ lcmt00 += low - ilow + Qnb; nblks--; /* * While there are blocks remaining that own lower entries, keep going east * in the LCM table. Adjust the current LCM value accordingly. */ while( nblks && ( lcmt00 < low ) ) { lcmt00 += Qnb; nblks--; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return( npq ); /* * lcmt00 >= low. The current block owns either diagonals or upper entries. Save * the current position in the LCM table. After this row has been completely * taken care of, re-start from this column and the next row of the LCM table. */ lcmt = lcmt00; nblkd = nblks; nbloc = nb; while( nblkd && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ if( nblkd == 1 ) nbloc = lnbloc; npq += ( lcmt >= 0 ? ( ( tmp2 = ( tmp1 = imbloc - lcmt ) > 0 ? tmp1 : 0 ) < nbloc ? tmp2 : nbloc ) : ( ( tmp2 = ( tmp1 = nbloc + lcmt ) > 0 ? tmp1 : 0 ) > imbloc ? imbloc : tmp2 ) ); /* * Keep going east until there are no more blocks owning diagonals. */ lcmt00 = lcmt; lcmt += Qnb; nblks = nblkd--; } /* * I am done with the first row of the LCM table. Go to the next row. */ lcmt00 -= iupp - upp + Pmb; mblks--; } /* * If the current block does not have diagonal elements, find the closest one in * the LCM table having some. */ if( lcmt00 < low || lcmt00 > upp ) { while( mblks && nblks ) { while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= Pmb; mblks--; } if( lcmt00 >= low ) break; while( nblks && ( lcmt00 < low ) ) { lcmt00 += Qnb; nblks--; } if( lcmt00 <= upp ) break; } } if( !( mblks ) || !( nblks ) ) return( npq ); /* * Figure out how many "full" lcm blocks are remaining. */ gcdb = ( Pmb * Qnb ) / lcmb; if( lcmt00 > 0 ) { kmin = - ( lcmb / gcdb ); kmax = ( lcmb - Qnb ) / gcdb; tmp1 = ( mblks - 1 ) / ( lcmp = lcmb / Pmb ); tmp2 = nblks / ( lcmq = lcmb / Qnb ); } else if( lcmt00 < 0 ) { kmin = - ( ( lcmb - Pmb ) / gcdb ); kmax = lcmb / gcdb; tmp1 = mblks / ( lcmp = lcmb / Pmb ); tmp2 = ( nblks - 1 ) / ( lcmq = lcmb / Qnb ); } else { kmin = - ( ( lcmb - Pmb ) / gcdb ); kmax = ( lcmb - Qnb ) / gcdb; tmp1 = mblks / ( lcmp = lcmb / Pmb ); tmp2 = nblks / ( lcmq = lcmb / Qnb ); } /* * The last block, even if it is an lcm block will be handled separately */ nlcmblks = MIN( tmp1, tmp2 ); if( nlcmblks ) nlcmblks--; /* * Compute the lcm block part, update mblks and nblks */ if( nlcmblks ) { tmp2 = 0; k1 = -lcmt00; k1 = ICEIL( k1, gcdb ); l1 = k1 - 1; l1 = MIN( l1, kmax ); k1 = MAX( k1, kmin ); k3 = upp - lcmt00; k3 = FLOOR( k3, gcdb ); k3 = MIN( k3, kmax ); l3 = low - lcmt00; l3 = ICEIL( l3, gcdb ); l3 = MAX( l3, kmin ); if( k1 <= k3 ) { k2 = mb - nb - lcmt00; k2 = ICEIL( k2, gcdb ); if( k2 < k1 ) { /* * k2 < k1 */ tmp1 = k3 - k1 + 1; tmp2 = tmp1 * ( mb - lcmt00 ); tmp1 *= ( k3 + k1 )*gcdb; tmp2 += ( tmp1 > 0 ? -( tmp1 / 2 ) : (-tmp1) / 2 ); } else if( k2 > k3 ) { /* * k2 = k3 + 1 */ tmp2 = ( k3 - k1 + 1 ) * nb; } else { /* * k1 <= k2 <= k3 */ tmp1 = k3 - k2 + 1; tmp2 = ( k2 - k1 ) * nb + tmp1 * ( mb - lcmt00 ); tmp1 *= ( k3 + k2 ) * gcdb; tmp2 += ( tmp1 > 0 ? -( tmp1 / 2 ) : (-tmp1) / 2 ); } } if( l3 <= l1 ) { l2 = mb - nb - lcmt00; l2 = FLOOR( l2, gcdb ); if( l2 > l1 ) { /* * l2 > l1 */ tmp1 = l1 - l3 + 1; tmp2 += tmp1 * ( nb + lcmt00 ); tmp1 *= ( l3 + l1 ) * gcdb; tmp2 += ( tmp1 > 0 ? ( tmp1 / 2 ) : -( (-tmp1) / 2 ) ); } else if( l2 < l3 ) { /* * l2 = l3 - 1 */ tmp2 += ( l1 - l3 + 1 ) * mb; } else { /* * l3 <= l2 <= l1 */ tmp1 = l2 - l3 + 1; tmp2 += ( l1 - l2 ) * mb + tmp1 * ( nb + lcmt00 ); tmp1 *= ( l3 + l2 ) * gcdb; tmp2 += ( tmp1 > 0 ? ( tmp1 / 2 ) : -( (-tmp1) / 2 ) ); } } npq += nlcmblks * tmp2; mblks -= nlcmblks * lcmp; nblks -= nlcmblks * lcmq; } /* * Handle last partial (lcm) block separately */ nbloc = nb; while( nblks ) { /* * The current block owns diagonals. Save the current position in the LCM table. * After this column has been completely taken care of, re-start from this row * and the next column in the LCM table. */ if( nblks == 1 ) nbloc = lnbloc; while( mblks && lcmt00 > upp ) { lcmt00 -= Pmb; mblks--; } if( mblks <= 0 ) return( npq ); lcmt = lcmt00; mblkd = mblks; mbloc = mb; while( mblkd && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; npq += ( lcmt >= 0 ? ( ( tmp2 = ( tmp1 = mbloc - lcmt ) > 0 ? tmp1 : 0 ) < nbloc ? tmp2 : nbloc ) : ( ( tmp2 = ( tmp1 = nbloc + lcmt ) > 0 ? tmp1 : 0 ) > mbloc ? mbloc : tmp2 ) ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt00 = lcmt; lcmt -= Pmb; mblks = mblkd--; } /* * I am done with this column of the LCM table. Go to the next column ... */ lcmt00 += Qnb; nblks--; /* * ... until there are no more columns. */ } /* * Return the number of diagonals found. */ return( npq ); /* * End of PB_CVMnpq */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CVMpack.c000644 000766 000024 00000057377 10363532303 020253 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_CVMpack( PBTYP_T * TYPE, PB_VM_T * VM, char * VROCS, char * ROCS, char * UNPA, char * TRANS, int MN, int K, char * ALPHA, char * A, int LDA, char * BETA, char * B, int LDB ) #else int PB_CVMpack( TYPE, VM, VROCS, ROCS, UNPA, TRANS, MN, K, ALPHA, A, LDA, BETA, B, LDB ) /* * .. Scalar Arguments .. */ int K, LDA, LDB, MN; char * ALPHA, * BETA; /* * .. Array Arguments .. */ char * VROCS, * ROCS, * UNPA, * TRANS; PBTYP_T * TYPE; PB_VM_T * VM; char * A, * B; #endif { /* * Purpose * ======= * * PB_CVMpack packs a one-dimensional distributed array A into B, or * unpacks an array B into a one-dimensional distributed array A. This * operation is triggered by a virtual distributed array. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (see pblas.h). * * VM (local input) pointer to a VM structure * On entry, VM is a pointer to a structure of type PB_VM_T, * that contains the virtual matrix information (see pblas.h). * * VROCS (local input) pointer to CHAR * On entry, VROCS specifies if the rows or columns of the vir- * tual distributed array grid should be used for the packing or * unpacking operation as follows: * VROCS = 'R' or 'r', the rows should be used, * VROCS = 'C' or 'c', the columns should be used. * * ROCS (local input) pointer to CHAR * On entry, ROCS specifies if rows or columns should be used * packed or unpacked as follows: * ROCS = 'R' or 'r', rows should be (un)packed, * ROCS = 'C' or 'c', columns should be (un)packed. * * UNPA (local input) pointer to CHAR * On entry, UNPA specifies if the data should be packed or un- * packed as follows: * UNPA = 'P' or 'p', packing, * UNPA = 'U' or 'u', unpacking. * * TRANS (local input) pointer to CHAR * On entry, TRANS specifies if conjugation, transposition or * conjugate transposition should occur during the (un)packing * operation as follows: * TRANS = 'N' or 'n', natural (un)packing, * TRANS = 'Z' or 'z', conjugated (un)packing, * TRANS = 'T' or 't', transposed (un)packing, * TRANS = 'C' or 'c', conjugate transposed (un)packing. * * MN (local input) INTEGER * On entry, MN specifies the length of the distributed dimen- * sion to be (un)packed. MN must be at least zero. * * K (local input) INTEGER * On entry, K specifies the length of the non-distributed di- * mension to be (un)packed. K must be at least zero. * * ALPHA (local input) pointer to CHAR * On entry, ALPHA specifies the scalar alpha. * * A (local input/local output) pointer to CHAR * On entry, A points to an array of dimension (LDA, Ka), where * Ka is K when ROCS is 'R' or 'r' and when ROCS is 'C' or 'c', * Ka is IMBLOC+(MBLKS-2)*MB+LMB when VROCS is 'R' or 'r' and * when VROCS is 'C' or 'c', Ka is INBLOC+(NBLKS-2)*NB+LNB. This * array contains unpacked data. * * LDA (local input) INTEGER * On entry, LDA specifies the leading dimension of the array A. * LDA must be at least MAX( 1, K ) when ROCS = 'C' or 'c' and * MAX( 1, IMBLOC+(MBLKS-2)*MB+LMB ) when ROCS is 'R' or 'r' and * VROCS is 'R' or 'r', and MAX( 1, INBLOC+(NBLKS-2)*NB+LNB ) * when ROCS is 'R' or 'r' and VROCS is 'C' or 'c'. * * BETA (local input) pointer to CHAR * On entry, BETA specifies the scalar beta. * * B (local input/local output) pointer to CHAR * On entry, B points to an array of dimension (LDB,*). When * ROCS is 'C' or 'c', and TRANS is 'N', 'n', 'Z' or 'Z', B * points to an K by MN array. When TRANS is 'T', 't', 'C' or * 'c', B points to an MN by K array. When ROCS is 'R' or 'r', * and TRANS is 'T', 't', 'C' or 'c', B points to an K by MN ar- * ray. When TRANS is 'N', 'n', 'Z' or 'z', B points to an MN by * K array. This array contains the packed data. * * LDB (local input) INTEGER * On entry, LDB specifies the leading dimension of the array B. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int GoEast, GoSouth, ilow, imbloc, inbloc, inca, incb, iupp, kb, lcmt, lcmt00, lmbloc, lnbloc, low, mb, mblkd, mblks, mbloc, * m, * n, nb, nblkd, nblks, nbloc, notran, npcol, npq=0, nprow, pmb, qnb, rows, size, tmp1, tmp2, upp; char * aptrd; MMADD_T add; /* .. * .. Executable Statements .. * */ mblks = VM->mblks; nblks = VM->nblks; /* * Quick return if I don't own any blocks. */ if( ( mblks == 0 ) || ( nblks == 0 ) ) return( 0 ); /* * Retrieve the contents of VM structure fields */ lcmt00 = VM->lcmt00; imbloc = VM->imbloc; mb = VM->mb; lmbloc = VM->lmbloc; upp = VM->upp; iupp = VM->iupp; nprow = VM->nprow; inbloc = VM->inbloc; nb = VM->nb; lnbloc = VM->lnbloc; low = VM->low; ilow = VM->ilow; npcol = VM->npcol; if( Mupcase( UNPA[0] ) == CPACKING ) { /* * B is the target packed buffer, A is the distributed source */ if( Mupcase( TRANS[0] ) == CNOTRAN ) { /* * Add A to B */ notran = 1; add = TYPE->Fmmadd; } else if( Mupcase( TRANS[0] ) == CCONJG ) { /* * Add the conjugate of A to B */ notran = 1; add = TYPE->Fmmcadd; } else if( Mupcase( TRANS[0] ) == CTRAN ) { /* * Add the tranpose of A to B */ notran = 0; add = TYPE->Fmmtadd; } else { /* * Add the conjugate tranpose of A to B */ notran = 0; add = TYPE->Fmmtcadd; } } else { /* * B is the source packed buffer, A is the distributed target */ if( Mupcase( TRANS[0] ) == CNOTRAN ) { /* * Add B to A */ notran = 1; add = TYPE->Fmmdda; } else if( Mupcase( TRANS[0] ) == CCONJG ) { /* * Add the conjugate of B to A */ notran = 1; add = TYPE->Fmmddac; } else if( Mupcase( TRANS[0] ) == CTRAN ) { /* * Add the tranpose of B to A */ notran = 0; add = TYPE->Fmmddat; } else { /* * Add the conjugate tranpose of B to A */ notran = 0; add = TYPE->Fmmddact; } } size = TYPE->size; rows = ( Mupcase( ROCS[0] ) == CROW ); if( Mupcase( VROCS[0] ) == CROW ) { /* * (un)packing using rows of virtual matrix */ if( rows ) { /* * (un)packing rows of mn by k array A. */ inca = size; incb = ( notran ? size : LDB * size ); m = &tmp2; n = &K; } else { /* * (un)packing columns of k by mn array A */ inca = LDA * size; incb = ( notran ? LDB * size : size ); m = &K; n = &tmp2; } kb = MN; /* * From the (un)packing point of view the only valuable shortcut is when the * virtual grid and the blocks are square, and the offset is zero or the grid * is 1x1. */ if( ( ( lcmt00 == 0 ) && ( VM->imb1 == VM->inb1 ) && ( mb == nb ) && ( nprow == npcol ) ) || ( ( nprow == 1 ) && ( npcol == 1 ) ) ) { if( VM->prow == VM->pcol ) { npq = ( ( mblks < 2 ) ? imbloc : imbloc + ( mblks - 2 ) * mb + lmbloc ); npq = MIN( npq, kb ); if( rows ) add( &npq, &K, ALPHA, A, &LDA, BETA, B, &LDB ); else add( &K, &npq, ALPHA, A, &LDA, BETA, B, &LDB ); } return( npq ); } pmb = nprow * mb; qnb = npcol * nb; /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); if( !( GoSouth ) && !( GoEast ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ if( lcmt00 >= 0 ) { tmp1 = imbloc - lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A+lcmt00*inca, &LDA, BETA, B, &LDB ); } else { tmp1 = inbloc + lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); B += tmp2 * incb; /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the pointer to A. The pointer to B remains unchanged. */ lcmt00 -= iupp - upp + pmb; mblks--; A += imbloc * inca; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the pointer to A accordingly. */ while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; A += mb * inca; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return( npq ); /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; aptrd = A; while( mblkd && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd+lcmt*inca, &LDA, BETA, B, &LDB ); } else { tmp1 = inbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; aptrd += mbloc * inca; B += tmp2 * incb; } /* * I am done with the first column of the LCM table. Go to the next column. */ lcmt00 += low - ilow + qnb; nblks--; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as * well as the pointer to B. The pointer to A remains unchanged. */ lcmt00 += low - ilow + qnb; nblks--; /* * While there are blocks remaining that own lower entries, keep going east * in the LCM table. Adjust the current LCM value as well as the pointer to * B accordingly. */ while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return( npq ); /* * lcmt00 >= low. The current block owns either diagonals or upper entries. Save * the current position in the LCM table. After this row has been completely * taken care of, re-start from this column and the next row of the LCM table. */ lcmt = lcmt00; nblkd = nblks; while( nblkd && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ nbloc = ( ( nblkd == 1 ) ? lnbloc : nb ); if( lcmt >= 0 ) { tmp1 = imbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A+lcmt*inca, &LDA, BETA, B, &LDB ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going east until there are no more blocks owning diagonals. */ lcmt += qnb; nblkd--; B += tmp2 * incb; } /* * I am done with the first row of the LCM table. Go to the next row. */ lcmt00 -= iupp - upp + pmb; mblks--; A += imbloc * inca; } /* * Loop over the remaining columns of the LCM table. */ do { /* * If the current block does not have diagonal elements, find the closest one in * the LCM table having some. */ if( ( lcmt00 < low ) || ( lcmt00 > upp ) ) { while( mblks && nblks ) { while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; A += mb*inca; } if( lcmt00 >= low ) break; while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; } if( lcmt00 <= upp ) break; } } if( !( mblks ) || !( nblks ) ) return( npq ); /* * The current block owns diagonals. Save the current position in the LCM table. * After this column has been completely taken care of, re-start from this row * and the next column in the LCM table. */ nbloc = ( ( nblks == 1 ) ? lnbloc : nb ); lcmt = lcmt00; mblkd = mblks; aptrd = A; while( mblkd && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd+lcmt*inca, &LDA, BETA, B, &LDB ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; aptrd += mbloc * inca; B += tmp2 * incb; } /* * I am done with this column of the LCM table. Go to the next column ... */ lcmt00 += qnb; nblks--; /* * ... until there are no more columns. */ } while( nblks > 0 ); /* * Return the number of diagonals found. */ return( npq ); } else { /* * (un)packing using columns of virtual matrix */ if( rows ) { /* * (un)packing rows of mn by k array A */ inca = size; incb = ( notran ? size : LDB * size ); m = &tmp2; n = &K; } else { /* * (un)packing columns of k by mn array A */ inca = LDA * size; incb = ( notran ? LDB * size : size ); m = &K; n = &tmp2; } kb = MN; /* * From the (un)packing point of view the only valuable shortcut is when the * virtual grid and the blocks are square, and the offset is zero or the grid * is 1x1. */ if( ( ( lcmt00 == 0 ) && ( VM->imb1 == VM->inb1 ) && ( mb == nb ) && ( nprow == npcol ) ) || ( ( nprow == 1 ) && ( npcol == 1 ) ) ) { if( VM->prow == VM->pcol ) { npq = ( ( nblks < 2 ) ? inbloc : inbloc + ( nblks - 2 ) * nb + lnbloc ); npq = MIN( npq, kb ); if( rows ) add( &npq, &K, ALPHA, A, &LDA, BETA, B, &LDB ); else add( &K, &npq, ALPHA, A, &LDA, BETA, B, &LDB ); } return( npq ); } pmb = nprow * mb; qnb = npcol * nb; /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); if( !( GoSouth ) && !( GoEast ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ if( lcmt00 >= 0 ) { tmp1 = imbloc - lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, B, &LDB ); } else { tmp1 = inbloc + lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A-lcmt00*inca, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); B += tmp2 * incb; /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the pointer to B. The pointer to A remains unchanged. */ lcmt00 -= iupp - upp + pmb; mblks--; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the pointer to B accordingly. */ while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return( npq ); /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; while( mblkd && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, B, &LDB ); } else { tmp1 = inbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A-lcmt*inca, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; B += tmp2 * incb; } /* * I am done with the first column of the LCM table. Go to the next column. */ lcmt00 += low - ilow + qnb; nblks--; A += inbloc * inca; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as * well as the pointer to A. The pointer to B remains unchanged. */ lcmt00 += low - ilow + qnb; nblks--; A += inbloc * inca; /* * While there are blocks remaining that own lower entries, keep going east * in the LCM table. Adjust the current LCM value as well as the pointer to * A accordingly. */ while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; A += nb * inca; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return( npq ); /* * lcmt00 >= low. The current block owns either diagonals or upper entries. Save * the current position in the LCM table. After this row has been completely * taken care of, re-start from this column and the next row of the LCM table. */ lcmt = lcmt00; nblkd = nblks; aptrd = A; while( nblkd && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ nbloc = ( ( nblkd == 1 ) ? lnbloc : nb ); if( lcmt >= 0 ) { tmp1 = imbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd, &LDA, BETA, B, &LDB ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, aptrd-lcmt*inca, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going east until there are no more blocks owning diagonals. */ lcmt += qnb; nblkd--; aptrd += nbloc * inca; B += tmp2 * incb; } /* * I am done with the first row of the LCM table. Go to the next row. */ lcmt00 -= iupp - upp + pmb; mblks--; } /* * Loop over the remaining columns of the LCM table. */ do { /* * If the current block does not have diagonal elements, find the closest one in * the LCM table having some. */ if( ( lcmt00 < low ) || ( lcmt00 > upp ) ) { while( mblks && nblks ) { while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; } if( lcmt00 >= low ) break; while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; A += nb*inca; } if( lcmt00 <= upp ) break; } } if( !( mblks ) || !( nblks ) ) return( npq ); /* * The current block owns diagonals. Save the current position in the LCM table. * After this column has been completely taken care of, re-start from this row * and the next column in the LCM table. */ nbloc = ( ( nblks == 1 ) ? lnbloc : nb ); lcmt = lcmt00; mblkd = mblks; while( mblkd && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A, &LDA, BETA, B, &LDB ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); add( m, n, ALPHA, A-lcmt*inca, &LDA, BETA, B, &LDB ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; B += tmp2 * incb; } /* * I am done with this column of the LCM table. Go to the next column ... */ lcmt00 += qnb; nblks--; A += nbloc * inca; /* * ... until there are no more columns. */ } while( nblks > 0 ); /* * Return the number of diagonals found. */ return( npq ); } /* * End of PB_CVMpack */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CVMswp.c000644 000766 000024 00000052646 10363532303 020140 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ int PB_CVMswp( PBTYP_T * TYPE, PB_VM_T * VM, char * VROCS, char * ROCS, char * TRANS, int MN, char * X, int INCX, char * Y, int INCY ) #else int PB_CVMswp( TYPE, VM, VROCS, ROCS, TRANS, MN, X, INCX, Y, INCY ) /* * .. Scalar Arguments .. */ int INCX, INCY, MN; /* * .. Array Arguments .. */ char * VROCS, * ROCS, * TRANS; PBTYP_T * TYPE; PB_VM_T * VM; char * X, * Y; #endif { /* * Purpose * ======= * * PB_CVMswp swaps a one-dimensional distributed vector X with another * one-dimensional distributed vector Y. This operation is triggered by * a virtual distributed array. * * Arguments * ========= * * TYPE (local input) pointer to a PBTYP_T structure * On entry, TYPE is a pointer to a structure of type PBTYP_T, * that contains type information (See pblas.h). * * VM (local input) pointer to a PB_VM_T structure * On entry, VM is a pointer to a structure of type PB_VM_T, * that contains the virtual matrix information (see pblas.h). * * VROCS (local input) pointer to CHAR * On entry, VROCS specifies if the rows or columns of the vir- * tual distributed array grid should be used for the swapping * operation as follows: * VROCS = 'R' or 'r', the rows should be used, * VROCS = 'C' or 'c', the columns should be used. * * ROCS (local input) pointer to CHAR * On entry, ROCS specifies if rows or columns should be swap- * ped as follows: * ROCS = 'R' or 'r', rows should be swapped, * ROCS = 'C' or 'c', columns should be swapped. * * TRANS (local input) pointer to CHAR * On entry, TRANS specifies if transposition should occur du- * ring the swapping operation as follows: * TRANS = 'N' or 'n', natural swapping, * otherwise, transposed swapping. * * MN (local input) INTEGER * On entry, MN specifies the number of rows or columns to be * swapped. MN must be at least zero. * * X (local input/local output) pointer to CHAR * On entry, X points to an array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ) where n is IMBLOC+(MBLKS-2)*MB+ * LMB when VROCS is 'R' or 'r', and INBLOC+(NBLKS-2)*NB+LNB * otherwise. Before entry, the incremented array X must contain * the vector x. On exit, the entries of the incremented array X * are exchanged with the entries of the incremented array Y. * * INCX (local input) INTEGER * On entry, INCX specifies the increment for the elements of X. * INCX must not be zero. * * Y (local input/local output) pointer to CHAR * On entry, Y points to an array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ) where n is IMBLOC+(MBLKS-2)*MB+ * LMB when VROCS is 'C' or 'c', and INBLOC+(NBLKS-2)*NB+LNB * otherwise. Before entry, the incremented array Y must contain * the vector y. On exit, the entries of the incremented array Y * are exchanged with the entries of the incremented array X. * * INCY (local input) INTEGER * On entry, INCY specifies the increment for the elements of Y. * INCY must not be zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int GoEast, GoSouth, Xinc, Yinc, ilow, imbloc, inbloc, iupp, kb, lcmt, lcmt00, lmbloc, lnbloc, low, mb, mblkd, mblks, mbloc, nb, nblkd, nblks, nbloc, notran, npcol, npq=0, nprow, pmb, qnb, rows, size, tmp1, tmp2, upp; char * Xptrd, * Yptrd; /* .. * .. Executable Statements .. * */ mblks = VM->mblks; nblks = VM->nblks; /* * Quick return if I don't own any blocks. */ if( ( mblks == 0 ) || ( nblks == 0 ) ) return( 0 ); /* * Retrieve the contents of VM structure fields */ lcmt00 = VM->lcmt00; imbloc = VM->imbloc; mb = VM->mb; lmbloc = VM->lmbloc; upp = VM->upp; iupp = VM->iupp; nprow = VM->nprow; inbloc = VM->inbloc; nb = VM->nb; lnbloc = VM->lnbloc; low = VM->low; ilow = VM->ilow; npcol = VM->npcol; notran = ( Mupcase( TRANS[0] ) == CNOTRAN ); size = TYPE->size; rows = ( Mupcase( ROCS[0] ) == CROW ); if( Mupcase( VROCS[0] ) == CROW ) { /* * (un)packing using rows of virtual matrix */ if( rows ) { /* * (un)packing rows of mn by k array A. */ Xinc = size; Yinc = ( notran ? size : INCY * size ); } else { /* * (un)packing columns of k by mn array A */ Xinc = INCX * size; Yinc = ( notran ? INCY * size : size ); } kb = MN; /* * From the (un)packing point of view the only valuable shortcut is when the * virtual grid and the blocks are square, and the offset is zero or the grid * is 1x1. */ if( ( ( lcmt00 == 0 ) && ( VM->imb1 == VM->inb1 ) && ( mb == nb ) && ( nprow == npcol ) ) || ( ( nprow == 1 ) && ( npcol == 1 ) ) ) { if( VM->prow == VM->pcol ) { npq = ( ( mblks < 2 ) ? imbloc : imbloc + ( mblks - 2 ) * mb + lmbloc ); npq = MIN( npq, kb ); if( rows ) TYPE->Fswap( &npq, X, &INCX, Y, &INCY ); else TYPE->Fswap( &npq, X, &INCX, Y, &INCY ); } return( npq ); } pmb = nprow * mb; qnb = npcol * nb; /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); if( !( GoSouth ) && !( GoEast ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ if( lcmt00 >= 0 ) { tmp1 = imbloc - lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X+lcmt00*Xinc, &INCX, Y, &INCY ); } else { tmp1 = inbloc + lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X, &INCX, Y-lcmt00*Yinc, &INCY ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the pointer to X. The pointer to Y remains unchanged. */ lcmt00 -= iupp - upp + pmb; mblks--; X += imbloc * Xinc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the pointer to X accordingly. */ while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; X += mb * Xinc; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return( npq ); /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; Xptrd = X; while( mblkd && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, Xptrd+lcmt*Xinc, &INCX, Y, &INCY ); } else { tmp1 = inbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, Xptrd, &INCX, Y-lcmt*Yinc, &INCY ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; Xptrd += mbloc * Xinc; } /* * I am done with the first column of the LCM table. Go to the next column. */ lcmt00 += low - ilow + qnb; nblks--; Y += inbloc * Yinc; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as * well as the pointer to Y. The pointer to X remains unchanged. */ lcmt00 += low - ilow + qnb; nblks--; Y += inbloc * Yinc; /* * While there are blocks remaining that own lower entries, keep going east * in the LCM table. Adjust the current LCM value as well as the pointer to * Y accordingly. */ while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; Y += nb * Yinc; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return( npq ); /* * lcmt00 >= low. The current block owns either diagonals or upper entries. Save * the current position in the LCM table. After this row has been completely * taken care of, re-start from this column and the next row of the LCM table. */ lcmt = lcmt00; nblkd = nblks; Yptrd = Y; while( nblkd && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ nbloc = ( ( nblkd == 1 ) ? lnbloc : nb ); if( lcmt >= 0 ) { tmp1 = imbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X+lcmt*Xinc, &INCX, Yptrd, &INCY ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X, &INCX, Yptrd-lcmt*Yinc, &INCY ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going east until there are no more blocks owning diagonals. */ lcmt += qnb; nblkd--; Yptrd += nbloc * Yinc; } /* * I am done with the first row of the LCM table. Go to the next row. */ lcmt00 -= iupp - upp + pmb; mblks--; X += imbloc * Xinc; } /* * Loop over the remaining columns of the LCM table. */ do { /* * If the current block does not have diagonal elements, find the closest one in * the LCM table having some. */ if( ( lcmt00 < low ) || ( lcmt00 > upp ) ) { while( mblks && nblks ) { while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; X += mb * Xinc; } if( lcmt00 >= low ) break; while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; Y += nb * Yinc; } if( lcmt00 <= upp ) break; } } if( !mblks || !nblks ) return( npq ); /* * The current block owns diagonals. Save the current position in the LCM table. * After this column has been completely taken care of, re-start from this row * and the next column in the LCM table. */ nbloc = ( ( nblks == 1 ) ? lnbloc : nb ); lcmt = lcmt00; mblkd = mblks; Xptrd = X; while( mblkd && lcmt >= low ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, Xptrd+lcmt*Xinc, &INCX, Y, &INCY ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, Xptrd, &INCX, Y-lcmt*Yinc, &INCY ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; Xptrd += mbloc * Xinc; } /* * I am done with this column of the LCM table. Go to the next column ... */ lcmt00 += qnb; nblks--; Y += nbloc * Yinc; /* * ... until there are no more columns. */ } while( nblks > 0 ); /* * Return the number of diagonals found. */ return( npq ); } else { /* * (un)packing using columns of virtual matrix */ if( rows ) { /* * (un)packing rows of mn by k array A */ Xinc = size; Yinc = ( notran ? size : INCY * size ); } else { /* * (un)packing columns of k by mn array A */ Xinc = INCX * size; Yinc = ( notran ? INCY * size : size ); } kb = MN; /* * From the (un)packing point of view the only valuable shortcut is when the * virtual grid and the blocks are square, and the offset is zero or the grid * is 1x1. */ if( ( ( lcmt00 == 0 ) && ( VM->imb1 == VM->inb1 ) && ( mb == nb ) && ( nprow == npcol ) ) || ( ( nprow == 1 ) && ( npcol == 1 ) ) ) { if( VM->prow == VM->pcol ) { npq = ( ( nblks < 2 ) ? inbloc : inbloc + ( nblks - 2 ) * nb + lnbloc ); npq = MIN( npq, kb ); if( rows ) TYPE->Fswap( &npq, X, &INCX, Y, &INCY ); else TYPE->Fswap( &npq, X, &INCX, Y, &INCY ); } return( npq ); } pmb = nprow * mb; qnb = npcol * nb; /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); if( !( GoSouth ) && !( GoEast ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ if( lcmt00 >= 0 ) { tmp1 = imbloc - lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X, &INCX, Y+lcmt00*Yinc, &INCY ); } else { tmp1 = inbloc + lcmt00; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X-lcmt00*Xinc, &INCX, Y, &INCY ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Decide whether one should go south or east in the table: Go east if * the block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp - upp + pmb ) ) < ilow ) ); } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value as well as * the pointer to Y. The pointer to X remains unchanged. */ lcmt00 -= iupp - upp + pmb; mblks--; Y += imbloc * Yinc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value as well as the pointer to Y accordingly. */ while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Y += mb * Yinc; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) return( npq ); /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; Yptrd = Y; while( mblkd && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, inbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X, &INCX, Yptrd+lcmt*Yinc, &INCY ); } else { tmp1 = inbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X-lcmt*Xinc, &INCX, Yptrd, &INCY ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; Yptrd += mbloc * Yinc; } /* * I am done with the first column of the LCM table. Go to the next column. */ lcmt00 += low - ilow + qnb; nblks--; X += inbloc * Xinc; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value as * well as the pointer to X. The pointer to Y remains unchanged. */ lcmt00 += low - ilow + qnb; nblks--; X += inbloc * Xinc; /* * While there are blocks remaining that own lower entries, keep going east * in the LCM table. Adjust the current LCM value as well as the pointer to * X accordingly. */ while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; X += nb * Xinc; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) return( npq ); /* * lcmt00 >= low. The current block owns either diagonals or upper entries. Save * the current position in the LCM table. After this row has been completely * taken care of, re-start from this column and the next row of the LCM table. */ lcmt = lcmt00; nblkd = nblks; Xptrd = X; while( nblkd && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ nbloc = ( ( nblkd == 1 ) ? lnbloc : nb ); if( lcmt >= 0 ) { tmp1 = imbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, Xptrd, &INCX, Y+lcmt*Yinc, &INCY ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, imbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, Xptrd-lcmt*Xinc, &INCX, Y, &INCY ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going east until there are no more blocks owning diagonals. */ lcmt += qnb; nblkd--; Xptrd += nbloc * Xinc; } /* * I am done with the first row of the LCM table. Go to the next row. */ lcmt00 -= iupp - upp + pmb; mblks--; Y += imbloc * Yinc; } /* * Loop over the remaining columns of the LCM table. */ do { /* * If the current block does not have diagonal elements, find the closest one in * the LCM table having some. */ if( ( lcmt00 < low ) || ( lcmt00 > upp ) ) { while( mblks && nblks ) { while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; Y += mb * Yinc; } if( lcmt00 >= low ) break; while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; X += nb * Xinc; } if( lcmt00 <= upp ) break; } } if( !( mblks ) || !( nblks ) ) return( npq ); /* * The current block owns diagonals. Save the current position in the LCM table. * After this column has been completely taken care of, re-start from this row * and the next column in the LCM table. */ nbloc = ( ( nblks == 1 ) ? lnbloc : nb ); lcmt = lcmt00; mblkd = mblks; Yptrd = Y; /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ while( mblkd && lcmt >= low ) { mbloc = ( ( mblkd == 1 ) ? lmbloc : mb ); if( lcmt >= 0 ) { tmp1 = mbloc - lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, nbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X, &INCX, Yptrd+lcmt*Yinc, &INCY ); } else { tmp1 = nbloc + lcmt; tmp1 = MAX( 0, tmp1 ); tmp2 = MIN( tmp1, mbloc ); npq += ( tmp2 = MIN( tmp2, kb ) ); TYPE->Fswap( &tmp2, X-lcmt*Xinc, &INCX, Yptrd, &INCY ); } if( ( kb -= tmp2 ) == 0 ) return( npq ); /* * Keep going south until there are no more blocks owning diagonals */ lcmt -= pmb; mblkd--; Yptrd += mbloc * Yinc; } /* * I am done with this column of the LCM table. Go to the next column ... */ lcmt00 += qnb; nblks--; X += nbloc * Xinc; /* * ... until there are no more columns. */ } while( nblks > 0 ); /* * Return the number of diagonals found. */ return( npq ); } /* * End of PB_CVMswp */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_CVMupdate.c000644 000766 000024 00000031622 10363532303 020600 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_CVMupdate( PB_VM_T * VM, int K, int * II, int * JJ ) #else void PB_CVMupdate( VM, K, II, JJ ) /* * .. Scalar Arguments .. */ int * II, * JJ, K; PB_VM_T * VM; #endif { /* * Purpose * ======= * * PB_CVMupdate updates the local information of an m by n local array * owned by the process of relative coordinates ( MRROW, MRCOL ). Note * that if m or n is less or equal than zero, there is no data, in which * case this process does not need the local information computed by * this routine to proceed. * * Arguments * ========= * * VM (local input) pointer to a PB_VM_T structure * On entry, VM is a pointer to a structure of type PB_VM_T, * that contains the virtual matrix information (see pblas.h). * * K (global input) INTEGER * On entry, K specifies the number of diagonal elements that * have been used so far. K must be at least zero. * * II (local input/local output) INTEGER * On entry, II specifies the local row index to be updated. On * exit, II points to the local row owning the K+1 diagonal of * this local block. On entry and on exit, II is at least zero. * * JJ (local input/local output) INTEGER * On entry, JJ specifies the local column index to be updated. * On exit, JJ points to the local column owning the K+1 * diagonal of this local block. On entry and on exit, JJ is at * least zero. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ int GoEast, GoSouth, ilow, imbloc, inbloc, ioff, ioffd, iupp, joff, joffd, lcmt, lcmt00, lmbloc, lnbloc, low, mb, mblkd, mblks, mbloc, nb, nblkd, nblks, nbloc, npq=0, pmb, qnb, tmp1, tmp2, upp; /* .. * .. Executable Statements .. * */ mblks = VM->mblks; nblks = VM->nblks; /* * Quick return if I don't own any blocks or if no diagonals were found. */ if( ( K <= 0 ) || ( mblks == 0 ) || ( nblks == 0 ) ) return; /* * Handle the first block of rows or columns separately */ ioff = *II; joff = *JJ; /* * Retrieve the contents of VM structure fields */ lcmt00 = VM->lcmt00; imbloc = VM->imbloc; mb = VM->mb; lmbloc = VM->lmbloc; iupp = VM->iupp; upp = VM->upp; pmb = VM->nprow * mb; inbloc = VM->inbloc; nb = VM->nb; lnbloc = VM->lnbloc; ilow = VM->ilow; low = VM->low; qnb = VM->npcol * nb; /* * Handle separately the first row and/or column of the LCM table. Update the * LCM value of the curent block lcmt00, as well as the number of rows and * columns mblks and nblks remaining in the LCM table. */ GoSouth = ( lcmt00 > iupp ); GoEast = ( lcmt00 < ilow ); /* * Go through the table looking for blocks owning diagonal entries. */ if( !( GoSouth ) && !( GoEast ) ) { /* * The upper left block owns diagonal entries lcmt00 >= ilow && lcmt00 <= iupp */ if( lcmt00 >= 0 ) { npq = ( ( tmp2 = ( tmp1 = imbloc - lcmt00 ) > 0 ? tmp1 : 0 ) < inbloc ? tmp2 : inbloc ); if( K < npq ) { tmp1 = lcmt00 + K; *II += tmp1; iupp = ( imbloc -= tmp1 ) - 1; if( mblks == 1 ) lmbloc = imbloc; *JJ += K; ilow = 1 - ( inbloc -= K ); if( nblks == 1 ) lnbloc = inbloc; lcmt00 = 0; goto l_end; } } else { npq = ( ( tmp2 = ( tmp1 = inbloc + lcmt00 ) > 0 ? tmp1 : 0 ) > imbloc ? imbloc : tmp2 ); if( K < npq ) { tmp1 = lcmt00 - K; *JJ -= tmp1; ilow = 1 - ( inbloc += tmp1 ); if( nblks == 1 ) lnbloc = inbloc; *II += K; iupp = ( imbloc -= K ) - 1; if( mblks == 1 ) lmbloc = imbloc; lcmt00 = 0; goto l_end; } } K -= npq; /* * Decide whether one should go south or east in the table: Go east if the * block below the current one only owns lower entries. If this block, * however, owns diagonals, then go south. */ GoSouth = !( GoEast = ( ( lcmt00 - ( iupp-upp+pmb ) ) < ilow ) ); /* * Update the local indexes II and JJ */ if( GoSouth ) *II += imbloc; else *JJ += inbloc; } if( GoSouth ) { /* * Go one step south in the LCM table. Adjust the current LCM value. */ lcmt00 -= iupp - upp + pmb; mblks--; ioff += imbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value accordingly. */ while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; ioff += mb; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) goto l_end; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; mbloc = mb; ioffd = ioff; while( mblkd && ( lcmt >= ilow ) ) { /* * A block owning diagonals lcmt00 >= ilow && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; if( lcmt >= 0 ) { npq = ( ( tmp2 = ( tmp1 = mbloc - lcmt ) > 0 ? tmp1 : 0 ) < inbloc ? tmp2 : inbloc ); if( K < npq ) { tmp1 = lcmt + K; *II = ioffd + tmp1; iupp = ( imbloc = mbloc - tmp1 ) - 1; if( mblks == 1 ) lmbloc = imbloc; *JJ = joff + K; ilow = 1 - ( inbloc -= K ); if( nblks == 1 ) lnbloc = inbloc; lcmt00 = 0; goto l_end; } } else { npq = ( ( tmp2 = ( tmp1 = inbloc + lcmt ) > 0 ? tmp1 : 0 ) > mbloc ? mbloc : tmp2 ); if( K < npq ) { tmp1 = lcmt - K; *JJ = joff - tmp1; ilow = 1 - ( inbloc += tmp1 ); if( nblks == 1 ) lnbloc = inbloc; *II = ioffd + K; iupp = ( imbloc = mbloc - K ) - 1; if( mblks == 1 ) lmbloc = imbloc; lcmt00 = 0; goto l_end; } } /* * Keep going south until there are no more blocks owning diagonals */ K -= npq; lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd--; ioff = ioffd; ioffd += mbloc; } /* * I am done with the first column of the LCM table. Go to the next column. */ lcmt00 += low - ilow + qnb; nblks--; joff += inbloc; /* * Update the local indexes II and JJ */ *II = ioff; *JJ = joff; } else if( GoEast ) { /* * Go one step east in the LCM table. Adjust the current LCM value. */ lcmt00 += low - ilow + qnb; nblks--; joff += inbloc; /* * While there are blocks remaining that own lower entries, keep going east * in the LCM table. Adjust the current LCM value. */ while( nblks && ( lcmt00 < low ) ) { lcmt00 += qnb; nblks--; joff += nb; } /* * Return if no more column in the LCM table. */ if( nblks <= 0 ) goto l_end; /* * lcmt00 >= low. The current block owns either diagonals or upper entries. Save * the current position in the LCM table. After this row has been completely * taken care of, re-start from this column and the next row of the LCM table. */ lcmt = lcmt00; nblkd = nblks; nbloc = nb; joffd = joff; while( nblkd && ( lcmt <= iupp ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= iupp has been found. */ if( nblkd == 1 ) nbloc = lnbloc; if( lcmt >= 0 ) { npq = ( ( tmp2 = ( tmp1 = imbloc - lcmt ) > 0 ? tmp1 : 0 ) < nbloc ? tmp2 : nbloc ); if( K < npq ) { tmp1 = lcmt + K; *II = ioff + tmp1; iupp = ( imbloc -= tmp1 ) - 1; if( mblks == 1 ) lmbloc = imbloc; *JJ = joffd + K; ilow = 1 - ( inbloc = nbloc - K ); if( nblks == 1 ) lnbloc = inbloc; lcmt00 = 0; goto l_end; } } else { npq = ( ( tmp2 = ( tmp1 = nbloc + lcmt ) > 0 ? tmp1 : 0 ) > imbloc ? imbloc : tmp2 ); if( K < npq ) { tmp1 = lcmt - K; *JJ = joffd - tmp1; ilow = 1 - ( inbloc = nbloc + tmp1 ); if( nblks == 1 ) lnbloc = inbloc; *II = ioff + K; iupp = ( imbloc -= K ) - 1; if( mblks == 1 ) lmbloc = imbloc; lcmt00 = 0; goto l_end; } } /* * Keep going east until there are no more blocks owning diagonals. */ K -= npq; lcmt00 = lcmt; lcmt += qnb; nblks = nblkd--; joff = joffd; joffd += nbloc; } /* * I am done with the first row of the LCM table. Go to the next row. */ lcmt00 -= iupp - upp + pmb; mblks--; ioff += imbloc; /* * Update the local indexes II and JJ */ *II = ioff; *JJ = joff; } /* * Loop over the remaining columns of the LCM table. */ nbloc = nb; while( nblks ) { if( nblks == 1 ) nbloc = lnbloc; /* * While there are blocks remaining that own upper entries, keep going south. * Adjust the current LCM value accordingly. */ while( mblks && ( lcmt00 > upp ) ) { lcmt00 -= pmb; mblks--; ioff += mb; } /* * Return if no more row in the LCM table. */ if( mblks <= 0 ) goto l_end; /* * lcmt00 <= upp. The current block owns either diagonals or lower entries. * Save the current position in the LCM table. After this column has been * completely taken care of, re-start from this row and the next column of * the LCM table. */ lcmt = lcmt00; mblkd = mblks; mbloc = mb; ioffd = ioff; while( mblkd && ( lcmt >= low ) ) { /* * A block owning diagonals lcmt00 >= low && lcmt00 <= upp has been found. */ if( mblkd == 1 ) mbloc = lmbloc; if( lcmt >= 0 ) { npq = ( ( tmp2 = ( tmp1 = mbloc - lcmt ) > 0 ? tmp1 : 0 ) < nbloc ? tmp2 : nbloc ); if( K < npq ) { tmp1 = lcmt + K; *II = ioffd + tmp1; iupp = ( imbloc = mbloc - tmp1 ) - 1; if( mblks == 1 ) lmbloc = imbloc; *JJ = joff + K; ilow = 1 - ( inbloc = nbloc - K ); if( nblks == 1 ) lnbloc = inbloc; lcmt00 = 0; goto l_end; } } else { npq = ( ( tmp2 = ( tmp1 = nbloc + lcmt ) > 0 ? tmp1 : 0 ) > mbloc ? mbloc : tmp2 ); if( K < npq ) { tmp1 = lcmt - K; *JJ = joff - tmp1; ilow = 1 - ( inbloc = nbloc + tmp1 ); if( nblks == 1 ) lnbloc = inbloc; *II = ioffd + K; iupp = ( imbloc = mbloc - K ) - 1; if( mblks == 1 ) lmbloc = imbloc; lcmt00 = 0; goto l_end; } } /* * Keep going south until there are no more blocks owning diagonals */ K -= npq; lcmt00 = lcmt; lcmt -= pmb; mblks = mblkd--; ioff = ioffd; ioffd += mbloc; } /* * I am done with this column of the LCM table. Go to the next column until * there are no more column in the table. */ lcmt00 += qnb; nblks--; joff += nbloc; /* * Update the local indexes II and JJ */ *II = ioff; *JJ = joff; } l_end: /* * Update the fields of the VM structure */ VM->lcmt00 = lcmt00; VM->mp = ( mblks >= 2 ? imbloc + ( mblks - 2 ) * mb + lmbloc : ( mblks == 1 ? imbloc : 0 ) ); VM->imbloc = imbloc; VM->lmbloc = lmbloc; VM->mblks = mblks; VM->iupp = iupp; VM->nq = ( nblks >= 2 ? inbloc + ( nblks - 2 ) * nb + lnbloc : ( nblks == 1 ? inbloc : 0 ) ); VM->inbloc = inbloc; VM->lnbloc = lnbloc; VM->nblks = nblks; VM->ilow = ilow; /* * End of PB_CVMupdate */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cwarn.c000644 000766 000024 00000011251 10363532303 020016 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #ifdef TestingPblas #include "../SRC/pblas.h" #include "../SRC/PBpblas.h" #include "../SRC/PBtools.h" #include "../SRC/PBblacs.h" #include "../SRC/PBblas.h" #else #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #endif /* * --------------------------------------------------------------------- * FORTRAN <-> C interface * --------------------------------------------------------------------- * * These macros identifies how the PBLAS will be called as follows: * * _F2C_ADD_: the FORTRAN compiler expects the name of C functions to be * in all lower case and to have an underscore postfixed it (Suns, Intel * compilers expect this). * * _F2C_NOCHANGE: the FORTRAN compiler expects the name of C functions * to be in all lower case (IBM RS6K compilers do this). * * _F2C_UPCASE: the FORTRAN compiler expects the name of C functions * to be in all upcase. (Cray compilers expect this). * * _F2C_F77ISF2C: the FORTRAN compiler in use is f2c, a FORTRAN to C * converter. */ #if (_F2C_CALL_ == _F2C_ADD_ ) #define PB_NoAbort pb_noabort_ #endif #if (_F2C_CALL_ == _F2C_UPCASE ) #define PB_NoAbort PB_NOABORT #endif #if (_F2C_CALL_ == _F2C_NOCHANGE ) #define PB_NoAbort pb_noabort #endif #if (_F2C_CALL_ == _F2C_F77ISF2C ) #define PB_NoAbort pb_noabort__ #endif #ifdef __STDC__ void PB_Cwarn( int ICTXT, int LINE, char * ROUT, char * FORM, ... ) #else void PB_Cwarn( va_alist ) va_dcl #endif { /* * Purpose * ======= * * PB_Cwarn is an error handler for the PBLAS routines. This routine * displays an error message on stderr. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * LINE (local input) INTEGER * On entry, LINE specifies the line number in the file where * the error has occured. When LINE is not a valid line number, * * ROUT (global input) pointer to CHAR * On entry, ROUT specifies the name of the routine calling this * error handler. * * FORM (local input) pointer to CHAR * On entry, FORM is a control string specifying the format * conversion of its following arguments. * * ... (local input) * On entry, FORM is a control string specifying the format * On entry, the expressions that are to be evaluated and con- * verted according to the formats in the control string FORM * and then placed in the output stream. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ va_list argptr; int iam, mycol, myrow, npcol, nprow; char cline[100]; /* .. * .. External Functions .. */ #ifdef TestingPblas #ifdef __STDC__ int PB_NoAbort( int * ); #else int PB_NoAbort(); #endif #endif #ifdef __STDC__ va_start( argptr, FORM ); #else char * ROUT, * FORM; int ICTXT, LINE; /* .. * .. Executable Statements .. * */ va_start( argptr ); ICTXT = va_arg( argptr, int ); LINE = va_arg( argptr, int ); ROUT = va_arg( argptr, char * ); FORM = va_arg( argptr, char * ); #endif #ifdef TestingPblas /* * For testing purpose only, the error is reported, but the program execution * is not terminated */ if( PB_NoAbort( &ICTXT ) ) return; #endif vsprintf( cline, FORM, argptr ); va_end( argptr ); Cblacs_gridinfo( ICTXT, &nprow, &npcol, &myrow, &mycol ); if( nprow != -1 ) iam = Cblacs_pnum( ICTXT, myrow, mycol ); else iam = -1; /* * Display an error message */ if( LINE <= 0 ) (void) fprintf( stderr, "%s'%s'\n%s{%d,%d}, %s%d, %s%d%s'%s'.\n\n", "PBLAS ERROR ", cline, "from ", myrow, mycol, "pnum=", iam, "Contxt=", ICTXT, ", in routine ", ROUT ); else (void) fprintf( stderr, "%s'%s'\n%s{%d,%d}, %s%d, %s%d%s%d%s'%s'.\n\n", "PBLAS ERROR ", cline, "from ", myrow, mycol, "pnum=", iam, "Contxt=", ICTXT, ", on line ", LINE, " of routine ", ROUT ); /* * End of PB_Cwarn */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_Cztypeset.c000644 000766 000024 00000006435 10363532303 020746 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS auxiliary routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" PBTYP_T * PB_Cztypeset() { /* * Purpose * ======= * * PB_Cztypeset on the first call initializes a static structure contai- * ning typed information and returns a pointer to it. The other calls * to this routine just returns this pointer. * * -- Written on April 1, 1998 by * R. Clint Whaley, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ static int setup=0; static PBTYP_T TypeStruct; static cmplx16 zero, one, negone; /* .. * .. Executable Statements .. * */ if( setup ) return( &TypeStruct ); setup = 1; TypeStruct.type = DCPLX; TypeStruct.usiz = sizeof( double ); TypeStruct.size = sizeof( cmplx16 ); zero [REAL_PART] = ZERO; zero [IMAG_PART] = ZERO; one [REAL_PART] = ONE; one [IMAG_PART] = ZERO; negone[REAL_PART] = -ONE; negone[IMAG_PART] = ZERO; TypeStruct.zero = ((char *) zero); TypeStruct.one = ((char *) one); TypeStruct.negone = ((char *) negone); TypeStruct.Cgesd2d = Czgesd2d; TypeStruct.Cgerv2d = Czgerv2d; TypeStruct.Cgebs2d = Czgebs2d; TypeStruct.Cgebr2d = Czgebr2d; TypeStruct.Cgsum2d = Czgsum2d; TypeStruct.Fmmadd = zmmadd_; TypeStruct.Fmmcadd = zmmcadd_; TypeStruct.Fmmtadd = zmmtadd_; TypeStruct.Fmmtcadd = zmmtcadd_; TypeStruct.Fmmdda = zmmdda_; TypeStruct.Fmmddac = zmmddac_; TypeStruct.Fmmddat = zmmddat_; TypeStruct.Fmmddact = zmmddact_; TypeStruct.Fcshft = zcshft_; TypeStruct.Frshft = zrshft_; TypeStruct.Fvvdotu = zvvdotu_; TypeStruct.Fvvdotc = zvvdotc_; TypeStruct.Fset = zset_; TypeStruct.Ftzpad = ztzpad_; TypeStruct.Ftzpadcpy = ztzpadcpy_; TypeStruct.Ftzscal = ztzscal_; TypeStruct.Fhescal = zhescal_; TypeStruct.Ftzcnjg = ztzcnjg_; TypeStruct.Faxpy = zaxpy_; TypeStruct.Fcopy = zcopy_; TypeStruct.Fswap = zswap_; TypeStruct.Fgemv = zgemv_; TypeStruct.Fsymv = zsymv_; TypeStruct.Fhemv = zhemv_; TypeStruct.Ftrmv = ztrmv_; TypeStruct.Ftrsv = ztrsv_; TypeStruct.Fagemv = zagemv_; TypeStruct.Fasymv = zasymv_; TypeStruct.Fahemv = zahemv_; TypeStruct.Fatrmv = zatrmv_; TypeStruct.Fgerc = zgerc_; TypeStruct.Fgeru = zgeru_; TypeStruct.Fsyr = zsyr_; TypeStruct.Fher = zher_; TypeStruct.Fsyr2 = zsyr2_; TypeStruct.Fher2 = zher2_; TypeStruct.Fgemm = zgemm_; TypeStruct.Fsymm = zsymm_; TypeStruct.Fhemm = zhemm_; TypeStruct.Fsyrk = zsyrk_; TypeStruct.Fherk = zherk_; TypeStruct.Fsyr2k = zsyr2k_; TypeStruct.Fher2k = zher2k_; TypeStruct.Ftrmm = ztrmm_; TypeStruct.Ftrsm = ztrsm_; return( &TypeStruct ); /* * End of PB_Cztypeset */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_freebuf_.c000644 000766 000024 00000001553 10363532303 020525 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" void PB_freebuf_() { /* * Purpose * ======= * * PB_freebuf_ disposes the dynamic memory allocated by PB_Cgetbuf. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ (void) PB_Cgetbuf( " ", -1 ); /* * End of PB_freebuf_ */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_topget_.c000644 000766 000024 00000004445 10363532303 020414 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_topget_( int * ICTXT, F_CHAR_T OP, F_CHAR_T SCOPE, F_CHAR_T TOP ) #else void PB_topget_( ICTXT, OP, SCOPE, TOP ) /* * .. Scalar Arguments .. */ int * ICTXT; /* * .. Array Arguments .. */ F_CHAR_T OP, SCOPE, TOP; #endif { /* * Purpose * ======= * * PB_topget_ returns the row, column or all broadcast and combine topo- * logies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * OP (global input) pointer to CHAR * On input, OP specifies the BLACS operation defined as fol- * lows: * OP = 'B' or 'b', BLACS broadcast operation, * OP = 'C' or 'c', BLACS combine operation. * * SCOPE (global input) pointer to CHAR * On entry, SCOPE specifies the scope of the BLACS operation as * follows: * SCOPE = 'R' or 'r', rowwise broadcast or combine, * SCOPE = 'C' or 'c', column broadcast or combine, * SCOPE = 'A' or 'a', all broadcast or combine. * * TOP (global output) pointer to CHAR * On exit, TOP is a character string specifying the BLACS to- * pology current in use for the given operation specified by OP * and SCOPE. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* * .. Local Scalars .. */ char * topc; /* .. * .. Executable Statements .. * */ topc = F2C_CHAR( TOP ); *topc = *PB_Ctop( ICTXT, F2C_CHAR( OP ), F2C_CHAR( SCOPE ), TOP_GET ); TOP = C2F_CHAR( topc ); /* * End of PB_topget_ */ } scalapack-2.0.2/PBLAS/SRC/PTOOLS/PB_topset_.c000644 000766 000024 00000004414 10363532303 020424 0ustar00juliestaff000000 000000 /* --------------------------------------------------------------------- * * -- PBLAS routine (version 2.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * April 1, 1998 * * --------------------------------------------------------------------- */ /* * Include files */ #include "../pblas.h" #include "../PBpblas.h" #include "../PBtools.h" #include "../PBblacs.h" #include "../PBblas.h" #ifdef __STDC__ void PB_topset_( int * ICTXT, F_CHAR_T OP, F_CHAR_T SCOPE, F_CHAR_T TOP ) #else void PB_topset_( ICTXT, OP, SCOPE, TOP ) /* * .. Scalar Arguments .. */ int * ICTXT; /* * .. Array Arguments .. */ F_CHAR_T OP, SCOPE, TOP; #endif { /* * Purpose * ======= * * PB_topset_ initializes the row-, column- or all- broadcast and combi- * ne topologies. * * Arguments * ========= * * ICTXT (local input) INTEGER * On entry, ICTXT specifies the BLACS context handle, indica- * ting the global context of the operation. The context itself * is global, but the value of ICTXT is local. * * OP (global input) pointer to CHAR * On input, OP specifies the BLACS operation defined as fol- * lows: * OP = 'B' or 'b', BLACS broadcast operation, * OP = 'C' or 'c', BLACS combine operation. * * SCOPE (global input) pointer to CHAR * On entry, SCOPE specifies the scope of the BLACS operation as * follows: * SCOPE = 'R' or 'r', rowwise broadcast or combine, * SCOPE = 'C' or 'c', column broadcast or combine, * SCOPE = 'A' or 'a', all broadcast or combine. * * TOP (global input) pointer to CHAR * On entry, TOP is a character string specifying the BLACS to- * pology to be used i.e. to be set for the given operation * specified by OP and SCOPE. * * -- Written on April 1, 1998 by * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ if( * F2C_CHAR( TOP ) != '!' ) (void) PB_Ctop( ICTXT, F2C_CHAR( OP ), F2C_CHAR( SCOPE ), F2C_CHAR( TOP ) ); /* * End of PB_topset_ */ } scalapack-2.0.2/PBLAS/SRC/PBBLAS/CMakeLists.txt000644 000766 000024 00000001003 11656312637 020710 0ustar00juliestaff000000 000000 set (PBSBLASAUX pbstran.f pbsmatadd.f pbstrsrt.f pbstrget.f pbstrnv.f pbsvecadd.f pbstrst1.f) set (PBCBLASAUX pbctran.f pbcmatadd.f pbctrsrt.f pbctrget.f pbctrnv.f pbcvecadd.f pbctrst1.f) set (PBDBLASAUX pbdtran.f pbdmatadd.f pbdtrsrt.f pbdtrget.f pbdtrnv.f pbdvecadd.f pbdtrst1.f) set (PBZBLASAUX pbztran.f pbzmatadd.f pbztrsrt.f pbztrget.f pbztrnv.f pbzvecadd.f pbztrst1.f) set(pbblas ${PBSBLASAUX} ${PBCBLASAUX} ${PBDBLASAUX} ${PBZBLASAUX}) scalapack-2.0.2/PBLAS/SRC/PBBLAS/Makefile000644 000766 000024 00000004545 11654025546 017624 0ustar00juliestaff000000 000000 ############################################################################ # # Program: PBLAS -- (version 2.0 beta) # # Module: Makefile # # Purpose: PB-BLAS remaining source Makefile # # Creation date: August 24, 1997 # # Modified: February 15, 2000 # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../../SLmake.inc ############################################################################ # # The library can be set up to include routines for any combination # of the four precisions. First, modify the ../../../SLmake.inc file # definitions to match your compiler and the options to be used. # Then to create or add to the library, enter make followed by one or # more of the precisions desired. Some examples: # make single # make single complex # make single double complex complex16 # Alternatively, the command # make # without any arguments creates a library of all four precisions. # # To remove the object files after the library is created, enter # make clean # ############################################################################ all: single double complex complex16 PBSBLASAUX = pbstran.o pbsmatadd.o pbstrsrt.o pbstrget.o \ pbstrnv.o pbsvecadd.o pbstrst1.o PBCBLASAUX = pbctran.o pbcmatadd.o pbctrsrt.o pbctrget.o \ pbctrnv.o pbcvecadd.o pbctrst1.o PBDBLASAUX = pbdtran.o pbdmatadd.o pbdtrsrt.o pbdtrget.o \ pbdtrnv.o pbdvecadd.o pbdtrst1.o PBZBLASAUX = pbztran.o pbzmatadd.o pbztrsrt.o pbztrget.o \ pbztrnv.o pbzvecadd.o pbztrst1.o #--------------------------------------------------------------------------- single: $(PBSBLASAUX) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(PBSBLASAUX) $(RANLIB) ../../../$(SCALAPACKLIB) double: $(PBDBLASAUX) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(PBDBLASAUX) $(RANLIB) ../../../$(SCALAPACKLIB) complex: $(PBCBLASAUX) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(PBCBLASAUX) $(RANLIB) ../../../$(SCALAPACKLIB) complex16: $(PBZBLASAUX) $(ARCH) $(ARCHFLAGS) ../../../$(SCALAPACKLIB) $(PBZBLASAUX) $(RANLIB) ../../../$(SCALAPACKLIB) #--------------------------------------------------------------------------- clean : rm -f *.o .f.o : ; $(FC) -c $(FCFLAGS) $*.f scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbcmatadd.f000644 000766 000024 00000034105 10363532303 020233 0ustar00juliestaff000000 000000 SUBROUTINE PBCMATADD( ICONTXT, MODE, M, N, ALPHA, A, LDA, BETA, B, $ LDB ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 MODE INTEGER ICONTXT, LDA, LDB, M, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBCMATADD performs the matrix add operation B := alpha*A + beta*B, * where alpha and beta are scalars, and A and B are m-by-n * upper/lower trapezoidal matrices, or rectangular matrices. * * Arguments * ========= * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * MODE (input) CHARACTER*1 * Specifies the part of the matrix A, or (conjugate) transposed * matrix A to be added to the matrix B, * = 'U': Upper triangular part * up(B) = alpha*up(A) + beta*up(B) * = 'L': Lower triangular part * lo(B) = alpha*lo(A) + beta*lo(B) * = 'T': Transposed matrix A * B = alpha*A**T + beta*B * = 'C': Conjugate transposed matrix A * B = alpha*A**H + beta*B * Otherwise: B = alpha*A + beta*B * if M = LDA = LDB: use one BLAS loop * if MODE = 'V' : columnwise copy using BLAS if possible * else : use double loops * * M (input) INTEGER * M specifies the number of columns of the matrix A if * MODE != 'T'/'C', and it specifies the number of rows of the * matrix A otherwise. It also specifies the number of rows of * the matrix B. M >= 0. * * N (input) INTEGER * N specifies the number of rows of the matrix A if * MODE != 'T'/'C', and it specifies the number of columns of * the matrix A otherwise. It also specifies the number of * columns of the matrix B. N >= 0. * * ALPHA (input) COMPLEX * ALPHA specifies the scalar alpha. * * A (input) COMPLEX array, dimension (LDA,N) * The m by n matrix A if MODE != 'T'/'C'. * If MODE = 'U', only the upper triangle or trapezoid is * accessed; if MODE = 'L', only the lower triangle or * trapezoid is accessed. Otherwise all m-by-n data matrix * is accessed. * And the n by m matrix A if MODE = 'T'/'C'. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M) if * MODE != 'T'/'C'. And LDA >= max(1,N) if MODE = 'T'/'C'. * * BETA (input) COMPLEX * BETA specifies the scalar beta. * * B (input) COMPLEX array, dimension (LDB,N) * On exit, B = alpha*A + beta*B * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,M). * * ===================================================================== * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), $ ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL CSCAL, CCOPY, CAXPY * .. * .. Intrinsic Functions .. INTRINSIC MIN, CONJG * .. * .. Executable Statements .. * IF( M.LE.0 .OR. N.LE.0 .OR. ( ALPHA.EQ.ZERO.AND.BETA.EQ.ONE ) ) $ RETURN * * A is upper triangular or upper trapezoidal, * IF( LSAME( MODE, 'U' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, MIN( J, M ) B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, MIN( J, M ) B( I, J ) = BETA * B( I, J ) 30 CONTINUE 40 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 60 J = 1, N DO 50 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) 50 CONTINUE 60 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 80 J = 1, N DO 70 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) + B( I, J ) 70 CONTINUE 80 CONTINUE ELSE DO 100 J = 1, N DO 90 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 120 J = 1, N DO 110 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) 110 CONTINUE 120 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 140 J = 1, N DO 130 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 130 CONTINUE 140 CONTINUE ELSE DO 160 J = 1, N DO 150 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 150 CONTINUE 160 CONTINUE END IF END IF * * A is lower triangular or upper trapezoidal, * ELSE IF( LSAME( MODE, 'L' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, N DO 170 I = J, M B( I, J ) = ZERO 170 CONTINUE 180 CONTINUE ELSE DO 200 J = 1, N DO 190 I = J, M B( I, J ) = BETA * B( I, J ) 190 CONTINUE 200 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 220 J = 1, N DO 210 I = J, M B( I, J ) = A( I, J ) 210 CONTINUE 220 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 240 J = 1, N DO 230 I = J, M B( I, J ) = A( I, J ) + B( I, J ) 230 CONTINUE 240 CONTINUE ELSE DO 260 J = 1, N DO 250 I = J, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 280 J = 1, N DO 270 I = J, M B( I, J ) = ALPHA * A( I, J ) 270 CONTINUE 280 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 300 J = 1, N DO 290 I = J, M B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 290 CONTINUE 300 CONTINUE ELSE DO 320 J = 1, N DO 310 I = J, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 310 CONTINUE 320 CONTINUE END IF END IF * * If MODE = 'Transpose' * ELSE IF( LSAME( MODE, 'T' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 340 J = 1, N DO 330 I = 1, M B( I, J ) = ZERO 330 CONTINUE 340 CONTINUE ELSE DO 360 J = 1, N DO 350 I = 1, M B( I, J ) = BETA * B( I, J ) 350 CONTINUE 360 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 380 J = 1, N DO 370 I = 1, M B( I, J ) = A( J, I ) 370 CONTINUE 380 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 400 J = 1, N DO 390 I = 1, M B( I, J ) = A( J, I ) + B( I, J ) 390 CONTINUE 400 CONTINUE ELSE DO 420 J = 1, N DO 410 I = 1, M B( I, J ) = A( J, I ) + BETA * B( I, J ) 410 CONTINUE 420 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 440 J = 1, N DO 430 I = 1, M B( I, J ) = ALPHA * A( J, I ) 430 CONTINUE 440 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 460 J = 1, N DO 450 I = 1, M B( I, J ) = ALPHA * A( J, I ) + B( I, J ) 450 CONTINUE 460 CONTINUE ELSE DO 480 J = 1, N DO 470 I = 1, M B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 470 CONTINUE 480 CONTINUE END IF END IF * * If MODE = 'Conjugate', * ELSE IF( LSAME( MODE, 'C' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 500 J = 1, N DO 490 I = 1, M B( I, J ) = ZERO 490 CONTINUE 500 CONTINUE ELSE DO 520 J = 1, N DO 510 I = 1, M B( I, J ) = BETA * B( I, J ) 510 CONTINUE 520 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 540 J = 1, N DO 530 I = 1, M B( I, J ) = CONJG( A( J, I ) ) 530 CONTINUE 540 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 560 J = 1, N DO 550 I = 1, M B( I, J ) = CONJG( A( J, I ) ) + B( I, J ) 550 CONTINUE 560 CONTINUE ELSE DO 580 J = 1, N DO 570 I = 1, M B( I, J ) = CONJG( A( J, I ) ) + BETA * B( I, J ) 570 CONTINUE 580 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 600 J = 1, N DO 590 I = 1, M B( I, J ) = ALPHA * CONJG( A( J, I ) ) 590 CONTINUE 600 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 620 J = 1, N DO 610 I = 1, M B( I, J ) = ALPHA * CONJG( A( J, I ) ) + B( I, J ) 610 CONTINUE 620 CONTINUE ELSE DO 640 J = 1, N DO 630 I = 1, M B( I, J ) = ALPHA * CONJG( A( J, I ) ) $ + BETA * B( I, J ) 630 CONTINUE 640 CONTINUE END IF END IF * * Other cases (for genral matrix additions) * ELSE IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 660 J = 1, N DO 650 I = 1, M B( I, J ) = ZERO 650 CONTINUE 660 CONTINUE * ELSE IF( M.EQ.LDB ) THEN CALL CSCAL( M*N, BETA, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 670 J = 1, N CALL CSCAL( M, BETA, B( 1, J ), 1 ) 670 CONTINUE ELSE DO 690 J = 1, N DO 680 I = 1, M B( I, J ) = BETA * B( I, J ) 680 CONTINUE 690 CONTINUE END IF END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( M.EQ.LDA .AND. M.EQ.LDB ) THEN CALL CCOPY( M*N, A( 1, 1 ), 1, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 700 J = 1, N CALL CCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) 700 CONTINUE ELSE DO 720 J = 1, N DO 710 I = 1, M B( I, J ) = A( I, J ) 710 CONTINUE 720 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN DO 740 J = 1, N DO 730 I = 1, M B( I, J ) = A( I, J ) + B( I, J ) 730 CONTINUE 740 CONTINUE * ELSE DO 760 J = 1, N DO 750 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 750 CONTINUE 760 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 780 J = 1, N DO 770 I = 1, M B( I, J ) = ALPHA * A( I, J ) 770 CONTINUE 780 CONTINUE * ELSE IF( BETA.EQ.ONE ) THEN IF( M.EQ.LDA .AND. M.EQ.LDB ) THEN CALL CAXPY( M*N, ALPHA, A( 1, 1 ), 1, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 790 J = 1, N CALL CAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) 790 CONTINUE ELSE DO 810 J = 1, N DO 800 I = 1, M B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 800 CONTINUE 810 CONTINUE END IF * ELSE DO 830 J = 1, N DO 820 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 820 CONTINUE 830 CONTINUE END IF END IF END IF * RETURN * * End of PBCMATADD * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbctran.f000644 000766 000024 00000071201 10363532303 017743 0ustar00juliestaff000000 000000 SUBROUTINE PBCTRAN( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, BETA, $ C, LDC, IAROW, IACOL, ICROW, ICCOL, WORK ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * Jaeyoung Choi, Oak Ridge National Laboratory * Jack Dongarra, University of Tennessee and Oak Ridge National Lab. * David Walker, Oak Ridge National Laboratory * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER IACOL, IAROW, ICCOL, ICONTXT, ICROW, LDA, LDC, $ M, N, NB COMPLEX BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), C( LDC, * ), WORK( * ) * .. * * Purpose * ======= * * PBCTRAN transposes a column block to row block, or a row block to * column block by reallocating data distribution. * * C := A^T + beta*C, or C := A^C + beta*C * * where A is an M-by-N matrix and C is an N-by-M matrix, and the size * of M or N is limited to its block size NB. * * The first elements of the matrices A, and C should be located at * the beginnings of their first blocks. (not the middle of the blocks.) * * Parameters * ========== * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * ADIST - (input) CHARACTER*1 * ADIST specifies whether A is a column block or a row block. * * ADIST = 'C', A is a column block * ADIST = 'R', A is a row block * * TRANS - (input) CHARACTER*1 * TRANS specifies whether the transposed format is transpose * or conjugate transpose. If the matrices A and C are real, * the argument is ignored. * * TRANS = 'T', transpose * TRANS = 'C', conjugate transpose * * M - (input) INTEGER * M specifies the (global) number of rows of the matrix (block * column or block row) A and of columns of the matrix C. * M >= 0. * * N - (input) INTEGER * N specifies the (global) number of columns of the matrix * (block column or block row) A and of columns of the matrix * C. N >= 0. * * NB - (input) INTEGER * NB specifies the column block size of the matrix A and the * row block size of the matrix C when ADIST = 'C'. Otherwise, * it specifies the row block size of the matrix A and the * column block size of the matrix C. NB >= 1. * * A (input) COMPLEX array of DIMENSION ( LDA, Lx ), * where Lx is N when ADIST = 'C', or Nq when ADIST = 'R'. * Before entry with ADIST = 'C', the leading Mp by N part of * the array A must contain the matrix A, otherwise the leading * M by Nq part of the array A must contain the matrix A. See * parameter details for the values of Mp and Nq. * * LDA (input) INTEGER * LDA specifies the leading dimension of (local) A as declared * in the calling (sub) program. LDA >= MAX(1,Mp) when * ADIST = 'C', or LDA >= MAX(1,M) otherwise. * * BETA (input) COMPLEX * BETA specifies scaler beta. * * C (input/output) COMPLEX array of DIMENSION ( LDC, Lx ), * where Lx is Mq when ADIST = 'C', or N when ADIST = 'R'. * If ADIST = 'C', the leading N-by-Mq part of the array C * contains the (local) matrix C, otherwise the leading * Np-by-M part of the array C must contain the (local) matrix * C. C will not be referenced if beta is zero. * * LDC (input) INTEGER * LDC specifies the leading dimension of (local) C as declared * in the calling (sub) program. LDC >= MAX(1,N) when ADIST='C', * or LDC >= MAX(1,Np) otherwise. * * IAROW (input) INTEGER * IAROW specifies a row of the process template, * which holds the first block of the matrix A. If A is a row * of blocks (ADIST = 'R') and all rows of processes have a copy * of A, then set IAROW = -1. * * IACOL (input) INTEGER * IACOL specifies a column of the process template, * which holds the first block of the matrix A. If A is a * column of blocks (ADIST = 'C') and all columns of processes * have a copy of A, then set IACOL = -1. * * ICROW (input) INTEGER * ICROW specifies the current row process which holds * the first block of the matrix C, which is transposed of A. * If C is a row of blocks (ADIST = 'C') and the transposed * row block C is distributed all rows of processes, set * ICROW = -1. * * ICCOL (input) INTEGER * ICCOL specifies the current column process which holds * the first block of the matrix C, which is transposed of A. * If C is a column of blocks (ADIST = 'R') and the transposed * column block C is distributed all columns of processes, * set ICCOL = -1. * * WORK (workspace) COMPLEX array of dimension Size(WORK). * It needs extra working space of A'. * * Parameters Details * ================== * * Lx It is a local portion of L owned by a process, (L is * replaced by M, or N, and x is replaced by either p (=NPROW) * or q (=NPCOL)). The value is determined by L, LB, x, and * MI, where LB is a block size and MI is a row or column * position in a process template. Lx is equal to or less * than Lx0 = CEIL( L, LB*x ) * LB. * * Communication Scheme * ==================== * * The communication scheme of the routine is set to '1-tree', which is * fan-out. (For details, see BLACS user's guide.) * * Memory Requirement of WORK * ========================== * * Mqb = CEIL( M, NB*NPCOL ) * Npb = CEIL( N, NB*NPROW ) * LCMQ = LCM / NPCOL * LCMP = LCM / NPROW * * (1) ADIST = 'C' * (a) IACOL != -1 * Size(WORK) = N * CEIL(Mqb,LCMQ)*NB * (b) IACOL = -1 * Size(WORK) = N * CEIL(Mqb,LCMQ)*NB * MIN(LCMQ,CEIL(M,NB)) * * (2) ADIST = 'R' * (a) IAROW != -1 * Size(WORK) = M * CEIL(Npb,LCMP)*NB * (b) IAROW = -1 * Size(WORK) = M * CEIL(Npb,LCMP)*NB * MIN(LCMP,CEIL(N,NB)) * * Notes * ----- * More precise space can be computed as * * CEIL(Mqb,LCMQ)*NB => NUMROC( NUMROC(M,NB,0,0,NPCOL), NB, 0, 0, LCMQ ) * CEIL(Npb,LCMP)*NB => NUMROC( NUMROC(N,NB,0,0,NPROW), NB, 0, 0, LCMP ) * * ===================================================================== * * .. * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL COLFORM, ROWFORM INTEGER I, IDEX, IGD, INFO, JCCOL, JCROW, JDEX, LCM, $ LCMP, LCMQ, MCCOL, MCROW, ML, MP, MQ, MQ0, $ MRCOL, MRROW, MYCOL, MYROW, NP, NP0, NPCOL, $ NPROW, NQ COMPLEX TBETA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, ICEIL, NUMROC EXTERNAL ILCM, ICEIL, LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, CGERV2D, $ CGESD2D, PBCMATADD, PBCTR2AF, PBCTR2AT, $ PBCTR2BT, PBCTRGET, PBCTRSRT, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible. * IF( M.EQ.0 .OR. N.EQ.0 ) RETURN * CALL BLACS_GRIDINFO( ICONTXT, NPROW, NPCOL, MYROW, MYCOL ) * COLFORM = LSAME( ADIST, 'C' ) ROWFORM = LSAME( ADIST, 'R' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.COLFORM ) .AND. ( .NOT.ROWFORM ) ) THEN INFO = 2 ELSE IF( M .LT.0 ) THEN INFO = 4 ELSE IF( N .LT.0 ) THEN INFO = 5 ELSE IF( NB.LT.1 ) THEN INFO = 6 ELSE IF( IAROW.LT.-1 .OR. IAROW.GE.NPROW .OR. $ ( IAROW.EQ.-1 .AND. COLFORM ) ) THEN INFO = 12 ELSE IF( IACOL.LT.-1 .OR. IACOL.GE.NPCOL .OR. $ ( IACOL.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 13 ELSE IF( ICROW.LT.-1 .OR. ICROW.GE.NPROW .OR. $ ( ICROW.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 14 ELSE IF( ICCOL.LT.-1 .OR. ICCOL.GE.NPCOL .OR. $ ( ICCOL.EQ.-1 .AND. COLFORM ) ) THEN INFO = 15 END IF * 10 CONTINUE IF( INFO .NE. 0 ) THEN CALL PXERBLA( ICONTXT, 'PBCTRAN ', INFO ) RETURN END IF * * Start the operations. * * LCM : the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LCMQ = LCM / NPCOL IGD = NPCOL / LCMP * * When A is a column block * IF( COLFORM ) THEN * * Form C <== A' ( A is a column block ) * _ * | | * | | * _____________ | | * |______C______| <== |A| * | | * | | * |_| * * MRROW : row relative position in template from IAROW * MRCOL : column relative position in template from ICCOL * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-ICCOL, NPCOL ) JCROW = ICROW IF( ICROW.EQ.-1 ) JCROW = IAROW * MP = NUMROC( M, NB, MYROW, IAROW, NPROW ) MQ = NUMROC( M, NB, MYCOL, ICCOL, NPCOL ) MQ0 = NUMROC( NUMROC(M, NB, 0, 0, NPCOL), NB, 0, 0, LCMQ ) * IF( LDA.LT.MP .AND. $ ( IACOL.EQ.MYCOL .OR. IACOL.EQ.-1 ) ) THEN INFO = 8 ELSE IF( LDC.LT.N .AND. $ ( ICROW.EQ.MYROW .OR. ICROW.EQ.-1 ) ) THEN INFO = 11 END IF IF( INFO.NE.0 ) GO TO 10 * * When a column process of IACOL has a column block A, * IF( IACOL.GE.0 ) THEN TBETA = ZERO IF( MYROW.EQ.JCROW ) TBETA = BETA * DO 20 I = 0, MIN( LCM, ICEIL(M,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IAROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + ICCOL, NPCOL ) IF( LCMQ.EQ.1 ) MQ0 = NUMROC( M, NB, I, 0, NPCOL ) JDEX = (I/NPCOL) * NB * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.IACOL ) THEN * * The source node is a destination node * IDEX = (I/NPROW) * NB IF( MYROW.EQ.JCROW .AND. MYCOL.EQ.MCCOL ) THEN CALL PBCTR2AT( ICONTXT, 'Col', TRANS, MP-IDEX, N, NB, $ A(IDEX+1,1), LDA, TBETA, C(1,JDEX+1), $ LDC, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBCTR2BT( ICONTXT, 'Col', TRANS, MP-IDEX, N, NB, $ A(IDEX+1,1), LDA, ZERO, WORK, N, $ LCMP*NB ) CALL CGESD2D( ICONTXT, N, MQ0, WORK, N, JCROW, MCCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.JCROW .AND. MYCOL.EQ.MCCOL ) THEN IF( LCMQ.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL CGERV2D( ICONTXT, N, MQ0, C, LDC, MCROW, IACOL ) ELSE CALL CGERV2D( ICONTXT, N, MQ0, WORK, N, MCROW, IACOL ) CALL PBCTR2AF( ICONTXT, 'Row', N, MQ-JDEX, NB, WORK, N, $ TBETA, C(1,JDEX+1), LDC, LCMP, LCMQ, $ MQ0 ) END IF END IF 20 CONTINUE * * Broadcast a row block of C in each column of template * IF( ICROW.EQ.-1 ) THEN IF( MYROW.EQ.JCROW ) THEN CALL CGEBS2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC ) ELSE CALL CGEBR2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC, $ JCROW, MYCOL ) END IF END IF * * When all column procesors have a copy of the column block A, * ELSE IF( LCMQ.EQ.1 ) MQ0 = MQ * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * DO 30 I = 0, LCMP-1 IF( MRCOL.EQ.MOD( NPROW*I+MRROW, NPCOL ) ) THEN IF( LCMQ.EQ.1.AND.(ICROW.EQ.-1.OR.ICROW.EQ.MYROW) ) THEN CALL PBCTR2BT( ICONTXT, 'Col', TRANS, MP-I*NB, N, NB, $ A(I*NB+1,1), LDA, BETA, C, LDC, $ LCMP*NB ) ELSE CALL PBCTR2BT( ICONTXT, 'Col', TRANS, MP-I*NB, N, NB, $ A(I*NB+1,1), LDA, ZERO, WORK, N, $ LCMP*NB ) END IF END IF 30 CONTINUE * * Get diagonal blocks of A for each column of the template * MCROW = MOD( MOD(MRCOL,NPROW)+IAROW, NPROW ) IF( LCMQ.GT.1 ) THEN MCCOL = MOD( NPCOL+MYCOL-ICCOL, NPCOL ) CALL PBCTRGET( ICONTXT, 'Row', N, MQ0, ICEIL(M,NB), WORK, N, $ MCROW, MCCOL, IGD, MYROW, MYCOL, NPROW, $ NPCOL ) END IF * * Broadcast a row block of WORK in every row of template * IF( ICROW.EQ.-1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( LCMQ.GT.1 ) $ CALL PBCTRSRT( ICONTXT, 'Row', N, MQ, NB, WORK, N, BETA, $ C, LDC, LCMP, LCMQ, MQ0 ) CALL CGEBS2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC ) ELSE CALL CGEBR2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC, $ MCROW, MYCOL ) END IF * * Send a row block of WORK to the destination row * ELSE IF( LCMQ.EQ.1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.ICROW ) $ CALL CGESD2D( ICONTXT, N, MQ, WORK, N, ICROW, MYCOL ) ELSE IF( MYROW.EQ.ICROW ) THEN IF( BETA.EQ.ZERO ) THEN CALL CGERV2D( ICONTXT, N, MQ, C, LDC, MCROW, MYCOL ) ELSE CALL CGERV2D( ICONTXT, N, MQ, WORK, N, MCROW, MYCOL ) CALL PBCMATADD( ICONTXT, 'G', N, MQ, ONE, WORK, N, $ BETA, C, LDC ) END IF END IF * ELSE ML = MQ0 * MIN( LCMQ, MAX(0,ICEIL(M,NB)-MCCOL) ) IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.ICROW ) $ CALL CGESD2D( ICONTXT, N, ML, WORK, N, ICROW, MYCOL ) ELSE IF( MYROW.EQ.ICROW ) THEN CALL CGERV2D( ICONTXT, N, ML, WORK, N, MCROW, MYCOL ) END IF * IF( MYROW.EQ.ICROW ) $ CALL PBCTRSRT( ICONTXT, 'Row', N, MQ, NB, WORK, N, BETA, $ C, LDC, LCMP, LCMQ, MQ0 ) END IF END IF * END IF * * When A is a row block * ELSE * * Form C <== A' ( A is a row block ) * _ * | | * | | * | | _____________ * |C| <== |______A______| * | | * | | * |_| * * MRROW : row relative position in template from ICROW * MRCOL : column relative position in template from IACOL * MRROW = MOD( NPROW+MYROW-ICROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) JCCOL = ICCOL IF( ICCOL.EQ.-1 ) JCCOL = IACOL * NP = NUMROC( N, NB, MYROW, ICROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) NP0 = NUMROC( NUMROC(N, NB, 0, 0, NPROW), NB, 0, 0, LCMP ) * IF( LDA.LT.M .AND. $ ( IAROW.EQ.MYROW .OR. IAROW.EQ.-1 ) ) THEN INFO = 8 ELSE IF( LDC.LT.NP .AND. $ ( ICCOL.EQ.MYCOL .OR. ICCOL.EQ.-1 ) ) THEN INFO = 11 END IF IF( INFO.NE.0 ) GO TO 10 * * When a row process of IAROW has a row block A, * IF( IAROW.GE.0 ) THEN TBETA = ZERO IF( MYCOL.EQ.JCCOL ) TBETA = BETA * DO 40 I = 0, MIN( LCM, ICEIL(N,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + ICROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IACOL, NPCOL ) IF( LCMP.EQ.1 ) NP0 = NUMROC( N, NB, I, 0, NPROW ) IDEX = (I/NPROW) * NB * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.MCCOL ) THEN * * The source node is a destination node * JDEX = (I/NPCOL) * NB IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JCCOL ) THEN CALL PBCTR2AT( ICONTXT, 'Row', TRANS, M, NQ-JDEX, NB, $ A(1,JDEX+1), LDA, TBETA, C(IDEX+1,1), $ LDC, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBCTR2BT( ICONTXT, 'Row', TRANS, M, NQ-JDEX, NB, $ A(1,JDEX+1), LDA, ZERO, WORK, NP0, $ LCMQ*NB ) CALL CGESD2D( ICONTXT, NP0, M, WORK, NP0, $ MCROW, JCCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JCCOL ) THEN IF( LCMP.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL CGERV2D( ICONTXT, NP0, M, C, LDC, IAROW, MCCOL ) ELSE CALL CGERV2D( ICONTXT, NP0, M, WORK, NP0, IAROW, MCCOL ) CALL PBCTR2AF( ICONTXT, 'Col', NP-IDEX, M, NB, WORK, $ NP0, TBETA, C(IDEX+1,1), LDC, LCMP, LCMQ, $ NP0 ) END IF END IF 40 CONTINUE * * Broadcast a column block of WORK in each row of template * IF( ICCOL.EQ.-1 ) THEN IF( MYCOL.EQ.JCCOL ) THEN CALL CGEBS2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC ) ELSE CALL CGEBR2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC, $ MYROW, JCCOL ) END IF END IF * * When all row procesors have a copy of the row block A, * ELSE IF( LCMP.EQ.1 ) NP0 = NP * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * DO 50 I = 0, LCMQ-1 IF( MRROW.EQ.MOD(NPCOL*I+MRCOL, NPROW) ) THEN IF( LCMP.EQ.1.AND.(ICCOL.EQ.-1.OR.ICCOL.EQ.MYCOL) ) THEN CALL PBCTR2BT( ICONTXT, 'Row', TRANS, M, NQ-I*NB, NB, $ A(1,I*NB+1), LDA, BETA, C, LDC, $ LCMQ*NB ) ELSE CALL PBCTR2BT( ICONTXT, 'Row', TRANS, M, NQ-I*NB, NB, $ A(1,I*NB+1), LDA, ZERO, WORK, NP0, $ LCMQ*NB ) END IF END IF 50 CONTINUE * * Get diagonal blocks of A for each row of the template * MCCOL = MOD( MOD(MRROW, NPCOL)+IACOL, NPCOL ) IF( LCMP.GT.1 ) THEN MCROW = MOD( NPROW+MYROW-ICROW, NPROW ) CALL PBCTRGET( ICONTXT, 'Col', NP0, M, ICEIL(N,NB), WORK, $ NP0, MCROW, MCCOL, IGD, MYROW, MYCOL, NPROW, $ NPCOL ) END IF * * Broadcast a column block of WORK in every column of template * IF( ICCOL.EQ.-1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( LCMP.GT.1 ) $ CALL PBCTRSRT( ICONTXT, 'Col', NP, M, NB, WORK, NP0, $ BETA, C, LDC, LCMP, LCMQ, NP0 ) CALL CGEBS2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC ) ELSE CALL CGEBR2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC, $ MYROW, MCCOL ) END IF * * Send a column block of WORK to the destination column * ELSE IF( LCMP.EQ.1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.ICCOL ) $ CALL CGESD2D( ICONTXT, NP, M, WORK, NP, MYROW, ICCOL ) ELSE IF( MYCOL.EQ.ICCOL ) THEN IF( BETA.EQ.ZERO ) THEN CALL CGERV2D( ICONTXT, NP, M, C, LDC, MYROW, MCCOL ) ELSE CALL CGERV2D( ICONTXT, NP, M, WORK, NP, MYROW, MCCOL ) CALL PBCMATADD( ICONTXT, 'G', NP, M, ONE, WORK, NP, $ BETA, C, LDC ) END IF END IF * ELSE ML = M * MIN( LCMP, MAX( 0, ICEIL(N,NB) - MCROW ) ) IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.ICCOL ) $ CALL CGESD2D( ICONTXT, NP0, ML, WORK, NP0, $ MYROW, ICCOL ) ELSE IF( MYCOL.EQ.ICCOL ) THEN CALL CGERV2D( ICONTXT, NP0, ML, WORK, NP0, $ MYROW, MCCOL ) END IF * IF( MYCOL.EQ.ICCOL ) $ CALL PBCTRSRT( ICONTXT, 'Col', NP, M, NB, WORK, NP0, $ BETA, C, LDC, LCMP, LCMQ, NP0 ) END IF END IF * END IF END IF * RETURN * * End of PBCTRAN * END * *======================================================================= * SUBROUTINE PBCTR2AT *======================================================================= * SUBROUTINE PBCTR2AT( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, $ BETA, B, LDB, LCMP, LCMQ ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER ICONTXT, LCMP, LCMQ, LDA, LDB, M, N, NB COMPLEX BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBCTR2AT forms B <== A^T + beta*B, or A^C + beta*B * B is a ((conjugate) transposed) scattered block row (or column), * copied from a scattered block column (or row) of A * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER IA, IB, K, INTV, JNTV * .. * .. External Subroutines .. EXTERNAL PBCMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Excutable Statements .. * IF( LCMP.EQ.LCMQ ) THEN CALL PBCMATADD( ICONTXT, TRANS, N, M, ONE, A, LDA, BETA, B, $ LDB ) * ELSE * * If A is a column block ( ADIST = 'C' ), * IF( LSAME( ADIST, 'C' ) ) THEN INTV = LCMP * NB JNTV = LCMQ * NB IA = 1 IB = 1 DO 10 K = 1, ICEIL( M, INTV ) CALL PBCMATADD( ICONTXT, TRANS, N, MIN( M-IA+1, NB ), $ ONE, A(IA,1), LDA, BETA, B(1,IB), LDB ) IA = IA + INTV IB = IB + JNTV 10 CONTINUE * * If A is a row block ( ADIST = 'R' ), * ELSE INTV = LCMP * NB JNTV = LCMQ * NB IA = 1 IB = 1 DO 20 K = 1, ICEIL( N, JNTV ) CALL PBCMATADD( ICONTXT, TRANS, MIN( N-IA+1, NB ), M, $ ONE, A(1,IA), LDA, BETA, B(IB,1), LDB ) IA = IA + JNTV IB = IB + INTV 20 CONTINUE END IF END IF * RETURN * * End of PBCTR2AT * END * *======================================================================= * SUBROUTINE PBCTR2BT *======================================================================= * SUBROUTINE PBCTR2BT( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, $ BETA, B, LDB, INTV ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER ICONTXT, INTV, LDA, LDB, M, N, NB COMPLEX BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBCTR2BT forms T <== A^T + beta*T or A^C + beta*T, where T is a * ((conjugate) transposed) condensed block row (or column), copied from * a scattered block column (or row) of A * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER IA, IB, K * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. External Subroutines .. EXTERNAL PBCMATADD * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Excutable Statements .. * IF( INTV.EQ.NB ) THEN CALL PBCMATADD( ICONTXT, TRANS, N, M, ONE, A, LDA, BETA, B, $ LDB ) * ELSE * * If A is a column block ( ADIST = 'C' ), * IF( LSAME( ADIST, 'C' ) ) THEN IA = 1 IB = 1 DO 10 K = 1, ICEIL( M, INTV ) CALL PBCMATADD( ICONTXT, TRANS, N, MIN( M-IA+1, NB ), $ ONE, A(IA,1), LDA, BETA, B(1,IB), LDB ) IA = IA + INTV IB = IB + NB 10 CONTINUE * * If A is a row block (ADIST = 'R'), * ELSE IA = 1 IB = 1 DO 20 K = 1, ICEIL( N, INTV ) CALL PBCMATADD( ICONTXT, TRANS, MIN( N-IA+1, NB ), M, $ ONE, A(1,IA), LDA, BETA, B(IB,1), LDB ) IA = IA + INTV IB = IB + NB 20 CONTINUE END IF END IF * RETURN * * End of PBCTR2BT * END * *======================================================================= * SUBROUTINE PBCTR2AF *======================================================================= * SUBROUTINE PBCTR2AF( ICONTXT, ADIST, M, N, NB, A, LDA, BETA, B, $ LDB, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, M, N, NB, LDA, LDB, LCMP, LCMQ, NINT COMPLEX BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBCTR2AF forms T <== A + BETA*T, where T is a scattered block * row (or column) copied from a (condensed) block column (or row) of A * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER JA, JB, K, INTV * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( LSAME( ADIST, 'R' ) ) THEN INTV = NB * LCMQ JA = 1 JB = 1 DO 10 K = 1, ICEIL( NINT, NB ) CALL PBCMATADD( ICONTXT, 'G', M, MIN( N-JB+1, NB ), ONE, $ A(1,JA), LDA, BETA, B(1,JB), LDB ) JA = JA + NB JB = JB + INTV 10 CONTINUE * * if( LSAME( ADIST, 'C' ) ) then * ELSE INTV = NB * LCMP JA = 1 JB = 1 DO 20 K = 1, ICEIL( NINT, NB ) CALL PBCMATADD( ICONTXT, 'G', MIN( M-JB+1, NB ), N, ONE, $ A(JA,1), LDA, BETA, B(JB,1), LDB ) JA = JA + NB JB = JB + INTV 20 CONTINUE END IF * RETURN * * End of PBCTR2AF * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbctrget.f000644 000766 000024 00000010033 10363532303 020120 0ustar00juliestaff000000 000000 SUBROUTINE PBCTRGET( ICONTXT, ADIST, M, N, MNB, A, LDA, MCROW, $ MCCOL, IGD, MYROW, MYCOL, NPROW, NPCOL ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, IGD, LDA, M, MCCOL, MCROW, MNB, MYCOL, $ MYROW, N, NPCOL, NPROW * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * PBCTRGET forms a row block of A from scattered row subblocks if * ADIST = 'R', or forms a column block of A from scattered column * subblocks, if ADIST = 'C'. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. Local Variables .. INTEGER KINT, KINT2, KLEN, KMOD, KPPOS, NLEN, NNUM, $ NTLEN REAL TEMP * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL LSAME, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL CGERV2D, CGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * * if A is a row block, it needs to communicate columnwise. * IF( LSAME( ADIST, 'R' ) ) THEN KPPOS = MOD( NPROW+MYROW-MCROW, NPROW ) IF( MOD( KPPOS, IGD ).EQ.0 ) THEN KINT = IGD NLEN = N NNUM = MIN( NPROW/IGD, MNB-MCCOL ) TEMP = REAL( NNUM ) NTLEN = N * NNUM NNUM = IGD * NNUM IF( KPPOS.GE.NNUM ) GO TO 30 KPPOS = MOD( KPPOS, NPROW ) * 10 CONTINUE IF( TEMP.GT.ONE ) THEN KINT2 = 2 * KINT KMOD = MOD( KPPOS, KINT2 ) * IF( KMOD.EQ.0 ) THEN IF( KPPOS+KINT.LT.NNUM ) THEN KLEN = NTLEN - (KPPOS/KINT2)*(KINT2/IGD)*N KLEN = MIN( KLEN-NLEN, NLEN ) CALL CGERV2D( ICONTXT, M, KLEN, A(1,NLEN+1), LDA, $ MOD(MYROW+KINT, NPROW), MYCOL ) NLEN = NLEN + KLEN END IF ELSE CALL CGESD2D( ICONTXT, M, NLEN, A, LDA, $ MOD(NPROW+MYROW-KINT, NPROW), MYCOL ) GO TO 30 END IF * KINT = KINT2 TEMP = TEMP / TWO GO TO 10 END IF END IF * * if A is a column block, it needs to communicate rowwise. * ELSE IF( LSAME( ADIST, 'C' ) ) THEN * KPPOS = MOD( NPCOL+MYCOL-MCCOL, NPCOL ) IF( MOD( KPPOS, IGD ).EQ.0 ) THEN KINT = IGD NLEN = N NNUM = MIN( NPCOL/IGD, MNB-MCROW ) TEMP = REAL( NNUM ) NTLEN = N * NNUM NNUM = IGD * NNUM IF( KPPOS.GE.NNUM ) GO TO 30 KPPOS = MOD( KPPOS, NPCOL ) * 20 CONTINUE IF( TEMP.GT.ONE ) THEN KINT2 = 2 * KINT KMOD = MOD( KPPOS, KINT2 ) * IF( KMOD.EQ.0 ) THEN IF( KPPOS+KINT.LT.NNUM ) THEN KLEN = NTLEN - (KPPOS/KINT2)*(KINT2/IGD)*N KLEN = MIN( KLEN-NLEN, NLEN ) CALL CGERV2D( ICONTXT, M, KLEN, A(1,NLEN+1), LDA, $ MYROW, MOD(MYCOL+KINT, NPCOL) ) NLEN = NLEN + KLEN END IF ELSE CALL CGESD2D( ICONTXT, M, NLEN, A, LDA, MYROW, $ MOD(NPCOL+MYCOL-KINT, NPCOL) ) GO TO 30 END IF * KINT = KINT2 TEMP = TEMP / TWO GO TO 20 END IF END IF END IF * 30 CONTINUE * RETURN * * End of PBCTRGET * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbctrnv.f000644 000766 000024 00000064750 10363532303 020003 0ustar00juliestaff000000 000000 SUBROUTINE PBCTRNV( ICONTXT, XDIST, TRANS, N, NB, NZ, X, INCX, $ BETA, Y, INCY, IXROW, IXCOL, IYROW, IYCOL, $ WORK ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * Jaeyoung Choi, Oak Ridge National Laboratory * Jack Dongarra, University of Tennessee and Oak Ridge National Lab. * David Walker, Oak Ridge National Laboratory * * .. Scalar Arguments .. CHARACTER*1 TRANS, XDIST INTEGER ICONTXT, INCX, INCY, IXCOL, IXROW, IYCOL, $ IYROW, N, NB, NZ COMPLEX BETA * .. * .. Array Arguments .. COMPLEX WORK( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PBCTRNV transposes a column vector to row vector, or a row vector to * column vector by reallocating data distribution. * * Y := X' * * where X and Y are N vectors. * * Parameters * ========== * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * XDIST (input) CHARACTER*1 * XDIST specifies whether X is a column vector or a row vector, * * XDIST = 'C', X is a column vector (distributed columnwise) * XDIST = 'R', X is a row vector (distributed rowwise) * * TRANS (input) CHARACTER*1 * TRANS specifies whether the transposed format is transpose * or conjugate transpose. If the vectors X and Y are real, * the argument is ignored. * * TRANS = 'T', transpose * TRANS = 'C', conjugate transpose * * N (input) INTEGER * N specifies the (global) number of the vector X and the * vector Y. N >= 0. * * NB (input) INTEGER * NB specifies the block size of vectors X and Y. NB >= 0. * * NZ (input) INTEGER * NZ is the column offset to specify the column distance from * the beginning of the block to the first element of the * vector X, and the row offset to the first element of the * vector Y if XDIST = 'C'. * Otherwise, it is row offset to specify the row distance * from the beginning of the block to the first element of the * vector X, and the column offset to the first element of the * vector Y. 0 < NZ <= NB. * * X (input) COMPLEX array of dimension at least * ( 1 + (Np-1) * abs(INCX)) in IXCOL if XDIST = 'C', or * ( 1 + (Nq-1) * abs(INCX)) in IXROW if XDIST = 'R'. * The incremented array X must contain the vector X. * * INCX (input) INTEGER * INCX specifies the increment for the elements of X. * INCX <> 0. * * BETA (input) COMPLEX * BETA specifies scaler beta. * * Y (input/output) COMPLEX array of dimension at least * ( 1 + (Nq-1) * abs(INCY)) in IYROW if XDIST = 'C', or * ( 1 + (Np-1) * abs(INCY)) in IYCOL if XDIST = 'R', or * The incremented array Y must contain the vector Y. * Y will not be referenced if beta is zero. * * INCY (input) INTEGER * INCY specifies the increment for the elements of Y. * INCY <> 0. * * IXROW (input) INTEGER * IXROW specifies a row of the process template, which holds * the first element of the vector X. If X is a row vector and * all rows of processes have a copy of X, then set IXROW = -1. * * IXCOL (input) INTEGER * IXCOL specifies a column of the process template, * which holds the first element of the vector X. If X is a * column block and all columns of processes have a copy of X, * then set IXCOL = -1. * * IYROW (input) INTEGER * IYROW specifies the current row process which holds the * first element of the vector Y, which is transposed of X. * If X is a column vector and the transposed row vector Y is * distributed all rows of processes, set IYROW = -1. * * IYCOL (input) INTEGER * IYCOL specifies the current column process which holds * the first element of the vector Y, which is transposed of Y. * If X is a row block and the transposed column vector Y is * distributed all columns of processes, set IYCOL = -1. * * WORK (workspace) COMPLEX array of dimension Size(WORK). * It needs extra working space of x**T or x**H. * * Parameters Details * ================== * * Nx It is a local portion of N owned by a process, where x is * replaced by either p (=NPROW) or q (=NPCOL)). The value is * determined by N, NB, NZ, x, and MI, where NB is a block size, * NZ is a offset from the beginning of the block, and MI is a * row or column position in a process template. Nx is equal * to or less than Nx0 = CEIL( N+NZ, NB*x ) * NB. * * Communication Scheme * ==================== * * The communication scheme of the routine is set to '1-tree', which is * fan-out. (For details, see BLACS user's guide.) * * Memory Requirement of WORK * ========================== * * NN = N + NZ * Npb = CEIL( NN, NB*NPROW ) * Nqb = CEIL( NN, NB*NPCOL ) * LCMP = LCM / NPROW * LCMQ = LCM / NPCOL * * (1) XDIST = 'C' * (a) IXCOL != -1 * Size(WORK) = CEIL(Nqb,LCMQ)*NB * (b) IXCOL = -1 * Size(WORK) = CEIL(Nqb,LCMQ)*NB * MIN(LCMQ,CEIL(NN,NB)) * * (2) XDIST = 'R' * (a) IXROW != -1 * Size(WORK) = CEIL(Npb,LCMP)*NB * (b) IXROW = -1 * Size(WORK) = CEIL(Npb,LCMP)*NB * MIN(LCMP,CEIL(NN,NB)) * * Notes * ----- * More precise space can be computed as * * CEIL(Npb,LCMP)*NB => NUMROC( NUMROC(NN,NB,0,0,NPROW), NB, 0, 0, LCMP) * CEIL(Nqb,LCMQ)*NB => NUMROC( NUMROC(NN,NB,0,0,NPCOL), NB, 0, 0, LCMQ) * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL COLFORM, ROWFORM INTEGER I, IDEX, IGD, INFO, JDEX, JYCOL, JYROW, JZ, KZ, $ LCM, LCMP, LCMQ, MCCOL, MCROW, MRCOL, MRROW, $ MYCOL, MYROW, NN, NP, NP0, NP1, NPCOL, NPROW, $ NQ, NQ0, NQ1 COMPLEX TBETA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, ICEIL, NUMROC EXTERNAL LSAME, ILCM, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGEBR2D, CGEBS2D, CGERV2D, $ CGESD2D, PBCTR2A1, PBCTR2B1, PBCTRGET, $ PBCTRST1, PBCVECADD, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible. * IF( N.EQ.0 ) RETURN * CALL BLACS_GRIDINFO( ICONTXT, NPROW, NPCOL, MYROW, MYCOL ) * COLFORM = LSAME( XDIST, 'C' ) ROWFORM = LSAME( XDIST, 'R' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.COLFORM ) .AND. ( .NOT.ROWFORM ) ) THEN INFO = 2 ELSE IF( N .LT.0 ) THEN INFO = 4 ELSE IF( NB .LT.1 ) THEN INFO = 5 ELSE IF( NZ .LT.0 .OR. NZ.GE.NB ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 ELSE IF( IXROW.LT.-1 .OR. IXROW.GE.NPROW .OR. $ ( IXROW.EQ.-1 .AND. COLFORM ) ) THEN INFO = 12 ELSE IF( IXCOL.LT.-1 .OR. IXCOL.GE.NPCOL .OR. $ ( IXCOL.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 13 ELSE IF( IYROW.LT.-1 .OR. IYROW.GE.NPROW .OR. $ ( IYROW.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 14 ELSE IF( IYCOL.LT.-1 .OR. IYCOL.GE.NPCOL .OR. $ ( IYCOL.EQ.-1 .AND. COLFORM ) ) THEN INFO = 15 END IF * 10 CONTINUE IF( INFO.NE.0 ) THEN CALL PXERBLA( ICONTXT, 'PBCTRNV ', INFO ) RETURN END IF * * Start the operations. * * LCM : the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LCMQ = LCM / NPCOL IGD = NPCOL / LCMP NN = N + NZ * * When x is a column vector * IF( COLFORM ) THEN * * Form y <== x' ( x is a column vector ) * * || * || * _____________ || * -----(y)----- <== (x) * || * || * || * IF( IXROW.LT.0 .OR. IXROW.GE.NPROW ) THEN INFO = 12 ELSE IF( IXCOL.LT.-1 .OR. IXCOL.GE.NPCOL ) THEN INFO = 13 ELSE IF( IYROW.LT.-1 .OR. IYROW.GE.NPROW ) THEN INFO = 14 ELSE IF( IYCOL.LT.0 .OR. IYCOL.GE.NPCOL ) THEN INFO = 15 END IF IF( INFO.NE.0 ) GO TO 10 * * MRROW : row relative position in template from IXROW * MRCOL : column relative position in template from IYCOL * MRROW = MOD( NPROW+MYROW-IXROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IYCOL, NPCOL ) JYROW = IYROW IF( IYROW.EQ.-1 ) JYROW = IXROW * NP = NUMROC( NN, NB, MYROW, IXROW, NPROW ) IF( MRROW.EQ.0 ) NP = NP - NZ NQ = NUMROC( NN, NB, MYCOL, IYCOL, NPCOL ) IF( MRCOL.EQ.0 ) NQ = NQ - NZ NQ0 = NUMROC( NUMROC(NN, NB, 0, 0, NPCOL), NB, 0, 0, LCMQ ) * * When a column process of IXCOL has a column block A, * IF( IXCOL .GE. 0 ) THEN TBETA = ZERO IF( MYROW.EQ.JYROW ) TBETA = BETA KZ = NZ * DO 20 I = 0, MIN( LCM, ICEIL(NN,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IXROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IYCOL, NPCOL ) IF( LCMQ.EQ.1 ) NQ0 = NUMROC( NN, NB, I, 0, NPCOL ) JDEX = (I/NPCOL) * NB IF( MRCOL.EQ.0 ) JDEX = MAX(0, JDEX-NZ) * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.IXCOL ) THEN * * The source node is a destination node * IDEX = (I/NPROW) * NB IF( MRROW.EQ.0 ) IDEX = MAX( 0, IDEX-NZ ) IF( MYROW.EQ.JYROW .AND. MYCOL.EQ.MCCOL ) THEN CALL PBCTR2B1( ICONTXT, TRANS, NP-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, TBETA, $ Y(JDEX*INCY+1), INCY, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBCTR2B1( ICONTXT, TRANS, NP-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMP, 1 ) CALL CGESD2D( ICONTXT, 1, NQ0-KZ, WORK, 1, $ JYROW, MCCOL ) END IF * * A destination node receives the copied vector * ELSE IF( MYROW.EQ.JYROW .AND. MYCOL.EQ.MCCOL ) THEN IF( LCMQ.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL CGERV2D( ICONTXT, 1, NQ0-KZ, Y, INCY, $ MCROW, IXCOL ) ELSE CALL CGERV2D( ICONTXT, 1, NQ0-KZ, WORK, 1, $ MCROW, IXCOL ) CALL PBCTR2A1( ICONTXT, NQ-JDEX, NB, KZ, WORK, 1, TBETA, $ Y(JDEX*INCY+1), INCY, LCMQ*NB ) END IF END IF KZ = 0 20 CONTINUE * * Broadcast a row block of WORK in each column of template * IF( IYROW.EQ.-1 ) THEN IF( MYROW.EQ.JYROW ) THEN CALL CGEBS2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY ) ELSE CALL CGEBR2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY, $ JYROW, MYCOL ) END IF END IF * * When all column procesors have a copy of the column block A, * ELSE IF( LCMQ.EQ.1 ) NQ0 = NQ * * Processors, which have diagonal blocks of X, copy them to * WORK array in transposed form * KZ = 0 IF( MRROW.EQ.0 ) KZ = NZ JZ = 0 IF( MRROW.EQ.0 .AND. MYCOL.EQ.IYCOL ) JZ = NZ * DO 30 I = 0, LCMP - 1 IF( MRCOL.EQ.MOD(NPROW*I+MRROW, NPCOL) ) THEN IDEX = MAX( 0, I*NB-KZ ) IF( LCMQ.EQ.1 .AND. (IYROW.EQ.-1.OR.IYROW.EQ.MYROW) ) THEN CALL PBCTR2B1( ICONTXT, TRANS, NP-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, BETA, Y, INCY, $ LCMP, 1 ) ELSE CALL PBCTR2B1( ICONTXT, TRANS, NP-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMP, 1 ) END IF END IF 30 CONTINUE * * Get diagonal blocks of A for each column of the template * MCROW = MOD( MOD(MRCOL, NPROW) + IXROW, NPROW ) IF( LCMQ.GT.1 ) THEN MCCOL = MOD( NPCOL+MYCOL-IYCOL, NPCOL ) CALL PBCTRGET( ICONTXT, 'Row', 1, NQ0, ICEIL( NN, NB ), $ WORK, 1, MCROW, MCCOL, IGD, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * * Broadcast a row block of WORK in every row of template * IF( IYROW.EQ.-1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( LCMQ.GT.1 ) THEN KZ = 0 IF( MYCOL.EQ.IYCOL ) KZ = NZ CALL PBCTRST1( ICONTXT, 'Row', NQ, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NQ0 ) END IF CALL CGEBS2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY ) ELSE CALL CGEBR2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY, $ MCROW, MYCOL ) END IF * * Send a row block of WORK to the destination row * ELSE IF( LCMQ.EQ.1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.IYROW ) $ CALL CGESD2D( ICONTXT, 1, NQ0, WORK, 1, IYROW, MYCOL ) ELSE IF( MYROW.EQ.IYROW ) THEN IF( BETA.EQ.ZERO ) THEN CALL CGERV2D( ICONTXT, 1, NQ0, Y, INCY, MCROW, MYCOL ) ELSE CALL CGERV2D( ICONTXT, 1, NQ0, WORK, 1, MCROW, MYCOL ) CALL PBCVECADD( ICONTXT, 'G', NQ0, ONE, WORK, 1, $ BETA, Y, INCY ) END IF END IF * ELSE NQ1 = NQ0 * MIN( LCMQ, MAX( 0, ICEIL(NN,NB)-MCCOL ) ) IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.IYROW ) $ CALL CGESD2D( ICONTXT, 1, NQ1, WORK, 1, IYROW, MYCOL ) ELSE IF( MYROW.EQ.IYROW ) THEN CALL CGERV2D( ICONTXT, 1, NQ1, WORK, 1, MCROW, MYCOL ) END IF * IF( MYROW.EQ.IYROW ) THEN KZ = 0 IF( MYCOL.EQ.IYCOL ) KZ = NZ CALL PBCTRST1( ICONTXT, 'Row', NQ, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NQ0 ) END IF END IF END IF END IF * * When x is a row vector * ELSE * * Form y <== x' ( x is a row block ) * * || * || * || _____________ * (y) <== -----(x)----- * || * || * || * IF( IXROW.LT.-1 .OR. IXROW.GE.NPROW ) THEN INFO = 12 ELSE IF( IXCOL.LT.0 .OR. IXCOL.GE.NPCOL ) THEN INFO = 13 ELSE IF( IYROW.LT.0 .OR. IYROW.GE.NPROW ) THEN INFO = 14 ELSE IF( IYCOL.LT.-1 .OR. IYCOL.GE.NPCOL ) THEN INFO = 15 END IF IF( INFO.NE.0 ) GO TO 10 * * MRROW : row relative position in template from IYROW * MRCOL : column relative position in template from IXCOL * MRROW = MOD( NPROW+MYROW-IYROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IXCOL, NPCOL ) JYCOL = IYCOL IF( IYCOL.EQ.-1 ) JYCOL = IXCOL * NP = NUMROC( NN, NB, MYROW, IYROW, NPROW ) IF( MRROW.EQ.0 ) NP = NP - NZ NQ = NUMROC( NN, NB, MYCOL, IXCOL, NPCOL ) IF( MRCOL.EQ.0 ) NQ = NQ - NZ NP0 = NUMROC( NUMROC(NN, NB, 0, 0, NPROW), NB, 0, 0, LCMP ) * * When a row process of IXROW has a row block A, * IF( IXROW .GE. 0 ) THEN TBETA = ZERO IF( MYCOL.EQ.JYCOL ) TBETA = BETA KZ = NZ * DO 40 I = 0, MIN( LCM, ICEIL(NN,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IYROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IXCOL, NPCOL ) IF( LCMP.EQ.1 ) NP0 = NUMROC( NN, NB, I, 0, NPROW ) JDEX = (I/NPROW) * NB IF( MRROW.EQ.0 ) JDEX = MAX(0, JDEX-NZ) * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.IXROW .AND. MYCOL.EQ.MCCOL ) THEN * * The source node is a destination node * IDEX = (I/NPCOL) * NB IF( MRCOL.EQ.0 ) IDEX = MAX( 0, IDEX-NZ ) IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JYCOL ) THEN CALL PBCTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, TBETA, $ Y(JDEX*INCY+1), INCY, LCMQ, LCMP ) * * The source node sends blocks to a destination node * ELSE CALL PBCTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMQ, 1 ) CALL CGESD2D( ICONTXT, 1, NP0-KZ, WORK, 1, $ MCROW, JYCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JYCOL ) THEN IF( LCMP.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL CGERV2D( ICONTXT, 1, NP0-KZ, Y, INCY, $ IXROW, MCCOL ) ELSE CALL CGERV2D( ICONTXT, 1, NP0-KZ, WORK, 1, $ IXROW, MCCOL ) CALL PBCTR2A1( ICONTXT, NP-JDEX, NB, KZ, WORK, 1, TBETA, $ Y(JDEX*INCY+1), INCY, LCMP*NB ) END IF END IF KZ = 0 40 CONTINUE * * Broadcast a column vector Y in each row of template * IF( IYCOL.EQ.-1 ) THEN IF( MYCOL.EQ.JYCOL ) THEN CALL CGEBS2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY ) ELSE CALL CGEBR2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY, $ MYROW, JYCOL ) END IF END IF * * When all row procesors have a copy of the row block A, * ELSE IF( LCMP.EQ.1 ) NP0 = NP * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * KZ = 0 IF( MRCOL.EQ.0 ) KZ = NZ JZ = 0 IF( MRCOL.EQ.0 .AND. MYROW.EQ.IYROW ) JZ = NZ * DO 50 I = 0, LCMQ-1 IF( MRROW.EQ.MOD(NPCOL*I+MRCOL, NPROW) ) THEN IDEX = MAX( 0, I*NB-KZ ) IF( LCMP.EQ.1 .AND. (IYCOL.EQ.-1.OR.IYCOL.EQ.MYCOL) ) THEN CALL PBCTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, BETA, Y, INCY, $ LCMQ, 1 ) ELSE CALL PBCTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMQ, 1 ) END IF END IF 50 CONTINUE * * Get diagonal blocks of A for each row of the template * MCCOL = MOD( MOD(MRROW, NPCOL) + IXCOL, NPCOL ) IF( LCMP.GT.1 ) THEN MCROW = MOD( NPROW+MYROW-IYROW, NPROW ) CALL PBCTRGET( ICONTXT, 'Col', 1, NP0, ICEIL( NN, NB ), $ WORK, 1, MCROW, MCCOL, IGD, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * * Broadcast a column block of WORK in every column of template * IF( IYCOL.EQ.-1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( LCMP.GT.1 ) THEN KZ = 0 IF( MYROW.EQ.IYROW ) KZ = NZ CALL PBCTRST1( ICONTXT, 'Col', NP, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NP0 ) END IF CALL CGEBS2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY ) ELSE CALL CGEBR2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY, $ MYROW, MCCOL ) END IF * * Send a column block of WORK to the destination column * ELSE IF( LCMP.EQ.1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.IYCOL ) $ CALL CGESD2D( ICONTXT, 1, NP, WORK, 1, MYROW, IYCOL ) ELSE IF( MYCOL.EQ.IYCOL ) THEN IF( BETA.EQ.ZERO ) THEN CALL CGERV2D( ICONTXT, 1, NP, Y, INCY, MYROW, MCCOL ) ELSE CALL CGERV2D( ICONTXT, 1, NP, WORK, 1, MYROW, MCCOL ) CALL PBCVECADD( ICONTXT, 'G', NP, ONE, WORK, 1, BETA, $ Y, INCY ) END IF END IF * ELSE NP1 = NP0 * MIN( LCMP, MAX( 0, ICEIL(NN,NB)-MCROW ) ) IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.IYCOL ) $ CALL CGESD2D( ICONTXT, 1, NP1, WORK, 1, MYROW, IYCOL ) ELSE IF( MYCOL.EQ.IYCOL ) THEN CALL CGERV2D( ICONTXT, 1, NP1, WORK, 1, MYROW, MCCOL ) END IF * IF( MYCOL.EQ.IYCOL ) THEN KZ = 0 IF( MYROW.EQ.IYROW ) KZ = NZ CALL PBCTRST1( ICONTXT, 'Col', NP, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NP0 ) END IF END IF END IF END IF END IF * RETURN * * End of PBCTRNV * END * *======================================================================= * SUBROUTINE PBCTR2A1 *======================================================================= * SUBROUTINE PBCTR2A1( ICONTXT, N, NB, NZ, X, INCX, BETA, Y, INCY, $ INTV ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. INTEGER ICONTXT, N, NB, NZ, INCX, INCY, INTV COMPLEX BETA * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * .. * * Purpose * ======= * * y <== x * y is a scattered vector, copied from a condensed vector x. * * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. External Subroutines .. EXTERNAL PBCVECADD * .. * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Variables .. INTEGER IX, IY, JZ, K, ITER * IX = 0 IY = 0 JZ = NZ ITER = ICEIL( N+NZ, INTV ) * IF( ITER.GT.1 ) THEN CALL PBCVECADD( ICONTXT, 'G', NB-JZ, ONE, X(IX*INCX+1), INCX, $ BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - JZ IY = IY + INTV - JZ JZ = 0 * DO 10 K = 2, ITER-1 CALL PBCVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), INCX, $ BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + INTV 10 CONTINUE END IF * CALL PBCVECADD( ICONTXT, 'G', MIN( N-IY, NB-JZ ), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), INCY ) * RETURN * * End of PBCTR2A1 * END * *======================================================================= * SUBROUTINE PBCTR2B1 *======================================================================= * SUBROUTINE PBCTR2B1( ICONTXT, TRANS, N, NB, NZ, X, INCX, BETA, Y, $ INCY, JINX, JINY ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER ICONTXT, N, NB, NZ, INCX, INCY, JINX, JINY COMPLEX BETA * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * .. * * Purpose * ======= * * y <== x + beta * y * y is a condensed vector, copied from a scattered vector x * * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. External Subroutines .. EXTERNAL PBCVECADD * .. * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Variables .. INTEGER IX, IY, JZ, K, ITER, LENX, LENY * IF( JINX.EQ.1 .AND. JINY.EQ.1 ) THEN CALL PBCVECADD( ICONTXT, TRANS, N, ONE, X, INCX, BETA, $ Y, INCY ) * ELSE IX = 0 IY = 0 JZ = NZ LENX = NB * JINX LENY = NB * JINY ITER = ICEIL( N+NZ, LENX ) * IF( ITER.GT.1 ) THEN CALL PBCVECADD( ICONTXT, TRANS, NB-JZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + LENX - JZ IY = IY + LENY - JZ JZ = 0 * DO 10 K = 2, ITER-1 CALL PBCVECADD( ICONTXT, TRANS, NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + LENX IY = IY + LENY 10 CONTINUE END IF * CALL PBCVECADD( ICONTXT, TRANS, MIN( N-IX, NB-JZ ), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), INCY ) END IF * RETURN * * End of PBCTR2B1 * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbctrsrt.f000644 000766 000024 00000004515 10363532303 020161 0ustar00juliestaff000000 000000 SUBROUTINE PBCTRSRT( ICONTXT, ADIST, M, N, NB, A, LDA, BETA, B, $ LDB, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, LCMP, LCMQ, LDA, LDB, M, N, NB, NINT COMPLEX BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBCTRSRT forms T <== A + beta * T, where T is a sorted * condensed block row (or column) from a block column (or row) of A * with sorting index ISRT * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Variables .. INTEGER JA, JB, K, KK, NJUMP * .. * .. External Subroutines .. EXTERNAL PBCMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( LSAME( ADIST, 'R' ) ) THEN NJUMP = NB * LCMQ DO 20 K = 0, LCMQ-1 JA = NINT * MOD( K*LCMP, LCMQ ) + 1 JB = K * NB + 1 * DO 10 KK = 1, ICEIL( NINT, NB ) IF( N.LT.JB ) GO TO 20 CALL PBCMATADD( ICONTXT, 'G', M, MIN( N-JB+1, NB ), ONE, $ A(1, JA), LDA, BETA, B(1, JB), LDB ) JA = JA + NB JB = JB + NJUMP 10 CONTINUE 20 CONTINUE * * if( LSAME( ADIST, 'C') ) then * ELSE NJUMP = NB * LCMP DO 40 K = 0, LCMP-1 JA = 1 JB = K * NB + 1 * DO 30 KK = 1, ICEIL( NINT, NB ) IF( M.LT.JB ) GO TO 40 CALL PBCMATADD( ICONTXT, 'G', MIN( M-JB+1, NB ), N, ONE, $ A(JA, N*MOD(K*LCMQ,LCMP)+1), LDA, BETA, $ B(JB, 1), LDB ) JA = JA + NB JB = JB + NJUMP 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of PBCTRSRT * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbctrst1.f000644 000766 000024 00000006466 10363532303 020067 0ustar00juliestaff000000 000000 SUBROUTINE PBCTRST1( ICONTXT, XDIST, N, NB, NZ, X, INCX, BETA, Y, $ INCY, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 XDIST INTEGER ICONTXT, INCX, INCY, LCMP, LCMQ, N, NB, NINT, $ NZ COMPLEX BETA * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * .. * * Purpose * ======= * * PBCTRST1 forms y <== x + beta * y, where y is a sorted * condensed row (or column) vector from a column (or row) vector of x. * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Variables .. INTEGER ITER, IX, IY, K, KK, KZ, NJUMP * .. * .. External Subroutines .. EXTERNAL PBCVECADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MAX, MOD * .. * .. Executable Statements .. * ITER = ICEIL( NINT, NB ) KZ = NZ * IF( LSAME( XDIST, 'R' ) ) THEN NJUMP = NB * LCMQ * DO 20 KK = 0, LCMQ-1 IX = NINT * MOD( KK*LCMP, LCMQ ) IY = MAX( 0, NB*KK-NZ ) IF( N.LT.IY ) GO TO 50 * IF( ITER.GT.1 ) THEN CALL PBCVECADD( ICONTXT, 'G', NB-KZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - KZ IY = IY + NJUMP - KZ KZ = 0 * DO 10 K = 2, ITER-1 CALL PBCVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + NJUMP 10 CONTINUE END IF * CALL PBCVECADD( ICONTXT, 'G', MIN(NB-KZ,N-IY), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), $ INCY ) KZ = 0 20 CONTINUE * * if( LSAME( XDIST, 'C' ) ) then * ELSE NJUMP = NB * LCMP * DO 40 KK = 0, LCMP-1 IX = NINT * MOD( KK*LCMQ, LCMP ) IY = MAX( 0, NB*KK-NZ ) IF( N.LT.IY ) GO TO 50 * IF( ITER.GT.1 ) THEN CALL PBCVECADD( ICONTXT, 'G', NB-KZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - KZ IY = IY + NJUMP - KZ KZ = 0 * DO 30 K = 2, ITER-1 CALL PBCVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + NJUMP 30 CONTINUE END IF * CALL PBCVECADD( ICONTXT, 'G', MIN(NB-KZ,N-IY), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), $ INCY ) KZ = 0 40 CONTINUE END IF * 50 CONTINUE * RETURN * * End of PBCTRST1 * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbcvecadd.f000644 000766 000024 00000023665 10363532303 020240 0ustar00juliestaff000000 000000 SUBROUTINE PBCVECADD( ICONTXT, MODE, N, ALPHA, X, INCX, BETA, Y, $ INCY ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 MODE INTEGER ICONTXT, INCX, INCY, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX X( * ), Y( * ) * * .. * * Purpose * ======= * * PBCVECADD performs a vector X to be added to Y * Y := alpha*op(X) + beta*Y, * where alpha and beta are scalars, and X and Y are n vectors, * and op(X) = X**H if MODE = 'C', * * Arguments * ========= * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * MODE (input) CHARACTER*1 * Specifies the transposed, or conjugate transposed vector X * to be added to the vector Y * = 'C': Conjugate vector X is added for complex data set. * Y = alpha * X**H + beta * Y * ELSE : Vector X is added. Y = alpha*X + beta*Y * if MODE = 'V', BLAS routine may be used. * * N (input) INTEGER * The number of elements of the vectors X and Y to be added. * N >= 0. * * ALPHA (input) COMPLEX * ALPHA specifies the scalar alpha. * * X (input) COMPLEX array of DIMENSION at least * ( 1 + ( N - 1 )*abs( INCX ) ) * The incremented array X must contain the vector X. * * INCX (input) INTEGER * INCX specifies the increment for the elements of X. * INCX <> 0. * * BETA (input) COMPLEX * BETA specifies the scalar beta. * * Y (input/output) COMPLEX array of DIMENSION at least * ( 1 + ( N - 1 )*abs( INCY ) ) * On entry with BETA non-zero, the incremented array Y must * contain the vector Y. * On exit, Y is overwritten by the updated vector Y. * * INCY - (input) INTEGER * INCY specifies the increment for the elements of Y. * INCY <> 0. * * ===================================================================== * * .. * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IX, IY * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL CSCAL, CCOPY, CAXPY * .. * .. Intrinsic Functions .. INTRINSIC CONJG * .. * .. Executable Statements .. * IF( N.LE.0 .OR. ( ALPHA.EQ.ZERO .AND. BETA.EQ.ONE ) ) RETURN * IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN IF( INCY.EQ.1 ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE IY = 1 DO 20 I = 1, N Y( IY ) = ZERO IY = IY + INCY 20 CONTINUE END IF * ELSE IF( LSAME( MODE, 'V' ) ) THEN CALL CSCAL( N, BETA, Y, INCY ) ELSE IF( INCY.EQ.1 ) THEN DO 30 I = 1, N Y( I ) = BETA * Y( I ) 30 CONTINUE ELSE IY = 1 DO 40 I = 1, N Y( IY ) = BETA * Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF * ELSE IF( .NOT.LSAME( MODE, 'C' ) ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( LSAME( MODE, 'V' ) ) THEN CALL CCOPY( N, X, INCX, Y, INCY ) ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 50 I = 1, N Y( I ) = X( I ) 50 CONTINUE ELSE IX = 1 IY = 1 DO 60 I = 1, N Y( IY ) = X( IX ) IX = IX + INCX IY = IY + INCY 60 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 70 I = 1, N Y( I ) = X( I ) + Y( I ) 70 CONTINUE ELSE IX = 1 IY = 1 DO 80 I = 1, N Y( IY ) = X( IX ) + Y( IY ) IX = IX + INCX IY = IY + INCY 80 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 90 I = 1, N Y( I ) = X( I ) + BETA * Y( I ) 90 CONTINUE ELSE IX = 1 IY = 1 DO 100 I = 1, N Y( IY ) = X( IX ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 100 CONTINUE END IF END IF * ELSE IF( BETA.EQ.ZERO ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 110 I = 1, N Y( I ) = ALPHA * X( I ) 110 CONTINUE ELSE IX = 1 IY = 1 DO 120 I = 1, N Y( IY ) = X( IX ) IX = IX + INCX IY = IY + INCY 120 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( LSAME( MODE, 'V' ) ) THEN CALL CAXPY( N, ALPHA, X, INCX, Y, INCY ) ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 130 I = 1, N Y( I ) = ALPHA * X( I ) + Y( I ) 130 CONTINUE ELSE IX = 1 IY = 1 DO 140 I = 1, N Y( IY ) = ALPHA * X( IX ) + Y( IY ) IX = IX + INCX IY = IY + INCY 140 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 150 I = 1, N Y( I ) = ALPHA * X( I ) + BETA * Y( I ) 150 CONTINUE ELSE IX = 1 IY = 1 DO 160 I = 1, N Y( IY ) = ALPHA * X( IX ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 160 CONTINUE END IF END IF END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 170 I = 1, N Y( I ) = CONJG( X( I ) ) 170 CONTINUE ELSE IX = 1 IY = 1 DO 180 I = 1, N Y( IY ) = CONJG( X( IX ) ) IX = IX + INCX IY = IY + INCY 180 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 190 I = 1, N Y( I ) = CONJG( X( I ) ) + Y( I ) 190 CONTINUE ELSE IX = 1 IY = 1 DO 200 I = 1, N Y( IY ) = CONJG( X( IX ) ) + Y( IY ) IX = IX + INCX IY = IY + INCY 200 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 210 I = 1, N Y( I ) = CONJG( X( I ) ) + BETA * Y( I ) 210 CONTINUE ELSE IX = 1 IY = 1 DO 220 I = 1, N Y( IY ) = CONJG( X( IX ) ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 220 CONTINUE END IF END IF * ELSE IF( BETA.EQ.ZERO ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 230 I = 1, N Y( I ) = ALPHA * CONJG( X( I ) ) 230 CONTINUE ELSE IX = 1 IY = 1 DO 240 I = 1, N Y( IY ) = ALPHA * CONJG( X( IX ) ) IX = IX + INCX IY = IY + INCY 240 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 250 I = 1, N Y( I ) = ALPHA * CONJG( X( I ) ) + Y( I ) 250 CONTINUE ELSE IX = 1 IY = 1 DO 260 I = 1, N Y( IY ) = ALPHA * CONJG( X( IX ) ) + Y( IY ) IX = IX + INCX IY = IY + INCY 260 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 270 I = 1, N Y( I ) = ALPHA * CONJG( X( I ) ) + BETA * Y( I ) 270 CONTINUE ELSE IX = 1 IY = 1 DO 280 I = 1, N Y( IY ) = ALPHA * CONJG( X(IX) ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 280 CONTINUE END IF END IF END IF END IF * RETURN * * End of PBCVECADD * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbdmatadd.f000644 000766 000024 00000030333 10363532303 020233 0ustar00juliestaff000000 000000 SUBROUTINE PBDMATADD( ICONTXT, MODE, M, N, ALPHA, A, LDA, BETA, B, $ LDB ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 MODE INTEGER ICONTXT, LDA, LDB, M, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBDMATADD performs the matrix add operation B := alpha*A + beta*B, * where alpha and beta are scalars, and A and B are m-by-n * upper/lower trapezoidal matrices, or rectangular matrices. * * Arguments * ========= * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * MODE (input) CHARACTER*1 * Specifies the part of the matrix A, or (conjugate) transposed * matrix A to be added to the matrix B, * = 'U': Upper triangular part * up(B) = alpha*up(A) + beta*up(B) * = 'L': Lower triangular part * lo(B) = alpha*lo(A) + beta*lo(B) * = 'T': Transposed matrix A * B = alpha*A**T + beta*B * = 'C': Conjugate transposed matrix A * B = alpha*A**H + beta*B * Otherwise: B = alpha*A + beta*B * if M = LDA = LDB: use one BLAS loop * if MODE = 'V' : columnwise copy using BLAS if possible * else : use double loops * * M (input) INTEGER * M specifies the number of columns of the matrix A if * MODE != 'T'/'C', and it specifies the number of rows of the * matrix A otherwise. It also specifies the number of rows of * the matrix B. M >= 0. * * N (input) INTEGER * N specifies the number of rows of the matrix A if * MODE != 'T'/'C', and it specifies the number of columns of * the matrix A otherwise. It also specifies the number of * columns of the matrix B. N >= 0. * * ALPHA (input) DOUBLE PRECISION * ALPHA specifies the scalar alpha. * * A (input) DOUBLE PRECISION array, dimension (LDA,N) * The m by n matrix A if MODE != 'T'/'C'. * If MODE = 'U', only the upper triangle or trapezoid is * accessed; if MODE = 'L', only the lower triangle or * trapezoid is accessed. Otherwise all m-by-n data matrix * is accessed. * And the n by m matrix A if MODE = 'T'/'C'. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M) if * MODE != 'T'/'C'. And LDA >= max(1,N) if MODE = 'T'/'C'. * * BETA (input) DOUBLE PRECISION * BETA specifies the scalar beta. * * B (input) DOUBLE PRECISION array, dimension (LDB,N) * On exit, B = alpha*A + beta*B * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,M). * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DSCAL, DCOPY, DAXPY * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( M.LE.0 .OR. N.LE.0 .OR. ( ALPHA.EQ.ZERO.AND.BETA.EQ.ONE ) ) $ RETURN * * A is upper triangular or upper trapezoidal, * IF( LSAME( MODE, 'U' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, MIN( J, M ) B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, MIN( J, M ) B( I, J ) = BETA * B( I, J ) 30 CONTINUE 40 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 60 J = 1, N DO 50 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) 50 CONTINUE 60 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 80 J = 1, N DO 70 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) + B( I, J ) 70 CONTINUE 80 CONTINUE ELSE DO 100 J = 1, N DO 90 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 120 J = 1, N DO 110 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) 110 CONTINUE 120 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 140 J = 1, N DO 130 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 130 CONTINUE 140 CONTINUE ELSE DO 160 J = 1, N DO 150 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 150 CONTINUE 160 CONTINUE END IF END IF * * A is lower triangular or upper trapezoidal, * ELSE IF( LSAME( MODE, 'L' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, N DO 170 I = J, M B( I, J ) = ZERO 170 CONTINUE 180 CONTINUE ELSE DO 200 J = 1, N DO 190 I = J, M B( I, J ) = BETA * B( I, J ) 190 CONTINUE 200 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 220 J = 1, N DO 210 I = J, M B( I, J ) = A( I, J ) 210 CONTINUE 220 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 240 J = 1, N DO 230 I = J, M B( I, J ) = A( I, J ) + B( I, J ) 230 CONTINUE 240 CONTINUE ELSE DO 260 J = 1, N DO 250 I = J, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 280 J = 1, N DO 270 I = J, M B( I, J ) = ALPHA * A( I, J ) 270 CONTINUE 280 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 300 J = 1, N DO 290 I = J, M B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 290 CONTINUE 300 CONTINUE ELSE DO 320 J = 1, N DO 310 I = J, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 310 CONTINUE 320 CONTINUE END IF END IF * * If MODE = 'Transpose'/'Conjugate' * ELSE IF( LSAME( MODE, 'T' ) .OR. LSAME( MODE, 'C' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 340 J = 1, N DO 330 I = 1, M B( I, J ) = ZERO 330 CONTINUE 340 CONTINUE ELSE DO 360 J = 1, N DO 350 I = 1, M B( I, J ) = BETA * B( I, J ) 350 CONTINUE 360 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 380 J = 1, N DO 370 I = 1, M B( I, J ) = A( J, I ) 370 CONTINUE 380 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 400 J = 1, N DO 390 I = 1, M B( I, J ) = A( J, I ) + B( I, J ) 390 CONTINUE 400 CONTINUE ELSE DO 420 J = 1, N DO 410 I = 1, M B( I, J ) = A( J, I ) + BETA * B( I, J ) 410 CONTINUE 420 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 440 J = 1, N DO 430 I = 1, M B( I, J ) = ALPHA * A( J, I ) 430 CONTINUE 440 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 460 J = 1, N DO 450 I = 1, M B( I, J ) = ALPHA * A( J, I ) + B( I, J ) 450 CONTINUE 460 CONTINUE ELSE DO 480 J = 1, N DO 470 I = 1, M B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 470 CONTINUE 480 CONTINUE END IF END IF * * Other cases (for genral matrix additions) * ELSE IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 500 J = 1, N DO 490 I = 1, M B( I, J ) = ZERO 490 CONTINUE 500 CONTINUE * ELSE IF( M.EQ.LDB ) THEN CALL DSCAL( M*N, BETA, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 510 J = 1, N CALL DSCAL( M, BETA, B( 1, J ), 1 ) 510 CONTINUE ELSE DO 530 J = 1, N DO 520 I = 1, M B( I, J ) = BETA * B( I, J ) 520 CONTINUE 530 CONTINUE END IF END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( M.EQ.LDA .AND. M.EQ.LDB ) THEN CALL DCOPY( M*N, A( 1, 1 ), 1, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 540 J = 1, N CALL DCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) 540 CONTINUE ELSE DO 560 J = 1, N DO 550 I = 1, M B( I, J ) = A( I, J ) 550 CONTINUE 560 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN DO 580 J = 1, N DO 570 I = 1, M B( I, J ) = A( I, J ) + B( I, J ) 570 CONTINUE 580 CONTINUE * ELSE DO 600 J = 1, N DO 590 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 590 CONTINUE 600 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 620 J = 1, N DO 610 I = 1, M B( I, J ) = ALPHA * A( I, J ) 610 CONTINUE 620 CONTINUE * ELSE IF( BETA.EQ.ONE ) THEN IF( M.EQ.LDA .AND. M.EQ.LDB ) THEN CALL DAXPY( M*N, ALPHA, A( 1, 1 ), 1, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 630 J = 1, N CALL DAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) 630 CONTINUE ELSE DO 650 J = 1, N DO 640 I = 1, M B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 640 CONTINUE 650 CONTINUE END IF * ELSE DO 670 J = 1, N DO 660 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 660 CONTINUE 670 CONTINUE END IF END IF END IF * RETURN * * End of PBDMATADD * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbdtran.f000644 000766 000024 00000071130 10363532303 017745 0ustar00juliestaff000000 000000 SUBROUTINE PBDTRAN( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, BETA, $ C, LDC, IAROW, IACOL, ICROW, ICCOL, WORK ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * Jaeyoung Choi, Oak Ridge National Laboratory * Jack Dongarra, University of Tennessee and Oak Ridge National Lab. * David Walker, Oak Ridge National Laboratory * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER IACOL, IAROW, ICCOL, ICONTXT, ICROW, LDA, LDC, $ M, N, NB DOUBLE PRECISION BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), C( LDC, * ), WORK( * ) * .. * * Purpose * ======= * * PBDTRAN transposes a column block to row block, or a row block to * column block by reallocating data distribution. * * C := A^T + beta*C, or C := A^C + beta*C * * where A is an M-by-N matrix and C is an N-by-M matrix, and the size * of M or N is limited to its block size NB. * * The first elements of the matrices A, and C should be located at * the beginnings of their first blocks. (not the middle of the blocks.) * * Parameters * ========== * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * ADIST - (input) CHARACTER*1 * ADIST specifies whether A is a column block or a row block. * * ADIST = 'C', A is a column block * ADIST = 'R', A is a row block * * TRANS - (input) CHARACTER*1 * TRANS specifies whether the transposed format is transpose * or conjugate transpose. If the matrices A and C are real, * the argument is ignored. * * TRANS = 'T', transpose * TRANS = 'C', conjugate transpose * * M - (input) INTEGER * M specifies the (global) number of rows of the matrix (block * column or block row) A and of columns of the matrix C. * M >= 0. * * N - (input) INTEGER * N specifies the (global) number of columns of the matrix * (block column or block row) A and of columns of the matrix * C. N >= 0. * * NB - (input) INTEGER * NB specifies the column block size of the matrix A and the * row block size of the matrix C when ADIST = 'C'. Otherwise, * it specifies the row block size of the matrix A and the * column block size of the matrix C. NB >= 1. * * A (input) DOUBLE PRECISION array of DIMENSION ( LDA, Lx ), * where Lx is N when ADIST = 'C', or Nq when ADIST = 'R'. * Before entry with ADIST = 'C', the leading Mp by N part of * the array A must contain the matrix A, otherwise the leading * M by Nq part of the array A must contain the matrix A. See * parameter details for the values of Mp and Nq. * * LDA (input) INTEGER * LDA specifies the leading dimension of (local) A as declared * in the calling (sub) program. LDA >= MAX(1,Mp) when * ADIST = 'C', or LDA >= MAX(1,M) otherwise. * * BETA (input) DOUBLE PRECISION * BETA specifies scaler beta. * * C (input/output) DOUBLE PRECISION array of DIMENSION * ( LDC, Lx ), * where Lx is Mq when ADIST = 'C', or N when ADIST = 'R'. * If ADIST = 'C', the leading N-by-Mq part of the array C * contains the (local) matrix C, otherwise the leading * Np-by-M part of the array C must contain the (local) matrix * C. C will not be referenced if beta is zero. * * LDC (input) INTEGER * LDC specifies the leading dimension of (local) C as declared * in the calling (sub) program. LDC >= MAX(1,N) when ADIST='C', * or LDC >= MAX(1,Np) otherwise. * * IAROW (input) INTEGER * IAROW specifies a row of the process template, * which holds the first block of the matrix A. If A is a row * of blocks (ADIST = 'R') and all rows of processes have a copy * of A, then set IAROW = -1. * * IACOL (input) INTEGER * IACOL specifies a column of the process template, * which holds the first block of the matrix A. If A is a * column of blocks (ADIST = 'C') and all columns of processes * have a copy of A, then set IACOL = -1. * * ICROW (input) INTEGER * ICROW specifies the current row process which holds * the first block of the matrix C, which is transposed of A. * If C is a row of blocks (ADIST = 'C') and the transposed * row block C is distributed all rows of processes, set * ICROW = -1. * * ICCOL (input) INTEGER * ICCOL specifies the current column process which holds * the first block of the matrix C, which is transposed of A. * If C is a column of blocks (ADIST = 'R') and the transposed * column block C is distributed all columns of processes, * set ICCOL = -1. * * WORK (workspace) DOUBLE PRECISION array of dimension Size(WORK). * It needs extra working space of A'. * * Parameters Details * ================== * * Lx It is a local portion of L owned by a process, (L is * replaced by M, or N, and x is replaced by either p (=NPROW) * or q (=NPCOL)). The value is determined by L, LB, x, and * MI, where LB is a block size and MI is a row or column * position in a process template. Lx is equal to or less * than Lx0 = CEIL( L, LB*x ) * LB. * * Communication Scheme * ==================== * * The communication scheme of the routine is set to '1-tree', which is * fan-out. (For details, see BLACS user's guide.) * * Memory Requirement of WORK * ========================== * * Mqb = CEIL( M, NB*NPCOL ) * Npb = CEIL( N, NB*NPROW ) * LCMQ = LCM / NPCOL * LCMP = LCM / NPROW * * (1) ADIST = 'C' * (a) IACOL != -1 * Size(WORK) = N * CEIL(Mqb,LCMQ)*NB * (b) IACOL = -1 * Size(WORK) = N * CEIL(Mqb,LCMQ)*NB * MIN(LCMQ,CEIL(M,NB)) * * (2) ADIST = 'R' * (a) IAROW != -1 * Size(WORK) = M * CEIL(Npb,LCMP)*NB * (b) IAROW = -1 * Size(WORK) = M * CEIL(Npb,LCMP)*NB * MIN(LCMP,CEIL(N,NB)) * * Notes * ----- * More precise space can be computed as * * CEIL(Mqb,LCMQ)*NB => NUMROC( NUMROC(M,NB,0,0,NPCOL), NB, 0, 0, LCMQ ) * CEIL(Npb,LCMP)*NB => NUMROC( NUMROC(N,NB,0,0,NPROW), NB, 0, 0, LCMP ) * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLFORM, ROWFORM INTEGER I, IDEX, IGD, INFO, JCCOL, JCROW, JDEX, LCM, $ LCMP, LCMQ, MCCOL, MCROW, ML, MP, MQ, MQ0, $ MRCOL, MRROW, MYCOL, MYROW, NP, NP0, NPCOL, $ NPROW, NQ DOUBLE PRECISION TBETA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, ICEIL, NUMROC EXTERNAL ILCM, ICEIL, LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, DGERV2D, $ DGESD2D, PBDMATADD, PBDTR2AF, PBDTR2AT, $ PBDTR2BT, PBDTRGET, PBDTRSRT, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible. * IF( M.EQ.0 .OR. N.EQ.0 ) RETURN * CALL BLACS_GRIDINFO( ICONTXT, NPROW, NPCOL, MYROW, MYCOL ) * COLFORM = LSAME( ADIST, 'C' ) ROWFORM = LSAME( ADIST, 'R' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.COLFORM ) .AND. ( .NOT.ROWFORM ) ) THEN INFO = 2 ELSE IF( M .LT.0 ) THEN INFO = 4 ELSE IF( N .LT.0 ) THEN INFO = 5 ELSE IF( NB.LT.1 ) THEN INFO = 6 ELSE IF( IAROW.LT.-1 .OR. IAROW.GE.NPROW .OR. $ ( IAROW.EQ.-1 .AND. COLFORM ) ) THEN INFO = 12 ELSE IF( IACOL.LT.-1 .OR. IACOL.GE.NPCOL .OR. $ ( IACOL.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 13 ELSE IF( ICROW.LT.-1 .OR. ICROW.GE.NPROW .OR. $ ( ICROW.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 14 ELSE IF( ICCOL.LT.-1 .OR. ICCOL.GE.NPCOL .OR. $ ( ICCOL.EQ.-1 .AND. COLFORM ) ) THEN INFO = 15 END IF * 10 CONTINUE IF( INFO .NE. 0 ) THEN CALL PXERBLA( ICONTXT, 'PBDTRAN ', INFO ) RETURN END IF * * Start the operations. * * LCM : the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LCMQ = LCM / NPCOL IGD = NPCOL / LCMP * * When A is a column block * IF( COLFORM ) THEN * * Form C <== A' ( A is a column block ) * _ * | | * | | * _____________ | | * |______C______| <== |A| * | | * | | * |_| * * MRROW : row relative position in template from IAROW * MRCOL : column relative position in template from ICCOL * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-ICCOL, NPCOL ) JCROW = ICROW IF( ICROW.EQ.-1 ) JCROW = IAROW * MP = NUMROC( M, NB, MYROW, IAROW, NPROW ) MQ = NUMROC( M, NB, MYCOL, ICCOL, NPCOL ) MQ0 = NUMROC( NUMROC(M, NB, 0, 0, NPCOL), NB, 0, 0, LCMQ ) * IF( LDA.LT.MP .AND. $ ( IACOL.EQ.MYCOL .OR. IACOL.EQ.-1 ) ) THEN INFO = 8 ELSE IF( LDC.LT.N .AND. $ ( ICROW.EQ.MYROW .OR. ICROW.EQ.-1 ) ) THEN INFO = 11 END IF IF( INFO.NE.0 ) GO TO 10 * * When a column process of IACOL has a column block A, * IF( IACOL.GE.0 ) THEN TBETA = ZERO IF( MYROW.EQ.JCROW ) TBETA = BETA * DO 20 I = 0, MIN( LCM, ICEIL(M,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IAROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + ICCOL, NPCOL ) IF( LCMQ.EQ.1 ) MQ0 = NUMROC( M, NB, I, 0, NPCOL ) JDEX = (I/NPCOL) * NB * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.IACOL ) THEN * * The source node is a destination node * IDEX = (I/NPROW) * NB IF( MYROW.EQ.JCROW .AND. MYCOL.EQ.MCCOL ) THEN CALL PBDTR2AT( ICONTXT, 'Col', TRANS, MP-IDEX, N, NB, $ A(IDEX+1,1), LDA, TBETA, C(1,JDEX+1), $ LDC, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBDTR2BT( ICONTXT, 'Col', TRANS, MP-IDEX, N, NB, $ A(IDEX+1,1), LDA, ZERO, WORK, N, $ LCMP*NB ) CALL DGESD2D( ICONTXT, N, MQ0, WORK, N, JCROW, MCCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.JCROW .AND. MYCOL.EQ.MCCOL ) THEN IF( LCMQ.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL DGERV2D( ICONTXT, N, MQ0, C, LDC, MCROW, IACOL ) ELSE CALL DGERV2D( ICONTXT, N, MQ0, WORK, N, MCROW, IACOL ) CALL PBDTR2AF( ICONTXT, 'Row', N, MQ-JDEX, NB, WORK, N, $ TBETA, C(1,JDEX+1), LDC, LCMP, LCMQ, $ MQ0 ) END IF END IF 20 CONTINUE * * Broadcast a row block of C in each column of template * IF( ICROW.EQ.-1 ) THEN IF( MYROW.EQ.JCROW ) THEN CALL DGEBS2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC ) ELSE CALL DGEBR2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC, $ JCROW, MYCOL ) END IF END IF * * When all column procesors have a copy of the column block A, * ELSE IF( LCMQ.EQ.1 ) MQ0 = MQ * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * DO 30 I = 0, LCMP-1 IF( MRCOL.EQ.MOD( NPROW*I+MRROW, NPCOL ) ) THEN IF( LCMQ.EQ.1.AND.(ICROW.EQ.-1.OR.ICROW.EQ.MYROW) ) THEN CALL PBDTR2BT( ICONTXT, 'Col', TRANS, MP-I*NB, N, NB, $ A(I*NB+1,1), LDA, BETA, C, LDC, $ LCMP*NB ) ELSE CALL PBDTR2BT( ICONTXT, 'Col', TRANS, MP-I*NB, N, NB, $ A(I*NB+1,1), LDA, ZERO, WORK, N, $ LCMP*NB ) END IF END IF 30 CONTINUE * * Get diagonal blocks of A for each column of the template * MCROW = MOD( MOD(MRCOL,NPROW)+IAROW, NPROW ) IF( LCMQ.GT.1 ) THEN MCCOL = MOD( NPCOL+MYCOL-ICCOL, NPCOL ) CALL PBDTRGET( ICONTXT, 'Row', N, MQ0, ICEIL(M,NB), WORK, N, $ MCROW, MCCOL, IGD, MYROW, MYCOL, NPROW, $ NPCOL ) END IF * * Broadcast a row block of WORK in every row of template * IF( ICROW.EQ.-1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( LCMQ.GT.1 ) $ CALL PBDTRSRT( ICONTXT, 'Row', N, MQ, NB, WORK, N, BETA, $ C, LDC, LCMP, LCMQ, MQ0 ) CALL DGEBS2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC ) ELSE CALL DGEBR2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC, $ MCROW, MYCOL ) END IF * * Send a row block of WORK to the destination row * ELSE IF( LCMQ.EQ.1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.ICROW ) $ CALL DGESD2D( ICONTXT, N, MQ, WORK, N, ICROW, MYCOL ) ELSE IF( MYROW.EQ.ICROW ) THEN IF( BETA.EQ.ZERO ) THEN CALL DGERV2D( ICONTXT, N, MQ, C, LDC, MCROW, MYCOL ) ELSE CALL DGERV2D( ICONTXT, N, MQ, WORK, N, MCROW, MYCOL ) CALL PBDMATADD( ICONTXT, 'G', N, MQ, ONE, WORK, N, $ BETA, C, LDC ) END IF END IF * ELSE ML = MQ0 * MIN( LCMQ, MAX(0,ICEIL(M,NB)-MCCOL) ) IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.ICROW ) $ CALL DGESD2D( ICONTXT, N, ML, WORK, N, ICROW, MYCOL ) ELSE IF( MYROW.EQ.ICROW ) THEN CALL DGERV2D( ICONTXT, N, ML, WORK, N, MCROW, MYCOL ) END IF * IF( MYROW.EQ.ICROW ) $ CALL PBDTRSRT( ICONTXT, 'Row', N, MQ, NB, WORK, N, BETA, $ C, LDC, LCMP, LCMQ, MQ0 ) END IF END IF * END IF * * When A is a row block * ELSE * * Form C <== A' ( A is a row block ) * _ * | | * | | * | | _____________ * |C| <== |______A______| * | | * | | * |_| * * MRROW : row relative position in template from ICROW * MRCOL : column relative position in template from IACOL * MRROW = MOD( NPROW+MYROW-ICROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) JCCOL = ICCOL IF( ICCOL.EQ.-1 ) JCCOL = IACOL * NP = NUMROC( N, NB, MYROW, ICROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) NP0 = NUMROC( NUMROC(N, NB, 0, 0, NPROW), NB, 0, 0, LCMP ) * IF( LDA.LT.M .AND. $ ( IAROW.EQ.MYROW .OR. IAROW.EQ.-1 ) ) THEN INFO = 8 ELSE IF( LDC.LT.NP .AND. $ ( ICCOL.EQ.MYCOL .OR. ICCOL.EQ.-1 ) ) THEN INFO = 11 END IF IF( INFO.NE.0 ) GO TO 10 * * When a row process of IAROW has a row block A, * IF( IAROW.GE.0 ) THEN TBETA = ZERO IF( MYCOL.EQ.JCCOL ) TBETA = BETA * DO 40 I = 0, MIN( LCM, ICEIL(N,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + ICROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IACOL, NPCOL ) IF( LCMP.EQ.1 ) NP0 = NUMROC( N, NB, I, 0, NPROW ) IDEX = (I/NPROW) * NB * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.MCCOL ) THEN * * The source node is a destination node * JDEX = (I/NPCOL) * NB IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JCCOL ) THEN CALL PBDTR2AT( ICONTXT, 'Row', TRANS, M, NQ-JDEX, NB, $ A(1,JDEX+1), LDA, TBETA, C(IDEX+1,1), $ LDC, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBDTR2BT( ICONTXT, 'Row', TRANS, M, NQ-JDEX, NB, $ A(1,JDEX+1), LDA, ZERO, WORK, NP0, $ LCMQ*NB ) CALL DGESD2D( ICONTXT, NP0, M, WORK, NP0, $ MCROW, JCCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JCCOL ) THEN IF( LCMP.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL DGERV2D( ICONTXT, NP0, M, C, LDC, IAROW, MCCOL ) ELSE CALL DGERV2D( ICONTXT, NP0, M, WORK, NP0, IAROW, MCCOL ) CALL PBDTR2AF( ICONTXT, 'Col', NP-IDEX, M, NB, WORK, $ NP0, TBETA, C(IDEX+1,1), LDC, LCMP, LCMQ, $ NP0 ) END IF END IF 40 CONTINUE * * Broadcast a column block of WORK in each row of template * IF( ICCOL.EQ.-1 ) THEN IF( MYCOL.EQ.JCCOL ) THEN CALL DGEBS2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC ) ELSE CALL DGEBR2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC, $ MYROW, JCCOL ) END IF END IF * * When all row procesors have a copy of the row block A, * ELSE IF( LCMP.EQ.1 ) NP0 = NP * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * DO 50 I = 0, LCMQ-1 IF( MRROW.EQ.MOD(NPCOL*I+MRCOL, NPROW) ) THEN IF( LCMP.EQ.1.AND.(ICCOL.EQ.-1.OR.ICCOL.EQ.MYCOL) ) THEN CALL PBDTR2BT( ICONTXT, 'Row', TRANS, M, NQ-I*NB, NB, $ A(1,I*NB+1), LDA, BETA, C, LDC, $ LCMQ*NB ) ELSE CALL PBDTR2BT( ICONTXT, 'Row', TRANS, M, NQ-I*NB, NB, $ A(1,I*NB+1), LDA, ZERO, WORK, NP0, $ LCMQ*NB ) END IF END IF 50 CONTINUE * * Get diagonal blocks of A for each row of the template * MCCOL = MOD( MOD(MRROW, NPCOL)+IACOL, NPCOL ) IF( LCMP.GT.1 ) THEN MCROW = MOD( NPROW+MYROW-ICROW, NPROW ) CALL PBDTRGET( ICONTXT, 'Col', NP0, M, ICEIL(N,NB), WORK, $ NP0, MCROW, MCCOL, IGD, MYROW, MYCOL, NPROW, $ NPCOL ) END IF * * Broadcast a column block of WORK in every column of template * IF( ICCOL.EQ.-1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( LCMP.GT.1 ) $ CALL PBDTRSRT( ICONTXT, 'Col', NP, M, NB, WORK, NP0, $ BETA, C, LDC, LCMP, LCMQ, NP0 ) CALL DGEBS2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC ) ELSE CALL DGEBR2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC, $ MYROW, MCCOL ) END IF * * Send a column block of WORK to the destination column * ELSE IF( LCMP.EQ.1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.ICCOL ) $ CALL DGESD2D( ICONTXT, NP, M, WORK, NP, MYROW, ICCOL ) ELSE IF( MYCOL.EQ.ICCOL ) THEN IF( BETA.EQ.ZERO ) THEN CALL DGERV2D( ICONTXT, NP, M, C, LDC, MYROW, MCCOL ) ELSE CALL DGERV2D( ICONTXT, NP, M, WORK, NP, MYROW, MCCOL ) CALL PBDMATADD( ICONTXT, 'G', NP, M, ONE, WORK, NP, $ BETA, C, LDC ) END IF END IF * ELSE ML = M * MIN( LCMP, MAX( 0, ICEIL(N,NB) - MCROW ) ) IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.ICCOL ) $ CALL DGESD2D( ICONTXT, NP0, ML, WORK, NP0, $ MYROW, ICCOL ) ELSE IF( MYCOL.EQ.ICCOL ) THEN CALL DGERV2D( ICONTXT, NP0, ML, WORK, NP0, $ MYROW, MCCOL ) END IF * IF( MYCOL.EQ.ICCOL ) $ CALL PBDTRSRT( ICONTXT, 'Col', NP, M, NB, WORK, NP0, $ BETA, C, LDC, LCMP, LCMQ, NP0 ) END IF END IF * END IF END IF * RETURN * * End of PBDTRAN * END * *======================================================================= * SUBROUTINE PBDTR2AT *======================================================================= * SUBROUTINE PBDTR2AT( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, $ BETA, B, LDB, LCMP, LCMQ ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER ICONTXT, LCMP, LCMQ, LDA, LDB, M, N, NB DOUBLE PRECISION BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBDTR2AT forms B <== A^T + beta*B, or A^C + beta*B * B is a ((conjugate) transposed) scattered block row (or column), * copied from a scattered block column (or row) of A * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER IA, IB, K, INTV, JNTV * .. * .. External Subroutines .. EXTERNAL PBDMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Excutable Statements .. * IF( LCMP.EQ.LCMQ ) THEN CALL PBDMATADD( ICONTXT, TRANS, N, M, ONE, A, LDA, BETA, B, $ LDB ) * ELSE * * If A is a column block ( ADIST = 'C' ), * IF( LSAME( ADIST, 'C' ) ) THEN INTV = LCMP * NB JNTV = LCMQ * NB IA = 1 IB = 1 DO 10 K = 1, ICEIL( M, INTV ) CALL PBDMATADD( ICONTXT, TRANS, N, MIN( M-IA+1, NB ), $ ONE, A(IA,1), LDA, BETA, B(1,IB), LDB ) IA = IA + INTV IB = IB + JNTV 10 CONTINUE * * If A is a row block ( ADIST = 'R' ), * ELSE INTV = LCMP * NB JNTV = LCMQ * NB IA = 1 IB = 1 DO 20 K = 1, ICEIL( N, JNTV ) CALL PBDMATADD( ICONTXT, TRANS, MIN( N-IA+1, NB ), M, $ ONE, A(1,IA), LDA, BETA, B(IB,1), LDB ) IA = IA + JNTV IB = IB + INTV 20 CONTINUE END IF END IF * RETURN * * End of PBDTR2AT * END * *======================================================================= * SUBROUTINE PBDTR2BT *======================================================================= * SUBROUTINE PBDTR2BT( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, $ BETA, B, LDB, INTV ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER ICONTXT, INTV, LDA, LDB, M, N, NB DOUBLE PRECISION BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBDTR2BT forms T <== A^T + beta*T or A^C + beta*T, where T is a * ((conjugate) transposed) condensed block row (or column), copied from * a scattered block column (or row) of A * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER IA, IB, K * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. External Subroutines .. EXTERNAL PBDMATADD * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Excutable Statements .. * IF( INTV.EQ.NB ) THEN CALL PBDMATADD( ICONTXT, TRANS, N, M, ONE, A, LDA, BETA, B, $ LDB ) * ELSE * * If A is a column block ( ADIST = 'C' ), * IF( LSAME( ADIST, 'C' ) ) THEN IA = 1 IB = 1 DO 10 K = 1, ICEIL( M, INTV ) CALL PBDMATADD( ICONTXT, TRANS, N, MIN( M-IA+1, NB ), $ ONE, A(IA,1), LDA, BETA, B(1,IB), LDB ) IA = IA + INTV IB = IB + NB 10 CONTINUE * * If A is a row block (ADIST = 'R'), * ELSE IA = 1 IB = 1 DO 20 K = 1, ICEIL( N, INTV ) CALL PBDMATADD( ICONTXT, TRANS, MIN( N-IA+1, NB ), M, $ ONE, A(1,IA), LDA, BETA, B(IB,1), LDB ) IA = IA + INTV IB = IB + NB 20 CONTINUE END IF END IF * RETURN * * End of PBDTR2BT * END * *======================================================================= * SUBROUTINE PBDTR2AF *======================================================================= * SUBROUTINE PBDTR2AF( ICONTXT, ADIST, M, N, NB, A, LDA, BETA, B, $ LDB, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, M, N, NB, LDA, LDB, LCMP, LCMQ, NINT DOUBLE PRECISION BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBDTR2AF forms T <== A + BETA*T, where T is a scattered block * row (or column) copied from a (condensed) block column (or row) of A * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER JA, JB, K, INTV * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( LSAME( ADIST, 'R' ) ) THEN INTV = NB * LCMQ JA = 1 JB = 1 DO 10 K = 1, ICEIL( NINT, NB ) CALL PBDMATADD( ICONTXT, 'G', M, MIN( N-JB+1, NB ), ONE, $ A(1,JA), LDA, BETA, B(1,JB), LDB ) JA = JA + NB JB = JB + INTV 10 CONTINUE * * if( LSAME( ADIST, 'C' ) ) then * ELSE INTV = NB * LCMP JA = 1 JB = 1 DO 20 K = 1, ICEIL( NINT, NB ) CALL PBDMATADD( ICONTXT, 'G', MIN( M-JB+1, NB ), N, ONE, $ A(JA,1), LDA, BETA, B(JB,1), LDB ) JA = JA + NB JB = JB + INTV 20 CONTINUE END IF * RETURN * * End of PBDTR2AF * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbdtrget.f000644 000766 000024 00000010033 10363532303 020121 0ustar00juliestaff000000 000000 SUBROUTINE PBDTRGET( ICONTXT, ADIST, M, N, MNB, A, LDA, MCROW, $ MCCOL, IGD, MYROW, MYCOL, NPROW, NPCOL ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, IGD, LDA, M, MCCOL, MCROW, MNB, MYCOL, $ MYROW, N, NPCOL, NPROW * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * PBDTRGET forms a row block of A from scattered row subblocks if * ADIST = 'R', or forms a column block of A from scattered column * subblocks, if ADIST = 'C'. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. Local Variables .. INTEGER KINT, KINT2, KLEN, KMOD, KPPOS, NLEN, NNUM, $ NTLEN REAL TEMP * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL LSAME, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL DGERV2D, DGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * * if A is a row block, it needs to communicate columnwise. * IF( LSAME( ADIST, 'R' ) ) THEN KPPOS = MOD( NPROW+MYROW-MCROW, NPROW ) IF( MOD( KPPOS, IGD ).EQ.0 ) THEN KINT = IGD NLEN = N NNUM = MIN( NPROW/IGD, MNB-MCCOL ) TEMP = REAL( NNUM ) NTLEN = N * NNUM NNUM = IGD * NNUM IF( KPPOS.GE.NNUM ) GO TO 30 KPPOS = MOD( KPPOS, NPROW ) * 10 CONTINUE IF( TEMP.GT.ONE ) THEN KINT2 = 2 * KINT KMOD = MOD( KPPOS, KINT2 ) * IF( KMOD.EQ.0 ) THEN IF( KPPOS+KINT.LT.NNUM ) THEN KLEN = NTLEN - (KPPOS/KINT2)*(KINT2/IGD)*N KLEN = MIN( KLEN-NLEN, NLEN ) CALL DGERV2D( ICONTXT, M, KLEN, A(1,NLEN+1), LDA, $ MOD(MYROW+KINT, NPROW), MYCOL ) NLEN = NLEN + KLEN END IF ELSE CALL DGESD2D( ICONTXT, M, NLEN, A, LDA, $ MOD(NPROW+MYROW-KINT, NPROW), MYCOL ) GO TO 30 END IF * KINT = KINT2 TEMP = TEMP / TWO GO TO 10 END IF END IF * * if A is a column block, it needs to communicate rowwise. * ELSE IF( LSAME( ADIST, 'C' ) ) THEN * KPPOS = MOD( NPCOL+MYCOL-MCCOL, NPCOL ) IF( MOD( KPPOS, IGD ).EQ.0 ) THEN KINT = IGD NLEN = N NNUM = MIN( NPCOL/IGD, MNB-MCROW ) TEMP = REAL( NNUM ) NTLEN = N * NNUM NNUM = IGD * NNUM IF( KPPOS.GE.NNUM ) GO TO 30 KPPOS = MOD( KPPOS, NPCOL ) * 20 CONTINUE IF( TEMP.GT.ONE ) THEN KINT2 = 2 * KINT KMOD = MOD( KPPOS, KINT2 ) * IF( KMOD.EQ.0 ) THEN IF( KPPOS+KINT.LT.NNUM ) THEN KLEN = NTLEN - (KPPOS/KINT2)*(KINT2/IGD)*N KLEN = MIN( KLEN-NLEN, NLEN ) CALL DGERV2D( ICONTXT, M, KLEN, A(1,NLEN+1), LDA, $ MYROW, MOD(MYCOL+KINT, NPCOL) ) NLEN = NLEN + KLEN END IF ELSE CALL DGESD2D( ICONTXT, M, NLEN, A, LDA, MYROW, $ MOD(NPCOL+MYCOL-KINT, NPCOL) ) GO TO 30 END IF * KINT = KINT2 TEMP = TEMP / TWO GO TO 20 END IF END IF END IF * 30 CONTINUE * RETURN * * End of PBDTRGET * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbdtrnv.f000644 000766 000024 00000064701 10363532303 020000 0ustar00juliestaff000000 000000 SUBROUTINE PBDTRNV( ICONTXT, XDIST, TRANS, N, NB, NZ, X, INCX, $ BETA, Y, INCY, IXROW, IXCOL, IYROW, IYCOL, $ WORK ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * Jaeyoung Choi, Oak Ridge National Laboratory * Jack Dongarra, University of Tennessee and Oak Ridge National Lab. * David Walker, Oak Ridge National Laboratory * * .. Scalar Arguments .. CHARACTER*1 TRANS, XDIST INTEGER ICONTXT, INCX, INCY, IXCOL, IXROW, IYCOL, $ IYROW, N, NB, NZ DOUBLE PRECISION BETA * .. * .. Array Arguments .. DOUBLE PRECISION WORK( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PBDTRNV transposes a column vector to row vector, or a row vector to * column vector by reallocating data distribution. * * Y := X' * * where X and Y are N vectors. * * Parameters * ========== * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * XDIST (input) CHARACTER*1 * XDIST specifies whether X is a column vector or a row vector, * * XDIST = 'C', X is a column vector (distributed columnwise) * XDIST = 'R', X is a row vector (distributed rowwise) * * TRANS (input) CHARACTER*1 * TRANS specifies whether the transposed format is transpose * or conjugate transpose. If the vectors X and Y are real, * the argument is ignored. * * TRANS = 'T', transpose * TRANS = 'C', conjugate transpose * * N (input) INTEGER * N specifies the (global) number of the vector X and the * vector Y. N >= 0. * * NB (input) INTEGER * NB specifies the block size of vectors X and Y. NB >= 0. * * NZ (input) INTEGER * NZ is the column offset to specify the column distance from * the beginning of the block to the first element of the * vector X, and the row offset to the first element of the * vector Y if XDIST = 'C'. * Otherwise, it is row offset to specify the row distance * from the beginning of the block to the first element of the * vector X, and the column offset to the first element of the * vector Y. 0 < NZ <= NB. * * X (input) DOUBLE PRECISION array of dimension at least * ( 1 + (Np-1) * abs(INCX)) in IXCOL if XDIST = 'C', or * ( 1 + (Nq-1) * abs(INCX)) in IXROW if XDIST = 'R'. * The incremented array X must contain the vector X. * * INCX (input) INTEGER * INCX specifies the increment for the elements of X. * INCX <> 0. * * BETA (input) DOUBLE PRECISION * BETA specifies scaler beta. * * Y (input/output) DOUBLE PRECISION array of dimension at least * ( 1 + (Nq-1) * abs(INCY)) in IYROW if XDIST = 'C', or * ( 1 + (Np-1) * abs(INCY)) in IYCOL if XDIST = 'R', or * The incremented array Y must contain the vector Y. * Y will not be referenced if beta is zero. * * INCY (input) INTEGER * INCY specifies the increment for the elements of Y. * INCY <> 0. * * IXROW (input) INTEGER * IXROW specifies a row of the process template, which holds * the first element of the vector X. If X is a row vector and * all rows of processes have a copy of X, then set IXROW = -1. * * IXCOL (input) INTEGER * IXCOL specifies a column of the process template, * which holds the first element of the vector X. If X is a * column block and all columns of processes have a copy of X, * then set IXCOL = -1. * * IYROW (input) INTEGER * IYROW specifies the current row process which holds the * first element of the vector Y, which is transposed of X. * If X is a column vector and the transposed row vector Y is * distributed all rows of processes, set IYROW = -1. * * IYCOL (input) INTEGER * IYCOL specifies the current column process which holds * the first element of the vector Y, which is transposed of Y. * If X is a row block and the transposed column vector Y is * distributed all columns of processes, set IYCOL = -1. * * WORK (workspace) DOUBLE PRECISION array of dimension Size(WORK). * It needs extra working space of x**T or x**H. * * Parameters Details * ================== * * Nx It is a local portion of N owned by a process, where x is * replaced by either p (=NPROW) or q (=NPCOL)). The value is * determined by N, NB, NZ, x, and MI, where NB is a block size, * NZ is a offset from the beginning of the block, and MI is a * row or column position in a process template. Nx is equal * to or less than Nx0 = CEIL( N+NZ, NB*x ) * NB. * * Communication Scheme * ==================== * * The communication scheme of the routine is set to '1-tree', which is * fan-out. (For details, see BLACS user's guide.) * * Memory Requirement of WORK * ========================== * * NN = N + NZ * Npb = CEIL( NN, NB*NPROW ) * Nqb = CEIL( NN, NB*NPCOL ) * LCMP = LCM / NPROW * LCMQ = LCM / NPCOL * * (1) XDIST = 'C' * (a) IXCOL != -1 * Size(WORK) = CEIL(Nqb,LCMQ)*NB * (b) IXCOL = -1 * Size(WORK) = CEIL(Nqb,LCMQ)*NB * MIN(LCMQ,CEIL(NN,NB)) * * (2) XDIST = 'R' * (a) IXROW != -1 * Size(WORK) = CEIL(Npb,LCMP)*NB * (b) IXROW = -1 * Size(WORK) = CEIL(Npb,LCMP)*NB * MIN(LCMP,CEIL(NN,NB)) * * Notes * ----- * More precise space can be computed as * * CEIL(Npb,LCMP)*NB => NUMROC( NUMROC(NN,NB,0,0,NPROW), NB, 0, 0, LCMP) * CEIL(Nqb,LCMQ)*NB => NUMROC( NUMROC(NN,NB,0,0,NPCOL), NB, 0, 0, LCMQ) * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL COLFORM, ROWFORM INTEGER I, IDEX, IGD, INFO, JDEX, JYCOL, JYROW, JZ, KZ, $ LCM, LCMP, LCMQ, MCCOL, MCROW, MRCOL, MRROW, $ MYCOL, MYROW, NN, NP, NP0, NP1, NPCOL, NPROW, $ NQ, NQ0, NQ1 DOUBLE PRECISION TBETA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, ICEIL, NUMROC EXTERNAL LSAME, ILCM, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGEBR2D, DGEBS2D, DGERV2D, $ DGESD2D, PBDTR2A1, PBDTR2B1, PBDTRGET, $ PBDTRST1, PBDVECADD, PXERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible. * IF( N.EQ.0 ) RETURN * CALL BLACS_GRIDINFO( ICONTXT, NPROW, NPCOL, MYROW, MYCOL ) * COLFORM = LSAME( XDIST, 'C' ) ROWFORM = LSAME( XDIST, 'R' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.COLFORM ) .AND. ( .NOT.ROWFORM ) ) THEN INFO = 2 ELSE IF( N .LT.0 ) THEN INFO = 4 ELSE IF( NB .LT.1 ) THEN INFO = 5 ELSE IF( NZ .LT.0 .OR. NZ.GE.NB ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 ELSE IF( IXROW.LT.-1 .OR. IXROW.GE.NPROW .OR. $ ( IXROW.EQ.-1 .AND. COLFORM ) ) THEN INFO = 12 ELSE IF( IXCOL.LT.-1 .OR. IXCOL.GE.NPCOL .OR. $ ( IXCOL.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 13 ELSE IF( IYROW.LT.-1 .OR. IYROW.GE.NPROW .OR. $ ( IYROW.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 14 ELSE IF( IYCOL.LT.-1 .OR. IYCOL.GE.NPCOL .OR. $ ( IYCOL.EQ.-1 .AND. COLFORM ) ) THEN INFO = 15 END IF * 10 CONTINUE IF( INFO.NE.0 ) THEN CALL PXERBLA( ICONTXT, 'PBDTRNV ', INFO ) RETURN END IF * * Start the operations. * * LCM : the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LCMQ = LCM / NPCOL IGD = NPCOL / LCMP NN = N + NZ * * When x is a column vector * IF( COLFORM ) THEN * * Form y <== x' ( x is a column vector ) * * || * || * _____________ || * -----(y)----- <== (x) * || * || * || * IF( IXROW.LT.0 .OR. IXROW.GE.NPROW ) THEN INFO = 12 ELSE IF( IXCOL.LT.-1 .OR. IXCOL.GE.NPCOL ) THEN INFO = 13 ELSE IF( IYROW.LT.-1 .OR. IYROW.GE.NPROW ) THEN INFO = 14 ELSE IF( IYCOL.LT.0 .OR. IYCOL.GE.NPCOL ) THEN INFO = 15 END IF IF( INFO.NE.0 ) GO TO 10 * * MRROW : row relative position in template from IXROW * MRCOL : column relative position in template from IYCOL * MRROW = MOD( NPROW+MYROW-IXROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IYCOL, NPCOL ) JYROW = IYROW IF( IYROW.EQ.-1 ) JYROW = IXROW * NP = NUMROC( NN, NB, MYROW, IXROW, NPROW ) IF( MRROW.EQ.0 ) NP = NP - NZ NQ = NUMROC( NN, NB, MYCOL, IYCOL, NPCOL ) IF( MRCOL.EQ.0 ) NQ = NQ - NZ NQ0 = NUMROC( NUMROC(NN, NB, 0, 0, NPCOL), NB, 0, 0, LCMQ ) * * When a column process of IXCOL has a column block A, * IF( IXCOL .GE. 0 ) THEN TBETA = ZERO IF( MYROW.EQ.JYROW ) TBETA = BETA KZ = NZ * DO 20 I = 0, MIN( LCM, ICEIL(NN,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IXROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IYCOL, NPCOL ) IF( LCMQ.EQ.1 ) NQ0 = NUMROC( NN, NB, I, 0, NPCOL ) JDEX = (I/NPCOL) * NB IF( MRCOL.EQ.0 ) JDEX = MAX(0, JDEX-NZ) * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.IXCOL ) THEN * * The source node is a destination node * IDEX = (I/NPROW) * NB IF( MRROW.EQ.0 ) IDEX = MAX( 0, IDEX-NZ ) IF( MYROW.EQ.JYROW .AND. MYCOL.EQ.MCCOL ) THEN CALL PBDTR2B1( ICONTXT, TRANS, NP-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, TBETA, $ Y(JDEX*INCY+1), INCY, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBDTR2B1( ICONTXT, TRANS, NP-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMP, 1 ) CALL DGESD2D( ICONTXT, 1, NQ0-KZ, WORK, 1, $ JYROW, MCCOL ) END IF * * A destination node receives the copied vector * ELSE IF( MYROW.EQ.JYROW .AND. MYCOL.EQ.MCCOL ) THEN IF( LCMQ.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL DGERV2D( ICONTXT, 1, NQ0-KZ, Y, INCY, $ MCROW, IXCOL ) ELSE CALL DGERV2D( ICONTXT, 1, NQ0-KZ, WORK, 1, $ MCROW, IXCOL ) CALL PBDTR2A1( ICONTXT, NQ-JDEX, NB, KZ, WORK, 1, TBETA, $ Y(JDEX*INCY+1), INCY, LCMQ*NB ) END IF END IF KZ = 0 20 CONTINUE * * Broadcast a row block of WORK in each column of template * IF( IYROW.EQ.-1 ) THEN IF( MYROW.EQ.JYROW ) THEN CALL DGEBS2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY ) ELSE CALL DGEBR2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY, $ JYROW, MYCOL ) END IF END IF * * When all column procesors have a copy of the column block A, * ELSE IF( LCMQ.EQ.1 ) NQ0 = NQ * * Processors, which have diagonal blocks of X, copy them to * WORK array in transposed form * KZ = 0 IF( MRROW.EQ.0 ) KZ = NZ JZ = 0 IF( MRROW.EQ.0 .AND. MYCOL.EQ.IYCOL ) JZ = NZ * DO 30 I = 0, LCMP - 1 IF( MRCOL.EQ.MOD(NPROW*I+MRROW, NPCOL) ) THEN IDEX = MAX( 0, I*NB-KZ ) IF( LCMQ.EQ.1 .AND. (IYROW.EQ.-1.OR.IYROW.EQ.MYROW) ) THEN CALL PBDTR2B1( ICONTXT, TRANS, NP-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, BETA, Y, INCY, $ LCMP, 1 ) ELSE CALL PBDTR2B1( ICONTXT, TRANS, NP-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMP, 1 ) END IF END IF 30 CONTINUE * * Get diagonal blocks of A for each column of the template * MCROW = MOD( MOD(MRCOL, NPROW) + IXROW, NPROW ) IF( LCMQ.GT.1 ) THEN MCCOL = MOD( NPCOL+MYCOL-IYCOL, NPCOL ) CALL PBDTRGET( ICONTXT, 'Row', 1, NQ0, ICEIL( NN, NB ), $ WORK, 1, MCROW, MCCOL, IGD, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * * Broadcast a row block of WORK in every row of template * IF( IYROW.EQ.-1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( LCMQ.GT.1 ) THEN KZ = 0 IF( MYCOL.EQ.IYCOL ) KZ = NZ CALL PBDTRST1( ICONTXT, 'Row', NQ, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NQ0 ) END IF CALL DGEBS2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY ) ELSE CALL DGEBR2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY, $ MCROW, MYCOL ) END IF * * Send a row block of WORK to the destination row * ELSE IF( LCMQ.EQ.1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.IYROW ) $ CALL DGESD2D( ICONTXT, 1, NQ0, WORK, 1, IYROW, MYCOL ) ELSE IF( MYROW.EQ.IYROW ) THEN IF( BETA.EQ.ZERO ) THEN CALL DGERV2D( ICONTXT, 1, NQ0, Y, INCY, MCROW, MYCOL ) ELSE CALL DGERV2D( ICONTXT, 1, NQ0, WORK, 1, MCROW, MYCOL ) CALL PBDVECADD( ICONTXT, 'G', NQ0, ONE, WORK, 1, $ BETA, Y, INCY ) END IF END IF * ELSE NQ1 = NQ0 * MIN( LCMQ, MAX( 0, ICEIL(NN,NB)-MCCOL ) ) IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.IYROW ) $ CALL DGESD2D( ICONTXT, 1, NQ1, WORK, 1, IYROW, MYCOL ) ELSE IF( MYROW.EQ.IYROW ) THEN CALL DGERV2D( ICONTXT, 1, NQ1, WORK, 1, MCROW, MYCOL ) END IF * IF( MYROW.EQ.IYROW ) THEN KZ = 0 IF( MYCOL.EQ.IYCOL ) KZ = NZ CALL PBDTRST1( ICONTXT, 'Row', NQ, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NQ0 ) END IF END IF END IF END IF * * When x is a row vector * ELSE * * Form y <== x' ( x is a row block ) * * || * || * || _____________ * (y) <== -----(x)----- * || * || * || * IF( IXROW.LT.-1 .OR. IXROW.GE.NPROW ) THEN INFO = 12 ELSE IF( IXCOL.LT.0 .OR. IXCOL.GE.NPCOL ) THEN INFO = 13 ELSE IF( IYROW.LT.0 .OR. IYROW.GE.NPROW ) THEN INFO = 14 ELSE IF( IYCOL.LT.-1 .OR. IYCOL.GE.NPCOL ) THEN INFO = 15 END IF IF( INFO.NE.0 ) GO TO 10 * * MRROW : row relative position in template from IYROW * MRCOL : column relative position in template from IXCOL * MRROW = MOD( NPROW+MYROW-IYROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IXCOL, NPCOL ) JYCOL = IYCOL IF( IYCOL.EQ.-1 ) JYCOL = IXCOL * NP = NUMROC( NN, NB, MYROW, IYROW, NPROW ) IF( MRROW.EQ.0 ) NP = NP - NZ NQ = NUMROC( NN, NB, MYCOL, IXCOL, NPCOL ) IF( MRCOL.EQ.0 ) NQ = NQ - NZ NP0 = NUMROC( NUMROC(NN, NB, 0, 0, NPROW), NB, 0, 0, LCMP ) * * When a row process of IXROW has a row block A, * IF( IXROW .GE. 0 ) THEN TBETA = ZERO IF( MYCOL.EQ.JYCOL ) TBETA = BETA KZ = NZ * DO 40 I = 0, MIN( LCM, ICEIL(NN,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IYROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IXCOL, NPCOL ) IF( LCMP.EQ.1 ) NP0 = NUMROC( NN, NB, I, 0, NPROW ) JDEX = (I/NPROW) * NB IF( MRROW.EQ.0 ) JDEX = MAX(0, JDEX-NZ) * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.IXROW .AND. MYCOL.EQ.MCCOL ) THEN * * The source node is a destination node * IDEX = (I/NPCOL) * NB IF( MRCOL.EQ.0 ) IDEX = MAX( 0, IDEX-NZ ) IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JYCOL ) THEN CALL PBDTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, TBETA, $ Y(JDEX*INCY+1), INCY, LCMQ, LCMP ) * * The source node sends blocks to a destination node * ELSE CALL PBDTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMQ, 1 ) CALL DGESD2D( ICONTXT, 1, NP0-KZ, WORK, 1, $ MCROW, JYCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JYCOL ) THEN IF( LCMP.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL DGERV2D( ICONTXT, 1, NP0-KZ, Y, INCY, $ IXROW, MCCOL ) ELSE CALL DGERV2D( ICONTXT, 1, NP0-KZ, WORK, 1, $ IXROW, MCCOL ) CALL PBDTR2A1( ICONTXT, NP-JDEX, NB, KZ, WORK, 1, TBETA, $ Y(JDEX*INCY+1), INCY, LCMP*NB ) END IF END IF KZ = 0 40 CONTINUE * * Broadcast a column vector Y in each row of template * IF( IYCOL.EQ.-1 ) THEN IF( MYCOL.EQ.JYCOL ) THEN CALL DGEBS2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY ) ELSE CALL DGEBR2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY, $ MYROW, JYCOL ) END IF END IF * * When all row procesors have a copy of the row block A, * ELSE IF( LCMP.EQ.1 ) NP0 = NP * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * KZ = 0 IF( MRCOL.EQ.0 ) KZ = NZ JZ = 0 IF( MRCOL.EQ.0 .AND. MYROW.EQ.IYROW ) JZ = NZ * DO 50 I = 0, LCMQ-1 IF( MRROW.EQ.MOD(NPCOL*I+MRCOL, NPROW) ) THEN IDEX = MAX( 0, I*NB-KZ ) IF( LCMP.EQ.1 .AND. (IYCOL.EQ.-1.OR.IYCOL.EQ.MYCOL) ) THEN CALL PBDTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, BETA, Y, INCY, $ LCMQ, 1 ) ELSE CALL PBDTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMQ, 1 ) END IF END IF 50 CONTINUE * * Get diagonal blocks of A for each row of the template * MCCOL = MOD( MOD(MRROW, NPCOL) + IXCOL, NPCOL ) IF( LCMP.GT.1 ) THEN MCROW = MOD( NPROW+MYROW-IYROW, NPROW ) CALL PBDTRGET( ICONTXT, 'Col', 1, NP0, ICEIL( NN, NB ), $ WORK, 1, MCROW, MCCOL, IGD, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * * Broadcast a column block of WORK in every column of template * IF( IYCOL.EQ.-1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( LCMP.GT.1 ) THEN KZ = 0 IF( MYROW.EQ.IYROW ) KZ = NZ CALL PBDTRST1( ICONTXT, 'Col', NP, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NP0 ) END IF CALL DGEBS2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY ) ELSE CALL DGEBR2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY, $ MYROW, MCCOL ) END IF * * Send a column block of WORK to the destination column * ELSE IF( LCMP.EQ.1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.IYCOL ) $ CALL DGESD2D( ICONTXT, 1, NP, WORK, 1, MYROW, IYCOL ) ELSE IF( MYCOL.EQ.IYCOL ) THEN IF( BETA.EQ.ZERO ) THEN CALL DGERV2D( ICONTXT, 1, NP, Y, INCY, MYROW, MCCOL ) ELSE CALL DGERV2D( ICONTXT, 1, NP, WORK, 1, MYROW, MCCOL ) CALL PBDVECADD( ICONTXT, 'G', NP, ONE, WORK, 1, BETA, $ Y, INCY ) END IF END IF * ELSE NP1 = NP0 * MIN( LCMP, MAX( 0, ICEIL(NN,NB)-MCROW ) ) IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.IYCOL ) $ CALL DGESD2D( ICONTXT, 1, NP1, WORK, 1, MYROW, IYCOL ) ELSE IF( MYCOL.EQ.IYCOL ) THEN CALL DGERV2D( ICONTXT, 1, NP1, WORK, 1, MYROW, MCCOL ) END IF * IF( MYCOL.EQ.IYCOL ) THEN KZ = 0 IF( MYROW.EQ.IYROW ) KZ = NZ CALL PBDTRST1( ICONTXT, 'Col', NP, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NP0 ) END IF END IF END IF END IF END IF * RETURN * * End of PBDTRNV * END * *======================================================================= * SUBROUTINE PBDTR2A1 *======================================================================= * SUBROUTINE PBDTR2A1( ICONTXT, N, NB, NZ, X, INCX, BETA, Y, INCY, $ INTV ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. INTEGER ICONTXT, N, NB, NZ, INCX, INCY, INTV DOUBLE PRECISION BETA * .. * .. Array Arguments .. DOUBLE PRECISION X( * ), Y( * ) * .. * * Purpose * ======= * * y <== x * y is a scattered vector, copied from a condensed vector x. * * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. External Subroutines .. EXTERNAL PBDVECADD * .. * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Variables .. INTEGER IX, IY, JZ, K, ITER * IX = 0 IY = 0 JZ = NZ ITER = ICEIL( N+NZ, INTV ) * IF( ITER.GT.1 ) THEN CALL PBDVECADD( ICONTXT, 'G', NB-JZ, ONE, X(IX*INCX+1), INCX, $ BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - JZ IY = IY + INTV - JZ JZ = 0 * DO 10 K = 2, ITER-1 CALL PBDVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), INCX, $ BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + INTV 10 CONTINUE END IF * CALL PBDVECADD( ICONTXT, 'G', MIN( N-IY, NB-JZ ), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), INCY ) * RETURN * * End of PBDTR2A1 * END * *======================================================================= * SUBROUTINE PBDTR2B1 *======================================================================= * SUBROUTINE PBDTR2B1( ICONTXT, TRANS, N, NB, NZ, X, INCX, BETA, Y, $ INCY, JINX, JINY ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER ICONTXT, N, NB, NZ, INCX, INCY, JINX, JINY DOUBLE PRECISION BETA * .. * .. Array Arguments .. DOUBLE PRECISION X( * ), Y( * ) * .. * * Purpose * ======= * * y <== x + beta * y * y is a condensed vector, copied from a scattered vector x * * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. External Subroutines .. EXTERNAL PBDVECADD * .. * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Variables .. INTEGER IX, IY, JZ, K, ITER, LENX, LENY * IF( JINX.EQ.1 .AND. JINY.EQ.1 ) THEN CALL PBDVECADD( ICONTXT, TRANS, N, ONE, X, INCX, BETA, $ Y, INCY ) * ELSE IX = 0 IY = 0 JZ = NZ LENX = NB * JINX LENY = NB * JINY ITER = ICEIL( N+NZ, LENX ) * IF( ITER.GT.1 ) THEN CALL PBDVECADD( ICONTXT, TRANS, NB-JZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + LENX - JZ IY = IY + LENY - JZ JZ = 0 * DO 10 K = 2, ITER-1 CALL PBDVECADD( ICONTXT, TRANS, NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + LENX IY = IY + LENY 10 CONTINUE END IF * CALL PBDVECADD( ICONTXT, TRANS, MIN( N-IX, NB-JZ ), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), INCY ) END IF * RETURN * * End of PBDTR2B1 * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbdtrsrt.f000644 000766 000024 00000004501 10363532303 020155 0ustar00juliestaff000000 000000 SUBROUTINE PBDTRSRT( ICONTXT, ADIST, M, N, NB, A, LDA, BETA, B, $ LDB, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, LCMP, LCMQ, LDA, LDB, M, N, NB, NINT DOUBLE PRECISION BETA * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBDTRSRT forms T <== A + beta * T, where T is a sorted * condensed block row (or column) from a block column (or row) of A * with sorting index ISRT * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Variables .. INTEGER JA, JB, K, KK, NJUMP * .. * .. External Subroutines .. EXTERNAL PBDMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( LSAME( ADIST, 'R' ) ) THEN NJUMP = NB * LCMQ DO 20 K = 0, LCMQ-1 JA = NINT * MOD( K*LCMP, LCMQ ) + 1 JB = K * NB + 1 * DO 10 KK = 1, ICEIL( NINT, NB ) IF( N.LT.JB ) GO TO 20 CALL PBDMATADD( ICONTXT, 'G', M, MIN( N-JB+1, NB ), ONE, $ A(1, JA), LDA, BETA, B(1, JB), LDB ) JA = JA + NB JB = JB + NJUMP 10 CONTINUE 20 CONTINUE * * if( LSAME( ADIST, 'C') ) then * ELSE NJUMP = NB * LCMP DO 40 K = 0, LCMP-1 JA = 1 JB = K * NB + 1 * DO 30 KK = 1, ICEIL( NINT, NB ) IF( M.LT.JB ) GO TO 40 CALL PBDMATADD( ICONTXT, 'G', MIN( M-JB+1, NB ), N, ONE, $ A(JA, N*MOD(K*LCMQ,LCMP)+1), LDA, BETA, $ B(JB, 1), LDB ) JA = JA + NB JB = JB + NJUMP 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of PBDTRSRT * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbdtrst1.f000644 000766 000024 00000006451 10363532303 020062 0ustar00juliestaff000000 000000 SUBROUTINE PBDTRST1( ICONTXT, XDIST, N, NB, NZ, X, INCX, BETA, Y, $ INCY, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 XDIST INTEGER ICONTXT, INCX, INCY, LCMP, LCMQ, N, NB, NINT, $ NZ DOUBLE PRECISION BETA * .. * .. Array Arguments .. DOUBLE PRECISION X( * ), Y( * ) * .. * * Purpose * ======= * * PBDTRST1 forms y <== x + beta * y, where y is a sorted * condensed row (or column) vector from a column (or row) vector of x. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Variables .. INTEGER ITER, IX, IY, K, KK, KZ, NJUMP * .. * .. External Subroutines .. EXTERNAL PBDVECADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MAX, MOD * .. * .. Executable Statements .. * ITER = ICEIL( NINT, NB ) KZ = NZ * IF( LSAME( XDIST, 'R' ) ) THEN NJUMP = NB * LCMQ * DO 20 KK = 0, LCMQ-1 IX = NINT * MOD( KK*LCMP, LCMQ ) IY = MAX( 0, NB*KK-NZ ) IF( N.LT.IY ) GO TO 50 * IF( ITER.GT.1 ) THEN CALL PBDVECADD( ICONTXT, 'G', NB-KZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - KZ IY = IY + NJUMP - KZ KZ = 0 * DO 10 K = 2, ITER-1 CALL PBDVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + NJUMP 10 CONTINUE END IF * CALL PBDVECADD( ICONTXT, 'G', MIN(NB-KZ,N-IY), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), $ INCY ) KZ = 0 20 CONTINUE * * if( LSAME( XDIST, 'C' ) ) then * ELSE NJUMP = NB * LCMP * DO 40 KK = 0, LCMP-1 IX = NINT * MOD( KK*LCMQ, LCMP ) IY = MAX( 0, NB*KK-NZ ) IF( N.LT.IY ) GO TO 50 * IF( ITER.GT.1 ) THEN CALL PBDVECADD( ICONTXT, 'G', NB-KZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - KZ IY = IY + NJUMP - KZ KZ = 0 * DO 30 K = 2, ITER-1 CALL PBDVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + NJUMP 30 CONTINUE END IF * CALL PBDVECADD( ICONTXT, 'G', MIN(NB-KZ,N-IY), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), $ INCY ) KZ = 0 40 CONTINUE END IF * 50 CONTINUE * RETURN * * End of PBDTRST1 * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbdvecadd.f000644 000766 000024 00000015527 10363532303 020237 0ustar00juliestaff000000 000000 SUBROUTINE PBDVECADD( ICONTXT, MODE, N, ALPHA, X, INCX, BETA, Y, $ INCY ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 MODE INTEGER ICONTXT, INCX, INCY, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. DOUBLE PRECISION X( * ), Y( * ) * * .. * * Purpose * ======= * * PBDVECADD performs a vector X to be added to Y * Y := alpha*op(X) + beta*Y, * where alpha and beta are scalars, and X and Y are n vectors, * and op(X) = X**H if MODE = 'C', * * Arguments * ========= * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * MODE (input) CHARACTER*1 * Specifies the transposed, or conjugate transposed vector X * to be added to the vector Y * = 'C': Conjugate vector X is added for complex data set. * Y = alpha * X**H + beta * Y * ELSE : Vector X is added. Y = alpha*X + beta*Y * if MODE = 'V', BLAS routine may be used. * * N (input) INTEGER * The number of elements of the vectors X and Y to be added. * N >= 0. * * ALPHA (input) DOUBLE PRECISION * ALPHA specifies the scalar alpha. * * X (input) DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( N - 1 )*abs( INCX ) ) * The incremented array X must contain the vector X. * * INCX (input) INTEGER * INCX specifies the increment for the elements of X. * INCX <> 0. * * BETA (input) DOUBLE PRECISION * BETA specifies the scalar beta. * * Y (input/output) DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( N - 1 )*abs( INCY ) ) * On entry with BETA non-zero, the incremented array Y must * contain the vector Y. * On exit, Y is overwritten by the updated vector Y. * * INCY - (input) INTEGER * INCY specifies the increment for the elements of Y. * INCY <> 0. * * ===================================================================== * * .. * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0) * .. * .. Local Scalars .. INTEGER I, IX, IY * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DSCAL, DCOPY, DAXPY * .. * .. Executable Statements .. * IF( N.LE.0 .OR. ( ALPHA.EQ.ZERO .AND. BETA.EQ.ONE ) ) RETURN * IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN IF( INCY.EQ.1 ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE IY = 1 DO 20 I = 1, N Y( IY ) = ZERO IY = IY + INCY 20 CONTINUE END IF * ELSE IF( LSAME( MODE, 'V' ) ) THEN CALL DSCAL( N, BETA, Y, INCY ) ELSE IF( INCY.EQ.1 ) THEN DO 30 I = 1, N Y( I ) = BETA * Y( I ) 30 CONTINUE ELSE IY = 1 DO 40 I = 1, N Y( IY ) = BETA * Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( LSAME( MODE, 'V' ) ) THEN CALL DCOPY( N, X, INCX, Y, INCY ) ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 50 I = 1, N Y( I ) = X( I ) 50 CONTINUE ELSE IX = 1 IY = 1 DO 60 I = 1, N Y( IY ) = X( IX ) IX = IX + INCX IY = IY + INCY 60 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 70 I = 1, N Y( I ) = X( I ) + Y( I ) 70 CONTINUE ELSE IX = 1 IY = 1 DO 80 I = 1, N Y( IY ) = X( IX ) + Y( IY ) IX = IX + INCX IY = IY + INCY 80 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 90 I = 1, N Y( I ) = X( I ) + BETA * Y( I ) 90 CONTINUE ELSE IX = 1 IY = 1 DO 100 I = 1, N Y( IY ) = X( IX ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 100 CONTINUE END IF END IF * ELSE IF( BETA.EQ.ZERO ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 110 I = 1, N Y( I ) = ALPHA * X( I ) 110 CONTINUE ELSE IX = 1 IY = 1 DO 120 I = 1, N Y( IY ) = X( IX ) IX = IX + INCX IY = IY + INCY 120 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( LSAME( MODE, 'V' ) ) THEN CALL DAXPY( N, ALPHA, X, INCX, Y, INCY ) ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 130 I = 1, N Y( I ) = ALPHA * X( I ) + Y( I ) 130 CONTINUE ELSE IX = 1 IY = 1 DO 140 I = 1, N Y( IY ) = ALPHA * X( IX ) + Y( IY ) IX = IX + INCX IY = IY + INCY 140 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 150 I = 1, N Y( I ) = ALPHA * X( I ) + BETA * Y( I ) 150 CONTINUE ELSE IX = 1 IY = 1 DO 160 I = 1, N Y( IY ) = ALPHA * X( IX ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 160 CONTINUE END IF END IF END IF END IF * RETURN * * End of PBDVECADD * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbsmatadd.f000644 000766 000024 00000030253 10363532303 020253 0ustar00juliestaff000000 000000 SUBROUTINE PBSMATADD( ICONTXT, MODE, M, N, ALPHA, A, LDA, BETA, B, $ LDB ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 MODE INTEGER ICONTXT, LDA, LDB, M, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBSMATADD performs the matrix add operation B := alpha*A + beta*B, * where alpha and beta are scalars, and A and B are m-by-n * upper/lower trapezoidal matrices, or rectangular matrices. * * Arguments * ========= * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * MODE (input) CHARACTER*1 * Specifies the part of the matrix A, or (conjugate) transposed * matrix A to be added to the matrix B, * = 'U': Upper triangular part * up(B) = alpha*up(A) + beta*up(B) * = 'L': Lower triangular part * lo(B) = alpha*lo(A) + beta*lo(B) * = 'T': Transposed matrix A * B = alpha*A**T + beta*B * = 'C': Conjugate transposed matrix A * B = alpha*A**H + beta*B * Otherwise: B = alpha*A + beta*B * if M = LDA = LDB: use one BLAS loop * if MODE = 'V' : columnwise copy using BLAS if possible * else : use double loops * * M (input) INTEGER * M specifies the number of columns of the matrix A if * MODE != 'T'/'C', and it specifies the number of rows of the * matrix A otherwise. It also specifies the number of rows of * the matrix B. M >= 0. * * N (input) INTEGER * N specifies the number of rows of the matrix A if * MODE != 'T'/'C', and it specifies the number of columns of * the matrix A otherwise. It also specifies the number of * columns of the matrix B. N >= 0. * * ALPHA (input) REAL * ALPHA specifies the scalar alpha. * * A (input) REAL array, dimension (LDA,N) * The m by n matrix A if MODE != 'T'/'C'. * If MODE = 'U', only the upper triangle or trapezoid is * accessed; if MODE = 'L', only the lower triangle or * trapezoid is accessed. Otherwise all m-by-n data matrix * is accessed. * And the n by m matrix A if MODE = 'T'/'C'. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M) if * MODE != 'T'/'C'. And LDA >= max(1,N) if MODE = 'T'/'C'. * * BETA (input) REAL * BETA specifies the scalar beta. * * B (input) REAL array, dimension (LDB,N) * On exit, B = alpha*A + beta*B * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,M). * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SSCAL, SCOPY, SAXPY * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( M.LE.0 .OR. N.LE.0 .OR. ( ALPHA.EQ.ZERO.AND.BETA.EQ.ONE ) ) $ RETURN * * A is upper triangular or upper trapezoidal, * IF( LSAME( MODE, 'U' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, MIN( J, M ) B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, MIN( J, M ) B( I, J ) = BETA * B( I, J ) 30 CONTINUE 40 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 60 J = 1, N DO 50 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) 50 CONTINUE 60 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 80 J = 1, N DO 70 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) + B( I, J ) 70 CONTINUE 80 CONTINUE ELSE DO 100 J = 1, N DO 90 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 120 J = 1, N DO 110 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) 110 CONTINUE 120 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 140 J = 1, N DO 130 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 130 CONTINUE 140 CONTINUE ELSE DO 160 J = 1, N DO 150 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 150 CONTINUE 160 CONTINUE END IF END IF * * A is lower triangular or upper trapezoidal, * ELSE IF( LSAME( MODE, 'L' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, N DO 170 I = J, M B( I, J ) = ZERO 170 CONTINUE 180 CONTINUE ELSE DO 200 J = 1, N DO 190 I = J, M B( I, J ) = BETA * B( I, J ) 190 CONTINUE 200 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 220 J = 1, N DO 210 I = J, M B( I, J ) = A( I, J ) 210 CONTINUE 220 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 240 J = 1, N DO 230 I = J, M B( I, J ) = A( I, J ) + B( I, J ) 230 CONTINUE 240 CONTINUE ELSE DO 260 J = 1, N DO 250 I = J, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 280 J = 1, N DO 270 I = J, M B( I, J ) = ALPHA * A( I, J ) 270 CONTINUE 280 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 300 J = 1, N DO 290 I = J, M B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 290 CONTINUE 300 CONTINUE ELSE DO 320 J = 1, N DO 310 I = J, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 310 CONTINUE 320 CONTINUE END IF END IF * * If MODE = 'Transpose'/'Conjugate' * ELSE IF( LSAME( MODE, 'T' ) .OR. LSAME( MODE, 'C' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 340 J = 1, N DO 330 I = 1, M B( I, J ) = ZERO 330 CONTINUE 340 CONTINUE ELSE DO 360 J = 1, N DO 350 I = 1, M B( I, J ) = BETA * B( I, J ) 350 CONTINUE 360 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 380 J = 1, N DO 370 I = 1, M B( I, J ) = A( J, I ) 370 CONTINUE 380 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 400 J = 1, N DO 390 I = 1, M B( I, J ) = A( J, I ) + B( I, J ) 390 CONTINUE 400 CONTINUE ELSE DO 420 J = 1, N DO 410 I = 1, M B( I, J ) = A( J, I ) + BETA * B( I, J ) 410 CONTINUE 420 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 440 J = 1, N DO 430 I = 1, M B( I, J ) = ALPHA * A( J, I ) 430 CONTINUE 440 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 460 J = 1, N DO 450 I = 1, M B( I, J ) = ALPHA * A( J, I ) + B( I, J ) 450 CONTINUE 460 CONTINUE ELSE DO 480 J = 1, N DO 470 I = 1, M B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 470 CONTINUE 480 CONTINUE END IF END IF * * Other cases (for genral matrix additions) * ELSE IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 500 J = 1, N DO 490 I = 1, M B( I, J ) = ZERO 490 CONTINUE 500 CONTINUE * ELSE IF( M.EQ.LDB ) THEN CALL SSCAL( M*N, BETA, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 510 J = 1, N CALL SSCAL( M, BETA, B( 1, J ), 1 ) 510 CONTINUE ELSE DO 530 J = 1, N DO 520 I = 1, M B( I, J ) = BETA * B( I, J ) 520 CONTINUE 530 CONTINUE END IF END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( M.EQ.LDA .AND. M.EQ.LDB ) THEN CALL SCOPY( M*N, A( 1, 1 ), 1, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 540 J = 1, N CALL SCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) 540 CONTINUE ELSE DO 560 J = 1, N DO 550 I = 1, M B( I, J ) = A( I, J ) 550 CONTINUE 560 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN DO 580 J = 1, N DO 570 I = 1, M B( I, J ) = A( I, J ) + B( I, J ) 570 CONTINUE 580 CONTINUE * ELSE DO 600 J = 1, N DO 590 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 590 CONTINUE 600 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 620 J = 1, N DO 610 I = 1, M B( I, J ) = ALPHA * A( I, J ) 610 CONTINUE 620 CONTINUE * ELSE IF( BETA.EQ.ONE ) THEN IF( M.EQ.LDA .AND. M.EQ.LDB ) THEN CALL SAXPY( M*N, ALPHA, A( 1, 1 ), 1, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 630 J = 1, N CALL SAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) 630 CONTINUE ELSE DO 650 J = 1, N DO 640 I = 1, M B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 640 CONTINUE 650 CONTINUE END IF * ELSE DO 670 J = 1, N DO 660 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 660 CONTINUE 670 CONTINUE END IF END IF END IF * RETURN * * End of PBSMATADD * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbstran.f000644 000766 000024 00000071035 10363532303 017770 0ustar00juliestaff000000 000000 SUBROUTINE PBSTRAN( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, BETA, $ C, LDC, IAROW, IACOL, ICROW, ICCOL, WORK ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * Jaeyoung Choi, Oak Ridge National Laboratory * Jack Dongarra, University of Tennessee and Oak Ridge National Lab. * David Walker, Oak Ridge National Laboratory * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER IACOL, IAROW, ICCOL, ICONTXT, ICROW, LDA, LDC, $ M, N, NB REAL BETA * .. * .. Array Arguments .. REAL A( LDA, * ), C( LDC, * ), WORK( * ) * .. * * Purpose * ======= * * PBSTRAN transposes a column block to row block, or a row block to * column block by reallocating data distribution. * * C := A^T + beta*C, or C := A^C + beta*C * * where A is an M-by-N matrix and C is an N-by-M matrix, and the size * of M or N is limited to its block size NB. * * The first elements of the matrices A, and C should be located at * the beginnings of their first blocks. (not the middle of the blocks.) * * Parameters * ========== * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * ADIST - (input) CHARACTER*1 * ADIST specifies whether A is a column block or a row block. * * ADIST = 'C', A is a column block * ADIST = 'R', A is a row block * * TRANS - (input) CHARACTER*1 * TRANS specifies whether the transposed format is transpose * or conjugate transpose. If the matrices A and C are real, * the argument is ignored. * * TRANS = 'T', transpose * TRANS = 'C', conjugate transpose * * M - (input) INTEGER * M specifies the (global) number of rows of the matrix (block * column or block row) A and of columns of the matrix C. * M >= 0. * * N - (input) INTEGER * N specifies the (global) number of columns of the matrix * (block column or block row) A and of columns of the matrix * C. N >= 0. * * NB - (input) INTEGER * NB specifies the column block size of the matrix A and the * row block size of the matrix C when ADIST = 'C'. Otherwise, * it specifies the row block size of the matrix A and the * column block size of the matrix C. NB >= 1. * * A (input) REAL array of DIMENSION ( LDA, Lx ), * where Lx is N when ADIST = 'C', or Nq when ADIST = 'R'. * Before entry with ADIST = 'C', the leading Mp by N part of * the array A must contain the matrix A, otherwise the leading * M by Nq part of the array A must contain the matrix A. See * parameter details for the values of Mp and Nq. * * LDA (input) INTEGER * LDA specifies the leading dimension of (local) A as declared * in the calling (sub) program. LDA >= MAX(1,Mp) when * ADIST = 'C', or LDA >= MAX(1,M) otherwise. * * BETA (input) REAL * BETA specifies scaler beta. * * C (input/output) REAL array of DIMENSION ( LDC, Lx ), * where Lx is Mq when ADIST = 'C', or N when ADIST = 'R'. * If ADIST = 'C', the leading N-by-Mq part of the array C * contains the (local) matrix C, otherwise the leading * Np-by-M part of the array C must contain the (local) matrix * C. C will not be referenced if beta is zero. * * LDC (input) INTEGER * LDC specifies the leading dimension of (local) C as declared * in the calling (sub) program. LDC >= MAX(1,N) when ADIST='C', * or LDC >= MAX(1,Np) otherwise. * * IAROW (input) INTEGER * IAROW specifies a row of the process template, * which holds the first block of the matrix A. If A is a row * of blocks (ADIST = 'R') and all rows of processes have a copy * of A, then set IAROW = -1. * * IACOL (input) INTEGER * IACOL specifies a column of the process template, * which holds the first block of the matrix A. If A is a * column of blocks (ADIST = 'C') and all columns of processes * have a copy of A, then set IACOL = -1. * * ICROW (input) INTEGER * ICROW specifies the current row process which holds * the first block of the matrix C, which is transposed of A. * If C is a row of blocks (ADIST = 'C') and the transposed * row block C is distributed all rows of processes, set * ICROW = -1. * * ICCOL (input) INTEGER * ICCOL specifies the current column process which holds * the first block of the matrix C, which is transposed of A. * If C is a column of blocks (ADIST = 'R') and the transposed * column block C is distributed all columns of processes, * set ICCOL = -1. * * WORK (workspace) REAL array of dimension Size(WORK). * It needs extra working space of A'. * * Parameters Details * ================== * * Lx It is a local portion of L owned by a process, (L is * replaced by M, or N, and x is replaced by either p (=NPROW) * or q (=NPCOL)). The value is determined by L, LB, x, and * MI, where LB is a block size and MI is a row or column * position in a process template. Lx is equal to or less * than Lx0 = CEIL( L, LB*x ) * LB. * * Communication Scheme * ==================== * * The communication scheme of the routine is set to '1-tree', which is * fan-out. (For details, see BLACS user's guide.) * * Memory Requirement of WORK * ========================== * * Mqb = CEIL( M, NB*NPCOL ) * Npb = CEIL( N, NB*NPROW ) * LCMQ = LCM / NPCOL * LCMP = LCM / NPROW * * (1) ADIST = 'C' * (a) IACOL != -1 * Size(WORK) = N * CEIL(Mqb,LCMQ)*NB * (b) IACOL = -1 * Size(WORK) = N * CEIL(Mqb,LCMQ)*NB * MIN(LCMQ,CEIL(M,NB)) * * (2) ADIST = 'R' * (a) IAROW != -1 * Size(WORK) = M * CEIL(Npb,LCMP)*NB * (b) IAROW = -1 * Size(WORK) = M * CEIL(Npb,LCMP)*NB * MIN(LCMP,CEIL(N,NB)) * * Notes * ----- * More precise space can be computed as * * CEIL(Mqb,LCMQ)*NB => NUMROC( NUMROC(M,NB,0,0,NPCOL), NB, 0, 0, LCMQ ) * CEIL(Npb,LCMP)*NB => NUMROC( NUMROC(N,NB,0,0,NPROW), NB, 0, 0, LCMP ) * * ===================================================================== * * .. * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLFORM, ROWFORM INTEGER I, IDEX, IGD, INFO, JCCOL, JCROW, JDEX, LCM, $ LCMP, LCMQ, MCCOL, MCROW, ML, MP, MQ, MQ0, $ MRCOL, MRROW, MYCOL, MYROW, NP, NP0, NPCOL, $ NPROW, NQ REAL TBETA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, ICEIL, NUMROC EXTERNAL ILCM, ICEIL, LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PBSMATADD, PBSTR2AF, PBSTR2AT, $ PBSTR2BT, PBSTRGET, PBSTRSRT, PXERBLA, SGEBR2D, $ SGEBS2D, SGERV2D, SGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible. * IF( M.EQ.0 .OR. N.EQ.0 ) RETURN * CALL BLACS_GRIDINFO( ICONTXT, NPROW, NPCOL, MYROW, MYCOL ) * COLFORM = LSAME( ADIST, 'C' ) ROWFORM = LSAME( ADIST, 'R' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.COLFORM ) .AND. ( .NOT.ROWFORM ) ) THEN INFO = 2 ELSE IF( M .LT.0 ) THEN INFO = 4 ELSE IF( N .LT.0 ) THEN INFO = 5 ELSE IF( NB.LT.1 ) THEN INFO = 6 ELSE IF( IAROW.LT.-1 .OR. IAROW.GE.NPROW .OR. $ ( IAROW.EQ.-1 .AND. COLFORM ) ) THEN INFO = 12 ELSE IF( IACOL.LT.-1 .OR. IACOL.GE.NPCOL .OR. $ ( IACOL.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 13 ELSE IF( ICROW.LT.-1 .OR. ICROW.GE.NPROW .OR. $ ( ICROW.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 14 ELSE IF( ICCOL.LT.-1 .OR. ICCOL.GE.NPCOL .OR. $ ( ICCOL.EQ.-1 .AND. COLFORM ) ) THEN INFO = 15 END IF * 10 CONTINUE IF( INFO .NE. 0 ) THEN CALL PXERBLA( ICONTXT, 'PBSTRAN ', INFO ) RETURN END IF * * Start the operations. * * LCM : the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LCMQ = LCM / NPCOL IGD = NPCOL / LCMP * * When A is a column block * IF( COLFORM ) THEN * * Form C <== A' ( A is a column block ) * _ * | | * | | * _____________ | | * |______C______| <== |A| * | | * | | * |_| * * MRROW : row relative position in template from IAROW * MRCOL : column relative position in template from ICCOL * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-ICCOL, NPCOL ) JCROW = ICROW IF( ICROW.EQ.-1 ) JCROW = IAROW * MP = NUMROC( M, NB, MYROW, IAROW, NPROW ) MQ = NUMROC( M, NB, MYCOL, ICCOL, NPCOL ) MQ0 = NUMROC( NUMROC(M, NB, 0, 0, NPCOL), NB, 0, 0, LCMQ ) * IF( LDA.LT.MP .AND. $ ( IACOL.EQ.MYCOL .OR. IACOL.EQ.-1 ) ) THEN INFO = 8 ELSE IF( LDC.LT.N .AND. $ ( ICROW.EQ.MYROW .OR. ICROW.EQ.-1 ) ) THEN INFO = 11 END IF IF( INFO.NE.0 ) GO TO 10 * * When a column process of IACOL has a column block A, * IF( IACOL.GE.0 ) THEN TBETA = ZERO IF( MYROW.EQ.JCROW ) TBETA = BETA * DO 20 I = 0, MIN( LCM, ICEIL(M,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IAROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + ICCOL, NPCOL ) IF( LCMQ.EQ.1 ) MQ0 = NUMROC( M, NB, I, 0, NPCOL ) JDEX = (I/NPCOL) * NB * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.IACOL ) THEN * * The source node is a destination node * IDEX = (I/NPROW) * NB IF( MYROW.EQ.JCROW .AND. MYCOL.EQ.MCCOL ) THEN CALL PBSTR2AT( ICONTXT, 'Col', TRANS, MP-IDEX, N, NB, $ A(IDEX+1,1), LDA, TBETA, C(1,JDEX+1), $ LDC, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBSTR2BT( ICONTXT, 'Col', TRANS, MP-IDEX, N, NB, $ A(IDEX+1,1), LDA, ZERO, WORK, N, $ LCMP*NB ) CALL SGESD2D( ICONTXT, N, MQ0, WORK, N, JCROW, MCCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.JCROW .AND. MYCOL.EQ.MCCOL ) THEN IF( LCMQ.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL SGERV2D( ICONTXT, N, MQ0, C, LDC, MCROW, IACOL ) ELSE CALL SGERV2D( ICONTXT, N, MQ0, WORK, N, MCROW, IACOL ) CALL PBSTR2AF( ICONTXT, 'Row', N, MQ-JDEX, NB, WORK, N, $ TBETA, C(1,JDEX+1), LDC, LCMP, LCMQ, $ MQ0 ) END IF END IF 20 CONTINUE * * Broadcast a row block of C in each column of template * IF( ICROW.EQ.-1 ) THEN IF( MYROW.EQ.JCROW ) THEN CALL SGEBS2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC ) ELSE CALL SGEBR2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC, $ JCROW, MYCOL ) END IF END IF * * When all column procesors have a copy of the column block A, * ELSE IF( LCMQ.EQ.1 ) MQ0 = MQ * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * DO 30 I = 0, LCMP-1 IF( MRCOL.EQ.MOD( NPROW*I+MRROW, NPCOL ) ) THEN IF( LCMQ.EQ.1.AND.(ICROW.EQ.-1.OR.ICROW.EQ.MYROW) ) THEN CALL PBSTR2BT( ICONTXT, 'Col', TRANS, MP-I*NB, N, NB, $ A(I*NB+1,1), LDA, BETA, C, LDC, $ LCMP*NB ) ELSE CALL PBSTR2BT( ICONTXT, 'Col', TRANS, MP-I*NB, N, NB, $ A(I*NB+1,1), LDA, ZERO, WORK, N, $ LCMP*NB ) END IF END IF 30 CONTINUE * * Get diagonal blocks of A for each column of the template * MCROW = MOD( MOD(MRCOL,NPROW)+IAROW, NPROW ) IF( LCMQ.GT.1 ) THEN MCCOL = MOD( NPCOL+MYCOL-ICCOL, NPCOL ) CALL PBSTRGET( ICONTXT, 'Row', N, MQ0, ICEIL(M,NB), WORK, N, $ MCROW, MCCOL, IGD, MYROW, MYCOL, NPROW, $ NPCOL ) END IF * * Broadcast a row block of WORK in every row of template * IF( ICROW.EQ.-1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( LCMQ.GT.1 ) $ CALL PBSTRSRT( ICONTXT, 'Row', N, MQ, NB, WORK, N, BETA, $ C, LDC, LCMP, LCMQ, MQ0 ) CALL SGEBS2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC ) ELSE CALL SGEBR2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC, $ MCROW, MYCOL ) END IF * * Send a row block of WORK to the destination row * ELSE IF( LCMQ.EQ.1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.ICROW ) $ CALL SGESD2D( ICONTXT, N, MQ, WORK, N, ICROW, MYCOL ) ELSE IF( MYROW.EQ.ICROW ) THEN IF( BETA.EQ.ZERO ) THEN CALL SGERV2D( ICONTXT, N, MQ, C, LDC, MCROW, MYCOL ) ELSE CALL SGERV2D( ICONTXT, N, MQ, WORK, N, MCROW, MYCOL ) CALL PBSMATADD( ICONTXT, 'G', N, MQ, ONE, WORK, N, $ BETA, C, LDC ) END IF END IF * ELSE ML = MQ0 * MIN( LCMQ, MAX(0,ICEIL(M,NB)-MCCOL) ) IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.ICROW ) $ CALL SGESD2D( ICONTXT, N, ML, WORK, N, ICROW, MYCOL ) ELSE IF( MYROW.EQ.ICROW ) THEN CALL SGERV2D( ICONTXT, N, ML, WORK, N, MCROW, MYCOL ) END IF * IF( MYROW.EQ.ICROW ) $ CALL PBSTRSRT( ICONTXT, 'Row', N, MQ, NB, WORK, N, BETA, $ C, LDC, LCMP, LCMQ, MQ0 ) END IF END IF * END IF * * When A is a row block * ELSE * * Form C <== A' ( A is a row block ) * _ * | | * | | * | | _____________ * |C| <== |______A______| * | | * | | * |_| * * MRROW : row relative position in template from ICROW * MRCOL : column relative position in template from IACOL * MRROW = MOD( NPROW+MYROW-ICROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) JCCOL = ICCOL IF( ICCOL.EQ.-1 ) JCCOL = IACOL * NP = NUMROC( N, NB, MYROW, ICROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) NP0 = NUMROC( NUMROC(N, NB, 0, 0, NPROW), NB, 0, 0, LCMP ) * IF( LDA.LT.M .AND. $ ( IAROW.EQ.MYROW .OR. IAROW.EQ.-1 ) ) THEN INFO = 8 ELSE IF( LDC.LT.NP .AND. $ ( ICCOL.EQ.MYCOL .OR. ICCOL.EQ.-1 ) ) THEN INFO = 11 END IF IF( INFO.NE.0 ) GO TO 10 * * When a row process of IAROW has a row block A, * IF( IAROW.GE.0 ) THEN TBETA = ZERO IF( MYCOL.EQ.JCCOL ) TBETA = BETA * DO 40 I = 0, MIN( LCM, ICEIL(N,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + ICROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IACOL, NPCOL ) IF( LCMP.EQ.1 ) NP0 = NUMROC( N, NB, I, 0, NPROW ) IDEX = (I/NPROW) * NB * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.MCCOL ) THEN * * The source node is a destination node * JDEX = (I/NPCOL) * NB IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JCCOL ) THEN CALL PBSTR2AT( ICONTXT, 'Row', TRANS, M, NQ-JDEX, NB, $ A(1,JDEX+1), LDA, TBETA, C(IDEX+1,1), $ LDC, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBSTR2BT( ICONTXT, 'Row', TRANS, M, NQ-JDEX, NB, $ A(1,JDEX+1), LDA, ZERO, WORK, NP0, $ LCMQ*NB ) CALL SGESD2D( ICONTXT, NP0, M, WORK, NP0, $ MCROW, JCCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JCCOL ) THEN IF( LCMP.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL SGERV2D( ICONTXT, NP0, M, C, LDC, IAROW, MCCOL ) ELSE CALL SGERV2D( ICONTXT, NP0, M, WORK, NP0, IAROW, MCCOL ) CALL PBSTR2AF( ICONTXT, 'Col', NP-IDEX, M, NB, WORK, $ NP0, TBETA, C(IDEX+1,1), LDC, LCMP, LCMQ, $ NP0 ) END IF END IF 40 CONTINUE * * Broadcast a column block of WORK in each row of template * IF( ICCOL.EQ.-1 ) THEN IF( MYCOL.EQ.JCCOL ) THEN CALL SGEBS2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC ) ELSE CALL SGEBR2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC, $ MYROW, JCCOL ) END IF END IF * * When all row procesors have a copy of the row block A, * ELSE IF( LCMP.EQ.1 ) NP0 = NP * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * DO 50 I = 0, LCMQ-1 IF( MRROW.EQ.MOD(NPCOL*I+MRCOL, NPROW) ) THEN IF( LCMP.EQ.1.AND.(ICCOL.EQ.-1.OR.ICCOL.EQ.MYCOL) ) THEN CALL PBSTR2BT( ICONTXT, 'Row', TRANS, M, NQ-I*NB, NB, $ A(1,I*NB+1), LDA, BETA, C, LDC, $ LCMQ*NB ) ELSE CALL PBSTR2BT( ICONTXT, 'Row', TRANS, M, NQ-I*NB, NB, $ A(1,I*NB+1), LDA, ZERO, WORK, NP0, $ LCMQ*NB ) END IF END IF 50 CONTINUE * * Get diagonal blocks of A for each row of the template * MCCOL = MOD( MOD(MRROW, NPCOL)+IACOL, NPCOL ) IF( LCMP.GT.1 ) THEN MCROW = MOD( NPROW+MYROW-ICROW, NPROW ) CALL PBSTRGET( ICONTXT, 'Col', NP0, M, ICEIL(N,NB), WORK, $ NP0, MCROW, MCCOL, IGD, MYROW, MYCOL, NPROW, $ NPCOL ) END IF * * Broadcast a column block of WORK in every column of template * IF( ICCOL.EQ.-1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( LCMP.GT.1 ) $ CALL PBSTRSRT( ICONTXT, 'Col', NP, M, NB, WORK, NP0, $ BETA, C, LDC, LCMP, LCMQ, NP0 ) CALL SGEBS2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC ) ELSE CALL SGEBR2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC, $ MYROW, MCCOL ) END IF * * Send a column block of WORK to the destination column * ELSE IF( LCMP.EQ.1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.ICCOL ) $ CALL SGESD2D( ICONTXT, NP, M, WORK, NP, MYROW, ICCOL ) ELSE IF( MYCOL.EQ.ICCOL ) THEN IF( BETA.EQ.ZERO ) THEN CALL SGERV2D( ICONTXT, NP, M, C, LDC, MYROW, MCCOL ) ELSE CALL SGERV2D( ICONTXT, NP, M, WORK, NP, MYROW, MCCOL ) CALL PBSMATADD( ICONTXT, 'G', NP, M, ONE, WORK, NP, $ BETA, C, LDC ) END IF END IF * ELSE ML = M * MIN( LCMP, MAX( 0, ICEIL(N,NB) - MCROW ) ) IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.ICCOL ) $ CALL SGESD2D( ICONTXT, NP0, ML, WORK, NP0, $ MYROW, ICCOL ) ELSE IF( MYCOL.EQ.ICCOL ) THEN CALL SGERV2D( ICONTXT, NP0, ML, WORK, NP0, $ MYROW, MCCOL ) END IF * IF( MYCOL.EQ.ICCOL ) $ CALL PBSTRSRT( ICONTXT, 'Col', NP, M, NB, WORK, NP0, $ BETA, C, LDC, LCMP, LCMQ, NP0 ) END IF END IF * END IF END IF * RETURN * * End of PBSTRAN * END * *======================================================================= * SUBROUTINE PBSTR2AT *======================================================================= * SUBROUTINE PBSTR2AT( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, $ BETA, B, LDB, LCMP, LCMQ ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER ICONTXT, LCMP, LCMQ, LDA, LDB, M, N, NB REAL BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBSTR2AT forms B <== A^T + beta*B, or A^C + beta*B * B is a ((conjugate) transposed) scattered block row (or column), * copied from a scattered block column (or row) of A * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER IA, IB, K, INTV, JNTV * .. * .. External Subroutines .. EXTERNAL PBSMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Excutable Statements .. * IF( LCMP.EQ.LCMQ ) THEN CALL PBSMATADD( ICONTXT, TRANS, N, M, ONE, A, LDA, BETA, B, $ LDB ) * ELSE * * If A is a column block ( ADIST = 'C' ), * IF( LSAME( ADIST, 'C' ) ) THEN INTV = LCMP * NB JNTV = LCMQ * NB IA = 1 IB = 1 DO 10 K = 1, ICEIL( M, INTV ) CALL PBSMATADD( ICONTXT, TRANS, N, MIN( M-IA+1, NB ), $ ONE, A(IA,1), LDA, BETA, B(1,IB), LDB ) IA = IA + INTV IB = IB + JNTV 10 CONTINUE * * If A is a row block ( ADIST = 'R' ), * ELSE INTV = LCMP * NB JNTV = LCMQ * NB IA = 1 IB = 1 DO 20 K = 1, ICEIL( N, JNTV ) CALL PBSMATADD( ICONTXT, TRANS, MIN( N-IA+1, NB ), M, $ ONE, A(1,IA), LDA, BETA, B(IB,1), LDB ) IA = IA + JNTV IB = IB + INTV 20 CONTINUE END IF END IF * RETURN * * End of PBSTR2AT * END * *======================================================================= * SUBROUTINE PBSTR2BT *======================================================================= * SUBROUTINE PBSTR2BT( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, $ BETA, B, LDB, INTV ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER ICONTXT, INTV, LDA, LDB, M, N, NB REAL BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBSTR2BT forms T <== A^T + beta*T or A^C + beta*T, where T is a * ((conjugate) transposed) condensed block row (or column), copied from * a scattered block column (or row) of A * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER IA, IB, K * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. External Subroutines .. EXTERNAL PBSMATADD * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Excutable Statements .. * IF( INTV.EQ.NB ) THEN CALL PBSMATADD( ICONTXT, TRANS, N, M, ONE, A, LDA, BETA, B, $ LDB ) * ELSE * * If A is a column block ( ADIST = 'C' ), * IF( LSAME( ADIST, 'C' ) ) THEN IA = 1 IB = 1 DO 10 K = 1, ICEIL( M, INTV ) CALL PBSMATADD( ICONTXT, TRANS, N, MIN( M-IA+1, NB ), $ ONE, A(IA,1), LDA, BETA, B(1,IB), LDB ) IA = IA + INTV IB = IB + NB 10 CONTINUE * * If A is a row block (ADIST = 'R'), * ELSE IA = 1 IB = 1 DO 20 K = 1, ICEIL( N, INTV ) CALL PBSMATADD( ICONTXT, TRANS, MIN( N-IA+1, NB ), M, $ ONE, A(1,IA), LDA, BETA, B(IB,1), LDB ) IA = IA + INTV IB = IB + NB 20 CONTINUE END IF END IF * RETURN * * End of PBSTR2BT * END * *======================================================================= * SUBROUTINE PBSTR2AF *======================================================================= * SUBROUTINE PBSTR2AF( ICONTXT, ADIST, M, N, NB, A, LDA, BETA, B, $ LDB, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, M, N, NB, LDA, LDB, LCMP, LCMQ, NINT REAL BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBSTR2AF forms T <== A + BETA*T, where T is a scattered block * row (or column) copied from a (condensed) block column (or row) of A * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER JA, JB, K, INTV * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( LSAME( ADIST, 'R' ) ) THEN INTV = NB * LCMQ JA = 1 JB = 1 DO 10 K = 1, ICEIL( NINT, NB ) CALL PBSMATADD( ICONTXT, 'G', M, MIN( N-JB+1, NB ), ONE, $ A(1,JA), LDA, BETA, B(1,JB), LDB ) JA = JA + NB JB = JB + INTV 10 CONTINUE * * if( LSAME( ADIST, 'C' ) ) then * ELSE INTV = NB * LCMP JA = 1 JB = 1 DO 20 K = 1, ICEIL( NINT, NB ) CALL PBSMATADD( ICONTXT, 'G', MIN( M-JB+1, NB ), N, ONE, $ A(JA,1), LDA, BETA, B(JB,1), LDB ) JA = JA + NB JB = JB + INTV 20 CONTINUE END IF * RETURN * * End of PBSTR2AF * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbstrget.f000644 000766 000024 00000010033 10363532303 020140 0ustar00juliestaff000000 000000 SUBROUTINE PBSTRGET( ICONTXT, ADIST, M, N, MNB, A, LDA, MCROW, $ MCCOL, IGD, MYROW, MYCOL, NPROW, NPCOL ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, IGD, LDA, M, MCCOL, MCROW, MNB, MYCOL, $ MYROW, N, NPCOL, NPROW * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * PBSTRGET forms a row block of A from scattered row subblocks if * ADIST = 'R', or forms a column block of A from scattered column * subblocks, if ADIST = 'C'. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. Local Variables .. INTEGER KINT, KINT2, KLEN, KMOD, KPPOS, NLEN, NNUM, $ NTLEN REAL TEMP * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL LSAME, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL SGERV2D, SGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * * if A is a row block, it needs to communicate columnwise. * IF( LSAME( ADIST, 'R' ) ) THEN KPPOS = MOD( NPROW+MYROW-MCROW, NPROW ) IF( MOD( KPPOS, IGD ).EQ.0 ) THEN KINT = IGD NLEN = N NNUM = MIN( NPROW/IGD, MNB-MCCOL ) TEMP = REAL( NNUM ) NTLEN = N * NNUM NNUM = IGD * NNUM IF( KPPOS.GE.NNUM ) GO TO 30 KPPOS = MOD( KPPOS, NPROW ) * 10 CONTINUE IF( TEMP.GT.ONE ) THEN KINT2 = 2 * KINT KMOD = MOD( KPPOS, KINT2 ) * IF( KMOD.EQ.0 ) THEN IF( KPPOS+KINT.LT.NNUM ) THEN KLEN = NTLEN - (KPPOS/KINT2)*(KINT2/IGD)*N KLEN = MIN( KLEN-NLEN, NLEN ) CALL SGERV2D( ICONTXT, M, KLEN, A(1,NLEN+1), LDA, $ MOD(MYROW+KINT, NPROW), MYCOL ) NLEN = NLEN + KLEN END IF ELSE CALL SGESD2D( ICONTXT, M, NLEN, A, LDA, $ MOD(NPROW+MYROW-KINT, NPROW), MYCOL ) GO TO 30 END IF * KINT = KINT2 TEMP = TEMP / TWO GO TO 10 END IF END IF * * if A is a column block, it needs to communicate rowwise. * ELSE IF( LSAME( ADIST, 'C' ) ) THEN * KPPOS = MOD( NPCOL+MYCOL-MCCOL, NPCOL ) IF( MOD( KPPOS, IGD ).EQ.0 ) THEN KINT = IGD NLEN = N NNUM = MIN( NPCOL/IGD, MNB-MCROW ) TEMP = REAL( NNUM ) NTLEN = N * NNUM NNUM = IGD * NNUM IF( KPPOS.GE.NNUM ) GO TO 30 KPPOS = MOD( KPPOS, NPCOL ) * 20 CONTINUE IF( TEMP.GT.ONE ) THEN KINT2 = 2 * KINT KMOD = MOD( KPPOS, KINT2 ) * IF( KMOD.EQ.0 ) THEN IF( KPPOS+KINT.LT.NNUM ) THEN KLEN = NTLEN - (KPPOS/KINT2)*(KINT2/IGD)*N KLEN = MIN( KLEN-NLEN, NLEN ) CALL SGERV2D( ICONTXT, M, KLEN, A(1,NLEN+1), LDA, $ MYROW, MOD(MYCOL+KINT, NPCOL) ) NLEN = NLEN + KLEN END IF ELSE CALL SGESD2D( ICONTXT, M, NLEN, A, LDA, MYROW, $ MOD(NPCOL+MYCOL-KINT, NPCOL) ) GO TO 30 END IF * KINT = KINT2 TEMP = TEMP / TWO GO TO 20 END IF END IF END IF * 30 CONTINUE * RETURN * * End of PBSTRGET * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbstrnv.f000644 000766 000024 00000064621 10363532303 020020 0ustar00juliestaff000000 000000 SUBROUTINE PBSTRNV( ICONTXT, XDIST, TRANS, N, NB, NZ, X, INCX, $ BETA, Y, INCY, IXROW, IXCOL, IYROW, IYCOL, $ WORK ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * Jaeyoung Choi, Oak Ridge National Laboratory * Jack Dongarra, University of Tennessee and Oak Ridge National Lab. * David Walker, Oak Ridge National Laboratory * * .. Scalar Arguments .. CHARACTER*1 TRANS, XDIST INTEGER ICONTXT, INCX, INCY, IXCOL, IXROW, IYCOL, $ IYROW, N, NB, NZ REAL BETA * .. * .. Array Arguments .. REAL WORK( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PBSTRNV transposes a column vector to row vector, or a row vector to * column vector by reallocating data distribution. * * Y := X' * * where X and Y are N vectors. * * Parameters * ========== * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * XDIST (input) CHARACTER*1 * XDIST specifies whether X is a column vector or a row vector, * * XDIST = 'C', X is a column vector (distributed columnwise) * XDIST = 'R', X is a row vector (distributed rowwise) * * TRANS (input) CHARACTER*1 * TRANS specifies whether the transposed format is transpose * or conjugate transpose. If the vectors X and Y are real, * the argument is ignored. * * TRANS = 'T', transpose * TRANS = 'C', conjugate transpose * * N (input) INTEGER * N specifies the (global) number of the vector X and the * vector Y. N >= 0. * * NB (input) INTEGER * NB specifies the block size of vectors X and Y. NB >= 0. * * NZ (input) INTEGER * NZ is the column offset to specify the column distance from * the beginning of the block to the first element of the * vector X, and the row offset to the first element of the * vector Y if XDIST = 'C'. * Otherwise, it is row offset to specify the row distance * from the beginning of the block to the first element of the * vector X, and the column offset to the first element of the * vector Y. 0 < NZ <= NB. * * X (input) REAL array of dimension at least * ( 1 + (Np-1) * abs(INCX)) in IXCOL if XDIST = 'C', or * ( 1 + (Nq-1) * abs(INCX)) in IXROW if XDIST = 'R'. * The incremented array X must contain the vector X. * * INCX (input) INTEGER * INCX specifies the increment for the elements of X. * INCX <> 0. * * BETA (input) REAL * BETA specifies scaler beta. * * Y (input/output) REAL array of dimension at least * ( 1 + (Nq-1) * abs(INCY)) in IYROW if XDIST = 'C', or * ( 1 + (Np-1) * abs(INCY)) in IYCOL if XDIST = 'R', or * The incremented array Y must contain the vector Y. * Y will not be referenced if beta is zero. * * INCY (input) INTEGER * INCY specifies the increment for the elements of Y. * INCY <> 0. * * IXROW (input) INTEGER * IXROW specifies a row of the process template, which holds * the first element of the vector X. If X is a row vector and * all rows of processes have a copy of X, then set IXROW = -1. * * IXCOL (input) INTEGER * IXCOL specifies a column of the process template, * which holds the first element of the vector X. If X is a * column block and all columns of processes have a copy of X, * then set IXCOL = -1. * * IYROW (input) INTEGER * IYROW specifies the current row process which holds the * first element of the vector Y, which is transposed of X. * If X is a column vector and the transposed row vector Y is * distributed all rows of processes, set IYROW = -1. * * IYCOL (input) INTEGER * IYCOL specifies the current column process which holds * the first element of the vector Y, which is transposed of Y. * If X is a row block and the transposed column vector Y is * distributed all columns of processes, set IYCOL = -1. * * WORK (workspace) REAL array of dimension Size(WORK). * It needs extra working space of x**T or x**H. * * Parameters Details * ================== * * Nx It is a local portion of N owned by a process, where x is * replaced by either p (=NPROW) or q (=NPCOL)). The value is * determined by N, NB, NZ, x, and MI, where NB is a block size, * NZ is a offset from the beginning of the block, and MI is a * row or column position in a process template. Nx is equal * to or less than Nx0 = CEIL( N+NZ, NB*x ) * NB. * * Communication Scheme * ==================== * * The communication scheme of the routine is set to '1-tree', which is * fan-out. (For details, see BLACS user's guide.) * * Memory Requirement of WORK * ========================== * * NN = N + NZ * Npb = CEIL( NN, NB*NPROW ) * Nqb = CEIL( NN, NB*NPCOL ) * LCMP = LCM / NPROW * LCMQ = LCM / NPCOL * * (1) XDIST = 'C' * (a) IXCOL != -1 * Size(WORK) = CEIL(Nqb,LCMQ)*NB * (b) IXCOL = -1 * Size(WORK) = CEIL(Nqb,LCMQ)*NB * MIN(LCMQ,CEIL(NN,NB)) * * (2) XDIST = 'R' * (a) IXROW != -1 * Size(WORK) = CEIL(Npb,LCMP)*NB * (b) IXROW = -1 * Size(WORK) = CEIL(Npb,LCMP)*NB * MIN(LCMP,CEIL(NN,NB)) * * Notes * ----- * More precise space can be computed as * * CEIL(Npb,LCMP)*NB => NUMROC( NUMROC(NN,NB,0,0,NPROW), NB, 0, 0, LCMP) * CEIL(Nqb,LCMQ)*NB => NUMROC( NUMROC(NN,NB,0,0,NPCOL), NB, 0, 0, LCMQ) * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL COLFORM, ROWFORM INTEGER I, IDEX, IGD, INFO, JDEX, JYCOL, JYROW, JZ, KZ, $ LCM, LCMP, LCMQ, MCCOL, MCROW, MRCOL, MRROW, $ MYCOL, MYROW, NN, NP, NP0, NP1, NPCOL, NPROW, $ NQ, NQ0, NQ1 REAL TBETA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, ICEIL, NUMROC EXTERNAL LSAME, ILCM, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PBSTR2A1, PBSTR2B1, PBSTRGET, $ PBSTRST1, PBSVECADD, PXERBLA, SGEBR2D, SGEBS2D, $ SGERV2D, SGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible. * IF( N.EQ.0 ) RETURN * CALL BLACS_GRIDINFO( ICONTXT, NPROW, NPCOL, MYROW, MYCOL ) * COLFORM = LSAME( XDIST, 'C' ) ROWFORM = LSAME( XDIST, 'R' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.COLFORM ) .AND. ( .NOT.ROWFORM ) ) THEN INFO = 2 ELSE IF( N .LT.0 ) THEN INFO = 4 ELSE IF( NB .LT.1 ) THEN INFO = 5 ELSE IF( NZ .LT.0 .OR. NZ.GE.NB ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 ELSE IF( IXROW.LT.-1 .OR. IXROW.GE.NPROW .OR. $ ( IXROW.EQ.-1 .AND. COLFORM ) ) THEN INFO = 12 ELSE IF( IXCOL.LT.-1 .OR. IXCOL.GE.NPCOL .OR. $ ( IXCOL.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 13 ELSE IF( IYROW.LT.-1 .OR. IYROW.GE.NPROW .OR. $ ( IYROW.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 14 ELSE IF( IYCOL.LT.-1 .OR. IYCOL.GE.NPCOL .OR. $ ( IYCOL.EQ.-1 .AND. COLFORM ) ) THEN INFO = 15 END IF * 10 CONTINUE IF( INFO.NE.0 ) THEN CALL PXERBLA( ICONTXT, 'PBSTRNV ', INFO ) RETURN END IF * * Start the operations. * * LCM : the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LCMQ = LCM / NPCOL IGD = NPCOL / LCMP NN = N + NZ * * When x is a column vector * IF( COLFORM ) THEN * * Form y <== x' ( x is a column vector ) * * || * || * _____________ || * -----(y)----- <== (x) * || * || * || * IF( IXROW.LT.0 .OR. IXROW.GE.NPROW ) THEN INFO = 12 ELSE IF( IXCOL.LT.-1 .OR. IXCOL.GE.NPCOL ) THEN INFO = 13 ELSE IF( IYROW.LT.-1 .OR. IYROW.GE.NPROW ) THEN INFO = 14 ELSE IF( IYCOL.LT.0 .OR. IYCOL.GE.NPCOL ) THEN INFO = 15 END IF IF( INFO.NE.0 ) GO TO 10 * * MRROW : row relative position in template from IXROW * MRCOL : column relative position in template from IYCOL * MRROW = MOD( NPROW+MYROW-IXROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IYCOL, NPCOL ) JYROW = IYROW IF( IYROW.EQ.-1 ) JYROW = IXROW * NP = NUMROC( NN, NB, MYROW, IXROW, NPROW ) IF( MRROW.EQ.0 ) NP = NP - NZ NQ = NUMROC( NN, NB, MYCOL, IYCOL, NPCOL ) IF( MRCOL.EQ.0 ) NQ = NQ - NZ NQ0 = NUMROC( NUMROC(NN, NB, 0, 0, NPCOL), NB, 0, 0, LCMQ ) * * When a column process of IXCOL has a column block A, * IF( IXCOL .GE. 0 ) THEN TBETA = ZERO IF( MYROW.EQ.JYROW ) TBETA = BETA KZ = NZ * DO 20 I = 0, MIN( LCM, ICEIL(NN,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IXROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IYCOL, NPCOL ) IF( LCMQ.EQ.1 ) NQ0 = NUMROC( NN, NB, I, 0, NPCOL ) JDEX = (I/NPCOL) * NB IF( MRCOL.EQ.0 ) JDEX = MAX(0, JDEX-NZ) * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.IXCOL ) THEN * * The source node is a destination node * IDEX = (I/NPROW) * NB IF( MRROW.EQ.0 ) IDEX = MAX( 0, IDEX-NZ ) IF( MYROW.EQ.JYROW .AND. MYCOL.EQ.MCCOL ) THEN CALL PBSTR2B1( ICONTXT, TRANS, NP-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, TBETA, $ Y(JDEX*INCY+1), INCY, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBSTR2B1( ICONTXT, TRANS, NP-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMP, 1 ) CALL SGESD2D( ICONTXT, 1, NQ0-KZ, WORK, 1, $ JYROW, MCCOL ) END IF * * A destination node receives the copied vector * ELSE IF( MYROW.EQ.JYROW .AND. MYCOL.EQ.MCCOL ) THEN IF( LCMQ.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL SGERV2D( ICONTXT, 1, NQ0-KZ, Y, INCY, $ MCROW, IXCOL ) ELSE CALL SGERV2D( ICONTXT, 1, NQ0-KZ, WORK, 1, $ MCROW, IXCOL ) CALL PBSTR2A1( ICONTXT, NQ-JDEX, NB, KZ, WORK, 1, TBETA, $ Y(JDEX*INCY+1), INCY, LCMQ*NB ) END IF END IF KZ = 0 20 CONTINUE * * Broadcast a row block of WORK in each column of template * IF( IYROW.EQ.-1 ) THEN IF( MYROW.EQ.JYROW ) THEN CALL SGEBS2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY ) ELSE CALL SGEBR2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY, $ JYROW, MYCOL ) END IF END IF * * When all column procesors have a copy of the column block A, * ELSE IF( LCMQ.EQ.1 ) NQ0 = NQ * * Processors, which have diagonal blocks of X, copy them to * WORK array in transposed form * KZ = 0 IF( MRROW.EQ.0 ) KZ = NZ JZ = 0 IF( MRROW.EQ.0 .AND. MYCOL.EQ.IYCOL ) JZ = NZ * DO 30 I = 0, LCMP - 1 IF( MRCOL.EQ.MOD(NPROW*I+MRROW, NPCOL) ) THEN IDEX = MAX( 0, I*NB-KZ ) IF( LCMQ.EQ.1 .AND. (IYROW.EQ.-1.OR.IYROW.EQ.MYROW) ) THEN CALL PBSTR2B1( ICONTXT, TRANS, NP-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, BETA, Y, INCY, $ LCMP, 1 ) ELSE CALL PBSTR2B1( ICONTXT, TRANS, NP-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMP, 1 ) END IF END IF 30 CONTINUE * * Get diagonal blocks of A for each column of the template * MCROW = MOD( MOD(MRCOL, NPROW) + IXROW, NPROW ) IF( LCMQ.GT.1 ) THEN MCCOL = MOD( NPCOL+MYCOL-IYCOL, NPCOL ) CALL PBSTRGET( ICONTXT, 'Row', 1, NQ0, ICEIL( NN, NB ), $ WORK, 1, MCROW, MCCOL, IGD, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * * Broadcast a row block of WORK in every row of template * IF( IYROW.EQ.-1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( LCMQ.GT.1 ) THEN KZ = 0 IF( MYCOL.EQ.IYCOL ) KZ = NZ CALL PBSTRST1( ICONTXT, 'Row', NQ, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NQ0 ) END IF CALL SGEBS2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY ) ELSE CALL SGEBR2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY, $ MCROW, MYCOL ) END IF * * Send a row block of WORK to the destination row * ELSE IF( LCMQ.EQ.1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.IYROW ) $ CALL SGESD2D( ICONTXT, 1, NQ0, WORK, 1, IYROW, MYCOL ) ELSE IF( MYROW.EQ.IYROW ) THEN IF( BETA.EQ.ZERO ) THEN CALL SGERV2D( ICONTXT, 1, NQ0, Y, INCY, MCROW, MYCOL ) ELSE CALL SGERV2D( ICONTXT, 1, NQ0, WORK, 1, MCROW, MYCOL ) CALL PBSVECADD( ICONTXT, 'G', NQ0, ONE, WORK, 1, $ BETA, Y, INCY ) END IF END IF * ELSE NQ1 = NQ0 * MIN( LCMQ, MAX( 0, ICEIL(NN,NB)-MCCOL ) ) IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.IYROW ) $ CALL SGESD2D( ICONTXT, 1, NQ1, WORK, 1, IYROW, MYCOL ) ELSE IF( MYROW.EQ.IYROW ) THEN CALL SGERV2D( ICONTXT, 1, NQ1, WORK, 1, MCROW, MYCOL ) END IF * IF( MYROW.EQ.IYROW ) THEN KZ = 0 IF( MYCOL.EQ.IYCOL ) KZ = NZ CALL PBSTRST1( ICONTXT, 'Row', NQ, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NQ0 ) END IF END IF END IF END IF * * When x is a row vector * ELSE * * Form y <== x' ( x is a row block ) * * || * || * || _____________ * (y) <== -----(x)----- * || * || * || * IF( IXROW.LT.-1 .OR. IXROW.GE.NPROW ) THEN INFO = 12 ELSE IF( IXCOL.LT.0 .OR. IXCOL.GE.NPCOL ) THEN INFO = 13 ELSE IF( IYROW.LT.0 .OR. IYROW.GE.NPROW ) THEN INFO = 14 ELSE IF( IYCOL.LT.-1 .OR. IYCOL.GE.NPCOL ) THEN INFO = 15 END IF IF( INFO.NE.0 ) GO TO 10 * * MRROW : row relative position in template from IYROW * MRCOL : column relative position in template from IXCOL * MRROW = MOD( NPROW+MYROW-IYROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IXCOL, NPCOL ) JYCOL = IYCOL IF( IYCOL.EQ.-1 ) JYCOL = IXCOL * NP = NUMROC( NN, NB, MYROW, IYROW, NPROW ) IF( MRROW.EQ.0 ) NP = NP - NZ NQ = NUMROC( NN, NB, MYCOL, IXCOL, NPCOL ) IF( MRCOL.EQ.0 ) NQ = NQ - NZ NP0 = NUMROC( NUMROC(NN, NB, 0, 0, NPROW), NB, 0, 0, LCMP ) * * When a row process of IXROW has a row block A, * IF( IXROW .GE. 0 ) THEN TBETA = ZERO IF( MYCOL.EQ.JYCOL ) TBETA = BETA KZ = NZ * DO 40 I = 0, MIN( LCM, ICEIL(NN,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IYROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IXCOL, NPCOL ) IF( LCMP.EQ.1 ) NP0 = NUMROC( NN, NB, I, 0, NPROW ) JDEX = (I/NPROW) * NB IF( MRROW.EQ.0 ) JDEX = MAX(0, JDEX-NZ) * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.IXROW .AND. MYCOL.EQ.MCCOL ) THEN * * The source node is a destination node * IDEX = (I/NPCOL) * NB IF( MRCOL.EQ.0 ) IDEX = MAX( 0, IDEX-NZ ) IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JYCOL ) THEN CALL PBSTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, TBETA, $ Y(JDEX*INCY+1), INCY, LCMQ, LCMP ) * * The source node sends blocks to a destination node * ELSE CALL PBSTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMQ, 1 ) CALL SGESD2D( ICONTXT, 1, NP0-KZ, WORK, 1, $ MCROW, JYCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JYCOL ) THEN IF( LCMP.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL SGERV2D( ICONTXT, 1, NP0-KZ, Y, INCY, $ IXROW, MCCOL ) ELSE CALL SGERV2D( ICONTXT, 1, NP0-KZ, WORK, 1, $ IXROW, MCCOL ) CALL PBSTR2A1( ICONTXT, NP-JDEX, NB, KZ, WORK, 1, TBETA, $ Y(JDEX*INCY+1), INCY, LCMP*NB ) END IF END IF KZ = 0 40 CONTINUE * * Broadcast a column vector Y in each row of template * IF( IYCOL.EQ.-1 ) THEN IF( MYCOL.EQ.JYCOL ) THEN CALL SGEBS2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY ) ELSE CALL SGEBR2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY, $ MYROW, JYCOL ) END IF END IF * * When all row procesors have a copy of the row block A, * ELSE IF( LCMP.EQ.1 ) NP0 = NP * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * KZ = 0 IF( MRCOL.EQ.0 ) KZ = NZ JZ = 0 IF( MRCOL.EQ.0 .AND. MYROW.EQ.IYROW ) JZ = NZ * DO 50 I = 0, LCMQ-1 IF( MRROW.EQ.MOD(NPCOL*I+MRCOL, NPROW) ) THEN IDEX = MAX( 0, I*NB-KZ ) IF( LCMP.EQ.1 .AND. (IYCOL.EQ.-1.OR.IYCOL.EQ.MYCOL) ) THEN CALL PBSTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, BETA, Y, INCY, $ LCMQ, 1 ) ELSE CALL PBSTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMQ, 1 ) END IF END IF 50 CONTINUE * * Get diagonal blocks of A for each row of the template * MCCOL = MOD( MOD(MRROW, NPCOL) + IXCOL, NPCOL ) IF( LCMP.GT.1 ) THEN MCROW = MOD( NPROW+MYROW-IYROW, NPROW ) CALL PBSTRGET( ICONTXT, 'Col', 1, NP0, ICEIL( NN, NB ), $ WORK, 1, MCROW, MCCOL, IGD, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * * Broadcast a column block of WORK in every column of template * IF( IYCOL.EQ.-1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( LCMP.GT.1 ) THEN KZ = 0 IF( MYROW.EQ.IYROW ) KZ = NZ CALL PBSTRST1( ICONTXT, 'Col', NP, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NP0 ) END IF CALL SGEBS2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY ) ELSE CALL SGEBR2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY, $ MYROW, MCCOL ) END IF * * Send a column block of WORK to the destination column * ELSE IF( LCMP.EQ.1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.IYCOL ) $ CALL SGESD2D( ICONTXT, 1, NP, WORK, 1, MYROW, IYCOL ) ELSE IF( MYCOL.EQ.IYCOL ) THEN IF( BETA.EQ.ZERO ) THEN CALL SGERV2D( ICONTXT, 1, NP, Y, INCY, MYROW, MCCOL ) ELSE CALL SGERV2D( ICONTXT, 1, NP, WORK, 1, MYROW, MCCOL ) CALL PBSVECADD( ICONTXT, 'G', NP, ONE, WORK, 1, BETA, $ Y, INCY ) END IF END IF * ELSE NP1 = NP0 * MIN( LCMP, MAX( 0, ICEIL(NN,NB)-MCROW ) ) IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.IYCOL ) $ CALL SGESD2D( ICONTXT, 1, NP1, WORK, 1, MYROW, IYCOL ) ELSE IF( MYCOL.EQ.IYCOL ) THEN CALL SGERV2D( ICONTXT, 1, NP1, WORK, 1, MYROW, MCCOL ) END IF * IF( MYCOL.EQ.IYCOL ) THEN KZ = 0 IF( MYROW.EQ.IYROW ) KZ = NZ CALL PBSTRST1( ICONTXT, 'Col', NP, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NP0 ) END IF END IF END IF END IF END IF * RETURN * * End of PBSTRNV * END * *======================================================================= * SUBROUTINE PBSTR2A1 *======================================================================= * SUBROUTINE PBSTR2A1( ICONTXT, N, NB, NZ, X, INCX, BETA, Y, INCY, $ INTV ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. INTEGER ICONTXT, N, NB, NZ, INCX, INCY, INTV REAL BETA * .. * .. Array Arguments .. REAL X( * ), Y( * ) * .. * * Purpose * ======= * * y <== x * y is a scattered vector, copied from a condensed vector x. * * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. External Subroutines .. EXTERNAL PBSVECADD * .. * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Variables .. INTEGER IX, IY, JZ, K, ITER * IX = 0 IY = 0 JZ = NZ ITER = ICEIL( N+NZ, INTV ) * IF( ITER.GT.1 ) THEN CALL PBSVECADD( ICONTXT, 'G', NB-JZ, ONE, X(IX*INCX+1), INCX, $ BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - JZ IY = IY + INTV - JZ JZ = 0 * DO 10 K = 2, ITER-1 CALL PBSVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), INCX, $ BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + INTV 10 CONTINUE END IF * CALL PBSVECADD( ICONTXT, 'G', MIN( N-IY, NB-JZ ), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), INCY ) * RETURN * * End of PBSTR2A1 * END * *======================================================================= * SUBROUTINE PBSTR2B1 *======================================================================= * SUBROUTINE PBSTR2B1( ICONTXT, TRANS, N, NB, NZ, X, INCX, BETA, Y, $ INCY, JINX, JINY ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER ICONTXT, N, NB, NZ, INCX, INCY, JINX, JINY REAL BETA * .. * .. Array Arguments .. REAL X( * ), Y( * ) * .. * * Purpose * ======= * * y <== x + beta * y * y is a condensed vector, copied from a scattered vector x * * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. External Subroutines .. EXTERNAL PBSVECADD * .. * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Variables .. INTEGER IX, IY, JZ, K, ITER, LENX, LENY * IF( JINX.EQ.1 .AND. JINY.EQ.1 ) THEN CALL PBSVECADD( ICONTXT, TRANS, N, ONE, X, INCX, BETA, $ Y, INCY ) * ELSE IX = 0 IY = 0 JZ = NZ LENX = NB * JINX LENY = NB * JINY ITER = ICEIL( N+NZ, LENX ) * IF( ITER.GT.1 ) THEN CALL PBSVECADD( ICONTXT, TRANS, NB-JZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + LENX - JZ IY = IY + LENY - JZ JZ = 0 * DO 10 K = 2, ITER-1 CALL PBSVECADD( ICONTXT, TRANS, NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + LENX IY = IY + LENY 10 CONTINUE END IF * CALL PBSVECADD( ICONTXT, TRANS, MIN( N-IX, NB-JZ ), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), INCY ) END IF * RETURN * * End of PBSTR2B1 * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbstrsrt.f000644 000766 000024 00000004501 10363532303 020174 0ustar00juliestaff000000 000000 SUBROUTINE PBSTRSRT( ICONTXT, ADIST, M, N, NB, A, LDA, BETA, B, $ LDB, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, LCMP, LCMQ, LDA, LDB, M, N, NB, NINT REAL BETA * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBSTRSRT forms T <== A + beta * T, where T is a sorted * condensed block row (or column) from a block column (or row) of A * with sorting index ISRT * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Variables .. INTEGER JA, JB, K, KK, NJUMP * .. * .. External Subroutines .. EXTERNAL PBSMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( LSAME( ADIST, 'R' ) ) THEN NJUMP = NB * LCMQ DO 20 K = 0, LCMQ-1 JA = NINT * MOD( K*LCMP, LCMQ ) + 1 JB = K * NB + 1 * DO 10 KK = 1, ICEIL( NINT, NB ) IF( N.LT.JB ) GO TO 20 CALL PBSMATADD( ICONTXT, 'G', M, MIN( N-JB+1, NB ), ONE, $ A(1, JA), LDA, BETA, B(1, JB), LDB ) JA = JA + NB JB = JB + NJUMP 10 CONTINUE 20 CONTINUE * * if( LSAME( ADIST, 'C') ) then * ELSE NJUMP = NB * LCMP DO 40 K = 0, LCMP-1 JA = 1 JB = K * NB + 1 * DO 30 KK = 1, ICEIL( NINT, NB ) IF( M.LT.JB ) GO TO 40 CALL PBSMATADD( ICONTXT, 'G', MIN( M-JB+1, NB ), N, ONE, $ A(JA, N*MOD(K*LCMQ,LCMP)+1), LDA, BETA, $ B(JB, 1), LDB ) JA = JA + NB JB = JB + NJUMP 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of PBSTRSRT * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbstrst1.f000644 000766 000024 00000006451 10363532303 020101 0ustar00juliestaff000000 000000 SUBROUTINE PBSTRST1( ICONTXT, XDIST, N, NB, NZ, X, INCX, BETA, Y, $ INCY, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 XDIST INTEGER ICONTXT, INCX, INCY, LCMP, LCMQ, N, NB, NINT, $ NZ REAL BETA * .. * .. Array Arguments .. REAL X( * ), Y( * ) * .. * * Purpose * ======= * * PBSTRST1 forms y <== x + beta * y, where y is a sorted * condensed row (or column) vector from a column (or row) vector of x. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Variables .. INTEGER ITER, IX, IY, K, KK, KZ, NJUMP * .. * .. External Subroutines .. EXTERNAL PBSVECADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MAX, MOD * .. * .. Executable Statements .. * ITER = ICEIL( NINT, NB ) KZ = NZ * IF( LSAME( XDIST, 'R' ) ) THEN NJUMP = NB * LCMQ * DO 20 KK = 0, LCMQ-1 IX = NINT * MOD( KK*LCMP, LCMQ ) IY = MAX( 0, NB*KK-NZ ) IF( N.LT.IY ) GO TO 50 * IF( ITER.GT.1 ) THEN CALL PBSVECADD( ICONTXT, 'G', NB-KZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - KZ IY = IY + NJUMP - KZ KZ = 0 * DO 10 K = 2, ITER-1 CALL PBSVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + NJUMP 10 CONTINUE END IF * CALL PBSVECADD( ICONTXT, 'G', MIN(NB-KZ,N-IY), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), $ INCY ) KZ = 0 20 CONTINUE * * if( LSAME( XDIST, 'C' ) ) then * ELSE NJUMP = NB * LCMP * DO 40 KK = 0, LCMP-1 IX = NINT * MOD( KK*LCMQ, LCMP ) IY = MAX( 0, NB*KK-NZ ) IF( N.LT.IY ) GO TO 50 * IF( ITER.GT.1 ) THEN CALL PBSVECADD( ICONTXT, 'G', NB-KZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - KZ IY = IY + NJUMP - KZ KZ = 0 * DO 30 K = 2, ITER-1 CALL PBSVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + NJUMP 30 CONTINUE END IF * CALL PBSVECADD( ICONTXT, 'G', MIN(NB-KZ,N-IY), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), $ INCY ) KZ = 0 40 CONTINUE END IF * 50 CONTINUE * RETURN * * End of PBSTRST1 * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbsvecadd.f000644 000766 000024 00000015447 10363532303 020257 0ustar00juliestaff000000 000000 SUBROUTINE PBSVECADD( ICONTXT, MODE, N, ALPHA, X, INCX, BETA, Y, $ INCY ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 MODE INTEGER ICONTXT, INCX, INCY, N REAL ALPHA, BETA * .. * .. Array Arguments .. REAL X( * ), Y( * ) * * .. * * Purpose * ======= * * PBSVECADD performs a vector X to be added to Y * Y := alpha*op(X) + beta*Y, * where alpha and beta are scalars, and X and Y are n vectors, * and op(X) = X**H if MODE = 'C', * * Arguments * ========= * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * MODE (input) CHARACTER*1 * Specifies the transposed, or conjugate transposed vector X * to be added to the vector Y * = 'C': Conjugate vector X is added for complex data set. * Y = alpha * X**H + beta * Y * ELSE : Vector X is added. Y = alpha*X + beta*Y * if MODE = 'V', BLAS routine may be used. * * N (input) INTEGER * The number of elements of the vectors X and Y to be added. * N >= 0. * * ALPHA (input) REAL * ALPHA specifies the scalar alpha. * * X (input) REAL array of DIMENSION at least * ( 1 + ( N - 1 )*abs( INCX ) ) * The incremented array X must contain the vector X. * * INCX (input) INTEGER * INCX specifies the increment for the elements of X. * INCX <> 0. * * BETA (input) REAL * BETA specifies the scalar beta. * * Y (input/output) REAL array of DIMENSION at least * ( 1 + ( N - 1 )*abs( INCY ) ) * On entry with BETA non-zero, the incremented array Y must * contain the vector Y. * On exit, Y is overwritten by the updated vector Y. * * INCY - (input) INTEGER * INCY specifies the increment for the elements of Y. * INCY <> 0. * * ===================================================================== * * .. * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0) * .. * .. Local Scalars .. INTEGER I, IX, IY * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SSCAL, SCOPY, SAXPY * .. * .. Executable Statements .. * IF( N.LE.0 .OR. ( ALPHA.EQ.ZERO .AND. BETA.EQ.ONE ) ) RETURN * IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN IF( INCY.EQ.1 ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE IY = 1 DO 20 I = 1, N Y( IY ) = ZERO IY = IY + INCY 20 CONTINUE END IF * ELSE IF( LSAME( MODE, 'V' ) ) THEN CALL SSCAL( N, BETA, Y, INCY ) ELSE IF( INCY.EQ.1 ) THEN DO 30 I = 1, N Y( I ) = BETA * Y( I ) 30 CONTINUE ELSE IY = 1 DO 40 I = 1, N Y( IY ) = BETA * Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( LSAME( MODE, 'V' ) ) THEN CALL SCOPY( N, X, INCX, Y, INCY ) ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 50 I = 1, N Y( I ) = X( I ) 50 CONTINUE ELSE IX = 1 IY = 1 DO 60 I = 1, N Y( IY ) = X( IX ) IX = IX + INCX IY = IY + INCY 60 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 70 I = 1, N Y( I ) = X( I ) + Y( I ) 70 CONTINUE ELSE IX = 1 IY = 1 DO 80 I = 1, N Y( IY ) = X( IX ) + Y( IY ) IX = IX + INCX IY = IY + INCY 80 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 90 I = 1, N Y( I ) = X( I ) + BETA * Y( I ) 90 CONTINUE ELSE IX = 1 IY = 1 DO 100 I = 1, N Y( IY ) = X( IX ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 100 CONTINUE END IF END IF * ELSE IF( BETA.EQ.ZERO ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 110 I = 1, N Y( I ) = ALPHA * X( I ) 110 CONTINUE ELSE IX = 1 IY = 1 DO 120 I = 1, N Y( IY ) = X( IX ) IX = IX + INCX IY = IY + INCY 120 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( LSAME( MODE, 'V' ) ) THEN CALL SAXPY( N, ALPHA, X, INCX, Y, INCY ) ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 130 I = 1, N Y( I ) = ALPHA * X( I ) + Y( I ) 130 CONTINUE ELSE IX = 1 IY = 1 DO 140 I = 1, N Y( IY ) = ALPHA * X( IX ) + Y( IY ) IX = IX + INCX IY = IY + INCY 140 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 150 I = 1, N Y( I ) = ALPHA * X( I ) + BETA * Y( I ) 150 CONTINUE ELSE IX = 1 IY = 1 DO 160 I = 1, N Y( IY ) = ALPHA * X( IX ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 160 CONTINUE END IF END IF END IF END IF * RETURN * * End of PBSVECADD * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbzmatadd.f000644 000766 000024 00000034130 10363532303 020260 0ustar00juliestaff000000 000000 SUBROUTINE PBZMATADD( ICONTXT, MODE, M, N, ALPHA, A, LDA, BETA, B, $ LDB ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 MODE INTEGER ICONTXT, LDA, LDB, M, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBZMATADD performs the matrix add operation B := alpha*A + beta*B, * where alpha and beta are scalars, and A and B are m-by-n * upper/lower trapezoidal matrices, or rectangular matrices. * * Arguments * ========= * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * MODE (input) CHARACTER*1 * Specifies the part of the matrix A, or (conjugate) transposed * matrix A to be added to the matrix B, * = 'U': Upper triangular part * up(B) = alpha*up(A) + beta*up(B) * = 'L': Lower triangular part * lo(B) = alpha*lo(A) + beta*lo(B) * = 'T': Transposed matrix A * B = alpha*A**T + beta*B * = 'C': Conjugate transposed matrix A * B = alpha*A**H + beta*B * Otherwise: B = alpha*A + beta*B * if M = LDA = LDB: use one BLAS loop * if MODE = 'V' : columnwise copy using BLAS if possible * else : use double loops * * M (input) INTEGER * M specifies the number of columns of the matrix A if * MODE != 'T'/'C', and it specifies the number of rows of the * matrix A otherwise. It also specifies the number of rows of * the matrix B. M >= 0. * * N (input) INTEGER * N specifies the number of rows of the matrix A if * MODE != 'T'/'C', and it specifies the number of columns of * the matrix A otherwise. It also specifies the number of * columns of the matrix B. N >= 0. * * ALPHA (input) COMPLEX*16 * ALPHA specifies the scalar alpha. * * A (input) COMPLEX*16 array, dimension (LDA,N) * The m by n matrix A if MODE != 'T'/'C'. * If MODE = 'U', only the upper triangle or trapezoid is * accessed; if MODE = 'L', only the lower triangle or * trapezoid is accessed. Otherwise all m-by-n data matrix * is accessed. * And the n by m matrix A if MODE = 'T'/'C'. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M) if * MODE != 'T'/'C'. And LDA >= max(1,N) if MODE = 'T'/'C'. * * BETA (input) COMPLEX*16 * BETA specifies the scalar beta. * * B (input) COMPLEX*16 array, dimension (LDB,N) * On exit, B = alpha*A + beta*B * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,M). * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, J * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL ZSCAL, ZCOPY, ZAXPY * .. * .. Intrinsic Functions .. INTRINSIC MIN, DCONJG * .. * .. Executable Statements .. * IF( M.LE.0 .OR. N.LE.0 .OR. ( ALPHA.EQ.ZERO.AND.BETA.EQ.ONE ) ) $ RETURN * * A is upper triangular or upper trapezoidal, * IF( LSAME( MODE, 'U' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, MIN( J, M ) B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, MIN( J, M ) B( I, J ) = BETA * B( I, J ) 30 CONTINUE 40 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 60 J = 1, N DO 50 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) 50 CONTINUE 60 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 80 J = 1, N DO 70 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) + B( I, J ) 70 CONTINUE 80 CONTINUE ELSE DO 100 J = 1, N DO 90 I = 1, MIN( J, M ) B( I, J ) = A( I, J ) + BETA * B( I, J ) 90 CONTINUE 100 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 120 J = 1, N DO 110 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) 110 CONTINUE 120 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 140 J = 1, N DO 130 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 130 CONTINUE 140 CONTINUE ELSE DO 160 J = 1, N DO 150 I = 1, MIN( J, M ) B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 150 CONTINUE 160 CONTINUE END IF END IF * * A is lower triangular or upper trapezoidal, * ELSE IF( LSAME( MODE, 'L' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 180 J = 1, N DO 170 I = J, M B( I, J ) = ZERO 170 CONTINUE 180 CONTINUE ELSE DO 200 J = 1, N DO 190 I = J, M B( I, J ) = BETA * B( I, J ) 190 CONTINUE 200 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 220 J = 1, N DO 210 I = J, M B( I, J ) = A( I, J ) 210 CONTINUE 220 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 240 J = 1, N DO 230 I = J, M B( I, J ) = A( I, J ) + B( I, J ) 230 CONTINUE 240 CONTINUE ELSE DO 260 J = 1, N DO 250 I = J, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 250 CONTINUE 260 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 280 J = 1, N DO 270 I = J, M B( I, J ) = ALPHA * A( I, J ) 270 CONTINUE 280 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 300 J = 1, N DO 290 I = J, M B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 290 CONTINUE 300 CONTINUE ELSE DO 320 J = 1, N DO 310 I = J, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 310 CONTINUE 320 CONTINUE END IF END IF * * If MODE = 'Transpose' * ELSE IF( LSAME( MODE, 'T' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 340 J = 1, N DO 330 I = 1, M B( I, J ) = ZERO 330 CONTINUE 340 CONTINUE ELSE DO 360 J = 1, N DO 350 I = 1, M B( I, J ) = BETA * B( I, J ) 350 CONTINUE 360 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 380 J = 1, N DO 370 I = 1, M B( I, J ) = A( J, I ) 370 CONTINUE 380 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 400 J = 1, N DO 390 I = 1, M B( I, J ) = A( J, I ) + B( I, J ) 390 CONTINUE 400 CONTINUE ELSE DO 420 J = 1, N DO 410 I = 1, M B( I, J ) = A( J, I ) + BETA * B( I, J ) 410 CONTINUE 420 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 440 J = 1, N DO 430 I = 1, M B( I, J ) = ALPHA * A( J, I ) 430 CONTINUE 440 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 460 J = 1, N DO 450 I = 1, M B( I, J ) = ALPHA * A( J, I ) + B( I, J ) 450 CONTINUE 460 CONTINUE ELSE DO 480 J = 1, N DO 470 I = 1, M B( I, J ) = ALPHA * A( J, I ) + BETA * B( I, J ) 470 CONTINUE 480 CONTINUE END IF END IF * * If MODE = 'Conjugate', * ELSE IF( LSAME( MODE, 'C' ) ) THEN IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 500 J = 1, N DO 490 I = 1, M B( I, J ) = ZERO 490 CONTINUE 500 CONTINUE ELSE DO 520 J = 1, N DO 510 I = 1, M B( I, J ) = BETA * B( I, J ) 510 CONTINUE 520 CONTINUE END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN DO 540 J = 1, N DO 530 I = 1, M B( I, J ) = DCONJG( A( J, I ) ) 530 CONTINUE 540 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 560 J = 1, N DO 550 I = 1, M B( I, J ) = DCONJG( A( J, I ) ) + B( I, J ) 550 CONTINUE 560 CONTINUE ELSE DO 580 J = 1, N DO 570 I = 1, M B( I, J ) = DCONJG( A( J, I ) ) + BETA * B( I, J ) 570 CONTINUE 580 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 600 J = 1, N DO 590 I = 1, M B( I, J ) = ALPHA * DCONJG( A( J, I ) ) 590 CONTINUE 600 CONTINUE ELSE IF( BETA.EQ.ONE ) THEN DO 620 J = 1, N DO 610 I = 1, M B( I, J ) = ALPHA * DCONJG( A( J, I ) ) + B( I, J ) 610 CONTINUE 620 CONTINUE ELSE DO 640 J = 1, N DO 630 I = 1, M B( I, J ) = ALPHA * DCONJG( A( J, I ) ) $ + BETA * B( I, J ) 630 CONTINUE 640 CONTINUE END IF END IF * * Other cases (for genral matrix additions) * ELSE IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN DO 660 J = 1, N DO 650 I = 1, M B( I, J ) = ZERO 650 CONTINUE 660 CONTINUE * ELSE IF( M.EQ.LDB ) THEN CALL ZSCAL( M*N, BETA, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 670 J = 1, N CALL ZSCAL( M, BETA, B( 1, J ), 1 ) 670 CONTINUE ELSE DO 690 J = 1, N DO 680 I = 1, M B( I, J ) = BETA * B( I, J ) 680 CONTINUE 690 CONTINUE END IF END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( M.EQ.LDA .AND. M.EQ.LDB ) THEN CALL ZCOPY( M*N, A( 1, 1 ), 1, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 700 J = 1, N CALL ZCOPY( M, A( 1, J ), 1, B( 1, J ), 1 ) 700 CONTINUE ELSE DO 720 J = 1, N DO 710 I = 1, M B( I, J ) = A( I, J ) 710 CONTINUE 720 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN DO 740 J = 1, N DO 730 I = 1, M B( I, J ) = A( I, J ) + B( I, J ) 730 CONTINUE 740 CONTINUE * ELSE DO 760 J = 1, N DO 750 I = 1, M B( I, J ) = A( I, J ) + BETA * B( I, J ) 750 CONTINUE 760 CONTINUE END IF * ELSE IF( BETA.EQ.ZERO ) THEN DO 780 J = 1, N DO 770 I = 1, M B( I, J ) = ALPHA * A( I, J ) 770 CONTINUE 780 CONTINUE * ELSE IF( BETA.EQ.ONE ) THEN IF( M.EQ.LDA .AND. M.EQ.LDB ) THEN CALL ZAXPY( M*N, ALPHA, A( 1, 1 ), 1, B( 1, 1 ), 1 ) ELSE IF( LSAME( MODE, 'V' ) ) THEN DO 790 J = 1, N CALL ZAXPY( M, ALPHA, A( 1, J ), 1, B( 1, J ), 1 ) 790 CONTINUE ELSE DO 810 J = 1, N DO 800 I = 1, M B( I, J ) = ALPHA * A( I, J ) + B( I, J ) 800 CONTINUE 810 CONTINUE END IF * ELSE DO 830 J = 1, N DO 820 I = 1, M B( I, J ) = ALPHA * A( I, J ) + BETA * B( I, J ) 820 CONTINUE 830 CONTINUE END IF END IF END IF * RETURN * * End of PBZMATADD * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbztran.f000644 000766 000024 00000071215 10363532303 017777 0ustar00juliestaff000000 000000 SUBROUTINE PBZTRAN( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, BETA, $ C, LDC, IAROW, IACOL, ICROW, ICCOL, WORK ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * Jaeyoung Choi, Oak Ridge National Laboratory * Jack Dongarra, University of Tennessee and Oak Ridge National Lab. * David Walker, Oak Ridge National Laboratory * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER IACOL, IAROW, ICCOL, ICONTXT, ICROW, LDA, LDC, $ M, N, NB COMPLEX*16 BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), C( LDC, * ), WORK( * ) * .. * * Purpose * ======= * * PBZTRAN transposes a column block to row block, or a row block to * column block by reallocating data distribution. * * C := A^T + beta*C, or C := A^C + beta*C * * where A is an M-by-N matrix and C is an N-by-M matrix, and the size * of M or N is limited to its block size NB. * * The first elements of the matrices A, and C should be located at * the beginnings of their first blocks. (not the middle of the blocks.) * * Parameters * ========== * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * ADIST - (input) CHARACTER*1 * ADIST specifies whether A is a column block or a row block. * * ADIST = 'C', A is a column block * ADIST = 'R', A is a row block * * TRANS - (input) CHARACTER*1 * TRANS specifies whether the transposed format is transpose * or conjugate transpose. If the matrices A and C are real, * the argument is ignored. * * TRANS = 'T', transpose * TRANS = 'C', conjugate transpose * * M - (input) INTEGER * M specifies the (global) number of rows of the matrix (block * column or block row) A and of columns of the matrix C. * M >= 0. * * N - (input) INTEGER * N specifies the (global) number of columns of the matrix * (block column or block row) A and of columns of the matrix * C. N >= 0. * * NB - (input) INTEGER * NB specifies the column block size of the matrix A and the * row block size of the matrix C when ADIST = 'C'. Otherwise, * it specifies the row block size of the matrix A and the * column block size of the matrix C. NB >= 1. * * A (input) COMPLEX*16 array of DIMENSION ( LDA, Lx ), * where Lx is N when ADIST = 'C', or Nq when ADIST = 'R'. * Before entry with ADIST = 'C', the leading Mp by N part of * the array A must contain the matrix A, otherwise the leading * M by Nq part of the array A must contain the matrix A. See * parameter details for the values of Mp and Nq. * * LDA (input) INTEGER * LDA specifies the leading dimension of (local) A as declared * in the calling (sub) program. LDA >= MAX(1,Mp) when * ADIST = 'C', or LDA >= MAX(1,M) otherwise. * * BETA (input) COMPLEX*16 * BETA specifies scaler beta. * * C (input/output) COMPLEX*16 array of DIMENSION ( LDC, Lx ), * where Lx is Mq when ADIST = 'C', or N when ADIST = 'R'. * If ADIST = 'C', the leading N-by-Mq part of the array C * contains the (local) matrix C, otherwise the leading * Np-by-M part of the array C must contain the (local) matrix * C. C will not be referenced if beta is zero. * * LDC (input) INTEGER * LDC specifies the leading dimension of (local) C as declared * in the calling (sub) program. LDC >= MAX(1,N) when ADIST='C', * or LDC >= MAX(1,Np) otherwise. * * IAROW (input) INTEGER * IAROW specifies a row of the process template, * which holds the first block of the matrix A. If A is a row * of blocks (ADIST = 'R') and all rows of processes have a copy * of A, then set IAROW = -1. * * IACOL (input) INTEGER * IACOL specifies a column of the process template, * which holds the first block of the matrix A. If A is a * column of blocks (ADIST = 'C') and all columns of processes * have a copy of A, then set IACOL = -1. * * ICROW (input) INTEGER * ICROW specifies the current row process which holds * the first block of the matrix C, which is transposed of A. * If C is a row of blocks (ADIST = 'C') and the transposed * row block C is distributed all rows of processes, set * ICROW = -1. * * ICCOL (input) INTEGER * ICCOL specifies the current column process which holds * the first block of the matrix C, which is transposed of A. * If C is a column of blocks (ADIST = 'R') and the transposed * column block C is distributed all columns of processes, * set ICCOL = -1. * * WORK (workspace) COMPLEX*16 array of dimension Size(WORK). * It needs extra working space of A'. * * Parameters Details * ================== * * Lx It is a local portion of L owned by a process, (L is * replaced by M, or N, and x is replaced by either p (=NPROW) * or q (=NPCOL)). The value is determined by L, LB, x, and * MI, where LB is a block size and MI is a row or column * position in a process template. Lx is equal to or less * than Lx0 = CEIL( L, LB*x ) * LB. * * Communication Scheme * ==================== * * The communication scheme of the routine is set to '1-tree', which is * fan-out. (For details, see BLACS user's guide.) * * Memory Requirement of WORK * ========================== * * Mqb = CEIL( M, NB*NPCOL ) * Npb = CEIL( N, NB*NPROW ) * LCMQ = LCM / NPCOL * LCMP = LCM / NPROW * * (1) ADIST = 'C' * (a) IACOL != -1 * Size(WORK) = N * CEIL(Mqb,LCMQ)*NB * (b) IACOL = -1 * Size(WORK) = N * CEIL(Mqb,LCMQ)*NB * MIN(LCMQ,CEIL(M,NB)) * * (2) ADIST = 'R' * (a) IAROW != -1 * Size(WORK) = M * CEIL(Npb,LCMP)*NB * (b) IAROW = -1 * Size(WORK) = M * CEIL(Npb,LCMP)*NB * MIN(LCMP,CEIL(N,NB)) * * Notes * ----- * More precise space can be computed as * * CEIL(Mqb,LCMQ)*NB => NUMROC( NUMROC(M,NB,0,0,NPCOL), NB, 0, 0, LCMQ ) * CEIL(Npb,LCMP)*NB => NUMROC( NUMROC(N,NB,0,0,NPROW), NB, 0, 0, LCMP ) * * ===================================================================== * * .. * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL COLFORM, ROWFORM INTEGER I, IDEX, IGD, INFO, JCCOL, JCROW, JDEX, LCM, $ LCMP, LCMQ, MCCOL, MCROW, ML, MP, MQ, MQ0, $ MRCOL, MRROW, MYCOL, MYROW, NP, NP0, NPCOL, $ NPROW, NQ COMPLEX*16 TBETA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, ICEIL, NUMROC EXTERNAL ILCM, ICEIL, LSAME, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PBZMATADD, PBZTR2AF, PBZTR2AT, $ PBZTR2BT, PBZTRGET, PBZTRSRT, PXERBLA, ZGEBR2D, $ ZGEBS2D, ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible. * IF( M.EQ.0 .OR. N.EQ.0 ) RETURN * CALL BLACS_GRIDINFO( ICONTXT, NPROW, NPCOL, MYROW, MYCOL ) * COLFORM = LSAME( ADIST, 'C' ) ROWFORM = LSAME( ADIST, 'R' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.COLFORM ) .AND. ( .NOT.ROWFORM ) ) THEN INFO = 2 ELSE IF( M .LT.0 ) THEN INFO = 4 ELSE IF( N .LT.0 ) THEN INFO = 5 ELSE IF( NB.LT.1 ) THEN INFO = 6 ELSE IF( IAROW.LT.-1 .OR. IAROW.GE.NPROW .OR. $ ( IAROW.EQ.-1 .AND. COLFORM ) ) THEN INFO = 12 ELSE IF( IACOL.LT.-1 .OR. IACOL.GE.NPCOL .OR. $ ( IACOL.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 13 ELSE IF( ICROW.LT.-1 .OR. ICROW.GE.NPROW .OR. $ ( ICROW.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 14 ELSE IF( ICCOL.LT.-1 .OR. ICCOL.GE.NPCOL .OR. $ ( ICCOL.EQ.-1 .AND. COLFORM ) ) THEN INFO = 15 END IF * 10 CONTINUE IF( INFO .NE. 0 ) THEN CALL PXERBLA( ICONTXT, 'PBZTRAN ', INFO ) RETURN END IF * * Start the operations. * * LCM : the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LCMQ = LCM / NPCOL IGD = NPCOL / LCMP * * When A is a column block * IF( COLFORM ) THEN * * Form C <== A' ( A is a column block ) * _ * | | * | | * _____________ | | * |______C______| <== |A| * | | * | | * |_| * * MRROW : row relative position in template from IAROW * MRCOL : column relative position in template from ICCOL * MRROW = MOD( NPROW+MYROW-IAROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-ICCOL, NPCOL ) JCROW = ICROW IF( ICROW.EQ.-1 ) JCROW = IAROW * MP = NUMROC( M, NB, MYROW, IAROW, NPROW ) MQ = NUMROC( M, NB, MYCOL, ICCOL, NPCOL ) MQ0 = NUMROC( NUMROC(M, NB, 0, 0, NPCOL), NB, 0, 0, LCMQ ) * IF( LDA.LT.MP .AND. $ ( IACOL.EQ.MYCOL .OR. IACOL.EQ.-1 ) ) THEN INFO = 8 ELSE IF( LDC.LT.N .AND. $ ( ICROW.EQ.MYROW .OR. ICROW.EQ.-1 ) ) THEN INFO = 11 END IF IF( INFO.NE.0 ) GO TO 10 * * When a column process of IACOL has a column block A, * IF( IACOL.GE.0 ) THEN TBETA = ZERO IF( MYROW.EQ.JCROW ) TBETA = BETA * DO 20 I = 0, MIN( LCM, ICEIL(M,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IAROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + ICCOL, NPCOL ) IF( LCMQ.EQ.1 ) MQ0 = NUMROC( M, NB, I, 0, NPCOL ) JDEX = (I/NPCOL) * NB * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.IACOL ) THEN * * The source node is a destination node * IDEX = (I/NPROW) * NB IF( MYROW.EQ.JCROW .AND. MYCOL.EQ.MCCOL ) THEN CALL PBZTR2AT( ICONTXT, 'Col', TRANS, MP-IDEX, N, NB, $ A(IDEX+1,1), LDA, TBETA, C(1,JDEX+1), $ LDC, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBZTR2BT( ICONTXT, 'Col', TRANS, MP-IDEX, N, NB, $ A(IDEX+1,1), LDA, ZERO, WORK, N, $ LCMP*NB ) CALL ZGESD2D( ICONTXT, N, MQ0, WORK, N, JCROW, MCCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.JCROW .AND. MYCOL.EQ.MCCOL ) THEN IF( LCMQ.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL ZGERV2D( ICONTXT, N, MQ0, C, LDC, MCROW, IACOL ) ELSE CALL ZGERV2D( ICONTXT, N, MQ0, WORK, N, MCROW, IACOL ) CALL PBZTR2AF( ICONTXT, 'Row', N, MQ-JDEX, NB, WORK, N, $ TBETA, C(1,JDEX+1), LDC, LCMP, LCMQ, $ MQ0 ) END IF END IF 20 CONTINUE * * Broadcast a row block of C in each column of template * IF( ICROW.EQ.-1 ) THEN IF( MYROW.EQ.JCROW ) THEN CALL ZGEBS2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC ) ELSE CALL ZGEBR2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC, $ JCROW, MYCOL ) END IF END IF * * When all column procesors have a copy of the column block A, * ELSE IF( LCMQ.EQ.1 ) MQ0 = MQ * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * DO 30 I = 0, LCMP-1 IF( MRCOL.EQ.MOD( NPROW*I+MRROW, NPCOL ) ) THEN IF( LCMQ.EQ.1.AND.(ICROW.EQ.-1.OR.ICROW.EQ.MYROW) ) THEN CALL PBZTR2BT( ICONTXT, 'Col', TRANS, MP-I*NB, N, NB, $ A(I*NB+1,1), LDA, BETA, C, LDC, $ LCMP*NB ) ELSE CALL PBZTR2BT( ICONTXT, 'Col', TRANS, MP-I*NB, N, NB, $ A(I*NB+1,1), LDA, ZERO, WORK, N, $ LCMP*NB ) END IF END IF 30 CONTINUE * * Get diagonal blocks of A for each column of the template * MCROW = MOD( MOD(MRCOL,NPROW)+IAROW, NPROW ) IF( LCMQ.GT.1 ) THEN MCCOL = MOD( NPCOL+MYCOL-ICCOL, NPCOL ) CALL PBZTRGET( ICONTXT, 'Row', N, MQ0, ICEIL(M,NB), WORK, N, $ MCROW, MCCOL, IGD, MYROW, MYCOL, NPROW, $ NPCOL ) END IF * * Broadcast a row block of WORK in every row of template * IF( ICROW.EQ.-1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( LCMQ.GT.1 ) $ CALL PBZTRSRT( ICONTXT, 'Row', N, MQ, NB, WORK, N, BETA, $ C, LDC, LCMP, LCMQ, MQ0 ) CALL ZGEBS2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC ) ELSE CALL ZGEBR2D( ICONTXT, 'Col', '1-tree', N, MQ, C, LDC, $ MCROW, MYCOL ) END IF * * Send a row block of WORK to the destination row * ELSE IF( LCMQ.EQ.1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.ICROW ) $ CALL ZGESD2D( ICONTXT, N, MQ, WORK, N, ICROW, MYCOL ) ELSE IF( MYROW.EQ.ICROW ) THEN IF( BETA.EQ.ZERO ) THEN CALL ZGERV2D( ICONTXT, N, MQ, C, LDC, MCROW, MYCOL ) ELSE CALL ZGERV2D( ICONTXT, N, MQ, WORK, N, MCROW, MYCOL ) CALL PBZMATADD( ICONTXT, 'G', N, MQ, ONE, WORK, N, $ BETA, C, LDC ) END IF END IF * ELSE ML = MQ0 * MIN( LCMQ, MAX(0,ICEIL(M,NB)-MCCOL) ) IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.ICROW ) $ CALL ZGESD2D( ICONTXT, N, ML, WORK, N, ICROW, MYCOL ) ELSE IF( MYROW.EQ.ICROW ) THEN CALL ZGERV2D( ICONTXT, N, ML, WORK, N, MCROW, MYCOL ) END IF * IF( MYROW.EQ.ICROW ) $ CALL PBZTRSRT( ICONTXT, 'Row', N, MQ, NB, WORK, N, BETA, $ C, LDC, LCMP, LCMQ, MQ0 ) END IF END IF * END IF * * When A is a row block * ELSE * * Form C <== A' ( A is a row block ) * _ * | | * | | * | | _____________ * |C| <== |______A______| * | | * | | * |_| * * MRROW : row relative position in template from ICROW * MRCOL : column relative position in template from IACOL * MRROW = MOD( NPROW+MYROW-ICROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IACOL, NPCOL ) JCCOL = ICCOL IF( ICCOL.EQ.-1 ) JCCOL = IACOL * NP = NUMROC( N, NB, MYROW, ICROW, NPROW ) NQ = NUMROC( N, NB, MYCOL, IACOL, NPCOL ) NP0 = NUMROC( NUMROC(N, NB, 0, 0, NPROW), NB, 0, 0, LCMP ) * IF( LDA.LT.M .AND. $ ( IAROW.EQ.MYROW .OR. IAROW.EQ.-1 ) ) THEN INFO = 8 ELSE IF( LDC.LT.NP .AND. $ ( ICCOL.EQ.MYCOL .OR. ICCOL.EQ.-1 ) ) THEN INFO = 11 END IF IF( INFO.NE.0 ) GO TO 10 * * When a row process of IAROW has a row block A, * IF( IAROW.GE.0 ) THEN TBETA = ZERO IF( MYCOL.EQ.JCCOL ) TBETA = BETA * DO 40 I = 0, MIN( LCM, ICEIL(N,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + ICROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IACOL, NPCOL ) IF( LCMP.EQ.1 ) NP0 = NUMROC( N, NB, I, 0, NPROW ) IDEX = (I/NPROW) * NB * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.IAROW .AND. MYCOL.EQ.MCCOL ) THEN * * The source node is a destination node * JDEX = (I/NPCOL) * NB IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JCCOL ) THEN CALL PBZTR2AT( ICONTXT, 'Row', TRANS, M, NQ-JDEX, NB, $ A(1,JDEX+1), LDA, TBETA, C(IDEX+1,1), $ LDC, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBZTR2BT( ICONTXT, 'Row', TRANS, M, NQ-JDEX, NB, $ A(1,JDEX+1), LDA, ZERO, WORK, NP0, $ LCMQ*NB ) CALL ZGESD2D( ICONTXT, NP0, M, WORK, NP0, $ MCROW, JCCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JCCOL ) THEN IF( LCMP.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL ZGERV2D( ICONTXT, NP0, M, C, LDC, IAROW, MCCOL ) ELSE CALL ZGERV2D( ICONTXT, NP0, M, WORK, NP0, IAROW, MCCOL ) CALL PBZTR2AF( ICONTXT, 'Col', NP-IDEX, M, NB, WORK, $ NP0, TBETA, C(IDEX+1,1), LDC, LCMP, LCMQ, $ NP0 ) END IF END IF 40 CONTINUE * * Broadcast a column block of WORK in each row of template * IF( ICCOL.EQ.-1 ) THEN IF( MYCOL.EQ.JCCOL ) THEN CALL ZGEBS2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC ) ELSE CALL ZGEBR2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC, $ MYROW, JCCOL ) END IF END IF * * When all row procesors have a copy of the row block A, * ELSE IF( LCMP.EQ.1 ) NP0 = NP * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * DO 50 I = 0, LCMQ-1 IF( MRROW.EQ.MOD(NPCOL*I+MRCOL, NPROW) ) THEN IF( LCMP.EQ.1.AND.(ICCOL.EQ.-1.OR.ICCOL.EQ.MYCOL) ) THEN CALL PBZTR2BT( ICONTXT, 'Row', TRANS, M, NQ-I*NB, NB, $ A(1,I*NB+1), LDA, BETA, C, LDC, $ LCMQ*NB ) ELSE CALL PBZTR2BT( ICONTXT, 'Row', TRANS, M, NQ-I*NB, NB, $ A(1,I*NB+1), LDA, ZERO, WORK, NP0, $ LCMQ*NB ) END IF END IF 50 CONTINUE * * Get diagonal blocks of A for each row of the template * MCCOL = MOD( MOD(MRROW, NPCOL)+IACOL, NPCOL ) IF( LCMP.GT.1 ) THEN MCROW = MOD( NPROW+MYROW-ICROW, NPROW ) CALL PBZTRGET( ICONTXT, 'Col', NP0, M, ICEIL(N,NB), WORK, $ NP0, MCROW, MCCOL, IGD, MYROW, MYCOL, NPROW, $ NPCOL ) END IF * * Broadcast a column block of WORK in every column of template * IF( ICCOL.EQ.-1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( LCMP.GT.1 ) $ CALL PBZTRSRT( ICONTXT, 'Col', NP, M, NB, WORK, NP0, $ BETA, C, LDC, LCMP, LCMQ, NP0 ) CALL ZGEBS2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC ) ELSE CALL ZGEBR2D( ICONTXT, 'Row', '1-tree', NP, M, C, LDC, $ MYROW, MCCOL ) END IF * * Send a column block of WORK to the destination column * ELSE IF( LCMP.EQ.1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.ICCOL ) $ CALL ZGESD2D( ICONTXT, NP, M, WORK, NP, MYROW, ICCOL ) ELSE IF( MYCOL.EQ.ICCOL ) THEN IF( BETA.EQ.ZERO ) THEN CALL ZGERV2D( ICONTXT, NP, M, C, LDC, MYROW, MCCOL ) ELSE CALL ZGERV2D( ICONTXT, NP, M, WORK, NP, MYROW, MCCOL ) CALL PBZMATADD( ICONTXT, 'G', NP, M, ONE, WORK, NP, $ BETA, C, LDC ) END IF END IF * ELSE ML = M * MIN( LCMP, MAX( 0, ICEIL(N,NB) - MCROW ) ) IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.ICCOL ) $ CALL ZGESD2D( ICONTXT, NP0, ML, WORK, NP0, $ MYROW, ICCOL ) ELSE IF( MYCOL.EQ.ICCOL ) THEN CALL ZGERV2D( ICONTXT, NP0, ML, WORK, NP0, $ MYROW, MCCOL ) END IF * IF( MYCOL.EQ.ICCOL ) $ CALL PBZTRSRT( ICONTXT, 'Col', NP, M, NB, WORK, NP0, $ BETA, C, LDC, LCMP, LCMQ, NP0 ) END IF END IF * END IF END IF * RETURN * * End of PBZTRAN * END * *======================================================================= * SUBROUTINE PBZTR2AT *======================================================================= * SUBROUTINE PBZTR2AT( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, $ BETA, B, LDB, LCMP, LCMQ ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER ICONTXT, LCMP, LCMQ, LDA, LDB, M, N, NB COMPLEX*16 BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBZTR2AT forms B <== A^T + beta*B, or A^C + beta*B * B is a ((conjugate) transposed) scattered block row (or column), * copied from a scattered block column (or row) of A * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER IA, IB, K, INTV, JNTV * .. * .. External Subroutines .. EXTERNAL PBZMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Excutable Statements .. * IF( LCMP.EQ.LCMQ ) THEN CALL PBZMATADD( ICONTXT, TRANS, N, M, ONE, A, LDA, BETA, B, $ LDB ) * ELSE * * If A is a column block ( ADIST = 'C' ), * IF( LSAME( ADIST, 'C' ) ) THEN INTV = LCMP * NB JNTV = LCMQ * NB IA = 1 IB = 1 DO 10 K = 1, ICEIL( M, INTV ) CALL PBZMATADD( ICONTXT, TRANS, N, MIN( M-IA+1, NB ), $ ONE, A(IA,1), LDA, BETA, B(1,IB), LDB ) IA = IA + INTV IB = IB + JNTV 10 CONTINUE * * If A is a row block ( ADIST = 'R' ), * ELSE INTV = LCMP * NB JNTV = LCMQ * NB IA = 1 IB = 1 DO 20 K = 1, ICEIL( N, JNTV ) CALL PBZMATADD( ICONTXT, TRANS, MIN( N-IA+1, NB ), M, $ ONE, A(1,IA), LDA, BETA, B(IB,1), LDB ) IA = IA + JNTV IB = IB + INTV 20 CONTINUE END IF END IF * RETURN * * End of PBZTR2AT * END * *======================================================================= * SUBROUTINE PBZTR2BT *======================================================================= * SUBROUTINE PBZTR2BT( ICONTXT, ADIST, TRANS, M, N, NB, A, LDA, $ BETA, B, LDB, INTV ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST, TRANS INTEGER ICONTXT, INTV, LDA, LDB, M, N, NB COMPLEX*16 BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBZTR2BT forms T <== A^T + beta*T or A^C + beta*T, where T is a * ((conjugate) transposed) condensed block row (or column), copied from * a scattered block column (or row) of A * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER IA, IB, K * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. External Subroutines .. EXTERNAL PBZMATADD * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Excutable Statements .. * IF( INTV.EQ.NB ) THEN CALL PBZMATADD( ICONTXT, TRANS, N, M, ONE, A, LDA, BETA, B, $ LDB ) * ELSE * * If A is a column block ( ADIST = 'C' ), * IF( LSAME( ADIST, 'C' ) ) THEN IA = 1 IB = 1 DO 10 K = 1, ICEIL( M, INTV ) CALL PBZMATADD( ICONTXT, TRANS, N, MIN( M-IA+1, NB ), $ ONE, A(IA,1), LDA, BETA, B(1,IB), LDB ) IA = IA + INTV IB = IB + NB 10 CONTINUE * * If A is a row block (ADIST = 'R'), * ELSE IA = 1 IB = 1 DO 20 K = 1, ICEIL( N, INTV ) CALL PBZMATADD( ICONTXT, TRANS, MIN( N-IA+1, NB ), M, $ ONE, A(1,IA), LDA, BETA, B(IB,1), LDB ) IA = IA + INTV IB = IB + NB 20 CONTINUE END IF END IF * RETURN * * End of PBZTR2BT * END * *======================================================================= * SUBROUTINE PBZTR2AF *======================================================================= * SUBROUTINE PBZTR2AF( ICONTXT, ADIST, M, N, NB, A, LDA, BETA, B, $ LDB, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, M, N, NB, LDA, LDB, LCMP, LCMQ, NINT COMPLEX*16 BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBZTR2AF forms T <== A + BETA*T, where T is a scattered block * row (or column) copied from a (condensed) block column (or row) of A * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER JA, JB, K, INTV * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL LSAME, ICEIL * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. Executable Statements .. * IF( LSAME( ADIST, 'R' ) ) THEN INTV = NB * LCMQ JA = 1 JB = 1 DO 10 K = 1, ICEIL( NINT, NB ) CALL PBZMATADD( ICONTXT, 'G', M, MIN( N-JB+1, NB ), ONE, $ A(1,JA), LDA, BETA, B(1,JB), LDB ) JA = JA + NB JB = JB + INTV 10 CONTINUE * * if( LSAME( ADIST, 'C' ) ) then * ELSE INTV = NB * LCMP JA = 1 JB = 1 DO 20 K = 1, ICEIL( NINT, NB ) CALL PBZMATADD( ICONTXT, 'G', MIN( M-JB+1, NB ), N, ONE, $ A(JA,1), LDA, BETA, B(JB,1), LDB ) JA = JA + NB JB = JB + INTV 20 CONTINUE END IF * RETURN * * End of PBZTR2AF * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbztrget.f000644 000766 000024 00000010033 10363532303 020147 0ustar00juliestaff000000 000000 SUBROUTINE PBZTRGET( ICONTXT, ADIST, M, N, MNB, A, LDA, MCROW, $ MCCOL, IGD, MYROW, MYCOL, NPROW, NPCOL ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, IGD, LDA, M, MCCOL, MCROW, MNB, MYCOL, $ MYROW, N, NPCOL, NPROW * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * PBZTRGET forms a row block of A from scattered row subblocks if * ADIST = 'R', or forms a column block of A from scattered column * subblocks, if ADIST = 'C'. * * ===================================================================== * * .. Parameters .. REAL ONE, TWO PARAMETER ( ONE = 1.0E+0, TWO = 2.0E+0 ) * .. * .. Local Variables .. INTEGER KINT, KINT2, KLEN, KMOD, KPPOS, NLEN, NNUM, $ NTLEN REAL TEMP * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC EXTERNAL LSAME, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * * if A is a row block, it needs to communicate columnwise. * IF( LSAME( ADIST, 'R' ) ) THEN KPPOS = MOD( NPROW+MYROW-MCROW, NPROW ) IF( MOD( KPPOS, IGD ).EQ.0 ) THEN KINT = IGD NLEN = N NNUM = MIN( NPROW/IGD, MNB-MCCOL ) TEMP = REAL( NNUM ) NTLEN = N * NNUM NNUM = IGD * NNUM IF( KPPOS.GE.NNUM ) GO TO 30 KPPOS = MOD( KPPOS, NPROW ) * 10 CONTINUE IF( TEMP.GT.ONE ) THEN KINT2 = 2 * KINT KMOD = MOD( KPPOS, KINT2 ) * IF( KMOD.EQ.0 ) THEN IF( KPPOS+KINT.LT.NNUM ) THEN KLEN = NTLEN - (KPPOS/KINT2)*(KINT2/IGD)*N KLEN = MIN( KLEN-NLEN, NLEN ) CALL ZGERV2D( ICONTXT, M, KLEN, A(1,NLEN+1), LDA, $ MOD(MYROW+KINT, NPROW), MYCOL ) NLEN = NLEN + KLEN END IF ELSE CALL ZGESD2D( ICONTXT, M, NLEN, A, LDA, $ MOD(NPROW+MYROW-KINT, NPROW), MYCOL ) GO TO 30 END IF * KINT = KINT2 TEMP = TEMP / TWO GO TO 10 END IF END IF * * if A is a column block, it needs to communicate rowwise. * ELSE IF( LSAME( ADIST, 'C' ) ) THEN * KPPOS = MOD( NPCOL+MYCOL-MCCOL, NPCOL ) IF( MOD( KPPOS, IGD ).EQ.0 ) THEN KINT = IGD NLEN = N NNUM = MIN( NPCOL/IGD, MNB-MCROW ) TEMP = REAL( NNUM ) NTLEN = N * NNUM NNUM = IGD * NNUM IF( KPPOS.GE.NNUM ) GO TO 30 KPPOS = MOD( KPPOS, NPCOL ) * 20 CONTINUE IF( TEMP.GT.ONE ) THEN KINT2 = 2 * KINT KMOD = MOD( KPPOS, KINT2 ) * IF( KMOD.EQ.0 ) THEN IF( KPPOS+KINT.LT.NNUM ) THEN KLEN = NTLEN - (KPPOS/KINT2)*(KINT2/IGD)*N KLEN = MIN( KLEN-NLEN, NLEN ) CALL ZGERV2D( ICONTXT, M, KLEN, A(1,NLEN+1), LDA, $ MYROW, MOD(MYCOL+KINT, NPCOL) ) NLEN = NLEN + KLEN END IF ELSE CALL ZGESD2D( ICONTXT, M, NLEN, A, LDA, MYROW, $ MOD(NPCOL+MYCOL-KINT, NPCOL) ) GO TO 30 END IF * KINT = KINT2 TEMP = TEMP / TWO GO TO 20 END IF END IF END IF * 30 CONTINUE * RETURN * * End of PBZTRGET * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbztrnv.f000644 000766 000024 00000064764 10363532303 020037 0ustar00juliestaff000000 000000 SUBROUTINE PBZTRNV( ICONTXT, XDIST, TRANS, N, NB, NZ, X, INCX, $ BETA, Y, INCY, IXROW, IXCOL, IYROW, IYCOL, $ WORK ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * Jaeyoung Choi, Oak Ridge National Laboratory * Jack Dongarra, University of Tennessee and Oak Ridge National Lab. * David Walker, Oak Ridge National Laboratory * * .. Scalar Arguments .. CHARACTER*1 TRANS, XDIST INTEGER ICONTXT, INCX, INCY, IXCOL, IXROW, IYCOL, $ IYROW, N, NB, NZ COMPLEX*16 BETA * .. * .. Array Arguments .. COMPLEX*16 WORK( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * PBZTRNV transposes a column vector to row vector, or a row vector to * column vector by reallocating data distribution. * * Y := X' * * where X and Y are N vectors. * * Parameters * ========== * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * XDIST (input) CHARACTER*1 * XDIST specifies whether X is a column vector or a row vector, * * XDIST = 'C', X is a column vector (distributed columnwise) * XDIST = 'R', X is a row vector (distributed rowwise) * * TRANS (input) CHARACTER*1 * TRANS specifies whether the transposed format is transpose * or conjugate transpose. If the vectors X and Y are real, * the argument is ignored. * * TRANS = 'T', transpose * TRANS = 'C', conjugate transpose * * N (input) INTEGER * N specifies the (global) number of the vector X and the * vector Y. N >= 0. * * NB (input) INTEGER * NB specifies the block size of vectors X and Y. NB >= 0. * * NZ (input) INTEGER * NZ is the column offset to specify the column distance from * the beginning of the block to the first element of the * vector X, and the row offset to the first element of the * vector Y if XDIST = 'C'. * Otherwise, it is row offset to specify the row distance * from the beginning of the block to the first element of the * vector X, and the column offset to the first element of the * vector Y. 0 < NZ <= NB. * * X (input) COMPLEX*16 array of dimension at least * ( 1 + (Np-1) * abs(INCX)) in IXCOL if XDIST = 'C', or * ( 1 + (Nq-1) * abs(INCX)) in IXROW if XDIST = 'R'. * The incremented array X must contain the vector X. * * INCX (input) INTEGER * INCX specifies the increment for the elements of X. * INCX <> 0. * * BETA (input) COMPLEX*16 * BETA specifies scaler beta. * * Y (input/output) COMPLEX*16 array of dimension at least * ( 1 + (Nq-1) * abs(INCY)) in IYROW if XDIST = 'C', or * ( 1 + (Np-1) * abs(INCY)) in IYCOL if XDIST = 'R', or * The incremented array Y must contain the vector Y. * Y will not be referenced if beta is zero. * * INCY (input) INTEGER * INCY specifies the increment for the elements of Y. * INCY <> 0. * * IXROW (input) INTEGER * IXROW specifies a row of the process template, which holds * the first element of the vector X. If X is a row vector and * all rows of processes have a copy of X, then set IXROW = -1. * * IXCOL (input) INTEGER * IXCOL specifies a column of the process template, * which holds the first element of the vector X. If X is a * column block and all columns of processes have a copy of X, * then set IXCOL = -1. * * IYROW (input) INTEGER * IYROW specifies the current row process which holds the * first element of the vector Y, which is transposed of X. * If X is a column vector and the transposed row vector Y is * distributed all rows of processes, set IYROW = -1. * * IYCOL (input) INTEGER * IYCOL specifies the current column process which holds * the first element of the vector Y, which is transposed of Y. * If X is a row block and the transposed column vector Y is * distributed all columns of processes, set IYCOL = -1. * * WORK (workspace) COMPLEX*16 array of dimension Size(WORK). * It needs extra working space of x**T or x**H. * * Parameters Details * ================== * * Nx It is a local portion of N owned by a process, where x is * replaced by either p (=NPROW) or q (=NPCOL)). The value is * determined by N, NB, NZ, x, and MI, where NB is a block size, * NZ is a offset from the beginning of the block, and MI is a * row or column position in a process template. Nx is equal * to or less than Nx0 = CEIL( N+NZ, NB*x ) * NB. * * Communication Scheme * ==================== * * The communication scheme of the routine is set to '1-tree', which is * fan-out. (For details, see BLACS user's guide.) * * Memory Requirement of WORK * ========================== * * NN = N + NZ * Npb = CEIL( NN, NB*NPROW ) * Nqb = CEIL( NN, NB*NPCOL ) * LCMP = LCM / NPROW * LCMQ = LCM / NPCOL * * (1) XDIST = 'C' * (a) IXCOL != -1 * Size(WORK) = CEIL(Nqb,LCMQ)*NB * (b) IXCOL = -1 * Size(WORK) = CEIL(Nqb,LCMQ)*NB * MIN(LCMQ,CEIL(NN,NB)) * * (2) XDIST = 'R' * (a) IXROW != -1 * Size(WORK) = CEIL(Npb,LCMP)*NB * (b) IXROW = -1 * Size(WORK) = CEIL(Npb,LCMP)*NB * MIN(LCMP,CEIL(NN,NB)) * * Notes * ----- * More precise space can be computed as * * CEIL(Npb,LCMP)*NB => NUMROC( NUMROC(NN,NB,0,0,NPROW), NB, 0, 0, LCMP) * CEIL(Nqb,LCMQ)*NB => NUMROC( NUMROC(NN,NB,0,0,NPCOL), NB, 0, 0, LCMQ) * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL COLFORM, ROWFORM INTEGER I, IDEX, IGD, INFO, JDEX, JYCOL, JYROW, JZ, KZ, $ LCM, LCMP, LCMQ, MCCOL, MCROW, MRCOL, MRROW, $ MYCOL, MYROW, NN, NP, NP0, NP1, NPCOL, NPROW, $ NQ, NQ0, NQ1 COMPLEX*16 TBETA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILCM, ICEIL, NUMROC EXTERNAL LSAME, ILCM, ICEIL, NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, PBZTR2A1, PBZTR2B1, PBZTRGET, $ PBZTRST1, PBZVECADD, PXERBLA, ZGEBR2D, ZGEBS2D, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * * Quick return if possible. * IF( N.EQ.0 ) RETURN * CALL BLACS_GRIDINFO( ICONTXT, NPROW, NPCOL, MYROW, MYCOL ) * COLFORM = LSAME( XDIST, 'C' ) ROWFORM = LSAME( XDIST, 'R' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.COLFORM ) .AND. ( .NOT.ROWFORM ) ) THEN INFO = 2 ELSE IF( N .LT.0 ) THEN INFO = 4 ELSE IF( NB .LT.1 ) THEN INFO = 5 ELSE IF( NZ .LT.0 .OR. NZ.GE.NB ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 ELSE IF( IXROW.LT.-1 .OR. IXROW.GE.NPROW .OR. $ ( IXROW.EQ.-1 .AND. COLFORM ) ) THEN INFO = 12 ELSE IF( IXCOL.LT.-1 .OR. IXCOL.GE.NPCOL .OR. $ ( IXCOL.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 13 ELSE IF( IYROW.LT.-1 .OR. IYROW.GE.NPROW .OR. $ ( IYROW.EQ.-1 .AND. ROWFORM ) ) THEN INFO = 14 ELSE IF( IYCOL.LT.-1 .OR. IYCOL.GE.NPCOL .OR. $ ( IYCOL.EQ.-1 .AND. COLFORM ) ) THEN INFO = 15 END IF * 10 CONTINUE IF( INFO.NE.0 ) THEN CALL PXERBLA( ICONTXT, 'PBZTRNV ', INFO ) RETURN END IF * * Start the operations. * * LCM : the least common multiple of NPROW and NPCOL * LCM = ILCM( NPROW, NPCOL ) LCMP = LCM / NPROW LCMQ = LCM / NPCOL IGD = NPCOL / LCMP NN = N + NZ * * When x is a column vector * IF( COLFORM ) THEN * * Form y <== x' ( x is a column vector ) * * || * || * _____________ || * -----(y)----- <== (x) * || * || * || * IF( IXROW.LT.0 .OR. IXROW.GE.NPROW ) THEN INFO = 12 ELSE IF( IXCOL.LT.-1 .OR. IXCOL.GE.NPCOL ) THEN INFO = 13 ELSE IF( IYROW.LT.-1 .OR. IYROW.GE.NPROW ) THEN INFO = 14 ELSE IF( IYCOL.LT.0 .OR. IYCOL.GE.NPCOL ) THEN INFO = 15 END IF IF( INFO.NE.0 ) GO TO 10 * * MRROW : row relative position in template from IXROW * MRCOL : column relative position in template from IYCOL * MRROW = MOD( NPROW+MYROW-IXROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IYCOL, NPCOL ) JYROW = IYROW IF( IYROW.EQ.-1 ) JYROW = IXROW * NP = NUMROC( NN, NB, MYROW, IXROW, NPROW ) IF( MRROW.EQ.0 ) NP = NP - NZ NQ = NUMROC( NN, NB, MYCOL, IYCOL, NPCOL ) IF( MRCOL.EQ.0 ) NQ = NQ - NZ NQ0 = NUMROC( NUMROC(NN, NB, 0, 0, NPCOL), NB, 0, 0, LCMQ ) * * When a column process of IXCOL has a column block A, * IF( IXCOL .GE. 0 ) THEN TBETA = ZERO IF( MYROW.EQ.JYROW ) TBETA = BETA KZ = NZ * DO 20 I = 0, MIN( LCM, ICEIL(NN,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IXROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IYCOL, NPCOL ) IF( LCMQ.EQ.1 ) NQ0 = NUMROC( NN, NB, I, 0, NPCOL ) JDEX = (I/NPCOL) * NB IF( MRCOL.EQ.0 ) JDEX = MAX(0, JDEX-NZ) * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.IXCOL ) THEN * * The source node is a destination node * IDEX = (I/NPROW) * NB IF( MRROW.EQ.0 ) IDEX = MAX( 0, IDEX-NZ ) IF( MYROW.EQ.JYROW .AND. MYCOL.EQ.MCCOL ) THEN CALL PBZTR2B1( ICONTXT, TRANS, NP-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, TBETA, $ Y(JDEX*INCY+1), INCY, LCMP, LCMQ ) * * The source node sends blocks to a destination node * ELSE CALL PBZTR2B1( ICONTXT, TRANS, NP-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMP, 1 ) CALL ZGESD2D( ICONTXT, 1, NQ0-KZ, WORK, 1, $ JYROW, MCCOL ) END IF * * A destination node receives the copied vector * ELSE IF( MYROW.EQ.JYROW .AND. MYCOL.EQ.MCCOL ) THEN IF( LCMQ.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL ZGERV2D( ICONTXT, 1, NQ0-KZ, Y, INCY, $ MCROW, IXCOL ) ELSE CALL ZGERV2D( ICONTXT, 1, NQ0-KZ, WORK, 1, $ MCROW, IXCOL ) CALL PBZTR2A1( ICONTXT, NQ-JDEX, NB, KZ, WORK, 1, TBETA, $ Y(JDEX*INCY+1), INCY, LCMQ*NB ) END IF END IF KZ = 0 20 CONTINUE * * Broadcast a row block of WORK in each column of template * IF( IYROW.EQ.-1 ) THEN IF( MYROW.EQ.JYROW ) THEN CALL ZGEBS2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY ) ELSE CALL ZGEBR2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY, $ JYROW, MYCOL ) END IF END IF * * When all column procesors have a copy of the column block A, * ELSE IF( LCMQ.EQ.1 ) NQ0 = NQ * * Processors, which have diagonal blocks of X, copy them to * WORK array in transposed form * KZ = 0 IF( MRROW.EQ.0 ) KZ = NZ JZ = 0 IF( MRROW.EQ.0 .AND. MYCOL.EQ.IYCOL ) JZ = NZ * DO 30 I = 0, LCMP - 1 IF( MRCOL.EQ.MOD(NPROW*I+MRROW, NPCOL) ) THEN IDEX = MAX( 0, I*NB-KZ ) IF( LCMQ.EQ.1 .AND. (IYROW.EQ.-1.OR.IYROW.EQ.MYROW) ) THEN CALL PBZTR2B1( ICONTXT, TRANS, NP-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, BETA, Y, INCY, $ LCMP, 1 ) ELSE CALL PBZTR2B1( ICONTXT, TRANS, NP-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMP, 1 ) END IF END IF 30 CONTINUE * * Get diagonal blocks of A for each column of the template * MCROW = MOD( MOD(MRCOL, NPROW) + IXROW, NPROW ) IF( LCMQ.GT.1 ) THEN MCCOL = MOD( NPCOL+MYCOL-IYCOL, NPCOL ) CALL PBZTRGET( ICONTXT, 'Row', 1, NQ0, ICEIL( NN, NB ), $ WORK, 1, MCROW, MCCOL, IGD, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * * Broadcast a row block of WORK in every row of template * IF( IYROW.EQ.-1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( LCMQ.GT.1 ) THEN KZ = 0 IF( MYCOL.EQ.IYCOL ) KZ = NZ CALL PBZTRST1( ICONTXT, 'Row', NQ, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NQ0 ) END IF CALL ZGEBS2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY ) ELSE CALL ZGEBR2D( ICONTXT, 'Col', '1-tree', 1, NQ, Y, INCY, $ MCROW, MYCOL ) END IF * * Send a row block of WORK to the destination row * ELSE IF( LCMQ.EQ.1 ) THEN IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.IYROW ) $ CALL ZGESD2D( ICONTXT, 1, NQ0, WORK, 1, IYROW, MYCOL ) ELSE IF( MYROW.EQ.IYROW ) THEN IF( BETA.EQ.ZERO ) THEN CALL ZGERV2D( ICONTXT, 1, NQ0, Y, INCY, MCROW, MYCOL ) ELSE CALL ZGERV2D( ICONTXT, 1, NQ0, WORK, 1, MCROW, MYCOL ) CALL PBZVECADD( ICONTXT, 'G', NQ0, ONE, WORK, 1, $ BETA, Y, INCY ) END IF END IF * ELSE NQ1 = NQ0 * MIN( LCMQ, MAX( 0, ICEIL(NN,NB)-MCCOL ) ) IF( MYROW.EQ.MCROW ) THEN IF( MYROW.NE.IYROW ) $ CALL ZGESD2D( ICONTXT, 1, NQ1, WORK, 1, IYROW, MYCOL ) ELSE IF( MYROW.EQ.IYROW ) THEN CALL ZGERV2D( ICONTXT, 1, NQ1, WORK, 1, MCROW, MYCOL ) END IF * IF( MYROW.EQ.IYROW ) THEN KZ = 0 IF( MYCOL.EQ.IYCOL ) KZ = NZ CALL PBZTRST1( ICONTXT, 'Row', NQ, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NQ0 ) END IF END IF END IF END IF * * When x is a row vector * ELSE * * Form y <== x' ( x is a row block ) * * || * || * || _____________ * (y) <== -----(x)----- * || * || * || * IF( IXROW.LT.-1 .OR. IXROW.GE.NPROW ) THEN INFO = 12 ELSE IF( IXCOL.LT.0 .OR. IXCOL.GE.NPCOL ) THEN INFO = 13 ELSE IF( IYROW.LT.0 .OR. IYROW.GE.NPROW ) THEN INFO = 14 ELSE IF( IYCOL.LT.-1 .OR. IYCOL.GE.NPCOL ) THEN INFO = 15 END IF IF( INFO.NE.0 ) GO TO 10 * * MRROW : row relative position in template from IYROW * MRCOL : column relative position in template from IXCOL * MRROW = MOD( NPROW+MYROW-IYROW, NPROW ) MRCOL = MOD( NPCOL+MYCOL-IXCOL, NPCOL ) JYCOL = IYCOL IF( IYCOL.EQ.-1 ) JYCOL = IXCOL * NP = NUMROC( NN, NB, MYROW, IYROW, NPROW ) IF( MRROW.EQ.0 ) NP = NP - NZ NQ = NUMROC( NN, NB, MYCOL, IXCOL, NPCOL ) IF( MRCOL.EQ.0 ) NQ = NQ - NZ NP0 = NUMROC( NUMROC(NN, NB, 0, 0, NPROW), NB, 0, 0, LCMP ) * * When a row process of IXROW has a row block A, * IF( IXROW .GE. 0 ) THEN TBETA = ZERO IF( MYCOL.EQ.JYCOL ) TBETA = BETA KZ = NZ * DO 40 I = 0, MIN( LCM, ICEIL(NN,NB) ) - 1 MCROW = MOD( MOD(I, NPROW) + IYROW, NPROW ) MCCOL = MOD( MOD(I, NPCOL) + IXCOL, NPCOL ) IF( LCMP.EQ.1 ) NP0 = NUMROC( NN, NB, I, 0, NPROW ) JDEX = (I/NPROW) * NB IF( MRROW.EQ.0 ) JDEX = MAX(0, JDEX-NZ) * * A source node copies the blocks to WORK, and send it * IF( MYROW.EQ.IXROW .AND. MYCOL.EQ.MCCOL ) THEN * * The source node is a destination node * IDEX = (I/NPCOL) * NB IF( MRCOL.EQ.0 ) IDEX = MAX( 0, IDEX-NZ ) IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JYCOL ) THEN CALL PBZTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, TBETA, $ Y(JDEX*INCY+1), INCY, LCMQ, LCMP ) * * The source node sends blocks to a destination node * ELSE CALL PBZTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, KZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMQ, 1 ) CALL ZGESD2D( ICONTXT, 1, NP0-KZ, WORK, 1, $ MCROW, JYCOL ) END IF * * A destination node receives the copied blocks * ELSE IF( MYROW.EQ.MCROW .AND. MYCOL.EQ.JYCOL ) THEN IF( LCMP.EQ.1 .AND. TBETA.EQ.ZERO ) THEN CALL ZGERV2D( ICONTXT, 1, NP0-KZ, Y, INCY, $ IXROW, MCCOL ) ELSE CALL ZGERV2D( ICONTXT, 1, NP0-KZ, WORK, 1, $ IXROW, MCCOL ) CALL PBZTR2A1( ICONTXT, NP-JDEX, NB, KZ, WORK, 1, TBETA, $ Y(JDEX*INCY+1), INCY, LCMP*NB ) END IF END IF KZ = 0 40 CONTINUE * * Broadcast a column vector Y in each row of template * IF( IYCOL.EQ.-1 ) THEN IF( MYCOL.EQ.JYCOL ) THEN CALL ZGEBS2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY ) ELSE CALL ZGEBR2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY, $ MYROW, JYCOL ) END IF END IF * * When all row procesors have a copy of the row block A, * ELSE IF( LCMP.EQ.1 ) NP0 = NP * * Processors, which have diagonal blocks of A, copy them to * WORK array in transposed form * KZ = 0 IF( MRCOL.EQ.0 ) KZ = NZ JZ = 0 IF( MRCOL.EQ.0 .AND. MYROW.EQ.IYROW ) JZ = NZ * DO 50 I = 0, LCMQ-1 IF( MRROW.EQ.MOD(NPCOL*I+MRCOL, NPROW) ) THEN IDEX = MAX( 0, I*NB-KZ ) IF( LCMP.EQ.1 .AND. (IYCOL.EQ.-1.OR.IYCOL.EQ.MYCOL) ) THEN CALL PBZTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, BETA, Y, INCY, $ LCMQ, 1 ) ELSE CALL PBZTR2B1( ICONTXT, TRANS, NQ-IDEX, NB, JZ, $ X(IDEX*INCX+1), INCX, ZERO, WORK, 1, $ LCMQ, 1 ) END IF END IF 50 CONTINUE * * Get diagonal blocks of A for each row of the template * MCCOL = MOD( MOD(MRROW, NPCOL) + IXCOL, NPCOL ) IF( LCMP.GT.1 ) THEN MCROW = MOD( NPROW+MYROW-IYROW, NPROW ) CALL PBZTRGET( ICONTXT, 'Col', 1, NP0, ICEIL( NN, NB ), $ WORK, 1, MCROW, MCCOL, IGD, MYROW, MYCOL, $ NPROW, NPCOL ) END IF * * Broadcast a column block of WORK in every column of template * IF( IYCOL.EQ.-1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( LCMP.GT.1 ) THEN KZ = 0 IF( MYROW.EQ.IYROW ) KZ = NZ CALL PBZTRST1( ICONTXT, 'Col', NP, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NP0 ) END IF CALL ZGEBS2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY ) ELSE CALL ZGEBR2D( ICONTXT, 'Row', '1-tree', 1, NP, Y, INCY, $ MYROW, MCCOL ) END IF * * Send a column block of WORK to the destination column * ELSE IF( LCMP.EQ.1 ) THEN IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.IYCOL ) $ CALL ZGESD2D( ICONTXT, 1, NP, WORK, 1, MYROW, IYCOL ) ELSE IF( MYCOL.EQ.IYCOL ) THEN IF( BETA.EQ.ZERO ) THEN CALL ZGERV2D( ICONTXT, 1, NP, Y, INCY, MYROW, MCCOL ) ELSE CALL ZGERV2D( ICONTXT, 1, NP, WORK, 1, MYROW, MCCOL ) CALL PBZVECADD( ICONTXT, 'G', NP, ONE, WORK, 1, BETA, $ Y, INCY ) END IF END IF * ELSE NP1 = NP0 * MIN( LCMP, MAX( 0, ICEIL(NN,NB)-MCROW ) ) IF( MYCOL.EQ.MCCOL ) THEN IF( MYCOL.NE.IYCOL ) $ CALL ZGESD2D( ICONTXT, 1, NP1, WORK, 1, MYROW, IYCOL ) ELSE IF( MYCOL.EQ.IYCOL ) THEN CALL ZGERV2D( ICONTXT, 1, NP1, WORK, 1, MYROW, MCCOL ) END IF * IF( MYCOL.EQ.IYCOL ) THEN KZ = 0 IF( MYROW.EQ.IYROW ) KZ = NZ CALL PBZTRST1( ICONTXT, 'Col', NP, NB, KZ, WORK, 1, $ BETA, Y, INCY, LCMP, LCMQ, NP0 ) END IF END IF END IF END IF END IF * RETURN * * End of PBZTRNV * END * *======================================================================= * SUBROUTINE PBZTR2A1 *======================================================================= * SUBROUTINE PBZTR2A1( ICONTXT, N, NB, NZ, X, INCX, BETA, Y, INCY, $ INTV ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. INTEGER ICONTXT, N, NB, NZ, INCX, INCY, INTV COMPLEX*16 BETA * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * .. * * Purpose * ======= * * y <== x * y is a scattered vector, copied from a condensed vector x. * * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. External Subroutines .. EXTERNAL PBZVECADD * .. * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Variables .. INTEGER IX, IY, JZ, K, ITER * IX = 0 IY = 0 JZ = NZ ITER = ICEIL( N+NZ, INTV ) * IF( ITER.GT.1 ) THEN CALL PBZVECADD( ICONTXT, 'G', NB-JZ, ONE, X(IX*INCX+1), INCX, $ BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - JZ IY = IY + INTV - JZ JZ = 0 * DO 10 K = 2, ITER-1 CALL PBZVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), INCX, $ BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + INTV 10 CONTINUE END IF * CALL PBZVECADD( ICONTXT, 'G', MIN( N-IY, NB-JZ ), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), INCY ) * RETURN * * End of PBZTR2A1 * END * *======================================================================= * SUBROUTINE PBZTR2B1 *======================================================================= * SUBROUTINE PBZTR2B1( ICONTXT, TRANS, N, NB, NZ, X, INCX, BETA, Y, $ INCY, JINX, JINY ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 TRANS INTEGER ICONTXT, N, NB, NZ, INCX, INCY, JINX, JINY COMPLEX*16 BETA * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * .. * * Purpose * ======= * * y <== x + beta * y * y is a condensed vector, copied from a scattered vector x * * .. * .. Intrinsic Functions .. INTRINSIC MIN * .. * .. External Functions .. INTEGER ICEIL EXTERNAL ICEIL * .. * .. External Subroutines .. EXTERNAL PBZVECADD * .. * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Variables .. INTEGER IX, IY, JZ, K, ITER, LENX, LENY * IF( JINX.EQ.1 .AND. JINY.EQ.1 ) THEN CALL PBZVECADD( ICONTXT, TRANS, N, ONE, X, INCX, BETA, $ Y, INCY ) * ELSE IX = 0 IY = 0 JZ = NZ LENX = NB * JINX LENY = NB * JINY ITER = ICEIL( N+NZ, LENX ) * IF( ITER.GT.1 ) THEN CALL PBZVECADD( ICONTXT, TRANS, NB-JZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + LENX - JZ IY = IY + LENY - JZ JZ = 0 * DO 10 K = 2, ITER-1 CALL PBZVECADD( ICONTXT, TRANS, NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + LENX IY = IY + LENY 10 CONTINUE END IF * CALL PBZVECADD( ICONTXT, TRANS, MIN( N-IX, NB-JZ ), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), INCY ) END IF * RETURN * * End of PBZTR2B1 * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbztrsrt.f000644 000766 000024 00000004515 10363532303 020210 0ustar00juliestaff000000 000000 SUBROUTINE PBZTRSRT( ICONTXT, ADIST, M, N, NB, A, LDA, BETA, B, $ LDB, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 ADIST INTEGER ICONTXT, LCMP, LCMQ, LDA, LDB, M, N, NB, NINT COMPLEX*16 BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * PBZTRSRT forms T <== A + beta * T, where T is a sorted * condensed block row (or column) from a block column (or row) of A * with sorting index ISRT * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Variables .. INTEGER JA, JB, K, KK, NJUMP * .. * .. External Subroutines .. EXTERNAL PBZMATADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( LSAME( ADIST, 'R' ) ) THEN NJUMP = NB * LCMQ DO 20 K = 0, LCMQ-1 JA = NINT * MOD( K*LCMP, LCMQ ) + 1 JB = K * NB + 1 * DO 10 KK = 1, ICEIL( NINT, NB ) IF( N.LT.JB ) GO TO 20 CALL PBZMATADD( ICONTXT, 'G', M, MIN( N-JB+1, NB ), ONE, $ A(1, JA), LDA, BETA, B(1, JB), LDB ) JA = JA + NB JB = JB + NJUMP 10 CONTINUE 20 CONTINUE * * if( LSAME( ADIST, 'C') ) then * ELSE NJUMP = NB * LCMP DO 40 K = 0, LCMP-1 JA = 1 JB = K * NB + 1 * DO 30 KK = 1, ICEIL( NINT, NB ) IF( M.LT.JB ) GO TO 40 CALL PBZMATADD( ICONTXT, 'G', MIN( M-JB+1, NB ), N, ONE, $ A(JA, N*MOD(K*LCMQ,LCMP)+1), LDA, BETA, $ B(JB, 1), LDB ) JA = JA + NB JB = JB + NJUMP 30 CONTINUE 40 CONTINUE END IF * RETURN * * End of PBZTRSRT * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbztrst1.f000644 000766 000024 00000006466 10363532303 020116 0ustar00juliestaff000000 000000 SUBROUTINE PBZTRST1( ICONTXT, XDIST, N, NB, NZ, X, INCX, BETA, Y, $ INCY, LCMP, LCMQ, NINT ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 XDIST INTEGER ICONTXT, INCX, INCY, LCMP, LCMQ, N, NB, NINT, $ NZ COMPLEX*16 BETA * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * .. * * Purpose * ======= * * PBZTRST1 forms y <== x + beta * y, where y is a sorted * condensed row (or column) vector from a column (or row) vector of x. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Variables .. INTEGER ITER, IX, IY, K, KK, KZ, NJUMP * .. * .. External Subroutines .. EXTERNAL PBZVECADD * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL EXTERNAL ICEIL, LSAME * .. * .. Intrinsic Functions .. INTRINSIC MIN, MAX, MOD * .. * .. Executable Statements .. * ITER = ICEIL( NINT, NB ) KZ = NZ * IF( LSAME( XDIST, 'R' ) ) THEN NJUMP = NB * LCMQ * DO 20 KK = 0, LCMQ-1 IX = NINT * MOD( KK*LCMP, LCMQ ) IY = MAX( 0, NB*KK-NZ ) IF( N.LT.IY ) GO TO 50 * IF( ITER.GT.1 ) THEN CALL PBZVECADD( ICONTXT, 'G', NB-KZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - KZ IY = IY + NJUMP - KZ KZ = 0 * DO 10 K = 2, ITER-1 CALL PBZVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + NJUMP 10 CONTINUE END IF * CALL PBZVECADD( ICONTXT, 'G', MIN(NB-KZ,N-IY), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), $ INCY ) KZ = 0 20 CONTINUE * * if( LSAME( XDIST, 'C' ) ) then * ELSE NJUMP = NB * LCMP * DO 40 KK = 0, LCMP-1 IX = NINT * MOD( KK*LCMQ, LCMP ) IY = MAX( 0, NB*KK-NZ ) IF( N.LT.IY ) GO TO 50 * IF( ITER.GT.1 ) THEN CALL PBZVECADD( ICONTXT, 'G', NB-KZ, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB - KZ IY = IY + NJUMP - KZ KZ = 0 * DO 30 K = 2, ITER-1 CALL PBZVECADD( ICONTXT, 'G', NB, ONE, X(IX*INCX+1), $ INCX, BETA, Y(IY*INCY+1), INCY ) IX = IX + NB IY = IY + NJUMP 30 CONTINUE END IF * CALL PBZVECADD( ICONTXT, 'G', MIN(NB-KZ,N-IY), ONE, $ X(IX*INCX+1), INCX, BETA, Y(IY*INCY+1), $ INCY ) KZ = 0 40 CONTINUE END IF * 50 CONTINUE * RETURN * * End of PBZTRST1 * END scalapack-2.0.2/PBLAS/SRC/PBBLAS/pbzvecadd.f000644 000766 000024 00000023716 10363532303 020264 0ustar00juliestaff000000 000000 SUBROUTINE PBZVECADD( ICONTXT, MODE, N, ALPHA, X, INCX, BETA, Y, $ INCY ) * * -- PB-BLAS routine (version 2.1) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory. * April 28, 1996 * * .. Scalar Arguments .. CHARACTER*1 MODE INTEGER ICONTXT, INCX, INCY, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 X( * ), Y( * ) * * .. * * Purpose * ======= * * PBZVECADD performs a vector X to be added to Y * Y := alpha*op(X) + beta*Y, * where alpha and beta are scalars, and X and Y are n vectors, * and op(X) = X**H if MODE = 'C', * * Arguments * ========= * * ICONTXT (input) INTEGER * ICONTXT is the BLACS mechanism for partitioning communication * space. A defining property of a context is that a message in * a context cannot be sent or received in another context. The * BLACS context includes the definition of a grid, and each * process' coordinates in it. * * MODE (input) CHARACTER*1 * Specifies the transposed, or conjugate transposed vector X * to be added to the vector Y * = 'C': Conjugate vector X is added for complex data set. * Y = alpha * X**H + beta * Y * ELSE : Vector X is added. Y = alpha*X + beta*Y * if MODE = 'V', BLAS routine may be used. * * N (input) INTEGER * The number of elements of the vectors X and Y to be added. * N >= 0. * * ALPHA (input) COMPLEX*16 * ALPHA specifies the scalar alpha. * * X (input) COMPLEX*16 array of DIMENSION at least * ( 1 + ( N - 1 )*abs( INCX ) ) * The incremented array X must contain the vector X. * * INCX (input) INTEGER * INCX specifies the increment for the elements of X. * INCX <> 0. * * BETA (input) COMPLEX*16 * BETA specifies the scalar beta. * * Y (input/output) COMPLEX*16 array of DIMENSION at least * ( 1 + ( N - 1 )*abs( INCY ) ) * On entry with BETA non-zero, the incremented array Y must * contain the vector Y. * On exit, Y is overwritten by the updated vector Y. * * INCY - (input) INTEGER * INCY specifies the increment for the elements of Y. * INCY <> 0. * * ===================================================================== * * .. * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IX, IY * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL ZSCAL, ZCOPY, ZAXPY * .. * .. Intrinsic Functions .. INTRINSIC DCONJG * .. * .. Executable Statements .. * IF( N.LE.0 .OR. ( ALPHA.EQ.ZERO .AND. BETA.EQ.ONE ) ) RETURN * IF( ALPHA.EQ.ZERO ) THEN IF( BETA.EQ.ZERO ) THEN IF( INCY.EQ.1 ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE IY = 1 DO 20 I = 1, N Y( IY ) = ZERO IY = IY + INCY 20 CONTINUE END IF * ELSE IF( LSAME( MODE, 'V' ) ) THEN CALL ZSCAL( N, BETA, Y, INCY ) ELSE IF( INCY.EQ.1 ) THEN DO 30 I = 1, N Y( I ) = BETA * Y( I ) 30 CONTINUE ELSE IY = 1 DO 40 I = 1, N Y( IY ) = BETA * Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF * ELSE IF( .NOT.LSAME( MODE, 'C' ) ) THEN IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( LSAME( MODE, 'V' ) ) THEN CALL ZCOPY( N, X, INCX, Y, INCY ) ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 50 I = 1, N Y( I ) = X( I ) 50 CONTINUE ELSE IX = 1 IY = 1 DO 60 I = 1, N Y( IY ) = X( IX ) IX = IX + INCX IY = IY + INCY 60 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 70 I = 1, N Y( I ) = X( I ) + Y( I ) 70 CONTINUE ELSE IX = 1 IY = 1 DO 80 I = 1, N Y( IY ) = X( IX ) + Y( IY ) IX = IX + INCX IY = IY + INCY 80 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 90 I = 1, N Y( I ) = X( I ) + BETA * Y( I ) 90 CONTINUE ELSE IX = 1 IY = 1 DO 100 I = 1, N Y( IY ) = X( IX ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 100 CONTINUE END IF END IF * ELSE IF( BETA.EQ.ZERO ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 110 I = 1, N Y( I ) = ALPHA * X( I ) 110 CONTINUE ELSE IX = 1 IY = 1 DO 120 I = 1, N Y( IY ) = X( IX ) IX = IX + INCX IY = IY + INCY 120 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( LSAME( MODE, 'V' ) ) THEN CALL ZAXPY( N, ALPHA, X, INCX, Y, INCY ) ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 130 I = 1, N Y( I ) = ALPHA * X( I ) + Y( I ) 130 CONTINUE ELSE IX = 1 IY = 1 DO 140 I = 1, N Y( IY ) = ALPHA * X( IX ) + Y( IY ) IX = IX + INCX IY = IY + INCY 140 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 150 I = 1, N Y( I ) = ALPHA * X( I ) + BETA * Y( I ) 150 CONTINUE ELSE IX = 1 IY = 1 DO 160 I = 1, N Y( IY ) = ALPHA * X( IX ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 160 CONTINUE END IF END IF END IF * ELSE IF( ALPHA.EQ.ONE ) THEN IF( BETA.EQ.ZERO ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 170 I = 1, N Y( I ) = DCONJG( X( I ) ) 170 CONTINUE ELSE IX = 1 IY = 1 DO 180 I = 1, N Y( IY ) = DCONJG( X( IX ) ) IX = IX + INCX IY = IY + INCY 180 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 190 I = 1, N Y( I ) = DCONJG( X( I ) ) + Y( I ) 190 CONTINUE ELSE IX = 1 IY = 1 DO 200 I = 1, N Y( IY ) = DCONJG( X( IX ) ) + Y( IY ) IX = IX + INCX IY = IY + INCY 200 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 210 I = 1, N Y( I ) = DCONJG( X( I ) ) + BETA * Y( I ) 210 CONTINUE ELSE IX = 1 IY = 1 DO 220 I = 1, N Y( IY ) = DCONJG( X( IX ) ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 220 CONTINUE END IF END IF * ELSE IF( BETA.EQ.ZERO ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 230 I = 1, N Y( I ) = ALPHA * DCONJG( X( I ) ) 230 CONTINUE ELSE IX = 1 IY = 1 DO 240 I = 1, N Y( IY ) = ALPHA * DCONJG( X( IX ) ) IX = IX + INCX IY = IY + INCY 240 CONTINUE END IF * ELSE IF( BETA.EQ.ONE ) THEN IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 250 I = 1, N Y( I ) = ALPHA * DCONJG( X( I ) ) + Y( I ) 250 CONTINUE ELSE IX = 1 IY = 1 DO 260 I = 1, N Y( IY ) = ALPHA * DCONJG( X( IX ) ) + Y( IY ) IX = IX + INCX IY = IY + INCY 260 CONTINUE END IF * ELSE IF( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN DO 270 I = 1, N Y( I ) = ALPHA * DCONJG( X( I ) ) + BETA * Y( I ) 270 CONTINUE ELSE IX = 1 IY = 1 DO 280 I = 1, N Y( IY ) = ALPHA * DCONJG( X(IX) ) + BETA * Y( IY ) IX = IX + INCX IY = IY + INCY 280 CONTINUE END IF END IF END IF END IF * RETURN * * End of PBZVECADD * END scalapack-2.0.2/EXAMPLE/CSCAEXMAT.dat000644 000766 000024 00000001652 10604624224 017012 0ustar00juliestaff000000 000000 6 6 6.0000E+0 4.0000E+0 3.0000E+0 -5.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 3.0000E+0 -3.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 -3.0000E+0 0.0000E+0 -1.0000E+0 -9.0000E+0 1.0000E+0 2.0000E+0 1.0000E+0 4.0000E+0 0.0000E+0 0.0000E+0 -1.0000E+0 -3.0000E+0 0.0000E+0 0.0000E+0 11.0000E+0 21.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 10.0000E+0 5.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 -11.0000E+0 -12.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 2.0000E+0 0.0000E+0 -4.0000E+0 5.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 8.0000E+0 1.0000E+0 0.0000E+0 0.0000E+0 -10.0000E+0 -10.0000E+0 scalapack-2.0.2/EXAMPLE/CSCAEXRHS.dat000644 000766 000024 00000000340 10604624224 017016 0ustar00juliestaff000000 000000 6 1 72.000000000000000000E+00 0E+0 0.000000000000000000E+00 0E+0 160.000000000000000000E+00 0E+0 0.000000000000000000E+00 0E+0 0.000000000000000000E+00 0E+0 0.000000000000000000E+00 0E+0 scalapack-2.0.2/EXAMPLE/DSCAEXMAT.dat000644 000766 000024 00000000622 10604624224 017007 0ustar00juliestaff000000 000000 6 6 6.0000D+0 3.0000D+0 0.0000D+0 0.0000D+0 3.0000D+0 0.0000D+0 0.0000D+0 -3.0000D+0 -1.0000D+0 1.0000D+0 1.0000D+0 0.0000D+0 -1.0000D+0 0.0000D+0 11.0000D+0 0.0000D+0 0.0000D+0 10.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 -11.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 2.0000D+0 -4.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 8.0000D+0 0.0000D+0 -10.0000D+0 scalapack-2.0.2/EXAMPLE/DSCAEXRHS.dat000644 000766 000024 00000000302 10604624224 017015 0ustar00juliestaff000000 000000 6 1 72.000000000000000000D+00 0.000000000000000000D+00 160.000000000000000000D+00 0.000000000000000000D+00 0.000000000000000000D+00 0.000000000000000000D+00 scalapack-2.0.2/EXAMPLE/Makefile000644 000766 000024 00000001521 11652072601 016443 0ustar00juliestaff000000 000000 include ../SLmake.inc TESTOBJS = psscaex.o pdscaexinfo.o TESTOBJD = pdscaex.o pdscaexinfo.o TESTOBJC = pcscaex.o pdscaexinfo.o TESTOBJZ = pzscaex.o pdscaexinfo.o all: pdscaex pcscaex pzscaex psscaex single: psscaex double: pdscaex complex: pcscaex complex16: pzscaex psscaex: $(TESTOBJS) $(FCLOADER) $(FCLOADFLAGS) -o xsscaex $(TESTOBJS) ../$(SCALAPACKLIB) $(LIBS) pzscaex: $(TESTOBJZ) $(FCLOADER) $(FCLOADFLAGS) -o xzscaex $(TESTOBJZ) ../$(SCALAPACKLIB) $(LIBS) pcscaex: $(TESTOBJC) $(FCLOADER) $(FCLOADFLAGS) -o xcscaex $(TESTOBJC) ../$(SCALAPACKLIB) $(LIBS) pdscaex: $(TESTOBJD) $(FCLOADER) $(FCLOADFLAGS) -o xdscaex $(TESTOBJD) ../$(SCALAPACKLIB) $(LIBS) clean : rm -f $(TESTOBJS) $(TESTOBJD) $(TESTOBJZ) $(TESTOBJC) xsscaex xzscaex xcscaex xdscaex .f.o : ; $(FC) -c $(FCFLAGS) $*.f .c.o : ; $(CC) -c $(CDEFS) $(CCFLAGS) $*.c scalapack-2.0.2/EXAMPLE/pcscaex.f000644 000766 000024 00000017244 10604624224 016611 0ustar00juliestaff000000 000000 PROGRAM PCSCAEX * * -- ScaLAPACK example code -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * Written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * * This program solves a linear system by calling the ScaLAPACK * routine PCGESV. The input matrix and right-and-sides are * read from a file. The solution is written to a file. * * .. Parameters .. INTEGER CPLXSZ, INTGSZ, MEMSIZ, TOTMEM PARAMETER ( CPLXSZ = 8, INTGSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / CPLXSZ ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX ONE PARAMETER ( ONE = (1.0D+0,0.0D+0) ) * .. * .. Local Scalars .. CHARACTER*80 OUTFILE INTEGER IAM, ICTXT, INFO, IPA, IPACPY, IPB, IPPIV, IPX, $ IPW, LIPIV, MYCOL, MYROW, N, NB, NOUT, NPCOL, $ NPROCS, NPROW, NP, NQ, NQRHS, NRHS, WORKSIZ REAL ANORM, BNORM, EPS, XNORM, RESID * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCX( DLEN_ ) COMPLEX MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, IGSUM2D, PCGESV, $ PCGEMM, PCLACPY, PCLAPRNT, PCLAREAD, PCLAWRITE * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PSLAMCH, PCLANGE EXTERNAL ICEIL, NUMROC, PSLAMCH, PCLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDSCAEXINFO( OUTFILE, NOUT, N, NRHS, NB, NPROW, NPCOL, MEM, $ IAM, NPROCS ) * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 20 * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) NQRHS = NUMROC( NRHS, NB, MYCOL, 0, NPCOL ) * * Initialize the array descriptor for the matrix A and B * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) CALL DESCINIT( DESCB, N, NRHS, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) CALL DESCINIT( DESCX, N, NRHS, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( 1 ) * IPA = 1 IPACPY = IPA + DESCA( LLD_ )*NQ IPB = IPACPY + DESCA( LLD_ )*NQ IPX = IPB + DESCB( LLD_ )*NQRHS IPPIV = IPX + DESCB( LLD_ )*NQRHS LIPIV = ICEIL( INTGSZ*( NP+NB ), CPLXSZ ) IPW = IPPIV + MAX( NP, LIPIV ) * WORKSIZ = NB * * Check for adequate memory for problem size * INFO = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'test', ( IPW+WORKSIZ )*CPLXSZ INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) IF( INFO.GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MEMORY' GO TO 10 END IF * * Read from file and distribute matrices A and B * CALL PCLAREAD( 'CSCAEXMAT.dat', MEM( IPA ), DESCA, 0, 0, $ MEM( IPW ) ) CALL PCLAREAD( 'CSCAEXRHS.dat', MEM( IPB ), DESCB, 0, 0, $ MEM( IPW ) ) * * Make a copy of A and the rhs for checking purposes * CALL PCLACPY( 'All', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPACPY ), 1, 1, DESCA ) CALL PCLACPY( 'All', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ MEM( IPX ), 1, 1, DESCX ) * ********************************************************************** * Call ScaLAPACK PCGESV routine ********************************************************************** * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) $ '***********************************************' WRITE( NOUT, FMT = * ) $ 'Example of ScaLAPACK routine call: (PCGESV)' WRITE( NOUT, FMT = * ) $ '***********************************************' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'A * X = B, Matrix A:' WRITE( NOUT, FMT = * ) END IF CALL PCLAPRNT( N, N, MEM( IPA ), 1, 1, DESCA, 0, 0, $ 'A', NOUT, MEM( IPW ) ) IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'Matrix B:' WRITE( NOUT, FMT = * ) END IF CALL PCLAPRNT( N, NRHS, MEM( IPB ), 1, 1, DESCB, 0, 0, $ 'B', NOUT, MEM( IPW ) ) * CALL PCGESV( N, NRHS, MEM( IPA ), 1, 1, DESCA, MEM( IPPIV ), $ MEM( IPB ), 1, 1, DESCB, INFO ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'INFO code returned by PCGESV = ', INFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'Matrix X = A^{-1} * B' WRITE( NOUT, FMT = * ) END IF CALL PCLAPRNT( N, NRHS, MEM( IPB ), 1, 1, DESCB, 0, 0, 'X', NOUT, $ MEM( IPW ) ) CALL PCLAWRITE( 'CSCAEXSOL.dat', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ 0, 0, MEM( IPW ) ) * * Compute residual ||A * X - B|| / ( ||X|| * ||A|| * eps * N ) * EPS = PSLAMCH( ICTXT, 'Epsilon' ) ANORM = PCLANGE( 'I', N, N, MEM( IPA ), 1, 1, DESCA, MEM( IPW ) ) BNORM = PCLANGE( 'I', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ MEM( IPW ) ) CALL PCGEMM( 'No transpose', 'No transpose', N, NRHS, N, ONE, $ MEM( IPACPY ), 1, 1, DESCA, MEM( IPB ), 1, 1, DESCB, $ -ONE, MEM( IPX ), 1, 1, DESCX ) XNORM = PCLANGE( 'I', N, NRHS, MEM( IPX ), 1, 1, DESCX, $ MEM( IPW ) ) RESID = XNORM / ( ANORM * BNORM * EPS * DBLE( N ) ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) $ '||A * X - B|| / ( ||X|| * ||A|| * eps * N ) = ', RESID WRITE( NOUT, FMT = * ) IF( RESID.LT.10.0D+0 ) THEN WRITE( NOUT, FMT = * ) 'The answer is correct.' ELSE WRITE( NOUT, FMT = * ) 'The answer is suspicious.' END IF END IF * 10 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 20 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = * ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9998 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9997 FORMAT( 'END OF TESTS.' ) * STOP * * End of PCSCAEX * END scalapack-2.0.2/EXAMPLE/pdscaex.f000644 000766 000024 00000017250 10604624224 016607 0ustar00juliestaff000000 000000 PROGRAM PDSCAEX * * -- ScaLAPACK example code -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * Written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * * This program solves a linear system by calling the ScaLAPACK * routine PDGESV. The input matrix and right-and-sides are * read from a file. The solution is written to a file. * * .. Parameters .. INTEGER DBLESZ, INTGSZ, MEMSIZ, TOTMEM PARAMETER ( DBLESZ = 8, INTGSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / DBLESZ ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. CHARACTER*80 OUTFILE INTEGER IAM, ICTXT, INFO, IPA, IPACPY, IPB, IPPIV, IPX, $ IPW, LIPIV, MYCOL, MYROW, N, NB, NOUT, NPCOL, $ NPROCS, NPROW, NP, NQ, NQRHS, NRHS, WORKSIZ DOUBLE PRECISION ANORM, BNORM, EPS, XNORM, RESID * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCX( DLEN_ ) DOUBLE PRECISION MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, IGSUM2D, PDSCAEXINFO, PDGESV, $ PDGEMM, PDLACPY, PDLAPRNT, PDLAREAD, PDLAWRITE * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PDLANGE EXTERNAL ICEIL, NUMROC, PDLAMCH, PDLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDSCAEXINFO( OUTFILE, NOUT, N, NRHS, NB, NPROW, NPCOL, MEM, $ IAM, NPROCS ) * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 20 * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) NQRHS = NUMROC( NRHS, NB, MYCOL, 0, NPCOL ) * * Initialize the array descriptor for the matrix A and B * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) CALL DESCINIT( DESCB, N, NRHS, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) CALL DESCINIT( DESCX, N, NRHS, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( 1 ) * IPA = 1 IPACPY = IPA + DESCA( LLD_ )*NQ IPB = IPACPY + DESCA( LLD_ )*NQ IPX = IPB + DESCB( LLD_ )*NQRHS IPPIV = IPX + DESCB( LLD_ )*NQRHS LIPIV = ICEIL( INTGSZ*( NP+NB ), DBLESZ ) IPW = IPPIV + MAX( NP, LIPIV ) * WORKSIZ = NB * * Check for adequate memory for problem size * INFO = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'test', ( IPW+WORKSIZ )*DBLESZ INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) IF( INFO.GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MEMORY' GO TO 10 END IF * * Read from file and distribute matrices A and B * CALL PDLAREAD( 'DSCAEXMAT.dat', MEM( IPA ), DESCA, 0, 0, $ MEM( IPW ) ) CALL PDLAREAD( 'DSCAEXRHS.dat', MEM( IPB ), DESCB, 0, 0, $ MEM( IPW ) ) * * Make a copy of A and the rhs for checking purposes * CALL PDLACPY( 'All', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPACPY ), 1, 1, DESCA ) CALL PDLACPY( 'All', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ MEM( IPX ), 1, 1, DESCX ) * ********************************************************************** * Call ScaLAPACK PDGESV routine ********************************************************************** * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) $ '***********************************************' WRITE( NOUT, FMT = * ) $ 'Example of ScaLAPACK routine call: (PDGESV)' WRITE( NOUT, FMT = * ) $ '***********************************************' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'A * X = B, Matrix A:' WRITE( NOUT, FMT = * ) END IF CALL PDLAPRNT( N, N, MEM( IPA ), 1, 1, DESCA, 0, 0, $ 'A', NOUT, MEM( IPW ) ) IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'Matrix B:' WRITE( NOUT, FMT = * ) END IF CALL PDLAPRNT( N, NRHS, MEM( IPB ), 1, 1, DESCB, 0, 0, $ 'B', NOUT, MEM( IPW ) ) * CALL PDGESV( N, NRHS, MEM( IPA ), 1, 1, DESCA, MEM( IPPIV ), $ MEM( IPB ), 1, 1, DESCB, INFO ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'INFO code returned by PDGESV = ', INFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'Matrix X = A^{-1} * B' WRITE( NOUT, FMT = * ) END IF CALL PDLAPRNT( N, NRHS, MEM( IPB ), 1, 1, DESCB, 0, 0, 'X', NOUT, $ MEM( IPW ) ) CALL PDLAWRITE( 'DSCAEXSOL.dat', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ 0, 0, MEM( IPW ) ) * * Compute residual ||A * X - B|| / ( ||X|| * ||A|| * eps * N ) * EPS = PDLAMCH( ICTXT, 'Epsilon' ) ANORM = PDLANGE( 'I', N, N, MEM( IPA ), 1, 1, DESCA, MEM( IPW ) ) BNORM = PDLANGE( 'I', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ MEM( IPW ) ) CALL PDGEMM( 'No transpose', 'No transpose', N, NRHS, N, ONE, $ MEM( IPACPY ), 1, 1, DESCA, MEM( IPB ), 1, 1, DESCB, $ -ONE, MEM( IPX ), 1, 1, DESCX ) XNORM = PDLANGE( 'I', N, NRHS, MEM( IPX ), 1, 1, DESCX, $ MEM( IPW ) ) RESID = XNORM / ( ANORM * BNORM * EPS * DBLE( N ) ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) $ '||A * X - B|| / ( ||X|| * ||A|| * eps * N ) = ', RESID WRITE( NOUT, FMT = * ) IF( RESID.LT.10.0D+0 ) THEN WRITE( NOUT, FMT = * ) 'The answer is correct.' ELSE WRITE( NOUT, FMT = * ) 'The answer is suspicious.' END IF END IF * 10 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 20 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = * ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9998 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9997 FORMAT( 'END OF TESTS.' ) * STOP * * End of PDSCAEX * END scalapack-2.0.2/EXAMPLE/pdscaexinfo.f000644 000766 000024 00000012355 10604624224 017464 0ustar00juliestaff000000 000000 SUBROUTINE PDSCAEXINFO( SUMMRY, NOUT, N, NRHS, NB, NPROW, NPCOL, $ WORK, IAM, NPROCS ) * * -- ScaLAPACK example code -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * Written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * * This program solves a linear system by calling the ScaLAPACK * routine PDGESV. The input matrix and right-and-sides are * read from a file. The solution is written to a file. * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, N, NRHS, NB, NOUT, NPCOL, NPROCS, NPROW * .. * .. Array Arguments .. INTEGER WORK( * ) * .. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER ICTXT * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, IGEBR2D, IGEBS2D * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE='SCAEX.dat', STATUS='OLD' ) READ( NIN, FMT = * ) SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 ) USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * ) SUMMRY READ( NIN, FMT = * ) NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get matrix dimensions * READ( NIN, FMT = * ) N READ( NIN, FMT = * ) NRHS * * Get value of NB * READ( NIN, FMT = * ) NB * * Get grid shape * READ( NIN, FMT = * ) NPROW READ( NIN, FMT = * ) NPCOL * * Close input file * CLOSE( NIN ) * * If underlying system needs additional set up, do it now * IF( NPROCS.LT.1 ) THEN NPROCS = NPROW * NPCOL CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Pack information arrays and broadcast * WORK( 1 ) = N WORK( 2 ) = NRHS WORK( 3 ) = NB WORK( 4 ) = NPROW WORK( 5 ) = NPCOL CALL IGEBS2D( ICTXT, 'All', ' ', 5, 1, WORK, 5 ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'SCALAPACK example driver.' WRITE( NOUT, FMT = 9999 ) USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The matrices A and B are read from '// $ 'a file.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'An explanation of the input/output '// $ 'parameters follows:' * WRITE( NOUT, FMT = 9999 ) $ 'N : The order of the matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NRHS : The number of right and sides.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the'// $ ' matrices A and B are split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9998 ) 'N ', N WRITE( NOUT, FMT = 9998 ) 'NRHS ', NRHS WRITE( NOUT, FMT = 9998 ) 'NB ', NB WRITE( NOUT, FMT = 9998 ) 'P ', NPROW WRITE( NOUT, FMT = 9998 ) 'Q ', NPCOL WRITE( NOUT, FMT = * ) * ELSE * * If underlying system needs additional set up, do it now * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * CALL IGEBR2D( ICTXT, 'All', ' ', 5, 1, WORK, 5, 0, 0 ) N = WORK( 1 ) NRHS = WORK( 2 ) NB = WORK( 3 ) NPROW = WORK( 4 ) NPCOL = WORK( 5 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 20 WRITE( NOUT, FMT = 9997 ) CLOSE( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 2X, A5, ' : ', I6 ) 9997 FORMAT( ' Illegal input in file ',40A,'. Aborting run.' ) * * End of PDSCAEXINFO * END scalapack-2.0.2/EXAMPLE/psscaex.f000644 000766 000024 00000017250 10604624224 016626 0ustar00juliestaff000000 000000 PROGRAM PSSCAEX * * -- ScaLAPACK example code -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * Written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * * This program solves a linear system by calling the ScaLAPACK * routine PSGESV. The input matrix and right-and-sides are * read from a file. The solution is written to a file. * * .. Parameters .. INTEGER REALSZ, INTGSZ, MEMSIZ, TOTMEM PARAMETER ( REALSZ = 4, INTGSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / REALSZ ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. CHARACTER*80 OUTFILE INTEGER IAM, ICTXT, INFO, IPA, IPACPY, IPB, IPPIV, IPX, $ IPW, LIPIV, MYCOL, MYROW, N, NB, NOUT, NPCOL, $ NPROCS, NPROW, NP, NQ, NQRHS, NRHS, WORKSIZ REAL ANORM, BNORM, EPS, XNORM, RESID * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCX( DLEN_ ) REAL MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, IGSUM2D, PDSCAEXINFO, PSGESV, $ PSGEMM, PSLACPY, PSLAPRNT, PSLAREAD, PSLAWRITE * .. * .. External Functions .. INTEGER ICEIL, NUMROC REAL PSLAMCH, PSLANGE EXTERNAL ICEIL, NUMROC, PSLAMCH, PSLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDSCAEXINFO( OUTFILE, NOUT, N, NRHS, NB, NPROW, NPCOL, MEM, $ IAM, NPROCS ) * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 20 * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) NQRHS = NUMROC( NRHS, NB, MYCOL, 0, NPCOL ) * * Initialize the array descriptor for the matrix A and B * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) CALL DESCINIT( DESCB, N, NRHS, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) CALL DESCINIT( DESCX, N, NRHS, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( 1 ) * IPA = 1 IPACPY = IPA + DESCA( LLD_ )*NQ IPB = IPACPY + DESCA( LLD_ )*NQ IPX = IPB + DESCB( LLD_ )*NQRHS IPPIV = IPX + DESCB( LLD_ )*NQRHS LIPIV = ICEIL( INTGSZ*( NP+NB ), REALSZ ) IPW = IPPIV + MAX( NP, LIPIV ) * WORKSIZ = NB * * Check for adequate memory for problem size * INFO = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'test', ( IPW+WORKSIZ )*REALSZ INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) IF( INFO.GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MEMORY' GO TO 10 END IF * * Read from file and distribute matrices A and B * CALL PSLAREAD( 'SSCAEXMAT.dat', MEM( IPA ), DESCA, 0, 0, $ MEM( IPW ) ) CALL PSLAREAD( 'SSCAEXRHS.dat', MEM( IPB ), DESCB, 0, 0, $ MEM( IPW ) ) * * Make a copy of A and the rhs for checking purposes * CALL PSLACPY( 'All', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPACPY ), 1, 1, DESCA ) CALL PSLACPY( 'All', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ MEM( IPX ), 1, 1, DESCX ) * ********************************************************************** * Call ScaLAPACK PSGESV routine ********************************************************************** * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) $ '***********************************************' WRITE( NOUT, FMT = * ) $ 'Example of ScaLAPACK routine call: (PSGESV)' WRITE( NOUT, FMT = * ) $ '***********************************************' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'A * X = B, Matrix A:' WRITE( NOUT, FMT = * ) END IF CALL PSLAPRNT( N, N, MEM( IPA ), 1, 1, DESCA, 0, 0, $ 'A', NOUT, MEM( IPW ) ) IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'Matrix B:' WRITE( NOUT, FMT = * ) END IF CALL PSLAPRNT( N, NRHS, MEM( IPB ), 1, 1, DESCB, 0, 0, $ 'B', NOUT, MEM( IPW ) ) * CALL PSGESV( N, NRHS, MEM( IPA ), 1, 1, DESCA, MEM( IPPIV ), $ MEM( IPB ), 1, 1, DESCB, INFO ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'INFO code returned by PSGESV = ', INFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'Matrix X = A^{-1} * B' WRITE( NOUT, FMT = * ) END IF CALL PSLAPRNT( N, NRHS, MEM( IPB ), 1, 1, DESCB, 0, 0, 'X', NOUT, $ MEM( IPW ) ) CALL PSLAWRITE( 'SSCAEXSOL.dat', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ 0, 0, MEM( IPW ) ) * * Compute residual ||A * X - B|| / ( ||X|| * ||A|| * eps * N ) * EPS = PSLAMCH( ICTXT, 'Epsilon' ) ANORM = PSLANGE( 'I', N, N, MEM( IPA ), 1, 1, DESCA, MEM( IPW ) ) BNORM = PSLANGE( 'I', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ MEM( IPW ) ) CALL PSGEMM( 'No transpose', 'No transpose', N, NRHS, N, ONE, $ MEM( IPACPY ), 1, 1, DESCA, MEM( IPB ), 1, 1, DESCB, $ -ONE, MEM( IPX ), 1, 1, DESCX ) XNORM = PSLANGE( 'I', N, NRHS, MEM( IPX ), 1, 1, DESCX, $ MEM( IPW ) ) RESID = XNORM / ( ANORM * BNORM * EPS * DBLE( N ) ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) $ '||A * X - B|| / ( ||X|| * ||A|| * eps * N ) = ', RESID WRITE( NOUT, FMT = * ) IF( RESID.LT.10.0D+0 ) THEN WRITE( NOUT, FMT = * ) 'The answer is correct.' ELSE WRITE( NOUT, FMT = * ) 'The answer is suspicious.' END IF END IF * 10 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 20 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = * ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9998 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9997 FORMAT( 'END OF TESTS.' ) * STOP * * End of PSSCAEX * END scalapack-2.0.2/EXAMPLE/pzscaex.f000644 000766 000024 00000017245 10604624224 016641 0ustar00juliestaff000000 000000 PROGRAM PZSCAEX * * -- ScaLAPACK example code -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * * Written by Antoine Petitet, August 1995 (petitet@cs.utk.edu) * * This program solves a linear system by calling the ScaLAPACK * routine PZGESV. The input matrix and right-and-sides are * read from a file. The solution is written to a file. * * .. Parameters .. INTEGER CP16SZ, INTGSZ, MEMSIZ, TOTMEM PARAMETER ( CP16SZ = 16, INTGSZ = 4, TOTMEM = 2000000, $ MEMSIZ = TOTMEM / CP16SZ ) INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE PARAMETER ( ONE = (1.0D+0,0.0D+0) ) * .. * .. Local Scalars .. CHARACTER*80 OUTFILE INTEGER IAM, ICTXT, INFO, IPA, IPACPY, IPB, IPPIV, IPX, $ IPW, LIPIV, MYCOL, MYROW, N, NB, NOUT, NPZOL, $ NPROCS, NPROW, NP, NQ, NQRHS, NRHS, WORKSIZ DOUBLE PRECISION ANORM, BNORM, EPS, XNORM, RESID * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCB( DLEN_ ), DESCX( DLEN_ ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_EXIT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINFO, BLACS_GRIDINIT, BLACS_PINFO, $ DESCINIT, IGSUM2D, PZGESV, $ PZGEMM, PZLACPY, PZLAPRNT, PZLAREAD, PZLAWRITE * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL ICEIL, NUMROC, PDLAMCH, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) CALL PDSCAEXINFO( OUTFILE, NOUT, N, NRHS, NB, NPROW, NPZOL, MEM, $ IAM, NPROCS ) * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPZOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPZOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPZOL ) $ GO TO 20 * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPZOL ) NQRHS = NUMROC( NRHS, NB, MYCOL, 0, NPZOL ) * * Initialize the array descriptor for the matrix A and B * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) CALL DESCINIT( DESCB, N, NRHS, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) CALL DESCINIT( DESCX, N, NRHS, NB, NB, 0, 0, ICTXT, MAX( 1, NP ), $ INFO ) * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( 1 ) * IPA = 1 IPACPY = IPA + DESCA( LLD_ )*NQ IPB = IPACPY + DESCA( LLD_ )*NQ IPX = IPB + DESCB( LLD_ )*NQRHS IPPIV = IPX + DESCB( LLD_ )*NQRHS LIPIV = ICEIL( INTGSZ*( NP+NB ), CP16SZ ) IPW = IPPIV + MAX( NP, LIPIV ) * WORKSIZ = NB * * Check for adequate memory for problem size * INFO = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 ) 'test', ( IPW+WORKSIZ )*CP16SZ INFO = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO, 1, -1, 0 ) IF( INFO.GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 ) 'MEMORY' GO TO 10 END IF * * Read from file and distribute matrices A and B * CALL PZLAREAD( 'ZSCAEXMAT.dat', MEM( IPA ), DESCA, 0, 0, $ MEM( IPW ) ) CALL PZLAREAD( 'ZSCAEXRHS.dat', MEM( IPB ), DESCB, 0, 0, $ MEM( IPW ) ) * * Make a copy of A and the rhs for checking purposes * CALL PZLACPY( 'All', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPACPY ), 1, 1, DESCA ) CALL PZLACPY( 'All', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ MEM( IPX ), 1, 1, DESCX ) * ********************************************************************** * Call ScaLAPACK PZGESV routine ********************************************************************** * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) $ '***********************************************' WRITE( NOUT, FMT = * ) $ 'Example of ScaLAPACK routine call: (PZGESV)' WRITE( NOUT, FMT = * ) $ '***********************************************' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'A * X = B, Matrix A:' WRITE( NOUT, FMT = * ) END IF CALL PZLAPRNT( N, N, MEM( IPA ), 1, 1, DESCA, 0, 0, $ 'A', NOUT, MEM( IPW ) ) IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'Matrix B:' WRITE( NOUT, FMT = * ) END IF CALL PZLAPRNT( N, NRHS, MEM( IPB ), 1, 1, DESCB, 0, 0, $ 'B', NOUT, MEM( IPW ) ) * CALL PZGESV( N, NRHS, MEM( IPA ), 1, 1, DESCA, MEM( IPPIV ), $ MEM( IPB ), 1, 1, DESCB, INFO ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'INFO code returned by PZGESV = ', INFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) 'Matrix X = A^{-1} * B' WRITE( NOUT, FMT = * ) END IF CALL PZLAPRNT( N, NRHS, MEM( IPB ), 1, 1, DESCB, 0, 0, 'X', NOUT, $ MEM( IPW ) ) CALL PZLAWRITE( 'ZSCAEXSOL.dat', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ 0, 0, MEM( IPW ) ) * * Compute residual ||A * X - B|| / ( ||X|| * ||A|| * eps * N ) * EPS = PDLAMCH( ICTXT, 'Epsilon' ) ANORM = PZLANGE( 'I', N, N, MEM( IPA ), 1, 1, DESCA, MEM( IPW ) ) BNORM = PZLANGE( 'I', N, NRHS, MEM( IPB ), 1, 1, DESCB, $ MEM( IPW ) ) CALL PZGEMM( 'No transpose', 'No transpose', N, NRHS, N, ONE, $ MEM( IPACPY ), 1, 1, DESCA, MEM( IPB ), 1, 1, DESCB, $ -ONE, MEM( IPX ), 1, 1, DESCX ) XNORM = PZLANGE( 'I', N, NRHS, MEM( IPX ), 1, 1, DESCX, $ MEM( IPW ) ) RESID = XNORM / ( ANORM * BNORM * EPS * DBLE( N ) ) * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) $ '||A * X - B|| / ( ||X|| * ||A|| * eps * N ) = ', RESID WRITE( NOUT, FMT = * ) IF( RESID.LT.10.0D+0 ) THEN WRITE( NOUT, FMT = * ) 'The answer is correct.' ELSE WRITE( NOUT, FMT = * ) 'The answer is suspicious.' END IF END IF * 10 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 20 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9997 ) WRITE( NOUT, FMT = * ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9998 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9997 FORMAT( 'END OF TESTS.' ) * STOP * * End of PZSCAEX * END scalapack-2.0.2/EXAMPLE/SCAEX.dat000644 000766 000024 00000000347 10604624224 016345 0ustar00juliestaff000000 000000 'ScaLAPACK Tutorial, Example input file' 'PARA95, ScaLAPACK Example, August 1995.' 'SCAEX.out' output file name (if any) 6 device out 6 value of N 1 value of NRHS 2 values of NB 2 values of NPROW 2 values of NPCOL scalapack-2.0.2/EXAMPLE/SSCAEXMAT.dat000644 000766 000024 00000000622 10604624224 017026 0ustar00juliestaff000000 000000 6 6 6.0000E+0 3.0000E+0 0.0000E+0 0.0000E+0 3.0000E+0 0.0000E+0 0.0000E+0 -3.0000E+0 -1.0000E+0 1.0000E+0 1.0000E+0 0.0000E+0 -1.0000E+0 0.0000E+0 11.0000E+0 0.0000E+0 0.0000E+0 10.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 -11.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 2.0000E+0 -4.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 0.0000E+0 8.0000E+0 0.0000E+0 -10.0000E+0 scalapack-2.0.2/EXAMPLE/SSCAEXRHS.dat000644 000766 000024 00000000302 10604624224 017034 0ustar00juliestaff000000 000000 6 1 72.000000000000000000E+00 0.000000000000000000E+00 160.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 scalapack-2.0.2/EXAMPLE/ZSCAEXMAT.dat000644 000766 000024 00000001652 10604624224 017041 0ustar00juliestaff000000 000000 6 6 6.0000D+0 4.0000D+0 3.0000D+0 -5.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 3.0000D+0 -3.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 -3.0000D+0 0.0000D+0 -1.0000D+0 -9.0000D+0 1.0000D+0 2.0000D+0 1.0000D+0 4.0000D+0 0.0000D+0 0.0000D+0 -1.0000D+0 -3.0000D+0 0.0000D+0 0.0000D+0 11.0000D+0 21.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 10.0000D+0 5.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 -11.0000D+0 -12.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 2.0000D+0 0.0000D+0 -4.0000D+0 5.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 0.0000D+0 8.0000D+0 1.0000D+0 0.0000D+0 0.0000D+0 -10.0000D+0 -10.0000D+0 scalapack-2.0.2/EXAMPLE/ZSCAEXRHS.dat000644 000766 000024 00000000552 10604624224 017052 0ustar00juliestaff000000 000000 6 1 72.000000000000000000D+00 81.000000000000000000D+00 0.000000000000000000D+00 0.000000000000000000D+00 160.000000000000000000D+00 120.000000000000000000D+00 0.000000000000000000D+00 0.000000000000000000D+00 0.000000000000000000D+00 0.000000000000000000D+00 0.000000000000000000D+00 0.000000000000000000D+00 scalapack-2.0.2/CMAKE/CheckBLACSCompilerFlags.cmake000644 000766 000024 00000007327 11656312637 022010 0ustar00juliestaff000000 000000 # This module checks against various known compilers and thier respective # flags to determine any specific flags needing to be set. # # 1. If FPE traps are enabled either abort or disable them # 2. Specify fixed form if needed # 3. Ensure that Release builds use O2 instead of O3 # #============================================================================= # Author: Chuck Atkins # Copyright 2011 #============================================================================= macro( CheckBLACSCompilerFlags ) set( FPE_EXIT FALSE ) # GNU Fortran if( CMAKE_Fortran_COMPILER_ID STREQUAL "GNU" ) if( "${CMAKE_Fortran_FLAGS}" MATCHES "-ffpe-trap=[izoupd]") set( FPE_EXIT TRUE ) endif() # Intel Fortran elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" ) if( "${CMAKE_Fortran_FLAGS}" MATCHES "[-/]fpe(-all=|)0" ) set( FPE_EXIT TRUE ) endif() # SunPro F95 elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "SunPro" ) if( ("${CMAKE_Fortran_FLAGS}" MATCHES "-ftrap=") AND NOT ("${CMAKE_Fortran_FLAGS}" MATCHES "-ftrap=(%|)none") ) set( FPE_EXIT TRUE ) elseif( NOT (CMAKE_Fortran_FLAGS MATCHES "-ftrap=") ) message( STATUS "Disabling FPE trap handlers with -ftrap=%none" ) set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -ftrap=%none" CACHE STRING "Flags for Fortran compiler." FORCE ) endif() # IBM XL Fortran elseif( (CMAKE_Fortran_COMPILER_ID STREQUAL "VisualAge" ) OR # CMake 2.6 (CMAKE_Fortran_COMPILER_ID STREQUAL "XL" ) ) # CMake 2.8 if( "${CMAKE_Fortran_FLAGS}" MATCHES "-qflttrap=[a-zA-Z:]:enable" ) set( FPE_EXIT TRUE ) endif() if( NOT ("${CMAKE_Fortran_FLAGS}" MATCHES "-qfixed") ) message( STATUS "Enabling fixed format F90/F95 with -qfixed" ) set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qfixed" CACHE STRING "Flags for Fortran compiler." FORCE ) endif() # HP Fortran elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "HP" ) if( "${CMAKE_Fortran_FLAGS}" MATCHES "\\+fp_exception" ) set( FPE_EXIT TRUE ) endif() if( NOT ("${CMAKE_Fortran_FLAGS}" MATCHES "\\+fltconst_strict") ) message( STATUS "Enabling strict float conversion with +fltconst_strict" ) set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} +fltconst_strict" CACHE STRING "Flags for Fortran compiler." FORCE ) endif() # Most versions of cmake don't have good default options for the HP compiler set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -g" CACHE STRING "Flags used by the compiler during debug builds" FORCE ) set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_MINSIZEREL} +Osize" CACHE STRING "Flags used by the compiler during release minsize builds" FORCE ) set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_RELEASE} +O2" CACHE STRING "Flags used by the compiler during release builds" FORCE ) set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_RELWITHDEBINFO} +O2 -g" CACHE STRING "Flags used by the compiler during release with debug info builds" FORCE ) else() endif() if( "${CMAKE_Fortran_FLAGS_RELEASE}" MATCHES "O[3-9]" ) message( STATUS "Reducing RELEASE optimization level to O2" ) string( REGEX REPLACE "O[3-9]" "O2" CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE}" ) set( CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE}" CACHE STRING "Flags used by the compiler during release builds" FORCE ) endif() if( FPE_EXIT ) message( FATAL_ERROR "Floating Point Exception (FPE) trap handlers are currently explicitly enabled in the compiler flags. BLACS is designed to check for and handle these cases internally and enabling these traps will likely cause BLACS to crash. Please re-configure with floating point exception trapping disabled." ) endif() endmacro() scalapack-2.0.2/CMAKE/CTestCustom.cmake.in000644 000766 000024 00000002723 11656312637 020373 0ustar00juliestaff000000 000000 # # For further details regarding this file, # see http://www.vtk.org/Wiki/CMake_Testing_With_CTest#Customizing_CTest # SET(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE 0) SET(CTEST_CUSTOM_MAXIMUM_FAILED_TEST_OUTPUT_SIZE 0) SET(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS 500) SET(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS 500) # Files to explicitly exclude from code coverage SET(CTEST_CUSTOM_COVERAGE_EXCLUDE ${CTEST_CUSTOM_COVERAGE_EXCLUDE} # Exclude the testing code itself from code coverage "/TESTING/" ) # Warnings to explicitly ignore SET(CTEST_CUSTOM_WARNING_EXCEPTION ${CTEST_CUSTOM_WARNING_EXCEPTION} # Common warning when linking ATLAS built with GNU Fortran 4.1 and building # with GNU Fortran 4.4. It can be safely ignored. "libgfortran.*may conflict with libgfortran" # Harmless warning often seen on IRIX "WARNING 84 : .*libm.* is not used for resolving any symbol" # Warnings caused by sun compilers when building code to only run on your # native platform "xarch=native on this architecture implies -xarch=.*which generates code that does not run" # Harmless warnings from the Intel compiler on Windows "ipo: warning #11010: file format not recognized for .*\\.exe\\.embed\\.manifest\\.res" "LINK : warning LNK4224: /INCREMENTAL:YES is no longer supported; ignored" # Warnings caused by string truncation in the test code. The truncation is # intentional "Character string truncated to length 1 on assignment" ) scalapack-2.0.2/CMAKE/FortranMangling.cmake000644 000766 000024 00000004415 11704122354 020626 0ustar00juliestaff000000 000000 # Macro that defines variables describing the Fortran name mangling # convention # # Sets the following outputs on success: # # INTFACE # Add_ # NoChange # f77IsF2C # UpCase # FUNCTION(COMPILE RESULT) MESSAGE(STATUS "=========") MESSAGE(STATUS "Compiling and Building BLACS INSTALL Testing to set correct variables") # Configure: EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/BLACS/INSTALL/ RESULT_VARIABLE RESVAR OUTPUT_VARIABLE LOG1 ERROR_VARIABLE LOG1 ) if(RESVAR EQUAL 0) MESSAGE(STATUS "Configure in the INSTALL directory successful") else() MESSAGE(FATAL_ERROR " Configure in the BLACS INSTALL directory FAILED") MESSAGE(FATAL_ERROR " Output Build:\n ${LOG1}") endif() # Build: EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build ${PROJECT_SOURCE_DIR}/BLACS/INSTALL/ RESULT_VARIABLE RESVAR OUTPUT_VARIABLE LOG2 ERROR_VARIABLE LOG2 ) if(RESVAR EQUAL 0) MESSAGE(STATUS "Build in the BLACS INSTALL directory successful") else() MESSAGE(FATAL_ERROR " Build in the BLACS INSTALL directory FAILED") MESSAGE(FATAL_ERROR " Output Build:\n ${LOG2}") endif() # Clean up: FILE(REMOVE_RECURSE ${PROJECT_SOURCE_DIR}/BLACS/INSTALL/CMakeCache.txt) FILE(REMOVE_RECURSE ${PROJECT_SOURCE_DIR}/BLACS/INSTALL/CMakeFiles ) ENDFUNCTION() macro(FORTRAN_MANGLING CDEFS) MESSAGE(STATUS "=========") MESSAGE(STATUS "Testing FORTRAN_MANGLING") execute_process ( COMMAND ${PROJECT_SOURCE_DIR}/BLACS/INSTALL/xintface RESULT_VARIABLE xintface_RES OUTPUT_VARIABLE xintface_OUT ERROR_VARIABLE xintface_ERR) # MESSAGE(STATUS "FORTRAN MANGLING:RUN \n${xintface_OUT}") if (xintface_RES EQUAL 0) STRING(REPLACE "\n" "" xintface_OUT "${xintface_OUT}") MESSAGE(STATUS "CDEFS set to ${xintface_OUT}") SET(CDEFS ${xintface_OUT} CACHE STRING "Fortran Mangling" FORCE) else() MESSAGE(FATAL_ERROR "FORTRAN_MANGLING:ERROR ${xintface_ERR}") endif() endmacro(FORTRAN_MANGLING) scalapack-2.0.2/CMAKE/scalapack-config-build.cmake.in000644 000766 000024 00000000072 11656312637 022433 0ustar00juliestaff000000 000000 include("@SCALAPACK_BINARY_DIR@/scalapack-targets.cmake") scalapack-2.0.2/CMAKE/scalapack-config-install.cmake.in000644 000766 000024 00000000162 11656312637 023002 0ustar00juliestaff000000 000000 get_filename_component(_SELF_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) include(${_SELF_DIR}/scalapack-targets.cmake) scalapack-2.0.2/CMAKE/scalapack-config-version.cmake.in000644 000766 000024 00000000405 11656312637 023021 0ustar00juliestaff000000 000000 set(PACKAGE_VERSION "@SCALAPACK_VERSION@") if(NOT ${PACKAGE_FIND_VERSION} VERSION_GREATER ${PACKAGE_VERSION}) set(PACKAGE_VERSION_COMPATIBLE 1) if(${PACKAGE_FIND_VERSION} VERSION_EQUAL ${PACKAGE_VERSION}) set(PACKAGE_VERSION_EXACT 1) endif() endif() scalapack-2.0.2/BLACS/CMakeLists.txt000644 000766 000024 00000000131 11656312637 017302 0ustar00juliestaff000000 000000 add_subdirectory(SRC) if(BUILD_TESTING) add_subdirectory(TESTING) endif(BUILD_TESTING) scalapack-2.0.2/BLACS/INSTALL/000755 000766 000024 00000000000 11750301553 015643 5ustar00juliestaff000000 000000 scalapack-2.0.2/BLACS/Makefile000644 000766 000024 00000000212 11703615041 016166 0ustar00juliestaff000000 000000 all : lib tester clean: ( cd TESTING ; make clean ) ( cd SRC ; make clean ) tester : ( cd TESTING ; make ) lib : ( cd SRC ; make ) scalapack-2.0.2/BLACS/SRC/000755 000766 000024 00000000000 11750301555 015166 5ustar00juliestaff000000 000000 scalapack-2.0.2/BLACS/TESTING/000755 000766 000024 00000000000 11750301555 015654 5ustar00juliestaff000000 000000 scalapack-2.0.2/BLACS/TESTING/blacstest.f000644 000766 000024 00002733051 11645634736 020037 0ustar00juliestaff000000 000000 PROGRAM BLACSTEST * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * Purpose * ======= * This is the driver for the BLACS test suite. * * Arguments * ========= * None. Input is done via the data files indicated below. * * Input Files * =========== * The following input files must reside in the current working * directory: * * bt.dat -- input parameters for the test run as a whole * sdrv.dat -- input parameters for point-to-point testing * bsbr.dat -- input parameters for broadcast testing * comb.dat -- input parameters for combine testing * * Output Files * ============ * Test results are generated and sent to output file as * specified by the user in bt.dat. * * =================================================================== * * .. Parameters .. INTEGER CMEMSIZ, MEMELTS PARAMETER( MEMELTS = 250000 ) PARAMETER( CMEMSIZ = 10000 ) * .. * .. External Functions .. LOGICAL ALLPASS INTEGER IBTMSGID, IBTSIZEOF REAL SBTEPS DOUBLE PRECISION DBTEPS EXTERNAL ALLPASS, IBTMSGID, SBTEPS, DBTEPS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_PINFO, BTSETUP, RDBTIN * .. * .. Local Scalars .. INTEGER I, IAM, NNODES, VERB, OUTNUM, MEMLEN, NPREC, ISIZE, DSIZE LOGICAL TESTSDRV, TESTBSBR, TESTCOMB, TESTAUX * .. * .. Local Arrays .. CHARACTER*1 CMEM(CMEMSIZ), PREC(9) INTEGER IPREC(9), ITMP(2) DOUBLE PRECISION MEM(MEMELTS) * .. * .. Executable Statements .. * ISIZE = IBTSIZEOF('I') DSIZE = IBTSIZEOF('D') * * Get initial process information, and initialize message IDs * CALL BLACS_PINFO( IAM, NNODES ) ITMP(1) = IBTMSGID() * * Call BLACS_GRIDINIT so BLACS set up some system stuff: should * make it possible for the user to print, read input files, etc. * IF( NNODES .GT. 0 ) THEN CALL BLACS_GET( 0, 0, ITMP ) CALL BLACS_GRIDINIT(ITMP, 'c', 1, NNODES) CALL BLACS_GRIDEXIT(ITMP) END IF * * Read in what tests to do * IF( IAM .EQ. 0 ) $ CALL RDBTIN( TESTSDRV, TESTBSBR, TESTCOMB, TESTAUX, NPREC, $ PREC, VERB, OUTNUM ) * MEMLEN = (MEMELTS * DSIZE) / ISIZE * * Get process info for communication, and create virtual machine * if necessary * CALL BTSETUP( MEM, MEMLEN, CMEM, CMEMSIZ, OUTNUM, TESTSDRV, $ TESTBSBR, TESTCOMB, TESTAUX, IAM, NNODES ) * * Send out RDBTIN information * IF( IAM .EQ. 0 ) THEN * * Store test info in back of precision array * ITMP(1) = NPREC ITMP(2) = VERB CALL BTSEND( 3, 2, ITMP, -1, IBTMSGID() ) DO 10 I = 1, 9 IPREC(I) = 0 10 CONTINUE DO 20 I = 1, NPREC IF( PREC(I) .EQ. 'I' ) THEN IPREC(I) = 1 ELSE IF( PREC(I) .EQ. 'S' ) THEN IPREC(I) = 2 ELSE IF( PREC(I) .EQ. 'D' ) THEN IPREC(I) = 3 ELSE IF( PREC(I) .EQ. 'C' ) THEN IPREC(I) = 4 ELSE IF( PREC(I) .EQ. 'Z' ) THEN IPREC(I) = 5 END IF 20 CONTINUE IF( TESTSDRV ) IPREC(6) = 1 IF( TESTBSBR ) IPREC(7) = 1 IF( TESTCOMB ) IPREC(8) = 1 IF( TESTAUX ) IPREC(9) = 1 CALL BTSEND( 3, 9, IPREC, -1, IBTMSGID()+1 ) ELSE CALL BTRECV( 3, 2, ITMP, 0, IBTMSGID() ) NPREC = ITMP(1) VERB = ITMP(2) CALL BTRECV( 3, 9, IPREC, 0, IBTMSGID()+1 ) DO 30 I = 1, NPREC IF( IPREC(I) .EQ. 1 ) THEN PREC(I) = 'I' ELSE IF( IPREC(I) .EQ. 2 ) THEN PREC(I) = 'S' ELSE IF( IPREC(I) .EQ. 3 ) THEN PREC(I) = 'D' ELSE IF( IPREC(I) .EQ. 4 ) THEN PREC(I) = 'C' ELSE IF( IPREC(I) .EQ. 5 ) THEN PREC(I) = 'Z' END IF 30 CONTINUE TESTSDRV = ( IPREC(6) .EQ. 1 ) TESTBSBR = ( IPREC(7) .EQ. 1 ) TESTCOMB = ( IPREC(8) .EQ. 1 ) TESTAUX = ( IPREC(9) .EQ. 1 ) ENDIF * IF( TESTSDRV .OR. TESTBSBR .OR. TESTCOMB .OR. TESTAUX ) THEN * * Find maximal machine epsilon for single and double precision * ITMP(1) = INT( SBTEPS() ) ITMP(1) = INT( DBTEPS() ) * CALL RUNTESTS( MEM, MEMLEN, CMEM, CMEMSIZ, PREC, NPREC, OUTNUM, $ VERB, TESTSDRV, TESTBSBR, TESTCOMB, TESTAUX ) * END IF * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM,1000) WRITE(OUTNUM,1000) IF( ALLPASS(.TRUE.) ) THEN WRITE(OUTNUM,2000) 'NO' ELSE WRITE(OUTNUM,2000) ' ' END IF WRITE(OUTNUM,1000) WRITE(OUTNUM,1000) IF( OUTNUM.NE.0 .AND. OUTNUM.NE.6 ) CLOSE(OUTNUM) ENDIF * CALL BLACS_EXIT(0) 1000 FORMAT('=======================================') 2000 FORMAT('THERE WERE ',A2,' FAILURES IN THIS TEST RUN') STOP * * End BLACSTESTER * END * SUBROUTINE RUNTESTS( MEM, MEMLEN, CMEM, CMEMLEN, PREC, NPREC, $ OUTNUM, VERB, TESTSDRV, TESTBSBR, TESTCOMB, $ TESTAUX ) * * .. Scalar Arguments .. INTEGER MEMLEN, CMEMLEN, NPREC, OUTNUM, VERB, IAM, NNODES LOGICAL TESTSDRV, TESTBSBR, TESTCOMB, TESTAUX * .. * .. Array Arguments .. CHARACTER*1 CMEM(CMEMLEN), PREC(NPREC) INTEGER MEM(MEMLEN) * .. * .. External Functions .. INTEGER IBTNPROCS, IBTMYPROC, IBTMSGID, IBTSIZEOF, SAFEINDEX EXTERNAL IBTNPROCS, IBTMYPROC, IBTMSGID, IBTSIZEOF, SAFEINDEX * .. * .. External Subroutines .. EXTERNAL CSDRVTEST, DSDRVTEST, ISDRVTEST, SSDRVTEST, ZSDRVTEST EXTERNAL CBSBRTEST, DBSBRTEST, IBSBRTEST, SBSBRTEST, ZBSBRTEST EXTERNAL ISUMTEST, SSUMTEST, DSUMTEST, CSUMTEST, ZSUMTEST EXTERNAL IAMXTEST, SAMXTEST, DAMXTEST, CAMXTEST, ZAMXTEST EXTERNAL IAMNTEST, SAMNTEST, DAMNTEST, CAMNTEST, ZAMNTEST EXTERNAL AUXTEST, BTSEND, BTRECV, BTINFO * .. * .. Local Scalars .. INTEGER NSCOPE, NOP, NTOP, NSHAPE, NMAT, NSRC, NDEST, NGRID INTEGER TREP, TCOH, OPPTR, SCOPEPTR, TOPPTR, UPLOPTR, DIAGPTR INTEGER MPTR, NPTR, LDSPTR, LDDPTR, LDIPTR INTEGER RSRCPTR, CSRCPTR, RDESTPTR, CDESTPTR, PPTR, QPTR INTEGER ISEEDPTR, RAPTR, CAPTR, CTXTPTR, WORKPTR, WORKLEN INTEGER MEMUSED, CMEMUSED, I, J, K INTEGER ISIZE, SSIZE, DSIZE, CSIZE, ZSIZE * .. * .. Local Arrays .. INTEGER ITMP(4) * .. * .. Executable Statements .. * IAM = IBTMYPROC() NNODES = IBTNPROCS() ISIZE = IBTSIZEOF('I') SSIZE = IBTSIZEOF('S') DSIZE = IBTSIZEOF('D') CSIZE = IBTSIZEOF('C') ZSIZE = IBTSIZEOF('Z') * IF( IAM.EQ.0 ) THEN CALL BLACS_GET( 0, 2, I ) WRITE(OUTNUM,3000) WRITE(OUTNUM,3000) WRITE(OUTNUM,2000) I WRITE(OUTNUM,3000) WRITE(OUTNUM,3000) END IF * IF( TESTAUX ) THEN * * Each process will make sure that BLACS_PINFO returns * the same value as BLACS_SETUP, and send a packet * to node 0 saying whether it was. * CALL BLACS_PINFO( ITMP(1), ITMP(3) ) CALL BLACS_SETUP( ITMP(2), ITMP(4) ) IF( IAM .EQ. 0 ) THEN DO 35 I = 0, NNODES-1 IF( I .NE. 0 ) $ CALL BTRECV( 3, 4, ITMP, I, IBTMSGID()+2 ) IF( ITMP(1) .NE. ITMP(2) ) $ WRITE( OUTNUM, 1000 ) ITMP(1), ITMP(2) IF( (ITMP(3).NE.ITMP(4)) .OR. (ITMP(3).NE.NNODES) ) $ WRITE( OUTNUM, 1000 ) ITMP(3), ITMP(4), NNODES 35 CONTINUE ELSE CALL BTSEND( 3, 4, ITMP, 0, IBTMSGID()+2 ) ENDIF ENDIF * * Run point-to-point tests as appropriate * IF( TESTSDRV ) THEN * * Get test info * CALL BTINFO( 'SDRV', MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, $ CMEMLEN, OUTNUM, NOP, NSCOPE, TREP, TCOH, NTOP, $ NSHAPE, NMAT, NSRC, NGRID, OPPTR, SCOPEPTR, $ TOPPTR, UPLOPTR, DIAGPTR, MPTR, NPTR, LDSPTR, $ LDDPTR, LDIPTR, RSRCPTR, CSRCPTR, RDESTPTR, $ CDESTPTR, PPTR, QPTR ) * * iseedptr used as tests passed/failed array, so it must * be of size NTESTS -- It's not used unless VERB < 2 * CTXTPTR = MEMUSED + 1 ISEEDPTR = CTXTPTR + NGRID MEMUSED = ISEEDPTR - 1 IF( VERB .LT. 2 ) $ MEMUSED = MEMUSED + NSHAPE * NMAT * NSRC * NGRID * CALL MAKEGRIDS( MEM(CTXTPTR), OUTNUM, NGRID, MEM(PPTR), $ MEM(QPTR) ) * * Call individual tests as appropriate. * DO 10 I = 1, NPREC IF( PREC(I) .EQ. 'I' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, ISIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / ISIZE CALL ISDRVTEST(OUTNUM, VERB, NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ MEM(RDESTPTR), MEM(CDESTPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) * ELSE IF( PREC(I) .EQ. 'S' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, SSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / SSIZE CALL SSDRVTEST(OUTNUM, VERB, NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ MEM(RDESTPTR), MEM(CDESTPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) * ELSE IF( PREC(I) .EQ. 'D' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, DSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / DSIZE CALL DSDRVTEST(OUTNUM, VERB, NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ MEM(RDESTPTR), MEM(CDESTPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) * ELSE IF( PREC(I) .EQ. 'C' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, CSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / CSIZE CALL CSDRVTEST(OUTNUM, VERB, NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ MEM(RDESTPTR), MEM(CDESTPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) * ELSE IF( PREC(I) .EQ. 'Z' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, ZSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / ZSIZE CALL ZSDRVTEST(OUTNUM, VERB, NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ MEM(RDESTPTR), MEM(CDESTPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) END IF 10 CONTINUE CALL FREEGRIDS( NGRID, MEM(CTXTPTR) ) END IF * IF( TESTBSBR ) THEN * * Get test info * CALL BTINFO( 'BSBR', MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, $ CMEMLEN, OUTNUM, NOP, NSCOPE, TREP, TCOH, NTOP, $ NSHAPE, NMAT, NSRC, NGRID, OPPTR, SCOPEPTR, $ TOPPTR, UPLOPTR, DIAGPTR, MPTR, NPTR, LDSPTR, $ LDDPTR, LDIPTR, RSRCPTR, CSRCPTR, RDESTPTR, $ CDESTPTR, PPTR, QPTR ) * * iseedptr used as tests passed/failed array, so it must * be of size NTESTS -- It's not used unless VERB < 2 * CTXTPTR = MEMUSED + 1 ISEEDPTR = CTXTPTR + NGRID MEMUSED = ISEEDPTR - 1 IF( VERB .LT. 2 ) $ MEMUSED = MEMUSED + NSCOPE*NTOP*NSHAPE*NMAT*NSRC*NGRID * CALL MAKEGRIDS( MEM(CTXTPTR), OUTNUM, NGRID, MEM(PPTR), $ MEM(QPTR) ) * * Call individual tests as appropriate. * DO 20 I = 1, NPREC IF( PREC(I) .EQ. 'I' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, ISIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / ISIZE CALL IBSBRTEST(OUTNUM, VERB, NSCOPE, CMEM(SCOPEPTR), $ NTOP, CMEM(TOPPTR), NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) * ELSE IF( PREC(I) .EQ. 'S' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, SSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / SSIZE CALL SBSBRTEST(OUTNUM, VERB, NSCOPE, CMEM(SCOPEPTR), $ NTOP, CMEM(TOPPTR), NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) * ELSE IF( PREC(I) .EQ. 'D' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, DSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / DSIZE CALL DBSBRTEST(OUTNUM, VERB, NSCOPE, CMEM(SCOPEPTR), $ NTOP, CMEM(TOPPTR), NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) * ELSE IF( PREC(I) .EQ. 'C' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, CSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / CSIZE CALL CBSBRTEST(OUTNUM, VERB, NSCOPE, CMEM(SCOPEPTR), $ NTOP, CMEM(TOPPTR), NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) * ELSE IF( PREC(I) .EQ. 'Z' ) THEN * WORKPTR = SAFEINDEX(MEMUSED + 1, ISIZE, ZSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / ZSIZE CALL ZBSBRTEST(OUTNUM, VERB, NSCOPE, CMEM(SCOPEPTR), $ NTOP, CMEM(TOPPTR), NSHAPE, CMEM(UPLOPTR), $ CMEM(DIAGPTR), NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), $ NSRC, MEM(RSRCPTR), MEM(CSRCPTR), $ NGRID, MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), WORKLEN) * END IF * 20 CONTINUE CALL FREEGRIDS( NGRID, MEM(CTXTPTR) ) END IF IF( TESTCOMB ) THEN * * Get test info * CALL BTINFO( 'COMB', MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, $ CMEMLEN, OUTNUM, NOP, NSCOPE, TREP, TCOH, NTOP, $ NSHAPE, NMAT, NDEST, NGRID, OPPTR, SCOPEPTR, $ TOPPTR, UPLOPTR, DIAGPTR, MPTR, NPTR, LDSPTR, $ LDDPTR, LDIPTR, RSRCPTR, CSRCPTR, RDESTPTR, $ CDESTPTR, PPTR, QPTR ) CTXTPTR = MEMUSED + 1 MEMUSED = CTXTPTR + NGRID - 1 * * Find space required by RA and CA arrays * K = 0 DO 40 J = 0, NOP-1 IF( CMEM(OPPTR+J).EQ.'>' .OR. CMEM(OPPTR+J).EQ.'<' ) THEN DO 30 I = 0, NMAT * * NOTE: here we assume ipre+ipost = 4*M * K = MAX0( K, 4*MEM(MPTR+I) ) IF ( MEM(LDIPTR+I) .NE. -1 ) $ K = MAX0( K, MEM(NPTR+I)*MEM(LDIPTR+I) + $ 4*MEM(MPTR+I) ) 30 CONTINUE END IF 40 CONTINUE RAPTR = MEMUSED + 1 CAPTR = RAPTR + K * * iseed array also used as tests passed/failed array, so it must * be of size MAX( 4*NNODES, NTESTS ) * ISEEDPTR = CAPTR + K I = 0 IF( VERB.LT.2 ) I = NSCOPE * NTOP * NMAT * NDEST * NGRID MEMUSED = ISEEDPTR + MAX( 4*NNODES, I ) * CALL MAKEGRIDS( MEM(CTXTPTR), OUTNUM, NGRID, MEM(PPTR), $ MEM(QPTR) ) * * Call individual tests as appropriate. * DO 60 I = 1, NPREC DO 50 J = 0, NOP-1 IF( PREC(I) .EQ. 'I' ) THEN WORKPTR = SAFEINDEX(MEMUSED, ISIZE, ISIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / ISIZE IF( CMEM(OPPTR+J) .EQ. '+' ) THEN CALL ISUMTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), NDEST, $ MEM(RDESTPTR), MEM(CDESTPTR), NGRID, $ MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), $ WORKLEN) ELSE IF( CMEM(OPPTR+J) .EQ. '>' ) THEN CALL IAMXTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), $ MEM(LDIPTR), NDEST, MEM(RDESTPTR), $ MEM(CDESTPTR), NGRID, MEM(CTXTPTR), $ MEM(PPTR), MEM(QPTR), MEM(ISEEDPTR), $ MEM(RAPTR), MEM(CAPTR), K, $ MEM(WORKPTR), WORKLEN) ELSE IF( CMEM(OPPTR+J) .EQ. '<' ) THEN CALL IAMNTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), $ MEM(LDIPTR), NDEST, MEM(RDESTPTR), $ MEM(CDESTPTR), NGRID, MEM(CTXTPTR), $ MEM(PPTR), MEM(QPTR), MEM(ISEEDPTR), $ MEM(RAPTR), MEM(CAPTR), K, $ MEM(WORKPTR), WORKLEN) END IF ELSE IF( PREC(I) .EQ. 'S' ) THEN WORKPTR = SAFEINDEX(MEMUSED, ISIZE, SSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / SSIZE IF( CMEM(OPPTR+J) .EQ. '+' ) THEN CALL SSUMTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), NDEST, $ MEM(RDESTPTR), MEM(CDESTPTR), NGRID, $ MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), $ WORKLEN) ELSE IF( CMEM(OPPTR+J) .EQ. '>' ) THEN CALL SAMXTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), $ MEM(LDIPTR), NDEST, MEM(RDESTPTR), $ MEM(CDESTPTR), NGRID, MEM(CTXTPTR), $ MEM(PPTR), MEM(QPTR), MEM(ISEEDPTR), $ MEM(RAPTR), MEM(CAPTR), K, $ MEM(WORKPTR), WORKLEN) ELSE IF( CMEM(OPPTR+J) .EQ. '<' ) THEN CALL SAMNTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), $ MEM(LDIPTR), NDEST, MEM(RDESTPTR), $ MEM(CDESTPTR), NGRID, MEM(CTXTPTR), $ MEM(PPTR), MEM(QPTR), MEM(ISEEDPTR), $ MEM(RAPTR), MEM(CAPTR), K, $ MEM(WORKPTR), WORKLEN) END IF ELSE IF( PREC(I) .EQ. 'C' ) THEN WORKPTR = SAFEINDEX(MEMUSED, ISIZE, CSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / CSIZE IF( CMEM(OPPTR+J) .EQ. '+' ) THEN CALL CSUMTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), NDEST, $ MEM(RDESTPTR), MEM(CDESTPTR), NGRID, $ MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), $ WORKLEN) ELSE IF( CMEM(OPPTR+J) .EQ. '>' ) THEN CALL CAMXTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), $ MEM(LDIPTR), NDEST, MEM(RDESTPTR), $ MEM(CDESTPTR), NGRID, MEM(CTXTPTR), $ MEM(PPTR), MEM(QPTR), MEM(ISEEDPTR), $ MEM(RAPTR), MEM(CAPTR), K, $ MEM(WORKPTR), WORKLEN) ELSE IF( CMEM(OPPTR+J) .EQ. '<' ) THEN CALL CAMNTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), $ MEM(LDIPTR), NDEST, MEM(RDESTPTR), $ MEM(CDESTPTR), NGRID, MEM(CTXTPTR), $ MEM(PPTR), MEM(QPTR), MEM(ISEEDPTR), $ MEM(RAPTR), MEM(CAPTR), K, $ MEM(WORKPTR), WORKLEN) END IF ELSE IF( PREC(I) .EQ. 'Z' ) THEN WORKPTR = SAFEINDEX(MEMUSED, ISIZE, ZSIZE) WORKLEN = ( DSIZE * (MEMLEN - WORKPTR + 1) ) / ZSIZE IF( CMEM(OPPTR+J) .EQ. '+' ) THEN CALL ZSUMTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), NDEST, $ MEM(RDESTPTR), MEM(CDESTPTR), NGRID, $ MEM(CTXTPTR), MEM(PPTR), MEM(QPTR), $ MEM(ISEEDPTR), MEM(WORKPTR), $ WORKLEN) ELSE IF( CMEM(OPPTR+J) .EQ. '>' ) THEN CALL ZAMXTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), $ MEM(LDIPTR), NDEST, MEM(RDESTPTR), $ MEM(CDESTPTR), NGRID, MEM(CTXTPTR), $ MEM(PPTR), MEM(QPTR), MEM(ISEEDPTR), $ MEM(RAPTR), MEM(CAPTR), K, $ MEM(WORKPTR), WORKLEN) ELSE IF( CMEM(OPPTR+J) .EQ. '<' ) THEN CALL ZAMNTEST(OUTNUM, VERB, TREP, TCOH, NSCOPE, $ CMEM(SCOPEPTR), NTOP, CMEM(TOPPTR), $ NMAT, MEM(MPTR), MEM(NPTR), $ MEM(LDSPTR), MEM(LDDPTR), $ MEM(LDIPTR), NDEST, MEM(RDESTPTR), $ MEM(CDESTPTR), NGRID, MEM(CTXTPTR), $ MEM(PPTR), MEM(QPTR), MEM(ISEEDPTR), $ MEM(RAPTR), MEM(CAPTR), K, $ MEM(WORKPTR), WORKLEN) END IF END IF 50 CONTINUE 60 CONTINUE CALL FREEGRIDS( NGRID, MEM(CTXTPTR) ) END IF * IF( TESTAUX ) THEN CALL AUXTEST( OUTNUM, MEM, MEMLEN ) END IF * 1000 FORMAT('AUXILIARY ERROR - IAM MISMATCH: BLACS_PINFO RETURNED',I4, $ /,' BLACS_SETUP RETURNED',I4,'.') 1500 FORMAT('AUXILIARY ERROR - NPROC MISMATCH: BLACS_PINFO RETURNED', $ I4,/,' BLACS_SETUP RETURNED',I4,', TESTER THINKS',I4,'.') 2000 FORMAT('BEGINNING BLACS TESTING, BLACS DEBUG LEVEL =',I2) 3000 FORMAT('==============================================') RETURN * * End of RUNTESTS * END * SUBROUTINE MAKEGRIDS( CONTEXTS, OUTNUM, NGRIDS, P, Q ) INTEGER NGRIDS, OUTNUM INTEGER CONTEXTS(NGRIDS), P(NGRIDS), Q(NGRIDS) INTEGER IBTMYPROC EXTERNAL IBTMYPROC INTEGER NPROW, NPCOL, MYROW, MYCOL, I * DO 10 I = 1, NGRIDS CALL BLACS_GET( 0, 0, CONTEXTS(I) ) CALL BLACS_GRIDINIT( CONTEXTS(I), 'r', P(I), Q(I) ) 10 CONTINUE * DO 20 I = 1, NGRIDS CALL BLACS_GRIDINFO( CONTEXTS(I), NPROW, NPCOL, MYROW, MYCOL ) IF( NPROW .GT. 0 ) THEN IF( NPROW.NE.P(I) .OR. NPCOL.NE.Q(I) ) THEN IF( IBTMYPROC() .NE. 0 ) OUTNUM = 6 WRITE(OUTNUM,1000) I IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) CALL BLACS_ABORT( CONTEXTS(I), -1 ) END IF END IF 20 CONTINUE * 1000 FORMAT('Grid creation error trying to create grid #',I3) RETURN END * SUBROUTINE FREEGRIDS( NGRIDS, CONTEXTS ) INTEGER NGRIDS INTEGER CONTEXTS(NGRIDS) INTEGER I, NPROW, NPCOL, MYROW, MYCOL * DO 10 I = 1, NGRIDS CALL BLACS_GRIDINFO( CONTEXTS(I), NPROW, NPCOL, MYROW, MYCOL ) IF( MYROW.LT.NPROW .AND. MYCOL.LT.NPCOL ) $ CALL BLACS_GRIDEXIT( CONTEXTS(I) ) 10 CONTINUE RETURN END * SUBROUTINE AUXTEST( OUTNUM, MEM, MEMLEN ) * * .. Scalar Arguments .. INTEGER OUTNUM, MEMLEN * .. * .. Array Arguments .. INTEGER MEM(MEMLEN) * .. * .. External Functions .. LOGICAL ALLPASS INTEGER IBTMYPROC, IBTMSGID, BLACS_PNUM DOUBLE PRECISION DWALLTIME00 EXTERNAL ALLPASS, IBTMYPROC, IBTMSGID, BLACS_PNUM EXTERNAL DWALLTIME00 * .. * .. External Subroutines .. EXTERNAL BLACS_PINFO, BLACS_GRIDINIT, BLACS_GRIDMAP EXTERNAL BLACS_FREEBUFF, BLACS_GRIDEXIT, BLACS_ABORT EXTERNAL BLACS_GRIDINFO, BLACS_PCOORD, BLACS_BARRIER EXTERNAL BLACS_SET * .. * .. Local Scalars .. LOGICAL AUXPASSED, PASSED, IPRINT INTEGER NPROCS, NPROW, NPCOL, MYROW, MYCOL, CTXT, CTXT2, LDA INTEGER I, J, K DOUBLE PRECISION DTIME, DEPS * .. * .. Local Arrays .. DOUBLE PRECISION START(2), STST(2), KEEP(2) * .. * .. Executable Statements .. * IPRINT = ( IBTMYPROC() .EQ. 0 ) IF( IPRINT ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM,1000) WRITE(OUTNUM,*) ' ' END IF CALL BLACS_PINFO( I, NPROCS ) IF( NPROCS .LT. 2 ) THEN IF( IPRINT ) $ WRITE(OUTNUM,*) 'NOT ENOUGH PROCESSES TO PERFORM AUXTESTS' RETURN END IF * * Make sure BLACS_PNUM and BLACS_PCOORD are inverses of each other * IF( IPRINT ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM,*) 'RUNNING BLACS_PNUM/BLACS_PCOORD TEST' END IF PASSED = .TRUE. NPROCS = NPROCS - MOD(NPROCS,2) CALL BLACS_GET( 0, 0, CTXT ) CALL BLACS_GRIDINIT( CTXT, 'r', 1, NPROCS ) CALL BLACS_GRIDINFO( CTXT, NPROW, NPCOL, MYROW, MYCOL ) IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) GOTO 100 DO 10 I = 1, NPROCS K = BLACS_PNUM( CTXT, 0, I-1 ) CALL BLACS_PCOORD( CTXT, BLACS_PNUM( CTXT, 0, I-1 ), J, K ) IF( PASSED ) PASSED = ( J.EQ.0 .AND. K.EQ.I-1 ) 10 CONTINUE K = 1 IF( PASSED ) K = 0 CALL IGSUM2D( CTXT, 'a', ' ', 1, 1, K, 1, -1, 0 ) PASSED = ( K .EQ. 0 ) AUXPASSED = PASSED IF( IPRINT ) THEN IF( PASSED ) THEN WRITE(OUTNUM,*) 'PASSED BLACS_PNUM/BLACS_PCOORD TEST' ELSE WRITE(OUTNUM,*) 'FAILED BLACS_PNUM/BLACS_PCOORD TEST' END IF WRITE(OUTNUM,*) ' ' END IF * * Test to see if DGSUM2D is repeatable when repeatability flag is set * Skip test if DGSUM2D is repeatable when repeatability flag is not set * NOTE: do not change the EPS calculation loop; it is figured in this * strange way so that it ports across platforms * IF( IPRINT ) WRITE(OUTNUM,*) 'RUNNING REPEATABLE SUM TEST' J = 0 12 CONTINUE PASSED = .TRUE. START(1) = 1.0D0 15 CONTINUE DEPS = START(1) START(1) = START(1) / 2.0D0 STST(1) = 1.0D0 + START(1) IF (STST(1) .NE. 1.0D0) GOTO 15 * START(1) = DEPS / DBLE(NPCOL-1) IF (MYCOL .EQ. 3) START(1) = 1.0D0 START(2) = 7.00005D0 * NPCOL STST(1) = START(1) STST(2) = START(2) CALL BLACS_SET(CTXT, 15, J) CALL DGSUM2D(CTXT, 'a', 'f', 2, 1, STST, 2, -1, 0) KEEP(1) = STST(1) KEEP(2) = STST(2) DO 30 I = 1, 3 * * Have a different guy waste time so he enters combine last * IF (MYCOL .EQ. I) THEN DTIME = DWALLTIME00() 20 CONTINUE IF (DWALLTIME00() - DTIME .LT. 2.0D0) GOTO 20 END IF STST(1) = START(1) STST(2) = START(2) CALL DGSUM2D(CTXT, 'a', 'f', 2, 1, STST, 2, -1, 0) IF ( (KEEP(1).NE.STST(1)) .OR. (KEEP(2).NE.STST(2)) ) $ PASSED = .FALSE. 30 CONTINUE K = 1 IF (PASSED) K = 0 CALL IGSUM2D( CTXT, 'a', ' ', 1, 1, K, 1, -1, 0 ) PASSED = (K .EQ. 0) IF (J .EQ. 0) THEN IF (.NOT.PASSED) THEN J = 1 GOTO 12 ELSE IF( IPRINT ) THEN WRITE(OUTNUM,*) 'SKIPPED REPEATABLE SUM TEST' WRITE(OUTNUM,*) ' ' END IF END IF * IF (J .EQ. 1) THEN AUXPASSED = AUXPASSED .AND. PASSED IF( IPRINT ) THEN IF( PASSED ) THEN WRITE(OUTNUM,*) 'PASSED REPEATABLE SUM TEST' ELSE WRITE(OUTNUM,*) 'FAILED REPEATABLE SUM TEST' END IF WRITE(OUTNUM,*) ' ' END IF END IF * * Test BLACS_GRIDMAP: force a column major ordering, starting at an * arbitrary processor * PASSED = .TRUE. IF( IPRINT ) WRITE(OUTNUM,*) 'RUNNING BLACS_GRIDMAP TEST' NPROW = 2 NPCOL = NPROCS / NPROW DO 40 I = 0, NPROCS-1 MEM(I+1) = BLACS_PNUM( CTXT, 0, MOD(I+NPCOL, NPROCS) ) 40 CONTINUE CALL BLACS_GET( CTXT, 10, CTXT2 ) CALL BLACS_GRIDMAP( CTXT2, MEM, NPROW, NPROW, NPCOL ) CALL BLACS_GRIDINFO( CTXT2, NPROW, NPCOL, MYROW, MYCOL ) PASSED = ( NPROW.EQ.2 .AND. NPCOL.EQ.NPROCS/2 ) * * Fan in pids for final check: Note we assume SD/RV working * IF( PASSED ) THEN K = BLACS_PNUM( CTXT2, MYROW, MYCOL ) IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN DO 60 J = 0, NPCOL-1 DO 50 I = 0, NPROW-1 IF( I.NE.0 .OR. J.NE.0 ) $ CALL IGERV2D( CTXT2, 1, 1, K, 1, I, J ) IF ( PASSED ) $ PASSED = ( K .EQ. BLACS_PNUM(CTXT2, I, J) ) 50 CONTINUE 60 CONTINUE ELSE CALL IGESD2D( CTXT2, 1, 1, K, 1, 0, 0 ) END IF END IF K = 1 IF ( PASSED ) K = 0 CALL IGSUM2D( CTXT, 'a', ' ', 1, 1, K, 1, -1, 0 ) PASSED = ( K .EQ. 0 ) AUXPASSED = AUXPASSED .AND. PASSED IF( IPRINT ) THEN IF( PASSED ) THEN WRITE(OUTNUM,*) 'PASSED BLACS_GRIDMAP TEST' ELSE WRITE(OUTNUM,*) 'FAILED BLACS_GRIDMAP TEST' END IF WRITE(OUTNUM,*) ' ' END IF * IF( IPRINT ) WRITE(OUTNUM,*) 'CALL BLACS_FREEBUFF' CALL BLACS_FREEBUFF( CTXT, 0 ) CALL BLACS_FREEBUFF( CTXT, 1 ) J = 0 CALL IGSUM2D( CTXT2, 'All', ' ', 1, 1, J, 1, -1, MYCOL ) IF( IPRINT ) THEN WRITE(OUTNUM,*) 'DONE BLACS_FREEBUFF' WRITE(OUTNUM,*) ' ' END IF * * Make sure barriers don't interfere with each other * IF( IPRINT ) WRITE(OUTNUM,*) 'CALL BARRIER' CALL BLACS_BARRIER(CTXT2, 'A') CALL BLACS_BARRIER(CTXT2, 'R') CALL BLACS_BARRIER(CTXT2, 'C') CALL BLACS_BARRIER(CTXT2, 'R') CALL BLACS_BARRIER(CTXT2, 'A') CALL BLACS_BARRIER(CTXT2, 'C') CALL BLACS_BARRIER(CTXT2, 'C') CALL BLACS_BARRIER(CTXT2, 'R') CALL BLACS_BARRIER(CTXT2, 'A') J = 0 CALL IGSUM2D( CTXT2, 'All', ' ', 1, 1, J, 1, -1, MYCOL ) IF( IPRINT ) THEN WRITE(OUTNUM,*) 'DONE BARRIER' WRITE(OUTNUM,*) ' ' END IF * * Ensure contiguous sends are locally-blocking * IF( IPRINT ) THEN WRITE(OUTNUM,*) 'The following tests will hang if your BLACS'// $ ' are not locally blocking:' WRITE(OUTNUM,*) 'RUNNING LOCALLY-BLOCKING CONTIGUOUS SEND TEST' END IF K = MIN( MEMLEN, 50000 ) * * Initialize send buffer * DO 70 J = 1, K MEM(J) = 1 70 CONTINUE * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGESD2D( CTXT2, K, 1, MEM, K, NPROW-1, NPCOL-1 ) CALL IGESD2D( CTXT2, K, 1, MEM, K, NPROW-1, NPCOL-1 ) CALL IGESD2D( CTXT2, K, 1, MEM, K, NPROW-1, NPCOL-1 ) CALL IGERV2D( CTXT2, K, 1, MEM, K, NPROW-1, NPCOL-1 ) CALL IGERV2D( CTXT2, K, 1, MEM, K, NPROW-1, NPCOL-1 ) CALL IGERV2D( CTXT2, K, 1, MEM, K, NPROW-1, NPCOL-1 ) ELSE IF( MYROW.EQ.NPROW-1 .AND. MYCOL.EQ.NPCOL-1 ) THEN CALL IGESD2D( CTXT2, K, 1, MEM, K, 0, 0 ) CALL IGESD2D( CTXT2, K, 1, MEM, K, 0, 0 ) CALL IGESD2D( CTXT2, K, 1, MEM, K, 0, 0 ) CALL IGERV2D( CTXT2, K, 1, MEM, K, 0, 0 ) CALL IGERV2D( CTXT2, K, 1, MEM, K, 0, 0 ) CALL IGERV2D( CTXT2, K, 1, MEM, K, 0, 0 ) END IF J = 0 CALL IGSUM2D( CTXT2, 'All', ' ', 1, 1, J, 1, -1, MYCOL ) IF( IPRINT ) $ WRITE(OUTNUM,*) 'PASSED LOCALLY-BLOCKING CONTIGUOUS SEND TEST' * * Ensure non-contiguous sends are locally-blocking * J = 4 LDA = K / J I = MAX( 2, LDA / 4 ) IF( IPRINT ) $ WRITE(OUTNUM,*) 'RUNNING LOCALLY-BLOCKING NON-CONTIGUOUS '// $ 'SEND TEST' IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN CALL IGESD2D( CTXT2, I, J, MEM, LDA, NPROW-1, NPCOL-1 ) CALL IGESD2D( CTXT2, I, J, MEM, LDA, NPROW-1, NPCOL-1 ) CALL IGESD2D( CTXT2, I, J, MEM, LDA, NPROW-1, NPCOL-1 ) CALL IGERV2D( CTXT2, I, J, MEM, LDA, NPROW-1, NPCOL-1 ) CALL IGERV2D( CTXT2, I, J, MEM, LDA, NPROW-1, NPCOL-1 ) CALL IGERV2D( CTXT2, I, J, MEM, LDA, NPROW-1, NPCOL-1 ) ELSE IF( MYROW.EQ.NPROW-1 .AND. MYCOL.EQ.NPCOL-1 ) THEN CALL IGESD2D( CTXT2, I, J, MEM, LDA, 0, 0 ) CALL IGESD2D( CTXT2, I, J, MEM, LDA, 0, 0 ) CALL IGESD2D( CTXT2, I, J, MEM, LDA, 0, 0 ) CALL IGERV2D( CTXT2, I, J, MEM, LDA, 0, 0 ) CALL IGERV2D( CTXT2, I, J, MEM, LDA, 0, 0 ) CALL IGERV2D( CTXT2, I, J, MEM, LDA, 0, 0 ) END IF CALL IGSUM2D( CTXT2, 'All', ' ', 1, 1, J, 1, -1, MYCOL ) IF( IPRINT ) THEN WRITE(OUTNUM,*)'PASSED LOCALLY-BLOCKING NON-CONTIGUOUS '// $ 'SEND TEST' WRITE(OUTNUM,*) ' ' END IF * * Note that we already tested the message ID setting/getting in * first call to IBTMSGID() * IF( IPRINT ) WRITE(OUTNUM,*) 'RUNNING BLACS_SET/BLACS_GET TESTS' J = 0 CALL BLACS_SET( CTXT2, 11, 3 ) CALL BLACS_SET( CTXT2, 12, 2 ) CALL BLACS_GET( CTXT2, 12, I ) CALL BLACS_GET( CTXT2, 11, K ) IF( K.NE.3 ) J = J + 1 IF( I.NE.2 ) J = J + 1 CALL BLACS_SET( CTXT2, 13, 3 ) CALL BLACS_SET( CTXT2, 14, 2 ) CALL BLACS_GET( CTXT2, 14, I ) CALL BLACS_GET( CTXT2, 13, K ) IF( K.NE.3 ) J = J + 1 IF( I.NE.2 ) J = J + 1 * * See if anyone had error, and print result * CALL IGSUM2D( CTXT2, 'All', ' ', 1, 1, J, 1, -1, MYCOL ) PASSED = (J .EQ. 0) AUXPASSED = AUXPASSED .AND. PASSED IF( IPRINT ) THEN IF( PASSED ) THEN WRITE(OUTNUM,*) 'PASSED BLACS_SET/BLACS_GET TESTS' ELSE WRITE(OUTNUM,*) 'FAILED BLACS_SET/BLACS_GET TESTS' END IF WRITE(OUTNUM,*) ' ' END IF * IF( IPRINT ) WRITE(OUTNUM,*) 'CALL BLACS_GRIDEXIT' CALL BLACS_GRIDEXIT(CTXT) CALL BLACS_GRIDEXIT(CTXT2) IF( IPRINT ) THEN WRITE(OUTNUM,*) 'DONE BLACS_GRIDEXIT' WRITE(OUTNUM,*) ' ' END IF * 100 CONTINUE * PASSED = ALLPASS(AUXPASSED) IF( IPRINT ) THEN WRITE(OUTNUM,*) 'The final auxiliary test is for BLACS_ABORT.' WRITE(OUTNUM,*) 'Immediately after this message, all '// $ 'processes should be killed.' WRITE(OUTNUM,*) 'If processes survive the call, your BLACS_'// $ 'ABORT is incorrect.' END IF CALL BLACS_PINFO( I, NPROCS ) CALL BLACS_GET( 0, 0, CTXT ) CALL BLACS_GRIDINIT( CTXT, 'r', 1, NPROCS ) CALL BLACS_BARRIER(CTXT, 'A') CALL BLACS_GRIDINFO( CTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Test BLACS_ABORT * IF( MYROW.EQ.NPROW/2 .AND. MYCOL.EQ.NPCOL/2 ) THEN CALL BLACS_ABORT( CTXT, -1 ) * * Other procs try to cause a hang: should be killed by BLACS_ABORT * ELSE I = 1 110 CONTINUE I = I + 3 I = I - 2 I = I - 1 IF( I.EQ.1 ) GOTO 110 end if * 1000 FORMAT('AUXILIARY TESTS: BEGIN.') RETURN END * SUBROUTINE BTTRANSCHAR(TRANSTO, N, CMEM, IMEM) CHARACTER TRANSTO INTEGER N CHARACTER*1 CMEM(N) INTEGER IMEM(N) INTEGER I * IF( TRANSTO .EQ. 'I' ) THEN DO 10 I = 1, N IMEM(I) = ICHAR( CMEM(I) ) 10 CONTINUE ELSE DO 20 I = 1, N CMEM(I) = CHAR( IMEM(I) ) 20 CONTINUE END IF RETURN END * SUBROUTINE BTINFO( TEST, MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, $ CMEMLEN, OUTNUM, NOP, NSCOPE, TREP, TCOH, NTOP, $ NSHAPE, NMAT, NSRC, NGRID, OPPTR, SCOPEPTR, $ TOPPTR, UPLOPTR, DIAGPTR, MPTR, NPTR, LDSPTR, $ LDDPTR, LDIPTR, RSRCPTR, CSRCPTR, RDESTPTR, $ CDESTPTR, PPTR, QPTR ) * * .. Scalar Arguments .. CHARACTER*1 TEST INTEGER CDESTPTR, CMEMLEN, CMEMUSED, CSRCPTR, DIAGPTR, LDDPTR, $ LDIPTR, LDSPTR, MEMLEN, MEMUSED, MPTR, NGRID, NMAT, NOP, $ NPTR, NSCOPE, NSHAPE, NSRC, NTOP, OPPTR, OUTNUM, PPTR, $ QPTR, RDESTPTR, RSRCPTR, SCOPEPTR, TCOH, TOPPTR, TREP, $ UPLOPTR * .. * .. Array Arguments .. CHARACTER*1 CMEM(CMEMLEN) INTEGER MEM(MEMLEN) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTMSGID, IBTSIZEOF EXTERNAL IBTMYPROC, IBTMSGID, IBTSIZEOF * .. * .. Local Scalars .. INTEGER IAM, ISIZE, DSIZE * .. * .. Local Arrays .. INTEGER ITMP(2) * .. * .. Executable Statements .. * IAM = IBTMYPROC() IF( IAM .EQ. 0 ) THEN IF( TEST .EQ. 'S' ) THEN CALL RDSDRV( MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, CMEMLEN, $ OUTNUM ) ELSE IF( TEST .EQ. 'B' ) THEN CALL RDBSBR( MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, CMEMLEN, $ OUTNUM ) ELSE CALL RDCOMB( MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, CMEMLEN, $ OUTNUM ) END IF ITMP(1) = MEMUSED ITMP(2) = CMEMUSED CALL BTSEND( 3, 2, ITMP, -1, IBTMSGID()+3 ) IF( MEMLEN .GE. MEMUSED + CMEMUSED ) THEN CALL BTTRANSCHAR( 'I', CMEMUSED, CMEM, MEM(MEMUSED+1) ) ELSE ISIZE = IBTSIZEOF('I') DSIZE = IBTSIZEOF('D') WRITE(OUTNUM,1000) ( (MEMUSED+CMEMUSED)*ISIZE + DSIZE-1 ) $ / DSIZE CALL BLACS_ABORT(-1, -1) END IF CALL BTSEND( 3, MEMUSED+CMEMUSED, MEM, -1, IBTMSGID()+4 ) ELSE CALL BTRECV( 3, 2, ITMP, 0, IBTMSGID()+3 ) MEMUSED = ITMP(1) CMEMUSED = ITMP(2) IF( MEMLEN .GE. MEMUSED + CMEMUSED ) THEN CALL BTRECV( 3, MEMUSED+CMEMUSED, MEM, 0, IBTMSGID()+4 ) CALL BTTRANSCHAR( 'C', CMEMUSED, CMEM, MEM(MEMUSED+1) ) ELSE ISIZE = IBTSIZEOF('I') DSIZE = IBTSIZEOF('D') WRITE(OUTNUM,1000) ( (MEMUSED+CMEMUSED)*ISIZE + DSIZE-1 ) $ / DSIZE CALL BLACS_ABORT(-1, -1) END IF END IF CALL BTUNPACK( TEST, MEM, MEMUSED, NOP, NSCOPE, TREP, TCOH, NTOP, $ NSHAPE, NMAT, NSRC, NGRID, OPPTR, SCOPEPTR, TOPPTR, $ UPLOPTR, DIAGPTR, MPTR, NPTR, LDSPTR, LDDPTR, $ LDIPTR, RSRCPTR, CSRCPTR, RDESTPTR, CDESTPTR, PPTR, $ QPTR) * 1000 FORMAT('MEM array too short to pack CMEM; increase to at least', $ I7) * RETURN * * End BTINFO * END * SUBROUTINE RDBTIN( TESTSDRV, TESTBSBR, TESTCOMB, TESTAUX, NPREC, $ PREC, VERB, OUTNUM ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. LOGICAL TESTSDRV, TESTBSBR, TESTCOMB, TESTAUX INTEGER NPREC, OUTNUM, VERB * .. * .. Array Arguments .. CHARACTER*1 PREC(*) * .. * * Purpose * ======= * RDBTIN: Read and process the top-level input file BT.dat. * * Arguments * ========= * TESTSDRV (output) LOGICAL * Run any point-to-point tests? * * TESTBSBR (output) LOGICAL * Run any broadcast tests? * * TESTCOMB (output) LOGICAL * Run any combine-operation tests (e.g. MAX) * * TESTAUX (output) LOGICAL * Run any auxiliary tests? * * NPREC (output) INTEGER * Number of different precisions to test. (up to 5, as determined * by the parameter PRECMAX down in the code.) * * PREC (output) CHARACTER*1 array, dimension 5 * Prefix letter of each precision to test, from the set * {'C', 'D', 'I', 'S', 'Z'} * * VERB (output) INTEGER * Output verbosity for this test run. * 0 = Print only "BEGIN [SDRV/BSBR/COMB]", followed by PASSED * or FAILED message * 1 = Same as 0, but also prints out header explaining all tests * to be run. * 2 = Prints out info before and after every individual test. * * OUTNUM (output) INTEGER * Unit number for output file. * ====================================================================== * * * .. Parameters .. INTEGER PRECMAX, VERBMAX, IN PARAMETER ( PRECMAX = 5, VERBMAX = 2, IN = 11 ) * .. * .. Local Scalars .. INTEGER I CHARACTER*1 CH LOGICAL READERROR * .. * .. Local Arrays .. CHARACTER*80 HEADER, OUTNAME * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Executable Statements * * Open and read the file blacstest.dat. Expected format is * ----- * 'One line of free text intended as a comment for each test run' * integer Unit number of output file * string Name of output file (ignored if unit = 6) * {'T'|'F'} Run any point to point tests? * {'T'|'F'} Run any broadcast tests? * {'T'|'F'} Run any combine-operator tests? * {'T'|'F'} Run the auxiliary tests? * integer Number of precisions to test - up to 99 * array of CHAR*1's Specific precisions to test * integer Output verb (1-n, n=most verbose) * integer Number of nodes required by largest test case * ----- * Note that the comments to the right of each line are present * in the sample blacstest.dat file included with this * distribution, but they are not required. * * The array of CHAR*1's is expected to have length equal to the * integer in the previous line - if it is shorter, problems may * occur later; if it is longer, the trailing elements will just * be ignored. The verb is expected to be an integer * between 1 and n inclusive and will be set to 1 if outside * this range. * * Only process 0 should be calling this routine * READERROR = .FALSE. OPEN( UNIT = IN, FILE = 'bt.dat', STATUS = 'OLD' ) READ(IN, *) HEADER READ(IN, *) OUTNUM READ(IN, *) OUTNAME * * Open and prepare output file * IF( OUTNUM.NE.6 .AND. OUTNUM.NE.0 ) $ OPEN( UNIT = OUTNUM, FILE = OUTNAME, STATUS = 'UNKNOWN' ) WRITE(OUTNUM, *) HEADER * * Determine which tests to run * READ(IN, *) CH IF( LSAME(CH, 'T') ) THEN TESTSDRV = .TRUE. ELSE IF( LSAME(CH, 'F') ) THEN TESTSDRV = .FALSE. ELSE WRITE(OUTNUM, 1000) 'SDRV', CH READERROR = .TRUE. END IF * READ(IN, *) CH IF( LSAME(CH, 'T') ) THEN TESTBSBR = .TRUE. ELSE IF(LSAME( CH, 'F') ) THEN TESTBSBR = .FALSE. ELSE WRITE(OUTNUM, 1000) 'BSBR', CH READERROR = .TRUE. END IF * READ(IN, *) CH IF( LSAME(CH, 'T') ) THEN TESTCOMB = .TRUE. ELSE IF( LSAME(CH, 'F') ) THEN TESTCOMB = .FALSE. ELSE WRITE(OUTNUM, 1000) 'COMB', CH READERROR = .TRUE. END IF * READ(IN, *) CH IF( LSAME(CH, 'T') ) THEN TESTAUX = .TRUE. ELSE IF( LSAME(CH, 'F') ) THEN TESTAUX = .FALSE. ELSE WRITE(OUTNUM, 1000) 'AUX ', CH READERROR = .TRUE. END IF * * Get # of precisions, and precisions to test * READ(IN, *) NPREC IF( NPREC .LT. 0 ) THEN NPREC = 0 ELSE IF( NPREC. GT. PRECMAX ) THEN WRITE(OUTNUM, 2000) NPREC, PRECMAX, PRECMAX NPREC = PRECMAX END IF * READ(IN, *) ( PREC(I), I = 1, NPREC ) DO 100 I = 1, NPREC IF( LSAME(PREC(I), 'C') ) THEN PREC(I) = 'C' ELSE IF( LSAME(PREC(I), 'D') ) THEN PREC(I) = 'D' ELSE IF( LSAME(PREC(I), 'I') ) THEN PREC(I) = 'I' ELSE IF( LSAME(PREC(I), 'S') ) THEN PREC(I) = 'S' ELSE IF( LSAME(PREC(I), 'Z') ) THEN PREC(I) = 'Z' ELSE WRITE(OUTNUM, 3000) PREC(I) READERROR = .TRUE. END IF 100 CONTINUE * READ(IN, *) VERB * IF( VERB .GT. VERBMAX ) THEN WRITE(OUTNUM, 4000) VERB, VERBMAX, VERBMAX VERB = VERBMAX ELSE IF( VERB .LT. 0 ) THEN WRITE(OUTNUM, 5000) VERB VERB = 0 END IF * * Abort if there was a fatal error * IF( READERROR ) THEN WRITE(OUTNUM, 6000) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE( OUTNUM ) STOP END IF * 1000 FORMAT( 'INVALID CHARACTER FOR ',A4,' TESTS ''', A1, $ ''' (EXPECTED T/F)' ) 2000 FORMAT( 'NUMBER OF PRECISIONS ', I6, ' GREATER THAN ', I6, $ ' - SETTING TO ', I6, '.') 3000 FORMAT( 'UNRECOGNIZABLE PRECISION ENTRY ''', A1, $ ''' - EXPECTED ''C'', ''D'', ''I'', ''S'', OR ''Z''.') 4000 FORMAT( 'VERBOSITY ', I4, ' GREATER THAN ',I4, $ ' - SETTING TO ',I4,'.') 5000 FORMAT( 'VERBOSITY ', I4, ' LESS THAN 0 - SETTING TO 0' ) 6000 FORMAT( 'FATAL INPUT FILE ERROR - ABORTING RUN.' ) * RETURN * * End of RDBTIN * END * INTEGER FUNCTION IBTMSGID() * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * PURPOSE * ======= * IBTMSGID : returns a ID for tester communication. * INTEGER MINID INTEGER ITMP(2) SAVE MINID DATA MINID /-1/ * * On first call, reserve 1st 1000 IDs for tester use * IF (MINID .EQ. -1) THEN CALL BLACS_GET( -1, 1, ITMP ) MINID = ITMP(1) ITMP(1) = ITMP(1) + 1000 CALL BLACS_SET( -1, 1, ITMP ) END IF * * return the minimum allowable ID * IBTMSGID = MINID * RETURN END * SUBROUTINE BTUNPACK(TEST, MEM, MEMLEN, NOP, NSCOPE, TREP, TCOH, $ NTOP, NSHAPE, NMAT, NSRC, NGRID, OPPTR, $ SCOPEPTR, TOPPTR, UPLOPTR, DIAGPTR, MPTR, $ NPTR, LDSPTR, LDDPTR, LDIPTR, RSRCPTR, $ CSRCPTR, RDESTPTR, CDESTPTR, PPTR, QPTR) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 TEST INTEGER CDESTPTR, CSRCPTR, DIAGPTR, LDDPTR, LDIPTR, LDSPTR, $ MEMLEN, MPTR, NGRID, NMAT, NOP, NPTR, NSCOPE, NSHAPE, $ NSRC, NTOP, OPPTR, PPTR, QPTR, RDESTPTR, RSRCPTR, $ SCOPEPTR, TCOH, TOPPTR, TREP, UPLOPTR * .. * .. Array Arguments .. INTEGER MEM(MEMLEN) * .. * * Purpose * ======= * BTUNPACK: Figure pointers into MEM where the various input values * are stored. * * Arguments * ========= * TEST (input) CHARACTER*1 * The test we're unpacking for: * = 'S' : SDRV test * = 'B' : BSBR test * = 'C' : Combine test * * MEM (input) INTEGER array of dimension MEMLEN * Memory containing values and number of items. * * MEMLEN (input/output) INTEGER * The number of elements that are used in MEM. * * . * . * . * * ===================================================================== * * .. Local Scalars .. INTEGER NDEST, NLDI * .. * .. Executable Statements .. * * Test is SDRV * IF( TEST .EQ. 'S' ) THEN NOP = 0 NSHAPE = MEM(MEMLEN-3) NSCOPE = 0 TREP = 0 TCOH = 0 NTOP = 0 NMAT = MEM(MEMLEN-2) NLDI = 0 NSRC = MEM(MEMLEN-1) NDEST = NSRC NGRID = MEM(MEMLEN) MEMLEN = MEMLEN - 3 * * Test is BSBR * ELSE IF ( TEST .EQ. 'B' ) THEN NOP = 0 NSCOPE = MEM(MEMLEN-5) TREP = 0 TCOH = 0 NTOP = MEM(MEMLEN-4) NSHAPE = MEM(MEMLEN-3) NMAT = MEM(MEMLEN-2) NLDI = 0 NSRC = MEM(MEMLEN-1) NDEST = 0 NGRID = MEM(MEMLEN) MEMLEN = MEMLEN - 5 * * Test is COMB * ELSE NOP = MEM(MEMLEN-7) NSCOPE = MEM(MEMLEN-6) TREP = MEM(MEMLEN-5) TCOH = MEM(MEMLEN-4) NTOP = MEM(MEMLEN-3) NSHAPE = 0 NMAT = MEM(MEMLEN-2) NLDI = NMAT NSRC = 0 NDEST = MEM(MEMLEN-1) NGRID = MEM(MEMLEN) MEMLEN = MEMLEN - 6 END IF OPPTR = 1 SCOPEPTR = OPPTR + NOP TOPPTR = SCOPEPTR + NSCOPE UPLOPTR = TOPPTR + NTOP DIAGPTR = UPLOPTR + NSHAPE MPTR = 1 NPTR = MPTR + NMAT LDSPTR = NPTR + NMAT LDDPTR = LDSPTR + NMAT LDIPTR = LDDPTR + NMAT RSRCPTR = LDIPTR + NLDI CSRCPTR = RSRCPTR + NSRC RDESTPTR = CSRCPTR + NSRC CDESTPTR = RDESTPTR + NDEST PPTR = CDESTPTR + NDEST QPTR = PPTR + NGRID IF( NSRC .EQ. 0 ) NSRC = NDEST * RETURN * * End of BTUNPACK * END * INTEGER FUNCTION SAFEINDEX(INDX, SIZE1, SIZE2) * * .. Scalar Arguments .. INTEGER INDX, SIZE1, SIZE2 * .. * * If you have an array with elements of SIZE1 bytes, of which you * have used INDX-1 elements, returns the index necessary to keep it * on a SIZE2 boundary (assuming it was SIZE2 aligned in the first place). * * .. Local scalars .. INTEGER I * .. * .. Executable Statements .. * * Take into account that Fortran starts arrays at 1, not 0 * I = INDX - 1 10 CONTINUE IF( MOD(I*SIZE1, SIZE2) .EQ. 0 ) GOTO 20 I = I + 1 GOTO 10 20 CONTINUE * SAFEINDEX = I + 1 * RETURN END * * SUBROUTINE RDSDRV( MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, CMEMLEN, $ OUTNUM ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMUSED, MEMLEN, CMEMUSED, CMEMLEN, OUTNUM * .. * .. Array Arguments .. CHARACTER*1 CMEM(CMEMLEN) INTEGER MEM(MEMLEN) * .. * * Purpose * ======= * RDSDRV: Read and process the input file SDRV.dat. * * Arguments * ========= * MEMUSED (output) INTEGER * Number of elements in MEM that this subroutine ends up using. * * MEM (output) INTEGER array of dimension memlen * On output, holds information read in from sdrv.dat. * * MEMLEN (input) INTEGER * Number of elements of MEM that this subroutine * may safely write into. * * CMEMUSED (output) INTEGER * Number of elements in CMEM that this subroutine ends up using. * * CMEM (output) CHARACTER*1 array of dimension cmemlen * On output, holds the values for UPLO and DIAG. * * CMEMLEN (input) INTEGER * Number of elements of CMEM that this subroutine * may safely write into. * * OUTNUM (input) INTEGER * Unit number of the output file. * * ================================================================= * * .. Parameters .. INTEGER SDIN PARAMETER( SDIN = 12 ) * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Local Scalars .. INTEGER NSHAPE, NMAT, NSRC, NGRID, I, J INTEGER UPLOPTR, DIAGPTR, MPTR, NPTR, LDSPTR, LDDPTR, RSRCPTR INTEGER CSRCPTR, RDESTPTR, CDESTPTR, PPTR, QPTR * .. * .. Executable Statements * * Open and read the file sdrv.dat. The expected format is * below. * *------ *integer number of shapes of the matrix *array of CHAR*1's UPLO *array of CHAR*1's DIAG: unit diagonal or not? *integer number of nmat *array of integers M: number of rows in matrix *array of integers N: number of columns in matrix *integer LDA: leading dimension on source proc *integer LDA: leading dimension on dest proc *integer number of source/dest pairs *array of integers RSRC: process row of message source *array of integers CSRC: process column of msg. src. *array of integers RDEST: process row of msg. dest. *array of integers CDEST: process column of msg. dest. *integer Number of grids *array of integers NPROW: number of rows in process grid *array of integers NPCOL: number of col's in proc. grid *------ * note: UPLO stands for 'upper or lower trapezoidal or general * rectangular.' * note: the text descriptions as shown above are present in * the sample sdrv.dat included with this distribution, * but are not required. * * Read input file * MEMUSED = 1 CMEMUSED = 1 OPEN(UNIT = SDIN, FILE = 'sdrv.dat', STATUS = 'OLD') * * Read in number of shapes, and values of UPLO and DIAG * READ(SDIN, *) NSHAPE UPLOPTR = CMEMUSED DIAGPTR = UPLOPTR + NSHAPE CMEMUSED = DIAGPTR + NSHAPE IF ( CMEMUSED .GT. CMEMLEN ) THEN WRITE(OUTNUM, 1000) CMEMLEN, NSHAPE, 'MATRIX SHAPES.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NSHAPE .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'MATRIX SHAPE.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF * * Read in, upcase, and fatal error if UPLO/DIAG not recognized * READ(SDIN, *) ( CMEM(UPLOPTR+I), I = 0, NSHAPE-1 ) DO 30 I = 0, NSHAPE-1 IF( LSAME(CMEM(UPLOPTR+I), 'G') ) THEN CMEM(UPLOPTR+I) = 'G' ELSE IF( LSAME(CMEM(UPLOPTR+I), 'U') ) THEN CMEM(UPLOPTR+I) = 'U' ELSE IF( LSAME(CMEM(UPLOPTR+I), 'L') ) THEN CMEM(UPLOPTR+I) = 'L' ELSE WRITE(OUTNUM, 3000) 'UPLO ', CMEM(UPLOPTR+I) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF 30 CONTINUE * READ(SDIN, *) ( CMEM(DIAGPTR+I), I = 0, NSHAPE-1 ) DO 40 I = 0, NSHAPE-1 IF( CMEM(UPLOPTR+I) .NE. 'G' ) THEN IF( LSAME(CMEM(DIAGPTR+I), 'U') ) THEN CMEM( DIAGPTR+I ) = 'U' ELSE IF( LSAME(CMEM(DIAGPTR+I), 'N') ) THEN CMEM(DIAGPTR+I) = 'N' ELSE WRITE(OUTNUM, 3000) 'DIAG ', CMEM(DIAGPTR+I) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF END IF 40 CONTINUE * * Read in number of matrices, and values for M, N, LDASRC, and LDADEST * READ(SDIN, *) NMAT MPTR = MEMUSED NPTR = MPTR + NMAT LDSPTR = NPTR + NMAT LDDPTR = LDSPTR + NMAT MEMUSED = LDDPTR + NMAT IF( MEMUSED .GT. MEMLEN ) THEN WRITE(OUTNUM, 1000) MEMLEN, NMAT, 'MATRICES.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NMAT .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'MATRIX.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF READ(SDIN, *) ( MEM( MPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( NPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( LDSPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( LDDPTR+I ), I = 0, NMAT-1 ) * * Make sure matrix values are legal * CALL CHKMATDAT( OUTNUM, 'SDRV.dat', .FALSE., NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), MEM(LDDPTR) ) * * Read in number of src/dest pairs, and values of src/dest * READ(SDIN, *) NSRC RSRCPTR = MEMUSED CSRCPTR = RSRCPTR + NSRC RDESTPTR = CSRCPTR + NSRC CDESTPTR = RDESTPTR + NSRC MEMUSED = CDESTPTR + NSRC IF( MEMUSED .GT. MEMLEN ) THEN WRITE(OUTNUM, 1000) MEMLEN, NMAT, 'SRC/DEST.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NSRC .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'SRC/DEST.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF READ(SDIN, *) ( MEM(RSRCPTR+I), I = 0, NSRC-1 ) READ(SDIN, *) ( MEM(CSRCPTR+I), I = 0, NSRC-1 ) READ(SDIN, *) ( MEM(RDESTPTR+I), I = 0, NSRC-1 ) READ(SDIN, *) ( MEM(CDESTPTR+I), I = 0, NSRC-1 ) * * Read in number of grids pairs, and values of P (process rows) and * Q (process columns) * READ(SDIN, *) NGRID PPTR = MEMUSED QPTR = PPTR + NGRID MEMUSED = QPTR + NGRID IF( MEMUSED .GT. MEMLEN ) THEN WRITE(OUTNUM, 1000) MEMLEN, NGRID, 'PROCESS GRIDS.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NGRID .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'PROCESS GRID' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE( OUTNUM ) STOP END IF * READ(SDIN, *) ( MEM(PPTR+I), I = 0, NGRID-1 ) READ(SDIN, *) ( MEM(QPTR+I), I = 0, NGRID-1 ) IF( SDIN .NE. 6 .AND. SDIN .NE. 0 ) CLOSE( SDIN ) * * Fatal error if we've got an illegal grid * DO 70 J = 0, NGRID-1 IF( MEM(PPTR+J).LT.1 .OR. MEM(QPTR+J).LT.1 ) THEN WRITE(OUTNUM, 4000) MEM(PPTR+J), MEM(QPTR+J) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF 70 CONTINUE * * Prepare output variables * MEM(MEMUSED) = NSHAPE MEM(MEMUSED+1) = NMAT MEM(MEMUSED+2) = NSRC MEM(MEMUSED+3) = NGRID MEMUSED = MEMUSED + 3 CMEMUSED = CMEMUSED - 1 * 1000 FORMAT('Mem too short (',I4,') to handle',I4,' ',A20) 2000 FORMAT('Must have at least one ',A20) 3000 FORMAT('UNRECOGNIZABLE ',A5,' ''', A1, '''.') 4000 FORMAT('Illegal process grid: {',I3,',',I3,'}.') * RETURN * * End of RDSDRV. * END * SUBROUTINE CHKMATDAT( NOUT, INFILE, TSTFLAG, NMAT, M0, N0, $ LDAS0, LDAD0, LDI0 ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. LOGICAL TSTFLAG INTEGER NOUT, NMAT * .. * .. Array Arguments .. CHARACTER*8 INFILE INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) * .. * Purpose * ======= * CHKMATDAT: Checks that matrix data is correct. * * Arguments * ========= * NOUT (input) INTEGER * The device number to write output to. * * INFILE (input) CHARACTER*8 * The name of the input file where matrix values came from. * * TSTFLAG (input) LOGICAL * Whether to test RCFLAG (LDI) values or not. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * * ==================================================================== * * .. Local Scalars .. LOGICAL MATOK INTEGER I * .. * .. Executable Statements .. MATOK = .TRUE. DO 10 I = 1, NMAT IF( M0(I) .LT. 0 ) THEN WRITE(NOUT,1000) INFILE, 'M', M0(I) MATOK = .FALSE. ELSE IF( N0(I) .LT. 0 ) THEN WRITE(NOUT,1000) INFILE, 'N', N0(I) MATOK = .FALSE. ELSE IF( LDAS0(I) .LT. M0(I) ) THEN WRITE(NOUT,2000) INFILE, 'LDASRC', LDAS0(I), M0(I) MATOK = .FALSE. ELSE IF( LDAD0(I) .LT. M0(I) ) THEN WRITE(NOUT,2000) INFILE, 'LDADST', LDAD0(I), M0(I) MATOK = .FALSE. ELSE IF( TSTFLAG ) THEN IF( (LDI0(I).LT.M0(I)) .AND. (LDI0(I).NE.-1) ) THEN WRITE(NOUT,2000) INFILE, 'RCFLAG', LDI0(I), M0(I) MATOK = .FALSE. END IF END IF 10 CONTINUE * IF( .NOT.MATOK ) THEN IF( NOUT .NE. 6 .AND. NOUT .NE. 0 ) CLOSE(NOUT) CALL BLACS_ABORT(-1, 1) END IF * 1000 FORMAT(A8,' INPUT ERROR: Illegal ',A1,'; value=',I6,'.') 2000 FORMAT(A8,' INPUT ERROR: Illegal ',A6,'; value=',I6,', but M=',I6) * RETURN END * LOGICAL FUNCTION ALLPASS( THISTEST ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. LOGICAL THISTEST * .. * Purpose * ======= * ALLPASS: Returns whether all tests have passed so far. * * ===================================================================== * * .. Local Scalars .. LOGICAL PASSHIST * .. * .. Save Statement .. SAVE PASSHIST * .. * .. Data Statements .. DATA PASSHIST /.TRUE./ * .. * .. Executable Statements .. PASSHIST = (PASSHIST .AND. THISTEST) ALLPASS = PASSHIST * RETURN END * SUBROUTINE RDBSBR( MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, CMEMLEN, $ OUTNUM ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMUSED, MEMLEN, CMEMUSED, CMEMLEN, OUTNUM * .. * .. Array Arguments .. CHARACTER*1 CMEM(CMEMLEN) INTEGER MEM(MEMLEN) * .. * * Purpose * ======= * RDBSBR: Read and process the input file BSBR.dat. * * Arguments * ========= * MEMUSED (output) INTEGER * Number of elements in MEM that this subroutine ends up using. * * MEM (output) INTEGER array of dimension memlen * On output, holds information read in from sdrv.dat. * * MEMLEN (input) INTEGER * Number of elements of MEM that this subroutine * may safely write into. * * CMEMUSED (output) INTEGER * Number of elements in CMEM that this subroutine ends up using. * * CMEM (output) CHARACTER*1 array of dimension cmemlen * On output, holds the values for UPLO and DIAG. * * CMEMLEN (input) INTEGER * Number of elements of CMEM that this subroutine * may safely write into. * * OUTNUM (input) INTEGER * Unit number of the output file. * * ================================================================= * * .. Parameters .. INTEGER SDIN PARAMETER( SDIN = 12 ) * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Local Scalars .. INTEGER NSCOPE, NTOP, NSHAPE, NMAT, NSRC, NGRID, I, J INTEGER SCOPEPTR, TOPPTR, UPLOPTR, DIAGPTR, MPTR, NPTR INTEGER LDSPTR, LDDPTR, RSRCPTR, CSRCPTR, PPTR, QPTR * .. * .. Executable Statements * * Open and read the file bsbr.dat. The expected format is * below. * *------ *integer Number of scopes *array of CHAR*1's Values for Scopes *integer Number of topologies *array of CHAR*1's Values for TOP *integer number of shapes of the matrix *array of CHAR*1's UPLO *array of CHAR*1's DIAG: unit diagonal or not? *integer number of nmat *array of integers M: number of rows in matrix *array of integers N: number of columns in matrix *integer LDA: leading dimension on source proc *integer LDA: leading dimension on dest proc *integer number of source/dest pairs *array of integers RSRC: process row of message source *array of integers CSRC: process column of msg. src. *integer Number of grids *array of integers NPROW: number of rows in process grid *array of integers NPCOL: number of col's in proc. grid *------ * note: UPLO stands for 'upper or lower trapezoidal or general * rectangular.' * note: the text descriptions as shown above are present in * the sample bsbr.dat included with this distribution, * but are not required. * * Read input file * MEMUSED = 1 CMEMUSED = 1 OPEN(UNIT = SDIN, FILE = 'bsbr.dat', STATUS = 'OLD') * * Read in scopes and topologies * READ(SDIN, *) NSCOPE SCOPEPTR = CMEMUSED CMEMUSED = SCOPEPTR + NSCOPE IF ( CMEMUSED .GT. CMEMLEN ) THEN WRITE(OUTNUM, 1000) CMEMLEN, NSCOPE, 'SCOPES.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NSCOPE .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'SCOPE.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF * READ(SDIN, *) ( CMEM(SCOPEPTR+I), I = 0, NSCOPE-1 ) DO 20 I = 0, NSCOPE-1 IF( LSAME(CMEM(SCOPEPTR+I), 'R') ) THEN CMEM(SCOPEPTR+I) = 'R' ELSE IF( LSAME(CMEM(SCOPEPTR+I), 'C') ) THEN CMEM(SCOPEPTR+I) = 'C' ELSE IF( LSAME(CMEM(SCOPEPTR+I), 'A') ) THEN CMEM(SCOPEPTR+I) = 'A' ELSE WRITE(OUTNUM, 3000) 'SCOPE', CMEM(SCOPEPTR+I) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF 20 CONTINUE * READ(SDIN, *) NTOP TOPPTR = CMEMUSED CMEMUSED = TOPPTR + NTOP IF ( CMEMUSED .GT. CMEMLEN ) THEN WRITE(OUTNUM, 1000) CMEMLEN, NTOP, 'TOPOLOGIES.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NTOP .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'TOPOLOGY.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF READ(SDIN, *) ( CMEM(TOPPTR+I), I = 0, NTOP-1 ) * * * Read in number of shapes, and values of UPLO and DIAG * READ(SDIN, *) NSHAPE UPLOPTR = CMEMUSED DIAGPTR = UPLOPTR + NSHAPE CMEMUSED = DIAGPTR + NSHAPE IF ( CMEMUSED .GT. CMEMLEN ) THEN WRITE(OUTNUM, 1000) CMEMLEN, NSHAPE, 'MATRIX SHAPES.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NSHAPE .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'MATRIX SHAPE.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF * * Read in, upcase, and fatal error if UPLO/DIAG not recognized * READ(SDIN, *) ( CMEM(UPLOPTR+I), I = 0, NSHAPE-1 ) DO 30 I = 0, NSHAPE-1 IF( LSAME(CMEM(UPLOPTR+I), 'G') ) THEN CMEM(UPLOPTR+I) = 'G' ELSE IF( LSAME(CMEM(UPLOPTR+I), 'U') ) THEN CMEM(UPLOPTR+I) = 'U' ELSE IF( LSAME(CMEM(UPLOPTR+I), 'L') ) THEN CMEM(UPLOPTR+I) = 'L' ELSE WRITE(OUTNUM, 3000) 'UPLO ', CMEM(UPLOPTR+I) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF 30 CONTINUE * READ(SDIN, *) ( CMEM(DIAGPTR+I), I = 0, NSHAPE-1 ) DO 40 I = 0, NSHAPE-1 IF( CMEM(UPLOPTR+I) .NE. 'G' ) THEN IF( LSAME(CMEM(DIAGPTR+I), 'U') ) THEN CMEM( DIAGPTR+I ) = 'U' ELSE IF( LSAME(CMEM(DIAGPTR+I), 'N') ) THEN CMEM(DIAGPTR+I) = 'N' ELSE WRITE(OUTNUM, 3000) 'DIAG ', CMEM(DIAGPTR+I) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF END IF 40 CONTINUE * * Read in number of matrices, and values for M, N, LDASRC, and LDADEST * READ(SDIN, *) NMAT MPTR = MEMUSED NPTR = MPTR + NMAT LDSPTR = NPTR + NMAT LDDPTR = LDSPTR + NMAT MEMUSED = LDDPTR + NMAT IF( MEMUSED .GT. MEMLEN ) THEN WRITE(OUTNUM, 1000) MEMLEN, NMAT, 'MATRICES.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NMAT .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'MATRIX.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF READ(SDIN, *) ( MEM( MPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( NPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( LDSPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( LDDPTR+I ), I = 0, NMAT-1 ) * * Make sure matrix values are legal * CALL CHKMATDAT( OUTNUM, 'BSBR.dat', .FALSE., NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), MEM(LDDPTR) ) * * Read in number of src pairs, and values of src * READ(SDIN, *) NSRC RSRCPTR = MEMUSED CSRCPTR = RSRCPTR + NSRC MEMUSED = CSRCPTR + NSRC IF( MEMUSED .GT. MEMLEN ) THEN WRITE(OUTNUM, 1000) MEMLEN, NMAT, 'SRC.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NSRC .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'SRC.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF READ(SDIN, *) ( MEM(RSRCPTR+I), I = 0, NSRC-1 ) READ(SDIN, *) ( MEM(CSRCPTR+I), I = 0, NSRC-1 ) * * Read in number of grids pairs, and values of P (process rows) and * Q (process columns) * READ(SDIN, *) NGRID PPTR = MEMUSED QPTR = PPTR + NGRID MEMUSED = QPTR + NGRID IF( MEMUSED .GT. MEMLEN ) THEN WRITE(OUTNUM, 1000) MEMLEN, NGRID, 'PROCESS GRIDS.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NGRID .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'PROCESS GRID' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE( OUTNUM ) STOP END IF * READ(SDIN, *) ( MEM(PPTR+I), I = 0, NGRID-1 ) READ(SDIN, *) ( MEM(QPTR+I), I = 0, NGRID-1 ) IF( SDIN .NE. 6 .AND. SDIN .NE. 0 ) CLOSE( SDIN ) * * Fatal error if we've got an illegal grid * DO 70 J = 0, NGRID-1 IF( MEM(PPTR+J).LT.1 .OR. MEM(QPTR+J).LT.1 ) THEN WRITE(OUTNUM, 4000) MEM(PPTR+J), MEM(QPTR+J) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF 70 CONTINUE * * Prepare output variables * MEM(MEMUSED) = NSCOPE MEM(MEMUSED+1) = NTOP MEM(MEMUSED+2) = NSHAPE MEM(MEMUSED+3) = NMAT MEM(MEMUSED+4) = NSRC MEM(MEMUSED+5) = NGRID MEMUSED = MEMUSED + 5 CMEMUSED = CMEMUSED - 1 * 1000 FORMAT('Mem too short (',I4,') to handle',I4,' ',A20) 2000 FORMAT('Must have at least one ',A20) 3000 FORMAT('UNRECOGNIZABLE ',A5,' ''', A1, '''.') 4000 FORMAT('Illegal process grid: {',I3,',',I3,'}.') * RETURN * * End of RDBSBR. * END * * SUBROUTINE ISDRVTEST( OUTNUM, VERB, NSHAPE, UPLO0, DIAG0, $ NMAT, M0, N0, LDAS0, LDAD0, NSRC, RSRC0, $ CSRC0, RDEST0, CDEST0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSHAPE, NMAT, NSRC, NGRID, MEMLEN * .. * .. Array Arguments .. CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), RDEST0(NSRC), CDEST0(NSRC) INTEGER CONTEXT0(NGRID), P0(NGRID), Q0(NGRID), TFAIL(*) INTEGER MEM(MEMLEN) * .. * * Purpose * ======= * ITESTSDRV: Test integer send/recv * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * RDEST0 (input) INTEGER array of dimension (NNSRC) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNSRC) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) INTEGER array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL ITRSD2D, IGESD2D, ITRRV2D, IGERV2D EXTERNAL IINITMAT, ICHKMAT, ICHKPAD, IBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 UPLO, DIAG LOGICAL TESTOK INTEGER IAM, I, K, IGR, ISH, IMA, ISO, MYROW, MYCOL, IPRE, IPOST INTEGER M, N, NPROW, NPCOL, RSRC, CSRC, RDEST, CDEST INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE INTEGER SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = -1 RCHECKVAL = -2 * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') ISIZE = IBTSIZEOF('I') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( ISIZE * (MEMLEN-I) ) / ( ISIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SDRV tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF RDEST = RDEST0(ISO) CDEST = CDEST0(ISO) IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF * IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'RUNNING', $ UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ RDEST, CDEST, NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * source process generates matrix and sends it * IF( MYROW .EQ. RSRC .AND. MYCOL .EQ. CSRC ) THEN CALL IINITMAT( UPLO, DIAG, M, N, MEM, LDASRC, $ IPRE, IPOST, SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL ITRSD2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDASRC, RDEST, CDEST ) ELSE CALL IGESD2D( CONTEXT, M, N, MEM(APTR), $ LDASRC, RDEST, CDEST ) END IF END IF * IF( MYROW .EQ. RDEST .AND. MYCOL .EQ. CDEST ) THEN * * Pad entire matrix area * DO 50 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 50 CONTINUE * * Receive matrix * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL ITRRV2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC ) ELSE CALL IGERV2D( CONTEXT, M, N, MEM(APTR), $ LDADST, RSRC, CSRC ) END IF * * Check for errors in matrix or padding * I = NERR CALL ICHKMAT( UPLO, DIAG, M, N, MEM(APTR), LDADST, $ RSRC, CSRC, MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), MEM(ERRDPTR) ) * CALL ICHKPAD( UPLO, DIAG, M, N, MEM, LDADST, $ RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST, $ RCHECKVAL, TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR) ) TESTOK = I .EQ. NERR END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL IBTCHECKIN( 0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL ) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. I.EQ.NERR ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'PASSED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM, 7000) TESTNUM, 'FAILED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ENDIF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL IBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('INTEGER SDRV TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS UPLO DIA M N LDAS LDAD RSRC ', $ 'CSRC RDEST CDEST P Q') 6000 FORMAT(' ----- ------- ---- --- ----- ----- ----- ----- ---- ', $ '---- ----- ----- ---- ----') 7000 FORMAT(I6,1X,A7,4X,A1,3X,A1,4I6,2I5,2I6,2I5) 8000 FORMAT('INTEGER SDRV TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('INTEGER SDRV TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of ISDRVTEST. * END * * SUBROUTINE SSDRVTEST( OUTNUM, VERB, NSHAPE, UPLO0, DIAG0, $ NMAT, M0, N0, LDAS0, LDAD0, NSRC, RSRC0, $ CSRC0, RDEST0, CDEST0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSHAPE, NMAT, NSRC, NGRID, MEMLEN * .. * .. Array Arguments .. CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), RDEST0(NSRC), CDEST0(NSRC) INTEGER CONTEXT0(NGRID), P0(NGRID), Q0(NGRID), TFAIL(*) REAL MEM(MEMLEN) * .. * * Purpose * ======= * STESTSDRV: Test real send/recv * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * RDEST0 (input) INTEGER array of dimension (NNSRC) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNSRC) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) REAL array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL STRSD2D, SGESD2D, STRRV2D, SGERV2D EXTERNAL SINITMAT, SCHKMAT, SCHKPAD, SBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 UPLO, DIAG LOGICAL TESTOK INTEGER IAM, I, K, IGR, ISH, IMA, ISO, MYROW, MYCOL, IPRE, IPOST INTEGER M, N, NPROW, NPCOL, RSRC, CSRC, RDEST, CDEST INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE, SSIZE REAL SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = -0.01E0 RCHECKVAL = -0.02E0 * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') SSIZE = IBTSIZEOF('S') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( SSIZE * (MEMLEN-I) ) / ( SSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SDRV tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF RDEST = RDEST0(ISO) CDEST = CDEST0(ISO) IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF * IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'RUNNING', $ UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ RDEST, CDEST, NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * source process generates matrix and sends it * IF( MYROW .EQ. RSRC .AND. MYCOL .EQ. CSRC ) THEN CALL SINITMAT( UPLO, DIAG, M, N, MEM, LDASRC, $ IPRE, IPOST, SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL STRSD2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDASRC, RDEST, CDEST ) ELSE CALL SGESD2D( CONTEXT, M, N, MEM(APTR), $ LDASRC, RDEST, CDEST ) END IF END IF * IF( MYROW .EQ. RDEST .AND. MYCOL .EQ. CDEST ) THEN * * Pad entire matrix area * DO 50 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 50 CONTINUE * * Receive matrix * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL STRRV2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC ) ELSE CALL SGERV2D( CONTEXT, M, N, MEM(APTR), $ LDADST, RSRC, CSRC ) END IF * * Check for errors in matrix or padding * I = NERR CALL SCHKMAT( UPLO, DIAG, M, N, MEM(APTR), LDADST, $ RSRC, CSRC, MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), MEM(ERRDPTR) ) * CALL SCHKPAD( UPLO, DIAG, M, N, MEM, LDADST, $ RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST, $ RCHECKVAL, TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR) ) TESTOK = I .EQ. NERR END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL SBTCHECKIN( 0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL ) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. I.EQ.NERR ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'PASSED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM, 7000) TESTNUM, 'FAILED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ENDIF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL SBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('REAL SDRV TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS UPLO DIA M N LDAS LDAD RSRC ', $ 'CSRC RDEST CDEST P Q') 6000 FORMAT(' ----- ------- ---- --- ----- ----- ----- ----- ---- ', $ '---- ----- ----- ---- ----') 7000 FORMAT(I6,1X,A7,4X,A1,3X,A1,4I6,2I5,2I6,2I5) 8000 FORMAT('REAL SDRV TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('REAL SDRV TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of SSDRVTEST. * END * * SUBROUTINE DSDRVTEST( OUTNUM, VERB, NSHAPE, UPLO0, DIAG0, $ NMAT, M0, N0, LDAS0, LDAD0, NSRC, RSRC0, $ CSRC0, RDEST0, CDEST0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSHAPE, NMAT, NSRC, NGRID, MEMLEN * .. * .. Array Arguments .. CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), RDEST0(NSRC), CDEST0(NSRC) INTEGER CONTEXT0(NGRID), P0(NGRID), Q0(NGRID), TFAIL(*) DOUBLE PRECISION MEM(MEMLEN) * .. * * Purpose * ======= * DTESTSDRV: Test double precision send/recv * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * RDEST0 (input) INTEGER array of dimension (NNSRC) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNSRC) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) DOUBLE PRECISION array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL DTRSD2D, DGESD2D, DTRRV2D, DGERV2D EXTERNAL DINITMAT, DCHKMAT, DCHKPAD, DBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 UPLO, DIAG LOGICAL TESTOK INTEGER IAM, I, K, IGR, ISH, IMA, ISO, MYROW, MYCOL, IPRE, IPOST INTEGER M, N, NPROW, NPCOL, RSRC, CSRC, RDEST, CDEST INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE, DSIZE DOUBLE PRECISION SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = -0.01D0 RCHECKVAL = -0.02D0 * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') DSIZE = IBTSIZEOF('D') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( DSIZE * (MEMLEN-I) ) / ( DSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SDRV tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF RDEST = RDEST0(ISO) CDEST = CDEST0(ISO) IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF * IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'RUNNING', $ UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ RDEST, CDEST, NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * source process generates matrix and sends it * IF( MYROW .EQ. RSRC .AND. MYCOL .EQ. CSRC ) THEN CALL DINITMAT( UPLO, DIAG, M, N, MEM, LDASRC, $ IPRE, IPOST, SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL DTRSD2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDASRC, RDEST, CDEST ) ELSE CALL DGESD2D( CONTEXT, M, N, MEM(APTR), $ LDASRC, RDEST, CDEST ) END IF END IF * IF( MYROW .EQ. RDEST .AND. MYCOL .EQ. CDEST ) THEN * * Pad entire matrix area * DO 50 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 50 CONTINUE * * Receive matrix * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL DTRRV2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC ) ELSE CALL DGERV2D( CONTEXT, M, N, MEM(APTR), $ LDADST, RSRC, CSRC ) END IF * * Check for errors in matrix or padding * I = NERR CALL DCHKMAT( UPLO, DIAG, M, N, MEM(APTR), LDADST, $ RSRC, CSRC, MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), MEM(ERRDPTR) ) * CALL DCHKPAD( UPLO, DIAG, M, N, MEM, LDADST, $ RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST, $ RCHECKVAL, TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR) ) TESTOK = I .EQ. NERR END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL DBTCHECKIN( 0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL ) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. I.EQ.NERR ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'PASSED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM, 7000) TESTNUM, 'FAILED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ENDIF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL DBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE PRECISION SDRV TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS UPLO DIA M N LDAS LDAD RSRC ', $ 'CSRC RDEST CDEST P Q') 6000 FORMAT(' ----- ------- ---- --- ----- ----- ----- ----- ---- ', $ '---- ----- ----- ---- ----') 7000 FORMAT(I6,1X,A7,4X,A1,3X,A1,4I6,2I5,2I6,2I5) 8000 FORMAT('DOUBLE PRECISION SDRV TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('DOUBLE PRECISION SDRV TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of DSDRVTEST. * END * * SUBROUTINE CSDRVTEST( OUTNUM, VERB, NSHAPE, UPLO0, DIAG0, $ NMAT, M0, N0, LDAS0, LDAD0, NSRC, RSRC0, $ CSRC0, RDEST0, CDEST0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSHAPE, NMAT, NSRC, NGRID, MEMLEN * .. * .. Array Arguments .. CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), RDEST0(NSRC), CDEST0(NSRC) INTEGER CONTEXT0(NGRID), P0(NGRID), Q0(NGRID), TFAIL(*) COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * CTESTSDRV: Test complex send/recv * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * RDEST0 (input) INTEGER array of dimension (NNSRC) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNSRC) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL CTRSD2D, CGESD2D, CTRRV2D, CGERV2D EXTERNAL CINITMAT, CCHKMAT, CCHKPAD, CBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 UPLO, DIAG LOGICAL TESTOK INTEGER IAM, I, K, IGR, ISH, IMA, ISO, MYROW, MYCOL, IPRE, IPOST INTEGER M, N, NPROW, NPCOL, RSRC, CSRC, RDEST, CDEST INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE, CSIZE COMPLEX SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = CMPLX( -0.01, -0.01 ) RCHECKVAL = CMPLX( -0.02, -0.02 ) * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') CSIZE = IBTSIZEOF('C') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( CSIZE * (MEMLEN-I) ) / ( CSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SDRV tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF RDEST = RDEST0(ISO) CDEST = CDEST0(ISO) IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF * IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'RUNNING', $ UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ RDEST, CDEST, NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * source process generates matrix and sends it * IF( MYROW .EQ. RSRC .AND. MYCOL .EQ. CSRC ) THEN CALL CINITMAT( UPLO, DIAG, M, N, MEM, LDASRC, $ IPRE, IPOST, SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL CTRSD2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDASRC, RDEST, CDEST ) ELSE CALL CGESD2D( CONTEXT, M, N, MEM(APTR), $ LDASRC, RDEST, CDEST ) END IF END IF * IF( MYROW .EQ. RDEST .AND. MYCOL .EQ. CDEST ) THEN * * Pad entire matrix area * DO 50 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 50 CONTINUE * * Receive matrix * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL CTRRV2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC ) ELSE CALL CGERV2D( CONTEXT, M, N, MEM(APTR), $ LDADST, RSRC, CSRC ) END IF * * Check for errors in matrix or padding * I = NERR CALL CCHKMAT( UPLO, DIAG, M, N, MEM(APTR), LDADST, $ RSRC, CSRC, MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), MEM(ERRDPTR) ) * CALL CCHKPAD( UPLO, DIAG, M, N, MEM, LDADST, $ RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST, $ RCHECKVAL, TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR) ) TESTOK = I .EQ. NERR END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL CBTCHECKIN( 0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL ) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. I.EQ.NERR ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'PASSED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM, 7000) TESTNUM, 'FAILED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ENDIF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL CBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('COMPLEX SDRV TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS UPLO DIA M N LDAS LDAD RSRC ', $ 'CSRC RDEST CDEST P Q') 6000 FORMAT(' ----- ------- ---- --- ----- ----- ----- ----- ---- ', $ '---- ----- ----- ---- ----') 7000 FORMAT(I6,1X,A7,4X,A1,3X,A1,4I6,2I5,2I6,2I5) 8000 FORMAT('COMPLEX SDRV TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('COMPLEX SDRV TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of CSDRVTEST. * END * * SUBROUTINE ZSDRVTEST( OUTNUM, VERB, NSHAPE, UPLO0, DIAG0, $ NMAT, M0, N0, LDAS0, LDAD0, NSRC, RSRC0, $ CSRC0, RDEST0, CDEST0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSHAPE, NMAT, NSRC, NGRID, MEMLEN * .. * .. Array Arguments .. CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), RDEST0(NSRC), CDEST0(NSRC) INTEGER CONTEXT0(NGRID), P0(NGRID), Q0(NGRID), TFAIL(*) DOUBLE COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * ZTESTSDRV: Test double complex send/recv * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * RDEST0 (input) INTEGER array of dimension (NNSRC) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNSRC) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) DOUBLE COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL ZTRSD2D, ZGESD2D, ZTRRV2D, ZGERV2D EXTERNAL ZINITMAT, ZCHKMAT, ZCHKPAD, ZBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 UPLO, DIAG LOGICAL TESTOK INTEGER IAM, I, K, IGR, ISH, IMA, ISO, MYROW, MYCOL, IPRE, IPOST INTEGER M, N, NPROW, NPCOL, RSRC, CSRC, RDEST, CDEST INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE, ZSIZE DOUBLE COMPLEX SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = DCMPLX( -0.01D0, -0.01D0 ) RCHECKVAL = DCMPLX( -0.02D0, -0.02D0 ) * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') ZSIZE = IBTSIZEOF('Z') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( ZSIZE * (MEMLEN-I) ) / ( ZSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SDRV tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF RDEST = RDEST0(ISO) CDEST = CDEST0(ISO) IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF * IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'RUNNING', $ UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ RDEST, CDEST, NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * source process generates matrix and sends it * IF( MYROW .EQ. RSRC .AND. MYCOL .EQ. CSRC ) THEN CALL ZINITMAT( UPLO, DIAG, M, N, MEM, LDASRC, $ IPRE, IPOST, SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL ZTRSD2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDASRC, RDEST, CDEST ) ELSE CALL ZGESD2D( CONTEXT, M, N, MEM(APTR), $ LDASRC, RDEST, CDEST ) END IF END IF * IF( MYROW .EQ. RDEST .AND. MYCOL .EQ. CDEST ) THEN * * Pad entire matrix area * DO 50 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 50 CONTINUE * * Receive matrix * IF( UPLO .EQ. 'U' .OR. UPLO .EQ. 'L' ) THEN CALL ZTRRV2D( CONTEXT, UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC ) ELSE CALL ZGERV2D( CONTEXT, M, N, MEM(APTR), $ LDADST, RSRC, CSRC ) END IF * * Check for errors in matrix or padding * I = NERR CALL ZCHKMAT( UPLO, DIAG, M, N, MEM(APTR), LDADST, $ RSRC, CSRC, MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), MEM(ERRDPTR) ) * CALL ZCHKPAD( UPLO, DIAG, M, N, MEM, LDADST, $ RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST, $ RCHECKVAL, TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR) ) TESTOK = I .EQ. NERR END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL ZBTCHECKIN( 0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL ) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. I.EQ.NERR ) THEN WRITE(OUTNUM, 7000) TESTNUM, 'PASSED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM, 7000) TESTNUM, 'FAILED ', $ UPLO, DIAG, M, N, LDASRC, LDADST, $ RSRC, CSRC, RDEST, CDEST, NPROW, NPCOL ENDIF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL ZBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE COMPLEX SDRV TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS UPLO DIA M N LDAS LDAD RSRC ', $ 'CSRC RDEST CDEST P Q') 6000 FORMAT(' ----- ------- ---- --- ----- ----- ----- ----- ---- ', $ '---- ----- ----- ---- ----') 7000 FORMAT(I6,1X,A7,4X,A1,3X,A1,4I6,2I5,2I6,2I5) 8000 FORMAT('DOUBLE COMPLEX SDRV TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('DOUBLE COMPLEX SDRV TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of ZSDRVTEST. * END * * SUBROUTINE IBSBRTEST( OUTNUM, VERB, NSCOPE, SCOPE0, NTOP, TOP0, $ NSHAPE, UPLO0, DIAG0, NMAT, M0, N0, LDAS0, $ LDAD0, NSRC, RSRC0, CSRC0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSCOPE, NTOP, NSHAPE, NMAT, NSRC, NGRID INTEGER MEMLEN * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), TFAIL(*) INTEGER MEM(MEMLEN) * .. * * Purpose * ======= * ITESTBSBR: Test integer broadcast * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) INTEGER array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL ITRBS2D, IGEBS2D, ITRBR2D, IGEBR2D EXTERNAL IINITMAT, ICHKMAT, ICHKPAD, IBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP, UPLO, DIAG LOGICAL TESTOK, INGRID INTEGER IAM, I, K, J, IGR, ISH, IMA, ISO, ISC, ITO INTEGER M, N, NPROW, NPCOL, MYROW, MYCOL, RSRC, CSRC INTEGER ISTART, ISTOP, IPRE, IPOST, SETWHAT INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE INTEGER SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = -1 RCHECKVAL = -2 * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') ISIZE = IBTSIZEOF('I') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( ISIZE * (MEMLEN-I) ) / ( ISIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run BSBR tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * INGRID = ( NPROW .GT. 0 ) * DO 100 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 90 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multipath ('M') or general tree ('T'), * need to loop over calls to BLACS_SET * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 11 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 12 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) $ TESTNUM, 'RUNNING',SCOPE, TOP, UPLO, DIAG, $ M, N, LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * If I am in scope * IF( (MYROW.EQ.RSRC .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CSRC .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * * source process generates matrix and sends it * IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN CALL IINITMAT(UPLO, DIAG, M, N, MEM, $ LDASRC, IPRE, IPOST, $ SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * DO 20 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 20 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL ITRBS2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDASRC ) ELSE CALL IGEBS2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDASRC ) END IF 20 CONTINUE * * Destination processes * ELSE IF( INGRID ) THEN DO 40 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 40 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * Pad entire matrix area * DO 30 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 30 CONTINUE * * Receive matrix * IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL ITRBR2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDADST, $ RSRC, CSRC) ELSE CALL IGEBR2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDADST, RSRC, CSRC) END IF * * Check for errors in matrix or padding * I = NERR CALL ICHKMAT(UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), $ MEM(ERRDPTR)) * CALL ICHKPAD(UPLO, DIAG, M, N, MEM, $ LDADST, RSRC, CSRC, MYROW, $ MYCOL, IPRE, IPOST, RCHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) 40 CONTINUE TESTOK = ( I .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL IBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL) IF( IAM .EQ. 0 ) THEN TESTOK = ( TESTOK .AND. (I.EQ.NERR) ) IF( TESTOK ) THEN WRITE(OUTNUM,7000)TESTNUM,'PASSED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,7000)TESTNUM,'FAILED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE 100 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL IBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('INTEGER BSBR TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS SCOPE TOP UPLO DIAG M N LDAS ', $ ' LDAD RSRC CSRC P Q') 6000 FORMAT(' ----- ------- ----- --- ---- ---- ----- ----- ----- ', $ '----- ---- ---- ---- ----') 7000 FORMAT(I6,1X,A7,5X,A1,3X,A1,2(4X,A1), 4I6, 4I5) 8000 FORMAT('INTEGER BSBR TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('INTEGER BSBR TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of IBSBRTEST. * END * * SUBROUTINE SBSBRTEST( OUTNUM, VERB, NSCOPE, SCOPE0, NTOP, TOP0, $ NSHAPE, UPLO0, DIAG0, NMAT, M0, N0, LDAS0, $ LDAD0, NSRC, RSRC0, CSRC0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSCOPE, NTOP, NSHAPE, NMAT, NSRC, NGRID INTEGER MEMLEN * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), TFAIL(*) REAL MEM(MEMLEN) * .. * * Purpose * ======= * STESTBSBR: Test real broadcast * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) REAL array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL STRBS2D, SGEBS2D, STRBR2D, SGEBR2D EXTERNAL SINITMAT, SCHKMAT, SCHKPAD, SBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP, UPLO, DIAG LOGICAL TESTOK, INGRID INTEGER IAM, I, K, J, IGR, ISH, IMA, ISO, ISC, ITO INTEGER M, N, NPROW, NPCOL, MYROW, MYCOL, RSRC, CSRC INTEGER ISTART, ISTOP, IPRE, IPOST, SETWHAT INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE, SSIZE REAL SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = -0.01E0 RCHECKVAL = -0.02E0 * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') SSIZE = IBTSIZEOF('S') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( SSIZE * (MEMLEN-I) ) / ( SSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run BSBR tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * INGRID = ( NPROW .GT. 0 ) * DO 100 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 90 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multipath ('M') or general tree ('T'), * need to loop over calls to BLACS_SET * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 11 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 12 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) $ TESTNUM, 'RUNNING',SCOPE, TOP, UPLO, DIAG, $ M, N, LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * If I am in scope * IF( (MYROW.EQ.RSRC .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CSRC .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * * source process generates matrix and sends it * IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN CALL SINITMAT(UPLO, DIAG, M, N, MEM, $ LDASRC, IPRE, IPOST, $ SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * DO 20 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 20 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL STRBS2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDASRC ) ELSE CALL SGEBS2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDASRC ) END IF 20 CONTINUE * * Destination processes * ELSE IF( INGRID ) THEN DO 40 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 40 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * Pad entire matrix area * DO 30 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 30 CONTINUE * * Receive matrix * IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL STRBR2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDADST, $ RSRC, CSRC) ELSE CALL SGEBR2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDADST, RSRC, CSRC) END IF * * Check for errors in matrix or padding * I = NERR CALL SCHKMAT(UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), $ MEM(ERRDPTR)) * CALL SCHKPAD(UPLO, DIAG, M, N, MEM, $ LDADST, RSRC, CSRC, MYROW, $ MYCOL, IPRE, IPOST, RCHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) 40 CONTINUE TESTOK = ( I .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL SBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL) IF( IAM .EQ. 0 ) THEN TESTOK = ( TESTOK .AND. (I.EQ.NERR) ) IF( TESTOK ) THEN WRITE(OUTNUM,7000)TESTNUM,'PASSED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,7000)TESTNUM,'FAILED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE 100 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL SBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('REAL BSBR TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS SCOPE TOP UPLO DIAG M N LDAS ', $ ' LDAD RSRC CSRC P Q') 6000 FORMAT(' ----- ------- ----- --- ---- ---- ----- ----- ----- ', $ '----- ---- ---- ---- ----') 7000 FORMAT(I6,1X,A7,5X,A1,3X,A1,2(4X,A1), 4I6, 4I5) 8000 FORMAT('REAL BSBR TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('REAL BSBR TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of SBSBRTEST. * END * * SUBROUTINE DBSBRTEST( OUTNUM, VERB, NSCOPE, SCOPE0, NTOP, TOP0, $ NSHAPE, UPLO0, DIAG0, NMAT, M0, N0, LDAS0, $ LDAD0, NSRC, RSRC0, CSRC0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSCOPE, NTOP, NSHAPE, NMAT, NSRC, NGRID INTEGER MEMLEN * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), TFAIL(*) DOUBLE PRECISION MEM(MEMLEN) * .. * * Purpose * ======= * DTESTBSBR: Test double precision broadcast * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) DOUBLE PRECISION array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL DTRBS2D, DGEBS2D, DTRBR2D, DGEBR2D EXTERNAL DINITMAT, DCHKMAT, DCHKPAD, DBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP, UPLO, DIAG LOGICAL TESTOK, INGRID INTEGER IAM, I, K, J, IGR, ISH, IMA, ISO, ISC, ITO INTEGER M, N, NPROW, NPCOL, MYROW, MYCOL, RSRC, CSRC INTEGER ISTART, ISTOP, IPRE, IPOST, SETWHAT INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE, DSIZE DOUBLE PRECISION SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = -0.01D0 RCHECKVAL = -0.02D0 * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') DSIZE = IBTSIZEOF('D') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( DSIZE * (MEMLEN-I) ) / ( DSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run BSBR tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * INGRID = ( NPROW .GT. 0 ) * DO 100 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 90 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multipath ('M') or general tree ('T'), * need to loop over calls to BLACS_SET * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 11 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 12 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) $ TESTNUM, 'RUNNING',SCOPE, TOP, UPLO, DIAG, $ M, N, LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * If I am in scope * IF( (MYROW.EQ.RSRC .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CSRC .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * * source process generates matrix and sends it * IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN CALL DINITMAT(UPLO, DIAG, M, N, MEM, $ LDASRC, IPRE, IPOST, $ SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * DO 20 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 20 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL DTRBS2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDASRC ) ELSE CALL DGEBS2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDASRC ) END IF 20 CONTINUE * * Destination processes * ELSE IF( INGRID ) THEN DO 40 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 40 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * Pad entire matrix area * DO 30 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 30 CONTINUE * * Receive matrix * IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL DTRBR2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDADST, $ RSRC, CSRC) ELSE CALL DGEBR2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDADST, RSRC, CSRC) END IF * * Check for errors in matrix or padding * I = NERR CALL DCHKMAT(UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), $ MEM(ERRDPTR)) * CALL DCHKPAD(UPLO, DIAG, M, N, MEM, $ LDADST, RSRC, CSRC, MYROW, $ MYCOL, IPRE, IPOST, RCHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) 40 CONTINUE TESTOK = ( I .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL DBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL) IF( IAM .EQ. 0 ) THEN TESTOK = ( TESTOK .AND. (I.EQ.NERR) ) IF( TESTOK ) THEN WRITE(OUTNUM,7000)TESTNUM,'PASSED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,7000)TESTNUM,'FAILED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE 100 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL DBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE PRECISION BSBR TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS SCOPE TOP UPLO DIAG M N LDAS ', $ ' LDAD RSRC CSRC P Q') 6000 FORMAT(' ----- ------- ----- --- ---- ---- ----- ----- ----- ', $ '----- ---- ---- ---- ----') 7000 FORMAT(I6,1X,A7,5X,A1,3X,A1,2(4X,A1), 4I6, 4I5) 8000 FORMAT('DOUBLE PRECISION BSBR TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('DOUBLE PRECISION BSBR TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of DBSBRTEST. * END * * SUBROUTINE CBSBRTEST( OUTNUM, VERB, NSCOPE, SCOPE0, NTOP, TOP0, $ NSHAPE, UPLO0, DIAG0, NMAT, M0, N0, LDAS0, $ LDAD0, NSRC, RSRC0, CSRC0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSCOPE, NTOP, NSHAPE, NMAT, NSRC, NGRID INTEGER MEMLEN * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), TFAIL(*) COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * CTESTBSBR: Test complex broadcast * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL CTRBS2D, CGEBS2D, CTRBR2D, CGEBR2D EXTERNAL CINITMAT, CCHKMAT, CCHKPAD, CBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP, UPLO, DIAG LOGICAL TESTOK, INGRID INTEGER IAM, I, K, J, IGR, ISH, IMA, ISO, ISC, ITO INTEGER M, N, NPROW, NPCOL, MYROW, MYCOL, RSRC, CSRC INTEGER ISTART, ISTOP, IPRE, IPOST, SETWHAT INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE, CSIZE COMPLEX SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = CMPLX( -0.01, -0.01 ) RCHECKVAL = CMPLX( -0.02, -0.02 ) * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') CSIZE = IBTSIZEOF('C') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( CSIZE * (MEMLEN-I) ) / ( CSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run BSBR tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * INGRID = ( NPROW .GT. 0 ) * DO 100 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 90 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multipath ('M') or general tree ('T'), * need to loop over calls to BLACS_SET * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 11 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 12 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) $ TESTNUM, 'RUNNING',SCOPE, TOP, UPLO, DIAG, $ M, N, LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * If I am in scope * IF( (MYROW.EQ.RSRC .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CSRC .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * * source process generates matrix and sends it * IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN CALL CINITMAT(UPLO, DIAG, M, N, MEM, $ LDASRC, IPRE, IPOST, $ SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * DO 20 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 20 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL CTRBS2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDASRC ) ELSE CALL CGEBS2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDASRC ) END IF 20 CONTINUE * * Destination processes * ELSE IF( INGRID ) THEN DO 40 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 40 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * Pad entire matrix area * DO 30 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 30 CONTINUE * * Receive matrix * IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL CTRBR2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDADST, $ RSRC, CSRC) ELSE CALL CGEBR2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDADST, RSRC, CSRC) END IF * * Check for errors in matrix or padding * I = NERR CALL CCHKMAT(UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), $ MEM(ERRDPTR)) * CALL CCHKPAD(UPLO, DIAG, M, N, MEM, $ LDADST, RSRC, CSRC, MYROW, $ MYCOL, IPRE, IPOST, RCHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) 40 CONTINUE TESTOK = ( I .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL CBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL) IF( IAM .EQ. 0 ) THEN TESTOK = ( TESTOK .AND. (I.EQ.NERR) ) IF( TESTOK ) THEN WRITE(OUTNUM,7000)TESTNUM,'PASSED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,7000)TESTNUM,'FAILED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE 100 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL CBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('COMPLEX BSBR TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS SCOPE TOP UPLO DIAG M N LDAS ', $ ' LDAD RSRC CSRC P Q') 6000 FORMAT(' ----- ------- ----- --- ---- ---- ----- ----- ----- ', $ '----- ---- ---- ---- ----') 7000 FORMAT(I6,1X,A7,5X,A1,3X,A1,2(4X,A1), 4I6, 4I5) 8000 FORMAT('COMPLEX BSBR TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('COMPLEX BSBR TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of CBSBRTEST. * END * * SUBROUTINE ZBSBRTEST( OUTNUM, VERB, NSCOPE, SCOPE0, NTOP, TOP0, $ NSHAPE, UPLO0, DIAG0, NMAT, M0, N0, LDAS0, $ LDAD0, NSRC, RSRC0, CSRC0, NGRID, CONTEXT0, $ P0, Q0, TFAIL, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER OUTNUM, VERB, NSCOPE, NTOP, NSHAPE, NMAT, NSRC, NGRID INTEGER MEMLEN * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) CHARACTER*1 UPLO0(NSHAPE), DIAG0(NSHAPE) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RSRC0(NSRC), CSRC0(NSRC), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), TFAIL(*) DOUBLE COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * ZTESTBSBR: Test double complex broadcast * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NSHAPE (input) INTEGER * The number of matrix shapes to be tested. * * UPLO0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of UPLO to be tested. * * DIAG0 (input) CHARACTER*1 array of dimension (NSHAPE) * Values of DIAG to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NSRC (input) INTEGER * The number of sources to be tested. * * RSRC0 (input) INTEGER array of dimension (NDEST) * Values of RSRC (row coordinate of source) to be tested. * * CSRC0 (input) INTEGER array of dimension (NDEST) * Values of CSRC (column coordinate of source) to be tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * TFAIL (workspace) INTEGER array of dimension (NTESTS) * If VERB < 2, serves to indicate which tests fail. This * requires workspace of NTESTS (number of tests performed). * * MEM (workspace) DOUBLE COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO EXTERNAL ZTRBS2D, ZGEBS2D, ZTRBR2D, ZGEBR2D EXTERNAL ZINITMAT, ZCHKMAT, ZCHKPAD, ZBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP, UPLO, DIAG LOGICAL TESTOK, INGRID INTEGER IAM, I, K, J, IGR, ISH, IMA, ISO, ISC, ITO INTEGER M, N, NPROW, NPCOL, MYROW, MYCOL, RSRC, CSRC INTEGER ISTART, ISTOP, IPRE, IPOST, SETWHAT INTEGER NERR, NSKIP, NFAIL, TESTNUM, CONTEXT, MAXERR, LDASRC INTEGER LDADST, ERRDPTR, APTR, ERRIPTR, ISIZE, ZSIZE DOUBLE COMPLEX SCHECKVAL, RCHECKVAL * .. * .. Executable Statements .. * SCHECKVAL = DCMPLX( -0.01D0, -0.01D0 ) RCHECKVAL = DCMPLX( -0.02D0, -0.02D0 ) * IAM = IBTMYPROC() ISIZE = IBTSIZEOF('I') ZSIZE = IBTSIZEOF('Z') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NSHAPE:', NSHAPE WRITE(OUTNUM, 3000) ' UPLO :', ( UPLO0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 3000) ' DIAG :', ( DIAG0(I), I = 1, NSHAPE ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NSRC :', NSRC WRITE(OUTNUM, 2000) ' RSRC :',( RSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) ' CSRC :',( CSRC0(I), I = 1, NSRC ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,5000) WRITE(OUTNUM,6000) END IF END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + 4 * M0(IMA) IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( ZSIZE * (MEMLEN-I) ) / ( ZSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run BSBR tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 110 IGR = 1, NGRID * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) * INGRID = ( NPROW .GT. 0 ) * DO 100 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 90 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multipath ('M') or general tree ('T'), * need to loop over calls to BLACS_SET * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 11 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 12 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 80 ISH = 1, NSHAPE UPLO = UPLO0(ISH) DIAG = DIAG0(ISH) * DO 70 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) * DO 60 ISO = 1, NSRC TESTNUM = TESTNUM + 1 RSRC = RSRC0(ISO) CSRC = CSRC0(ISO) IF( RSRC.GE.P0(IGR) .OR. CSRC.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 60 END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 7000) $ TESTNUM, 'RUNNING',SCOPE, TOP, UPLO, DIAG, $ M, N, LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * TESTOK = .TRUE. IPRE = 2 * M IPOST = IPRE APTR = IPRE + 1 * * If I am in scope * IF( (MYROW.EQ.RSRC .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CSRC .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * * source process generates matrix and sends it * IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN CALL ZINITMAT(UPLO, DIAG, M, N, MEM, $ LDASRC, IPRE, IPOST, $ SCHECKVAL, TESTNUM, $ MYROW, MYCOL ) * DO 20 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 20 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL ZTRBS2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDASRC ) ELSE CALL ZGEBS2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDASRC ) END IF 20 CONTINUE * * Destination processes * ELSE IF( INGRID ) THEN DO 40 J = ISTART, ISTOP IF( J.EQ.0 ) GOTO 40 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * Pad entire matrix area * DO 30 K = 1, IPRE+IPOST+LDADST*N MEM(K) = RCHECKVAL 30 CONTINUE * * Receive matrix * IF( UPLO.EQ.'U' .OR. UPLO.EQ.'L' ) THEN CALL ZTRBR2D(CONTEXT, SCOPE, TOP, $ UPLO, DIAG, M, N, $ MEM(APTR), LDADST, $ RSRC, CSRC) ELSE CALL ZGEBR2D(CONTEXT, SCOPE, TOP, $ M, N, MEM(APTR), $ LDADST, RSRC, CSRC) END IF * * Check for errors in matrix or padding * I = NERR CALL ZCHKMAT(UPLO, DIAG, M, N, $ MEM(APTR), LDADST, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, $ NERR, MEM(ERRIPTR), $ MEM(ERRDPTR)) * CALL ZCHKPAD(UPLO, DIAG, M, N, MEM, $ LDADST, RSRC, CSRC, MYROW, $ MYCOL, IPRE, IPOST, RCHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) 40 CONTINUE TESTOK = ( I .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL ZBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), $ TFAIL) IF( IAM .EQ. 0 ) THEN TESTOK = ( TESTOK .AND. (I.EQ.NERR) ) IF( TESTOK ) THEN WRITE(OUTNUM,7000)TESTNUM,'PASSED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,7000)TESTNUM,'FAILED ', $ SCOPE, TOP, UPLO, DIAG, M, N, $ LDASRC, LDADST, RSRC, CSRC, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE 100 CONTINUE 110 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL ZBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), TFAIL ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 8000 ) TESTNUM ELSE WRITE(OUTNUM, 9000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE COMPLEX BSBR TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 5000 FORMAT(' TEST# STATUS SCOPE TOP UPLO DIAG M N LDAS ', $ ' LDAD RSRC CSRC P Q') 6000 FORMAT(' ----- ------- ----- --- ---- ---- ----- ----- ----- ', $ '----- ---- ---- ---- ----') 7000 FORMAT(I6,1X,A7,5X,A1,3X,A1,2(4X,A1), 4I6, 4I5) 8000 FORMAT('DOUBLE COMPLEX BSBR TESTS: PASSED ALL', $ I5, ' TESTS.') 9000 FORMAT('DOUBLE COMPLEX BSBR TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of ZBSBRTEST. * END * * SUBROUTINE RDCOMB( MEMUSED, MEM, MEMLEN, CMEMUSED, CMEM, CMEMLEN, $ OUTNUM ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMUSED, MEMLEN, CMEMUSED, CMEMLEN, OUTNUM * .. * .. Array Arguments .. CHARACTER*1 CMEM(CMEMLEN) INTEGER MEM(MEMLEN) * .. * * Purpose * ======= * RDCOMB: Read and process the input file COMB.dat. * * Arguments * ========= * MEMUSED (output) INTEGER * Number of elements in MEM that this subroutine ends up using. * * MEM (output) INTEGER array of dimension memlen * On output, holds information read in from sdrv.dat. * * MEMLEN (input) INTEGER * Number of elements of MEM that this subroutine * may safely write into. * * CMEMUSED (output) INTEGER * Number of elements in CMEM that this subroutine ends up using. * * CMEM (output) CHARACTER*1 array of dimension cmemlen * On output, holds the values for UPLO and DIAG. * * CMEMLEN (input) INTEGER * Number of elements of CMEM that this subroutine * may safely write into. * * OUTNUM (input) INTEGER * Unit number of the output file. * * ================================================================= * * .. Parameters .. INTEGER SDIN PARAMETER( SDIN = 12 ) * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Local Scalars .. INTEGER TOPSREPEAT, TOPSCOHRNT, NOPS, NSCOPE, NTOP, NMAT, NDEST INTEGER NGRID, I, J, OPPTR, SCOPEPTR, TOPPTR, MPTR, NPTR INTEGER LDSPTR, LDDPTR, LDIPTR, RDESTPTR, CDESTPTR, PPTR, QPTR * .. * .. Executable Statements * * Open and read the file comb.dat. The expected format is * below. * *------ *integer Number of operations *array of CHAR*1's OPs: '+', '>', '<' *integer Number of scopes *array of CHAR*1's Values for Scopes *HAR*1 Repeatability flag ('R', 'N', 'B') *HAR*1 Coherency flag ('C', 'N', 'B') *integer Number of topologies *array of CHAR*1's Values for TOP *integer number of nmat *array of integers M: number of rows in matrix *array of integers N: number of columns in matrix *integer LDA: leading dimension on source proc *integer LDA: leading dimension on dest proc *integer number of source/dest pairs *array of integers RDEST: process row of msg. dest. *array of integers CDEST: process column of msg. dest. *integer Number of grids *array of integers NPROW: number of rows in process grid *array of integers NPCOL: number of col's in proc. grid *------ * note: the text descriptions as shown above are present in * the sample comb.dat included with this distribution, * but are not required. * * Read input file * MEMUSED = 1 CMEMUSED = 1 OPEN(UNIT = SDIN, FILE = 'comb.dat', STATUS = 'OLD') * * Get what operations to test (+, >, <) * READ(SDIN, *) NOPS OPPTR = CMEMUSED CMEMUSED = OPPTR + NOPS IF ( CMEMUSED .GT. CMEMLEN ) THEN WRITE(OUTNUM, 1000) CMEMLEN, NOPS, 'OPERATIONS.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NOPS .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'OPERATIONS.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF * READ(SDIN, *) ( CMEM(OPPTR+I), I = 0, NOPS-1 ) DO 10 I = 0, NOPS-1 IF( (CMEM(OPPTR+I).NE.'+') .AND. (CMEM(OPPTR+I).NE.'>') .AND. $ (CMEM(OPPTR+I).NE.'<') ) THEN WRITE(OUTNUM,5000) CMEM(OPPTR+I) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF 10 CONTINUE * * Read in scopes and topologies * READ(SDIN, *) NSCOPE SCOPEPTR = CMEMUSED CMEMUSED = SCOPEPTR + NSCOPE IF ( CMEMUSED .GT. CMEMLEN ) THEN WRITE(OUTNUM, 1000) CMEMLEN, NSCOPE, 'SCOPES.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NSCOPE .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'SCOPE.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF * READ(SDIN, *) ( CMEM(SCOPEPTR+I), I = 0, NSCOPE-1 ) DO 20 I = 0, NSCOPE-1 IF( LSAME(CMEM(SCOPEPTR+I), 'R') ) THEN CMEM(SCOPEPTR+I) = 'R' ELSE IF( LSAME(CMEM(SCOPEPTR+I), 'C') ) THEN CMEM(SCOPEPTR+I) = 'C' ELSE IF( LSAME(CMEM(SCOPEPTR+I), 'A') ) THEN CMEM(SCOPEPTR+I) = 'A' ELSE WRITE(OUTNUM, 3000) 'SCOPE', CMEM(SCOPEPTR+I) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF 20 CONTINUE * READ(SDIN, *) TOPSREPEAT READ(SDIN, *) TOPSCOHRNT * READ(SDIN, *) NTOP TOPPTR = CMEMUSED CMEMUSED = TOPPTR + NTOP IF ( CMEMUSED .GT. CMEMLEN ) THEN WRITE(OUTNUM, 1000) CMEMLEN, NTOP, 'TOPOLOGIES.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NTOP .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'TOPOLOGY.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF READ(SDIN, *) ( CMEM(TOPPTR+I), I = 0, NTOP-1 ) * * * Read in number of matrices, and values for M, N, LDASRC, and LDADEST * READ(SDIN, *) NMAT MPTR = MEMUSED NPTR = MPTR + NMAT LDSPTR = NPTR + NMAT LDDPTR = LDSPTR + NMAT LDIPTR = LDDPTR + NMAT MEMUSED = LDIPTR + NMAT IF( MEMUSED .GT. MEMLEN ) THEN WRITE(OUTNUM, 1000) MEMLEN, NMAT, 'MATRICES.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NMAT .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'MATRIX.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF READ(SDIN, *) ( MEM( MPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( NPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( LDSPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( LDDPTR+I ), I = 0, NMAT-1 ) READ(SDIN, *) ( MEM( LDIPTR+I ), I = 0, NMAT-1 ) * * Make sure matrix values are legal * CALL CHKMATDAT( OUTNUM, 'COMB.dat', .TRUE., NMAT, MEM(MPTR), $ MEM(NPTR), MEM(LDSPTR), MEM(LDDPTR), MEM(LDIPTR) ) * * Read in number of dest pairs, and values of dest * READ(SDIN, *) NDEST RDESTPTR = MEMUSED CDESTPTR = RDESTPTR + NDEST MEMUSED = CDESTPTR + NDEST IF( MEMUSED .GT. MEMLEN ) THEN WRITE(OUTNUM, 1000) MEMLEN, NMAT, 'DEST.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NDEST .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'DEST.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF READ(SDIN, *) ( MEM(RDESTPTR+I), I = 0, NDEST-1 ) READ(SDIN, *) ( MEM(CDESTPTR+I), I = 0, NDEST-1 ) * * Read in number of grids pairs, and values of P (process rows) and * Q (process columns) * READ(SDIN, *) NGRID PPTR = MEMUSED QPTR = PPTR + NGRID MEMUSED = QPTR + NGRID IF( MEMUSED .GT. MEMLEN ) THEN WRITE(OUTNUM, 1000) MEMLEN, NGRID, 'PROCESS GRIDS.' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP ELSE IF( NGRID .LT. 1 ) THEN WRITE(OUTNUM, 2000) 'PROCESS GRID' IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE( OUTNUM ) STOP END IF * READ(SDIN, *) ( MEM(PPTR+I), I = 0, NGRID-1 ) READ(SDIN, *) ( MEM(QPTR+I), I = 0, NGRID-1 ) IF( SDIN .NE. 6 .AND. SDIN .NE. 0 ) CLOSE( SDIN ) * * Fatal error if we've got an illegal grid * DO 70 J = 0, NGRID-1 IF( MEM(PPTR+J).LT.1 .OR. MEM(QPTR+J).LT.1 ) THEN WRITE(OUTNUM, 4000) MEM(PPTR+J), MEM(QPTR+J) IF( OUTNUM .NE. 6 .AND. OUTNUM .NE. 0 ) CLOSE(OUTNUM) STOP END IF 70 CONTINUE * * Prepare output variables * MEM(MEMUSED) = NOPS MEM(MEMUSED+1) = NSCOPE MEM(MEMUSED+2) = TOPSREPEAT MEM(MEMUSED+3) = TOPSCOHRNT MEM(MEMUSED+4) = NTOP MEM(MEMUSED+5) = NMAT MEM(MEMUSED+6) = NDEST MEM(MEMUSED+7) = NGRID MEMUSED = MEMUSED + 7 CMEMUSED = CMEMUSED - 1 * 1000 FORMAT('Mem too short (',I4,') to handle',I4,' ',A20) 2000 FORMAT('Must have at least one ',A20) 3000 FORMAT('UNRECOGNIZABLE ',A5,' ''', A1, '''.') 4000 FORMAT('Illegal process grid: {',I3,',',I3,'}.') 5000 FORMAT('Illegal OP value ''',A1,''':, expected ''+'' (SUM),', $ ' ''>'' (MAX), or ''<'' (MIN).') * RETURN * * End of RDCOMB. * END * * SUBROUTINE IBTCHECKIN( NFTESTS, OUTNUM, MAXERR, NERR, IERR, $ IVAL, TFAILED ) INTEGER NFTESTS, OUTNUM, MAXERR, NERR INTEGER IERR(*), TFAILED(*) INTEGER IVAL(*) * * Purpose * ======= * IBTCHECKIN: Process 0 receives error report from all processes. * * Arguments * ========= * NFTESTS (input/output) INTEGER * if NFTESTS is <= 0 upon entry, NFTESTS is not written to. * Otherwise, on entry it specifies the total number of tests * run, and on exit it is the number of tests which failed. * * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRIBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (workspace) INTEGER array, dimension NFTESTS * Workspace used to keep track of which tests failed. * If input of NFTESTS < 1, this array not accessed. * * =================================================================== * * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTMSGID EXTERNAL IBTMYPROC, IBTNPROCS, IBTMSGID * .. * .. Local Scalars .. LOGICAL COUNTING INTEGER K, NERR2, IAM, NPROCS, NTESTS * * Proc 0 collects error info from everyone * IAM = IBTMYPROC() NPROCS = IBTNPROCS() * IF( IAM .EQ. 0 ) THEN * * If we are finding out how many failed tests there are, initialize * the total number of tests (NTESTS), and zero the test failed array * COUNTING = NFTESTS .GT. 0 IF( COUNTING ) THEN NTESTS = NFTESTS DO 10 K = 1, NTESTS TFAILED(K) = 0 10 CONTINUE END IF * CALL IPRINTERRS(OUTNUM, MAXERR, NERR, IERR, IVAL, COUNTING, $ TFAILED) * DO 20 K = 1, NPROCS-1 CALL BTSEND(3, 0, K, K, IBTMSGID()+50) CALL BTRECV(3, 1, NERR2, K, IBTMSGID()+50) IF( NERR2 .GT. 0 ) THEN NERR = NERR + NERR2 CALL BTRECV(3, NERR2*6, IERR, K, IBTMSGID()+51) CALL BTRECV(3, NERR2*2, IVAL, K, IBTMSGID()+51) CALL IPRINTERRS(OUTNUM, MAXERR, NERR2, IERR, IVAL, $ COUNTING, TFAILED) END IF 20 CONTINUE * * Count up number of tests that failed * IF( COUNTING ) THEN NFTESTS = 0 DO 30 K = 1, NTESTS NFTESTS = NFTESTS + TFAILED(K) 30 CONTINUE END IF * * Send my error info to proc 0 * ELSE CALL BTRECV(3, 0, K, 0, IBTMSGID()+50) CALL BTSEND(3, 1, NERR, 0, IBTMSGID()+50) IF( NERR .GT. 0 ) THEN CALL BTSEND(3, NERR*6, IERR, 0, IBTMSGID()+51) CALL BTSEND(3, NERR*2, IVAL, 0, IBTMSGID()+51) END IF ENDIF * RETURN * * End of IBTCHECKIN * END * SUBROUTINE IINITMAT(UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, MYROW, MYCOL) CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST, TESTNUM, MYROW, MYCOL INTEGER CHECKVAL INTEGER MEM(*) * * .. External Subroutines .. EXTERNAL IGENMAT, IPADMAT * .. * .. Executable Statements .. * CALL IGENMAT( M, N, MEM(IPRE+1), LDA, TESTNUM, MYROW, MYCOL ) CALL IPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, CHECKVAL ) * RETURN END * SUBROUTINE IGENMAT( M, N, A, LDA, TESTNUM, MYROW, MYCOL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER M, N, LDA, TESTNUM, MYROW, MYCOL * .. * .. Array Arguments .. INTEGER A(LDA,N) * .. * * Purpose * ======= * IGENMAT: Generates an M-by-N matrix filled with random elements. * * Arguments * ========= * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (output) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * TESTNUM (input) INTEGER * Unique number for this test case, used as a basis for * the random seeds. * * ==================================================================== * * .. External Functions .. INTEGER IBTNPROCS INTEGER IBTRAN EXTERNAL IBTRAN, IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, NPROCS, SRC * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. Executable Statements .. * * ISEED's four values must be positive integers less than 4096, * fourth one has to be odd. (see _LARND). Use some goofy * functions to come up with seed values which together should * be unique. * NPROCS = IBTNPROCS() SRC = MYROW * NPROCS + MYCOL ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * DO 10 J = 1, N DO 10 I = 1, M A(I, J) = IBTRAN( ISEED ) 10 CONTINUE * RETURN * * End of IGENMAT. * END * INTEGER FUNCTION IBTRAN(ISEED) INTEGER ISEED(*) * * .. External Functions .. DOUBLE PRECISION DLARND EXTERNAL DLARND * .. * .. Local Scalars .. DOUBLE PRECISION DVAL * .. * .. Executable Statements .. * DVAL = 1.0D6 * DLARND(2, ISEED) IBTRAN = INT(DVAL) * RETURN * * End of Ibtran * END * SUBROUTINE IPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST INTEGER CHECKVAL * .. * .. Array Arguments .. INTEGER MEM( * ) * .. * * Purpose * ======= * * IPADMAT: Pad Matrix. * This routines surrounds a matrix with a guardzone initialized to the * value CHECKVAL. There are three distinct guardzones: * - A contiguous zone of size IPRE immediately before the start * of the matrix. * - A contiguous zone of size IPOST immedately after the end of the * matrix. * - Interstitial zones within each column of the matrix, in the * elements A( M+1:LDA, J ). * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (output) integer array, dimension (IPRE+IPOST+LDA*N) * The address IPRE elements ahead of the matrix A you want to * pad, which is then of dimension (LDA,N). * * IPRE (input) INTEGER * The size of the guard zone ahead of the matrix A. * * IPOST (input) INTEGER * The size of the guard zone behind the matrix A. * * CHECKVAL (input) integer * The value to insert into the guard zones. * * ==================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE MEM( I ) = CHECKVAL 10 CONTINUE END IF * * Put check buffer in back of A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 MEM( I ) = CHECKVAL 20 CONTINUE END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K+LDA-M-1 MEM( I ) = CHECKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * * If the matrix is upper or lower trapezoidal, calculate the * additional triangular area which needs to be padded, Each * element referred to is in the Ith row and the Jth column. * IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 41 I = 1, M DO 42 J = 1, I K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 42 CONTINUE 41 CONTINUE ELSE DO 43 I = 2, M DO 44 J = 1, I-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 44 CONTINUE 43 CONTINUE END IF ELSE IF( DIAG .EQ. 'U' ) THEN DO 45 I = M-N+1, M DO 46 J = 1, I-(M-N) K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 46 CONTINUE 45 CONTINUE ELSE DO 47 I = M-N+2, M DO 48 J = 1, I-(M-N)-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 48 CONTINUE 47 CONTINUE END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 49 I = 1, M DO 50 J = N-M+I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 50 CONTINUE 49 CONTINUE ELSE DO 51 I = 1, M-1 DO 52 J = N-M+I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 52 CONTINUE 51 CONTINUE END IF ELSE IF( UPLO .EQ. 'U' ) THEN DO 53 I = 1, N DO 54 J = I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 54 CONTINUE 53 CONTINUE ELSE DO 55 I = 1, N-1 DO 56 J = I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 56 CONTINUE 55 CONTINUE END IF END IF END IF * * End of IPADMAT. * RETURN END * SUBROUTINE ICHKPAD( UPLO, DIAG, M, N, MEM, LDA, RSRC, CSRC, $ MYROW, MYCOL, IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST INTEGER TESTNUM, MAXERR, NERR INTEGER CHECKVAL * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) INTEGER MEM(*), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * ICHKPAD: Check padding put in by PADMAT. * Checks that padding around target matrix has not been overwritten * by the previous point-to-point or broadcast send. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (input) integer array, dimension(IPRE+IPOST+LDA*N). * Memory location IPRE elements in front of the matrix A. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * IPRE (input) INTEGER * The size of the guard zone before the start of A. * * IPOST (input) INTEGER * The size of guard zone after A. * * CHECKVAL (input) integer * The value to pad matrix with. * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRIBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. LOGICAL ISTRAP INTEGER I, J, K, IRST, IRND, ICST, ICND, SRC, DEST INTEGER NPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = ERR_PRE ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 10 CONTINUE END IF * * Check buffer behind A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I - J + 1 ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_POST ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 20 CONTINUE END IF * * Check all (LDA-M) gaps * IF( LDA .GT. M ) THEN DO 40 J = 1, N DO 30 I = M+1, LDA K = IPRE + (J-1)*LDA + I IF( MEM(K) .NE. CHECKVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_GAP ERRDBUF(1, NERR) = MEM(K) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 30 CONTINUE 40 CONTINUE END IF * * Determine limits of trapezoidal matrix * ISTRAP = .FALSE. IF( UPLO .EQ. 'U' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 2 IRND = M ICST = 1 ICND = M - 1 ELSEIF( M .GT. N ) THEN IRST = ( M-N ) + 2 IRND = M ICST = 1 ICND = N - 1 ENDIF IF( DIAG .EQ. 'U' ) THEN IRST = IRST - 1 ICND = ICND + 1 ENDIF ELSE IF( UPLO .EQ. 'L' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 1 IRND = 1 ICST = ( N-M ) + 2 ICND = N ELSEIF( M .GT. N ) THEN IRST = 1 IRND = 1 ICST = 2 ICND = N ENDIF IF( DIAG .EQ. 'U' ) THEN ICST = ICST - 1 ENDIF ENDIF * * Check elements and report any errors * IF( ISTRAP ) THEN DO 100 J = ICST, ICND DO 105 I = IRST, IRND IF( MEM( IPRE + (J-1)*LDA + I ) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_TRI ERRDBUF(1, NERR) = MEM( IPRE + (J-1)*LDA + I ) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 105 CONTINUE * * Update the limits to allow filling in padding * IF( UPLO .EQ. 'U' ) THEN IRST = IRST + 1 ELSE IRND = IRND + 1 ENDIF 100 CONTINUE END IF * RETURN * * End of ICHKPAD. * END * SUBROUTINE ICHKMAT( UPLO, DIAG, M, N, A, LDA, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) INTEGER A(LDA,N), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * iCHKMAT: Check matrix to see whether there were any transmission * errors. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRIBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Local Scalars .. INTEGER I, J, NPROCS, SRC, DEST LOGICAL USEIT INTEGER COMPVAL * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. External Functions .. INTEGER IBTNPROCS INTEGER IBTRAN EXTERNAL IBTRAN, IBTNPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Initialize ISEED with the same values as used in IGENMAT. * ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * * Generate the elements randomly with the same method used in GENMAT. * Note that for trapezoidal matrices, we generate all elements in the * enclosing rectangle and then ignore the complementary triangle. * DO 100 J = 1, N DO 105 I = 1, M COMPVAL = IBTRAN( ISEED ) * * Now determine whether we actually need this value. The * strategy is to chop out the proper triangle based on what * particular kind of trapezoidal matrix we're dealing with. * USEIT = .TRUE. IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( I .GE. J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. J ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( I .GE. M-N+J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. M-N+J ) THEN USEIT = .FALSE. END IF END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( J. GE. I+(N-M) ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I+(N-M) ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( J .GE. I ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I ) THEN USEIT = .FALSE. END IF END IF END IF END IF * * Compare the generated value to the one that's in the * received matrix. If they don't match, tack another * error record onto what's already there. * IF( USEIT ) THEN IF( A(I,J) .NE. COMPVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I, J) ERRDBUF(2, NERR) = COMPVAL END IF END IF END IF 105 CONTINUE 100 CONTINUE RETURN * * End of ICHKMAT. * END * SUBROUTINE IPRINTERRS( OUTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF, COUNTING, TFAILED ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. LOGICAL COUNTING INTEGER OUTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), TFAILED(*) INTEGER ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * IPRINTERRS: Print errors that have been recorded * * Arguments * ========= * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRIBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (input/ourput) INTEGER array, dimension NTESTS * Workspace used to keep track of which tests failed. * This array not accessed unless COUNTING is true. * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS EXTERNAL IBTMYPROC, IBTNPROCS * .. * .. Local Scalars .. CHARACTER*1 MAT LOGICAL MATISINT INTEGER OLDTEST, NPROCS, PROW, PCOL, I, ERRTYPE * .. * .. Executable Statements .. * IF( (IBTMYPROC().NE.0) .OR. (NERR.LE.0) ) RETURN OLDTEST = -1 NPROCS = IBTNPROCS() PROW = ERRIBUF(3,1) / NPROCS PCOL = MOD( ERRIBUF(3,1), NPROCS ) IF( NERR .GT. MAXERR ) WRITE(OUTNUM,13000) * DO 20 I = 1, MIN( NERR, MAXERR ) IF( ERRIBUF(1,I) .NE. OLDTEST ) THEN IF( OLDTEST .NE. -1 ) $ WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST WRITE(OUTNUM,*) ' ' WRITE(OUTNUM,1000) PROW, PCOL, ERRIBUF(1,I) IF( COUNTING ) TFAILED( ERRIBUF(1,I) ) = 1 OLDTEST = ERRIBUF(1, I) END IF * * Print out error message depending on type of error * ERRTYPE = ERRIBUF(6, I) IF( ERRTYPE .LT. -10 ) THEN ERRTYPE = -ERRTYPE - 10 MAT = 'C' MATISINT = .TRUE. ELSE IF( ERRTYPE .LT. 0 ) THEN ERRTYPE = -ERRTYPE MAT = 'R' MATISINT = .TRUE. ELSE MATISINT = .FALSE. END IF * * RA/CA arrays from MAX/MIN have different printing protocol * IF( MATISINT ) THEN IF( ERRIBUF(2, I) .EQ. -1 ) THEN WRITE(OUTNUM,11000) ERRIBUF(4,I), ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,7000) ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,8000) ERRIBUF(4,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,9000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE WRITE(OUTNUM,10000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), $ INT( ERRDBUF(1,I) ) END IF * * Have memory overwrites in matrix A * ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,2000) ERRIBUF(5,I), ERRDBUF(2,I), $ ERRDBUF(1,I) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,3000) ERRIBUF(4,I), ERRDBUF(2,I), $ ERRDBUF(1,I) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,4000) ERRIBUF(4,I), ERRIBUF(5,I), $ ERRDBUF(2,I), ERRDBUF(1,I) ELSE IF( ERRTYPE .EQ. ERR_TRI ) THEN WRITE(OUTNUM,5000) ERRIBUF(4,I), ERRIBUF(5,I), $ ERRDBUF(2,I), ERRDBUF(1,I) ELSE WRITE(OUTNUM,6000) ERRIBUF(4,I), ERRIBUF(5,I), $ ERRDBUF(2,I), ERRDBUF(1,I) END IF END IF 20 CONTINUE WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST * 1000 FORMAT('PROCESS {',I4,',',I4,'} REPORTS ERRORS IN TEST#',I6,':') 2000 FORMAT(' Buffer overwrite ',I4, $ ' elements before the start of A:',/, $ ' Expected=',I12, $ '; Received=',I12) 3000 FORMAT(' Buffer overwrite ',I4,' elements after the end of A:', $ /,' Expected=',I12, $ '; Received=',I12) 4000 FORMAT(' LDA-M gap overwrite at postion (',I4,',',I4,'):',/, $ ' Expected=',I12, $ '; Received=',I12) 5000 FORMAT(' Complementory triangle overwrite at A(',I4,',',I4, $ '):',/,' Expected=',I12, $ '; Received=',I12) 6000 FORMAT(' Invalid element at A(',I4,',',I4,'):',/, $ ' Expected=',I12, $ '; Received=',I12) 7000 FORMAT(' Buffer overwrite ',I4,' elements before the start of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) 8000 FORMAT(' Buffer overwrite ',I4,' elements after the end of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) * 9000 FORMAT(' LD',A1,'A-M gap overwrite at postion (',I4,',',I4,'):' $ ,/,' Expected=',I12,'; Received=',I12) * 10000 FORMAT(' Invalid element at ',A1,'A(',I4,',',I4,'):',/, $ ' Expected=',I12,'; Received=',I12) 11000 FORMAT(' Overwrite at position (',I4,',',I4,') of non-existent ' $ ,A1,'A array.',/,' Expected=',I12,'; Received=',I12) 12000 FORMAT('PROCESS {',I4,',',I4,'} DONE ERROR REPORT FOR TEST#', $ I6,'.') 13000 FORMAT('WARNING: There were more errors than could be recorded.', $ /,'Increase MEMELTS to get complete listing.') RETURN * * End IPRINTERRS * END * * SUBROUTINE SBTCHECKIN( NFTESTS, OUTNUM, MAXERR, NERR, IERR, $ SVAL, TFAILED ) INTEGER NFTESTS, OUTNUM, MAXERR, NERR INTEGER IERR(*), TFAILED(*) REAL SVAL(*) * * Purpose * ======= * SBTCHECKIN: Process 0 receives error report from all processes. * * Arguments * ========= * NFTESTS (input/output) INTEGER * if NFTESTS is <= 0 upon entry, NFTESTS is not written to. * Otherwise, on entry it specifies the total number of tests * run, and on exit it is the number of tests which failed. * * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRSBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (workspace) INTEGER array, dimension NFTESTS * Workspace used to keep track of which tests failed. * If input of NFTESTS < 1, this array not accessed. * * =================================================================== * * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTMSGID EXTERNAL IBTMYPROC, IBTNPROCS, IBTMSGID * .. * .. Local Scalars .. LOGICAL COUNTING INTEGER K, NERR2, IAM, NPROCS, NTESTS * * Proc 0 collects error info from everyone * IAM = IBTMYPROC() NPROCS = IBTNPROCS() * IF( IAM .EQ. 0 ) THEN * * If we are finding out how many failed tests there are, initialize * the total number of tests (NTESTS), and zero the test failed array * COUNTING = NFTESTS .GT. 0 IF( COUNTING ) THEN NTESTS = NFTESTS DO 10 K = 1, NTESTS TFAILED(K) = 0 10 CONTINUE END IF * CALL SPRINTERRS(OUTNUM, MAXERR, NERR, IERR, SVAL, COUNTING, $ TFAILED) * DO 20 K = 1, NPROCS-1 CALL BTSEND(3, 0, K, K, IBTMSGID()+50) CALL BTRECV(3, 1, NERR2, K, IBTMSGID()+50) IF( NERR2 .GT. 0 ) THEN NERR = NERR + NERR2 CALL BTRECV(3, NERR2*6, IERR, K, IBTMSGID()+51) CALL BTRECV(4, NERR2*2, SVAL, K, IBTMSGID()+51) CALL SPRINTERRS(OUTNUM, MAXERR, NERR2, IERR, SVAL, $ COUNTING, TFAILED) END IF 20 CONTINUE * * Count up number of tests that failed * IF( COUNTING ) THEN NFTESTS = 0 DO 30 K = 1, NTESTS NFTESTS = NFTESTS + TFAILED(K) 30 CONTINUE END IF * * Send my error info to proc 0 * ELSE CALL BTRECV(3, 0, K, 0, IBTMSGID()+50) CALL BTSEND(3, 1, NERR, 0, IBTMSGID()+50) IF( NERR .GT. 0 ) THEN CALL BTSEND(3, NERR*6, IERR, 0, IBTMSGID()+51) CALL BTSEND(4, NERR*2, SVAL, 0, IBTMSGID()+51) END IF ENDIF * RETURN * * End of SBTCHECKIN * END * SUBROUTINE SINITMAT(UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, MYROW, MYCOL) CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST, TESTNUM, MYROW, MYCOL REAL CHECKVAL REAL MEM(*) * * .. External Subroutines .. EXTERNAL SGENMAT, SPADMAT * .. * .. Executable Statements .. * CALL SGENMAT( M, N, MEM(IPRE+1), LDA, TESTNUM, MYROW, MYCOL ) CALL SPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, CHECKVAL ) * RETURN END * SUBROUTINE SGENMAT( M, N, A, LDA, TESTNUM, MYROW, MYCOL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER M, N, LDA, TESTNUM, MYROW, MYCOL * .. * .. Array Arguments .. REAL A(LDA,N) * .. * * Purpose * ======= * SGENMAT: Generates an M-by-N matrix filled with random elements. * * Arguments * ========= * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (output) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * TESTNUM (input) INTEGER * Unique number for this test case, used as a basis for * the random seeds. * * ==================================================================== * * .. External Functions .. INTEGER IBTNPROCS REAL SBTRAN EXTERNAL SBTRAN, IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, NPROCS, SRC * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. Executable Statements .. * * ISEED's four values must be positive integers less than 4096, * fourth one has to be odd. (see _LARND). Use some goofy * functions to come up with seed values which together should * be unique. * NPROCS = IBTNPROCS() SRC = MYROW * NPROCS + MYCOL ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * DO 10 J = 1, N DO 10 I = 1, M A(I, J) = SBTRAN( ISEED ) 10 CONTINUE * RETURN * * End of SGENMAT. * END * REAL FUNCTION SBTRAN(ISEED) INTEGER ISEED(*) * * .. External Functions .. DOUBLE PRECISION DLARND EXTERNAL DLARND * .. Executable Statements .. * SBTRAN = REAL( DLARND(2, ISEED) ) * RETURN * * End of Sbtran * END * SUBROUTINE SPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST REAL CHECKVAL * .. * .. Array Arguments .. REAL MEM( * ) * .. * * Purpose * ======= * * SPADMAT: Pad Matrix. * This routines surrounds a matrix with a guardzone initialized to the * value CHECKVAL. There are three distinct guardzones: * - A contiguous zone of size IPRE immediately before the start * of the matrix. * - A contiguous zone of size IPOST immedately after the end of the * matrix. * - Interstitial zones within each column of the matrix, in the * elements A( M+1:LDA, J ). * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (output) real array, dimension (IPRE+IPOST+LDA*N) * The address IPRE elements ahead of the matrix A you want to * pad, which is then of dimension (LDA,N). * * IPRE (input) INTEGER * The size of the guard zone ahead of the matrix A. * * IPOST (input) INTEGER * The size of the guard zone behind the matrix A. * * CHECKVAL (input) real * The value to insert into the guard zones. * * ==================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE MEM( I ) = CHECKVAL 10 CONTINUE END IF * * Put check buffer in back of A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 MEM( I ) = CHECKVAL 20 CONTINUE END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K+LDA-M-1 MEM( I ) = CHECKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * * If the matrix is upper or lower trapezoidal, calculate the * additional triangular area which needs to be padded, Each * element referred to is in the Ith row and the Jth column. * IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 41 I = 1, M DO 42 J = 1, I K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 42 CONTINUE 41 CONTINUE ELSE DO 43 I = 2, M DO 44 J = 1, I-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 44 CONTINUE 43 CONTINUE END IF ELSE IF( DIAG .EQ. 'U' ) THEN DO 45 I = M-N+1, M DO 46 J = 1, I-(M-N) K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 46 CONTINUE 45 CONTINUE ELSE DO 47 I = M-N+2, M DO 48 J = 1, I-(M-N)-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 48 CONTINUE 47 CONTINUE END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 49 I = 1, M DO 50 J = N-M+I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 50 CONTINUE 49 CONTINUE ELSE DO 51 I = 1, M-1 DO 52 J = N-M+I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 52 CONTINUE 51 CONTINUE END IF ELSE IF( UPLO .EQ. 'U' ) THEN DO 53 I = 1, N DO 54 J = I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 54 CONTINUE 53 CONTINUE ELSE DO 55 I = 1, N-1 DO 56 J = I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 56 CONTINUE 55 CONTINUE END IF END IF END IF * * End of SPADMAT. * RETURN END * SUBROUTINE SCHKPAD( UPLO, DIAG, M, N, MEM, LDA, RSRC, CSRC, $ MYROW, MYCOL, IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST INTEGER TESTNUM, MAXERR, NERR REAL CHECKVAL * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) REAL MEM(*), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * SCHKPAD: Check padding put in by PADMAT. * Checks that padding around target matrix has not been overwritten * by the previous point-to-point or broadcast send. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (input) real array, dimension(IPRE+IPOST+LDA*N). * Memory location IPRE elements in front of the matrix A. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * IPRE (input) INTEGER * The size of the guard zone before the start of A. * * IPOST (input) INTEGER * The size of guard zone after A. * * CHECKVAL (input) real * The value to pad matrix with. * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRSBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. LOGICAL ISTRAP INTEGER I, J, K, IRST, IRND, ICST, ICND, SRC, DEST INTEGER NPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = ERR_PRE ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 10 CONTINUE END IF * * Check buffer behind A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I - J + 1 ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_POST ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 20 CONTINUE END IF * * Check all (LDA-M) gaps * IF( LDA .GT. M ) THEN DO 40 J = 1, N DO 30 I = M+1, LDA K = IPRE + (J-1)*LDA + I IF( MEM(K) .NE. CHECKVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_GAP ERRDBUF(1, NERR) = MEM(K) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 30 CONTINUE 40 CONTINUE END IF * * Determine limits of trapezoidal matrix * ISTRAP = .FALSE. IF( UPLO .EQ. 'U' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 2 IRND = M ICST = 1 ICND = M - 1 ELSEIF( M .GT. N ) THEN IRST = ( M-N ) + 2 IRND = M ICST = 1 ICND = N - 1 ENDIF IF( DIAG .EQ. 'U' ) THEN IRST = IRST - 1 ICND = ICND + 1 ENDIF ELSE IF( UPLO .EQ. 'L' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 1 IRND = 1 ICST = ( N-M ) + 2 ICND = N ELSEIF( M .GT. N ) THEN IRST = 1 IRND = 1 ICST = 2 ICND = N ENDIF IF( DIAG .EQ. 'U' ) THEN ICST = ICST - 1 ENDIF ENDIF * * Check elements and report any errors * IF( ISTRAP ) THEN DO 100 J = ICST, ICND DO 105 I = IRST, IRND IF( MEM( IPRE + (J-1)*LDA + I ) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_TRI ERRDBUF(1, NERR) = MEM( IPRE + (J-1)*LDA + I ) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 105 CONTINUE * * Update the limits to allow filling in padding * IF( UPLO .EQ. 'U' ) THEN IRST = IRST + 1 ELSE IRND = IRND + 1 ENDIF 100 CONTINUE END IF * RETURN * * End of SCHKPAD. * END * SUBROUTINE SCHKMAT( UPLO, DIAG, M, N, A, LDA, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) REAL A(LDA,N), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * sCHKMAT: Check matrix to see whether there were any transmission * errors. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRSBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Local Scalars .. INTEGER I, J, NPROCS, SRC, DEST LOGICAL USEIT REAL COMPVAL * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. External Functions .. INTEGER IBTNPROCS REAL SBTRAN EXTERNAL SBTRAN, IBTNPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Initialize ISEED with the same values as used in SGENMAT. * ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * * Generate the elements randomly with the same method used in GENMAT. * Note that for trapezoidal matrices, we generate all elements in the * enclosing rectangle and then ignore the complementary triangle. * DO 100 J = 1, N DO 105 I = 1, M COMPVAL = SBTRAN( ISEED ) * * Now determine whether we actually need this value. The * strategy is to chop out the proper triangle based on what * particular kind of trapezoidal matrix we're dealing with. * USEIT = .TRUE. IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( I .GE. J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. J ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( I .GE. M-N+J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. M-N+J ) THEN USEIT = .FALSE. END IF END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( J. GE. I+(N-M) ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I+(N-M) ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( J .GE. I ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I ) THEN USEIT = .FALSE. END IF END IF END IF END IF * * Compare the generated value to the one that's in the * received matrix. If they don't match, tack another * error record onto what's already there. * IF( USEIT ) THEN IF( A(I,J) .NE. COMPVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I, J) ERRDBUF(2, NERR) = COMPVAL END IF END IF END IF 105 CONTINUE 100 CONTINUE RETURN * * End of SCHKMAT. * END * SUBROUTINE SPRINTERRS( OUTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF, COUNTING, TFAILED ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. LOGICAL COUNTING INTEGER OUTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), TFAILED(*) REAL ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * SPRINTERRS: Print errors that have been recorded * * Arguments * ========= * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRSBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (input/ourput) INTEGER array, dimension NTESTS * Workspace used to keep track of which tests failed. * This array not accessed unless COUNTING is true. * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS EXTERNAL IBTMYPROC, IBTNPROCS * .. * .. Local Scalars .. CHARACTER*1 MAT LOGICAL MATISINT INTEGER OLDTEST, NPROCS, PROW, PCOL, I, ERRTYPE * .. * .. Executable Statements .. * IF( (IBTMYPROC().NE.0) .OR. (NERR.LE.0) ) RETURN OLDTEST = -1 NPROCS = IBTNPROCS() PROW = ERRIBUF(3,1) / NPROCS PCOL = MOD( ERRIBUF(3,1), NPROCS ) IF( NERR .GT. MAXERR ) WRITE(OUTNUM,13000) * DO 20 I = 1, MIN( NERR, MAXERR ) IF( ERRIBUF(1,I) .NE. OLDTEST ) THEN IF( OLDTEST .NE. -1 ) $ WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST WRITE(OUTNUM,*) ' ' WRITE(OUTNUM,1000) PROW, PCOL, ERRIBUF(1,I) IF( COUNTING ) TFAILED( ERRIBUF(1,I) ) = 1 OLDTEST = ERRIBUF(1, I) END IF * * Print out error message depending on type of error * ERRTYPE = ERRIBUF(6, I) IF( ERRTYPE .LT. -10 ) THEN ERRTYPE = -ERRTYPE - 10 MAT = 'C' MATISINT = .TRUE. ELSE IF( ERRTYPE .LT. 0 ) THEN ERRTYPE = -ERRTYPE MAT = 'R' MATISINT = .TRUE. ELSE MATISINT = .FALSE. END IF * * RA/CA arrays from MAX/MIN have different printing protocol * IF( MATISINT ) THEN IF( ERRIBUF(2, I) .EQ. -1 ) THEN WRITE(OUTNUM,11000) ERRIBUF(4,I), ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,7000) ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,8000) ERRIBUF(4,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,9000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE WRITE(OUTNUM,10000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), $ INT( ERRDBUF(1,I) ) END IF * * Have memory overwrites in matrix A * ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,2000) ERRIBUF(5,I), ERRDBUF(2,I), $ ERRDBUF(1,I) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,3000) ERRIBUF(4,I), ERRDBUF(2,I), $ ERRDBUF(1,I) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,4000) ERRIBUF(4,I), ERRIBUF(5,I), $ ERRDBUF(2,I), ERRDBUF(1,I) ELSE IF( ERRTYPE .EQ. ERR_TRI ) THEN WRITE(OUTNUM,5000) ERRIBUF(4,I), ERRIBUF(5,I), $ ERRDBUF(2,I), ERRDBUF(1,I) ELSE WRITE(OUTNUM,6000) ERRIBUF(4,I), ERRIBUF(5,I), $ ERRDBUF(2,I), ERRDBUF(1,I) END IF END IF 20 CONTINUE WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST * 1000 FORMAT('PROCESS {',I4,',',I4,'} REPORTS ERRORS IN TEST#',I6,':') 2000 FORMAT(' Buffer overwrite ',I4, $ ' elements before the start of A:',/, $ ' Expected=',G15.8, $ '; Received=',G15.8) 3000 FORMAT(' Buffer overwrite ',I4,' elements after the end of A:', $ /,' Expected=',G15.8, $ '; Received=',G15.8) 4000 FORMAT(' LDA-M gap overwrite at postion (',I4,',',I4,'):',/, $ ' Expected=',G15.8, $ '; Received=',G15.8) 5000 FORMAT(' Complementory triangle overwrite at A(',I4,',',I4, $ '):',/,' Expected=',G15.8, $ '; Received=',G15.8) 6000 FORMAT(' Invalid element at A(',I4,',',I4,'):',/, $ ' Expected=',G15.8, $ '; Received=',G15.8) 7000 FORMAT(' Buffer overwrite ',I4,' elements before the start of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) 8000 FORMAT(' Buffer overwrite ',I4,' elements after the end of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) * 9000 FORMAT(' LD',A1,'A-M gap overwrite at postion (',I4,',',I4,'):' $ ,/,' Expected=',I12,'; Received=',I12) * 10000 FORMAT(' Invalid element at ',A1,'A(',I4,',',I4,'):',/, $ ' Expected=',I12,'; Received=',I12) 11000 FORMAT(' Overwrite at position (',I4,',',I4,') of non-existent ' $ ,A1,'A array.',/,' Expected=',I12,'; Received=',I12) 12000 FORMAT('PROCESS {',I4,',',I4,'} DONE ERROR REPORT FOR TEST#', $ I6,'.') 13000 FORMAT('WARNING: There were more errors than could be recorded.', $ /,'Increase MEMELTS to get complete listing.') RETURN * * End SPRINTERRS * END * * SUBROUTINE DBTCHECKIN( NFTESTS, OUTNUM, MAXERR, NERR, IERR, $ DVAL, TFAILED ) INTEGER NFTESTS, OUTNUM, MAXERR, NERR INTEGER IERR(*), TFAILED(*) DOUBLE PRECISION DVAL(*) * * Purpose * ======= * DBTCHECKIN: Process 0 receives error report from all processes. * * Arguments * ========= * NFTESTS (input/output) INTEGER * if NFTESTS is <= 0 upon entry, NFTESTS is not written to. * Otherwise, on entry it specifies the total number of tests * run, and on exit it is the number of tests which failed. * * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRDBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (workspace) INTEGER array, dimension NFTESTS * Workspace used to keep track of which tests failed. * If input of NFTESTS < 1, this array not accessed. * * =================================================================== * * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTMSGID EXTERNAL IBTMYPROC, IBTNPROCS, IBTMSGID * .. * .. Local Scalars .. LOGICAL COUNTING INTEGER K, NERR2, IAM, NPROCS, NTESTS * * Proc 0 collects error info from everyone * IAM = IBTMYPROC() NPROCS = IBTNPROCS() * IF( IAM .EQ. 0 ) THEN * * If we are finding out how many failed tests there are, initialize * the total number of tests (NTESTS), and zero the test failed array * COUNTING = NFTESTS .GT. 0 IF( COUNTING ) THEN NTESTS = NFTESTS DO 10 K = 1, NTESTS TFAILED(K) = 0 10 CONTINUE END IF * CALL DPRINTERRS(OUTNUM, MAXERR, NERR, IERR, DVAL, COUNTING, $ TFAILED) * DO 20 K = 1, NPROCS-1 CALL BTSEND(3, 0, K, K, IBTMSGID()+50) CALL BTRECV(3, 1, NERR2, K, IBTMSGID()+50) IF( NERR2 .GT. 0 ) THEN NERR = NERR + NERR2 CALL BTRECV(3, NERR2*6, IERR, K, IBTMSGID()+51) CALL BTRECV(6, NERR2*2, DVAL, K, IBTMSGID()+51) CALL DPRINTERRS(OUTNUM, MAXERR, NERR2, IERR, DVAL, $ COUNTING, TFAILED) END IF 20 CONTINUE * * Count up number of tests that failed * IF( COUNTING ) THEN NFTESTS = 0 DO 30 K = 1, NTESTS NFTESTS = NFTESTS + TFAILED(K) 30 CONTINUE END IF * * Send my error info to proc 0 * ELSE CALL BTRECV(3, 0, K, 0, IBTMSGID()+50) CALL BTSEND(3, 1, NERR, 0, IBTMSGID()+50) IF( NERR .GT. 0 ) THEN CALL BTSEND(3, NERR*6, IERR, 0, IBTMSGID()+51) CALL BTSEND(6, NERR*2, DVAL, 0, IBTMSGID()+51) END IF ENDIF * RETURN * * End of DBTCHECKIN * END * SUBROUTINE DINITMAT(UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, MYROW, MYCOL) CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST, TESTNUM, MYROW, MYCOL DOUBLE PRECISION CHECKVAL DOUBLE PRECISION MEM(*) * * .. External Subroutines .. EXTERNAL DGENMAT, DPADMAT * .. * .. Executable Statements .. * CALL DGENMAT( M, N, MEM(IPRE+1), LDA, TESTNUM, MYROW, MYCOL ) CALL DPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, CHECKVAL ) * RETURN END * SUBROUTINE DGENMAT( M, N, A, LDA, TESTNUM, MYROW, MYCOL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER M, N, LDA, TESTNUM, MYROW, MYCOL * .. * .. Array Arguments .. DOUBLE PRECISION A(LDA,N) * .. * * Purpose * ======= * DGENMAT: Generates an M-by-N matrix filled with random elements. * * Arguments * ========= * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (output) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * TESTNUM (input) INTEGER * Unique number for this test case, used as a basis for * the random seeds. * * ==================================================================== * * .. External Functions .. INTEGER IBTNPROCS DOUBLE PRECISION DBTRAN EXTERNAL DBTRAN, IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, NPROCS, SRC * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. Executable Statements .. * * ISEED's four values must be positive integers less than 4096, * fourth one has to be odd. (see _LARND). Use some goofy * functions to come up with seed values which together should * be unique. * NPROCS = IBTNPROCS() SRC = MYROW * NPROCS + MYCOL ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * DO 10 J = 1, N DO 10 I = 1, M A(I, J) = DBTRAN( ISEED ) 10 CONTINUE * RETURN * * End of DGENMAT. * END * DOUBLE PRECISION FUNCTION DBTRAN(ISEED) INTEGER ISEED(*) * * .. External Functions .. DOUBLE PRECISION DLARND EXTERNAL DLARND * .. Executable Statements .. * DBTRAN = DLARND(2, ISEED) * RETURN * * End of Dbtran * END * SUBROUTINE DPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST DOUBLE PRECISION CHECKVAL * .. * .. Array Arguments .. DOUBLE PRECISION MEM( * ) * .. * * Purpose * ======= * * DPADMAT: Pad Matrix. * This routines surrounds a matrix with a guardzone initialized to the * value CHECKVAL. There are three distinct guardzones: * - A contiguous zone of size IPRE immediately before the start * of the matrix. * - A contiguous zone of size IPOST immedately after the end of the * matrix. * - Interstitial zones within each column of the matrix, in the * elements A( M+1:LDA, J ). * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (output) double precision array, dimension (IPRE+IPOST+LDA*N) * The address IPRE elements ahead of the matrix A you want to * pad, which is then of dimension (LDA,N). * * IPRE (input) INTEGER * The size of the guard zone ahead of the matrix A. * * IPOST (input) INTEGER * The size of the guard zone behind the matrix A. * * CHECKVAL (input) double precision * The value to insert into the guard zones. * * ==================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE MEM( I ) = CHECKVAL 10 CONTINUE END IF * * Put check buffer in back of A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 MEM( I ) = CHECKVAL 20 CONTINUE END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K+LDA-M-1 MEM( I ) = CHECKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * * If the matrix is upper or lower trapezoidal, calculate the * additional triangular area which needs to be padded, Each * element referred to is in the Ith row and the Jth column. * IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 41 I = 1, M DO 42 J = 1, I K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 42 CONTINUE 41 CONTINUE ELSE DO 43 I = 2, M DO 44 J = 1, I-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 44 CONTINUE 43 CONTINUE END IF ELSE IF( DIAG .EQ. 'U' ) THEN DO 45 I = M-N+1, M DO 46 J = 1, I-(M-N) K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 46 CONTINUE 45 CONTINUE ELSE DO 47 I = M-N+2, M DO 48 J = 1, I-(M-N)-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 48 CONTINUE 47 CONTINUE END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 49 I = 1, M DO 50 J = N-M+I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 50 CONTINUE 49 CONTINUE ELSE DO 51 I = 1, M-1 DO 52 J = N-M+I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 52 CONTINUE 51 CONTINUE END IF ELSE IF( UPLO .EQ. 'U' ) THEN DO 53 I = 1, N DO 54 J = I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 54 CONTINUE 53 CONTINUE ELSE DO 55 I = 1, N-1 DO 56 J = I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 56 CONTINUE 55 CONTINUE END IF END IF END IF * * End of DPADMAT. * RETURN END * SUBROUTINE DCHKPAD( UPLO, DIAG, M, N, MEM, LDA, RSRC, CSRC, $ MYROW, MYCOL, IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST INTEGER TESTNUM, MAXERR, NERR DOUBLE PRECISION CHECKVAL * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) DOUBLE PRECISION MEM(*), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * DCHKPAD: Check padding put in by PADMAT. * Checks that padding around target matrix has not been overwritten * by the previous point-to-point or broadcast send. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (input) double precision array, dimension(IPRE+IPOST+LDA*N). * Memory location IPRE elements in front of the matrix A. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * IPRE (input) INTEGER * The size of the guard zone before the start of A. * * IPOST (input) INTEGER * The size of guard zone after A. * * CHECKVAL (input) double precision * The value to pad matrix with. * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRDBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. LOGICAL ISTRAP INTEGER I, J, K, IRST, IRND, ICST, ICND, SRC, DEST INTEGER NPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = ERR_PRE ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 10 CONTINUE END IF * * Check buffer behind A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I - J + 1 ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_POST ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 20 CONTINUE END IF * * Check all (LDA-M) gaps * IF( LDA .GT. M ) THEN DO 40 J = 1, N DO 30 I = M+1, LDA K = IPRE + (J-1)*LDA + I IF( MEM(K) .NE. CHECKVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_GAP ERRDBUF(1, NERR) = MEM(K) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 30 CONTINUE 40 CONTINUE END IF * * Determine limits of trapezoidal matrix * ISTRAP = .FALSE. IF( UPLO .EQ. 'U' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 2 IRND = M ICST = 1 ICND = M - 1 ELSEIF( M .GT. N ) THEN IRST = ( M-N ) + 2 IRND = M ICST = 1 ICND = N - 1 ENDIF IF( DIAG .EQ. 'U' ) THEN IRST = IRST - 1 ICND = ICND + 1 ENDIF ELSE IF( UPLO .EQ. 'L' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 1 IRND = 1 ICST = ( N-M ) + 2 ICND = N ELSEIF( M .GT. N ) THEN IRST = 1 IRND = 1 ICST = 2 ICND = N ENDIF IF( DIAG .EQ. 'U' ) THEN ICST = ICST - 1 ENDIF ENDIF * * Check elements and report any errors * IF( ISTRAP ) THEN DO 100 J = ICST, ICND DO 105 I = IRST, IRND IF( MEM( IPRE + (J-1)*LDA + I ) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_TRI ERRDBUF(1, NERR) = MEM( IPRE + (J-1)*LDA + I ) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 105 CONTINUE * * Update the limits to allow filling in padding * IF( UPLO .EQ. 'U' ) THEN IRST = IRST + 1 ELSE IRND = IRND + 1 ENDIF 100 CONTINUE END IF * RETURN * * End of DCHKPAD. * END * SUBROUTINE DCHKMAT( UPLO, DIAG, M, N, A, LDA, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) DOUBLE PRECISION A(LDA,N), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * dCHKMAT: Check matrix to see whether there were any transmission * errors. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRDBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Local Scalars .. INTEGER I, J, NPROCS, SRC, DEST LOGICAL USEIT DOUBLE PRECISION COMPVAL * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. External Functions .. INTEGER IBTNPROCS DOUBLE PRECISION DBTRAN EXTERNAL DBTRAN, IBTNPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Initialize ISEED with the same values as used in DGENMAT. * ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * * Generate the elements randomly with the same method used in GENMAT. * Note that for trapezoidal matrices, we generate all elements in the * enclosing rectangle and then ignore the complementary triangle. * DO 100 J = 1, N DO 105 I = 1, M COMPVAL = DBTRAN( ISEED ) * * Now determine whether we actually need this value. The * strategy is to chop out the proper triangle based on what * particular kind of trapezoidal matrix we're dealing with. * USEIT = .TRUE. IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( I .GE. J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. J ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( I .GE. M-N+J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. M-N+J ) THEN USEIT = .FALSE. END IF END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( J. GE. I+(N-M) ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I+(N-M) ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( J .GE. I ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I ) THEN USEIT = .FALSE. END IF END IF END IF END IF * * Compare the generated value to the one that's in the * received matrix. If they don't match, tack another * error record onto what's already there. * IF( USEIT ) THEN IF( A(I,J) .NE. COMPVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I, J) ERRDBUF(2, NERR) = COMPVAL END IF END IF END IF 105 CONTINUE 100 CONTINUE RETURN * * End of DCHKMAT. * END * SUBROUTINE DPRINTERRS( OUTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF, COUNTING, TFAILED ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. LOGICAL COUNTING INTEGER OUTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), TFAILED(*) DOUBLE PRECISION ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * DPRINTERRS: Print errors that have been recorded * * Arguments * ========= * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRDBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (input/ourput) INTEGER array, dimension NTESTS * Workspace used to keep track of which tests failed. * This array not accessed unless COUNTING is true. * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS EXTERNAL IBTMYPROC, IBTNPROCS * .. * .. Local Scalars .. CHARACTER*1 MAT LOGICAL MATISINT INTEGER OLDTEST, NPROCS, PROW, PCOL, I, ERRTYPE * .. * .. Executable Statements .. * IF( (IBTMYPROC().NE.0) .OR. (NERR.LE.0) ) RETURN OLDTEST = -1 NPROCS = IBTNPROCS() PROW = ERRIBUF(3,1) / NPROCS PCOL = MOD( ERRIBUF(3,1), NPROCS ) IF( NERR .GT. MAXERR ) WRITE(OUTNUM,13000) * DO 20 I = 1, MIN( NERR, MAXERR ) IF( ERRIBUF(1,I) .NE. OLDTEST ) THEN IF( OLDTEST .NE. -1 ) $ WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST WRITE(OUTNUM,*) ' ' WRITE(OUTNUM,1000) PROW, PCOL, ERRIBUF(1,I) IF( COUNTING ) TFAILED( ERRIBUF(1,I) ) = 1 OLDTEST = ERRIBUF(1, I) END IF * * Print out error message depending on type of error * ERRTYPE = ERRIBUF(6, I) IF( ERRTYPE .LT. -10 ) THEN ERRTYPE = -ERRTYPE - 10 MAT = 'C' MATISINT = .TRUE. ELSE IF( ERRTYPE .LT. 0 ) THEN ERRTYPE = -ERRTYPE MAT = 'R' MATISINT = .TRUE. ELSE MATISINT = .FALSE. END IF * * RA/CA arrays from MAX/MIN have different printing protocol * IF( MATISINT ) THEN IF( ERRIBUF(2, I) .EQ. -1 ) THEN WRITE(OUTNUM,11000) ERRIBUF(4,I), ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,7000) ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,8000) ERRIBUF(4,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,9000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE WRITE(OUTNUM,10000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), $ INT( ERRDBUF(1,I) ) END IF * * Have memory overwrites in matrix A * ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,2000) ERRIBUF(5,I), ERRDBUF(2,I), $ ERRDBUF(1,I) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,3000) ERRIBUF(4,I), ERRDBUF(2,I), $ ERRDBUF(1,I) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,4000) ERRIBUF(4,I), ERRIBUF(5,I), $ ERRDBUF(2,I), ERRDBUF(1,I) ELSE IF( ERRTYPE .EQ. ERR_TRI ) THEN WRITE(OUTNUM,5000) ERRIBUF(4,I), ERRIBUF(5,I), $ ERRDBUF(2,I), ERRDBUF(1,I) ELSE WRITE(OUTNUM,6000) ERRIBUF(4,I), ERRIBUF(5,I), $ ERRDBUF(2,I), ERRDBUF(1,I) END IF END IF 20 CONTINUE WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST * 1000 FORMAT('PROCESS {',I4,',',I4,'} REPORTS ERRORS IN TEST#',I6,':') 2000 FORMAT(' Buffer overwrite ',I4, $ ' elements before the start of A:',/, $ ' Expected=',G22.15, $ '; Received=',G22.15) 3000 FORMAT(' Buffer overwrite ',I4,' elements after the end of A:', $ /,' Expected=',G22.15, $ '; Received=',G22.15) 4000 FORMAT(' LDA-M gap overwrite at postion (',I4,',',I4,'):',/, $ ' Expected=',G22.15, $ '; Received=',G22.15) 5000 FORMAT(' Complementory triangle overwrite at A(',I4,',',I4, $ '):',/,' Expected=',G22.15, $ '; Received=',G22.15) 6000 FORMAT(' Invalid element at A(',I4,',',I4,'):',/, $ ' Expected=',G22.15, $ '; Received=',G22.15) 7000 FORMAT(' Buffer overwrite ',I4,' elements before the start of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) 8000 FORMAT(' Buffer overwrite ',I4,' elements after the end of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) * 9000 FORMAT(' LD',A1,'A-M gap overwrite at postion (',I4,',',I4,'):' $ ,/,' Expected=',I12,'; Received=',I12) * 10000 FORMAT(' Invalid element at ',A1,'A(',I4,',',I4,'):',/, $ ' Expected=',I12,'; Received=',I12) 11000 FORMAT(' Overwrite at position (',I4,',',I4,') of non-existent ' $ ,A1,'A array.',/,' Expected=',I12,'; Received=',I12) 12000 FORMAT('PROCESS {',I4,',',I4,'} DONE ERROR REPORT FOR TEST#', $ I6,'.') 13000 FORMAT('WARNING: There were more errors than could be recorded.', $ /,'Increase MEMELTS to get complete listing.') RETURN * * End DPRINTERRS * END * * SUBROUTINE CBTCHECKIN( NFTESTS, OUTNUM, MAXERR, NERR, IERR, $ CVAL, TFAILED ) INTEGER NFTESTS, OUTNUM, MAXERR, NERR INTEGER IERR(*), TFAILED(*) COMPLEX CVAL(*) * * Purpose * ======= * CBTCHECKIN: Process 0 receives error report from all processes. * * Arguments * ========= * NFTESTS (input/output) INTEGER * if NFTESTS is <= 0 upon entry, NFTESTS is not written to. * Otherwise, on entry it specifies the total number of tests * run, and on exit it is the number of tests which failed. * * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRCBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (workspace) INTEGER array, dimension NFTESTS * Workspace used to keep track of which tests failed. * If input of NFTESTS < 1, this array not accessed. * * =================================================================== * * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTMSGID EXTERNAL IBTMYPROC, IBTNPROCS, IBTMSGID * .. * .. Local Scalars .. LOGICAL COUNTING INTEGER K, NERR2, IAM, NPROCS, NTESTS * * Proc 0 collects error info from everyone * IAM = IBTMYPROC() NPROCS = IBTNPROCS() * IF( IAM .EQ. 0 ) THEN * * If we are finding out how many failed tests there are, initialize * the total number of tests (NTESTS), and zero the test failed array * COUNTING = NFTESTS .GT. 0 IF( COUNTING ) THEN NTESTS = NFTESTS DO 10 K = 1, NTESTS TFAILED(K) = 0 10 CONTINUE END IF * CALL CPRINTERRS(OUTNUM, MAXERR, NERR, IERR, CVAL, COUNTING, $ TFAILED) * DO 20 K = 1, NPROCS-1 CALL BTSEND(3, 0, K, K, IBTMSGID()+50) CALL BTRECV(3, 1, NERR2, K, IBTMSGID()+50) IF( NERR2 .GT. 0 ) THEN NERR = NERR + NERR2 CALL BTRECV(3, NERR2*6, IERR, K, IBTMSGID()+51) CALL BTRECV(5, NERR2*2, CVAL, K, IBTMSGID()+51) CALL CPRINTERRS(OUTNUM, MAXERR, NERR2, IERR, CVAL, $ COUNTING, TFAILED) END IF 20 CONTINUE * * Count up number of tests that failed * IF( COUNTING ) THEN NFTESTS = 0 DO 30 K = 1, NTESTS NFTESTS = NFTESTS + TFAILED(K) 30 CONTINUE END IF * * Send my error info to proc 0 * ELSE CALL BTRECV(3, 0, K, 0, IBTMSGID()+50) CALL BTSEND(3, 1, NERR, 0, IBTMSGID()+50) IF( NERR .GT. 0 ) THEN CALL BTSEND(3, NERR*6, IERR, 0, IBTMSGID()+51) CALL BTSEND(5, NERR*2, CVAL, 0, IBTMSGID()+51) END IF ENDIF * RETURN * * End of CBTCHECKIN * END * SUBROUTINE CINITMAT(UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, MYROW, MYCOL) CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST, TESTNUM, MYROW, MYCOL COMPLEX CHECKVAL COMPLEX MEM(*) * * .. External Subroutines .. EXTERNAL CGENMAT, CPADMAT * .. * .. Executable Statements .. * CALL CGENMAT( M, N, MEM(IPRE+1), LDA, TESTNUM, MYROW, MYCOL ) CALL CPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, CHECKVAL ) * RETURN END * SUBROUTINE CGENMAT( M, N, A, LDA, TESTNUM, MYROW, MYCOL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER M, N, LDA, TESTNUM, MYROW, MYCOL * .. * .. Array Arguments .. COMPLEX A(LDA,N) * .. * * Purpose * ======= * CGENMAT: Generates an M-by-N matrix filled with random elements. * * Arguments * ========= * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (output) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * TESTNUM (input) INTEGER * Unique number for this test case, used as a basis for * the random seeds. * * ==================================================================== * * .. External Functions .. INTEGER IBTNPROCS COMPLEX CBTRAN EXTERNAL CBTRAN, IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, NPROCS, SRC * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. Executable Statements .. * * ISEED's four values must be positive integers less than 4096, * fourth one has to be odd. (see _LARND). Use some goofy * functions to come up with seed values which together should * be unique. * NPROCS = IBTNPROCS() SRC = MYROW * NPROCS + MYCOL ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * DO 10 J = 1, N DO 10 I = 1, M A(I, J) = CBTRAN( ISEED ) 10 CONTINUE * RETURN * * End of CGENMAT. * END * COMPLEX FUNCTION CBTRAN(ISEED) INTEGER ISEED(*) * * .. External Functions .. DOUBLE COMPLEX ZLARND EXTERNAL ZLARND CBTRAN = CMPLX( ZLARND(2, ISEED) ) * RETURN * * End of Cbtran * END * SUBROUTINE CPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST COMPLEX CHECKVAL * .. * .. Array Arguments .. COMPLEX MEM( * ) * .. * * Purpose * ======= * * CPADMAT: Pad Matrix. * This routines surrounds a matrix with a guardzone initialized to the * value CHECKVAL. There are three distinct guardzones: * - A contiguous zone of size IPRE immediately before the start * of the matrix. * - A contiguous zone of size IPOST immedately after the end of the * matrix. * - Interstitial zones within each column of the matrix, in the * elements A( M+1:LDA, J ). * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (output) complex array, dimension (IPRE+IPOST+LDA*N) * The address IPRE elements ahead of the matrix A you want to * pad, which is then of dimension (LDA,N). * * IPRE (input) INTEGER * The size of the guard zone ahead of the matrix A. * * IPOST (input) INTEGER * The size of the guard zone behind the matrix A. * * CHECKVAL (input) complex * The value to insert into the guard zones. * * ==================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE MEM( I ) = CHECKVAL 10 CONTINUE END IF * * Put check buffer in back of A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 MEM( I ) = CHECKVAL 20 CONTINUE END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K+LDA-M-1 MEM( I ) = CHECKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * * If the matrix is upper or lower trapezoidal, calculate the * additional triangular area which needs to be padded, Each * element referred to is in the Ith row and the Jth column. * IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 41 I = 1, M DO 42 J = 1, I K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 42 CONTINUE 41 CONTINUE ELSE DO 43 I = 2, M DO 44 J = 1, I-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 44 CONTINUE 43 CONTINUE END IF ELSE IF( DIAG .EQ. 'U' ) THEN DO 45 I = M-N+1, M DO 46 J = 1, I-(M-N) K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 46 CONTINUE 45 CONTINUE ELSE DO 47 I = M-N+2, M DO 48 J = 1, I-(M-N)-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 48 CONTINUE 47 CONTINUE END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 49 I = 1, M DO 50 J = N-M+I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 50 CONTINUE 49 CONTINUE ELSE DO 51 I = 1, M-1 DO 52 J = N-M+I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 52 CONTINUE 51 CONTINUE END IF ELSE IF( UPLO .EQ. 'U' ) THEN DO 53 I = 1, N DO 54 J = I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 54 CONTINUE 53 CONTINUE ELSE DO 55 I = 1, N-1 DO 56 J = I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 56 CONTINUE 55 CONTINUE END IF END IF END IF * * End of CPADMAT. * RETURN END * SUBROUTINE CCHKPAD( UPLO, DIAG, M, N, MEM, LDA, RSRC, CSRC, $ MYROW, MYCOL, IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST INTEGER TESTNUM, MAXERR, NERR COMPLEX CHECKVAL * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) COMPLEX MEM(*), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * CCHKPAD: Check padding put in by PADMAT. * Checks that padding around target matrix has not been overwritten * by the previous point-to-point or broadcast send. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (input) complex array, dimension(IPRE+IPOST+LDA*N). * Memory location IPRE elements in front of the matrix A. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * IPRE (input) INTEGER * The size of the guard zone before the start of A. * * IPOST (input) INTEGER * The size of guard zone after A. * * CHECKVAL (input) complex * The value to pad matrix with. * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRCBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. LOGICAL ISTRAP INTEGER I, J, K, IRST, IRND, ICST, ICND, SRC, DEST INTEGER NPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = ERR_PRE ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 10 CONTINUE END IF * * Check buffer behind A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I - J + 1 ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_POST ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 20 CONTINUE END IF * * Check all (LDA-M) gaps * IF( LDA .GT. M ) THEN DO 40 J = 1, N DO 30 I = M+1, LDA K = IPRE + (J-1)*LDA + I IF( MEM(K) .NE. CHECKVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_GAP ERRDBUF(1, NERR) = MEM(K) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 30 CONTINUE 40 CONTINUE END IF * * Determine limits of trapezoidal matrix * ISTRAP = .FALSE. IF( UPLO .EQ. 'U' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 2 IRND = M ICST = 1 ICND = M - 1 ELSEIF( M .GT. N ) THEN IRST = ( M-N ) + 2 IRND = M ICST = 1 ICND = N - 1 ENDIF IF( DIAG .EQ. 'U' ) THEN IRST = IRST - 1 ICND = ICND + 1 ENDIF ELSE IF( UPLO .EQ. 'L' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 1 IRND = 1 ICST = ( N-M ) + 2 ICND = N ELSEIF( M .GT. N ) THEN IRST = 1 IRND = 1 ICST = 2 ICND = N ENDIF IF( DIAG .EQ. 'U' ) THEN ICST = ICST - 1 ENDIF ENDIF * * Check elements and report any errors * IF( ISTRAP ) THEN DO 100 J = ICST, ICND DO 105 I = IRST, IRND IF( MEM( IPRE + (J-1)*LDA + I ) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_TRI ERRDBUF(1, NERR) = MEM( IPRE + (J-1)*LDA + I ) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 105 CONTINUE * * Update the limits to allow filling in padding * IF( UPLO .EQ. 'U' ) THEN IRST = IRST + 1 ELSE IRND = IRND + 1 ENDIF 100 CONTINUE END IF * RETURN * * End of CCHKPAD. * END * SUBROUTINE CCHKMAT( UPLO, DIAG, M, N, A, LDA, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) COMPLEX A(LDA,N), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * cCHKMAT: Check matrix to see whether there were any transmission * errors. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRCBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Local Scalars .. INTEGER I, J, NPROCS, SRC, DEST LOGICAL USEIT COMPLEX COMPVAL * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. External Functions .. INTEGER IBTNPROCS COMPLEX CBTRAN EXTERNAL CBTRAN, IBTNPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Initialize ISEED with the same values as used in CGENMAT. * ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * * Generate the elements randomly with the same method used in GENMAT. * Note that for trapezoidal matrices, we generate all elements in the * enclosing rectangle and then ignore the complementary triangle. * DO 100 J = 1, N DO 105 I = 1, M COMPVAL = CBTRAN( ISEED ) * * Now determine whether we actually need this value. The * strategy is to chop out the proper triangle based on what * particular kind of trapezoidal matrix we're dealing with. * USEIT = .TRUE. IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( I .GE. J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. J ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( I .GE. M-N+J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. M-N+J ) THEN USEIT = .FALSE. END IF END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( J. GE. I+(N-M) ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I+(N-M) ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( J .GE. I ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I ) THEN USEIT = .FALSE. END IF END IF END IF END IF * * Compare the generated value to the one that's in the * received matrix. If they don't match, tack another * error record onto what's already there. * IF( USEIT ) THEN IF( A(I,J) .NE. COMPVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I, J) ERRDBUF(2, NERR) = COMPVAL END IF END IF END IF 105 CONTINUE 100 CONTINUE RETURN * * End of CCHKMAT. * END * SUBROUTINE CPRINTERRS( OUTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF, COUNTING, TFAILED ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. LOGICAL COUNTING INTEGER OUTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), TFAILED(*) COMPLEX ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * CPRINTERRS: Print errors that have been recorded * * Arguments * ========= * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRCBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (input/ourput) INTEGER array, dimension NTESTS * Workspace used to keep track of which tests failed. * This array not accessed unless COUNTING is true. * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS EXTERNAL IBTMYPROC, IBTNPROCS * .. * .. Local Scalars .. CHARACTER*1 MAT LOGICAL MATISINT INTEGER OLDTEST, NPROCS, PROW, PCOL, I, ERRTYPE * .. * .. Executable Statements .. * IF( (IBTMYPROC().NE.0) .OR. (NERR.LE.0) ) RETURN OLDTEST = -1 NPROCS = IBTNPROCS() PROW = ERRIBUF(3,1) / NPROCS PCOL = MOD( ERRIBUF(3,1), NPROCS ) IF( NERR .GT. MAXERR ) WRITE(OUTNUM,13000) * DO 20 I = 1, MIN( NERR, MAXERR ) IF( ERRIBUF(1,I) .NE. OLDTEST ) THEN IF( OLDTEST .NE. -1 ) $ WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST WRITE(OUTNUM,*) ' ' WRITE(OUTNUM,1000) PROW, PCOL, ERRIBUF(1,I) IF( COUNTING ) TFAILED( ERRIBUF(1,I) ) = 1 OLDTEST = ERRIBUF(1, I) END IF * * Print out error message depending on type of error * ERRTYPE = ERRIBUF(6, I) IF( ERRTYPE .LT. -10 ) THEN ERRTYPE = -ERRTYPE - 10 MAT = 'C' MATISINT = .TRUE. ELSE IF( ERRTYPE .LT. 0 ) THEN ERRTYPE = -ERRTYPE MAT = 'R' MATISINT = .TRUE. ELSE MATISINT = .FALSE. END IF * * RA/CA arrays from MAX/MIN have different printing protocol * IF( MATISINT ) THEN IF( ERRIBUF(2, I) .EQ. -1 ) THEN WRITE(OUTNUM,11000) ERRIBUF(4,I), ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,7000) ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,8000) ERRIBUF(4,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,9000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE WRITE(OUTNUM,10000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), $ INT( ERRDBUF(1,I) ) END IF * * Have memory overwrites in matrix A * ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,2000) ERRIBUF(5,I), $ REAL( ERRDBUF(2,I) ), AIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), AIMAG( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,3000) ERRIBUF(4,I), $ REAL( ERRDBUF(2,I) ), AIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), AIMAG( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,4000) $ ERRIBUF(4,I), ERRIBUF(5,I), $ REAL( ERRDBUF(2,I) ), AIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), AIMAG( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_TRI ) THEN WRITE(OUTNUM,5000) ERRIBUF(4,I), ERRIBUF(5,I), $ REAL( ERRDBUF(2,I) ), AIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), AIMAG( ERRDBUF(1,I) ) ELSE WRITE(OUTNUM,6000) ERRIBUF(4,I), ERRIBUF(5,I), $ REAL( ERRDBUF(2,I) ), AIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), AIMAG( ERRDBUF(1,I) ) END IF END IF 20 CONTINUE WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST * 1000 FORMAT('PROCESS {',I4,',',I4,'} REPORTS ERRORS IN TEST#',I6,':') 2000 FORMAT(' Buffer overwrite ',I4, $ ' elements before the start of A:',/, $ ' Expected=','[',G15.8,',',G15.8,']', $ '; Received=','[',G15.8,',',G15.8,']') 3000 FORMAT(' Buffer overwrite ',I4,' elements after the end of A:', $ /,' Expected=','[',G15.8,',',G15.8,']', $ '; Received=','[',G15.8,',',G15.8,']') 4000 FORMAT(' LDA-M gap overwrite at postion (',I4,',',I4,'):',/, $ ' Expected=','[',G15.8,',',G15.8,']', $ '; Received=','[',G15.8,',',G15.8,']') 5000 FORMAT(' Complementory triangle overwrite at A(',I4,',',I4, $ '):',/,' Expected=','[',G15.8,',',G15.8,']', $ '; Received=','[',G15.8,',',G15.8,']') 6000 FORMAT(' Invalid element at A(',I4,',',I4,'):',/, $ ' Expected=','[',G15.8,',',G15.8,']', $ '; Received=','[',G15.8,',',G15.8,']') 7000 FORMAT(' Buffer overwrite ',I4,' elements before the start of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) 8000 FORMAT(' Buffer overwrite ',I4,' elements after the end of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) * 9000 FORMAT(' LD',A1,'A-M gap overwrite at postion (',I4,',',I4,'):' $ ,/,' Expected=',I12,'; Received=',I12) * 10000 FORMAT(' Invalid element at ',A1,'A(',I4,',',I4,'):',/, $ ' Expected=',I12,'; Received=',I12) 11000 FORMAT(' Overwrite at position (',I4,',',I4,') of non-existent ' $ ,A1,'A array.',/,' Expected=',I12,'; Received=',I12) 12000 FORMAT('PROCESS {',I4,',',I4,'} DONE ERROR REPORT FOR TEST#', $ I6,'.') 13000 FORMAT('WARNING: There were more errors than could be recorded.', $ /,'Increase MEMELTS to get complete listing.') RETURN * * End CPRINTERRS * END * * SUBROUTINE ZBTCHECKIN( NFTESTS, OUTNUM, MAXERR, NERR, IERR, $ ZVAL, TFAILED ) INTEGER NFTESTS, OUTNUM, MAXERR, NERR INTEGER IERR(*), TFAILED(*) DOUBLE COMPLEX ZVAL(*) * * Purpose * ======= * ZBTCHECKIN: Process 0 receives error report from all processes. * * Arguments * ========= * NFTESTS (input/output) INTEGER * if NFTESTS is <= 0 upon entry, NFTESTS is not written to. * Otherwise, on entry it specifies the total number of tests * run, and on exit it is the number of tests which failed. * * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRZBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (workspace) INTEGER array, dimension NFTESTS * Workspace used to keep track of which tests failed. * If input of NFTESTS < 1, this array not accessed. * * =================================================================== * * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTMSGID EXTERNAL IBTMYPROC, IBTNPROCS, IBTMSGID * .. * .. Local Scalars .. LOGICAL COUNTING INTEGER K, NERR2, IAM, NPROCS, NTESTS * * Proc 0 collects error info from everyone * IAM = IBTMYPROC() NPROCS = IBTNPROCS() * IF( IAM .EQ. 0 ) THEN * * If we are finding out how many failed tests there are, initialize * the total number of tests (NTESTS), and zero the test failed array * COUNTING = NFTESTS .GT. 0 IF( COUNTING ) THEN NTESTS = NFTESTS DO 10 K = 1, NTESTS TFAILED(K) = 0 10 CONTINUE END IF * CALL ZPRINTERRS(OUTNUM, MAXERR, NERR, IERR, ZVAL, COUNTING, $ TFAILED) * DO 20 K = 1, NPROCS-1 CALL BTSEND(3, 0, K, K, IBTMSGID()+50) CALL BTRECV(3, 1, NERR2, K, IBTMSGID()+50) IF( NERR2 .GT. 0 ) THEN NERR = NERR + NERR2 CALL BTRECV(3, NERR2*6, IERR, K, IBTMSGID()+51) CALL BTRECV(7, NERR2*2, ZVAL, K, IBTMSGID()+51) CALL ZPRINTERRS(OUTNUM, MAXERR, NERR2, IERR, ZVAL, $ COUNTING, TFAILED) END IF 20 CONTINUE * * Count up number of tests that failed * IF( COUNTING ) THEN NFTESTS = 0 DO 30 K = 1, NTESTS NFTESTS = NFTESTS + TFAILED(K) 30 CONTINUE END IF * * Send my error info to proc 0 * ELSE CALL BTRECV(3, 0, K, 0, IBTMSGID()+50) CALL BTSEND(3, 1, NERR, 0, IBTMSGID()+50) IF( NERR .GT. 0 ) THEN CALL BTSEND(3, NERR*6, IERR, 0, IBTMSGID()+51) CALL BTSEND(7, NERR*2, ZVAL, 0, IBTMSGID()+51) END IF ENDIF * RETURN * * End of ZBTCHECKIN * END * SUBROUTINE ZINITMAT(UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, MYROW, MYCOL) CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST, TESTNUM, MYROW, MYCOL DOUBLE COMPLEX CHECKVAL DOUBLE COMPLEX MEM(*) * * .. External Subroutines .. EXTERNAL ZGENMAT, ZPADMAT * .. * .. Executable Statements .. * CALL ZGENMAT( M, N, MEM(IPRE+1), LDA, TESTNUM, MYROW, MYCOL ) CALL ZPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, CHECKVAL ) * RETURN END * SUBROUTINE ZGENMAT( M, N, A, LDA, TESTNUM, MYROW, MYCOL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER M, N, LDA, TESTNUM, MYROW, MYCOL * .. * .. Array Arguments .. DOUBLE COMPLEX A(LDA,N) * .. * * Purpose * ======= * ZGENMAT: Generates an M-by-N matrix filled with random elements. * * Arguments * ========= * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (output) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * TESTNUM (input) INTEGER * Unique number for this test case, used as a basis for * the random seeds. * * ==================================================================== * * .. External Functions .. INTEGER IBTNPROCS DOUBLE COMPLEX ZBTRAN EXTERNAL ZBTRAN, IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, NPROCS, SRC * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. Executable Statements .. * * ISEED's four values must be positive integers less than 4096, * fourth one has to be odd. (see _LARND). Use some goofy * functions to come up with seed values which together should * be unique. * NPROCS = IBTNPROCS() SRC = MYROW * NPROCS + MYCOL ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * DO 10 J = 1, N DO 10 I = 1, M A(I, J) = ZBTRAN( ISEED ) 10 CONTINUE * RETURN * * End of ZGENMAT. * END * DOUBLE COMPLEX FUNCTION ZBTRAN(ISEED) INTEGER ISEED(*) * * .. External Functions .. DOUBLE COMPLEX ZLARND EXTERNAL ZLARND ZBTRAN = ZLARND(2, ISEED) * RETURN * * End of Zbtran * END * SUBROUTINE ZPADMAT( UPLO, DIAG, M, N, MEM, LDA, IPRE, IPOST, $ CHECKVAL ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, IPRE, IPOST DOUBLE COMPLEX CHECKVAL * .. * .. Array Arguments .. DOUBLE COMPLEX MEM( * ) * .. * * Purpose * ======= * * ZPADMAT: Pad Matrix. * This routines surrounds a matrix with a guardzone initialized to the * value CHECKVAL. There are three distinct guardzones: * - A contiguous zone of size IPRE immediately before the start * of the matrix. * - A contiguous zone of size IPOST immedately after the end of the * matrix. * - Interstitial zones within each column of the matrix, in the * elements A( M+1:LDA, J ). * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (output) double complex array, dimension (IPRE+IPOST+LDA*N) * The address IPRE elements ahead of the matrix A you want to * pad, which is then of dimension (LDA,N). * * IPRE (input) INTEGER * The size of the guard zone ahead of the matrix A. * * IPOST (input) INTEGER * The size of the guard zone behind the matrix A. * * CHECKVAL (input) double complex * The value to insert into the guard zones. * * ==================================================================== * * .. Local Scalars .. INTEGER I, J, K * .. * .. Executable Statements .. * * Put check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE MEM( I ) = CHECKVAL 10 CONTINUE END IF * * Put check buffer in back of A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 MEM( I ) = CHECKVAL 20 CONTINUE END IF * * Put check buffer in all (LDA-M) gaps * IF( LDA .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = K, K+LDA-M-1 MEM( I ) = CHECKVAL 30 CONTINUE K = K + LDA 40 CONTINUE END IF * * If the matrix is upper or lower trapezoidal, calculate the * additional triangular area which needs to be padded, Each * element referred to is in the Ith row and the Jth column. * IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 41 I = 1, M DO 42 J = 1, I K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 42 CONTINUE 41 CONTINUE ELSE DO 43 I = 2, M DO 44 J = 1, I-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 44 CONTINUE 43 CONTINUE END IF ELSE IF( DIAG .EQ. 'U' ) THEN DO 45 I = M-N+1, M DO 46 J = 1, I-(M-N) K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 46 CONTINUE 45 CONTINUE ELSE DO 47 I = M-N+2, M DO 48 J = 1, I-(M-N)-1 K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 48 CONTINUE 47 CONTINUE END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN DO 49 I = 1, M DO 50 J = N-M+I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 50 CONTINUE 49 CONTINUE ELSE DO 51 I = 1, M-1 DO 52 J = N-M+I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 52 CONTINUE 51 CONTINUE END IF ELSE IF( UPLO .EQ. 'U' ) THEN DO 53 I = 1, N DO 54 J = I, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 54 CONTINUE 53 CONTINUE ELSE DO 55 I = 1, N-1 DO 56 J = I+1, N K = IPRE + I + (J-1)*LDA MEM( K ) = CHECKVAL 56 CONTINUE 55 CONTINUE END IF END IF END IF * * End of ZPADMAT. * RETURN END * SUBROUTINE ZCHKPAD( UPLO, DIAG, M, N, MEM, LDA, RSRC, CSRC, $ MYROW, MYCOL, IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, IPRE, IPOST INTEGER TESTNUM, MAXERR, NERR DOUBLE COMPLEX CHECKVAL * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) DOUBLE COMPLEX MEM(*), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * ZCHKPAD: Check padding put in by PADMAT. * Checks that padding around target matrix has not been overwritten * by the previous point-to-point or broadcast send. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * MEM (input) double complex array, dimension(IPRE+IPOST+LDA*N). * Memory location IPRE elements in front of the matrix A. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * IPRE (input) INTEGER * The size of the guard zone before the start of A. * * IPOST (input) INTEGER * The size of guard zone after A. * * CHECKVAL (input) double complex * The value to pad matrix with. * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRZBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. LOGICAL ISTRAP INTEGER I, J, K, IRST, IRND, ICST, ICND, SRC, DEST INTEGER NPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Check buffer in front of A * IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = ERR_PRE ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 10 CONTINUE END IF * * Check buffer behind A * IF( IPOST .GT. 0 ) THEN J = IPRE + LDA*N + 1 DO 20 I = J, J+IPOST-1 IF( MEM(I) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I - J + 1 ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_POST ERRDBUF(1, NERR) = MEM(I) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 20 CONTINUE END IF * * Check all (LDA-M) gaps * IF( LDA .GT. M ) THEN DO 40 J = 1, N DO 30 I = M+1, LDA K = IPRE + (J-1)*LDA + I IF( MEM(K) .NE. CHECKVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_GAP ERRDBUF(1, NERR) = MEM(K) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 30 CONTINUE 40 CONTINUE END IF * * Determine limits of trapezoidal matrix * ISTRAP = .FALSE. IF( UPLO .EQ. 'U' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 2 IRND = M ICST = 1 ICND = M - 1 ELSEIF( M .GT. N ) THEN IRST = ( M-N ) + 2 IRND = M ICST = 1 ICND = N - 1 ENDIF IF( DIAG .EQ. 'U' ) THEN IRST = IRST - 1 ICND = ICND + 1 ENDIF ELSE IF( UPLO .EQ. 'L' ) THEN ISTRAP = .TRUE. IF( M .LE. N ) THEN IRST = 1 IRND = 1 ICST = ( N-M ) + 2 ICND = N ELSEIF( M .GT. N ) THEN IRST = 1 IRND = 1 ICST = 2 ICND = N ENDIF IF( DIAG .EQ. 'U' ) THEN ICST = ICST - 1 ENDIF ENDIF * * Check elements and report any errors * IF( ISTRAP ) THEN DO 100 J = ICST, ICND DO 105 I = IRST, IRND IF( MEM( IPRE + (J-1)*LDA + I ) .NE. CHECKVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = ERR_TRI ERRDBUF(1, NERR) = MEM( IPRE + (J-1)*LDA + I ) ERRDBUF(2, NERR) = CHECKVAL END IF END IF 105 CONTINUE * * Update the limits to allow filling in padding * IF( UPLO .EQ. 'U' ) THEN IRST = IRST + 1 ELSE IRND = IRND + 1 ENDIF 100 CONTINUE END IF * RETURN * * End of ZCHKPAD. * END * SUBROUTINE ZCHKMAT( UPLO, DIAG, M, N, A, LDA, RSRC, CSRC, $ MYROW, MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. CHARACTER*1 UPLO, DIAG INTEGER M, N, LDA, RSRC, CSRC, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR) DOUBLE COMPLEX A(LDA,N), ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * zCHKMAT: Check matrix to see whether there were any transmission * errors. * * Arguments * ========= * UPLO (input) CHARACTER*1 * Is the matrix A 'U'pper or 'L'ower trapezoidal, or 'G'eneral * rectangular? * * DIAG (input) CHARACTER*1 * For trapezoidal matrices, is the main diagonal included * ('N') or not ('U')? * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input) @up@(doctype) array, dimension (LDA,N) * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1, M). * * RSRC (input) INTEGER * The process row of the source of the matrix. * * CSRC (input) INTEGER * The process column of the source of the matrix. * * MYROW (input) INTEGER * Row of this process in the process grid. * * MYCOL (input) INTEGER * Column of this process in the process grid. * * * TESTNUM (input) INTEGER * The number of the test being checked. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRZBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * =================================================================== * * .. Local Scalars .. INTEGER I, J, NPROCS, SRC, DEST LOGICAL USEIT DOUBLE COMPLEX COMPVAL * .. * .. Local Arrays .. INTEGER ISEED(4) * .. * .. External Functions .. INTEGER IBTNPROCS DOUBLE COMPLEX ZBTRAN EXTERNAL ZBTRAN, IBTNPROCS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() SRC = RSRC * NPROCS + CSRC DEST = MYROW * NPROCS + MYCOL * * Initialize ISEED with the same values as used in ZGENMAT. * ISEED(1) = MOD( 1002 + TESTNUM*5 + SRC*3, 4096 ) ISEED(2) = MOD( 2027 + TESTNUM*7 + SRC, 4096 ) ISEED(3) = MOD( 1234 + TESTNUM + SRC*3, 4096 ) ISEED(4) = MOD( 4311 + TESTNUM*10 + SRC*2, 4096 ) * * Generate the elements randomly with the same method used in GENMAT. * Note that for trapezoidal matrices, we generate all elements in the * enclosing rectangle and then ignore the complementary triangle. * DO 100 J = 1, N DO 105 I = 1, M COMPVAL = ZBTRAN( ISEED ) * * Now determine whether we actually need this value. The * strategy is to chop out the proper triangle based on what * particular kind of trapezoidal matrix we're dealing with. * USEIT = .TRUE. IF( UPLO .EQ. 'U' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( I .GE. J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. J ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( I .GE. M-N+J ) THEN USEIT = .FALSE. END IF ELSE IF( I .GT. M-N+J ) THEN USEIT = .FALSE. END IF END IF END IF ELSE IF( UPLO .EQ. 'L' ) THEN IF( M .LE. N ) THEN IF( DIAG .EQ. 'U' ) THEN IF( J. GE. I+(N-M) ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I+(N-M) ) THEN USEIT = .FALSE. END IF END IF ELSE IF( DIAG .EQ. 'U' ) THEN IF( J .GE. I ) THEN USEIT = .FALSE. END IF ELSE IF( J .GT. I ) THEN USEIT = .FALSE. END IF END IF END IF END IF * * Compare the generated value to the one that's in the * received matrix. If they don't match, tack another * error record onto what's already there. * IF( USEIT ) THEN IF( A(I,J) .NE. COMPVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = SRC ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I, J) ERRDBUF(2, NERR) = COMPVAL END IF END IF END IF 105 CONTINUE 100 CONTINUE RETURN * * End of ZCHKMAT. * END * SUBROUTINE ZPRINTERRS( OUTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF, COUNTING, TFAILED ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. LOGICAL COUNTING INTEGER OUTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), TFAILED(*) DOUBLE COMPLEX ERRDBUF(2, MAXERR) * .. * * Purpose * ======= * ZPRINTERRS: Print errors that have been recorded * * Arguments * ========= * OUTNUM (input) INTEGER * Device number for output. * * MAXERR (input) INTEGER * Max number of errors that can be stored in ERRIBUFF or * ERRZBUFF * * NERR (output) INTEGER * The number of errors that have been found. * * ERRIBUF (output) INTEGER array, dimension (6,MAXERRS) * Buffer in which to store integer error information. It will * be built up in the following format for the call to TSEND. * All integer information is recorded in the following 6-tuple * {TESTNUM, SRC, DEST, I, J, WHAT}. These values are figured: * SRC = RSRC * NPROCS + CSRC * DEST = RDEST * NPROCS + CDEST * WHAT * = 1 : Error in pre-padding * = 2 : Error in post-padding * = 3 : Error in LDA-M gap * = 4 : Error in complementory triangle * ELSE: Error in matrix * If there are more errors than can fit in the error buffer, * the error number will indicate the actual number of errors * found, but the buffer will be truncated to the maximum * number of errors which can fit. * * ERRDBUF (output) @(doctype) array, dimension (2, MAXERRS) * Buffer in which to store error data information. * {Incorrect, Predicted} * * TFAILED (input/ourput) INTEGER array, dimension NTESTS * Workspace used to keep track of which tests failed. * This array not accessed unless COUNTING is true. * * =================================================================== * * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS EXTERNAL IBTMYPROC, IBTNPROCS * .. * .. Local Scalars .. CHARACTER*1 MAT LOGICAL MATISINT INTEGER OLDTEST, NPROCS, PROW, PCOL, I, ERRTYPE * .. * .. Executable Statements .. * IF( (IBTMYPROC().NE.0) .OR. (NERR.LE.0) ) RETURN OLDTEST = -1 NPROCS = IBTNPROCS() PROW = ERRIBUF(3,1) / NPROCS PCOL = MOD( ERRIBUF(3,1), NPROCS ) IF( NERR .GT. MAXERR ) WRITE(OUTNUM,13000) * DO 20 I = 1, MIN( NERR, MAXERR ) IF( ERRIBUF(1,I) .NE. OLDTEST ) THEN IF( OLDTEST .NE. -1 ) $ WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST WRITE(OUTNUM,*) ' ' WRITE(OUTNUM,1000) PROW, PCOL, ERRIBUF(1,I) IF( COUNTING ) TFAILED( ERRIBUF(1,I) ) = 1 OLDTEST = ERRIBUF(1, I) END IF * * Print out error message depending on type of error * ERRTYPE = ERRIBUF(6, I) IF( ERRTYPE .LT. -10 ) THEN ERRTYPE = -ERRTYPE - 10 MAT = 'C' MATISINT = .TRUE. ELSE IF( ERRTYPE .LT. 0 ) THEN ERRTYPE = -ERRTYPE MAT = 'R' MATISINT = .TRUE. ELSE MATISINT = .FALSE. END IF * * RA/CA arrays from MAX/MIN have different printing protocol * IF( MATISINT ) THEN IF( ERRIBUF(2, I) .EQ. -1 ) THEN WRITE(OUTNUM,11000) ERRIBUF(4,I), ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,7000) ERRIBUF(5,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,8000) ERRIBUF(4,I), MAT, $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,9000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), INT( ERRDBUF(1,I) ) ELSE WRITE(OUTNUM,10000) MAT, ERRIBUF(4,I), ERRIBUF(5,I), $ INT( ERRDBUF(2,I) ), $ INT( ERRDBUF(1,I) ) END IF * * Have memory overwrites in matrix A * ELSE IF( ERRTYPE .EQ. ERR_PRE ) THEN WRITE(OUTNUM,2000) ERRIBUF(5,I), $ REAL( ERRDBUF(2,I) ), DIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), DIMAG( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_POST ) THEN WRITE(OUTNUM,3000) ERRIBUF(4,I), $ REAL( ERRDBUF(2,I) ), DIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), DIMAG( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_GAP ) THEN WRITE(OUTNUM,4000) $ ERRIBUF(4,I), ERRIBUF(5,I), $ REAL( ERRDBUF(2,I) ), DIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), DIMAG( ERRDBUF(1,I) ) ELSE IF( ERRTYPE .EQ. ERR_TRI ) THEN WRITE(OUTNUM,5000) ERRIBUF(4,I), ERRIBUF(5,I), $ REAL( ERRDBUF(2,I) ), DIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), DIMAG( ERRDBUF(1,I) ) ELSE WRITE(OUTNUM,6000) ERRIBUF(4,I), ERRIBUF(5,I), $ REAL( ERRDBUF(2,I) ), DIMAG( ERRDBUF(2,I) ), $ REAL( ERRDBUF(1,I) ), DIMAG( ERRDBUF(1,I) ) END IF END IF 20 CONTINUE WRITE(OUTNUM,12000) PROW, PCOL, OLDTEST * 1000 FORMAT('PROCESS {',I4,',',I4,'} REPORTS ERRORS IN TEST#',I6,':') 2000 FORMAT(' Buffer overwrite ',I4, $ ' elements before the start of A:',/, $ ' Expected=','[',G22.15,',',G22.15,']', $ '; Received=','[',G22.15,',',G22.15,']') 3000 FORMAT(' Buffer overwrite ',I4,' elements after the end of A:', $ /,' Expected=','[',G22.15,',',G22.15,']', $ '; Received=','[',G22.15,',',G22.15,']') 4000 FORMAT(' LDA-M gap overwrite at postion (',I4,',',I4,'):',/, $ ' Expected=','[',G22.15,',',G22.15,']', $ '; Received=','[',G22.15,',',G22.15,']') 5000 FORMAT(' Complementory triangle overwrite at A(',I4,',',I4, $ '):',/,' Expected=','[',G22.15,',',G22.15,']', $ '; Received=','[',G22.15,',',G22.15,']') 6000 FORMAT(' Invalid element at A(',I4,',',I4,'):',/, $ ' Expected=','[',G22.15,',',G22.15,']', $ '; Received=','[',G22.15,',',G22.15,']') 7000 FORMAT(' Buffer overwrite ',I4,' elements before the start of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) 8000 FORMAT(' Buffer overwrite ',I4,' elements after the end of ', $ A1,'A:',/,' Expected=',I12,'; Received=',I12) * 9000 FORMAT(' LD',A1,'A-M gap overwrite at postion (',I4,',',I4,'):' $ ,/,' Expected=',I12,'; Received=',I12) * 10000 FORMAT(' Invalid element at ',A1,'A(',I4,',',I4,'):',/, $ ' Expected=',I12,'; Received=',I12) 11000 FORMAT(' Overwrite at position (',I4,',',I4,') of non-existent ' $ ,A1,'A array.',/,' Expected=',I12,'; Received=',I12) 12000 FORMAT('PROCESS {',I4,',',I4,'} DONE ERROR REPORT FOR TEST#', $ I6,'.') 13000 FORMAT('WARNING: There were more errors than could be recorded.', $ /,'Increase MEMELTS to get complete listing.') RETURN * * End ZPRINTERRS * END * * SUBROUTINE ISUMTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*) INTEGER MEM(MEMLEN) * .. * * Purpose * ======= * ITESTSUM: Test integer SUM COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * MEM (workspace) INTEGER array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGSUM2D EXTERNAL IINITMAT, ICHKPAD, IBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CDEST, CDEST2, CONTEXT, ERRDPTR, ERRIPTR, I, IAM, $ IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, ISIZE, ISTART, $ ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, ITR2, J, K, LDA, $ LDADST, LDASRC, M, MAXERR, MYCOL, MYROW, N, NERR, NFAIL, $ NPCOL, NPROW, NSKIP, PREAPTR, RDEST, RDEST2, SETWHAT, $ TESTNUM INTEGER CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = -911 IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( ISIZE * (MEMLEN-I) ) / ( ISIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SUM tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL IINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * CALL IGSUM2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RDEST2, $ CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL ICHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL ICHKSUM(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL IBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL IBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('INTEGER SUM TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,6I6,2I5) 7000 FORMAT('INTEGER SUM TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('INTEGER SUM TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of ITESTSUM. * END * INTEGER FUNCTION IBTABS(VAL) INTEGER VAL IBTABS = ABS(VAL) RETURN END * SUBROUTINE ICHKSUM( SCOPE, ICTXT, M, N, A, LDA, TESTNUM, MAXERR, $ NERR, ERRIBUF, ERRDBUF, ISEED ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), ISEED(*) INTEGER A(LDA,*), ERRDBUF(2, MAXERR) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS INTEGER IBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, IBTRAN * .. * .. Local Scalars .. INTEGER NPROCS, NPROW, NPCOL, MYROW, MYCOL, NODE, NNODES, DEST INTEGER I, J, K INTEGER ANS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M ANS = 0 DO 40 K = 0, NNODES-1 ANS = ANS + IBTRAN( ISEED(K*4+1) ) 40 CONTINUE * * The error bound is figured by * 2 * eps * (nnodes-1) * max(|max element|, |ans|). * The 2 allows for errors in the distributed _AND_ local result. * The eps is machine epsilon. The number of floating point adds * is (nnodes - 1). We use the fact that 0.5 is the maximum element * in order to save ourselves some computation. * IF( ANS .NE. A(I,J) ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = ANS END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of ICHKSUM * END * * SUBROUTINE SSUMTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*) REAL MEM(MEMLEN) * .. * * Purpose * ======= * STESTSUM: Test real SUM COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * MEM (workspace) REAL array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGSUM2D EXTERNAL SINITMAT, SCHKPAD, SBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CDEST, CDEST2, CONTEXT, ERRDPTR, ERRIPTR, I, IAM, $ IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, ISIZE, ISTART, $ ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, ITR2, J, K, LDA, $ LDADST, LDASRC, M, MAXERR, MYCOL, MYROW, N, NERR, NFAIL, $ NPCOL, NPROW, NSKIP, PREAPTR, RDEST, RDEST2, SETWHAT, $ SSIZE, TESTNUM REAL CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = -0.61E0 IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') SSIZE = IBTSIZEOF('S') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( SSIZE * (MEMLEN-I) ) / ( SSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SUM tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL SINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * CALL SGSUM2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RDEST2, $ CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL SCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL SCHKSUM(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL SBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL SBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('REAL SUM TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,6I6,2I5) 7000 FORMAT('REAL SUM TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('REAL SUM TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of STESTSUM. * END * REAL FUNCTION SBTABS(VAL) REAL VAL SBTABS = ABS(VAL) RETURN END * REAL FUNCTION SBTEPS() * * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTMSGID REAL SLAMCH EXTERNAL IBTMYPROC, IBTNPROCS, IBTMSGID, SLAMCH * .. * .. Local Scalars .. INTEGER I, IAM, NNODES REAL EPS, EPS2 SAVE EPS DATA EPS /-22.0E0/ * .. * .. Executable Statements .. * * First time called, must get max epsilon possessed by any * participating process * IF( EPS .EQ. -22.0E0 ) THEN IAM = IBTMYPROC() NNODES = IBTNPROCS() EPS = SLAMCH('epsilon') IF( IAM .EQ. 0 ) THEN IF( NNODES .GT. 1 ) THEN DO 10 I = 1, NNODES-1 CALL BTRECV( 4, 1, EPS2, I, IBTMSGID()+20 ) IF( EPS .LT. EPS2 ) EPS = EPS2 10 CONTINUE END IF CALL BTSEND( 4, 1, EPS, -1, IBTMSGID()+20 ) ELSE CALL BTSEND( 4, 1, EPS, 0, IBTMSGID()+20 ) CALL BTRECV( 4, 1, EPS, 0, IBTMSGID()+20 ) ENDIF END IF SBTEPS = EPS RETURN * * End SBTEPS * END * SUBROUTINE SCHKSUM( SCOPE, ICTXT, M, N, A, LDA, TESTNUM, MAXERR, $ NERR, ERRIBUF, ERRDBUF, ISEED ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), ISEED(*) REAL A(LDA,*), ERRDBUF(2, MAXERR) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS REAL SBTEPS REAL SBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, SBTEPS, SBTRAN * .. * .. Local Scalars .. INTEGER NPROCS, NPROW, NPCOL, MYROW, MYCOL, NODE, NNODES, DEST INTEGER I, J, K REAL ANS, EPS, ERRBND, POSNUM, NEGNUM, TMP * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = SBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M ANS = 0 POSNUM = 0 NEGNUM = 0 DO 40 K = 0, NNODES-1 TMP = SBTRAN( ISEED(K*4+1) ) IF( TMP .LT. 0 ) THEN NEGNUM = NEGNUM + TMP ELSE POSNUM = POSNUM + TMP END IF ANS = ANS + TMP 40 CONTINUE * * The error bound is figured by * 2 * eps * (nnodes-1) * max(|max element|, |ans|). * The 2 allows for errors in the distributed _AND_ local result. * The eps is machine epsilon. The number of floating point adds * is (nnodes - 1). We use the fact that 0.5 is the maximum element * in order to save ourselves some computation. * ERRBND = 2 * EPS * NNODES * MAX( POSNUM, -NEGNUM ) IF( ABS( ANS - A(I,J) ) .GT. ERRBND ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = ANS END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of SCHKSUM * END * * SUBROUTINE DSUMTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*) DOUBLE PRECISION MEM(MEMLEN) * .. * * Purpose * ======= * DTESTSUM: Test double precision SUM COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * MEM (workspace) DOUBLE PRECISION array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGSUM2D EXTERNAL DINITMAT, DCHKPAD, DBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CDEST, CDEST2, CONTEXT, DSIZE, ERRDPTR, ERRIPTR, I, $ IAM, IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, ISIZE, ISTART, $ ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, ITR2, J, K, LDA, $ LDADST, LDASRC, M, MAXERR, MYCOL, MYROW, N, NERR, NFAIL, $ NPCOL, NPROW, NSKIP, PREAPTR, RDEST, RDEST2, SETWHAT, $ TESTNUM DOUBLE PRECISION CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = -0.81D0 IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') DSIZE = IBTSIZEOF('D') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( DSIZE * (MEMLEN-I) ) / ( DSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SUM tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL DINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * CALL DGSUM2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RDEST2, $ CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL DCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL DCHKSUM(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL DBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL DBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE PRECISION SUM TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,6I6,2I5) 7000 FORMAT('DOUBLE PRECISION SUM TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('DOUBLE PRECISION SUM TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of DTESTSUM. * END * DOUBLE PRECISION FUNCTION DBTABS(VAL) DOUBLE PRECISION VAL DBTABS = ABS(VAL) RETURN END * DOUBLE PRECISION FUNCTION DBTEPS() * * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTMSGID DOUBLE PRECISION DLAMCH EXTERNAL IBTMYPROC, IBTNPROCS, IBTMSGID, DLAMCH * .. * .. Local Scalars .. INTEGER I, IAM, NNODES DOUBLE PRECISION EPS, EPS2 SAVE EPS DATA EPS /-22.0D0/ * .. * .. Executable Statements .. * * First time called, must get max epsilon possessed by any * participating process * IF( EPS .EQ. -22.0D0 ) THEN IAM = IBTMYPROC() NNODES = IBTNPROCS() EPS = DLAMCH('epsilon') IF( IAM .EQ. 0 ) THEN IF( NNODES .GT. 1 ) THEN DO 10 I = 1, NNODES-1 CALL BTRECV( 6, 1, EPS2, I, IBTMSGID()+20 ) IF( EPS .LT. EPS2 ) EPS = EPS2 10 CONTINUE END IF CALL BTSEND( 6, 1, EPS, -1, IBTMSGID()+20 ) ELSE CALL BTSEND( 6, 1, EPS, 0, IBTMSGID()+20 ) CALL BTRECV( 6, 1, EPS, 0, IBTMSGID()+20 ) ENDIF END IF DBTEPS = EPS RETURN * * End DBTEPS * END * SUBROUTINE DCHKSUM( SCOPE, ICTXT, M, N, A, LDA, TESTNUM, MAXERR, $ NERR, ERRIBUF, ERRDBUF, ISEED ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), ISEED(*) DOUBLE PRECISION A(LDA,*), ERRDBUF(2, MAXERR) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS DOUBLE PRECISION DBTEPS DOUBLE PRECISION DBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, DBTEPS, DBTRAN * .. * .. Local Scalars .. INTEGER NPROCS, NPROW, NPCOL, MYROW, MYCOL, NODE, NNODES, DEST INTEGER I, J, K DOUBLE PRECISION ANS, EPS, ERRBND, POSNUM, NEGNUM, TMP * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = DBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M ANS = 0 POSNUM = 0 NEGNUM = 0 DO 40 K = 0, NNODES-1 TMP = DBTRAN( ISEED(K*4+1) ) IF( TMP .LT. 0 ) THEN NEGNUM = NEGNUM + TMP ELSE POSNUM = POSNUM + TMP END IF ANS = ANS + TMP 40 CONTINUE * * The error bound is figured by * 2 * eps * (nnodes-1) * max(|max element|, |ans|). * The 2 allows for errors in the distributed _AND_ local result. * The eps is machine epsilon. The number of floating point adds * is (nnodes - 1). We use the fact that 0.5 is the maximum element * in order to save ourselves some computation. * ERRBND = 2 * EPS * NNODES * MAX( POSNUM, -NEGNUM ) IF( ABS( ANS - A(I,J) ) .GT. ERRBND ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = ANS END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of DCHKSUM * END * * SUBROUTINE CSUMTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*) COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * CTESTSUM: Test complex SUM COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * MEM (workspace) COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGSUM2D EXTERNAL CINITMAT, CCHKPAD, CBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CDEST, CDEST2, CONTEXT, CSIZE, ERRDPTR, ERRIPTR, I, $ IAM, IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, ISIZE, ISTART, $ ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, ITR2, J, K, LDA, $ LDADST, LDASRC, M, MAXERR, MYCOL, MYROW, N, NERR, NFAIL, $ NPCOL, NPROW, NSKIP, PREAPTR, RDEST, RDEST2, SETWHAT, $ TESTNUM COMPLEX CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = CMPLX( -0.91E0, -0.71E0 ) IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') CSIZE = IBTSIZEOF('C') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( CSIZE * (MEMLEN-I) ) / ( CSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SUM tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL CINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * CALL CGSUM2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RDEST2, $ CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL CCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL CCHKSUM(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL CBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL CBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('COMPLEX SUM TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,6I6,2I5) 7000 FORMAT('COMPLEX SUM TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('COMPLEX SUM TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of CTESTSUM. * END * REAL FUNCTION CBTABS(VAL) COMPLEX VAL CBTABS = ABS( REAL(VAL) ) + ABS( AIMAG(VAL) ) RETURN END * SUBROUTINE CCHKSUM( SCOPE, ICTXT, M, N, A, LDA, TESTNUM, MAXERR, $ NERR, ERRIBUF, ERRDBUF, ISEED ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), ISEED(*) COMPLEX A(LDA,*), ERRDBUF(2, MAXERR) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS REAL SBTEPS COMPLEX CBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, SBTEPS, CBTRAN * .. * .. Local Scalars .. LOGICAL NUMOK INTEGER NPROCS, NPROW, NPCOL, MYROW, MYCOL, NODE, NNODES, DEST INTEGER I, J, K COMPLEX ANS, TMP REAL EPS, ERRBND, RPOSNUM, RNEGNUM, IPOSNUM, INEGNUM * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = SBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M ANS = 0 RPOSNUM = 0 RNEGNUM = 0 IPOSNUM = 0 INEGNUM = 0 DO 40 K = 0, NNODES-1 TMP = CBTRAN( ISEED(K*4+1) ) IF( REAL( TMP ) .LT. 0 ) THEN RNEGNUM = RNEGNUM + REAL( TMP ) ELSE RPOSNUM = RPOSNUM + REAL( TMP ) END IF IF( AIMAG( TMP ) .LT. 0 ) THEN INEGNUM = INEGNUM + AIMAG( TMP ) ELSE IPOSNUM = IPOSNUM + AIMAG( TMP ) END IF ANS = ANS + TMP 40 CONTINUE * * The error bound is figured by * 2 * eps * (nnodes-1) * max(|max element|, |ans|). * The 2 allows for errors in the distributed _AND_ local result. * The eps is machine epsilon. The number of floating point adds * is (nnodes - 1). We use the fact that 0.5 is the maximum element * in order to save ourselves some computation. * TMP = ANS - A(I,J) ERRBND = 2 * EPS * NNODES * MAX( RPOSNUM, -RNEGNUM ) NUMOK = ( REAL(TMP) .LE. ERRBND ) ERRBND = 2 * EPS * NNODES * MAX( IPOSNUM, -INEGNUM ) NUMOK = NUMOK .AND. ( AIMAG(TMP) .LE. ERRBND ) IF( .NOT.NUMOK ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = ANS END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of CCHKSUM * END * * SUBROUTINE ZSUMTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*) DOUBLE COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * ZTESTSUM: Test double complex SUM COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * MEM (workspace) DOUBLE COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, ZGSUM2D EXTERNAL ZINITMAT, ZCHKPAD, ZBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CDEST, CDEST2, CONTEXT, ERRDPTR, ERRIPTR, I, IAM, $ IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, ISIZE, ISTART, $ ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, ITR2, J, K, LDA, $ LDADST, LDASRC, M, MAXERR, MYCOL, MYROW, N, NERR, NFAIL, $ NPCOL, NPROW, NSKIP, PREAPTR, RDEST, RDEST2, SETWHAT, $ TESTNUM, ZSIZE DOUBLE COMPLEX CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = DCMPLX( -9.11D0, -9.21D0 ) IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') ZSIZE = IBTSIZEOF('Z') * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE MAXERR = ( ZSIZE * (MEMLEN-I) ) / ( ZSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run SUM tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL ZINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * CALL ZGSUM2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RDEST2, $ CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL ZCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL ZCHKSUM(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL ZBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL ZBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE COMPLEX SUM TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,6I6,2I5) 7000 FORMAT('DOUBLE COMPLEX SUM TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('DOUBLE COMPLEX SUM TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of ZTESTSUM. * END * DOUBLE PRECISION FUNCTION ZBTABS(VAL) DOUBLE COMPLEX VAL ZBTABS = ABS( DBLE(VAL) ) + ABS( DIMAG(VAL) ) RETURN END * SUBROUTINE ZCHKSUM( SCOPE, ICTXT, M, N, A, LDA, TESTNUM, MAXERR, $ NERR, ERRIBUF, ERRDBUF, ISEED ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER ERRIBUF(6, MAXERR), ISEED(*) DOUBLE COMPLEX A(LDA,*), ERRDBUF(2, MAXERR) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS DOUBLE PRECISION DBTEPS DOUBLE COMPLEX ZBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, DBTEPS, ZBTRAN * .. * .. Local Scalars .. LOGICAL NUMOK INTEGER NPROCS, NPROW, NPCOL, MYROW, MYCOL, NODE, NNODES, DEST INTEGER I, J, K DOUBLE COMPLEX ANS, TMP DOUBLE PRECISION EPS, ERRBND, RPOSNUM, RNEGNUM, IPOSNUM, INEGNUM * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = DBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M ANS = 0 RPOSNUM = 0 RNEGNUM = 0 IPOSNUM = 0 INEGNUM = 0 DO 40 K = 0, NNODES-1 TMP = ZBTRAN( ISEED(K*4+1) ) IF( DBLE( TMP ) .LT. 0 ) THEN RNEGNUM = RNEGNUM + DBLE( TMP ) ELSE RPOSNUM = RPOSNUM + DBLE( TMP ) END IF IF( DIMAG( TMP ) .LT. 0 ) THEN INEGNUM = INEGNUM + DIMAG( TMP ) ELSE IPOSNUM = IPOSNUM + DIMAG( TMP ) END IF ANS = ANS + TMP 40 CONTINUE * * The error bound is figured by * 2 * eps * (nnodes-1) * max(|max element|, |ans|). * The 2 allows for errors in the distributed _AND_ local result. * The eps is machine epsilon. The number of floating point adds * is (nnodes - 1). We use the fact that 0.5 is the maximum element * in order to save ourselves some computation. * TMP = ANS - A(I,J) ERRBND = 2 * EPS * NNODES * MAX( RPOSNUM, -RNEGNUM ) NUMOK = ( DBLE(TMP) .LE. ERRBND ) ERRBND = 2 * EPS * NNODES * MAX( IPOSNUM, -INEGNUM ) NUMOK = NUMOK .AND. ( DIMAG(TMP) .LE. ERRBND ) IF( .NOT.NUMOK ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = ANS END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of ZCHKSUM * END * * SUBROUTINE IAMXTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) INTEGER MEM(MEMLEN) * .. * * Purpose * ======= * ITESTAMX: Test integer AMX COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) INTEGER array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D EXTERNAL IINITMAT, ICHKPAD, IBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, ERRDPTR, ERRIPTR, I, $ IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, $ ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, $ ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, MAXERR, MYCOL, $ MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, PREAPTR, $ RAPTR, RDEST, RDEST2, SETWHAT, TESTNUM, VALPTR INTEGER CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = -911 IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( ISIZE * (MEMLEN-I) ) / ( ISIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MAX tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL IINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL IGAMX2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL ICHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL ICHKAMX(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL IRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL IBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL IBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('INTEGER AMX TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('INTEGER AMX TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('INTEGER AMX TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of ITESTAMX. * END * SUBROUTINE IBTSPCOORD( SCOPE, PNUM, MYROW, MYCOL, NPCOL, $ PROW, PCOL ) CHARACTER*1 SCOPE INTEGER PNUM, MYROW, MYCOL, NPCOL, PROW, PCOL * IF( SCOPE .EQ. 'R' ) THEN PROW = MYROW PCOL = PNUM ELSE IF( SCOPE .EQ. 'C' ) THEN PROW = PNUM PCOL = MYCOL ELSE PROW = PNUM / NPCOL PCOL = MOD( PNUM, NPCOL ) END IF RETURN * * End of ibtspcoord * END * INTEGER FUNCTION IBTSPNUM( SCOPE, PROW, PCOL, NPCOL ) CHARACTER*1 SCOPE INTEGER PROW, PCOL, NPCOL IF( SCOPE .EQ. 'R' ) THEN IBTSPNUM = PCOL ELSE IF( SCOPE .EQ. 'C' ) THEN IBTSPNUM = PROW ELSE IBTSPNUM = PROW*NPCOL + PCOL END IF * RETURN * * End of ibtscpnum * END * SUBROUTINE IRCCHK( IPRE, IPOST, PADVAL, M, N, RA, CA, LDI, MYROW, $ MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * .. Scalar Arguments .. INTEGER IPRE, IPOST, PADVAL, M, N, LDI, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR) INTEGER ERRDBUF(2, MAXERR) * .. * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, K, IAM * .. * .. Executable Statements .. * IAM = MYROW * IBTNPROCS() + MYCOL * * Check pre padding * IF( LDI .NE. -1 ) THEN IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = INT( RA(I) ) ERRDBUF(2, NERR) = INT( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = INT( CA(I) ) ERRDBUF(2, NERR) = INT( PADVAL ) END IF ENDIF 10 CONTINUE END IF * * Check post padding * IF( IPOST .GT. 0 ) THEN K = IPRE + LDI*N DO 20 I = K+1, K+IPOST IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -ERR_POST ERRDBUF(1, NERR) = INT( RA(I) ) ERRDBUF(2, NERR) = INT( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -10 - ERR_POST ERRDBUF(1, NERR) = INT( CA(I) ) ERRDBUF(2, NERR) = INT( PADVAL ) END IF ENDIF 20 CONTINUE END IF * * Check all (LDI-M) gaps * IF( LDI .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = M+1, LDI K = IPRE + (J-1)*LDI + I IF( RA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -ERR_GAP ERRDBUF(1, NERR) = INT( RA(K) ) ERRDBUF(2, NERR) = INT( PADVAL ) END IF END IF IF( CA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -10 - ERR_GAP ERRDBUF(1, NERR) = INT( CA(K) ) ERRDBUF(2, NERR) = INT( PADVAL ) END IF END IF 30 CONTINUE 40 CONTINUE END IF * * if RA and CA don't exist, buffs better be untouched * ELSE DO 50 I = 1, IPRE+IPOST IF( RA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = INT( RA(I) ) ERRDBUF(2, NERR) = INT( PADVAL ) END IF END IF IF( CA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = INT( CA(I) ) ERRDBUF(2, NERR) = INT( PADVAL ) END IF END IF 50 CONTINUE ENDIF * RETURN END * SUBROUTINE ICHKAMX( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) INTEGER A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM, IBTRAN, IBTABS EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, IBTRAN EXTERNAL IBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMX, CAMX INTEGER IAMX, I, J, K, H, DEST, NODE * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = IBTRAN( ISEED ) IAMX = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = IBTRAN( ISEED(K*4+1) ) IF( IBTABS( VALS(K+1) ) .GT. IBTABS( VALS(IAMX) ) ) $ IAMX = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMX) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = IBTABS( VALS(K) ).NE.IBTABS( VALS(IAMX) ) IF( .NOT.ERROR ) IAMX = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ( IBTABS( A(I,J) ) .NE. IBTABS( VALS(IAMX) ) ) IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMX) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMX ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMX) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMX-1, MYROW, MYCOL, $ NPCOL, RAMX, CAMX ) IF( RAMX .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMX END IF IF( CAMX .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMX END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of ICHKAMX * END * * SUBROUTINE SAMXTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) REAL MEM(MEMLEN) * .. * * Purpose * ======= * STESTAMX: Test real AMX COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) REAL array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGAMX2D EXTERNAL SINITMAT, SCHKPAD, SBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, ERRDPTR, ERRIPTR, I, $ IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, $ ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, $ ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, MAXERR, MYCOL, $ MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, PREAPTR, $ RAPTR, RDEST, RDEST2, SETWHAT, SSIZE, TESTNUM, VALPTR REAL CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = -0.61E0 IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') SSIZE = IBTSIZEOF('S') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( SSIZE * (MEMLEN-I) ) / ( SSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MAX tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL SINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL SGAMX2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL SCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL SCHKAMX(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL SRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL SBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL SBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('REAL AMX TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('REAL AMX TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('REAL AMX TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of STESTAMX. * END * SUBROUTINE SRCCHK( IPRE, IPOST, PADVAL, M, N, RA, CA, LDI, MYROW, $ MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * .. Scalar Arguments .. INTEGER IPRE, IPOST, PADVAL, M, N, LDI, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR) REAL ERRDBUF(2, MAXERR) * .. * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, K, IAM * .. * .. Executable Statements .. * IAM = MYROW * IBTNPROCS() + MYCOL * * Check pre padding * IF( LDI .NE. -1 ) THEN IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = REAL( RA(I) ) ERRDBUF(2, NERR) = REAL( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = REAL( CA(I) ) ERRDBUF(2, NERR) = REAL( PADVAL ) END IF ENDIF 10 CONTINUE END IF * * Check post padding * IF( IPOST .GT. 0 ) THEN K = IPRE + LDI*N DO 20 I = K+1, K+IPOST IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -ERR_POST ERRDBUF(1, NERR) = REAL( RA(I) ) ERRDBUF(2, NERR) = REAL( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -10 - ERR_POST ERRDBUF(1, NERR) = REAL( CA(I) ) ERRDBUF(2, NERR) = REAL( PADVAL ) END IF ENDIF 20 CONTINUE END IF * * Check all (LDI-M) gaps * IF( LDI .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = M+1, LDI K = IPRE + (J-1)*LDI + I IF( RA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -ERR_GAP ERRDBUF(1, NERR) = REAL( RA(K) ) ERRDBUF(2, NERR) = REAL( PADVAL ) END IF END IF IF( CA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -10 - ERR_GAP ERRDBUF(1, NERR) = REAL( CA(K) ) ERRDBUF(2, NERR) = REAL( PADVAL ) END IF END IF 30 CONTINUE 40 CONTINUE END IF * * if RA and CA don't exist, buffs better be untouched * ELSE DO 50 I = 1, IPRE+IPOST IF( RA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = REAL( RA(I) ) ERRDBUF(2, NERR) = REAL( PADVAL ) END IF END IF IF( CA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = REAL( CA(I) ) ERRDBUF(2, NERR) = REAL( PADVAL ) END IF END IF 50 CONTINUE ENDIF * RETURN END * SUBROUTINE SCHKAMX( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) REAL A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM REAL SBTEPS, SBTABS REAL SBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, SBTRAN, SBTEPS, SBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMX, CAMX INTEGER IAMX, I, J, K, H, DEST, NODE REAL EPS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = SBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = SBTRAN( ISEED ) IAMX = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = SBTRAN( ISEED(K*4+1) ) IF( SBTABS( VALS(K+1) ) .GT. SBTABS( VALS(IAMX) ) ) $ IAMX = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMX) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = SBTABS( VALS(K) ).NE.SBTABS( VALS(IAMX) ) IF( .NOT.ERROR ) IAMX = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ( SBTABS( A(I,J) ) .NE. SBTABS( VALS(IAMX) ) ) IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMX) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMX ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMX) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMX-1, MYROW, MYCOL, $ NPCOL, RAMX, CAMX ) IF( RAMX .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMX END IF IF( CAMX .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMX END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of SCHKAMX * END * * SUBROUTINE DAMXTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) DOUBLE PRECISION MEM(MEMLEN) * .. * * Purpose * ======= * DTESTAMX: Test double precision AMX COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) DOUBLE PRECISION array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMX2D EXTERNAL DINITMAT, DCHKPAD, DBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, DSIZE, ERRDPTR, $ ERRIPTR, I, IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, $ IPRE, ISC, ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, $ ITR, ITR1, ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, $ MAXERR, MYCOL, MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, $ PREAPTR, RAPTR, RDEST, RDEST2, SETWHAT, TESTNUM, VALPTR DOUBLE PRECISION CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = -0.81D0 IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') DSIZE = IBTSIZEOF('D') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( DSIZE * (MEMLEN-I) ) / ( DSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MAX tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL DINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL DGAMX2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL DCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL DCHKAMX(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL DRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL DBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL DBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE PRECISION AMX TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('DOUBLE PRECISION AMX TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('DOUBLE PRECISION AMX TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of DTESTAMX. * END * SUBROUTINE DRCCHK( IPRE, IPOST, PADVAL, M, N, RA, CA, LDI, MYROW, $ MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * .. Scalar Arguments .. INTEGER IPRE, IPOST, PADVAL, M, N, LDI, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR) DOUBLE PRECISION ERRDBUF(2, MAXERR) * .. * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, K, IAM * .. * .. Executable Statements .. * IAM = MYROW * IBTNPROCS() + MYCOL * * Check pre padding * IF( LDI .NE. -1 ) THEN IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = DBLE( RA(I) ) ERRDBUF(2, NERR) = DBLE( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = DBLE( CA(I) ) ERRDBUF(2, NERR) = DBLE( PADVAL ) END IF ENDIF 10 CONTINUE END IF * * Check post padding * IF( IPOST .GT. 0 ) THEN K = IPRE + LDI*N DO 20 I = K+1, K+IPOST IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -ERR_POST ERRDBUF(1, NERR) = DBLE( RA(I) ) ERRDBUF(2, NERR) = DBLE( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -10 - ERR_POST ERRDBUF(1, NERR) = DBLE( CA(I) ) ERRDBUF(2, NERR) = DBLE( PADVAL ) END IF ENDIF 20 CONTINUE END IF * * Check all (LDI-M) gaps * IF( LDI .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = M+1, LDI K = IPRE + (J-1)*LDI + I IF( RA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -ERR_GAP ERRDBUF(1, NERR) = DBLE( RA(K) ) ERRDBUF(2, NERR) = DBLE( PADVAL ) END IF END IF IF( CA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -10 - ERR_GAP ERRDBUF(1, NERR) = DBLE( CA(K) ) ERRDBUF(2, NERR) = DBLE( PADVAL ) END IF END IF 30 CONTINUE 40 CONTINUE END IF * * if RA and CA don't exist, buffs better be untouched * ELSE DO 50 I = 1, IPRE+IPOST IF( RA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = DBLE( RA(I) ) ERRDBUF(2, NERR) = DBLE( PADVAL ) END IF END IF IF( CA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = DBLE( CA(I) ) ERRDBUF(2, NERR) = DBLE( PADVAL ) END IF END IF 50 CONTINUE ENDIF * RETURN END * SUBROUTINE DCHKAMX( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) DOUBLE PRECISION A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM DOUBLE PRECISION DBTEPS, DBTABS DOUBLE PRECISION DBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, DBTRAN, DBTEPS, DBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMX, CAMX INTEGER IAMX, I, J, K, H, DEST, NODE DOUBLE PRECISION EPS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = DBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = DBTRAN( ISEED ) IAMX = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = DBTRAN( ISEED(K*4+1) ) IF( DBTABS( VALS(K+1) ) .GT. DBTABS( VALS(IAMX) ) ) $ IAMX = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMX) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = DBTABS( VALS(K) ).NE.DBTABS( VALS(IAMX) ) IF( .NOT.ERROR ) IAMX = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ( DBTABS( A(I,J) ) .NE. DBTABS( VALS(IAMX) ) ) IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMX) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMX ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMX) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMX-1, MYROW, MYCOL, $ NPCOL, RAMX, CAMX ) IF( RAMX .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMX END IF IF( CAMX .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMX END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of DCHKAMX * END * * SUBROUTINE CAMXTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * CTESTAMX: Test complex AMX COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGAMX2D EXTERNAL CINITMAT, CCHKPAD, CBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, CSIZE, ERRDPTR, $ ERRIPTR, I, IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, $ IPRE, ISC, ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, $ ITR, ITR1, ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, $ MAXERR, MYCOL, MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, $ PREAPTR, RAPTR, RDEST, RDEST2, SETWHAT, TESTNUM, VALPTR COMPLEX CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = CMPLX( -0.91E0, -0.71E0 ) IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') CSIZE = IBTSIZEOF('C') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( CSIZE * (MEMLEN-I) ) / ( CSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MAX tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL CINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL CGAMX2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL CCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL CCHKAMX(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL CRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL CBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL CBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('COMPLEX AMX TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('COMPLEX AMX TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('COMPLEX AMX TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of CTESTAMX. * END * SUBROUTINE CRCCHK( IPRE, IPOST, PADVAL, M, N, RA, CA, LDI, MYROW, $ MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * .. Scalar Arguments .. INTEGER IPRE, IPOST, PADVAL, M, N, LDI, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR) COMPLEX ERRDBUF(2, MAXERR) * .. * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, K, IAM * .. * .. Executable Statements .. * IAM = MYROW * IBTNPROCS() + MYCOL * * Check pre padding * IF( LDI .NE. -1 ) THEN IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = CMPLX( RA(I) ) ERRDBUF(2, NERR) = CMPLX( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = CMPLX( CA(I) ) ERRDBUF(2, NERR) = CMPLX( PADVAL ) END IF ENDIF 10 CONTINUE END IF * * Check post padding * IF( IPOST .GT. 0 ) THEN K = IPRE + LDI*N DO 20 I = K+1, K+IPOST IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -ERR_POST ERRDBUF(1, NERR) = CMPLX( RA(I) ) ERRDBUF(2, NERR) = CMPLX( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -10 - ERR_POST ERRDBUF(1, NERR) = CMPLX( CA(I) ) ERRDBUF(2, NERR) = CMPLX( PADVAL ) END IF ENDIF 20 CONTINUE END IF * * Check all (LDI-M) gaps * IF( LDI .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = M+1, LDI K = IPRE + (J-1)*LDI + I IF( RA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -ERR_GAP ERRDBUF(1, NERR) = CMPLX( RA(K) ) ERRDBUF(2, NERR) = CMPLX( PADVAL ) END IF END IF IF( CA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -10 - ERR_GAP ERRDBUF(1, NERR) = CMPLX( CA(K) ) ERRDBUF(2, NERR) = CMPLX( PADVAL ) END IF END IF 30 CONTINUE 40 CONTINUE END IF * * if RA and CA don't exist, buffs better be untouched * ELSE DO 50 I = 1, IPRE+IPOST IF( RA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = CMPLX( RA(I) ) ERRDBUF(2, NERR) = CMPLX( PADVAL ) END IF END IF IF( CA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = CMPLX( CA(I) ) ERRDBUF(2, NERR) = CMPLX( PADVAL ) END IF END IF 50 CONTINUE ENDIF * RETURN END * SUBROUTINE CCHKAMX( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) COMPLEX A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM REAL SBTEPS, CBTABS COMPLEX CBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, CBTRAN, SBTEPS, CBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMX, CAMX INTEGER IAMX, I, J, K, H, DEST, NODE REAL EPS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = SBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = CBTRAN( ISEED ) IAMX = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = CBTRAN( ISEED(K*4+1) ) IF( CBTABS( VALS(K+1) ) .GT. CBTABS( VALS(IAMX) ) ) $ IAMX = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMX) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = ABS( CBTABS(VALS(K)) - CBTABS(VALS(IAMX)) ) $ .GT. 3*EPS IF( .NOT.ERROR ) IAMX = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ABS( CBTABS(A(I,J)) - CBTABS(VALS(IAMX)) ) $ .GT. 3*EPS IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMX) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMX ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMX) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMX-1, MYROW, MYCOL, $ NPCOL, RAMX, CAMX ) IF( RAMX .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMX END IF IF( CAMX .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMX END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of CCHKAMX * END * * SUBROUTINE ZAMXTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) DOUBLE COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * ZTESTAMX: Test double complex AMX COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) DOUBLE COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, ZGAMX2D EXTERNAL ZINITMAT, ZCHKPAD, ZBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, ERRDPTR, ERRIPTR, I, $ IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, $ ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, $ ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, MAXERR, MYCOL, $ MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, PREAPTR, $ RAPTR, RDEST, RDEST2, SETWHAT, TESTNUM, VALPTR, ZSIZE DOUBLE COMPLEX CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = DCMPLX( -9.11D0, -9.21D0 ) IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') ZSIZE = IBTSIZEOF('Z') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( ZSIZE * (MEMLEN-I) ) / ( ZSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MAX tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL ZINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL ZGAMX2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL ZCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL ZCHKAMX(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL ZRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL ZBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL ZBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE COMPLEX AMX TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('DOUBLE COMPLEX AMX TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('DOUBLE COMPLEX AMX TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of ZTESTAMX. * END * SUBROUTINE ZRCCHK( IPRE, IPOST, PADVAL, M, N, RA, CA, LDI, MYROW, $ MYCOL, TESTNUM, MAXERR, NERR, $ ERRIBUF, ERRDBUF ) * * .. Scalar Arguments .. INTEGER IPRE, IPOST, PADVAL, M, N, LDI, MYROW, MYCOL, TESTNUM INTEGER MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR) DOUBLE COMPLEX ERRDBUF(2, MAXERR) * .. * .. Parameters .. INTEGER ERR_PRE, ERR_POST, ERR_GAP, ERR_TRI, ERR_MAT PARAMETER( ERR_PRE = 1, ERR_POST = 2, ERR_GAP = 3, ERR_TRI = 4 ) PARAMETER( ERR_MAT = 5 ) * .. * .. External Functions .. INTEGER IBTNPROCS EXTERNAL IBTNPROCS * .. * .. Local Scalars .. INTEGER I, J, K, IAM * .. * .. Executable Statements .. * IAM = MYROW * IBTNPROCS() + MYCOL * * Check pre padding * IF( LDI .NE. -1 ) THEN IF( IPRE .GT. 0 ) THEN DO 10 I = 1, IPRE IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = DCMPLX( RA(I) ) ERRDBUF(2, NERR) = DCMPLX( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE - I + 1 ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = DCMPLX( CA(I) ) ERRDBUF(2, NERR) = DCMPLX( PADVAL ) END IF ENDIF 10 CONTINUE END IF * * Check post padding * IF( IPOST .GT. 0 ) THEN K = IPRE + LDI*N DO 20 I = K+1, K+IPOST IF( RA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -ERR_POST ERRDBUF(1, NERR) = DCMPLX( RA(I) ) ERRDBUF(2, NERR) = DCMPLX( PADVAL ) END IF ENDIF IF( CA(I) .NE. PADVAL ) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I - K ERRIBUF(5, NERR) = I ERRIBUF(6, NERR) = -10 - ERR_POST ERRDBUF(1, NERR) = DCMPLX( CA(I) ) ERRDBUF(2, NERR) = DCMPLX( PADVAL ) END IF ENDIF 20 CONTINUE END IF * * Check all (LDI-M) gaps * IF( LDI .GT. M ) THEN K = IPRE + M + 1 DO 40 J = 1, N DO 30 I = M+1, LDI K = IPRE + (J-1)*LDI + I IF( RA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -ERR_GAP ERRDBUF(1, NERR) = DCMPLX( RA(K) ) ERRDBUF(2, NERR) = DCMPLX( PADVAL ) END IF END IF IF( CA(K) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -10 - ERR_GAP ERRDBUF(1, NERR) = DCMPLX( CA(K) ) ERRDBUF(2, NERR) = DCMPLX( PADVAL ) END IF END IF 30 CONTINUE 40 CONTINUE END IF * * if RA and CA don't exist, buffs better be untouched * ELSE DO 50 I = 1, IPRE+IPOST IF( RA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -ERR_PRE ERRDBUF(1, NERR) = DCMPLX( RA(I) ) ERRDBUF(2, NERR) = DCMPLX( PADVAL ) END IF END IF IF( CA(I) .NE. PADVAL) THEN NERR = NERR + 1 IF( NERR .LE. MAXERR ) THEN ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = LDI ERRIBUF(3, NERR) = IAM ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = IPRE+IPOST ERRIBUF(6, NERR) = -10 - ERR_PRE ERRDBUF(1, NERR) = DCMPLX( CA(I) ) ERRDBUF(2, NERR) = DCMPLX( PADVAL ) END IF END IF 50 CONTINUE ENDIF * RETURN END * SUBROUTINE ZCHKAMX( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) DOUBLE COMPLEX A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM DOUBLE PRECISION DBTEPS, ZBTABS DOUBLE COMPLEX ZBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, ZBTRAN, DBTEPS, ZBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMX, CAMX INTEGER IAMX, I, J, K, H, DEST, NODE DOUBLE PRECISION EPS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = DBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = ZBTRAN( ISEED ) IAMX = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = ZBTRAN( ISEED(K*4+1) ) IF( ZBTABS( VALS(K+1) ) .GT. ZBTABS( VALS(IAMX) ) ) $ IAMX = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMX) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = ABS( ZBTABS(VALS(K)) - ZBTABS(VALS(IAMX)) ) $ .GT. 3*EPS IF( .NOT.ERROR ) IAMX = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ABS( ZBTABS(A(I,J)) - ZBTABS(VALS(IAMX)) ) $ .GT. 3*EPS IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMX) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMX ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMX) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMX-1, MYROW, MYCOL, $ NPCOL, RAMX, CAMX ) IF( RAMX .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMX END IF IF( CAMX .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMX END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of ZCHKAMX * END * * SUBROUTINE IAMNTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) INTEGER MEM(MEMLEN) * .. * * Purpose * ======= * ITESTAMN: Test integer AMN COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) INTEGER array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMN2D EXTERNAL IINITMAT, ICHKPAD, IBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, ERRDPTR, ERRIPTR, I, $ IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, $ ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, $ ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, MAXERR, MYCOL, $ MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, PREAPTR, $ RAPTR, RDEST, RDEST2, SETWHAT, TESTNUM, VALPTR INTEGER CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = -911 IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( ISIZE * (MEMLEN-I) ) / ( ISIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MIN tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL IINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL IGAMN2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL ICHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL ICHKAMN(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL IRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL IBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL IBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('INTEGER AMN TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('INTEGER AMN TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('INTEGER AMN TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of ITESTAMN. * END * SUBROUTINE ICHKAMN( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) INTEGER A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM, IBTRAN, IBTABS EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, IBTRAN EXTERNAL IBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMN, CAMN INTEGER IAMN, I, J, K, H, DEST, NODE * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = IBTRAN( ISEED ) IAMN = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = IBTRAN( ISEED(K*4+1) ) IF( IBTABS( VALS(K+1) ) .LT. IBTABS( VALS(IAMN) ) ) $ IAMN = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMN) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = IBTABS( VALS(K) ).NE.IBTABS( VALS(IAMN) ) IF( .NOT.ERROR ) IAMN = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ( IBTABS( A(I,J) ) .NE. IBTABS( VALS(IAMN) ) ) IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMN) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMN ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMN) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMN-1, MYROW, MYCOL, $ NPCOL, RAMN, CAMN ) IF( RAMN .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMN END IF IF( CAMN .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMN END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of ICHKAMN * END * * SUBROUTINE SAMNTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) REAL MEM(MEMLEN) * .. * * Purpose * ======= * STESTAMN: Test real AMN COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) REAL array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, SGAMN2D EXTERNAL SINITMAT, SCHKPAD, SBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, ERRDPTR, ERRIPTR, I, $ IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, $ ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, $ ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, MAXERR, MYCOL, $ MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, PREAPTR, $ RAPTR, RDEST, RDEST2, SETWHAT, SSIZE, TESTNUM, VALPTR REAL CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = -0.61E0 IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') SSIZE = IBTSIZEOF('S') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( SSIZE * (MEMLEN-I) ) / ( SSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MIN tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL SINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL SGAMN2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL SCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL SCHKAMN(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL SRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL SBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL SBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('REAL AMN TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('REAL AMN TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('REAL AMN TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of STESTAMN. * END * SUBROUTINE SCHKAMN( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) REAL A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM REAL SBTEPS, SBTABS REAL SBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, SBTRAN, SBTEPS, SBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMN, CAMN INTEGER IAMN, I, J, K, H, DEST, NODE REAL EPS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = SBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = SBTRAN( ISEED ) IAMN = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = SBTRAN( ISEED(K*4+1) ) IF( SBTABS( VALS(K+1) ) .LT. SBTABS( VALS(IAMN) ) ) $ IAMN = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMN) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = SBTABS( VALS(K) ).NE.SBTABS( VALS(IAMN) ) IF( .NOT.ERROR ) IAMN = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ( SBTABS( A(I,J) ) .NE. SBTABS( VALS(IAMN) ) ) IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMN) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMN ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMN) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMN-1, MYROW, MYCOL, $ NPCOL, RAMN, CAMN ) IF( RAMN .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMN END IF IF( CAMN .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMN END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of SCHKAMN * END * * SUBROUTINE DAMNTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) DOUBLE PRECISION MEM(MEMLEN) * .. * * Purpose * ======= * DTESTAMN: Test double precision AMN COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) DOUBLE PRECISION array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DGAMN2D EXTERNAL DINITMAT, DCHKPAD, DBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, DSIZE, ERRDPTR, $ ERRIPTR, I, IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, $ IPRE, ISC, ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, $ ITR, ITR1, ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, $ MAXERR, MYCOL, MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, $ PREAPTR, RAPTR, RDEST, RDEST2, SETWHAT, TESTNUM, VALPTR DOUBLE PRECISION CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = -0.81D0 IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') DSIZE = IBTSIZEOF('D') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( DSIZE * (MEMLEN-I) ) / ( DSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MIN tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL DINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL DGAMN2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL DCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL DCHKAMN(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL DRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL DBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL DBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE PRECISION AMN TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('DOUBLE PRECISION AMN TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('DOUBLE PRECISION AMN TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of DTESTAMN. * END * SUBROUTINE DCHKAMN( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) DOUBLE PRECISION A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM DOUBLE PRECISION DBTEPS, DBTABS DOUBLE PRECISION DBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, DBTRAN, DBTEPS, DBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMN, CAMN INTEGER IAMN, I, J, K, H, DEST, NODE DOUBLE PRECISION EPS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = DBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = DBTRAN( ISEED ) IAMN = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = DBTRAN( ISEED(K*4+1) ) IF( DBTABS( VALS(K+1) ) .LT. DBTABS( VALS(IAMN) ) ) $ IAMN = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMN) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = DBTABS( VALS(K) ).NE.DBTABS( VALS(IAMN) ) IF( .NOT.ERROR ) IAMN = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ( DBTABS( A(I,J) ) .NE. DBTABS( VALS(IAMN) ) ) IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMN) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMN ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMN) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMN-1, MYROW, MYCOL, $ NPCOL, RAMN, CAMN ) IF( RAMN .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMN END IF IF( CAMN .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMN END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of DCHKAMN * END * * SUBROUTINE CAMNTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * CTESTAMN: Test complex AMN COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, CGAMN2D EXTERNAL CINITMAT, CCHKPAD, CBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, CSIZE, ERRDPTR, $ ERRIPTR, I, IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, $ IPRE, ISC, ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, $ ITR, ITR1, ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, $ MAXERR, MYCOL, MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, $ PREAPTR, RAPTR, RDEST, RDEST2, SETWHAT, TESTNUM, VALPTR COMPLEX CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = CMPLX( -0.91E0, -0.71E0 ) IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') CSIZE = IBTSIZEOF('C') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( CSIZE * (MEMLEN-I) ) / ( CSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MIN tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL CINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL CGAMN2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL CCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL CCHKAMN(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL CRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL CBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL CBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('COMPLEX AMN TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('COMPLEX AMN TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('COMPLEX AMN TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of CTESTAMN. * END * SUBROUTINE CCHKAMN( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) COMPLEX A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM REAL SBTEPS, CBTABS COMPLEX CBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, CBTRAN, SBTEPS, CBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMN, CAMN INTEGER IAMN, I, J, K, H, DEST, NODE REAL EPS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = SBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = CBTRAN( ISEED ) IAMN = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = CBTRAN( ISEED(K*4+1) ) IF( CBTABS( VALS(K+1) ) .LT. CBTABS( VALS(IAMN) ) ) $ IAMN = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMN) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = ABS( CBTABS(VALS(K)) - CBTABS(VALS(IAMN)) ) $ .GT. 3*EPS IF( .NOT.ERROR ) IAMN = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ABS( CBTABS(A(I,J)) - CBTABS(VALS(IAMN)) ) $ .GT. 3*EPS IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMN) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMN ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMN) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMN-1, MYROW, MYCOL, $ NPCOL, RAMN, CAMN ) IF( RAMN .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMN END IF IF( CAMN .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMN END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of CCHKAMN * END * * SUBROUTINE ZAMNTEST( OUTNUM, VERB, TOPSREPEAT, TOPSCOHRNT, NSCOPE, $ SCOPE0, NTOP, TOP0, NMAT, M0, N0, LDAS0, $ LDAD0, LDI0, NDEST, RDEST0, CDEST0, NGRID, $ CONTEXT0, P0, Q0, ISEED, RMEM, CMEM, RCLEN, $ MEM, MEMLEN ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER MEMLEN, NDEST, NGRID, NMAT, NSCOPE, NTOP, OUTNUM, RCLEN, $ TOPSCOHRNT, TOPSREPEAT, VERB * .. * .. Array Arguments .. CHARACTER*1 SCOPE0(NSCOPE), TOP0(NTOP) INTEGER M0(NMAT), N0(NMAT), LDAS0(NMAT), LDAD0(NMAT), LDI0(NMAT) INTEGER RDEST0(NDEST), CDEST0(NDEST), CONTEXT0(NGRID) INTEGER P0(NGRID), Q0(NGRID), ISEED(*), RMEM(RCLEN), CMEM(RCLEN) DOUBLE COMPLEX MEM(MEMLEN) * .. * * Purpose * ======= * ZTESTAMN: Test double complex AMN COMBINE * * Arguments * ========= * OUTNUM (input) INTEGER * The device number to write output to. * * VERB (input) INTEGER * The level of verbosity (how much printing to do). * * NSCOPE (input) INTEGER * The number of scopes to be tested. * * SCOPE0 (input) CHARACTER*1 array of dimension (NSCOPE) * Values of the scopes to be tested. * * NTOP (input) INTEGER * The number of topologies to be tested. * * TOP0 (input) CHARACTER*1 array of dimension (NTOP) * Values of the topologies to be tested. * * NMAT (input) INTEGER * The number of matrices to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * M0 (input) INTEGER array of dimension (NMAT) * Values of M to be tested. * * N0 (input) INTEGER array of dimension (NMAT) * Values of N to be tested. * * LDAS0 (input) INTEGER array of dimension (NMAT) * Values of LDAS (leading dimension of A on source process) * to be tested. * * LDAD0 (input) INTEGER array of dimension (NMAT) * Values of LDAD (leading dimension of A on destination * process) to be tested. * LDI0 (input) INTEGER array of dimension (NMAT) * Values of LDI (leading dimension of RA/CA) to be tested. * If LDI == -1, these RA/CA should not be accessed. * * NDEST (input) INTEGER * The number of destinations to be tested. * * RDEST0 (input) INTEGER array of dimension (NNDEST) * Values of RDEST (row coordinate of destination) to be * tested. * * CDEST0 (input) INTEGER array of dimension (NNDEST) * Values of CDEST (column coordinate of destination) to be * tested. * * NGRID (input) INTEGER * The number of process grids to be tested. * * CONTEXT0 (input) INTEGER array of dimension (NGRID) * The BLACS context handles corresponding to the grids. * * P0 (input) INTEGER array of dimension (NGRID) * Values of P (number of process rows, NPROW). * * Q0 (input) INTEGER array of dimension (NGRID) * Values of Q (number of process columns, NPCOL). * * ISEED (workspace) INTEGER array of dimension ( MAX(NPROCS, NTESTS) ) * Workspace used to hold each process's random number SEED. * This requires NPROCS (number of processor) elements. * If VERB < 2, this workspace also serves to indicate which * tests fail. This requires workspace of NTESTS * (number of tests performed). * * RMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all RA arrays, and their pre and post padding. * * CMEM (workspace) INTEGER array of dimension (RCLEN) * Used for all CA arrays, and their pre and post padding. * * RCLEN (input) INTEGER * The length, in elements, of RMEM and CMEM. * * MEM (workspace) DOUBLE COMPLEX array of dimension (MEMLEN) * Used for all other workspaces, including the matrix A, * and its pre and post padding. * * MEMLEN (input) INTEGER * The length, in elements, of MEM. * * ===================================================================== * * .. External Functions .. LOGICAL ALLPASS, LSAME INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL ALLPASS, LSAME, IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, ZGAMN2D EXTERNAL ZINITMAT, ZCHKPAD, ZBTCHECKIN * .. * .. Local Scalars .. CHARACTER*1 SCOPE, TOP LOGICAL INGRID, TESTOK, ALLRCV INTEGER APTR, CAPTR, CDEST, CDEST2, CONTEXT, ERRDPTR, ERRIPTR, I, $ IAM, ICHECKVAL, IDE, IGR, IMA, IPAD, IPOST, IPRE, ISC, $ ISIZE, ISTART, ISTOP, ITC, ITC1, ITC2, ITO, ITR, ITR1, $ ITR2, J, K, LDA, LDADST, LDASRC, LDI, M, MAXERR, MYCOL, $ MYROW, N, NERR, NFAIL, NPCOL, NPROW, NSKIP, PREAPTR, $ RAPTR, RDEST, RDEST2, SETWHAT, TESTNUM, VALPTR, ZSIZE DOUBLE COMPLEX CHECKVAL * .. * .. Executable Statements .. * * Choose padding value, and make it unique * CHECKVAL = DCMPLX( -9.11D0, -9.21D0 ) IAM = IBTMYPROC() CHECKVAL = IAM * CHECKVAL ISIZE = IBTSIZEOF('I') ZSIZE = IBTSIZEOF('Z') ICHECKVAL = -IAM * * Verify file parameters * IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, *) ' ' WRITE(OUTNUM, 1000 ) IF( VERB .GT. 0 ) THEN WRITE(OUTNUM,*) ' ' WRITE(OUTNUM, 2000) 'NSCOPE:', NSCOPE WRITE(OUTNUM, 3000) ' SCOPE:', ( SCOPE0(I), I = 1, NSCOPE ) WRITE(OUTNUM, 2000) 'TReps :', TOPSREPEAT WRITE(OUTNUM, 2000) 'TCohr :', TOPSCOHRNT WRITE(OUTNUM, 2000) 'NTOP :', NTOP WRITE(OUTNUM, 3000) ' TOP :', ( TOP0(I), I = 1, NTOP ) WRITE(OUTNUM, 2000) 'NMAT :', NMAT WRITE(OUTNUM, 2000) ' M :', ( M0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' N :', ( N0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAS :', ( LDAS0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDAD :', ( LDAD0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) ' LDI :', ( LDI0(I), I = 1, NMAT ) WRITE(OUTNUM, 2000) 'NDEST :', NDEST WRITE(OUTNUM, 2000) ' RDEST:',( RDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) ' CDEST:',( CDEST0(I), I = 1, NDEST ) WRITE(OUTNUM, 2000) 'NGRIDS:', NGRID WRITE(OUTNUM, 2000) ' P :', ( P0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) ' Q :', ( Q0(I), I = 1, NGRID ) WRITE(OUTNUM, 2000) 'VERB :', VERB WRITE(OUTNUM,*) ' ' END IF IF( VERB .GT. 1 ) THEN WRITE(OUTNUM,4000) WRITE(OUTNUM,5000) END IF END IF IF (TOPSREPEAT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSREPEAT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF * * Find biggest matrix, so we know where to stick error info * I = 0 DO 10 IMA = 1, NMAT IPAD = 4 * M0(IMA) K = N0(IMA) * MAX0( LDAS0(IMA), LDAD0(IMA) ) + IPAD IF( K .GT. I ) I = K 10 CONTINUE I = I + IBTNPROCS() MAXERR = ( ZSIZE * (MEMLEN-I) ) / ( ZSIZE*2 + ISIZE*6 ) IF( MAXERR .LT. 1 ) THEN WRITE(OUTNUM,*) 'ERROR: Not enough memory to run MIN tests.' CALL BLACS_ABORT(-1, 1) END IF ERRDPTR = I + 1 ERRIPTR = ERRDPTR + MAXERR NERR = 0 TESTNUM = 0 NFAIL = 0 NSKIP = 0 * * Loop over grids of matrix * DO 90 IGR = 1, NGRID * * allocate process grid for the next batch of tests * CONTEXT = CONTEXT0(IGR) CALL BLACS_GRIDINFO( CONTEXT, NPROW, NPCOL, MYROW, MYCOL ) INGRID = ( (MYROW.LT.NPROW) .AND. (MYCOL.LT.NPCOL) ) * DO 80 ISC = 1, NSCOPE SCOPE = SCOPE0(ISC) DO 70 ITO = 1, NTOP TOP = TOP0(ITO) * * If testing multiring ('M') or general tree ('T'), need to * loop over calls to BLACS_SET to do full test * IF( LSAME(TOP, 'M') ) THEN SETWHAT = 13 IF( SCOPE .EQ. 'R' ) THEN ISTART = -(NPCOL - 1) ISTOP = -ISTART ELSE IF (SCOPE .EQ. 'C') THEN ISTART = -(NPROW - 1) ISTOP = -ISTART ELSE ISTART = -(NPROW*NPCOL - 1) ISTOP = -ISTART ENDIF ELSE IF( LSAME(TOP, 'T') ) THEN SETWHAT = 14 ISTART = 1 IF( SCOPE .EQ. 'R' ) THEN ISTOP = NPCOL - 1 ELSE IF (SCOPE .EQ. 'C') THEN ISTOP = NPROW - 1 ELSE ISTOP = NPROW*NPCOL - 1 ENDIF ELSE SETWHAT = 0 ISTART = 1 ISTOP = 1 ENDIF DO 60 IMA = 1, NMAT M = M0(IMA) N = N0(IMA) LDASRC = LDAS0(IMA) LDADST = LDAD0(IMA) LDI = LDI0(IMA) IPRE = 2 * M IPOST = IPRE PREAPTR = 1 APTR = PREAPTR + IPRE * DO 50 IDE = 1, NDEST TESTNUM = TESTNUM + 1 RDEST2 = RDEST0(IDE) CDEST2 = CDEST0(IDE) * * If everyone gets the answer, create some bogus rdest/cdest * so IF's are easier * ALLRCV = ( (RDEST2.EQ.-1) .OR. (CDEST2.EQ.-1) ) IF( ALLRCV ) THEN RDEST = NPROW - 1 CDEST = NPCOL - 1 IF (TOPSCOHRNT.EQ.0) THEN ITR1 = 0 ITR2 = 0 ELSE IF (TOPSCOHRNT.EQ.1) THEN ITR1 = 1 ITR2 = 1 ELSE ITR1 = 0 ITR2 = 1 END IF ELSE RDEST = RDEST2 CDEST = CDEST2 ITC1 = 0 ITC2 = 0 END IF IF( RDEST.GE.P0(IGR) .OR. CDEST.GE.Q0(IGR) ) THEN NSKIP = NSKIP + 1 GOTO 50 END IF * IF( MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST ) THEN LDA = LDADST ELSE LDA = LDASRC END IF VALPTR = APTR + IPOST + N * LDA IF( VERB .GT. 1 ) THEN IF( IAM .EQ. 0 ) THEN WRITE(OUTNUM, 6000) $ TESTNUM, 'RUNNING', SCOPE, TOP, M, N, $ LDASRC, LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * If I am in scope * TESTOK = .TRUE. IF( INGRID ) THEN IF( (MYROW.EQ.RDEST .AND. SCOPE.EQ.'R') .OR. $ (MYCOL.EQ.CDEST .AND. SCOPE.EQ.'C') .OR. $ (SCOPE .EQ. 'A') ) THEN * K = NERR DO 40 ITR = ITR1, ITR2 CALL BLACS_SET(CONTEXT, 15, ITR) DO 35 ITC = ITC1, ITC2 CALL BLACS_SET(CONTEXT, 16, ITC) DO 30 J = ISTART, ISTOP IF( J.EQ.0) GOTO 30 IF( SETWHAT.NE.0 ) $ CALL BLACS_SET(CONTEXT, SETWHAT, J) * * * generate and pad matrix A * CALL ZINITMAT('G','-', M, N, MEM(PREAPTR), $ LDA, IPRE, IPOST, $ CHECKVAL, TESTNUM, $ MYROW, MYCOL ) * * If they exist, pad RA and CA arrays * IF( LDI .NE. -1 ) THEN DO 15 I = 1, N*LDI + IPRE + IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 15 CONTINUE RAPTR = 1 + IPRE CAPTR = 1 + IPRE ELSE DO 20 I = 1, IPRE+IPOST RMEM(I) = ICHECKVAL CMEM(I) = ICHECKVAL 20 CONTINUE RAPTR = 1 CAPTR = 1 END IF * CALL ZGAMN2D(CONTEXT, SCOPE, TOP, M, N, $ MEM(APTR), LDA, RMEM(RAPTR), $ CMEM(CAPTR), LDI, $ RDEST2, CDEST2) * * If I've got the answer, check for errors in * matrix or padding * IF( (MYROW.EQ.RDEST .AND. MYCOL.EQ.CDEST) $ .OR. ALLRCV ) THEN CALL ZCHKPAD('G','-', M, N, $ MEM(PREAPTR), LDA, RDEST, $ CDEST, MYROW, MYCOL, $ IPRE, IPOST, CHECKVAL, $ TESTNUM, MAXERR, NERR, $ MEM(ERRIPTR),MEM(ERRDPTR)) CALL ZCHKAMN(SCOPE, CONTEXT, M, N, $ MEM(APTR), LDA, $ RMEM(RAPTR), CMEM(CAPTR), $ LDI, TESTNUM, MAXERR,NERR, $ MEM(ERRIPTR),MEM(ERRDPTR), $ ISEED, MEM(VALPTR)) CALL ZRCCHK(IPRE, IPOST, ICHECKVAL, $ M, N, RMEM, CMEM, LDI, $ MYROW, MYCOL, TESTNUM, $ MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR)) END IF 30 CONTINUE CALL BLACS_SET(CONTEXT, 16, 0) 35 CONTINUE CALL BLACS_SET(CONTEXT, 15, 0) 40 CONTINUE TESTOK = ( K .EQ. NERR ) END IF END IF * IF( VERB .GT. 1 ) THEN I = NERR CALL ZBTCHECKIN(0, OUTNUM, MAXERR, NERR, $ MEM(ERRIPTR), MEM(ERRDPTR), ISEED) IF( IAM .EQ. 0 ) THEN IF( TESTOK .AND. NERR.EQ.I ) THEN WRITE(OUTNUM,6000)TESTNUM,'PASSED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL ELSE NFAIL = NFAIL + 1 WRITE(OUTNUM,6000)TESTNUM,'FAILED ', $ SCOPE, TOP, M, N, LDASRC, $ LDADST, LDI, RDEST2, CDEST2, $ NPROW, NPCOL END IF END IF * * Once we've printed out errors, can re-use buf space * NERR = 0 END IF 50 CONTINUE 60 CONTINUE 70 CONTINUE 80 CONTINUE 90 CONTINUE * IF( VERB .LT. 2 ) THEN NFAIL = TESTNUM CALL ZBTCHECKIN( NFAIL, OUTNUM, MAXERR, NERR, MEM(ERRIPTR), $ MEM(ERRDPTR), ISEED ) END IF IF( IAM .EQ. 0 ) THEN IF( VERB .GT. 1 ) WRITE(OUTNUM,*) ' ' IF( NFAIL+NSKIP .EQ. 0 ) THEN WRITE(OUTNUM, 7000 ) TESTNUM ELSE WRITE(OUTNUM, 8000 ) TESTNUM, TESTNUM-NSKIP-NFAIL, $ NSKIP, NFAIL END IF END IF * * Log whether their were any failures * TESTOK = ALLPASS( (NFAIL.EQ.0) ) * 1000 FORMAT('DOUBLE COMPLEX AMN TESTS: BEGIN.' ) 2000 FORMAT(1X,A7,3X,10I6) 3000 FORMAT(1X,A7,3X,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1,5X,A1, $ 5X,A1,5X,A1) 4000 FORMAT(' TEST# STATUS SCOPE TOP M N LDAS LDAD LDI ', $ 'RDEST CDEST P Q') 5000 FORMAT(' ----- ------- ----- --- ----- ----- ----- ----- ----- ', $ '----- ----- ---- ----') 6000 FORMAT(I6,1X,A7,5X,A1,3X,A1,7I6,2I5) 7000 FORMAT('DOUBLE COMPLEX AMN TESTS: PASSED ALL', $ I5, ' TESTS.') 8000 FORMAT('DOUBLE COMPLEX AMN TESTS:',I5,' TESTS;',I5,' PASSED,', $ I5,' SKIPPED,',I5,' FAILED.') * RETURN * * End of ZTESTAMN. * END * SUBROUTINE ZCHKAMN( SCOPE, ICTXT, M, N, A, LDA, RA, CA, LDI, $ TESTNUM, MAXERR, NERR, ERRIBUF, ERRDBUF, $ ISEED, VALS ) * * .. Scalar Arguments .. CHARACTER*1 SCOPE INTEGER ICTXT, M, N, LDA, LDI, TESTNUM, MAXERR, NERR * .. * .. Array Arguments .. INTEGER RA(*), CA(*), ERRIBUF(6, MAXERR), ISEED(*) DOUBLE COMPLEX A(LDA,*), ERRDBUF(2, MAXERR), VALS(*) * .. * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSPNUM DOUBLE PRECISION DBTEPS, ZBTABS DOUBLE COMPLEX ZBTRAN EXTERNAL IBTMYPROC, IBTNPROCS, IBTSPNUM, ZBTRAN, DBTEPS, ZBTABS * .. * .. External Subroutines .. EXTERNAL IBTSPCOORD * .. * .. Local Scalars .. LOGICAL ERROR INTEGER NPROCS, NNODES, NPROW, NPCOL, MYROW, MYCOL, RAMN, CAMN INTEGER IAMN, I, J, K, H, DEST, NODE DOUBLE PRECISION EPS * .. * .. Executable Statements .. * NPROCS = IBTNPROCS() EPS = DBTEPS() CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) DEST = MYROW*NPROCS + MYCOL * * Set up seeds to match those used by each proc's genmat call * IF( SCOPE .EQ. 'R' ) THEN NNODES = NPCOL DO 10 I = 0, NNODES-1 NODE = MYROW * NPROCS + I ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 10 CONTINUE ELSE IF( SCOPE .EQ. 'C' ) THEN NNODES = NPROW DO 20 I = 0, NNODES-1 NODE = I * NPROCS + MYCOL ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 20 CONTINUE ELSE NNODES = NPROW * NPCOL DO 30 I = 0, NNODES-1 NODE = (I / NPCOL) * NPROCS + MOD(I, NPCOL) ISEED(I*4+1) = MOD( 1002 + TESTNUM*5 + NODE*3, 4096 ) ISEED(I*4+2) = MOD( 2027 + TESTNUM*7 + NODE, 4096 ) ISEED(I*4+3) = MOD( 1234 + TESTNUM + NODE*3, 4096 ) ISEED(I*4+4) = MOD( 4311 + TESTNUM*10 + NODE*2, 4096 ) 30 CONTINUE END IF * DO 100 J = 1, N DO 90 I = 1, M H = (J-1)*LDI + I VALS(1) = ZBTRAN( ISEED ) IAMN = 1 IF( NNODES .GT. 1 ) THEN DO 40 K = 1, NNODES-1 VALS(K+1) = ZBTRAN( ISEED(K*4+1) ) IF( ZBTABS( VALS(K+1) ) .LT. ZBTABS( VALS(IAMN) ) ) $ IAMN = K + 1 40 CONTINUE END IF * * If BLACS have not returned same value we've chosen * IF( A(I,J) .NE. VALS(IAMN) ) THEN * * If we have RA and CA arrays * IF( LDI .NE. -1 ) THEN * * Any number having the same absolute value is a valid max * K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.GT.0 .AND. K.LE.NNODES ) THEN ERROR = ABS( ZBTABS(VALS(K)) - ZBTABS(VALS(IAMN)) ) $ .GT. 3*EPS IF( .NOT.ERROR ) IAMN = K ELSE ERROR = .TRUE. END IF ELSE * * Error if BLACS answer not same absolute value, or if it * was not really in the numbers being compared * ERROR = ABS( ZBTABS(A(I,J)) - ZBTABS(VALS(IAMN)) ) $ .GT. 3*EPS IF( .NOT.ERROR ) THEN DO 50 K = 1, NNODES IF( VALS(K) .EQ. A(I,J) ) GOTO 60 50 CONTINUE ERROR = .TRUE. 60 CONTINUE ENDIF END IF * * If the value is in error * IF( ERROR ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = 5 ERRDBUF(1, NERR) = A(I,J) ERRDBUF(2, NERR) = VALS(IAMN) END IF END IF * * If they are defined, make sure coordinate entries are OK * IF( LDI .NE. -1 ) THEN K = IBTSPNUM( SCOPE, RA(H), CA(H), NPCOL ) + 1 IF( K.NE.IAMN ) THEN * * Make sure more than one proc doesn't have exact same value * (and therefore there may be more than one valid coordinate * for a single value) * IF( K.GT.NNODES .OR. K.LT.1 ) THEN ERROR = .TRUE. ELSE ERROR = ( VALS(K) .NE. VALS(IAMN) ) END IF IF( ERROR ) THEN CALL IBTSPCOORD( SCOPE, IAMN-1, MYROW, MYCOL, $ NPCOL, RAMN, CAMN ) IF( RAMN .NE. RA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -5 ERRDBUF(1, NERR) = RA(H) ERRDBUF(2, NERR) = RAMN END IF IF( CAMN .NE. CA(H) ) THEN NERR = NERR + 1 ERRIBUF(1, NERR) = TESTNUM ERRIBUF(2, NERR) = NNODES ERRIBUF(3, NERR) = DEST ERRIBUF(4, NERR) = I ERRIBUF(5, NERR) = J ERRIBUF(6, NERR) = -15 ERRDBUF(1, NERR) = CA(H) ERRDBUF(2, NERR) = CAMN END IF END IF END IF END IF 90 CONTINUE 100 CONTINUE * RETURN * * End of ZCHKAMN * END * scalapack-2.0.2/BLACS/TESTING/bsbr.dat000644 000766 000024 00000001166 11640652114 017300 0ustar00juliestaff000000 000000 3 Number of scopes 'R' 'C' 'A' values for scopes 8 Number of topologies 'I' 'S' '1' 'd' 'm' ' ' 'T' 'H' TOP 5 Number of shapes 'G' 'U' 'U' 'L' 'L' UPLO 'E' 'U' 'N' 'U' 'N' DIAG 5 Number of matrices 2 1 25 13 0 M 2 7 19 32 0 N 3 3 25 14 1 LDASRC 2 2 25 22 1 LDADEST 4 Number of src/dest pairs 0 1 3 2 RSRC 0 0 1 1 CSRC 4 Number of grids 2 4 1 1 7 1 4 NPROW 2 1 3 4 1 8 2 NPCOL scalapack-2.0.2/BLACS/TESTING/bt.dat000644 000766 000024 00000000706 11640652114 016754 0ustar00juliestaff000000 000000 'Sample BLACS tester run' Comment line 6 device out 'blacstest.out' output fname 'T' Run SDRV? 'T' Run BSBR? 'T' Run COMB? 'T' Run AUX? 5 Number of precisions 'I' 'S' 'D' 'C' 'Z' Values for precision 0 Verbosity level scalapack-2.0.2/BLACS/TESTING/btprim.f000644 000766 000024 00000023264 11640652114 017325 0ustar00juliestaff000000 000000 SUBROUTINE BTSETUP( MEM, MEMLEN, CMEM, CMEMLEN, OUTNUM, $ TESTSDRV, TESTBSBR, TESTCOMB, TESTAUX, $ IAM, NNODES ) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * .. Scalar Arguments .. LOGICAL TESTSDRV, TESTBSBR, TESTCOMB, TESTAUX INTEGER MEMLEN, CMEMLEN, OUTNUM, IAM, NNODES * .. * .. Array Arguments .. INTEGER MEM(MEMLEN) CHARACTER*1 CMEM(CMEMLEN) * .. * * Purpose * ======= * BTSETUP: Sets up communicator and initiliazes MPI if needed. * * ==================================================================== * * .. * .. Local Scalars LOGICAL INIT * .. * .. Include Files .. INCLUDE 'mpif.h' * .. * .. Common Blocks .. COMMON /BTMPI/ BTCOMM, IERR INTEGER BTCOMM, IERR * .. * .. Executable Statements .. * IERR = 0 CALL MPI_INITIALIZED(INIT, IERR) IF (.NOT.INIT) CALL MPI_INIT(IERR) IF (IERR.NE.0) CALL BTMPIERR("mpi_init", IERR) CALL MPI_COMM_DUP(MPI_COMM_WORLD, BTCOMM, IERR) IF (IERR.NE.0) CALL BTMPIERR("MPI_COMM_DUP", IERR) * RETURN END INTEGER FUNCTION IBTMYPROC() * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * Purpose * ======= * IBTMYPROC: returns a process number between 0 .. NPROCS-1. On * systems not natively in this numbering scheme, translates to it. * * ==================================================================== * .. * .. Include Files .. INCLUDE 'mpif.h' * .. * .. Local Scalars .. INTEGER RANK * .. * .. Common Blocks .. COMMON /BTMPI/ BTCOMM, IERR INTEGER BTCOMM, IERR * .. * .. Executable Statements .. * CALL MPI_COMM_RANK(BTCOMM, RANK, IERR) IF (IERR.NE.0) CALL BTMPIERR("MPI_COMM_RANK", IERR) IBTMYPROC = RANK RETURN * * End of IBTMYPROC * END * INTEGER FUNCTION IBTNPROCS() * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * Purpose * ======= * IBTNPROCS: returns the number of processes in the machine. * * ==================================================================== * .. * .. Include Files .. INCLUDE 'mpif.h' * .. * .. Local Scalars .. INTEGER NPROC * .. * .. Common Blocks .. COMMON /BTMPI/ BTCOMM, IERR INTEGER BTCOMM, IERR * .. * .. Executable Statements .. * CALL MPI_COMM_SIZE(BTCOMM, NPROC, IERR) IF (IERR.NE.0) CALL BTMPIERR("MPI_COMM_SIZE", IERR) IBTNPROCS = NPROC * RETURN * * End of IBTNPROCS * END * SUBROUTINE BTSEND(DTYPE, N, BUFF, DEST, MSGID) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * .. Scalar Arguments .. INTEGER N, DTYPE, DEST, MSGID * .. * .. Array Arguments .. REAL BUFF(*) * .. * * PURPOSE * ======= * BTSEND: Communication primitive used to send messages independent * of the BLACS. May safely be either locally or globally blocking. * * Arguments * ========= * DTYPE (input) INTEGER * Indicates what data type BUFF is (same as PVM): * 1 = RAW BYTES * 3 = INTEGER * 4 = SINGLE PRECISION REAL * 6 = DOUBLE PRECISION REAL * 5 = SINGLE PRECISION COMPLEX * 7 = DOUBLE PRECISION COMPLEX * * N (input) INTEGER * The number of elements of type DTYPE in BUFF. * * BUFF (input) accepted as INTEGER array * The array to be communicated. Its true data type is * indicated by DTYPE. * * DEST (input) INTEGER * The destination of the message. * * MSGID (input) INTEGER * The message ID (AKA message tag or type). * * ===================================================================== * .. External Functions .. INTEGER IBTMYPROC, IBTNPROCS, IBTSIZEOF EXTERNAL IBTMYPROC, IBTNPROCS, IBTSIZEOF * .. * .. Local Scalars .. INTEGER I, IAM, MPIDTYPE * .. * .. Include Files .. INCLUDE 'mpif.h' * .. * .. Common Blocks .. COMMON /BTMPI/ BTCOMM, IERR INTEGER BTCOMM, IERR * IF( DTYPE .EQ. 1 ) THEN MPIDTYPE = MPI_BYTE ELSE IF( DTYPE .EQ. 3 ) THEN MPIDTYPE = MPI_INTEGER ELSE IF( DTYPE .EQ. 4 ) THEN MPIDTYPE = MPI_REAL ELSE IF( DTYPE .EQ. 5 ) THEN MPIDTYPE = MPI_COMPLEX ELSE IF( DTYPE .EQ. 6 ) THEN MPIDTYPE = MPI_DOUBLE_PRECISION ELSE IF( DTYPE .EQ. 7 ) THEN MPIDTYPE = MPI_DOUBLE_COMPLEX END IF * * Send the message * IF( DEST .EQ. -1 ) THEN IAM = IBTMYPROC() DO 10 I = 0, IBTNPROCS()-1 IF( I .NE. IAM ) THEN CALL MPI_SEND(BUFF, N, MPIDTYPE, I, 0, BTCOMM, IERR) IF (IERR.NE.0) CALL BTMPIERR("MPI_SEND", IERR) END IF 10 CONTINUE ELSE CALL MPI_SEND(BUFF, N, MPIDTYPE, DEST, 0, BTCOMM, IERR) IF (IERR.NE.0) CALL BTMPIERR("MPI_SEND", IERR) END IF * RETURN * * End BTSEND * END * SUBROUTINE BTRECV(DTYPE, N, BUFF, SRC, MSGID) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * * .. Scalar Arguments .. INTEGER N, DTYPE, SRC, MSGID * .. * .. Array Arguments .. REAL BUFF(*) * .. * * PURPOSE * ======= * BTRECV: Globally blocking receive. * * Arguments * ========= * DTYPE (input) INTEGER * Indicates what data type BUFF is: * 1 = RAW BYTES * 3 = INTEGER * 4 = SINGLE PRECISION REAL * 6 = DOUBLE PRECISION REAL * 5 = SINGLE PRECISION COMPLEX * 7 = DOUBLE PRECISION COMPLEX * * N (input) INTEGER * The number of elements of type DTYPE in BUFF. * * BUFF (output) INTEGER * The buffer to receive into. * * SRC (input) INTEGER * The source of the message. * * MSGID (input) INTEGER * The message ID. * * ===================================================================== * .. * .. Local Scalars .. INTEGER MPIDTYPE * .. * .. Include Files .. INCLUDE 'mpif.h' * .. * .. Local Arrays .. INTEGER STAT(MPI_STATUS_SIZE) * .. * .. Common Blocks .. COMMON /BTMPI/ BTCOMM, IERR INTEGER BTCOMM, IERR * IF( DTYPE .EQ. 1 ) THEN MPIDTYPE = MPI_BYTE ELSE IF( DTYPE .EQ. 3 ) THEN MPIDTYPE = MPI_INTEGER ELSE IF( DTYPE .EQ. 4 ) THEN MPIDTYPE = MPI_REAL ELSE IF( DTYPE .EQ. 5 ) THEN MPIDTYPE = MPI_COMPLEX ELSE IF( DTYPE .EQ. 6 ) THEN MPIDTYPE = MPI_DOUBLE_PRECISION ELSE IF( DTYPE .EQ. 7 ) THEN MPIDTYPE = MPI_DOUBLE_COMPLEX END IF * CALL MPI_RECV( BUFF, N, MPIDTYPE, SRC, 0, BTCOMM, STAT, IERR ) IF (IERR.NE.0) CALL BTMPIERR("MPI_RECV", IERR) * RETURN * * End of BTRECV * END * INTEGER FUNCTION IBTSIZEOF(TYPE) * * -- BLACS tester (version 1.0) -- * University of Tennessee * December 15, 1994 * * .. Scalar Arguments .. CHARACTER*1 TYPE * .. * * Purpose * ======= * IBTSIZEOF: Returns the size, in bytes, of the 5 data types. * If your platform has a different size for DOUBLE PRECISION, you must * change the parameter statement in BLACSTEST as well. * * Arguments * ========= * TYPE (input) CHARACTER*1 * The data type who's size is to be determined: * 'I' : INTEGER * 'S' : SINGLE PRECISION REAL * 'D' : DOUBLE PRECISION REAL * 'C' : SINGLE PRECISION COMPLEX * 'Z' : DOUBLE PRECISION COMPLEX * * ===================================================================== * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Include Files .. INCLUDE 'mpif.h' * .. * .. Common Blocks .. COMMON /BTMPI/ BTCOMM, IERR INTEGER BTCOMM, IERR * .. * .. Local Scalars .. INTEGER LENGTH LOGICAL INIT DATA INIT /.FALSE./ * .. * .. Executable Statements .. * * * Initialize MPI, if necessary * IF (.NOT.INIT) THEN CALL MPI_INITIALIZED(INIT, IERR) IF (.NOT.INIT) CALL MPI_INIT(IERR) IF (IERR.NE.0) CALL BTMPIERR("mpi_init", IERR) INIT = .TRUE. END IF * IF( LSAME(TYPE, 'I') ) THEN CALL MPI_TYPE_SIZE( MPI_INTEGER, LENGTH, IERR ) IF (IERR.NE.0) CALL BTMPIERR("MPI_TYPE_SIZE", IERR) ELSE IF( LSAME(TYPE, 'S') ) THEN CALL MPI_TYPE_SIZE( MPI_REAL, LENGTH, IERR ) IF (IERR.NE.0) CALL BTMPIERR("MPI_TYPE_SIZE", IERR) ELSE IF( LSAME(TYPE, 'D') ) THEN CALL MPI_TYPE_SIZE( MPI_DOUBLE_PRECISION, LENGTH, IERR ) IF (IERR.NE.0) CALL BTMPIERR("MPI_TYPE_SIZE", IERR) ELSE IF( LSAME(TYPE, 'C') ) THEN CALL MPI_TYPE_SIZE( MPI_COMPLEX, LENGTH, IERR ) IF (IERR.NE.0) CALL BTMPIERR("MPI_TYPE_SIZE", IERR) ELSE IF( LSAME(TYPE, 'Z') ) THEN CALL MPI_TYPE_SIZE( MPI_DOUBLE_COMPLEX, LENGTH, IERR ) IF (IERR.NE.0) CALL BTMPIERR("MPI_TYPE_SIZE", IERR) END IF IBTSIZEOF = LENGTH * RETURN END SUBROUTINE BTMPIERR(ROUT, IERR0) CHARACTER*(*) ROUT INTEGER IERR0 * .. * .. Include Files .. INCLUDE 'mpif.h' * .. * .. Common Blocks .. COMMON /BTMPI/ BTCOMM, IERR INTEGER BTCOMM, IERR * WRITE(*,1000) ROUT, IERR CALL MPI_ABORT(BTCOMM, IERR0, IERR) * 1000 FORMAT('Error #',I20,' from routine ',A) RETURN END scalapack-2.0.2/BLACS/TESTING/Cbt.c000644 000766 000024 00000037121 11640652114 016532 0ustar00juliestaff000000 000000 #ifdef BTCINTFACE #include "Cbt.h" void blacs_gridinit_(ConTxt, order, nprow, npcol) int *ConTxt; char *order; int *nprow; int *npcol; { void Cblacs_gridinit(); Cblacs_gridinit(ConTxt, order, *nprow, *npcol); } void blacs_setup_(mypnum, nprocs) int *mypnum; int *nprocs; { void Cblacs_setup(); Cblacs_setup(mypnum, nprocs); } void blacs_pinfo_(mypnum, nprocs) int *mypnum; int *nprocs; { void Cblacs_pinfo(); Cblacs_pinfo(mypnum, nprocs); } void blacs_gridmap_(ConTxt, usermap, ldup, nprow, npcol) int *ConTxt; int *usermap; int *ldup; int *nprow; int *npcol; { void Cblacs_gridmap(); Cblacs_gridmap(ConTxt, usermap, *ldup, *nprow, *npcol); } void blacs_gridexit_(ConTxt) int *ConTxt; { void Cblacs_gridexit(); Cblacs_gridexit(*ConTxt); } void blacs_abort_(ConTxt, ErrNo) int *ConTxt; int *ErrNo; { void Cblacs_abort(); Cblacs_abort(*ConTxt, *ErrNo); } void blacs_exit_(NotDone) int *NotDone; { void Cblacs_exit(); Cblacs_exit(*NotDone); } void blacs_freebuff_(ConTxt, Wait) int *ConTxt; int *Wait; { void Cblacs_freebuff(); Cblacs_freebuff(*ConTxt, *Wait); } void blacs_gridinfo_(ConTxt, nprow, npcol, myrow, mycol) int *ConTxt; int *nprow; int *npcol; int *myrow; int *mycol; { void Cblacs_gridinfo(); Cblacs_gridinfo(*ConTxt, nprow, npcol, myrow, mycol); } void blacs_barrier_(ConTxt, scope) int *ConTxt; char *scope; { void Cblacs_barrier(); Cblacs_barrier(*ConTxt, scope); } int blacs_pnum_(ConTxt, prow, pcol) int *ConTxt; int *prow; int *pcol; { int Cblacs_pnum(); return( Cblacs_pnum(*ConTxt, *prow, *pcol) ); } void blacs_pcoord_(ConTxt, nodenum, prow, pcol) int *ConTxt; int *nodenum; int *prow; int *pcol; { void Cblacs_pcoord(); Cblacs_pcoord(*ConTxt, *nodenum, prow, pcol); } void blacs_get_(ConTxt, what, I) int *ConTxt; int *what; int *I; { void Cblacs_get(); Cblacs_get(*ConTxt, *what, I); } void blacs_set_(ConTxt, what, I) int *ConTxt; int *what; int *I; { void Cblacs_set(); Cblacs_set(*ConTxt, *what, I); } void igesd2d_(ConTxt, m, n, A, lda, rdest, cdest) int *ConTxt; int *m; int *n; int *A; int *lda; int *rdest; int *cdest; { void Cigesd2d(); Cigesd2d(*ConTxt, *m, *n, A, *lda, *rdest, *cdest); } void igerv2d_(ConTxt, m, n, A, lda, rsrc, csrc) int *ConTxt; int *m; int *n; int *A; int *lda; int *rsrc; int *csrc; { void Cigerv2d(); Cigerv2d(*ConTxt, *m, *n, A, *lda, *rsrc, *csrc); } void igebs2d_(ConTxt, scope, top, m, n, A, lda) int *ConTxt; char *scope; char *top; int *m; int *n; int *A; int *lda; { void Cigebs2d(); Cigebs2d(*ConTxt, scope, top, *m, *n, A, *lda); } void igebr2d_(ConTxt, scope, top, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; int *m; int *n; int *A; int *lda; int *rsrc; int *csrc; { void Cigebr2d(); Cigebr2d(*ConTxt, scope, top, *m, *n, A, *lda, *rsrc, *csrc); } void itrsd2d_(ConTxt, uplo, diag, m, n, A, lda, rdest, cdest) int *ConTxt; char *uplo; char *diag; int *m; int *n; int *A; int *lda; int *rdest; int *cdest; { void Citrsd2d(); Citrsd2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rdest, *cdest); } void itrrv2d_(ConTxt, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *uplo; char *diag; int *m; int *n; int *A; int *lda; int *rsrc; int *csrc; { void Citrrv2d(); Citrrv2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void itrbs2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; int *A; int *lda; { void Citrbs2d(); Citrbs2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda); } void itrbr2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; int *A; int *lda; int *rsrc; int *csrc; { void Citrbr2d(); Citrbr2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void igsum2d_(ConTxt, scope, top, m, n, A, lda, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; int *A; int *lda; int *rdest; int *cdest; { void Cigsum2d(); Cigsum2d(*ConTxt, scope, top, *m, *n, A, *lda, *rdest, *cdest); } void igamx2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; int *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Cigamx2d(); Cigamx2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } void igamn2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; int *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Cigamn2d(); Cigamn2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } void dgesd2d_(ConTxt, m, n, A, lda, rdest, cdest) int *ConTxt; int *m; int *n; double *A; int *lda; int *rdest; int *cdest; { void Cdgesd2d(); Cdgesd2d(*ConTxt, *m, *n, A, *lda, *rdest, *cdest); } void dgerv2d_(ConTxt, m, n, A, lda, rsrc, csrc) int *ConTxt; int *m; int *n; double *A; int *lda; int *rsrc; int *csrc; { void Cdgerv2d(); Cdgerv2d(*ConTxt, *m, *n, A, *lda, *rsrc, *csrc); } void dgebs2d_(ConTxt, scope, top, m, n, A, lda) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; { void Cdgebs2d(); Cdgebs2d(*ConTxt, scope, top, *m, *n, A, *lda); } void dgebr2d_(ConTxt, scope, top, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; int *rsrc; int *csrc; { void Cdgebr2d(); Cdgebr2d(*ConTxt, scope, top, *m, *n, A, *lda, *rsrc, *csrc); } void dtrsd2d_(ConTxt, uplo, diag, m, n, A, lda, rdest, cdest) int *ConTxt; char *uplo; char *diag; int *m; int *n; double *A; int *lda; int *rdest; int *cdest; { void Cdtrsd2d(); Cdtrsd2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rdest, *cdest); } void dtrrv2d_(ConTxt, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *uplo; char *diag; int *m; int *n; double *A; int *lda; int *rsrc; int *csrc; { void Cdtrrv2d(); Cdtrrv2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void dtrbs2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; double *A; int *lda; { void Cdtrbs2d(); Cdtrbs2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda); } void dtrbr2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; double *A; int *lda; int *rsrc; int *csrc; { void Cdtrbr2d(); Cdtrbr2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void dgsum2d_(ConTxt, scope, top, m, n, A, lda, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; int *rdest; int *cdest; { void Cdgsum2d(); Cdgsum2d(*ConTxt, scope, top, *m, *n, A, *lda, *rdest, *cdest); } void dgamx2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Cdgamx2d(); Cdgamx2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } void dgamn2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Cdgamn2d(); Cdgamn2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } void sgesd2d_(ConTxt, m, n, A, lda, rdest, cdest) int *ConTxt; int *m; int *n; float *A; int *lda; int *rdest; int *cdest; { void Csgesd2d(); Csgesd2d(*ConTxt, *m, *n, A, *lda, *rdest, *cdest); } void sgerv2d_(ConTxt, m, n, A, lda, rsrc, csrc) int *ConTxt; int *m; int *n; float *A; int *lda; int *rsrc; int *csrc; { void Csgerv2d(); Csgerv2d(*ConTxt, *m, *n, A, *lda, *rsrc, *csrc); } void sgebs2d_(ConTxt, scope, top, m, n, A, lda) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; { void Csgebs2d(); Csgebs2d(*ConTxt, scope, top, *m, *n, A, *lda); } void sgebr2d_(ConTxt, scope, top, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; int *rsrc; int *csrc; { void Csgebr2d(); Csgebr2d(*ConTxt, scope, top, *m, *n, A, *lda, *rsrc, *csrc); } void strsd2d_(ConTxt, uplo, diag, m, n, A, lda, rdest, cdest) int *ConTxt; char *uplo; char *diag; int *m; int *n; float *A; int *lda; int *rdest; int *cdest; { void Cstrsd2d(); Cstrsd2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rdest, *cdest); } void strrv2d_(ConTxt, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *uplo; char *diag; int *m; int *n; float *A; int *lda; int *rsrc; int *csrc; { void Cstrrv2d(); Cstrrv2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void strbs2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; float *A; int *lda; { void Cstrbs2d(); Cstrbs2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda); } void strbr2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; float *A; int *lda; int *rsrc; int *csrc; { void Cstrbr2d(); Cstrbr2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void sgsum2d_(ConTxt, scope, top, m, n, A, lda, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; int *rdest; int *cdest; { void Csgsum2d(); Csgsum2d(*ConTxt, scope, top, *m, *n, A, *lda, *rdest, *cdest); } void sgamx2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Csgamx2d(); Csgamx2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } void sgamn2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Csgamn2d(); Csgamn2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } void cgesd2d_(ConTxt, m, n, A, lda, rdest, cdest) int *ConTxt; int *m; int *n; float *A; int *lda; int *rdest; int *cdest; { void Ccgesd2d(); Ccgesd2d(*ConTxt, *m, *n, A, *lda, *rdest, *cdest); } void cgerv2d_(ConTxt, m, n, A, lda, rsrc, csrc) int *ConTxt; int *m; int *n; float *A; int *lda; int *rsrc; int *csrc; { void Ccgerv2d(); Ccgerv2d(*ConTxt, *m, *n, A, *lda, *rsrc, *csrc); } void cgebs2d_(ConTxt, scope, top, m, n, A, lda) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; { void Ccgebs2d(); Ccgebs2d(*ConTxt, scope, top, *m, *n, A, *lda); } void cgebr2d_(ConTxt, scope, top, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; int *rsrc; int *csrc; { void Ccgebr2d(); Ccgebr2d(*ConTxt, scope, top, *m, *n, A, *lda, *rsrc, *csrc); } void ctrsd2d_(ConTxt, uplo, diag, m, n, A, lda, rdest, cdest) int *ConTxt; char *uplo; char *diag; int *m; int *n; float *A; int *lda; int *rdest; int *cdest; { void Cctrsd2d(); Cctrsd2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rdest, *cdest); } void ctrrv2d_(ConTxt, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *uplo; char *diag; int *m; int *n; float *A; int *lda; int *rsrc; int *csrc; { void Cctrrv2d(); Cctrrv2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void ctrbs2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; float *A; int *lda; { void Cctrbs2d(); Cctrbs2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda); } void ctrbr2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; float *A; int *lda; int *rsrc; int *csrc; { void Cctrbr2d(); Cctrbr2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void cgsum2d_(ConTxt, scope, top, m, n, A, lda, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; int *rdest; int *cdest; { void Ccgsum2d(); Ccgsum2d(*ConTxt, scope, top, *m, *n, A, *lda, *rdest, *cdest); } void cgamx2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Ccgamx2d(); Ccgamx2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } void cgamn2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; float *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Ccgamn2d(); Ccgamn2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } void zgesd2d_(ConTxt, m, n, A, lda, rdest, cdest) int *ConTxt; int *m; int *n; double *A; int *lda; int *rdest; int *cdest; { void Czgesd2d(); Czgesd2d(*ConTxt, *m, *n, A, *lda, *rdest, *cdest); } void zgerv2d_(ConTxt, m, n, A, lda, rsrc, csrc) int *ConTxt; int *m; int *n; double *A; int *lda; int *rsrc; int *csrc; { void Czgerv2d(); Czgerv2d(*ConTxt, *m, *n, A, *lda, *rsrc, *csrc); } void zgebs2d_(ConTxt, scope, top, m, n, A, lda) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; { void Czgebs2d(); Czgebs2d(*ConTxt, scope, top, *m, *n, A, *lda); } void zgebr2d_(ConTxt, scope, top, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; int *rsrc; int *csrc; { void Czgebr2d(); Czgebr2d(*ConTxt, scope, top, *m, *n, A, *lda, *rsrc, *csrc); } void ztrsd2d_(ConTxt, uplo, diag, m, n, A, lda, rdest, cdest) int *ConTxt; char *uplo; char *diag; int *m; int *n; double *A; int *lda; int *rdest; int *cdest; { void Cztrsd2d(); Cztrsd2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rdest, *cdest); } void ztrrv2d_(ConTxt, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *uplo; char *diag; int *m; int *n; double *A; int *lda; int *rsrc; int *csrc; { void Cztrrv2d(); Cztrrv2d(*ConTxt, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void ztrbs2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; double *A; int *lda; { void Cztrbs2d(); Cztrbs2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda); } void ztrbr2d_(ConTxt, scope, top, uplo, diag, m, n, A, lda, rsrc, csrc) int *ConTxt; char *scope; char *top; char *uplo; char *diag; int *m; int *n; double *A; int *lda; int *rsrc; int *csrc; { void Cztrbr2d(); Cztrbr2d(*ConTxt, scope, top, uplo, diag, *m, *n, A, *lda, *rsrc, *csrc); } void zgsum2d_(ConTxt, scope, top, m, n, A, lda, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; int *rdest; int *cdest; { void Czgsum2d(); Czgsum2d(*ConTxt, scope, top, *m, *n, A, *lda, *rdest, *cdest); } void zgamx2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Czgamx2d(); Czgamx2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } void zgamn2d_(ConTxt, scope, top, m, n, A, lda, rA, cA, ldia, rdest, cdest) int *ConTxt; char *scope; char *top; int *m; int *n; double *A; int *lda; int *rA; int *cA; int *ldia; int *rdest; int *cdest; { void Czgamn2d(); Czgamn2d(*ConTxt, scope, top, *m, *n, A, *lda, rA, cA, *ldia, *rdest, *cdest); } #endif scalapack-2.0.2/BLACS/TESTING/Cbt.h000644 000766 000024 00000000374 11640652114 016537 0ustar00juliestaff000000 000000 #define ADD_ 0 #define NOCHANGE 1 #define UPCASE 2 #ifdef UpCase #define F77_CALL_C UPCASE #endif #ifdef NoChange #define F77_CALL_C NOCHANGE #endif #ifdef Add_ #define F77_CALL_C ADD_ #endif #ifndef F77_CALL_C #define F77_CALL_C ADD_ #endif scalapack-2.0.2/BLACS/TESTING/CMakeLists.txt000644 000766 000024 00000003054 11656571152 020425 0ustar00juliestaff000000 000000 set(FTestObj blacstest.f btprim.f tools.f) add_executable(xFbtest ${FTestObj}) target_link_libraries(xFbtest scalapack) set(CTestObj Cbt.c) set_property( SOURCE Cbt.c APPEND PROPERTY COMPILE_DEFINITIONS BTCINTFACE ) add_executable(xCbtest ${CTestObj} ${FTestObj}) target_link_libraries(xCbtest scalapack) file(COPY bsbr.dat DESTINATION ${SCALAPACK_BINARY_DIR}/BLACS/TESTING) file(COPY bt.dat DESTINATION ${SCALAPACK_BINARY_DIR}/BLACS/TESTING) file(COPY comb.dat DESTINATION ${SCALAPACK_BINARY_DIR}/BLACS/TESTING) file(COPY sdrv.dat DESTINATION ${SCALAPACK_BINARY_DIR}/BLACS/TESTING) # We could run the BLACS TESTING the following way # But BLACS TESTING are TESTING anormal exit so even if they pass, # CTest will determine they fail #add_test(xFbtest0 ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xFbtest) #add_test(xCbtest0 ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./xCbtest) add_test(xCbtest ${CMAKE_COMMAND} -DMPIEXEC=${MPIEXEC} -DMPIEXEC_NUMPROC_FLAG=${MPIEXEC_NUMPROC_FLAG} -DTEST_PROG=xCbtest -DOUTPUTDIR=${SCALAPACK_BINARY_DIR}/BLACS/TESTING -DRUNTIMEDIR=${CMAKE_RUNTIME_OUTPUT_DIRECTORY} -DSOURCEDIR=${CMAKE_CURRENT_SOURCE_DIR} -P ${CMAKE_CURRENT_SOURCE_DIR}/runtest.cmake ) add_test(xFbtest ${CMAKE_COMMAND} -DMPIEXEC=${MPIEXEC} -DMPIEXEC_NUMPROC_FLAG=${MPIEXEC_NUMPROC_FLAG} -DTEST_PROG=xFbtest -DOUTPUTDIR=${SCALAPACK_BINARY_DIR}/BLACS/TESTING -DRUNTIMEDIR=${CMAKE_RUNTIME_OUTPUT_DIRECTORY} -DSOURCEDIR=${CMAKE_CURRENT_SOURCE_DIR} -P ${CMAKE_CURRENT_SOURCE_DIR}/runtest.cmake )scalapack-2.0.2/BLACS/TESTING/comb.dat000644 000766 000024 00000001400 11640652114 017257 0ustar00juliestaff000000 000000 3 Number of OPs '+' '>' '<' Combine operations to perform 3 Number of scopes 'R' 'C' 'A' values for scopes 2 Repeatability flag (0=no-rep, 1=rep, 2=both) 2 Coherence flag (0=no-coh, 1=coh, 2=both) 4 Number of topologies ' ' 'T' 'H' 'f' 'M' TOP 6 Number of matrices 3 1 2 25 13 0 M 5 1 3 19 32 0 N 5 1 4 25 14 1 LDASRC 9 1 5 25 22 1 LDADEST 4 1 -1 25 22 1 LDI 4 Number of dests 0 -1 0 2 RDEST 0 -1 1 0 CDEST 4 Number of grids 2 1 4 1 1 8 3 NPROW 2 4 1 3 7 1 2 NPCOL scalapack-2.0.2/BLACS/TESTING/Makefile000644 000766 000024 00000003016 11710537267 017323 0ustar00juliestaff000000 000000 include ../../SLmake.inc # --------------------------------------------------------------------- # The file tools.f contains some LAPACK routines that the tester calls. # If you have ScaLAPACK, you may point to your tools library instead # of compiling this file. # --------------------------------------------------------------------- tools = tools.o exe : all ctest : xCbtest ftest : xFbtest all : xCbtest xFbtest obj = blacstest.o btprim.o xCbtest: $(obj) $(tools) $(CC) -c $(CDEFS) $(CCFLAGS) -DBTCINTFACE Cbt.c $(FCLOADER) $(FCLOADFLAGS) -o $@ $(obj) $(tools) Cbt.o ../../$(SCALAPACKLIB) xFbtest: $(obj) $(tools) $(FCLOADER) $(FCLOADFLAGS) -o $@ $(obj) $(tools) ../../$(SCALAPACKLIB) # -------------------------------------------------------------------- # The files tools.f and blacstest.f are compiled without optimization. # Tools.f contains the LAPACK routines slamch and dlamch, which only # operate correctly for low-levels of optimization. Blacstest.f is # extremely large, and optimizing it takes a long time. More # importantly, the sun's f77 compiler seems to produce errors in # trying to optimize such a large file. We therefore insist that it # also not be optimized. # -------------------------------------------------------------------- tools.o : tools.f $(FC) $(NOOPT) -c $*.f blacstest.o : blacstest.f $(FC) $(NOOPT) -c $*.f btprim.o : btprim.f $(FC) -c $(FCFLAGS) $*.f clean : rm -f $(obj) tools.o Cbt.o xCbtest xFbtest .f.o: ; $(FC) -c $(FCFLAGS) $*.f .c.o: $(CC) -c $(CDEFS) $(CCFLAGS) $< scalapack-2.0.2/BLACS/TESTING/README000644 000766 000024 00000001066 11640652114 016535 0ustar00juliestaff000000 000000 (1) To compile, just type "make". You must first edit and correct the file BLACS/Bmake.inc. Sample Bmake.inc's can be found in the BLACS/BMAKES directories. See the paper "Installing and testing the BLACS" for details. (2) Type "make clean" to get rid of old .o files. (3) The file blacstest.f is extremely large (roughly 20,000 lines), and this may be too large to compile on some systems. If you have this problem, a slight modification to the BLACS/TESTING Makefile should allow you to split blacstest.f into smaller files. scalapack-2.0.2/BLACS/TESTING/runtest.cmake000644 000766 000024 00000001714 11656312637 020375 0ustar00juliestaff000000 000000 message("Running BLACS TESTS") message(STATUS "${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./${TEST_PROG}") message(STATUS "Output out_${TEST_PROG}.txt") file(COPY ${RUNTIMEDIR}/${TEST_PROG} DESTINATION ${OUTPUTDIR}) execute_process(COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ./${TEST_PROG} OUTPUT_FILE "out_${TEST_PROG}.txt" ERROR_FILE "error_${TEST_PROG}.txt" RESULT_VARIABLE HAD_ERROR) if(HAD_ERROR) # This is normal to exit in Error (good behaviour) # So we are going to check that the output have the last line of the testing : DONE BLACS_GRIDEXIT file(READ "out_${TEST_PROG}.txt" TESTSTRING) STRING(REPLACE "DONE BLACS_GRIDEXIT" "BLACS OK" tmp ${TESTSTRING}) if("${tmp}" STREQUAL "${TESTSTRING}") message( STATUS "Error in error_${TEST_PROG}.txt") message(FATAL_ERROR "Test failed - Test did not reach DONE BLACS_GRIDEXIT") else() message( STATUS "Test Passed") endif() endif() scalapack-2.0.2/BLACS/TESTING/sdrv.dat000644 000766 000024 00000001053 11640652114 017321 0ustar00juliestaff000000 000000 5 Number of shapes 'G' 'U' 'U' 'L' 'L' UPLO 'E' 'U' 'N' 'U' 'N' DIAG 5 Number of matrices 2 1 25 13 0 M 2 7 19 32 0 N 2 3 25 14 1 LDASRC 3 2 25 22 1 LDADEST 1 Number of src/dest pairs 0 1 3 0 RSRC 0 0 0 2 CSRC 0 1 2 0 RDEST 1 1 0 0 CDEST 3 Number of grids 2 4 1 NPROW 2 1 4 NPCOL scalapack-2.0.2/BLACS/TESTING/tools.f000644 000766 000024 00000166272 11640652114 017177 0ustar00juliestaff000000 000000 * ================================================================ * This file contains the following LAPACK routines, for use by the * BLACS tester: LSAME, SLAMCH, DLAMCH, DLARND, ZLARND, DLARAN, * and ZLARAN. If you have ScaLAPACK or LAPACK, all of these files * are present in your library, and you may discard this file and * point to the appropriate archive instead. * ================================================================ DOUBLE PRECISION FUNCTION DLAMCH( CMACH ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. CHARACTER CMACH * .. * * Purpose * ======= * * DLAMCH determines double precision machine parameters. * * Arguments * ========= * * CMACH (input) CHARACTER*1 * Specifies the value to be returned by DLAMCH: * = 'E' or 'e', DLAMCH := eps * = 'S' or 's , DLAMCH := sfmin * = 'B' or 'b', DLAMCH := base * = 'P' or 'p', DLAMCH := eps*base * = 'N' or 'n', DLAMCH := t * = 'R' or 'r', DLAMCH := rnd * = 'M' or 'm', DLAMCH := emin * = 'U' or 'u', DLAMCH := rmin * = 'L' or 'l', DLAMCH := emax * = 'O' or 'o', DLAMCH := rmax * * where * * eps = relative machine precision * sfmin = safe minimum, such that 1/sfmin does not overflow * base = base of the machine * prec = eps*base * t = number of (base) digits in the mantissa * rnd = 1.0 when rounding occurs in addition, 0.0 otherwise * emin = minimum exponent before (gradual) underflow * rmin = underflow threshold - base**(emin-1) * emax = largest exponent before overflow * rmax = overflow threshold - (base**emax)*(1-eps) * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL FIRST, LRND INTEGER BETA, IMAX, IMIN, IT DOUBLE PRECISION BASE, EMAX, EMIN, EPS, PREC, RMACH, RMAX, RMIN, $ RND, SFMIN, SMALL, T * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DLAMC2 * .. * .. Save statement .. SAVE FIRST, EPS, SFMIN, BASE, T, RND, EMIN, RMIN, $ EMAX, RMAX, PREC * .. * .. Data statements .. DATA FIRST / .TRUE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. CALL DLAMC2( BETA, IT, LRND, EPS, IMIN, RMIN, IMAX, RMAX ) BASE = BETA T = IT IF( LRND ) THEN RND = ONE EPS = ( BASE**( 1-IT ) ) / 2 ELSE RND = ZERO EPS = BASE**( 1-IT ) END IF PREC = EPS*BASE EMIN = IMIN EMAX = IMAX SFMIN = RMIN SMALL = ONE / RMAX IF( SMALL.GE.SFMIN ) THEN * * Use SMALL plus a bit, to avoid the possibility of rounding * causing overflow when computing 1/sfmin. * SFMIN = SMALL*( ONE+EPS ) END IF END IF * IF( LSAME( CMACH, 'E' ) ) THEN RMACH = EPS ELSE IF( LSAME( CMACH, 'S' ) ) THEN RMACH = SFMIN ELSE IF( LSAME( CMACH, 'B' ) ) THEN RMACH = BASE ELSE IF( LSAME( CMACH, 'P' ) ) THEN RMACH = PREC ELSE IF( LSAME( CMACH, 'N' ) ) THEN RMACH = T ELSE IF( LSAME( CMACH, 'R' ) ) THEN RMACH = RND ELSE IF( LSAME( CMACH, 'M' ) ) THEN RMACH = EMIN ELSE IF( LSAME( CMACH, 'U' ) ) THEN RMACH = RMIN ELSE IF( LSAME( CMACH, 'L' ) ) THEN RMACH = EMAX ELSE IF( LSAME( CMACH, 'O' ) ) THEN RMACH = RMAX END IF * DLAMCH = RMACH RETURN * * End of DLAMCH * END * ************************************************************************ * SUBROUTINE DLAMC1( BETA, T, RND, IEEE1 ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL IEEE1, RND INTEGER BETA, T * .. * * Purpose * ======= * * DLAMC1 determines the machine parameters given by BETA, T, RND, and * IEEE1. * * Arguments * ========= * * BETA (output) INTEGER * The base of the machine. * * T (output) INTEGER * The number of ( BETA ) digits in the mantissa. * * RND (output) LOGICAL * Specifies whether proper rounding ( RND = .TRUE. ) or * chopping ( RND = .FALSE. ) occurs in addition. This may not * be a reliable guide to the way in which the machine performs * its arithmetic. * * IEEE1 (output) LOGICAL * Specifies whether rounding appears to be done in the IEEE * 'round to nearest' style. * * Further Details * =============== * * The routine is based on the routine ENVRON by Malcolm and * incorporates suggestions by Gentleman and Marovich. See * * Malcolm M. A. (1972) Algorithms to reveal properties of * floating-point arithmetic. Comms. of the ACM, 15, 949-951. * * Gentleman W. M. and Marovich S. B. (1974) More on algorithms * that reveal properties of floating point arithmetic units. * Comms. of the ACM, 17, 276-277. * * ===================================================================== * * .. Local Scalars .. LOGICAL FIRST, LIEEE1, LRND INTEGER LBETA, LT DOUBLE PRECISION A, B, C, F, ONE, QTR, SAVEC, T1, T2 * .. * .. External Functions .. DOUBLE PRECISION DLAMC3 EXTERNAL DLAMC3 * .. * .. Save statement .. SAVE FIRST, LIEEE1, LBETA, LRND, LT * .. * .. Data statements .. DATA FIRST / .TRUE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. ONE = 1 * * LBETA, LIEEE1, LT and LRND are the local values of BETA, * IEEE1, T and RND. * * Throughout this routine we use the function DLAMC3 to ensure * that relevant values are stored and not held in registers, or * are not affected by optimizers. * * Compute a = 2.0**m with the smallest positive integer m such * that * * fl( a + 1.0 ) = a. * A = 1 C = 1 * *+ WHILE( C.EQ.ONE )LOOP 10 CONTINUE IF( C.EQ.ONE ) THEN A = 2*A C = DLAMC3( A, ONE ) C = DLAMC3( C, -A ) GO TO 10 END IF *+ END WHILE * * Now compute b = 2.0**m with the smallest positive integer m * such that * * fl( a + b ) .gt. a. * B = 1 C = DLAMC3( A, B ) * *+ WHILE( C.EQ.A )LOOP 20 CONTINUE IF( C.EQ.A ) THEN B = 2*B C = DLAMC3( A, B ) GO TO 20 END IF *+ END WHILE * * Now compute the base. a and c are neighbouring floating point * numbers in the interval ( beta**t, beta**( t + 1 ) ) and so * their difference is beta. Adding 0.25 to c is to ensure that it * is truncated to beta and not ( beta - 1 ). * QTR = ONE / 4 SAVEC = C C = DLAMC3( C, -A ) LBETA = C + QTR * * Now determine whether rounding or chopping occurs, by adding a * bit less than beta/2 and a bit more than beta/2 to a. * B = LBETA F = DLAMC3( B / 2, -B / 100 ) C = DLAMC3( F, A ) IF( C.EQ.A ) THEN LRND = .TRUE. ELSE LRND = .FALSE. END IF F = DLAMC3( B / 2, B / 100 ) C = DLAMC3( F, A ) IF( ( LRND ) .AND. ( C.EQ.A ) ) $ LRND = .FALSE. * * Try and decide whether rounding is done in the IEEE 'round to * nearest' style. B/2 is half a unit in the last place of the two * numbers A and SAVEC. Furthermore, A is even, i.e. has last bit * zero, and SAVEC is odd. Thus adding B/2 to A should not change * A, but adding B/2 to SAVEC should change SAVEC. * T1 = DLAMC3( B / 2, A ) T2 = DLAMC3( B / 2, SAVEC ) LIEEE1 = ( T1.EQ.A ) .AND. ( T2.GT.SAVEC ) .AND. LRND * * Now find the mantissa, t. It should be the integer part of * log to the base beta of a, however it is safer to determine t * by powering. So we find t as the smallest positive integer for * which * * fl( beta**t + 1.0 ) = 1.0. * LT = 0 A = 1 C = 1 * *+ WHILE( C.EQ.ONE )LOOP 30 CONTINUE IF( C.EQ.ONE ) THEN LT = LT + 1 A = A*LBETA C = DLAMC3( A, ONE ) C = DLAMC3( C, -A ) GO TO 30 END IF *+ END WHILE * END IF * BETA = LBETA T = LT RND = LRND IEEE1 = LIEEE1 RETURN * * End of DLAMC1 * END * ************************************************************************ * SUBROUTINE DLAMC2( BETA, T, RND, EPS, EMIN, RMIN, EMAX, RMAX ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL RND INTEGER BETA, EMAX, EMIN, T DOUBLE PRECISION EPS, RMAX, RMIN * .. * * Purpose * ======= * * DLAMC2 determines the machine parameters specified in its argument * list. * * Arguments * ========= * * BETA (output) INTEGER * The base of the machine. * * T (output) INTEGER * The number of ( BETA ) digits in the mantissa. * * RND (output) LOGICAL * Specifies whether proper rounding ( RND = .TRUE. ) or * chopping ( RND = .FALSE. ) occurs in addition. This may not * be a reliable guide to the way in which the machine performs * its arithmetic. * * EPS (output) DOUBLE PRECISION * The smallest positive number such that * * fl( 1.0 - EPS ) .LT. 1.0, * * where fl denotes the computed value. * * EMIN (output) INTEGER * The minimum exponent before (gradual) underflow occurs. * * RMIN (output) DOUBLE PRECISION * The smallest normalized number for the machine, given by * BASE**( EMIN - 1 ), where BASE is the floating point value * of BETA. * * EMAX (output) INTEGER * The maximum exponent before overflow occurs. * * RMAX (output) DOUBLE PRECISION * The largest positive number for the machine, given by * BASE**EMAX * ( 1 - EPS ), where BASE is the floating point * value of BETA. * * Further Details * =============== * * The computation of EPS is based on a routine PARANOIA by * W. Kahan of the University of California at Berkeley. * * ===================================================================== * * .. Local Scalars .. LOGICAL FIRST, IEEE, IWARN, LIEEE1, LRND INTEGER GNMIN, GPMIN, I, LBETA, LEMAX, LEMIN, LT, $ NGNMIN, NGPMIN DOUBLE PRECISION A, B, C, HALF, LEPS, LRMAX, LRMIN, ONE, RBASE, $ SIXTH, SMALL, THIRD, TWO, ZERO * .. * .. External Functions .. DOUBLE PRECISION DLAMC3 EXTERNAL DLAMC3 * .. * .. External Subroutines .. EXTERNAL DLAMC1, DLAMC4, DLAMC5 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Save statement .. SAVE FIRST, IWARN, LBETA, LEMAX, LEMIN, LEPS, LRMAX, $ LRMIN, LT * .. * .. Data statements .. DATA FIRST / .TRUE. / , IWARN / .FALSE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. ZERO = 0 ONE = 1 TWO = 2 * * LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values of * BETA, T, RND, EPS, EMIN and RMIN. * * Throughout this routine we use the function DLAMC3 to ensure * that relevant values are stored and not held in registers, or * are not affected by optimizers. * * DLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. * CALL DLAMC1( LBETA, LT, LRND, LIEEE1 ) * * Start to find EPS. * B = LBETA A = B**( -LT ) LEPS = A * * Try some tricks to see whether or not this is the correct EPS. * B = TWO / 3 HALF = ONE / 2 SIXTH = DLAMC3( B, -HALF ) THIRD = DLAMC3( SIXTH, SIXTH ) B = DLAMC3( THIRD, -HALF ) B = DLAMC3( B, SIXTH ) B = ABS( B ) IF( B.LT.LEPS ) $ B = LEPS * LEPS = 1 * *+ WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP 10 CONTINUE IF( ( LEPS.GT.B ) .AND. ( B.GT.ZERO ) ) THEN LEPS = B C = DLAMC3( HALF*LEPS, ( TWO**5 )*( LEPS**2 ) ) C = DLAMC3( HALF, -C ) B = DLAMC3( HALF, C ) C = DLAMC3( HALF, -B ) B = DLAMC3( HALF, C ) GO TO 10 END IF *+ END WHILE * IF( A.LT.LEPS ) $ LEPS = A * * Computation of EPS complete. * * Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3)). * Keep dividing A by BETA until (gradual) underflow occurs. This * is detected when we cannot recover the previous A. * RBASE = ONE / LBETA SMALL = ONE DO 20 I = 1, 3 SMALL = DLAMC3( SMALL*RBASE, ZERO ) 20 CONTINUE A = DLAMC3( ONE, SMALL ) CALL DLAMC4( NGPMIN, ONE, LBETA ) CALL DLAMC4( NGNMIN, -ONE, LBETA ) CALL DLAMC4( GPMIN, A, LBETA ) CALL DLAMC4( GNMIN, -A, LBETA ) IEEE = .FALSE. * IF( ( NGPMIN.EQ.NGNMIN ) .AND. ( GPMIN.EQ.GNMIN ) ) THEN IF( NGPMIN.EQ.GPMIN ) THEN LEMIN = NGPMIN * ( Non twos-complement machines, no gradual underflow; * e.g., VAX ) ELSE IF( ( GPMIN-NGPMIN ).EQ.3 ) THEN LEMIN = NGPMIN - 1 + LT IEEE = .TRUE. * ( Non twos-complement machines, with gradual underflow; * e.g., IEEE standard followers ) ELSE LEMIN = MIN( NGPMIN, GPMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE IF( ( NGPMIN.EQ.GPMIN ) .AND. ( NGNMIN.EQ.GNMIN ) ) THEN IF( ABS( NGPMIN-NGNMIN ).EQ.1 ) THEN LEMIN = MAX( NGPMIN, NGNMIN ) * ( Twos-complement machines, no gradual underflow; * e.g., CYBER 205 ) ELSE LEMIN = MIN( NGPMIN, NGNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE IF( ( ABS( NGPMIN-NGNMIN ).EQ.1 ) .AND. $ ( GPMIN.EQ.GNMIN ) ) THEN IF( ( GPMIN-MIN( NGPMIN, NGNMIN ) ).EQ.3 ) THEN LEMIN = MAX( NGPMIN, NGNMIN ) - 1 + LT * ( Twos-complement machines with gradual underflow; * no known machine ) ELSE LEMIN = MIN( NGPMIN, NGNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE LEMIN = MIN( NGPMIN, NGNMIN, GPMIN, GNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF *** * Comment out this if block if EMIN is ok IF( IWARN ) THEN FIRST = .TRUE. WRITE( 6, FMT = 9999 )LEMIN END IF *** * * Assume IEEE arithmetic if we found denormalised numbers above, * or if arithmetic seems to round in the IEEE style, determined * in routine DLAMC1. A true IEEE machine should have both things * true; however, faulty machines may have one or the other. * IEEE = IEEE .OR. LIEEE1 * * Compute RMIN by successive division by BETA. We could compute * RMIN as BASE**( EMIN - 1 ), but some machines underflow during * this computation. * LRMIN = 1 DO 30 I = 1, 1 - LEMIN LRMIN = DLAMC3( LRMIN*RBASE, ZERO ) 30 CONTINUE * * Finally, call DLAMC5 to compute EMAX and RMAX. * CALL DLAMC5( LBETA, LT, LEMIN, IEEE, LEMAX, LRMAX ) END IF * BETA = LBETA T = LT RND = LRND EPS = LEPS EMIN = LEMIN RMIN = LRMIN EMAX = LEMAX RMAX = LRMAX * RETURN * 9999 FORMAT( / / ' WARNING. The value EMIN may be incorrect:-', $ ' EMIN = ', I8, / $ ' If, after inspection, the value EMIN looks', $ ' acceptable please comment out ', $ / ' the IF block as marked within the code of routine', $ ' DLAMC2,', / ' otherwise supply EMIN explicitly.', / ) * * End of DLAMC2 * END * ************************************************************************ * DOUBLE PRECISION FUNCTION DLAMC3( A, B ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. DOUBLE PRECISION A, B * .. * * Purpose * ======= * * DLAMC3 is intended to force A and B to be stored prior to doing * the addition of A and B , for use in situations where optimizers * might hold one of these in a register. * * Arguments * ========= * * A, B (input) DOUBLE PRECISION * The values A and B. * * ===================================================================== * * .. Executable Statements .. * DLAMC3 = A + B * RETURN * * End of DLAMC3 * END * ************************************************************************ * SUBROUTINE DLAMC4( EMIN, START, BASE ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. INTEGER BASE, EMIN DOUBLE PRECISION START * .. * * Purpose * ======= * * DLAMC4 is a service routine for DLAMC2. * * Arguments * ========= * * EMIN (output) EMIN * The minimum exponent before (gradual) underflow, computed by * setting A = START and dividing by BASE until the previous A * can not be recovered. * * START (input) DOUBLE PRECISION * The starting point for determining EMIN. * * BASE (input) INTEGER * The base of the machine. * * ===================================================================== * * .. Local Scalars .. INTEGER I DOUBLE PRECISION A, B1, B2, C1, C2, D1, D2, ONE, RBASE, ZERO * .. * .. External Functions .. DOUBLE PRECISION DLAMC3 EXTERNAL DLAMC3 * .. * .. Executable Statements .. * A = START ONE = 1 RBASE = ONE / BASE ZERO = 0 EMIN = 1 B1 = DLAMC3( A*RBASE, ZERO ) C1 = A C2 = A D1 = A D2 = A *+ WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. * $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP 10 CONTINUE IF( ( C1.EQ.A ) .AND. ( C2.EQ.A ) .AND. ( D1.EQ.A ) .AND. $ ( D2.EQ.A ) ) THEN EMIN = EMIN - 1 A = B1 B1 = DLAMC3( A / BASE, ZERO ) C1 = DLAMC3( B1*BASE, ZERO ) D1 = ZERO DO 20 I = 1, BASE D1 = D1 + B1 20 CONTINUE B2 = DLAMC3( A*RBASE, ZERO ) C2 = DLAMC3( B2 / RBASE, ZERO ) D2 = ZERO DO 30 I = 1, BASE D2 = D2 + B2 30 CONTINUE GO TO 10 END IF *+ END WHILE * RETURN * * End of DLAMC4 * END * ************************************************************************ * SUBROUTINE DLAMC5( BETA, P, EMIN, IEEE, EMAX, RMAX ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL IEEE INTEGER BETA, EMAX, EMIN, P DOUBLE PRECISION RMAX * .. * * Purpose * ======= * * DLAMC5 attempts to compute RMAX, the largest machine floating-point * number, without overflow. It assumes that EMAX + abs(EMIN) sum * approximately to a power of 2. It will fail on machines where this * assumption does not hold, for example, the Cyber 205 (EMIN = -28625, * EMAX = 28718). It will also fail if the value supplied for EMIN is * too large (i.e. too close to zero), probably with overflow. * * Arguments * ========= * * BETA (input) INTEGER * The base of floating-point arithmetic. * * P (input) INTEGER * The number of base BETA digits in the mantissa of a * floating-point value. * * EMIN (input) INTEGER * The minimum exponent before (gradual) underflow. * * IEEE (input) LOGICAL * A logical flag specifying whether or not the arithmetic * system is thought to comply with the IEEE standard. * * EMAX (output) INTEGER * The largest exponent before overflow * * RMAX (output) DOUBLE PRECISION * The largest machine floating-point number. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. * .. Local Scalars .. INTEGER EXBITS, EXPSUM, I, LEXP, NBITS, TRY, UEXP DOUBLE PRECISION OLDY, RECBAS, Y, Z * .. * .. External Functions .. DOUBLE PRECISION DLAMC3 EXTERNAL DLAMC3 * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * First compute LEXP and UEXP, two powers of 2 that bound * abs(EMIN). We then assume that EMAX + abs(EMIN) will sum * approximately to the bound that is closest to abs(EMIN). * (EMAX is the exponent of the required number RMAX). * LEXP = 1 EXBITS = 1 10 CONTINUE TRY = LEXP*2 IF( TRY.LE.( -EMIN ) ) THEN LEXP = TRY EXBITS = EXBITS + 1 GO TO 10 END IF IF( LEXP.EQ.-EMIN ) THEN UEXP = LEXP ELSE UEXP = TRY EXBITS = EXBITS + 1 END IF * * Now -LEXP is less than or equal to EMIN, and -UEXP is greater * than or equal to EMIN. EXBITS is the number of bits needed to * store the exponent. * IF( ( UEXP+EMIN ).GT.( -LEXP-EMIN ) ) THEN EXPSUM = 2*LEXP ELSE EXPSUM = 2*UEXP END IF * * EXPSUM is the exponent range, approximately equal to * EMAX - EMIN + 1 . * EMAX = EXPSUM + EMIN - 1 NBITS = 1 + EXBITS + P * * NBITS is the total number of bits needed to store a * floating-point number. * IF( ( MOD( NBITS, 2 ).EQ.1 ) .AND. ( BETA.EQ.2 ) ) THEN * * Either there are an odd number of bits used to store a * floating-point number, which is unlikely, or some bits are * not used in the representation of numbers, which is possible, * (e.g. Cray machines) or the mantissa has an implicit bit, * (e.g. IEEE machines, Dec Vax machines), which is perhaps the * most likely. We have to assume the last alternative. * If this is true, then we need to reduce EMAX by one because * there must be some way of representing zero in an implicit-bit * system. On machines like Cray, we are reducing EMAX by one * unnecessarily. * EMAX = EMAX - 1 END IF * IF( IEEE ) THEN * * Assume we are on an IEEE machine which reserves one exponent * for infinity and NaN. * EMAX = EMAX - 1 END IF * * Now create RMAX, the largest machine number, which should * be equal to (1.0 - BETA**(-P)) * BETA**EMAX . * * First compute 1.0 - BETA**(-P), being careful that the * result is less than 1.0 . * RECBAS = ONE / BETA Z = BETA - ONE Y = ZERO DO 20 I = 1, P Z = Z*RECBAS IF( Y.LT.ONE ) $ OLDY = Y Y = DLAMC3( Y, Z ) 20 CONTINUE IF( Y.GE.ONE ) $ Y = OLDY * * Now multiply by BETA**EMAX to get RMAX. * DO 30 I = 1, EMAX Y = DLAMC3( Y*BETA, ZERO ) 30 CONTINUE * RMAX = Y RETURN * * End of DLAMC5 * END REAL FUNCTION SLAMCH( CMACH ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. CHARACTER CMACH * .. * * Purpose * ======= * * SLAMCH determines single precision machine parameters. * * Arguments * ========= * * CMACH (input) CHARACTER*1 * Specifies the value to be returned by SLAMCH: * = 'E' or 'e', SLAMCH := eps * = 'S' or 's , SLAMCH := sfmin * = 'B' or 'b', SLAMCH := base * = 'P' or 'p', SLAMCH := eps*base * = 'N' or 'n', SLAMCH := t * = 'R' or 'r', SLAMCH := rnd * = 'M' or 'm', SLAMCH := emin * = 'U' or 'u', SLAMCH := rmin * = 'L' or 'l', SLAMCH := emax * = 'O' or 'o', SLAMCH := rmax * * where * * eps = relative machine precision * sfmin = safe minimum, such that 1/sfmin does not overflow * base = base of the machine * prec = eps*base * t = number of (base) digits in the mantissa * rnd = 1.0 when rounding occurs in addition, 0.0 otherwise * emin = minimum exponent before (gradual) underflow * rmin = underflow threshold - base**(emin-1) * emax = largest exponent before overflow * rmax = overflow threshold - (base**emax)*(1-eps) * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL FIRST, LRND INTEGER BETA, IMAX, IMIN, IT REAL BASE, EMAX, EMIN, EPS, PREC, RMACH, RMAX, RMIN, $ RND, SFMIN, SMALL, T * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SLAMC2 * .. * .. Save statement .. SAVE FIRST, EPS, SFMIN, BASE, T, RND, EMIN, RMIN, $ EMAX, RMAX, PREC * .. * .. Data statements .. DATA FIRST / .TRUE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. CALL SLAMC2( BETA, IT, LRND, EPS, IMIN, RMIN, IMAX, RMAX ) BASE = BETA T = IT IF( LRND ) THEN RND = ONE EPS = ( BASE**( 1-IT ) ) / 2 ELSE RND = ZERO EPS = BASE**( 1-IT ) END IF PREC = EPS*BASE EMIN = IMIN EMAX = IMAX SFMIN = RMIN SMALL = ONE / RMAX IF( SMALL.GE.SFMIN ) THEN * * Use SMALL plus a bit, to avoid the possibility of rounding * causing overflow when computing 1/sfmin. * SFMIN = SMALL*( ONE+EPS ) END IF END IF * IF( LSAME( CMACH, 'E' ) ) THEN RMACH = EPS ELSE IF( LSAME( CMACH, 'S' ) ) THEN RMACH = SFMIN ELSE IF( LSAME( CMACH, 'B' ) ) THEN RMACH = BASE ELSE IF( LSAME( CMACH, 'P' ) ) THEN RMACH = PREC ELSE IF( LSAME( CMACH, 'N' ) ) THEN RMACH = T ELSE IF( LSAME( CMACH, 'R' ) ) THEN RMACH = RND ELSE IF( LSAME( CMACH, 'M' ) ) THEN RMACH = EMIN ELSE IF( LSAME( CMACH, 'U' ) ) THEN RMACH = RMIN ELSE IF( LSAME( CMACH, 'L' ) ) THEN RMACH = EMAX ELSE IF( LSAME( CMACH, 'O' ) ) THEN RMACH = RMAX END IF * SLAMCH = RMACH RETURN * * End of SLAMCH * END * ************************************************************************ * SUBROUTINE SLAMC1( BETA, T, RND, IEEE1 ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL IEEE1, RND INTEGER BETA, T * .. * * Purpose * ======= * * SLAMC1 determines the machine parameters given by BETA, T, RND, and * IEEE1. * * Arguments * ========= * * BETA (output) INTEGER * The base of the machine. * * T (output) INTEGER * The number of ( BETA ) digits in the mantissa. * * RND (output) LOGICAL * Specifies whether proper rounding ( RND = .TRUE. ) or * chopping ( RND = .FALSE. ) occurs in addition. This may not * be a reliable guide to the way in which the machine performs * its arithmetic. * * IEEE1 (output) LOGICAL * Specifies whether rounding appears to be done in the IEEE * 'round to nearest' style. * * Further Details * =============== * * The routine is based on the routine ENVRON by Malcolm and * incorporates suggestions by Gentleman and Marovich. See * * Malcolm M. A. (1972) Algorithms to reveal properties of * floating-point arithmetic. Comms. of the ACM, 15, 949-951. * * Gentleman W. M. and Marovich S. B. (1974) More on algorithms * that reveal properties of floating point arithmetic units. * Comms. of the ACM, 17, 276-277. * * ===================================================================== * * .. Local Scalars .. LOGICAL FIRST, LIEEE1, LRND INTEGER LBETA, LT REAL A, B, C, F, ONE, QTR, SAVEC, T1, T2 * .. * .. External Functions .. REAL SLAMC3 EXTERNAL SLAMC3 * .. * .. Save statement .. SAVE FIRST, LIEEE1, LBETA, LRND, LT * .. * .. Data statements .. DATA FIRST / .TRUE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. ONE = 1 * * LBETA, LIEEE1, LT and LRND are the local values of BETA, * IEEE1, T and RND. * * Throughout this routine we use the function SLAMC3 to ensure * that relevant values are stored and not held in registers, or * are not affected by optimizers. * * Compute a = 2.0**m with the smallest positive integer m such * that * * fl( a + 1.0 ) = a. * A = 1 C = 1 * *+ WHILE( C.EQ.ONE )LOOP 10 CONTINUE IF( C.EQ.ONE ) THEN A = 2*A C = SLAMC3( A, ONE ) C = SLAMC3( C, -A ) GO TO 10 END IF *+ END WHILE * * Now compute b = 2.0**m with the smallest positive integer m * such that * * fl( a + b ) .gt. a. * B = 1 C = SLAMC3( A, B ) * *+ WHILE( C.EQ.A )LOOP 20 CONTINUE IF( C.EQ.A ) THEN B = 2*B C = SLAMC3( A, B ) GO TO 20 END IF *+ END WHILE * * Now compute the base. a and c are neighbouring floating point * numbers in the interval ( beta**t, beta**( t + 1 ) ) and so * their difference is beta. Adding 0.25 to c is to ensure that it * is truncated to beta and not ( beta - 1 ). * QTR = ONE / 4 SAVEC = C C = SLAMC3( C, -A ) LBETA = C + QTR * * Now determine whether rounding or chopping occurs, by adding a * bit less than beta/2 and a bit more than beta/2 to a. * B = LBETA F = SLAMC3( B / 2, -B / 100 ) C = SLAMC3( F, A ) IF( C.EQ.A ) THEN LRND = .TRUE. ELSE LRND = .FALSE. END IF F = SLAMC3( B / 2, B / 100 ) C = SLAMC3( F, A ) IF( ( LRND ) .AND. ( C.EQ.A ) ) $ LRND = .FALSE. * * Try and decide whether rounding is done in the IEEE 'round to * nearest' style. B/2 is half a unit in the last place of the two * numbers A and SAVEC. Furthermore, A is even, i.e. has last bit * zero, and SAVEC is odd. Thus adding B/2 to A should not change * A, but adding B/2 to SAVEC should change SAVEC. * T1 = SLAMC3( B / 2, A ) T2 = SLAMC3( B / 2, SAVEC ) LIEEE1 = ( T1.EQ.A ) .AND. ( T2.GT.SAVEC ) .AND. LRND * * Now find the mantissa, t. It should be the integer part of * log to the base beta of a, however it is safer to determine t * by powering. So we find t as the smallest positive integer for * which * * fl( beta**t + 1.0 ) = 1.0. * LT = 0 A = 1 C = 1 * *+ WHILE( C.EQ.ONE )LOOP 30 CONTINUE IF( C.EQ.ONE ) THEN LT = LT + 1 A = A*LBETA C = SLAMC3( A, ONE ) C = SLAMC3( C, -A ) GO TO 30 END IF *+ END WHILE * END IF * BETA = LBETA T = LT RND = LRND IEEE1 = LIEEE1 RETURN * * End of SLAMC1 * END * ************************************************************************ * SUBROUTINE SLAMC2( BETA, T, RND, EPS, EMIN, RMIN, EMAX, RMAX ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL RND INTEGER BETA, EMAX, EMIN, T REAL EPS, RMAX, RMIN * .. * * Purpose * ======= * * SLAMC2 determines the machine parameters specified in its argument * list. * * Arguments * ========= * * BETA (output) INTEGER * The base of the machine. * * T (output) INTEGER * The number of ( BETA ) digits in the mantissa. * * RND (output) LOGICAL * Specifies whether proper rounding ( RND = .TRUE. ) or * chopping ( RND = .FALSE. ) occurs in addition. This may not * be a reliable guide to the way in which the machine performs * its arithmetic. * * EPS (output) REAL * The smallest positive number such that * * fl( 1.0 - EPS ) .LT. 1.0, * * where fl denotes the computed value. * * EMIN (output) INTEGER * The minimum exponent before (gradual) underflow occurs. * * RMIN (output) REAL * The smallest normalized number for the machine, given by * BASE**( EMIN - 1 ), where BASE is the floating point value * of BETA. * * EMAX (output) INTEGER * The maximum exponent before overflow occurs. * * RMAX (output) REAL * The largest positive number for the machine, given by * BASE**EMAX * ( 1 - EPS ), where BASE is the floating point * value of BETA. * * Further Details * =============== * * The computation of EPS is based on a routine PARANOIA by * W. Kahan of the University of California at Berkeley. * * ===================================================================== * * .. Local Scalars .. LOGICAL FIRST, IEEE, IWARN, LIEEE1, LRND INTEGER GNMIN, GPMIN, I, LBETA, LEMAX, LEMIN, LT, $ NGNMIN, NGPMIN REAL A, B, C, HALF, LEPS, LRMAX, LRMIN, ONE, RBASE, $ SIXTH, SMALL, THIRD, TWO, ZERO * .. * .. External Functions .. REAL SLAMC3 EXTERNAL SLAMC3 * .. * .. External Subroutines .. EXTERNAL SLAMC1, SLAMC4, SLAMC5 * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. * .. Save statement .. SAVE FIRST, IWARN, LBETA, LEMAX, LEMIN, LEPS, LRMAX, $ LRMIN, LT * .. * .. Data statements .. DATA FIRST / .TRUE. / , IWARN / .FALSE. / * .. * .. Executable Statements .. * IF( FIRST ) THEN FIRST = .FALSE. ZERO = 0 ONE = 1 TWO = 2 * * LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values of * BETA, T, RND, EPS, EMIN and RMIN. * * Throughout this routine we use the function SLAMC3 to ensure * that relevant values are stored and not held in registers, or * are not affected by optimizers. * * SLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. * CALL SLAMC1( LBETA, LT, LRND, LIEEE1 ) * * Start to find EPS. * B = LBETA A = B**( -LT ) LEPS = A * * Try some tricks to see whether or not this is the correct EPS. * B = TWO / 3 HALF = ONE / 2 SIXTH = SLAMC3( B, -HALF ) THIRD = SLAMC3( SIXTH, SIXTH ) B = SLAMC3( THIRD, -HALF ) B = SLAMC3( B, SIXTH ) B = ABS( B ) IF( B.LT.LEPS ) $ B = LEPS * LEPS = 1 * *+ WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP 10 CONTINUE IF( ( LEPS.GT.B ) .AND. ( B.GT.ZERO ) ) THEN LEPS = B C = SLAMC3( HALF*LEPS, ( TWO**5 )*( LEPS**2 ) ) C = SLAMC3( HALF, -C ) B = SLAMC3( HALF, C ) C = SLAMC3( HALF, -B ) B = SLAMC3( HALF, C ) GO TO 10 END IF *+ END WHILE * IF( A.LT.LEPS ) $ LEPS = A * * Computation of EPS complete. * * Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3)). * Keep dividing A by BETA until (gradual) underflow occurs. This * is detected when we cannot recover the previous A. * RBASE = ONE / LBETA SMALL = ONE DO 20 I = 1, 3 SMALL = SLAMC3( SMALL*RBASE, ZERO ) 20 CONTINUE A = SLAMC3( ONE, SMALL ) CALL SLAMC4( NGPMIN, ONE, LBETA ) CALL SLAMC4( NGNMIN, -ONE, LBETA ) CALL SLAMC4( GPMIN, A, LBETA ) CALL SLAMC4( GNMIN, -A, LBETA ) IEEE = .FALSE. * IF( ( NGPMIN.EQ.NGNMIN ) .AND. ( GPMIN.EQ.GNMIN ) ) THEN IF( NGPMIN.EQ.GPMIN ) THEN LEMIN = NGPMIN * ( Non twos-complement machines, no gradual underflow; * e.g., VAX ) ELSE IF( ( GPMIN-NGPMIN ).EQ.3 ) THEN LEMIN = NGPMIN - 1 + LT IEEE = .TRUE. * ( Non twos-complement machines, with gradual underflow; * e.g., IEEE standard followers ) ELSE LEMIN = MIN( NGPMIN, GPMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE IF( ( NGPMIN.EQ.GPMIN ) .AND. ( NGNMIN.EQ.GNMIN ) ) THEN IF( ABS( NGPMIN-NGNMIN ).EQ.1 ) THEN LEMIN = MAX( NGPMIN, NGNMIN ) * ( Twos-complement machines, no gradual underflow; * e.g., CYBER 205 ) ELSE LEMIN = MIN( NGPMIN, NGNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE IF( ( ABS( NGPMIN-NGNMIN ).EQ.1 ) .AND. $ ( GPMIN.EQ.GNMIN ) ) THEN IF( ( GPMIN-MIN( NGPMIN, NGNMIN ) ).EQ.3 ) THEN LEMIN = MAX( NGPMIN, NGNMIN ) - 1 + LT * ( Twos-complement machines with gradual underflow; * no known machine ) ELSE LEMIN = MIN( NGPMIN, NGNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF * ELSE LEMIN = MIN( NGPMIN, NGNMIN, GPMIN, GNMIN ) * ( A guess; no known machine ) IWARN = .TRUE. END IF *** * Comment out this if block if EMIN is ok IF( IWARN ) THEN FIRST = .TRUE. WRITE( 6, FMT = 9999 )LEMIN END IF *** * * Assume IEEE arithmetic if we found denormalised numbers above, * or if arithmetic seems to round in the IEEE style, determined * in routine SLAMC1. A true IEEE machine should have both things * true; however, faulty machines may have one or the other. * IEEE = IEEE .OR. LIEEE1 * * Compute RMIN by successive division by BETA. We could compute * RMIN as BASE**( EMIN - 1 ), but some machines underflow during * this computation. * LRMIN = 1 DO 30 I = 1, 1 - LEMIN LRMIN = SLAMC3( LRMIN*RBASE, ZERO ) 30 CONTINUE * * Finally, call SLAMC5 to compute EMAX and RMAX. * CALL SLAMC5( LBETA, LT, LEMIN, IEEE, LEMAX, LRMAX ) END IF * BETA = LBETA T = LT RND = LRND EPS = LEPS EMIN = LEMIN RMIN = LRMIN EMAX = LEMAX RMAX = LRMAX * RETURN * 9999 FORMAT( / / ' WARNING. The value EMIN may be incorrect:-', $ ' EMIN = ', I8, / $ ' If, after inspection, the value EMIN looks', $ ' acceptable please comment out ', $ / ' the IF block as marked within the code of routine', $ ' SLAMC2,', / ' otherwise supply EMIN explicitly.', / ) * * End of SLAMC2 * END * ************************************************************************ * REAL FUNCTION SLAMC3( A, B ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. REAL A, B * .. * * Purpose * ======= * * SLAMC3 is intended to force A and B to be stored prior to doing * the addition of A and B , for use in situations where optimizers * might hold one of these in a register. * * Arguments * ========= * * A, B (input) REAL * The values A and B. * * ===================================================================== * * .. Executable Statements .. * SLAMC3 = A + B * RETURN * * End of SLAMC3 * END * ************************************************************************ * SUBROUTINE SLAMC4( EMIN, START, BASE ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. INTEGER BASE, EMIN REAL START * .. * * Purpose * ======= * * SLAMC4 is a service routine for SLAMC2. * * Arguments * ========= * * EMIN (output) EMIN * The minimum exponent before (gradual) underflow, computed by * setting A = START and dividing by BASE until the previous A * can not be recovered. * * START (input) REAL * The starting point for determining EMIN. * * BASE (input) INTEGER * The base of the machine. * * ===================================================================== * * .. Local Scalars .. INTEGER I REAL A, B1, B2, C1, C2, D1, D2, ONE, RBASE, ZERO * .. * .. External Functions .. REAL SLAMC3 EXTERNAL SLAMC3 * .. * .. Executable Statements .. * A = START ONE = 1 RBASE = ONE / BASE ZERO = 0 EMIN = 1 B1 = SLAMC3( A*RBASE, ZERO ) C1 = A C2 = A D1 = A D2 = A *+ WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. * $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP 10 CONTINUE IF( ( C1.EQ.A ) .AND. ( C2.EQ.A ) .AND. ( D1.EQ.A ) .AND. $ ( D2.EQ.A ) ) THEN EMIN = EMIN - 1 A = B1 B1 = SLAMC3( A / BASE, ZERO ) C1 = SLAMC3( B1*BASE, ZERO ) D1 = ZERO DO 20 I = 1, BASE D1 = D1 + B1 20 CONTINUE B2 = SLAMC3( A*RBASE, ZERO ) C2 = SLAMC3( B2 / RBASE, ZERO ) D2 = ZERO DO 30 I = 1, BASE D2 = D2 + B2 30 CONTINUE GO TO 10 END IF *+ END WHILE * RETURN * * End of SLAMC4 * END * ************************************************************************ * SUBROUTINE SLAMC5( BETA, P, EMIN, IEEE, EMAX, RMAX ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments .. LOGICAL IEEE INTEGER BETA, EMAX, EMIN, P REAL RMAX * .. * * Purpose * ======= * * SLAMC5 attempts to compute RMAX, the largest machine floating-point * number, without overflow. It assumes that EMAX + abs(EMIN) sum * approximately to a power of 2. It will fail on machines where this * assumption does not hold, for example, the Cyber 205 (EMIN = -28625, * EMAX = 28718). It will also fail if the value supplied for EMIN is * too large (i.e. too close to zero), probably with overflow. * * Arguments * ========= * * BETA (input) INTEGER * The base of floating-point arithmetic. * * P (input) INTEGER * The number of base BETA digits in the mantissa of a * floating-point value. * * EMIN (input) INTEGER * The minimum exponent before (gradual) underflow. * * IEEE (input) LOGICAL * A logical flag specifying whether or not the arithmetic * system is thought to comply with the IEEE standard. * * EMAX (output) INTEGER * The largest exponent before overflow * * RMAX (output) REAL * The largest machine floating-point number. * * ===================================================================== * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) * .. * .. Local Scalars .. INTEGER EXBITS, EXPSUM, I, LEXP, NBITS, TRY, UEXP REAL OLDY, RECBAS, Y, Z * .. * .. External Functions .. REAL SLAMC3 EXTERNAL SLAMC3 * .. * .. Intrinsic Functions .. INTRINSIC MOD * .. * .. Executable Statements .. * * First compute LEXP and UEXP, two powers of 2 that bound * abs(EMIN). We then assume that EMAX + abs(EMIN) will sum * approximately to the bound that is closest to abs(EMIN). * (EMAX is the exponent of the required number RMAX). * LEXP = 1 EXBITS = 1 10 CONTINUE TRY = LEXP*2 IF( TRY.LE.( -EMIN ) ) THEN LEXP = TRY EXBITS = EXBITS + 1 GO TO 10 END IF IF( LEXP.EQ.-EMIN ) THEN UEXP = LEXP ELSE UEXP = TRY EXBITS = EXBITS + 1 END IF * * Now -LEXP is less than or equal to EMIN, and -UEXP is greater * than or equal to EMIN. EXBITS is the number of bits needed to * store the exponent. * IF( ( UEXP+EMIN ).GT.( -LEXP-EMIN ) ) THEN EXPSUM = 2*LEXP ELSE EXPSUM = 2*UEXP END IF * * EXPSUM is the exponent range, approximately equal to * EMAX - EMIN + 1 . * EMAX = EXPSUM + EMIN - 1 NBITS = 1 + EXBITS + P * * NBITS is the total number of bits needed to store a * floating-point number. * IF( ( MOD( NBITS, 2 ).EQ.1 ) .AND. ( BETA.EQ.2 ) ) THEN * * Either there are an odd number of bits used to store a * floating-point number, which is unlikely, or some bits are * not used in the representation of numbers, which is possible, * (e.g. Cray machines) or the mantissa has an implicit bit, * (e.g. IEEE machines, Dec Vax machines), which is perhaps the * most likely. We have to assume the last alternative. * If this is true, then we need to reduce EMAX by one because * there must be some way of representing zero in an implicit-bit * system. On machines like Cray, we are reducing EMAX by one * unnecessarily. * EMAX = EMAX - 1 END IF * IF( IEEE ) THEN * * Assume we are on an IEEE machine which reserves one exponent * for infinity and NaN. * EMAX = EMAX - 1 END IF * * Now create RMAX, the largest machine number, which should * be equal to (1.0 - BETA**(-P)) * BETA**EMAX . * * First compute 1.0 - BETA**(-P), being careful that the * result is less than 1.0 . * RECBAS = ONE / BETA Z = BETA - ONE Y = ZERO DO 20 I = 1, P Z = Z*RECBAS IF( Y.LT.ONE ) $ OLDY = Y Y = SLAMC3( Y, Z ) 20 CONTINUE IF( Y.GE.ONE ) $ Y = OLDY * * Now multiply by BETA**EMAX to get RMAX. * DO 30 I = 1, EMAX Y = SLAMC3( Y*BETA, ZERO ) 30 CONTINUE * RMAX = Y RETURN * * End of SLAMC5 * END LOGICAL FUNCTION LSAME( CA, CB ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1994 * * .. Scalar Arguments .. CHARACTER CA, CB * .. * * Purpose * ======= * * LSAME returns .TRUE. if CA is the same letter as CB regardless of * case. * * Arguments * ========= * * CA (input) CHARACTER*1 * CB (input) CHARACTER*1 * CA and CB specify the single characters to be compared. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ICHAR * .. * .. Local Scalars .. INTEGER INTA, INTB, ZCODE * .. * .. Executable Statements .. * * Test if the characters are equal * LSAME = CA.EQ.CB IF( LSAME ) $ RETURN * * Now test for equivalence if both characters are alphabetic. * ZCODE = ICHAR( 'Z' ) * * Use 'Z' rather than 'A' so that ASCII can be detected on Prime * machines, on which ICHAR returns a value with bit 8 set. * ICHAR('A') on Prime machines returns 193 which is the same as * ICHAR('A') on an EBCDIC machine. * INTA = ICHAR( CA ) INTB = ICHAR( CB ) * IF( ZCODE.EQ.90 .OR. ZCODE.EQ.122 ) THEN * * ASCII is assumed - ZCODE is the ASCII code of either lower or * upper case 'Z'. * IF( INTA.GE.97 .AND. INTA.LE.122 ) INTA = INTA - 32 IF( INTB.GE.97 .AND. INTB.LE.122 ) INTB = INTB - 32 * ELSE IF( ZCODE.EQ.233 .OR. ZCODE.EQ.169 ) THEN * * EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or * upper case 'Z'. * IF( INTA.GE.129 .AND. INTA.LE.137 .OR. $ INTA.GE.145 .AND. INTA.LE.153 .OR. $ INTA.GE.162 .AND. INTA.LE.169 ) INTA = INTA + 64 IF( INTB.GE.129 .AND. INTB.LE.137 .OR. $ INTB.GE.145 .AND. INTB.LE.153 .OR. $ INTB.GE.162 .AND. INTB.LE.169 ) INTB = INTB + 64 * ELSE IF( ZCODE.EQ.218 .OR. ZCODE.EQ.250 ) THEN * * ASCII is assumed, on Prime machines - ZCODE is the ASCII code * plus 128 of either lower or upper case 'Z'. * IF( INTA.GE.225 .AND. INTA.LE.250 ) INTA = INTA - 32 IF( INTB.GE.225 .AND. INTB.LE.250 ) INTB = INTB - 32 END IF LSAME = INTA.EQ.INTB * * RETURN * * End of LSAME * END DOUBLE PRECISION FUNCTION DLARND( IDIST, ISEED ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1994 * * .. Scalar Arguments .. INTEGER IDIST * .. * .. Array Arguments .. INTEGER ISEED( 4 ) * .. * * Purpose * ======= * * DLARND returns a random real number from a uniform or normal * distribution. * * Arguments * ========= * * IDIST (input) INTEGER * Specifies the distribution of the random numbers: * = 1: uniform (0,1) * = 2: uniform (-1,1) * = 3: normal (0,1) * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * Further Details * =============== * * This routine calls the auxiliary routine DLARAN to generate a random * real number from a uniform (0,1) distribution. The Box-Muller method * is used to transform numbers from a uniform to a normal distribution. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, TWO PARAMETER ( ONE = 1.0D+0, TWO = 2.0D+0 ) DOUBLE PRECISION TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION T1, T2 * .. * .. External Functions .. DOUBLE PRECISION DLARAN EXTERNAL DLARAN * .. * .. Intrinsic Functions .. INTRINSIC COS, LOG, SQRT * .. * .. Executable Statements .. * * Generate a real random number from a uniform (0,1) distribution * T1 = DLARAN( ISEED ) * IF( IDIST.EQ.1 ) THEN * * uniform (0,1) * DLARND = T1 ELSE IF( IDIST.EQ.2 ) THEN * * uniform (-1,1) * DLARND = TWO*T1 - ONE ELSE IF( IDIST.EQ.3 ) THEN * * normal (0,1) * T2 = DLARAN( ISEED ) DLARND = SQRT( -TWO*LOG( T1 ) )*COS( TWOPI*T2 ) END IF RETURN * * End of DLARND * END DOUBLE COMPLEX FUNCTION ZLARND( IDIST, ISEED ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1994 * * .. Scalar Arguments .. INTEGER IDIST * .. * .. Array Arguments .. INTEGER ISEED( 4 ) * .. * * Purpose * ======= * * ZLARND returns a random complex number from a uniform or normal * distribution. * * Arguments * ========= * * IDIST (input) INTEGER * Specifies the distribution of the random numbers: * = 1: real and imaginary parts each uniform (0,1) * = 2: real and imaginary parts each uniform (-1,1) * = 3: real and imaginary parts each normal (0,1) * = 4: uniformly distributed on the disc abs(z) <= 1 * = 5: uniformly distributed on the circle abs(z) = 1 * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * Further Details * =============== * * This routine calls the auxiliary routine DLARAN to generate a random * real number from a uniform (0,1) distribution. The Box-Muller method * is used to transform numbers from a uniform to a normal distribution. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE, TWO PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) DOUBLE PRECISION TWOPI PARAMETER ( TWOPI = 6.2831853071795864769252867663D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION T1, T2 * .. * .. External Functions .. DOUBLE PRECISION DLARAN EXTERNAL DLARAN * .. * .. Intrinsic Functions .. INTRINSIC DCMPLX, EXP, LOG, SQRT * .. * .. Executable Statements .. * * Generate a pair of real random numbers from a uniform (0,1) * distribution * T1 = DLARAN( ISEED ) T2 = DLARAN( ISEED ) * IF( IDIST.EQ.1 ) THEN * * real and imaginary parts each uniform (0,1) * ZLARND = DCMPLX( T1, T2 ) ELSE IF( IDIST.EQ.2 ) THEN * * real and imaginary parts each uniform (-1,1) * ZLARND = DCMPLX( TWO*T1-ONE, TWO*T2-ONE ) ELSE IF( IDIST.EQ.3 ) THEN * * real and imaginary parts each normal (0,1) * ZLARND = SQRT( -TWO*LOG( T1 ) )*EXP( DCMPLX( ZERO, TWOPI*T2 ) ) ELSE IF( IDIST.EQ.4 ) THEN * * uniform distribution on the unit disc abs(z) <= 1 * ZLARND = SQRT( T1 )*EXP( DCMPLX( ZERO, TWOPI*T2 ) ) ELSE IF( IDIST.EQ.5 ) THEN * * uniform distribution on the unit circle abs(z) = 1 * ZLARND = EXP( DCMPLX( ZERO, TWOPI*T2 ) ) END IF RETURN * * End of ZLARND * END DOUBLE PRECISION FUNCTION DLARAN( ISEED ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * February 29, 1992 * * .. Array Arguments .. INTEGER ISEED( 4 ) * .. * * Purpose * ======= * * DLARAN returns a random real number from a uniform (0,1) * distribution. * * Arguments * ========= * * ISEED (input/output) INTEGER array, dimension (4) * On entry, the seed of the random number generator; the array * elements must be between 0 and 4095, and ISEED(4) must be * odd. * On exit, the seed is updated. * * Further Details * =============== * * This routine uses a multiplicative congruential method with modulus * 2**48 and multiplier 33952834046453 (see G.S.Fishman, * 'Multiplicative congruential random number generators with modulus * 2**b: an exhaustive analysis for b = 32 and a partial analysis for * b = 48', Math. Comp. 189, pp 331-344, 1990). * * 48-bit integers are stored in 4 integer array elements with 12 bits * per element. Hence the routine is portable across machines with * integers of 32 bits or more. * * ===================================================================== * * .. Parameters .. INTEGER M1, M2, M3, M4 PARAMETER ( M1 = 494, M2 = 322, M3 = 2508, M4 = 2549 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) INTEGER IPW2 DOUBLE PRECISION R PARAMETER ( IPW2 = 4096, R = ONE / IPW2 ) * .. * .. Local Scalars .. INTEGER IT1, IT2, IT3, IT4 * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MOD * .. * .. Executable Statements .. * * multiply the seed by the multiplier modulo 2**48 * IT4 = ISEED( 4 )*M4 IT3 = IT4 / IPW2 IT4 = IT4 - IPW2*IT3 IT3 = IT3 + ISEED( 3 )*M4 + ISEED( 4 )*M3 IT2 = IT3 / IPW2 IT3 = IT3 - IPW2*IT2 IT2 = IT2 + ISEED( 2 )*M4 + ISEED( 3 )*M3 + ISEED( 4 )*M2 IT1 = IT2 / IPW2 IT2 = IT2 - IPW2*IT1 IT1 = IT1 + ISEED( 1 )*M4 + ISEED( 2 )*M3 + ISEED( 3 )*M2 + $ ISEED( 4 )*M1 IT1 = MOD( IT1, IPW2 ) * * return updated seed * ISEED( 1 ) = IT1 ISEED( 2 ) = IT2 ISEED( 3 ) = IT3 ISEED( 4 ) = IT4 * * convert 48-bit integer to a real number in the interval (0,1) * DLARAN = R*( DBLE( IT1 )+R*( DBLE( IT2 )+R*( DBLE( IT3 )+R* $ ( DBLE( IT4 ) ) ) ) ) RETURN * * End of DLARAN * END scalapack-2.0.2/BLACS/SRC/Bconfig.h000644 000766 000024 00000005033 11640652114 016705 0ustar00juliestaff000000 000000 /* * This file includes the standard C libraries, as well as system dependant * include files. All BLACS routines include this file. */ #ifndef BCONFIG_H #define BCONFIG_H 1 /* * Include files */ #include #include #include #include #include /* * These macros define the naming strategy needed for a fortran * routine to call a C routine, and whether to build so they may be * called from C or fortran. For the fortran call C interface, ADD_ assumes that * fortran calls expect C routines to have an underscore postfixed to the name * (Suns, and the Intel expect this). NOCHANGE indicates that fortran expects * the name called by fortran to be identical to that compiled by C * (AIX does this). UPCASE says it expects C routines called by fortran * to be in all upcase (CRAY wants this). The variable FORTRAN_CALL_C is always * set to one of these values. If the BLACS will be called from C, we define * INTFACE to be CALL_C, otherwise, it is set to FORTRAN_CALL_C. */ #define ADD_ 0 #define NOCHANGE 1 #define UPCASE 2 #define FCISF2C 3 #define C_CALL 4 #ifdef UpCase #define FORTRAN_CALL_C UPCASE #endif #ifdef NoChange #define FORTRAN_CALL_C NOCHANGE #endif #ifdef Add_ #define FORTRAN_CALL_C ADD_ #endif #ifdef FortranIsF2C #define FORTRAN_CALL_C FCISF2C #endif #ifndef FORTRAN_CALL_C #define FORTRAN_CALL_C ADD_ #endif #ifdef CallFromC #define INTFACE C_CALL #else #define INTFACE FORTRAN_CALL_C #endif /* * Uncomment these macro definitions, and substitute the topology of your * choice to vary the default topology (TOP = ' ') for broadcast and combines. #define DefBSTop '1' #define DefCombTop '1' */ /* * Uncomment this line if your MPI_Send provides a locally-blocking send */ /* #define SndIsLocBlk */ /* * Comment out the following line if your MPI does a data copy on every * non-contiguous send */ #ifndef NoMpiBuff #define MpiBuffGood #endif /* * If your MPI cannot form data types of zero length, uncomment the * following definition */ /* #define ZeroByteTypeBug */ /* * These macros set the timing and debug levels for the BLACS. The fastest * code is produced by setting both values to 0. Higher levels provide * more timing/debug information at the cost of performance. Present levels * of debug are: * 0 : No debug information * 1 : Mainly parameter checking. * * Present levels of timing are: * 0 : No timings taken */ #ifndef BlacsDebugLvl #define BlacsDebugLvl 0 #endif #ifndef BlacsTimingLvl #define BlacsTimingLvl 0 #endif #endif scalapack-2.0.2/BLACS/SRC/Bdef.h000644 000766 000024 00000036545 11745552113 016216 0ustar00juliestaff000000 000000 #ifndef BDEF_H #define BDEF_H 1 /* * Include the system dependant and user defined stuff */ #include "Bconfig.h" /* * Data type defining a scope for the BLACS */ typedef struct bLaCsScOpE BLACSSCOPE; struct bLaCsScOpE { MPI_Comm comm; int ScpId, MaxId, MinId; int Np, Iam; }; /* * Data type defining a context for the BLACS */ typedef struct bLaCsCoNtExT BLACSCONTEXT; struct bLaCsCoNtExT { BLACSSCOPE rscp, cscp, ascp, pscp; /* row, column, all, and pt2pt scopes */ BLACSSCOPE *scp; /* pointer to present scope */ int TopsRepeat; /* Use only repeatable topologies? */ int TopsCohrnt; /* Use only coherent topologies? */ int Nb_bs, Nr_bs; /* for bcast general tree and multiring tops */ int Nb_co, Nr_co; /* for combine general tree and multiring tops */ }; /* * Define the fortran data types COMPLEX*8 (SCOMPLEX) * and COMPLEX*16 (DCOMPLEX). */ typedef struct {double r, i;} DCOMPLEX; typedef struct {float r, i;} SCOMPLEX; /* * These variables will be defined to be MPI datatypes for complex and double * complex if we are using the C interface to MPI. If we use the fortran * interface, we need to declare the contants array. I'm too lazy to declare * these guys external in every file that needs them. */ #ifndef GlobalVars extern int *BI_COMM_WORLD; #endif /* * Definition of buffer type for BLACS' asynchronous operations */ typedef struct bLaCbUfF BLACBUFF; struct bLaCbUfF { char *Buff; /* send/recv buffer */ int Len; /* length of buffer in bytes */ int nAops; /* number of asynchronous operations out of buff */ MPI_Request *Aops; /* list of async. operations out of buff */ MPI_Datatype dtype; /* data type of buffer */ int N; /* number of elements of data type in buff */ BLACBUFF *prev, *next; /* pointer to the other BLACBUFF in queue */ }; /* * Pointer to the combine's vector-vector functions */ typedef void (*VVFUNPTR)(int, char *, char *); typedef void (*SDRVPTR)(BLACSCONTEXT *, int, int, BLACBUFF *); #define BI_DistType unsigned short #define BI_MpiDistType MPI_UNSIGNED_SHORT #define BUFFALIGN 8 /* force all buffers to 8 byte alignment */ #define BANYNODE MPI_ANY_SOURCE #define PT2PTID 9976 /* TAG used for point to point */ #define NOTINCONTEXT -1 /* Indicates node called gridmap, but not in grid */ #define MAXNCTXT 10 /* initial guess at max # of contexts */ #define MAXNSYSCTXT 10 /* initial guess at max # of system context */ #define AOPDONE MPI_REQUEST_NULL #define BUFWAIT 120 /* Length of time to wait for emergency buff */ /* * Error codes */ #define NORV 1 /* No receiver (only 1 proc in scoped op) */ #define NPOW2 2 /* Number of procs is not a power of 2 */ #define BADSCP 3 /* Scope not row, column or all */ /* * Data types */ #define INTEGER 3 #define SINGLE 4 #define DOUBLE 6 #define COMPLEX8 5 #define COMPLEX16 7 #define FULLCON 0 /* top is fully connected */ /* * Routine types */ #define RT_SD 1 #define RT_RV 2 #define RT_BS 3 #define RT_BR 4 #define RT_COMB 5 /* * Legal WHAT values for BLACS_SET */ #define SGET_SYSCONTXT 0 #define SGET_MSGIDS 1 #define SGET_DEBUGLVL 2 #define SGET_BLACSCONTXT 10 #define SGET_NR_BS 11 #define SGET_NB_BS 12 #define SGET_NR_CO 13 #define SGET_NB_CO 14 #define SGET_TOPSREPEAT 15 #define SGET_TOPSCOHRNT 16 /* * These are prototypes for error and warning functions -- I don't want * to prototype them in each routine. */ void BI_BlacsWarn(int ConTxt, int line, char *file, char *form, ...); void BI_BlacsErr(int ConTxt, int line, char *file, char *form, ...); int BI_ContxtNum(BLACSCONTEXT *ctxt); /* * If we've got an ANSI standard C compiler, we can use void pointers... */ #define BVOID void /* * ======================================================================== * PREPROCESSOR MACRO FUNCTIONS USED FOR OPTIMIZATION & CONVENIENCE * ======================================================================== */ #define Mlowcase(C) ( ((C) > 64 && (C) < 91) ? (C) | 32 : (C) ) /* * Slightly modified gridinfo substitute */ #define Mgridinfo(ctxt, Ng0, nprow0, npcol0, myrow0, mycol0)\ {\ (Ng0) = (ctxt)->ascp.Np;\ (nprow0) = (ctxt)->cscp.Np;\ (npcol0) = (ctxt)->rscp.Np;\ (myrow0) = (ctxt)->cscp.Iam;\ (mycol0) = (ctxt)->rscp.Iam;\ } /* * These routines return coordinates based on nodes number, or node number * based on coordinates. Those routines with v after the M return virtual * nodes numbers (i.e., in respect to the grid, not physical node numbers) * based on grid coordinates, or grid coordinates based on virtual node numbers. */ #define Mpcoord(ctxt, node, prow, pcol)\ {\ (prow) = (node) / (ctxt)->rscp.Np;\ (pcol) = (node) % (ctxt)->rscp.Np;\ } #define Mvpcoord(ctxt, node, prow, pcol) \ Mpcoord((ctxt), (node), (prow), (pcol)); #define Mkpnum(ctxt, prow, pcol) ( (prow)*(ctxt)->rscp.Np+(pcol) ) #define Mvkpnum(ctxt, prow, pcol) ( (prow)*(ctxt)->rscp.Np+(pcol) ) /* * This macro returns scoped message ID's. */ #define Mscopeid(ctxt) (ctxt)->scp->ScpId; \ if (++(ctxt)->scp->ScpId == (ctxt)->scp->MaxId) \ (ctxt)->scp->ScpId = (ctxt)->scp->MinId; /* * Get context, and check for validity if debug level is high */ #if (BlacsDebugLvl > 0) #define MGetConTxt(Context, ctxtptr)\ {\ extern BLACSCONTEXT **BI_MyContxts;\ extern int BI_MaxNCtxt;\ if ( ((Context) >= BI_MaxNCtxt) || ((Context) < 0) )\ BI_BlacsErr(-1, __LINE__, __FILE__, "Invalid context handle: %d",\ (Context));\ else if (BI_MyContxts[(Context)] == NULL)\ BI_BlacsErr(-1, __LINE__, __FILE__, "Invalid context, handle=%d",\ (Context));\ (ctxtptr) = BI_MyContxts[(Context)];\ } #else #define MGetConTxt(Context, ctxtptr)\ {\ extern BLACSCONTEXT **BI_MyContxts;\ (ctxtptr) = BI_MyContxts[(Context)];\ } #endif /* * This macro handles MPI errors */ #if(BlacsDebugLvl > 0) #define Mmpierror(ierr, rout, ctxt, line, file) \ { \ if ( (ierr) != BI_MPI_SUCCESS )\ BI_BlacsErr(BI_ContxtNum((ctxt)), (line), (file), \ "MPI error %d on call to %s", (ierr), (rout)); \ } #else #define Mmpierror(ierr, rout, ctxt, line, file) #endif /* * A small macro useful for debugging */ #define ErrPrint \ { \ extern int BI_Iam; \ fprintf(stderr, "%d: line %d of file %s\n", BI_Iam, __LINE__, __FILE__); \ } /* * These macros allow for the funky function declarations and character handling * needed on the CRAY to have a C routine callable from fortran */ #define F_VOID_FUNC void #define F_INT_FUNC int #define F_DOUBLE_FUNC double #if (INTFACE == C_CALL) #define F2C_CharTrans(c) *(c) #else #ifdef CRAY #define F2C_CharTrans(c) *( _fcdtocp((c)) ) #define F_CHAR _fcd #else #define F2C_CharTrans(c) *(c) #define F_CHAR char * #endif #endif /* * These macros allow for accessing values and addresses of parameters, which * will be pointers if we're using fortran, and values if we're using C. */ #if (INTFACE == C_CALL) #define Mpval(para) (para) #define Mpaddress(para) (&(para)) #define Mwalltime Cdwalltime00 #else #define Mpval(para) (*(para)) #define Mpaddress(para) (para) #define Mwalltime dwalltime00_ #endif /* * Real and complex absolute values */ #define Rabs(x) ( (x) < 0 ? (x) * -1 : (x) ) #define Cabs(z) ( (((z).i) < 0 ? ((z).i) * -1 : ((z).i)) + (((z).r) < 0 ? ((z).r) * -1 : ((z).r)) ) /* * Figures the length of packed trapezoidal matrix */ #define trsize(diag, m, n, bytes, length)\ {\ if ( (diag) == 'u' ) (length) = 1;\ else (length) = 0;\ if ( (m) > (n) )\ (length) = ( (n) * ( (m) - (n) ) + ( (n)*(n) ) - ( (n)*(n) )/2 +\ (n)/2 - (n) * (length) ) * (bytes);\ else\ (length) = ( (m) * ( (n) - (m) ) + ( (m)*(m) ) - ( (m)*(m) )/2 +\ (m)/2 - (m) * (length) ) * (bytes);\ } /* * These macros call the correct packing/unpacking routines */ #define BI_cmvcopy(m, n, A, lda, buff) \ BI_smvcopy(2*(m), (n), (float *) (A), 2*(lda), (float *) (buff)) #define BI_cvmcopy(m, n, A, lda, buff) \ BI_svmcopy(2*(m), (n), (float *) (A), 2*(lda), (float *) (buff)) #define BI_zmvcopy(m, n, A, lda, buff) \ BI_dmvcopy(2*(m), (n), (double *) (A), 2*(lda), (double *) (buff)) #define BI_zvmcopy(m, n, A, lda, buff) \ BI_dvmcopy(2*(m), (n), (double *) (A), 2*(lda), (double *) (buff)) /* * This macro avoids freeing types when the zero-byte workaround was applied */ #ifdef ZeroByteTypeBug #define BI_MPI_TYPE_FREE(t) (*(t) != MPI_BYTE ? MPI_Type_free(t) : 0) #else #define BI_MPI_TYPE_FREE(t) MPI_Type_free(t) #endif #if (FORTRAN_CALL_C == NOCHANGE) /* * These defines set up the naming scheme required to have a fortran * routine call a C routine (which is what the BLACS are written in) * for the following Fortran to C interface: * FORTRAN CALL C DECLARATION * call dgebs2d(...) void dgebs2d(...) */ /* * Support routines */ #define blacs_pinfo_ blacs_pinfo #define blacs_setup_ blacs_setup #define setpvmtids_ setpvmtids #define blacs_set_ blacs_set #define blacs_get_ blacs_get #define blacs_gridinit_ blacs_gridinit #define blacs_gridmap_ blacs_gridmap #define ksendid_ ksendid #define krecvid_ krecvid #define kbsid_ kbsid #define kbrid_ kbrid #define blacs_freebuff_ blacs_freebuff #define blacs_gridexit_ blacs_gridexit #define blacs_abort_ blacs_abort #define blacs_exit_ blacs_exit #define blacs_gridinfo_ blacs_gridinfo #define blacs_pnum_ blacs_pnum #define blacs_pcoord_ blacs_pcoord #define dcputime00_ dcputime00 #define dwalltime00_ dwalltime00 #define blacs_barrier_ blacs_barrier /* * Main, type dependent, routines */ #define igesd2d_ igesd2d #define igerv2d_ igerv2d #define igebs2d_ igebs2d #define igebr2d_ igebr2d #define itrsd2d_ itrsd2d #define itrrv2d_ itrrv2d #define itrbs2d_ itrbs2d #define itrbr2d_ itrbr2d #define igsum2d_ igsum2d #define igamx2d_ igamx2d #define igamn2d_ igamn2d #define sgesd2d_ sgesd2d #define sgerv2d_ sgerv2d #define sgebs2d_ sgebs2d #define sgebr2d_ sgebr2d #define strsd2d_ strsd2d #define strrv2d_ strrv2d #define strbs2d_ strbs2d #define strbr2d_ strbr2d #define sgsum2d_ sgsum2d #define sgamx2d_ sgamx2d #define sgamn2d_ sgamn2d #define dgesd2d_ dgesd2d #define dgerv2d_ dgerv2d #define dgebs2d_ dgebs2d #define dgebr2d_ dgebr2d #define dtrsd2d_ dtrsd2d #define dtrrv2d_ dtrrv2d #define dtrbs2d_ dtrbs2d #define dtrbr2d_ dtrbr2d #define dgsum2d_ dgsum2d #define dgamx2d_ dgamx2d #define dgamn2d_ dgamn2d #define cgesd2d_ cgesd2d #define cgerv2d_ cgerv2d #define cgebs2d_ cgebs2d #define cgebr2d_ cgebr2d #define ctrsd2d_ ctrsd2d #define ctrrv2d_ ctrrv2d #define ctrbs2d_ ctrbs2d #define ctrbr2d_ ctrbr2d #define cgsum2d_ cgsum2d #define cgamx2d_ cgamx2d #define cgamn2d_ cgamn2d #define zgesd2d_ zgesd2d #define zgerv2d_ zgerv2d #define zgebs2d_ zgebs2d #define zgebr2d_ zgebr2d #define ztrsd2d_ ztrsd2d #define ztrrv2d_ ztrrv2d #define ztrbs2d_ ztrbs2d #define ztrbr2d_ ztrbr2d #define zgsum2d_ zgsum2d #define zgamx2d_ zgamx2d #define zgamn2d_ zgamn2d #elif (FORTRAN_CALL_C == UPCASE) /* * These defines set up the naming scheme required to have a fortran * routine call a C routine (which is what the BLACS are written in) * for the following Fortran to C interface: * FORTRAN CALL C DECLARATION * call dgebs2d(...) void DGEBS2D(...) */ /* * Support routines */ #define blacs_pinfo_ BLACS_PINFO #define blacs_setup_ BLACS_SETUP #define setpvmtids_ SETPVMTIDS #define blacs_set_ BLACS_SET #define blacs_get_ BLACS_GET #define blacs_gridinit_ BLACS_GRIDINIT #define blacs_gridmap_ BLACS_GRIDMAP #define ksendid_ KSENDID #define krecvid_ KRECVID #define kbsid_ KBSID #define kbrid_ KBRID #define blacs_freebuff_ BLACS_FREEBUFF #define blacs_gridexit_ BLACS_GRIDEXIT #define blacs_abort_ BLACS_ABORT #define blacs_exit_ BLACS_EXIT #define blacs_gridinfo_ BLACS_GRIDINFO #define blacs_pnum_ BLACS_PNUM #define blacs_pcoord_ BLACS_PCOORD #define dcputime00_ DCPUTIME00 #define dwalltime00_ DWALLTIME00 #define blacs_barrier_ BLACS_BARRIER /* * Main, type dependent, routines */ #define igesd2d_ IGESD2D #define igerv2d_ IGERV2D #define igebs2d_ IGEBS2D #define igebr2d_ IGEBR2D #define itrsd2d_ ITRSD2D #define itrrv2d_ ITRRV2D #define itrbs2d_ ITRBS2D #define itrbr2d_ ITRBR2D #define igsum2d_ IGSUM2D #define igamx2d_ IGAMX2D #define igamn2d_ IGAMN2D #define sgesd2d_ SGESD2D #define sgerv2d_ SGERV2D #define sgebs2d_ SGEBS2D #define sgebr2d_ SGEBR2D #define strsd2d_ STRSD2D #define strrv2d_ STRRV2D #define strbs2d_ STRBS2D #define strbr2d_ STRBR2D #define sgsum2d_ SGSUM2D #define sgamx2d_ SGAMX2D #define sgamn2d_ SGAMN2D #define dgesd2d_ DGESD2D #define dgerv2d_ DGERV2D #define dgebs2d_ DGEBS2D #define dgebr2d_ DGEBR2D #define dtrsd2d_ DTRSD2D #define dtrrv2d_ DTRRV2D #define dtrbs2d_ DTRBS2D #define dtrbr2d_ DTRBR2D #define dgsum2d_ DGSUM2D #define dgamx2d_ DGAMX2D #define dgamn2d_ DGAMN2D #define cgesd2d_ CGESD2D #define cgerv2d_ CGERV2D #define cgebs2d_ CGEBS2D #define cgebr2d_ CGEBR2D #define ctrsd2d_ CTRSD2D #define ctrrv2d_ CTRRV2D #define ctrbs2d_ CTRBS2D #define ctrbr2d_ CTRBR2D #define cgsum2d_ CGSUM2D #define cgamx2d_ CGAMX2D #define cgamn2d_ CGAMN2D #define zgesd2d_ ZGESD2D #define zgerv2d_ ZGERV2D #define zgebs2d_ ZGEBS2D #define zgebr2d_ ZGEBR2D #define ztrsd2d_ ZTRSD2D #define ztrrv2d_ ZTRRV2D #define ztrbs2d_ ZTRBS2D #define ztrbr2d_ ZTRBR2D #define zgsum2d_ ZGSUM2D #define zgamx2d_ ZGAMX2D #define zgamn2d_ ZGAMN2D #elif (FORTRAN_CALL_C == FCISF2C) /* * These defines set up the naming scheme required to have a fortran * routine call a C routine (which is what the BLACS are written in) * for systems where the fortran "compiler" is actually f2c (a fortran * to C conversion utility). */ /* * Initialization routines */ #define blacs_pinfo_ blacs_pinfo__ #define blacs_setup_ blacs_setup__ #define blacs_set_ blacs_set__ #define blacs_get_ blacs_get__ #define blacs_gridinit_ blacs_gridinit__ #define blacs_gridmap_ blacs_gridmap__ /* * Destruction routines */ #define blacs_freebuff_ blacs_freebuff__ #define blacs_gridexit_ blacs_gridexit__ #define blacs_abort_ blacs_abort__ #define blacs_exit_ blacs_exit__ /* * Informational & misc. */ #define blacs_gridinfo_ blacs_gridinfo__ #define blacs_pnum_ blacs_pnum__ #define blacs_pcoord_ blacs_pcoord__ #define blacs_barrier_ blacs_barrier__ #endif #endif scalapack-2.0.2/BLACS/SRC/BI_Arecv.c000644 000766 000024 00000002234 11640652114 016743 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_Arecv(BLACSCONTEXT *ctxt, int src, int msgid, BLACBUFF *bp) { int i, info, errclass; info=MPI_Irecv(bp->Buff, bp->N, bp->dtype, src, msgid, ctxt->scp->comm, &bp->Aops[bp->nAops]); while(info != MPI_SUCCESS) { i=MPI_Error_class(info, &errclass); if ( (errclass != MPI_ERR_UNKNOWN) && (errclass != MPI_ERR_OTHER) && (errclass != MPI_ERR_INTERN) ) { Mmpierror(info, "MPI_Irecv", ctxt, __LINE__, __FILE__); BI_BlacsErr(BI_ContxtNum(ctxt), __LINE__, __FILE__, "MPI error %d on call to MPI_Irecv", info); } #if (BlacsDebugLvl > 0) else BI_BlacsWarn(BI_ContxtNum(ctxt), __LINE__, __FILE__, "MPI error %d assumed to mean out of non-blocking resources on call to MPI_Irecv", info); #endif info=MPI_Irecv(bp->Buff, bp->N, bp->dtype, src, msgid, ctxt->scp->comm, &bp->Aops[bp->nAops]); } bp->nAops++; /* * Signal if we need to use status to figure out true length of received message * We only need do this if we are doing our own buffering */ #ifndef MpiBuffGood if (bp->dtype == MPI_PACKED) bp->N = -bp->nAops; #endif } scalapack-2.0.2/BLACS/SRC/BI_ArgCheck.c000644 000766 000024 00000006710 11640652114 017355 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_ArgCheck(int ConTxt, int RoutType, char *routine, char scope, char uplo, char diag, int m, int n, int lda, int nprocs, int *prows, int *pcols) { #if (BlacsDebugLvl > 0) char *srcdest; int i=1, prow, pcol, Ng, nprow, npcol, myrow, mycol; BLACSCONTEXT *ctxt; MGetConTxt(ConTxt, ctxt); Mgridinfo(ctxt, Ng, nprow, npcol, myrow, mycol); if ( (scope != 'r') && (scope != 'c') && (scope != 'a') ) BI_BlacsErr(ConTxt, -1, routine, "Unknown scope, scope=%c", scope); if ( (uplo != 'u') && (uplo != 'l') ) { if (RoutType != RT_COMB) BI_BlacsWarn(ConTxt, -1, routine, "UPLO=%c, will be assumed to mean LOWER", uplo); else i = 0; /* combine aux, for rect. matrix */ } if ( (diag != 'u') && (diag != 'n') ) { if (i) BI_BlacsWarn(ConTxt, -1, routine, "DIAG=%c, will be assumed to mean NON-UNIT", diag); } if (m * n != 0) { if (m < 0) BI_BlacsErr(ConTxt, -1, routine, "Illegal number of rows, M=%d", m); if (n < 0) BI_BlacsErr(ConTxt, -1, routine, "Illegal number of columns, N=%d", n); if (lda < m) BI_BlacsWarn(ConTxt, -1, routine, "Illegal LDA, LDA=%d, M=%d; LDA assumed to be %d", lda, m, m); } if ( (RoutType == RT_RV) || (RoutType == RT_BR) ) srcdest = "SRC"; else srcdest = "DEST"; if (RoutType == RT_SD) { if ( (nprocs > Ng) || (nprocs < 0) ) BI_BlacsErr(ConTxt, -1, routine, "Trying to send to %d procs, but only %d in grid", nprocs, Ng); } for (i=0; i < nprocs; i++) { prow = prows[i]; pcol = pcols[i]; if ( (prow < 0) || (prow >= nprow) ) { if ( !((RoutType == RT_COMB) && (prow == -1)) ) BI_BlacsErr(ConTxt, -1, routine, "R%s out of range; R%s=%d, NPROW=%d", srcdest, srcdest, prow, nprow); } if ( (pcol < 0) || (pcol >= npcol) ) { if ( !((RoutType == RT_COMB) && (prow == -1)) ) BI_BlacsErr(ConTxt, -1, routine, "C%s out of range; C%s=%d, NPCOL=%d", srcdest, srcdest, pcol, npcol); } if (RoutType == RT_SD) /* point to point send */ { if ( (prow == myrow) && (pcol == mycol) ) BI_BlacsWarn(ConTxt, -1, routine, "Node sending message to itself"); } else if (RoutType == RT_RV) /* point to point send */ { if ( (prow == myrow) && (pcol == mycol) ) BI_BlacsWarn(ConTxt, -1, routine, "Node recving message from itself"); } else if (RoutType == RT_BR) /* broadcast/recv */ { if ( (prow == myrow) && (pcol == mycol) ) BI_BlacsErr(ConTxt, -1, routine, "Node tries to recv its own broadcast"); if (scope == 'r') { if (myrow != prow) BI_BlacsWarn(ConTxt, -1, routine, "Row broadcast: MYROW=%d, but RSRC=%d", myrow, prow); } else if (scope == 'c') { if (mycol != pcol) { BI_BlacsErr(ConTxt, -1, routine, "Column broadcast: MYCOL=%d, but CSRC=%d", mycol, pcol); } } } } #endif } scalapack-2.0.2/BLACS/SRC/BI_Asend.c000644 000766 000024 00000001701 11640652114 016733 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_Asend(BLACSCONTEXT *ctxt, int dest, int msgid, BLACBUFF *bp) { int i, info, errclass; info=MPI_Isend(bp->Buff, bp->N, bp->dtype, dest, msgid, ctxt->scp->comm, &bp->Aops[bp->nAops]); while(info != MPI_SUCCESS) { i=MPI_Error_class(info, &errclass); if ( (errclass != MPI_ERR_UNKNOWN) && (errclass != MPI_ERR_OTHER) && (errclass != MPI_ERR_INTERN) ) { Mmpierror(info, "MPI_Isend", ctxt, __LINE__, __FILE__); BI_BlacsErr(BI_ContxtNum(ctxt), __LINE__, __FILE__, "MPI error %d on call to MPI_Isend", info); } #if (BlacsDebugLvl > 0) else BI_BlacsWarn(BI_ContxtNum(ctxt), __LINE__, __FILE__, "MPI error %d assumed to mean out of non-blocking resources on call to MPI_Isend", info); #endif info=MPI_Isend(bp->Buff, bp->N, bp->dtype, dest, msgid, ctxt->scp->comm, &bp->Aops[bp->nAops]); } bp->nAops++; } scalapack-2.0.2/BLACS/SRC/BI_BeComb.c000644 000766 000024 00000006713 11640652114 017040 0ustar00juliestaff000000 000000 #include "Bdef.h" /* * The bidirectional exchange topology (BE) is specialized for dealing with * case where all nodes participating in the operation need to * receive the answer. It works best when # of nodes is some even * power of two. This topology is based on an algorithm presented by * Robert van de Geijn, et al. */ void BI_BeComb(BLACSCONTEXT *ctxt, BLACBUFF *bp, BLACBUFF *bp2, int N, VVFUNPTR Xvvop) /* * -- V1.1ALPHA (test version) BLACS routine -- * University of Tennessee, October 1, 1995 * Written by Clint Whaley. * * Purpose * ======= * Perform a element-by-element combine on vectors. * The answer will be left on all participating processes. Since this method * uses a hypercube communication pattern, the number of nodes participating * in the operation must be a power of 2 for it to perform efficiently. * * Arguments * ========= * CTXT (input) pointer to BLACSCONTEXT * The BLACS context where operation is taking place. * * BP (input/output) pointer to BLACBUFF. * BLACBUFF is a special data type used by the BLACS to control * buffers and the asynchronous operations coming out of them. * This BLACBUFF should have a buffer who's first N elements * contain the data to be combined. Additional space may be * required, depending upon what combine is being performed. * * BP2 (workspace) pointer to BLACBUFF. * This BLACBUFF is used to receive information for combining with * this process's information. * * N (input) int * The number of elements in the vector to be combined. * * Xvvop (input) pointer to typed operation function * Points to a typed function which performs the required operation * (e.g. summation) on the two N-element vectors. * * ------------------------------------------------------------------------ */ { void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Rsend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Arecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int Np, Iam, dest, msgid, Rmsgid, np2, bit, ierr; extern MPI_Status *BI_Stats; Np = ctxt->scp->Np; if (Np < 2) return; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); Rmsgid = Mscopeid(ctxt); for (np2=4; np2 < Np; np2 <<= 1); if (np2 > Np) np2 >>= 1; if (np2 != Np) { dest = (Iam ^ np2); if (Iam >= np2) /* I'm node beyond power of 2 */ { BI_Arecv(ctxt, dest, Rmsgid, bp); BI_Ssend(ctxt, dest, msgid, bp); BI_BuffIsFree(bp, 1); } else if (Iam < (Np^np2)) /* need to fan in contents of */ { /* non-power of 2 nodes */ BI_Srecv(ctxt, dest, msgid, bp2); Xvvop(N, bp->Buff, bp2->Buff); } } if (Iam < np2) { for (bit=1; (bit ^ np2); bit <<= 1) { dest = Iam ^ bit; ierr=MPI_Sendrecv(bp->Buff, bp->N, bp->dtype, dest, msgid, bp2->Buff, bp2->N, bp2->dtype, dest, msgid, ctxt->scp->comm, BI_Stats); Xvvop(N, bp->Buff, bp2->Buff); } /* * For nodes that are not part of the hypercube proper, we must * send data back. */ if (Iam < (Np^np2)) BI_Rsend(ctxt, (Iam ^ np2), Rmsgid, bp); } /* end if (nodes inside power of 2) */ } scalapack-2.0.2/BLACS/SRC/BI_BlacsAbort.c000644 000766 000024 00000000222 11640652114 017712 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_BlacsAbort(int ErrNo) { int ierr; fflush(stderr); fflush(stdout); ierr=MPI_Abort(MPI_COMM_WORLD, ErrNo); } scalapack-2.0.2/BLACS/SRC/BI_BlacsErr.c000644 000766 000024 00000001277 11640652114 017406 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_BlacsErr(int ConTxt, int line, char *file, char *form, ...) { #ifdef __STDC__ void BI_BlacsAbort(int ErrNo); #else void BI_BlacsAbort(); #endif extern int BI_Iam; int myrow, mycol; va_list argptr; char cline[100]; BLACSCONTEXT *ctxt; va_start(argptr, form); vsprintf(cline, form, argptr); va_end(argptr); if (ConTxt > -1) { MGetConTxt(ConTxt, ctxt); myrow = ctxt->cscp.Iam; mycol = ctxt->rscp.Iam; } else myrow = mycol = -1; fprintf(stderr, "BLACS ERROR '%s'\nfrom {%d,%d}, pnum=%d, Contxt=%d, on line %d of file '%s'.\n\n", cline, myrow, mycol, BI_Iam, ConTxt, line, file); BI_BlacsAbort(1); } scalapack-2.0.2/BLACS/SRC/BI_BlacsWarn.c000644 000766 000024 00000001144 11640652114 017556 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_BlacsWarn(int ConTxt, int line, char *file, char *form, ...) { extern int BI_Iam; extern BLACSCONTEXT **BI_MyContxts; int myrow, mycol; va_list argptr; char cline[100]; va_start(argptr, form); vsprintf(cline, form, argptr); va_end(argptr); if (ConTxt > -1) { myrow = BI_MyContxts[ConTxt]->cscp.Iam; mycol = BI_MyContxts[ConTxt]->rscp.Iam; } else myrow = mycol = -1; fprintf(stderr, "BLACS WARNING '%s'\nfrom {%d,%d}, pnum=%d, Contxt=%d, on line %d of file '%s'.\n\n", cline, myrow, mycol, BI_Iam, ConTxt, line, file); } scalapack-2.0.2/BLACS/SRC/BI_BuffIsFree.c000644 000766 000024 00000003116 11640652114 017663 0ustar00juliestaff000000 000000 #include "Bdef.h" int BI_BuffIsFree(BLACBUFF *bp, int Wait) /* * Check to see if buff is finished with async. operations. If Wait != 0, * wait for all async. operations to complete. */ { int i, info; extern MPI_Status *BI_Stats; extern int BI_Np; if (!Wait) { info=MPI_Testall(bp->nAops, bp->Aops, &i, BI_Stats); if (!i) { /* * If we are doing our own Packing, need to check true length of receive */ #ifndef MpiBuffGood /* * If we have an outstanding receive, make sure that when it * completes we correctly set bp->N, if required */ if (bp->N < 0) { if (bp->Aops[-bp->N-1] == MPI_REQUEST_NULL) { info=MPI_Get_count(&BI_Stats[(-bp->N-1)*MPI_STATUS_SIZE],MPI_PACKED, &i); if (i != MPI_UNDEFINED) bp->N = i; else BI_BlacsWarn(-1, __LINE__, __FILE__, "MPI_Get_count returned MPI_UNDEFINED.\n"); } } #endif return(0); } } else { info=MPI_Waitall(bp->nAops, bp->Aops, BI_Stats); } bp->nAops = 0; /* * If we are doing our own packing, need to check true length of receive */ #ifndef MpiBuffGood /* * If we had an outstanding receive, make sure that we correctly set bp->N, * if required */ if (bp->N < 0) { info=MPI_Get_count(&BI_Stats[(-bp->N-1)*MPI_STATUS_SIZE],MPI_PACKED, &i); if (i != MPI_UNDEFINED) bp->N = i; else BI_BlacsWarn(-1, __LINE__, __FILE__, "MPI_Get_count returned MPI_UNDEFINED.\n"); } #endif return(1); } scalapack-2.0.2/BLACS/SRC/BI_cMPI_amn.c000644 000766 000024 00000000315 11640652114 017324 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cMPI_amn(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_cvvamn(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_cvvamn(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_cMPI_amn2.c000644 000766 000024 00000000243 11640652114 017406 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cMPI_amn2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_cvvamn2(int, char *, char *); BI_cvvamn2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_cMPI_amx.c000644 000766 000024 00000000315 11640652114 017336 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cMPI_amx(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_cvvamx(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_cvvamx(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_cMPI_amx2.c000644 000766 000024 00000000243 11640652114 017420 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cMPI_amx2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_cvvamx2(int, char *, char *); BI_cvvamx2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_cMPI_sum.c000644 000766 000024 00000000240 11640652114 017352 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cMPI_sum(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_cvvsum(int, char *, char *); BI_cvvsum(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_ContxtNum.c000644 000766 000024 00000000612 11640652114 017640 0ustar00juliestaff000000 000000 #include "Bdef.h" int BI_ContxtNum(BLACSCONTEXT *ctxt) /* * Returns the integer ID of ctxt */ { int i; extern int BI_MaxNCtxt; extern BLACSCONTEXT **BI_MyContxts; if (ctxt == NULL) return(-1); for (i=0; i < BI_MaxNCtxt; i++) if (BI_MyContxts[i] == ctxt) break; if (i == BI_MaxNCtxt) BI_BlacsErr(-1, -1, "BLACS INTERNAL ROUTINE", "illegal context"); return(i); } scalapack-2.0.2/BLACS/SRC/BI_cvvamn.c000644 000766 000024 00000001354 11640652114 017177 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cvvamn(int N, char *vec1, char *vec2) { SCOMPLEX *v1=(SCOMPLEX*)vec1, *v2=(SCOMPLEX*)vec2; float diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(SCOMPLEX); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Cabs(v1[k]) - Cabs(v2[k]); if (diff > 0) { v1[k].r = v2[k].r; v1[k].i = v2[k].i; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k].r = v2[k].r; v1[k].i = v2[k].i; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_cvvamn2.c000644 000766 000024 00000001331 11640652114 017254 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cvvamn2(int N, char *vec1, char *vec2) { int r, i; float *v1=(float*)vec1, *v2=(float*)vec2; float diff; N *= 2; for (r=0, i=1; r != N; r += 2, i += 2) { diff = (Rabs(v1[r]) + Rabs(v1[i])) - (Rabs(v2[r]) + Rabs(v2[i])); if (diff > 0) { v1[r] = v2[r]; v1[i] = v2[i]; } else if (diff == 0) { if (v1[r] != v2[r]) { if (v1[r] < v2[r]) { v1[r] = v2[r]; v1[i] = v2[i]; } } else { if (v1[i] < v2[i]) { v1[r] = v2[r]; v1[i] = v2[i]; } } } } } scalapack-2.0.2/BLACS/SRC/BI_cvvamx.c000644 000766 000024 00000001354 11640652114 017211 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cvvamx(int N, char *vec1, char *vec2) { SCOMPLEX *v1=(SCOMPLEX*)vec1, *v2=(SCOMPLEX*)vec2; float diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(SCOMPLEX); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Cabs(v1[k]) - Cabs(v2[k]); if (diff < 0) { v1[k].r = v2[k].r; v1[k].i = v2[k].i; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k].r = v2[k].r; v1[k].i = v2[k].i; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_cvvamx2.c000644 000766 000024 00000001331 11640652114 017266 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cvvamx2(int N, char *vec1, char *vec2) { int r, i; float *v1=(float*)vec1, *v2=(float*)vec2; float diff; N *= 2; for (r=0, i=1; r != N; r += 2, i += 2) { diff = (Rabs(v1[r]) + Rabs(v1[i])) - (Rabs(v2[r]) + Rabs(v2[i])); if (diff < 0) { v1[r] = v2[r]; v1[i] = v2[i]; } else if (diff == 0) { if (v1[r] != v2[r]) { if (v1[r] < v2[r]) { v1[r] = v2[r]; v1[i] = v2[i]; } } else { if (v1[i] < v2[i]) { v1[r] = v2[r]; v1[i] = v2[i]; } } } } } scalapack-2.0.2/BLACS/SRC/BI_cvvsum.c000644 000766 000024 00000000256 11640652114 017230 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_cvvsum(int N, char *vec1, char *vec2) { float *v1=(float*)vec1, *v2=(float*)vec2; int k; N *=2; for (k=0; k < N; k++) v1[k] += v2[k]; } scalapack-2.0.2/BLACS/SRC/BI_dMPI_amn.c000644 000766 000024 00000000315 11640652114 017325 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dMPI_amn(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_dvvamn(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_dvvamn(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_dMPI_amn2.c000644 000766 000024 00000000243 11640652114 017407 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dMPI_amn2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_dvvamn2(int, char *, char *); BI_dvvamn2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_dMPI_amx.c000644 000766 000024 00000000315 11640652114 017337 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dMPI_amx(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_dvvamx(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_dvvamx(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_dMPI_amx2.c000644 000766 000024 00000000243 11640652114 017421 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dMPI_amx2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_dvvamx2(int, char *, char *); BI_dvvamx2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_dmvcopy.c000644 000766 000024 00000000771 11640652114 017370 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dmvcopy(int m, int n, double *A, int lda, double *buff) /* * Performs a matrix to vector copy (pack) for the data type double */ { int i, j; if ( (m == lda) || (n == 1) ) { m = n * m; for (i=0; i < m; i++) buff[i] = A[i]; } else if (m == 1) { for (j=0; j < n; j++) buff[j] = A[j*lda]; } else { for (j=0; j < n; j++) { for (i=0; i < m; i++) buff[i] = A[i]; A += lda; buff += m; } } } scalapack-2.0.2/BLACS/SRC/BI_dvmcopy.c000644 000766 000024 00000000775 11640652114 017374 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dvmcopy(int m, int n, double *A, int lda, double *buff) /* * performs an vector to matrix copy (unpack) for the data type double */ { int i, j; if ( (m == lda) || (n == 1) ) { m = n * m; for (i=0; i < m; i++) A[i] = buff[i]; } else if (m == 1) { for (j=0; j < n; j++) A[j*lda] = buff[j]; } else { for (j=0; j< n; j++) { for (i=0; i < m; i++) A[i] = buff[i]; A += lda; buff += m; } } } scalapack-2.0.2/BLACS/SRC/BI_dvvamn.c000644 000766 000024 00000001242 11640652114 017174 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dvvamn(int N, char *vec1, char *vec2) { double *v1=(double*)vec1, *v2=(double*)vec2; double diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(double); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff > 0) { v1[k] = v2[k]; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k] = v2[k]; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_dvvamn2.c000644 000766 000024 00000000473 11640652114 017263 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dvvamn2(int N, char *vec1, char *vec2) { int k; double *v1=(double*)vec1, *v2=(double*)vec2; double diff; for (k=0; k != N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff > 0) v1[k] = v2[k]; else if (diff == 0) if (v1[k] < v2[k]) v1[k] = v2[k]; } } scalapack-2.0.2/BLACS/SRC/BI_dvvamx.c000644 000766 000024 00000001242 11640652114 017206 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dvvamx(int N, char *vec1, char *vec2) { double *v1=(double*)vec1, *v2=(double*)vec2; double diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(double); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff < 0) { v1[k] = v2[k]; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k] = v2[k]; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_dvvamx2.c000644 000766 000024 00000000473 11640652114 017275 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dvvamx2(int N, char *vec1, char *vec2) { int k; double *v1=(double*)vec1, *v2=(double*)vec2; double diff; for (k=0; k != N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff < 0) v1[k] = v2[k]; else if (diff == 0) if (v1[k] < v2[k]) v1[k] = v2[k]; } } scalapack-2.0.2/BLACS/SRC/BI_dvvsum.c000644 000766 000024 00000000247 11640652114 017231 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_dvvsum(int N, char *vec1, char *vec2) { double *v1=(double*)vec1, *v2=(double*)vec2; int k; for (k=0; k < N; k++) v1[k] += v2[k]; } scalapack-2.0.2/BLACS/SRC/BI_EmergencyBuff.c000644 000766 000024 00000003050 11640652114 020421 0ustar00juliestaff000000 000000 #include "Bdef.h" /*************************************************************************** * If there is insufficient space to allocate a needed buffer, this * * routine is called. It moniters active buffers for the time defined by * * the user-changeable macro value BUFWAIT. If in that time no active * * buffer becomes inactive, a hang is assumed, and the grid is killed. * ***************************************************************************/ void BI_EmergencyBuff(int length) { void BI_UpdateBuffs(BLACBUFF *); char *cptr; int i, j; double Mwalltime(void); double t1; extern int BI_Np; extern BLACBUFF *BI_ReadyB, *BI_ActiveQ; j = sizeof(BLACBUFF); if (j % sizeof(MPI_Request)) j += sizeof(MPI_Request) - j % sizeof(MPI_Request); i = j + BI_Np*sizeof(MPI_Request); if (i % BUFFALIGN) i += BUFFALIGN - i % BUFFALIGN; t1 = Mwalltime(); while ( (BI_ActiveQ) && (Mwalltime() - t1 < BUFWAIT) && !(BI_ReadyB) ) { BI_UpdateBuffs(NULL); if (BI_ReadyB) { if (BI_ReadyB->Len < length) { free(BI_ReadyB); cptr = malloc(length + i); BI_ReadyB = (BLACBUFF *) cptr; if (BI_ReadyB) { BI_ReadyB->nAops = 0; BI_ReadyB->Aops = (MPI_Request *) &cptr[j]; BI_ReadyB->Buff = &cptr[i]; BI_ReadyB->Len = length; } } } } if (BI_ReadyB == NULL) { BI_BlacsErr(-1, __LINE__, __FILE__, "BLACS out of buffer space"); } } scalapack-2.0.2/BLACS/SRC/BI_GetBuff.c000644 000766 000024 00000006435 11640652114 017234 0ustar00juliestaff000000 000000 #include "Bdef.h" /*************************************************************************** * The mpi implements globally blocking sends. I.e., a send blocks until * * the dest. node issues a recv. The BLACS assume locally-blocking sends.* * Therefore, the BLACS must fake locally-blocking sends. To do this * * requires an indeterminate number of buffers and the use of * * non-blocking sends. However, it is very important that even though I * * provide a dynamic number of buffers, that getting these buffers does * * not take too long in the critical part of a send operation. * * Therefore, the buffer management is broken into two routines. * * * * Inside the BLACS there are two states a buffer may be in. If the buff * * is currently being used (for instance, an asynchronous send is coming * * from it), it is classified as an ACTIVE buffer, and is on the active * * buffer queue. Otherwise, a buffer is READY: it is not being used * * and is available for the next buffer operation. * * In order to avoid buffer proliferation, only one ready buffer is kept, * * and as active buffers become inactive they either become the ready * * buffer, or are freed. * * * * The first routine, BI_GetBuff, checks if the ready buffer is big enough * * to fulfill the buffer request. If not, the present ready buffer is * * is freed, and a new buffer of the required length is allocated. If * * the buffer is of sufficent size already, no action is taken. * * This routine is purposely very short, as it is called at the beginning * * of each broadcast/send operation. All participating nodes * * are waiting on the source node, so this routine must be very cheap. * * * * The second routine, BI_UpdateBuffs, moves the ready buffer to the active * * buffer queue (if needed). It also checks the entire active buffer * * queue to see if any have finished their operations. If so, they are * * are either moved to the ready buff, or freed. This routine is called * * AFTER the send/broadcast has been started, and thus I am free to make * * it a little more complex. * ***************************************************************************/ BLACBUFF *BI_GetBuff(int length) { void BI_EmergencyBuff(int length); char *cptr; int i, j; extern int BI_Np; extern BLACBUFF *BI_ReadyB; /* * If ready buffer already exists, and is big enough, return it. Otherwise, * free the buffer (if it exists) and get one of correct size */ if (BI_ReadyB) { if (BI_ReadyB->Len >= length) return(BI_ReadyB); else free(BI_ReadyB); } /* * Make sure all buffers aligned correctly */ j = sizeof(BLACBUFF); if (j % sizeof(MPI_Request)) j += sizeof(MPI_Request) - j % sizeof(MPI_Request); i = j + BI_Np*sizeof(MPI_Request); if (i % BUFFALIGN) i += BUFFALIGN - i % BUFFALIGN; cptr = malloc(i + length); BI_ReadyB = (BLACBUFF *) cptr; if (BI_ReadyB != NULL) { BI_ReadyB->nAops = 0; BI_ReadyB->Aops = (MPI_Request *) &cptr[j]; BI_ReadyB->Buff = &cptr[i]; BI_ReadyB->Len = length; } else BI_EmergencyBuff(length); return(BI_ReadyB); } scalapack-2.0.2/BLACS/SRC/BI_GetMpiGeType.c000644 000766 000024 00000001057 11640652114 020210 0ustar00juliestaff000000 000000 #include "Bdef.h" MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *ctxt, int m, int n, int lda, MPI_Datatype Dtype, int *N) { int info; MPI_Datatype GeType; /* * Some versions of mpich and its derivitives cannot handle 0 byte typedefs, * so we set type MPI_BYTE as a flag for a 0 byte message */ #ifdef ZeroByteTypeBug if ( (m < 1) || (n < 1) ) { *N = 0; return (MPI_BYTE); } #endif *N = 1; info=MPI_Type_vector(n, m, lda, Dtype, &GeType); info=MPI_Type_commit(&GeType); return(GeType); } scalapack-2.0.2/BLACS/SRC/BI_GetMpiTrType.c000644 000766 000024 00000005075 11640652114 020246 0ustar00juliestaff000000 000000 #include "Bdef.h" MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *ctxt, char uplo, char diag, int m, int n, int lda, MPI_Datatype Dtype, int *N) { BLACBUFF *BI_GetBuff(int); MPI_Datatype TrType; int info, start, i, k; int *len, *disp; BLACBUFF *bp; if (diag == 'u') start = 1; else start = 0; /* * Some versions of mpich and its derivitives cannot handle 0 byte typedefs, * so we set type MPI_BYTE as a flag for a 0 byte message */ #ifdef ZeroByteTypeBug if (m > n) i = n * (m-n) + (n*n) - (n*n)/2 + n/2 - n*start; else i = m * (n-m) + (m*m) - (m*m)/2 + m/2 - m*start; if (i < 1) { *N = 0; return (MPI_BYTE); } #endif *N = 1; /* * Get space to hold the length and displacement values */ bp = BI_GetBuff( 2 * n * sizeof(int) ); len = (int *) bp->Buff; disp = (int *) &bp->Buff[n*sizeof(int)]; if (m > n) { if (uplo == 'u') { k = m - n + 1 - start; for (i=0; i < n; i++) { len[i] = k + i; disp[i] = i*lda; } } else /* uplo = 'l' and m > n */ { k = m - start; lda++; len[0] = k; disp[0] = start; for (i=1; i < n; i++) { len[i] = k - i; disp[i] = disp[i-1] + lda; } } } else /* m <= n */ { if (uplo == 'u') { k = 1 - start; for (i=0; i < m; i++) { len[i] = i + k; disp[i] = i*lda; } for (; i < n; i++) { len[i] = m; disp[i] = i*lda; } } else /* uplo = 'l' and m <= n */ { k = n - m; for (i=0; i < k; i++) { len[i] = m; disp[i] = i*lda; } if (i < n) { k = n - start; len[i] = k - i; disp[i] = i*lda + start; lda++; for (i++; i < n; i++) { len[i] = k - i; disp[i] = disp[i-1] + lda; } } } } #ifdef T3ETrError /* * Get rid of 0-length segments to keep T3E happy */ for (i=0; i < n; i++) { if (len[i] == 0) { for (k=i+1; k < n; k++) { len[k-1] = len[k]; disp[k-1] = disp[k]; } if (n > 0) n--; i--; /* check new entry for 0-byte */ } } #endif i=MPI_Type_indexed(n, len, disp, Dtype, &TrType); i=MPI_Type_commit(&TrType); return(TrType); } scalapack-2.0.2/BLACS/SRC/BI_GlobalVars.c000644 000766 000024 00000001121 11640652114 017731 0ustar00juliestaff000000 000000 #include "Bdef.h" /* * Define global variables */ int BI_MaxNCtxt=0; /* Number of context pointers allocated */ int BI_MaxNSysCtxt=0; /* Number of system ctxt ptrs allocated */ int BI_Iam, BI_Np=(-1); /* My pnum, and # of procs in system */ BLACBUFF *BI_ReadyB=NULL; /* buffer that is ready for use */ BLACBUFF *BI_ActiveQ=NULL; /* pointer to start of active buffer queue */ BLACBUFF BI_AuxBuff; BLACSCONTEXT **BI_MyContxts=NULL; /* Array of pointers to my contexts */ MPI_Comm *BI_SysContxts=NULL; int *BI_COMM_WORLD=NULL; MPI_Status *BI_Stats=NULL; scalapack-2.0.2/BLACS/SRC/BI_HypBR.c000644 000766 000024 00000001016 11640652114 016664 0ustar00juliestaff000000 000000 #include "Bdef.h" int BI_HypBR(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send, int src) { void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); int relnode, bit, Np, Iam, msgid; Np = ctxt->scp->Np; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); for (bit=2; bit < Np; bit <<= 1); if (bit^Np) return(NPOW2); /* not a power of 2 */ relnode = Iam ^ src; BI_Srecv(ctxt, BANYNODE, msgid, bp); for(bit=1; (bit^Np); bit <<= 1) if (bit > relnode) send(ctxt, Iam^bit, msgid, bp); return(0); } scalapack-2.0.2/BLACS/SRC/BI_HypBS.c000644 000766 000024 00000000650 11640652114 016670 0ustar00juliestaff000000 000000 #include "Bdef.h" int BI_HypBS(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send) { int bit, Np, Iam, msgid; Np = ctxt->scp->Np; if (Np < 2) return(NORV); Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); for (bit=2; bit < Np; bit <<= 1); if (bit^Np) return(NPOW2); /* not a power of 2 */ for(bit=1; (bit^Np); bit <<= 1) send(ctxt, (Iam^bit), msgid, bp); return(0); /* error-free return */ } scalapack-2.0.2/BLACS/SRC/BI_IdringBR.c000644 000766 000024 00000000607 11640652114 017345 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_IdringBR(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send, int src, int step) { void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); int Np, Iam, msgid, dest; Np = ctxt->scp->Np; Iam = ctxt->scp->Iam; dest = (Np + Iam + step) % Np; msgid = Mscopeid(ctxt); BI_Srecv(ctxt, BANYNODE, msgid, bp); if (dest != src) send(ctxt, dest, msgid, bp); } scalapack-2.0.2/BLACS/SRC/BI_IdringBS.c000644 000766 000024 00000000411 11640652114 017337 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_IdringBS(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send, int step) { int Np, Iam, msgid; Np = ctxt->scp->Np; if (Np < 2) return; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); send(ctxt, (Np+Iam+step)%Np, msgid, bp); } scalapack-2.0.2/BLACS/SRC/BI_iMPI_amn.c000644 000766 000024 00000000315 11640652114 017332 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_iMPI_amn(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_ivvamn(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_ivvamn(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_iMPI_amn2.c000644 000766 000024 00000000243 11640652114 017414 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_iMPI_amn2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_ivvamn2(int, char *, char *); BI_ivvamn2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_iMPI_amx.c000644 000766 000024 00000000315 11640652114 017344 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_iMPI_amx(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_ivvamx(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_ivvamx(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_iMPI_amx2.c000644 000766 000024 00000000243 11640652114 017426 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_iMPI_amx2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_ivvamx2(int, char *, char *); BI_ivvamx2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_imvcopy.c000644 000766 000024 00000000760 11640652114 017373 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_imvcopy(int m, int n, int *A, int lda, int *buff) /* * Performs a matrix to vector copy (pack) for the data type int */ { int i, j; if ( (m == lda) || (n == 1) ) { m = n * m; for (i=0; i < m; i++) buff[i] = A[i]; } else if (m == 1) { for (j=0; j < n; j++) buff[j] = A[j*lda]; } else { for (j=0; j < n; j++) { for (i=0; i < m; i++) buff[i] = A[i]; A += lda; buff += m; } } } scalapack-2.0.2/BLACS/SRC/BI_ivmcopy.c000644 000766 000024 00000000764 11640652114 017377 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_ivmcopy(int m, int n, int *A, int lda, int *buff) /* * performs an vector to matrix copy (unpack) for the data type int */ { int i, j; if ( (m == lda) || (n == 1) ) { m = n * m; for (i=0; i < m; i++) A[i] = buff[i]; } else if (m == 1) { for (j=0; j < n; j++) A[j*lda] = buff[j]; } else { for (j=0; j< n; j++) { for (i=0; i < m; i++) A[i] = buff[i]; A += lda; buff += m; } } } scalapack-2.0.2/BLACS/SRC/BI_ivvamn.c000644 000766 000024 00000001223 11640652114 017200 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_ivvamn(int N, char *vec1, char *vec2) { int *v1=(int*)vec1, *v2=(int*)vec2; int diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(int); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff > 0) { v1[k] = v2[k]; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k] = v2[k]; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_ivvamn2.c000644 000766 000024 00000000460 11640652114 017264 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_ivvamn2(int N, char *vec1, char *vec2) { int k; int *v1=(int*)vec1, *v2=(int*)vec2; int diff; for (k=0; k != N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff > 0) v1[k] = v2[k]; else if (diff == 0) if (v1[k] < v2[k]) v1[k] = v2[k]; } } scalapack-2.0.2/BLACS/SRC/BI_ivvamx.c000644 000766 000024 00000001223 11640652114 017212 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_ivvamx(int N, char *vec1, char *vec2) { int *v1=(int*)vec1, *v2=(int*)vec2; int diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(int); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff < 0) { v1[k] = v2[k]; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k] = v2[k]; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_ivvamx2.c000644 000766 000024 00000000457 11640652114 017304 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_ivvamx2(int N, char *vec1, char *vec2) { int k; int *v1=(int*)vec1, *v2=(int*)vec2; int diff; for (k=0; k != N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff < 0) v1[k] = v2[k]; else if (diff == 0) if (v1[k] < v2[k]) v1[k] = v2[k]; } } scalapack-2.0.2/BLACS/SRC/BI_ivvsum.c000644 000766 000024 00000000236 11640652114 017234 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_ivvsum(int N, char *vec1, char *vec2) { int *v1=(int*)vec1, *v2=(int*)vec2; int k; for (k=0; k < N; k++) v1[k] += v2[k]; } scalapack-2.0.2/BLACS/SRC/BI_MpathBR.c000644 000766 000024 00000002537 11640652114 017206 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_MpathBR(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send, int src, int npaths) { void BI_Arecv(BLACSCONTEXT *, int, int, BLACBUFF *); int BI_BuffIsFree(BLACBUFF *, int); int pathlen; /* the minimal length of each path */ int mydist; /* my distance from src */ int faredge; /* node at far end of path */ int lastlong; /* distance to node on end of last path with extra node */ int Np, Iam, msgid, Np_1, dest; msgid = Mscopeid(ctxt); BI_Arecv(ctxt, BANYNODE, msgid, bp); Np = ctxt->scp->Np; Iam = ctxt->scp->Iam; Np_1 = Np - 1; if (npaths == FULLCON) npaths = Np_1; if (npaths > 0) { dest = (Iam+1) % Np; mydist = (Np + Iam - src) % Np; } else { dest = (Np_1+Iam) % Np; mydist = (Np + src - Iam) % Np; npaths = -npaths; } /* * Make sure npaths is cool */ if (npaths > Np_1) npaths = Np_1; pathlen = Np_1 / npaths; lastlong = (Np_1%npaths) * (pathlen+1); if (lastlong) { if (mydist <= lastlong) faredge = ((mydist-1)/(pathlen+1)+1)*(pathlen+1); else faredge = ((lastlong-1)/(pathlen+1)+1) * (pathlen+1) + ((mydist-lastlong-1)/pathlen + 1) * pathlen; } else faredge = ((mydist-1)/pathlen + 1) * pathlen; BI_BuffIsFree(bp, 1); /* wait for recv to complete */ if (mydist < faredge) send(ctxt, dest, msgid, bp); } scalapack-2.0.2/BLACS/SRC/BI_MpathBS.c000644 000766 000024 00000002402 11640652114 017176 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_MpathBS(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send, int npaths) { int pathlen; /* the length of each path */ int dist; /* the distance to the node closest to src on each path */ int pdest; /* part of dest calculation -- saves unneeded ops */ int lastlong; /* number of paths with extra node */ int Np, Iam, msgid, Np_1, dir; Np = ctxt->scp->Np; if (Np < 2) return; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); Np_1 = Np - 1; if (npaths == FULLCON) npaths = Np_1; if (npaths > 0) /* paths are increasing rings */ { pdest = Iam; dir = 1; } else /* paths are decreasing rings */ { pdest = Np + Iam; dir = -1; npaths = -npaths; } /* * Ensure npaths is correct */ if (npaths > Np_1) npaths = Np_1; pathlen = Np_1 / npaths; /* * Loop over all long paths (paths with an extra node), if there are any */ lastlong = (Np_1 % npaths) * (pathlen+1); /* last node in long ring */ for (dist=1; dist < lastlong; dist += pathlen+1) send(ctxt, (pdest+dir*dist)%Np, msgid, bp); /* * Loop over all normal length paths */ while (dist < Np) { send(ctxt, (pdest+dir*dist)%Np, msgid, bp); dist += pathlen; } } scalapack-2.0.2/BLACS/SRC/BI_MringComb.c000644 000766 000024 00000004454 11640652114 017566 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_MringComb(BLACSCONTEXT *ctxt, BLACBUFF *bp, BLACBUFF *bp2, int N, VVFUNPTR Xvvop, int dest, int nrings) { void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); int Np, Iam, msgid, i, inc, mysrc, mydest, Np_1; int mydist, ringlen, myring; int nearedge, faredge; /* edge closest and farthest from dest */ int REBS; /* Is result leave-on-all? */ Np = ctxt->scp->Np; if (Np < 2) return; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); if (REBS = (dest == -1)) dest = 0; if (nrings > 0) { mydist = (Np + dest - Iam) % Np; inc = 1; } else { mydist = (Np + Iam - dest) % Np; inc = -1; nrings = -nrings; } Np_1 = Np - 1; if (nrings > Np_1) nrings = Np_1; /* * If I'm not the destination */ if (Iam != dest) { ringlen = Np_1 / nrings; myring = (mydist-1) / ringlen; if (myring >= nrings) myring = nrings - 1; nearedge = (myring*ringlen) + 1; faredge = nearedge + ringlen - 1; if (myring == nrings-1) faredge += Np_1 % nrings; if (mydist == nearedge) mydest = dest; else mydest = (Np + Iam + inc) % Np; if (mydist != faredge) { BI_Srecv(ctxt, (Np + Iam - inc) % Np, msgid, bp2); Xvvop(N, bp->Buff, bp2->Buff); } BI_Ssend(ctxt, mydest, msgid, bp); if (REBS) BI_MpathBR(ctxt, bp, BI_Ssend, dest, nrings); } /* * If I'm the destination process */ else { if (!ctxt->TopsRepeat) { for(i=nrings; i; i--) { BI_Srecv(ctxt, BANYNODE, msgid, bp2); Xvvop(N, bp->Buff, bp2->Buff); } } else { ringlen = Np_1 / nrings; if (inc == 1) mysrc = (Np + Iam - 1) % Np; else mysrc = (Iam + 1) % Np; for(i=nrings; i; i--) { BI_Srecv(ctxt, mysrc, msgid, bp2); Xvvop(N, bp->Buff, bp2->Buff); if (inc == 1) mysrc = (Np + mysrc - ringlen) % Np; else mysrc = (mysrc + ringlen) % Np; } } if (REBS) BI_MpathBS(ctxt, bp, BI_Ssend, nrings); } } /* end BI_MringComb */ scalapack-2.0.2/BLACS/SRC/BI_Pack.c000644 000766 000024 00000002544 11640652114 016565 0ustar00juliestaff000000 000000 #include "Bdef.h" BLACBUFF *BI_Pack(BLACSCONTEXT *ctxt,BVOID *A,BLACBUFF *bp,MPI_Datatype Dtype) { BLACBUFF *BI_GetBuff(int); int i, info, one=1; MPI_Aint eltsiz; #ifdef ZeroByteTypeBug char *cptr; extern BLACBUFF BI_AuxBuff; extern int BI_Np; #endif /* * Some versions of mpich and its derivitives cannot handle 0 byte typedefs, * so we have set MPI_BYTE as a flag for a 0 byte message */ #ifdef ZeroByteTypeBug if (Dtype == MPI_BYTE) { info = sizeof(BLACBUFF); if (info % sizeof(MPI_Request)) info += sizeof(MPI_Request) - info % sizeof(MPI_Request); i = info + BI_Np*sizeof(MPI_Request); if (i % BUFFALIGN) i += BUFFALIGN - i % BUFFALIGN; cptr = malloc(i); if (cptr) { bp = (BLACBUFF *) cptr; bp->Len = bp->N = bp->nAops = 0; bp->Aops = (MPI_Request *) &cptr[info]; bp->Buff = (char *) &bp->Len; bp->dtype = MPI_BYTE; return(bp); } else BI_BlacsErr(BI_ContxtNum(ctxt), __LINE__, __FILE__, "Not enough memory to allocate 0 byte buffer\n"); } #endif if (bp == NULL) { info=MPI_Pack_size(one, Dtype, ctxt->scp->comm, &i); bp = BI_GetBuff(i); } i = 0; info=MPI_Pack(A, one, Dtype, bp->Buff, bp->Len, &i, ctxt->scp->comm); bp->dtype = MPI_PACKED; bp->N = i; return(bp); } scalapack-2.0.2/BLACS/SRC/BI_Rsend.c000644 000766 000024 00000000270 11640652114 016754 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_Rsend(BLACSCONTEXT *ctxt, int dest, int msgid, BLACBUFF *bp) { int info; info=MPI_Rsend(bp->Buff, bp->N, bp->dtype, dest, msgid, ctxt->scp->comm); } scalapack-2.0.2/BLACS/SRC/BI_sMPI_amn.c000644 000766 000024 00000000315 11640652114 017344 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_sMPI_amn(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_svvamn(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_svvamn(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_sMPI_amn2.c000644 000766 000024 00000000243 11640652114 017426 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_sMPI_amn2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_svvamn2(int, char *, char *); BI_svvamn2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_sMPI_amx.c000644 000766 000024 00000000315 11640652114 017356 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_sMPI_amx(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_svvamx(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_svvamx(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_sMPI_amx2.c000644 000766 000024 00000000243 11640652114 017440 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_sMPI_amx2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_svvamx2(int, char *, char *); BI_svvamx2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_smvcopy.c000644 000766 000024 00000000766 11640652114 017413 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_smvcopy(int m, int n, float *A, int lda, float *buff) /* * Performs a matrix to vector copy (pack) for the data type float */ { int i, j; if ( (m == lda) || (n == 1) ) { m = n * m; for (i=0; i < m; i++) buff[i] = A[i]; } else if (m == 1) { for (j=0; j < n; j++) buff[j] = A[j*lda]; } else { for (j=0; j < n; j++) { for (i=0; i < m; i++) buff[i] = A[i]; A += lda; buff += m; } } } scalapack-2.0.2/BLACS/SRC/BI_Srecv.c000644 000766 000024 00000001165 11640652114 016767 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_Srecv(BLACSCONTEXT *ctxt, int src, int msgid, BLACBUFF *bp) { int i, info; extern MPI_Status *BI_Stats; info=MPI_Recv(bp->Buff, bp->N, bp->dtype, src, msgid, ctxt->scp->comm,BI_Stats); /* * If we are doing our own buffering, need to determine the true length of * the message just received */ #ifndef MpiBuffGood if (bp->dtype == MPI_PACKED) { info=MPI_Get_count(BI_Stats, MPI_PACKED, &i); if (i != MPI_UNDEFINED) bp->N = i; else BI_BlacsWarn(BI_ContxtNum(ctxt), __LINE__, __FILE__, "MPI_Get_count returned MPI_UNDEFINED.\n"); } #endif } scalapack-2.0.2/BLACS/SRC/BI_SringBR.c000644 000766 000024 00000001276 11640652114 017216 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_SringBR(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send, int src) { void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); int mydist; /* my distance from source */ int Np, Iam, msgid, rightedge; Np = ctxt->scp->Np; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); mydist = (Np + Iam - src) % Np; rightedge = Np/2; BI_Srecv(ctxt, BANYNODE, msgid, bp); /* * If I'm between source & right edge of split ring, send to right */ if (mydist < rightedge) send(ctxt, (Iam+1)%Np, msgid, bp); /* * If I'm between source and left edge of split ring, send to left */ else if (mydist > rightedge+1) send(ctxt, (Np+Iam-1)%Np, msgid, bp); } scalapack-2.0.2/BLACS/SRC/BI_SringBS.c000644 000766 000024 00000000462 11640652114 017213 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_SringBS(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send) { int Np, Iam, msgid; Np = ctxt->scp->Np; if (Np < 2) return; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); send(ctxt, (Iam + 1)%Np, msgid, bp); if (Np > 2) send(ctxt, (Np + Iam - 1)%Np, msgid, bp); } scalapack-2.0.2/BLACS/SRC/BI_Ssend.c000644 000766 000024 00000000265 11640652114 016761 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_Ssend(BLACSCONTEXT *ctxt, int dest, int msgid, BLACBUFF *bp) { int info; info=MPI_Send(bp->Buff, bp->N, bp->dtype, dest, msgid, ctxt->scp->comm); } scalapack-2.0.2/BLACS/SRC/BI_svmcopy.c000644 000766 000024 00000000772 11640652114 017410 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_svmcopy(int m, int n, float *A, int lda, float *buff) /* * performs an vector to matrix copy (unpack) for the data type float */ { int i, j; if ( (m == lda) || (n == 1) ) { m = n * m; for (i=0; i < m; i++) A[i] = buff[i]; } else if (m == 1) { for (j=0; j < n; j++) A[j*lda] = buff[j]; } else { for (j=0; j< n; j++) { for (i=0; i < m; i++) A[i] = buff[i]; A += lda; buff += m; } } } scalapack-2.0.2/BLACS/SRC/BI_svvamn.c000644 000766 000024 00000001235 11640652114 017215 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_svvamn(int N, char *vec1, char *vec2) { float *v1=(float*)vec1, *v2=(float*)vec2; float diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(float); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff > 0) { v1[k] = v2[k]; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k] = v2[k]; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_svvamn2.c000644 000766 000024 00000000467 11640652114 017305 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_svvamn2(int N, char *vec1, char *vec2) { int k; float *v1=(float*)vec1, *v2=(float*)vec2; float diff; for (k=0; k != N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff > 0) v1[k] = v2[k]; else if (diff == 0) if (v1[k] < v2[k]) v1[k] = v2[k]; } } scalapack-2.0.2/BLACS/SRC/BI_svvamx.c000644 000766 000024 00000001235 11640652114 017227 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_svvamx(int N, char *vec1, char *vec2) { float *v1=(float*)vec1, *v2=(float*)vec2; float diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(float); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff < 0) { v1[k] = v2[k]; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k] = v2[k]; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_svvamx2.c000644 000766 000024 00000000467 11640652114 017317 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_svvamx2(int N, char *vec1, char *vec2) { int k; float *v1=(float*)vec1, *v2=(float*)vec2; float diff; for (k=0; k != N; k++) { diff = Rabs(v1[k]) - Rabs(v2[k]); if (diff < 0) v1[k] = v2[k]; else if (diff == 0) if (v1[k] < v2[k]) v1[k] = v2[k]; } } scalapack-2.0.2/BLACS/SRC/BI_svvsum.c000644 000766 000024 00000000244 11640652114 017245 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_svvsum(int N, char *vec1, char *vec2) { float *v1=(float*)vec1, *v2=(float*)vec2; int k; for (k=0; k < N; k++) v1[k] += v2[k]; } scalapack-2.0.2/BLACS/SRC/BI_TransDist.c000644 000766 000024 00000002563 11640652114 017623 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_TransDist(BLACSCONTEXT *ctxt, char scope, int m, int n, int *rA, int *cA, int ldrc, BI_DistType *dist, int rdest, int cdest) /* * This routine translates distances (offsets from the destination node), * stored in location dist, into row and column coordinates. */ { int i, j, k, dest; int Ng, nprow, npcol, myrow, mycol; Mgridinfo(ctxt, Ng, nprow, npcol, myrow, mycol); if (rdest == -1) rdest = cdest = 0; switch (scope) { case 'r': for (j=0; j < n; j++) { for (i=0; i < m; i++) { rA[i] = myrow; cA[i] = (int) (cdest + dist[i]) % npcol; } rA += ldrc; cA += ldrc; dist += m; } break; case 'c': for (j=0; j < n; j++) { for (i=0; i < m; i++) { rA[i] = (int) (rdest + dist[i]) % nprow; cA[i] = mycol; } rA += ldrc; cA += ldrc; dist += m; } break; case 'a': dest = Mvkpnum(ctxt, rdest, cdest); for (j=0; j < n; j++) { for (i=0; i < m; i++) { k = (int) (dest + dist[i]) % Ng; /* figure node number */ Mvpcoord(ctxt, k, rA[i], cA[i]); /* figure node coordinates */ } rA += ldrc; cA += ldrc; dist += m; } } } scalapack-2.0.2/BLACS/SRC/BI_TransUserComm.c000644 000766 000024 00000000561 11640652114 020446 0ustar00juliestaff000000 000000 #include "Bdef.h" MPI_Comm BI_TransUserComm(int Ucomm, int Np, int *pmap) { MPI_Comm bcomm, ucomm; MPI_Group bgrp, ugrp; int i; ucomm = MPI_Comm_f2c(Ucomm); i=MPI_Comm_group(ucomm, &ugrp); i=MPI_Group_incl(ugrp, Np, pmap, &bgrp); i=MPI_Comm_create(ucomm, bgrp, &bcomm); i=MPI_Group_free(&ugrp); i=MPI_Group_free(&bgrp); return(bcomm); } scalapack-2.0.2/BLACS/SRC/BI_TreeBR.c000644 000766 000024 00000001675 11640652114 017036 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_TreeBR(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send, int src, int nbranches) { void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); int Np, Iam, msgid, i, j; int mydist; /* my distance from src */ int destdist; /* the distance of the destination node */ Np = ctxt->scp->Np; if (Np < 2) return; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); mydist = (Np + Iam - src) % Np; /* * Go up to first step of tree where I send data to other nodes */ for (i=nbranches; i < Np; i *= nbranches); for (i /= nbranches; (mydist%i); i /= nbranches); BI_Srecv(ctxt, BANYNODE, msgid, bp); /* * While I need to send data to others */ while ( (i > 1) && !(mydist%i) ) { i /= nbranches; j = 1; do { destdist = mydist + j*i; if (destdist < Np) send(ctxt, (src+destdist)%Np, msgid, bp); } while(++j < nbranches); } } /* end BI_TreeBR */ scalapack-2.0.2/BLACS/SRC/BI_TreeBS.c000644 000766 000024 00000004376 11640652114 017040 0ustar00juliestaff000000 000000 #include "Bdef.h" /* * Tree_bs/br is a algorithm that does a broadcast send/recv such that the * communication pattern is a tree with an arbitrary number of branches. * The following two pairs of graphs give different ways of viewing the same * algorithm. The first pair shows the trees as they should be visualized * when examining the algorithm. The second pair are isomorphic graphs of * of the first, which show the actual pattern of data movement. * Note that a tree broadcast with NBRANCHES = 2 is isomorphic with a * hypercube broadcast (however, it does not require the nodes be a * power of two to work). * * TREE BROADCAST, NBRANCHES = 2 * TREE BROADCAST, NBRANCHES = 3 * * * i=4 &______________ * * | \ * * i=2 &______ &______ * i=3 &______________________ * | \ | \ * | \ \ * i=1 &__ &__ &__ &__ * i=1 &______ &______ &__ * | \ | \ | \ | \ * | \ \ | \ \ | \ * 0 1 2 3 4 5 6 7 * 0 1 2 3 4 5 6 7 * * * ISOMORPHIC GRAPHS OF ABOVE, SHOWN IN MORE FAMILIAR TERMS: * * 0 0 * _________|_________ ___________|____________ * / | \ / | | \ * 4 2 1 6 3 2 1 * / \ | | / \ * 6 5 3 7 4 5 * | * 7 */ void BI_TreeBS(BLACSCONTEXT *ctxt, BLACBUFF *bp, SDRVPTR send, int nbranches) { int Np, Iam, msgid, i, j; int destdist; /* the distance of the destination node */ Np = ctxt->scp->Np; if (Np < 2) return; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); for (i=nbranches; i < Np; i*=nbranches); for (i /= nbranches; i > 0; i /= nbranches) { j = 1; do { destdist = i*j; if (destdist < Np) send(ctxt, (destdist+Iam)%Np, msgid, bp); } while(++j < nbranches); } } /* end BI_TreeBS */ scalapack-2.0.2/BLACS/SRC/BI_TreeComb.c000644 000766 000024 00000011774 11640652114 017414 0ustar00juliestaff000000 000000 #include "Bdef.h" /* * This topology supports trees with arbitrary numbers of branches at * each step. The following pictures show the tree that should be visualized * when examining the algorithm. * * TREE GLOBAL OP, NBRANCHES = 2 * TREE GLOBAL OP, NBRANCHES = 3 * * * i=4 &______________ * * | \ * * i=2 &______ &______ * i=3 &______________________ * | \ | \ * | \ \ * i=1 &__ &__ &__ &__ * i=1 &______ &______ &__ * | \ | \ | \ | \ * | \ \ | \ \ | \ * 0 1 2 3 4 5 6 7 * 0 1 2 3 4 5 6 7 */ void BI_TreeComb(BLACSCONTEXT *ctxt, BLACBUFF *bp, BLACBUFF *bp2, int N, VVFUNPTR Xvvop, int dest, int nbranches) /* * -- V1.1ALPHA (test version) BLACS routine -- * University of Tennessee, October 1, 1995 * Written by Clint Whaley. * * Purpose * ======= * Perform a element-by-element combine on vectors. * If rdest1 = -1, the answer will be left on all participating processes. * Otherwise, only the process at grid coordinates {rdest1, cdest1} will * have the final answer. Other Processes will have intermediate (useless) * values. * * Arguments * ========= * CTXT (input) pointer to BLACSCONTEXT * The BLACS context where operation is taking place. * * BP (input/output) pointer to BLACBUFF. * BLACBUFF is a special data type used by the BLACS to control * buffers and the asynchronous operations coming out of them. * This BLACBUFF should have a buffer who's first N elements * contain the data to be combined. Additional space may be * required, depending upon what combine is being performed. * * BP2 (workspace) pointer to BLACBUFF. * This BLACBUFF is used to receive information for combining with * this process's information. * * DEST (input) int * Node to receive answer. If DEST == -1, all nodes in receive * the answer. * * N (input) int * The number of elements in the vector. N >= 0. * * Xvvop (input) pointer to typed operation function * Points to a typed function which performs the required operation * (e.g. summation) on the two N-element vectors. * * NBRANCHES (input) int * Indicates the degree of the tree to use (see picture above). * * ------------------------------------------------------------------------ */ { void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Rsend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Arecv(BLACSCONTEXT *, int, int, BLACBUFF *); int Np, Iam, msgid, Rmsgid, i, j; int nrcvs=0; /* Number of ReCeiVeS to do */ int REBS; /* should info be RE-BroadcaSt? */ int rightedge; /* right-most receiving node */ int mydist; /* my distance from destination node */ int dist; int src; /* Used if we must force repeatability */ Np = ctxt->scp->Np; if (Np < 2) return; Iam = ctxt->scp->Iam; msgid = Mscopeid(ctxt); Rmsgid = Mscopeid(ctxt); if (REBS = (dest == -1)) dest = 0; mydist = (Np + Iam - dest) % Np; if (REBS) { dist = mydist; if (mydist != 0) BI_Arecv(ctxt, BANYNODE, Rmsgid, bp); } if (nbranches == FULLCON) nbranches = Np; rightedge = Np - 1 - (Np-1)%nbranches; for (i=1; (i < Np); i *= nbranches) { if (mydist%nbranches) /* nodes that send to other nodes */ { BI_Ssend(ctxt, (dest + (mydist-mydist%nbranches)*i)%Np, msgid, bp); break; /* I'm done */ } else { if (mydist != rightedge) nrcvs = nbranches - 1; else nrcvs = (Np + i - 1) / i - rightedge - 1; mydist /= nbranches; rightedge /= nbranches; rightedge -= (rightedge % nbranches); if (!ctxt->TopsRepeat) { for (j=nrcvs; j; j--) { BI_Srecv(ctxt, BANYNODE, msgid, bp2); Xvvop(N, bp->Buff, bp2->Buff); } } else { src = (Iam + i) % Np; for (j=nrcvs; j; j--) { BI_Srecv(ctxt, src, msgid, bp2); Xvvop(N, bp->Buff, bp2->Buff); src = (src + i) % Np; } } } } /* * Broadcast answer to everyone if RDEST == -1 */ if (REBS) { mydist = dist; for (i=2; i < Np; i <<= 1); if (mydist > 0) BI_BuffIsFree(bp, 1); while (i > 1) { if ( !(mydist%i) ) { i >>= 1; dist = mydist + i; if (dist < Np) BI_Rsend(ctxt, dist, Rmsgid, bp); } else i >>= 1; } } } /* end BI_TreeComb */ scalapack-2.0.2/BLACS/SRC/BI_Unpack.c000644 000766 000024 00000000670 11640652114 017126 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_Unpack(BLACSCONTEXT *ctxt, BVOID *A, BLACBUFF *bp, MPI_Datatype Dtype) { int i=0, info, one=1; /* * Some versions of mpich and its derivitives cannot handle 0 byte typedefs, * so we have set MPI_BYTE as a flag for a 0 byte message */ #ifdef ZeroByteTypeBug if (Dtype == MPI_BYTE) return; #endif info=MPI_Unpack(bp->Buff, bp->Len, &i, A, one, Dtype, ctxt->scp->comm); info=MPI_Type_free(&Dtype); } scalapack-2.0.2/BLACS/SRC/BI_UpdateBuffs.c000644 000766 000024 00000002522 11640652114 020113 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_UpdateBuffs(BLACBUFF *Newbp) { int BI_BuffIsFree(BLACBUFF *, int); BLACBUFF *bp, *bp2; extern BLACBUFF *BI_ReadyB, *BI_ActiveQ; if (Newbp) { if (BI_ActiveQ == NULL) BI_ActiveQ = Newbp->prev = Newbp; else { BI_ActiveQ->prev->next = Newbp; Newbp->prev = BI_ActiveQ->prev; BI_ActiveQ->prev = Newbp; } Newbp->next = NULL; if (Newbp == BI_ReadyB) BI_ReadyB = NULL; } /* * See if any active buffers are ready for reuse. */ for (bp=BI_ActiveQ; bp != NULL; bp = bp2) { bp2 = bp->next; if ( BI_BuffIsFree(bp, 0) ) /* if all of buff's Aops are done */ { /* * Remove bp from BI_ActiveQ -- update pointers */ if (bp->next) bp->next->prev = bp->prev; else BI_ActiveQ->prev = bp->prev; if (bp != BI_ActiveQ) bp->prev->next = bp->next; else BI_ActiveQ = BI_ActiveQ->next; /* * If no ready buffer, inactive buff becomes ready */ if (BI_ReadyB == NULL) BI_ReadyB = bp; /* * If inactive buff bigger than present ready buff, release ready, * and inactive buff becomes ready */ else if (BI_ReadyB->Len < bp->Len) { free(BI_ReadyB); BI_ReadyB = bp; } /* * If ready buffer exists and is bigger than inactive buff, * free inactive buff */ else free(bp); } } } /* end BI_UpdateBuffs */ scalapack-2.0.2/BLACS/SRC/BI_zMPI_amn.c000644 000766 000024 00000000315 11640652114 017353 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zMPI_amn(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_zvvamn(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_zvvamn(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_zMPI_amn2.c000644 000766 000024 00000000243 11640652114 017435 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zMPI_amn2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_zvvamn2(int, char *, char *); BI_zvvamn2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_zMPI_amx.c000644 000766 000024 00000000315 11640652114 017365 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zMPI_amx(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_zvvamx(int, char *, char *); extern BLACBUFF BI_AuxBuff; BI_zvvamx(BI_AuxBuff.Len, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_zMPI_amx2.c000644 000766 000024 00000000243 11640652114 017447 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zMPI_amx2(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_zvvamx2(int, char *, char *); BI_zvvamx2(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_zMPI_sum.c000644 000766 000024 00000000240 11640652114 017401 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zMPI_sum(void *in, void *inout, int *N, MPI_Datatype *dtype) { void BI_zvvsum(int, char *, char *); BI_zvvsum(*N, inout, in); } scalapack-2.0.2/BLACS/SRC/BI_zvvamn.c000644 000766 000024 00000001355 11640652114 017227 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zvvamn(int N, char *vec1, char *vec2) { DCOMPLEX *v1=(DCOMPLEX*)vec1, *v2=(DCOMPLEX*)vec2; double diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(DCOMPLEX); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Cabs(v1[k]) - Cabs(v2[k]); if (diff > 0) { v1[k].r = v2[k].r; v1[k].i = v2[k].i; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k].r = v2[k].r; v1[k].i = v2[k].i; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_zvvamn2.c000644 000766 000024 00000001335 11640652114 017307 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zvvamn2(int N, char *vec1, char *vec2) { int r, i; double *v1=(double*)vec1, *v2=(double*)vec2; double diff; N *= 2; for (r=0, i=1; r != N; r += 2, i += 2) { diff = (Rabs(v1[r]) + Rabs(v1[i])) - (Rabs(v2[r]) + Rabs(v2[i])); if (diff > 0) { v1[r] = v2[r]; v1[i] = v2[i]; } else if (diff == 0) { if (v1[r] != v2[r]) { if (v1[r] < v2[r]) { v1[r] = v2[r]; v1[i] = v2[i]; } } else { if (v1[i] < v2[i]) { v1[r] = v2[r]; v1[i] = v2[i]; } } } } } scalapack-2.0.2/BLACS/SRC/BI_zvvamx.c000644 000766 000024 00000001355 11640652114 017241 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zvvamx(int N, char *vec1, char *vec2) { DCOMPLEX *v1=(DCOMPLEX*)vec1, *v2=(DCOMPLEX*)vec2; double diff; BI_DistType *dist1, *dist2; int i, k; k = N * sizeof(DCOMPLEX); i = k % sizeof(BI_DistType); if (i) k += sizeof(BI_DistType) - i; dist1 = (BI_DistType *) &vec1[k]; dist2 = (BI_DistType *) &vec2[k]; for (k=0; k < N; k++) { diff = Cabs(v1[k]) - Cabs(v2[k]); if (diff < 0) { v1[k].r = v2[k].r; v1[k].i = v2[k].i; dist1[k] = dist2[k]; } else if (diff == 0) { if (dist1[k] > dist2[k]) { v1[k].r = v2[k].r; v1[k].i = v2[k].i; dist1[k] = dist2[k]; } } } } scalapack-2.0.2/BLACS/SRC/BI_zvvamx2.c000644 000766 000024 00000001335 11640652114 017321 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zvvamx2(int N, char *vec1, char *vec2) { int r, i; double *v1=(double*)vec1, *v2=(double*)vec2; double diff; N *= 2; for (r=0, i=1; r != N; r += 2, i += 2) { diff = (Rabs(v1[r]) + Rabs(v1[i])) - (Rabs(v2[r]) + Rabs(v2[i])); if (diff < 0) { v1[r] = v2[r]; v1[i] = v2[i]; } else if (diff == 0) { if (v1[r] != v2[r]) { if (v1[r] < v2[r]) { v1[r] = v2[r]; v1[i] = v2[i]; } } else { if (v1[i] < v2[i]) { v1[r] = v2[r]; v1[i] = v2[i]; } } } } } scalapack-2.0.2/BLACS/SRC/BI_zvvsum.c000644 000766 000024 00000000261 11640652114 017253 0ustar00juliestaff000000 000000 #include "Bdef.h" void BI_zvvsum(int N, char *vec1, char *vec2) { double *v1=(double*)vec1, *v2=(double*)vec2; int k; N *=2; for (k=0; k < N; k++) v1[k] += v2[k]; } scalapack-2.0.2/BLACS/SRC/blacs2sys_.c000644 000766 000024 00000001445 11640652114 017400 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) MPI_Comm Cblacs2sys_handle(int BlacsCtxt) #else int blacs2sys_handle_(int *BlacsCtxt) #endif { #if (INTFACE == C_CALL) int i[2]; extern int BI_MaxNSysCtxt; extern MPI_Comm *BI_SysContxts; if (BI_COMM_WORLD == NULL) Cblacs_pinfo(i, &i[1]); if ( (BlacsCtxt >= BI_MaxNSysCtxt) || (BlacsCtxt < 0) ) { BI_BlacsErr(-1, __LINE__, __FILE__, "No system context corresponding to BLACS system context handle %d\n", BlacsCtxt); } else if (BI_SysContxts[BlacsCtxt] == MPI_COMM_NULL) { BI_BlacsErr(-1, __LINE__, __FILE__, "No system context corresponding to BLACS system context handle %d\n", BlacsCtxt); } return(BI_SysContxts[BlacsCtxt]); #else return(*BlacsCtxt); #endif } scalapack-2.0.2/BLACS/SRC/blacs_abort_.c000644 000766 000024 00000001063 11640652114 017742 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_abort(int ConTxt, int ErrNo) #else F_VOID_FUNC blacs_abort_(int *ConTxt, int *ErrNo) #endif { void Cblacs_gridinfo(int, int *, int *, int *, int *); void BI_BlacsAbort(int ErrNo); int nprow, npcol, myrow, mycol; extern int BI_Iam; Cblacs_gridinfo(Mpval(ConTxt), &nprow, &npcol, &myrow, &mycol); fprintf(stderr, "{%d,%d}, pnum=%d, Contxt=%d, killed other procs, exiting with error #%d.\n\n", myrow, mycol, BI_Iam, Mpval(ConTxt), Mpval(ErrNo)); BI_BlacsAbort(Mpval(ErrNo)); } scalapack-2.0.2/BLACS/SRC/blacs_barr_.c000644 000766 000024 00000001016 11640652114 017557 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_barrier(int ConTxt, char *scope) #else F_VOID_FUNC blacs_barrier_(int *ConTxt, F_CHAR scope) #endif { char tscope; int ierr; BLACSCONTEXT *ctxt; tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); MGetConTxt(Mpval(ConTxt), ctxt); switch(tscope) { case 'r': MPI_Barrier(ctxt->rscp.comm); break; case 'c': MPI_Barrier(ctxt->cscp.comm); break; case 'a': MPI_Barrier(ctxt->ascp.comm); break; } } scalapack-2.0.2/BLACS/SRC/blacs_exit_.c000644 000766 000024 00000001712 11640652114 017605 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_exit(int NotDone) #else F_VOID_FUNC blacs_exit_(int *NotDone) #endif { void Cblacs_gridexit(int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); BLACBUFF *bp; extern BLACBUFF *BI_ReadyB, *BI_ActiveQ, BI_AuxBuff; int i; extern int BI_MaxNCtxt, BI_Np; extern BLACSCONTEXT **BI_MyContxts; /* * Destroy all contexts */ for (i=0; i < BI_MaxNCtxt; i++) if (BI_MyContxts[i]) Cblacs_gridexit(i); free(BI_MyContxts); if (BI_ReadyB) free(BI_ReadyB); while (BI_ActiveQ != NULL) { bp = BI_ActiveQ; BI_BuffIsFree(bp, 1); /* wait for async sends to complete */ BI_ActiveQ = bp->next; free(bp); } free (BI_AuxBuff.Aops); /* * Reset parameters to initial values */ BI_MaxNCtxt = 0; BI_MyContxts = NULL; BI_Np = -1; if (!Mpval(NotDone)) { MPI_Finalize(); } BI_ReadyB = NULL; } scalapack-2.0.2/BLACS/SRC/blacs_free_.c000644 000766 000024 00000000771 11640652114 017561 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_freebuff(int ConTxt, int Wait) #else F_VOID_FUNC blacs_freebuff_(int *ConTxt, int *Wait) #endif { void BI_UpdateBuffs(BLACBUFF *); int BI_BuffIsFree(BLACBUFF *, int); extern BLACBUFF *BI_ReadyB, *BI_ActiveQ; if (Mpval(Wait)) /* wait for all buffers to be done */ { while (BI_ActiveQ != NULL) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(NULL); if (BI_ReadyB) { free(BI_ReadyB); BI_ReadyB = NULL; } } scalapack-2.0.2/BLACS/SRC/blacs_get_.c000644 000766 000024 00000003404 11640652114 017413 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_get(int ConTxt, int what, int *val) #else F_VOID_FUNC blacs_get_(int *ConTxt, int *what, int *val) #endif { int Csys2blacs_handle(MPI_Comm); int ierr, *iptr; int comm; BLACSCONTEXT *ctxt; switch( Mpval(what) ) { case SGET_SYSCONTXT: if (BI_COMM_WORLD == NULL) Cblacs_pinfo(val, &ierr); #if (INTFACE == C_CALL) *val = Csys2blacs_handle(MPI_COMM_WORLD); #else *val = *BI_COMM_WORLD; #endif break; case SGET_MSGIDS: if (BI_COMM_WORLD == NULL) Cblacs_pinfo(val, &val[1]); iptr = &val[1]; ierr=MPI_Attr_get(MPI_COMM_WORLD, MPI_TAG_UB, (BVOID **) &iptr,val); val[0] = 0; val[1] = *iptr; break; case SGET_DEBUGLVL: *val = BlacsDebugLvl; break; case SGET_BLACSCONTXT: MGetConTxt(Mpval(ConTxt), ctxt); #if (INTFACE == C_CALL) *val = Csys2blacs_handle(ctxt->pscp.comm); #else /* if user called the fortran interface to the BLACS */ *val = MPI_Comm_c2f(ctxt->pscp.comm); #endif break; case SGET_NR_BS: MGetConTxt(Mpval(ConTxt), ctxt); *val = ctxt->Nr_bs; break; case SGET_NB_BS: MGetConTxt(Mpval(ConTxt), ctxt); *val = ctxt->Nb_bs - 1; break; case SGET_NR_CO: MGetConTxt(Mpval(ConTxt), ctxt); *val = ctxt->Nr_co; break; case SGET_NB_CO: MGetConTxt(Mpval(ConTxt), ctxt); *val = ctxt->Nb_co - 1; break; case SGET_TOPSREPEAT: MGetConTxt(Mpval(ConTxt), ctxt); *val = ctxt->TopsRepeat; break; case SGET_TOPSCOHRNT: MGetConTxt(Mpval(ConTxt), ctxt); *val = ctxt->TopsCohrnt; break; default: BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "Unknown WHAT (%d)", Mpval(what)); } } scalapack-2.0.2/BLACS/SRC/blacs_grid_.c000644 000766 000024 00000001461 11640652114 017562 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_gridexit(int ConTxt) #else F_VOID_FUNC blacs_gridexit_(int *ConTxt) #endif { int i; BLACSCONTEXT *ctxt; extern int BI_MaxNCtxt; extern BLACSCONTEXT **BI_MyContxts; if ( (Mpval(ConTxt) < 0) || (Mpval(ConTxt) >= BI_MaxNCtxt) ) BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Trying to exit non-existent context"); if (BI_MyContxts[Mpval(ConTxt)] == NULL) BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Trying to exit an already freed context"); MGetConTxt(Mpval(ConTxt), ctxt); /* * Destroy context */ MPI_Comm_free(&ctxt->pscp.comm); MPI_Comm_free(&ctxt->ascp.comm); MPI_Comm_free(&ctxt->rscp.comm); MPI_Comm_free(&ctxt->cscp.comm); free(ctxt); BI_MyContxts[Mpval(ConTxt)] = NULL; } scalapack-2.0.2/BLACS/SRC/blacs_info_.c000644 000766 000024 00000001477 11640652114 017577 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_gridinfo(int ConTxt, int *nprow, int *npcol, int *myrow, int *mycol) #else F_VOID_FUNC blacs_gridinfo_(int *ConTxt, int *nprow, int *npcol, int *myrow, int *mycol) #endif { extern BLACSCONTEXT **BI_MyContxts; extern int BI_MaxNCtxt; BLACSCONTEXT *ctxt; /* * Make sure context handle is in range */ if ( (Mpval(ConTxt) >= 0) && (Mpval(ConTxt) < BI_MaxNCtxt) ) { /* * Make sure context is still defined */ ctxt = BI_MyContxts[Mpval(ConTxt)]; if (ctxt != NULL) { *nprow = ctxt->cscp.Np; *npcol = ctxt->rscp.Np; *myrow = ctxt->cscp.Iam; *mycol = ctxt->rscp.Iam; } else *nprow = *npcol = *myrow = *mycol = -1; } else *nprow = *npcol = *myrow = *mycol = -1; } scalapack-2.0.2/BLACS/SRC/blacs_init_.c000644 000766 000024 00000002007 11645634736 017614 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_gridinit(int *ConTxt, char *order, int nprow, int npcol) #else F_VOID_FUNC blacs_gridinit_(int *ConTxt, F_CHAR order, int *nprow, int *npcol) #endif { void Cblacs_gridmap(int *, int *, int, int, int); int *tmpgrid, *iptr; int i, j; /* * Grid can be row- or column-major natural ordering when blacs_gridinit is * called. Define a tmpgrid to reflect this, and call blacs_gridmap to * set it up */ iptr = tmpgrid = (int*) malloc( Mpval(nprow)*Mpval(npcol)*sizeof(*tmpgrid) ); if (Mlowcase(F2C_CharTrans(order)) == 'c') { i = Mpval(npcol) * Mpval(nprow); for (j=0; j < i; j++) iptr[j] = j; } else { for (j=0; j < Mpval(npcol); j++) { for (i=0; i < Mpval(nprow); i++) iptr[i] = i * Mpval(npcol) + j; iptr += Mpval(nprow); } } #if (INTFACE == C_CALL) Cblacs_gridmap(ConTxt, tmpgrid, nprow, nprow, npcol); #else blacs_gridmap_(ConTxt, tmpgrid, nprow, nprow, npcol); #endif free(tmpgrid); } scalapack-2.0.2/BLACS/SRC/blacs_map_.c000644 000766 000024 00000007666 11640652114 017427 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_gridmap(int *ConTxt, int *usermap, int ldup, int nprow0, int npcol0) #else F_VOID_FUNC blacs_gridmap_(int *ConTxt, int *usermap, int *ldup, int *nprow0, int *npcol0) #endif { void Cblacs_pinfo(int *, int *); void Cblacs_get(int, int, int *); MPI_Comm BI_TransUserComm(int, int, int *); MPI_Comm Cblacs2sys_handle(int); int info, i, j, Iam, *iptr; int myrow, mycol, nprow, npcol, Ng; BLACSCONTEXT *ctxt, **tCTxts; MPI_Comm comm, tcomm; MPI_Group grp, tgrp; extern BLACSCONTEXT **BI_MyContxts; extern BLACBUFF BI_AuxBuff; extern int BI_Iam, BI_Np, BI_MaxNCtxt; extern MPI_Status *BI_Stats; /* * If first call to blacs_gridmap */ if (BI_MaxNCtxt == 0) { Cblacs_pinfo(&BI_Iam, &BI_Np); BI_AuxBuff.nAops = 0; BI_AuxBuff.Aops = (MPI_Request*)malloc(BI_Np*sizeof(*BI_AuxBuff.Aops)); BI_Stats = (MPI_Status *) malloc(BI_Np * sizeof(MPI_Status)); } nprow = Mpval(nprow0); npcol = Mpval(npcol0); Ng = nprow * npcol; if ( (Ng > BI_Np) || (nprow < 1) || (npcol < 1) ) BI_BlacsErr(-1, -1, "BLACS_GRIDINIT/BLACS_GRIDMAP", "Illegal grid (%d x %d), #procs=%d", nprow, npcol, BI_Np); /* * Form MPI communicator for scope = 'all' */ if (Ng > 2) i = Ng; else i = 2; iptr = (int *) malloc(i*sizeof(int)); for (j=0; j < npcol; j++) { for (i=0; i < nprow; i++) iptr[i*npcol+j] = usermap[j*Mpval(ldup)+i]; } #if (INTFACE == C_CALL) tcomm = Cblacs2sys_handle(*ConTxt); MPI_Comm_group(tcomm, &grp); /* find input comm's group */ MPI_Group_incl(grp, Ng, iptr, &tgrp); /* form new group */ MPI_Comm_create(tcomm, tgrp, &comm); /* create new comm */ MPI_Group_free(&tgrp); MPI_Group_free(&grp); #else /* gridmap called from fortran */ comm = BI_TransUserComm(*ConTxt, Ng, iptr); #endif /* * Weed out callers who are not participating in present grid */ if (comm == MPI_COMM_NULL) { *ConTxt = NOTINCONTEXT; free(iptr); return; } /* * ================================================== * Get new context and add it to my array of contexts * ================================================== */ ctxt = (BLACSCONTEXT *) malloc(sizeof(BLACSCONTEXT)); /* * Find free slot in my context array */ for (i=0; i < BI_MaxNCtxt; i++) if (BI_MyContxts[i] == NULL) break; /* * Get bigger context pointer array, if needed */ if (i == BI_MaxNCtxt) { j = BI_MaxNCtxt + MAXNCTXT; tCTxts = (BLACSCONTEXT **) malloc(j * sizeof(*tCTxts)); for (i=0; i < BI_MaxNCtxt; i++) tCTxts[i] = BI_MyContxts[i]; BI_MaxNCtxt = j; for(j=i; j < BI_MaxNCtxt; j++) tCTxts[j] = NULL; if (BI_MyContxts) free(BI_MyContxts); BI_MyContxts = tCTxts; } BI_MyContxts[i] = ctxt; *ConTxt = i; ctxt->ascp.comm = comm; MPI_Comm_dup(comm, &ctxt->pscp.comm); /* copy acomm for pcomm */ MPI_Comm_rank(comm, &Iam); /* find my rank in new comm */ myrow = Iam / npcol; mycol = Iam % npcol; /* * Form MPI communicators for scope = 'row' */ MPI_Comm_split(comm, myrow, mycol, &ctxt->rscp.comm); /* * Form MPI communicators for scope = 'Column' */ MPI_Comm_split(comm, mycol, myrow, &ctxt->cscp.comm); ctxt->rscp.Np = npcol; ctxt->rscp.Iam = mycol; ctxt->cscp.Np = nprow; ctxt->cscp.Iam = myrow; ctxt->pscp.Np = ctxt->ascp.Np = Ng; ctxt->pscp.Iam = ctxt->ascp.Iam = Iam; ctxt->Nr_bs = ctxt->Nr_co = 1; ctxt->Nb_bs = ctxt->Nb_co = 2; ctxt->TopsRepeat = ctxt->TopsCohrnt = 0; /* * =========================== * Set up the message id stuff * =========================== */ Cblacs_get(-1, 1, iptr); ctxt->pscp.MinId = ctxt->rscp.MinId = ctxt->cscp.MinId = ctxt->ascp.MinId = ctxt->pscp.ScpId = ctxt->rscp.ScpId = ctxt->cscp.ScpId = ctxt->ascp.ScpId = iptr[0]; ctxt->pscp.MaxId = ctxt->rscp.MaxId = ctxt->cscp.MaxId = ctxt->ascp.MaxId = iptr[1]; free(iptr); } scalapack-2.0.2/BLACS/SRC/blacs_pcoord_.c000644 000766 000024 00000000642 11640652114 020123 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_pcoord(int ConTxt, int nodenum, int *prow, int *pcol) #else F_VOID_FUNC blacs_pcoord_(int *ConTxt, int *nodenum, int *prow, int *pcol) #endif { BLACSCONTEXT *ctxt; MGetConTxt(Mpval(ConTxt), ctxt); if ( (Mpval(nodenum) >= 0) && (Mpval(nodenum) < ctxt->ascp.Np) ) { Mpcoord(ctxt, Mpval(nodenum), *prow, *pcol); } else *prow = *pcol = -1; } scalapack-2.0.2/BLACS/SRC/blacs_pinfo_.c000644 000766 000024 00000001203 11640652114 017742 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_pinfo(int *mypnum, int *nprocs) #else F_VOID_FUNC blacs_pinfo_(int *mypnum, int *nprocs) #endif { int ierr; extern int BI_Iam, BI_Np; int argc=0; char **argv=NULL; if (BI_COMM_WORLD == NULL) { MPI_Initialized(nprocs); if (!(*nprocs)) ierr = MPI_Init(&argc,&argv); // call Init and ignore argc and argv BI_COMM_WORLD = (int *) malloc(sizeof(int)); *BI_COMM_WORLD = MPI_Comm_c2f(MPI_COMM_WORLD); MPI_Comm_size(MPI_COMM_WORLD, &BI_Np); MPI_Comm_rank(MPI_COMM_WORLD, &BI_Iam); } *mypnum = BI_Iam; *nprocs = BI_Np; } scalapack-2.0.2/BLACS/SRC/blacs_pnum_.c000644 000766 000024 00000000653 11640652114 017616 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) int Cblacs_pnum(int ConTxt, int prow, int pcol) #else F_INT_FUNC blacs_pnum_(int *ConTxt, int *prow, int *pcol) #endif { BLACSCONTEXT *ctxt; MGetConTxt(Mpval(ConTxt), ctxt); if ( (Mpval(prow) >= 0) && (Mpval(prow) < ctxt->cscp.Np) && (Mpval(pcol) >= 0) && (Mpval(pcol) < ctxt->rscp.Np) ) return( Mkpnum(ctxt, Mpval(prow), Mpval(pcol)) ); else return(-1); } scalapack-2.0.2/BLACS/SRC/blacs_set_.c000644 000766 000024 00000004317 11640652114 017433 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_set(int ConTxt, int what, int *val) #else F_VOID_FUNC blacs_set_(int *ConTxt, int *what, int *val) #endif { BLACSCONTEXT *ctxt; switch( Mpval(what) ) { case SGET_SYSCONTXT: BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "Cannot set BLACS system context, can only BLACS_GET"); break; case SGET_MSGIDS: BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "No need to set message ID range due to MPI communicator."); break; case SGET_DEBUGLVL: BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "Cannot set BLACS debug level; must recompile to change"); break; case SGET_BLACSCONTXT: BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "Cannot set BLACS context, can only BLACS_GET"); break; case SGET_NR_BS: MGetConTxt(Mpval(ConTxt), ctxt); if (*val) ctxt->Nr_bs = *val; else BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "BSBR nrings cannot be set to zero"); break; case SGET_NB_BS: MGetConTxt(Mpval(ConTxt), ctxt); if (*val > 0) ctxt->Nb_bs = *val + 1; else BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "Illegal BSBR nbranches (%d); must be strictly positive", *val); break; case SGET_NR_CO: MGetConTxt(Mpval(ConTxt), ctxt); if (*val) ctxt->Nr_co = *val; else BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "COMB nrings cannot be set to zero"); break; case SGET_NB_CO: MGetConTxt(Mpval(ConTxt), ctxt); if (*val > 0) ctxt->Nb_co = *val + 1; else BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "Illegal COMB nbranches (%d); must be strictly positive", *val); break; case SGET_TOPSREPEAT: MGetConTxt(Mpval(ConTxt), ctxt); ctxt->TopsRepeat = *val; break; case SGET_TOPSCOHRNT: MGetConTxt(Mpval(ConTxt), ctxt); ctxt->TopsCohrnt = *val; break; default: BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "Unknown WHAT (%d)", Mpval(what)); } } scalapack-2.0.2/BLACS/SRC/blacs_setup_.c000644 000766 000024 00000000455 11640652114 017777 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cblacs_setup(int *mypnum, int *nprocs) #else F_VOID_FUNC blacs_setup_(int *mypnum, int *nprocs) #endif { /* * blacs_setup same as blacs_pinfo for non-PVM versions of the BLACS */ void Cblacs_pinfo(int *, int *); Cblacs_pinfo(mypnum, nprocs); } scalapack-2.0.2/BLACS/SRC/cgamn2d_.c000644 000766 000024 00000026556 11745552113 017024 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Ccgamn2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC cgamn2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amn operation for complex rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amn of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amn of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amn of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amn. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amn. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_cvvamn(int, char *, char *); void BI_cvvamn2(int, char *, char *); void BI_cMPI_amn(void *, void *, int *, MPI_Datatype *); void BI_cMPI_amn2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amn is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_cvvamn; length = N * sizeof(SCOMPLEX); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(SCOMPLEX) > j) j = sizeof(SCOMPLEX); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_cmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_COMPLEX; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_cvvamn2; length = N * sizeof(SCOMPLEX); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_cmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_COMPLEX; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_cMPI_amn2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_cMPI_amn, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_cvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_cvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amn array */ if (bp != &BI_AuxBuff) BI_cvmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/cgamx2d_.c000644 000766 000024 00000026555 11745552113 017035 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Ccgamx2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC cgamx2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amx operation for complex rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amx of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amx of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amx of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amx. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amx. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_cvvamx(int, char *, char *); void BI_cvvamx2(int, char *, char *); void BI_cMPI_amx(void *, void *, int *, MPI_Datatype *); void BI_cMPI_amx2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amx is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_cvvamx; length = N * sizeof(SCOMPLEX); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(SCOMPLEX) > j) j = sizeof(SCOMPLEX); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_cmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_COMPLEX; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_cvvamx2; length = N * sizeof(SCOMPLEX); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_cmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_COMPLEX; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_cMPI_amx2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_cMPI_amx, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_cvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_cvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amx array */ if (bp != &BI_AuxBuff) BI_cvmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/cgebr2d_.c000644 000766 000024 00000013770 11745552113 017013 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Ccgebr2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC cgebr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for general complex arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_COMPLEX, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, receive and send directly to/from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } /* * If A is not contiguous, we receive message as packed so it can be * forwarded without further system intervention */ else { send = BI_Asend; MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length, error); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif } #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If we buffered, unpack. */ #ifndef MpiBuffGood if (bp != &BI_AuxBuff) { BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); } else #endif { error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } } scalapack-2.0.2/BLACS/SRC/cgebs2d_.c000644 000766 000024 00000012265 11745552113 017012 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Ccgebs2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda) #else F_VOID_FUNC cgebs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for general complex arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_COMPLEX, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, send directly from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } else { send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); } #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end cgebs2d_ */ scalapack-2.0.2/BLACS/SRC/cgerv2d_.c000644 000766 000024 00000004765 11745552113 017043 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Ccgerv2d(int ConTxt, int m, int n, float *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC cgerv2d_(int *ConTxt, int *m, int *n, float *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point general complex receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tlda; int ierr; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_COMPLEX, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/cgesd2d_.c000644 000766 000024 00000005630 11745552113 017012 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Ccgesd2d(int ConTxt, int m, int n, float *A, int lda, int rdest, int cdest) #else F_VOID_FUNC cgesd2d_(int *ConTxt, int *m, int *n, float *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point general complex send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int dest, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "CGESD2D", 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_COMPLEX, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of cgesd2d */ scalapack-2.0.2/BLACS/SRC/cgsum2d_.c000644 000766 000024 00000015770 11640652114 017045 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Ccgsum2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda, int rdest, int cdest) #else F_VOID_FUNC cgsum2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine sum operation for complex rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the sum. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the sum. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_cvvsum(int, char *, char *); void BI_cMPI_sum(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int N, length, dest, tlda, trdest, ierr; MPI_Op BlacComb; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one. Also, we can't use MPI functions if we need to * guarantee repeatability. */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); length = N * sizeof(SCOMPLEX); /* * If A is contiguous, we can use it as one of the buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } /* * Otherwise, we must allocate both buffers */ else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_cmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->dtype = bp2->dtype = MPI_COMPLEX; bp->N = bp2->N = N; switch(ttop) { case ' ': /* use MPI's reduction by default */ length = 1; ierr=MPI_Op_create(BI_cMPI_sum, length, &BlacComb); if (dest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) BI_cvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_cvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } ierr=MPI_Op_free(&BlacComb); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, BI_cvvsum, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, BI_cvvsum, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, BI_cvvsum, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, BI_cvvsum, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, BI_cvvsum, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, BI_cvvsum, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, BI_cvvsum, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, BI_cvvsum); else BI_TreeComb(ctxt, bp, bp2, N, BI_cvvsum, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If I am selected to receive answer */ if (bp != &BI_AuxBuff) { if ( (ctxt->scp->Iam == dest) || (dest == -1) ) BI_cvmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); BI_UpdateBuffs(bp); } else { if (BI_ActiveQ) BI_UpdateBuffs(NULL); BI_BuffIsFree(bp, 1); } } scalapack-2.0.2/BLACS/SRC/CMakeLists.txt000644 000766 000024 00000007406 11656312637 017745 0ustar00juliestaff000000 000000 # -------------------------- # The communication routines # -------------------------- set (comm igesd2d_.c sgesd2d_.c dgesd2d_.c cgesd2d_.c zgesd2d_.c itrsd2d_.c strsd2d_.c dtrsd2d_.c ctrsd2d_.c ztrsd2d_.c igerv2d_.c sgerv2d_.c dgerv2d_.c cgerv2d_.c zgerv2d_.c itrrv2d_.c strrv2d_.c dtrrv2d_.c ctrrv2d_.c ztrrv2d_.c igebs2d_.c sgebs2d_.c dgebs2d_.c cgebs2d_.c zgebs2d_.c igebr2d_.c sgebr2d_.c dgebr2d_.c cgebr2d_.c zgebr2d_.c itrbs2d_.c strbs2d_.c dtrbs2d_.c ctrbs2d_.c ztrbs2d_.c itrbr2d_.c strbr2d_.c dtrbr2d_.c ctrbr2d_.c ztrbr2d_.c igsum2d_.c sgsum2d_.c dgsum2d_.c cgsum2d_.c zgsum2d_.c igamx2d_.c sgamx2d_.c dgamx2d_.c cgamx2d_.c zgamx2d_.c igamn2d_.c sgamn2d_.c dgamn2d_.c cgamn2d_.c zgamn2d_.c) # -------------------- # The support routines # -------------------- set (supp blacs_setup_.c blacs_set_.c blacs_get_.c blacs_abort_.c blacs_exit_.c blacs_pnum_.c blacs_pcoord_.c ksendid_.c krecvid_.c kbsid_.c kbrid_.c dcputime00_.c dwalltime00_.c blacs_pinfo_.c blacs_init_.c blacs_map_.c blacs_free_.c blacs_grid_.c blacs_info_.c blacs_barr_.c sys2blacs_.c blacs2sys_.c free_handle_.c) # --------------------- # The internal routines # --------------------- set (internal BI_HypBS.c BI_HypBR.c BI_IdringBS.c BI_IdringBR.c BI_MpathBS.c BI_MpathBR.c BI_SringBS.c BI_SringBR.c BI_TreeBS.c BI_TreeBR.c BI_Ssend.c BI_Rsend.c BI_Srecv.c BI_Asend.c BI_Arecv.c BI_TreeComb.c BI_BeComb.c BI_MringComb.c BI_ArgCheck.c BI_TransDist.c BI_GetBuff.c BI_UpdateBuffs.c BI_EmergencyBuff.c BI_BlacsErr.c BI_BlacsWarn.c BI_BlacsAbort.c BI_BuffIsFree.c BI_imvcopy.c BI_smvcopy.c BI_dmvcopy.c BI_ivmcopy.c BI_svmcopy.c BI_dvmcopy.c BI_Pack.c BI_Unpack.c BI_GetMpiGeType.c BI_GetMpiTrType.c BI_ivvsum.c BI_svvsum.c BI_dvvsum.c BI_cvvsum.c BI_zvvsum.c BI_ivvamx.c BI_svvamx.c BI_dvvamx.c BI_cvvamx.c BI_zvvamx.c BI_ivvamx2.c BI_svvamx2.c BI_dvvamx2.c BI_cvvamx2.c BI_zvvamx2.c BI_ivvamn.c BI_svvamn.c BI_dvvamn.c BI_cvvamn.c BI_zvvamn.c BI_ivvamn2.c BI_svvamn2.c BI_dvvamn2.c BI_cvvamn2.c BI_zvvamn2.c BI_iMPI_amx.c BI_sMPI_amx.c BI_dMPI_amx.c BI_cMPI_amx.c BI_zMPI_amx.c BI_iMPI_amx2.c BI_sMPI_amx2.c BI_dMPI_amx2.c BI_cMPI_amx2.c BI_zMPI_amx2.c BI_iMPI_amn.c BI_sMPI_amn.c BI_dMPI_amn.c BI_cMPI_amn.c BI_zMPI_amn.c BI_iMPI_amn2.c BI_sMPI_amn2.c BI_dMPI_amn2.c BI_cMPI_amn2.c BI_zMPI_amn2.c BI_cMPI_sum.c BI_zMPI_sum.c BI_ContxtNum.c BI_GlobalVars.c BI_TransUserComm.c ) # # Note on CMAKE (from Brad - Kitware) # #CMake hides the intermediate object files from its interface so there is #no direct way to do this within a single target. This limitation comes #from the requirement to support generation of build systems like VS IDE #project files that do not expose object files explicitly. # Solution 1 : Build separate libraries, one for each block of objects #set(blacsFint ${comm} ${supp}) #add_library(blacsCint ${comm} ${supp}) #set_property(TARGET blacsCint PROPERTY COMPILE_DEFINITIONS CallFromC) #add_library(scalapack ${internal} ${blacsFint}) #target_link_libraries(scalapack blacsCint) # Solution 2 : Create an extra source file that #include-s the original one set(srcs ${comm} ${supp}) set(srcs_C) foreach(src ${srcs}) # string(REPLACE ".c" "-C.c" src_C "${CMAKE_CURRENT_BINARY_DIR}/${src}") string(REPLACE ".c" "-C.c" src_C "${src}") configure_file(src-C.c.in ${src_C} @ONLY) list(APPEND srcs_C ${src_C}) endforeach() set(blacs ${srcs} ${srcs_C} ${internal}) #add_library(scalapack ${srcs} ${srcs_C} ${internal}) #scalapack_install_library(scalapack) scalapack-2.0.2/BLACS/SRC/ctrbr2d_.c000644 000766 000024 00000014607 11745552113 017045 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cctrbr2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, float *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC ctrbr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, float *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for trapezoidal complex arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope, tuplo, tdiag; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_COMPLEX, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #else send = BI_Asend; MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length, error); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } #ifdef MpiBuffGood error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); #endif #ifndef MpiBuffGood BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); #endif } scalapack-2.0.2/BLACS/SRC/ctrbs2d_.c000644 000766 000024 00000013645 11745552113 017047 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cctrbs2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, float *A, int lda) #else F_VOID_FUNC ctrbs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, float *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for trapezoidal complex arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope, tuplo, tdiag; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_COMPLEX, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #endif /* * Pack and use non-blocking sends for broadcast if MPI's data types aren't * more efficient */ #ifndef MpiBuffGood send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end ctrbs2d_ */ scalapack-2.0.2/BLACS/SRC/ctrrv2d_.c000644 000766 000024 00000006463 11745552113 017072 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cctrrv2d(int ConTxt, char *uplo, char *diag, int m, int n, float *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC ctrrv2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, float *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point trapezoidal complex receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tuplo, tdiag, tlda; int ierr, length; BLACBUFF *bp; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tdiag = F2C_CharTrans(diag); tuplo = F2C_CharTrans(uplo); tdiag = Mlowcase(tdiag); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_COMPLEX, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/ctrsd2d_.c000644 000766 000024 00000007277 11745552113 017055 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cctrsd2d(int ConTxt, char *uplo, char *diag, int m, int n, float *A, int lda, int rdest, int cdest) #else F_VOID_FUNC ctrsd2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, float *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point trapezoidal complex send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); char tuplo, tdiag; int dest, length, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tuplo = F2C_CharTrans(uplo); tdiag = F2C_CharTrans(diag); tuplo = Mlowcase(tuplo); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "CTRSD2D", 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_COMPLEX, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of ctrsd2d */ scalapack-2.0.2/BLACS/SRC/dcputime00_.c000644 000766 000024 00000000206 11640652114 017437 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) double Cdcputime00(void) #else F_DOUBLE_FUNC dcputime00_(void) #endif { return(-1.0); } scalapack-2.0.2/BLACS/SRC/dgamn2d_.c000644 000766 000024 00000026567 11745552113 017027 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdgamn2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC dgamn2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amn operation for double precision rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amn of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amn of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amn of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amn. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amn. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_dvvamn(int, char *, char *); void BI_dvvamn2(int, char *, char *); void BI_dMPI_amn(void *, void *, int *, MPI_Datatype *); void BI_dMPI_amn2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amn is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_dvvamn; length = N * sizeof(double); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(double) > j) j = sizeof(double); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_dmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_DOUBLE; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_dvvamn2; length = N * sizeof(double); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_dmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_DOUBLE; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_dMPI_amn2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_dMPI_amn, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_dvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_dvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amn array */ if (bp != &BI_AuxBuff) BI_dvmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/dgamx2d_.c000644 000766 000024 00000026567 11745552113 017041 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdgamx2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC dgamx2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amx operation for double precision rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amx of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amx of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amx of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amx. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amx. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_dvvamx(int, char *, char *); void BI_dvvamx2(int, char *, char *); void BI_dMPI_amx(void *, void *, int *, MPI_Datatype *); void BI_dMPI_amx2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amx is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_dvvamx; length = N * sizeof(double); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(double) > j) j = sizeof(double); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_dmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_DOUBLE; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_dvvamx2; length = N * sizeof(double); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_dmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_DOUBLE; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_dMPI_amx2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_dMPI_amx, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_dvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_dvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amx array */ if (bp != &BI_AuxBuff) BI_dvmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/dgebr2d_.c000644 000766 000024 00000014012 11745552113 017002 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdgebr2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC dgebr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for general double precision arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_DOUBLE, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, receive and send directly to/from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } /* * If A is not contiguous, we receive message as packed so it can be * forwarded without further system intervention */ else { send = BI_Asend; error=MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif } #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If we buffered, unpack. */ #ifndef MpiBuffGood if (bp != &BI_AuxBuff) { BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); } else #endif { error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } } scalapack-2.0.2/BLACS/SRC/dgebs2d_.c000644 000766 000024 00000012310 11745552113 017002 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdgebs2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda) #else F_VOID_FUNC dgebs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for general double precision arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_DOUBLE, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, send directly from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } else { send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); } #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end dgebs2d_ */ scalapack-2.0.2/BLACS/SRC/dgerv2d_.c000644 000766 000024 00000005010 11745552113 017024 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdgerv2d(int ConTxt, int m, int n, double *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC dgerv2d_(int *ConTxt, int *m, int *n, double *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point general double precision receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tlda; int ierr; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_DOUBLE, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/dgesd2d_.c000644 000766 000024 00000005654 11745552113 017021 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdgesd2d(int ConTxt, int m, int n, double *A, int lda, int rdest, int cdest) #else F_VOID_FUNC dgesd2d_(int *ConTxt, int *m, int *n, double *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point general double precision send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int dest, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "DGESD2D", 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_DOUBLE, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of dgesd2d */ scalapack-2.0.2/BLACS/SRC/dgsum2d_.c000644 000766 000024 00000015512 11640652114 017040 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdgsum2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int rdest, int cdest) #else F_VOID_FUNC dgsum2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine sum operation for double precision rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the sum. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the sum. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_dvvsum(int, char *, char *); /* * Variable Declarations */ BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int N, length, dest, tlda, trdest, ierr; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one. Also, we can't use MPI functions if we need to * guarantee repeatability. */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); length = N * sizeof(double); /* * If A is contiguous, we can use it as one of the buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } /* * Otherwise, we must allocate both buffers */ else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_dmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->dtype = bp2->dtype = MPI_DOUBLE; bp->N = bp2->N = N; switch(ttop) { case ' ': /* use MPI's reduction by default */ if (dest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, MPI_SUM, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) BI_dvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, MPI_SUM, ctxt->scp->comm); BI_dvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, BI_dvvsum, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, BI_dvvsum, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, BI_dvvsum, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, BI_dvvsum, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, BI_dvvsum, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, BI_dvvsum, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, BI_dvvsum, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, BI_dvvsum); else BI_TreeComb(ctxt, bp, bp2, N, BI_dvvsum, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If I am selected to receive answer */ if (bp != &BI_AuxBuff) { if ( (ctxt->scp->Iam == dest) || (dest == -1) ) BI_dvmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); BI_UpdateBuffs(bp); } else { if (BI_ActiveQ) BI_UpdateBuffs(NULL); BI_BuffIsFree(bp, 1); } } scalapack-2.0.2/BLACS/SRC/dtrbr2d_.c000644 000766 000024 00000014632 11745552113 017044 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdtrbr2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, double *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC dtrbr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, double *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for trapezoidal double precision arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope, tuplo, tdiag; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_DOUBLE, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #else send = BI_Asend; MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length, error); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } #ifdef MpiBuffGood error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); #endif #ifndef MpiBuffGood BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); #endif } scalapack-2.0.2/BLACS/SRC/dtrbs2d_.c000644 000766 000024 00000013670 11745552113 017046 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdtrbs2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, double *A, int lda) #else F_VOID_FUNC dtrbs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, double *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for trapezoidal double precision arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope, tuplo, tdiag; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_DOUBLE, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #endif /* * Pack and use non-blocking sends for broadcast if MPI's data types aren't * more efficient */ #ifndef MpiBuffGood send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end dtrbs2d_ */ scalapack-2.0.2/BLACS/SRC/dtrrv2d_.c000644 000766 000024 00000006506 11745552113 017071 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdtrrv2d(int ConTxt, char *uplo, char *diag, int m, int n, double *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC dtrrv2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, double *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point trapezoidal double precision receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tuplo, tdiag, tlda; int ierr, length; BLACBUFF *bp; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tdiag = F2C_CharTrans(diag); tuplo = F2C_CharTrans(uplo); tdiag = Mlowcase(tdiag); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_DOUBLE, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/dtrsd2d_.c000644 000766 000024 00000007322 11745552113 017045 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cdtrsd2d(int ConTxt, char *uplo, char *diag, int m, int n, double *A, int lda, int rdest, int cdest) #else F_VOID_FUNC dtrsd2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, double *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point trapezoidal double precision send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to double precision two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); char tuplo, tdiag; int dest, length, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tuplo = F2C_CharTrans(uplo); tdiag = F2C_CharTrans(diag); tuplo = Mlowcase(tuplo); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "DTRSD2D", 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_DOUBLE, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of dtrsd2d */ scalapack-2.0.2/BLACS/SRC/dwalltime00_.c000644 000766 000024 00000000217 11640652114 017611 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) double Cdwalltime00(void) #else F_DOUBLE_FUNC dwalltime00_(void) #endif { return(MPI_Wtime()); } scalapack-2.0.2/BLACS/SRC/free_handle_.c000644 000766 000024 00000002654 11640652114 017732 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cfree_blacs_system_handle(int ISysCtxt) #else void free_blacs_system_handle_(int *ISysCxt) #endif { #if (INTFACE == C_CALL) int i, j, DEF_WORLD; MPI_Comm *tSysCtxt; extern int BI_MaxNSysCtxt; extern MPI_Comm *BI_SysContxts; if ( (ISysCtxt < BI_MaxNSysCtxt) && (ISysCtxt > 0) ) { if (BI_SysContxts[ISysCtxt] != MPI_COMM_NULL) BI_SysContxts[ISysCtxt] = MPI_COMM_NULL; else BI_BlacsWarn(-1, __LINE__, __FILE__, "Trying to free non-existent system context handle %d", ISysCtxt); } else if (ISysCtxt == 0) return; /* never free MPI_COMM_WORLD */ else BI_BlacsWarn(-1, __LINE__, __FILE__, "Trying to free non-existent system context handle %d", ISysCtxt); /* * See if we have freed enough space to decrease the size of our table */ for (i=j=0; i < BI_MaxNSysCtxt; i++) if (BI_SysContxts[i] == MPI_COMM_NULL) j++; /* * If needed, get a smaller system context array */ if (j > 2*MAXNSYSCTXT) { j = BI_MaxNSysCtxt - MAXNSYSCTXT; tSysCtxt = (MPI_Comm *) malloc(j * sizeof(MPI_Comm)); for (i=j=0; i < BI_MaxNSysCtxt; i++) { if (BI_SysContxts[i] != MPI_COMM_NULL) tSysCtxt[j++] = BI_SysContxts[i]; } BI_MaxNSysCtxt -= MAXNSYSCTXT; for(; j < BI_MaxNSysCtxt; j++) tSysCtxt[j] = MPI_COMM_NULL; free(BI_SysContxts); BI_SysContxts = tSysCtxt; } #endif } scalapack-2.0.2/BLACS/SRC/igamn2d_.c000644 000766 000024 00000026457 11745552113 017032 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cigamn2d(int ConTxt, char *scope, char *top, int m, int n, int *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC igamn2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, int *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amn operation for integer rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amn of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amn of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amn of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amn. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amn. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_ivvamn(int, char *, char *); void BI_ivvamn2(int, char *, char *); void BI_iMPI_amn(void *, void *, int *, MPI_Datatype *); void BI_iMPI_amn2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amn is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_ivvamn; length = N * sizeof(int); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(int) > j) j = sizeof(int); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_imvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_INT; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_ivvamn2; length = N * sizeof(int); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_imvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_INT; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_iMPI_amn2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_iMPI_amn, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_ivmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_ivmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amn array */ if (bp != &BI_AuxBuff) BI_ivmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/igamx2d_.c000644 000766 000024 00000026457 11745552113 017044 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cigamx2d(int ConTxt, char *scope, char *top, int m, int n, int *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC igamx2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, int *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amx operation for integer rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amx of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amx of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amx of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amx. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amx. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_ivvamx(int, char *, char *); void BI_ivvamx2(int, char *, char *); void BI_iMPI_amx(void *, void *, int *, MPI_Datatype *); void BI_iMPI_amx2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amx is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_ivvamx; length = N * sizeof(int); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(int) > j) j = sizeof(int); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_imvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_INT; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_ivvamx2; length = N * sizeof(int); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_imvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_INT; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_iMPI_amx2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_iMPI_amx, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_ivmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_ivmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amx array */ if (bp != &BI_AuxBuff) BI_ivmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/igebr2d_.c000644 000766 000024 00000013760 11745552113 017020 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cigebr2d(int ConTxt, char *scope, char *top, int m, int n, int *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC igebr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, int *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for general integer arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_INT, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, receive and send directly to/from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } /* * If A is not contiguous, we receive message as packed so it can be * forwarded without further system intervention */ else { send = BI_Asend; MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length, error); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif } #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If we buffered, unpack. */ #ifndef MpiBuffGood if (bp != &BI_AuxBuff) { BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); } else #endif { error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } } scalapack-2.0.2/BLACS/SRC/igebs2d_.c000644 000766 000024 00000012232 11745552113 017012 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cigebs2d(int ConTxt, char *scope, char *top, int m, int n, int *A, int lda) #else F_VOID_FUNC igebs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, int *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for general integer arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_INT, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, send directly from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } else { send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); } #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'",ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end igebs2d_ */ scalapack-2.0.2/BLACS/SRC/igerv2d_.c000644 000766 000024 00000004755 11745552113 017050 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cigerv2d(int ConTxt, int m, int n, int *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC igerv2d_(int *ConTxt, int *m, int *n, int *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point general integer receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tlda; int ierr; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_INT, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/igesd2d_.c000644 000766 000024 00000005620 11745552113 017017 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cigesd2d(int ConTxt, int m, int n, int *A, int lda, int rdest, int cdest) #else F_VOID_FUNC igesd2d_(int *ConTxt, int *m, int *n, int *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point general integer send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int dest, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "IGESD2D", 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_INT, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of igesd2d */ scalapack-2.0.2/BLACS/SRC/igsum2d_.c000644 000766 000024 00000015401 11640652114 017042 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cigsum2d(int ConTxt, char *scope, char *top, int m, int n, int *A, int lda, int rdest, int cdest) #else F_VOID_FUNC igsum2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, int *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine sum operation for integer rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the sum. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the sum. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_ivvsum(int, char *, char *); /* * Variable Declarations */ BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int N, length, dest, tlda, trdest, ierr, itr; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one. Note that integer operations are always * repeatable. */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) ) ttop = '1'; N = Mpval(m) * Mpval(n); length = N * sizeof(int); /* * If A is contiguous, we can use it as one of the buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } /* * Otherwise, we must allocate both buffers */ else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_imvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->dtype = bp2->dtype = MPI_INT; bp->N = bp2->N = N; switch(ttop) { case ' ': /* use MPI's reduction by default */ if (dest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, MPI_SUM, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) BI_ivmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, MPI_SUM, ctxt->scp->comm); BI_ivmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, BI_ivvsum, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, BI_ivvsum, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, BI_ivvsum, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, BI_ivvsum, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, BI_ivvsum, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, BI_ivvsum, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, BI_ivvsum, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, BI_ivvsum); else BI_TreeComb(ctxt, bp, bp2, N, BI_ivvsum, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If I am selected to receive answer */ if (bp != &BI_AuxBuff) { if ( (ctxt->scp->Iam == dest) || (dest == -1) ) BI_ivmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); BI_UpdateBuffs(bp); } else { if (BI_ActiveQ) BI_UpdateBuffs(NULL); BI_BuffIsFree(bp, 1); } } scalapack-2.0.2/BLACS/SRC/itrbr2d_.c000644 000766 000024 00000014577 11745552113 017061 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Citrbr2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, int *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC itrbr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, int *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for trapezoidal integer arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope, tuplo, tdiag; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_INT, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #else send = BI_Asend; MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length, error); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } #ifdef MpiBuffGood error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); #endif #ifndef MpiBuffGood BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); #endif } scalapack-2.0.2/BLACS/SRC/itrbs2d_.c000644 000766 000024 00000013635 11745552113 017054 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Citrbs2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, int *A, int lda) #else F_VOID_FUNC itrbs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, int *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for trapezoidal integer arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope, tuplo, tdiag; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_INT, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #endif /* * Pack and use non-blocking sends for broadcast if MPI's data types aren't * more efficient */ #ifndef MpiBuffGood send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end itrbs2d_ */ scalapack-2.0.2/BLACS/SRC/itrrv2d_.c000644 000766 000024 00000006453 11745552113 017077 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Citrrv2d(int ConTxt, char *uplo, char *diag, int m, int n, int *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC itrrv2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, int *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point trapezoidal integer receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tuplo, tdiag, tlda; int ierr, length; BLACBUFF *bp; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tdiag = F2C_CharTrans(diag); tuplo = F2C_CharTrans(uplo); tdiag = Mlowcase(tdiag); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_INT, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/itrsd2d_.c000644 000766 000024 00000007267 11745552113 017062 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Citrsd2d(int ConTxt, char *uplo, char *diag, int m, int n, int *A, int lda, int rdest, int cdest) #else F_VOID_FUNC itrsd2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, int *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point trapezoidal integer send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to integer two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); char tuplo, tdiag; int dest, length, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tuplo = F2C_CharTrans(uplo); tdiag = F2C_CharTrans(diag); tuplo = Mlowcase(tuplo); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "ITRSD2D", 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_INT, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of itrsd2d */ scalapack-2.0.2/BLACS/SRC/kbrid_.c000644 000766 000024 00000001073 11640652114 016563 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) int Ckbrid(int ConTxt, char *scope, int rsrc, int csrc) #else F_INT_FUNC kbrid_(int *ConTxt, F_CHAR scope, int *rsrc, int *csrc) #endif { int msgid; char tmpscope; BLACSCONTEXT *ctxt; MGetConTxt(Mpval(ConTxt), ctxt); tmpscope = Mlowcase(F2C_CharTrans(scope)); switch(tmpscope) { case 'c' : ctxt->scp = &ctxt->cscp; break; case 'r' : ctxt->scp = &ctxt->cscp; break; case 'a' : ctxt->scp = &ctxt->cscp; break; } msgid = Mscopeid(ctxt); return (msgid); } scalapack-2.0.2/BLACS/SRC/kbsid_.c000644 000766 000024 00000001020 11640652114 016554 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) int Ckbsid(int ConTxt, char *scope) #else F_INT_FUNC kbsid_(int *ConTxt, F_CHAR scope) #endif { char tmpscope; int msgid; BLACSCONTEXT *ctxt; MGetConTxt(Mpval(ConTxt), ctxt); tmpscope = Mlowcase(F2C_CharTrans(scope)); switch(tmpscope) { case 'c' : ctxt->scp = &ctxt->cscp; break; case 'r' : ctxt->scp = &ctxt->rscp; break; case 'a' : ctxt->scp = &ctxt->ascp; break; } msgid = Mscopeid(ctxt); return(msgid); } scalapack-2.0.2/BLACS/SRC/krecvid_.c000644 000766 000024 00000000311 11640652114 017111 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) int Ckrecvid(int ConTxt, int rsrc, int csrc) #else F_INT_FUNC krecvid_(int *ConTxt, int *rsrc, int *csrc) #endif { return(PT2PTID+1); } /* end krecvid */ scalapack-2.0.2/BLACS/SRC/ksendid_.c000644 000766 000024 00000000315 11640652114 017107 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) int Cksendid(int ConTxt, int rdest, int cdest) #else F_INT_FUNC ksendid_(int *ConTxt, int *rdest, int *cdest) #endif { return(PT2PTID+1); } /* end ksendid */ scalapack-2.0.2/BLACS/SRC/Makefile000644 000766 000024 00000007477 11644651761 016656 0ustar00juliestaff000000 000000 dlvl = ../.. include $(dlvl)/SLmake.inc # -------------------------- # The communication routines # -------------------------- comm = igesd2d_.o sgesd2d_.o dgesd2d_.o cgesd2d_.o zgesd2d_.o \ itrsd2d_.o strsd2d_.o dtrsd2d_.o ctrsd2d_.o ztrsd2d_.o \ igerv2d_.o sgerv2d_.o dgerv2d_.o cgerv2d_.o zgerv2d_.o \ itrrv2d_.o strrv2d_.o dtrrv2d_.o ctrrv2d_.o ztrrv2d_.o \ igebs2d_.o sgebs2d_.o dgebs2d_.o cgebs2d_.o zgebs2d_.o \ igebr2d_.o sgebr2d_.o dgebr2d_.o cgebr2d_.o zgebr2d_.o \ itrbs2d_.o strbs2d_.o dtrbs2d_.o ctrbs2d_.o ztrbs2d_.o \ itrbr2d_.o strbr2d_.o dtrbr2d_.o ctrbr2d_.o ztrbr2d_.o \ igsum2d_.o sgsum2d_.o dgsum2d_.o cgsum2d_.o zgsum2d_.o \ igamx2d_.o sgamx2d_.o dgamx2d_.o cgamx2d_.o zgamx2d_.o \ igamn2d_.o sgamn2d_.o dgamn2d_.o cgamn2d_.o zgamn2d_.o # -------------------- # The support routines # -------------------- supp = blacs_setup_.o blacs_set_.o blacs_get_.o \ blacs_abort_.o blacs_exit_.o blacs_pnum_.o blacs_pcoord_.o \ ksendid_.o krecvid_.o kbsid_.o kbrid_.o \ dcputime00_.o dwalltime00_.o blacs_pinfo_.o \ blacs_init_.o blacs_map_.o blacs_free_.o blacs_grid_.o blacs_info_.o \ blacs_barr_.o sys2blacs_.o blacs2sys_.o free_handle_.o # ---------------------------- # The fortran and C interfaces # ---------------------------- Fintobj = $(comm) $(supp) Cintobj = $(comm:.o=.oo) $(supp:.o=.oo) # --------------------- # The internal routines # --------------------- internal = BI_HypBS.o BI_HypBR.o BI_IdringBS.o BI_IdringBR.o \ BI_MpathBS.o BI_MpathBR.o BI_SringBS.o BI_SringBR.o \ BI_TreeBS.o BI_TreeBR.o \ BI_Ssend.o BI_Rsend.o BI_Srecv.o BI_Asend.o BI_Arecv.o \ BI_TreeComb.o BI_BeComb.o BI_MringComb.o \ BI_ArgCheck.o BI_TransDist.o BI_GetBuff.o BI_UpdateBuffs.o \ BI_EmergencyBuff.o BI_BlacsErr.o BI_BlacsWarn.o BI_BlacsAbort.o \ BI_BuffIsFree.o BI_imvcopy.o BI_smvcopy.o BI_dmvcopy.o \ BI_ivmcopy.o BI_svmcopy.o BI_dvmcopy.o \ BI_Pack.o BI_Unpack.o BI_GetMpiGeType.o BI_GetMpiTrType.o \ BI_ivvsum.o BI_svvsum.o BI_dvvsum.o BI_cvvsum.o BI_zvvsum.o \ BI_ivvamx.o BI_svvamx.o BI_dvvamx.o BI_cvvamx.o BI_zvvamx.o \ BI_ivvamx2.o BI_svvamx2.o BI_dvvamx2.o BI_cvvamx2.o BI_zvvamx2.o \ BI_ivvamn.o BI_svvamn.o BI_dvvamn.o BI_cvvamn.o BI_zvvamn.o \ BI_ivvamn2.o BI_svvamn2.o BI_dvvamn2.o BI_cvvamn2.o BI_zvvamn2.o \ BI_iMPI_amx.o BI_sMPI_amx.o BI_dMPI_amx.o BI_cMPI_amx.o \ BI_zMPI_amx.o BI_iMPI_amx2.o BI_sMPI_amx2.o BI_dMPI_amx2.o \ BI_cMPI_amx2.o BI_zMPI_amx2.o BI_iMPI_amn.o BI_sMPI_amn.o \ BI_dMPI_amn.o BI_cMPI_amn.o BI_zMPI_amn.o BI_iMPI_amn2.o \ BI_sMPI_amn2.o BI_dMPI_amn2.o BI_cMPI_amn2.o BI_zMPI_amn2.o \ BI_cMPI_sum.o BI_zMPI_sum.o BI_ContxtNum.o BI_GlobalVars.o \ BI_TransUserComm.o lib : all $(internal): Bdef.h Bconfig.h $(Fintobj): Bdef.h Bconfig.h # --------------------------------------- # Make both C and fortran interface BLACS # --------------------------------------- all : $(internal) $(Fintobj) $(Cintobj) $(ARCH) $(ARCHFLAGS) ../../$(SCALAPACKLIB) $(internal) $(Fintobj) $(Cintobj) $(RANLIB) ../../$(SCALAPACKLIB) # ----------------------- # Delete the object files # ----------------------- clean : rm -f $(Cintobj) $(Fintobj) $(internal) # ------------------------------------------------------------------------ # We move C .o files to .oo so that we can use the portable suffix rule for # compilation, and still have them coexist with the fortran interface # .o files. # ------------------------------------------------------------------------ .SUFFIXES: .o .oo .c.oo: $(CC) -o C$*.o -c $(CDEFS) $(CCFLAGS) -DCallFromC $< mv C$*.o $*.oo .c.o: $(CC) -c $(CDEFS) $(CCFLAGS) $< scalapack-2.0.2/BLACS/SRC/sgamn2d_.c000644 000766 000024 00000026527 11745552113 017042 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Csgamn2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC sgamn2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amn operation for real rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amn of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amn of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amn of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amn. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amn. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_svvamn(int, char *, char *); void BI_svvamn2(int, char *, char *); void BI_sMPI_amn(void *, void *, int *, MPI_Datatype *); void BI_sMPI_amn2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amn is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_svvamn; length = N * sizeof(float); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(float) > j) j = sizeof(float); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_smvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_FLOAT; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_svvamn2; length = N * sizeof(float); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_smvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_FLOAT; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_sMPI_amn2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_sMPI_amn, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_svmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_svmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amn array */ if (bp != &BI_AuxBuff) BI_svmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/sgamx2d_.c000644 000766 000024 00000026527 11745552113 017054 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Csgamx2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC sgamx2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amx operation for real rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amx of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amx of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amx of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amx. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amx. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_svvamx(int, char *, char *); void BI_svvamx2(int, char *, char *); void BI_sMPI_amx(void *, void *, int *, MPI_Datatype *); void BI_sMPI_amx2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amx is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_svvamx; length = N * sizeof(float); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(float) > j) j = sizeof(float); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_smvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_FLOAT; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_svvamx2; length = N * sizeof(float); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_smvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_FLOAT; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_sMPI_amx2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_sMPI_amx, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_svmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_svmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amx array */ if (bp != &BI_AuxBuff) BI_svmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/sgebr2d_.c000644 000766 000024 00000013757 11745552113 017040 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Csgebr2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC sgebr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for general real arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_FLOAT, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, receive and send directly to/from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } /* * If A is not contiguous, we receive message as packed so it can be * forwarded without further system intervention */ else { send = BI_Asend; error=MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif } #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If we buffered, unpack. */ #ifndef MpiBuffGood if (bp != &BI_AuxBuff) { BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); } else #endif { error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } } scalapack-2.0.2/BLACS/SRC/sgebs2d_.c000644 000766 000024 00000012255 11745552113 017031 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Csgebs2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda) #else F_VOID_FUNC sgebs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for general real arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_FLOAT, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, send directly from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } else { send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); } #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end sgebs2d_ */ scalapack-2.0.2/BLACS/SRC/sgerv2d_.c000644 000766 000024 00000004755 11745552113 017062 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Csgerv2d(int ConTxt, int m, int n, float *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC sgerv2d_(int *ConTxt, int *m, int *n, float *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point general real receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tlda; int ierr; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_FLOAT, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/sgesd2d_.c000644 000766 000024 00000005620 11745552113 017031 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Csgesd2d(int ConTxt, int m, int n, float *A, int lda, int rdest, int cdest) #else F_VOID_FUNC sgesd2d_(int *ConTxt, int *m, int *n, float *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point general real send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int dest, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "SGESD2D", 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_FLOAT, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of sgesd2d */ scalapack-2.0.2/BLACS/SRC/sgsum2d_.c000644 000766 000024 00000015456 11640652114 017066 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Csgsum2d(int ConTxt, char *scope, char *top, int m, int n, float *A, int lda, int rdest, int cdest) #else F_VOID_FUNC sgsum2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, float *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine sum operation for real rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the sum. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the sum. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_svvsum(int, char *, char *); /* * Variable Declarations */ BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int N, length, dest, tlda, trdest, ierr; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one. Also, we can't use MPI functions if we need to * guarantee repeatability. */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); length = N * sizeof(float); /* * If A is contiguous, we can use it as one of the buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } /* * Otherwise, we must allocate both buffers */ else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_smvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->dtype = bp2->dtype = MPI_FLOAT; bp->N = bp2->N = N; switch(ttop) { case ' ': /* use MPI's reduction by default */ if (dest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, MPI_SUM, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) BI_svmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, MPI_SUM, ctxt->scp->comm); BI_svmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, BI_svvsum, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, BI_svvsum, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, BI_svvsum, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, BI_svvsum, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, BI_svvsum, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, BI_svvsum, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, BI_svvsum, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, BI_svvsum); else BI_TreeComb(ctxt, bp, bp2, N, BI_svvsum, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If I am selected to receive answer */ if (bp != &BI_AuxBuff) { if ( (ctxt->scp->Iam == dest) || (dest == -1) ) BI_svmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); BI_UpdateBuffs(bp); } else { if (BI_ActiveQ) BI_UpdateBuffs(NULL); BI_BuffIsFree(bp, 1); } } scalapack-2.0.2/BLACS/SRC/src-C.c.in000644 000766 000024 00000000076 11656312637 016721 0ustar00juliestaff000000 000000 #define CallFromC #include "@CMAKE_CURRENT_SOURCE_DIR@/@src@" scalapack-2.0.2/BLACS/SRC/strbr2d_.c000644 000766 000024 00000014577 11745552113 017073 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cstrbr2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, float *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC strbr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, float *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for trapezoidal real arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope, tuplo, tdiag; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_FLOAT, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #else send = BI_Asend; MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length, error); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } #ifdef MpiBuffGood error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); #endif #ifndef MpiBuffGood BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); #endif } scalapack-2.0.2/BLACS/SRC/strbs2d_.c000644 000766 000024 00000013635 11745552113 017066 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cstrbs2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, float *A, int lda) #else F_VOID_FUNC strbs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, float *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for trapezoidal real arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope, tuplo, tdiag; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_FLOAT, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #endif /* * Pack and use non-blocking sends for broadcast if MPI's data types aren't * more efficient */ #ifndef MpiBuffGood send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end strbs2d_ */ scalapack-2.0.2/BLACS/SRC/strrv2d_.c000644 000766 000024 00000006453 11745552113 017111 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cstrrv2d(int ConTxt, char *uplo, char *diag, int m, int n, float *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC strrv2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, float *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point trapezoidal real receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tuplo, tdiag, tlda; int ierr, length; BLACBUFF *bp; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tdiag = F2C_CharTrans(diag); tuplo = F2C_CharTrans(uplo); tdiag = Mlowcase(tdiag); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_FLOAT, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/strsd2d_.c000644 000766 000024 00000007267 11745552113 017074 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cstrsd2d(int ConTxt, char *uplo, char *diag, int m, int n, float *A, int lda, int rdest, int cdest) #else F_VOID_FUNC strsd2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, float *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point trapezoidal real send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to real two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); char tuplo, tdiag; int dest, length, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tuplo = F2C_CharTrans(uplo); tdiag = F2C_CharTrans(diag); tuplo = Mlowcase(tuplo); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "STRSD2D", 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_FLOAT, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of strsd2d */ scalapack-2.0.2/BLACS/SRC/sys2blacs_.c000644 000766 000024 00000003066 11640652114 017401 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) int Csys2blacs_handle(MPI_Comm SysCtxt) #else int sys2blacs_handle_(int *SysCtxt) #endif { #if (INTFACE == C_CALL) int i, j, DEF_WORLD; MPI_Comm *tSysCtxt; extern int BI_MaxNSysCtxt; extern MPI_Comm *BI_SysContxts; if (BI_COMM_WORLD == NULL) Cblacs_pinfo(&i, &j); if (SysCtxt == MPI_COMM_NULL) BI_BlacsErr(-1, __LINE__, __FILE__, "Cannot define a BLACS system handle based on MPI_COMM_NULL"); /* * See if we already have this system handle stored */ for (i=0; i < BI_MaxNSysCtxt; i++) if (BI_SysContxts[i] == SysCtxt) return(i); /* * The first time in this routine, we need to define MPI_COMM_WORLD, if it isn't * what is already being defined. */ DEF_WORLD = ( (!BI_SysContxts) && (SysCtxt != MPI_COMM_WORLD) ); /* * Find free slot in system context array */ for (i=0; i < BI_MaxNSysCtxt; i++) if (BI_SysContxts[i] == MPI_COMM_NULL) break; /* * If needed, get a bigger system context array */ if (i == BI_MaxNSysCtxt) { j = BI_MaxNSysCtxt + MAXNSYSCTXT; if ( (MAXNSYSCTXT == 1) && (DEF_WORLD) ) j++; tSysCtxt = (MPI_Comm *) malloc(j * sizeof(MPI_Comm)); for (i=0; i < BI_MaxNSysCtxt; i++) tSysCtxt[i] = BI_SysContxts[i]; BI_MaxNSysCtxt = j; for (j=i; j < BI_MaxNSysCtxt; j++) tSysCtxt[j] = MPI_COMM_NULL; if (BI_SysContxts) free(BI_SysContxts); BI_SysContxts = tSysCtxt; } if (DEF_WORLD) BI_SysContxts[i++] = MPI_COMM_WORLD; BI_SysContxts[i] = SysCtxt; return(i); #else return(*SysCtxt); #endif } scalapack-2.0.2/BLACS/SRC/zgamn2d_.c000644 000766 000024 00000026613 11745552113 017045 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Czgamn2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC zgamn2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amn operation for double complex rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amn of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amn of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amn of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amn. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amn. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_zvvamn(int, char *, char *); void BI_zvvamn2(int, char *, char *); void BI_zMPI_amn(void *, void *, int *, MPI_Datatype *); void BI_zMPI_amn2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amn is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_zvvamn; length = N * sizeof(DCOMPLEX); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(DCOMPLEX) > j) j = sizeof(DCOMPLEX); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_zmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_DOUBLE_COMPLEX; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_zvvamn2; length = N * sizeof(DCOMPLEX); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_zmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_DOUBLE_COMPLEX; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_zMPI_amn2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_zMPI_amn, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_zvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_zvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amn array */ if (bp != &BI_AuxBuff) BI_zvmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/zgamx2d_.c000644 000766 000024 00000026613 11745552113 017057 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Czgamx2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int *rA, int *cA, int ldia, int rdest, int cdest) #else F_VOID_FUNC zgamx2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda, int *rA, int *cA, int *ldia, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine amx operation for double complex rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RA (output) Integer Array, dimension (LDIA, N) * Contains process row that the amx of each element * of A was found on: i.e., rA(1,2) contains the process * row that the amx of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * CA (output) Integer Array, dimension (LDIA, N) * Contains process column that the amx of each element * of A was found on: i.e., cA(1,2) contains the process * column that the max/min of A(1,2) was found on. * Values are left on process {rdest, cdest} only, others * may be modified, but not left with interesting data. * If rdest == -1, then result is left on all processes in scope. * If LDIA == -1, this array is not accessed, and need not exist. * * LDIA (input) Ptr to int * If (LDIA == -1), then the arrays RA and CA are not accessed. * ELSE leading dimension of the arrays RA and CA. LDIA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the amx. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the amx. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_zvvamx(int, char *, char *); void BI_zvvamx2(int, char *, char *); void BI_zMPI_amx(void *, void *, int *, MPI_Datatype *); void BI_zMPI_amx2(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ VVFUNPTR vvop; BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int i, j, N, dest, idist, length, tlda, tldia, trdest, ierr; int len[2]; MPI_Aint disp[2]; MPI_Datatype dtypes[2]; MPI_Op BlacComb; MPI_Datatype MyType; BI_DistType *dist, mydist; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); if (Mpval(ldia) < Mpval(m)) { if (Mpval(ldia) != -1) BI_BlacsWarn(Mpval(ConTxt), __LINE__, __FILE__, "LDIA too small (LDIA=%d, but M=%d)", Mpval(ldia), Mpval(m)); } #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); if (Mpval(ldia) < Mpval(m)) tldia = Mpval(m); else tldia = Mpval(ldia); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); /* * If process who has amx is to be communicated, must set up distance * vector after value vector */ if (Mpval(ldia) != -1) { vvop = BI_zvvamx; length = N * sizeof(DCOMPLEX); i = length % sizeof(BI_DistType); /* ensure dist vec aligned correctly */ if (i) length += sizeof(BI_DistType) - i; idist = length; length += N * sizeof(BI_DistType); /* * For performance, insist second buffer is at least 8-byte aligned */ j = 8; if (sizeof(DCOMPLEX) > j) j = sizeof(DCOMPLEX); i = length % j; if (i) length += j - i; i = 2 * length; bp = BI_GetBuff(i); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_zmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); /* * Fill in distance vector */ if (dest == -1) mydist = ctxt->scp->Iam; else mydist = (ctxt->scp->Np + ctxt->scp->Iam - dest) % ctxt->scp->Np; dist = (BI_DistType *) &bp->Buff[idist]; for (i=0; i < N; i++) dist[i] = mydist; /* * Create the MPI datatype holding both user's buffer and distance vector */ len[0] = len[1] = N; disp[0] = 0; disp[1] = idist; dtypes[0] = MPI_DOUBLE_COMPLEX; dtypes[1] = BI_MpiDistType; #ifdef ZeroByteTypeBug if (N > 0) { #endif i = 2; ierr=MPI_Type_struct(i, len, disp, dtypes, &MyType); ierr=MPI_Type_commit(&MyType); bp->N = bp2->N = 1; bp->dtype = bp2->dtype = MyType; #ifdef ZeroByteTypeBug } else { bp->N = bp2->N = 0; bp->dtype = bp2->dtype = MPI_INT; } #endif } else { vvop = BI_zvvamx2; length = N * sizeof(DCOMPLEX); /* * If A is contiguous, we can use it as one of our buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_zmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->N = bp2->N = N; bp->dtype = bp2->dtype = MPI_DOUBLE_COMPLEX; } switch(ttop) { case ' ': /* use MPI's reduction by default */ i = 1; if (Mpval(ldia) == -1) { ierr=MPI_Op_create(BI_zMPI_amx2, i, &BlacComb); } else { ierr=MPI_Op_create(BI_zMPI_amx, i, &BlacComb); BI_AuxBuff.Len = N; /* set this up for the MPI OP wrappers */ } if (trdest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) { BI_zvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_zvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, (BI_DistType *) &bp2->Buff[idist], trdest, Mpval(cdest)); } ierr=MPI_Op_free(&BlacComb); if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, vvop); else BI_TreeComb(ctxt, bp, bp2, N, vvop, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } if (Mpval(ldia) != -1) #ifdef ZeroByteTypeBug if (N > 0) #endif ierr=BI_MPI_TYPE_FREE(&MyType); /* * If I am selected to receive answer */ if ( (ctxt->scp->Iam == dest) || (dest == -1) ) { /* * Translate the distances stored in the latter part of bp->Buff into * process grid coordinates, and output these coordinates in the * arrays rA and cA. */ if (Mpval(ldia) != -1) BI_TransDist(ctxt, tscope, Mpval(m), Mpval(n), rA, cA, tldia, dist, trdest, Mpval(cdest)); /* * Unpack the amx array */ if (bp != &BI_AuxBuff) BI_zvmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } } scalapack-2.0.2/BLACS/SRC/zgebr2d_.c000644 000766 000024 00000014017 11745552113 017035 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Czgebr2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC zgebr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for general double complex arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_DOUBLE_COMPLEX, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, receive and send directly to/from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } /* * If A is not contiguous, we receive message as packed so it can be * forwarded without further system intervention */ else { send = BI_Asend; MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length, error); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif } #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If we buffered, unpack. */ #ifndef MpiBuffGood if (bp != &BI_AuxBuff) { BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); } else #endif { error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } } scalapack-2.0.2/BLACS/SRC/zgebs2d_.c000644 000766 000024 00000012314 11745552113 017034 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Czgebs2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda) #else F_VOID_FUNC zgebs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for general double complex arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_DOUBLE_COMPLEX, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifndef MpiBuffGood /* * If A is contiguous, send directly from it */ else if ( (tlda == Mpval(m)) || (Mpval(n) == 1) ) { #endif send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #ifndef MpiBuffGood } else { send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); } #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end zgebs2d_ */ scalapack-2.0.2/BLACS/SRC/zgerv2d_.c000644 000766 000024 00000005014 11745552113 017056 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Czgerv2d(int ConTxt, int m, int n, double *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC zgerv2d_(int *ConTxt, int *m, int *n, double *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point general double complex receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tlda; int ierr; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_DOUBLE_COMPLEX, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/zgesd2d_.c000644 000766 000024 00000005657 11745552113 017052 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Czgesd2d(int ConTxt, int m, int n, double *A, int lda, int rdest, int cdest) #else F_VOID_FUNC zgesd2d_(int *ConTxt, int *m, int *n, double *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point general double complex send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int dest, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "ZGESD2D", 'a', 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiGeType(ctxt, Mpval(m), Mpval(n), tlda, MPI_DOUBLE_COMPLEX, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of zgesd2d */ scalapack-2.0.2/BLACS/SRC/zgsum2d_.c000644 000766 000024 00000016021 11640652114 017062 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Czgsum2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int rdest, int cdest) #else F_VOID_FUNC zgsum2d_(int *ConTxt, F_CHAR scope, F_CHAR top, int *m, int *n, double *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Combine sum operation for double complex rectangular matrices. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination of the sum. * If rdest == -1, then result is left on all processes in scope. * * CDEST (input) Ptr to int * The process column of the destination of the sum. * If rdest == -1, then CDEST ignored. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); MPI_Datatype BI_GetMpiGeType(BLACSCONTEXT *, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_MringComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_TreeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR, int, int); void BI_BeComb(BLACSCONTEXT *, BLACBUFF *, BLACBUFF *, int, VVFUNPTR); void BI_zvvsum(int, char *, char *); void BI_zMPI_sum(void *, void *, int *, MPI_Datatype *); /* * Variable Declarations */ BLACBUFF *bp, *bp2; BLACSCONTEXT *ctxt; char ttop, tscope; int N, length, dest, tlda, trdest, ierr; MPI_Op BlacComb; extern BLACBUFF *BI_ActiveQ; extern BLACBUFF BI_AuxBuff; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); /* * If the user has set the default combine topology, use it instead of * BLACS default */ #ifdef DefCombTop if (ttop == ' ') ttop = DefCombTop; #endif if (Mpval(cdest) == -1) trdest = -1; else trdest = Mpval(rdest); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_COMB, __FILE__, tscope, 'u', 'u', Mpval(m), Mpval(n), Mpval(lda), 1, &trdest, Mpaddress(cdest)); #endif if (Mpval(lda) >= Mpval(m)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; if (trdest == -1) dest = -1; else dest = Mpval(cdest); break; case 'c': ctxt->scp = &ctxt->cscp; dest = trdest; break; case 'a': ctxt->scp = &ctxt->ascp; if (trdest == -1) dest = -1; else dest = Mvkpnum(ctxt, trdest, Mpval(cdest)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } /* * It's not defined how MPI reacts to 0 element reductions, so use BLACS 1-tree * topology if we've got one. Also, we can't use MPI functions if we need to * guarantee repeatability. */ if (ttop == ' ') if ( (Mpval(m) < 1) || (Mpval(n) < 1) || (ctxt->TopsRepeat) ) ttop = '1'; N = Mpval(m) * Mpval(n); length = N * sizeof(DCOMPLEX); /* * If A is contiguous, we can use it as one of the buffers */ if ( (Mpval(m) == tlda) || (Mpval(n) == 1) ) { bp = &BI_AuxBuff; bp->Buff = (char *) A; bp2 = BI_GetBuff(length); } /* * Otherwise, we must allocate both buffers */ else { bp = BI_GetBuff(length*2); bp2 = &BI_AuxBuff; bp2->Buff = &bp->Buff[length]; BI_zmvcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); } bp->dtype = bp2->dtype = MPI_DOUBLE_COMPLEX; bp->N = bp2->N = N; switch(ttop) { case ' ': /* use MPI's reduction by default */ length = 1; ierr=MPI_Op_create(BI_zMPI_sum, length, &BlacComb); if (dest != -1) { ierr=MPI_Reduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, dest, ctxt->scp->comm); if (ctxt->scp->Iam == dest) BI_zvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } else { ierr=MPI_Allreduce(bp->Buff, bp2->Buff, bp->N, bp->dtype, BlacComb, ctxt->scp->comm); BI_zvmcopy(Mpval(m), Mpval(n), A, tlda, bp2->Buff); } ierr=MPI_Op_free(&BlacComb); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; break; case 'i': BI_MringComb(ctxt, bp, bp2, N, BI_zvvsum, dest, 1); break; case 'd': BI_MringComb(ctxt, bp, bp2, N, BI_zvvsum, dest, -1); break; case 's': BI_MringComb(ctxt, bp, bp2, N, BI_zvvsum, dest, 2); break; case 'm': BI_MringComb(ctxt, bp, bp2, N, BI_zvvsum, dest, ctxt->Nr_co); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeComb(ctxt, bp, bp2, N, BI_zvvsum, dest, ttop-47); break; case 'f': BI_TreeComb(ctxt, bp, bp2, N, BI_zvvsum, dest, FULLCON); break; case 't': BI_TreeComb(ctxt, bp, bp2, N, BI_zvvsum, dest, ctxt->Nb_co); break; case 'h': /* * Use bidirectional exchange if everyone wants answer */ if ( (trdest == -1) && !(ctxt->TopsCohrnt) ) BI_BeComb(ctxt, bp, bp2, N, BI_zvvsum); else BI_TreeComb(ctxt, bp, bp2, N, BI_zvvsum, dest, 2); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } /* * If I am selected to receive answer */ if (bp != &BI_AuxBuff) { if ( (ctxt->scp->Iam == dest) || (dest == -1) ) BI_zvmcopy(Mpval(m), Mpval(n), A, tlda, bp->Buff); BI_UpdateBuffs(bp); } else { if (BI_ActiveQ) BI_UpdateBuffs(NULL); BI_BuffIsFree(bp, 1); } } scalapack-2.0.2/BLACS/SRC/ztrbr2d_.c000644 000766 000024 00000014636 11745552113 017076 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cztrbr2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, double *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC ztrbr2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, double *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/receive for trapezoidal double complex arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_IdringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_SringBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_MpathBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_TreeBR(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); BLACSCONTEXT *ctxt; BLACBUFF *bp=NULL; SDRVPTR send; MPI_Datatype MatTyp; int length, src, tlda, error, one=1; char ttop, tscope, tuplo, tdiag; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BR, __FILE__, tscope, tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(m) <= Mpval(lda)) tlda = Mpval(lda); else tlda = Mpval(m); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; src = Mpval(csrc); break; case 'c': ctxt->scp = &ctxt->cscp; src = Mpval(rsrc); break; case 'a': ctxt->scp = &ctxt->ascp; src = Mvkpnum(ctxt, Mpval(rsrc), Mpval(csrc)); break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_DOUBLE_COMPLEX, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, src, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #else send = BI_Asend; MPI_Pack_size(one, MatTyp, ctxt->scp->comm, &length, error); bp = BI_GetBuff(length); bp->N = length; bp->dtype = MPI_PACKED; #if ZeroByteTypeBug if (MatTyp == MPI_BYTE) { send = BI_Ssend; bp->N = 0; bp->dtype = MPI_BYTE; } #endif #endif switch(ttop) { case 'h': error = BI_HypBR(ctxt, bp, send, src); if (error == NPOW2) BI_TreeBR(ctxt, bp, send, src, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBR(ctxt, bp, send, src, ttop-47); break; case 't': BI_TreeBR(ctxt, bp, send, src, ctxt->Nb_bs); break; case 'i': BI_IdringBR(ctxt, bp, send, src, 1); break; case 'd': BI_IdringBR(ctxt, bp, send, src, -1); break; case 's': BI_SringBR(ctxt, bp, send, src); break; case 'm': BI_MpathBR(ctxt, bp, send, src, ctxt->Nr_bs); break; case 'f': BI_MpathBR(ctxt, bp, send, src, FULLCON); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } #ifdef MpiBuffGood error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); #endif #ifndef MpiBuffGood BI_Unpack(ctxt, (BVOID *) A, bp, MatTyp); BI_UpdateBuffs(bp); #endif } scalapack-2.0.2/BLACS/SRC/ztrbs2d_.c000644 000766 000024 00000013674 11745552113 017100 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cztrbs2d(int ConTxt, char *scope, char *top, char *uplo, char *diag, int m, int n, double *A, int lda) #else F_VOID_FUNC ztrbs2d_(int *ConTxt, F_CHAR scope, F_CHAR top, F_CHAR uplo, F_CHAR diag, int *m, int *n, double *A, int *lda) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Broadcast/send for trapezoidal double complex arrays. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * SCOPE (input) Ptr to char * Limit the scope of the operation. * = 'R' : Operation is performed by a process row. * = 'C' : Operation is performed by a process column. * = 'A' : Operation is performed by all processes in grid. * * TOP (input) Ptr to char * Controls fashion in which messages flow within the operation. * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); int BI_HypBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_IdringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_SringBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR); void BI_MpathBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_TreeBS(BLACSCONTEXT *, BLACBUFF *, SDRVPTR, int); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); char ttop, tscope, tuplo, tdiag; int error, tlda; MPI_Datatype MatTyp; SDRVPTR send; BLACBUFF *bp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; /* * get context, lowcase char variables, and perform parameter checking */ MGetConTxt(Mpval(ConTxt), ctxt); ttop = F2C_CharTrans(top); ttop = Mlowcase(ttop); tscope = F2C_CharTrans(scope); tscope = Mlowcase(tscope); tuplo = F2C_CharTrans(uplo); tuplo = Mlowcase(tuplo); tdiag = F2C_CharTrans(diag); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_BS, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 0, NULL, NULL); #endif /* * If the user has set the default broadcast topology, use it instead of * BLACS default */ #ifdef DefBSTop if (ttop == ' ') ttop = DefBSTop; #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); switch(tscope) { case 'r': ctxt->scp = &ctxt->rscp; break; case 'c': ctxt->scp = &ctxt->cscp; break; case 'a': ctxt->scp = &ctxt->ascp; break; default: BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown scope '%c'", tscope); } MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_DOUBLE_COMPLEX, &BI_AuxBuff.N); /* * If using default topology, use MPI native broadcast */ if (ttop == ' ') { error=MPI_Bcast(A, BI_AuxBuff.N, MatTyp, ctxt->scp->Iam, ctxt->scp->comm); error=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); return; } /* * If MPI handles non-contiguous buffering well, always use MPI data types * instead of packing */ #ifdef MpiBuffGood send = BI_Ssend; BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; bp = &BI_AuxBuff; #endif /* * Pack and use non-blocking sends for broadcast if MPI's data types aren't * more efficient */ #ifndef MpiBuffGood send = BI_Asend; bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); #endif /* * Call correct topology for BS/BR */ switch(ttop) { case 'h': error = BI_HypBS(ctxt, bp, send); if (error == NPOW2) BI_TreeBS(ctxt, bp, send, 2); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': BI_TreeBS(ctxt, bp, send, ttop-47); break; case 't': BI_TreeBS(ctxt, bp, send, ctxt->Nb_bs); break; case 'i': BI_IdringBS(ctxt, bp, send, 1); break; case 'd': BI_IdringBS(ctxt, bp, send, -1); break; case 's': BI_SringBS(ctxt, bp, send); break; case 'f': BI_MpathBS(ctxt, bp, send, FULLCON); break; case 'm': BI_MpathBS(ctxt, bp, send, ctxt->Nr_bs); break; default : BI_BlacsErr(Mpval(ConTxt), __LINE__, __FILE__, "Unknown topology '%c'", ttop); } error=BI_MPI_TYPE_FREE(&MatTyp); if (bp == &BI_AuxBuff) { if (BI_ActiveQ) BI_UpdateBuffs(NULL); } else BI_UpdateBuffs(bp); } /* end ztrbs2d_ */ scalapack-2.0.2/BLACS/SRC/ztrrv2d_.c000644 000766 000024 00000006512 11745552113 017114 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cztrrv2d(int ConTxt, char *uplo, char *diag, int m, int n, double *A, int lda, int rsrc, int csrc) #else F_VOID_FUNC ztrrv2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, double *A, int *lda, int *rsrc, int *csrc) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Globally-blocking point to point trapezoidal double complex receive. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (output) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * * RSRC (input) Ptr to int * The process row of the source of the matrix. * * CSRC (input) Ptr to int * The process column of the source of the matrix. * * * ------------------------------------------------------------------------ */ { /* * Prototypes and variable declarations */ void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); void BI_Unpack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Srecv(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); int tuplo, tdiag, tlda; int ierr, length; BLACBUFF *bp; MPI_Datatype MatTyp; BLACSCONTEXT *ctxt; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tdiag = F2C_CharTrans(diag); tuplo = F2C_CharTrans(uplo); tdiag = Mlowcase(tdiag); tuplo = Mlowcase(tuplo); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_RV, __FILE__, 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rsrc), Mpaddress(csrc)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_DOUBLE_COMPLEX, &BI_AuxBuff.N); BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Srecv(ctxt, Mkpnum(ctxt, Mpval(rsrc), Mpval(csrc)), PT2PTID, &BI_AuxBuff); ierr=BI_MPI_TYPE_FREE(&MatTyp); if (BI_ActiveQ) BI_UpdateBuffs(NULL); } scalapack-2.0.2/BLACS/SRC/ztrsd2d_.c000644 000766 000024 00000007326 11745552113 017077 0ustar00juliestaff000000 000000 #include "Bdef.h" #if (INTFACE == C_CALL) void Cztrsd2d(int ConTxt, char *uplo, char *diag, int m, int n, double *A, int lda, int rdest, int cdest) #else F_VOID_FUNC ztrsd2d_(int *ConTxt, F_CHAR uplo, F_CHAR diag, int *m, int *n, double *A, int *lda, int *rdest, int *cdest) #endif /* * -- V1.1 BLACS routine -- * University of Tennessee, May 1, 1996 * Written by Clint Whaley. * * Purpose * ======= * Locally-blocking point-to-point trapezoidal double complex send. * * Arguments * ========= * * ConTxt (input) Ptr to int * Index into MyConTxts00 (my contexts array). * * UPLO (input) Ptr to char * Specifies the part of the matrix to be sent. * = 'U': Upper trapezoidal part * ELSE : Lower trapezoidal part * * DIAG (input) Ptr to char * Specifies whether the matrix is unit diagonal or not. * = 'U': Matrix is unit diagonal, diagonal not communicated. * ELSE : Matrix is not unit diagonal, diagonal is communicated. * * M (input) Ptr to int * The number of rows of the matrix A. M >= 0. * * N (input) Ptr to int * The number of columns of the matrix A. N >= 0. * * A (input) Ptr to double complex two dimensional array * The m by n matrix A. Fortran77 (column-major) storage * assumed. * If UPLO = 'U', only the upper trapezoid is accessed; * if UPLO = 'L', only the lower trapezoid is accessed. * * LDA (input) Ptr to int * The leading dimension of the array A. LDA >= M. * * RDEST (input) Ptr to int * The process row of the destination process. * * CDEST (input) Ptr to int * The process column of the destination process. * * ------------------------------------------------------------------------ */ { void BI_ArgCheck(int, int, char *, char, char, char, int, int, int, int, int *, int *); MPI_Datatype BI_GetMpiTrType(BLACSCONTEXT *, char, char, int, int, int, MPI_Datatype, int *); BLACBUFF *BI_Pack(BLACSCONTEXT *, BVOID *, BLACBUFF *, MPI_Datatype); void BI_Ssend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_Asend(BLACSCONTEXT *, int, int, BLACBUFF *); void BI_UpdateBuffs(BLACBUFF *); BLACBUFF *BI_GetBuff(int); int BI_BuffIsFree(BLACBUFF *, int); char tuplo, tdiag; int dest, length, tlda, ierr; BLACBUFF *bp; BLACSCONTEXT *ctxt; MPI_Datatype MatTyp; extern BLACBUFF BI_AuxBuff, *BI_ActiveQ; MGetConTxt(Mpval(ConTxt), ctxt); tuplo = F2C_CharTrans(uplo); tdiag = F2C_CharTrans(diag); tuplo = Mlowcase(tuplo); tdiag = Mlowcase(tdiag); #if (BlacsDebugLvl > 0) BI_ArgCheck(Mpval(ConTxt), RT_SD, "ZTRSD2D", 'a', tuplo, tdiag, Mpval(m), Mpval(n), Mpval(lda), 1, Mpaddress(rdest), Mpaddress(cdest)); #endif if (Mpval(lda) < Mpval(m)) tlda = Mpval(m); else tlda = Mpval(lda); dest = Mvkpnum(ctxt, Mpval(rdest), Mpval(cdest)); ctxt->scp = &ctxt->pscp; MatTyp = BI_GetMpiTrType(ctxt, tuplo, tdiag, Mpval(m), Mpval(n), tlda, MPI_DOUBLE_COMPLEX, &BI_AuxBuff.N); #ifdef SndIsLocBlk BI_AuxBuff.Buff = (char *) A; BI_AuxBuff.dtype = MatTyp; BI_Ssend(ctxt, dest, PT2PTID, &BI_AuxBuff); #else bp = BI_Pack(ctxt, (BVOID *) A, NULL, MatTyp); BI_Asend(ctxt, Mkpnum(ctxt, Mpval(rdest), Mpval(cdest)), PT2PTID, bp); #endif ierr=BI_MPI_TYPE_FREE(&MatTyp); /* * Having started the async send, update the buffers (reform links, check if * active buffers have become inactive, etc.) */ #ifdef SndIsLocBlk if (BI_ActiveQ) BI_UpdateBuffs(NULL); #else BI_UpdateBuffs(bp); #endif } /* end of ztrsd2d */ scalapack-2.0.2/BLACS/INSTALL/Cintface.c000644 000766 000024 00000000410 11656312637 017530 0ustar00juliestaff000000 000000 #include void c_intface_(int *i) { fprintf(stdout, "Add_\n"); } void c_intface(int *i) { fprintf(stdout, "NoChange\n"); } void c_intface__(int *i) { fprintf(stdout, "f77IsF2C\n"); } void C_INTFACE(int *i) { fprintf(stdout, "UpCase\n"); } scalapack-2.0.2/BLACS/INSTALL/CMakeLists.txt000644 000766 000024 00000000157 11672755364 020426 0ustar00juliestaff000000 000000 cmake_minimum_required(VERSION 2.8) project(INSTALL C Fortran) add_executable(xintface Fintface.f Cintface.c) scalapack-2.0.2/BLACS/INSTALL/cmpi_sane.c000644 000766 000024 00000003750 11640652114 017752 0ustar00juliestaff000000 000000 #include #include "mpi.h" /* * Increase/decrease this value to test if a process of a particular size can * be spawned to a particular machine */ #define WASTE_SIZE 100 #define NPROC 4 main(int narg, char **args) /* * This program checks to make sure that you can run a basic program on your * machine using MPI. Can increase WASTE_SIZE if you think size of executable * may be causing launching problems. */ { int i, Iam, Np; int irank[NPROC]; double WasteOfSpace[WASTE_SIZE]; MPI_Comm mcom; MPI_Group wgrp, mgrp; MPI_Status stat; MPI_Init(&narg, &args); MPI_Comm_size(MPI_COMM_WORLD, &Np); if (Np < NPROC) { fprintf(stderr, "Not enough processes to run sanity check; need %d, but I've only got %d\n", NPROC, Np); MPI_Abort(MPI_COMM_WORLD, -1); } for (i=0; i != WASTE_SIZE; i++) WasteOfSpace[i] = 0.0; /* page in Waste */ /* * Form context with NPROC members */ for (i=0; i != NPROC; i++) irank[i] = i; MPI_Comm_group(MPI_COMM_WORLD, &wgrp); MPI_Group_incl(wgrp, NPROC, irank, &mgrp); MPI_Comm_create(MPI_COMM_WORLD, mgrp, &mcom); MPI_Group_free(&mgrp); /* * Everyone in new communicator sends a message to his neighbor */ if (mcom != MPI_COMM_NULL) { MPI_Comm_rank(mcom, &Iam); /* * Odd nodes receive first, so we don't hang if MPI_Send is globally blocking */ if (Iam % 2) { MPI_Recv(&i, 1, MPI_INT, (NPROC+Iam-1)%NPROC, 0, mcom, &stat); MPI_Send(&Iam, 1, MPI_INT, (Iam+1)%NPROC, 0, mcom); } else { MPI_Send(&Iam, 1, MPI_INT, (Iam+1)%NPROC, 0, mcom); MPI_Recv(&i, 1, MPI_INT, (NPROC+Iam-1)%NPROC, 0, mcom, &stat); } /* * Make sure we've received the right information */ if (i != (NPROC+Iam-1)%NPROC) { fprintf(stderr, "Communication does not seem to work properly!!\n"); MPI_Abort(MPI_COMM_WORLD, -1); } } fprintf(stdout, "%d: C MPI sanity test passed\n", Iam); MPI_Finalize(); exit(0); } scalapack-2.0.2/BLACS/INSTALL/Fintface.f000644 000766 000024 00000000164 11640652114 017532 0ustar00juliestaff000000 000000 program intface external c_intface integer i call c_intface(i) stop end scalapack-2.0.2/BLACS/INSTALL/fmpi_sane.f000644 000766 000024 00000004416 11640652114 017760 0ustar00juliestaff000000 000000 program fmpi_sane * * This program checks to make sure that you can run a basic program * on your machine using the Fortran77 interface to MPI. * Can increase parameter wastesz, if you think size of executable * is causing launching problem. * include 'mpif.h' integer nproc, wastesz parameter (nproc = 4) parameter (wastesz = 100) integer i, Iam, Np, ierr integer mcom, wgrp, mgrp integer irank(nproc), stat(MPI_STATUS_SIZE) double precision WasteSpc(wastesz) call mpi_init(ierr) call mpi_comm_size(MPI_COMM_WORLD, Np, ierr) if (Np .lt. nproc) then print*,'Not enough processes to run sanity check' call mpi_abort(MPI_COMM_WORLD, -1, ierr) end if * * Access all of WasteSpc * do 10 i = 1, wastesz WasteSpc(i) = 0.0D0 10 continue * * Form context with NPROC members * do 20 i = 1, nproc irank(i) = i - 1 20 continue call mpi_comm_group(MPI_COMM_WORLD, wgrp, ierr) call mpi_group_incl(wgrp, nproc, irank, mgrp, ierr) call mpi_comm_create(MPI_COMM_WORLD, mgrp, mcom, ierr) call mpi_group_free(mgrp, ierr) * * Everyone in new communicator sends a message to his neighbor * if (mcom .ne. MPI_COMM_NULL) then call mpi_comm_rank(mcom, Iam, ierr) * * Odd nodes receive first, so we don't hang if MPI_Send is * globally blocking * if (mod(Iam, 2) .ne. 0) then call mpi_recv(i, 1, MPI_INTEGER, MOD(nproc+Iam-1, nproc), & 0, mcom, stat, ierr) call mpi_send(Iam, 1, MPI_INTEGER, MOD(Iam+1, nproc), & 0, mcom, ierr) else call mpi_send(Iam, 1, MPI_INTEGER, MOD(Iam+1, nproc), & 0, mcom, ierr) call mpi_recv(i, 1, MPI_INTEGER, MOD(nproc+Iam-1, nproc), & 0, mcom, stat, ierr) end if * * Make sure we've received the right information * if (i .ne. MOD(nproc+Iam-1, nproc)) then print*,'Communication does not seem to work properly!!' call mpi_abort(MPI_COMM_WORLD, -1, ierr) end if end if print*,Iam,' F77 MPI sanity test passed.' call mpi_finalize(ierr) stop end scalapack-2.0.2/BLACS/INSTALL/Makefile_install000644 000766 000024 00000002244 11672755364 021053 0ustar00juliestaff000000 000000 include ../../SLmake.inc help : @ echo " " @ echo " " @ echo "You need to specify which INSTALL executable to build." @ echo "General choices are: xsize, xintface, xsyserrors" @ echo "MPI specific choices are: xsyserrors, xtc_CsameF77, xtc_UseMpich," @ echo " xcmpi_sane, xfmpi_sane" @ echo " " @ echo "Here is a brief explanation of each of these routines: " cat README xsize : size.o $(CCLOADER) $(CCLOADFLAGS) -o $@ size.o xintface : Fintface.o Cintface.o $(FCLOADER) $(FCLOADFLAGS) -o $@ Fintface.o Cintface.o xsyserrors : syserrors.o $(CCLOADER) $(CCLOADFLAGS) -o $@ syserrors.o xtc_CsameFC : tc_fCsameF77.o tc_cCsameF77.o $(FCLOADER) $(FCLOADFLAGS) -o $@ tc_fCsameF77.o tc_cCsameF77.o xtc_UseMpich : tc_UseMpich.o $(CCLOADER) $(CCLOADFLAGS) -o $@ tc_UseMpich.o xcmpi_sane : cmpi_sane.o $(CCLOADER) $(CCLOADFLAGS) -o $@ cmpi_sane.o xfmpi_sane : mpif.h fmpi_sane.o $(FCLOADER) $(FCLOADFLAGS) -o $@ fmpi_sane.o clean: rm -f size.o Fintface.o Cintface.o syserrors.o transcomm.o \ mpi_sane.o fmpi_sane.o tc_UseMpich.o tc_fCsameF77.o tc_cCsameF77.o .f.o: ; $(FC) -c $(FCFLAGS) $*.f .c.o: $(CC) -c $(CCFLAGS) $(CDEFS) $< scalapack-2.0.2/BLACS/INSTALL/mpif.h000644 000766 000024 00000007116 11640652114 016754 0ustar00juliestaff000000 000000 ! -*- fortran -*- ! ! Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana ! University Research and Technology ! Corporation. All rights reserved. ! Copyright (c) 2004-2005 The University of Tennessee and The University ! of Tennessee Research Foundation. All rights ! reserved. ! Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, ! University of Stuttgart. All rights reserved. ! Copyright (c) 2004-2005 The Regents of the University of California. ! All rights reserved. ! Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. ! $COPYRIGHT$ ! ! Additional copyrights may follow ! ! $HEADER$ ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! Do ***not*** copy this file to the directory where your Fortran ! fortran application is compiled unless it is absolutely necessary! Most ! modern Fortran compilers now support the -I command line flag, which ! tells the compiler where to find .h files (specifically, this one). For ! example: ! ! shell$ mpif77 foo.f -o foo -I$OMPI_HOME/include ! ! will probably do the trick (assuming that you have set OMPI_HOME ! properly). ! ! That being said, OMPI's "mpif77" wrapper compiler should ! automatically include the -I option for you. The following command ! should be equivalent to the command listed above: ! ! shell$ mpif77 foo.f -o foo ! ! You should not copy this file to your local directory because it is ! possible that this file will be changed between versions of Open MPI. ! Indeed, this mpif.h is incompatible with the mpif.f of other ! implementations of MPI. Using this mpif.h with other implementations ! of MPI, or with other versions of Open MPI will result in undefined ! behavior (to include incorrect results, segmentation faults, ! unexplainable "hanging" in your application, etc.). Always use the ! -I command line option instead (or let mpif77 do it for you). ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! Include the back-end file that has the bulk of the MPI Fortran ! interface. ! include 'mpif-common.h' ! ! These "external" statements are specific to the MPI F77 interface ! (and are toxic to the MPI F90 interface), and are therefore in the ! MPI F77-specific header file (i.e., this one). ! external MPI_NULL_COPY_FN, MPI_NULL_DELETE_FN external MPI_COMM_NULL_COPY_FN, MPI_COMM_NULL_DELETE_FN external MPI_TYPE_NULL_COPY_FN, MPI_TYPE_NULL_DELETE_FN external MPI_DUP_FN, MPI_COMM_DUP_FN, MPI_TYPE_DUP_FN external MPI_WIN_NULL_COPY_FN external MPI_WIN_NULL_DELETE_FN external MPI_WIN_DUP_FN ! Note that MPI_CONVERSION_FN_NULL is a "constant" (it is only ever ! checked for comparison; it is never invoked), but it is passed as ! a function pointer (to MPI_REGISTER_DATAREP) and therefore must be ! the same size/type. It is therefore external'ed here, and not ! defined with an integer value in mpif-common.h. external MPI_CONVERSION_FN_NULL ! ! double precision functions ! external MPI_WTIME, MPI_WTICK , PMPI_WTICK, PMPI_WTIME double precision MPI_WTIME, MPI_WTICK , PMPI_WTICK, PMPI_WTIME scalapack-2.0.2/BLACS/INSTALL/README000644 000766 000024 00000002325 11640652114 016525 0ustar00juliestaff000000 000000 These routines help to configure the BLACS and its tester during installation. See the paper "Installing and testing the BLACS" for details. xintface will tell you the correct setting for Bmake.inc's INTFACE macro. xsize prints out the correct sizes for various data types, which are hardwired in btprim_PVM.c ibtsizeof. ============================ MPI SPECIFIC ROUTINES ============================ xsyserrors indicates the correct setting for Bmake.inc's SYSERRORS macro. xcmpi_sane will give you a sanity test to see if the most basic MPI program will run on your system using the C interface to MPI. xfmpi_sane will give you a sanity test to see if the most basic MPI program will run on your system using the Fortran77 interface to MPI. ***** FINDING THE CORRECT TRANSCOMM SETTING ***** The remaining routines exist in order to allow the user to find the correct setting for Bmake.inc's TRANSCOMM macro. THESE ROUTINES USE HEURISTICS, AND THUS MAY BE INCORRECT. First make and run xtc_CsameF77. If this reports back not to set TRANSCOMM to -DCSameF77 or does not complete, make and run xtc_UseMpich. If this fails to compile or does not tell you what to set TRANSCOMM to, you must leave TRANSCOMM blank. scalapack-2.0.2/BLACS/INSTALL/size.c000644 000766 000024 00000000303 11640652114 016755 0ustar00juliestaff000000 000000 #include main() { printf("ISIZE=%d\nSSIZE=%d\nDSIZE=%d\nCSIZE=%d\nZSIZE=%d\n", sizeof(int), sizeof(float), sizeof(double), 2*sizeof(float), 2*sizeof(double)); } scalapack-2.0.2/BLACS/INSTALL/syserrors.c000644 000766 000024 00000001423 11640652114 020062 0ustar00juliestaff000000 000000 #include #include main(int nargs, char **args) { MPI_Datatype Dtype, Dt; int i, j, ierr; MPI_Init(&nargs, &args); printf( "If this routine does not complete, you should set SYSERRORS = -DZeroByteTypeBug.\n"); i = 0; j = 1; ierr = MPI_Type_indexed(1, &i, &j, MPI_INT, &Dtype); if (ierr == MPI_SUCCESS) { MPI_Type_commit(&Dtype); ierr = MPI_Type_vector(0, 1, 1, MPI_INT, &Dt); if (ierr != MPI_SUCCESS) printf("MPI_Type_vector returned %d, set SYSERRORS = -DZeroByteTypeBug\n", ierr); else MPI_Type_commit(&Dt); } else printf("MPI_Type_commit returned %d, set SYSERRORS = -DZeroByteTypeBug\n", ierr); if (ierr == MPI_SUCCESS) printf("Leave SYSERRORS blank for this system.\n"); MPI_Finalize(); } scalapack-2.0.2/BLACS/INSTALL/tc_cCsameF77.c000644 000766 000024 00000001512 11640652114 020153 0ustar00juliestaff000000 000000 #include int Ccommcheck(int F77World, int f77comm) { int Np, Iam, i, OK=1; if (sizeof(int) != sizeof(MPI_Comm)) OK=0; else if ((MPI_Comm) F77World != MPI_COMM_WORLD) OK=0; else { MPI_Comm_rank(MPI_COMM_WORLD, &Iam); if (Iam > 1) OK = ((MPI_Comm) f77comm == MPI_COMM_NULL); else { i = MPI_Comm_size((MPI_Comm) f77comm, &Np); if (i != MPI_SUCCESS) OK = 0; else if (Np != 2) OK = 0; } } MPI_Allreduce(&OK, &i, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); return(i); } /* * Fortran interfaces */ int CCOMMCHECK(int *F77World, int *f77comm) { return(Ccommcheck(*F77World, *f77comm)); } int ccommcheck_(int *F77World, int *f77comm) { return(Ccommcheck(*F77World, *f77comm)); } int ccommcheck(int *F77World, int *f77comm) { return(Ccommcheck(*F77World, *f77comm)); } scalapack-2.0.2/BLACS/INSTALL/tc_fCsameF77.f000644 000766 000024 00000002477 11640652114 020174 0ustar00juliestaff000000 000000 program tctst include 'mpif.h' integer f77com, wgrp, f77grp, Iam, i, ierr integer irank(2) external Ccommcheck integer Ccommcheck call mpi_init(ierr) call mpi_comm_size(MPI_COMM_WORLD, i, ierr) call mpi_comm_rank(MPI_COMM_WORLD, Iam, ierr) if (i .lt. 2) then print*,'Need at least 2 processes to run test, aborting.' else if (Iam .eq. 0) then print*,'If this routine does not complete successfully,' print*,'Do _NOT_ set TRANSCOMM = -DCSameF77' print*,' ' print*,' ' end if * * Form context with 2 members * irank(1) = 0 irank(2) = 1 call mpi_comm_group(MPI_COMM_WORLD, wgrp, ierr) call mpi_group_incl(wgrp, 2, irank, f77grp, ierr) call mpi_comm_create(MPI_COMM_WORLD, f77grp, f77com, ierr) call mpi_group_free(f77grp, ierr) i = Ccommcheck(MPI_COMM_WORLD, f77com) if (Iam .eq. 0) then if (i .eq. 0) then print*,'Do _NOT_ set TRANSCOMM = -DCSameF77' else print*,'Set TRANSCOMM = -DCSameF77' end if end if if (f77grp .ne. MPI_COMM_NULL) call mpi_comm_free(f77com, ierr) end if call mpi_finalize(ierr) stop end scalapack-2.0.2/BLACS/INSTALL/tc_UseMpich.c000644 000766 000024 00000001251 11640652114 020211 0ustar00juliestaff000000 000000 #include #include main() { MPI_Comm ccomm; int fcomm; extern void *MPIR_ToPointer(); extern int MPIR_FromPointer(); extern void *MPIR_RmPointer(); if (sizeof(int) < sizeof(int*)) { fcomm = MPIR_FromPointer(MPI_COMM_WORLD); ccomm = (MPI_Comm) MPIR_ToPointer(fcomm); if (ccomm == MPI_COMM_WORLD) printf("Set TRANSCOMM = -DUseMpich -DPOINTER_64_BITS=1\n"); else printf("Do _NOT_ set TRANSCOMM = -DUseMpich -DPOINTER_64_BITS=1\n"); } else { printf("Compile and run xtc_CsameF77 for correct TRANSCOMM setting.\n"); printf("If xtc_CsameF77 fails, leave TRANSCOMM blank.\n"); } }